diff --git a/.azure_pipelines/build-pipeline.yml b/.azure_pipelines/build-pipeline.yml deleted file mode 100644 index 34e39517e350..000000000000 --- a/.azure_pipelines/build-pipeline.yml +++ /dev/null @@ -1,63 +0,0 @@ -# PyTorch CI Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) builds PyTorch on select configurations -# 2) runs only TestTorch unit tests. - -stages: -- stage: 'Build' - displayName: 'Build PyTorch' - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_ci_build: True - os: ubuntu - cuda: cpu - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_ci_build: True - os: ubuntu - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - build_stage: True - is_ci_build: True - os: windows - cuda: cpu - customMatrixes: - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - build_stage: True - is_ci_build: True - os: windows - cuda: gpu - customMatrixes: - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 diff --git a/.azure_pipelines/daily-pipeline.yml b/.azure_pipelines/daily-pipeline.yml deleted file mode 100644 index 2c5c382befc3..000000000000 --- a/.azure_pipelines/daily-pipeline.yml +++ /dev/null @@ -1,82 +0,0 @@ -# PyTorch Daily Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) builds PyTorch on all available configurations -# 2) runs all PyTorch unit tests - -stages: -- stage: 'BuildTest' - displayName: 'Build and Test PyTorch' - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_daily_build: True - os: ubuntu - cuda: cpu - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_daily_build: True - os: ubuntu - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: 
pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - build_stage: True - is_daily_build: True - os: windows - cuda: cpu - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - build_stage: True - is_daily_build: True - os: windows - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 diff --git a/.azure_pipelines/job_templates/build-verify-publish-template-unix.yml b/.azure_pipelines/job_templates/build-verify-publish-template-unix.yml deleted file mode 100644 index 6d428c1c6647..000000000000 --- a/.azure_pipelines/job_templates/build-verify-publish-template-unix.yml +++ /dev/null @@ -1,134 +0,0 @@ -# PyTorch build steps template with Unix images Azure DevOps Instances -# -# This build depends on 3 parameters set as environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_DEVOPS_ARTIFACTS_ORGANIZATION: Azure Artifacts Organization name to publish artifacts -# - AZURE_DEVOPS_ARTIFACTS_PROJECT: Azure Artifacts Project name to publish artifacts - -parameters: - name: '' - pool: '' - container_endpoint: '' - os: '' - cuda: '' - is_ci_build: False - is_official_build: False - is_daily_build: False - build_stage: False - verify_stage: False - publish_stage: False - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 300 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - variables: - DECODE_PERCENTS: false - container: - image: $[variables['container_image']] - endpoint: ${{parameters.container_endpoint}} - - steps: - # Build stage - - ${{ if eq(parameters.build_stage, 'True') }}: - # Set up environment variables for specific pipeline build - - template: set-environment-variables.yml - parameters: - os: ${{ parameters.os}} - cuda: ${{ parameters.cuda}} - is_official_build: ${{ parameters.is_official_build}} - - # Sync and update PyTorch submodules - - bash: git submodule update --init --recursive --jobs 0 - displayName: Update PyTorch submodules - - # Build PyTorch and run unit tests - no packaging - - ${{ if or(eq(parameters.is_ci_build, 'True'), eq(parameters.is_daily_build, 'True')) }}: - # Build PyTorch from source in develop mode - - bash: python setup.py develop - displayName: Build PyTorch from source - - - ${{ if eq(parameters.is_ci_build, 'True') }}: - # Run TestTorch unit tests to demonstrate successful PyTorch build - - bash: python test/test_torch.py TestTorch - displayName: Run TestTorch unit tests - - - ${{ if eq(parameters.is_daily_build, 'True') }}: - # Run all unit tests to demonstrate successful PyTorch build - - bash: python test/run_test.py --continue-through-error --exclude-jit-executor --verbose - displayName: Run all unit tests - - # Run ComponentGovernance - - task: ComponentGovernanceComponentDetection@0 - inputs: - scanType: 'Register' - verbosity: 'Verbose' - alertWarningLevel: 'High' - - # 
Build PyTorch and produce artifacts for verification stage - - ${{ if eq(parameters.is_official_build, 'True') }}: - # Build PyTorch from source in install mode and exclude test binaries - - bash: python setup.py install - displayName: Build PyTorch from source without test binaries - - # Package PyTorch Wheel - - bash: python setup.py bdist_wheel - displayName: Package PyTorch Wheel - - # Publish PyTorch Wheel - - task: PublishPipelineArtifact@1 - inputs: - targetPath: $(Build.SourcesDirectory)/dist/ - artifactName: Build_$(Build.BuildNumber)_$(configuration) - displayName: Publish PyTorch Wheel to Pipeline Artifacts - - # Verification stage - - ${{ if eq(parameters.verify_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)/verify - displayName: Download PyTorch Wheel - - # Install PyTorch Wheel on Windows - - bash: python -m pip install $(Build.SourcesDirectory)/verify/torch*linux*.whl - displayName: Install PyTorch Wheel - - # Ensure PyTorch installed correctly from produced wheel - - bash: | - cd $(Build.SourcesDirectory)/verify - python -c "import torch; print('Installed Torch version: ' + torch.__version__)" - displayName: Check PyTorch correctly installed from wheel - - # Publishing stage - - ${{ if eq(parameters.publish_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)/publish - displayName: Download PyTorch Wheel - - # Publish wheel to Azure Artifacts - # The flag continueOnError=true is needed as the artifact to be published - # may already exist, because the artifact is differentiated based on the - # last commit date. - - bash: | - export TORCH_VERSION=$(head -c 5 ./version.txt) - export LAST_COMMIT=$(git rev-parse --short HEAD) - export LAST_COMMIT_DATE=$(git log -1 --pretty=%ad --date=format:%Y%m%d) - cd $(Build.SourcesDirectory)/publish - export TORCH_WHEEL=$(echo torch*linux*whl) - az extension add -n azure-devops - echo $ADOTOKEN | az devops login - az artifacts universal publish --organization $AZURE_DEVOPS_ARTIFACTS_ORGANIZATION --project $AZURE_DEVOPS_ARTIFACTS_PROJECT --scope project --feed "PyTorch" --name $TORCH_WHEEL --description "PyTorch Official Build Artifact" --version $TORCH_VERSION-$LAST_COMMIT_DATE-$LAST_COMMIT --path . 
- env: - ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - continueOnError: true - displayName: Upload PyTorch Official Build package to Azure Artifacts diff --git a/.azure_pipelines/job_templates/build-verify-publish-template-win.yml b/.azure_pipelines/job_templates/build-verify-publish-template-win.yml deleted file mode 100644 index 42f701e1edb9..000000000000 --- a/.azure_pipelines/job_templates/build-verify-publish-template-win.yml +++ /dev/null @@ -1,150 +0,0 @@ -# PyTorch build steps template with Windows images Azure DevOps Instances -# -# This build depends on 3 parameters set as environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_DEVOPS_ARTIFACTS_ORGANIZATION: Azure Artifacts Organization name to publish artifacts -# - AZURE_DEVOPS_ARTIFACTS_PROJECT: Azure Artifacts Project name to publish artifacts - -parameters: - name: '' - pool: '' - os: '' - cuda: '' - is_ci_build: False - is_official_build: False - is_daily_build: False - build_stage: False - verify_stage: False - publish_stage: False - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 300 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - variables: - CMAKE_GENERATOR: Ninja - PACKAGE_PDBS: 0 - - steps: - # Prepare for PyTorch build on Windows - - template: prepare-build-template.yml - parameters: - configuration: $(configuration) - build_stage: ${{ parameters.build_stage}} - - # Build Stage - - ${{ if eq(parameters.build_stage, 'True') }}: - # Set up environment variables for specific pipeline build - - template: set-environment-variables.yml - parameters: - os: ${{ parameters.os}} - cuda: ${{ parameters.cuda}} - is_official_build: ${{ parameters.is_official_build}} - - # Sync and update PyTorch submodules - - script: git submodule update --init --recursive --jobs 0 - displayName: Update PyTorch submodules - - # Build PyTorch and run unit tests - no packaging - - ${{ if or(eq(parameters.is_ci_build, 'True'), eq(parameters.is_daily_build, 'True')) }}: - # Build PyTorch from source in develop mode with Ninja - - script: call activate $(configuration) && python setup.py develop - displayName: Build PyTorch from source - - - ${{ if eq(parameters.is_ci_build, 'True') }}: - # Run TestTorch unit tests to demonstrate successful PyTorch build - - script: call activate $(configuration) && python test\test_torch.py TestTorch - displayName: Run TestTorch unit tests - - - ${{ if eq(parameters.is_daily_build, 'True') }}: - # Run all unit tests to demonstrate successful PyTorch build - - script: call activate $(configuration) && python test/run_test.py --continue-through-error --exclude-jit-executor --verbose - displayName: Run all unit tests - - # Run ComponentGovernance - - task: ComponentGovernanceComponentDetection@0 - inputs: - scanType: 'Register' - verbosity: 'Verbose' - alertWarningLevel: 'High' - - # Build PyTorch and produce artifacts for verification stage - - ${{ if eq(parameters.is_official_build, 'True') }}: - # Build PyTorch from source in install mode with Ninja and exclude test binaries - - script: call activate $(configuration) && python setup.py install - displayName: Build PyTorch from source without test binaries - - # Package PyTorch Wheel - - script: call activate $(configuration) && python setup.py bdist_wheel - displayName: Package PyTorch Wheel - - # Publish PyTorch Wheel - - task: PublishPipelineArtifact@1 - inputs: - targetPath: $(Build.SourcesDirectory)\dist\ - 
artifactName: Build_$(Build.BuildNumber)_$(configuration) - displayName: Publish PyTorch Wheel to Pipeline Artifacts - - # Verification Stage - - ${{ if eq(parameters.verify_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)\verify - displayName: Download PyTorch Wheel - - # Install PyTorch Wheel on Windows - - script: | - call activate $(configuration) - cd $(Build.SourcesDirectory)\verify - dir torch*win*.whl /b > whl.txt - set /p whl= < whl.txt - python -m pip install %whl% - displayName: Install PyTorch Wheel - - # Ensure PyTorch installed correctly from produced wheel - - script: | - call activate $(configuration) - cd $(Build.SourcesDirectory)\verify - python -c "import torch; print('Installed Torch version: ' + torch.__version__)" - displayName: Check PyTorch correctly installed from wheel - - # Publishing stage - - ${{ if eq(parameters.publish_stage, 'True') }}: - # Download PyTorch Wheel - - task: DownloadPipelineArtifact@2 - inputs: - artifact: Build_$(Build.BuildNumber)_$(configuration) - path: $(Build.SourcesDirectory)\publish - displayName: Download PyTorch Wheel - - # Set up Azure Artifacts for Windows - # The pip install --upgrade command is a bug fix for Azure CLI on Windows - # More info: https://github.com/Azure/azure-cli/issues/16858 - - script: | - pip install --upgrade pip --target \opt\az\lib\python3.6\site-packages\ - az extension add -n azure-devops - displayName: Set up Azure Artifacts download on Windows - - # Publish wheel to Azure Artifacts - # The flag continueOnError=true is needed as the artifact to be published - # may already exist, because the artifact is differentiated based on the - # last commit date. - - script: | - set /p TORCH_VERSION= < version.txt - cd $(Build.SourcesDirectory)\publish - git rev-parse --short HEAD > last_commit.txt && set /p LAST_COMMIT= < last_commit.txt - git log -1 --pretty=%ad --date=format:%Y%m%d > last_commit_date.txt && set /p LAST_COMMIT_DATE= < last_commit_date.txt - dir torch*win*.whl /b > whl.txt && set /p TORCH_WHEEL= < whl.txt - echo %ADOTOKEN% | az devops login - az artifacts universal publish --organization %AZURE_DEVOPS_ARTIFACTS_ORGANIZATION% --project %AZURE_DEVOPS_ARTIFACTS_PROJECT% --scope project --feed "PyTorch" --name %TORCH_WHEEL% --description "PyTorch Official Build Artifact" --version %TORCH_VERSION:~0,5%-%LAST_COMMIT_DATE%-%LAST_COMMIT% --path . 
- env: - ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - continueOnError: true - displayName: Upload PyTorch nigthly package to Azure Artifacts diff --git a/.azure_pipelines/job_templates/common-packages.yml b/.azure_pipelines/job_templates/common-packages.yml deleted file mode 100644 index 2760f673cb77..000000000000 --- a/.azure_pipelines/job_templates/common-packages.yml +++ /dev/null @@ -1,17 +0,0 @@ -dependencies: - - python=PYTHON_VERSION - - numpy - - ninja - - pyyaml - - mkl - - mkl-include - - setuptools - - cmake - - cffi - - typing_extensions - - future - - six - - requests - - dataclasses - - pip: - - -r ../../requirements.txt diff --git a/.azure_pipelines/job_templates/notify-webapp-template.yml b/.azure_pipelines/job_templates/notify-webapp-template.yml deleted file mode 100644 index 3b6a5314e11a..000000000000 --- a/.azure_pipelines/job_templates/notify-webapp-template.yml +++ /dev/null @@ -1,26 +0,0 @@ -parameters: - name: '' - pool: '' - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 600 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - steps: - # Clone PyTorch Tests repository - - bash: | - B64_PAT=$(echo -n ":$_ADOTOKEN" | base64) - git -c http.extraHeader="Authorization: Basic ${B64_PAT}" clone $(AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL) - cd pytorch_tests - git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH) - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - displayName: Clone PyTorch Tests repo - - bash: | - bash $(Build.SourcesDirectory)/pytorch_tests/webapp/notify_webapp.sh - displayName: Notify Webapp diff --git a/.azure_pipelines/job_templates/prepare-build-template.yml b/.azure_pipelines/job_templates/prepare-build-template.yml deleted file mode 100644 index 0755c07e2672..000000000000 --- a/.azure_pipelines/job_templates/prepare-build-template.yml +++ /dev/null @@ -1,62 +0,0 @@ -# Build prepare steps for PyTorch on Azure DevOps to build from source. -# These steps share between normal build process and semmle security scan tasks - -parameters: - build_stage: False - configuration: '' - -steps: -# End Python tasks that may be lingering over from previous runs -# Note: If python.exe isn't currently running, exit code becomes 128, -# which fails the run. Here exit code is set to 0 to avoid failed run. -- script: | - taskkill /f /im python.exe - IF %ERRORLEVEL% EQU 128 exit 0 - displayName: End previous Python processes - -# Clean up env directory in conda for fresh builds and set up conda environment YAML -- powershell: | - Remove-Item 'C:\Miniconda\envs' -Recurse -ErrorAction Ignore - $env:PYTHON_VERSION = $env:SYSTEM_JOBNAME.Substring(3,1) + '.' 
+ $env:SYSTEM_JOBNAME.Substring(4,1) - (Get-Content .azure_pipelines\job_templates\common-packages.yml) -replace 'PYTHON_VERSION', $env:PYTHON_VERSION | Out-File -encoding ASCII .azure_pipelines\job_templates\common-packages.yml - displayName: Clean up previous environments and Set up conda environment YAML - -# Make conda environment and install required packages -- script: | - call conda clean --all -y - call conda env create -n $(configuration) --file .azure_pipelines\job_templates\common-packages.yml - call activate $(configuration) - call conda install -c conda-forge libuv=1.39 - displayName: Set up conda environment for building from source - -- ${{ if eq(parameters.build_stage, 'True') }}: - # Install MKL - - script: | - rmdir /s /q mkl - del mkl_2020.2.254.7z - curl https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z -k -O - 7z x -aoa mkl_2020.2.254.7z -omkl - displayName: Install MKL - - # Install sccache and randomtemp - # Related PyTorch GitHub issue: https://github.com/pytorch/pytorch/issues/25393 - # Related fix: https://github.com/pytorch/builder/pull/448/ - - script: | - mkdir .\tmp_bin - curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output .\tmp_bin\sccache.exe - curl -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output .\tmp_bin\sccache-cl.exe - copy .\tmp_bin\sccache.exe .\tmp_bin\nvcc.exe - curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output .\tmp_bin\randomtemp.exe - displayName: Install sccache and randomtemp - condition: not(eq(variables.CUDA_VERSION, '')) - - # CUDA 11.2's CUB directory conflicts with CUDA 10.2 and 10.1 - # builds, where CUDA 11.2's CUB is injected into non-CUDA - # 11.2 builds. - - powershell: Remove-Item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include\cub" -Recurse -ErrorAction Ignore - displayName: Remove conflicting CUB from CUDA installation - condition: not(eq(variables.CUDA_VERSION, '')) - - - powershell: Copy-Item -Path "F:\cuda_11_2\cub\" -Destination "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" -Recurse - displayName: Copy CUDA CUB for CUDA 11.2 build - condition: eq(variables.CUDA_VERSION, '112') diff --git a/.azure_pipelines/job_templates/pytorch-template-unix.yml b/.azure_pipelines/job_templates/pytorch-template-unix.yml deleted file mode 100644 index 7f826e7cd382..000000000000 --- a/.azure_pipelines/job_templates/pytorch-template-unix.yml +++ /dev/null @@ -1,61 +0,0 @@ -# PyTorch build steps template with Unix images Azure DevOps Instances -# -# This build depends on 5 parameters set as an environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage -# - _TS_CLONE_P, _TS_P, _TS_SM_P: Secret vars for specific unit tests - -parameters: - name: '' - pool: '' - container_endpoint: '' - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 600 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - variables: - DECODE_PERCENTS: false - - steps: - # Don't checkout repo contents to save time and CPU compute. Environment variables - # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available. 
- - checkout: none - - # Delete pytorch_tests repo from previous builds if exists - - bash: rm -rf pytorch_tests/ - displayName: Delete pytorch_tests repo from previous builds if exists - - # Clone PyTorch Tests repository - - bash: | - B64_PAT=$(echo -n ":$_ADOTOKEN" | base64) - git -c http.extraHeader="Authorization: Basic ${B64_PAT}" clone $(AZURE_DEVOPS_PYTORCH_TESTS_REPO_URL) - cd pytorch_tests - git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH) - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - displayName: Clone PyTorch Tests repo - - # Run PyTorch Unit Tests - - bash: bash $(Build.SourcesDirectory)/pytorch_tests/scripts/linux/run.sh - env: - _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY) - _TS_CLONE_P: $(TS_CLONE_PASSWORD) - _TS_P: $(TS_PAT) - _TS_SM_P: $(TS_SM_PAT) - _AZUREML_CLONE_PASSWORD: $(AZUREML_CLONE_PASSWORD) - _SPPASSWORD: $(SPPASSWORD) - displayName: Run PyTorch Unit Tests - - # Tests results are available outside the docker container since - # the current directory is mounted as a volume of the container. - - task: PublishTestResults@2 - condition: always() - inputs: - testResultsFiles: '**/test-*.xml' - testRunTitle: 'Publish test results for Python' diff --git a/.azure_pipelines/job_templates/pytorch-template-win.yml b/.azure_pipelines/job_templates/pytorch-template-win.yml deleted file mode 100644 index 5d3704313010..000000000000 --- a/.azure_pipelines/job_templates/pytorch-template-win.yml +++ /dev/null @@ -1,57 +0,0 @@ -# PyTorch build steps template with Windows images Azure DevOps Instances -# -# This build depends on 5 parameters set as an environment variables in the pipeline: -# - AZURE_DEVOPS_CLI_PAT: Secret var for authenticating to Azure DevOps -# - AZURE_STORAGE_KEY: Secret var for authenticating to Azure Storage -# - _TS_CLONE_P, _TS_P, _TS_SM_P: Secret vars for specific unit tests - -parameters: - name: '' - pool: '' - customMatrixes: '' - -jobs: -- job: ${{parameters.name}} - timeoutInMinutes: 600 - strategy: - matrix: - ${{ insert }}: ${{parameters.customMatrixes}} - pool: - name: ${{ parameters.pool}} - - steps: - # Don't checkout repo contents to save time and CPU compute. Environment variables - # related to checkout branch such as $(BUILD_SOURCEBRANCH) are still available. - - checkout: none - - # Delete pytorch_tests repo from previous builds if exists - - script: if exist "pytorch_tests/" rmdir "pytorch_tests/" /q /s - displayName: Delete pytorch_tests repo from previous builds if exists - - # Clone PyTorch Tests repository - - powershell: | - $env:B64Pat = [Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes(":$env:_ADOTOKEN")) - git -c http.extraHeader="Authorization: Basic $env:B64Pat" clone $env:AZURE_DEVOPS_pytorch_tests_REPO_URL - cd pytorch_tests - git checkout $(PYTORCH_TESTS_CHECKOUT_BRANCH) - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - displayName: Clone PyTorch Tests repo - - # Run PyTorch Unit Tests - - script: call $(Build.SourcesDirectory)\pytorch_tests\scripts\windows\run.bat - env: - _ADOTOKEN: $(AZURE_DEVOPS_CLI_PAT) - _AZURE_STORAGE_KEY: $(AZURE_STORAGE_KEY) - _TS_CLONE_P: $(TS_CLONE_PASSWORD) - _TS_P: $(TS_PAT) - _TS_SM_P: $(TS_SM_PAT) - displayName: Run PyTorch Unit Tests - - # Tests results are available outside the docker container since - # the current directory is mounted as a volume of the container. 
- - task: PublishTestResults@2 - condition: always() - inputs: - testResultsFiles: '**\test-*.xml' - testRunTitle: 'Publish test results for Python' diff --git a/.azure_pipelines/job_templates/set-environment-variables.yml b/.azure_pipelines/job_templates/set-environment-variables.yml deleted file mode 100644 index 40d1cb384b2a..000000000000 --- a/.azure_pipelines/job_templates/set-environment-variables.yml +++ /dev/null @@ -1,129 +0,0 @@ -# Set environment variables for specific configurations - -parameters: - is_official_build: False - os: '' - cuda: '' - -steps: - # Environment configuration steps for Ubuntu builds - - ${{ if contains(parameters.os, 'ubuntu') }}: - # Set configuration specific build flags - - ${{ if eq(parameters.is_official_build, True) }}: - - bash: | - echo "##vso[task.setvariable variable=INSTALL_TEST;]0" - echo "##vso[task.setvariable variable=PYTORCH_BUILD_NUMBER;]1" - export PYTORCH_VERSION=$(head -c 5 ./version.txt) - echo "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$PYTORCH_VERSION.dev" - displayName: Set configuration-specific build flags - - # Set PyTorch CPU/GPU build flags. - - ${{ if contains(parameters.cuda, 'cpu') }}: - - bash: | - echo "##vso[task.setvariable variable=USE_CUDA;]0" - echo "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cpu" - displayName: Set CUDA-specific build flag for CPU builds - - - ${{ if contains(parameters.cuda, 'gpu') }}: - - bash: | - echo "##vso[task.setvariable variable=USE_CUDA;]1" - echo "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cu$(CUDA_VERSION)" - displayName: Set CUDA-specific build flag for GPU builds - - # Set MKL environment variables - - bash: | - echo "##vso[task.setvariable variable=CMAKE_LIBRARY_PATH;]/opt/intel/lib:$CMAKE_LIBRARY_PATH" - echo "##vso[task.setvariable variable=CMAKE_INCLUDE_PATH;]/opt/intel/include:$CMAKE_INCLUDE_PATH" - displayName: Set MKL paths - - # View current environment variables - - bash: - printenv - displayName: Show environment variables - - # Environment configuration steps for Windows builds - - ${{ if contains(parameters.os, 'windows') }}: - # Set Conda Lib Path - - powershell: Write-Host "##vso[task.setvariable variable=CONDA_LIB_PATH;]C:\Miniconda\envs\$(configuration)\Library\bin" - displayName: Set Conda Lib Path - - # Set configuration specific build flags - - ${{ if eq(parameters.is_official_build, True) }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=INSTALL_TEST;]0" - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_NUMBER;]1" - Set-Variable -Name PYTORCH_VERSION -Value (Get-Content .\version.txt).Substring(0,5) - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$PYTORCH_VERSION.dev" - displayName: Set configuration-specific build flags - - # Set PyTorch CPU/GPU build flags.. 
- - ${{ if contains(parameters.cuda, 'cpu') }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=USE_CUDA;]0" - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cpu" - displayName: Set CUDA-specific build flag for CPU build - - - ${{ if contains(parameters.cuda, 'gpu') }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=USE_CUDA;]1" - Write-Host "##vso[task.setvariable variable=PYTORCH_BUILD_VERSION;]$(PYTORCH_BUILD_VERSION).cu$(CUDA_VERSION)" - displayName: Set CUDA-specific build flag for GPU build - - # Set CUDA 11.2, 10.2 or 10.1 specific build flags - - ${{ if eq(parameters.cuda, 'gpu') }}: - - powershell: | - Write-Host "##vso[task.setvariable variable=TORCH_CUDA_ARCH_LIST;]3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0;8.6" - Write-Host "##vso[task.setvariable variable=CUDA_PATH;]C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\" - displayName: Set CUDA 11.2 specific build flags - condition: eq(variables.CUDA_VERSION, '112') - - - powershell: | - Write-Host "##vso[task.setvariable variable=TORCH_CUDA_ARCH_LIST;]3.7+PTX;5.0;6.0;6.1;7.0;7.5" - Write-Host "##vso[task.setvariable variable=CUDA_PATH;]C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\" - displayName: Set CUDA 10.2 specific build flags - condition: eq(variables.CUDA_VERSION, '102') - - - powershell: | - Write-Host "##vso[task.setvariable variable=TORCH_CUDA_ARCH_LIST;]3.7+PTX;5.0;6.0;6.1;7.0;7.5" - Write-Host "##vso[task.setvariable variable=CUDA_PATH;]C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\" - displayName: Set CUDA 10.1 specific build flags - condition: eq(variables.CUDA_VERSION, '101') - - - powershell: | - Write-Host "##vso[task.setvariable variable=CUDA_BIN_PATH;]$env:CUDA_PATH\bin\" - Write-Host "##vso[task.setvariable variable=CUDNN_ROOT;]$env:CUDA_PATH" - Write-Host "##vso[task.setvariable variable=CUDNN_INCLUDE_DIR;]$env:CUDA_PATH\include\" - Write-Host "##vso[task.setvariable variable=CUDNN_LIBRARY;]$env:CUDA_PATH\lib\x64\" - Write-Host "##vso[task.prependpath]$env:CUDA_PATH\bin" - Write-Host "##vso[task.setvariable variable=TORCH_NVCC_FLAGS;]-Xfatbin -compress-all --no-host-device-move-forward" - Write-Host "##vso[task.setvariable variable=THRUST_IGNORE_CUB_VERSION_CHECK;]1" - Write-Host "##vso[task.setvariable variable=NVTOOLSEXT_PATH;]C:\Program Files\NVIDIA Corporation\NvToolsExt\" - displayName: Set CUDA environment variables - - - powershell: | - copy "$(CUDA_BIN_PATH)\cusparse*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cublas*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cudart*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\curand*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cufft*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cusolver*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\cudnn*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CUDA_BIN_PATH)\nvrtc*64_*.dll*" $(Build.SourcesDirectory)\torch\lib - copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" $(Build.SourcesDirectory)\torch\lib - copy "$(CONDA_LIB_PATH)\libiomp*5md.dll" $(Build.SourcesDirectory)\torch\lib - copy "$(CONDA_LIB_PATH)\uv.dll" $(Build.SourcesDirectory)\torch\lib - displayName: Copy CUDA/cuDNN/libomp/libuv dlls to torch\lib - - # Set MKL, sccache and randomtemp environment variables - - powershell: | - Write-Host "##vso[task.setvariable 
variable=CMAKE_INCLUDE_PATH;]$(Build.SourcesDirectory)\mkl\include" - Write-Host "##vso[task.setvariable variable=CMAKE_LIBRARY_PATH;]$(Build.SourcesDirectory)\mkl\lib;$env:CMAKE_LIBRARY_PATH" - Write-Host "##vso[task.setvariable variable=ADDITIONAL_PATH;]$(Build.SourcesDirectory)\tmp_bin" - Write-Host "##vso[task.setvariable variable=SCCACHE_IDLE_TIMEOUT;]1500" - Write-Host "##vso[task.setvariable variable=CMAKE_CUDA_COMPILER_LAUNCHER;]$(Build.SourcesDirectory)/tmp_bin/randomtemp.exe;$(Build.SourcesDirectory)/tmp_bin/sccache.exe" - displayName: Set MKL, sccache and randomtemp environment variables - - # View current environment variables - - script: - set - displayName: Show environment variables diff --git a/.azure_pipelines/job_templates/wheel-wait-job-template.yml b/.azure_pipelines/job_templates/wheel-wait-job-template.yml deleted file mode 100644 index 816eea9cca20..000000000000 --- a/.azure_pipelines/job_templates/wheel-wait-job-template.yml +++ /dev/null @@ -1,14 +0,0 @@ -# Main logic to initiate wait for PR artifact to be ready - -steps: -- task: InvokeRESTAPI@1 - displayName: 'Wait for job success and wheel ready' - timeoutInMinutes: 60 - inputs: - connectionType: 'connectedServiceName' - serviceConnection: circleciconn - method: 'POST' - headers: '{"Content-Type":"application/json", "BranchName":"$(_TARGET_BRANCH_TO_CHECK)", "JobName":"$(TARGET_CIRCLECI_BUILD_PR)", "PRNumber":"$(_TARGET_PR_NUMBER)", "TargetCommit":"$(_TARGET_COMMIT)", "PlanUrl":"$(System.CollectionUri)", "ProjectId":"$(System.TeamProjectId)", "HubName":"$(System.HostType)", "PlanId":"$(System.PlanId)", "JobId":"$(System.JobId)", "TimelineId":"$(System.TimelineId)", "TaskInstanceId":"$(System.TaskInstanceId)", "AuthToken":"$(System.AccessToken)"}' - body: '' - urlSuffix: 'api/JobStatus' - waitForCompletion: true diff --git a/.azure_pipelines/job_templates/wheel-wait-template.yml b/.azure_pipelines/job_templates/wheel-wait-template.yml deleted file mode 100644 index cd2f76ac4d84..000000000000 --- a/.azure_pipelines/job_templates/wheel-wait-template.yml +++ /dev/null @@ -1,92 +0,0 @@ -# Initiate 5 agentless-server waiting jobs to check on the -# status of PR artifact builds, for a maximum wait time of -# 11*60 min=660 mins. These jobs will pass immediately -# once targeted CircleCI build is ready. 
- -jobs: -- job: checkjob1 - pool: server - timeoutInMinutes: 60 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob2 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob1 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob3 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob2 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob4 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob3 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob5 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob4 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob6 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob5 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob7 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob6 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob8 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob7 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob9 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob8 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob10 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob9 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml - -- job: checkjob11 - pool: server - timeoutInMinutes: 60 - dependsOn: checkjob10 - continueOnError: true - steps: - - template: wheel-wait-job-template.yml diff --git a/.azure_pipelines/nightly-pytorch-tests-pipeline.yml b/.azure_pipelines/nightly-pytorch-tests-pipeline.yml deleted file mode 100644 index 79273c1d3922..000000000000 --- a/.azure_pipelines/nightly-pytorch-tests-pipeline.yml +++ /dev/null @@ -1,60 +0,0 @@ -# PyTorch Nightly PyTorch Tests Builds Pipeline on Azure DevOps -# -# This pipeline runs custom PyTorch unit-tests on nightly -# PyTorch wheels. 
- -stages: -- stage: 'NightlyCustomTests' - displayName: 'Run custom unit tests on PyTorch wheels' - jobs: - - template: job_templates/pytorch-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: $(BUILD_POOL_LIN_1) - customMatrixes: - Nightly_Custom_Tests: - _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_1) - _PYTHON_VERSION: $(PYTHON_VERSION_LIN_1) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_1) - _RUN_TESTS: $(RUN_TESTS_LIN) - - - template: job_templates/pytorch-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: $(BUILD_POOL_LIN_2) - customMatrixes: - Nightly_Custom_Tests: - _DOCKER_IMAGE: $(DOCKER_IMAGE_LIN_2) - _PYTHON_VERSION: $(PYTHON_VERSION_LIN_2) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_LIN_2) - _RUN_TESTS: $(RUN_TESTS_LIN) - - - template: job_templates/pytorch-template-win.yml - parameters: - name: windows_2019_CPU - pool: $(BUILD_POOL_WIN_1) - customMatrixes: - Nightly_Custom_Tests: - _PYTHON_VERSION: $(PYTHON_VERSION_WIN_1) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_1) - _RUN_TESTS: $(RUN_TESTS_WIN) - - - template: job_templates/pytorch-template-win.yml - parameters: - name: windows_2019_GPU - pool: $(BUILD_POOL_WIN_2) - customMatrixes: - Nightly_Custom_Tests: - _PYTHON_VERSION: $(PYTHON_VERSION_WIN_2) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_WIN_2) - _RUN_TESTS: $(RUN_TESTS_WIN) - -- stage: 'NotifyWebapp' - displayName: 'Notify Webapp that pipeline is finished' - dependsOn: NightlyCustomTests - condition: succeededOrFailed() - jobs: - - template: job_templates/notify-webapp-template.yml - parameters: - name: ubuntu_1804_CPU - pool: $(BUILD_POOL_LIN_1) diff --git a/.azure_pipelines/pytorch-tests-pipeline.yml b/.azure_pipelines/pytorch-tests-pipeline.yml deleted file mode 100644 index 56813572169d..000000000000 --- a/.azure_pipelines/pytorch-tests-pipeline.yml +++ /dev/null @@ -1,62 +0,0 @@ -# PyTorch PR PyTorch Tests Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) ensures that CircleCI builds for a given PR -# have finished, and that its artifacts are -# ready for download -# 2) runs custom PyTorch unit-tests on PyTorch -# wheels generated during PR builds. 
- -resources: - webhooks: - - webhook: GitHubPyTorchPRTrigger - connection: GitHubPyTorchPRTriggerConnection - filters: - - path: repositoryName - value: pytorch_tests - -stages: -- stage: 'EnsureArtifactsReady' - displayName: 'Ensure PyTorch PR Artifacts are ready' - jobs: - - template: job_templates/wheel-wait-template.yml - variables: - _TARGET_BRANCH_TO_CHECK: ${{parameters.GitHubPyTorchPRTrigger.TARGET_BRANCH_TO_CHECK_AZ_DEVOPS_PR}} - _TARGET_PR_NUMBER: ${{parameters.GitHubPyTorchPRTrigger.PR_NUMBER}} - _TARGET_COMMIT: ${{parameters.GitHubPyTorchPRTrigger.TARGET_COMMIT}} - -- stage: 'PRCustomTests' - displayName: 'Run custom unit tests on PyTorch wheels' - dependsOn: EnsureArtifactsReady - condition: succeeded() - jobs: - - template: job_templates/pytorch-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: $(BUILD_POOL_PR) - customMatrixes: - PR_Custom_Tests: - _PYTHON_VERSION: $(PYTHON_VERSION_PR) - _CUDA_BUILD_VERSION: $(CUDA_BUILD_VERSION_PR) - _TARGET_CIRCLECI_BUILD: $(TARGET_CIRCLECI_BUILD_PR) - _TARGET_BRANCH_TO_CHECK: ${{parameters.GitHubPyTorchPRTrigger.TARGET_BRANCH_TO_CHECK_AZ_DEVOPS_PR}} - _TARGET_PR_NUMBER: ${{parameters.GitHubPyTorchPRTrigger.PR_NUMBER}} - _TARGET_COMMIT: ${{parameters.GitHubPyTorchPRTrigger.TARGET_COMMIT}} - _DOCKER_IMAGE: $(DOCKER_IMAGE_PR) - _RUN_TESTS: $(RUN_TESTS_PR) - -- stage: 'NotifyWebapp' - displayName: 'Notify Webapp that pipeline is finished' - dependsOn: PRCustomTests - condition: succeededOrFailed() - jobs: - - template: job_templates/notify-webapp-template.yml - parameters: - name: ubuntu_1804_CPU - pool: $(BUILD_POOL_LIN_1) - customMatrixes: - PR_Notify_WebApp: - _TARGET_CIRCLECI_BUILD: $(TARGET_CIRCLECI_BUILD_PR) - _TARGET_BRANCH_TO_CHECK: ${{parameters.GitHubPyTorchPRTrigger.TARGET_BRANCH_TO_CHECK_AZ_DEVOPS_PR}} - _TARGET_PR_NUMBER: ${{parameters.GitHubPyTorchPRTrigger.PR_NUMBER}} - _TARGET_COMMIT: ${{parameters.GitHubPyTorchPRTrigger.TARGET_COMMIT}} diff --git a/.azure_pipelines/verify-pipeline.yml b/.azure_pipelines/verify-pipeline.yml deleted file mode 100644 index e0ab4e372a75..000000000000 --- a/.azure_pipelines/verify-pipeline.yml +++ /dev/null @@ -1,224 +0,0 @@ -# PyTorch Official Builds Pipeline on Azure DevOps -# -# This pipeline: -# 1) builds PyTorch on all available configurations -# 2) verifies PyTorch artifacts by installing them in a clean environment -# and checking torch.__version_ -# 3) publishes official PyTorch artifacts to Azure DevOps Artifacts for consumption - -stages: -- stage: 'Build' - displayName: 'Build PyTorch' - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_official_build: True - os: ubuntu - cuda: cpu - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - build_stage: True - is_official_build: True - os: ubuntu - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - 
configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - build_stage: True - is_official_build: True - os: windows - cuda: cpu - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - build_stage: True - is_official_build: True - os: windows - cuda: gpu - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 - -- stage: 'Verify' - displayName: 'Verify PyTorch wheels' - dependsOn: Build - condition: succeeded() - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - verify_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - verify_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - verify_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - verify_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 - -- stage: 'Publish' - displayName: 'Publish PyTorch wheels' - dependsOn: Verify - condition: succeeded() - jobs: - - template: job_templates/build-verify-publish-template-unix.yml - 
parameters: - name: ubuntu_1804_CPU_docker - pool: 'PyTorch-Linux-CPU' - container_endpoint: pytorchms.azurecr.io - publish_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: ubuntu_1804_py_38_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cpu_dev - Py_37: - configuration: ubuntu_1804_py_37_cpu - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cpu_dev - - - template: job_templates/build-verify-publish-template-unix.yml - parameters: - name: ubuntu_1804_GPU_docker - pool: 'PyTorch-Linux-GPU' - container_endpoint: pytorchms.azurecr.io - publish_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: ubuntu_1804_py_39_cuda_112_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_39_cuda_112_cudnn_8_dev - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_810: - configuration: ubuntu_1804_py_38_cuda_102_cudnn_810 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_38_cuda_102_cudnn_8_dev - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_765: - configuration: ubuntu_1804_py_37_cuda_101_cudnn_765 - container_image: pytorchms.azurecr.io/ubuntu_1804_py_37_cuda_101_cudnn_7_dev - CUDA_VERSION: 101 - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_CPU - pool: 'PyTorch-Win-CPU' - publish_stage: True - is_official_build: True - customMatrixes: - Py_38: - configuration: windows_2019_py_38_cpu - Py_37: - configuration: windows_2019_py_37_cpu - - - template: job_templates/build-verify-publish-template-win.yml - parameters: - name: windows_2019_GPU - pool: 'PyTorch-Win-GPU' - publish_stage: True - is_official_build: True - customMatrixes: - Py_39_CUDA_112_cuDNN_810: - configuration: windows_2019_py_39_cuda_112_cudnn_810 - CUDA_VERSION: 112 - Py_38_CUDA_102_cuDNN_765: - configuration: windows_2019_py_38_cuda_102_cudnn_765 - CUDA_VERSION: 102 - Py_37_CUDA_101_cuDNN_764: - configuration: windows_2019_py_37_cuda_101_cudnn_764 - CUDA_VERSION: 101 diff --git a/.bazelrc b/.bazelrc index 1e847054613e..bbde3075f2af 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,10 +1,11 @@ -build --copt=--std=c++14 +build --cxxopt=--std=c++14 build --copt=-I. # Bazel does not support including its cc_library targets as system # headers. We work around this for generated code # (e.g. c10/macros/cmake_macros.h) by making the generated directory a # system include path. build --copt=-isystem --copt bazel-out/k8-fastbuild/bin +build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin build --experimental_ui_max_stdouterr_bytes=2048576 # Configuration to disable tty features for environments like CI diff --git a/.buckconfig.oss b/.buckconfig.oss new file mode 100644 index 000000000000..638870587d84 --- /dev/null +++ b/.buckconfig.oss @@ -0,0 +1,15 @@ +[buildfile] +name = BUILD.buck + +[repositories] + bazel_skylib = third_party/bazel-skylib/ + +[download] + in_build = true + +[cxx] + cxxflags = -std=c++17 + should_remap_host_platform = true + +[project] + default_flavors_mode=all diff --git a/.circleci/README.md b/.circleci/README.md deleted file mode 100644 index 5b0d56d1df2e..000000000000 --- a/.circleci/README.md +++ /dev/null @@ -1,498 +0,0 @@ -Structure of CI -=============== - -setup job: -1. Does a git checkout -2. Persists CircleCI scripts (everything in `.circleci`) into a workspace. Why? - We don't always do a Git checkout on all subjobs, but we usually - still want to be able to call scripts one way or another in a subjob. 
- Persisting files this way lets us have access to them without doing a - checkout. This workspace is conventionally mounted on `~/workspace` - (this is distinguished from `~/project`, which is the conventional - working directory that CircleCI will default to starting your jobs - in.) -3. Write out the commit message to `.circleci/COMMIT_MSG`. This is so - we can determine in subjobs if we should actually run the jobs or - not, even if there isn't a Git checkout. - - - - -CircleCI configuration generator -================================ - -One may no longer make changes to the `.circleci/config.yml` file directly. -Instead, one must edit these Python scripts or files in the `verbatim-sources/` directory. - - -Usage ----------- - -1. Make changes to these scripts. -2. Run the `regenerate.sh` script in this directory and commit the script changes and the resulting change to `config.yml`. - -You'll see a build failure on GitHub if the scripts don't agree with the checked-in version. - - -Motivation ----------- - -These scripts establish a single, authoritative source of documentation for the CircleCI configuration matrix. -The documentation, in the form of diagrams, is automatically generated and cannot drift out of sync with the YAML content. - -Furthermore, consistency is enforced within the YAML config itself, by using a single source of data to generate -multiple parts of the file. - -* Facilitates one-off culling/enabling of CI configs for testing PRs on special targets - -Also see https://github.com/pytorch/pytorch/issues/17038 - - -Future direction ----------------- - -### Declaring sparse config subsets -See comment [here](https://github.com/pytorch/pytorch/pull/17323#pullrequestreview-206945747): - -In contrast with a full recursive tree traversal of configuration dimensions, -> in the future I think we actually want to decrease our matrix somewhat and have only a few mostly-orthogonal builds that taste as many different features as possible on PRs, plus a more complete suite on every PR and maybe an almost full suite nightly/weekly (we don't have this yet). Specifying PR jobs in the future might be easier to read with an explicit list when we come to this. - ----------------- ----------------- - -# How do the binaries / nightlies / releases work? - -### What is a binary? - -A binary or package (used interchangeably) is a pre-built collection of c++ libraries, header files, python bits, and other files. We build these and distribute them so that users do not need to install from source. - -A **binary configuration** is a collection of - -* release or nightly - * releases are stable, nightlies are beta and built every night -* python version - * linux: 3.7m (mu is wide unicode or something like that. It usually doesn't matter but you should know that it exists) - * macos: 3.7, 3.8 - * windows: 3.7, 3.8 -* cpu version - * cpu, cuda 9.0, cuda 10.0 - * The supported cuda versions occasionally change -* operating system - * Linux - these are all built on CentOS. There haven't been any problems in the past building on CentOS and using on Ubuntu - * MacOS - * Windows - these are built on Azure pipelines -* devtoolset version (gcc compiler version) - * This only matters on Linux cause only Linux uses gcc. tldr is gcc made a backwards incompatible change from gcc 4.8 to gcc 5, because it had to change how it implemented std::vector and std::string - -### Where are the binaries? - -The binaries are built in CircleCI. 
There are nightly binaries built every night at 9pm PST (midnight EST) and release binaries corresponding to PyTorch releases, usually every few months. - -We have 3 types of binary packages - -* pip packages - nightlies are stored on s3 (pip install -f \). releases are stored in a pip repo (pip install torch) (ask Soumith about this) -* conda packages - nightlies and releases are both stored in a conda repo. Nightly packages have a '_nightly' suffix -* libtorch packages - these are zips of all the c++ libraries, header files, and sometimes dependencies. These are c++ only - * shared with dependencies (the only supported option for Windows) - * static with dependencies - * shared without dependencies - * static without dependencies - -All binaries are built in CircleCI workflows except Windows. There are checked-in workflows (committed into the .circleci/config.yml) to build the nightlies every night. Releases are built by manually pushing a PR that builds the suite of release binaries (overwrite the config.yml to build the release) - -# CircleCI structure of the binaries - -Some quick vocab: - -* A \**workflow** is a CircleCI concept; it is a DAG of '**jobs**'. ctrl-f 'workflows' on https://github.com/pytorch/pytorch/blob/master/.circleci/config.yml to see the workflows. -* **jobs** are a sequence of '**steps**' -* **steps** are usually just a bash script or a builtin CircleCI command. *All steps run in new environments, environment variables declared in one script DO NOT persist to following steps* -* CircleCI has a **workspace**, which is essentially a cache between steps of the *same job* in which you can store artifacts between steps. - -## How are the workflows structured? - -The nightly binaries have 3 workflows. We have one job (actually 3 jobs: build, test, and upload) per binary configuration - -1. binary_builds - 1. every day midnight EST - 2. linux: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/linux-binary-build-defaults.yml - 3. macos: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/macos-binary-build-defaults.yml - 4. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. binary_linux_conda_3.7_cpu_build - 1. Builds the build. On linux jobs this uses the 'docker executor'. - 2. Persists the package to the workspace - 2. binary_linux_conda_3.7_cpu_test - 1. Loads the package to the workspace - 2. Spins up a docker image (on Linux), mapping the package and code repos into the docker - 3. Runs some smoke tests in the docker - 4. (Actually, for macos this is a step rather than a separate job) - 3. binary_linux_conda_3.7_cpu_upload - 1. Logs in to aws/conda - 2. Uploads the package -2. update_s3_htmls - 1. every day 5am EST - 2. https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/binary_update_htmls.yml - 3. See below for what these are for and why they're needed - 4. Three jobs that each examine the current contents of aws and the conda repo and update some html files in s3 -3. binarysmoketests - 1. every day - 2. https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - 3. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. smoke_linux_conda_3.7_cpu - 1. Downloads the package from the cloud, e.g. using the official pip or conda instructions - 2. Runs the smoke tests - -## How are the jobs structured? - -The jobs are in https://github.com/pytorch/pytorch/tree/master/.circleci/verbatim-sources.
Jobs are made of multiple steps. There are some shared steps used by all the binaries/smokes. Steps of these jobs are all delegated to scripts in https://github.com/pytorch/pytorch/tree/master/.circleci/scripts . - -* Linux jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/linux-binary-build-defaults.yml - * binary_linux_build.sh - * binary_linux_test.sh - * binary_linux_upload.sh -* MacOS jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/macos-binary-build-defaults.yml - * binary_macos_build.sh - * binary_macos_test.sh - * binary_macos_upload.sh -* Update html jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/binary_update_htmls.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/master/cron/update_s3_htmls.sh - * https://github.com/pytorch/builder/blob/master/cron/upload_binary_sizes.sh -* Smoke jobs (both linux and macos): https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/master/run_tests.sh - * https://github.com/pytorch/builder/blob/master/smoke_test.sh - * https://github.com/pytorch/builder/blob/master/check_binary.sh -* Common shared code (shared across linux and macos): https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-binary-build-defaults.yml - * binary_checkout.sh - checks out pytorch/builder repo. Right now this also checks out pytorch/pytorch, but it shouldn't. pytorch/pytorch should just be shared through the workspace. This can handle being run before binary_populate_env.sh - * binary_populate_env.sh - parses BUILD_ENVIRONMENT into the separate env variables that make up a binary configuration. Also sets lots of default values, the date, the version strings, the location of folders in s3, all sorts of things. This generally has to be run before other steps. - * binary_install_miniconda.sh - Installs miniconda, cross platform. Also hacks this for the update_binary_sizes job that doesn't have the right env variables - * binary_run_in_docker.sh - Takes a bash script file (the actual test code) from a hardcoded location, spins up a docker image, and runs the script inside the docker image - -### **Why do the steps all refer to scripts?** - -CircleCI creates a final yaml file by inlining every <<* segment, so if we were to keep all the code in the config.yml itself then the config size would go over 4 MB and cause infra problems. - -### **What is binary_run_in_docker for?** - -So, CircleCI has several executor types: macos, machine, and docker are the ones we use. The 'machine' executor gives you two cores on some linux vm. The 'docker' executor gives you considerably more cores (nproc was 32 instead of 2 back when I tried in February). Since the dockers are faster, we try to run everything that we can in dockers. Thus - -* linux build jobs use the docker executor. Running them on the docker executor was at least 2x faster than running them on the machine executor -* linux test jobs use the machine executor in order for them to properly interface with GPUs since docker executors cannot execute with attached GPUs -* linux upload jobs use the machine executor. 
The upload jobs are so short that it doesn't really matter what they use -* linux smoke test jobs use the machine executor for the same reason as the linux test jobs - -binary_run_in_docker.sh is a way to share the docker start-up code between the binary test jobs and the binary smoke test jobs - -### **Why does binary_checkout also checkout pytorch? Why shouldn't it?** - -We want all the nightly binary jobs to run on the exact same git commit, so we wrote our own checkout logic to ensure that the same commit was always picked. Later circleci changed that to use a single pytorch checkout and persist it through the workspace (they did this because our config file was too big, so they wanted to take a lot of the setup code into scripts, but the scripts needed the code repo to exist to be called, so they added a prereq step called 'setup' to checkout the code and persist the needed scripts to the workspace). The changes to the binary jobs were not properly tested, so they all broke from missing pytorch code no longer existing. We hotfixed the problem by adding the pytorch checkout back to binary_checkout, so now there's two checkouts of pytorch on the binary jobs. This problem still needs to be fixed, but it takes careful tracing of which code is being called where. - -# Azure Pipelines structure of the binaries - -TODO: fill in stuff - -## How are the workflows structured? - -TODO: fill in stuff - -## How are the jobs structured? - -TODO: fill in stuff - -# Code structure of the binaries (circleci agnostic) - -## Overview - -The code that runs the binaries lives in two places, in the normal [github.com/pytorch/pytorch](http://github.com/pytorch/pytorch), but also in [github.com/pytorch/builder](http://github.com/pytorch/builder), which is a repo that defines how all the binaries are built. The relevant code is - - -``` -# All code needed to set-up environments for build code to run in, -# but only code that is specific to the current CI system -pytorch/pytorch -- .circleci/ # Folder that holds all circleci related stuff - - config.yml # GENERATED file that actually controls all circleci behavior - - verbatim-sources # Used to generate job/workflow sections in ^ - - scripts/ # Code needed to prepare circleci environments for binary build scripts - -- setup.py # Builds pytorch. This is wrapped in pytorch/builder -- cmake files # used in normal building of pytorch - -# All code needed to prepare a binary build, given an environment -# with all the right variables/packages/paths. -pytorch/builder - -# Given an installed binary and a proper python env, runs some checks -# to make sure the binary was built the proper way. Checks things like -# the library dependencies, symbols present, etc. -- check_binary.sh - -# Given an installed binary, runs python tests to make sure everything -# is in order. These should be de-duped. Right now they both run smoke -# tests, but are called from different places. Usually just call some -# import statements, but also has overlap with check_binary.sh above -- run_tests.sh -- smoke_test.sh - -# Folders that govern how packages are built. See paragraphs below - -- conda/ - - build_pytorch.sh # Entrypoint. 
Delegates to the proper conda build folder - - switch_cuda_version.sh # Switches the active CUDA installation in Docker - - pytorch-nightly/ # Build folder -- manywheel/ - - build_cpu.sh # Entrypoint for cpu builds - - build.sh # Entrypoint for CUDA builds - - build_common.sh # Actual build script that the two scripts above call into -- wheel/ - - build_wheel.sh # Entrypoint for wheel builds -- windows/ - - build_pytorch.bat # Entrypoint for wheel builds on Windows -``` - -Every type of package has an entrypoint build script that handles all the important logic. - -## Conda - -Linux, MacOS and Windows use the same code flow for the conda builds. - -Conda packages are built with conda-build; see https://conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html - -Basically, you pass `conda build` a build folder (pytorch-nightly/ above) that contains a build script and a meta.yaml. The meta.yaml specifies what python environment to build the package in and what dependencies the resulting package should have, and the build script gets called in that env to build the thing. -tl;dr on conda-build is - -1. Creates a brand new conda environment, based off of the deps in the meta.yaml - 1. Note that environment variables do not get passed into this build env unless they are specified in the meta.yaml - 2. If the build fails this environment will stick around. You can activate it for much easier debugging. The “General Python” section below explains what exactly a python “environment” is. -2. Calls build.sh in the environment -3. Copies the finished package to a new conda env, also specified by the meta.yaml -4. Runs some simple import tests (if specified in the meta.yaml) -5. Saves the finished package as a tarball - -The build.sh we use is essentially a wrapper around `python setup.py build`, but it also manually copies in some of our dependent libraries into the resulting tarball and messes with some rpaths. - -The entrypoint file `builder/conda/build_pytorch.sh` is complicated because - -* It works for Linux, MacOS and Windows - * The mac builds used to create their own environments, since they all used to be on the same machine. There’s now a lot of extra logic to handle conda envs. This extra machinery could be removed. -* It used to handle testing too, which adds more logic for messing with python environments. This extra machinery could be removed. - -## Manywheels (linux pip and libtorch packages) - -Manywheels are pip packages for linux distros. Note that these manywheels are not actually manylinux compliant. - -`builder/manywheel/build_cpu.sh` and `builder/manywheel/build.sh` (for CUDA builds) just set different env vars and then call into `builder/manywheel/build_common.sh` - -The entrypoint file `builder/manywheel/build_common.sh` is really really complicated because - -* This used to handle building for several different python versions at the same time. The loops have been removed, but there are still unnecessary folders and movements here and there. - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python-specific stuff in this file. 
-* There is a lot of messing with rpaths. This is necessary, but could be made much much simpler if the above issues were fixed. - -## Wheels (MacOS pip and libtorch packages) - -The entrypoint file `builder/wheel/build_wheel.sh` is complicated because - -* The mac builds used to all run on one machine (we didn’t have autoscaling mac machines till circleci). So this script handled siloing itself by setting-up and tearing-down its build env and siloing itself into its own build directory. - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * Ditto the comment above. This should definitely be separated out. - -Note that the MacOS Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda. - -## Windows Wheels (Windows pip and libtorch packages) - -The entrypoint file `builder/windows/build_pytorch.bat` is complicated because - -* This used to handle building for several different python versions at the same time. This is why there are loops everywhere - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file. - -Note that the Windows Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda. - -## General notes - -### Note on run_tests.sh, smoke_test.sh, and check_binary.sh - -* These should all be consolidated -* These must run on all OS types: MacOS, Linux, and Windows -* These all run smoke tests at the moment. They inspect the packages some, maybe run a few import statements. They DO NOT run the python tests nor the cpp tests. The idea is that python tests on master and PR merges will catch all breakages. All these tests have to do is make sure the special binary machinery didn’t mess anything up. -* There are separate run_tests.sh and smoke_test.sh because one used to be called by the smoke jobs and one used to be called by the binary test jobs (see circleci structure section above). This is still true actually, but these could be united into a single script that runs these checks, given an installed pytorch package. - -### Note on libtorch - -Libtorch packages are built in the wheel build scripts: manywheel/build_*.sh for linux and build_wheel.sh for mac. There are several things wrong with this - -* It’s confusing. Most of those scripts deal with python specifics. -* The extra conditionals everywhere severely complicate the wheel build scripts -* The process for building libtorch is different from the official instructions (a plain call to cmake, or a call to a script) - -### Note on docker images / Dockerfiles - -All linux builds occur in docker images. The docker images are - -* pytorch/conda-cuda - * Has ALL CUDA versions installed. The script pytorch/builder/conda/switch_cuda_version.sh sets /usr/local/cuda to a symlink to e.g. 
/usr/local/cuda-10.0 to enable different CUDA builds - * Also used for cpu builds -* pytorch/manylinux-cuda90 -* pytorch/manylinux-cuda100 - * Also used for cpu builds - -The Dockerfiles are available in pytorch/builder, but there is no circleci job or script to build these docker images, and they cannot be run locally (unless you have the correct local packages/paths). Only Soumith can build them right now. - -### General Python - -* This is still a good explanation of python installations https://caffe2.ai/docs/faq.html#why-do-i-get-import-errors-in-python-when-i-try-to-use-caffe2 - -# How to manually rebuild the binaries - -tl;dr make a PR that looks like https://github.com/pytorch/pytorch/pull/21159 - -Sometimes we want to push a change to master and then rebuild all of today's binaries after that change. As of May 30, 2019 there isn't a way to manually run a workflow in the UI. You can manually re-run a workflow, but it will use the exact same git commits as the first run and will not include any changes. So we have to make a PR and then force circleci to run the binary workflow instead of the normal tests. The above PR is an example of how to do this; essentially you copy-paste the binarybuilds workflow steps into the default workflow steps. If you need to point the builder repo to a different commit then you'd need to change https://github.com/pytorch/pytorch/blob/master/.circleci/scripts/binary_checkout.sh#L42-L45 to checkout what you want. - -## How to test changes to the binaries via .circleci - -Writing PRs that test the binaries is annoying, since the default circleci jobs that run on PRs are not the jobs that you want to run. Likely, changes to the binaries will touch something under .circleci/ and require that .circleci/config.yml be regenerated (.circleci/config.yml controls all .circleci behavior, and is generated using `.circleci/regenerate.sh` in python 3.7). But you also need to manually hardcode the binary jobs that you want to test into the .circleci/config.yml workflow, so you should actually make at least two commits, one for your changes and one to temporarily hardcode jobs. See https://github.com/pytorch/pytorch/pull/22928 as an example of how to do this. - -```sh -# Make your changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml - -# Regenerate the yaml, has to be in python 3.7 -.circleci/regenerate.sh - -# Make a commit -git add .circleci * -git commit -m "My real changes" -git push origin my_branch - -# Now hardcode the jobs that you want in the .circleci/config.yml workflows section -# Also eliminate ensure-consistency and should_run_job checks -# e.g. https://github.com/pytorch/pytorch/commit/2b3344bfed8772fe86e5210cc4ee915dee42b32d - -# Make a commit you won't keep -git add .circleci -git commit -m "[DO NOT LAND] testing binaries for above changes" -git push origin my_branch - -# Now you need to make some changes to the first commit. -git rebase -i HEAD~2 # mark the first commit as 'edit' - -# Make the changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml -.circleci/regenerate.sh - -# Ammend the commit and recontinue -git add .circleci -git commit --amend -git rebase --continue - -# Update the PR, need to force since the commits are different now -git push origin my_branch --force -``` - -The advantage of this flow is that you can make new changes to the base commit and regenerate the .circleci without having to re-write which binary jobs you want to test on. The downside is that all updates will be force pushes. 
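If that rebase dance gets tedious, the same update loop can be scripted with git's fixup/autosquash machinery. This is only a convenience sketch, not checked-in tooling; it assumes the two-commit layout and the branch name `my_branch` from the example above.

```sh
# Sketch: fold new edits into the base ("My real changes") commit while keeping
# the "[DO NOT LAND]" hardcoded-jobs commit on top, then force push.

# 1) Make your edits and regenerate the config (has to be run with python 3.7)
touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
.circleci/regenerate.sh

# 2) Record the edits as a fixup of the base commit (HEAD~1 == "My real changes")
git add .circleci
git commit --fixup HEAD~1

# 3) Squash the fixup into place without opening an editor
GIT_SEQUENCE_EDITOR=: git rebase -i --autosquash HEAD~3

# As with the manual flow, replaying the hardcoded-jobs commit can conflict in
# config.yml; resolve and `git rebase --continue` if it does.

# 4) History changed, so the push has to be forced
git push origin my_branch --force
```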
- -## How to build a binary locally - -### Linux - -You can easily build Linux binaries locally using docker. - -```sh -# Run the docker -# Use the correct docker image, pytorch/conda-cuda used here as an example -# -# -v path/to/foo:path/to/bar makes path/to/foo on your local machine (the -# machine that you're running the command on) accessible to the docker -# container at path/to/bar. So if you then run `touch path/to/bar/baz` -# in the docker container then you will see path/to/foo/baz on your local -# machine. You could also clone the pytorch and builder repos in the docker. -# -# If you know how, add ccache as a volume too and speed up everything -docker run \ - -v your/pytorch/repo:/pytorch \ - -v your/builder/repo:/builder \ - -v where/you/want/packages/to/appear:/final_pkgs \ - -it pytorch/conda-cuda /bin/bash - -# Export whatever variables are important to you. All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu - -# Call the entrypoint -# `|& tee foo.log` just copies all stdout and stderr output to foo.log -# The builds generate lots of output so you probably need this when -# building locally. -/builder/conda/build_pytorch.sh |& tee build_output.log -``` - -**Building CUDA binaries on docker** - -You can build CUDA binaries on CPU-only machines, but you can only run CUDA binaries on CUDA machines. This means that you can build a CUDA binary in a docker container on your laptop if you so choose (though it’s gonna take a long time). - -For Facebook employees, ask about beefy machines that have docker support and use those instead of your laptop; it will be 5x as fast. - -### MacOS - -There’s no easy way to generate reproducible, hermetic MacOS environments. If you have a Mac laptop then you can try emulating the .circleci environments as much as possible, but you probably have packages in /usr/local/, possibly installed by brew, that will probably interfere with the build. If you’re trying to repro an error on a Mac build in .circleci and you can’t seem to repro locally, then my best advice is actually to iterate on .circleci :/ - -But if you want to try, then I’d recommend - -```sh -# Create a new terminal -# Clear your LD_LIBRARY_PATH and trim as much out of your PATH as you -# know how to do - -# Install a new miniconda -# First remove any other python or conda installation from your PATH -# Always install miniconda 3, even if building for Python <3 -new_conda="$HOME/my_new_conda" -conda_sh="$HOME/install_miniconda.sh" -curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x "$conda_sh" -"$conda_sh" -b -p "$new_conda" -rm -f "$conda_sh" -export PATH="$new_conda/bin:$PATH" - -# Create a clean python env -# All MacOS builds use conda to manage the python env and dependencies -# that are built with, even the pip packages -# (conda activate needs conda's shell hook in a fresh shell) -source "$new_conda/etc/profile.d/conda.sh" -conda create -yn binary python=3.7 -conda activate binary - -# Export whatever variables are important to you. All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu - -# Call the entrypoint you want -path/to/builder/wheel/build_wheel.sh -``` - -N.B. installing a brand new miniconda is important. This has to do with how conda installations work. 
See the “General Python” section above, but tldr; is that - -1. You make the ‘conda’ command accessible by prepending `path/to/conda_root/bin` to your PATH. -2. You make a new env and activate it, which then also gets prepended to your PATH. Now you have `path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:$PATH` -3. Now say you (or some code that you ran) call python executable `foo` - 1. if you installed `foo` in `new_env`, then `path/to/conda_root/envs/new_env/bin/foo` will get called, as expected. - 2. But if you forgot to installed `foo` in `new_env` but happened to previously install it in your root conda env (called ‘base’), then unix/linux will still find `path/to/conda_root/bin/foo` . This is dangerous, since `foo` can be a different version than you want; `foo` can even be for an incompatible python version! - -Newer conda versions and proper python hygiene can prevent this, but just install a new miniconda to be safe. - -### Windows - -TODO: fill in diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 1c714186568f..5df203b6ce39 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -31,13 +31,6 @@ def get_processor_arch_name(gpu_version): ) CONFIG_TREE_DATA = OrderedDict( - windows=( - # Stop building Win+CU102, see https://github.com/pytorch/pytorch/issues/65648 - [v for v in dimensions.GPU_VERSIONS if v not in dimensions.ROCM_VERSION_LABELS and v != "cuda102"], - OrderedDict( - conda=dimensions.STANDARD_PYTHON_VERSIONS, - ) - ), ) # GCC config variants: diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 1a411856a8b2..7f9ebccbcc89 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -2,9 +2,8 @@ CUDA_VERSIONS = [ "102", - "111", "113", - "115", + "116", ] ROCM_VERSIONS = [ diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index b8c83ada6534..09756135fe64 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -71,10 +71,10 @@ def child_constructor(self): next_nodes = { "asan": AsanConfigNode, "xla": XlaConfigNode, - "mlc": MLCConfigNode, + "mps": MPSConfigNode, "vulkan": VulkanConfigNode, "parallel_tbb": ParallelTBBConfigNode, - "noarch": NoarchConfigNode, + "crossref": CrossRefConfigNode, "parallel_native": ParallelNativeConfigNode, "onnx": ONNXConfigNode, "libtorch": LibTorchConfigNode, @@ -116,12 +116,12 @@ def init2(self, node_name): def child_constructor(self): return ImportantConfigNode -class MLCConfigNode(TreeConfigNode): +class MPSConfigNode(TreeConfigNode): def modify_label(self, label): - return "MLC=" + str(label) + return "MPS=" + str(label) def init2(self, node_name): - self.props["is_mlc"] = node_name + self.props["is_mps"] = node_name def child_constructor(self): return ImportantConfigNode @@ -171,9 +171,9 @@ def child_constructor(self): return ImportantConfigNode -class NoarchConfigNode(TreeConfigNode): +class CrossRefConfigNode(TreeConfigNode): def init2(self, node_name): - self.props["is_noarch"] = node_name + self.props["is_crossref"] = node_name def child_constructor(self): return ImportantConfigNode diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index 036e8a599191..0eb7b5ec5210 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ 
b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -185,7 +185,7 @@ def gen_docs_configs(xenial_parent_config): HiddenConf( "pytorch_python_doc_build", parent_build=xenial_parent_config, - filters=gen_filter_dict(branches_list=["master", "nightly"], + filters=gen_filter_dict(branches_list=["master", "main", "nightly"], tags_list=RC_PATTERN), ) ) @@ -201,7 +201,7 @@ def gen_docs_configs(xenial_parent_config): HiddenConf( "pytorch_cpp_doc_build", parent_build=xenial_parent_config, - filters=gen_filter_dict(branches_list=["master", "nightly"], + filters=gen_filter_dict(branches_list=["master", "main", "nightly"], tags_list=RC_PATTERN), ) ) @@ -239,7 +239,7 @@ def instantiate_configs(only_slow_gradcheck): compiler_version = fc.find_prop("compiler_version") is_xla = fc.find_prop("is_xla") or False is_asan = fc.find_prop("is_asan") or False - is_noarch = fc.find_prop("is_noarch") or False + is_crossref = fc.find_prop("is_crossref") or False is_onnx = fc.find_prop("is_onnx") or False is_pure_torch = fc.find_prop("is_pure_torch") or False is_vulkan = fc.find_prop("is_vulkan") or False @@ -283,8 +283,8 @@ def instantiate_configs(only_slow_gradcheck): python_version = fc.find_prop("pyver") parms_list[0] = fc.find_prop("abbreviated_pyver") - if is_noarch: - parms_list_ignored_for_docker_image.append("noarch") + if is_crossref: + parms_list_ignored_for_docker_image.append("crossref") if is_onnx: parms_list.append("onnx") diff --git a/.circleci/cimodel/data/simple/binary_smoketest.py b/.circleci/cimodel/data/simple/binary_smoketest.py deleted file mode 100644 index 6d1d421d029c..000000000000 --- a/.circleci/cimodel/data/simple/binary_smoketest.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -TODO: Refactor circleci/cimodel/data/binary_build_data.py to generate this file - instead of doing one offs here - Binary builds (subset, to smoke test that they'll work) - - NB: If you modify this file, you need to also modify - the binary_and_smoke_tests_on_pr variable in - pytorch-ci-hud to adjust the allowed build list - at https://github.com/ezyang/pytorch-ci-hud/blob/master/src/BuildHistoryDisplay.js - - Note: - This binary build is currently broken, see https://github_com/pytorch/pytorch/issues/16710 - - binary_linux_conda_3_6_cu90_devtoolset7_build - - binary_linux_conda_3_6_cu90_devtoolset7_test - - TODO - we should test a libtorch cuda build, but they take too long - - binary_linux_libtorch_3_6m_cu90_devtoolset7_static-without-deps_build -""" - -import cimodel.lib.miniutils as miniutils -import cimodel.data.simple.util.branch_filters - - -class SmoketestJob: - def __init__(self, - template_name, - build_env_parts, - docker_image, - job_name, - is_master_only=False, - requires=None, - has_libtorch_variant=False, - extra_props=None): - - self.template_name = template_name - self.build_env_parts = build_env_parts - self.docker_image = docker_image - self.job_name = job_name - self.is_master_only = is_master_only - self.requires = requires or [] - self.has_libtorch_variant = has_libtorch_variant - self.extra_props = extra_props or {} - - def gen_tree(self): - - props_dict = { - "build_environment": " ".join(self.build_env_parts), - "name": self.job_name, - "requires": self.requires, - } - - if self.docker_image: - props_dict["docker_image"] = self.docker_image - - if self.is_master_only: - props_dict["filters"] = cimodel.data.simple.util.branch_filters.gen_filter_dict() - - if self.has_libtorch_variant: - props_dict["libtorch_variant"] = "shared-with-deps" - - props_dict.update(self.extra_props) - - return 
[{self.template_name: props_dict}] - - -WORKFLOW_DATA = [ - SmoketestJob( - "binary_linux_build", - ["manywheel", "3.7m", "cu102", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_manywheel_3_7m_cu102_devtoolset7_build", - is_master_only=True, - ), - SmoketestJob( - "binary_linux_build", - ["libtorch", "3.7m", "cpu", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build", - is_master_only=True, - has_libtorch_variant=True, - ), - SmoketestJob( - "binary_linux_build", - ["libtorch", "3.7m", "cpu", "gcc5.4_cxx11-abi"], - "pytorch/pytorch-binary-docker-image-ubuntu16.04:latest", - "binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build", - is_master_only=False, - has_libtorch_variant=True, - ), - SmoketestJob( - "binary_mac_build", - ["wheel", "3.7", "cpu"], - None, - "binary_macos_wheel_3_7_cpu_build", - is_master_only=True, - ), - # This job has an average run time of 3 hours o.O - # Now only running this on master to reduce overhead - SmoketestJob( - "binary_mac_build", - ["libtorch", "3.7", "cpu"], - None, - "binary_macos_libtorch_3_7_cpu_build", - is_master_only=True, - ), - SmoketestJob( - "binary_windows_build", - ["libtorch", "3.7", "cpu", "debug"], - None, - "binary_windows_libtorch_3_7_cpu_debug_build", - is_master_only=True, - ), - SmoketestJob( - "binary_windows_build", - ["libtorch", "3.7", "cpu", "release"], - None, - "binary_windows_libtorch_3_7_cpu_release_build", - is_master_only=True, - ), - SmoketestJob( - "binary_windows_build", - ["wheel", "3.7", "cu113"], - None, - "binary_windows_wheel_3_7_cu113_build", - is_master_only=True, - ), - - SmoketestJob( - "binary_windows_test", - ["libtorch", "3.7", "cpu", "debug"], - None, - "binary_windows_libtorch_3_7_cpu_debug_test", - is_master_only=True, - requires=["binary_windows_libtorch_3_7_cpu_debug_build"], - ), - SmoketestJob( - "binary_windows_test", - ["libtorch", "3.7", "cpu", "release"], - None, - "binary_windows_libtorch_3_7_cpu_release_test", - is_master_only=False, - requires=["binary_windows_libtorch_3_7_cpu_release_build"], - ), - SmoketestJob( - "binary_windows_test", - ["wheel", "3.7", "cu113"], - None, - "binary_windows_wheel_3_7_cu113_test", - is_master_only=True, - requires=["binary_windows_wheel_3_7_cu113_build"], - extra_props={ - "executor": "windows-with-nvidia-gpu", - }, - ), - - - - SmoketestJob( - "binary_linux_test", - ["manywheel", "3.7m", "cu102", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_manywheel_3_7m_cu102_devtoolset7_test", - is_master_only=True, - requires=["binary_linux_manywheel_3_7m_cu102_devtoolset7_build"], - extra_props={ - "resource_class": "gpu.nvidia.small", - "use_cuda_docker_runtime": miniutils.quote((str(1))), - }, - ), - SmoketestJob( - "binary_linux_test", - ["libtorch", "3.7m", "cpu", "devtoolset7"], - "pytorch/manylinux-cuda102", - "binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test", - is_master_only=True, - requires=["binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build"], - has_libtorch_variant=True, - ), - SmoketestJob( - "binary_linux_test", - ["libtorch", "3.7m", "cpu", "gcc5.4_cxx11-abi"], - "pytorch/pytorch-binary-docker-image-ubuntu16.04:latest", - "binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test", - is_master_only=True, - requires=["binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build"], - has_libtorch_variant=True, - ), -] - - -def get_workflow_jobs(): - return [item.gen_tree() for item in 
WORKFLOW_DATA] diff --git a/.circleci/cimodel/data/simple/util/branch_filters.py b/.circleci/cimodel/data/simple/util/branch_filters.py index dfbc6e4d63bc..ba4e00a059ef 100644 --- a/.circleci/cimodel/data/simple/util/branch_filters.py +++ b/.circleci/cimodel/data/simple/util/branch_filters.py @@ -1,4 +1,5 @@ NON_PR_BRANCH_LIST = [ + "main", "master", r"/ci-all\/.*/", r"/release\/.*/", diff --git a/.circleci/config.yml b/.circleci/config.yml index 1a4bfd3418ec..8828d86294b8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -436,22 +436,7 @@ binary_windows_params: &binary_windows_params default: "windows-xlarge-cpu-with-nvidia-cuda" environment: BUILD_ENVIRONMENT: << parameters.build_environment >> - BUILD_FOR_SYSTEM: windows JOB_EXECUTOR: <> - -promote_common: &promote_common - docker: - - image: pytorch/release - parameters: - package_name: - description: "package name to promote" - type: string - default: "" - environment: - PACKAGE_NAME: << parameters.package_name >> - ANACONDA_API_TOKEN: ${CONDA_PYTORCHBOT_TOKEN} - AWS_ACCESS_KEY_ID: ${PYTORCH_BINARY_AWS_ACCESS_KEY_ID} - AWS_SECRET_ACCESS_KEY: ${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY} ############################################################################## # Job specs ############################################################################## @@ -619,6 +604,7 @@ jobs: <<: *binary_mac_params macos: xcode: "12.0" + resource_class: "large" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -857,7 +843,7 @@ jobs: parameters: branch: type: string - default: "master" + default: "main" steps: - attach_workspace: at: /tmp/workspace @@ -897,7 +883,7 @@ jobs: echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) @@ -907,7 +893,7 @@ jobs: echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts mkdir -p ~/workspace/build_artifacts - docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/master ~/workspace/build_artifacts + docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/main ~/workspace/build_artifacts docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io /tmp/workspace # Save the docs build so we can debug any problems @@ -919,7 +905,7 @@ jobs: paths: - . - store_artifacts: - path: ~/workspace/build_artifacts/master + path: ~/workspace/build_artifacts/main destination: docs pytorch_cpp_doc_build: @@ -943,12 +929,12 @@ jobs: echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . 
./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" main") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -1432,7 +1418,7 @@ jobs: time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG}) - echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" + echo "Do NOT merge main branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 @@ -1532,24 +1518,6 @@ jobs: export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.jenkins/pytorch/docs-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts - promote_s3: - <<: *promote_common - steps: - - checkout - - run: - name: Running promote script - command: | - scripts/release/promote/wheel_to_s3.sh - - promote_conda: - <<: *promote_common - steps: - - checkout - - run: - name: Running promote script - command: | - scripts/release/promote/conda_to_conda.sh - # update_s3_htmls job # These jobs create html files for every cpu/cu## folder in s3. The html # files just store the names of all the files in that folder (which are @@ -1676,738 +1644,8 @@ jobs: # Workflows ############################################################################## workflows: - binary_builds: - jobs: - - binary_windows_build: - name: binary_windows_conda_3_7_cpu_nightly_build - build_environment: "conda 3.7 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_8_cpu_nightly_build - build_environment: "conda 3.8 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cpu_nightly_build - build_environment: "conda 3.9 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cpu_nightly_build - build_environment: "conda 3.10 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_7_cu111_nightly_build - build_environment: "conda 3.7 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_8_cu111_nightly_build - build_environment: "conda 3.8 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cu111_nightly_build - build_environment: "conda 3.9 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cu111_nightly_build - build_environment: "conda 3.10 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_7_cu113_nightly_build - build_environment: "conda 3.7 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: 
binary_windows_conda_3_8_cu113_nightly_build - build_environment: "conda 3.8 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cu113_nightly_build - build_environment: "conda 3.9 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cu113_nightly_build - build_environment: "conda 3.10 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_7_cu115_nightly_build - build_environment: "conda 3.7 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_8_cu115_nightly_build - build_environment: "conda 3.8 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_9_cu115_nightly_build - build_environment: "conda 3.9 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_build: - name: binary_windows_conda_3_10_cu115_nightly_build - build_environment: "conda 3.10 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - - binary_windows_test: - name: binary_windows_conda_3_7_cpu_nightly_test - build_environment: "conda 3.7 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_8_cpu_nightly_test - build_environment: "conda 3.8 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_9_cpu_nightly_test - build_environment: "conda 3.9 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_10_cpu_nightly_test - build_environment: "conda 3.10 cpu" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cpu_nightly_build - - binary_windows_test: - name: binary_windows_conda_3_7_cu111_nightly_test - build_environment: "conda 3.7 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_8_cu111_nightly_test - build_environment: "conda 3.8 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_9_cu111_nightly_test - build_environment: "conda 3.9 cu111" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_10_cu111_nightly_test - build_environment: "conda 3.10 cu111" - filters: - branches: - only: - - /.*/ - tags: - 
only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cu111_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_7_cu113_nightly_test - build_environment: "conda 3.7 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_8_cu113_nightly_test - build_environment: "conda 3.8 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_9_cu113_nightly_test - build_environment: "conda 3.9 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_10_cu113_nightly_test - build_environment: "conda 3.10 cu113" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cu113_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_7_cu115_nightly_test - build_environment: "conda 3.7 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_7_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_8_cu115_nightly_test - build_environment: "conda 3.8 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_8_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_9_cu115_nightly_test - build_environment: "conda 3.9 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_9_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_windows_test: - name: binary_windows_conda_3_10_cu115_nightly_test - build_environment: "conda 3.10 cu115" - filters: - branches: - only: - - /.*/ - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - requires: - - binary_windows_conda_3_10_cu115_nightly_build - executor: windows-with-nvidia-gpu - - binary_upload: - name: binary_windows_conda_3_7_cpu_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_8_cpu_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_9_cpu_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_10_cpu_nightly_upload - context: org-member - requires: - - 
binary_windows_conda_3_10_cpu_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cpu - - binary_upload: - name: binary_windows_conda_3_7_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_8_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_9_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_10_cu111_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_10_cu111_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu111 - - binary_upload: - name: binary_windows_conda_3_7_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_8_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_9_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_10_cu113_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_10_cu113_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu113 - - binary_upload: - name: binary_windows_conda_3_7_cu115_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_7_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - - binary_upload: - name: binary_windows_conda_3_8_cu115_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_8_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - - binary_upload: - name: binary_windows_conda_3_9_cu115_nightly_upload - context: org-member - requires: - - binary_windows_conda_3_9_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - - binary_upload: - name: binary_windows_conda_3_10_cu115_nightly_upload - context: org-member - requires: - - 
binary_windows_conda_3_10_cu115_nightly_test - filters: - branches: - only: - - nightly - tags: - only: - - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - package_type: conda - upload_subfolder: cu115 - when: << pipeline.parameters.run_binary_tests >> build: jobs: - - binary_linux_build: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - - binary_mac_build: - build_environment: wheel 3.7 cpu - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_macos_wheel_3_7_cpu_build - - binary_mac_build: - build_environment: libtorch 3.7 cpu - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_macos_libtorch_3_7_cpu_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu debug - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu release - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_libtorch_3_7_cpu_release_build - - binary_windows_build: - build_environment: wheel 3.7 cu113 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_wheel_3_7_cu113_build - - binary_windows_test: - build_environment: libtorch 3.7 cpu debug - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_libtorch_3_7_cpu_debug_test - requires: - - binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_test: - build_environment: libtorch 3.7 cpu release - name: binary_windows_libtorch_3_7_cpu_release_test - requires: - - binary_windows_libtorch_3_7_cpu_release_build - - binary_windows_test: - build_environment: wheel 3.7 cu113 - executor: windows-with-nvidia-gpu - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_windows_wheel_3_7_cu113_test - requires: - - binary_windows_wheel_3_7_cu113_build - - binary_linux_test: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_test - requires: - - binary_linux_manywheel_3_7m_cu102_devtoolset7_build - resource_class: gpu.nvidia.small - use_cuda_docker_runtime: "1" - - binary_linux_test: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_test: - build_environment: 
libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - filters: - branches: - only: - - master - - /ci-all\/.*/ - - /release\/.*/ - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - binary_ios_build: build_environment: libtorch-ios-12.5.1-nightly-x86_64-build context: org-member @@ -2491,278 +1729,4 @@ workflows: branches: only: - postnightly - - update_s3_htmls: - context: org-member - filters: - branches: - only: - - postnightly - name: update_s3_htmls - - smoke_windows_test: - name: smoke_windows_conda_3_7_cpu_nightly - build_environment: "conda 3.7 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_8_cpu_nightly - build_environment: "conda 3.8 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_9_cpu_nightly - build_environment: "conda 3.9 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_10_cpu_nightly - build_environment: "conda 3.10 cpu" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - - smoke_windows_test: - name: smoke_windows_conda_3_7_cu111_nightly - build_environment: "conda 3.7 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_8_cu111_nightly - build_environment: "conda 3.8 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_9_cu111_nightly - build_environment: "conda 3.9 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_10_cu111_nightly - build_environment: "conda 3.10 cu111" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_7_cu113_nightly - build_environment: "conda 3.7 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_8_cu113_nightly - build_environment: "conda 3.8 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_9_cu113_nightly - build_environment: "conda 3.9 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_10_cu113_nightly - build_environment: "conda 3.10 cu113" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_7_cu115_nightly - build_environment: "conda 3.7 cu115" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_8_cu115_nightly - build_environment: "conda 3.8 cu115" - requires: - - 
update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_9_cu115_nightly - build_environment: "conda 3.9 cu115" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu - - smoke_windows_test: - name: smoke_windows_conda_3_10_cu115_nightly - build_environment: "conda 3.10 cu115" - requires: - - update_s3_htmls - filters: - branches: - only: - - postnightly - executor: windows-with-nvidia-gpu when: << pipeline.parameters.run_build >> - master_build: - jobs: - - binary_linux_build: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_build: - build_environment: libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - - binary_mac_build: - build_environment: wheel 3.7 cpu - name: binary_macos_wheel_3_7_cpu_build - - binary_mac_build: - build_environment: libtorch 3.7 cpu - name: binary_macos_libtorch_3_7_cpu_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu debug - name: binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_build: - build_environment: libtorch 3.7 cpu release - name: binary_windows_libtorch_3_7_cpu_release_build - - binary_windows_build: - build_environment: wheel 3.7 cu113 - name: binary_windows_wheel_3_7_cu113_build - - binary_windows_test: - build_environment: libtorch 3.7 cpu debug - name: binary_windows_libtorch_3_7_cpu_debug_test - requires: - - binary_windows_libtorch_3_7_cpu_debug_build - - binary_windows_test: - build_environment: wheel 3.7 cu113 - executor: windows-with-nvidia-gpu - name: binary_windows_wheel_3_7_cu113_test - requires: - - binary_windows_wheel_3_7_cu113_build - - binary_linux_test: - build_environment: manywheel 3.7m cu102 devtoolset7 - docker_image: pytorch/manylinux-cuda102 - name: binary_linux_manywheel_3_7m_cu102_devtoolset7_test - requires: - - binary_linux_manywheel_3_7m_cu102_devtoolset7_build - resource_class: gpu.nvidia.small - use_cuda_docker_runtime: "1" - - binary_linux_test: - build_environment: libtorch 3.7m cpu devtoolset7 - docker_image: pytorch/manylinux-cuda102 - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build - - binary_linux_test: - build_environment: libtorch 3.7m cpu gcc5.4_cxx11-abi - docker_image: pytorch/pytorch-binary-docker-image-ubuntu16.04:latest - libtorch_variant: shared-with-deps - name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test - requires: - - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - when: << pipeline.parameters.run_master_build >> - # Promotion workflow - promote: - jobs: - # Requires manual approval by someone in org-member - # CircleCI security context - - promote_approval: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - type: approval - - promote_s3: - context: org-member - 
filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_libtorch - package_name: libtorch - requires: - - promote_approval - - promote_s3: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_torch - package_name: torch - requires: - - promote_approval - - promote_conda: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_conda_pytorch - package_name: pytorch - requires: - - promote_approval diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index bfab08d8bd0b..330dbbc6f8e8 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -40,6 +40,12 @@ function extract_all_from_image_name() { done } +# Use the same pre-built XLA test image from PyTorch/XLA +if [[ "$image" == *xla* ]]; then + echo "Using pre-built XLA test image..." + exit 0 +fi + if [[ "$image" == *-xenial* ]]; then UBUNTU_VERSION=16.04 elif [[ "$image" == *-artful* ]]; then @@ -84,7 +90,7 @@ case "$image" in ;; pytorch-linux-xenial-py3.7-gcc5.4) ANACONDA_PYTHON_VERSION=3.7 - CMAKE_VERSION=3.10.3 + CMAKE_VERSION=3.12.4 # To make sure XNNPACK is enabled for the BACKWARDS_COMPAT_TEST used with this image GCC_VERSION=5 PROTOBUF=yes DB=yes @@ -116,9 +122,10 @@ case "$image" in VISION=yes KATEX=yes ;; - pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7) - CUDA_VERSION=11.1 + pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7) + CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names CUDNN_VERSION=8 + TENSORRT_VERSION=8.0.1.6 ANACONDA_PYTHON_VERSION=3.7 CMAKE_VERSION=3.10.3 GCC_VERSION=7 @@ -127,20 +134,20 @@ case "$image" in VISION=yes KATEX=yes ;; - pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7) + pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9) CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names CUDNN_VERSION=8 TENSORRT_VERSION=8.0.1.6 ANACONDA_PYTHON_VERSION=3.7 CMAKE_VERSION=3.10.3 - GCC_VERSION=7 + CLANG_VERSION=9 PROTOBUF=yes DB=yes VISION=yes KATEX=yes ;; - pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7) - CUDA_VERSION=11.5.0 + pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7) + CUDA_VERSION=11.6.0 CUDNN_VERSION=8 ANACONDA_PYTHON_VERSION=3.7 CMAKE_VERSION=3.10.3 @@ -227,31 +234,21 @@ case "$image" in DB=yes VISION=yes ;; - pytorch-linux-bionic-cuda11.0-cudnn8-py3.7-gcc9) - CUDA_VERSION=11.0 - CUDNN_VERSION=8 + pytorch-linux-bionic-rocm5.0-py3.7) ANACONDA_PYTHON_VERSION=3.7 GCC_VERSION=9 PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=3.9 + ROCM_VERSION=5.0 ;; - pytorch-linux-bionic-rocm4.3.1-py3.7) + pytorch-linux-bionic-rocm5.1-py3.7) ANACONDA_PYTHON_VERSION=3.7 GCC_VERSION=9 PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=4.3.1 - ;; - pytorch-linux-bionic-rocm4.5-py3.7) - ANACONDA_PYTHON_VERSION=3.7 - GCC_VERSION=9 - PROTOBUF=yes - DB=yes - VISION=yes - ROCM_VERSION=4.5.2 + ROCM_VERSION=5.1.1 ;; *) # Catch-all for builds that are not hardcoded. 
@@ -298,6 +295,13 @@ fi tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') +#when using cudnn version 8 install it separately from cuda +if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then + IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}" + if [[ ${CUDNN_VERSION} == 8 ]]; then + IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" + fi +fi # Build image # TODO: build-arg THRIFT is not turned on for any image, remove it once we confirm @@ -336,6 +340,7 @@ docker build \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906}" \ + --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.circleci/docker/centos-rocm/Dockerfile b/.circleci/docker/centos-rocm/Dockerfile index 832e09c7f664..e0ef9e3296fe 100644 --- a/.circleci/docker/centos-rocm/Dockerfile +++ b/.circleci/docker/centos-rocm/Dockerfile @@ -40,8 +40,10 @@ RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # (optional) Install protobuf for ONNX ARG PROTOBUF diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index e2663d6b3bb8..1dc6b0cbaa55 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -20,6 +20,11 @@ install_ubuntu() { maybe_libiomp_dev="libiomp-dev" fi + # TODO: Remove this once nvidia package repos are back online + # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968 + # shellcheck disable=SC2046 + sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") + # Install common dependencies apt-get update # TODO: Some of these may not be necessary @@ -45,8 +50,8 @@ install_ubuntu() { libasound2-dev \ libsndfile-dev \ software-properties-common \ - sudo \ wget \ + sudo \ vim # Should resolve issues related to various apt package repository cert issues diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index 82cfcc6c6e6a..cc7696762a45 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -21,7 +21,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then ;; esac - mkdir /opt/conda + mkdir -p /opt/conda chown jenkins:jenkins /opt/conda # Work around bug where devtoolset replaces sudo and breaks it. @@ -68,14 +68,16 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then as_jenkins conda install -q -y python="$ANACONDA_PYTHON_VERSION" $* } + pip_install() { + as_jenkins pip install --progress-bar off $* + } + # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README # DO NOT install cmake here as it would install a version newer than 3.10, but # we want to pin to version 3.10. 
- SCIPY_VERSION=1.1.0 if [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source conda_install numpy=1.19.2 astunparse pyyaml mkl mkl-include setuptools cffi future six llvmdev=8.0.0 -c conda-forge - SCIPY_VERSION=1.6.0 elif [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source conda_install numpy=1.18.5 astunparse pyyaml mkl mkl-include setuptools cffi future six llvmdev=8.0.0 @@ -96,34 +98,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then conda_install nnpack -c killeent # Install some other packages, including those needed for Python test reporting - # TODO: Why is scipy pinned - # Pin MyPy version because new errors are likely to appear with each release - # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 - as_jenkins pip install --progress-bar off pytest \ - scipy==$SCIPY_VERSION \ - scikit-image \ - psutil \ - unittest-xml-reporting \ - boto3==1.16.34 \ - hypothesis==4.53.2 \ - expecttest==0.1.3 \ - mypy==0.812 \ - tb-nightly - - # Install numba only on python-3.8 or below - # For numba issue see https://github.com/pytorch/pytorch/issues/51511 - if [[ $(python -c "import sys; print(int(sys.version_info < (3, 9)))") == "1" ]]; then - as_jenkins pip install --progress-bar off numba==0.54.1 "librosa>=0.6.2,<0.9.0" - else - as_jenkins pip install --progress-bar off numba==0.49.0 "librosa>=0.6.2,<0.9.0" - fi + pip_install -r /opt/conda/requirements-ci.txt # Update scikit-learn to a python-3.8 compatible version if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then - as_jenkins pip install --progress-bar off -U scikit-learn + pip_install -U scikit-learn else # Pinned scikit-learn due to https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 only) - as_jenkins pip install --progress-bar off scikit-learn==0.20.3 + pip_install scikit-learn==0.20.3 fi popd diff --git a/.circleci/docker/common/install_cudnn.sh b/.circleci/docker/common/install_cudnn.sh new file mode 100644 index 000000000000..1f1c34ea200d --- /dev/null +++ b/.circleci/docker/common/install_cudnn.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +if [[ ${CUDNN_VERSION} == 8 ]]; then + # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement + mkdir tmp_cudnn && cd tmp_cudnn + CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive" + curl -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz + tar xf ${CUDNN_NAME}.tar.xz + cp -a ${CUDNN_NAME}/include/* /usr/include/ + cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/ + cp -a ${CUDNN_NAME}/include/* /usr/include/x86_64-linux-gnu/ + + cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/ + cp -a ${CUDNN_NAME}/lib/* /usr/lib/x86_64-linux-gnu/ + cd .. 
+ rm -rf tmp_cudnn + ldconfig +fi diff --git a/.circleci/docker/common/install_rocm.sh b/.circleci/docker/common/install_rocm.sh index 4ba3ed73db90..4cda40bbdca5 100644 --- a/.circleci/docker/common/install_rocm.sh +++ b/.circleci/docker/common/install_rocm.sh @@ -6,8 +6,8 @@ install_magma() { # "install" hipMAGMA into /opt/rocm/magma by copying after build git clone https://bitbucket.org/icl/magma.git pushd magma - # fix for magma_queue memory leak issue - git checkout c62d700d880c7283b33fb1d615d62fc9c7f7ca21 + # Fixes memory leaks of magma found while executing linalg UTs + git checkout 5959b8783e45f1809812ed96ae762f38ee701972 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc @@ -35,7 +35,7 @@ ver() { } # Map ROCm version to AMDGPU version -declare -A AMDGPU_VERSIONS=( ["4.5.2"]="21.40.2" ) +declare -A AMDGPU_VERSIONS=( ["4.5.2"]="21.40.2" ["5.0"]="21.50" ["5.1.1"]="22.10.1" ) install_ubuntu() { apt-get update diff --git a/.circleci/docker/common/install_user.sh b/.circleci/docker/common/install_user.sh index 69c762350bbf..93a436cbfc78 100755 --- a/.circleci/docker/common/install_user.sh +++ b/.circleci/docker/common/install_user.sh @@ -3,8 +3,11 @@ set -ex # Mirror jenkins user in container -echo "jenkins:x:1014:1014::/var/lib/jenkins:" >> /etc/passwd -echo "jenkins:x:1014:" >> /etc/group +# jenkins user as ec2-user should have the same user-id +echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd +echo "jenkins:x:1000:" >> /etc/group +# Needed on focal or newer +echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow # Create $HOME mkdir -p /var/lib/jenkins @@ -18,3 +21,6 @@ chown jenkins:jenkins /usr/local # Allow sudo # TODO: Maybe we shouldn't echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins + +# Test that sudo works +sudo -u jenkins sudo -v diff --git a/.circleci/docker/requirements-ci.txt b/.circleci/docker/requirements-ci.txt new file mode 100644 index 000000000000..ff5a9ba33b7b --- /dev/null +++ b/.circleci/docker/requirements-ci.txt @@ -0,0 +1,212 @@ +# Python dependencies required for unit tests + +#awscli==1.6 #this breaks some platforms +#Description: AWS command line interface +#Pinned versions: 1.6 +#test that import: + +boto3==1.19.12 +#Description: AWS SDK for python +#Pinned versions: 1.19.12, 1.16.34 +#test that import: + +click +#Description: Command Line Interface Creation Kit +#Pinned versions: +#test that import: + +coremltools==5.0b5 +#Description: Apple framework for ML integration +#Pinned versions: 5.0b5 +#test that import: + +#dataclasses #this breaks some platforms +#Description: Provides decorators for auto adding special methods to user classes +#Pinned versions: +#test that import: + +expecttest==0.1.3 +#Description: method for writing tests where test framework auto populates +# the expected output based on previous runs +#Pinned versions: 0.1.3 +#test that import: + +flatbuffers==2.0 +#Description: cross platform serialization library +#Pinned versions: 2.0 +#test that import: + +#future #this breaks linux-bionic-rocm4.5-py3.7 +#Description: compatibility layer between python 2 and python 3 +#Pinned versions: +#test that import: + +hypothesis==4.53.2 +# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 +#Description: advanced library for generating parametrized tests +#Pinned versions: 3.44.6, 4.53.2 +#test that import: 
test_xnnpack_integration.py, test_pruning_op.py, test_nn.py + +junitparser==2.1.1 +#Description: unitparser handles JUnit/xUnit Result XML files +#Pinned versions: 2.1.1 +#test that import: + +librosa>=0.6.2 +#Description: A python package for music and audio analysis +#Pinned versions: >=0.6.2 +#test that import: test_spectral_ops.py + +#mkl #this breaks linux-bionic-rocm4.5-py3.7 +#Description: Intel oneAPI Math Kernel Library +#Pinned versions: +#test that import: test_profiler.py, test_public_bindings.py, test_testing.py, +#test_nn.py, test_mkldnn.py, test_jit.py, test_fx_experimental.py, +#test_autograd.py + +#mkl-devel +# see mkl + +#mock # breaks ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c +#Description: A testing library that allows you to replace parts of your +#system under test with mock objects +#Pinned versions: +#test that import: test_module_init.py, test_modules.py, test_nn.py, +#test_testing.py + +#MonkeyType # breaks pytorch-xla-linux-bionic-py3.7-clang8 +#Description: collects runtime types of function arguments and return +#values, and can automatically generate stub files +#Pinned versions: +#test that import: + +mypy==0.812 +# Pin MyPy version because new errors are likely to appear with each release +#Description: linter +#Pinned versions: 0.812 +#test that import: test_typing.py, test_type_hints.py + +#networkx +#Description: creation, manipulation, and study of +#the structure, dynamics, and functions of complex networks +#Pinned versions: 2.0 +#test that import: + +#ninja +#Description: build system. Note that it install from +#here breaks things so it is commented out +#Pinned versions: 1.10.0.post1 +#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py + +numba==0.49.0 ; python_version < "3.9" +numba==0.54.1 ; python_version == "3.9" +#Description: Just-In-Time Compiler for Numerical Functions +#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 +#test that import: test_numba_integration.py +#For numba issue see https://github.com/pytorch/pytorch/issues/51511 + +#numpy +#Description: Provides N-dimensional arrays and linear algebra +#Pinned versions: 1.20 +#test that import: test_view_ops.py, test_unary_ufuncs.py, test_type_promotion.py, +#test_type_info.py, test_torch.py, test_tensorexpr_pybind.py, test_tensorexpr.py, +#test_tensorboard.py, test_tensor_creation_ops.py, test_static_runtime.py, +#test_spectral_ops.py, test_sort_and_select.py, test_shape_ops.py, +#test_segment_reductions.py, test_reductions.py, test_pruning_op.py, +#test_overrides.py, test_numpy_interop.py, test_numba_integration.py +#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, +#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, +#test_binary_ufuncs.py + +#onnxruntime +#Description: scoring engine for Open Neural Network Exchange (ONNX) models +#Pinned versions: 1.9.0 +#test that import: + +#pillow +#Description: Python Imaging Library fork +#Pinned versions: +#test that import: + +#protobuf +#Description: Google’s data interchange format +#Pinned versions: +#test that import: test_tensorboard.py + +psutil +#Description: information on running processes and system utilization +#Pinned versions: +#test that import: test_profiler.py, test_openmp.py, test_dataloader.py + +pytest +#Description: testing framework +#Pinned versions: +#test that import: test_typing.py, test_cpp_extensions_aot.py, run_test.py + +#pytest-benchmark +#Description: fixture for benchmarking code +#Pinned versions: 3.2.3 +#test that 
import: + +#pytest-sugar +#Description: shows failures and errors instantly +#Pinned versions: +#test that import: + +#PyYAML +#Description: data serialization format +#Pinned versions: +#test that import: + +#requests +#Description: HTTP library +#Pinned versions: +#test that import: test_type_promotion.py + +#rich +#Description: rich text and beautiful formatting in the terminal +#Pinned versions: 10.9.0 +#test that import: + +scikit-image +#Description: image processing routines +#Pinned versions: +#test that import: test_nn.py + +#scikit-learn +#Description: machine learning package +#Pinned versions: 0.20.3 +#test that import: + +scipy==1.6.3 +# Pin SciPy because of failing distribution tests (see #60347) +#Description: scientific python +#Pinned versions: 1.6.3 +#test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py +#test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py +#test_linalg.py, test_binary_ufuncs.py + +#tabulate +#Description: Pretty-print tabular data +#Pinned versions: +#test that import: + +tb-nightly +#Description: TensorBoard +#Pinned versions: +#test that import: + +#typing-extensions +#Description: type hints for python +#Pinned versions: +#test that import: + +#virtualenv +#Description: virtual environment for python +#Pinned versions: +#test that import: + +unittest-xml-reporting<=3.2.0,>=2.0.0 +#Description: saves unit test results to xml +#Pinned versions: +#test that import: diff --git a/.circleci/docker/ubuntu-cuda/Dockerfile b/.circleci/docker/ubuntu-cuda/Dockerfile index 9c9e40387066..241b91cff394 100644 --- a/.circleci/docker/ubuntu-cuda/Dockerfile +++ b/.circleci/docker/ubuntu-cuda/Dockerfile @@ -1,12 +1,11 @@ ARG UBUNTU_VERSION ARG CUDA_VERSION -ARG CUDNN_VERSION +ARG IMAGE_NAME -FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION} +FROM ${IMAGE_NAME} ARG UBUNTU_VERSION ARG CUDA_VERSION -ARG CUDNN_VERSION ENV DEBIAN_FRONTEND noninteractive @@ -27,8 +26,10 @@ RUN bash ./install_katex.sh && rm install_katex.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # Install gcc ARG GCC_VERSION @@ -99,5 +100,11 @@ ENV CUDA_PATH /usr/local/cuda # Install LLVM dev version (Defined in the pytorch/builder github repository) COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm +# Install CUDNN +ARG CUDNN_VERSION +ADD ./common/install_cudnn.sh install_cudnn.sh +RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi +RUN rm install_cudnn.sh + USER jenkins CMD ["bash"] diff --git a/.circleci/docker/ubuntu-rocm/Dockerfile b/.circleci/docker/ubuntu-rocm/Dockerfile index 73f0e1822e89..260592876363 100644 --- a/.circleci/docker/ubuntu-rocm/Dockerfile +++ b/.circleci/docker/ubuntu-rocm/Dockerfile @@ -28,8 +28,10 @@ RUN bash ./install_user.sh && rm install_user.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # Install gcc ARG GCC_VERSION diff --git a/.circleci/docker/ubuntu/Dockerfile b/.circleci/docker/ubuntu/Dockerfile index e0ae5c096ec9..d5940c7a1d55 100644 --- 
a/.circleci/docker/ubuntu/Dockerfile +++ b/.circleci/docker/ubuntu/Dockerfile @@ -36,8 +36,10 @@ RUN bash ./install_katex.sh && rm install_katex.sh # Install conda and other packages (e.g., numpy, pytest) ENV PATH /opt/conda/bin:$PATH ARG ANACONDA_PYTHON_VERSION +ADD requirements-ci.txt /opt/conda/requirements-ci.txt ADD ./common/install_conda.sh install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh +RUN rm /opt/conda/requirements-ci.txt # Install gcc ARG GCC_VERSION diff --git a/.circleci/generate_config_yml.py b/.circleci/generate_config_yml.py index a801aa978482..e068dd98fd8e 100755 --- a/.circleci/generate_config_yml.py +++ b/.circleci/generate_config_yml.py @@ -10,8 +10,6 @@ import sys from collections import namedtuple -import cimodel.data.binary_build_definitions as binary_build_definitions -import cimodel.data.simple.binary_smoketest import cimodel.data.simple.docker_definitions import cimodel.data.simple.mobile_definitions import cimodel.data.simple.nightly_ios @@ -81,11 +79,11 @@ def _for_all_items(items, functor) -> None: functor(item_type, item) def filter_master_only_jobs(items): - def _is_master_item(item): + def _is_main_or_master_item(item): filters = item.get('filters', None) branches = filters.get('branches', None) if filters is not None else None branches_only = branches.get('only', None) if branches is not None else None - return 'master' in branches_only if branches_only is not None else False + return ('main' in branches_only or 'master' in branches_only) if branches_only is not None else False master_deps = set() @@ -94,7 +92,7 @@ def _save_requires_if_master(item_type, item): item_name = item.get("name", None) if not isinstance(requires, list): return - if _is_master_item(item) or item_name in master_deps: + if _is_main_or_master_item(item) or item_name in master_deps: master_deps.update([n.strip('"') for n in requires]) def _do_filtering(items): @@ -105,7 +103,7 @@ def _do_filtering(items): item_type, item = next(iter(items.items())) item_name = item.get("name", None) item_name = item_name.strip('"') if item_name is not None else None - if not _is_master_item(item) and item_name not in master_deps: + if not _is_main_or_master_item(item) and item_name not in master_deps: return None if 'filters' in item: item = item.copy() @@ -113,7 +111,7 @@ def _do_filtering(items): return {item_type: item} # Scan of dependencies twice to pick up nested required jobs - # I.e. jobs depending on jobs that master-only job depend on + # I.e. 
jobs depending on jobs that main-only job depend on _for_all_items(items, _save_requires_if_master) _for_all_items(items, _save_requires_if_master) return _do_filtering(items) @@ -136,11 +134,8 @@ def _requires_docker_image(item_type, item): def gen_build_workflows_tree(): build_workflows_functions = [ cimodel.data.simple.mobile_definitions.get_workflow_jobs, - cimodel.data.simple.binary_smoketest.get_workflow_jobs, cimodel.data.simple.nightly_ios.get_workflow_jobs, cimodel.data.simple.anaconda_prune_defintions.get_workflow_jobs, - binary_build_definitions.get_post_upload_jobs, - binary_build_definitions.get_binary_smoke_test_jobs, ] build_jobs = [f() for f in build_workflows_functions] build_jobs.extend( @@ -151,28 +146,20 @@ def gen_build_workflows_tree(): ) master_build_jobs = filter_master_only_jobs(build_jobs) - binary_build_functions = [ - binary_build_definitions.get_binary_build_jobs, - binary_build_definitions.get_nightly_tests, - binary_build_definitions.get_nightly_uploads, - ] - - return { + rc = { "workflows": { - "binary_builds": { - "when": r"<< pipeline.parameters.run_binary_tests >>", - "jobs": [f() for f in binary_build_functions], - }, "build": { "when": r"<< pipeline.parameters.run_build >>", "jobs": build_jobs, }, - "master_build": { - "when": r"<< pipeline.parameters.run_master_build >>", - "jobs": master_build_jobs, - }, } } + if len(master_build_jobs) > 0: + rc["workflows"]["master_build"] = { + "when": r"<< pipeline.parameters.run_master_build >>", + "jobs": master_build_jobs, + } + return rc # Order of this list matters to the generated config.yml. @@ -183,17 +170,14 @@ def gen_build_workflows_tree(): Header("Build parameters"), File("build-parameters/pytorch-build-params.yml"), File("build-parameters/binary-build-params.yml"), - File("build-parameters/promote-build-params.yml"), Header("Job specs"), File("job-specs/binary-job-specs.yml"), File("job-specs/job-specs-custom.yml"), - File("job-specs/job-specs-promote.yml"), File("job-specs/binary_update_htmls.yml"), File("job-specs/binary-build-tests.yml"), File("job-specs/docker_jobs.yml"), Header("Workflows"), Treegen(gen_build_workflows_tree, 0), - File("workflows/workflows-promote.yml"), ] diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh index db2b0660d9f5..86bfeb77e6ac 100755 --- a/.circleci/scripts/binary_checkout.sh +++ b/.circleci/scripts/binary_checkout.sh @@ -49,8 +49,9 @@ if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then git reset --hard "$CIRCLE_SHA1" elif [[ -n "${CIRCLE_SHA1:-}" ]]; then # Scheduled workflows & "smoke" binary build on master on PR merges + DEFAULT_BRANCH="$(git remote show $CIRCLE_REPOSITORY_URL | awk '/HEAD branch/ {print $NF}')" git reset --hard "$CIRCLE_SHA1" - git checkout -q -B master + git checkout -q -B $DEFAULT_BRANCH else echo "Can't tell what to checkout" exit 1 diff --git a/.circleci/scripts/binary_linux_build.sh b/.circleci/scripts/binary_linux_build.sh index 42aa728d55a6..88561fcd80ec 100755 --- a/.circleci/scripts/binary_linux_build.sh +++ b/.circleci/scripts/binary_linux_build.sh @@ -26,7 +26,7 @@ else build_script='manywheel/build.sh' fi -if [[ "$CIRCLE_BRANCH" == "master" ]] || [[ "$CIRCLE_BRANCH" == release/* ]]; then +if [[ "$CIRCLE_BRANCH" == "main" ]] || [[ "$CIRCLE_BRANCH" == "master" ]] || [[ "$CIRCLE_BRANCH" == release/* ]]; then export BUILD_DEBUG_INFO=1 fi diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index 5be7f7cae213..bdec35d6d5d9 100755 --- 
a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -53,7 +53,7 @@ if [[ "\$python_nodot" = *39* ]]; then NUMPY_PIN=">=1.20" fi -if [[ "$DESIRED_CUDA" == "cu112" || "$DESIRED_CUDA" == "cu115" ]]; then +if [[ "$DESIRED_CUDA" == "cu116" ]]; then EXTRA_CONDA_FLAGS="-c=conda-forge" fi @@ -67,7 +67,8 @@ mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to m # TODO there is duplicated and inconsistent test-python-env setup across this # file, builder/smoke_test.sh, and builder/run_tests.sh, and also in the # conda build scripts themselves. These should really be consolidated -pkg="/final_pkgs/\$(ls /final_pkgs)" +# Pick only one package of multiple available (which happens as result of workflow re-runs) +pkg="/final_pkgs/\$(ls -1 /final_pkgs|sort|tail -1)" if [[ "$PACKAGE_TYPE" == conda ]]; then ( # For some reason conda likes to re-activate the conda environment when attempting this install diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index eab7c2b727fe..b42d58549d68 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -50,7 +50,7 @@ if [[ -z ${IS_GHA:-} ]]; then export PACKAGE_TYPE="${configs[0]}" export DESIRED_PYTHON="${configs[1]}" export DESIRED_CUDA="${configs[2]}" - if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then + if [[ "${OSTYPE}" == "msys" ]]; then export DESIRED_DEVTOOLSET="" export LIBTORCH_CONFIG="${configs[3]:-}" if [[ "$LIBTORCH_CONFIG" == 'debug' ]]; then @@ -91,11 +91,6 @@ if [[ ${DESIRED_CUDA} == "cpu" ]]; then USE_GOLD_LINKER="ON" fi -USE_WHOLE_CUDNN="OFF" -# Link whole cuDNN for CUDA-11.1 to include fp16 fast kernels -if [[ "$(uname)" == "Linux" && "${DESIRED_CUDA}" == "cu111" ]]; then - USE_WHOLE_CUDNN="ON" -fi # Default to nightly, since that's where this normally uploads to PIP_UPLOAD_FOLDER='nightly/' @@ -158,10 +153,14 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}" export DESIRED_CUDA="$DESIRED_CUDA" export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}" export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}" -export DESIRED_DEVTOOLSET="${DESIRED_DEVTOOLSET:-}" -if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then +if [[ "${OSTYPE}" == "msys" ]]; then export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}" - export DEBUG="${DEBUG:-}" + if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then + export DEBUG=1 + fi + export DESIRED_DEVTOOLSET="" +else + export DESIRED_DEVTOOLSET="${DESIRED_DEVTOOLSET:-}" fi export DATE="$DATE" @@ -184,7 +183,6 @@ export DOCKER_IMAGE="$DOCKER_IMAGE" export USE_GOLD_LINKER="${USE_GOLD_LINKER}" export USE_GLOO_WITH_OPENSSL="ON" -export USE_WHOLE_CUDNN="${USE_WHOLE_CUDNN}" # =================== The above code will be executed inside Docker container =================== EOL diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 439b2c981cfe..e6500b8d9c93 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -7,15 +7,17 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" export CUDA_VERSION="${DESIRED_CUDA/cu/}" export USE_SCCACHE=1 export SCCACHE_BUCKET=ossci-compiler-cache-windows -export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" +export SCCACHE_IGNORE_SERVER_IO_ERROR=1 export VC_YEAR=2019 if [[ "${DESIRED_CUDA}" == *"cu11"* ]]; then export BUILD_SPLIT_CUDA=ON fi + echo "Free Space for CUDA DEBUG BUILD" if [[ "${CIRCLECI:-}" == 'true' ]]; then + export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" if [[ -d "C:\\Program Files 
(x86)\\Microsoft Visual Studio\\2019\\Community" ]]; then rm -rf "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community" fi @@ -70,6 +72,7 @@ pushd "$BUILDER_ROOT" if [[ "$PACKAGE_TYPE" == 'conda' ]]; then ./windows/internal/build_conda.bat elif [[ "$PACKAGE_TYPE" == 'wheel' || "$PACKAGE_TYPE" == 'libtorch' ]]; then + export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT" ./windows/internal/build_wheels.bat fi diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh index fa68d07e537e..1ade86b31264 100755 --- a/.circleci/scripts/cpp_doc_push_script.sh +++ b/.circleci/scripts/cpp_doc_push_script.sh @@ -34,9 +34,9 @@ echo "error: cpp_doc_push_script.sh: install_path (arg1) not specified" exit 1 fi -is_master_doc=false +is_main_doc=false if [ "$version" == "master" ]; then - is_master_doc=true + is_main_doc=true fi echo "install_path: $install_path version: $version" @@ -56,7 +56,7 @@ sudo apt-get -y install doxygen # Generate ATen files pushd "${pt_checkout}" pip install -r requirements.txt -time python -m tools.codegen.gen \ +time python -m torchgen.gen \ -s aten/src/ATen \ -d build/aten/src/ATen @@ -66,7 +66,7 @@ cp torch/_utils_internal.py tools/shared # Generate PyTorch files time python tools/setup_helpers/generate_code.py \ --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --nn-path aten/src/ + --tags-path aten/src/ATen/native/tags.yaml # Build the docs pushd docs/cpp diff --git a/.circleci/scripts/python_doc_push_script.sh b/.circleci/scripts/python_doc_push_script.sh index ccfc44917400..f9b019ec069b 100755 --- a/.circleci/scripts/python_doc_push_script.sh +++ b/.circleci/scripts/python_doc_push_script.sh @@ -37,9 +37,9 @@ echo "error: python_doc_push_script.sh: install_path (arg1) not specified" exit 1 fi -is_master_doc=false +is_main_doc=false if [ "$version" == "master" ]; then - is_master_doc=true + is_main_doc=true fi # Argument 3: The branch to push to. Usually is "site" @@ -86,7 +86,7 @@ pushd docs # Build the docs pip -q install -r requirements.txt -if [ "$is_master_doc" = true ]; then +if [ "$is_main_doc" = true ]; then build_docs html [ $? -eq 0 ] || exit $? make coverage diff --git a/.circleci/scripts/setup_ci_environment.sh b/.circleci/scripts/setup_ci_environment.sh index 1f2e6bfaef61..dab183d907a6 100755 --- a/.circleci/scripts/setup_ci_environment.sh +++ b/.circleci/scripts/setup_ci_environment.sh @@ -32,7 +32,7 @@ if ! 
command -v aws >/dev/null; then fi if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - DRIVER_FN="NVIDIA-Linux-x86_64-495.44.run" + DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run" wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) nvidia-smi diff --git a/.circleci/scripts/trigger_azure_pipeline.py b/.circleci/scripts/trigger_azure_pipeline.py index b35ee5ce9def..9dc9dff2d54d 100644 --- a/.circleci/scripts/trigger_azure_pipeline.py +++ b/.circleci/scripts/trigger_azure_pipeline.py @@ -11,7 +11,7 @@ AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "") PIPELINE_ID = "911" PROJECT_ID = "0628bce4-2d33-499e-bac5-530e12db160f" -TARGET_BRANCH = os.environ.get("CIRCLE_BRANCH", "master") +TARGET_BRANCH = os.environ.get("CIRCLE_BRANCH", "main") TARGET_COMMIT = os.environ.get("CIRCLE_SHA1", "") build_base_url = AZURE_PIPELINE_BASE_URL + "_apis/build/builds?api-version=6.0" diff --git a/.circleci/scripts/windows_cuda_install.sh b/.circleci/scripts/windows_cuda_install.sh index abcdcf134b37..f06a2b0ab096 100644 --- a/.circleci/scripts/windows_cuda_install.sh +++ b/.circleci/scripts/windows_cuda_install.sh @@ -2,25 +2,17 @@ set -eux -o pipefail case ${CUDA_VERSION} in - 10.1) - cuda_installer_name="cuda_10.1.243_426.00_win10" - cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1" - ;; 10.2) cuda_installer_name="cuda_10.2.89_441.22_win10" cuda_install_packages="nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2" ;; - 11.1) - cuda_installer_name="cuda_11.1.1_456.81_win10" - cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1" - ;; 11.3) cuda_installer_name="cuda_11.3.0_465.89_win10" cuda_install_packages="thrust_11.3 nvcc_11.3 cuobjdump_11.3 nvprune_11.3 nvprof_11.3 cupti_11.3 cublas_11.3 cublas_dev_11.3 cudart_11.3 cufft_11.3 cufft_dev_11.3 curand_11.3 curand_dev_11.3 cusolver_11.3 cusolver_dev_11.3 cusparse_11.3 cusparse_dev_11.3 npp_11.3 npp_dev_11.3 nvrtc_11.3 nvrtc_dev_11.3 nvml_dev_11.3" ;; - 11.5) - cuda_installer_name="cuda_11.5.0_496.13_win10" - cuda_install_packages="thrust_11.5 nvcc_11.5 cuobjdump_11.5 nvprune_11.5 nvprof_11.5 cupti_11.5 cublas_11.5 cublas_dev_11.5 cudart_11.5 cufft_11.5 cufft_dev_11.5 curand_11.5 curand_dev_11.5 cusolver_11.5 cusolver_dev_11.5 cusparse_11.5 cusparse_dev_11.5 npp_11.5 npp_dev_11.5 nvrtc_11.5 nvrtc_dev_11.5 nvml_dev_11.5" + 11.6) + cuda_installer_name="cuda_11.6.0_511.23_windows" + cuda_install_packages="thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 
nvml_dev_11.6" ;; *) echo "CUDA_VERSION $CUDA_VERSION is not supported yet" diff --git a/.circleci/scripts/windows_cudnn_install.sh b/.circleci/scripts/windows_cudnn_install.sh index 87e8a8dd09bf..a815008ee1e0 100644 --- a/.circleci/scripts/windows_cudnn_install.sh +++ b/.circleci/scripts/windows_cudnn_install.sh @@ -5,22 +5,16 @@ set -eux -o pipefail windows_s3_link="https://ossci-windows.s3.amazonaws.com" case ${CUDA_VERSION} in - 10.1) - # This is typically blank but for CUDA 10* it'll be set to 10 - cudnn_file_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.4.38" - ;; 10.2) cudnn_file_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.5.32" ;; - 11.1) - cudnn_file_name="cudnn-${CUDA_VERSION}-windows-x64-v8.0.5.39" - ;; 11.3) - cudnn_file_name="cudnn-${CUDA_VERSION}-windows-x64-v8.2.0.53" + # Use cudnn8.3 with hard-coded cuda11.3 version + cudnn_file_name="cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive" ;; - 11.5) - # Since cudnn 8.3 the filename have changed - cudnn_file_name="cudnn-windows-x86_64-8.3.2.44_cuda${CUDA_VERSION}-archive" + 11.6) + # Use cudnn8.3 with hard-coded cuda11.5 version + cudnn_file_name="cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive" ;; *) echo "CUDA_VERSION: ${CUDA_VERSION} not supported yet" diff --git a/.circleci/verbatim-sources/build-parameters/binary-build-params.yml b/.circleci/verbatim-sources/build-parameters/binary-build-params.yml index ca1d1486fef8..6f34c30d5248 100644 --- a/.circleci/verbatim-sources/build-parameters/binary-build-params.yml +++ b/.circleci/verbatim-sources/build-parameters/binary-build-params.yml @@ -62,5 +62,4 @@ binary_windows_params: &binary_windows_params default: "windows-xlarge-cpu-with-nvidia-cuda" environment: BUILD_ENVIRONMENT: << parameters.build_environment >> - BUILD_FOR_SYSTEM: windows JOB_EXECUTOR: <> diff --git a/.circleci/verbatim-sources/build-parameters/promote-build-params.yml b/.circleci/verbatim-sources/build-parameters/promote-build-params.yml deleted file mode 100644 index 2827c805f10a..000000000000 --- a/.circleci/verbatim-sources/build-parameters/promote-build-params.yml +++ /dev/null @@ -1,14 +0,0 @@ - -promote_common: &promote_common - docker: - - image: pytorch/release - parameters: - package_name: - description: "package name to promote" - type: string - default: "" - environment: - PACKAGE_NAME: << parameters.package_name >> - ANACONDA_API_TOKEN: ${CONDA_PYTORCHBOT_TOKEN} - AWS_ACCESS_KEY_ID: ${PYTORCH_BINARY_AWS_ACCESS_KEY_ID} - AWS_SECRET_ACCESS_KEY: ${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY} diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index 581b76c8f942..f6f16ef7dd65 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -162,6 +162,7 @@ jobs: <<: *binary_mac_params macos: xcode: "12.0" + resource_class: "large" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index a3c1d932d93e..f0f12e09b2d9 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -5,7 +5,7 @@ parameters: branch: type: string - default: "master" + default: "main" steps: - attach_workspace: at: /tmp/workspace @@ -45,7 +45,7 @@ echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 
's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) @@ -55,7 +55,7 @@ echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts mkdir -p ~/workspace/build_artifacts - docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/master ~/workspace/build_artifacts + docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io/docs/main ~/workspace/build_artifacts docker cp $id:/var/lib/jenkins/workspace/pytorch.github.io /tmp/workspace # Save the docs build so we can debug any problems @@ -67,7 +67,7 @@ paths: - . - store_artifacts: - path: ~/workspace/build_artifacts/master + path: ~/workspace/build_artifacts/main destination: docs pytorch_cpp_doc_build: @@ -91,12 +91,12 @@ echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} # turn v1.12.0rc3 into 1.12 tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9]*\.[0-9]*\).*/\1/') - target=${tag:-master} + target=${tag:-main} echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"export CIRCLE_SHA1='$CIRCLE_SHA1'"' && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" main") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -580,7 +580,7 @@ time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG}) - echo "Do NOT merge master branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" + echo "Do NOT merge main branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT" git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 diff --git a/.circleci/verbatim-sources/workflows/workflows-promote.yml b/.circleci/verbatim-sources/workflows/workflows-promote.yml deleted file mode 100644 index d3afc0862d63..000000000000 --- a/.circleci/verbatim-sources/workflows/workflows-promote.yml +++ /dev/null @@ -1,46 +0,0 @@ - # Promotion workflow - promote: - jobs: - # Requires manual approval by someone in org-member - # CircleCI security context - - promote_approval: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - type: approval - - promote_s3: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_libtorch - package_name: libtorch - requires: - - promote_approval - - promote_s3: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_s3_torch - package_name: torch - requires: - - promote_approval - - promote_conda: - context: org-member - filters: - branches: - ignore: /.*/ - tags: - only: /v[0-9]+(\.[0-9]+)*/ - name: promote_conda_pytorch - package_name: 
pytorch - requires: - - promote_approval diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000000..51ae28c6e058 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,24 @@ +# 2020-11-12 Enabled ShellCheck on `.jenkins/pytorch` +65d5004b09fd8d5deac173a3aaa259f46eaa0d67 +# 2021-01-20 Replaced ` ` with `...` in many doctests +c147aa306c6386a753fdff24b48d04e803070a63 +# 2021-03-05 Removed all trailing whitespace +8c798e062216278673a75bac0848ea69a8bd3f03 +# 2021-03-30 Normalized trailing newlines +5bcbbf537327f6e8328289c25a3a453a2444d984 +# 2021-03-31 Autogenerated Markdown ToCs +a74b10def961ab090385f291ee06e66db99c1a2f +# 2021-04-02 Enabled more ShellCheck warnings +09670c7d43b9abce862a6bf71d8cc89e64764bdb +# 2021-04-08 Removed all non-breaking spaces +cc11aaaa60aadf28e3ec278bce26a42c1cd68a4f +# 2021-04-13 Expanded many wildcard imports +4753100a3baa96273204c361c8452afb7b59836f +# 2021-04-19 Removed all unqualified `noqa` +e3900d2ba5c9f91a24a9ce34520794c8366d5c54 +# 2021-04-21 Removed all unqualified `type: ignore` +75024e228ca441290b6a1c2e564300ad507d7af6 +# 2021-05-14 Removed all versionless Python shebangs +2e26976ad3b06ce95dd6afccfdbe124802edf28f +# 2021-06-07 Strictly typed everything in `.github` and `tools` +737d920b21db9b4292d056ee1329945990656304 diff --git a/.gitattributes b/.gitattributes index 70246abe9bbb..8bccf04bbb7d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,5 @@ .circleci/config.yml linguist-generated=true .github/workflows/generated-*.yml linguist-generated=true .github/generated-* linguist-generated=true +.github/scripts/gql_mocks.json linguist-generated=true +third_party/LICENSES_BUNDLED.txt linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/ci-sev.md b/.github/ISSUE_TEMPLATE/ci-sev.md index b248963cfd4d..8178c68d978b 100644 --- a/.github/ISSUE_TEMPLATE/ci-sev.md +++ b/.github/ISSUE_TEMPLATE/ci-sev.md @@ -1,5 +1,5 @@ --- -name: "⚠️CI SEV" +name: "⚠️ CI SEV" about: Tracking incidents for PyTorch's CI infra. 
--- diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 7ffccd6e58e7..cd98b00b0646 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -2,4 +2,4 @@ blank_issues_enabled: true contact_links: - name: Questions url: https://discuss.pytorch.org/ - about: Ask questions and discuss with other pytorch community members + about: Ask questions and discuss with other PyTorch community members diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 42c2317b5cfc..e18d5412dced 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -1,5 +1,5 @@ name: 🚀 Feature request -description: Submit a proposal/request for a new pytorch feature +description: Submit a proposal/request for a new PyTorch feature body: - type: textarea diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 18329c526258..91b6d5af421b 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -1,5 +1,7 @@ self-hosted-runner: labels: + - linux.20_04.4x + - linux.20_04.16x - linux.large - linux.2xlarge - linux.4xlarge @@ -9,3 +11,5 @@ - windows.4xlarge - windows.8xlarge.nvidia.gpu - bm-runner + - linux.rocm.gpu + - macos-12 diff --git a/.github/actions/build-android/action.yml b/.github/actions/build-android/action.yml new file mode 100644 index 000000000000..2493bb3a7606 --- /dev/null +++ b/.github/actions/build-android/action.yml @@ -0,0 +1,82 @@ +name: build android + +description: build android for a specific arch + +inputs: + arch: + description: arch to build + required: true + arch-for-build-env: + description: | + arch to pass to build environment. + This is currently different from the arch name we use elsewhere, which + should be fixed. + required: true + github-secret: + description: github token + required: true + build-environment: + required: true + description: Top-level label for what's being built/tested. + docker-image: + required: true + description: Name of the base docker image to build with. + branch: + required: true + description: What branch we are building on.
+outputs: + container_id: + description: Docker container identifier used to build the artifacts + value: ${{ steps.build.outputs.container_id }} + +runs: + using: composite + steps: + - name: Build-${{ inputs.arch }} + id: build + shell: bash + env: + BRANCH: ${{ inputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-${{ inputs.arch-for-build-env }}-build + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + DOCKER_IMAGE: ${{ inputs.docker-image }} + MATRIX_ARCH: ${{ inputs.arch }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + set -exo pipefail + export container_name + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 + docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" + (echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete" | docker exec -u jenkins -i "${container_name}" bash) 2>&1 + + # Copy install binaries back + mkdir -p "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + docker cp "${container_name}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_${MATRIX_ARCH}" + echo "::set-output name=container_id::${container_name}" diff --git a/.github/actions/calculate-docker-image/action.yml b/.github/actions/calculate-docker-image/action.yml new file mode 100644 index 000000000000..d32179ac78a7 --- /dev/null +++ b/.github/actions/calculate-docker-image/action.yml @@ -0,0 +1,93 @@ +name: Calculate docker image + +description: Determine docker image to pull, building a new one if necessary. + +inputs: + docker-image-name: + description: The name of a docker image, like `pytorch-linux-xenial-py3.7-gcc7` + required: true + xla: + description: | + Whether or not to use a pre-built XLA docker image. + Note that this is a string, either "true" or "false" due to GHA limitations. + required: false + always-rebuild: + description: If set to any value, always build a fresh docker image. + required: false + pull: + description: If set to any value, run `docker pull` on the calculated image.
+ required: false + +outputs: + docker-image: + description: The docker image to use for the rest of the workflow + value: ${{ steps.calculate-tag.outputs.docker-image }} + +runs: + using: composite + steps: + - name: Calculate docker image tag + shell: bash + id: calculate-tag + env: + IS_XLA: ${{ inputs.xla == 'true' && 'true' || '' }} + XLA_IMAGE_TAG: v0.2 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ inputs.docker-image-name }} + run: | + if [ -n "${IS_XLA}" ]; then + echo "XLA workflow uses pre-built test image at ${XLA_IMAGE_TAG}" + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker-tag::${DOCKER_TAG}" + echo "::set-output name=docker-image::${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" + else + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "::set-output name=docker-tag::${DOCKER_TAG}" + echo "::set-output name=docker-image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" + fi + + - name: Check if image should be built + shell: bash + id: check + if: ${{ !inputs.always-rebuild }} + env: + BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} + DOCKER_IMAGE: ${{ steps.calculate-tag.outputs.docker-image }} + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker-tag }} + run: | + set -x + # Check if image already exists, if it does then skip building it + if docker manifest inspect "${DOCKER_IMAGE}"; then + exit 0 + fi + if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then + # if we're on the base branch then use the parent commit + MERGE_BASE=$(git rev-parse HEAD~) + else + # otherwise we're on a PR, so use the most recent base commit + MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") + fi + # Covers the case where a previous tag doesn't exist for the tree + # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly + if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then + echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" + exit 1 + fi + PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") + # If no image exists but the hash is the same as the previous hash then we should error out here + if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then + echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" + echo " contact the PyTorch team to restore the original images" + exit 1 + fi + echo ::set-output name=rebuild::yes + + - name: Build and push docker image + if: inputs.always-rebuild || steps.check.outputs.rebuild + env: + IMAGE_NAME: ${{inputs.docker-image-name}} + DOCKER_SKIP_S3_UPLOAD: "1" + DOCKER_TAG: ${{ steps.calculate-tag.outputs.docker-tag }} + working-directory: .circleci/docker + shell: bash + run: | + ./build_docker.sh diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml new file mode 100644 index 000000000000..6523dab0c64f --- /dev/null +++ b/.github/actions/checkout-pytorch/action.yml @@ -0,0 +1,41 @@ +name: Checkout PyTorch + +description: Clean workspace and check out PyTorch + +inputs: + no-sudo: + description: If set to any value, don't use sudo to clean the workspace + required: false + submodules: + description: Works as stated in actions/checkout, but the default value is recursive + required: false + default: recursive + fetch-depth: + description: Works as stated in actions/checkout, but the default value is 0 + required: false + default: "0" + +runs: + using: composite + steps: + - name: Clean workspace + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | + echo "${GITHUB_WORKSPACE}" + if [ -z "${NO_SUDO}" ]; then + sudo rm -rf "${GITHUB_WORKSPACE}" + else + rm -rf "${GITHUB_WORKSPACE}" + fi + mkdir "${GITHUB_WORKSPACE}" + + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # --depth=1 for speed, manually fetch history and other refs as necessary + fetch-depth: ${{ inputs.fetch-depth }} + submodules: ${{ inputs.submodules }} + quiet-checkout: true diff --git a/.github/actions/chown-workspace/action.yml b/.github/actions/chown-workspace/action.yml new file mode 100644 index 000000000000..6adc6cdc217d --- /dev/null +++ b/.github/actions/chown-workspace/action.yml @@ -0,0 +1,11 @@ +name: Chown workspace + +description: Ensure that the working directory gets chowned back to the current user + +runs: + using: composite + steps: + - run: docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + shell: bash + env: + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" diff --git a/.github/actions/download-build-artifacts/action.yml b/.github/actions/download-build-artifacts/action.yml new file mode 100644 index 000000000000..a3c9444c1b98 --- /dev/null +++ b/.github/actions/download-build-artifacts/action.yml @@ -0,0 +1,34 @@ +name: Download PyTorch Build Artifacts + +description: Download and unzip artifacts from a previous PyTorch build. + +inputs: + name: + description: Name of what artifact to download + required: true + use-gha: + description: If set to any value, use GHA to download the artifact. Otherwise use s3. 
+ required: false + +runs: + using: composite + steps: + - name: Download PyTorch Build Artifacts from S3 + if: ${{ !inputs.use-gha }} + uses: seemethere/download-artifact-s3@v3 + with: + name: ${{ inputs.name }} + + - name: Download PyTorch Build Artifacts from GHA + if: inputs.use-gha + uses: actions/download-artifact@v2 + with: + name: ${{ inputs.name }} + + - name: Unzip artifacts + shell: bash + run: unzip -o artifacts.zip + + - name: Output disk space left + shell: bash + run: df -H diff --git a/.github/actions/get-workflow-job-id/action.yml b/.github/actions/get-workflow-job-id/action.yml new file mode 100644 index 000000000000..c7ca1e07d6be --- /dev/null +++ b/.github/actions/get-workflow-job-id/action.yml @@ -0,0 +1,31 @@ +name: Get workflow job id + +description: Get the ID of the workflow job that is currently running. + +inputs: + github-token: + description: GITHUB_TOKEN + required: true + +outputs: + job-id: + description: The retrieved workflow job id + value: ${{ steps.get-job-id.outputs.job-id }} + +runs: + using: composite + steps: + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + id: get-job-id + env: + GITHUB_TOKEN: ${{ inputs.github-token }} + with: + shell: bash + timeout_minutes: 10 + max_attempts: 5 + retry_wait_seconds: 30 + command: | + set -x + python3 -m pip install requests==2.26.0 + GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}") + echo "::set-output name=job-id::${GHA_WORKFLOW_JOB_ID}" diff --git a/.github/actions/pull-docker-image/action.yml b/.github/actions/pull-docker-image/action.yml new file mode 100644 index 000000000000..ad1cc1baf9d3 --- /dev/null +++ b/.github/actions/pull-docker-image/action.yml @@ -0,0 +1,19 @@ +name: Pull docker image + +description: pull a specific docker image + +inputs: + docker-image: + description: the image to pull + required: true + +runs: + using: composite + steps: + - name: Pull Docker image + shell: bash + env: + DOCKER_IMAGE: ${{ inputs.docker-image }} + run: | + retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } + retry docker pull "${DOCKER_IMAGE}" diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml new file mode 100644 index 000000000000..d7500f11de7d --- /dev/null +++ b/.github/actions/setup-linux/action.yml @@ -0,0 +1,47 @@ +name: Setup Linux + +description: Set up Docker workspace on EC2 + +runs: + using: composite + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + + - name: Start docker if docker daemon is not running + shell: bash + run: | + if systemctl is-active --quiet docker; then + echo "Docker daemon is running..."; + else + echo "Starting docker daemon..."
&& sudo systemctl start docker; + fi + + - name: Log in to ECR + shell: bash + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: "5" + AWS_DEFAULT_REGION: us-east-1 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml new file mode 100644 index 000000000000..1a109830ee32 --- /dev/null +++ b/.github/actions/setup-rocm/action.yml @@ -0,0 +1,64 @@ +name: Setup ROCm host + +description: Set up ROCm host for CI + +runs: + using: composite + steps: + - name: Set DOCKER_HOST + shell: bash + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + + - name: Runner health check system info + if: always() + shell: bash + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + + - name: Runner health check rocm-smi + if: always() + shell: bash + run: | + rocm-smi + + - name: Runner health check rocminfo + if: always() + shell: bash + run: | + rocminfo + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + + - name: Runner health check disconnect on failure + if: ${{ failure() }} + shell: bash + run: | + killall runsvc.sh + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: ROCm set GPU_FLAG + shell: bash + run: | + # Examine the runner name. If it ends with "-2", this is the second runner on the host. 
+ if [[ ${{ runner.name }} == *-2 ]]; then + # select the last two GPUs on the host + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri/renderD130 --device=/dev/dri/renderD131 --group-add video --group-add daemon" >> "${GITHUB_ENV}" + else + # select the first two GPUs on the host + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri/renderD128 --device=/dev/dri/renderD129 --group-add video --group-add daemon" >> "${GITHUB_ENV}" + fi diff --git a/.github/actions/setup-ssh/action.yml b/.github/actions/setup-ssh/action.yml new file mode 100644 index 000000000000..c2be35a805c4 --- /dev/null +++ b/.github/actions/setup-ssh/action.yml @@ -0,0 +1,17 @@ +name: Setup SSH + +description: Adds ssh keys for current user to machine + +inputs: + github-secret: + description: GitHub token + required: true + +runs: + using: composite + steps: + - name: "Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ inputs.github-secret }} + activate-with-label: false diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml new file mode 100644 index 000000000000..12f287b23089 --- /dev/null +++ b/.github/actions/setup-win/action.yml @@ -0,0 +1,60 @@ +name: Setup Windows + +description: Set up for windows jobs + +inputs: + cuda-version: + description: which cuda version to install, 'cpu' for none + required: true + +runs: + using: composite + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + + - name: Install Visual Studio 2019 toolchain + shell: powershell + env: + VS_VERSION: "16.8.6" + INSTALL_WINDOWS_SDK: "1" + run: | + .\.circleci\scripts\vs_install.ps1 + + - name: Install CUDA and CUDNN + shell: bash + if: inputs.cuda-version != 'cpu' + env: + CUDA_VERSION: ${{ inputs.cuda-version }} + run: | + .circleci/scripts/windows_cuda_install.sh + .circleci/scripts/windows_cudnn_install.sh + + - name: Setup Python3 + uses: actions/setup-python@v2 + with: + python-version: "3.x" diff --git a/.github/actions/teardown-linux/action.yml b/.github/actions/teardown-linux/action.yml new file mode 100644 index 000000000000..9238a073a6b6 --- /dev/null +++ b/.github/actions/teardown-linux/action.yml @@ -0,0 +1,28 @@ +name: Teardown Linux + +description: Stuff that should always run at the end of a linux job + +inputs: + skip-wait-ssh: + description: If set, don't wait for ssh to drain before tearing down + required: false + default: "" + +runs: + using: composite + steps: + - name: Hold runner for 2 hours or until ssh sessions have drained + # TODO working-directory: !{{ pytorch_directory }} + # Always hold for active ssh sessions + shell: bash + if: inputs.skip-wait-ssh == '' + run: .github/scripts/wait_for_ssh_to_drain.sh + + - name: Kill containers, clean up images + shell: bash + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/actions/teardown-rocm/action.yml b/.github/actions/teardown-rocm/action.yml new file mode 100644 index 000000000000..f23d8e1e2422 --- /dev/null +++ b/.github/actions/teardown-rocm/action.yml @@ -0,0 +1,25 @@ +name: Teardown ROCm host + +description: Teardown ROCm host for CI + +runs: + using: composite + steps: + - name: Kill containers, clean up images + if: always() + shell: bash + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker containers + docker container prune -f + # Prune everything docker if there are more than 10 images (~200GB). + # This is easier than using a time filter, e.g., "until=24h". + image_count=$(docker images | wc -l) + if [[ ${image_count} -gt 10 ]]; then + echo "Purging all docker caches" + docker system prune -af + else + echo "Will not purge docker, only ${image_count} images found" + fi diff --git a/.github/actions/teardown-win/action.yml b/.github/actions/teardown-win/action.yml new file mode 100644 index 000000000000..49c509444e09 --- /dev/null +++ b/.github/actions/teardown-win/action.yml @@ -0,0 +1,33 @@ +name: Teardown Windows + +description: Set up Docker workspace on linux + +inputs: + extra-delete-dir: + description: If set, cleaning up the workspace will delete this too + required: false + default: "" + +runs: + using: composite + steps: + - name: Wait until all sessions have drained + shell: powershell + if: always() + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + + - name: Cleanup workspace + if: always() + shell: bash + env: + EXTRA_DELETE_DIR: ${{ inputs.extra-delete-dir }} + run: | + [ ! 
-z "${EXTRA_DELETE_DIR}" ] || rm -rf "${EXTRA_DELETE_DIR}" + rm -rf ./* diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml new file mode 100644 index 000000000000..7a00a377fca4 --- /dev/null +++ b/.github/actions/upload-test-artifacts/action.yml @@ -0,0 +1,94 @@ +name: Upload test artifacts + +description: Upload various artifacts produced by our testing process + +inputs: + use-gha: + description: If set to any value, upload GHA. Otherwise upload to S3. + required: false + file-suffix: + description: | + Suffix to add to the filename of the artifacts. This should include the + workflow job id, see [Job id in artifacts]. + required: true + +runs: + using: composite + steps: + # Mac/Linux zip + - name: Zip JSONs for upload + if: runner.os != 'Windows' && !inputs.use-gha + shell: bash + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + + - name: Zip test reports for upload + if: runner.os != 'Windows' && !inputs.use-gha + shell: bash + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + + # Windows zip + - name: Zip JSONs for upload + if: runner.os == 'Windows' && !inputs.use-gha + shell: powershell + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + + - name: Zip test reports for upload + if: runner.os == 'Windows' && !inputs.use-gha + shell: powershell + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + + # S3 upload + - name: Store Test Downloaded JSONs on S3 + uses: seemethere/upload-artifact-s3@v4 + if: ${{ !inputs.use-gha }} + with: + retention-days: 14 + if-no-files-found: warn + path: test-jsons-*.zip + + - name: Store Test Reports on S3 + uses: seemethere/upload-artifact-s3@v4 + if: ${{ !inputs.use-gha }} + with: + retention-days: 14 + if-no-files-found: error + path: test-reports-*.zip + + # GHA upload + - name: Store Test Downloaded JSONs on Github + uses: actions/upload-artifact@v2 + if: inputs.use-gha + with: + # Add the run attempt, see [Artifact run attempt] + name: test-jsons-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip + retention-days: 14 + if-no-files-found: warn + path: test/**/*.json + + - name: Store Test Reports on Github + uses: actions/upload-artifact@v2 + if: inputs.use-gha + with: + # Add the run attempt, see [Artifact run attempt] + name: test-reports-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip + retention-days: 14 + if-no-files-found: error + path: test/**/*.xml diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json deleted file mode 100644 index 186441321a70..000000000000 --- a/.github/generated-ciflow-ruleset.json +++ /dev/null @@ -1,304 +0,0 @@ -{ - "__comment": "@generated DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py", - "label_rules": { - "ciflow/all": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "docker-builds", - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - 
"ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-docs-push", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "macos-10-15-py3-arm64", - "macos-10-15-py3-lite-interpreter-x86-64", - "macos-11-py3-x86-64", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/android": [ - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" - ], - "ciflow/bazel": [ - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" - ], - "ciflow/binaries": [ - "linux-binary-conda", - "linux-binary-libtorch-cxx11-abi", - "linux-binary-libtorch-pre-cxx11", - "linux-binary-manywheel", - "macos-arm64-binary-conda", - "macos-arm64-binary-wheel", - "macos-binary-conda", - "macos-binary-libtorch-cxx11-abi", - "macos-binary-libtorch-pre-cxx11", - "macos-binary-wheel", - "windows-binary-libtorch-cxx11-abi", - "windows-binary-libtorch-pre-cxx11", - "windows-binary-wheel" - ], - "ciflow/binaries_conda": [ - "linux-binary-conda", - "macos-arm64-binary-conda", - "macos-binary-conda" - ], - "ciflow/binaries_libtorch": [ - "linux-binary-libtorch-cxx11-abi", - "linux-binary-libtorch-pre-cxx11", - "macos-binary-libtorch-cxx11-abi", - "macos-binary-libtorch-pre-cxx11", - "windows-binary-libtorch-cxx11-abi", - "windows-binary-libtorch-pre-cxx11" - ], - "ciflow/binaries_wheel": [ - "linux-binary-manywheel", - "macos-arm64-binary-wheel", - "macos-binary-wheel", - "windows-binary-wheel" - ], - "ciflow/cpu": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "linux-bionic-py3.7-clang9", - "linux-docs", - "linux-docs-push", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3" - ], - "ciflow/cuda": [ - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - 
"libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/default": [ - "linux-binary-conda", - "linux-binary-libtorch-cxx11-abi", - "linux-binary-libtorch-pre-cxx11", - "linux-binary-manywheel", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "macos-arm64-binary-conda", - "macos-arm64-binary-wheel", - "macos-binary-conda", - "macos-binary-libtorch-cxx11-abi", - "macos-binary-libtorch-pre-cxx11", - "macos-binary-wheel", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3", - "windows-binary-libtorch-cxx11-abi", - "windows-binary-libtorch-pre-cxx11", - "windows-binary-wheel" - ], - "ciflow/docs": [ - "linux-docs" - ], - "ciflow/ios": [ - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit" - ], - "ciflow/libtorch": [ - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7" - ], - "ciflow/linux": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-docs-push", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" - ], - "ciflow/macos": [ - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - 
"ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", - "macos-10-15-py3-arm64", - "macos-10-15-py3-lite-interpreter-x86-64", - "macos-11-py3-x86-64" - ], - "ciflow/mobile": [ - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static" - ], - "ciflow/noarch": [ - "linux-bionic-py3.7-clang9" - ], - "ciflow/onnx": [ - "linux-xenial-py3.7-clang7-onnx" - ], - "ciflow/rocm": [ - "linux-bionic-rocm4.5-py3.7" - ], - "ciflow/sanitizers": [ - "linux-xenial-py3.7-clang7-asan" - ], - "ciflow/scheduled": [ - "linux-docs-push", - "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - "periodic-linux-bionic-cuda11.5-py3.7-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3" - ], - "ciflow/slow": [ - "linux-bionic-cuda10.2-py3.9-gcc7", - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck" - ], - "ciflow/slow-gradcheck": [ - "periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck" - ], - "ciflow/trunk": [ - "caffe2-linux-xenial-py3.7-gcc5.4", - "docker-builds", - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", - "ios-12-5-1-x86-64", - "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", - "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - "linux-bionic-cuda10.2-py3.9-gcc7", - "linux-bionic-py3.7-clang9", - "linux-bionic-rocm4.5-py3.7", - "linux-docs", - "linux-vulkan-bionic-py3.7-clang9", - "linux-xenial-cuda11.3-py3.7-gcc7", - "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - "linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - "linux-xenial-py3-clang5-mobile-build", - "linux-xenial-py3-clang5-mobile-custom-build-static", - "linux-xenial-py3.7-clang7-asan", - "linux-xenial-py3.7-clang7-onnx", - "linux-xenial-py3.7-gcc5.4", - "linux-xenial-py3.7-gcc7", - "linux-xenial-py3.7-gcc7-no-ops", - "macos-10-15-py3-arm64", - "macos-10-15-py3-lite-interpreter-x86-64", - "macos-11-py3-x86-64", - "parallelnative-linux-xenial-py3.7-gcc5.4", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/vulkan": [ - "linux-vulkan-bionic-py3.7-clang9" - ], - "ciflow/win": [ - "periodic-win-vs2019-cuda11.1-py3", - "periodic-win-vs2019-cuda11.5-py3", - "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" - ], - "ciflow/xla": [ - "linux-bionic-py3.7-clang9" - ] - }, - "version": "v1" -} diff --git a/.github/merge_rules.json b/.github/merge_rules.json index 6b0e452683fc..2dbd6c4f3107 100644 --- a/.github/merge_rules.json +++ b/.github/merge_rules.json @@ -1,20 +1,114 @@ [ { - "name": "ONNX exporter", - "patterns": ["torch/onnx/**", "torch/csrc/jit/passes/onnx/**", "torch/csrc/jit/passes/onnx.*", "test/onnx/**", "docs/source/onnx.rst"], - "approved_by": ["BowenBao", "garymm"], - "mandatory_app_id": 12274 + "name": "ONNX exporter", + "patterns": [ + ".jenkins/caffe2/*", + "scripts/onnx/**", + "docs/source/onnx.rst", + "test/onnx/**", + "test/jit/test_export_modes.py", + "aten/src/ATen/core/interned_strings.h", + 
"tools/onnx/**", + "torch/_C/__init__.pyi.in", + "torch/csrc/jit/passes/onnx.*", + "torch/csrc/jit/passes/onnx/**", + "torch/csrc/jit/serialization/export.*", + "torch/csrc/jit/serialization/onnx.*", + "torch/csrc/onnx/**", + "torch/onnx/**" + ], + "approved_by": ["BowenBao", "garymm"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] }, { - "name": "NVFuser", - "patterns": ["torch/csrc/jit/codegen/fuser/cuda/**", "torch/csrc/jit/codegen/cuda/**", "benchmarks/cpp/nvfuser/**"], - "approved_by": ["csarofeen", "ngimel"], - "mandatory_app_id": 12274 + "name": "NVFuser", + "patterns": [ + "test/test_jit_cuda_fuser.py", + "torch/csrc/jit/codegen/fuser/cuda/**", + "torch/csrc/jit/codegen/cuda/**", + "benchmarks/cpp/nvfuser/**" + ], + "approved_by": ["csarofeen", "ngimel", "jjsjann123"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] }, { - "name": "OSS CI", - "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**"], - "approved_by": ["seemethere", "malfet", "suo"], - "mandatory_app_id": 12274 + "name": "OSS CI", + "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**", "tools/**"], + "approved_by": ["ezyang", "pytorch/pytorch-dev-infra"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Documentation", + "patterns": ["docs/**", "torch/*docs.py"], + "approved_by": ["mruberry", "ngimel", "janeyx99"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Mobile", + "patterns": ["ios/**", "android/**", "test/mobile/**"], + "approved_by": ["linbinyu", "kit1980", "IvanKobzarev", "dreiss"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Linear Algebra", + "patterns": [ + "aten/src/ATen/native/cuda/linalg/**", + "aten/src/ATen/LinalgBackend.h", + "aten/src/ATen/native/**/*LinearAlgebra*", + "docs/source/linalg.rst", + "torch/linalg/**", + "torch/_linalg_utils.py", + "torch/**/python_linalg_functions.*", + "torch/**/linalg.h", + "tools/autograd/templates/python_linalg_functions.cpp", + "test/test_linalg.py" + ], + "approved_by": ["nikitaved", "mruberry", "pearu", "Lezcano", "IvanYashchuk"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "FFT", + "patterns": [ + "aten/src/ATen/native/cuda/*FFT*.h", + "aten/src/ATen/native/SpectralOps.cpp", + "aten/src/ATen/native/mkl/SpectralOps.cpp", + "aten/src/ATen/native/cuda/SpectralOps.*", + "docs/source/fft.rst", + "torch/fft/**", + "torch/csrc/api/include/torch/fft.h", + "torch/**/python_fft_functions.*", + "tools/autograd/templates/python_fft_functions.cpp", + "test/cpp/api/fft.cpp" + ], + "approved_by": ["mruberry", "peterbell10"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "Sparse", + "patterns": [ + "benchmarks/sparse", + "c10/util/sparse_bitset.h", + "docs/source/sparse.rst", + "torch/**/sparse/**", + "torch/**/*sparse*", + "torch/optim/sparse*", + "torch/ao/nn/sparse/**", + "torch/utils/benchmark/**/*sparse*", + "aten/src/ATen/native/ao_sparse/**", + "aten/src/ATen/native/sparse/**", + "aten/src/ATen/**/*Sparse*", + "aten/src/ATen/*Sparse*", + "torch/_masked/**", + "test/*_masked*", + "test/**/*sparse*" + ], + "approved_by": ["nikitaved", "cpuhrsch", "pearu", "IvanYashchuk"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] + }, + { + "name": "superuser", + "patterns": ["*"], + "approved_by": ["pytorch/metamates"], + "mandatory_checks_name": ["Facebook CLA Check", "Lint"] } ] diff --git a/.github/scale-config.yml b/.github/scale-config.yml index 
0670ed9598ae..931ca0ef5f1e 100644 --- a/.github/scale-config.yml +++ b/.github/scale-config.yml @@ -30,25 +30,25 @@ runner_types: linux.2xlarge: instance_type: c5.2xlarge os: linux - max_available: 500 + max_available: 1000 disk_size: 150 is_ephemeral: false linux.4xlarge: # for binary-builds instance_type: c5.4xlarge os: linux - max_available: 250 + max_available: 500 disk_size: 150 is_ephemeral: false linux.8xlarge.nvidia.gpu: instance_type: g3.8xlarge os: linux - max_available: 125 + max_available: 200 disk_size: 150 is_ephemeral: false linux.4xlarge.nvidia.gpu: instance_type: g3.4xlarge os: linux - max_available: 125 + max_available: 250 disk_size: 150 is_ephemeral: false linux.16xlarge.nvidia.gpu: diff --git a/.github/scripts/README.md b/.github/scripts/README.md new file mode 100644 index 000000000000..22099c3732ea --- /dev/null +++ b/.github/scripts/README.md @@ -0,0 +1,58 @@ +# pytorch/.github + +> NOTE: This README contains information for the `.github` directory but cannot be located there because it will overwrite the +repo README. + +This directory contains workflows and scripts to support our CI infrastructure that runs on Github Actions. + +## Workflows + +- Pull CI (`pull.yml`) is run on PRs and on master. +- Trunk CI (`trunk.yml`) is run on trunk to validate incoming commits. Trunk jobs are usually more expensive to run so we do not run them on PRs unless specified. +- Scheduled CI (`periodic.yml`) is a subset of trunk CI that is run every few hours on master. +- Binary CI is run to package binaries for distribution for all platforms. + +## Templates + +Templates written in [Jinja](https://jinja.palletsprojects.com/en/3.0.x/) are located in the `.github/templates` directory +and used to generate workflow files for binary jobs found in the `.github/workflows/` directory. These are also a +couple of utility templates used to discern common utilities that can be used amongst different templates. + +### (Re)Generating workflow files + +You will need `jinja2` in order to regenerate the workflow files which can be installed using: +```bash +pip install -r .github/requirements.txt +``` + +Workflows can be generated / regenerated using the following command: +```bash +.github/regenerate.sh +``` + +### Adding a new generated binary workflow + +New generated binary workflows can be added in the `.github/scripts/generate_ci_workflows.py` script. You can reference +examples from that script in order to add the workflow to the stream that is relevant to what you particularly +care about. + +Different parameters can be used to acheive different goals, i.e. running jobs on a cron, running only on trunk, etc. + +#### ciflow (trunk) + +The label `ciflow/trunk` can be used to run `trunk` only workflows. This is especially useful if trying to re-land a PR that was +reverted for failing a `non-default` workflow. + +## Infra + +Currently most of our self hosted runners are hosted on AWS, for a comprehensive list of available runner types you +can reference `.github/scale-config.yml`. + +Exceptions to AWS for self hosted: +* ROCM runners + +### Adding new runner types + +New runner types can be added by committing changes to `.github/scale-config.yml`. 
Example: https://github.com/pytorch/pytorch/pull/70474 + +> NOTE: New runner types can only be used once the changes to `.github/scale-config.yml` have made their way into the default branch diff --git a/.github/scripts/build_publish_nightly_docker.sh b/.github/scripts/build_publish_nightly_docker.sh index 55c764596eb1..db84704aa3e4 100644 --- a/.github/scripts/build_publish_nightly_docker.sh +++ b/.github/scripts/build_publish_nightly_docker.sh @@ -1,9 +1,9 @@ -#!/bin/sh +#!/usr/bin/env bash set -xeuo pipefail PYTORCH_DOCKER_TAG=$(git describe --tags --always)-devel -CUDA_VERSION=11.1 +CUDA_VERSION=11.3.1 # Build PyTorch nightly docker make -f docker.Makefile \ @@ -25,18 +25,20 @@ docker tag ghcr.io/pytorch/pytorch-nightly:${PYTORCH_DOCKER_TAG} \ docker tag ghcr.io/pytorch/pytorch-nightly:${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \ ghcr.io/pytorch/pytorch-nightly:latest -# Push the nightly docker to GitHub Container Registry -echo $GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin -make -f docker.Makefile \ - DOCKER_REGISTRY=ghcr.io \ - DOCKER_ORG=pytorch \ - DOCKER_IMAGE=pytorch-nightly \ - DOCKER_TAG=${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \ - devel-push - -make -f docker.Makefile \ - DOCKER_REGISTRY=ghcr.io \ - DOCKER_ORG=pytorch \ - DOCKER_IMAGE=pytorch-nightly \ - DOCKER_TAG=latest \ - devel-push +if [[ ${WITH_PUSH:-} == "true" ]]; then + # Push the nightly docker to GitHub Container Registry + echo $GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin + make -f docker.Makefile \ + DOCKER_REGISTRY=ghcr.io \ + DOCKER_ORG=pytorch \ + DOCKER_IMAGE=pytorch-nightly \ + DOCKER_TAG=${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \ + devel-push + + make -f docker.Makefile \ + DOCKER_REGISTRY=ghcr.io \ + DOCKER_ORG=pytorch \ + DOCKER_IMAGE=pytorch-nightly \ + DOCKER_TAG=latest \ + devel-push +fi diff --git a/.github/scripts/convert_lintrunner_annotations_to_github.py b/.github/scripts/convert_lintrunner_annotations_to_github.py new file mode 100644 index 000000000000..11901bc300e3 --- /dev/null +++ b/.github/scripts/convert_lintrunner_annotations_to_github.py @@ -0,0 +1,63 @@ +import json +import subprocess +import sys + +from enum import Enum +from pathlib import Path +from typing import NamedTuple, Optional + +# From: https://docs.github.com/en/rest/reference/checks +class GitHubAnnotationLevel(str, Enum): + NOTICE = "notice" + WARNING = "warning" + FAILURE = "failure" + + +class GitHubAnnotation(NamedTuple): + path: str + start_line: int + end_line: int + start_column: Optional[int] + end_column: Optional[int] + annotation_level: GitHubAnnotationLevel + message: str + title: Optional[str] + raw_details: Optional[str] + +PYTORCH_ROOT = Path(subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode('ascii').strip()) + +annotations = [] +for line in sys.stdin: + lint_message = json.loads(line) + + path = lint_message.get("path") + line = lint_message.get("line") + + + code = lint_message["code"] + severity = lint_message["severity"] + name = lint_message["name"] + description = lint_message.get("description") + + # These fields are required by the GitHub API, but optional in lintrunner. + # If they don't exist, just skip. 
+ if path is None or line is None: + print(f"No path/line for lint: ({code}) {name}", file=sys.stderr) + continue + + # normalize path relative to git root + path = Path(path).relative_to(PYTORCH_ROOT) + + annotations.append(GitHubAnnotation( + path=str(path), + start_line=int(line), + end_line=int(line), + start_column=None, + end_column=None, + annotation_level=GitHubAnnotationLevel.FAILURE, + message=description, + title=f"({code}) {name}", + raw_details=None, + )._asdict()) + +print(json.dumps(annotations), flush=True) diff --git a/.github/scripts/ensure_actions_will_cancel.py b/.github/scripts/ensure_actions_will_cancel.py index a07f4359dd04..c479aefb9fc4 100755 --- a/.github/scripts/ensure_actions_will_cancel.py +++ b/.github/scripts/ensure_actions_will_cancel.py @@ -9,14 +9,8 @@ REPO_ROOT = Path(__file__).resolve().parent.parent.parent WORKFLOWS = REPO_ROOT / ".github" / "workflows" - - -def concurrency_key(filename: Path) -> str: - workflow_name = filename.with_suffix("").name.replace("_", "-") - if workflow_name.startswith("generated-"): - workflow_name = workflow_name[len("generated-"):] - return f"{workflow_name}-${{{{ github.event.pull_request.number || github.sha }}}}" \ - "-${{ github.event_name == 'workflow_dispatch' }}" +EXPECTED_GROUP = "${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}" \ + "-${{ github.event_name == 'workflow_dispatch' }}" def should_check(filename: Path) -> bool: @@ -38,12 +32,19 @@ def should_check(filename: Path) -> bool: errors_found = False files = [f for f in files if should_check(f)] + names = set() for filename in files: with open(filename, "r") as f: data = yaml.safe_load(f) + name = data.get("name") + if name is not None and name in names: + print("ERROR: duplicate workflow name:", name, file=sys.stderr) + errors_found = True + names.add(name) + expected = { - "group": concurrency_key(filename), + "group": EXPECTED_GROUP, "cancel-in-progress": True, } actual = data.get("concurrency", None) diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index d3aaf1844fe5..1d81f72edd8e 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -10,13 +10,13 @@ * Latest ROCM """ -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Optional -CUDA_ARCHES = ["10.2", "11.1", "11.3", "11.5"] +CUDA_ARCHES = ["10.2", "11.3", "11.6"] -ROCM_ARCHES = ["4.3.1", "4.5.2"] +ROCM_ARCHES = ["5.0", "5.1.1"] def arch_type(arch_version: str) -> str: @@ -47,6 +47,8 @@ def arch_type(arch_version: str) -> str: PRE_CXX11_ABI = "pre-cxx11" CXX11_ABI = "cxx11-abi" +RELEASE = "release" +DEBUG = "debug" LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str] = { **{ @@ -57,6 +59,14 @@ def arch_type(arch_version: str) -> str: (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES }, + **{ + (gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:rocm{gpu_arch}" + for gpu_arch in ROCM_ARCHES + }, + **{ + (gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}" + for gpu_arch in ROCM_ARCHES + }, ("cpu", PRE_CXX11_ABI): "pytorch/manylinux-builder:cpu", ("cpu", CXX11_ABI): "pytorch/libtorch-cxx11-builder:cpu", } @@ -110,28 +120,37 @@ def generate_conda_matrix(os: str) -> List[Dict[str, str]]: return ret -def generate_libtorch_matrix(os: str, abi_version: str) -> List[Dict[str, str]]: - libtorch_variants = [ - "shared-with-deps", - "shared-without-deps", - 
"static-with-deps", - "static-without-deps", - ] +def generate_libtorch_matrix(os: str, abi_version: str, + arches: Optional[List[str]] = None, + libtorch_variants: Optional[List[str]] = None) -> List[Dict[str, str]]: + if arches is None: + arches = ["cpu"] + if os == "linux": + arches += CUDA_ARCHES + arches += ROCM_ARCHES + elif os == "windows": + # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 + arches += list_without(CUDA_ARCHES, ["10.2"]) + + if libtorch_variants is None: + libtorch_variants = [ + "shared-with-deps", + "shared-without-deps", + "static-with-deps", + "static-without-deps", + ] + ret: List[Dict[str, str]] = [] - arches = ["cpu"] - if os == "linux": - arches += CUDA_ARCHES - elif os == "windows": - # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 - arches += list_without(CUDA_ARCHES, ["10.2"]) for arch_version in arches: for libtorch_variant in libtorch_variants: - # We don't currently build libtorch for rocm # one of the values in the following list must be exactly # CXX11_ABI, but the precise value of the other one doesn't # matter gpu_arch_type = arch_type(arch_version) gpu_arch_version = "" if arch_version == "cpu" else arch_version + # ROCm builds without-deps failed even in ROCm runners; skip for now + if gpu_arch_type == "rocm" and "without-deps" in libtorch_variant: + continue ret.append( { "gpu_arch_type": gpu_arch_type, @@ -140,10 +159,11 @@ def generate_libtorch_matrix(os: str, abi_version: str) -> List[Dict[str, str]]: gpu_arch_type, gpu_arch_version ), "libtorch_variant": libtorch_variant, - "devtoolset": abi_version, + "libtorch_config": abi_version if os == "windows" else "", + "devtoolset": abi_version if os != "windows" else "", "container_image": LIBTORCH_CONTAINER_IMAGES[ (arch_version, abi_version) - ], + ] if os != "windows" else "", "package_type": "libtorch", "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace( ".", "_" @@ -153,19 +173,29 @@ def generate_libtorch_matrix(os: str, abi_version: str) -> List[Dict[str, str]]: return ret -def generate_wheels_matrix(os: str) -> List[Dict[str, str]]: - arches = ["cpu"] +def generate_wheels_matrix(os: str, + arches: Optional[List[str]] = None, + python_versions: Optional[List[str]] = None) -> List[Dict[str, str]]: package_type = "wheel" - python_versions = FULL_PYTHON_VERSIONS if os == "linux": - arches += CUDA_ARCHES + ROCM_ARCHES # NOTE: We only build manywheel packages for linux package_type = "manywheel" - elif os == "windows": - # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 - arches += list_without(CUDA_ARCHES, ["10.2"]) - elif os == "macos-arm64": - python_versions = list_without(python_versions, ["3.7"]) + + if python_versions is None: + # Define default python version + python_versions = FULL_PYTHON_VERSIONS + if os == "macos-arm64": + python_versions = list_without(python_versions, ["3.7"]) + + if arches is None: + # Define default compute archivectures + arches = ["cpu"] + if os == "linux": + arches += CUDA_ARCHES + ROCM_ARCHES + elif os == "windows": + # We don't build CUDA 10.2 for window see https://github.com/pytorch/pytorch/issues/65648 + arches += list_without(CUDA_ARCHES, ["10.2"]) + ret: List[Dict[str, str]] = [] for python_version in python_versions: for arch_version in arches: diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index f90690f2f952..c8b815bf0180 100755 --- 
a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -5,98 +5,24 @@ from typing import Dict, Set, List, Iterable import jinja2 -import json + import os import sys -from typing_extensions import Literal +from typing_extensions import Literal, TypedDict import generate_binary_build_matrix # type: ignore[import] -YamlShellBool = Literal["''", 1] Arch = Literal["windows", "linux", "macos"] -DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com" GITHUB_DIR = Path(__file__).resolve().parent.parent -WINDOWS_CPU_TEST_RUNNER = "windows.4xlarge" -# contains 1 gpu -WINDOWS_CUDA_TEST_RUNNER = "windows.8xlarge.nvidia.gpu" -WINDOWS_RUNNERS = { - WINDOWS_CPU_TEST_RUNNER, - WINDOWS_CUDA_TEST_RUNNER, -} - -LINUX_CPU_TEST_RUNNER = "linux.2xlarge" -# contains 1 gpu -LINUX_CUDA_TEST_RUNNER = "linux.4xlarge.nvidia.gpu" -# contains at least 2 gpus -LINUX_ROCM_TEST_RUNNER = "linux.rocm.gpu" -LINUX_RUNNERS = { - LINUX_CPU_TEST_RUNNER, - LINUX_CUDA_TEST_RUNNER, - LINUX_ROCM_TEST_RUNNER, -} - -LINUX_DISTRIBUTED_GPU_RUNNERS = { - LINUX_CUDA_TEST_RUNNER : "linux.8xlarge.nvidia.gpu", - LINUX_ROCM_TEST_RUNNER : LINUX_ROCM_TEST_RUNNER, -} - -LINUX_MULTIGPU_RUNNERS = { - LINUX_CUDA_TEST_RUNNER : "linux.16xlarge.nvidia.gpu", - LINUX_ROCM_TEST_RUNNER : LINUX_ROCM_TEST_RUNNER, -} - -MACOS_TEST_RUNNER_10_15 = "macos-10.15" -MACOS_TEST_RUNNER_11 = "macos-11" - -MACOS_RUNNERS = { - MACOS_TEST_RUNNER_10_15, - MACOS_TEST_RUNNER_11, -} - -CUDA_RUNNERS = { - WINDOWS_CUDA_TEST_RUNNER, - LINUX_CUDA_TEST_RUNNER, -} -ROCM_RUNNERS = { - LINUX_ROCM_TEST_RUNNER, -} -CPU_RUNNERS = { - WINDOWS_CPU_TEST_RUNNER, - LINUX_CPU_TEST_RUNNER, -} - -LABEL_CIFLOW_ALL = "ciflow/all" -LABEL_CIFLOW_BAZEL = "ciflow/bazel" -LABEL_CIFLOW_CPU = "ciflow/cpu" -LABEL_CIFLOW_CUDA = "ciflow/cuda" -LABEL_CIFLOW_ROCM = "ciflow/rocm" -LABEL_CIFLOW_DOCS = "ciflow/docs" -LABEL_CIFLOW_DEFAULT = "ciflow/default" -LABEL_CIFLOW_LIBTORCH = "ciflow/libtorch" -LABEL_CIFLOW_LINUX = "ciflow/linux" -LABEL_CIFLOW_MOBILE = "ciflow/mobile" -LABEL_CIFLOW_ANDROID = "ciflow/android" -LABEL_CIFLOW_SANITIZERS = "ciflow/sanitizers" -LABEL_CIFLOW_ONNX = "ciflow/onnx" -LABEL_CIFLOW_SCHEDULED = "ciflow/scheduled" -LABEL_CIFLOW_SLOW = "ciflow/slow" -LABEL_CIFLOW_WIN = "ciflow/win" -LABEL_CIFLOW_XLA = "ciflow/xla" -LABEL_CIFLOW_NOARCH = "ciflow/noarch" -LABEL_CIFLOW_VULKAN = "ciflow/vulkan" -LABEL_CIFLOW_PREFIX = "ciflow/" -LABEL_CIFLOW_SLOW_GRADCHECK = "ciflow/slow-gradcheck" -LABEL_CIFLOW_DOCKER = "ciflow/docker" -LABEL_CIFLOW_IOS = "ciflow/ios" -LABEL_CIFLOW_MACOS = "ciflow/macos" LABEL_CIFLOW_TRUNK = "ciflow/trunk" +LABEL_CIFLOW_ALL = "ciflow/all" LABEL_CIFLOW_BINARIES = "ciflow/binaries" -LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel" -LABEL_CIFLOW_BINARIES_CONDA = "ciflow/binaries_conda" +LABEL_CIFLOW_PERIODIC = "ciflow/periodic" LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch" - +LABEL_CIFLOW_BINARIES_CONDA = "ciflow/binaries_conda" +LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel" @dataclass class CIFlowConfig: @@ -109,175 +35,12 @@ class CIFlowConfig: def __post_init__(self) -> None: if not self.isolated_workflow: self.labels.add(LABEL_CIFLOW_ALL) - if LABEL_CIFLOW_SCHEDULED not in self.labels: + if LABEL_CIFLOW_PERIODIC not in self.labels: self.labels.add(LABEL_CIFLOW_TRUNK) - assert all(label.startswith(LABEL_CIFLOW_PREFIX) for label in self.labels) - - -@dataclass -class CIFlowRuleset: - version = 'v1' - output_file = f'{GITHUB_DIR}/generated-ciflow-ruleset.json' - label_rules: Dict[str, Set[str]] = 
field(default_factory=dict) - - def add_label_rule(self, labels: Set[str], workflow_name: str) -> None: - for label in labels: - if label in self.label_rules: - self.label_rules[label].add(workflow_name) - else: - self.label_rules[label] = {workflow_name} - - def generate_json(self) -> None: - GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file - output = { - "__comment": f"@{GENERATED} DO NOT EDIT MANUALLY, Generation script: .github/scripts/generate_ci_workflows.py", - "version": self.version, - "label_rules": { - label: sorted(list(workflows)) - for label, workflows in self.label_rules.items() - } - } - with open(self.output_file, 'w') as outfile: - json.dump(output, outfile, indent=2, sort_keys=True) - outfile.write('\n') - -@dataclass -class CIWorkflow: - # Required fields - arch: Arch - build_environment: str - - # Optional fields - test_runner_type: str = '' - multigpu_runner_type: str = '' - distributed_gpu_runner_type: str = '' - ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) - cuda_version: str = '' - docker_image_base: str = '' - enable_doc_jobs: bool = False - exclude_test: bool = False - build_generates_artifacts: bool = True - build_with_debug: bool = False - is_scheduled: str = '' - is_default: bool = False - num_test_shards: int = 1 - only_run_smoke_tests_on_pull_request: bool = False - num_test_shards_on_pull_request: int = -1 - distributed_test: bool = True - timeout_after: int = 240 - xcode_version: str = '' - only_on_pr: bool = False - ios_arch: str = '' - ios_platform: str = '' - - # The following variables will be set as environment variables, - # so it's easier for both shell and Python scripts to consume it if false is represented as the empty string. - enable_jit_legacy_test: YamlShellBool = "''" - enable_distributed_test: YamlShellBool = "''" - enable_multigpu_test: YamlShellBool = "''" - enable_nogpu_no_avx_test: YamlShellBool = "''" - enable_nogpu_no_avx2_test: YamlShellBool = "''" - enable_slow_test: YamlShellBool = "''" - enable_docs_test: YamlShellBool = "''" - enable_backwards_compat_test: YamlShellBool = "''" - enable_xla_test: YamlShellBool = "''" - enable_noarch_test: YamlShellBool = "''" - enable_force_on_cpu_test: YamlShellBool = "''" - - def __post_init__(self) -> None: - if not self.build_generates_artifacts: - self.exclude_test = True - - if self.distributed_test: - self.enable_distributed_test = 1 - - self.multigpu_runner_type = LINUX_MULTIGPU_RUNNERS.get(self.test_runner_type, "linux.16xlarge.nvidia.gpu") - self.distributed_gpu_runner_type = LINUX_DISTRIBUTED_GPU_RUNNERS.get(self.test_runner_type, "linux.8xlarge.nvidia.gpu") - - if LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels: - self.is_default = True - - # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are - # only running smoke tests on the pull request. 
- if self.num_test_shards_on_pull_request == -1: - # Don't run the default if we are only running smoke tests - if self.only_run_smoke_tests_on_pull_request: - self.num_test_shards_on_pull_request = 0 - else: - self.num_test_shards_on_pull_request = self.num_test_shards - self.assert_valid() - - def assert_valid(self) -> None: - err_message = f"invalid test_runner_type for {self.arch}: {self.test_runner_type}" - if self.arch == 'linux': - assert self.test_runner_type in LINUX_RUNNERS, err_message - if self.arch == 'windows': - assert self.test_runner_type in WINDOWS_RUNNERS, err_message - - if not self.ciflow_config.isolated_workflow: - assert LABEL_CIFLOW_ALL in self.ciflow_config.labels - if self.arch == 'linux': - assert LABEL_CIFLOW_LINUX in self.ciflow_config.labels - if self.arch == 'windows': - assert LABEL_CIFLOW_WIN in self.ciflow_config.labels - if self.arch == 'macos': - assert LABEL_CIFLOW_MACOS in self.ciflow_config.labels - # Make sure that jobs with tests have a test_runner_type - if not self.exclude_test: - assert self.test_runner_type != '' - if self.test_runner_type in CUDA_RUNNERS: - assert LABEL_CIFLOW_CUDA in self.ciflow_config.labels - if self.test_runner_type in ROCM_RUNNERS: - assert LABEL_CIFLOW_ROCM in self.ciflow_config.labels - if self.test_runner_type in CPU_RUNNERS and not self.exclude_test: - assert LABEL_CIFLOW_CPU in self.ciflow_config.labels - if self.is_scheduled: - assert LABEL_CIFLOW_DEFAULT not in self.ciflow_config.labels - assert LABEL_CIFLOW_TRUNK not in self.ciflow_config.labels - assert LABEL_CIFLOW_SCHEDULED in self.ciflow_config.labels - if self.build_with_debug: - assert self.build_environment.endswith("-debug") - - def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: - output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml" - with open(output_file_path, "w") as output_file: - GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file - output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"]) - try: - content = workflow_template.render(asdict(self)) - except Exception as e: - print(f"Failed on template: {workflow_template}", file=sys.stderr) - raise e - output_file.write(content) - if content[-1] != "\n": - output_file.write("\n") - print(output_file_path) - -@dataclass -class DockerWorkflow: - build_environment: str - docker_images: List[str] - - # Optional fields - ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) - cuda_version: str = '' - is_scheduled: str = '' - - def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: - output_file_path = GITHUB_DIR / "workflows/generated-docker-builds.yml" - with open(output_file_path, "w") as output_file: - GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file - output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"]) - try: - content = workflow_template.render(asdict(self)) - except Exception as e: - print(f"Failed on template: {workflow_template}", file=sys.stderr) - raise e - output_file.write(content) - if content[-1] != "\n": - output_file.write("\n") - print(output_file_path) +class Config(TypedDict): + num_shards: int + runner: str @dataclass class BinaryBuildWorkflow: @@ -290,6 +53,7 @@ class BinaryBuildWorkflow: abi_version: str = '' ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig) is_scheduled: str = '' + branches: str = 'nightly' # Mainly for 
macos cross_compile_arm64: bool = False xcode_version: str = '' @@ -301,7 +65,7 @@ def __post_init__(self) -> None: self.build_environment = f"{self.os}-binary-{self.package_type}" def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: - output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}.yml" + output_file_path = GITHUB_DIR / f"workflows/generated-{self.build_environment}-{self.branches}.yml" with open(output_file_path, "w") as output_file: GENERATED = "generated" # Note that please keep the variable GENERATED otherwise phabricator will hide the whole file output_file.writelines([f"# @{GENERATED} DO NOT EDIT MANUALLY\n"]) @@ -315,543 +79,6 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: output_file.write("\n") print(output_file_path) -WINDOWS_WORKFLOWS = [ - CIWorkflow( - arch="windows", - build_environment="win-vs2019-cpu-py3", - cuda_version="cpu", - test_runner_type=WINDOWS_CPU_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN} - ), - ), - CIWorkflow( - arch="windows", - build_environment="win-vs2019-cuda11.3-py3", - cuda_version="11.3", - test_runner_type=WINDOWS_CUDA_TEST_RUNNER, - num_test_shards=2, - only_run_smoke_tests_on_pull_request=True, - enable_force_on_cpu_test=1, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} - ), - ), - CIWorkflow( - arch="windows", - build_environment="periodic-win-vs2019-cuda11.5-py3", - cuda_version="11.5", - test_runner_type=WINDOWS_CUDA_TEST_RUNNER, - num_test_shards=2, - enable_force_on_cpu_test=1, - is_scheduled="45 4,10,16,22 * * *", - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} - ), - ), - CIWorkflow( - arch="windows", - build_environment="periodic-win-vs2019-cuda11.1-py3", - cuda_version="11.1", - test_runner_type=WINDOWS_CUDA_TEST_RUNNER, - num_test_shards=2, - is_scheduled="45 0,4,8,12,16,20 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_WIN, LABEL_CIFLOW_CUDA} - ), - ), -] - -LINUX_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - enable_jit_legacy_test=1, - enable_backwards_compat_test=1, - enable_docs_test=1, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-docs", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - enable_doc_jobs=True, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_DOCS, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-docs-push", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - enable_doc_jobs=True, - exclude_test=True, - is_scheduled="0 0 * * *", # run pushes only on a nightly schedule - # NOTE: This is purposefully left without LABEL_CIFLOW_DOCS so that you can run - # docs builds on your PR without the fear of anything pushing - ciflow_config=CIFlowConfig( - 
labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc7", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU} - ), - ), - # ParallelTBB does not have a maintainer and is currently flaky - # CIWorkflow( - # arch="linux", - # build_environment="paralleltbb-linux-xenial-py3.6-gcc5.4", - # docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.6-gcc5.4", - # test_runner_type=LINUX_CPU_TEST_RUNNER, - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - # ), - # ), - CIWorkflow( - arch="linux", - build_environment="parallelnative-linux-xenial-py3.7-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - ), - ), - # Build PyTorch with BUILD_CAFFE2=ON - CIWorkflow( - arch="linux", - build_environment="caffe2-linux-xenial-py3.7-gcc5.4", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3-clang5-mobile-build", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-asan", - test_runner_type=LINUX_CPU_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_MOBILE, LABEL_CIFLOW_DEFAULT}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3-clang5-mobile-custom-build-static", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_MOBILE, LABEL_CIFLOW_DEFAULT}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-clang7-asan", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-asan", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=3, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_SANITIZERS, LABEL_CIFLOW_CPU}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-clang7-onnx", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=2, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ONNX, LABEL_CIFLOW_CPU}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-bionic-cuda10.2-py3.9-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - enable_jit_legacy_test=1, - enable_multigpu_test=1, - enable_nogpu_no_avx_test=1, - enable_nogpu_no_avx2_test=1, - enable_slow_test=1, - num_test_shards=2, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_SLOW, 
LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA} - ), - ), - CIWorkflow( - arch="linux", - build_environment="libtorch-linux-xenial-cuda10.2-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-linux-bionic-cuda11.5-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - is_scheduled="45 4,10,16,22 * * *", - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - is_scheduled="45 4,10,16,22 * * *", - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-cuda11.3-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated - CIWorkflow( - arch="linux", - build_environment="linux-xenial-cuda11.3-py3.7-gcc7-no-ops", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-xenial-py3.7-gcc7-no-ops", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc7", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-bionic-rocm4.5-py3.7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.5-py3.7", - test_runner_type=LINUX_ROCM_TEST_RUNNER, - num_test_shards=2, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ROCM]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="libtorch-linux-xenial-cuda11.3-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels=set([LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA]), - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - build_with_debug=True, - is_scheduled="45 
0,4,8,12,16,20 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA} - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - build_generates_artifacts=False, - exclude_test=True, - is_scheduled="45 0,4,8,12,16,20 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_LIBTORCH, LABEL_CIFLOW_CUDA}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-bionic-py3.7-clang9", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=2, - distributed_test=False, - enable_noarch_test=1, - enable_xla_test=1, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA, LABEL_CIFLOW_NOARCH}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="linux-vulkan-bionic-py3.7-clang9", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", - test_runner_type=LINUX_CPU_TEST_RUNNER, - num_test_shards=1, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_VULKAN}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - distributed_test=False, - timeout_after=360, - # Only run this on master 4 times per day since it does take a while - is_scheduled="0 */4 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_SLOW_GRADCHECK, LABEL_CIFLOW_SLOW, LABEL_CIFLOW_SCHEDULED}, - ), - ), -] - -ANDROID_SHORT_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_ANDROID, LABEL_CIFLOW_DEFAULT}, - ), - ), - CIWorkflow( - arch="linux", - build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_ANDROID, LABEL_CIFLOW_DEFAULT}, - ), - ), -] - -ANDROID_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c", - test_runner_type=LINUX_CPU_TEST_RUNNER, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_ANDROID}, - ), - ), -] - -BAZEL_WORKFLOWS = [ - CIWorkflow( - arch="linux", - build_environment="linux-xenial-cuda11.3-py3.7-gcc7-bazel-test", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7", - 
test_runner_type=LINUX_CPU_TEST_RUNNER, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BAZEL, LABEL_CIFLOW_CPU, LABEL_CIFLOW_LINUX}, - ), - ), -] - -IOS_WORKFLOWS = [ - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-coreml", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-full-jit", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-custom-ops", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-metal", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-x86-64", - ios_arch="x86_64", - ios_platform="SIMULATOR", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-x86-64-coreml", - ios_arch="x86_64", - ios_platform="SIMULATOR", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-x86-64-full-jit", - ios_arch="x86_64", - ios_platform="SIMULATOR", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), -] - -MACOS_WORKFLOWS = [ - # Distributed tests are still run on MacOS, but part of regular shards - CIWorkflow( - arch="macos", - build_environment="macos-11-py3-x86-64", - xcode_version="12.4", - test_runner_type=MACOS_TEST_RUNNER_11, - num_test_shards=2, - distributed_test=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="macos-10-15-py3-lite-interpreter-x86-64", - xcode_version="12", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - build_generates_artifacts=False, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="macos-10-15-py3-arm64", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_MACOS}, - ), - ), -] - -DOCKER_IMAGES = { - f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9", # for pytorch/xla - f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.7", # for rocm - f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.5-py3.7", # for rocm -} - -DOCKER_IMAGES.update({ 
- workflow.docker_image_base - for workflow in [*LINUX_WORKFLOWS, *BAZEL_WORKFLOWS, *ANDROID_WORKFLOWS] - if workflow.docker_image_base -}) - -DOCKER_WORKFLOWS = [ - DockerWorkflow( - build_environment="docker-builds", - docker_images=sorted(DOCKER_IMAGES), - # Run every Wednesday at 3:01am to ensure they can build - is_scheduled="1 3 * * 3", - ), -] - class OperatingSystem: LINUX = "linux" WINDOWS = "windows" @@ -864,7 +91,7 @@ class OperatingSystem: package_type="manywheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.LINUX), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, isolated_workflow=True, ), ), @@ -873,7 +100,7 @@ class OperatingSystem: package_type="conda", build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.LINUX), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), @@ -885,7 +112,7 @@ class OperatingSystem: OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), @@ -897,58 +124,123 @@ class OperatingSystem: OperatingSystem.LINUX, generate_binary_build_matrix.PRE_CXX11_ABI ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), ] +LINUX_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="manywheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.LINUX, + arches=["10.2"], + python_versions=["3.7"]), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="libtorch", + abi_version=generate_binary_build_matrix.CXX11_ABI, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.LINUX, + package_type="libtorch", + abi_version=generate_binary_build_matrix.PRE_CXX11_ABI, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), +] + WINDOWS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="wheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.WINDOWS), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + isolated_workflow=True, + ), + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="conda", + build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.WINDOWS), + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), - # NOTE: conda binaries are currently bugged on the 
installation step - # See, https://github.com/pytorch/pytorch/pull/71484#issuecomment-1022617195 - # BinaryBuildWorkflow( - # os=OperatingSystem.WINDOWS, - # package_type="conda", - # build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.WINDOWS), - # ciflow_config=CIFlowConfig( - # labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, - # isolated_workflow=True, - # ), - # ), BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", - abi_version=generate_binary_build_matrix.CXX11_ABI, + abi_version=generate_binary_build_matrix.RELEASE, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, generate_binary_build_matrix.CXX11_ABI + OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, package_type="libtorch", - abi_version=generate_binary_build_matrix.PRE_CXX11_ABI, + abi_version=generate_binary_build_matrix.DEBUG, build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, generate_binary_build_matrix.PRE_CXX11_ABI + OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), ] +WINDOWS_BINARY_SMOKE_WORKFLOWS = [ + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="wheel", + build_configs=generate_binary_build_matrix.generate_wheels_matrix( + OperatingSystem.WINDOWS, + arches=["11.3"], + python_versions=["3.7"]), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + abi_version=generate_binary_build_matrix.RELEASE, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), + BinaryBuildWorkflow( + os=OperatingSystem.WINDOWS, + package_type="libtorch", + abi_version=generate_binary_build_matrix.DEBUG, + build_configs=generate_binary_build_matrix.generate_libtorch_matrix( + OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG, + arches=["cpu"], + libtorch_variants=["shared-with-deps"], + ), + branches="master", + ), +] MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( @@ -956,7 +248,7 @@ class OperatingSystem: package_type="wheel", build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.MACOS), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, isolated_workflow=True, ), ), @@ -965,7 +257,7 @@ class OperatingSystem: package_type="conda", build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.MACOS), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), @@ -977,7 +269,7 @@ class OperatingSystem: OperatingSystem.MACOS, generate_binary_build_matrix.CXX11_ABI ), ciflow_config=CIFlowConfig( - 
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), @@ -989,7 +281,7 @@ class OperatingSystem: OperatingSystem.MACOS, generate_binary_build_matrix.PRE_CXX11_ABI ), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, isolated_workflow=True, ), ), @@ -999,7 +291,7 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.MACOS), cross_compile_arm64=True, ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, isolated_workflow=True, ), ), @@ -1009,7 +301,7 @@ class OperatingSystem: cross_compile_arm64=True, build_configs=generate_binary_build_matrix.generate_conda_matrix(OperatingSystem.MACOS_ARM64), ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, + labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, isolated_workflow=True, ), ), @@ -1021,17 +313,13 @@ def main() -> None: loader=jinja2.FileSystemLoader(str(GITHUB_DIR.joinpath("templates"))), undefined=jinja2.StrictUndefined, ) + + # not ported yet template_and_workflows = [ - (jinja_env.get_template("linux_ci_workflow.yml.j2"), LINUX_WORKFLOWS), - (jinja_env.get_template("windows_ci_workflow.yml.j2"), WINDOWS_WORKFLOWS), - (jinja_env.get_template("bazel_ci_workflow.yml.j2"), BAZEL_WORKFLOWS), - (jinja_env.get_template("ios_ci_workflow.yml.j2"), IOS_WORKFLOWS), - (jinja_env.get_template("macos_ci_workflow.yml.j2"), MACOS_WORKFLOWS), - (jinja_env.get_template("docker_builds_ci_workflow.yml.j2"), DOCKER_WORKFLOWS), - (jinja_env.get_template("android_ci_full_workflow.yml.j2"), ANDROID_WORKFLOWS), - (jinja_env.get_template("android_ci_workflow.yml.j2"), ANDROID_SHORT_WORKFLOWS), (jinja_env.get_template("linux_binary_build_workflow.yml.j2"), LINUX_BINARY_BUILD_WORFKLOWS), + (jinja_env.get_template("linux_binary_build_workflow.yml.j2"), LINUX_BINARY_SMOKE_WORKFLOWS), (jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_BUILD_WORKFLOWS), + (jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_SMOKE_WORKFLOWS), (jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS), ] # Delete the existing generated files first, this should align with .gitattributes file description. @@ -1042,16 +330,12 @@ def main() -> None: except Exception as e: print(f"Error occurred when deleting file {w}: {e}") - ciflow_ruleset = CIFlowRuleset() for template, workflows in template_and_workflows: # added Iterable check to appease the mypy gods if not isinstance(workflows, Iterable): raise Exception(f"How is workflows not iterable? 
{workflows}") for workflow in workflows: workflow.generate_workflow_file(workflow_template=template) - ciflow_ruleset.add_label_rule(workflow.ciflow_config.labels, workflow.build_environment) - ciflow_ruleset.generate_json() - if __name__ == "__main__": main() diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py deleted file mode 100755 index 967f7222dd36..000000000000 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python3 - -"""Generates a matrix to be utilized through github actions - -Will output a matrix to represent our testing configurations, which is currently -dictated by just sharding. - -""" - -import json -import os -import re -from typing import Dict - -from typing_extensions import TypedDict - - -BUILD_ENVIRONMENT = os.getenv('BUILD_ENVIRONMENT') -assert BUILD_ENVIRONMENT is not None - -class Config(TypedDict): - num_shards: int - runner: str - - -def get_disabled_issues() -> str: - pr_body = os.getenv('PR_BODY', '') - # The below regex is meant to match all *case-insensitive* keywords that - # GitHub has delineated would link PRs to issues, more details here: - # https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue. - # E.g., "Close #62851", "fixES #62851" and "RESOLVED #62851" would all match, but not - # "closes #62851" --> extra space, "fixing #62851" --> not a keyword, nor "fix 62851" --> no # - regex = '(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) #([0-9]+)' - issue_numbers = [x[4] for x in re.findall(regex, pr_body)] - return ','.join(issue_numbers) - -# When the user specifies labels that are NOT ciflow/default, the expectation is -# that the workflows should be triggered as if they are on trunk. For example, when -# ciflow/all is specified, we should run the full test suite for Windows CUDA -# and NOT only the smoke tests. 
-def run_as_if_on_trunk() -> bool: - ON_PULL_REQUEST = os.getenv('GITHUB_HEAD_REF') - if not ON_PULL_REQUEST: - return True - - from pathlib import Path - GITHUB_DIR = Path(__file__).resolve().parent.parent - - with open(f'{GITHUB_DIR}/generated-ciflow-ruleset.json') as f: - labels_to_workflows = json.load(f)['label_rules'] - - pr_labels = json.loads(os.getenv('PR_LABELS', '[]')) - current_workflow_triggered_by_label = False - for label in pr_labels: - if label != 'ciflow/default' and label in labels_to_workflows: - workflows_triggered_by_label = labels_to_workflows[label] - if any([BUILD_ENVIRONMENT in workflow for workflow in workflows_triggered_by_label]): - current_workflow_triggered_by_label = True - break - - return current_workflow_triggered_by_label - -def main() -> None: - TEST_RUNNER_TYPE = os.getenv('TEST_RUNNER_TYPE') - assert TEST_RUNNER_TYPE is not None - RUN_SMOKE_TESTS_ONLY_ON_PR = os.getenv('RUN_SMOKE_TESTS_ONLY_ON_PR') - RUN_SMOKE_TESTS = RUN_SMOKE_TESTS_ONLY_ON_PR == "true" and not run_as_if_on_trunk() - NUM_TEST_SHARDS_ON_PULL_REQUEST = os.getenv('NUM_TEST_SHARDS_ON_PULL_REQUEST') - NUM_TEST_SHARDS = int(os.getenv('NUM_TEST_SHARDS', '0')) - if not run_as_if_on_trunk() and NUM_TEST_SHARDS_ON_PULL_REQUEST: - NUM_TEST_SHARDS = int(NUM_TEST_SHARDS_ON_PULL_REQUEST) - MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE') - DISTRIBUTED_GPU_RUNNER_TYPE = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', TEST_RUNNER_TYPE) - NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE') - configs: Dict[str, Config] = {} - if os.getenv('ENABLE_JIT_LEGACY_TEST'): - configs['jit_legacy'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if MULTIGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_MULTIGPU_TEST'): - configs['multigpu'] = {'num_shards': 1, 'runner': MULTIGPU_RUNNER_TYPE} - if NOGPU_RUNNER_TYPE is not None: - if os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): - configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): - configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_FORCE_ON_CPU_TEST'): - configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_DISTRIBUTED_TEST'): - configs['distributed'] = { - 'num_shards': 1, - 'runner': DISTRIBUTED_GPU_RUNNER_TYPE if "cuda" in str(BUILD_ENVIRONMENT) else TEST_RUNNER_TYPE - } - if os.getenv('ENABLE_SLOW_TEST'): - configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_DOCS_TEST'): - configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'): - configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_XLA_TEST'): - configs['xla'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_NOARCH_TEST'): - configs['noarch'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if RUN_SMOKE_TESTS: - configs['smoke_tests'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - matrix = { - 'include': [ - { - 'config': 'default', - 'shard': shard, - 'num_shards': NUM_TEST_SHARDS, - 'runner': TEST_RUNNER_TYPE, - } - for shard in range(1, NUM_TEST_SHARDS + 1) - ] + [ - { - 'config': name, - 'shard': shard, - 'num_shards': config['num_shards'], - 'runner': config['runner'], - } - for name, config in configs.items() - for shard in range(1, config['num_shards'] + 1) - ] - } - render_matrix = {'config': list(dict.fromkeys(x['config'] for x in matrix['include']))} - print(json.dumps({'matrix': 
matrix, 'render-matrix': render_matrix}, indent=2)) - print(f'::set-output name=matrix::{json.dumps(matrix)}') - print(f'::set-output name=render-matrix::{json.dumps(render_matrix)}') - print(f'::set-output name=ignore-disabled-issues::{get_disabled_issues()}') - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py new file mode 100644 index 000000000000..72aed91d55ca --- /dev/null +++ b/.github/scripts/get_workflow_job_id.py @@ -0,0 +1,60 @@ +# Helper to get the id of the currently running job in a GitHub Actions +# workflow. GitHub does not provide this information to workflow runs, so we +# need to figure it out based on what they *do* provide. + +import requests +import os +import argparse + +# Our strategy is to retrieve the parent workflow run, then filter its jobs on +# RUNNER_NAME to figure out which job we're currently running. +# +# Why RUNNER_NAME? Because it's the only thing that uniquely identifies a job within a workflow. +# GITHUB_JOB doesn't work, as it corresponds to the job yaml id +# (https://bit.ly/37e78oI), which has two problems: +# 1. It's not present in the workflow job JSON object, so we can't use it as a filter. +# 2. It isn't unique; for matrix jobs the job yaml id is the same for all jobs in the matrix. +# +# RUNNER_NAME on the other hand is unique across the pool of runners. Also, +# since only one job can be scheduled on a runner at a time, we know that +# looking for RUNNER_NAME will uniquely identify the job we're currently +# running. +parser = argparse.ArgumentParser() +parser.add_argument( + "workflow_run_id", help="The id of the workflow run, should be GITHUB_RUN_ID" +) +parser.add_argument( + "runner_name", + help="The name of the runner to retrieve the job id, should be RUNNER_NAME", +) + +args = parser.parse_args() + + +PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch" +GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] +REQUEST_HEADERS = { + "Accept": "application/vnd.github.v3+json", + "Authorization": "token " + GITHUB_TOKEN, +} + +response = requests.get( + f"{PYTORCH_REPO}/actions/runs/{args.workflow_run_id}/jobs?per_page=100", + headers=REQUEST_HEADERS, +) + +jobs = response.json()["jobs"] +while "next" in response.links.keys(): + response = requests.get(response.links["next"]["url"], headers=REQUEST_HEADERS) + jobs.extend(response.json()["jobs"]) + +# Sort the jobs list by start time, in descending order. We want to get the most +# recently scheduled job on the runner. 
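As a sanity check on the selection logic that follows, here is the same sort-and-filter applied to a few made-up job records. The data is illustrative only, not real GitHub API output; the field names ("id", "runner_name", "started_at") match the job objects used above.

    jobs = [
        {"id": 101, "runner_name": "i-0abc", "started_at": "2022-03-15T01:00:00Z"},
        {"id": 102, "runner_name": "i-0def", "started_at": "2022-03-15T02:00:00Z"},
        {"id": 103, "runner_name": "i-0abc", "started_at": "2022-03-15T03:00:00Z"},
    ]
    # ISO-8601 timestamps in the same timezone sort chronologically as strings,
    # so reverse=True puts the most recently started job first.
    jobs.sort(key=lambda job: job["started_at"], reverse=True)
    # The first job whose runner_name matches is the one currently running on this runner.
    print(next(job["id"] for job in jobs if job["runner_name"] == "i-0abc"))  # prints 103
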
+jobs.sort(key=lambda job: job["started_at"], reverse=True) + +for job in jobs: + if job["runner_name"] == args.runner_name: + print(job["id"]) + exit(0) + +exit(1) diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 46070c25e632..4c43fc251fb1 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 +import os +import re +import tempfile from collections import defaultdict from datetime import datetime from typing import cast, Any, Dict, Iterator, List, Optional, Tuple, Union -import os -import re RE_GITHUB_URL_MATCH = re.compile("^https://.*@?github.com/(.+)/(.+)$") @@ -30,17 +31,17 @@ def fuzzy_list_to_dict(items: List[Tuple[str, str]]) -> Dict[str, List[str]]: def _check_output(items: List[str], encoding: str = "utf-8") -> str: - from subprocess import check_output, CalledProcessError + from subprocess import check_output, CalledProcessError, STDOUT try: - return check_output(items).decode(encoding) + return check_output(items, stderr=STDOUT).decode(encoding) except CalledProcessError as e: msg = f"Command `{' '.join(e.cmd)}` returned non-zero exit code {e.returncode}" stdout = e.stdout.decode(encoding) if e.stdout is not None else "" stderr = e.stderr.decode(encoding) if e.stderr is not None else "" if len(stderr) == 0: - msg += f"\n{stdout}" + msg += f"\n```\n{stdout}```" else: - msg += f"\nstdout:\n{stdout}\nstderr:\n{stderr}" + msg += f"\nstdout:\n```\n{stdout}```\nstderr:\n```\n{stderr}```" raise RuntimeError(msg) from e @@ -127,7 +128,15 @@ def current_branch(self) -> str: return self._run_git("symbolic-ref", "--short", "HEAD").strip() def checkout(self, branch: str) -> None: - self._run_git('checkout', branch) + self._run_git("checkout", branch) + + def fetch(self, ref: Optional[str] = None, branch: Optional[str] = None) -> None: + if branch is None and ref is None: + self._run_git("fetch", self.remote) + elif branch is None: + self._run_git("fetch", self.remote, ref) + else: + self._run_git("fetch", self.remote, f"{ref}:{branch}") def show_ref(self, name: str) -> str: refs = self._run_git('show-ref', '-s', name).strip().split('\n') @@ -185,8 +194,19 @@ def compute_branch_diffs(self, from_branch: str, to_branch: str) -> Tuple[List[s while len(from_values) > 0 and len(to_values) > 0: frc = self.get_commit(from_values.pop()) toc = self.get_commit(to_values.pop()) + # FRC branch might have PR number added to the title if frc.title != toc.title or frc.author_date != toc.author_date: - raise RuntimeError(f"Unexpected differences between {frc} and {toc}") + # HACK: Same commit were merged, reverted and landed again + # which creates a tracking problem + if ( + "pytorch/pytorch" not in self.remote_url() or + frc.commit_hash not in {"0a6a1b27a464ba5be5f587cce2ee12ab8c504dbf", + "6d0f4a1d545a8f161df459e8d4ccafd4b9017dbe", + "edf909e58f06150f7be41da2f98a3b9de3167bca", + "a58c6aea5a0c9f8759a4154e46f544c8b03b8db1", + "7106d216c29ca16a3504aa2bedad948ebcf4abc2"} + ): + raise RuntimeError(f"Unexpected differences between {frc} and {toc}") from_commits.remove(frc.commit_hash) to_commits.remove(toc.commit_hash) continue @@ -194,6 +214,17 @@ def compute_branch_diffs(self, from_branch: str, to_branch: str) -> Tuple[List[s from_commits.remove(commit) for commit in to_values: to_commits.remove(commit) + # Another HACK: Patch-id is not stable for commits with binary files or for big changes across commits + # I.e. 
cherry-picking those from one branch into another will change patchid + if "pytorch/pytorch" in self.remote_url(): + for excluded_commit in {"8e09e20c1dafcdbdb45c2d1574da68a32e54a3a5", + "5f37e5c2a39c3acb776756a17730b865f0953432", + "b5222584e6d6990c6585981a936defd1af14c0ba", + "84d9a2e42d5ed30ec3b8b4140c38dd83abbce88d", + "f211ec90a6cdc8a2a5795478b5b5c8d7d7896f7e"}: + if excluded_commit in from_commits: + from_commits.remove(excluded_commit) + return (from_commits, to_commits) def cherry_pick_commits(self, from_branch: str, to_branch: str) -> None: @@ -209,11 +240,17 @@ def cherry_pick_commits(self, from_branch: str, to_branch: str) -> None: self.cherry_pick(commit) self.checkout(orig_branch) - def push(self, branch: str, dry_run: bool) -> None: - if dry_run: - self._run_git("push", "--dry-run", self.remote, branch) - else: - self._run_git("push", self.remote, branch) + def push(self, branch: str, dry_run: bool, retry: int = 3) -> None: + for cnt in range(retry): + try: + if dry_run: + self._run_git("push", "--dry-run", self.remote, branch) + else: + self._run_git("push", self.remote, branch) + except RuntimeError as e: + print(f"{cnt} push attempt failed with {e}") + self.fetch() + self._run_git("rebase", f"{self.remote}/{branch}") def head_hash(self) -> str: return self._run_git("show-ref", "--hash", "HEAD").strip() @@ -237,6 +274,12 @@ def amend_commit_message(self, msg: str) -> None: self._run_git("commit", "--amend", "-m", msg) +def clone_repo(username: str, password: str, org: str, project: str) -> GitRepo: + path = tempfile.mkdtemp() + _check_output(['git', 'clone', f'https://{username}:{password}@github.com/{org}/{project}', path]).strip() + return GitRepo(path=path) + + class PeekableIterator(Iterator[str]): def __init__(self, val: str) -> None: self._val = val diff --git a/.github/scripts/gql_mocks.json b/.github/scripts/gql_mocks.json new file mode 100644 index 000000000000..1b97bf35f47e --- /dev/null +++ b/.github/scripts/gql_mocks.json @@ -0,0 +1,16082 @@ +{ + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "seemethere" + }, + "title": "ci: Migrate metrics credentials to managed IAM", + "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* __->__ #73811\n\r\nMigrates our credentials to upload metrics statistics to managed IAM\r\ncredentials in order to make it easier to know where the credentials are\r\ncoming from and to make it easier to add more permissions / less\r\npermissions later on.\r\n\r\nRelates to work done in [D34535827](https://www.internalfb.com/diff/D34535827)\r\n\r\nSigned-off-by: Eli Uriegas ", + "headRefName": "gh/seemethere/215/head", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "gh/seemethere/215/base", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "13c44d16a876a56bca479b4cf30715d21fa16e99" + } + }, + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7" + } + } + ], + "pageInfo": { + "endCursor": 
"Mg", + "hasNextPage": false + }, + "totalCount": 2 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOaHA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658275867" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276090" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276092" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276094" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276095" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276097" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276098" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815315?check_suite_focus=true" + } + ], + 
"pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276099" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Test tools" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276100" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276101" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQU=", + "hasNextPage": true + } + }, + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7" + } + } + ] + }, + "changedFiles": 3, + "files": { + "nodes": [ + { + "path": ".github/templates/common.yml.j2" + }, + { + "path": ".github/workflows/generated-macos-11-py3-x86-64.yml" + }, + { + "path": ".github/workflows/update_pytorch_labels.yml" + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "kit1980" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "janeyx99" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0wNFQxNDoyNDo0OC0wODowMLkyMDIyLTAzLTA0VDE0OjI0OjQ4LTA4OjAwzjWwwqA=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1988337976", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068270969 + }, + { + "bodyText": "@pytorchbot force merge this", + "author": { + "login": "seemethere" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068436128 + }, + { + "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1989076952", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068437098 + }, + { + "bodyText": "@pytorchbot merge this", + "author": { + "login": "seemethere" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1068482921 + }, + { + "bodyText": "Hey @seemethere.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1068484404 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOP6yFeQ==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=31093 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": false, + "isCrossRepository": true, + "author": { + "login": "mingxiaoh" + }, + "title": "improve mkldnn convolution test coverage", + "body": "This pr will improve the test coverage of mkldnn convolution.\r\n1.test input: specific sensitive numbers\r\n2.pass criteria: output of mkldnn convolution matches output of thnn convolution\r\n3.coverage: by using coverage tool, we found out the following sensitive parameters. Overall the case will test 4352 patterns, takes 8.8s on my machine.\r\n\r\nto run the test case:\r\n\r\npython test_mkldnn_conv2d_ext.py\r\nor\r\npython run_test.py -i mkldnn_conv2d_ext\r\n\r\nIn case of failure, the pattern will be printed in the log for further debugging.\r\n\r\nactually, this PR is created to replace and improve that PR we created before(https://github.com/pytorch/pytorch/pull/25085) ", + "headRefName": "master", + "headRepository": { + "nameWithOwner": "mingxiaoh/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "11pikachu" + }, + "email": "junx.du@intel.com", + "name": "dujun" + }, + "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "clang-format" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676797?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQYu8fQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281097" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676800?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676817?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676829?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676840?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQYu8qA==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281099" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "codecov/project", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + }, + { + "name": "codecov/patch", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQZhcFQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100822" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "codecov/patch", + "conclusion": "SUCCESS", + "detailsUrl": "https://codecov.io" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOQZZsEQ==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100824" + }, + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOUquzJg==", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1487517306" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHOWKm2eg==", + "hasNextPage": false + } + }, + "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9" + } + } + ] + }, + "changedFiles": 5, + "files": { + "nodes": [ + { + "path": "test/math_libraries/convolutions.py" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_googlenet_v3.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_maskrcnn_p1.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_mobilenet.json" + }, + { + "path": "test/math_libraries/convolutions_cases/shapes_resnet_50.json" + } + ], + "pageInfo": { + "endCursor": "NQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "CHANGES_REQUESTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "CHANGES_REQUESTED" + }, + { + 
"author": { + "login": "ailzhang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mingxiaoh" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mingxiaoh" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "VitalyFedyunin" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAxOS0xMi0zMFQxMDoxOToxMS0wODowMLkyMDE5LTEyLTMwVDEwOjE5OjExLTA4OjAwzhQZLuY=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 673760580 + }, + { + "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n result = test(self, *args)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 114, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid 
cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.\n\n@mruberry It is suggested by @VitalyFedyunin that, we need to display fail test to avoid invalid inputs, I guess we should set it as expected failures under the pytest test framework, right? we will change it as expected failure cases under pytest test framework. The result will looks like be low, is it ok?\n2500 passed, 136 skipped, 0 failed, 0 errors, 2 expected failures, 0 unexpected passes", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 673816925 + }, + { + "bodyText": "Displaying tests that fail is fine, but I don't think @VitalyFedyunin meant that it was OK if the tests didn't pass. If these are expected failures then yes, you can use with self.assertRaises(RuntimeError):... when testing them. If you also want to report that the test has test cases with these properties you can print or warn, which will appear in the test output.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 673858224 + }, + { + "bodyText": "Codecov Report\n\nMerging #31093 into master will not change coverage.\nThe diff coverage is n/a.\n\n\n@@ Coverage Diff @@\n## master #31093 +/- ##\n=======================================\n Coverage 68.00% 68.00% \n=======================================\n Files 382 382 \n Lines 49527 49527 \n=======================================\n Hits 33679 33679 \n Misses 15848 15848 \n\nContinue to review full report at Codecov.\n\nLegend - Click here to learn more\n\u0394 = absolute (impact), \u00f8 = not affected, ? = missing data\nPowered by Codecov. Last update 69f6d94...29f6aa6. Read the comment docs.", + "author": { + "login": "codecov" + }, + "authorAssociation": "NONE", + "editor": { + "login": "codecov" + }, + "databaseId": 686921371 + }, + { + "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. 
If you are unable to remove the Stale label please contact a maintainer in order to do so. Stale pull requests will automatically be closed 30 days after being marked Stale", + "author": { + "login": "pytorchbot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1095860944 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOKCjFRA==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=62ce809793481ce6ddce6e1a19d9b0761755ff0ff75decaf8a79419eaf793110 cursor=Y3Vyc29yOnYyOpHOKCjFRA== name=pytorch number=31093 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "Hi, @mingfeima @soumith @Jianhui-Li\nthis will improve the test coverage of mkldnn convolution, would you please review it?\nThe current code is forward only, do we need to cover backward, if yes, we can add backward.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 564806270 + }, + { + "bodyText": "@mingxiaoh, what is the value in testing DNNL as part of Pytorch validation for the Pytorch developers? Shouldn't having these tests run in DNNL validation be enough?", + "author": { + "login": "vpirogov" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 564808528 + }, + { + "bodyText": "@vpirogov The main value is to serve as a blind test to DNNL. If DNNL adds these test to DNNL test sets, it lost the value as a blind test. The spirit of validation is to cross check.\n@gottbrath @gchanan The test was developed per the request of Pytorch team. Mingxiao made an effort to reduce the execution time to a few second but still with good coverage. Although the test today is focused on DNNL, it could be easily extended to be blind test for any conv implementation used in Pytorch.", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 567826907 + }, + { + "bodyText": "@mruberry thanks for the comment. As for the chainer dependency, we import it is because we would like to use its testing function for pytest test cases combinations, other wise we need to write much more code to achieve same effect. So, can we use it?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 574563012 + }, + { + "bodyText": "@mingxiaoh You cannot import chainer. Looking at the code you should be able to achieve the same effect without it.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 575272358 + }, + { + "bodyText": "@mruberry ok, we will change it according to your requirement. Thanks", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 583917522 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/31093\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 29f6aa6 (more details on the Dr. CI page):\n\nCommit 29f6aa6 was recently pushed. Waiting for builds...\n\nThis comment was automatically generated by Dr. CI (expand for details).Follow this link to opt-out of these comments for your Pull Requests.\nPlease report bugs/suggestions to the (internal) Dr. 
CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "dr-ci" + }, + "authorAssociation": "NONE", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 628466876 + }, + { + "bodyText": "@mruberry how about those cudnn UT error? we add check for it but it should be NV to fix cudnn bugs.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 629955767 + }, + { + "bodyText": "Hey @mingxiaoh! You're right, of course, that you shouldn't have to fix cuDNN bugs. Would you please:\n\nAssert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update.\nFile a new issue explaining the behavior and providing a short PyTorch program to reproduce the issue.\n\nThen we can ping NVIDIA on that issue.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 629997129 + }, + { + "bodyText": "about the suggestion 'Assert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update. ', if we only assert it and continue the following test, I guess users might always ignore them in later test. Anyway, any similar example case for reference?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 630010734 + }, + { + "bodyText": "In this recent PR https://github.com/pytorch/pytorch/pull/38505/files, for example, you can see that the construction of bool tensors wasn't working properly, so the test author cited the relevant issue and asserted that the incorrect behavior happened, as expected. You can also see how these lines are being removed by https://github.com/pytorch/pytorch/pull/38392/files, which fixes the issue.\nAnother common pattern is to use with self.assertRaises(RuntimeError/AssertionError/etc.):.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 630014823 + }, + { + "bodyText": "@mruberry the failed UT case is not introduced by our modification, how to handle this issue?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631187735 + }, + { + "bodyText": "@mingxiaoh You mean the failures on ROCm? You may ignore them. Be sure to re-request review when you're ready.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 631191425 + }, + { + "bodyText": "@mruberry we already skipped those ROCm errors, but there are stil somel error caused by the original code, they are not introduced by our modification.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631886529 + }, + { + "bodyText": "I understand. Let me know when you're ready for me to review.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 631908011 + }, + { + "bodyText": "@mruberry thanks, we are ready for review now.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 631909442 + }, + { + "bodyText": "@mingxiaoh Great! 
I'll take a look ASAP.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 631910556 + }, + { + "bodyText": "@mruberry we just pull the latest code and updated the patch according to your comment, may you please help double check it? BTW, the new failed case in preci is not introduced by our modification.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 633430458 + }, + { + "bodyText": "@ailzhang would you please check the comment below? Thanks.\nIs there a reason why this TestConv2dExt is a new class instead a test inside TestNN?\n//comment: it is actually suggested by Tongzhou Wang in another thread before.\nAlthough this test sits in generic testing framework, it's actually comparing thnn/mkldnn/cudnn results specially. I feel it's better to make it truly generic so that it compares any device result with CPU result. Alternatively you can mark this test only run when torch.backends.mkldnn.is_available()=True\n//comment: but our goal is to compare the result with that of thnn. Anyway, if you insist, we can start to compare it with cpu.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 634432326 + }, + { + "bodyText": "Pruning reviewers. @ngimel, @VitalyFedyunin, this PR is looking pretty good from a test framework perspective. Would one of you like to review?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 634557563 + }, + { + "bodyText": "@mruberry Thanks, would you please help review it again. BTW: failed case is not introduced by our modification.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 635256214 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code", + "author": { + "login": "1pikachu" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637364148 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637444457 + }, + { + "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.\n\n@mruberry thank you", + "author": { + "login": "1pikachu" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637479226 + }, + { + "bodyText": "Improving test coverage of math libraries is certainly a good goal and this PR is moving towards it. I have some doubts about implementation decisions made, and about running this PR as part of regular pytorch CI.\nIf the primary goal of this PR is to test correctness of the convolution implementations in the vendor library, then it does not serve this purpose. The absolute majority of the 4000+ test cases come from group 1, where different kernel sizes/strides/dilations are used to produce the output of size 1x1. 
This can test whether pytorch correctly passes convolution parameters to the backends (although there are cheaper ways to do that), but as actual library correctness check it is almost useless - libraries use very different kernels depending in the input/output sizes, and tests with toy sizes like this don't invoke the real bread-and-butter kernels.\nAlso, if this test suite is meant as primary a means of testing vendor libraries (which is a good goal!) it does not have a place as a part of pytorch regular CI, and should be run when the corresponding vendor libraries are updated. I'd suggest moving this test out into a separate file (maybe even outside of torch/test directory) and have it as a part of library update/qualification process rather than regular CI.\nAlso, if the primary goal is to enable easier testing of vendor libraries correctness, perhaps we should rethink the mechanism of the generation of test cases. It should be easy to add a test case with a particular set of parameters that was found to be buggy. Also, running a cross-product of cases in a multi-dimensional space (as this PR does) is rarely an efficient way of getting a signal, some forms of random sampling usually provide a way to get better correctness signal why using less resources.\nAlso, when testing libraries it is important to test both forward and backward functions, whereas this PR does forward only. I'm openminded on whether convTransposed should be tested or not - if we are testing vendor libraries, then it's not necessary, convTransposed calls the same underlying functions, if we are testing pytorch, then it makes sense to test it separately because it takes different codepaths.", + "author": { + "login": "ngimel" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637827507 + }, + { + "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637912105 + }, + { + "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.\n\nWe know this PR has been open for awhile and we respect that your time is valuable, but we want to make sure we're making the right change here, and I think @ngimel's comments reflect that and should not be too difficult to address. As I understand, her points are:\n\nThis is a good PR with an exciting idea. To let it run longer and test more cases maybe it should run outside the regular PyTorch CI.\nTo remedy this, let's create a test/math_libraries folder and put this test there: test/math_libaries/convolutions.py. 
Yes, this is different from our requests in the past, which is our mistake, but it should be an easy change.\nTo make the test more interesting it'd be good for the test cases to resemble convolutions used in practice. The current test cases seem like similar \"toy\" examples. Without time pressure we should be able to run larger, more computationally intensive convolutions.\nLet's change the test cases to include some practical convolutions, make it easy to add test cases, and think about how we might generate other interesting cases. (We should also test backwards once we have more time!)\n\nAnd I think these are good points. Maybe the PR doesn't create a new way to generate interesting convolutions to start and instead only runs a few representative convolutions, but @ngimel is positioning the work for success so that it's useful and we can continue to improve on it in the future.\nDoes that make sense?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637924703 + }, + { + "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": { + "login": "mingxiaoh" + }, + "databaseId": 637960626 + }, + { + "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.\n\nI'm sorry, I don't think I've talked to @Jianhui-Li before. It's true that the team we expressed a concern about timing if the test was to be run in the CI initially, but I think now that we understand what the test is trying to do better we're not sure the CI is the best place for it. The PR was also closed after a lengthy period of inactivity, and we assumed it had simply been abandoned.\nDo you know who @Jianhui-Li spoke with about this issue originally? Maybe I can follow-up with them for more context.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 637967153 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 637978356 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. 
Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 638446723 + }, + { + "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.\n\nLet me sync with Mingxiao and follow up with this. Thanks.", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 638451670 + }, + { + "bodyText": "@mruberry would you please help review it again?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 653028208 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 654443242 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 656062287 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 658071151 + }, + { + "bodyText": "super nit: renaming files to .json will make it more IDE friendly.", + "author": { + "login": "VitalyFedyunin" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 658464685 + }, + { + "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.\n\nCool! I took a look with @ngimel, once these issues are addressed I think we're good to go!", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 659164401 + }, + { + "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? 
Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 660884305 + }, + { + "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.\n\nUpdated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 662678464 + }, + { + "bodyText": "Updated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.\n@mruberry we have finished the modification according to your comment, would you please review it again? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 662930687 + }, + { + "bodyText": "The code looks good, but I tried running the test suite and hit the following failures:\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 102, in test_conv2d_ext\n msg=msg\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float16, group:1, batchsize:22input channel:448, output channel:384, bias:False, padding:[1, 1], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 102, in test_conv2d_ext\n msg=msg\n File 
\"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float32, group:1, batchsize:22input channel:80, output channel:192, bias:False, padding:[0, 0], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n method(*args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n result = test(self, device_arg, dtype)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n return fn(self, device, *args, **kwargs)\n File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n return fn(slf, device, *args, **kwargs)\n File \"convolutions.py\", line 106, in test_conv2d_ext\n \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\nLooking at the first invalid convolution, for example, it's:\n {\n \"case_name\":\"masknet_p1:conv33\",\n \"mb\":1,\n \"g\":1,\n \"ic\":512,\n \"ih\":64,\n \"iw\":64,\n \"oc\":12,\n \"kh\":1,\n \"kw\":1,\n \"sh\":1,\n \"sw\":1,\n \"ph\":0,\n \"pw\":0,\n \"dh\":0,\n \"dw\":0,\n \"bias\":\"False\"\n },\n\nwhich has a dh and dw of zero, causing it to be added to invalid cases here:\ndh, dw = case['dh'], case['dw']\n has_bias = case['bias']\n if dh == 0 or dw == 0:\n invalid_cases.append(case_name)", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "mruberry" + }, + "databaseId": 663240268 + }, + { + "bodyText": "@mruberry the failure was not detected is because we did not 
export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 664373079 + }, + { + "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.\n\nBefore I run these tests again, is an atol of 1e-2 needed for all types or just half? Also, how does 1e-2 compare to the values that are being compared?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 664569507 + }, + { + "bodyText": "@mruberry 1e-2 is experimental result, details see below, random means it might be failed sometimes.\n\n\n\natol,rtol\n1e-2,1e-2\n1e-2,1e-3\n1e-3,1e-2\n1e-3,1e-3\n1e-4,1e-3\n1e-3,1e-4\n1e-4,1e-4\n1e-4,1e-5\n1e-5,1e-4\n\n\n\n\nCuda float16\npass\npass\npass\npass\npass\nfail\nFail\nFail\nfail\n\n\nCuda float32\npass\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nfail", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 666894774 + }, + { + "bodyText": "@mruberry would you please find time to review it again? Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 668380451 + }, + { + "bodyText": "@mruberry would you please find time to review it again? Thanks.\n\nI was just about to try and run this again locally but it looks like the files describing the convolutions are missing?", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 670306210 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 670322557 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 670591170 + }, + { + "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.\n\n@mruberry sorry, we add them now, would you please check it again? 
Thanks.", + "author": { + "login": "mingxiaoh" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 673402901 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOIapCfg==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": false, + "isCrossRepository": false, + "author": { + "login": "malfet" + }, + "title": "Dummy change with lots of commits", + "body": "Draft PR with 100+ commits, to test mergebot ", + "headRefName": "malfet/pr-with-lots-of-commits", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "3067f2240afc7a29dc348000aa19eccbd9772303" + } + }, + { + "commit": { + "author": { + "user": { + "login": "andrewor14" + }, + "email": "andrewor@fb.com", + "name": "Andrew Or" + }, + "oid": "2f655b71f70c496c4e645f6cdb27d7bb7e825701" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "0c6dcaa7f58a19c42a530f4ee14bb6f0f03ca9fb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "cad11c563d41ebcffb1683fe1f1288b8157413b3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "alanwaketan" + }, + "email": "jwtan@fb.com", + "name": "Jiewen Tan" + }, + "oid": "4dfd0875a68d87fccb5ad0d81692db480043b86e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "2d37e74690582a4a26890e4c8b98f1f80e589c82" + } + }, + { + "commit": { + "author": { + "user": { + "login": "alanwaketan" + }, + "email": "jwtan@fb.com", + "name": "Jiewen Tan" + }, + "oid": "d4aee60947e1a3ef23c7c42990621e0746fdd0a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "aac6204bf710beb5e50a383d426ae6222396335a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "4b0362cab884584c24f5834b3874f5f357f56b5d" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "7536df613cbc645a9e68e6a3b0a8450753260fd1" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "20a50cb966d28d7bf82924adf781cf72a01ef90e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "486387e8644afb46edff5aa5925b55c8119f67f0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "acb9d78b9b732d3667b881727e6ed9f92a8c549f" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "683bb7959a5b973f8470c081ad02e8fc508e784a" + } + 
}, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "a870cb40af65adf0b77d55f6b554d7093d284d7a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Krovatkin" + }, + "email": "korovaikon@gmail.com", + "name": "Nikolay Korovaiko" + }, + "oid": "70793b9f328ddf52cc86336104c3a064c8582ef4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "suo" + }, + "email": "suo@fb.com", + "name": "Michael Suo" + }, + "oid": "f70b31f62b1c5159eef2725484b175983517c88c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "04d3ec1db60defe1c6904bf77e9f8dfa87dc0b63" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "46b754a55b63e3168ad5854ad412c124934b675d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "robieta" + }, + "email": "taylorrobie@fb.com", + "name": "Taylor Robie" + }, + "oid": "13df69e13ee571fdd716139419a00aec47ade7d6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "70642e911ec80a47cdbf4a50aac475c11aa129b6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "59bb7c39384bf3e0b284a037adef8b3caa53c1c4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "007cfb97b55d70ff63e1ed71d1a674638f847376" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "0a7b858a5af1393fa3cf2853f92eca0e1d408dde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "7917d789f0a523715041ade5177d271082628236" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kit1980" + }, + "email": "sdym@fb.com", + "name": "Sergii Dymchenko (Meta Employee)" + }, + "oid": "91eb6017f0fb8a1b29e8cb48fac93bc9709f73b3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "bd04dca5fabb0c2a51ac87063a515f256ef274fa" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dagitses" + }, + "email": "mikeyd@fb.com", + "name": "Michael Andreas Dagitses" + }, + "oid": "1f805a5defda7dabc49d0059edb9ccb06bc29352" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mruberry" + }, + "email": "mruberry@fb.com", + "name": "Mike Ruberry" + }, + "oid": "4982c0a8db8f23d15ec4bfcbca4ce939afc04954" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pearu" + }, + "email": "pearu.peterson@gmail.com", + "name": "Pearu Peterson" + }, + "oid": "28502265cb5925cb7db8dcb2dd2334963092714a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "e03fcaedb1342e6d65c7f7f20243000938ba60b2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pritamdamania" + }, + "email": "pritam.damania@fb.com", + "name": "pritam" + }, + "oid": "efb28f5a1a5d18aa96bd668ab2ab5c651be359f3" + } + }, + { + 
"commit": { + "author": { + "user": { + "login": "MagiaSN" + }, + "email": "magialiao@tencent.com", + "name": "magialiao" + }, + "oid": "52cc1b9994f861ebdd3908759ed1ab11cba1f8de" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "3cd99f23d1acd6a5bedf6f3b02be79d64350a5b6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + "name": "Andrew Gu" + }, + "oid": "b00502c634a5146f4d996bd90e84d317f049e7b0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "davidberard98" + }, + "email": "dberard@fb.com", + "name": "David Berard" + }, + "oid": "976eb7cee799dddfbe6a4122b249aaee1b6c8854" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "9608ab28744d5cae32f371490557b248c9549c66" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "4e119f0c39eb5ff0777f0e71561e6b633d85fb34" + } + }, + { + "commit": { + "author": { + "user": { + "login": "rohan-varma" + }, + "email": "rvarm1@fb.com", + "name": "Rohan Varma" + }, + "oid": "447580dc565f3660eddb2c996c6ed25b88338684" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "2bc8f43e9233008ea23053fab87b83ab36fca5e3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "c13a8e891c3e3e714f60649ca1e3b082e090e9fe" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "fddc861b7ee473f57d3c2161e4618a2663a237e8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jiyuanzFB" + }, + "email": "jiyuanz@fb.com", + "name": "Jiyuan Zhang" + }, + "oid": "e2336dbc539d6c021720cbe43c92c9e4c8463299" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "26e2759d1ad59aac12168b74d1ca55e42ba9455c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "ad7aa914ee3b3d1252e31514f010ba96c40aae87" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "f113c5d78065aafbe7b1c0e611945bfe9f67b3c0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "a366fd01136292544b7862968ae92feba4b6d8fe" + } + }, + { + "commit": { + "author": { + "user": { + "login": "seemethere" + }, + "email": "eliuriegas@fb.com", + "name": "Eli Uriegas" + }, + "oid": "afeba0773749da5883c378a2e6ac066e1ce62ca0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bdhirsh" + }, + "email": "hirsheybar@fb.com", + "name": "Brian Hirsh" + }, + "oid": "d306c99addc543908f64666baeecacbd0749f4a7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + "name": "Andrew Gu" + }, + "oid": "c2456ea658f41f64ea054a422edf22a9c977399f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "awgu" + }, + "email": "andgu@fb.com", + 
"name": "Andrew Gu" + }, + "oid": "a8b0a1b681c9fe41e0d553c962a5c93e81d92503" + } + }, + { + "commit": { + "author": { + "user": { + "login": "anjali411" + }, + "email": "chourdiaanjali123@gmail.com", + "name": "anjali411" + }, + "oid": "af761d9a5d058c9188f16589bae4f307d35185be" + } + }, + { + "commit": { + "author": { + "user": { + "login": "clee2000" + }, + "email": "csl@fb.com", + "name": "Catherine Lee" + }, + "oid": "beceb417baef35b15c2716e23178fb49f7fd6f9d" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "1516554e22136db89d0aeba43a1a1a987e995d68" + } + }, + { + "commit": { + "author": { + "user": { + "login": "qihqi" + }, + "email": "qihan@fb.com", + "name": "Han Qi" + }, + "oid": "68eb1fa8374eff6cbdcf0be5e37ed6775d22e722" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "3c7bcb99b5c0c879c2610f427880b03881f82f38" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "38c1a2028090353e40a019c673c9ab16b39e4825" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "8091cbea2c95ed2c4c406b3c61547a27c6319bae" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "d81f59121969a47c8b2213a88e02cf9be0219be9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "20d798b319cd107a767fe220f7a3027c18a1c844" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "eb35381a770b58c1cd41e935910cb4df2f3d8f14" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "e6498a657b9aa47546dcd92d1b4ffb2e1a50ebdb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "7f821382db5ad08efe5b09a145c606852b8a9272" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "995c0e11a97d854ff969962bd81d7341e46ecb07" + } + }, + { + "commit": { + "author": { + "user": { + "login": "davidberard98" + }, + "email": "dberard@fb.com", + "name": "David Berard" + }, + "oid": "28d6258e62c9fc361a18689877c962c69889dc23" + } + }, + { + "commit": { + "author": { + "user": { + "login": "HarborYuan" + }, + "email": "yuanhaobo@whu.edu.cn", + "name": "Haobo Yuan" + }, + "oid": "2350fad8391367ebf81c7236a2c883644b4ff622" + } + }, + { + "commit": { + "author": { + "user": { + "login": "zou3519" + }, + "email": "zou3519@gmail.com", + "name": "Richard Zou" + }, + "oid": "3f789c9ccecdd7e2e52269453646e992a68c6b92" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jeffdaily" + }, + "email": "jeff.daily@amd.com", + "name": "Jeff Daily" + }, + "oid": "20f79f610c1a3314da96d49515bbfbee9442e4f8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "5823958f047f3b71a5dc8c52a20eb8ae3291bd3e" + } + }, + { 
+ "commit": { + "author": { + "user": { + "login": "peterbell10" + }, + "email": "peterbell10@live.co.uk", + "name": "Peter Bell" + }, + "oid": "a0b15c49ecf3844daf2c0dcaef44f0214259db20" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "4afc38c25ca2ca126ba4987a419a58a5c572223b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "b606f58d4a36683fbe0a7d02adfdde7d5cc694c2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "2d61b4d630f6482a6c3cc7437091fad6d27c347e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "george-qi" + }, + "email": "georgeqi94@gmail.com", + "name": "George Qi" + }, + "oid": "bc5384c47036a6cda94129f3e2f9e43c43393698" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "60fc3277634365b64465712b13db2acb76d6c890" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "1b8762e95bc38d1847fe99ed3230546c8b800bfd" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jerryzh168" + }, + "email": "jerryzh168@gmail.com", + "name": "Jerry Zhang" + }, + "oid": "6acf60f95f59ecbc6e8ce830dea0abba7d3ec763" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ysiraichi" + }, + "email": "yukio.siraichi@gmail.com", + "name": "Yukio Siraichi" + }, + "oid": "8fb0276561fdd530c5a06ea195e930e0584f8705" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "1da7aed95a8700406671425eac1e4bbc2c7a24b5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "thiagocrepaldi" + }, + "email": "thiago.crepaldi@microsoft.com", + "name": "Thiago Crepaldi" + }, + "oid": "83208e7dee4503c1bee1df9f6632794694dffa01" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kshitij12345" + }, + "email": "kshitijkalambarkar@gmail.com", + "name": "kshitij12345" + }, + "oid": "1a46cf08dcd3d3564604c17b2c02d7e4eb45a7ff" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "b7f9b6689445f826c83694652fea5f7cfc7070d7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "fatcat-z" + }, + "email": "jiz@microsoft.com", + "name": "Jay Zhang" + }, + "oid": "f273961c1696b156e35f8c76f7ad37934031050d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pavithranrao" + }, + "email": "pavithran@fb.com", + "name": "Pavithran Ramachandran" + }, + "oid": "eb410a51fcbc716873fd80a970eb932d4aaaea61" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "7dbb12cdc02332fa64264ed0df576511a5070d7e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "43675665fa6b5154de8b25125dd03d7be35c884f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": 
"6c4d23c402c413667463770d9a2fa801f493d3c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "cf3778a35129a40dee14366515201b7ed2c0f346" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "9d00a051373cb81f79cb6375942cf3ec9fff2fe6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "1eae67cf404aa8dffb80b8e85180f943878d52a6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "ce0e69dcda0fe41a6e964d6ac70ce8016979c71a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "swolchok" + }, + "email": "swolchok@fb.com", + "name": "Scott Wolchok" + }, + "oid": "6faba554f6e49777f24911928edb3061b6ed0e3d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "IvanYashchuk" + }, + "email": "ivan.yashchuk@aalto.fi", + "name": "Ivan Yashchuk" + }, + "oid": "d1d0e03f57a359f8f95331f9a34b8bed3e7cc845" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Chillee" + }, + "email": "chilli@fb.com", + "name": "Horace He" + }, + "oid": "bb46bd9233a9fc631802a902cb48a4c13c2722ca" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mehtanirav" + }, + "email": "niravmehta@fb.com", + "name": "Nirav Mehta" + }, + "oid": "3b1007fe4be12e483f2620fbac67cae42e703efc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mehtanirav" + }, + "email": "niravmehta@fb.com", + "name": "Nirav Mehta" + }, + "oid": "b4b65228dd0c109f5fdf17c7d9e56f60a98e398b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "d629e300705196d3ae0bac5ed983b197101fa2ee" + } + }, + { + "commit": { + "author": { + "user": { + "login": "bigfootjon" + }, + "email": "jonjanzen@fb.com", + "name": "Jon Janzen" + }, + "oid": "52754b9e515f378f8476ad44d75b0a692bad8cde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "samdow" + }, + "email": "samdow@fb.com", + "name": "samdow" + }, + "oid": "128c3ad747093f4970329a82c7c4720420faeff2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "arindamroy-eng" + }, + "email": "61168652+arindamroy-eng@users.noreply.github.com", + "name": "arindamroy-eng" + }, + "oid": "2a0bda7d32a5bcc9827f7254a7b77cceb16ba973" + } + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + }, + "totalCount": 131 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNRg4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693698" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": 
"https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693712" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693725" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693741" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693761" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693774" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099388390?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNR-Y=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694412" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431378?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431511?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431693?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431829?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432018?check_suite_focus=true" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432195?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432331?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuN84s=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694417" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099430906?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431117?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431312?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431677?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099431819?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432057?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432191?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432334?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432446?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432577?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432685?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432822?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099432932?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433128?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433280?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433402?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433542?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / 
build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433675?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433758?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099433859?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099554424?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099554523?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557184?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557310?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557449?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557512?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557588?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557655?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557717?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099557795?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099565740?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099565906?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099565972?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099566036?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099580613?check_suite_focus=true" + }, 
+ { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099580676?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099608194?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099608322?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099608371?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099619007?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099645951?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099646089?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099685555?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099685664?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099685757?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099689530?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099757872?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099757955?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099898234?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6099898323?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVD9M=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694439" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRuc=", + "hasNextPage": false + } + }, + 
"oid": "5696e8357cf38f852ef3d680381513e26f202371" + } + } + ] + }, + "changedFiles": 348, + "files": { + "nodes": [ + { + "path": ".circleci/cimodel/data/pytorch_build_data.py" + }, + { + "path": ".circleci/cimodel/data/pytorch_build_definitions.py" + }, + { + "path": ".circleci/scripts/cpp_doc_push_script.sh" + }, + { + "path": ".circleci/scripts/python_doc_push_script.sh" + }, + { + "path": ".github/actions/checkout-pytorch/action.yml" + }, + { + "path": ".github/merge_rules.json" + }, + { + "path": ".github/scripts/gitutils.py" + }, + { + "path": ".github/scripts/gql_mocks.json" + }, + { + "path": ".github/scripts/trymerge.py" + }, + { + "path": ".github/workflows/_bazel-build-test.yml" + }, + { + "path": ".github/workflows/_linux-build.yml" + }, + { + "path": ".github/workflows/_linux-test.yml" + }, + { + "path": ".github/workflows/_mac-test.yml" + }, + { + "path": ".github/workflows/_rocm-test.yml" + }, + { + "path": ".github/workflows/_win-test.yml" + }, + { + "path": ".github/workflows/buck_build_test.yml" + }, + { + "path": ".github/workflows/lint.yml" + }, + { + "path": ".github/workflows/periodic.yml" + }, + { + "path": ".github/workflows/pull.yml" + }, + { + "path": ".github/workflows/trunk.yml" + }, + { + "path": ".jenkins/pytorch/macos-test.sh" + }, + { + "path": ".jenkins/pytorch/test.sh" + }, + { + "path": ".jenkins/pytorch/win-test.sh" + }, + { + "path": ".lintrunner.toml" + }, + { + "path": "BUILD.bazel" + }, + { + "path": "CODEOWNERS" + }, + { + "path": "README.md" + }, + { + "path": "aten/src/ATen/BatchingRegistrations.cpp" + }, + { + "path": "aten/src/ATen/Dispatch.h" + }, + { + "path": "aten/src/ATen/ExpandUtils.h" + }, + { + "path": "aten/src/ATen/FunctionalInverses.cpp" + }, + { + "path": "aten/src/ATen/FunctionalStorageImpl.cpp" + }, + { + "path": "aten/src/ATen/FunctionalStorageImpl.h" + }, + { + "path": "aten/src/ATen/FunctionalTensorWrapper.cpp" + }, + { + "path": "aten/src/ATen/FunctionalTensorWrapper.h" + }, + { + "path": "aten/src/ATen/FunctionalizeFallbackKernel.cpp" + }, + { + "path": "aten/src/ATen/NestedTensorImpl.cpp" + }, + { + "path": "aten/src/ATen/OpMathType.h" + }, + { + "path": "aten/src/ATen/SparseCsrTensorUtils.h" + }, + { + "path": "aten/src/ATen/ThreadLocalState.cpp" + }, + { + "path": "aten/src/ATen/ThreadLocalState.h" + }, + { + "path": "aten/src/ATen/autocast_mode.cpp" + }, + { + "path": "aten/src/ATen/autocast_mode.h" + }, + { + "path": "aten/src/ATen/core/SymIntArrayRef.cpp" + }, + { + "path": "aten/src/ATen/core/SymIntArrayRef.h" + }, + { + "path": "aten/src/ATen/core/TensorBase.h" + }, + { + "path": "aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h" + }, + { + "path": "aten/src/ATen/core/dispatch/Dispatcher.h" + }, + { + "path": "aten/src/ATen/core/interned_strings.h" + }, + { + "path": "aten/src/ATen/core/ivalue.cpp" + }, + { + "path": "aten/src/ATen/core/ivalue.h" + }, + { + "path": "aten/src/ATen/core/ivalue_inl.h" + }, + { + "path": "aten/src/ATen/core/jit_type.h" + }, + { + "path": "aten/src/ATen/core/jit_type_base.h" + }, + { + "path": "aten/src/ATen/core/type.cpp" + }, + { + "path": "aten/src/ATen/cuda/CUDASparse.h" + }, + { + "path": "aten/src/ATen/cuda/llvm_complex.cpp" + }, + { + "path": "aten/src/ATen/cuda/llvm_jit_strings.h" + }, + { + "path": "aten/src/ATen/native/Blas.cpp" + }, + { + "path": "aten/src/ATen/native/Itertools.cpp" + }, + { + "path": "aten/src/ATen/native/LinearAlgebra.cpp" + }, + { + "path": "aten/src/ATen/native/SoftMax.cpp" + }, + { + "path": "aten/src/ATen/native/TensorConversions.cpp" + }, + 
{ + "path": "aten/src/ATen/native/TensorShape.cpp" + }, + { + "path": "aten/src/ATen/native/TensorShape.h" + }, + { + "path": "aten/src/ATen/native/Unique.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu" + }, + { + "path": "aten/src/ATen/native/cuda/CUDAJitLoops.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/JitLoops.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/Lerp.cu" + }, + { + "path": "aten/src/ATen/native/cuda/PersistentSoftmax.cuh" + }, + { + "path": "aten/src/ATen/native/cuda/SoftMax.cu" + }, + { + "path": "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu" + }, + { + "path": "aten/src/ATen/native/cuda/Unique.cu" + }, + { + "path": "aten/src/ATen/native/cuda/jit_utils.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/jit_utils.h" + }, + { + "path": "aten/src/ATen/native/native_functions.yaml" + }, + { + "path": "aten/src/ATen/native/nested/NestedTensorMath.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/Linear.cpp" + }, + { + "path": "aten/src/ATen/native/quantized/cudnn/utils.h" + }, + { + "path": "aten/src/ATen/native/sparse/SparseCsrTensor.cpp" + }, + { + "path": "aten/src/ATen/native/ts_native_functions.yaml" + }, + { + "path": "aten/src/ATen/record_function.cpp" + }, + { + "path": "aten/src/ATen/record_function.h" + }, + { + "path": "aten/src/ATen/templates/Operators.h" + }, + { + "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp" + }, + { + "path": "aten/src/ATen/test/basic.cpp" + }, + { + "path": "aten/src/ATen/test/vmap_test.cpp" + }, + { + "path": "binaries/record_function_benchmark.cc" + }, + { + "path": "c10/core/DispatchKey.cpp" + }, + { + "path": "c10/core/DispatchKey.h" + }, + { + "path": "c10/core/DispatchKeySet.h" + }, + { + "path": "c10/test/core/DispatchKeySet_test.cpp" + }, + { + "path": "c10/util/ArrayRef.h" + }, + { + "path": "caffe2/core/tensor.h" + }, + { + "path": "docs/source/conf.py" + }, + { + "path": "docs/source/fx.rst" + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + } + }, + "reviews": { + "nodes": [], + "pageInfo": { + "startCursor": null, + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:hongxiayang,janeyx99,mehdimashayekhi,tvalentius,yidawang-oss, ...", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104214220 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:zou3519,abhikrish,mehtanirav,wconstab,lc0, ...", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104215370 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104220908 + }, + { + "bodyText": "@pytorchbot merge this", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104378397 + }, + { + "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet\nRaised by https://github.com/pytorch/pytorch/actions/runs/2197877090", + "author": { + 
"login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1104379712 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQdD4zA==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=9a7ea963024cb39819e4a560d8d95f41bb3e0dad12c6f05539a994d6f7c38c34 cursor=MTAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "clee2000" + }, + "email": "csl@fb.com", + "name": "Catherine Lee" + }, + "oid": "7f560351ae04ea43e58fbfda885bcf216aa26cde" + } + }, + { + "commit": { + "author": { + "user": { + "login": "pytorchmergebot" + }, + "email": "pytorchmergebot@users.noreply.github.com", + "name": "PyTorch MergeBot" + }, + "oid": "e8677ed168a036bc7e590d800fe98dd15f10581b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "robieta" + }, + "email": "taylorrobie@fb.com", + "name": "Taylor Robie" + }, + "oid": "ac5611caa13642ef8dbe0db453b283b42cbd900b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "robieta" + }, + "email": "taylorrobie@fb.com", + "name": "Taylor Robie" + }, + "oid": "1184afbd3bfde0f46133aef09e55e18d3bfb3c3e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "msi@fb.com", + "name": "Min Si" + }, + "oid": "1c05604f3d049c67dc678d0295c0add470bff3dc" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "eellison@devfair044.h1.fair", + "name": "Elias Ellison" + }, + "oid": "76ab5101bd36e8d73637d31bbea125240b7b27f0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "eellison@devfair044.h1.fair", + "name": "Elias Ellison" + }, + "oid": "c774050e92c3d8e52968e1eb635dd3e9491104b3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "guoyejun" + }, + "email": "yejun.guo@intel.com", + "name": "Guo Yejun" + }, + "oid": "8981595c5361f07186f4534f3be71f1d829a3046" + } + }, + { + "commit": { + "author": { + "user": { + "login": "BowenBao" + }, + "email": "bowbao@microsoft.com", + "name": "BowenBao" + }, + "oid": "036f362904024ac9481248965009f312bec6656b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "457d994933f164a9fd70da5ca2733dd6c046a28b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "janeyx99" + }, + "email": "janeyx@fb.com", + "name": "Jane Xu" + }, + "oid": "f49ebc77520774e71722111d554a0215a26956df" + } + }, + { + "commit": { + "author": { + "user": { + "login": "mikeiovine" + }, + "email": "mikeiovine@fb.com", + "name": "Mike Iovine" + }, + "oid": "f069e1a4a5f98d3fe961e4fc562ede59f59b4026" + } + }, + { + "commit": { + "author": { + "user": { + "login": "salilsdesai" + }, + "email": "salilsdesai@fb.com", + "name": "Salil Desai" + }, + "oid": "30bccf58393b288412a0f5a2423a1a41ffce258e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "angelayi" + }, + "email": "angelayi@fb.com", + "name": "Angela Yi" + }, + "oid": "f4ba440fe8a632c1ee88e01f7746a8a92c8f3902" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "shirong@fb.com", + "name": "Shirong Wu" + }, + "oid": "d203346c93ba96d626c6c02910888198c789ba69" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jamesr66a" + }, + "email": "jamesreed@fb.com", + "name": "James Reed" + }, + "oid": "73a4e34963e212b799a191fd031d2fa31d17e0ac" + } + }, + { + "commit": { + "author": { + "user": { + "login": "Krovatkin" + }, + 
"email": "korovaikon@gmail.com", + "name": "Nikolay Korovaiko" + }, + "oid": "b9d5206dfb46f09f953aba3ffb0e1e33a99032ee" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "12114e6937573fead54e11ae6cdebe5b31dee302" + } + }, + { + "commit": { + "author": { + "user": { + "login": "s4ayub" + }, + "email": "shababayub@fb.com", + "name": "Shabab Ayub" + }, + "oid": "f2323f76ad6f7f590285bf9c6d20c14a79542563" + } + }, + { + "commit": { + "author": { + "user": { + "login": "jaglinux" + }, + "email": "jagdish.krishna@gmail.com", + "name": "Jagadish Krishnamoorthy" + }, + "oid": "acd4b5abe2739c09c1a02524eceda46ff93fd385" + } + }, + { + "commit": { + "author": { + "user": { + "login": "cccclai" + }, + "email": "chenlai@fb.com", + "name": "Chen Lai" + }, + "oid": "04179f533283132fa334a9f91a070b1712f7323d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "zaxtax" + }, + "email": "rob@zinkov.com", + "name": "Rob Zinkov" + }, + "oid": "5097cdcd6994ad82b3cec942b70e75dbeaee8ca4" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "5015ecb5a2b86943f457d71f5a977444dd062732" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "1c42b7789d3966cd541b08fce359b9738fee69f6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "albanD" + }, + "email": "albandes@fb.com", + "name": "Alban Desmaison" + }, + "oid": "893ac3d334fd3e85e22423a06fe986ce453fe304" + } + }, + { + "commit": { + "author": { + "user": { + "login": "emcastillo" + }, + "email": "ecastill@preferred.jp", + "name": "Emilio Castillo" + }, + "oid": "aa5d1b6b031ee2b8bb85f793a842ac1327ae4a19" + } + }, + { + "commit": { + "author": { + "user": { + "login": "dzdang" + }, + "email": "dzdang@umich.edu", + "name": "dzdang" + }, + "oid": "0707a1d00f33d7098f56de339cb30436e8c2ea44" + } + }, + { + "commit": { + "author": { + "user": { + "login": "NivekT" + }, + "email": "ktse@fb.com", + "name": "Kevin Tse" + }, + "oid": "ccb082d42af99f6374183cf914cc712bac585f0f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ryandaryl" + }, + "email": "ryandarylmills@gmail.com", + "name": "ryandaryl" + }, + "oid": "4f2909cc8747808786a1871b0a6825cc4566f48c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "clee2000" + }, + "email": "csl@fb.com", + "name": "Catherine Lee" + }, + "oid": "f764010648a29223d9ed4b955073d9d2fb1b2f43" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "5696e8357cf38f852ef3d680381513e26f202371" + } + } + ], + "pageInfo": { + "endCursor": "MTMx", + "hasNextPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=76123 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "kumpera" + }, + "title": "Introduce distributed checkpoint with ShardedTensor.", + "body": "Co-authored-by: Wen Zhang \r\nCo-authored-by: Yifu Wang \r\n\r\n", + "headRefName": "st_checkpoint", + "headRepository": { + "nameWithOwner": "kumpera/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + 
"name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "6bf248bc20a71f248064b795f38276326fe43aae" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "10f84fb90bf02d7062e565ebf2c1da6352b64db7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "kumpera" + }, + "email": "kumpera@fb.com", + "name": "Rodrigo Kumpera" + }, + "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747" + } + } + ], + "pageInfo": { + "endCursor": "Mw", + "hasNextPage": false + }, + "totalCount": 3 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS2l4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755666" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234164?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2r3Q=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755785" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234165?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234428?check_suite_focus=true" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234555?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234642?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234701?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234761?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299234837?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2shU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755786" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + 
}, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299245858?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299245958?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246168?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246250?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246281?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246329?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246373?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246442?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246517?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246547?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246591?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246687?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246843?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299246972?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247064?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247163?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247261?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247380?check_suite_focus=true" + }, + { + "name": 
"pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247471?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299247519?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299305596?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299305656?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299307925?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299307961?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308001?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308035?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308082?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308120?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308169?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299308217?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299312986?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313146?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313195?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313235?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299313977?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/6299314888?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299314937?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332358?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332420?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332476?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299332526?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299335580?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299375031?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299375079?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299377190?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378010?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378053?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378105?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299378136?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6299437798?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd5yuY=", + "hasNextPage": true + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755806" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "lintrunner", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468155?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468457?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468841?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468942?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469180?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469314?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469473?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS3SE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363240" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468138?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS1-o=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363271" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-bionic-rocm5.1-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309468956?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469237?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469475?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309469750?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309470049?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309470368?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309470787?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309471290?check_suite_focus=true" + }, + { + 
"name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309471585?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309471734?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472014?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472172?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472411?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309472715?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473041?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473226?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473414?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473700?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309473992?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309474162?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309647069?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309647413?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309647538?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657055?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657196?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657332?check_suite_focus=true" + }, + { + 
"name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657575?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657726?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309657858?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309658314?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309658433?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665388?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665513?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665597?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309665697?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309672367?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309672499?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696458?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696554?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696638?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309696725?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309712838?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309767601?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309767717?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792321?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792407?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792546?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792639?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309792972?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6309939578?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgaCXo=", + "hasNextPage": true + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363300" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNKQ=", + "hasNextPage": false + } + }, + "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747" + } + } + ] + }, + "changedFiles": 11, + "files": { + "nodes": [ + { + "path": "test/distributed/_shard/checkpoint/test_checkpoint.py" + }, + { + "path": "test/distributed/_shard/checkpoint/test_file_system_checkpoint.py" + }, + { + "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/__init__.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/filesystem.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/metadata.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/resharding.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/state_dict_loader.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/state_dict_saver.py" + }, + { + "path": "torch/distributed/_shard/checkpoint/storage.py" + }, + { + "path": "torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py" + } + ], + "pageInfo": { + "endCursor": "MTE", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, 
+ "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wanchaol" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zzzwen" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "simpkins" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wilson100hong" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "DISMISSED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + 
"login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "xunnanxu" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pritamdamania87" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0=", + "hasPreviousPage": true + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118495479 + }, + { + "bodyText": "Merge failed due to Can't fetch 
all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118511287 + }, + { + "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118662274 + }, + { + "bodyText": "Merge failed due to Can't fetch all PR reviews Raised by https://github.com/pytorch/pytorch/actions/runs/2275691136\n\n@osalpekar @malfet This is failing because there are 109 review comments on this PR but we only fetch the first 100. This could be solved with a similar concept as how we fetch more comments/check_runs.", + "author": { + "login": "janeyx99" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118689010 + }, + { + "bodyText": "On a side note, has the test_fsdp_clip_grad_norm_norm_type_2_0_nested_fsdp_False_cpu_offload_CPUOffload failure on the distributed test first shard of this PR been addressed?", + "author": { + "login": "janeyx99" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1118693497 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQqri9w==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=cc0db92500f836c7fc4f9a0235a75b77562e6e4ab939b5cbe5584078df1c22d2 cursor=Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0= name=pytorch number=76123 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "reviews": { + "nodes": [ + { + "author": { + "login": "pritamdamania87" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "kumpera" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yMlQyMDozNzo1NC0wNzowMLkyMDIyLTA0LTIyVDE2OjAyOjA5LTA3OjAwzjip7G8=", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "coolteemf" + }, + "title": "Optimize grid sample 3d", + "body": "Fixes #71415\r\nI have implemented the changes that replicate what @to-mi did in this [PR](https://github.com/pytorch/pytorch/pull/65986#issue-1012959443) for the 3D case :\r\n\r\n> Fixes #64977\r\n> \r\n> Avoids creating a tensor for and calculating `input` gradient if it's not needed in the backward pass of `grid_sample` (2d case, native CPU & CUDA kernels). Especially the tensor creation seemed time consuming (see #64977).\r\n> \r\n> Brief description of the changes:\r\n> \r\n> * I have tried to go with rather minimal changes. 
It would probably be possible to make a more elegant version with a bit larger refactoring (or possibly with better understanding of PyTorch internals and C++ functionalities).\r\n> \r\n> * Changed the `native_functions.yaml` and `derivatives.yaml` so that the gradient input mask is passed to the functions.\r\n> \r\n> * Changed the CPU kernels:\r\n> (1) added `bool input_requires_grad` template parameter to the `backward` function,\r\n> (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n> (3) feed in `TensorAccessor* gInp_slice_ptr` instead of `TensorAccessor& gInp_slice` so that I can pass a `nullptr` in case gradient for `input` is not requested. (A bit inelegant perhaps, but allows to keep one signature for `backward` function and not require breaking it to smaller pieces. Perhaps there's a more elegant way to achieve this?)\r\n> \r\n> * Changed CUDA kernel:\r\n> (1) added ~`bool input_requires_grad` template parameter~ `const bool input_requires_grad` argument to the `backward` function,\r\n> (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n> (3) feed in `TensorInfo()` instead of `getTensorInfo(grad_input)` in case gradient for `input` is not requested.\r\n> \r\n> * Modified tests in `test/test_nn.py` so that they run also cases with no `input` gradient needed.\r\n> \r\n> * Have not touched the CPU fallback kernel.\r\n\r\nNote: the changes number (3) are N/A in this case.\r\n\r\n", + "headRefName": "optimize_grid_sample_3d", + "headRepository": { + "nameWithOwner": "coolteemf/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "e0b0d1e695aeddceaf265da602c4704592053e9e" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "563ec73747ad53b63b36736c47c4342f962c2a09" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "51abe41a132d9dd5b1c0551bdca902aacc028ff8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "be9898205992034a00e8ace8a55c2ecdcee2c2f8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "2929c60b64384c2deae0f7dea8bab94ad4bc9ec8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "9241b737e7e2b257905cc74ad9c50b737d7f9d0a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "64d6b795d0636928a8aa2fd3da01302fb5f5f7af" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "4503577e53760a0006f1e80ca6bfe04d2be90470" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "b16f4b11ffbbbf2ca2098f9702af4ef6b6fc5e1f" 
+ } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "7ffc23368a604afdc92d2818747f730ce31a2bb5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "b85292604b9ad6c31706b76b5a5498c4f6d94309" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "9d81d7bae8ad91aaa24b3ceab83e3138894dbc69" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "e79f6a2202512b294c55bf4bfb2e0524fafd4c48" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "f683e8aec7aea76097a264eec01511e704c31154" + } + }, + { + "commit": { + "author": { + "user": { + "login": "coolteemf" + }, + "email": "67541941+coolteemf@users.noreply.github.com", + "name": "Fran\u00e7ois Lecomte" + }, + "oid": "b932e9e286c22aaf352375186df851ef060b295a" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "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", + "name": "coolteemf" + }, + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22" + } + } + ], + "pageInfo": { + "endCursor": "MTY", + "hasNextPage": false + }, + "totalCount": 16 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGYqY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801320" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020089?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302165846?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302165949?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIob0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801849" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019921?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1E=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801852" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019934?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302431993?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302432078?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302432150?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwMsZY=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801853" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019928?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303266925?check_suite_focus=true" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303267017?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303267128?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwZbzg=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801855" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019930?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020111?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020318?check_suite_focus=true" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020421?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020539?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020668?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/5302020780?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020970?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302021124?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGbAQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801856" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020084?check_suite_focus=true" + }, + { + "name": "test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302192846?check_suite_focus=true" + }, + { + "name": "test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302192926?check_suite_focus=true" + }, + { + "name": "test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302193029?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwJC4U=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801857" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020092?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ_w=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801862" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020048?check_suite_focus=true" + }, + { + "name": "test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147216?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147336?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147409?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147493?check_suite_focus=true" + }, + { + "name": "test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", 
+ "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147622?check_suite_focus=true" + }, + { + "name": "test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302147822?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIWu4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801866" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019929?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1k=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801869" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uc0=", + "hasNextPage": true + } + }, + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22" + } + } + ] + }, + "changedFiles": 9, + "files": { + "nodes": [ + { + "path": "aten/src/ATen/native/GridSampler.cpp" + }, + { + "path": "aten/src/ATen/native/cpu/GridSamplerKernel.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.cpp" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.cu" + }, + { + "path": "aten/src/ATen/native/cuda/GridSampler.h" + }, + { + "path": "aten/src/ATen/native/native_functions.yaml" + }, + { + "path": "test/forward_backward_compatibility/check_forward_backward_compatibility.py" + }, + { + "path": "test/test_nn.py" + }, + { + "path": "tools/autograd/derivatives.yaml" + } + ], + "pageInfo": { + "endCursor": "OQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "coolteemf" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "albanD" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "albanD" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMS0yNVQwODoyODoxMC0wODowMLkyMDIyLTAxLTI1VDA3OjU0OjA1LTA4OjAwzjNooqI=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by 
https://github.com/pytorch/pytorch/actions/runs/1887945630", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048868910 + }, + { + "bodyText": "Thanks for the update! The windows failure is not your fault, you can ignore it!\n\nThank you very much for all of your feedback and sorry for the delay !", + "author": { + "login": "coolteemf" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1048983572 + }, + { + "bodyText": "@coolteemf can you please send either me or @albanD an email? (or I can send you and invite to collab on private repo)", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1049048119 + }, + { + "bodyText": "@pytorchbot merge this please", + "author": { + "login": "albanD" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1049131992 + }, + { + "bodyText": "Hey @coolteemf.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1049134520 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOPoR4Lg==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=75095 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "mruberry" + }, + "title": "Initial prims, references, and test architecture for them", + "body": "This PR adds an initial set of experimental primitive operations and Python references that reimplement existing PyTorch operations using them. See https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-0/577 for additional context.\r\n\r\nThe following experimental primitives are added:\r\n\r\n- Elementwise unary prims -- abs, acos, acosh, asin, atan, cos, cosh, bessel_i0e, bessel_i1e, cbrt, ceil, digamma, erf, erf_inv, erfc, exp, expm1, floor, igamma, igammac, is_finite, lgamma, log, log1p, neg, reciprocal, round, sign, sinh, sqrt, square, tan. \r\n- Elementwise binary prims -- add, atan2, bitwise_and, bitwise_not, bitwise_or, bitwise_xor, div, eq, ge, gt, le, lt, max, min, mul, ne, nextafter, pow, rsqrt, shift_left, shift_right_arithmetic\r\n- View prims -- brodcast_in_dim, collapse_view, split_dim, squeeze\r\n- Shape prims -- collapse, concatenate, reshape\r\n- Conditional prims -- select\r\n- Data conversion & movement prims -- convert_element_type, device_put\r\n- Inplace prims -- copy_to, resize\r\n\r\nThese primitives do not add any new functionality to PyTorch, but are intended to be the semantic building blocks for reference operators. 
We have tried to make them consistent with the operations in [jax.lax](https://jax.readthedocs.io/en/latest/jax.lax.html) where possible (because PyTorch prefers being consistent with other frameworks), although there are key differences between these prims and operations in jax.lax. Most notably is that these prims model view semantics and inplace operations.\r\n\r\nIn addition to these primitives the following elementwise binary Python references are added:\r\n\r\n- Elementwise binary Python references -- add, atan2, bitwise_and, bitwise_left_shift, bitwise_or, bitwise_right_shift, bitwise_xor, eq, float_power, ge, gt, le, lt, maximum, minimum, mul, ne, nextafter, pow, sub, true_divide\r\n- Conditional Python references - where\r\n- Data conversion & movement references - copy_to\r\n\r\nA Python reference implements the same behavior as its corresponding PyTorch operator (excepting slight numerical differences, bug fixes, and in some cases additional features). \r\n\r\nThe start of an OpInfo-based test architecture for these references is also included in this PR. A new list, `python_ref_db`, is added to `common_methods_invocations.py`. This list introduces the new `ElementwiseBinaryPythonRefInfo`, which inherits input arguments from the original operators' OpInfo, allows them to be overridden, and then constructs the OpInfo for the Python reference using the (potentially modified) arguments. OpInfo-based tests can opt-into testing references by including this new list in the Sequence passed to the `@ops` decorator. \r\n\r\ncc @ngimel @csarofeen @kevinstephano @Lezcano ", + "headRefName": "prims_and_references", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "a790467c650be92775103cde5e866c90b56f5376" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "bd6fcf50692e208ebecdc2eaa517a2bfcdcd35cf" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "4a119c8f21529fe1375e7e8789b91f41a3df80c5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "ea6750dc34d66be759fdfe84b09fb0e23ee59c79" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "2eef8a55fe0227e1921b51bf1f56f9d0a29b49ac" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "b886ed6c20dd1785fd31ed6fa6a8c5b6d0d0b16c" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "9ad9b63d09aa4f7a8549bcf1d88ea4ff0674299c" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "63fdd580118477416ae160e0670ae722ea248090" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "0ccf7dc292af1d40d0a094eb2b2fb0c7ab4ccc70" + } 
+ }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "e8a8a4d1fbe35f20eb88e1a43cf5a653883638e5" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "186634dfdd25645c05b58a212f9e8d77c4125fc0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "f5b4741312b5c42a79f6c8a1d3930b79db38ed8f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "23d50391bb0fd12111fd3171591c4235ffb2fc1a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "bac9d45422d58f513b60b4b854441cfdc253d4c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "13240ae0b4a0332c3167b65ac026a3172da90cb7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "1ee34468cb1db3dc6cbae204669f4fec20e2a466" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ezyang" + }, + "email": "ezyang@fb.com", + "name": "Edward Z. Yang" + }, + "oid": "561d132bc686d00e8911f7feb3da5901b2bdc574" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "ac42bedc84b7c96256376ad09917263bb020b2c3" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "7f7d5ba40a0b5e10526d90b018b30b54673d12d8" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "37a6b4a8b1adb712d5777c7c3479866c27fb3c4e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "65b613868c44e519c1777af79b9fd3498c5a7e58" + } + }, + { + "commit": { + "author": { + "user": { + "login": "ngimel" + }, + "email": "ngimel@fb.com", + "name": "Natalia Gimelshein" + }, + "oid": "442c405e9da0d66744ef03e379224c41eedf5b57" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "031ac49ae9c192989385986b6707fa781e3229e0" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "9a6c3b00039c0c985c1c9cb59490012d1c0b38ba" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "d5c30e408af1889b90012d2e09f6ec3cda333bcb" + } + }, + { + "commit": { + "author": { + "user": null, + "email": "mruberry@devfair044.h1.fair", + "name": "Mike Ruberry" + }, + "oid": "db355d55655bb252a699cd532441bb98e52b98d5" + } + } + ], + "pageInfo": { + "endCursor": "MjY", + "hasNextPage": false + }, + "totalCount": 26 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6ux14=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454954" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454956" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454965" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454970" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454974" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454977" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879695?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-c8=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455322" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879696?check_suite_focus=true" + }, + { + "name": "lintrunner", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879758?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879835?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/6150879901?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150879942?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150880005?check_suite_focus=true" + }, + { + "name": "workflow-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150880051?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-zM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455334" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895177?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895295?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895365?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895428?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895554?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895614?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895698?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895758?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895866?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895923?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150895991?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896053?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896146?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/6150896213?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896256?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896288?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896313?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896352?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896403?check_suite_focus=true" + }, + { + "name": "linux-bionic-cuda11.3-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150896443?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970691?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970749?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970796?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970831?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970876?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970911?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150970959?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150971013?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150976613?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150976667?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150976694?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": 
"SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150977190?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150980317?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150980363?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150989669?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6150989736?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151003389?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151003429?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151003460?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151007051?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151023043?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151023077?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151040240?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151041874?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151041915?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151041959?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151065166?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151065218?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151165045?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/6151165103?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6jVK8=", + "hasNextPage": true + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455360" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDQA=", + "hasNextPage": false + } + }, + "oid": "db355d55655bb252a699cd532441bb98e52b98d5" + } + } + ] + }, + "changedFiles": 5, + "files": { + "nodes": [ + { + "path": "test/test_ops.py" + }, + { + "path": "torch/_prims/__init__.py" + }, + { + "path": "torch/_prims/utils.py" + }, + { + "path": "torch/_refs/__init__.py" + }, + { + "path": "torch/testing/_internal/common_methods_invocations.py" + } + ], + "pageInfo": { + "endCursor": "NQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "zou3519" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "peterbell10" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { 
+ "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "Lezcano" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "ngimel" 
+ }, + "state": "APPROVED" + }, + { + "author": { + "login": "ezyang" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "mruberry" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0wNlQxMjo1NjoyNC0wNzowMLkyMDIyLTA0LTA2VDA4OjQwOjM4LTA3OjAwzjenO6Y=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Ref implementations by themselves can handle any shapes (and broadcast ops by themselves don't bake in any shapes). The question is can we decide if a particular trace is applicable for a different input, but that depends on the tracing technology and what we are caching on, so out of scope for initial PR.", + "author": { + "login": "ngimel" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1105643418 + }, + { + "bodyText": "@pytorchbot merge this please", + "author": { + "login": "mruberry" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108072887 + }, + { + "bodyText": "Merge failed due to 'mruberry'\nRaised by https://github.com/pytorch/pytorch/actions/runs/2218044244", + "author": { + "login": "pytorchmergebot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108073536 + }, + { + "bodyText": "@mruberry has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1108075965 + }, + { + "bodyText": "Hey @mruberry.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1108351107 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQebHmg==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=68111 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "chunyuan-w" + }, + "title": "Add JIT graph fuser for oneDNN Graph API (Preview4)", + "body": "## Description\r\nPreview4 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444).\r\n\r\nOn the basis of https://github.com/pytorch/pytorch/pull/50256, the below improvements are included:\r\n\r\n- The [preview4 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.4.1) of the oneDNN Graph API is used\r\n- The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties.\r\n\r\n### User API:\r\nThe optimization pass is disabled by default. 
Users could enable it by:\r\n```\r\ntorch.jit.enable_onednn_fusion(True)\r\n```\r\n\r\n### Performance:\r\n[pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance:\r\n- SkyLake 8180 (1 socket of 28 cores):\r\n\r\n ![image](https://user-images.githubusercontent.com/65992142/151162305-05e44425-a24e-4d5e-94e1-743b40b87a8c.png)\r\n\r\n- SkyLake 8180 (single thread):\r\n\r\n ![image](https://user-images.githubusercontent.com/65992142/151162528-69f90b79-d08d-46b8-8775-d80a6ccbce8a.png)\r\n \\* By mapping hardswish to oneDNN Graph, it\u2019s 8% faster than PyTorch JIT (NNC + OFI)\r\n \\** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops\r\n\r\n\r\n### Directory structure of the integration code\r\nFuser-related code are placed under:\r\n```\r\ntorch/csrc/jit/codegen/onednn/\r\n```\r\n\r\nOptimization pass registration is done in:\r\n```\r\ntorch/csrc/jit/passes/onednn_graph_fuser.h\r\n```\r\n\r\nCMake for the integration code is:\r\n```\r\ncaffe2/CMakeLists.txt\r\n```\r\n\r\n## Limitations\r\n\r\n- In this PR, we have only supported the optimization on Linux platform. The support on Windows and MacOS will be enabled as the next step.\r\n- We have only optimized the inference use case.", + "headRefName": "chunyuan/llga_preview2", + "headRepository": { + "nameWithOwner": "chunyuan-w/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "0096fcc49f277fd8e006fcb42e0cb28a1422ec98" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "7bcc4de26a5472f1d252735dd425b46794b0844f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "3a2a588bfe6bbf9bf74d88d441cd22affda207da" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "ca7df12fbfaa3ddbabeca39b76300d17f4a33f2f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "81d44f35b8bc043c38837d0694e5bc072203b832" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "14fd5d1bfc2c58a71379f778871e3fca0a8e79b2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "954dc23663125897f4b199eb2a8607dc5fca3274" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "9f77a0b476accc678b6f0569e4ff33fa6bbe97fc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "fbf3b23bc1288697e1aec539a7c4ee3dc0bcb84c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": 
"f8b8e78f786586c3cdf3966fd83ffa124d3eda70" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "6fffa2f7453ee7e0f8d8e2f73ea8a65230539589" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "849385404e6f3cd1cf7cef19f931ecf4fa28afdb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "adbae7b77f8c0dbc59fccf15207d97ba86cfade2" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "6dcf2a4981aff24fa16fc7461ae4ec29690f956f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "54f3e05ad524cffd0911ee93be3c50f589b51f58" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "edbfc640ea79a0af85757d9e73796dcc90231519" + } + }, + { + "commit": { + "author": { + "user": { + "login": "chunyuan-w" + }, + "email": "chunyuan.wu@intel.com", + "name": "chunyuan" + }, + "oid": "67654db7cba562809d1b4a44cdda58af5cc9daaf" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "9c9d99b930b11af9ff03f52d45bf49c652df758d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ffb25119cd9ce815cc4d9d14a2317fcbbfa9ea86" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ab9eee84512ca1bdfbc81e25c6eb67b29d0f302a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "62a4642cf3330524990a69ac29e002c97812320a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "ca9b1223be4af2c8b4929303d498eafd71793128" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "6f4a23d24514a02954d2ec792830085f612223c9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "b2a9a9c0926b02d0b2e87722ed61450f224a61d0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e88b492be733f24b6aa395829c76add67d0901e7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c44336d7a914952bfb78e012e08d9a6d6dde5937" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "5157930f7b3921d41a586260582b574c915f6ca1" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": 
"sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "04cb8353813f6bbd0d913a994923cc7e1e291406" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "62991eaad0e638bb0bced327e03f932f66f68732" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "7496bf1588050191595d833d23b8972b2f22655e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "d9d35f23cca0cd29c78a845731b24826152dcf1c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "f74ec134f18a65a7c72455bdf44f72e3ebb27105" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "eb32cc65a975361160948bfc3d6a577991ea262e" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c7665f8d695b680c54db0bad2b7b7df46d886b50" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e6321ad8f59ea01130568c202d186448bb9cb9d0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "a72cd0d02693f45e5354a70654581ad514581ec7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "b3cd3028b4ed31805e82f7eaf02217ab74ca59b9" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "49a592d9788d08e6cd0593882f867e129057c1cc" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "0575766b2144b13f6a38227c4e2b8d22ec8db80f" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "b5c9b10ff87d622350e8ca64fae3a476eb70d5aa" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "66bc652a30ccc329adb929870a4ac726bb98b38c" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "72b9ca9c8e2dac98cbb7199b3dfac7c7305b80c5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "a7892ed7373207d96406c8b5734a089643c5cdbd" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "d54cb084e1daad8a08c3f8de0ad3f7afb5b05ac1" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": 
"aef71d692a8a159e0ca56be363e2cc1225ce7647" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "bf618e205ec31cff962dcc8ab478e0a699a9572d" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e4a331f1088448f7d7d86256ce71e0e71da006b0" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "0b743523d1430fec759d5fefbb687f17c89335a5" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "e80a351a62d98b810ec8985c4b25257af1d6c5bb" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "c189eca154b6691919d0e21489d1c322c7435c0b" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "e080a067c75d7b888a8a362682a2d5ba70e0c3a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "028561fbf8f3ed90e074e6e0e3a4ca4dd7ffa2a8" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "d550cf14037badd4caa2f52202e2f20bc4db8432" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "574159ebadd1dec24daaf883879ffeca8d9e71b7" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "9eb3ee98ea756067ed1c8f52f309f6d3e211a904" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "29929f48be03dcdd1bbfade572de7feafa825547" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "8a7358ca8da547b40ea1a99ddc57ebed19959684" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "6606637d2c5525b43e294a8b366a85052e1be0c6" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "5ecfd1f28b87045deb8bc8ffe33b3d8b906f3264" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchit.jain" + }, + "oid": "be2d4345c65442c4cfbe8afdfb2ae0893945da42" + } + }, + { + "commit": { + "author": { + "user": { + "login": "sanchitintel" + }, + "email": "sanchit.jain@intel.com", + "name": "sanchitintel" + }, + "oid": "b5b89d3644a43e2dbda841cafb71b32edbe07c8a" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nikita.shulga@gmail.com", + "name": "Nikita Shulga" + }, + "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75" + } + } + ], + "pageInfo": { + "endCursor": "NjI", + 
"hasNextPage": false + }, + "totalCount": 62 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NXnc=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625010" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633826958?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827084?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827160?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827410?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827566?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827701?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827899?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828081?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828249?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828312?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828407?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828524?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NZqw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625458" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633826956?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NYIw=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": 
"https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625463" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827223?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827451?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827729?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633827956?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828089?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828258?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828406?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828523?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828594?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828765?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633828992?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829085?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829195?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829321?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829420?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829488?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829666?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829746?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829845?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5633829904?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453168?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453232?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453388?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453444?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453499?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453573?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453624?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634453683?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634462211?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634462270?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602176?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602239?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602319?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634602425?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634622529?check_suite_focus=true" + }, + { + "name": 
"linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634622639?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634622730?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634637718?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634637817?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634775159?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634775273?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634823038?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634823099?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634823171?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634920855?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634921428?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634921484?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634921543?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634995986?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5634996056?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_fN1g=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625483" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxQs=", + "hasNextPage": false + } + }, + "oid": 
"73881411e2bfb3aaa2e89926a82390b4c587ad75" + } + } + ] + }, + "changedFiles": 37, + "files": { + "nodes": [ + { + "path": "aten/src/ATen/core/interned_strings.h" + }, + { + "path": "caffe2/CMakeLists.txt" + }, + { + "path": "cmake/Dependencies.cmake" + }, + { + "path": "cmake/Modules/FindMKLDNN.cmake" + }, + { + "path": "cmake/public/mkldnn.cmake" + }, + { + "path": "docs/source/jit.rst" + }, + { + "path": "test/test_jit_llga_fuser.py" + }, + { + "path": "torch/_C/__init__.pyi.in" + }, + { + "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/README.md" + }, + { + "path": "torch/csrc/jit/codegen/onednn/defer_size_check.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/defer_size_check.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_fuser.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_fuser.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_helper.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_helper.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/graph_rewriter.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/guard_shape.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/guard_shape.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/interface.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/interface.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/kernel.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/kernel.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/layout_propagation.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/layout_propagation.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/operator.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/prepare_binary.cpp" + }, + { + "path": "torch/csrc/jit/codegen/onednn/prepare_binary.h" + }, + { + "path": "torch/csrc/jit/codegen/onednn/register_interface.cpp" + }, + { + "path": "torch/csrc/jit/ir/alias_analysis.cpp" + }, + { + "path": "torch/csrc/jit/ir/ir.cpp" + }, + { + "path": "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp" + }, + { + "path": "torch/csrc/jit/passes/onednn_graph_fuser.h" + }, + { + "path": "torch/csrc/jit/python/init.cpp" + }, + { + "path": "torch/csrc/jit/runtime/operator.cpp" + }, + { + "path": "torch/jit/__init__.py" + } + ], + "pageInfo": { + "endCursor": "Mzc", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "pinzhenx" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "chunyuan-w" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": 
"sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "wukong1992" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "APPROVED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "eellison" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "malfet" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + }, + { + "author": { + "login": "sanchitintel" + }, + "state": "COMMENTED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMS0xMi0xMFQwOToyNDoxOS0wODowMLkyMDIxLTEyLTEwVDA5OjI0OjE5LTA4OjAwzjFryLE=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.", + "author": { + "login": "suo" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074498483 + }, + { + "bodyText": "@pytorchbot revert this", + "author": { + "login": "suo" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074498550 + }, + { + "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.\n\nOops! 
Will fix it ASAP.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1074499668 + }, + { + "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074508608 + }, + { + "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1082508130 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQAuLsw==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=62ce809793481ce6ddce6e1a19d9b0761755ff0ff75decaf8a79419eaf793110 cursor=Y3Vyc29yOnYyOpHOQAuLsw== name=pytorch number=68111 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "comments": { + "nodes": [ + { + "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/chunyuan-w/pytorch/blob/7496bf1588050191595d833d23b8972b2f22655e/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 
triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries/conda\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-manywheel\nciflow/binaries, ciflow/binaries/wheel\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, 
ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\n\n\nYou can add a comment to the PR and tag @pytorchbot with the following commands:\n\n# ciflow rerun, \"ciflow/default\" will always be added automatically\n@pytorchbot ciflow rerun\n\n# ciflow rerun with additional labels \"-l \", which is equivalent to adding these labels manually and trigger the rerun\n@pytorchbot ciflow rerun -l ciflow/scheduled -l ciflow/slow\n\n\nFor more information, please take a look at the CI Flow Wiki.", + "author": { + "login": "pytorch-probot" + }, + "authorAssociation": "NONE", + "editor": { + "login": "pytorch-probot" + }, + "databaseId": 964902865 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/68111\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 7388141 (more details on the Dr. CI page):\n\n\n29/29 failures introduced in this PR\n\n\n\ud83d\udd75\ufe0f 29 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n pull / linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge) (1/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:31:38.6978776Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:31:38.3001628Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:31:38.5169168Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:31:38.5362923Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:31:38.5413452Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:31:38.5458747Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:31:38.5484014Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:31:38.5497924Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:31:38.5656491Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:31:38.5678893Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:31:38.6888479Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f6488c20adb4dca4\n2022-03-21T21:31:38.6978776Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:31:38.6992648Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:31:38.7003010Z 
##[error]Process completed with exit code 2.\n2022-03-21T21:31:38.7044027Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:31:38.7044261Z with:\n2022-03-21T21:31:38.7044413Z env:\n2022-03-21T21:31:38.7044565Z IN_CI: 1\n2022-03-21T21:31:38.7044709Z IS_GHA: 1\n2022-03-21T21:31:38.7044885Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:31:38.7045067Z ##[endgroup]\n2022-03-21T21:31:38.7060958Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge) (2/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:35:19.2635222Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:35:18.9028722Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:35:19.1132721Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:35:19.1310590Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:35:19.1360251Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:35:19.1386865Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:35:19.1429182Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:35:19.1441925Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:35:19.1468280Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:35:19.1617667Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:35:19.2545368Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-098be2985e0392130\n2022-03-21T21:35:19.2635222Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:35:19.2648463Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:35:19.2658727Z ##[error]Process completed with exit code 2.\n2022-03-21T21:35:19.2706355Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:35:19.2706591Z with:\n2022-03-21T21:35:19.2706748Z env:\n2022-03-21T21:35:19.2706908Z IN_CI: 1\n2022-03-21T21:35:19.2707061Z IS_GHA: 1\n2022-03-21T21:35:19.2707246Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:35:19.2707438Z ##[endgroup]\n2022-03-21T21:35:19.2724554Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge) (3/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:11:52.7662022Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T23:11:53.1213298Z ---------------------------------------- 8.1/8.1 MB 23.6 MB/s eta 0:00:00\n2022-03-21T23:11:53.1644665Z Requirement already 
satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:11:53.2218699Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T23:11:53.2389674Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T23:11:53.2787295Z -------------------------------------- 247.7/247.7 KB 7.4 MB/s eta 0:00:00\n2022-03-21T23:11:53.3761842Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:11:53.5457622Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T23:11:57.4175080Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T23:11:57.5296815Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0105d4db093574f40\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:11:57.5564814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:11:57.5587712Z ##[error]Process completed with exit code 2.\n2022-03-21T23:11:57.5790311Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T23:11:57.5790832Z with:\n2022-03-21T23:11:57.5791104Z env:\n2022-03-21T23:11:57.5791358Z IN_CI: 1\n2022-03-21T23:11:57.5791620Z IS_GHA: 1\n2022-03-21T23:11:57.5791939Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:11:57.5792425Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T23:11:57.5792884Z ##[endgroup]\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu) (4/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T02:17:12.6257577Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T02:17:11.9280556Z Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T02:17:11.9335199Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:11.9682045Z Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T02:17:11.9850357Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0403171Z Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T02:17:12.0468875Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0590000Z Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T02:17:12.0607093Z Installing collected packages: jmespath, urllib3, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T02:17:12.5273459Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 
urllib3-1.26.9\n2022-03-22T02:17:12.6032812Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-114\n2022-03-22T02:17:12.6257577Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T02:17:12.6259543Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T02:17:12.6291924Z ##[error]Process completed with exit code 2.\n2022-03-22T02:17:12.6387977Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T02:17:12.6388298Z with:\n2022-03-22T02:17:12.6388521Z wait-ssh: false\n2022-03-22T02:17:12.6388727Z env:\n2022-03-22T02:17:12.6388932Z IN_CI: 1\n2022-03-22T02:17:12.6389143Z IS_GHA: 1\n2022-03-22T02:17:12.6389368Z GIT_DEFAULT_BRANCH: master\n2022-03-22T02:17:12.6389669Z DOCKER_HOST: unix:///run/user/1121/docker.sock\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge) (5/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:19:24.4890693Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:19:24.0962005Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:19:24.3152253Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:19:24.3341183Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:19:24.3391374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:19:24.3436392Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:19:24.3448982Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:19:24.3474092Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:19:24.3502003Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:19:24.3655072Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:19:24.4799309Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0bc9250521f338cae\n2022-03-21T22:19:24.4890693Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:19:24.4903625Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:19:24.4913841Z ##[error]Process completed with exit code 2.\n2022-03-21T22:19:24.4957338Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:19:24.4957575Z with:\n2022-03-21T22:19:24.4957735Z env:\n2022-03-21T22:19:24.4957900Z IN_CI: 1\n2022-03-21T22:19:24.4958055Z IS_GHA: 1\n2022-03-21T22:19:24.4958246Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:19:24.4958437Z ##[endgroup]\n2022-03-21T22:19:24.4989649Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu) (6/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 
rerun)\n\n\n2022-03-22T01:05:07.6983899Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T01:05:06.8364546Z Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T01:05:06.8431763Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.8949391Z Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T01:05:06.9180079Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.9803351Z Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T01:05:06.9882133Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:07.0067062Z Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T01:05:07.0088676Z Installing collected packages: urllib3, jmespath, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T01:05:07.5819667Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T01:05:07.6774717Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-60\n2022-03-22T01:05:07.6983899Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T01:05:07.6988652Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T01:05:07.7023073Z ##[error]Process completed with exit code 2.\n2022-03-22T01:05:07.7102087Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T01:05:07.7102389Z with:\n2022-03-22T01:05:07.7102603Z wait-ssh: false\n2022-03-22T01:05:07.7102820Z env:\n2022-03-22T01:05:07.7103015Z IN_CI: 1\n2022-03-22T01:05:07.7103224Z IS_GHA: 1\n2022-03-22T01:05:07.7103458Z GIT_DEFAULT_BRANCH: master\n2022-03-22T01:05:07.7103737Z DOCKER_HOST: unix:///run/user/1502/docker.sock\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge) (7/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:51:39.3637996Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:51:39.2041249Z Attempting uninstall: s3transfer\n2022-03-21T20:51:39.2043010Z Found existing installation: s3transfer 0.3.7\n2022-03-21T20:51:39.2083799Z Uninstalling s3transfer-0.3.7:\n2022-03-21T20:51:39.2089675Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:51:39.2480546Z Attempting uninstall: boto3\n2022-03-21T20:51:39.2482953Z Found existing installation: boto3 1.16.34\n2022-03-21T20:51:39.2584292Z Uninstalling boto3-1.16.34:\n2022-03-21T20:51:39.2599474Z Successfully uninstalled boto3-1.16.34\n2022-03-21T20:51:39.3130921Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:51:39.3550598Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03ef7efc3078e3da5\n2022-03-21T20:51:39.3637996Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:51:39.3650651Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:51:39.3660484Z 
##[error]Process completed with exit code 2.\n2022-03-21T20:51:39.3696465Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:51:39.3696693Z with:\n2022-03-21T20:51:39.3696850Z env:\n2022-03-21T20:51:39.3697012Z IN_CI: 1\n2022-03-21T20:51:39.3697161Z IS_GHA: 1\n2022-03-21T20:51:39.3697342Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:51:39.3697528Z ##[endgroup]\n2022-03-21T20:51:39.3730420Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge) (8/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:36.3916860Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:03:36.0096309Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:03:36.2278560Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:03:36.2461618Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:03:36.2513260Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:03:36.2541524Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:03:36.2554899Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:03:36.2598277Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:03:36.2758299Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:03:36.2780690Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:03:36.3825021Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0a4a552890e6ef7d3\n2022-03-21T21:03:36.3916860Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:03:36.3930343Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:03:36.3941263Z ##[error]Process completed with exit code 2.\n2022-03-21T21:03:36.3979258Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:03:36.3979496Z with:\n2022-03-21T21:03:36.3979654Z env:\n2022-03-21T21:03:36.3979814Z IN_CI: 1\n2022-03-21T21:03:36.3979968Z IS_GHA: 1\n2022-03-21T21:03:36.3980157Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:03:36.3980360Z ##[endgroup]\n2022-03-21T21:03:36.3996257Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu) (9/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:41:10.3015614Z Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n2022-03-22T00:41:10.3625659Z ---------------------------------------- 79.5/79.5 KB 1.1 MB/s eta 0:00:00\n2022-03-22T00:41:10.4120236Z 
Collecting python-dateutil<3.0.0,>=2.1\n2022-03-22T00:41:10.4170155Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-22T00:41:10.4722115Z -------------------------------------- 247.7/247.7 KB 5.2 MB/s eta 0:00:00\n2022-03-22T00:41:10.4843512Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:41:10.6596108Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:41:10.8733354Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-22T00:41:15.3745408Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-22T00:41:15.4987162Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-09cacc848abc3dd32\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:41:15.5373630Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:41:15.5404353Z ##[error]Process completed with exit code 2.\n2022-03-22T00:41:15.5790508Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-22T00:41:15.5791192Z with:\n2022-03-22T00:41:15.5791530Z env:\n2022-03-22T00:41:15.5791849Z IN_CI: 1\n2022-03-22T00:41:15.5792186Z IS_GHA: 1\n2022-03-22T00:41:15.5792599Z GIT_DEFAULT_BRANCH: master\n2022-03-22T00:41:15.5793237Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-22T00:41:15.5793831Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge) (10/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:32.9799307Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:32.8167560Z Attempting uninstall: s3transfer\n2022-03-21T20:50:32.8169351Z Found existing installation: s3transfer 0.3.7\n2022-03-21T20:50:32.8213295Z Uninstalling s3transfer-0.3.7:\n2022-03-21T20:50:32.8219209Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:50:32.8602320Z Attempting uninstall: boto3\n2022-03-21T20:50:32.8603289Z Found existing installation: boto3 1.16.34\n2022-03-21T20:50:32.8704535Z Uninstalling boto3-1.16.34:\n2022-03-21T20:50:32.8719403Z Successfully uninstalled boto3-1.16.34\n2022-03-21T20:50:32.9244278Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:50:32.9710449Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0c568461a276d4a71\n2022-03-21T20:50:32.9799307Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:32.9812238Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:32.9823052Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:32.9859290Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:32.9859527Z with:\n2022-03-21T20:50:32.9859664Z env:\n2022-03-21T20:50:32.9859817Z IN_CI: 1\n2022-03-21T20:50:32.9859977Z IS_GHA: 1\n2022-03-21T20:50:32.9860144Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:32.9860327Z ##[endgroup]\n2022-03-21T20:50:32.9893642Z 
##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge) (11/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7163042Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.6660824Z #10 0x55fc8a3ea801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.6661768Z #11 0x55fc8a3f57a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.6662455Z #12 0x55fc8a3f580b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.6663570Z #13 0x55fc8a3f5908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.6663952Z #14 0x55fc8a3f5908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.6664431Z #15 0x55fc8a3f5908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.6665304Z #16 0x55fc8a3f5ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7162113Z #17 0x7f940d00f83f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7162534Z #18 0x55fc8a39a554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7162711Z \n2022-03-21T21:05:00.7163042Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.7334595Z + retcode=1\n2022-03-21T21:05:00.7334954Z + set -e\n2022-03-21T21:05:00.7335215Z + return 1\n2022-03-21T21:05:00.7338688Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.7339232Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.7340113Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.7340612Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.7341187Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.7341668Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.7344466Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge) (12/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:06:03.4437430Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:06:03.0752199Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:06:03.2853252Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:06:03.3032326Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:06:03.3081589Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:06:03.3093911Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:06:03.3120244Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from 
boto3==1.19.12) (1.22.12)\n2022-03-21T22:06:03.3162406Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:06:03.3188431Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:06:03.3337181Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:06:03.4348072Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ee48c8811fafc444\n2022-03-21T22:06:03.4437430Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:06:03.4450920Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:06:03.4461263Z ##[error]Process completed with exit code 2.\n2022-03-21T22:06:03.4502346Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:06:03.4502576Z with:\n2022-03-21T22:06:03.4502730Z env:\n2022-03-21T22:06:03.4502888Z IN_CI: 1\n2022-03-21T22:06:03.4503038Z IS_GHA: 1\n2022-03-21T22:06:03.4503302Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:06:03.4503492Z ##[endgroup]\n2022-03-21T22:06:03.4519156Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge) (13/29)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:13.2205634Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:12.8679322Z + python3 -m pip install boto3==1.19.12\n2022-03-21T20:50:13.0744228Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T20:50:13.0916284Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T20:50:13.0964264Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T20:50:13.1005656Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T20:50:13.1017299Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T20:50:13.1041042Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T20:50:13.1189450Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T20:50:13.1208751Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T20:50:13.2119445Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d02da60fd18c22f5\n2022-03-21T20:50:13.2205634Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:13.2217939Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:13.2220259Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:13.2248664Z 
##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:13.2249012Z with:\n2022-03-21T20:50:13.2249260Z env:\n2022-03-21T20:50:13.2249500Z IN_CI: 1\n2022-03-21T20:50:13.2249738Z IS_GHA: 1\n2022-03-21T20:50:13.2250025Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:13.2250329Z ##[endgroup]\n2022-03-21T20:50:13.2272735Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) (14/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:47:38.0451999Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:47:37.5554508Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:47:37.8411473Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:47:37.8631484Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:47:37.8699561Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:47:37.8737037Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:47:37.8754443Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:47:37.8814393Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:47:37.8849540Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:47:37.9059579Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:47:38.0336298Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0b44f47f4292089a2\n2022-03-21T23:47:38.0451999Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:47:38.0469471Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:47:38.0484106Z ##[error]Process completed with exit code 2.\n2022-03-21T23:47:38.0532678Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:47:38.0533007Z with:\n2022-03-21T23:47:38.0533223Z env:\n2022-03-21T23:47:38.0533440Z IN_CI: 1\n2022-03-21T23:47:38.0533649Z IS_GHA: 1\n2022-03-21T23:47:38.0533902Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:47:38.0534170Z GPU_FLAG: --gpus all\n2022-03-21T23:47:38.0534401Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge) (15/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:04:59.3115800Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:04:59.2595213Z #10 0x55a7f39a4801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:04:59.2595707Z #11 0x55a7f39af7a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:04:59.2597203Z #12 0x55a7f39af80b in PyRun_SimpleStringFlags 
/tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:04:59.2598205Z #13 0x55a7f39af908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:04:59.2598697Z #14 0x55a7f39af908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:04:59.2599178Z #15 0x55a7f39af908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:04:59.2599747Z #16 0x55a7f39afccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:04:59.3114751Z #17 0x7f3b3822383f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:04:59.3115277Z #18 0x55a7f3954554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:04:59.3115468Z \n2022-03-21T21:04:59.3115800Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:04:59.3292385Z + retcode=1\n2022-03-21T21:04:59.3292781Z + set -e\n2022-03-21T21:04:59.3293062Z + return 1\n2022-03-21T21:04:59.3295462Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:04:59.3295802Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:04:59.3296394Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:04:59.3296700Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:04:59.3297055Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:04:59.3297416Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:04:59.3299623Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (16/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:14:25.5525714Z Collecting jmespath<1.0.0,>=0.7.1\n2022-03-21T22:14:25.5568155Z Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n2022-03-21T22:14:25.5952617Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:14:25.6169392Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:14:25.6629996Z -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:14:25.6710247Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:14:25.8284354Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:14:25.9816751Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:14:31.6672236Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:14:31.7630473Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ed0915ecee5d2424\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:14:31.7876742Z 
+ GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:14:31.7897140Z ##[error]Process completed with exit code 2.\n2022-03-21T22:14:31.8195621Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:14:31.8196110Z with:\n2022-03-21T22:14:31.8196356Z env:\n2022-03-21T22:14:31.8196614Z IN_CI: 1\n2022-03-21T22:14:31.8196876Z IS_GHA: 1\n2022-03-21T22:14:31.8197169Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:14:31.8197652Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:14:31.8198093Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge) (17/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:19:15.8845728Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:19:15.5116060Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:19:15.7231476Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:19:15.7409711Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:19:15.7458478Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:19:15.7470508Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:19:15.7496799Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:19:15.7538362Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:19:15.7566161Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:19:15.7711630Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:19:15.8753543Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0e2b3b4ddb246ff2a\n2022-03-21T21:19:15.8845728Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:19:15.8859814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:19:15.8870165Z ##[error]Process completed with exit code 2.\n2022-03-21T21:19:15.8917039Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:19:15.8917279Z with:\n2022-03-21T21:19:15.8917433Z env:\n2022-03-21T21:19:15.8917586Z IN_CI: 1\n2022-03-21T21:19:15.8917734Z IS_GHA: 1\n2022-03-21T21:19:15.8917917Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:19:15.8918102Z ##[endgroup]\n2022-03-21T21:19:15.8934572Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu) (18/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:19:48.5900162Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:19:48.0742254Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:19:48.3742563Z Defaulting to user installation because normal site-packages is not 
writeable\n2022-03-21T23:19:48.3976536Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:19:48.4048700Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:19:48.4065374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:19:48.4128076Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:19:48.4164273Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:19:48.4202610Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:19:48.4416723Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:19:48.5773033Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-07ab7a3c4a5402af2\n2022-03-21T23:19:48.5900162Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:19:48.5919822Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:19:48.5936087Z ##[error]Process completed with exit code 2.\n2022-03-21T23:19:48.6007930Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:19:48.6008268Z with:\n2022-03-21T23:19:48.6008483Z env:\n2022-03-21T23:19:48.6008701Z IN_CI: 1\n2022-03-21T23:19:48.6008920Z IS_GHA: 1\n2022-03-21T23:19:48.6009170Z GIT_DEFAULT_BRANCH: master\n2022-03-21T23:19:48.6009440Z GPU_FLAG: --gpus all\n2022-03-21T23:19:48.6009671Z ##[endgroup]\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (19/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:53:59.0889659Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T22:53:59.6881416Z ---------------------------------------- 8.1/8.1 MB 14.0 MB/s eta 0:00:00\n2022-03-21T22:53:59.7427779Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:53:59.7691882Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:53:59.7779847Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:53:59.8281663Z -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:54:00.0185115Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:54:00.2359770Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:54:04.1208891Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:54:04.2505862Z ++ python3 
.github/scripts/get_workflow_job_id.py 2018440039 i-03b4fbe63be8ef4b0\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:54:04.2891082Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:54:04.2919900Z ##[error]Process completed with exit code 2.\n2022-03-21T22:54:04.3377901Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:54:04.3378575Z with:\n2022-03-21T22:54:04.3378930Z env:\n2022-03-21T22:54:04.3379275Z IN_CI: 1\n2022-03-21T22:54:04.3379600Z IS_GHA: 1\n2022-03-21T22:54:04.3380023Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:54:04.3380691Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:54:04.3381278Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge) (20/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:09:34.0074610Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:09:33.6365531Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:09:33.8475619Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:09:33.8655152Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:09:33.8704395Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:09:33.8716774Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:09:33.8760145Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:09:33.8785000Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:09:33.8811316Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:09:33.8960134Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:09:33.9984866Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d325eb9fd156146f\n2022-03-21T22:09:34.0074610Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:09:34.0087465Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:09:34.0101743Z ##[error]Process completed with exit code 2.\n2022-03-21T22:09:34.0154014Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:09:34.0154246Z with:\n2022-03-21T22:09:34.0154412Z env:\n2022-03-21T22:09:34.0154574Z IN_CI: 1\n2022-03-21T22:09:34.0154728Z IS_GHA: 1\n2022-03-21T22:09:34.0154917Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:09:34.0155112Z ##[endgroup]\n2022-03-21T22:09:34.0191047Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge) (21/29)\nStep: \"Upload test statistics\" 
(full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:17.8502655Z [E request_callbac...yUniqueId(created_on=0, local_id=0) to be created.\n\n2022-03-21T21:03:14.4669960Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpxgdsmeer\n2022-03-21T21:03:14.4671407Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpxgdsmeer/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.4973023Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp1i2hfmpc\n2022-03-21T21:03:14.4973800Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp1i2hfmpc/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.5532339Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpgx4da7b0\n2022-03-21T21:03:14.5533064Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpgx4da7b0/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.7050673Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 0\n2022-03-21T21:03:14.7097127Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 3\n2022-03-21T21:03:14.7398339Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 2\n2022-03-21T21:03:14.7922283Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 1\n2022-03-21T21:03:17.8502655Z [E request_callback_no_python.cpp:559] Received error while processing request type 261: false INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp\":387, please report a bug to PyTorch. Expected OwnerRRef with id GloballyUniqueId(created_on=0, local_id=0) to be created.\n2022-03-21T21:03:17.8503603Z Exception raised from getOwnerRRef at /var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp:387 (most recent call first):\n2022-03-21T21:03:17.8504385Z frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x69 (0x7f180df19e19 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505131Z frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string, std::allocator > const&) + 0xd2 (0x7f180df160e2 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505927Z frame #2: c10::detail::torchInternalAssertFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string, std::allocator > const&) + 0x4e (0x7f180df17a7e in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8506674Z frame #3: torch::distributed::rpc::RRefContext::getOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, bool) + 0x4b4 (0x7f18118b7b64 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8507642Z frame #4: torch::distributed::rpc::RequestCallbackNoPython::assignOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, torch::distributed::rpc::GloballyUniqueId const&, c10::intrusive_ptr >) const + 0x70 (0x7f18118a7bf0 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8508613Z frame #5: torch::distributed::rpc::RequestCallbackImpl::processPythonRemoteCall(torch::distributed::rpc::RpcCommandBase&, std::vector >) const + 0xc8 (0x7f1819736208 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8509749Z frame #6: 
torch::distributed::rpc::RequestCallbackNoPython::processRpc(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector >) const + 0x194 (0x7f18118ac914 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8510708Z frame #7: torch::distributed::rpc::RequestCallbackImpl::processRpcWithErrors(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector >) const + 0x65 (0x7f1819735865 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8511369Z frame #8: + 0x375249a (0x7f18118a949a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test (22/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m echo \"ERR...t available for the merge-base of your branch\"\ufffd[0m\n\n2022-03-21T20:01:07.7012399Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7012634Z \ufffd[36;1m# Covers the case where a previous tag doesn't exist for the tree\ufffd[0m\n2022-03-21T20:01:07.7012992Z \ufffd[36;1m# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly\ufffd[0m\n2022-03-21T20:01:07.7013373Z \ufffd[36;1mif ! git rev-parse \"$MERGE_BASE:.circleci/docker\"; then\ufffd[0m\n2022-03-21T20:01:07.7013784Z \ufffd[36;1m echo \"Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit\"\ufffd[0m\n2022-03-21T20:01:07.7014149Z \ufffd[36;1m exit 1\ufffd[0m\n2022-03-21T20:01:07.7014325Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7014573Z \ufffd[36;1mPREVIOUS_DOCKER_TAG=$(git rev-parse \"$MERGE_BASE:.circleci/docker\")\ufffd[0m\n2022-03-21T20:01:07.7014907Z \ufffd[36;1m# If no image exists but the hash is the same as the previous hash then we should error out here\ufffd[0m\n2022-03-21T20:01:07.7015231Z \ufffd[36;1mif [[ \"${PREVIOUS_DOCKER_TAG}\" = \"${DOCKER_TAG}\" ]]; then\ufffd[0m\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m echo \"ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch\"\ufffd[0m\n2022-03-21T20:01:07.7015931Z \ufffd[36;1m echo \" contact the PyTorch team to restore the original images\"\ufffd[0m\n2022-03-21T20:01:07.7016225Z \ufffd[36;1m exit 1\ufffd[0m\n2022-03-21T20:01:07.7016400Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7016608Z \ufffd[36;1mecho ::set-output name=rebuild::yes\ufffd[0m\n2022-03-21T20:01:07.7027605Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}\n2022-03-21T20:01:07.7027837Z env:\n2022-03-21T20:01:07.7028006Z IN_CI: 1\n2022-03-21T20:01:07.7028159Z IS_GHA: 1\n2022-03-21T20:01:07.7028346Z GIT_DEFAULT_BRANCH: master\n2022-03-21T20:01:07.7028589Z BASE_REVISION: 6643522db9ff595f564b8081de58b3a33c546178\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu) (23/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:49:54.2949572Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:49:53.8049151Z + python3 -m pip install boto3==1.19.12\n2022-03-22T00:49:54.0981629Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-22T00:49:54.1207562Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages 
(1.19.12)\n2022-03-22T00:49:54.1277146Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-22T00:49:54.1315027Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-22T00:49:54.1331813Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-22T00:49:54.1391622Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:49:54.1609217Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-22T00:49:54.1637417Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:49:54.2830197Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f7c32fe13be12fea\n2022-03-22T00:49:54.2949572Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:49:54.2966933Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:49:54.2982588Z ##[error]Process completed with exit code 2.\n2022-03-22T00:49:54.3031464Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T00:49:54.3031794Z with:\n2022-03-22T00:49:54.3032012Z env:\n2022-03-22T00:49:54.3032227Z IN_CI: 1\n2022-03-22T00:49:54.3032434Z IS_GHA: 1\n2022-03-22T00:49:54.3032681Z GIT_DEFAULT_BRANCH: master\n2022-03-22T00:49:54.3033084Z GPU_FLAG: --gpus all\n2022-03-22T00:49:54.3033312Z ##[endgroup]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge) (24/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:56:07.3365589Z Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T21:56:07.7926584Z ---------------------------------------- 8.1/8.1 MB 17.3 MB/s eta 0:00:00\n2022-03-21T21:56:07.9319362Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T21:56:07.9366132Z Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T21:56:08.0077590Z -------------------------------------- 247.7/247.7 KB 3.0 MB/s eta 0:00:00\n2022-03-21T21:56:08.0164070Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:56:08.1775537Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:56:08.3393469Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T21:56:12.4576766Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T21:56:12.5641959Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0afad69838118af0e\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open 
file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:56:12.5905611Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:56:12.5927729Z ##[error]Process completed with exit code 2.\n2022-03-21T21:56:12.6239531Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T21:56:12.6240039Z with:\n2022-03-21T21:56:12.6240299Z env:\n2022-03-21T21:56:12.6240557Z IN_CI: 1\n2022-03-21T21:56:12.6240805Z IS_GHA: 1\n2022-03-21T21:56:12.6241118Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:56:12.6241613Z pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T21:56:12.6242052Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge) (25/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:46:39.5474616Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:46:39.1884210Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:46:39.3928976Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:46:39.4105069Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:46:39.4152571Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:46:39.4194931Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:46:39.4218947Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:46:39.4230812Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:46:39.4380089Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:46:39.4399461Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:46:39.5387703Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0888bed1149cca415\n2022-03-21T21:46:39.5474616Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:46:39.5487145Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:46:39.5497480Z ##[error]Process completed with exit code 2.\n2022-03-21T21:46:39.5541319Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:46:39.5541544Z with:\n2022-03-21T21:46:39.5541698Z env:\n2022-03-21T21:46:39.5541851Z IN_CI: 1\n2022-03-21T21:46:39.5541997Z IS_GHA: 1\n2022-03-21T21:46:39.5542176Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:46:39.5542361Z ##[endgroup]\n2022-03-21T21:46:39.5557878Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge) (26/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:34:57.0623859Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or 
directory\n\n2022-03-21T21:34:56.9039884Z Attempting uninstall: s3transfer\n2022-03-21T21:34:56.9041446Z Found existing installation: s3transfer 0.3.7\n2022-03-21T21:34:56.9090783Z Uninstalling s3transfer-0.3.7:\n2022-03-21T21:34:56.9095968Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:34:56.9453014Z Attempting uninstall: boto3\n2022-03-21T21:34:56.9454356Z Found existing installation: boto3 1.16.34\n2022-03-21T21:34:56.9564320Z Uninstalling boto3-1.16.34:\n2022-03-21T21:34:56.9578035Z Successfully uninstalled boto3-1.16.34\n2022-03-21T21:34:57.0091363Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:34:57.0536230Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-034a3afd5d80b91fd\n2022-03-21T21:34:57.0623859Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:34:57.0637167Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:34:57.0647396Z ##[error]Process completed with exit code 2.\n2022-03-21T21:34:57.0688237Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:34:57.0688481Z with:\n2022-03-21T21:34:57.0688631Z env:\n2022-03-21T21:34:57.0688769Z IN_CI: 1\n2022-03-21T21:34:57.0688930Z IS_GHA: 1\n2022-03-21T21:34:57.0689109Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:34:57.0689462Z ##[endgroup]\n2022-03-21T21:34:57.0704768Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge) (27/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7896545Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.7395504Z #10 0x5597fd5a9801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.7396330Z #11 0x5597fd5b47a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.7396688Z #12 0x5597fd5b480b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.7398664Z #13 0x5597fd5b4908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.7399177Z #14 0x5597fd5b4908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.7399663Z #15 0x5597fd5b4908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.7399986Z #16 0x5597fd5b4ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7895241Z #17 0x7f0a5905983f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7895772Z #18 0x5597fd559554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7896033Z \n2022-03-21T21:05:00.7896545Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.8063448Z + retcode=1\n2022-03-21T21:05:00.8063787Z + set -e\n2022-03-21T21:05:00.8064058Z + return 1\n2022-03-21T21:05:00.8067638Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.8068127Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.8069018Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.8069500Z + [[ default == 
\\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.8070105Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.8070580Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.8072640Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu) (28/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:48:17.3384813Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:48:16.8599645Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:48:17.1464241Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:48:17.1685222Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:48:17.1754164Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:48:17.1771662Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:48:17.1808722Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:48:17.1868636Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:48:17.1903889Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:48:17.2113746Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:48:17.3267404Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-01fe178c405417375\n2022-03-21T22:48:17.3384813Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:48:17.3402286Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:48:17.3418376Z ##[error]Process completed with exit code 2.\n2022-03-21T22:48:17.3470528Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:48:17.3470874Z with:\n2022-03-21T22:48:17.3471096Z env:\n2022-03-21T22:48:17.3471327Z IN_CI: 1\n2022-03-21T22:48:17.3471538Z IS_GHA: 1\n2022-03-21T22:48:17.3471802Z GIT_DEFAULT_BRANCH: master\n2022-03-21T22:48:17.3472083Z GPU_FLAG: --gpus all\n2022-03-21T22:48:17.3472322Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge) (29/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:16:38.9646300Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:16:38.7995969Z Attempting uninstall: s3transfer\n2022-03-21T21:16:38.7998039Z Found existing installation: s3transfer 0.3.7\n2022-03-21T21:16:38.8066994Z Uninstalling s3transfer-0.3.7:\n2022-03-21T21:16:38.8072844Z Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:16:38.8449275Z Attempting uninstall: boto3\n2022-03-21T21:16:38.8451430Z Found existing installation: boto3 1.16.34\n2022-03-21T21:16:38.8559828Z Uninstalling 
boto3-1.16.34:\n2022-03-21T21:16:38.8574290Z Successfully uninstalled boto3-1.16.34\n2022-03-21T21:16:38.9100438Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:16:38.9558098Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d779c59d277d32ee\n2022-03-21T21:16:38.9646300Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:16:38.9658894Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:16:38.9673240Z ##[error]Process completed with exit code 2.\n2022-03-21T21:16:38.9720106Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:16:38.9720333Z with:\n2022-03-21T21:16:38.9720485Z env:\n2022-03-21T21:16:38.9720645Z IN_CI: 1\n2022-03-21T21:16:38.9720793Z IS_GHA: 1\n2022-03-21T21:16:38.9720970Z GIT_DEFAULT_BRANCH: master\n2022-03-21T21:16:38.9721151Z ##[endgroup]\n2022-03-21T21:16:38.9736762Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 964902894 + }, + { + "bodyText": "@vitaly-fedyunin @gottbrath FYI that this is the oneDNN Graph API integration. It depends on the #63748.", + "author": { + "login": "Jianhui-Li" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 970451860 + }, + { + "bodyText": "CI failures are currently being caused by some issues in the CI infra, and are also occurring with other PRs.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 990641309 + }, + { + "bodyText": "CI failures are unrelated.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 991281407 + }, + { + "bodyText": "The CI failure is unrelated.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 995389295 + }, + { + "bodyText": "Hi, thank you for the PR!\nDo you mind running a larger amount of torchbench and reporting numbers ? You can look at Jason's post here for what models are supported in script. Initially just the vision models would be useful. @Krovatkin also did some benchmarking of a traced Bert model and found on average a ~16% speedup with this PR.", + "author": { + "login": "eellison" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1015689390 + }, + { + "bodyText": "Thanks a lot for reviewing, @eellison & @Krovatkin!\nWe just wanted to let you know that we're working on the benchmarking & will get back to you in a day, or two.\nUPDATE (Jan 21): While running some TorchBench models, we discovered some composability issues, and are working to ensure that oneDNN Graph would complement PyTorch's existing fusion capabilities, not hinder them.\nUPDATE (Jan 24): We've resolved the issues & will update this PR later today. Thanks!", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1016996190 + }, + { + "bodyText": "Hello @eellison,\nWe used this TorchBench branch for comparison. 
compare_llga.sh can be run for comparison.\nFor benchmarking mobilenet_v3_large with hardswish support in oneDNN Graph, this oneDNN Graph branch can be used in third_party/ideep/mkl-dnn. It delivers a speedup over PyTorch JIT (NNC + OFI) because 21 additional reorders are prevented (the major factor here), and fusion with conv also helps further.\nThe next release of oneDNN Graph would have hardswish support.\nWe're also exploring adding a hardsigmoid op in oneDNN Graph.\nThank you!", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1022709513 + }, + { + "bodyText": "Please note that this PR should be merged after #71546, as #71546 changes the third_party/ideep commit (this PR also uses that ideep commit, but it'd probably be better to merge #71546 first, so that oneDNN v2.5.2 upgrade would be in a separate PR). Thank you!", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": null, + "databaseId": 1026330085 + }, + { + "bodyText": "@sanchitintel mind rebasing and i'll land ?", + "author": { + "login": "eellison" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1055813984 + }, + { + "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1057203495 + }, + { + "bodyText": "Thanks a lot for taking a look, @eellison! To fix this error, we would enable Bazel build for oneDNN Graph.", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1061230087 + }, + { + "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1063276600 + }, + { + "bodyText": "@malfet has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074355779 + }, + { + "bodyText": "And graph_rewriter.cpp is full of DOS newlines...", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1074407452 + }, + { + "bodyText": "Hey @chunyuan-w.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1074471758 + }, + { + "bodyText": "Thanks a ton for your help, @malfet & @eellison! 
:)\nWe'll incorporate your suggestions in subsequent PR(s).", + "author": { + "login": "sanchitintel" + }, + "authorAssociation": "CONTRIBUTOR", + "editor": { + "login": "sanchitintel" + }, + "databaseId": 1074492365 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOOYM_0Q==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=73969 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": true, + "author": { + "login": "malfet" + }, + "title": "Dummy change", + "body": "Test Plan: None at all\n\nDifferential Revision: D34753911\n\n", + "headRefName": "export-D34753911", + "headRepository": { + "nameWithOwner": "malfet/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "4746da707a9912356f5179625da89616b228dc21" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928580?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483086020?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRQMQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592963" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928547?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592965" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928602?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483235366?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483235570?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.rocm.gpu)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/runs/5483235708?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbTiXw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592966" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928594?check_suite_focus=true" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483593208?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483593337?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483593461?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbY_vU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592967" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928554?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2ao=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592969" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928595?check_suite_focus=true" + }, + { + "name": "build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483078289?check_suite_focus=true" + }, + { + "name": "build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483078365?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRIt0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592970" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928553?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483074693?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, 
linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483074951?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483075182?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRFm4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592971" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928556?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aw=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592974" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928552?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928797?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929069?check_suite_focus=true" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929350?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929628?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929838?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482929972?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482930102?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482930251?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO4Es=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592975" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928573?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2b0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592976" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RA=", + "hasNextPage": true + } + }, + "oid": "4746da707a9912356f5179625da89616b228dc21" + } + } + ] + }, + "changedFiles": 1, + "files": { + "nodes": [ + { + "path": "tools/build_variables.bzl" + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [], + "pageInfo": { + "startCursor": null, + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/malfet/pytorch/blob/4746da707a9912356f5179625da89616b228dc21/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-manywheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-rocm4.5-py3.7\nciflow/all, ciflow/default, ciflow/linux, ciflow/rocm, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build\nciflow/all, ciflow/cpu, ciflow/default, ciflow/libtorch, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nmacos-arm64-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-arm64-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 
triggered\n\n\nmacos-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwindows-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nwindows-binary-libtorch-debug\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-libtorch-release\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-wheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab 
skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.3-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\npytorch-xla-linux-bionic-py3.7-clang8\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk, ciflow/xla\n\ud83d\udeab skipped", + "author": { + "login": "pytorch-bot" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1063079053 + }, + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/73969\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 4746da7 (more details on the Dr. CI page):\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1063079113 + }, + { + "bodyText": "This pull request was exported from Phabricator. 
Differential Revision: D34753911", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1063079731 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOP11MjQ==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=73099 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "BowenBao" + }, + "title": "[ONNX] Make graph name spec-compliant (#71961)", + "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* #73104\n* #73103\n* #73102\n* #73101\n* #73100\n* __->__ #73099\n\n[According to the ONNX spec](https://github.com/onnx/onnx/blob/main/docs/IR.md#names-within-a-graph),\nall names must adhere to C90 identifier syntax rules, which means no\ndashes.\n\nFixes: #30952", + "headRefName": "gh/BowenBao/138/head", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "gh/BowenBao/138/base", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "BowenBao" + }, + "email": "bowbao@microsoft.com", + "name": "BowenBao" + }, + "oid": "3038b939eb2069653305c419326a0f47d2598e39" + } + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + }, + "totalCount": 1 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161498?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNn9o=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189561" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161648?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252387496?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252387628?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252387825?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkRE_E=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189562" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161681?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189563" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161670?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoIY=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189564" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161691?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189566" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161678?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286900?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252287072?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252287232?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiwA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189567" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161699?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252302340?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPxgQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189568" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161696?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoKA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189570" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161646?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252830090?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252830141?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkX070=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189571" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252161666?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286386?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286526?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5252286720?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiQA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189572" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8Q=", + "hasNextPage": true + } + }, + "oid": "3038b939eb2069653305c419326a0f47d2598e39" + } + } + ] + }, + "changedFiles": 162, + "files": { + "nodes": [ + { + "path": "test/onnx/expect/TestOperators.test_acos.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_left_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_addconstant.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_addmm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_arange_dynamic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_argmax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_asin.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_at_op.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_atan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_aten_embedding_1.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_aten_embedding_2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_avg_pool2d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_baddbmm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_basic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_1d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_batchnorm_training.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_bitshift.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_c2_op.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_chunk.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip_max.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_clip_min.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_concat2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_convtranspose.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_cos.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_cumsum.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_det.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dict.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dict_str.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_default.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_opset12.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_training.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dropout_training_opset12.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_elu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_embedding_bags.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_empty_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_empty_like_opset7.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_equal.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_erf.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_exp.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_expand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_flatten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_flatten2D.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_fmod.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_frobenius_norm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_full.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_full_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gather.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gather_opset11.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ge.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gelu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_gt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_hardtanh.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_implicit_expand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_index.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_isnan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_le.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_linear.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_log_sigmoid.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_logsoftmax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_lt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_master_opset.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_max.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool_dilations.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_maxpool_indices.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mean_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_meshgrid.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_min.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_mm.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_narrow.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ne.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_nonzero.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_norm_p1.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_norm_p2.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_ones_like.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_pad.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_params.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_params_onnx_irv4.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_permute2.expect" + } + ], + "pageInfo": { + "endCursor": "MTAw", + "hasNextPage": true + } + }, + 
"reviews": { + "nodes": [ + { + "author": { + "login": "garymm" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMi0xOFQxNzoxODo0NC0wODowMLkyMDIyLTAyLTE4VDE3OjE4OjQ0LTA4OjAwzjTr0H0=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet \n \n \n pytorch/.github/scripts/trymerge.py\n \n \n Line 63\n in\n 932adf2\n \n \n \n \n\n \n \n files(last: 100) { \n \n \n \n\n Can this be relaxed? If not please import.", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1048084569 + }, + { + "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet\nCan this be relaxed? If not please import.\n\nWow, you've hit a really interesting problem. 100 is a limitation enforced by GitHub, see https://docs.github.com/en/graphql/overview/resource-limitations, but I can implement a pagination. Do you mind keeping it like that for a bit, want to land a fix soonish.", + "author": { + "login": "malfet" + }, + "authorAssociation": "MEMBER", + "editor": null, + "databaseId": 1048088691 + }, + { + "bodyText": "@malfet Thank you for info. Sure, I have separated the rest of stack from this one, we'll wait for the fix to try again.", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1048090640 + }, + { + "bodyText": "@pytorchbot merge this", + "author": { + "login": "BowenBao" + }, + "authorAssociation": "COLLABORATOR", + "editor": null, + "databaseId": 1050293881 + }, + { + "bodyText": "Hey @BowenBao.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' 
and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.", + "author": { + "login": "github-actions" + }, + "authorAssociation": "NONE", + "editor": null, + "databaseId": 1050295451 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOPniAWQ==", + "hasPreviousPage": true + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=73099 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "test/onnx/expect/TestOperators.test_pixel_shuffle.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_pow.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_prelu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_prod.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_prod_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rand.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_randn.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_mean.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_prod.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_sum.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reducemax.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_reducemin.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_remainder.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_repeat.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_round.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rrelu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rsqrt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_rsub.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_scatter_add.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_scatter_add_opset11.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_selu.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_shape_value_map.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sign.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sin.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_slice.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_slice_dynamic.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect" + }, + { + "path": 
"test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_split.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_split_with_sizes.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sqrt.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_std.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sum.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_sum_dtype.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_tan.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_topk.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_transpose.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_type_as.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_unfold.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_unique.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_unsqueeze.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_upsample_nearest_size.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_view.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_view_flatten.expect" + }, + { + "path": "test/onnx/expect/TestOperators.test_zeros_like.expect" + }, + { + "path": "torch/csrc/jit/serialization/export.cpp" + }, + { + "path": "torch/csrc/jit/serialization/export.h" + } + ], + "pageInfo": { + "endCursor": "MTYy", + "hasNextPage": false + } + } + } + } + } + }, + "query_sha=f357fcdcc09e1433307da0c33debf9c08279ffd70d5bbc31323994f6346c629f name=pytorch number=74649 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "closed": true, + "isCrossRepository": false, + "author": { + "login": "malfet" + }, + "title": "This should fail flake8", + "body": "Test issue for GHF mandatory checks", + "headRefName": "malfet-patch-8", + "headRepository": { + "nameWithOwner": "pytorch/pytorch" + }, + "baseRefName": "master", + "baseRepository": { + "nameWithOwner": "pytorch/pytorch", + "isPrivate": false, + "defaultBranchRef": { + "name": "master" + } + }, + "mergeCommit": null, + "commits_with_authors": { + "nodes": [ + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "57c86ff1c5ab948888fd329986c9d55796680e33" + } + }, + { + "commit": { + "author": { + "user": { + "login": "malfet" + }, + "email": "nshulga@fb.com", + "name": "Nikita Shulga" + }, + "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4" + } + } + ], + "pageInfo": { + "endCursor": "Mg", + "hasNextPage": false + }, + "totalCount": 2 + }, + "commits": { + "nodes": [ + { + "commit": { + "checkSuites": { + "nodes": [ + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsK3w=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018129" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018131" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018132" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018134" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018139" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018142" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669399915?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669399990?check_suite_focus=true" + }, + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400052?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400154?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400239?check_suite_focus=true" + }, + { + "name": "Test collect_env (with_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400327?check_suite_focus=true" + }, + { + "name": "Test collect_env (without_torch)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400361?check_suite_focus=true" + }, + { + "name": "Test tools", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400470?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400681?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": 
"https://github.com/pytorch/pytorch/runs/5669400789?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669400953?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669401126?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsMiY=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018384" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669399917?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsLW0=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018395" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pull" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414276?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414324?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414430?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414605?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414697?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414841?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669414951?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415003?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415060?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415120?check_suite_focus=true" + }, + { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415166?check_suite_focus=true" + }, + { + "name": 
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415236?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415288?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415348?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7-no-ops / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415451?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415561?check_suite_focus=true" + }, + { + "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415607?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415642?check_suite_focus=true" + }, + { + "name": "pytorch-xla-linux-bionic-py3.7-clang8", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415706?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669415757?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669488974?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669489019?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492162?check_suite_focus=true" + }, + { + "name": "linux-docs / build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492211?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492293?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492341?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492396?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492440?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492497?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + 
"detailsUrl": "https://github.com/pytorch/pytorch/runs/5669492558?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669496296?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669496350?check_suite_focus=true" + }, + { + "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669496393?check_suite_focus=true" + }, + { + "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669498726?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669500818?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669500848?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669518721?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669518760?check_suite_focus=true" + }, + { + "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669518798?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669549301?check_suite_focus=true" + }, + { + "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669549318?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669559843?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669567414?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669567499?check_suite_focus=true" + }, + { + "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669567553?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669619773?check_suite_focus=true" + }, + { + "name": "win-vs2019-cpu-py3 / test (default, 2, 2, 
windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669619803?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669724420?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669724451?check_suite_focus=true" + }, + { + "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5669724478?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHxIT4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018405" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkGU=", + "hasNextPage": false + } + }, + "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4" + } + } + ] + }, + "changedFiles": 1, + "files": { + "nodes": [ + { + "path": "torch/nn/cpp.py" + } + ], + "pageInfo": { + "endCursor": "MQ", + "hasNextPage": false + } + }, + "reviews": { + "nodes": [ + { + "author": { + "login": "seemethere" + }, + "state": "APPROVED" + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0yM1QxNTo1MDo0NS0wNzowMLkyMDIyLTAzLTIzVDE1OjUwOjQ1LTA3OjAwzjbPEDg=", + "hasPreviousPage": false + } + }, + "comments": { + "nodes": [ + { + "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/74649\n\u21a9\ufe0f \u00a0[fb-only] Re-run with SSH instructions\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 6c3c3de (more details on the Dr. CI page):\n\n\n1/1 failures introduced in this PR\n\n\n1 failure not recognized by patterns:\n\n\n\nJob\nStep\nAction\n\n\n\n\n Lint / flake8-py3\nFail if there were any warnings\n\ud83d\udd01 rerun\n\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. 
CI Users group.\nClick here to manually regenerate this comment.", + "author": { + "login": "facebook-github-bot" + }, + "authorAssociation": "MEMBER", + "editor": { + "login": "facebook-github-bot" + }, + "databaseId": 1076891218 + } + ], + "pageInfo": { + "startCursor": "Y3Vyc29yOnYyOpHOQDAOUg==", + "hasPreviousPage": false + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "dreiss" + }, + { + "login": "kumpera" + }, + { + "login": "ezyang" + }, + { + "login": "stephenroller" + }, + { + "login": "swolchok" + }, + { + "login": "hyuen" + }, + { + "login": "orionr" + }, + { + "login": "dhruvbird" + }, + { + "login": "likethesky" + }, + { + "login": "lw" + }, + { + "login": "raziel" + }, + { + "login": "simpkins" + }, + { + "login": "ebyrne" + }, + { + "login": "Babar" + }, + { + "login": "kostmo" + }, + { + "login": "0x00b1" + }, + { + "login": "bhosmer" + }, + { + "login": "zdevito" + }, + { + "login": "bugra" + }, + { + "login": "caraya10" + }, + { + "login": "kit1980" + }, + { + "login": "shoumikhin" + }, + { + "login": "teytaud" + }, + { + "login": "xuzhao9" + }, + { + "login": "jansel" + }, + { + "login": "abhinavarora" + }, + { + "login": "b0noI" + }, + { + "login": "djthorne" + }, + { + "login": "nairbv" + }, + { + "login": "Mortimerp9" + }, + { + "login": "dadkins20" + }, + { + "login": "colesbury" + }, + { + "login": "laurencer" + }, + { + "login": "nickgg" + }, + { + "login": "yzhao30" + }, + { + "login": "bearzx" + }, + { + "login": "mattjgalloway" + }, + { + "login": "chenyang78" + }, + { + "login": "yns88" + }, + { + "login": "lc0" + }, + { + "login": "wenleix" + }, + { + "login": "jingsh" + }, + { + "login": "mthrok" + }, + { + "login": "drdarshan" + }, + { + "login": "tvalentius" + }, + { + "login": "d4l3k" + }, + { + "login": "jamiemccrindle" + }, + { + "login": "kazhang" + }, + { + "login": "simonhollis" + }, + { + "login": "lqiao" + }, + { + "login": "ajyu" + }, + { + "login": "govardhan" + }, + { + "login": "yinghai" + }, + { + "login": "zyan0" + }, + { + "login": "ajtulloch" + }, + { + "login": "pbelevich" + }, + { + "login": "VitalyFedyunin" + }, + { + "login": "dbish" + }, + { + "login": "NicolasHug" + }, + { + "login": "efaust" + }, + { + "login": "idning" + }, + { + "login": "soumith" + }, + { + "login": "nimin98" + }, + { + "login": "chaekit" + }, + { + "login": "radkris-git" + }, + { + "login": "javier-m" + }, + { + "login": "jmdetloff" + }, + { + "login": "mostafaelhoushi" + }, + { + "login": "brianjo" + }, + { + "login": "ShijunK" + }, + { + "login": "suo" + }, + { + "login": "vkuzo" + }, + { + "login": "seemethere" + }, + { + "login": "cpuhrsch" + }, + { + "login": "qihqi" + }, + { + "login": "jackm321" + }, + { + "login": "linbinyu" + }, + { + "login": "neerajprad" + }, + { + "login": "rsemenov" + }, + { + "login": "ziky90" + }, + { + "login": "gmagogsfm" + }, + { + "login": "zzzwen" + }, + { + "login": "ikriv" + }, + { + "login": "deeptigp" + }, + { + "login": "andrewor14" + }, + { + "login": "jianyuh" + }, + { + "login": "cykustcc" + }, + { + "login": "highker" + }, + { + "login": "navahgar" + }, + { + "login": "beauby" + }, + { + "login": "jeffreyksmithjr" + }, + { + "login": "suphoff" + }, + { + "login": "smessmer" + }, + { + "login": "ananthsub" + }, + { + "login": "d1jang" + }, + { + "login": "firstprayer" + }, + { + "login": "malfet" + }, + { + "login": "fegin" + }, + { + 
"login": "hanton" + }, + { + "login": "zanqi" + } + ], + "pageInfo": { + "hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOACa60A==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOACa60A== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "bujar" + }, + { + "login": "supriyar" + }, + { + "login": "kausv" + }, + { + "login": "divchenko" + }, + { + "login": "dagitses" + }, + { + "login": "rahuln32" + }, + { + "login": "bilgeacun" + }, + { + "login": "caogao" + }, + { + "login": "blefaudeux" + }, + { + "login": "miguelmartin75" + }, + { + "login": "penguinwu" + }, + { + "login": "shz117" + }, + { + "login": "ajliu" + }, + { + "login": "saketh-are" + }, + { + "login": "jessebrizzi" + }, + { + "login": "msaroufim" + }, + { + "login": "mdundas" + }, + { + "login": "davides" + }, + { + "login": "alannnna" + }, + { + "login": "hlin09" + }, + { + "login": "terrychenism" + }, + { + "login": "xiaomengy" + }, + { + "login": "jisaacso" + }, + { + "login": "fkhan1337" + }, + { + "login": "xing-liu" + }, + { + "login": "alanadakotashine" + }, + { + "login": "desertfire" + }, + { + "login": "banitag1" + }, + { + "login": "letterx" + }, + { + "login": "gchanan" + }, + { + "login": "dbort" + }, + { + "login": "bilalsal" + }, + { + "login": "jaceyca" + }, + { + "login": "serhaty" + }, + { + "login": "yf225" + }, + { + "login": "yifuwang" + }, + { + "login": "piyushmh" + }, + { + "login": "z-a-f" + }, + { + "login": "superzgc" + }, + { + "login": "tenpercent" + }, + { + "login": "bertmaher" + }, + { + "login": "chauhang" + }, + { + "login": "jiayisuse" + }, + { + "login": "bradleyhd" + }, + { + "login": "ZolotukhinM" + }, + { + "login": "jamesr66a" + }, + { + "login": "mullachv" + }, + { + "login": "voznesenskym" + }, + { + "login": "charliechen0401" + }, + { + "login": "bwasti" + }, + { + "login": "cryptopic" + }, + { + "login": "chinannyang" + }, + { + "login": "NivekT" + }, + { + "login": "zhxchen17" + }, + { + "login": "jerryzh168" + }, + { + "login": "MohammadMahdiJavanmard" + }, + { + "login": "rajkar86" + }, + { + "login": "wconstab" + }, + { + "login": "Hangjun" + }, + { + "login": "davidberard98" + }, + { + "login": "Krovatkin" + }, + { + "login": "CamiWilliams" + }, + { + "login": "J0Nreynolds" + }, + { + "login": "datumbox" + }, + { + "login": "aartibasant" + }, + { + "login": "xta0" + }, + { + "login": "zou3519" + }, + { + "login": "xman1979" + }, + { + "login": "suraj813" + }, + { + "login": "gqchen" + }, + { + "login": "jayleverett" + }, + { + "login": "george-qi" + }, + { + "login": "abhikrish" + }, + { + "login": "zhangguanheng66" + }, + { + "login": "mikeiovine" + }, + { + "login": "Adolfo-Karim" + }, + { + "login": "Chillee" + }, + { + "login": "albanD" + }, + { + "login": "bigfootjon" + }, + { + "login": "robotal" + }, + { + "login": "MarcioPorto" + }, + { + "login": "srsuryadev" + }, + { + "login": "IvanKobzarev" + }, + { + "login": "eprivezentsev" + }, + { + "login": "kwen2501" + }, + { + "login": "linux-jedi" + }, + { + "login": "chandlerzuo" + }, + { + "login": "prateek1404" + }, + { + "login": "otsneh" + }, + { + "login": "husthyc" + }, + { + "login": "briancoutinho" + }, + { + "login": "fduwjj" + }, + { + "login": "frank-wei" + }, + { + "login": "esqu1" + }, + { + "login": "prabhat00155" + }, + { + "login": "Gamrix" + }, + { + "login": "QuentinDuval" + }, + { + "login": "atalman" + }, + { + "login": "xush6528" + }, + { + "login": 
"dracifer" + } + ], + "pageInfo": { + "hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOAHSKuw==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOAHSKuw== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "SS-JIA" + }, + { + "login": "helunwencser" + }, + { + "login": "xw285cornell" + }, + { + "login": "hhbyyh" + }, + { + "login": "rohan-varma" + }, + { + "login": "teng-li" + }, + { + "login": "larryliu0820" + }, + { + "login": "lyoka" + }, + { + "login": "cbalioglu" + }, + { + "login": "hl475" + }, + { + "login": "hwangjeff" + }, + { + "login": "Jack-Khuu" + }, + { + "login": "alanwaketan" + }, + { + "login": "mehtanirav" + }, + { + "login": "nateanl" + }, + { + "login": "fuqianz" + }, + { + "login": "boyuantan" + }, + { + "login": "muntaqim" + }, + { + "login": "dennysem" + }, + { + "login": "ymao1993" + }, + { + "login": "fmassa" + }, + { + "login": "esantorella" + }, + { + "login": "HamidShojanazeri" + }, + { + "login": "jubinchheda" + }, + { + "login": "mehdimashayekhi" + }, + { + "login": "rkindi" + }, + { + "login": "wanchaol" + }, + { + "login": "zephirefaith" + }, + { + "login": "alexbeloi" + }, + { + "login": "kapilsh" + }, + { + "login": "plahera" + }, + { + "login": "SherlockNoMad" + }, + { + "login": "pritamdamania87" + }, + { + "login": "rahxephon89" + }, + { + "login": "iseeyuan" + }, + { + "login": "Matphyler" + }, + { + "login": "protonu" + }, + { + "login": "terhuhf" + }, + { + "login": "aruntonic" + }, + { + "login": "gcatron" + }, + { + "login": "yingrliu" + }, + { + "login": "alexanderguzhva" + }, + { + "login": "angelayi" + }, + { + "login": "zhaoalex" + }, + { + "login": "shahofblah" + }, + { + "login": "vivekmig" + }, + { + "login": "jspisak" + }, + { + "login": "akshaypandian" + }, + { + "login": "HarutMov" + }, + { + "login": "tktrungna" + }, + { + "login": "eellison" + }, + { + "login": "ziab" + }, + { + "login": "NarineK" + }, + { + "login": "andrewconnors" + }, + { + "login": "wenwei202" + }, + { + "login": "jg2912" + }, + { + "login": "jwpark1985" + }, + { + "login": "robieta" + }, + { + "login": "amirhmk" + }, + { + "login": "davidxili" + }, + { + "login": "mreso" + }, + { + "login": "soulitzer" + }, + { + "login": "prigoyal" + }, + { + "login": "PaliC" + }, + { + "login": "anijain2305" + }, + { + "login": "pvtuan10" + }, + { + "login": "huangyi1979" + }, + { + "login": "osalpekar" + }, + { + "login": "xiaohui-zhang" + }, + { + "login": "jerry39213gh" + }, + { + "login": "jarodhou" + }, + { + "login": "hlu1" + }, + { + "login": "huiguoo" + }, + { + "login": "H-Huang" + }, + { + "login": "vtsyvina" + }, + { + "login": "qchip" + }, + { + "login": "Nitrokitty" + }, + { + "login": "satgera" + }, + { + "login": "ngimel" + }, + { + "login": "dongreenberg" + }, + { + "login": "sijiac" + }, + { + "login": "markkm" + }, + { + "login": "EscapeZero" + }, + { + "login": "bdhirsh" + }, + { + "login": "cccclai" + }, + { + "login": "carolineechen" + }, + { + "login": "tugsbayasgalan" + }, + { + "login": "agunapal" + }, + { + "login": "frankseide" + }, + { + "login": "YazhiGao" + }, + { + "login": "pavithranrao" + }, + { + "login": "VirgileHlav" + }, + { + "login": "mrshenli" + }, + { + "login": "lena-kashtelyan" + }, + { + "login": "brad-mengchi" + }, + { + "login": "kimishpatel" + }, + { + "login": "aaronenyeshi" + }, + { + "login": "shajrawi" + }, + { + "login": "samdow" + }, + { + "login": "dzhulgakov" + } + ], + "pageInfo": 
{ + "hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOARD9PA==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOARD9PA== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "great-way" + }, + { + "login": "ashkan-software" + }, + { + "login": "garroud" + }, + { + "login": "knottb" + }, + { + "login": "jbitton" + }, + { + "login": "jdsgomes" + }, + { + "login": "zhangxy988" + }, + { + "login": "samlurye" + }, + { + "login": "EdwardTyantov" + }, + { + "login": "anjali411" + }, + { + "login": "kryanchun" + }, + { + "login": "842974287" + }, + { + "login": "JacobSzwejbka" + }, + { + "login": "nishantpdce" + }, + { + "login": "srinivas212" + }, + { + "login": "cherie11" + }, + { + "login": "shreyanb98" + }, + { + "login": "kavoor" + }, + { + "login": "dzdang" + }, + { + "login": "naveedgol" + }, + { + "login": "Nayef211" + }, + { + "login": "zrphercule" + }, + { + "login": "HengruiX" + }, + { + "login": "langong347" + }, + { + "login": "soapisnotfat" + }, + { + "login": "ebsmothers" + }, + { + "login": "anshuljain1" + }, + { + "login": "b-koopman" + }, + { + "login": "salilsdesai" + }, + { + "login": "vmoens" + }, + { + "login": "printfoo" + }, + { + "login": "xinyang0" + }, + { + "login": "ramvenkat98" + }, + { + "login": "fbbradheintz" + }, + { + "login": "kauterry" + }, + { + "login": "VenkatSubramaniam" + }, + { + "login": "yxia11" + }, + { + "login": "anirbanraywork" + }, + { + "login": "houseroad" + }, + { + "login": "erichan1" + }, + { + "login": "hsrussell" + }, + { + "login": "ilia-cher" + }, + { + "login": "ajitmaths" + }, + { + "login": "awgu" + }, + { + "login": "wz337" + }, + { + "login": "LynneD" + }, + { + "login": "qxy11" + }, + { + "login": "janeyx99" + }, + { + "login": "msedwar" + }, + { + "login": "dustinh1999" + }, + { + "login": "glaringlee" + }, + { + "login": "anj-s" + }, + { + "login": "liuchen9494" + }, + { + "login": "drisspg" + }, + { + "login": "RdoubleA" + }, + { + "login": "jramseyer" + }, + { + "login": "zengk95" + }, + { + "login": "gtarjun" + }, + { + "login": "mikaylagawarecki" + }, + { + "login": "xianxl" + }, + { + "login": "lucasgadams" + }, + { + "login": "mingzhe09088" + }, + { + "login": "Vucibatina" + }, + { + "login": "aazzolini" + }, + { + "login": "nataliakliushkina" + }, + { + "login": "mruberry" + }, + { + "login": "HDCharles" + }, + { + "login": "mcr229" + }, + { + "login": "manuelcandales" + }, + { + "login": "guangy10" + }, + { + "login": "mengwa41" + }, + { + "login": "hx89" + }, + { + "login": "kiukchung" + }, + { + "login": "hanhsienhuang" + }, + { + "login": "clee2000" + }, + { + "login": "lhuang04" + }, + { + "login": "sidneyfletcher" + }, + { + "login": "gottbrath" + }, + { + "login": "lessw2020" + }, + { + "login": "choward232" + }, + { + "login": "mmh683" + }, + { + "login": "dwarakrajagopal" + }, + { + "login": "lazysjb" + }, + { + "login": "zhaojuanmao" + }, + { + "login": "johncalab" + }, + { + "login": "dhthompson" + }, + { + "login": "superwizard2019" + }, + { + "login": "fbhuba" + }, + { + "login": "shunting314" + }, + { + "login": "edward-io" + }, + { + "login": "sean-ngo" + }, + { + "login": "bzinodev" + }, + { + "login": "xcheng16" + }, + { + "login": "adamomainz" + }, + { + "login": "sluks" + }, + { + "login": "poojahp" + }, + { + "login": "ansley" + }, + { + "login": "mvsampath" + }, + { + "login": "cheetah2216" + }, + { + "login": "pinaki-mukerji" + } + ], + "pageInfo": { + 
"hasNextPage": true, + "endCursor": "Y3Vyc29yOnYyOpHOA7KsGw==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=Y3Vyc29yOnYyOpHOA7KsGw== name=metamates org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "hongxiayang" + }, + { + "login": "kyulee-com" + }, + { + "login": "sstsai-adl" + }, + { + "login": "dahsh" + }, + { + "login": "ohgnoes" + }, + { + "login": "szewaiyuen7" + }, + { + "login": "byterover" + }, + { + "login": "ejguan" + }, + { + "login": "nimaelyasi" + }, + { + "login": "nikithamalgifb" + }, + { + "login": "qxu-fb" + }, + { + "login": "sshawnwu" + }, + { + "login": "andrewyounkins" + }, + { + "login": "njuvekar" + }, + { + "login": "iramazanli" + }, + { + "login": "jnkwok1" + }, + { + "login": "kurman" + }, + { + "login": "jbschlosser" + }, + { + "login": "ccongge" + }, + { + "login": "haichuan-fb" + }, + { + "login": "wwang84" + }, + { + "login": "JustinPinero" + }, + { + "login": "gcramer23" + }, + { + "login": "woo-kim" + }, + { + "login": "yuguo68" + }, + { + "login": "chowarfb" + }, + { + "login": "priyaramani" + }, + { + "login": "yidawang-oss" + }, + { + "login": "beback4u" + }, + { + "login": "asalioufb" + }, + { + "login": "four4fish" + }, + { + "login": "kkosik20" + }, + { + "login": "KZFB" + }, + { + "login": "sisilmehta2000" + }, + { + "login": "henryliu-bluehills" + }, + { + "login": "madhu-fb" + }, + { + "login": "muchulee8" + }, + { + "login": "anirbanr-fb-r2p" + } + ], + "pageInfo": { + "hasNextPage": false, + "endCursor": "Y3Vyc29yOnYyOpHOBkbBhA==" + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MTAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "docs/source/quantization.rst" + }, + { + "path": "docs/source/scripts/build_quantization_configs.py" + }, + { + "path": "test/allowlist_for_publicAPI.json" + }, + { + "path": "test/cpp/jit/source_range_test.cpp" + }, + { + "path": "test/cpp/jit/test_backend.cpp" + }, + { + "path": "test/cpp/jit/test_flatbuffer.cpp" + }, + { + "path": "test/cpp/jit/test_misc.cpp" + }, + { + "path": "test/cpp/jit/test_utils.h" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff" + }, + { + "path": "test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff" + }, + { + "path": "test/cpp/profiler/record_function.cpp" + }, + { + "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py" + }, + { + "path": "test/distributed/_shard/test_replicated_tensor.py" + }, + { + "path": "test/distributed/fsdp/test_fsdp_comm.py" + }, + { + "path": 
"test/distributed/fsdp/test_fsdp_optim_state.py" + }, + { + "path": "test/distributed/optim/test_zero_redundancy_optimizer.py" + }, + { + "path": "test/jit/test_export_modes.py" + }, + { + "path": "test/jit/test_if_hoisting.py" + }, + { + "path": "test/jit/test_tracer.py" + }, + { + "path": "test/jit/test_upgraders.py" + }, + { + "path": "test/mobile/test_lite_script_type.py" + }, + { + "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect" + }, + { + "path": "test/onnx/test_operators.py" + }, + { + "path": "test/onnx/test_pytorch_onnx_onnxruntime.py" + }, + { + "path": "test/quantization/ao_migration/test_quantization_fx.py" + }, + { + "path": "test/quantization/core/test_quantized_op.py" + }, + { + "path": "test/quantization/core/test_quantized_tensor.py" + }, + { + "path": "test/quantization/fx/test_numeric_suite_fx.py" + }, + { + "path": "test/quantization/fx/test_quantize_fx.py" + }, + { + "path": "test/test_autograd.py" + }, + { + "path": "test/test_binary_ufuncs.py" + }, + { + "path": "test/test_expanded_weights.py" + }, + { + "path": "test/test_functionalization.py" + }, + { + "path": "test/test_fx_experimental.py" + }, + { + "path": "test/test_jit.py" + }, + { + "path": "test/test_jit_cuda_fuser.py" + }, + { + "path": "test/test_linalg.py" + }, + { + "path": "test/test_nestedtensor.py" + }, + { + "path": "test/test_nn.py" + }, + { + "path": "test/test_ops.py" + }, + { + "path": "test/test_ops_gradients.py" + }, + { + "path": "test/test_ops_jit.py" + }, + { + "path": "test/test_optim.py" + }, + { + "path": "test/test_overrides.py" + }, + { + "path": "test/test_profiler.py" + }, + { + "path": "test/test_public_bindings.py" + }, + { + "path": "test/test_pytree.py" + }, + { + "path": "test/test_reductions.py" + }, + { + "path": "test/test_sort_and_select.py" + }, + { + "path": "test/test_sparse.py" + }, + { + "path": "test/test_sparse_csr.py" + }, + { + "path": "test/test_spectral_ops.py" + }, + { + "path": "test/test_tensor_creation_ops.py" + }, + { + "path": "test/test_tensorboard.py" + }, + { + "path": "test/test_testing.py" + }, + { + "path": "test/test_torch.py" + }, + { + "path": "test/test_unary_ufuncs.py" + }, + { + "path": "third_party/BUCK.github" + }, + { + "path": "third_party/fbgemm" + }, + { + "path": "tools/autograd/derivatives.yaml" + }, + { + "path": "tools/autograd/gen_inplace_or_view_type.py" + }, + { + "path": "tools/autograd/load_derivatives.py" + }, + { + "path": "tools/build_variables.bzl" + }, + { + "path": "tools/codegen/api/autograd.py" + }, + { + "path": "tools/codegen/api/cpp.py" + }, + { + "path": "tools/codegen/api/dispatcher.py" + }, + { + "path": "tools/codegen/api/functionalization.py" + }, + { + "path": "tools/codegen/api/lazy.py" + }, + { + "path": "tools/codegen/api/meta.py" + }, + { + "path": "tools/codegen/api/native.py" + }, + { + "path": "tools/codegen/api/python.py" + }, + { + "path": "tools/codegen/api/structured.py" + }, + { + "path": "tools/codegen/api/translate.py" + }, + { + "path": "tools/codegen/api/types.py" + }, + { + "path": "tools/codegen/api/ufunc.py" + }, + { + "path": "tools/codegen/api/unboxing.py" + }, + { + "path": "tools/codegen/code_template.py" + }, + { + "path": "tools/codegen/context.py" + }, + { + "path": "tools/codegen/decompositions/gen_jit_decompositions.py" + }, + { + "path": "tools/codegen/dest/__init__.py" + }, + { + "path": "tools/codegen/dest/lazy_ir.py" + }, + { + "path": "tools/codegen/dest/lazy_ts_lowering.py" + }, + { + "path": "tools/codegen/dest/native_functions.py" + }, + { + "path": 
"tools/codegen/dest/register_dispatch_key.py" + }, + { + "path": "tools/codegen/dest/ufunc.py" + }, + { + "path": "tools/codegen/gen.py" + }, + { + "path": "tools/codegen/gen_backend_stubs.py" + }, + { + "path": "tools/codegen/gen_functionalization_type.py" + }, + { + "path": "tools/codegen/gen_lazy_tensor.py" + }, + { + "path": "tools/codegen/local.py" + }, + { + "path": "tools/codegen/model.py" + }, + { + "path": "tools/codegen/operator_versions/gen_mobile_upgraders.py" + } + ], + "pageInfo": { + "endCursor": "MjAw", + "hasNextPage": true + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MjAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "tools/codegen/selective_build/operator.py" + }, + { + "path": "tools/codegen/selective_build/selector.py" + }, + { + "path": "tools/codegen/shape_functions/gen_jit_shape_functions.py" + }, + { + "path": "tools/codegen/static_runtime/config.py" + }, + { + "path": "tools/codegen/static_runtime/gen_static_runtime_ops.py" + }, + { + "path": "tools/codegen/static_runtime/gen_structured.py" + }, + { + "path": "tools/codegen/utils.py" + }, + { + "path": "tools/linter/adapters/circleci_linter.py" + }, + { + "path": "tools/linter/adapters/clangformat_linter.py" + }, + { + "path": "tools/linter/adapters/grep_linter.py" + }, + { + "path": "tools/linter/adapters/nativefunctions_linter.py" + }, + { + "path": "tools/setup_helpers/BUILD.bazel" + }, + { + "path": "tools/setup_helpers/generate_code.py" + }, + { + "path": "torch/_C/__init__.pyi.in" + }, + { + "path": "torch/amp/autocast_mode.py" + }, + { + "path": "torch/ao/ns/fx/pattern_utils.py" + }, + { + "path": "torch/ao/quantization/backend_config/README.md" + }, + { + "path": "torch/ao/quantization/backend_config/__init__.py" + }, + { + "path": "torch/ao/quantization/backend_config/native.py" + }, + { + "path": "torch/ao/quantization/backend_config/observation_type.py" + }, + { + "path": "torch/ao/quantization/backend_config/tensorrt.py" + }, + { + "path": "torch/ao/quantization/backend_config/utils.py" + }, + { + "path": "torch/ao/quantization/fx/__init__.py" + }, + { + "path": "torch/ao/quantization/fx/backend_config/fuse_handler.py" + }, + { + "path": "torch/ao/quantization/fx/backend_config/quantize_handler.py" + }, + { + "path": "torch/ao/quantization/fx/backend_config_utils.py" + }, + { + "path": "torch/ao/quantization/fx/convert.py" + }, + { + "path": "torch/ao/quantization/fx/fuse.py" + }, + { + "path": "torch/ao/quantization/fx/fusion_patterns.py" + }, + { + "path": "torch/ao/quantization/fx/match_utils.py" + }, + { + "path": "torch/ao/quantization/fx/pattern_utils.py" + }, + { + "path": "torch/ao/quantization/fx/prepare.py" + }, + { + "path": "torch/ao/quantization/fx/quantization_patterns.py" + }, + { + "path": "torch/ao/quantization/qconfig.py" + }, + { + "path": "torch/ao/quantization/quantization_types.py" + }, + { + "path": "torch/ao/quantization/quantize_fx.py" + }, + { + "path": "torch/autograd/__init__.py" + }, + { + "path": "torch/csrc/Module.cpp" + }, + { + "path": "torch/csrc/autograd/FunctionsManual.cpp" + }, + { + "path": "torch/csrc/autograd/FunctionsManual.h" + }, + { + "path": "torch/csrc/autograd/engine.cpp" + }, + { + "path": "torch/csrc/autograd/function.h" + }, + { + "path": "torch/csrc/autograd/functions/accumulate_grad.h" + }, + { + "path": "torch/csrc/autograd/init.cpp" + }, + { + "path": 
"torch/csrc/autograd/python_torch_functions_manual.cpp" + }, + { + "path": "torch/csrc/autograd/python_variable.cpp" + }, + { + "path": "torch/csrc/autograd/record_function_ops.h" + }, + { + "path": "torch/csrc/autograd/utils/grad_layout_contract.h" + }, + { + "path": "torch/csrc/deploy/CMakeLists.txt" + }, + { + "path": "torch/csrc/distributed/c10d/logger.cpp" + }, + { + "path": "torch/csrc/jit/codegen/cuda/graph_fuser.cpp" + }, + { + "path": "torch/csrc/jit/codegen/cuda/parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/function_schema_parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/lexer.h" + }, + { + "path": "torch/csrc/jit/frontend/parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/parser.h" + }, + { + "path": "torch/csrc/jit/frontend/script_type_parser.cpp" + }, + { + "path": "torch/csrc/jit/frontend/source_range.cpp" + }, + { + "path": "torch/csrc/jit/frontend/source_range.h" + }, + { + "path": "torch/csrc/jit/frontend/source_ref.h" + }, + { + "path": "torch/csrc/jit/frontend/tracer.cpp" + }, + { + "path": "torch/csrc/jit/frontend/tracer.h" + }, + { + "path": "torch/csrc/jit/mobile/debug_info.cpp" + }, + { + "path": "torch/csrc/jit/mobile/debug_info.h" + }, + { + "path": "torch/csrc/jit/mobile/flatbuffer_loader.cpp" + }, + { + "path": "torch/csrc/jit/mobile/module.h" + }, + { + "path": "torch/csrc/jit/passes/common_expression_hoisting.cpp" + }, + { + "path": "torch/csrc/jit/passes/common_expression_hoisting.h" + }, + { + "path": "torch/csrc/jit/passes/frozen_graph_optimizations.cpp" + }, + { + "path": "torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp" + }, + { + "path": "torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp" + }, + { + "path": "torch/csrc/jit/python/init.cpp" + }, + { + "path": "torch/csrc/jit/python/python_tree_views.cpp" + }, + { + "path": "torch/csrc/jit/python/script_init.cpp" + }, + { + "path": "torch/csrc/jit/runtime/graph_executor.cpp" + }, + { + "path": "torch/csrc/jit/runtime/interpreter.cpp" + }, + { + "path": "torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp" + }, + { + "path": "torch/csrc/jit/runtime/script_profile.cpp" + }, + { + "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp" + }, + { + "path": "torch/csrc/jit/runtime/serialized_shape_function_registry.h" + }, + { + "path": "torch/csrc/jit/runtime/shape_function_registry.h" + }, + { + "path": "torch/csrc/jit/runtime/shape_functions.h" + }, + { + "path": "torch/csrc/jit/runtime/shape_functions_1.h" + }, + { + "path": "torch/csrc/jit/runtime/static/impl.cpp" + }, + { + "path": "torch/csrc/jit/runtime/static/passes.cpp" + }, + { + "path": "torch/csrc/jit/runtime/symbolic_shape_registry.cpp" + }, + { + "path": "torch/csrc/jit/runtime/symbolic_shape_registry.h" + }, + { + "path": "torch/csrc/jit/serialization/export_module.cpp" + }, + { + "path": "torch/csrc/jit/serialization/flatbuffer_serializer.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import_export_helpers.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import_export_helpers.h" + }, + { + "path": "torch/csrc/jit/serialization/import_source.cpp" + }, + { + "path": "torch/csrc/jit/serialization/import_source.h" + }, + { + "path": "torch/csrc/jit/serialization/source_range_serialization.cpp" + }, + { + "path": "torch/csrc/jit/serialization/source_range_serialization.h" + }, + { + "path": "torch/csrc/jit/testing/file_check.cpp" + }, + { + "path": "torch/csrc/lazy/core/dynamic_ir.cpp" + }, + { + "path": 
"torch/csrc/lazy/core/dynamic_ir.h" + }, + { + "path": "torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp" + } + ], + "pageInfo": { + "endCursor": "MzAw", + "hasNextPage": true + } + } + } + } + } + }, + "query_sha=0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98 cursor=MzAw name=pytorch number=76118 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "files": { + "nodes": [ + { + "path": "torch/csrc/lazy/ts_backend/ts_native_functions.cpp" + }, + { + "path": "torch/csrc/utils/python_arg_parser.cpp" + }, + { + "path": "torch/csrc/utils/python_arg_parser.h" + }, + { + "path": "torch/csrc/utils/tensor_list.cpp" + }, + { + "path": "torch/csrc/utils/tensor_new.cpp" + }, + { + "path": "torch/csrc/utils/tensor_new.h" + }, + { + "path": "torch/distributed/_shard/__init__.py" + }, + { + "path": "torch/distributed/_shard/api.py" + }, + { + "path": "torch/distributed/_shard/replicated_tensor.py" + }, + { + "path": "torch/distributed/_shard/sharded_tensor/__init__.py" + }, + { + "path": "torch/distributed/_shard/sharded_tensor/api.py" + }, + { + "path": "torch/distributed/_shard/sharded_tensor/utils.py" + }, + { + "path": "torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py" + }, + { + "path": "torch/distributed/algorithms/model_averaging/utils.py" + }, + { + "path": "torch/distributed/fsdp/_optim_utils.py" + }, + { + "path": "torch/distributed/fsdp/fully_sharded_data_parallel.py" + }, + { + "path": "torch/distributed/nn/__init__.py" + }, + { + "path": "torch/distributed/nn/functional.py" + }, + { + "path": "torch/distributed/optim/functional_adagrad.py" + }, + { + "path": "torch/fx/experimental/meta_tracer.py" + }, + { + "path": "torch/fx/graph.py" + }, + { + "path": "torch/jit/_shape_functions.py" + }, + { + "path": "torch/nn/parallel/_replicated_tensor_ddp_interop.py" + }, + { + "path": "torch/nn/parallel/_replicated_tensor_ddp_utils.py" + }, + { + "path": "torch/nn/parallel/distributed.py" + }, + { + "path": "torch/nn/utils/_expanded_weights/__init__.py" + }, + { + "path": "torch/nn/utils/_expanded_weights/instance_norm_expanded_weights.py" + }, + { + "path": "torch/onnx/symbolic_opset11.py" + }, + { + "path": "torch/onnx/symbolic_opset12.py" + }, + { + "path": "torch/onnx/symbolic_opset9.py" + }, + { + "path": "torch/optim/adagrad.py" + }, + { + "path": "torch/optim/lr_scheduler.py" + }, + { + "path": "torch/overrides.py" + }, + { + "path": "torch/quantization/fx/pattern_utils.py" + }, + { + "path": "torch/quantization/fx/quantization_patterns.py" + }, + { + "path": "torch/quantization/fx/quantization_types.py" + }, + { + "path": "torch/return_types.py" + }, + { + "path": "torch/testing/_internal/common_device_type.py" + }, + { + "path": "torch/testing/_internal/common_distributed.py" + }, + { + "path": "torch/testing/_internal/common_fx2trt.py" + }, + { + "path": "torch/testing/_internal/common_methods_invocations.py" + }, + { + "path": "torch/testing/_internal/common_utils.py" + }, + { + "path": "torch/testing/_internal/composite_compliance.py" + }, + { + "path": "torch/testing/_internal/distributed/distributed_test.py" + }, + { + "path": "torch/testing/_internal/jit_metaprogramming_utils.py" + }, + { + "path": "torch/utils/cpp_extension.py" + }, + { + "path": "torch/utils/data/datapipes/_typing.py" + }, + { + "path": "torch/utils/model_dump/__init__.py" + } + ], + "pageInfo": { + "endCursor": "MzQ4", + "hasNextPage": false + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None 
name=pytorch-dev-infra org=pytorch": { + "data": { + "organization": { + "team": { + "members": { + "nodes": [ + { + "login": "kit1980" + }, + { + "login": "b0noI" + }, + { + "login": "seemethere" + }, + { + "login": "malfet" + }, + { + "login": "tenpercent" + }, + { + "login": "atalman" + }, + { + "login": "osalpekar" + }, + { + "login": "janeyx99" + }, + { + "login": "clee2000" + } + ], + "pageInfo": { + "hasNextPage": false, + "endCursor": "Y3Vyc29yOnYyOpHOAqnOlw==" + } + } + } + } + } + }, + "query_sha=a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5 cursor=None name=qwertyuiop org=pytorch": { + "data": { + "organization": { + "team": null + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcQU= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276102" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276103" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276104" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815361?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545915218?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545915270?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545915344?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP89A=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276105" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276106" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815353?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObTk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276107" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276110" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276111" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815317?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546189850?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546189908?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546189954?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUJII=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276112" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-xla-linux-bionic-py3.7-clang8" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276114" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcRI=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + 
"query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcRI= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276115" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276117" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815309?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545918134?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545918256?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545918319?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqP_28=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276119" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276122" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815351?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545931419?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545931552?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQMyA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276123" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815311?check_suite_focus=true" + }, + { + "name": "test (default, 3, 3, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545947543?check_suite_focus=true" + }, + { + "name": "test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545947625?check_suite_focus=true" + }, + { + "name": "test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545947792?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQcpA=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276124" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Lint" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "cmakelint", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815342?check_suite_focus=true" + }, + { + "name": "clang-format", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815564?check_suite_focus=true" + }, + { + "name": "clang-tidy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815688?check_suite_focus=true" + }, + { + "name": "flake8-py3", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815821?check_suite_focus=true" + }, + { + "name": "quick-checks", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816003?check_suite_focus=true" + }, + { + "name": "mypy", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816076?check_suite_focus=true" + }, + { + "name": "py2-setup-validate-errormsg", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816154?check_suite_focus=true" + }, + { + "name": "shellcheck", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816266?check_suite_focus=true" + }, + { + "name": "toc", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816398?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcU4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276126" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815207?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKc=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": 
"https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276127" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276129" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276130" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcSI=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcSI= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-xla-linux-bionic-py3.7-clang8" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815348?check_suite_focus=true" + }, + { + "name": "test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545954339?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQjCM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276131" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cuda11.3-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815322?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546226404?check_suite_focus=true" + }, + { + "name": "test (force_on_cpu, 1, 1, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546226489?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546226540?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqUs2w=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276132" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815307?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276133" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815362?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObUI=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276134" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815337?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObSk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276135" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815561?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545929390?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQKq4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276136" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815356?check_suite_focus=true" + }, + { + "name": "build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545920544?check_suite_focus=true" + }, + { + "name": "build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545920612?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQCGQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276137" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": 
"linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815326?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.rocm.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545983951?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.rocm.gpu)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545984049?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqRADE=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276140" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-build" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815205?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObKU=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276141" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815314?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546093287?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546093438?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSq34=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276143" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcS8=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCcS8= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815359?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545923802?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545923899?check_suite_focus=true" + }, + { + "name": "test 
(backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924024?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924110?check_suite_focus=true" + }, + { + "name": "test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924249?check_suite_focus=true" + }, + { + "name": "test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545924341?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqQFvU=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276145" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276149" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-rocm4.5-py3.7" + } + }, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": "CANCELLED", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276152" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Test tools" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815310?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObQ4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276157" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545815320?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRg=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276159" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "macos-10-15-py3-arm64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816079?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA8=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276857" + }, + 
{ + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64-coreml" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816078?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA4=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276860" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816071?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAc=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276861" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "macos-11-py3-x86-64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816073?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, macos-11)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546066712?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, macos-11)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5546066787?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqSQ2M=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276862" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64-custom-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816081?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBE=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276864" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdAA=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAVFCdAA= name=pytorch number=73811 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-x86-64-coreml" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816077?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcA0=", + "hasNextPage": 
false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276867" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-arm64-metal" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816080?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcBA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276869" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "macos-10-15-py3-lite-interpreter-x86-64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816075?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276873" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "ios-12-5-1-x86-64" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5545816068?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOcAQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276881" + }, + { + "app": { + "name": "Netlify", + "databaseId": 13473 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277331" + }, + { + "app": { + "name": "Azure Pipelines", + "databaseId": 9426 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277340" + }, + { + "app": { + "name": "Dependabot", + "databaseId": 29110 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277346" + }, + { + "app": { + "name": "Codecov", + "databaseId": 254 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277350" + }, + { + "app": { + "name": "PyTorch Bot", + "databaseId": 40112 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [], + "pageInfo": { + "endCursor": null, + "hasNextPage": false + } + }, + "conclusion": null, + "url": 
"https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658277355" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVFCdes=", + "hasNextPage": false + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-RA= name=pytorch number=73969 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "4746da707a9912356f5179625da89616b228dc21", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928591?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2c8=", + "hasNextPage": false + } + }, + "conclusion": "SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592977" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "Test tools" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928555?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2as=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592978" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928570?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483302702?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483302867?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483303104?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbUkMA=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592980" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928607?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d8=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592981" + }, + { + "app": { + "name": 
"GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928611?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483400398?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483400575?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbWDX8=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592982" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928548?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aQ=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592983" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-asan" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928603?check_suite_focus=true" + }, + { + "name": "test (default, 3, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483138456?check_suite_focus=true" + }, + { + "name": "test (default, 1, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483138698?check_suite_focus=true" + }, + { + "name": "test (default, 2, 3, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483139049?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSD-k=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592985" + }, + { + "app": { + "name": "Facebook GitHub Tools", + "databaseId": 12274 + }, + "workflowRun": null, + "checkRuns": { + "nodes": [ + { + "name": "Facebook CLA Check", + "conclusion": "SUCCESS", + "detailsUrl": "https://code.intern.facebook.com/cla/" + }, + { + "name": "Meta Internal-Only Changes Check", + "conclusion": "NEUTRAL", + "detailsUrl": "https://opensource.facebook.com/" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO574=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592986" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-xla-linux-bionic-py3.7-clang8" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + 
"conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928559?check_suite_focus=true" + }, + { + "name": "test (xla, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483141123?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbSGAM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592987" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc5.4" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928593?check_suite_focus=true" + }, + { + "name": "test (backwards_compat, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483106295?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483106609?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483106835?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483107050?check_suite_focus=true" + }, + { + "name": "test (docs_test, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483107208?check_suite_focus=true" + }, + { + "name": "test (jit_legacy, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483107483?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRlJs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592997" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-SU=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAU2F-SU= name=pytorch number=73969 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "4746da707a9912356f5179625da89616b228dc21", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928550?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483083368?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483083553?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483083767?check_suite_focus=true" + } + ], + 
"pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRN_c=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595593001" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-clang7-onnx" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928572?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483120691?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5483120938?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRySo=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595593014" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5482928605?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2d0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595593026" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-UI=", + "hasNextPage": false + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAUK_Uc0= name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-cuda11.3-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020053?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302536958?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302537118?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302537373?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwOTJ0=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801870" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + 
"name": "Test tools" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020045?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ80=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801872" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020051?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302145103?check_suite_focus=true" + }, + { + "name": "test (noarch, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302145224?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302145353?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIUUk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801874" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-docs" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020056?check_suite_focus=true" + }, + { + "name": "build-docs (cpp)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302148279?check_suite_focus=true" + }, + { + "name": "build-docs (python)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302148361?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIXQk=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801876" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3-clang5-mobile-custom-build-static" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020057?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ9k=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801877" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "TorchBench CI (pytorch-linux-py3.7-cu102)" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "run-torchbench", + "conclusion": "NEUTRAL", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019919?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ08=", + "hasNextPage": false + } + }, + "conclusion": 
"SKIPPED", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801878" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020088?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302151055?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302151166?check_suite_focus=true" + }, + { + "name": "test (distributed, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302151251?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIaFM=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801880" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build-and-test", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020054?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ9Y=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801882" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "win-vs2019-cpu-py3" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302019942?check_suite_focus=true" + }, + { + "name": "test (default, 1, 2, windows.4xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303136931?check_suite_focus=true" + }, + { + "name": "test (default, 2, 2, windows.4xlarge)", + "conclusion": "FAILURE", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5303137019?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwXcvs=", + "hasNextPage": false + } + }, + "conclusion": "FAILURE", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801885" + }, + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-vulkan-bionic-py3.7-clang9" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020058?check_suite_focus=true" + }, + { + "name": "test (default, 1, 1, linux.2xlarge)", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302161211?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIjzs=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": 
"https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801895" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uec=", + "hasNextPage": true + } + } + } + } + ] + } + } + } + } + }, + "query_sha=5ffb180a8ade981d13f6f672ed610279046db440b571254f884c52419c54dd79 cursor=Y3Vyc29yOnYyOpHPAAAAAUK_Uec= name=pytorch number=71759 owner=pytorch": { + "data": { + "repository": { + "pullRequest": { + "commits": { + "nodes": [ + { + "commit": { + "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22", + "checkSuites": { + "nodes": [ + { + "app": { + "name": "GitHub Actions", + "databaseId": 15368 + }, + "workflowRun": { + "workflow": { + "name": "linux-xenial-py3.7-gcc7-no-ops" + } + }, + "checkRuns": { + "nodes": [ + { + "name": "build", + "conclusion": "SUCCESS", + "detailsUrl": "https://github.com/pytorch/pytorch/runs/5302020052?check_suite_focus=true" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ9Q=", + "hasNextPage": false + } + }, + "conclusion": "SUCCESS", + "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801896" + } + ], + "pageInfo": { + "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ueg=", + "hasNextPage": false + } + } + } + } + ] + } + } + } + } + } +} diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh index 0db7de71f4fc..b854320c9eaa 100755 --- a/.github/scripts/install_nvidia_utils_linux.sh +++ b/.github/scripts/install_nvidia_utils_linux.sh @@ -3,7 +3,7 @@ set -eou pipefail DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \ -DRIVER_FN="NVIDIA-Linux-x86_64-495.44.run" +DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run" YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo" install_nvidia_docker2_amzn2() { diff --git a/.github/scripts/lint_native_functions.py b/.github/scripts/lint_native_functions.py index 2e6d4e3e7675..70c43605c54d 100755 --- a/.github/scripts/lint_native_functions.py +++ b/.github/scripts/lint_native_functions.py @@ -27,9 +27,9 @@ def fn(base: str) -> str: contents = f.read() yaml = ruamel.yaml.YAML() # type: ignore[attr-defined] -yaml.preserve_quotes = True -yaml.width = 1000 -yaml.boolean_representation = ['False', 'True'] +yaml.preserve_quotes = True # type: ignore[assignment] +yaml.width = 1000 # type: ignore[assignment] +yaml.boolean_representation = ['False', 'True'] # type: ignore[attr-defined] r = yaml.load(contents) # Cuz ruamel's author intentionally didn't include conversion to string diff --git a/.github/scripts/lint_test_ownership.py b/.github/scripts/lint_test_ownership.py deleted file mode 100755 index 270019c0f563..000000000000 --- a/.github/scripts/lint_test_ownership.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 -''' -Test ownership was introduced in https://github.com/pytorch/pytorch/issues/66232. - -This lint verifies that every Python test file (file that matches test_*.py or *_test.py in the test folder) -has valid ownership information in a comment header. Valid means: - - The format of the header follows the pattern "# Owner(s): ["list", "of owner", "labels"] - - Each owner label actually exists in PyTorch - - Each owner label starts with "module: " or "oncall: " or is in ACCEPTABLE_OWNER_LABELS - -This file is expected to run in the root directory of pytorch/pytorch. 
-''' -import boto3 # type: ignore[import] -import botocore # type: ignore[import] -import fnmatch -import json -import sys -from pathlib import Path -from typing import List, Any - - -# Team/owner labels usually start with "module: " or "oncall: ", but the following are acceptable exceptions -ACCEPTABLE_OWNER_LABELS = ["NNC", "high priority"] -GLOB_EXCEPTIONS = [ - "**/test/run_test.py" -] - -PYTORCH_ROOT = Path(__file__).resolve().parent.parent.parent -TEST_DIR = PYTORCH_ROOT / "test" -CURRENT_FILE_NAME = Path(__file__).resolve().relative_to(PYTORCH_ROOT) - -S3_RESOURCE_READ_ONLY = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)) - - -def get_all_test_files() -> List[Path]: - test_files = list(TEST_DIR.glob("**/test_*.py")) - test_files.extend(list(TEST_DIR.glob("**/*_test.py"))) - return [f for f in test_files if not any([fnmatch.fnmatch(str(f), g) for g in GLOB_EXCEPTIONS])] - - -def get_pytorch_labels() -> Any: - bucket = S3_RESOURCE_READ_ONLY.Bucket("ossci-metrics") - summaries = bucket.objects.filter(Prefix="pytorch_labels.json") - for summary in summaries: - labels = summary.get()["Body"].read() - return json.loads(labels) - - -# Returns a string denoting the error invalidating the label OR an empty string if nothing is wrong -def validate_label(label: str, pytorch_labels: List[str]) -> str: - if label not in pytorch_labels: - return f"{label} is not a PyTorch label (please choose from https://github.com/pytorch/pytorch/labels)" - if label.startswith("module:") or label.startswith("oncall:") or label in ACCEPTABLE_OWNER_LABELS: - return "" - return f"{label} is not an acceptable owner (please update to another label or edit ACCEPTABLE_OWNERS_LABELS " \ - "in {CURRENT_FILE_NAME}" - - -# Returns a string denoting the error invalidating the file OR an empty string if nothing is wrong -def validate_file(filename: Path, pytorch_labels: List[str]) -> str: - prefix = "# Owner(s): " - relative_name = Path(filename).relative_to(PYTORCH_ROOT) - with open(filename) as f: - for line in f.readlines(): - if line.startswith(prefix): - labels = json.loads(line[len(prefix):]) - labels_msgs = [validate_label(label, pytorch_labels) for label in labels] - file_msg = ", ".join([x for x in labels_msgs if x != ""]) - return f"{relative_name}: {file_msg}" if file_msg != "" else "" - return f"{relative_name}: missing a comment header with ownership information." - - -def main() -> None: - test_file_paths = get_all_test_files() - pytorch_labels = get_pytorch_labels() - - file_msgs = [validate_file(f, pytorch_labels) for f in test_file_paths] - err_msg = "\n".join([x for x in file_msgs if x != ""]) - if err_msg != "": - err_msg = err_msg + "\n\nIf you see files with missing ownership information above, " \ - "please add the following line\n\n# Owner(s): [\"\"]\n\nto the top of each test file. " \ - "The owner should be an existing pytorch/pytorch label." 
- print(err_msg) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/.github/scripts/process_commit.py b/.github/scripts/process_commit.py index a7bc4709d6b8..1bfca3237984 100644 --- a/.github/scripts/process_commit.py +++ b/.github/scripts/process_commit.py @@ -68,7 +68,7 @@ def get_repo_labels() -> List[str]: page_labels = list(map(lambda x: str(x["name"]), response)) if not page_labels: break - collected_labels += page_labels + collected_labels += page_labels return collected_labels def post_pytorch_comment(pr_number: int, merger: str) -> Any: diff --git a/.github/scripts/syncbranches.py b/.github/scripts/syncbranches.py index 163c4b3759b8..8437e1fa9c18 100755 --- a/.github/scripts/syncbranches.py +++ b/.github/scripts/syncbranches.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -from gitutils import get_git_repo_dir, GitRepo +from gitutils import get_git_repo_dir, get_git_remote_name, GitRepo from typing import Any @@ -16,7 +16,7 @@ def parse_args() -> Any: def main() -> None: args = parse_args() - repo = GitRepo(get_git_repo_dir(), debug=args.debug) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=args.debug) repo.cherry_pick_commits(args.sync_branch, args.default_branch) repo.push(args.default_branch, args.dry_run) diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py new file mode 100755 index 000000000000..a668431e3b3a --- /dev/null +++ b/.github/scripts/test_trymerge.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +# Tests implemented in this file are relying on GitHub GraphQL APIs +# In order to avoid test flakiness, results of the queries +# are cached in gql_mocks.json +# PyTorch Lint workflow does not have GITHUB_TOKEN defined to avoid +# flakiness, so if you are making changes to merge_rules or +# GraphQL queries in trymerge.py, please make sure to delete `gql_mocks.json` +# And re-run the test locally with ones PAT + +import json +import os +from hashlib import sha256 + +from trymerge import find_matching_merge_rule, gh_graphql, gh_get_team_members, GitHubPR, MergeRule, MandatoryChecksMissingError +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from typing import cast, Any, List, Optional +from unittest import TestCase, main, mock +from urllib.error import HTTPError + +def mocked_gh_graphql(query: str, **kwargs: Any) -> Any: + gql_db_fname = os.path.join(os.path.dirname(__file__), "gql_mocks.json") + + def get_mocked_queries() -> Any: + if not os.path.exists(gql_db_fname): + return {} + with open(gql_db_fname, encoding="utf-8") as f: + return json.load(f) + + def save_mocked_queries(obj: Any) -> None: + with open(gql_db_fname, encoding="utf-8", mode="w") as f: + json.dump(obj, f, indent=2) + f.write("\n") + + key = f"query_sha={sha256(query.encode('utf-8')).hexdigest()} " + " ".join([f"{k}={kwargs[k]}" for k in sorted(kwargs.keys())]) + mocked_queries = get_mocked_queries() + + if key in mocked_queries: + return mocked_queries[key] + + try: + rc = gh_graphql(query, **kwargs) + except HTTPError as err: + if err.code == 401: + err_msg = "If you are seeing this message during workflow run, please make sure to update gql_mocks.json" + err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with " + err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN environment variable" + if os.getenv("GITHUB_TOKEN") is None: + err_msg = "Failed to update cached GraphQL queries as GITHUB_TOKEN is not defined." 
+ err_msg + raise RuntimeError(err_msg) from err + mocked_queries[key] = rc + + save_mocked_queries(mocked_queries) + + return rc + + +def mocked_read_merge_rules(repo: Optional[GitRepo], org: str, project: str) -> List[MergeRule]: + mock_merge_rules = """ + [ + { + "name": "mock with nonexistent check", + "patterns": ["*"], + "approved_by": [], + "mandatory_checks_name": [ + "Facebook CLA Check", + "Lint", + "nonexistent" + ] + } + ] + """ + rc = json.loads(mock_merge_rules, object_hook=lambda x: MergeRule(**x)) + return cast(List[MergeRule], rc) + + +class TestGitHubPR(TestCase): + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_match_rules(self, mocked_gql: Any) -> None: + "Tests that PR passes merge rules" + pr = GitHubPR("pytorch", "pytorch", 71759) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + self.assertTrue(find_matching_merge_rule(pr, repo) is not None) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_lint_fails(self, mocked_gql: Any) -> None: + "Tests that PR fails mandatory lint check" + pr = GitHubPR("pytorch", "pytorch", 74649) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + self.assertRaises(RuntimeError, lambda: find_matching_merge_rule(pr, repo)) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_last_comment(self, mocked_gql: Any) -> None: + "Tests that last comment can be fetched" + pr = GitHubPR("pytorch", "pytorch", 71759) + comment = pr.get_last_comment() + self.assertEqual(comment.author_login, "github-actions") + self.assertIsNone(comment.editor_login) + self.assertTrue("You've committed this PR" in comment.body_text) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_author_null(self, mocked_gql: Any) -> None: + """ Tests that PR author can be computed + If reply contains NULL + """ + pr = GitHubPR("pytorch", "pytorch", 71759) + author = pr.get_author() + self.assertTrue(author is not None) + self.assertTrue("@" in author) + self.assertTrue(pr.get_diff_revision() is None) + + # PR with multiple contributors, but creator id is not among authors + pr = GitHubPR("pytorch", "pytorch", 75095) + self.assertEqual(pr.get_pr_creator_login(), "mruberry") + author = pr.get_author() + self.assertTrue(author is not None) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_large_diff(self, mocked_gql: Any) -> None: + "Tests that PR with 100+ files can be fetched" + pr = GitHubPR("pytorch", "pytorch", 73099) + self.assertTrue(pr.get_changed_files_count() > 100) + flist = pr.get_changed_files() + self.assertEqual(len(flist), pr.get_changed_files_count()) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_internal_changes(self, mocked_gql: Any) -> None: + "Tests that PR with internal changes is detected" + pr = GitHubPR("pytorch", "pytorch", 73969) + self.assertTrue(pr.has_internal_changes()) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_checksuites_pagination(self, mocked_gql: Any) -> None: + "Tests that PR with lots of checksuits can be fetched" + pr = GitHubPR("pytorch", "pytorch", 73811) + self.assertGreater(len(pr.get_checkrun_conclusions()), 0) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_comments_pagination(self, mocked_gql: Any) -> None: + "Tests that PR with 50+ comments can be fetched" + pr = GitHubPR("pytorch", "pytorch", 31093) + self.assertGreater(len(pr.get_comments()), 50) + + 
@mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_gql_complexity(self, mocked_gql: Any) -> None: + "Fetch comments and conclusions for PR with 60 commits" + # Previous version of GrapQL query used to cause HTTP/502 error + # see https://gist.github.com/malfet/9b93bc7eeddeaf1d84546efc4f0c577f + pr = GitHubPR("pytorch", "pytorch", 68111) + self.assertGreater(len(pr.get_comments()), 20) + self.assertGreater(len(pr.get_checkrun_conclusions()), 3) + self.assertGreater(pr.get_commit_count(), 60) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_team_members(self, mocked_gql: Any) -> None: + "Test fetching team members works" + dev_infra_team = gh_get_team_members("pytorch", "pytorch-dev-infra") + self.assertGreater(len(dev_infra_team), 2) + with self.assertWarns(Warning): + non_existing_team = gh_get_team_members("pytorch", "qwertyuiop") + self.assertEqual(len(non_existing_team), 0) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_author_many_commits(self, mocked_gql: Any) -> None: + """ Tests that authors for all commits can be fetched + """ + pr = GitHubPR("pytorch", "pytorch", 76118) + authors = pr.get_authors() + self.assertGreater(pr.get_commit_count(), 100) + self.assertGreater(len(authors), 50) + self.assertTrue("@" in pr.get_author()) + + @mock.patch('trymerge.read_merge_rules', side_effect=mocked_read_merge_rules) + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_pending_status_check(self, mocked_gql: Any, mocked_read_merge_rules: Any) -> None: + """ Tests that PR with nonexistent/pending status checks fails with the right reason. + """ + pr = GitHubPR("pytorch", "pytorch", 76118) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + self.assertRaisesRegex(MandatoryChecksMissingError, + ".*are pending/not yet run.*", + lambda: find_matching_merge_rule(pr, repo)) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + def test_get_author_many_reviews(self, mocked_gql: Any) -> None: + """ Tests that all reviews can be fetched + """ + pr = GitHubPR("pytorch", "pytorch", 76123) + approved_by = pr.get_approved_by() + self.assertGreater(len(approved_by), 0) + assert pr._reviews is not None # to pacify mypy + self.assertGreater(len(pr._reviews), 100) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/test_tryrebase.py b/.github/scripts/test_tryrebase.py new file mode 100644 index 000000000000..399f03933633 --- /dev/null +++ b/.github/scripts/test_tryrebase.py @@ -0,0 +1,42 @@ +from unittest import TestCase, mock, main +from test_trymerge import mocked_gh_graphql +from trymerge import GitHubPR +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from typing import Any +from tryrebase import rebase_onto + + +class TestRebase(TestCase): + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + @mock.patch('gitutils.GitRepo._run_git') + @mock.patch('tryrebase.gh_post_comment') + def test_rebase(self, mocked_post_comment: Any, mocked_run_git: Any, mocked_gql: Any) -> None: + "Tests rebase successfully" + pr = GitHubPR("pytorch", "pytorch", 31093) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + rebase_onto(pr, repo) + calls = [mock.call('fetch', 'origin', 'pull/31093/head:pull/31093/head'), + mock.call('rebase', 'master', 'pull/31093/head'), + mock.call('push', '-f', 'https://github.com/mingxiaoh/pytorch.git', 'pull/31093/head:master')] + mocked_run_git.assert_has_calls(calls) + 
self.assertTrue("Successfully rebased `master` onto `master`" in mocked_post_comment.call_args[0][3]) + + @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql) + @mock.patch('gitutils.GitRepo._run_git', return_value="Everything up-to-date") + @mock.patch('tryrebase.gh_post_comment') + def test_no_need_to_rebase(self, mocked_post_comment: Any, mocked_run_git: Any, mocked_gql: Any) -> None: + "Tests branch already up to date" + pr = GitHubPR("pytorch", "pytorch", 31093) + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + rebase_onto(pr, repo) + calls = [mock.call('fetch', 'origin', 'pull/31093/head:pull/31093/head'), + mock.call('rebase', 'master', 'pull/31093/head'), + mock.call('push', '-f', 'https://github.com/mingxiaoh/pytorch.git', 'pull/31093/head:master')] + mocked_run_git.assert_has_calls(calls) + self.assertTrue( + "Tried to rebase and push PR #31093, but it was already up to date" in mocked_post_comment.call_args[0][3]) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 927edd685a5e..7747fd0208bd 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -1,13 +1,17 @@ #!/usr/bin/env python3 +import base64 import json import os import re +import time from dataclasses import dataclass from urllib.request import urlopen, Request from urllib.error import HTTPError from typing import cast, Any, Callable, Dict, List, Optional, Tuple, Union from gitutils import get_git_remote_name, get_git_repo_dir, patterns_to_regex, GitRepo +from functools import lru_cache +from warnings import warn GH_GET_PR_INFO_QUERY = """ @@ -36,7 +40,7 @@ mergeCommit { oid } - commits(first: 100) { + commits_with_authors:commits(first: 100) { nodes { commit { author { @@ -47,34 +51,164 @@ name } oid - checkSuites(filterBy: {appId: 12274}, first: 1) { + } + } + pageInfo { + endCursor + hasNextPage + } + totalCount + } + commits(last: 1) { + nodes { + commit { + checkSuites(first: 10) { nodes { app { + name databaseId } + workflowRun { + workflow { + name + } + } + checkRuns(first: 50) { + nodes { + name + conclusion + detailsUrl + } + pageInfo { + endCursor + hasNextPage + } + } conclusion + url + } + pageInfo { + endCursor + hasNextPage } } + oid } } - totalCount } changedFiles - files(last: 100) { + files(first: 100) { nodes { path } + pageInfo { + endCursor + hasNextPage + } } - latestReviews(last: 100) { + reviews(last: 100) { nodes { author { login } state } - totalCount + pageInfo { + startCursor + hasPreviousPage + } + } + comments(last: 5) { + nodes { + bodyText + author { + login + } + authorAssociation + editor { + login + } + databaseId + } + pageInfo { + startCursor + hasPreviousPage + } + } + } + } +} +""" + +GH_GET_PR_NEXT_FILES_QUERY = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + files(first: 100, after: $cursor) { + nodes { + path + } + pageInfo { + endCursor + hasNextPage + } + } + } + } +} +""" + +GH_GET_PR_NEXT_CHECK_RUNS = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) 
{ + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + commits(last: 1) { + nodes { + commit { + oid + checkSuites(first: 10, after: $cursor) { + nodes { + app { + name + databaseId + } + workflowRun { + workflow { + name + } + } + checkRuns(first: 50) { + nodes { + name + conclusion + detailsUrl + } + pageInfo { + endCursor + hasNextPage + } + } + conclusion + url + } + pageInfo { + endCursor + hasNextPage + } + } + } + } } - comments(last: 1) { + } + } +} +""" + +GH_GET_PR_PREV_COMMENTS = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + comments(last: 100, before: $cursor) { nodes { bodyText author { @@ -84,6 +218,78 @@ editor { login } + databaseId + } + pageInfo { + startCursor + hasPreviousPage + } + } + } + } +} +""" + +# This query needs read-org permission +GH_GET_TEAM_MEMBERS_QUERY = """ +query($org: String!, $name: String!, $cursor: String) { + organization(login: $org) { + team(slug: $name) { + members(first: 100, after: $cursor) { + nodes { + login + } + pageInfo { + hasNextPage + endCursor + } + } + } + } +} +""" + +GH_GET_PR_NEXT_AUTHORS_QUERY = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + commits_with_authors: commits(first: 100, after: $cursor) { + nodes { + commit { + author { + user { + login + } + email + name + } + oid + } + } + pageInfo { + endCursor + hasNextPage + } + } + } + } +} +""" + +GH_GET_PR_PREV_REVIEWS_QUERY = """ +query ($owner: String!, $name: String!, $number: Int!, $cursor: String!) { + repository(name: $name, owner: $owner) { + pullRequest(number: $number) { + reviews(last: 100, before: $cursor) { + nodes { + author { + login + } + state + } + pageInfo { + startCursor + hasPreviousPage } } } @@ -99,6 +305,7 @@ re.MULTILINE ) RE_REVERT_CMD = re.compile(r"@pytorch(merge|)bot\s+revert\s+this") +RE_REVERT_CMD_CLI = re.compile(r"@pytorch(merge|)bot\s+revert\s+(-m.*-c.*|-c.*-m.*)") RE_DIFF_REV = re.compile(r'^Differential Revision:.+?(D[0-9]+)', re.MULTILINE) @@ -147,7 +354,7 @@ def gh_add_labels(org: str, project: str, pr_num: int, labels: Union[str, List[s def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]: rc = _fetch_url("https://api.github.com/graphql", data={"query": query, "variables": kwargs}, reader=json.load) if "errors" in rc: - raise RuntimeError(f"GraphQL query {query} failed: {rc['errors']}") + raise RuntimeError(f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}") return cast(Dict[str, Any], rc) @@ -156,14 +363,49 @@ def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any: return rc["data"]["repository"]["pullRequest"] +@lru_cache(maxsize=None) +def gh_get_team_members(org: str, name: str) -> List[str]: + rc: List[str] = [] + team_members: Dict[str, Any] = {"pageInfo": {"hasNextPage": "true", "endCursor": None}} + while bool(team_members["pageInfo"]["hasNextPage"]): + query = gh_graphql(GH_GET_TEAM_MEMBERS_QUERY, org=org, name=name, cursor=team_members["pageInfo"]["endCursor"]) + team = query["data"]["organization"]["team"] + if team is None: + warn(f"Requested non-existing team {org}/{name}") + return [] + team_members = team["members"] + rc += [member["login"] for member in team_members["nodes"]] + return rc + + def parse_args() -> Any: from argparse import ArgumentParser parser = ArgumentParser("Merge PR into default branch") parser.add_argument("--dry-run", 
action="store_true") + parser.add_argument("--on-green", action="store_true") parser.add_argument("--revert", action="store_true") + parser.add_argument("--force", action="store_true") + parser.add_argument("--comment-id", type=int) parser.add_argument("pr_num", type=int) return parser.parse_args() +def can_skip_internal_checks(pr: "GitHubPR", comment_id: Optional[int] = None) -> bool: + if comment_id is None: + return False + comment = pr.get_comment_by_id(comment_id) + if comment.editor_login is not None: + return False + return comment.author_login == "facebook-github-bot" + + +@dataclass +class GitHubComment: + body_text: str + author_login: str + author_association: str + editor_login: Optional[str] + database_id: int + class GitHubPR: def __init__(self, org: str, project: str, pr_num: int) -> None: @@ -172,6 +414,11 @@ def __init__(self, org: str, project: str, pr_num: int) -> None: self.project = project self.pr_num = pr_num self.info = gh_get_pr_info(org, project, pr_num) + self.changed_files: Optional[List[str]] = None + self.conclusions: Optional[Dict[str, Tuple[str, str]]] = None + self.comments: Optional[List[GitHubComment]] = None + self._authors: Optional[List[Tuple[str, str]]] = None + self._reviews: Optional[List[Tuple[str, str]]] = None def is_closed(self) -> bool: return bool(self.info["closed"]) @@ -198,39 +445,124 @@ def get_changed_files_count(self) -> int: return int(self.info["changedFiles"]) def get_changed_files(self) -> List[str]: - rc = [x["path"] for x in self.info["files"]["nodes"]] - if len(rc) != self.get_changed_files_count(): + if self.changed_files is None: + info = self.info + self.changed_files = [] + # Do not try to fetch more than 10K files + for _ in range(100): + self.changed_files += [x["path"] for x in info["files"]["nodes"]] + if not info["files"]["pageInfo"]["hasNextPage"]: + break + rc = gh_graphql(GH_GET_PR_NEXT_FILES_QUERY, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["files"]["pageInfo"]["endCursor"]) + info = rc["data"]["repository"]["pullRequest"] + + if len(self.changed_files) != self.get_changed_files_count(): raise RuntimeError("Changed file count mismatch") - return rc - - def _get_reviewers(self) -> List[Tuple[str, str]]: - reviews_count = int(self.info["latestReviews"]["totalCount"]) - if len(self.info["latestReviews"]["nodes"]) != reviews_count: - raise RuntimeError("Can't fetch all PR reviews") - return [(x["author"]["login"], x["state"]) for x in self.info["latestReviews"]["nodes"]] + return self.changed_files + + def _get_reviews(self) -> List[Tuple[str, str]]: + if self._reviews is None: + self._reviews = [] + info = self.info + for _ in range(100): + nodes = info["reviews"]["nodes"] + self._reviews = [(node["author"]["login"], node["state"]) for node in nodes] + self._reviews + if not info["reviews"]["pageInfo"]["hasPreviousPage"]: + break + rc = gh_graphql(GH_GET_PR_PREV_REVIEWS_QUERY, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["reviews"]["pageInfo"]["startCursor"]) + info = rc["data"]["repository"]["pullRequest"] + reviews = {} + for (author, state) in self._reviews: + if state != "COMMENTED": + reviews[author] = state + return list(reviews.items()) def get_approved_by(self) -> List[str]: - return [login for (login, state) in self._get_reviewers() if state == "APPROVED"] + return [login for (login, state) in self._get_reviews() if state == "APPROVED"] def get_commit_count(self) -> int: - return int(self.info["commits"]["totalCount"]) + return 
int(self.info["commits_with_authors"]["totalCount"]) def get_pr_creator_login(self) -> str: return cast(str, self.info["author"]["login"]) + def _fetch_authors(self) -> List[Tuple[str, str]]: + if self._authors is not None: + return self._authors + authors: List[Tuple[str, str]] = [] + + def add_authors(info: Dict[str, Any]) -> None: + for node in info["commits_with_authors"]["nodes"]: + author_node = node["commit"]["author"] + user_node = author_node["user"] + author = f"{author_node['name']} <{author_node['email']}>" + if user_node is None: + # If author is not github user, user node will be null + authors.append(("", author)) + else: + authors.append((cast(str, user_node["login"]), author)) + + info = self.info + for _ in range(100): + add_authors(info) + if not info["commits_with_authors"]["pageInfo"]["hasNextPage"]: + break + rc = gh_graphql(GH_GET_PR_NEXT_AUTHORS_QUERY, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["commits_with_authors"]["pageInfo"]["endCursor"]) + info = rc["data"]["repository"]["pullRequest"] + self._authors = authors + return authors + def get_committer_login(self, num: int = 0) -> str: - return cast(str, self.info["commits"]["nodes"][num]["commit"]["author"]["user"]["login"]) + return self._fetch_authors()[num][0] def get_committer_author(self, num: int = 0) -> str: - node = self.info["commits"]["nodes"][num]["commit"]["author"] - return f"{node['name']} <{node['email']}>" - - def get_check_suite_conclusions(self) -> Dict[int, str]: - last_commit = self.info["commits"]["nodes"][-1]["commit"] - rc = {} - for node in last_commit["checkSuites"]["nodes"]: - rc[int(node["app"]["databaseId"])] = node["conclusion"] - return rc + return self._fetch_authors()[num][1] + + def get_checkrun_conclusions(self) -> Dict[str, Tuple[str, str]]: + """ Returns dict of checkrun -> [conclusion, url] """ + if self.conclusions is not None: + return self.conclusions + orig_last_commit = self.info["commits"]["nodes"][-1]["commit"] + checksuites = orig_last_commit["checkSuites"] + conclusions = {} + + def add_conclusions(nodes: List[Dict[str, Any]]) -> None: + for node in nodes: + workflow_run = node["workflowRun"] + checkruns = node["checkRuns"] + if workflow_run is not None: + conclusions[workflow_run["workflow"]["name"]] = (node["conclusion"], node["url"]) + if checkruns is not None: + for checkrun_node in checkruns["nodes"]: + conclusions[checkrun_node["name"]] = (checkrun_node["conclusion"], checkrun_node["detailsUrl"]) + + add_conclusions(checksuites["nodes"]) + while bool(checksuites["pageInfo"]["hasNextPage"]): + rc = gh_graphql(GH_GET_PR_NEXT_CHECK_RUNS, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=checksuites["pageInfo"]["endCursor"]) + info = rc["data"]["repository"]["pullRequest"] + last_commit = info["commits"]["nodes"][-1]["commit"] + if last_commit["oid"] != orig_last_commit["oid"]: + raise RuntimeError("Last commit changed on PR") + checksuites = last_commit["checkSuites"] + add_conclusions(checksuites["nodes"]) + self.conclusions = conclusions + return conclusions def get_authors(self) -> Dict[str, str]: rc = {} @@ -243,7 +575,12 @@ def get_author(self) -> str: authors = self.get_authors() if len(authors) == 1: return next(iter(authors.values())) - return self.get_authors()[self.get_pr_creator_login()] + creator = self.get_pr_creator_login() + # If PR creator is not among authors + # Assume it was authored by first commit author + if creator not in authors: + return self.get_committer_author(0) + return 
authors[creator] def get_title(self) -> str: return cast(str, self.info["title"]) @@ -258,21 +595,66 @@ def get_merge_commit(self) -> Optional[str]: def get_pr_url(self) -> str: return f"https://github.com/{self.org}/{self.project}/pull/{self.pr_num}" - def get_comment_body(self, num: int = -1) -> str: - return cast(str, self.info["comments"]["nodes"][num]["bodyText"]) - - def get_comment_author_login(self, num: int = -1) -> str: - return cast(str, self.info["comments"]["nodes"][num]["author"]["login"]) - - def get_comment_editor_login(self, num: int = -1) -> Optional[str]: - rc = self.info["comments"]["nodes"][num]["editor"] - return rc["login"] if rc is not None else None - - def get_comment_author_association(self, num: int = -1) -> str: - return cast(str, self.info["comments"]["nodes"][num]["authorAssociation"]) - - def merge_ghstack_into(self, repo: GitRepo) -> None: + @staticmethod + def _comment_from_node(node: Any) -> GitHubComment: + editor = node["editor"] + return GitHubComment(body_text=node["bodyText"], + author_login=node["author"]["login"], + author_association=node["authorAssociation"], + editor_login=editor["login"] if editor else None, + database_id=node["databaseId"] + ) + + def get_comments(self) -> List[GitHubComment]: + if self.comments is not None: + return self.comments + self.comments = [] + info = self.info["comments"] + # Do not try to fetch more than 10K comments + for _ in range(100): + self.comments = [self._comment_from_node(node) for node in info["nodes"]] + self.comments + if not info["pageInfo"]["hasPreviousPage"]: + break + rc = gh_graphql(GH_GET_PR_PREV_COMMENTS, + name=self.project, + owner=self.org, + number=self.pr_num, + cursor=info["pageInfo"]["startCursor"]) + info = rc["data"]["repository"]["pullRequest"]["comments"] + return self.comments + + def get_last_comment(self) -> GitHubComment: + return self._comment_from_node(self.info["comments"]["nodes"][-1]) + + def get_comment_by_id(self, database_id: int) -> GitHubComment: + if self.comments is None: + # Fastpath - try searching in partial prefetched comments + for node in self.info["comments"]["nodes"]: + comment = self._comment_from_node(node) + if comment.database_id == database_id: + return comment + + for comment in self.get_comments(): + if comment.database_id == database_id: + return comment + raise RuntimeError(f"Comment with id {database_id} not found") + + def get_diff_revision(self) -> Optional[str]: + rc = RE_DIFF_REV.search(self.get_body()) + return rc.group(1) if rc is not None else None + + def has_internal_changes(self) -> bool: + checkrun_name = "Meta Internal-Only Changes Check" + if self.get_diff_revision() is None: + return False + checks = self.get_checkrun_conclusions() + if checks is None or checkrun_name not in checks: + return False + return checks[checkrun_name][0] != "SUCCESS" + + def merge_ghstack_into(self, repo: GitRepo, force: bool, comment_id: Optional[int] = None) -> None: assert self.is_ghstack_pr() + approved_by = self.get_approved_by() # For ghstack, cherry-pick commits based from origin orig_ref = f"{repo.remote}/{re.sub(r'/head$', '/orig', self.head_ref())}" rev_list = repo.revlist(f"{self.default_branch()}..{orig_ref}") @@ -289,98 +671,179 @@ def merge_ghstack_into(self, repo: GitRepo) -> None: if pr.is_closed(): print(f"Skipping {idx+1} of {len(rev_list)} PR (#{pr_num}) as its already been merged") continue + approved_by = pr.get_approved_by() # Raises exception if matching rule is not found - find_matching_merge_rule(pr, repo) + find_matching_merge_rule(pr, 
repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id)) + # Adding the url here makes it clickable within the Github UI + approved_by_urls = ', '.join(prefix_with_github_url(login) for login in approved_by) repo.cherry_pick(rev) - repo.amend_commit_message(re.sub(RE_GHSTACK_SOURCE_ID, "", msg)) + msg = re.sub(RE_GHSTACK_SOURCE_ID, "", msg) + msg += f"\nApproved by: {approved_by_urls}\n" + repo.amend_commit_message(msg) - def merge_into(self, repo: GitRepo, dry_run: bool = False) -> None: + def merge_into(self, repo: GitRepo, *, force: bool = False, dry_run: bool = False, comment_id: Optional[int] = None) -> None: # Raises exception if matching rule is not found - find_matching_merge_rule(self, repo) + find_matching_merge_rule(self, repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id)) if repo.current_branch() != self.default_branch(): repo.checkout(self.default_branch()) if not self.is_ghstack_pr(): - msg = self.get_title() + "\n\n" + self.get_body() + # Adding the url here makes it clickable within the Github UI + approved_by_urls = ', '.join(prefix_with_github_url(login) for login in self.get_approved_by()) + msg = self.get_title() + f" (#{self.pr_num})\n\n" + self.get_body() msg += f"\nPull Request resolved: {self.get_pr_url()}\n" - repo._run_git("merge", "--squash", f"{repo.remote}/{self.head_ref()}") + msg += f"Approved by: {approved_by_urls}\n" + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + repo.fetch(f"pull/{self.pr_num}/head", pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) repo._run_git("commit", f"--author=\"{self.get_author()}\"", "-m", msg) else: - self.merge_ghstack_into(repo) + self.merge_ghstack_into(repo, force, comment_id=comment_id) repo.push(self.default_branch(), dry_run) + if not dry_run: + gh_add_labels(self.org, self.project, self.pr_num, ["merged"]) - +class MandatoryChecksMissingError(Exception): + pass @dataclass class MergeRule: name: str patterns: List[str] approved_by: List[str] - mandatory_app_id: Optional[int] + mandatory_checks_name: Optional[List[str]] -def read_merge_rules(repo: GitRepo) -> List[MergeRule]: +def read_merge_rules(repo: Optional[GitRepo], org: str, project: str) -> List[MergeRule]: from pathlib import Path - rules_path = Path(repo.repo_dir) / ".github" / "merge_rules.json" - if not rules_path.exists(): - print(f"{rules_path} does not exist, returning empty rules") - return [] - with open(rules_path) as fp: - rc = json.load(fp, object_hook=lambda x: MergeRule(**x)) - return cast(List[MergeRule], rc) - - -def find_matching_merge_rule(pr: GitHubPR, repo: GitRepo) -> MergeRule: + repo_relative_rules_path = Path(".github") / "merge_rules.json" + if repo is None: + json_data = _fetch_url( + f"https://api.github.com/repos/{org}/{project}/contents/{repo_relative_rules_path}", + headers={'Accept': 'application/vnd.github.v3+json'}, + reader=json.load, + ) + content = base64.b64decode(json_data["content"]) + return cast(List[MergeRule], json.loads(content, object_hook=lambda x: MergeRule(**x))) + else: + rules_path = Path(repo.repo_dir) / repo_relative_rules_path + if not rules_path.exists(): + print(f"{rules_path} does not exist, returning empty rules") + return [] + with open(rules_path) as fp: + rc = json.load(fp, object_hook=lambda x: MergeRule(**x)) + return cast(List[MergeRule], rc) + + +def find_matching_merge_rule(pr: GitHubPR, + repo: Optional[GitRepo] = None, + force: bool = False, + skip_internal_checks: bool = False + ) -> MergeRule: """Returns 
merge rule matching to this pr or raises an exception""" changed_files = pr.get_changed_files() approved_by = set(pr.get_approved_by()) - rules = read_merge_rules(repo) + rules = read_merge_rules(repo, pr.org, pr.project) + reject_reason = f"PR {pr.pr_num} does not match merge rules" + # Used to determine best rejection reason + # Score 0 to 10K - how many files rule matched + # Score 10K - matched all files, but no overlapping approvers + # Score 20K - matched all files and approvers, but mandatory checks are pending + # Score 30k - Matched all files and approvers, but mandatory checks failed + reject_reason_score = 0 for rule in rules: rule_name = rule.name - rule_approvers_set = set(rule.approved_by) patterns_re = patterns_to_regex(rule.patterns) - approvers_intersection = approved_by.intersection(rule_approvers_set) - # If rule requires approvers but they aren't the ones that reviewed PR - if len(approvers_intersection) == 0 and len(rule_approvers_set) > 0: - print(f"Skipping rule {rule_name} due to no approvers overlap") - continue - if rule.mandatory_app_id is not None: - cs_conslusions = pr.get_check_suite_conclusions() - mandatory_app_id = rule.mandatory_app_id - if mandatory_app_id not in cs_conslusions or cs_conslusions[mandatory_app_id] != "SUCCESS": - print(f"Skipping rule {rule_name} as mandatory app {mandatory_app_id} is not in {cs_conslusions}") - continue non_matching_files = [] for fname in changed_files: if not patterns_re.match(fname): non_matching_files.append(fname) if len(non_matching_files) > 0: - print(f"Skipping rule {rule_name} due to non-matching files: {non_matching_files}") + num_matching_files = len(changed_files) - len(non_matching_files) + if num_matching_files > reject_reason_score: + reject_reason_score = num_matching_files + reject_reason = (f"{num_matching_files} files matched rule {rule_name}, but there are still non-matching files: " + + f"{','.join(non_matching_files[:5])}{', ...' if len(non_matching_files) > 5 else ''}") + continue + # If rule needs approvers but PR has not been reviewed, skip it + if len(rule.approved_by) > 0 and len(approved_by) == 0: + if reject_reason_score < 10000: + reject_reason_score = 10000 + reject_reason = f"Matched rule {rule_name}, but PR has not been reviewed yet" continue - print(f"Matched rule {rule_name} for {pr.pr_num}") + + rule_approvers_set = set() + for approver in rule.approved_by: + if "/" in approver: + org, name = approver.split("/") + rule_approvers_set.update(gh_get_team_members(org, name)) + else: + rule_approvers_set.add(approver) + approvers_intersection = approved_by.intersection(rule_approvers_set) + # If rule requires approvers but they aren't the ones that reviewed PR + if len(approvers_intersection) == 0 and len(rule_approvers_set) > 0: + if reject_reason_score < 10000: + reject_reason_score = 10000 + reject_reason = (f"Matched rule {rule_name}, but it was not reviewed yet by any of:" + + f"{','.join(list(rule_approvers_set)[:5])}{', ...' 
if len(rule_approvers_set) > 5 else ''}") + continue + if rule.mandatory_checks_name is not None: + pending_checks: List[Tuple[str, Optional[str]]] = [] + failed_checks: List[Tuple[str, Optional[str]]] = [] + checks = pr.get_checkrun_conclusions() + # HACK: We don't want to skip CLA check, even when forced + for checkname in filter(lambda x: force is False or "CLA Check" in x, rule.mandatory_checks_name): + if checkname not in checks: + pending_checks.append((checkname, None)) + elif checks[checkname][0] is None: + pending_checks.append((checkname, checks[checkname][1])) + elif checks[checkname][0] != 'SUCCESS': + failed_checks.append((checkname, checks[checkname][1])) + + def checks_to_str(checks: List[Tuple[str, Optional[str]]]) -> str: + return ", ".join(f"[{c[0]}]({c[1]})" if c[1] is not None else c[0] for c in checks) + + if len(failed_checks) > 0: + if reject_reason_score < 30000: + reject_reason_score = 30000 + reject_reason = ("Refusing to merge as mandatory check(s)" + + checks_to_str(failed_checks) + f" failed for rule {rule_name}") + continue + elif len(pending_checks) > 0: + if reject_reason_score < 20000: + reject_reason_score = 20000 + reject_reason = f"Refusing to merge as mandatory check(s) {checks_to_str(pending_checks)}" + reject_reason += f" are pending/not yet run for rule {rule_name}" + continue + if not skip_internal_checks and pr.has_internal_changes(): + raise RuntimeError("This PR has internal changes and must be landed via Phabricator") return rule - raise RuntimeError(f"PR {pr.pr_num} does not match merge rules") + if reject_reason_score == 20000: + raise MandatoryChecksMissingError(reject_reason) + raise RuntimeError(reject_reason) -def try_revert(repo: GitRepo, pr: GitHubPR, dry_run: bool = False) -> None: +def try_revert(repo: GitRepo, pr: GitHubPR, *, dry_run: bool = False, comment_id: Optional[int] = None) -> None: def post_comment(msg: str) -> None: gh_post_comment(pr.org, pr.project, pr.pr_num, msg, dry_run=dry_run) if not pr.is_closed(): return post_comment(f"Can't revert open PR #{pr.pr_num}") - if not RE_REVERT_CMD.match(pr.get_comment_body()): - raise RuntimeError(f"Comment {pr.get_comment_body()} does not seem to be a valid revert command") - if pr.get_comment_editor_login() is not None: + comment = pr.get_last_comment() if comment_id is None else pr.get_comment_by_id(comment_id) + if not RE_REVERT_CMD.match(comment.body_text) and not RE_REVERT_CMD_CLI.match(comment.body_text): + raise RuntimeError(f"Comment {comment.body_text} does not seem to be a valid revert command") + if comment.editor_login is not None: return post_comment("Don't want to revert based on edited command") - author_association = pr.get_comment_author_association() - author_login = pr.get_comment_author_login() + author_association = comment.author_association + author_login = comment.author_login # For some reason, one can not be a member of private repo, only CONTRIBUTOR expected_association = "CONTRIBUTOR" if pr.is_base_repo_private() else "MEMBER" if author_association != expected_association and author_association != "OWNER": return post_comment(f"Will not revert as @{author_login} is not a {expected_association}, but {author_association}") + skip_internal_checks = can_skip_internal_checks(pr, comment_id) - # Raises exception if matching rule is not found - find_matching_merge_rule(pr, repo) + # Raises exception if matching rule is not found, but ignores all status checks + find_matching_merge_rule(pr, repo, force=True, skip_internal_checks=skip_internal_checks) commit_sha = 
pr.get_merge_commit() if commit_sha is None: commits = repo.commits_resolving_gh_pr(pr.pr_num) @@ -389,51 +852,88 @@ def post_comment(msg: str) -> None: commit_sha = commits[0] msg = repo.commit_message(commit_sha) rc = RE_DIFF_REV.search(msg) - if rc is not None: + if rc is not None and not skip_internal_checks: raise RuntimeError(f"Can't revert PR that was landed via phabricator as {rc.group(1)}") repo.checkout(pr.default_branch()) repo.revert(commit_sha) msg = repo.commit_message("HEAD") msg = re.sub(RE_PULL_REQUEST_RESOLVED, "", msg) - msg += f"\nReverted {pr.get_pr_url()} on behalf of @{author_login}\n" + msg += f"\nReverted {pr.get_pr_url()} on behalf of {prefix_with_github_url(author_login)}\n" repo.amend_commit_message(msg) repo.push(pr.default_branch(), dry_run) if not dry_run: gh_add_labels(pr.org, pr.project, pr.pr_num, ["reverted"]) + +def prefix_with_github_url(suffix_str: str) -> str: + return f"https://github.com/{suffix_str}" + + +def merge_on_green(pr_num: int, repo: GitRepo, dry_run: bool = False, timeout_minutes: int = 400) -> None: + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + org, project = repo.gh_owner_and_name() + start_time = time.time() + last_exception = '' + elapsed_time = 0.0 + while elapsed_time < timeout_minutes * 60: + current_time = time.time() + elapsed_time = current_time - start_time + + + pr = GitHubPR(org, project, pr_num) + try: + return pr.merge_into(repo, dry_run=dry_run) + except MandatoryChecksMissingError as ex: + last_exception = str(ex) + print(f"Merge failed due to: {ex}. Retrying in 60 seconds.") + time.sleep(60) + # Finally report timeout back + msg = f"Merge timed out after {timeout_minutes} minutes. Please contact the pytorch_dev_infra team." + msg += f" The last exception was: {last_exception}" + if not dry_run: + gh_add_labels(org, project, pr_num, ["land-failed"]) + raise RuntimeError(msg) + + def main() -> None: args = parse_args() repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) org, project = repo.gh_owner_and_name() pr = GitHubPR(org, project, args.pr_num) + + def handle_exception(e: Exception, msg: str = "Merge failed") -> None: + msg += f" due to {e}" + run_url = os.getenv("GH_RUN_URL") + if run_url is not None: + msg += f"\nRaised by {run_url}" + gh_post_comment(org, project, args.pr_num, msg, dry_run=args.dry_run) + import traceback + traceback.print_exc() + if args.revert: try: - try_revert(repo, pr, dry_run=args.dry_run) + try_revert(repo, pr, dry_run=args.dry_run, comment_id=args.comment_id) except Exception as e: - msg = f"Reverting PR {args.pr_num} failed due to {e}" - run_url = os.getenv("GH_RUN_URL") - if run_url is not None: - msg += f"\nRaised by {run_url}" - gh_post_comment(org, project, args.pr_num, msg, dry_run=args.dry_run) + handle_exception(e, f"Reverting PR {args.pr_num} failed") return if pr.is_closed(): gh_post_comment(org, project, args.pr_num, f"Can't merge closed PR #{args.pr_num}", dry_run=args.dry_run) return - if pr.is_cross_repo(): - gh_post_comment(org, project, args.pr_num, "Cross-repo merges are not supported at the moment", dry_run=args.dry_run) + if pr.is_cross_repo() and pr.is_ghstack_pr(): + gh_post_comment(org, project, args.pr_num, "Cross-repo ghstack merges are not supported", dry_run=args.dry_run) return - try: - pr.merge_into(repo, dry_run=args.dry_run) - except Exception as e: - msg = f"Merge failed due to {e}" - run_url = os.getenv("GH_RUN_URL") - if run_url is not None: - msg += f"\nRaised by {run_url}" - gh_post_comment(org, project, args.pr_num, msg,
dry_run=args.dry_run) + if args.on_green: + try: + merge_on_green(args.pr_num, repo, dry_run=args.dry_run) + except Exception as e: + handle_exception(e) + else: + try: + pr.merge_into(repo, dry_run=args.dry_run, force=args.force, comment_id=args.comment_id) + except Exception as e: + handle_exception(e) if __name__ == "__main__": diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py new file mode 100755 index 000000000000..a382de511a41 --- /dev/null +++ b/.github/scripts/tryrebase.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +from typing import Any +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from trymerge import gh_post_comment, GitHubPR + + +def parse_args() -> Any: + from argparse import ArgumentParser + parser = ArgumentParser("Rebase PR into branch") + parser.add_argument("--dry-run", action="store_true") + parser.add_argument("pr_num", type=int) + return parser.parse_args() + + +def rebase_onto(pr: GitHubPR, repo: GitRepo, dry_run: bool = False) -> None: + branch = f"pull/{pr.pr_num}/head" + onto_branch = pr.default_branch() + remote_url = f"https://github.com/{pr.info['headRepository']['nameWithOwner']}.git" + refspec = f"{branch}:{pr.head_ref()}" + + repo.fetch(branch, branch) + repo._run_git("rebase", onto_branch, branch) + if dry_run: + push_result = repo._run_git("push", "--dry-run", "-f", remote_url, refspec) + else: + push_result = repo._run_git("push", "-f", remote_url, refspec) + if "Everything up-to-date" in push_result: + gh_post_comment(pr.org, pr.project, pr.pr_num, + f"Tried to rebase and push PR #{pr.pr_num}, but it was already up to date", dry_run=dry_run) + else: + gh_post_comment(pr.org, pr.project, pr.pr_num, + f"Successfully rebased `{pr.head_ref()}` onto `{onto_branch}`, please pull locally " + + f"before adding more changes (for example, via `git checkout {pr.head_ref()} && " + + "git pull --rebase`)", dry_run=dry_run) + + +def main() -> None: + args = parse_args() + repo = GitRepo(get_git_repo_dir(), get_git_remote_name(), debug=True) + org, project = repo.gh_owner_and_name() + + pr = GitHubPR(org, project, args.pr_num) + + if pr.is_closed(): + gh_post_comment(org, project, args.pr_num, f"PR #{args.pr_num} is closed, won't rebase", dry_run=args.dry_run) + return + + if pr.is_ghstack_pr(): + gh_post_comment(org, project, args.pr_num, + f"PR #{args.pr_num} is a ghstack, which is currently not supported", dry_run=args.dry_run) + return + + try: + rebase_onto(pr, repo, dry_run=args.dry_run) + except Exception as e: + msg = f"Rebase failed due to {e}" + run_url = os.getenv("GH_RUN_URL") + if run_url is not None: + msg += f"\nRaised by {run_url}" + gh_post_comment(org, project, args.pr_num, msg, dry_run=args.dry_run) + + +if __name__ == "__main__": + main() diff --git a/.github/templates/android_ci_full_workflow.yml.j2 b/.github/templates/android_ci_full_workflow.yml.j2 deleted file mode 100644 index b89ae9fd94a5..000000000000 --- a/.github/templates/android_ci_full_workflow.yml.j2 +++ /dev/null @@ -1,165 +0,0 @@ -{%- extends "linux_ci_workflow.yml.j2" -%} -{% import 'common_android.yml.j2' as common_android %} -{%- set exclude_test = true -%} -{% block name -%} -# Template is at: .github/templates/android_ci_full_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - push: - tags: - {%- endif %} - {%- if 
label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - -{% block build +%} - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build-and-test - NUM_TEST_SHARDS: !{{ num_test_shards }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - !{{ common.parse_ref() }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a-build", "arm-v7a") }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build", "arm-v8a") }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build", "x86_32") }} - !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build", "x86_64") }} - - name: Build-Final-Artifcact - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - set -eux - - docker_image_libtorch_android_x86_32="${DOCKER_IMAGE}-x86_32" - docker_image_libtorch_android_x86_64="${DOCKER_IMAGE}-x86_64" - docker_image_libtorch_android_arm_v7a="${DOCKER_IMAGE}-arm-v7a" - docker_image_libtorch_android_arm_v8a="${DOCKER_IMAGE}-arm-v8a" - - echo "docker_image_commit: ${DOCKER_IMAGE}" - echo "docker_image_libtorch_android_x86_32: ${docker_image_libtorch_android_x86_32}" - echo "docker_image_libtorch_android_x86_64: ${docker_image_libtorch_android_x86_64}" - echo "docker_image_libtorch_android_arm_v7a: ${docker_image_libtorch_android_arm_v7a}" - echo "docker_image_libtorch_android_arm_v8a: ${docker_image_libtorch_android_arm_v8a}" - - # x86_32 - time docker pull "${docker_image_libtorch_android_x86_32}" >/dev/null - export id_x86_32 - id_x86_32=$(docker run -e GRADLE_OFFLINE=1 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_32}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_x86_32}" bash) 2>&1 - - # arm-v7a - time docker pull "${docker_image_libtorch_android_arm_v7a}" >/dev/null - export id_arm_v7a - id_arm_v7a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v7a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_arm_v7a}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - docker cp "${id_arm_v7a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - - # x86_64 - time docker pull "${docker_image_libtorch_android_x86_64}" >/dev/null - export id_x86_64 - id_x86_64=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_64}") - - # shellcheck disable=SC1105 - ((echo "sudo chown 
-R jenkins workspace") | docker exec -u jenkins -i "${id_x86_64}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_x86_64" - docker cp "${id_x86_64}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_x86_64" - - # arm-v8a - time docker pull "${docker_image_libtorch_android_arm_v8a}" >/dev/null - export id_arm_v8a - id_arm_v8a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v8a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v8a" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - docker cp "${id_arm_v8a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - - # Putting everything together - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v7a" - docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_64" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_x86_64" - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v8a" - - # run gradle buildRelease - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec \ - -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --user jenkins \ - -u jenkins -i "${id_x86_32}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_artifacts" - docker cp "${id_x86_32}:/var/lib/jenkins/workspace/android/artifacts.tgz" "${GITHUB_WORKSPACE}/build_android_artifacts/" - - output_image="${DOCKER_IMAGE}-android-x86_32-gradle" - docker commit "${id_x86_32}" "${output_image}" - time docker push "${output_image}" - !{{ common_android.upload_androind_binary_size("prebuilt", "${GITHUB_WORKSPACE}/build_android_artifacts/artifacts.tgz") }} - - uses: !{{ common.upload_artifact_s3_action }} - name: Store PyTorch Android Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - build_android_artifacts/artifacts.tgz - !{{ common.teardown_ec2_linux() }} -{%- endblock %} diff --git a/.github/templates/android_ci_workflow.yml.j2 b/.github/templates/android_ci_workflow.yml.j2 deleted file mode 100644 index c86b94c1ad48..000000000000 --- a/.github/templates/android_ci_workflow.yml.j2 +++ /dev/null @@ -1,111 +0,0 @@ -{%- extends "linux_ci_workflow.yml.j2" -%} -{% import 'common_android.yml.j2' as common_android %} -{%- set exclude_test = true -%} -{% block name -%} -# Template is at: .github/templates/android_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} -{%- for label in ciflow_config.labels | sort 
%} - {%- if loop.first %} - push: - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - -{% block build +%} - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build-and-test - NUM_TEST_SHARDS: !{{ num_test_shards }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - set -e - # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: - # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; - # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; - - echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" - time docker pull "${DOCKER_IMAGE}" >/dev/null - - export BUILD_LITE_INTERPRETER - BUILD_LITE_INTERPRETER="1" - if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then - BUILD_LITE_INTERPRETER="0" - fi - - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - # shellcheck disable=SC2016 - export id - id=$(docker run -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e BUILD_LITE_INTERPRETER \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "$(pwd):/var/lib/jenkins/workspace" \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") - - # shellcheck disable=SC2016 - export COMMAND - # shellcheck disable=SC2016 - COMMAND='((echo "export GRADLE_OFFLINE=1" && echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo "${COMMAND}" > ./command.sh && bash ./command.sh - # Skip docker push as this job is purely for size analysis purpose. - # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. 
- !{{ common.parse_ref() }} - !{{ common_android.upload_androind_binary_size("custom-build-single", "") }} - !{{ common.teardown_ec2_linux() }} -{%- endblock %} diff --git a/.github/templates/bazel_ci_workflow.yml.j2 b/.github/templates/bazel_ci_workflow.yml.j2 deleted file mode 100644 index 0480835794bc..000000000000 --- a/.github/templates/bazel_ci_workflow.yml.j2 +++ /dev/null @@ -1,127 +0,0 @@ -{%- extends "linux_ci_workflow.yml.j2" -%} -{% import 'common_android.yml.j2' as common_android %} -{%- set exclude_test = true -%} -{% block name -%} -# Template is at: .github/templates/bazel_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - push: - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - -{% block build +%} - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build-and-test - NUM_TEST_SHARDS: !{{ num_test_shards }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . 
&& sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh' - !{{ common.parse_ref() }} - !{{ common_android.upload_androind_binary_size("", "")}} - - name: Test - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - # detached container should get cleaned up by teardown_ec2_linux - export SHARD_NUMBER=0 - # TODO: Stop building test binaries as part of the build phase - # Make sure we copy test results from bazel-testlogs symlink to - # a regular directory ./test/test-reports - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e SHARD_NUMBER \ - -e NUM_TEST_SHARDS \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports' - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - !{{ common.upload_test_reports(name='bazel') }} - !{{ common.upload_downloaded_files(name='bazel') }} - !{{ common.upload_test_statistics(build_environment) }} - !{{ common.teardown_ec2_linux() }} -{%- endblock %} diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 3df9cec23254..f701f92cf64c 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -1,4 +1,4 @@ -{%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v3" -%} +{%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v4" -%} {# squid_proxy is an private ELB that only available for GHA custom runners #} {%- set squid_proxy = "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -%} @@ -6,6 +6,10 @@ {%- set squid_no_proxy = "localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} {%- set timeout_minutes = 240 -%} +# NOTE: If testing pytorch/builder changes you can change this variable to change what pytorch/builder reference +# the binary builds will check out +{%- set builder_branch = "main" -%} + {%- macro concurrency(build_environment) -%} concurrency: group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -18,6 +22,37 @@ concurrency: } {%- endmacro -%} +{%- macro gen_dispatch_rules(on_pull_request, is_scheduled, ciflow_labels, branches = ['master', 'main', 'release/*'], enable_doc_jobs = True) -%} +on: +{%- if on_pull_request %} + pull_request: +{%- endif %} + push: +{%- if enable_doc_jobs and is_scheduled %} + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ +{%- endif %} +{%- for label in ciflow_labels | sort %} + {%- if loop.first and not (enable_doc_jobs and is_scheduled) 
%} + tags: + {%- endif %} + - '!{{ label }}/*' +{%- endfor %} +{%- if not is_scheduled %} + branches: +{%- for branch in branches %} + - !{{ branch }} +{%- endfor %} +{%- endif %} +{%- if is_scheduled %} + schedule: + - cron: !{{ is_scheduled }} +{%- endif %} + workflow_dispatch: +{%- endmacro -%} + {%- macro display_ec2_information() -%} - name: Display EC2 information shell: bash @@ -32,37 +67,46 @@ concurrency: echo "ami-id: $(get_ec2_metadata ami-id)" echo "instance-id: $(get_ec2_metadata instance-id)" echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" {%- endmacro -%} {%- macro parse_ref(pytorch_directory="") -%} - name: Parse ref + shell: bash {%- if pytorch_directory %} working-directory: !{{ pytorch_directory }} {%- endif %} id: parse-ref - run: .github/scripts/parse_ref.py + run: ./.github/scripts/parse_ref.py {%- endmacro -%} -{%- macro upload_test_statistics(build_environment, when="always()", pytorch_directory="") -%} - - name: Display and upload test statistics (Click Me) +{%- macro upload_test_statistics(build_environment, when="always()", pytorch_directory="", needs_credentials=False) -%} + - name: Upload test statistics {%- if pytorch_directory %} working-directory: !{{ pytorch_directory }} {%- endif %} if: !{{ when }} - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} BRANCH: ${{ steps.parse-ref.outputs.branch }} JOB_BASE_NAME: !{{ build_environment }}-test PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} TAG: ${{ steps.parse-ref.outputs.tag }} WORKFLOW_ID: '${{ github.run_id }}' + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +{%- if needs_credentials %} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} +{%- endif %} shell: bash run: | + set -x python3 -m pip install -r requirements.txt python3 -m pip install boto3==1.19.12 + GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}") + export GHA_WORKFLOW_JOB_ID python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test {%- endmacro -%} @@ -80,19 +124,23 @@ concurrency: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore {%- endmacro -%} {%- macro setup_ec2_linux() -%} - !{{ display_ec2_information() }} - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - !{{ add_retry_to_env() }} - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux - name: Chown workspace run: | !{{ add_retry_to_env() }} @@ -107,9 +155,6 @@ concurrency: uses: seemethere/add-github-ssh-key@v1 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" {%- endmacro -%} {%- macro setup_rocm_linux() -%} @@ -185,10 +230,15 @@ concurrency: docker system prune -af {%- endmacro -%} -{%- macro checkout(submodules="recursive", deep_clone=True, directory="", repository="pytorch/pytorch") -%} +{%- macro checkout(submodules="recursive", deep_clone=True, directory="", repository="pytorch/pytorch", branch="", checkout_pr_head=True) -%} - name: Checkout !{{ 'PyTorch' if repository == "pytorch/pytorch" else repository }} uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + {%- if branch %} + ref: !{{ branch }} + {%- elif checkout_pr_head %} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + {%- endif %} {%- if deep_clone %} # deep clone, to allow use of git merge-base fetch-depth: 0 @@ -209,13 +259,12 @@ concurrency: {%- endif %} {%- endmacro -%} -{%- macro upload_downloaded_files(name, artifact_name="", use_s3=True, when="always()") -%} +{%- macro upload_downloaded_files(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True, when="always()") -%} - name: Zip JSONs for upload if: !{{ when }} env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' -{%- else %} + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}'{%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} {%- if name == 'windows' %} @@ -247,12 +296,12 @@ concurrency: test-jsons-*.zip {%- endmacro -%} -{%- macro upload_test_reports(name, artifact_name="", use_s3=True) -%} +{%- macro upload_test_reports(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True) -%} - name: Zip test reports for upload if: always() env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}' {%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} @@ -285,6 +334,25 @@ concurrency: test-reports-*.zip {%- endmacro -%} +{%- macro upload_cores(artifact_name="coredumps", config=None, shard=None, use_s3=True) -%} +{%- if use_s3 %}- uses: !{{ upload_artifact_s3_action }} + name: Store Core dumps on S3 +{%- else %}- uses: actions/upload-artifact@v2 + name: 
Store Core dumps on Github +{%- endif %} + if: failure() + with: +{%- if config != "" and shard != "" %} + name: !{{ artifact_name }}-!{{ config }}-!{{ shard }} +{%- else %} + name: !{{ artifact_name }} +{%- endif %} + retention-days: 14 + if-no-files-found: ignore + path: + ./**/core.[1-9]* +{%- endmacro -%} + {%- macro render_test_results() -%} - name: Install render_test_results dependencies if: always() diff --git a/.github/templates/common_android.yml.j2 b/.github/templates/common_android.yml.j2 deleted file mode 100644 index a0e4e781b6ad..000000000000 --- a/.github/templates/common_android.yml.j2 +++ /dev/null @@ -1,81 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- macro upload_androind_binary_size(build_type, artifacts) -%} - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS=!{{ artifacts }} - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=!{{ build_type}} - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 -{%- endmacro -%} - -{%- macro build_android(env_name, container_suffix) -%} - - name: Build-!{{ container_suffix }} - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=!{{ env_name }} \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." 
"${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-!{{ container_suffix }} - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." || echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" -{%- endmacro -%} diff --git a/.github/templates/docker_builds_ci_workflow.yml.j2 b/.github/templates/docker_builds_ci_workflow.yml.j2 deleted file mode 100644 index 224f683a35a4..000000000000 --- a/.github/templates/docker_builds_ci_workflow.yml.j2 +++ /dev/null @@ -1,60 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/docker_builds_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: - workflow_dispatch: - pull_request: - types: [opened, synchronize, reopened] - paths: - - '.circleci/docker/**' - - '.github/workflows/generated-docker-builds.yml' -{%- if is_scheduled %} - schedule: - - cron: !{{ is_scheduled }} -{%- endif %} -!{{ common.concurrency(build_environment) }} - -env: - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - -jobs: -{% block docker_build +%} - docker-build: - runs-on: linux.2xlarge - timeout-minutes: !{{ common.timeout_minutes }} - strategy: - matrix: - include: - {%- for docker_image in docker_images %} - - docker_image_base: '!{{ docker_image }}' - docker_image_short_name: '!{{ docker_image.split('/')[-1] }}' - {%- endfor %} - env: - DOCKER_IMAGE_BASE: '${{ matrix.docker_image_base }}' - name: docker-build (${{ matrix.docker_image_short_name }}) - steps: - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(true) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - !{{ common.parse_ref() }} - !{{ common.teardown_ec2_linux() }} - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af -{%- endblock %} diff --git a/.github/templates/ios_ci_workflow.yml.j2 b/.github/templates/ios_ci_workflow.yml.j2 deleted file mode 100644 index f837a500a264..000000000000 --- a/.github/templates/ios_ci_workflow.yml.j2 +++ /dev/null @@ -1,183 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif -%} - -{%- if is_scheduled %} - schedule: - - cron: !{{ is_scheduled }} -{%- else %} - push: - branches: - - master - - release/* -{%- endif %} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: !{{ 
ios_platform }} - IOS_ARCH: !{{ ios_arch }} -!{{ common.set_xcode_version(xcode_version) }} - -jobs: -{% block build +%} - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.checkout() }} - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi -{%- if ios_platform == "SIMULATOR" %} - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for differnet backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi -{%- endif -%} -{% endblock +%} - -!{{ common.concurrency(build_environment) }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index 86144ff3ddd3..e183a374ffea 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -9,17 +9,22 @@ name: !{{ build_environment }} on: push: + {%- if branches == "nightly" %} # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + {%- endif %} branches: - - nightly + - !{{ branches }} + {%- if branches == "nightly" %} tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + {%- endif %} {%- for label in ciflow_config.labels | sort %} - {%- if label != "ciflow/default" %} + {%- if loop.first and branches != "nightly" %} + tags: + {%- endif %} - '!{{ label }}/*' - {%- endif %} {%- endfor %} workflow_dispatch: @@ -53,7 +58,7 @@ jobs: steps: !{{ common.setup_ec2_linux() }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} {%- if config["gpu_arch_type"] == 'cuda' and config["gpu_arch_version"].startswith('11') %} - name: Set BUILD_SPLIT_CUDA run: | @@ -105,7 +110,9 @@ jobs: !{{ config["build_name"] }}-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: !{{ config["build_name"] }}-build -{%- if config["gpu_arch_type"] == "cuda" %} +{%- if config["gpu_arch_type"] == "rocm" %} + runs-on: linux.rocm.gpu +{%- elif config["gpu_arch_type"] == "cuda" %} runs-on: linux.4xlarge.nvidia.gpu {%- else %} runs-on: linux.4xlarge @@ -113,28 +120,34 @@ jobs: timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} steps: +{%- if config["gpu_arch_type"] == "rocm" %} + 
!{{ common.setup_rocm_linux() }} +{%- else %} !{{ common.setup_ec2_linux() }} - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b +{%- endif %} + - uses: seemethere/download-artifact-s3@v3 name: Download Build Artifacts with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder -{%- if config["gpu_arch_type"] == "cuda" %} - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} +{%- if config["gpu_arch_type"] == "rocm" %} + - name: ROCm set GPU_FLAG run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" +{%- elif config["gpu_arch_type"] == "cuda" %} + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd {%- endif %} - name: Pull Docker image run: | @@ -173,6 +186,12 @@ jobs: # Generate test script docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" +{%- if config["gpu_arch_type"] == "rocm" %} + !{{ common.teardown_rocm_linux() }} +{%- else %} !{{ common.teardown_ec2_linux("pytorch/") }} +{%- endif %} + {%- if branches == "nightly" %} !{{ upload.upload_binaries(config) }} + {%- endif %} {%- endfor %} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 deleted file mode 100644 index 660c0a74ba59..000000000000 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ /dev/null @@ -1,455 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default %} - pull_request: -{%- endif %} - push: -{%- if enable_doc_jobs and is_scheduled %} - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ -{%- endif %} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first and not (enable_doc_jobs and is_scheduled) %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} -{%- if not is_scheduled and not only_on_pr %} - branches: - - master - - release/* -{%- endif %} -{%- if is_scheduled and not only_on_pr %} - schedule: - - cron: !{{ is_scheduled }} -{%- endif %} - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - DOCKER_IMAGE_BASE: !{{ docker_image_base }} - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - 
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -{%- if build_with_debug %} - DEBUG: 1 -{%- endif %} -!{{ common.concurrency(build_environment) }} - -jobs: -{% block build +%} - build: - runs-on: linux.2xlarge - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - !{{ common.calculate_docker_image(false) }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - !{{ common.parse_ref() }} - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="!{{ common.squid_proxy }}" -e https_proxy="!{{ common.squid_proxy }}" -e no_proxy="!{{ common.squid_no_proxy }}" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
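The build above runs as the container's `jenkins` user against a bind-mounted `${GITHUB_WORKSPACE}`, so the "Chown workspace" step hands ownership of everything the build wrote back to the runner account before later steps touch the checkout. A minimal standalone sketch of that cleanup, assuming a generic alpine image in place of the pinned ECR mirror:

    #!/usr/bin/env bash
    # Sketch only: restore ownership of a workspace that a containerized build
    # (running as a different uid, e.g. "jenkins") wrote into through a bind mount.
    set -euo pipefail

    WORKSPACE="$(pwd)"          # directory that was mounted into the build container
    ALPINE_IMAGE="alpine:3.15"  # assumption; the workflow pins an ECR-mirrored alpine image

    docker run --rm \
      -v "${WORKSPACE}:/v" \
      -w /v \
      "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .

Doing the chown from a throwaway container avoids needing sudo on the runner itself.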
- {%- if build_generates_artifacts %} - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: !{{ common.upload_artifact_s3_action }} - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - {%- endif %} - !{{ common.teardown_ec2_linux() }} - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af -{%- endblock %} -{%- if not exclude_test %} -{% block test +%} - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }} - ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }} - ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }} - ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }} - ENABLE_SLOW_TEST: !{{ enable_slow_test }} - ENABLE_DOCS_TEST: !{{ enable_docs_test }} - ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }} - ENABLE_XLA_TEST: !{{ enable_xla_test }} - ENABLE_NOARCH_TEST: !{{ enable_noarch_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - MULTIGPU_RUNNER_TYPE: !{{ multigpu_runner_type }} - DISTRIBUTED_GPU_RUNNER_TYPE: !{{ distributed_gpu_runner_type }} - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: !{{ common.timeout_minutes }} - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: -{%- if 'rocm' in test_runner_type %} - !{{ common.setup_rocm_linux() }} -{%- else %} - !{{ common.setup_ec2_linux() }} -{%- endif %} - !{{ common.checkout() }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" -{%- if 'rocm' in test_runner_type %} - - name: ROCm set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }} - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" -{%- else %} - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && 
!contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" -{%- endif %} - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | -{%- if 'rocm' in test_runner_type %} - df -H -{%- else %} - sudo df -H -{%- endif %} - !{{ common.parse_ref() }} - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after !{{ timeout_after }} minutes - timeout-minutes: !{{ timeout_after }} - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi -{%- if 'rocm' not in test_runner_type %} - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=!{{ common.squid_proxy }} -e https_proxy=!{{ common.squid_proxy }} -e no_proxy=!{{ common.squid_no_proxy }}" - fi -{%- endif %} - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -{%- if 'rocm' not in test_runner_type %} - ${PROXY_ENV} \ -{%- endif %} - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ -{%- if 'rocm' not in test_runner_type %} - --ipc=host \ -{%- endif %} - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) -{%- if 'rocm' in test_runner_type %} - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" - # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct - docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" -{%- else %} - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" -{%- endif %} -{%- if 'rocm' not in test_runner_type %} - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . -{%- endif %} - !{{ common.render_test_results() }} -{%- if 'rocm' in test_runner_type %} - !{{ common.upload_downloaded_files(name='linux', use_s3=False) }} - !{{ common.upload_test_reports(name='linux', artifact_name="test-reports", use_s3=False) }} -{%- else %} - !{{ common.upload_downloaded_files(name='linux') }} - !{{ common.upload_test_reports(name='linux') }} -{%- endif %} - !{{ common.upload_test_statistics(build_environment) }} -{%- if 'rocm' in test_runner_type %} - !{{ common.teardown_rocm_linux() }} -{%- else %} - !{{ common.teardown_ec2_linux() }} -{%- endif %} -{% endblock %} -{%- endif -%} -{%- if enable_doc_jobs %} - build-docs: - runs-on: linux.2xlarge - timeout-minutes: !{{ common.timeout_minutes }} - strategy: - matrix: - docs_type: [cpp, python] - needs: [build] - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - DOCS_TYPE: ${{ matrix.docs_type }} - WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - steps: - !{{ common.setup_ec2_linux() }} - !{{ common.checkout() }} - - name: Pull Docker image - run: | - !{{ common.add_retry_to_env() }} - retry docker pull "${DOCKER_IMAGE}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip -{%- if is_scheduled %} - - name: Generate netrc (only for docs-push) - if: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - env: - GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} - run: | - # set credentials for https pushing - echo "machine github.com" > "${RUNNER_TEMP}/.netrc" - echo "login pytorchbot" >> "${RUNNER_TEMP}/.netrc" - echo "password ${GITHUB_PYTORCHBOT_TOKEN}" >> "${RUNNER_TEMP}/.netrc" -{%- endif %} - - name: Build ${{ matrix.docs_type }} docs - run: | - set -ex - time docker pull "${DOCKER_IMAGE}" > /dev/null - # Convert refs/tags/v1.12.0rc3 into 1.12 - if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then - target="${BASH_REMATCH[1]}" - else - target="master" - fi - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e IN_CI \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SHA1="$GITHUB_SHA" \ - -e DOCS_VERSION="${target}" \ - -e DOCS_TYPE \ - -e PR_LABELS \ - -e WITH_PUSH \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ -{%- if is_scheduled %} - -v "${RUNNER_TEMP}/.netrc":/var/lib/jenkins/.netrc \ -{%- endif %} - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t 
"${container_name}" bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: !{{ common.upload_artifact_s3_action }} - name: Upload Python Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: pytorch.github.io/docs/master/ - s3-prefix: pytorch/${{ github.event.pull_request.number }} - - uses: !{{ common.upload_artifact_s3_action }} - name: Upload C++ Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} - with: - retention-days: 14 - if-no-files-found: error - s3-bucket: doc-previews - path: cppdocs/ - s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs -{%- endif -%} diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 604d8251bc9c..2640aab74fc8 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -1,4 +1,5 @@ {% import 'common.yml.j2' as common %} +{% import 'upload.yml.j2' as upload %} {%- block name -%} # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 @@ -6,24 +7,6 @@ name: !{{ build_environment }} {%- endblock %} -{%- macro binary_env(config) -%} - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: !{{ config["package_type"] }} - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu -{%- if config["package_type"] == "libtorch" %} - LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} - DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" -{%- else %} - DESIRED_PYTHON: "!{{ config["python_version"] }}" -{%- endif %} -{%- endmacro %} - {%- macro set_runner_specific_vars() -%} # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -50,9 +33,10 @@ on: # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ {%- for label in ciflow_config.labels | sort %} - {%- if label != "ciflow/default" %} + {%- if loop.first and branches != "nightly" %} + tags: + {%- endif %} - '!{{ label }}/*' - {%- endif %} {%- endfor %} workflow_dispatch: @@ -76,14 +60,19 @@ env: jobs: {%- for config in build_configs %} !{{ config["build_name"] }}-build: + if: ${{ github.repository_owner == 'pytorch' }} + {%- if cross_compile_arm64 %} + runs-on: macos-12 + {%- else %} runs-on: macos-10.15 + {%- endif %} {%- if config["package_type"] == "libtorch" %} # libtorch builds take a long time on github hosted runners timeout-minutes: 720 {%- else %} timeout-minutes: !{{ common.timeout_minutes }} {%- endif %} - !{{ binary_env(config) }} + !{{ upload.binary_env(config, true) }} # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -96,16 +85,8 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" 
-b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -129,53 +110,5 @@ jobs: retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - !{{ config["build_name"] }}-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: !{{ config["build_name"] }}-build - !{{ binary_env(config) }} - steps: - !{{ common.setup_ec2_linux() }} - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: !{{ config["build_name"] }} - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - !{{ common.teardown_ec2_linux() }} + !{{ upload.upload_binaries(config, has_test=False, use_s3=False) }} {%- endfor %} diff --git a/.github/templates/macos_ci_workflow.yml.j2 b/.github/templates/macos_ci_workflow.yml.j2 deleted file mode 100644 index f8b0d4cc30eb..000000000000 --- a/.github/templates/macos_ci_workflow.yml.j2 +++ /dev/null @@ -1,154 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- block name -%} -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} -{%- endblock %} - -on: -{%- if is_default -%} - pull_request: -{%- endif -%} - -{%- if is_scheduled %} - schedule: - - cron: !{{ is_scheduled }} -{%- else %} - push: - branches: - - master - - release/* -{%- endif %} -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label 
}}/*' - {%- endif %} -{%- endfor %} - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - COMPACT_JOB_NAME: !{{ build_environment }} - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 -!{{ common.set_xcode_version(xcode_version) }} - -jobs: -{% block build +%} - build: - runs-on: !{{ test_runner_type }} - env: - JOB_BASE_NAME: !{{ build_environment }} - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - !{{ common.checkout() }} - !{{ common.setup_miniconda("3.8") }} - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh -{%- if build_generates_artifacts %} - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ - - uses: actions/upload-artifact@v2 - name: Store PyTorch Build Artifacts on GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip -{%- endif %} -{% endblock +%} -{%- if not exclude_test %} -{% block test +%} - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - !{{ common.checkout(submodules="false") }} - - uses: actions/download-artifact@v2 - name: Download PyTorch Build Artifacts from GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: . 
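The macOS build and test jobs above hand work off through a single `artifacts.zip`: the build job zips `dist/` and uploads it with `actions/upload-artifact`, and the test job downloads it, unzips it, and pip-installs the wheel before calling `.jenkins/pytorch/macos-test.sh` (see the Unzip and Test steps that follow). A rough local equivalent of that handoff, with the CI upload/download replaced by the local filesystem:

    #!/usr/bin/env bash
    # Sketch of the build -> test artifact handoff; in CI the archive moves between
    # jobs via actions/upload-artifact and actions/download-artifact instead.
    set -euo pipefail

    # "Build job": package the wheel output into one archive.
    zip -1 -r artifacts.zip dist/

    # "Test job": unpack and install the wheel, then run the test driver.
    unzip -o artifacts.zip
    python3 -mpip install dist/*.whl
    # .jenkins/pytorch/macos-test.sh would run here in the real workflow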
- - name: Unzip artifacts - run: | - unzip -o artifacts.zip - !{{ common.setup_miniconda("3.8") }} - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - !{{ common.parse_ref() }} - - name: Test - run: | - python3 -mpip install dist/*.whl - .jenkins/pytorch/macos-test.sh - !{{ common.render_test_results() }} - !{{ common.upload_downloaded_files(name='macos', artifact_name="test-jsons", use_s3=False) }} - !{{ common.upload_test_reports("macos", artifact_name="test-reports", use_s3=False) }} - !{{ common.upload_test_statistics(build_environment) }} -{% endblock +%} -{%- endif %} - -!{{ common.concurrency(build_environment) }} diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 4dc13971da1f..63bec412997e 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -19,8 +19,13 @@ {%- endif %} SKIP_ALL_TESTS: 1 {%- if config["package_type"] == "libtorch" %} +{%- if config["libtorch_config"] %} + LIBTORCH_CONFIG: !{{ config["libtorch_config"] }} +{%- endif %} LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} +{%- if config["devtoolset"] %} DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} +{%- endif %} {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason @@ -32,17 +37,25 @@ {%- endmacro %} -{%- macro upload_binaries(config, is_windows=False) -%} +{%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%} !{{ config["build_name"] }}-upload: # Uploading runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts if: ${{ github.repository_owner == 'pytorch' }} +{%- if has_test %} needs: !{{ config["build_name"] }}-test +{%- else %} + needs: !{{ config["build_name"] }}-build +{%- endif %} !{{ binary_env(config, is_windows) }} steps: !{{ common.setup_ec2_linux() }} - name: Clone pytorch/pytorch uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b +{%- if use_s3 %} + - uses: seemethere/download-artifact-s3@v3 +{%- else %} + - uses: actions/download-artifact@v2 +{%- endif %} name: Download Build Artifacts with: name: !{{ config["build_name"] }} @@ -63,8 +76,8 @@ PKG_DIR: "${{ runner.temp }}/artifacts" UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} run: | docker run --rm -i \ diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 5f491767c06a..0fcfbf9096b8 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -21,17 +21,22 @@ name: !{{ build_environment }} on: push: + {%- if branches == "nightly" %} # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + {%- endif %} branches: - - nightly + - !{{ branches }} + {%- if branches == "nightly" %} tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + 
{%- endif %} {%- for label in ciflow_config.labels | sort %} - {%- if label != "ciflow/default" %} + {%- if loop.first and branches != "nightly" %} + tags: + {%- endif %} - '!{{ label }}/*' - {%- endif %} {%- endfor %} workflow_dispatch: @@ -54,22 +59,15 @@ env: jobs: {%- for config in build_configs %} !{{ config["build_name"] }}-build: + if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config, True) }} steps: !{{ common.setup_ec2_windows() }} !{{ set_runner_specific_vars() }} - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Populate binary env shell: bash run: | @@ -99,21 +97,13 @@ jobs: steps: !{{ common.setup_ec2_windows() }} !{{ set_runner_specific_vars() }} - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + - uses: seemethere/download-artifact-s3@v3 name: Download Build Artifacts with: name: !{{ config["build_name"] }} path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Populate binary env shell: bash run: | @@ -123,5 +113,7 @@ jobs: run: | "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" !{{ common.wait_and_kill_ssh_windows('pytorch') }} + {%- if branches == "nightly" %} !{{ upload.upload_binaries(config, True) }} + {%- endif %} {%- endfor %} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 deleted file mode 100644 index 21f067101d9c..000000000000 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ /dev/null @@ -1,231 +0,0 @@ -{% import 'common.yml.j2' as common %} - -{%- macro wait_and_kill_ssh() -%} - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 -{%- endmacro -%} - -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: !{{ build_environment }} - -on: -{%- if is_default %} - pull_request: -{%- endif %} - push: -{%- for label in ciflow_config.labels | sort %} - {%- if loop.first %} - tags: - {%- endif %} - {%- if label != "ciflow/default" %} - - '!{{ label }}/*' - {%- endif %} -{%- endfor %} -{%- if not is_scheduled %} - branches: - - master - - release/* -{%- else %} - schedule: - - cron: !{{ is_scheduled }} -{%- endif %} - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: !{{ build_environment }} - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "!{{ cuda_version }}" - IN_CI: 1 - IS_GHA: 1 - 
INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: !{{ common.squid_no_proxy }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} -{%- if build_with_debug %} - DEBUG: 1 -{%- endif %} -{%- if cuda_version != "cpu" %} - TORCH_CUDA_ARCH_LIST: "7.0" -{%- endif %} - USE_CUDA: !{{ 1 if cuda_version != "cpu" else 0 }} - -!{{ common.concurrency(build_environment) }} - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: !{{ common.timeout_minutes }} - env: - JOB_BASE_NAME: !{{ build_environment }}-build - http_proxy: "!{{ common. squid_proxy }}" - https_proxy: "!{{ common.squid_proxy }}" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - !{{ common.checkout() }} - !{{ common.display_ec2_information() }} - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 -{%- if cuda_version != "cpu" %} - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh -{%- endif %} - !{{ common.parse_ref() }} - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: !{{ common.upload_artifact_s3_action }} - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - !{{ common.wait_and_kill_ssh_windows() }} - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - NUM_TEST_SHARDS_ON_PULL_REQUEST: !{{ num_test_shards_on_pull_request }} - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: !{{ enable_force_on_cpu_test }} - RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: !{{ common.timeout_minutes }} - 
env: - JOB_BASE_NAME: !{{ build_environment }}-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "!{{ common.squid_proxy }}" - https_proxy: "!{{ common.squid_proxy }}" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - !{{ common.display_ec2_information() }} - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - !{{ common.checkout() }} - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 -{%- if cuda_version != "cpu" %} - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh -{%- endif %} - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - !{{ common.upload_downloaded_files(name='windows') }} - !{{ common.upload_test_reports(name='windows') }} - !{{ common.render_test_results() }} - !{{ common.wait_and_kill_ssh_windows() }} - !{{ common.parse_ref() }} - !{{ common.upload_test_statistics(build_environment) }} - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/_android-build-test.yml b/.github/workflows/_android-build-test.yml new file mode 100644 index 000000000000..a489d7d7e002 --- /dev/null +++ b/.github/workflows/_android-build-test.yml @@ -0,0 +1,150 @@ +name: android-build-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build-and-test: + # Don't run on forked repos. 
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + xla: ${{ contains(inputs.build-environment, 'xla') }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Output disk space left + run: | + sudo df -H + + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + TORCH_CUDA_ARCH_LIST: 5.2 + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + run: | + set -e + # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: + # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; + # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; + + echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" + time docker pull "${DOCKER_IMAGE}" >/dev/null + + export BUILD_LITE_INTERPRETER + BUILD_LITE_INTERPRETER="1" + if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then + BUILD_LITE_INTERPRETER="0" + fi + + git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 + export id + id=$(docker run -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e PR_LABELS \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e BUILD_LITE_INTERPRETER \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "$(pwd):/var/lib/jenkins/workspace" \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") + + export COMMAND + # shellcheck disable=SC2016 + COMMAND='(echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh" | docker exec -u jenkins -e BUILD_LITE_INTERPRETER -e GRADLE_OFFLINE=1 -i "$id" bash) 2>&1' + echo "${COMMAND}" > ./command.sh && bash ./command.sh + # Skip docker push as this job is purely for size analysis purpose. + # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. 
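The Build step above starts one detached container, bind-mounts the checkout into it, and then drives the gradle build by piping a command string into `docker exec` rather than passing it on the command line. A minimal sketch of that pipe-into-exec pattern, with an assumed public image and build script standing in for the CI-specific ones:

    #!/usr/bin/env bash
    # Sketch only: IMAGE and ./build.sh are placeholders, not the real CI values.
    set -euo pipefail

    IMAGE="ubuntu:20.04"

    # Start a long-lived detached container with the checkout bind-mounted in.
    id=$(docker run --tty --detach \
      -v "$(pwd):/workspace" \
      -w /workspace \
      "${IMAGE}")

    # Feed the build command to a shell inside the container over stdin;
    # 2>&1 folds stderr into the step log, as the workflow does.
    echo "./build.sh" | docker exec -i "${id}" bash 2>&1

    # CI leaves the container running so later steps can `docker cp` results
    # out of it; for a standalone sketch, clean it up instead.
    docker rm -f "${id}"

The full android build workflow later in this diff keeps the container id around in the same way (`ID_X86_32`) so a later step can `docker cp` the assembled artifacts out of it.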
+ + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + ARTIFACTS: "" + ANDROID_BUILD_TYPE: custom-build-single + run: | + # The artifact file is created inside docker container, which contains the result binaries. + # Now unpackage it into the project folder. The subsequent script will scan project folder + # to locate result binaries and report their sizes. + # If artifact file is not provided it assumes that the project folder has been mounted in + # the docker during build and already contains the result binaries, so this step can be skipped. + if [ -n "${ARTIFACTS}" ]; then + tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" + cd "${GITHUB_WORKSPACE}" + fi + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_android-full-build-test.yml b/.github/workflows/_android-full-build-test.yml new file mode 100644 index 000000000000..d0b8845a6620 --- /dev/null +++ b/.github/workflows/_android-full-build-test.yml @@ -0,0 +1,222 @@ +name: android-full-build-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + + secrets: + SONATYPE_NEXUS_USERNAME: + description: nexus user + required: true + SONATYPE_NEXUS_PASSWORD: + description: nexus pass + required: true + ANDROID_SIGN_KEY: + description: android key + required: true + ANDROID_SIGN_PASS: + description: android pass + required: true + SCRIBE_GRAPHQL_ACCESS_TOKEN: + description: token for writing to scribe/scuba + required: true + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build: + # Don't run on forked repos. 
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Output disk space left + shell: bash + run: | + sudo df -H + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build arm-v7a + uses: ./.github/actions/build-android + with: + arch: arm_v7a + arch-for-build-env: arm-v7a + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build arm-v8a + uses: ./.github/actions/build-android + with: + arch: arm_v8a + arch-for-build-env: arm-v8a + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build x86_32 + id: build-x86_32 + uses: ./.github/actions/build-android + with: + arch: x86_32 + arch-for-build-env: x86_32 + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build x86_64 + uses: ./.github/actions/build-android + with: + arch: x86_64 + arch-for-build-env: x86_64 + github-secret: ${{ secrets.GITHUB_TOKEN }} + build-environment: ${{ inputs.build-environment }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + branch: ${{ steps.parse-ref.outputs.branch }} + + - name: Build final artifact + env: + BRANCH: ${{ steps.parse-ref.outputs.branch }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + ID_X86_32: ${{ steps.build-x86_32.outputs.container_id }} + run: | + set -eux + + # Putting everything together + # ID_X86_32 container were created during build-x86_32 step + docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v7a" + docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_64" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_x86_64" + docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v8a" + docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_32" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_x86_32" + + # run 
gradle buildRelease + (echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \ + -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --user jenkins \ + -u jenkins -i "${ID_X86_32}" bash) 2>&1 + + mkdir -p "${GITHUB_WORKSPACE}/build_android_artifacts" + docker cp "${ID_X86_32}:/var/lib/jenkins/workspace/android/artifacts.tgz" "${GITHUB_WORKSPACE}/build_android_artifacts/" + + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + ANDROID_BUILD_TYPE: prebuilt + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + run: | + # The artifact file is created inside docker container, which contains the result binaries. + # Now unpackage it into the project folder. The subsequent script will scan project folder + # to locate result binaries and report their sizes. + # If artifact file is not provided it assumes that the project folder has been mounted in + # the docker during build and already contains the result binaries, so this step can be skipped. + export ARTIFACTS=${GITHUB_WORKSPACE}/build_android_artifacts/artifacts.tgz + if [ -n "${ARTIFACTS}" ]; then + tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" + cd "${GITHUB_WORKSPACE}" + fi + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 + + - name: Publish android snapshot + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }} + env: + SONATYPE_NEXUS_USERNAME: ${{ secrets.SONATYPE_NEXUS_USERNAME }} + SONATYPE_NEXUS_PASSWORD: ${{ secrets.SONATYPE_NEXUS_PASSWORD }} + ANDROID_SIGN_KEY: ${{ secrets.ANDROID_SIGN_KEY }} + ANDROID_SIGN_PASS: ${{ secrets.ANDROID_SIGN_PASS }} + ID_X86_32: ${{ steps.build-x86_32.outputs.container_id }} + run: | + set -eux + (echo "./.circleci/scripts/publish_android_snapshot.sh" | docker exec \ + -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-publish-snapshot" \ + -e SONATYPE_NEXUS_USERNAME \ + -e SONATYPE_NEXUS_PASSWORD \ + -e ANDROID_SIGN_KEY \ + -e ANDROID_SIGN_PASS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + -u jenkins -i "${ID_X86_32}" bash) 2>&1 + + - name: Store PyTorch Android Build Artifacts on S3 + uses: seemethere/upload-artifact-s3@v4 + with: + name: ${{ inputs.build-environment }} + retention-days: 14 + if-no-files-found: error + path: build_android_artifacts/artifacts.tgz + + - name: Chown workspace + uses: 
./.github/actions/chown-workspace + if: always() + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml new file mode 100644 index 000000000000..0b782aa9708b --- /dev/null +++ b/.github/workflows/_bazel-build-test.yml @@ -0,0 +1,185 @@ +name: bazel + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build-and-test: + # Don't run on forked repos. + if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Output disk space left + run: | + sudo df -H + + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + TORCH_CUDA_ARCH_LIST: 5.2 + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e PR_LABELS \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . 
&& sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh' + + # !{{ common_android.upload_android_binary_size("", "")}} + - name: Test + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + env: + JOB_BASE_NAME: ${{ inputs.build-environment }}-build-and-test + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + PYTORCH_RETRY_TEST_CASES: 1 + PR_BODY: ${{ github.event.pull_request.body }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + export SHARD_NUMBER=0 + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + # TODO: Stop building test binaries as part of the build phase + # Make sure we copy test results from bazel-testlogs symlink to + # a regular directory ./test/test-reports + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e GIT_DEFAULT_BRANCH="$GIT_DEFAULT_BRANCH" \ + -e IN_CI \ + -e SHARD_NUMBER \ + -e NUM_TEST_SHARDS \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e PR_LABELS \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="1g" \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . 
&& sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports' + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml new file mode 100644 index 000000000000..96ed63cbb0f6 --- /dev/null +++ b/.github/workflows/_docs.yml @@ -0,0 +1,132 @@ +name: build docs + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image: + required: true + type: string + description: Docker image to run in. + push: + required: false + type: boolean + default: false + description: If set, push the docs to the docs website. + + secrets: + GH_PYTORCHBOT_TOKEN: + required: false + description: Permissions for pushing to the docs site. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +jobs: + build-docs: + # Don't run on forked repos. 
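+ # Illustrative only: this workflow expects to run after a build job that produced the
+ # wheel artifact and the docker image. A caller might chain the two reusable workflows
+ # roughly as below; the job names and build environment are hypothetical, while
+ # `docker-image` matches the output exposed by _linux-build.yml.
+ #
+ #   jobs:
+ #     build:
+ #       uses: ./.github/workflows/_linux-build.yml
+ #       with:
+ #         build-environment: linux-xenial-py3.7-gcc5.4
+ #         docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4
+ #     docs:
+ #       needs: build
+ #       uses: ./.github/workflows/_docs.yml
+ #       with:
+ #         build-environment: linux-xenial-py3.7-gcc5.4
+ #         docker-image: ${{ needs.build.outputs.docker-image }}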
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + strategy: + matrix: + docs_type: [cpp, python] + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ inputs.docker-image }} + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + + - name: Generate netrc (only for docs-push) + if: inputs.push + env: + GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} + run: | + # set credentials for https pushing + echo "machine github.com" > "${RUNNER_TEMP}/.netrc" + echo "login pytorchbot" >> "${RUNNER_TEMP}/.netrc" + echo "password ${GITHUB_PYTORCHBOT_TOKEN}" >> "${RUNNER_TEMP}/.netrc" + + - name: Build ${{ matrix.docs_type }} docs + env: + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} + DOCKER_IMAGE: ${{ inputs.docker-image }} + DOCS_TYPE: ${{ matrix.docs_type }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + run: | + set -ex + # Convert refs/tags/v1.12.0rc3 into 1.12 + if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then + target="${BASH_REMATCH[1]}" + else + target="master" + fi + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e IN_CI \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SHA1="$GITHUB_SHA" \ + -e DOCS_VERSION="${target}" \ + -e DOCS_TYPE \ + -e PR_LABELS \ + -e WITH_PUSH \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${RUNNER_TEMP}/.netrc":/var/lib/jenkins/.netrc \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "sudo chown -R jenkins . 
&& pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Upload Python Docs Preview + uses: seemethere/upload-artifact-s3@v4 + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} + with: + retention-days: 14 + s3-bucket: doc-previews + if-no-files-found: error + path: pytorch.github.io/docs/master/ + s3-prefix: pytorch/${{ github.event.pull_request.number }} + + - name: Upload C++ Docs Preview + uses: seemethere/upload-artifact-s3@v4 + if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} + with: + retention-days: 14 + if-no-files-found: error + s3-bucket: doc-previews + path: cppdocs/ + s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs diff --git a/.github/workflows/_ios-build-test.yml b/.github/workflows/_ios-build-test.yml new file mode 100644 index 000000000000..fa3b7e2836f8 --- /dev/null +++ b/.github/workflows/_ios-build-test.yml @@ -0,0 +1,187 @@ +name: ios-build-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + ios-platform: + required: true + type: string + description: Which iOS platform to build for. + ios-arch: + required: true + type: string + description: Which iOS arch to build for. + + secrets: + IOS_CERT_KEY_2022: + required: true + description: ios cert + IOS_CERT_SECRET: + required: true + description: ios cert + IOS_DEV_TEAM_ID: + required: true + description: ios cert + IOS_SIGN_KEY_2022: + required: true + description: ios cert + +env: + IN_CI: 1 + IS_GHA: 1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + IOS_PLATFORM: ${{ inputs.ios-platform }} + IOS_ARCH: ${{ inputs.ios-arch }} + +jobs: + build: + # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations + # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test + if: github.repository_owner == 'pytorch' + runs-on: macos-10.15 + timeout-minutes: 240 + env: + JOB_BASE_NAME: ${{ inputs.build-environment }}-build + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Populate CI build options + run: | + # Most builds use the lite interpreter, if certain builds shouldn't + # build the lite interpreter this env variable should get over-written + # in the following case statement + echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" + + case ${BUILD_ENVIRONMENT} in + *metal*) + echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" + ;; + *full_jit*) + echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" + ;; + *custom*) + echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" + ;; + *coreml*) + echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" + ;; + esac + + - name: Install brew dependencies + run: | + # Install dependencies + brew install libtool + + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with 
the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + conda install -y \ + cffi \ + cmake \ + mkl \ + mkl-include \ + ninja \ + numpy \ + pyyaml \ + requests \ + setuptools \ + typing_extensions + + - name: Run Fastlane + run: | + set -x + cd ios/TestApp + # install fastlane + sudo gem install bundler && bundle install + # install certificates + echo "${IOS_CERT_KEY_2022}" >> cert.txt + base64 --decode cert.txt -o Certificates.p12 + rm cert.txt + bundle exec fastlane install_root_cert + bundle exec fastlane install_dev_cert + # install the provisioning profile + PROFILE=PyTorch_CI_2022.mobileprovision + PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles + mkdir -pv "${PROVISIONING_PROFILES}" + cd "${PROVISIONING_PROFILES}" + echo "${IOS_SIGN_KEY_2022}" >> cert.txt + base64 --decode cert.txt -o ${PROFILE} + rm cert.txt + + - name: Build + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + export TCLLIBPATH="/usr/local/lib" + python -VV + export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} + scripts/build_ios.sh + + - name: Run Build Test + run: | + PROFILE=PyTorch_CI_2022 + # run the ruby build script + if ! [ -x "$(command -v xcodebuild)" ]; then + echo 'Error: xcodebuild is not installed.' + exit 1 + fi + if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then + ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" + else + ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" + fi + + - name: Run Simulator Tests + if: inputs.ios-platform == 'SIMULATOR' + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html + # generate models for different backends + cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" + mkdir -p ../models + if [ "${USE_COREML_DELEGATE}" == 1 ]; then + pip install coremltools==5.0b5 + pip install six==1.16.0 + python coreml_backend.py + else + cd "${GITHUB_WORKSPACE}" + python test/mobile/model_test/gen_test_model.py ios-test + fi + cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" + if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then + echo "Setting up the TestApp for LiteInterpreter" + ruby setup.rb --lite 1 + else + echo "Setting up the TestApp for Full JIT" + ruby setup.rb + fi + cd "${GITHUB_WORKSPACE}/ios/TestApp" + instruments -s -devices + if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then + if [ "${USE_COREML_DELEGATE}" == 1 ]; then + fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML + else + fastlane scan --skip_testing TestAppTests/TestAppTests/testCoreML + fi + else + fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT + fi diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml new file mode 100644 index 000000000000..cf6419f208e2 --- /dev/null +++ b/.github/workflows/_linux-build.yml @@ -0,0 +1,158 @@ +name: linux-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for
what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + type: boolean + default: true + description: If set, upload generated build artifacts. + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. + + outputs: + docker-image: + value: ${{ jobs.build.outputs.docker-image }} + description: The docker image containing the built PyTorch. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +jobs: + build: + # Don't run on forked repos. + if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, linux.2xlarge] + timeout-minutes: 240 + outputs: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + steps: + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Check for new workflows + run: | + if [ ! -f "./.github/actions/setup-linux/action.yml" ]; then + echo "::error::Your PR is based on a version of master that is too old for our CI to work. Please rebase your PR on latest master and resubmit." + exit 1 + fi + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate docker image + id: calculate-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ inputs.docker-image-name }} + xla: ${{ contains(inputs.build-environment, 'xla') }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-build + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + TORCH_CUDA_ARCH_LIST: 5.2 + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ 
+ --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c '.jenkins/pytorch/build.sh' + + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + BRANCH: ${{ steps.parse-ref.outputs.branch }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + + - name: Store PyTorch Build Artifacts on S3 + uses: seemethere/upload-artifact-s3@v4 + if: inputs.build-generates-artifacts + with: + name: ${{ inputs.build-environment }} + retention-days: 14 + if-no-files-found: error + path: artifacts.zip + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml new file mode 100644 index 000000000000..37ea69e531da --- /dev/null +++ b/.github/workflows/_linux-test.yml @@ -0,0 +1,194 @@ +name: linux-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos. 
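+ # Illustrative only: `test-matrix` arrives as a JSON string and is expanded with
+ # fromJSON into the job matrix below, so its keys must match what the steps read
+ # (config, shard, num_shards, runner). A caller might pass something like the
+ # following; the config name, shard counts, and runner labels are hypothetical.
+ #
+ #   test-matrix: |
+ #     { "include": [
+ #       { "config": "default", "shard": 1, "num_shards": 2, "runner": "linux.2xlarge" },
+ #       { "config": "default", "shard": 2, "num_shards": 2, "runner": "linux.2xlarge" }
+ #     ]}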
+ if: github.repository_owner == 'pytorch' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ inputs.docker-image }} + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + PYTORCH_RETRY_TEST_CASES: 1 + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PR_BODY: ${{ github.event.pull_request.body }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} + DOCKER_IMAGE: ${{ inputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e COMMIT_MESSAGES \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w 
/var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "pip install dist/*.whl && ${TEST_COMMAND}" + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Store Core dumps on S3 + uses: seemethere/upload-artifact-s3@v4 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml new file mode 100644 index 000000000000..c5a93c7c32f4 --- /dev/null +++ b/.github/workflows/_mac-build.yml @@ -0,0 +1,103 @@ +name: mac-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + runner-type: + required: true + type: string + description: Name of the GitHub-managed runner type to use for the build. + build-generates-artifacts: + required: true + type: boolean + description: If set, upload generated build artifacts. + xcode-version: + required: false + type: string + default: "" + description: What xcode version to build with. + + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: + required: true + description: Access key for S3 bucket for macOS sccache. + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: + required: true + description: Secret for S3 bucket for macOS sccache. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 +defaults: + run: + shell: bash -e -l {0} + +jobs: + build: + # Don't run on forked repos. 
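+ # Illustrative only: a caller would invoke this reusable workflow roughly as below.
+ # The runner label, build environment, and xcode version are hypothetical examples;
+ # the input and secret names come from the workflow_call interface above.
+ #
+ #   macos-build:
+ #     uses: ./.github/workflows/_mac-build.yml
+ #     with:
+ #       build-environment: macos-11-py3-x86-64
+ #       runner-type: macos-12
+ #       build-generates-artifacts: true
+ #       xcode-version: "13.3.1"
+ #     secrets:
+ #       MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
+ #       MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}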
+ if: github.repository_owner == 'pytorch' + runs-on: ${{ inputs.runner-type }} + env: + JOB_BASE_NAME: ${{ inputs.build-environment }} + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + COMPACT_JOB_NAME: ${{ inputs.build-environment }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Set xcode version + env: + XCODE_VERSION: ${{ inputs.xcode-version }} + run: | + if [ -n "${XCODE_VERSION}" ]; then + echo "DEVELOPER_DIR=/Applications/Xcode_${XCODE_VERSION}.app/Contents/Developer" >> "${GITHUB_ENV}" + fi + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + miniconda-version: 4.7.12 + + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + + - name: Build + run: | + echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" + .jenkins/pytorch/macos-build.sh + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts + run: | + zip -1 -r artifacts.zip dist/ + + - name: Store PyTorch Build Artifacts on GHA + uses: actions/upload-artifact@v2 + if: inputs.build-generates-artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: artifacts.zip diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml new file mode 100644 index 000000000000..e0d11034e0d4 --- /dev/null +++ b/.github/workflows/_mac-test.yml @@ -0,0 +1,123 @@ +name: mac-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: + required: true + description: access key id for test stats upload + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: + required: true + description: secret access key for test stats upload + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + +# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 +defaults: + run: + shell: bash -e -l {0} + +jobs: + test: + # Don't run on forked repos.
+ if: github.repository_owner == 'pytorch' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + timeout-minutes: 240 + env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + COMPACT_JOB_NAME: ${{ inputs.build-environment }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PR_BODY: ${{ github.event.pull_request.body }} + PYTORCH_RETRY_TEST_CASES: 1 + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + use-gha: true + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + miniconda-version: 4.7.12 + + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Test + run: | + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + python3 -mpip install dist/*.whl + .jenkins/pytorch/macos-test.sh + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + use-gha: true + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml new file mode 100644 index 000000000000..894938fb7d5a --- /dev/null +++ b/.github/workflows/_rocm-test.yml @@ -0,0 +1,192 @@ +# TODO: this looks sort of similar to _linux-test, but there are like a dozen +# places where you would have to insert an if statement. Probably it's better to +# just use a different workflow altogether + +name: test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. 
+ test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: + required: true + description: access key id for test stats upload + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: + required: true + description: secret access key for test stats upload + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos. + if: github.repository_owner == 'pytorch' + timeout-minutes: 300 + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + no-sudo: true + + - name: Setup ROCm + uses: ./.github/actions/setup-rocm + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ inputs.docker-image }} + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + PYTORCH_RETRY_TEST_CASES: 1 + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PR_BODY: ${{ github.event.pull_request.body }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + DOCKER_IMAGE: ${{ inputs.docker-image }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + timeout-minutes: 270 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e COMMIT_MESSAGES \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="8g" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \
+ "${DOCKER_IMAGE}" + ) + # save container name for later step + echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "cd .. && cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" + + - name: Save test results + if: always() + run: | + # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + use-gha: true + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown ROCm + uses: ./.github/actions/teardown-rocm + if: always() diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml new file mode 100644 index 000000000000..abd7aca07f7a --- /dev/null +++ b/.github/workflows/_win-build.yml @@ -0,0 +1,94 @@ +name: windows-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + cuda-version: + required: true + type: string + description: What CUDA version to build with, "cpu" for none. + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + build: + # Don't run on forked repos. 
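+ # Illustrative only: a caller would invoke this reusable workflow roughly as below.
+ # The job name and build environment are hypothetical; per the input description
+ # above, cuda-version is the CUDA version to build with, or "cpu" for a CPU-only build.
+ #
+ #   win-vs2019-cpu-py3-build:
+ #     uses: ./.github/workflows/_win-build.yml
+ #     with:
+ #       build-environment: win-vs2019-cpu-py3
+ #       cuda-version: cpu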
+ if: github.repository_owner == 'pytorch' + runs-on: [self-hosted, windows.4xlarge] + timeout-minutes: 240 + env: + JOB_BASE_NAME: ${{ inputs.build-environment }}-build + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + no-sudo: true + + - name: Setup Windows + uses: ./.github/actions/setup-win + with: + cuda-version: ${{ inputs.cuda-version }} + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Build + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + BRANCH: ${{ steps.parse-ref.outputs.branch }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BUILD_WHEEL: 1 + MAX_JOBS: 8 + CUDA_VERSION: ${{ inputs.cuda-version }} + PYTHON_VERSION: "3.8" + PYTORCH_RETRY_TEST_CASES: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + SCCACHE_BUCKET: "ossci-compiler-cache" + VC_PRODUCT: "BuildTools" + VC_VERSION: "" + VC_YEAR: "2019" + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} + TORCH_CUDA_ARCH_LIST: "7.0" + USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} + run: | + .jenkins/pytorch/win-build.sh + + # Upload to github so that people can click and download artifacts + - name: Upload artifacts to s3 + uses: seemethere/upload-artifact-s3@v4 + with: + retention-days: 14 + if-no-files-found: error + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + + - name: Teardown Windows + uses: ./.github/actions/teardown-win + if: always() + timeout-minutes: 120 + with: + extra-delete-dir: /c/${{ github.run_id }}/build-results/ diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml new file mode 100644 index 000000000000..07f66b36ee7a --- /dev/null +++ b/.github/workflows/_win-test.yml @@ -0,0 +1,134 @@ +name: win-test + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + cuda-version: + required: true + type: string + description: What CUDA version to build with, "cpu" for none. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + +env: + IN_CI: 1 # TODO delete in favor of GITHUB_ACTIONS + IS_GHA: 1 # TODO delete in favor of GITHUB_ACTIONS + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos. 
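+ # Note on the `inputs.cuda-version != 'cpu' && '1' || '0'` expression used for
+ # USE_CUDA below (and for USE_CUDA and DEBUG in the build workflow): GitHub Actions
+ # expressions have no conditional operator, so `cond && a || b` is the usual
+ # work-around; it yields `a` when the condition is true and `b` otherwise, which is
+ # safe here because '1' is a truthy value.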
+ if: github.repository_owner == 'pytorch' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + timeout-minutes: 300 + steps: + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + no-sudo: true + + - name: Setup Windows + uses: ./.github/actions/setup-win + with: + cuda-version: ${{ inputs.cuda-version }} + + - name: Setup SSH (Click me for login details) + uses: ./.github/actions/setup-ssh + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Download PyTorch Build Artifacts + uses: seemethere/download-artifact-s3@v3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + + - name: Test + shell: bash + env: + USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} + INSTALL_WINDOWS_SDK: 1 + PYTHON_VERSION: 3.8 + PYTORCH_RETRY_TEST_CASES: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + VC_PRODUCT: "BuildTools" + VC_VERSION: "" + VS_VERSION: "16.8.6" + VC_YEAR: "2019" + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + CUDA_VERSION: ${{ inputs.cuda-version }} + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: ${{ matrix.config }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + PR_BODY: ${{ github.event.pull_request.body }} + TORCH_CUDA_ARCH_LIST: "7.0" + run: | + COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}") + export COMMIT_MESSAGES + .jenkins/pytorch/win-test.sh + + - name: Get workflow job id + id: get-job-id + uses: pytorch/pytorch/.github/actions/get-workflow-job-id@master + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() + with: + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Upload test statistics + if: always() + env: + AWS_DEFAULT_REGION: us-east-1 + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: ${{ inputs.build-environment }}-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: ${{ github.run_id }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + shell: bash + run: | + set -x + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + + - name: Teardown Windows + uses: ./.github/actions/teardown-win + if: always() + timeout-minutes: 120 diff --git a/.github/workflows/buck_build_test.yml 
b/.github/workflows/buck_build_test.yml new file mode 100644 index 000000000000..3104a9982895 --- /dev/null +++ b/.github/workflows/buck_build_test.yml @@ -0,0 +1,116 @@ +name: buck + +on: + push: + tags: + # Trigger on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/trunk/*' + branches: + - master + - main + - release/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +defaults: + run: + shell: bash -e -l {0} + +jobs: + + buck-build-test: + runs-on: ubuntu-latest + env: + JOB_BASE_NAME: ubuntu-latest-buck + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'temurin' + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + + - name: Install dependencies + run: | + conda install -y \ + cffi \ + cmake \ + mkl \ + mkl-include \ + ninja \ + numpy \ + pyyaml \ + requests \ + setuptools \ + typing_extensions + + - name: Install Buck + run: | + wget https://github.com/facebook/buck/releases/download/v2021.01.12.01/buck.2021.01.12.01_all.deb + sudo apt install ./buck.2021.01.12.01_all.deb + + - name: Download third party libraries and generate wrappers + run: | + sh scripts/buck_setup.sh + + - name: Build glog + run: | + buck build third_party:glog + + - name: Build C10 + run: | + buck build c10:c10 + + - name: Build cpuinfo + run: | + buck build third_party:cpuinfo + + - name: Build pthreadpool + run: | + buck build third_party:pthreadpool + + - name: Build XNNPACK + run: | + buck build third_party:XNNPACK + + - name: Build QNNPACK + run: | + buck build aten/src/ATen/native/quantized/cpu/qnnpack/... --keep-going + + - name: Build aten_cpu + run: | + buck build :aten_cpu + + - name: Build torch_mobile_core + run: | + buck build :torch_mobile_core + + - name: Build torch_mobile_all_ops + run: | + buck build :torch_mobile_all_ops + + - name: Build mobile benchmark + run: | + buck build :ptmobile_benchmark + + - name: Run lite interpreter model + run: | + buck run :ptmobile_benchmark -- --model=ios/TestApp/models/mobilenet_v2.ptl --input_dims=1,3,224,224 --input_type=float + + - name: Build everything + run: | + buck build //... --keep-going diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index f5432f0b40c9..605aa8b05b49 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -3,7 +3,10 @@ name: Create Release on: push: tags: ['v*'] - branches: [master] + branches: + - master + - main + - nightly release: types: [published] pull_request: @@ -18,6 +21,7 @@ jobs: - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - name: Fake name for PRs if: ${{ github.event_name == 'pull_request' }} run: echo "PT_GITHUB_REF=refs/tags/pr-tag" >> "$GITHUB_ENV" @@ -37,7 +41,7 @@ jobs: cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" mv "/tmp/$PT_RELEASE_NAME" . 
# Cleanup - rm -r "$PT_RELEASE_NAME"/{.azure_pipelines,.circleci,.jenkins} + rm -rf "$PT_RELEASE_NAME"/{.circleci,.jenkins} find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true # Create archive tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" @@ -49,5 +53,5 @@ jobs: files: ${{env.PT_RELEASE_FILE}} concurrency: - group: create-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml new file mode 100644 index 000000000000..8cfca9514b11 --- /dev/null +++ b/.github/workflows/docker-builds.yml @@ -0,0 +1,78 @@ +name: docker-builds + +on: + workflow_dispatch: + pull_request: + paths: + - .circleci/docker/** + - .github/workflows/docker-builds.yml + schedule: + - cron: 1 3 * * 3 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +env: + ALPINE_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine + AWS_DEFAULT_REGION: us-east-1 + +jobs: + docker-build: + runs-on: [self-hosted, linux.2xlarge] + timeout-minutes: 240 + strategy: + matrix: + include: + - docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 + - docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9 + - docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 + - docker-image-name: pytorch-linux-bionic-py3.7-clang9 + - docker-image-name: pytorch-linux-bionic-rocm5.0-py3.7 + - docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + - docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 + - docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - docker-image-name: pytorch-linux-xenial-py3-clang5-asan + - docker-image-name: pytorch-linux-xenial-py3-clang7-asan + - docker-image-name: pytorch-linux-xenial-py3-clang7-onnx + - docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + - docker-image-name: pytorch-linux-xenial-py3.7-gcc7 + - docker-image-name: pytorch-linux-focal-py3.7-gcc7 + env: + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }} + steps: + - name: Clean workspace + shell: bash + run: | + echo "${GITHUB_WORKSPACE}" + sudo rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + + # [see note: pytorch repo ref] + # deep clone (fetch-depth 0) required for git merge-base + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Build docker image + id: build-docker-image + uses: ./.github/actions/calculate-docker-image + with: + docker-image-name: ${{ matrix.docker-image-name }} + always-rebuild: true + + - name: Pull docker image + uses: ./.github/actions/pull-docker-image + with: + docker-image: ${{ steps.build-docker-image.outputs.docker-image }} + + - name: Chown workspace + uses: ./.github/actions/chown-workspace + if: always() + + - name: Teardown Linux + uses: ./.github/actions/teardown-linux + if: always() diff --git a/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml deleted file mode 100644 index 
c1932cbf09e8..000000000000 --- a/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml +++ /dev/null @@ -1,248 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: caffe2-linux-xenial-py3.7-gcc5.4 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: caffe2-linux-xenial-py3.7-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: caffe2-linux-xenial-py3.7-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: caffe2-linux-xenial-py3.7-gcc5.4-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
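The generated workflow being deleted here spells out the ECR login, retry helper, and workspace chown inline in every job; the new docker-builds.yml added earlier in this diff pushes those duties into reusable composite actions and wires them together through step outputs. A condensed sketch of that chain follows — only the inputs and outputs visible in this diff are shown, and anything beyond them is an assumption.

```yaml
# Sketch of the composite-action chain from the new docker-builds.yml;
# inputs/outputs are the ones visible in this diff, everything else is assumed.
jobs:
  docker-build:
    runs-on: [self-hosted, linux.2xlarge]
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
      - name: Setup Linux                     # presumably handles ECR login etc. (not shown in this diff)
        uses: ./.github/actions/setup-linux
      - name: Build docker image
        id: build-docker-image
        uses: ./.github/actions/calculate-docker-image
        with:
          docker-image-name: pytorch-linux-bionic-py3.7-clang9
          always-rebuild: true
      - name: Pull docker image
        uses: ./.github/actions/pull-docker-image
        with:
          docker-image: ${{ steps.build-docker-image.outputs.docker-image }}
      - name: Teardown Linux
        uses: ./.github/actions/teardown-linux
        if: always()
```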
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
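The Build step above wraps .jenkins/pytorch/build.sh in a detached container with a long list of `-e` passthroughs. Stripped to its essentials — most environment passthroughs and the proxy settings omitted — the pattern the generated workflows relied on is:

```yaml
# Condensed sketch of the detached-container build used by these generated
# workflows; most -e env passthroughs are omitted here for brevity.
- name: Build
  run: |
    container_name=$(docker run \
      -e BUILD_ENVIRONMENT \
      -e MAX_JOBS="$(nproc --ignore=2)" \
      --detach --tty --user jenkins \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      "${DOCKER_IMAGE}")
    docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh'
```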
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-docker-builds.yml b/.github/workflows/generated-docker-builds.yml deleted file mode 100644 index 785c65d45b9b..000000000000 --- a/.github/workflows/generated-docker-builds.yml +++ /dev/null @@ -1,175 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/docker_builds_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: docker-builds - -on: - workflow_dispatch: - pull_request: - types: [opened, synchronize, reopened] - paths: - - '.circleci/docker/**' - - '.github/workflows/generated-docker-builds.yml' - schedule: - - cron: 1 3 * * 3 -concurrency: - group: docker-builds-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -env: - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - -jobs: - - docker-build: - runs-on: linux.2xlarge - timeout-minutes: 240 - strategy: - matrix: - include: - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9' - docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7' - docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7' - docker_image_short_name: 'pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9' - docker_image_short_name: 'pytorch-linux-bionic-py3.7-clang9' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.7' - docker_image_short_name: 'pytorch-linux-bionic-rocm4.3.1-py3.7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.5-py3.7' - docker_image_short_name: 'pytorch-linux-bionic-rocm4.5-py3.7' - - docker_image_base: 
'308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang5-android-ndk-r19c' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang5-asan' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-asan' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang7-asan' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-onnx' - docker_image_short_name: 'pytorch-linux-xenial-py3-clang7-onnx' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4' - docker_image_short_name: 'pytorch-linux-xenial-py3.7-gcc5.4' - - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc7' - docker_image_short_name: 'pytorch-linux-xenial-py3.7-gcc7' - env: - DOCKER_IMAGE_BASE: '${{ matrix.docker_image_base }}' - name: docker-build (${{ matrix.docker_image_short_name }}) - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
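The matrix being deleted here pairs each full ECR path with a short name; the replacement docker-builds.yml added earlier in this diff lists only the short docker-image-name and derives the ECR path once at the job level. A rough before/after sketch, trimmed to a single entry for illustration:

```yaml
# Before: generated-docker-builds.yml carried both the full ECR path and a short name:
#   - docker_image_base: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9
#     docker_image_short_name: pytorch-linux-bionic-py3.7-clang9
# After: docker-builds.yml lists only the short name and derives the base once.
jobs:
  docker-build:
    runs-on: [self-hosted, linux.2xlarge]
    strategy:
      matrix:
        include:
          - docker-image-name: pytorch-linux-bionic-py3.7-clang9   # one entry shown; the diff lists fifteen
    env:
      DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
    steps:
      - run: echo "building ${DOCKER_IMAGE_BASE}"
```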
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml deleted file mode 100644 index 6995b22347e1..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-coreml - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-coreml - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-coreml-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # 
shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' - exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-coreml-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml deleted file mode 100644 index 0fd77eef8605..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-custom-ops - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-custom-ops - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-custom-ops-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: 
Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-custom-ops-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml deleted file mode 100644 index 876e1e811f1b..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-full-jit - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-full-jit - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-full-jit-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source 
"${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' - exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml deleted file mode 100644 index 065f311e90f9..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-metal - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-metal - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-metal-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from 
the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-metal-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64.yml b/.github/workflows/generated-ios-12-5-1-arm64.yml deleted file mode 100644 index 2de63df26293..000000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64.yml +++ /dev/null @@ -1,143 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64 - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y 
\ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' - exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml deleted file mode 100644 index 4306711a6210..000000000000 --- a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml +++ /dev/null @@ -1,176 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-x86-64-coreml - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-x86-64-coreml - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: SIMULATOR - IOS_ARCH: x86_64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-x86-64-coreml-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - 
name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for differnet backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi - -concurrency: - group: ios-12-5-1-x86-64-coreml-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml deleted file mode 100644 index 18553b414499..000000000000 --- a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml +++ /dev/null @@ -1,176 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-x86-64-full-jit - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-x86-64-full-jit - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: SIMULATOR - IOS_ARCH: x86_64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-x86-64-full-jit-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if 
certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for differnet backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi - -concurrency: - group: ios-12-5-1-x86-64-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-x86-64.yml b/.github/workflows/generated-ios-12-5-1-x86-64.yml deleted file mode 100644 index 0a92814866ab..000000000000 --- a/.github/workflows/generated-ios-12-5-1-x86-64.yml +++ /dev/null @@ -1,176 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-x86-64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-x86-64 - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: SIMULATOR - IOS_ARCH: x86_64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-x86-64-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite 
interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for different backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi - -concurrency: - group: ios-12-5-1-x86-64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml deleted file mode 100644 index fc55ce8dc285..000000000000 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_operator, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: libtorch-linux-xenial-cuda10.2-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on:
linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: libtorch-linux-xenial-cuda10.2-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
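A note on the image-selection logic above: the "Calculate docker image tag" and "Check if image should be built" steps key the CI image tag off the git tree hash of .circleci/docker, so an image rebuild is triggered only when that directory changes. A minimal sketch of the same idea, runnable in any PyTorch checkout; the registry path is a placeholder, not the real ECR repository:

    # The tag is the tree-object hash of the Docker build context, so it only
    # changes when something under .circleci/docker changes.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    DOCKER_IMAGE="registry.example.com/pytorch-ci:${DOCKER_TAG}"   # placeholder registry
    # Rebuild only if no image has been published for this tag yet.
    if ! docker manifest inspect "${DOCKER_IMAGE}" >/dev/null 2>&1; then
        echo "no image for ${DOCKER_TAG}; a rebuild would be triggered here"
    fi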
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml deleted file mode 100644 index 452c20076104..000000000000 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: libtorch-linux-xenial-cuda11.3-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: libtorch-linux-xenial-cuda11.3-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - 
echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-conda-nightly.yml b/.github/workflows/generated-linux-binary-conda-nightly.yml new file mode 100644 index 000000000000..2a057f2a3fe8 --- /dev/null +++ b/.github/workflows/generated-linux-binary-conda-nightly.yml @@ -0,0 +1,5594 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-conda + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-conda + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
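The inline retry helper used throughout these jobs simply re-runs the command after 1s and 2s pauses. A hedged sketch of an equivalent standalone helper with a configurable attempt count; the name retry_n and the linear backoff are illustrative, not part of the generated workflows:

    # Re-run a command up to $1 times, sleeping a little longer after each failure.
    retry_n () {
        local attempts=$1; shift
        local i
        for ((i = 1; i <= attempts; i++)); do
            "$@" && return 0
            sleep "$i"
        done
        return 1
    }
    # Usage, mirroring the workflow: retry_n 3 docker pull "${ALPINE_IMAGE}"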
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + 
working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + 
if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
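The upload jobs above gate real uploads on the pushed ref: nightly-branch pushes and release-candidate tags disable DRY_RUN, and an -rcN suffix additionally routes the packages to the test channel. A simplified sketch of that classification; it folds the tag cases into the -rcN pattern (the trigger comment says only release-candidate tags reach these pipelines), ignores the ciflow/ exclusion, and the nightly default channel is an assumption about the downstream upload script:

    ref_name=${1:-nightly}            # e.g. nightly, v1.11.0-rc1
    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly            # assumed default when the workflow leaves it unset
    if [[ ${ref_name} == nightly || ${ref_name} == *-rc[0-9]* ]]; then
        DRY_RUN=disabled              # real upload for nightlies and release candidates
    fi
    if [[ ${ref_name} == *-rc[0-9]* ]]; then
        UPLOAD_CHANNEL=test           # release candidates go to the test channel
    fi
    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"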
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
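The "Set BUILD_SPLIT_CUDA" step in the CUDA 11.x build jobs relies on the standard GitHub Actions mechanism for passing values between steps: any KEY=value line appended to the file named by $GITHUB_ENV becomes an environment variable for every later step in the same job. A minimal sketch of that mechanism; how the builder scripts consume the flag downstream is not shown in this diff:

    # In one step: export the flag to the rest of the job.
    echo "BUILD_SPLIT_CUDA=ON" >> "$GITHUB_ENV"
    # In a later step of the same job it is a normal environment variable.
    if [ "${BUILD_SPLIT_CUDA}" = "ON" ]; then
        echo "building split CUDA libraries"
    fi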
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + 
-v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash 
-c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash 
-c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash 
-c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + 
working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + 
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  conda-py3_9-cuda10_2-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: linux.4xlarge
+    timeout-minutes: 240
+    env:
+      PACKAGE_TYPE: conda
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu102
+      GPU_ARCH_VERSION: 10.2
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/conda-builder:cuda10.2
+      SKIP_ALL_TESTS: 1
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+      - name: Chown workspace
+        run: |
+          retry () {
+              "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Clean workspace
+        run: |
+          rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: seemethere/add-github-ssh-key@v1
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Checkout PyTorch
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Checkout pytorch/builder
+        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
+        with:
+          ref: main
+          submodules: recursive
+          repository: pytorch/builder
+          path: builder
+      - name: Clean pytorch/builder checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: builder
+      - name: Pull Docker image
+        run: |
+          retry () {
+              "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${DOCKER_IMAGE}"
+      - name: Build PyTorch binary
+        run: |
+          set -x
+          mkdir -p artifacts/
+          container_name=$(docker run \
+            -e BINARY_ENV_FILE \
+            -e BUILDER_ROOT \
+            -e BUILD_ENVIRONMENT \
+            -e BUILD_SPLIT_CUDA \
+            -e DESIRED_CUDA \
+            -e DESIRED_DEVTOOLSET \
+            -e DESIRED_PYTHON \
+            -e GPU_ARCH_TYPE \
+            -e GPU_ARCH_VERSION \
+            -e IS_GHA \
+            -e LIBTORCH_VARIANT \
+            -e PACKAGE_TYPE \
+            -e PYTORCH_FINAL_PACKAGE_DIR \
+            -e PYTORCH_ROOT \
+            -e SKIP_ALL_TESTS \
+            --tty \
+            --detach \
+            -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
+            -v "${GITHUB_WORKSPACE}/builder:/builder" \
+            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
+            -w / \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
+          docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh"
+      - name: Chown artifacts
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - uses: seemethere/upload-artifact-s3@v4
+        with:
+          name: conda-py3_9-cuda10_2
+          retention-days: 14
+          if-no-files-found: error
+          path:
+            ${{ runner.temp }}/artifacts/*
+      - name: Hold runner for 2 hours or until ssh sessions have drained
+        working-directory: pytorch/
+        # Always hold for active ssh sessions
+        if: always()
+        run: .github/scripts/wait_for_ssh_to_drain.sh
+      - name: Chown workspace
+        if: always()
+        run: |
+          # Ensure the working directory gets chowned back to the current user
+          docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
+  conda-py3_9-cuda10_2-test: # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: conda-py3_9-cuda10_2-build
+    runs-on: linux.4xlarge.nvidia.gpu
+    timeout-minutes: 240
+    env:
+      PACKAGE_TYPE: conda
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu102
+      GPU_ARCH_VERSION: 10.2
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/conda-builder:cuda10.2
+      SKIP_ALL_TESTS: 1
+      DESIRED_PYTHON: "3.9"
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+      - name: Chown workspace
+        run: |
+          retry () {
+              "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+          }
+          retry docker pull "${ALPINE_IMAGE}"
+          # Ensure the working directory gets chowned back to the current user
+          docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" 
\ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" 
bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda10_2-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
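The upload job above only turns off the dry run for pushes to the nightly branch or to non-ciflow release tags, and routes release-candidate tags (of the form v1.11.0-rc1) to the test channel. A hedged shell sketch of that selection; treating an unset DRY_RUN as an enabled dry run is an assumption, since the workflow itself only ever writes the "disabled" value:

    # Assumed default: the upload script treats a missing DRY_RUN as an enabled dry run.
    DRY_RUN="${DRY_RUN:-enabled}"
    # Tags ending in an RC suffix (e.g. v1.11.0-rc1) upload to the "test" channel.
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
    fi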
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
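The CUDA 11.3 build job above exports BUILD_SPLIT_CUDA by appending to the GITHUB_ENV file; in GitHub Actions that is what makes a variable set in one step visible to every later step of the same job, including the `docker run -e BUILD_SPLIT_CUDA ...` build step. The single line, with a comment spelling that out:

    # Appending NAME=VALUE to "$GITHUB_ENV" exposes the variable to all subsequent steps in this job;
    # only the CUDA 11.3 / 11.6 build jobs in this workflow carry this step.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"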
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" 
bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: conda-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" 
bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-test + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-conda.yml b/.github/workflows/generated-linux-binary-conda.yml deleted file mode 100644 index 6b3a74dec474..000000000000 --- a/.github/workflows/generated-linux-binary-conda.yml +++ /dev/null @@ -1,7986 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-conda - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-conda - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: linux-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_7-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
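The older generated workflow being deleted below handles runner setup inline instead of through the shared setup-linux action: it prints EC2 instance facts from the instance metadata endpoint and logs Docker in to ECR before pulling any image. A condensed sketch of those two steps (endpoint URL and commands are taken verbatim from the steps in this older workflow; nothing new is added):

    # Print basic EC2 facts for debugging via the instance metadata endpoint.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    echo "instance-type: $(get_ec2_metadata instance-type)"

    # Log in to the account's ECR registry; the account id is parsed from `aws sts get-caller-identity`.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity | grep Account | cut -f4 -d\")
    aws ecr get-login-password --region "$AWS_DEFAULT_REGION" \
      | docker login --username AWS --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"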
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
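Each job in this older workflow also snapshots the runner's GITHUB_* variables to a per-run file via the "Preserve github env variables for use in docker" step, so they can be handed to containers later. The command, restated with a comment (the file path is exactly the one the step uses; how a later step consumes the file is not shown in this diff):

    # Capture every GITHUB_* variable into a file keyed by the run id, for reuse inside docker.
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"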
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
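The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps in the upload job above only fire on pushes to the nightly branch or to non-ciflow tags, and refs ending in -rcN are routed to the test channel. Roughly the same decision written as plain shell, assuming workflow-level defaults of DRY_RUN=enabled and UPLOAD_CHANNEL=nightly (an assumption; those defaults are not visible in this excerpt), and with the ref check simplified relative to the workflow's exact expression:

    # REF_NAME stands in for the branch or tag name of the push event.
    REF_NAME="v1.11.0-rc3"
    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly
    case "${REF_NAME}" in
      nightly|v*) DRY_RUN=disabled ;;   # real uploads only for nightly/tag pushes
    esac
    if [[ "${REF_NAME}" = *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL=test               # release candidates go to the test channel
    fi
    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"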
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
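The "Display EC2 information" step queries the instance metadata service at the link-local address 169.254.169.254, which only answers from inside an EC2 instance. A trimmed-down version of that helper, runnable on any EC2 host:

    # Fetch one field from the EC2 instance metadata service.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    # These calls fail outside EC2, where the address is unreachable.
    echo "ami-id:        $(get_ec2_metadata ami-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"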
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
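On the CUDA 11.x build jobs above, "Set BUILD_SPLIT_CUDA" appends a NAME=VALUE line to $GITHUB_ENV, which the runner exports into every later step; the bare -e BUILD_SPLIT_CUDA flag then forwards it into the build container. GITHUB_ENV lines are taken literally, so as written the value includes the single quotes (the variable becomes 'ON', not ON), which only matters if a consumer compares it against the exact string ON. A small sketch that simulates the hand-off outside Actions:

    # Simulate GITHUB_ENV with a temporary file.
    GITHUB_ENV=$(mktemp)
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
    # The runner reads the file as NAME=VALUE lines, verbatim:
    while IFS='=' read -r name value; do
      export "$name=$value"
    done < "$GITHUB_ENV"
    # The quotes survive, because nothing here strips them.
    echo "BUILD_SPLIT_CUDA is: ${BUILD_SPLIT_CUDA}"   # prints: 'ON'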
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
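Every job logs in to the account's private ECR registry before pulling images: resolve the AWS account id, ask ECR for a short-lived (12-hour) password, and feed it to docker login on stdin. A minimal standalone version, assuming the AWS CLI is configured and AWS_DEFAULT_REGION is set as it is on these runners; it uses --query in place of the grep/cut parsing in the step above, which yields the same account id:

    # Resolve the numeric account id.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    # Exchange IAM credentials for an ECR password and log docker in.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"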
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
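The GPU test jobs install the NVIDIA driver and nvidia-docker runtime on the runner and record GPU_FLAG=--gpus all in GITHUB_ENV; the later docker run expands ${GPU_FLAG:-} unquoted (hence the shellcheck SC2086 waiver), so on CPU jobs, where the variable is unset, no flag is passed at all. A sketch of that optional-flag idiom, with a placeholder CUDA image:

    # GPU_FLAG is either empty or "--gpus all"; the unquoted expansion lets it
    # disappear entirely when unset instead of becoming an empty argument.
    GPU_FLAG="--gpus all"      # leave unset on CPU-only hosts
    # shellcheck disable=SC2086
    docker run --rm ${GPU_FLAG:-} nvidia/cuda:11.3.1-base-ubuntu20.04 nvidia-smi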
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
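The long runs of bare -e NAME flags on the docker run commands (for example -e DESIRED_CUDA, -e PACKAGE_TYPE) copy each variable's current value from the job environment into the container without restating it; -e NAME=VALUE, by contrast, sets an explicit value, as the upload step does with -e PKG_DIR=/artifacts. A small demonstration of the difference, with an example image:

    export DESIRED_CUDA=cu111          # defined in the job environment
    # Bare -e forwards the current value; -e NAME=VALUE overrides it.
    docker run --rm \
      -e DESIRED_CUDA \
      -e PKG_DIR=/artifacts \
      alpine:3.15 \
      sh -c 'echo "DESIRED_CUDA=$DESIRED_CUDA PKG_DIR=$PKG_DIR"'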
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
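"Preserve github env variables for use in docker" dumps every GITHUB_* variable into /tmp/github_env_<run id>. Nothing in this excerpt reads the file back, but a file in that NAME=VALUE format can be handed to a container in one flag with --env-file. A sketch of producing and consuming such a file; the path and image are illustrative, and this is not necessarily how the repository's own scripts consume it:

    # Write all GITHUB_* variables of the current shell to a file...
    env | grep '^GITHUB' > "/tmp/github_env_example"
    # ...and re-inject them into a container.
    docker run --rm --env-file "/tmp/github_env_example" \
      alpine:3.15 sh -c 'env | grep ^GITHUB | sort'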
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
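Both flavours of "Chown workspace" step exist because files written from inside a container are typically owned by root, and these are persistent self-hosted runners whose next job would otherwise fail to clean the directory; running chown from a throwaway Alpine container avoids needing sudo on the host. The essence of the trick, with a concrete image standing in for ${ALPINE_IMAGE}:

    # Reclaim ownership of everything under the current directory for the
    # invoking (non-root) user, using a disposable container that runs as root.
    docker run --rm -v "$(pwd)":/v -w /v alpine:3.15 \
      chown -R "$(id -u):$(id -g)" .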
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_7-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_7-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
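Several steps in these jobs define the same small retry helper inline before running network-flaky commands (the ECR login and the docker pulls). As a standalone sketch it is simply "try up to three times with a short back-off"; the image name below is only a placeholder.

    # Same retry pattern as the inline helpers in the workflow steps above.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Example usage; "alpine:3.18" is a placeholder image, not one from this workflow.
    retry docker pull "alpine:3.18"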
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
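The "Build PyTorch binary" step above uses a detach-then-exec pattern: the builder image is started with --tty --detach so the container stays alive, and the actual work is driven through a series of docker exec calls (populate the env file, then run the conda build script). A stripped-down sketch of the same pattern follows; the image and the exec'd commands are placeholders, not taken from this workflow.

    # Sketch of the detach-then-exec pattern used by the build and test steps.
    IMAGE="alpine:3.18"   # placeholder image

    container_name=$(docker run --tty --detach -v "$(pwd):/work" -w /work "${IMAGE}")

    # Run as many commands as needed inside the same long-lived container...
    docker exec -t "${container_name}" sh -c "echo step one"
    docker exec -t "${container_name}" sh -c "echo step two"

    # ...then tear it down explicitly. The workflow instead relies on the final
    # "Kill containers, clean up images" step to stop whatever is still running.
    docker stop "${container_name}"
    docker rm "${container_name}"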
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
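Every job here starts and ends with a "Chown workspace" (or "Chown artifacts") step because the build and test containers run as root, so anything they write into the mounted workspace ends up root-owned on the self-hosted runner. Chowning it back from a throwaway Alpine container keeps the next job on the same machine from hitting permission errors. A minimal sketch of that trick, with a placeholder image:

    # Reset ownership of files written by a root container, as the
    # "Chown workspace" steps do. The image here is a placeholder for ALPINE_IMAGE.
    ALPINE_IMAGE="alpine:3.18"

    # $(id -u) and $(id -g) expand on the host, so the container chowns back to the host user.
    docker run --rm \
      -v "$(pwd)":/v \
      -w /v \
      "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .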
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
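In the GPU test jobs, the "Install nvidia driver, nvidia-docker runtime, set GPU_FLAG" step writes GPU_FLAG=--gpus all into $GITHUB_ENV, and the test step splices it into docker run as an unquoted ${GPU_FLAG:-} so that on CPU-only runners, where the variable is never set, it expands to nothing. That unquoted expansion is also why the step carries the shellcheck SC2086/SC2090 suppressions. A small sketch of the same conditional flag, using a placeholder image and command:

    # GPU_FLAG is only set on GPU runners; ${GPU_FLAG:-} expands to nothing otherwise.
    # Left intentionally unquoted so "--gpus all" splits into two docker arguments.
    GPU_FLAG="${GPU_FLAG:-}"
    # shellcheck disable=SC2086
    docker run --rm ${GPU_FLAG} alpine:3.18 echo "container started"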
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
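In the "Upload binaries" step above, the credentials are exported through the step's env: block and then forwarded into the upload container with bare -e NAME flags. Docker treats -e NAME (with no =value) as "copy this variable from the calling environment", so the tokens reach binary_upload.sh without ever appearing on the command line. A tiny illustration of that forwarding behaviour; the variable name, value, and image below are placeholders.

    # "-e NAME" with no value copies NAME from the current environment into the container,
    # which is how ANACONDA_API_TOKEN and the AWS keys reach the upload script above.
    export EXAMPLE_TOKEN="placeholder-value"   # placeholder, not a real secret
    docker run --rm -e EXAMPLE_TOKEN alpine:3.18 sh -c 'echo "token is: ${EXAMPLE_TOKEN}"'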
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
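One subtlety in the "Set BUILD_SPLIT_CUDA" step above: lines appended to $GITHUB_ENV are parsed as literal name=value pairs, so the single quotes inside BUILD_SPLIT_CUDA='ON' become part of the value that later steps see ('ON', quotes included). Downstream scripts that only check whether the variable is non-empty would be unaffected, but an exact string comparison against ON would not match. The snippet below is a local stand-in for that parsing (the temp file is hypothetical, standing in for $GITHUB_ENV), to make the behaviour visible.

    # $GITHUB_ENV-style parsing keeps everything after '=' verbatim.
    printf '%s\n' "BUILD_SPLIT_CUDA='ON'" > /tmp/github_env_example   # hypothetical stand-in for $GITHUB_ENV
    while IFS='=' read -r name value; do
      printf 'name=%s value=%s\n' "$name" "$value"   # prints value='ON' (quotes included)
    done < /tmp/github_env_example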
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_8-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
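The "Kill containers, clean up images" step that closes each job here is the usual hygiene idiom for self-hosted runners: stop anything still running, then prune so the next job starts from a clean Docker state. The same two commands as a standalone sketch, with the reason for each guard spelled out:

    #!/usr/bin/env bash

    # "docker ps -q" may print nothing; the expansion is deliberately
    # left unquoted so an empty result disappears, and "|| true" keeps
    # an argument-less "docker stop" from failing the step (hence the
    # shellcheck SC2046 suppression in the workflow).
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove stopped containers, unused images, networks and build
    # cache so the runner does not accumulate state between jobs.
    docker system prune -af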
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
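The "Test PyTorch binary" step above splits testing into two phases inside the same detached container: one exec sources the populated environment and generates a test script at the path named by OUTPUT_SCRIPT (/run.sh), and a second exec runs that script under bash -x so every command is echoed into the log. A minimal sketch of that handoff, assuming a running container ID in container_name and using a hypothetical generate_tests.sh in place of the .circleci generator script:

    #!/usr/bin/env bash
    set -euxo pipefail

    # Assumes a detached container is already running, e.g. started with
    # "docker run --detach" as in the build step.
    container_name=${container_name:?id of a running container}

    # Phase 1: the generator writes the real test commands to the path
    # given by OUTPUT_SCRIPT. generate_tests.sh is a placeholder name.
    docker exec -t -e OUTPUT_SCRIPT="/run.sh" "${container_name}" \
      bash -c "bash ./generate_tests.sh"

    # Phase 2: run the generated script with tracing so failures are
    # easy to attribute to a specific test command.
    docker exec -t "${container_name}" bash -c "bash -x /run.sh"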
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
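The repeated "Chown workspace" and "Chown artifacts" steps deal with a standard self-hosted-runner problem: files written by the build containers are owned by root, so later checkouts or cleanup on the host would fail. Running a throwaway container as root with the directory bind-mounted and chowning to the invoking user's uid/gid hands ownership back without needing sudo on the runner. A standalone sketch of the trick (alpine:3 is illustrative; the workflow uses its own ${ALPINE_IMAGE}):

    #!/usr/bin/env bash
    set -euo pipefail

    # Reset ownership of the current directory to the calling user by
    # performing the chown inside a container that runs as root.
    docker run --rm \
      -v "$(pwd)":/workspace \
      -w /workspace \
      alpine:3 \
      chown -R "$(id -u):$(id -g)" .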
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
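The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above gate the real upload on what triggered the run: only pushes to the nightly branch or to ordinary (non-ciflow) tags disable the dry run, and tags whose name ends in an -rcN suffix are routed to the test channel instead of nightly. An approximate plain-shell rendering of that gating, for reading convenience; the workflow expresses the first half as an if: expression on the step, and the defaults below are illustrative.

    #!/usr/bin/env bash
    set -euo pipefail

    DRY_RUN=enabled          # default: do not actually publish
    UPLOAD_CHANNEL=nightly   # default channel

    ref="${GITHUB_REF:-}"    # e.g. refs/heads/nightly or refs/tags/v1.11.0-rc3

    # Real uploads only for pushes to nightly or to non-ciflow tags.
    if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then
      if [[ "$ref" == "refs/heads/nightly" ]] ||
         { [[ "$ref" == refs/tags/* ]] && [[ "$ref" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN=disabled
      fi
    fi

    # Release-candidate tags (names matching *-rc[0-9]*) go to "test".
    if [[ "${GITHUB_REF_NAME:-}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL=test
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"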
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
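Each ECR login here derives the AWS account ID by grepping the JSON from aws sts get-caller-identity and cutting on double quotes, which works but depends on the CLI's output formatting. A hedged alternative sketch, not what the generated workflow does: let the CLI extract the field itself via --query/--output, then reuse the same get-login-password flow.

    #!/usr/bin/env bash
    set -euo pipefail

    # JMESPath query plus text output returns the bare account ID,
    # with no JSON parsing on our side.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

    # Same login flow as the workflow step, against the derived registry.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" |
      docker login --username AWS --password-stdin \
        "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"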
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
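Relative to the CPU test jobs, the CUDA test jobs add one step: install the NVIDIA driver and nvidia-docker runtime, then export GPU_FLAG=--gpus all through GITHUB_ENV so later steps hand GPUs to the container. The flag is expanded unquoted as ${GPU_FLAG:-} on purpose: when set it splits into the two arguments docker expects, and when unset it disappears entirely, which is why the test step carries a shellcheck SC2086 suppression. A small sketch of just that expansion behaviour, with echo standing in for the real docker run:

    #!/usr/bin/env bash

    # Unset: the unquoted ${GPU_FLAG:-} expands to nothing at all.
    unset GPU_FLAG
    # shellcheck disable=SC2086
    echo docker run ${GPU_FLAG:-} --rm example-image
    #   -> docker run --rm example-image

    # Set, as the CUDA test jobs do via GITHUB_ENV: it splits into
    # "--gpus" and "all", exactly the two arguments docker expects.
    GPU_FLAG="--gpus all"
    # shellcheck disable=SC2086
    echo docker run ${GPU_FLAG:-} --rm example-image
    #   -> docker run --gpus all --rm example-image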
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
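Every job above snapshots its GITHUB_* variables into /tmp/github_env_${GITHUB_RUN_ID} in the "Preserve github env variables for use in docker" step. The file ends up in NAME=value form, one pair per line, which happens to be the format docker run accepts via --env-file; the consumption side is not shown in these jobs, so the second half of this sketch is an assumption for illustration only.

    #!/usr/bin/env bash
    set -euo pipefail

    # Intended to run inside a GitHub Actions job, where GITHUB_*
    # variables are guaranteed to exist.
    env_file="/tmp/github_env_${GITHUB_RUN_ID:-local}"

    # Snapshot every GITHUB_* variable, one NAME=value pair per line.
    env | grep '^GITHUB' > "${env_file}"

    # Illustrative only (not from the workflow): --env-file reads exactly
    # this NAME=value format, so the snapshot can be replayed into a
    # container started later.
    docker run --rm --env-file "${env_file}" alpine:3 env | grep '^GITHUB'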
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
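The CUDA 11.x build jobs add a "Set BUILD_SPLIT_CUDA" step that appends to GITHUB_ENV so the flag reaches the containerised build. GITHUB_ENV is parsed as literal NAME=value lines, so as written the exported value is 'ON' including the single quotes; the downstream builder scripts presumably tolerate that, but the mechanics are easy to misread. A tiny sketch of the GITHUB_ENV behaviour, using a scratch file in place of the runner-provided one:

    #!/usr/bin/env bash
    set -euo pipefail

    # Stand-in for the $GITHUB_ENV file the runner normally provides.
    GITHUB_ENV=$(mktemp)

    # As in the workflow: the single quotes sit inside the double quotes,
    # so they are written verbatim and become part of the value.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
    cat "$GITHUB_ENV"    # BUILD_SPLIT_CUDA='ON'  (quotes included)

    # A quote-free value would instead be written as:
    echo "BUILD_SPLIT_CUDA=ON" >> "$GITHUB_ENV"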
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_9-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
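Each job begins with the same "Display EC2 information" step; the helper it defines simply reads categories from the EC2 instance metadata service. Isolated here for reference, with an illustrative call.

    # Query the EC2 instance metadata endpoint for one category (IMDSv1 style),
    # as defined inline in the "Display EC2 information" steps.
    get_ec2_metadata() {
      category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    get_ec2_metadata instance-type   # e.g. prints the runner's EC2 instance type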
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
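The two "Set ..." steps above gate the upload: DRY_RUN is flipped to "disabled" only for pushes to the nightly branch or to non-ciflow tags, and UPLOAD_CHANNEL is switched to "test" only for release-candidate tags. Below is a consolidated bash restatement of those `if:` conditions, assuming DRY_RUN defaults to enabled elsewhere in the workflow; the workflow itself expresses the first part as GitHub Actions expressions.

    # Bash restatement of the upload gating used by the upload jobs in this file.
    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] ||
         { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
      fi
      if [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]] &&
         [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
        echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"   # RC builds go to the test channel
      fi
    fi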
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
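The "Chown artifacts" and "Chown workspace" steps above exist because the build runs as root inside the builder container, so files it writes on the self-hosted runner end up root-owned; an Alpine container is then used to hand ownership back to the runner user so later checkouts and cleanup do not fail. The pattern, isolated (ALPINE_IMAGE is presumably defined at the workflow level, outside this hunk):

    # Reset ownership of container-written files to the runner user.
    # "$(id -u):$(id -g)" is expanded on the host, so the chown running as root
    # inside the Alpine container targets the host user's uid/gid.
    docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .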
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda10_2-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
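Unlike the cpu and cu102 builds earlier in this file, the cu111 build above (and the cu113/cu115 builds below) adds a "Set BUILD_SPLIT_CUDA" step before building. A short note on what that step actually exports; the assumption here is that the consuming builder scripts only test the variable for non-emptiness.

    # As written, the value lands in $GITHUB_ENV with the single quotes included,
    # i.e. later steps see BUILD_SPLIT_CUDA="'ON'" rather than BUILD_SPLIT_CUDA="ON".
    # Assumption: any non-empty value enables the split-CUDA build.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"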
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_1-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_3-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/conda/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: conda-py3_10-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_5-test - env: - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/conda-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: conda-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-master.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-master.yml new file mode 100644 index 000000000000..3fa24203231b --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-master.yml @@ -0,0 +1,283 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-cxx11-abi + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-cxx11-abi + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml new file mode 100644 index 000000000000..096fd2617423 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml @@ -0,0 +1,7042 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-cxx11-abi + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-cxx11-abi + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v 
"${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" 
"${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4
+ with:
+ name: libtorch-cpu-static-without-deps-cxx11-abi
+ retention-days: 14
+ if-no-files-found: error
+ path:
+ ${{ runner.temp }}/artifacts/*
+ - name: Hold runner for 2 hours or until ssh sessions have drained
+ working-directory: pytorch/
+ # Always hold for active ssh sessions
+ if: always()
+ run: .github/scripts/wait_for_ssh_to_drain.sh
+ - name: Chown workspace
+ if: always()
+ run: |
+ # Ensure the working directory gets chowned back to the current user
+ docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
+ - name: Kill containers, clean up images
+ if: always()
+ run: |
+ # ignore expansion of "docker ps -q" since it could be empty
+ # shellcheck disable=SC2046
+ docker stop $(docker ps -q) || true
+ # Prune all of the docker images
+ docker system prune -af
+ libtorch-cpu-static-without-deps-cxx11-abi-test: # Testing
+ if: ${{ github.repository_owner == 'pytorch' }}
+ needs: libtorch-cpu-static-without-deps-cxx11-abi-build
+ runs-on: linux.4xlarge
+ timeout-minutes: 240
+ env:
+ PACKAGE_TYPE: libtorch
+ # TODO: This is a legacy variable that we eventually want to get rid of in
+ # favor of GPU_ARCH_VERSION
+ DESIRED_CUDA: cpu
+ GPU_ARCH_TYPE: cpu
+ DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+ SKIP_ALL_TESTS: 1
+ LIBTORCH_VARIANT: static-without-deps
+ DESIRED_DEVTOOLSET: cxx11-abi
+ steps:
+ - name: Checkout PyTorch
+ uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+ - name: Setup Linux
+ uses: ./.github/actions/setup-linux
+ - name: Chown workspace
+ run: |
+ retry () {
+ "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
+ }
+ retry docker pull "${ALPINE_IMAGE}"
+ # Ensure the working directory gets chowned back to the current user
+ docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
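Annotation: the docker pulls in the "Chown workspace" and "Pull Docker image" steps above are wrapped in a tiny retry function that simply re-runs the command after 1 s and then 2 s. The same helper, pulled out on its own so it is easier to see (and to extend with more attempts if needed):

    # Same retry helper as in the workflow steps above: up to three attempts
    # with short, fixed back-off delays.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }
    # Example usage; in the workflow the wrapped command is `docker pull "${DOCKER_IMAGE}"`.
    retry echo "flaky command goes here"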
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
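Annotation: the GPU test jobs above first install the NVIDIA driver and nvidia-docker runtime and then export GPU_FLAG=--gpus all into GITHUB_ENV; CPU jobs never set it, and the docker run in the test step expands it as ${GPU_FLAG:-}, so the flag silently disappears when unset. A small sketch of that optional-flag pattern, with a driver check standing in for the real install step:

    # Optional-flag pattern used by the test steps above: GPU_FLAG is only set
    # on GPU runners, and an unquoted ${GPU_FLAG:-} expands to nothing when unset.
    if command -v nvidia-smi >/dev/null 2>&1; then   # stand-in for the driver-install step
      GPU_FLAG="--gpus all"
    fi
    # shellcheck disable=SC2086  # the flag must stay unquoted so an empty value vanishes
    docker run --rm ${GPU_FLAG:-} alpine:3.15 echo "GPUs are exposed only when GPU_FLAG was set"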
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id 
-g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
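Annotation: the build and test commands run as root inside the builder container, so anything they write into the host-mounted workspace or artifacts directory ends up root-owned on the runner. The recurring "Chown workspace" and "Chown artifacts" steps fix that by mounting the same directory into a throwaway Alpine container and chowning it back to the runner user. The pattern in isolation, using a plain Docker Hub alpine tag in place of the workflow's ALPINE_IMAGE:

    # Re-own files that a root container wrote into a host-mounted directory.
    ALPINE_IMAGE="alpine:3.15"   # the workflow uses its own mirrored image here
    WORKDIR="$(pwd)"
    docker run --rm -v "${WORKDIR}:/v" -w /v "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .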
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id 
-u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id 
-g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
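Each libtorch configuration is wired as a build job that pushes its finished packages to S3 with seemethere/upload-artifact-s3, a test job that declares needs: on it and downloads the same artifact name, and an upload job gated on the test job. A stripped-down sketch of that hand-off with a hypothetical artifact name (libtorch-example); the action inputs are the ones used in the jobs above, everything else is illustrative.

name: build-test-chain-sketch
on: workflow_dispatch
jobs:
  example-build:
    runs-on: ubuntu-latest                  # assumption: the real build jobs run on linux.4xlarge
    steps:
      - name: Produce a package
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          echo "placeholder package" > "${RUNNER_TEMP}/artifacts/libtorch-example.zip"
      - uses: seemethere/upload-artifact-s3@v4
        with:
          name: libtorch-example              # hypothetical artifact name
          retention-days: 14
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/*
  example-test:
    needs: example-build                      # only runs once the build artifact exists
    runs-on: ubuntu-latest
    steps:
      - uses: seemethere/download-artifact-s3@v3
        name: Download Build Artifacts
        with:
          name: libtorch-example
          path: "${{ runner.temp }}/artifacts/"
      - name: Inspect the downloaded package
        run: ls -l "${RUNNER_TEMP}/artifacts/"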
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
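The GPU test jobs first install the NVIDIA driver and container runtime (retried up to three times), export GPU_FLAG=--gpus all through $GITHUB_ENV, and later splice it into docker run as ${GPU_FLAG:-}, which expands to nothing if the flag was never set. A small sketch of just that hand-off between steps; the echo stands in for install_nvidia_utils_linux.sh and no container is actually started.

name: gpu-flag-sketch
on: workflow_dispatch
jobs:
  gpu-flag:
    runs-on: ubuntu-latest               # assumption: the real jobs run on linux.4xlarge.nvidia.gpu
    steps:
      - name: Install driver and export GPU_FLAG (simulated)
        run: |
          echo "pretend install_nvidia_utils_linux.sh ran here"
          # Variables appended to $GITHUB_ENV become environment variables in all later steps
          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
      - name: Build a docker command line that works with or without the flag
        run: |
          # ${GPU_FLAG:-} expands to an empty string when unset, so the same
          # command line is also valid on CPU-only runners
          echo "would run: docker run --rm ${GPU_FLAG:-} <image> nvidia-smi"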
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
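Every build and test job re-checks out pytorch and pytorch/builder with a checkout action pinned to a full commit SHA, picks the pytorch ref with github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha (the PR head commit on pull requests, the pushed commit otherwise), and then runs git clean -fxd to scrub the reused workspace. A minimal sketch of that checkout shape; the expression and inputs are copied from the steps above, only the surrounding job is invented.

name: pinned-checkout-sketch
on: [push, pull_request]
jobs:
  checkout-shape:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout PyTorch
        # Pinned to a commit SHA rather than a movable tag
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # PR runs test the PR head commit; push runs test the pushed commit
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
      - name: Clean PyTorch checkout
        run: |
          # Self-hosted workspaces persist between runs; drop anything untracked
          git clean -fxd
        working-directory: pytorch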
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
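The retry helper that precedes every docker pull is simply three attempts with short back-off sleeps; the second and third tries only run if the previous one failed. A standalone sketch of the same function with a deliberately flaky command (the flaky function is made up for the demonstration) so the behaviour is visible.

name: retry-helper-sketch
on: workflow_dispatch
jobs:
  retry-demo:
    runs-on: ubuntu-latest
    steps:
      - name: Retry a flaky command up to three times
        run: |
          retry () {
            # first try, then again after 1s, then a final try after 2s
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          flaky () {
            # fails on the first call, succeeds once the marker file exists
            if [ -f /tmp/retry-marker ]; then
              echo "second attempt succeeded"
            else
              touch /tmp/retry-marker
              echo "first attempt failing" >&2
              return 1
            fi
          }
          retry flaky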
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
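The CUDA 11.3 build jobs above switch on split-CUDA builds by appending a line to $GITHUB_ENV before the container starts. That file is parsed as literal NAME=value pairs rather than through a shell, so echo "BUILD_SPLIT_CUDA='ON'" gives later steps the value 'ON' with the quotes included, which is presumably fine as long as the builder scripts only check that the variable is non-empty. A small sketch of the mechanism (EXAMPLE_FLAG is a made-up name added for contrast).

name: github-env-sketch
on: workflow_dispatch
jobs:
  env-file:
    runs-on: ubuntu-latest
    steps:
      - name: Export variables for later steps
        run: |
          # $GITHUB_ENV lines are taken verbatim; quotes are not stripped
          echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
          echo "EXAMPLE_FLAG=ON" >> "$GITHUB_ENV"
      - name: Show what later steps receive
        run: |
          printf 'BUILD_SPLIT_CUDA=[%s]\n' "$BUILD_SPLIT_CUDA"   # prints ['ON'], quotes included
          printf 'EXAMPLE_FLAG=[%s]\n' "$EXAMPLE_FLAG"           # prints [ON]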
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
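The recurring Chown workspace and Chown artifacts steps exist because the build and test containers typically run as root against bind-mounted host directories, which can leave root-owned files on the self-hosted runner; a throwaway Alpine container then chowns everything back to the runner's uid and gid so later cleanup cannot fail on permissions. A condensed sketch of the problem and the fix; alpine:3.16 stands in for ${ALPINE_IMAGE}.

name: chown-workspace-sketch
on: workflow_dispatch
jobs:
  chown-cleanup:
    runs-on: ubuntu-latest          # assumption: this mainly matters on persistent self-hosted runners
    steps:
      - name: Create a root-owned file from a container
        run: |
          mkdir -p work
          docker run --rm -v "$(pwd)/work:/w" -w /w alpine:3.16 touch root-owned.txt
          ls -l work/root-owned.txt
      - name: Chown workspace
        if: always()
        run: |
          # Hand ownership back to the runner user so a later rm -rf succeeds
          docker run --rm -v "$(pwd)":/v -w /v alpine:3.16 chown -R "$(id -u):$(id -g)" .
          ls -l work/root-owned.txt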
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
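The upload jobs above only set DRY_RUN to "disabled" (that is, actually publish) on pushes to the nightly branch or to non-ciflow tags, and additionally send release-candidate tags (names matching *-rc[0-9]*) to the test channel via UPLOAD_CHANNEL. A reduced sketch of those two gates using the same if: expressions; the last step just reports the decision instead of calling binary_upload.sh, and the unset defaults noted in the comments are assumptions.

name: upload-gating-sketch
on:
  push:
    branches: [nightly]
    tags: ['*']
jobs:
  gate-upload:
    runs-on: ubuntu-latest
    steps:
      - name: Set DRY_RUN (only for tagged pushes)
        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
        run: echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
      - name: Set UPLOAD_CHANNEL (only for RC tags)
        if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
        run: |
          # reference ends with an RC suffix
          if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
          fi
      - name: Report the decision
        run: |
          echo "DRY_RUN=${DRY_RUN:-enabled}"                 # assumption: unset means the upload stays a dry run
          echo "UPLOAD_CHANNEL=${UPLOAD_CHANNEL:-nightly}"   # assumption: nightly is the default channel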
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
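Because these runners are reused, every job ends with the always() cleanup shown next: stop whatever containers are still running and prune images so disk usage does not grow across runs. The same step in isolation, with comments on why the expansion is left unquoted and what prune removes.

name: docker-cleanup-sketch
on: workflow_dispatch
jobs:
  cleanup:
    runs-on: ubuntu-latest
    steps:
      - name: Kill containers, clean up images
        if: always()                 # run even when earlier steps failed
        run: |
          # "docker ps -q" may print nothing, so the unquoted expansion is intentional
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Remove stopped containers, unused images and networks, and the build cache
          docker system prune -af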
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
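The publish step in the upload jobs runs binary_upload.sh inside a pinned miniconda image, passing the AWS and Anaconda credentials through as environment variables and bind-mounting the artifact directory at /artifacts. A trimmed sketch of that invocation shape; alpine:3.16 and the echo stand in for the real 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 image and upload script, so no credentials are needed.

name: upload-invocation-sketch
on: workflow_dispatch
jobs:
  upload-shape:
    runs-on: ubuntu-latest
    steps:
      - name: Upload binaries (shape only)
        env:
          PKG_DIR: "${{ runner.temp }}/artifacts"
          DRY_RUN: enabled
          # The real jobs also pass AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and
          # ANACONDA_API_TOKEN from secrets; those are deliberately blank on pull_request events
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          docker run --rm -i \
            -e DRY_RUN \
            -e PKG_DIR=/artifacts \
            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
            -v "${GITHUB_WORKSPACE}:/v" \
            -w /v \
            alpine:3.16 \
            sh -c 'echo "would run .circleci/scripts/binary_upload.sh with PKG_DIR=$PKG_DIR DRY_RUN=$DRY_RUN"'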
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
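Each job also installs maintainers' SSH keys up front ("[FB EMPLOYEES] Enable SSH") and, at the end, holds the runner "for 2 hours or until ssh sessions have drained" so anyone attached to a failing run is not cut off. The wait script itself is not part of this diff; the following is a rough, hypothetical sketch of what such a drain loop could look like, inferred only from the step name and not from the real wait_for_ssh_to_drain.sh.

name: ssh-drain-sketch
on: workflow_dispatch
jobs:
  hold-runner:
    runs-on: ubuntu-latest
    steps:
      - name: Hold runner until ssh sessions have drained (illustrative)
        if: always()
        run: |
          # Hypothetical stand-in for .github/scripts/wait_for_ssh_to_drain.sh:
          # poll for established ssh sessions, give up after roughly 2 hours
          deadline=$(( $(date +%s) + 2*60*60 ))
          while [ "$(date +%s)" -lt "${deadline}" ]; do
            sessions=$(ss -Hn state established '( sport = :22 )' | wc -l)
            if [ "${sessions}" -eq 0 ]; then
              echo "no active ssh sessions, releasing the runner"
              break
            fi
            echo "waiting on ${sessions} ssh session(s)"
            sleep 30
          done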
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + 
docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to 
the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run 
--rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-cxx11-abi-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets 
chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id 
-u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull 
Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
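+ # Note: the ROCm -test jobs in this workflow, such as the one above, run on linux.rocm.gpu
+ # runners; they point DOCKER_HOST at a per-user Docker socket, health-check the runner with
+ # rocm-smi/rocminfo (expecting 2 or 4 gfx GPUs and killing runsvc.sh on failure so the unhealthy
+ # runner disconnects), and expose the GPUs through the explicit --device/--group-add flags in
+ # GPU_FLAG rather than the NVIDIA-style `--gpus all` flag used by the CUDA test jobs.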
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
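+ # Note: in the -upload jobs the DRY_RUN=disabled override is applied only for pushes to
+ # refs/heads/nightly or to non-ciflow tags, and UPLOAD_CHANNEL is switched to "test" only when
+ # the tag name matches *-rc[0-9]* (for example, a hypothetical release-candidate tag ending in
+ # -rc3). The upload itself is delegated to .circleci/scripts/binary_upload.sh, run inside the
+ # pinned miniconda3 ECR image with the AWS and Anaconda credentials passed through as
+ # environment variables; on pull_request events those secrets are expected to be blank.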
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
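+ # Note: in this build job the detached builder container first runs binary_populate_env.sh,
+ # then sources ${BINARY_ENV_FILE} and runs /builder/libtorch/build.sh, which leaves the built
+ # libtorch package under ${RUNNER_TEMP}/artifacts (mounted as /artifacts); the "Chown artifacts"
+ # step above hands ownership back to the runner user so the next step can upload that directory
+ # to S3 under the job's artifact name, with a 14-day retention and if-no-files-found: error
+ # guarding against an empty build output.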
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | 
+ retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker 
image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-cxx11-abi-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker 
image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-cxx11-abi-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml deleted file mode 100644 index 6cfdc08cd046..000000000000 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml +++ /dev/null @@ -1,8046 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-libtorch-cxx11-abi - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-libtorch-cxx11-abi - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working 
directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
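#!/usr/bin/env bash
# Editorial sketch of the get_ec2_metadata helper used by the "Display EC2
# information" steps: instance metadata is served over a link-local HTTP
# endpoint. The workflow itself issues the simple GET shown above; the IMDSv2
# token handling below is an assumption added here for completeness, not
# something the workflow does.
set -euo pipefail

get_ec2_metadata () {
  local category=$1
  local token
  # IMDSv2: fetch a short-lived session token first, then present it.
  token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
            -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
  curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
    "http://169.254.169.254/latest/meta-data/${category}"
}

echo "instance-type: $(get_ec2_metadata instance-type)"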
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
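#!/usr/bin/env bash
# Editorial sketch of the detached-container pattern used by the "Build PyTorch
# binary" step above: start a long-lived container, forward selected host
# environment variables by name only (docker copies their current values), then
# drive the work with `docker exec`. The image and the example commands here
# are placeholders, not the builder image or build scripts.
set -euxo pipefail

export PACKAGE_TYPE=libtorch   # example values mirroring the job env above
export DESIRED_CUDA=cpu

container_name=$(docker run \
  -e PACKAGE_TYPE \
  -e DESIRED_CUDA \
  --tty \
  --detach \
  -v "$(pwd):/work" \
  -w /work \
  ubuntu:20.04)

# Each exec shares the container's filesystem and the forwarded environment.
docker exec -t "${container_name}" bash -c 'echo "building ${PACKAGE_TYPE} for ${DESIRED_CUDA}"'
docker rm -f "${container_name}"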
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
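#!/usr/bin/env bash
# Editorial sketch of the relationship called out by the TODO comments above:
# DESIRED_CUDA appears to be a legacy spelling of (GPU_ARCH_TYPE,
# GPU_ARCH_VERSION), e.g. cuda/10.2 -> "cu102" and cpu -> "cpu". The helper
# below illustrates that naming convention; it is an inference, not code taken
# from the workflow.
set -euo pipefail

desired_cuda () {
  local arch_type=$1 arch_version=${2:-}
  case "${arch_type}" in
    cpu)  echo "cpu" ;;
    cuda) echo "cu${arch_version//./}" ;;   # 10.2 -> cu102, 11.3 -> cu113
    *)    echo "unknown arch type: ${arch_type}" >&2; return 1 ;;
  esac
}

desired_cuda cpu          # -> cpu
desired_cuda cuda 10.2    # -> cu102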
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
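#!/usr/bin/env bash
# Editorial sketch of how the CUDA test jobs consume GPU_FLAG: after installing
# the NVIDIA driver and container runtime they export GPU_FLAG="--gpus all",
# and the docker invocation expands it unquoted so it disappears cleanly when
# empty (hence the SC2086/SC2090 suppressions above). CPU-only jobs simply
# leave it unset. The smoke-test command below is illustrative.
set -euxo pipefail

# A CUDA job would export this from the "Install nvidia driver" step; leave it
# unset/empty to emulate a CPU-only job.
: "${GPU_FLAG:=}"

# shellcheck disable=SC2086
docker run --rm ${GPU_FLAG:-} ubuntu:20.04 nvidia-smi || echo "no GPU available in this sketch"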
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
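#!/usr/bin/env bash
# Editorial sketch of the "Upload binaries" pattern above: credentials reach a
# short-lived container purely as environment variables, the artifact directory
# is bind-mounted, and the upload logic runs inside the image. The image tag
# and the echoed command are placeholders standing in for the miniconda image
# and binary_upload.sh.
set -euo pipefail

artifacts_dir="${RUNNER_TEMP:-/tmp}/artifacts"

docker run --rm -i \
  -e ANACONDA_API_TOKEN \
  -e AWS_ACCESS_KEY_ID \
  -e AWS_SECRET_ACCESS_KEY \
  -e DRY_RUN \
  -e PKG_DIR=/artifacts \
  -v "${artifacts_dir}:/artifacts" \
  -v "$(pwd):/v" \
  -w /v \
  docker.io/library/python:3.10-slim \
  bash -c 'echo "uploading from ${PKG_DIR} (dry run: ${DRY_RUN:-enabled})"'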
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
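The "Display EC2 information" step repeated in every job queries the EC2 instance metadata service at 169.254.169.254. A standalone sketch of that helper, unchanged apart from being runnable on its own:

    set -euo pipefail

    # Fetch one category from the EC2 instance metadata endpoint.
    # See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
    get_ec2_metadata() {
      local category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id: $(get_ec2_metadata ami-id)"
    echo "instance-id: $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"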
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
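The "Test PyTorch binary" step above is split into three docker exec calls: populate the env file, let binary_linux_test.sh generate a test script at /run.sh, then execute that script. A condensed sketch of just that sequence, assuming the container has already been started as in the step above and that PYTORCH_ROOT and BINARY_ENV_FILE come from the job environment:

    # 1) Write the build/test environment to ${BINARY_ENV_FILE}.
    docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" \
      bash -c "bash .circleci/scripts/binary_populate_env.sh"

    # 2) Generate the test script at /run.sh inside the container.
    docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" \
      bash -c "bash .circleci/scripts/binary_linux_test.sh"

    # 3) Source the environment and execute the generated script.
    docker exec -t "${container_name}" \
      bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"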
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
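The "Log in to ECR" step above resolves the AWS account id from the caller identity and pipes a short-lived ECR password straight into docker login. A sketch of that sequence; the --query/--output form is an equivalent, slightly cleaner way to get the account id than the grep/cut used in the step, and the region comes from AWS_DEFAULT_REGION as it does there:

    set -euo pipefail

    # Account id of the credentials the runner is using.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)

    # Short-lived registry password piped into docker login on stdin.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"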
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
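The upload job above only disables DRY_RUN for pushes to the nightly branch or to non-ciflow tags, and switches UPLOAD_CHANNEL to "test" when the tag carries an -rcN suffix. The same gating written as a plain bash sketch: GITHUB_EVENT_NAME, GITHUB_REF and GITHUB_REF_NAME are the standard Actions variables (the shell-level equivalent of the github.event.ref expressions used in the step conditions), and the "enabled"/"nightly" defaults are assumed here for illustration.

    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly

    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] \
         || { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN=disabled
      fi
      # Release-candidate tags (…-rcN) are routed to the "test" channel.
      if [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
        UPLOAD_CHANNEL=test
      fi
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"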
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
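The "Chown workspace" and "Chown artifacts" steps above reclaim files that root-owned build containers leave behind by mounting the directory into a throwaway Alpine container and chowning it back to the runner user, avoiding any need for sudo on the runner. A sketch of that one-liner; the chown_back wrapper name and the alpine:3.14 fallback tag exist only for this sketch.

    # Re-own everything under the given directory (default: cwd) as the
    # current user, using a disposable container.
    chown_back() {
      local dir=${1:-$(pwd)}
      docker run --rm -v "${dir}":/v -w /v "${ALPINE_IMAGE:-alpine:3.14}" \
        chown -R "$(id -u):$(id -g)" .
    }

    chown_back "${RUNNER_TEMP}/artifacts"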
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
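The "Upload binaries" step above runs .circleci/scripts/binary_upload.sh inside a pinned miniconda3 image, passing credentials and the package location purely through -e environment flags so nothing is baked into the image. A trimmed restatement of that invocation as a standalone command (image tag and mounts are exactly those pinned in the step):

    # Upload the built packages; credentials and DRY_RUN/UPLOAD_CHANNEL are
    # inherited from the runner environment via the -e flags.
    docker run --rm -i \
      -e ANACONDA_API_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e DRY_RUN \
      -e PACKAGE_TYPE \
      -e PKG_DIR=/artifacts \
      -e UPLOAD_CHANNEL \
      -e UPLOAD_SUBFOLDER \
      -v "${RUNNER_TEMP}/artifacts:/artifacts" \
      -v "${GITHUB_WORKSPACE}:/v" \
      -w /v \
      308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
      bash -c '.circleci/scripts/binary_upload.sh'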
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
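The "Set BUILD_SPLIT_CUDA" step above uses the GITHUB_ENV file to hand a variable to later steps of the same job. Worth noting: GITHUB_ENV takes the value literally, so the single quotes in the echo are stored as part of it. A small sketch of the mechanism, copied from the step:

    # Lines appended to $GITHUB_ENV become environment variables for all
    # subsequent steps of the job. The value is taken verbatim, so here the
    # stored value is 'ON' including the quotes.
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"

    # A later step in the same job then observes:
    #   $BUILD_SPLIT_CUDA  ->  'ON'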
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
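The "Kill containers, clean up images" step that closes each job above stops whatever containers are still running and prunes all images, so the self-hosted runner starts the next job from a clean slate. A sketch of that cleanup; the || true keeps the step green when no containers are running:

    # "docker ps -q" may expand to nothing, hence the deliberately unquoted
    # expansion and the shellcheck SC2046 waiver carried in the workflow.
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove stopped containers, unused networks, and all unused images.
    docker system prune -af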
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
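The "Preserve github env variables for use in docker" step that opens every job above snapshots the GITHUB_* variables into a per-run file under /tmp so they can later be handed to containers. A sketch of the write side plus one way such a file can be consumed; the --env-file consumption and the alpine image are illustrative only, since the build steps above pass variables individually with -e instead.

    # Snapshot all GITHUB_* variables for this run.
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

    # Illustrative consumption: feed the snapshot to a container as an env file.
    docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" \
      alpine:3.14 sh -c 'env | grep ^GITHUB'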
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
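The build and test steps above start one long-lived container with --detach and then drive each phase through docker exec, so state produced by one phase (such as the populated env file) is visible to the next. A reduced sketch of that pattern; the image, mount, and commands are placeholders, not the workflow's values.

#!/usr/bin/env bash
# Start a detached, TTY-backed container, run several commands in it with
# docker exec, then remove it. Mirrors the docker run/exec structure of the
# "Build PyTorch binary" and "Test PyTorch binary" steps above.
set -euxo pipefail

container_name=$(docker run \
  --tty \
  --detach \
  -v "$(pwd):/work" \
  -w /work \
  "${DOCKER_IMAGE:-ubuntu:20.04}")

# Each phase is a separate exec into the same container, so files written by
# one phase are available to later phases.
docker exec -t "${container_name}" bash -c "echo 'populate environment' > /tmp/phase1.log"
docker exec -t "${container_name}" bash -c "cat /tmp/phase1.log && echo 'run build or tests'"

docker rm -f "${container_name}"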
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
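The "Log in to ECR" steps above resolve the AWS account id and pipe a short-lived registry password into docker login. A sketch of the same flow; it extracts the account id with the CLI's --query flag rather than the workflow's grep/cut pipeline, which is an equivalent approach shown here only for clarity.

#!/usr/bin/env bash
# Log Docker in to the account's ECR registry for the current region.
# AWS_DEFAULT_REGION and valid AWS credentials are assumed to be present,
# as they are on the self-hosted runners used above.
set -euo pipefail

AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
  | docker login --username AWS \
      --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"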
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
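The upload jobs above only perform a real upload for pushes to the nightly branch or to non-ciflow tags, and route release-candidate tags to the test channel. A plain-shell sketch of that gating; the nightly default channel shown here is an assumption, since the workflow defines its default outside the steps quoted above.

#!/usr/bin/env bash
# Decide DRY_RUN and UPLOAD_CHANNEL from the git ref, mirroring the
# "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above.
set -euo pipefail

DRY_RUN=enabled          # uploads stay a dry run unless the ref qualifies
UPLOAD_CHANNEL=nightly   # assumed default; the workflow sets this elsewhere

is_release_tag() {
  [[ "${GITHUB_REF:-}" == refs/tags/* && "${GITHUB_REF:-}" != refs/tags/ciflow/* ]]
}

if [[ "${GITHUB_EVENT_NAME:-}" == "push" ]]; then
  if [[ "${GITHUB_REF:-}" == "refs/heads/nightly" ]] || is_release_tag; then
    DRY_RUN=disabled
  fi
  # A tag name like v1.11.0-rc3 is routed to the "test" channel.
  if is_release_tag && [[ "${GITHUB_REF_NAME:-}" == *-rc[0-9]* ]]; then
    UPLOAD_CHANNEL=test
  fi
fi

echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"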
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
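The steps above hand values to later steps by appending NAME=value lines to the file that $GITHUB_ENV points at (BUILD_SPLIT_CUDA, GPU_FLAG, DRY_RUN, and so on). GitHub Actions takes each value literally, with no shell unquoting, so BUILD_SPLIT_CUDA='ON' arrives in later steps with the quotes included; that is harmless for any consumer that only checks whether the variable is non-empty. A small local sketch of the mechanism; the fallback path exists only so the script runs outside Actions.

#!/usr/bin/env bash
# Append NAME=value lines to the GITHUB_ENV file; each line becomes an
# environment variable in subsequent workflow steps.
set -euo pipefail

GITHUB_ENV=${GITHUB_ENV:-/tmp/github_env_demo}   # assumption: local stand-in

echo "BUILD_SPLIT_CUDA=ON" >> "$GITHUB_ENV"
echo "GPU_FLAG=--gpus all" >> "$GITHUB_ENV"

cat "$GITHUB_ENV"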
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
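The recurring "Chown workspace" and "Chown artifacts" steps above exist because files created by root inside the build container would otherwise be left unwritable by the runner user; a throwaway Alpine container re-owns the mounted directory. A sketch of that step, with the image name as a placeholder for the workflow's ALPINE_IMAGE.

#!/usr/bin/env bash
# Re-own everything under the current directory to the invoking user by
# running chown inside a minimal container that has the directory mounted.
set -euo pipefail

ALPINE_IMAGE=${ALPINE_IMAGE:-alpine:3.15}   # placeholder; the workflow supplies its own image

docker run --rm \
  -v "$(pwd)":/v -w /v \
  "${ALPINE_IMAGE}" \
  chown -R "$(id -u):$(id -g)" .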
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
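The "Preserve github env variables for use in docker" steps above dump every GITHUB_* variable to a per-run file so that scripts running inside containers can re-import them; the consuming side is not shown in the steps quoted here. A sketch of one plausible round trip, where feeding the file back via --env-file is an assumption rather than something taken from the workflow.

#!/usr/bin/env bash
# Capture GITHUB_* variables to a file keyed by run id, then (assumed usage)
# pass them into a container via --env-file.
set -euo pipefail

GITHUB_RUN_ID=${GITHUB_RUN_ID:-local}   # placeholder outside Actions
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" || true

# Assumed consumer: surface the captured variables inside a container.
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" \
  alpine:3.15 env | grep '^GITHUB' || true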
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml new file mode 100644 index 000000000000..922dbc27b7f2 --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml @@ -0,0 +1,283 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-pre-cxx11 + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-pre-cxx11 + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml new file mode 100644 index 000000000000..5972b6fced8e --- /dev/null +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml @@ -0,0 +1,7042 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-libtorch-pre-cxx11 + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-libtorch-pre-cxx11 + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ 
+ -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash 
.circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
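The *-test jobs do not install anything on the runner itself; they mount the downloaded packages at /final_pkgs inside the same manylinux builder image and run a generated test script there. A condensed, illustrative sketch of the "Test PyTorch binary" step above:

# illustrative condensation of the generated "Test PyTorch binary" step
- name: Test PyTorch binary
  run: |
    set -x
    # shellcheck disable=SC2086,SC2090   # ${GPU_FLAG:-} must stay unquoted so "--gpus all" word-splits
    container_name=$(docker run ${GPU_FLAG:-} --tty --detach \
      -e DESIRED_CUDA -e DESIRED_DEVTOOLSET -e LIBTORCH_VARIANT -e PACKAGE_TYPE -e SKIP_ALL_TESTS \
      -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
      -v "${GITHUB_WORKSPACE}/builder:/builder" \
      -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \
      -w / "${DOCKER_IMAGE}")
    docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
    # binary_linux_test.sh only generates the test script (into OUTPUT_SCRIPT); the last exec runs it
    docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
    docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"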
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w 
/v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
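Every job in this workflow, including the ones above, ends with the same "if: always()" cleanup tail on the self-hosted runner. A condensed sketch, with the assumed rationale noted inline:

# illustrative condensation of the cleanup tail shared by every job; runs even when earlier steps fail
- name: Hold runner for 2 hours or until ssh sessions have drained
  if: always()
  run: .github/scripts/wait_for_ssh_to_drain.sh    # keeps the machine around while someone debugs over SSH
- name: Chown workspace
  if: always()
  run: |
    # files written as root inside the build containers would otherwise be undeletable by the runner user
    docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" .
- name: Kill containers, clean up images
  if: always()
  run: |
    # shellcheck disable=SC2046   # "docker ps -q" may legitimately expand to nothing
    docker stop $(docker ps -q) || true
    docker system prune -af                        # reclaim disk before the next job reuses this runner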
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
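Unlike the cpu test jobs, the cuda *-test jobs first install the nvidia driver and nvidia-docker runtime on the runner (via the retried install_nvidia_utils_linux.sh step above) and export GPU_FLAG so the test container can see the GPUs. A minimal, illustrative sketch of how that flag is consumed:

# how GPU_FLAG set by the driver-install step reaches the test container
- run: echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"   # done inside the nick-fields/retry step above
- run: |
    # shellcheck disable=SC2086,SC2090   # ${GPU_FLAG:-} is deliberately left unquoted so that
    # "--gpus all" splits into two docker arguments; on cpu runners it expands to nothing at all
    docker run ${GPU_FLAG:-} --tty --detach "${DOCKER_IMAGE}"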
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda10_2-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
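In the test jobs, the docker run invocation expands ${GPU_FLAG:-} unquoted on purpose (hence the shellcheck SC2086/SC2090 disables): the value "--gpus all" has to word-split into two arguments, and when the variable is unset it has to vanish entirely so the same command line also works without a GPU. A small sketch that just prints the resulting argument vector; the fallback image name is copied from the job env and is only illustrative.

    #!/usr/bin/env bash
    # Show how the unquoted ${GPU_FLAG:-} expansion behaves.
    GPU_FLAG="--gpus all"   # as written to GITHUB_ENV by the nvidia install step
    # shellcheck disable=SC2086
    printf '%s\n' docker run --rm ${GPU_FLAG:-} "${DOCKER_IMAGE:-pytorch/manylinux-builder:cuda10.2}"
    # Unset GPU_FLAG and the flag disappears instead of becoming an empty argument.
    unset GPU_FLAG
    # shellcheck disable=SC2086
    printf '%s\n' docker run --rm ${GPU_FLAG:-} "${DOCKER_IMAGE:-pytorch/manylinux-builder:cuda10.2}"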
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda10_2-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda10_2-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
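The "Set UPLOAD_CHANNEL" step in the upload jobs only flips the channel to test when the pushed tag carries a release-candidate suffix; it relies on a plain bash glob match rather than a regex. A minimal sketch of that check follows; the tag names are hypothetical examples, not real releases.

    #!/usr/bin/env bash
    # Sketch of the -rcN suffix check used by the "Set UPLOAD_CHANNEL" steps.
    for GITHUB_REF_NAME in v1.11.0-rc3 v1.11.0 nightly; do
      if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
        echo "${GITHUB_REF_NAME}: UPLOAD_CHANNEL=test"
      else
        echo "${GITHUB_REF_NAME}: UPLOAD_CHANNEL left at its default"
      fi
    done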
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
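One quirk worth noting in the "Set BUILD_SPLIT_CUDA" steps: GITHUB_ENV is parsed as literal name=value lines, so echo "BUILD_SPLIT_CUDA='ON'" stores the single quotes as part of the value ('ON', not ON). That is presumably harmless as long as the builder scripts only test whether the variable is non-empty, but the quoting difference is easy to miss. A tiny sketch, using a temporary file in place of the runner-provided GITHUB_ENV and a placeholder EXAMPLE_FLAG for comparison.

    #!/usr/bin/env bash
    # GITHUB_ENV is a plain key=value file; quotes are not stripped when it is read back.
    GITHUB_ENV=$(mktemp)
    echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"   # value becomes 'ON' (quotes included)
    echo "EXAMPLE_FLAG=ON" >> "$GITHUB_ENV"         # value becomes ON
    cat "$GITHUB_ENV"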
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
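The build, test, and upload steps hand configuration to the container almost entirely through bare -e NAME flags: when docker run sees -e with no =value, it copies the value from the invoking environment, which here means the job-level env block plus whatever binary_populate_env.sh and GITHUB_ENV contributed. A short sketch of the two forms; PACKAGE_TYPE mirrors the job env, and the grep is only there to show the result.

    #!/usr/bin/env bash
    # Bare -e NAME copies NAME from the host environment into the container;
    # -e NAME=value sets it explicitly. Host variables that are unset are simply omitted.
    export PACKAGE_TYPE=libtorch
    docker run --rm -e PACKAGE_TYPE -e PKG_DIR=/artifacts alpine:3 env | grep -E '^(PACKAGE_TYPE|PKG_DIR)='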
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_3-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
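The "Build PyTorch binary" and "Test PyTorch binary" steps all follow the same detach-then-exec shape: start one long-lived container with --tty --detach, then docker exec several scripts into it so that binary_populate_env.sh can leave an environment file behind for later commands to source. A stripped-down sketch of that flow under stated assumptions: alpine:3, the /env path, and EXAMPLE_VAR are placeholders standing in for the real builder image and BINARY_ENV_FILE.

    #!/usr/bin/env bash
    # Start one long-lived container, run two commands in it that share state via a file,
    # then tear it down. Mirrors the docker run --detach / docker exec pattern used above.
    container_name=$(docker run --tty --detach alpine:3)
    docker exec -t "${container_name}" sh -c 'echo EXAMPLE_VAR=1 > /env'
    docker exec -t "${container_name}" sh -c '. /env && echo "EXAMPLE_VAR is ${EXAMPLE_VAR}"'
    docker rm -f "${container_name}"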
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
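# [editor's note, hedged sketch] The "Build PyTorch binary" and "Test PyTorch
# binary" steps above share one pattern: start the manylinux image detached
# with configuration forwarded as environment variables, then drive it with
# repeated `docker exec` calls so state produced by one script (e.g. the env
# file written by binary_populate_env.sh) is visible to the next. Schematic
# only -- the exec'd commands here are placeholders, not the real scripts:
export PACKAGE_TYPE=libtorch DESIRED_CUDA=cu116      # assumed demo values
DOCKER_IMAGE="pytorch/manylinux-builder:cuda11.6"    # same image family as the jobs above
container_name=$(docker run -e PACKAGE_TYPE -e DESIRED_CUDA \
  --tty --detach -v "$(pwd):/work" -w /work "${DOCKER_IMAGE}")
docker exec -t "${container_name}" bash -c 'echo "step 1: populate the build env file"'
docker exec -t "${container_name}" bash -c 'echo "step 2: build or test, reusing the same container"'
docker rm -f "${container_name}"                     # the CI cleanup step does this via docker stop/prune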
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm 
-v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
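# [editor's note, hedged] In the NVIDIA test jobs above, GPU_FLAG=--gpus all is
# written to "$GITHUB_ENV" only after the driver install succeeds, and the test
# step expands it unquoted as ${GPU_FLAG:-} so docker sees a real flag when it
# is set and nothing at all when it is not -- which is why SC2086/SC2090 are
# disabled there. Local illustration of the same expansion (the `--gpus all`
# invocation assumes an NVIDIA-enabled Docker host):
GPU_FLAG="--gpus all"                      # in CI this value comes from $GITHUB_ENV
# shellcheck disable=SC2086
docker run --rm ${GPU_FLAG:-} alpine:3.16 true   # flag present: forwarded as-is
unset GPU_FLAG
# shellcheck disable=SC2086
docker run --rm ${GPU_FLAG:-} alpine:3.16 true   # flag absent: expands to nothing, CPU-only run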
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the 
current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v 
"${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-cuda11_6-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-pre-cxx11-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned 
back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
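# [editor's note, hedged] The long `-e NAME` lists in the docker run commands
# above use the no-value form, which forwards each variable from the runner's
# job environment into the container instead of hard-coding values in the
# workflow. Quick demo of that passthrough:
export LIBTORCH_VARIANT="shared-with-deps"   # in CI this comes from the job-level env block
docker run --rm -e LIBTORCH_VARIANT alpine:3.16 sh -c 'echo "variant=${LIBTORCH_VARIANT}"'
# prints: variant=shared-with-deps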
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + 
retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
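# [editor's note, hedged sketch] The ROCm runner health checks above fail the
# job early (and disconnect the runner on failure) rather than letting a bad
# host time out two hours later. The GPU-count check simply greps rocminfo for
# gfx agent names and accepts only 2 or 4 devices; standalone version, which
# assumes a ROCm host with rocminfo installed:
ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
  echo "Failed to detect GPUs on the runner (found ${ngpu})"
  exit 1
fi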
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_0-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + 
retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_0-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_0-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_0-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image 
+ run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-pre-cxx11-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image 
+ run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-rocm5_1_1-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-rocm5_1_1-static-with-deps-pre-cxx11-test + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+      - name: Kill containers, clean up images
+        if: always()
+        run: |
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+          # Prune all of the docker images
+          docker system prune -af
diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml
deleted file mode 100644
index c39fb1c690c7..000000000000
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml
+++ /dev/null
@@ -1,8046 +0,0 @@
-# @generated DO NOT EDIT MANUALLY
-
-# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
-# Generation script: .github/scripts/generate_ci_workflows.py
-name: linux-binary-libtorch-pre-cxx11
-
-on:
-  push:
-    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
-    branches:
-      - nightly
-    tags:
-      # NOTE: Binary build pipelines should only get triggered on release candidate builds
-      # Release candidate tags look like: v1.11.0-rc1
-      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
-      - 'ciflow/binaries/*'
-      - 'ciflow/binaries_libtorch/*'
-  workflow_dispatch:
-
-env:
-  # Needed for conda builds
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  ANACONDA_USER: pytorch
-  AWS_DEFAULT_REGION: us-east-1
-  BINARY_ENV_FILE: /tmp/env
-  BUILD_ENVIRONMENT: linux-binary-libtorch-pre-cxx11
-  BUILDER_ROOT: /builder
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  IN_CI: 1
-  IS_GHA: 1
-  PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
-  PYTORCH_RETRY_TEST_CASES: 1
-  PYTORCH_ROOT: /pytorch
-  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-  SKIP_ALL_TESTS: 1
-concurrency:
-  group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  libtorch-cpu-shared-with-deps-pre-cxx11-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: linux.4xlarge
-    timeout-minutes: 240
-    env:
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      SKIP_ALL_TESTS: 1
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: pre-cxx11
-    steps:
-      - name: Display EC2 information
-        shell: bash
-        run: |
-          set -euo pipefail
-          function get_ec2_metadata() {
-            # Pulled from instance metadata endpoint for EC2
-            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
-            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
-          }
-          echo "ami-id: $(get_ec2_metadata ami-id)"
-          echo "instance-id: $(get_ec2_metadata instance-id)"
-          echo "instance-type: $(get_ec2_metadata instance-type)"
-      - name: Log in to ECR
-        env:
-          AWS_RETRY_MODE: standard
-          AWS_MAX_ATTEMPTS: 5
-        run: |
-          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
-          retry () {
-            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
-          }
-          retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
-            --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
-      - name: Chown workspace
-        run: |
-          retry () {
-            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
-          }
-          retry docker pull "${ALPINE_IMAGE}"
-          # Ensure the working directory
gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda10_2-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda10_2-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda10_2-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda10_2-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
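Note: the two conditional steps above gate real uploads on the pushed ref -- DRY_RUN is only disabled for pushes to the nightly branch or to non-ciflow tags, and a release-candidate tag such as v1.11.0-rc3 is routed to the test channel. A standalone sketch of that decision logic; the ref matching below approximates the workflow's if: expressions rather than copying them, and the "nightly" default for UPLOAD_CHANNEL is an assumption of the sketch:

    #!/usr/bin/env bash
    set -euo pipefail

    # Example inputs; in the workflow these come from the GitHub event context.
    GITHUB_EVENT_NAME="${1:-push}"
    GITHUB_REF="${2:-refs/tags/v1.11.0-rc3}"
    GITHUB_REF_NAME="${GITHUB_REF#refs/*/}"

    DRY_RUN="enabled"
    UPLOAD_CHANNEL="nightly"   # assumed default; the workflow leaves it unset otherwise

    # Real uploads only happen for pushes to the nightly branch or to release tags
    # (refs/tags/ciflow/* are CI-trigger tags, not releases).
    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || \
         { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN="disabled"
      fi
    fi

    # A tag ending in an rc suffix is uploaded to the "test" channel instead.
    if [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL="test"
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"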
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
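Note: each job above logs Docker in to the account's ECR registry behind the same inline three-attempt retry helper. A standalone version, assuming the AWS CLI and Docker are installed and AWS_DEFAULT_REGION is set; the account-id lookup uses the CLI's --query flag, which is equivalent to the workflow's grep/cut:

    #!/usr/bin/env bash
    set -euo pipefail

    # Run a command, retrying twice with a short back-off -- the same helper the
    # workflow defines inline in its "Log in to ECR" and "Pull Docker image" steps.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Resolve the AWS account id, then feed a fresh ECR password to "docker login".
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    retry aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"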
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
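Note: the "Set BUILD_SPLIT_CUDA" and GPU_FLAG steps above rely on the GITHUB_ENV file: a NAME=value line appended to it in one step becomes an environment variable in every later step of the same job. A minimal illustration of that mechanism, written as the contents of two separate run: steps rather than one script:

    # Contents of an earlier run: step -- append NAME=value to $GITHUB_ENV so the
    # variable is exported to every subsequent step of the same job.
    echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"

    # Contents of a later run: step in the same job -- GPU_FLAG is now set.  The
    # deliberately unquoted ${GPU_FLAG:-} expansion (empty on CPU-only runners) is
    # why the test steps above carry "# shellcheck disable=SC2086,SC2090".
    # shellcheck disable=SC2086
    docker run --rm ${GPU_FLAG:-} alpine:3 true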
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/libtorch/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-test - env: - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-binary-manywheel-master.yml b/.github/workflows/generated-linux-binary-manywheel-master.yml new file mode 100644 index 000000000000..d384b3e79bd0 --- /dev/null +++ b/.github/workflows/generated-linux-binary-manywheel-master.yml @@ -0,0 +1,294 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-manywheel + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-manywheel + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-manywheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + manywheel-py3_7-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
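+      # Like the chown and hold steps above, the container cleanup below runs
+      # with `if: always()`, so it executes even if the build fails or is
+      # cancelled, leaving the self-hosted runner clean for the next job.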
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" 
"${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml new file mode 100644 index 000000000000..783227fe9d31 --- /dev/null +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -0,0 +1,8370 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: linux-binary-manywheel + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BINARY_ENV_FILE: /tmp/env + BUILD_ENVIRONMENT: linux-binary-manywheel + BUILDER_ROOT: /builder + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_FINAL_PACKAGE_DIR: /artifacts + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_ROOT: /pytorch + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: linux-binary-manywheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + manywheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have 
drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always 
hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back 
to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
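+      # GPU_FLAG is only set (to `--gpus all`) after the NVIDIA driver install
+      # above, and the docker run expands it as `${GPU_FLAG:-}`, so the same
+      # test template also works for CPU-only configurations where the
+      # variable is unset (hence the SC2086/SC2090 shellcheck suppressions).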
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
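The upload job above only flips DRY_RUN to "disabled" for pushes to the nightly branch or to real (non-ciflow) tags, and it routes release-candidate tags to the test channel by matching an -rcN suffix on the ref name. A quick standalone version of that suffix check, with a placeholder tag in place of the real ${GITHUB_REF_NAME}:

    #!/usr/bin/env bash
    # Placeholder ref name; in the workflow this comes from ${GITHUB_REF_NAME}.
    GITHUB_REF_NAME="v1.12.0-rc3"

    # Same glob the "Set UPLOAD_CHANNEL" step uses: anything containing
    # "-rc<digit>" is a release candidate and goes to the "test" channel.
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "UPLOAD_CHANNEL=test"
    else
      echo "UPLOAD_CHANNEL left at its default"
    fi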
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
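The recurring "Chown workspace" / "Chown artifacts" steps exist because the build and test containers typically run as root, so anything they write into the bind-mounted workspace ends up root-owned on the self-hosted runner; mounting the directory into a throwaway Alpine container and chowning it back to the invoking user keeps the next job that lands on the same machine from hitting permission errors. The one-liner, with an explicit image tag standing in for ${ALPINE_IMAGE}:

    #!/usr/bin/env bash
    # Give everything under the current directory back to the invoking user.
    # --rm: discard the helper container; the only side effect is the chown.
    docker run --rm -v "$(pwd)":/v -w /v alpine:3.16 \
      chown -R "$(id -u):$(id -g)" .

The "$(id -u):$(id -g)" pair is expanded on the host before docker runs, so the chown targets the runner user's uid/gid even though the command executes inside the container.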
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
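The "Upload binaries" step above forwards the AWS and Anaconda credentials into the uploader container with bare -e NAME flags: docker copies each variable's value from the calling environment, so the secrets come from the step's env: block rather than being spelled out on the command line. A minimal, self-contained demonstration of that passthrough mechanism, using a throwaway alpine image and a placeholder token instead of the real miniconda3 uploader image and repository secrets:

    #!/usr/bin/env bash
    # Placeholder secret; in the job this is populated from the step's `env:` block.
    export ANACONDA_API_TOKEN="example-token"

    # Bare `-e NAME` (no value) copies the variable from the calling environment,
    # so the secret is forwarded without appearing in the command line itself.
    docker run --rm -e ANACONDA_API_TOKEN -e PKG_DIR=/artifacts alpine:3.16 env \
      | grep -E '^(ANACONDA_API_TOKEN|PKG_DIR)='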
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch 
binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
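Unlike the CUDA test jobs, the ROCm test jobs above run on linux.rocm.gpu runners and front-load a set of health checks before touching the artifacts: they point DOCKER_HOST at the runner's per-user docker socket (the rootless-Docker convention), dump rocm-smi and rocminfo output, and bail out early unless rocminfo reports exactly two or four GPU agents. The GPU-count gate, reproduced as a standalone script with brief comments:

    #!/usr/bin/env bash
    # Count GPU agents in rocminfo output; GPU agent names look like "Name: gfx90a".
    ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')

    # The runners are expected to expose either 2 or 4 GPUs; anything else is
    # treated as a broken machine and the job fails fast (the workflow then
    # kills runsvc.sh so the host drops out of the runner pool).
    if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
      echo "Failed to detect GPUs on the runner"
      exit 1
    fi
    echo "detected ${ngpu} GPUs"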
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_7-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v 
"${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_7-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_7-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_7-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch 
binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_8-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v 
"${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_8-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_8-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
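The test step just above runs in three phases: one script populates an env file (`${BINARY_ENV_FILE}`), a second generates the test script at `/run.sh`, and the final command sources the env file and executes the generated script with tracing. The repo-specific scripts are not reproduced here; the sketch below is only a schematic stand-in using temp files:

    #!/usr/bin/env bash
    set -euo pipefail

    # Schematic stand-ins for binary_populate_env.sh and binary_linux_test.sh.
    env_file=$(mktemp)   # plays the role of ${BINARY_ENV_FILE}
    run_script=$(mktemp) # plays the role of the generated /run.sh

    # Phase 1: populate an env file that later phases can source.
    echo 'export PACKAGE_TYPE=manywheel' > "${env_file}"

    # Phase 2: generate the test script rather than running tests inline.
    cat > "${run_script}" <<'EOF'
    echo "testing package type: ${PACKAGE_TYPE}"
    EOF

    # Phase 3: mirror 'source ${BINARY_ENV_FILE} && bash -x /run.sh'.
    # shellcheck disable=SC1090
    source "${env_file}" && bash -x "${run_script}"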
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
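The two "only for tagged pushes" steps in the upload job above gate real uploads on the ref: nightly-branch or tag pushes flip `DRY_RUN` to `disabled`, and an `-rcN` suffix in the tag name routes binaries to the `test` channel. The same glob test can be exercised in isolation (the sample ref names below are made up):

    #!/usr/bin/env bash
    set -euo pipefail

    # Same pattern match the workflow uses to detect release-candidate tags.
    for ref_name in v1.12.0-rc3 v1.12.0 nightly; do
      if [[ ${ref_name} = *-rc[0-9]* ]]; then
        echo "${ref_name}: UPLOAD_CHANNEL=test (release candidate)"
      else
        echo "${ref_name}: default channel"
      fi
    done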
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
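The recurring "Chown workspace" / "Chown artifacts" steps exist because files written from inside the build containers come out owned by root; a throwaway Alpine container with the same bind mount hands ownership back to the runner user. A sketch of the idiom, pinned to a hypothetical `alpine:3.16` rather than the workflow's `${ALPINE_IMAGE}`:

    #!/usr/bin/env bash
    set -euo pipefail

    workdir=$(mktemp -d)
    # Simulate a root-owned artifact left behind by a container build.
    docker run --rm -v "${workdir}:/v" -w /v alpine:3.16 sh -c 'touch root-owned.whl'
    ls -l "${workdir}"

    # Same fix the workflow applies: chown the mount back to the invoking user.
    docker run --rm -v "${workdir}:/v" -w /v alpine:3.16 chown -R "$(id -u):$(id -g)" .
    ls -l "${workdir}"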
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
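In the GPU test jobs, `GPU_FLAG` is written to `${GITHUB_ENV}` as a whole option string (`--gpus all` for CUDA; a list of `--device`/`--group-add` options for ROCm) and is then expanded unquoted as `${GPU_FLAG:-}` so it word-splits into separate `docker run` arguments, or vanishes entirely when unset, which is why those steps suppress shellcheck SC2086/SC2090. A small demonstration of the expansion behaviour (no docker involved):

    #!/usr/bin/env bash
    set -euo pipefail

    show_args () { printf 'argc=%d:' "$#"; printf ' [%s]' "$@"; echo; }

    GPU_FLAG="--gpus all"
    # Unquoted ${GPU_FLAG:-}: splits into two arguments, as docker run needs.
    # shellcheck disable=SC2086
    show_args ${GPU_FLAG:-} --tty

    unset GPU_FLAG
    # Unset (CPU runner case): the expansion disappears instead of passing "".
    # shellcheck disable=SC2086
    show_args ${GPU_FLAG:-} --tty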
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
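The "Set BUILD_SPLIT_CUDA" step in the CUDA 11.3 build job above relies on the standard GitHub Actions mechanism of appending `KEY=value` lines to the file named by `$GITHUB_ENV`; the runner re-reads that file between steps and exports the keys to every later step in the job. The effect can be imitated locally with a scratch file standing in for `$GITHUB_ENV` (purely illustrative; the real runner stores the value verbatim, quotes included, whereas sourcing strips them):

    #!/usr/bin/env bash
    set -euo pipefail

    # Scratch file standing in for the runner-provided $GITHUB_ENV path.
    GITHUB_ENV=$(mktemp)

    # "Step 1": persist a variable the same way the workflow step does.
    echo "BUILD_SPLIT_CUDA='ON'" >> "${GITHUB_ENV}"

    # Between steps the Actions runner loads the file; here we do it by hand.
    set -a
    # shellcheck disable=SC1090
    source "${GITHUB_ENV}"
    set +a

    # "Step 2": the value is now an ordinary environment variable.
    echo "BUILD_SPLIT_CUDA=${BUILD_SPLIT_CUDA}"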
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
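Every job ends with the same cleanup step: `docker stop $(docker ps -q)` is deliberately left unquoted so an empty container list expands to no arguments at all (hence the SC2046 suppression), `|| true` keeps the step green when there is nothing to stop, and `docker system prune -af` reclaims space on the self-hosted runner. The same guard, written as a reusable snippet:

    #!/usr/bin/env bash
    set -euo pipefail

    cleanup_docker () {
      # Intentionally unquoted: if "docker ps -q" prints nothing, docker stop
      # receives no arguments instead of an empty string.
      # shellcheck disable=SC2046
      docker stop $(docker ps -q) || true
      # Remove stopped containers, unused images, networks and build caches.
      docker system prune -af
    }

    cleanup_docker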
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch 
binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
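
Note: the ROCm test jobs above and below front-load a set of runner health checks before touching the build artifacts: they dump the OS and ROCm version info, run rocm-smi and rocminfo, count the visible GPUs, and, if anything fails, kill runsvc.sh so the unhealthy runner stops taking jobs. A condensed sketch of the GPU-count check (assuming, as in these jobs, that a healthy runner exposes exactly 2 or 4 GPUs):

      - name: Runner health check GPU count
        if: always()
        run: |
          # rocminfo prints one "Name: gfx<arch>" line per GPU agent.
          ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
          if [[ "$ngpu" != "2" && "$ngpu" != "4" ]]; then
            echo "Failed to detect GPUs on the runner (found: ${ngpu})"
            exit 1
          fi
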
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_9-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
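
Note: every "Build PyTorch binary" and "Test PyTorch binary" step above uses the same container choreography: start the builder image detached, forward the job's environment into it by name (`-e VAR` with no `=value` passes the host's value through), bind-mount the checkouts and an artifacts directory, and then drive the running container with repeated `docker exec` calls so several scripts share one environment. A stripped-down sketch with only two forwarded variables (the real jobs forward the whole `env:` block, and the final `docker exec` command here is a placeholder):

      - name: Build inside a detached builder container
        run: |
          set -x
          # "-e NAME" with no value forwards the host's value of NAME into the container.
          container_name=$(docker run \
            -e PACKAGE_TYPE \
            -e DESIRED_PYTHON \
            --tty \
            --detach \
            -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
            -w / \
            "${DOCKER_IMAGE}"
          )
          # Drive the same container with several docker exec calls; anything
          # written to /artifacts survives on the runner for the upload step.
          docker exec -t "${container_name}" bash -c "echo building in ${container_name}"
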
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v 
"${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_9-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_9-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c 
"source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cpu-build + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
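
Note: the test steps invoke `docker run` with `${GPU_FLAG:-}` even in CPU-only jobs like the one above. GPU jobs export `GPU_FLAG` (for example `--gpus all`, or the ROCm `--device` list) through `$GITHUB_ENV` in an earlier step; CPU jobs never set it, so the `:-` default expansion collapses to nothing instead of erroring. The variable is deliberately left unquoted so it word-splits into real arguments, which is why those steps carry `shellcheck disable=SC2086,SC2090`. A minimal sketch:

      - name: Run container with optional GPU flags
        run: |
          # ${GPU_FLAG:-} expands to the exported flags on GPU runners and to
          # nothing on CPU runners; unquoted on purpose so it word-splits.
          # shellcheck disable=SC2086
          docker run --rm ${GPU_FLAG:-} "${DOCKER_IMAGE}" true
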
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cpu-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
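
Note: the upload jobs above only go live for pushes to the nightly branch or to real release tags (anything under refs/tags/ciflow/ is excluded); everywhere else `DRY_RUN` keeps its default and binary_upload.sh presumably skips the actual publish. Release-candidate tags are additionally routed to the test channel. The two gating steps, condensed with comments:

      - name: Set DRY_RUN (only for tagged pushes)
        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }}
        run: echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
      - name: Set UPLOAD_CHANNEL (only for tagged pushes)
        if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }}
        run: |
          # Release-candidate tags such as v1.12.0-rc3 go to the "test" channel.
          if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
          fi
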
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda10_2-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
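
Note: the "Kill containers, clean up images" step that closes every job above stops whatever containers are still running and then reclaims disk space on the self-hosted runner. A sketch with the two shell subtleties spelled out:

      - name: Kill containers, clean up images
        if: always()
        run: |
          # The unquoted $(docker ps -q) is intentional: it word-splits into one
          # argument per running container ID, hence the SC2046 suppression.
          # "|| true" keeps the step green when there is nothing to stop.
          # shellcheck disable=SC2046
          docker stop $(docker ps -q) || true
          # Remove unused containers, networks, images, and build cache.
          docker system prune -af
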
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cuda10_2 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda10_2-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda10_2-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
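
Note: each configuration is wired as a build → test → upload chain: the build job publishes its wheels with upload-artifact-s3 under a per-configuration artifact name, and the downstream jobs declare `needs:` on it and pull the same artifact back down before mounting it into their containers. A condensed two-job sketch (build and test steps elided):

  manywheel-py3_10-cuda10_2-build:
    runs-on: linux.4xlarge
    steps:
      # ... build steps elided ...
      - uses: seemethere/upload-artifact-s3@v4
        with:
          name: manywheel-py3_10-cuda10_2        # one artifact name per configuration
          retention-days: 14
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/*
  manywheel-py3_10-cuda10_2-test:
    needs: manywheel-py3_10-cuda10_2-build       # test runs only after the build succeeds
    runs-on: linux.4xlarge.nvidia.gpu
    steps:
      - uses: seemethere/download-artifact-s3@v3
        name: Download Build Artifacts
        with:
          name: manywheel-py3_10-cuda10_2
          path: "${{ runner.temp }}/artifacts/"
      # ... the directory is later mounted into the test container as /final_pkgs ...
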
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
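
Note: the CUDA test jobs install the NVIDIA driver and nvidia-docker runtime on the runner at job time and export `GPU_FLAG=--gpus all` for the later `docker run`, wrapping the whole thing in the nick-fields/retry action (pinned to a commit) rather than the inline shell retry used for `docker pull`. As configured above, the action re-runs the command up to `max_attempts` times, each attempt bounded by `timeout_minutes`:

      - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a
        name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        with:
          timeout_minutes: 10    # per-attempt timeout
          max_attempts: 3        # re-run the whole command on failure
          command: |
            set -ex
            pushd pytorch
            bash .github/scripts/install_nvidia_utils_linux.sh
            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
            popd
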
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda10_2-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda10_2-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu102 + GPU_ARCH_VERSION: 10.2 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda10_2 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_3-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_3-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Set BUILD_SPLIT_CUDA + run: | + echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
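
Note: the CUDA 11.3 and 11.6 build jobs set `BUILD_SPLIT_CUDA` before the build step, and the bare `-e BUILD_SPLIT_CUDA` pass-through hands it to the builder container. Because `$GITHUB_ENV` is a plain NAME=value file whose value is taken literally, the single quotes become part of the stored value; the downstream build scripts presumably only test that the variable is non-empty. A sketch showing the effect (the second step is illustrative, not part of this workflow):

      - name: Set BUILD_SPLIT_CUDA
        run: |
          # $GITHUB_ENV takes everything after '=' literally, so this stores
          # the value 'ON' including the single quotes.
          echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
      - name: Inspect the value in a later step
        run: |
          # Prints: BUILD_SPLIT_CUDA=['ON']
          printf 'BUILD_SPLIT_CUDA=[%s]\n' "$BUILD_SPLIT_CUDA"
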
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_6-build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + pushd pytorch + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + popd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda11_6-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-rocm5_0 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_0-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_0-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test 
PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_0-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_0-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.0 + GPU_ARCH_VERSION: 5.0 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.0 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_0 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_1_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: linux.4xlarge + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Build PyTorch binary + run: | + set -x + mkdir -p artifacts/ + container_name=$(docker run \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" + - name: Chown artifacts + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - uses: seemethere/upload-artifact-s3@v4 + with: + name: manywheel-py3_10-rocm5_1_1 + retention-days: 14 + if-no-files-found: error + path: + ${{ runner.temp }}/artifacts/* + - name: Hold runner for 2 hours or until ssh sessions have drained + working-directory: pytorch/ + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_1_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_1_1-build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Test PyTorch binary + run: | + set -x + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BINARY_ENV_FILE \ + -e BUILDER_ROOT \ + -e BUILD_ENVIRONMENT \ + -e BUILD_SPLIT_CUDA \ + -e DESIRED_CUDA \ + -e DESIRED_DEVTOOLSET \ + -e DESIRED_PYTHON \ + -e GPU_ARCH_TYPE \ + -e GPU_ARCH_VERSION \ + -e IS_GHA \ + -e LIBTORCH_VARIANT \ + -e PACKAGE_TYPE \ + -e PYTORCH_FINAL_PACKAGE_DIR \ + -e PYTORCH_ROOT \ + -e SKIP_ALL_TESTS \ + --tty \ + --detach \ + -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ + -v "${GITHUB_WORKSPACE}/builder:/builder" \ + 
-v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ + -w / \ + "${DOCKER_IMAGE}" + ) + docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" + # Generate test script + docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" + docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + manywheel-py3_10-rocm5_1_1-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-rocm5_1_1-test + env: + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm5.1.1 + GPU_ARCH_VERSION: 5.1.1 + GPU_ARCH_TYPE: rocm + DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1 + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: manywheel-py3_10-rocm5_1_1 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 
308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-linux-binary-manywheel.yml b/.github/workflows/generated-linux-binary-manywheel.yml deleted file mode 100644 index a955984d7c75..000000000000 --- a/.github/workflows/generated-linux-binary-manywheel.yml +++ /dev/null @@ -1,11122 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: linux-binary-manywheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - manywheel-py3_7-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && 
"$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_7-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
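With SKIP_ALL_TESTS: 1 the test job is essentially a smoke check of the downloaded wheel rather than a full test run; the real commands come from the generated /run.sh. A minimal hand-rolled equivalent, assuming the wheel landed in ${RUNNER_TEMP}/artifacts (the exact checks in binary_linux_test.sh may differ):

#!/usr/bin/env bash
# Install the freshly built wheel and confirm it imports; this mirrors the
# spirit of the generated test script, not its exact contents.
set -euo pipefail
pip install "${RUNNER_TEMP}"/artifacts/torch-*.whl
python -c "import torch; print('torch', torch.__version__, 'cuda available:', torch.cuda.is_available())"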
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_7-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_7-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_7-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
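The two "Set DRY_RUN" / "Set UPLOAD_CHANNEL" steps encode the only branching logic in the upload jobs: uploads really happen only for pushes to nightly or to non-ciflow/ tags, and tags that look like release candidates are routed to the test channel. The same decision as a plain script (the enabled/nightly defaults are assumptions; the workflow sets them elsewhere):

#!/usr/bin/env bash
# Reproduce the ref-based gating used by the *-upload jobs.
set -euo pipefail

GITHUB_EVENT_NAME="${GITHUB_EVENT_NAME:-pull_request}"
GITHUB_REF="${GITHUB_REF:-refs/heads/some-branch}"
GITHUB_REF_NAME="${GITHUB_REF_NAME:-some-branch}"

DRY_RUN=enabled        # assumed default: no real upload
UPLOAD_CHANNEL=nightly # assumed default channel

is_real_tag() {
  [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF}" != refs/tags/ciflow/* ]]
}

if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
  if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || is_real_tag; then
    DRY_RUN=disabled
  fi
  # release-candidate tags (e.g. v1.11.0-rc3) go to the test channel
  if is_real_tag && [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
    UPLOAD_CHANNEL=test
  fi
fi

echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"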
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
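The "Preserve github env variables for use in docker" step dumps every GITHUB_* variable into a per-run file so a container can pick the whole set up in one go. Whether the workflow's helper scripts consume it exactly this way is an assumption, but --env-file is the standard way to feed such a KEY=VALUE file to docker run:

#!/usr/bin/env bash
# Capture the runner's GITHUB_* variables and hand the whole set to a container.
env_file="/tmp/github_env_${GITHUB_RUN_ID:-local}"
env | grep '^GITHUB' > "${env_file}" || true   # "|| true": tolerate an empty set outside of Actions
# --env-file expects one KEY=VALUE per line, which is what `env` emits
# (multi-line values would need escaping).
docker run --rm --env-file "${env_file}" alpine:3.15 env | grep '^GITHUB'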
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
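Unlike the CPU and ROCm variants, the CUDA test jobs run on linux.4xlarge.nvidia.gpu, install the NVIDIA driver and container runtime on the fly, and export GPU_FLAG=--gpus all so the later docker run ${GPU_FLAG:-} actually exposes the GPU. A quick way to confirm the flag does what it should (the image tag is only an example of something with nvidia-smi preinstalled):

#!/usr/bin/env bash
# With the NVIDIA container runtime installed, `--gpus all` makes the host
# GPUs visible inside the container; nvidia-smi should list them.
set -euo pipefail
GPU_FLAG="--gpus all"
# shellcheck disable=SC2086  # GPU_FLAG is intentionally word-split, as in the workflow
docker run --rm ${GPU_FLAG:-} nvidia/cuda:11.3.1-base-ubuntu18.04 nvidia-smi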
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
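Build artifacts travel between the -build, -test, and -upload jobs through S3-backed artifact actions keyed by name (manywheel-py3_8-cuda11_1 here). Roughly the same hand-off with the plain AWS CLI, using a placeholder bucket and prefix rather than whatever the actions use internally:

#!/usr/bin/env bash
# Producer/consumer sketch of the artifact hand-off; bucket and prefix are placeholders.
set -euo pipefail
ARTIFACT_NAME=manywheel-py3_8-cuda11_1
BUCKET="s3://example-ci-artifacts"
PREFIX="${GITHUB_RUN_ID:-local}/${ARTIFACT_NAME}"

# build job: publish everything it left in the artifacts directory
aws s3 cp --recursive "${RUNNER_TEMP}/artifacts/" "${BUCKET}/${PREFIX}/"

# test/upload jobs: pull it back down by the same name
aws s3 cp --recursive "${BUCKET}/${PREFIX}/" "${RUNNER_TEMP}/artifacts/"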
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
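Every job opens with the same "Display EC2 information" step, which is nothing more than three lookups against the instance metadata service. The same thing written as a loop (IMDSv1 endpoint, exactly as the step uses it):

#!/usr/bin/env bash
# Print a few well-known instance metadata categories for debugging.
set -euo pipefail
for category in ami-id instance-id instance-type; do
  echo "${category}: $(curl -fsSL "http://169.254.169.254/latest/meta-data/${category}")"
done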
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_8-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_8-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_8-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
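The "Build PyTorch binary" steps rely on docker's bare "-e NAME" form: each listed variable is forwarded from the runner's environment into the build container without its value appearing on the command line. A tiny standalone illustration of that pass-through (the image and command are placeholders, not the workflow's real build invocation):

    #!/usr/bin/env bash
    # Illustration of "-e VAR" environment pass-through, not the real build step.
    set -euo pipefail

    export PACKAGE_TYPE=manywheel DESIRED_PYTHON=3.9   # values the runner would have set

    # docker copies the host values of the named variables into the container.
    docker run --rm -e PACKAGE_TYPE -e DESIRED_PYTHON alpine:3.15 \
      sh -c 'echo "would build ${PACKAGE_TYPE} wheels for python ${DESIRED_PYTHON}"'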
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
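The GPU test jobs thread "--gpus all" through a GPU_FLAG variable written to GITHUB_ENV and expand it unquoted (hence the shellcheck disables), so CPU-only variants can leave it empty without changing the docker invocation. A minimal sketch of that optional-flag pattern (the CUDA image tag is only an example, and actually running it needs the NVIDIA container toolkit that install_nvidia_utils_linux.sh sets up):

    #!/usr/bin/env bash
    # Sketch of the optional GPU_FLAG expansion; requires an NVIDIA runtime to run for real.
    set -euo pipefail

    GPU_FLAG="--gpus all"   # a CPU-only job would simply leave this unset

    # shellcheck disable=SC2086  # intentional word-splitting of the (possibly empty) flag
    docker run --rm ${GPU_FLAG:-} nvidia/cuda:11.3.1-base-ubuntu20.04 nvidia-smi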
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_9-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_9-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cpu-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cpu-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda10_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda10_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda10_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda10_2-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda10_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda10_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu102 - GPU_ARCH_VERSION: 10.2 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda10.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda10_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_1-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
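    # Test-job flow from here: download the wheel built above from S3,
    # re-checkout pytorch and pytorch/builder, install the NVIDIA driver and
    # nvidia-docker runtime, then generate and run the binary test script
    # (/run.sh) inside the same builder image.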
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
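    # Upload-job flow from here: download the tested wheel from S3, enable a
    # real upload (DRY_RUN=disabled) only for nightly or non-ciflow tag pushes,
    # switch UPLOAD_CHANNEL to "test" for release-candidate tags, then run
    # .circleci/scripts/binary_upload.sh inside the miniconda image.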
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_3-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_3-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_3-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.3 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_5-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Set BUILD_SPLIT_CUDA - run: | - echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_5-build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - working-directory: pytorch/ - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-cuda11_5-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.5 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_3_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-rocm4_3_1 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_3_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_3_1-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_3_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_3_1-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.3.1 - GPU_ARCH_VERSION: 4.3.1 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.3.1 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_3_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_5_2-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - path: pytorch - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - submodules: recursive - repository: pytorch/builder - path: builder - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Build PyTorch binary - run: | - set -x - mkdir -p artifacts/ - container_name=$(docker run \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/manywheel/build.sh" - - name: Chown artifacts - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - uses: seemethere/upload-artifact-s3@v3 - with: - name: manywheel-py3_10-rocm4_5_2 - retention-days: 14 - if-no-files-found: error - path: - ${{ runner.temp }}/artifacts/* - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_5_2-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_5_2-build - runs-on: linux.4xlarge - timeout-minutes: 240 - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Test PyTorch binary - run: | - set -x - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BINARY_ENV_FILE \ - -e BUILDER_ROOT \ - -e BUILD_ENVIRONMENT \ - -e BUILD_SPLIT_CUDA \ - -e DESIRED_CUDA \ - -e DESIRED_DEVTOOLSET \ - -e DESIRED_PYTHON \ - -e GPU_ARCH_TYPE \ - -e GPU_ARCH_VERSION \ - -e IS_GHA \ - -e LIBTORCH_VARIANT \ - -e PACKAGE_TYPE \ - -e PYTORCH_FINAL_PACKAGE_DIR \ - -e PYTORCH_ROOT \ - -e SKIP_ALL_TESTS \ - --tty \ - --detach \ - -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \ - -v "${GITHUB_WORKSPACE}/builder:/builder" \ - -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \ - -w / \ - "${DOCKER_IMAGE}" - ) - docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" - # Generate test script - docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh" - docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh" - - name: Hold runner for 2 hours or until ssh sessions have drained - working-directory: pytorch/ - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
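Note on the "Test PyTorch binary" step above: it starts one long-lived detached container and then drives it with several `docker exec` calls, rather than a single `docker run`. A reduced sketch of that pattern; the environment variables (GITHUB_WORKSPACE, RUNNER_TEMP, DOCKER_IMAGE, PYTORCH_ROOT, BINARY_ENV_FILE) are assumed to be set by the job env, and most of the forwarded `-e` flags are omitted for brevity:

    #!/usr/bin/env bash
    set -x

    # Start a detached, TTY-backed container that keeps running so that
    # multiple commands can be exec'd into the same environment.
    container_name=$(docker run \
      --tty \
      --detach \
      -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
      -v "${RUNNER_TEMP}/artifacts:/final_pkgs" \
      -w / \
      "${DOCKER_IMAGE}")

    # Populate the binary build environment, generate the test script, then run it.
    docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
    docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
    docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"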
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - manywheel-py3_10-rocm4_5_2-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm4_5_2-test - env: - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm4.5.2 - GPU_ARCH_VERSION: 4.5.2 - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm4.5.2 - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
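Note on the "Log in to ECR" step above: it derives the registry host from the caller identity and pipes a short-lived password into `docker login`. A standalone sketch, assuming the AWS CLI is configured and AWS_DEFAULT_REGION is set as in the workflow env:

    #!/usr/bin/env bash
    set -euo pipefail

    # Resolve the account id of the current credentials; the workflows above
    # grep/cut the JSON output directly rather than using jq.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity | grep Account | cut -f4 -d\")

    # Log in to the account's private ECR registry with a temporary password.
    aws ecr get-login-password --region "$AWS_DEFAULT_REGION" \
      | docker login --username AWS \
          --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"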
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: manywheel-py3_10-rocm4_5_2 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
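Note on the upload steps above: DRY_RUN is only disabled for nightly or tag pushes, and release-candidate tags are routed to the "test" upload channel. A sketch of just the channel-selection logic, assuming GITHUB_REF_NAME holds the pushed tag (release-candidate tags look like v1.11.0-rc1):

    #!/usr/bin/env bash
    # Route release-candidate tags (e.g. v1.11.0-rc1) to the "test" channel;
    # any other ref keeps whatever channel the upload script defaults to.
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
    fi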
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml deleted file mode 100644 index ee483708dfcd..000000000000 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-bionic-cuda10.2-py3.9-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/slow/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-bionic-cuda10.2-py3.9-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-bionic-cuda10.2-py3.9-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
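Note on the "Calculate docker image tag" and "Check if image should be built" steps above: the CI image tag is the git tree hash of `.circleci/docker`, and a rebuild only happens when that hash differs from the merge base. A condensed sketch of that decision, assuming DOCKER_IMAGE_BASE and BASE_REVISION are set as in the workflow env; the on-base-branch case (using the parent commit) and the step-output plumbing are simplified to a plain echo:

    #!/usr/bin/env bash
    set -euxo pipefail

    # The image tag is the git tree hash of the Docker build context, so any
    # change under .circleci/docker produces a new tag.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    DOCKER_IMAGE="${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"

    # If the image already exists in the registry there is nothing to do.
    if docker manifest inspect "${DOCKER_IMAGE}"; then
      exit 0
    fi

    # Otherwise compare against the merge base: same tag but no image means the
    # previously published image has gone missing and the job should fail loudly.
    MERGE_BASE=$(git merge-base HEAD "${BASE_REVISION}")
    PREVIOUS_DOCKER_TAG=$(git rev-parse "${MERGE_BASE}:.circleci/docker")
    if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
      echo "ERROR: previous image is unavailable for the merge-base of this branch"
      exit 1
    fi

    # Any other mismatch means the context changed and the image must be rebuilt.
    echo "rebuild=yes"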
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: 1 - ENABLE_NOGPU_NO_AVX_TEST: 1 - ENABLE_NOGPU_NO_AVX2_TEST: 1 - ENABLE_SLOW_TEST: 1 - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 
- run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries 
as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh 
sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml deleted file mode 100644 index 91e4ff63e4c6..000000000000 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ /dev/null @@ -1,542 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-bionic-py3.7-clang9 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/noarch/*' - - 'ciflow/trunk/*' - - 'ciflow/xla/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-bionic-py3.7-clang9 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-bionic-py3.7-clang9-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-bionic-py3.7-clang9-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
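Note on the recurring "Chown workspace" steps above: files written by the jenkins user inside the build container are not owned by the runner user on the host, so each job resets ownership through a throwaway Alpine container. A minimal sketch of that cleanup, assuming ALPINE_IMAGE points at the mirror configured in the workflow env:

    #!/usr/bin/env bash
    # Reset ownership of everything under the current directory so later steps
    # (and the next job on this self-hosted runner) can modify or delete it.
    docker run --rm \
      -v "$(pwd)":/v \
      -w /v \
      "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .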
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: 1 - ENABLE_NOARCH_TEST: 1 - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-py3.7-clang9-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - 
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of 
the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-py3.7-clang9-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - 
# Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml deleted file mode 100644 index 5f37b48464b8..000000000000 --- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml +++ /dev/null @@ -1,512 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-bionic-rocm4.5-py3.7 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/linux/*' - - 'ciflow/rocm/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-bionic-rocm4.5-py3.7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm4.5-py3.7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-bionic-rocm4.5-py3.7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
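Note on the "Preserve github env variables for use in docker" step above: every GITHUB_* variable is dumped to a temp file that the later `docker run` picks up via `--env-file`, so code inside the container sees the same GitHub Actions context as the host. A short sketch of both halves of that handoff; "some/image" and the in-container command are placeholders, and GITHUB_RUN_ID / GITHUB_WORKSPACE are assumed to be provided by GitHub Actions:

    #!/usr/bin/env bash
    # Step 1: snapshot the GitHub-provided environment on the host runner.
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

    # Step 2: re-inject it when launching the build container, alongside any
    # explicitly forwarded variables.
    docker run --rm \
      --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      some/image \
      sh -c 'env | grep "^GITHUB"'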
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.rocm.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.rocm.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.rocm.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: Set DOCKER_HOST - run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" - - name: Runner health check system info - if: always() - run: | - cat /etc/os-release || true - cat /etc/apt/sources.list.d/rocm.list || true - cat /opt/rocm/.info/version || true - whoami - - name: Runner health check rocm-smi - if: always() - run: | - rocm-smi - - name: Runner health check rocminfo - if: always() - run: | - rocminfo - - name: Runner health check GPU count - if: always() - run: 
| - ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') - if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then - echo "Failed to detect GPUs on the runner" - exit 1 - fi - - name: Runner health check disconnect on failure - if: ${{ failure() }} - run: | - killall runsvc.sh - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: ROCm set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }} - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" - # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct - docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: actions/upload-artifact@v2 - name: Store Test Downloaded JSONs on Github - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store Test Reports on Github - if: always() - with: - name: test-reports - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-docs-push.yml b/.github/workflows/generated-linux-docs-push.yml deleted file mode 100644 index 0ad84fdef3e6..000000000000 --- a/.github/workflows/generated-linux-docs-push.yml +++ /dev/null @@ -1,392 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-docs-push - -on: - push: - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 0 0 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-docs-push - DOCKER_IMAGE_BASE: 
308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-docs-push-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-docs-push-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
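The three-attempt retry helper wrapped around the ECR login and image pulls above is the workflow's only defence against transient registry and network failures. A minimal standalone sketch of the same pattern, runnable outside of Actions (the pulled image is just an illustration):

    #!/usr/bin/env bash
    # Re-run a command up to three times, sleeping 1s and then 2s between attempts;
    # the exit status of the final attempt is what the caller sees.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Example usage: retry a pull that may hit a transient registry error.
    retry docker pull alpine:3.15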
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
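The "Calculate docker image tag" and "Check if image should be built" steps above hinge on one trick: git rev-parse HEAD:.circleci/docker returns the tree hash of the Docker build context, so the image tag only changes when something under that directory changes. A sketch of the same skip-if-present check, with a placeholder registry path instead of the real ECR base:

    #!/usr/bin/env bash
    set -euo pipefail
    # The tag is the git tree hash of the Docker build context directory.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    DOCKER_IMAGE="example.registry.io/pytorch-ci:${DOCKER_TAG}"

    # If a manifest with this tag already exists, the expensive image rebuild is skipped.
    if docker manifest inspect "${DOCKER_IMAGE}" >/dev/null 2>&1; then
      echo "image ${DOCKER_IMAGE} already exists, skipping rebuild"
    else
      echo "image ${DOCKER_IMAGE} not found, rebuild required"
    fi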
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - build-docs: - runs-on: linux.2xlarge - timeout-minutes: 240 - strategy: - matrix: - docs_type: [cpp, python] - needs: [build] - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - DOCS_TYPE: ${{ matrix.docs_type }} - WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
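Build outputs travel between jobs as a single S3-hosted zip: the build job's "Archive artifacts into zip" step bundles the wheel and test binaries, and this docs job, like the test jobs, restores the same layout after downloading the archive from S3. The essential round trip, with the upload/download actions stripped away:

    #!/usr/bin/env bash
    # Build job: -1 trades compression ratio for speed, since the archive only
    # needs to survive for the duration of one workflow run.
    zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json

    # Test / docs job: unpack into a fresh workspace and install the built wheel.
    unzip -o artifacts.zip
    pip install dist/*.whl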
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Generate netrc (only for docs-push) - if: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - env: - GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} - run: | - # set credentials for https pushing - echo "machine github.com" > "${RUNNER_TEMP}/.netrc" - echo "login pytorchbot" >> "${RUNNER_TEMP}/.netrc" - echo "password ${GITHUB_PYTORCHBOT_TOKEN}" >> "${RUNNER_TEMP}/.netrc" - - name: Build ${{ matrix.docs_type }} docs - run: | - set -ex - time docker pull "${DOCKER_IMAGE}" > /dev/null - # Convert refs/tags/v1.12.0rc3 into 1.12 - if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then - target="${BASH_REMATCH[1]}" - else - target="master" - fi - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e IN_CI \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SHA1="$GITHUB_SHA" \ - -e DOCS_VERSION="${target}" \ - -e DOCS_TYPE \ - -e PR_LABELS \ - -e WITH_PUSH \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${RUNNER_TEMP}/.netrc":/var/lib/jenkins/.netrc \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
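The docs build above derives its publish target from the Git ref: a release-candidate tag such as refs/tags/v1.12.0rc3 yields a DOCS_VERSION of 1.12, and anything else falls back to master. The conversion in isolation (GITHUB_REF is assigned here only for illustration; on a runner it is provided automatically):

    #!/usr/bin/env bash
    GITHUB_REF="refs/tags/v1.12.0rc3"   # illustrative value
    if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then
      target="${BASH_REMATCH[1]}"       # captures the major.minor pair
    else
      target="master"
    fi
    echo "DOCS_VERSION=${target}"       # prints: DOCS_VERSION=1.12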
- - uses: seemethere/upload-artifact-s3@v3 - name: Upload Python Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: pytorch.github.io/docs/master/ - s3-prefix: pytorch/${{ github.event.pull_request.number }} - - uses: seemethere/upload-artifact-s3@v3 - name: Upload C++ Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} - with: - retention-days: 14 - if-no-files-found: error - s3-bucket: doc-previews - path: cppdocs/ - s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs diff --git a/.github/workflows/generated-linux-docs.yml b/.github/workflows/generated-linux-docs.yml deleted file mode 100644 index 5709b1a7eef7..000000000000 --- a/.github/workflows/generated-linux-docs.yml +++ /dev/null @@ -1,382 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-docs - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/docs/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-docs - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-docs-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-docs-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
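Two hand-off mechanisms keep these multi-step jobs glued together: appending NAME=value lines to the file named by GITHUB_ENV makes a value visible as an environment variable in every later step of the same job, while the ::set-output command exposes it as steps.<id>.outputs.<name>, which the job-level outputs block re-exports so that downstream jobs can read needs.build.outputs.docker_image. The calculate-tag step earlier in this job boils down to the following sketch (DOCKER_IMAGE_BASE and GITHUB_ENV are supplied by the workflow environment):

    #!/usr/bin/env bash
    # Same-job hand-off: later steps see DOCKER_TAG and DOCKER_IMAGE as env vars.
    DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)
    echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}"
    echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}"

    # Cross-job hand-off: surfaced as steps.calculate-tag.outputs.docker_image and,
    # via the job's outputs block, as needs.build.outputs.docker_image downstream.
    echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"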
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - build-docs: - runs-on: linux.2xlarge - timeout-minutes: 240 - strategy: - matrix: - docs_type: [cpp, python] - needs: [build] - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - DOCS_TYPE: ${{ matrix.docs_type }} - WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
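The recurring "Chown workspace" step exists because the build and docs commands run as the jenkins user inside the CI container, so the bind-mounted checkout can end up owned by a different uid than the self-hosted runner's account; a throwaway Alpine container hands ownership back before the workspace is cleaned. The pattern reduced to its core (the plain alpine image name stands in for the mirrored ALPINE_IMAGE):

    #!/usr/bin/env bash
    # Re-own every file in the current workspace as the host user, using a container
    # so that no root-equivalent tooling is needed on the runner itself.
    docker run --rm -v "$(pwd)":/v -w /v alpine chown -R "$(id -u):$(id -g)" .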
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Build ${{ matrix.docs_type }} docs - run: | - set -ex - time docker pull "${DOCKER_IMAGE}" > /dev/null - # Convert refs/tags/v1.12.0rc3 into 1.12 - if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then - target="${BASH_REMATCH[1]}" - else - target="master" - fi - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e IN_CI \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SHA1="$GITHUB_SHA" \ - -e DOCS_VERSION="${target}" \ - -e DOCS_TYPE \ - -e PR_LABELS \ - -e WITH_PUSH \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" bash -c "sudo chown -R jenkins . && pip install dist/*.whl && ./.circleci/scripts/${DOCS_TYPE}_doc_push_script.sh" - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - uses: seemethere/upload-artifact-s3@v3 - name: Upload Python Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' }} - with: - retention-days: 14 - s3-bucket: doc-previews - if-no-files-found: error - path: pytorch.github.io/docs/master/ - s3-prefix: pytorch/${{ github.event.pull_request.number }} - - uses: seemethere/upload-artifact-s3@v3 - name: Upload C++ Docs Preview - if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'cpp' }} - with: - retention-days: 14 - if-no-files-found: error - s3-bucket: doc-previews - path: cppdocs/ - s3-prefix: pytorch/${{ github.event.pull_request.number }}/cppdocs diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml deleted file mode 100644 index 58f8cc3d0563..000000000000 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml +++ /dev/null @@ -1,541 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-vulkan-bionic-py3.7-clang9 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - - 'ciflow/vulkan/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-vulkan-bionic-py3.7-clang9 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.7-clang9 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-vulkan-bionic-py3.7-clang9-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region 
"$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
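In these workflows a generate-test-matrix job emits a JSON matrix that the test job consumes through fromJson(...), fanning the test suite out across one runner per (config, shard) pair. The real matrix comes from .github/scripts/generate_pytorch_test_matrix.py and is not reproduced here; the sketch below only illustrates the fields the test job reads (matrix.config, matrix.shard, matrix.num_shards, matrix.runner) and assumes an include-style matrix layout:

    #!/usr/bin/env bash
    # Illustrative only: emit a two-shard matrix as a step output in the shape the
    # downstream test job expects to read via fromJson(...).
    matrix='{"include":[{"config":"default","shard":1,"num_shards":2,"runner":"linux.2xlarge"},{"config":"default","shard":2,"num_shards":2,"runner":"linux.2xlarge"}]}'
    echo "::set-output name=matrix::${matrix}"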
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - 
run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries 
as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh 
sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml deleted file mode 100644 index e1dc026af70b..000000000000 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml +++ /dev/null @@ -1,336 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/bazel_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/bazel/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } 
- retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/build.sh' - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. 
- # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS= - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE= - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - name: Test - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - # detached container should get cleaned up by teardown_ec2_linux - export SHARD_NUMBER=0 - # TODO: Stop building test binaries as part of the build phase - # Make sure we copy test results from bazel-testlogs symlink to - # a regular directory ./test/test-reports - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e SHARD_NUMBER \ - -e NUM_TEST_SHARDS \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && sudo chown -R jenkins /dev && .jenkins/pytorch/test.sh && cp -Lr ./bazel-testlogs ./test/test-reports' - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: 'bazel-${{ github.job }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: 'bazel-${{ github.job }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml deleted file mode 100644 index 7a51acf31e11..000000000000 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml +++ /dev/null @@ -1,248 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-cuda11.3-py3.7-gcc7-no-ops - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.7-gcc7-no-ops - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-cuda11.3-py3.7-gcc7-no-ops-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-no-ops-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml deleted file mode 100644 index 4dd594483b8e..000000000000 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-cuda11.3-py3.7-gcc7 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-cuda11.3-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-cuda11.3-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint 
for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - 
AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop 
building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 
hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml deleted file mode 100644 index df0dd5fb57f9..000000000000 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3-clang5-mobile-build - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/linux/*' - - 'ciflow/mobile/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3-clang5-mobile-build - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-asan - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3-clang5-mobile-build-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3-clang5-mobile-build-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login 
--username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml deleted file mode 100644 index 29a14fd9f418..000000000000 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml +++ /dev/null @@ -1,238 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3-clang5-mobile-custom-build-static - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/linux/*' - - 'ciflow/mobile/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3-clang5-mobile-custom-build-static - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3-clang5-mobile-custom-build-static-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3-clang5-mobile-custom-build-static-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata 
ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml deleted file mode 100644 index 5b538547df1b..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ /dev/null @@ -1,541 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-clang7-asan - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/sanitizers/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-clang7-asan - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-asan - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-clang7-asan-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log 
in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 3 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: 
| - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as 
part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions 
have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml deleted file mode 100644 index 0005308beec3..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ /dev/null @@ -1,541 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-clang7-onnx - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/onnx/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-clang7-onnx - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang7-onnx - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-clang7-onnx-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: 
| - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as 
part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions 
have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml deleted file mode 100644 index 5778fe613dbc..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-gcc5.4 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: 1 - ENABLE_BACKWARDS_COMPAT_TEST: 1 - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - 
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of 
the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - 
# Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml deleted file mode 100644 index e9f11d265c7b..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml +++ /dev/null @@ -1,249 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-gcc7-no-ops - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-gcc7-no-ops - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-gcc7-no-ops-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-no-ops-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" 
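The "Log in to ECR" step above (and the docker pull steps that follow in this job) each inline the same tiny retry helper. Below is a minimal, self-contained bash sketch of that pattern; the docker pull at the end is only an illustrative use, not an additional workflow step.

#!/usr/bin/env bash
set -euo pipefail

# Retry helper used throughout these workflows: run the command, and on
# failure try again after 1s and then after 2s before giving up.
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

# Illustrative use: tolerate transient registry hiccups when pulling an image.
retry docker pull "${ALPINE_IMAGE}"
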
- - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml deleted file mode 100644 index 1bb791a329b3..000000000000 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-xenial-py3.7-gcc7 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: linux-xenial-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: linux-xenial-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - 
AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of 
the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # 
Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-10-15-py3-arm64.yml b/.github/workflows/generated-macos-10-15-py3-arm64.yml deleted file mode 100644 index ea97b3b9facf..000000000000 --- a/.github/workflows/generated-macos-10-15-py3-arm64.yml +++ /dev/null @@ -1,87 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-10-15-py3-arm64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: macos-10-15-py3-arm64 - COMPACT_JOB_NAME: macos-10-15-py3-arm64 - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 - - -jobs: - - build: - runs-on: macos-10.15 - env: - JOB_BASE_NAME: macos-10-15-py3-arm64 - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ - - uses: actions/upload-artifact@v2 - name: Store PyTorch Build Artifacts on GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - -concurrency: - group: macos-10-15-py3-arm64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml 
b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml deleted file mode 100644 index c07454967691..000000000000 --- a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml +++ /dev/null @@ -1,78 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-10-15-py3-lite-interpreter-x86-64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: macos-10-15-py3-lite-interpreter-x86-64 - COMPACT_JOB_NAME: macos-10-15-py3-lite-interpreter-x86-64 - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 - - # Set xcode xcode version to 12 - DEVELOPER_DIR: /Applications/Xcode_12.app/Contents/Developer - -jobs: - - build: - runs-on: macos-10.15 - env: - JOB_BASE_NAME: macos-10-15-py3-lite-interpreter-x86-64 - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh - - -concurrency: - group: macos-10-15-py3-lite-interpreter-x86-64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml deleted file mode 100644 index 41ae3259b527..000000000000 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ /dev/null @@ -1,228 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-11-py3-x86-64 - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -# For setup-miniconda, see https://github.com/conda-incubator/setup-miniconda/issues/179 -defaults: - run: - shell: bash -e -l {0} -env: - BUILD_ENVIRONMENT: macos-11-py3-x86-64 - COMPACT_JOB_NAME: 
macos-11-py3-x86-64 - IN_CI: 1 - IS_GHA: 1 - PYTORCH_RETRY_TEST_CASES: 1 - - # Set xcode xcode version to 12.4 - DEVELOPER_DIR: /Applications/Xcode_12.4.app/Contents/Developer - -jobs: - - build: - runs-on: macos-11 - env: - JOB_BASE_NAME: macos-11-py3-x86-64 - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Build - run: | - echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" - .jenkins/pytorch/macos-build.sh - - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ - - uses: actions/upload-artifact@v2 - name: Store PyTorch Build Artifacts on GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: macos-11 - ENABLE_DISTRIBUTED_TEST: '' - NUM_TEST_SHARDS: 2 - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - JOB_BASE_NAME: macos-11-py3-x86-64-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - uses: actions/download-artifact@v2 
- name: Download PyTorch Build Artifacts from GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: . - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - run: | - python3 -mpip install dist/*.whl - .jenkins/pytorch/macos-test.sh - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: actions/upload-artifact@v2 - name: Store Test Downloaded JSONs on Github - if: always() - with: - name: test-jsons - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store Test Reports on Github - if: always() - with: - name: test-reports - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: macos-11-py3-x86-64-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - -concurrency: - group: macos-11-py3-x86-64-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml new file mode 100644 index 000000000000..422416060fe6 --- /dev/null +++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml @@ -0,0 +1,564 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-arm64-binary-conda + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # 
NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-arm64-binary-conda + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 + CROSS_COMPILE_ARM64: 1 + +concurrency: + group: macos-arm64-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
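The NOTE above is why the per-job paths are exported from a step rather than from workflow-level env: the runner's temp directory is only reachable from inside a job (exposed to run steps as RUNNER_TEMP). A minimal sketch of the GITHUB_ENV hand-off the next step performs; the mkdir is illustrative only.

# Anything appended to $GITHUB_ENV becomes an environment variable for all
# subsequent steps of the same job.
PYTORCH_FINAL_PACKAGE_DIR="${RUNNER_TEMP}/artifacts"
echo "PYTORCH_FINAL_PACKAGE_DIR=${PYTORCH_FINAL_PACKAGE_DIR}" >> "${GITHUB_ENV}"

# Later steps in the job can then use the variable directly, for example:
mkdir -p "${PYTORCH_FINAL_PACKAGE_DIR}"
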
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
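[Note on the "Populate binary env" and "Install conda and dependencies" steps above] GitHub Actions applies anything appended to the file named by $GITHUB_ENV as environment variables for later steps of the same job, and prepends directories appended to $GITHUB_PATH to PATH for later steps. A minimal sketch of the mechanism, assuming a hypothetical variable name (MY_EXAMPLE_VAR) and annotating the Miniconda installer flags used above:

    # Variables appended to the $GITHUB_ENV file become environment variables in later steps of this job.
    echo "MY_EXAMPLE_VAR=${RUNNER_TEMP}/example" >> "${GITHUB_ENV}"   # MY_EXAMPLE_VAR is hypothetical
    # Directories appended to $GITHUB_PATH are prepended to PATH for later steps.
    echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
    # Miniconda installer flags: -b runs non-interactively (batch mode), -p sets the install prefix.
    /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"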
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
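[Note on the "Set UPLOAD_CHANNEL" step above] The channel switch is a bash glob match against the tag name. A short sketch with a hypothetical release-candidate tag, showing when UPLOAD_CHANNEL flips to "test":

    GITHUB_REF_NAME="v1.11.0-rc3"   # hypothetical tag, for illustration only; Actions normally provides this
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      # Release-candidate tags publish to the "test" channel.
      echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
    fi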
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-arm64-binary-conda.yml b/.github/workflows/generated-macos-arm64-binary-conda.yml deleted file mode 100644 index 40383e51bee6..000000000000 --- a/.github/workflows/generated-macos-arm64-binary-conda.yml +++ /dev/null @@ -1,575 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-arm64-binary-conda - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-arm64-binary-conda - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 - CROSS_COMPILE_ARM64: 1 - -concurrency: - group: macos-arm64-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml new file mode 100644 index 000000000000..617d1e372f49 --- /dev/null +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -0,0 +1,739 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-arm64-binary-wheel + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-arm64-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 + CROSS_COMPILE_ARM64: 1 + +concurrency: + group: macos-arm64-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
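[Note on the "Clean PyTorch checkout" / "Clean pytorch/builder checkout" steps above] git clean -fxd removes everything the checkout does not track, including ignored build outputs, which is what keeps reused checkouts from leaking artifacts between runs. A short annotated sketch of the same command:

    # -f  force removal (git refuses to clean without it under the default config)
    # -x  also remove ignored files (build outputs, caches), not just untracked ones
    # -d  recurse into untracked directories
    git clean -fxd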
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
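[Note on the "Upload binaries" step above] docker run -e NAME with no value forwards NAME from the runner's environment into the container (so it is simply absent when the secret or variable was never set, as on pull_request runs), while -e NAME=value sets it explicitly. A minimal sketch under hypothetical values, using a stand-in image rather than the real upload container:

    export DRY_RUN=disabled   # hypothetical: normally written by the "Set DRY_RUN" step
    # -e DRY_RUN forwards the variable from the host environment into the container;
    # -e PKG_DIR=/artifacts sets a value explicitly; -v mounts the host artifacts directory.
    docker run --rm \
      -e DRY_RUN \
      -e PKG_DIR=/artifacts \
      -v "${RUNNER_TEMP}/artifacts:/artifacts" \
      alpine env   # "alpine" and "env" are stand-ins; this just prints the container environment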
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-12 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-arm64-binary-wheel.yml b/.github/workflows/generated-macos-arm64-binary-wheel.yml deleted file mode 100644 index cb407a313425..000000000000 --- a/.github/workflows/generated-macos-arm64-binary-wheel.yml +++ /dev/null @@ -1,754 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-arm64-binary-wheel - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-arm64-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 - CROSS_COMPILE_ARM64: 1 - -concurrency: - group: macos-arm64-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_7-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml new file mode 100644 index 000000000000..d5c6eae896cb --- /dev/null +++ b/.github/workflows/generated-macos-binary-conda-nightly.yml @@ -0,0 +1,737 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-conda + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-conda + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
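Unlike the deleted workflow's plain `actions/checkout@v2`, the regenerated conda workflow pins the checkout to an explicit commit: the PR head SHA when the event is a pull request, otherwise the pushed SHA, using the `cond && a || b` idiom of GitHub expressions (safe here because a SHA is never empty, so the `||` branch only fires when the condition is false). It then runs `git clean -fxd` so untracked and ignored artifacts from an earlier checkout on the same machine cannot leak into this build. A hedged bash rendering of the ref choice, with the event data supplied as plain stand-in variables:

```bash
#!/usr/bin/env bash
# Sketch of the ref selection done by the checkout expression above.
# EVENT_NAME / PR_HEAD_SHA / PUSH_SHA stand in for the github.* context.
set -euo pipefail

EVENT_NAME="${EVENT_NAME:-push}"
PR_HEAD_SHA="${PR_HEAD_SHA:-}"                 # set only on pull_request events
PUSH_SHA="${PUSH_SHA:-0123456789abcdef0123}"   # placeholder commit id

if [[ "$EVENT_NAME" == "pull_request" && -n "$PR_HEAD_SHA" ]]; then
  ref="$PR_HEAD_SHA"
else
  ref="$PUSH_SHA"
fi

echo "would check out: $ref"
# The workflow then runs `git clean -fxd` in that checkout:
#   -f force, -x include ignored files, -d remove untracked directories.
```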
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
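The sccache step's `if:` expression gates on `push` events or same-repository pull requests because forked PRs do not receive repository secrets: without `MACOS_SCCACHE_S3_*`, the S3-backed compiler cache would be useless, so the install is skipped entirely and the build simply runs uncached. A hedged sketch of that predicate with stand-in variables in place of the `github.*` context:

```bash
#!/usr/bin/env bash
# Sketch of the "only for non-forked PRs, and pushes to trunk" gate above.
set -euo pipefail

EVENT_NAME="${EVENT_NAME:-pull_request}"
HEAD_REPO="${HEAD_REPO:-someuser/pytorch}"   # repo the PR branch lives in
BASE_REPO="${BASE_REPO:-pytorch/pytorch}"    # repo the workflow runs in

if [[ "$EVENT_NAME" == "push" || "$HEAD_REPO" == "$BASE_REPO" ]]; then
  echo "install sccache and export SCCACHE_BUCKET for the build"
else
  echo "forked PR: skip sccache, build without the compiler cache"
fi
```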
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
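The build and upload halves of each configuration are linked purely by artifact name and `needs:`: the build job uploads an artifact such as `conda-py3_9-cpu`, and the matching upload job declares `needs: conda-py3_9-cpu-build` and downloads the artifact of exactly that name into `${{ runner.temp }}/artifacts`. From the names visible throughout this diff, the convention appears to be `<package type>-py<python with "." replaced by "_">-<desired cuda>`; a hedged sketch of that derivation:

```bash
#!/usr/bin/env bash
# Sketch of the artifact naming convention visible in these generated jobs
# (an inference from the names shown, not taken from the generator script).
set -euo pipefail

PACKAGE_TYPE="${PACKAGE_TYPE:-conda}"
DESIRED_PYTHON="${DESIRED_PYTHON:-3.10}"
DESIRED_CUDA="${DESIRED_CUDA:-cpu}"

artifact_name="${PACKAGE_TYPE}-py${DESIRED_PYTHON//./_}-${DESIRED_CUDA}"
echo "build job uploads:    ${artifact_name}"
echo "upload job downloads: ${artifact_name} into \${RUNNER_TEMP}/artifacts"
```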
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
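The `Kill containers, clean up images` step that closes each upload job suppresses shellcheck SC2046 because `$(docker ps -q)` may expand to nothing. An alternative formulation that sidesteps the warning instead of silencing it, shown here only for comparison; the generated workflows keep the simpler `docker stop $(docker ps -q)` form:

```bash
#!/usr/bin/env bash
# Alternative container-cleanup sketch (not what the generated workflow uses).
set -euo pipefail

# xargs -r (GNU xargs, as on the Linux runners) runs `docker stop` only if
# there is at least one running container, so the empty case is a no-op.
docker ps -q | xargs -r docker stop || true

# Remove stopped containers, unused images, networks, and build cache so the
# self-hosted runner starts the next job from a clean slate.
docker system prune -af
```

Either way, running the cleanup under `if: always()` is what keeps a failed upload from leaving stale containers or a full disk behind on the shared runner.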
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + env: + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-conda.yml b/.github/workflows/generated-macos-binary-conda.yml deleted file mode 100644 index db148ed0e024..000000000000 --- a/.github/workflows/generated-macos-binary-conda.yml +++ /dev/null @@ -1,752 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-conda - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-conda - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_7-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_7-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - conda-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: conda-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml new file mode 100644 index 000000000000..eac3e4019cd3 --- /dev/null +++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml @@ -0,0 +1,761 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-libtorch-cxx11-abi + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-libtorch-cxx11-abi + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
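Several steps in the build job above ("Populate binary env", "Install conda and dependencies") persist state for later steps by appending `KEY=value` lines to the file named by $GITHUB_ENV and by appending a directory to $GITHUB_PATH; both take effect in subsequent steps of the same job, not in the step that writes them. A minimal sketch of the idiom, with placeholder names rather than the workflow's real variables:

  # Inside one step of a GitHub Actions job (names here are placeholders):
  echo "MY_WORK_DIR=${RUNNER_TEMP}/work" >> "${GITHUB_ENV}"   # visible to later steps as $MY_WORK_DIR
  echo "${RUNNER_TEMP}/tools/bin" >> "${GITHUB_PATH}"         # prepended to PATH for later steps
  # The value is not yet set within the current step; export it if it is needed immediately.
  export MY_WORK_DIR="${RUNNER_TEMP}/work"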
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
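The recurring "Chown workspace" cleanup exists because the upload step runs a container as root with the workspace bind-mounted at /v, so any files it creates are root-owned on the self-hosted runner; a follow-up alpine container then chowns everything back to the invoking user. A minimal sketch of the problem and the fix, with an illustrative file name and a stock alpine tag rather than the pipeline's pinned image:

  # A root container writes into the bind-mounted workspace...
  docker run --rm -v "$(pwd)":/v -w /v alpine:3.15 sh -c 'touch root-owned-file'
  ls -l root-owned-file        # owned by root:root on the host

  # ...so chown it back to the current host user, mirroring the step above.
  docker run --rm -v "$(pwd)":/v -w /v alpine:3.15 chown -R "$(id -u):$(id -g)" .
  ls -l root-owned-file        # now owned by the invoking user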
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - 
name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown 
workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> 
"${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: 
Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-cxx11-abi-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: cxx11-abi + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
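The "Chown workspace" step above pulls the alpine image through a small inline retry helper: `"$@"` re-runs the exact command it was given, and the two fallback attempts sleep one and two seconds before the helper gives up with the exit status of the last attempt. A standalone sketch of the same helper with a hypothetical flaky download standing in for the workflow's `docker pull`:

  # Same retry idiom as above: up to three attempts with a short back-off.
  retry () {
    "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
  }

  # Hypothetical usage; curl here stands in for `docker pull "${ALPINE_IMAGE}"`.
  retry curl -fsSL https://example.com/healthcheck -o /dev/null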
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-cxx11-abi + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
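The build jobs earlier in this workflow reuse persistent checkouts, so the "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps run `git clean -fxd` before building: `-f` forces removal, `-x` also deletes ignored files such as build outputs, and `-d` recurses into untracked directories. A short sketch of the effect on a throwaway repository (all paths below are hypothetical):

  # Demonstrate `git clean -fxd` on a scratch repo.
  git init -q /tmp/scratch-repo && cd /tmp/scratch-repo
  mkdir -p build && touch build/stale.o untracked.txt
  git clean -nxd    # dry run: lists what would be removed
  git clean -fxd    # -f force, -x include ignored files, -d include untracked directories
  ls -A             # only .git/ remains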
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml deleted file mode 100644 index 5f9ea6396f6c..000000000000 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml +++ /dev/null @@ -1,788 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-libtorch-cxx11-abi - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-libtorch-cxx11-abi - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have 
drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ 
env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml new file mode 100644 index 000000000000..b943ea97a970 --- /dev/null +++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml @@ -0,0 +1,761 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-libtorch-pre-cxx11 + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-libtorch-pre-cxx11 + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
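# Illustrative sketch only (not part of the generated workflow above), assuming a bash
# step in the same job: the "Populate binary env" steps that follow rely on the runner's
# environment files. Appending KEY=value pairs to the file named by $GITHUB_ENV exports
# those variables to every later step of the job, and appending a directory to
# $GITHUB_PATH prepends it to PATH for later steps. The paths shown are examples.
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
# A subsequent step in the same job can then use both without re-exporting anything:
mkdir -p "${PYTORCH_FINAL_PACKAGE_DIR}"   # variable was exported by the runner
command -v conda                          # resolved through the updated PATH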
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
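# Standalone sketch of the inline retry() helper used in the "Chown workspace" and
# "Log in to ECR" steps of the upload jobs: it attempts the command up to three times,
# sleeping 1s and then 2s between attempts, and fails the step only if all three fail.
# The docker pull invocation mirrors the workflow; ALPINE_IMAGE comes from its env block.
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}
retry docker pull "${ALPINE_IMAGE}"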
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - 
name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: shared-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + 
if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: 
Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until 
ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + # libtorch builds take a long time on github hosted runners + timeout-minutes: 720 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-pre-cxx11-build + env: + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_VARIANT: static-without-deps + DESIRED_DEVTOOLSET: pre-cxx11 + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-pre-cxx11 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
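The DRY_RUN and UPLOAD_CHANNEL steps above key entirely off the pushed ref: uploads are only armed for the nightly branch and for non-ciflow tags, and a release-candidate tag additionally routes the upload to the test channel. The RC detection is a plain bash glob match; in isolation it behaves like this (the tag values are illustrative, following the v1.11.0-rc1 convention noted in the workflow header):

for ref in v1.11.0-rc1 v1.11.0; do
  # *-rc[0-9]* matches anything containing "-rc" followed by a digit.
  if [[ ${ref} = *-rc[0-9]* ]]; then
    echo "${ref}: release candidate, UPLOAD_CHANNEL=test"
  else
    echo "${ref}: not an RC, default upload channel"
  fi
done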
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml deleted file mode 100644 index 0cac68d72912..000000000000 --- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml +++ /dev/null @@ -1,788 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-libtorch-pre-cxx11 - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-libtorch-pre-cxx11 - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have 
drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ 
env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
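Every macOS build job opens with a "Populate binary env" step because, as the NOTE comments explain, runner.temp is only resolvable inside a job, not at workflow level; appending KEY=value lines to the file named by $GITHUB_ENV is the standard Actions mechanism for making such a value visible to all later steps of the same job. A minimal illustration of that mechanism (EXAMPLE_DIR is a made-up variable, not one these workflows set):

# Early step: persist a derived path for the rest of the job.
echo "EXAMPLE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

# Any later step in the same job then sees it as an ordinary
# environment variable:
echo "artifacts will land in ${EXAMPLE_DIR}"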
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-build: - runs-on: macos-10.15 - # libtorch builds take a long time on github hosted runners - timeout-minutes: 720 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin 
"$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml new file mode 100644 index 000000000000..2dd93eea93ca --- /dev/null +++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml @@ -0,0 +1,737 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: macos-binary-wheel + +on: +# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: macos-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + SKIP_ALL_TESTS: 1 +concurrency: + group: macos-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
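These macOS build jobs install a pinned sccache binary and point it at the ossci-compiler-cache-circleci-v2 S3 bucket, so compiler invocations inside binary_macos_build.sh go through a shared cache; the MACOS_SCCACHE_S3_* keys exported at job level are what give non-forked builds access to that bucket. A quick way to confirm the cache is actually in play on such a runner (a sketch for local debugging, not part of the generated workflow):

# The job drops the binary at /usr/local/bin/sccache.
which sccache && sccache --version
# After a build, the stats report cache hits versus compile requests.
sccache --show-stats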
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: macos-10.15 + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + # For sccache access (only on non-forked PRs) + AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + steps: + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + # shellcheck disable=SC2129 + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + # shellcheck disable=SC2129 + echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" + - name: Install conda and dependencies + run: | + # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on + curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + chmod +x "${RUNNER_TEMP}/conda.sh" + /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" + echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Install sccache (only for non-forked PRs, and pushes to trunk) + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + run: | + sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache + sudo chmod +x /usr/local/bin/sccache + echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" + - name: Populate binary env + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + run: | + # shellcheck disable=SC1091 + source "${RUNNER_TEMP}/anaconda/bin/activate" + "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" + - uses: actions/upload-artifact@v2 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + wheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + env: + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-macos-binary-wheel.yml b/.github/workflows/generated-macos-binary-wheel.yml deleted file mode 100644 index 2a97b166dd73..000000000000 --- a/.github/workflows/generated-macos-binary-wheel.yml +++ /dev/null @@ -1,752 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-wheel - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_7-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" 
chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cpu-build: - runs-on: macos-10.15 - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - run: | - sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v2 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v 
"${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml deleted file mode 100644 index d9182993f0c1..000000000000 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ /dev/null @@ -1,539 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: parallelnative-linux-xenial-py3.7-gcc5.4 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: parallelnative-linux-xenial-py3.7-gcc5.4 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.7-gcc5.4 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: parallelnative-linux-xenial-py3.7-gcc5.4-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 
5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test 
binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours 
or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml deleted file mode 100644 index 0c2df1244222..000000000000 --- a/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ /dev/null @@ -1,237 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 4,10,16,22 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 
&& "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml deleted file mode 100644 index 366395af1f20..000000000000 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml +++ /dev/null @@ -1,237 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/libtorch/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 0,4,8,12,16,20 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: 
$(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml deleted file mode 100644 index 85e1ca4101bd..000000000000 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ /dev/null @@ -1,538 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-linux-bionic-cuda11.5-py3.7-gcc7 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 4,10,16,22 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-linux-bionic-cuda11.5-py3.7-gcc7 - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-linux-bionic-cuda11.5-py3.7-gcc7-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo 
"instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - 
AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop 
building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold 
runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml deleted file mode 100644 index 3c9c3c1199ab..000000000000 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ /dev/null @@ -1,540 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - - 'ciflow/slow/*' - - 'ciflow/slow-gradcheck/*' - schedule: - - cron: 0 */4 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts 
get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: 
standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 360 minutes - timeout-minutes: 360 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # 
TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 
test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml deleted file mode 100644 index 2e325fca8ad2..000000000000 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ /dev/null @@ -1,539 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/linux_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/linux/*' - - 'ciflow/scheduled/*' - schedule: - - cron: 45 0,4,8,12,16,20 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7 - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 - DEBUG: 1 -concurrency: - group: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - build: - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-build - outputs: - docker_image: ${{ steps.calculate-tag.outputs.docker_image }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || 
(sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - container_name=$(docker run \ - -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 - - name: Chown workspace - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Archive artifacts into zip - run: | - zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - artifacts.zip - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Clean up docker images - if: always() - run: | - # Prune all of the docker images - docker system prune -af - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - 
AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop 
building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold 
runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml deleted file mode 100644 index 11d24eafb62d..000000000000 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ /dev/null @@ -1,321 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-win-vs2019-cuda11.1-py3 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/scheduled/*' - - 'ciflow/win/*' - schedule: - - cron: 45 0,4,8,12,16,20 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.1-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "11.1" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TORCH_CUDA_ARCH_LIST: "7.0" - USE_CUDA: 1 - -concurrency: - group: periodic-win-vs2019-cuda11.1-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" 
- echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo 
"instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until 
we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml deleted file mode 100644 index f89ea43911e2..000000000000 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ /dev/null @@ -1,321 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: periodic-win-vs2019-cuda11.5-py3 - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/scheduled/*' - - 'ciflow/win/*' - schedule: - - cron: 45 4,10,16,22 * * * - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: periodic-win-vs2019-cuda11.5-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "11.5" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TORCH_CUDA_ARCH_LIST: "7.0" - USE_CUDA: 1 - -concurrency: - group: periodic-win-vs2019-cuda11.5-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - 
category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl 
-fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py 
- - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml deleted file mode 100644 index bccd46728c31..000000000000 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml +++ /dev/null @@ -1,507 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/android_ci_full_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build - -on: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/android/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata 
ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build-arm-v7a - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-arm-v7a - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-arm-v8a - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-arm-v8a - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-x86_32 - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-x86_32 - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-x86_64 - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - # detached container should get cleaned up by teardown_ec2_linux - #!/bin/bash -eo pipefail - # Pull Docker image and run build - time docker pull "${DOCKER_IMAGE}" >/dev/null - echo "${DOCKER_IMAGE}" - export container_name - container_name=$(docker run \ - -e BUILD_ENVIRONMENT=pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - docker cp "${GITHUB_WORKSPACE}/." "${container_name}:/var/lib/jenkins/workspace" - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins . && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "${container_name}" bash) 2>&1 - - # Copy dist folder back - export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}-x86_64 - docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." 
|| echo "Dist folder not found" - docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" - time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-Final-Artifcact - env: - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - set -eux - - docker_image_libtorch_android_x86_32="${DOCKER_IMAGE}-x86_32" - docker_image_libtorch_android_x86_64="${DOCKER_IMAGE}-x86_64" - docker_image_libtorch_android_arm_v7a="${DOCKER_IMAGE}-arm-v7a" - docker_image_libtorch_android_arm_v8a="${DOCKER_IMAGE}-arm-v8a" - - echo "docker_image_commit: ${DOCKER_IMAGE}" - echo "docker_image_libtorch_android_x86_32: ${docker_image_libtorch_android_x86_32}" - echo "docker_image_libtorch_android_x86_64: ${docker_image_libtorch_android_x86_64}" - echo "docker_image_libtorch_android_arm_v7a: ${docker_image_libtorch_android_arm_v7a}" - echo "docker_image_libtorch_android_arm_v8a: ${docker_image_libtorch_android_arm_v8a}" - - # x86_32 - time docker pull "${docker_image_libtorch_android_x86_32}" >/dev/null - export id_x86_32 - id_x86_32=$(docker run -e GRADLE_OFFLINE=1 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_32}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_x86_32}" bash) 2>&1 - - # arm-v7a - time docker pull "${docker_image_libtorch_android_arm_v7a}" >/dev/null - export id_arm_v7a - id_arm_v7a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v7a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_arm_v7a}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - docker cp "${id_arm_v7a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" - - # x86_64 - time docker pull "${docker_image_libtorch_android_x86_64}" >/dev/null - export id_x86_64 - id_x86_64=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_x86_64}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "${id_x86_64}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_x86_64" - docker cp "${id_x86_64}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_x86_64" - - # arm-v8a - time docker pull "${docker_image_libtorch_android_arm_v8a}" >/dev/null - export id_arm_v8a - id_arm_v8a=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins "${docker_image_libtorch_android_arm_v8a}") - - # shellcheck disable=SC1105 - ((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v8a" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - docker cp "${id_arm_v8a}:/var/lib/jenkins/workspace/build_android/install" "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" - - # Putting everything together - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v7a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v7a" - docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_64" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_x86_64" - docker cp "${GITHUB_WORKSPACE}/build_android_install_arm_v8a" "${id_x86_32}:/var/lib/jenkins/workspace/build_android_install_arm_v8a" - - # run gradle buildRelease - # shellcheck disable=SC1105 - 
((echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec \ - -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e AWS_DEFAULT_REGION \ - -e IS_GHA \ - -e PR_NUMBER \ - -e SHA1 \ - -e BRANCH \ - -e GITHUB_RUN_ID \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e PR_LABELS \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --user jenkins \ - -u jenkins -i "${id_x86_32}" bash) 2>&1 - - mkdir -p "${GITHUB_WORKSPACE}/build_android_artifacts" - docker cp "${id_x86_32}:/var/lib/jenkins/workspace/android/artifacts.tgz" "${GITHUB_WORKSPACE}/build_android_artifacts/" - - output_image="${DOCKER_IMAGE}-android-x86_32-gradle" - docker commit "${id_x86_32}" "${output_image}" - time docker push "${output_image}" - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS=${GITHUB_WORKSPACE}/build_android_artifacts/artifacts.tgz - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=prebuilt - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - uses: seemethere/upload-artifact-s3@v3 - name: Store PyTorch Android Build Artifacts on S3 - with: - name: ${{ env.BUILD_ENVIRONMENT }} - retention-days: 14 - if-no-files-found: error - path: - build_android_artifacts/artifacts.tgz - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml deleted file mode 100644 index 95924b65d8a2..000000000000 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml +++ /dev/null @@ -1,274 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/android_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/android/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password 
--region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - set -e - # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: - # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; - # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; - - echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" - time docker pull "${DOCKER_IMAGE}" >/dev/null - - export BUILD_LITE_INTERPRETER - BUILD_LITE_INTERPRETER="1" - if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then - BUILD_LITE_INTERPRETER="0" - fi - - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - # shellcheck disable=SC2016 - export id - id=$(docker run -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e BUILD_LITE_INTERPRETER \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "$(pwd):/var/lib/jenkins/workspace" \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") - - # shellcheck disable=SC2016 - export COMMAND - # shellcheck disable=SC2016 - COMMAND='((echo "export GRADLE_OFFLINE=1" && echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo "${COMMAND}" > ./command.sh && bash 
./command.sh - # Skip docker push as this job is purely for size analysis purpose. - # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS= - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=custom-build-single - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
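The Build step above relies on a pipe-into-exec pattern: the container is started detached, and the build commands are streamed into a single `docker exec ... bash` so that the exported variables and the gradle script run in one shell session. A stripped-down sketch of that pattern (the workflow passes many more -e flags; DOCKER_IMAGE and BUILD_LITE_INTERPRETER are assumed to be set already):

# Start a long-lived container to exec into (detached, workspace mounted).
id=$(docker run -t -d \
  --user jenkins \
  -v "$(pwd):/var/lib/jenkins/workspace" \
  -w /var/lib/jenkins \
  "${DOCKER_IMAGE}")

# Stream the commands into one interactive bash so the exports are visible
# to the gradle build script that follows them.
(
  echo "export GRADLE_OFFLINE=1"
  echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}"
  echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh"
) | docker exec -u jenkins -i "$id" bash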
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml deleted file mode 100644 index 7af766ba75aa..000000000000 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml +++ /dev/null @@ -1,274 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/android_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/android/*' - - 'ciflow/cpu/*' - - 'ciflow/linux/*' - - 'ciflow/trunk/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single - DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c - SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - TORCH_CUDA_ARCH_LIST: 5.2 - IN_CI: 1 - IS_GHA: 1 - # This is used for the phase of adding wheel tests only, will be removed once completed - IN_WHEEL_TEST: 1 - # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh - CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - PYTORCH_RETRY_TEST_CASES: 1 -concurrency: - group: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - - # building and testing in a single job since bazel runs only small subset of tests - build-and-test: - runs-on: linux.2xlarge - env: - JOB_BASE_NAME: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-build-and-test - NUM_TEST_SHARDS: 1 - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - 
--password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Calculate docker image tag - id: calculate-tag - run: | - DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) - echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >> "${GITHUB_ENV}" - echo "::set-output name=docker_tag::${DOCKER_TAG}" - echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" - - name: Check if image should be built - id: check - env: - BASE_REVISION: ${{ github.event.pull_request.base.sha || github.sha }} - run: | - set -x - # Check if image already exists, if it does then skip building it - if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}"; then - exit 0 - fi - if [[ "$BASE_REVISION" = "$(git rev-parse HEAD)" ]]; then - # if we're on the base branch then use the parent commit - MERGE_BASE=$(git rev-parse HEAD~) - else - # otherwise we're on a PR, so use the most recent base commit - MERGE_BASE=$(git merge-base HEAD "$BASE_REVISION") - fi - # Covers the case where a previous tag doesn't exist for the tree - # this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly - if ! 
git rev-parse "$MERGE_BASE:.circleci/docker"; then - echo "Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit" - exit 1 - fi - PREVIOUS_DOCKER_TAG=$(git rev-parse "$MERGE_BASE:.circleci/docker") - # If no image exists but the hash is the same as the previous hash then we should error out here - if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then - echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch" - echo " contact the PyTorch team to restore the original images" - exit 1 - fi - echo ::set-output name=rebuild::yes - - name: Build and push docker image - if: ${{ steps.check.outputs.rebuild }} - env: - DOCKER_SKIP_S3_UPLOAD: 1 - working-directory: .circleci/docker - run: | - export IMAGE_NAME=${DOCKER_IMAGE_BASE#308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/} - ./build_docker.sh - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - name: Output disk space left - run: | - sudo df -H - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Build - run: | - set -e - # Unlike other gradle jobs, it's not worth building libtorch in a separate CI job and share via docker, because: - # 1) Not shareable: it's custom selective build, which is different from default libtorch mobile build; - # 2) Not parallelizable by architecture: it only builds libtorch for one architecture; - - echo "DOCKER_IMAGE: ${DOCKER_IMAGE}" - time docker pull "${DOCKER_IMAGE}" >/dev/null - - export BUILD_LITE_INTERPRETER - BUILD_LITE_INTERPRETER="1" - if [[ "${BUILD_ENVIRONMENT}" == *"full-jit" ]]; then - BUILD_LITE_INTERPRETER="0" - fi - - git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 - # shellcheck disable=SC2016 - export id - id=$(docker run -e BUILD_ENVIRONMENT \ - -e JOB_BASE_NAME \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e PR_LABELS \ - -e SKIP_SCCACHE_INITIALIZATION=1 \ - -e TORCH_CUDA_ARCH_LIST \ - -e BUILD_LITE_INTERPRETER \ - -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --tty \ - --detach \ - --user jenkins \ - -v "$(pwd):/var/lib/jenkins/workspace" \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - -t -d -w /var/lib/jenkins "${DOCKER_IMAGE}") - - # shellcheck disable=SC2016 - export COMMAND - # shellcheck disable=SC2016 - COMMAND='((echo "export GRADLE_OFFLINE=1" && echo "export BUILD_LITE_INTERPRETER=${BUILD_LITE_INTERPRETER}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo "${COMMAND}" > ./command.sh && bash 
./command.sh - # Skip docker push as this job is purely for size analysis purpose. - # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload binary build size statistics (Click Me) - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - run: | - # The artifact file is created inside docker container, which contains the result binaries. - # Now unpackage it into the project folder. The subsequent script will scan project folder - # to locate result binaries and report their sizes. - # If artifact file is not provided it assumes that the project folder has been mounted in - # the docker during build and already contains the result binaries, so this step can be skipped. - export ARTIFACTS= - if [ -n "${ARTIFACTS}" ]; then - tar xf "${ARTIFACTS}" -C "${GITHUB_WORKSPACE}" - cd "${GITHUB_WORKSPACE}" - fi - COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) - export COMMIT_TIME - ANDROID_BUILD_TYPE=custom-build-single - export ANDROID_BUILD_TYPE - pip3 install requests==2.26 boto3==1.16.34 - python3 -m tools.stats.upload_binary_size_to_scuba "android" || exit 0 - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
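The "Check if image should be built" step in the two workflows above implements a simple rule: the docker tag is the tree hash of .circleci/docker, so a rebuild is only needed when that hash differs from the one at the merge-base, while a missing image with an unchanged hash is treated as an error. A condensed sketch of that check (it drops the on-base-branch special case, which falls back to the parent commit):

DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker)

# If an image for this tree hash already exists, nothing to do.
if docker manifest inspect "${DOCKER_IMAGE_BASE}:${DOCKER_TAG}" >/dev/null 2>&1; then
  exit 0
fi

MERGE_BASE=$(git merge-base HEAD "${BASE_REVISION}")
PREVIOUS_DOCKER_TAG=$(git rev-parse "${MERGE_BASE}:.circleci/docker")

if [ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]; then
  # Hash unchanged but no image: the previous image was likely deleted.
  echo "ERROR: previous image is not available for the merge-base of this branch" >&2
  exit 1
fi

echo "::set-output name=rebuild::yes"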
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml deleted file mode 100644 index 06db1e07c519..000000000000 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ /dev/null @@ -1,304 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: win-vs2019-cpu-py3 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cpu/*' - - 'ciflow/trunk/*' - - 'ciflow/win/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: win-vs2019-cpu-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "cpu" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - USE_CUDA: 0 - -concurrency: - group: win-vs2019-cpu-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cpu-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - 
# Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.4xlarge - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cpu-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - uses: 
seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cpu-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml deleted file mode 100644 index 8e84f9d53475..000000000000 --- 
a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ /dev/null @@ -1,323 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: win-vs2019-cuda11.3-py3 - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/trunk/*' - - 'ciflow/win/*' - branches: - - master - - release/* - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: win-vs2019-cuda11.3-py3 - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "11.3" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TORCH_CUDA_ARCH_LIST: "7.0" - USE_CUDA: 1 - -concurrency: - group: win-vs2019-cuda11.3-py3-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - 
name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 0 - PR_BODY: ${{ github.event.pull_request.body }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 - RUN_SMOKE_TESTS_ONLY_ON_PR: True - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: 
bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-windows-binary-conda-nightly.yml b/.github/workflows/generated-windows-binary-conda-nightly.yml new file mode 
100644 index 000000000000..32dc4f4eb945 --- /dev/null +++ b/.github/workflows/generated-windows-binary-conda-nightly.yml @@ -0,0 +1,3638 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-conda + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_conda/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-conda + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-conda-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + conda-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
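The retry helper that keeps reappearing in these jobs (ECR login, workspace chown, docker pull) is just a three-attempt wrapper with a short back-off, shown on its own here for reference:

retry () {
  # Run the command, retrying twice with a 1s and then a 2s pause.
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

# Usage, matching the Chown workspace step above:
retry docker pull "${ALPINE_IMAGE}"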
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
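The DRY_RUN / UPLOAD_CHANNEL gating above is expressed as GitHub Actions `if:` conditions; the following is only a rough bash rendering of the same logic for readability, not part of the workflow (GITHUB_EVENT_NAME, GITHUB_REF and GITHUB_REF_NAME are the standard Actions variables):

if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
  # Nightly-branch pushes and non-ciflow tag pushes do a real (non-dry-run) upload.
  if [[ "${GITHUB_REF}" == "refs/heads/nightly" || ( "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF}" != refs/tags/ciflow/* ) ]]; then
    echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
  fi
  # Release-candidate tags (vX.Y.Z-rcN) additionally route the package to the "test" channel.
  if [[ "${GITHUB_REF}" == refs/tags/* && "${GITHUB_REF}" != refs/tags/ciflow/* && "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
    echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
  fi
fi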
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
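The retry helper defined in the "Chown workspace" step above re-invokes whatever command it is given up to three times, sleeping one and then two seconds between failed attempts, and propagates the exit status of the last attempt. Written out long-hand (retry_verbose is an illustrative name, not part of the workflow):

    retry_verbose () {
      "$@" && return 0   # first attempt
      sleep 1
      "$@" && return 0   # second attempt after a short pause
      sleep 2
      "$@"               # final attempt; its exit status is returned
    }
    retry_verbose docker pull "${ALPINE_IMAGE}"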
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
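The "Set UPLOAD_CHANNEL" step above switches the upload to the test channel whenever the pushed tag carries an -rcN suffix. The same glob match can be exercised locally, e.g. with a hypothetical tag name:

    GITHUB_REF_NAME="v1.12.0-rc3"
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      echo "release candidate -> upload to the test channel"
    fi
    # "v1.12.0" (no rc suffix) would not match, so UPLOAD_CHANNEL keeps its default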
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_7-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
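In the "Upload binaries" step above, docker run flags of the form -e NAME with no value attached forward that variable from the step's environment into the container, whereas -e PKG_DIR=/artifacts sets an explicit value; that is how the AWS and Anaconda credentials reach binary_upload.sh without being baked into the image. A minimal stand-alone sketch of the same pattern (SOME_TOKEN is hypothetical):

    export SOME_TOKEN="abc123"
    # inside the container, env will show SOME_TOKEN=abc123 and PKG_DIR=/artifacts
    docker run --rm -e SOME_TOKEN -e PKG_DIR=/artifacts alpine:3.16 env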
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
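The "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps rely on git clean -fxd: -f forces the deletion, -x also removes files normally ignored via .gitignore (build artifacts, caches), and -d removes untracked directories as well. When reproducing this locally it can be worth previewing first:

    git clean -nxd   # dry run: list what would be removed
    git clean -fxd   # actually remove untracked and ignored files and directories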
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
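The get_ec2_metadata helper in the "Display EC2 information" steps is a thin wrapper around the EC2 instance metadata service at 169.254.169.254; any documented metadata category can be read the same way, for example:

    # availability zone of the current runner (IMDSv1-style request)
    curl -fsSL "http://169.254.169.254/latest/meta-data/placement/availability-zone"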
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
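The "Kill containers, clean up images" steps deliberately leave $(docker ps -q) unquoted so that multiple container IDs expand into separate arguments (hence the shellcheck SC2046 suppression), and append || true so an empty container list does not fail the job. A quote-safe equivalent, if one wanted to avoid the suppression, would be:

    # -r: do nothing when there are no running containers
    docker ps -q | xargs -r docker stop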
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
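The upload job above only disables `DRY_RUN` for pushes to the `nightly` branch or to release-candidate tags, and it routes RC tags to the `test` channel by matching an `-rc<digit>` suffix on the ref name. A small sketch of that tag check, assuming a `GITHUB_REF_NAME` such as `v1.12.0-rc3` (the version string and the `nightly` default are illustrative, not taken from the workflow):

    #!/usr/bin/env bash
    set -euo pipefail

    # Illustrative ref name; in the workflow this is provided by GitHub Actions.
    GITHUB_REF_NAME="${GITHUB_REF_NAME:-v1.12.0-rc3}"

    # Assumed default channel for this sketch when no RC suffix matches.
    UPLOAD_CHANNEL="nightly"
    if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then
      # Release candidates go to the "test" channel, exactly as in the upload job.
      UPLOAD_CHANNEL="test"
    fi
    echo "UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"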
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
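For reference, the "Display EC2 information" step in each of these Windows jobs queries the EC2 instance-metadata endpoint at 169.254.169.254 to log the AMI, instance id, and instance type of the self-hosted runner. A self-contained sketch of that helper (IMDSv1-style, as used in the step; it only returns data when run on an EC2 instance):

    #!/usr/bin/env bash
    set -euo pipefail

    # Fetch a single metadata category from the EC2 instance-metadata service.
    get_ec2_metadata() {
      local category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id: $(get_ec2_metadata ami-id)"
    echo "instance-id: $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"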
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
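As the comment above notes, the first "Populate binary env" step that follows appends `KEY=VALUE` lines to the file named by `$GITHUB_ENV`; that file is how one step exports variables to the later steps of the same job, since each step runs in a fresh shell. (As reproduced here, the third echo simply prints `WIN_PACKAGE_WORK_DIR` to the job log rather than appending it to the env file.) A minimal sketch of the mechanism, meant to run inside a GitHub Actions bash step where the runner provides `RUNNER_TEMP` and `GITHUB_ENV`:

    #!/usr/bin/env bash
    set -euo pipefail

    # Appending KEY=VALUE lines to the file named by $GITHUB_ENV exports those
    # variables to every subsequent step of the same job.
    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
    # A later step can then read ${BINARY_ENV_FILE} and
    # ${PYTORCH_FINAL_PACKAGE_DIR} as ordinary environment variables.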
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
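The "Upload binaries" step above runs the upload inside a pinned miniconda3 container and forwards the credentials with bare `-e NAME` flags, so the secret values are taken from the runner's environment and never appear on the command line. A reduced sketch of that invocation pattern — the image name, paths, and final command are placeholders, not the workflow's actual `binary_upload.sh` call:

    #!/usr/bin/env bash
    set -euo pipefail

    # Placeholder values for this sketch; the workflow uses a pinned ECR
    # miniconda3 image and runs .circleci/scripts/binary_upload.sh inside it.
    IMAGE="python:3.9-slim"
    ARTIFACTS_DIR="${RUNNER_TEMP:-/tmp}/artifacts"

    # A bare "-e NAME" forwards NAME from the host environment into the
    # container, while "-e NAME=value" sets it explicitly; secrets therefore
    # never show up in the command line or the workflow log.
    docker run --rm -i \
      -e ANACONDA_API_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e PKG_DIR=/artifacts \
      -v "${ARTIFACTS_DIR}:/artifacts" \
      "${IMAGE}" \
      bash -c 'echo "upload script would run here"'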
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
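Each upload job ends with the "Kill containers, clean up images" step shown at the start of this hunk: it stops any containers still running and prunes all images so the shared self-hosted runner starts the next job from a clean slate. The same cleanup as a standalone sketch:

    #!/usr/bin/env bash
    # Intentionally not using "set -e": cleanup should not fail the job.

    # "docker ps -q" may expand to nothing, which is fine here.
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove all stopped containers, unused networks, and all images.
    docker system prune -af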
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + conda-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: conda-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml deleted file mode 100644 index f1ff574a1f7e..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml +++ /dev/null @@ -1,4618 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-cxx11-abi - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-libtorch-cxx11-abi - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_RETRY_TEST_CASES: 1 - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: windows-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
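The NOTE above refers to the ${GITHUB_ENV} mechanism: any KEY=VALUE line appended to that file in one step becomes an environment variable for every later step of the same job, which is why the RUNNER_TEMP-derived paths are populated per job rather than in the workflow-level env: block (as the note says, runner.temp is not available at the workflow level). A minimal sketch of the pattern, with a hypothetical MY_WORK_DIR standing in for the real variables:

    # Step 1 of a job: persist a value derived from the runner's temp dir.
    echo "MY_WORK_DIR=${RUNNER_TEMP}/work" >> "${GITHUB_ENV}"

    # Step 2 (any later step in the same job): the variable is now part of
    # the environment, so scripts can use it without re-deriving the path.
    mkdir -p "${MY_WORK_DIR}"
    echo "working in ${MY_WORK_DIR}"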
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
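In the ECR login step above, the AWS account id is extracted from the JSON printed by `aws sts get-caller-identity` by grepping for the Account line and cutting on quotes. Shown only as a sketch, the same value can be obtained by letting the CLI do the filtering, which makes the intent of that pipeline explicit:

    #!/usr/bin/env bash
    set -euo pipefail
    # Equivalent to the grep/cut pipeline in the step above.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION:-us-east-1}" |
      docker login --username AWS --password-stdin \
        "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION:-us-east-1}.amazonaws.com"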
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
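One detail of the Upload binaries step above worth spelling out: every `-e NAME` flag passed to `docker run` without an `=value` copies NAME from the host environment into the container, so the credentials set in the step's env: block reach binary_upload.sh without ever appearing on the command line. A self-contained sketch of that behavior (SOME_TOKEN and the alpine image are placeholders, not part of the workflow):

    #!/usr/bin/env bash
    set -euo pipefail
    export SOME_TOKEN="value-that-stays-off-the-command-line"

    # `-e SOME_TOKEN` (no value) inherits the variable from this shell;
    # `-e PKG_DIR=/artifacts` sets an explicit value, as in the step above.
    docker run --rm \
      -e SOME_TOKEN \
      -e PKG_DIR=/artifacts \
      alpine:3.15 \
      sh -c 'echo "PKG_DIR=${PKG_DIR}; token length: ${#SOME_TOKEN}"'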
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
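The ECR login and image pull in the upload job above wrap flaky network calls in a small retry helper that re-runs the command after one-second and two-second back-offs. The same three-attempt pattern as a standalone sketch (the curl target is only an illustration of a transiently failing command):

# Three-attempt retry helper, as used around `aws ecr get-login-password` and `docker pull`.
set -euo pipefail

retry () {
  # Try once; on failure wait 1s and retry, then wait 2s and retry a final time.
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

retry curl -fsSL https://example.com/health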
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
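The upload job above only turns DRY_RUN off for pushes to the nightly branch or to non-ciflow tags, and routes -rcN tags to the test channel. A bash-only sketch of those checks, assuming GITHUB_EVENT_NAME, GITHUB_REF, and GITHUB_REF_NAME are supplied by the runner; the workflow expresses the first two conditions as step-level if: expressions rather than shell:

# Illustrative folding of the DRY_RUN / UPLOAD_CHANNEL decisions into plain bash.
set -euo pipefail

if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
  if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || \
     { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
    echo "DRY_RUN=disabled" >> "${GITHUB_ENV}"
  fi
  # Release-candidate tags (ending in -rc<N>) upload to the "test" channel instead of the default.
  if [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
    echo "UPLOAD_CHANNEL=test" >> "${GITHUB_ENV}"
  fi
fi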
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
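The "Kill containers, clean up images" step above leaves the self-hosted Linux runner clean for the next job: `docker ps -q` may expand to nothing (hence the SC2046 suppression) and `|| true` keeps an empty stop from failing the step before the prune. A standalone sketch, with an `xargs -r` variant shown only as an illustrative alternative that avoids the unquoted expansion:

# Runner cleanup as in the step above.
# shellcheck disable=SC2046  # unquoted $(docker ps -q) is intentional; it may be empty
docker stop $(docker ps -q) || true
docker system prune -af

# Alternative that sidesteps SC2046 (not what the workflow uses):
docker ps -q | xargs -r docker stop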
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
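Files written by the containers in the upload job are owned by root, so the "Chown workspace" step above mounts the checkout into a throwaway Alpine container and hands it back to the runner's uid and gid. A minimal sketch, assuming ALPINE_IMAGE is set elsewhere in the workflow (the fallback tag here is only a placeholder):

# Return ownership of the workspace to the runner user after containerized steps.
set -euo pipefail

ALPINE_IMAGE="${ALPINE_IMAGE:-alpine:3.15}"  # the workflow supplies its own image; this default is illustrative
docker pull "${ALPINE_IMAGE}"
docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" \
  chown -R "$(id -u):$(id -g)" .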
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
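The "Upload binaries" step above forwards credentials into the miniconda container with bare `-e NAME` flags, which copy each value from the host environment without placing it on the command line; on pull_request events the secrets resolve to empty strings, so the upload script effectively dry-runs. A reduced sketch of that invocation (image, mounts, and script path as in the step; the variables are assumed to be exported already):

# Forward credentials into the upload container by name only.
set -euo pipefail

docker run --rm -i \
  -e ANACONDA_API_TOKEN \
  -e AWS_ACCESS_KEY_ID \
  -e AWS_SECRET_ACCESS_KEY \
  -e DRY_RUN \
  -e PKG_DIR=/artifacts \
  -v "${RUNNER_TEMP}/artifacts:/artifacts" \
  -v "${GITHUB_WORKSPACE}:/v" \
  -w /v \
  308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
  bash -c '.circleci/scripts/binary_upload.sh'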
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
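The "Log in to ECR" step above derives the account id from the caller identity and pipes a short-lived registry password into `docker login --password-stdin`, with AWS_RETRY_MODE and AWS_MAX_ATTEMPTS enabling the CLI's own retries on top of the shell-level retry helper. A sketch of the same login, assuming AWS_DEFAULT_REGION is already exported; `--query Account --output text` replaces the step's grep/cut pipeline purely for readability:

# ECR login before pulling the cleanup and upload images.
set -euo pipefail
export AWS_RETRY_MODE=standard AWS_MAX_ATTEMPTS=5

retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
retry aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" | docker login --username AWS \
  --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"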
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
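The "Preserve github env variables for use in docker" step above snapshots every GITHUB_* variable into a per-run file under /tmp in KEY=value form. A short sketch of producing such a file and, as an assumed illustration of how it can be consumed (the workflow itself does not show this), replaying it into a container via --env-file:

# Snapshot the runner's GITHUB_* variables and, illustratively, replay them in a container.
set -euo pipefail

env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

# Hypothetical consumption: --env-file accepts the same KEY=value lines
# (values containing newlines would need extra care).
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" alpine:3.15 env | grep '^GITHUB'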
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
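The "Populate binary env" steps above point binary_populate_env.sh at ${RUNNER_TEMP}/env through BINARY_ENV_FILE so that build settings computed once can be re-read by the later build and test scripts. Those scripts are not part of this diff; the sketch below only illustrates the general write-then-source pattern under that assumption, with stand-in values taken from the job's own env block:

# Hedged sketch of an env-file handoff via BINARY_ENV_FILE (the real scripts' contents are not shown here).
set -euo pipefail

BINARY_ENV_FILE="${RUNNER_TEMP}/env"

# Producer (stand-in for binary_populate_env.sh): record the settings once.
cat > "${BINARY_ENV_FILE}" <<EOF
export PACKAGE_TYPE=libtorch
export DESIRED_CUDA=cu113
export LIBTORCH_VARIANT=shared-with-deps
EOF

# Consumer (stand-in for the build/test scripts): load the recorded settings.
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE}"
echo "Building ${PACKAGE_TYPE} for ${DESIRED_CUDA} (${LIBTORCH_VARIANT})"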
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-cxx11-abi-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-cxx11-abi-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-cxx11-abi-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-cxx11-abi-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-cxx11-abi - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-master.yml b/.github/workflows/generated-windows-binary-libtorch-debug-master.yml new file mode 100644 index 000000000000..04188e958fec --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-debug-master.yml @@ -0,0 +1,247 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-debug + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-debug + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml new file mode 100644 index 000000000000..5983d1b4212e --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -0,0 +1,3782 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-debug + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + workflow_dispatch: + 
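Both workflow files added in this part of the diff start with "@generated DO NOT EDIT MANUALLY" and point at the Jinja template (.github/templates/windows_binary_build_workflow.yml.j2) and the generator script (.github/scripts/generate_ci_workflows.py). The diff does not show the generator's command line, so the invocation below is an assumption; the point is only that edits belong in the template, followed by regeneration:

    # Assumed regeneration flow -- adjust to the generator's actual CLI.
    cd pytorch                                         # repository root
    python .github/scripts/generate_ci_workflows.py    # rewrites .github/workflows/generated-*.yml from the templates
    git diff --stat .github/workflows/                 # review the regenerated workflows before committing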
+env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-debug + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive 
command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
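The "Populate binary env" steps repeated in the build and test jobs above rely on a standard GitHub Actions mechanism: lines of the form KEY=VALUE appended to the file named by $GITHUB_ENV become environment variables for every later step of the same job, which is how BINARY_ENV_FILE and PYTORCH_FINAL_PACKAGE_DIR reach binary_populate_env.sh and the build/test scripts. A minimal sketch of the mechanism, using the same paths as the jobs above:

    # In an early step of a job: export by appending to the $GITHUB_ENV file.
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
    # In any later step of the same job the variable is plain environment:
    mkdir -p "${PYTORCH_FINAL_PACKAGE_DIR}"
    ls -d "${PYTORCH_FINAL_PACKAGE_DIR}"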
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
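From here on the file simply repeats the same build → test → upload triple for each remaining debug libtorch variant. The naming scheme the generator appears to follow for the CPU jobs can be summarized as below; this is an illustration only, and the authoritative list is the generated file itself:

    # Prints the CPU debug job names used in this workflow; the CUDA jobs swap
    # "cpu" for an arch tag such as "cuda11_3".
    for variant in shared-with-deps shared-without-deps static-with-deps static-without-deps; do
      for stage in build test upload; do
        echo "libtorch-cpu-${variant}-debug-${stage}"
      done
    done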
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive 
command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
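The "Display EC2 information" step that opens each Windows build and test job reads the EC2 instance metadata service at 169.254.169.254 through the small get_ec2_metadata helper. Any other metadata category exposed by that endpoint can be read the same way; the example below is not part of the workflow, just the same pattern applied to one more well-known category:

    # Same curl pattern as get_ec2_metadata above, for the availability zone.
    curl -fsSL "http://169.254.169.254/latest/meta-data/placement/availability-zone"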
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
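Each generated workflow in this diff also declares a concurrency group near its top, keyed on the pull-request number (falling back to the commit SHA) plus whether the run was manually dispatched, with cancel-in-progress: true. A rough illustration of how that key resolves for a branch push, with hypothetical stand-ins for github.event.pull_request.number, github.sha and github.event_name:

    PR_NUMBER=""        # empty for branch and tag pushes
    SHA="0123abcd"      # stands in for github.sha
    EVENT_NAME="push"
    IS_DISPATCH=$([[ "${EVENT_NAME}" == "workflow_dispatch" ]] && echo true || echo false)
    echo "windows-binary-libtorch-debug-${PR_NUMBER:-${SHA}}-${IS_DISPATCH}"
    # -> windows-binary-libtorch-debug-0123abcd-false; a newer run with the same key cancels this one.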
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-debug-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
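The remaining jobs in this file move the same debug libtorch matrix from CPU to CUDA: DESIRED_CUDA becomes cu113 with GPU_ARCH_VERSION 11.3, the test job runs on a GPU runner (windows.8xlarge.nvidia.gpu), and UPLOAD_SUBFOLDER, which the upload jobs set from DESIRED_CUDA, presumably groups the uploaded binaries per CUDA flavor rather than under cpu. The helper below is purely hypothetical and only illustrates the naming relationship visible in those env blocks; the real mapping lives in the generation script:

    # Hypothetical illustration: "cu113" <-> "11.3", "cu102" <-> "10.2".
    desired_cuda_to_version() {
      local digits="${1#cu}"            # strip the "cu" prefix, e.g. "113"
      echo "${digits%?}.${digits: -1}"  # dot before the last digit -> "11.3"
    }
    desired_cuda_to_version cu113       # prints 11.3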
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
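The job that begins above, libtorch-cuda11_3-shared-without-deps-debug-build, opens another of the build → test → upload triplets that make up this workflow file: each configuration is chained with `needs:` and threads a single artifact name through all three jobs. Stripped down to only the parts that link the jobs (env blocks and step lists omitted), the shape is roughly:

      libtorch-cuda11_3-shared-without-deps-debug-build:
        runs-on: windows.4xlarge
        # builds, then uploads the artifact named libtorch-cuda11_3-shared-without-deps-debug
      libtorch-cuda11_3-shared-without-deps-debug-test:
        needs: libtorch-cuda11_3-shared-without-deps-debug-build
        runs-on: windows.8xlarge.nvidia.gpu
        # downloads the same artifact and runs binary_windows_test.sh
      libtorch-cuda11_3-shared-without-deps-debug-upload:
        needs: libtorch-cuda11_3-shared-without-deps-debug-test
        runs-on: linux.2xlarge
        # downloads the artifact again and hands it to binary_upload.sh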
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v 
-w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-without-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-debug-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-debug-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-debug + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml deleted file mode 100644 index e09c0f8052c1..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml +++ /dev/null @@ -1,4618 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-pre-cxx11 - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-libtorch-pre-cxx11 - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_RETRY_TEST_CASES: 1 - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: windows-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
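
Aside: the "Log in to ECR" step above scrapes the account id out of `aws sts get-caller-identity` with grep/cut and pipes a short-lived password into `docker login`. A condensed sketch of the same flow, using the equivalent `--query` form of the CLI (credentials and AWS_DEFAULT_REGION are assumed to already be in the environment):

    # Requires AWS credentials and AWS_DEFAULT_REGION in the environment.
    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
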
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
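
Aside: the "Preserve github env variables for use in docker" step snapshots every GITHUB_* variable to a file. A hedged sketch of how such a file could be fed back into a container; the `--env-file` usage is illustrative, not taken from this workflow, and it assumes single-line values:

    # Snapshot the runner's GITHUB_* variables (file name mirrors the step above).
    env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID:-local}"
    # Illustrative re-import into a throwaway container.
    docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID:-local}" \
      alpine:3.16 sh -c 'env | grep ^GITHUB | sort | head'
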
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
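
Aside: the "Display EC2 information" steps query the instance metadata service directly. A trimmed sketch of the same helper (IMDSv1-style; it only returns anything meaningful on an EC2 host):

    # 169.254.169.254 is the EC2 instance metadata endpoint.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    echo "instance-type: $(get_ec2_metadata instance-type)"
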
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
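
Aside: the recurring "Kill containers, clean up images" step (it closes every upload job in these workflows, and appears again just below) stops whatever is still running and prunes images so the self-hosted runner starts the next job clean. A minimal sketch of that pattern:

    # "docker ps -q" may expand to nothing, so the unquoted expansion is intentional
    # and "|| true" keeps the step from failing when no containers are running.
    docker stop $(docker ps -q) || true
    docker system prune -af   # remove stopped containers, unused networks, and images
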
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
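The `Kill containers, clean up images` step at the top of the block above is what keeps the self-hosted upload runners reusable between jobs. Taken on its own, the cleanup amounts to the following two commands, copied from the step (the `|| true` covers the case where no containers are running):

# Stop every running container; "docker ps -q" may expand to nothing, which
# is why the command is allowed to fail.
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true

# Remove all unused images, networks and build cache so the runner's disk
# does not fill up across workflow runs.
docker system prune -af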
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
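Every job above opens with a `Display EC2 information` step built around a one-line helper that queries the EC2 instance metadata service. As a standalone sketch (the helper and the three categories are taken directly from the step; it only produces output on an EC2 host, since 169.254.169.254 is the IMDS endpoint):

# Query a single category from the EC2 instance metadata service (IMDSv1).
# Outside EC2 the curl call simply fails because of -f.
get_ec2_metadata() {
  category=$1
  curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}

echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"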
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
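The `Upload binaries` step above relies on docker's pass-through form of `-e`: `-e NAME` with no value copies NAME from the step's environment into the container, which is how the AWS and Anaconda credentials reach binary_upload.sh without appearing on the command line, while `-e PKG_DIR=/artifacts` sets an explicit value. A tiny generic illustration of that behaviour (the alpine image and the DEMO_TOKEN variable are chosen only for this demo, not taken from the workflow):

# "-e DEMO_TOKEN" without "=value" forwards the variable from the host
# environment; "-e PKG_DIR=/artifacts" sets an explicit value, as in the
# workflow's upload step.
export DEMO_TOKEN="not-a-real-secret"
docker run --rm -e DEMO_TOKEN -e PKG_DIR=/artifacts alpine:3.16 \
  sh -c 'echo "token=${DEMO_TOKEN} pkg_dir=${PKG_DIR}"'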
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cpu-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cpu-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
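The `Populate binary env` steps that recur in every job above use the standard GITHUB_ENV mechanism: appending `NAME=value` lines to the file named by "$GITHUB_ENV" makes those variables visible to all later steps of the same job. A minimal sketch of the mechanism outside of Actions (the temporary files stand in for the runner-provided ones; BINARY_ENV_FILE and PYTORCH_FINAL_PACKAGE_DIR are the names used by the workflow):

# Outside of GitHub Actions, fake the two runner-provided paths so the
# snippet is self-contained; on a real runner both are already set.
RUNNER_TEMP="${RUNNER_TEMP:-$(mktemp -d)}"
GITHUB_ENV="${GITHUB_ENV:-$(mktemp)}"

# Same shape as the workflow step: each appended line becomes an environment
# variable for every subsequent step of the job.
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

cat "${GITHUB_ENV}"   # shows what later steps will receive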
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cpu-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
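The `Preserve github env variables for use in docker` step above snapshots every GITHUB_* variable into a per-run file. This section never shows the consumer of that file, but a file in `NAME=value` form like this one can be handed to a container via docker's `--env-file` option; the sketch below only demonstrates producing and inspecting the snapshot (the /tmp path is the one used in the step, and GITHUB_RUN_ID is normally set by the runner):

# Snapshot every GITHUB_* variable into a per-run file, exactly as the step does.
GITHUB_RUN_ID="${GITHUB_RUN_ID:-local-demo}"   # provided by the runner in CI
env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

# Inspect the snapshot; a later docker invocation could consume it with
# "--env-file /tmp/github_env_${GITHUB_RUN_ID}" (illustrative usage, not a
# step shown in this section).
cat "/tmp/github_env_${GITHUB_RUN_ID}"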
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_1-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_1-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_1-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
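The "Populate binary env" steps in these jobs rely on the runner's GITHUB_ENV file: only lines appended to that file become environment variables for later steps in the same job, while a bare echo merely prints to the log. A minimal local sketch of that hand-off, with temp paths standing in for runner.temp:

```bash
#!/usr/bin/env bash
# Minimal local sketch of the GITHUB_ENV hand-off used by "Populate binary env".
# Inside GitHub Actions the runner re-reads this file between steps; here we
# source it ourselves to mimic a later step. Paths are stand-ins for runner.temp.
set -euo pipefail

RUNNER_TEMP="$(mktemp -d)"
GITHUB_ENV="${RUNNER_TEMP}/github_env"
touch "${GITHUB_ENV}"

# "Step 1": only lines appended to ${GITHUB_ENV} carry over to later steps;
# a bare echo would just log the value.
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

# "Step 2": sees the appended values as environment variables.
set -a; . "${GITHUB_ENV}"; set +a
echo "final package dir: ${PYTORCH_FINAL_PACKAGE_DIR}"
```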
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_1-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
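The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above together decide whether an upload job really publishes anything and to which channel. A consolidated sketch of that gating; the defaults (DRY_RUN=enabled, UPLOAD_CHANNEL=nightly) are assumed workflow-level values that are set outside this diff:

```bash
#!/usr/bin/env bash
# Sketch of the upload gating in the *-upload jobs. Real uploads only happen on
# pushes to the nightly branch or to release tags (ciflow/* tags excluded), and
# tags with an -rcN suffix are routed to the "test" channel. The defaults below
# are assumptions for the sketch.
set -euo pipefail

event_name="${GITHUB_EVENT_NAME:-pull_request}"
ref="${GITHUB_REF:-refs/heads/main}"
ref_name="${GITHUB_REF_NAME:-${ref##*/}}"

DRY_RUN="enabled"
UPLOAD_CHANNEL="nightly"

if [[ "${event_name}" == "push" ]]; then
  if [[ "${ref}" == "refs/heads/nightly" ]]; then
    DRY_RUN="disabled"
  elif [[ "${ref}" == refs/tags/* && "${ref}" != refs/tags/ciflow/* ]]; then
    DRY_RUN="disabled"
    # e.g. a hypothetical tag v1.11.0-rc3 ends with an RC suffix
    if [[ "${ref_name}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL="test"
    fi
  fi
fi

echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"
```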
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
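The upload jobs wrap their flaky network calls (the docker pull and the ECR login above) in a small retry helper. Pulled out on its own it looks like the sketch below; the curl target is a hypothetical example, any command line can be passed through:

```bash
#!/usr/bin/env bash
# The three-attempt retry helper used around docker pull / ECR login:
# run the command, and on failure retry after 1s and then 2s before giving up.
retry () {
  "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
}

# Hypothetical example target.
retry curl -fsSL https://example.com -o /dev/null
```

Because the fallbacks are chained with ||, the step only fails if all three attempts fail.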
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
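In the "Upload binaries" step above, each bare "-e NAME" flag forwards the job's current value of NAME into the container, while "-e PKG_DIR=/artifacts" overrides it for the container. A small sketch of the same behaviour:

```bash
#!/usr/bin/env bash
# Sketch of the env passthrough used by "Upload binaries": "-e NAME" with no
# value copies the caller's NAME into the container, "-e NAME=value" overrides.
set -euo pipefail

export UPLOAD_CHANNEL="test"          # forwarded as-is into the container
export PKG_DIR="/tmp/host-artifacts"  # overridden to /artifacts below

docker run --rm \
  -e UPLOAD_CHANNEL \
  -e PKG_DIR=/artifacts \
  alpine:3.16 \
  sh -c 'echo "channel=${UPLOAD_CHANNEL} pkg_dir=${PKG_DIR}"'
```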
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
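The "Chown workspace" step above exists because earlier containers may have written files into the workspace as root; the job bind-mounts the checkout into a throwaway Alpine container and chowns everything back to the runner user. A standalone sketch, using a public alpine tag as a stand-in for whatever ALPINE_IMAGE resolves to on these runners:

```bash
#!/usr/bin/env bash
# Sketch of the "Chown workspace" fix-up: files written as root by previous
# containers are handed back to the runner user by chowning the bind-mounted
# workspace from inside a throwaway container.
set -euo pipefail

# Stand-in; the workflow resolves ALPINE_IMAGE from its own environment.
ALPINE_IMAGE="${ALPINE_IMAGE:-alpine:3.16}"

docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" \
  chown -R "$(id -u):$(id -g)" .
```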
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
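The "Preserve github env variables for use in docker" step above dumps every GITHUB_* variable to a per-run file under /tmp. How that file is consumed is not shown in this diff; the --env-file usage in the sketch below is only an assumption about one way a later container could read it back:

```bash
#!/usr/bin/env bash
# Sketch of the "Preserve github env variables" step plus one hypothetical
# consumer. The dump itself matches the workflow step; feeding the file back
# in through --env-file is an illustration, not the workflow's actual consumer.
set -euo pipefail

GITHUB_RUN_ID="${GITHUB_RUN_ID:-local-test}"
export GITHUB_REPOSITORY="${GITHUB_REPOSITORY:-pytorch/pytorch}"

env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"

# Hypothetical consumer: a container picking the variables back up.
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" alpine:3.16 \
  sh -c 'env | grep ^GITHUB'
```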
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
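Each of these self-hosted Linux upload jobs finishes with the "Kill containers, clean up images" step that follows. The "|| true" is there because "docker ps -q" can legitimately print nothing, which would make a bare "docker stop" fail; a slightly more explicit rendering of the same idiom:

```bash
#!/usr/bin/env bash
# Sketch of the runner clean-up idiom: stop whatever containers are still
# running (there may be none), then prune all unused images without prompting.
set -euo pipefail

running_containers="$(docker ps -q)"
if [[ -n "${running_containers}" ]]; then
  # shellcheck disable=SC2086  # word splitting of the id list is intended
  docker stop ${running_containers}
fi
docker system prune -af
```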
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_3-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_3-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_3-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
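The "Log in to ECR" step above derives the AWS account id by grepping the JSON from "aws sts get-caller-identity" and then pipes a short-lived registry password into "docker login --password-stdin". The sketch below keeps the same flow but uses the CLI's --query option for the account id instead of grep/cut; the region default is only an assumption for the sketch:

```bash
#!/usr/bin/env bash
# Sketch of the ECR login flow: resolve the account id, then pipe a temporary
# registry password straight into docker login (nothing is echoed to the log).
set -euo pipefail

: "${AWS_DEFAULT_REGION:=us-east-1}"   # assumption; the job gets this from its runner env

AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" \
  | docker login --username AWS --password-stdin \
      "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
```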
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_3-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
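# Note: the two conditional steps above gate real uploads on the pushed ref. DRY_RUN is
# set to "disabled" (i.e. uploads actually run) only for pushes to the nightly branch or
# to non-ciflow tags, and UPLOAD_CHANNEL flips to "test" for release-candidate tags such
# as v1.11.0-rc1. A rough shell sketch of that second decision, assuming GITHUB_REF_NAME
# is populated as in the workflow (illustration only):

if [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
  echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"   # RC tags are routed to the test channel
fi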
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-shared-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-shared-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-shared-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
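# Note: get_ec2_metadata in the "Display EC2 information" steps queries the EC2 instance
# metadata service at 169.254.169.254 (IMDSv1-style, no session token). The same helper
# as a standalone sketch (illustrative only):

get_ec2_metadata() {
  # each category (ami-id, instance-id, instance-type, ...) is a separate metadata path
  category=$1
  curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "instance-type: $(get_ec2_metadata instance-type)"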
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
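# Note: the "Upload binaries" step never bakes credentials into the image; each secret is
# exported on the runner and forwarded into the miniconda container with bare `-e NAME`
# flags, while the artifact directory is bind-mounted at /artifacts. A trimmed sketch of
# the invocation shape (some flags omitted for brevity):

docker run --rm -i \
  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e ANACONDA_API_TOKEN \
  -e PKG_DIR=/artifacts \
  -v "${RUNNER_TEMP}/artifacts:/artifacts" \
  -v "${GITHUB_WORKSPACE}:/v" -w /v \
  308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
  bash -c '.circleci/scripts/binary_upload.sh'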
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-with-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-with-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-with-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-with-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
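# Note: each *-build job publishes its output with seemethere/upload-artifact-s3 under a
# name such as libtorch-cuda11_5-static-with-deps-pre-cxx11, and the matching *-test and
# *-upload jobs fetch it back with download-artifact-s3 using the identical name.
# PYTORCH_FINAL_PACKAGE_DIR (set in "Populate binary env") is both the upload source and
# the download destination, so names and paths must stay in sync when variants are added.
# A quick sanity check of that contract on the runner might look like (illustrative only):

test -d "${PYTORCH_FINAL_PACKAGE_DIR}" && ls -lah "${PYTORCH_FINAL_PACKAGE_DIR}"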
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-with-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
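# Note: "Preserve github env variables for use in docker" snapshots every GITHUB_* variable
# into /tmp/github_env_${GITHUB_RUN_ID} in KEY=value form. How that file is consumed is not
# shown in this diff; one plausible use is feeding it to a later container via --env-file.
# The sketch below is an assumption for illustration only (alpine:3 is just a placeholder image):

env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}"
docker run --rm --env-file "/tmp/github_env_${GITHUB_RUN_ID}" alpine:3 env | grep '^GITHUB'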
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - libtorch-cuda11_5-static-without-deps-pre-cxx11-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-pre-cxx11-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda11_5-static-without-deps-pre-cxx11-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cuda11_5-static-without-deps-pre-cxx11-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: static-without-deps - DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
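# Note: the "Wait until all sessions have drained" and "Hold runner" steps keep the
# instance alive while a debugging SSH session (enabled via add-github-ssh-key) is still
# attached, bounded by the step timeout. The scripts themselves are not part of this diff;
# a rough idea of the loop they implement, purely as an assumption for illustration:

while [ -n "$(who)" ]; do
  sleep 5   # an interactive session is still logged in; keep the runner alive
done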
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: libtorch-cuda11_5-static-without-deps-pre-cxx11 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-windows-binary-libtorch-release-master.yml b/.github/workflows/generated-windows-binary-libtorch-release-master.yml new file mode 100644 index 000000000000..422cbb27cbb7 --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-release-master.yml @@ -0,0 +1,247 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-release + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: 
${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml new file mode 100644 index 000000000000..2ecfafae499f --- /dev/null +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -0,0 +1,3782 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-libtorch-release + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_libtorch/*' + 
workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-libtorch-release + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + libtorch-cpu-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
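# Note: the checkout steps above pick the ref with the expression
# `github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha`,
# which is the GitHub Actions idiom for a ternary: the PR head SHA when the workflow was
# triggered by a pull request, otherwise the pushed SHA. The same decision in plain shell
# (illustration only; PR_HEAD_SHA is a hypothetical variable holding the PR head SHA):

if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then
  CHECKOUT_SHA="${PR_HEAD_SHA}"
else
  CHECKOUT_SHA="${GITHUB_SHA}"
fi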
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-shared-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
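The "Display EC2 information" step above queries the instance metadata service. The sketch below mirrors that step almost verbatim (a `local` keyword is added); it is only meaningful when run on an EC2 instance, since the 169.254.169.254 endpoint is not reachable elsewhere:

    #!/usr/bin/env bash
    # Query the EC2 instance metadata service for a few identifiers, as the
    # "Display EC2 information" step does at the start of each Windows job.
    set -euo pipefail

    get_ec2_metadata() {
      # Pulled from the instance metadata endpoint, see
      # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
      local category=$1
      curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
    }

    echo "ami-id: $(get_ec2_metadata ami-id)"
    echo "instance-id: $(get_ec2_metadata instance-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"
    echo "system info $(uname -a)"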
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-shared-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's 
just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-shared-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-shared-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
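The "Populate binary env" step above relies on the GitHub Actions convention that lines appended to the file named by $GITHUB_ENV become environment variables for all later steps in the same job. A hedged local sketch, with fallbacks that exist only so it can run outside a runner:

    #!/usr/bin/env bash
    # Sketch of the "Populate binary env" step: values appended to $GITHUB_ENV are
    # exported to subsequent steps of the same job by the Actions runner.
    set -euo pipefail

    RUNNER_TEMP="${RUNNER_TEMP:-$(mktemp -d)}"          # provided by the runner normally
    GITHUB_ENV="${GITHUB_ENV:-${RUNNER_TEMP}/github_env}"

    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
    # Mirrors the workflow as written: this last line is only echoed to the log,
    # not appended to ${GITHUB_ENV}.
    echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"

    cat "${GITHUB_ENV}"

Note that in the step as written the WIN_PACKAGE_WORK_DIR line has no `>> "${GITHUB_ENV}"` redirection, so unlike the other two values it is not exported to later steps; whether that is intentional is not clear from this diff.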
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-shared-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
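The "Upload binaries" step above passes its configuration into the miniconda3 container with a mix of `-e NAME` and `-e NAME=value` flags. The distinction matters: `-e NAME` forwards the variable's value from the step's environment, while `-e NAME=value` sets an explicit value, which is why PKG_DIR is remapped to the in-container /artifacts mount while the credentials are forwarded as-is. A minimal demonstration (the alpine image and the UPLOAD_CHANNEL value are placeholders, not taken from the workflow):

    #!/usr/bin/env bash
    # Demonstrate docker's two -e forms as used by the "Upload binaries" step.
    set -euo pipefail

    export UPLOAD_CHANNEL=test
    docker run --rm \
      -e UPLOAD_CHANNEL \
      -e PKG_DIR=/artifacts \
      alpine:3.16 \
      sh -c 'echo "UPLOAD_CHANNEL=${UPLOAD_CHANNEL:-unset} PKG_DIR=${PKG_DIR:-unset}"'

Running this prints the forwarded channel value alongside the overridden container-side PKG_DIR path.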
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a 
defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
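The paired "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps above exist because these are persistent self-hosted runners: `git clean -fxd` removes untracked files (including ignored ones, via -x) and untracked directories (-d) left behind by a previous run. A small sketch assuming the same checkout layout, with pytorch/ and builder/ under the workspace root:

    #!/usr/bin/env bash
    # Clean leftover build artifacts from both checkouts, as the two
    # "Clean ... checkout" steps do for pytorch/ and builder/.
    set -euo pipefail

    for checkout in pytorch builder; do
      if [ -d "${checkout}/.git" ]; then
        git -C "${checkout}" clean -fxd
      fi
    done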
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cpu-static-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
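The CPU jobs above and the cuda11_3 jobs that follow are near-identical copies that differ only in DESIRED_CUDA and LIBTORCH_VARIANT; the job and artifact names follow the pattern libtorch-<cuda>-<variant>-<config>-{build,test,upload}. The loop below is purely illustrative (it is not the actual workflow generator) and only prints the names that appear in this file:

    #!/usr/bin/env bash
    # Illustrative sketch of the naming scheme behind the generated jobs: one
    # build/test/upload triple per (cuda, libtorch variant) pair, release config.
    set -euo pipefail

    config=release
    for cuda in cpu cuda11_3; do
      for variant in shared-with-deps shared-without-deps static-with-deps static-without-deps; do
        for stage in build test upload; do
          echo "libtorch-${cuda}-${variant}-${config}-${stage}"
        done
      done
    done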
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cpu-static-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-release-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's 
just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cpu-static-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cpu-static-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cpu-static-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
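In the cuda11_3 jobs that begin here, the same CUDA toolchain is identified twice: GPU_ARCH_VERSION uses the dotted form ("11.3") while the legacy DESIRED_CUDA uses the compact form ("cu113"), with GPU_ARCH_TYPE distinguishing cpu from cuda builds. The sketch below only illustrates that mapping; it is an assumption about the relationship between the two values, not code taken from the workflow generator:

    #!/usr/bin/env bash
    # Illustrative mapping between GPU_ARCH_TYPE/GPU_ARCH_VERSION and the legacy
    # DESIRED_CUDA identifier used throughout these jobs.
    set -euo pipefail

    to_desired_cuda() {
      local arch_type=$1 arch_version=${2:-}
      if [[ "${arch_type}" == "cpu" ]]; then
        echo "cpu"
      else
        # 11.3 -> cu113
        echo "cu${arch_version//./}"
      fi
    }

    to_desired_cuda cpu          # prints: cpu
    to_desired_cuda cuda 11.3    # prints: cu113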
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-shared-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-shared-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-shared-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-shared-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-shared-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
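Editor's note: the two conditional steps above ("Set DRY_RUN (only for tagged pushes)" and "Set UPLOAD_CHANNEL (only for tagged pushes)") are what decide whether binary_upload.sh publishes anything and to which channel. A minimal bash sketch of that decision logic follows; the defaults (DRY_RUN=enabled, UPLOAD_CHANNEL=nightly) are assumptions standing in for workflow-level values not shown in this diff.

    #!/usr/bin/env bash
    # Sketch of the upload gating used by the *-upload jobs above.
    # The defaults below are assumptions, not values taken from the workflow.
    set -euo pipefail

    GITHUB_EVENT_NAME="${GITHUB_EVENT_NAME:-pull_request}"
    GITHUB_REF="${GITHUB_REF:-refs/heads/some-branch}"
    GITHUB_REF_NAME="${GITHUB_REF_NAME:-some-branch}"

    DRY_RUN=enabled          # assumed default: do not actually publish
    UPLOAD_CHANNEL=nightly   # assumed default channel

    if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
      # Real uploads only for pushes to the nightly branch or non-ciflow tags.
      if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || \
         { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
        DRY_RUN=disabled
      fi
      # A tag whose name ends in -rcN is routed to the test channel.
      if [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]] && \
         [[ "${GITHUB_REF_NAME}" == *-rc[0-9]* ]]; then
        UPLOAD_CHANNEL=test
      fi
    fi

    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"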
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
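Editor's note: the NOTE comment above relies on the standard GitHub Actions mechanism of appending KEY=VALUE lines to the file named by $GITHUB_ENV, so that later steps in the same job see them as ordinary environment variables. A local simulation of that pattern, with RUNNER_TEMP and GITHUB_ENV stubbed out since they are normally supplied by the runner:

    #!/usr/bin/env bash
    # Local simulation of the $GITHUB_ENV hand-off used by "Populate binary env".
    set -euo pipefail
    RUNNER_TEMP="$(mktemp -d)"          # stub for the runner-provided temp dir
    GITHUB_ENV="${RUNNER_TEMP}/github_env"

    # "Step 1": append KEY=VALUE lines, one per variable.
    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

    # "Step 2": on a real runner these become env vars automatically; locally
    # we approximate that by sourcing the file with auto-export enabled.
    set -a; source "${GITHUB_ENV}"; set +a
    echo "artifacts will land in: ${PYTORCH_FINAL_PACKAGE_DIR}"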
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_3-static-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
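Editor's note: every job in this section follows the same naming scheme, deriving the artifact and job names from PACKAGE_TYPE, the CUDA version, LIBTORCH_VARIANT, and LIBTORCH_CONFIG (for example libtorch-cuda11_6-shared-with-deps-release). The sketch below is a reading aid that reassembles such a name from the fields in the env block above; it is not the actual workflow generator.

    #!/usr/bin/env bash
    # Illustrative only: rebuild the artifact/job name prefix from the env
    # fields shown in each job's env block.
    set -euo pipefail

    PACKAGE_TYPE=libtorch
    GPU_ARCH_TYPE=cuda
    GPU_ARCH_VERSION=11.6
    LIBTORCH_VARIANT=shared-with-deps
    LIBTORCH_CONFIG=release

    # 11.6 -> 11_6, matching the "cuda11_6" infix (cf. DESIRED_CUDA=cu116).
    cuda_tag="${GPU_ARCH_TYPE}${GPU_ARCH_VERSION//./_}"
    artifact="${PACKAGE_TYPE}-${cuda_tag}-${LIBTORCH_VARIANT}-${LIBTORCH_CONFIG}"

    echo "${artifact}"                                  # artifact name
    echo "${artifact}-build -> ${artifact}-test -> ${artifact}-upload"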
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_3-static-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_3-static-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_3-static-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_3-static-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
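Editor's note: the "Upload binaries" step runs .circleci/scripts/binary_upload.sh inside a container and forwards only an explicit allow-list of variables: `-e NAME` copies the variable from the host environment, while `-e NAME=value` sets it explicitly for the container. A minimal, runnable sketch of that passthrough pattern; the alpine image and the env dump are stand-ins for the real upload image and script, and the values are placeholders rather than real credentials.

    #!/usr/bin/env bash
    # Sketch of the env passthrough used by "Upload binaries".
    set -euo pipefail

    export PACKAGE_TYPE=libtorch
    export UPLOAD_SUBFOLDER=cu116
    export DRY_RUN=enabled              # placeholder; see the gating sketch earlier
    workspace="$(pwd)"
    artifacts_dir="${RUNNER_TEMP:-/tmp}/artifacts"

    docker run --rm -i \
      -e PACKAGE_TYPE \
      -e UPLOAD_SUBFOLDER \
      -e DRY_RUN \
      -e PKG_DIR=/artifacts \
      -v "${artifacts_dir}:/artifacts" \
      -v "${workspace}:/v" \
      -w /v \
      alpine:3.18 \
      sh -c 'env | grep -E "PACKAGE_TYPE|UPLOAD_SUBFOLDER|DRY_RUN|PKG_DIR"'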
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
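Editor's note: the "Kill containers, clean up images" step above deliberately leaves `docker ps -q` unquoted (hence the SC2046 suppression) and appends `|| true` so an empty container list does not fail the step. An alternative sketch of the same cleanup that avoids the unquoted expansion entirely; this is just an equivalent phrasing, not what the workflow runs.

    #!/usr/bin/env bash
    # Stop any containers still running on the self-hosted runner and drop
    # unused images, without relying on unquoted command substitution.
    set -euo pipefail

    # `xargs -r` (GNU xargs) skips the docker call when no IDs are listed.
    docker ps -q | xargs -r docker stop

    # Remove stopped containers, dangling images, unused networks, etc.
    docker system prune -af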
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-shared-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-shared-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-shared-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-shared-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-shared-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name 
"LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-with-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-with-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v 
"$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-with-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + libtorch-cuda11_6-static-without-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
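The "Populate binary env" step that follows (and that opens every Windows job in these workflows) relies on the $GITHUB_ENV mechanism: any KEY=value line appended to that file is exported by the runner into all later steps of the same job, which is also why the preceding NOTE explains that these variables cannot be set at workflow level, where runner.temp is not available. A minimal sketch of the mechanism, using a made-up EXAMPLE_DIR in place of BINARY_ENV_FILE and PYTORCH_FINAL_PACKAGE_DIR; note that, as quoted above, the WIN_PACKAGE_WORK_DIR line is only echoed to the log rather than appended to ${GITHUB_ENV}.

name: github-env-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  demo:
    runs-on: ubuntu-latest
    steps:
      - name: Populate env
        shell: bash
        run: |
          # Lines written to $GITHUB_ENV become environment variables for every
          # subsequent step of this job (not for the current step itself).
          echo "EXAMPLE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
      - name: Use env
        shell: bash
        run: |
          mkdir -p "${EXAMPLE_DIR}"
          echo "artifacts will be staged in ${EXAMPLE_DIR}"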
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: libtorch-cuda11_6-static-without-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda11_6-static-without-deps-release-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda11_6-static-without-deps-release-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: static-without-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets 
chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda11_6-static-without-deps-release + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
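The "Chown workspace" step above wraps the initial docker pull of the Alpine image in a tiny retry function that re-runs the command after 1- and then 2-second pauses, which papers over transient registry or network hiccups on the self-hosted runners. A slightly generalized sketch of the same idea, with the attempt count and backoff made explicit (the function body and variable names here are illustrative, not part of the generated workflow):

name: retry-helper-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  pull:
    runs-on: ubuntu-latest
    steps:
      - name: Pull image with retries
        shell: bash
        env:
          IMAGE: alpine:3.18   # stand-in for ${ALPINE_IMAGE}
        run: |
          # retry CMD...: run CMD, retrying with a growing pause between attempts
          retry () {
            local attempts=3 delay=1 i
            for ((i = 1; i <= attempts; i++)); do
              if "$@"; then
                return 0
              fi
              if (( i < attempts )); then
                echo "attempt ${i}/${attempts} failed, retrying in ${delay}s" >&2
                sleep "${delay}"
                delay=$((delay * 2))
              fi
            done
            return 1
          }
          retry docker pull "${IMAGE}"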
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-wheel-master.yml b/.github/workflows/generated-windows-binary-wheel-master.yml new file mode 100644 index 000000000000..befb73dd15c2 --- /dev/null +++ b/.github/workflows/generated-windows-binary-wheel-master.yml @@ -0,0 +1,241 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-wheel + +on: + push: + branches: + - master + tags: + - 'ciflow/all/*' + - 'ciflow/trunk/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
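The concurrency block in the generated-windows-binary-wheel-master.yml header above keys every run on the pull-request number (falling back to the commit SHA for branch pushes) plus a flag for manual dispatches, and cancels any in-flight run that shares the key, so a superseded commit does not keep hour-long Windows binary builds alive. A minimal sketch of the same pattern under an assumed workflow name:

name: concurrency-sketch   # hypothetical workflow name, for illustration only
on:
  push:
  pull_request:
  workflow_dispatch:
concurrency:
  # One active run per PR (or per commit on push); manual dispatches get their own key
  group: concurrency-sketch-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - run: echo "only the newest run for this concurrency key keeps running"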
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml new file mode 100644 index 000000000000..12a8b5661f4e --- /dev/null +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -0,0 +1,3638 @@ +# @generated DO NOT EDIT MANUALLY + +# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: windows-binary-wheel + +on: + push: + # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build + branches: + - nightly + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/binaries/*' + - 'ciflow/binaries_wheel/*' + workflow_dispatch: + +env: + # Needed for conda builds + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + ANACONDA_USER: pytorch + AWS_DEFAULT_REGION: us-east-1 + BUILD_ENVIRONMENT: windows-binary-wheel + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IN_CI: 1 + IS_GHA: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PYTORCH_RETRY_TEST_CASES: 1 + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SKIP_ALL_TESTS: 1 +concurrency: + group: windows-binary-wheel-${{ 
github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + wheel-py3_7-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
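Each binary job opens with a "Display EC2 information" step whose get_ec2_metadata helper queries the instance metadata service at 169.254.169.254 so the logs record which AMI and instance type the self-hosted runner was scheduled on. The sketch below isolates that helper; the commented IMDSv2 variant is my own addition for instances that require a session token and is not part of the generated step, which uses the plain endpoint:

name: ec2-metadata-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  show:
    runs-on: [self-hosted]   # only meaningful on an EC2-backed self-hosted runner
    steps:
      - name: Display EC2 information
        shell: bash
        run: |
          set -euo pipefail
          # Fetch one metadata category from the EC2 instance metadata service.
          get_ec2_metadata() {
            category=$1
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
          echo "instance-type: $(get_ec2_metadata instance-type)"
          echo "system info $(uname -a)"
          # Assumption: if the instance enforces IMDSv2, a session token is needed first;
          # the generated workflows use the plain IMDSv1 endpoint shown above.
          # token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
          #   -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
          # curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
          #   "http://169.254.169.254/latest/meta-data/instance-type"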
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
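For every configuration, the jobs above form one pipeline: the build job stages its output in ${PYTORCH_FINAL_PACKAGE_DIR} and publishes it to S3 under a configuration-specific artifact name, the test job declares needs: on the build job and downloads that same artifact onto its runner, and the upload job downloads it once more on a Linux runner before publishing. The skeleton below keeps only that plumbing; the placeholder steps stand in for the real build/test/upload scripts, and the runner labels and S3 actions are the ones used in these workflows (so they assume pytorch's self-hosted fleet and its AWS credentials):

name: artifact-handoff-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  build:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: windows.4xlarge
    steps:
      - name: Build (placeholder)
        shell: bash
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          echo "fake wheel" > "${RUNNER_TEMP}/artifacts/pkg.whl"
      - uses: seemethere/upload-artifact-s3@v4
        if: always()
        with:
          name: wheel-sketch
          retention-days: 14
          if-no-files-found: error
          path: "${{ runner.temp }}/artifacts"
  test:
    needs: build
    runs-on: windows.4xlarge
    steps:
      - uses: seemethere/download-artifact-s3@v3
        with:
          name: wheel-sketch
          path: "${{ runner.temp }}/artifacts"
      - name: Test (placeholder)
        shell: bash
        run: ls "${RUNNER_TEMP}/artifacts"
  upload:
    needs: test
    runs-on: linux.2xlarge
    steps:
      - uses: seemethere/download-artifact-s3@v3
        with:
          name: wheel-sketch
          path: "${{ runner.temp }}/artifacts"
      - name: Upload (placeholder)
        shell: bash
        run: echo "binary_upload.sh would run here against ${RUNNER_TEMP}/artifacts"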
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_7-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
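Two defensive PowerShell steps run near the top of every Windows job above: one sets the LongPathsEnabled registry value so the deeply nested CUDA build trees do not hit the 260-character path limit (the linked issue #73339 is the motivation given), and one excludes the workspace from Windows Defender real-time scanning as a best-effort speed-up, with -ErrorAction Ignore so a failure cannot break the build. A standalone sketch of the pair on a stock windows-2019 runner:

name: windows-prep-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  prep:
    runs-on: windows-2019
    steps:
      - name: Enable long paths on Windows
        shell: powershell
        run: |
          # NTFS long-path support, so deeply nested build trees do not fail on MAX_PATH
          Set-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
      - name: Exclude workspace from Windows Defender scanning
        shell: powershell
        run: |
          # Best-effort speed-up; -ErrorAction Ignore keeps the job going if Defender refuses
          Add-MpPreference -ExclusionPath $(Get-Location).ToString() -ErrorAction Ignore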
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
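The "Upload binaries" step above never runs the upload script on the runner itself: it forwards credentials and routing variables into a pinned miniconda3 container with -e, bind-mounts the downloaded artifacts at /artifacts and the fresh pytorch checkout at /v, and executes binary_upload.sh from there. The sketch below reduces that invocation to its shape; the python:3.10-slim image and the "ls /artifacts" command are placeholders for the real ECR image and script:

name: containerized-upload-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  upload:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Stage a placeholder artifact
        shell: bash
        run: |
          mkdir -p "${RUNNER_TEMP}/artifacts"
          echo "fake package" > "${RUNNER_TEMP}/artifacts/pkg.txt"
      - name: Upload binaries inside a container
        shell: bash
        env:
          PKG_DIR: ${{ runner.temp }}/artifacts
          DRY_RUN: enabled          # flipped to "disabled" only for nightly/tag pushes
          UPLOAD_CHANNEL: nightly
        run: |
          # Pass routing variables through with -e and bind-mount the staged artifacts;
          # "ls /artifacts" stands in for .circleci/scripts/binary_upload.sh here.
          docker run --rm -i \
            -e DRY_RUN \
            -e UPLOAD_CHANNEL \
            -e PKG_DIR=/artifacts \
            -v "${PKG_DIR}:/artifacts" \
            -v "${GITHUB_WORKSPACE}:/v" \
            -w /v \
            python:3.10-slim \
            bash -c 'ls /artifacts'

Re-exporting PKG_DIR as /artifacts inside the container means the script only ever sees the container-side path, regardless of where the runner staged the files on the host.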
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_7-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_7-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_7-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_7-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.7" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
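Every Windows job above finishes with two if: always() steps: the first blocks for up to 120 minutes until any debugging SSH sessions opened through seemethere/add-github-ssh-key have disconnected, and the second force-kills whatever sessions remain, which matters mostly when a run is cancelled partway through. The sketch below shows that teardown in isolation; it assumes the checked-out repository is pytorch/pytorch, since the two PowerShell scripts live under its .github/scripts directory:

name: ssh-drain-teardown-sketch   # hypothetical workflow name, for illustration only
on: workflow_dispatch
jobs:
  build:
    runs-on: windows-2019
    steps:
      - uses: actions/checkout@v2
        with:
          path: pytorch
      - name: Build (placeholder)
        shell: bash
        run: echo "real build work happens here"
      - name: Wait until all sessions have drained
        shell: powershell
        working-directory: pytorch
        if: always()                # run even if the build failed or the run was cancelled
        timeout-minutes: 120
        run: |
          .github\scripts\wait_for_ssh_to_drain.ps1
      - name: Kill active ssh sessions if still around
        shell: powershell
        working-directory: pytorch
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1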
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_7-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
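The "Display EC2 information" step queries the instance metadata service at 169.254.169.254, which is only reachable from inside an EC2 instance. For reference, the same lookup with the token-based IMDSv2 flow (not what the workflow uses; shown only as a hardened variant):

    #!/usr/bin/env bash
    set -euo pipefail
    get_ec2_metadata_v2() {
      # Fetch a short-lived session token, then query the metadata category with it.
      local category=$1 token
      token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
                -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
      curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
        "http://169.254.169.254/latest/meta-data/${category}"
    }
    echo "instance-type: $(get_ec2_metadata_v2 instance-type)"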
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_8-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
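Both checkouts are scrubbed with git clean -fxd before they are reused, which matters on self-hosted runners where a previous build may have left outputs behind. The flags, spelled out:

    # Return a reused checkout to a pristine state:
    #   -f  force removal of untracked files
    #   -d  also remove untracked directories
    #   -x  also remove ignored files (build outputs, caches, *.pyc, ...)
    git -C pytorch clean -fxd
    git -C builder clean -fxd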
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
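The Chown workspace step above wraps docker pull in a small retry() helper that re-runs the command after 1 s and then 2 s, enough to ride out transient registry hiccups. A standalone sketch with the attempt count and back-off made explicit (the limits here are illustrative, not taken from the workflow):

    #!/usr/bin/env bash
    set -euo pipefail

    # Re-run "$@" until it succeeds or max_attempts is exhausted.
    retry() {
      local max_attempts=3 delay=1 attempt=1
      until "$@"; do
        if (( attempt >= max_attempts )); then
          echo "retry: giving up after ${attempt} attempts: $*" >&2
          return 1
        fi
        sleep "${delay}"
        delay=$(( delay * 2 ))
        attempt=$(( attempt + 1 ))
      done
    }

    retry docker pull "${ALPINE_IMAGE}"   # ALPINE_IMAGE is assumed to come from the runner environment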
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
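In the Upload binaries step the actual upload runs inside a pinned miniconda3 container; the value-less -e NAME flags copy the named variables from the step's environment into the container without putting secret values on the command line, and the artifact directory is bind-mounted at /artifacts. A trimmed sketch of the pattern (the variables must be exported in the calling shell, and the :ro mount is an illustration; the real step mounts it read-write):

    # Forward secrets by name only and hand the packages to the upload script.
    docker run --rm \
      -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e ANACONDA_API_TOKEN \
      -e DRY_RUN -e PACKAGE_TYPE -e UPLOAD_CHANNEL -e UPLOAD_SUBFOLDER \
      -e PKG_DIR=/artifacts \
      -v "${RUNNER_TEMP}/artifacts:/artifacts:ro" \
      -v "${GITHUB_WORKSPACE}:/v" -w /v \
      308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \
      bash -c '.circleci/scripts/binary_upload.sh'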
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
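Every upload job finishes by stopping leftover containers and pruning images so the shared self-hosted runner starts its next job clean; the || true keeps the step green when no containers are running, and the shellcheck directive acknowledges that $(docker ps -q) is intentionally left unquoted so it expands to one argument per container id. An equivalent form that avoids the unquoted expansion (an alternative, not what the workflow uses):

    # Stop whatever is still running, then drop unused containers, images and networks.
    docker ps -q | xargs --no-run-if-empty docker stop
    docker system prune -af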
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_8-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
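The recurring Chown workspace step exists because earlier containerized steps can leave root-owned files in the bind-mounted checkout on a self-hosted runner; running a throwaway Alpine container that chowns the directory back to the invoking UID/GID lets the follow-on rm -rf of the workspace succeed without sudo. The core of it:

    # Reclaim ownership of files a previous container wrote as root.
    # ALPINE_IMAGE is provided by the runner environment in the jobs above.
    docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" \
      chown -R "$(id -u):$(id -g)" .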
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_8-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_8-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
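The Checkout PyTorch steps pin their ref with the expression github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha, which acts as an if/else (the middle operand is a non-empty SHA, so the usual falsy-fallthrough caveat of && ... || does not bite here). The same choice written out in bash, purely as an illustration (PR_HEAD_SHA stands in for github.event.pull_request.head.sha):

    # Build the PR head commit on pull_request events, the pushed commit otherwise.
    if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
      CHECKOUT_REF="${PR_HEAD_SHA}"
    else
      CHECKOUT_REF="${GITHUB_SHA}"
    fi
    git -C pytorch checkout "${CHECKOUT_REF}"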
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
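Each build/test/upload triple below follows the same template and exchanges exactly one artifact whose name encodes the package type, Python version, and accelerator (wheel-py3_9-cpu, wheel-py3_9-cuda11_3, ...). A hypothetical helper that reproduces the naming scheme from the per-job environment shown above; it illustrates the convention only and is not the generator that emits this workflow:

    # e.g. PACKAGE_TYPE=wheel DESIRED_PYTHON=3.9 DESIRED_CUDA=cu113 -> wheel-py3_9-cuda11_3
    artifact_name() {
      local py="${DESIRED_PYTHON//./_}"       # "3.9"   -> "3_9"
      local accel="${DESIRED_CUDA}"           # "cpu", "cu113", "cu116", ...
      if [[ "${accel}" == cu* ]]; then
        local ver="${accel#cu}"               # "113"
        accel="cuda${ver:0:2}_${ver:2}"       # "cuda11_3"
      fi
      echo "${PACKAGE_TYPE}-py${py}-${accel}"
    }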
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_9-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
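The TODO in the env block above notes that DESIRED_CUDA (cpu, cu113, cu116) is a legacy duplicate of GPU_ARCH_TYPE / GPU_ARCH_VERSION (cuda / 11.3, 11.6). The mapping the comment implies, sketched in bash (the workflow itself simply hard-codes both values):

    # Illustrative only: GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION=11.3 -> DESIRED_CUDA=cu113
    if [[ "${GPU_ARCH_TYPE}" == "cuda" ]]; then
      DESIRED_CUDA="cu${GPU_ARCH_VERSION//./}"
    else
      DESIRED_CUDA="cpu"
    fi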
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_9-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_9-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_9-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_9-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
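The `Chown workspace` steps above define a tiny inline `retry` helper before pulling the Alpine image. As a reference, the same idiom can be written as a reusable function; the sketch below is illustrative only, with the attempt count and 1s/2s delays simply mirroring the inline one-liner above.

    #!/usr/bin/env bash
    # Generic form of the inline retry helper used by the upload jobs above (illustrative).
    set -euo pipefail

    retry() {
      # Up to three attempts, sleeping 1s then 2s between them, mirroring:
      #   "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
      local attempt
      for attempt in 1 2 3; do
        "$@" && return 0
        [ "$attempt" -lt 3 ] && sleep "$attempt"
      done
      return 1
    }

    # Example: pull the helper image used by the chown step (image name taken from the workflow env).
    retry docker pull "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"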
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
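The `Populate binary env` steps above rely on the GitHub Actions convention that any `KEY=value` line appended to the file named by `$GITHUB_ENV` becomes an environment variable for the subsequent steps of the same job. The sketch below shows that mechanism in isolation; the fallback scratch file and the `cat` at the end are assumptions for running it outside of Actions.

    #!/usr/bin/env bash
    # Illustrates the $GITHUB_ENV mechanism used by the "Populate binary env" steps above.
    set -euo pipefail

    # Outside of Actions there is no GITHUB_ENV, so fall back to a scratch file for demonstration.
    GITHUB_ENV="${GITHUB_ENV:-/tmp/github_env_demo}"
    RUNNER_TEMP="${RUNNER_TEMP:-/tmp}"

    # Each appended KEY=value line is exported into the environment of later steps in the job.
    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"

    cat "${GITHUB_ENV}"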
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
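The `Upload binaries` step just above passes secrets into the upload container with `docker run -e NAME` (no `=value`), so the values are copied from the job environment rather than appearing in the command line. A minimal illustration of that forwarding follows; the variable name and the public `alpine:3.16` image are throwaway stand-ins, not part of the workflow.

    #!/usr/bin/env bash
    # Demonstrates the "-e NAME" environment forwarding used by the Upload binaries steps (illustrative).
    set -euo pipefail

    export EXAMPLE_TOKEN="not-a-real-secret"   # hypothetical stand-in for ANACONDA_API_TOKEN etc.

    # With no value after -e, docker copies EXAMPLE_TOKEN from this shell's environment
    # into the container; the secret never appears in `docker run`'s argv.
    docker run --rm -e EXAMPLE_TOKEN alpine:3.16 sh -c 'echo "token length: ${#EXAMPLE_TOKEN}"'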
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cuda11_3-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_10-cuda11_3 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_3-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_3-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_3 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_3-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_3-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu113 + GPU_ARCH_VERSION: 11.3 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_3 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + wheel-py3_10-cuda11_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: seemethere/upload-artifact-s3@v4 + if: always() + with: + name: wheel-py3_10-cuda11_6 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_6-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_6-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails + - name: Disables Windows Defender scheduled and real-time scanning for files in pytorch directory. 
+ shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring() -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_6 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_6-upload: # Uploading + runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_6-test + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu116 + GPU_ARCH_VERSION: 11.6 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + - name: Setup Linux + uses: ./.github/actions/setup-linux + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Clone pytorch/pytorch + uses: actions/checkout@v2 + - uses: seemethere/download-artifact-s3@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_6 + path: "${{ runner.temp }}/artifacts/" + - name: Set DRY_RUN (only for tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} + run: | + # reference ends with an RC suffix + if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + - name: Upload binaries + env: + PKG_DIR: "${{ runner.temp }}/artifacts" + UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" + # When running these on pull_request events these should be blank + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + run: | + docker run --rm -i \ + -e ANACONDA_API_TOKEN \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e DRY_RUN \ + -e PACKAGE_TYPE \ + -e PKG_DIR=/artifacts \ + -e UPLOAD_CHANNEL \ + -e UPLOAD_SUBFOLDER \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -v "${GITHUB_WORKSPACE}:/v" \ + -w /v \ + 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ + bash -c '.circleci/scripts/binary_upload.sh' + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
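The `Kill containers, clean up images` step that closes each upload job (it appears at the start of several hunks above and again just below) stops any leftover containers and prunes images so the self-hosted runner starts the next job clean. A standalone version of that cleanup is sketched here for reference.

    #!/usr/bin/env bash
    # Standalone form of the "Kill containers, clean up images" runner cleanup (illustrative).
    set -euo pipefail

    # "docker ps -q" may print nothing; word-splitting of the unquoted expansion is intentional,
    # which is why the workflow silences shellcheck SC2046 for the same command.
    # shellcheck disable=SC2046
    docker stop $(docker ps -q) || true

    # Remove all unused images, containers, networks and build cache.
    docker system prune -af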
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.github/workflows/generated-windows-binary-wheel.yml b/.github/workflows/generated-windows-binary-wheel.yml deleted file mode 100644 index afce9a010bb8..000000000000 --- a/.github/workflows/generated-windows-binary-wheel.yml +++ /dev/null @@ -1,4426 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-wheel - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IN_CI: 1 - IS_GHA: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_RETRY_TEST_CASES: 1 - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 -concurrency: - group: windows-binary-wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_7-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
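The `Log in to ECR` step in the deleted workflow above pipes a short-lived password from the AWS CLI into `docker login` over stdin. A condensed sketch of that flow follows; it resolves the account id with the CLI's `--query` flag instead of the `grep | cut` parsing used in the step, and omits the retry wrapper for brevity.

    #!/usr/bin/env bash
    # Condensed version of the "Log in to ECR" step shown above (illustrative).
    set -euo pipefail

    AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
    # Resolve the numeric account id of the current credentials.
    AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"

    # get-login-password prints a temporary token; docker login reads it on stdin.
    aws ecr get-login-password --region "$AWS_DEFAULT_REGION" \
      | docker login --username AWS --password-stdin \
          "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"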
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_7-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
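The upload jobs above wrap their flaky network calls (the ECR login and the docker pull) in a small inline retry helper. A minimal standalone sketch of the same back-off pattern, assuming a placeholder image tag in place of the workflow's ${ALPINE_IMAGE}:

    #!/usr/bin/env bash
    set -euo pipefail

    # Retry a command up to three times with a short, growing back-off,
    # mirroring the inline helper defined in the steps above.
    retry () {
      "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
    }

    # Usage: wrap any command that can fail transiently.
    retry docker pull alpine:3.14   # placeholder image; the job pulls "${ALPINE_IMAGE}"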
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_7-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
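The paired "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps in every upload job apply the same ref-matching rules. A condensed sketch of that decision logic as plain bash, assuming workflow-level defaults of DRY_RUN=enabled and UPLOAD_CHANNEL=nightly (those defaults are not visible in this hunk) and omitting the event_name == push guard:

    #!/usr/bin/env bash
    set -euo pipefail

    # Real uploads happen only for pushes to the nightly branch or to release
    # tags; ciflow/ trigger tags stay in dry-run mode.
    ref="${GITHUB_REF:-refs/heads/main}"        # default only for local experiments
    ref_name="${GITHUB_REF_NAME:-${ref##*/}}"

    DRY_RUN=enabled
    UPLOAD_CHANNEL=nightly
    if [[ "${ref}" == refs/heads/nightly ]] ||
       { [[ "${ref}" == refs/tags/* ]] && [[ "${ref}" != refs/tags/ciflow/* ]]; }; then
      DRY_RUN=disabled
    fi
    # Release-candidate tags (*-rc<N>) are routed to the "test" channel.
    if [[ "${ref_name}" == *-rc[0-9]* ]]; then
      UPLOAD_CHANNEL=test
    fi
    echo "DRY_RUN=${DRY_RUN} UPLOAD_CHANNEL=${UPLOAD_CHANNEL}"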
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_7-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_7-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_7-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_7-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.7" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_7-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
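The "Upload binaries" step hands credentials and packaging parameters to a one-shot container purely through -e flags, which is why the comment above expects them to be blank on pull_request events: nothing secret is baked into the image or left in the workspace. A stripped-down sketch of that pattern; the UPLOAD_IMAGE variable is a placeholder for the pinned ECR miniconda image the workflow uses:

    #!/usr/bin/env bash
    set -euo pipefail

    # Forward only the variables the upload script needs, mount the downloaded
    # artifacts at /artifacts, and run the script from the repo checkout at /v.
    docker run --rm -i \
      -e ANACONDA_API_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -e DRY_RUN \
      -e PACKAGE_TYPE \
      -e PKG_DIR=/artifacts \
      -e UPLOAD_CHANNEL \
      -e UPLOAD_SUBFOLDER \
      -v "${RUNNER_TEMP}/artifacts:/artifacts" \
      -v "${GITHUB_WORKSPACE}:/v" \
      -w /v \
      "${UPLOAD_IMAGE:-registry.example.com/miniconda3:latest}" \
      bash -c '.circleci/scripts/binary_upload.sh'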
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - 
retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
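Each "Populate binary env" step persists paths for later steps by appending KEY=value lines to the file named by $GITHUB_ENV. As written in these jobs, the WIN_PACKAGE_WORK_DIR line appears to only echo to stdout (it has no >> "${GITHUB_ENV}" redirect), so it would not reach subsequent steps; a small sketch of the presumably intended pattern:

    #!/usr/bin/env bash
    set -euo pipefail

    # Lines appended to the $GITHUB_ENV file become environment variables for
    # every later step in the same job.
    {
      echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env"
      echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts"
      echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
    } >> "${GITHUB_ENV}"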
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
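The "Log in to ECR" step recovers the account ID by grepping the CLI's pretty-printed JSON. An equivalent sketch that leans on the CLI's own --query/--output flags instead of grep and cut; this is an editorial variant, not what the workflow runs:

    #!/usr/bin/env bash
    set -euo pipefail

    # Ask the CLI for just the Account field as plain text.
    AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"

    # Same login flow as the workflow step, minus the retry wrapper.
    aws ecr get-login-password --region "${AWS_DEFAULT_REGION}" |
      docker login --username AWS --password-stdin \
        "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"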
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
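The recurring "Kill containers, clean up images" step leaves $(docker ps -q) unquoted on purpose and silences SC2046, relying on "|| true" when the container list is empty. A sketch of an equivalent cleanup that handles the empty case explicitly instead; again a suggestion, not the workflow's own script:

    #!/usr/bin/env bash
    set -euo pipefail

    # Stop containers only if any are running, then prune unused images.
    running="$(docker ps -q)"
    if [[ -n "${running}" ]]; then
      # shellcheck disable=SC2086  # splitting the ID list into words is intended
      docker stop ${running} || true
    fi
    docker system prune -af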
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_8-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_8-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_8-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_8-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
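# --- Editor's annotation (not part of the diffed workflow) ---
# The jobs in this hunk follow one repeated triple per wheel configuration:
#   wheel-<py>-<arch>-build  -> builds the wheel on windows.4xlarge and stores it as an
#                               S3-backed artifact (seemethere/upload-artifact-s3)
#   wheel-<py>-<arch>-test   -> downloads that artifact and runs binary_windows_test.sh,
#                               on windows.8xlarge.nvidia.gpu for CUDA configs and
#                               windows.4xlarge for CPU configs
#   wheel-<py>-<arch>-upload -> runs on linux.2xlarge and pushes the artifact via
#                               .circleci/scripts/binary_upload.sh inside a miniconda3 container
# Only the env block (DESIRED_CUDA, GPU_ARCH_VERSION, DESIRED_PYTHON, ...) changes between
# copies; the remaining steps are identical generated boilerplate.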
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - 
retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
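# --- Editor's annotation (illustrative sketch, not part of the diffed workflow) ---
# The upload jobs inline the same fixed-backoff helper twice per job:
#   retry () { "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") }
# i.e. up to three attempts with 1 s and 2 s pauses. The generalized form below is only a
# sketch of that pattern; the function name and the RETRY_ATTEMPTS knob are the editor's
# assumptions, not variables the workflow defines.
retry_with_backoff() {
  local attempts=${RETRY_ATTEMPTS:-3}  # assumed knob; the workflow hard-codes 3 tries
  local delay=1
  local i
  for ((i = 1; i <= attempts; i++)); do
    "$@" && return 0                   # success: stop retrying
    (( i == attempts )) && return 1    # out of attempts: report failure
    sleep "${delay}"
    delay=$((delay * 2))               # 1 s, 2 s, 4 s, ...
  done
}
# e.g. retry_with_backoff docker pull "${ALPINE_IMAGE}"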
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
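# --- Editor's annotation (not part of the diffed workflow) ---
# In the "Populate binary env" steps of the build and test jobs above, only the lines
# redirected with >> "${GITHUB_ENV}" (BINARY_ENV_FILE, PYTORCH_FINAL_PACKAGE_DIR) are
# exported to later steps of the job; GitHub Actions reads that file between steps.
# A plain
#   echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
# only writes to the step log, so if WIN_PACKAGE_WORK_DIR is meant to persist it would
# also need the >> "${GITHUB_ENV}" redirection (whether that is intended cannot be told
# from the diff alone, so the hunk is left as-is).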
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_9-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_9-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from 
instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_9-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown 
workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_9-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
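# --- Editor's annotation (illustrative sketch, not part of the diffed workflow) ---
# The "Set DRY_RUN" / "Set UPLOAD_CHANNEL" steps above only fire on push events: a push to
# refs/heads/nightly or to a non-ciflow tag turns DRY_RUN off, and a non-ciflow tag whose
# name ends in -rc<N> routes the upload to the "test" channel. The same decision, condensed
# into one shell function for readability (the function name is the editor's; defaults for
# both variables come from elsewhere in the workflow and are not shown):
resolve_upload_mode() {
  local ref=${GITHUB_REF:?}            # e.g. refs/heads/nightly, refs/tags/v1.11.0-rc3
  local ref_name=${GITHUB_REF_NAME:?}  # e.g. nightly, v1.11.0-rc3
  local tagged=false
  if [[ ${ref} == refs/tags/* && ${ref} != refs/tags/ciflow/* ]]; then
    tagged=true
  fi
  if [[ ${ref} == refs/heads/nightly || ${tagged} == true ]]; then
    echo "DRY_RUN=disabled"
  fi
  if [[ ${tagged} == true && ${ref_name} == *-rc[0-9]* ]]; then
    echo "UPLOAD_CHANNEL=test"
  fi
}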
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cpu-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - 
retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cpu - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cuda11_1-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cuda11_1 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_1-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled 
from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_1 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_1-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_1-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu111 - GPU_ARCH_VERSION: 11.1 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: 
Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_1 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
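The ECR-login and workspace-chown steps above both define the same inline `retry` helper, which re-runs a flaky command up to three times with a short back-off before giving up. A minimal sketch of that pattern as a self-contained step is shown below; the step name and the pulled image are illustrative assumptions, not part of the workflow being diffed.

      - name: Pull helper image with retries (illustrative sketch)
        shell: bash
        run: |
          set -euo pipefail
          # Run the command; on failure retry after 1s, then once more after 2s.
          retry () {
            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
          }
          # ALPINE_IMAGE is assumed to be provided at the workflow level, as in the jobs above.
          retry docker pull "${ALPINE_IMAGE}"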
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cuda11_3-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cuda11_3 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_3-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_3-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled 
from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_3 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_3-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_3-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu113 - GPU_ARCH_VERSION: 11.3 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: 
Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_3 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
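The "Set DRY_RUN" and "Set UPLOAD_CHANNEL" steps above control whether, and where, the upload step really publishes: only pushes to the nightly branch or to non-ciflow tags disable the dry run, and tags carrying an -rcN suffix are redirected to the test channel. A rough sketch of the same decisions collapsed into one illustrative bash step (the step itself is not in the workflow above) would be:

      - name: Decide upload mode (illustrative sketch)
        shell: bash
        run: |
          # Mirrors the `if:` expressions above: real uploads only for pushes to
          # the nightly branch or to tags that are not ciflow/* tags.
          if [[ "${GITHUB_EVENT_NAME}" == "push" ]]; then
            if [[ "${GITHUB_REF}" == "refs/heads/nightly" ]] || { [[ "${GITHUB_REF}" == refs/tags/* ]] && [[ "${GITHUB_REF}" != refs/tags/ciflow/* ]]; }; then
              echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
            fi
          fi
          # Release candidates (e.g. a tag named like v1.x.y-rc3) go to the test channel.
          if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
          fi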
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - wheel-py3_10-cuda11_5-build: - runs-on: windows.4xlarge - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: seemethere/upload-artifact-s3@v3 - if: always() - with: - name: wheel-py3_10-cuda11_5 - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_5-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_5-build - runs-on: windows.8xlarge.nvidia.gpu - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled 
from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_5 - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_5-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_5-test - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu115 - GPU_ARCH_VERSION: 11.5 - GPU_ARCH_TYPE: cuda - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: 
Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download Build Artifacts - with: - name: wheel-py3_10-cuda11_5 - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
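Each wheel-py3_10-cuda* group above is the same three-stage chain: the build job publishes the wheel to S3 under a shared artifact name, the test job downloads it onto a GPU runner, and the upload job (gated on `github.repository_owner == 'pytorch'`) publishes it only after the test job succeeds. A stripped-down sketch of that skeleton, with placeholder job and artifact names, looks roughly like this:

  example-wheel-build:
    runs-on: windows.4xlarge
    steps:
      # ... build the wheel, then publish it under a shared artifact name
      - uses: seemethere/upload-artifact-s3@v3
        with:
          name: example-wheel-artifact
          path: "${{ runner.temp }}/artifacts"

  example-wheel-test:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: example-wheel-build              # only runs once the build succeeded
    runs-on: windows.8xlarge.nvidia.gpu
    steps:
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        with:
          name: example-wheel-artifact
          path: "${{ runner.temp }}/artifacts"
      # ... install the downloaded wheel and run smoke tests

  example-wheel-upload:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: example-wheel-test               # upload only after tests pass
    runs-on: linux.2xlarge
    steps:
      - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b
        with:
          name: example-wheel-artifact
          path: "${{ runner.temp }}/artifacts"
      # ... push the wheel to its release channel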
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d98a81da5e9b..6876a2bfc36f 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,22 +1,79 @@ name: Lint on: + pull_request: push: branches: - master - pull_request: + - main + - release/* + workflow_dispatch: jobs: + lintrunner: + runs-on: linux.20_04.16x + steps: + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + architecture: x64 + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + + - name: Install lintrunner + run: pip install lintrunner==0.9.* + + - name: Initialize lint dependencies + run: lintrunner init + + - name: Do build steps necessary for linters + run: | + python3 -m tools.linter.clang_tidy.generate_build_files + python3 -m tools.generate_torch_version --is_debug=false + python3 -m tools.pyi.gen_pyi \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ + --deprecated-functions-path "tools/autograd/deprecated.yaml" + + - name: Run lintrunner on all files + run: | + set +e + if ! lintrunner --verbose --force-color --all-files --tee-json=lint.json; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" + exit 1 + fi + + - name: Store annotations + if: always() && github.event_name == 'pull_request' + # Don't show this as an error; the above step will have already failed. + continue-on-error: true + run: | + # Use jq to massage the JSON lint output into GitHub Actions workflow commands. + jq --raw-output \ + '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ + lint.json + quick-checks: - runs-on: ubuntu-18.04 + name: quick-checks + runs-on: linux.20_04.4x steps: - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.x architecture: x64 + # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + fetch-depth: 1 - name: Clean PyTorch checkout run: | # Remove any artifacts from the previous checkouts @@ -24,70 +81,16 @@ jobs: - name: Install requirements id: requirements run: pip3 install -r requirements.txt --user - - name: Ensure consistent CircleCI YAML config - if: ${{ always() && steps.requirements.outcome == 'success' }} - run: cd .circleci && ./ensure-consistency.py - - name: Lint native_functions.yaml - if: ${{ always() && steps.requirements.outcome == 'success' }} - run: | - pip3 install ruamel.yaml==0.17.4 --user - .github/scripts/lint_native_functions.py - - name: Ensure correct trailing newlines - if: ${{ always() && steps.requirements.outcome == 'success' }} - run: | - (! git --no-pager grep -Il '' -- . 
':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude)**.expect' ':(exclude)**.ipynb' ':(exclude)tools/clang_format_hash' | tools/linter/trailing_newlines.py || (echo "The above files do not have correct trailing newlines; please normalize them"; false)) - - name: Ensure no trailing spaces - if: always() - run: | - (! git --no-pager grep -In '[[:blank:]]$' -- . ':(exclude)**/contrib/**' ':(exclude)**.diff' ':(exclude)third_party' || (echo "The above lines have trailing spaces; please remove them"; false)) - - name: Ensure no tabs - if: always() - run: | - (! git --no-pager grep -In $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above lines have tabs; please convert them to spaces"; false)) - name: Ensure no non-breaking spaces if: always() run: | # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2 # does not support the '\u000a' syntax (which is relevant for local linters) (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false)) - - name: Ensure canonical include - if: always() - run: | - (! git --no-pager grep -In $'#include "' -- ./c10 ./aten ./torch/csrc ':(exclude)aten/src/ATen/native/quantized/cpu/qnnpack/**' ':(exclude)torch/csrc/jit/serialization/mobile_bytecode_generated.h'|| (echo "The above lines have include with quotes; please convert them to #include "; false)) - name: Ensure no versionless Python shebangs if: always() run: | (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false)) - - name: Ensure no unqualified noqa - if: always() - run: | - # shellcheck disable=SC2016 - (! git --no-pager grep -InP '# noqa(?!: [A-Z]+\d{3})' -- '**.py' '**.pyi' ':(exclude)caffe2' || (echo 'The above lines have unqualified `noqa`; please convert them to `noqa: XXXX`'; false)) - - name: Ensure no unqualified type ignore - if: always() - run: | - # shellcheck disable=SC2016 - (! git --no-pager grep -InP '# type:\s*ignore(?!\[)' -- '**.py' '**.pyi' ':(exclude)test/test_jit.py' || (echo 'The above lines have unqualified `type: ignore`; please convert them to `type: ignore[xxxx]`'; false)) - - name: Ensure GitHub PyPi dependencies are pinned - if: always() - run: | - (! git --no-pager grep --color=always -InP \ - '(pip|pip3|python -m pip|python3 -m pip|python3 -mpip|python -mpip) install ([a-z][\.a-z-0-9]*+(?!(=|.*\.whl))([[:blank:]]|))+' \ - -- .github \ - ':(exclude)**.rst' \ - ':(exclude)**.py' \ - ':(exclude)**.md' \ - ':(exclude)**.diff' \ - ':(exclude)third_party' || - (echo "The above lines have unpinned PyPi installs; please pin them to a specific version: e.g. 'thepackage==1.2'"; false)) - # note that this next step depends on a clean checkout; - # if you run it locally then it will likely to complain - # about all the generated files in torch/test - - name: Ensure C++ source files are not executable - if: always() - run: | - # shellcheck disable=SC2016 - (! find . \( -path ./third_party -o -path ./.git -o -path ./torch/bin -o -path ./build \) -prune -o -type f -executable -regextype posix-egrep -not -regex '.+(\.(bash|sh|py|so)|git-pre-commit|git-clang-format|gradlew)$' -print | grep . 
|| (echo 'The above files have executable permission; please remove their executable permission by using `chmod -x`'; false)) - name: C++ docs check if: ${{ always() && steps.requirements.outcome == 'success' }} run: | @@ -98,89 +101,22 @@ jobs: run: | set -eux python torch/testing/_check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt - - name: Ensure no direct cub include - if: always() - run: | - (! git --no-pager grep -I -no $'#include commit-sha.txt - - name: Install dependencies - run: | - set -eux - pip3 install typing-extensions==3.10 --user # for tools/linter/translate_annotations.py - pip3 install -r requirements-flake8.txt --user - flake8 --version - - name: Run flake8 - run: | - set -eux - flake8 | tee "${GITHUB_WORKSPACE}"/flake8-output.txt - - name: Translate annotations - if: ${{ github.event_name == 'pull_request' }} - env: - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - run: | - tools/linter/translate_annotations.py \ - --file="${GITHUB_WORKSPACE}"/flake8-output.txt \ - --regex='^(?P.*?):(?P\d+):(?P\d+): (?P\w+\d+) (?P.*)' \ - --commit="$HEAD_SHA" \ - > flake8-output/annotations.json - - name: Fail if there were any warnings - run: | - set -eu - # Re-output flake8 status so GitHub logs show it on the step that actually failed - cat "${GITHUB_WORKSPACE}"/flake8-output.txt - if [ -s "${GITHUB_WORKSPACE}"/flake8-output.txt ]; then - echo 'Please fix the above Flake8 warnings.' - false - fi - - name: Add annotations - # Don't run on forked pull requests - if: ${{ failure() && github.event.pull_request.head.repo.full_name == github.repository }} - uses: pytorch/add-annotations-github-action@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - check_name: 'flake8-py3' - linter_output_path: flake8-output/annotations.json - commit_sha: ${{ github.event.pull_request.head.sha }} - mode: json - - clang-tidy: - runs-on: linux.2xlarge - container: - # ubuntu20.04-cuda11.2-py3.8-tidy11 - image: ghcr.io/pytorch/cilint-clang-tidy:d8f0c777964d0dd8a147360de80aed1a13eb613a - steps: - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" + # [see note: pytorch repo ref] + # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master with: - fetch-depth: 0 # to allow tools/linter/clang_tidy.py to do its thing - - name: Prepare output dir with HEAD commit SHA - env: - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - run: | - cd "${GITHUB_WORKSPACE}" - mkdir clang-tidy-output - cd clang-tidy-output - echo "$HEAD_SHA" > commit-sha.txt - - name: Fetch PR diff - if: ${{ github.event_name == 'pull_request' }} - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - cd "${GITHUB_WORKSPACE}" - wget -O pr.diff "https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/$PR_NUMBER.diff" - - name: Generate build files - run: | - cd "${GITHUB_WORKSPACE}" - python3 -m tools.linter.clang_tidy.generate_build_files - - name: Run PR clang-tidy - if: ${{ github.event_name == 'pull_request' }} - run: | - cd "${GITHUB_WORKSPACE}" - - # The Docker image has our custom build, so we don't need to install it - python3 -m tools.linter.clang_tidy \ - --clang-tidy-exe "$(which clang-tidy)" \ - --diff-file pr.diff \ - --disable-progress-bar 2>&1 | tee "${GITHUB_WORKSPACE}"/clang-tidy-output.txt - - # Run clang-tidy on a smaller subset of the 
codebase on master until we - # make the repository clang-tidy clean - - name: Run master clang-tidy - run: | - cd "${GITHUB_WORKSPACE}" - - python3 -m tools.linter.clang_tidy \ - --paths \ - torch/csrc/fx \ - torch/csrc/utils \ - torch/csrc/generic \ - torch/csrc/deploy \ - torch/csrc/tensor \ - --clang-tidy-exe "$(which clang-tidy)" \ - --disable-progress-bar 2>&1 | tee -a "${GITHUB_WORKSPACE}"/clang-tidy-output.txt - - - name: Annotate output - if: ${{ github.event_name == 'pull_request' }} - env: - HEAD_SHA: ${{ github.event.pull_request.head.sha }} - run: | - cd "${GITHUB_WORKSPACE}" - sed --in-place 's/^\.\.\///g' clang-tidy-output.txt - tools/linter/translate_annotations.py \ - --file=clang-tidy-output.txt \ - --regex='^(?P.*?):(?P\d+):(?P\d+): (?P.*?) \[(?P.*)\]' \ - --commit="$HEAD_SHA" \ - > clang-tidy-output/annotations.json - - name: Check for warnings - run: | - cd "${GITHUB_WORKSPACE}" - set -eu - cat "${GITHUB_WORKSPACE}"/clang-tidy-output.txt - if grep -Fq "Warnings detected!" "${GITHUB_WORKSPACE}"/clang-tidy-output.txt; then - echo 'Please fix the above clang-tidy warnings.' - false - fi - - name: Add annotations - # Don't run on forked pull requests - if: ${{ failure() && github.event.pull_request.head.repo.full_name == github.repository }} - uses: pytorch/add-annotations-github-action@master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - check_name: 'clang-tidy' - linter_output_path: clang-tidy/annotations.json - commit_sha: ${{ github.event.pull_request.head.sha }} - mode: json - - cmakelint: - runs-on: ubuntu-18.04 - steps: - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.x - architecture: x64 - - name: Fetch PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + submodules: false - name: Install dependencies + # mypy and boto3 versions copied from + # .circleci/docker/common/install_conda.sh run: | set -eux - pip3 install cmakelint==1.4.1 --user - cmakelint --version - - name: Run cmakelint - run: | - set -eux - git ls-files -z -- bootstrap '*.cmake' '*.cmake.in' '*CMakeLists.txt' | \ - grep -E -z -v '^(cmake/Modules/|cmake/Modules_CUDA_fix/|cmake/Caffe2Config.cmake.in|aten/src/ATen/ATenConfig.cmake.in|cmake/Caffe2ConfigVersion.cmake.in|cmake/TorchConfig.cmake.in|cmake/TorchConfigVersion.cmake.in|cmake/cmake_uninstall.cmake.in)' | \ - xargs -0 cmakelint --config=.cmakelintrc --spaces=2 --quiet - - mypy: - runs-on: ubuntu-18.04 + python3 -mpip install -r requirements.txt + python3 -mpip install boto3==1.16.34 + pip3 install typing-extensions==3.10 --user + pip3 install -r requirements-flake8.txt --user + python3 -mpip install -r requirements.txt --user + python3 -mpip install mypy==0.812 --user + make setup_lint + - name: Test tools + run: | + python3 -m unittest discover -vs tools/test -p 'test_*.py' + python3 -m unittest discover -vs .github/scripts -p 'test_*.py' + + test_collect_env: + if: ${{ github.repository == 'pytorch/pytorch' }} + name: Test collect_env + runs-on: linux.20_04.4x + strategy: + matrix: + with_torch: [with_torch, without_torch] steps: - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.8 architecture: x64 - - name: Fetch PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Install dependencies - run: | - set -eux - python3 -mpip install -r requirements.txt --user - python3 -mpip install numpy==1.20 --user # https://github.com/pytorch/pytorch/pull/60472 - python3 -mpip install expecttest==0.1.3 mypy==0.812 --user - # 
Needed to check tools/render_junit.py - python3 -mpip install junitparser==2.1.1 rich==10.9.0 --user - - name: Run autogen + # [see note: pytorch repo ref] + # deep clone (fetch-depth 0) required, to allow us to use git log + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + with: + submodules: false + fetch-depth: 1 + - name: Install torch + if: matrix.with_torch == 'with_torch' run: | - set -eux - time python3 -mtools.generate_torch_version --is_debug=false - time python3 -mtools.codegen.gen -s aten/src/ATen -d build/aten/src/ATen - time python3 -mtools.pyi.gen_pyi --native-functions-path aten/src/ATen/native/native_functions.yaml --deprecated-functions-path "tools/autograd/deprecated.yaml" - - name: Run mypy - env: - MYPY_FORCE_COLOR: 1 - TERM: xterm-color + # Doesn't really matter what torch version, we just need ANY torch installed + pip install 'torch==1.*' + - name: Run collect_env.py run: | - set -eux - STATUS= - for CONFIG in mypy*.ini; do - if ! python3 -mmypy --config="$CONFIG"; then - STATUS=fail - fi - done - if [ -n "$STATUS" ]; then - echo 'Please fix the above mypy warnings.' - false - fi + # All we need to see is that it passes + python3 torch/utils/collect_env.py concurrency: - group: lint-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 000000000000..3322b2097a17 --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,33 @@ +name: nightly + +on: + schedule: + - cron: 0 0 * * * + push: + tags: + - ciflow/nightly/* + workflow_dispatch: + + +concurrency: + group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + docs-build: + name: docs build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + docs-push: + name: docs push + uses: ./.github/workflows/_docs.yml + needs: docs-build + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.docs-build.outputs.docker-image }} + push: true + secrets: + GH_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml new file mode 100644 index 000000000000..ad3908e5f5cd --- /dev/null +++ b/.github/workflows/periodic.yml @@ -0,0 +1,202 @@ +name: periodic + +on: + schedule: + - cron: 45 0,4,8,12,16,20 * * * + push: + tags: + - ciflow/periodic/* + - ciflow/all/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + linux-bionic-cuda11_6-py3_7-gcc7-build: + name: linux-bionic-cuda11.6-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-cuda11.6-py3.7-gcc7 + docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 + + linux-bionic-cuda11_6-py3_7-gcc7-test: + name: linux-bionic-cuda11.6-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-cuda11_6-py3_7-gcc7-build + with: + build-environment: linux-bionic-cuda11.6-py3.7-gcc7 + docker-image: ${{ 
needs.linux-bionic-cuda11_6-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + ]} + + libtorch-linux-bionic-cuda11_6-py3_7-gcc7-build: + name: libtorch-linux-bionic-cuda11.6-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: libtorch-linux-bionic-cuda11.6-py3.7-gcc7 + docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7 + build-generates-artifacts: false + + linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build: + name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 + + linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-test: + name: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build + with: + build-environment: linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck + docker-image: ${{ needs.linux-xenial-cuda10_2-py3-gcc7-slow-gradcheck-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + ]} + + linux-bionic-rocm5_1-py3_7-slow-build: + name: linux-bionic-rocm5.1-py3.7-slow + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + + linux-bionic-rocm5_1-py3_7-slow-test: + name: linux-bionic-rocm5.1-py3.7-slow + uses: ./.github/workflows/_rocm-test.yml + needs: linux-bionic-rocm5_1-py3_7-slow-build + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image: ${{ needs.linux-bionic-rocm5_1-py3_7-slow-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + linux-bionic-rocm5_1-py3_7-distributed-build: + name: linux-bionic-rocm5.1-py3.7-distributed + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + + linux-bionic-rocm5_1-py3_7-distributed-test: + name: linux-bionic-rocm5.1-py3.7-distributed + uses: ./.github/workflows/_rocm-test.yml + needs: linux-bionic-rocm5_1-py3_7-distributed-build + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image: ${{ needs.linux-bionic-rocm5_1-py3_7-distributed-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + linux-xenial-cuda11_3-py3_7-gcc7-debug-build: + name: linux-xenial-cuda11.3-py3.7-gcc7-debug + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: 
linux-xenial-cuda11.3-py3.7-gcc7-debug + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + build-with-debug: true + + linux-xenial-cuda11_3-py3_7-gcc7-debug-test: + name: linux-xenial-cuda11.3-py3.7-gcc7-debug + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-cuda11_3-py3_7-gcc7-debug-build + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7-debug + docker-image: ${{ needs.linux-xenial-cuda11_3-py3_7-gcc7-debug-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + ]} + + win-vs2019-cuda11_6-py3-build: + name: win-vs2019-cuda11.6-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cuda11.6-py3 + cuda-version: "11.6" + + win-vs2019-cuda11_6-py3-test: + name: win-vs2019-cuda11.6-py3 + uses: ./.github/workflows/_win-test.yml + needs: win-vs2019-cuda11_6-py3-build + with: + build-environment: win-vs2019-cuda11.6-py3 + cuda-version: "11.6" + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, + ]} + + ios-12-5-1-arm64: + name: ios-12-5-1-arm64 + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64 + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + ios-12-5-1-arm64-coreml: + name: ios-12-5-1-arm64-coreml + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64-coreml + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + ios-12-5-1-arm64-custom-ops: + name: ios-12-5-1-arm64-custom-ops + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64-custom-ops + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + ios-12-5-1-arm64-metal: + name: ios-12-5-1-arm64-metal + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-arm64-metal + ios-platform: OS + ios-arch: arm64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml index af2acc1101e7..7313d0b8e968 100644 --- a/.github/workflows/pr-labels.yml +++ b/.github/workflows/pr-labels.yml @@ -4,6 +4,7 @@ on: push: branches: - master + - main jobs: is-properly-labeled: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml new file mode 100644 index 000000000000..ebc936bcd5ed --- /dev/null +++ b/.github/workflows/pull.yml @@ -0,0 +1,325 @@ +name: pull + +on: + pull_request: + 
push: + branches: + - master + - main + - release/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + linux-xenial-py3_7-gcc5_4-build: + name: linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + linux-xenial-py3_7-gcc5_4-test: + name: linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-gcc5_4-build + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.linux-xenial-py3_7-gcc5_4-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "docs_test", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + linux-docs: + name: linux-docs + uses: ./.github/workflows/_docs.yml + needs: linux-xenial-py3_7-gcc5_4-build + with: + build-environment: linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.linux-xenial-py3_7-gcc5_4-build.outputs.docker-image }} + + linux-xenial-py3_7-gcc7-build: + name: linux-xenial-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-py3.7-gcc7 + + linux-xenial-py3_7-gcc7-test: + name: linux-xenial-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-gcc7-build + with: + build-environment: linux-xenial-py3.7-gcc7 + docker-image: ${{ needs.linux-xenial-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + linux-xenial-py3_7-clang7-asan-build: + name: linux-xenial-py3.7-clang7-asan + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-clang7-asan + docker-image-name: pytorch-linux-xenial-py3-clang7-asan + + linux-xenial-py3_7-clang7-asan-test: + name: linux-xenial-py3.7-clang7-asan + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-clang7-asan-build + with: + build-environment: linux-xenial-py3.7-clang7-asan + docker-image: ${{ needs.linux-xenial-py3_7-clang7-asan-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "linux.2xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "linux.2xlarge" }, + ]} + + linux-xenial-py3_7-gcc7-no-ops: + name: linux-xenial-py3.7-gcc7-no-ops + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc7-no-ops + docker-image-name: pytorch-linux-xenial-py3.7-gcc7 + + linux-xenial-py3_7-clang7-onnx-build: + name: linux-xenial-py3.7-clang7-onnx + uses: ./.github/workflows/_linux-build.yml + 
with: + build-environment: linux-xenial-py3.7-clang7-onnx + docker-image-name: pytorch-linux-xenial-py3-clang7-onnx + + linux-xenial-py3_7-clang7-onnx-test: + name: linux-xenial-py3.7-clang7-onnx + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-py3_7-clang7-onnx-build + with: + build-environment: linux-xenial-py3.7-clang7-onnx + docker-image: ${{ needs.linux-xenial-py3_7-clang7-onnx-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + linux-bionic-py3_7-clang9-build: + name: linux-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-py3.7-clang9 + docker-image-name: pytorch-linux-bionic-py3.7-clang9 + + linux-bionic-py3_7-clang9-test: + name: linux-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-py3_7-clang9-build + with: + build-environment: linux-bionic-py3.7-clang9 + docker-image: ${{ needs.linux-bionic-py3_7-clang9-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + linux-bionic-cuda11_3-py3_7-clang9-build: + name: linux-bionic-cuda11.3-py3.7-clang9 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-cuda11.3-py3.7-clang9 + docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9 + + linux-vulkan-bionic-py3_7-clang9-build: + name: linux-vulkan-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-vulkan-bionic-py3.7-clang9 + docker-image-name: pytorch-linux-bionic-py3.7-clang9 + + linux-vulkan-bionic-py3_7-clang9-test: + name: linux-vulkan-bionic-py3.7-clang9 + uses: ./.github/workflows/_linux-test.yml + needs: linux-vulkan-bionic-py3_7-clang9-build + with: + build-environment: linux-vulkan-bionic-py3.7-clang9 + docker-image: ${{ needs.linux-vulkan-bionic-py3_7-clang9-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + linux-xenial-cuda11_3-py3_7-gcc7-build: + name: linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + linux-xenial-cuda11_3-py3_7-gcc7-test: + name: linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-xenial-cuda11_3-py3_7-gcc7-build + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7 + docker-image: ${{ needs.linux-xenial-cuda11_3-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 2, runner: 
"linux.8xlarge.nvidia.gpu" }, + ]} + + linux-bionic-rocm5_1-py3_7-build: + name: linux-bionic-rocm5.1-py3.7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image-name: pytorch-linux-bionic-rocm5.1-py3.7 + + linux-bionic-rocm5_1-py3_7-test: + name: linux-bionic-rocm5.1-py3.7 + uses: ./.github/workflows/_rocm-test.yml + needs: linux-bionic-rocm5_1-py3_7-build + with: + build-environment: linux-bionic-rocm5.1-py3.7 + docker-image: ${{ needs.linux-bionic-rocm5_1-py3_7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + linux-xenial-py3-clang5-mobile-build: + name: linux-xenial-py3-clang5-mobile-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3-clang5-mobile-build + docker-image-name: pytorch-linux-xenial-py3-clang5-asan + build-generates-artifacts: false + + linux-xenial-py3-clang5-mobile-custom-build-static: + name: linux-xenial-py3-clang5-mobile-custom-build-static + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3-clang5-mobile-custom-build-static + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + build-generates-artifacts: false + + pytorch-xla-linux-bionic-py3_7-clang8-build: + name: pytorch-xla-linux-bionic-py3.7-clang8 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: pytorch-xla-linux-bionic-py3.7-clang8 + docker-image-name: xla_base + + pytorch-xla-linux-bionic-py3_7-clang8-test: + name: pytorch-xla-linux-bionic-py3.7-clang8 + uses: ./.github/workflows/_linux-test.yml + needs: pytorch-xla-linux-bionic-py3_7-clang8-build + with: + build-environment: pytorch-xla-linux-bionic-py3.7-clang8 + docker-image: ${{ needs.pytorch-xla-linux-bionic-py3_7-clang8-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "xla", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + win-vs2019-cpu-py3-build: + name: win-vs2019-cpu-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cpu-py3 + cuda-version: cpu + + win-vs2019-cpu-py3-test: + name: win-vs2019-cpu-py3 + uses: ./.github/workflows/_win-test.yml + needs: win-vs2019-cpu-py3-build + with: + build-environment: win-vs2019-cpu-py3 + cuda-version: cpu + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" }, + ]} + + # please ensure that this and its corresponding job in trunk.yml are in sync + win-vs2019-cuda11_3-py3-build: + # don't run build twice on master + if: github.event_name == 'pull_request' + name: win-vs2019-cuda11.3-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cuda11.3-py3 + cuda-version: "11.3" + + linux-xenial-cuda11_3-py3_7-gcc7-bazel-test: + name: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test + uses: ./.github/workflows/_bazel-build-test.yml + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7-bazel-test + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single: + 
name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single + uses: ./.github/workflows/_android-build-test.yml + with: + build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + + pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit: + name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit + uses: ./.github/workflows/_android-build-test.yml + with: + build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + + linux-xenial-py3_7-gcc5_4-mobile-lightweight-dispatch-build: + name: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + build-generates-artifacts: false + + deploy-linux-xenial-cuda11_3-py3_7-gcc7-build: + name: deploy-linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: deploy-linux-xenial-cuda11.3-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + deploy-linux-xenial-cuda11_3-py3_7-gcc7-test: + name: linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: deploy-linux-xenial-cuda11_3-py3_7-gcc7-build + with: + build-environment: deploy-linux-xenial-cuda11.3-py3.7-gcc7 + docker-image: ${{ needs.deploy-linux-xenial-cuda11_3-py3_7-gcc7-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, + ]} diff --git a/.github/workflows/push_nightly_docker_ghcr.yml b/.github/workflows/push_nightly_docker_ghcr.yml index b11eebe3ffdf..ca30c9651ff8 100644 --- a/.github/workflows/push_nightly_docker_ghcr.yml +++ b/.github/workflows/push_nightly_docker_ghcr.yml @@ -1,22 +1,39 @@ -name: Build PyTorch nightly Docker image and push to GitHub Container Registry +name: docker-release-builds on: schedule: # Push the nightly docker daily at 1 PM UTC - cron: '0 13 * * *' + # Trigger when we modify something related to these images + pull_request: + paths: + - .github/scripts/build_publish_nightly_docker.sh + - .github/workflows/push_nightly_docker_ghcr.yml + - Dockerfile + - docker.Makefile # Have the ability to trigger this job manually using the API as well workflow_dispatch: jobs: - build-publish-docker: + docker-release-build: if: ${{ github.repository == 'pytorch/pytorch' }} runs-on: linux.2xlarge env: GHCR_PAT: ${{ secrets.GHCR_PAT }} + WITH_PUSH: ${{ github.event_name == 'schedule' }} steps: - - name: Checkout + - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: master - - name: Build and upload nightly docker - run: | - bash .github/scripts/build_publish_nightly_docker.sh + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + - uses: nick-fields/retry@71062288b76e2b6214ebde0e673ce0de1755740a + name: Build and upload nightly docker + with: + timeout_minutes: 10 + max_attempts: 3 + command: | + set -ex + bash .github/scripts/build_publish_nightly_docker.sh + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 
'workflow_dispatch' }} + cancel-in-progress: true diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml index fa5451d96951..05e7e68ff454 100644 --- a/.github/workflows/revert.yml +++ b/.github/workflows/revert.yml @@ -6,6 +6,7 @@ on: jobs: do_revert: + name: try_revert_pr_${{ github.event.client_payload.pr_num }} runs-on: ubuntu-20.04 steps: - name: Setup Python @@ -27,6 +28,14 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} PR_NUM: ${{ github.event.client_payload.pr_num }} + COMMENT_ID: ${{ github.event.client_payload.comment_id }} GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | - python3 .github/scripts/trymerge.py --revert "${PR_NUM}" + set -ex + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --revert --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --revert "${PR_NUM}" + fi + +concurrency: try-revert diff --git a/.github/workflows/run_android_tests.yml b/.github/workflows/run_android_tests.yml new file mode 100644 index 000000000000..85cef5623d7e --- /dev/null +++ b/.github/workflows/run_android_tests.yml @@ -0,0 +1,67 @@ +name: android-tests + +on: + push: + tags: + # Trigger on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + - 'ciflow/trunk/*' + - 'ciflow/android/*' + branches: + - master + - main + - release/* + workflow_dispatch: + +concurrency: + group: run-android-tests-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +defaults: + run: + shell: bash -e -l {0} + +jobs: + + build-and-test: + runs-on: ubuntu-latest + env: + JOB_BASE_NAME: ubuntu-latest-android-tests + steps: + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + + - name: Install dependencies + run: | + conda install -y \ + cffi \ + cmake \ + mkl \ + mkl-include \ + ninja \ + numpy \ + pyyaml \ + requests \ + setuptools \ + typing_extensions + + # [see note: pytorch repo ref] + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - name: Build PyTorch Android + run: | + export ANDROID_NDK="${ANDROID_SDK_ROOT}/ndk-bundle" + echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}" + ./scripts/build_pytorch_android.sh x86 + + - name: Run tests + uses: reactivecircus/android-emulator-runner@v2 + with: + api-level: 25 + script: ./android/run_tests.sh diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml index 6533e43facf8..d84a32ca318e 100644 --- a/.github/workflows/run_torchbench.yml +++ b/.github/workflows/run_torchbench.yml @@ -36,10 +36,15 @@ jobs: # shellcheck disable=SC1091 . 
"${HOME}"/anaconda3/etc/profile.d/conda.sh conda activate pr-ci - conda install -y numpy requests ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions \ - future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm + # pin cmake version to 3.22 since 3.23 breaks pytorch build + # see details at: https://github.com/pytorch/pytorch/issues/74985 + conda install -y numpy requests ninja pyyaml mkl mkl-include setuptools cmake=3.22 cffi typing_extensions \ + future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm psutil # install magma conda install -y -c pytorch "${MAGMA_VERSION}" + # install ffmpeg-4.4.1 + # torchvision doesn't compile on ffmpeg-5: https://github.com/pytorch/vision/issues/5616 + conda install -y ffmpeg=4.4.1 - name: Setup TorchBench branch run: | # shellcheck disable=SC1091 @@ -53,7 +58,7 @@ jobs: with: repository: pytorch/benchmark path: benchmark - lfs: true + lfs: false ref: ${{ env.TORCHBENCH_BRANCH }} - name: Run TorchBench run: | @@ -84,5 +89,5 @@ jobs: path: ~/.torchbench/bisection/pr${{ github.event.number }} concurrency: - group: run-torchbench-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 000000000000..fb29e397b970 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,152 @@ +# A workflow that implements similar logic to actions/stale. +# +# Compared to actions/stale, it is implemented to make API requests proportional +# to the number of stale PRs, not the total number of issues in the repo. This +# is because PyTorch has a lot of issues/PRs, so the actions/stale runs into +# rate limits way too quickly. +# +# The behavior is: +# - If a PR is not labeled stale, after 60 days inactivity label the PR as stale and comment about it. +# - If a PR is labeled stale, after 30 days inactivity close the PR. +# - `high priority` and `no-stale` PRs are exempt. + +name: Close stale pull requests + +on: + schedule: + # Run hourly. + - cron: 30 * * * * + +jobs: + stale: + if: ${{ github.repository == 'pytorch/pytorch' }} + runs-on: ubuntu-latest + + steps: + - uses: actions/github-script@v6 + with: + script: | + // Do some dumb retries on requests. + const retries = 7; + const baseBackoff = 100; + const sleep = timeout => new Promise(resolve => setTimeout(resolve, timeout)); + github.hook.wrap('request', async (request, options) => { + for (let attempt = 1; attempt <= retries; attempt++) { + try { + return await request(options); + } catch (err) { + if (attempt < retries) { + core.warning(`Request getting retried. Attempt: ${attempt}`); + await sleep(baseBackoff * Math.pow(2, attempt)); + continue; + } + throw err; + } + } + }); + + const MAX_API_REQUESTS = 100; + + // If a PRs not labeled stale, label them stale after no update for 60 days. + const STALE_LABEL_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 60; + // For PRs already labeled stale, close after not update for 30 days. + const STALE_CLOSE_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 30; + + const STALE_MESSAGE = + "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`.
" + + "Feel free to remove the `Stale` label if you feel this was a mistake.
" + + "If you are unable to remove the `Stale` label please contact a maintainer in order to do so.
" + + "If you want the bot to never mark this PR stale again, add the `no-stale` label.
" + + "`Stale` pull requests will automatically be closed after 30 days of inactivity.
"; + + let numAPIRequests = 0; + let numProcessed = 0; + + async function processPull(pull) { + core.info(`[${pull.number}] URL: ${pull.html_url}`); + numProcessed += 1; + const labels = pull.labels.map((label) => label.name); + + // Skip if certain labels are present. + if (labels.includes("no-stale") || labels.includes("high priority")) { + core.info(`[${pull.number}] Skipping because PR has an exempting label.`); + return false; + } + + // Check if the PR is stale, according to our configured thresholds. + let staleThresholdMillis; + if (labels.includes("Stale")) { + core.info(`[${pull.number}] PR is labeled stale, checking whether we should close it.`); + staleThresholdMillis = STALE_CLOSE_THRESHOLD_MS; + } else { + core.info(`[${pull.number}] Checking whether to label PR as stale.`); + staleThresholdMillis = STALE_LABEL_THRESHOLD_MS; + } + + const millisSinceLastUpdated = + new Date().getTime() - new Date(pull.updated_at).getTime(); + + if (millisSinceLastUpdated < staleThresholdMillis) { + core.info(`[${pull.number}] Skipping because PR was updated recently`); + return false; + } + + // At this point, we know we should do something. + // For PRs already labeled stale, close them. + if (labels.includes("Stale")) { + core.info(`[${pull.number}] Closing PR.`); + numAPIRequests += 1; + await github.rest.issues.update({ + owner: "pytorch", + repo: "pytorch", + issue_number: pull.number, + state: "closed", + }); + } else { + // For PRs not labeled stale, label them stale. + core.info(`[${pull.number}] Labeling PR as stale.`); + + numAPIRequests += 1; + await github.rest.issues.createComment({ + owner: "pytorch", + repo: "pytorch", + issue_number: pull.number, + body: STALE_MESSAGE, + }); + + numAPIRequests += 1; + await github.rest.issues.addLabels({ + owner: "pytorch", + repo: "pytorch", + issue_number: pull.number, + labels: ["Stale"], + }); + } + } + + for await (const response of github.paginate.iterator( + github.rest.pulls.list, + { + owner: "pytorch", + repo: "pytorch", + state: "open", + sort: "created", + direction: "asc", + per_page: 100, + } + )) { + numAPIRequests += 1; + const pulls = response.data; + // Awaiting in a loop is intentional here. We want to serialize execution so + // that log groups are printed correctl + for (const pull of pulls) { + if (numAPIRequests > MAX_API_REQUESTS) { + core.warning("Max API requests exceeded, exiting."); + process.exit(0); + } + await core.group(`Processing PR #${pull.number}`, async () => { + await processPull(pull); + }); + } + } + core.info(`Processed ${numProcessed} PRs total.`); diff --git a/.github/workflows/stale_pull_requests.yml b/.github/workflows/stale_pull_requests.yml deleted file mode 100644 index fabb1c6b1a66..000000000000 --- a/.github/workflows/stale_pull_requests.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: 'Close stale pull requests' -on: - schedule: - # TODO: Reduce frequency once we work through the backlog of pull requests - - cron: '0 * * * *' - workflow_dispatch: - -jobs: - stale: - if: ${{ github.repository == 'pytorch/pytorch' }} - runs-on: ubuntu-18.04 - steps: - - uses: actions/stale@v4.1.0 - with: - stale-pr-message: > - Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`.
- Feel free to remove the `Stale` label if you feel this was a mistake.
- `Stale` pull requests will automatically be closed 30 days after being marked `Stale`
- exempt-pr-labels: "no-stale,open source,high priority" - days-before-stale: 60 - days-before-close: 90 - stale-open-source: - if: ${{ github.repository == 'pytorch/pytorch' }} - runs-on: ubuntu-18.04 - steps: - - uses: actions/stale@v4.1.0 - with: - stale-pr-message: > - Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`.
- Feel free to remove the `Stale` label if you feel this was a mistake.
- If you are unable to remove the `Stale` label please contact a maintainer in order to do so.
- `Stale` pull requests will automatically be closed 30 days after being marked `Stale`
- exempt-pr-labels: "no-stale,high priority" - only-labels: "open source" - days-before-stale: 150 - days-before-close: 180 diff --git a/.github/workflows/test_tools.yml b/.github/workflows/test_tools.yml deleted file mode 100644 index ed8f5babdb8d..000000000000 --- a/.github/workflows/test_tools.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Test tools - -on: - push: - branches: - - master - pull_request: - -jobs: - test: - if: ${{ github.repository == 'pytorch/pytorch' }} - runs-on: ubuntu-18.04 - steps: - - name: Setup Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - architecture: x64 - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - fetch-depth: 0 # deep clone, to allow us to use git log - - name: Install dependencies - # mypy and boto3 versions copied from - # .circleci/docker/common/install_conda.sh - run: | - set -eux - python3 -mpip install -r requirements.txt - python3 -mpip install boto3==1.16.34 - make setup_lint - - name: Test tools - run: | - python3 -m unittest discover -vs tools/test -p 'test_*.py' - python3 -m unittest discover -vs .github/scripts -p 'test_*.py' - -concurrency: - group: test-tools-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml new file mode 100644 index 000000000000..3f210d3381fe --- /dev/null +++ b/.github/workflows/trunk.yml @@ -0,0 +1,230 @@ +name: trunk + +on: + push: + branches: + - master + - main + - release/* + tags: + - ciflow/trunk/* + - ciflow/all/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + parallelnative-linux-xenial-py3_7-gcc5_4-build: + name: parallelnative-linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: parallelnative-linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + parallelnative-linux-xenial-py3_7-gcc5_4-test: + name: parallelnative-linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-test.yml + needs: parallelnative-linux-xenial-py3_7-gcc5_4-build + with: + build-environment: parallelnative-linux-xenial-py3.7-gcc5.4 + docker-image: ${{ needs.parallelnative-linux-xenial-py3_7-gcc5_4-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, + ]} + + # Build PyTorch with BUILD_CAFFE2=ON + caffe2-linux-xenial-py3_7-gcc5_4-build: + name: caffe2-linux-xenial-py3.7-gcc5.4 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: caffe2-linux-xenial-py3.7-gcc5.4 + docker-image-name: pytorch-linux-xenial-py3.7-gcc5.4 + + linux-bionic-cuda10_2-py3_9-gcc7-build: + name: linux-bionic-cuda10.2-py3.9-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-cuda10.2-py3.9-gcc7 + docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7 + + linux-bionic-cuda10_2-py3_9-gcc7-test: + name: linux-bionic-cuda10.2-py3.9-gcc7 + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-cuda10_2-py3_9-gcc7-build + with: + build-environment: linux-bionic-cuda10.2-py3.9-gcc7 + docker-image: ${{ needs.linux-bionic-cuda10_2-py3_9-gcc7-build.outputs.docker-image }} + 
test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "slow", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "nogpu_NO_AVX", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" }, + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "linux.8xlarge.nvidia.gpu" }, + { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" }, + ]} + + libtorch-linux-xenial-cuda10_2-py3_7-gcc7-build: + name: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: libtorch-linux-xenial-cuda10.2-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 + build-generates-artifacts: false + + libtorch-linux-xenial-cuda11_3-py3_7-gcc7-build: + name: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: libtorch-linux-xenial-cuda11.3-py3.7-gcc7 + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + build-generates-artifacts: false + + # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated + linux-xenial-cuda11_3-py3_7-gcc7-no-ops-build: + name: linux-xenial-cuda11.3-py3.7-gcc7-no-ops + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-xenial-cuda11.3-py3.7-gcc7-no-ops + docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7 + + pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build: + name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build + uses: ./.github/workflows/_android-full-build-test.yml + with: + build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build + docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c + secrets: + SONATYPE_NEXUS_USERNAME: ${{ secrets.SONATYPE_NEXUS_USERNAME }} + SONATYPE_NEXUS_PASSWORD: ${{ secrets.SONATYPE_NEXUS_PASSWORD }} + ANDROID_SIGN_KEY: ${{ secrets.ANDROID_SIGN_KEY }} + ANDROID_SIGN_PASS: ${{ secrets.ANDROID_SIGN_PASS }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + + linux-bionic-py3_7-clang9-slow-build: + name: linux-bionic-py3.7-clang9-slow + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-bionic-py3.7-clang9-slow + docker-image-name: pytorch-linux-bionic-py3.7-clang9 + + linux-bionic-py3_7-clang9-slow-test: + name: linux-bionic-py3.7-clang9-slow + uses: ./.github/workflows/_linux-test.yml + needs: linux-bionic-py3_7-clang9-slow-build + with: + build-environment: linux-bionic-py3.7-clang9-slow + docker-image: ${{ needs.linux-bionic-py3_7-clang9-slow-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "slow", shard: 1, num_shards: 1, runner: "linux.2xlarge" }, + ]} + + ios-12-5-1-x86-64: + name: ios-12-5-1-x86-64 + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-x86-64 + ios-platform: SIMULATOR + ios-arch: x86_64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + 
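
The `secrets:` blocks used by the Android and iOS jobs above, and by the macOS jobs that follow, only work because the called workflow declares the same names under its `workflow_call` trigger. A minimal sketch of that declaration side (illustrative, not the actual `_ios-build-test.yml`):

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
    secrets:
      IOS_CERT_KEY_2022:
        required: true

jobs:
  build:
    runs-on: macos-12
    steps:
      - name: Use the forwarded secret
        env:
          CERT_KEY: ${{ secrets.IOS_CERT_KEY_2022 }}
        run: test -n "${CERT_KEY}" && echo "certificate key was forwarded by the caller"

A secret the caller does not list is simply not visible inside the called workflow, which keeps the credential surface of each template explicit.
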
ios-12-5-1-x86-64-coreml: + name: ios-12-5-1-x86-64-coreml + uses: ./.github/workflows/_ios-build-test.yml + with: + build-environment: ios-12-5-1-x86-64-coreml + ios-platform: SIMULATOR + ios-arch: x86_64 + secrets: + IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} + IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET}} + IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID}} + IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} + + macos-11-py3-x86-64-build: + name: macos-11-py3-x86-64 + uses: ./.github/workflows/_mac-build.yml + with: + build-environment: macos-11-py3-x86-64 + xcode-version: "13.3.1" + runner-type: macos-12 + build-generates-artifacts: true + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + + macos-11-py3-x86-64-test: + name: macos-11-py3-x86-64 + uses: ./.github/workflows/_mac-test.yml + needs: macos-11-py3-x86-64-build + with: + build-environment: macos-11-py3-x86-64 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "macos-12", xcode-version: "13.3.1" }, + { config: "default", shard: 2, num_shards: 2, runner: "macos-12", xcode-version: "13.3.1" }, + ]} + secrets: + AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} + + macos-10-15-py3-lite-interpreter-x86-64: + name: macos-10-15-py3-lite-interpreter-x86-64 + uses: ./.github/workflows/_mac-build.yml + with: + build-environment: macos-10-15-py3-lite-interpreter-x86-64 + xcode-version: "12" + runner-type: macos-10.15 + build-generates-artifacts: false + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + + macos-10-15-py3-arm64: + name: macos-10-15-py3-arm64 + uses: ./.github/workflows/_mac-build.yml + with: + build-environment: macos-10-15-py3-arm64 + xcode-version: "13.3.1" + runner-type: macos-12 + build-generates-artifacts: false + secrets: + MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} + MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} + + # please ensure that this and its corresponding job in pull.yml are in sync + win-vs2019-cuda11_3-py3-build: + name: win-vs2019-cuda11.3-py3 + uses: ./.github/workflows/_win-build.yml + with: + build-environment: win-vs2019-cuda11.3-py3 + cuda-version: "11.3" + + win-vs2019-cuda11_3-py3-test: + name: win-vs2019-cuda11.3-py3 + uses: ./.github/workflows/_win-test.yml + needs: win-vs2019-cuda11_3-py3-build + with: + build-environment: win-vs2019-cuda11.3-py3 + cuda-version: "11.3" + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 3, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" }, + { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" }, + ]} diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index ae29ab82462a..d5092046ebad 100644 --- a/.github/workflows/trymerge.yml +++ 
b/.github/workflows/trymerge.yml @@ -6,7 +6,8 @@ on: jobs: do_merge: - runs-on: ubuntu-20.04 + name: try_merge_pr_${{ github.event.client_payload.pr_num }} + runs-on: linux.20_04.4x steps: - name: Setup Python uses: actions/setup-python@v2 @@ -28,5 +29,25 @@ jobs: GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} PR_NUM: ${{ github.event.client_payload.pr_num }} GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + FORCE: ${{ github.event.client_payload.force}} + ON_GREEN: ${{ github.event.client_payload.on_green}} + COMMENT_ID: ${{ github.event.client_payload.comment_id }} run: | - python3 .github/scripts/trymerge.py "${PR_NUM}" + set -ex + if [ -n "${FORCE}" ]; then + if [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py --force "${PR_NUM}" + fi + elif [ -n "${ON_GREEN}" ]; then + python3 .github/scripts/trymerge.py --on-green "${PR_NUM}" + elif [ -n "${COMMENT_ID}" ]; then + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + else + python3 .github/scripts/trymerge.py "${PR_NUM}" + fi + +# TODO: Separate merge on green merges from regular merges to not hold up try-merge workflows overall concurrency +# NOTE: force pushes are also put in their concurrency group to put them higher than regular merges +concurrency: try-merge-${{ github.event.client_payload.force}}-${{ github.event.client_payload.on_green }} diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml new file mode 100644 index 000000000000..d45018c1ad6d --- /dev/null +++ b/.github/workflows/tryrebase.yml @@ -0,0 +1,34 @@ +name: Rebase PR + +on: + repository_dispatch: + types: [try-rebase] + +jobs: + do_rebase: + runs-on: ubuntu-20.04 + steps: + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + architecture: x64 + + - name: Checkout repo + uses: actions/checkout@v2 + with: + fetch-depth: 0 + token: ${{ secrets.MERGEBOT_TOKEN }} + + - name: Setup committer id + run: | + git config --global user.email "pytorchmergebot@users.noreply.github.com" + git config --global user.name "PyTorch MergeBot" + + - name: Rebase + env: + GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} + PR_NUM: ${{ github.event.client_payload.pr_num }} + GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + python3 .github/scripts/tryrebase.py "${PR_NUM}" diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index 82061efa3c3c..f19347070ece 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,8 +17,8 @@ jobs: uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - name: Update PyTorch labels list in S3 env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} run: | python3 -m pip install boto3==1.19.12 .github/scripts/export_pytorch_labels.py diff --git a/.github/workflows/update_s3_htmls.yml b/.github/workflows/update_s3_htmls.yml index 6a53d4d24595..5f3ff056c5a4 100644 --- a/.github/workflows/update_s3_htmls.yml +++ b/.github/workflows/update_s3_htmls.yml @@ -12,7 +12,7 @@ jobs: if: ${{ github.repository == 'pytorch/pytorch' 
}} strategy: matrix: - prefix: ["whl", "whl/test", "whl/nightly"] + prefix: ["whl", "whl/test", "whl/nightly", "whl/lts/1.8"] steps: - name: Run updater image env: @@ -20,4 +20,4 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_UPDATE_SECRET_ACCESS_KEY }} uses: docker://pytorch/manage_s3_html with: - args: ${{ matrix.prefix }} + args: --generate-pep503 ${{ matrix.prefix }} diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml new file mode 100644 index 000000000000..bfed85e5131e --- /dev/null +++ b/.github/workflows/upload-test-stats.yml @@ -0,0 +1,35 @@ +name: Upload test stats + +on: + workflow_run: + workflows: [pull, trunk, periodic] + types: + - completed + +jobs: + upload-test-stats: + if: github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' + runs-on: [self-hosted, linux.2xlarge] + + steps: + - name: Print workflow information + env: + TRIGGERING_WORKFLOW: ${{ toJSON(github.event.workflow_run) }} + run: echo "${TRIGGERING_WORKFLOW}" + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@master + + - run: | + pip3 install requests==2.26 + pip3 install rockset==0.8.3 + pip3 install boto3==1.19.12 + pip3 install six==1.16.0 + + - name: Upload test stats + env: + ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }} + WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }} + run: python3 tools/stats/upload_test_stats.py --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" diff --git a/.gitignore b/.gitignore index 71e9d56255e1..b62b84d9d0e8 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ aten/src/ATen/cuda/CUDAConfig.h benchmarks/.data caffe2/cpp_test/ dist/ +docs/build/ docs/cpp/src docs/src/**/* docs/cpp/build @@ -66,8 +67,11 @@ torch/_C/__init__.pyi torch/_C/_nn.pyi torch/_C/_VariableFunctions.pyi torch/_VF.pyi +torch/return_types.pyi torch/nn/functional.pyi +torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +torch/csrc/lazy/generated/*.[!m]* # Listed manually because some files in this directory are not generated torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt @@ -137,6 +141,7 @@ scripts/release_notes/*.json compile_commands.json *.egg-info/ docs/source/scripts/activation_images/ +docs/source/scripts/quantization_backend_configs/ ## General @@ -255,6 +260,9 @@ cmake-build-debug # # Below files are not deleted by "setup.py clean". 
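
The `workflow_run` trigger used by upload-test-stats.yml above fires after the named workflows finish, and its payload carries the metadata of the run that triggered it. A small illustrative consumer (hypothetical job, not part of this patch) showing the fields that upload_test_stats.py is handed:

on:
  workflow_run:
    workflows: [pull, trunk, periodic]
    types: [completed]

jobs:
  inspect:
    runs-on: ubuntu-latest
    steps:
      - name: Show the triggering run
        env:
          RUN_ID: ${{ github.event.workflow_run.id }}
          RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
          CONCLUSION: ${{ github.event.workflow_run.conclusion }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
        run: echo "run=${RUN_ID} attempt=${RUN_ATTEMPT} conclusion=${CONCLUSION} sha=${HEAD_SHA}"
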
+# Downloaded bazel +tools/bazel + # Visual Studio Code files .vs /.vscode/* @@ -304,10 +312,20 @@ bazel-* *.zip # core dump files -core.* +**/core.[1-9]* # Generated if you use the pre-commit script for clang-tidy pr.diff # coverage files */**/.coverage.* + +# buck generated files +.buckd/ +.lsp-buck-out/ +.lsp.buckd/ +buck-out/ + +# Downloaded libraries +third_party/ruy/ +third_party/glog/ diff --git a/.gitmodules b/.gitmodules index 9c9373ef7229..8d1ea6f02fa7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,7 +9,7 @@ [submodule "third_party/eigen"] ignore = dirty path = third_party/eigen - url = https://github.com/eigenteam/eigen-git-mirror.git + url = https://gitlab.com/libeigen/eigen.git [submodule "third_party/googletest"] ignore = dirty path = third_party/googletest @@ -139,9 +139,6 @@ [submodule "third_party/pocketfft"] path = third_party/pocketfft url = https://github.com/mreineck/pocketfft -[submodule "third_party/breakpad"] - path = third_party/breakpad - url = https://github.com/driazati/breakpad.git [submodule "third_party/flatbuffers"] path = third_party/flatbuffers url = https://github.com/google/flatbuffers.git diff --git a/.jenkins/caffe2/common.sh b/.jenkins/caffe2/common.sh index 168e823ba2cc..087055536564 100644 --- a/.jenkins/caffe2/common.sh +++ b/.jenkins/caffe2/common.sh @@ -26,7 +26,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then fi fi -# /usr/local/caffe2 is where the cpp bits are installed to in in cmake-only +# /usr/local/caffe2 is where the cpp bits are installed to in cmake-only # builds. In +python builds the cpp tests are copied to /usr/local/caffe2 so # that the test code in .jenkins/test.sh is the same INSTALL_PREFIX="/usr/local/caffe2" diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index fd626d09c3e2..e9d1feba7a50 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -45,8 +45,8 @@ fi ################################################################################ # C++ tests # ################################################################################ -# Don't run cpp tests a second time in the sharded ort_test2 job -if [[ "$BUILD_ENVIRONMENT" != *ort_test2* ]]; then +# Only run cpp tests in the first shard, don't run cpp tests a second time in the second shard +if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then echo "Running C++ tests.." for test in $(find "$cpp_test_dir" -executable -type f); do case "$test" in @@ -134,19 +134,15 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then rocm_ignore_test+=("--ignore $caffe2_pypath/python/ideep/pool_op_test.py") fi -# NB: Warnings are disabled because they make it harder to see what -# the actual erroring test is echo "Running Python tests.." 
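
Because the `.gitmodules` change above repoints the eigen submodule at a new URL, existing checkouts have to resync their submodule remotes before updating. A short illustrative step (not part of this patch):

steps:
  - name: Resync submodules after a .gitmodules URL change
    run: |
      # 'sync' copies the URLs from .gitmodules into .git/config; 'update' then fetches from the new remote
      git submodule sync --recursive
      git submodule update --init --recursive
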
-if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then - # locale setting is required by click package with py3 - for loc in "en_US.utf8" "C.UTF-8"; do - if locale -a | grep "$loc" >/dev/null 2>&1; then - export LC_ALL="$loc" - export LANG="$loc" - break; - fi - done -fi +# locale setting is required by click package +for loc in "en_US.utf8" "C.UTF-8"; do + if locale -a | grep "$loc" >/dev/null 2>&1; then + export LC_ALL="$loc" + export LANG="$loc" + break; + fi +done # Some Caffe2 tests fail when run using AVX512 ISA, see https://github.com/pytorch/pytorch/issues/66111 export DNNL_MAX_CPU_ISA=AVX2 @@ -154,6 +150,8 @@ export DNNL_MAX_CPU_ISA=AVX2 # Should still run even in the absence of SHARD_NUMBER if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then pip install --user pytest-sugar + # NB: Warnings are disabled because they make it harder to see what + # the actual erroring test is "$PYTHON" \ -m pytest \ -x \ @@ -170,18 +168,18 @@ if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then "${EXTRA_TESTS[@]}" fi -##################### -# torchvision tests # -##################### +############## +# ONNX tests # +############## if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # Check out torch/vision at 0.9.0-rc1 commit # This hash must match one in .jenkins/pytorch/test.sh pip install -q --user git+https://github.com/pytorch/vision.git@8a2dc6f22ac4389ccba8859aa1e1cb14f1ee53db - pip install -q --user ninja + pip install -q --user ninja flatbuffers==2.0 numpy==1.21.5 onnxruntime==1.11.0 + # numba requires numpy <= 1.20, onnxruntime requires numpy >= 1.21. + # We don't actually need it for our tests, but it's imported if it's present, so uninstall. + pip uninstall -q --yes numba # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" - if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then - pip install -q --user flatbuffers==2.0 onnxruntime==1.9.0 - fi "$ROOT_DIR/scripts/onnx/test.sh" fi diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh index 60d5e5e80807..b15ab65afa87 100755 --- a/.jenkins/pytorch/build-asan.sh +++ b/.jenkins/pytorch/build-asan.sh @@ -15,7 +15,7 @@ clang --version # detect_leaks=0: Python is very leaky, so we need suppress it # symbolize=1: Gives us much better errors when things go wrong -export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_odr_violation=0 +export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=1:symbolize=1:detect_odr_violation=0 if [ -n "$(which conda)" ]; then export CMAKE_PREFIX_PATH=/opt/conda fi @@ -35,7 +35,7 @@ fi # # TODO: Make the ASAN flags a centralized env var and unify with USE_ASAN option CC="clang" CXX="clang++" LDSHARED="clang --shared" \ - CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -shared-libasan -pthread" \ + CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fsanitize-address-use-after-scope -shared-libasan -pthread" \ CXX_FLAGS="-pthread" \ USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \ python setup.py bdist_wheel diff --git a/.jenkins/pytorch/build-mobile.sh b/.jenkins/pytorch/build-mobile.sh index f79306f87032..48cfb4fba83a 100755 --- a/.jenkins/pytorch/build-mobile.sh +++ b/.jenkins/pytorch/build-mobile.sh @@ -26,6 +26,8 @@ retry pip install --pre torch torchvision \ # binary, and running forward pass with a real model. 
if [[ "$BUILD_ENVIRONMENT" == *-mobile-custom-build-static* ]]; then TEST_CUSTOM_BUILD_STATIC=1 test/mobile/custom_build/build.sh +elif [[ "$BUILD_ENVIRONMENT" == *-mobile-lightweight-dispatch* ]]; then + test/mobile/lightweight_dispatch/build.sh else TEST_DEFAULT_BUILD=1 test/mobile/custom_build/build.sh fi diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 1fc4fecf2f82..8c74fc107603 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -20,7 +20,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-*build* ]]; then exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile.sh" "$@" fi -if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda11.3* || "$BUILD_ENVIRONMENT" == *linux-bionic-cuda11.5* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *linux-xenial-cuda11.3* || "$BUILD_ENVIRONMENT" == *linux-bionic-cuda11.5* || "$BUILD_ENVIRONMENT" == *linux-bionic-cuda11.6* ]]; then # Enabling DEPLOY build (embedded torch python interpreter, experimental) # only on one config for now, can expand later export USE_DEPLOY=ON @@ -209,10 +209,13 @@ else if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then - # ppc64le build fails when WERROR=1 + # ppc64le, rocm builds fail when WERROR=1 + # XLA test build fails when WERROR=1 # set only when building other architectures - # only use for "python setup.py install" line - if [[ "$BUILD_ENVIRONMENT" != *ppc64le* && "$BUILD_ENVIRONMENT" != *rocm* ]]; then + # or building non-XLA tests. + if [[ "$BUILD_ENVIRONMENT" != *ppc64le* && + "$BUILD_ENVIRONMENT" != *rocm* && + "$BUILD_ENVIRONMENT" != *xla* ]]; then WERROR=1 python setup.py bdist_wheel else python setup.py bdist_wheel @@ -249,13 +252,11 @@ else fi sudo rm -rf original popd - - # exit before building custom test artifacts until we resolve cmake error: - # static library kineto_LIBRARY-NOTFOUND not found. - exit 0 fi CUSTOM_TEST_ARTIFACT_BUILD_DIR=${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../} + CUSTOM_TEST_USE_ROCM=$([[ "$BUILD_ENVIRONMENT" == *rocm* ]] && echo "ON" || echo "OFF") + CUSTOM_TEST_MODULE_PATH="${PWD}/cmake/public" mkdir -pv "${CUSTOM_TEST_ARTIFACT_BUILD_DIR}" # Build custom operator tests. 
@@ -265,7 +266,8 @@ else SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" mkdir -p "$CUSTOM_OP_BUILD" pushd "$CUSTOM_OP_BUILD" - cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \ + -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" make VERBOSE=1 popd assert_git_not_dirty @@ -277,7 +279,8 @@ else SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" mkdir -p "$JIT_HOOK_BUILD" pushd "$JIT_HOOK_BUILD" - cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \ + -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" make VERBOSE=1 popd assert_git_not_dirty @@ -288,7 +291,8 @@ else python --version mkdir -p "$CUSTOM_BACKEND_BUILD" pushd "$CUSTOM_BACKEND_BUILD" - cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" + cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \ + -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" make VERBOSE=1 popd assert_git_not_dirty diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 290baa7a3b3b..9f895bbdbcc4 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -26,7 +26,7 @@ set -x rm -rf "$OUT" # aten codegen -python -m tools.codegen.gen \ +python -m torchgen.gen \ -d "$OUT"/torch/share/ATen # torch codegen @@ -38,6 +38,7 @@ mkdir -p "$OUT"/pyi/torch/_C mkdir -p "$OUT"/pyi/torch/nn python -m tools.pyi.gen_pyi \ --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ --deprecated-functions-path tools/autograd/deprecated.yaml \ --out "$OUT"/pyi @@ -45,6 +46,7 @@ python -m tools.pyi.gen_pyi \ python -m tools.autograd.gen_autograd \ "$OUT"/torch/share/ATen/Declarations.yaml \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ "$OUT"/autograd \ tools/autograd @@ -52,5 +54,6 @@ python -m tools.autograd.gen_autograd \ mkdir -p "$OUT"/annotated_fn_args python -m tools.autograd.gen_annotated_fn_args \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ "$OUT"/annotated_fn_args \ tools/autograd diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index be5245bf19bc..a593db026005 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -8,20 +8,25 @@ set -ex # Save the SCRIPT_DIR absolute path in case later we chdir (as occurs in the gpu perf test) SCRIPT_DIR="$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )" +if [[ "${BUILD_ENVIRONMENT}" == *linux* ]]; then + # TODO: Remove this once nvidia package repos are back online + # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968 + # shellcheck disable=SC2046 + sudo sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") +fi + # Required environment variables: # $BUILD_ENVIRONMENT (should be set by your Docker image) # Figure out which Python to use for ROCm -if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ 
"${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then +if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors unset HIP_PLATFORM - PYTHON=$(which "python${BASH_REMATCH[1]}") - # non-interactive bashs do not expand aliases by default - shopt -s expand_aliases export PYTORCH_TEST_WITH_ROCM=1 - alias python='$PYTHON' # temporary to locate some kernel issues on the CI nodes export HSAKMT_DEBUG_LEVEL=4 + # improve rccl performance for distributed tests + export HSA_FORCE_FINE_GRAIN_PCIE=1 fi # This token is used by a parser on Jenkins logs for determining @@ -145,7 +150,8 @@ fi # export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} if [[ "${TEST_CONFIG:-}" == *xla* ]] || \ [[ "$BUILD_ENVIRONMENT" == *centos* ]] || \ - [[ "$BUILD_ENVIRONMENT" == *linux-bionic* ]]; then + [[ "$BUILD_ENVIRONMENT" == *linux-bionic* ]] || \ + [[ "$BUILD_ENVIRONMENT" == *linux-focal* ]]; then if ! which conda; then echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty" exit 1 diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 54bd44d3ccc6..4169f6a2cb8c 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -60,19 +60,18 @@ function get_pr_change_files() { set -e } -function file_diff_from_base() { - # The fetch may fail on Docker hosts, this fetch is necessary for GHA - set +e - git fetch origin master --quiet - set -e - git diff --name-only "$(git merge-base origin/master HEAD)" > "$1" -} - function get_bazel() { - # download bazel version - wget https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64 -O tools/bazel - # verify content - echo '1a4f3a3ce292307bceeb44f459883859c793436d564b95319aacb8af1f20557c tools/bazel' | sha256sum --quiet -c + if [[ $(uname) == "Darwin" ]]; then + # download bazel version + curl https://github.com/bazelbuild/bazel/releases/download/4.2.1/bazel-4.2.1-darwin-x86_64 -Lo tools/bazel + # verify content + echo '74d93848f0c9d592e341e48341c53c87e3cb304a54a2a1ee9cff3df422f0b23c tools/bazel' | shasum -a 256 -c >/dev/null + else + # download bazel version + curl https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64 -o tools/bazel + # verify content + echo '1a4f3a3ce292307bceeb44f459883859c793436d564b95319aacb8af1f20557c tools/bazel' | shasum -a 256 -c >/dev/null + fi chmod +x tools/bazel } diff --git a/.jenkins/pytorch/macos-build.sh b/.jenkins/pytorch/macos-build.sh index 06e24936c196..ee35efc010c2 100755 --- a/.jenkins/pytorch/macos-build.sh +++ b/.jenkins/pytorch/macos-build.sh @@ -37,7 +37,7 @@ cross_compile_arm64() { } compile_x86_64() { - USE_DISTRIBUTED=1 python setup.py bdist_wheel + USE_DISTRIBUTED=1 USE_NNPACK=OFF python setup.py bdist_wheel } build_lite_interpreter() { diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 78999637f7f9..858a0c1eab53 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -4,13 +4,13 @@ # shellcheck source=./macos-common.sh source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh" -export PYTORCH_TEST_SKIP_NOARCH=1 - conda install -y six -pip install -q hypothesis "expecttest==0.1.3" "librosa>=0.6.2,<0.9.0" "numba<=0.49.1" psutil "scipy==1.6.3" +pip install -q hypothesis "expecttest==0.1.3" "librosa>=0.6.2" "numba<=0.49.1" psutil "scipy==1.6.3" # TODO move this to docker -pip install unittest-xml-reporting pytest +# Pin unittest-xml-reporting to freeze printing test summary logic, related: 
https://github.com/pytorch/pytorch/issues/69014 +pip install "unittest-xml-reporting<=3.2.0,>=2.0.0" \ + pytest if [ -z "${IN_CI}" ]; then rm -rf "${WORKSPACE_DIR}"/miniconda3/lib/python3.6/site-packages/torch* diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index 2d119d09a70c..481619a8dc31 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -13,7 +13,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch (distributed only)" if [ -n "${IN_CI}" ]; then # TODO move this to docker - pip_install unittest-xml-reporting + # Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014 + pip_install "unittest-xml-reporting<=3.2.0,>=2.0.0" fi # Disabling tests to see if they solve timeout issues; see https://github.com/pytorch/pytorch/issues/70015 diff --git a/.jenkins/pytorch/short-perf-test-cpu.sh b/.jenkins/pytorch/short-perf-test-cpu.sh index f2e02b52974c..ff9ef7a84eee 100755 --- a/.jenkins/pytorch/short-perf-test-cpu.sh +++ b/.jenkins/pytorch/short-perf-test-cpu.sh @@ -17,14 +17,15 @@ pip install -q awscli # Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read # More info at https://github.com/aws/aws-cli/issues/2321 aws configure set default.s3.multipart_threshold 5GB +UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')" -if [[ "$COMMIT_SOURCE" == master ]]; then - # Get current master commit hash - MASTER_COMMIT_ID=$(git log --format="%H" -n 1) - export MASTER_COMMIT_ID +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # Get current default branch commit hash + DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1) + export DEFAULT_BRANCH_COMMIT_ID fi -# Find the master commit to test against +# Find the default branch commit to test against git remote add upstream https://github.com/pytorch/pytorch.git git fetch upstream IFS=$'\n' @@ -33,13 +34,13 @@ while IFS='' read -r commit_id; do LATEST_TESTED_COMMIT=${commit_id} break fi -done < <(git rev-list upstream/master) +done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH") aws s3 cp s3://ossci-perf-test/pytorch/cpu_runtime/"${LATEST_TESTED_COMMIT}".json cpu_runtime.json -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then # Prepare new baseline file cp cpu_runtime.json new_cpu_runtime.json - python update_commit_hash.py new_cpu_runtime.json "${MASTER_COMMIT_ID}" + python update_commit_hash.py new_cpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}" fi # Include tests @@ -54,7 +55,7 @@ fi # Run tests export TEST_MODE="compare_with_baseline" -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then export TEST_MODE="compare_and_update" fi @@ -66,8 +67,8 @@ run_test test_cpu_speed_torch_tensor ${TEST_MODE} run_test test_cpu_speed_mini_sequence_labeler 20 ${TEST_MODE} run_test test_cpu_speed_mnist 20 ${TEST_MODE} -if [[ "$COMMIT_SOURCE" == master ]]; then - # This could cause race condition if we are testing the same master commit twice, +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # This could cause race condition if we are testing the same default branch commit twice, # but the chance of them executing this line at the same time is low. 
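
The `UPSTREAM_DEFAULT_BRANCH` detection introduced above works because `git remote show <url>` prints a line of the form `HEAD branch: master`, and awk keeps only its last field. A standalone illustration (hypothetical step):

steps:
  - name: Resolve the upstream default branch
    run: |
      # queries the remote, finds the "HEAD branch: ..." line and prints just the branch name
      UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"
      echo "default branch is ${UPSTREAM_DEFAULT_BRANCH}"
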
- aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/"${MASTER_COMMIT_ID}".json --acl public-read + aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read fi diff --git a/.jenkins/pytorch/short-perf-test-gpu.sh b/.jenkins/pytorch/short-perf-test-gpu.sh index 4d8efee8dc20..bde8ca5c9dd3 100755 --- a/.jenkins/pytorch/short-perf-test-gpu.sh +++ b/.jenkins/pytorch/short-perf-test-gpu.sh @@ -17,14 +17,15 @@ pip install -q awscli --ignore-installed PyYAML # Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read # More info at https://github.com/aws/aws-cli/issues/2321 aws configure set default.s3.multipart_threshold 5GB +UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')" -if [[ "$COMMIT_SOURCE" == master ]]; then - # Get current master commit hash - MASTER_COMMIT_ID=$(git log --format="%H" -n 1) - export MASTER_COMMIT_ID +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # Get current default branch commit hash + DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1) + export DEFAULT_BRANCH_COMMIT_ID fi -# Find the master commit to test against +# Find the default branch commit to test against git remote add upstream https://github.com/pytorch/pytorch.git git fetch upstream IFS=$'\n' @@ -33,13 +34,13 @@ while IFS='' read -r commit_id; do LATEST_TESTED_COMMIT=${commit_id} break fi -done < <(git rev-list upstream/master) +done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH") aws s3 cp s3://ossci-perf-test/pytorch/gpu_runtime/"${LATEST_TESTED_COMMIT}".json gpu_runtime.json -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then # Prepare new baseline file cp gpu_runtime.json new_gpu_runtime.json - python update_commit_hash.py new_gpu_runtime.json "${MASTER_COMMIT_ID}" + python update_commit_hash.py new_gpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}" fi # Include tests @@ -55,7 +56,7 @@ fi . ./test_gpu_speed_mlstm.sh # Run tests -if [[ "$COMMIT_SOURCE" == master ]]; then +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then run_test test_gpu_speed_mnist 20 compare_and_update run_test test_gpu_speed_word_language_model 20 compare_and_update run_test test_gpu_speed_cudnn_lstm 20 compare_and_update @@ -69,10 +70,10 @@ else run_test test_gpu_speed_mlstm 20 compare_with_baseline fi -if [[ "$COMMIT_SOURCE" == master ]]; then - # This could cause race condition if we are testing the same master commit twice, +if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then + # This could cause race condition if we are testing the same default branch commit twice, # but the chance of them executing this line at the same time is low. 
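
The perf scripts above walk `git rev-list upstream/<default branch>` newest-first until they find a commit whose baseline JSON is already stored in S3. A hedged sketch of that lookup; the `aws s3 ls` existence test is an assumption for illustration, not necessarily the check the real script performs:

steps:
  - name: Locate the newest upstream commit with a stored baseline
    run: |
      # upstream/master stands in for the detected default branch
      for commit_id in $(git rev-list upstream/master); do
        # a zero exit status from 'aws s3 ls' means the baseline object exists
        if aws s3 ls "s3://ossci-perf-test/pytorch/gpu_runtime/${commit_id}.json" >/dev/null 2>&1; then
          LATEST_TESTED_COMMIT=${commit_id}
          break
        fi
      done
      echo "comparing against baseline from ${LATEST_TESTED_COMMIT}"
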
- aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/"${MASTER_COMMIT_ID}".json --acl public-read + aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read fi popd diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 6544b0f2693d..75234f2ff446 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -40,6 +40,11 @@ PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}} if [[ $TEST_CONFIG == 'default' ]]; then export CUDA_VISIBLE_DEVICES=0 + export HIP_VISIBLE_DEVICES=0 +fi + +if [[ $TEST_CONFIG == 'distributed' ]] && [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + export HIP_VISIBLE_DEVICES=0,1 fi if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then @@ -48,11 +53,11 @@ if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then - export PYTORCH_TEST_WITH_SLOW_GRADCHECK=ON + export PYTORCH_TEST_WITH_SLOW_GRADCHECK=1 fi -if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then - # Used so that only cuda specific versions of tests are generated +if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then + # Used so that only cuda/rocm specific versions of tests are generated # mainly used so that we're not spending extra cycles testing cpu # devices on expensive gpu machines export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" @@ -62,10 +67,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then export BUILD_SPLIT_CUDA=ON fi -if [[ "$BUILD_ENVIRONMENT" == *noarch* ]]; then - export PYTORCH_TEST_SKIP_NOARCH=0 -else - export PYTORCH_TEST_SKIP_NOARCH=1 +if [[ "$BUILD_ENVIRONMENT" == *crossref* ]]; then + export PYTORCH_TEST_WITH_CROSSREF=1 fi if [[ -n "$PR_NUMBER" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then @@ -77,6 +80,7 @@ fi if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # Print GPU info + rocminfo rocminfo | grep -E 'Name:.*\sgfx|Marketing' # Manually set NUM_TEST_SHARDS since Jenkins doesn't do it @@ -100,7 +104,7 @@ fi # ASAN test is not working if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then # Suppress vptr violations arising from multiple copies of pybind11 - export ASAN_OPTIONS=detect_leaks=0:symbolize=1:strict_init_order=true:detect_odr_violation=0 + export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=1:strict_init_order=true:detect_odr_violation=0 export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp export PYTORCH_TEST_WITH_ASAN=1 export PYTORCH_TEST_WITH_UBSAN=1 @@ -274,6 +278,14 @@ test_libtorch() { else "$TORCH_BIN_DIR"/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:$TEST_REPORTS_DIR/test_jit.xml fi + + # Run Lazy Tensor cpp tests + if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$BUILD_ENVIRONMENT" != *nogpu* ]]; then + LTC_TS_CUDA=1 "$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml + else + "$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml + fi + python test/cpp/jit/tests_setup.py shutdown # Wait for background download to finish wait @@ -306,16 +318,16 @@ test_vulkan() { # test reporting process (in print_test_stats.py) to function as expected. 
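
Several of the test helpers changed above drive googletest binaries directly; the two flags that matter for CI are `--gtest_filter`, where a leading `-` excludes matching tests, and `--gtest_output=xml:`, which writes a JUnit-style report for the stats tooling. An illustrative invocation (the binary path is a placeholder for $TORCH_BIN_DIR):

steps:
  - name: Run a C++ test binary without CUDA cases
    run: |
      mkdir -p test/test-reports/cpp-example
      # everything matching *CUDA is skipped; results land in an XML file the reporting tooling can parse
      ./build/bin/test_jit --gtest_filter='-*CUDA' --gtest_output=xml:test/test-reports/cpp-example/test_jit.xml
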
TEST_REPORTS_DIR=test/test-reports/cpp-vulkan/test_vulkan mkdir -p $TEST_REPORTS_DIR - "$TORCH_TEST_DIR"/vulkan_test --gtest_output=xml:$TEST_REPORTS_DIR/vulkan_test.xml + "$TORCH_TEST_DIR"/vulkan_api_test --gtest_output=xml:$TEST_REPORTS_DIR/vulkan_test.xml fi } test_distributed() { echo "Testing distributed python tests" - time python test/run_test.py --distributed-tests --verbose + time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose assert_git_not_dirty - if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$SHARD_NUMBER" == 1 ]]; then echo "Testing distributed C++ tests" ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR" @@ -355,7 +367,7 @@ test_rpc() { } test_custom_backend() { - if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + if [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then echo "Testing custom backends" CUSTOM_BACKEND_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-backend-build" pushd test/custom_backend @@ -372,7 +384,7 @@ test_custom_backend() { } test_custom_script_ops() { - if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + if [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then echo "Testing custom script operators" CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build" pushd test/custom_operator @@ -388,7 +400,7 @@ test_custom_script_ops() { } test_jit_hooks() { - if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then + if [[ "$BUILD_ENVIRONMENT" != *asan* ]] ; then echo "Testing jit hooks in cpp" HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build" pushd test/jit_hooks @@ -441,6 +453,8 @@ test_xla() { # nightly version. test_forward_backward_compatibility() { set -x + # create a dummy ts model at this version + python test/create_dummy_torchscript_model.py /tmp/model_new.pt pushd test/forward_backward_compatibility python -m venv venv # shellcheck disable=SC1091 @@ -448,10 +462,21 @@ test_forward_backward_compatibility() { pip_install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html pip show torch python dump_all_function_schemas.py --filename nightly_schemas.txt + # FC: verify newmodel can be load with old code. + if ! python ../load_torchscript_model.py /tmp/model_new.pt; then + echo "FC check failed: new model cannot be load in old code" + return 1 + fi + python ../create_dummy_torchscript_model.py /tmp/model_old.pt deactivate rm -r venv pip show torch python check_forward_backward_compatibility.py --existing-schemas nightly_schemas.txt + # BC: verify old model can be load with new code + if ! python ../load_torchscript_model.py /tmp/model_old.pt; then + echo "BC check failed: old model cannot be load in new code" + return 1 + fi popd set +x assert_git_not_dirty @@ -518,7 +543,7 @@ test_torch_deploy() { ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR" ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR" "$TORCH_BIN_DIR"/test_deploy - "$TORCH_BIN_DIR"/test_api --gtest_filter='IMethodTest.*' + "$TORCH_BIN_DIR"/test_deploy_gpu assert_git_not_dirty } @@ -530,8 +555,9 @@ if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-baze (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") fi - -if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then +if [[ "${BUILD_ENVIRONMENT}" == *deploy* ]]; then + test_torch_deploy +elif [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then test_forward_backward_compatibility # Do NOT add tests after bc check tests, see its comment. elif [[ "${TEST_CONFIG}" == *xla* ]]; then @@ -543,15 +569,18 @@ elif [[ "${BUILD_ENVIRONMENT}" == *jit_legacy-test || "${JOB_BASE_NAME}" == *jit elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" -elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then - if [[ "${BUILD_ENVIRONMENT}" == *linux-xenial-cuda11.1*-test1* ]]; then - test_torch_deploy +elif [[ "${BUILD_ENVIRONMENT}" == *distributed* || "${JOB_BASE_NAME}" == *distributed* ]]; then + test_distributed + # Only run RPC C++ tests on the first shard + if [[ "${SHARD_NUMBER}" == 1 ]]; then + test_rpc fi +elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then test_without_numpy install_torchvision test_python_shard 1 test_aten -elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 || ("${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1) ]]; then +elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision test_python_shard 2 test_libtorch @@ -563,12 +592,12 @@ elif [[ "${SHARD_NUMBER}" -gt 2 ]]; then # Handle arbitrary number of shards test_python_shard "$SHARD_NUMBER" elif [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then - test_vulkan + # TODO: re-enable vulkan test + echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then test_bazel -elif [[ "${BUILD_ENVIRONMENT}" == *distributed* || "${JOB_BASE_NAME}" == *distributed* ]]; then - test_distributed - test_rpc +elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then + test_libtorch elif [[ "${TEST_CONFIG}" = docs_test ]]; then test_docs_test else diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index 4954dcf4f451..c3650856d478 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -28,10 +28,7 @@ call %INSTALLER_DIR%\install_sccache.bat if errorlevel 1 exit /b if not errorlevel 0 exit /b -call :retry %INSTALLER_DIR%\install_miniconda3.bat - -:retry -call %* || (powershell -nop -c "& {sleep 1}" && call %*) || (powershell -nop -c "& {sleep 2}" && call %*) +call %INSTALLER_DIR%\install_miniconda3.bat if errorlevel 1 exit /b if not errorlevel 0 exit /b @@ -92,6 +89,7 @@ if "%TORCH_CUDA_ARCH_LIST%" == "" set TORCH_CUDA_ARCH_LIST=5.2 :: The default sccache idle timeout is 600, which is too short and leads to intermittent build errors. set SCCACHE_IDLE_TIMEOUT=0 +set SCCACHE_IGNORE_SERVER_IO_ERROR=1 sccache --stop-server sccache --start-server sccache --zero-stats @@ -143,7 +141,7 @@ python setup.py install --cmake && sccache --show-stats && ( if "%BUILD_ENVIRONMENT%"=="" ( echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. 
) else ( - 7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\" + 7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torchgen %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\" if errorlevel 1 exit /b if not errorlevel 0 exit /b @@ -157,4 +155,5 @@ python setup.py install --cmake && sccache --show-stats && ( sccache --show-stats > stats.txt python -m tools.stats.upload_sccache_stats stats.txt +sccache --stop-server rm stats.txt diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index 20b3b4db4c02..657848631245 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -22,7 +22,7 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 libuv if errorlevel 1 exit /b if not errorlevel 0 exit /b - call conda install -y -q -c conda-forge cmake + call conda install -y -q -c conda-forge cmake=3.22.3 if errorlevel 1 exit /b if not errorlevel 0 exit /b ) diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index b738b4e70b74..c7f3e1b6a614 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -34,7 +34,9 @@ popd :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 ======= -pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "expecttest==0.1.3" "librosa>=0.6.2,<0.9.0" psutil pillow unittest-xml-reporting pytest +:: Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014 + +pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "expecttest==0.1.3" "librosa>=0.6.2" "scipy==1.6.3" psutil pillow "unittest-xml-reporting<=3.2.0,>=2.0.0" pytest if errorlevel 1 exit /b if not errorlevel 0 exit /b diff --git a/.jenkins/pytorch/win-test-helpers/test_python.bat b/.jenkins/pytorch/win-test-helpers/test_python.bat deleted file mode 100644 index 2de7ac4c3bcd..000000000000 --- a/.jenkins/pytorch/win-test-helpers/test_python.bat +++ /dev/null @@ -1,20 +0,0 @@ -call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -:: exit the batch once there's an error -if not errorlevel 0 ( - echo "setup pytorch env failed" - echo %errorlevel% - exit /b -) - -pushd test -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 exit /b 1 - - python run_test.py --exclude-jit-executor --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --verbose -) -popd -if ERRORLEVEL 1 exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat deleted file mode 100644 index 181259df7e35..000000000000 --- 
a/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat +++ /dev/null @@ -1,44 +0,0 @@ -call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -:: exit the batch once there's an error -if not errorlevel 0 ( - echo "setup pytorch env failed" - echo %errorlevel% - exit /b -) - -pushd test - -set GFLAGS_EXE="C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\gflags.exe" -if exist %GFLAGS_EXE% ( - echo Some smoke tests - %GFLAGS_EXE% /i python.exe +sls - python %SCRIPT_HELPERS_DIR%\run_python_nn_smoketests.py - if ERRORLEVEL 1 goto fail - - %GFLAGS_EXE% /i python.exe -sls - if ERRORLEVEL 1 goto fail -) - -echo Copying over test times file -copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" - -echo Run nn tests - -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 goto fail - - python run_test.py --exclude-jit-executor --shard 1 2 --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --shard 1 2 --verbose -) -if ERRORLEVEL 1 goto fail - -popd - -:eof -exit /b 0 - -:fail -exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat deleted file mode 100644 index 56d115f64df7..000000000000 --- a/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat +++ /dev/null @@ -1,26 +0,0 @@ -call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -:: exit the batch once there's an error -if not errorlevel 0 ( - echo "setup pytorch env failed" - echo %errorlevel% - exit /b -) - -echo Copying over test times file -copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" - -pushd test - -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 exit /b 1 - - python run_test.py --exclude-jit-executor --shard 2 2 --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --shard 2 2 --verbose -) - -popd - -if ERRORLEVEL 1 exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat new file mode 100644 index 000000000000..ccc615f67f31 --- /dev/null +++ b/.jenkins/pytorch/win-test-helpers/test_python_shard.bat @@ -0,0 +1,37 @@ +call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat +:: exit the batch once there's an error +if not errorlevel 0 ( + echo "setup pytorch env failed" + echo %errorlevel% + exit /b +) + +pushd test + +set GFLAGS_EXE="C:\Program Files (x86)\Windows Kits\10\Debuggers\x64\gflags.exe" +if "%SHARD_NUMBER%" == "1" ( + if exist %GFLAGS_EXE% ( + echo Some smoke tests + %GFLAGS_EXE% /i python.exe +sls + python %SCRIPT_HELPERS_DIR%\run_python_nn_smoketests.py + if ERRORLEVEL 1 goto fail + + %GFLAGS_EXE% /i python.exe -sls + if ERRORLEVEL 1 goto fail + ) +) + +echo Copying over test times file +copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" + +echo Run nn tests +python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +if ERRORLEVEL 1 goto fail + +popd + +:eof +exit /b 0 + +:fail +exit /b 1 diff --git a/.jenkins/pytorch/win-test.sh 
b/.jenkins/pytorch/win-test.sh index 51c5700db0b8..7312ca7f23c6 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -26,7 +26,6 @@ export TEST_DIR_WIN export PYTORCH_FINAL_PACKAGE_DIR="${PYTORCH_FINAL_PACKAGE_DIR:-/c/users/circleci/workspace/build-results}" PYTORCH_FINAL_PACKAGE_DIR_WIN=$(cygpath -w "${PYTORCH_FINAL_PACKAGE_DIR}") export PYTORCH_FINAL_PACKAGE_DIR_WIN -export PYTORCH_TEST_SKIP_NOARCH=1 mkdir -p "$TMP_DIR"/build/torch @@ -49,8 +48,13 @@ fi if [[ "$TEST_CONFIG" = "force_on_cpu" ]]; then # run the full test suite for force_on_cpu test export USE_CUDA=0 -elif [[ "$TEST_CONFIG" == "smoke_tests" ]]; then - export RUN_SMOKE_TESTS_ONLY=1 +fi + +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + # Used so that only cuda/rocm specific versions of tests are generated + # mainly used so that we're not spending extra cycles testing cpu + # devices on expensive gpu machines + export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" fi run_tests() { @@ -62,32 +66,20 @@ run_tests() { fi done + "$SCRIPT_HELPERS_DIR"/test_python_shard.bat if [[ ( -z "${JOB_BASE_NAME}" || "${JOB_BASE_NAME}" == *-test ) && $NUM_TEST_SHARDS -eq 1 ]]; then - "$SCRIPT_HELPERS_DIR"/test_python.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat - "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat - "$SCRIPT_HELPERS_DIR"/test_libtorch.bat - fi + "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat + "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat + "$SCRIPT_HELPERS_DIR"/test_libtorch.bat else - if [[ "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then - "$SCRIPT_HELPERS_DIR"/test_python_first_shard.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_libtorch.bat - if [[ "${USE_CUDA}" == "1" ]]; then - "$SCRIPT_HELPERS_DIR"/test_python_jit_legacy.bat - fi - fi - - elif [[ "${JOB_BASE_NAME}" == *-test2 || ("${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1) ]]; then - "$SCRIPT_HELPERS_DIR"/test_python_second_shard.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat - "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat + if [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then + "$SCRIPT_HELPERS_DIR"/test_libtorch.bat + if [[ "${USE_CUDA}" == "1" ]]; then + "$SCRIPT_HELPERS_DIR"/test_python_jit_legacy.bat fi + elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then + "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat + "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat fi fi } diff --git a/.lintrunner.toml b/.lintrunner.toml index 7126745fca1a..295c516bc30b 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -20,7 +20,6 @@ exclude_patterns = [ command = [ 'python3', 'tools/linter/adapters/flake8_linter.py', - '--binary=flake8', '--', '@{{PATHSFILE}}' ] @@ -53,6 +52,9 @@ include_patterns = [ 'test/cpp/tensorexpr/**/*.h', 'test/cpp/tensorexpr/**/*.cpp', ] +exclude_patterns = [ + 'torch/csrc/jit/serialization/mobile_bytecode_generated.h', +] init_command = [ 'python3', 'tools/linter/adapters/s3_init.py', @@ -69,18 +71,21 @@ command = [ '--', '@{{PATHSFILE}}' ] +is_formatter = true [[linter]] code = 'MYPY' include_patterns = [ 'torch/**/*.py', + 'torch/**/*.pyi', 'caffe2/**/*.py', + 'caffe2/**/*.pyi', 'test/test_bundled_images.py', 'test/test_bundled_inputs.py', 'test/test_complex.py', 'test/test_datapipe.py', 'test/test_futures.py', - 'test/test_numpy_interop.py', + # 'test/test_numpy_interop.py', 'test/test_torch.py', 'test/test_type_hints.py', 
'test/test_type_info.py', @@ -90,11 +95,21 @@ exclude_patterns = [ 'torch/include/**', 'torch/csrc/**', 'torch/distributed/elastic/agent/server/api.py', + 'torch/testing/_internal/**', + 'torch/distributed/fsdp/fully_sharded_data_parallel.py', + # TODO(suo): these exclusions were added just to get lint clean on master. + # Follow up to do more target suppressions and remove them. + 'torch/distributed/fsdp/flatten_params_wrapper.py', + 'torch/ao/quantization/fx/convert.py', + 'torch/ao/quantization/_dbr/function_fusion.py', + 'test/test_datapipe.py', + 'caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py', + 'test/test_numpy_interop.py', + 'torch/torch_version.py', ] command = [ 'python3', 'tools/linter/adapters/mypy_linter.py', - '--binary=mypy', '--config=mypy.ini', '--', '@{{PATHSFILE}}' @@ -105,9 +120,17 @@ init_command = [ '--dry-run={{DRYRUN}}', 'numpy==1.20', 'expecttest==0.1.3', - 'mypy==0.812', + 'mypy==0.950', + 'types-requests==2.27.25', + 'types-six==1.16.15', + 'types-PyYAML==6.0.7', + 'types-tabulate==0.8.8', + 'types-protobuf==3.19.18', + 'types-pkg-resources==0.1.3', + 'types-Jinja2==2.11.9', 'junitparser==2.1.1', 'rich==10.9.0', + 'pyyaml==6.0', ] [[linter]] @@ -121,10 +144,13 @@ include_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/**/*.py', ] +exclude_patterns = [ + # (linbinyu) copied from internal repo + 'tools/code_analyzer/gen_operators_yaml.py', +] command = [ 'python3', 'tools/linter/adapters/mypy_linter.py', - '--binary=mypy', '--config=mypy-strict.ini', '--', '@{{PATHSFILE}}' @@ -133,11 +159,12 @@ command = [ [[linter]] code = 'CLANGTIDY' include_patterns = [ + 'torch/csrc/deploy/**/*.cpp', 'torch/csrc/fx/**/*.cpp', - 'torch/csrc/utils/**/*.cpp', 'torch/csrc/generic/**/*.cpp', - 'torch/csrc/deploy/**/*.cpp', + 'torch/csrc/onnx/**/*.cpp', 'torch/csrc/tensor/**/*.cpp', + 'torch/csrc/utils/**/*.cpp', ] exclude_patterns = [ # The negative filters below are to exclude files that include onnx_pb.h or @@ -191,7 +218,7 @@ exclude_patterns = ['test/test_jit.py'] command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=# type:\s*ignore(?!\[)', + '--pattern=# type:\s*ignore([^\[]|$)', '--linter-name=TYPEIGNORE', '--error-name=unqualified type: ignore', """--error-description=\ @@ -209,7 +236,7 @@ exclude_patterns = ['caffe2/**'] command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=# noqa(?!: [A-Z]+\d{3})', + '--pattern=# noqa([^:]|$)', '--linter-name=NOQA', '--error-name=unqualified noqa', """--error-description=\ @@ -245,6 +272,7 @@ init_command = [ '--dry-run={{DRYRUN}}', 'ruamel.yaml==0.17.4', ] +is_formatter = true [[linter]] code = 'NEWLINE' @@ -254,7 +282,10 @@ exclude_patterns=[ 'third_party/**', '**/*.expect', '**/*.ipynb', + '**/*.ptl', 'tools/clang_format_hash/**', + 'test/cpp/jit/upgrader_models/*.ptl', + 'test/cpp/jit/upgrader_models/*.ptl.ff', ] command = [ 'python3', @@ -262,6 +293,7 @@ command = [ '--', '@{{PATHSFILE}}', ] +is_formatter = true [[linter]] code = 'SPACES' @@ -270,6 +302,8 @@ exclude_patterns = [ '**/contrib/**', '**/*.diff', 'third_party/**', + 'test/cpp/jit/upgrader_models/*.ptl', + 'test/cpp/jit/upgrader_models/*.ptl.ff', ] command = [ 'python3', @@ -295,11 +329,14 @@ exclude_patterns = [ 'third_party/**', '**/.gitattributes', '**/.gitmodules', + 'test/cpp/jit/upgrader_models/*.ptl', + 'test/cpp/jit/upgrader_models/*.ptl.ff', + '.lintrunner.toml', ] command = [ 'python3', 'tools/linter/adapters/grep_linter.py', - '--pattern=\t', + '--pattern= 
', '--linter-name=TABS', '--error-name=saw some tabs', '--replace-pattern=s/\t/ /', @@ -319,6 +356,7 @@ include_patterns = [ ] exclude_patterns = [ 'aten/src/ATen/native/quantized/cpu/qnnpack/**', + 'torch/csrc/jit/serialization/mobile_bytecode_generated.h', ] command = [ 'python3', @@ -348,7 +386,7 @@ command = [ 'tools/linter/adapters/grep_linter.py', """--pattern=\ (pip|pip3|python -m pip|python3 -m pip|python3 -mpip|python -mpip) \ - install ([a-z][\\.a-z-0-9]*+(?!(=|.*\\.whl))([[:blank:]]|))+\ + install ([a-zA-Z0-9][A-Za-z0-9\\._\\-]+)([^/=<>~!]+)[A-Za-z0-9\\._\\-\\*\\+\\!]*$\ """, '--linter-name=PYPIDEP', '--error-name=unpinned PyPI install', @@ -455,3 +493,87 @@ init_command = [ '--dry-run={{DRYRUN}}', 'cmakelint==1.4.1', ] + +[[linter]] +code = 'SHELLCHECK' +include_patterns = [ + '.jenkins/pytorch/**/*.sh' +] +command = [ + 'python3', + 'tools/linter/adapters/shellcheck_linter.py', + '--', + '@{{PATHSFILE}}', +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'shellcheck-py==0.7.2.1', +] + +[[linter]] +code = 'ACTIONLINT' +include_patterns = [ + '.github/workflows/*.yml', + '.github/workflows/*.yaml', + # actionlint does not support composite actions yet + # '.github/actions/**/*.yml', + # '.github/actions/**/*.yaml', +] +command = [ + 'python3', + 'tools/linter/adapters/actionlint_linter.py', + '--binary=.lintbin/actionlint', + '--', + '@{{PATHSFILE}}', +] +init_command = [ + 'python3', + 'tools/linter/adapters/s3_init.py', + '--config-json=tools/linter/adapters/s3_init_config.json', + '--linter=actionlint', + '--dry-run={{DRYRUN}}', + '--output-dir=.lintbin', + '--output-name=actionlint', +] + +[[linter]] +code = 'TESTOWNERS' +include_patterns = [ + 'test/**/test_*.py', + 'test/**/*_test.py', +] +exclude_patterns = [ + 'test/run_test.py', +] +command = [ + 'python3', + 'tools/linter/adapters/testowners_linter.py', + '--', + '@{{PATHSFILE}}', +] + +[[linter]] +code = 'BLACK' +include_patterns = [ + 'torchgen/**/*.py', + 'tools/**/*.py', + 'torch/onnx/**/*.py', + 'torch/_refs/**/*.py', + 'torch/_prims/**/*.py', + 'test/onnx/**/*.py', +] +command = [ + 'python3', + 'tools/linter/adapters/black_linter.py', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'black==22.3.0', +] +is_formatter = true diff --git a/BUILD.bazel b/BUILD.bazel index 6590a7b1c3c4..d373a84f64d9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -3,10 +3,14 @@ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") load("@rules_proto//proto:defs.bzl", "proto_library") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test") load("//third_party:substitution.bzl", "header_template_rule") -load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs") +load("//:tools/bazel.bzl", "rules") +load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "lazy_tensor_ts_sources") load("//tools/rules:cu.bzl", "cu_library") load("//tools/config:defs.bzl", "if_cuda") -load("//:aten.bzl", "intern_build_aten_ops", 
"generate_aten") +load("//:aten.bzl", "intern_build_aten_ops", "generate_aten", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cuda_sources") +load(":build.bzl", "define_targets", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON") + +define_targets(rules = rules) COMMON_COPTS = [ "-DHAVE_MALLOC_USABLE_SIZE=1", @@ -25,17 +29,7 @@ COMMON_COPTS = [ "-DUSE_CUDNN", ]) -# TODO: refactor this into its own library (but how to make -# a binary based off of a module in a library?) -py_binary( - name = "gen", - srcs = ["tools/setup_helpers/gen.py"], - deps = [ - ":tools_codegen" - ], -) - -aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + glob(["aten/src/ATen/templates/**"]) +aten_generation_srcs = ["aten/src/ATen/native/native_functions.yaml"] + ["aten/src/ATen/native/tags.yaml"] + glob(["aten/src/ATen/templates/**"]) generated_cpu_cpp = [ "aten/src/ATen/RegisterBackendSelect.cpp", @@ -46,6 +40,7 @@ generated_cpu_cpp = [ "aten/src/ATen/RegisterFunctionalization_3.cpp", # "aten/src/ATen/RegisterFunctionalizationEverything.cpp", "aten/src/ATen/RegisterMkldnnCPU.cpp", + "aten/src/ATen/RegisterNestedTensorCPU.cpp", "aten/src/ATen/RegisterQuantizedCPU.cpp", "aten/src/ATen/RegisterSparseCPU.cpp", "aten/src/ATen/RegisterSparseCsrCPU.cpp", @@ -60,6 +55,7 @@ generated_cpu_cpp = [ "aten/src/ATen/CompositeExplicitAutogradFunctions_inl.h", "aten/src/ATen/CompositeImplicitAutogradFunctions.h", "aten/src/ATen/CompositeImplicitAutogradFunctions_inl.h", + "aten/src/ATen/CompositeViewCopyKernels.cpp", "aten/src/ATen/FunctionalInverses.h", "aten/src/ATen/Functions.h", "aten/src/ATen/Functions.cpp", @@ -86,6 +82,7 @@ generated_cuda_cpp = [ "aten/src/ATen/CUDAFunctions.h", "aten/src/ATen/CUDAFunctions_inl.h", "aten/src/ATen/RegisterCUDA.cpp", + "aten/src/ATen/RegisterNestedTensorCUDA.cpp", "aten/src/ATen/RegisterQuantizedCUDA.cpp", "aten/src/ATen/RegisterSparseCUDA.cpp", "aten/src/ATen/RegisterSparseCsrCUDA.cpp", @@ -94,103 +91,21 @@ generated_cuda_cpp = [ generate_aten( name = "generated_aten_cpp", srcs = aten_generation_srcs, - outs = generated_cpu_cpp + generated_cuda_cpp + [ - "aten/src/ATen/Declarations.yaml", - ], - generator=":gen", -) - -py_library( - name = "tools_codegen", - srcs = glob(["tools/codegen/**/*.py"]), -) - -py_library( - name = "tools_autograd", - srcs = glob(["tools/autograd/*.py"]), - data = glob([ - "tools/autograd/*.yaml", - "tools/autograd/templates/*", - ]), - deps = [":tools_codegen"], -) - -py_library( - name = "tools_jit", - srcs = glob(["tools/jit/*.py"]), - data = glob(["tools/jit/templates/*"]), -) - -py_binary( - name = "generate_code", - srcs = ["tools/setup_helpers/generate_code.py"], - deps = [ - ":tools_autograd", - ":tools_jit", - ], -) - -libtorch_cpp_generated_sources = [ - "torch/csrc/autograd/generated/VariableType.h", - "torch/csrc/autograd/generated/VariableType_0.cpp", - "torch/csrc/autograd/generated/VariableType_1.cpp", - "torch/csrc/autograd/generated/VariableType_2.cpp", - "torch/csrc/autograd/generated/VariableType_3.cpp", - "torch/csrc/autograd/generated/VariableType_4.cpp", - # "torch/csrc/autograd/generated/VariableTypeEverything.cpp", - "torch/csrc/autograd/generated/TraceType_0.cpp", - "torch/csrc/autograd/generated/TraceType_1.cpp", - "torch/csrc/autograd/generated/TraceType_2.cpp", - "torch/csrc/autograd/generated/TraceType_3.cpp", - "torch/csrc/autograd/generated/TraceType_4.cpp", - # "torch/csrc/autograd/generated/TraceTypeEverything.cpp", - 
"torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp", - "torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp", - # "torch/csrc/autograd/generated/ADInplaceOrViewTypeEverything.cpp", - "torch/csrc/autograd/generated/Functions.h", - "torch/csrc/autograd/generated/Functions.cpp", - "torch/csrc/autograd/generated/variable_factories.h", -] - -libtorch_python_generated_sources = [ - "torch/csrc/autograd/generated/python_functions.h", - "torch/csrc/autograd/generated/python_functions_0.cpp", - "torch/csrc/autograd/generated/python_functions_1.cpp", - "torch/csrc/autograd/generated/python_functions_2.cpp", - "torch/csrc/autograd/generated/python_functions_3.cpp", - "torch/csrc/autograd/generated/python_functions_4.cpp", - "torch/csrc/autograd/generated/python_variable_methods.cpp", - "torch/csrc/autograd/generated/python_torch_functions_0.cpp", - "torch/csrc/autograd/generated/python_torch_functions_1.cpp", - "torch/csrc/autograd/generated/python_torch_functions_2.cpp", - "torch/csrc/autograd/generated/python_nn_functions.cpp", - "torch/csrc/autograd/generated/python_fft_functions.cpp", - "torch/csrc/autograd/generated/python_linalg_functions.cpp", - "torch/csrc/autograd/generated/python_sparse_functions.cpp", - "torch/csrc/autograd/generated/python_special_functions.cpp", - "torch/csrc/autograd/generated/python_return_types.cpp", -] - -genrule( - name = "all_generated_code", - srcs = [ - "aten/src/ATen/native/native_functions.yaml", - ], - outs = libtorch_cpp_generated_sources + libtorch_python_generated_sources, - cmd = "$(location :generate_code) --install_dir `dirname $(location torch/csrc/autograd/generated/variable_factories.h)`/../.. --native-functions-path $(location aten/src/ATen/native/native_functions.yaml) --nn-path aten/src", - tools = [":generate_code"], + outs = ( + generated_cpu_cpp + + generated_cuda_cpp + + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") + + aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}") + + aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") + + ["aten/src/ATen/Declarations.yaml"] + ), + generator = "//torchgen:gen", ) filegroup( name = "cpp_generated_code", - data = [":all_generated_code"], - srcs = libtorch_cpp_generated_sources, -) - -filegroup( - name = "python_generated_code", - data = [":all_generated_code"], - srcs = libtorch_python_generated_sources, + data = [":generate-code"], + srcs = GENERATED_AUTOGRAD_CPP, ) exports_files( @@ -229,6 +144,11 @@ filegroup( srcs = glob(["aten/src/ATen/native/sparse/*.cpp"]), ) +filegroup( + name = "aten_native_nested_cpp", + srcs = glob(["aten/src/ATen/native/nested/*.cpp"]), +) + filegroup( name = "aten_native_quantized_cpp", srcs = glob( @@ -239,6 +159,11 @@ filegroup( ), ) +filegroup( + name = "aten_native_transformers_cpp", + srcs = glob(["aten/src/ATen/native/transformers/*.cpp"]), +) + filegroup( name = "aten_native_mkl_cpp", srcs = glob(["aten/src/ATen/native/mkl/*.cpp", "aten/src/ATen/mkl/*.cpp"]), @@ -287,7 +212,10 @@ filegroup( "aten/src/ATen/native/cuda/linalg/*.cpp", "aten/src/ATen/native/cudnn/*.cpp", "aten/src/ATen/native/miopen/*.cpp", + "aten/src/ATen/native/nested/cuda/*.cpp", + "aten/src/ATen/native/quantized/cudnn/*.cpp", "aten/src/ATen/native/sparse/cuda/*.cpp", + "aten/src/ATen/native/transformers/cuda/*.cpp", "aten/src/THC/*.cpp", ], ), @@ -299,9 +227,13 @@ filegroup( "aten/src/ATen/cuda/*.cu", "aten/src/ATen/cuda/detail/*.cu", "aten/src/ATen/native/cuda/*.cu", + "aten/src/ATen/native/nested/cuda/*.cu", "aten/src/ATen/native/quantized/cuda/*.cu", 
"aten/src/ATen/native/sparse/cuda/*.cu", - ]), + "aten/src/ATen/native/transformers/cuda/*.cu", + ]) + aten_ufunc_generated_cuda_sources("aten/src/ATen/{}"), + # It's a bit puzzling to me why it's not necessary to declare the + # target that generates these sources... ) header_template_rule( @@ -383,6 +315,7 @@ intern_build_aten_ops( "@fbgemm", "@mkl", ], + extra_impls = aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}"), ) cc_library( @@ -398,9 +331,11 @@ cc_library( ":aten_native_mkldnn_cpp", ":aten_native_quantized_cpp", ":aten_native_sparse_cpp", + ":aten_native_nested_cpp", + ":aten_native_transformers_cpp", ":aten_native_xnnpack", ":aten_src_ATen_config", - ] + generated_cpu_cpp, + ] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"), copts = ATEN_COPTS, data = if_cuda( [":libcaffe2_nvrtc.so"], @@ -1354,7 +1289,7 @@ cc_library( py_binary( name = "gen_op", srcs = ["caffe2/contrib/aten/gen_op.py"], - deps = [":tools_codegen"], + deps = ["//torchgen"], ) genrule( @@ -1622,19 +1557,6 @@ cc_library( ) # torch -py_binary( - name = "gen_version_header", - srcs = ["tools/setup_helpers/gen_version_header.py"], -) - -genrule( - name = "version_h", - srcs = ["torch/csrc/api/include/torch/version.h.in", "version.txt"], - outs = ["torch/csrc/api/include/torch/version.h"], - cmd = "$(location :gen_version_header) --template-path $(location torch/csrc/api/include/torch/version.h.in) --version-path $(location version.txt) --output-path $@", - tools = [':gen_version_header'], -) - py_binary( name = "stringify_file", srcs = ["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"], @@ -1673,7 +1595,7 @@ cc_library( "torch/csrc/autograd/generated/variable_factories.h", "torch/csrc/autograd/generated/Functions.h", ] + torch_cuda_headers, - ) + [":cpp_generated_code", ":version_h"], + ) + GENERATED_AUTOGRAD_CPP + [":version_h"], includes = [ "torch/csrc", "torch/csrc/api/include", @@ -1718,8 +1640,7 @@ cc_library( "torch/csrc/cuda/nccl.cpp", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], - )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [ - ":cpp_generated_code", + )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + lazy_tensor_ts_sources + GENERATED_AUTOGRAD_CPP + [ "torch/csrc/jit/serialization/flatbuffer_serializer.cpp", "torch/csrc/jit/mobile/flatbuffer_loader.cpp" ], @@ -1752,7 +1673,10 @@ cc_library( "**/*.h", "**/*.cuh", ]) + [ - ":cpp_generated_code", + # We need the filegroup here because the raw list causes Bazel + # to see duplicate files. It knows how to deduplicate with the + # filegroup. 
+ ":cpp_generated_code" ], includes = [ "torch/csrc/api/include", @@ -1768,7 +1692,7 @@ cc_library( cc_library( name = "torch_python", - srcs = libtorch_python_core_sources + [":python_generated_code"], + srcs = libtorch_python_core_sources + GENERATED_AUTOGRAD_PYTHON, deps = [ ":torch", ":shm", @@ -1880,6 +1804,9 @@ cc_test( "test/cpp/jit/*.h", "test/cpp/tensorexpr/*.cpp", "test/cpp/tensorexpr/*.h", + ], exclude=[ + # skip this since is not found in OSS build + "test/cpp/jit/test_exception.cpp", ]), linkstatic = True, tags = [ @@ -1898,6 +1825,11 @@ cc_test( srcs = glob([ "test/cpp/lazy/*.cpp", "test/cpp/lazy/*.h", + ], exclude=[ + # skip these since they depend on generated LazyIr.h which isn't available in bazel yet + "test/cpp/lazy/test_ir.cpp", + "test/cpp/lazy/test_lazy_ops.cpp", + "test/cpp/lazy/test_lazy_ops_util.cpp", ]), linkstatic = True, tags = [ @@ -1919,3 +1851,25 @@ test_suite( "//c10/test:tests", ], ) + +# An internal genrule that we are converging with refers to these file +# as if they are from this package, so we alias them for +# compatibility. + +[ + alias( + name = paths.basename(path), + actual = path, + ) + for path in [ + "aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp", + "aten/src/ATen/templates/DispatchKeyNativeFunctions.h", + "aten/src/ATen/templates/LazyIr.h", + "aten/src/ATen/templates/RegisterDispatchKey.cpp", + "aten/src/ATen/native/native_functions.yaml", + "aten/src/ATen/native/tags.yaml", + "aten/src/ATen/native/ts_native_functions.yaml", + "torch/csrc/lazy/core/shape_inference.h", + "torch/csrc/lazy/ts_backend/ts_native_functions.cpp", + ] +] diff --git a/BUILD.buck b/BUILD.buck new file mode 100644 index 000000000000..ad8caff6ec4d --- /dev/null +++ b/BUILD.buck @@ -0,0 +1,620 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") +load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule") +load( + "//tools:build_variables.bzl", + "aten_cpu_source_list", + "aten_native_source_list", + "core_sources_common", + "jit_core_headers", + "jit_core_sources", + "libtorch_profiler_sources", +) +load( + "//:pt_defs.oss.bzl", + "USED_PT_BACKENDS", + "build_aten_cpu", + "gen_aten_files", + "gen_aten_libtorch_files", + "get_aten_codegen_extra_params", + "get_pt_compiler_flags", + "get_pt_preprocessor_flags", + "pt_operator_library", + "get_pt_ops_deps", + "aten_ufunc_generated_all_cpu_sources", + "TEMPLATE_SOURCE_LIST", +) + +cxx_library( + name = "pthreadpool", + srcs = ['caffe2/utils/threadpool/pthreadpool.cc', 'caffe2/utils/threadpool/pthreadpool_impl.cc', 'caffe2/utils/threadpool/pthreadpool-cpp.cc', 'caffe2/utils/threadpool/thread_pool_guard.cpp', 'caffe2/utils/threadpool/ThreadPool.cc'], + deps = [':caffe2_headers', '//third_party:cpuinfo', '//third_party:glog', '//c10:c10', '//third_party:FXdiv'], + exported_deps = ['//third_party:pthreadpool'], + compiler_flags = ['-Wno-unused-function'], + preferred_linkage = "static", + exported_headers = subdir_glob([("", "caffe2/utils/threadpool/*.h")]), + exported_preprocessor_flags = ['-DUSE_PTHREADPOOL'], + header_namespace = "", + headers = [], + link_whole = True, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + +cxx_library( + name = "caffe2_headers", + deps = ['//c10:c10'], + exported_headers = subdir_glob( + [ + ("", "caffe2/**/*.h"), + ("", "binaries/**/*.h"), + ("modules", "**/*.h"), + ("aten/src", "ATen/core/**/*.h"), + ], + 
exclude = [ + "caffe2/fb/**/*.h", + "caffe2/mobile/contrib/libopencl-stub/**/*.h", + "caffe2/mobile/contrib/libvulkan-stub/**/*.h", + "caffe2/mobile/contrib/nnapi/**/*.h", + "caffe2/mobile/fb/binary/**/*.h", + "caffe2/mobile/fb/snpe_so/**/*.h", + "caffe2/mobile/fb/boltnn/bolt_lib/include/**/*.h", + "caffe2/mobile/contrib/snpe/**/*.h", + "caffe2/mobile/fb/qpl/jni/QuickPerformanceLogger.h", + "caffe2/share/fb/x3d/ldi/*.h", + "**/*.pb.h", + ], + ), + compiler_flags = ['-Os', '-fexceptions', '-frtti', '-Wno-shadow', '-Wno-unknown-pragmas', '-Wno-unused-variable', '-Wno-sign-compare', '-Icaffe2', '-Imodules', '-DEIGEN_NO_DEBUG', '-DCAFFE2_USE_LITE_PROTO', '-DCAFFE2_USE_GOOGLE_GLOG', '-DCAFFE2_RNN_NO_TEXT_FORMAT', '-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK=1', '-DCAFFE2_IS_XPLAT_BUILD', '-DSTRIP_ERROR_MESSAGES', '-DUSE_INTERNAL_PTHREADPOOL_IMPL', '-DCAFFE2_USE_HPTT'], + preferred_linkage = "static", + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + preprocessor_flags = ['-DUSE_INTERNAL_PTHREADPOOL_IMPL'], + visibility = ['PUBLIC'], +) + +cxx_library( + name = "common_core", + srcs = ['caffe2/core/common.cc'], + deps = [':caffe2_headers', '//c10:c10'], + exported_deps = [], + compiler_flags = ['-frtti', '-Os', '-Wno-unknown-pragmas', '-Wno-write-strings', '-Wno-unused-variable', '-Wno-unused-function', '-Wno-deprecated-declarations', '-Wno-shadow', '-Wno-global-constructors', '-Wno-missing-prototypes', '-std=gnu++17'], + preferred_linkage = "static", + header_namespace = "caffe2", + headers = [], + link_whole = True, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + +cxx_library( + name = "th_header", + header_namespace = "", + exported_headers = subdir_glob([ + # TH + ("aten/src", "TH/*.h"), + ("aten/src", "TH/*.hpp"), + ("aten/src", "TH/generic/*.h"), + ("aten/src", "TH/generic/*.hpp"), + ("aten/src", "TH/generic/simd/*.h"), + ("aten/src", "TH/vector/*.h"), + ("aten/src", "TH/generic/*.c"), + ("aten/src", "TH/generic/*.cpp"), + ("aten/src/TH", "*.h"), # for #include + # THNN + ("aten/src", "THNN/*.h"), + ("aten/src", "THNN/generic/*.h"), + ("aten/src", "THNN/generic/*.c"), + ]), +) + +cxx_library( + name = "aten_header", + header_namespace = "", + exported_headers = subdir_glob([ + # ATen Core + ("aten/src", "ATen/core/**/*.h"), + ("aten/src", "ATen/ops/*.h"), + # ATen Base + ("aten/src", "ATen/*.h"), + ("aten/src", "ATen/cpu/**/*.h"), + ("aten/src", "ATen/detail/*.h"), + ("aten/src", "ATen/quantized/*.h"), + ("aten/src", "ATen/vulkan/*.h"), + ("aten/src", "ATen/metal/*.h"), + ("aten/src", "ATen/mps/*.h"), + ("aten/src", "ATen/nnapi/*.h"), + # ATen Native + ("aten/src", "ATen/native/*.h"), + ("aten/src", "ATen/native/ao_sparse/quantized/cpu/*.h"), + ("aten/src", "ATen/native/cpu/**/*.h"), + ("aten/src", "ATen/native/sparse/*.h"), + ("aten/src", "ATen/native/mps/*.h"), + ("aten/src", "ATen/native/nested/*.h"), + ("aten/src", "ATen/native/quantized/*.h"), + ("aten/src", "ATen/native/quantized/cpu/*.h"), + ("aten/src", "ATen/native/transformers/*.h"), + ("aten/src", "ATen/native/ufunc/*.h"), + ("aten/src", "ATen/native/utils/*.h"), + ("aten/src", "ATen/native/vulkan/ops/*.h"), + ("aten/src", "ATen/native/xnnpack/*.h"), + # Remove the following after modifying codegen for mobile. 
+ ("aten/src", "ATen/mkl/*.h"), + ("aten/src", "ATen/native/mkl/*.h"), + ("aten/src", "ATen/native/mkldnn/*.h"), + ], exclude = ["aten/src/ATen/Config.h"]), + visibility = ["PUBLIC"], +) + +cxx_library( + name = "jit_core_headers", + header_namespace = "", + exported_headers = subdir_glob([("", x) for x in jit_core_headers]), +) + +cxx_library( + name = "generated_aten_config_header", + header_namespace = "ATen", + exported_headers = { + "Config.h": ":generate_aten_config[Config.h]", + }, +) + +cxx_library( + name = "torch_mobile_headers", + header_namespace = "", + exported_headers = subdir_glob( + [ + ("", "torch/csrc/jit/mobile/*.h"), + ], + ), + visibility = ["PUBLIC"], +) + +fb_xplat_genrule( + name = "generate_aten_config", + srcs = [ + "aten/src/ATen/Config.h.in", + ], + cmd = " ".join([ + "sed", + "-e 's/@AT_MKLDNN_ENABLED@/ATEN_MKLDNN_ENABLED_FBXPLAT/g'", + "-e 's/@AT_MKL_ENABLED@/ATEN_MKL_ENABLED_FBXPLAT/g'", + "-e 's/@AT_MKL_SEQUENTIAL@/ATEN_MKL_SEQUENTIAL_FBXPLAT/g'", + "-e 's/@AT_FFTW_ENABLED@/0/g'", + "-e 's/@AT_POCKETFFT_ENABLED@/0/g'", + "-e 's/@AT_NNPACK_ENABLED@/ATEN_NNPACK_ENABLED_FBXPLAT/g'", + "-e 's/@CAFFE2_STATIC_LINK_CUDA_INT@/CAFFE2_STATIC_LINK_CUDA_FBXPLAT/g'", + "-e 's/@AT_BUILD_WITH_BLAS@/USE_BLAS_FBXPLAT/g'", + "-e 's/@AT_PARALLEL_OPENMP@/AT_PARALLEL_OPENMP_FBXPLAT/g'", + "-e 's/@AT_PARALLEL_NATIVE@/AT_PARALLEL_NATIVE_FBXPLAT/g'", + "-e 's/@AT_PARALLEL_NATIVE_TBB@/AT_PARALLEL_NATIVE_TBB_FBXPLAT/g'", + "-e 's/@AT_BUILD_WITH_LAPACK@/USE_LAPACK_FBXPLAT/g'", + "-e 's/@AT_BLAS_F2C@/AT_BLAS_F2C_FBXPLAT/g'", + "-e 's/@AT_BLAS_USE_CBLAS_DOT@/AT_BLAS_USE_CBLAS_DOT_FBXPLAT/g'", + "aten/src/ATen/Config.h.in > $OUT/Config.h" + ]), + outs = { + "Config.h": ["Config.h"], + }, + default_outs = ["."], +) + +gen_aten_files( + name = "gen_aten", + extra_flags = get_aten_codegen_extra_params(USED_PT_BACKENDS), + visibility = ["PUBLIC"], +) + +ATEN_EXPORTED_HEADERS = { + "CPUFunctions.h": ":gen_aten[CPUFunctions.h]", + "CPUFunctions_inl.h": ":gen_aten[CPUFunctions_inl.h]", + "CompositeExplicitAutogradFunctions.h": ":gen_aten[CompositeExplicitAutogradFunctions.h]", + "CompositeExplicitAutogradFunctions_inl.h": ":gen_aten[CompositeExplicitAutogradFunctions_inl.h]", + "CompositeImplicitAutogradFunctions.h": ":gen_aten[CompositeImplicitAutogradFunctions.h]", + "CompositeImplicitAutogradFunctions_inl.h": ":gen_aten[CompositeImplicitAutogradFunctions_inl.h]", + "FunctionalInverses.h": ":gen_aten[FunctionalInverses.h]", + "Functions.h": ":gen_aten[Functions.h]", + "MethodOperators.h": ":gen_aten[MethodOperators.h]", + "NativeFunctions.h": ":gen_aten[NativeFunctions.h]", + "NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]", + "Operators.h": ":gen_aten[Operators.h]", + "RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]", + "core/TensorBody.h": ":gen_aten[core/TensorBody.h]", + "core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]", +} + +cxx_library( + name = "generated_aten_headers_cpu", + header_namespace = "ATen", + exported_headers = ATEN_EXPORTED_HEADERS, +) + +filegroup( + name = "aten_src_path", + srcs = [ + "aten/src/ATen/native/native_functions.yaml", + "aten/src/ATen/native/tags.yaml", + ] + glob(["aten/src/ATen/templates/*"]), + visibility = [ + "PUBLIC", + ], +) + +build_aten_cpu( + name = "aten_cpu", + srcs = jit_core_sources + + aten_cpu_source_list + [ + # Generated + ":gen_aten[Functions.cpp]", + ":gen_aten[Operators_0.cpp]", + ":gen_aten[Operators_1.cpp]", + ":gen_aten[Operators_2.cpp]", + ":gen_aten[Operators_3.cpp]", + 
":gen_aten[Operators_4.cpp]", + ":gen_aten[core/ATenOpList.cpp]", + ":gen_aten[core/TensorMethods.cpp]", + ] + [ + # Needed by ATen/native/EmbeddingBag.cpp + "caffe2/perfkernels/embedding_lookup_idx.cc", + ], +) + +gen_aten_libtorch_files(name = "gen_aten_libtorch") + + +GENERATED_AUTOGRAD_H = { + "Functions.h": ":gen_aten_libtorch[autograd/generated/Functions.h]", + "VariableType.h": ":gen_aten_libtorch[autograd/generated/VariableType.h]", + "variable_factories.h": ":gen_aten_libtorch[autograd/generated/variable_factories.h]", + + # Don't build python bindings on mobile. + #"python_functions.h", +} + +cxx_library( + name = "generated-autograd-headers", + header_namespace = "torch/csrc/autograd/generated", + exported_headers = GENERATED_AUTOGRAD_H, + visibility = ["PUBLIC"], +) + +cxx_library( + name = "torch_mobile_observer", + srcs = [ + "torch/csrc/jit/mobile/observer.cpp", + #"torch/fb/observers/MobileObserverUtil.cpp", + ], + header_namespace = "", + exported_headers = subdir_glob( + [ + ("", "torch/csrc/jit/mobile/observer.h"), + #("", "torch/fb/observers/ObserverUtil.h"), + #("", "torch/fb/observers/MobileObserverUtil.h"), + ], + ), + visibility = ["PUBLIC"], + deps = [ + "//c10:c10", + ], +) + +python_library( + name = "aten_code_template", + srcs = subdir_glob([ + ("aten", "src/ATen/code_template.py"), + ]), + base_module = "", + visibility = ["PUBLIC"], +) + +fb_xplat_genrule( + name = "generate-version-header", + srcs = [ + "torch/csrc/api/include/torch/version.h.in", + "version.txt", + ], + cmd = "$(exe //tools/setup_helpers:gen-version-header) " + " ".join([ + "--template-path", + "torch/csrc/api/include/torch/version.h.in", + "--version-path", + "version.txt", + "--output-path", + "$OUT/version.h", + ]), + outs = { + "version.h": ["version.h"], + }, + default_outs = ["."], +) + +cxx_library( + name = "generated-version-header", + header_namespace = "torch", + exported_headers = { + "version.h": ":generate-version-header[version.h]", + }, +) + +cxx_library( + name = "torch_headers", + header_namespace = "", + exported_headers = subdir_glob( + [ + ("torch/csrc/api/include", "torch/**/*.h"), + ("", "torch/csrc/**/*.h"), + ("", "torch/csrc/generic/*.cpp"), + ("", "torch/script.h"), + ("", "torch/library.h"), + ("", "torch/custom_class.h"), + ("", "torch/custom_class_detail.h"), + # Add again due to namespace difference from aten_header. + ("", "aten/src/ATen/*.h"), + ("", "aten/src/ATen/quantized/*.h"), + ], + exclude = [ + # Don't need on mobile. 
+ "torch/csrc/Exceptions.h", + "torch/csrc/python_headers.h", + "torch/csrc/utils/auto_gil.h", + "torch/csrc/jit/serialization/mobile_bytecode_generated.h", + "torch/csrc/api/include/torch/version.h", + ], + ), + visibility = ["PUBLIC"], + deps = [ + ":generated-version-header", + ], +) + + +cxx_library( + name = "torch_common", + srcs = core_sources_common, + compiler_flags = get_pt_compiler_flags(), + exported_preprocessor_flags = get_pt_preprocessor_flags(), + link_whole = True, + visibility = ["PUBLIC"], + deps = [ + ":aten_cpu", + ":generated-autograd-headers", + ":torch_headers", + "//third_party:glog", + "//c10:c10", + ], +) + + +cxx_library( + name = "torch_mobile_deserialize_common", + srcs = [ + "torch/csrc/jit/mobile/parse_bytecode.cpp", + "torch/csrc/jit/mobile/parse_operators.cpp", + "torch/csrc/jit/mobile/upgrader_mobile.cpp", + "torch/csrc/jit/serialization/import_read.cpp", + "torch/csrc/jit/serialization/unpickler.cpp", + ], + header_namespace = "", + exported_headers = [ + "torch/csrc/jit/serialization/import_read.h", + "torch/csrc/jit/serialization/unpickler.h", + ], + compiler_flags = get_pt_compiler_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + ], + visibility = ["PUBLIC"], + exported_deps = [ + ":aten_cpu", + ":caffe2_headers", + ":caffe2_serialize", + ":torch_common", + ":torch_headers", + ":torch_mobile_headers", + ":torch_mobile_module", + ":torch_mobile_observer", + "//third_party:glog", + "//c10:c10", + ], +) + +cxx_library( + name = "caffe2_serialize", + srcs = [ + "caffe2/serialize/file_adapter.cc", + "caffe2/serialize/inline_container.cc", + "caffe2/serialize/istream_adapter.cc", + "caffe2/serialize/read_adapter_interface.cc", + ], + visibility = ["PUBLIC"], + deps = [ + ":caffe2_headers", + "//third_party:glog", + "//c10:c10", + "//third_party:miniz", + ], +) + +cxx_library( + name = "torch_mobile_deserialize", + srcs = [ + "torch/csrc/jit/mobile/import.cpp", + ], + header_namespace = "", + exported_headers = [ + "torch/csrc/jit/mobile/import.h", + ], + compiler_flags = get_pt_compiler_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + ], + visibility = ["PUBLIC"], + exported_deps = [ + ":aten_cpu", + ":caffe2_headers", + ":caffe2_serialize", + ":torch_common", + ":torch_headers", + ":torch_mobile_headers", + ":torch_mobile_module", + ":torch_mobile_observer", + "//third_party:glog", + "//c10:c10", + ":torch_mobile_deserialize_common", + ], +) + +cxx_library( + name = "torch_mobile_module", + srcs = [ + "torch/csrc/jit/mobile/function.cpp", + "torch/csrc/jit/mobile/interpreter.cpp", + "torch/csrc/jit/mobile/module.cpp", + ], + header_namespace = "", + exported_headers = [], + compiler_flags = get_pt_compiler_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + ], + visibility = ["PUBLIC"], + exported_deps = [ + ":aten_cpu", + ":caffe2_headers", + ":torch_common", + ":torch_headers", + ":torch_mobile_headers", + ":torch_mobile_observer", + "//third_party:glog", + "//c10:c10", + ], +) + +cxx_library( + name = "torch_mobile_core", + srcs = [], + header_namespace = "", + exported_headers = [], + compiler_flags = get_pt_compiler_flags(), + exported_preprocessor_flags = get_pt_preprocessor_flags(), + link_whole = True, + linker_flags = [ + "-Wl,--no-as-needed", + # "-ldl", + ], + visibility = ["PUBLIC"], + deps = [ + ":generated-autograd-headers", + ":torch_mobile_observer", + ":torch_mobile_headers", + ], + exported_deps = [ + ":aten_cpu", + ":torch_common", + ":torch_mobile_deserialize", + 
], +) + +pt_operator_library( + name = "torch_mobile_ops_full_dev", + check_decl = False, + include_all_operators = True, +) + +cxx_library( + name = "torch_mobile_all_ops", + visibility = ["PUBLIC"], + deps = get_pt_ops_deps( + name = "pt_ops_full", + train = False, + deps = [ + ":torch_mobile_ops_full_dev", + ], + enable_flatbuffer = False, + ), +) + +python_library( + name = "gen_oplist_lib", + srcs = subdir_glob([ + ("tools/code_analyzer", "gen_oplist.py"), + ("tools/code_analyzer", "gen_op_registration_allowlist.py"), + ]), + base_module = "", + deps = [ + "//third_party:pyyaml", + "//tools/lite_interpreter:gen_selected_mobile_ops_header", + "//torchgen:torchgen", + ], +) + +python_binary( + name = "gen_oplist", + main_module = "gen_oplist", + visibility = ["PUBLIC"], + deps = [ + ":gen_oplist_lib", + ], +) + +python_library( + name = "gen_operators_yaml_lib", + srcs = subdir_glob([ + ("tools/code_analyzer", "gen_operators_yaml.py"), + ("tools/code_analyzer", "gen_op_registration_allowlist.py"), + ]), + base_module = "", + deps = [ + "//third_party:pyyaml", + "//torchgen:torchgen", + ], +) + +python_binary( + name = "gen_operators_yaml", + main_module = "gen_operators_yaml", + visibility = ["PUBLIC"], + deps = [ + ":gen_operators_yaml_lib", + ], +) + +cxx_binary( + name = 'ptmobile_benchmark', + srcs = [ + 'binaries/speed_benchmark_torch.cc', + ], + compiler_flags = [ + "-fexceptions", + "-frtti", + "-Wno-deprecated-declarations", + ], + preprocessor_flags = [ + "-DBUILD_LITE_INTERPRETER", + ], + platform_linker_flags = [ + ( + "^linux.*$", + [ + "-Wl,--no-as-needed", + "-ldl", + "-pthread", + ], + ), + ], + deps = [ + ":torch_mobile_core", + ":torch_mobile_all_ops", + "//c10:c10", + ], +) + +filegroup( + name = "templated_selective_build_srcs", + # NB: no glob here, there are generated targets in this list! 
+ srcs = glob(TEMPLATE_SOURCE_LIST) + aten_ufunc_generated_all_cpu_sources(":gen_aten[{}]"), + visibility = [ + "PUBLIC", + ], +) diff --git a/CMakeLists.txt b/CMakeLists.txt index a0ddb61781ea..eb0ce9c882e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,30 +94,38 @@ if(APPLE) # RPATH stuff set(CMAKE_MACOSX_RPATH ON) if(NOT IOS) - # Determine if we can link against ML Compute - set(MLCOMPUTE_FOUND OFF) + # Determine if we can link against MPSGraph + set(MPS_FOUND OFF) execute_process( - COMMAND bash -c "xcrun --sdk macosx --show-sdk-path" - OUTPUT_VARIABLE _macosx_sdk_path + COMMAND bash -c "xcodebuild -sdk macosx -version SDKVersion" + RESULT_VARIABLE _exit_code + OUTPUT_VARIABLE _macosx_sdk_version OUTPUT_STRIP_TRAILING_WHITESPACE) - - set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/") - set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/") - - find_library(_MLCompute_fwrk_path_ NAMES MLCompute PATHS ${_FRAMEWORK_SEARCH_PATH} NO_DEFAULT_PATH) - find_library(_MLCompute_sdk_path_ NAMES MLCompute PATHS ${_SDK_SEARCH_PATH} NO_DEFAULT_PATH) - - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mlc) - set(_MLC_FOLDER_EXISTS YES) + if(_exit_code EQUAL 0) + set(_MPS_supported_os_version OFF) + if(_macosx_sdk_version VERSION_GREATER_EQUAL 12.3) + set(_MPS_supported_os_version ON) + endif() + message(STATUS "sdk version: ${_macosx_sdk_version}, mps supported: ${_MPS_supported_os_version}") + execute_process( + COMMAND bash -c "xcrun --sdk macosx --show-sdk-path" + OUTPUT_VARIABLE _macosx_sdk_path + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/") + set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/") + + find_library(_MPS_fwrk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_FRAMEWORK_SEARCH_PATH} NO_DEFAULT_PATH) + find_library(_MPS_sdk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_SDK_SEARCH_PATH} NO_DEFAULT_PATH) + + if(_MPS_supported_os_version AND _MPS_fwrk_path_ AND _MPS_sdk_path_) + set(MPS_FOUND ON) + message(STATUS "MPSGraph framework found") + else() + message(STATUS "MPSGraph framework not found") + endif() else() - set(_MLC_FOLDER_EXISTS NO) - endif() - - if(_MLCompute_fwrk_path_ AND _MLCompute_sdk_path_ AND _MLC_FOLDER_EXISTS) - set(MLCOMPUTE_FOUND ON) - message(STATUS "ML Compute framework found") - else() - message(STATUS "ML Compute framework not found") + message(STATUS "MPS: unable to get MacOS sdk version") + message(STATUS "MPSGraph framework not found") endif() endif() endif() @@ -189,6 +197,8 @@ option(USE_CUDA "Use CUDA" ON) cmake_dependent_option( BUILD_SPLIT_CUDA "Split torch_cuda library into torch_cuda_cu and torch_cuda_cpp" OFF "USE_CUDA AND NOT CUDA_SEPARABLE_COMPILATION" OFF) +cmake_dependent_option( + BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) option(USE_FAST_NVCC "Use parallel NVCC build" OFF) option(USE_ROCM "Use ROCm" ON) option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF) @@ -202,15 +212,11 @@ cmake_dependent_option( BUILD_NVFUSER_BENCHMARK "Build C++ binaries for nvfuser benchmarks" ON "USE_CUDA;BUILD_TEST" OFF) cmake_dependent_option( - USE_WHOLE_CUDNN "Use whole-library linking for cuDNN" OFF - "USE_STATIC_CUDNN" OFF) -cmake_dependent_option( - USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" OFF + USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" ON "USE_CUDNN" OFF) option(USE_FBGEMM "Use FBGEMM (quantized 8-bit 
server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) -option(USE_BREAKPAD "Use breakpad crash dump library" ON) -option(USE_CUPTI_SO "Use CUPTI as a shared library" OFF) +option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) option(USE_FAKELOWP "Use FakeLowp operators" OFF) option(USE_FFMPEG "Use ffmpeg" OFF) option(USE_GFLAGS "Use GFLAGS" OFF) @@ -224,8 +230,8 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option( - USE_MLCOMPUTE "Use ML Compute for macOS build" ON - "MLCOMPUTE_FOUND" OFF) + USE_MPS "Use MPS for macOS build" ON + "MPS_FOUND" OFF) cmake_dependent_option( USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) @@ -273,17 +279,20 @@ if(NOT DEFINED USE_VULKAN) "ANDROID" OFF) endif() -if(IOS) - set(USE_BREAKPAD OFF) -endif() - option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF) option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON) option(USE_LITE_INTERPRETER_PROFILER "Enable " ON) option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF) -option(USE_XNNPACK "Use XNNPACK" ON) +# option USE_XNNPACK: try to enable xnnpack by default. +set(XNNPACK_MIN_CMAKE_VER 3.12) +cmake_dependent_option( + USE_XNNPACK "Use XNNPACK. Requires cmake >= ${XNNPACK_MIN_CMAKE_VER}." ON + "CMAKE_VERSION VERSION_GREATER_EQUAL ${XNNPACK_MIN_CMAKE_VER}" OFF) +if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER}) + message(WARNING "USE_XNNPACK is set to OFF. XNNPACK requires CMake version ${XNNPACK_MIN_CMAKE_VER} or greater.") +endif() option(USE_ZMQ "Use ZMQ" OFF) option(USE_ZSTD "Use ZSTD" OFF) # Ensure that an MKLDNN build is the default for x86 CPUs @@ -298,6 +307,7 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN}) cmake_dependent_option( USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN" OFF) +option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF) option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON @@ -306,12 +316,15 @@ cmake_dependent_option( USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON "USE_DISTRIBUTED" OFF) cmake_dependent_option( - USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF + USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF) cmake_dependent_option( USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF) cmake_dependent_option( USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF) +cmake_dependent_option( + USE_NCCL_WITH_UCC "Enable UCC support for ProcessGroupNCCL. Only available if USE_C10D_NCCL is on." 
OFF + "USE_C10D_NCCL" OFF) cmake_dependent_option( USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF) cmake_dependent_option( @@ -330,6 +343,9 @@ cmake_dependent_option(USE_CCACHE "Attempt using CCache to wrap the compilation" option(WERROR "Build with -Werror supported by the compiler" OFF) option(USE_COREML_DELEGATE "Use the CoreML backend through delegate APIs" OFF) option(USE_PER_OPERATOR_HEADERS "Whether ATen should generate separate headers for each operator" ON) +cmake_dependent_option( + BUILD_LAZY_TS_BACKEND "Build the lazy Torchscript backend, not compatible with mobile builds" ON + "NOT INTERN_BUILD_MOBILE" OFF) if(USE_CCACHE) @@ -429,8 +445,14 @@ else() endif() set(SELECTED_OP_LIST "" CACHE STRING "Path to the yaml file that contains the list of operators to include for custom build. Include all operators by default.") -set(STATIC_DISPATCH_BACKEND "" CACHE STRING - "Name of the backend for which static dispatch code is generated, e.g.: CPU.") +option( + STATIC_DISPATCH_BACKEND + "Name of the backend for which static dispatch code is generated, e.g.: CPU." + "") +option(USE_LIGHTWEIGHT_DISPATCH "Enable codegen unboxing for ATen ops, need to work with static dispatch in order to work properly." OFF) +if(USE_LIGHTWEIGHT_DISPATCH AND NOT STATIC_DISPATCH_BACKEND) + message(FATAL_ERROR "Need to enable static dispatch after enabling USE_LIGHTWEIGHT_DISPATCH.") +endif() option( TRACING_BASED "Master flag to build Lite Interpreter with tracing build option" @@ -538,6 +560,8 @@ endif(NOT MSVC) # purpose. if(ANDROID OR IOS OR DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN}) set(INTERN_BUILD_MOBILE ON) + message(WARNING "INTERN_BUILD_MOBILE is on, disabling BUILD_LAZY_TS_BACKEND") + set(BUILD_LAZY_TS_BACKEND OFF) if(DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN}) # C10_MOBILE is derived from Android/iOS toolchain macros in @@ -667,6 +691,8 @@ if(USE_FBGEMM AND ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SIZEOF_VO set(USE_FBGEMM OFF) endif() +set(BUILD_ONEDNN_GRAPH OFF) + include(cmake/Dependencies.cmake) if(USE_CUDA AND (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10.2) AND (CMAKE_HOST_SYSTEM_NAME MATCHES "Windows")) @@ -766,7 +792,6 @@ if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -Wno-type-limits") string(APPEND CMAKE_CXX_FLAGS " -Wno-array-bounds") string(APPEND CMAKE_CXX_FLAGS " -Wno-unknown-pragmas") - string(APPEND CMAKE_CXX_FLAGS " -Wno-sign-compare") string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-parameter") string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-function") string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-result") @@ -778,6 +803,10 @@ if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -Wno-range-loop-analysis") string(APPEND CMAKE_CXX_FLAGS " -Wno-pass-failed") endif() + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0.0)) + # Suppress issue: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43407 + string(APPEND CMAKE_CXX_FLAGS " -Wno-attributes") + endif() if(CMAKE_COMPILER_IS_GNUCXX AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)) string(APPEND CMAKE_CXX_FLAGS " -Wno-stringop-overflow") endif() @@ -864,6 +893,9 @@ if(NOT MSVC) if(HAS_WERROR_CAST_FUNCTION_TYPE) string(APPEND CMAKE_CXX_FLAGS " -Werror=cast-function-type") endif() + check_cxx_compiler_flag("-Werror=sign-compare" HAS_WERROR_SIGN_COMPARE) + # This doesn't work globally so we use the test on specific + # target_compile_options endif() if(USE_ASAN) @@ -918,8 +950,8 @@ if(USE_CPP_CODE_COVERAGE) endif() if(APPLE) - if(USE_MLCOMPUTE) - set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -DUSE_MLCOMPUTE -fobjc-arc -framework MLCompute -framework Metal") + if(USE_MPS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MPS -fno-objc-arc -framework Foundation -weak_framework MetalPerformanceShaders -weak_framework MetalPerformanceShadersGraph -weak_framework Metal") endif() string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-private-field") string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-braces") diff --git a/CODEOWNERS b/CODEOWNERS index 054bd8171311..7de2b0e66d9f 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -11,8 +11,10 @@ /torch/nn/ @albanD @jbschlosser /torch/optim/ @albanD /test/test_public_bindings.py @albanD +/test/allowlist_for_publicAPI.json @albanD @anjali411 /docs/source/conf.py @albanD -/aten/src/ATen/native/native_functions.yaml @ezyang +/aten/src/ATen/native/native_functions.yaml @bdhirsh +/aten/src/ATen/native/tags.yaml @anjali411 # Tensorpipe RPC Agent. /torch/csrc/distributed/rpc/tensorpipe_agent.cpp @jiayisuse @osalpekar @lw @beauby @@ -21,15 +23,15 @@ # Distributed package # This list is mostly if you'd like to be tagged as reviewer, feel free to add # or remove yourself from it. -/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @bowangbj -/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @bowangbj -/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @bowangbj +/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu +/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu +/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu # Distributed tests # This list is mostly if you'd like to be tagged as reviewer, feel free to add # or remove yourself from it. 
-/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @bowangbj -/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @bowangbj +/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu +/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu # ONNX Export /torch/csrc/jit/passes/onnx.h @bowenbao @shubhambhokare1 @@ -39,7 +41,7 @@ /test/onnx/ @bowenbao @shubhambhokare1 # Docker -/.circleci/docker/ @jeffdaily @jithunnair-amd +/.circleci/docker/ @jeffdaily # Github Actions # This list is for people wanting to be notified every time there's a change @@ -47,9 +49,9 @@ /.github/ @seemethere @janeyx99 @atalman # Custom Test Infrastructure -/test/run_test.py @pytorch-dev-infra +/test/run_test.py @pytorch/pytorch-dev-infra /torch/testing/_internal/common_device_type.py @mruberry -/torch/testing/_internal/common_utils.py @pytorch-dev-infra +/torch/testing/_internal/common_utils.py @pytorch/pytorch-dev-infra # Parametrizations /torch/nn/utils/parametriz*.py @lezcano @@ -61,3 +63,13 @@ /aten/src/ATen/native/**/*LinearAlgebra* @lezcano @nikitaved @IvanYashchuk # tests /test/test_linalg.py @lezcano @nikitaved @IvanYashchuk + +# OpInfo-related files +/torch/testing/_internal/common_methods_invocations.py @mruberry @ngimel +/torch/testing/_internal/common_device_type.py @mruberry @ngimel +test/test_ops.py @mruberry @ngimel +test/test_ops_gradients.py @mruberry @ngimel +test/test_unary_ufuncs.py @mruberry @ngimel +test/test_binary_ufuncs.py @mruberry @ngimel +test/test_reductions.py @mruberry @ngimel +test/test_type_promotion.py @mruberry @ngimel diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 59b7ae8a488f..a09e03c01e44 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -342,6 +342,8 @@ The `expecttest` and `hypothesis` libraries must be installed to run the tests. an optional dependency, and `pytest` may help run tests more selectively. All these packages can be installed with `conda` or `pip`. +**Weird note:** In our CI (Continuous Integration) jobs, we actually run the tests from the `test` folder and **not** the root of the repo, since there are various dependencies we set up for CI that expects the tests to be run from the test folder. As such, there may be some inconsistencies between local testing and CI testing--if you observe an inconsistency, please [file an issue](https://github.com/pytorch/pytorch/issues/new/choose). + ### Better local unit tests with `pytest` We don't officially support `pytest`, but it works well with our @@ -512,7 +514,7 @@ missing file warnings but will still complete. For example, to work on `jit.rst` ```bash cd docs/source -ls | grep rst | grep -v index | grep -v jit | xargs rm +find . -type f | grep rst | grep -v index | grep -v jit | xargs rm # Make your changes, build the docs, etc. @@ -1098,8 +1100,7 @@ This internally invokes our driver script and closely mimics how clang-tidy is r ## Pre-commit tidy/linting hook -We use clang-tidy and flake8 (installed with flake8-bugbear, -flake8-comprehensions, flake8-pyi, and others) to perform additional +We use clang-tidy to perform additional formatting and semantic checking of code. 
We provide a pre-commit git hook for performing these checks, before a commit is created: @@ -1107,18 +1108,18 @@ performing these checks, before a commit is created: ln -s ../../tools/git-pre-commit .git/hooks/pre-commit ``` -You'll need to install an appropriately configured flake8; see -[Lint as you type](https://github.com/pytorch/pytorch/wiki/Lint-as-you-type) -for documentation on how to do this. - -If you haven't set up the pre-commit hook and have already committed files and +If you have already committed files and CI reports `flake8` errors, you can run the check locally in your PR branch with: ```bash flake8 $(git diff --name-only $(git merge-base --fork-point master)) ``` -fix the code so that no errors are reported when you re-run the above check again, +You'll need to install an appropriately configured flake8; see +[Lint as you type](https://github.com/pytorch/pytorch/wiki/Lint-as-you-type) +for documentation on how to do this. + +Fix the code so that no errors are reported when you re-run the above check again, and then commit the fix. ## Building PyTorch with ASAN @@ -1245,39 +1246,17 @@ Once you submit a PR or push a new commit to a branch that is in an active PR, CI jobs will be run automatically. Some of these may fail and you will need to find out why, by looking at the logs. -Fairly often, a CI failure might be unrelated to your changes. In this case, you +Fairly often, a CI failure might be unrelated to your changes. You can +confirm by going to our [HUD](hud.pytorch.org) and seeing if the CI job +is failing upstream already. In this case, you can usually ignore the failure. See [the following subsection](#which-commit-is-used-in-ci) for more details. Some failures might be related to specific hardware or environment -configurations. In this case, if the job is run by CircleCI, you can -ssh into the job's session to perform manual debugging using the -following steps: - -1. In the CircleCI page for the failed job, make sure you are logged in - and then click the `Rerun` actions dropdown button on the top right. - Click `Rerun Job with SSH`. - -2. When the job reruns, a new step will be added in the `STEPS` tab - labelled `Set up SSH`. Inside that tab will be an ssh command that - you can execute in a shell. - -3. Once you are connected through ssh, you may need to enter a docker - container. Run `docker ps` to check if there are any docker - containers running. Note that your CI job might be in the process - of initiating a docker container, which means it will not show up - yet. It is best to wait until the CI job reaches a step where it is - building pytorch or running pytorch tests. If the job does have a - docker container, run `docker exec -it IMAGE_ID /bin/bash` to - connect to it. - -4. Now you can find the pytorch working directory, which could be - `~/workspace` or `~/project`, and run commands locally to debug - the failure. - -For certain Windows failures, it may be useful to have a full [Remote -Desktop](https://docs.microsoft.com/en-us/windows-server/remote/remote-desktop-services/clients/remote-desktop-clients) connection. See detailed instructions [here](https://github.com/pytorch/pytorch/wiki/Debugging-Windows-with-Remote-Desktop-or-CDB-(CLI-windbg)-on-CircleCI) -for how to set that up after rerunning the job. +configurations. 
In this case, if you're a Meta employee, you can ssh into +the job's session to perform manual debugging following the instructions in +our [CI wiki](https://github.com/pytorch/pytorch/wiki/Debugging-using-with-ssh-for-Github-Actions). + ### Which commit is used in CI? diff --git a/Dockerfile b/Dockerfile index 57c5dae733da..a8dc7f141685 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,7 @@ RUN curl -fsSL -v -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Mini chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b -p /opt/conda && \ rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda-build pyyaml numpy ipython&& \ + /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda-build pyyaml numpy ipython && \ /opt/conda/bin/conda clean -ya FROM dev-base as submodule-update @@ -51,7 +51,7 @@ RUN --mount=type=cache,target=/opt/ccache \ FROM conda as conda-installs ARG PYTHON_VERSION=3.8 -ARG CUDA_VERSION=11.1 +ARG CUDA_VERSION=11.3 ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=pytorch-nightly ENV CONDA_OVERRIDE_CUDA=${CUDA_VERSION} diff --git a/LICENSE b/LICENSE index 9cb8cbef5a9f..04f9ad110565 100644 --- a/LICENSE +++ b/LICENSE @@ -28,6 +28,10 @@ All rights reserved. All contributions by Kakao Brain: Copyright 2019-2020 Kakao Brain +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. + All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors All rights reserved. diff --git a/Makefile b/Makefile index 3d18c2b46381..21745f42a887 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ # This makefile does nothing but delegating the actual building to cmake. PYTHON = python3 +PIP = pip3 all: @mkdir -p build && cd build && cmake .. $(shell $(PYTHON) ./scripts/get_python_cmake_flags.py) && $(MAKE) @@ -15,110 +16,18 @@ ios: clean: # This will remove ALL build folders. @rm -r build*/ - @$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER) linecount: @cloc --read-lang-def=caffe.cloc caffe2 || \ echo "Cloc is not available on the machine. 
You can install cloc with " && \ echo " sudo apt-get install cloc" -SHELLCHECK_GHA_GENERATED_FOLDER=.shellcheck_generated_gha -shellcheck-gha: - @$(RM) -r $(SHELLCHECK_GHA_GENERATED_FOLDER) - tools/extract_scripts.py --out=$(SHELLCHECK_GHA_GENERATED_FOLDER) - tools/linter/run_shellcheck.sh $(SHELLCHECK_GHA_GENERATED_FOLDER) - -generate-gha-workflows: - .github/scripts/generate_ci_workflows.py - $(MAKE) shellcheck-gha - -shellcheck: - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'shellcheck' \ - --step "Regenerate workflows" - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'shellcheck' \ - --step "Assert that regenerating the workflows didn't change them" - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'shellcheck' \ - --step 'Extract scripts from GitHub Actions workflows' - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'shellcheck' - setup_lint: - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'flake8-py3' --step 'Install dependencies' --no-quiet - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'cmakelint' --step 'Install dependencies' --no-quiet - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'mypy' --step 'Install dependencies' --no-quiet - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'shellcheck' --step 'Install Jinja2' --no-quiet - - @if [ "$$(uname)" = "Darwin" ]; then \ - if [ -z "$$(which brew)" ]; then \ - echo "'brew' is required to install ShellCheck, get it here: https://brew.sh "; \ - exit 1; \ - fi; \ - brew install shellcheck; \ - else \ - $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \ - --job 'shellcheck' --step 'Install ShellCheck' --no-quiet; \ - fi - $(PYTHON) -mpip install jinja2 --user - $(PYTHON) -mpip install -r tools/linter/clang_tidy/requirements.txt --user - $(PYTHON) -m tools.linter.install.clang_tidy - -quick_checks: -# TODO: This is broken when 'git config submodule.recurse' is 'true' since the -# lints will descend into third_party submodules - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'quick-checks' \ - --step 'Ensure no trailing spaces' \ - --step 'Ensure no tabs' \ - --step 'Ensure no non-breaking spaces' \ - --step 'Ensure canonical include' \ - --step 'Ensure no versionless Python shebangs' \ - --step 'Ensure no unqualified noqa' \ - --step 'Ensure GitHub PyPi dependencies are pinned' \ - --step 'Ensure no unqualified type ignore' \ - --step 'Ensure no direct cub include' \ - --step 'Ensure correct trailing newlines' \ - --step 'Ensure no raw cuda api calls' - -flake8: - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'flake8-py3' - -mypy: - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'mypy' - -cmakelint: - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'cmakelint' \ - --step 'Run cmakelint' - -clang-tidy: - @$(PYTHON) tools/actions_local_runner.py \ - $(CHANGED_ONLY) \ - --job 'clang-tidy' - -toc: - @$(PYTHON) tools/actions_local_runner.py \ - --file .github/workflows/lint.yml \ - --job 'toc' \ - --step "Regenerate ToCs and check that they didn't change" + $(PIP) install lintrunner + lintrunner init -lint: flake8 mypy quick_checks cmakelint shellcheck +lint: + lintrunner -quicklint: 
CHANGED_ONLY=--changed-only -quicklint: mypy flake8 quick_checks cmakelint shellcheck clang-tidy +quicklint: + lintrunner diff --git a/README.md b/README.md index 88a77f04b345..c5c362b80a6a 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ PyTorch is a Python package that provides two high-level features: You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. +Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.org](https://hud.pytorch.org/ci/pytorch/pytorch/master). + - [More About PyTorch](#more-about-pytorch) @@ -39,18 +41,6 @@ You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to -| System | 3.7 | 3.8 | -| :---: | :---: | :--: | -| Linux CPU | [![Build Status](https://ci.pytorch.org/jenkins/job/pytorch-master/badge/icon)](https://ci.pytorch.org/jenkins/job/pytorch-master/) |
| -| Linux GPU | [![Build Status](https://ci.pytorch.org/jenkins/job/pytorch-master/badge/icon)](https://ci.pytorch.org/jenkins/job/pytorch-master/) |
| -| Windows CPU / GPU | [![Build Status](https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-win-ws2016-cuda9-cudnn7-py3-trigger/badge/icon)](https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-win-ws2016-cuda9-cudnn7-py3-trigger/) |
| -| Linux (ppc64le) CPU | [![Build Status](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le/badge/icon)](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le/) |
| -| Linux (ppc64le) GPU | [![Build Status](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le-gpu/badge/icon)](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le-gpu/) |
| -| Linux (aarch64) CPU | [![Build Status](http://openlabtesting.org:15000/badge?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py37)](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py37) | [![Build Status](http://openlabtesting.org:15000/badge?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py38)](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py38) | - -See also the [CI HUD at hud.pytorch.org](https://hud.pytorch.org/ci/pytorch/pytorch/master). - - ## More About PyTorch At a granular level, PyTorch is a library that consists of the following components: @@ -155,14 +145,9 @@ Commands to install binaries via Conda or pip wheels are on our website: [https: #### NVIDIA Jetson Platforms -Python wheels for NVIDIA's Jetson Nano, Jetson TX2, and Jetson AGX Xavier are available via the following URLs: - -- Stable binaries: - - Python 3.6: https://nvidia.box.com/v/torch-stable-cp36-jetson-jp42 -- Rolling weekly binaries: - - Python 3.6: https://nvidia.box.com/v/torch-weekly-cp36-jetson-jp42 +Python wheels for NVIDIA's Jetson Nano, Jetson TX2, and Jetson AGX Xavier are provided [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048) and the L4T container is published [here](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/l4t-pytorch) -They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) maintains them +They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) and [@ptrblck](https://github.com/ptrblck) are maintaining them. ### From Source @@ -178,16 +163,16 @@ If you want to compile with CUDA support, install - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/pdf/cuDNN-Support-Matrix.pdf) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardwares -If you want to disable CUDA support, export environment variable `USE_CUDA=0`. +If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`. If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/) If you want to compile with ROCm support, install - [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) 4.0 and above installation -- ROCm is currently supported only for Linux system. +- ROCm is currently supported only for Linux systems. -If you want to disable ROCm support, export environment variable `USE_ROCM=0`. +If you want to disable ROCm support, export the environment variable `USE_ROCM=0`. Other potentially useful environment variables may be found in `setup.py`. #### Install Dependencies @@ -245,7 +230,7 @@ collect2: error: ld returned 1 exit status error: command 'g++' failed with exit status 1 ``` -This is caused by `ld` from Conda environment shadowing the system `ld`. You should use a newer version of Python that fixes this issue. The recommended Python version is 3.6.10+, 3.7.6+ and 3.8.1+. +This is caused by `ld` from Conda environment shadowing the system `ld`. 
You should use a newer version of Python that fixes this issue. The recommended Python versions are 3.7.6+ and 3.8.1+. On macOS ```bash @@ -299,7 +284,7 @@ You can refer to the [build_pytorch.bat](https://github.com/pytorch/pytorch/blob cmd :: Set the environment variables after you have downloaded and upzipped the mkl package, -:: else CMake would throw error as `Could NOT find OpenMP`. +:: else CMake would throw an error as `Could NOT find OpenMP`. set CMAKE_INCLUDE_PATH={Your directory}\mkl\include set LIB={Your directory}\mkl\lib;%LIB% diff --git a/RELEASE.md b/RELEASE.md index 8f967985a9cf..80b4bfefc122 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,13 +3,30 @@ - [General Overview](#general-overview) + - [Cutting a release branch preparations](#cutting-a-release-branch-preparations) - [Cutting release branches](#cutting-release-branches) + - [`pytorch/pytorch`](#pytorchpytorch) + - [`pytorch/builder` / PyTorch domain libraries](#pytorchbuilder--pytorch-domain-libraries) - [Making release branch specific changes](#making-release-branch-specific-changes) - [Getting CI signal on release branches:](#getting-ci-signal-on-release-branches) - [Drafting RCs (Release Candidates)](#drafting-rcs-release-candidates) - [Release Candidate Storage](#release-candidate-storage) - [Cherry Picking Fixes](#cherry-picking-fixes) - [Promoting RCs to Stable](#promoting-rcs-to-stable) + - [Additional Steps to prepare for release day](#additional-steps-to-prepare-for-release-day) + - [Modify release matrix](#modify-release-matrix) + - [Open Google Colab issue](#open-google-colab-issue) +- [Patch Releases](#patch-releases) + - [Patch Release Criteria](#patch-release-criteria) + - [Patch Release Process](#patch-release-process) + - [Triage](#triage) + - [Building a release schedule / cherry picking](#building-a-release-schedule--cherry-picking) + - [Building Binaries / Promotion to Stable](#building-binaries--promotion-to-stable) +- [Hardware / Software Support in Binary Build Matrix](#hardware--software-support-in-binary-build-matrix) + - [Python](#python) + - [TL;DR](#tldr) + - [Accelerator Software](#accelerator-software) + - [Special support cases](#special-support-cases) - [Special Topics](#special-topics) - [Updating submodules for a release](#updating-submodules-for-a-release) @@ -19,36 +36,60 @@ Releasing a new version of PyTorch generally entails 3 major steps: +0. Cutting a release branch preparations 1. Cutting a release branch and making release branch specific changes 2. Drafting RCs (Release Candidates), and merging cherry picks -3. Promoting RCs to stable +3. Promoting RCs to stable and performing release day tasks + +## Cutting a release branch preparations + +The following requirements need to be met prior to the final RC cut: + +* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before the first RC cut is completed. After the RC cut is completed, the following script should be executed from the builder repo in order to validate the presence of the fixes in the release branch: +``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ``` +* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of the release matrix, including operating systems (Linux, macOS, Windows), Python versions, as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm).
+* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links: + * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/nightly) + * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/nightly) + * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/nightly) + * [TorchText](https://hud.pytorch.org/hud/pytorch/text/nightly) ## Cutting release branches +### `pytorch/pytorch` + Release branches are typically cut from the branch [`viable/strict`](https://github.com/pytorch/pytorch/tree/viable/strict) as to ensure that tests are passing on the release branch. -Release branches *should* be prefixed like so: -``` -release/{MAJOR}.{MINOR} -``` +There's a convenience script to create release branches from the current `viable/strict` (run from the root of `pytorch/pytorch`): -An example of this would look like: +```bash +DRY_RUN=disabled scripts/release/cut-release-branch.sh ``` -release/1.8 + +This script should create 2 branches: +* `release/{MAJOR}.{MINOR}` +* `orig/release/{MAJOR}.{MINOR}` + +### `pytorch/builder` / PyTorch domain libraries + +The convenience script can also be used for the domain libraries as well as `pytorch/builder`. + +> NOTE: RELEASE_VERSION only needs to be specified if version.txt is not available in the root directory + +```bash +DRY_RUN=disabled GIT_BRANCH_TO_CUT_FROM=main RELEASE_VERSION=1.11 scripts/release/cut-release-branch.sh ``` -Please make sure to create branch that pins divergent point of release branch from the main branch, i.e. `orig/release/{MAJOR}.{MINOR}` ### Making release branch specific changes These are examples of changes that should be made to release branches so that CI / tooling can function normally on them: -* Update target determinator to use release branch: - * Example: https://github.com/pytorch/pytorch/pull/40712 * Update backwards compatibility tests to use RC binaries instead of nightlies * Example: https://github.com/pytorch/pytorch/pull/40706 * A release branches should also be created in [`pytorch/xla`](https://github.com/pytorch/xla) and [`pytorch/builder`](https://github.com/pytorch/builder) repos and pinned in `pytorch/pytorch` - * Example: https://github.com/pytorch/pytorch/pull/65433 + * Example PR (CircleCI, to be removed): https://github.com/pytorch/pytorch/pull/65433 + * Example PR (GHA): https://github.com/pytorch/pytorch/pull/72739 These are examples of changes that should be made to the *default* branch after a release branch is cut @@ -56,6 +97,7 @@ These are examples of changes that should be made to the *default* branch after * Example: https://github.com/pytorch/pytorch/pull/65435 ### Getting CI signal on release branches: + Create a PR from `release/{MAJOR}.{MINOR}` to `orig/release/{MAJOR}.{MINOR}` in order to start CI testing for cherry-picks into release branch. Example: @@ -98,8 +140,11 @@ For fixes that are to go into a release after the release branch has been cut we An example of this would look like: * https://github.com/pytorch/pytorch/issues/51886 +Please also make sure to add a milestone target to the PR/issue, especially if it needs to be considered for inclusion into the dot release.
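To make the cherry-pick step above concrete, here is a minimal sketch (not the official tooling) of how a fix that has already landed on `master` might be cherry-picked onto a release branch and pushed for review; the release branch, remote names, working branch name, commit hash, and milestone below are placeholders.

```bash
# Minimal sketch of the cherry-pick workflow described above (placeholder names and hashes).
# Assumes the fix has already landed on master and that `upstream` points at pytorch/pytorch.
git fetch upstream
git checkout -b cherry-pick-my-fix upstream/release/1.11   # hypothetical release branch
git cherry-pick -x <sha-of-the-merged-fix>                 # -x records the original commit in the message
git push origin cherry-pick-my-fix
# Then open a PR against release/1.11 and attach the matching milestone (e.g. 1.11.1).
```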
+ **NOTE**: The cherry pick process is not an invitation to add new features, it is mainly there to fix regressions + ## Promoting RCs to Stable Promotion of RCs to stable is done with this script: @@ -113,6 +158,95 @@ Promotion should occur in two steps: **NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypa/warehouse/issues/726 for a discussion on potential draft releases within PyPI) +## Additional Steps to prepare for release day + +The following should be prepared for the release day: + +### Modify release matrix + +The release matrix for the get started page needs to be modified. See the following [PR](https://github.com/pytorch/pytorch.github.io/pull/959) as a reference. + +After modifying published_versions.json you will need to regenerate the quick-start-module.js file. Run the following command: +``` +python3 scripts/gen_quick_start_module.py >assets/quick-start-module.js +``` +Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the release candidate location described above in [Release Candidate Storage](RELEASE.md#release-candidate-storage). + +### Open Google Colab issue + +This is normally done right after the release is completed. We would need to create a Google Colab issue; see the following [issue](https://github.com/googlecolab/colabtools/issues/2372). + +# Patch Releases + +A patch release is a maintenance release of PyTorch that includes fixes for regressions found in a previous minor release. Patch releases will typically bump the `patch` version from semver (i.e. `[major].[minor].[patch]`). + +## Patch Release Criteria + +Patch releases should be considered if a regression meets the following criteria: + +1. Does the regression break core functionality (stable / beta features) including functionality in first party domain libraries? + * First party domain libraries: + * [pytorch/vision](https://github.com/pytorch/vision) + * [pytorch/audio](https://github.com/pytorch/audio) + * [pytorch/text](https://github.com/pytorch/text) +2. Is there no viable workaround? + * Can the regression be solved simply, or is it insurmountable? + +> *NOTE*: Patch releases should only be considered when functionality is broken; documentation issues do not typically fall within this category + +## Patch Release Process + +### Triage + +> Main POC: Triage Reviewers + +1. Tag issues / pull requests that are candidates for a potential patch release with `triage review` + * ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png) +2. Triage reviewers will then check if the regression / fix identified fits within the above mentioned [Patch Release Criteria](#patch-release-criteria) +3. Triage reviewers will then add the issue / pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria) + * ![adding to milestone](https://user-images.githubusercontent.com/1700823/131175980-148ff38d-44c3-4611-8a1f-cd2fd1f4c49d.png) + +### Building a release schedule / cherry picking + +> Main POC: Patch Release Managers + +1.
After regressions / fixes have been triaged, Patch Release Managers will work together and build / announce a schedule for the patch release + * *NOTE*: Ideally this should be ~2-3 weeks after a regression has been identified to allow other regressions to be identified +2. Patch Release Managers will work with the authors of the regressions / fixes to cherry pick their change into the related release branch (i.e. `release/1.9` for `1.9.1`) + +### Building Binaries / Promotion to Stable + +> Main POC: Patch Release Managers + +1. Patch Release Managers will follow the process of [Drafting RCs (Release Candidates)](#drafting-rcs-release-candidates) +2. Patch Release Managers will follow the process of [Promoting RCs to Stable](#promoting-rcs-to-stable) + +# Hardware / Software Support in Binary Build Matrix + +PyTorch has a support matrix across a couple of different axes. This section should be used as a decision-making framework to drive hardware / software support decisions. + +## Python + +For versions of Python that we support, we follow the [NEP 29 policy](https://numpy.org/neps/nep-0029-deprecation_policy.html), which was originally drafted by numpy. + +### TL;DR + +* All minor versions of Python released 42 months prior to the project, and at minimum the two latest minor versions. + +* All minor versions of numpy released in the 24 months prior to the project, and at minimum the last three minor versions. + +## Accelerator Software + +For accelerator software like CUDA and ROCm, we will typically use the following criteria: +* Support the latest 2 minor versions + +### Special support cases + +In some instances support for a particular version of software will continue if a need is found. For example, our CUDA 11 binaries do not currently meet +the size restrictions for publishing on PyPI, so the default version that is published to PyPI is CUDA 10.2. + +These special support cases will be handled on a case-by-case basis and support may be continued if current PyTorch maintainers feel as though there may still be a +need to support these particular versions of software. + # Special Topics ## Updating submodules for a release diff --git a/WORKSPACE b/WORKSPACE index 95eee3bdd494..fb15aad66cb8 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -33,6 +33,13 @@ http_archive( ], ) +http_archive( + name = "google_benchmark", + sha256 = "6132883bc8c9b0df5375b16ab520fac1a85dc9e4cf5be59480448ece74b278d4", + strip_prefix = "benchmark-1.6.1/", + urls = ["https://github.com/google/benchmark/archive/refs/tags/v1.6.1.tar.gz"], +) + http_archive( name = "pybind11_bazel", strip_prefix = "pybind11_bazel-7f397b5d2cc2434bbd651e096548f7b40c128044", diff --git a/android/README.md b/android/README.md index 002409c52349..d1d6bcd6aa3b 100644 --- a/android/README.md +++ b/android/README.md @@ -14,9 +14,16 @@ repositories { jcenter() } +# lite interpreter build dependencies { - implementation 'org.pytorch:pytorch_android:1.6.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' + implementation 'org.pytorch:pytorch_android_lite:1.10.0' + implementation 'org.pytorch:pytorch_android_torchvision_lite:1.10.0' +} + +# full jit build +dependencies { + implementation 'org.pytorch:pytorch_android:1.10.0' + implementation 'org.pytorch:pytorch_android_torchvision:1.10.0' } ``` @@ -32,6 +39,15 @@ repositories { } } +# lite interpreter build +dependencies { + ... + implementation 'org.pytorch:pytorch_android_lite:1.12.0-SNAPSHOT' + implementation 'org.pytorch:pytorch_android_torchvision_lite:1.12.0-SNAPSHOT' + ...
+} + +# full jit build dependencies { ... implementation 'org.pytorch:pytorch_android:1.12.0-SNAPSHOT' @@ -68,7 +84,7 @@ They are specified as environment variables: `ANDROID_HOME` - path to [Android SDK](https://developer.android.com/studio/command-line/sdkmanager.html) -`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk) +`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk). It's recommended to use NDK 21.x. `GRADLE_HOME` - path to [gradle](https://gradle.org/releases/) @@ -133,7 +149,7 @@ android { } dependencies { - extractForNativeBuild('org.pytorch:pytorch_android:1.6.0') + extractForNativeBuild('org.pytorch:pytorch_android:1.10.0') } task extractAARForNativeBuild { diff --git a/android/common.sh b/android/common.sh index ab1cb5ff43c7..1fee30bdc382 100644 --- a/android/common.sh +++ b/android/common.sh @@ -29,7 +29,8 @@ check_gradle() { } parse_abis_list() { - ABIS_LIST="x86" + # sync with https://github.com/pytorch/pytorch/blob/0ca0e02685a9d033ac4f04e2fa5c8ba6dbc5ae50/android/gradle.properties#L1 + ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64" CUSTOM_ABIS_LIST=false if [ $# -gt 0 ]; then ABIS_LIST=$1 diff --git a/android/pytorch_android/build.gradle b/android/pytorch_android/build.gradle index a65c0ffd436b..d10f6a305085 100644 --- a/android/pytorch_android/build.gradle +++ b/android/pytorch_android/build.gradle @@ -50,7 +50,17 @@ android { } androidTest { java { - exclude 'org/pytorch/PytorchHostTests.java' + if(System.env.BUILD_LITE_INTERPRETER == '0') { + println 'Build test for full jit (pytorch_jni)' + exclude 'org/pytorch/PytorchHostTests.java' + exclude 'org/pytorch/PytorchLiteInstrumentedTests.java' + exclude 'org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java' + } else { + println 'Build test for lite interpreter (pytorch_jni_lite)' + exclude 'org/pytorch/PytorchHostTests.java' + exclude 'org/pytorch/PytorchInstrumentedTests.java' + exclude 'org/pytorch/suite/PytorchInstrumentedTestSuite.java' + } } } } diff --git a/android/pytorch_android/generate_test_torchscripts.py b/android/pytorch_android/generate_test_torchscripts.py index 8b41fefc246e..909f824fb26d 100644 --- a/android/pytorch_android/generate_test_torchscripts.py +++ b/android/pytorch_android/generate_test_torchscripts.py @@ -1,4 +1,6 @@ import torch +from torch import Tensor +from typing import Dict, List, Tuple, Optional OUTPUT_DIR = "src/androidTest/assets/" @@ -7,7 +9,8 @@ def scriptAndSave(module, fileName): script_module = torch.jit.script(module) print(script_module.graph) outputFileName = OUTPUT_DIR + fileName - script_module.save(outputFileName) + # note that the lite interpreter model can also be used in full JIT + script_module._save_for_lite_interpreter(outputFileName) print("Saved to " + outputFileName) print('=' * 80) diff --git a/android/pytorch_android/host/build.gradle b/android/pytorch_android/host/build.gradle index 0f795f08657e..088d1b5ca420 100644 --- a/android/pytorch_android/host/build.gradle +++ b/android/pytorch_android/host/build.gradle @@ -25,6 +25,7 @@ sourceSets { java { srcDir '../src/androidTest/java' exclude '**/PytorchInstrumented*' + exclude '**/PytorchLiteInstrumented*' } resources.srcDirs = ["../src/androidTest/assets"] } diff --git a/android/pytorch_android/src/androidTest/assets/activation_ops.ptl b/android/pytorch_android/src/androidTest/assets/activation_ops.ptl new file mode 100644 index 000000000000..179f426ae7cd Binary files /dev/null and 
b/android/pytorch_android/src/androidTest/assets/activation_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/android_api_module.ptl b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl new file mode 100644 index 000000000000..df62dd862088 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/blas_lapack_ops.ptl b/android/pytorch_android/src/androidTest/assets/blas_lapack_ops.ptl new file mode 100644 index 000000000000..fea933ee644f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/blas_lapack_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/comparison_ops.ptl b/android/pytorch_android/src/androidTest/assets/comparison_ops.ptl new file mode 100644 index 000000000000..01b1c153e751 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/comparison_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/convolution_ops.ptl b/android/pytorch_android/src/androidTest/assets/convolution_ops.ptl new file mode 100644 index 000000000000..db253a207a33 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/convolution_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/distance_function_ops.ptl b/android/pytorch_android/src/androidTest/assets/distance_function_ops.ptl new file mode 100644 index 000000000000..cc4d994f440a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/distance_function_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/dropout_ops.ptl b/android/pytorch_android/src/androidTest/assets/dropout_ops.ptl new file mode 100644 index 000000000000..422c2f60e6be Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/dropout_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/dynamic_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/dynamic_quant_ops.ptl new file mode 100644 index 000000000000..0bbbce9671c3 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/dynamic_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/fused_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/fused_quant_ops.ptl new file mode 100644 index 000000000000..9d2b3f9dde1a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/fused_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/general_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/general_quant_ops.ptl new file mode 100644 index 000000000000..7d4888e0bc81 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/general_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/linear_ops.ptl b/android/pytorch_android/src/androidTest/assets/linear_ops.ptl new file mode 100644 index 000000000000..ca9066c03dc4 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/linear_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/loss_function_ops.ptl b/android/pytorch_android/src/androidTest/assets/loss_function_ops.ptl new file mode 100644 index 000000000000..4c0592e5485a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/loss_function_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/mobilenet_v2.ptl 
b/android/pytorch_android/src/androidTest/assets/mobilenet_v2.ptl new file mode 100644 index 000000000000..9b8297a250d3 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/mobilenet_v2.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/nn_utils_ops.ptl b/android/pytorch_android/src/androidTest/assets/nn_utils_ops.ptl new file mode 100644 index 000000000000..5d008eab03b9 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/nn_utils_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/normalization_ops.ptl b/android/pytorch_android/src/androidTest/assets/normalization_ops.ptl new file mode 100644 index 000000000000..d85bd06c763b Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/normalization_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/other_math_ops.ptl b/android/pytorch_android/src/androidTest/assets/other_math_ops.ptl new file mode 100644 index 000000000000..7209c3b3bd1f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/other_math_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/padding_ops.ptl b/android/pytorch_android/src/androidTest/assets/padding_ops.ptl new file mode 100644 index 000000000000..02e57ba20712 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/padding_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/pointwise_ops.ptl b/android/pytorch_android/src/androidTest/assets/pointwise_ops.ptl new file mode 100644 index 000000000000..948ed4832660 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/pointwise_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/pooling_ops.ptl b/android/pytorch_android/src/androidTest/assets/pooling_ops.ptl new file mode 100644 index 000000000000..df051163413f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/pooling_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/recurrent_ops.ptl b/android/pytorch_android/src/androidTest/assets/recurrent_ops.ptl new file mode 100644 index 000000000000..245ceb454d53 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/recurrent_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/reduction_ops.ptl b/android/pytorch_android/src/androidTest/assets/reduction_ops.ptl new file mode 100644 index 000000000000..13771302c668 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/reduction_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/sampling_ops.ptl b/android/pytorch_android/src/androidTest/assets/sampling_ops.ptl new file mode 100644 index 000000000000..416be7cb1279 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/sampling_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/shuffle_ops.ptl b/android/pytorch_android/src/androidTest/assets/shuffle_ops.ptl new file mode 100644 index 000000000000..5e5520118764 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/shuffle_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/sparse_ops.ptl b/android/pytorch_android/src/androidTest/assets/sparse_ops.ptl new file mode 100644 index 000000000000..a16f68f8f95f Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/sparse_ops.ptl differ diff --git 
a/android/pytorch_android/src/androidTest/assets/spectral_ops.ptl b/android/pytorch_android/src/androidTest/assets/spectral_ops.ptl new file mode 100644 index 000000000000..9828dd2ba901 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/spectral_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/static_quant_ops.ptl b/android/pytorch_android/src/androidTest/assets/static_quant_ops.ptl new file mode 100644 index 000000000000..d0a0a254d1ef Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/static_quant_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_creation_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_creation_ops.ptl new file mode 100644 index 000000000000..d897b43cd36c Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_creation_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_general_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_general_ops.ptl new file mode 100644 index 000000000000..6f2855ea83ea Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_general_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_indexing_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_indexing_ops.ptl new file mode 100644 index 000000000000..ac9cb8c4b94a Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_indexing_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_typing_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_typing_ops.ptl new file mode 100644 index 000000000000..3e2f4d8cc689 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_typing_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/tensor_view_ops.ptl b/android/pytorch_android/src/androidTest/assets/tensor_view_ops.ptl new file mode 100644 index 000000000000..5e2dc8294842 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/tensor_view_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/test.pt b/android/pytorch_android/src/androidTest/assets/test.pt index 375ade9bc913..016b6d666a2a 100644 Binary files a/android/pytorch_android/src/androidTest/assets/test.pt and b/android/pytorch_android/src/androidTest/assets/test.pt differ diff --git a/android/pytorch_android/src/androidTest/assets/torchscript_builtin_ops.ptl b/android/pytorch_android/src/androidTest/assets/torchscript_builtin_ops.ptl new file mode 100644 index 000000000000..2d2532df2fd2 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/torchscript_builtin_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/torchscript_collection_ops.ptl b/android/pytorch_android/src/androidTest/assets/torchscript_collection_ops.ptl new file mode 100644 index 000000000000..ce434b3b4210 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/torchscript_collection_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/transformer_ops.ptl b/android/pytorch_android/src/androidTest/assets/transformer_ops.ptl new file mode 100644 index 000000000000..ebb2bd693604 Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/transformer_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/assets/vision_function_ops.ptl 
b/android/pytorch_android/src/androidTest/assets/vision_function_ops.ptl new file mode 100644 index 000000000000..c9c45655e2bc Binary files /dev/null and b/android/pytorch_android/src/androidTest/assets/vision_function_ops.ptl differ diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java index bc406dc9ae74..afdde74c5bde 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java @@ -10,7 +10,11 @@ public class PytorchHostTests extends PytorchTestBase { @Override - protected String assetFilePath(String assetName) throws IOException { + protected Module loadModel(String path) throws IOException { + return Module.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { Path tempFile = Files.createTempFile("test", ".pt"); try (InputStream resource = Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("test.pt"))) { diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java index bae01e394025..20c30d1587c8 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java @@ -14,7 +14,11 @@ public class PytorchInstrumentedTests extends PytorchTestBase { @Override - protected String assetFilePath(String assetName) throws IOException { + protected Module loadModel(String path) throws IOException { + return Module.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); File file = new File(appContext.getFilesDir(), assetName); if (file.exists() && file.length() > 0) { diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java new file mode 100644 index 000000000000..bc62270a6fa8 --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java @@ -0,0 +1,42 @@ +package org.pytorch; + +import android.content.Context; +import androidx.test.InstrumentationRegistry; +import androidx.test.runner.AndroidJUnit4; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import org.junit.runner.RunWith; + +@RunWith(AndroidJUnit4.class) +public class PytorchLiteInstrumentedTests extends PytorchTestBase { + + @Override + protected Module loadModel(String path) throws IOException { + return LiteModuleLoader.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { + final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); + File file = new File(appContext.getFilesDir(), assetName); + if (file.exists() && file.length() > 0) { + return file.getAbsolutePath(); + } + + try (InputStream is = appContext.getAssets().open(assetName)) { + try (OutputStream os = new FileOutputStream(file)) { + byte[] buffer = new byte[4 * 1024]; + int read; + while ((read = is.read(buffer)) != -1) { + 
os.write(buffer, 0, read); + } + os.flush(); + } + return file.getAbsolutePath(); + } catch (IOException e) { + throw e; + } + } +} diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java index 2817ae1bbd09..9abcbcbda8a6 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java @@ -12,11 +12,11 @@ import org.junit.Test; public abstract class PytorchTestBase { - private static final String TEST_MODULE_ASSET_NAME = "test.pt"; + private static final String TEST_MODULE_ASSET_NAME = "android_api_module.ptl"; @Test public void testForwardNull() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue input = IValue.from(Tensor.fromBlob(Tensor.allocateByteBuffer(1), new long[] {1})); assertTrue(input.isTensor()); final IValue output = module.forward(input); @@ -25,7 +25,7 @@ public void testForwardNull() throws IOException { @Test public void testEqBool() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (boolean value : new boolean[] {false, true}) { final IValue input = IValue.from(value); assertTrue(input.isBool()); @@ -38,7 +38,7 @@ public void testEqBool() throws IOException { @Test public void testEqInt() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) { final IValue input = IValue.from(value); assertTrue(input.isLong()); @@ -51,7 +51,7 @@ public void testEqInt() throws IOException { @Test public void testEqFloat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); double[] values = new double[] { -Double.MAX_VALUE, @@ -86,7 +86,7 @@ public void testEqTensor() throws IOException { } final Tensor inputTensor = Tensor.fromBlob(inputTensorData, inputTensorShape); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue input = IValue.from(inputTensor); assertTrue(input.isTensor()); assertTrue(inputTensor == input.toTensor()); @@ -103,7 +103,7 @@ public void testEqTensor() throws IOException { @Test public void testEqDictIntKeyIntValue() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final Map inputMap = new HashMap<>(); inputMap.put(Long.MIN_VALUE, IValue.from(-Long.MIN_VALUE)); @@ -127,7 +127,7 @@ public void testEqDictIntKeyIntValue() throws IOException { @Test public void testEqDictStrKeyIntValue() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final Map inputMap = new HashMap<>(); inputMap.put("long_min_value", IValue.from(Long.MIN_VALUE)); @@ -151,7 +151,7 @@ public void testEqDictStrKeyIntValue() throws IOException { @Test public void testListIntSumReturnTuple() throws IOException { - final Module module = 
Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (int n : new int[] {0, 1, 128}) { long[] a = new long[n]; @@ -178,7 +178,7 @@ public void testListIntSumReturnTuple() throws IOException { @Test public void testOptionalIntIsNone() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); assertFalse(module.runMethod("optionalIntIsNone", IValue.from(1l)).toBool()); assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).toBool()); @@ -186,7 +186,7 @@ public void testOptionalIntIsNone() throws IOException { @Test public void testIntEq0None() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); assertTrue(module.runMethod("intEq0None", IValue.from(0l)).isNull()); assertTrue(module.runMethod("intEq0None", IValue.from(1l)).toLong() == 1l); @@ -194,7 +194,7 @@ public void testIntEq0None() throws IOException { @Test(expected = IllegalArgumentException.class) public void testRunUndefinedMethod() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); module.runMethod("test_undefined_method_throws_exception"); } @@ -241,7 +241,7 @@ public void testTensorIllegalStateOnWrongType() { @Test public void testEqString() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); String[] values = new String[] { "smoketest", @@ -260,7 +260,7 @@ public void testEqString() throws IOException { @Test public void testStr3Concat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); String[] values = new String[] { "smoketest", @@ -281,7 +281,7 @@ public void testStr3Concat() throws IOException { @Test public void testEmptyShape() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final long someNumber = 43; final IValue input = IValue.from(Tensor.fromBlob(new long[] {someNumber}, new long[] {})); final IValue output = module.runMethod("newEmptyShapeWithItem", input); @@ -293,7 +293,7 @@ public void testEmptyShape() throws IOException { @Test public void testAliasWithOffset() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue output = module.runMethod("testAliasWithOffset"); assertTrue(output.isTensorList()); Tensor[] tensors = output.toTensorList(); @@ -303,7 +303,7 @@ public void testAliasWithOffset() throws IOException { @Test public void testNonContiguous() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue output = module.runMethod("testNonContiguous"); assertTrue(output.isTensor()); Tensor value = output.toTensor(); @@ -316,7 +316,7 @@ public void testChannelsLast() throws IOException { long[] inputShape = new long[] {1, 3, 2, 2}; long[] data = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104}; Tensor inputNHWC = Tensor.fromBlob(data, inputShape, MemoryFormat.CHANNELS_LAST); - final Module 
module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHW = module.runMethod("contiguous", IValue.from(inputNHWC)); assertIValueTensor( outputNCHW, @@ -334,7 +334,7 @@ public void testChannelsLast3d() throws IOException { long[] dataNHWDC = new long[] {1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16}; Tensor inputNHWDC = Tensor.fromBlob(dataNHWDC, shape, MemoryFormat.CHANNELS_LAST_3D); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHWD = module.runMethod("contiguous", IValue.from(inputNHWDC)); assertIValueTensor(outputNCHWD, MemoryFormat.CONTIGUOUS, shape, dataNCHWD); @@ -358,7 +358,7 @@ public void testChannelsLastConv2d() throws IOException { long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1}; Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHW = module.runMethod("conv2d", IValue.from(inputNCHW), IValue.from(wNCHW), IValue.from(false)); @@ -377,6 +377,186 @@ public void testChannelsLastConv2d() throws IOException { new long[] {2, 11, -101, 4, 12, -102, 6, 13, -103, 8, 14, -104}); } + @Test + public void testMobileNetV2() throws IOException { + try { + final Module module = loadModel("mobilenet_v2.ptl"); + final IValue inputs = module.runMethod("get_all_bundled_inputs"); + assertTrue(inputs.isList()); + final IValue input = inputs.toList()[0]; + assertTrue(input.isTuple()); + module.forward(input.toTuple()[0]); + assertTrue(true); + } catch (Exception ex) { + assertTrue("failed to run MobileNetV2 " + ex.getMessage(), false); + } + } + + @Test + public void testPointwiseOps() throws IOException { + runModel("pointwise_ops"); + } + + @Test + public void testReductionOps() throws IOException { + runModel("reduction_ops"); + } + + @Test + public void testComparisonOps() throws IOException { + runModel("comparison_ops"); + } + + @Test + public void testOtherMathOps() throws IOException { + runModel("other_math_ops"); + } + + @Test + public void testSpectralOps() throws IOException { + runModel("spectral_ops"); + } + + @Test + public void testBlasLapackOps() throws IOException { + runModel("blas_lapack_ops"); + } + + @Test + public void testSamplingOps() throws IOException { + runModel("sampling_ops"); + } + + @Test + public void testTensorOps() throws IOException { + runModel("tensor_general_ops"); + } + + @Test + public void testTensorCreationOps() throws IOException { + runModel("tensor_creation_ops"); + } + + @Test + public void testTensorIndexingOps() throws IOException { + runModel("tensor_indexing_ops"); + } + + @Test + public void testTensorTypingOps() throws IOException { + runModel("tensor_typing_ops"); + } + + @Test + public void testTensorViewOps() throws IOException { + runModel("tensor_view_ops"); + } + + @Test + public void testConvolutionOps() throws IOException { + runModel("convolution_ops"); + } + + @Test + public void testPoolingOps() throws IOException { + runModel("pooling_ops"); + } + + @Test + public void testPaddingOps() throws IOException { + runModel("padding_ops"); + } + + @Test + public void testActivationOps() throws IOException { + runModel("activation_ops"); + } + + @Test + public void testNormalizationOps() throws IOException { + runModel("normalization_ops"); 
+ } + + @Test + public void testRecurrentOps() throws IOException { + runModel("recurrent_ops"); + } + + @Test + public void testTransformerOps() throws IOException { + runModel("transformer_ops"); + } + + @Test + public void testLinearOps() throws IOException { + runModel("linear_ops"); + } + + @Test + public void testDropoutOps() throws IOException { + runModel("dropout_ops"); + } + + @Test + public void testSparseOps() throws IOException { + runModel("sparse_ops"); + } + + @Test + public void testDistanceFunctionOps() throws IOException { + runModel("distance_function_ops"); + } + + @Test + public void testLossFunctionOps() throws IOException { + runModel("loss_function_ops"); + } + + @Test + public void testVisionFunctionOps() throws IOException { + runModel("vision_function_ops"); + } + + @Test + public void testShuffleOps() throws IOException { + runModel("shuffle_ops"); + } + + @Test + public void testNNUtilsOps() throws IOException { + runModel("nn_utils_ops"); + } + + @Test + public void testQuantOps() throws IOException { + runModel("general_quant_ops"); + } + + @Test + public void testDynamicQuantOps() throws IOException { + runModel("dynamic_quant_ops"); + } + + @Test + public void testStaticQuantOps() throws IOException { + runModel("static_quant_ops"); + } + + @Test + public void testFusedQuantOps() throws IOException { + runModel("fused_quant_ops"); + } + + @Test + public void testTorchScriptBuiltinQuantOps() throws IOException { + runModel("torchscript_builtin_ops"); + } + + @Test + public void testTorchScriptCollectionQuantOps() throws IOException { + runModel("torchscript_collection_ops"); + } + static void assertIValueTensor( final IValue ivalue, final MemoryFormat memoryFormat, @@ -389,5 +569,15 @@ static void assertIValueTensor( assertArrayEquals(expectedData, t.getDataAsLongArray()); } - protected abstract String assetFilePath(String assetName) throws IOException; + void runModel(final String name) throws IOException { + final Module storage_module = loadModel(name + ".ptl"); + storage_module.forward(); + + // TODO enable this once the on-the-fly script is ready + // final Module on_the_fly_module = loadModel(name + "_temp.ptl"); + // on_the_fly_module.forward(); + assertTrue(true); + } + + protected abstract Module loadModel(String assetName) throws IOException; } diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java b/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java new file mode 100644 index 000000000000..a494ffc663ff --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java @@ -0,0 +1,9 @@ +package org.pytorch.suite; + +import org.junit.runner.RunWith; +import org.junit.runners.Suite; +import org.pytorch.PytorchLiteInstrumentedTests; + +@RunWith(Suite.class) +@Suite.SuiteClasses({PytorchLiteInstrumentedTests.class}) +public class PytorchLiteInstrumentedTestSuite {} diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp index 8094f7bdc974..5ed0c9978e83 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp @@ -223,7 +223,8 @@ class TensorHybrid : public facebook::jni::HybridClass { } else { facebook::jni::throwNewJavaException( facebook::jni::gJavaLangIllegalArgumentException, - "at::Tensor scalar type is not supported on java 
side"); + "at::Tensor scalar type %s is not supported on java side", + c10::toString(scalarType)); } const auto& tensorShape = tensor.sizes(); diff --git a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java index 7e0f6a41d868..83a7c021bf6a 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java +++ b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java @@ -23,7 +23,7 @@ * methods. * *

When constructing {@code Tensor} objects with {@code data} as an array, it is not specified - * whether this data is is copied or retained as a reference so it is recommended not to modify it + * whether this data is copied or retained as a reference so it is recommended not to modify it * after constructing. {@code data} passed as a {@link Buffer} is not copied, so it can be modified * between {@link Module} calls to avoid reallocation. Data retrieved from {@code Tensor} objects * may be copied or may be a reference to the {@code Tensor}'s internal data buffer. {@code shape} diff --git a/android/run_tests.sh b/android/run_tests.sh index a96177f072b7..839ee209c7b7 100755 --- a/android/run_tests.sh +++ b/android/run_tests.sh @@ -48,4 +48,9 @@ fi echo "Waiting for emulator boot completed" $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; do sleep 1; done;' -$GRADLE_PATH -PABI_FILTERS=x86 -p $PYTORCH_ANDROID_DIR connectedAndroidTest +{ + $GRADLE_PATH -PABI_FILTERS=x86 -p $PYTORCH_ANDROID_DIR connectedAndroidTest +} || { + echo "::error::Check https://github.com/pytorch/pytorch/tree/master/test/mobile/model_test to see how to fix the failed mobile test" + exit 1 +} diff --git a/aten.bzl b/aten.bzl index eccdb4b4d0cd..c97f22284f10 100644 --- a/aten.bzl +++ b/aten.bzl @@ -1,5 +1,6 @@ load("@bazel_skylib//lib:paths.bzl", "paths") load("@rules_cc//cc:defs.bzl", "cc_library") +load("//:tools/build_variables.bzl", "aten_ufunc_headers") CPU_CAPABILITY_NAMES = ["DEFAULT", "AVX2"] CAPABILITY_COMPILER_FLAGS = { @@ -8,8 +9,9 @@ CAPABILITY_COMPILER_FLAGS = { } PREFIX = "aten/src/ATen/native/" +EXTRA_PREFIX = "aten/src/ATen/" -def intern_build_aten_ops(copts, deps): +def intern_build_aten_ops(copts, deps, extra_impls): for cpu_capability in CPU_CAPABILITY_NAMES: srcs = [] for impl in native.glob( @@ -28,6 +30,17 @@ def intern_build_aten_ops(copts, deps): ) srcs.append(out) + for impl in extra_impls: + name = impl.replace(EXTRA_PREFIX, "") + out = EXTRA_PREFIX + name + "." + cpu_capability + ".cpp" + native.genrule( + name = name + "_" + cpu_capability + "_cp", + srcs = [impl], + outs = [out], + cmd = "cp $< $@", + ) + srcs.append(out) + cc_library( name = "ATen_CPU_" + cpu_capability, srcs = srcs, @@ -81,3 +94,32 @@ generate_aten = rule( "srcs": attr.label_list(allow_files = True), }, ) + +# copy pasted from ufunc_defs.bzl, as ufuncs_defs.bzl cannot be included +# from BUILD.bazel because it has a directory relative load, and Bazel +# always load from workspace root. The "correct" fix would be to move +# build_variables.bzl to the top level but I don't have time to do this at +# the moment. 
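+#
+# Illustrative usage sketch (hypothetical gencode prefix and variable name) for
+# the helpers defined below, e.g. from a BUILD file:
+#
+#   load("//:aten.bzl", "aten_ufunc_generated_cpu_sources")
+#   ufunc_cpu_srcs = aten_ufunc_generated_cpu_sources("aten/src/ATen/{}")
+#
+# Each entry in aten_ufunc_headers (say "add.h") contributes a generated source
+# name such as "UfuncCPU_add.cpp", which is then formatted through the
+# caller-supplied gencode_pattern.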
+ +aten_ufunc_names = [ + paths.split_extension(paths.basename(h))[0] + for h in aten_ufunc_headers +] + +def aten_ufunc_generated_cpu_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPU_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + +def aten_ufunc_generated_cpu_kernel_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPUKernel_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + +def aten_ufunc_generated_cuda_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCUDA_{}.cu".format(n) + for n in aten_ufunc_names + ]] diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 9344f7e9b870..9c3757f346cd 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -23,6 +23,7 @@ set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) set(ATen_CUDA_CPP_SRCS) set(ATen_CUDA_CU_SRCS) +set(ATen_CUDA_LINALG_SRCS) set(ATen_CUDA_SRCS_W_SORT_BY_KEY) set(ATen_CUDA_TEST_SRCS) set(ATen_CUDA_INCLUDE) @@ -31,6 +32,7 @@ set(ATen_HIP_SRCS) set(ATen_HIP_SRCS_W_SORT_BY_KEY) set(ATen_HIP_TEST_SRCS) set(ATen_HIP_INCLUDE) +set(ATen_MPS_SRCS) set(ATen_VULKAN_TEST_SRCS) set(ATen_CPU_DEPENDENCY_LIBS) set(ATen_CUDA_DEPENDENCY_LIBS) @@ -99,9 +101,11 @@ set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) +set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) +set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/BatchedTensorImpl.cpp b/aten/src/ATen/BatchedTensorImpl.cpp index 2b4898412aec..d5ab588de53d 100644 --- a/aten/src/ATen/BatchedTensorImpl.cpp +++ b/aten/src/ATen/BatchedTensorImpl.cpp @@ -17,7 +17,7 @@ BatchedTensorImpl::BatchedTensorImpl(Tensor value, BatchDims bdims) { TORCH_INTERNAL_ASSERT(value_.defined()); set_storage_access_should_throw(); - set_has_contiguity_policy(HasContiguityPolicy::CustomBehavior); + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); checkInvariants(); const auto public_dims = value_.dim() - bdims_.size(); @@ -77,6 +77,13 @@ void BatchedTensorImpl::checkInvariants() const { } // The following are publically exposed as methods of Tensor + +IntArrayRef BatchedTensorImpl::strides_custom() const { + return strides_default(); +} + +// TODO: implement proper contiguity on batched tensor, then put +// sizes_strides_policy back to Default bool BatchedTensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { TORCH_CHECK(memory_format == MemoryFormat::Contiguous, "NYI: querying is_contiguous inside of vmap for memory_format ", diff --git a/aten/src/ATen/BatchedTensorImpl.h b/aten/src/ATen/BatchedTensorImpl.h index ce59fcd20947..0c025aa01b35 100644 --- a/aten/src/ATen/BatchedTensorImpl.h +++ b/aten/src/ATen/BatchedTensorImpl.h @@ -72,6 +72,8 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { // bt.actualDim(2) -> Error int64_t actualDim(int64_t dim, bool wrap_dim = true) const; + // We have to override this because we opted 
into CustomStrides + IntArrayRef strides_custom() const override; // Override a bunch of methods inherited from TensorImpl to return error messages. bool is_contiguous_custom(at::MemoryFormat memory_format) const override; void set_size(int64_t dim, int64_t new_size) override; diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 0eb0d697078e..b1b082a4f88a 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -181,6 +181,11 @@ Tensor expand_batching_rule(const Tensor& self, IntArrayRef size, bool implicit) return self_physical.getPhysicalToLogicalMap().apply(result); } +Tensor expand_batching_rule_symint(const Tensor& self, SymIntArrayRef psize, bool implicit) { + return expand_batching_rule(self, expectIntArrayRef(psize), implicit); +} + + std::vector chunk_batching_rule(const Tensor& self, int64_t chunks, int64_t dim) { auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); auto dim_physical = self_physical.getPhysicalDim(dim); @@ -1088,6 +1093,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("tensor_split.indices", tensor_split_indices_batching_rule); m.impl("diagonal", diagonal_batching_rule); m.impl("expand", expand_batching_rule); + m.impl("expand.SymInt", expand_batching_rule_symint); m.impl("expand_as", native::expand_as); // composite wrt autograd m.impl("movedim.intlist", movedim_batching_rule); m.impl("movedim.int", static_cast(native::movedim)); // composite wrt autograd @@ -1105,6 +1111,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("select.int", select_batching_rule); m.impl("slice.Tensor", slice_batching_rule); m.impl("split.Tensor", split_batching_rule); + m.impl("split.sizes", split_with_sizes_batching_rule); m.impl("split_with_sizes", split_with_sizes_batching_rule); m.impl("squeeze", squeeze_batching_rule); m.impl("squeeze.dim", squeeze_dim_batching_rule); diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index f259e345f96d..63a4cd76c2bb 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -100,10 +100,20 @@ file(GLOB native_ao_sparse_cpp "native/ao_sparse/cpu/*.cpp" "native/ao_sparse/quantized/*.cpp" "native/ao_sparse/quantized/cpu/*.cpp") +# MPS +file(GLOB mps_cpp "mps/*.cpp") +file(GLOB mps_mm "mps/*.mm") +file(GLOB mps_h "mps/*.h") +file(GLOB_RECURSE native_mps_cpp "native/mps/*.cpp") +file(GLOB_RECURSE native_mps_mm "native/mps/*.mm") +file(GLOB_RECURSE native_mps_h "native/mps/*.h") + file(GLOB native_sparse_cpp "native/sparse/*.cpp") file(GLOB native_quantized_cpp "native/quantized/*.cpp" "native/quantized/cpu/*.cpp") +file(GLOB native_nested_cpp "native/nested/*.cpp") +file(GLOB native_transformers_cpp "native/transformers/*.cpp") file(GLOB native_h "native/*.h") file(GLOB native_ao_sparse_h @@ -120,21 +130,30 @@ file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") file(GLOB native_cuda_linalg_cpp "native/cuda/linalg/*.cpp") file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") +file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") +file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") file(GLOB native_quantized_cuda_cu "native/quantized/cuda/*.cu") file(GLOB native_quantized_cuda_cpp "native/quantized/cuda/*.cpp") file(GLOB native_quantized_cudnn_cpp "native/quantized/cudnn/*.cpp") +file(GLOB 
native_transformers_cuda_cu "native/transformers/cuda/*.cu") +file(GLOB native_transformers_cuda_cpp "native/transformers/cuda/*.cpp") file(GLOB native_hip_hip "native/hip/*.hip") file(GLOB native_hip_cpp "native/hip/*.cpp") file(GLOB native_hip_linalg_cpp "native/hip/linalg/*.cpp") file(GLOB native_miopen_cpp "native/miopen/*.cpp") file(GLOB native_cudnn_hip_cpp "native/cudnn/hip/*.cpp") +file(GLOB native_nested_hip_hip "native/nested/hip/*.hip") +file(GLOB native_nested_hip_cpp "native/nested/hip/*.cpp") file(GLOB native_sparse_hip_hip "native/sparse/hip/*.hip") file(GLOB native_sparse_hip_cpp "native/sparse/hip/*.cpp") file(GLOB native_quantized_hip_hip "native/quantized/hip/*.hip") file(GLOB native_quantized_hip_cpp "native/quantized/hip/*.cpp") +file(GLOB native_transformers_hip_hip "native/transformers/hip/*.hip") +file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp") +file(GLOB native_quantized_cudnn_hip_cpp "native/quantized/cudnn/hip/*.cpp") file(GLOB native_utils_cpp "native/utils/*.cpp") # XNNPACK @@ -155,13 +174,17 @@ if(BUILD_LITE_INTERPRETER) else() set( all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} - ${native_ao_sparse_cpp} ${native_sparse_cpp} + ${native_ao_sparse_cpp} ${native_sparse_cpp} ${native_nested_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} + ${native_transformers_cpp} ${native_utils_cpp} ${native_xnnpack} ${generated_sources} ${core_generated_sources} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${ATen_NNAPI_SRCS} ${cpu_kernel_cpp} ) endif() +if(USE_LIGHTWEIGHT_DISPATCH) + set(all_cpu_cpp ${all_cpu_cpp} ${generated_unboxing_sources}) +endif() if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -194,9 +217,10 @@ if(USE_CUDA) list(APPEND ATen_CUDA_CU_SRCS ${cuda_cu} ${native_cuda_cu} - ${native_cuda_linalg_cpp} + ${native_nested_cuda_cu} ${native_sparse_cuda_cu} ${native_quantized_cuda_cu} + ${native_transformers_cuda_cu} ${cuda_generated_sources} ) list(APPEND ATen_CUDA_CPP_SRCS @@ -204,10 +228,16 @@ if(USE_CUDA) ${native_cuda_cpp} ${native_cudnn_cpp} ${native_miopen_cpp} + ${native_nested_cuda_cpp} ${native_quantized_cuda_cpp} ${native_quantized_cudnn_cpp} ${native_sparse_cuda_cpp} + ${native_transformers_cuda_cpp} ) + set(ATen_CUDA_LINALG_SRCS ${native_cuda_linalg_cpp}) + if(NOT BUILD_LAZY_CUDA_LINALG) + list(APPEND ATen_CUDA_CU_SRCS ${native_cuda_linalg_cpp}) + endif() if(CAFFE2_USE_CUDNN) list(APPEND ATen_CUDA_CPP_SRCS ${cudnn_cpp}) endif() @@ -225,9 +255,9 @@ endif() if(USE_ROCM) list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip) - set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_sparse_hip_hip} ${native_quantized_hip_hip}) + set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_nested_hip_hip} ${native_sparse_hip_hip} ${native_quantized_hip_hip} ${native_transformers_hip_hip}) # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources) - set(all_hip_cpp ${native_sparse_hip_cpp} ${native_quantized_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${native_hip_linalg_cpp} ${cuda_generated_sources} ${ATen_HIP_SRCS}) + set(all_hip_cpp ${native_nested_hip_cpp} ${native_sparse_hip_cpp} ${native_quantized_hip_cpp} ${native_transformers_hip_cpp} ${native_quantized_cudnn_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${native_hip_linalg_cpp} ${cuda_generated_sources} ${ATen_HIP_SRCS}) set(all_hip_cpp ${native_miopen_cpp} ${native_cudnn_hip_cpp} ${miopen_cpp} ${all_hip_cpp}) endif() @@ -392,16 +422,24 @@ if(USE_CUDA AND NOT 
USE_ROCM) ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand_static.a ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas_static.a ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcufft_static_nocallback.a - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a - ${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a # needed for libcusolver_static ) + if(NOT BUILD_LAZY_CUDA_LINALG) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver_static.a + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/liblapack_static.a # needed for libcusolver_static + ) + endif() else() list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} ) + if(NOT BUILD_LAZY_CUDA_LINALG) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_cusolver_LIBRARY} + ) + endif() endif() if(CAFFE2_USE_CUDNN) @@ -415,9 +453,9 @@ endif() if(USE_MAGMA) - if(USE_CUDA) + if(USE_CUDA AND NOT BUILD_LAZY_CUDA_LINALG) list(APPEND ATen_CUDA_DEPENDENCY_LIBS torch::magma) - endif(USE_CUDA) + endif(USE_CUDA AND NOT BUILD_LAZY_CUDA_LINALG) if(USE_ROCM) list(APPEND ATen_HIP_DEPENDENCY_LIBS torch::magma) endif(USE_ROCM) @@ -455,6 +493,10 @@ if(USE_CUDA) list(APPEND ATen_CUDA_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) endif() +if(USE_MPS) + set(ATen_MPS_SRCS ${ATen_MPS_SRCS} ${mps_cpp} ${mps_mm} ${mps_h} ${native_mps_cpp} ${native_mps_mm} ${native_mps_h}) +endif() + if(USE_ROCM) set(ATen_HIP_SRCS ${all_hip_cpp}) # caffe2_nvrtc's stubs to driver APIs are useful for HIP. @@ -536,10 +578,12 @@ set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) +set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) +set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 3c0aa1c6bfc9..b0d15988b95c 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -69,6 +69,7 @@ struct strided_tensor_iter_fixed { strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default; strided_tensor_iter_fixed(Tensor& tensor, bool sort_strides = false) : data_(tensor.data_ptr()) { + (void)sort_strides; // Suppress unused variable warning std::memset(counter_, 0, sizeof(int64_t) * N); if (tensor.dim() > 0) { std::memcpy( @@ -152,7 +153,7 @@ inline int64_t _max_dim_tensors(ArrayRef tensors) { return dim; } -inline void iterate(int64_t size){}; +inline void iterate(int64_t /*size*/){}; template inline void iterate(int64_t size, Arg& iter, Args&... iter_tail) { @@ -199,7 +200,7 @@ inline void iterate_overflow(Arg& iter, Args&... iter_tail) { iterate_overflow(iter_tail...); } -inline void forward(int64_t offset){}; +inline void forward(int64_t /*offset*/){}; template inline void forward(int64_t offset, Arg& iter, Args&... 
iter_tail) { diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 98590b266be4..afbb2ee7200a 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -4,7 +4,10 @@ #include #include +#include +#include +#include #include #include #include @@ -138,11 +141,44 @@ void Context::setBenchmarkCuDNN(bool b) { } bool Context::allowTF32CuBLAS() const { - return allow_tf32_cublas; + static bool allow_tf32_cublas_override = c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true; + return allow_tf32_cublas_override || float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; } void Context::setAllowTF32CuBLAS(bool b) { - allow_tf32_cublas = b; + float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; +} + +Float32MatmulPrecision Context::float32MatmulPrecision() const { + return float32_matmul_precision; +} + +void Context::setFloat32MatmulPrecision(Float32MatmulPrecision p) { + float32_matmul_precision = p; +} + +void Context::setFloat32MatmulPrecision(const std::string &s) { + auto match = [this](const std::string & s_) { + // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention + if (s_ == "highest") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; + return true; + } else if (s_ == "high") { + float32_matmul_precision = at::Float32MatmulPrecision::HIGH; + return true; + } else if (s_ == "medium") { + float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; + return true; + } + return false; + }; + if (match(s)) { return; } + std::string sl; + std::transform(s.begin(), s.end(), sl.begin(), + [](unsigned char c) -> unsigned char { return std::tolower(c); }); + if (match(sl)) { return; } + TORCH_WARN(s, " is not one of 'highest', 'high', or 'medium'; the current" + "setFloat32MatmulPrecision call has no effect."); } at::LinalgBackend Context::linalgPreferredBackend() const { @@ -188,6 +224,22 @@ bool Context::hasMKLDNN() { #endif } +bool Context::hasMPS() { +#if defined(__APPLE__) +#if __is_target_os(macOS) + if (__builtin_available(macOS 12.3, *)) { + return c10::impl::hasDeviceGuardImpl(at::DeviceType::MPS); + } else { + return false; + } +#else + return false; +#endif +#else + return false; +#endif +} + bool Context::hasOpenMP() { #ifdef _OPENMP return true; @@ -236,6 +288,10 @@ const std::vector& Context::supportedQEngines() { engines.push_back(at::kNoQEngine); #endif // C10_MOBILE +#if AT_MKLDNN_ENABLED() + engines.push_back(at::kONEDNN); +#endif + #ifdef USE_FBGEMM if (fbgemm::fbgemmSupportedCPU()) { engines.push_back(at::kFBGEMM); @@ -293,6 +349,26 @@ bool NoTF32Guard::should_disable_tf32() { return override_allow_tf32_flag; } +#ifdef USE_ROCM +// Ops can query this flag to know they are in the backward pass. +// This information can be used, for example, to select implementations +// with different numerical or performance characteristics. +// See https://pytorch.org/docs/stable/notes/numerical_accuracy.html for details. 
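+// A minimal usage sketch (hypothetical call site): a ROCm-only code path that
+// wants backward-specific behaviour could check
+//
+//   #ifdef USE_ROCM
+//   if (at::ROCmBackwardPassGuard::is_backward_pass()) { /* backward-tuned path */ }
+//   #endif
+//
+// while whatever drives the backward pass constructs a ROCmBackwardPassGuard
+// for the duration of that pass so the thread-local flag is set.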
+thread_local bool ROCmBackwardPassGuard::is_backward_pass_; + +ROCmBackwardPassGuard::ROCmBackwardPassGuard() { + is_backward_pass_ = true; +} + +ROCmBackwardPassGuard::~ROCmBackwardPassGuard() { + is_backward_pass_ = false; +} + +bool ROCmBackwardPassGuard::is_backward_pass() { + return is_backward_pass_; +} +#endif + bool Context::areVmapFallbackWarningsEnabled() const { return display_vmap_fallback_warnings_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 88cbc3ec0bb3..d4840c292643 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -22,6 +22,8 @@ namespace at { class Tensor; +enum class TORCH_API Float32MatmulPrecision {HIGHEST, HIGH, MEDIUM}; + class TORCH_API Context { public: Context(); @@ -80,15 +82,17 @@ class TORCH_API Context { static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } + static bool hasIPU() { + return c10::impl::hasDeviceGuardImpl(at::DeviceType::IPU); + } static bool hasXLA() { return c10::impl::hasDeviceGuardImpl(at::DeviceType::XLA); } static bool hasLazy() { return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy); } - static bool hasMLC() { - return c10::impl::hasDeviceGuardImpl(at::DeviceType::MLC); - } + static bool hasMPS(); + static bool hasORT() { return c10::impl::hasDeviceGuardImpl(at::DeviceType::ORT); } @@ -202,10 +206,13 @@ class TORCH_API Context { // https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility void alertCuBLASConfigNotDeterministic() const; + void setFloat32MatmulPrecision(const std::string & s); bool allowTF32CuDNN() const; void setAllowTF32CuDNN(bool); bool allowTF32CuBLAS() const; void setAllowTF32CuBLAS(bool); + Float32MatmulPrecision float32MatmulPrecision() const; + void setFloat32MatmulPrecision(Float32MatmulPrecision p); bool allowFP16ReductionCuBLAS() const; void setAllowFP16ReductionCuBLAS(bool); at::QEngine qEngine() const; @@ -243,8 +250,8 @@ class TORCH_API Context { bool _deterministic_algorithms = false; bool _deterministic_algorithms_warn_only = false; bool benchmark_cudnn = false; + Float32MatmulPrecision float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; bool allow_tf32_cudnn = true; - bool allow_tf32_cublas = true; bool allow_fp16_reduction_cublas = true; bool enabled_mkldnn = true; at::LinalgBackend linalg_preferred_backend = at::LinalgBackend::Default; @@ -287,6 +294,11 @@ static inline DeprecatedTypeProperties& HIP(ScalarType s) { Backend::HIP, s); } +static inline DeprecatedTypeProperties& MPS(ScalarType s) { + return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties( + Backend::MPS, s); +} + static inline bool hasCUDA() { return globalContext().hasCUDA(); } @@ -295,12 +307,16 @@ static inline bool hasHIP() { return globalContext().hasHIP(); } +static inline bool hasIPU() { + return globalContext().hasIPU(); +} + static inline bool hasXLA() { return globalContext().hasXLA(); } -static inline bool hasMLC() { - return globalContext().hasMLC(); +static inline bool hasMPS() { + return globalContext().hasMPS(); } static inline bool hasORT() { @@ -387,4 +403,14 @@ struct TORCH_API NoTF32Guard { bool changed = false; }; +#ifdef USE_ROCM +struct TORCH_API ROCmBackwardPassGuard { + ROCmBackwardPassGuard(); + ~ROCmBackwardPassGuard(); + static bool is_backward_pass(); +private: + static thread_local bool is_backward_pass_; +}; +#endif + } // namespace at diff --git a/aten/src/ATen/Dispatch.cpp b/aten/src/ATen/Dispatch.cpp index 297b25b54ead..73e54b319cb6 100644 --- a/aten/src/ATen/Dispatch.cpp +++ 
b/aten/src/ATen/Dispatch.cpp @@ -7,7 +7,7 @@ void record_kernel_function_dtype(std::string name) { RECORD_FUNCTION_WITH_SCOPE( at::RecordScope::KERNEL_FUNCTION_DTYPE, name, - {}); + c10::ArrayRef{}); } }} // namespace at::detail diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 55e2036d62e2..05f31606c484 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -20,8 +20,8 @@ namespace at { * included in this file when code-gen is ready. */ inline constexpr bool should_include_kernel_dtype( - const char *kernel_tag_str, - at::ScalarType scalar_type + const char* /*kernel_tag_str*/, + at::ScalarType /*scalar_type*/ ) { return true; } @@ -416,6 +416,46 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE1, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE2, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE3, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + #define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -495,6 +535,33 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_COMPLEX_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + #define AT_DISPATCH_QINT_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -513,6 +580,22 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_QINT_BYTE_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_QINT_PRIVATE_CASE_TYPE( \ + NAME, at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__) \ + AT_QINT_PRIVATE_CASE_TYPE( \ + NAME, at::kQUInt8, at::quint8, at::kByte, uint8_t, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } \ + }() + #define AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& the_type = TYPE; \ @@ -753,6 +836,56 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE1, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE2, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE3, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + SCALARTYPE4, \ + decltype(c10::impl::ScalarTypeToCPPType::t), \ + __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() + #define AT_DISPATCH_INDEX_TYPES(TYPE, NAME, ...) 
\ [&] { \ const auto& the_index_type = TYPE; \ diff --git a/aten/src/ATen/DynamicLibrary.cpp b/aten/src/ATen/DynamicLibrary.cpp index b6577241bcdb..f3287121b2e2 100644 --- a/aten/src/ATen/DynamicLibrary.cpp +++ b/aten/src/ATen/DynamicLibrary.cpp @@ -20,22 +20,22 @@ namespace at { static void* checkDL(void* x) { if (!x) { - AT_ERROR("Error in dlopen or dlsym: ", dlerror()); + TORCH_CHECK_WITH(DynamicLibraryError, false, "Error in dlopen or dlsym: ", dlerror()); } return x; } -DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { +DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_) { // NOLINTNEXTLINE(hicpp-signed-bitwise) handle = dlopen(name, RTLD_LOCAL | RTLD_NOW); if (!handle) { if (alt_name) { handle = dlopen(alt_name, RTLD_LOCAL | RTLD_NOW); if (!handle) { - AT_ERROR("Error in dlopen for library ", name, "and ", alt_name); + TORCH_CHECK_WITH(DynamicLibraryError, false, "Error in dlopen for library ", name, "and ", alt_name); } } else { - AT_ERROR("Error in dlopen: ", dlerror()); + TORCH_CHECK_WITH(DynamicLibraryError, false, "Error in dlopen: ", dlerror()); } } } @@ -46,8 +46,9 @@ void* DynamicLibrary::sym(const char* name) { } DynamicLibrary::~DynamicLibrary() { - if (!handle) + if (!handle || leak_handle) { return; + } dlclose(handle); } @@ -55,7 +56,7 @@ DynamicLibrary::~DynamicLibrary() { // Windows -DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { +DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_) { // NOLINTNEXTLINE(hicpp-signed-bitwise) HMODULE theModule; bool reload = true; @@ -83,7 +84,7 @@ DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name) { FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (sizeof(buf) / sizeof(char)), NULL); - AT_ERROR("error in LoadLibrary for ", name, ". WinError ", dw, ": ", buf); + TORCH_CHECK_WITH(DynamicLibraryError, false, "error in LoadLibrary for ", name, ". 
WinError ", dw, ": ", buf); } } @@ -91,13 +92,13 @@ void* DynamicLibrary::sym(const char* name) { AT_ASSERT(handle); FARPROC procAddress = GetProcAddress((HMODULE)handle, name); if (!procAddress) { - AT_ERROR("error in GetProcAddress"); + TORCH_CHECK_WITH(DynamicLibraryError, false, "error in GetProcAddress"); } return (void*)procAddress; } DynamicLibrary::~DynamicLibrary() { - if (!handle) { + if (!handle || leak_handle) { return; } FreeLibrary((HMODULE)handle); diff --git a/aten/src/ATen/DynamicLibrary.h b/aten/src/ATen/DynamicLibrary.h index 88bc0e201ced..8f65dd5b494f 100644 --- a/aten/src/ATen/DynamicLibrary.h +++ b/aten/src/ATen/DynamicLibrary.h @@ -1,20 +1,30 @@ #pragma once #include +#include #include +namespace c10 { + +class DynamicLibraryError : public Error { + using Error::Error; +}; + +} // namespace c10 + namespace at { struct DynamicLibrary { AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); - TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr); + TORCH_API DynamicLibrary(const char* name, const char* alt_name = nullptr, bool leak_handle = false); TORCH_API void* sym(const char* name); TORCH_API ~DynamicLibrary(); private: + bool leak_handle; void* handle = nullptr; }; diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 5e21a2f52d18..5a72a09d1841 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -2,31 +2,93 @@ #include #include #include +#include + +#include namespace at { namespace detail { - -static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { +namespace { +c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { if (pin_memory) { return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); } return c10::GetCPUAllocator(); } +constexpr uint64_t storage_max() { + // int64_t and size_t are used somewhat inconsistently throughout ATen. + // To be safe, storage size calculations must fit in both types. 
+ constexpr auto int64_max = static_cast( + std::numeric_limits::max()); + constexpr auto size_max = static_cast( + std::numeric_limits::max()); + return std::min(int64_max, size_max); +} + +} // namespace (anonymous) + +size_t computeStorageNbytesContiguous( + IntArrayRef sizes, + size_t itemsize_bytes, + size_t storage_offset + ) { + // Ignore overflow checks on mobile +#ifndef C10_MOBILE + uint64_t size = 1; + bool overflowed = c10::safe_multiplies_u64(sizes, &size); + overflowed |= c10::add_overflows(size, storage_offset, &size); + overflowed |= c10::mul_overflows(size, itemsize_bytes, &size); + overflowed |= size > storage_max(); + TORCH_CHECK(!overflowed, + "Storage size calculation overflowed with sizes=", sizes); + return static_cast(size); +#else + const auto numel = c10::multiply_integers(sizes); + return itemsize_bytes * (storage_offset + numel); +#endif +} + size_t computeStorageNbytes( IntArrayRef sizes, IntArrayRef strides, - size_t itemsize_bytes) { + size_t itemsize_bytes, + size_t storage_offset + ) { + // Ignore overflow checks on mobile +#ifndef C10_MOBILE // size of the underlying storage is 1 bigger than the offset // of the last element according to stride - size_t size = 1; + uint64_t size = storage_offset + 1; + bool overflowed = false; for (const auto i : c10::irange(sizes.size())) { - if(sizes[i] == 0) { + if (sizes[i] == 0) { return 0; } - size += strides[i]*(sizes[i]-1); + + uint64_t strided_size; + overflowed |= c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size); + overflowed |= c10::add_overflows(size, strided_size, &size); } - return size * itemsize_bytes; + overflowed |= c10::mul_overflows(size, itemsize_bytes, &size); + overflowed |= size > storage_max(); + TORCH_CHECK(!overflowed, + "Storage size calculation overflowed with sizes=", + sizes, " and strides=", strides); + return static_cast(size); +#else + // size of the underlying storage is 1 bigger than the offset + // of the last element according to stride + uint64_t size = 1; + for (const auto i : c10::irange(sizes.size())) { + if (sizes[i] == 0) { + return 0; + } + + size += strides[i] * (sizes[i] - 1); + } + return itemsize_bytes * (storage_offset + size); +#endif } TensorBase empty_generic( @@ -37,9 +99,8 @@ TensorBase empty_generic( c10::optional memory_format_opt) { at::detail::check_size_nonnegative(size); - int64_t nelements = c10::multiply_integers(size); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); - int64_t size_bytes = nelements * dtype.itemsize(); + size_t size_bytes = computeStorageNbytesContiguous(size, dtype.itemsize()); auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -73,7 +134,7 @@ TensorBase empty_strided_generic( at::detail::check_size_nonnegative(size); caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); - int64_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize()); + size_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize()); auto storage_impl = c10::make_intrusive( c10::StorageImpl::use_byte_size_t(), size_bytes, @@ -176,13 +237,11 @@ struct MetaAllocator final : public at::Allocator { static MetaAllocator g_meta_alloc; -at::Allocator* GetMetaAllocator() { - return &g_meta_alloc; -} +REGISTER_ALLOCATOR(kMeta, &g_meta_alloc); TensorBase empty_meta(IntArrayRef size, ScalarType dtype, c10::optional memory_format_opt) { - auto *allocator = GetMetaAllocator(); + auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); return 
at::detail::empty_generic( size, allocator, meta_dks, dtype, memory_format_opt); @@ -222,7 +281,7 @@ TensorBase empty_meta( TensorBase empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype) { - auto *allocator = GetMetaAllocator(); + auto *allocator = GetAllocator(kMeta); constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta); return at::detail::empty_strided_generic( size, stride, allocator, meta_dks, dtype); diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index a49b3e909d6e..895bcc8e1779 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -10,8 +10,11 @@ inline void check_size_nonnegative(IntArrayRef size) { } } +TORCH_API size_t computeStorageNbytesContiguous( + IntArrayRef sizes, size_t itemsize, size_t storage_offset=0); TORCH_API size_t computeStorageNbytes( - IntArrayRef sizes, IntArrayRef strides, size_t itemsize); + IntArrayRef sizes, IntArrayRef strides, + size_t itemsize, size_t storage_offset=0); TORCH_API TensorBase empty_generic( IntArrayRef size, diff --git a/aten/src/ATen/ExpandBase.h b/aten/src/ATen/ExpandBase.h new file mode 100644 index 000000000000..e0a24091da19 --- /dev/null +++ b/aten/src/ATen/ExpandBase.h @@ -0,0 +1,23 @@ +#include + +// Broadcasting utilities for working with TensorBase +namespace at { +namespace internal { +TORCH_API TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size); +} // namespace internal + +inline c10::MaybeOwned expand_size(const TensorBase &self, IntArrayRef size) { + if (size.equals(self.sizes())) { + return c10::MaybeOwned::borrowed(self); + } + return c10::MaybeOwned::owned( + at::internal::expand_slow_path(self, size)); +} +c10::MaybeOwned expand_size(TensorBase &&self, IntArrayRef size) = delete; + +inline c10::MaybeOwned expand_inplace(const TensorBase &tensor, const TensorBase &to_expand) { + return expand_size(to_expand, tensor.sizes()); +} +c10::MaybeOwned expand_inplace(const TensorBase &tensor, TensorBase &&to_expand) = delete; + +} // namespace at diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index 35588ac62a29..a44005a2ef81 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -1,8 +1,15 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include namespace at { +namespace internal { +TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) { + return OptionalTensorRef(self)->expand(size); +} +} namespace { // NOTE: are_expandable did a similar check, please keep them sync if change is needed diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 55a392c8d9cc..a1b7c8a04602 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -1,5 +1,11 @@ #pragma once +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include @@ -293,7 +299,7 @@ inline std::vector expand_outplace(TensorList to_expand) { // Sums `tensor` repeatedly to produce a tensor of shape `shape`. // Precondition: is_expandable_to(shape, tensor.sizes()) must be true -static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) { +static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape, bool always_return_non_view=false) { if (shape.size() == 0) { return tensor.sum(); } @@ -311,7 +317,13 @@ static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) { if (!reduce_dims.empty()) { tensor = tensor.sum(reduce_dims, /*keepdim=*/true); } - return leading_dims > 0 ? 
tensor.view(shape) : tensor; + if (always_return_non_view) { + // This is only actually used by the functionalization pass. + // We want to be able to guarantee that this function doesn't return a view of the input. + return leading_dims > 0 ? at::view_copy(tensor, shape) : tensor.clone(); + } else { + return leading_dims > 0 ? tensor.view(shape) : tensor; + } } // True if `shape` can be broadcasted to `desired` diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index 3e686701fa63..d4b2a08825b8 100644 --- a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -10,34 +10,46 @@ namespace functionalization { // We can't easily share it though, because (eventually) these functions // will all call `permute/unsqueeze_copy()` instead of `permute/unsqueeze`. -Tensor permute_inverse(const Tensor& self, IntArrayRef dims) { +Tensor permute_copy_inverse(const Tensor& self, IntArrayRef dims, bool reapply_views) { // invert the permutation auto ndims = dims.size(); std::vector dims_(ndims); for(const auto i : c10::irange(ndims)) { dims_[at::maybe_wrap_dim(dims[i], ndims)] = i; } - return self.permute(dims_); + if (reapply_views) { + return at::permute(self, dims_); + } else { + return at::permute_copy(self, dims_); + } } -Tensor unsqueeze_to(const Tensor & self, IntArrayRef sizes) { +Tensor unsqueeze_copy_to(const Tensor & self, IntArrayRef sizes, bool reapply_views) { auto result = self; int64_t nDims = sizes.size(); for(const auto dim : c10::irange(nDims)) { if (sizes[dim] == 1) { - result = result.unsqueeze(dim); + if (reapply_views) { + result = at::unsqueeze(result, dim); + } else { + result = at::unsqueeze_copy(result, dim); + } } } return result; } -Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) { +Tensor unsqueeze_copy_to(const Tensor & self, int64_t dim, IntArrayRef sizes, bool reapply_views) { dim = at::maybe_wrap_dim(dim, sizes.size()); // in NumPy it's not an error to unsqueeze a scalar, but we still need to avoided // unsqueezing in the backward. if (sizes.size() > 0 && sizes[dim] == 1) { - return self.unsqueeze(dim); + if (reapply_views) { + return at::unsqueeze(self, dim); + } else { + return at::unsqueeze_copy(self, dim); + } } return self; } @@ -73,72 +85,99 @@ Tensor unsqueeze_to(const Tensor & self, int64_t dim, IntArrayRef sizes) { // The codegen automatically generates the corresponding function declaration. // ---------------------------------------------------------- -Tensor FunctionalInverses::_fw_primal_inverse(const at::Tensor& base, const at::Tensor& mutated_view, int64_t level) { +Tensor FunctionalInverses::_fw_primal_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views, int64_t level) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _fw_primal() during the functionalization pass. For now, this is not supported."); return Tensor(); } -Tensor FunctionalInverses::_make_dual_inverse(const at::Tensor& base, const at::Tensor& mutated_view, const at::Tensor& tangent, int64_t level) { +Tensor FunctionalInverses::_make_dual_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views, const at::Tensor& tangent, int64_t level) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _make_dual() during the functionalization pass. 
For now, this is not supported."); return Tensor(); } -Tensor FunctionalInverses::view_as_real_inverse(const Tensor& base, const Tensor& mutated_view) { - return at::view_as_complex(mutated_view); +Tensor FunctionalInverses::view_as_real_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::view_as_complex(mutated_view); + } else { + return at::view_as_complex_copy(mutated_view); + } } -Tensor FunctionalInverses::view_as_complex_inverse(const Tensor& base, const Tensor& mutated_view) { - return at::view_as_real(mutated_view.resolve_conj()); +Tensor FunctionalInverses::view_as_complex_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::view_as_real(mutated_view.resolve_conj()); + } else { + return at::view_as_real_copy(mutated_view.resolve_conj()); + } } -Tensor FunctionalInverses::_conj_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view.conj(); +Tensor FunctionalInverses::_conj_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::_conj(mutated_view); + } else { + return at::_conj_copy(mutated_view); + } } -Tensor FunctionalInverses::_neg_view_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view.neg(); +Tensor FunctionalInverses::_neg_view_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::_neg_view(mutated_view); + } else { + return at::_neg_view_copy(mutated_view); + } } -Tensor FunctionalInverses::as_strided_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset) { +Tensor FunctionalInverses::as_strided_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset) { TORCH_INTERNAL_ASSERT(false, "as_strided has not been implemented in the functionalization pass yet"); return Tensor(); } -Tensor FunctionalInverses::diagonal_inverse(const Tensor& base, const Tensor& mutated_view, int64_t offset, int64_t dim1, int64_t dim2) { +Tensor FunctionalInverses::diagonal_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t offset, int64_t dim1, int64_t dim2) { + // Pessimism: we can't reapply views for slice_scatter. 
return base.diagonal_scatter(mutated_view, offset, dim1, dim2); } -Tensor FunctionalInverses::expand_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, bool implicit) { - return at::sum_to(mutated_view, base.sizes()); +Tensor FunctionalInverses::expand_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size, bool implicit) { + return at::sum_to(mutated_view, base.sizes(),/*always_return_non_view=*/!reapply_views); } -Tensor FunctionalInverses::permute_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef dims) { - return at::functionalization::permute_inverse(mutated_view, dims); +Tensor FunctionalInverses::expand_copy_SymInt_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, c10::SymIntArrayRef size, bool implicit) { + return at::sum_to(mutated_view, c10::expectIntArrayRef(base.sym_sizes()),/*always_return_non_view=*/!reapply_views); } -Tensor FunctionalInverses::_reshape_alias_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size, at::IntArrayRef stride) { +Tensor FunctionalInverses::permute_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef dims) { + return at::functionalization::permute_copy_inverse(mutated_view, dims, reapply_views); +} + +Tensor FunctionalInverses::_reshape_alias_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size, at::IntArrayRef stride) { // Note that I'm directly calling reshape(), and ignoring the strides. // _reshape_alias() isn't available from user code, and is an implementation detail of reshape(). // Specifically, passing in the strides directly can get us into trouble in cases like: // b = a[0]; c = b.reshape(...); c.add_(1); print(a) // When we eventually run the _reshape_alias_inverse() call here, if we were to pass in both sizes and strides, // The call would fail because `mutated_view` doesn't have enough bytes of storage. - return mutated_view.reshape(base.sizes()); + if (reapply_views) { + return at::_reshape_alias(mutated_view, base.sizes(), base.strides()); + } else { + return at::_reshape_alias_copy(mutated_view, base.sizes(), base.strides()); + } } -Tensor FunctionalInverses::select_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, int64_t index) { +Tensor FunctionalInverses::select_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim, int64_t index) { + // Pessimism: we can't reapply views for slice_scatter. return base.select_scatter(mutated_view, dim, index); } -Tensor FunctionalInverses::detach_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::detach_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { // the functionalization pass doesn't care about autograd metadata - as a view, I think detach() is just an identity function return mutated_view; } -Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim, c10::optional start, c10::optional end, int64_t step) { +Tensor FunctionalInverses::slice_copy_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim, c10::optional start, c10::optional end, int64_t step) { + // Pessimism: we can't reapply views for slice_scatter. 
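+  // slice_scatter returns a fresh tensor with `mutated_view` written into the
+  // sliced region of `base`, so the result never aliases `base`.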
return base.slice_scatter(mutated_view, dim, start, end, step); } -Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t split_size, int64_t dim) { +Tensor FunctionalInverses::split_copy_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t mutated_view_idx, int64_t split_size, int64_t dim) { // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i // on top of the base tensor. @@ -148,10 +187,11 @@ Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor auto start = mutated_view_idx * split_size; auto end = start + split_size; if (end > dim_size) end = dim_size; + // Pessimism: we can't reapply views for slice_scatter. return base.slice_scatter(mutated_view, dim, start, end, 1); } -Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, at::IntArrayRef split_sizes, int64_t dim) { +Tensor FunctionalInverses::split_with_sizes_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t mutated_view_idx, at::IntArrayRef split_sizes, int64_t dim) { dim = at::maybe_wrap_dim(dim, base.sizes().size()); auto dim_size = base.size(dim); int64_t start = 0; @@ -160,84 +200,123 @@ Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Te } auto end = start + split_sizes[mutated_view_idx]; if (end > dim_size) end = dim_size; + // Pessimism: we can't reapply views for slice_scatter. return base.slice_scatter(mutated_view, dim, start, end, 1); } -Tensor FunctionalInverses::squeeze_inverse(const Tensor& base, const Tensor& mutated_view) { - return unsqueeze_to(mutated_view, base.sizes()); +Tensor FunctionalInverses::squeeze_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + return unsqueeze_copy_to(mutated_view, base.sizes(), reapply_views); } -Tensor FunctionalInverses::squeeze_dim_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) { - return unsqueeze_to(mutated_view, dim, base.sizes()); +Tensor FunctionalInverses::squeeze_copy_dim_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim) { + return unsqueeze_copy_to(mutated_view, dim, base.sizes(), reapply_views); } -Tensor FunctionalInverses::t_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view.t(); +Tensor FunctionalInverses::t_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::t(mutated_view); + } else { + return at::t_copy(mutated_view); + } } -Tensor FunctionalInverses::transpose_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim0, int64_t dim1) { - return mutated_view.transpose(dim0, dim1); +Tensor FunctionalInverses::transpose_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim0, int64_t dim1) { + if (reapply_views) { + return transpose(mutated_view, dim0, dim1); + } else { + return transpose_copy(mutated_view, dim0, dim1); + } } -Tensor FunctionalInverses::unsqueeze_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dim) { - return mutated_view.squeeze(dim); +Tensor FunctionalInverses::unsqueeze_copy_inverse(const Tensor& base, const Tensor& mutated_view, 
bool reapply_views, int64_t dim) { + if (reapply_views) { + return at::squeeze(mutated_view, dim); + } else { + return at::squeeze_copy(mutated_view, dim); + } } -Tensor FunctionalInverses::_indices_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::_indices_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::_values_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::_values_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::indices_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::indices_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::values_inverse(const Tensor& base, const Tensor& mutated_view) { +Tensor FunctionalInverses::values_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call values() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::_sparse_broadcast_to_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size) { +Tensor FunctionalInverses::_sparse_broadcast_to_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size) { TORCH_INTERNAL_ASSERT(false, "Attempted to call _sparse_broadcast_to() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::crow_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) { +Tensor FunctionalInverses::crow_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call crow_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::col_indices_inverse(const at::Tensor& base, const at::Tensor& mutated_view) { +Tensor FunctionalInverses::col_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { TORCH_INTERNAL_ASSERT(false, "Attempted to call col_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); return Tensor(); } -Tensor FunctionalInverses::unbind_int_inverse(const Tensor& base, const Tensor& mutated_view, int64_t mutated_view_idx, int64_t dim) { +Tensor FunctionalInverses::ccol_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { + TORCH_INTERNAL_ASSERT(false, "Attempted to call ccol_indices() during the functionalization pass. 
For now, sparse tensors aren't supported during functionalization"); + return Tensor(); +} + +Tensor FunctionalInverses::row_indices_copy_inverse(const at::Tensor& base, const at::Tensor& mutated_view, bool reapply_views) { + TORCH_INTERNAL_ASSERT(false, "Attempted to call row_indices() during the functionalization pass. For now, sparse tensors aren't supported during functionalization"); + return Tensor(); +} + +Tensor FunctionalInverses::unbind_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t mutated_view_idx, int64_t dim) { dim = at::maybe_wrap_dim(dim, base.sizes().size()); + // Pessimism: we can't reapply views for select_scatter. return base.select_scatter(mutated_view, dim, mutated_view_idx); } -Tensor FunctionalInverses::view_inverse(const Tensor& base, const Tensor& mutated_view, at::IntArrayRef size) { - return mutated_view.view(base.sizes()); +Tensor FunctionalInverses::view_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::IntArrayRef size) { + if (reapply_views) { + return mutated_view.view(base.sizes()); + } else { + return at::view_copy(mutated_view, base.sizes()); + } } -Tensor FunctionalInverses::view_dtype_inverse(const Tensor& base, const Tensor& mutated_view, at::ScalarType dtype) { - return mutated_view.view(base.scalar_type()); +Tensor FunctionalInverses::view_copy_dtype_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, at::ScalarType dtype) { + if (reapply_views) { + return mutated_view.view(base.scalar_type()); + } else { + return at::view_copy(mutated_view, base.scalar_type()); + } } -Tensor FunctionalInverses::unfold_inverse(const Tensor& base, const Tensor& mutated_view, int64_t dimension, int64_t size, int64_t step) { +Tensor FunctionalInverses::unfold_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dimension, int64_t size, int64_t step) { // I think autograd and the functionalization pass want the exact same thing here, but need to test to confirm. + // unfold_backward() is safe to use here because it is NOT a view op. + // (note: technically, "reapply_views" won't do anything here and we'll have an extra memory copy. + // We'd need to add an aliasing version of unfold_backward to fix that though). return unfold_backward(mutated_view, base.sizes(), dimension, size, step); } -Tensor FunctionalInverses::alias_inverse(const Tensor& base, const Tensor& mutated_view) { - return mutated_view; +Tensor FunctionalInverses::alias_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) { + if (reapply_views) { + return at::alias(mutated_view); + } else { + return at::alias_copy(mutated_view); + } } } // functionalization diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index 52c9a3bb28cf..2fad6bfad606 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -75,16 +75,18 @@ const Tensor apply_update(const Alias::Update& update, const Tensor& base) { return t; } -void Alias::apply_updates() { +bool Alias::apply_updates() { // N.B:none of the tensors used in this function should be FunctionalTensorWrappers at this point. // The only reason we currently need the TLS exclude guard here is because of functorch's DynamicLayer stack. 
// It adds the Functionalize key into TLS before redispatching to the functionalization kernels, // which means that we need to explicitly exclude it here before doing any other work underneath the pass. at::AutoDispatchSkipFunctionalize guard; + bool any_updates = updates_.size() > 0; for (auto& update_data: updates_) { base_ = apply_update(update_data, base_); } updates_.clear(); + return any_updates; } FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& value) @@ -103,8 +105,8 @@ void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vec alias_.add_update(updated_val, view_metas); } -void FunctionalStorageImpl::apply_updates() { - alias_.apply_updates(); +bool FunctionalStorageImpl::apply_updates() { + return alias_.apply_updates(); } const Tensor& FunctionalStorageImpl::base() { diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index e8478a7ae903..2c8a1312cbe1 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -72,7 +72,7 @@ class Alias { const at::Tensor& base() const; size_t generation() const { return generation_; } void add_update(const at::Tensor& updated_val, const std::vector& metas); - void apply_updates(); + bool apply_updates(); private: // NB: base_ should always point to a tensor BELOW the current functionalization layer. // This is mainly to avoid reference cycles. @@ -96,7 +96,7 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { explicit FunctionalStorageImpl(const Tensor& value); void add_update(const Tensor& updated_val, const std::vector& view_metas); - void apply_updates(); + bool apply_updates(); const Tensor& base(); size_t generation() const; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 5f99e3774798..2a0ca304baf5 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -17,12 +17,16 @@ void FunctionalTensorWrapper::set_constructor_metadata() { // For now I'm retroactively setting this in functorch, // but once Open Multiple Dispatch lands we should be able to calculate this in core. level_ = -1; - // shallow_copy_from overwrites the storage and dispatch keyset... - auto functional_storage = storage_; - shallow_copy_from(value_.getIntrusivePtr()); - storage_ = functional_storage; + // mirror all of the generic tensor metadata onto the wrapper + copy_generic_tensor_metadata(value_.getIntrusivePtr().get(), this); + refresh_numel(); + refresh_contiguous(); storage_access_should_throw_ = false; key_set_ = c10::DispatchKeySet(c10::DispatchKey::Functionalize) | value_.key_set(); + // All of the keys corresponding to functorch transforms should not be copied over. + // Functorch transforms all have their own wrapper tensors (e.g. BatchedTensorImpl) which expect + // to participate in the functorch transforms. + key_set_ = key_set_ - c10::functorch_transforms_ks; } FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value) @@ -176,6 +180,9 @@ void FunctionalTensorWrapper::replace_(const Tensor& other) { // TODO: going to need to change this if we want nested functionalize() transforms. TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(other)); value_ = other; + // out= ops are allowed to resize the output tensors, mutating both the data and metadata of the tensor. + // We need to propagate that metadata mutation to the wrapper (new size). 
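  // A concrete case this covers (a sketch, assuming a and b are same-shaped functional tensors):
  //
  //   at::Tensor out = at::empty({0}, a.options());
  //   at::add_out(out, a, b);   // resizes `out` from {0} to a.sizes()
  //
  // replace_() then swaps the resized result into value_, so the wrapper's own sizes and
  // strides have to be refreshed to match, which the call below does.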
+ set_sizes_and_strides(value_.sizes(), value_.strides()); } @@ -183,8 +190,10 @@ void FunctionalTensorWrapper::sync_() { if (is_up_to_date()) { return; } - apply_updates(); - regenerate_from_base(); + auto any_updates = apply_updates(); + if (any_updates) { + regenerate_from_base(); + } } void FunctionalTensorWrapper::regenerate_from_base() { @@ -201,10 +210,10 @@ void FunctionalTensorWrapper::regenerate_from_base() { generation_ = storage_impl->generation(); } -void FunctionalTensorWrapper::apply_updates() { +bool FunctionalTensorWrapper::apply_updates() { // Apply all updates on alias_ auto storage_impl = functional_storage_impl(); - storage_impl->apply_updates(); + return storage_impl->apply_updates(); } const char* FunctionalTensorWrapper::tensorimpl_type_name() const { @@ -222,6 +231,12 @@ Tensor to_functional_tensor(const Tensor& tensor) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isFunctionalTensor(tensor)); return at::detail::make_tensor(tensor); } +c10::optional to_functional_tensor(const c10::optional& tensor) { + if (tensor.has_value()) { + return c10::make_optional(to_functional_tensor(*tensor)); + } + return c10::nullopt; +} c10::List to_functional_tensor(const c10::List& t_list) { c10::List outputs; outputs.reserve(t_list.size()); @@ -230,6 +245,14 @@ c10::List to_functional_tensor(const c10::List& t_list) { } return outputs; } +c10::List> to_functional_tensor(const c10::List>& t_list) { + c10::List> outputs; + outputs.reserve(t_list.size()); + for (const auto i : c10::irange(t_list.size())) { + outputs.push_back(to_functional_tensor(t_list[i])); + } + return outputs; +} std::vector to_functional_tensor(const std::vector& t_list) { std::vector outputs(t_list.size()); for (const auto i : c10::irange(t_list.size())) { @@ -237,7 +260,7 @@ std::vector to_functional_tensor(const std::vector& t_list) { } return outputs; } -TensorList to_functional_tensor(const TensorList& t_list) { +std::vector to_functional_tensor(const TensorList& t_list) { std::vector outputs(t_list.size()); for (const auto i : c10::irange(t_list.size())) { outputs[i] = to_functional_tensor(t_list[i]); @@ -276,10 +299,10 @@ c10::List> from_functional_tensor(const c10::List from_functional_tensor(const TensorList& t_list) { std::vector outputs(t_list.size()); for (const auto i : c10::irange(t_list.size())) { - outputs.push_back(from_functional_tensor(t_list[i])); + outputs[i] = from_functional_tensor(t_list[i]); } return outputs; } @@ -322,6 +345,81 @@ void sync(const c10::List> t_list) { } } +void replace_(const Tensor& functional_tensor, const Tensor& other) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(functional_tensor)); + unsafeGetFunctionalWrapper(functional_tensor)->replace_(other); +} + +void replace_(const TensorList functional_tensor, TensorList other) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_tensor.size() == other.size()); + for (const auto i : c10::irange(functional_tensor.size())) { + replace_(functional_tensor[i], other[i]); + } +} + + +void commit_update(const Tensor& functional_tensor) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isFunctionalTensor(functional_tensor)); + unsafeGetFunctionalWrapper(functional_tensor)->commit_update(); +} + +void commit_update(const TensorList functional_tensor) { + for (const auto i : c10::irange(functional_tensor.size())) { + commit_update(functional_tensor[i]); + } +} + +bool isFunctionalTensor(const at::Tensor& tensor) { + return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize); +} + +bool isFunctionalTensor(const c10::optional& t) 
{ + if (t.has_value()) { + return isFunctionalTensor(*t); + } else { + return false; + } +} + +bool isFunctionalTensor(const c10::List& t_list) { + if (t_list.size() == 0) return false; + bool any_functional = isFunctionalTensor(t_list[0]); + for (const auto i : c10::irange(1, t_list.size())) { + auto curr_functional = isFunctionalTensor(t_list[i]); + TORCH_INTERNAL_ASSERT( + curr_functional == any_functional, + "Functionalization encountered a list of tensors where some are functional", + "and some are not, which is not currently unsupported."); + } + return any_functional; +} + +bool isFunctionalTensor(const c10::List>& t_list) { + if (t_list.size() == 0) return false; + bool any_functional = isFunctionalTensor(t_list[0]); + for (const auto i : c10::irange(1, t_list.size())) { + auto curr_functional = isFunctionalTensor(t_list[i]); + TORCH_INTERNAL_ASSERT( + curr_functional == any_functional, + "Functionalization encountered a list of tensors where some are functional", + "and some are not, which is not currently unsupported."); + } + return any_functional; +} + +bool isFunctionalTensor(const c10::ArrayRef t_list) { + if (t_list.size() == 0) return false; + bool any_functional = isFunctionalTensor(t_list[0]); + for (const auto i : c10::irange(1, t_list.size())) { + auto curr_functional = isFunctionalTensor(t_list[i]); + TORCH_INTERNAL_ASSERT( + curr_functional == any_functional, + "Functionalization encountered a list of tensors where some are functional", + "and some are not, which is not currently unsupported."); + } + return any_functional; +} + Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); @@ -373,6 +471,14 @@ void set_sizes_strides_offset(const std::vector& outs, const std::vector } } +thread_local bool _functionalizationReapplyViews; + +bool getFunctionalizationReapplyViewsTLS() { + return _functionalizationReapplyViews; +} +void setFunctionalizationReapplyViewsTLS(bool reapply_views) { + _functionalizationReapplyViews = reapply_views; +} } // namespace impl } // namespace functionalization diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 1696b41f1543..d0bb8e0dcf11 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -37,7 +37,6 @@ namespace at { // // See Note [Functionalization: Alias Removal] for details on the aliasing machinery. // See Note [Functionalization: Mutation Removal] for details on mutation removal. - struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper(const Tensor& value); // Additional constructor to create a FunctionalTensorWrapper directly from an underlying tensor that was created from a view. @@ -64,7 +63,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // It can't just call sync_(), because the FunctionalTensorWrapper will look like it has no aliases and sync_ will be a noop. // We use the reference count on storage_ to determine if the wrapper is aliased, and by the time functorch // is ready to propagate updates to inputs, any intermediate views of the input created by the program will have been deallocated. 
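  // The usual caller pattern (see sync_() in FunctionalTensorWrapper.cpp) uses the returned
  // flag to skip regeneration when nothing actually changed:
  //
  //   if (apply_updates()) {
  //     regenerate_from_base();
  //   }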
- void apply_updates(); + // This function also returns whether or not the base actually had any updates to apply. + bool apply_updates(); // Takes the current state of value_ and snapshots it, sending it as a pending update to the alias. void commit_update(); // When any tensor is mutated, the tensor increments its alias's "generation". @@ -117,20 +117,24 @@ TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(const Tenso return functional_impl; } -TORCH_API inline bool isFunctionalTensor(const at::Tensor& tensor) { - return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Functionalize); -} +TORCH_API bool isFunctionalTensor(const at::Tensor& tensor); +TORCH_API bool isFunctionalTensor(const c10::optional& t); +TORCH_API bool isFunctionalTensor(const c10::List& t_list); +TORCH_API bool isFunctionalTensor(const c10::List>& t_list); +TORCH_API bool isFunctionalTensor(const c10::ArrayRef t_list); TORCH_API Tensor to_functional_tensor(const Tensor& tensor); +TORCH_API c10::optional to_functional_tensor(const c10::optional& tensor); TORCH_API c10::List to_functional_tensor(const c10::List& t_list); +TORCH_API c10::List> to_functional_tensor(const c10::List>& t_list); TORCH_API std::vector to_functional_tensor(const std::vector& t_list); -TORCH_API TensorList to_functional_tensor(const TensorList& t_list); +TORCH_API std::vector to_functional_tensor(const TensorList& t_list); TORCH_API Tensor from_functional_tensor(const Tensor& tensor); TORCH_API c10::optional from_functional_tensor(const c10::optional& t); TORCH_API c10::List from_functional_tensor(const c10::List& t_list); TORCH_API c10::List> from_functional_tensor(const c10::List>& t_list); -TORCH_API TensorList from_functional_tensor(const TensorList& tensors); +TORCH_API std::vector from_functional_tensor(const TensorList& tensors); TORCH_API void sync(const at::Tensor& t); TORCH_API void sync(const c10::optional& t); @@ -138,6 +142,12 @@ TORCH_API void sync(const c10::List t_list); TORCH_API void sync(const at::TensorList t_list); TORCH_API void sync(const c10::List> t_list); +TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other); +TORCH_API void replace_(const TensorList functional_tensor, TensorList other); + +TORCH_API void commit_update(const Tensor& functional_tensor); +TORCH_API void commit_update(const TensorList functional_tensor); + Tensor create_functional_tensor_with_view_meta(const Tensor& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta, int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta(const c10::List& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta); std::vector create_functional_tensor_with_view_meta(const std::vector& view_to_wrap, const Tensor& base, functionalization::ViewMeta meta); @@ -147,6 +157,32 @@ void mutate_view_meta(const Tensor& self, functionalization::ViewMeta meta); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset(const std::vector& outs, const std::vector& meta_outs); + +// ~~~~~ TLS used in functionalization ~~~~~ + +TORCH_API bool getFunctionalizationReapplyViewsTLS(); +TORCH_API void setFunctionalizationReapplyViewsTLS(bool reapply_views); + +class TORCH_API FunctionalizationReapplyViewsGuard { + public: + FunctionalizationReapplyViewsGuard(bool reapply_views) { + prev_ = getFunctionalizationReapplyViewsTLS(); + setFunctionalizationReapplyViewsTLS(reapply_views); + } + + ~FunctionalizationReapplyViewsGuard() { + 
setFunctionalizationReapplyViewsTLS(prev_); + } + + FunctionalizationReapplyViewsGuard(const FunctionalizationReapplyViewsGuard&) = delete; + FunctionalizationReapplyViewsGuard operator=(const FunctionalizationReapplyViewsGuard&) = delete; + FunctionalizationReapplyViewsGuard(FunctionalizationReapplyViewsGuard&&) = delete; + FunctionalizationReapplyViewsGuard operator=(FunctionalizationReapplyViewsGuard&&) = delete; + + private: + bool prev_; +}; + } // namespace impl } // namespace functionalization } // namespace at diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index f130fc7cdbd4..a86a2db0521c 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -4,6 +4,12 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -12,23 +18,45 @@ namespace { const auto arguments_begin = stack->size() - num_arguments; auto arguments = torch::jit::last(stack, num_arguments); + auto any_functional_inputs = false; + auto any_tensor_inputs = false; for (uint64_t idx = 0; idx < num_arguments; ++idx) { const auto& ivalue = arguments[idx]; if (ivalue.isTensor()) { + any_tensor_inputs = true; auto t = ivalue.toTensor(); - at::functionalization::impl::sync(t); - auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); - (*stack)[arguments_begin + idx] = t_new; + if (at::functionalization::impl::isFunctionalTensor(t)) { + any_functional_inputs = true; + at::functionalization::impl::sync(t); + auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); + (*stack)[arguments_begin + idx] = t_new; + } } else if (ivalue.isTensorList()) { + any_tensor_inputs = true; auto tensors = ivalue.toTensorList(); - at::functionalization::impl::sync(tensors); - auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(tensors)); - (*stack)[arguments_begin + idx] = t_new; + if (at::functionalization::impl::isFunctionalTensor(tensors)) { + any_functional_inputs = true; + at::functionalization::impl::sync(tensors); + auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(tensors)); + (*stack)[arguments_begin + idx] = t_new; + } + } else if (ivalue.isOptionalTensorList()) { + any_tensor_inputs = true; + auto opt_tensors = ivalue.toOptionalTensorList(); + if (at::functionalization::impl::isFunctionalTensor(opt_tensors)) { + any_functional_inputs = true; + at::functionalization::impl::sync(opt_tensors); + auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(opt_tensors)); + (*stack)[arguments_begin + idx] = t_new; + } } } + // we should wrap the output if any inputs were wrapped, + // OR if we're hitting a factory function (with no tensor inputs) + auto should_wrap_outputs = !any_tensor_inputs || any_functional_inputs; { at::AutoDispatchSkipFunctionalize guard; - op.redispatchBoxed(dispatchKeySet & c10::after_func_keyset, stack); + op.callBoxed(stack); } const auto num_returns = schema.returns().size(); const auto returns_begin = stack->size() - num_returns; @@ -36,19 +64,32 @@ namespace { for (const auto idx : c10::irange(num_returns)) { const auto& ivalue = returns[idx]; - if (ivalue.isTensor()) { + if (ivalue.isTensor() && should_wrap_outputs) { auto t = ivalue.toTensor(); auto t_new = 
c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; - } else if (ivalue.isTensorList()) { + } else if (ivalue.isTensorList() && should_wrap_outputs) { auto tensors = ivalue.toTensorList(); auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(tensors)); (*stack)[returns_begin + idx] = t_new; + } else if (ivalue.isOptionalTensorList() && should_wrap_outputs) { + auto opt_tensors = ivalue.toOptionalTensorList(); + auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(opt_tensors)); + (*stack)[returns_begin + idx] = t_new; } } } } +at::Tensor lift_functionalize(const at::Tensor & self) { + TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(self)); + return at::functionalization::impl::to_functional_tensor(self); +} + TORCH_LIBRARY_IMPL(_, Functionalize, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&functionalizeFallback>()); } + +TORCH_LIBRARY_IMPL(aten, Functionalize, m) { + m.impl("lift", TORCH_FN(lift_functionalize)); +} diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 24a85b4ce708..0a527bde20c9 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -459,7 +459,7 @@ std::vector broadcast_to_outnames( return unify_from_right(reference_names, tensor_names); } -std::vector compute_cat_outnames(TensorList tensors) { +std::vector compute_cat_outnames(ITensorListRef tensors) { if (!at::has_names(tensors)) { return {}; } diff --git a/aten/src/ATen/NamedTensorUtils.h b/aten/src/ATen/NamedTensorUtils.h index 8cd3e238159b..b8d421c6a611 100644 --- a/aten/src/ATen/NamedTensorUtils.h +++ b/aten/src/ATen/NamedTensorUtils.h @@ -10,7 +10,7 @@ namespace at { using NameVector = SmallVector; -inline bool has_names(TensorList tensors) { +inline bool has_names(ITensorListRef tensors) { return std::any_of( tensors.begin(), tensors.end(), [](const Tensor& t) { return t.has_names(); }); } @@ -98,7 +98,7 @@ TORCH_API void propagate_names_for_reduction(const Tensor& result, const Tensor& TORCH_API void propagate_names_for_expand(const Tensor& result, const Tensor& self); -TORCH_API std::vector compute_cat_outnames(TensorList tensors); +TORCH_API std::vector compute_cat_outnames(ITensorListRef tensors); TORCH_API std::vector compute_broadcast_outnames( const Tensor& self, diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp new file mode 100644 index 000000000000..1509bf4a2a04 --- /dev/null +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +inline std::vector construct_opt_sizes(const at::Tensor& sizes) { + if (sizes.dim() == 0) { + return std::vector(); + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(sizes.dim() == 2); + std::vector result(1, sizes.sizes()[0]); + if (sizes.dim() > 0) { + size_t nested_dim = result.size(); + int64_t* sizes_ptr = sizes.data_ptr(); + result.resize(nested_dim + sizes.sizes()[1]); + int64_t sizes_size_0 = sizes.sizes()[0]; + int64_t sizes_size_1 = sizes.sizes()[1]; + for (const auto i : c10::irange(sizes_size_1)) { + result[nested_dim + i] = sizes_ptr[i]; + } + for (const auto j : c10::irange(sizes_size_1)) { + for (const auto i : c10::irange(sizes_size_0)) { + if (result[nested_dim + j] && + (result[nested_dim + j] != sizes_ptr[i * sizes.size(1) + j])) { + result[nested_dim + j] = -1; + } + } + } + } + return result; +} + 
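// Worked example of construct_opt_sizes() above (a sketch of the intended semantics, not
// exercised in this file): for a nested tensor with two components of sizes [2, 3] and [2, 5],
// nested_size_tensor is [[2, 3], [2, 5]] and the result is {2, 2, -1}:
//   index 0 -> number of components (2),
//   index 1 -> all components agree on size 2, so it stays 2,
//   index 2 -> the components disagree (3 vs 5), so the dimension is marked irregular with -1.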
+NestedTensorImpl::NestedTensorImpl( + at::Tensor buffer, + at::Tensor nested_size_tensor) + : TensorImpl( + (c10::DispatchKeySet(DispatchKey::NestedTensor) | + c10::DispatchKeySet(buffer.is_cuda() ? BackendComponent::CUDABit : BackendComponent::CPUBit)), + buffer.dtype(), + buffer.device()), + buffer_(std::move(buffer)), + nested_size_tensor_(std::move(nested_size_tensor)), + opt_sizes_(construct_opt_sizes(nested_size_tensor_)) +{ + TORCH_WARN_ONCE( + "The PyTorch API of nested tensors is in prototype stage and will change " + "in the near future."); + TORCH_INTERNAL_ASSERT(buffer_.is_cuda() || buffer_.is_cpu(), "NestedTensorImpl buffer must be either CUDA or CPU but got ", buffer_); + TORCH_INTERNAL_ASSERT(nested_size_tensor_.is_contiguous()); + int64_t size_dim = nested_size_tensor_.dim(); + TORCH_INTERNAL_ASSERT(size_dim == 0 || size_dim == 2); + remove_autograd_key(); + key_set_ = + key_set_ - c10::DispatchKeySet({c10::DispatchKey::ADInplaceOrView}); + refresh_dim(); + set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomSizes); +} + +void NestedTensorImpl::refresh_dim() { + const auto my_dim = nested_size_tensor_.dim() ? nested_size_tensor_.sizes()[1] + 1 : 1; + sizes_and_strides_.resize(my_dim); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dim() == my_dim); +} + +int64_t NestedTensorImpl::dim_custom() const { + return dim_default(); +} +int64_t NestedTensorImpl::numel_custom() const { + TORCH_CHECK(false, "numel is disabled."); +} +bool NestedTensorImpl::is_contiguous_custom(MemoryFormat) const { + TORCH_CHECK(false, "is_contiguous is disabled."); +} +IntArrayRef NestedTensorImpl::sizes_custom() const { + TORCH_CHECK(false, "Internal error: NestedTensorImpl doesn't support sizes. Please file an issue on https://github.com/pytorch/nestedtensor"); +} + +IntArrayRef NestedTensorImpl::strides_custom() const { + TORCH_CHECK(false, "Internal error: NestedTensorImpl doesn't support strides. Please file an issue on https://github.com/pytorch/nestedtensor"); +} + +const char* NestedTensorImpl::tensorimpl_type_name() const { + return "NestedTensorImpl"; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h new file mode 100644 index 000000000000..7f29dd620b93 --- /dev/null +++ b/aten/src/ATen/NestedTensorImpl.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +struct TORCH_API NestedTensorImpl : public c10::TensorImpl { + explicit NestedTensorImpl(at::Tensor buffer, at::Tensor nested_size_tensor); + + // TODO: don't expose private implementation details like this; in + // particular, resizing this tensor will mess up our dim() and + // callers cannot fix it. + const Tensor& get_nested_size_tensor() const { + return nested_size_tensor_; + } + // Returns nullopt if the ith dimension is irregular. The ith dimension + // of a NestedTensor is regular if the unbound tensors match in + // size at the (i-1)th dimension. 
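  // For example, with two nested components of sizes [3, 4] and [3, 7]:
  //   opt_size(0) == 2              (number of components)
  //   opt_size(1) == 3              (the components agree)
  //   opt_size(2) == c10::nullopt   (the components disagree, i.e. the dimension is ragged)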
+ c10::optional opt_size(int64_t d) const { + d = at::maybe_wrap_dim(d, dim(), false); + if (opt_sizes_[d] == -1) { + return c10::nullopt; + } + return opt_sizes_[d]; + } + + const at::Tensor& get_buffer() const { + return buffer_; + } + + protected: + const char* tensorimpl_type_name() const override; + + // TODO: numel_custom and is_contiguous_custom can be profitably overridden + // with real implementations + int64_t numel_custom() const override; + bool is_contiguous_custom(MemoryFormat) const override; + IntArrayRef sizes_custom() const override; + IntArrayRef strides_custom() const override; + + // this one is real + int64_t dim_custom() const override; + + private: + // Must be called after any changes to our dim() to sync the state + // to TensorImpl. + void refresh_dim(); + + at::Tensor buffer_; + const at::Tensor nested_size_tensor_; + // NOTE: -1 here means the size is missing + std::vector opt_sizes_; +}; + +inline NestedTensorImpl* get_nested_tensor_impl_or_null(const at::Tensor& tensor) { + if (tensor.is_nested()) { + return static_cast(tensor.unsafeGetTensorImpl()); + } + return nullptr; +} + +inline NestedTensorImpl* get_nested_tensor_impl( + const at::Tensor& tensor) { + TORCH_CHECK( + tensor.is_nested(), + "get_nested_tensor_impl requires a NestedTensor."); + return static_cast( + tensor.unsafeGetTensorImpl()); +} + + +// TODO: real implementation once we support strides. +inline bool nested_tensor_impl_is_contiguous( + const NestedTensorImpl* nt, + at::MemoryFormat memory_format = MemoryFormat::Contiguous) { + return memory_format == MemoryFormat::Contiguous; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h index f9647389dc03..858aaf7a41f1 100644 --- a/aten/src/ATen/NumericUtils.h +++ b/aten/src/ATen/NumericUtils.h @@ -19,7 +19,7 @@ namespace at { template ::value, int>::type = 0> -inline C10_HOST_DEVICE bool _isnan(T val) { +inline C10_HOST_DEVICE bool _isnan(T /*val*/) { return false; } @@ -63,7 +63,7 @@ inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { template ::value, int>::type = 0> -inline C10_HOST_DEVICE bool _isinf(T val) { +inline C10_HOST_DEVICE bool _isinf(T /*val*/) { return false; } diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h index b58d4779ac7a..c25944b2074f 100644 --- a/aten/src/ATen/OpMathType.h +++ b/aten/src/ATen/OpMathType.h @@ -1,7 +1,9 @@ #pragma once +#include #include #include +#include namespace at { @@ -9,8 +11,26 @@ namespace at { template struct OpMathType { using type = scalar_t; }; template<> struct OpMathType { using type = float; }; template<> struct OpMathType { using type = float; }; +template<> struct OpMathType> { using type = c10::complex; }; template using opmath_type = typename OpMathType::type; +namespace { + +c10::ScalarType toOpMathType(const c10::ScalarType type) { + switch (type) { +#define DEFINE_CASE(scalar_t, TypeNum) \ + case ScalarType::TypeNum: \ + return CppTypeToScalarType>::value; + + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) +#undef DEFINE_CASE + + default: TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type); + } +} + +} + } // namespace at diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index 2c337e4a787e..63e451244a52 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -29,7 +30,7 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { : TensorImpl(key_set, 
data_type, device), opaque_handle_(std::move(opaque_handle)) { set_storage_access_should_throw(); - set_has_contiguity_policy(HasContiguityPolicy::ContiguityNotSupported); + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); sizes_and_strides_.set_sizes(sizes); refresh_numel(); is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; @@ -40,14 +41,6 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { opaque_handle_ = {}; } - IntArrayRef strides() const override { - AT_ERROR("opaque tensors do not have strides"); - } - - int64_t stride(int64_t d) const override { - AT_ERROR("opaque tensors do not have strides"); - } - void set_size(int64_t dim, int64_t new_size) override { AT_ERROR("opaque tensors do not have set_size"); } diff --git a/aten/src/ATen/ParallelNativeTBB.h b/aten/src/ATen/ParallelNativeTBB.h index 01dda99990c8..a3675056f161 100644 --- a/aten/src/ATen/ParallelNativeTBB.h +++ b/aten/src/ATen/ParallelNativeTBB.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp new file mode 100644 index 000000000000..ae9f722de60a --- /dev/null +++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp @@ -0,0 +1,38 @@ +#include +#include + +namespace at { +namespace impl { + +static thread_local PythonTorchFunctionTLS pythonTorchFunctionState; + +void PythonTorchFunctionTLS::set_mode(std::shared_ptr mode) { + pythonTorchFunctionState.mode_ = std::move(mode); +} + +const std::shared_ptr& PythonTorchFunctionTLS::get_mode() { + return pythonTorchFunctionState.mode_; +} + +void PythonTorchFunctionTLS::swap_mode(std::shared_ptr& mode) { + pythonTorchFunctionState.mode_.swap(mode); +} + +void PythonTorchFunctionTLS::set_disabled(bool disabled) { + pythonTorchFunctionState.disabled_ = disabled; +} + +bool PythonTorchFunctionTLS::is_disabled() { + return pythonTorchFunctionState.disabled_; +} + +void PythonTorchFunctionTLS::set_state(const PythonTorchFunctionTLS& state) { + pythonTorchFunctionState = state; +} + +const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() { + return pythonTorchFunctionState; +} + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h new file mode 100644 index 000000000000..64256d2f7c21 --- /dev/null +++ b/aten/src/ATen/PythonTorchFunctionTLS.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +namespace at { +namespace impl { + +struct TORCH_API PythonTorchFunctionTLS { + static void set_disabled(bool); + static bool is_disabled(); + + static void set_mode(std::shared_ptr); + static const std::shared_ptr& get_mode(); + static void swap_mode(std::shared_ptr&); + + static void set_state(const PythonTorchFunctionTLS& state); + static const PythonTorchFunctionTLS& get_state(); + +private: + bool disabled_; + std::shared_ptr mode_; +}; + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/ScalarOps.cpp b/aten/src/ATen/ScalarOps.cpp index 8eb10266d78f..98a38023f9b4 100644 --- a/aten/src/ATen/ScalarOps.cpp +++ b/aten/src/ATen/ScalarOps.cpp @@ -15,8 +15,8 @@ inline void fill_inplace(Tensor& self, const Scalar& value_scalar) { namespace detail { Tensor& scalar_fill(Tensor& self, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() { fill_inplace(self, value); 
}); return self; diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index 2029189912e6..adae0c7c0238 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ DeviceType SparseCsrTensorSetToDeviceType(DispatchKeySet key_set) { SparseCsrTensorImpl::SparseCsrTensorImpl( at::DispatchKeySet key_set, + at::Layout layout, const caffe2::TypeMeta data_type) : SparseCsrTensorImpl( key_set, @@ -44,6 +46,8 @@ SparseCsrTensorImpl::SparseCsrTensorImpl( at::initialTensorOptions() .device(SparseCsrTensorSetToDeviceType(key_set)) .dtype(data_type)) // values + , + layout ) {} SparseCsrTensorImpl::SparseCsrTensorImpl( @@ -51,30 +55,54 @@ SparseCsrTensorImpl::SparseCsrTensorImpl( const caffe2::TypeMeta data_type, at::Tensor crow_indices, at::Tensor col_indices, - at::Tensor values) + at::Tensor values, + at::Layout layout) : TensorImpl(key_set, data_type, values.device()), crow_indices_(std::move(crow_indices)), col_indices_(std::move(col_indices)), - values_(std::move(values)) { + values_(std::move(values)), + layout_(layout) { + // https://pytorch.org/blog/pytorch-feature-classification-changes/#beta + TORCH_WARN_ONCE("Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensor support is in beta state." + "If you miss a functionality in the sparse tensor support, please submit a feature request " + "to https://github.com/pytorch/pytorch/issues."); set_storage_access_should_throw(); + is_non_overlapping_and_dense_ = false; + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); + // TODO: If this check ever shows up as a bottleneck, which is unlikely given that + // comparing devices only involves comparing the type and index (two integers), we + // can move this to a DEBUG only assert. Until then this confirms and maintains a + // crucial invariance. 
+ TORCH_CHECK(values_.device() == crow_indices_.device(), "Values and crow_indices need to be on the same device."); + TORCH_CHECK(values_.device() == col_indices_.device(), "Values and col_indices need to be on the same device."); +} + +const char* SparseCsrTensorImpl::tensorimpl_type_name() const { + return "SparseCsrTensorImpl"; } void SparseCsrTensorImpl::resize_(int64_t nnz, IntArrayRef size) { - auto rows = size[0]; - auto cols = size[1]; + auto rows = size[size.size() - 2]; + auto cols = size[size.size() - 1]; auto old_crow_indices_size = crow_indices_.size(-1); - crow_indices_.resize_({rows + 1}); + + auto new_crow_indices_size = DimVector(size.slice(0, size.size() - 2)); + new_crow_indices_size.push_back(rows + 1); + crow_indices_.resize_(new_crow_indices_size); if (rows + 1 >= old_crow_indices_size) { crow_indices_.narrow(-1, old_crow_indices_size, rows + 1 - old_crow_indices_size).fill_(nnz); } else { crow_indices_.narrow(-1, rows, 1).fill_(std::min(nnz, rows*cols)); } - col_indices_.resize_({std::min(nnz, rows*cols)}); - values_.resize_({std::min(nnz, rows*cols)}); + auto col_indices_values_size = DimVector(size.slice(0, size.size() - 2)); + col_indices_values_size.push_back(std::min(nnz, rows*cols)); + col_indices_.resize_(col_indices_values_size); + values_.resize_(col_indices_values_size); sizes_and_strides_.set_sizes(size); } void SparseCsrTensorImpl::resize_as_sparse_csr_tensor_(const Tensor& src) { + set_layout(src.layout()); crow_indices_ = at::empty_like( src.crow_indices(), src.crow_indices().options(), @@ -112,5 +140,25 @@ void SparseCsrTensorImpl::set_member_tensors( sizes_and_strides_.set_sizes(size); refresh_numel(); + // TODO: If this check ever shows up as a bottleneck, which is unlikely given that + // comparing devices only involves comparing the type and index (two integers), we + // can move this to a DEBUG only assert. Until then this confirms and maintains a + // crucial invariance. 
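  // Example of what this invariant rules out (a sketch; crow_cpu/col_cpu are CPU index
  // tensors and values_cuda lives on a CUDA device):
  //
  //   auto bad = at::sparse_csr_tensor(crow_cpu, col_cpu, values_cuda, size, options);
  //
  // This is expected to throw rather than silently produce a tensor whose indices and
  // values are on different devices.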
+ TORCH_CHECK(values_.device() == crow_indices_.device(), "Values and crow_indices need to be on the same device."); + TORCH_CHECK(values_.device() == col_indices_.device(), "Values and col_indices need to be on the same device."); +} + +IntArrayRef SparseCsrTensorImpl::strides_custom() const { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have strides"); +} +void SparseCsrTensorImpl::set_size(int64_t dim, int64_t new_size) { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_size."); +} +void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_stride."); } +void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) { + TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset."); +} + } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index 850e0a02a448..174ce53a2dad 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -28,9 +28,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { Tensor crow_indices_; Tensor col_indices_; Tensor values_; + Layout layout_; public: - explicit SparseCsrTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); + explicit SparseCsrTensorImpl(at::DispatchKeySet, Layout layout, const caffe2::TypeMeta); void resize_(int64_t nnz, IntArrayRef size); void resize_as_sparse_csr_tensor_(const Tensor& src); @@ -40,10 +41,31 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { const Tensor& values, IntArrayRef size); - const Tensor& crow_indices() const { return crow_indices_; } - const Tensor& col_indices() const { return col_indices_; } + const Tensor& compressed_indices() const { return crow_indices_; } + const Tensor& plain_indices() const { return col_indices_; } const Tensor& values() const { return values_; } - int nnz() { return values_.size(0); } + int nnz() { return col_indices_.size(-1); } + + protected: + IntArrayRef strides_custom() const override; + + public: + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + void set_storage_offset(int64_t storage_offset) override; + Layout layout_impl() const override { return layout_; } + void set_layout(Layout layout) { + switch (layout) { + case kSparseCsr: + case kSparseCsc: + case kSparseBsr: + case kSparseBsc: + layout_ = layout; + break; + default: + TORCH_CHECK(false, "unsupported layout ", layout); + } + } /** * Return a TensorImpl that is a shallow-copy of this TensorImpl. 
@@ -54,7 +76,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); + auto impl = c10::make_intrusive(key_set(), layout_impl(), dtype()); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), @@ -73,7 +95,7 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); + auto impl = c10::make_intrusive(key_set(), layout_impl(), dtype()); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), @@ -89,7 +111,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { const caffe2::TypeMeta data_type, at::Tensor crow_indices, at::Tensor col_indices, - at::Tensor values); + at::Tensor values, + at::Layout layout); + + const char* tensorimpl_type_name() const override; /** * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / storage_offset) @@ -105,9 +130,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { TensorImpl::copy_tensor_metadata(src_sparse_impl, dest_sparse_impl, version_counter, allow_tensor_metadata_change); // Sparse-specific fields - dest_sparse_impl->crow_indices_ = src_sparse_impl->crow_indices(); - dest_sparse_impl->col_indices_ = src_sparse_impl->col_indices(); + dest_sparse_impl->crow_indices_ = src_sparse_impl->compressed_indices(); + dest_sparse_impl->col_indices_ = src_sparse_impl->plain_indices(); dest_sparse_impl->values_ = src_sparse_impl->values(); + dest_sparse_impl->layout_ = src_sparse_impl->layout_impl(); } }; } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorUtils.h b/aten/src/ATen/SparseCsrTensorUtils.h index 6dd328003ca8..dfc7ff881304 100644 --- a/aten/src/ATen/SparseCsrTensorUtils.h +++ b/aten/src/ATen/SparseCsrTensorUtils.h @@ -5,16 +5,151 @@ #include #include +#define AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(LAYOUT, NAME, ...) 
\ + [&] { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseCsc: \ + case kSparseBsr: \ + case kSparseBsc: \ + return __VA_ARGS__(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(LAYOUT, NAME, ROW_DIM_ACTION, COLUMN_DIM_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseBsr: \ + return (ROW_DIM_ACTION)(); \ + case kSparseCsc: \ + case kSparseBsc: \ + return (COLUMN_DIM_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(LAYOUT, NAME, NO_BLOCK_ACTION, BLOCK_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseCsc: \ + return (NO_BLOCK_ACTION)(); \ + case kSparseBsr: \ + case kSparseBsc: \ + return (BLOCK_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_ROW_COMPRESSED_LAYOUTS(LAYOUT, NAME, ROW_DIM_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseBsr: \ + return (ROW_DIM_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse row compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_COL_COMPRESSED_LAYOUTS(LAYOUT, NAME, COL_DIM_ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsc: \ + case kSparseBsc: \ + return (COL_DIM_ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse column compressed tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(LAYOUT, NAME, ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseCsr: \ + case kSparseCsc: \ + return (ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed (non-block) tensor layout but got ", the_layout); \ + } \ + } () + +#define AT_DISPATCH_SPARSE_COMPRESSED_BLOCK_LAYOUTS(LAYOUT, NAME, ACTION) \ + [&]() { \ + const auto& the_layout = LAYOUT; \ + switch (the_layout) { \ + case kSparseBsr: \ + case kSparseBsc: \ + return (ACTION)(); \ + default: \ + AT_ERROR(#NAME, " expected sparse compressed block tensor layout but got ", the_layout); \ + } \ + } () + namespace at { namespace sparse_csr { using SparseCsrTensor = Tensor; inline SparseCsrTensorImpl* get_sparse_csr_impl(const SparseCsrTensor& self) { - AT_ASSERTM( - self.is_sparse_csr(), - "_internal_get_SparseCsrTensorImpl: not a sparse CSR tensor"); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "get_sparse_csr_impl", [&] {}); return static_cast(self.unsafeGetTensorImpl()); } -} // namespace sparse + +inline std::string layoutToString(Layout layout, bool upper=false, bool lower=false) { + switch (layout) { + case kSparseCsr: return (upper ? "CSR" : (lower ? "csr" : "Csr")); + case kSparseCsc: return (upper ? "CSC" : (lower ? "csc" : "Csc")); + case kSparseBsr: return (upper ? "BSR" : (lower ? "bsr" : "Bsr")); + case kSparseBsc: return (upper ? "BSC" : (lower ? 
"bsc" : "Bsc")); + default: + TORCH_CHECK(false, "Not a sparse compressed layout:", layout); + return ""; + } +} + +inline bool isCompressedRow(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "isCompressedRow", [&]{ return true; }, [&]{ return false; }); +} + +inline bool isCompressedColumn(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "isCompressedColumn", [&]{ return false; }, [&]{ return true; }); +} + +inline std::string compressedIndicesName(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "compressedIndicesName", [&]{ return "crow_indices"; }, [&]{ return "ccol_indices"; }); +} + +inline std::string plainIndicesName(Layout layout) { + return AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "plainIndicesName", [&]{ return "col_indices"; }, [&]{ return "row_indices"; }); +} + +inline int rowDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedRow(layout) ? 2 : 1); +} + +inline int columnDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedColumn(layout) ? 2 : 1); +} + +inline int compressedDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedRow(layout) ? 2 : 1); +} + +inline int plainDimension(Layout layout, IntArrayRef size) { + return size.size() - (isCompressedRow(layout) ? 1 : 2); +} + +} // namespace sparse_csr } // namespace at diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index e144ffd479d6..61303866c450 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -51,7 +51,7 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ is_non_overlapping_and_dense_ = false; set_storage_access_should_throw(); - set_has_contiguity_policy(HasContiguityPolicy::ContiguityNotSupported); + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); } void SparseTensorImpl::release_resources() { @@ -60,12 +60,6 @@ void SparseTensorImpl::release_resources() { indices_.reset(); } -IntArrayRef SparseTensorImpl::strides() const { - AT_ERROR("sparse tensors do not have strides"); -} -int64_t SparseTensorImpl::stride(int64_t d) const { - AT_ERROR("sparse tensors do not have strides"); -} void SparseTensorImpl::set_size(int64_t dim, int64_t new_size) { AT_ERROR("sparse tensors do not have set_size"); } diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index a52236d9369b..7381540ea3ff 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -54,8 +54,6 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { Tensor indices() const { return indices_; } Tensor values() const { return values_; } - IntArrayRef strides() const override; - int64_t stride(int64_t d) const override; void set_size(int64_t dim, int64_t new_size) override; void set_stride(int64_t dim, int64_t new_stride) override; void set_storage_offset(int64_t storage_offset) override; diff --git a/aten/src/ATen/SparseTensorUtils.cpp b/aten/src/ATen/SparseTensorUtils.cpp index d5811b933e7c..712e85e851be 100644 --- a/aten/src/ATen/SparseTensorUtils.cpp +++ b/aten/src/ATen/SparseTensorUtils.cpp @@ -30,7 +30,7 @@ Tensor flatten_indices(const Tensor& indices, IntArrayRef full_size, bool force_ } } else { std::vector indices_mult_cpu_vec; - indices_mult_cpu_vec.reserve(sparse_dim); + indices_mult_cpu_vec.resize(sparse_dim); int64_t mult = 1; for (int64_t i = sparse_dim - 1; i >= 0; i--) { indices_mult_cpu_vec[i] = 
mult; diff --git a/aten/src/ATen/Tensor.h b/aten/src/ATen/Tensor.h index 1dfb8bb4ffcb..0b3719cca3bf 100644 --- a/aten/src/ATen/Tensor.h +++ b/aten/src/ATen/Tensor.h @@ -1,3 +1,3 @@ #pragma once -#include +#include diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index 20ab6bb6690c..164a7b279129 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -1,10 +1,30 @@ #include -#include -#include +#include +#include namespace at { +// See TensorGeometry.h on why this is useful now that we cache is_contiguous. +bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { + assert(!overflows(sizes.size())); + auto dim = static_cast(sizes.size()); + int64_t expected_stride = 1; + bool contig_if_nonempty = true; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes[i] == 0) { + return true; + } + if (contig_if_nonempty) { + if (sizes[i] != 1 && strides[i] != expected_stride) { + contig_if_nonempty = false; + } + expected_stride *= sizes[i]; + } + } + return contig_if_nonempty; +} + bool TensorGeometry::is_contiguous() const { if (numel_ == 0) { return true; diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index ad3e16da4a6a..7762cc94df61 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -1,10 +1,17 @@ #pragma once -#include -#include +#include +#include namespace at { +// Return if the tensor geometry represented by `sizes` and `strides` is contiguous +// Although we cache is_contiguous in tensor now, this is till useful because it +// allows checking if a particular geometry is contiguous without explicitly +// constructing a tensor, e.g., when you want to choose a kernel strategy based +// on whether a subgeometry is contiguous. 
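The use case described in the comment above looks roughly like the sketch below; the slicing logic and the function name are hypothetical, while `at::geometry_is_contiguous` and the `TensorBase` accessors are the real APIs from this patch:

```cpp
#include <ATen/TensorGeometry.h>
#include <ATen/core/TensorBase.h>

// Hypothetical kernel-strategy check: is the sub-geometry formed by the last
// two dimensions contiguous? No temporary tensor is constructed.
bool inner_matrix_is_contiguous(const at::TensorBase& t) {
  const auto sizes = t.sizes();
  const auto strides = t.strides();
  TORCH_CHECK(sizes.size() >= 2, "expected at least 2 dimensions");
  return at::geometry_is_contiguous(sizes.slice(sizes.size() - 2),
                                    strides.slice(strides.size() - 2));
}
```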
+TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); + struct TORCH_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} @@ -21,7 +28,7 @@ struct TORCH_API TensorGeometry { numel_ = expected_stride; } - explicit TensorGeometry(const Tensor& t) + explicit TensorGeometry(const TensorBase& t) : sizes_(t.sizes().vec()) , strides_(t.strides().vec()) , storage_offset_(t.storage_offset()) @@ -32,12 +39,12 @@ struct TORCH_API TensorGeometry { int64_t dim() const { return sizes_.size(); } int64_t size(int64_t dim) const { - dim = maybe_wrap_dim(dim, this->dim()); + dim = c10::maybe_wrap_dim(dim, this->dim()); return sizes_.at(static_cast(dim)); } IntArrayRef sizes() const { return IntArrayRef{ sizes_ }; } int64_t stride(int64_t dim) const { - dim = maybe_wrap_dim(dim, this->dim()); + dim = c10::maybe_wrap_dim(dim, this->dim()); return strides_.at(static_cast(dim)); } IntArrayRef strides() const { return IntArrayRef{ strides_ }; } diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 71c9c3feb9e7..8352b510f609 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -217,7 +217,7 @@ static inline Tensor applySelect( int64_t dim, int64_t index, int64_t real_dim, - const at::Device& self_device, + const at::Device& /*self_device*/, const IntArrayRef& self_sizes) { TORCH_CHECK_INDEX( !(index == 0 && dim == 0 && self_sizes.size() == 0), @@ -272,7 +272,7 @@ static inline void recordTensorIndex(const Tensor& tensor, std::vector& (*dim_ptr)++; }; -static inline c10::List> typeConvertIndices(const Tensor& self, std::vector&& indices) { +static inline c10::List> typeConvertIndices(const Tensor& /*self*/, std::vector&& indices) { c10::List> converted_inds; converted_inds.reserve(indices.size()); for (const auto &i: indices){ diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index f978456754d9..907ec8c5c57d 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #define TORCH_ASSERT_NO_OPERATORS #include #undef TORCH_ASSERT_NO_OPERATORS @@ -13,6 +14,13 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #include #include @@ -564,19 +572,19 @@ void TensorIteratorBase::allocate_or_resize_outputs() { // can just return contiguous output // it is faster because it avoids allocating 0 size tensor and // resizing and restriding it - set_output(i, tensor_shape, {}, original_options(op), names_); + set_output_raw_strided(i, tensor_shape, {}, original_options(op), names_); } else { auto tensor_stride = invert_perm(op.stride_bytes); for (const auto dim : c10::irange(ndim())) { tensor_stride[dim] /= element_size; } - set_output(i, tensor_shape, tensor_stride, original_options(op), names_); + set_output_raw_strided(i, tensor_shape, tensor_stride, original_options(op), names_); } op.current_dtype = op.target_dtype; } else if (op.tensor_base().defined()) { // Even if we don't resize, we still need to tell set_output about // the output, so that we properly set guard and propagate names - set_output(i, op.tensor_base().sizes(), {}, original_options(op), names_); + set_output_raw_strided(i, op.tensor_base().sizes(), {}, original_options(op), names_); } } } @@ -1326,7 +1334,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); } - 
set_output(i, shape_, {}, original_options(op).memory_format(MemoryFormat::Contiguous), names_); + set_output_raw_strided(i, shape_, {}, original_options(op).memory_format(MemoryFormat::Contiguous), names_); } break; } @@ -1337,7 +1345,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); } - set_output(i, shape_, {}, original_options(op).memory_format(MemoryFormat::ChannelsLast), names_); + set_output_raw_strided(i, shape_, {}, original_options(op).memory_format(MemoryFormat::ChannelsLast), names_); } break; } @@ -1354,7 +1362,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); } - set_output(i, shape_, tensor_base(i_defined).strides(), original_options(op), names_); + set_output_raw_strided(i, shape_, tensor_base(i_defined).strides(), original_options(op), names_); } break; } @@ -1485,8 +1493,10 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // Nothing beyond this point is important for meta functions, so it's fine to exit early here. // Extend the condition to ORT tesnors as ORT tensors also don't have storage. if (common_device_.type() == DeviceType::XLA || + common_device_.type() == DeviceType::IPU || common_device_.type() == DeviceType::Lazy || - common_device_.type() == DeviceType::ORT) return; + common_device_.type() == DeviceType::ORT || + common_device_.type() == DeviceType::HPU) return; for (auto& op : operands_) { TORCH_INTERNAL_ASSERT(op.tensor_base().defined()); @@ -1501,14 +1511,14 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { view_offsets_ = DimVector(ndim_offsets, 0); } -// This is the structured kernels implementation of set_output. It is +// This is the structured kernels' implementation of set_output. It is // NEVER actually called directly; instead, a subclass of TensorIteratorBase // will override set_output to actually do the operation, and then call // set_output on the TensorIteratorBase to setup TI's metadata. // The precondition for this function is that maybe_get_output() now // unconditionally returns a real Tensor (prior to output setting, // this function may return an undefined tensor.) -void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { +void TensorIteratorBase::set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { auto& op = operands_[output_idx]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); const auto& t = maybe_get_output(output_idx); @@ -1575,7 +1585,7 @@ void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntAr // This is the "traditional" implementation of set_output. On TensorIterator // instances, it is invoked directly from various call sites in this file. No // funny business. 
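To make the override relationship concrete after the rename: a subclass allocates the real output in its `set_output_raw_strided` and then calls the `TensorIteratorBase` version so TI's metadata stays in sync. The class below is an illustrative sketch, not part of the patch; the generated structured-kernel classes follow the same shape:

```cpp
#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

struct MyIterator final : public at::TensorIteratorBase {
  at::Tensor out_;

  const at::Tensor& maybe_get_output(int64_t /*output_idx*/) override {
    return out_;
  }

  void set_output_raw_strided(int64_t output_idx, at::IntArrayRef sizes,
                              at::IntArrayRef strides, at::TensorOptions options,
                              at::DimnameList names) override {
    // Materialize the real output first...
    if (!out_.defined()) {
      out_ = at::empty(sizes, options);
    }
    // ...then let TensorIteratorBase record it, so that maybe_get_output()
    // now returns a defined tensor (the precondition described above).
    at::TensorIteratorBase::set_output_raw_strided(
        output_idx, sizes, strides, options, names);
  }
};
```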
-void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { +void TensorIterator::set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { // NB: intentionally no superclass call auto& op = operands_[output_idx]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index 1c485e84f16d..d09f6e77e95a 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -415,7 +415,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { return true; } - void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; + void set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; #define TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, maybestatic) \ maybestatic void methodname(TensorBase&& out, const TensorBase& a, const TensorBase& b) = delete; \ @@ -591,7 +591,7 @@ struct TORCH_API TensorIterator final : public TensorIteratorBase { #undef TORCH_DISALLOW_TEMPORARIES_IMPL const Tensor& maybe_get_output(int64_t output_idx) override; - void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; + void set_output_raw_strided(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; }; class TORCH_API TensorIteratorConfig final { diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h index 128bb67aa970..5608046db598 100644 --- a/aten/src/ATen/TensorMeta.h +++ b/aten/src/ATen/TensorMeta.h @@ -2,6 +2,7 @@ #include #include +#include #include C10_CLANG_DIAGNOSTIC_PUSH() @@ -62,7 +63,10 @@ namespace impl { // // A notable subclass of this interface is TensorIteratorBase. struct TORCH_API MetaBase { - virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) = 0; + virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { + set_output_raw_strided(output_idx, sizes, strides, options, names); + } + virtual const Tensor& maybe_get_output(int64_t output_idx) = 0; void set_output(IntArrayRef sizes, TensorOptions options) { set_output(0, sizes, {}, options, {}); @@ -70,6 +74,59 @@ struct TORCH_API MetaBase { void set_output(int64_t output_idx, IntArrayRef sizes, TensorOptions options) { set_output(output_idx, sizes, {}, options, {}); } + + // See: https://github.com/pytorch/pytorch/issues/69813 + // Whenever defining the output properties in the META function of a structured + // kernel (what was usually done with `set_output`), use one of these 3 variants, + // instead. In order to decide which variant to use, check the following + // decision tree: + // + // - Can the kernel you are going to implement support output tensors + // with arbitrary strides? + // | + // -- YES: `set_output_raw_strided` + // | + // -- NO: Should the output tensor strides be contiguous? + // | + // -- YES: `set_output_contiguous` + // | + // -- NO: `set_output_strided` + // + // Use this function whenever the kernel requires specific strides for the output. 
+ // If `strides` does not match the given output strides, proxy outputs will be + // created and passed to the IMPL function. + virtual void set_output_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options, + DimnameList names = {}) { + TORCH_INTERNAL_ASSERT(false, "set_output_strided not implemented."); + } + + // Use this function whenever the kernel knows how to handle arbitrary strided outputs. + // This function has the same behavior as the old `set_output`: it will only + // re-stride if the given output was resized. + virtual void set_output_raw_strided( + int64_t output_idx, + IntArrayRef sizes, + IntArrayRef strides_hint, + TensorOptions options, + DimnameList names = {}) { + TORCH_INTERNAL_ASSERT(false, "set_output_strided not implemented."); + } + + // Use this function if the kernel requires contiguous strides. + // Alias for `set_output_strided`, but with contiguous strides. + void set_output_contiguous( + int64_t output_idx, + IntArrayRef sizes, + TensorOptions options, + DimnameList names = {}) { + auto strides = c10::contiguous_strides(sizes); + set_output_strided(output_idx, sizes, strides, options, names); + } + // Returns a reference to an undefined tensor if there is no presupplied // output const Tensor& maybe_get_output() { return maybe_get_output(0); } diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index 7f5517bc0811..e9f5e7d26e11 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -28,8 +28,7 @@ constexpr auto kFunctorchWrappedTensors = DispatchKeySet({ constexpr auto kTensorSubclassLike = kFunctorchWrappedTensors | DispatchKeySet({ DispatchKey::Batched, - DispatchKey::SparseCPU, - DispatchKey::SparseCUDA, + DispatchKey::Sparse, DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA, DispatchKey::Meta, diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 754c73bb6154..7fbddd7a3482 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -129,15 +129,15 @@ void checkAllSameNumel(CheckedFrom c, ArrayRef tensors) { } void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { - if (! (t1->is_cuda()) || ! (t2->is_cuda())) { + if (t1->is_cpu() || t2->is_cpu()) { std::ostringstream oss; - if (! t1->is_cuda()) { + if (t1->is_cpu()) { oss << "Tensor for " << t1 << " is on CPU, "; } - if (! t2->is_cuda()) { + if (t2->is_cpu()) { oss << "Tensor for " << t2 << " is on CPU, "; } - oss << "but expected " << ((!(t1->is_cuda() || t2->is_cuda())) ? "them" : "it") + oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it") << " to be on GPU (while checking arguments for " << c << ")"; AT_ERROR(oss.str()); } @@ -264,25 +264,6 @@ void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; } -// See TensorUtils.h on why this is useful now that we cache is_contiguous. 
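Returning to the `set_output_*` decision tree added in TensorMeta.h above: in a structured kernel's META function it plays out roughly as follows. The operator names are invented and assume the usual structured-kernel codegen boilerplate exists for them; only the `set_output_raw_strided` and `set_output_contiguous` calls come from this patch.

```cpp
namespace at { namespace meta {

// Kernel whose IMPL can write through arbitrary output strides: keep the
// old `set_output` behavior (only re-stride the output if it was resized).
TORCH_META_FUNC(my_elementwise_op)(const Tensor& self) {
  set_output_raw_strided(0, self.sizes(), {}, self.options());
}

// Kernel whose IMPL assumes a contiguous output: if a user-supplied `out=`
// tensor has different strides, a contiguous proxy is created and the result
// is copied back after the IMPL runs.
TORCH_META_FUNC(my_reduction_op)(const Tensor& self) {
  set_output_contiguous(0, self.sizes(), self.options());
}

}} // namespace at::meta
```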
-bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { - int64_t dim = sizes.size(); - int64_t expected_stride = 1; - bool contig_if_nonempty = true; - for (int64_t i = dim - 1; i >= 0; i--) { - if (sizes[i] == 0) { - return true; - } - if (contig_if_nonempty) { - if (sizes[i] != 1 && strides[i] != expected_stride) { - contig_if_nonempty = false; - } - expected_stride *= sizes[i]; - } - } - return contig_if_nonempty; -} - void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index f018c33f1aea..e8adf16ca183 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -138,13 +138,6 @@ TORCH_API void checkLayout(CheckedFrom c, at::ArrayRef tensors, at::Layo TORCH_API void* maybe_data_ptr(const Tensor& tensor); TORCH_API void* maybe_data_ptr(const TensorArg& tensor); -// Return if the tensor geometry represented by `sizes` and `strides` is contiguous -// Although we cache is_contiguous in tensor now, this is till useful because it -// allows checking if a particular geometry is contiguous without explicitly -// constructing a tensor, e.g., when you want to choose a kernel strategy based -// on whether a subgeometry is contiguous. -TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); - TORCH_API void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 3e3d4d6a9573..8315ddad97b2 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -13,13 +13,13 @@ ThreadLocalState::ThreadLocalState() : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), debug_info_(c10::ThreadLocalDebugInfo::current()), functorch_tls_(functorch::getCopyOfFuncTorchTLS()), - autograd_tls_(c10::AutogradState::get_tls_state()) { + autograd_tls_(c10::AutogradState::get_tls_state()), + python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()) { rf_tls_ = at::get_record_function_tls_(); saved_tensors_default_hooks_ = at::SavedTensorDefaultHooks::get_stack(); - bumped_record_all_functions_ = at::checkRecordAllFunctions(); - python_mode_state_ = at::impl::PythonModeTLS::get_state(); + torch_dispatch_mode_state_ = at::impl::TorchDispatchModeTLS::get_state(); } void ThreadLocalState::set_grad_mode(bool enabled) { @@ -33,7 +33,9 @@ void ThreadLocalState::setThreadLocalState( // restore the dispatch key set TLS at the same time. 
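For context, the `ThreadLocalState` / `ThreadLocalStateGuard` pair being extended in this hunk (it now also carries the torch_dispatch mode and `__torch_function__` TLS) is used to propagate thread-local state into worker threads. A minimal sketch of that pattern; `run_async` is a hypothetical helper:

```cpp
#include <ATen/ThreadLocalState.h>
#include <functional>
#include <thread>

void run_async(std::function<void()> fn) {
  // Snapshot the calling thread's TLS (grad mode, dispatch keys, torch
  // dispatch mode, __torch_function__ state, ...).
  at::ThreadLocalState state;
  std::thread worker([state, fn = std::move(fn)]() {
    // Restore it on the worker thread for the duration of fn(); the guard
    // puts the previous state back when it goes out of scope.
    at::ThreadLocalStateGuard guard(state);
    fn();
  });
  worker.join();
}
```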
c10::AutogradState::set_tls_state(state.autograd_tls_); - at::impl::PythonModeTLS::set_state(state.python_mode_state_); + at::impl::TorchDispatchModeTLS::set_state(state.torch_dispatch_mode_state_); + + at::impl::PythonTorchFunctionTLS::set_state(state.python_torch_function_state_); at::set_record_function_tls_(state.rf_tls_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index c5f14518f422..3818827d479b 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -9,7 +9,8 @@ #include #include -#include +#include +#include namespace at { @@ -53,14 +54,15 @@ class TORCH_API ThreadLocalState { // TLS for AutogradModes AutogradState autograd_tls_; - std::shared_ptr python_mode_state_; + // TLS for enable_torch_dispatch_mode + std::shared_ptr torch_dispatch_mode_state_; + + // TLS for __torch_function__ (mode and disable_torch_function) + at::impl::PythonTorchFunctionTLS python_torch_function_state_; // TLS for saved tensors default hooks std::stack> saved_tensors_default_hooks_; - // Whether pre-sampling RecordFunction optimization was enabled - bool bumped_record_all_functions_ = false; - friend class ThreadLocalStateGuard; }; @@ -68,21 +70,7 @@ class TORCH_API ThreadLocalState { class TORCH_API ThreadLocalStateGuard { public: explicit ThreadLocalStateGuard(const ThreadLocalState& state) - : prev_state_(ThreadLocalState()), - bumped_record_all_functions_(state.bumped_record_all_functions_) { - // Special handling of RecordFunction pre-sampling optimization: - // pre-samping is enabled (bumped) when there're non-sampled - // (or high-frequency) global or TLS callbacks. - // - // ThreadLocalStateGuard simply resets RecordFunction's TLS and - // hence its thread local callbacks. - // - // Checking if the pre-sampling was enabled and preserving it in the - // async task by calling bumpRecordAllFunctions() and the corresponding - // releaseRecordAllFunctions() - if (bumped_record_all_functions_) { - at::bumpRecordAllFunctions(); - } + : prev_state_(ThreadLocalState()) { // set the given state across the thread boundary ThreadLocalState::setThreadLocalState(state); } @@ -90,15 +78,10 @@ class TORCH_API ThreadLocalStateGuard { ~ThreadLocalStateGuard() { // restore previously set variables ThreadLocalState::setThreadLocalState(prev_state_); - if (bumped_record_all_functions_) { - at::releaseRecordAllFunctions(); - } } private: const ThreadLocalState prev_state_; - // Whether pre-sampling RecordFunction optimization was enabled - bool bumped_record_all_functions_ = false; }; template diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index 9160cbe2fedd..36b0785400ba 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -91,29 +91,6 @@ std::array check_intlist(ArrayRef list, const char * name, return res; } -/** - * Utility function to static cast input Generator* to - * the backend generator type (CPU/CUDAGeneratorImpl etc.) - */ -template -static inline T * check_generator(c10::optional gen) { - TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); - TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); - TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); - return gen->get(); -} - -/** - * Utility function used in tensor implementations, which - * supplies the default generator to tensors, if an input generator - * is not supplied. 
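The two generator helpers being removed from Utils.h here (and re-added verbatim to `ATen/core/Generator.h` later in this diff) are used by RNG kernels roughly as sketched below. The kernel itself is hypothetical; `get_generator_or_default`, `CPUGeneratorImpl`, and the per-generator mutex convention are the existing pattern.

```cpp
#include <ATen/ATen.h>
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/core/DistributionsHelper.h>
#include <mutex>

// Hypothetical CPU RNG kernel: fall back to the default CPU generator when
// the caller does not pass one, then draw uniforms under the generator lock.
at::Tensor my_uniform_like(const at::Tensor& self,
                           c10::optional<at::Generator> gen) {
  auto* generator = at::get_generator_or_default<at::CPUGeneratorImpl>(
      gen, at::detail::getDefaultCPUGenerator());
  auto out = at::empty_like(self, self.options().dtype(at::kFloat));
  // Generators are not thread safe; hold the generator's mutex while using it.
  std::lock_guard<std::mutex> lock(generator->mutex_);
  at::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  float* data = out.data_ptr<float>();
  for (int64_t i = 0; i < out.numel(); ++i) {
    data[i] = uniform(generator);
  }
  return out;
}
```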
The input Generator* is also static casted to - * the backend generator type (CPU/CUDAGeneratorImpl etc.) - */ -template -static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { - return gen.has_value() && gen->defined() ? check_generator(gen) : check_generator(default_gen); -} - using at::detail::check_size_nonnegative; namespace detail { diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index e8cfa3e6b553..4b9da640fb76 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -205,7 +205,7 @@ std::string show_config() { // TODO: do HIP // TODO: do XLA - // TODO: do MLC + // TODO: do MPS return ss.str(); } diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 24fe684c6dc6..bb3fdd484992 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace at { @@ -74,7 +75,7 @@ static inline int64_t legacy_cat_wrap_dim(int64_t dim, const std::vector&), lower_precision_fp) @@ -474,46 +494,18 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp) KERNEL_CPU(ADD_NS(linear), "linear", Tensor (const Tensor &, const Tensor &, const c10::optional &), lower_precision_fp) KERNEL_CPU(ADD_NS(_convolution), "_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), lower_precision_fp) + KERNEL_CPU(ADD_NS(_convolution), "_convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool, bool), lower_precision_fp) + KERNEL_CPU(ADD_NS(matmul), "matmul", Tensor (const Tensor &, const Tensor &), lower_precision_fp) + KERNEL_CPU(ADD_NS(conv_tbc), "conv_tbc", Tensor(const Tensor &, const Tensor &, const Tensor &, int64_t), lower_precision_fp) // fp32 cast policy KERNEL_CPU(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(conv_transpose3d), "conv_transpose3d.input", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(batch_norm), "batch_norm", Tensor (const Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, bool, double, double, bool), fp32) - - KERNEL_CPU(ADD_NS(dropout), "dropout", Tensor (const Tensor &, double, bool), fp32) - KERNEL_CPU(ADD_NS(avg_pool1d), "avg_pool1d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool), fp32) - KERNEL_CPU(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(avg_pool3d), "avg_pool3d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) - KERNEL_CPU(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d", Tensor (const Tensor &, IntArrayRef, c10::optional), fp32) - 
KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact1d), "_upsample_nearest_exact1d", Tensor (const Tensor &, IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact1d), "_upsample_nearest_exact1d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest2d), "upsample_nearest2d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest2d), "upsample_nearest2d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact2d), "_upsample_nearest_exact2d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact2d), "_upsample_nearest_exact2d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest3d), "upsample_nearest3d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_nearest3d), "upsample_nearest3d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact3d), "_upsample_nearest_exact3d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(_upsample_nearest_exact3d), "_upsample_nearest_exact3d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_linear1d), "upsample_linear1d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_linear1d), "upsample_linear1d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32) - KERNEL_CPU(ADD_NS(binary_cross_entropy), "binary_cross_entropy", Tensor (const Tensor &, const Tensor &, const c10::optional&, int64_t), fp32) - KERNEL_CPU(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32) - KERNEL_CPU(ADD_NS(instance_norm), "instance_norm", Tensor (const Tensor &, const c10::optional&, const c10::optional&, const c10::optional&, const c10::optional&, bool, double, double, bool), fp32) KERNEL_CPU(ADD_NS(grid_sampler), "grid_sampler", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) KERNEL_CPU(ADD_NS(polar), "polar", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(multinomial), "multinomial", Tensor(const Tensor &, int64_t, bool, c10::optional), fp32) - KERNEL_CPU(ADD_NS(poisson), "poisson", Tensor(const Tensor &, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fmod), "fmod.Tensor", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(fmod), "fmod.Scalar", Tensor(const Tensor &, const Scalar &), fp32) KERNEL_CPU(ADD_NS(prod), "prod", Tensor(const 
Tensor &, c10::optional), fp32) KERNEL_CPU(ADD_NS(prod), "prod.dim_int", Tensor(const Tensor &, int64_t, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(prod), "prod.dim_Dimname", Tensor(const Tensor &, at::Dimname, bool, c10::optional), fp32) @@ -522,36 +514,22 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(nanquantile), "nanquantile", Tensor(const Tensor &, const Tensor &, c10::optional, bool, c10::string_view), fp32) KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.scalar", Tensor(const Tensor &, double, c10::optional, bool, c10::string_view), fp32) KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional, c10::optional, const c10::optional &, bool, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(stft), "stft.center", Tensor(const Tensor &, int64_t, c10::optional, c10::optional, const c10::optional &, bool, c10::string_view, bool, c10::optional, c10::optional), fp32) KERNEL_CPU(ADD_NS(cdist), "cdist", Tensor(const Tensor &, const Tensor &, double, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cross), "cross", Tensor(const Tensor &, const Tensor &, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumprod), "cumprod", Tensor(const Tensor &, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumprod), "cumprod.dimname", Tensor(const Tensor &, at::Dimname, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumsum), "cumsum", Tensor(const Tensor &, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(cumsum), "cumsum.dimname", Tensor(const Tensor &, at::Dimname, c10::optional), fp32) - KERNEL_CPU(ADD_NS(diag), "diag", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(diagflat), "diagflat", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(histc), "histc", Tensor(const Tensor &, int64_t, const at::Scalar &, const at::Scalar &), fp32) - KERNEL_CPU(ADD_NS(logcumsumexp), "logcumsumexp", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Tensor", Tensor(const Tensor &, const Tensor &, bool, bool, const c10::optional, const c10::optional &), fp32) - KERNEL_CPU(ADD_NS(searchsorted), "searchsorted.Scalar", Tensor(const Tensor &, const at::Scalar &, bool, bool, const c10::optional, const c10::optional &), fp32) + KERNEL_CPU(ADD_NS(grid_sampler_2d), "grid_sampler_2d", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(_grid_sampler_2d_cpu_fallback), "_grid_sampler_2d_cpu_fallback", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) + KERNEL_CPU(ADD_NS(grid_sampler_3d), "grid_sampler_3d", Tensor(const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) KERNEL_CPU(ADD_NS(trace), "trace", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(tril), "tril", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(triu), "triu", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(vander), "vander", Tensor(const Tensor &, c10::optional, bool), fp32) KERNEL_CPU(ADD_NS(view_as_complex), "view_as_complex", Tensor(const Tensor &), fp32) KERNEL_CPU(ADD_NS(cholesky), "cholesky", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(cholesky_inverse), "cholesky_inverse", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(cholesky_solve), "cholesky_solve", Tensor(const Tensor &, const Tensor &, bool), fp32) - KERNEL_CPU(ADD_NS(dot), "dot", Tensor(const Tensor &, const Tensor &), fp32) KERNEL_CPU(ADD_NS(inverse), "inverse", Tensor(const Tensor &), fp32) KERNEL_CPU(ADD_NS(lu_solve), "lu_solve", Tensor(const Tensor &, const Tensor &, const Tensor &), fp32) 
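Every line in this cast-policy table follows the same shape; a hypothetical new entry would look like the sketch below. The op name is invented, and the snippet only makes sense inside this translation unit (within the `TORCH_LIBRARY_IMPL(aten, AutocastCPU, m)` block, where `ADD_NS` and `KERNEL_CPU` are defined) and for an op that actually exists in `native_functions.yaml`:

```cpp
// Hypothetical: under CPU autocast, always run "my_op" in float32, casting
// lower-precision inputs up before redispatching. The last argument selects
// the cast policy (e.g. lower_precision_fp or fp32, as used in this file).
KERNEL_CPU(ADD_NS(my_op), "my_op",
           Tensor (const Tensor &, const Tensor &), fp32)
```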
KERNEL_CPU(ADD_NS(matrix_rank), "matrix_rank", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(orgqr), "orgqr", Tensor(const Tensor &, const Tensor &), fp32) KERNEL_CPU(ADD_NS(ormqr), "ormqr", Tensor(const Tensor &, const Tensor &, const Tensor &, bool, bool), fp32) KERNEL_CPU(ADD_NS(pinverse), "pinverse", Tensor(const Tensor &, double), fp32) - KERNEL_CPU(ADD_NS(vdot), "vdot", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(im2col), "im2col", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(col2im), "col2im", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(max_pool3d), "max_pool3d", Tensor(const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, bool), fp32) KERNEL_CPU(ADD_NS(max_unpool2d), "max_unpool2d", Tensor(const Tensor &, const Tensor &, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(max_unpool3d), "max_unpool3d", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef), fp32) @@ -561,18 +539,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(replication_pad1d), "replication_pad1d", Tensor(const Tensor &, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(replication_pad2d), "replication_pad2d", Tensor(const Tensor &, IntArrayRef), fp32) KERNEL_CPU(ADD_NS(replication_pad3d), "replication_pad3d", Tensor(const Tensor &, IntArrayRef), fp32) - KERNEL_CPU(ADD_NS(elu), "elu", Tensor(const Tensor &, const Scalar &, const Scalar &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(hardshrink), "hardshrink", Tensor(const Tensor &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(hardsigmoid), "hardsigmoid", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(hardswish), "hardswish", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(log_sigmoid), "log_sigmoid", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(prelu), "prelu", Tensor(const Tensor &, const Tensor &), fp32) - KERNEL_CPU(ADD_NS(selu), "selu", Tensor(const Tensor &), fp32) - KERNEL_CPU(ADD_NS(celu), "celu", Tensor(const Tensor &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(softplus), "softplus", Tensor(const Tensor &, const Scalar &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(softshrink), "softshrink", Tensor(const Tensor &, const Scalar &), fp32) - KERNEL_CPU(ADD_NS(group_norm), "group_norm", Tensor(const Tensor &, int64_t, const c10::optional &, const c10::optional &, double, bool), fp32) - KERNEL_CPU(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32) KERNEL_CPU(ADD_NS(mse_loss), "mse_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32) KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.IntList", Tensor(const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t, int64_t, bool), fp32) KERNEL_CPU(ADD_NS(ctc_loss), "ctc_loss.Tensor", Tensor(const Tensor &, const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32) @@ -580,25 +546,26 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor(const Tensor &, const Tensor &, int64_t), fp32) KERNEL_CPU(ADD_NS(fft_fft), "fft_fft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_ifft), "fft_ifft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_fft2), "fft_fft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_ifft2), "fft_ifft2", Tensor(const Tensor &, 
c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_fftn), "fft_fftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_ifftn), "fft_ifftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_fft2), "fft_fft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ifft2), "fft_ifft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_fftn), "fft_fftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_ifftn), "fft_ifftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_rfft), "fft_rfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_irfft), "fft_irfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_rfft2), "fft_rfft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_irfft2), "fft_irfft2", Tensor(const Tensor &, c10::optional, at::IntArrayRef, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_rfftn), "fft_rfftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) - KERNEL_CPU(ADD_NS(fft_irfftn), "fft_irfftn", Tensor(const Tensor &, c10::optional, c10::optional, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_rfft2), "fft_rfft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_irfft2), "fft_irfft2", Tensor(const Tensor &, at::OptionalIntArrayRef, at::IntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_rfftn), "fft_rfftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) + KERNEL_CPU(ADD_NS(fft_irfftn), "fft_irfftn", Tensor(const Tensor &, at::OptionalIntArrayRef, at::OptionalIntArrayRef, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_hfft), "fft_hfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) KERNEL_CPU(ADD_NS(fft_ihfft), "fft_ihfft", Tensor(const Tensor &, c10::optional, int64_t, c10::optional), fp32) - KERNEL_CPU(ADD_NS(conv_tbc), "conv_tbc", Tensor(const Tensor &, const Tensor &, const Tensor &, int64_t), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm", Tensor(const Tensor &, const at::Scalar &, at::IntArrayRef, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_norm), "linalg_matrix_norm.str_ord", Tensor(const Tensor &, c10::string_view, at::IntArrayRef, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond", Tensor(const Tensor &, const c10::optional &), fp32) KERNEL_CPU(ADD_NS(linalg_cond), "linalg_cond.p_str", Tensor(const Tensor &, c10::string_view), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank", Tensor(const Tensor &, double, bool), fp32) KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.tol_tensor", Tensor(const Tensor &, const Tensor &, bool), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.atol_rtol_tensor", Tensor(const Tensor &, const c10::optional &, const c10::optional &, bool), fp32) + KERNEL_CPU(ADD_NS(linalg_matrix_rank), "linalg_matrix_rank.atol_rtol_float", Tensor(const Tensor &, c10::optional, c10::optional, bool), fp32) KERNEL_CPU(ADD_NS(linalg_solve), "linalg_solve", Tensor(const Tensor &, const Tensor 
&), fp32) KERNEL_CPU(ADD_NS(linalg_cholesky), "linalg_cholesky", Tensor(const Tensor &, bool), fp32) KERNEL_CPU(ADD_NS(linalg_svdvals), "linalg_svdvals", Tensor(const Tensor &), fp32) @@ -607,33 +574,8 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(linalg_inv), "linalg_inv", Tensor(const Tensor &), fp32) KERNEL_CPU(ADD_NS(linalg_householder_product), "linalg_householder_product", Tensor(const Tensor &, const Tensor &), fp32) KERNEL_CPU(ADD_NS(linalg_tensorinv), "linalg_tensorinv", Tensor(const Tensor &, int64_t), fp32) - KERNEL_CPU(ADD_NS(linalg_tensorsolve), "linalg_tensorsolve", Tensor(const Tensor &, const Tensor &, c10::optional), fp32) + KERNEL_CPU(ADD_NS(linalg_tensorsolve), "linalg_tensorsolve", Tensor(const Tensor &, const Tensor &, at::OptionalIntArrayRef), fp32) KERNEL_CPU(ADD_NS(fake_quantize_per_tensor_affine), "fake_quantize_per_tensor_affine", Tensor (const Tensor &, double, int64_t, int64_t, int64_t), fp32) - KERNEL_CPU(ADD_NS(glu), "glu", Tensor (const Tensor &, int64_t), fp32) - - m.impl(TORCH_SELECTIVE_NAME("aten::cummax"), - TORCH_FN((&WrapFunction (const Tensor &, int64_t), - std::tuple (const Tensor &, int64_t), - &ADD_NS(cummax)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::cummax.dimname"), - TORCH_FN((&WrapFunction (const Tensor &, at::Dimname), - std::tuple (const Tensor &, at::Dimname), - &ADD_NS(cummax)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::cummin"), - TORCH_FN((&WrapFunction (const Tensor &, int64_t), - std::tuple (const Tensor &, int64_t), - &ADD_NS(cummin)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::cummin.dimname"), - TORCH_FN((&WrapFunction (const Tensor &, at::Dimname), - std::tuple (const Tensor &, at::Dimname), - &ADD_NS(cummin)>::type::call))); m.impl(TORCH_SELECTIVE_NAME("aten::eig"), TORCH_FN((&WrapFunction (const Tensor &, bool, bool), &ADD_NS(_lu_with_info)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::lu_unpack"), - TORCH_FN((&WrapFunction (const Tensor &, const Tensor &, bool, bool), - std::tuple (const Tensor &, const Tensor &, bool, bool), - &ADD_NS(lu_unpack)>::type::call))); m.impl(TORCH_SELECTIVE_NAME("aten::qr"), TORCH_FN((&WrapFunction (const Tensor &, bool), &ADD_NS(qr)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::solve"), - TORCH_FN((&WrapFunction (const Tensor &, const Tensor &), - std::tuple (const Tensor &, const Tensor &), - &ADD_NS(solve)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::svd"), TORCH_FN((&WrapFunction (const Tensor &, bool, bool), @@ -707,17 +638,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { std::tuple (const Tensor &, IntArrayRef, IntArrayRef, const Tensor &), &ADD_NS(fractional_max_pool3d)>::type::call))); - m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool1d"), - TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef), - std::tuple (const Tensor &, IntArrayRef), - &ADD_NS(adaptive_max_pool1d)>::type::call))); - - m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool2d"), - TORCH_FN((&WrapFunction (const Tensor &, IntArrayRef), - std::tuple (const Tensor &, IntArrayRef), - &ADD_NS(adaptive_max_pool2d)>::type::call))); m.impl(TORCH_SELECTIVE_NAME("aten::adaptive_max_pool3d"), TORCH_FN((&WrapFunction + +// Forward declarations of core ATen types used in dispatch functions +namespace c10 { + +template +class optional; +template +class List; +template +class IListRef; +class Stream; +class Scalar; +class SymInt; +class SymIntList; +struct Storage; +struct TensorOptions; +template +class ArrayRef; +template +class OptionalArrayRef; + +} // 
namespace c10 + +namespace at { + +class Tensor; +class OptionalTensorRef; +struct Dimname; +struct Generator; +using TensorList = c10::ArrayRef; +using ITensorListRef = c10::IListRef; +using IOptTensorListRef = c10::IListRef; +using DimnameList = c10::ArrayRef; +using IntArrayRef = c10::ArrayRef; +using OptionalIntArrayRef = c10::OptionalArrayRef; + +using c10::Stream; +using c10::Storage; +using c10::QScheme; +using c10::Scalar; +using c10::SymInt; +using c10::SymIntList; +using c10::TensorOptions; + +} // namespace at diff --git a/aten/src/ATen/core/ATen_pch.h b/aten/src/ATen/core/ATen_pch.h index 8e8d354d8fe8..10b5b53b933b 100644 --- a/aten/src/ATen/core/ATen_pch.h +++ b/aten/src/ATen/core/ATen_pch.h @@ -98,6 +98,8 @@ #include #include #include +#include +#include #include #include #include @@ -105,6 +107,7 @@ #include #include #include +#include #include #include @@ -153,13 +156,14 @@ #include #include #include +#include #include #include #include -#include #include #include +#include #include #include #include diff --git a/aten/src/ATen/core/DimVector.h b/aten/src/ATen/core/DimVector.h index 6e9e2c037a5f..576b9e142ebf 100644 --- a/aten/src/ATen/core/DimVector.h +++ b/aten/src/ATen/core/DimVector.h @@ -1,13 +1,13 @@ #pragma once - -#include -#include +#include namespace at { -constexpr size_t kDimVectorStaticSize = 5; +// Re-declaring 'DimVector' type and size inside 'at' namespace. +// This is done to avoid modifying every use into their 'c10' +// equivalent. -/// A container for sizes or strides -using DimVector = SmallVector; +using c10::kDimVectorStaticSize; +using c10::DimVector; } // namespace at diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h index 6205fc4210f9..1ef6fb0f3c2e 100644 --- a/aten/src/ATen/core/DistributionsHelper.h +++ b/aten/src/ATen/core/DistributionsHelper.h @@ -158,7 +158,7 @@ template ::value || \ !has_member_set_next_##TYPE##_normal_sample::value \ ), int> = 0> \ -C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* generator, ret_type* ret) { \ +C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type* /*ret*/) { \ return false; \ } \ \ @@ -174,7 +174,7 @@ template ::value \ ), int> = 0> \ -C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* generator, ret_type cache) { \ +C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type /*cache*/) { \ } DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(double); diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index f3122daf2cc6..832059ed1980 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -12,6 +12,28 @@ namespace c10 { std::ostream& operator<<(std::ostream & out, Backend b) { return out << toString(b); } + +std::ostream& operator<<(std::ostream & out, Scalar s) { + if (s.isFloatingPoint()) { + return out << s.toDouble(); + } + if (s.isComplex()) { + return out << s.toComplexDouble(); + } + if (s.isBoolean()) { + return out << (s.toBool() ? 
"true" : "false"); + } + if (s.isIntegral(false)) { + return out << s.toLong(); + } + throw std::logic_error("Unknown type in Scalar"); +} + +std::string toString(Scalar s) { + std::stringstream out; + out << s; + return out.str(); +} } namespace at { diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h index 55cfe7b3bdf7..6dcfc6c7b3cd 100644 --- a/aten/src/ATen/core/Formatting.h +++ b/aten/src/ATen/core/Formatting.h @@ -1,12 +1,15 @@ #pragma once -#include -#include #include +#include +#include +#include namespace c10 { TORCH_API std::ostream& operator<<(std::ostream& out, Backend b); +TORCH_API std::ostream& operator<<(std::ostream & out, Scalar s); +TORCH_API std::string toString(Scalar s); } namespace at { @@ -19,21 +22,4 @@ static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) { return print(out,t,80); } TORCH_API void print(const Tensor & t, int64_t linesize=80); - -static inline std::ostream& operator<<(std::ostream & out, Scalar s) { - if (s.isFloatingPoint()) { - return out << s.toDouble(); - } - if (s.isComplex()) { - return out << s.toComplexDouble(); - } - if (s.isBoolean()) { - return out << (s.toBool() ? "true" : "false"); - } - if (s.isIntegral(false)) { - return out << s.toLong(); - } - throw std::logic_error("Unknown type in Scalar"); -} - } diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 1e6e8d54fa72..60323f3d3a00 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -138,6 +138,29 @@ Generator make_generator(Args&&... args) { return Generator(c10::make_intrusive(std::forward(args)...)); } +/** + * Utility function to static cast input Generator* to + * the backend generator type (CPU/CUDAGeneratorImpl etc.) + */ +template +static inline T * check_generator(c10::optional gen) { + TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); + TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); + TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); + return gen->get(); +} + +/** + * Utility function used in tensor implementations, which + * supplies the default generator to tensors, if an input generator + * is not supplied. The input Generator* is also static casted to + * the backend generator type (CPU/CUDAGeneratorImpl etc.) + */ +template +static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { + return gen.has_value() && gen->defined() ? check_generator(gen) : check_generator(default_gen); +} + namespace detail { /** diff --git a/aten/src/ATen/core/IListRef.h b/aten/src/ATen/core/IListRef.h new file mode 100644 index 000000000000..442bc7bfabf7 --- /dev/null +++ b/aten/src/ATen/core/IListRef.h @@ -0,0 +1,610 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +/* + * [Note: IListRef] + * Wrapper around different API containers (e.g. boxed and unboxed). + * + * What is it? + * =========== + * It is a tagged union of both boxed and unboxed API containers. + * Working implementations: + * + * - `IListRef` + * - `IListRef` + * + * Note that `IListRef` is a view type. Meaning that it won't own the + * tensors it holds. It's intended to be used only as argument parameters. + * Specifically, where these 2 worlds overlap. + * + * What is this for? 
+ * ================= + * Historically, PyTorch has maintained 2 different APIs: the unboxed + * (called from C++ API and Python eager mode) and boxed APIs (called + * from the TorchScript JIT, mobile interpreter, and boxed fallbacks). + * + * Calling unboxed kernels from the boxed "world" and vice-versa may + * result in non-negligible overhead. Lists are one of those types: + * + * - Boxed world: `c10::List` + * - Unboxed world: `c10::ArrayRef` + * + * In this context, `c10::IListRef` solves this problem by wrapping those + * 2 container types, so that we don't need to convert from one to + * the other. + * + * (see https://github.com/pytorch/pytorch/issues/66328) + * + * What does it do? + * ================ + * This container wraps around the different tagged containers + * (currently, only boxed and unboxed), without incurring in extra + * overhead for converting from one to another. It does so while + * exposing usual container methods, which dispatch to corresponding + * implementations. + * + * While it works with different container types, it introduces + * overhead for repeatedly calling member functions (since those will + * get dispatched, again). Therefore, you should only use it to iterate + * through the list up to one time. If you need to do more complex things, + * call `materialize()` first. + * + * Adding support for a new Tag + * ============================ + * Suppose we want to add a new tag: `Chest`. Here are the steps + * we would have to go through: + * + * 1. Add a line for it in the macro `TORCH_ILISTREF_FORALL_TAGS`. + * + * #define TORCH_ILISTREF_FORALL_TAGS(_, ...) \ + * ... + * _(Chest, ##__VA_ARGS__) + * + * 2. Add type aliases, union members, and constructors. + * + * template + * class IListRef { + * ... + * using chest_type = + * typename detail::IListRefTagImpl::list_type; + * ... + * IListRef(...) : tag_(IListRefTag::Chest) { + * ... + * } + * ... + * union Payload { + * ... + * chest_type chest; + * ... + * }; + * ... + * }; + * + * 3. Add a default implementation for it (in 'IListRef_inl.h'). It's + * preferable to make the default implementation work for `T = Tensor` + * (both `Unboxed` and `Boxed` do it). + * + * template + * class IListRefTagImplBase { + * public: + * using elem_type = ListElemT; + * using list_type = ChestContainer; + * + * static const list_type& unwrap(const IListRef& ilist) { ... } + * + * static typename list_type::const_iterator& unwrap( + * IListRefIterator& it) { ... } + * + * static const typename list_type::const_iterator& unwrap( + * const IListRefIterator& it) { ... } + * + * static IListRefConstRef iterator_get( + * const typename list_type::const_iterator& it) { ... } + * } + * + * 4. Add an specialization for each of the already supported types. + * Finally, for consistency, add them to the tracking list. + * (see [Note: IListRefTagImpl Specializations]) + * + * template <> + * class IListRefTagImpl + * : public IListRefTagImplBase {}; + * + * Adding support for a new Type + * ============================= + * Suppose we want to add support for a new type: `Matrix`. + * Here are the steps we would have to go through: + * + * 1. Add an specialization for each of the existing tags. + * For consistency, add them to the tracking list. + * (see [Note: IListRefTagImpl Specializations]) + * + * template <> + * class IListRefTagImpl + * : public IListRefTagImplBase {}; + * + * template <> + * class IListRefTagImpl + * : public IListRefTagImplBase {}; + * + * Common Problems + * =============== + * 1. 
One of `IListRef(Iterator)` methods are failing to compile. + * + * That may be happening because the container type you added + * is not compatible with the code written for that method. If + * that's true, then you might have to transform that code into + * a static method call (see `List::operator[]` method). + * + * 2. Can't make `IListRefIterator::operator*` return a const-reference. + * + * First, keep in mind that we assume that boxed containers will + * have to deal with `IValue` (e.g. `c10::List`). In this context, + * what may be happening is that `IValue` doesn't store internally + * your type `T`. Instead, it constructs a type new `T` everytime + * you try to get `T` for it (see `IListRef`). + */ + +namespace c10 { +template +class IListRef; + +/* + * Applies arbitrary macros to each `IListRefTag`. + */ +#define TORCH_ILISTREF_FORALL_TAGS(_, ...) \ + _(Unboxed, ##__VA_ARGS__) \ + _(Boxed, ##__VA_ARGS__) \ + _(Materialized, ##__VA_ARGS__) + +/* + * Defines a "switch-case" for `TAG`. Inside, it executes `BODY`, + * while bringing to scope: + * + * - `ImplT`: the implementation class for `TAG` + * - `this_`: the result of unwrapping `this` + */ +#define TORCH_ILISTREF_UNWRAP_CASE(TAG, BODY) \ + case c10::IListRefTag::TAG: { \ + using ImplT = c10::detail::IListRefTagImpl; \ + auto& this_ = ImplT::unwrap(*this); \ + BODY \ + } break; + +/* + * Dispatches the unwrap call, depending on `TAG`, followed by + * the execution of `BODY`. It aborts if `TAG` is not a `IListRefTag`. + * + * This macro is useful because it allows us to handle different + * types (that correspond to different tags) to be implemented + * only once. We can do it even when the implementation of the + * different tags aren't syntatically the same, by dispatching + * it to a function (e.g. `ImplT::(this_)`). + */ +#define TORCH_ILISTREF_UNWRAP(TAG, BODY) \ + switch (TAG) { \ + TORCH_ILISTREF_FORALL_TAGS(TORCH_ILISTREF_UNWRAP_CASE, BODY) \ + break; \ + default: \ + TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag."); \ + } + +enum class IListRefTag { +#define DEFINE_TAG(tag, ...) tag, + TORCH_ILISTREF_FORALL_TAGS(DEFINE_TAG) +#undef DEFINE_TAG + None +}; + +namespace detail { +/* + * Type alias that specifies whether we return a reference or a copy of `T`. + * + * What is this for? + * ================= + * Since values in the boxed world are represented by an `IValue`, we also + * depend on whether it can be converted to a const-reference (`Tensor`) or + * has to create a new copy of `T` (`OptionalTensorRef`). + */ +template +using IListRefConstRef = typename ivalue_to_const_ref_overload_return::type; + +/* + * Interface that implements key functions for each `IListRefTag` type. + * + * What is this for? + * ================= + * Given an `IListRef(Iterator)`, some methods have to be implemented + * differently for each `TAG`. Therefore, the methods inside this class + * are used as dispatch targets for the different `IListRefTag` values. + * + * You should create an specialization of this class for each possible + * combination of `IListRefTag` type (except `None`) and element types + * (e.g. `Tensor`). + * + * What does it do? + * ================ + * 1. defines static methods to be used as dispatch targets by both + * `IListRef` and `IListRefIterator` (see the implementation of + * `IListRefTagImplBase`). + * + * 2. defines the `elem_type` and `list_type` aliases that will be + * used in the definition of `IListRef`. In general, we should do + * so by inheriting from `IListRefTagImplBase`. 
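Putting the usage guidance above into code: iterate an `IListRef` at most once directly, or `materialize()` it first when elements are needed repeatedly, and note that both boxed and unboxed callers can pass their native containers without a copy. A sketch with invented helper names; it assumes `c10::IListRef<at::Tensor>` is usable via the headers added in this patch (plus the `Tensor` specializations from `IListRef_inl.h`):

```cpp
#include <ATen/ATen.h>
#include <ATen/core/IListRef.h>
#include <ATen/core/List.h>

// Single pass: iterating the IListRef directly is fine (each iterator step
// goes through one tag dispatch).
int64_t total_numel(c10::IListRef<at::Tensor> tensors) {
  int64_t n = 0;
  for (const at::Tensor& t : tensors) {
    n += t.numel();
  }
  return n;
}

// Repeated / random access: materialize() once, trading a dynamic allocation
// for the per-access dispatch. Elements come back as reference wrappers.
at::Tensor first_and_last_sum(c10::IListRef<at::Tensor> tensors) {
  auto materialized = tensors.materialize();
  const at::Tensor& first = materialized.front();
  const at::Tensor& last = materialized.back();
  return first.sum() + last.sum();
}

void demo() {
  auto a = at::ones({2, 2});
  auto b = at::zeros({3});
  // Unboxed caller (C++ eager): ArrayRef-backed IListRef, no copy.
  std::vector<at::Tensor> vec = {a, b};
  total_numel(at::ArrayRef<at::Tensor>(vec));
  // Boxed caller (TorchScript / mobile): c10::List-backed IListRef, no copy.
  c10::List<at::Tensor> boxed({a, b});
  first_and_last_sum(boxed);
}
```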
+ * + * [Note: IListRefTagImpl Specialization] + * ====================================== + * For `IListRef(Iterator)`: + * - + * - + * - + * + * For `IListRef(Iterator)`: + * - + * - + * - + */ +template +class IListRefTagImpl {}; + +/* + * Base implementation of `IListRefTagImpl` methods. + * + * What is this for? + * ================= + * This should make adding specializations for new types easier. For + * example, one should be able to add a new type just by making its + * `IListRefTagImpl` specialization inherit from `IListRefTagImplBase`. + * + * You should create a partial specialization for this class only if + * you introduce a new `IListRefTag`. The idea being that there is one + * default implementation for each possible value of `IListRefTag`. + * + * What does it do? + * ================ + * 1. defines `elem_type` as an alias to `ListElemT`. + * + * 1. defines `list_type` as an alias to the default container type + * that will hold a collection of `elem_type`. The idea being that + * all types tagged as `TAG` will have `list_type` as its container, + * with different `elem_type`. + * + * 3. defines the default implementation for each of the methods that + * are supposed to be defined on `IListRefTagImpl` specializations. + * + * 4. inheriting from `IListRefTagImplBase` also means + * that the payload of the type `IListRef` will be of type `list_type` + * when it is tagged as `TAG`. + */ +template +class IListRefTagImplBase {}; + +/* + * Materialized container for `IListRef`. + * + * What is this for? + * ================= + * Container that groups `T` references together. This exchanges the + * overhead of every method call from `IListRef` for a dynamic allocation. + * + * You should use this container instead of `IListRef` if: + * + * - You are going to iterate the list more than once + * - You need to repeatedly access arbitrary elements (using `operator[]`) + * What does it do? + + * ================ + * Removes the reference (&) from the type, and wraps it into a + * `std::reference_wrapper`. If `IListRefConstRef` is not a + * reference type, then it's left unchanged. + */ +template +using _MaterializedIListRefElem = typename std::conditional< + std::is_reference::value, + typename std::reference_wrapper::type>, + T>::type; + +template +using MaterializedIListRef = std::vector<_MaterializedIListRefElem>>; + +} // namespace detail + +/* + * Iterator for `IListRef`. + * + * What is it? + * =========== + * Currently, a `std::bidirectional_iterator` that wraps the iterator + * types defined for each of the `IListRefTag`. + * + * One should be able to use it, as if it were the unwrapped + * iterators themselves. + + * What does it do? + * ================ + * Similarly to `IListRef`, this is a wrapper class. Specifically, it + * wraps each container's `const_iterator` type alias. So, for example, + * given that the container for `IListRefTag::Boxed` is `c10::List`, this + * iterator will wrap a `c10::List::const_iterator`. + * + * [Note: MSVC Iterator Debug] + * =========================== + * MSVC `vector::iterator` implementation (used in the boxed variant) + * makes it so this union's destructor, copy-constructor (assignment), and + * move-constructor (assignment) are implicitly deleted. + * + * Therefore, we need to explicitly define them as needed. Follows a list + * of places where these are needed and their reason: + * + * - `Payload` destructor: + * it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is set to 2. 
+ * + * - `IListRefIterator` destructor: + * same as above. However, we need to explicitly call the variant + * destructor explicitly. + * + * - `IListRefIterator` copy-constructor: + * it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is different + * than 0. + */ +template +class IListRefIterator : public std::iterator { + private: +#define DEFINE_FRIEND_CLASS(TAG, ...) \ + friend class detail::IListRefTagImpl; \ + friend class detail::IListRefTagImplBase< \ + IListRefTag::TAG, \ + T, \ + typename detail::IListRefTagImpl::elem_type>; + TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS) +#undef DEFINE_FRIEND_CLASS + + public: + using unboxed_iterator_type = typename detail:: + IListRefTagImpl::list_type::const_iterator; + using boxed_iterator_type = typename detail:: + IListRefTagImpl::list_type::const_iterator; + using materialized_iterator_type = + typename detail::MaterializedIListRef::const_iterator; + + IListRefIterator() : tag_(IListRefTag::None) {} + +#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL != 0 + // See [Note: MSVC Iterator Debug] + IListRefIterator(const IListRefIterator& iterator) + : tag_(iterator.tag_) { + switch (tag_) { + case IListRefTag::Boxed: + payload_.boxed_iterator = iterator.payload_.boxed_iterator; + case IListRefTag::Unboxed: + payload_.unboxed_iterator = iterator.payload_.unboxed_iterator; + default: + TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag."); + } + } +#endif + +#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL == 2 + // See [Note: MSVC Iterator Debug] + ~IListRefIterator() { + switch (tag_) { + case IListRefTag::Boxed: + payload_.boxed_iterator.~boxed_iterator_type(); + case IListRefTag::Unboxed: + payload_.unboxed_iterator.~unboxed_iterator_type(); + default: + TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag."); + } + } +#endif + + IListRefIterator(boxed_iterator_type boxed) : tag_(IListRefTag::Boxed) { + payload_.boxed_iterator = boxed; + } + + IListRefIterator(unboxed_iterator_type unboxed) : tag_(IListRefTag::Unboxed) { + payload_.unboxed_iterator = unboxed; + } + + IListRefIterator(materialized_iterator_type materialized) : tag_(IListRefTag::Materialized) { + payload_.materialized_iterator = materialized; + } + + detail::IListRefConstRef operator*() const { + TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::iterator_get(this_); }); + } + + IListRefIterator& operator++() { + TORCH_ILISTREF_UNWRAP(tag_, { ++this_; }); + return *this; + } + + IListRefIterator operator++(int) { + auto old = *this; + TORCH_ILISTREF_UNWRAP(tag_, { ++this_; }); + return old; + } + + IListRefIterator& operator--() { + TORCH_ILISTREF_UNWRAP(tag_, { --this_; }); + return *this; + } + + IListRefIterator operator--(int) { + auto old = *this; + TORCH_ILISTREF_UNWRAP(tag_, { --this_; }); + return old; + } + + bool operator==(const IListRefIterator& rhs) const { + if (tag_ != rhs.tag_) { + return false; + } + TORCH_ILISTREF_UNWRAP(tag_, { + auto& rhs_it = ImplT::unwrap(rhs); + return this_ == rhs_it; + }); + } + + bool operator!=(const IListRefIterator& rhs) const { + return !(*this == rhs); + } + + private: + union Payload { + boxed_iterator_type boxed_iterator; + unboxed_iterator_type unboxed_iterator; + materialized_iterator_type materialized_iterator; + void* _init_ptr; + Payload() : _init_ptr(nullptr) {} +#if defined(_MSC_VER) + // See [Note: MSVC Iterator Debug] + ~Payload() {} +#endif + }; + + Payload payload_; + IListRefTag tag_; +}; + +/* + * See [Note: IListRef] + */ +template +class IListRef { + private: +#define DEFINE_FRIEND_CLASS(TAG, ...) 
\ + friend class detail::IListRefTagImpl; \ + friend class detail::IListRefTagImplBase< \ + IListRefTag::TAG, \ + T, \ + typename detail::IListRefTagImpl::elem_type>; + TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS) +#undef DEFINE_FRIEND_CLASS + + public: + using unboxed_type = + typename detail::IListRefTagImpl::list_type; + using boxed_type = + typename detail::IListRefTagImpl::list_type; + using materialized_type = + typename detail::MaterializedIListRef; + + using iterator = IListRefIterator; + using const_iterator = IListRefIterator; + using value_type = typename iterator::value_type; + + IListRef() : tag_(IListRefTag::None) {} + + IListRef(const boxed_type& boxed) : tag_(IListRefTag::Boxed) { + payload_.boxed = &boxed; + } + + IListRef(const unboxed_type& unboxed) : tag_(IListRefTag::Unboxed) { + payload_.unboxed = unboxed; + } + + IListRef(const std::initializer_list& list) : tag_(IListRefTag::Unboxed) { + payload_.unboxed = at::ArrayRef(list); + } + + template < + typename... UnboxedConstructorArgs, + typename = std::enable_if_t< + std::is_constructible::value>> + IListRef(UnboxedConstructorArgs&&... args) : tag_(IListRefTag::Unboxed) { + payload_.unboxed = unboxed_type(std::forward(args)...); + } + + IListRef(const materialized_type& materialized) : tag_(IListRefTag::Materialized) { + payload_.materialized = &materialized; + } + + size_t size() const { + TORCH_ILISTREF_UNWRAP(tag_, { return this_.size(); }); + } + + bool empty() const { + return size() == 0; + } + + iterator begin() const { + TORCH_ILISTREF_UNWRAP(tag_, { return this_.begin(); }); + } + + iterator end() const { + TORCH_ILISTREF_UNWRAP(tag_, { return this_.end(); }); + } + + detail::IListRefConstRef front() const { + TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::front(this_); }); + } + + /* + * Materializes the `IListRef` into a `std::vector`. + * + * This should be used when one wishes to either: + * + * - iterate over the list more than once: each `IListRefIterator` + * member function call has to go through a switch, introducing + * non-negligible overhead + * + * - randomly access an arbitrary element using `operator[]`: + * same reason as above + */ + detail::MaterializedIListRef materialize() const { + if (isMaterialized()) { + return toMaterialized(); + } + + detail::MaterializedIListRef materialized; + materialized.reserve(size()); + for (const auto& t : *this) { + materialized.emplace_back(t); + } + return materialized; + } + +#define DEFINE_CHECK(TAG, ...) \ + bool is##TAG() const { \ + return tag_ == IListRefTag::TAG; \ + } + TORCH_ILISTREF_FORALL_TAGS(DEFINE_CHECK); +#undef DEFINE_CHECK + + bool isNone() const { + return tag_ == IListRefTag::None; + } + +#define DEFINE_CASTING(TAG, ...) 
\ + const typename detail::IListRefTagImpl::list_type& \ + to##TAG() const { \ + TORCH_INTERNAL_ASSERT(is##TAG()); \ + return detail::IListRefTagImpl::unwrap(*this); \ + } + TORCH_ILISTREF_FORALL_TAGS(DEFINE_CASTING); +#undef DEFINE_CASTING + + private: + union Payload { + const boxed_type* boxed; + unboxed_type unboxed; + const materialized_type* materialized; + Payload() : boxed(nullptr) {} + ~Payload() {} + }; + + Payload payload_; + IListRefTag tag_; +}; + +} // namespace c10 + +#include diff --git a/aten/src/ATen/core/IListRef_inl.h b/aten/src/ATen/core/IListRef_inl.h new file mode 100644 index 000000000000..a14bcfddae2d --- /dev/null +++ b/aten/src/ATen/core/IListRef_inl.h @@ -0,0 +1,201 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; +class OptionalTensorRef; +} + +namespace c10 { +namespace detail { + +/* + * Specializations of `IListRefTagImplBase` that implement the default + * implementation for `IListRefTag::Unboxed`. + */ +template +class IListRefTagImplBase { + public: + using elem_type = ListElemT; + using list_type = ArrayRef; + + /* + * These `unwrap` static methods unwraps the inner containers out + * of `IListRef` (and `IListRefIterator`). They are required when + * the macro `TORCH_ILISTREF_UNWRAP` is called. + */ + static const list_type& unwrap(const IListRef& ilist) { + return ilist.payload_.unboxed; + } + + static typename list_type::const_iterator& unwrap(IListRefIterator& it) { + return it.payload_.unboxed_iterator; + } + + static const typename list_type::const_iterator& unwrap( + const IListRefIterator& it) { + return it.payload_.unboxed_iterator; + } + + /* + * We have these function (besides the `unwrap`s above) because the + * implementation for both `IListRef::operator[]` and `IListRefIterator::operator*` + * weren't syntatically equal for the existing tags at the time + * (`Unboxed` and `Boxed`). + */ + static IListRefConstRef front(const list_type& lst) { + return lst.front(); + } + + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + return *it; + } +}; + +/* + * Specializations of `IListRefTagImplBase` that implement the default + * implementation for `IListRefTag::Boxed`. + */ +template +class IListRefTagImplBase { + public: + using elem_type = ListElemT; + using list_type = List; + + static const list_type& unwrap(const IListRef& ilist) { + return *ilist.payload_.boxed; + } + + static typename list_type::const_iterator& unwrap(IListRefIterator& it) { + return it.payload_.boxed_iterator; + } + + static const typename list_type::const_iterator& unwrap( + const IListRefIterator& it) { + return it.payload_.boxed_iterator; + } + + static IListRefConstRef front(const list_type& lst) { + return lst[0]; + } + + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + return (*it).get().toTensor(); + } +}; + +/* + * Specializations of `IListRefTagImplBase` that implement the default + * implementation for `IListRefTag::Materialized`. 
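Editorial aside: the materialized element type discussed earlier is just a type function over the per-element return type. When the accessor yields a const reference (the `Tensor` case), the materialized vector stores `std::reference_wrapper`s; when it yields a value (the `OptionalTensorRef` case), it stores plain copies. The sketch below reproduces that with standard-library tools only; `Payload` and `Handle` are made-up placeholder types, not PyTorch ones.

#include <functional>
#include <type_traits>
#include <vector>

struct Payload {};              // accessed by const reference (like const Tensor&)
struct Handle { int id = 0; };  // accessed by value (like an optional-ref type)

// Same shape as _MaterializedIListRefElem: wrap references, keep values as-is.
template <typename T>
using MaterializedElem = typename std::conditional<
    std::is_reference<T>::value,
    std::reference_wrapper<typename std::remove_reference<T>::type>,
    T>::type;

static_assert(
    std::is_same<MaterializedElem<const Payload&>,
                 std::reference_wrapper<const Payload>>::value,
    "const-reference elements are stored as reference_wrappers");
static_assert(
    std::is_same<MaterializedElem<Handle>, Handle>::value,
    "by-value elements are stored as plain copies");

// A materialized list is then just a vector of those elements.
using MaterializedPayloadList = std::vector<MaterializedElem<const Payload&>>;
using MaterializedHandleList  = std::vector<MaterializedElem<Handle>>;

int main() {
  Payload p;
  MaterializedPayloadList by_ref;
  by_ref.emplace_back(p);        // stores a reference, no copy of Payload

  MaterializedHandleList by_val;
  by_val.push_back(Handle{42});  // stores a copy of the small handle
  return by_ref.size() + by_val.size() == 2 ? 0 : 1;
}

This is why materializing trades one dynamic allocation for cheap repeated access: the expensive elements are held by reference, and only the cheap handle-like elements are copied.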
+ */ +template +class IListRefTagImplBase> { + public: + using elem_type = _MaterializedIListRefElem; + using list_type = MaterializedIListRef; + + static const list_type& unwrap(const IListRef& ilist) { + return *ilist.payload_.materialized; + } + + static typename list_type::const_iterator& unwrap(IListRefIterator& it) { + return it.payload_.materialized_iterator; + } + + static const typename list_type::const_iterator& unwrap( + const IListRefIterator& it) { + return it.payload_.materialized_iterator; + } + + static IListRefConstRef front(const list_type& lst) { + return lst[0]; + } + + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + return *it; + } +}; + +/* + * [Note: ITensorListRef] + * Specializations necessary for `IListRef` type. + * + * Since the default implementations are usually done with supporting + * `Tensor` in mind, we only have to inherit from the base implementations. + */ +template <> +class IListRefTagImpl + : public IListRefTagImplBase {}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase {}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase< + IListRefTag::Materialized, + at::Tensor, + _MaterializedIListRefElem> {}; + +/* + * [Note: IOptTensorListRef] + * Specializations necessary for `IListRef` type. + * + * We can't get an `at::OptionalTensorRef` directly from an instance of + * `List>` (the type that corresponds to the boxed world). + * + * So, the default implementation won't help us. Thus, we have to implement + * this method ourselves. + */ +template <> +class IListRefTagImpl + : public IListRefTagImplBase {}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase> { + + public: + /* + * Given an instance of the types corresponding to the `Boxed` tag, we override + * the default implementation, so that we can return a `at::OptionalTensorRef`. + */ + static IListRefConstRef iterator_get( + const typename list_type::const_iterator& it) { + const auto& ivalue = (*it).get(); + if (!ivalue.isNone()) { + const auto& tensor = ivalue.toTensor(); + return (tensor.defined()) ? tensor : at::OptionalTensorRef{}; + } + return {}; + } +}; + +template <> +class IListRefTagImpl + : public IListRefTagImplBase< + IListRefTag::Materialized, + at::OptionalTensorRef, + _MaterializedIListRefElem> {}; + +} // namespace detail +} // namespace c10 + +namespace at { + +// [Note: ITensorListRef] +using ITensorListRef = c10::IListRef; +using ITensorListRefIterator = c10::IListRefIterator; +using MaterializedITensorListRef = c10::detail::MaterializedIListRef; +// [Note: IOptTensorListRef] +using IOptTensorListRef = c10::IListRef; +using IOptTensorListRefIterator = c10::IListRefIterator; +using MaterializedIOptTensorListRef = c10::detail::MaterializedIListRef; + +} // namespace at diff --git a/aten/src/ATen/core/IListRef_test.cpp b/aten/src/ATen/core/IListRef_test.cpp new file mode 100644 index 000000000000..1a609de74f80 --- /dev/null +++ b/aten/src/ATen/core/IListRef_test.cpp @@ -0,0 +1,254 @@ +#include +#include +#include +#include +#include +#include + +using namespace c10; + +static std::vector get_tensor_vector() { + std::vector tensors; + const size_t SIZE = 5; + for (size_t i = 0; i < SIZE; i++) { + tensors.emplace_back(at::empty({0})); + } + return tensors; +} + +static std::vector> get_boxed_opt_tensor_vector() { + std::vector> optional_tensors; + const size_t SIZE = 5; + for (size_t i = 0; i < SIZE * 2; i++) { + auto opt_tensor = (i % 2 == 0) ? 
optional(at::empty({0})) : nullopt; + optional_tensors.emplace_back(opt_tensor); + } + return optional_tensors; +} + +static std::vector get_unboxed_opt_tensor_vector() { + std::vector optional_tensors; + const size_t SIZE = 5; + for (size_t i = 0; i < SIZE * 2; i++) { + auto opt_tensor = (i % 2 == 0) ? at::OptionalTensorRef(at::empty({0})) + : at::OptionalTensorRef(); + optional_tensors.emplace_back(opt_tensor); + } + return optional_tensors; +} + +template +void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) { + EXPECT_EQ(thing.size(), list.size()); + size_t i = 0; + for (const auto& t : list) { + const at::Tensor& other = thing[i]; + EXPECT_EQ(other.use_count(), use_count); + EXPECT_TRUE(other.is_same(t)); + i++; + } +} + +TEST(ITensorListRefTest, CtorEmpty_IsNone_Throws) { + at::ITensorListRef list; + EXPECT_TRUE(list.isNone()); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) + EXPECT_THROW(list.size(), c10::Error); +} + +TEST(ITensorListRefTest, CtorBoxed_IsBoxed) { + auto vec = get_tensor_vector(); + List boxed(vec); + at::ITensorListRef list(boxed); + EXPECT_TRUE(list.isBoxed()); +} + +TEST(ITensorListRefTest, CtorUnboxed_IsUnboxed) { + auto vec = get_tensor_vector(); + at::ArrayRef unboxed(vec); + at::ITensorListRef list(unboxed); + EXPECT_TRUE(list.isUnboxed()); +} + +TEST(ITensorListRefTest, CtorUnboxedIndirect_IsUnboxed) { + auto vec = get_tensor_vector(); + auto check_is_unboxed = [](at::ITensorListRef list) { + EXPECT_TRUE(list.isUnboxed()); + }; + check_is_unboxed(at::ITensorListRef{vec[0]}); + check_is_unboxed(at::ITensorListRef{vec.data(), vec.size()}); + check_is_unboxed(at::ITensorListRef{&*vec.begin(), &*vec.end()}); + check_is_unboxed(vec); + check_is_unboxed({vec[0], vec[1], vec[2]}); +} + +TEST(ITensorListRefTest, CtorTemp_IsUnboxed) { + auto check_is_unboxed = [](at::ITensorListRef list) { + EXPECT_TRUE(list.isUnboxed()); + }; + + auto vec = get_tensor_vector(); + check_is_unboxed({vec[0], vec[1]}); +} + +TEST(ITensorListRefTest, Boxed_GetConstRefTensor) { + auto vec = get_tensor_vector(); + // We need 'boxed' to be 'const' here (and some other tests below) + // because 'List::operator[]' returns a 'ListElementReference' + // instead of returning a 'Tensor'. On the other hand, + // 'List::operator[] const' returns a 'const Tensor &'. + const List boxed(vec); + at::ITensorListRef list(boxed); + static_assert( + std::is_same::value, + "Accessing elements from List through a ITensorListRef should be const references."); + EXPECT_TRUE(boxed[0].is_same(*list.begin())); + EXPECT_TRUE(boxed[1].is_same(*(++list.begin()))); +} + +TEST(ITensorListRefTest, Unboxed_GetConstRefTensor) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + static_assert( + std::is_same::value, + "Accessing elements from ArrayRef through a ITensorListRef should be const references."); + EXPECT_TRUE(vec[0].is_same(*list.begin())); + EXPECT_TRUE(vec[1].is_same(*(++list.begin()))); +} + +TEST(ITensorListRefTest, Boxed_Equal) { + auto vec = get_tensor_vector(); + List boxed(vec); + check_elements_same(boxed, vec, /* use_count= */ 2); +} + +TEST(ITensorListRefTest, Unboxed_Equal) { + auto vec = get_tensor_vector(); + check_elements_same(at::ArrayRef(vec), vec, /* use_count= */ 1); +} + +TEST(ITensorListRefTest, UnboxedIndirect_Equal) { + // The 4 ref-count locations: + // 1. `vec` + // 2. `initializer_list` for `ITensorListRef` + // 3. `initializer_list` for `std::vector` + // 4. 
temporary `std::vector` + auto vec = get_tensor_vector(); + // Implicit constructors + check_elements_same(vec[0], std::vector{vec[0]}, /* use_count= */ 3); + check_elements_same({vec.data(), vec.size()}, vec, /* use_count= */ 1); + check_elements_same({&*vec.begin(), &*vec.end()}, vec, /* use_count= */ 1); + // Vector constructor + check_elements_same(vec, vec, /* use_count= */ 1); + // InitializerList constructor + check_elements_same({vec[0], vec[1], vec[2]}, std::vector{vec[0], vec[1], vec[2]}, /* use_count= */ 4); +} + +TEST(ITensorListRefTest, BoxedMaterialize_Equal) { + auto vec = get_tensor_vector(); + List boxed(vec); + at::ITensorListRef list(boxed); + auto materialized = list.materialize(); + check_elements_same(list, vec, 2); + check_elements_same(list, materialized, 2); + check_elements_same(materialized, vec, 2); +} + +TEST(ITensorListRefTest, UnboxedMaterialize_Equal) { + auto vec = get_tensor_vector(); + at::ArrayRef unboxed(vec); + at::ITensorListRef list(unboxed); + auto materialized = list.materialize(); + check_elements_same(list, vec, 1); + check_elements_same(list, materialized, 1); + check_elements_same(materialized, vec, 1); +} + +TEST(ITensorListRefIteratorTest, CtorEmpty_ThrowsError) { + at::ITensorListRefIterator it; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) + EXPECT_THROW(*it, c10::Error); +} + +TEST(ITensorListRefIteratorTest, Boxed_GetFirstElement) { + auto vec = get_tensor_vector(); + const List boxed(vec); + at::ITensorListRef list(boxed); + EXPECT_TRUE(boxed[0].is_same(*list.begin())); +} + +TEST(ITensorListRefIteratorTest, Unboxed_GetFirstElement) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + EXPECT_TRUE(vec[0].is_same(*list.begin())); +} + +TEST(ITensorListRefIteratorTest, Boxed_Equality) { + auto vec = get_tensor_vector(); + List boxed(vec); + at::ITensorListRef list(boxed); + EXPECT_EQ(list.begin(), list.begin()); + EXPECT_NE(list.begin(), list.end()); + EXPECT_NE(list.end(), list.begin()); + EXPECT_EQ(list.end(), list.end()); +} + +TEST(ITensorListRefIteratorTest, Unboxed_Equality) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + EXPECT_EQ(list.begin(), list.begin()); + EXPECT_NE(list.begin(), list.end()); + EXPECT_NE(list.end(), list.begin()); + EXPECT_EQ(list.end(), list.end()); +} + +TEST(ITensorListRefIteratorTest, Boxed_Iterate) { + auto vec = get_tensor_vector(); + const List boxed(vec); + at::ITensorListRef list(boxed); + size_t i = 0; + for (const auto& t : list) { + EXPECT_TRUE(boxed[i++].is_same(t)); + } + EXPECT_EQ(i, list.size()); +} + +TEST(ITensorListRefIteratorTest, Unboxed_Iterate) { + auto vec = get_tensor_vector(); + at::ITensorListRef list(vec); + size_t i = 0; + for (const auto& t : list) { + EXPECT_TRUE(vec[i++].is_same(t)); + } + EXPECT_EQ(i, list.size()); +} + +TEST(IOptTensorListRefTest, Boxed_Iterate) { + auto vec = get_boxed_opt_tensor_vector(); + const List> boxed(vec); + at::IOptTensorListRef list(boxed); + size_t i = 0; + for (const auto t : list) { + EXPECT_EQ(boxed[i].has_value(), t.has_value()); + if (t.has_value()) { + EXPECT_TRUE((*boxed[i]).is_same(*t)); + } + i++; + } + EXPECT_EQ(i, list.size()); +} + +TEST(IOptTensorListRefTest, Unboxed_Iterate) { + auto vec = get_unboxed_opt_tensor_vector(); + at::ArrayRef unboxed(vec); + at::IOptTensorListRef list(unboxed); + size_t i = 0; + for (const auto t : list) { + EXPECT_EQ(unboxed[i].has_value(), t.has_value()); + if (t.has_value()) { + EXPECT_TRUE((*unboxed[i]).is_same(*t)); + } + i++; + } + 
EXPECT_EQ(i, list.size()); +} diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index b042fab24f7d..0785a6941aff 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -78,6 +78,10 @@ class ListElementReference final { // assigning another ref to this assigns the underlying value ListElementReference& operator=(ListElementReference&& rhs) &&; + const IValue& get() const& { + return *iterator_; + } + friend void swap(ListElementReference&& lhs, ListElementReference&& rhs); private: @@ -235,6 +239,7 @@ class List final { using value_type = T; using size_type = typename c10::detail::ListImpl::list_type::size_type; using iterator = impl::ListIterator; + using const_iterator = impl::ListIterator; using reverse_iterator = impl::ListIterator; /** diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 6b51aa53156f..f9f3d6ff7f83 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -1,14 +1,59 @@ -#include -#include -#include +#include +#include +#include namespace { +// This TLS is used to track the state of the dispatcher to be able to restore +// it when calling back into python. +// It has the following invariant: +// - It must be empty while python code is executed. +// - It should only be set once even for multiple dispatcher calls that do not come +// back to python. +// To achieve this, we ensure that the tls is empty by default and emptied again both when +// we call into user torch_dispatch or returning back to python after this call. + +thread_local c10::optional tls_on_entry; + +c10::impl::LocalDispatchKeySet safe_get_tls_on_entry() { + TORCH_CHECK(tls_on_entry.has_value(), "Accessing torch dispatch state outside of '__torch_dispatch__' " + "is not allowed."); + return tls_on_entry.value(); +} + +// All the keys below the Python key +constexpr c10::DispatchKeySet after_Python_keyset = c10::DispatchKeySet(c10::DispatchKeySet::FULL) ^ + (c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Python) | + c10::DispatchKeySet(c10::DispatchKey::Python)); + + +// This guard assumes that tls_on_entry has a value. 
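Editorial aside: the `after_Python_keyset` expression is plain set algebra on a fixed-size bitset. A minimal sketch with a toy 8-bit key space (illustrative indices, not real `DispatchKey` values or the real `c10::DispatchKeySet` type) makes the source comment concrete: start from the full set and XOR away the reference key together with everything above it, leaving only the keys below it.

#include <bitset>
#include <cassert>
#include <cstddef>

constexpr std::size_t kNumKeys = 8;
using KeySet = std::bitset<kNumKeys>;

// All keys strictly above `key`: the analogue of the FULL_AFTER set as it is
// used in the expression above, per that expression's own comment.
KeySet full_after(std::size_t key) {
  KeySet s;
  for (std::size_t i = key + 1; i < kNumKeys; ++i) s.set(i);
  return s;
}

int main() {
  const KeySet full = KeySet{}.set();  // analogue of DispatchKeySet(FULL)
  const std::size_t python_key = 5;    // pretend this is the Python key's bit

  KeySet python_only;
  python_only.set(python_key);

  // full ^ (everything-above-Python | Python-itself) == everything strictly below Python.
  const KeySet below_python = full ^ (full_after(python_key) | python_only);

  for (std::size_t i = 0; i < kNumKeys; ++i) {
    assert(below_python.test(i) == (i < python_key));
  }
  return 0;
}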
+struct StashTLSOnEntryGuard { +public: + StashTLSOnEntryGuard(): saved_(tls_on_entry.value()) { + tls_on_entry = c10::nullopt; + } + + ~StashTLSOnEntryGuard() { + TORCH_INTERNAL_ASSERT(!tls_on_entry.has_value()); + tls_on_entry = saved_; + } + +private: + c10::impl::LocalDispatchKeySet saved_; +}; + void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - // If Python Mode is active, use its PyInterpreter for dispatch - const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state(); - if (maybe_python_mode_state) { - maybe_python_mode_state->pyinterpreter()->dispatch(op, stack, maybe_python_mode_state); + TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); + // c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value()); + // StashTLSOnEntryGuard stash_guard; + c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset); + + + // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch + const auto& maybe_torch_dispatch_mode_state = at::impl::TorchDispatchModeTLS::get_state(); + if (maybe_torch_dispatch_mode_state) { + maybe_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack, maybe_torch_dispatch_mode_state); return; } @@ -42,8 +87,53 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)"); } +void pythonTLSSnapshotFallback(const c10::OperatorHandle &op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + // It is ok for the tls to be already set here. + // It means that there are multiple calls into the dispatcher not originating from python code. + // The guard below will properly ignore such calls. + at::impl::MaybeSetTLSOnEntryGuard guard; + + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::PythonTLSSnapshot), stack); +} + + } // anonymous namespace +namespace at { +namespace impl { + +RestorePythonTLSSnapshot::RestorePythonTLSSnapshot() : saved_(safe_get_tls_on_entry()), guard_(safe_get_tls_on_entry()) { + tls_on_entry = c10::nullopt; +} + +RestorePythonTLSSnapshot::~RestorePythonTLSSnapshot() { + TORCH_INTERNAL_ASSERT(!tls_on_entry.has_value()); + tls_on_entry = saved_; +} + +MaybeSetTLSOnEntryGuard::MaybeSetTLSOnEntryGuard() { + if (tls_on_entry.has_value()) { + value_set_ = false; + } else { + value_set_ = true; + tls_on_entry = c10::impl::tls_local_dispatch_key_set(); + } +} +MaybeSetTLSOnEntryGuard::~MaybeSetTLSOnEntryGuard() { + if (value_set_) { + TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); + tls_on_entry = c10::nullopt; + } +} + + +} // namespace impl +} // namespace at + TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); } + +TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) { + m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>()); +} diff --git a/aten/src/ATen/core/PythonFallbackKernel.h b/aten/src/ATen/core/PythonFallbackKernel.h new file mode 100644 index 000000000000..94cd4e81291a --- /dev/null +++ b/aten/src/ATen/core/PythonFallbackKernel.h @@ -0,0 +1,28 @@ +#pragma once + + +namespace at { +namespace impl { + +struct TORCH_API RestorePythonTLSSnapshot { + RestorePythonTLSSnapshot(); + ~RestorePythonTLSSnapshot(); + +private: + c10::impl::LocalDispatchKeySet saved_; + c10::impl::ForceDispatchKeyGuard guard_; +}; + + +// RAII guard to make working with the above TLS safer. 
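Editorial aside: the guards in this area share one RAII shape: capture the thread-local state in the constructor, clear or set it for the duration of the scope, and restore it in the destructor so the stated invariant (the TLS is empty while Python code runs) holds even on early returns or exceptions. A generic, standalone sketch of that shape (a toy `int` snapshot, not the PyTorch guards themselves):

#include <cassert>
#include <optional>

// Stand-in for the thread-local dispatcher snapshot.
thread_local std::optional<int> tls_snapshot;

// Stash-and-clear guard: saves the current TLS value, empties it for the
// duration of the scope, and restores it afterwards.
class StashGuard {
 public:
  StashGuard() : saved_(tls_snapshot) { tls_snapshot.reset(); }
  ~StashGuard() { tls_snapshot = saved_; }
  StashGuard(const StashGuard&) = delete;
  StashGuard& operator=(const StashGuard&) = delete;

 private:
  std::optional<int> saved_;
};

// Maybe-set guard: only takes ownership of the TLS slot if nobody set it yet,
// and only clears it again if it was the one that set it.
class MaybeSetGuard {
 public:
  explicit MaybeSetGuard(int value) {
    if (!tls_snapshot.has_value()) {
      tls_snapshot = value;
      owns_ = true;
    }
  }
  ~MaybeSetGuard() {
    if (owns_) tls_snapshot.reset();
  }

 private:
  bool owns_ = false;
};

int main() {
  {
    MaybeSetGuard outer(7);    // sets the TLS: it was empty
    assert(tls_snapshot == 7);
    {
      MaybeSetGuard inner(9);  // does nothing: the TLS is already set
      assert(tls_snapshot == 7);
      StashGuard stash;        // empties the TLS for this scope
      assert(!tls_snapshot.has_value());
    }                          // stash restores 7; inner leaves it alone
    assert(tls_snapshot == 7);
  }                            // outer clears the TLS again
  assert(!tls_snapshot.has_value());
  return 0;
}

Nesting is what makes the "only set once, even across multiple dispatcher calls" property work: inner guards detect that the slot is already populated and become no-ops.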
+struct TORCH_API MaybeSetTLSOnEntryGuard { +public: + MaybeSetTLSOnEntryGuard(); + ~MaybeSetTLSOnEntryGuard(); + +private: + bool value_set_; +}; + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/core/PythonModeTLS.cpp b/aten/src/ATen/core/PythonModeTLS.cpp deleted file mode 100644 index dd4b44bc5fed..000000000000 --- a/aten/src/ATen/core/PythonModeTLS.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include - -namespace at { namespace impl { - -thread_local std::shared_ptr pythonModeState; - -void PythonModeTLS::set_state(const std::shared_ptr& state) { - pythonModeState = state; - if (state) { - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); - } else { - PythonModeTLS::reset_state(); - } -} - -const std::shared_ptr& PythonModeTLS::get_state() { - return pythonModeState; -} - -void PythonModeTLS::reset_state() { - pythonModeState.reset((TorchDispatchTypeObject*)nullptr); - c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); -} - -} // namespace impl -} // namespace at diff --git a/aten/src/ATen/core/PythonModeTLS.h b/aten/src/ATen/core/PythonModeTLS.h deleted file mode 100644 index be52b182c659..000000000000 --- a/aten/src/ATen/core/PythonModeTLS.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace at { -namespace impl { - -struct TORCH_API PythonModeTLS { - static void set_state(const std::shared_ptr& state); - static const std::shared_ptr& get_state(); - static void reset_state(); -}; - -} // namespace impl -} // namespace at diff --git a/aten/src/ATen/core/QuantizerBase.h b/aten/src/ATen/core/QuantizerBase.h index e11d8d6e049c..922ea8a38f50 100644 --- a/aten/src/ATen/core/QuantizerBase.h +++ b/aten/src/ATen/core/QuantizerBase.h @@ -55,7 +55,7 @@ struct TORCH_API Quantizer : public c10::intrusive_ptr_target { */ virtual QScheme qscheme() const = 0; - ScalarType scalar_type() { + ScalarType scalar_type() const { return scalar_type_; } @@ -77,7 +77,7 @@ struct TORCH_API Quantizer : public c10::intrusive_ptr_target { /** * Compare against `other` for equality. 
*/ - virtual bool equalTo(QuantizerPtr other) = 0; + virtual bool equalTo(QuantizerPtr other) const = 0; }; } // namespace at diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 9f56923c1cdb..fa175165d2e1 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -4,6 +4,15 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#endif + #include namespace at { @@ -29,6 +38,18 @@ const TensorBase& TensorBase::zero_() const { return *this; } +TensorBase TensorBase::to( + at::TensorOptions options, + bool non_blocking, + bool copy, + c10::optional memory_format) const { + Tensor self(*this); + return at::_ops::to_dtype_layout::call( + self, optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), options.device_opt(), + options.pinned_memory_opt(), non_blocking, copy, memory_format); +} + void TensorBase::enforce_invariants() { if (impl_.get() == nullptr) { throw std::runtime_error("TensorImpl with nullptr is not supported"); diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 9d65522b5d96..9c60f84a16b3 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index b05f74259dc2..37c1ed895782 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -17,6 +17,7 @@ #include #include +#include #include namespace c10 { @@ -43,7 +44,6 @@ inline bool variable_excluded_from_dispatch() { // Please read the comment in `VariableFallbackKernel.cpp` about the background of this change. return true; #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::impl::tls_local_dispatch_key_set().excluded_.has(DispatchKey::Autograd)); return c10::impl::tls_local_dispatch_key_set().excluded_.isSupersetOf(c10::autograd_dispatch_keyset); #endif } @@ -142,6 +142,8 @@ class TORCH_API TensorBase { const TensorBase& fill_(const c10::Scalar& scalar) const; const TensorBase& zero_() const; + TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const; + bool is_complex() const { return at::isComplexType(this->scalar_type()); } @@ -155,15 +157,17 @@ class TORCH_API TensorBase { } int64_t size(int64_t dim) const { + const auto sizes = this->sizes(); + const auto ndim = static_cast(sizes.size()); // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) - dim = c10::maybe_wrap_dim(dim, this->dim(), false); - return sizes()[dim]; + return sizes[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)]; } int64_t stride(int64_t dim) const { + const auto strides = this->strides(); + const auto ndim = static_cast(strides.size()); // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) - dim = c10::maybe_wrap_dim(dim, this->dim(), false); - return strides()[dim]; + return strides[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)]; } TensorImpl * unsafeGetTensorImpl() const { @@ -216,6 +220,9 @@ class TORCH_API TensorBase { IntArrayRef sizes() const { return impl_->sizes(); } + c10::SymIntArrayRef sym_sizes() const { + return c10::SymIntArrayRef(reinterpret_cast(sizes().data()), sizes().size()); + } IntArrayRef strides() const { return impl_->strides(); } @@ -243,7 +250,7 @@ class TORCH_API TensorBase { bool 
channels_last_strides_exact_match = false) const { // Setting channels_last_strides_exact_match to true forces function to // check 0,1 - sized dimension strides. - if (!is_mkldnn() && !is_sparse()) { + if (layout() == at::kStrided) { if (impl_->is_strides_like_channels_last()) { if (!channels_last_strides_exact_match || get_channels_last_strides_2d(sizes()) == strides()) { @@ -369,6 +376,12 @@ class TORCH_API TensorBase { return impl_->is_cuda(); } + /// Returns if a `Tensor` has IPU backend. + bool is_ipu() const { + // NB: this is not a native function to avoid dispatching overhead. + return impl_->is_ipu(); + } + /// Returns if a `Tensor` has XPU backend. bool is_xpu() const { // NB: this is not a native function to avoid dispatching overhead. @@ -420,10 +433,10 @@ class TORCH_API TensorBase { return impl_->is_mkldnn(); } - /// Returns if a `Tensor` is mlc tensor. - bool is_mlc() const { + /// Returns if a `Tensor` is mps tensor. + bool is_mps() const { // NB: this is not a native function to avoid dispatching overhead. - return impl_->is_mlc(); + return impl_->is_mps(); } /// Returns if a `Tensor` is ort tensor. @@ -461,6 +474,11 @@ class TORCH_API TensorBase { return impl_->is_inference(); } + // Returns if a `Tensor` is a NestedTensor. + bool is_nested() const { + return impl_->is_nested(); + } + /// If a tensor is a quantized tensor, returns its quantizer /// TODO: it's not in native_functions.yaml yet as it's not exposed to python QuantizerPtr quantizer() const; @@ -865,7 +883,7 @@ struct MaybeOwnedTraits { return &borrow; } - static bool debugBorrowIsValid(const borrow_type& borrow) { + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { return true; } }; diff --git a/aten/src/ATen/core/TorchDispatchModeTLS.cpp b/aten/src/ATen/core/TorchDispatchModeTLS.cpp new file mode 100644 index 000000000000..6c35890eb8b6 --- /dev/null +++ b/aten/src/ATen/core/TorchDispatchModeTLS.cpp @@ -0,0 +1,29 @@ +#include +#include + +namespace at { namespace impl { + +thread_local std::shared_ptr torchDispatchModeState; + +void TorchDispatchModeTLS::set_state(std::shared_ptr state) { + if (state) { + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, true); + } else { + TorchDispatchModeTLS::reset_state(); + } + torchDispatchModeState = std::move(state); +} + +const std::shared_ptr& TorchDispatchModeTLS::get_state() { + return torchDispatchModeState; +} + +void TorchDispatchModeTLS::reset_state() { + torchDispatchModeState.reset(); + c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, false); +} + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/core/TorchDispatchModeTLS.h b/aten/src/ATen/core/TorchDispatchModeTLS.h new file mode 100644 index 000000000000..adbf30844382 --- /dev/null +++ b/aten/src/ATen/core/TorchDispatchModeTLS.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace impl { + +struct TORCH_API TorchDispatchModeTLS { + static void set_state(std::shared_ptr state); + static const std::shared_ptr& get_state(); + static void reset_state(); +}; + +} // namespace impl +} // namespace at diff --git a/aten/src/ATen/core/VariableFallbackKernel.cpp b/aten/src/ATen/core/VariableFallbackKernel.cpp index ed67c4754fed..ebc54d8e7cba 100644 --- a/aten/src/ATen/core/VariableFallbackKernel.cpp +++ b/aten/src/ATen/core/VariableFallbackKernel.cpp @@ -56,7 
+56,7 @@ TORCH_LIBRARY_IMPL(_, AutogradLazy, m) { m.fallback(torch::CppFunction::makeFallthrough()); } -TORCH_LIBRARY_IMPL(_, AutogradMLC, m) { +TORCH_LIBRARY_IMPL(_, AutogradMPS, m) { m.fallback(torch::CppFunction::makeFallthrough()); } diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index c9611475255b..01537c2dc471 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -157,6 +157,7 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) static_assert(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); #if !defined(C10_MOBILE) + (void)func_ptr; // Suppress unused variable warning return makeFromUnboxedFunctor::type>( guts::make_unique_base::type>() ); diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index d9b14623dc54..4bbc5dd69dcf 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -200,7 +200,7 @@ struct BoxedKernelWrapper< // 3. in-place ops take a single non-const Tensor reference // as their first argument, and return it. // -// Note: all signatures matching this pattern are are assumed to be for such ops. +// Note: all signatures matching this pattern are assumed to be for such ops. // Because of this, the generated BoxedKernelWrapper specializations simply // return the in-place argument. // @@ -260,7 +260,7 @@ struct BoxedKernelWrapper< // 4. out of place ops that take a single non-const Tensor reference as their // final argument, and also return it. // -// Note: all signatures matching this pattern are are assumed to be for such ops. +// Note: all signatures matching this pattern are assumed to be for such ops. // This assumption permits the generated BoxedKernelWrapper specializations to simply // return out arguments. // @@ -300,7 +300,7 @@ struct BoxedKernelWrapper< // 5. out of place ops that take multiple non-const Tensor references as their // final arguments, and return them in a std::tuple. // -// Note: all signatures matching this pattern are are assumed to be for such ops. +// Note: all signatures matching this pattern are assumed to be for such ops. // This assumption permits the generated BoxedKernelWrapper specializations to simply // return the out arguments. // diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index f48246c02fd6..2b2228bb944d 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -180,6 +181,13 @@ namespace impl { "You tried to register a kernel with an unsupported input type: ArrayRef. Please use List, List or Tensor instead."); }; + template + struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert(!std::is_same::value, + "You tried to register a kernel with an unsupported input type: OptionalArrayRef. 
Please use List, List or Tensor instead."); + }; + template struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type { @@ -233,6 +241,10 @@ namespace impl { struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type {}; + template + struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type {}; + template struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type { @@ -358,16 +370,33 @@ namespace impl { return ivalue_to_arg, AllowDeprecatedTypes>::call(v); } }; + template + struct ivalue_to_arg final { + static std::vector call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } + }; template struct ivalue_to_arg>, AllowDeprecatedTypes> final { // If an argument is optional>, convert the IValue to an optional> and pass that - // to the operator. OptionalArray is basically a optional> but impliticly convertible + // to the operator. OptionalArray is basically a optional> but implicitly convertible // to optional>. static OptionalArray call(IValue& v) { return ivalue_to_arg, AllowDeprecatedTypes>::call(v); } }; + template + struct ivalue_to_arg, AllowDeprecatedTypes> final { + // If an argument is OptionalArrayRef, convert the IValue to an + // optional> and pass that to the operator. OptionalArray + // is basically a optional> but implicitly convertible to + // OptionalArrayRef + static OptionalArray call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } + }; + // return_to_ivalue template struct return_to_ivalue final {}; diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 3c6fd0c77cad..6f1e9e75ea3e 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -62,7 +62,7 @@ struct BuiltinOpFunction : public Function { return *this; } - bool call(Stack& stack, size_t, c10::function_ref) override { + bool call(Stack& stack, c10::optional, c10::function_ref) override { run(stack); return false; } diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index 3a019708cdda..67507c89bf1b 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -391,6 +391,7 @@ struct TORCH_API ClassType : public NamedType { std::vector unresolved_class_attributes = {}); std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/custom_class.cpp b/aten/src/ATen/core/custom_class.cpp index f61766c0cef3..2bba7e6df62f 100644 --- a/aten/src/ATen/core/custom_class.cpp +++ b/aten/src/ATen/core/custom_class.cpp @@ -4,22 +4,38 @@ #include #include #include +#include #include #include namespace c10 { -ska::flat_hash_map& getCustomClassTypeMap() { +static ska::flat_hash_map& getCustomClassTypeMap() { static ska::flat_hash_map tmap; return tmap; } -std::unordered_map>& -getClassConverter() { - static std::unordered_map> - classConverter; - return classConverter; +c10::ClassTypePtr getCustomClassTypeImpl(const std::type_index &tindex) { + auto& tmap = c10::getCustomClassTypeMap(); + auto res = tmap.find(tindex); + if (C10_UNLIKELY(res == tmap.end())) { + // type_index is not guaranteed to be unique across shared libraries on some platforms + // For example see https://github.com/llvm-mirror/libcxx/blob/78d6a7767ed57b50122a161b91f59f19c9bd0d19/include/typeinfo#L133 + // 
Also, this is not the case if RTLD_LOCAL option is used, see + // https://github.com/pybind/pybind11/blob/f791dc8648e1f6ec33f402d679b6b116a76d4e1b/include/pybind11/detail/internals.h#L101-L106 + // Take a slow path of iterating over all registered types and compare their names + auto class_name = std::string(tindex.name()); + for(const auto &it: tmap) { + if (class_name == it.first.name()) { + // Do not modify existing type map here as this template is supposed to be called only once per type + // from getCustomClassTypeImpl() + return it.second; + } + } + TORCH_CHECK(false, "Can't find class id in custom class type map for ", tindex.name()); + } + return res->second; } } // namespace c10 @@ -29,7 +45,7 @@ namespace torch { namespace detail { void record_custom_class(std::string name) { - RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::CUSTOM_CLASS, name, {}); + RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::CUSTOM_CLASS, name, c10::ArrayRef{}); } } // namespace detail diff --git a/aten/src/ATen/core/custom_class.h b/aten/src/ATen/core/custom_class.h index 54d7bfecd762..ff9bda981b29 100644 --- a/aten/src/ATen/core/custom_class.h +++ b/aten/src/ATen/core/custom_class.h @@ -2,45 +2,17 @@ #include #include -#include #include #include #include -#include -#include namespace c10 { struct ClassType; using ClassTypePtr = std::shared_ptr; -TORCH_API ska::flat_hash_map& -getCustomClassTypeMap(); - -template -c10::ClassTypePtr getCustomClassTypeImpl() { - auto& tmap = c10::getCustomClassTypeMap(); - auto tindex = std::type_index(typeid(T)); - auto res = tmap.find(tindex); - if (C10_UNLIKELY(res == tmap.end())) { - // type_index is not guaranteed to be unique across shared libraries on some platforms - // For example see https://github.com/llvm-mirror/libcxx/blob/78d6a7767ed57b50122a161b91f59f19c9bd0d19/include/typeinfo#L133 - // Also, this is not the case if RTLD_LOCAL option is used, see - // https://github.com/pybind/pybind11/blob/f791dc8648e1f6ec33f402d679b6b116a76d4e1b/include/pybind11/detail/internals.h#L101-L106 - // Take a slow path of iterating over all registered types and compare their names - auto class_name = std::string(tindex.name()); - for(const auto &it: tmap) { - if (class_name == it.first.name()) { - // Do not modify existing type map here as this template is supposed to be called only once per type - // from getCustomClassTypeImpl() - return it.second; - } - } - TORCH_CHECK(false, "Can't find class id in custom class type map for ", tindex.name()); - } - return res->second; -} +TORCH_API c10::ClassTypePtr getCustomClassTypeImpl(const std::type_index &tindex); template const c10::ClassTypePtr& getCustomClassType() { @@ -48,10 +20,9 @@ const c10::ClassTypePtr& getCustomClassType() { // hash lookup can be a hot path, so just cache. // For the same reason, it's fine If this ends up getting duplicated across // DSO boundaries for whatever reason. 
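Editorial aside: the pattern here is a registry keyed by `std::type_index`, a per-type cache in a function-local static, and a slower by-name fallback for the case where `type_index` values differ across shared libraries. A hedged, self-contained sketch of that pattern (a toy string registry, not the c10 custom-class map):

#include <cassert>
#include <cstring>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>

// Toy registry mapping C++ types to a registered name.
static std::unordered_map<std::type_index, std::string>& registry() {
  static std::unordered_map<std::type_index, std::string> map;
  return map;
}

// Slow path: exact type_index lookup, then a by-name scan as a fallback,
// mirroring the "type_index is not unique across DSOs" caveat above.
static const std::string& lookup(const std::type_index& tindex) {
  auto& map = registry();
  auto it = map.find(tindex);
  if (it != map.end()) return it->second;
  for (auto& entry : map) {
    if (std::strcmp(entry.first.name(), tindex.name()) == 0) {
      return entry.second;
    }
  }
  static const std::string missing = "<unregistered>";
  return missing;
}

// Fast path: each instantiation resolves the lookup once and caches the result
// in a function-local static, so repeated calls skip the hash lookup entirely.
template <typename T>
const std::string& registered_name() {
  static const std::string& cached = lookup(std::type_index(typeid(T)));
  return cached;
}

struct MyClass {};

int main() {
  registry().emplace(std::type_index(typeid(MyClass)), "my_namespace::MyClass");
  assert(registered_name<MyClass>() == "my_namespace::MyClass");
  assert(&registered_name<MyClass>() == &registered_name<MyClass>());  // cached once
  return 0;
}

Moving the slow path out of the header (as the diff does) keeps the template thin: only the cheap cached call is instantiated per type, and duplicated caches across DSOs stay harmless because they all resolve to the same registry entry.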
- static c10::ClassTypePtr cache = getCustomClassTypeImpl(); + static c10::ClassTypePtr cache = getCustomClassTypeImpl( + std::type_index(typeid(T))); return cache; } -TORCH_API std::unordered_map>& -getClassConverter(); } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index a930edc2db63..9180d0d19e64 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -6,11 +6,52 @@ namespace c10 { void DispatchKeyExtractor::setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough) { + // (1) update nonFallthroughKeys_ if (has_fallthrough) { nonFallthroughKeys_ = nonFallthroughKeys_.remove(k); } else { nonFallthroughKeys_ = nonFallthroughKeys_.add(k); } + // (2) update nonFallthroughKeysPerBackend_ + if (isPerBackendFunctionalityKey(toFunctionalityKey(k))) { + // This is a per-backend functionality key. + // We need to figure out what the current backend is, + // and only update the bitset for that backend. + // subtracting 1 because the first backend should have index 0 (CPU), + // But the enum starts with BackendComponent::InvalidBit. + auto backend_idx = static_cast(toBackendComponent(k)) - 1; + TORCH_INTERNAL_ASSERT(backend_idx >= 0 && static_cast(backend_idx) < nonFallthroughKeysPerBackend_.size()); + if (has_fallthrough) { + nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].remove(k); + } else { + nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].add(k); + } + + // Set requiresBitsetPerBackend_ accordingly + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size() - 1)) { + if (nonFallthroughKeysPerBackend_[i] != nonFallthroughKeysPerBackend_[i+1]) { + requiresBitsetPerBackend_ = true; + return; + } + } + requiresBitsetPerBackend_ = false; + return; + } else { + // Otherwise, if a fallthrough is set for a functionality that isn't per backend, + // Then we update the fallthrough bitset for EVERY backend. + // TODO: we could probably optimize this by only lazily updating these values + // the first time that we see requiresBitsetPerBackend_ = true + // (which should almost never happen) + if (has_fallthrough) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].remove(k); + } + } else { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].add(k); + } + } + } } std::string DispatchKeyExtractor::dumpState() const { diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 4d2e7d0d4bdc..d5345b28e714 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -74,7 +74,7 @@ namespace detail { } } } - void operator()(at::ArrayRef> xs) { + void operator()(at::ArrayRef>) { // Just checking that the handling of Tensor?[] didn't change. 
TORCH_INTERNAL_ASSERT(false); } @@ -89,7 +89,7 @@ namespace detail { } } template - void operator()(const T& x) { + void operator()(const T&) { // do nothing } }; @@ -156,14 +156,24 @@ struct TORCH_API DispatchKeyExtractor final { } }); // Keys that are fallthrough should be skipped - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + if (requiresBitsetPerBackend_) { + auto backend_idx = ks.getBackendIndex(); + return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } } template DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const { auto ks = detail::multi_dispatch_key_set(args...); // Keys that are fallthrough should be skipped - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + if (requiresBitsetPerBackend_) { + auto backend_idx = ks.getBackendIndex(); + return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } } void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough); @@ -193,7 +203,12 @@ struct TORCH_API DispatchKeyExtractor final { explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse) - , nonFallthroughKeys_(DispatchKeySet::FULL) {} + , nonFallthroughKeys_(DispatchKeySet::FULL) + , requiresBitsetPerBackend_(false) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; + } + } // this is a bitset that has ones for each argument index which has to be // considered for dispatch. This avoids having to iterate over the stack @@ -205,8 +220,14 @@ struct TORCH_API DispatchKeyExtractor final { // fallthrough c10::utils::bitset dispatch_arg_indices_reverse_; - // Set of keys for which the operator does NOT have fallthrough kernel. + // Set of functionality keys for which the operator does NOT have fallthrough kernel. DispatchKeySet nonFallthroughKeys_; + // Set of functionality keys for which the operator does NOT have fallthrough kernel, defined PER BACKEND. + // This is only needed if we know that the operator has a different set of fallthroughs defined for some backends. 
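Editorial aside: the per-backend fallthrough bookkeeping boils down to three pieces: one shared non-fallthrough mask for the common case, a per-backend copy of that mask, and a flag that flips only when some backend's copy diverges so dispatch takes the slower per-backend path. A toy sketch of that bookkeeping (the backend count and mask width are illustrative, not the real values):

#include <array>
#include <bitset>
#include <cassert>
#include <cstddef>

constexpr std::size_t kNumBackends = 4;  // illustrative
constexpr std::size_t kNumKeys = 16;     // illustrative functionality-key count
using KeyMask = std::bitset<kNumKeys>;

struct FallthroughState {
  KeyMask shared = KeyMask{}.set();                 // fast path: one mask for all backends
  std::array<KeyMask, kNumBackends> per_backend{};  // slow path: one mask per backend
  bool per_backend_differs = false;

  FallthroughState() { per_backend.fill(KeyMask{}.set()); }

  // Register a fallthrough for `key`, either for one backend or for all of them.
  void set_fallthrough(std::size_t key, int backend /* -1 == every backend */) {
    if (backend >= 0) {
      per_backend[static_cast<std::size_t>(backend)].reset(key);
      // Recompute the flag: any divergence forces the per-backend path.
      per_backend_differs = false;
      for (std::size_t i = 1; i < kNumBackends; ++i) {
        if (per_backend[i] != per_backend[0]) { per_backend_differs = true; break; }
      }
    } else {
      shared.reset(key);
      for (auto& mask : per_backend) mask.reset(key);
    }
  }

  // Dispatch-time mask selection mirrors the branch added to the extractor above.
  const KeyMask& mask_for(std::size_t backend) const {
    return per_backend_differs ? per_backend[backend] : shared;
  }
};

int main() {
  FallthroughState s;
  s.set_fallthrough(/*key=*/3, /*backend=*/-1);  // functionality-wide fallthrough
  assert(!s.per_backend_differs);
  assert(!s.mask_for(2).test(3));

  s.set_fallthrough(/*key=*/7, /*backend=*/1);   // backend-specific fallthrough
  assert(s.per_backend_differs);
  assert(!s.mask_for(1).test(7));
  assert(s.mask_for(0).test(7));                 // other backends still dispatch key 7
  return 0;
}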
+ std::array nonFallthroughKeysPerBackend_; + // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast path), + // or if we need to fall back to the slower path and check nonFallthroughKeysPerBackend_ + bool requiresBitsetPerBackend_; }; } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 3dccc4645a82..66be5a187027 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -267,14 +267,16 @@ void Dispatcher::cleanup(const OperatorHandle& op, const OperatorName& op_name) RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, KernelFunction kernel, std::string debug) { std::lock_guard lock(mutex_); + auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); + TORCH_CHECK(idx >= 0 && static_cast(idx) < backendFallbackKernels_.size(), "idx=", idx); TORCH_CHECK( - !backendFallbackKernels_[static_cast(dispatchKey)].kernel.isValid(), + !backendFallbackKernels_[idx].kernel.isValid(), "Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ", - backendFallbackKernels_[static_cast(dispatchKey)].debug, ", new registration ", debug + backendFallbackKernels_[idx].debug, ", new registration ", debug ); // NB: inferred function schema is always nullptr for fallbacks, as fallbacks // cannot be unobxed - backendFallbackKernels_[static_cast(dispatchKey)] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); + backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); @@ -288,7 +290,8 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker void Dispatcher::deregisterFallback_(DispatchKey dispatchKey) { std::lock_guard lock(mutex_); - backendFallbackKernels_[static_cast(dispatchKey)] = {}; + auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); + backendFallbackKernels_[idx] = {}; for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); @@ -353,18 +356,18 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK return seq_num; } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, const torch::jit::Stack &stack) { - guard.before(op, stack, sequenceNumberForRunningRecordFunction(dispatchKey)); +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, const torch::jit::Stack &stack) { + guard.before(schema_ref, c10::ArrayRef(stack.data(), stack.size()), sequenceNumberForRunningRecordFunction(dispatchKey)); } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, torch::jit::Stack &&stack) { - guard.before(op, std::move(stack), sequenceNumberForRunningRecordFunction(dispatchKey)); +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, torch::jit::Stack &&stack) { + guard.before(schema_ref, c10::ArrayRef(stack.data(), stack.size()), sequenceNumberForRunningRecordFunction(dispatchKey)); } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey) { +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey) { // 
Setting sequence number in the Autograd case to associate // the forward range with the coresponding Autograd's node - guard.before(op, sequenceNumberForRunningRecordFunction(dispatchKey)); + guard.before(schema_ref, sequenceNumberForRunningRecordFunction(dispatchKey)); } } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 14ffa2f94c9c..c52e7822ec5c 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -152,7 +152,7 @@ class TORCH_API Dispatcher final { template - static Return callWithDispatchKeySlowPath(const TypedOperatorHandle& op, bool pre_sampled, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args); + static Return callWithDispatchKeySlowPath(const TypedOperatorHandle& op, at::StepCallbacks& stepCallbacks, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args); // Like call, but intended for use in a redispatch in kernels that have explicitly performed the DispatchKey update calculatulation. // This will take the DispatchKeySet completely as is and dispatch to the kernel of the corresponding highest priority key in the set. @@ -263,9 +263,9 @@ class TORCH_API Dispatcher final { Dispatcher(); static int64_t sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, torch::jit::Stack &&stack); - static void runRecordFunction(at::RecordFunction& guard, const OperatorHandle& op, DispatchKey dispatchKey, const torch::jit::Stack &stack); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, torch::jit::Stack &&stack); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, const torch::jit::Stack &stack); OperatorHandle findOrRegisterSchema_(FunctionSchema&& schema); OperatorHandle findOrRegisterName_(const OperatorName& op_name); @@ -291,7 +291,7 @@ class TORCH_API Dispatcher final { // Map from namespace to debug string (saying, e.g., where the library was defined) ska::flat_hash_map libraries_; - std::array(DispatchKey::NumDispatchKeys)> backendFallbackKernels_; + std::array backendFallbackKernels_; std::unique_ptr listeners_; std::mutex mutex_; @@ -494,33 +494,28 @@ struct CaptureKernelCall { // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && template -inline Return Dispatcher::callWithDispatchKeySlowPath(const TypedOperatorHandle& op, bool pre_sampled, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... 
args) { - // Check if we need to run callbacks registered with RecordFunction - // If true and callbacks need inputs, we box the arguments and pass - // them into the callbacks and also into the kernel call - - // Note: for perf reasons we wouldn't want to pass arguments into - // the function call or prematurely box them - at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); - if (op.operatorDef_->op.isObserved()) { - if (guard.needsInputs()) { - runRecordFunction(guard, op, dispatchKey, impl::boxArgs(args...)); - } else { - runRecordFunction(guard, op, dispatchKey); - } - if (C10_UNLIKELY(guard.needsOutputs())) { - // Calls the kernel and capture the output temporarily to pass to - // RecordFunction. - detail::CaptureKernelCall captureKernelCall( - kernel, op, dispatchKeySet, std::forward(args)...); - guard.setOutputs(captureKernelCall.getOutputs()); - // Releases the captured output to return to caller. - return std::move(captureKernelCall).release(); - } - } +inline Return Dispatcher::callWithDispatchKeySlowPath(const TypedOperatorHandle& op, at::StepCallbacks& stepCallbacks, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args) { + // If callbacks need inputs, we box the arguments and pass them to the guard. + // Note: For perf reasons we wouldn't want to prematurely box the arguments. + at::RecordFunction guard(std::move(stepCallbacks)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.operatorDef_->op.isObserved()); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + guard.needsInputs() + ? runRecordFunction(guard, schema_ref, dispatchKey, impl::boxArgs(args...)) + : runRecordFunction(guard, schema_ref, dispatchKey); + + if (C10_UNLIKELY(guard.needsOutputs())) { + // Calls the kernel and capture the output temporarily to pass to + // RecordFunction. + detail::CaptureKernelCall captureKernelCall( + kernel, op, dispatchKeySet, std::forward(args)...); + guard.setOutputs(captureKernelCall.getOutputs()); + // Releases the captured output to return to caller. 
+ return std::move(captureKernelCall).release(); } + // keeping the guard alive while executing the kernel return kernel.template call(op, dispatchKeySet, std::forward(args)...); } @@ -531,18 +526,11 @@ C10_DISPATCHER_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorH detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor() .template getDispatchKeySetUnboxed(args...); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKeySet.highestPriorityTypeId())); - const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet.highestPriorityTypeId()); + const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - // By default, when there're no high-frequency or non-sampled callbacks, - // RecordFunction is pre-sampled as a perf optimization; - // shouldRunRecordFunction checks whether RecordFunction should be executed, - // and sets pre_sampled boolean argument value to whether pre-sampling was used - - // this boolean is passed into RecordFunction to adjust the sampling rates of - // the callbacks - bool pre_sampled = false; - if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { - return callWithDispatchKeySlowPath(op, pre_sampled, dispatchKeySet, kernel, std::forward(args)...); + auto step_callbacks = at::getStepCallbacks(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(!step_callbacks.empty() && op.operatorDef_->op.isObserved())) { + return callWithDispatchKeySlowPath(op, step_callbacks, dispatchKeySet, kernel, std::forward(args)...); } #endif // PYTORCH_DISABLE_PER_OP_PROFILING return kernel.template call(op, dispatchKeySet, std::forward(args)...); @@ -553,7 +541,7 @@ template inline Return Dispatcher::redispatch(const TypedOperatorHandle& op, DispatchKeySet currentDispatchKeySet, Args... args) const { detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 // do not use RecordFunction on redispatch - const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet.highestPriorityTypeId()); + const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet); return kernel.template call(op, currentDispatchKeySet, std::forward(args)...); } @@ -561,27 +549,21 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const // note: this doesn't need the mutex because write operations on the list keep iterators intact. 
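The new gate in Dispatcher::call replaces pre-sampling: fetch the step callbacks once, and only enter the profiled path when the list is non-empty and the operator is observed, so an idle profiler costs one fetch plus one unlikely branch. A sketch of that gate shape with placeholder collectRegisteredCallbacks/operatorIsObserved stand-ins (not the ATen symbols):

#include <functional>
#include <vector>

// Stand-ins for at::StepCallbacks and the observed-op check; illustration only.
struct StepCallbacks {
  std::vector<std::function<void(const char*)>> starts;
  bool empty() const { return starts.empty(); }
};
StepCallbacks collectRegisteredCallbacks() { return {}; }  // placeholder
bool operatorIsObserved() { return true; }                 // placeholder

int kernel(int x) { return x * 2; }

int callLikeDispatch(int x) {
  auto step_callbacks = collectRegisteredCallbacks();
  // With nothing registered, no guard object is constructed at all.
  if (!step_callbacks.empty() && operatorIsObserved()) {
    // The real code moves the callbacks into a RecordFunction guard that stays
    // alive across the kernel call; here we just invoke them before the kernel.
    for (auto& cb : step_callbacks.starts) cb("my::op");
    return kernel(x);
  }
  return kernel(x);
}

int main() { return callLikeDispatch(21) == 42 ? 0 : 1; }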
const auto& entry = op.operatorDef_->op; auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); - const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); + const auto& kernel = entry.lookup(dispatchKeySet); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - bool pre_sampled = false; - if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { - // using already existing stack to record function execution in observers - at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); - if (entry.isObserved()) { - if (guard.needsInputs()) { - runRecordFunction(guard, op, dispatchKey, *stack); - } else { - runRecordFunction(guard, op, dispatchKey); - } - } - } + auto step_callbacks = at::getStepCallbacks(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(!step_callbacks.empty() && entry.isObserved())) { + at::RecordFunction guard(std::move(step_callbacks)); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + guard.needsInputs() ? runRecordFunction(guard, schema_ref, dispatchKey, *stack) + : runRecordFunction(guard, schema_ref, dispatchKey); + // keeping the guard alive while executing the kernel kernel.callBoxed(op, dispatchKeySet, stack); - // track outputs - if (C10_UNLIKELY( - guard.isActive() && entry.isObserved() && guard.needsOutputs())) { + + if (C10_UNLIKELY(guard.needsOutputs())) { guard.setOutputs(*stack); } return; @@ -593,7 +575,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const { // note: this doesn't need the mutex because write operations on the list keep iterators intact. const auto& entry = op.operatorDef_->op; - const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); + const auto& kernel = entry.lookup(dispatchKeySet); return kernel.callBoxed(op, dispatchKeySet, stack); } diff --git a/aten/src/ATen/core/dispatch/ObservedOperators.cpp b/aten/src/ATen/core/dispatch/ObservedOperators.cpp index 1d1ed4c1926a..65545a221f9c 100644 --- a/aten/src/ATen/core/dispatch/ObservedOperators.cpp +++ b/aten/src/ATen/core/dispatch/ObservedOperators.cpp @@ -15,6 +15,7 @@ std::unordered_set& ObservedOperators::getUnobservedOperatorList() "aten::_version", "aten::is_complex", "profiler::_record_function_enter", + "profiler::_record_function_enter_new", "profiler::_record_function_exit", }; return not_observed_ops; diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index d4d997fde69a..d5cc6d45933f 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -283,7 +283,10 @@ std::pair OperatorEntry::computeDispatchTab } // 3. Backend fallback - auto dispatch_ix = static_cast(dispatch_key); + auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); + if (dispatch_ix < 0) { + return {missingKernel(), "backend fallback not registered on mobile"}; + } if (dispatcher.backendFallbackKernels_[dispatch_ix].kernel.isValid()) { return {dispatcher.backendFallbackKernels_[dispatch_ix], "backend fallback"}; } @@ -299,7 +302,7 @@ std::pair OperatorEntry::computeDispatchTab // or alias keys and their associated keysets). 
// This function should be considered a private helper for updateDispatchTable_() void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) { - const auto dispatch_ix = c10::getDispatchTableIndexForDispatchKey(dispatch_key); + const auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); if (C10_UNLIKELY(dispatch_ix == -1)) { return; } @@ -329,8 +332,12 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp } // Note [Refresh Runtime Autograd entries in dispatchTable_] // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). + // In theory, we should only have to check if the given runtime key has "dense" functionality, + // e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit). + // However, there are some backends that should be included in this set that don't have the dense key set. + // E.g. DispatchKey::Meta, DispatchKey::ORT. if (c10::isBackendDispatchKey(dispatch_key)) { - DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); + DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key)); updateDispatchTableEntry_(dispatcher, autograd_key); } } @@ -357,8 +364,9 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) // catchAll. After catchAllKernel_ is removed, Undefined now can get a kernel from either CompositeExplicitAutograd // or CompositeImplicitAutograd alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd) // should return true, it returns false because Undefined cannot be represented in a DispatchKeySet. - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - updateDispatchTable_(dispatcher, static_cast(iter)); + updateDispatchTable_(dispatcher, DispatchKey::Undefined); + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + updateDispatchTable_(dispatcher, k); } } @@ -371,9 +379,13 @@ void OperatorEntry::checkInvariants() const { for (const auto& kv : kernels_) { TORCH_INTERNAL_ASSERT(kv.second.size() > 0, dumpState()); } - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), static_cast(iter)); - TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[iter]), + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k); + auto idx = getDispatchTableIndexForDispatchKey(k); + if (C10_UNLIKELY(idx == -1)) { + continue; + } + TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[idx]), "Canonical state\n~~~~~~~~~~~\n", dumpState(), "\n\n" "Computed table:\n~~~~~~~~~~~\n", dumpComputedTable()); } @@ -384,8 +396,9 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << "["; bool has_kernels = false; - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - if (!dispatchTable_[iter].isValid()) { + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + auto iter = getDispatchTableIndexForDispatchKey(k); + if (iter == -1 || !dispatchTable_[iter].isValid()) { continue; } if (has_kernels) { @@ -443,8 +456,12 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const { // updateDispatchTableFull_ would update the dispatch table to be) std::string OperatorEntry::dumpComputedTable() const { 
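The loops above all move from indexing 0..NumDispatchKeys to iterating the runtime keys in DispatchKeySet(DispatchKeySet::FULL), treating Undefined separately (it cannot be represented in a keyset) and skipping keys whose table index comes back negative. A sketch of that iteration idiom; it assumes getDispatchTableIndexForDispatchKey is the c10 free function declared alongside DispatchKeySet, as the unqualified calls in these hunks suggest:

#include <c10/core/DispatchKeySet.h>

// Visit every runtime dispatch key the way the updated OperatorEntry code does.
// Undefined is handled by hand because a DispatchKeySet cannot hold it.
template <typename Visitor>
void forEachRuntimeKey(Visitor&& visit) {
  visit(c10::DispatchKey::Undefined);
  for (c10::DispatchKey k : c10::DispatchKeySet(c10::DispatchKeySet::FULL)) {
    // On mobile builds some keys have no dispatch-table slot; -1 marks those.
    if (c10::getDispatchTableIndexForDispatchKey(k) == -1) {
      continue;
    }
    visit(k);
  }
}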
std::ostringstream oss; - for (uint8_t i = 0; i < static_cast(DispatchKey::NumDispatchKeys); i++) { - auto k = static_cast(i); + // Need to handle Undefined separately, because its a runtime key that can't be represented + // in a DispatchKeySet. + std::vector runtime_keys = {DispatchKey::Undefined}; + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k); + + for (auto k : runtime_keys) { auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); if (kernel_prov.first.kernel.isValid()) { oss << toString(k) << ": " diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index d98bd6bc6904..c0f90808280a 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -173,10 +173,10 @@ class TORCH_API OperatorEntry final { [[noreturn]] void reportError(DispatchKey dispatchKey) const; - const KernelFunction& lookup(DispatchKey k) const { - const auto idx = getDispatchTableIndexForDispatchKey(k); + const KernelFunction& lookup(DispatchKeySet ks) const { + const auto idx = ks.getDispatchTableIndexForDispatchKeySet(); if (C10_UNLIKELY(idx == -1)) { - reportError(k); + reportError(ks.highestPriorityTypeId()); } const auto& kernel = dispatchTable_[idx]; // A valid kernel *always* has a boxed kernel and *may* have an @@ -187,7 +187,7 @@ class TORCH_API OperatorEntry final { // in the common case. if (C10_UNLIKELY(!kernel.isValidUnboxed())) { if (!kernel.isValid()) { - reportError(k); + reportError(ks.highestPriorityTypeId()); } } return kernel; @@ -211,7 +211,7 @@ class TORCH_API OperatorEntry final { OperatorName name_; c10::optional schema_; - std::array dispatchTable_; + std::array dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; // kernels_ stores all registered kernels for the corresponding dispatch key diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index 95050da593eb..5920d7c05f1f 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -123,6 +123,7 @@ DynamicType::DynamicType(const Type& other) : SharedType(DynamicType::Kind) { tag_ = Tag::T; \ break; FORALL_DYNAMIC_TYPES(CASE_TYPE) + FORALL_DYNAMIC_TYPES_FAKE(CASE_TYPE) #undef CASE_TYPE default: TORCH_INTERNAL_ASSERT(false, "Unsupported dynamic type: ", other.str()); @@ -210,6 +211,9 @@ TypeKind DynamicType::dynamicKind() const { case Tag::T: \ return TypeKind::T##Type; FORALL_DYNAMIC_TYPES(CASE_TYPE) + // FORALL_DYNAMIC_TYPES_FAKE is intentionally omitted here + // as these dynamic types map to the same tag, so they always + // resolve to integers #undef CASE_TYPE default: TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); @@ -227,6 +231,8 @@ TypePtr DynamicType::fallback() const { return BoolType::get(); case Tag::Int: return IntType::get(); + case Tag::SymInt: + return SymIntType::get(); case Tag::Float: return FloatType::get(); case Tag::Complex: @@ -320,6 +326,8 @@ DynamicType::Ptr IValue::TagType::get(const c10::IValue& v) { return DynamicTypeTrait::getBaseType(); case Tag::Int: return DynamicTypeTrait::getBaseType(); + case Tag::SymInt: + return DynamicTypeTrait::getBaseType(); case Tag::Bool: return DynamicTypeTrait::getBaseType(); case Tag::String: @@ -368,7 +376,7 @@ ivalue::TupleTypeFactory::fallback(const Type& type) { for (const auto& elem : dyn.arguments().elems) { types.emplace_back(elem.ty); if (const auto& name = elem.label) { - fields.emplace_back(*elem.label); + fields.emplace_back(*name); } } if 
(const auto& name = dyn.name()) { @@ -381,6 +389,7 @@ ivalue::TupleTypeFactory::fallback(const Type& type) { #define DYNAMIC_TYPE_TAG_VALUE(NAME, _, __) \ constexpr bool DynamicTypeTrait::isBaseType; FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_TAG_VALUE) +FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_TAG_VALUE) #undef DYNAMIC_TYPE_TAG_VALUE } // namespace c10 diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index d5551c9a5e51..a84644ddde04 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -16,6 +16,7 @@ constexpr DynamicTypeBits kDynamicAnyTypeBit = DYNAMIC_TYPE_BIT(30); constexpr DynamicTypeBits kDynamicNoneTypeBit = DYNAMIC_TYPE_BIT(1); constexpr DynamicTypeBits kDynamicIntTypeBit = DYNAMIC_TYPE_BIT(3); +constexpr DynamicTypeBits kDynamicSymIntTypeBit = DYNAMIC_TYPE_BIT(23); constexpr DynamicTypeBits kDynamicFloatTypeBit = DYNAMIC_TYPE_BIT(4); constexpr DynamicTypeBits kDynamicComplexTypeBit = DYNAMIC_TYPE_BIT(5); constexpr DynamicTypeBits kDynamicListTypeBit = DYNAMIC_TYPE_BIT(7); @@ -28,6 +29,7 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(Bool, DYNAMIC_TYPE_BIT(2), 1) \ _(Int, kDynamicIntTypeBit, 1) \ _(Float, kDynamicFloatTypeBit, 1) \ + _(SymInt, kDynamicSymIntTypeBit, 1) \ _(Complex, kDynamicComplexTypeBit, 1) \ _(Number, \ (kDynamicIntTypeBit | kDynamicFloatTypeBit | kDynamicComplexTypeBit), \ @@ -58,8 +60,14 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(Future, DYNAMIC_TYPE_BIT(22), 0) \ _(Any, 0xffffffff, 1) +#define FORALL_DYNAMIC_TYPES_FAKE(_) \ + _(ScalarType, kDynamicIntTypeBit, 1) \ + _(Layout, kDynamicIntTypeBit, 1) \ + _(MemoryFormat, kDynamicIntTypeBit, 1) + #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; FORALL_DYNAMIC_TYPES(FORWARD_DECL_TYPE) + FORALL_DYNAMIC_TYPES_FAKE(FORWARD_DECL_TYPE) #undef FORWARD_DECL_TYPE class DynamicType; @@ -136,6 +144,7 @@ class DynamicType : public SharedType { enum class Tag : DynamicTypeBits { #define DYNAMIC_TYPE_ITEM(NAME, VAL, _) NAME = VAL, FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_ITEM) + FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_ITEM) #undef DYNAMIC_TYPE_ITEM }; @@ -159,7 +168,7 @@ class DynamicType : public SharedType { const Arguments& arguments() const { return arguments_; } - TypeKind dynamicKind() const; + TORCH_API TypeKind dynamicKind() const; // Should be used only on the server side to restore static type information. 
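The reason FORALL_DYNAMIC_TYPES_FAKE is excluded from the dynamicKind() switch is that all three of its entries expand to kDynamicIntTypeBit, so the generated enumerators are just additional names for the Int tag value and duplicate case labels would not compile. A reduced, self-contained sketch of the same X-macro effect:

#include <cstdint>
#include <iostream>

constexpr uint32_t kIntBit = 1u << 3;  // plays the role of kDynamicIntTypeBit

#define FORALL_REAL(_)  _(Int, kIntBit)
#define FORALL_FAKE(_)  _(ScalarType, kIntBit) _(Layout, kIntBit) _(MemoryFormat, kIntBit)

enum class Tag : uint32_t {
#define ITEM(NAME, VAL) NAME = VAL,
  FORALL_REAL(ITEM)
  FORALL_FAKE(ITEM)  // legal: several enumerators may share one value
#undef ITEM
};

int main() {
  // Every "fake" tag compares equal to Tag::Int, which is exactly why a switch
  // over Tag can only name one of them per case and why they resolve to ints.
  std::cout << (Tag::ScalarType == Tag::Int) << (Tag::Layout == Tag::Int)
            << (Tag::MemoryFormat == Tag::Int) << "\n";  // prints 111
}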
#ifndef C10_MOBILE @@ -223,6 +232,7 @@ C10_NOINLINE DynamicTypePtr makeBaseType(DynamicType::Tag tag); } \ }; // namespace c10 FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_TAG_VALUE) +FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_TAG_VALUE) #undef DYNAMIC_TYPE_TAG_VALUE } // namespace c10 diff --git a/aten/src/ATen/core/enum_type.h b/aten/src/ATen/core/enum_type.h index 50e4f3b88ba2..720d5363799f 100644 --- a/aten/src/ATen/core/enum_type.h +++ b/aten/src/ATen/core/enum_type.h @@ -87,6 +87,7 @@ struct TORCH_API EnumType : public NamedType { std::string annotation_str_impl( TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index aa18e9a073df..76e417b8c5cf 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -29,7 +29,7 @@ using Kwargs = std::unordered_map; struct RecursiveMethodCallError : public std::exception {}; using TaskLauncher = std::function)>; -TORCH_API void preoptimizeGraph(std::shared_ptr& graph); +TORCH_API void preoptimizeGraph(std::shared_ptr& graph, bool disable_autocast=false); // A Function is a pure Graph with no implicit `self` object bound. // It contains schema information and the executor that manages the @@ -48,8 +48,9 @@ struct TORCH_API Function { virtual void run(Stack& stack) = 0; virtual c10::intrusive_ptr runAsync( - Stack& stack, + Stack& /*stack*/, TaskLauncher taskLauncher = at::launch) { + (void)taskLauncher; // Suppress unused variable warning TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return {}; } @@ -89,7 +90,7 @@ struct TORCH_API Function { // call() returns false. // Overload for server interpreter, a bailout size is needed for graph executor. - virtual bool call(Stack&, size_t, c10::function_ref) { + virtual bool call(Stack&, c10::optional, c10::function_ref) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 328ab79e2e44..2b3d51ee5e2e 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -30,8 +30,19 @@ struct Argument { c10::optional default_value = c10::nullopt, bool kwarg_only = false, c10::optional alias_info = c10::nullopt) + : Argument(name, type, type, N, default_value, kwarg_only, alias_info) {} + + Argument( + std::string name, + TypePtr fake_type, + TypePtr real_type, + c10::optional N = c10::nullopt, + c10::optional default_value = c10::nullopt, + bool kwarg_only = false, + c10::optional alias_info = c10::nullopt) : name_(std::move(name)), - type_(type ? std::move(type) : TensorType::get()), + type_(fake_type ? std::move(fake_type) : TensorType::get()), + real_type_(real_type ? std::move(real_type) : TensorType::get()), N_(std::move(N)), default_value_(std::move(default_value)), alias_info_(alias_info ? std::make_unique(std::move(*alias_info)) : nullptr), @@ -46,6 +57,7 @@ struct Argument { Argument(const Argument& rhs) : name_(rhs.name_), type_(rhs.type_), + real_type_(rhs.real_type_), N_(rhs.N_), default_value_(rhs.default_value_), alias_info_(rhs.alias_info_ ? std::make_unique(*rhs.alias_info_) : nullptr), @@ -58,6 +70,7 @@ struct Argument { if (this != &rhs) { name_ = rhs.name_; type_ = rhs.type_; + real_type_ = rhs.real_type_; N_ = rhs.N_; default_value_ = rhs.default_value_; alias_info_ = rhs.alias_info_ ? 
std::make_unique(*rhs.alias_info_) : nullptr; @@ -73,6 +86,9 @@ struct Argument { const TypePtr& type() const { return type_; } + const TypePtr& real_type() const { + return real_type_; + } c10::optional N() const { return N_; } @@ -153,6 +169,7 @@ struct Argument { private: std::string name_; TypePtr type_; + TypePtr real_type_; // this is ScalarType, not int, e.g. // for list types, an optional statically known length for the list // e.g. for int[3]: type = ListType::ofInts(), N = 3 // If present, this will allow scalars to be broadcast to this length to diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index 5d58ee88a418..dc4fdaf10133 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -162,7 +162,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } } - // we want to test both out and default args seperately + // we want to test both out and default args separately size_t old_out_start_idx = findFirstOutArg(old.arguments()); size_t new_out_start_idx = findFirstOutArg(arguments()); @@ -212,7 +212,7 @@ inline bool FunctionSchema::isForwardCompatibleWith( return false; } - // we want to test both out and default args seperately + // we want to test both out and default args separately size_t old_out_start_idx = findFirstOutArg(old.arguments()); size_t new_out_start_idx = findFirstOutArg(arguments()); diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 36fb0f91e4c8..10be63c2c1d9 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -43,8 +43,15 @@ namespace c10 { _(prim, FusionGroup) \ _(prim, CudaFusionGroup) \ _(prim, CudaFusionGuard) \ + _(prim, oneDNNFusionGroup) \ + _(prim, oneDNNFusionGuard) \ _(prim, FunctionalGraph) \ _(prim, add_optional) \ + _(prim, view_copy) \ + _(prim, reshape_copy) \ + _(prim, squeeze_copy) \ + _(prim, unsqueeze_copy) \ + _(prim, flatten_copy) \ _(prim, DifferentiableGraph) \ _(prim, TensorExprGroup) \ _(prim, TensorExprDynamicGroup) \ @@ -60,6 +67,8 @@ namespace c10 { _(prim, PadPacked) /* onnx */ \ _(prim, Placeholder) /* debug */ \ _(prim, Print) \ + _(prim, EmptyListLiteral) \ + _(prim, LegacyTypedConstructor) \ _(prim, PythonOp) \ _(prim, IgnoredPythonOp) \ _(prim, Reverse) \ @@ -92,6 +101,7 @@ namespace c10 { _(prim, With) \ _(prim, Enter) \ _(prim, Exit) \ + _(prim, IfThenElse) \ _(aten, Bool) \ _(aten, Int) \ _(aten, FloatImplicit) \ @@ -102,7 +112,6 @@ namespace c10 { _(aten, Complex) \ _(aten, str) \ _(aten, Delete) \ - _(aten, gelu_) \ _(prim, device) \ _(prim, dtype) \ _(prim, layout) \ @@ -220,6 +229,7 @@ namespace c10 { _(onnx, Gemm) \ _(onnx, LSTM) \ _(onnx, MatMul) \ + _(onnx, Min) \ _(onnx, Mul) \ _(onnx, Pow) \ _(onnx, RNN) \ @@ -241,7 +251,7 @@ namespace c10 { _(onnx, Less) \ _(onnx, LessOrEqual) \ _(onnx, Not) \ - _(onnx, ATen) \ + _(aten, ATen) \ _(onnx, Split) \ _(onnx, ConstantOfShape) \ _(onnx, Cast) \ @@ -270,6 +280,9 @@ namespace c10 { _(onnx, Range) \ _(onnx, Tile) \ _(onnx, Where) \ + _(onnx, Optional) \ + _(onnx, OptionalGetElement) \ + _(onnx, OptionalHasElement) \ FORALL_ATTR_BASE_SYMBOLS(_) \ _(attr, Subgraph) \ _(attr, ReverseSubgraph) \ @@ -297,6 +310,7 @@ namespace c10 { _(attr, transA) \ _(attr, transB) \ _(attr, name) \ + _(attr, module) \ _(attr, beg) \ _(attr, idx) \ _(attr, split) \ @@ -308,8 +322,10 @@ namespace c10 { _(attr, cache_id) \ _(attr, new_axis) \ _(attr, warn_id) \ + _(attr, output_layouts) \ _(attr, allowzero) \ - 
_(attr, seen_none) + _(attr, seen_none) \ + _(attr, overload_name) enum class _keys : unique_t { #define DEFINE_KEY(ns, s) ns##_##s, diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 85117e345e30..eb977f09cbe6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -91,6 +91,8 @@ c10::TypePtr IValue::TagType::get(const IValue& v) { return ComplexType::get(); case Tag::Int: return IntType::get(); + case Tag::SymInt: + return c10::SymIntType::get(); case Tag::Bool: return BoolType::get(); case Tag::String: @@ -271,8 +273,8 @@ bool operator==(const IValue& lhs, const IValue& rhs) { } bool IValue::ptrEqual(const IValue& lhs, const IValue& rhs) { - TORCH_INTERNAL_ASSERT(lhs.is_intrusive_ptr); - TORCH_INTERNAL_ASSERT(rhs.is_intrusive_ptr); + TORCH_INTERNAL_ASSERT(lhs.isIntrusivePtr()); + TORCH_INTERNAL_ASSERT(rhs.isIntrusivePtr()); return lhs.tag == rhs.tag && lhs.payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } @@ -298,6 +300,8 @@ IValue IValue::equals(const IValue& rhs) const { return rhs.isComplexDouble() && lhs.toComplexDouble() == rhs.toComplexDouble(); case Tag::Int: return rhs.isInt() && lhs.toInt() == rhs.toInt(); + case Tag::SymInt: + return rhs.isSymInt() && lhs.toSymInt() == rhs.toSymInt(); case Tag::Bool: return rhs.isBool() && lhs.toBool() == rhs.toBool(); case Tag::String: @@ -349,6 +353,8 @@ size_t IValue::hash(const IValue& v) { return c10::get_hash(v.payload.u.as_int); case Tag::Int: return c10::get_hash(v.payload.u.as_int); + case Tag::SymInt: + return c10::get_hash(v.payload.u.as_int); case Tag::String: return c10::get_hash(v.toStringRef()); case Tag::Tuple: @@ -398,8 +404,8 @@ bool IValue::is(const IValue& rhs) const { return rhs.isTensor() && lhs.toTensor().is_same(rhs.toTensor()); } - if (lhs.is_intrusive_ptr) { - return rhs.is_intrusive_ptr && ptrEqual(lhs, rhs); + if (lhs.isIntrusivePtr()) { + return rhs.isIntrusivePtr() && ptrEqual(lhs, rhs); } return lhs == rhs; } @@ -429,6 +435,15 @@ bool IValue::isTensorList() const { return isListOf(); } +bool IValue::isOptionalTensorList() const { + if (!isList()) { + return false; + } + const auto& ty = static_cast(payload.u.as_intrusive_ptr)->elementType; + const auto expected_ty = c10::getTypePtr>(); + return expected_ty == ty; +} + bool IValue::isIntList() const { return isListOf(); } @@ -567,6 +582,8 @@ std::ostream& IValue::repr( } case IValue::Tag::Int: return out << v.toInt(); + case IValue::Tag::SymInt: + return out << v.toSymInt(); case IValue::Tag::Bool: return out << (v.toBool() ? "True" : "False"); case IValue::Tag::Tuple: { @@ -753,6 +770,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printComplex(out, v); } case IValue::Tag::Int: return out << v.toInt(); + case IValue::Tag::SymInt: + return out << v.toSymInt(); case IValue::Tag::Bool: return out << (v.toBool() ? 
"True" : "False"); case IValue::Tag::Tuple: { @@ -886,6 +905,7 @@ IValue IValue::deepcopy( case IValue::Tag::None: case IValue::Tag::Double: case IValue::Tag::Int: + case IValue::Tag::SymInt: case IValue::Tag::Bool: case IValue::Tag::Device: case IValue::Tag::Uninitialized: { @@ -1159,5 +1179,4 @@ TORCH_API intrusive_ptr collectAny( } return ctx->dstFuture; } - } // namespace c10 diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 81867348450d..e9a0caecc5d6 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -92,12 +92,29 @@ struct OptionalArray { return *this; } + // Used when saving an argument for the backwards pass. + OptionalArray& operator=(c10::OptionalArrayRef ref) { + if (ref) { + list = std::vector(ref->begin(), ref->end()); + } else { + list = nullopt; + } + return *this; + } + operator c10::optional>() { if (!list) { return nullopt; } return *list; } + + operator c10::OptionalArrayRef() { + if (!list) { + return nullopt; + } + return *list; + } }; // Capsule is an internal implementation detail of custom C++ classes. We @@ -127,6 +144,7 @@ struct Capsule { _(Double) \ _(ComplexDouble) \ _(Int) \ + _(SymInt) \ _(Bool) \ _(Tuple) \ _(String) \ @@ -183,13 +201,13 @@ struct Capsule { /// \endrst struct TORCH_API IValue final { IValue(const IValue& rhs) - : IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr) { - if (is_intrusive_ptr && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + : IValue(rhs.payload, rhs.tag) { + if (isIntrusivePtr() && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr); } } - IValue(IValue&& rhs) noexcept : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + IValue(IValue&& rhs) noexcept : tag(rhs.tag) { moveFrom(std::move(rhs)); } @@ -330,12 +348,12 @@ struct TORCH_API IValue final { return isAliasOf(this->toTensor(), rhs.toTensor()); } - if (!this->is_intrusive_ptr) { + if (!isIntrusivePtr()) { // Primitive types don't alias anything return false; } - AT_ASSERT(rhs.is_intrusive_ptr); + AT_ASSERT(rhs.isIntrusivePtr()); // Other types can be compared by their ptr value return this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; @@ -347,7 +365,7 @@ struct TORCH_API IValue final { return payload.as_tensor.use_count(); } - if (!is_intrusive_ptr) { + if (!isIntrusivePtrLegacyBehavior()) { return 1; } @@ -380,7 +398,6 @@ struct TORCH_API IValue final { } else { std::swap(payload.u, rhs.payload.u); } - std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); std::swap(tag, rhs.tag); } @@ -388,7 +405,7 @@ struct TORCH_API IValue final { // While some of these accessors could be generated through templates, // we prefer to write them manually for clarity - IValue(at::TensorBase t) : tag(Tag::Tensor), is_intrusive_ptr(false) { + IValue(at::TensorBase t) : tag(Tag::Tensor) { new (&payload.as_tensor) at::Tensor(std::move(t)); } bool isTensor() const { @@ -407,12 +424,7 @@ struct TORCH_API IValue final { return payload.as_tensor.unsafeGetTensorImpl(); } - IValue(at::Storage s) : tag(Tag::Storage), is_intrusive_ptr(static_cast(s)) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. 
+ IValue(at::Storage s) : tag(Tag::Storage) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(s.unsafeReleaseStorageImpl()); } bool isStorage() const { @@ -430,7 +442,7 @@ struct TORCH_API IValue final { /// @private [doxygen private] IValue(intrusive_ptr blob) - : tag(Tag::Blob), is_intrusive_ptr(true) { + : tag(Tag::Blob) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and store it as a Tensor instead. payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); @@ -497,7 +509,7 @@ struct TORCH_API IValue final { C10_NODISCARD ivalue::Tuple& toTupleRef() const; // Double - IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { + IValue(double d) : tag(Tag::Double) { payload.u.as_double = d; } bool isDouble() const { @@ -539,10 +551,24 @@ struct TORCH_API IValue final { c10::intrusive_ptr toQuantizer() const&; // Int - IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { + IValue(int64_t i) : tag(Tag::Int) { payload.u.as_int = i; } + IValue(c10::SymInt i) : tag(Tag::SymInt) { + payload.u.as_int = i.data(); + } + + IValue(c10::SymIntArrayRef v); + + bool isSymInt() const { + return Tag::SymInt == tag; + } + + c10::SymInt toSymInt() const { + return c10::SymInt(payload.u.as_int); + } + // allow you to pass literals (3, 4) without ambiguity IValue(int32_t i) : IValue(static_cast(i)) {} @@ -556,7 +582,7 @@ struct TORCH_API IValue final { } // Bool - IValue(bool b) : tag(Tag::Bool), is_intrusive_ptr(false) { + IValue(bool b) : tag(Tag::Bool) { #if defined(__clang__) && defined(__x86_64__) // Initializing entire payload stops valgrind's from reporting // "jump or move depends on uninitialised value" in IValue copy constructor @@ -619,6 +645,12 @@ struct TORCH_API IValue final { c10::List toTensorList() const&; std::vector toTensorVector() const; + // OptionalTensorList + bool isOptionalTensorList() const; + c10::List> toOptionalTensorList() &&; + c10::List> toOptionalTensorList() const&; + std::vector> toOptionalTensorVector() const; + // GenericList IValue(c10::List v); bool isList() const { @@ -666,6 +698,8 @@ struct TORCH_API IValue final { template = nullptr> IValue(c10::optional v); + template = nullptr> + IValue(c10::OptionalArrayRef v); IValue(c10::nullopt_t); // ClassType @@ -698,7 +732,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : tag(Tag::None), is_intrusive_ptr(false) {} + IValue() : tag(Tag::None) {} bool isNone() const { return Tag::None == tag; } @@ -716,15 +750,17 @@ struct TORCH_API IValue final { // Scalar, which gets encoded as either an Int, a Double or a ComplexDouble IValue(const at::Scalar& s) : IValue() { if (s.isFloatingPoint()) { - *this = s.toDouble(); + tag = Tag::Double; + payload.u.as_double = s.toDouble(); } else if (s.isComplex()) { *this = s.toComplexDouble(); } else if (s.isBoolean()) { - *this = s.toBool(); - } else if (s.isIntegral(false)) { - *this = s.toLong(); + tag = Tag::Bool; + payload.u.as_bool = s.toBool(); } else { - TORCH_CHECK(false, "Unknown type in Scalar"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(s.isIntegral(false), "Unknown type in Scalar"); + tag = Tag::Int; + payload.u.as_int = s.toLong(); } } @@ -745,7 +781,7 @@ struct TORCH_API IValue final { } // Device - IValue(c10::Device d) : tag(Tag::Device), is_intrusive_ptr(false) { + IValue(c10::Device d) : tag(Tag::Device) { payload.u.as_device.type = d.type(); payload.u.as_device.index = d.index(); } @@ -759,7 +795,7 @@ struct TORCH_API IValue final { //Stream IValue(c10::Stream 
stream) - : tag(Tag::Stream), is_intrusive_ptr(false) { + : tag(Tag::Stream) { payload.u.as_int = stream.pack(); } c10::Stream toStream() &&; @@ -788,7 +824,7 @@ struct TORCH_API IValue final { } // QScheme - IValue(at::QScheme qscheme) : tag(Tag::Int), is_intrusive_ptr(false) { + IValue(at::QScheme qscheme) : tag(Tag::Int) { payload.u.as_int = static_cast(qscheme); } @@ -804,12 +840,7 @@ struct TORCH_API IValue final { } // Generator - IValue(at::Generator g) : tag(Tag::Generator), is_intrusive_ptr(g.defined()) { - // Note: the undefined generator is not refcounted, so while it - // is tagged as a generator, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined generator. + IValue(at::Generator g) : tag(Tag::Generator) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(g.unsafeReleaseGeneratorImpl()); } bool isGenerator() const { @@ -881,7 +912,10 @@ struct TORCH_API IValue final { const IValue& v); bool isPtrType() const { - return (isTensor() && payload.as_tensor.defined()) || is_intrusive_ptr; + if (isTensor()) { + return payload.as_tensor.defined(); + } + return isIntrusivePtrLegacyBehavior(); } /// @private [doxygen private] @@ -989,7 +1023,7 @@ struct TORCH_API IValue final { // the "wrong" one of as_tensor and as_intrusive_ptr and 2) enable // the compiler to generate the same code for each case. It is // surprisingly difficult to get this right. - if (isTensor() || is_intrusive_ptr) { + if (isTensor() || isIntrusivePtr()) { c10::intrusive_ptr_target* p = isTensor() ? payload.as_tensor.unsafeGetTensorImpl() : payload.u.as_intrusive_ptr; c10::intrusive_ptr::reclaim(p); // No need to make this destructor call! @@ -1013,14 +1047,78 @@ struct TORCH_API IValue final { payload.u = rhs.payload.u; } tag = rhs.tag; - is_intrusive_ptr = rhs.is_intrusive_ptr; rhs.clearToNone(); } void clearToNone() noexcept { payload.u.as_int = 0; tag = Tag::None; - is_intrusive_ptr = false; + } + + bool isIntrusivePtr() const { + switch (tag) { + case Tag::None: + return false; + case Tag::Tensor: + return false; + case Tag::Storage: + return true; + case Tag::Generator: + return true; + case Tag::Double: + return false; + case Tag::ComplexDouble: + return true; + case Tag::Int: + return false; + case Tag::SymInt: + return false; + case Tag::Bool: + return false; + case Tag::Tuple: + return true; + case Tag::String: + return true; + case Tag::Blob: + return true; + case Tag::GenericList: + return true; + case Tag::GenericDict: + return true; + case Tag::Future: + return true; + case Tag::Device: + return false; + case Tag::Stream: + return false; + case Tag::Object: + return true; + case Tag::PyObject: + return true; + case Tag::Uninitialized: + return false; + case Tag::Capsule: + return true; + case Tag::RRef: + return true; + case Tag::Quantizer: + return true; + case Tag::Enum: + return true; + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false, "unexpected tag ", static_cast(tag)); + return false; + } + + // Storage and Generator were treated specially when + // is_intrusive_ptr was stored as explicit state. This getter + // preserves the old behavior for use with WeakIValue for now. 
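Since the refcounting decision is now a pure function of the tag, the only information the old boolean carried that the tag cannot recover is the undefined Storage/Generator case, which the legacy getter below keeps for WeakIValue. A standalone sketch of deriving such a flag from the tag instead of storing it:

#include <cassert>

enum class Tag { None, Int, Tensor, Storage, Tuple };

// Whether a value with this tag owns an intrusive_ptr: derived, never stored,
// so it cannot drift out of sync with the tag the way a separate flag could.
constexpr bool isIntrusive(Tag t) {
  switch (t) {
    case Tag::Storage:
    case Tag::Tuple:
      return true;
    case Tag::None:
    case Tag::Int:
    case Tag::Tensor:  // the tensor payload carries its own refcount, handled separately
      return false;
  }
  return false;
}

int main() {
  static_assert(isIntrusive(Tag::Tuple), "tuple payload is refcounted");
  static_assert(!isIntrusive(Tag::Int), "ints live inline in the payload");
  assert(isIntrusive(Tag::Storage));
}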
+ bool isIntrusivePtrLegacyBehavior() const { + if (tag == Tag::Storage || tag == Tag::Generator) { + return payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(); + } else { + return isIntrusivePtr(); + } } union Payload { @@ -1048,7 +1146,7 @@ struct TORCH_API IValue final { ~Payload() {} }; - IValue(const Payload& p, Tag t, bool i) : tag(t), is_intrusive_ptr(i) { + IValue(const Payload& p, Tag t) : tag(t) { if (isTensor()) { new (&payload.as_tensor) at::Tensor(p.as_tensor); } else { @@ -1063,7 +1161,6 @@ struct TORCH_API IValue final { Payload payload; Tag tag; - bool is_intrusive_ptr; friend struct WeakIValue; }; @@ -1080,7 +1177,7 @@ struct TORCH_API WeakIValue final { } WeakIValue(const IValue& rhs) : tag(rhs.tag), - is_intrusive_ptr(rhs.is_intrusive_ptr) { + is_intrusive_ptr(rhs.isIntrusivePtrLegacyBehavior()) { if (rhs.isTensor()) { payload.as_intrusive_ptr = rhs.unsafeToTensorImpl(); is_intrusive_ptr = true; @@ -1124,7 +1221,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { IValue::Payload newPayload; newPayload.u = payload; - return IValue(newPayload, tag, false); + return IValue(newPayload, tag); } if (IValue::Tag::Tensor == tag) { auto temp = c10::weak_intrusive_ptr::reclaim( @@ -1147,7 +1244,7 @@ struct TORCH_API WeakIValue final { if (!pl.u.as_intrusive_ptr) { return IValue(); } else { - return IValue(pl, tag, true); + return IValue(pl, tag); } } } diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 6c524da40ed2..7f87380e7267 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -1179,7 +1179,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { continue; } c10::Device device = storage->device(); - if (!device.is_cpu()) { + if (!device.is_cpu() && !device.is_meta()) { TORCH_CHECK_VALUE( device.type() == impl.type(), "Expected all data ptrs to be on a device of type ", @@ -1235,7 +1235,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { // We need devices to be sorted in order to use ensureIsSubsetOfDevices. 
static std::vector sortAndDeduplicateDevices( - const c10::impl::VirtualGuardImpl& impl, + const c10::impl::VirtualGuardImpl& /*impl*/, std::vector devices) { std::sort( devices.begin(), devices.end(), @@ -1584,6 +1584,7 @@ DEFINE_TO(at::MemoryFormat, toMemoryFormat) DEFINE_TO(at::QScheme, toQScheme) DEFINE_TO(at::Dimname, toDimname) DEFINE_TO(at::Generator, toGenerator) +DEFINE_TO(c10::SymInt, toSymInt) template struct _fake_type {}; @@ -1880,6 +1881,22 @@ inline std::vector IValue::toTensorVector() const { return createVectorFromList( static_cast(payload.u.as_intrusive_ptr)); } +inline c10::List> IValue::toOptionalTensorList() && { + AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); + return c10::List>(moveToIntrusivePtr()); +} +inline c10::List> IValue::toOptionalTensorList() const& { + AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); + return c10::List>(toIntrusivePtr()); +} +inline std::vector> IValue::toOptionalTensorVector() const { + AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toOptionalTensorVector on null intrusive_ptr IValue"); + return createVectorFromList>( + static_cast(payload.u.as_intrusive_ptr)); +} inline c10::List IValue::toList() && { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); return c10::List(moveToIntrusivePtr()); @@ -1922,7 +1939,7 @@ inline ivalue::Tuple& IValue::toTupleRef() const { } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Tuple), is_intrusive_ptr(true) { + : tag(Tag::Tuple) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template < @@ -1950,14 +1967,14 @@ inline IValue::IValue(std::tuple&& t) } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::String), is_intrusive_ptr(true) { + : tag(Tag::String) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(std::string v) : IValue(ivalue::ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::impl::GenericList v) - : tag(Tag::GenericList), is_intrusive_ptr(true) { + : tag(Tag::GenericList) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } @@ -1973,6 +1990,7 @@ inline IValue::IValue(at::ArrayRef v) : IValue(c10::List()) { list.push_back(e); } } +inline IValue::IValue(c10::SymIntArrayRef v) : IValue(at::ArrayRef(v.data(), v.size())) {} template > inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { auto list = to>(); @@ -1981,6 +1999,13 @@ inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { list.push_back(e); } } +template > +inline IValue::IValue(c10::OptionalArrayRef v) : IValue() { + if (v.has_value()) { + *this = IValue(std::move(*v)); + } +} + template inline IValue::IValue(std::array v) : IValue(c10::List()) { auto list = to>(); @@ -1991,7 +2016,7 @@ inline IValue::IValue(std::array v) : IValue(c10::List()) { } inline IValue::IValue(c10::impl::GenericDict v) - : tag(Tag::GenericDict), is_intrusive_ptr(true) { + : tag(Tag::GenericDict) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template @@ -2018,17 +2043,17 @@ inline IValue::IValue(c10::optional v) : IValue() { inline IValue::IValue(c10::nullopt_t) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Object), is_intrusive_ptr(true) { + : tag(Tag::Object) { payload.u.as_intrusive_ptr = 
null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::PyObject), is_intrusive_ptr(true) { + : tag(Tag::PyObject) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Enum), is_intrusive_ptr(true) { + : tag(Tag::Enum) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } @@ -2036,7 +2061,6 @@ inline IValue IValue::make_capsule( intrusive_ptr blob) { IValue iv; iv.tag = Tag::Capsule; - iv.is_intrusive_ptr = true; iv.payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); return iv; } @@ -2059,27 +2083,26 @@ IValue::IValue(c10::intrusive_ptr custom_class) { ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class))); payload.u.as_intrusive_ptr = null_to_undefined_tensor(ivalue_obj.release()); tag = Tag::Object; - is_intrusive_ptr = true; } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Future), is_intrusive_ptr(true) { + : tag(Tag::Future) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::RRef), is_intrusive_ptr(true) { + : tag(Tag::RRef) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) - : tag(Tag::Quantizer), is_intrusive_ptr(true) { + : tag(Tag::Quantizer) { payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template inline IValue::IValue(c10::complex c) - : tag(Tag::ComplexDouble), is_intrusive_ptr(true) { + : tag(Tag::ComplexDouble) { auto v = c10::make_intrusive(c); payload.u.as_intrusive_ptr = v.release(); } @@ -2150,7 +2173,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // Str) return value equality // 2. If it is a tensor type, we need to take undefined tensor into account // 3. Undefined_tensor is None and vice versa should be true - // 4. If it is a reference type (i.e. is_intrusive_ptr), then is is True when + // 4. If it is a reference type (i.e. isIntrusivePtr()), then is True when // the pointed-to object is the same. // 5. False for all other comparisons. 
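The five numbered rules above reduce to value semantics for primitives and None, tensor-aware handling for Tensor, and pointer identity for everything refcounted. A brief, illustrative usage sketch of that contract:

#include <ATen/core/ivalue.h>

void same_identity_example() {
  c10::IValue a{1.5}, b{1.5};
  c10::IValue none_a, none_b;                   // default-constructed IValues are None
  bool prims = a.isSameIdentity(b);             // true: rule 1, primitives compare by value
  bool nones = none_a.isSameIdentity(none_b);   // true: None matches None
  // Refcounted payloads (strings, tuples, lists, objects) instead compare the
  // underlying pointer, per rule 4; equal contents are not enough.
  (void)prims;
  (void)nones;
}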
if (this->isNone() && rhs.isNone()) { @@ -2175,7 +2198,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { } else { // for objects holding in IValue, do shallow compare on pointer address to // testify the identity - return this->is_intrusive_ptr && rhs.is_intrusive_ptr && + return this->isIntrusivePtr() && rhs.isIntrusivePtr() && this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } } @@ -2192,7 +2215,7 @@ IValue from_(c10::intrusive_ptr x, std::false_type) { return IValue(std::move(x)); } template -IValue from_(T&& x, std::false_type) { +IValue from_(T&& /*x*/, std::false_type) { static_assert( guts::false_t::value, "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)"); @@ -2221,7 +2244,7 @@ struct MaybeOwnedTraits { if (from.isTensor()) { return IValue(MaybeOwnedTraits::createBorrow(from.toTensor())); } else { - return IValue(from.payload, from.tag, from.is_intrusive_ptr); + return IValue(from.payload, from.tag); } } @@ -2232,7 +2255,7 @@ struct MaybeOwnedTraits { } else if (rhs.isTensor()) { lhs = IValue(MaybeOwnedTraits::createBorrow(rhs.toTensor())); } else { - lhs = IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr); + lhs = IValue(rhs.payload, rhs.tag); } } diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index c04d48213bad..8dd9e15f7dd4 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -227,6 +227,9 @@ struct TORCH_API OptionalType : public UnionType { // common cast Optional[Tensor] for undefined tensor type static TypePtr ofTensor(); + // + // global singleton + static TypePtr get(TypePtr inner); private: explicit OptionalType(TypePtr contained); @@ -435,6 +438,17 @@ struct TORCH_API SymbolicShape { return dims_; } + c10::optional> symbolicDims() const { + if (!dims_) { + return c10::nullopt; + } + auto symbolic_dims = std::vector(); + for (const ShapeSymbol& s : *dims_) { + symbolic_dims.push_back(!s.is_static()); + } + return symbolic_dims; + } + // Checks whether the shape is fully defined/complete, ie. rank and sizes // of every dimension are known. bool isComplete() const { @@ -456,6 +470,14 @@ struct TORCH_API SymbolicShape { // result will be unranked. 
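symbolicDims() above simply inverts is_static() per dimension, giving callers a mask of which dimensions are symbolic, and the new operator== compares the underlying dims_. A short sketch; the optional<int64_t> constructor it uses is my assumption about the pre-existing SymbolicShape API, not something this diff adds:

#include <ATen/core/jit_type.h>
#include <cstdint>
#include <vector>

void symbolic_shape_example() {
  // Rank-2 shape: first dim statically 4, second dim unknown/symbolic.
  c10::SymbolicShape s(std::vector<c10::optional<int64_t>>{4, c10::nullopt});
  auto mask = s.symbolicDims();    // expected {false, true}: only the second dim is symbolic
  bool complete = s.isComplete();  // false: one dimension is not statically known
  bool same = (s == s);            // the new operator== compares dims_
  (void)mask;
  (void)complete;
  (void)same;
}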
SymbolicShape merge(const SymbolicShape& other) const; + friend bool operator==(const SymbolicShape& lhs, const SymbolicShape& rhs) { + return lhs.dims_ == rhs.dims_; + } + + friend bool operator!=(const SymbolicShape& lhs, const SymbolicShape& rhs) { + return !(lhs == rhs); + } + private: c10::optional> dims_; }; @@ -466,7 +488,7 @@ inline bool isComplete(const Stride& s) { } template -inline bool isComplete(const T& t) { +inline bool isComplete(const T& /*t*/) { return true; } } @@ -764,15 +786,36 @@ struct TORCH_API TensorType : public SharedType { static const TypeKind Kind = TypeKind::TensorType; - static std::vector contiguousStridesOf(at::IntArrayRef sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) // zero-dim case + static std::vector contiguousStridesOf( + at::IntArrayRef in_sizes, + at::MemoryFormat memory_format = MemoryFormat::Contiguous) { + auto contiguous_fn = [](const at::IntArrayRef& sizes, + const std::vector& dim_order) { + std::vector strides(sizes.size()); + if (sizes.empty()) // zero-dim case + return strides; + + strides[dim_order[0]] = 1; + for (size_t i = 1; i < dim_order.size(); i++) { + auto cur_dim = dim_order[i]; + auto pre_dim = dim_order[i - 1]; + strides[cur_dim] = strides[pre_dim] * sizes[pre_dim]; + } return strides; - strides.back() = 1; - for (size_t i = strides.size() - 1; i > 0; i--) { - strides[i - 1] = strides[i] * sizes[i]; + }; + + std::vector dim_order(in_sizes.size()); + if (memory_format == MemoryFormat::ChannelsLast) { + dim_order = {1, 3, 2, 0}; + } else if (memory_format == MemoryFormat::ChannelsLast3d) { + dim_order = {1, 4, 3, 2, 0}; + } else { + auto ndims = in_sizes.size(); + for (size_t i = 0; i < ndims; i++) { + dim_order[i] = ndims - i - 1; // Reverse + } } - return strides; + return contiguous_fn(in_sizes, dim_order); } private: @@ -840,6 +883,14 @@ struct TORCH_API ListType bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override; + // global singleton + // Given an inner type T and an identifier, + // this function wil return the global singleton type pointer + // the type List. + // The extra "identifier" argument is needed beccause we have multiple container types + // that all re-use this function (List, array, etc.) + static TypePtr get(std::string identifier, TypePtr inner); + // common cast List[Tensor] static ListTypePtr ofTensors(); static ListTypePtr ofOptionalTensors(); @@ -866,7 +917,11 @@ struct TORCH_API DictType : public SharedType { static const TypeKind Kind = TypeKind::DictType; static DictTypePtr create(TypePtr key, TypePtr value) { - switch (key->kind()) { + auto kind = key->kind(); + if (auto dyn = key->castRaw()) { + kind = dyn->dynamicKind(); + } + switch (kind) { case TypeKind::AnyType: case TypeKind::IntType: case TypeKind::BoolType: @@ -924,6 +979,14 @@ struct TORCH_API DictType : public SharedType { return false; } + // global singleton + // Given an inner type T and an identifier, + // this function wil return the global singleton type pointer + // the type List. 
+ // The extra "identifier" argument is needed beccause we have multiple container types + // that all re-use this function (Dict and unordered_map) + static TypePtr get(std::string identifier, TypePtr key, TypePtr val); + private: DictType(TypePtr key, TypePtr value) : SharedType(TypeKind::DictType), @@ -1173,6 +1236,7 @@ struct TORCH_API NumberType : public Type { NumberType(TypeKind kind = TypeKind::NumberType) : Type(kind) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "number"; // technically not a valid python type, but // we need to use it when parsing back in annotations // for implicit conversions @@ -1200,6 +1264,7 @@ struct TORCH_API FloatType : public NumberType { private: FloatType() : NumberType(TypeKind::FloatType) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "float"; } }; @@ -1225,10 +1290,36 @@ struct TORCH_API ComplexType : public NumberType { private: ComplexType() : NumberType(TypeKind::ComplexType) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "complex"; } }; +// We need to introduce `SymIntType` to represent the `SymInt` type +// used in function schemas e.g. `aten::narrow_copy(... SymInt length) +// `SymInt` will be used to enable tracing arithmetic operations on +// dimension values. Please see [SymInt.h] for more information +struct SymIntType; +using SymIntTypePtr = SingletonTypePtr; +struct TORCH_API SymIntType : public Type { + bool equals(const Type& rhs) const override { + return rhs.kind() == kind(); + } + std::string str() const override { + return "SymInt"; + } + std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + // TODO: will become a Union[SymbolicIntNode|int] in the near future + return "int"; + } + static const TypeKind Kind = TypeKind::SymIntType; + // global singleton + static SymIntTypePtr get(); + + private: + SymIntType() : Type(TypeKind::SymIntType) {} +}; + struct IntType; using IntTypePtr = SingletonTypePtr; // This type represents a Python int number @@ -1250,6 +1341,7 @@ struct TORCH_API IntType : public NumberType { private: IntType() : NumberType(TypeKind::IntType) {} std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "int"; } }; @@ -1284,6 +1376,7 @@ struct TORCH_API StringType : public Type { return annotation_str(); } std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "str"; } static const TypeKind Kind = TypeKind::StringType; @@ -1304,6 +1397,7 @@ struct TORCH_API StorageType : public Type { return annotation_str(); } std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return "Storage"; } static const TypeKind Kind = TypeKind::StorageType; @@ -1339,6 +1433,7 @@ struct TORCH_API FunctionType : public NamedType { private: FunctionType(torch::jit::Function* function); std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning const auto& n = name().value(); return n.qualifiedName(); } @@ -1686,6 +1781,13 @@ struct getTypePtr_ final { return IntType::get(); } }; + +template <> +struct getTypePtr_ 
final { + static decltype(auto) call() { + return SymIntType::get(); + } +}; template <> struct getTypePtr_ final { static decltype(auto) call() { @@ -1756,55 +1858,95 @@ struct getTypePtr_ final { template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per vector" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = ListType::get("vector", inner_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per ArrayRef" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = ListType::get("ArrayRef", inner_type); + return type; + } +}; +template <> +struct getTypePtr_ final { + static const auto& call() { + static auto type = ListType::create(getTypePtr_::call()); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per List" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = ListType::get("List", inner_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = ListType::create(getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per array" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + // (Concatenating the length onto the end of the string because we want a unique + // type_ptr created for every std::array type). + static auto type = ListType::get(std::string("array") + std::to_string(N), inner_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = - DictType::create(getTypePtr_::call(), getTypePtr_::call()); + static auto inner_key_type = getTypePtr_::call(); + static auto inner_val_type = getTypePtr_::call(); + // The "per unordered_map" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = DictType::get("unordered_map", inner_key_type, inner_val_type); return type; } }; template struct getTypePtr_> final { static const auto& call() { - static auto type = - DictType::create(getTypePtr_::call(), getTypePtr_::call()); + static auto inner_key_type = getTypePtr_::call(); + static auto inner_val_type = getTypePtr_::call(); + // The "per Dict" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. + static auto type = DictType::get("Dict", inner_key_type, inner_val_type); return type; } }; + template struct getTypePtr_> final { static const auto& call() { - static auto type = TypeFactory::create( - getTypePtr_::call()); + static auto inner_type = getTypePtr_::call(); + // The "per optional" static singleton needs to live in a .cpp file, + // otherwise we'll end up with one singleton instance per shared library. 
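The comment repeated through these specializations is the whole motivation: a function-local static inside a header template is materialized once per shared library that instantiates it, so the "singleton" TypePtr would not be unique process-wide; routing through a factory defined in a single .cpp (ListType::get, DictType::get, OptionalType::get) restores uniqueness. A reduced sketch of that pattern with a toy string-based registry, not the actual ListType implementation:

#include <map>
#include <memory>
#include <string>
#include <typeinfo>

// Would live in exactly one .cpp: one registry, hence one instance per
// identifier for the whole process, however many DSOs include the header.
// (A real registry would also need a mutex for thread safety.)
std::shared_ptr<std::string> getSingletonType(const std::string& identifier) {
  static std::map<std::string, std::shared_ptr<std::string>> registry;
  auto& slot = registry[identifier];
  if (!slot) slot = std::make_shared<std::string>(identifier);
  return slot;
}

// Would live in the header: the function-local static here is only a cache of
// the .cpp-owned singleton, so per-DSO copies of the cache all point at one object.
template <typename T>
const std::shared_ptr<std::string>& typePtrFor(const char* container) {
  static auto type =
      getSingletonType(std::string(container) + "<" + typeid(T).name() + ">");
  return type;
}

int main() {
  auto& a = typePtrFor<int>("List");
  auto& b = typePtrFor<int>("List");
  return a.get() == b.get() ? 0 : 1;  // same identifier and inner type, same instance
}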
+ static auto type = OptionalType::get(inner_type); return type; } }; + + +template<> +struct getTypePtr_ final { + static const auto& call() { + static auto type = OptionalType::create(getTypePtr_::call()); + return type; + } +}; + template struct getTypePtr_> final { static const auto& call() { @@ -1922,6 +2064,7 @@ struct TORCH_API InterfaceType : public NamedType { std::ostream* why_not); std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + (void)printer; // Suppress unused variable warning return name()->qualifiedName(); } @@ -1944,24 +2087,12 @@ bool equals(const Type& rhs) const override { EnumerationType() : Type(Kind) {} }; -struct LayoutType; -using LayoutTypePtr = SingletonTypePtr; -// This type represents a Generator -struct TORCH_API LayoutType : public EnumerationType { -std::string str() const override { -return "Layout"; -} -static const TypeKind Kind = TypeKind::LayoutType; -// global singleton -static LayoutTypePtr get(); - -private: -LayoutType() : EnumerationType() {} -}; +// WARNING: These enumeration types below DO NOT actually get parsed out +// from the logical schema strings, instead they are mapped as ints. To +// observe these types, use real_type() instead of type() on Argument struct ScalarTypeType; using ScalarTypeTypePtr = SingletonTypePtr; -// This type represents a Generator struct TORCH_API ScalarTypeType : public EnumerationType { std::string str() const override { return "ScalarType"; @@ -1974,6 +2105,34 @@ static ScalarTypeTypePtr get(); ScalarTypeType() : EnumerationType() {} }; +struct MemoryFormatType; +using MemoryFormatTypePtr = SingletonTypePtr; +struct TORCH_API MemoryFormatType : public EnumerationType { +std::string str() const override { +return "MemoryFormatType"; +} +static const TypeKind Kind = TypeKind::MemoryFormatType; +// global singleton +static MemoryFormatTypePtr get(); + +private: +MemoryFormatType() : EnumerationType() {} +}; + +struct LayoutType; +using LayoutTypePtr = SingletonTypePtr; +struct TORCH_API LayoutType : public EnumerationType { +std::string str() const override { +return "LayoutType"; +} +static const TypeKind Kind = TypeKind::LayoutType; +// global singleton +static LayoutTypePtr get(); + +private: +LayoutType() : EnumerationType() {} +}; + // the common supertype of all lists, // List[T] <: AnyList for all T struct AnyListType; diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 99ef1be1dd9b..2e1c84db867b 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include #include @@ -43,11 +45,13 @@ namespace c10 { _(CapsuleType) \ _(InterfaceType) \ _(QSchemeType) \ - _(LayoutType) \ _(ScalarTypeType) \ + _(LayoutType) \ + _(MemoryFormatType) \ _(AnyListType) \ _(AnyTupleType) \ _(AnyClassType) \ + _(SymIntType) \ _(UnionType) \ _(DynamicType) @@ -94,8 +98,9 @@ TORCH_DECLARE_SINGLETON(DeviceObjType); TORCH_DECLARE_SINGLETON(StreamObjType); TORCH_DECLARE_SINGLETON(CapsuleType); TORCH_DECLARE_SINGLETON(PyObjectType); -TORCH_DECLARE_SINGLETON(LayoutType); TORCH_DECLARE_SINGLETON(ScalarTypeType); +TORCH_DECLARE_SINGLETON(LayoutType); +TORCH_DECLARE_SINGLETON(MemoryFormatType); TORCH_DECLARE_SINGLETON(AnyListType); TORCH_DECLARE_SINGLETON(AnyTupleType); TORCH_DECLARE_SINGLETON(AnyClassType); @@ -140,7 +145,7 @@ struct TORCH_API Type { protected: Type(TypeKind kind) : kind_(kind) {} - virtual std::string annotation_str_impl(TypePrinter printer) const { + 
virtual std::string annotation_str_impl(TypePrinter /*printer*/) const { return str(); } // a == b @@ -567,7 +572,7 @@ struct TORCH_API Type { // per-type constructor, you only need to override this if the // containedTypes() is not empty virtual TypePtr createWithContained( - std::vector contained_types) const { + std::vector /*contained_types*/) const { AT_ERROR( "type with contained types did not overload createWithContained: ", str()); diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index ba16a5bf10c1..ba608e98ad53 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -235,6 +235,9 @@ Library& Library::_fallback(CppFunction&& f) & { // Note if dispatch_key is DispatchKey::Undefined, it'll be ignored here since Undefined // isn't a runtime key, you shouldn't register anything to it at all. for (auto k : c10::getRuntimeDispatchKeySet(*dispatch_key)) { + // mobile doesn't use all dispatch keys, so skip any fallback registrations for the unused keys. + auto idx = getDispatchTableIndexForDispatchKey(k); + if (idx < 0) continue; registrars_.emplace_back( c10::Dispatcher::singleton().registerFallback( k, diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h index 997fb937093b..6e77c5653881 100644 --- a/aten/src/ATen/core/op_registration/op_allowlist.h +++ b/aten/src/ATen/core/op_registration/op_allowlist.h @@ -185,7 +185,7 @@ constexpr bool op_allowlist_contains_name_in_schema(string_view allowlist, strin // and should be registered. When we turn this on, the list of valid // mobile dispatch keys is hard coded (but you need to make sure // that you have the correct set of dispatch keys for this). -constexpr bool dispatch_key_allowlist_check(DispatchKey k) { +constexpr bool dispatch_key_allowlist_check(DispatchKey /*k*/) { #ifdef C10_MOBILE return true; // Disabled for now: to be enabled later! diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index ba4c8052e372..05294c25548e 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -284,7 +284,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall EXPECT_FALSE(called_kernel1); EXPECT_TRUE(called_kernel2); - for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) { + // Test for out of tree lazy backends- ::Lazy key is now registered to TS backend in tree + for (c10::DispatchKey key : {c10::DispatchKey::XLA}) { std::string expectMessage = expectedMessageForBackend(key); expectThrows([&] { callOp(*op, dummyTensor(key)); @@ -591,7 +592,7 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) { void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) { auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options() - .kernel(c10::getAutogradKeyFromBackend(key)) + .kernel(c10::getAutogradKeyFromBackend(toBackendComponent(key))) .kernel(DispatchKey::Autograd)); auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""}); @@ -613,14 +614,13 @@ void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) { EXPECT_FALSE(called_nonautograd); } +// no longer test ::Lazy key here +// since it is now registered to TS backend in-tree and thus behaves differently, +// does not throw the expected 'could not run..' 
messages TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) { LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA); } -TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) { - LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy); -} - void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) { { auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options() @@ -670,6 +670,17 @@ TEST(OperatorRegistrationTest, whenRegisterWithLazyKernelAndCatchAll_AutogradLaz whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey::Lazy); } +TEST(OperatorRegistrationTest, whenregisteringwithinvalidoverloadname) { + expectThrows([] { + auto registrar = c10::RegisterOperators().op("_test::dummy.default", c10::RegisterOperators::options() + .kernel(DispatchKey::CPU, [] (const int64_t&) {})); + }, "default is not a legal overload name for aten operators"); + expectThrows([] { + auto registrar = c10::RegisterOperators().op("_test::dummy.__name__", c10::RegisterOperators::options() + .kernel(DispatchKey::CPU, [] (const int64_t&) {})); + }, "__name__ is not a legal overload name for aten operators"); +} + TEST(OperatorRegistrationTest, givenLambdaKernel_whenRegisteringWithMismatchingCppSignatures_thenFails) { expectThrows([] { auto registrar = c10::RegisterOperators().op("_test::dummy", c10::RegisterOperators::options() @@ -1243,6 +1254,16 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Dict(str, Dict(int, str)?[])[] a) -> Dict(str, Dict(int, str)?[])[]"); } +TEST(NewOperatorRegistrationTest, erroroutwithinvalidoverloadname) { + auto m = MAKE_TORCH_LIBRARY(_test); + expectThrows([&] { + m.def("dummy.default(Tensor self) -> Tensor"); + }, "default is not a legal overload name for aten operators"); + expectThrows([&] { + m.def("dummy.__name__(Tensor self) -> Tensor"); + }, "__name__ is not a legal overload name for aten operators"); +} + TEST(NewOperatorRegistrationTest, testBasics) { auto m = MAKE_TORCH_LIBRARY(_test); m.def("dummy(Tensor self) -> Tensor"); @@ -1770,22 +1791,22 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) { TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool sparsecpu_called, math_called = false; + bool fpga_called, math_called = false; auto m = MAKE_TORCH_LIBRARY(test); - m.def("fn", torch::dispatch(c10::DispatchKey::SparseCPU, [&](const Tensor& x) { sparsecpu_called = true; return x; })); + m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; })); m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); ASSERT_TRUE(op.has_value()); { - callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU)); - ASSERT_TRUE(sparsecpu_called); + callOp(*op, dummyTensor(c10::DispatchKey::FPGA)); + ASSERT_TRUE(fpga_called); } { expectThrows([&] { - callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU, /*requires_grad=*/true)); + callOp(*op, dummyTensor(c10::DispatchKey::FPGA, /*requires_grad=*/true)); }, "test::fn has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther."); } } @@ -1828,18 +1849,15 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } { - // TODO(#43908): currently this will fallthrough 
AutogradPrivateUse1 then call catchall kernel - // at AutogradCPU, while backend extenders are indeed expecting to call PrivateUse1 kernel. - // This confusing behavior is caused by we registering fallthrough as backend fallback for - // Autograd keys. Note users could always work around this by registering the same kernel to - // AutogradPrivateUse1 as shown below until we support it. auto op = Dispatcher::singleton().findOp({"test::fn", ""}); ASSERT_TRUE(op.has_value()); catchall_called = false; + privateuse1_called = false; callOp(*op, dummyTensor(c10::DispatchKey::PrivateUse1, /*requires_grad=*/true), dummyTensor(c10::DispatchKey::CPU, /*requires_grad=*/true)); - ASSERT_TRUE(catchall_called); + ASSERT_FALSE(catchall_called); + ASSERT_TRUE(privateuse1_called); } m.impl("fn", c10::DispatchKey::AutogradPrivateUse1, [&](const Tensor& x, const Tensor& y) { privateuse1_called = true; return x; }); @@ -1855,6 +1873,27 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } } +TEST(NewOperatorRegistrationTest, registerCompositeImplicitAutogradWithCPUKernel_andCallAutogradOtherKernel_callsComposite) { + bool math_called = false; + bool cpu_called = false; + auto m = MAKE_TORCH_LIBRARY(test); + m.def("fn(Tensor dummy) -> Tensor"); + m.impl("fn", c10::DispatchKey::CPU, [&](const Tensor& x) { cpu_called = true; return x; }); + m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); + + auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); + ASSERT_TRUE(op.has_value()); + + { + math_called = cpu_called = false; + // Meta should redispatch to the AutogradOther backend, + // which the composite kernel should be registered to. + callOp(*op, dummyTensor(c10::DispatchKey::Meta, /*requires_grad=*/true)); + ASSERT_TRUE(math_called); + ASSERT_FALSE(cpu_called); + } +} + TEST(NewOperatorRegistrationTest, dispatchMultiple) { bool cpu_called = false; bool cuda_called = false; diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h index 35bb9964eb39..1695e5995ab6 100644 --- a/aten/src/ATen/core/stack.h +++ b/aten/src/ATen/core/stack.h @@ -188,7 +188,7 @@ struct TuplePacker { template struct TuplePacker<0, Args...> { - static void execute(Stack& stack, std::tuple&& t){}; + static void execute(Stack& /*stack*/, std::tuple&& /*t*/){}; }; template diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index cb7b6cc27667..87972825d291 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -3,6 +3,40 @@ namespace c10 { +namespace { + +// The idea is to only mark possible overlap across dimensions. We want to +// return false for expanded tensors and permuted tensors, for which dimensional +// collapsing is safe. 
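// A minimal standalone sketch of the overlap heuristic described above, using
// plain std::vector sizes/strides for illustration (the ATen function that
// follows operates on c10::IntArrayRef): sort dimensions by ascending stride,
// then flag a possible overlap whenever a dimension of size > 1 has a stride
// smaller than the extent (size * stride) of the previous, smaller-stride
// dimension.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

bool may_overlap(const std::vector<int64_t>& sizes,
                 const std::vector<int64_t>& strides) {
  std::vector<size_t> idx(sizes.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(),
            [&](size_t a, size_t b) { return strides[a] < strides[b]; });
  for (size_t i = 1; i < idx.size(); ++i) {
    if (sizes[idx[i]] != 1 &&
        strides[idx[i]] < sizes[idx[i - 1]] * strides[idx[i - 1]]) {
      return true;  // two dimensions may address the same memory
    }
  }
  return false;
}

int main() {
  std::cout << may_overlap({2, 3}, {3, 1}) << "\n";  // 0: contiguous 2x3
  std::cout << may_overlap({4, 3}, {0, 1}) << "\n";  // 0: expanded dim (stride 0) is safe
  std::cout << may_overlap({2, 3}, {1, 1}) << "\n";  // 1: rows alias each other
}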
+bool possible_cross_dimension_overlap(c10::IntArrayRef sizes, c10::IntArrayRef strides) { + int n_dim = static_cast(sizes.size()); + std::vector stride_indices(n_dim); + std::iota(stride_indices.rbegin(), stride_indices.rend(), 0); + + // sort indices going with ascending strides + for (int i = 1; i < n_dim; i++) { + auto c = i; + for (int j = i - 1; j >= 0; j--) { + if (strides[stride_indices[j]] > strides[stride_indices[c]]) { + std::swap(stride_indices[j], stride_indices[c]); + c = j; + } + } + } + + for (const auto i : c10::irange(1, n_dim)) { + if (i != 0) { + // we are being conservative on checking for memory overlap + if (sizes[stride_indices[i]] != 1 && strides[stride_indices[i]] < sizes[stride_indices[i-1]] * strides[stride_indices[i-1]]) { + return true; + } + } + } + return false; +} + +} + const TensorTypePtr& TensorType::get() { static auto value = TensorType::create( {}, {}, SymbolicShape(), VaryingShape{}, {}); @@ -115,6 +149,10 @@ VaryingShape TensorType::computeStrideProps( bool tensor_contiguity) { int n_dim = static_cast(sizes.size()); std::vector stride_indices(n_dim); + // default has_overlap to false as we only compute overlap when: + // 1. input sizes/strides fails format check; + // 2. tensor_contiguity are not set. + bool has_overlap = false; // Sorting strides in ascending order // Example: @@ -155,7 +193,7 @@ VaryingShape TensorType::computeStrideProps( } else if (strides[a] > strides[b]) { return 1; } else { // strides[a] == strides[b] - if (sizes[a] < sizes[b] || a > b ) { + if (sizes[a] > sizes[b]) { return 1; } } @@ -173,21 +211,35 @@ VaryingShape TensorType::computeStrideProps( } } } + // conveniently is_contiguous_strides/is_contiguous_strides only returns + // true when there's no memory overlap, so we only re-compute has_overlap + // in the last branch when both returns false + if (!tensor_contiguity) { + // trust tensor_contiguity and only computes overlap when it is not set + has_overlap = possible_cross_dimension_overlap(sizes, strides); + } } std::vector stride_properties; + + for (size_t i = 0; i < stride_indices.size(); i++) { bool contiguous_ = tensor_contiguity; if (!contiguous_) { - // innermost stride expected to be 1 - // TODO: turn contiguous_ into an enum CONTIGUOUS, NONCONTIGUOUS, - // BROADCASTED - if (i == 0) { - contiguous_ = strides[stride_indices[i]] == 1; + if (!has_overlap) { + // innermost stride expected to be 1 + // TODO: turn contiguous_ into an enum CONTIGUOUS, NONCONTIGUOUS, + // BROADCASTED + if (i == 0) { + contiguous_ = strides[stride_indices[i]] == 1; + } else { + contiguous_ = strides[stride_indices[i]] == 1 || + (strides[stride_indices[i]] != 0 && + strides[stride_indices[i]] == + strides[stride_indices[i - 1]] * sizes[stride_indices[i - 1]]); + } } else { - contiguous_ = strides[stride_indices[i]] == 1 || - (strides[stride_indices[i]] != 0 && - strides[stride_indices[i]] == - strides[stride_indices[i - 1]] * sizes[stride_indices[i - 1]]); + // leaving this assign statement for readability; + contiguous_ = false; } } stride_properties.emplace_back(stride_indices[i], contiguous_, strides[stride_indices[i]]); @@ -201,7 +253,7 @@ TensorTypePtr TensorType::create(const at::Tensor& t) { VaryingShape stride_indices; VaryingShape strides; VaryingShape sizes; - if (!t.is_mkldnn() && !t.is_sparse() && !t.is_sparse_csr()) { + if (t.layout() == at::kStrided) { sizes = VaryingShape{t.sizes().vec()}; strides = VaryingShape{t.strides().vec()}; return TensorType::create( diff --git a/aten/src/ATen/core/type.cpp 
b/aten/src/ATen/core/type.cpp index a3f0451dc61c..00e4ceffc156 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -11,6 +11,28 @@ #include #include +namespace std { +template<> +struct hash> { + size_t operator()(std::tuple const& t) const { + // This hashing is all hidden behind a static initializer so it + // doesn't have to be optimal + auto hash = std::hash()(std::get<0>(t)); + hash = at::hash_combine(hash, std::hash()(std::get<1>(t))); + hash = at::hash_combine(hash, std::hash()(std::get<2>(t))); + return hash; + } +}; +template<> +struct hash> { + size_t operator()(std::tuple const& t) const { + auto hash = std::hash()(std::get<0>(t)); + hash = at::hash_combine(hash, std::hash()(std::get<1>(t))); + return hash; + } +}; +} // namespace std + namespace c10 { static_assert( @@ -208,6 +230,10 @@ LayoutTypePtr LayoutType::get() { static LayoutTypePtr value(new LayoutType()); return value; } +MemoryFormatTypePtr MemoryFormatType::get() { +static MemoryFormatTypePtr value(new MemoryFormatType()); +return value; +} PyObjectTypePtr PyObjectType::get() { static PyObjectTypePtr value(new PyObjectType()); return value; @@ -237,6 +263,47 @@ ListTypePtr ListType::ofStrings() { return value; } +TypePtr OptionalType::get(TypePtr inner) { + static ska::flat_hash_map containerTypePtrs; + static std::mutex mutex; + // Perf from the lock is ok because this function is guarded behind + // a static initializer; it should only be called once per type. + std::lock_guard lock(mutex); + if (containerTypePtrs.find(inner) == containerTypePtrs.end()) { + TypePtr t = TypeFactory::create(inner); + containerTypePtrs.emplace(inner, std::move(t)); + } + return containerTypePtrs[inner]; +} + +TypePtr ListType::get(std::string identifier, TypePtr inner) { + static ska::flat_hash_map, TypePtr> containerTypePtrs; + static std::mutex mutex; + // Perf from the lock is ok because this function is guarded behind + // a static initializer; it should only be called once per type. + auto key = std::make_tuple(identifier, inner); + std::lock_guard lock(mutex); + if (containerTypePtrs.find(key) == containerTypePtrs.end()) { + TypePtr t = ListType::create(inner); + containerTypePtrs.emplace(key, std::move(t)); + } + return containerTypePtrs[key]; +} + +TypePtr DictType::get(std::string identifier, TypePtr key, TypePtr value) { + static ska::flat_hash_map, TypePtr> containerTypePtrs; + static std::mutex mutex; + // Perf from the lock is ok because this function is guarded behind + // a static initializer; it should only be called once per type. 
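// The ListType::get / DictType::get / OptionalType::get helpers above all
// follow the same shape. A generic sketch of that memoized-lookup pattern,
// with std::unordered_map and std::shared_ptr standing in for the real
// ska::flat_hash_map and c10::TypePtr:
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct Type {};                        // stand-in for c10::Type
using TypePtr = std::shared_ptr<Type>;

TypePtr get_cached_list_type(const std::string& identifier, const TypePtr& inner) {
  static std::unordered_map<std::string, TypePtr> cache;
  static std::mutex mutex;
  // Key on the container identifier plus the element type so that, e.g.,
  // std::vector<T> and c10::List<T> each get their own cached instance.
  const std::string key =
      identifier + "#" + std::to_string(reinterpret_cast<std::uintptr_t>(inner.get()));
  std::lock_guard<std::mutex> lock(mutex);
  auto it = cache.find(key);
  if (it == cache.end()) {
    // In the real code this is ListType::create(inner) / DictType::create(k, v).
    it = cache.emplace(key, std::make_shared<Type>()).first;
  }
  return it->second;                   // every caller sees the same instance
}

int main() {
  auto inner = std::make_shared<Type>();
  // Both lookups return the same cached pointer.
  return get_cached_list_type("vector", inner) == get_cached_list_type("vector", inner) ? 0 : 1;
}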
+ auto map_key = std::make_tuple(identifier, key, value); + std::lock_guard lock(mutex); + if (containerTypePtrs.find(map_key) == containerTypePtrs.end()) { + TypePtr t = DictType::create(key, value); + containerTypePtrs.emplace(map_key, std::move(t)); + } + return containerTypePtrs[map_key]; +} + AnyListTypePtr AnyListType::get() { static AnyListTypePtr value(new AnyListType()); return value; @@ -257,6 +324,11 @@ AnyEnumTypePtr AnyEnumType::get() { return value; } +SymIntTypePtr SymIntType::get() { + static SymIntTypePtr value(new SymIntType()); + return value; +} + c10::optional unifyTypesImpl(const TypePtr& t1, const TypePtr& t2, bool default_to_union=false, TypePtr type_hint=nullptr) { // check direct subtyping relation if (t1->isSubtypeOf(*t2)) { diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index eb160577e869..44d39028b990 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -8,7 +8,7 @@ namespace at { namespace vec { -// TODO: Make this more efficient +// slow path template inline scalar_t vec_reduce_all( const Op& vec_fun, @@ -27,6 +27,62 @@ inline scalar_t vec_reduce_all( return acc_arr[0]; } +template +struct VecReduceAllSIMD { + static inline scalar_t apply(const Op& vec_fun, Vectorized acc_vec) { + return vec_reduce_all(vec_fun, acc_vec, Vectorized::size()); + } +}; + +#if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) +#if defined(CPU_CAPABILITY_AVX2) +template +struct VecReduceAllSIMD { + static inline float apply(const Op& vec_fun, Vectorized acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + // 128-bit shuffle + Vec v1 = _mm256_permute2f128_ps(v, v, 0x1); + v = vec_fun(v, v1); + // 64-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0x4E); + v = vec_fun(v, v1); + // 32-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0xB1); + v = vec_fun(v, v1); + return _mm256_cvtss_f32(v); + } +}; +#endif // defined(CPU_CAPABILITY_AVX2) +#if defined(CPU_CAPABILITY_AVX512) +template +struct VecReduceAllSIMD { + static inline float apply(const Op& vec_fun, Vectorized acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + // 256-bit shuffle + Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E); + v = vec_fun(v, v1); + // 128-bit shuffle + v1 = _mm512_shuffle_f32x4(v, v, 0xB1); + v = vec_fun(v, v1); + // 64-bit shuffle + v1 = _mm512_shuffle_ps(v, v, 0x4E); + v = vec_fun(v, v1); + // 32-bit shuffle + v1 = _mm512_shuffle_ps(v, v, 0xB1); + v = vec_fun(v, v1); + return _mm512_cvtss_f32(v); + } +}; +#endif // defined(CPU_CAPABILITY_AVX512) +#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) + +template +inline scalar_t vec_reduce_all(const Op& vec_fun, Vectorized acc_vec) { + return VecReduceAllSIMD::apply(vec_fun, acc_vec); +} + template inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { using Vec = vec::Vectorized; @@ -42,7 +98,7 @@ inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size Vec data_vec = Vec::loadu(data + d, size - d); acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(vec_fun, acc_vec, Vec::size()); + return vec_reduce_all(vec_fun, acc_vec); } // similar to reduce_all, but reduces into two outputs @@ -70,8 +126,8 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d); } return std::pair( - vec_reduce_all(vec_fun1, acc_vec1, Vec::size()), - 
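// The AVX2/AVX512 VecReduceAllSIMD specializations above implement a
// log2-step horizontal reduction with lane shuffles. The same combining
// pattern in scalar form, for illustration only -- the real code folds full
// Vectorized<float> registers with _mm256/_mm512 shuffle intrinsics:
#include <algorithm>
#include <array>
#include <iostream>

template <typename Op>
float reduce_all_8(std::array<float, 8> v, const Op& op) {
  // 8 -> 4 -> 2 -> 1 active lanes, mirroring the 128-, 64- and 32-bit shuffles.
  for (int half = 4; half >= 1; half /= 2) {
    for (int i = 0; i < half; ++i) {
      v[i] = op(v[i], v[i + half]);
    }
  }
  return v[0];
}

int main() {
  std::array<float, 8> lanes = {1, 7, 3, 9, 5, 2, 8, 4};
  std::cout << reduce_all_8(lanes, [](float a, float b) { return a + b; }) << "\n";          // 39
  std::cout << reduce_all_8(lanes, [](float a, float b) { return std::max(a, b); }) << "\n"; // 9
}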
vec_reduce_all(vec_fun2, acc_vec2, Vec::size())); + vec_reduce_all(vec_fun1, acc_vec1), + vec_reduce_all(vec_fun2, acc_vec2)); } template @@ -95,7 +151,7 @@ inline scalar_t map_reduce_all( data_vec = map_fun(data_vec); acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(red_fun, acc_vec, Vec::size()); + return vec_reduce_all(red_fun, acc_vec); } template @@ -126,7 +182,7 @@ inline scalar_t map2_reduce_all( data_vec = map_fun(data_vec, data2_vec); acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(red_fun, acc_vec, Vec::size()); + return vec_reduce_all(red_fun, acc_vec); } template @@ -162,7 +218,7 @@ inline scalar_t map3_reduce_all( data_vec = map_fun(data_vec, data2_vec, data3_vec); acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); } - return vec_reduce_all(red_fun, acc_vec, Vec::size()); + return vec_reduce_all(red_fun, acc_vec); } template diff --git a/aten/src/ATen/cpu/vec/functional_bfloat16.h b/aten/src/ATen/cpu/vec/functional_bfloat16.h index 9efa7004090b..acb77ccaa491 100644 --- a/aten/src/ATen/cpu/vec/functional_bfloat16.h +++ b/aten/src/ATen/cpu/vec/functional_bfloat16.h @@ -75,7 +75,7 @@ inline BFloat16 reduce_all(const Op& vec_fun, const BFloat16* data, int64_t size } } acc_fvec0 = vec_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(vec_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(vec_fun, acc_fvec0); } template @@ -131,8 +131,8 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& acc1_fvec0 = vec_fun1(acc1_fvec0, acc1_fvec1); acc2_fvec0 = vec_fun2(acc2_fvec0, acc2_fvec1); return std::pair( - vec_reduce_all(vec_fun1, acc1_fvec0, fVec::size()), - vec_reduce_all(vec_fun2, acc2_fvec0, fVec::size())); + vec_reduce_all(vec_fun1, acc1_fvec0), + vec_reduce_all(vec_fun2, acc2_fvec0)); } template @@ -187,7 +187,7 @@ inline BFloat16 map_reduce_all( } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(red_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(red_fun, acc_fvec0); } template @@ -255,7 +255,7 @@ inline BFloat16 map2_reduce_all( } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(red_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(red_fun, acc_fvec0); } template @@ -336,7 +336,7 @@ inline BFloat16 map3_reduce_all( } } acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); - return vec_reduce_all(red_fun, acc_fvec0, fVec::size()); + return vec_reduce_all(red_fun, acc_fvec0); } template diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h index c64e3e589905..83060f686051 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -698,6 +698,23 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { } } +template <> +inline void convert(const BFloat16* src, float* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { + auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i))); + __m256 o1, o2; + cvtbf16_fp32(vsrc, o1, o2); + _mm256_storeu_ps(dst + i, o1); + _mm256_storeu_ps(dst + i + Vectorized::size(), o2); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + template <> Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index 
24c25c96137b..487233bc3c40 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -214,7 +214,7 @@ template <> class Vectorized> { return _mm256_sub_pd(pi_2, asin()); } Vectorized> atan() const; - Vectorized> atan2(const Vectorized> &b) const { + Vectorized> atan2(const Vectorized>&) const { AT_ERROR("not supported for complex numbers"); } Vectorized> erf() const { @@ -255,20 +255,20 @@ template <> class Vectorized> { Vectorized> floor() const { return _mm256_floor_pd(values); } - Vectorized> hypot(const Vectorized> &b) const { + Vectorized> hypot(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igamma(const Vectorized> &x) const { + Vectorized> igamma(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igammac(const Vectorized> &x) const { + Vectorized> igammac(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } Vectorized> neg() const { auto zero = _mm256_setzero_pd(); return _mm256_sub_pd(zero, values); } - Vectorized> nextafter(const Vectorized> &b) const { + Vectorized> nextafter(const Vectorized> &) const { AT_ERROR("not supported for complex numbers"); } Vectorized> round() const { @@ -309,31 +309,31 @@ template <> class Vectorized> { Vectorized> operator!=(const Vectorized>& other) const { return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); } - Vectorized> operator<(const Vectorized>& other) const { + Vectorized> operator<(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator<=(const Vectorized>& other) const { + Vectorized> operator<=(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>(const Vectorized>& other) const { + Vectorized> operator>(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>=(const Vectorized>& other) const { + Vectorized> operator>=(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> eq(const Vectorized>& other) const; Vectorized> ne(const Vectorized>& other) const; - Vectorized> lt(const Vectorized>& other) const { + Vectorized> lt(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> le(const Vectorized>& other) const { + Vectorized> le(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> gt(const Vectorized>& other) const { + Vectorized> gt(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> ge(const Vectorized>& other) const { + Vectorized> ge(const Vectorized>&) const { TORCH_CHECK(false, "not supported for complex numbers"); } }; diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index f917eb02da56..4093022a7e34 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -248,7 +248,7 @@ template <> class Vectorized> { return map(std::acos); } Vectorized> atan() const; - Vectorized> atan2(const Vectorized> &b) const { + Vectorized> atan2(const Vectorized>& /*b*/) const { AT_ERROR("not supported for complex numbers"); } Vectorized> erf() const { @@ -289,20 +289,20 @@ template <> class Vectorized> { Vectorized> floor() const { return _mm256_floor_ps(values); } - Vectorized> 
hypot(const Vectorized> &b) const { + Vectorized> hypot(const Vectorized>& /*b*/) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igamma(const Vectorized> &x) const { + Vectorized> igamma(const Vectorized>& /*x*/) const { AT_ERROR("not supported for complex numbers"); } - Vectorized> igammac(const Vectorized> &x) const { + Vectorized> igammac(const Vectorized>& /*x*/) const { AT_ERROR("not supported for complex numbers"); } Vectorized> neg() const { auto zero = _mm256_setzero_ps(); return _mm256_sub_ps(zero, values); } - Vectorized> nextafter(const Vectorized> &b) const { + Vectorized> nextafter(const Vectorized>& /*b*/) const { AT_ERROR("not supported for complex numbers"); } Vectorized> round() const { @@ -343,31 +343,31 @@ template <> class Vectorized> { Vectorized> operator!=(const Vectorized>& other) const { return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ); } - Vectorized> operator<(const Vectorized>& other) const { + Vectorized> operator<(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator<=(const Vectorized>& other) const { + Vectorized> operator<=(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>(const Vectorized>& other) const { + Vectorized> operator>(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> operator>=(const Vectorized>& other) const { + Vectorized> operator>=(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> eq(const Vectorized>& other) const; Vectorized> ne(const Vectorized>& other) const; - Vectorized> lt(const Vectorized>& other) const { + Vectorized> lt(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> le(const Vectorized>& other) const { + Vectorized> le(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> gt(const Vectorized>& other) const { + Vectorized> gt(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } - Vectorized> ge(const Vectorized>& other) const { + Vectorized> ge(const Vectorized>& /*other*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } }; diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index bba32942cc3a..6a1b84fc39a9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -65,10 +65,10 @@ __m256i pack_saturate_and_clamp( template <> inline __m256i pack_saturate_and_clamp( - __m256i first, - __m256i second, - int32_t min_val, - int32_t max_val) { + __m256i /*first*/, + __m256i /*second*/, + int32_t /*min_val*/, + int32_t /*max_val*/) { // This function is for linkage only, will not be used AT_ERROR("pack_saturate_and_clamp is not supported"); } @@ -259,7 +259,7 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, - Vectorized zero_point, + Vectorized /*zero_point*/, Vectorized scale_zp_premul) const { __m256 float_vals = _mm256_cvtepi32_ps(vals); return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)}; @@ -269,7 +269,7 @@ struct Vectorized : public Vectorizedqi { const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { Vectorized retval; auto rhs_data = 
(__m256)rhs[0]; at::native::quantize_vec( @@ -442,7 +442,7 @@ struct Vectorized : public Vectorizedqi { public: float_vec_return_type dequantize( Vectorized scale, - Vectorized zero_point, + Vectorized /*zero_point*/, Vectorized scale_neg_zp_premul) const { __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); @@ -467,7 +467,7 @@ struct Vectorized : public Vectorizedqi { static Vectorized quantize( const float_vec_return_type& rhs, - float scale, + float /*scale*/, int32_t zero_point, float inverse_scale) { auto* rhs_data = (float*)rhs.data(); @@ -605,7 +605,7 @@ struct Vectorized : public Vectorizedqi { public: float_vec_return_type dequantize( Vectorized scale, - Vectorized zero_point, + Vectorized /*zero_point*/, Vectorized scale_zp_premul) const { __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0)); __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1)); @@ -630,7 +630,7 @@ struct Vectorized : public Vectorizedqi { static Vectorized quantize( const float_vec_return_type& rhs, - float scale, + float /*scale*/, int32_t zero_point, float inverse_scale) { auto* rhs_data = (float*)rhs.data(); @@ -763,7 +763,7 @@ struct VectorizedQuantizedConverter { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point, - Vectorized scale_zp_premul) const { + Vectorized /*scale_zp_premul*/) const { float_vec_return_type rv; for (const auto i : c10::irange(float_num_vecs())) { float tmp_vals[8]; @@ -820,7 +820,7 @@ struct Vectorized : public VectorizedQuantizedConverter< const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { std::array qvals; std::array float_vals; @@ -952,7 +952,7 @@ struct Vectorized : public VectorizedQuantizedConverter< const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { std::array qvals; std::array float_vals; @@ -1072,7 +1072,7 @@ struct Vectorized : public VectorizedQuantizedConverter< const float_vec_return_type& rhs, float scale, int32_t zero_point, - float inverse_scale) { + float /*inverse_scale*/) { std::array qvals; std::array float_vals; diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index c690682a4aa4..c0b34252b50b 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -800,6 +800,23 @@ inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) { } } +template <> +inline void convert(const BFloat16* src, float* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { + auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i))); + __m512 o1, o2; + cvtbf16_fp32(vsrc, o1, o2); + _mm512_storeu_ps(dst + i, o1); + _mm512_storeu_ps(dst + i + Vectorized::size(), o2); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + template <> Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index 407cbbd7a392..3bf1010efd68 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -14,6 +14,7 @@ // See https://github.com/pytorch/pytorch/issues/37577 for an instance // of this bug in the past. 
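// The convert(const BFloat16*, float*, n) specializations above follow the
// usual SIMD conversion shape: a main loop over full vector widths plus a
// scalar tail for the remainder. A scalar mock-up of that loop structure,
// where kWidth merely stands in for the vector width and no intrinsics are used:
#include <cstdint>
#include <iostream>
#include <vector>

void convert_widened(const float* src, double* dst, std::int64_t n) {
  constexpr std::int64_t kWidth = 8;          // placeholder for Vectorized<...>::size()
  std::int64_t i = 0;
  for (; i <= n - kWidth; i += kWidth) {      // full-width chunks
    for (std::int64_t j = 0; j < kWidth; ++j) {
      dst[i + j] = static_cast<double>(src[i + j]);  // one SIMD load/convert/store in the real code
    }
  }
  for (; i < n; ++i) {                        // scalar tail for the last n % kWidth elements
    dst[i] = static_cast<double>(src[i]);
  }
}

int main() {
  std::vector<float> src(13, 1.5f);
  std::vector<double> dst(13, 0.0);
  convert_widened(src.data(), dst.data(), static_cast<std::int64_t>(src.size()));
  std::cout << dst[12] << "\n";  // 1.5 -- the tail elements are converted too
}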
+#include #include #include #include @@ -133,7 +134,7 @@ struct Vectorized { static constexpr size_type size() { return VECTOR_WIDTH / sizeof(T); } - Vectorized() : values{0} {} + Vectorized() : values{static_cast(0)} {} Vectorized(T val) { for (int i = 0; i != size(); i++) { values[i] = val; @@ -537,7 +538,7 @@ struct Vectorized { // 1 if the pred is true, otherwise 0. Vectorized vector; for (int i = 0; i != size(); ++ i) { - vector[i] = bool(op(values[i], other.values[i])); + vector[i] = static_cast(op(values[i], other.values[i])); } return vector; } diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h index 92bf85ad2d2c..d8d0a1544ccd 100644 --- a/aten/src/ATen/cpu/vml.h +++ b/aten/src/ATen/cpu/vml.h @@ -12,7 +12,7 @@ // It implements various functions with a simple interface // For example it enables the user to call vsin(float* out, const float* in, // size) This functions takes a pointer to a contious output array of floats and -// a constant input array. It will then apply sin to each value in in the input +// a constant input array. It will then apply sin to each value in the input // array and write the result into the output array. out and in may point to the // same memory, i.e. this fully supports in-place operations. These functions // also implement their own parallelization, so take precautions when calling diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh index cd002414687a..1189cc05de12 100644 --- a/aten/src/ATen/cuda/Atomic.cuh +++ b/aten/src/ATen/cuda/Atomic.cuh @@ -4,6 +4,8 @@ #include #include +#include + template struct AtomicFPOp; @@ -298,7 +300,7 @@ static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BF static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } /* Special case fp32 atomic. */ -#if defined(USE_ROCM) && defined(__gfx908__) +#if defined(USE_ROCM) static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); } #else static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } @@ -344,3 +346,109 @@ inline __device__ float gpuAtomicMul (float * address, float val) { return __int_as_float(old); } + +// Atomic maximum implementation. + +template +__host__ __device__ T safe_max(T a, T b) { + #if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); + #else + T max = at::_isnan(b) ? b : std::max(a, b); + #endif + + return max; +} + +inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_max(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_max(bsum, val); + }); +} + +inline __device__ double gpuAtomicMax(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_max(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. 
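// Host-side illustration of the safe_max semantics above (non-HIP branch):
// plain std::max drops a NaN passed as `b` because NaN comparisons are false,
// so the helper tests for NaN explicitly and propagates it instead.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

float safe_max_sketch(float a, float b) {
  return std::isnan(b) ? b : std::max(a, b);
}

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::cout << std::max(1.0f, nan) << "\n";        // 1   -- the NaN is silently lost
  std::cout << safe_max_sketch(1.0f, nan) << "\n"; // nan -- the NaN propagates
}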
+inline __device__ float gpuAtomicMax(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_max(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} + +// Atomic minimum implementation. + +template +__host__ __device__ T safe_min(T a, T b) { + #if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); + #else + T min = at::_isnan(b) ? b : std::min(a, b); + #endif + + return min; +} + +inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_min(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_min(bsum, val); + }); +} + +inline __device__ double gpuAtomicMin(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_min(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMin(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_min(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 44e24ab52b99..6a8ca194397d 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -378,12 +378,14 @@ kernelPointwiseApply2(detail::TensorInfo a, template -inline bool CUDA_tensor_apply2(at::Tensor a, - at::Tensor b, +inline bool CUDA_tensor_apply2(at::TensorBase a, + at::TensorBase b, const Op op, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly) { - checkDeviceType("CUDA_tensor_apply2", {a, b}, DeviceType::CUDA); + TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(), + "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got " + "tensors with type ", a.device().type(), " and ", b.device().type()); int64_t totalElements = a.numel(); if (totalElements != b.numel()) { @@ -413,8 +415,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a, This ensures that each element of the tensor is operated on once and only once. 
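// A portable CPU-side sketch of the compare-and-swap retry loop used by the
// float gpuAtomicMax/gpuAtomicMin overloads above: read the current value,
// compute the candidate, and retry until no other thread has changed the slot.
// The CUDA code does the same thing with atomicCAS on the float's integer bits.
#include <algorithm>
#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

void atomic_max(std::atomic<float>& slot, float val) {
  float cur = slot.load();
  while (std::max(cur, val) != cur &&
         !slot.compare_exchange_weak(cur, std::max(cur, val))) {
    // compare_exchange_weak refreshes `cur` on failure; loop and try again.
  }
}

int main() {
  std::atomic<float> best{0.0f};
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&best, t] {
      for (int i = 0; i < 1000; ++i) {
        atomic_max(best, static_cast<float>(t * 1000 + i));
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  std::cout << best.load() << "\n";  // 3999
}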
*/ - Tensor oldA; - Tensor oldB; + TensorBase oldA; + TensorBase oldB; if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { // Must perform in contiguous space @@ -524,8 +526,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a, template -inline bool CUDA_tensor_apply2(at::Tensor a, - at::Tensor b, +inline bool CUDA_tensor_apply2(const at::TensorBase &a, + const at::TensorBase &b, const Op op, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly) { diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 34b0214a5614..e99017289d68 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -2,10 +2,23 @@ Provides the implementations of CUDA BLAS function templates. */ +#include #include #include -#include +#include #include +#include + +// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also +// added bf16 support +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +#include +#endif + +#ifdef USE_ROCM +#define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +#define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +#endif #define CUDABLAS_POSINT_CHECK(FD, X) \ TORCH_CHECK( \ @@ -97,42 +110,6 @@ namespace at { namespace cuda { namespace blas { -C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { - if (error == CUBLAS_STATUS_SUCCESS) { - return "CUBLAS_STATUS_SUCCESS"; - } - if (error == CUBLAS_STATUS_NOT_INITIALIZED) { - return "CUBLAS_STATUS_NOT_INITIALIZED"; - } - if (error == CUBLAS_STATUS_ALLOC_FAILED) { - return "CUBLAS_STATUS_ALLOC_FAILED"; - } - if (error == CUBLAS_STATUS_INVALID_VALUE) { - return "CUBLAS_STATUS_INVALID_VALUE"; - } - if (error == CUBLAS_STATUS_ARCH_MISMATCH) { - return "CUBLAS_STATUS_ARCH_MISMATCH"; - } - if (error == CUBLAS_STATUS_MAPPING_ERROR) { - return "CUBLAS_STATUS_MAPPING_ERROR"; - } - if (error == CUBLAS_STATUS_EXECUTION_FAILED) { - return "CUBLAS_STATUS_EXECUTION_FAILED"; - } - if (error == CUBLAS_STATUS_INTERNAL_ERROR) { - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - if (error == CUBLAS_STATUS_NOT_SUPPORTED) { - return "CUBLAS_STATUS_NOT_SUPPORTED"; - } -#ifdef CUBLAS_STATUS_LICENSE_ERROR - if (error == CUBLAS_STATUS_LICENSE_ERROR) { - return "CUBLAS_STATUS_LICENSE_ERROR"; - } -#endif - return ""; -} - /* LEVEL 3 BLAS FUNCTIONS */ #ifndef USE_ROCM @@ -274,13 +251,17 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { float falpha = alpha; float fbeta = beta; #ifdef USE_ROCM + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, rocblas_datatype_f16_r, (int)lda, stridea, b, rocblas_datatype_f16_r, (int)ldb, strideb, (void*)&fbeta, c, rocblas_datatype_f16_r, (int)ldc, stridec, c, rocblas_datatype_f16_r, (int)ldc, stridec, (int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, - 0, 0)); + 0, flag)); #else #if defined(CUDA_VERSION) && CUDA_VERSION < 11000 // On CUDA versions prior to 11, users are required to set the math mode to CUBLAS_TENSOR_OP_MATH @@ -420,6 +401,10 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); GEMM_CHECK_ARGVALUES(at::Half); #ifdef USE_ROCM + int flag = 0; +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +#endif TORCH_CUDABLAS_CHECK(rocblas_gemm_ex( handle, opa, @@ -444,7 +429,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { rocblas_datatype_f32_r, rocblas_gemm_algo_standard, 0, - 0)); + flag)); #else cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major >= 5) { @@ -576,6 +561,270 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } #endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + +namespace { +// Following the pattern of CuSparseDescriptor +// Defined here for now because this is the only place cublas_lt interface is +// used but can be moved to a header once cublas_lt interface is used in +// multiple places. +template +struct CuBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class CuBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< + cublasLtMatmulDescOpaque_t, + &cublasLtMatmulDescDestroy> { + public: + CuBlasLtMatmulDescriptor( + cublasComputeType_t compute_type, + cudaDataType_t scale_type) { + cublasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } +}; + +class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< + cublasLtMatrixLayoutOpaque_t, + &cublasLtMatrixLayoutDestroy> { + public: + CuBlasLtMatrixLayout( + cudaDataType_t type, + uint64_t rows, + uint64_t cols, + int64_t ld) { + cublasLtMatrixLayout_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatrixLayoutCreate(&raw_descriptor, type, rows, cols, ld)); + descriptor_.reset(raw_descriptor); + } +}; + +class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + cublasLtMatmulPreferenceOpaque_t, + &cublasLtMatmulPreferenceDestroy> { + public: + CuBlasLtMatmulPreference() { + cublasLtMatmulPreference_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor)); + descriptor_.reset(raw_descriptor); + } +}; +} // namespace + +template +void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const Dtype* mat1_ptr, + int64_t mat1_ld, + const Dtype* mat2_ptr, + int64_t mat2_ld, + const Dtype* bias, + Dtype* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue 
activation) { + using opmath_t = at::opmath_type; + opmath_t beta_val = 0; // bias is added in epilogue + + cudaDataType_t abcType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + if (std::is_same::value) { + abcType = CUDA_R_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_R_64F; + } else if (std::is_same::value) { + if (at::globalContext().allowTF32CuBLAS()) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } + abcType = CUDA_R_32F; + } else if (std::is_same::value) { + abcType = CUDA_R_16F; + } else if (std::is_same::value) { + abcType = CUDA_R_16BF; + } + + CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); + cublasOperation_t transa = transpose_mat1 ? CUBLAS_OP_T : CUBLAS_OP_N; + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_TRANSA, + &transa, + sizeof(transa))); + cublasOperation_t transb = transpose_mat2 ? CUBLAS_OP_T : CUBLAS_OP_N; + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_TRANSB, + &transb, + sizeof(transb))); + cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; + if (activation == GEMMAndBiasActivationEpilogue::RELU) { + epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { +#if CUDA_VERSION >= 11040 + epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; +#endif + } + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_EPILOGUE, + &epilogue, + sizeof(epilogue))); + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias, + sizeof(Dtype*))); + + CuBlasLtMatrixLayout Adesc( + abcType, transpose_mat1 ? k : m, transpose_mat1 ? m : k, mat1_ld); + CuBlasLtMatrixLayout Bdesc( + abcType, transpose_mat2 ? n : k, transpose_mat2 ? k : n, mat2_ld); + CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld); + + CuBlasLtMatmulPreference preference; + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind + // setting this to 1M. 
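// The CuBlasLtDeleter/CuBlasLtDescriptor helpers above are an instance of the
// usual RAII wrapper for C-style handle APIs. A self-contained sketch with a
// made-up API (foo_handle_t / foo_create / foo_destroy are placeholders, not a
// real library): baking the destroy function into the deleter type means the
// handle is released exactly once, on every exit path.
#include <memory>

struct foo_handle_t { int unused = 0; };
int foo_create(foo_handle_t** out) { *out = new foo_handle_t(); return 0; }
int foo_destroy(foo_handle_t* h) { delete h; return 0; }

template <typename T, int (*destroy)(T*)>
struct CDeleter {
  void operator()(T* p) const {
    if (p != nullptr) {
      (void)destroy(p);  // the real code routes the returned status through TORCH_CUDABLAS_CHECK
    }
  }
};

class FooDescriptor {
 public:
  FooDescriptor() {
    foo_handle_t* raw = nullptr;
    foo_create(&raw);  // status check elided in this sketch
    handle_.reset(raw);
  }
  foo_handle_t* descriptor() const { return handle_.get(); }

 private:
  std::unique_ptr<foo_handle_t, CDeleter<foo_handle_t, &foo_destroy>> handle_;
};

int main() {
  FooDescriptor desc;  // freed automatically when it goes out of scope
  return desc.descriptor() != nullptr ? 0 : 1;
}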
+ size_t workspaceSize = 1024 * 1024; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute( + preference.descriptor(), + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspaceSize, + sizeof(workspaceSize))); + + auto workspace = at::empty( + {static_cast(workspaceSize)}, + at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte)); + + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } + + TORCH_CUDABLAS_CHECK(cublasLtMatmul( + ltHandle, + computeDesc.descriptor(), + &alpha_val, + mat1_ptr, + Adesc.descriptor(), + mat2_ptr, + Bdesc.descriptor(), + &beta_val, + result_ptr, + Cdesc.descriptor(), + result_ptr, + Cdesc.descriptor(), + &heuristicResult.algo, + workspace.data_ptr(), + workspaceSize, + at::cuda::getCurrentCUDAStream())); +} + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const double* mat1_ptr, + int64_t mat1_ld, + const double* mat2_ptr, + int64_t mat2_ld, + const double* bias, + double* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const float* mat1_ptr, + int64_t mat1_ld, + const float* mat2_ptr, + int64_t mat2_ld, + const float* bias, + float* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::Half* mat1_ptr, + int64_t mat1_ld, + const at::Half* mat2_ptr, + int64_t mat2_ld, + const at::Half* bias, + at::Half* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::BFloat16* mat1_ptr, + int64_t mat1_ld, + const at::BFloat16* mat2_ptr, + int64_t mat2_ld, + const at::BFloat16* bias, + at::BFloat16* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation); +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + template <> void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasStrsm( diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index f5f437d8d63a..10e589ecd6c9 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -70,6 +70,33 @@ template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); #endif +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +enum GEMMAndBiasActivationEpilogue { + None, + RELU, + GELU, +}; + +// NOTE: GELU activation is not supported prior to CUDA 11.4 and will +// do nothing if passed in that case. 
+template +void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const Dtype* mat1_ptr, + int64_t mat1_ld, + const Dtype* mat2_ptr, + int64_t mat2_ld, + const Dtype* bias, + Dtype* result_ptr, + int64_t result_ld, + GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None); +#endif + #define CUDABLAS_BGEMM_ARGTYPES(Dtype) \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ const Dtype *a, int64_t lda, int64_t stridea, \ diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index deaebd3583d6..f07daeb979b9 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -32,15 +32,11 @@ struct TORCH_CUDA_CPP_API CUDAEvent { CUDAEvent( DeviceIndex device_index, const cudaIpcEventHandle_t* handle) { - #if !defined(USE_ROCM) device_index_ = device_index; CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaIpcOpenEventHandle(&event_, *handle)); is_created_ = true; - #else - AT_ERROR("cuIpcOpenEventHandle with HIP is not supported"); - #endif } // Note: event destruction done on creating device to avoid creating a @@ -148,7 +144,6 @@ struct TORCH_CUDA_CPP_API CUDAEvent { // Note: cudaIpcGetEventHandle must be called on the same device as the event void ipc_handle(cudaIpcEventHandle_t * handle) { - #if !defined(USE_ROCM) if (!is_created_) { // this CUDAEvent object was initially constructed from flags but event_ // is not created yet. @@ -156,9 +151,6 @@ struct TORCH_CUDA_CPP_API CUDAEvent { } CUDAGuard guard(device_index_); AT_CUDA_CHECK(cudaIpcGetEventHandle(handle, event_)); - #else - AT_ERROR("cuIpcGetEventHandle with HIP is not supported"); - #endif } private: diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index 3fddd8556467..768f0b7549c2 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -1,9 +1,7 @@ #pragma once -#include #include #include -#include #include #include diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index b28c276037b7..c7734334f4e2 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -187,7 +187,7 @@ void CUDAGraph::replay() { // certain topologies to be corrupted (kernels elided, internal syncs // ignored) when replayed back to back without a sync in between. // The bug is fixed in CUDA 11.4+. 
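// Plain reference of what the fused gemm_and_bias epilogue computes for the
// RELU case: out = relu(alpha * mat1 @ mat2 + bias). Row-major indexing and a
// bias broadcast over the rows are assumptions made purely for readability;
// the cuBLASLt path above works on column-major data with explicit leading
// dimensions, and which output dimension the bias broadcasts over depends on
// how the caller lays out the matrices.
#include <algorithm>
#include <iostream>
#include <vector>

std::vector<float> gemm_bias_relu_ref(const std::vector<float>& mat1,  // m x k
                                      const std::vector<float>& mat2,  // k x n
                                      const std::vector<float>& bias,  // m
                                      int m, int n, int k, float alpha) {
  std::vector<float> out(m * n, 0.0f);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int p = 0; p < k; ++p) {
        acc += mat1[i * k + p] * mat2[p * n + j];
      }
      out[i * n + j] = std::max(alpha * acc + bias[i], 0.0f);  // bias add + ReLU fused in one epilogue
    }
  }
  return out;
}

int main() {
  // 1x2 @ 2x1 with a large negative bias: 1*1 + 2*2 - 10 = -5, clamped to 0 by ReLU.
  auto out = gemm_bias_relu_ref({1, 2}, {1, 2}, {-10}, 1, 1, 2, 1.0f);
  std::cout << out[0] << "\n";  // 0
}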
- cudaDeviceSynchronize(); + AT_CUDA_CHECK(cudaDeviceSynchronize()); } #else TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM"); diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index fd88a7fc3ffd..ecb7127dfa32 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -34,8 +34,7 @@ // BSR triangular solve functions were added in hipSPARSE 1.11.2 (ROCm 4.5.0) #if defined(CUDART_VERSION) || \ - (defined(USE_ROCM) && (hipsparseVersionMajor >= 1) && \ - (hipsparseVersionMinor >= 11) && (hipsparseVersionPatch >= 2)) + (defined(USE_ROCM) && ROCM_VERSION >= 40500 ) #define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 1 #else #define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 0 diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index b71bcfdd6fe6..3065babf89b6 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -53,12 +53,12 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type) { } } -CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) { +CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.layout() == kStrided); IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim == 2 || ndim == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); auto rows = input_sizes[ndim - 2]; auto cols = input_sizes[ndim - 1]; @@ -79,7 +79,9 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) { auto order = CUSPARSE_ORDER_COL; #endif - void* values_ptr = input.data_ptr(); + auto batch_stride = ndim > 2 && batch_offset >= 0 ? input_strides[ndim - 3] : 0; + void* values_ptr = static_cast(input.data_ptr()) + + batch_offset * batch_stride * input.itemsize(); cudaDataType value_type = ScalarTypeToCudaDataType(input.scalar_type()); check_supported_cuda_type(value_type); @@ -94,7 +96,7 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input) { value_type, order)); - if (ndim == 3) { + if (ndim >= 3 && batch_offset == -1) { int batch_count = at::native::cuda_int_cast(at::native::batchCount(input), "batch_count"); TORCH_CUDASPARSE_CHECK(cusparseDnMatSetStridedBatch( @@ -121,9 +123,9 @@ CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) { descriptor_.reset(raw_descriptor); } -CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) { +CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int64_t batch_offset) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_sparse_csr()); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() == 2 || input.dim() == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() >= 2); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); @@ -144,16 +146,29 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) { cudaDataType value_type = ScalarTypeToCudaDataType(input.scalar_type()); check_supported_cuda_type(value_type); + auto crow_indices_batch_stride = crow_indices.dim() >= 2 && batch_offset >= 0 + ? crow_indices.stride(-2) + : 0; + auto col_indices_batch_stride = + col_indices.dim() >= 2 && batch_offset >= 0 ? col_indices.stride(-2) : 0; + auto values_batch_stride = + values.dim() >= 2 && batch_offset >= 0 ? 
values.stride(-2) : 0; + cusparseSpMatDescr_t raw_descriptor; TORCH_CUDASPARSE_CHECK(cusparseCreateCsr( &raw_descriptor, // output descriptor rows, cols, nnz, - crow_indices - .data_ptr(), // row offsets of the sparse matrix, size = rows + 1 - col_indices.data_ptr(), // column indices of the sparse matrix, size = nnz - values.data_ptr(), // values of the sparse matrix, size = nnz + // row offsets of the sparse matrix, size = rows + 1 + static_cast(crow_indices.data_ptr()) + + batch_offset * crow_indices_batch_stride * crow_indices.itemsize(), + // column indices of the sparse matrix, size = nnz + static_cast(col_indices.data_ptr()) + + batch_offset * col_indices_batch_stride * col_indices.itemsize(), + // values of the sparse matrix, size = nnz + static_cast(values.data_ptr()) + + batch_offset * values_batch_stride * values.itemsize(), index_type, // data type of row offsets index index_type, // data type of col indices CUSPARSE_INDEX_BASE_ZERO, // base index of row offset and col indes @@ -161,7 +176,7 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input) { )); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (ndim == 3) { + if (ndim == 3 && batch_offset == -1) { int batch_count = at::native::cuda_int_cast(at::native::batchCount(input), "batch_count"); if (crow_indices.dim() >= 2 || values.dim() >= 2 || diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index 8c29f7022c5c..40078b65df64 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -99,7 +99,7 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public CuSparseDescriptor { public: - explicit CuSparseDnMatDescriptor(const Tensor& input); + explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); }; class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor @@ -114,7 +114,7 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { public: - explicit CuSparseSpMatCsrDescriptor(const Tensor& input); + explicit CuSparseSpMatCsrDescriptor(const Tensor& input, int64_t batch_offset = -1); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 std::tuple get_size() { diff --git a/aten/src/ATen/cuda/Exceptions.cpp b/aten/src/ATen/cuda/Exceptions.cpp new file mode 100644 index 000000000000..2821f94d2b7d --- /dev/null +++ b/aten/src/ATen/cuda/Exceptions.cpp @@ -0,0 +1,68 @@ +//NS: CUDACachingAllocator must be included before to get CUDART_VERSION definedi +#include + +#include + +namespace at { +namespace cuda { +namespace blas { + +C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { + if (error == CUBLAS_STATUS_SUCCESS) { + return "CUBLAS_STATUS_SUCCESS"; + } + if (error == CUBLAS_STATUS_NOT_INITIALIZED) { + return "CUBLAS_STATUS_NOT_INITIALIZED"; + } + if (error == CUBLAS_STATUS_ALLOC_FAILED) { + return "CUBLAS_STATUS_ALLOC_FAILED"; + } + if (error == CUBLAS_STATUS_INVALID_VALUE) { + return "CUBLAS_STATUS_INVALID_VALUE"; + } + if (error == CUBLAS_STATUS_ARCH_MISMATCH) { + return "CUBLAS_STATUS_ARCH_MISMATCH"; + } + if (error == CUBLAS_STATUS_MAPPING_ERROR) { + return "CUBLAS_STATUS_MAPPING_ERROR"; + } + if (error == CUBLAS_STATUS_EXECUTION_FAILED) { + return "CUBLAS_STATUS_EXECUTION_FAILED"; + } + if (error == CUBLAS_STATUS_INTERNAL_ERROR) { + return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + if (error == 
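For readers less familiar with the three buffers handed to cusparseCreateCsr above, a small worked example of the CSR layout for a hypothetical 3x3 float matrix:

// Dense:             CSR (zero-based, matching CUSPARSE_INDEX_BASE_ZERO):
//   [[1, 0, 2],
//    [0, 0, 3],
//    [4, 5, 0]]
int   crow_indices[] = {0, 2, 3, 5};     // rows + 1 entries; row i spans [crow[i], crow[i+1])
int   col_indices[]  = {0, 2, 2, 0, 1};  // nnz entries: column of each stored value
float values[]       = {1.f, 2.f, 3.f, 4.f, 5.f};  // nnz entries: the stored values themselves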
CUBLAS_STATUS_NOT_SUPPORTED) { + return "CUBLAS_STATUS_NOT_SUPPORTED"; + } +#ifdef CUBLAS_STATUS_LICENSE_ERROR + if (error == CUBLAS_STATUS_LICENSE_ERROR) { + return "CUBLAS_STATUS_LICENSE_ERROR"; + } +#endif + return ""; +} + +} // namespace blas + +#ifdef CUDART_VERSION +namespace solver { + +C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; + case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: return "Unknown cusolver error number"; + } +} + +} // namespace solver +#endif + +}} // namespace at::cuda diff --git a/aten/src/ATen/cuda/ScanUtils.cuh b/aten/src/ATen/cuda/ScanUtils.cuh index 30e21b689efc..8b3ef2df76de 100644 --- a/aten/src/ATen/cuda/ScanUtils.cuh +++ b/aten/src/ATen/cuda/ScanUtils.cuh @@ -10,88 +10,6 @@ namespace at { namespace cuda { -// Extends the above Inclusive Scan to support segments. It has the same properties -// but also takes a flag array that indicates the starts of "segments", i.e. individual -// units to scan. For example, consider the following (+)-scan that is segmented: -// -// Input: [1, 3, 2, 4, 1, 2, 3, 2, 1, 4] -// Flags: [1, 0, 0, 1, 0, 1, 1, 0, 1, 0] -// Output: 1 4 6 4 5 2 3 5 1 5 -// -// So we see that each "flag" resets the scan to that index. -template -__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) { - // Reduce step ("upsweep") -#pragma unroll - for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { - int index = (threadIdx.x + 1) * stride * 2 - 1; - if (index < Power2ScanSize) { - smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]); - bmem[index] = bmem[index] | bmem[index - stride]; - } - __syncthreads(); - } - - // Post-reduce step ("downsweep") -#pragma unroll - for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { - int index = (threadIdx.x + 1) * stride * 2 - 1; - if ((index + stride) < Power2ScanSize) { - smem[index + stride] = bmem[index + stride] ? 
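The new Exceptions.cpp above gives cuBLAS status codes a printable name. A hedged sketch of how such a lookup is typically consumed by a check macro (MY_CUBLAS_CHECK is illustrative, not the actual TORCH_CUDABLAS_CHECK definition):

#include <ATen/cuda/Exceptions.h>
#include <c10/util/Exception.h>
#include <cublas_v2.h>

#define MY_CUBLAS_CHECK(EXPR)                                                   \
  do {                                                                          \
    cublasStatus_t my_cublas_status = (EXPR);                                   \
    TORCH_CHECK(my_cublas_status == CUBLAS_STATUS_SUCCESS,                      \
                "cuBLAS error: ",                                               \
                at::cuda::blas::_cublasGetErrorEnum(my_cublas_status));         \
  } while (0)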
smem[index + stride] : binop(smem[index + stride], smem[index]); - bmem[index + stride] = bmem[index + stride] | bmem[index]; - } - __syncthreads(); - } -} - -// Inclusive prefix sum using shared memory -template -__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) { - // FIXME: this is a slow, simple implementation; need up/down sweep, - // prevent smem conflicts - smem[threadIdx.x] = in; - - __syncthreads(); - - for (int offset = 1; offset < blockDim.x; offset *= 2) { - T val = 0; - - if (threadIdx.x >= offset) { - val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]); - } - - __syncthreads(); - if (threadIdx.x >= offset) { - smem[threadIdx.x] = val; - } - - __syncthreads(); - } - - *out = smem[threadIdx.x]; - - // Prevent write-after-read dependencies on smem usage above if necessary - if (KillWARDependency) { - __syncthreads(); - } -} - -// Exclusive prefix sum using shared memory -template -__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) { - // FIXME: crappy implementation - // We kill write-after-read dependencies separately below, hence the `false` - inclusivePrefixScan(smem, in, out, binop); - - *out -= in; - *carry = smem[blockDim.x - 1]; - - // Prevent write-after-read dependencies on smem usage above if necessary - if (KillWARDependency) { - __syncthreads(); - } -} - // Inclusive prefix sum for binary vars using intra-warp voting + // shared memory template diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu index 6915a1c2b98f..bf3216eee6da 100644 --- a/aten/src/ATen/cuda/cub.cu +++ b/aten/src/ATen/cuda/cub.cu @@ -57,8 +57,8 @@ AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) -// BFloat16 is not supported by ROCm's radix sort -#if !AT_ROCM_ENABLED() +// BFloat16 Radix sort is supported from ROCm 4.5 onwards +#if !AT_ROCM_ENABLED() || (AT_ROCM_ENABLED() && ROCM_VERSION >= 40500) AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8) #endif diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 6ac9905f571e..abe2e9272014 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -6,6 +6,8 @@ #include #include +#include + #include #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() @@ -45,17 +47,23 @@ #ifdef USE_ROCM #define NO_ROCM(x) +#define ROCM_HIPCUB(x) ::hipcub #else #define NO_ROCM(x) x +#define ROCM_HIPCUB(x) x #endif -#if !defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16() +#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || \ + (defined(USE_ROCM) && ROCM_VERSION >= 40500) +#if !defined(USE_ROCM) namespace at_cuda_detail { +#endif + // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 template <> -struct cub::FpLimits +struct ROCM_HIPCUB(cub)::FpLimits { static __host__ __device__ __forceinline__ c10::BFloat16 Max() { unsigned short max_word = 0x7F7F; @@ -68,8 +76,14 @@ struct cub::FpLimits } }; -template <> struct cub::NumericTraits: cub::BaseTraits {}; -} +template <> +struct ROCM_HIPCUB(cub)::NumericTraits: + ROCM_HIPCUB(cub)::BaseTraits {}; + +#if !defined(USE_ROCM) +} // namespace at_cuda_detail +#endif + #endif #if !defined(USE_ROCM) @@ -93,13 +107,20 @@ struct cuda_type { using type = __half; }; -#if CUB_SUPPORTS_NV_BFLOAT16() +#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16() template<> struct cuda_type { using type = __nv_bfloat16; }; +#elif (defined(USE_ROCM) && ROCM_VERSION >= 40500) + +template<> +struct cuda_type { + using type = hip_bfloat16; +}; + #endif } // namespace 
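The block removed from ScanUtils.cuh above deletes the shared-memory prefix-scan helpers. For reference, a host-side C++17 illustration of the two scan flavors those helpers computed (the exclusive result is the inclusive one with each element's own contribution shifted out, which is exactly the `*out -= in` step in the deleted exclusivePrefixScan):

#include <numeric>
#include <vector>

void scan_demo() {
  std::vector<int> in{1, 3, 2, 4};
  std::vector<int> inc(in.size()), exc(in.size());
  std::inclusive_scan(in.begin(), in.end(), inc.begin());     // {1, 4, 6, 10}
  std::exclusive_scan(in.begin(), in.end(), exc.begin(), 0);  // {0, 1, 4, 6}
}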
detail @@ -142,6 +163,34 @@ inline void segmented_sort_pairs( } } +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +template +inline void unique_by_key( + KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, + KeysOutputIteratorT keys_out, ValuesOutputIteratorT values_out, + NumSelectedIteratorT num_selected, int64_t num_input_items) +{ + // TODO: use thrust::discard_iterator to handle null keys_out when https://github.com/NVIDIA/cub/issues/406 is fixed. + constexpr bool null_keys_out = std::is_same::value; + using KeyT = typename std::iterator_traits::value_type; + using RealKeysOutputIteratorT = typename std::conditional::type; + RealKeysOutputIteratorT keys_out_; + auto allocator = c10::cuda::CUDACachingAllocator::get(); + c10::DataPtr keys_out_owner; + c10::guts::if_constexpr( + [&](auto _) { + keys_out_owner = allocator->allocate(num_input_items * sizeof(KeyT)); + keys_out_ = static_cast(keys_out_owner.get()); + }, + [&](auto _) { + keys_out_ = keys_out; + } + ); + CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, + keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); +} +#endif + namespace impl { template diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index e464b19e57d5..a3d551673558 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -18,7 +18,7 @@ #define CUB_SUPPORTS_NV_BFLOAT16() false #endif -// cub sort support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: +// cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: // https://github.com/NVIDIA/cub/pull/326 // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake // starting from CUDA 11.5 @@ -28,6 +28,14 @@ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false #endif +// cub support for UniqueByKey is added to cub 1.16 in: +// https://github.com/NVIDIA/cub/pull/405 +#if CUB_VERSION >= 101600 +#define CUB_SUPPORTS_UNIQUE_BY_KEY() true +#else +#define CUB_SUPPORTS_UNIQUE_BY_KEY() false +#endif + // cub support for scan by key is added to cub 1.15 // in https://github.com/NVIDIA/cub/pull/376 #if CUB_VERSION >= 101500 diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 4efe2ec4c33f..93a23ec6a730 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -139,16 +139,22 @@ bool CUDAHooks::hasCuSOLVER() const { #endif } -#if !defined(USE_ROCM) +bool CUDAHooks::hasROCM() const { + // Currently, this is same as `compiledWithMIOpen`. + // But in future if there are ROCm builds without MIOpen, + // then `hasROCM` should return true while `compiledWithMIOpen` + // should return false + return AT_ROCM_ENABLED(); +} + #if defined(USE_DIRECT_NVRTC) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, at::cuda::load_nvrtc()); } -#else +#elif !defined(USE_ROCM) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC); } -#endif #else static std::pair, at::cuda::NVRTC*> load_nvrtc() { #if defined(_WIN32) @@ -293,10 +299,22 @@ std::string CUDAHooks::showConfig() const { cudaRuntimeGetVersion(&runtimeVersion); auto printCudaStyleVersion = [&](int v) { +#ifdef USE_ROCM + // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number + if(v < 500) { + // If major=xx, minor=yy then format -> xxyy + oss << (v / 100) << "." 
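Stepping back to the unique_by_key wrapper added in cub.cuh above: DeviceSelect::UniqueByKey keeps the first (key, value) pair of every run of equal consecutive keys and reports how many pairs survive. A small host-side reference model of that behaviour (hypothetical helper, for illustration only):

#include <cstddef>
#include <utility>
#include <vector>

template <typename K, typename V>
std::pair<std::vector<K>, std::vector<V>> unique_by_key_reference(
    const std::vector<K>& keys, const std::vector<V>& vals) {
  std::vector<K> k_out;
  std::vector<V> v_out;
  for (size_t i = 0; i < keys.size(); ++i) {
    if (i == 0 || keys[i] != keys[i - 1]) {   // first element of each run
      k_out.push_back(keys[i]);
      v_out.push_back(vals[i]);
    }
  }
  return {k_out, v_out};
}
// keys {1,1,2,2,3}, vals {10,11,20,21,30}  ->  keys {1,2,3}, vals {10,20,30}, num_selected = 3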
<< (v % 10); + } + else { + // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz + oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000); + } +#else oss << (v / 1000) << "." << (v / 10 % 100); if (v % 10 != 0) { oss << "." << (v % 10); } +#endif }; #if !defined(USE_ROCM) diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index a0d175df27c0..5aa2721170ed 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -1,3 +1,5 @@ +#pragma once + #include #include @@ -27,6 +29,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasMAGMA() const override; bool hasCuDNN() const override; bool hasCuSOLVER() const override; + bool hasROCM() const override; const at::cuda::NVRTC& nvrtc() const override; int64_t current_device() const override; bool hasPrimaryContext(int64_t device_index) const override; diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp index fe5a95525e7d..e720994e9249 100644 --- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -166,6 +166,8 @@ CUDA_STUB1(cuModuleUnload, CUmodule); CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *); CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *); CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *); +CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int); +CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction); // Irregularly shaped functions CUresult CUDAAPI cuLaunchKernel(CUfunction f, diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu new file mode 100644 index 000000000000..905dc75c14ae --- /dev/null +++ b/aten/src/ATen/cuda/jiterator.cu @@ -0,0 +1,345 @@ +#include + +#if AT_USE_JITERATOR() + +#include +#include +#include + +#include +#include +#include +namespace at { +namespace native { + +static inline void launch_jitted_vectorized_kernel_dynamic( + const std::string& name, TensorIteratorBase& iter, + DeviceIndex dev_idx, int64_t N, const std::string& f, void* data_ptr, + const std::vector& extra_args) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + + const int vec_size = jitted_can_vectorize_up_to(iter); + bool vectorized = vec_size > 1; + + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + // fn_ptr is set to the appropriate function based on the vec size and GPU used + // TODO: Memory use can probably be optimized by re-using kernels across GPUs with + // the same compute capability + + int nTensors = iter.ntensors(); + const at::ScalarType common_dtype = iter.common_dtype(); + std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype); + std::string compute_type_str = at::cuda::jit::typeName(toOpMathType(common_dtype)); + std::string result_type_str = at::cuda::jit::typeName(common_dtype); + c10::SmallVector extra_args_types = get_extra_args_typenames(extra_args); + + // The cache key includes all the parameters to generate_code + vec_size + dev_idx + std::stringstream ss; + ss << nTensors << f; + ss << f_inputs_type_str << compute_type_str << result_type_str; + ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); + ss << extra_args_types; + ss << vec_size; +// DeviceIndex, 
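The ROCm branch added to printCudaStyleVersion above handles the HIP version encoding before and after its format change. A standalone restatement with worked values (the sample version numbers are illustrative):

#include <sstream>
#include <string>

static std::string format_hip_version(int v) {
  std::ostringstream oss;
  if (v < 500) {
    // old encoding: major*100 + minor (xxyy)
    oss << (v / 100) << "." << (v % 10);
  } else {
    // new encoding: major, minor, patch packed as xxyyzzzzz
    oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
  }
  return oss.str();
}
// format_hip_version(402)      == "4.2"
// format_hip_version(40421300) == "4.4.21300"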
e.g. int8_t, is not treated as a number by the stream, cast to int as a workaround + ss << static_cast(dev_idx); + const std::string cache_key = ss.str(); + + static std::mutex _jiterator_mutex; + static std::unordered_map fns; + at::cuda::jit::NvrtcFunction* fn_ptr = &fns[cache_key]; + + if (!fn_ptr->function) { + const std::lock_guard lock{_jiterator_mutex}; + if (!fn_ptr->function) { // cache miss! + // Generates program + auto code = at::cuda::jit::generate_code(nTensors, f, name, + f_inputs_type_str, compute_type_str, result_type_str, + /*contiguous=*/true, /*dynamic_casting=*/false, + at::cuda::jit::BinaryFuncVariant::NoScalar, + extra_args_types, + vectorized, vec_size); + std::string kernel_name = vectorized ? name + "_vectorized" + std::to_string(vec_size) : name; + // Acquires the program + *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name); + } + } + + // size of `extra_args` is unknown at compile-time + auto extra_args_size = extra_args.size(); + + float scalar_val = 0; + + if (vectorized) { + // pack args for kernel launch + constexpr int kernel_args = 3; + auto args = std::make_unique(kernel_args + extra_args_size); + args[0] = static_cast(&N); + args[1] = data_ptr; + args[2] = static_cast(&scalar_val); + + for (const auto i : c10::irange(extra_args_size)) { + // since 3 slots are already filled in `args` + args[i + 3] = const_cast(extra_args[i].data_ptr()); + } + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args.get(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } else { + TrivialOffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + auto oc = TrivialOffsetCalculator<1>(); + auto l = memory::LoadWithoutCast(); + auto s = memory::StoreWithoutCast(); + + // pack args for kernel launch + constexpr int kernel_args = 7; + auto args = std::make_unique(kernel_args + extra_args_size); + args[0] = static_cast(&N); + args[1] = data_ptr; + args[2] = ic_ptr; + args[3] = static_cast(&oc); + args[4] = static_cast(&l); + args[5] = static_cast(&s); + args[6] = static_cast(&scalar_val); + + for (const auto i : c10::irange(extra_args_size)) { + // since 7 slots are already filled in `args` + args[i + 7] = const_cast(extra_args[i].data_ptr()); + } + + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args.get(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } +} + +static inline void launch_jitted_unrolled_kernel_dynamic( + const std::string& name, TensorIteratorBase& iter, + DeviceIndex dev_idx, int64_t N, const std::string& f, void* data_ptr, + void* ic_ptr, void* oc_ptr, void* l_ptr, void* s_ptr, bool contiguous, bool dynamic_casting, + const std::vector& extra_args) { + + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + + int nTensors = iter.ntensors(); + const at::ScalarType common_dtype = iter.common_dtype(); + std::string f_inputs_type_str = at::cuda::jit::typeName(common_dtype); + std::string compute_type_str = at::cuda::jit::typeName(toOpMathType(common_dtype)); + std::string result_type_str = at::cuda::jit::typeName(common_dtype); + c10::SmallVector extra_args_types = get_extra_args_typenames(extra_args); + + // The cache key includes all the parameters to generate_code + dev_idx + std::stringstream ss; + ss << nTensors << f; + ss << f_inputs_type_str << compute_type_str << result_type_str; + ss << contiguous << dynamic_casting; + ss << 
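The launchers above compile each distinct (kernel source, dtypes, vec size, device) combination once and cache the resulting NvrtcFunction behind a check / lock / re-check sequence. That pattern, reduced to a skeleton (CompiledKernel and the compile callback are illustrative stand-ins; the sketch mirrors the structure above rather than adding any stronger thread-safety guarantee):

#include <mutex>
#include <string>
#include <unordered_map>

struct CompiledKernel { void* function = nullptr; };

CompiledKernel& get_or_compile(const std::string& cache_key,
                               CompiledKernel (*compile)(const std::string&)) {
  static std::mutex mtx;
  static std::unordered_map<std::string, CompiledKernel> cache;
  CompiledKernel& entry = cache[cache_key];   // default-constructed on first use
  if (!entry.function) {
    const std::lock_guard<std::mutex> lock(mtx);
    if (!entry.function) {                    // re-check under the lock
      entry = compile(cache_key);             // expensive NVRTC compilation happens once
    }
  }
  return entry;
}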
static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); + ss << extra_args_types; + ss << dev_idx; + const std::string cache_key = ss.str(); + + static std::mutex _jiterator_mutex; + static std::unordered_map fns; + + at::cuda::jit::NvrtcFunction* fn_ptr = &fns[cache_key]; + if (!fn_ptr->function) { + const std::lock_guard lock{_jiterator_mutex}; + if (!fn_ptr->function) { + auto code = at::cuda::jit::generate_code(nTensors, f, name, + f_inputs_type_str, compute_type_str, result_type_str, + contiguous, dynamic_casting, + at::cuda::jit::BinaryFuncVariant::NoScalar, + extra_args_types); + *fn_ptr = at::cuda::jit::jit_pwise_function(code, name); + } + } + + float scalar_val = 0; + + // pack args for kernel launch + constexpr int kernel_args = 7; + auto extra_args_size = extra_args.size(); + auto args = std::make_unique(kernel_args + extra_args_size); + args[0] = static_cast(&N); + args[1] = data_ptr; + args[2] = ic_ptr; + args[3] = oc_ptr; + args[4] = l_ptr; + args[5] = s_ptr; + args[6] = static_cast(&scalar_val); + + for (const auto i : c10::irange(extra_args_size)) { + // since 7 slots are already filled in `args` + args[i + 7] = const_cast(extra_args[i].data_ptr()); + } + + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args.get(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); +} + +void jitted_gpu_kernel_dynamic_impl( + const std::string& kernel_name, + TensorIteratorBase& iter, + const std::string& f, + const bool dynamic_casting, + const std::vector& extra_args) { + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + TORCH_INTERNAL_ASSERT(iter.ninputs() <= 8); + + ArrayVariant data(iter); + void* data_ptr = data.data_ptr(); + + int64_t numel = iter.numel(); + bool contiguous = iter.is_contiguous(); + + // Decides which of 4 kernel types to launch + // Variations are: + // - Case 1: no dynamic casting and contiguous + // - Case 2: no dynamic casting and noncontiguous + // - Case 3: dynamic casting and contiguous + // - Case 4: dynamic casting and noncontiguous + // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl + + if (!dynamic_casting) { + if (contiguous) { + // Case 1: no dynamic casting and contiguous + launch_jitted_vectorized_kernel_dynamic(kernel_name, iter, + iter.device().index(), numel, f, data_ptr, extra_args); + return; + } + + // Case 2: no dynamic casting and noncontiguous + OffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + auto output_offset_calculator = make_output_offset_calculator(iter); + void* oc_ptr = static_cast(&output_offset_calculator); + + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + void* l_ptr = static_cast(&loader); + void* s_ptr = static_cast(&storer); + + launch_jitted_unrolled_kernel_dynamic( + kernel_name, iter, iter.device().index(), numel, f, data_ptr, + ic_ptr, oc_ptr, l_ptr, s_ptr, contiguous, dynamic_casting, extra_args); + + return; + } + + // Cases 3 and 4 are handled below + // Both require construction of a storer (this asserts 1 output) and one or more loaders + + // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors) + LoadWithCastVariant loader(iter); + void* l_ptr = loader.data_ptr(); + + // Creates store cast to output (the zeroth tensor in TensorIterator) + auto storer = memory::StoreWithCast(iter.dtype(0)); + void* s_ptr = static_cast(&storer); + + if (contiguous) { + // Case 3: dynamic casting and contiguous + 
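jitted_gpu_kernel_dynamic_impl above selects one of four launch paths from the two booleans (contiguous, dynamic_casting); only the contiguous, non-casting case can take the vectorized launcher, while the other three go through the unrolled one with the appropriate offset calculators and loaders/storers. An illustrative helper making that split explicit:

enum class JitKernelKind { Vectorized, Unrolled };

static JitKernelKind pick_kernel(bool contiguous, bool dynamic_casting) {
  return (contiguous && !dynamic_casting) ? JitKernelKind::Vectorized
                                          : JitKernelKind::Unrolled;
}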
TrivialOffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + void* oc_ptr = static_cast(&output_offset_calculator); + + launch_jitted_unrolled_kernel_dynamic( + kernel_name, iter, iter.device().index(), numel, f, data_ptr, + ic_ptr, oc_ptr, l_ptr, s_ptr, contiguous, dynamic_casting, extra_args); + return; + } + + // Case 4: dynamic casting and noncontiguous + OffsetCalculatorVariant input_offset_calculator(iter); + void* ic_ptr = input_offset_calculator.data_ptr(); + + auto output_offset_calculator = make_output_offset_calculator(iter); + void* oc_ptr = static_cast(&output_offset_calculator); + + launch_jitted_unrolled_kernel_dynamic( + kernel_name, iter, iter.device().index(), numel, f, data_ptr, + ic_ptr, oc_ptr, l_ptr, s_ptr, contiguous, dynamic_casting, extra_args); +} + +// Entrypoint for dynamic version of jitted GPU kernels, which accepts dynamic number of inputs +// and arbitrary types of input and extra args. This dynamic version is needed for jiterator with python interface, +// since the kernel definition is unknown at the compilation time. +// Similarly, launch_jitted_vectorized_kernel_dynamic and launch_jitted_unrolled_kernel_dynamic are created +// to handle arbitrary functions defined in python user code. +// For templated version, see note [Jiterator] in JitLoops.cuh for more details +void jitted_gpu_kernel_dynamic( + const std::string& kernel_name, + TensorIteratorBase& iter, + const std::string& f, + const std::vector& extra_args) { + + // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel + // Maybe it could be refactored? + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + jitted_gpu_kernel_dynamic(kernel_name, sub_iter, f, extra_args); + } + return; + } + + // Computes if dynamic casting is needed + // Dynamic casting is needed if an input's or output's dtype differs from the common dtype + bool needs_dynamic_casting = false; + const at::ScalarType common_dtype = iter.common_dtype(); + for (auto i = 0; i < iter.ntensors(); ++i) { + if (iter.dtype(i) != common_dtype) { + needs_dynamic_casting = true; + break; + } + } + + jitted_gpu_kernel_dynamic_impl(kernel_name, iter, f, needs_dynamic_casting, extra_args); +} + +} // namespace native + +namespace cuda { + +at::Tensor CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const std::vector& tensors, + const std::vector& extra_args) { + + Tensor output; + TensorIteratorConfig config; + config + .set_check_mem_overlap(true) + .allow_cpu_scalars(false) + .promote_inputs_to_common_dtype(true) + .cast_common_dtype_to_outputs(true) + .enforce_safe_casting_to_output(true) + .check_all_same_device(true) + .add_owned_output(output); + for (const auto& t: tensors){ + config.add_input(t); + } + TensorIterator iter = config.build(); + + CUDAGuard guard(iter.device()); + at::native::jitted_gpu_kernel_dynamic(kernel_name, iter, code_string, extra_args); + + return iter.output(); +} + +}} // namespace at::cuda + +#endif // AT_USE_JITERATOR() diff --git a/aten/src/ATen/cuda/jiterator.h b/aten/src/ATen/cuda/jiterator.h new file mode 100644 index 000000000000..aa831fd06505 --- 
/dev/null +++ b/aten/src/ATen/cuda/jiterator.h @@ -0,0 +1,35 @@ +#pragma once +#include + +#if AT_USE_JITERATOR() + +#include +#include + +#include +#include + +namespace at { +namespace cuda { + +TORCH_CUDA_CPP_API at::Tensor CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const std::vector& tensors, + const std::vector& extra_args); + +}} // namespace at::cuda + +#else + +namespace at { namespace cuda { +TORCH_CUDA_CPP_API at::Tensor CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const std::vector& tensors, + const std::vector& extra_args) { + TORCH_CHECK(false, "Jiterator is not supported on ROCm"); + } +}} // namespace at::cuda + +#endif // AT_USE_JITERATOR() diff --git a/aten/src/ATen/cuda/jiterator_impl.h b/aten/src/ATen/cuda/jiterator_impl.h new file mode 100644 index 000000000000..4fa179b41a66 --- /dev/null +++ b/aten/src/ATen/cuda/jiterator_impl.h @@ -0,0 +1,208 @@ +#pragma once +#include + +#if AT_USE_JITERATOR() + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { +namespace native { + +constexpr int NUM_INPUTS = 8; + +#define AT_FOR_8_INPUTS(_) \ + _(1) \ + _(2) \ + _(3) \ + _(4) \ + _(5) \ + _(6) \ + _(7) \ + _(8) + +c10::SmallVector get_extra_args_typenames(const std::vector& extra_args) { + c10::SmallVector args_typenames(extra_args.size()); + for (auto i = 0; i < extra_args.size(); ++i) { + args_typenames[i] = at::cuda::jit::typeName(extra_args[i].type()); + } + return args_typenames; +} + +int can_vectorize_up_to(at::ScalarType type, char* pointer) { + switch(type) { +#define DEFINE_CASE(ctype, scalartype) \ + case ScalarType::scalartype : return memory::can_vectorize_up_to(pointer); + + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) +#undef DEFINE_CASE + + default: TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type); + } +} + +// jitted version of the above +// See Note [Jiterator], this relies on the assumptions enumerated there +int jitted_can_vectorize_up_to(const TensorIteratorBase& iter) { + const at::ScalarType common_dtype = iter.common_dtype(); + const at::ScalarType result_dtype = common_dtype; + + // Deals with output + int result = can_vectorize_up_to(result_dtype, static_cast(iter.data_ptr(0))); + + // Incorporates input(s) + for (auto i = 1; i < iter.ntensors(); ++i) { + result = std::min(result, can_vectorize_up_to(common_dtype, static_cast(iter.data_ptr(i)))); + } + + return result; +} + +template +static std::unique_ptr> make_unique_input_offset_calculator(const TensorIteratorBase& iter) { + // array size can not be 0, this happens when N == 0 + constexpr int array_size = std::max(N, 1); + TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs()); + std::array strides; + int64_t element_sizes[array_size]; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i + iter.noutputs()).data(); + element_sizes[i] = iter.element_size(i + iter.noutputs()); + } + return std::make_unique>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +struct OffsetCalculatorVariant { +#define DEFINE_CASE(index) std::unique_ptr>, + using OffsetCalculatorTypes = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + OffsetCalculatorVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + switch(arity) { +#define DEFINE_CASE(index) \ + case index : v = make_unique_input_offset_calculator(iter); break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + default: 
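A usage sketch of the entry point declared in jiterator.h above. The templated-function form of the kernel string follows the convention the Python jiterator uses to drive this path; the wrapper name and the exact element types of the argument vectors are assumptions, not code from the patch:

#include <ATen/ATen.h>
#include <ATen/cuda/jiterator.h>
#include <string>

at::Tensor jiterator_add_one(const at::Tensor& t) {
  const std::string code =
      "template <typename T> T add_one(T x) { return x + T(1); }";
  // One input tensor and no extra scalar arguments; the output tensor is
  // allocated by the TensorIterator that CompileAndLaunchKernel builds.
  return at::cuda::CompileAndLaunchKernel(code, "add_one", {t}, {});
}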
+ TORCH_CHECK(false, "OffsetCalculatorVariant is not implemented for ninputs = ", arity); + } + } + + void* data_ptr() { + return c10::visit([](auto & v){ return static_cast(v.get()); }, v); + } + + private: + OffsetCalculatorTypes v; +}; + +struct ArrayVariant { + // notice: This would produce c10::variant> +#define DEFINE_CASE(index) at::detail::Array, + using ArrayTypes = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + ArrayVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + // This assumes that jiterator kernels only have 1 output + switch(arity) { +#define DEFINE_CASE(index) \ + case index: array = at::detail::Array{}; break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + + default: + TORCH_CHECK(false, "ArrayVariant is not implemented for ninputs = ", arity); + } + + c10::visit([&](auto& a) { + for (auto i = 0; i < arity + 1; ++i) { + a[i] = (char*)iter.data_ptr(i); + } + }, array); + } + + void* data_ptr() { + return c10::visit([](auto & a){ return static_cast(&a); }, array); + } + +private: + ArrayTypes array; +}; + +struct TrivialOffsetCalculatorVariant { +#define DEFINE_CASE(index) TrivialOffsetCalculator, + using TrivialOffsetCalculatorTypes = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + TrivialOffsetCalculatorVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + switch(arity) { +#define DEFINE_CASE(index) \ + case index: v = TrivialOffsetCalculator(); break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + + default: + TORCH_CHECK(false, "TrivialOffsetCalculatorVariant is not implemented for ninputs = ", arity); + } + } + + void* data_ptr() { + return c10::visit([](auto & v){ return static_cast(&v); }, v); + } + +private: + TrivialOffsetCalculatorTypes v; +}; + +struct LoadWithCastVariant { +#define DEFINE_CASE(index) std::unique_ptr>, + using LoadWithCastPtr = c10::variant< + AT_FOR_8_INPUTS(DEFINE_CASE) + >; +#undef DEFINE_CASE + + LoadWithCastVariant(const TensorIteratorBase& iter) { + int arity = iter.ninputs(); + switch(arity) { +#define DEFINE_CASE(index) \ + case index: v = std::make_unique>(iter); break; + + AT_FOR_8_INPUTS(DEFINE_CASE) +#undef DEFINE_CASE + + default: + TORCH_CHECK(false, "LoadWithCastVariant is not implemented for ninputs = ", arity); + } + } + + void* data_ptr() { + return c10::visit([](auto & v){ return static_cast(v.get()); }, v); + } + +private: + LoadWithCastPtr v; +}; + +}} // namespace at::native + + +#endif // AT_USE_JITERATOR() diff --git a/aten/src/ATen/cuda/llvm_complex.cpp b/aten/src/ATen/cuda/llvm_complex.cpp index 00339bdac0fb..55e39e280272 100644 --- a/aten/src/ATen/cuda/llvm_complex.cpp +++ b/aten/src/ATen/cuda/llvm_complex.cpp @@ -477,6 +477,14 @@ operator!=(const _Tp& __x, const complex<_Tp>& __y) return !(__x == __y); } +template +inline constexpr +bool +operator&&(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return (__x.real() || __x.imag()) && (__y.real() || __y.imag()); +} + // 26.3.7 values: template ::value, @@ -583,10 +591,41 @@ arg(_Tp __re) )ESCAPE"; +const std::string complex_half_body = R"ESCAPE( +namespace std { +template <> +struct alignas(2) complex { + at::Half real_; + at::Half imag_; + + // Constructors + complex() = default; + + // implicit casting to and from `complex`. 
+ // NOTE: computation of `complex` will occur in `complex` + __host__ __device__ inline complex(const std::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + inline __host__ __device__ operator std::complex() const { + return {real_, imag_}; + } + + at::Half real() const {return real_;} + at::Half imag() const {return imag_;} + +}; +} +)ESCAPE"; + + const std::string &get_complex_body_string() { return complex_body; } +const std::string &get_complex_half_body_string() { + return complex_half_body; +} + const std::string complex_math = R"ESCAPE( namespace std { @@ -724,6 +763,16 @@ log10(const complex<_Tp>& __x) return log(__x) / log(_Tp(10)); } +// log2 + +template +inline +complex<_Tp> +log2(const complex<_Tp>& __x) +{ + return log(__x) / log(_Tp(2)); +} + // sqrt template diff --git a/aten/src/ATen/cuda/llvm_jit_strings.h b/aten/src/ATen/cuda/llvm_jit_strings.h index dcbecd4279bb..237bcdbb4ccb 100644 --- a/aten/src/ATen/cuda/llvm_jit_strings.h +++ b/aten/src/ATen/cuda/llvm_jit_strings.h @@ -9,6 +9,7 @@ namespace cuda { TORCH_CUDA_CPP_API const std::string &get_traits_string(); TORCH_CUDA_CPP_API const std::string &get_cmath_string(); TORCH_CUDA_CPP_API const std::string &get_complex_body_string(); +TORCH_CUDA_CPP_API const std::string &get_complex_half_body_string(); TORCH_CUDA_CPP_API const std::string &get_complex_math_string(); }} // namespace at diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 9a77b87713ef..5dbe49953cf1 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -55,7 +55,9 @@ namespace at { namespace cuda { _(cuDevicePrimaryCtxGetState) \ _(cuLinkCreate) \ _(cuLinkAddData) \ - _(cuLinkComplete) + _(cuLinkComplete) \ + _(cuFuncSetAttribute) \ + _(cuFuncGetAttribute) #if defined(CUDA_VERSION) && CUDA_VERSION >= 11010 #define AT_FORALL_NVRTC(_) \ diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index d915fda024de..f954bbf5623a 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -19,6 +19,13 @@ inline cudnnDataType_t getDataType(const at::Tensor& t) { } else if (scalar_type == at::kDouble) { return CUDNN_DATA_DOUBLE; } +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + else if (scalar_type == at::kBFloat16) { + return CUDNN_DATA_BFLOAT16; + } else if (scalar_type == at::kQInt8) { + return CUDNN_DATA_INT8; + } +#endif throw std::runtime_error("TensorDescriptor only supports double, float and half tensors"); } @@ -73,6 +80,10 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) { return "CUDNN_DATA_DOUBLE"; case CUDNN_DATA_HALF: return "CUDNN_DATA_HALF"; +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + case CUDNN_DATA_BFLOAT16: + return "CUDNN_DATA_BFLOAT16"; +#endif case CUDNN_DATA_INT8: return "CUDNN_DATA_INT8"; case CUDNN_DATA_INT32: diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index c704826511eb..a7bcb5eb72ea 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -21,6 +21,9 @@ std::string cudnnTypeToString(cudnnDataType_t dtype); inline int dataSize(cudnnDataType_t dataType) { switch (dataType) { +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + case CUDNN_DATA_BFLOAT16: +#endif case CUDNN_DATA_HALF: return 2; case CUDNN_DATA_FLOAT: return 4; default: return 8; diff --git a/aten/src/ATen/cudnn/Handle.cpp b/aten/src/ATen/cudnn/Handle.cpp index 2b1d90f4b3cf..a6eb8fd78154 100644 --- 
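The log2 overload added to the jitted complex math above relies on the change-of-base identity log2(z) = log(z) / log(2). The same identity on host std::complex, for comparison:

#include <cmath>
#include <complex>

std::complex<double> complex_log2(std::complex<double> z) {
  return std::log(z) / std::log(2.0);
}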
a/aten/src/ATen/cudnn/Handle.cpp +++ b/aten/src/ATen/cudnn/Handle.cpp @@ -9,7 +9,7 @@ void createCuDNNHandle(cudnnHandle_t *handle) { AT_CUDNN_CHECK(cudnnCreate(handle)); } -void destroyCuDNNHandle(cudnnHandle_t handle) { +void destroyCuDNNHandle(cudnnHandle_t /*handle*/) { // this is because of something dumb in the ordering of // destruction. Sometimes atexit, the cuda context (or something) // would already be destroyed by the time this gets destroyed. It diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp index 857a7da05127..215d42fcd23f 100644 --- a/aten/src/ATen/cudnn/Types.cpp +++ b/aten/src/ATen/cudnn/Types.cpp @@ -5,12 +5,18 @@ namespace at { namespace native { cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { - if (dtype == at::kFloat) { + if (dtype == c10::kQInt8) { + return CUDNN_DATA_INT8; + } else if (dtype == at::kFloat) { return CUDNN_DATA_FLOAT; } else if (dtype == at::kDouble) { return CUDNN_DATA_DOUBLE; } else if (dtype == at::kHalf) { return CUDNN_DATA_HALF; + } +#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200 + else if (dtype == at::kBFloat16) { + return CUDNN_DATA_BFLOAT16; } else if (dtype == at::kInt) { return CUDNN_DATA_INT32; } else if (dtype == at::kByte) { @@ -18,6 +24,7 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { } else if (dtype == at::kChar) { return CUDNN_DATA_INT8; } +#endif std::string msg("getCudnnDataTypeFromScalarType() not supported for "); msg += toString(dtype); throw std::runtime_error(msg); diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 0454c2f30a22..1303b9f8c8bf 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -75,14 +75,15 @@ struct TORCH_API CUDAHooksInterface { } virtual const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const { + (void)device_index; // Suppress unused variable warning TORCH_CHECK(false, "Cannot get default CUDA generator without ATen_cuda library. ", CUDA_HELP); } - virtual Device getDeviceFromPtr(void* data) const { + virtual Device getDeviceFromPtr(void* /*data*/) const { TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP); } - virtual bool isPinnedPtr(void* data) const { + virtual bool isPinnedPtr(void* /*data*/) const { return false; } @@ -106,6 +107,10 @@ struct TORCH_API CUDAHooksInterface { return false; } + virtual bool hasROCM() const { + return false; + } + virtual const at::cuda::NVRTC& nvrtc() const { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } @@ -159,19 +164,19 @@ struct TORCH_API CUDAHooksInterface { "Cannot query batchnormMinEpsilonCuDNN() without ATen_cuda library. ", CUDA_HELP); } - virtual int64_t cuFFTGetPlanCacheMaxSize(int64_t device_index) const { + virtual int64_t cuFFTGetPlanCacheMaxSize(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); } - virtual void cuFFTSetPlanCacheMaxSize(int64_t device_index, int64_t max_size) const { + virtual void cuFFTSetPlanCacheMaxSize(int64_t /*device_index*/, int64_t /*max_size*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); } - virtual int64_t cuFFTGetPlanCacheSize(int64_t device_index) const { + virtual int64_t cuFFTGetPlanCacheSize(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. 
", CUDA_HELP); } - virtual void cuFFTClearPlanCache(int64_t device_index) const { + virtual void cuFFTClearPlanCache(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP); } @@ -179,7 +184,7 @@ struct TORCH_API CUDAHooksInterface { return 0; } - virtual void deviceSynchronize(int64_t device_index) const { + virtual void deviceSynchronize(int64_t /*device_index*/) const { TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP); } }; diff --git a/aten/src/ATen/gen_vulkan_glsl.py b/aten/src/ATen/gen_vulkan_glsl.py index d90afbf6a019..b43dcb6cfeff 100644 --- a/aten/src/ATen/gen_vulkan_glsl.py +++ b/aten/src/ATen/gen_vulkan_glsl.py @@ -4,7 +4,7 @@ import glob import sys import os -from tools.codegen.code_template import CodeTemplate +from torchgen.code_template import CodeTemplate H_NAME = "glsl.h" CPP_NAME = "glsl.cpp" diff --git a/aten/src/ATen/gen_vulkan_spv.py b/aten/src/ATen/gen_vulkan_spv.py index eb3542410a20..0d0906ded60e 100644 --- a/aten/src/ATen/gen_vulkan_spv.py +++ b/aten/src/ATen/gen_vulkan_spv.py @@ -6,7 +6,7 @@ import os import sys import subprocess -from tools.codegen.code_template import CodeTemplate +from torchgen.code_template import CodeTemplate H_NAME = "spv.h" CPP_NAME = "spv.cpp" diff --git a/aten/src/ATen/jit_macros.h b/aten/src/ATen/jit_macros.h index e1542d5fb605..bfe49b51b80a 100644 --- a/aten/src/ATen/jit_macros.h +++ b/aten/src/ATen/jit_macros.h @@ -8,7 +8,6 @@ #define AT_USE_JITERATOR() true #define jiterator_stringify(...) std::string(#__VA_ARGS__); #else - // TODO: update this to become a static assertion #define AT_USE_JITERATOR() false - #define jiterator_stringify(...) std::string("Jiterator is disabled"); + #define jiterator_stringify(...) static_assert(false, "Jiterator is not supported on ROCm"); #endif // USE_ROCM diff --git a/aten/src/ATen/jiterator_macros.h b/aten/src/ATen/jiterator_macros.h new file mode 100644 index 000000000000..2769537346c8 --- /dev/null +++ b/aten/src/ATen/jiterator_macros.h @@ -0,0 +1,38 @@ +#pragma once +#include +#include + +#define JITERATOR_HOST_DEVICE C10_HOST_DEVICE +#if defined(_MSC_VER) && defined(__CUDACC__) +// NVRTC on Windows errors if __host__ __device__ attribute is +// present on kernel. +// error: attribute "__host__" does not apply here +// error: attribute "__device__" does not apply here +#define JITERATOR_HOST_DEVICE +#endif + +// jiterator_also_stringify_as macro is used to define code (for CPU/ROCm) +// and generate code string for `jiterator` (only when compiling for CUDA). +// Usage : +// jiterator_also_stringify_as( +// jiterator_code(template T identity(T x) { return x; }), +// identity_string); +// This will define the template `identity` as present in code and +// also define `std::string identity_string` with the code as the string +// if this is being compiled for CUDA. + +// `jiterator_code` macro is to deal with `,` in the kernel code. +// These `,`s confuse the preprocessor into thinking we are passing +// multiple arguments to the macro. +#define jiterator_code(...) __VA_ARGS__ +#if defined(__CUDACC__) + // CPU and CUDA case + #define stringify_code(...) 
#__VA_ARGS__ + #define jiterator_also_stringify_as(code, str_name) \ + code /* define the function */ \ + const std::string str_name = std::string(stringify_code(code)); +#else + // CPU only or CPU and ROCm case + // Only needs the function + #define jiterator_also_stringify_as(code, str_name) code +#endif diff --git a/aten/src/ATen/mkl/SparseBlas.cpp b/aten/src/ATen/mkl/SparseBlas.cpp index 67dcb30e5283..1ad464b8d3a3 100644 --- a/aten/src/ATen/mkl/SparseBlas.cpp +++ b/aten/src/ATen/mkl/SparseBlas.cpp @@ -253,6 +253,39 @@ void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)) { ldc)); } +#if !defined(_WIN32) +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(float)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_s_spmmd( + operation, A, B, layout, C, ldc)); +} +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(double)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_d_spmmd( + operation, A, B, layout, C, ldc)); +} +template <> +void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_c_spmmd( + operation, + A, + B, + layout, + reinterpret_cast(C), + ldc)); +} +template <> +void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)) { + TORCH_MKLSPARSE_CHECK(mkl_sparse_z_spmmd( + operation, + A, + B, + layout, + reinterpret_cast(C), + ldc)); +} +#endif + template <> void trsv(MKL_SPARSE_TRSV_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_trsv(operation, alpha, A, descr, x, y)); diff --git a/aten/src/ATen/mkl/SparseBlas.h b/aten/src/ATen/mkl/SparseBlas.h index 7281b6950611..20fb59a54ff9 100644 --- a/aten/src/ATen/mkl/SparseBlas.h +++ b/aten/src/ATen/mkl/SparseBlas.h @@ -157,6 +157,29 @@ void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)); template <> void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)); +#define MKL_SPARSE_SPMMD_ARGTYPES(scalar_t) \ + const sparse_operation_t operation, const sparse_matrix_t A, \ + const sparse_matrix_t B, const sparse_layout_t layout, scalar_t *C, \ + const MKL_INT ldc + +template +inline void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::mkl::sparse::spmmd: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(float)); +template <> +void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(double)); +template <> +void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)); +template <> +void spmmd>( + MKL_SPARSE_SPMMD_ARGTYPES(c10::complex)); + #define MKL_SPARSE_TRSV_ARGTYPES(scalar_t) \ const sparse_operation_t operation, const scalar_t alpha, \ const sparse_matrix_t A, const struct matrix_descr descr, \ diff --git a/aten/src/ATen/mkl/SparseDescriptors.h b/aten/src/ATen/mkl/SparseDescriptors.h index 46d656898a8d..e0dfb158e356 100644 --- a/aten/src/ATen/mkl/SparseDescriptors.h +++ b/aten/src/ATen/mkl/SparseDescriptors.h @@ -76,7 +76,7 @@ class MklSparseCsrDescriptor : public MklSparseDescriptor { public: MklSparseCsrDescriptor(const Tensor& input) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY((input.layout() == kSparseCsr || input.layout() == kSparseBsr)); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.dim() == 2); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( @@ -100,8 +100,10 @@ class MklSparseCsrDescriptor sparse_matrix_t raw_descriptor; - // Assuming that the last two dimensions are block elements of the matrix - if (values.dim() == 3) { + if (input.layout() == kSparseBsr) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + values.dim() == 3 && crow_indices.dim() == 1 && + col_indices.dim() == 1); TORCH_CHECK( values.size(-1) == values.size(-2), "MKL Sparse doesn't support 
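Returning to jiterator_macros.h above: jiterator_also_stringify_as relies on ordinary preprocessor stringification so one copy of the source is both compiled and captured as an NVRTC input string. A minimal standalone demonstration of that trick (the macro and function names here are illustrative):

#include <string>

#define DEMO_STRINGIFY(...) #__VA_ARGS__

// `twice` is compiled normally, and the same tokens are also kept as text.
template <typename T> T twice(T x) { return x + x; }
static const std::string twice_string =
    DEMO_STRINGIFY(template <typename T> T twice(T x) { return x + x; });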
matrices with non-square blocks."); diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp new file mode 100644 index 000000000000..fd2b0b0e536a --- /dev/null +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -0,0 +1,119 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include + +#define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" +#define MPS_ERROR_RUNTIME_TOO_LOW \ + "The MPS backend is supported on MacOS 12.3+.", \ + "Current OS version can be queried using `sw_vers`" + +namespace at { namespace detail { +TensorBase empty_mps( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { +#if defined(__APPLE__) +#if __is_target_os(macOS) + if (__builtin_available(macOS 12.3, *)) { + auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == DeviceType::MPS); + + TORCH_CHECK_NOT_IMPLEMENTED( + layout_or_default(layout_opt) == Layout::Strided, + "strided tensors not supported yet"); + check_size_nonnegative(size); + + auto* allocator = at::mps::GetMPSAllocator(); + int64_t nelements = c10::multiply_integers(size); + auto dtype = dtype_or_default(dtype_opt); + auto dtype_meta = scalarTypeToTypeMeta(dtype); + int64_t size_bytes = nelements * dtype_meta.itemsize(); + auto storage_impl = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + size_bytes, + allocator->allocate(size_bytes), + allocator, + /*resizeable=*/true); + + auto tensor = + detail::make_tensor(storage_impl, DispatchKey::MPS, dtype_meta); + // Default TensorImpl has size [0] + if (size.size() != 1 || size[0] != 0) { + tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); + } + + auto memory_format = memory_format_opt.value_or(MemoryFormat::Contiguous); + tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format); + return tensor; + } else { + TORCH_CHECK(false, MPS_ERROR_RUNTIME_TOO_LOW) + } +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +} + +TensorBase empty_mps( + IntArrayRef size, const TensorOptions &options) { + return at::detail::empty_mps( + size, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt(), + options.memory_format_opt()); +} + +TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + c10::optional device_opt) { +#if defined(__APPLE__) +#if __is_target_os(macOS) + if (__builtin_available(macOS 12.3, *)) { + auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT(device.is_mps()); + const DeviceGuard device_guard(device); + auto* allocator = at::mps::GetMPSAllocator(); + constexpr c10::DispatchKeySet mps_dks(c10::DispatchKey::MPS); + return at::detail::empty_strided_generic( + size, stride, allocator, mps_dks, dtype); + } else { + TORCH_CHECK(false, MPS_ERROR_RUNTIME_TOO_LOW) + } +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +#else + TORCH_CHECK(false, MPS_ERROR_NOT_COMPILED) +#endif +} + +TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions &options) { + return at::native::empty_strided_mps( + size, + stride, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); +} + +} // namespace detail +} // namespace at diff --git 
a/aten/src/ATen/mps/EmptyTensor.h b/aten/src/ATen/mps/EmptyTensor.h new file mode 100644 index 000000000000..fcdb7e152da9 --- /dev/null +++ b/aten/src/ATen/mps/EmptyTensor.h @@ -0,0 +1,31 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at { +namespace detail { + +C10_EXPORT TensorBase empty_mps( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt); +C10_EXPORT TensorBase empty_mps( + IntArrayRef size, const TensorOptions &options); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + c10::optional device_opt); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions &options); + +} // namespace detail +} // namespace at diff --git a/aten/src/ATen/mps/MPSAllocator.h b/aten/src/ATen/mps/MPSAllocator.h new file mode 100644 index 000000000000..7e3d3a653517 --- /dev/null +++ b/aten/src/ATen/mps/MPSAllocator.h @@ -0,0 +1,244 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#include +#endif + +// this implementation is based on CUDACachingAllocator. +// It utilizes Metal Heaps to improve the performance with buffer allocation. +// TODO: Unify the logic with CUDACachingAllocator and remove redundant code. +namespace at { +namespace mps { + +namespace HeapAllocator { + +#define MB(x) round_page(x * 1048576UL) + +static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB +static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap +static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps +static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps +static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB + +// TODO: check the caching performance of write-combined mode +constexpr MTLResourceOptions kCPUCacheMode = MTLResourceOptionCPUCacheModeDefault; +constexpr MTLResourceOptions kPrivateResourceOptions = kCPUCacheMode | MTLResourceStorageModePrivate; +constexpr MTLResourceOptions kSharedResourceOptions = kCPUCacheMode | MTLResourceStorageModeShared; + +struct HeapBlock; + +struct BufferBlock +{ + id buffer; + size_t size; + bool in_use; + HeapBlock* heap; + id_t buf_id; + + BufferBlock(size_t Size, const id Buffer = nullptr, HeapBlock* Heap = nullptr, id_t BufID = 0) : + buffer(Buffer), size(Size), in_use(false), heap(Heap), buf_id(BufID) { } + + static bool Comparator(const BufferBlock* a, const BufferBlock* b) { + return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer; + } + static size_t alignUp(size_t Size, size_t Alignment) { + assert(((Alignment - 1) & Alignment) == 0); + return ((Size + Alignment - 1) & ~(Alignment - 1)); + } +}; +typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*); + +struct BufferPool; + +struct HeapBlock +{ + id heap; + struct { size_t total, available; } size; + BufferPool* pool; + unsigned int n_buffers; + + HeapBlock(size_t Size, const id Heap = nullptr, BufferPool *Pool = nullptr) : + heap(Heap), size({.total = Size, .available = Size}), pool(Pool), n_buffers(0) { } + + static MTLResourceOptions getOptions(bool SharedStorage = false) { return SharedStorage ? 
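BufferBlock::alignUp in MPSAllocator.h above rounds a requested size up to a power-of-two alignment with the usual mask trick. A standalone restatement with worked values:

#include <cassert>
#include <cstddef>

// Rounds `size` up to the next multiple of `alignment` (a power of two).
static size_t align_up(size_t size, size_t alignment) {
  assert(((alignment - 1) & alignment) == 0);    // power-of-two check, as above
  return (size + alignment - 1) & ~(alignment - 1);
}
// align_up(1000, 256) == 1024;  align_up(4096, 4096) == 4096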
kSharedResourceOptions : kPrivateResourceOptions; } + + static id createMTLHeap(id device, size_t size, bool is_shared) { + id heap = nil; + MTLHeapDescriptor *d = [MTLHeapDescriptor new]; + if (d) { + if (size <= kMaxSmallAlloc) { + d.size = kSmallHeap; + } else if (size < kMinLargeAlloc) { + d.size = kLargeHeap; + } else { + d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + d.storageMode = is_shared ? MTLStorageModeShared : MTLStorageModePrivate; + d.cpuCacheMode = MTLCPUCacheModeDefaultCache; + // this automatically handles Metal buffer access synchronizations at the + // cost of slightly lower performance. + d.hazardTrackingMode = MTLHazardTrackingModeTracked; + d.resourceOptions = getOptions(is_shared) | (MTLHazardTrackingModeTracked << MTLResourceHazardTrackingModeShift); + d.type = MTLHeapTypeAutomatic; + heap = [device newHeapWithDescriptor: d]; + if (heap) { + [heap setPurgeableState:MTLPurgeableStateEmpty]; + } + [d release]; + } + return heap; + } + static bool Comparator(const HeapBlock* a, const HeapBlock* b) { + return a->size.available < b->size.available; + } + static NSUInteger heapAvailableSize(id heap, size_t Alignment = vm_page_size) { + return [heap maxAvailableSizeWithAlignment:Alignment]; + } + id newMTLBuffer(size_t length, bool is_shared) { + id buf = [heap newBufferWithLength:length options:getOptions(is_shared)]; + if (buf) { + size.available = heapAvailableSize(heap); + n_buffers++; + } + return buf; + } + void releaseMTLBuffer(id buffer) { + [buffer release]; + size.available = heapAvailableSize(heap); + n_buffers--; + } + void releaseMTLHeap() { + TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty + [heap release]; + size.available = 0; + } +}; +typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*); + +struct BufferPool +{ + BufferPool(const id Device, bool Small, bool Shared) : + device(Device), is_small(Small), is_shared(Shared), + heaps(HeapBlock::Comparator), buffers(BufferBlock::Comparator) { } + + const id device; + // small heaps have sizes of kSmallHeap, and large ones kLargeHeap + const bool is_small; + // private pools allocated on device memory; otherwise, shared between host/device + const bool is_shared; + // list of heaps ordered by their "available" (not total) memory size + std::set heaps; + // list of only "available" buffers in the pool (i.e., buffers not in-use) + std::set buffers; +}; + +struct AllocParams +{ + AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) : + search_key(Alloc_Size), pool(Pool), + buffer_block(nullptr), requested_size(Requested_Size) {} + size_t size() const { return search_key.size; } + + BufferBlock search_key; + BufferPool* pool; + BufferBlock* buffer_block; + size_t requested_size; +}; + +class MPSHeapAllocatorImpl +{ +public: + explicit MPSHeapAllocatorImpl() : + m_device(at::mps::MPSDevice::getInstance()->device()), + m_large_pool_shared(m_device, false, true), m_large_pool_private(m_device, false, false), + m_small_pool_shared(m_device, true , true), m_small_pool_private(m_device, true , false), + m_total_allocated_memory(0), m_max_buffer_size([m_device maxBufferLength]), + m_set_fraction(false), m_enable_debug_info(false) { } + + // interface exposed to at::Allocator + id Malloc(size_t size, bool sharedStorage); + void Free(void* ptr); + void EmptyCache(); + bool isSharedBuffer(void* ptr); + + inline id Device() const { return m_device; } + void enable_debug_info() { m_enable_debug_info = true; } + bool debug_info_enabled() const { return 
m_enable_debug_info; } + void set_shared_storage_mode(bool useSharedStorage); + +private: + const id m_device; + std::mutex m_mutex; + // allocated buffers by device pointer + ska::flat_hash_map m_allocated_buffers; + // unallocated cached buffers larger than 1 MB + BufferPool m_large_pool_shared, m_large_pool_private; + // unallocated cached buffers 1 MB or smaller + BufferPool m_small_pool_shared, m_small_pool_private; + // total memory allocated by HeapAllocator + size_t m_total_allocated_memory; + // max buffer size allowed by Metal + size_t m_max_buffer_size; + // sets a soft upper bound to limit the total allocations + bool m_set_fraction; + // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to enable debug info + bool m_enable_debug_info; + + HeapBlock* get_free_heap(AllocParams& p); + bool get_free_buffer(AllocParams& p); + BufferBlock* get_allocated_buffer_block(void* ptr); + bool alloc_buffer(AllocParams& p); + void free_buffer(BufferBlock* buffer_block); + void release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true); + void release_buffers(BufferPool& pool); + bool release_available_cached_buffers(const AllocParams& p); + bool release_cached_buffers(); + + BufferPool& get_pool(size_t Size, bool useShared) { + return Size <= kMaxSmallAlloc ? (useShared ? m_small_pool_shared : m_small_pool_private) : + (useShared ? m_large_pool_shared : m_large_pool_private); + } + + size_t get_allocation_size(size_t Length, bool useShared) { + MTLSizeAndAlign sizeAlign = [m_device heapBufferSizeAndAlignWithLength:Length + options:HeapBlock::getOptions(useShared)]; + return BufferBlock::alignUp(sizeAlign.size, sizeAlign.align); + } + // TODO: make this configurable + static size_t max_split_size() { return std::numeric_limits::max(); } + // maximum size of device memory available for allocation in current process + size_t max_available_size() const { return [m_device recommendedMaxWorkingSetSize] - [m_device currentAllocatedSize]; } + + // TODO: make a common function to do size unit conversions in PyTorch. + static std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024UL) { os << size << " bytes"; } + else if (size <= 1048576UL) { os << ((float) size / 1024.0) << " KB"; } + else if (size <= 1073741824UL) { os << ((float) size / 1048576.0) << " MB"; } + else { os << ((float) size / 1073741824.0) << " GB"; } + return os.str(); + } +}; + +} // namespace HeapAllocator + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm new file mode 100644 index 000000000000..0c30af5c36b5 --- /dev/null +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -0,0 +1,351 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include + +namespace at { +namespace mps { + +namespace HeapAllocator { + +HeapBlock* MPSHeapAllocatorImpl::get_free_heap(AllocParams& p) +{ + BufferPool *pool = p.pool; + HeapBlock *heapBlock = nullptr; + HeapBlock search_key(p.size()); + + auto it = pool->heaps.lower_bound(&search_key); + if (it == pool->heaps.end()) { + id heap = HeapBlock::createMTLHeap(pool->device, p.size(), pool->is_shared); + if (heap) { + size_t heap_size = HeapBlock::heapAvailableSize(heap); + heapBlock = new HeapBlock(heap_size, heap, pool); + + if (debug_info_enabled()) { + static unsigned int heap_counter = 0; + std::cerr << "\nAllocated " + << (pool->is_small ? "small " : "large ") + << (pool->is_shared ? 
"shared " : "private ") + << "heap of size " << format_size(heap_size) + << " (#heaps: " << (++heap_counter) + << ", free memory: " << format_size(max_available_size()) << ")\n"; + } + } + } else { + heapBlock = *it; + // remove and re-insert heap in the set later after a buffer is created. + // this ensures updating the order of heaps based on their new available sizes + pool->heaps.erase(it); + } + return heapBlock; +} + +bool MPSHeapAllocatorImpl::alloc_buffer(AllocParams& p) +{ + if (m_set_fraction && m_total_allocated_memory + p.size() > max_available_size()) + return false; + + HeapBlock *heap = get_free_heap(p); + if (!heap) + return false; // this will cause releasing pool buffers to free up memory + + id buffer = heap->newMTLBuffer(p.size(), p.pool->is_shared); + // this should never happen as the backing memory (i.e., heap) was allocated successfully. + TORCH_INTERNAL_ASSERT(buffer); + // insert heap after a buffer was created on it to update the order of heap's set + p.pool->heaps.insert(heap); + p.buffer_block = new BufferBlock(p.size(), buffer, heap, m_allocated_buffers.size() + 1); + m_allocated_buffers[p.buffer_block->buffer] = p.buffer_block; + m_total_allocated_memory += p.size(); + + if (debug_info_enabled()) { + std::cerr << "Allocated " + << (p.pool->is_shared ? "shared" : "private") + << " buffer #" << p.buffer_block->buf_id + << " with aligned size " << format_size(p.size()) + << " (requested size: " << format_size(p.requested_size) + << ", heap size: " << format_size(heap->size.available) + << ", total allocated: " << format_size(m_total_allocated_memory) << ")\n"; + } + return true; +} + +bool MPSHeapAllocatorImpl::get_free_buffer(AllocParams& p) +{ + BufferPool& pool = *p.pool; + auto it = pool.buffers.lower_bound(&p.search_key); + if (it == pool.buffers.end()) + return false; + // do not return an oversized buffer for a large request + // allow oversized buffer size to be rounded up but within a limit + if ((p.size() < max_split_size() && (*it)->size >= max_split_size()) || + ((p.size() >= max_split_size()) && ((*it)->size >= p.size() + kLargeHeap))) + return false; + + p.buffer_block = *it; + pool.buffers.erase(it); + if (debug_info_enabled()) { + std::cerr << "Reusing " + << (p.pool->is_shared ? "shared" : "private") + << " buffer #" << p.buffer_block->buf_id + << " with aligned size " << format_size(p.buffer_block->size) + << " (requested size: " << format_size(p.requested_size) << ")\n"; + } + return true; +} + +id MPSHeapAllocatorImpl::Malloc(size_t size, bool sharedStorage) +{ + TORCH_CHECK(size < m_max_buffer_size, "Invalid buffer size: ", format_size(size)); + + std::lock_guard lock(m_mutex); + __block id buf = nil; + + size_t alloc_size = get_allocation_size(size, sharedStorage); + auto& pool = get_pool(alloc_size, sharedStorage); + AllocParams params(alloc_size, size, &pool); + + bool block_found = + // Search pool + get_free_buffer(params) || + // Attempt allocate + alloc_buffer(params) || + // Free enough available cached blocks to satisfy alloc and retry alloc. + (release_available_cached_buffers(params) && alloc_buffer(params)) || + // Free all non-split cached buffers and retry alloc. 
+ (release_cached_buffers() && alloc_buffer(params)); + + BufferBlock* buffer_block = params.buffer_block; + TORCH_INTERNAL_ASSERT(block_found && buffer_block); + buffer_block->in_use = true; + return buffer_block->buffer; +} + +void MPSHeapAllocatorImpl::free_buffer(BufferBlock* buffer_block) +{ + TORCH_INTERNAL_ASSERT(buffer_block->in_use); + buffer_block->in_use = false; + BufferPool *pool = buffer_block->heap->pool; + // Makes sure the BufferBlock* isn't already present in the pool we're freeing it back into. + TORCH_INTERNAL_ASSERT(pool->buffers.insert(buffer_block).second); +} + +BufferBlock* MPSHeapAllocatorImpl::get_allocated_buffer_block(void* ptr) +{ + id buf = __builtin_bit_cast(id, ptr); + auto it = m_allocated_buffers.find(buf); + if (it == m_allocated_buffers.end()) + return nullptr; + + return it->second; +} + +bool MPSHeapAllocatorImpl::isSharedBuffer(void* ptr) +{ + std::lock_guard lock(m_mutex); + + BufferBlock *buffer_block = get_allocated_buffer_block(ptr); + // it's OK for the buffer_block to not exist yet + return buffer_block && buffer_block->heap->pool->is_shared; +} + +void MPSHeapAllocatorImpl::Free(void* ptr) +{ + std::lock_guard lock(m_mutex); + + BufferBlock *buffer_block = get_allocated_buffer_block(ptr); + TORCH_INTERNAL_ASSERT(buffer_block); + free_buffer(buffer_block); +} + +void MPSHeapAllocatorImpl::EmptyCache() +{ + std::lock_guard lock(m_mutex); + release_cached_buffers(); +} + +void MPSHeapAllocatorImpl::release_buffer(BufferBlock* buffer_block, bool remove_empty_heap) +{ + HeapBlock *heap = buffer_block->heap; + BufferPool *pool = heap->pool; + m_total_allocated_memory -= buffer_block->size; + m_allocated_buffers.erase(buffer_block->buffer); + pool->buffers.erase(buffer_block); + // will re-insert later to keep the heaps list sorted based on heap's new available size (if heap not empty) + pool->heaps.erase(heap); + heap->releaseMTLBuffer(buffer_block->buffer); + if (debug_info_enabled()) { + std::cerr << "Released buffer #" << buffer_block->buf_id + << " of size " << format_size(buffer_block->size) + << " (heap size: " << format_size(heap->size.available) + << ", total allocated: " << format_size(m_total_allocated_memory) << ")\n"; + + } + delete buffer_block; + + if (remove_empty_heap && heap->n_buffers == 0) { + heap->releaseMTLHeap(); + if (debug_info_enabled()) { + std::cerr << "Released heap of size " << format_size(heap->size.total) + << " (free memory: " << format_size(max_available_size()) << ")\n"; + } + delete heap; + } else { + pool->heaps.insert(heap); + } +} + +void MPSHeapAllocatorImpl::release_buffers(BufferPool& pool) +{ + auto it = pool.buffers.begin(); + while (it != pool.buffers.end()) { + BufferBlock* buffer_block = *it; + ++it; + release_buffer(buffer_block); + } +} + +bool MPSHeapAllocatorImpl::release_available_cached_buffers(const AllocParams& p) +{ + BufferPool& pool = *p.pool; + + if (max_split_size() == std::numeric_limits::max() || pool.buffers.empty()) + return false; + + BufferBlock key = p.search_key; + key.size = (key.size < max_split_size()) ? 
max_split_size() : key.size; + auto it = pool.buffers.lower_bound(&key); + if (it == pool.buffers.end()) { + size_t totalReleased = 0; + --it; + while ((totalReleased < key.size) && ((*it)->size >= max_split_size())) { + auto cur = it; + totalReleased += (*it)->size; + if (it != pool.buffers.begin()) { + --it; + release_buffer(*cur); + } else { + release_buffer(*cur); + break; + } + } + if (totalReleased < key.size) + return false; + } else { + release_buffer(*it); + } + return true; +} + +bool MPSHeapAllocatorImpl::release_cached_buffers() +{ + // Free all cached blocks to system allocator + release_buffers(m_large_pool_private); + release_buffers(m_large_pool_shared); + release_buffers(m_small_pool_private); + release_buffers(m_small_pool_shared); + return true; +} + +} // namespace HeapAllocator + +// Use "at::mps::GetMPSAllocator()" to acquire a handle to MPS Allocator +static HeapAllocator::MPSHeapAllocatorImpl s_allocatorImpl; + +// MPS allocator struct to be registered with Pytorch +struct TORCH_API MPSAllocator final : public at::Allocator { +public: + explicit MPSAllocator(bool useSharedStorage) : + m_has_unified_memory(s_allocatorImpl.Device().hasUnifiedMemory), m_use_shared_storage(useSharedStorage) + { + const bool enable_debug_info = isEnvVarEnabled("PYTORCH_DEBUG_MPS_ALLOCATOR"); + if (enable_debug_info) { + s_allocatorImpl.enable_debug_info(); + if (!m_use_shared_storage || m_has_unified_memory) { + std::cerr << "Initializing " + << (useSharedStorage ? "shared" : "private") + << " heap allocator on " + << (m_has_unified_memory ? "unified" : "discrete") + << " device memory of size " + << s_allocatorImpl.Device().recommendedMaxWorkingSetSize / 1048576UL << " MB\n"; + } + } + } + + ~MPSAllocator() override { + s_allocatorImpl.EmptyCache(); + } + + DataPtr allocate(const size_t nbytes) const override { + __block id buf = nbytes > 0 ? s_allocatorImpl.Malloc(nbytes, m_use_shared_storage) : nullptr; + return { buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)}; + } + + DeleterFnPtr raw_deleter() const override { return &Delete; } + bool is_shared(void* ptr) const { return s_allocatorImpl.isSharedBuffer(ptr); } + bool is_shared_storge_supported() const { return m_has_unified_memory; } + +private: + bool m_has_unified_memory; + // use shared buffers on unified memory + bool m_use_shared_storage; + + static void Delete(void* ptr) { if (ptr) s_allocatorImpl.Free(ptr); } + + static bool isEnvVarEnabled(const char *envvar) { + const char *e = getenv(envvar); + if (e) { + char *t = (char*) e; + long val = strtol(e, &t, 0); + return (t != e && val != 0); + } + return false; + } +}; + +static MPSAllocator s_mps_shared_alloc(true); +at::Allocator* getMPSSharedAllocator() +{ + if (s_mps_shared_alloc.is_shared_storge_supported()) + return &s_mps_shared_alloc; + + return nullptr; +} + +} // namespace mps + +namespace native { + +// torch.is_pinned() implementation +// Pinned memory will be helpful on Apple Silicon Macs with Unified memory as we +// will be able to use SharedStorageMode for MTLBuffer allocations. This will +// avoid extra copies on DataLoading operations. 
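[Editor's note -- not part of the patch.] A minimal sketch of how the shared-storage path described above can be exercised through the GetMPSAllocator() entry point declared in the MPSDevice.h header added in this diff. The function name pinned_buffer_example, the 4096-byte size, and the exact includes are assumptions for illustration only.

#include <ATen/mps/MPSDevice.h>  // declares at::mps::GetMPSAllocator() (added in this diff)

void pinned_buffer_example() {
  // Ask for the shared-storage allocator; per GetMPSAllocator() in this diff it
  // forwards to getMPSSharedAllocator(), which is nullptr without unified memory.
  auto* alloc = at::mps::GetMPSAllocator(/*useSharedAllocator=*/true);
  if (alloc == nullptr) {
    return;  // discrete GPU: MTLStorageModeShared buffers are not available
  }
  // The returned DataPtr is backed by an MTLBuffer in shared storage mode, so the
  // same memory is visible to CPU and GPU -- this is what lets _pin_memory_mps
  // below hand the storage to the shared allocator instead of copying.
  auto data_ptr = alloc->allocate(4096);
}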
+bool is_pinned_mps(const Tensor& self, c10::optional device) +{ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps()); + return at::mps::s_mps_shared_alloc.is_shared(self.storage().data()); +} + +// torch.pin_memory() implementation +Tensor _pin_memory_mps(const Tensor& self, c10::optional device) +{ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps()); + auto* shared_allocator = at::mps::getMPSSharedAllocator(); + TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device"); + + const size_t storage_size = detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize()); + std::cout << "Pinning memory of size " << storage_size / 1024UL << " KB\n"; + auto storage = Storage(Storage::use_byte_size_t(), storage_size, shared_allocator, false); + auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + +} // namespace native + +static mps::MPSAllocator s_mps_private_alloc(false); +REGISTER_ALLOCATOR(DeviceType::MPS, &s_mps_private_alloc); + +} // namespace at diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h new file mode 100644 index 000000000000..a4a4b869b44c --- /dev/null +++ b/aten/src/ATen/mps/MPSDevice.h @@ -0,0 +1,62 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include + + +#ifdef __OBJC__ +#include +#include +#include +typedef id MTLDevice_t; +#else +typedef void* MTLDevice; +typedef void* MTLDevice_t; +#endif + +using namespace std; + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSDevice +// +// MPSDevice is a singleton class that returns the default device +//----------------------------------------------------------------- + +class TORCH_API MPSDevice { + public: + /** + * MPSDevice should not be cloneable. + */ + MPSDevice(MPSDevice& other) = delete; + /** + * MPSDevice should not be assignable. + */ + void operator=(const MPSDevice&) = delete; + /** + * Gets single instance of the Device. + */ + static MPSDevice* getInstance(); + /** + * Returns the single device. + */ + MTLDevice_t device() { + return _mtl_device; + } + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MPSDevice(); +}; + +at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm new file mode 100644 index 000000000000..8ade0a1f7817 --- /dev/null +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -0,0 +1,62 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include + +namespace at { +namespace mps { + +static std::unique_ptr mps_device; +static std::once_flag mpsdev_init; + +MPSDevice* MPSDevice::getInstance() { + std::call_once(mpsdev_init, [] { + mps_device = std::unique_ptr(new MPSDevice()); + }); + return mps_device.get(); +} + +MPSDevice::~MPSDevice() { + [_mtl_device release]; + _mtl_device = nil; +} + +MPSDevice::MPSDevice() { + NSArray* devices = MTLCopyAllDevices(); + for (unsigned long i = 0 ; i < [devices count] ; i++) { + id device = devices[i]; + if(![device isLowPower]) { // exclude Intel GPUs + _mtl_device = device; + break; + } + } + assert(_mtl_device); +} + +at::Allocator* getMPSSharedAllocator(); +at::Allocator* GetMPSAllocator(bool useSharedAllocator) { + return useSharedAllocator ? 
getMPSSharedAllocator() : GetAllocator(DeviceType::MPS); +} + +} // namespace mps + +TORCH_LIBRARY_IMPL(aten, MPS, m) { + m.impl("bitwise_and.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("embedding_renorm_", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("linalg_svd", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("linalg_svd.U", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("repeat_interleave.Tensor", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("repeat_interleave.self_Tensor", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("repeat_interleave.self_int", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("_fft_c2c", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("sgn.out", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("nonzero", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); + m.impl("masked_select", torch::CppFunction::makeFromBoxedFunction<&native::cpu_fallback>()); +} + +} // namespace at diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h new file mode 100644 index 000000000000..27d32bf652e7 --- /dev/null +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -0,0 +1,171 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace mps { + +// TODO: Move the MPSGuardImpl to inherit from NoOpDeviceGuardImpl +// https://github.com/pytorch/pytorch/issues/77170 +struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::MPS; + + // constructor + MPSGuardImpl() {} + explicit MPSGuardImpl(DeviceType t) { + TORCH_INTERNAL_ASSERT(t == DeviceType::MPS); + } + + // returns the type + DeviceType type() const override { + return DeviceType::MPS; + } + + Device exchangeDevice(Device d) const override { + return Device(DeviceType::MPS, 0); + } + + Device getDevice() const override { + return Device(DeviceType::MPS, 0); + } + + c10::optional uncheckedGetDevice() const noexcept { + return Device(DeviceType::MPS, 0); + } + + void setDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_mps()); + } + + void uncheckedSetDevice(Device d) const noexcept override { + // TODO: Currently setting only device 0 + } + + Stream getStream(Device d) const noexcept override { + return Stream(Stream::DEFAULT, Device(DeviceType::MPS, 0)); + } + + Stream getDefaultStream(Device d) const override { + return Stream(Stream::DEFAULT, Device(DeviceType::MPS, 0)); + } + + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const noexcept override { + return Stream(Stream::DEFAULT, Device(DeviceType::MPS, 0)); + } + DeviceIndex deviceCount() const noexcept override { + if (at::hasMPS()) { + //TODO: extend it for multi-device case + return 1; + } else { + return 0; + } + } + + // Event-related functions + void createEvent( + mpsEvent_t* event, + const EventFlag flag) const; + + void destroyEvent( + void* 
event, + const DeviceIndex device_index) const noexcept override; + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override; + + void block( + void* event, + const Stream& stream) const override; + + bool queryEvent(void* event) const override; + +}; + +/// A variant of OptionalDeviceGuard that is specialized for MPS. +struct OptionalMPSGuard { + explicit OptionalMPSGuard() : guard_() {} + + explicit OptionalMPSGuard(optional device_opt) + : guard_(device_opt) {} + + /// Set the current MPS device to the passed device index, if it is not + /// nullopt + explicit OptionalMPSGuard(optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalMPSGuard(const OptionalMPSGuard&) = delete; + OptionalMPSGuard& operator=(const OptionalMPSGuard&) = delete; + OptionalMPSGuard(OptionalMPSGuard&& other) = delete; + OptionalMPSGuard& operator=(OptionalMPSGuard&& other) = delete; + + /// Sets the MPS device to the given device, initializing the guard if it + /// is not already initialized. Errors if the given device is not a MPS + /// device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the MPS device to the given device, initializing the guard if it is + /// not already initialized. Errors if the given device is not a MPS device. + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the MPS device to the given device index, initializing the guard if + /// it is not already initialized. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. + optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original MPS device, resetting this guard to uninitialized + /// state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + + +C10_REGISTER_GUARD_IMPL(MPS, MPSGuardImpl); + +}} // namespace at::mps diff --git a/aten/src/ATen/mps/MPSGuardImpl.mm b/aten/src/ATen/mps/MPSGuardImpl.mm new file mode 100644 index 000000000000..c2987fdaa3e7 --- /dev/null +++ b/aten/src/ATen/mps/MPSGuardImpl.mm @@ -0,0 +1,60 @@ +// Copyright © 2022 Apple Inc. + +#include +#include + +namespace at { +namespace mps { + + void MPSGuardImpl::createEvent( + mpsEvent_t* event, + const EventFlag flag) const { + id mtl_device = MPSDevice::getInstance()->device(); + // when static casting we already create an _event object. 
+ auto mps_event = static_cast(*event); + } + + void MPSGuardImpl::destroyEvent( + void* event, + const DeviceIndex device_index) const noexcept { + if (!event) return; + auto mps_event = static_cast(event); + mps_event->~MPSEvent(); + + } + + void MPSGuardImpl::record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const { + + TORCH_CHECK(device_index == -1 || device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + auto mps_event = static_cast(*event); + MPSStream mps_stream{stream}; + mps_event->recordEvent(&mps_stream); + } + + void MPSGuardImpl::block( + void* event, + const Stream& stream) const { + + auto mps_event = static_cast(event); + MPSStream mps_stream{stream}; + + mps_event->waitForEvent(&mps_stream); + } + + bool MPSGuardImpl::queryEvent(void* event) const { + auto mps_event = static_cast(event); + return mps_event->queryEvent(); + } + +} +} diff --git a/aten/src/ATen/mps/MPSStream.h b/aten/src/ATen/mps/MPSStream.h new file mode 100644 index 000000000000..1c19c42b7d77 --- /dev/null +++ b/aten/src/ATen/mps/MPSStream.h @@ -0,0 +1,134 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef id MTLCommandQueue_t; +typedef id MTLCommandBuffer_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +#else +typedef void* MTLCommandQueue_t; +typedef void* MTLCommandQueue; +typedef void* MTLCommandBuffer_t; +typedef void* MTLCommandBuffer; +typedef void* MTLSharedEvent_t; +typedef void* dispatch_queue_t; +typedef void* MTLDevice_t; +#define nil NULL; +#endif + + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +class TORCH_API MPSStream +{ +public: + enum Unchecked { UNCHECKED }; + /// Construct a MPSStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. + explicit MPSStream(Stream stream); + + ~MPSStream(); + MTLCommandQueue_t commandQueue() const { return _commandQueue; }; + dispatch_queue_t queue() const { return _serialQueue; } + + MTLCommandBuffer_t commandBuffer(); + void commit(bool flush); + void commitAndWait(); + void synchronize(); + + void flush(); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { return _stream.device_index(); } + + MTLCommandQueue_t stream() const { return _commandQueue; }; + + MTLDevice_t device() const { return [_commandQueue device];} + + /// Explicit conversion to Stream. + Stream unwrap() const { return _stream; } + +private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MTLCommandBuffer_t _commandBuffer = nil; + void _flush(bool commitAndWait) const; + + dispatch_queue_t _serialQueue = nullptr; +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl +{ + public: + /** + * Gets single instance of the MPSStream. 
+ */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +struct TORCH_API MPSEvent +{ + MPSEvent(); + // MPSEvent(id device); + + ~MPSEvent(); + MTLSharedEvent_t event() const {return _event; } + + void recordEvent(MPSStream *stream); + void waitForEvent(MPSStream *queue); // waits on the cpu + bool queryEvent(); + uint64_t getCurrentValue() { return _currentValue; } + void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } +private: + bool _isRecorded = false; + uint64_t _currentValue = 0; + MTLSharedEvent_t _event; +}; + +typedef MPSEvent* mpsEvent_t; + + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm new file mode 100644 index 000000000000..7d1d346f1755 --- /dev/null +++ b/aten/src/ATen/mps/MPSStream.mm @@ -0,0 +1,139 @@ +// Copyright © 2022 Apple Inc. + +#include + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +MPSStream::MPSStream(Stream stream) : _stream(stream) { + _commandQueue = [MPSDevice::getInstance()->device() newCommandQueue]; + TORCH_CHECK(_stream.device_type() == DeviceType::MPS); + _serialQueue = dispatch_queue_create("metal gpu stream", NULL); +} + +MPSStream::~MPSStream() { + [_commandQueue autorelease]; + _commandQueue = nil; + + assert(_commandBuffer == nil); +} + +id MPSStream::commandBuffer() { + if (!_commandBuffer) { + _commandBuffer = + [MPSCommandBuffer commandBufferFromCommandQueue:_commandQueue].retain; + } + + return _commandBuffer; +} + +void MPSStream::synchronize() { + dispatch_sync(queue(), ^() { + @autoreleasepool { + commandBuffer(); + commitAndWait(); + } + }); +} + +void MPSStream::commit(bool doFlush) { + if (doFlush) { + flush(); + } +} + +void MPSStream::commitAndWait() { + assert(_commandBuffer); + [_commandBuffer commit]; + [_commandBuffer waitUntilCompleted]; + [_commandBuffer release]; + _commandBuffer = nil; +} + +void MPSStream::flush() { + if (_commandBuffer) { + [_commandBuffer commit]; + [_commandBuffer release]; + _commandBuffer = nil; + } +} + +void MPSStream::_flush(bool commitAndWait) const { + assert(_commandBuffer); + [_commandBuffer commit]; + if (commitAndWait) { + [_commandBuffer waitUntilCompleted]; + } + [_commandBuffer release]; +} + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +MPSStream* MPSStreamImpl::_stream = nullptr; + +MPSStream* MPSStreamImpl::getInstance() { + if (_stream == nullptr) { + _stream = + new MPSStream(Stream(Stream::UNSAFE, c10::Device(DeviceType::MPS), 0)); + } + return _stream; +} + +MPSStreamImpl::MPSStreamImpl() {} + +MPSStream* getCurrentMPSStream() { + return getDefaultMPSStream(); +} + +MPSStream* getDefaultMPSStream() { + return MPSStreamImpl::getInstance(); +} + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +MPSEvent::MPSEvent() { + _event = [MPSDevice::getInstance()->device() newSharedEvent]; +} + +MPSEvent::~MPSEvent() { + [_event release]; + _event = nil; +} + +void MPSEvent::recordEvent(MPSStream* stream) { + @autoreleasepool { + _isRecorded = true; + 
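  // [Editor's note -- not part of the patch] The dispatch_sync block that follows
  // runs on the stream's serial queue: it encodes a signal for _event at
  // _currentValue on the stream's current command buffer and commits it, so
  // queryEvent() reports completion once the GPU has executed past that point.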
dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + [commandBuffer encodeSignalEvent:_event value:_currentValue]; + stream->commit(true); + } + }); + } +} + +void MPSEvent::waitForEvent(MPSStream* stream) { + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + [commandBuffer encodeWaitForEvent:_event value:_currentValue]; + stream->commit(false); + } + }); +} + +bool MPSEvent::queryEvent() { + return !_isRecorded || (_event.signaledValue >= _currentValue); +} + +} // namespace mps +} // namespace at diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index ff79939830c7..f40c4aa3e823 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -164,12 +164,12 @@ TORCH_META_FUNC(softshrink_backward) ( build_borrowing_binary_op(maybe_get_output(), grad, self); } -TORCH_META_FUNC(gelu) (const Tensor & self) { +TORCH_META_FUNC(gelu) (const Tensor & self, c10::string_view approximate) { build_unary_op(maybe_get_output(), self); } TORCH_META_FUNC(gelu_backward) ( - const Tensor& grad, const Tensor& self + const Tensor& grad, const Tensor& self, c10::string_view approximate ) { build_borrowing_binary_op(maybe_get_output(), grad, self); } @@ -202,6 +202,8 @@ DEFINE_DISPATCH(silu_stub); DEFINE_DISPATCH(silu_backward_stub); DEFINE_DISPATCH(mish_stub); DEFINE_DISPATCH(mish_backward_stub); +DEFINE_DISPATCH(prelu_cpu_stub); +DEFINE_DISPATCH(prelu_backward_cpu_stub); TORCH_IMPL_FUNC(elu_out) ( const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result @@ -324,50 +326,68 @@ bool use_mkldnn(const Tensor& input) { } TORCH_IMPL_FUNC(gelu_out_cpu) ( - const Tensor& self, const Tensor& result + const Tensor& self, c10::string_view approximate, const Tensor& result ) { +auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() - if (use_mkldnn(self)) { + if (use_mkldnn(self) && (approximate_type == GeluType::None)) { const ideep::tensor& x = itensor_from_tensor(self); ideep::tensor y = itensor_from_tensor(result); ideep::eltwise_forward::compute( x, y, ideep::algorithm::eltwise_gelu_erf, ideep::prop_kind::forward_training, /*alpha*/ 0.0); } else { - GeluKernel(kCPU, *this); + GeluKernel(kCPU, *this, approximate_type); } #else - GeluKernel(kCPU, *this); + GeluKernel(kCPU, *this, approximate_type); #endif } TORCH_IMPL_FUNC(gelu_backward_out_cpu) ( - const Tensor& grad, const Tensor& self, const Tensor& grad_input + const Tensor& grad, const Tensor& self, c10::string_view approximate, const Tensor& grad_input ) { +auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() - if (use_mkldnn(self)) { + if (use_mkldnn(self) && (approximate_type == GeluType::None)) { const ideep::tensor& x = itensor_from_tensor(self); ideep::tensor grady = itensor_from_tensor(grad); ideep::tensor gradx = itensor_from_tensor(grad_input); ideep::eltwise_backward::compute(x, grady, gradx, ideep::algorithm::eltwise_gelu_erf, /*alpha*/ 0.0); } else { - GeluBackwardKernel(kCPU, *this); + GeluBackwardKernel(kCPU, *this, approximate_type); } #else - GeluBackwardKernel(kCPU, *this); + GeluBackwardKernel(kCPU, *this, approximate_type); #endif } Tensor hardtanh(const Tensor& self, const Scalar& min, const Scalar& max) { - return at::clamp(self, min, max); + Tensor result = at::empty_like(self); + return at::hardtanh_out(result, self, min, max); } Tensor& hardtanh_out(const Tensor& 
self, const Scalar& min, const Scalar& max, Tensor& result) { - return at::clamp_out(result, self, min, max); + TORCH_CHECK(self.scalar_type() != at::kBool, + "Bool inputs not supported for hardtanh"); + //preserve legacy behavior of boundaries not causing type promotion + Scalar min_, max_; + if (at::isIntegralType(self.scalar_type(), /*include_bool*/false)) { + int64_t minval = min.toLong(); + int64_t maxval = max.toLong(); + TORCH_CHECK(self.dtype() != at::kByte || (minval >= 0 && + maxval >=0), "cannot do hardtanh on an unsigned type with negative limits"); + min_ = minval; + max_ = maxval; + } else { + min_ = min; + max_ = max; + } + return at::clamp_out(result, self, min_, max_); } Tensor& hardtanh_(Tensor& self, const Scalar& min, const Scalar& max) { - return at::clamp_(self, min, max); + return at::hardtanh_out(self, self, min, max); } Tensor& hardtanh_backward_out(const Tensor& grad_output, const Tensor& self, const Scalar& min, const Scalar& max, Tensor& grad_input) { @@ -421,10 +441,12 @@ Tensor hardswish_backward(const Tensor& grad_output, const Tensor& self) { } Tensor relu(const Tensor & self) { + TORCH_CHECK(self.scalar_type() != at::kBool, "Boolean inputs not supported for relu"); return at::clamp_min(self, 0); } Tensor & relu_(Tensor & self) { + TORCH_CHECK(self.scalar_type() != at::kBool, "Boolean inputs not supported for relu"); return at::clamp_min_(self, 0); } @@ -566,14 +588,13 @@ Tensor rrelu_with_noise_backward( const Scalar& upper, bool training, bool is_result) { - auto lower_tensor = scalar_to_tensor(lower); - auto upper_tensor = scalar_to_tensor(upper); - if (training && (upper_tensor - lower_tensor).item().to() > 1E-6) { - return grad_output.mul(noise); + if (training) { + return noise * grad_output; } else { - auto negative = (lower_tensor + upper_tensor) / 2; - Scalar negative_slope = negative.item(); - return at::leaky_relu_backward(grad_output, self_or_result, negative_slope, is_result); + auto l = lower.toDouble(); + auto u = upper.toDouble(); + auto mid = (l + u) / 2.; + return at::leaky_relu_backward(grad_output, self_or_result, mid, is_result); } } @@ -593,253 +614,119 @@ TORCH_IMPL_FUNC(threshold_backward_out)(const Tensor& grad, const Tensor& self, threshold_stub(device_type(), *this, threshold, 0); } -// ----------------------------------- -// prelu forward -// ----------------------------------- -template -void inline prelu_cpu_kernel_share_weights( - Tensor& result, - const Tensor& input, - const Tensor& weight) { - - int64_t input_numel = input.numel(); - auto result_data = result.data_ptr(); - auto input_data = input.data_ptr(); - auto weight_val = weight.data_ptr()[0]; - - at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - scalar_t input_data_val = input_data[i]; - // to allow for compiler optimization, here splitting into two lines: - scalar_t r = (input_data_val > 0) ? 
scalar_t(1) : weight_val; - result_data[i] = r * input_data_val; - } - }); -} - -template -void inline prelu_cpu_kernel_multi_weights( - Tensor& result, - const Tensor& input, - const Tensor& weight, - int64_t input_dim0_size, - int64_t channel_size, - int64_t input_stride0, - int64_t input_stride1) { - - scalar_t* result_data = result.data_ptr(); - scalar_t* input_data = input.data_ptr(); - scalar_t* weight_data = weight.data_ptr(); - - auto loop = [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - int64_t offset = i * channel_size * input_stride1; - scalar_t* n_input_data = input_data + offset; - scalar_t* n_result_data = result_data + offset; - for (const auto j : c10::irange(channel_size)) { - for (const auto k : c10::irange(input_stride1)) { - // to allow for compiler optimization, here splitting into two lines: - scalar_t w = (n_input_data[k] > 0) ? scalar_t(1) : weight_data[j]; - n_result_data[k] = w * n_input_data[k]; - } - n_input_data += input_stride1; - n_result_data += input_stride1; - } - } - }; - if (input.numel() > 1000) { - at::parallel_for(0, input_dim0_size, 0, loop); - } else { - loop(0, input_dim0_size); - } -} - Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) { - auto input = self.contiguous(); - auto weight = weight_.contiguous(); - - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(weight.is_contiguous()); + int64_t weight_num = weight_.numel(); + Tensor result = at::empty_like(self, self.suggest_memory_format()); - int64_t weight_num = weight.numel(); - Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto strides = input.strides(); - - // case1: shared weight for all channels - if (weight_num == 1) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] { - prelu_cpu_kernel_share_weights(result, input, weight); - }); - } - else { // case2: multiple weights, one for each channel - int64_t input_ndim = input.dim(); + if (weight_num != 1) { + int64_t input_ndim = self.dim(); TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; - if (input_ndim > 1) { - channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); - input_stride0 = strides[0]; - input_stride1 = strides[1]; + channel_size = self.size(1); // channel is the 2nd dim of input } TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. 
Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); - - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_cpu", [&] { - prelu_cpu_kernel_multi_weights( - result, - input, - weight, - input_dim0_size, - channel_size, - input_stride0, - input_stride1); - }); } - return result; -} -// ----------------------------------- -// prelu backward -// ----------------------------------- -template -void inline prelu_cpu_backward_kernel_share_weights( - const Tensor& input, - const Tensor& weight, - const Tensor& grad_out, - Tensor& input_grad, - Tensor& weight_grad) { - - int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); - auto weight_val = weight.data_ptr()[0]; - auto grad_out_data = grad_out.data_ptr(); - auto input_grad_data = input_grad.data_ptr(); - auto weight_grad_data = weight_grad.data_ptr(); - - scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0), - [&](int64_t start, int64_t end, scalar_t ident) -> scalar_t { - scalar_t partial_sum = ident; - for (const auto i : c10::irange(start, end)) { - scalar_t input_data_val = input_data[i]; - scalar_t grad_out_data_val = grad_out_data[i]; - // to allow for compiler optimization, here splitting into two lines: - scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_val; - input_grad_data[i] = w * grad_out_data_val; - // to allow for compiler optimization, here splitting into two lines: - scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1); - partial_sum += mask * input_data_val * grad_out_data_val; - } - return partial_sum; - }, std::plus()); - weight_grad_data[0] = sum; -} - -template -void inline prelu_cpu_backward_kernel_multi_weights( - const Tensor& input, - const Tensor& weight, - const Tensor& grad_out, - Tensor& input_grad, - Tensor& weight_grad_collector, - int64_t input_dim0_size, - int64_t channel_size, - int64_t input_stride0, - int64_t input_stride1) { - - auto input_data = input.data_ptr(); - auto weight_data = weight.data_ptr(); - auto grad_out_data = grad_out.data_ptr(); - auto input_grad_data = input_grad.data_ptr(); - auto weight_grad_collector_data = weight_grad_collector.data_ptr(); - - auto loop = [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - for (const auto j : c10::irange(channel_size)) { - for (const auto k : c10::irange(input_stride1)) { - int64_t pos = i * input_stride0 + j * input_stride1 + k; - scalar_t weight_data_val = weight_data[j]; - scalar_t input_data_val = input_data[pos]; - scalar_t grad_out_data_val = grad_out_data[pos]; - // to allow for compiler optimization, here splitting into two lines: - scalar_t w = (input_data_val > 0) ? scalar_t(1) : weight_data_val; - input_grad_data[pos] = w * grad_out_data_val; - // to allow for compiler optimization, here splitting into two lines: - scalar_t mask = (input_data_val > 0) ? scalar_t(0) : scalar_t(1); - weight_grad_collector_data[pos] = mask * input_data_val * grad_out_data_val; - } - } + const int64_t ndim = self.dim(); + // Helper to convert 1d tensors or scalar tensor to an nd tensor that broadcasts with input + // All elements go into the channel dimension + DimVector sizes(ndim, 1), strides(ndim, 0); + auto as_nd = [&](const Tensor& t) { + TORCH_INTERNAL_ASSERT(t.defined() && (t.dim() == 1 || t.dim() == 0)); + if (ndim >= 2) { + sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1; + strides[1] = t.dim() == 1 ? 
t.strides()[0] : 0; + return t.as_strided(sizes, strides); } + return t.as_strided(sizes, strides); }; - if (input.numel() > 1000) { - at::parallel_for(0, input_dim0_size, 0, loop); + Tensor w; + if (self.scalar_type() == ScalarType::BFloat16) { + auto w_bf16 = at::empty(weight_.sizes(), weight_.options().dtype(ScalarType::BFloat16)); + w_bf16.copy_(weight_); + w = weight_.defined() ? as_nd(w_bf16) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); } else { - loop(0, input_dim0_size); + w = weight_.defined() ? as_nd(weight_) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); } + + auto iter = TensorIteratorConfig() + .add_output(result) + .add_input(self) + .add_input(w) + .build(); + prelu_cpu_stub(iter.device_type(), iter); + return result; } std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Tensor& self, const Tensor& weight_) { - auto input = self.contiguous(); - auto grad_out = grad_out_.contiguous(); - auto weight = weight_.contiguous(); - - TORCH_CHECK(input.is_contiguous()); - TORCH_CHECK(grad_out.is_contiguous()); - TORCH_CHECK(weight.is_contiguous()); - - int64_t weight_num = weight.numel(); - auto strides = input.strides(); - auto dims = input.dim(); + int64_t weight_num = weight_.numel(); - Tensor input_grad = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - Tensor weight_grad = at::empty_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - Tensor weight_grad_collector = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + Tensor input_grad = at::empty_like(self, self.suggest_memory_format()); + Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous); + Tensor weight_grad_collector = at::empty_like(self, at::MemoryFormat::Contiguous); - // case1: shared parameter for all channels - if (weight_num == 1) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] { - prelu_cpu_backward_kernel_share_weights(input, weight, grad_out, input_grad, weight_grad); - }); - } - else { // case2: multiple parameters, one for each channel - int64_t input_ndim = input.dim(); + if (weight_num != 1) { + int64_t input_ndim = self.dim(); TORCH_CHECK(input_ndim > 0, "Not allow zero-dim input tensor."); int64_t channel_size = 1; // channel_size default to 1 - int64_t input_dim0_size = 1, input_stride0 = 1, input_stride1 = 1; - if (input_ndim > 1) { - channel_size = input.size(1); // channel is the 2nd dim of input - input_dim0_size = input.size(0); - input_stride0 = strides[0]; - input_stride1 = strides[1]; + channel_size = self.size(1); // channel is the 2nd dim of input } TORCH_CHECK(channel_size == weight_num, "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, " and channel size = ", channel_size, "."); + } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "prelu_backward_cpu", [&] { - prelu_cpu_backward_kernel_multi_weights( - input, - weight, - grad_out, - input_grad, - weight_grad_collector, - input_dim0_size, - channel_size, - input_stride0, - input_stride1); - }); + const int64_t ndim = self.dim(); + // Helper to convert 1d tensor or scalar tensor to an nd tensor that broadcasts with input + // All elements go into the channel dimension + DimVector sizes(ndim, 1), strides(ndim, 0); + auto as_nd = [&](const Tensor& t) { + TORCH_INTERNAL_ASSERT(t.defined() && (t.dim() == 1 || t.dim() == 0)); + if (ndim >= 2) { + sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1; + strides[1] = t.dim() == 1 ? 
t.strides()[0] : 0; + return t.as_strided(sizes, strides); + } + return t.as_strided(sizes, strides); + }; + Tensor w; + if (self.scalar_type() == ScalarType::BFloat16) { + auto w_bf16 = at::empty(weight_.sizes(), weight_.options().dtype(ScalarType::BFloat16)); + w_bf16.copy_(weight_); + w = weight_.defined() ? as_nd(w_bf16) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); + } else { + w = weight_.defined() ? as_nd(weight_) : + at::detail::scalar_tensor_static(1, self.scalar_type(), kCPU); + } + + auto iter = TensorIteratorConfig() + .add_output(input_grad) + .add_output(weight_grad_collector) + .add_input(self) + .add_input(grad_out_) + .add_input(w) + .build(); + + prelu_backward_cpu_stub(iter.device_type(), iter); + + if (weight_num == 1) { + weight_grad.fill_(weight_grad_collector.sum()); + } else { // update weight_grad std::vector reduce_dims; + int64_t input_ndim = self.dim(); reduce_dims.push_back(0); - if (dims > 2) { - for (const auto i : c10::irange(2, dims)) { - reduce_dims.push_back(i); - } + if (input_ndim > 2) { + for(int64_t i = 2; i < input_ndim; i++) reduce_dims.push_back(i); } weight_grad = weight_grad_collector.sum(reduce_dims); } diff --git a/aten/src/ATen/native/Activation.h b/aten/src/ATen/native/Activation.h index 963dc4665fd1..ba2dbc0768e8 100644 --- a/aten/src/ATen/native/Activation.h +++ b/aten/src/ATen/native/Activation.h @@ -14,6 +14,23 @@ class TensorBase; namespace at { namespace native { +// These constants control the approximation behavior of gelu function. +enum GeluType { + None, // Baseline Gelu + Tanh, // Tahn Gelu Approximation + END +}; + +static GeluType get_gelutype_enum(const c10::string_view approximate) { + if (approximate == "none") { + return GeluType::None; + } else if (approximate == "tanh") { + return GeluType::Tanh; + } else { + TORCH_CHECK(false, "approximate argument must be either none or tanh."); + } +} + using structured_activation_fn = void (*)(TensorIteratorBase&); using structured_activation_backward_fn = void (*)(TensorIteratorBase&); @@ -35,6 +52,9 @@ using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&); +using gelu_fn = void (*)(TensorIteratorBase&, GeluType); +using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType); +using glu_jvp_fn = void (*)(TensorIteratorBase&); DECLARE_DISPATCH(elu_fn, elu_stub); DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub); @@ -43,8 +63,8 @@ DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub); DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub); DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub); DECLARE_DISPATCH(threshold_fn, threshold_stub); -DECLARE_DISPATCH(structured_activation_fn, GeluKernel); -DECLARE_DISPATCH(structured_activation_backward_fn, GeluBackwardKernel); +DECLARE_DISPATCH(gelu_fn, GeluKernel); +DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel); DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub); DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub); DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub); @@ -57,10 +77,13 @@ DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub); DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub); DECLARE_DISPATCH(structured_activation_fn, glu_stub); DECLARE_DISPATCH(activation_backward_fn, 
glu_backward_stub); +DECLARE_DISPATCH(glu_jvp_fn, glu_jvp_stub); DECLARE_DISPATCH(structured_activation_fn, silu_stub); DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub); DECLARE_DISPATCH(structured_activation_fn, mish_stub); DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub); +DECLARE_DISPATCH(activation_fn, prelu_cpu_stub); +DECLARE_DISPATCH(activation_backward_fn, prelu_backward_cpu_stub); } // namespace native diff --git a/aten/src/ATen/native/AdaptivePooling.h b/aten/src/ATen/native/AdaptivePooling.h index 87cf202c3cc5..68fb08a5f397 100644 --- a/aten/src/ATen/native/AdaptivePooling.h +++ b/aten/src/ATen/native/AdaptivePooling.h @@ -1,9 +1,12 @@ #pragma once -#include #include +#include -namespace at { namespace native { +namespace at { +class Tensor; + +namespace native { using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 225985d60485..7fa3c3e37f3e 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -21,12 +21,6 @@ // linear algebra function uses that routine #if AT_BUILD_WITH_LAPACK() -// gesv -extern "C" void zgesv_(int *n, int *nrhs, std::complex *a, int *lda, int *ipiv, std::complex *b, int *ldb, int *info); -extern "C" void cgesv_(int *n, int *nrhs, std::complex *a, int *lda, int *ipiv, std::complex *b, int *ldb, int *info); -extern "C" void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); -extern "C" void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); - // getrf extern "C" void zgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); extern "C" void cgetrf_(int *m, int *n, std::complex *a, int *lda, int *ipiv, int *info); @@ -57,6 +51,128 @@ extern "C" void cpotri_(char *uplo, int *n, std::complex *a, int *lda, in extern "C" void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); extern "C" void spotri_(char *uplo, int *n, float *a, int *lda, int *info); +// sytrf +extern "C" void dsytrf_( + char* uplo, + int* n, + double* a, + int* lda, + int* ipiv, + double* work, + int* lwork, + int* info); +extern "C" void ssytrf_( + char* uplo, + int* n, + float* a, + int* lda, + int* ipiv, + float* work, + int* lwork, + int* info); +extern "C" void zsytrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); +extern "C" void csytrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); + +// hetrf +extern "C" void zhetrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); +extern "C" void chetrf_( + char* uplo, + int* n, + std::complex* a, + int* lda, + int* ipiv, + std::complex* work, + int* lwork, + int* info); + +// sytrs +extern "C" void dsytrs_( + char* uplo, + int* n, + int* nrhs, + double* a, + int* lda, + int* ipiv, + double* b, + int* ldb, + int* info); +extern "C" void ssytrs_( + char* uplo, + int* n, + int* nrhs, + float* a, + int* lda, + int* ipiv, + float* b, + int* ldb, + int* info); +extern "C" void zsytrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* 
info); +extern "C" void csytrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* info); + +// hetrs +extern "C" void zhetrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* info); +extern "C" void chetrs_( + char* uplo, + int* n, + int* nrhs, + std::complex* a, + int* lda, + int* ipiv, + std::complex* b, + int* ldb, + int* info); + // geqrf extern "C" void zgeqrf_(int *m, int *n, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); extern "C" void cgeqrf_(int *m, int *n, std::complex *a, int *lda, std::complex *tau, std::complex *work, int *lwork, int *info); @@ -207,6 +323,70 @@ extern "C" void strsm_(char *side, char *uplo, char *trans, char *diag, int *n, namespace at { namespace meta { +TORCH_META_FUNC(linalg_ldl_factor_ex) +(const Tensor& self, bool hermitian, bool check_errors) { + at::native::squareCheckInputs(self, "torch.linalg.ldl_factor_ex"); + at::native::checkFloatingOrComplex(self, "torch.linalg.ldl_factor_ex"); + + auto ndim = self.dim(); + + // prefer column major strides + auto ld_strides = at::native::batched_matrix_contiguous_strides(self.sizes(), /*column_major=*/true); + set_output(0, self.sizes(), ld_strides, self.options(), {}); // LD + + auto pivots_shape = + IntArrayRef(self.sizes().data(), ndim - 1); // self.shape[:-1] + set_output( + 1, pivots_shape, {}, self.options().dtype(ScalarType::Int), {}); // pivots + + auto info_shape = + IntArrayRef(self.sizes().data(), ndim - 2); // self.shape[:-2] + set_output( + 2, info_shape, {}, self.options().dtype(ScalarType::Int), {}); // info +} + +TORCH_META_FUNC(linalg_ldl_solve) +(const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian) { + at::native::squareCheckInputs(LD, "torch.linalg.ldl_solve"); + at::native::checkFloatingOrComplex(LD, "torch.linalg.ldl_solve"); + at::native::linearSolveCheckInputs(B, LD, "torch.linalg.ldl_solve"); + TORCH_CHECK( + B.dim() >= 2, + "torch.linalg.ldl_solve: Expected B to have at least 2 dimensions, but it has ", + B.dim(), + " dimensions instead"); + auto expected_pivots_shape = + IntArrayRef(LD.sizes().data(), LD.dim() - 1); // LD.shape[:-1] + TORCH_CHECK( + expected_pivots_shape.equals(pivots.sizes()), + "torch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, but got pivots with shape ", + pivots.sizes(), + " instead"); + // pivots is allowed to be any integer type + // LAPACK we use is 32-bit interface while cuSOLVER uses 64-bit interface for integers + TORCH_CHECK( + at::isIntegralType(pivots.scalar_type(), /*includeBool=*/false), + "torch.linalg.ldl_solve: Expected pivots to be integers. 
Got ", + pivots.scalar_type()); + TORCH_CHECK( + LD.scalar_type() == B.scalar_type(), + "torch.linalg.ldl_solve: ", + "LD dtype", + LD.scalar_type(), + " does not match b dtype ", + B.scalar_type()); + + std::vector B_broadcast_size; + std::tie(B_broadcast_size, std::ignore) = at::native::_linalg_broadcast_batch_dims(B, LD); + + // prefer column major strides + auto result_strides = at::native::batched_matrix_contiguous_strides(B_broadcast_size, /*column_major=*/true); + set_output(0, B_broadcast_size, result_strides, B.options(), {}); +} + TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular) { TORCH_CHECK(self.dim() >= 2, "torch.triangular_solve: Expected b to have at least 2 dimensions, but it has ", self.dim(), " dimensions instead"); @@ -220,13 +400,13 @@ TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool uppe std::tie(self_broadcast_size, A_broadcast_size) = at::native::_linalg_broadcast_batch_dims(self, A); // make column major strides for BLAS - const auto solution_strides = at::native::contiguous_strides(self_broadcast_size, /*f-contig=*/true); + const auto solution_strides = at::native::batched_matrix_contiguous_strides(self_broadcast_size, /*f-contig=*/true); set_output(0, self_broadcast_size, solution_strides, self.options(), {}); // make column major strides for BLAS - auto clone_A_strides = at::native::contiguous_strides(A_broadcast_size, /*f_contig=*/true); + auto clone_A_strides = at::native::batched_matrix_contiguous_strides(A_broadcast_size, /*f_contig=*/true); set_output(1, A_broadcast_size, clone_A_strides, A.options(), {}); - } else if (A.layout() == Layout::SparseCsr) { + } else if (A.layout() == Layout::SparseCsr || A.layout() == Layout::SparseBsr) { // no broadcasting for non-strided layout set_output(0, self.sizes(), {}, self.options(), {}); // make row major strides for Sparse BLAS set_output(1, {0}, {}, self.options(), {}); // return 0-sized tensor @@ -243,7 +423,7 @@ TORCH_META_FUNC(linalg_lu_factor_ex)(const Tensor& A, bool pivot, bool check_err const auto n = sizes.cend()[-1]; // make column major strides for BLAS - auto LU_strides = at::native::contiguous_strides(sizes, /*f-contig*=*/true); + auto LU_strides = at::native::batched_matrix_contiguous_strides(sizes, /*f-contig*=*/true); set_output(0, sizes, LU_strides, A.options(), {}); // Set sizes to the size of pivots @@ -269,7 +449,7 @@ TORCH_META_FUNC(_linalg_svd)(const Tensor& A, // Prepare sizes for U if (compute_uv) { sizes.back() = full_matrices ? m : k; - auto U_strides = at::native::contiguous_strides(sizes, /*f-contig*=*/true); + auto U_strides = at::native::batched_matrix_contiguous_strides(sizes, /*f-contig*=*/true); set_output(0, sizes, U_strides, A.options(), {}); // Prepare sizes for Vh @@ -279,7 +459,7 @@ TORCH_META_FUNC(_linalg_svd)(const Tensor& A, // We need to distinguish the cuSOLVER case, as the cuSOLVER algorithms we use // expect F-contig matrices, but they compute V rather than Vh const bool use_cusolver = at::native::svd_uses_cusolver(A); - auto Vh_strides = at::native::contiguous_strides(sizes, /*f-contig*=*/!use_cusolver); + auto Vh_strides = at::native::batched_matrix_contiguous_strides(sizes, /*f-contig*=*/!use_cusolver); set_output(2, sizes, Vh_strides, A.options(), {}); } else { set_output(0, {0}, {}, A.options(), {}); @@ -289,8 +469,71 @@ TORCH_META_FUNC(_linalg_svd)(const Tensor& A, // Prepare sizes for S. S is always real, even when A is complex. 
sizes.pop_back(); sizes.end()[-1] = k; - set_output(1, sizes, {}, A.options().dtype(c10::toValueType(A.scalar_type())), {}); + set_output(1, sizes, {}, A.options().dtype(c10::toRealValueType(A.scalar_type())), {}); +} + +TORCH_META_FUNC(lu_unpack)(const Tensor& LU, const Tensor& pivots, bool unpack_data, bool unpack_pivots) { + TORCH_CHECK(LU.dim() >= 2, "torch.lu_unpack: Expected tensor with 2 or more dimensions. Got size: ", LU.sizes(), " instead"); + if (unpack_pivots) { + TORCH_CHECK(pivots.scalar_type() == at::kInt, + "torch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.\n" + "Note: this function is intended to be used with the output produced by torch.linalg.lu_factor"); + } + + auto sizes = LU.sizes().vec(); + const auto m = sizes.cend()[-2]; + const auto n = sizes.cend()[-1]; + const auto k = std::min(m, n); + + // P.shape[-2:] == (m, m) (or size zero if pivot == False) + sizes.end()[-1] = m; + if (unpack_pivots) { + set_output(0, sizes, LU.options()); + } else { + set_output(0, {0}, LU.options()); + } + + if (unpack_data) { + // L.shape[-2:] == (m, k) + sizes.end()[-1] = k; + set_output(1, sizes, LU.options()); + + // U.shape[-2:] == (k, n) + sizes.end()[-2] = k; + sizes.end()[-1] = n; + set_output(2, sizes, LU.options()); + } else { + set_output(1, {0}, LU.options()); + set_output(2, {0}, LU.options()); + } } + +TORCH_META_FUNC(linalg_lu)(const Tensor& A, bool pivot) { + TORCH_CHECK(A.dim() >= 2, "linalg.lu: Expected tensor with 2 or more dimensions. Got size: ", A.sizes(), " instead"); + + auto sizes = A.sizes().vec(); + const auto m = sizes.cend()[-2]; + const auto n = sizes.cend()[-1]; + const auto k = std::min(m, n); + + // P.shape[-2:] == (m, m) (or size zero if pivot == False) + sizes.end()[-1] = m; + if (pivot) { + set_output(0, sizes, A.options()); + } else { + set_output(0, {0}, A.options()); + } + + // L.shape[-2:] == (m, k) + sizes.end()[-1] = k; + set_output(1, sizes, A.options()); + + // U.shape[-2:] == (k, n) + sizes.end()[-2] = k; + sizes.end()[-1] = n; + set_output(2, sizes, A.options()); +} + } // namespace meta namespace native { @@ -298,8 +541,6 @@ namespace native { #if AT_BUILD_WITH_LAPACK() // Define the per-batch functions to be used in the main implementation of the batched // linear algebra operations -template -void lapackSolve(int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); template void lapackGetri(int n, scalar_t *a, int lda, int *ipiv, scalar_t *work, int lwork, int *info); @@ -310,22 +551,6 @@ void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scala template void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info); -template<> void lapackSolve>(int n, int nrhs, c10::complex *a, int lda, int *ipiv, c10::complex *b, int ldb, int *info) { - zgesv_(&n, &nrhs, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(b), &ldb, info); -} - -template<> void lapackSolve>(int n, int nrhs, c10::complex *a, int lda, int *ipiv, c10::complex *b, int ldb, int *info) { - cgesv_(&n, &nrhs, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(b), &ldb, info); -} - -template<> void lapackSolve(int n, int nrhs, double *a, int lda, int *ipiv, double *b, int ldb, int *info) { - dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); -} - -template<> void lapackSolve(int n, int nrhs, float *a, int lda, int *ipiv, float *b, int ldb, int *info) { - sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); -} - template<> 
void lapackGetri>(int n, c10::complex *a, int lda, int *ipiv, c10::complex *work, int lwork, int *info) { zgetri_(&n, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(work), &lwork, info); } @@ -552,6 +777,290 @@ template<> void lapackSvd(char jobz, int m, int n, float *a, int lda, sgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); } +template <> +void lapackLdlSymmetric( + char uplo, + int n, + double* a, + int lda, + int* ipiv, + double* work, + int lwork, + int* info) { + dsytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlSymmetric( + char uplo, + int n, + float* a, + int lda, + int* ipiv, + float* work, + int lwork, + int* info) { + ssytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlSymmetric>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + zsytrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlSymmetric>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + csytrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlHermitian( + char uplo, + int n, + double* a, + int lda, + int* ipiv, + double* work, + int lwork, + int* info) { + dsytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlHermitian( + char uplo, + int n, + float* a, + int lda, + int* ipiv, + float* work, + int lwork, + int* info) { + ssytrf_(&uplo, &n, a, &lda, ipiv, work, &lwork, info); +} + +template <> +void lapackLdlHermitian>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + zhetrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlHermitian>( + char uplo, + int n, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* work, + int lwork, + int* info) { + chetrf_( + &uplo, + &n, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(work), + &lwork, + info); +} + +template <> +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + double* a, + int lda, + int* ipiv, + double* b, + int ldb, + int* info) { + dsytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + float* a, + int lda, + int* ipiv, + float* b, + int ldb, + int* info) { + ssytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void lapackLdlSolveSymmetric>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + zsytrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + +template <> +void lapackLdlSolveSymmetric>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + csytrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + +template <> +void lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + double* a, + int lda, + int* ipiv, + double* b, + int ldb, + int* info) { + dsytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void 
lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + float* a, + int lda, + int* ipiv, + float* b, + int ldb, + int* info) { + ssytrs_(&uplo, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); +} + +template <> +void lapackLdlSolveHermitian>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + zhetrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + +template <> +void lapackLdlSolveHermitian>( + char uplo, + int n, + int nrhs, + c10::complex* a, + int lda, + int* ipiv, + c10::complex* b, + int ldb, + int* info) { + chetrs_( + &uplo, + &n, + &nrhs, + reinterpret_cast*>(a), + &lda, + ipiv, + reinterpret_cast*>(b), + &ldb, + info); +} + template<> void lapackLuSolve>(char trans, int n, int nrhs, c10::complex *a, int lda, int *ipiv, c10::complex *b, int ldb, int *info) { zgetrs_(&trans, &n, &nrhs, reinterpret_cast*>(a), &lda, ipiv, reinterpret_cast*>(b), &ldb, info); } @@ -802,100 +1311,6 @@ bool _requires_fw_or_bw_grad(const Tensor& input) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -/* -Computes the solution to a system of linear equations - A X = B, -where A is an n-by-n matrix and X and B are n-by-nrhs matrices. -Note that B is required to be a matrix, the usual, vector case, is obtained with nrhs = 1. -Above description is for non-batched input, the batched input is also supported. -This is an in-place routine, content of both A and b are overwritten. -'infos' is an int Tensor containing error codes for each matrix in the batched input. -For more information see LAPACK's documentation for GESV routine. -*/ -template -static void apply_solve(Tensor& b, Tensor& A, Tensor& infos) { -#if !AT_BUILD_WITH_LAPACK() - AT_ERROR("solve: LAPACK library not found in compilation"); -#else - auto A_data = A.data_ptr(); - auto b_data = b.data_ptr(); - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - auto batch_size = batchCount(A); - auto n = A.size(-2); - auto nrhs = b.size(-1); - auto lda = std::max(1, n); - - auto ipiv = at::empty({lda}, b.options().dtype(kInt)); - auto ipiv_data = ipiv.data_ptr(); - auto infos_data = infos.data_ptr(); - - for (const auto i : c10::irange(batch_size)) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; - scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - int* info_working_ptr = &infos_data[i]; - lapackSolve(n, nrhs, A_working_ptr, lda, ipiv_data, b_working_ptr, lda, info_working_ptr); - } -#endif -} - -std::tuple _solve_helper_cpu(const Tensor& self, const Tensor& A) { - auto self_working_copy = cloneBatchedColumnMajor(self); - auto A_working_copy = cloneBatchedColumnMajor(A); - // infos might not get filled for empty inputs therefore at::zeros is used instead of at::empty - auto infos = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "solve_cpu", [&]{ - apply_solve(self_working_copy, A_working_copy, infos); - }); - at::_linalg_check_errors(infos, "solve_cpu", self.dim() == 2); - return std::tuple(self_working_copy, A_working_copy); -} - -// Supports arbitrary batch dimensions for self and A -std::tuple solve(const Tensor& self, const Tensor& A) { - TORCH_WARN_ONCE( - "torch.solve is deprecated in favor of torch.linalg.solve", - "and will be removed in a future PyTorch release.\n", - "torch.linalg.solve has its arguments reversed and does not return the LU factorization.\n", 
- "To get the LU factorization see torch.lu, which can be used with torch.lu_solve or torch.lu_unpack.\n", - "X = torch.solve(B, A).solution\n", - "should be replaced with\n", - "X = torch.linalg.solve(A, B)" - ); - TORCH_CHECK(self.dim() >= 2, - "B should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - TORCH_CHECK(A.dim() >= 2, - "A should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); - Tensor self_broadcasted, A_broadcasted; - std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "solve"); - return at::_solve_helper(self_broadcasted, A_broadcasted); -} - -std::tuple solve_out(const Tensor& self, const Tensor& A, Tensor& solution, Tensor& lu) { - TORCH_WARN_ONCE( - "torch.solve is deprecated in favor of torch.linalg.solve", - "and will be removed in a future PyTorch release.\n", - "torch.linalg.solve has its arguments reversed and does not return the LU factorization.\n", - "To get the LU factorization see torch.lu, which can be used with torch.lu_solve or torch.lu_unpack.\n", - "X = torch.solve(B, A).solution\n", - "should be replaced with\n", - "X = torch.linalg.solve(A, B)" - ); - checkSameDevice("solve", solution, self, "solution"); - checkSameDevice("solve", lu, self, "lu"); - checkLinalgCompatibleDtype("solve", solution, self, "solution"); - checkLinalgCompatibleDtype("solve", lu, self, "lu"); - - Tensor solution_tmp, lu_tmp; - std::tie(solution_tmp, lu_tmp) = at::_solve_helper(self, A); - - at::native::resize_output(solution, solution_tmp.sizes()); - at::native::resize_output(lu, lu_tmp.sizes()); - solution.copy_(solution_tmp); - lu.copy_(lu_tmp); - return std::tuple(solution, lu); -} - // Solves a system of linear equations matmul(input, x) = other in-place // LAPACK/MAGMA error codes are saved in 'infos' tensor, they are not checked here static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor& input, const Tensor& other) { @@ -952,8 +1367,8 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor // _linalg_broadcast_batch_dims also includes linearSolveCheckInputs // it checks for squareness of 'input' and 'shape' compatibility of 'other' and 'input' - Tensor other_broadcasted, input_broadcasted; - std::tie(other_broadcasted, input_broadcasted) = _linalg_broadcast_batch_dims(other_, input, "linalg.solve"); + Tensor other_broadcasted; + std::tie(other_broadcasted, std::ignore) = _linalg_broadcast_batch_dims(other_, input, "linalg.solve"); auto squeezed_other_broadcasted = at::squeeze(other_broadcasted, -1); auto squeezed_result_shape = squeezed_other_broadcasted.sizes(); @@ -989,18 +1404,17 @@ static Tensor& linalg_solve_out_info(Tensor& result, Tensor& infos, const Tensor // lu_factor_stub+lu_solve_stub perform calculations in-place and 'result' must be a copy of 'other_broadcasted' result.copy_(other_broadcasted); - auto input_working_copy = cloneBatchedColumnMajor(input_broadcasted); - TORCH_INTERNAL_ASSERT(infos.scalar_type() == kInt); TORCH_INTERNAL_ASSERT(infos.device() == input.device()); - infos.resize_({std::max(1, batchCount(input_broadcasted))}); + infos.resize_({std::max(1, batchCount(input))}); // if input is empty infos might not get filled; make sure infos doesn't contain garbage then if (input.numel() == 0) { infos.fill_(0); } // compute the LU factorization of 'input_working_copy' - auto pivots_shape = IntArrayRef(input_broadcasted.sizes().data(), input_broadcasted.dim() - 2).vec(); // input_broadcasted.shape[:-2] + auto 
input_working_copy = cloneBatchedColumnMajor(input); + auto pivots_shape = IntArrayRef(input.sizes().data(), input.dim() - 2).vec(); // input.shape[:-2] pivots_shape.push_back(std::min(input.size(-2), input.size(-1))); Tensor pivots = at::empty(pivots_shape, input.options().dtype(kInt)); lu_factor_stub(input.device().type(), input_working_copy, pivots, infos, /*compute_pivots=*/true); @@ -1023,8 +1437,7 @@ Tensor& linalg_solve_out(const Tensor& input, const Tensor& other, Tensor& resul // Now check LAPACK/MAGMA error codes // _linalg_check_errors calls 'infos = infos.to(kCPU)' - bool vector_case = linalg_solve_is_vector_rhs(input, other); - at::_linalg_check_errors(infos, "linalg.solve", vector_case ? result.dim() == 1 : result.dim() == 2); + at::_linalg_check_errors(infos, "linalg.solve", input.dim() == 2); return result; } @@ -1109,7 +1522,7 @@ Tensor& _linalg_inv_out_helper_cpu(Tensor &result, Tensor& infos_lu, Tensor& inf return result; } -// Computes the inverse matrix of 'input', it is is saved to 'result' in-place +// Computes the inverse matrix of 'input', it is saved to 'result' in-place // LAPACK/MAGMA/cuSOLVER error codes are saved in 'infos' tensors, they are not checked here static Tensor& linalg_inv_out_info(Tensor& result, Tensor& infos_lu, Tensor& infos_getri, const Tensor& input) { squareCheckInputs(input, "linalg.inv"); @@ -1198,7 +1611,7 @@ static Tensor& linalg_inv_out_info(Tensor& result, Tensor& infos_lu, Tensor& inf return result; } -// Computes the inverse matrix of 'input', it is is saved to 'result' in-place +// Computes the inverse matrix of 'input', it is saved to 'result' in-place Tensor& linalg_inv_out(const Tensor &input, Tensor &result) { auto info_shape = IntArrayRef(input.sizes().cbegin(), input.sizes().cend() - 2); // input.shape[:-2] auto infos_lu = at::zeros({info_shape}, input.options().dtype(kInt)); @@ -1648,6 +2061,105 @@ std::tuple _lu_with_info(const Tensor& self, bool comput return at::linalg_lu_factor_ex(self, compute_pivots, false); } +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linalg_lu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DEFINE_DISPATCH(unpack_pivots_stub); + +TORCH_IMPL_FUNC(linalg_lu_out)(const Tensor& A, + bool pivot, + const Tensor& P, + const Tensor& L, + const Tensor& U) { + const auto m = A.sizes().end()[-2]; + const auto n = A.sizes().end()[-1]; + + // A.shape[-2:] == (m, n) + // P.shape[-2:] == (m, m) + // L.shape[-2:] == (m, k) + // U.shape[-2:] == (k, n) + // with k = min(m, n) + + // Use L as it has the correct size + const bool use_L = m > n; + auto pivots = at::empty({0}, A.options().dtype(kInt)); + auto info = at::empty({0}, A.options().dtype(kInt)); + at::linalg_lu_factor_ex_out(const_cast(use_L ? L : U), + const_cast(pivots), + const_cast(info), + A, + pivot, + /*check_errors=*/false); + at::lu_unpack_out(const_cast(P), + const_cast(L), + const_cast(U), + use_L ? 
L : U, + pivots, + /*unpack_lu=*/true, + /*unpack_pivots=*/pivot); +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lu_unpack ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TORCH_IMPL_FUNC(lu_unpack_out)(const Tensor& LU, + const Tensor& pivots, + bool unpack_lu, + bool unpack_pivots, + const Tensor& P, + const Tensor& L, + const Tensor& U) { + const auto m = LU.sizes().end()[-2]; + const auto n = LU.sizes().end()[-1]; + + // A.shape[-2:] == (m, n) + // P.shape[-2:] == (m, m) + // L.shape[-2:] == (m, k) + // U.shape[-2:] == (k, n) + // with k = min(m, n) + + if (unpack_lu) { + if (m > n || LU.is_same(L)) { + // The order of triu and tril is important as we may have LU.is_same(L) + at::triu_out(const_cast(U), m == n ? LU : LU.narrow(-2, 0, n), 0); + at::tril_out(const_cast(L), LU, -1); + L.diagonal(0, -2, -1).fill_(1.); + } else { + // The order of triu and tril is important as we may have LU.is_same(U) + at::tril_out(const_cast(L), m == n ? LU : LU.narrow(-1, 0, m), -1); + L.diagonal(0, -2, -1).fill_(1.); + at::triu_out(const_cast(U), LU, 0); + } + } + if (unpack_pivots) { + // lu_factor_ex returns an int32 1-based indexing, which is what we have in `pivots` + // We transform that to a proper permutation of the indices {0, ..., m-1} + const auto perm_sizes = IntArrayRef(P.sizes().data(), P.dim() - 1); + + // Fill `perm` with the identity permutation (perhaps batched) + const auto perm = at::arange(m, pivots.options().memory_format(at::MemoryFormat::Contiguous).dtype(kLong)) + .expand(perm_sizes) + .contiguous(); + + // Note that perm is of type kLong and pivots is a 1-indexed kInt. + // This is taken into account in the unpack_pivots kernel + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .declare_static_shape(pivots.sizes(), /*squash_dim=*/pivots.dim() - 1) + .add_output(perm) + .add_owned_input(pivots.contiguous()) + .build(); + + if (iter.numel() != 0) { + unpack_pivots_stub(pivots.device().type(), iter, std::min(m, n)); + } + + // Transform the permutation into a permutation matrix + P.zero_(); + P.scatter_(-2, perm.unsqueeze(-2), 1.); + } +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangular_solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEFINE_DISPATCH(triangular_solve_stub); @@ -2307,7 +2819,7 @@ void linalg_eigh_out_info( // eigenvalues are always real-valued // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.scalar_type() == real_dtype); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == vectors.scalar_type()); @@ -2354,7 +2866,7 @@ void linalg_eigh_out_info( std::tuple linalg_eigh(const Tensor& input, c10::string_view uplo) { squareCheckInputs(input, "linalg.eigh"); checkUplo(uplo); - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(real_dtype)); Tensor vectors = at::empty({0}, input.options()); Tensor infos = at::zeros({std::max(1, batchCount(input))}, input.options().dtype(kInt)); @@ -2370,7 +2882,7 @@ std::tuple linalg_eigh_out(const Tensor& input, c10::string_vi checkLinalgCompatibleDtype("torch.linalg.eigh", eigvecs, input, "eigenvectors"); // eigenvalues are always real-valued here - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); 
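// Illustrative usage sketch (not part of this patch) for the linalg_lu / lu_unpack
// composite implemented above, assuming the functional wrappers generated from the
// structured ops (at::linalg_lu, at::lu_unpack) are available:
//   at::Tensor A = at::randn({2, 4, 6});              // batch of two 4x6 matrices
//   at::Tensor P, L, U;
//   std::tie(P, L, U) = at::linalg_lu(A, /*pivot=*/true);
//   // P: (2, 4, 4), L: (2, 4, 4), U: (2, 4, 6); P.matmul(L).matmul(U) ~= A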
checkLinalgCompatibleDtype("torch.linalg.eigh", eigvals.scalar_type(), real_dtype, "eigenvalues"); Tensor eigvals_tmp, eigvecs_tmp; @@ -2393,14 +2905,14 @@ Tensor linalg_eigvalsh(const Tensor& input, c10::string_view uplo) { return values; } - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(real_dtype)); values = at::linalg_eigvalsh_outf(input, uplo, values); return values; } Tensor& linalg_eigvalsh_out(const Tensor& input, c10::string_view uplo, Tensor& result) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); checkLinalgCompatibleDtype("torch.linalg.eigvalsh", result.scalar_type(), real_dtype); squareCheckInputs(input, "linalg.eigvalsh"); @@ -2461,7 +2973,7 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool value_t* rwork_data = nullptr; if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { int64_t lrwork = std::max(int64_t(1), 3 * n - 2); - ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype())); rwork = at::empty({lrwork}, self.options().dtype(dtype)); rwork_data = rwork.data_ptr(); } @@ -2489,7 +3001,7 @@ std::tuple _symeig_helper_cpu(const Tensor& self, bool eigenvect auto self_sizes = self.sizes().vec(); self_sizes.pop_back(); - ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype())); auto eigvals = at::empty(self_sizes, self.options().dtype(dtype)); if (self.numel() == 0) { @@ -2549,7 +3061,7 @@ std::tuple symeig_out(const Tensor& self, bool eigenvectors, b checkSameDevice("symeig", vecs, self, "eigenvectors"); checkLinalgCompatibleDtype("symeig", vecs, self, "eigenvectors"); // eigenvalues are always real-valued here - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("symeig", vals.scalar_type(), real_dtype, "eigenvalues"); Tensor vals_tmp, vecs_tmp; @@ -3068,7 +3580,11 @@ Tensor& linalg_svdvals_out(const Tensor& A, Tensor & S) { } Tensor linalg_svdvals(const Tensor& A) { - return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, /*comptue_uv=*/_requires_fw_or_bw_grad(A))); + // NB: Why do we need isTensorSubclassLike check for linalg_svdvals but not linalg_eigvals? + // svdvals is decomposed at the vmap level in functorch so A can be a BatchedTensor wrapping + // a TensorWrapper requiring fw or bw grad. 
+ return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, + /*comptue_uv=*/_requires_fw_or_bw_grad(A) || isTensorSubclassLike(A))); } std::tuple svd_out(const Tensor& self, bool some, bool compute_uv, Tensor& U, Tensor& S, Tensor& V) { @@ -3195,7 +3711,7 @@ static void linalg_lstsq_out_info( TORCH_INTERNAL_ASSERT(rank.scalar_type() == at::kLong); TORCH_INTERNAL_ASSERT(rank.device() == input.device()); - auto real_dtype = toValueType(input.scalar_type()); + auto real_dtype = toRealValueType(input.scalar_type()); TORCH_INTERNAL_ASSERT(singular_values.scalar_type() == real_dtype); TORCH_INTERNAL_ASSERT(singular_values.device() == input.device()); @@ -3393,7 +3909,7 @@ std::tuple linalg_lstsq_out( checkLinalgCompatibleDtype("torch.linalg.lstsq", solution, input, "solution"); // 'residuals' is expected to have real float dtype - ScalarType real_dtype = c10::toValueType(input.scalar_type()); + ScalarType real_dtype = c10::toRealValueType(input.scalar_type()); checkLinalgCompatibleDtype("torch.linalg.lstsq", residuals.scalar_type(), real_dtype, "solution"); // 'rank' is expected to have integer dtype @@ -3410,7 +3926,7 @@ std::tuple linalg_lstsq_out( // set default rcond value double rcond_value = rcond.has_value() ? rcond.value() - : _get_epsilon(c10::toValueType(input.scalar_type())) * std::max(input.size(-2), input.size(-1)); + : _get_epsilon(c10::toRealValueType(input.scalar_type())) * std::max(input.size(-2), input.size(-1)); auto infos = at::zeros({std::max(1, batchCount(input))}, input.options().dtype(kInt)); @@ -3524,9 +4040,9 @@ std::tuple linalg_lstsq( c10::optional rcond, c10::optional driver) { Tensor solution = at::empty({0}, input.options()); - Tensor residuals = at::empty({0}, input.options().dtype(toValueType(input.scalar_type()))); + Tensor residuals = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); Tensor rank = at::empty({0}, input.options().dtype(at::kLong)); - Tensor singular_values = at::empty({0}, input.options().dtype(toValueType(input.scalar_type()))); + Tensor singular_values = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); std::tie(solution, residuals, rank, singular_values) = at::linalg_lstsq_outf(input, other, rcond, driver, solution, residuals, rank, singular_values); return std::make_tuple(solution, residuals, rank, singular_values); @@ -3700,7 +4216,7 @@ Tensor _det_lu_based_helper_backward_helper( const Tensor& lu, const Tensor& pivs ) { - auto eps = at::native::_get_epsilon(c10::toValueType(self.scalar_type())); + auto eps = at::native::_get_epsilon(c10::toRealValueType(self.scalar_type())); auto n = self.size(-1); auto eps_tensor = at::tensor(eps, self.options()); auto condition_diagonal = [&](const Tensor& x) { @@ -3781,6 +4297,114 @@ Tensor _det_lu_based_helper_backward_helper( } } +DEFINE_DISPATCH(ldl_factor_stub); + +TORCH_IMPL_FUNC(linalg_ldl_factor_ex_out) +(const Tensor& self, + bool hermitian, + bool check_errors, + const Tensor& LD, + const Tensor& pivots, + const Tensor& info) { + // LAPACK workspace query segfalts if the input has 0 in batch dimensions. + if (self.numel() == 0) { + info.zero_(); + return; + } + + auto pivots_ = pivots.expect_contiguous(); + auto info_ = info.expect_contiguous(); + + auto LD_ = at::native::borrow_else_clone( + LD.mT().is_contiguous(), LD, self, /*row_major=*/false); + if (LD.mT().is_contiguous()) { + LD_->copy_(self); + } + + // We decided not to include upper flag in the API. 
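// Illustrative usage sketch (not part of this patch), using the
// at::linalg_ldl_factor_ex / at::linalg_ldl_solve entry points added in this change:
//   at::Tensor A = at::randn({3, 3}, at::kDouble);
//   A = A + A.mT();                            // make A symmetric
//   at::Tensor LD, pivots, info;
//   std::tie(LD, pivots, info) =
//       at::linalg_ldl_factor_ex(A, /*hermitian=*/false, /*check_errors=*/true);
//   at::Tensor B = at::randn({3, 2}, at::kDouble);
//   at::Tensor X = at::linalg_ldl_solve(LD, pivots, B, /*hermitian=*/false);
//   // For a nonsingular A, A.matmul(X) reconstructs B up to rounding error.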
+ // https://github.com/pytorch/pytorch/pull/69828#issuecomment-1015143819 + // We can revisit this decision later and remove upper completely + // also from low level functions or add it to the public API. + bool upper = false; + if (upper) { + LD_->triu_(); + } else { + LD_->tril_(); + } + + // call ldl_factor_stub that fills the result tensors + ldl_factor_stub( + self.device().type(), *LD_, *pivots_, *info_, upper, hermitian); + + if (!LD.is_same(*LD_)) { + LD.copy_(*LD_); + } + if (!info.is_same(*info_)) { + info.copy_(*info_); + } + if (!pivots.is_same(*pivots_)) { + pivots.copy_(*pivots_); + } + + if (check_errors) { + at::_linalg_check_errors( + info, "torch.linalg.ldl_factor_ex", self.dim() == 2); + } +} + +std::tuple linalg_ldl_factor_out( + const Tensor& self, + bool hermitian, + Tensor& LD, + Tensor& pivots) { + auto info = at::empty({0}, self.options().dtype(kInt)); + // We pass check_errors as we want to use lu_factor rather than lu_factor_ex + // in the errors + at::linalg_ldl_factor_ex_outf( + self, hermitian, /*check_errors=*/false, LD, pivots, info); + at::_linalg_check_errors(info, "torch.linalg.ldl_factor", self.dim() == 2); + return std::tie(LD, pivots); +} + +std::tuple linalg_ldl_factor( + const Tensor& self, + bool hermitian) { + Tensor LD, pivots, info; + std::tie(LD, pivots, info) = + at::linalg_ldl_factor_ex(self, hermitian, /*check_errors=*/false); + at::_linalg_check_errors(info, "torch.linalg.ldl_factor", self.dim() == 2); + return std::make_tuple(std::move(LD), std::move(pivots)); +} + +DEFINE_DISPATCH(ldl_solve_stub); + +TORCH_IMPL_FUNC(linalg_ldl_solve_out) +(const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian, + const Tensor& result) { + if (LD.numel() == 0 || pivots.numel() == 0) { + return; + } + + auto pivots_ = pivots.expect_contiguous(); + + auto LD_ = at::native::borrow_else_clone( + LD.mT().is_contiguous(), LD, LD, /*row_major=*/false); + result.copy_(B); + auto result_ = at::native::borrow_else_clone( + result.mT().is_contiguous(), result, result, /*row_major=*/false); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(result) == batchCount(*result_)); + + ldl_solve_stub( + B.device().type(), *LD_, *pivots_, *result_, false, hermitian); + + if (!result.is_same(*result_)) { + result.copy_(*result_); + } +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ solve_triangular ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ namespace { void checkIsMatrix(const Tensor& t, @@ -4037,4 +4661,28 @@ Tensor linalg_solve_triangular( return out; } +Tensor linalg_vander( + const Tensor& x, + c10::optional N) { + auto t = x.scalar_type(); + TORCH_CHECK(t == ScalarType::Float || + t == ScalarType::Double || + t == ScalarType::ComplexFloat || + t == ScalarType::ComplexDouble || + isIntegralType(t), + "linalg.vander supports floating point, complex, and integer tensors, but got ", t); + const auto x_ = x.dim() == 0 ? 
x.unsqueeze(-1) : x; + + auto shape = x_.sizes().vec(); + const auto n = N.value_or(shape.back()); + TORCH_CHECK(n > 1, "N must be greater than 1."); + + // Append cumprod of the oher 0...n-1 powers + shape.push_back(n - 1); + auto result = at::cumprod(x_.unsqueeze(-1).expand(shape), -1); + // The row of ones + shape.back() = 1LL; + auto ones = result.new_ones(shape); + return at::cat({ones, result}, /*dim=*/ -1); +} }} // namespace at::native diff --git a/aten/src/ATen/native/BatchLinearAlgebra.h b/aten/src/ATen/native/BatchLinearAlgebra.h index f2e4057ad0de..667a6ad793fa 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.h +++ b/aten/src/ATen/native/BatchLinearAlgebra.h @@ -1,11 +1,14 @@ #pragma once -#include +#include #include #include -#include -#include +#include +// Forward declare TI +namespace at { +struct TensorIterator; +} namespace at { namespace native { @@ -161,6 +164,52 @@ void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, template void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info); +template +void lapackLdlHermitian( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSymmetric( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* b, + int ldb, + int* info); + +template +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* b, + int ldb, + int* info); + template void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info); #endif @@ -228,6 +277,12 @@ using lu_factor_fn = void (*)( bool /*compute_pivots*/); DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub); +using unpack_pivots_fn = void(*)( + TensorIterator& iter, + const int64_t dim_size +); +DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub); + using lu_solve_fn = void (*)( const Tensor& /*b*/, const Tensor& /*lu*/, @@ -241,6 +296,14 @@ using lu_solve_trans_fn = void (*)( TransposeType /*trans*/); DECLARE_DISPATCH(lu_solve_trans_fn, lu_solve_trans_stub); +using ldl_factor_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*info*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub); + using svd_fn = void (*)( const Tensor& /*A*/, const bool /*full_matrices*/, @@ -251,4 +314,11 @@ using svd_fn = void (*)( const Tensor& /*info*/); DECLARE_DISPATCH(svd_fn, svd_stub); +using ldl_solve_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*result*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub); }} // namespace at::native diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 2bfac093f22c..b2c52afc4cc9 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -149,7 +149,7 @@ void apply_eig(const Tensor& self, bool eigenvectors, Tensor& vals_, Tensor& vec Tensor rwork; value_t* rwork_data = nullptr; if (self.is_complex()) { - ScalarType real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType real_dtype = 
toRealValueType(typeMetaToScalarType(self.dtype())); rwork = at::empty({n*2}, self.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -242,7 +242,7 @@ void apply_linalg_eig(Tensor& values, Tensor& vectors, Tensor& input, Tensor& in Tensor rwork; value_t* rwork_data = nullptr; if (input.is_complex()) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); rwork = at::empty({lda * 2}, input.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -647,7 +647,7 @@ void apply_lstsq(const Tensor& A, Tensor& B, Tensor& rank, Tensor& singular_valu default: rwork_len = std::max(1, rwork_opt); } - rwork = at::empty({rwork_len}, A.options().dtype(c10::toValueType(A.scalar_type()))); + rwork = at::empty({rwork_len}, A.options().dtype(c10::toRealValueType(A.scalar_type()))); rwork_data = rwork.data_ptr(); } @@ -833,6 +833,137 @@ void triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool u }); } +template +void apply_ldl_factor( + const Tensor& A, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { +#if !AT_BUILD_WITH_LAPACK() + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_factor on a CPU tensor requires compiling ", + "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0); + auto batch_size = batchCount(A); + auto n = A.size(-2); + auto leading_dim = A.stride(-1); + auto uplo = upper ? 'U' : 'L'; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto pivots_data = pivots.data_ptr(); + auto info_data = info.data_ptr(); + + auto ldl_func = + hermitian ? lapackLdlHermitian : lapackLdlSymmetric; + + scalar_t wkopt; + ldl_func(uplo, n, a_data, leading_dim, pivots_data, &wkopt, -1, info_data); + using value_t = typename c10::scalar_value_type::type; + int lwork = std::max(1, real_impl(wkopt)); + Tensor work = at::empty({lwork}, A.dtype()); + auto work_data = work.data_ptr(); + + for (const auto i : c10::irange(batch_size)) { + scalar_t* a_working_ptr = &a_data[i * a_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + auto* info_working_ptr = &info_data[i]; + ldl_func( + uplo, + n, + a_working_ptr, + leading_dim, + pivots_working_ptr, + work_data, + lwork, + info_working_ptr); + } +#endif +} + +void ldl_factor_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_factor_kernel_cpu", [&] { + apply_ldl_factor(LD, pivots, info, upper, hermitian); + }); +} + +template +void apply_ldl_solve( + const Tensor& A, + const Tensor& pivots, + const Tensor& B, + bool upper, + bool hermitian) { +#if !AT_BUILD_WITH_LAPACK() + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_factor on a CPU tensor requires compiling ", + "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(pivots.unsqueeze(-1)) > 0); + auto batch_size = batchCount(B); + auto n = A.size(-2); + auto nrhs = B.size(-1); + auto lda = A.stride(-1); + auto ldb = B.stride(-1); + auto uplo = upper ? 'U' : 'L'; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? 
pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto b_data = B.data_ptr(); + auto pivots_ = pivots.to(kInt); + auto pivots_data = pivots_.data_ptr(); + + auto ldl_solve_func = hermitian ? lapackLdlSolveHermitian + : lapackLdlSolveSymmetric; + + int info = 0; + for (const auto i : c10::irange(batch_size)) { + scalar_t* a_working_ptr = &a_data[i * a_stride]; + scalar_t* b_working_ptr = &b_data[i * b_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + ldl_solve_func( + uplo, + n, + nrhs, + a_working_ptr, + lda, + pivots_working_ptr, + b_working_ptr, + ldb, + &info); + } + TORCH_INTERNAL_ASSERT(info == 0); +#endif +} + +void ldl_solve_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& result, + bool upper, + bool hermitian) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_solve_kernel_cpu", [&] { + apply_ldl_solve(LD, pivots, result, upper, hermitian); + }); +} + /* Computes the LU decomposition of a m×n matrix or batch of matrices in 'input' tensor. This is an in-place routine, content of 'input', 'pivots', and 'infos' is overwritten. @@ -851,7 +982,7 @@ void apply_lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& in #if !AT_BUILD_WITH_LAPACK() TORCH_CHECK( false, - "Calling torch.lu on a CPU tensor requires compiling ", + "Calling torch.linalg.lu_factor on a CPU tensor requires compiling ", "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); #else TORCH_CHECK(compute_pivots, "linalg.lu_factor: LU without pivoting is not implemented on the CPU"); @@ -908,8 +1039,8 @@ void apply_lu_solve(const Tensor& b, const Tensor& lu, const Tensor& pivots, Tra const auto trans = to_blas(transpose); auto pivots_data = pivots.data_ptr(); auto b_stride = matrixStride(b); - auto lu_stride = matrixStride(lu); - auto pivots_stride = pivots.size(-1); + auto lu_stride = lu.dim() > 2 ? lu.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; auto batch_size = batchCount(b); auto n = lu.size(-2); @@ -917,10 +1048,19 @@ void apply_lu_solve(const Tensor& b, const Tensor& lu, const Tensor& pivots, Tra auto leading_dimension = std::max(1, n); int info = 0; + + // lu and pivots tensors can be broadcast to b + // here we construct a helper indexing tensor to linearly index into lu and pivots + IntArrayRef lu_batch_shape(lu.sizes().data(), lu.dim() - 2); + IntArrayRef b_batch_shape(b.sizes().data(), b.dim() - 2); + BroadcastLinearIndices lu_index( + batchCount(lu), lu_batch_shape, b_batch_shape); + for (const auto i : c10::irange(batch_size)) { + int64_t lu_index_i = lu_index(i); scalar_t* b_working_ptr = &b_data[i * b_stride]; - scalar_t* lu_working_ptr = &lu_data[i * lu_stride]; - int* pivots_working_ptr = &pivots_data[i * pivots_stride]; + scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; + int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; lapackLuSolve(trans, n, nrhs, lu_working_ptr, leading_dimension, pivots_working_ptr, b_working_ptr, leading_dimension, &info); @@ -1021,6 +1161,32 @@ void svd_kernel(const Tensor& A, }); } +void unpack_pivots_cpu_kernel(TensorIterator& iter, const int64_t dim_size) { + auto loop = [&](char* const* const data, const int64_t* const strides, const int64_t nelems) { + auto* perm_ptr = data[0]; + const auto* pivots_ptr = data[1]; + + for (const auto elem : c10::irange(nelems)) { + (void)elem; //Suppress unused variable warning + // WARNING: linalg.lu_factor returns int32 pivots, + // this behavior could change in the future. 
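// Illustrative aside (not part of this patch): the swap loop below turns LAPACK's
// 1-based interchange pivots into a 0-based permutation by replaying the recorded
// row swaps on an identity permutation. Worked example for a 3x3 factorization
// with pivots = {3, 3, 3}:
//   perm = {0, 1, 2}
//   i = 0: swap(perm[0], perm[2]) -> {2, 1, 0}
//   i = 1: swap(perm[1], perm[2]) -> {2, 0, 1}
//   i = 2: swap(perm[2], perm[2]) -> {2, 0, 1}   (final permutation)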
+ const auto perm_data = reinterpret_cast(perm_ptr); + const auto pivots_data = reinterpret_cast(pivots_ptr); + + for (const auto i : c10::irange(dim_size)) { + std::swap( + perm_data[i], + perm_data[pivots_data[i] - 1] + ); + } + + perm_ptr += strides[0]; + pivots_ptr += strides[1]; + } + }; + + iter.for_each(loop); +} } // anonymous namespace REGISTER_ARCH_DISPATCH(cholesky_stub, DEFAULT, &cholesky_kernel); @@ -1089,6 +1255,18 @@ REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel); REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel); REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel); +REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel); +REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); +REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); +REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); +REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel); + +REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel); +REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); +REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); +REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); +REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel); + REGISTER_ARCH_DISPATCH(lu_solve_trans_stub, DEFAULT, &lu_solve_trans_kernel); REGISTER_AVX512_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_kernel); REGISTER_AVX2_DISPATCH(lu_solve_trans_stub, &lu_solve_trans_kernel); @@ -1106,4 +1284,10 @@ REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel); REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel); REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel); REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel); + +REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel); +REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); +REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); +REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); +REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index bdd6c87403e3..e6ba1dc47428 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -21,10 +21,11 @@ namespace native { static void check_convert(const Scalar& scalar, ScalarType scalarType) { // Validate that is possible to convert scalar to tensor dtype without // overflow - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, + at::ScalarType::ComplexHalf, scalarType, "check_convert", [&] { scalar.to(); }); @@ -232,10 +233,9 @@ CREATE_COMPARISON_SCALAR_TENSOR_META_FUNC(ge); namespace native { -DEFINE_DISPATCH(add_stub); DEFINE_DISPATCH(add_clamp_stub); -DEFINE_DISPATCH(sub_stub); DEFINE_DISPATCH(mul_stub); +DEFINE_DISPATCH(sub_stub); DEFINE_DISPATCH(div_true_stub); DEFINE_DISPATCH(div_floor_stub); DEFINE_DISPATCH(div_trunc_stub); @@ -277,17 +277,10 @@ DEFINE_DISPATCH(xlogy_stub); DEFINE_DISPATCH(xlog1py_stub); DEFINE_DISPATCH(zeta_stub); -TORCH_IMPL_FUNC(add_out) ( - const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& result -) { - add_stub(device_type(), *this, alpha); - TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); -} - TORCH_IMPL_FUNC(sub_out) ( const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& result ) { - sub_stub(device_type(), *this, alpha); 
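// Illustrative aside (not part of this patch): the replacement line below routes
// subtraction through the add kernel with a negated alpha, using the identity
// self - alpha * other == self + (-alpha) * other, e.g.:
//   at::Tensor a = at::full({}, 5.0);
//   at::Tensor b = at::full({}, 2.0);
//   at::sub(a, b, /*alpha=*/3.0);   // -1.0, the same value as at::add(a, b, -3.0)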
+ add_stub(device_type(), *this, -alpha); TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); } @@ -626,6 +619,11 @@ Tensor& mul_(Tensor& self, const Scalar& other) { return at::mul_out(self, wrapped_scalar_tensor(other), self); // redispatch! } +Tensor& mul__scalar_sparse_csr(Tensor& self, const Scalar& other) { + self.values().mul_(other); + return self; +} + Device correct_out_device(const Tensor& self, const Tensor& other) { if (self.device() == at::kCPU){ return other.device(); @@ -643,8 +641,6 @@ Tensor mul_zerotensor(const Tensor& self, const Tensor& other) { } Tensor div_zerotensor(const Tensor& self, const Tensor& other) { - TORCH_INTERNAL_ASSERT(self._is_zerotensor() || other._is_zerotensor()); - auto out_device = correct_out_device(self, other); // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); @@ -672,7 +668,7 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) { } } -Tensor add_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alpha) { +Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) { auto out_device = correct_out_device(self, other); // hack to use the TensorIterator to get the correct broadcasting and type promotion logic auto device_ = Device(DeviceType::Meta); @@ -694,6 +690,33 @@ Tensor add_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alp return get_out_like(self); } } +Tensor add_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alpha) { + return maybe_add_maybe_sub(self, other, alpha); +} + +Tensor sub_zerotensor(const Tensor& self, const Tensor& other, const Scalar& alpha) { + return maybe_add_maybe_sub(self, other, -alpha); +} + +Tensor linalg_cross_zerotensor( + const Tensor& input, + const Tensor& other, + const int64_t dim) +{ + auto out_device = correct_out_device(input, other); + // hack to use the TensorIterator to get the correct broadcasting and type + // promotion logic (see add_zerotensor) + auto device = Device(DeviceType::Meta); + auto meta_out = at::redispatch::linalg_cross( + c10::DispatchKeySet(at::DispatchKey::Meta), + input.to(device), + other.to(device), + dim); + + return at::_efficientzerotensor( + meta_out.sizes(), + meta_out.options().device(out_device)); +} // multiply, alias for mul Tensor& multiply_out(const Tensor& self, const Tensor& other, Tensor& result) { @@ -791,6 +814,10 @@ Tensor bitwise_and(const Tensor& self, const Scalar& other) { return at::bitwise_and(self, wrapped_scalar_tensor(other)); } +Tensor bitwise_and(const Scalar& self, const Tensor& other) { + return at::bitwise_and(wrapped_scalar_tensor(self), other); +} + Tensor& bitwise_and_(Tensor& self, const Scalar& other) { return self.bitwise_and_(wrapped_scalar_tensor(other)); } @@ -820,6 +847,10 @@ Tensor bitwise_or(const Tensor& self, const Scalar& other) { return at::bitwise_or(self, wrapped_scalar_tensor(other)); } +Tensor bitwise_or(const Scalar& self, const Tensor& other) { + return at::bitwise_or(wrapped_scalar_tensor(self), other); +} + Tensor& bitwise_or_(Tensor& self, const Scalar& other) { return self.bitwise_or_(wrapped_scalar_tensor(other)); } @@ -849,6 +880,10 @@ Tensor bitwise_xor(const Tensor& self, const Scalar& other) { return at::bitwise_xor(self, wrapped_scalar_tensor(other)); } +Tensor bitwise_xor(const Scalar& self, const Tensor& other) { + return at::bitwise_xor(wrapped_scalar_tensor(self), other); +} + Tensor& bitwise_xor_(Tensor& self, const Scalar& 
other) { return self.bitwise_xor_(wrapped_scalar_tensor(other)); } @@ -879,7 +914,7 @@ Tensor __lshift__(const Tensor& self, const Tensor& other) { Tensor __lshift__(const Tensor& self, const Scalar& other) { Tensor result; - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(result, self, wrapper); lshift_stub(iter.device_type(), iter); return iter.output(); @@ -892,7 +927,7 @@ Tensor& __ilshift__(Tensor& self, const Tensor& other) { } Tensor& __ilshift__(Tensor& self, const Scalar& other) { - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(self, self, wrapper); lshift_stub(iter.device_type(), iter); return self; @@ -903,19 +938,19 @@ TORCH_IMPL_FUNC(bitwise_left_shift_out) (const Tensor& self, const Tensor& other } Tensor& bitwise_left_shift_out(const Tensor& self, const Scalar& other, Tensor& result) { - return at::bitwise_left_shift_out(result, self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_left_shift_out(result, self, wrapped_scalar_tensor(other)); } Tensor bitwise_left_shift(const Tensor& self, const Scalar& other) { - return at::bitwise_left_shift(self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_left_shift(self, wrapped_scalar_tensor(other)); } Tensor& bitwise_left_shift_(Tensor& self, const Scalar& other) { - return at::bitwise_left_shift_out(self, self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_left_shift_out(self, self, wrapped_scalar_tensor(other)); } Tensor bitwise_left_shift(const Scalar& self, const Tensor& other) { - return at::bitwise_left_shift(wrapped_scalar_tensor(self).toType(other.scalar_type()), other); + return at::bitwise_left_shift(wrapped_scalar_tensor(self), other); } Tensor __rshift__(const Tensor& self, const Tensor& other) { @@ -927,7 +962,7 @@ Tensor __rshift__(const Tensor& self, const Tensor& other) { Tensor __rshift__(const Tensor& self, const Scalar& other) { Tensor result; - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(result, self, wrapper); rshift_stub(iter.device_type(), iter); return iter.output(); @@ -940,7 +975,7 @@ Tensor& __irshift__(Tensor& self, const Tensor& other) { } Tensor& __irshift__(Tensor& self, const Scalar& other) { - auto wrapper = wrapped_scalar_tensor(other).toType(self.scalar_type()); + auto wrapper = wrapped_scalar_tensor(other); auto iter = TensorIterator::binary_op(self, self, wrapper); rshift_stub(iter.device_type(), iter); return self; @@ -951,19 +986,19 @@ TORCH_IMPL_FUNC(bitwise_right_shift_out) (const Tensor& self, const Tensor& othe } Tensor& bitwise_right_shift_out(const Tensor& self, const Scalar& other, Tensor& result) { - return at::bitwise_right_shift_out(result, self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_right_shift_out(result, self, wrapped_scalar_tensor(other)); } Tensor bitwise_right_shift(const Tensor& self, const Scalar& other) { - return at::bitwise_right_shift(self, wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_right_shift(self, wrapped_scalar_tensor(other)); } Tensor& bitwise_right_shift_(Tensor& self, const Scalar& other) { - return at::bitwise_right_shift_out(self, self, 
wrapped_scalar_tensor(other).toType(self.scalar_type())); + return at::bitwise_right_shift_out(self, self, wrapped_scalar_tensor(other)); } Tensor bitwise_right_shift(const Scalar& self, const Tensor& other) { - return at::bitwise_right_shift(wrapped_scalar_tensor(self).toType(other.scalar_type()), other); + return at::bitwise_right_shift(wrapped_scalar_tensor(self), other); } template diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index aea2a125bb07..f34f210c4e48 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -50,7 +50,9 @@ using binary_fn = void(*)(TensorIterator&); using binary_clamp_fn_alpha = void(*)(TensorIterator&, const Scalar& alpha, const Scalar& min_val, const Scalar& max_val); +// NB: codegenned DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub); + DECLARE_DISPATCH(binary_clamp_fn_alpha, add_clamp_stub); DECLARE_DISPATCH(structured_binary_fn_alpha, sub_stub); DECLARE_DISPATCH(structured_binary_fn, mul_stub); @@ -84,7 +86,7 @@ DECLARE_DISPATCH(binary_fn_double, huber_stub); DECLARE_DISPATCH(structured_binary_fn, sigmoid_backward_stub); DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub); DECLARE_DISPATCH(structured_binary_fn, tanh_backward_stub); -DECLARE_DISPATCH(binary_fn, mse_stub); +DECLARE_DISPATCH(structured_binary_fn, mse_stub); DECLARE_DISPATCH(structured_binary_fn, fmod_stub); DECLARE_DISPATCH(structured_binary_fn, logaddexp_stub); DECLARE_DISPATCH(structured_binary_fn, logaddexp2_stub); diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 04a12cb4e400..26c3804219e0 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -165,7 +165,7 @@ Tensor dot(const Tensor &self, const Tensor &other){ return r; } - return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] { + return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] { Tensor result = at::empty({}, self.options()); result.fill_(dot_impl(self.numel(), self.data_ptr(), self.stride(0), other.data_ptr(), other.stride(0))); return result; diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index 728222090542..15d30c137d5b 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index 7122723cf1ed..e23fa1267807 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -1,7 +1,14 @@ #pragma once -#include +#include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index a789f58140db..80248fb23392 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index e9efd4b7c88d..88668d13145c 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -40,7 +40,7 @@ Tensor _view_as_real_physical(const Tensor& self) { new_sizes.back() = 2; auto new_strides = computeStrideForViewAsReal(self.strides()); auto new_storage_offset = 2 
* self.storage_offset(); - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); auto real_tensor = view_tensor(self, float_type, new_storage_offset, new_sizes, new_strides); return real_tensor; } diff --git a/aten/src/ATen/native/ConstantPadNd.cpp b/aten/src/ATen/native/ConstantPadNd.cpp deleted file mode 100644 index f7a2d76ed522..000000000000 --- a/aten/src/ATen/native/ConstantPadNd.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include - -#include - -namespace at { namespace native { - -Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) { - TORCH_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", - pad.size()); - - auto input_sizes = self.sizes(); - auto l_inp = self.dim(); - - auto l_pad = pad.size() / 2; - auto l_diff = l_inp - l_pad; - TORCH_CHECK(l_inp >= (int64_t)l_pad, "Length of pad should be no more than twice the number of " - "dimensions of the input. Pad length is ", pad.size(), "while the input has ", - l_inp, "dimensions."); - - std::vector new_shape; - - bool all_pads_non_positive = true; - - auto c_input = self; - for (const auto i : c10::irange(l_diff, l_inp)) { - auto pad_idx = 2 * (l_inp - i - 1); - if (pad[pad_idx] < 0) { - c_input = c_input.narrow(i, -pad[pad_idx], c_input.size(i) + pad[pad_idx]); - } else if (pad[pad_idx] != 0) { - all_pads_non_positive = false; - } - if (pad[pad_idx + 1] < 0) { - c_input = c_input.narrow(i, 0, c_input.size(i) + pad[pad_idx + 1]); - } else if (pad[pad_idx + 1] != 0) { - all_pads_non_positive = false; - } - } - - // if none of the pads are positive we can optimize and just return the result - // of calling .narrow() on the input - if (all_pads_non_positive) { - return c_input.clone(); - } - - - for (size_t i = 0; i < (size_t)l_diff; i ++) { - new_shape.emplace_back(input_sizes[i]); - } - - for (const auto i : c10::irange((size_t)l_pad)) { - auto pad_idx = pad.size() - ((i + 1) * 2); - auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", - pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " - "which is invalid. 
Check dimension ", l_diff + i, " of your input."); - new_shape.emplace_back(new_dim); - } - - at::Tensor output; - const auto memory_format = self.suggest_memory_format(); - if (self.is_quantized()) { - const auto qscheme = self.qscheme(); - TORCH_CHECK(qscheme == kPerTensorAffine || qscheme == kPerTensorSymmetric, - "Only per-tensor padding is supported."); - output = at::_empty_affine_quantized( - new_shape, self.options().memory_format(memory_format), - self.q_scale(), self.q_zero_point(), c10::nullopt); - } else { - output = at::empty(new_shape, self.options().memory_format(memory_format)); - } - output.fill_(value); - - auto c_output = output; - for (const auto i : c10::irange(l_diff, l_inp)) { - auto pad_idx = 2 * (l_inp - i - 1); - if (pad[pad_idx] > 0) { - c_output = c_output.narrow(i, pad[pad_idx], c_output.size(i) - pad[pad_idx]); - } - if (pad[pad_idx + 1] > 0) { - c_output = c_output.narrow(i, 0, c_output.size(i) - pad[pad_idx + 1]); - } - } - c_output.copy_(c_input); - return output; -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 74b87e76b39f..8493deba7b33 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -19,6 +20,10 @@ using cudnn_convolution_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); +using mps_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub); using cudnn_convolution_transpose_backward_fn = std::tuple(*)( const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); @@ -56,6 +61,25 @@ using slow_conv_transpose3d_backward_fn = std::tuple); DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub); +namespace { + static bool cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true; +} + +static inline bool cudnnv8_enabled_check_debug() { + static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_ENABLED") == true; + static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true; + static uint8_t cudnnv8_debugcount = 0; + if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) { + TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8_FLAG: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b); + cudnnv8_debugcount++; + } + return cudnnv8_flag == 1; +} + +static inline bool cudnnv8_use_heur_mode_b() { + return cudnnv8_heuristic_mode_b; +} + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) struct ConvParams { std::vector stride; @@ -85,7 +109,8 @@ struct ConvParams { bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const; bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const; bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight, - const c10::optional bias_sizes_opt) const; + const at::OptionalIntArrayRef bias_sizes_opt) const; + bool use_mps(const at::Tensor& input, const at::Tensor& weight) const; bool 
is_depthwise(const at::Tensor& input, const at::Tensor& weight) const; }; @@ -109,7 +134,9 @@ enum class ConvBackend { SlowTranspose2d, SlowTranspose3d, Winograd3x3Depthwise, - Xnnpack2d + Xnnpack2d, + Mps, + MpsTranspose, }; // Function to select the convolution backend based on the inputs and params. @@ -120,7 +147,7 @@ enum class ConvBackend { TORCH_API ConvBackend select_conv_backend( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const bool need_backward, const ConvParams& params); @@ -147,6 +174,69 @@ constexpr int weight_input_channels_dim = 1; // Often written as 2 + max_dim (extra dims for batch size and channels) constexpr int max_dim = 3; +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) +{ + TORCH_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + TORCH_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + AT_ERROR(ss.str()); + } +} + + +// NOTE [ Convolution checks ] +// +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) 
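As context for the shape checks above (and the conv_output_size / conv_input_size note that follows), here is a minimal standalone illustration of the size arithmetic involved; the helper names and values are illustrative only, not the ATen functions. The forward formula loses information through its floor division, which is why recovering an input size for the transposed direction needs an explicit output_padding.

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins for the 1-D size relations behind the shape checks.
int64_t output_size_1d(int64_t in, int64_t kernel, int64_t stride,
                       int64_t pad, int64_t dilation) {
  // floor((in + 2*pad - dilation*(kernel-1) - 1) / stride) + 1
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

int64_t input_size_1d(int64_t out, int64_t kernel, int64_t stride,
                      int64_t pad, int64_t dilation, int64_t output_padding) {
  // Inverse of the formula above, up to the information lost by the floor;
  // output_padding selects which of the candidate input sizes is meant.
  return (out - 1) * stride - 2 * pad + dilation * (kernel - 1) + 1 + output_padding;
}

int main() {
  // Inputs of size 7 and 8 both map to an output of size 4 with
  // kernel=3, stride=2, pad=1, dilation=1 ...
  std::printf("%lld %lld\n",
              (long long)output_size_1d(7, 3, 2, 1, 1),
              (long long)output_size_1d(8, 3, 2, 1, 1));  // prints: 4 4
  // ... so the inverse needs output_padding to disambiguate.
  std::printf("%lld %lld\n",
              (long long)input_size_1d(4, 3, 2, 1, 1, /*output_padding=*/0),   // 7
              (long long)input_size_1d(4, 3, 2, 1, 1, /*output_padding=*/1));  // 8
  return 0;
}
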
+static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + // NB: conv_output_size and conv_input_size are not bijections, // as conv_output_size loses information; this is why conv_input_size // takes an extra output_padding argument to resolve the ambiguity. @@ -270,4 +360,42 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; } +static inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // disable NHWC for float64 input. + if (input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + // disable NHWC for MkldnnCPU tensor. + if (input.is_mkldnn() || weight.is_mkldnn()) { + return false; + } + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_mkldnn_channels_last_2d = + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast); + + // TODO: add channels last 3d support + bool can_use_mkldnn_channels_last_3d = false; + + return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d; +} + +static inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_thnn_channels_last_2d = input.device().is_cpu() && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || ( + weight_memory_format == at::MemoryFormat::ChannelsLast)); + + return can_use_thnn_channels_last_2d; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 5a3275239716..a6127a53577b 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -19,6 +19,10 @@ #include #endif +#if AT_MKLDNN_ENABLED() +#include +#endif + constexpr int MIOPEN_DIM_MAX = 5; namespace at { namespace native { @@ -190,8 +194,8 @@ auto ConvParams::use_cudnn(const at::Tensor& input, const at::Tensor& weight) co if (!input.is_cuda() || !cudnn_enabled) { return false; } - if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { - return false; + if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { + return at::native::cudnnv8_enabled_check_debug(); } if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) { // bypass dilation checks for channels_last convolution @@ -209,6 +213,22 @@ auto ConvParams::use_cudnn(const at::Tensor& input, const at::Tensor& weight) co #endif } +auto ConvParams::use_mps( const at::Tensor& input, const at::Tensor& weight) const -> bool { + // These checks 
need to be expanded. Currently we have very limited set of + // checks for MPS. +#ifdef USE_MPS + if (needs_64bit_indexing_no_split(input, weight)) { + return false; + } + if (!input.is_mps()) { + return false; + } + return true; +#else + return false; +#endif +} + auto ConvParams::use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const -> bool { if (needs_64bit_indexing_no_split(input, weight)) { return false; @@ -228,6 +248,9 @@ auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) c if (!at::globalContext().userEnabledMkldnn()) { return false; } + if (input.device().is_cpu() && input.scalar_type() == kBFloat16 && mkldnn_bf16_device_check()) { + return true; + } return (input.is_mkldnn()) || // input is mkldnn Tensor (input.device().is_cpu() && input.scalar_type() == kFloat && // only on CPU Float Tensors @@ -267,7 +290,7 @@ auto ConvParams::use_nnpack(const at::Tensor& input, const at::Tensor& weight) c auto ConvParams::use_xnnpack( const at::Tensor& input, const at::Tensor& weight, - const c10::optional bias_sizes_opt) const -> bool { + const at::OptionalIntArrayRef bias_sizes_opt) const -> bool { #if defined(C10_MOBILE) if (!transposed) { return (input.size(1) == groups) && @@ -629,6 +652,25 @@ static void check_input_same_type_as_parameters( check_input_same_type_as_parameters(input, weight, /*bias=*/ Tensor()); } +static void check_input_same_type_as_parameters( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + const ConvBackend backend) { + if (backend == ConvBackend::Mkldnn) { + TORCH_CHECK(input.options().type_equal(weight.options()) + || (input.is_mkldnn() && weight.device().is_cpu() && weight.scalar_type() == kFloat), + "Input type (", input.toString(), ") and weight type (", weight.toString(), + ") should be the same or input should be a MKLDNN tensor and weight is a dense tensor"); + TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())) + || (input.is_mkldnn() && bias.device().is_cpu() && bias.scalar_type() == kFloat), + "Input type (", input.toString(), ") and bias type (", bias.toString(), + ") should be the same or input should be a MKLDNN tensor and bias is a dense tensor"); + } else { + check_input_same_type_as_parameters(input, weight, bias); + } +} + static auto view4d(const at::Tensor& tensor) -> at::Tensor { TORCH_CHECK(tensor.ndimension() == 3, "expected 3D tensor, got tensor with ", tensor.ndimension(), @@ -643,15 +685,97 @@ static auto view3d(const at::Tensor& tensor) -> at::Tensor { return tensor.squeeze(2); } - static at::Tensor subtensor(at::Tensor& tensor, int dim, int groups, int g) { if (!tensor.defined()) { return at::Tensor(); } + const auto memory_format = tensor.suggest_memory_format(); int64_t n = tensor.sizes()[dim] / groups; - return tensor.narrow(dim, n * g, n).contiguous(); + return tensor.narrow(dim, n * g, n).contiguous(memory_format); +} + +namespace { + +std::pair complex_to_real(const Tensor& inp) { + auto inp_view_as_complex = at::view_as_real(inp); + auto dim_i = inp_view_as_complex.dim() - 1; + auto i_r = inp_view_as_complex.select(dim_i, 0); + auto i_i = inp_view_as_complex.select(dim_i, 1); + return std::make_pair(i_r, i_i); +} + +at::Tensor complex_convolution( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + IntArrayRef output_padding, + int64_t groups) { + check_input_same_type_as_parameters(input, weight, bias); + Tensor i_r, i_i, w_r, w_i; + 
std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); + std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + + // [NOTE] Complex Convolution + // conv(W, x, b) = conv(Wr, xr, br) - conv(Wi, xi, 0) + i(conv(Wi, xr, bi) + conv(Wr, xi, 0)) + // where W, x and b are all complex inputs. + // With Gauss Trick: + // a = conv(Wr, xr, br), + // b = conv(Wi, xi, 0), + // c = conv(Wr + Wi, xr + xi, bi + br) + // conv(W, x, b) = a - b + i(c - a - b) + Tensor a, b, c; + if (!bias.defined()) { + a = at::convolution(i_r, w_r, bias, stride, padding, dilation, false, output_padding, groups); + b = at::convolution(i_i, w_i, bias, stride, padding, dilation, false, output_padding, groups); + c = at::convolution(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, false, output_padding, groups); + } else { + Tensor b_r, b_i; + std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + a = at::convolution(i_r, w_r, b_r, stride, padding, dilation, false, output_padding, groups); + b = at::convolution(i_i, w_i, Tensor(), stride, padding, dilation, false, output_padding, groups); + c = at::convolution(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, false, output_padding, groups); + } + + auto i = c10::Scalar(c10::complex(0, 1)); + return a - b + i * (c - a - b); } +at::Tensor complex_convolution_mode( + const at::Tensor& input, + const at::Tensor& weight, + const c10::optional& bias_opt, + at::IntArrayRef stride, + c10::string_view padding, + at::IntArrayRef dilation, + int64_t groups) { + auto bias = bias_opt.value_or(Tensor()); + check_input_same_type_as_parameters(input, weight, bias); + Tensor i_r, i_i, w_r, w_i; + std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); + std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + + // See [NOTE] Complex Convolution + Tensor a, b, c; + if (!bias.defined()) { + a = at::_convolution_mode(i_r, w_r, bias, stride, padding, dilation, groups); + b = at::_convolution_mode(i_i, w_i, bias, stride, padding, dilation, groups); + c = at::_convolution_mode(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); + } else { + Tensor b_r, b_i; + std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + a = at::_convolution_mode(i_r, w_r, b_r, stride, padding, dilation, groups); + b = at::_convolution_mode(i_i, w_i, Tensor(), stride, padding, dilation, groups); + c = at::_convolution_mode(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); + } + + auto i = c10::Scalar(c10::complex(0, 1)); + return a - b + i * (c - a - b); +} + +} // namespace at::Tensor conv1d( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, @@ -663,7 +787,12 @@ at::Tensor conv1d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); - auto output = at::convolution(input, weight, bias, stride, padding, dilation, false, {0}, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution(input, weight, bias, stride, padding, dilation, {0}, groups); + } else { + output = at::convolution(input, weight, bias, stride, padding, dilation, false, {0}, groups); + } return is_batched ? 
output : output.squeeze(0); } @@ -677,7 +806,12 @@ at::Tensor conv2d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); - auto output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0}}, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution(input, weight, bias, stride, padding, dilation, {{0, 0}}, groups); + } else { + output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0}}, groups); + } return is_batched ? output : output.squeeze(0); } @@ -691,7 +825,12 @@ at::Tensor conv3d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); - auto output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0, 0}}, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution(input, weight, bias, stride, padding, dilation, {{0, 0, 0}}, groups); + } else { + output = at::convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0, 0}}, groups); + } return is_batched ? output : output.squeeze(0); } @@ -787,8 +926,12 @@ at::Tensor conv1d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); - auto output = at::_convolution_mode( - input, weight, bias, stride, std::move(padding), dilation, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } else { + output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } return is_batched ? output : output.squeeze(0); } @@ -799,8 +942,12 @@ at::Tensor conv2d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); - auto output = at::_convolution_mode( - input, weight, bias, stride, std::move(padding), dilation, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } else { + output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } return is_batched ? output : output.squeeze(0); } @@ -811,8 +958,12 @@ at::Tensor conv3d( Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); - auto output = at::_convolution_mode( - input, weight, bias, stride, std::move(padding), dilation, groups); + Tensor output; + if (at::isComplexType(input_.scalar_type())) { + output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } else { + output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + } return is_batched ? 
output : output.squeeze(0); } @@ -933,7 +1084,7 @@ ConvBackend select_conv_backend( ConvBackend select_conv_backend( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const bool need_backward, const ConvParams& params) { @@ -1018,6 +1169,12 @@ ConvBackend select_conv_backend( // unsupported } } + } else if (params.use_mps(input, weight)) { + if (params.transposed) { + return ConvBackend::MpsTranspose; + } else { + return ConvBackend::Mps; + } } else { // Only reach here when input is backend with out-of-source implementation. return ConvBackend::Overrideable; @@ -1078,18 +1235,41 @@ static inline std::vector calc_output_size( static inline at::MemoryFormat determine_backend_memory_format( const Tensor& input, - const Tensor& weight) { + const Tensor& weight, + const ConvBackend backend) { at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; auto k = weight.ndimension(); #if !defined(C10_MOBILE) // See Note [Mobile check segfaults] - if (detail::getCUDAHooks().compiledWithCuDNN()) { - backend_memory_format = cudnn_conv_suggest_memory_format(input, weight); - } - if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { - TORCH_INTERNAL_ASSERT((k == 4 || k == 5), - "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + switch(backend) { + case ConvBackend::Cudnn: + case ConvBackend::CudnnTranspose: + if (detail::getCUDAHooks().compiledWithCuDNN()) { + backend_memory_format = cudnn_conv_suggest_memory_format(input, weight); + } + break; + case ConvBackend::Miopen: + case ConvBackend::MiopenDepthwise: + case ConvBackend::MiopenTranspose: + if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { + TORCH_INTERNAL_ASSERT((k == 4 || k == 5), + "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); + backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + } + break; + case ConvBackend::Mkldnn: + if (mkldnn_conv_use_channels_last(input, weight)) { + backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast; + } + break; + case ConvBackend::Slow2d: + case ConvBackend::SlowDilated2d: + if (thnn_conv_use_channels_last(input, weight)) { + backend_memory_format = at::MemoryFormat::ChannelsLast; + } + break; + default: + backend_memory_format = at::MemoryFormat::Contiguous; } #endif return backend_memory_format; @@ -1142,7 +1322,7 @@ at::Tensor _convolution( bool need_backward = GradMode::is_enabled() && (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad())); ConvBackend backend = select_conv_backend(input, weight, bias_sizes_opt, need_backward, params); - at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight); + at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight, backend); // Call the backend. 
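The complex conv1d/conv2d/conv3d paths added above route through complex_convolution, whose [NOTE] Complex Convolution describes the Gauss trick (three real convolutions instead of four). Because a real convolution is bilinear in weight and input and linear in the bias, the identity can be sanity-checked with plain scalar arithmetic; the sketch below is only that algebraic check with made-up values, not the ATen code.

#include <cassert>
#include <complex>
#include <cstdio>

int main() {
  using C = std::complex<double>;
  // Arbitrary example values standing in for W, x and b.
  C W(1.5, -2.0), x(0.75, 3.0), b(-0.5, 1.25);
  double Wr = W.real(), Wi = W.imag();
  double xr = x.real(), xi = x.imag();
  double br = b.real(), bi = b.imag();

  // Gauss trick from the note: a = conv(Wr, xr, br), b = conv(Wi, xi, 0),
  // c = conv(Wr + Wi, xr + xi, br + bi); result = a - b + i(c - a - b).
  double a = Wr * xr + br;
  double g = Wi * xi;  // the note's "b"; renamed to avoid clashing with the bias
  double c = (Wr + Wi) * (xr + xi) + (br + bi);
  C gauss(a - g, c - a - g);

  C direct = W * x + b;  // reference complex result
  std::printf("gauss  = (%g, %g)\n", gauss.real(), gauss.imag());
  std::printf("direct = (%g, %g)\n", direct.real(), direct.imag());
  assert(std::abs(gauss - direct) < 1e-12);
  return 0;
}
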
Tensor output; @@ -1203,18 +1383,11 @@ at::Tensor _convolution( break; case ConvBackend::Mkldnn: #if AT_MKLDNN_ENABLED() - TORCH_CHECK(input.options().type_equal(weight.options()) - || (input.is_mkldnn() && weight.device().is_cpu() && weight.scalar_type() == kFloat), - "Input type (", input.toString(), ") and weight type (", weight.toString(), - ") should be the same or input should be a MKLDNN tensor and weight is a dense tensor"); - TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())) - || (input.is_mkldnn() && bias.device().is_cpu() && bias.scalar_type() == kFloat), - "Input type (", input.toString(), ") and bias type (", bias.toString(), - ") should be the same or input should be a MKLDNN tensor and bias is a dense tensor"); + check_input_same_type_as_parameters(input, weight, bias, backend); if (!input.is_mkldnn()) { // need to ensure contiguous for non-mkldnn tensors - input = input.contiguous(); - weight = weight.contiguous(); + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); bias = bias.defined() ? bias.contiguous() : bias; } output = at::mkldnn_convolution( @@ -1255,11 +1428,12 @@ at::Tensor _convolution( case ConvBackend::SlowDilated3d: case ConvBackend::SlowTranspose2d: case ConvBackend::SlowTranspose3d: + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); if (params.groups == 1) { - output = _convolution_nogroup_backend(input.contiguous(), weight, bias, backend, params); + output = _convolution_nogroup_backend(input, weight, bias, backend, params); } else { std::vector outputs(params.groups); - input = input.contiguous(); for (const auto g : c10::irange(params.groups)) { auto input_g = subtensor(input, 1, params.groups, g); auto weight_g = subtensor(weight, 0, params.groups, g); @@ -1269,6 +1443,41 @@ at::Tensor _convolution( output = at::cat(outputs, 1); } break; + case ConvBackend::Mps: +#ifdef USE_MPS + TORCH_CHECK(input.options().type_equal(weight.options()), + "Input type (", input.toString(), ") and weight type (", weight.toString(), + ") should be the same"); + TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())), + "Input type (", input.toString(), ") and bias type (", bias.toString(), + ") should be the same"); + + output = at::_mps_convolution(input.contiguous(), weight, bias.defined() ? 
bias.contiguous() : bias, + params.padding, params.stride, params.dilation, + params.groups); +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; + case ConvBackend::MpsTranspose: +#ifdef USE_MPS + TORCH_CHECK(input.options().type_equal(weight.options()), + "Input type (", input.toString(), ") and weight type (", weight.toString(), + ") should be the same"); + TORCH_CHECK(!bias.defined() || (input.options().type_equal(bias.options())), + "Input type (", input.toString(), ") and bias type (", bias.toString(), + ") should be the same"); + output = at::_mps_convolution_transpose( + input.contiguous(backend_memory_format), weight, + params.padding, params.output_padding, + params.stride, params.dilation, params.groups); + if (bias.defined()) { + output.add_(reshape_bias(input.dim(), bias)); + } +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; } if (k == 3 && !input.is_mkldnn()) { @@ -1565,7 +1774,7 @@ std::tuple _convolution_backward_nogroup_bac // output_mask: 3-dim boolean array specifying which gradients to compute in input, weight, bias order std::tuple convolution_backward( const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups, std::array output_mask) { auto grad_output = grad_output_; @@ -1617,7 +1826,7 @@ std::tuple convolution_backward( // Select appropriate backend to use. ConvBackend backend = select_conv_backend(input, weight, bias_sizes_opt, /*need_backward=*/ true, params); - at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight); + at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight, backend); // Call the backend. Tensor backend_grad_input, backend_grad_weight, backend_grad_bias; @@ -1651,6 +1860,33 @@ std::tuple convolution_backward( input_weight_output_mask); break; } + case ConvBackend::Mps: + { +#ifdef USE_MPS + check_input_same_type_as_parameters(input, weight); + std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = + at::mps_convolution_backward(input, grad_output, weight, params.padding, + params.stride, params.dilation, params.groups, output_mask); +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; + } + case ConvBackend::MpsTranspose: + { +#ifdef USE_MPS + check_input_same_type_as_parameters(input, weight); + std::array input_weight_output_mask = {output_mask[0], output_mask[1]}; + std::tie(backend_grad_input, backend_grad_weight) = at::mps_convolution_transpose_backward( + // Only make input contiguous when it is necessary for the backwards computation + output_mask[1] ? 
input.contiguous(backend_memory_format) : input, + grad_output, weight, params.padding, params.output_padding, + params.stride, params.dilation, params.groups, input_weight_output_mask); +#else + TORCH_INTERNAL_ASSERT(false, "MPS backend was selected in PyTorch without support"); +#endif + break; + } case ConvBackend::CudnnTranspose: { check_input_same_type_as_parameters(input, weight); @@ -1725,8 +1961,8 @@ std::tuple convolution_backward( TORCH_CHECK(!weight.is_mkldnn(), "The MKLDNN backend does not support weight as an MKLDNN tensor during training"); if (!input.is_mkldnn()) { - input = input.contiguous(); - weight = weight.contiguous(); + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); } std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = mkldnn_convolution_backward_stub(input.device().type(), input, grad_output, weight, params.padding, @@ -1753,7 +1989,8 @@ std::tuple convolution_backward( case ConvBackend::SlowTranspose2d: case ConvBackend::SlowTranspose3d: { - input = input.contiguous(); + input = input.contiguous(backend_memory_format); + weight = weight.contiguous(backend_memory_format); if (params.groups == 1) { std::tie(backend_grad_input, backend_grad_weight, backend_grad_bias) = _convolution_backward_nogroup_backend( diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 30fb04b13615..1837a0d838ea 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -26,26 +26,31 @@ static Tensor compute_columns2d( const int64_t pad_width = padding[1]; const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - const int64_t dim_planes = 1; - const int64_t dim_height = 2; - const int64_t dim_width = 3; - const int64_t n_input_plane = input.size(dim_planes); - const int64_t input_height = input.size(dim_height); - const int64_t input_width = input.size(dim_width); - const int64_t output_height = - (input_height + 2 * pad_height - kernel_height) / stride_height + 1; - const int64_t output_width = - (input_width + 2 * pad_width - kernel_width) / stride_width + 1; const int64_t batch_size = input.size(0); + const int64_t n_input_plane = input.size(1); + const int64_t input_height = input.size(2); + const int64_t input_width = input.size(3); + const int64_t output_height = (input_height + 2 * pad_height - kernel_height) / stride_height + 1; + const int64_t output_width = (input_width + 2 * pad_width - kernel_width) / stride_width + 1; + + bool is_channels_last = input.suggest_memory_format() == at::MemoryFormat::ChannelsLast; Tensor columns; if ((kernel_height == 1) && (stride_height == 1) && (pad_height == 0) && (kernel_width == 1) && (stride_width == 1) && (pad_width == 0)) { // Columns are just a view on the input for the 1x1 kernel special case. - columns = input.view({batch_size, n_input_plane, output_height * output_width}).detach(); + if (is_channels_last) { + columns = input.as_strided({batch_size, output_height * output_width, n_input_plane}, + {output_height * output_width * n_input_plane, n_input_plane, 1}).detach(); + } else { + columns = input.view({batch_size, n_input_plane, output_height * output_width}).detach(); + } } else { - columns = at::empty({batch_size, n_input_plane * kernel_height * kernel_width, - output_height * output_width}, input.options()); + int64_t row = is_channels_last ? 
+ output_height * output_width : n_input_plane * kernel_height * kernel_width; + int64_t col = is_channels_last ? + kernel_height * kernel_width * n_input_plane : output_height * output_width; + columns = at::empty({batch_size, row, col}, input.options()); AT_DISPATCH_ALL_TYPES_AND(kBFloat16, input.scalar_type(), "slow_conv2d_cpu", [&]{ auto input_a = input.accessor(); auto columns_a = columns.accessor(); @@ -69,7 +74,8 @@ static Tensor compute_columns2d( input_height, input_width, output_height, - output_width); + output_width, + is_channels_last); } }); }); @@ -189,12 +195,15 @@ static inline void slow_conv2d_shape_check( } } -static Tensor view_weight_2d(const Tensor& weight_) { - Tensor weight = weight_.contiguous(); +static inline Tensor view_weight_2d(const Tensor& weight_, + at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) { + Tensor weight = weight_.contiguous(memory_format); if (weight.dim() == 4) { const int64_t s1 = weight.size(0); const int64_t s2 = weight.size(1) * weight.size(2) * weight.size(3); - return weight.view({s1, s2}); + return memory_format == at::MemoryFormat::ChannelsLast + ? weight.as_strided({s1, s2}, {s2, 1}) // CL: view as {oc, kh*kw*ic} + : weight.view({s1, s2}); // CF: view as {oc, ic*kh*kw} } else { return weight; } @@ -218,29 +227,50 @@ static void slow_conv2d_update_output_frame( int64_t input_width, int64_t n_output_plane, int64_t output_height, - int64_t output_width) { + int64_t output_width, + bool is_channels_last) { const int beta = has_bias ? 1 : 0; // Compute out = weight * input // Note gemm expects fortran order, so all 3 matrices are transposed. // Swapping argument order cancels this, since C == AB <=> T(C) == T(B)T(A) - const int64_t m = output_height * output_width; - const int64_t n = n_output_plane; - const int64_t k = n_input_plane * kernel_height * kernel_width; - - const int64_t lda = m; - const int64_t ldb = k; - const int64_t ldc = m; - - at::native::cpublas::gemm( - TransposeType::NoTranspose, - TransposeType::NoTranspose, - m, n, k, - static_cast(1), - finput.data(), lda, - weight.data(), ldb, - static_cast(beta), - output.data(), ldc); + if (is_channels_last) { + const int64_t m = n_output_plane; + const int64_t n = output_height * output_width; + const int64_t k = n_input_plane * kernel_height * kernel_width; + + const int64_t lda = k; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + weight.data(), lda, + finput.data(), ldb, + static_cast(beta), + output.data(), ldc); + } else { + const int64_t m = output_height * output_width; + const int64_t n = n_output_plane; + const int64_t k = n_input_plane * kernel_height * kernel_width; + + const int64_t lda = m; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + finput.data(), lda, + weight.data(), ldb, + static_cast(beta), + output.data(), ldc); + } } template @@ -254,27 +284,48 @@ void slow_conv2d_backward_update_grad_input_frame( int64_t stride_height, int64_t stride_width, int64_t pad_height, - int64_t pad_width) { + int64_t pad_width, + bool is_channels_last) { // Compute fgrad_input = weight.T * grad_output.reshape({grad_output.shape(0), -1}) // Note gemm expects fortran order, so all 3 matrices are transposed. 
// Swapping argument order cancels this, since C == AB <=> T(C) == T(B)T(A) - const int64_t m = grad_output.size(1) * grad_output.size(2); - const int64_t n = weight.size(1); - const int64_t k = weight.size(0); - - const int64_t lda = m; - const int64_t ldb = n; - const int64_t ldc = m; - - at::native::cpublas::gemm( - TransposeType::NoTranspose, - TransposeType::Transpose, - m, n, k, - static_cast(1), - grad_output.data(), lda, - weight.data(), ldb, - static_cast(0), - fgrad_input, ldc); + if (is_channels_last) { + const int64_t m = weight.size(1); + const int64_t n = grad_output.size(1) * grad_output.size(2); + const int64_t k = weight.size(0); + + const int64_t lda = m; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + weight.data(), lda, + grad_output.data(), ldb, + static_cast(0), + fgrad_input, ldc); + } else { + const int64_t m = grad_output.size(1) * grad_output.size(2); + const int64_t n = weight.size(1); + const int64_t k = weight.size(0); + + const int64_t lda = m; + const int64_t ldb = n; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + m, n, k, + static_cast(1), + grad_output.data(), lda, + weight.data(), ldb, + static_cast(0), + fgrad_input, ldc); + } unfolded2d_acc_stub( kCPU, @@ -291,7 +342,8 @@ void slow_conv2d_backward_update_grad_input_frame( grad_input.size(1), grad_input.size(2), grad_output.size(1), - grad_output.size(2)); + grad_output.size(2), + is_channels_last); } void slow_conv2d_backward_out_cpu_template( @@ -309,7 +361,10 @@ void slow_conv2d_backward_out_cpu_template( const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - const Tensor weight = view_weight_2d(weight_); + bool use_channels_last = thnn_conv_use_channels_last(input_, weight_); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + + const Tensor weight = view_weight_2d(weight_, memory_format); slow_conv2d_shape_check( input_, grad_output_, @@ -323,27 +378,21 @@ void slow_conv2d_backward_out_cpu_template( pad_width, false); - const Tensor input = input_.contiguous(); + const Tensor input = input_.contiguous(memory_format); // Compute shape of columnized data excluding batch dim. 
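The gemm calls in these frames all lean on the identity stated in the comments: a column-major (Fortran-order) gemm sees every row-major buffer as its transpose, and C = AB <=> T(C) = T(B)T(A), so swapping the operand order produces the row-major product without any copies. Below is a small standalone check of that identity against a naive column-major gemm; it is not cpublas::gemm, just a toy with illustrative names.

#include <cassert>
#include <vector>

// Naive column-major gemm: C(MxN) = A(MxK) * B(KxN), element (i,j) at i + j*ld.
void gemm_colmajor(int M, int N, int K,
                   const double* A, int lda,
                   const double* B, int ldb,
                   double* C, int ldc) {
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) {
      double acc = 0;
      for (int k = 0; k < K; ++k)
        acc += A[i + k * lda] * B[k + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  const int m = 2, k = 3, n = 4;
  // Row-major A (m x k) and B (k x n), element (i,j) at i*cols + j.
  std::vector<double> A = {1, 2, 3,
                           4, 5, 6};
  std::vector<double> B = {1, 0, 2, -1,
                           3, 1, 0,  2,
                          -2, 4, 1,  0};
  std::vector<double> C(m * n), ref(m * n);

  // Reference: direct row-major triple loop.
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        ref[i * n + j] += A[i * k + p] * B[p * n + j];

  // Row-major buffers reinterpreted as column-major are transposed, so compute
  // T(C) = T(B) * T(A) by swapping the operands; reading the result buffer back
  // as row-major gives exactly C = A * B.
  gemm_colmajor(/*M=*/n, /*N=*/m, /*K=*/k,
                B.data(), /*lda=*/n,
                A.data(), /*ldb=*/k,
                C.data(), /*ldc=*/n);

  for (int i = 0; i < m * n; ++i)
    assert(C[i] == ref[i]);
  return 0;
}
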
- const int64_t dim_planes = 1; - const int64_t dim_height = 2; - const int64_t dim_width = 3; - const int64_t n_input_plane = input.size(dim_planes); - const int64_t input_height = input.size(dim_height); - const int64_t input_width = input.size(dim_width); - const int64_t output_height = - (input_height + 2 * pad_height - kernel_height) / stride_height + 1; - const int64_t output_width = - (input_width + 2 * pad_width - kernel_width) / stride_width + 1; - const int64_t fgrad_input_size = - n_input_plane * kernel_height * kernel_width * output_height * output_width; - - const Tensor grad_output = grad_output_.contiguous(); - grad_input.resize_as_(input); - grad_input.zero_(); - TORCH_CHECK(grad_input.is_contiguous(), "slow_conv2d: grad_input must be contiguous"); const int64_t batch_size = input.size(0); + const int64_t n_input_plane = input.size(1); + const int64_t input_height = input.size(2); + const int64_t input_width = input.size(3); + const int64_t output_height = (input_height + 2 * pad_height - kernel_height) / stride_height + 1; + const int64_t output_width = (input_width + 2 * pad_width - kernel_width) / stride_width + 1; + const int64_t fgrad_input_size = n_input_plane * kernel_height * kernel_width * output_height * output_width; + + const Tensor grad_output = grad_output_.contiguous(memory_format); + grad_input.resize_as_(input, memory_format); + grad_input.zero_(); + TORCH_CHECK(grad_input.is_contiguous(memory_format), "slow_conv2d: grad_input must be contiguous"); AT_DISPATCH_FLOATING_TYPES_AND( kBFloat16, input.scalar_type(), "slow_conv2d_cpu_grad_input", [&] { @@ -366,7 +415,8 @@ void slow_conv2d_backward_out_cpu_template( stride_height, stride_width, pad_height, - pad_width); + pad_width, + use_channels_last); } }); }); @@ -376,27 +426,48 @@ template void slow_conv2d_backward_weight_frame( TensorAccessor grad_weight, TensorAccessor grad_output, - TensorAccessor finput) { + TensorAccessor finput, + bool is_channels_last) { // Compute grad_weight += grad_output.reshape({grad_output.shape(0), -1}) * finput.T // Note gemm expects fortran order, so all 3 matrices are transposed. 
// Swapping argument order cancels this, since C == AB <=> T(C) == T(B)T(A) - const int64_t m = finput.size(0); - const int64_t n = grad_output.size(0); - const int64_t k = grad_output.size(1) * grad_output.size(2); - - const int64_t lda = k; - const int64_t ldb = k; - const int64_t ldc = m; - - at::native::cpublas::gemm( - TransposeType::Transpose, - TransposeType::NoTranspose, - m, n, k, - static_cast(1), - finput.data(), lda, - grad_output.data(), ldb, - static_cast(1), - grad_weight.data(), ldc); + if (is_channels_last) { + const int64_t m = finput.size(1); + const int64_t n = grad_output.size(0); + const int64_t k = grad_output.size(1) * grad_output.size(2); + + const int64_t lda = m; + const int64_t ldb = n; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::NoTranspose, + TransposeType::Transpose, + m, n, k, + static_cast(1), + finput.data(), lda, + grad_output.data(), ldb, + static_cast(1), + grad_weight.data(), ldc); + } else { + const int64_t m = finput.size(0); + const int64_t n = grad_output.size(0); + const int64_t k = grad_output.size(1) * grad_output.size(2); + + const int64_t lda = k; + const int64_t ldb = k; + const int64_t ldc = m; + + at::native::cpublas::gemm( + TransposeType::Transpose, + TransposeType::NoTranspose, + m, n, k, + static_cast(1), + finput.data(), lda, + grad_output.data(), ldb, + static_cast(1), + grad_weight.data(), ldc); + } } static void slow_conv2d_backward_weight_out_cpu_template( @@ -406,9 +477,6 @@ static void slow_conv2d_backward_weight_out_cpu_template( IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding) { - CheckedFrom c = "slow_conv2d_backward_parameters_cpu"; - auto grad_weight_arg = TensorArg(grad_weight, "grad_weight_arg", 0); - const int64_t kernel_height = kernel_size[0]; const int64_t kernel_width = kernel_size[1]; const int64_t pad_height = padding[0]; @@ -416,9 +484,11 @@ static void slow_conv2d_backward_weight_out_cpu_template( const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - Tensor grad_weight_2d; - checkContiguous(c, grad_weight_arg); - grad_weight_2d = view_weight_2d(grad_weight); + bool use_channels_last = thnn_conv_use_channels_last(input, grad_weight); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + + TORCH_CHECK(grad_weight.is_contiguous(memory_format), "slow_conv2d: grad_weight must be contiguous"); + Tensor grad_weight_2d = view_weight_2d(grad_weight, memory_format); slow_conv2d_shape_check( input, @@ -433,7 +503,7 @@ static void slow_conv2d_backward_weight_out_cpu_template( pad_width, true); - auto grad_output = grad_output_.contiguous(); + auto grad_output = grad_output_.contiguous(memory_format); Tensor finput = compute_columns2d(input, padding, stride, kernel_size); const int64_t batch_size = input.size(0); @@ -449,7 +519,7 @@ static void slow_conv2d_backward_weight_out_cpu_template( auto finput_t = finput_a[t]; slow_conv2d_backward_weight_frame( - grad_weight_2d_a, grad_output_t, finput_t); + grad_weight_2d_a, grad_output_t, finput_t, use_channels_last); } }); } @@ -474,7 +544,10 @@ Tensor& slow_conv2d_forward_out_cpu( const int64_t stride_height = stride[0]; const int64_t stride_width = stride[1]; - const Tensor weight_2d = view_weight_2d(weight_); + bool use_channels_last = thnn_conv_use_channels_last(self, weight_); + auto memory_format = use_channels_last ? 
at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + + const Tensor weight_2d = view_weight_2d(weight_, memory_format); slow_conv2d_shape_check( self, @@ -489,28 +562,21 @@ Tensor& slow_conv2d_forward_out_cpu( pad_width, false); - const Tensor input = self.contiguous(); - const int64_t dim_planes = 1; - const int64_t dim_height = 2; - const int64_t dim_width = 3; - - const int64_t n_input_plane = input.size(dim_planes); - const int64_t input_height = input.size(dim_height); - const int64_t input_width = input.size(dim_width); - const int64_t n_output_plane = weight_2d.size(0); - const int64_t output_height = - (input_height + 2 * pad_height - kernel_height) / stride_height + 1; - const int64_t output_width = - (input_width + 2 * pad_width - kernel_width) / stride_width + 1; - + const Tensor input = self.contiguous(memory_format); const int64_t batch_size = input.size(0); + const int64_t n_input_plane = input.size(1); + const int64_t input_height = input.size(2); + const int64_t input_width = input.size(3); + const int64_t n_output_plane = weight_2d.size(0); + const int64_t output_height = (input_height + 2 * pad_height - kernel_height) / stride_height + 1; + const int64_t output_width = (input_width + 2 * pad_width - kernel_width) / stride_width + 1; Tensor finput = compute_columns2d(input, padding, stride, kernel_size); - output.resize_({batch_size, n_output_plane, output_height, output_width}); + output.resize_({batch_size, n_output_plane, output_height, output_width}, memory_format); if (bias.defined()) { output.copy_(bias.reshape({-1, 1, 1})); } - TORCH_CHECK(output.is_contiguous(), "slow_conv2d output tensor must be contiguous"); + TORCH_CHECK(output.is_contiguous(memory_format), "slow_conv2d output tensor must be contiguous"); AT_DISPATCH_ALL_TYPES_AND(kBFloat16, input.scalar_type(), "slow_conv2d_cpu", [&]{ auto input_a = input.accessor(); @@ -540,7 +606,8 @@ Tensor& slow_conv2d_forward_out_cpu( input_width, n_output_plane, output_height, - output_width); + output_width, + use_channels_last); } }); }); @@ -596,9 +663,8 @@ std::tuple slow_conv2d_backward_out_cpu( at::sum_out(grad_bias, grad_output, IntArrayRef{0, 2, 3}); } - if (grad_weight.defined()) { - grad_weight.resize_(weight.sizes()); + grad_weight.resize_(weight.sizes(), weight.suggest_memory_format()); grad_weight.zero_(); slow_conv2d_backward_weight_out_cpu_template( grad_weight, diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index caf2dfe7773f..46c0d48d8a7b 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -2,9 +2,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -52,7 +54,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { // The code below is implemented with the assumption that sizes are equal TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.sizes().equals(src.sizes())); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kHalf, kBool, kBFloat16, kComplexHalf, self.scalar_type(), "copy_", [&] { scalar_t* sp = src.data_ptr(); scalar_t* rp = self.data_ptr(); scalar_t* bp = buf.data_ptr(); @@ -97,7 +99,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { // (e.g. XLA) may be supported by overriding copy_ and _copy_from. 
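The channels-last handling added to compute_columns2d, view_weight_2d and the slow_conv2d frames above comes down to which dimension is innermost in memory: contiguous NCHW keeps spatial width innermost, while ChannelsLast (NHWC) keeps the channel innermost, which is why the columns buffer and the 2-D weight view swap their row/column roles. A minimal sketch of the two offset layouts, independent of ATen (the helper names are illustrative):

#include <cassert>
#include <cstdint>

// Flat offset of element (n, c, h, w) in a contiguous (NCHW) buffer.
int64_t offset_contiguous(int64_t n, int64_t c, int64_t h, int64_t w,
                          int64_t C, int64_t H, int64_t W) {
  return ((n * C + c) * H + h) * W + w;
}

// Flat offset of the same element in a channels-last (NHWC) buffer.
int64_t offset_channels_last(int64_t n, int64_t c, int64_t h, int64_t w,
                             int64_t C, int64_t H, int64_t W) {
  return ((n * H + h) * W + w) * C + c;
}

int main() {
  const int64_t C = 8, H = 5, W = 5;
  // Stepping along w is the unit stride for NCHW ...
  assert(offset_contiguous(0, 3, 2, 1, C, H, W) + 1 ==
         offset_contiguous(0, 3, 2, 2, C, H, W));
  // ... while stepping along c is the unit stride for channels-last, which is
  // what lets the 1x1 / channels-last paths treat each spatial position as one
  // contiguous run of C values.
  assert(offset_channels_last(0, 3, 2, 1, C, H, W) + 1 ==
         offset_channels_last(0, 4, 2, 1, C, H, W));
  return 0;
}
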
bool is_supported_device(Device device) { DeviceType device_type = device.type(); - return device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal; + return device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal || device_type == kMPS; } } // namespace @@ -184,7 +186,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) } if (self.is_quantized() && !src.is_quantized()) { - return quantized_copy_from_float_cpu_(self, src); + return quantized_copy_from_float_(self, src); } if (self.is_quantized() && src.is_quantized()) { @@ -210,6 +212,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) return at::metal::metal_copy_(self, src); } + auto iter = TensorIteratorConfig() .add_output(self) .add_input(src) @@ -227,6 +230,8 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) device_type = kCUDA; } else if (iter.device_type(1) == kHIP) { device_type = kHIP; + } else if (iter.device_type(1) == kMPS) { + device_type = kMPS; } // TODO: if we need to, we can also enable this path for quantized tensor @@ -235,6 +240,12 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) return self; } +#ifdef USE_MPS + if (self.device().type() == at::kMPS || src.device().type() == at::kMPS) { + return at::native::mps::mps_copy_(self, src, non_blocking); + } +#endif + if(!self.is_complex() && src.is_complex()) { TORCH_WARN_ONCE("Casting complex values to real discards the imaginary part"); } @@ -242,6 +253,24 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) return self; } +Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) { + // copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but: + // (1) It isn't exposed to the frontend (no python bindings) + // (2) It isn't exposed to the backend (it's a composite, that decomposes into to() and expand_as() calls. + // Note: This implementation doesn't currently preserve the strides of `self`. + // That might be fine for functorch (which already doesn't preserve strides in vmap), + // but it's worth looking into whether or not this implementation will be problematic for LazyTensor/XLA. + auto intermediate = src.to(self, non_blocking); + // Unfortunately, copy()'s decomposition involves view ops. + // To preserve the functionalization pass semantics of "maybe reapply views", + // we need to manually do that here. + if (at::functionalization::impl::getFunctionalizationReapplyViewsTLS()) { + return intermediate.expand(self.sizes()); + } else { + return at::expand_copy(intermediate, self.sizes()); + } +} + Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { auto maybe_outnames = namedinference::compute_broadcast_outnames(self, src); { @@ -258,7 +287,7 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { return self; } -void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) { +void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) { // Called when we are copying into an overlapping index `dst`, but we don't // care which writer wins. Hacky but it works. This is only used by // CUDA_tensor_apply2 in case that there are write overlaps. 
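Several of the rewritten ops further down (cosine_similarity and the tensor/tensor overloads of normal) size their result by broadcasting the two input shapes via at::infer_size. As a rough standalone sketch of that broadcasting rule (align trailing dimensions; a size of 1 stretches; anything else must match), not ATen's implementation:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Illustrative restatement of the usual broadcast rule used by infer_size.
std::vector<int64_t> broadcast_shapes(const std::vector<int64_t>& a,
                                      const std::vector<int64_t>& b) {
  const size_t ndim = std::max(a.size(), b.size());
  std::vector<int64_t> out(ndim);
  for (size_t i = 0; i < ndim; ++i) {
    // Align from the trailing dimension; missing leading dims act like 1.
    int64_t da = i < a.size() ? a[a.size() - 1 - i] : 1;
    int64_t db = i < b.size() ? b[b.size() - 1 - i] : 1;
    if (da != db && da != 1 && db != 1)
      throw std::invalid_argument("shapes are not broadcastable");
    out[ndim - 1 - i] = std::max(da, db);
  }
  return out;
}

int main() {
  // e.g. a (3, 1) mean with a (4,) std broadcasts to a (3, 4) result, which is
  // the shape the rewritten normal(Tensor, Tensor) overloads resize to.
  auto shape = broadcast_shapes({3, 1}, {4});
  assert((shape == std::vector<int64_t>{3, 4}));
  return 0;
}
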
diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 6f688a73e84c..14abb32fa5ad 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -6,6 +6,7 @@ namespace at { class Tensor; struct TensorIterator; +class TensorBase; namespace native { @@ -13,7 +14,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking); DECLARE_DISPATCH(copy_fn, copy_stub); -TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src); +TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Cross.h b/aten/src/ATen/native/Cross.h index 30001fc6b8a2..9daee7f2d6c4 100644 --- a/aten/src/ATen/native/Cross.h +++ b/aten/src/ATen/native/Cross.h @@ -1,9 +1,11 @@ #pragma once -#include #include -namespace at { namespace native { +namespace at { +class Tensor; + +namespace native { using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d); diff --git a/aten/src/ATen/native/DilatedConvolutionUtils.h b/aten/src/ATen/native/DilatedConvolutionUtils.h index 2d4815799b10..51b30a9bc77a 100644 --- a/aten/src/ATen/native/DilatedConvolutionUtils.h +++ b/aten/src/ATen/native/DilatedConvolutionUtils.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 02fb12928090..bd04b4df9a95 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -197,8 +197,8 @@ struct RegisterHIPDispatch { stub.set_cuda_dispatch_ptr(value); } }; -} // anonymous namespace +} // anonymous namespace // Compiler will complain if you put things like std::tuple in // the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., // adding parentheses and using helper struct to get rid of the parentheses, do diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 1bbb9cb8426a..8d23e10b1719 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -239,19 +239,72 @@ Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, c return result; } -Tensor cosine_similarity(const Tensor& x1, const Tensor& x2, int64_t dim, double eps) { - auto common_size = at::infer_size_dimvector(x1.sizes(), x2.sizes()); - auto commonDtype = at::result_type(x1, x2); +Tensor cosine_similarity(const Tensor& x1_, const Tensor& x2_, int64_t dim, double eps) { + /* + * cosine_similarity(x1, x2) = / (||x1|| * ||x2||) + * + * The current implementation is an improvement over the previous version. + * + * Previous implementation: + * 1. Compute num = , + * 2. Compute denom = ||x1|| * ||x2||, + * 3. Compute denom = max(denom, eps) to avoid division by zero, + * 4. Return num / denom. + * + * Previous implementation has the following issues: + * 1. Chance of losing precision in when ||x1|| and ||x2|| are large. + * 2. Chance of losing precision in ||x1|| * ||x2|| when ||x1|| and ||x2|| are large. + * 3. Losing precision may cause |cosing_similarity(x1, x2)| > 1.0. + * + * Current implementation: + * 1. Compute x1_normalized = x1 / max(||x1||, eps), + * x2_normalized = x2 / max(||x2||, eps), + * 2. Return . + * + * The current implementation improves over the previous one by: + * 1. Making sure that and ||x1|| * ||x2|| are not computed explicitly, + * hence avoiding floating point overflows. + * 2. 
Both methods might have issues with computing ||x1|| and ||x2||, but for + * the current method this is the only source of the floating point imprecision. + * 3. Makes sure |cosing_similarity(x1, x2)| <= 1.0. + * + */ + auto commonDtype = at::result_type(x1_, x2_); TORCH_CHECK(at::isFloatingType(commonDtype), "expected common dtype to be floating point, yet common dtype is ", commonDtype); - Tensor x1_ = x1.to(commonDtype).expand(common_size); - Tensor x2_ = x2.to(commonDtype).expand(common_size); - // Follow scipy impl to improve numerical precision - // Use x / sqrt(x * x) instead of x / (sqrt(x) * sqrt(x)) - Tensor w12 = at::sum(x1_ * x2_, dim); - Tensor w1 = at::sum(x1_ * x1_, dim); - Tensor w2 = at::sum(x2_ * x2_, dim); - Tensor n12 = (w1 * w2).clamp_min_(eps * eps).sqrt_(); - return w12.div_(n12); + + auto common_size = at::infer_size_dimvector(x1_.sizes(), x2_.sizes()); + auto x1 = x1_.to(commonDtype).expand(common_size); + auto x2 = x2_.to(commonDtype).expand(common_size); + + auto x1_squared_norm = at::pow(x1, 2).sum(dim, /*keepdim=*/true); + auto x2_squared_norm = at::pow(x2, 2).sum(dim, /*keepdim=*/true); + + { + at::NoGradGuard guard; + x1_squared_norm.clamp_min_(eps * eps); + x2_squared_norm.clamp_min_(eps * eps); + } + + auto x1_norm = x1_squared_norm.sqrt_(); + auto x2_norm = x2_squared_norm.sqrt_(); + + auto x1_normalized = x1.div(x1_norm); + auto x2_normalized = x2.div(x2_norm); + + Tensor cos_sim_value = at::sum(x1_normalized * x2_normalized, dim); + + // The code above is resistant to over +/-1 overshoots. + // However, if this happens and if it is critical, uncommenting + // the lines below will solve the issue. + // We keep these lines commented as to reduce the number of kernel + // launches for better runtime performance. + //{ + // at::NoGradGuard guard; + // cos_sim_value.clamp_min_(-1.0); + // cos_sim_value.clamp_max_(1.0); + //} + + return cos_sim_value; } }} // namespace at::native diff --git a/aten/src/ATen/native/Distance.h b/aten/src/ATen/native/Distance.h index f8ea4741207b..c2d881ae66f6 100644 --- a/aten/src/ATen/native/Distance.h +++ b/aten/src/ATen/native/Distance.h @@ -1,9 +1,11 @@ #pragma once -#include #include -namespace at { namespace native { +namespace at { +class Tensor; + +namespace native { using pdist_forward_fn = void(*)(Tensor&, const Tensor&, const double p); using pdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index c8a8a6ed8a50..907dffc6f736 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -157,50 +158,22 @@ at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional(), \ + "normal expects all elements of std >= 0.0"); \ + } while (0) + +#define CHECK_NORMAL_STD(std) \ + TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std); template class normal_kernel, typename RNG> Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional gen) { - TORCH_CHECK(std >= 0.0, "normal_ expects std >= 0.0, but found std=", std); + CHECK_NORMAL_STD(std); if (self.is_complex()) { auto float_tensor = at::view_as_real(self); // variance for normal distribution of the real and imaginary values @@ -214,6 +187,10 @@ Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional class normal_kernel, typename 
RNG> Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional gen) { + CHECK_NORMAL_STD(std); + auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous); + auto shape = at::infer_size(mean.sizes(), std_tensor.sizes()); + at::native::resize_output(output, shape); normal_impl_(output, 0, std, gen); output.add_(mean); return output; @@ -221,12 +198,11 @@ Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::opt template class normal_kernel, typename RNG> Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional gen) { - TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); - TORCH_CHECK( - std.min().ge(0).item(), - "normal expects all elements of std >= 0.0"); - normal_impl_(output, 0, 1, gen); + CHECK_NORMAL_TENSOR_STD(std); auto mean_tensor = at::full({}, mean, output.options()); + auto shape = at::infer_size(mean_tensor.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); // CUDA NB: addcmul_out copies the tensor to be added into the output. // Please look at aten/src/THC/generic/THCTensorMathPointwise.cu // The previous function here was addcmul_out(output, mean_tensor, output, std, 1); @@ -238,28 +214,22 @@ Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::opt template class normal_kernel, typename RNG> Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { - TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); - TORCH_CHECK( - std.numel() == 0 || std.min().ge(0).item(), - "normal expects all elements of std >= 0.0"); - bool is_deprecated_th_impl = resize_output_for_normal(output, mean, std); + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + at::native::resize_output(output, shape); normal_impl_(output, 0, 1, gen); // CUDA NB: addcmul_out copies the tensor to be added into the output. // Please look at aten/src/THC/generic/THCTensorMathPointwise.cu // The previous function here was addcmul_out(output, mean, output, std, 1); // The third argument is not a constant reference and hence the samples in output are overwritten. 
// Consequently, the computation performed is mean + mean * std instead of mean + output * std - if (is_deprecated_th_impl) { - output.mul_(std.reshape(mean.sizes())).add_(mean); - } - else { - output.mul_(std).add_(mean); - } + output.mul_(std).add_(mean); return output; } template class normal_kernel, typename RNG> Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) { + CHECK_NORMAL_STD(std); Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); return ret; @@ -267,6 +237,7 @@ Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) template class normal_kernel, typename RNG> Tensor normal_impl(double mean, const Tensor& std, c10::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); Tensor ret = at::empty_like(std, MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); return ret; @@ -274,7 +245,9 @@ Tensor normal_impl(double mean, const Tensor& std, c10::optional gen) template class normal_kernel, typename RNG> Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional gen) { - Tensor ret = at::empty({0}, mean.options(), MemoryFormat::Contiguous); + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous); normal_out_impl(ret, mean, std, gen); return ret; } diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index b4063af9931f..b23a18a8376a 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -257,39 +257,77 @@ struct NormalStub { } }; +template +struct NormalMeta { + // No-op! + void operator()(Tensor& self, double mean, double std, c10::optional gen) { + } +}; + +// inplace Tensor& normal_(Tensor& self, double mean, double std, c10::optional gen) { return at::native::templates::normal_impl_(self, mean, std, gen); } Tensor& normal_meta_(Tensor& self, double mean, double std, c10::optional gen) { - TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std); // TODO: dedupe - return self; + return at::native::templates::normal_impl_(self, mean, std, gen); } +// out tensor float Tensor& normal_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } +Tensor& normal_out_meta(const Tensor& mean, double std, c10::optional gen, Tensor& output) { + return at::native::templates::normal_out_impl(output, mean, std, gen); +} + +// out float tensor Tensor& normal_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } +Tensor& normal_out_meta(double mean, const Tensor& std, c10::optional gen, Tensor& output) { + return at::native::templates::normal_out_impl(output, mean, std, gen); + +} + +// out tensor tensor Tensor& normal_out(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { return at::native::templates::normal_out_impl(output, mean, std, gen); } +Tensor& normal_out_meta(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { + return at::native::templates::normal_out_impl(output, mean, std, gen); +} + +// functional tensor float Tensor normal(const Tensor& mean, double std, c10::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } +Tensor normal_meta(const Tensor& mean, double std, c10::optional gen) { + return at::native::templates::normal_impl(mean, 
std, gen); +} + +// functional float tensor Tensor normal(double mean, const Tensor& std, c10::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } +Tensor normal_meta(double mean, const Tensor& std, c10::optional gen) { + return at::native::templates::normal_impl(mean, std, gen); +} + +// functional tensor tensor Tensor normal(const Tensor& mean, const Tensor& std, c10::optional gen) { return at::native::templates::normal_impl(mean, std, gen); } +Tensor normal_meta(const Tensor& mean, const Tensor& std, c10::optional gen) { + return at::native::templates::normal_impl(mean, std, gen); +} + // ==================================================== Random ======================================================== template @@ -411,7 +449,7 @@ Tensor _s_poisson_cpu(const Tensor& lambda, c10::optional gen) { .add_output(ret) .add_input(lambda) .build(); - AT_DISPATCH_FLOATING_TYPES(ret.scalar_type(), "poisson_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, ret.scalar_type(), "poisson_cpu", [&] { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index ebfaf4631369..2c334157eba9 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,7 +1,5 @@ #pragma once -#include -#include #include #include #include diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index fb11bc8d8cbb..36e1b92ad1bd 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -99,11 +99,11 @@ native_dropout_cpu(const Tensor& input, double p, c10::optional train) { double p1m = 1. - p; // Check for probability of zero to avoid divide by zero and NaN results double scale = p1m == 0 ? 0. : 1. 
/ p1m; - mask = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + mask = at::empty_like(input, input.options().dtype(c10::CppTypeToScalarType::value)); mask.bernoulli_(p1m); output = input.mul(mask).mul_(scale); } else { - mask = at::ones_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + mask = at::ones_like(input, input.options().dtype(c10::CppTypeToScalarType::value)); output = input.clone(); } return std::make_tuple(output, mask); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index e6f88f556c82..6d8cea26f52e 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -10,6 +10,7 @@ #ifdef USE_FBGEMM #include +#include #else #include #endif @@ -60,14 +61,14 @@ std::pair promoteIndicesAndOffsets( // is only applicable if special conditions are met template bool is_fast_path_index_select(const Tensor& src, Tensor& output, index_t padding_idx) { - return src.scalar_type() == kFloat && src.strides()[1] == 1 && output.strides()[1] == 1 && padding_idx < static_cast(0); + return (src.scalar_type() == kFloat || src.scalar_type() == kHalf) && src.strides()[1] == 1 && output.strides()[1] == 1 && padding_idx < static_cast(0); } // Determines if we can use a fast implementation for index_select_scale_add, // which is only applicable if special conditions are met template bool is_fast_path_index_select_scale(const Tensor& src, const Tensor& scale, Tensor& output, index_t padding_idx) { - return src.scalar_type() == kFloat && src.strides()[1] == 1 && output.strides()[1] == 1 && scale.strides()[0] == 1 && padding_idx < static_cast(0); + return (src.scalar_type() == kFloat || src.scalar_type() == kHalf) && src.strides()[1] == 1 && output.strides()[1] == 1 && scale.strides()[0] == 1 && padding_idx < static_cast(0); } template @@ -81,7 +82,7 @@ bool is_fast_path(const Tensor& src, const c10::optional& scale, Tensor& // index_add (using add_indices as the index), without creating an intermediary // tensor to hold the selected embeddings template -typename std::enable_if::value, void>::type +typename std::enable_if::value && !std::is_same::value, void>::type index_select_add(const Tensor &select_indices, const Tensor &add_indices, const Tensor &src, @@ -89,19 +90,20 @@ index_select_add(const Tensor &select_indices, const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor &bag_size, - index_t padding_idx) { + index_t padding_idx, + _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { TORCH_CHECK(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.data_ptr(); auto* select_indices_data = select_indices.data_ptr(); auto* src_data = src.data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } auto numel = add_indices.numel(); - int64_t ddim = src.sizes()[1]; + int64_t ddim = src.size(1); auto vocab_size = src.size(0); auto src_stride0 = src.strides()[0]; auto src_stride1 = src.strides()[1]; @@ -157,6 +159,157 @@ void fbgemm_spmdm_report_error_( } } // namespace +template +typename std::enable_if::value, void>::type +index_select_add(const Tensor &select_indices, + const Tensor &add_indices, + const Tensor &src, + Tensor &output, + const Tensor& offsets, + bool include_last_offset, + Tensor &bag_size, + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); + auto* 
select_indices_data = select_indices.data_ptr(); + auto* output_data = output.data_ptr(); + + if (is_fast_path_index_select(src, output, padding_idx)) { + auto src_contig = src.contiguous(); + auto* src_data = src_contig.data_ptr(); + int64_t output_size = offsets.numel() - 1; + auto* offsets_data = offsets.data_ptr(); + std::vector offsets_include_last; + + if (include_last_offset) { + output_size = offsets.numel() - 1; + } else { + output_size = offsets.numel(); + offsets_include_last.resize(offsets.numel() + 1); + if (offsets.numel() > 0) { + std::memcpy( + offsets_include_last.data(), + offsets.data_ptr(), + sizeof(index_t) * offsets.numel()); + } + offsets_include_last[offsets.numel()] = select_indices.numel(); + offsets_data = offsets_include_last.data(); + } + +#ifdef USE_FBGEMM + using float16 = uint16_t; + auto kernel_fp16_index_t = fbgemm_kernel_cache ? + fbgemm_kernel_cache->getCallback(ddim) : + fbgemm::GenerateEmbeddingSpMDM( + /* block_size */ddim, + /* has_weight */false, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true + ); +#else + // Initialize the intermediate output buffer to be 0. + Tensor output_fp32 = at::zeros({output_size, ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); +#endif + at::parallel_for( + 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { +#ifdef USE_FBGEMM + bool success = kernel_fp16_index_t( + /* output_size */end_idx - start_idx, + /* index_size */offsets_data[end_idx] - offsets_data[start_idx], + /* data_size */src.size(0), + /* input */reinterpret_cast(src_data), + /* indices */select_indices_data + offsets_data[start_idx], + /* offsets_or_lengths */offsets_data + start_idx, + /* weights */nullptr, + /* output */reinterpret_cast(output_data + start_idx * ddim)); + if (!success) { + fbgemm_spmdm_report_error_( + end_idx - start_idx, + offsets_data[end_idx] - offsets_data[start_idx], + src.size(0), + offsets_data + start_idx, + select_indices_data + offsets_data[start_idx]); + } +#else + caffe2::EmbeddingLookupIdx( + /*block_size=*/ddim, + /*output_size=*/end_idx - start_idx, + /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], + /*data_size=*/src.size(0), + /*input=*/src_data, + /*indices=*/select_indices_data + offsets_data[start_idx], + /*offsets=*/offsets_data + start_idx, + /*weights=*/nullptr, + /*scale_bias=*/nullptr, + /*normalize_by_lengths=*/false, + /*out=*/output_data_fp32 + start_idx * ddim); + for (const auto i : c10::irange(output_size)) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + i * ddim)[d] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } +#endif + }); + + } else { + TORCH_CHECK(select_indices.numel() == add_indices.numel()); + auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.data_ptr(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + index_t* bag_size_data = nullptr; + if (bag_size.defined()) { + bag_size_data = bag_size.data_ptr(); + } + auto vocab_size = src.size(0); + auto src_stride0 = src.strides()[0]; + auto src_stride1 = src.strides()[1]; + auto output_stride0 = output.strides()[0]; + auto output_stride1 = output.strides()[1]; + auto numel = add_indices.numel(); + + Tensor src_fp32 = at::empty({ddim}, src.options().dtype(at::kFloat)); + auto* src_data_fp32 = src_fp32.data_ptr(); + + // Initialize the intermediate output buffer to be 0. 
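+      // Slow path for Half inputs: each selected row is staged in src_fp32,
+      // accumulated into a float32 scratch output with cpublas::axpy, and the
+      // result is cast back to Half once every index has been processed.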
+ Tensor output_fp32 = at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); + + for (const auto i : c10::irange(numel)) { + // We can skip indices equal to padding_idx so they are not included in + // the reduction + auto idx = select_indices_data[i]; + TORCH_CHECK( + idx >= 0 && idx < vocab_size, + "embedding_bag: Expected idx >= 0 && idx < num_embeddings but found idx to be ", + idx); + if (idx != padding_idx) { + // Copy src_data + src_stride0 * idx to src_data_fp32 + for (const auto d : c10::irange(ddim)) { + src_data_fp32[d] = static_cast((src_data + src_stride0 * idx)[d * src_stride1]); + } + at::native::cpublas::axpy(ddim, 1, + src_data_fp32, 1, + output_data_fp32 + ddim * add_indices_data[i], 1); + + } else if (bag_size.defined()) { + // Decrement bag_size to reflect that the index is padded + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + bag_size_data[add_indices_data[i]]--; + } + } + for (const auto i : c10::irange(output.size(0))) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + output_stride0 * i)[d * output_stride1] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } + } +} + template typename std::enable_if::value, void>::type index_select_add(const Tensor &select_indices, @@ -166,8 +319,9 @@ index_select_add(const Tensor &select_indices, const Tensor& offsets, bool include_last_offset, Tensor &bag_size, - index_t padding_idx) { - int64_t ddim = src.sizes()[1]; + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); auto* select_indices_data = select_indices.data_ptr(); auto* output_data = output.data_ptr(); @@ -195,6 +349,8 @@ index_select_add(const Tensor &select_indices, #ifdef USE_FBGEMM auto kernel_fp32_index_t = + fbgemm_kernel_cache ? 
+ fbgemm_kernel_cache->getCallback(ddim) : fbgemm::GenerateEmbeddingSpMDM( /* block_size */ddim, /* has_weight */false, @@ -210,7 +366,7 @@ index_select_add(const Tensor &select_indices, bool success = kernel_fp32_index_t( /* output_size */end_idx - start_idx, /* index_size */offsets_data[end_idx] - offsets_data[start_idx], - /* data_size */src.sizes()[0], + /* data_size */src.size(0), /* input */src_data, /* indices */select_indices_data + offsets_data[start_idx], /* offsets_or_lengths */offsets_data + start_idx, @@ -220,7 +376,7 @@ index_select_add(const Tensor &select_indices, fbgemm_spmdm_report_error_( end_idx - start_idx, offsets_data[end_idx] - offsets_data[start_idx], - src.sizes()[0], + src.size(0), offsets_data + start_idx, select_indices_data + offsets_data[start_idx]); } @@ -229,7 +385,7 @@ index_select_add(const Tensor &select_indices, /*block_size=*/ddim, /*output_size=*/end_idx - start_idx, /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], - /*data_size=*/src.sizes()[0], + /*data_size=*/src.size(0), /*input=*/src_data, /*indices=*/select_indices_data + offsets_data[start_idx], /*offsets=*/offsets_data + start_idx, @@ -244,7 +400,7 @@ index_select_add(const Tensor &select_indices, auto* src_data = src.data_ptr(); auto* add_indices_data = add_indices.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } @@ -284,7 +440,7 @@ index_select_add(const Tensor &select_indices, // mul (scaling by per_sample_weights) // index_add (using add_indices as the index) template -static typename std::enable_if::value, void>::type +static typename std::enable_if::value && !std::is_same::value, void>::type index_select_scale_add(const Tensor &select_indices, const Tensor &add_indices, const Tensor &scale, @@ -293,14 +449,15 @@ index_select_scale_add(const Tensor &select_indices, const Tensor& /*offsets*/, bool /*include_last_offset*/, Tensor &bag_size, - index_t padding_idx) { + index_t padding_idx, + _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { AT_ASSERT(select_indices.numel() == add_indices.numel()); auto* add_indices_data = add_indices.data_ptr(); auto* select_indices_data = select_indices.data_ptr(); auto* src_data = src.data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } @@ -338,6 +495,161 @@ index_select_scale_add(const Tensor &select_indices, } } +template +typename std::enable_if::value, void>::type +index_select_scale_add(const Tensor &select_indices, + const Tensor &add_indices, + const Tensor &scale, + const Tensor &src, + Tensor &output, + const Tensor& offsets, + bool include_last_offset, + Tensor &bag_size, + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); + auto* scale_data = scale.data_ptr(); + auto* select_indices_data = select_indices.data_ptr(); + auto* output_data = output.data_ptr(); + + if (is_fast_path_index_select_scale(src, scale, output, padding_idx)) { + auto src_contig = src.contiguous(); + auto* src_data = src_contig.data_ptr(); + int64_t output_size = offsets.numel() - 1; + auto* offsets_data = offsets.data_ptr(); + std::vector offsets_include_last; + + if (include_last_offset) { + output_size = offsets.numel() - 1; + } else { + output_size = offsets.numel(); + 
offsets_include_last.resize(offsets.numel() + 1); + std::memcpy( + offsets_include_last.data(), + offsets.data_ptr(), + sizeof(index_t) * offsets.numel()); + offsets_include_last[offsets.numel()] = select_indices.numel(); + offsets_data = offsets_include_last.data(); + } + + Tensor scale_fp32 = at::empty(scale.sizes(), scale.options().dtype(at::kFloat)); + auto* scale_data_fp32 = scale_fp32.data_ptr(); + +#ifdef USE_FBGEMM + using float16 = uint16_t; + fbgemm::Float16ToFloat_simd(reinterpret_cast(scale_data), scale_data_fp32, scale_fp32.numel()); + auto kernel_fp16_index_t = + fbgemm_kernel_cache ? + fbgemm_kernel_cache->getCallback(ddim) : + fbgemm::GenerateEmbeddingSpMDM( + /* block_size */ddim, + /* has_weight */true, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true + ); +#else + // Initialize the intermediate output buffer to be 0. + Tensor output_fp32 = at::zeros({output_size, ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); + for (const auto i : c10::irange(scale.numel())) { + scale_data_fp32[i] = static_cast(scale_data[i]); + } +#endif + at::parallel_for( + 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { +#ifdef USE_FBGEMM + bool success = kernel_fp16_index_t( + /* output_size */end_idx - start_idx, + /* index_size */offsets_data[end_idx] - offsets_data[start_idx], + /* data_size */src.size(0), + /* input */reinterpret_cast(src_data), + /* indices */select_indices_data + offsets_data[start_idx], + /* offsets_or_lengths */offsets_data + start_idx, + /* weights */scale_data_fp32 + offsets_data[start_idx], + /* output */reinterpret_cast(output_data + start_idx * ddim)); + if (!success) { + fbgemm_spmdm_report_error_( + end_idx - start_idx, + offsets_data[end_idx] - offsets_data[start_idx], + src.size(0), + offsets_data + start_idx, + select_indices_data + offsets_data[start_idx]); + } +#else + caffe2::EmbeddingLookupIdx( + /*block_size=*/ddim, + /*output_size=*/end_idx - start_idx, + /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], + /*data_size=*/src.size(0), + /*input=*/src_data, + /*indices=*/select_indices_data + offsets_data[start_idx], + /*offsets=*/offsets_data + start_idx, + /*weights=*/scale_data_fp32 + offsets_data[start_idx], + /*scale_bias=*/nullptr, + /*normalize_by_lengths=*/false, + /*out=*/output_data_fp32 + start_idx * ddim); + for (const auto i : c10::irange(output_size)) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + i * ddim)[d] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } +#endif + }); + } else { + AT_ASSERT(select_indices.numel() == add_indices.numel()); + auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.data_ptr(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + index_t* bag_size_data = nullptr; + if (bag_size.defined()) { + bag_size_data = bag_size.data_ptr(); + } + auto vocab_size = src.size(0); + auto src_stride0 = src.strides()[0]; + auto src_stride1 = src.strides()[1]; + auto output_stride0 = output.strides()[0]; + auto output_stride1 = output.strides()[1]; + auto scale_stride = scale.strides()[0]; + auto numel = add_indices.numel(); + + // Initialize the intermediate output buffer to be 0. 
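+      // As in index_select_add, the Half slow path accumulates in float32:
+      // rows are upcast on the fly, scaled by their per-sample weight, summed
+      // into a float32 scratch output, and cast back to Half at the end.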
+ Tensor output_fp32 = at::zeros({output.size(0), ddim}, output.options().dtype(at::kFloat)); + auto* output_data_fp32 = output_fp32.data_ptr(); + + for (const auto i : c10::irange(numel)) { + // We can skip indices equal to padding_idx so they are not included in + // the reduction + auto idx = select_indices_data[i]; + TORCH_CHECK( + idx >= 0 && idx < vocab_size, + "embedding_bag: Expected idx >= 0 && idx < num_embeddings but found idx to be ", + idx); + if (idx != padding_idx) { + + auto* src_base = src_data + src_stride0 * idx; + auto* output_base_fp32 = output_data_fp32 + ddim * add_indices_data[i]; + auto scale = scale_data[i * scale_stride]; + for (const auto j : c10::irange(ddim)) { + output_base_fp32[j] += static_cast(src_base[j * src_stride1]) * static_cast(scale); + } + } else if (bag_size.defined()) { + // Decrement bag_size to reflect that the index is padded + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + bag_size_data[add_indices_data[i]]--; + } + } + for (const auto i : c10::irange(output.size(0))) { + // Convert FP32 intermediate buffer result back to FP16 for output dtype + for (const auto d : c10::irange(ddim)) { + (output_data + output_stride0 * i)[d * output_stride1] = static_cast((output_data_fp32 + ddim * i)[d]); + } + } + } +} + template typename std::enable_if::value, void>::type index_select_scale_add(const Tensor &select_indices, @@ -348,8 +660,9 @@ index_select_scale_add(const Tensor &select_indices, const Tensor& offsets, bool include_last_offset, Tensor &bag_size, - index_t padding_idx) { - int64_t ddim = src.sizes()[1]; + index_t padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + int64_t ddim = src.size(1); auto* scale_data = scale.data_ptr(); auto* select_indices_data = select_indices.data_ptr(); auto* output_data = output.data_ptr(); @@ -376,6 +689,8 @@ index_select_scale_add(const Tensor &select_indices, #ifdef USE_FBGEMM auto kernel_fp32_index_t = + fbgemm_kernel_cache ? 
+ fbgemm_kernel_cache->getCallback(ddim) : fbgemm::GenerateEmbeddingSpMDM( /* block_size */ddim, /* has_weight */true, @@ -391,7 +706,7 @@ index_select_scale_add(const Tensor &select_indices, bool success = kernel_fp32_index_t( /* output_size */end_idx - start_idx, /* index_size */offsets_data[end_idx] - offsets_data[start_idx], - /* data_size */src.sizes()[0], + /* data_size */src.size(0), /* input */src_data, /* indices */select_indices_data + offsets_data[start_idx], /* offsets_or_lengths */offsets_data + start_idx, @@ -401,7 +716,7 @@ index_select_scale_add(const Tensor &select_indices, fbgemm_spmdm_report_error_( end_idx - start_idx, offsets_data[end_idx] - offsets_data[start_idx], - src.sizes()[0], + src.size(0), offsets_data + start_idx, select_indices_data + offsets_data[start_idx]); } @@ -410,7 +725,7 @@ index_select_scale_add(const Tensor &select_indices, /*block_size=*/ddim, /*output_size=*/end_idx - start_idx, /*index_size=*/offsets_data[end_idx] - offsets_data[start_idx], - /*data_size=*/src.sizes()[0], + /*data_size=*/src.size(0), /*input=*/src_data, /*indices=*/select_indices_data + offsets_data[start_idx], /*offsets=*/offsets_data + start_idx, @@ -425,7 +740,7 @@ index_select_scale_add(const Tensor &select_indices, auto* src_data = src.data_ptr(); auto* add_indices_data = add_indices.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - index_t* bag_size_data; + index_t* bag_size_data = nullptr; if (bag_size.defined()) { bag_size_data = bag_size.data_ptr(); } @@ -477,17 +792,17 @@ void check_arguments( checkScalarTypes("embedding_bag", offsets_arg, {kLong, kInt}); checkSameType("embedding_bag", indices_arg, offsets_arg); auto weight_arg = TensorArg(weight, "weight", 1); - checkScalarTypes("embedding_bag", weight_arg, {kFloat, kDouble}); + checkScalarTypes("embedding_bag", weight_arg, {kHalf, kFloat, kDouble}); AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "_embedding_bag_cpu_impl", [&]() { - if (offsets.sizes()[0] > 0) { + if (offsets.size(0) > 0) { index_t offset_0 = offsets.data_ptr()[0]; - index_t offset_n = offsets.data_ptr()[offsets.sizes()[0]-1]; + index_t offset_n = offsets.data_ptr()[offsets.size(0)-1]; TORCH_CHECK(offset_0 == 0, "offsets[0] has to be 0, i.e., the first sequence " "in the mini-batch has to start from position 0. " "However, got ", offsets[0]); - TORCH_CHECK(offset_n <= indices.sizes()[0], "offsets[-1] can not " - "be greater than input's length ", indices.sizes()[0], " but got offsets[-1] of ", + TORCH_CHECK(offset_n <= indices.size(0), "offsets[-1] can not " + "be greater than input's length ", indices.size(0), " but got offsets[-1] of ", offset_n); } }); @@ -504,7 +819,7 @@ void check_arguments( if (include_last_offset) { TORCH_CHECK( - offsets.sizes()[0] >= 1, + offsets.size(0) >= 1, "include_last_offset: number of offset should be at least 1"); } } @@ -517,16 +832,16 @@ void make_bag_size_out( const bool include_last_offset, const bool requires_grad) { if (requires_grad || mode == MODE_MEAN || mode == MODE_MAX) { - auto num_bags = offsets.sizes()[0] - (include_last_offset ? 1 : 0); + auto num_bags = offsets.size(0) - (include_last_offset ? 
1 : 0); at::native::resize_(bag_size_out, {num_bags}, c10::nullopt); // Compute this for MODE_MEAN and MODE_MAX (latter needed for backwards) if (num_bags != 1) { - bag_size_out.slice(0, 0, bag_size_out.sizes()[0] - 1, 1) = + bag_size_out.slice(0, 0, bag_size_out.size(0) - 1, 1) = offsets.slice(0, 1, num_bags, 1) - offsets.slice(0, 0, num_bags - 1, 1); } if (num_bags > 0) { - bag_size_out[-1] = indices.sizes()[0] - offsets[num_bags - 1]; + bag_size_out[-1] = indices.size(0) - offsets[num_bags - 1]; } } else { at::native::resize_(bag_size_out, offsets.sizes(), c10::nullopt); @@ -541,7 +856,7 @@ void make_max_indices_out( const Tensor& bag_size, const int64_t mode, bool include_last_offset) { - int64_t numBags = offsets.sizes()[0]; + int64_t numBags = offsets.size(0); if (mode == MODE_MAX) { if (include_last_offset) { TORCH_CHECK( @@ -569,13 +884,11 @@ void make_offset2bag_out( bool fast_path_sum = is_fast_path(weight, per_sample_weights, output, padding_idx); if (mode == MODE_MEAN || mode == MODE_MAX || !fast_path_sum) { - at::native::resize_(offset2bag, {indices.sizes()[0] + 1}, c10::nullopt); + at::native::resize_(offset2bag, {indices.size(0) + 1}, c10::nullopt); at::native::zero_(offset2bag); - } - if (mode == MODE_MEAN || mode == MODE_MAX || !fast_path_sum) { make_offset2bag(offsets, offset2bag); - at::native::resize_(offset2bag, {indices.sizes()[0]}, c10::nullopt); + at::native::resize_(offset2bag, {indices.size(0)}, c10::nullopt); // only initialize output in slow path at::native::zero_(output); } @@ -647,7 +960,7 @@ static Tensor apply_bag_size_backward( template void embedding_bag_cpu_max_out( - Tensor& max_indices, + Tensor* max_indices, const Tensor& weight, const Tensor& indices, const Tensor& offset2bag, @@ -662,8 +975,12 @@ void embedding_bag_cpu_max_out( auto* indices_data = indices.data_ptr(); auto* offset2bag_data = offset2bag.data_ptr(); - auto* max_indices_data = max_indices.data_ptr(); - auto max_indices_stride = max_indices.strides()[0]; + index_t* max_indices_data = nullptr; + int64_t max_indices_stride = 0; + if (max_indices) { + max_indices_data = max_indices->data_ptr(); + max_indices_stride = max_indices->strides()[0]; + } auto* weight_data = weight.data_ptr(); auto* output_data = output.data_ptr(); @@ -690,7 +1007,9 @@ void embedding_bag_cpu_max_out( if (is_first_for_bag || (weight_item > current_item)) { current_item = weight_item; - max_indices_data[max_indices_stride * bag + dim] = word_idx; + if (max_indices_data) { + max_indices_data[max_indices_stride * bag + dim] = word_idx; + } } } if (is_first_for_bag) { @@ -705,22 +1024,22 @@ void embedding_bag_cpu_max_out( } void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, - Tensor& bag_size, Tensor& max_indices, + Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode, const c10::optional& per_sample_weights, - bool include_last_offset, int64_t padding_idx) { + bool include_last_offset, int64_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { if (mode == MODE_MEAN || mode == MODE_SUM) { - AT_DISPATCH_FLOATING_TYPES(weight.scalar_type(), "embedding_bag_no_grad_cpu_out", - [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, &include_last_offset, &mode, &bag_size, &padding_idx]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, weight.scalar_type(), "embedding_bag_no_grad_cpu_out", + [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, 
&include_last_offset, &mode, &bag_size, &padding_idx, &fbgemm_kernel_cache]() { AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_no_grad_cpu_out", - [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, &include_last_offset, &mode, &bag_size, &padding_idx]() { + [&indices, &offset2bag, &per_sample_weights, &weight, &output, &offsets, &include_last_offset, &mode, &bag_size, &padding_idx, &fbgemm_kernel_cache]() { if (per_sample_weights.has_value() && per_sample_weights.value().defined()) { TORCH_INTERNAL_ASSERT(mode == MODE_SUM); index_select_scale_add( - indices, offset2bag, per_sample_weights.value(), weight, output, offsets, include_last_offset, bag_size, padding_idx); + indices, offset2bag, per_sample_weights.value(), weight, output, offsets, include_last_offset, bag_size, padding_idx, fbgemm_kernel_cache); } else { - index_select_add(indices, offset2bag, weight, output, offsets, include_last_offset, bag_size, padding_idx); + index_select_add(indices, offset2bag, weight, output, offsets, include_last_offset, bag_size, padding_idx, fbgemm_kernel_cache); } }); }); @@ -729,7 +1048,9 @@ void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, // make bag_size output deterministic at::native::zero_(bag_size); } - max_indices.copy_(bag_size); + if (max_indices) { + max_indices->copy_(bag_size); + } } else { // MODE_MAX AT_DISPATCH_FLOATING_TYPES_AND_HALF( weight.scalar_type(), "embedding_bag_cpu_max_out", [&]() { @@ -756,7 +1077,7 @@ std::tuple _embedding_bag_cpu_impl( check_arguments(weight, indices, offsets, mode, per_sample_weights, include_last_offset); Tensor output = at::empty( - {include_last_offset ? offsets.sizes()[0] - 1 : offsets.sizes()[0], + {include_last_offset ? offsets.size(0) - 1 : offsets.size(0), weight.sizes()[1]}, weight.options()); @@ -767,7 +1088,7 @@ std::tuple _embedding_bag_cpu_impl( Tensor max_indices = make_max_indices(weight, indices, offsets, bag_size, mode, include_last_offset); _embedding_bag_cpu_impl_out(output, offset2bag, - bag_size, max_indices, + bag_size, &max_indices, weight, indices, offsets, mode, per_sample_weights, include_last_offset, padding_idx); @@ -866,6 +1187,63 @@ _embedding_bag_cpu(const Tensor &weight, const Tensor &indices, /*requires_grad=*/true); } +void _embedding_bag_cpu_out( + at::Tensor& output, + at::Tensor& offset2bag, + at::Tensor& bag_size, + at::Tensor* p_max_indices, + const at::Tensor& weight, + const at::Tensor& indices, + const at::Tensor& offsets, + const bool /* scale_grad_by_freq */, + const int64_t mode, + const bool /* sparse */, + const c10::optional& per_sample_weights, + const bool include_last_offset, + const c10::optional& padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + at::native::check_arguments( + weight, indices, offsets, mode, per_sample_weights, include_last_offset); + + at::native::make_offset2bag_out( + offset2bag, + output, + weight, + indices, + offsets, + mode, + per_sample_weights, + padding_idx.value_or(-1)); + + at::native::make_bag_size_out( + bag_size, offsets, indices, mode, include_last_offset, false); + + if (p_max_indices) { + at::native::make_max_indices_out( + *p_max_indices, + weight, + indices, + offsets, + bag_size, + mode, + include_last_offset); + } + + at::native::_embedding_bag_cpu_impl_out( + output, + offset2bag, + bag_size, + p_max_indices, + weight, + indices, + offsets, + mode, + per_sample_weights, + include_last_offset, + padding_idx.value_or(-1), + fbgemm_kernel_cache); +} + // Assumes all input tensors are contiguous. 
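// If the forward ran the fast sum path, offset2bag is left empty; the backward
// rebuilds it from offsets (see below) before computing gradients.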
// See NOTE [ embedding_bag Native Functions ] in native_functions.yaml for details Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_, @@ -894,10 +1272,10 @@ Tensor _embedding_bag_backward(const Tensor &grad, const Tensor &indices_, Tensor offset2bag_; if (indices.numel() != 0 && offset2bag.numel() == 0) { offset2bag_ = at::zeros( - {indices.sizes()[0] + 1}, offsets.options()); // offset2bag = [0 0 0 0 0] + {indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0] make_offset2bag(offsets, offset2bag_); - offset2bag_.resize_({indices.sizes()[0]}); + offset2bag_.resize_({indices.size(0)}); } else { auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1); checkScalarTypes("embedding_bag", offset2bag_arg, {kLong, kInt}); @@ -1081,7 +1459,7 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi // for more details. auto grad = grad_.contiguous(); auto grad_arg = TensorArg(grad, "grad_", 1); - checkScalarTypes("embedding_bag", grad_arg, {kFloat, kDouble}); + checkScalarTypes("embedding_bag", grad_arg, {kHalf, kFloat, kDouble}); if (mode == MODE_MAX) { return _embedding_bag_dense_backward_cpu_max( @@ -1092,12 +1470,24 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi auto index_grad_weight = at::zeros({num_weights, grad.sizes()[1]}, grad.options()); - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "embedding_bag_backward", [&] { - _embedding_bag_dense_backward_cpu_sum_mean( - grad, indices_, offset2bag__, bag_size_, num_weights, - scale_grad_by_freq, mode, per_sample_weights_, index_grad_weight, - padding_idx); - }); + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + grad.scalar_type(), + "embedding_bag_backward", + [&] { + _embedding_bag_dense_backward_cpu_sum_mean( + grad, + indices_, + offset2bag__, + bag_size_, + num_weights, + scale_grad_by_freq, + mode, + per_sample_weights_, + index_grad_weight, + padding_idx); + }); return index_grad_weight; } @@ -1120,7 +1510,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( Tensor indices, offsets; std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); AT_ASSERT(indices.dim() == 1); - auto num_samples = indices.sizes()[0]; + auto num_samples = indices.size(0); AT_ASSERT(weight.dim() == 2); AT_ASSERT(weight.sizes()[1] == embedding_features); @@ -1134,11 +1524,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( Tensor offset2bag_; if (indices.numel() != 0 && offset2bag.numel() == 0) { offset2bag_ = at::zeros( - {indices.sizes()[0] + 1}, offset2bag.options()); // offset2bag = [0 0 0 0 0] + {indices.size(0) + 1}, offset2bag.options()); // offset2bag = [0 0 0 0 0] make_offset2bag(offsets, offset2bag_); - at::native::resize_(offset2bag_, {indices.sizes()[0]}, c10::nullopt); + at::native::resize_(offset2bag_, {indices.size(0)}, c10::nullopt); } else { auto offset2bag_arg = TensorArg(offset2bag, "offset2bag", 1); checkScalarTypes("embedding_bag", offset2bag_arg, {kLong, kInt}); @@ -1194,12 +1584,16 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu( const Tensor& offset2bag, int64_t mode, int64_t padding_idx) { - return AT_DISPATCH_FLOATING_TYPES( - grad.scalar_type(), "_embedding_bag_per_sample_weights_backward_cpu", [&]() { - return _embedding_bag_per_sample_weights_backward_cpu_template( - grad, weight, indices, offsets, offset2bag, mode, padding_idx); - } - ); + return AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + 
grad.scalar_type(), + "_embedding_bag_per_sample_weights_backward_cpu", + [&]() { + return _embedding_bag_per_sample_weights_backward_cpu_template< + scalar_t>( + grad, weight, indices, offsets, offset2bag, mode, padding_idx); + }); } Tensor _embedding_bag_sparse_backward( @@ -1229,6 +1623,5 @@ Tensor _embedding_bag_sparse_backward( return native::embedding_backward(index_grad, indices, num_weights, padding_idx, scale_grad_by_freq, true); } - } } // namespace at::native diff --git a/aten/src/ATen/native/EmbeddingBag.h b/aten/src/ATen/native/EmbeddingBag.h index e0ce5f01b384..6600c661d46a 100644 --- a/aten/src/ATen/native/EmbeddingBag.h +++ b/aten/src/ATen/native/EmbeddingBag.h @@ -1,4 +1,9 @@ #include +#include + +#ifdef USE_FBGEMM +#include +#endif namespace at { namespace native { @@ -38,12 +43,98 @@ void make_offset2bag_out( const c10::optional& per_sample_weights, const int64_t padding_idx = -1); +#ifdef USE_FBGEMM + +template +struct _CallbackAndBlockSize { + using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature::Type; + + int64_t blockSize = -1; + TCallback callback = nullptr; + + static TCallback generateCallback(int64_t block_size) { + return fbgemm::GenerateEmbeddingSpMDM( + block_size, + has_weight, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true); + } + + _CallbackAndBlockSize() = default; + + explicit _CallbackAndBlockSize(c10::optional maybe_block_size) + : blockSize(maybe_block_size.value_or(-1)) + , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr) + {} +}; + +template +struct _EmbeddingBagKernelCacheImpl : private StorageMixins... { + + _EmbeddingBagKernelCacheImpl() = default; + // use each of the mixins to store corresponding kernel and block size + explicit _EmbeddingBagKernelCacheImpl(c10::optional maybe_block_size) + : StorageMixins(maybe_block_size)... 
+ {} + + // this method is thread safe (call sites may call from different threads) + template + typename _CallbackAndBlockSize::TCallback + getCallback(int64_t block_size) const { + // if the cache doesn't store the kernel for the incoming block size + // (so it is different from the one stored in corresponding mixin) + // regenerate the kernel (not writing it into the cache so we avoid locks) + if (block_size != _CallbackAndBlockSize::blockSize) { + return _CallbackAndBlockSize::generateCallback(block_size); + } + // else retrieve the cached kernel from the corresponding mixin + return _CallbackAndBlockSize::callback; + } +}; + +// instantiate the cache with the list of storage mixins +// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file +using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl< + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize>; +#else +struct _EmbeddingBagKernelCache { + explicit _EmbeddingBagKernelCache(c10::optional /* maybe_block_size */) {} +}; +#endif + void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, - Tensor& bag_size, Tensor& max_indices, + Tensor& bag_size, Tensor* max_indices, const Tensor &weight, const Tensor &indices, const Tensor &offsets, const int64_t mode = 0, const c10::optional& per_sample_weights = c10::nullopt, bool include_last_offset = false, - int64_t padding_idx = -1); + int64_t padding_idx = -1, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +void _embedding_bag_cpu_out( + at::Tensor& output, + at::Tensor& offset2bag, + at::Tensor& bag_size, + at::Tensor* p_max_indices, + const at::Tensor& weight, + const at::Tensor& indices, + const at::Tensor& offsets, + const bool scale_grad_by_freq, + const int64_t mode, + const bool sparse, + const c10::optional& per_sample_weights, + const bool include_last_offset, + const c10::optional& padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + } // native } // at diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index ca48deab83ae..63fc611961cc 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -61,6 +61,14 @@ Tensor& fill_meta_(Tensor& self, const Tensor& value) { return self; } +Tensor fill(const Tensor& self, const Scalar& value) { + return at::empty_like(self).fill_(value); +} + +Tensor fill(const Tensor& self, const Tensor& value) { + return at::empty_like(self).fill_(value); +} + DEFINE_DISPATCH(fill_stub); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fill_diagonal ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/aten/src/ATen/native/Fill.h b/aten/src/ATen/native/Fill.h index e1903a379a0c..f6de9580ae7c 100644 --- a/aten/src/ATen/native/Fill.h +++ b/aten/src/ATen/native/Fill.h @@ -2,13 +2,19 @@ #pragma once -#include #include -#include -namespace at { namespace native { +namespace c10 { +class Scalar; +} -DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&), fill_stub); +namespace at { +class Tensor; +struct TensorIterator; + +namespace native { + +DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub); Tensor& fill_out(Tensor& self, const Scalar& value); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 8855fd313a56..033052f401f6 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -126,19 +126,11 @@ bool 
check_fast_path_restrictions( bool can_use_fast_route(ArrayRef tensorLists, ArrayRef scalarList = {}, bool does_op_promote_integer_inputs_to_float = false) { -#if defined(USE_ROCM) - return false; -#else return check_fast_path_restrictions(tensorLists, scalarList, does_op_promote_integer_inputs_to_float); -#endif } bool can_use_fast_route(TensorList tensors1, TensorList tensors2, bool does_op_promote_integer_inputs_to_float = false) { -#if defined(USE_ROCM) - return false; -#else return can_use_fast_route({tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float); -#endif } } diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index b4ea2ec186f2..bb25be4a02e5 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -134,8 +134,9 @@ static std::vector fractional_max_pool2d_generate_intervals( static_cast((i + sample) * alpha) - static_cast(sample * alpha); } } - sequence[outputSize - 1] = inputSize - poolSize; - + if (outputSize > 0) { + sequence[outputSize - 1] = inputSize - poolSize; + } return sequence; } diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 757ce7c05691..8bcb53847271 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -106,8 +106,9 @@ static std::vector generate_intervals( static_cast((i + sample) * alpha) - static_cast(sample * alpha); } } - sequence[outputSize - 1] = inputSize - poolSize; - + if (outputSize > 0) { + sequence[outputSize - 1] = inputSize - poolSize; + } return sequence; } @@ -238,7 +239,6 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( int64_t inputW, const at::Tensor& output, const at::Tensor& indices) { - /* get contiguous input */ auto input = input_.contiguous(); diff --git a/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp b/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp index 154f1bf43be6..d31789051104 100644 --- a/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp +++ b/aten/src/ATen/native/FunctionOfAMatrixUtils.cpp @@ -1,5 +1,17 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { DEFINE_DISPATCH(_compute_linear_combination_stub); diff --git a/aten/src/ATen/native/FunctionOfAMatrixUtils.h b/aten/src/ATen/native/FunctionOfAMatrixUtils.h index 330efa0923f9..68b26ed13811 100644 --- a/aten/src/ATen/native/FunctionOfAMatrixUtils.h +++ b/aten/src/ATen/native/FunctionOfAMatrixUtils.h @@ -1,11 +1,12 @@ #pragma once -#include -#include #include -#include +#include -namespace at { namespace native { +namespace at { +struct TensorIterator; + +namespace native { using _compute_linear_combination_fn = void(*)( TensorIterator& iter, diff --git a/aten/src/ATen/native/GatedLinearUnit.cpp b/aten/src/ATen/native/GatedLinearUnit.cpp index c585caa71a01..b7b20e1c32f1 100644 --- a/aten/src/ATen/native/GatedLinearUnit.cpp +++ b/aten/src/ATen/native/GatedLinearUnit.cpp @@ -30,6 +30,8 @@ namespace native { DEFINE_DISPATCH(glu_stub); // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(glu_backward_stub); +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(glu_jvp_stub); TORCH_IMPL_FUNC(glu_out) (const Tensor& self, int64_t dim, const Tensor& out) { glu_stub(device_type(), *this); @@ -69,5 +71,72 @@ Tensor 
glu_backward_cpu(const Tensor& grad_output, const Tensor& input, int64_t return glu_backward_cpu_out(grad_output, input, dim, grad_input); } +Tensor glu_jvp( + const Tensor& glu, + const Tensor& x, + const Tensor& dx, + int64_t dim +) { + dim = maybe_wrap_dim(dim, x.dim()); + const auto glu_size = glu.size(dim); + const auto b = x.narrow(dim, glu_size, glu_size); + const auto da = dx.narrow(dim, 0, glu_size); + const auto db = dx.narrow(dim, glu_size, glu_size); + auto dglu = at::empty_like(glu); + auto iter = at::TensorIteratorConfig() + .add_output(dglu) + .add_input(glu) + .add_input(b) + .add_input(da) + .add_input(db) + .build(); + glu_jvp_stub(iter.device_type(), iter); + return dglu; +} + +Tensor glu_backward_jvp( + const Tensor& grad_x, + const Tensor& grad_glu, + const Tensor& x, + const Tensor& dgrad_glu, + const Tensor& dx, + int64_t dim +) { + dim = maybe_wrap_dim(dim, x.dim()); + const auto glu_size = grad_glu.size(dim); + const auto a = x.narrow(dim, 0, glu_size); + const auto b = x.narrow(dim, glu_size, glu_size); + const auto da = dx.narrow(dim, 0, glu_size); + const auto db = dx.narrow(dim, glu_size, glu_size); + // grad_x_a = grad_glu * sigmoid(b) + const auto grad_x_a = grad_x.narrow(dim, 0, glu_size); + // grad_x_b = grad_x_a * a * (1 - sigmoid(b)) + const auto grad_x_b = grad_x.narrow(dim, glu_size, glu_size); + + const auto sig_b = at::sigmoid(b); + // TODO: use glu from forward. + // TODO: fuse kernels. + const auto glu = a * sig_b; + const auto db_neg_sig_b = db - db * sig_b; + + // dgrad_x_a = d(grad_glu * sigmoid(b)) + // = dgrad_glu * sigmoid(b) + grad_glu * sigmoid(b) * (1 - sigmoid(b)) * db + // = dgrad_glu * sig_b + grad_x_a * (db - db * sig_b) + // = dgrad_glu * sig_b + grad_x_a * db_neg_sig_b + const auto dgrad_x_a = dgrad_glu * sig_b + grad_x_a * db_neg_sig_b; + + // dgrad_x_b = d(grad_glu * sigmoid(b) * a * (1 - sigmoid(b)) + // = d(grad_glu * sigmoid(b)) * a * (1 - sigmoid(b)) + // + grad_glu * sigmoid(b) * da * (1 - sigmoid(b)) + // - grad_glu * sigmoid(b) * a * sigmoid(b) * (1 - sigmoid(b)) * db + // = dgrad_x_a * a * (1 - sigmoid(b)) + // + (grad_glu * sigmoid(b)) * (da * (1 - sigmoid(b)) - a * sigmoid(b) * (1 - sigmoid(b)) * db) + // = dgrad_x_a * (a - glu) + grad_x_a * (da - da * sig_b - glu * db_neg_sig_b + const auto dgrad_x_b = dgrad_x_a * (a - glu) + grad_x_a * (da - da * sig_b - glu * db_neg_sig_b); + + return at::cat({dgrad_x_a, dgrad_x_b}, dim); +} + + } // at::native } // at diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 99b3d933bd89..8b0440610226 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -23,6 +24,12 @@ namespace { GridSamplerInterpolation interpolation_mode, GridSamplerPadding padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d( + input, grid, static_cast(interpolation_mode)); + int64_t N = input.size(0); int64_t C = input.size(1); int64_t inp_D = input.size(2); @@ -178,8 +185,21 @@ namespace { const Tensor& input, const Tensor& grid, GridSamplerInterpolation interpolation_mode, GridSamplerPadding padding_mode, - bool align_corners) { - auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. 
+ // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d( + input, grid, static_cast(interpolation_mode)); + + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // If interpolation mode is Nearest, then grad_grid is not filled in the // loop below. @@ -209,17 +229,27 @@ namespace { int64_t gOut_sD = grad_output.stride(2); int64_t gOut_sH = grad_output.stride(3); int64_t gOut_sW = grad_output.stride(4); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sD = grad_input.stride(2); - int64_t gInp_sH = grad_input.stride(3); - int64_t gInp_sW = grad_input.stride(4); + int64_t gInp_sN = 0; + int64_t gInp_sC = 0; + int64_t gInp_sD = 0; + int64_t gInp_sH = 0; + int64_t gInp_sW = 0; + if (input_requires_grad) { + gInp_sN = grad_input.stride(0); + gInp_sC = grad_input.stride(1); + gInp_sD = grad_input.stride(2); + gInp_sH = grad_input.stride(3); + gInp_sW = grad_input.stride(4); + } int64_t gGrid_sN = grad_grid.stride(0); int64_t gGrid_sW = grad_grid.stride(3); scalar_t *inp_ptr = input.data_ptr(); scalar_t *grid_ptr = grid.data_ptr(); scalar_t *gOut_ptr = grad_output.data_ptr(); - scalar_t *gInp_ptr = grad_input.data_ptr(); + scalar_t *gInp_ptr = nullptr; + if (input_requires_grad) { + gInp_ptr = grad_input.data_ptr(); + } scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { @@ -290,22 +320,23 @@ namespace { scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; scalar_t *inp_ptr_NC = inp_ptr_N; + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; // calculate bilinear weighted pixel value and set output pixel for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCDHW; // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - + if (input_requires_grad) { + safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); + safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * 
gOut); + safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); + safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); + safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); + safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); + } // calculate grad_grid if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW]; @@ -368,11 +399,13 @@ namespace { // assign nearest neighor pixel value to output pixel scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest, - gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW); + if (input_requires_grad) { + scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; + for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { + // calculate and set grad_input + safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest, + gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW); + } } } } @@ -391,6 +424,11 @@ Tensor _grid_sampler_2d_cpu_quantized( int64_t interpolation_mode_, int64_t padding_mode_, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + auto interpolation_mode = static_cast(interpolation_mode_); /* Bilinear interpolation is supported using the fact that we can perform @@ -495,6 +533,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, int64_t interpolation_mode_, int64_t padding_mode_, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + auto interpolation_mode = static_cast(interpolation_mode_); auto padding_mode = static_cast(padding_mode_); using scalar_t = float; @@ -643,6 +686,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, int64_t interpolation_mode_, int64_t padding_mode_, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + const auto interpolation_mode = static_cast(interpolation_mode_); const auto padding_mode = static_cast(padding_mode_); using scalar_t = float; @@ -836,10 +884,14 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, return std::make_tuple(grad_input, grad_grid); } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
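+  // The same pair of checks is also added to the quantized and fallback 2d
+  // paths above, so each entry point validates its inputs even when it is
+  // called directly rather than through at::grid_sampler.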
+ check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + if (input.scalar_type() == kQUInt8) { return native::_grid_sampler_2d_cpu_quantized( input, grid, interpolation_mode, padding_mode, align_corners); @@ -864,17 +916,26 @@ Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, } } - return grid_sampler_2d_cpu_kernel( - kCPU, input, grid, interpolation_mode, padding_mode, align_corners); + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); + grid_sampler_2d_cpu_kernel( + kCPU, output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; } DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel); -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler3d_cpu", [&] { return grid_sampler_3d_cpu_impl( input, grid, static_cast(interpolation_mode), @@ -882,11 +943,14 @@ Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, }); } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. std::tuple grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); // AVX gather instructions use signed 32-bit offsets to gather float values. // Check for possible overflow and fallback to scalar implementation @@ -911,80 +975,64 @@ grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, con } } - return grid_sampler_2d_backward_cpu_kernel( - kCPU, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask); + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + grid_sampler_2d_backward_cpu_kernel( + kCPU, grad_input, grad_grid, grad_output, input, grid, + interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(std::move(grad_input), std::move(grad_grid)); } DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel); -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. std::tuple grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + int64_t interpolation_mode, int64_t padding_mode, bool align_corners, + std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
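+  // output_mask[0] says whether grad_input is needed; when it is false,
+  // grid_sampler_3d_backward_cpu_impl returns an undefined grad_input and
+  // skips the safe_add_3d accumulation for it.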
+ check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_3d_backward_cpu", [&] { return grid_sampler_3d_backward_cpu_impl( grad_output, input, grid, static_cast(interpolation_mode), - static_cast(padding_mode), align_corners); + static_cast(padding_mode), + align_corners, output_mask); }); } -Tensor grid_sampler(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - TORCH_CHECK( - input.defined() && grid.defined(), - "grid_sampler(): expected input and grid to not be undefined, but input " - "is ", input, " and grid is ", grid); - auto input_opt = input.options(); - auto grid_opt = grid.options(); - TORCH_CHECK( - input_opt.device() == grid_opt.device(), - "grid_sampler(): expected input and grid to be on same device, but input " - "is on ", input_opt.device(), " and grid is on ", grid_opt.device()); - TORCH_CHECK( - input_opt.layout() == kStrided && grid_opt.layout() == kStrided, - "grid_sampler(): expected input and grid to have torch.strided layout, but " - "input has ", input_opt.layout(), " and grid has ", grid_opt.layout()); - TORCH_CHECK( - (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), - "grid_sampler(): expected 4D or 5D input and grid with same number of " - "dimensions, but got input with sizes ", input.sizes(), - " and grid with sizes ", grid.sizes()); - TORCH_CHECK( - input.size(0) == grid.size(0), - "grid_sampler(): expected grid and input to have same batch size, but got " - "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - TORCH_CHECK( - grid.size(-1) == input.dim() - 2, - "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " - "dimension, but got grid with sizes ", grid.sizes()); - TORCH_CHECK( - !(input.dim() == 5 && static_cast(interpolation_mode) == GridSamplerInterpolation::Bicubic), - "grid_sampler(): bicubic interpolation only supports 4D input" - ); - for (const auto i : c10::irange(2, input.dim())) { - TORCH_CHECK(input.size(i) > 0, - "grid_sampler(): expected input to have non-empty spatial dimensions, " - "but input has sizes ", input.sizes(), " with dimension ", i, " being " - "empty"); - } - // cudnn does not support inputs larger than 1024 - if (at::native::cudnn_is_acceptable(input) && - at::native::cudnn_is_acceptable(grid) && - at::native::canUse32BitIndexMath(input) && - at::native::canUse32BitIndexMath(grid) && - static_cast(interpolation_mode) == GridSamplerInterpolation::Bilinear && - static_cast(padding_mode) == GridSamplerPadding::Zeros && - align_corners && - input.dim() == 4 && - input.size(1) <= 1024) { +// See NOTE [ grid_sampler Native Functions ]. 
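+// The shape/device validation that used to be inlined here now runs inside the
+// 2d/3d kernels via the check_grid_sampler_* helpers, so this entry point only
+// chooses between the cudnn, 2d and 3d implementations. Illustrative usage:
+//   auto input = at::rand({1, 3, 8, 8});
+//   auto grid  = at::rand({1, 4, 4, 2}) * 2 - 1;  // sampling locations in [-1, 1)
+//   auto out = at::grid_sampler(input, grid, /*interpolation_mode=*/0,
+//                               /*padding_mode=*/0, /*align_corners=*/false);
+//   // out has shape {1, 3, 4, 4}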
+Tensor grid_sampler( + const Tensor& input, + const Tensor& grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners +) { + if (cond_cudnn_grid_sampler(input, grid) && + static_cast(interpolation_mode) == + GridSamplerInterpolation::Bilinear && + static_cast(padding_mode) == + GridSamplerPadding::Zeros && + align_corners) { return cudnn_grid_sampler(input, grid); } + if (input.dim() == 4) { - return at::grid_sampler_2d(input, grid, interpolation_mode, padding_mode, align_corners); + return at::grid_sampler_2d( + input, grid, interpolation_mode, padding_mode, align_corners); } else { - return at::grid_sampler_3d(input, grid, interpolation_mode, padding_mode, align_corners); + return at::grid_sampler_3d( + input, grid, interpolation_mode, padding_mode, align_corners); } } diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index effc322c0d3a..f4a735032430 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -1,16 +1,13 @@ #pragma once -#include -#include +#include +#include +#include +#include -namespace at { namespace native { - -namespace detail { +#include - enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -} // namespace detail +namespace at { namespace native { using detail::GridSamplerInterpolation; using detail::GridSamplerPadding; diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h new file mode 100644 index 000000000000..0b6f29de8c42 --- /dev/null +++ b/aten/src/ATen/native/GridSamplerUtils.h @@ -0,0 +1,109 @@ +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +// https://github.com/pytorch/pytorch/pull/66979 +#include +#include +#include + +namespace at { namespace native { + +namespace detail { + +enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; +enum class GridSamplerPadding {Zeros, Border, Reflection}; + +} // namespace detail + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +namespace { + +// See NOTE [ grid_sampler Native Functions ]. 
+void check_grid_sampler_common(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  auto input_opt = input.options();
+  auto grid_opt = grid.options();
+
+  TORCH_CHECK(
+    input.defined(),
+    "grid_sampler(): expected input to not be undefined");
+  TORCH_CHECK(
+    grid.defined(),
+    "grid_sampler(): expected grid to not be undefined");
+  TORCH_CHECK(
+    input_opt.device() == grid_opt.device(),
+    "grid_sampler(): expected input and grid to be on same device, but input "
+    "is on ", input_opt.device(), " and grid is on ", grid_opt.device());
+  TORCH_CHECK(
+    input_opt.layout() == kStrided && grid_opt.layout() == kStrided,
+    "grid_sampler(): expected input and grid to have torch.strided layout, but "
+    "input has ", input_opt.layout(), " and grid has ", grid_opt.layout());
+  TORCH_CHECK(
+    input.size(0) == grid.size(0),
+    "grid_sampler(): expected grid and input to have same batch size, but got "
+    "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    grid.size(-1) == input.dim() - 2,
+    "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last "
+    "dimension, but got grid with sizes ", grid.sizes());
+
+  for (const auto i : c10::irange(2, input.dim())) {
+    TORCH_CHECK(input.size(i) > 0,
+      "grid_sampler(): expected input to have non-empty spatial dimensions, "
+      "but input has sizes ", input.sizes(), " with dimension ", i, " being "
+      "empty");
+  }
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+void check_grid_sampler_2d(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  TORCH_CHECK(
+    input.dim() == 4 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 4D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+void check_grid_sampler_3d(
+  const TensorBase& input,
+  const TensorBase& grid,
+  int64_t interpolation_mode
+) {
+  TORCH_CHECK(
+    input.dim() == 5 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 5D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    !(input.dim() == 5 &&
+      static_cast<GridSamplerInterpolation>(interpolation_mode) ==
+        GridSamplerInterpolation::Bicubic),
+    "grid_sampler(): bicubic interpolation only supports 4D input");
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+// cudnn does not support inputs larger than 1024.
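// Illustrative sketch, not part of this patch: at::grid_sampler only takes the
// cuDNN path when the predicate below holds and, in addition, interpolation is
// bilinear, padding is zeros and align_corners is true (see grid_sampler above).
// Assuming a CUDA build, an eligible call could look like:
//   auto input = at::rand({2, 512, 16, 16}, at::kCUDA);        // 4D, channels <= 1024
//   auto grid  = at::rand({2, 16, 16, 2}, at::kCUDA) * 2 - 1;  // N, H_out, W_out, 2
//   auto out   = at::grid_sampler(input, grid, /*interpolation_mode=*/0,
//                                 /*padding_mode=*/0, /*align_corners=*/true);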
+bool cond_cudnn_grid_sampler( + const TensorBase& input, + const TensorBase& grid +) { + return ( + at::native::cudnn_is_acceptable(input) && + at::native::cudnn_is_acceptable(grid) && + at::native::canUse32BitIndexMath(input) && + at::native::canUse32BitIndexMath(grid) && + input.dim() == 4 && + input.size(1) <= 1024); +} + +} // anonymous namespace + +}} // namespace at::native diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index abd1ae32ded1..c3a007f2c2dc 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -407,4 +407,28 @@ Tensor histogram_histc_cpu(const Tensor& self, int64_t bin_ct, return histogram_histc_cpu_out(self, bin_ct, min, max, hist); } +std::tuple> histogramdd( + const Tensor &self, TensorList bins, c10::optional> /*range*/, + const c10::optional &weight, bool density) { + auto hist = at::_histogramdd_from_bin_tensors(self, bins, weight, density); + return std::tuple>{ + std::move(hist), bins.vec()}; +} + +std::tuple> histogramdd( + const Tensor &self, IntArrayRef bins, c10::optional> range, + const c10::optional &weight, bool density) { + auto bin_edges = at::_histogramdd_bin_edges(self, bins, range, weight, density); + auto hist = at::_histogramdd_from_bin_cts(self, bins, range, weight, density); + return std::tuple>{ + std::move(hist), std::move(bin_edges)}; +} + +std::tuple> histogramdd( + const Tensor &self, int64_t bins, c10::optional> range, + const c10::optional &weight, bool density) { + DimVector bins_v(self.size(-1), bins); + return at::native::histogramdd(self, bins_v, range, weight, density); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/Histogram.h b/aten/src/ATen/native/Histogram.h index 02dbe4723b15..9df0aafafc18 100644 --- a/aten/src/ATen/native/Histogram.h +++ b/aten/src/ATen/native/Histogram.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp index d1117b8c1d4d..265b05054b0a 100644 --- a/aten/src/ATen/native/Itertools.cpp +++ b/aten/src/ATen/native/Itertools.cpp @@ -11,7 +11,7 @@ Tensor _triu_mask(int64_t n, int64_t dims, bool diagonal, TensorOptions opt) { // get a mask that has value 1 whose indices satisfies i < j < k < ... // or i <= j <= k <= ... 
(depending on diagonal) Tensor range = at::arange(n, opt.dtype(kLong)); - std::vector index_grids = at::meshgrid(std::vector(dims, range)); + std::vector index_grids = at::meshgrid(std::vector(dims, range), "ij"); Tensor mask = at::full(index_grids[0].sizes(), true, opt.dtype(kBool)); if(diagonal) { for(int64_t i = 0; i < dims - 1; i++) { @@ -46,9 +46,12 @@ Tensor cartesian_prod(TensorList tensors) { Tensor combinations(const Tensor& self, int64_t r, bool with_replacement) { TORCH_CHECK(self.dim() == 1, "Expect a 1D vector, but got shape ", self.sizes()); - TORCH_CHECK(r > 0, "Expect a positive number, but got ", r); + TORCH_CHECK(r >= 0, "Expect a non-negative number, but got ", r); + if (r == 0) { + return at::empty({0}, self.options()); + } int64_t num_elements = self.numel(); - std::vector grids = at::meshgrid(std::vector(r, self)); + std::vector grids = at::meshgrid(std::vector(r, self), "ij"); Tensor mask = _triu_mask(num_elements, r, with_replacement, self.options()); for(Tensor &t : grids) { t = t.masked_select(mask); diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp index 4e8dbbccdff7..bfac91a881ae 100644 --- a/aten/src/ATen/native/Lerp.cpp +++ b/aten/src/ATen/native/Lerp.cpp @@ -18,7 +18,7 @@ TORCH_META_FUNC(lerp_Tensor)( } TORCH_META_FUNC(lerp_Scalar)( - const Tensor& self, const Tensor& end, const Scalar& weight) { + const Tensor& self, const Tensor& end, const Scalar& /*weight*/) { TORCH_CHECK(self.dtype() == end.dtype(), "expected dtype ", self.dtype(), " for `end` but got dtype ", end.dtype()); build_binary_op(maybe_get_output(), self, end); @@ -29,12 +29,12 @@ TORCH_META_FUNC(lerp_Scalar)( namespace native { TORCH_IMPL_FUNC(lerp_Tensor)( - const Tensor& self, const Tensor& end, const Tensor& weight, const Tensor &out) { + const Tensor& /*self*/, const Tensor& /*end*/, const Tensor& weight, const Tensor& /*out*/) { lerp_kernel_tensor_weight(device_type(), *this); } TORCH_IMPL_FUNC(lerp_Scalar)( - const Tensor& self, const Tensor& end, const Scalar& weight, const Tensor &out) { + const Tensor& /*self*/, const Tensor& /*end*/, const Scalar& weight, const Tensor& /*out*/) { lerp_kernel_scalar_weight(device_type(), *this, weight); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 3a4a8e1fd7f2..127a2cdc1037 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,9 @@ Tensor linear(const Tensor& input, const Tensor& weight, const c10::optionaldefined() && input.is_contiguous()) { + // Also hit the fused path for contiguous 3D input. 
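  // Illustrative shape walk-through, not part of this patch: for a contiguous
  // [B, T, in_features] input the bias add and the GEMM below fuse into a single
  // addmm by flattening the leading dimensions, conceptually:
  //   input [B, T, in]  -> view -> [B * T, in]
  //   at::addmm(bias, flattened_input, weight.t()) -> [B * T, out] -> view -> [B, T, out]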
+ const auto input_sizes = input.sizes(); + const auto result = at::addmm(*bias, input.view({input_sizes[0] * input_sizes[1], input_sizes[2]}), weight.t()); + return result.view({input_sizes[0], input_sizes[1], result.size(1)}); + } auto output = at::matmul(input, weight.t()); if (bias->defined()) { - output.add_(*bias); + // for composite compliance use out-of-place version of `add` + if (isTensorSubclassLike(*bias)) { + output = at::add(output, *bias); + } else { + output.add_(*bias); + } } return output; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index aed94f107051..c7ed0850e778 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -28,16 +28,41 @@ #include namespace at { + +namespace detail { + void check_linalg_norm_dtype(optional opt_dtype, ScalarType self_dtype, const char* const name) { + if (opt_dtype.has_value()) { + auto dtype = opt_dtype.value(); + TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype), name, ": dtype should" + " be floating point or complex, but got ", dtype); + TORCH_CHECK(isComplexType(self_dtype) == isComplexType(dtype), + name, ": dtype should be ", isComplexType(self_dtype) ? "complex" : "real", + " for ", isComplexType(self_dtype) ? "complex" : "real", " inputs, but got ", dtype); + TORCH_CHECK(promoteTypes(self_dtype, dtype) == dtype, + name, ": the dtype of the input ", "(", self_dtype, ") should be convertible ", + "without narrowing to the specified dtype (", dtype, ")."); + } + } +} + namespace meta { -TORCH_META_FUNC(addmm)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); - TORCH_CHECK( - mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", - mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); - auto names = at::namedinference::propagate_names_for_addmm(mat1, mat2, self); +#define ADDMM_META() \ + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); \ + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); \ + TORCH_CHECK( \ + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", \ + mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); \ + \ + auto names = at::namedinference::propagate_names_for_addmm(mat1, mat2, self); \ set_output(0, {mat1.sizes()[0], mat2.sizes()[1]}, {}, self.options(), names); + +TORCH_META_FUNC(addmm)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { + ADDMM_META(); +} + +TORCH_META_FUNC(_addmm_activation)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, bool use_gelu) { + ADDMM_META(); } TORCH_META_FUNC(mm)(const Tensor & self, const Tensor & mat2) { @@ -51,6 +76,39 @@ TORCH_META_FUNC(mm)(const Tensor & self, const Tensor & mat2) { set_output(0, {self.sizes()[0], mat2.sizes()[1]}, {}, self.options(), names); } +TORCH_META_FUNC(linalg_vector_norm)(const Tensor& self, const Scalar& scalar_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { + at::native::checkFloatingOrComplex(self, "linalg.vector_norm"); + + auto dim = opt_dim.value_or(IntArrayRef{}); + // Casting a large integer 
to a double will just introduce an error for + // values larger than 10^53 (same for negative numbers), so that's fine. + auto ord = scalar_ord.toDouble(); + + // For more context, see issue 52783 + // If the tensor is empty and norm < 0 || norm == infty + // - We cannot reduce the whole tensor + // - We cannot reduce over an empty dimension + if (self.numel() == 0 && (ord < 0. || ord == INFINITY)) { + TORCH_CHECK(opt_dim.has_value(), + "linalg.vector_norm cannot compute the ", scalar_ord, " norm on an empty ", + "tensor because the operation does not have an identity"); + for (auto dim_num : dim) { + TORCH_CHECK(self.size(dim_num) != 0, + "linalg.vector_norm cannot compute the ", scalar_ord, " norm on an empty ", + "dimension because the operation does not have an identity"); + } + } + + at::detail::check_linalg_norm_dtype(opt_dtype, self.scalar_type(), "linalg.vector_norm"); + + auto mask = at::native::make_dim_mask(dim, self.dim()); + auto shape = at::native::shape_from_dim_mask(self, std::move(mask), keepdim); + auto options = self.options() + .dtype(toRealValueType(opt_dtype.value_or(self.scalar_type()))); + + set_output(shape, options); +} + template void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const c10::optional& self_baddbmm = nullopt) { TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); @@ -111,7 +169,6 @@ TORCH_META_FUNC(baddbmm)(const Tensor& self, const Tensor& batch1, const Tensor& namespace native { DEFINE_DISPATCH(addr_stub); -DEFINE_DISPATCH(linalg_vector_norm_stub); // As P is a permutation matrix // det(P) = 1 if it's an even permutation and det(P) = -1 if it's an odd permutation @@ -209,7 +266,7 @@ std::tuple linalg_slogdet_out(const Tensor& input, Tensor& sig checkSameDevice("linalg.slogdet", sign, input, "sign"); checkSameDevice("linalg.slogdet", logabsdet, input, "logabsdet"); checkLinalgCompatibleDtype("linalg.slogdet", sign, input, "sign"); - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); // logabsdet is always real-valued here checkLinalgCompatibleDtype("linalg.slogdet", logabsdet.scalar_type(), real_dtype, "logabsdet"); @@ -248,7 +305,7 @@ std::tuple get_atol_rtol( rtol = rtol_opt.value(); checkNotComplexTolerance(rtol, function_name, "rtol"); } else { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); auto default_rtol = at::full({}, _get_epsilon(real_dtype) * std::max(input.size(-1), input.size(-2)), options); rtol = atol_opt.has_value() ? at::where(atol_opt.value() > 0, at::zeros({}, options), default_rtol) @@ -266,7 +323,7 @@ std::tuple get_atol_rtol( if (rtol_opt.has_value()) { rtol = rtol_opt.value(); } else { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); auto default_rtol = _get_epsilon(real_dtype) * std::max(input.size(-1), input.size(-2)); rtol = (atol_opt.has_value() && atol_opt.value() > 0.0) ? 
0.0 @@ -416,6 +473,7 @@ Tensor linalg_matrix_power_impl( const Tensor& self, int64_t n, c10::optional _out) { + NoTF32Guard disable_tf32; auto out = _out.value_or(Tensor()); squareCheckInputs(self, "linalg.matrix_power"); @@ -1125,6 +1183,19 @@ static void addmm_impl_cpu_( return; } + // Some paths in the code below do not handle multiplications of the form [a, 0] x [0, b] + if (m1_sizes[1] == 0) { + if (beta.toComplexDouble() == 0.0) { + result.zero_(); + } else { + if (!self.is_same(result)) { + result.copy_(self); + } + result.mul_(beta); + } + return; + } + if (beta.toComplexDouble() != 0.0 && !self.is_same(result)) { result.copy_(self); } @@ -1201,7 +1272,7 @@ static void addmm_impl_cpu_( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj()); // Apply BLAS routine - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, result.scalar_type(), "addmm_impl_cpu_", [&]{ at::native::cpublas::gemm( @@ -1289,6 +1360,19 @@ TORCH_IMPL_FUNC(addmm_out_cpu)(const Tensor& self, const Tensor& mat1, const Ten } } +TORCH_IMPL_FUNC(addmm_activation_out_cpu)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, bool use_gelu, const Tensor &result) { + auto b_self = expand_size(self, {mat1.sizes()[0], mat2.sizes()[1]}, "addmm_out"); + { + at::NoNamesGuard guard; + addmm_impl_cpu_(const_cast(result), *b_self, mat1, mat2, beta, alpha); + if (use_gelu) { + at::gelu_(const_cast(result)); + } else { + at::relu_(const_cast(result)); + } + } +} + TORCH_IMPL_FUNC(mm_out_cpu)(const Tensor & self, const Tensor & mat2, const Tensor & result) { { at::NoNamesGuard guard; @@ -1393,20 +1477,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens // is_bmm_out: true for bmm_out, false for baddbmm_ // self_or_result is "self" for baddbmm_ and "result" for bmm_out Tensor& self_or_result = const_cast(self_or_result_); - CheckedFrom c = (is_bmm_out ? 
"bmm" : "baddbmm"); - - auto checkOnCPU = [](const Tensor& t, CheckedFrom c) { - TORCH_CHECK( - !t.is_cuda(), - "Expect tensor to have CPU backend, but got tensor with ", - toString(t.options().backend()), - " Backend (while checking arguments for ", - c); - }; - - checkOnCPU(self_or_result, c); - checkOnCPU(batch1, c); - checkOnCPU(batch2, c); const auto batch1_sizes = batch1.sizes(); const auto batch2_sizes = batch2.sizes(); @@ -1443,16 +1513,15 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "bmm", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, batch1.scalar_type(), "bmm", [&] { baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "baddbmm", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, batch1.scalar_type(), "baddbmm", [&] { baddbmm_cpu_kernel(self_or_result, batch1, batch2, beta, alpha); }); } } else if (at::hasMKL() && (( - self_or_result.scalar_type() != kHalf && self_or_result.scalar_type() != kBFloat16 && at::native::is_floating_point(self_or_result)) || at::native::is_complex(self_or_result)) @@ -1582,124 +1651,164 @@ Tensor& vdot_out(const Tensor& self, const Tensor& other, Tensor& result) { return result.fill_(self.vdot(other)); } +bool should_fold(const Tensor& tensor1, const int64_t dim_tensor2) { + const auto dim_tensor1 = tensor1.dim(); + if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { + const auto t1_sizes_ptr = tensor1.sizes().cbegin(); + const auto t1_strides = tensor1.strides(); + if (dim_tensor1 == 3 && dim_tensor2 == 2 && + t1_strides.back() != 1 && + t1_strides.front() == t1_sizes_ptr[1] * t1_sizes_ptr[2]) { + // First dim is slowest moving, and then the following two dims are + // transposed. This can happen for example by permute(0, 2, 1). + // First 2 dims could be folded to use mm but would require permutation + // with actual data movement, which can be instead handled by BMM with each + // GEMM transposed. + // This can be generalized to a tensor with dim X + Y + Z where X, Y, and Z + // dims are contiguous, Y dims and Z dims are transposed, and X, Y, Z > 0. + // For example, this can happen by permute(0, 1, 5, 2, 3, 4), where X = 2, + // Y = 3, and Z = 1. + return false; + } else { + return true; + } + } else { + return false; + } +} + /* Matrix product of two Tensors. The behavior depends on the dimensionality of the Tensors as follows: -- If both Tensors are 1-dimensional, the dot product (scalar) is returned. -- If both arguments are 2-dimensional, the matrix-matrix product is returned. -- If the first argument is 1-dimensional and the second argument is 2-dimensional, - a 1 is prepended to its dimension for the purpose of the matrix multiply. - After the matrix multiply, the prepended dimension is removed. -- If the first argument is 2-dimensional and the second argument is 1-dimensional, - the matrix-vector product is returned. -- If both arguments are at least 1-dimensional and at least one argument is - N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first - argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the - batched matrix multiply and removed after. 
If the second argument is 1-dimensional, a - 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. - The non-matrix (i.e. batch) dimensions are broadcasted (and thus - must be broadcastable). For example, if tensor1 is a (j x 1 x n x m) Tensor - and tensor2 is a (k x m x p) Tensor, the returned tensor will be an (j x k x n x p) Tensor. +- If both Tensors are 1-dimensional, (1d) the dot product (scalar) is returned. +- If the arguments are 2D - 1D or 1D - 2D, the matrix-vector product is returned. +- If both arguments are 2D, the matrix-matrix product is returned. +- If one of the arguments is ND with N >= 3 and the other is 1D or 2D, and some + conditions on the strides apply (see should_fold) we fold the first N-1 dimensions + of the ND argument to form a matrix, call mm or mv, reshape it back to ND and return it +- Otherwise, we return bmm, after broadcasting and folding the batched dimensions if + there's more than one */ -Tensor matmul( - c10::optional out_opt, +Tensor _matmul_impl( + Tensor& out, const Tensor& tensor1, const Tensor& tensor2) { NoNamesGuard guard; - auto dim_tensor1 = tensor1.dim(); - auto dim_tensor2 = tensor2.dim(); - auto has_out = out_opt.has_value(); - Tensor out = out_opt.value_or(Tensor()); + const auto dim_tensor1 = tensor1.dim(); + const auto dim_tensor2 = tensor2.dim(); + + // This is checked up here to simplify the logic below + // Note that the strings are just evaluated on failure, so almost always we just evaluate + // the condition and move on + TORCH_CHECK(dim_tensor1 != 0 && dim_tensor2 != 0, + "both arguments to matmul need to be at least 1D, but they are ", + dim_tensor1, "D and ", dim_tensor2, "D"); + + + const bool has_out = out.defined(); if (dim_tensor1 == 1 && dim_tensor2 == 1) { - return has_out ? at::native::dot_out(tensor1, tensor2, out) : tensor1.dot(tensor2); + return has_out ? at::dot_out(out, tensor1, tensor2) : tensor1.dot(tensor2); } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { return has_out ? at::mv_out(out, tensor1, tensor2) : tensor1.mv(tensor2); } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { - return has_out ? at::mm_out(out, tensor1.unsqueeze(0), tensor2).squeeze_(0) - : tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); + return has_out ? at::mv_out(out, tensor2.t(), tensor1) : tensor2.t().mv(tensor1); } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { return has_out ? at::mm_out(out, tensor1, tensor2) : tensor1.mm(tensor2); - } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { - // optimization: use mm instead of bmm by folding tensor1's batch into - // its leading matrix dimension. - - Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2; - auto size1 = tensor1.sizes(); - auto size2 = t2.sizes(); - std::vector output_size; - output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); - if (dim_tensor2 > 1) { - output_size.push_back(size2[dim_tensor2 - 1]); - } - - // fold the batch into the first dimension - // Why not tensor1.view(-1, size1[size1.size() -1])? + } else if (should_fold(tensor1, dim_tensor2) || should_fold(tensor2, dim_tensor1)) { + // dim_tensor1 >=3 && (dim_tensor2 == 1 || dim_tensor2 == 2) || + // dim_tensor2 >=3 && (dim_tensor1 == 1 || dim_tensor1 == 2) + // and some condition on the strides is fulfilled + + // optimization: use mm instead of bmm by folding the batch of the larger tensor + // into its leading matrix dimension + const auto transpose = dim_tensor2 > dim_tensor1; + const auto t1 = transpose ? 
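  // Illustrative sketch, not part of this patch: the fold turns an ND-by-2D
  // product into a single mm. For a contiguous [B, M, K] @ [K, N] this is
  // conceptually:
  //   auto t1  = at::randn({8, 5, 3});             // [B, M, K]
  //   auto t2  = at::randn({3, 7});                // [K, N]
  //   auto out = t1.reshape({8 * 5, 3}).mm(t2)     // [B*M, N]
  //                .view({8, 5, 7});               // same result as at::matmul(t1, t2)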
MaybeOwned::owned(tensor2.mT()) + : MaybeOwned::borrowed(tensor1); + const auto t2 = !transpose ? MaybeOwned::borrowed(tensor2) + : dim_tensor1 == 2 + ? MaybeOwned::owned(tensor1.t()) + : MaybeOwned::borrowed(tensor1); + // Invariant: t1->dim() >= 3 && (t2->dim() == 1 || t2->dim() == 2) + // and *t1 and *t2 are matmul-compatible + + // Why not t1->view(-1, sizes_1.back())? // If the last dim is 0, then view(-1, 0) won't work because the -1 becomes ambiguous. // This can happen in e.g. [3, 5, 0] @ [0, 0]. - // So we manually compute the folding as a result. - const auto dim1_size = c10::multiply_integers(size1.begin(), size1.end() - 1); - auto t1 = tensor1.expect_contiguous()->view({dim1_size, size1[size1.size() - 1]}); - Tensor output = has_out ? at::_unsafe_view(at::mm_out(out, t1, t2), output_size) - : at::_unsafe_view(t1.mm(t2), output_size); - return has_out ? out.set_(output) : output; - } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) { - // optimization: transpose the inner dimensions of the arguments, call - // matmul on the swapped arguments, then transpose the inner dimensions - // of the result. - const int64_t n = dim_tensor1 == 2 ? tensor1.size(-2) : 1; - const int64_t m = tensor1.size(-1); - const int64_t p = tensor2.size(-1); - - const Tensor t2_T = tensor2.transpose(-1, -2); - const Tensor t1_T = dim_tensor1 == 2 ? tensor1.t() : tensor1.reshape({n, m}).t(); - const Tensor res_T = matmul(out_opt, t2_T, t1_T); - - if (dim_tensor1 == 2) { - Tensor res = res_T.transpose(-1, -2).contiguous(); - return has_out ? out.set_(res) : res; + const auto sizes_1 = t1->sizes(); + auto output_shape = DimVector(sizes_1.begin(), sizes_1.end() - 1); + const auto folded_dim1 = c10::multiply_integers(output_shape); + + // Readjust output_shape if we are multiplying by a matrix + const auto t2_is_matrix = t2->dim() == 2; + if (t2_is_matrix) { + output_shape.push_back(t2->sizes()[1]); } - else { - std::vector shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec(); - shape.push_back(p); - - Tensor res = res_T.reshape(shape).contiguous(); - return has_out ? out.set_(res) : res; + const auto t1_folded = t1->reshape({folded_dim1, sizes_1.back()}); + if (!has_out) { + if (t2_is_matrix) { + // FIXME This path always does an unnecessary copy when transpose == true as the returned + // result from BLAS is already C-transposed + const auto output = at::_unsafe_view(t1_folded.mm(*t2), output_shape); + return transpose ? output.mT().contiguous() : output; + } else { + return at::_unsafe_view(t1_folded.mv(*t2), output_shape); + } + } else { + // Resize output into the correct shape + const auto transpose_out = transpose && t2_is_matrix; + if (transpose_out) { + // Swap last two elements of output_shape + std::iter_swap(output_shape.end() - 2, output_shape.end() - 1); + at::native::resize_output(out, output_shape); + std::iter_swap(output_shape.end() - 2, output_shape.end() - 1); + } else { + at::native::resize_output(out, output_shape); + } + const auto out_ = transpose_out ? c10::MaybeOwned::owned(out.mT()) + : c10::MaybeOwned::borrowed(out); + + // We then reshape the output to the expected shape and call mm/mv + // and transpose back if necessary + auto reshaped_out = t2_is_matrix ? 
out_->reshape({folded_dim1, t2->sizes().back()}) + : out_->reshape({folded_dim1}); + if (t2_is_matrix) { + at::mm_out(reshaped_out, t1_folded, *t2); + } else { + at::mv_out(reshaped_out, t1_folded, *t2); + } + if (!reshaped_out.is_alias_of(out)) { + out_->copy_(reshaped_out.view_as(*out_)); + } + return out; } - } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { - // We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list); - // we track m1 vs m2 separately even though they must match for nicer error messages - int64_t n = dim_tensor1 > 1 ? tensor1.size(-2) : 1; - int64_t m1 = tensor1.size(-1); - IntArrayRef batch_tensor1(tensor1.sizes().data(), std::max(dim_tensor1 - 2, 0)); - int64_t m2 = dim_tensor2 > 1 ? tensor2.size(-2) : 1; - int64_t p = tensor2.size(-1); - IntArrayRef batch_tensor2(tensor2.sizes().data(), std::max(dim_tensor2 - 2, 0)); - - // expand the batch portion (i.e. cut off matrix dimensions and expand rest) - std::vector expand_batch_portion = infer_size(batch_tensor1, batch_tensor2); - - std::vector tensor1_expand_size(expand_batch_portion); - tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1}); - - std::vector tensor2_expand_size(expand_batch_portion); - tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p}); - - const int64_t expand_batch_product = - c10::multiply_integers(expand_batch_portion); - - std::vector tensor1_bmm_view({expand_batch_product}); - tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1}); - - std::vector tensor2_bmm_view({expand_batch_product}); - tensor2_bmm_view.insert(tensor2_bmm_view.end(), {m2, p}); + } else { + // dim_tensor1 >= 3 || dim_tensor2 >= 3 + // We track m1 vs m2 separately even though they must match for nicer error messages + const int64_t n = dim_tensor1 > 1 ? tensor1.sizes().cend()[-2] : 1LL; + const int64_t m1 = tensor1.sizes().back(); + const IntArrayRef batch_tensor1(tensor1.sizes().data(), + std::max(dim_tensor1 - 2, 0LL)); + const int64_t m2 = dim_tensor2 > 1 ? tensor2.sizes().cend()[-2] : tensor2.sizes().back(); + const int64_t p = dim_tensor2 > 1 ? tensor2.sizes().back() : 1LL; + const IntArrayRef batch_tensor2(tensor2.sizes().data(), + std::max(dim_tensor2 - 2, 0LL)); + auto output_shape = infer_size_dimvector(batch_tensor1, batch_tensor2); + + const auto tensor1_expand_size = [&output_shape, n, m1]{ DimVector ret(output_shape); + ret.append({n, m1}); + return ret; }(); + const auto tensor2_expand_size = [&output_shape, m2, p]{ DimVector ret(output_shape); + ret.append({m2, p}); + return ret; }(); + + const int64_t expand_batch_product = c10::multiply_integers(output_shape); // flatten expanded batches - Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size).reshape(tensor1_bmm_view); - Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size).reshape(tensor2_bmm_view); - - // reshape batches back into result - std::vector output_shape(expand_batch_portion); + const auto tensor1_expanded = tensor1.expand(tensor1_expand_size) + .reshape({expand_batch_product, n, m1}); + const auto tensor2_expanded = tensor2.expand(tensor2_expand_size) + .reshape({expand_batch_product, m2, p}); if (dim_tensor1 > 1) { output_shape.push_back(n); } @@ -1707,37 +1816,42 @@ Tensor matmul( output_shape.push_back(p); } - Tensor output = has_out ? at::_unsafe_view(at::bmm_out(out, tensor1_expanded, tensor2_expanded), output_shape) - : at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape); - - return has_out ? 
out.set_(output) : output; + if (!has_out) { + return at::_unsafe_view(tensor1_expanded.bmm(tensor2_expanded), output_shape); + } else { + at::native::resize_output(out, output_shape); + auto reshaped_out = out.reshape({expand_batch_product, n, p}); + at::bmm_out(reshaped_out, tensor1_expanded, tensor2_expanded); + if (!reshaped_out.is_alias_of(out)) { + out.copy_(reshaped_out.view_as(out)); + } + return out; + } } - - AT_ERROR("both arguments to matmul need to be at least 1D, but they are ", - dim_tensor1, "D and ", dim_tensor2, "D"); } Tensor matmul(const Tensor & tensor1, const Tensor & tensor2) { auto maybe_outnames = namedinference::compute_matmul_outnames(tensor1, tensor2); - auto result = at::native::matmul(c10::nullopt, tensor1, tensor2); + at::Tensor unused; + auto result = at::native::_matmul_impl(unused, tensor1, tensor2); namedinference::propagate_names_if_nonempty(result, maybe_outnames); return result; } Tensor& matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &result) { auto maybe_outnames = namedinference::compute_matmul_outnames(tensor1, tensor2); - at::native::matmul(c10::optional(result), tensor1, tensor2); + at::native::_matmul_impl(result, tensor1, tensor2); namedinference::propagate_names_if_nonempty(result, maybe_outnames); return result; } // torch.linalg.matmul, alias for torch.matmul Tensor linalg_matmul(const Tensor & tensor1, const Tensor & tensor2) { - return at::native::matmul(tensor1, tensor2); + return at::matmul(tensor1, tensor2); } Tensor& linalg_matmul_out(const Tensor & tensor1, const Tensor & tensor2, Tensor &result) { - return at::native::matmul_out(tensor1, tensor2, result); + return at::matmul_out(result, tensor1, tensor2); } // torch.linalg.diagonal, alias for torch.diagonal with dim1=-2, dim2=-1 as defaults @@ -1798,8 +1912,10 @@ void _fill_matrix_powers(Tensor& buffer, const Tensor& a, int num_matrices) { // fill a^2 if (2 <= num_matrices - 1) { - at::native::matmul( - buffer.select(0, 2), // out for a^2 + // out for a^2 + auto view_out = buffer.select(0, 2); + _matmul_impl( + view_out, buffer.select(0, 1), buffer.select(0, 1) ); @@ -1807,8 +1923,10 @@ void _fill_matrix_powers(Tensor& buffer, const Tensor& a, int num_matrices) { // fill a^3 if (3 <= num_matrices - 1) { - at::native::matmul( - buffer.select(0, 3), // out for a^3 + // out for a^3 + auto view_out = buffer.select(0, 3); + _matmul_impl( + view_out, buffer.select(0, 1), buffer.select(0, 2) ); @@ -1816,8 +1934,10 @@ void _fill_matrix_powers(Tensor& buffer, const Tensor& a, int num_matrices) { // fill a^6 if (4 <= num_matrices - 1) { - at::native::matmul( - buffer.select(0, 4), + // out for a^6 + auto view_out = buffer.select(0, 4); + _matmul_impl( + view_out, buffer.select(0, 3), buffer.select(0, 3) ); @@ -1847,7 +1967,7 @@ inline Tensor _blob_to_Tensor( // we also insert a fake dimension so that the result could directly // be used in _compute_linear_combination auto tensor = at::from_blob((void*)blob.begin(), blob.size(), - c10::toValueType(in.scalar_type())).unsqueeze(0); + c10::toRealValueType(in.scalar_type())).unsqueeze(0); return _move_memory_if_cuda_input(tensor, in); } @@ -1875,9 +1995,10 @@ Tensor compute_T4(const Tensor& A) { // 3 for {I, A, A^2} _fill_matrix_powers(As, A, 3); - at::native::matmul( - // output for A^2 * (I / 2 + A / 6 + A^2 / 24) - As.select(0, 3), + // output for A^2 * (I / 2 + A / 6 + A^2 / 24) + auto view_out = As.select(0, 3); + _matmul_impl( + view_out, // contains A^2 As.select(0, 2), // computes (I / 2 + A / 6 + A^2 / 24) @@ -1909,10 
+2030,11 @@ Tensor compute_T8(const Tensor& A) { // 3 for {I, A, A^2} _fill_matrix_powers(As, A, 3); + // output for A4 + auto view_out = As.select(0, 3); // A4 = A2 * (x1 * A + x2 * A2) - at::native::matmul( - // output for A4 - As.select(0, 3), + _matmul_impl( + view_out, // As.select(0, 2) = A^2 As.select(0, 2), at::native::_compute_linear_combination( @@ -1922,10 +2044,11 @@ Tensor compute_T8(const Tensor& A) { ) ); + // output for A8 + view_out = As.select(0, 4); // A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4) - at::native::matmul( - // output for A8 - As.select(0, 4), + _matmul_impl( + view_out, // x3 * A2 + A4 at::native::_compute_linear_combination( As.narrow(0, 2, 2), @@ -1980,7 +2103,7 @@ Tensor compute_T12(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - c10::toValueType(A.scalar_type()) + c10::toRealValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -1989,17 +2112,17 @@ Tensor compute_T12(const Tensor& A) { auto Bs = at::native::_compute_linear_combination(As, bs); + // output for A6 + auto view_out = As.select(0, 0); // compute A6 - Bs.select(0, 2).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + Bs.select(0, 2).add_(_matmul_impl( + view_out, Bs.select(0, 3), Bs.select(0, 3) )); - return Bs.select(0,0).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + return Bs.select(0, 0).add_(_matmul_impl( + view_out, Bs.select(0, 1).add_(Bs.select(0, 2)), Bs.select(0, 2) )); @@ -2052,7 +2175,7 @@ Tensor compute_T18(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - c10::toValueType(A.scalar_type()) + c10::toRealValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -2061,17 +2184,17 @@ Tensor compute_T18(const Tensor& A) { auto Bs = at::native::_compute_linear_combination(As, bs); + // tmp buffer for this matrix product + auto view_out = As.select(0, 0); // compute A9 - Bs.select(0, 3).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + Bs.select(0, 3).add_(_matmul_impl( + view_out, Bs.select(0, 0), Bs.select(0, 4)) ); - return Bs.select(0, 1).add_(at::native::matmul( - // tmp buffer for this matrix product - As.select(0, 0), + return Bs.select(0, 1).add_(_matmul_impl( + view_out, Bs.select(0, 2).add_(Bs.select(0, 3)), Bs.select(0, 3) )); @@ -2280,6 +2403,218 @@ Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { ); } +TORCH_IMPL_FUNC(linalg_vector_norm_out)(const Tensor& self, const Scalar& scalar_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype, const Tensor& result) { + // Casting a large integer to a double will just introduce an error for + // values larger than 10^53 (same for negative numbers), so that's fine. + auto ord = scalar_ord.toDouble(); + auto dim = opt_dim.value_or(IntArrayRef{}); + // No need to handle opt_dtype explicitly as it is already encoded in the dtype of result + + // Issue arising from the difference between vectorized and non-vectorized implementation on CPU + Tensor self_; + if (self.device().type() == c10::kCPU && + isComplexType(self.scalar_type()) && + std::abs(ord) == INFINITY) { + // TODO: This at::abs() call is used so that the at::abs() call in the + // backward function produces an identical result for complex inputs. + // However, it would be ideal if we could incorporate this into + // linalg_vector_norm_stub. 
See issue: + // https://github.com/pytorch/pytorch/issues/52648 + auto in_dtype = opt_dtype.value_or(self.scalar_type()); + self_ = self.to(in_dtype).abs(); + } else { + self_ = self; + } + + auto iter = make_reduction("vector_norm", const_cast(result), self_, dim, keepdim, result.scalar_type()); + norm_stub(iter.device_type(), iter, ord); +} + +void _linalg_matrix_norm_checks(const Tensor& A, IntArrayRef dim, optional opt_dtype) { + at::native::checkFloatingOrComplex(A, "linalg.matrix_norm"); + TORCH_CHECK(A.dim() >= 2, + "linalg.matrix_norm: input tensor must be a matrix or a batch of matrices"); + + // dim + TORCH_CHECK(dim.size() == 2, "linalg.matrix_norm: dim must be a 2-tuple of ints"); + TORCH_CHECK(dim[0] != dim[1], "Expected dims to be different, got (", dim[0], ", ", dim[1], ") instead"); + + // dtype + at::detail::check_linalg_norm_dtype(opt_dtype, A.scalar_type(), "linalg.matrix_norm"); +} + +Tensor linalg_matrix_norm( + const Tensor& A, + const Scalar& scalar_ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype) { + _linalg_matrix_norm_checks(A, dim, opt_dtype); + + auto ord = scalar_ord.toDouble(); + auto abs_ord = std::abs(ord); + TORCH_CHECK(abs_ord == 2. || abs_ord == 1. || abs_ord == INFINITY, "linalg.matrix_norm: Order ", ord, " not supported."); + + auto dim_ = dim.vec(); + maybe_wrap_dims(dim_, A.dim()); + + auto max_min = [ord, keepdim](const Tensor& A, int64_t dim) { return ord > 0 ? A.amax(dim, keepdim) : A.amin(dim, keepdim); }; + if (abs_ord == 2.) { + // Move dims to the end + auto permutation = create_dim_backshift_permutation(dim_[0], dim_[1], A.dim()); + + auto A_ = opt_dtype.has_value() ? A.to(*opt_dtype) : A; + auto result = max_min(at::linalg_svdvals(A_.permute(permutation)), -1); + if (keepdim) { + auto permutation_reverse = create_reverse_permutation(permutation); + result = result.unsqueeze(-1).permute(permutation_reverse); + } + return result; + } else { // 1, -1, inf, -inf + // The infty norm is like the 1 norm on the transposed matrix + if (abs_ord == INFINITY) { + std::swap(dim_[0], dim_[1]); + } + + // If the first reduction removes one dim from the front (dim_[0] < dim_[1]), after this + // reduction dim_[1] will be off by one + if (!keepdim && (dim_[0] < dim_[1])) { + dim_[1]--; + } + return max_min(at::linalg_vector_norm(A, 1., {dim_[0]}, keepdim, opt_dtype), dim_[1]); + } +} + +Tensor& linalg_matrix_norm_out( + const Tensor& A, + const Scalar& ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype, + Tensor& result) { + checkSameDevice("linalg.matrix_norm", A, result); + auto out = at::linalg_matrix_norm(A, ord, dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.matrix_norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +// fro / nuc +Tensor linalg_matrix_norm( + const Tensor& A, + c10::string_view ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype) { + _linalg_matrix_norm_checks(A, dim, opt_dtype); + TORCH_CHECK(ord == "fro" || ord == "nuc", "linalg.matrix_norm: Order ", ord, " not supported."); + + auto A_ = opt_dtype.has_value() ? 
A.to(*opt_dtype) : A; + using Int = IntArrayRef::value_type; + + if (ord == "fro") { + return at::linalg_vector_norm(A_, 2, dim, keepdim); + } else { // nuc + auto dim_ = dim.vec(); + maybe_wrap_dims(dim_, A_.dim()); + // Move dims to the end + auto permutation = create_dim_backshift_permutation(dim_[0], dim_[1], A_.dim()); + auto result = at::linalg_svdvals(A_.permute(permutation)).sum(-1, keepdim); + if (keepdim) { + auto permutation_reverse = create_reverse_permutation(permutation); + result = result.unsqueeze(-1).permute(permutation_reverse); + } + return result; + } +} + +Tensor& linalg_matrix_norm_out( + const Tensor& A, + c10::string_view ord, + IntArrayRef dim, + bool keepdim, + optional opt_dtype, + Tensor& result) { + checkSameDevice("linalg.matrix_norm", A, result); + auto out = at::linalg_matrix_norm(A, ord, dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.matrix_norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +// Numerical or None norms +Tensor linalg_norm(const Tensor& X, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { + if (opt_dim.has_value()) { + TORCH_CHECK(opt_dim->size() == 1 || opt_dim ->size() == 2, "linalg.norm: If ", + "dim is specified, it must be of length 1 or 2. Got ", *opt_dim); + } else { + if (opt_ord.has_value()) { + TORCH_CHECK(X.dim() == 1 || X.dim() == 2, "linalg.norm: If ", + "dim is not specified but ord is, the input must be 1D or 2D. Got ", X.dim(), "D."); + } + } + + // If ord=None, we'll always use the 2-norm or frob norm (which are the same) so we go through + // vector_norm + if (opt_ord.has_value() && + ((opt_dim.has_value() && opt_dim->size() == 2) || + (!opt_dim.has_value() && X.dim() == 2))) { + using Int = IntArrayRef::value_type; + auto dim = opt_dim.has_value() ? opt_dim.value().vec() : std::vector{0, 1}; + return at::linalg_matrix_norm(X, *opt_ord, dim, keepdim, opt_dtype); + } else { + auto scalar_ord = opt_ord.value_or(Scalar(2.)); + return at::linalg_vector_norm(X, scalar_ord, opt_dim, keepdim, opt_dtype); + } +} + +Tensor& linalg_norm_out(const Tensor& X, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { + checkSameDevice("linalg.norm", X, result); + auto out = at::linalg_norm(X, opt_ord, opt_dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +// Frobenius and nuclear norms +Tensor linalg_norm(const Tensor& X, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { + if (opt_dim.has_value()) { + TORCH_CHECK(opt_dim->size() == 1 || opt_dim ->size() == 2, "linalg.norm: If ", + "dim is specified, it mut be of length 1 or 2. Got ", *opt_dim); + } else { + TORCH_CHECK(X.dim() == 1 || X.dim() == 2, "linalg.norm: If ", + "dim is not specified but ord is, the input must be 1D or 2D. Got ", X.dim(), "D."); + } + using Int = IntArrayRef::value_type; + auto dim = opt_dim.has_value() ? 
opt_dim.value().vec() : std::vector{0, 1}; + return at::linalg_matrix_norm(X, ord, dim, keepdim, opt_dtype); +} + +Tensor& linalg_norm_out(const Tensor& X, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { + checkSameDevice("linalg.norm", X, result); + auto out = at::linalg_norm(X, ord, opt_dim, keepdim, opt_dtype); + TORCH_CHECK(out.scalar_type() == result.scalar_type(), + "linalg.norm expected out tensor dtype ", out.scalar_type(), + " but got: ", result.scalar_type()); + at::native::resize_output(result, out.sizes()); + result.copy_(out); + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// Frobenius Norm // +// Just used in linalg.norm. It should not be removed. // +//////////////////////////////////////////////////////////////////////////////// + Tensor frobenius_norm(const Tensor& self) { return at::norm(self); } @@ -2287,7 +2622,7 @@ Tensor frobenius_norm(const Tensor& self) { Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { // NOTE: As frobenius_norm_out is currently implemented, it will always produce a // strided tensor result, even if the input is sparse. - auto options = self.options().layout(c10::Layout::Strided).dtype(toValueType(self.scalar_type())); + auto options = self.options().layout(c10::Layout::Strided).dtype(toRealValueType(self.scalar_type())); Tensor result = at::empty({0}, options); return at::native::frobenius_norm_out(self, dim, keepdim, result); } @@ -2322,6 +2657,11 @@ Tensor &frobenius_norm_out(const Tensor& self, return result; } +//////////////////////////////////////////////////////////////////////////////// +// Nuclear Norm // +// Just used in linalg.norm. It should not be removed. // +//////////////////////////////////////////////////////////////////////////////// + Tensor nuclear_norm(const Tensor& self, bool keepdim) { TORCH_CHECK( self.dim() == 2, @@ -2338,12 +2678,8 @@ Tensor &nuclear_norm_out(const Tensor& self, bool keepdim, Tensor& result) { return at::native::nuclear_norm_out(self, IntArrayRef({0, 1}), keepdim, result); } -Tensor nuclear_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options().dtype(toValueType(self.scalar_type()))); - return at::native::nuclear_norm_out(self, dim, keepdim, result); -} - -Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { +namespace { +Tensor nuclear_norm_impl(const Tensor& self, IntArrayRef dim, bool keepdim) { TORCH_CHECK(dim.size() == 2, "nuclear norm requires a 'dim' argument of size 2"); auto dim_ = dim.vec(); maybe_wrap_dims(dim_, self.dim()); @@ -2356,323 +2692,25 @@ Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tens auto permutation_reverse = create_reverse_permutation(permutation); result_ = result_.permute(permutation_reverse); } - at::native::resize_output(result, result_.sizes()); - result.copy_(result_); - return result; -} - -// Creates a vector of length ndim with values equal to its indices -// (e.g. 
[0, 1, 2, ..., ndim-1]) -static std::vector make_dim_list(int64_t ndim) { - std::vector dim_list(ndim); - for (const auto ind : c10::irange(ndim)) { - dim_list[ind] = ind; - } - return dim_list; -} - -// Checks for valid arguments to linalg_norm when type(ord) == str -static void check_str_ord_valid(const c10::string_view str_ord, optional opt_dim, int64_t ndim) { - TORCH_CHECK((str_ord == "nuc") || (str_ord == "fro"), "Invalid norm order: ", str_ord); - bool dims_valid = (ndim == 2 && !opt_dim.has_value()) || (opt_dim.has_value() && opt_dim.value().size() == 2); - TORCH_CHECK(dims_valid, "order \"", str_ord, - "\" can only be used if either len(dim) == 2 or (self.dim() == 2 and dim is None)"); + return result_; } +} // anonymous namespace -// Performs second dimension reduction for matrix norms -static Tensor _norm_min_max(Tensor& self, double ord, int64_t dim, bool keepdim) { - if (ord > 0) { - return self.amax(dim, keepdim); - } else { - return self.amin(dim, keepdim); - } +Tensor nuclear_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { + return nuclear_norm_impl(self, dim, keepdim).to(toRealValueType(self.scalar_type())); } -// Performs matrix norm -static Tensor& _linalg_norm_matrix_out(Tensor& result, const Tensor &self, const optional& opt_ord, - IntArrayRef dim, bool keepdim, optional opt_dtype) { - Tensor result_; - auto ord = opt_ord.value_or(2.0).toDouble(); - TORCH_CHECK(self.layout() == Layout::Strided, - "matrix norm only supports strided layout, got: ", self.layout()); - - TORCH_CHECK(dim.size() == 2, "_linalg_norm_matrix: 'dim' must either specify 2 dimensions. ", - "Got 'dim' specifying ", dim.size(), " dims"); - auto dim_ = dim.vec(); - maybe_wrap_dims(dim_, self.dim()); - TORCH_CHECK(dim_[0] != dim_[1], - "Expected dims to be different, got (", dim[0], ", ", dim[1], ") instead"); - - ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); - TORCH_CHECK( - at::isFloatingType(scalarType) || at::isComplexType(scalarType), - "Can only calculate the mean of floating and complex types. Got ", - toString(scalarType), " instead."); - - Tensor self_; - if (opt_dtype.has_value()) { - self_ = self.to(scalarType); - } else { - self_ = self; - } - - if (std::abs(ord) == 2) { - // Need to shift the reduction dims to the back, because at::linalg_svdvals will only operate on - // the last 2 dimensions - auto permutation = create_dim_backshift_permutation(dim_[0], dim_[1], self.dim()); - auto permutation_reverse = create_reverse_permutation(permutation); - - result_ = at::linalg_svdvals(self_.permute(permutation)); - result_ = _norm_min_max(result_, ord, result_.dim() - 1, keepdim); - - if (keepdim) { - result_ = result_.unsqueeze(-1).permute(permutation_reverse); - } - } else { - // abs(p) == infinity and abs(p) == 1 will perform identical reductions, except - // that the order of the two dims is swapped. So we can swap the dims if - // abs(p) == infinity to simplify the rest of the operation's logic. - if (std::abs(ord) == INFINITY) { - std::swap(dim_[0], dim_[1]); - } - // If the dim of the second reduction is greater than that of the first reduction - // and we are not keeping the dims, then the fact that the output of the first - // reduction will have one fewer dimension means that the second reduction dim - // will be off by one, so we need to correct that. 
- if ((dim_[1] > dim_[0]) && !keepdim) { - dim_[1]--; - } - if (std::abs(ord) == 1 || std::abs(ord) == INFINITY) { - result_ = self_.abs().sum(dim_[0], keepdim); - result_ = _norm_min_max(result_, ord, dim_[1], keepdim); - } else { - TORCH_CHECK(false, "Order ", ord, " not supported for matrix norm"); - } - } +Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { + auto result_ = nuclear_norm_impl(self, dim, keepdim); at::native::resize_output(result, result_.sizes()); result.copy_(result_); return result; } -static Tensor& linalg_norm_out_impl(Tensor& result, const Tensor& self, const optional& opt_num_ord, optional opt_str_ord, optional opt_dim, bool keepdim, optional opt_dtype) { - // Callers must give the ord argument as either a number, a string, or neither. - // Since the user-facing API has no direct control over how this function is called, this is an internal assert. - TORCH_INTERNAL_ASSERT(!(opt_num_ord.has_value() && opt_str_ord.has_value())); - if (opt_dtype.has_value()) { - auto dtype = opt_dtype.value(); - TORCH_CHECK(dtype == result.scalar_type(), "provided dtype must match dtype of result, but got", - "dtype = ", dtype, ", out.dtype = ", result.scalar_type()); - } - int64_t ndim = self.dim(); - if (opt_str_ord.has_value()) { - // 'ord' is string - auto str_ord = opt_str_ord.value(); - check_str_ord_valid(str_ord, opt_dim, ndim); - Tensor self_ = opt_dtype.has_value() ? self.to(opt_dtype.value()) : self; - if (str_ord == "fro") { - at::frobenius_norm_out(result, self_, opt_dim.value_or(IntArrayRef({0, 1})), keepdim); - } else if (str_ord == "nuc") { - if (opt_dim.has_value()) { - at::nuclear_norm_out(result, self_, opt_dim.value(), keepdim); - } else { - at::nuclear_norm_out(result, self_, keepdim); - } - } - } else { - // 'ord' is int or None - std::vector dim_ = opt_dim.has_value() ? 
opt_dim.value().vec() : make_dim_list(ndim); - if (!opt_num_ord.has_value() || dim_.size() == 1) { - Tensor result_ = at::linalg_vector_norm( - self, opt_num_ord.value_or(2), opt_dim, keepdim, opt_dtype); - // TODO: Resize and copy should be avoided with - // https://github.com/pytorch/pytorch/issues/52712 - at::native::resize_output(result, result_.sizes()); - result.copy_(result_); - } else if (dim_.size() == 2) { - _linalg_norm_matrix_out(result, self, opt_num_ord.value(), dim_, keepdim, opt_dtype); - } else { - TORCH_CHECK(false, "'dim' must specify 1 or 2 dimensions when order is numerical and input is " - "not 1-D or 2-D"); - } - } - return result; -} - -static Tensor& linalg_vector_norm_impl(const Tensor& self, const Scalar& scalar_ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - // Casting a large integer to a double will introduce some error, but for - // practical purposes, it won't matter since a large order will usually - // give an infinite result - auto ord = scalar_ord.toDouble(); - - TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA, - "linalg.vector_norm only supports CPU and CUDA device types, but got: ", - self.device().type()); - TORCH_CHECK(self.layout() == Layout::Strided, - "linalg.vector_norm only supports strided layout, but got: ", self.layout()); - - if (opt_dtype.has_value() && isComplexType(self.scalar_type())) { - TORCH_CHECK(isComplexType(opt_dtype.value()), - "linalg.vector_norm expected complex 'dtype', since input is complex, ", - "but got ", opt_dtype.value()); - } - - checkFloatingOrComplex(self, "linalg.vector_norm"); - ScalarType in_dtype = opt_dtype.value_or(self.scalar_type()); - - IntArrayRef dim = opt_dim.value_or(IntArrayRef{}); - - if (self.numel() == 0) { - // TODO: The question about how to handle negative orders when the input - // is empty has not been settled yet. For now, we raise an error. Issue: - // https://github.com/pytorch/pytorch/issues/52783 - TORCH_CHECK(ord >= 0, - "linalg.vector_norm of negative order cannot be performed on an empty tensor"); - - // For NumPy compatibility, we can only perform order infinity reduction - // (max/min) on a tensor with zero elements if the dimensions to reduce are - // nonzero. Otherwise, throw an error. - if (ord == INFINITY) { - bool has_identity = true; - - if (dim.size() == 0) { - has_identity = false; - } else { - for (int64_t dim_num : dim) { - if (self.size(dim_num) == 0) { - has_identity = false; - break; - } - } - } - TORCH_CHECK(has_identity, - "linalg.vector_norm cannot compute the infinity norm on an empty ", - "dimension because the operation does not have an identity"); - } - } - Tensor self_; - if (self.device().type() == c10::kCPU && isComplexType(self.scalar_type()) && std::abs(ord) == INFINITY) { - // TODO: This at::abs() call is used so that the at::abs() call in the - // backward function produces an identical result for complex inputs. - // However, it would be ideal if we could incorporate this into - // linalg_vector_norm_stub. 
See issue: - // https://github.com/pytorch/pytorch/issues/52648 - self_ = self.to(in_dtype).abs(); - in_dtype = toValueType(in_dtype); - } else { - self_ = self; - } - ScalarType out_dtype = opt_dtype.value_or(toValueType(self.scalar_type())); - TORCH_CHECK(!result.defined() || out_dtype == result.scalar_type(), - "linalg.vector_norm expected out tensor dtype ", out_dtype, - " but got: ", result.scalar_type()); - // omit in_dtype in the following call, to avoid make_reduction explicitly casting input to out_dtype - auto iter = isComplexType(self.scalar_type()) ? - make_reduction("vector_norm", result, self_, dim, keepdim, in_dtype, out_dtype) : - make_reduction("vector_norm", result, self_, dim, keepdim, out_dtype); - - linalg_vector_norm_stub(iter.device_type(), iter, ord); - return result; -} - -Tensor linalg_vector_norm(const Tensor& self, const Scalar& ord, optional opt_dim, bool keepdim, optional opt_dtype) { - ScalarType out_dtype = opt_dtype.value_or(toValueType(self.scalar_type())); - Tensor result = create_reduction_result(self, opt_dim.value_or(IntArrayRef{}), keepdim, out_dtype); - return at::native::linalg_vector_norm_impl(self, ord, opt_dim, keepdim, opt_dtype, result); -} +//////////////////////////////////////////////////////////////////////////////// +// linalg.cond // +//////////////////////////////////////////////////////////////////////////////// -Tensor& linalg_vector_norm_out(const Tensor& self, const Scalar& ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - return at::native::linalg_vector_norm_impl(self, ord, opt_dim, keepdim, opt_dtype, result); -} - -namespace { - -// Only performs checks not performed by linalg.norm -void check_linalg_matrix_norm_args( - const Tensor& self, - IntArrayRef dim, - optional dtype) { - TORCH_CHECK( - self.ndimension() >= 2, - "linalg.matrix_norm(): input tensor must be a matrix or batch of matrices"); - ScalarType in_dtype = dtype.value_or(self.scalar_type()); - TORCH_CHECK( - in_dtype == kFloat || in_dtype == kDouble || in_dtype == kComplexFloat || - in_dtype == kComplexDouble, - "linalg.matrix_norm(): only supports the float, double, cfloat and cdouble dtypes, but got: ", - toString(in_dtype)); - TORCH_CHECK( - dim.size() == 2, "linalg.matrix_norm(): dim must be a 2-tuple of ints"); -} - -} // namespace - -Tensor linalg_matrix_norm( - const Tensor& self, - const Scalar& ord, - IntArrayRef dim, - bool keepdim, - optional dtype) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm(self, ord, dim, keepdim, dtype); -} - -Tensor& linalg_matrix_norm_out( - const Tensor& self, - const Scalar& ord, - IntArrayRef dim, - bool keepdim, - optional dtype, - Tensor& result) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm_out(self, ord, dim, keepdim, dtype, result); -} - -Tensor linalg_matrix_norm( - const Tensor& self, - c10::string_view ord, - IntArrayRef dim, - bool keepdim, - optional dtype) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm(self, ord, dim, keepdim, dtype); -} - -Tensor& linalg_matrix_norm_out( - const Tensor& self, - c10::string_view ord, - IntArrayRef dim, - bool keepdim, - optional dtype, - Tensor& result) { - check_linalg_matrix_norm_args(self, dim, dtype); - return at::native::linalg_norm_out(self, ord, dim, keepdim, dtype, result); -} - -// Numerical or None norms -Tensor linalg_norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { - auto 
options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); - Tensor result = at::empty({0}, options); - return at::native::linalg_norm_out( - self, opt_ord, opt_dim, keepdim, opt_dtype, result); -} - -// Frobenius and nuclear norms -Tensor linalg_norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { - auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); - Tensor result = at::empty({0}, options); - return at::native::linalg_norm_out( - self, ord, opt_dim, keepdim, opt_dtype, result); -} - -// Numerical or None norms -Tensor& linalg_norm_out(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - return linalg_norm_out_impl(result, self, opt_ord, c10::nullopt, opt_dim, keepdim, opt_dtype); -} - -// Frobenius and nuclear norms -Tensor& linalg_norm_out(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype, Tensor& result) { - return linalg_norm_out_impl(result, self, c10::nullopt, ord, opt_dim, keepdim, opt_dtype); -} // This function helps to dispatch norm computations depending on 'ord' of variant type Tensor _linalg_cond_helper(const Tensor& self, c10::variant ord_variant) { @@ -2694,7 +2732,7 @@ Tensor _linalg_cond_helper(const Tensor& self, c10::variant& opt_ord) { // NumPy doesn't define the condition number for 0x0 matrices, we return 0.0 for such input if (self.numel() == 0) { - auto real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + auto real_dtype = toRealValueType(typeMetaToScalarType(self.dtype())); return _linalg_cond_empty_matrix(self, real_dtype); } @@ -2757,7 +2795,7 @@ Tensor linalg_cond(const Tensor& self, const optional& opt_ord) { Tensor& linalg_cond_out(const Tensor& self, const optional& opt_ord, Tensor& result) { checkSameDevice("linalg.cond", result, self); - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("linalg.cond", result.scalar_type(), real_dtype); Tensor result_tmp = at::linalg_cond(self, opt_ord); @@ -2791,7 +2829,7 @@ Tensor linalg_cond(const Tensor& self, c10::string_view ord) { // TODO: implement _out variant avoiding copy and using already allocated storage directly Tensor& linalg_cond_out(const Tensor& self, c10::string_view ord, Tensor& result) { checkSameDevice("linalg.cond", result, self); - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("linalg.cond", result.scalar_type(), real_dtype); Tensor result_tmp = at::linalg_cond(self, ord); @@ -2849,7 +2887,7 @@ Tensor& linalg_tensorinv_out(const Tensor& self, int64_t ind, Tensor& result) { return result; } -Tensor linalg_tensorsolve(const Tensor& self, const Tensor& other, optional dims) { +Tensor linalg_tensorsolve(const Tensor& self, const Tensor& other, OptionalIntArrayRef dims) { /* The idea is to reduce the problem to 2D matrix solve. Step 1. (optional) `self` is permuted with `dims` such that dimensions from `dims` are moved to the right. 
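The reduction described in the comment above can be sketched against the public ATen API roughly as follows. This is an illustrative sketch only, not the code being modified in this hunk: the helper name `tensorsolve_reference` is made up, error checking is omitted, and the optional `dims` permutation (Step 1) is assumed to have already been applied, e.g. with `at::movedim`, before the reshape.

#include <ATen/ATen.h>

// Rough sketch of the tensorsolve reduction: flatten the leading b-shaped
// dimensions of `a` and its trailing x-shaped dimensions into one axis each,
// solve the resulting square 2-D system, then restore the x shape.
at::Tensor tensorsolve_reference(const at::Tensor& a, const at::Tensor& b) {
  const int64_t n = b.numel();                // prod(b.shape) == prod(x.shape)
  at::Tensor a2d = a.reshape({n, -1});        // (prod(b.shape), prod(x.shape))
  at::Tensor b1d = b.reshape({n});
  at::Tensor x = at::linalg_solve(a2d, b1d);  // ordinary 2-D matrix solve
  return x.reshape(a.sizes().slice(b.dim()).vec());  // trailing dims of `a`
}

The signature change in this hunk (`OptionalIntArrayRef dims`) only affects how the optional `dims` argument is spelled; the reduction itself is untouched.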
@@ -2887,7 +2925,7 @@ Tensor linalg_tensorsolve(const Tensor& self, const Tensor& other, optional dims, Tensor& result) { +Tensor& linalg_tensorsolve_out(const Tensor& self, const Tensor& other, OptionalIntArrayRef dims, Tensor& result) { checkSameDevice("tensorsolve", result, self); checkLinalgCompatibleDtype("tensorsolve", result, self); @@ -2946,142 +2984,6 @@ struct KronImpl final { }; } -DEFINE_DISPATCH(unpack_pivots_stub); - -std::tuple lu_unpack( - const Tensor& LU_data, - const Tensor& LU_pivots, - bool unpack_data, - bool unpack_pivots - ) { - TORCH_CHECK(LU_pivots.is_contiguous() && (LU_pivots.scalar_type() == at::kInt), - "lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype." - "Note: this function is intended to be used with the output produced by torch{.linalg}.lu"); - - // trivial case - if (!unpack_data && !unpack_pivots) { - return std::make_tuple(Tensor(), Tensor(), Tensor()); - } - - Tensor L, U; - // In the generalized LU factorization, the following shape relations hold: - // A.shape[-2:] == (m, n), - // P.shape[-2:] == (m, m), - // U.shape[-2:] == (m, k), - // L.shape[-2:] == (k, n), - // where k = min(m, n) - int64_t m = LU_data.size(-2); - int64_t n = LU_data.size(-1); - int64_t k = std::min(m, n); - - if (unpack_data) { - U = LU_data.triu(); - if (m != k) { - U = U.narrow(-2, 0, k); - } - - L = LU_data.tril(); - if (k != n) { - L = L.narrow(-1, 0, k); - } - L.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1); - } - - if (!unpack_pivots) { - return std::make_tuple(Tensor(), L, U); - } - - auto unpacked_pivots_sizes = LU_pivots.sizes().vec(); - unpacked_pivots_sizes[LU_pivots.dim() - 1] = m; - auto unpacked_pivots = at::empty( - unpacked_pivots_sizes, - LU_pivots.options().memory_format(at::MemoryFormat::Contiguous) - ); - - // Fill `unpacked_pivots` with identity permutation - auto id_perm = at::arange(m, LU_pivots.options()); - unpacked_pivots.copy_(id_perm); - - // WARNING: we assume that unchanged LAPACK pivots are provided. - // Since LAPACK relies on the FORTRAN's 1-based indexing, - // we subtract 1 to convert the pivots to the C-style 0-based indexing. - // This behaviour could change in the future. - auto LU_pivots_zero_idx = LU_pivots - 1; - - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(false) - .check_all_same_dtype(false) - .resize_outputs(false) - .declare_static_shape(LU_pivots.sizes(), /*squash_dim=*/LU_pivots.dim() - 1) - .add_output(unpacked_pivots) - .add_input(LU_pivots_zero_idx) - .build(); - // } - - unpack_pivots_stub( - LU_pivots.device().type(), - iter, - LU_pivots.size(-1) - ); - - // The permutation matrix is converted to LU_data.dtype - // because `matmul` does not work with integer matrices. - unpacked_pivots_sizes.push_back(m); - auto permutation_matrix = at::zeros( - unpacked_pivots_sizes, - LU_data.options().memory_format(at::MemoryFormat::Contiguous) - ); - - // now that we know the final permutation, - // scatter 1s at proper locations. 
- permutation_matrix.scatter_( - -2, - unpacked_pivots.unsqueeze(-2).to(at::kLong), - at::ones({1}, permutation_matrix.options()).expand(permutation_matrix.sizes()) - ); - - return std::make_tuple(permutation_matrix, L, U); -} - -using TupleTensorRefs3 = std::tuple; - -TupleTensorRefs3 lu_unpack_out( - const Tensor& LU_data, - const Tensor& LU_pivots, - bool unpack_data, - bool unpack_pivots, - Tensor& P, - Tensor& L, - Tensor& U - ) { - Tensor P_tmp, L_tmp, U_tmp; - std::tie(P_tmp, L_tmp, U_tmp) = at::lu_unpack(LU_data, LU_pivots, unpack_data, unpack_pivots); - - if (unpack_pivots) { - checkSameDevice("lu_unpack", P, LU_data, "P"); - // Note that lu_unpack returns P such that P.dtype == LU_data.dtype, - // because otherwise we cannot use P in matric products (no int -> float promotion) - checkLinalgCompatibleDtype("lu_unpack", P, LU_data, "L"); - - at::native::resize_output(P, P_tmp.sizes()); - P.copy_(P_tmp); - } - - if (unpack_data) { - checkSameDevice("lu_unpack", L, LU_data, "L"); - checkSameDevice("lu_unpack", U, LU_data, "U"); - checkLinalgCompatibleDtype("lu_unpack", L, LU_data, "L"); - checkLinalgCompatibleDtype("lu_unpack", U, LU_data, "U"); - - at::native::resize_output(L, L_tmp.sizes()); - at::native::resize_output(U, U_tmp.sizes()); - L.copy_(L_tmp); - U.copy_(U_tmp); - } - - return TupleTensorRefs3(P, L, U); -} - /* Calculates the Kronecker product between two Tensors. */ diff --git a/aten/src/ATen/native/LinearAlgebra.h b/aten/src/ATen/native/LinearAlgebra.h index 050fe7dedc7b..304fbb8e6847 100644 --- a/aten/src/ATen/native/LinearAlgebra.h +++ b/aten/src/ATen/native/LinearAlgebra.h @@ -15,15 +15,4 @@ namespace at { namespace native { using addr_fn = void (*)(TensorIterator &, const Scalar& beta, const Scalar& alpha); DECLARE_DISPATCH(addr_fn, addr_stub); - -using linalg_vector_norm_fn = void(*)(TensorIterator &, Scalar); -DECLARE_DISPATCH(linalg_vector_norm_fn, linalg_vector_norm_stub); - -using unpack_pivots_fn = void(*)( - TensorIterator& iter, - int64_t dim_size -); -DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub); - - }} // namespace at::native diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 555cbb001ef2..9301d090080a 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -25,23 +26,6 @@ namespace at { namespace native { -// Used as an interface between the different BLAS-like libraries -enum class TransposeType { - NoTranspose, - Transpose, - ConjTranspose, -}; - -// Transforms TransposeType into the BLAS / LAPACK format -static char to_blas(TransposeType trans) { - switch (trans) { - case TransposeType::Transpose: return 'T'; - case TransposeType::NoTranspose: return 'N'; - case TransposeType::ConjTranspose: return 'C'; - } - TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); -} - static inline c10::MaybeOwned expect_resolved_conj(const Tensor& tensor) { if (tensor.is_conj()) { return c10::MaybeOwned::owned(tensor.resolve_conj()); @@ -50,46 +34,23 @@ static inline c10::MaybeOwned expect_resolved_conj(const Tensor& tensor) } } -template -static inline Vec contiguous_strides_template(const IntArrayRef sizes, const bool f_contig=false) { - static_assert(std::is_same::value, - "Incompatible integral type of sizes and strides"); - // f_contig chooses between the strides of a batch of Fortran (F-contiguous) and C-contiguous matrices - using Int = IntArrayRef::value_type; - 
constexpr auto one = Int{1}; - const auto n = sizes.size(); - if (n == 0) { - return Vec{}; - } else if (n == 1) { - // Use initializer-list to initialize the vector - return Vec{one}; - } - // Now we have a matrix or batch of matrices - auto strides = Vec(n); - const auto last_idx = n - 1; - const auto snd_last_idx = n - 2; - // We'll fill the first two strides afterwards, otherwise the first step - // in the for loop is wrong - strides[snd_last_idx] = std::max(sizes[last_idx], one); - for (int i = snd_last_idx - 1; i >= 0; --i) { - strides[i] = strides[i + 1] * std::max(sizes[i + 1], one); - } - strides[last_idx] = f_contig ? std::max(sizes[snd_last_idx], one) : one; - if (f_contig) { - // We filled the wrong stride before so we correct it - strides[snd_last_idx] = one; +static inline DimVector batched_matrix_contiguous_strides( + const IntArrayRef sizes, + const bool f_contig = false) { + // f_contig chooses between the strides of a batch of Fortran (F-contiguous) + // and C-contiguous matrices + auto strides = c10::contiguous_strides(sizes); + auto dim = strides.size(); + + if (f_contig && dim >= 2) { + // Fix the strides of the last two dimensions, so that we return + // C-contiguous batches of F-contiguous matrices. + strides[dim - 1] = std::max(sizes[dim - 2], static_cast(1)); + strides[dim - 2] = 1; } return strides; } -static inline DimVector contiguous_strides(const IntArrayRef sizes, const bool f_contig=false) { - return contiguous_strides_template(sizes, f_contig); -} - -static inline std::vector contiguous_strides_vec(const IntArrayRef sizes, const bool f_contig=false) { - return contiguous_strides_template>(sizes, f_contig); -} - /* * Clones a Tensor so that the following conditions hold: * If we think of a Tensor of having size (B, M, N), where B is any number @@ -131,13 +92,13 @@ static inline c10::MaybeOwned borrow_else_clone(const bool cond, const T * broadcasted shape. */ static inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1, - c10::optional desired_batch_sizes = c10::nullopt) { + at::OptionalIntArrayRef desired_batch_sizes = c10::nullopt) { nrows = (nrows == -1) ? src.size(-2) : nrows; auto copy_sizes = desired_batch_sizes.has_value() ? 
desired_batch_sizes.value().vec() : IntArrayRef(src.sizes().data(), src.dim() - 2).vec(); copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)}); - const auto copy_strides = contiguous_strides(copy_sizes, /*f-contig*/true); + const auto copy_strides = batched_matrix_contiguous_strides(copy_sizes, /*f-contig*/true); auto copy = at::empty_strided(copy_sizes, copy_strides, src.options()); copy.narrow(-2, 0, src.size(-2)).copy_(src); return copy; @@ -213,7 +174,7 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes); Tensor a_buffer, a_was_accessed, a_buffer_3d; std::function check_if_copy_needed_for_a - = [](int64_t a_curr_linear_batch_idx){}; + = [](int64_t /*a_curr_linear_batch_idx*/){}; if (a_broadcasts_over_b) { a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options()) .copy_(a); @@ -467,14 +428,14 @@ static inline std::tuple _parse_qr_mode(c10::string_view mode) { } // Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition -static inline std::tuple, - std::vector, - int64_t> _compute_geometry_for_Q(const Tensor& input, bool reduced) { +static inline std::tuple _compute_geometry_for_Q( + const Tensor& input, + bool reduced) { int64_t m = input.size(-2), n = input.size(-1); int64_t n_columns_q; // We need to compute the required size of Q based on the `reduced` option - auto q_sizes = input.sizes().vec(); + DimVector q_sizes(input.sizes()); if (!reduced && m > n) { q_sizes[input.dim() - 1] = m; n_columns_q = m; @@ -482,7 +443,7 @@ static inline std::tuple, q_sizes[input.dim() - 1] = n; n_columns_q = std::min(m, n); } - auto q_strides = contiguous_strides_vec(q_sizes, /*f-contig*/true); + auto q_strides = batched_matrix_contiguous_strides(q_sizes, /*f-contig*/true); return std::make_tuple(q_sizes, q_strides, n_columns_q); } @@ -623,11 +584,49 @@ static inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& return vector_case; } +/* + Computes linear indices for a tensor with original_shape to access its elements like it was a materialized broadcast tensor. +*/ +static inline Tensor get_linear_indices(int64_t numel, IntArrayRef original_shape, IntArrayRef broadcast_shape) { + TensorOptions options = at::TensorOptions().dtype(at::kLong).device(at::kCPU); + return at::arange(numel, options).view(original_shape).broadcast_to(broadcast_shape).contiguous(); +} + +class BroadcastLinearIndices { + private: + Tensor linear_indices_; + bool is_broadcasting_; + + public: + BroadcastLinearIndices( + int64_t numel, + IntArrayRef original_shape, + IntArrayRef broadcast_shape) { + // The assumption is that the broadcast_shape is a materialized broadcast + // shape of the original_shape. We need to compute the linear indices + // compatible with the original_shape to access the elements in the original + // tensor corresponding to the broadcast tensor. + is_broadcasting_ = !original_shape.equals(broadcast_shape); + if (is_broadcasting_) { + linear_indices_ = + get_linear_indices(numel, original_shape, broadcast_shape); + } + } + int64_t operator()(int64_t broadcast_linear_index) { + return is_broadcasting_ + ? 
linear_indices_.data_ptr()[broadcast_linear_index] + : broadcast_linear_index; + } +}; + static inline bool is_blas_compatible_column_major_order(const Tensor& input) { IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim == 2 || ndim == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.transpose(-2, -1).is_contiguous(); + } auto leading_dimension = input_strides[ndim - 1]; auto rows = input_sizes[ndim - 2]; bool batch_stride_compatible = true; @@ -645,7 +644,10 @@ static inline bool is_blas_compatible_row_major_order(const Tensor& input) { IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); auto ndim = input.dim(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim == 2 || ndim == 3); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.is_contiguous(); + } auto leading_dimension = input_strides[ndim - 2]; auto cols = input_sizes[ndim - 1]; bool batch_stride_compatible = true; diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 1812e61febce..5358b83bdf22 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -8,6 +8,7 @@ #include #include #include +#include constexpr float EPSILON = 1e-12; @@ -39,6 +40,17 @@ TORCH_META_FUNC(smooth_l1_loss) maybe_get_output().resize_({}); } +TORCH_META_FUNC(mse_loss) +(const Tensor& input, const Tensor& target, const int64_t reduction) { + build_borrowing_binary_op(maybe_get_output(), input, target); + if (reduction == Reduction::None) { + return; + } + + TORCH_INTERNAL_ASSERT(reduction == Reduction::Mean || reduction == Reduction::Sum); + maybe_get_output().resize_({}); +} + } // namespace meta namespace native { @@ -70,6 +82,22 @@ TORCH_IMPL_FUNC(smooth_l1_loss_out) } } +TORCH_IMPL_FUNC(mse_loss_out) +(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& result) { + if (reduction != Reduction::None) { + Tensor loss; + auto iter = TensorIterator::borrowing_binary_op(loss, input, target); + mse_stub(iter.device_type(), iter); + if (reduction == Reduction::Mean) { + at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); + } else { + at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); + } + } else { + mse_stub(device_type(), *this); + } +} + Tensor cosine_embedding_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) { auto targ_dim = target.dim(); TORCH_CHECK( @@ -310,30 +338,47 @@ Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& targe return apply_loss_reduction(loss, reduction); } -Tensor binary_cross_entropy_with_logits_backward(const Tensor& grad, const Tensor& input, const Tensor& target, const c10::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction) { +Tensor binary_cross_entropy_with_logits_backward( + const Tensor& grad, + const Tensor& input, + const Tensor& target, + const c10::optional& weight_opt, + const c10::optional& pos_weight_opt, + int64_t reduction) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + c10::MaybeOwned weight_maybe_owned = + at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& pos_weight = c10::value_or_else(pos_weight_opt, [] {return Tensor();}); - - Tensor grad_input; - if (pos_weight.defined()) { - // 
pos_weight need to be broadcasted, thus mul(target) is not inplace. - auto t = pos_weight.mul(target); - grad_input = t.add(1).sub_(target).mul_(input.sigmoid()).sub_(t).mul_(grad); + const Tensor& pos_weight = + c10::value_or_else(pos_weight_opt, [] { return Tensor(); }); + + Tensor grad_input; + auto hasSubclassTensors = at::areAnyTensorSubclassLike({grad, input, target}); + + // If there are subclassed tensors use the out of place version + if (pos_weight.defined()) { + // pos_weight might need to be broadcasted, thus mul(target) is not inplace. + auto t = pos_weight.mul(target); + grad_input = hasSubclassTensors + ? t.add(1).sub(target).mul(input.sigmoid()).sub(t).mul(grad) + : t.add(1).sub_(target).mul_(input.sigmoid()).sub_(t).mul_(grad); + } else { + grad_input = hasSubclassTensors ? (input.sigmoid() - target).mul(grad) + : (input.sigmoid() - target).mul_(grad); + } + if (weight.defined()) { + if (at::areAnyTensorSubclassLike({grad_input, weight})) { + grad_input = grad_input.mul(weight); } else { - grad_input = (input.sigmoid() - target).mul_(grad); - } - - if (weight.defined()) { - grad_input.mul_(weight); + grad_input.mul_(weight); } + } - if (reduction == at::Reduction::Mean) { - return grad_input / input.numel(); - } + if (reduction == at::Reduction::Mean) { + return grad_input / input.numel(); + } - return grad_input; + return grad_input; } Tensor poisson_nll_loss(const Tensor& input, const Tensor& target, const bool log_input, const bool full, const double eps, const int64_t reduction) @@ -454,30 +499,6 @@ Tensor& huber_loss_backward_out(const Tensor& grad_output, const Tensor& input, return grad_input; } -Tensor mse_loss(const Tensor& input, const Tensor& target, int64_t reduction) { - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - mse_stub(iter.device_type(), iter); - return apply_loss_reduction(iter.output(), reduction); -} - -Tensor& mse_loss_out(const Tensor& input, const Tensor& target, int64_t reduction, Tensor&result) { - if (reduction != Reduction::None) { - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - mse_stub(iter.device_type(), iter); - if (reduction == Reduction::Mean) { - at::mean_out(result, iter.output(), 0); - } else { - at::sum_out(result, iter.output(), 0); - } - } else { - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - mse_stub(iter.device_type(), iter); - } - return result; -} - Tensor mse_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) { Tensor grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::mse_loss_backward_out(grad_input, grad_output, input, target, reduction); @@ -497,7 +518,7 @@ Tensor& mse_loss_backward_out(const Tensor& grad_output, } Tensor l1_loss(const Tensor& input, const Tensor& target, int64_t reduction) { - const auto float_type = c10::toValueType(input.scalar_type()); + const auto float_type = c10::toRealValueType(input.scalar_type()); Tensor result = at::empty({0}, input.options().dtype(float_type)); return at::l1_loss_out(result, input, target, reduction); } diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index ed733411ff53..212f28bca23e 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -463,7 +463,8 @@ Tensor cross_entropy_loss_prob_target( const Tensor& weight, int64_t reduction, double label_smoothing) { - const auto n_classes = self.size(1); + const auto class_dim = 
self.dim() == 1 ? 0 : 1; + const auto n_classes = self.size(class_dim); TORCH_CHECK( !weight.defined() || (weight.dim() == 1 && weight.numel() == n_classes), "cross_entropy: weight tensor should be defined either for all ", @@ -472,7 +473,7 @@ Tensor cross_entropy_loss_prob_target( " but got weight tensor of shape: ", weight.sizes()); - auto input = at::log_softmax(self, 1, self.scalar_type()); + auto input = at::log_softmax(self, class_dim, self.scalar_type()); Tensor target; if (label_smoothing > 0.0) { @@ -484,29 +485,40 @@ Tensor cross_entropy_loss_prob_target( if (weight.defined()) { // Expand weight to the correct number of dims for broadcasting with input / target - auto weight_broadcast_shape = SmallBuffer(input.dim()); - std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1); - weight_broadcast_shape[1] = weight.size(0); - Tensor weight_ = weight.view(weight_broadcast_shape); + Tensor weight_ = weight; + if (input.dim() > 1) { + auto weight_broadcast_shape = SmallBuffer(input.dim()); + std::fill(weight_broadcast_shape.begin(), weight_broadcast_shape.end(), 1); + weight_broadcast_shape[1] = weight.size(0); + weight_ = weight.view(weight_broadcast_shape); + } switch (reduction) { case Reduction::Mean: - return -(input * target * weight_).sum() / (input.numel() / input.size(1)); + if (input.numel()==0){ + return -(input * target * weight_).sum().fill_(std::numeric_limits::quiet_NaN()); + } else { + return -(input * target * weight_).sum() / (input.numel() / n_classes); + } case Reduction::Sum: return -(input * target * weight_).sum(); case Reduction::None: - return -(input * target * weight_).sum(1); + return -(input * target * weight_).sum(class_dim); default: TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction); } } else { switch (reduction) { case Reduction::Mean: - return -(input * target).sum() / (input.numel() / input.size(1)); + if (input.numel()==0){ + return -(input * target).sum().fill_(std::numeric_limits::quiet_NaN()); + } else { + return -(input * target).sum() / (input.numel() / n_classes); + } case Reduction::Sum: return -(input * target).sum(); case Reduction::None: - return -(input * target).sum(1); + return -(input * target).sum(class_dim); default: TORCH_CHECK(false, "Invalid reduction type encountered in cross_entropy: ", reduction); } diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 09255e065879..ee10d00f9b5c 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -12,6 +12,7 @@ #include #include #include +#include C10_CLANG_DIAGNOSTIC_PUSH() #if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") @@ -67,6 +68,83 @@ Output was modified to be inf or -inf when input is 1 or -1. */ POSSIBILITY OF SUCH DAMAGE. */ +namespace { +/* + * This function is derived from the implementation of the i0e function in the + * Cephes Math Library. See note [3-Clause BSD License for the Cephes Math + * Library]. + * + * Computes an approximation of the exponentially scaled zeroth order modified + * Bessel function of the first kind. The approximation is actually two + * (sub)approximations, both using a Chebyshev polynomial expansion. One + * approximates the function over [0, 8], and the other over (8, infinity). This + * function takes the absolute value of all inputs to convert them into the + * domain of the approximation. 
+ */ +jiterator_also_stringify_as(jiterator_code( + template + JITERATOR_HOST_DEVICE T chbevl(T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + JITERATOR_HOST_DEVICE T calc_i0e(T _x) { + T x = fabs(_x); + + if (x <= T{8.0}) { + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return chbevl(y, coefficients, int{30}); + } + + // x > 8 + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return chbevl(T{32.0} / x - T{2.0}, coefficients, int{25}) / sqrt(x); + }), + i0e_string); // i0e_string +} + #define CENTRAL_RANGE 0.7 template @@ -1385,37 +1463,6 @@ calc_i0(T _x) { // Upcast bfloat16 input to float for numerical accuracy purposes static inline c10::BFloat16 calc_i0(c10::BFloat16 a) { return calc_i0(static_cast(a)); } -/* - * This function is derived from the implementation of the i0e function in the Cephes Math Library. - * See note [3-Clause BSD License for the Cephes Math Library]. - * - * Computes an approximation of the exponentially scaled zeroth order modified Bessel function of the first kind. - * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion. - * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value - * of all inputs to convert them into the domain of the approximation. 
- */ -template -static inline typename std::enable_if::value, T>::type -calc_i0e(T _x) { - T x = std::abs(_x); - - if (x <= T{8.0}) { - auto coeff_pair = chebyshev_coefficients_i0e_A(); - auto A = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - T y = (x / T{2.0}) - T{2.0}; - return chbevl(y, A, len); - } - - auto coeff_pair = chebyshev_coefficients_i0e_B(); - auto B = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - return chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x); -} - -// Upcast bfloat16 input to float for numerical accuracy purposes -static inline c10::BFloat16 calc_i0e(c10::BFloat16 a) { return calc_i0e(static_cast(a)); } - /* * This function is derived from the implementation of the i1 function in the Cephes Math Library. * See note [3-Clause BSD License for the Cephes Math Library]. @@ -2113,4 +2160,21 @@ calc_erfcx(T x) } } +/* + * Logarithm of Gaussian cumulative distribution function. + + * This implementation of log_ndtr and its helper functions + * follow SciPy's implementation + * See NOTICE for the licenses. + */ +template +static inline C10_HOST_DEVICE T calc_log_ndtr(T x) { + T t = x * M_SQRT1_2; + if (x < T{-1.0}) { + return std::log(calc_erfcx(-t) / 2) - t * t; + } else { + return std::log1p(-std::erfc(t) / 2); + } +} + C10_CLANG_DIAGNOSTIC_POP() diff --git a/aten/src/ATen/native/MaxPooling.h b/aten/src/ATen/native/MaxPooling.h index c429c8e667bc..e133ad5939c8 100644 --- a/aten/src/ATen/native/MaxPooling.h +++ b/aten/src/ATen/native/MaxPooling.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index 6d395d9078c6..27d4e1a93c81 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -185,132 +185,8 @@ Tensor max_unpooling3d_forward_cpu( return output; } -Tensor& max_unpooling2d_backward_out_cpu(const Tensor& grad_output_, - const Tensor& self, - const Tensor& indices_, - IntArrayRef output_size, - Tensor& grad_input) { - TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); - int64_t oheight = output_size[0]; - int64_t owidth = output_size[1]; - int64_t ndim = self.ndimension(); - int64_t dimh = ndim == 3 ? 1 : 2; - int64_t dimw = ndim == 3 ? 2 : 3; - - TORCH_CHECK( - indices_.scalar_type() == at::ScalarType::Long, - "elements in indices should be type int64 but got type: ", indices_.scalar_type()); - TORCH_CHECK( - self.sizes() == indices_.sizes(), - "Expected shape of indices to be same as that of the input tensor (", - self.sizes(), ") but got indices tensor with shape: ", indices_.sizes()); - TORCH_CHECK(output_size.size() == 2, "Output size must be 2 but got: ", output_size.size()); - - auto memory_format = self.suggest_memory_format(); - auto grad_output = grad_output_.contiguous(memory_format); - auto indices = indices_.contiguous(memory_format); - - grad_input.resize_(self.sizes(), memory_format); - grad_input.zero_(); - - if (owidth != grad_output.size(dimw) || oheight != grad_output.size(dimh)) { - AT_ERROR( - "Inconsistent gradOutput size. 
output height = ", - oheight, - ", output width = ", - owidth, - ", gradOutput: ", - grad_output.size(dimh), - "x", - grad_output.size(dimw)); - } - - if (grad_input.numel() != 0) { - max_unpool2d_backward_kernel(kCPU, grad_input, grad_output, indices); - } - - return grad_input; -} - -Tensor max_unpooling2d_backward_cpu( - const Tensor& grad_output, - const Tensor& self, - const Tensor& indices, - IntArrayRef output_size) { - auto grad_input = at::empty({0}, self.options()); - max_unpooling2d_backward_out_cpu( - grad_output, self, indices, output_size, grad_input); - return grad_input; -} - -Tensor& max_unpooling3d_backward_out_cpu( - const Tensor& grad_output_, - const Tensor& self, - const Tensor& indices_, - IntArrayRef output_size, - IntArrayRef stride, - IntArrayRef padding, - Tensor& grad_input) { - TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); - int64_t oT = output_size[0]; - int64_t oH = output_size[1]; - int64_t oW = output_size[2]; - int64_t ndim = self.ndimension(); - int64_t dimt = ndim == 4 ? 1 : 2; - int64_t dimh = ndim == 4 ? 2 : 3; - int64_t dimw = ndim == 4 ? 3 : 4; - - max_unpooling3d_shape_check( - self, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cpu()"); - - /* get contiguous gradOutput */ - auto grad_output = grad_output_.contiguous(); - auto indices = indices_.contiguous(); - - /* resize */ - grad_input.resize_as_(self); - grad_input.zero_(); - - if (oW != grad_output.size(dimw) || oH != grad_output.size(dimh) || oT != grad_output.size(dimt)) { - AT_ERROR( - "Inconsistent gradOutput size. output depth = ", - oT, - ", output height = ", - oH, - ", output width = ", - oW, - ", gradOutput: ", - grad_output.size(dimt), - "x", - grad_output.size(dimh), - "x", - grad_output.size(dimw)); - } - - if (grad_input.numel() != 0) { - max_unpool3d_backward_kernel(kCPU, grad_input, grad_output, indices); - } - - return grad_input; -} - -Tensor max_unpooling3d_backward_cpu( - const Tensor& grad_output, - const Tensor& self, - const Tensor& indices, - IntArrayRef output_size, - IntArrayRef stride, - IntArrayRef padding) { - auto grad_input = at::empty({0}, self.options()); - at::native::max_unpooling3d_backward_out_cpu( - grad_output, self, indices, output_size, stride, padding, grad_input); - return grad_input; -} - DEFINE_DISPATCH(max_unpool2d_kernel); -DEFINE_DISPATCH(max_unpool2d_backward_kernel); DEFINE_DISPATCH(max_unpool3d_kernel); -DEFINE_DISPATCH(max_unpool3d_backward_kernel); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/NaiveDilatedConvolution.cpp b/aten/src/ATen/native/NaiveDilatedConvolution.cpp index 68eaa372b7ee..fa7b30f5977e 100644 --- a/aten/src/ATen/native/NaiveDilatedConvolution.cpp +++ b/aten/src/ATen/native/NaiveDilatedConvolution.cpp @@ -24,7 +24,8 @@ void hvol2col( const IntArrayRef stride_size, const IntArrayRef pad_size, const IntArrayRef dilation_size, - Dtype* data_col) { + Dtype* data_col, + bool is_channels_last = false) { if (dim == 3) { vol2col( data_hvol, @@ -65,7 +66,8 @@ void hvol2col( stride_size[1], dilation_size[0], dilation_size[1], - data_col); + data_col, + is_channels_last); } } @@ -80,7 +82,8 @@ void col2hvol( const IntArrayRef stride_size, const IntArrayRef pad_size, const IntArrayRef dilation_size, - Dtype* data_hvol) { + Dtype* data_hvol, + bool is_channels_last = false) { if (dim == 3) { col2vol( data_col, @@ -121,7 +124,8 @@ void col2hvol( stride_size[1], dilation_size[0], dilation_size[1], - data_hvol); + data_hvol, + is_channels_last); } } 
@@ -167,7 +171,8 @@ void slow_conv_dilated_all_cpu_template( IntArrayRef kernel_size, IntArrayRef stride_size, IntArrayRef pad_size, - IntArrayRef dilation_size) { + IntArrayRef dilation_size, + bool is_channels_last = false) { slow_conv_dilated_location_check(input, weight, bias, grad_output); auto options = input.options(); // The rear part of input tensor sizes: @@ -183,7 +188,11 @@ void slow_conv_dilated_all_cpu_template( if (output.defined() || grad_weight.defined() || grad_input.defined()) { const int64_t m = c10::multiply_integers(kernel_size); const int64_t n = c10::multiply_integers(output_size); - columns.resize_({nInputPlane * m, n}); + if (is_channels_last) { + columns.resize_({n, m * nInputPlane}); + } else { + columns.resize_({nInputPlane * m, n}); + } } // Initialize if (grad_weight.defined()) { @@ -200,7 +209,8 @@ void slow_conv_dilated_all_cpu_template( std::vector dims(dim); std::iota(dims.begin(), dims.end(), 1); - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "slow_conv_dilated<>", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Long, at::ScalarType::BFloat16, input.scalar_type(), "slow_conv_dilated<>", [&] { // For each elt in batch, do: for (const auto elt : c10::irange(batchSize)) { // Matrix multiply per output: @@ -246,7 +256,8 @@ void slow_conv_dilated_all_cpu_template( stride_size, pad_size, dilation_size, - columns.data_ptr()); + columns.data_ptr(), + is_channels_last); /* Compute: @@ -265,25 +276,47 @@ void slow_conv_dilated_all_cpu_template( gemm assumes column-major matrices: + channels last: + output_n^T = weight *columns^T + output_n^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 't', op(B) = 'n', alpha=1, beta=1 + + channels first: output_n^T = columns^T * weight^T + output_n^T C = alpha * op(A) * op(B) + beta * C op(A) = 'n', op(B) = 'n', alpha=1, beta=1 */ - cpublas::gemm( - /*transa=*/TransposeType::NoTranspose, - /*transb=*/TransposeType::NoTranspose, - /* m=*/columns.size(1), - /* n=*/nOutputPlane, - /* k=*/columns.size(0), - /* alpha=*/1, - /* A=*/columns.data_ptr(), - /* lda=*/columns.size(1), - /* B=*/weight.data_ptr(), - /* ldb=*/columns.size(0), - /* beta=*/1, - /* C=*/output_n.data_ptr(), - /* ldc=*/columns.size(1)); - + if (is_channels_last) { + cpublas::gemm( + /*transa=*/TransposeType::Transpose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/nOutputPlane, + /* n=*/columns.size(0), + /* k=*/columns.size(1), + /* alpha=*/static_cast(1), + /* A=*/weight.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* beta=*/static_cast(1), + /* C=*/output_n.data_ptr(), + /* ldc=*/nOutputPlane); + } else { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/columns.size(1), + /* n=*/nOutputPlane, + /* k=*/columns.size(0), + /* alpha=*/static_cast(1), + /* A=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/weight.data_ptr(), + /* ldb=*/columns.size(0), + /* beta=*/static_cast(1), + /* C=*/output_n.data_ptr(), + /* ldc=*/columns.size(1)); + } } else { // All gradients grad_output_n = grad_output.select(0, elt); @@ -309,24 +342,47 @@ void slow_conv_dilated_all_cpu_template( gemm assumes column-major matrices: + channels last: + columns^T = weight^T * grad_output_n^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 'n', op(B) = 'n', alpha=1, beta=0 + + channels first: columns^T = grad_output_n^T * weight C = alpha * op(A) * op(B) + beta * C op(A) = 'n', op(B) = 't', alpha=1, beta=0 */ - cpublas::gemm( - 
/*transa=*/TransposeType::NoTranspose, - /*transb=*/TransposeType::Transpose, - /* m=*/columns.size(1), - /* n=*/columns.size(0), - /* k=*/nOutputPlane, - /* alpha=*/1, - /* A=*/grad_output_n.data_ptr(), - /* lda=*/columns.size(1), - /* B=*/weight.data_ptr(), - /* ldb=*/columns.size(0), - /* beta=*/0, - /* C=*/columns.data_ptr(), - /* ldc=*/columns.size(1)); + if (is_channels_last) { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/columns.size(1), + /* n=*/columns.size(0), + /* k=*/nOutputPlane, + /* alpha=*/static_cast(1), + /* A=*/weight.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/grad_output_n.data_ptr(), + /* ldb=*/nOutputPlane, + /* beta=*/static_cast(0), + /* C=*/columns.data_ptr(), + /* ldc=*/columns.size(1)); + } else { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::Transpose, + /* m=*/columns.size(1), + /* n=*/columns.size(0), + /* k=*/nOutputPlane, + /* alpha=*/static_cast(1), + /* A=*/grad_output_n.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/weight.data_ptr(), + /* ldb=*/columns.size(0), + /* beta=*/static_cast(0), + /* C=*/columns.data_ptr(), + /* ldc=*/columns.size(1)); + } // Unpack columns back into input: Tensor grad_input_n = grad_input.select(0, elt); @@ -339,7 +395,8 @@ void slow_conv_dilated_all_cpu_template( stride_size, pad_size, dilation_size, - grad_input_n.data_ptr()); + grad_input_n.data_ptr(), + is_channels_last); } // Gradient of weight: @@ -354,7 +411,8 @@ void slow_conv_dilated_all_cpu_template( stride_size, pad_size, dilation_size, - columns.data_ptr()); + columns.data_ptr(), + is_channels_last); scalar_t scale = 1; // TODO: expose as argument? /* Compute: @@ -374,24 +432,47 @@ void slow_conv_dilated_all_cpu_template( gemm assumes column-major matrices: - grad_weight^T = scale * columns * grad_output_n^T + - grad_weight^T C = alpha * op(A) * op(B) + beta * C op(A) = 't', - op(B) = 'n', alpha=scale, beta=1 + channels last: + grad_weight^T = scale * columns^T * grad_output_n + grad_weight^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 'n', op(B) = 't', alpha=scale, beta=1 + + channels first: + grad_weight^T = scale * columns * grad_output_n^T + grad_weight^T + C = alpha * op(A) * op(B) + beta * C + op(A) = 't', op(B) = 'n', alpha=scale, beta=1 */ - cpublas::gemm( - /*transa=*/TransposeType::Transpose, - /*transb=*/TransposeType::NoTranspose, - /* m=*/columns.size(0), - /* n=*/nOutputPlane, - /* k=*/columns.size(1), - /* alpha=*/scale, - /* A=*/columns.data_ptr(), - /* lda=*/columns.size(1), - /* B=*/grad_output_n.data_ptr(), - /* ldb=*/columns.size(1), - /* beta=*/1, - /* C=*/grad_weight.data_ptr(), - /* ldc=*/columns.size(0)); + if (is_channels_last) { + cpublas::gemm( + /*transa=*/TransposeType::NoTranspose, + /*transb=*/TransposeType::Transpose, + /* m=*/columns.size(1), + /* n=*/nOutputPlane, + /* k=*/columns.size(0), + /* alpha=*/static_cast(scale), + /* A=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/grad_output_n.data_ptr(), + /* ldb=*/nOutputPlane, + /* beta=*/static_cast(1), + /* C=*/grad_weight.data_ptr(), + /* ldc=*/columns.size(1)); + } else { + cpublas::gemm( + /*transa=*/TransposeType::Transpose, + /*transb=*/TransposeType::NoTranspose, + /* m=*/columns.size(0), + /* n=*/nOutputPlane, + /* k=*/columns.size(1), + /* alpha=*/static_cast(scale), + /* A=*/columns.data_ptr(), + /* lda=*/columns.size(1), + /* B=*/grad_output_n.data_ptr(), + /* ldb=*/columns.size(1), + /* beta=*/static_cast(1), + /* C=*/grad_weight.data_ptr(), + 
/* ldc=*/columns.size(0)); + } } // Gradient of bias: @@ -441,6 +522,9 @@ Tensor slow_conv_dilated2d_cpu( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; + bool use_channels_last = thnn_conv_use_channels_last(input, weight); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + Tensor undefined; internal::slow_conv_dilated_shape_check<2>( input, @@ -459,10 +543,10 @@ Tensor slow_conv_dilated2d_cpu( // template function assumes batched tensors. unsqueeze(0) will // insert batch dimension without affecting the original tensor. const Tensor input_ = - (is_batch ? input.contiguous() : input.contiguous().unsqueeze(0)); - const Tensor weight_ = weight.contiguous(); + (is_batch ? input.contiguous(memory_format) : input.contiguous().unsqueeze(0)); + const Tensor weight_ = weight.contiguous(memory_format); const Tensor bias_ = (bias.defined() ? bias.contiguous() : undefined); - Tensor output = at::empty(output_size, options); + Tensor output = at::empty(output_size, options.memory_format(memory_format)); Tensor output_ = (is_batch ? output : output.unsqueeze(0)); slow_conv_dilated_all_cpu_template<2>( @@ -477,7 +561,8 @@ Tensor slow_conv_dilated2d_cpu( kernel_size, stride_size, pad_size, - dilation_size); + dilation_size, + use_channels_last); return output; } @@ -541,6 +626,9 @@ std::tuple slow_conv_dilated2d_backward_cpu( IntArrayRef pad_size, IntArrayRef dilation_size, const std::array output_mask) { + bool use_channels_last = thnn_conv_use_channels_last(input, weight); + auto memory_format = use_channels_last ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + Tensor undefined; internal::slow_conv_dilated_shape_check<2>( input, @@ -556,16 +644,16 @@ std::tuple slow_conv_dilated2d_backward_cpu( // template function assumes batched tensors. unsqueeze(0) will // insert batch dimension without affecting the original tensor. const Tensor grad_output_ = - (is_batch ? grad_output.contiguous() + (is_batch ? grad_output.contiguous(memory_format) : grad_output.contiguous().unsqueeze(0)); const Tensor input_ = - (is_batch ? input.contiguous() : input.contiguous().unsqueeze(0)); - const Tensor weight_ = weight.contiguous(); + (is_batch ? input.contiguous(memory_format) : input.contiguous().unsqueeze(0)); + const Tensor weight_ = weight.contiguous(memory_format); // compute only gradients for which the corresponding output_mask is true: Tensor grad_input = - (output_mask[0] ? at::empty(input.sizes(), options) : undefined); + (output_mask[0] ? at::empty(input.sizes(), options.memory_format(memory_format)) : undefined); Tensor grad_weight = - (output_mask[1] ? at::empty(weight.sizes(), options) : undefined); + (output_mask[1] ? at::empty(weight.sizes(), options.memory_format(memory_format)) : undefined); Tensor grad_bias = (output_mask[2] ? 
at::empty(weight.size(0), options) : undefined); Tensor grad_input_ = @@ -583,7 +671,8 @@ std::tuple slow_conv_dilated2d_backward_cpu( kernel_size, stride_size, pad_size, - dilation_size); + dilation_size, + use_channels_last); return std::tie(grad_input, grad_weight, grad_bias); } diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index fdce903c0806..7cdf38192708 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ TORCH_META_FUNC(renorm)(const Tensor& self, const Scalar& p, int64_t dim, const TORCH_CHECK(maxnorm.toDouble() >= 0.0, "renorm: expected maxnorm to be >= 0 but got ", maxnorm.toDouble()); const auto ndim = self.dim(); - TORCH_CHECK(ndim > 1, "renorm: input needs at least 2 dimensions, got ", ndim, "dimensions"); + TORCH_CHECK(ndim > 1, "renorm: input needs at least 2 dimensions, got ", ndim, " dimensions"); set_output(self.sizes(), self.options()); } @@ -82,7 +83,7 @@ static inline MemoryFormat suggest_memory_format_contig(const Tensor& t) { return t.is_contiguous() ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast; } -template +template std::tuple batch_norm_cpu_transform_input_template( const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& save_mean /* optional */, const Tensor& save_invstd /* optional */, @@ -122,10 +123,12 @@ std::tuple batch_norm_cpu_transform_input_template( return 1 / at::sqrt(running_var + eps); } }()); + const bool mixed_type = !std::is_same::value; + const auto dtype = mixed_type ? kFloat : input.scalar_type(); auto w = weight.defined() ? as_nd(weight) : - at::detail::scalar_tensor_static(1, input.scalar_type(), kCPU); + at::detail::scalar_tensor_static(1, dtype, kCPU); auto b = bias.defined() ? as_nd(bias) : - at::detail::scalar_tensor_static(0, input.scalar_type(), kCPU); + at::detail::scalar_tensor_static(0, dtype, kCPU); Tensor output = at::empty_like(input, input.suggest_memory_format()); auto iter = TensorIteratorConfig() @@ -135,15 +138,17 @@ std::tuple batch_norm_cpu_transform_input_template( .add_input(invstd) .add_input(w) .add_input(b) + .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(false) .build(); - cpu_kernel(iter, [=](scalar_t input, scalar_t mean, scalar_t invstd, scalar_t weight, scalar_t bias) { + cpu_kernel(iter, [=](scalar_t input, param_t mean, param_t invstd, param_t weight, param_t bias) { return ((input - mean) * invstd) * weight + bias; }); return std::make_tuple(output, save_mean, save_invstd); } -template class VarTransform> +template class VarTransform> std::tuple batch_norm_cpu_update_stats_template( const Tensor& input, const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) { @@ -161,20 +166,26 @@ std::tuple batch_norm_cpu_update_stats_template( reduce_dims[i - 1] = i; } - Tensor save_mean = at::mean(input, /*dims=*/reduce_dims); - Tensor save_var_transform = at::empty({n_input}, input.options()); - auto save_mean_a = save_mean.accessor(); - auto save_var_transform_a = save_var_transform.accessor(); + bool all_contiguous = is_contiguous(input); + const bool mixed_type = !std::is_same::value; + const auto dtype = mixed_type ? 
kFloat : input.scalar_type(); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + // For contiguous case, leave 'mean' computation to kernel + Tensor save_mean = all_contiguous + ? at::empty({n_input}, input.options().dtype(dtype)) + : at::mean(input, /*dim=*/reduce_dims, /*keepdim=*/false, dtype); + Tensor save_var_transform = at::empty({n_input}, input.options().dtype(dtype)); + auto save_mean_a = save_mean.accessor(); + auto save_var_transform_a = save_var_transform.accessor(); + + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); - bool all_contiguous = is_contiguous(input); if (all_contiguous) { - auto _mean = at::empty({n_input}, input.options()); - auto _var_sum = at::empty({n_input}, input.options()); - auto _mean_a = _mean.accessor(); - auto _var_sum_a = _var_sum.accessor(); + auto _mean = at::empty({n_input}, input.options().dtype(dtype)); + auto _var_sum = at::empty({n_input}, input.options().dtype(dtype)); + auto _mean_a = _mean.accessor(); + auto _var_sum_a = _var_sum.accessor(); batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input); @@ -203,6 +214,8 @@ std::tuple batch_norm_cpu_update_stats_template( .add_input(input) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1) + .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(false) .build(); parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { @@ -230,7 +243,7 @@ std::tuple batch_norm_cpu_update_stats_template( return std::make_tuple(save_mean, save_var_transform); } -template +template std::tuple batch_norm_backward_cpu_template( const Tensor& grad_out_, const Tensor& input, const Tensor& weight, const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, @@ -238,6 +251,9 @@ std::tuple batch_norm_backward_cpu_template( using accscalar_t = at::acc_type; + const bool mixed_type = !std::is_same::value; + const auto dtype = mixed_type ? 
kFloat : input.scalar_type(); + Tensor grad_input; Tensor grad_weight; Tensor grad_bias; @@ -245,10 +261,10 @@ std::tuple batch_norm_backward_cpu_template( grad_input = at::empty_like(input, input.suggest_memory_format()); } if (grad_input_mask[1]) { - grad_weight = at::empty_like(weight, at::MemoryFormat::Contiguous); + grad_weight = at::empty({input.size(1)}, input.options().dtype(dtype)); } if (grad_input_mask[2]) { - grad_bias = at::empty({input.size(1)}, input.options()); + grad_bias = at::empty({input.size(1)}, input.options().dtype(dtype)); } // since we are directly manipulating pointers in contiguous path, @@ -266,18 +282,18 @@ std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } - auto weight_a = conditional_accessor_1d(weight); - auto grad_weight_a = conditional_accessor_1d(grad_weight); - auto grad_bias_a = conditional_accessor_1d(grad_bias); + auto weight_a = conditional_accessor_1d(weight); + auto grad_weight_a = conditional_accessor_1d(grad_weight); + auto grad_bias_a = conditional_accessor_1d(grad_bias); int64_t n_input = input.size(1); int64_t n = input.numel() / n_input; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); const int64_t ndim = input.dim(); @@ -332,9 +348,9 @@ std::tuple batch_norm_backward_cpu_template( TensorIterator binary_iter_local(binary_iter); for (const auto f : c10::irange(b_begin, b_end)) { - scalar_t w = weight.defined() ? weight_a[f] : 1; + param_t w = weight.defined() ? 
weight_a[f] : param_t(1); - scalar_t mean, invstd; + param_t mean, invstd; if (train) { mean = save_mean_a[f]; invstd = save_invstd_a[f]; @@ -557,7 +573,6 @@ std::tuple _batch_norm_impl_index_backward( } // backward in inference mode is not supported in cudnn, fallback to native - // TODO: verify the same thing in miopen if (impl_index == 0 || (!train)) { return at::native_batch_norm_backward(grad_output, input, weight, running_mean, running_var, save_mean, save_var_transform, train, epsilon, output_mask); } else if (impl_index == 1) { @@ -627,9 +642,15 @@ std::tuple batch_norm_update_stats_cpu( const Tensor& running_mean = *running_mean_maybe_owned; const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); - return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm_update_stats_cpu", [&] { - return batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, 0); - }); + const bool mixed_type = is_mixed_type(self, running_mean, running_var); + return AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm_update_stats_cpu", [&] { + if (mixed_type) { + check_mixed_data_type(self, running_mean, running_var); + return batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, 0); + } else { + return batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, 0); + } + }); } std::tuple batch_norm_cpu(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, @@ -643,16 +664,29 @@ std::tuple batch_norm_cpu(const Tensor& self, const c10: checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU); - return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm", [&] { + const bool mixed_type = is_mixed_type(self, weight, bias, running_mean, running_var); + return AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm", [&] { + if (mixed_type) { + check_mixed_data_type(self, weight, bias, running_mean, running_var); + if (!train) { + auto save_mean = at::empty({0}, self.options().dtype(kFloat)); + auto save_var = at::empty({0}, self.options().dtype(kFloat)); + return batch_norm_cpu_transform_input_template(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps); + } else { + auto save_stats = batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, eps); + return batch_norm_cpu_transform_input_template(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps); + } + } else { if (!train) { auto save_mean = at::empty({0}, self.options()); auto save_var = at::empty({0}, self.options()); - return batch_norm_cpu_transform_input_template(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps); + return batch_norm_cpu_transform_input_template(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps); } else { - auto save_stats = batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, eps); - return batch_norm_cpu_transform_input_template(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps); + auto save_stats = batch_norm_cpu_update_stats_template(self, running_mean, running_var, momentum, eps); + return batch_norm_cpu_transform_input_template(self, weight, bias, std::get<0>(save_stats), 
std::get<1>(save_stats), running_mean, running_var, train, eps); } - }); + } + }); } std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, @@ -665,9 +699,15 @@ std::tuple batch_norm_backward_cpu(const Tensor& grad_ou const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); const Tensor& save_invstd = c10::value_or_else(save_invstd_opt, [] {return Tensor();}); - return AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "batch_norm_backward_cpu", [&] { - return batch_norm_backward_cpu_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, eps, grad_input_mask); - }); + const bool mixed_type = is_mixed_type(self, weight, running_mean, running_var, save_mean, save_invstd); + return AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm_backward_cpu", [&] { + if (mixed_type) { + check_mixed_data_type(self, weight, running_mean, running_var, save_mean, save_invstd); + return batch_norm_backward_cpu_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, eps, grad_input_mask); + } else { + return batch_norm_backward_cpu_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, eps, grad_input_mask); + } + }); } TORCH_IMPL_FUNC(renorm_out)(const Tensor& self, const Scalar& p, int64_t dim, @@ -692,7 +732,7 @@ TORCH_IMPL_FUNC(renorm_out)(const Tensor& self, const Scalar& p, int64_t dim, /*keepdim=*/true); } - auto factor = (acc_type == c10::toValueType(dtype)) ? + auto factor = (acc_type == c10::toRealValueType(dtype)) ? norm : at::empty(norm.sizes(), self.options()); auto iter = TensorIteratorConfig() .add_output(factor) diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp new file mode 100644 index 000000000000..9510b17de002 --- /dev/null +++ b/aten/src/ATen/native/PadNd.cpp @@ -0,0 +1,213 @@ +#include +#include + +#include + +namespace at { namespace native { + +Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) { + TORCH_CHECK(pad.size() % 2 == 0, "Length of pad must be even but instead it equals ", + pad.size()); + + auto input_sizes = self.sizes(); + auto l_inp = self.dim(); + + auto l_pad = pad.size() / 2; + auto l_diff = l_inp - l_pad; + TORCH_CHECK(l_inp >= (int64_t)l_pad, "Length of pad should be no more than twice the number of " + "dimensions of the input. 
Pad length is ", pad.size(), "while the input has ", + l_inp, "dimensions."); + + std::vector new_shape; + + bool all_pads_non_positive = true; + + auto c_input = self; + for (const auto i : c10::irange(l_diff, l_inp)) { + auto pad_idx = 2 * (l_inp - i - 1); + if (pad[pad_idx] < 0) { + c_input = c_input.narrow(i, -pad[pad_idx], c_input.size(i) + pad[pad_idx]); + } else if (pad[pad_idx] != 0) { + all_pads_non_positive = false; + } + if (pad[pad_idx + 1] < 0) { + c_input = c_input.narrow(i, 0, c_input.size(i) + pad[pad_idx + 1]); + } else if (pad[pad_idx + 1] != 0) { + all_pads_non_positive = false; + } + } + + // if none of the pads are positive we can optimize and just return the result + // of calling .narrow() on the input + if (all_pads_non_positive) { + return c_input.clone(); + } + + + for (size_t i = 0; i < (size_t)l_diff; i ++) { + new_shape.emplace_back(input_sizes[i]); + } + + for (const auto i : c10::irange((size_t)l_pad)) { + auto pad_idx = pad.size() - ((i + 1) * 2); + auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; + TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " + "which is invalid. Check dimension ", l_diff + i, " of your input."); + new_shape.emplace_back(new_dim); + } + + at::Tensor output; + const auto memory_format = self.suggest_memory_format(); + if (self.is_quantized()) { + const auto qscheme = self.qscheme(); + TORCH_CHECK(qscheme == kPerTensorAffine || qscheme == kPerTensorSymmetric, + "Only per-tensor padding is supported."); + output = at::_empty_affine_quantized( + new_shape, self.options().memory_format(memory_format), + self.q_scale(), self.q_zero_point(), c10::nullopt); + } else { + output = at::empty(new_shape, self.options().memory_format(memory_format)); + } + output.fill_(value); + + auto c_output = output; + for (const auto i : c10::irange(l_diff, l_inp)) { + auto pad_idx = 2 * (l_inp - i - 1); + if (pad[pad_idx] > 0) { + c_output = c_output.narrow(i, pad[pad_idx], c_output.size(i) - pad[pad_idx]); + } + if (pad[pad_idx + 1] > 0) { + c_output = c_output.narrow(i, 0, c_output.size(i) - pad[pad_idx + 1]); + } + } + c_output.copy_(c_input); + return output; +} + +Tensor _pad_circular(const Tensor &self, IntArrayRef padding) { + const auto in_shape = self.sizes(); + const auto ndim = static_cast(in_shape.size()) - 2; + TORCH_CHECK(padding.size() + 4 == in_shape.size() * 2, + "Invalid padding size, expected ", ndim * 2, " but got ", padding.size()); + + DimVector out_shape(in_shape.size()); + out_shape[0] = in_shape[0]; + out_shape[1] = in_shape[1]; + + // Get shape of padded tensor + for (const auto i : c10::irange(ndim)) { + const auto pad_l = padding[2 * (ndim - i - 1) + 0]; + const auto pad_r = padding[2 * (ndim - i - 1) + 1]; + const auto size = in_shape[2 + i]; + out_shape[2 + i] = size + pad_l + pad_r; + + TORCH_CHECK( + pad_l <= size && pad_r <= size, + "Padding value causes wrapping around more than once."); + TORCH_CHECK( + out_shape[2 + i] >= 0, + "Negative padding value is resulting in an empty dimension"); + } + + auto out = self.new_empty(out_shape, self.options()); + + // Put original array into the padded array + Tensor out_slice = out; + Tensor in_slice = self; + constexpr int64_t zero = 0; + for (const auto i : c10::irange(ndim)) { + const auto dim = ndim - i + 1; + const auto pad_l = padding[2*i + 0]; + const auto pad_r = padding[2*i + 1]; + out_slice = out_slice.slice(dim, 
std::max(pad_l, zero), out_shape[dim] - std::max(pad_r, zero)); + in_slice = in_slice.slice(dim, std::max(-pad_l, zero), in_shape[dim] - std::max(-pad_r, zero)); + } + out_slice.copy_(in_slice); + + // The following steps first pad the beginning of the tensor (left side), + // and then pad the end of the tensor (right side). + // Note: Corners will be written more than once when ndim > 1. + // + // Only in cases where padding values are > 0 are when additional copying + // is required. + for (const auto i : c10::irange(ndim)) { + const auto dim = ndim - i + 1; + const auto pad_l = padding[2*i + 0]; + const auto pad_r = padding[2*i + 1]; + + if (pad_l > 0) { + out_slice = out.slice(dim, 0, pad_l); + in_slice = out.slice(dim, + out_shape[dim] - pad_l - std::max(pad_r, zero), + out_shape[dim] - std::max(pad_r, zero)); + out_slice.copy_(in_slice); + } + + if (pad_r > 0) { + out_slice = out.slice(dim, out_shape[dim] - pad_r, out_shape[dim]); + in_slice = out.slice(dim, std::max(pad_l, zero), std::max(pad_l, zero) + pad_r); + out_slice.copy_(in_slice); + } + } + + return out; +} + +Tensor _pad_enum(const Tensor &self, IntArrayRef pad, int64_t mode_int, c10::optional value) { + const auto input_dim = self.dim(); + TORCH_CHECK(pad.size() % 2 == 0, "Padding length must be divisible by 2"); + TORCH_CHECK(static_cast(pad.size()) <= input_dim * 2, "Padding length too large"); + auto mode = static_cast(mode_int); + + if (mode == at::padding_mode::constant) { + return at::constant_pad_nd(self, pad, value.value_or(0.0)); + } + TORCH_CHECK(!value.has_value() || *value == 0, + "Padding mode \"", padding_mode_string(mode), + "\" doesn't take in value argument"); + + if (pad.size() == 2 && (input_dim == 2 || input_dim == 3)) { + switch (mode) { + case at::padding_mode::reflect: return at::reflection_pad1d(self, pad); + case at::padding_mode::replicate: return at::replication_pad1d(self, pad); + case at::padding_mode::circular: return at::_pad_circular(self, pad); + default: {} + } + } else if(pad.size() == 4 && (input_dim == 3 || input_dim == 4)) { + switch (mode) { + case at::padding_mode::reflect: return at::reflection_pad2d(self, pad); + case at::padding_mode::replicate: return at::replication_pad2d(self, pad); + case at::padding_mode::circular: return at::_pad_circular(self, pad); + default: {} + } + } else if (pad.size() == 6 && (input_dim == 4 || input_dim == 5)) { + switch (mode) { + case at::padding_mode::reflect: return at::reflection_pad3d(self, pad); + case at::padding_mode::replicate: return at::replication_pad3d(self, pad); + case at::padding_mode::circular: return at::_pad_circular(self, pad); + default: {} + } + } + C10_THROW_ERROR(NotImplementedError, + "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); +} + +Tensor pad(const Tensor &self, IntArrayRef pad, c10::string_view mode, c10::optional value) { + const auto mode_enum = [&] { + if (mode == "reflect") { + return at::padding_mode::reflect; + } else if (mode == "constant") { + return at::padding_mode::constant; + } else if (mode == "replicate") { + return at::padding_mode::replicate; + } else if (mode == "circular") { + return at::padding_mode::circular; + } + C10_THROW_ERROR(NotImplementedError, + c10::str("Unrecognised padding mode ", mode)); + }(); + return at::native::_pad_enum(self, pad, static_cast(mode_enum), value); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/PadNd.h b/aten/src/ATen/native/PadNd.h new file mode 100644 index 000000000000..37f59acb8a4c --- /dev/null +++ 
b/aten/src/ATen/native/PadNd.h @@ -0,0 +1,22 @@ +#pragma once + +namespace at { + +enum class padding_mode { + reflect, + replicate, + circular, + constant, +}; + +static inline c10::string_view padding_mode_string(padding_mode m) { + switch (m) { + case padding_mode::reflect: return "reflect"; + case padding_mode::replicate: return "replicate"; + case padding_mode::circular: return "circular"; + case padding_mode::constant: return "constant"; + } + TORCH_CHECK(false, "Invalid padding mode (", static_cast(m), ")"); +} + +} // namespace at diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp index fc8e3c80cefc..41547a10f5fd 100644 --- a/aten/src/ATen/native/PixelShuffle.cpp +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -3,21 +3,83 @@ #include #include -#include -#include -#include +#include namespace at { namespace native { -Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { +static inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_factor) { TORCH_CHECK(self.dim() >= 3, "pixel_shuffle expects input to have at least 3 dimensions, but got input with ", self.dim(), " dimension(s)"); - TORCH_CHECK( - upscale_factor > 0, - "pixel_shuffle expects a positive upscale_factor, but got ", - upscale_factor); + TORCH_CHECK(upscale_factor > 0, + "pixel_shuffle expects a positive upscale_factor, but got ", + upscale_factor); + int64_t c = self.size(-3); + int64_t upscale_factor_squared = upscale_factor * upscale_factor; + TORCH_CHECK(c % upscale_factor_squared == 0, + "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " + "upscale_factor, but input.size(-3)=", c, " is not divisible by ", upscale_factor_squared); +} + +static inline void check_pixel_unshuffle_shapes(const Tensor& self, int64_t downscale_factor) { + TORCH_CHECK(self.dim() >= 3, + "pixel_unshuffle expects input to have at least 3 dimensions, but got input with ", + self.dim(), " dimension(s)"); + TORCH_CHECK(downscale_factor > 0, + "pixel_unshuffle expects a positive downscale_factor, but got ", + downscale_factor); + int64_t h = self.size(-2); + int64_t w = self.size(-1); + TORCH_CHECK(h % downscale_factor == 0, + "pixel_unshuffle expects height to be divisible by downscale_factor, but input.size(-2)=", h, + " is not divisible by ", downscale_factor); + TORCH_CHECK(w % downscale_factor == 0, + "pixel_unshuffle expects width to be divisible by downscale_factor, but input.size(-1)=", w, + " is not divisible by ", downscale_factor); +} + +Tensor pixel_shuffle_cpu(const Tensor& self, int64_t upscale_factor) { + check_pixel_shuffle_shapes(self, upscale_factor); + + // Format: (B1, ..., Bn), C, H, W + std::vector output_sizes(self.sizes().begin(), self.sizes().end() - 3); + output_sizes.insert(output_sizes.end(), + {self.size(-3) / upscale_factor / upscale_factor, + self.size(-2) * upscale_factor, + self.size(-1) * upscale_factor}); + + auto output = at::empty({0}, self.options()); + auto memory_format = self.suggest_memory_format(); + output.resize_(output_sizes, memory_format); + auto input = self.contiguous(memory_format); + + pixel_shuffle_kernel(kCPU, output, input, upscale_factor); + return output; +} + +Tensor pixel_unshuffle_cpu(const Tensor& self, int64_t downscale_factor) { + check_pixel_unshuffle_shapes(self, downscale_factor); + + // Format: (B1, ..., Bn), C, H, W + std::vector output_sizes(self.sizes().begin(), self.sizes().end() - 3); + output_sizes.insert(output_sizes.end(), + {self.size(-3) * downscale_factor * 
downscale_factor, + self.size(-2) / downscale_factor, + self.size(-1) / downscale_factor}); + + auto output = at::empty({0}, self.options()); + auto memory_format = self.suggest_memory_format(); + output.resize_(output_sizes, memory_format); + auto input = self.contiguous(memory_format); + + pixel_unshuffle_kernel(kCPU, output, input, downscale_factor); + return output; +} + +Tensor math_pixel_shuffle(const Tensor& self, int64_t upscale_factor) { + check_pixel_shuffle_shapes(self, upscale_factor); + // Format: (B1, ..., Bn), C, H, W int64_t c = self.size(-3); int64_t h = self.size(-2); @@ -26,9 +88,6 @@ Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { const auto self_sizes_batch_end = self.sizes().end() - NUM_NON_BATCH_DIMS; int64_t upscale_factor_squared = upscale_factor * upscale_factor; - TORCH_CHECK(c % upscale_factor_squared == 0, - "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " - "upscale_factor, but input.size(-3)=", c, " is not divisible by ", upscale_factor_squared); int64_t oc = c / upscale_factor_squared; int64_t oh = h * upscale_factor; int64_t ow = w * upscale_factor; @@ -54,18 +113,13 @@ Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { // and (w, upscale_factor) -> a single dim (ow). std::vector final_shape(self.sizes().begin(), self_sizes_batch_end); final_shape.insert(final_shape.end(), {oc, oh, ow}); + return input_permuted.reshape(final_shape); } +Tensor math_pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { + check_pixel_unshuffle_shapes(self, downscale_factor); -Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { - TORCH_CHECK(self.dim() >= 3, - "pixel_unshuffle expects input to have at least 3 dimensions, but got input with ", - self.dim(), " dimension(s)"); - TORCH_CHECK( - downscale_factor > 0, - "pixel_unshuffle expects a positive downscale_factor, but got ", - downscale_factor); // Format: (B1, ..., Bn), C, H, W int64_t c = self.size(-3); int64_t h = self.size(-2); @@ -73,12 +127,6 @@ Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { constexpr auto NUM_NON_BATCH_DIMS = 3; const auto self_sizes_batch_end = self.sizes().end() - NUM_NON_BATCH_DIMS; - TORCH_CHECK(h % downscale_factor == 0, - "pixel_unshuffle expects height to be divisible by downscale_factor, but input.size(-2)=", h, - " is not divisible by ", downscale_factor) - TORCH_CHECK(w % downscale_factor == 0, - "pixel_unshuffle expects width to be divisible by downscale_factor, but input.size(-1)=", w, - " is not divisible by ", downscale_factor) int64_t downscale_factor_squared = downscale_factor * downscale_factor; int64_t oc = c * downscale_factor_squared; int64_t oh = h / downscale_factor; @@ -105,7 +153,11 @@ Tensor pixel_unshuffle(const Tensor& self, int64_t downscale_factor) { // resulting in height=oh and width=ow. 
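The pixel-shuffle changes above split the shape validation into check_pixel_shuffle_shapes / check_pixel_unshuffle_shapes, route the CPU path through the new pixel_shuffle_kernel / pixel_unshuffle_kernel dispatch stubs, and keep the permute-and-reshape composite as math_pixel_shuffle / math_pixel_unshuffle. A minimal sketch of the operator semantics these kernels implement, written against the public Python API rather than the ATen internals:

```python
import torch
import torch.nn.functional as F

# pixel_shuffle with upscale factor r maps (N, C*r*r, H, W) -> (N, C, H*r, W*r)
x = torch.arange(1 * 8 * 3 * 3, dtype=torch.float32).reshape(1, 8, 3, 3)
y = F.pixel_shuffle(x, upscale_factor=2)
assert y.shape == (1, 2, 6, 6)

# pixel_unshuffle is the inverse mapping: (N, C, H*r, W*r) -> (N, C*r*r, H, W)
z = F.pixel_unshuffle(y, downscale_factor=2)
assert z.shape == x.shape and torch.equal(z, x)

# The channel dimension must be divisible by r*r, mirroring check_pixel_shuffle_shapes
try:
    F.pixel_shuffle(torch.randn(1, 6, 3, 3), upscale_factor=2)
except RuntimeError as e:
    print("shape check:", e)
```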
std::vector final_shape(self.sizes().begin(), self_sizes_batch_end); final_shape.insert(final_shape.end(), {oc, oh, ow}); + return input_permuted.reshape(final_shape); } +DEFINE_DISPATCH(pixel_shuffle_kernel); +DEFINE_DISPATCH(pixel_unshuffle_kernel); + }} // namespace at::native diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 503cf8907884..0f3885524a79 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -1,6 +1,6 @@ -#include -#include +#include #include +#include #include #include diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 0526f5b2b8e4..724c53fdd0c0 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -122,7 +122,12 @@ Tensor max_pool2d( return at::mkldnn_max_pool2d( self, kernel_size, stride, padding, dilation, ceil_mode); } - +#ifdef USE_MPS + if (self.is_mps()) { + return at::_mps_max_pool2d( + self, kernel_size, stride, padding, dilation, ceil_mode); + } +#endif #if defined(C10_MOBILE) if(xnnpack::use_max_pool2d(self, kernel_size, padding, stride, dilation, ceil_mode)) { diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index e3030f71d165..fcd8f6335b58 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include @@ -53,6 +53,9 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( // fallback path and rather fail loudly if we cannot run FBGEMM. TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); + TORCH_WARN_ONCE("fbgemm_linear_int8_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + const Tensor input_contig = input.contiguous(); const float* input_ptr = input_contig.data_ptr(); @@ -179,11 +182,6 @@ Tensor fbgemm_linear_int8_weight( const Scalar& weight_scale, const Scalar& weight_zero_point, const Tensor& bias) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_int8_weight will be deprecated soon." - // "Please use fbgemm_linear_int8_weight_fp32_activation instead."); - return at::native::fbgemm_linear_int8_weight_fp32_activation( input, weight, @@ -219,6 +217,9 @@ void CalcColOffsetsTranspose( std::tuple fbgemm_linear_quantize_weight( const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_linear_quantize_weight is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -284,6 +285,9 @@ std::tuple fbgemm_linear_quantize_weight( } Tensor fbgemm_pack_quantized_matrix(const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_pack_quantized_matrix is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. 
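The fbgemm_linear_* and fbgemm_pack_* entry points above now emit TORCH_WARN_ONCE deprecation notices. As a hedged illustration of the usual replacement for the int8-weight / fp32-activation linear path (standard dynamic quantization, not an API introduced by this patch):

```python
import torch
import torch.nn as nn

# Float model whose Linear layers should run with int8 weights and fp32 activations,
# the same scheme the deprecated fbgemm_linear_int8_weight_fp32_activation targeted.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4)).eval()

# quantize_dynamic swaps nn.Linear for a dynamically quantized equivalent.
qmodel = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

x = torch.randn(2, 16)
print(qmodel(x).shape)  # torch.Size([2, 4]); weights stored as qint8, activations fp32
```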
@@ -366,6 +370,9 @@ void HandleWeightsSaturation(int64_t N, float* weight) { } // namespace Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_pack_gemm_matrix_fp16 is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -398,6 +405,9 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -443,10 +453,6 @@ Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_fp16_weight will be deprecated soon." - // "Please use fbgemm_linear_fp16_weight_fp32_activation instead."); return at::native::fbgemm_linear_fp16_weight_fp32_activation( input, packed_weight, bias); } @@ -461,6 +467,9 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( const Scalar& /*weight_scale*/, const Scalar& /*weight_zero_point*/, const Tensor& /*bias*/) { + TORCH_WARN_ONCE("fbgemm_linear_int8_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -476,10 +485,8 @@ Tensor fbgemm_linear_int8_weight( const Scalar& /*weight_scale*/, const Scalar& /*weight_zero_point*/, const Tensor& /*bias*/) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_int8_weight will be deprecated soon." - // "Please use fbgemm_linear_int8_weight_fp32_activation instead."); + TORCH_WARN_ONCE("fbgemm_linear_int8_weight is deprecated " + "and will be removed in a future PyTorch release.") // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a @@ -490,6 +497,9 @@ Tensor fbgemm_linear_int8_weight( std::tuple fbgemm_linear_quantize_weight( const Tensor& /*weight*/) { + TORCH_WARN_ONCE("fbgemm_linear_quantize_weight is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -498,6 +508,9 @@ std::tuple fbgemm_linear_quantize_weight( } Tensor fbgemm_pack_quantized_matrix(const Tensor& /*input*/) { + TORCH_WARN_ONCE("fbgemm_pack_quantized_matrix is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. 
@@ -509,10 +522,8 @@ Tensor fbgemm_pack_quantized_matrix( const Tensor& /*input*/, int64_t /*K*/, int64_t /*N*/) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_pack_quantized_matrix(weight, K, N) will be deprecated soon." - // "Please use fbgemm_pack_quantized_matrix(weight) instead."); + TORCH_WARN_ONCE("fbgemm_pack_quantized_matrix is deprecated " + "and will be removed in a future PyTorch release.") // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a @@ -522,6 +533,9 @@ Tensor fbgemm_pack_quantized_matrix( } Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) { + TORCH_WARN_ONCE("fbgemm_pack_gemm_matrix_fp16 is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -533,6 +547,9 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated " + "and will be removed in a future PyTorch release.") + // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a // fallback path and rather fail loudly if we cannot run FBGEMM. @@ -544,10 +561,8 @@ Tensor fbgemm_linear_fp16_weight( const Tensor& input, const Tensor& packed_weight, const Tensor& bias) { - // Replace after https://github.com/pytorch/pytorch/issues/24354 is fixed - // TORCH_WARN( - // "fbgemm_linear_fp16_weight will be deprecated soon." - // "Please use fbgemm_linear_fp16_weight_fp32_activation instead."); + TORCH_WARN_ONCE("fbgemm_linear_fp16_weight is deprecated " + "and will be removed in a future PyTorch release.") // We make a strong guarantee that models using these operators will have the // same numerics across different machines. Therefore, we do not provide a @@ -556,10 +571,6 @@ Tensor fbgemm_linear_fp16_weight( false, "This PyTorch installation was not built with FBGEMM operators"); } -bool fbgemm_is_cpu_supported() { - return false; -} - #endif // USE_FBGEMM } // namespace native diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index a2b50e3ee467..3c10afef14fa 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -291,7 +291,7 @@ If two backends have the same dispatch function, you can write `CPU, CUDA: func` to reuse the same function name in both cases. Available backend options can be found by searching `dispatch_keys` in -[codegen](https://github.com/pytorch/pytorch/blob/master/tools/codegen/gen.py). +[codegen](https://github.com/pytorch/pytorch/blob/master/torchgen/gen.py). 
There are also two special "generic" backends: - `CompositeExplicitAutograd` (previously known as `DefaultBackend`): diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 8793f4e5d7b4..38696432b257 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -3,8 +3,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -22,7 +21,6 @@ bool use_miopen(const at::Tensor& input, const double dropout_state) { bool is_miopen_acceptable = ((input.scalar_type() == at::kFloat)|| (input.scalar_type() == at::kHalf)) && (detail::getCUDAHooks().compiledWithMIOpen()) && (input.is_cuda()) && - (dropout_state == 0.0) && (at::globalContext().userEnabledCuDNN()); return is_miopen_acceptable; } @@ -579,88 +577,6 @@ static std::vector gather_params(TensorList params, bool has_biases, return result; } -// These gather_* functions are kept solely for the purposes of backward -// compatbility in the legacy quantized_{lstm,gru} APIs - -static c10::List> gather_quantized_params( - c10::List params) { - static at::Tensor undefined; - std::vector> result; - TORCH_CHECK(params.size() % 12 == 0, "got an incorrect number of quantized RNN parameters"); - for (size_t i = 0; i < params.size(); i += 12) { - result.emplace_back(c10::make_intrusive( - static_cast(params[i]), - static_cast(params[i + 1]), - static_cast(params[i + 2]), - static_cast(params[i + 3]), - static_cast(params[i + 4]), - static_cast(params[i + 5]), - static_cast(params[i + 6]), - static_cast(params[i + 7]), - static_cast(params[i + 8]).item(), - static_cast(params[i + 9]).item(), - static_cast(params[i + 10]).item(), - static_cast(params[i + 11]).item())); - } - return c10::List>(result); -} - -static c10::List> -gather_quantized_params_dynamic(c10::List params) { - static at::Tensor undefined; - std::vector> result; - for (size_t i = 0; i < params.size(); i += 2) { - auto packed_struct_ih = - cpp_custom_type_hack::cast>( - static_cast(params[i])); - auto packed_struct_hh = - cpp_custom_type_hack::cast>( - static_cast(params[i + 1])); - - auto bias_ih = packed_struct_ih->bias().value_or(undefined); - auto bias_hh = packed_struct_hh->bias().value_or(undefined); - result.emplace_back(c10::make_intrusive( - std::move(packed_struct_ih), - std::move(packed_struct_hh), - std::move(bias_ih), - std::move(bias_hh))); - } - return c10::List>(result); -} - -static c10::List> -gather_quantized_params_fp16(c10::List params) { - static at::Tensor undefined; - std::vector> result; - TORCH_CHECK(params.size() % 4 == 0, - "incorrect number of quantized RNN parameters FP16"); - for (size_t i = 0; i < params.size(); i += 4) { - c10::intrusive_ptr packed_struct_ih = - cpp_custom_type_hack::cast>( - static_cast(params[i])); - c10::intrusive_ptr packed_struct_hh = - cpp_custom_type_hack::cast>( - static_cast(params[i + 1])); - - // NB: we install the bias from the gathered parameters here because - // in the "new world", the fp16 linear apply() method always expects - // the bias to be present in the packed struct. In the "old world", - // we called `fbgemm_linear_fp16_weight_fp32_activation`, which took - // the bias explicitly and ignored the bias in the packed struct. To - // reconcile serialized models that behavied in the old style, we - // put the bias into the appropriate packed structures here. 
- // - // Hopefully we can remove this in the future when we eliminate - // the old style altogether - packed_struct_ih->set_bias(params[i + 2]); - packed_struct_hh->set_bias(params[i + 3]); - - result.emplace_back(c10::make_intrusive( - std::move(packed_struct_ih), std::move(packed_struct_hh))); - } - return c10::List>(result); -} - //////////////////////////////////////////////////////////////////////////////// // HIDDEN STATE FUNCTIONS // @@ -1201,6 +1117,18 @@ bool _use_cudnn_rnn_flatten_weight() { return detail::getCUDAHooks().compiledWithCuDNN(); } +// NB: This a (composite) wrapper for _thnn_fused_lstm_cell_backward_impl. +// It duplicates the outputs of this function so the non-composite verison doesn't have to. +// The point is so that we avoid triggering TensorImpl use count asserts in debug mode +std::tuple _thnn_fused_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, + const Tensor& cx, const Tensor& cy, + const Tensor& workspace, bool has_bias) { + TORCH_INTERNAL_ASSERT(!GradMode::is_enabled()); + auto ret = at::_thnn_fused_lstm_cell_backward_impl(grad_hy_opt, grad_cy_opt, cx, cy, workspace, has_bias); + return std::make_tuple(std::get<0>(ret), std::get<0>(ret), std::get<1>(ret), std::get<2>(ret), std::get<2>(ret)); +} + + //////////////////////////////////////////////////////////////////////////////// // PUBLIC FUNCTIONS //////////////////////////////////////////////////////////////////////////////// @@ -1411,21 +1339,11 @@ std::tuple quantized_gru_input_legacy( bool train, bool bidirectional, bool batch_first) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_gru with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. Please re-export your model " "using the newer definitions in torch.jit.quantized"); - auto params = gather_quantized_params(std::move(_params)); - return quantized_gru_input( - _input, - hx, - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional, - batch_first); } std::tuple quantized_gru_data_legacy( @@ -1438,21 +1356,11 @@ std::tuple quantized_gru_data_legacy( double dropout_p, bool train, bool bidirectional) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_gru with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. 
Please re-export your model " "using the newer definitions in torch.jit.quantized"); - auto params = gather_quantized_params(std::move(_params)); - return quantized_gru_data( - data, - batch_sizes, - hx, - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional); } using tanf_cell_type = SimpleCell; @@ -1480,6 +1388,14 @@ std::tuple lstm( num_layers, dropout_p, train, bidirectional, batch_first); return std::make_tuple(std::move(output), std::move(hy), std::move(cy)); } +#ifdef USE_MPS + if (_input.is_mps() && !bidirectional) { + std::tuple output = at::_lstm_mps(_input, hx, _params, has_biases, + num_layers, dropout_p, train, bidirectional, batch_first); + std::tuple return_values = std::make_tuple(std::get<0>(output), std::get<1>(output), std::get<2>(output)); + return return_values; + } +#endif // if cells are of different size, that means projections are used bool has_projections = (hx[0].size(2) != hx[1].size(2)); if (use_miopen(_input, dropout_p)) { @@ -1768,34 +1684,11 @@ std::tuple quantized_lstm_input_legacy( bool batch_first, c10::optional dtype, bool use_dynamic) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_lstm with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. Please re-export your model " "using the newer definitions in torch.jit.quantized"); - c10::List> params; - auto result_dtype = dtype.has_value() ? dtype.value() : at::kChar; - if (result_dtype == at::kChar || result_dtype == at::kQInt8) { - if (use_dynamic) { - params = gather_quantized_params_dynamic(std::move(_params_)); - } else { - params = gather_quantized_params(std::move(_params_)); - } - } else { - params = gather_quantized_params_fp16(std::move(_params_)); - } - return quantized_lstm_input( - _input, - std::move(hx_), - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional, - batch_first, - // NOLINTNEXTLINE(performance-move-const-arg) - std::move(dtype), - use_dynamic); } std::tuple quantized_lstm_data( @@ -1857,34 +1750,11 @@ std::tuple quantized_lstm_data_legacy( bool bidirectional, c10::optional dtype, bool use_dynamic) { - TORCH_WARN_ONCE( + TORCH_CHECK( + false, "torch.quantized_lstm with List[Tensor] for parameters is " - "deprecated and may be removed! Please re-export your model " + "no longer supported. Please re-export your model " "using the newer definitions in torch.jit.quantized"); - c10::List> params; - auto result_dtype = dtype.has_value() ? 
dtype.value() : at::kChar; - if (result_dtype == at::kChar || result_dtype == at::kQInt8) { - if (use_dynamic) { - params = gather_quantized_params_dynamic(std::move(_params_)); - } else { - params = gather_quantized_params(std::move(_params_)); - } - } else { - params = gather_quantized_params_fp16(std::move(_params_)); - } - return quantized_lstm_data( - data, - batch_sizes, - std::move(hx_), - std::move(params), - has_biases, - num_layers, - dropout_p, - train, - bidirectional, - // NOLINTNEXTLINE(performance-move-const-arg) - std::move(dtype), - use_dynamic); } #define DEFINE_QUANTIZED_RNN_CELL(name, hx_type, cell_type, return_type, prepare_hx_fn) \ @@ -1982,7 +1852,7 @@ DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple_hx_typ namespace { -static auto ensure_linear_params_registered = register_linear_params(); +static C10_UNUSED auto ensure_linear_params_registered = register_linear_params(); static auto cell_params_base_registry = torch::selective_class_("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase")) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 38eafedbeebf..d5ee3a3e9103 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -200,16 +201,26 @@ TORCH_META_FUNC2(prod, dim_int) resize_reduction(*this, self, dim, keepdim, out_dtype); } -void check_floating_or_complex_dtype(const char* name, ScalarType dtype) { - TORCH_CHECK( - at::isFloatingType(dtype) || at::isComplexType(dtype), - name, "(): input dtype should be either floating point or complex dtypes. " - "Got ", toString(dtype), " instead."); -} - TORCH_META_FUNC2(mean, dim) (const Tensor& self, IntArrayRef dim, bool keepdim, optional opt_dtype) { - check_floating_or_complex_dtype("mean", self.scalar_type()); + auto in_dtype = at::native::get_dtype_from_self(self, opt_dtype, true); + + if (!at::isFloatingType(in_dtype) && !at::isComplexType(in_dtype)) { + std::string what = "Input"; + std::string dtype = toString(self.scalar_type()); + + if (opt_dtype.has_value()) { + what = "Optional"; + dtype = toString(opt_dtype.value()); + } + + TORCH_CHECK( + false, + "mean(): could not infer output dtype. ", + what, " dtype must be either a floating point or complex dtype. ", + "Got: ", dtype); + } + auto out_dtype = infer_dtype_from_optional(self, dim, keepdim, opt_dtype, maybe_get_output()); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -221,15 +232,17 @@ ScalarType get_result_or_self_value_dtype( if (result.defined()) { return result.scalar_type(); } else { - return dtype.value_or(toValueType(self.scalar_type())); + return dtype.value_or(toRealValueType(self.scalar_type())); } } - - TORCH_META_FUNC2(norm, ScalarOpt_dim) (const Tensor& self, const OptionalScalarRef p, IntArrayRef dim, bool keepdim) { - check_floating_or_complex_dtype("norm", self.scalar_type()); + TORCH_CHECK( + at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), + "norm(): input dtype should be either floating point or complex. 
" + "Got ", self.scalar_type(), " instead."); + auto out_dtype = get_result_or_self_value_dtype(self, maybe_get_output(), c10::nullopt); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -240,7 +253,11 @@ TORCH_META_FUNC2(norm, ScalarOpt_dim_dtype) IntArrayRef dim, bool keepdim, ScalarType dtype) { - check_floating_or_complex_dtype("norm", dtype); + TORCH_CHECK( + at::isFloatingType(dtype) || at::isComplexType(dtype), + "norm(): the desired output dtype should be either floating point or complex. " + "Got ", dtype, " instead."); + auto out_dtype = get_result_or_self_value_dtype(self, maybe_get_output(), dtype); resize_reduction(*this, self, dim, keepdim, out_dtype); } @@ -266,6 +283,34 @@ TORCH_META_FUNC(aminmax) this->set_output(1, shape, options); } +TORCH_META_FUNC(amax) +(const Tensor& self, IntArrayRef dim, bool keepdim) { + auto maybe_result = maybe_get_output(); + if (maybe_result.defined()) { + TORCH_CHECK(self.scalar_type() == maybe_result.scalar_type(), "Expected the dtype for input and out to match, but got ", + self.scalar_type(), " for input's dtype and ", maybe_result.scalar_type(), " for out's dtype."); + } + if (self.numel() == 0) { + at::native::zero_numel_check_dims(self, dim, "amax()"); + } + const ScalarType& out_dtype = maybe_result.defined() ? maybe_result.scalar_type() : self.scalar_type(); + resize_reduction(*this, self, dim, keepdim, out_dtype); +} + +TORCH_META_FUNC(amin) +(const Tensor& self, IntArrayRef dim, bool keepdim) { + auto maybe_result = maybe_get_output(); + if (maybe_result.defined()) { + TORCH_CHECK(self.scalar_type() == maybe_result.scalar_type(), "Expected the dtype for input and out to match, but got ", + self.scalar_type(), " for input's dtype and ", maybe_result.scalar_type(), " for out's dtype."); + } + if (self.numel() == 0) { + at::native::zero_numel_check_dims(self, dim, "amin()"); + } + const ScalarType& out_dtype = maybe_result.defined() ? maybe_result.scalar_type() : self.scalar_type(); + resize_reduction(*this, self, dim, keepdim, out_dtype); +} + } // namespace meta namespace native { @@ -830,7 +875,7 @@ Tensor& diff_out(const Tensor& self, int64_t n, int64_t dim, const c10::optional } } -void pre_check_gradient(const Tensor& self, c10::optional spacing_size, c10::optional dim, int64_t edge_order) { +void pre_check_gradient(const Tensor& self, c10::optional spacing_size, at::OptionalIntArrayRef dim, int64_t edge_order) { // Helper for gradient function to make sure input data satisfies prerequisites TORCH_CHECK(self.scalar_type() != ScalarType::Byte, "torch.gradient does not support uint8 input."); if (spacing_size.has_value() && !dim.has_value()) { @@ -932,7 +977,7 @@ std::vector gradient_dim_preprocess(const Tensor& self, c10::optional gradient(const Tensor& self, TensorList coordinates, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, c10::optional(coordinates.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper(self, coordinates, dim, edge_order); } @@ -941,7 +986,7 @@ std::vector gradient(const Tensor& self, TensorList coordinates, c10::op const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, c10::optional(coordinates.size()), - dim.has_value() ? c10::optional(processed_dim) : c10::nullopt, + dim.has_value() ? 
at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper(self, coordinates, processed_dim, edge_order); } @@ -949,7 +994,7 @@ std::vector gradient(const Tensor& self, TensorList coordinates, c10::op std::vector gradient(const Tensor& self, c10::ArrayRef spacing, IntArrayRef dim, int64_t edge_order) { pre_check_gradient(self, c10::optional(spacing.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } @@ -958,7 +1003,7 @@ std::vector gradient(const Tensor& self, ArrayRef spacing, c10:: const auto processed_dim = gradient_dim_preprocess(self, dim); pre_check_gradient(self, c10::optional(spacing.size()), - dim.has_value() ? c10::optional(processed_dim) : c10::nullopt, + dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); } @@ -969,7 +1014,7 @@ std::vector gradient(const Tensor& self, const Scalar& unit_size, IntArr std::vector spacing(dim.size(), unit_size); pre_check_gradient(self, c10::optional(spacing.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } @@ -983,7 +1028,7 @@ std::vector gradient(const Tensor& self, const c10::optional& un unit_size.has_value() ? unit_size.value() : 1.0) ; pre_check_gradient(self, unit_size.has_value() ? c10::optional(spacing.size()) : c10::nullopt, - dim.has_value() ? c10::optional(processed_dim) : c10::nullopt, + dim.has_value() ? at::OptionalIntArrayRef(processed_dim) : c10::nullopt, edge_order); return gradient_helper_float(self, spacing, processed_dim, edge_order); } @@ -992,7 +1037,7 @@ std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o std::vector spacing(dim.size(), 1.0) ; pre_check_gradient(self, c10::optional(spacing.size()), - c10::optional(dim), + at::OptionalIntArrayRef(dim), edge_order); return gradient_helper_float(self, spacing, dim, edge_order); } @@ -1054,10 +1099,6 @@ Tensor& nansum_out(const Tensor& self, IntArrayRef dim, return result; } -Tensor nansum(const Tensor &self, c10::optional dtype) { - return at::native::nansum(self, std::vector{}, false, dtype); -} - Tensor nansum(const Tensor& self, IntArrayRef dim, bool keepdim, c10::optional opt_dtype) { ScalarType dtype = get_dtype_from_self(self, opt_dtype, true); Tensor result = create_reduction_result(self, dim, keepdim, dtype); @@ -1262,22 +1303,29 @@ Tensor& logsumexp_out(const Tensor& self, IntArrayRef dims, bool keepdim, Tensor result.scalar_type()); { NoNamesGuard guard; - logsumexp_out_impl(result, self, dims, keepdim); + if (at::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { + // for integral inputs, promote input to default floating type. 
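With the dtype checks added to the mean() meta function above, integral inputs are rejected with an explicit message unless a floating point or complex dtype is supplied, and norm() gains analogous checks for its input and requested output dtypes. A short sketch of the user-visible behavior, assuming current torch semantics:

```python
import torch

x = torch.arange(4)                    # int64 input

# Averaging an integral tensor needs an explicit floating point (or complex) dtype
try:
    x.mean()
except RuntimeError as e:
    print("rejected:", e)              # "mean(): could not infer output dtype ..."

print(x.mean(dtype=torch.float32))     # tensor(1.5000)
print(x.float().mean())                # tensor(1.5000)
```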
+ auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); + logsumexp_out_impl(result, self.to(default_dtype), dims, keepdim); + } else { + logsumexp_out_impl(result, self, dims, keepdim); + } } namedinference::propagate_names_for_reduction(result, self, dims, keepdim); return result; } Tensor logsumexp(const Tensor& self, IntArrayRef dims, bool keepdim) { - Tensor result; - auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); + TensorOptions result_options; if (at::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { - result = at::empty({0}, self.options().dtype(default_dtype)); - return at::native::logsumexp_out(self.to(default_dtype), dims, keepdim, result); + // even for integral inputs, result is floating dtype + auto default_dtype = at::typeMetaToScalarType(c10::get_default_dtype()); + result_options = self.options().dtype(default_dtype); } else { - result = at::empty({0}, self.options()); - return at::native::logsumexp_out(self, dims, keepdim, result); + result_options = self.options(); } + auto result = at::empty({0}, result_options); + return at::native::logsumexp_out(self, dims, keepdim, result); } Tensor logsumexp(const Tensor& self, DimnameList dims, bool keepdim) { @@ -1415,42 +1463,20 @@ TORCH_IMPL_FUNC(any_all_out)(const Tensor& self, const Tensor& result) { allany_impl<0>(self, result, {}, false, or_stub); } -Tensor &amin_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { - TORCH_CHECK(self.scalar_type() == result.scalar_type(), "Expected the dtype for input and out to match, but got ", - self.scalar_type(), " for input's dtype and ", result.scalar_type(), " for out's dtype."); - if (self.numel() == 0) { - zero_numel_check_dims(self, dim, "amin()"); - } - - auto iter = make_reduction("amin", result, self, dim, keepdim, self.scalar_type()); +TORCH_IMPL_FUNC(amin_out) (const Tensor& self, IntArrayRef dim, bool keepdim, const Tensor& result) { + auto iter = + meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); if (iter.numel() != 0) { min_values_stub(iter.device_type(), iter); } - return result; } -Tensor amin(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options()); - return at::amin_out(result, self, dim, keepdim); -} - -Tensor &amax_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { - TORCH_CHECK(self.scalar_type() == result.scalar_type(), "Expected the dtype for input and out to match, but got ", - self.scalar_type(), " for input's dtype and ", result.scalar_type(), " for out's dtype."); - if (self.numel() == 0) { - zero_numel_check_dims(self, dim, "amax()"); - } - - auto iter = make_reduction("amax", result, self, dim, keepdim, self.scalar_type()); +TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef dim, bool keepdim, const Tensor& result) { + auto iter = + meta::make_reduction(self, result, dim, keepdim, self.scalar_type()); if (iter.numel() != 0) { max_values_stub(iter.device_type(), iter); } - return result; -} - -Tensor amax(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options()); - return at::amax_out(result, self, dim, keepdim); } template @@ -1556,7 +1582,7 @@ static double std_var_all_cpu(const Tensor& self, int64_t correction, bool take_ static Tensor& std_var_out( const char* fname, Tensor& result, const Tensor& self, - c10::optional dim, c10::optional correction_opt, + at::OptionalIntArrayRef dim, c10::optional correction_opt, bool keepdim, bool take_sqrt) { 
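The reduction changes above also make logsumexp promote integral inputs to the default floating dtype in both the functional and out= paths, and turn amax / amin into structured kernels whose meta functions check that an explicit out tensor matches the input dtype. An illustrative sketch, assuming current torch semantics:

```python
import torch

# logsumexp on an integral tensor computes in, and returns, the default float dtype
x = torch.arange(4)                        # int64
print(torch.logsumexp(x, dim=0).dtype)     # torch.float32 under the default dtype

# amax / amin require an out tensor to have the same dtype as the input
a = torch.randn(2, 3)
print(torch.amax(a, dim=1))                # per-row maxima, shape (2,)
try:
    torch.amax(a, dim=1, out=torch.empty(2, dtype=torch.float64))
except RuntimeError as e:
    print("dtype mismatch:", e)
```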
TORCH_CHECK(self.device().is_cpu() || self.device().is_cuda(), "std and var only supports tensors on a CPU or CUDA device, but got: ", @@ -1569,7 +1595,7 @@ static Tensor& std_var_out( if (at::isComplexType(self.scalar_type())) { // For complex, calculate variance of real and imaginary components // seperately then add to get overall variance. - ScalarType dtype = c10::toValueType(get_dtype_from_result(result, {})); + ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result, {})); Tensor real_in = at::real(self); Tensor real_out = at::empty({0}, self.options().dtype(dtype)); std_var_out( @@ -1624,7 +1650,7 @@ static Tensor& std_var_out( static std::tuple std_var_mean_out( const char* fname, Tensor& result1, Tensor& result2, const Tensor& self, - c10::optional dim, c10::optional correction_opt, + at::OptionalIntArrayRef dim, c10::optional correction_opt, bool keepdim, bool take_sqrt) { AT_ASSERT(result1.defined() && result2.defined()); TORCH_CHECK(self.device().is_cpu() || self.is_cuda(), @@ -1634,7 +1660,7 @@ static std::tuple std_var_mean_out( fname, " only supports strided layout, got: ", self.layout()); TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), fname, " only support floating point and complex dtypes"); - TORCH_CHECK(result1.scalar_type() == c10::toValueType(result2.scalar_type()), + TORCH_CHECK(result1.scalar_type() == c10::toRealValueType(result2.scalar_type()), fname, " expected result1 to be real and match the precision of result2. Got ", result1.scalar_type(), " and ", result2.scalar_type(), "."); @@ -1642,7 +1668,7 @@ static std::tuple std_var_mean_out( // For complex, calculate for real and imaginary components seperately then combine as: // variance = var_real + var_imag // mean = mean_real + j * mean_imag - ScalarType dtype = c10::toValueType(get_dtype_from_result(result1, {})); + ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result1, {})); Tensor real_in = at::real(self); Tensor real_out_var = at::empty({0}, self.options().dtype(dtype)); Tensor real_out_mean = at::empty({0}, self.options().dtype(dtype)); @@ -1695,13 +1721,13 @@ static std::tuple std_var_mean_out( std::tuple var_mean( const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::var_mean(self, /*dim=*/c10::optional(dim), + return at::var_mean(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } std::tuple std_mean( const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::std_mean(self, /*dim=*/c10::optional(dim), + return at::std_mean(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 
1 : 0}, keepdim); } @@ -1724,11 +1750,11 @@ std::tuple var_mean_out( static TensorOptions options_to_value_type(TensorOptions opts) { auto scalar_type = typeMetaToScalarType(opts.dtype()); - return opts.dtype(c10::toValueType(scalar_type)); + return opts.dtype(c10::toRealValueType(scalar_type)); } std::tuple var_mean( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); @@ -1737,7 +1763,7 @@ std::tuple var_mean( } std::tuple std_mean( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result1 = at::empty({0}, options_to_value_type(self.options())); Tensor result2 = at::empty({0}, self.options()); @@ -1751,12 +1777,12 @@ Tensor var(const Tensor& self, bool unbiased) { } Tensor var(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::var(self, /*dim=*/c10::optional(dim), + return at::var(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } Tensor& var_out(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim, Tensor& result) { - return at::var_out(result, self, /*dim=*/c10::optional(dim), + return at::var_out(result, self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } @@ -1766,35 +1792,35 @@ Tensor std(const Tensor& self, bool unbiased) { } Tensor std(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { - return at::std(self, /*dim=*/c10::optional(dim), + return at::std(self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } Tensor& std_out(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim, Tensor& result) { - return at::std_out(result, self, /*dim=*/c10::optional(dim), + return at::std_out(result, self, /*dim=*/at::OptionalIntArrayRef(dim), /*correction=*/int64_t{unbiased ? 1 : 0}, keepdim); } -Tensor std(const Tensor& self, c10::optional dim, +Tensor std(const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& std_out( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, Tensor& result) { return std_var_out("std", result, self, dim, correction, keepdim, true); } Tensor& var_out( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, Tensor& result) { return std_var_out("var", result, self, dim, correction, keepdim, false); } Tensor var( - const Tensor& self, c10::optional dim, + const Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { Tensor result = at::empty({0}, options_to_value_type(self.options())); return std_var_out("var", result, self, dim, correction, keepdim, false); @@ -1971,12 +1997,25 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { // backward function for those operators; it propagates the grad to the // specific value locations referred to at `indices`. 
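The std / var refactor above keeps the legacy unbiased overloads as thin wrappers that map unbiased to correction (correction = 1 when unbiased, 0 otherwise) and pass the reduction dims as at::OptionalIntArrayRef. A minimal sketch of that equivalence at the Python level:

```python
import torch

x = torch.randn(5, 3)
n = x.size(0)

# unbiased=True corresponds to correction=1 (Bessel's correction): divide by n - 1
manual_unbiased = ((x - x.mean(dim=0)) ** 2).sum(dim=0) / (n - 1)
print(torch.allclose(torch.var(x, dim=0, unbiased=True), manual_unbiased))

# unbiased=False corresponds to correction=0: divide by n
manual_biased = ((x - x.mean(dim=0)) ** 2).sum(dim=0) / n
print(torch.allclose(torch.var(x, dim=0, unbiased=False), manual_biased))

# var_mean / std_mean return the statistic together with the mean in one call
var, mean = torch.var_mean(x, dim=0, unbiased=True)
std, _ = torch.std_mean(x, dim=0, unbiased=True)
print(torch.allclose(var, manual_unbiased), torch.allclose(std ** 2, var))
```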
Tensor value_selecting_reduction_backward(const Tensor& grad, int64_t dim, const Tensor& indices, IntArrayRef sizes, bool keepdim) { + auto inplace_scatter_if_not_tensor_subclass = + [&](const Tensor& grad_out, const Tensor& indices_) { + auto grad_in = at::zeros(sizes, grad_out.options()); + if (areAnyTensorSubclassLike({grad, indices})) { + return grad_in.scatter(dim, indices_, grad_out); + } + return grad_in.scatter_(dim, indices_, grad_out); + }; + if (!keepdim && sizes.size() > 0) { auto grad_ = grad.unsqueeze(dim); auto indices_ = indices.unsqueeze(dim); - return at::zeros(sizes, grad_.options()).scatter_(dim, indices_, grad_); + return inplace_scatter_if_not_tensor_subclass(grad_, indices_); } - return at::zeros(sizes, grad.options()).scatter_(dim, indices, grad); + return inplace_scatter_if_not_tensor_subclass(grad, indices); +} + +Tensor sum_csr(const Tensor &self, c10::optional dtype) { + return self.values().sum(dtype); } } // namespace native diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index 3c3bff454178..aa0ed5462db2 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -1,13 +1,21 @@ #pragma once #include -#include +#include #include #include #include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { // Maximum and minimum possible scalar values, including infinities @@ -59,7 +67,7 @@ inline bool _dimreduce_return_trivial(const Tensor &result, const Tensor &self, } inline bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &self, - int64_t dim, bool keepdim, const char *fn_name) { + int64_t /*dim*/, bool /*keepdim*/, const char* /*fn_name*/) { if (self.numel() == 1 && self.ndimension() == 0) { result.resize_({}); result.fill_(self); @@ -128,7 +136,7 @@ inline DimVector shape_from_dim_mask(const Tensor& self, DimMask mask, bool keep static void resize_reduction_result( Tensor& result, const Tensor& self, DimMask mask, bool keepdim, - ScalarType dtype) + ScalarType /*dtype*/) { auto shape = shape_from_dim_mask(self, mask, keepdim); TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor."); @@ -160,7 +168,7 @@ static Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, static TensorIterator make_reduction( const char* name, Tensor& result, const Tensor& self, - c10::optional dim_opt, + at::OptionalIntArrayRef dim_opt, bool keepdim, ScalarType in_dtype, ScalarType out_dtype) { // check that result type and dtype match if provided TORCH_CHECK( @@ -185,20 +193,22 @@ static TensorIterator make_reduction( static C10_UNUSED TensorIterator make_reduction( const char* name, Tensor& result, const Tensor& self, - c10::optional dim, bool keepdim, ScalarType out_dtype) { + at::OptionalIntArrayRef dim, bool keepdim, ScalarType out_dtype) { // special case for type promotion in mixed precision, improves computational // efficiency. // not generalize this to common mismatched input/output types to avoid cross // product of templated kernel launches. const bool gpu_lowp_to_f32 = ( self.is_cuda() && (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) && out_dtype == kFloat); - auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() : out_dtype; + auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() + : self.is_complex() ? 
c10::toComplexType(out_dtype) + : out_dtype; return make_reduction(name, result, self, dim, keepdim, in_dtype, out_dtype); } static TensorIterator make_reduction( const char* name, Tensor& result1, Tensor& result2, const Tensor& self, - c10::optional dim_opt, bool keepdim, ScalarType dtype1, + at::OptionalIntArrayRef dim_opt, bool keepdim, ScalarType dtype1, ScalarType dtype2) { // check that result type and dtype match if provided TORCH_CHECK( @@ -235,7 +245,7 @@ static TensorIterator make_reduction( static C10_UNUSED TensorIterator make_reduction( const char* name, Tensor& result1, Tensor& result2, const Tensor& self, - c10::optional dim, bool keepdim, ScalarType dtype) { + at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype) { return make_reduction(name, result1, result2, self, dim, keepdim, dtype, dtype); } @@ -250,7 +260,11 @@ static void zero_numel_check_dims(const Tensor& self, const int64_t dim, const c } } -static C10_UNUSED void zero_numel_check_dims(const Tensor& self, const IntArrayRef dim, const char *fn_name) { +static void zero_numel_check_dims(const Tensor& self, const IntArrayRef dim, const char *fn_name) { + TORCH_CHECK( + !dim.empty(), + fn_name, ": Expected reduction dim to be specified for input.numel() == 0. ", + "Specify the reduction dim with the 'dim' argument."); for (const int64_t d : dim) { zero_numel_check_dims(self, d, fn_name); } @@ -357,7 +371,7 @@ static TensorIterator make_reduction( IntArrayRef dims, bool keepdim, ScalarType dtype1, - ScalarType dtype2) { + ScalarType /*dtype2*/) { int64_t ndim = self.dim(); auto mask = at::native::make_dim_mask(dims, ndim); auto viewed_result1 = at::native::review_reduce_result(result1, ndim, mask, keepdim); diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index 81eba80af1dd..d90d00e9ab40 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include namespace at { @@ -266,76 +267,43 @@ inline void reflection_pad1d_out_loop( void reflection_pad1d_out_template( const Tensor& output, const Tensor& input_, IntArrayRef padding) { - int64_t dim_plane = 0; - int64_t dim_w = 1; - int64_t nbatch = 1; - // allow dim=0 only in the batch dimension. - TORCH_CHECK( - (input_.ndimension() == 2 && input_.size(1) != 0) || - (input_.ndimension() == 3 && input_.size(1) != 0 && input_.size(2) != 0), - "2D or 3D (batch mode) tensor expected for input, but got: ", input_); - - if (input_.ndimension() == 3) { - nbatch = input_.size(0); - dim_w++; - dim_plane++; - } - - /* sizes */ - auto pad_l = padding[0]; - auto pad_r = padding[1]; - - int64_t nplane = input_.size(dim_plane); - int64_t input_w = input_.size(dim_w); - int64_t output_w = input_w + pad_l + pad_r; - - TORCH_CHECK(pad_l < input_w && pad_r < input_w, "Argument #4: Padding size " - "should be less than the corresponding input dimension, but got: padding (", - pad_l, ", ", pad_r, ") at dimension ", dim_w, " of input ", input_.sizes()); - - TORCH_CHECK(output_w >= 1 , 2, - "input (W: ", input_w, ")is too small. 
Calculated output W: ", output_w); - /* get contiguous input */ Tensor input = input_.contiguous(); - /* resize output */ if (input.ndimension() == 2) { - output.resize_({nplane, output_w}); if (input.is_quantized()) { AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qreflection_pad1d", [&]() { reflection_pad1d_out_frame( input.data_ptr(), output.data_ptr(), - nplane, - input_w, output_w, - pad_l); + input.size(0), + input.size(1), output.size(-1), + padding[0]); }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(input.scalar_type(), "reflection_pad1d", [&] { reflection_pad1d_out_frame( input.data_ptr(), output.data_ptr(), - nplane, - input_w, output_w, - pad_l); + input.size(0), + input.size(1), output.size(-1), + padding[0]); }); } } else { - output.resize_({nbatch, nplane, output_w}); if (input.is_quantized()) { AT_DISPATCH_QINT_TYPES(input.scalar_type(), "qreflection_pad1d", [&]() { reflection_pad1d_out_loop( input.data_ptr(), output.data_ptr(), - nbatch, nplane, - input_w, output_w, - pad_l); + output.size(0), input.size(1), + input.size(2), output.size(-1), + padding[0]); }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(input.scalar_type(), "reflection_pad1d", [&] { reflection_pad1d_out_loop( input.data_ptr(), output.data_ptr(), - nbatch, nplane, - input_w, output_w, - pad_l); + output.size(0), input.size(1), + input.size(2), output.size(-1), + padding[0]); }); } } @@ -854,25 +822,18 @@ static void reflection_pad3d_backward_out_loop( } // namespace +// TODO: I tihnk this function should be removed since we implement it with +// TORCH_IMPL_FUNC below Tensor& reflection_pad1d_out_cpu(const Tensor& input, IntArrayRef padding, Tensor& output) { reflection_pad1d_out_template(output, input, padding); return output; } -Tensor reflection_pad1d_cpu(const Tensor& input, IntArrayRef padding) { - Tensor output; - if (input.is_quantized()) { - if (input.qscheme() == kPerTensorAffine) { - output = at::_empty_affine_quantized({0}, input.options(), - input.q_scale(), - input.q_zero_point()); - } else { - TORCH_CHECK(false, "Only per tensor quantization is supported"); - } - } else { - output = at::empty({0}, input.options()); - } +Tensor& reflection_pad1d_out_quantized_cpu(const Tensor& input, IntArrayRef padding, + Tensor& output) { + TORCH_CHECK(input.qscheme() == kPerTensorAffine, "Only per tensor quantization is supported"); + set_quantizer_(output, make_per_tensor_affine_quantizer(input.q_scale(), input.q_zero_point(), input.scalar_type())); reflection_pad1d_out_template(output, input, padding); return output; } @@ -940,18 +901,16 @@ Tensor& reflection_pad2d_out_cpu(const Tensor& input, IntArrayRef padding, } Tensor reflection_pad2d_cpu(const Tensor& input, IntArrayRef padding) { - Tensor output; - if (input.is_quantized()) { - if (input.qscheme() == kPerTensorAffine) { - output = at::_empty_affine_quantized({0}, input.options(), + Tensor output = at::empty({0}, input.options()); + reflection_pad2d_out_template(output, input, padding); + return output; +} + +Tensor reflection_pad2d_quantized_cpu(const Tensor& input, IntArrayRef padding) { + TORCH_CHECK(input.qscheme() == kPerTensorAffine, "Only per tensor quantization is supported"); + Tensor output = at::_empty_affine_quantized({0}, input.options(), input.q_scale(), input.q_zero_point()); - } else { - TORCH_CHECK(false, "Only per tensor quantization is supported"); - } - } else { - output = at::empty({0}, input.options()); - } reflection_pad2d_out_template(output, input, padding); return output; } @@ -1007,7 +966,7 @@ 
TORCH_IMPL_FUNC(reflection_pad3d_out_cpu) if (batch_mode) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_cpu", [&] { auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); auto nbatch = input.size(0); @@ -1028,7 +987,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_out_cpu) }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_cpu", [&] { auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); reflection_pad3d_out_frame( @@ -1085,7 +1044,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cpu)(const Tensor& grad_output, if (batch_mode) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_backward_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { reflection_pad3d_backward_out_loop( grad_input.data_ptr(), grad_output_.data_ptr(), @@ -1103,7 +1062,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cpu)(const Tensor& grad_output, }); } else { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( - kHalf, input.scalar_type(), "replication_pad3d_backward_cpu", [&] { + kHalf, input.scalar_type(), "reflection_pad3d_backward_cpu", [&] { reflection_pad3d_backward_out_frame( grad_input.data_ptr(), grad_output_.data_ptr(), diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index 9751f2ec8be7..dadbfb0c2374 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -1,6 +1,14 @@ #pragma once -#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index d89150cee267..36339aae8445 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -230,7 +230,7 @@ static void replication_pad1d_out_frame( long nslices, long iwidth, long owidth, - int pad_l, int pad_r) + int pad_l) { int iStartX = std::max(0, -pad_l); int oStartX = std::max(0, pad_l); @@ -263,14 +263,14 @@ static void replication_pad1d_out_batch( long nslices, long iwidth, long owidth, - int pad_l, int pad_r, + int pad_l, int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { for (const auto p : c10::irange(start, end)) { scalar_t *input_p = input_data+p*nslices*iwidth; scalar_t *output_p = output_data+p*nslices*owidth; - replication_pad1d_out_frame(input_p, output_p, nslices, iwidth, owidth, pad_l, pad_r); + replication_pad1d_out_frame(input_p, output_p, nslices, iwidth, owidth, pad_l); } }); } @@ -281,7 +281,7 @@ static void replication_pad1d_backward_out_frame( long nslices, long iwidth, long owidth, - int pad_l, int pad_r) + int pad_l) { int iStartX = std::max(0, -pad_l); int oStartX = std::max(0, pad_l); @@ -322,7 +322,7 @@ static void replication_pad1d_backward_out_batch( scalar_t *ginput_p = ginput_data + p * nslices * iwidth; scalar_t *goutput_p = goutput_data + p * nslices * owidth; replication_pad1d_backward_out_frame(ginput_p, goutput_p, - nslices, iwidth, owidth, pad_l, pad_r); + nslices, iwidth, owidth, pad_l); } }); } @@ -334,7 +334,7 @@ static void replication_pad2d_out_frame( int64_t iwidth, int64_t iheight, int64_t owidth, int64_t oheight, int pad_l, int pad_r, - int pad_t, int pad_b) + int pad_t) { int iStartX = 
std::max(0, -pad_l); int iStartY = std::max(0, -pad_t); @@ -381,7 +381,7 @@ static void replication_pad2d_out_batch( int64_t iwidth, int64_t iheight, int64_t owidth, int64_t oheight, int pad_l, int pad_r, - int pad_t, int pad_b, + int pad_t, int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { @@ -389,7 +389,7 @@ static void replication_pad2d_out_batch( scalar_t *input_p = input_data+p*nslices*iwidth*iheight; scalar_t *output_p = output_data+p*nslices*owidth*oheight; replication_pad2d_out_frame(input_p, output_p, nslices, - iwidth, iheight, owidth, oheight, pad_l, pad_r, pad_t, pad_b); + iwidth, iheight, owidth, oheight, pad_l, pad_r, pad_t); } }); } @@ -811,7 +811,6 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cpu) ( constexpr int64_t dimslices = -2; int64_t pad_l = paddingSize[0]; - int64_t pad_r = paddingSize[1]; /* get contiguous input */ auto input = input_.contiguous(); @@ -837,7 +836,7 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cpu) ( nslices, iwidth, owidth, - pad_l, pad_r); + pad_l); } ); } @@ -852,7 +851,7 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cpu) ( nslices, iwidth, owidth, - pad_l, pad_r, + pad_l, nbatch); } ); @@ -907,7 +906,7 @@ TORCH_IMPL_FUNC(replication_pad1d_backward_out_cpu) ( nslices, iwidth, owidth, - pad_l, pad_r); + pad_l); } ); } @@ -969,7 +968,7 @@ TORCH_IMPL_FUNC(replication_pad2d_out_cpu) ( iwidth, iheight, owidth, oheight, pad_l, pad_r, - pad_t, pad_b); + pad_t); } ); } @@ -983,7 +982,7 @@ TORCH_IMPL_FUNC(replication_pad2d_out_cpu) ( iwidth, iheight, owidth, oheight, pad_l, pad_r, - pad_t, pad_b, + pad_t, nbatch); } ); diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index f05a18b7806d..08286f3983cc 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -17,7 +17,7 @@ bool resize_output_check(const Tensor& output, IntArrayRef shape) { TORCH_WARN( "An output with one or more elements was resized since it had ", "shape ", output.sizes(), ", which does not match the required ", - "output shape ", shape, ".", + "output shape ", shape, ". ", "This behavior is deprecated, and in a future PyTorch release outputs ", "will not be resized unless they have zero elements. 
You can explicitly ", "reuse an out tensor t by resizing it, inplace, to zero elements with ", @@ -45,6 +45,12 @@ bool resize_output(const Tensor& output, IntArrayRef shape) { } } +const Tensor& _resize_output_(const Tensor& self, IntArrayRef shape, c10::Device device) { + TORCH_CHECK(self.device() == device, "out Tensor doesn't have the correct device set"); + at::native::resize_output(self, shape); + return self; +} + void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { TORCH_CHECK(storage->resizable(), "Trying to resize storage that is not resizable"); diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 3540ef8b21ac..c6fe2b3d2146 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -30,22 +31,16 @@ TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape); TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); -static inline void maybe_resize_storage_cpu(TensorImpl* self, uint64_t new_size) { +static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) { // It does not make sense to try to resize a storage // to hold 0 elements, and this can break // if storage_offset is positive but // new_size is 0, so just bail in that case // (same comment is in cuda/Resize.h) - if (new_size == 0) { + if (self->numel() == 0) { return; } - const auto new_size_bytes_i = - (new_size + self->storage_offset()) * self->dtype().itemsize(); - TORCH_CHECK(!overflows(new_size_bytes_i), "Requested storage size (", - new_size_bytes_i, ") cannot be represented as a size_t"); - const auto new_size_bytes = static_cast(new_size_bytes_i); - const Storage& storage = self->unsafe_storage(); if (!storage) { auto new_storage = c10::make_intrusive( @@ -62,21 +57,25 @@ static inline void maybe_resize_storage_cpu(TensorImpl* self, uint64_t new_size) inline TensorImpl* resize_impl_cpu_( TensorImpl* self, IntArrayRef size, - c10::optional stride, + at::OptionalIntArrayRef stride, bool resize_storage = true) { - if (self->sizes() == size && (!stride || self->strides() == stride)) { + if (self->sizes() == size && (!stride || self->strides() == stride.value())) { return self; } - int64_t storage_size = 1; + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; if (stride) { self->set_sizes_and_strides(size, *stride); - // NB: storage size can be different from numel. 
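The storage-size computation just below replaces raw element counts with at::detail::computeStorageNbytes, which folds the strides, the storage offset and the item size into a byte count. A minimal sketch of that arithmetic, with an illustrative helper name and simplified zero-size handling rather than the actual ATen implementation:

#include <cstddef>
#include <cstdint>
#include <vector>

// Largest reachable element is storage_offset + sum((size[d] - 1) * stride[d]),
// so the storage must hold that index plus one, times the element size.
static size_t sketch_storage_nbytes(const std::vector<int64_t>& sizes,
                                    const std::vector<int64_t>& strides,
                                    size_t itemsize,
                                    int64_t storage_offset) {
  int64_t max_index = storage_offset;
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (sizes[d] == 0) {
      return 0;  // an empty tensor needs no storage at all
    }
    max_index += (sizes[d] - 1) * strides[d];
  }
  return static_cast<size_t>(max_index + 1) * itemsize;
}

// e.g. a float tensor viewed with sizes {2, 3}, strides {10, 1} and offset 4
// needs (4 + 1 * 10 + 2 * 1 + 1) * 4 = 68 bytes of backing storage.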
- storage_size = storage_size_for(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); } else { self->set_sizes_contiguous(size); - storage_size = self->numel(); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); } + if (resize_storage) { maybe_resize_storage_cpu(self, storage_size); } @@ -158,6 +157,12 @@ inline void setStrided( IntArrayRef stride, int64_t storage_offset) { TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape"); + for (auto val : stride) { + TORCH_CHECK(val >= 0, + "as_strided: Negative strides are not supported at the moment, " + "got strides: ", stride); + } + auto* self_ = self.unsafeGetTensorImpl(); checkInBoundsForStorage( size, stride, storage_offset, self_->dtype(), self_->storage()); @@ -170,11 +175,6 @@ inline void setStrided( if (self_->sizes() == size && self_->strides() == stride) { return; } - for (auto val : stride) { - TORCH_CHECK(val >= 0, - "as_strided: Negative strides are not supported at the moment, " - "got strides: ", stride); - } self_->set_sizes_and_strides(size, stride); } diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index aecfffadb020..7342c4806d44 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -20,8 +20,8 @@ Scalar item(const Tensor& self) { Scalar _local_scalar_dense_cpu(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, self.scalar_type(), "_local_scalar_dense_cpu", [&] { scalar_t value = *self.data_ptr(); r = Scalar(value); }); diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index 1b71eb40975d..92e1edeb5fe0 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include diff --git a/aten/src/ATen/native/SegmentReduce.h b/aten/src/ATen/native/SegmentReduce.h index 11a399ae77a1..1e5b87eefb6d 100644 --- a/aten/src/ATen/native/SegmentReduce.h +++ b/aten/src/ATen/native/SegmentReduce.h @@ -1,10 +1,12 @@ #pragma once -#include #include +#include #include namespace at { +class Tensor; + namespace native { enum SegmentReductionType { MAX, MEAN, MIN, SUM }; diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index cd58d4f48ee8..0519bfa57e61 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -344,17 +344,17 @@ template struct AbsSwitch {}; template -inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch s) { +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { return static_cast(data); } template -inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch s) { +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { return static_cast(std::abs(data)); } template -inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch s) { +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { return static_cast(std::abs(data)); } diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index b4635365e432..6d9f1324eb28 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ 
b/aten/src/ATen/native/SoftMax.cpp @@ -9,6 +9,7 @@ #include #include +#include #include namespace at { @@ -148,7 +149,7 @@ void host_softmax( int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); parallel_for( 0, outer_size * inner_size, grain_size, - [&](int64_t begin, int64_t end) { + [&](int64_t begin, int64_t end) __ubsan_ignore_float_divide_by_zero__ { for (const auto i : c10::irange(begin, end)) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; @@ -170,7 +171,7 @@ void host_softmax( } } else { for (const auto d : c10::irange(0, dim_size)) { - if (mask_data[d * dim_stride]) { + if (!mask_data[d * dim_stride]) { max_input = is_meaningful_max ? std::max(max_input, input_data[d * dim_stride]) : input_data[d * dim_stride]; @@ -183,7 +184,7 @@ void host_softmax( acc_type tmpsum = 0; for (const auto d : c10::irange(dim_size)) { scalar_t z{}; - if (!MaskedSoftMax || mask_data[d * dim_stride]) { + if (!MaskedSoftMax || !mask_data[d * dim_stride]) { z = std::exp(input_data[d * dim_stride] - max_input); } else { z = 0; @@ -196,6 +197,8 @@ void host_softmax( if (LogSoftMax) { tmpsum = std::log(tmpsum); + } else if (tmpsum == 0) { + tmpsum = std::numeric_limits::quiet_NaN(); } else { tmpsum = 1 / tmpsum; } @@ -214,12 +217,13 @@ void host_softmax( }); } -template +template void host_softmax_backward( const Tensor& gI, const Tensor& grad, const Tensor& output, - int64_t dim) { + int64_t dim, + bool* mask = nullptr) { int64_t outer_size = 1; int64_t dim_size = grad.size(dim); @@ -235,6 +239,7 @@ void host_softmax_backward( scalar_t* gradInput_data_base = gI.data_ptr(); scalar_t* output_data_base = output.data_ptr(); scalar_t* gradOutput_data_base = grad.data_ptr(); + bool* mask_data_base = mask; int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); parallel_for( 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { @@ -247,19 +252,28 @@ void host_softmax_backward( output_data_base + outer_idx * outer_stride + inner_idx; const scalar_t* gradOutput_data = gradOutput_data_base + outer_idx * outer_stride + inner_idx; + bool* mask_data = nullptr; + if (MaskedSoftMax) { + mask_data = mask_data_base + outer_idx * outer_stride + inner_idx; + } acc_type sum = 0; for (const auto d : c10::irange(dim_size)) { - if (LogSoftMax) { - sum += gradOutput_data[d * dim_stride]; - } else { - sum += - gradOutput_data[d * dim_stride] * output_data[d * dim_stride]; + if (!MaskedSoftMax || !mask_data[d * dim_stride]) { + if (LogSoftMax) { + sum += gradOutput_data[d * dim_stride]; + } else { + sum += + gradOutput_data[d * dim_stride] * output_data[d * dim_stride]; + } } } for (const auto d : c10::irange(dim_size)) { - if (LogSoftMax) { + if (MaskedSoftMax && mask_data[d * dim_stride]) { + gradInput_data[d * dim_stride] = 0; + } + else if (LogSoftMax) { gradInput_data[d * dim_stride] = gradOutput_data[d * dim_stride] - std::exp(output_data[d * dim_stride]) * sum; } else { @@ -360,7 +374,10 @@ TORCH_IMPL_FUNC(softmax_backward_cpu_out) } else { AT_DISPATCH_FLOATING_TYPES_AND( at::ScalarType::BFloat16, grad.scalar_type(), "softmax_backward", [&] { - host_softmax_backward(grad_input, grad_, output, dim_); + host_softmax_backward< + scalar_t, + false /* LogSoftMax */, + false /* MaskedSoftmax */>(grad_input, grad_, output, dim_); }); } } @@ -389,7 +406,10 @@ TORCH_IMPL_FUNC(log_softmax_backward_cpu_out) ( grad.scalar_type(), "log_softmax_backward", [&] { - host_softmax_backward(grad_input, grad_, output_, dim_); + host_softmax_backward< + 
scalar_t, + true /* LogSoftMax */, + false /* MaskedSoftMax */>(grad_input, grad_, output_, dim_); }); } } @@ -418,6 +438,43 @@ Tensor softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + Tensor& output_) { + Tensor output_temp; + if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && + dtype == ScalarType::Float) { + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_softmax_out(output_temp, input_, dim_, true); + } else { + at::_softmax_out(output_, input_, dim_, true); + } + } else { + Tensor converted = + dtype.has_value() ? input_.toType(dtype.value()) : input_; + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_softmax_out(output_temp, converted, dim_, false); + } else { + at::_softmax_out(output_, converted, dim_, false); + } + } + + if (!output_.is_contiguous()) { + output_.resize_(output_temp.sizes()); + output_.copy_(output_temp); + } + + return output_; +} + // special_softmax, alias for softmax Tensor special_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype) { return at::softmax(input_, dim_, dtype); @@ -446,6 +503,43 @@ Tensor log_softmax(const Tensor& input_, const int64_t dim_, c10::optional dtype, + Tensor& output_) { + Tensor output_temp; + if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && + dtype == ScalarType::Float) { + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_log_softmax_out(output_temp, input_, dim_, true); + } else { + at::_log_softmax_out(output_, input_, dim_, true); + } + } else { + Tensor converted = + dtype.has_value() ? input_.toType(dtype.value()) : input_; + if (!output_.is_contiguous()) { + auto options = + TensorOptions().dtype(output_.dtype()).device(output_.device()); + output_temp = at::empty(output_.sizes(), options); + at::_log_softmax_out(output_temp, converted, dim_, false); + } else { + at::_log_softmax_out(output_, converted, dim_, false); + } + } + + if (!output_.is_contiguous()) { + output_.resize_(output_temp.sizes()); + output_.copy_(output_temp); + } + + return output_; +} + Tensor special_log_softmax(const Tensor& input, const int64_t dim, c10::optional dtype) { return at::log_softmax(input, dim, dtype); } @@ -466,23 +560,64 @@ Tensor log_softmax(const Tensor& self, Dimname dim, optional dtype) return at::log_softmax(self, dimname_to_position(self, dim), dtype); } -Tensor masked_softmax_cpu(const Tensor& input, const Tensor& mask) { - Tensor output = at::empty_like(input, input.options()); +Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10::optional dim_) { TORCH_CHECK( - input.sizes() == mask.sizes(), "Mask shape should match input shape"); - TORCH_CHECK(mask.is_contiguous(), "Mask should always be contiguous"); + input_.sizes() == mask_.sizes(), "Mask shape should match input shape"); TORCH_CHECK( - mask.scalar_type() == ScalarType::Bool, + mask_.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + Tensor output = at::empty_like(input_, input_.options()); + auto input = input_.contiguous(); + auto mask = mask_.contiguous(); + int64_t dim = dim_.has_value() ? 
dim_.value() : input.dim() - 1; + dim = maybe_wrap_dim(dim, input_.dim()); + + if (input.dim() == 0) { + input = input.view(1); + } + AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, input.scalar_type(), "log_softmax", [&] { + at::ScalarType::BFloat16, input.scalar_type(), "masked_softmax", [&] { host_softmax< scalar_t, false /* LogSoftMax */, true /* MaskedSoftMax */>( - output, input, input.dim() - 1, mask.data_ptr()); + output, input, dim, mask.data_ptr()); }); return output; } + +Tensor masked_softmax_backward_cpu( + const Tensor& grad_, + const Tensor& output_, + const Tensor& mask_, + const c10::optional dim_) { + TORCH_CHECK( + grad_.sizes() == mask_.sizes(), "Mask shape should match grad shape"); + TORCH_CHECK( + mask_.scalar_type() == ScalarType::Bool, + "Mask should be a boolean tensor"); + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + auto mask = mask_.contiguous(); + + int64_t dim = dim_.has_value() ? dim_.value() : output.dim() - 1; + dim = maybe_wrap_dim(dim, grad.dim()); + + grad = grad.dim() == 0 ? grad.view(1) : grad; + output = output.dim() == 0 ? output.view(1) : output; + mask = mask.dim() == 0 ? mask.view(1) : mask; + + Tensor grad_input = at::empty_like(grad, grad.options()); + AT_DISPATCH_FLOATING_TYPES_AND( + at::ScalarType::BFloat16, grad.scalar_type(), "masked_softmax_backward", [&] { + host_softmax_backward< + scalar_t, + false /* LogSoftMax */, + true /* MaskedSoftmax */>(grad_input, grad, output, dim, mask.data_ptr()); + }); + return grad_input; +} } } diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index ae88547a8aa9..e99fd75467b6 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -14,30 +14,46 @@ namespace at { namespace meta { + using namespace native; - TORCH_META_FUNC(topk) ( - const Tensor& self, - int64_t k, - int64_t dim_, - bool largest, - bool sorted) { - int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); - TORCH_CHECK( - k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), - "selected index k out of range"); - int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim); - TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); - - // Build the output size, which is the dim being selected set to - // size k - DimVector topKSize(self.sizes().vec()); - if (topKSize.size() > 0) { - topKSize[dim] = k; - } - set_output(0, topKSize, self.options()); - set_output(1, topKSize, self.options().dtype(at::kLong)); +TORCH_META_FUNC(topk) +(const Tensor& self, int64_t k, int64_t dim_, bool largest, bool sorted) { + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + TORCH_CHECK( + k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), + "selected index k out of range"); + int64_t sliceSize = self.dim() == 0 ? 
1 : self.size(dim); + TORCH_CHECK(k >= 0 && k <= sliceSize, "k not in range for dimension"); + + // Build the output size, which is the dim being selected set to + // size k + DimVector topKSize(self.sizes().vec()); + if (topKSize.size() > 0) { + topKSize[dim] = k; } + set_output(0, topKSize, self.options()); + set_output(1, topKSize, self.options().dtype(at::kLong)); +} + +TORCH_META_FUNC2(sort, stable) +(const Tensor& self, c10::optional stable, int64_t dim, bool descending) { + TORCH_INTERNAL_ASSERT( + stable.has_value(), + "sort(): c10::optional for stable has to have value."); + maybe_wrap_dim(dim, self.dim()); + + // See issue: https://github.com/pytorch/pytorch/issues/65863 + // Strides should be dense, so as not to allocate too much memory. + // We either use 'self' strides, or infer dense strides from them. + std::vector strides = (self.is_non_overlapping_and_dense()) + ? self.strides().vec() + : at::infer_dense_strides(self.sizes(), self.strides()); + + set_output(0, self.sizes(), strides, self.options(), {}); + set_output(1, self.sizes(), strides, self.options().dtype(kLong), {}); +} + } // namespace meta namespace native { @@ -45,6 +61,19 @@ namespace native { DEFINE_DISPATCH(sort_stub); DEFINE_DISPATCH(topk_stub); +void _fill_indices(const TensorBase &indices, int64_t dim) { + auto ndim = indices.dim(); + assert(0 <= dim && dim < ndim); + auto dim_size = indices.size(dim); + auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong)); + auto idx_dim_sizes = std::vector(ndim, 1); + auto idx_dim_strides = std::vector(ndim, 0); + idx_dim_sizes[dim] = dim_size; + idx_dim_strides[dim] = 1; + auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); + OptionalTensorRef(indices)->copy_(idx_dim_restrided); +} + namespace { /* Note from TH: @@ -86,7 +115,7 @@ void quick_select_template( } // Use median of three for pivot choice - P = (L + R) >> 1; + P = L + (R - L) / 2; swap_fn(P, L + 1); if (gt_or_nan(arr[L + 1], arr[R])) { swap_fn(L + 1, R); @@ -852,52 +881,37 @@ Tensor nanmedian_cpu(const Tensor& self) { return median_impl(self, /*ignore_nan=*/true); } -std::tuple sort_out_cpu_stable(const Tensor& self, - c10::optional stable, - int64_t dim, - bool descending, - Tensor& values, - Tensor& indices) { - values.resize_(self.sizes()).copy_(self); - indices.resize_(self.sizes()); - +TORCH_IMPL_FUNC(sort_stable_out) +(const Tensor& self, + c10::optional stable, + int64_t dim, + bool descending, + const Tensor& values, + const Tensor& indices) { + values.copy_(self); // check if self is scalar if (self.dim() == 0 && self.numel() == 1) { indices.zero_(); - return std::forward_as_tuple(values, indices); + } else { + dim = maybe_wrap_dim(dim, self.dim()); + sort_stub(self.device().type(), self, values, indices, dim, descending, stable.value()); } - - TORCH_INTERNAL_ASSERT(stable.has_value(), "sort_out(): c10::optional for stable has to have value."); - sort_stub(kCPU, values, indices, dim, descending, stable.value()); - - return std::forward_as_tuple(values, indices); } -std::tuple sort_out_cpu(const Tensor& self, +std::tuple sort_out( + const Tensor& self, int64_t dim, bool descending, Tensor& values, Tensor& indices) { - return at::native::sort_out_cpu_stable( - self, /*stable=*/false, dim, descending, values, indices); -} - -std::tuple sort_cpu_stable( - const Tensor& self, - c10::optional stable, - int64_t dim, - bool descending) { - TORCH_CHECK(!self.is_complex(), "sort(): input tensor must be of non-complex type"); - Tensor values = at::empty({0}, 
self.options()); - Tensor indices = at::empty({0}, self.options().dtype(kLong)); - return at::native::sort_out_cpu_stable(self, stable, dim, descending, values, indices); + return at::sort_out(values, indices, self, false, dim, descending); } -std::tuple sort_cpu( +std::tuple sort( const Tensor& self, int64_t dim, bool descending) { - return sort_cpu_stable(self, /*stable=*/false, dim, descending); + return at::sort(self, false, dim, descending); } Tensor& msort_out(const Tensor& self, Tensor& values) { diff --git a/aten/src/ATen/native/Sorting.h b/aten/src/ATen/native/Sorting.h index edfc583a50bf..627ee4521150 100644 --- a/aten/src/ATen/native/Sorting.h +++ b/aten/src/ATen/native/Sorting.h @@ -1,7 +1,11 @@ #pragma once -#include #include +#include + +namespace at { +class TensorBase; +} namespace at { namespace native { @@ -14,11 +18,13 @@ enum class QUANTILE_INTERPOLATION_MODE : uint8_t { NEAREST }; -using sort_fn = void(*)(Tensor& values, Tensor& indices, int64_t dim, bool descending, bool stable); -using topk_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int64_t, int64_t, bool, bool); +using sort_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, bool, bool); +using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool); DECLARE_DISPATCH(sort_fn, sort_stub); DECLARE_DISPATCH(topk_fn, topk_stub); +void _fill_indices(const TensorBase &indices, int64_t dim); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/SortingUtils.h b/aten/src/ATen/native/SortingUtils.h index f3d8805a3526..f6065927eba4 100644 --- a/aten/src/ATen/native/SortingUtils.h +++ b/aten/src/ATen/native/SortingUtils.h @@ -86,92 +86,5 @@ inline void _allocate_or_resize_output_with_indices( } } - -#ifdef CPU_CAPABILITY -inline namespace CPU_CAPABILITY { -#else -inline namespace DEFAULT { -#endif - -// Core topk loop, shared between CPU and QuantizedCPU -template -void topk_impl_loop( - const int64_t mode_values_stride, - const int64_t mode_indices_stride, - const int64_t tmp_values_stride, - const int64_t k, - const int64_t dim_size, - const bool largest, - const bool sorted, - char** data, const int64_t* strides, const int64_t n) { - - using elem_t = std::pair; - std::vector queue(dim_size); - for (const auto i : c10::irange(n)) { - TensorAccessor mode_values( - reinterpret_cast(data[0] + i * strides[0]), - &k, &mode_values_stride); - TensorAccessor mode_indices( - reinterpret_cast(data[1] + i * strides[1]), - &k, &mode_indices_stride); - TensorAccessor tmp_values( - reinterpret_cast(data[2] + i * strides[2]), - &dim_size, &tmp_values_stride); - - auto n = dim_size; - auto use_partial_sort = k * 64 <= n; - - for (const auto j : c10::irange(n)) { - queue[j].first = tmp_values[j]; - queue[j].second = j; - } - - // we want nan to be sorted as top for numpy compatibility - if (use_partial_sort) { - if (largest) { - std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); - }); - } else { - std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - } - } else { - if (largest) { - std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > 
y.first)); - }); - if (sorted) { - std::sort(queue.begin(), queue.begin() + k - 1, - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); - }); - } - } else { - std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - if (sorted) { - std::sort(queue.begin(), queue.begin() + k -1, - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - } - } - } - - for (const auto j : c10::irange(k)) { - mode_values[j] = queue[j].first; - mode_indices[j] = queue[j].second; - } - } -} - -} // namespace CPU_CAPABILITY } // namespace native } // namespace at diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 2f5789a8f387..9c0ebed7551a 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -18,7 +19,7 @@ namespace { // * Integers are promoted to the default floating type // * If require_complex=True, all types are promoted to complex // * Raises an error for half-precision dtypes to allow future support -ScalarType promote_type_fft(ScalarType type, bool require_complex) { +ScalarType promote_type_fft(ScalarType type, bool require_complex, Device device) { if (at::isComplexType(type)) { return type; } @@ -27,7 +28,11 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex) { type = c10::typeMetaToScalarType(c10::get_default_dtype()); } - TORCH_CHECK(type == kFloat || type == kDouble, "Unsupported dtype ", type); + if (device.is_cuda() && !at::detail::getCUDAHooks().hasROCM()) { + TORCH_CHECK(type == kHalf || type == kFloat || type == kDouble, "Unsupported dtype ", type); + } else { + TORCH_CHECK(type == kFloat || type == kDouble, "Unsupported dtype ", type); + } if (!require_complex) { return type; @@ -35,6 +40,7 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex) { // Promote to complex switch (type) { + case kHalf: return kComplexHalf; case kFloat: return kComplexFloat; case kDouble: return kComplexDouble; default: TORCH_INTERNAL_ASSERT(false, "Unhandled dtype"); @@ -44,7 +50,7 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex) { // Promote a tensor's dtype according to promote_type_fft Tensor promote_tensor_fft(const Tensor& t, bool require_complex=false) { auto cur_type = t.scalar_type(); - auto new_type = promote_type_fft(cur_type, require_complex); + auto new_type = promote_type_fft(cur_type, require_complex, t.device()); return (cur_type == new_type) ? t : t.to(new_type); } @@ -218,7 +224,7 @@ struct ShapeAndDims { // Wraps dimensions and applies defaulting behavior. // Also checks transform dims are unique and transform shape is non-empty. 
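canonicalize_fft_shape_and_dim_args below supplies the defaulting behavior for the n-dimensional transforms once they accept at::OptionalIntArrayRef: omitting dim transforms every dimension, and s crops or zero-pads each transformed one. A small usage sketch, assuming the usual generated at::fft_* C++ bindings and their default arguments:

#include <ATen/ATen.h>
#include <vector>

void fft_shape_and_dim_sketch() {
  at::Tensor x = at::randn({4, 6});

  // No s or dim: transform both dimensions, output shape stays {4, 6}.
  at::Tensor full = at::fft_fftn(x);

  // Transform only the last dimension and zero-pad it to length 8 -> {4, 8}.
  std::vector<int64_t> s{8};
  std::vector<int64_t> dim{-1};
  at::Tensor padded = at::fft_fftn(x, s, dim);

  // For real input the one-sided transform halves the last transformed
  // dimension: at::fft_rfftn on {4, 6} input has shape {4, 6 / 2 + 1} = {4, 4}.
  at::Tensor onesided = at::fft_rfftn(x);
  (void)full; (void)padded; (void)onesided;
}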
ShapeAndDims canonicalize_fft_shape_and_dim_args( - Tensor input, c10::optional shape, c10::optional dim) { + Tensor input, at::OptionalIntArrayRef shape, at::OptionalIntArrayRef dim) { const int64_t input_dim = input.dim(); const IntArrayRef input_sizes = input.sizes(); ShapeAndDims ret; @@ -371,8 +377,8 @@ Tensor& fft_ihfft_out(const Tensor& self, c10::optional n, return out; } -Tensor fft_fftn(const Tensor& self, c10::optional s, - c10::optional dim, +Tensor fft_fftn(const Tensor& self, at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry @@ -381,8 +387,8 @@ Tensor fft_fftn(const Tensor& self, c10::optional s, } Tensor& fft_fftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry @@ -391,8 +397,8 @@ Tensor& fft_fftn_out(const Tensor& self, return out; } -Tensor fft_ifftn(const Tensor& self, c10::optional s, - c10::optional dim, +Tensor fft_ifftn(const Tensor& self, at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -400,8 +406,8 @@ Tensor fft_ifftn(const Tensor& self, c10::optional s, } Tensor& fft_ifftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, Tensor& out) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); @@ -410,8 +416,8 @@ Tensor& fft_ifftn_out(const Tensor& self, } static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, const c10::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); @@ -423,15 +429,15 @@ static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, return fft_r2c_maybe_out(fname, out, x, desc.dim, norm, /*onesided=*/true); } -Tensor fft_rfftn(const Tensor& self, c10::optional s, - c10::optional dim, +Tensor fft_rfftn(const Tensor& self, at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str) { return fft_rfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_rfftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str, Tensor& out) { fft_rfftn_impl(out, self, s, dim, norm_str); return out; @@ -439,12 +445,13 @@ Tensor& fft_rfftn_out(const Tensor& self, ShapeAndDims canonicalize_fft_c2r_shape_and_dim_args( c10::string_view fname, const Tensor& self, - const c10::optional& s, - const c10::optional& dims, + const at::OptionalIntArrayRef& s, + const at::OptionalIntArrayRef& dims, int64_t& last_dim_size) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dims); TORCH_CHECK(desc.shape.size() > 0, fname, " must transform at least one axis"); + // Expected output size of the hermitian-symmetric dimension last_dim_size = [&] { // Fixup default shape handling in the last 
dimension, if (!s.has_value() || (s->back() == -1)) { @@ -453,15 +460,16 @@ ShapeAndDims canonicalize_fft_c2r_shape_and_dim_args( } return desc.shape.back(); }(); - auto ld = last_dim_size / 2 + 1; - desc.shape.back() = ld; - TORCH_CHECK(ld >= 1, "Invalid number of data points (", last_dim_size, ") specified"); + TORCH_CHECK(last_dim_size >= 1, "Invalid number of data points (", last_dim_size, ") specified"); + + // Expected input size of the complex-hermitian data + desc.shape.back() = last_dim_size / 2 + 1; return desc; } static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, const c10::optional& norm_str) { int64_t last_dim_size = 0; auto desc = canonicalize_fft_c2r_shape_and_dim_args( @@ -474,15 +482,15 @@ static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, } Tensor fft_irfftn(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str) { return fft_irfftn_impl({}, self, s, dim, norm_str); } Tensor& fft_irfftn_out(const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str, Tensor& out) { fft_irfftn_impl(out, self, s, dim, norm_str); return out; @@ -490,8 +498,8 @@ Tensor& fft_irfftn_out(const Tensor& self, static Tensor fft_hfftn_impl( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm_str, const Tensor& out) { constexpr c10::string_view fname = "hfftn"; @@ -518,16 +526,16 @@ static Tensor fft_hfftn_impl( Tensor fft_hfftn( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { return fft_hfftn_impl(self, s, dim, norm, {}); } const Tensor& fft_hfftn_out( const Tensor& self, - c10::optional s, - c10::optional dim, c10::optional norm, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, const Tensor& out) { fft_hfftn_impl(self, s, dim, norm, out); return out; @@ -535,8 +543,8 @@ const Tensor& fft_hfftn_out( static Tensor fft_ihfftn_impl( const Tensor& self, - const c10::optional& s, - const c10::optional& dim, + const at::OptionalIntArrayRef& s, + const at::OptionalIntArrayRef& dim, const c10::optional& norm_str, const Tensor& out) { constexpr c10::string_view fname = "ihfftn"; @@ -560,80 +568,80 @@ static Tensor fft_ihfftn_impl( Tensor fft_ihfftn( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm) { return fft_ihfftn_impl(self, s, dim, norm, {}); } const Tensor& fft_ihfftn_out( const Tensor& self, - c10::optional s, - c10::optional dim, + at::OptionalIntArrayRef s, + at::OptionalIntArrayRef dim, c10::optional norm, const Tensor& out) { fft_ihfftn_impl(self, s, dim, norm, out); return out; } -Tensor fft_fft2(const Tensor& self, c10::optional s, +Tensor fft_fft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_fftn(self, s, dim, std::move(norm)); } -Tensor& fft_fft2_out(const Tensor& self, c10::optional s, +Tensor& fft_fft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_fftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_ifft2(const Tensor& self, c10::optional s, +Tensor fft_ifft2(const 
Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_ifftn(self, s, dim, std::move(norm)); } -Tensor& fft_ifft2_out(const Tensor& self, c10::optional s, +Tensor& fft_ifft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_ifftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_rfft2(const Tensor& self, c10::optional s, +Tensor fft_rfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_rfftn(self, s, dim, std::move(norm)); } -Tensor& fft_rfft2_out(const Tensor& self, c10::optional s, +Tensor& fft_rfft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_rfftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_irfft2(const Tensor& self, c10::optional s, +Tensor fft_irfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_irfftn(self, s, dim, std::move(norm)); } -Tensor& fft_irfft2_out(const Tensor& self, c10::optional s, +Tensor& fft_irfft2_out(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor& out) { return native::fft_irfftn_out(self, s, dim, std::move(norm), out); } const Tensor& fft_hfft2_out( - const Tensor& self, c10::optional s, IntArrayRef dim, + const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, const Tensor& out) { return native::fft_hfftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_hfft2(const Tensor& self, c10::optional s, +Tensor fft_hfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_hfftn(self, s, dim, std::move(norm)); } const Tensor& fft_ihfft2_out( - const Tensor& self, c10::optional s, IntArrayRef dim, + const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm, const Tensor& out) { return native::fft_ihfftn_out(self, s, dim, std::move(norm), out); } -Tensor fft_ihfft2(const Tensor& self, c10::optional s, +Tensor fft_ihfft2(const Tensor& self, at::OptionalIntArrayRef s, IntArrayRef dim, c10::optional norm) { return native::fft_ihfftn(self, s, dim, std::move(norm)); } @@ -684,7 +692,7 @@ Tensor fft_rfftfreq(int64_t n, double d, // If an array dim is specified, wraps them according to self.dim(). // Otherwise returns a vector of all dims. -DimVector default_alldims(const Tensor& self, c10::optional dim_opt) { +DimVector default_alldims(const Tensor& self, at::OptionalIntArrayRef dim_opt) { DimVector dim; if (dim_opt) { IntArrayRef dim_unwrapped = *dim_opt; @@ -699,7 +707,7 @@ DimVector default_alldims(const Tensor& self, c10::optional dim_opt return dim; } -Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { +Tensor fft_fftshift(const Tensor& x, at::OptionalIntArrayRef dim_opt) { auto dim = default_alldims(x, dim_opt); IntArrayRef x_sizes = x.sizes(); @@ -711,7 +719,7 @@ Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { return at::roll(x, shift, dim); } -Tensor fft_ifftshift(const Tensor& x, c10::optional dim_opt) { +Tensor fft_ifftshift(const Tensor& x, at::OptionalIntArrayRef dim_opt) { auto dim = default_alldims(x, dim_opt); IntArrayRef x_sizes = x.sizes(); @@ -756,14 +764,11 @@ static Stream& write_opt(Stream& SS, const optional& value) { * * This is modeled after librosa but with support for complex time-domain * signals and complex windows. 
- * - * NOTE: librosa's center and pad_mode arguments are currently only implemented - * in python because it uses torch.nn.functional.pad which is python-only. */ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, const optional win_lengthOpt, const c10::optional& window_opt, - const bool normalized, const optional onesidedOpt, - const optional return_complexOpt) { + const bool center, c10::string_view mode, const bool normalized, + const optional onesidedOpt, const optional return_complexOpt) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned window_maybe_owned = at::borrow_from_optional_tensor(window_opt); const Tensor& window = *window_maybe_owned; @@ -821,6 +826,19 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop if (self.dim() == 1) { input = input.unsqueeze(0); } + + if (center) { + const auto input_shape = input.sizes(); + const auto input_dim = input_shape.size(); + const auto extra_dims = std::max(size_t{3}, input_dim) - input_dim; + const auto pad_amount = n_fft / 2; + + DimVector extended_shape(extra_dims, 1); + extended_shape.append(input_shape.begin(), input_shape.end()); + input = at::pad(input.view(extended_shape), {pad_amount, pad_amount}, mode); + input = input.view(IntArrayRef(input.sizes()).slice(extra_dims)); + } + int64_t batch = input.size(0); int64_t len = input.size(1); if (n_fft <= 0 || n_fft > len) { @@ -894,6 +912,17 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop } } +Tensor stft( + const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, + const optional win_lengthOpt, const c10::optional& window_opt, + const bool normalized, + const optional onesidedOpt, const optional return_complexOpt) { + return at::stft( + self, n_fft, hop_lengthOpt, win_lengthOpt, window_opt, + /*center=*/false, /*mode=*/"constant", normalized, onesidedOpt, + return_complexOpt); +} + // Create complex tensor from the old style of real tensor with size=(..., 2) // This is to support istft in the transition to requiring complex input. 
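The new center handling above pads the signal by n_fft / 2 on each side with the requested pad mode before framing, so every sample, including the first and last, can sit at the center of some frame. A minimal sketch of that padding step, reusing the at::pad call from this hunk with made-up sizes:

#include <ATen/ATen.h>

void stft_center_padding_sketch() {
  const int64_t n_fft = 8;
  at::Tensor signal = at::randn({1, 1, 16});  // (batch, channel, time)

  // Same pad amount and call shape as the center branch above.
  const int64_t pad_amount = n_fft / 2;
  at::Tensor padded = at::pad(signal, {pad_amount, pad_amount}, "reflect");
  // padded.size(-1) == 16 + 2 * (n_fft / 2) == 24; frame t is then centered
  // at sample t * hop_length of the original signal.
  (void)padded;
}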
// NOTE: This may return a view of the input tensor, or might clone if necessary @@ -1087,14 +1116,6 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional ho #undef REPR } -Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, - const optional win_lengthOpt, const Tensor& window, - const bool normalized, const optional onesidedOpt) { - return at::native::stft( - self, n_fft, hop_lengthOpt, win_lengthOpt, window, normalized, onesidedOpt, - /*return_complex=*/c10::nullopt); -} - Tensor istft(const Tensor& self, const int64_t n_fft, const optional hop_lengthOpt, const optional win_lengthOpt, const Tensor& window, const bool center, const bool normalized, const optional onesidedOpt, diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 340bc5a822ad..613f6bb2bd70 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -62,6 +62,8 @@ #include #include #include +#include +#include #include #include @@ -74,13 +76,29 @@ namespace at { namespace meta { -native::SCATTER_GATHER_OP get_operator_enum(const c10::string_view reduce) { - if (reduce == "add") { - return native::SCATTER_GATHER_OP::REDUCE_ADD; - } else if (reduce == "multiply") { - return native::SCATTER_GATHER_OP::REDUCE_MULTIPLY; +native::SCATTER_GATHER_OP get_operator_enum(const c10::string_view reduce, bool use_new_options = false) { + if (use_new_options) { + if (reduce == "sum") { + return native::SCATTER_GATHER_OP::REDUCE_ADD; + } else if (reduce == "prod") { + return native::SCATTER_GATHER_OP::REDUCE_MULTIPLY; + } else if (reduce == "mean") { + return native::SCATTER_GATHER_OP::REDUCE_MEAN; + } else if (reduce == "amax") { + return native::SCATTER_GATHER_OP::REDUCE_MAXIMUM; + } else if (reduce == "amin") { + return native::SCATTER_GATHER_OP::REDUCE_MINIMUM; + } else { + TORCH_CHECK(false, "reduce argument must be either sum, prod, mean, amax or amin."); + } } else { - TORCH_CHECK(false, "reduce argument must be either add or multiply."); + if (reduce == "add") { + return native::SCATTER_GATHER_OP::REDUCE_ADD; + } else if (reduce == "multiply") { + return native::SCATTER_GATHER_OP::REDUCE_MULTIPLY; + } else { + TORCH_CHECK(false, "reduce argument must be either add or multiply.") + } } } @@ -113,7 +131,7 @@ TORCH_META_FUNC(gather) at::native::gather_shape_check(self, wrapped_dim, index); } -template +template void scatter_meta_impl( Meta& meta, const Tensor& self, @@ -137,7 +155,7 @@ void scatter_meta_impl( meta.set_output(self.sizes(), self.options()); if (reduce.has_value()) { // Check if we have a valid reduce operator. 
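With use_new_options set, get_operator_enum above accepts the reduction names "sum", "prod", "mean", "amax" and "amin", which feed the scatter_reduce.two overload declared just below. A hedged usage sketch; the exact C++ binding signature is assumed from that schema rather than spelled out in this hunk:

#include <torch/torch.h>

void scatter_reduce_two_sketch() {
  torch::Tensor self  = torch::zeros({3}, torch::kDouble);
  torch::Tensor src   = torch::tensor({1.0, 2.0, 3.0, 4.0}, torch::kDouble);
  torch::Tensor index = torch::tensor({0, 0, 2, 2}, torch::kLong);

  // include_self=false first writes the reduction identity (-inf for "amax")
  // into the scattered slots, so the result is {2, 0, 4}: slots 0 and 2 see
  // only src, and the untouched slot 1 keeps its original 0.
  torch::Tensor out =
      self.scatter_reduce(/*dim=*/0, index, src, "amax", /*include_self=*/false);
  (void)out;
}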
- get_operator_enum(reduce.value()); + get_operator_enum(reduce.value(), use_new_options); } } @@ -174,6 +192,17 @@ TORCH_META_FUNC(scatter_add) scatter_meta_impl(*this, self, dim, index, src, "add"); } +TORCH_META_FUNC2(scatter_reduce, two) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const c10::string_view reduce, + bool include_self) { + (void) include_self; + scatter_meta_impl(*this, self, dim, index, src, reduce); +} + TORCH_PRECOMPUTE_META_FUNC(index_copy) (const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source) { dim = maybe_wrap_dim(dim, self.dim()); @@ -233,28 +262,33 @@ TORCH_PRECOMPUTE_META_FUNC(index_copy) return TORCH_PRECOMPUTE_STRUCT(index_copy)().set_dim(dim); } -TORCH_PRECOMPUTE_META_FUNC(index_add) -(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha) { - dim = maybe_wrap_dim(dim, self.dim()); +template +void index_func_meta_impl( + Meta& meta, + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + c10::string_view func) { auto numel = index.numel(); - TORCH_CHECK_INDEX(index.dim() <= 1, "index_add_(): Index is supposed to be a vector, but got dim: ", + TORCH_CHECK_INDEX(index.dim() <= 1, func, "_(): Index is supposed to be a vector, but got dim: ", index.dim(), " with type: ", index.scalar_type(), " and size: ", index.sizes()); TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, - "index_add_(): Expected dtype int32/int64 for index but got: ", index.scalar_type()); + func, "_(): Expected dtype int32/int64 for index but got: ", index.scalar_type()); TORCH_CHECK(self.scalar_type() == source.scalar_type(), - "index_add_(): self (", self.scalar_type(), ") and source (", source.scalar_type(), + func, "_(): self (", self.scalar_type(), ") and source (", source.scalar_type(), ") must have the same scalar type"); TORCH_CHECK(dim == 0 || dim < source.dim(), - "index_add_(): Indexing dim ", dim, " is out of bounds of the source tensor with dim ", + func, "_(): Indexing dim ", dim, " is out of bounds of the source tensor with dim ", source.dim()); TORCH_CHECK(numel == (source.dim() == 0 ? 
1 : source.size(dim)), - "index_add_(): Number of indices (", numel, ") should be equal to source.size(dim): (", + func, "_(): Number of indices (", numel, ") should be equal to source.size(dim): (", source.size(dim), "), for dim: ", dim); - auto& result = maybe_get_output(0); + auto& result = meta.maybe_get_output(0); bool is_defined = result.defined(); - set_output(self.sizes(), self.options()); + meta.set_output(self.sizes(), self.options()); if (is_defined) { at::assert_no_internal_overlap(result); at::assert_no_overlap(result, index); @@ -269,10 +303,30 @@ TORCH_PRECOMPUTE_META_FUNC(index_add) auto sourceSlice = source.select(dim, 0); auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); } +} +TORCH_PRECOMPUTE_META_FUNC(index_add) +(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha) { + dim = maybe_wrap_dim(dim, self.dim()); + index_func_meta_impl(*this, self, dim, index, source, "index_add"); return TORCH_PRECOMPUTE_STRUCT(index_add)().set_dim(dim); } +TORCH_PRECOMPUTE_META_FUNC(index_reduce) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const c10::string_view reduce, + bool include_self) { + (void)include_self; + TORCH_CHECK(reduce == "prod" || reduce == "mean" || reduce == "amax" || reduce == "amin", + "index_reduce(): Expected reduce to be one of prod, mean, amax or amin but got ", reduce, "."); + dim = maybe_wrap_dim(dim, self.dim()); + index_func_meta_impl(*this, self, dim, index, source, "index_reduce"); + return TORCH_PRECOMPUTE_STRUCT(index_reduce)().set_dim(dim); +} + } // namespace meta namespace native { @@ -296,6 +350,7 @@ DEFINE_DISPATCH(scatter_fill_stub); DEFINE_DISPATCH(scatter_add_stub); DEFINE_DISPATCH(scatter_reduce_stub); DEFINE_DISPATCH(scatter_scalar_reduce_stub); +DEFINE_DISPATCH(scatter_reduce_two_stub); static bool all_strides_match(TensorList tensors) { TORCH_CHECK(tensors.size() >= 1); @@ -759,6 +814,7 @@ TORCH_IMPL_FUNC(index_copy_out) result_dim_stride); } +// Not calling into index_reduce_func_impl because of a different dtype dispatch TORCH_IMPL_FUNC(index_add_cpu_out) (const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha, const Tensor& result) { if (!result.is_same(self)) result.copy_(self); @@ -825,6 +881,164 @@ TORCH_IMPL_FUNC(index_add_cpu_out) } } +void index_reduce_func_impl( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + bool include_self, + const Tensor& result, + const SCATTER_GATHER_OP& op) { + if (!result.is_same(self)) result.copy_(self); + if (!include_self) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + self.scalar_type(), "index_reduce_func_exclude_input_init", [&] { + scalar_t init_val; + switch (op) { + case SCATTER_GATHER_OP::REDUCE_MULTIPLY: + init_val = (scalar_t)1; + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM: + init_val = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM: + init_val = std::numeric_limits::has_infinity ? 
std::numeric_limits::infinity() + : std::numeric_limits::max(); + break; + default: + init_val = (scalar_t)0; + break; + } + // index_fill_ requires index to be a LongTensor + result.index_fill_(dim, index.to(at::ScalarType::Long), init_val); + }); + } + + auto numel = index.numel(); + + auto index_contig = index.contiguous(); + + if (result.dim() > 1) { + // Equivalent to: + // for (const auto i : c10::irange(numel)) { + // auto selfSlice = self.select(dim, index_data[i]); + // auto sourceSlice = source.select(dim, i); + // selfSlice.op_(sourceSlice); + // } + // But much faster as this reuses the iterator from the binary op + if (numel == 0) { + return; + } + auto selfSlice = result.select(dim, 0); + auto sourceSlice = source.select(dim, 0); + auto self_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); + auto source_stride_bytes = source.stride(dim) * elementSize(source.scalar_type()); + auto self_dim_size = result.size(dim); + auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_func_cpu_", [&] () { + auto index_data = index_contig.data_ptr(); + for (const auto i : c10::irange(numel)) { + auto self_i = index_data[i]; + TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); + auto self_data = static_cast(selfSlice.data_ptr()) + self_i * self_stride_bytes; + auto source_data = static_cast(sourceSlice.data_ptr()) + i * source_stride_bytes; + iter.unsafe_replace_operand(0, self_data); + iter.unsafe_replace_operand(1, self_data); + iter.unsafe_replace_operand(2, source_data); + + switch (op) { + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + mul_stub(iter.device_type(), iter); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + minimum_stub(iter.device_type(), iter); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + maximum_stub(iter.device_type(), iter); + break; + default : + add_stub(iter.device_type(), iter, 1); + break; + } + } + }); + + if (op == SCATTER_GATHER_OP::REDUCE_MEAN) { + auto counts = include_self ? at::ones_like(result) : at::zeros_like(result); + counts.index_add_(dim, index, at::ones_like(source)); + counts.masked_fill_(counts == 0, 1); + result.div_(counts); + } + } + else { + TORCH_CHECK(source.dim() <= 1, "source.dim() (", source.dim(), ") must one or zero for given self.dim() (", self.dim(), ")"); + auto counts = include_self ? at::ones_like(result) : at::zeros_like(result); + // explicitly capture all required variables to work around windows build + // TODO: fix this when windows can correctly capture variables in nested lambda + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, + result.scalar_type(), "index_func_", [&result, &source, &dim, &index_contig, &numel, &op, &counts] { + auto result_stride = result.dim() == 0 ? 1 : result.stride(dim); + auto source_stride = source.dim() == 0 ? 1 : source.stride(dim); + auto counts_stride = counts.dim() == 0 ? 1 : counts.stride(dim); + // TODO: Maybe TensorAccessor can be used here? 
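The scalar path that follows folds every source element into result[index[i]] and, for the mean reduction, keeps a per-slot count so the accumulated sums can be divided at the end. A plain-C++ sketch of that loop with illustrative container types (the real kernel below works on raw strided pointers):

#include <algorithm>
#include <cstdint>
#include <vector>

void index_mean_sketch(std::vector<double>& result,
                       const std::vector<int64_t>& index,
                       const std::vector<double>& source,
                       bool include_self) {
  std::vector<double> counts(result.size(), include_self ? 1.0 : 0.0);
  if (!include_self) {
    for (int64_t i : index) {
      result[i] = 0.0;  // drop the original value from the average
    }
  }
  for (size_t i = 0; i < index.size(); ++i) {
    result[index[i]] += source[i];
    counts[index[i]] += 1.0;
  }
  for (size_t j = 0; j < result.size(); ++j) {
    result[j] /= std::max(counts[j], 1.0);  // untouched slots keep their value
  }
}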
+ auto* result_ptr = result.data_ptr(); + auto* source_ptr = source.data_ptr(); + auto counts_ptr = counts.data_ptr(); + AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_func_cpu_", + [&index_contig, &numel, &result, &result_ptr, &result_stride, &source_ptr, &source_stride, &op, &counts_ptr, &counts_stride] { + auto index_data = index_contig.data_ptr(); + for (const auto i : c10::irange(numel)) { + auto self_i = index_data[i]; + TORCH_CHECK_INDEX((self_i >= 0) && (self_i < result.numel()), "index out of range in self"); + scalar_t *self_ip = result_ptr + self_i * result_stride; + scalar_t *count_ip; + scalar_t val; + switch (op) { + case SCATTER_GATHER_OP::REDUCE_MEAN : + *self_ip += *(source_ptr + i * source_stride); + count_ip = counts_ptr + self_i * counts_stride; + *count_ip += 1; + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + *self_ip *= *(source_ptr + i * source_stride); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + val = *(source_ptr + i * source_stride); + *self_ip = at::_isnan(val) ? val : std::min(*self_ip, val); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + val = *(source_ptr + i * source_stride); + *self_ip = at::_isnan(val) ? val : std::max(*self_ip, val); + break; + default: + break; + } + } + }); + }); + if (op == SCATTER_GATHER_OP::REDUCE_MEAN) { + counts.masked_fill_(counts == 0, 1); + result.div_(counts); + } + } +} + +TORCH_IMPL_FUNC(index_reduce_cpu_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const c10::string_view reduce, + bool include_input, + const Tensor& result) { + TORCH_WARN_ONCE("index_reduce() is in beta and the API may change at any time."); + auto op = meta::get_operator_enum(reduce, true); + index_reduce_func_impl(self, dim, index, source, include_input, result, op); +} + // Check that indices fall within dimension array size // Avoid redispatch call to min/max template @@ -880,9 +1094,6 @@ Tensor & index_select_out_cpu_dim1_( for (const auto i : c10::irange(N)) { auto idx = idxs[i]; - if (idx < 0) { - idx = idx + src_indexing_axis_dim; - } dst_floats[i] = src_floats[idx]; } } @@ -892,10 +1103,6 @@ Tensor & index_select_out_cpu_dim1_( for (const auto batch : c10::irange(outer_dims_product)) { for (const auto i : c10::irange(N)) { auto idx = idxs[i]; - if (idx < 0) { - idx = idx + src_indexing_axis_dim; - } - auto src = src_base + batch * src_batch_bytesize + idx * block_bytesize; auto dst = out + batch * gathered_batch_bytesize + i * block_bytesize; memcpy(dst, src, block_bytesize); @@ -1071,7 +1278,12 @@ Tensor index_select_quantized_cpu_(const Tensor & self, int64_t dim, const Tenso } Tensor index_select_backward(const Tensor& grad, IntArrayRef self_sizes, int64_t dim, const Tensor& index) { - return at::zeros(self_sizes, grad.options()).index_add_(dim, index, grad); + // for composite compliance, use out-of-place variant of + // `index_add` if index tensor is a Tensor Subclass. 
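One detail of the REDUCE_MEAN handling in this hunk worth spelling out: the accumulated sums are divided by per-slot counts, and zero counts are clamped to one so slots that no index touched are left unchanged. A tiny sketch of that finalization step, with vectors standing in for tensors:

#include <cstdio>
#include <vector>

// Finalize a "mean" reduction: divide accumulated sums by per-slot counts,
// treating a count of zero as one (counts.masked_fill_(counts == 0, 1)).
void finalize_mean(std::vector<double>& sums, const std::vector<long>& counts) {
  for (size_t i = 0; i < sums.size(); ++i) {
    const long n = counts[i] == 0 ? 1 : counts[i];
    sums[i] /= n;
  }
}

int main() {
  // Two sources landed in slot 0, none in slot 1, one in slot 2.
  std::vector<double> sums = {6.0, 5.0, 4.0};
  std::vector<long> counts = {2, 0, 1};
  finalize_mean(sums, counts);
  std::printf("%g %g %g\n", sums[0], sums[1], sums[2]);  // 3 5 4
}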
+ if (isTensorSubclassLike(index)) { + return grad.new_zeros(self_sizes, grad.options()).index_add(dim, index, grad); + } + return grad.new_zeros(self_sizes, grad.options()).index_add_(dim, index, grad); } Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Scalar& source) { @@ -1173,10 +1385,49 @@ Tensor gather_backward(const Tensor& grad, const Tensor& self, int64_t dim, cons if (sparse_grad) { return at::_gather_sparse_backward(self, dim, index, grad); } - return grad.new_zeros(self.sizes()).scatter_add_(dim, index, grad); + auto result = grad.new_zeros(self.sizes()); + // for composite compliance, use out-of-place variant of + // `scatter_add` if index tensor is a Tensor Subclass. + if (isTensorSubclassLike(index)) { + return result.scatter_add(dim, index, grad); + } + result.scatter_add_(dim, index, grad); + return result; +} + +static void scatter_reduce_exclude_self_helper( + const Tensor& self, + int64_t dim, + const Tensor& index, + const SCATTER_GATHER_OP& op) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, + self.scalar_type(), "scatter_reduce_exclude_input_init", [&] { + scalar_t init_val; + switch (op) { + case SCATTER_GATHER_OP::REDUCE_ADD: + init_val = (scalar_t)0; + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY: + init_val = (scalar_t)1; + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM: + init_val = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM: + init_val = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() + : std::numeric_limits::max(); + break; + case SCATTER_GATHER_OP::REDUCE_MEAN: + init_val = (scalar_t)0; + break; + } + self.scatter_(dim, index, init_val); + }); } -template +template void scatter_impl( const Tensor& self, int64_t dim, @@ -1185,7 +1436,8 @@ void scatter_impl( const Tensor& out, ReduceStub& reduce_stub, FillStub& fill_stub, - const c10::optional reduce = nullopt) { + const c10::optional reduce = nullopt, + bool reduce_includes_self = true) { dim = at::maybe_wrap_dim(dim, self.dim()); auto mut_out = const_cast(out); @@ -1197,7 +1449,11 @@ void scatter_impl( if (index.numel() == 0) return; if (reduce.has_value()) { - auto op = meta::get_operator_enum(reduce.value()); + auto op = meta::get_operator_enum(reduce.value(), use_new_options); + if (!reduce_includes_self) { + // scatter inits for reduction to appropriate indices (used by scatter_reduce.two) + scatter_reduce_exclude_self_helper(mut_out, dim, index, op); + } reduce_stub(self.device().type(), mut_out, dim, index, src, op); } else { fill_stub(self.device().type(), mut_out, dim, index, src); @@ -1282,113 +1538,35 @@ TORCH_IMPL_FUNC(scatter_add) } } -Tensor scatter_reduce_two_cpu(const Tensor& self, - int64_t dim, - const Tensor& index, - const c10::string_view reduce, - const c10::optional output_size) { - - // TODO: Add documentation. - - - TORCH_CHECK(dim >= -self.dim() && dim < self.dim(), - "Expected `dim` to be in range ", -self.dim(), " to ", self.dim() - 1, " (got ", dim, ")"); - - dim = dim < 0 ? dim + self.dim() : dim; - - auto sizes = self.sizes().vec(); - if (output_size.has_value()) { - sizes[dim] = output_size.value(); - } else { - sizes[dim] = index.numel() > 0 ? 
index.max().item() + 1: 0; - } - Tensor out = at::empty(sizes, self.options()); - - TORCH_CHECK(self.dim() == index.dim(), - "Shape mismatch between `self` (got ", self.sizes(), ") and `index` (got ", index.sizes(), ")"); - for (const auto i : c10::irange(self.dim())) { - TORCH_CHECK(self.size(i) == index.size(i), - "Shape mismatch between `self` (got ", self.sizes(), ") and `index` (got ", index.sizes(), ")"); - } - - TORCH_CHECK(reduce == "sum" || reduce == "prod" || reduce == "mean" || reduce == "amax" || reduce =="amin", - "`reduce` argument must be one of ('sum', 'prod', 'mean', 'amax', 'amin'"); - - if (self.numel() == 0) { - return out.zero_(); - } - - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "scatter_reduce", [&] { - if (reduce == "prod") { - out.fill_((scalar_t)1); - } else if (reduce == "amax") { - out.fill_(std::numeric_limits::lowest()); - } else if (reduce == "amin") { - out.fill_(std::numeric_limits::max()); +TORCH_IMPL_FUNC(scatter_reduce_two) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const c10::string_view reduce, + bool include_self, + const Tensor& out) { + // See issue https://github.com/pytorch/pytorch/issues/74770 + TORCH_WARN_ONCE("scatter_reduce() is in beta and the API may change at any time."); + + scatter_impl(self, dim, index, src, out, + scatter_reduce_two_stub, + scatter_stub, + reduce, + include_self); + + if (meta::get_operator_enum(reduce, true) == SCATTER_GATHER_OP::REDUCE_MEAN) { + auto ones = at::ones_like(src); + auto count = include_self ? at::ones_like(out) : at::zeros_like(out); + count.scatter_add_(dim, index, ones); + count.masked_fill_(count == 0, 1); + + if (out.is_floating_point() || out.is_complex()) { + out.div_(count); } else { - out.fill_((scalar_t)0); - } - - - auto self_cont = self.contiguous(); - auto index_cont = index.contiguous(); - auto self_data = self_cont.data_ptr(); - auto index_data = index_cont.data_ptr(); - bool out_is_contiguous = out.is_contiguous(); - auto out_cont = out.contiguous(); - auto out_cont_data = out_cont.data_ptr(); - - auto counts = at::zeros_like(out_cont); - auto counts_data = counts.data_ptr(); - - - int64_t offset1 = 1, offset2 = 1; - for (const auto d : c10::irange(dim)) { - offset1 *= self.size(d); + out.div_(count, "floor"); } - for (int64_t d = dim + 1; d < self.dim(); d++) { - offset2 *= self.size(d); - } - - scalar_t value; - int64_t dim_index; - for (const auto i : c10::irange(offset1)) { - for (const auto j : c10::irange(self.size(dim))) { - for (const auto k : c10::irange(offset2)) { - value = self_data[i * self_cont.stride(dim) * self_cont.size(dim) + j * self_cont.stride(dim) + k]; - dim_index = index_data[i * index_cont.stride(dim) * index_cont.size(dim) + j * index_cont.stride(dim) + k]; - TORCH_CHECK(dim_index >= 0 && dim_index < out.size(dim), - "Expected `index` values to be in range ", 0, " to ", out.size(dim), " (got ", dim_index, ")"); - int64_t ind = i * out_cont.stride(dim) * out_cont.size(dim) + dim_index * out_cont.stride(dim) + k; - if (reduce == "sum") { - out_cont_data[ind] += value; - } else if (reduce == "prod") { - out_cont_data[ind] *= value; - } else if (reduce == "mean") { - auto n = counts_data[ind]; - out_cont_data[ind] = (out_cont_data[ind] * n + value) / (n + 1); - counts_data[ind] += 1; - } else if (reduce == "amax") { - out_cont_data[ind] = std::max(out_cont_data[ind], value); - } else { - out_cont_data[ind] = std::min(out_cont_data[ind], value); - } - } - } - } - - if (reduce == "amin" || reduce == "amax") 
{ - auto val = (reduce == "amin") ? std::numeric_limits::max() : std::numeric_limits::lowest(); - out_cont.masked_fill_(out_cont == val, (scalar_t)0); - } - - if (!out_is_contiguous) { - out.copy_(out_cont); - } - - }); - - return out; + } } Tensor masked_scatter(const Tensor & self, const Tensor & mask, const Tensor & source) { @@ -1566,7 +1744,14 @@ Tensor masked_select_backward(const Tensor& grad, const Tensor& input, const Ten // implicitly handles broadcasting). auto result = at::zeros_like( input.expand(at::infer_size(input.sizes(), mask.sizes())), at::MemoryFormat::Preserve); - return result.masked_scatter_(mask, grad); + + // for composite compliance, use out-of-place variant + // of `masked_scatter`. + if (areAnyTensorSubclassLike({grad, mask})) { + return result.masked_scatter(mask, grad); + } + result.masked_scatter_(mask, grad); + return result; } namespace { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index 56012881ac68..a0c282d550e4 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -2,16 +2,17 @@ // Indexing tensors by tensors -#include +#include +#include #include namespace at { - struct TensorIterator; +struct TensorIterator; } namespace at { namespace native { -enum class SCATTER_GATHER_OP: uint8_t {REDUCE_ADD, REDUCE_MULTIPLY}; +enum class SCATTER_GATHER_OP: uint8_t {REDUCE_ADD, REDUCE_MULTIPLY, REDUCE_MAXIMUM, REDUCE_MINIMUM, REDUCE_MEAN}; using index_put_with_sort_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool accumulate, bool unsafe); @@ -23,6 +24,8 @@ using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const T const Tensor& src, const SCATTER_GATHER_OP& reduce); using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, const Scalar& value, const SCATTER_GATHER_OP& reduce); +using scatter_reduce_two_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const SCATTER_GATHER_OP& reduce); DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub); @@ -32,6 +35,7 @@ DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub); DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub); DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); +DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub); TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 13283d244d67..2e723fdae538 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -5,10 +5,13 @@ #include #include #include +#include #include #include +#include #include #include +#include namespace at { namespace meta { @@ -29,12 +32,117 @@ const OptionalScalarRef max) { if (!min && !max) { TORCH_CHECK(false, "torch.clamp: At least one of 'min' or 'max' must not be None"); } + //Manual type promotion, since scalars have to participate in it + ScalarType result_type = self.scalar_type(); + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + //Floating is the highest supported + if (!isFloatingType(result_type)) { + at::native::ResultTypeState state = {}; + state = at::native::update_result_type_state(self, state); + + if (min) { + state = at::native::update_result_type_state(min.get(), state); 
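The promotion rule the clamp meta function enforces can be illustrated with a toy three-level dtype lattice (the enum, promote helper, and exception below are stand-ins, not ATen's promotion machinery): scalar bounds may promote the result type of a fresh output, but an in-place clamp whose output aliases self must not be silently upcast.

#include <stdexcept>

// Toy dtype lattice for illustration: Long < Float < Double.
enum class DType { Long, Float, Double };

DType promote(DType a, DType b) { return a > b ? a : b; }

// Mirrors the guard above: promotion is fine for a fresh output, but an
// in-place clamp (output is the same tensor as self) may not be cast up.
DType clamp_result_dtype(DType self, DType scalar_bound, bool out_is_self) {
  const DType result = promote(self, scalar_bound);
  if (out_is_self && result != self) {
    throw std::runtime_error("result type can't be cast to the desired output type");
  }
  return result;
}

int main() {
  // An integer tensor clamped with a float bound promotes out-of-place...
  clamp_result_dtype(DType::Long, DType::Float, /*out_is_self=*/false);
  // ...but the in-place variant refuses the silent upcast.
  try {
    clamp_result_dtype(DType::Long, DType::Float, /*out_is_self=*/true);
  } catch (const std::runtime_error&) { /* expected */ }
}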
+ } + if (max) { + state = at::native::update_result_type_state(max.get(), state); + } + result_type = at::native::result_type(state); + //disallow type promoting inplace op + TORCH_CHECK((result_type == self.scalar_type()) || + (!(maybe_get_output().defined()) || !(maybe_get_output().is_same(self))), + "result type ", result_type, " can't be cast to the desired output type ", + self.dtype()); + } + //make sure scalars weren't complex + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + build_unary_op(maybe_get_output(), self.to(result_type)); +} + +TORCH_META_FUNC2(clamp, Tensor) ( +const Tensor& self, +const OptionalTensorRef min, +const OptionalTensorRef max) { + TORCH_CHECK(min || max, "torch.clamp: At least one of 'min' or 'max' must not be None"); + TORCH_CHECK(!isComplexType(self.scalar_type()), "clamp is not supported for complex types"); + #define CLAMP_CONFIG() \ + TensorIteratorConfig() \ + .set_check_mem_overlap(true) \ + .add_output(maybe_get_output()) \ + .add_input(self) \ + .promote_inputs_to_common_dtype(true) \ + .cast_common_dtype_to_outputs(true) \ + .enforce_safe_casting_to_output(true) + + if (min && max) { + build(CLAMP_CONFIG().add_input(*min).add_input(*max)); + } else if (min) { + build(CLAMP_CONFIG().add_input(*min)); + } else if (max) { + build(CLAMP_CONFIG().add_input(*max)); + } +} + + +TORCH_META_FUNC(clamp_max) ( + const Tensor& self, + const Scalar& max +) { + //we could wrap max into tensor and send to tensor overload, + //but relu is implemented via clamp_min, so for perf an uniformity reasons + //do a faster but correct thing + ScalarType result_type = self.scalar_type(); + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + TORCH_CHECK(!max.isComplex(), "clamp is not supported for complex types"); + //Floating is the highest supported + if (!isFloatingType(result_type)) { + auto result_type = at::native::result_type(self, max); + TORCH_CHECK((result_type == self.scalar_type()) || + (!(maybe_get_output().defined()) || !(maybe_get_output().is_same(self))), + "result type ", result_type, " can't be cast to the desired output type ", + self.dtype()); + build_unary_op(maybe_get_output(), self.to(result_type)); + } else { + build_borrowing_unary_op(maybe_get_output(), self); + } +} + +TORCH_META_FUNC2(clamp_max, Tensor) ( + const Tensor& self, + const Tensor& max +) { + build_borrowing_binary_op(maybe_get_output(), self, max); +} + + +TORCH_META_FUNC(clamp_min) ( + const Tensor& self, + const Scalar& min +) { + ScalarType result_type = self.scalar_type(); + TORCH_CHECK(!isComplexType(result_type), "clamp is not supported for complex types"); + TORCH_CHECK(!min.isComplex(), "clamp is not supported for complex types"); + //Floating is the highest supported + if (!isFloatingType(result_type)) { + auto result_type = at::native::result_type(self, min); + TORCH_CHECK((result_type == self.scalar_type() || + !(maybe_get_output().defined()) || !(maybe_get_output().is_same(self))), + "result type ", result_type, " can't be cast to the desired output type ", + self.dtype()); + build_unary_op(maybe_get_output(), self.to(result_type)); + } else { + build_borrowing_unary_op(maybe_get_output(), self); + } +} - build_borrowing_unary_op(maybe_get_output(), self); +TORCH_META_FUNC2(clamp_min, Tensor) ( + const Tensor& self, + const Tensor& min +) { + build_borrowing_binary_op(maybe_get_output(), self, min); } TORCH_META_FUNC2(isin, Tensor_Tensor) ( - const Tensor& elements, const Tensor& test_elements, 
bool assume_unique, bool invert + const Tensor& elements, const Tensor& test_elements, bool /*assume_unique*/, bool /*invert*/ ) { check_for_unsupported_isin_dtype(elements.scalar_type()); check_for_unsupported_isin_dtype(test_elements.scalar_type()); @@ -42,7 +150,7 @@ TORCH_META_FUNC2(isin, Tensor_Tensor) ( } TORCH_META_FUNC2(isin, Tensor_Scalar) ( - const Tensor& elements, const c10::Scalar& test_elements, bool assume_unique, bool invert + const Tensor& elements, const c10::Scalar& test_elements, bool /*assume_unique*/, bool /*invert*/ ) { check_for_unsupported_isin_dtype(elements.scalar_type()); check_for_unsupported_isin_dtype(test_elements.type()); @@ -50,7 +158,7 @@ TORCH_META_FUNC2(isin, Tensor_Scalar) ( } TORCH_META_FUNC2(isin, Scalar_Tensor) ( - const c10::Scalar& elements, const Tensor& test_elements, bool assume_unique, bool invert + const c10::Scalar& elements, const Tensor& test_elements, bool /*assume_unique*/, bool /*invert*/ ) { check_for_unsupported_isin_dtype(elements.type()); check_for_unsupported_isin_dtype(test_elements.scalar_type()); @@ -105,8 +213,6 @@ DEFINE_DISPATCH(isposinf_stub); // NOLINT(cppcoreguidelines-avoid-non-const-glob DEFINE_DISPATCH(isneginf_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(mode_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(clamp_min_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(clamp_max_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_scalar_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_min_scalar_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(clamp_max_scalar_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) @@ -220,7 +326,7 @@ Tensor isfinite(const Tensor& self) { // Note: a complex value is finite iff both parts are finite if (self.is_complex()) { - return at::isfinite(self.abs()); + return at::isfinite(at::real(self)).__iand__(at::isfinite(at::imag(self))); } return AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "isfinite", [&]() { @@ -232,47 +338,6 @@ void _assert_async_cpu(const Tensor& self) { TORCH_CHECK(native::is_nonzero(self), "Expected Tensor with single nonzero value, but got zero"); } -namespace { - -// DO NOT USE THIS -- it's just an implementation detail of wrapped_scalar tensor below. -at::Tensor scalar_to_tensor_default_dtype( - const Scalar& s, - const Device device = at::kCPU) { - if (s.isFloatingPoint()) { - return at::scalar_tensor( - s, at::device(device).dtype(at::get_default_dtype())); - } else if (s.isBoolean()) { - return at::scalar_tensor(s, at::device(device).dtype(at::kBool)); - } else if (s.isComplex()) { - return at::scalar_tensor( - s, at::device(device).dtype(at::get_default_complex_dtype())); - } else { - TORCH_INTERNAL_ASSERT(s.isIntegral(false)); - return at::scalar_tensor(s, at::device(device).dtype(at::kLong)); - } -} - -// TLDR: Don't call `wrapped_scalar_tensor_default_dtype` -- this function is only necessary to support the partial -// type-promotion that torch.where supports. Once torch.where fully supports type promotion, we -// won't need this function. 
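On the isfinite change in this hunk: a complex value is treated as finite exactly when both its real and imaginary parts are finite, so no abs() of the value is involved. A self-contained check of that rule using std::complex:

#include <cmath>
#include <complex>
#include <cstdio>
#include <limits>

// A complex number is finite iff both components are finite; a NaN or Inf in
// either part makes it non-finite.
bool isfinite_complex(std::complex<double> z) {
  return std::isfinite(z.real()) && std::isfinite(z.imag());
}

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  const double nan = std::numeric_limits<double>::quiet_NaN();
  std::printf("%d %d %d\n",
              isfinite_complex({1.0, 2.0}),   // 1
              isfinite_complex({1.0, inf}),   // 0
              isfinite_complex({nan, 2.0}));  // 0
}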
-// -// Longer explanation: -// `wrapped_scalar_tensor_default_dtype` is a bit of a hack because torch.where doesn't support type promotion, but -// does support `torch.where(tensor, scalar1, scalar2)` with default scalar types. The trickiness is we -// usually convert double scalars to doubles, and `set_wrapped_number` defines type promotion priority -// as being below tensor types rather than as the default dtype (perhaps we should?). This wouldn't matter -// if we just supported type normal type promotion on torch.where, however. -Tensor wrapped_scalar_tensor_default_dtype( - const Scalar& scalar, - Device device) { - at::Tensor tensor; - tensor = scalar_to_tensor_default_dtype(scalar, device); - tensor.unsafeGetTensorImpl()->set_wrapped_number(true); - return tensor; -} - -} // anonymous namespace - // Sorting-based algorithm for isin(); used when the number of test elements is large. static void isin_sorting( const Tensor& elements, @@ -295,7 +360,7 @@ static void isin_sorting( // 2. Stable sort all elements, maintaining order indices to reverse the // operation. Stable sort is necessary to keep elements before test // elements within the sorted list. - Tensor all_elements = at::_cat({elements_flat, test_elements_flat}); + Tensor all_elements = at::cat({elements_flat, test_elements_flat}); Tensor sorted_elements, sorted_order; std::tie (sorted_elements, sorted_order) = all_elements.sort( /*stable=*/ true, /*dim=*/ 0, /*descending=*/ false); @@ -323,35 +388,58 @@ static void isin_sorting( } } -Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { - TORCH_CHECK(condition.device() == self.device() && self.device() == other.device(), - "Expected condition, x and y to be on the same device, but condition is on ", - condition.device(), " and x and y are on ", self.device(), " and ", other.device(), - " respectively"); - +Tensor& where_self_out(const Tensor& condition, const Tensor& self, const Tensor& other, Tensor& out) { + Tensor self_, other_; + if (self.dtype() != other.dtype()) { + auto result_type = at::native::result_type(self, other); + self_ = self.to(result_type); + other_ = other.to(result_type); + } else { + self_ = self; + other_ = other; + } if (condition.scalar_type() == ScalarType::Byte) { TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); -} else { + } else { TORCH_CHECK(condition.scalar_type() == ScalarType::Bool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition.scalar_type()); + } + Tensor cond_bool = condition.scalar_type() == ScalarType::Byte ? 
condition.to(ScalarType::Bool) : condition; + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(out) + .add_input(cond_bool) + .add_input(self_) + .add_input(other_) + .build(); + where_kernel(iter.device_type(), iter); + return out; } - c10::MaybeOwned b_condition, b_self, b_other; - std::tie(b_condition, b_self, b_other) = expand_outplace(condition, self, other, "where"); - return at::_s_where(*b_condition, *b_self, *b_other); +Tensor where(const Tensor& condition, const Tensor& self, const Tensor& other) { + auto result_type = at::native::result_type(self, other); + Tensor ret = at::empty({0}, self.options().dtype(result_type)); + at::native::where_self_out(condition, self, other, ret); + return ret; } Tensor where(const Tensor& condition, const Scalar& self, const Tensor& other) { - return at::where(condition, wrapped_scalar_tensor(self, other.device()), other); + auto result_type = at::native::result_type(other, self); + auto self_converted = at::scalar_tensor(self, other.options().dtype(result_type)); + auto other_converted = other.to(result_type); + return at::where(condition, self_converted, other_converted); } Tensor where(const Tensor& condition, const Tensor& self, const Scalar& other) { - return at::where(condition, self, wrapped_scalar_tensor(other, self.device())); + auto result_type = at::native::result_type(self, other); + auto other_converted = at::scalar_tensor(other, self.options().dtype(result_type)); + auto self_converted = self.to(result_type); + return at::where(condition, self_converted, other_converted); } Tensor where(const Tensor& condition, const Scalar& self, const Scalar& other) { - const auto device = condition.device(); - const Tensor& other_t = wrapped_scalar_tensor_default_dtype(other, device); - const Tensor& self_t = wrapped_scalar_tensor_default_dtype(self, device); + auto result_type = at::native::result_type(self, other); + const Tensor& other_t = at::scalar_tensor(other, condition.options().dtype(result_type)); + const Tensor& self_t = at::scalar_tensor(self, condition.options().dtype(result_type)); return at::where(condition, self_t, other_t); } @@ -359,22 +447,6 @@ std::vector where(const Tensor& condition) { return condition.nonzero_numpy(); } -Tensor _s_where(const Tensor& condition, const Tensor& self, const Tensor& other) { - TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype()); - Tensor ret = at::empty(self.sizes(), self.options()); - // - Tensor cond_bool = condition.scalar_type() == ScalarType::Byte ? 
condition.to(ScalarType::Bool) : condition; - auto iter = at::TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(ret) - .add_input(cond_bool) - .add_input(self) - .add_input(other) - .build(); - where_kernel(iter.device_type(), iter); - return ret; -} - std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { Tensor values = at::empty({0}, self.options()); Tensor indices = at::empty({0}, self.options().dtype(kLong)); @@ -485,13 +557,18 @@ std::tuple _aminmax(const Tensor& self, int64_t dim, bool keepdi TORCH_IMPL_FUNC(clamp_out) ( - const Tensor& self, + const Tensor& /*self*/, const OptionalScalarRef min, const OptionalScalarRef max, const Tensor& result) { using at::native::detail::ClampLimits; if (min && max) { - clamp_scalar_stub(device_type(), *this, min.get(), max.get()); + if (min.get().toDouble() != min.get().toDouble() || + max.get().toDouble() != max.get().toDouble()) { + at::fill_(const_cast(result), std::numeric_limits::quiet_NaN()); + } else { + clamp_scalar_stub(device_type(), *this, min.get(), max.get()); + } } else if (max) { clamp_max_scalar_stub(device_type(), *this, max.get()); } else if (min) { @@ -499,112 +576,47 @@ TORCH_IMPL_FUNC(clamp_out) } } -Tensor& clamp_out(const Tensor& self, const c10::optional& min, - const c10::optional& max, Tensor& result) { +TORCH_IMPL_FUNC(clamp_Tensor_out) +(const Tensor& self, const OptionalTensorRef min, + const OptionalTensorRef max, const Tensor&) { if (min && max) { - TORCH_CHECK(self.layout() == Layout::Strided, - "torch.clamp only supports strided layout, got: ", self.layout()); - auto iter = TensorIteratorConfig() - .set_check_mem_overlap(true) - .add_output(result) - .add_input(self) - .add_input(*min) - .add_input(*max) - .promote_inputs_to_common_dtype(true) - .cast_common_dtype_to_outputs(true) - .enforce_safe_casting_to_output(true) - .build(); - clamp_stub(iter.device_type(), iter); - } else if (max) { - at::clamp_max_outf(self, *max, result); + clamp_stub(device_type(), *this); } else if (min) { - at::clamp_min_outf(self, *min, result); - } else { - TORCH_CHECK(false, "torch.clamp: At least one of 'min' or 'max' must not be None"); + maximum_stub(device_type(), *this); + } else if (max) { + minimum_stub(device_type(), *this); } - return result; -} - -Tensor clamp(const Tensor& self, const c10::optional& min, const c10::optional& max) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_outf(self, min, max, result); } -Tensor clamp(const Tensor& self, const c10::optional& min, const c10::optional& max) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_outf(self, min, max, result); -} - -Tensor& clamp_(Tensor& self, const c10::optional& min, const c10::optional& max) { - return at::clamp_outf(self, min, max, self); -} - -Tensor& clamp_(Tensor& self, const c10::optional& min, const c10::optional& max) { - return at::clamp_outf(self, min, max, self); -} - -Tensor& clamp_max_out(const Tensor& self, const Scalar& max, Tensor& result) { - auto iter = TensorIterator::unary_op(result, self); - clamp_max_scalar_stub(iter.device_type(), iter, max); - return result; -} - -Tensor& clamp_max_out(const Tensor& self, const Tensor& max, Tensor& result) { - TORCH_CHECK(self.layout() == Layout::Strided, - "torch.clamp only supports strided layout, got: ", self.layout()); - auto iter = TensorIterator::borrowing_binary_op(result, self, max); - clamp_max_stub(iter.device_type(), iter); - return result; -} - -Tensor clamp_max(const Tensor& self, const Scalar& max) { - Tensor 
result = at::empty({0}, self.options()); - return at::clamp_max_outf(self, max, result); -} - -Tensor clamp_max(const Tensor& self, const Tensor& max) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_max_outf(self, max, result); -} - -Tensor& clamp_max_(Tensor& self, const Scalar& max) { - return at::clamp_max_outf(self, max, self); -} - -Tensor& clamp_max_(Tensor& self, const Tensor& max) { - return at::clamp_max_outf(self, max, self); -} - -Tensor& clamp_min_out(const Tensor& self, const Scalar& min, Tensor& result) { - auto iter = TensorIterator::unary_op(result, self); - clamp_min_scalar_stub(iter.device_type(), iter, min); - return result; -} - -Tensor& clamp_min_out(const Tensor& self, const Tensor& min, Tensor& result) { - TORCH_CHECK(self.layout() == Layout::Strided, - "torch.clamp only supports strided layout, got: ", self.layout()); - auto iter = TensorIterator::borrowing_binary_op(result, self, min); - clamp_min_stub(iter.device_type(), iter); - return result; -} - -Tensor clamp_min(const Tensor& self, const Scalar& min) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_min_outf(self, min, result); +TORCH_IMPL_FUNC(clamp_max_out) +(const Tensor& self, const Scalar& max, const Tensor& result) { + if (max.toDouble() != max.toDouble()) { +//TODO this is not great, building TI again is expensive, but I can't use +//fill_stub because fill is not structured +//this is a corner case anyway + at::fill_(const_cast(result), wrapped_scalar_tensor(max)); + } else { + clamp_max_scalar_stub(device_type(), *this, max); + } } -Tensor clamp_min(const Tensor& self, const Tensor& min) { - Tensor result = at::empty({0}, self.options()); - return at::clamp_min_outf(self, min, result); +TORCH_IMPL_FUNC(clamp_max_Tensor_out) +(const Tensor& self, const Tensor& max, const Tensor& result) { + minimum_stub(device_type(), *this); } -Tensor& clamp_min_(Tensor& self, const Scalar& min) { - return at::clamp_min_outf(self, min, self); +TORCH_IMPL_FUNC(clamp_min_out) +(const Tensor& self, const Scalar& min, const Tensor& result) { + if (min.toDouble() != min.toDouble()) { + at::fill_(const_cast(result), min); + } else { + clamp_min_scalar_stub(device_type(), *this, min); + } } -Tensor& clamp_min_(Tensor& self, const Tensor& min) { - return at::clamp_min_outf(self, min, self); +TORCH_IMPL_FUNC(clamp_min_Tensor_out) +(const Tensor& self, const Tensor& min, const Tensor& result) { + maximum_stub(device_type(), *this); } // Implements the "clip" alias for clamp @@ -646,13 +658,13 @@ std::tuple max(const Tensor& self, Dimname dim, bool keepdim) { std::tuple max_out(const Tensor& self, Dimname dim, bool keepdim, Tensor& max, Tensor& max_indices) { return at::max_out(max, max_indices, self, dimname_to_position(self, dim), keepdim); } -Tensor argmax(const Tensor& self, Dimname dim, bool keepdim) { +Tensor argmax(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) { reportNYIDimnameOverload("argmax"); } -Tensor argmin(const Tensor& self, Dimname dim, bool keepdim) { +Tensor argmin(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) { reportNYIDimnameOverload("argmin"); } -Tensor argsort(const Tensor& self, Dimname dim, bool keepdim) { +Tensor argsort(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) { reportNYIDimnameOverload("argsort"); } std::tuple mode(const Tensor& self, Dimname dim, bool keepdim) { diff --git a/aten/src/ATen/native/TensorCompare.h b/aten/src/ATen/native/TensorCompare.h index e81f96b0e24a..f35cd68d4806 100644 --- 
a/aten/src/ATen/native/TensorCompare.h +++ b/aten/src/ATen/native/TensorCompare.h @@ -32,10 +32,8 @@ DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub); using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool); DECLARE_DISPATCH(mode_fn, mode_stub); -using clamp_fn = void (*)(TensorIterator &); -DECLARE_DISPATCH(clamp_fn, clamp_stub); -DECLARE_DISPATCH(clamp_fn, clamp_min_stub); -DECLARE_DISPATCH(clamp_fn, clamp_max_stub); +using clamp_tensor_fn = void (*)(TensorIteratorBase &); +DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub); namespace detail { enum class ClampLimits {Min, Max, MinMax}; diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index 71690c4bf2d1..05691d2998df 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -2,8 +2,10 @@ #include #include #include +#include #include +#include namespace at { namespace native { @@ -51,34 +53,99 @@ Tensor _to_copy( // memory_format is handled separately due to MemoryFormat::Preserve logic options = self.options().merge_in(options).memory_format(c10::nullopt); auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve); + // TODO: Use the dispatcher for this. + // Currently there are unenumerated extensibility issues preventing this. + if (self.is_sparse_csr()) { + TORCH_CHECK( + memory_format == MemoryFormat::Preserve, + "sparse_csr only supports memory format Preserve, but got ", + memory_format, + " instead."); + + auto new_values = at::native::to( + self.values(), + dtype, + c10::kStrided, // values are strided + device, + pin_memory, + non_blocking, + true, // force copy since we're in _to_copy + memory_format); + + auto new_crow_indices = at::native::to( + self.crow_indices(), + self.crow_indices().scalar_type(), // indices are integral + c10::kStrided, // indices are strided + device, + pin_memory, + non_blocking, + true, // force copy since we're in _to_copy + memory_format); + + auto new_col_indices = at::native::to( + self.col_indices(), + self.col_indices().scalar_type(), // indices are integral + c10::kStrided, // indices are strided + device, + pin_memory, + non_blocking, + true, // force copy since we're in _to_copy + memory_format); + + return at::native::_sparse_csr_tensor_unsafe( + new_crow_indices, + new_col_indices, + new_values, + self.sizes(), + new_values.scalar_type(), + self.layout(), + new_values.device()); + } bool pin_out = (non_blocking && self.is_cuda() && options.device().is_cpu() && (options.layout() == c10::kStrided)); if (memory_format == MemoryFormat::Preserve) { - if (self.is_non_overlapping_and_dense() && options.device().supports_as_strided()) { - Tensor r; - if (self.is_quantized()) { - r = at::empty_quantized(self.sizes(), self, options); - at::QuantizerPtr quantizer = r.quantizer(); - r.copy_(self, non_blocking); - set_quantizer_(r, quantizer); + if (options.device().supports_as_strided()) { + if (self.is_non_overlapping_and_dense()) { + Tensor r; + if (self.is_quantized()) { + r = at::empty_quantized(self.sizes(), self, options); + at::QuantizerPtr quantizer = r.quantizer(); + r.copy_(self, non_blocking); + set_quantizer_(r, quantizer); + } else { + r = at::empty_strided( + self.sizes(), + self.strides(), + options.pinned_memory(pin_out)); + r.copy_(self, non_blocking); + } + return r; + } else if (!self.is_quantized() && self.layout() == kStrided) { + Tensor r; + auto strides = infer_dense_strides(self.sizes(), self.strides()); + r = at::empty_strided( + self.sizes(), + 
strides, + options.pinned_memory(pin_out)); + r.copy_(self, non_blocking); + return r; } else { - r = at::empty_strided( - self.sizes(), - self.strides(), - options.pinned_memory(pin_out)); - r.copy_(self, non_blocking); + memory_format = self.suggest_memory_format(); } - return r; } else { memory_format = self.suggest_memory_format(); } } // See Note [Explicit nullopt MemoryFormat argument] - auto r = at::empty(self.sizes(), - options.memory_format(memory_format).pinned_memory(pin_out), - c10::nullopt); + // TODO: empty_quantized does not work here. It raises an exception in CheckMemoryFormat.h prior to + // empty_affine_quantizd/_empty_per_channel_affine_quantized calls + // at::empty also does not work here because there is no proper at::empty support for quantized tensors + // as it would return a quantized tensor with an UnknownQuantizer + auto r = self.is_quantized() ? at::empty_like(self, memory_format) + : at::empty(self.sizes(), + options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt); r.copy_(self, non_blocking); return r; } @@ -240,11 +307,14 @@ Tensor to_dense_backward(const Tensor& grad, const Tensor& input_) { if (input_.layout() == c10::kSparse) { auto input = input_.coalesce(); return grad.sparse_mask(input); - } else if (input_.layout() == c10::kMkldnn) { + } + if (input_.layout() == c10::kMkldnn) { return grad.to_mkldnn(input_.scalar_type()); - } else { - AT_ERROR("Unsupported input layout: ", input_.layout()); } + if (input_.layout() == c10::kStrided) { + return grad.to_dense(); + } + AT_ERROR("to_dense_backward: Unsupported input layout: ", input_.layout()); } Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) { @@ -252,6 +322,44 @@ Tensor to_mkldnn_backward(const Tensor& grad, const Tensor& input_) { return grad.to_dense(input_.scalar_type()); } +Tensor to_dense(const Tensor& tensor, c10::optional dtype) { + if (tensor.layout() == c10::kSparse) { + return tensor._to_dense(dtype); + } + if (tensor.layout() == c10::kSparseCsr || tensor.layout() == c10::kSparseCsc) { + return tensor._to_dense(dtype); + } + if (tensor.layout() == c10::kMkldnn) { + return tensor._to_dense(dtype); + } + TORCH_CHECK(tensor.layout() == c10::kStrided, "to_dense does not support layout ", tensor.layout()); + if (dtype) { + return tensor.to(*dtype); + } + return tensor; +} + +Tensor sparse_to_dense( + const Tensor& self, + c10::optional dtype) { + TORCH_CHECK( + !dtype.has_value(), "dtype argument is not supported by sparse_to_dense"); + Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); + return dst.add_(self); +} + +Tensor sparse_compressed_to_dense( + const Tensor& self, + c10::optional dtype) { + TORCH_CHECK( + !dtype.has_value(), "dtype argument is not supported by sparse_csr_to_dense"); + if (self.layout() == kSparseCsr) { + Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); + return dst.add_(self); + } + return self.to_sparse().to_dense(); +} + // Computes the strides for view_dtype output when the view dtype is // smaller than the original dtype inline DimVector compute_strides_for_view_dtype_downsize(IntArrayRef old_strides, int64_t size_ratio, ScalarType old_dtype, ScalarType new_dtype) { @@ -371,4 +479,502 @@ Tensor view_dtype(const Tensor& self, ScalarType dtype) { return new_tensor; } +// Sparse layout conversions Start + +Tensor dense_to_sparse_csr(const Tensor& self) { + return self.to_sparse().to_sparse_csr(); +} + +Tensor dense_to_sparse_csc(const Tensor& self) { + return 
self.to_sparse().to_sparse_csc(); +} + +Tensor dense_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsr is currently not supported."); + return self; +} + +Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsc is currently not supported."); + return self; +} + +Tensor sparse_compressed_to_sparse_csr(const Tensor& self) { + if (self.layout() == kSparseCsc) { + TORCH_CHECK( + self.dim() == 2, + "Expected self to be of dimension 2, but got ", + self.dim(), + "."); + auto sizes = self.sizes(); + auto ccol_indices = self.ccol_indices(); + auto row_indices = self.row_indices(); + auto values = self.values(); + + // convert CSC indices to COO indices and swap its rows + const bool out_int32 = ccol_indices.scalar_type() == ScalarType::Int; + Tensor indices_transposed = _convert_indices_from_csr_to_coo( + ccol_indices, row_indices, out_int32, true); + + // sort transposed indices + auto indices_scalar = + at::sparse::flatten_indices(indices_transposed, {sizes[0], sizes[1]}); + auto indicesPermutation = std::get<1>(indices_scalar.sort(0)); + auto indices_transposed_sorted = + indices_transposed.index_select(1, indicesPermutation); + + // construct a CSR tensor + auto new_row_indices = indices_transposed_sorted.select(0, 0); + auto new_col_indices = indices_transposed_sorted.select(0, 1); + auto new_values = values.index_select(0, indicesPermutation); + Tensor new_crow_indices = + _convert_indices_from_coo_to_csr(new_row_indices, sizes[0], out_int32); + + return _sparse_csr_tensor_unsafe( + new_crow_indices, + new_col_indices, + new_values, + {sizes[0], sizes[1]}, + new_values.scalar_type(), + c10::kSparseCsr, + new_values.device()); + } + if (self.layout() == kSparseCsr) { + // Just returning self doesn't work + // RuntimeError: t.use_count() <= 1 INTERNAL ASSERT FAILED at + // "../torch/csrc/autograd/autograd_not_implemented_fallback.cpp":152, + // please report a bug to PyTorch. 
aten::to_sparse_csr + return at::native::_sparse_csr_tensor_unsafe( + self.crow_indices(), + self.col_indices(), + self.values(), + self.sizes(), + self.scalar_type(), + c10::kSparseCsr, + self.device()); + } + AT_ERROR( + "sparse_compressed_to_sparse_csr expected SparseCsr or SparseCsc layout but got ", + self.layout()); +} + +Tensor coo_to_sparse_csr(const Tensor& self) { + TORCH_CHECK( + self.dim() == 2, + "Only 2D tensors can be converted to the SparseCsr layout but got shape: ", + self.sizes()); + auto coalesced_self = self.coalesce(); + auto row_indices = coalesced_self.indices()[0]; + bool out_int32 = (row_indices.scalar_type() == at::kInt); + auto crow_indices = at::_convert_indices_from_coo_to_csr( + row_indices, self.size(0), out_int32); + return at::native::_sparse_csr_tensor_unsafe( + crow_indices, + coalesced_self.indices()[1].contiguous(), + coalesced_self.values(), + coalesced_self.sizes(), + coalesced_self.scalar_type(), + c10::kSparseCsr, + coalesced_self.device()); +} + +Tensor coo_to_sparse_csc(const Tensor& self) { + TORCH_CHECK( + self.dim() == 2, + "Only 2D tensors can be converted to the SparseCsc layout but got shape: ", + self.sizes()); + auto coalesced_self = self.transpose(0, 1).coalesce().to_sparse_csr(); + return at::native::_sparse_csc_tensor_unsafe( + coalesced_self.crow_indices(), + coalesced_self.col_indices(), + coalesced_self.values(), + self.sizes(), + coalesced_self.scalar_type(), + c10::kSparseCsc, + coalesced_self.device()); +} + +Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsr is currently not supported."); + return self; +} + +Tensor coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsc is currently not supported."); + return self; +} + +namespace { +template +void convert_indices_from_coo_to_csr_cpu( + const Tensor& result, + const Tensor& input, + const int64_t size) { + int64_t numel = input.numel(); + const input_t* data_in = input.data_ptr(); + output_t* data_out = result.data_ptr(); + + if (numel == 0) { + result.zero_(); + return; + } + + for (int64_t i = 0; i <= data_in[0]; i++) + data_out[i] = static_cast(0); + + at::parallel_for( + 0, numel - 1, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) { + input_t curr_value = data_in[start], next_value; + for (const auto i : c10::irange(start, end)) { + next_value = data_in[i + 1]; + for (; curr_value < next_value; curr_value++) + data_out[curr_value + 1] = static_cast(i + 1); + } + }); + for (int64_t i = data_in[numel - 1] + 1; i < size + 1; i++) { + data_out[i] = static_cast(numel); + } +} + +template +void convert_indices_from_csr_to_coo_cpu( + const Tensor& indices, + const Tensor& crow_indices, + const Tensor& col_indices, + const bool transpose = false) { + int64_t nrows = crow_indices.numel() - 1; + if (nrows == 0) { + indices.zero_(); + return; + } + auto crow_indices_ = crow_indices.expect_contiguous(); + const input_t* crow_indices_data_in = crow_indices_->data_ptr(); + TORCH_INTERNAL_ASSERT(indices.is_contiguous()); + auto row0 = indices.select(0, transpose ? 1 : 0); + auto row1 = indices.select(0, transpose ? 
0 : 1); + output_t* data_out = row0.data_ptr(); + row1.copy_(*col_indices.expect_contiguous()); + at::parallel_for( + 0, nrows, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) { + for (const auto i : c10::irange(start, end)) { + std::fill( + &data_out[crow_indices_data_in[i]], + &data_out[crow_indices_data_in[i + 1]], + static_cast(i)); + } + }); +} +} // namespace + +TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_cpu) +(const Tensor& input, + const int64_t size, + const bool out_int32, + const Tensor& result) { + if (out_int32) { + AT_DISPATCH_INTEGRAL_TYPES( + input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { + convert_indices_from_coo_to_csr_cpu( + result, input, size); + }); + } else { + AT_DISPATCH_INTEGRAL_TYPES( + input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { + convert_indices_from_coo_to_csr_cpu( + result, input, size); + }); + } +} + +TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu) +(const Tensor& crow_indices, + const Tensor& col_indices, + const bool out_int32, + const bool transpose, + const Tensor& result) { + if (out_int32) { + AT_DISPATCH_INTEGRAL_TYPES( + crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { + convert_indices_from_csr_to_coo_cpu( + result, crow_indices, col_indices, transpose); + }); + } else { + AT_DISPATCH_INTEGRAL_TYPES( + crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { + convert_indices_from_csr_to_coo_cpu( + result, crow_indices, col_indices, transpose); + }); + } +} + +/* + * Based on + * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h + */ +template +void _csr_to_block_csr_cpu_kernel( + const I n_row, + const I n_col, + const I R, + const I C, + const I* input_crow_indices, + const I* input_col_indices, + const T* input_values, + I* result_crow_indices, + I* result_col_indices, + T* result_values) { + // All blocks are possible, that is, may be allocated if a single non-zero + // value lives within them. Otherwise they're not. + + // Allocate pointers for all possible column blocks plus 1 + std::vector blocks(n_col / C + 1, (T*)0); + + assert(n_row % R == 0); + assert(n_col % C == 0); + + // Major assumptions + // 1. Blocks must be square + + // Number of blocks along rows + I n_brow = n_row / R; + // Number of blocks along columns + // I n_bcol = n_col / C; + + // Number of elements per block + I RC = R * C; + // Number of blocks overall + I n_blks = 0; + + result_crow_indices[0] = 0; + + // Iterate over blocks along rows + for (I block_i = 0; block_i < n_brow; block_i++) { + // Iterate over rows within block + for (I r = 0; r < R; r++) { + I i = R * block_i + r; // row index + for (I jj = input_crow_indices[i]; jj < input_crow_indices[i + 1]; jj++) { + I j = input_col_indices[jj]; // column index + + // Block corresponding to column index + I block_j = j / C; + // Column within block + I c = j % C; + + if (blocks[block_j] == 0) { + blocks[block_j] = result_values + RC * n_blks; + result_col_indices[n_blks] = block_j; + n_blks++; + } + + // Specific blocks entries should not be visited more than once. + // Scipy code does an addition here. Why? 
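The kernel above builds CSR row pointers by scanning the sorted COO row indices in parallel. An equivalent, purely serial construction by counting and prefix-summing is sketched below (hypothetical names, no dtype dispatch); either way, row r's entries end up in the half-open range [crow[r], crow[r+1]).

#include <cstdio>
#include <vector>

// Build compressed row pointers from sorted COO row indices.
// `rows` must be sorted ascending; n_rows is the number of matrix rows.
std::vector<long> coo_rows_to_crow(const std::vector<long>& rows, long n_rows) {
  std::vector<long> crow(n_rows + 1, 0);
  for (long r : rows) crow[r + 1] += 1;                       // nonzeros per row
  for (long i = 0; i < n_rows; ++i) crow[i + 1] += crow[i];   // prefix sum
  return crow;
}

int main() {
  // Nonzeros in rows 0, 0, 1, 3 of a 4-row matrix.
  auto crow = coo_rows_to_crow({0, 0, 1, 3}, 4);
  for (long v : crow) std::printf("%ld ", v);  // 0 2 3 3 4
  std::printf("\n");
}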
+ *(blocks[block_j] + C * r + c) = input_values[jj]; + } + } + + for (I jj = input_crow_indices[R * block_i]; + jj < input_crow_indices[R * (block_i + 1)]; + jj++) { + blocks[input_col_indices[jj] / C] = 0; + } + + result_crow_indices[block_i + 1] = n_blks; + } +} + +/* + * Based on + * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h + */ +template +I csr_count_blocks( + const I n_row, + const I n_col, + const I R, + const I C, + const I Ap[], + const I Aj[]) { + std::vector mask(n_col / C + 1, -1); + I n_blks = 0; + for (I i = 0; i < n_row; i++) { + I bi = i / R; + for (I jj = Ap[i]; jj < Ap[i + 1]; jj++) { + I bj = Aj[jj] / C; + if (mask[bj] != bi) { + mask[bj] = bi; + n_blks++; + } + } + } + return n_blks; +} + +Tensor _csr_to_block_csr_cpu(const Tensor& self, IntArrayRef blocksize) { + TORCH_CHECK( + blocksize[0] == blocksize[1], + "blocks must be square. ", + "Got (", + blocksize[0], + ", ", + blocksize[1], + ") instead."); + TORCH_CHECK( + self.size(0) % blocksize[0] == 0 && self.size(1) % blocksize[1] == 0, + "Block sparse CSR Tensors must have a size that is an ", + "integral multiple of their block size. ", + "Got Tensor of size (", + self.size(0), + ", ", + self.size(1), + ") with block size (", + blocksize[0], + ", ", + blocksize[1], + ") instead."); + Tensor input_values = self.values().contiguous(); + Tensor input_crow_indices = self.crow_indices().contiguous(); + Tensor input_col_indices = self.col_indices().contiguous(); + + // First we determine the number of blocks needed. For each given block, if it + // contains a non-zero element we will allocate values and indices for it. + int64_t num_blocks; + int64_t n_row = self.size(0); + int64_t n_col = self.size(1); + AT_DISPATCH_INDEX_TYPES( + input_crow_indices.scalar_type(), "_csr_to_block_csr_cpu", [&] { + num_blocks = csr_count_blocks( + n_row, + n_col, + blocksize[0], + blocksize[1], + input_crow_indices.data_ptr(), + input_col_indices.data_ptr()); + }); + + Tensor result_values = + input_values.new_zeros({num_blocks, blocksize[0], blocksize[1]}); + Tensor result_crow_indices = + input_crow_indices.new_empty({(n_row / blocksize[0]) + 1}); + Tensor result_col_indices = input_col_indices.new_empty({num_blocks}); + + // Next we copy over non-zero elements into the allocated blocks. 
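The block-counting pass shown above (csr_count_blocks) can be exercised on its own. The sketch below repeats the same mask trick with plain vectors on a 4x4 CSR matrix and 2x2 blocks, so each (block-row, block-column) pair is counted exactly once.

#include <cstdio>
#include <vector>

// Count how many RxC blocks of an n_row x n_col CSR matrix hold at least one
// stored element: mask[bj] remembers the last block-row that touched block
// column bj, so repeated hits within a block-row are not double counted.
long count_blocks(long n_row, long n_col, long R, long C,
                  const std::vector<long>& crow,
                  const std::vector<long>& col) {
  std::vector<long> mask(n_col / C + 1, -1);
  long n_blocks = 0;
  for (long i = 0; i < n_row; ++i) {
    const long bi = i / R;  // block-row of row i
    for (long jj = crow[i]; jj < crow[i + 1]; ++jj) {
      const long bj = col[jj] / C;  // block-column of this nonzero
      if (mask[bj] != bi) {
        mask[bj] = bi;
        ++n_blocks;
      }
    }
  }
  return n_blocks;
}

int main() {
  // 4x4 matrix with nonzeros at (0,0), (0,1), (1,0), (3,3): two 2x2 blocks.
  std::vector<long> crow = {0, 2, 3, 3, 4};
  std::vector<long> col = {0, 1, 0, 3};
  std::printf("%ld\n", count_blocks(4, 4, 2, 2, crow, col));  // 2
}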
+ AT_DISPATCH_INDEX_TYPES( + input_crow_indices.scalar_type(), "_csr_to_block_csr_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + input_values.scalar_type(), "_csr_to_block_csr_cpu", [&] { + _csr_to_block_csr_cpu_kernel( + n_row, + n_col, + blocksize[0], + blocksize[1], + input_crow_indices.data_ptr(), + input_col_indices.data_ptr(), + input_values.data_ptr(), + result_crow_indices.data_ptr(), + result_col_indices.data_ptr(), + result_values.data_ptr()); + }); + }); + return at::native::_sparse_bsr_tensor_unsafe( + result_crow_indices, + result_col_indices, + result_values, + self.sizes(), + result_values.scalar_type(), + c10::kSparseBsr, + result_values.device()); +} + +Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) { + TORCH_CHECK( + self.is_sparse_csr(), + "Can only convert CSR to SparseBsr, but got ", + self.layout(), + " instead."); + Tensor self_values = self.values(); + Tensor self_crow_indices = self.crow_indices(); + Tensor self_col_indices = self.col_indices(); + Tensor cpu_result = _csr_to_block_csr_cpu( + _sparse_csr_tensor_unsafe( + self_crow_indices.cpu(), + self_col_indices.cpu(), + self_values.cpu(), + self.sizes(), + self_values.scalar_type(), + self.layout(), + self_values.device()), + blocksize); + Tensor result_values = cpu_result.values().to(self_values.options()); + Tensor result_crow_indices = + cpu_result.crow_indices().to(self_crow_indices.options()); + Tensor result_col_indices = + cpu_result.col_indices().to(self_col_indices.options()); + return at::native::_sparse_bsr_tensor_unsafe( + result_crow_indices, + result_col_indices, + result_values, + self.sizes(), + result_values.scalar_type(), + c10::kSparseBsr, + result_values.device()); +} + +Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) { + AT_ERROR( + "Conversion from ", self.layout(), " to SparseBsc is currently not supported."); + return self; +} + +Tensor sparse_compressed_to_sparse_csc(const Tensor& self) { + if (self.layout() == kSparseCsc) { + // Based on to_sparse_csr just returning self doesn't work + return _sparse_csc_tensor_unsafe( + self.ccol_indices(), + self.row_indices(), + self.values(), + self.sizes(), + self.scalar_type(), + c10::kSparseCsc, + self.device()); + } + AT_ERROR( + "Conversion from ", self.layout(), " to SparseCsc is currently not supported."); +} + +Tensor sparse_compressed_to_sparse(const Tensor& self, int64_t sparse_dim) { + TORCH_CHECK(sparse_dim > 0, "sparse_dim must be >0"); + TORCH_CHECK(sparse_dim <= 2, + "sparse_dim must be less than or equal to 2"); + // TODO: implement coo.to_sparse(sparse_dim) and then use + // return self.to_sparse().to_sparse(sparse_dim); + TORCH_CHECK( + sparse_dim == 2, "sparse dim 1 is not supported by sparse_csr_to_dense"); + if (self.layout() == kSparseCsc) { + Tensor indices = at::_convert_indices_from_csr_to_coo( + self.ccol_indices(), self.row_indices(), false, true); + return at::native::_sparse_coo_tensor_unsafe( + indices, self.values(), self.sizes()) + ._coalesced_(true); + } + if (self.layout() == kSparseCsr) { + Tensor indices = at::_convert_indices_from_csr_to_coo( + self.crow_indices(), self.col_indices(), false, false); + return at::native::_sparse_coo_tensor_unsafe( + indices, self.values(), self.sizes()) + ._coalesced_(true); + } + AT_ERROR( + "sparse_compressed_to_sparse expected SparseCsr or SparseCsc layout but got ", + self.layout()); +} + +Tensor sparse_compressed_to_sparse(const Tensor& self) { + return sparse_compressed_to_sparse(self, 2); +} + +// 
Sparse layout conversions End }} // namespace at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 458a694411e4..4494ff16eb6b 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -110,9 +110,9 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ complex / polar ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ void complex_check_floating(const Tensor& a, const Tensor& b) { - TORCH_CHECK((a.scalar_type() == kFloat || a.scalar_type() == kDouble) && - (b.scalar_type() == kFloat || b.scalar_type() == kDouble), - "Expected both inputs to be Float or Double tensors but got ", + TORCH_CHECK((a.scalar_type() == kFloat || a.scalar_type() == kDouble || a.scalar_type() == kHalf) && + (b.scalar_type() == kFloat || b.scalar_type() == kDouble || b.scalar_type() == kHalf), + "Expected both inputs to be Half, Float or Double tensors but got ", a.scalar_type(), " and ", b.scalar_type()); } @@ -932,7 +932,7 @@ Tensor& randperm_out_cpu(int64_t n, c10::optional generator, Tensor& auto gen = get_generator_or_default(generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.scalar_type(), "randperm", [&]() -> void { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, result.scalar_type(), "randperm", [&]() -> void { randperm_cpu(result, n, gen); }); @@ -1344,6 +1344,11 @@ Tensor kaiser_window( TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); window_function_checks("kaiser_window", options, window_length); + // short-circuit for `meta`. + if (device == kMeta) { + return at::empty({window_length}, options); + } + if (window_length == 0) { return at::empty({0}, options); } diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index 9ef00d619675..35e058df4b3a 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -1,11 +1,17 @@ #pragma once -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { // Different combinations of row, col, and offset can lead to two cases: // @@ -29,6 +35,10 @@ namespace at { namespace native { // In this case, we first calculate the size of top trapezoid, and then // calculate the size of the bottom rectangle. inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { + // If either dimension is 0 then the there is no tril + if (row == 0 || col == 0) { + return 0; + } // number of elements in the first row of the tril auto m_first_row = offset > 0 ? 
std::min(col, 1 + offset) : // upper bounded by col @@ -95,7 +105,7 @@ struct ZeroTensorAllocator final : public at::Allocator { static void deleter(void* const pointer) { TORCH_INTERNAL_ASSERT(!pointer); } - DataPtr allocate(const size_t nbytes) const override { + DataPtr allocate(const size_t /*nbytes*/) const override { return {nullptr, nullptr, &deleter, device_}; } DeleterFnPtr raw_deleter() const override { diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 63d928749e09..fd72abc580b4 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include @@ -31,7 +31,7 @@ int64_t stride(const Tensor& self, Dimname dim) { return self.strides()[pos_dim]; } -bool cudnn_is_acceptable(const Tensor& self) { +bool cudnn_is_acceptable(const TensorBase& self) { if (!globalContext().userEnabledCuDNN()) return false; if (!self.is_cuda()) return false; auto st = self.scalar_type(); @@ -48,6 +48,10 @@ bool cudnn_is_acceptable(const Tensor& self) { return true; } +bool cudnn_is_acceptable(const Tensor& self) { + return cudnn_is_acceptable(static_cast(self)); +} + Tensor & detach_(Tensor & self) { // this just exists to give us a hook in VariableType and an entry in Declarations.yaml //AT_ERROR("detach_ is not implemented for Tensor"); diff --git a/aten/src/ATen/native/TensorProperties.h b/aten/src/ATen/native/TensorProperties.h new file mode 100644 index 000000000000..fe6e8395c178 --- /dev/null +++ b/aten/src/ATen/native/TensorProperties.h @@ -0,0 +1,12 @@ +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +namespace at { +class TensorBase; +} + +namespace at { namespace native { + +TORCH_API bool cudnn_is_acceptable(const TensorBase& self); + +}} // namespace at::native diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 3999805fee14..9d05610f4fdb 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,139 @@ #include namespace at { +namespace meta { +inline void cat_check_no_zero_dim(const MaterializedITensorListRef& tensors) { + size_t i = 0; + for (const Tensor& t : tensors) { + TORCH_CHECK( + t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); + i++; + } +} + +inline c10::MemoryFormat cat_compute_output_memory_format(const MaterializedITensorListRef& inputs) { + c10::optional format = c10::nullopt; + for (const Tensor& t : inputs) { + auto f = t.suggest_memory_format(); + if (f == c10::MemoryFormat::Contiguous) { + return f; + } + if (format.has_value() && format.value() != f) { + return c10::MemoryFormat::Contiguous; + } + format = f; + } + return format.value(); +} + +TORCH_PRECOMPUTE_META_FUNC(cat)(ITensorListRef tensors, int64_t dim) { + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible + // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors + // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific + // size (i.e. other empty sizes are not skipped). + auto materialized = tensors.materialize(); + + cat_check_no_zero_dim(materialized); + dim = at::legacy_cat_wrap_dim(dim, tensors); + + // Checking names before the actual dimensions. 
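  // Illustrative note (a minimal sketch; variable names below are hypothetical): the
  // dtype promotion computed further down via at::native::result_type(tensors)
  // behaves roughly like
  //   auto a = at::ones({2, 3}, at::kFloat);
  //   auto b = at::ones({4, 3}, at::kDouble);
  //   at::cat({a, b}, 0).scalar_type();   // expected to be at::kDouble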
+ auto maybe_outnames = namedinference::compute_cat_outnames(tensors); + + TORCH_CHECK( + materialized.size() > 0, "torch.cat(): expected a non-empty list of Tensors"); + + // Look for the first valid tensor. + size_t valid = materialized.size(); + for (const auto i : c10::irange(materialized.size())) { + if (!at::native::cat_should_skip_tensor(materialized[i].get())) { + valid = i; + break; + } + } + + bool all_contiguous = true; + bool all_same_dtype = true; + bool all_same_sizes_and_stride = true; + auto memory_format = cat_compute_output_memory_format(materialized); + + // Compute what the output dtype should be: + const auto& result = maybe_get_output(); + auto is_out_defined = result.defined(); + auto out_dtype = at::native::result_type(tensors); + + // If the output tensor is defined, we need to take it into account + // when computing the actual output dtype and the flags. + if (is_out_defined) { + // Check for type promotion, if the output tensor is defined. + TORCH_CHECK( + canCast(out_dtype, result.scalar_type()), + "torch.cat(): input types can't be cast to the desired output type ", + result.scalar_type()); + out_dtype = result.scalar_type(); + all_contiguous = result.is_contiguous(memory_format); + } + + // Fallback 'set_output' parameters. + // (in case we don't find a valid tensor) + DimVector sizes {0}; + TensorOptions options = materialized[0].get().options() + .dtype(out_dtype) + .memory_format(memory_format); + + // If we found a valid tensor, check whether the input tensors + // are compatible, i.e. we can execute `cat` on them. + bool found_valid_tensor = valid < materialized.size(); + if (found_valid_tensor) { + TORCH_CHECK( + dim <= materialized[valid].get().dim(), "torch.cat(): dimension ", dim, "out of range"); + + // Compute the output tensor size. + // It should have the same shape as any other valid tensor, + // except in the dimension 'dim'. + size_t size_at_dim = 0; + for (const auto i : c10::irange(materialized.size())) { + const Tensor& t = materialized[i]; + if (!at::native::cat_should_skip_tensor(t)) { + at::native::check_cat_shape_except_dim(materialized[valid], t, dim, i); + size_at_dim += t.size(dim); + all_contiguous = all_contiguous && t.is_contiguous(memory_format); + all_same_dtype = all_same_dtype && out_dtype == t.scalar_type(); + all_same_sizes_and_stride = all_same_sizes_and_stride && + t.sizes() == materialized[valid].get().sizes() && + t.strides() == materialized[valid].get().strides(); + } else { + all_contiguous = false; + } + } + + // Actually set the output. + sizes = materialized[valid].get().sizes().vec(); + sizes[dim] = size_at_dim; + options = materialized[valid].get().options() + .dtype(out_dtype) + .memory_format(memory_format); + } + + set_output(0, sizes, {}, options, maybe_outnames); + // Checks for overlaps between the inputs and the output tensor. 
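  // Illustrative sketch of what these checks reject (names are hypothetical, not from
  // this change):
  //   auto buf = at::ones({4, 2});
  //   auto chunk = buf.narrow(0, 0, 2);      // a view sharing storage with buf
  //   at::cat_out(buf, {chunk, chunk}, 0);   // expected to throw: out= aliases an input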
+ if (is_out_defined && found_valid_tensor) { + at::assert_no_internal_overlap(result); + for (const Tensor& t : materialized) { + at::assert_no_overlap(result, t); + } + } + + return TORCH_PRECOMPUTE_STRUCT(cat)() + .set_dim(dim) + .set_valid(valid) + .set_all_contiguous(all_contiguous) + .set_all_same_dtype(all_same_dtype) + .set_all_same_sizes_and_stride(all_same_sizes_and_stride) + .set_memory_format(memory_format); +} +} // namespace meta + namespace native { DEFINE_DISPATCH(cat_serial_stub); @@ -59,12 +193,19 @@ Tensor& set_storage_cpu_(Tensor& result, Storage storage, int64_t storage_offset checkSetStorage(result, storage, storage_offset, size, stride); result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); - c10::optional stride_opt = stride.data() != nullptr ? - c10::optional(stride) : c10::nullopt; - at::native::resize_impl_cpu_(result.unsafeGetTensorImpl(), size, stride_opt); + at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ? + at::OptionalIntArrayRef(stride) : c10::nullopt; + // We can re-use this kernel for the meta device. + // We just need to make sure we don't actually try to resize the (null) storage. + at::native::resize_impl_cpu_(result.unsafeGetTensorImpl(), size, stride_opt, /*resize_storage=*/!result.is_meta()); return result; } +Tensor& set_(Tensor& result, const Tensor& storage, int64_t storage_offset, IntArrayRef size, IntArrayRef stride) { + TORCH_CHECK(storage.is_contiguous(), "passed in tensor to be used as storage must be contiguous"); + return result.set_(storage.storage(), storage_offset + storage.storage_offset(), size, stride); +} + Tensor& set_tensor_(Tensor& result, const Tensor& source) { if (result.unsafeGetTensorImpl() != source.unsafeGetTensorImpl()) { return result.set_(source.storage(), source.storage_offset(), source.sizes(), source.strides()); @@ -87,6 +228,19 @@ Tensor& set_cpu_(Tensor& result) { return result; } +// We can't re-use the cpu kernel here because we don't want to use the cpu allocator. +Tensor& set_meta_(Tensor& result) { + caffe2::TypeMeta dtype = result.dtype(); + Storage storage( + Storage::use_byte_size_t(), + 0, + c10::GetAllocator(kMeta), + true); + result.set_(storage, 0, {0}, {}); + TORCH_INTERNAL_ASSERT(dtype == result.dtype()); + return result; +} + Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) { TORCH_CHECK(self.is_sparse(), "input must be sparse tensor"); int64_t sparse_extra_ndim = size.size() - self.dim(); @@ -171,132 +325,49 @@ std::vector broadcast_tensors(TensorList tensors) { return expand_outplace(tensors); } -static bool should_skip(const Tensor& t) { - return t.numel() == 0 && t.dim() == 1; -} - -Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { - check_cat_no_zero_dim(tensors); - dim = legacy_cat_wrap_dim(dim, tensors); - // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible - // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors - // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific - // size (i.e. other empty sizes are not skipped). - - bool allContiguous = true; - - // Inputs cannot alias the output tensor - for (const auto i : c10::irange(tensors.size())) { - auto lap = at::get_overlap_status(result, tensors[i]); - TORCH_CHECK(lap != at::MemOverlapStatus::PARTIAL && - lap != at::MemOverlapStatus::FULL, 0, - "unsupported operation: the input tensors cannot refer to any of the " - "output memory locations. 
Found overlap in input tensor ", i); - } - at::assert_no_internal_overlap(result); - - const Tensor* pnotSkippedTensor = [](const TensorList &tensors) -> const Tensor* { - for (auto const &tensor : tensors) { - if (should_skip(tensor)) { - continue; - } - // we've found a non-empty tensor - return &tensor; - } - return nullptr; - }(tensors); - - if (!pnotSkippedTensor) { - // FIXME: warn if this is the case -- see comment about skipped - // tensors at top of function. - return result; - } - const Tensor& notSkippedTensor = *pnotSkippedTensor; - - TORCH_CHECK(tensors.size() > 0, "torch.cat(): expected a non-empty list of Tensors"); - TORCH_CHECK(dim <= notSkippedTensor.dim(), "torch.cat(): dimension ", dim, "out of range"); - - // when the input tensors are of the same size and strides, - // reuse the same iterator for all input tensors - bool reuse_iterator = true; - bool no_type_promotion = true; - // Check the type of the result - no_type_promotion = result.dtype() == notSkippedTensor.dtype(); - - // compute size of the result in the cat dimension - int64_t cat_dim_size = 0; - auto first_tensor_mem_format = tensors[0].suggest_memory_format(); - for (const auto i : c10::irange(tensors.size())) { - auto const &tensor = tensors[i]; - if (should_skip(tensor)) { - // don't use fast path for empty tensor - allContiguous = false; - continue; - } - check_cat_shape_except_dim(notSkippedTensor, tensor, dim, i); - cat_dim_size += tensor.sizes()[dim]; - - if (!tensor.is_contiguous(first_tensor_mem_format)) { - allContiguous = false; - } - - if (tensor.sizes() != notSkippedTensor.sizes() || - tensor.strides() != notSkippedTensor.strides()) { - reuse_iterator = false; - } - if (tensor.dtype() != notSkippedTensor.dtype()) { - no_type_promotion = false; - } - } - // compute the size of the result - auto result_size = notSkippedTensor.sizes().vec(); - result_size[dim] = cat_dim_size; - - // skip resizing if size of result is same as expected - // raise a warning while resizing if output has one or more elements - // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 - // for understanding why at::native::resize_output is not called directly. 
- // if (at::native::resize_output_check(result, result_size)) { - // TODO: restore the above, see https://github.com/pytorch/pytorch/issues/64709 - - if (result.sizes() != result_size) { - result.resize_(result_size, first_tensor_mem_format); - } - +TORCH_IMPL_FUNC(cat_out_cpu) +(ITensorListRef tensors, + int64_t dim, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& result) { if (result.numel() == 0) { - return result; + return; } + auto materialized = tensors.materialize(); + // fast path for single thread when both inputs and result are contiguous and not empty - allContiguous = allContiguous && result.is_contiguous(first_tensor_mem_format); bool use_serial_kernel = result.numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1; - ScalarType dtype = notSkippedTensor.scalar_type(); + ScalarType dtype = materialized[valid].get().scalar_type(); bool serial_dtype = (dtype == ScalarType::Double || dtype == ScalarType::Float || dtype == ScalarType::BFloat16); - if (use_serial_kernel && allContiguous && no_type_promotion && serial_dtype) { - cat_serial_stub(kCPU, result, tensors, dim); - return result; + if (use_serial_kernel && all_contiguous && all_same_dtype && serial_dtype) { + cat_serial_stub(kCPU, result, materialized, dim); + return; } int64_t offset = 0; - if (reuse_iterator && - result.is_contiguous(first_tensor_mem_format) && - no_type_promotion) { - const auto& source_slice = notSkippedTensor; + if (all_same_sizes_and_stride && result.is_contiguous(memory_format) && + all_same_dtype) { + const Tensor& source_slice = materialized[valid]; auto slice_dim_size = source_slice.sizes()[dim]; auto result_slice = result.narrow(dim, 0, slice_dim_size); auto result_slice_data = result_slice.data_ptr(); auto result_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); auto iter = TensorIteratorConfig() - .set_check_mem_overlap(false) // Already checked above + .set_check_mem_overlap(false) .resize_outputs(false) .add_output(result_slice) .add_input(source_slice) .enforce_safe_casting_to_output(true) .build(); - for (auto const &tensor : tensors) { - if (should_skip(tensor)) { + for (const Tensor& tensor : materialized) { + if (cat_should_skip_tensor(tensor)) { continue; } auto source_data = static_cast(tensor.data_ptr()); @@ -307,8 +378,8 @@ Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { offset += slice_dim_size; } } else { - for (auto const &tensor: tensors) { - if (should_skip(tensor)) { + for (const Tensor& tensor: materialized) { + if (cat_should_skip_tensor(tensor)) { continue; } auto slice_dim_size = tensor.sizes()[dim]; @@ -327,24 +398,6 @@ Tensor & _cat_out_cpu(TensorList tensors, int64_t dim, Tensor& result) { offset += slice_dim_size; } } - - return result; -} - -Tensor _cat_cpu(TensorList tensors, int64_t dim) { - ScalarType high_type = result_type(tensors); - Tensor result = at::empty({0}, tensors[0].options().dtype(high_type)); - return native::_cat_out_cpu(tensors, dim, result); -} - -Tensor & cat_out(TensorList tensors, int64_t dim, Tensor & result) { - auto maybe_outnames = namedinference::compute_cat_outnames(tensors); - { - NoNamesGuard guard; - at::_cat_out(result, tensors, dim); - } - namedinference::propagate_names_if_nonempty(result, maybe_outnames); - return result; } Tensor& cat_out(TensorList tensors, Dimname dim, Tensor& result) { @@ -404,7 +457,7 @@ static void check_cat_sparse_dims(Tensor const &t, ", but tensor at position 
", pos, " has ", t.sparse_dim(), ", ", t.dense_dim(), "."); } -static Tensor cat_sparse(TensorList tensors, int64_t dim) { +static Tensor cat_sparse_impl(TensorList tensors, int64_t dim) { std::vector indices; std::vector values; int64_t wrapped = maybe_wrap_dim(dim, tensors[0].dim()); @@ -501,15 +554,15 @@ static Tensor cat_sparse(TensorList tensors, int64_t dim) { t._values().options().layout_opt(), t._values().options().device_opt(), t._values().options().pinned_memory_opt()); - vals_pieces.push_back(native::cat({z1, t._values(), z2}, values_dim)); + vals_pieces.push_back(at::cat({z1, t._values(), z2}, values_dim)); idxs_pieces.push_back(t._indices()); } auto sizes_copy = sizes.vec(); sizes_copy[wrapped] = total_size; // This can create an uncoalesced tensor return native::sparse_coo_tensor( - native::cat(idxs_pieces, 1), - native::cat(vals_pieces), + at::cat(idxs_pieces, 1), + at::cat(vals_pieces), sizes_copy, optTypeMetaToScalarType(tensors[0].options().dtype_opt()), tensors[0].options().layout_opt(), @@ -518,18 +571,9 @@ static Tensor cat_sparse(TensorList tensors, int64_t dim) { } } -Tensor cat(TensorList tensors, int64_t dim) { - if (tensors.size() > 0 && - tensors[0].is_sparse()) { - return cat_sparse(tensors, dim); - } - +Tensor cat_sparse(TensorList tensors, int64_t dim) { auto maybe_outnames = namedinference::compute_cat_outnames(tensors); - Tensor result; - { - NoNamesGuard guard; - result = at::_cat(tensors, dim); - } + auto result = cat_sparse_impl(tensors, at::legacy_cat_wrap_dim(dim, tensors)); namedinference::propagate_names_if_nonempty(result, maybe_outnames); return result; } @@ -798,6 +842,11 @@ Tensor diag_embed(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim return result; } +Tensor expand_symint(const Tensor& self, c10::SymIntArrayRef packed_size, bool implicit) { + auto size = expectIntArrayRef(packed_size); + return expand(self, size, implicit); +} + Tensor expand(const Tensor& self, IntArrayRef size, bool /*unused*/) { TORCH_CHECK(size.size() >= (size_t)self.dim(), "expand(", self.toString(), "{", self.sizes(), "}, size=", size, @@ -877,6 +926,19 @@ const Tensor &as_strided_(const Tensor& self, IntArrayRef size, IntArrayRef stri return self; } +Tensor narrow_copy_symint(const Tensor& self, int64_t dim, int64_t start, SymInt sym_length) { + return narrow_copy(self, dim, start, sym_length.expect_int()); +} + +Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length) { + return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous); +} + +Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){ + auto output = at::empty_like(self); + return narrow_copy_dense_cpu_out(self, dim, start, length, output); +} + Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) { int64_t allDim = self.dim(); int64_t end = start+length; @@ -914,6 +976,7 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_ Tensor& narrow_copy_dense_cpu_out( const Tensor& self, int64_t dim, int64_t start, int64_t length, Tensor& output ) { + TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); TORCH_CHECK(self.dtype() == output.dtype()); @@ -991,15 +1054,6 @@ Tensor& narrow_copy_dense_cpu_out( return output; } -Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t length){ - return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous); -} - -Tensor narrow_copy_dense_cpu(const 
Tensor& self, int64_t dim, int64_t start, int64_t length){ - auto output = at::empty_like(self); - return narrow_copy_dense_cpu_out(self, dim, start, length, output); -} - Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); auto cur_size = self.size(dim); @@ -1159,7 +1213,7 @@ Tensor reshape(const Tensor& self, IntArrayRef proposed_shape) { // // We need to do the checks here instead of in `native_functions.yaml` // to preserve backwards compatibility. - if (!self.is_xla() && !self.is_lazy()) { + if (!self.is_xla() && !self.is_lazy() && !self.is_ipu()) { return self._reshape_alias(shape, stride.value()); } else { return self.view(shape); @@ -1302,7 +1356,7 @@ Tensor select_backward(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, return grad_input; } -Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index) { +Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& index) { /* Algorithm: index - a 1-D tensor of indicies with shape (n,) @@ -1315,81 +1369,627 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index) new_values - shape is (new_nnz,) + dense_shape if dim < len(sparse_shape): - for i, idx in enumerate(index): - for j, jdx in enumerate(indices[dim]): - if idx == jdx: - icol = indices[:dim][j] + (i,) + indices[dim+1:][j] - new_indices.add_column(icol) - new_values.add_row(values[j]) + # Find new_indices[dim] of the output sparse tensor and + # indices at which to select values/indices. + # The CPP code uses (binary/in a count table) search to find matches and may + # swap the loop order for better algorithmic complexity. + new_dim_indices = [] + selected_dim_indices = [] + # This is a brute-force algorithms to convey the main idea. + # The CPP code below is more efficient but more complicated. 
+ for i, i_idx in enumerate(indices[dim]): + for j, j_idx in enumerate(index): + if i_idx == j_idx: + new_dim_indices.append(j) + selected_dim_indices.append(i) + new_indices = indices.index_select(1, selected_dim_indices) + new_values = values.index_select(0, selected_dim_indices) + new_indices[dim] = new_dim_indices else: new_indices = indices - new_values[k] = values[k].index_select(dim - len(sparse_shape), index) for k in range(nnz) + new_values = values.index_select(dim - sparse_dim + 1, index); */ - auto ndim = self.dim(); - if (ndim == 0) { - TORCH_CHECK_INDEX(false, "index_select() cannot be applied to a 0-dim tensor."); - } - if (!(index.dim() == 1 && index.dtype() == at::kLong)) { - TORCH_CHECK_INDEX(false, "index_select() argument index must be 1-D long-tensor."); - } + const auto ndim = self.dim(); + TORCH_CHECK_INDEX(ndim, "index_select() cannot be applied to a 0-dim tensor."); + TORCH_CHECK_INDEX( + index.dim() == 1 && index.dtype() == at::kLong && index.options().layout() == at::kStrided, + "index_select() argument index must be 1-D strided (non-sparse) long-tensor."); dim = maybe_wrap_dim(dim, ndim); - auto size = self.size(dim); - auto sparse_dim = self.sparse_dim(); - auto dense_dim = self.dense_dim(); - auto indices = self._indices(); - auto values = self._values(); - auto nnz = values.size(0); - auto new_sizes = self.sizes().vec(); - new_sizes[dim] = index.size(0); + const auto size = self.size(dim); + const auto sparse_dim = self.sparse_dim(); + const auto dense_dim = self.dense_dim(); + const auto indices = self._indices(); + const auto values = self._values(); + const auto nnz = values.size(0); + const auto index_len = index.size(0); + auto res_sizes = self.sizes().vec(); + res_sizes[dim] = index_len; + + // Equivalent to t.index_select(dim, idx), but vanilla index_select is not parallel, + // so we use gather instead. + // We use this method to select relevant indices/values + // from the intersection between indices[dim] and the index. 
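  // A minimal worked sketch of the lambda below: for t of shape (s0, s1, s2), dim == 1
  // and idx of length k, idx is reshaped to (1, k, 1) and expanded to (s0, k, s2), so
  //   out[i][j][l] == t[i][idx[j]][l]
  // which matches t.index_select(1, idx) while going through the gather kernel.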
+ const auto index_select = [](const Tensor& t, int64_t dim, const Tensor& idx) -> Tensor { + const auto idx_len = idx.numel(); + auto out_shape = t.sizes().vec(); + out_shape[dim] = idx_len; + auto idx_shape = std::vector(t.dim(), 1); + idx_shape[dim] = idx_len; + return t.gather(dim, idx.view(idx_shape).expand(out_shape)); + }; + // If indexing into sparse dimensions if (dim < sparse_dim) { + // short-circuit if index is empty + if (!index_len) { + auto res_indices = index_select(indices, 1, index); + res_indices[dim] = index; + const auto res_values = index_select(values, 0, index); - auto cpu_dim_indices = indices[dim].to(c10::kCPU).contiguous(); - int64_t* cpu_dim_indices_ptr = cpu_dim_indices.data_ptr(); - auto cpu_index = index.to(c10::kCPU).contiguous(); - int64_t* cpu_index_ptr = cpu_index.data_ptr(); - std::vector zindices; - std::vector iindices; - int64_t new_nnz = 0; - for (const auto i : c10::irange(new_sizes[dim])) { - int64_t idx = cpu_index_ptr[i]; - if (idx < -size || idx >= size) { - TORCH_CHECK_INDEX(false, "index_select(): index contains ", idx, " that is out of range for tensor of size ", - self.sizes(), " at dimension ", dim); + return _sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, res_indices, res_values, self.options()); + } + + const auto nneg_index = [&index, index_len, &self, size, dim]() -> Tensor { + const auto index_contiguous = index.contiguous(); + auto nneg_index = at::empty_like(index_contiguous); + // nneg_index = (index < 0) * (index + size) + (index >= 0) * index + auto* ptr_index = index_contiguous.data_ptr(); + auto* ptr_nneg_index = nneg_index.data_ptr(); + at::parallel_for(0, index_len, at::internal::GRAIN_SIZE, [&](int64_t start, int64_t end) { + const auto* src = ptr_index + start; + auto* dst = ptr_nneg_index + start; + for (C10_UNUSED const auto _ : c10::irange(start, end)) { + auto idx = *src++; + if (idx < -size || idx >= size) { + // Mark self and dim as used if code is compiled with STRIP_ERROR_MESSAGES + (void)dim; + (void)self; + TORCH_CHECK_INDEX(false, + "index_select(): index contains ", idx, " that is out of range for tensor of size ", + self.sizes(), " at dimension ", dim + ); + } + if (idx < 0) { + idx += size; + } + *dst++ = idx; + } + }); + + return nneg_index; + }(); + + const auto dim_indices = indices[dim].contiguous(); + + // If nnz is smaller than size, then either indices[dim] or index gets sorted, + // then this is followed by a binary search to find interesections. + const auto get_selected_indices_small_nnz_large_size = [&]() -> std::tuple { + const auto grain_size = at::internal::GRAIN_SIZE; + const auto n_threads_nnz = std::max( + 1, std::min((nnz + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto n_threads_index = std::max( + 1, std::min((index_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto search_in_dim_indices + // if either dim_indices or index requires sorting, we compare + // the cost of sort + binary search, which is comparing + // (len(dim_indices) + len(index)) * log(len(index)) to + // (len(dim_indices) + len(index)) * log(len(dim_indices)). + // That simplifies to comparing len(dim_indices) to len(index). + // Additionally, we take into consideration potential parallel + // speedup. 
+ = (nnz / n_threads_nnz <= index_len / n_threads_index) + // if self is coalesced and dim is 0, then we compare + // index_len * log(len(dim_indices)), which is binary search into dim_indices, + // to (len(index_len) + len(dim_indices)) * log(index_len). + // Additionally, we take into consideration potential parallel + // speedup. + || (self.is_coalesced() && dim == 0 + && (index_len * std::log2(nnz) / n_threads_index + <= (nnz / n_threads_nnz + index_len) * std::log2(index_len))) + ? true : false; + + // src is a source of indices to binary search in sorted + Tensor sorted, sorted_idx, src; + std::tie(sorted, sorted_idx, src) = [ + &dim_indices, &nneg_index, &self, + search_in_dim_indices, dim, nnz + ](void) -> std::tuple { + // sort dim_indices to binary search into it + if (search_in_dim_indices) { + // dim_indices is already sorted if self is coalesced and dim == 0 + if (self.is_coalesced() && dim == 0) { + return std::make_tuple(dim_indices, at::arange(nnz, dim_indices.options()), nneg_index); + } + else { + Tensor sorted_dim_indices, sorted_dim_indices_idx; + std::tie(sorted_dim_indices, sorted_dim_indices_idx) = dim_indices.sort(); + return std::make_tuple(sorted_dim_indices, sorted_dim_indices_idx, nneg_index); + } + } + // sort nneg_index to binary search into it + else { + Tensor sorted_nneg_index, sorted_nneg_index_idx; + std::tie(sorted_nneg_index, sorted_nneg_index_idx) = nneg_index.sort(); + return std::make_tuple(sorted_nneg_index, sorted_nneg_index_idx, dim_indices); + } + }(); + + const auto src_grain_size = at::internal::GRAIN_SIZE; + const auto src_len = src.numel(); + const auto n_threads_src = std::max( + // 1 <= n_threads_src <= std::min(ceil(src.numel() / src_grain_size), max_threads) + 1, std::min((src_len + src_grain_size - 1) / src_grain_size, at::get_num_threads()) + ); + const auto chunk_size_src = (src_len + n_threads_src - 1) / n_threads_src; + + const std::vector src_n_threads_shape = { + n_threads_src, (src_len + n_threads_src - 1) / n_threads_src + }; + + // src_int_idx and sorted_int_idx store "i" and "j" indices indicating + // intersections such that src_int_idx[i] == sorted_int_idx[j]. + // These intersections are found with binary search and in parallel. + auto src_int_idx = at::empty(src_n_threads_shape, src.options()); + auto sorted_int_idx = at::empty_like(src_int_idx); + // For each element "i" from src, int_counts define how many + // elements there are in sorted, i.e. "j" indices, corresponding + // to "i", i.e.: + // |{j : src_int_idx[i] == sorted_int_idx[j]}| for each i in src_int_idx. 
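  // A worked example of the binary search in the block below: with sorted == {1, 3, 3, 7}
  // and a src value of 3, lower_bound points at position 1 and upper_bound at position 3,
  // so count == 2 and j == 1; a src value of 5 finds *lower_bound == 7 != 5 and the slot
  // is skipped.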
+ auto int_counts = at::zeros_like(src_int_idx); + + // fill in src_int_idx, sorted_int_idx, int_counts + { + const auto sorted_len = sorted.numel(); + const auto* ptr_sorted = sorted.data_ptr(); + const auto* ptr_sorted_start = ptr_sorted; + const auto* ptr_sorted_end = ptr_sorted + sorted_len; + + at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size_src; + const auto end = std::min(start + chunk_size_src, src_len); + auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); + auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); + auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); + const auto* ptr_src = src.data_ptr() + start; + + for (const auto i : c10::irange(start, end)) { + const auto src_val = *ptr_src++; + const auto src_val_lb = std::lower_bound(ptr_sorted_start, ptr_sorted_end, src_val); + // We cannot just use *src_val_lb != src_val because when + // src_val_lb == ptr_sorted_end, dereferencing past-the-end value + // is not well-defined. + if (src_val_lb == ptr_sorted_end || *src_val_lb != src_val) { + ++ptr_tid_src_int_idx; + ++ptr_tid_sorted_int_idx; + ++ptr_tid_int_counts; + continue; + } + const auto src_val_ub = std::upper_bound(ptr_sorted_start, ptr_sorted_end, src_val); + + const int64_t count = src_val_ub - src_val_lb; + const int64_t j = src_val_lb - ptr_sorted_start; + + *ptr_tid_src_int_idx++ = i; + *ptr_tid_sorted_int_idx++ = j; + *ptr_tid_int_counts++ = count; + } + }); } - if (idx < 0) { - idx += size; + + const auto compressed_int_counts = int_counts.sum(-1); + const auto res_len = compressed_int_counts.sum().item(); + + // Short-circuit if empty intersection + if (!res_len) { + auto empty_idx = at::empty({0}, src.options()); + return std::make_tuple(empty_idx, empty_idx); + } + + // Now that we know "i", "j" and the counts, we "unflatten" + // them into two arrays of intersection indices such that + // selected_src = repeat_interleave(src_int_idx, int_counts), + // and selected_sorted is obtained as follows: + // offsets = int_counts.cumsum(0).sub_(int_counts) + // for ii, (j, c) in enumerate(zip(sorted_int_idx, int_counts)): + // out_slice = slice(offsets[ii], offsets[ii] + c) + // src_slice = slice(j, j + c) + // selected_sorted[out_slice] = sorted_int_idx[src_slice] + auto selected_sorted = at::empty({res_len}, sorted.options()); + auto selected_src = at::empty({res_len}, src.options()); + + // fill in selected_sorted, selected_src + { + auto* ptr_selected_sorted = selected_sorted.data_ptr(); + auto* ptr_selected_src = selected_src.data_ptr(); + + const auto thread_offsets = compressed_int_counts.cumsum(0).sub_(compressed_int_counts); + const auto* ptr_sorted_idx = sorted_idx.data_ptr(); + at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size_src; + const auto end = std::min(start + chunk_size_src, src_len); + const auto tid_offset = thread_offsets.data_ptr()[tid]; + const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); + const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); + const auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); + auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset; + auto* ptr_tid_selected_src = ptr_selected_src + tid_offset; + + for (C10_UNUSED const auto _ : c10::irange(start, end)) { + const auto count = *ptr_tid_int_counts++; + const auto i = *ptr_tid_src_int_idx++; + const auto j = 
*ptr_tid_sorted_int_idx++; + if (!count) continue; + + std::fill_n(ptr_tid_selected_src, count, i); + std::copy_n(ptr_sorted_idx + j, count, ptr_tid_selected_sorted); + + ptr_tid_selected_sorted += count; + ptr_tid_selected_src += count; + } + }); } - for (const auto j : c10::irange(nnz)) { - int64_t jdx = cpu_dim_indices_ptr[j]; - if (idx == jdx) { - new_nnz++; - iindices.push_back(i); - zindices.push_back(j); + + return search_in_dim_indices + ? std::make_tuple(selected_sorted, selected_src) + : std::make_tuple(selected_src, selected_sorted); + }; + + // Converts a 1d sorted idx to a compressed 1d compressed idx, + // aka crow in the CSR format. Useful to get a count table in + // a parallelized and no-sync manner. + // TODO: this function is equivalent to _convert_indices_from_coo_to_csr. + // The mentioned function is not public yet. + const auto sorted_idx_to_cidx = []( + const Tensor& idx, + int64_t len, + bool run_in_parallel = true) -> Tensor { + auto cidx = at::empty({len + 1}, idx.options()); + + const auto* ptr_idx = idx.data_ptr(); + auto* ptr_cidx = cidx.data_ptr(); + + const auto idx_len = idx.numel(); + + std::fill_n(ptr_cidx, ptr_idx[0] + 1, 0); + std::fill_n(ptr_cidx + ptr_idx[idx_len - 1] + 1, len - ptr_idx[idx_len - 1], idx_len); + + const auto grain_size = run_in_parallel ? at::internal::GRAIN_SIZE : idx_len; + at::parallel_for(0, idx_len, grain_size, [&](int64_t start, int64_t end) { + auto* ptr_curr_cidx = ptr_cidx + ptr_idx[start] + 1; + for (int64_t i = start; i < std::min(end, idx_len - 1); ++i) { + const auto diff = ptr_idx[i + 1] - ptr_idx[i]; + std::fill_n(ptr_curr_cidx, diff, i + 1); + ptr_curr_cidx += diff; + } + }); + + return cidx; + }; + + // If nnz is (much) larger than size, then both indices[dim] and index get sorted + // with a count sort (faster, and no huge nnz-sized chunk memory allocations). + // The element-wise product between the count tables gives us all the intersections. + const auto get_selected_indices_large_nnz_small_size = [&]() -> std::tuple { + const auto get_counts = [&sorted_idx_to_cidx]( + // Writes into counts (must be preallocated and zero) + // and allows to use external buffers. 
+ Tensor& counts, + const Tensor& t, + int64_t bins, + bool is_sorted = false, + bool run_in_parallel = true) -> void { + if (is_sorted) { + const auto cidx = sorted_idx_to_cidx(t, bins, run_in_parallel); + at::sub_out(counts, cidx.slice(0, 1, bins + 1), cidx.slice(0, 0, bins)); + } + else { + auto* ptr_counts = counts.data_ptr(); + const auto* ptr_vals = t.data_ptr(); + for (C10_UNUSED const auto _ : c10::irange(t.numel())) { + ++ptr_counts[*ptr_vals++]; + } } + }; + + const auto counts_per_thread = [&get_counts, size]( + const Tensor& idx, + bool is_sorted = false, + int64_t grain_size = at::internal::GRAIN_SIZE + ) -> Tensor { + const auto idx_len = idx.numel(); + // 1 <= n_threads <= min(ceil(len / grain_size), max_threads) + const auto n_threads = std::max( + 1, std::min((idx_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto chunk_size = (idx_len + n_threads - 1) / n_threads; + const auto run_in_parallel = (n_threads == 1); + + auto counts_per_thread = at::zeros({n_threads, size}, idx.options()); + at::parallel_for(0, n_threads, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size; + const auto end = std::min(start + chunk_size, idx_len); + const auto tid_idx = idx.slice(0, start, end); + auto tid_counts = counts_per_thread.select(0, tid); + get_counts(tid_counts, tid_idx, /*bins=*/size, + /*is_sorted=*/is_sorted, /*run_in_parallel=*/run_in_parallel); + }); + + return counts_per_thread; + }; + + auto dim_indices_counts_per_thread = counts_per_thread( + dim_indices, + /*is_sorted=*/self.is_coalesced() && dim == 0 + /*grain_size = at::internal::GRAIN_SIZE*/ + ); + auto dim_indices_offset_counts_per_thread = dim_indices_counts_per_thread.cumsum(0); + + auto index_counts_per_thread = counts_per_thread( + nneg_index, + /*is_sorted=*/false + /*grain_size = at::internal::GRAIN_SIZE*/ + ); + auto index_offset_counts_per_thread = index_counts_per_thread.cumsum(0); + + const auto index_counts = index_offset_counts_per_thread.select(0, -1); + const auto dim_indices_counts = dim_indices_offset_counts_per_thread.select(0, -1); + const auto intersection_counts = index_counts.mul(dim_indices_counts); + const auto res_len = intersection_counts.sum().item(); + // Short-circuit if empty intersection + if (!res_len) { + auto empty_idx = at::empty({0}, index.options()); + return std::make_tuple(empty_idx, empty_idx); } - } - auto zIndices = at::from_blob(zindices.data(), {new_nnz}, at::kLong).to(indices.device()); - auto new_indices = indices.index_select(1, zIndices); - new_indices[dim] = at::from_blob(iindices.data(), {new_nnz}, at::kLong).to(indices.device()); - auto new_values = values.index_select(0, zIndices); - return _sparse_coo_tensor_with_dims_and_tensors( - sparse_dim, dense_dim, new_sizes, new_indices, new_values, self.options()); + const auto intersection_offsets = intersection_counts.cumsum(0); + + const auto search_in_dim_indices = [&]() -> bool { + const auto grain_size = at::internal::GRAIN_SIZE; + const auto n_threads_index = std::max( + 1, std::min((index_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto n_threads_dim_indices = std::max( + 1, std::min((nnz + grain_size - 1) / grain_size, at::get_num_threads()) + ); + + const auto index_max_copy_work_per_thread = + index_counts_per_thread.mul(dim_indices_counts).sum(-1).max().item(); + const auto dim_indices_max_copy_work_per_thread + = dim_indices_counts_per_thread.mul(index_counts).sum(-1).max().item(); + + const auto index_max_work_per_thread = 
index_max_copy_work_per_thread * index_len / n_threads_index; + const auto dim_indices_max_work_per_thread = dim_indices_max_copy_work_per_thread * nnz / n_threads_dim_indices; + return index_max_work_per_thread <= dim_indices_max_work_per_thread + ? true + : false; + }(); + + Tensor idx, idx_counts_per_thread, idx_offset_counts_per_thread; + Tensor src, src_counts_per_thread, src_offset_counts_per_thread; + std::tie( + idx, idx_counts_per_thread, idx_offset_counts_per_thread, + src, src_counts_per_thread, src_offset_counts_per_thread + ) = [&]() { + return search_in_dim_indices + ? std::make_tuple( + nneg_index, index_counts_per_thread, index_offset_counts_per_thread, + dim_indices, dim_indices_counts_per_thread, dim_indices_offset_counts_per_thread + ) + : std::make_tuple( + dim_indices, dim_indices_counts_per_thread, dim_indices_counts_per_thread.cumsum(0), + nneg_index, index_counts_per_thread, index_counts_per_thread.cumsum(0) + ); + }(); + + const auto idx_counts = idx_offset_counts_per_thread.select(0, -1); + const auto src_counts = src_offset_counts_per_thread.select(0, -1); + + Tensor src_idx, src_idx_offsets; + std::tie(src_idx, src_idx_offsets) = [&]( + int64_t grain_size = at::internal::GRAIN_SIZE + ) -> std::tuple { + const auto src_intersection_counts = src_counts.mul(idx_counts > 0); + const auto src_intersection_offsets = src_intersection_counts.cumsum(0); + const auto src_idx_len = src_intersection_offsets.data_ptr()[size - 1]; + auto src_idx = at::empty({src_idx_len}, src.options()); + + const auto* ptr_src = src.data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.data_ptr(); + const auto* ptr_src_intersection_counts = src_intersection_counts.data_ptr(); + const auto* ptr_src_intersection_offsets = src_intersection_offsets.data_ptr(); + auto* ptr_src_idx = src_idx.data_ptr(); + + const auto src_len = src.numel(); + const auto n_threads_src = std::max( + 1, std::min((src_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto chunk_size = (src_len + n_threads_src - 1) / n_threads_src; + at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size; + const auto end = std::min(start + chunk_size, src_len); + auto* ptr_src_tid = ptr_src + start; + const auto* ptr_src_counts_per_thread + = src_counts_per_thread.select(0, tid).data_ptr(); + const auto* ptr_src_offset_counts_per_thread + = src_offset_counts_per_thread.select(0, tid).data_ptr(); + auto tid_counts = at::zeros({size}, src.options()); + auto* ptr_tid_counts = tid_counts.data_ptr(); + + for (const auto i : c10::irange(start, end)) { + const auto idx_val = *ptr_src_tid++; + // skip idx value if not in the intersection + if (!ptr_intersection_counts[idx_val]) continue; + const auto idx_val_offset + = ptr_src_intersection_offsets[idx_val] + - ptr_src_intersection_counts[idx_val]; + const auto idx_val_tid_offset + = ptr_src_offset_counts_per_thread[idx_val] + - ptr_src_counts_per_thread[idx_val]; + auto& idx_val_local_tid_count = ptr_tid_counts[idx_val]; + ptr_src_idx[idx_val_offset + idx_val_tid_offset + idx_val_local_tid_count] = i; + ++idx_val_local_tid_count; + } + }); + + const auto src_idx_offsets = src_intersection_offsets.sub_(src_intersection_counts); + + return std::make_tuple(src_idx, src_idx_offsets); + }(); + + Tensor idx_selected, src_selected; + std::tie(idx_selected, src_selected) = [&]( + int64_t grain_size = at::internal::GRAIN_SIZE + ) -> std::tuple { + const auto thread_offset = [&]() { + // we do not need 
idx_counts_per_thread anymore, + // so it is safe to do in-place intersection. + auto counts_per_thread = idx_counts_per_thread.mul_(src_counts).sum(-1); + return counts_per_thread.cumsum(0).sub_(counts_per_thread); + }(); + const auto* ptr_thread_offset = thread_offset.data_ptr(); + + auto idx_selected = at::empty({res_len}, idx.options()); + auto src_selected = at::empty({res_len}, src.options()); + + const auto* ptr_idx = idx.data_ptr(); + const auto* ptr_src_counts = src_counts.data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.data_ptr(); + const auto* ptr_src_idx = src_idx.data_ptr(); + const auto* ptr_src_idx_offsets = src_idx_offsets.data_ptr(); + auto* ptr_idx_selected = idx_selected.data_ptr(); + auto* ptr_src_selected = src_selected.data_ptr(); + + const auto idx_len = idx.numel(); + const auto n_threads_idx = std::max( + 1, std::min((idx_len + grain_size - 1) / grain_size, at::get_num_threads()) + ); + const auto chunk_size = (idx_len + n_threads_idx - 1) / n_threads_idx; + at::parallel_for(0, n_threads_idx, 1, [&](int64_t tid, C10_UNUSED int64_t _) { + const auto start = tid * chunk_size; + const auto end = std::min(start + chunk_size, idx_len); + const auto tid_offset = ptr_thread_offset[tid]; + const auto* ptr_idx_tid = ptr_idx + start; + auto* ptr_idx_selected_tid = ptr_idx_selected + tid_offset; + auto* ptr_src_selected_tid = ptr_src_selected + tid_offset; + + for (const auto i : c10::irange(start, end)) { + const auto idx_val = *ptr_idx_tid++; + // skip if idx_val is not in the intersection + if (!ptr_intersection_counts[idx_val]) continue; + const auto count = ptr_src_counts[idx_val]; + const auto j = ptr_src_idx_offsets[idx_val]; + std::fill_n(ptr_idx_selected_tid, count, i); + std::copy_n(ptr_src_idx + j, count, ptr_src_selected_tid); + ptr_idx_selected_tid += count; + ptr_src_selected_tid += count; + } + }); + + return std::make_tuple(idx_selected, src_selected); + }(); + + return search_in_dim_indices + ? std::make_tuple(src_selected, idx_selected) + : std::make_tuple(idx_selected, src_selected); + }; + + const auto make_output = [&]( + const Tensor& selected_dim_indices, + const Tensor& res_dim_indices) -> Tensor { + auto res_indices = index_select(indices, 1, selected_dim_indices); + res_indices[dim] = res_dim_indices; + const auto res_values = index_select(values, 0, selected_dim_indices); - } else { + return _sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, res_indices, res_values, self.options()); + }; + + // Brute-force solution for small values of nnz and index_len + const auto get_result_small_nnz_small_index = [&]() + -> Tensor { + const auto dim_indices_in_inner_loop = nnz >= index_len; + Tensor outer, inner; + std::tie(outer, inner) = [&]() -> std::tuple { + if (dim_indices_in_inner_loop) { + return std::make_tuple(nneg_index, dim_indices); + } + else { + return std::make_tuple(dim_indices, nneg_index); + } + }(); + + const auto* ptr_outer = outer.data_ptr(); + const auto* ptr_inner = inner.data_ptr(); + // NOTE: if very critical, replace std::vector with + // a data structure that operates on stack up to some limit. 
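  // A small trace of the nested scan below (values chosen only for illustration): with
  // outer == {5, 7, 5} and inner == {7, 5}, the matches are (i=0, j=1), (i=1, j=0) and
  // (i=2, j=1), so res_len == 3, outer_selected_idx == {0, 1, 2} and
  // inner_selected_idx == {1, 0, 1}.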
+ auto outer_selected_idx = std::vector(); + auto inner_selected_idx = std::vector(); + int64_t res_len = 0; + for (const auto i : c10::irange(outer.numel())) { + for (const auto j : c10::irange(inner.numel())) { + if (ptr_outer[i] == ptr_inner[j]) { + ++res_len; + outer_selected_idx.push_back(i); + inner_selected_idx.push_back(j); + } + } + } - auto vsize = values.sizes().vec(); - vsize[dim + 1 - sparse_dim] = index.size(0); - auto new_values = at::empty(vsize, values.options()); - for (const auto k : c10::irange(nnz)) { - new_values[k] = values[k].index_select(dim - sparse_dim, index); + const auto outer_selected_idx_tensor = at::from_blob( + outer_selected_idx.data(), {res_len}, at::kLong + ); + const auto inner_selected_idx_tensor = at::from_blob( + inner_selected_idx.data(), {res_len}, at::kLong + ); + + return dim_indices_in_inner_loop + ? make_output(inner_selected_idx_tensor, outer_selected_idx_tensor) + : make_output(outer_selected_idx_tensor, inner_selected_idx_tensor); + }; + + constexpr int64_t BRUTE_FORCE_SIZE_LIMIT = 2 << 14; // 16384 + // NOTE: such a condition to avoid overflows in (nnz * index_len) + if (nnz <= BRUTE_FORCE_SIZE_LIMIT && index_len <= BRUTE_FORCE_SIZE_LIMIT + && (nnz * index_len) <= BRUTE_FORCE_SIZE_LIMIT) { + return get_result_small_nnz_small_index(); } - return _sparse_coo_tensor_with_dims_and_tensors( - sparse_dim, dense_dim, new_sizes, indices, new_values, self.options()); + else { + Tensor selected_dim_indices; + Tensor res_dim_indices; + + // A more precise decision could be of the form: + // `nnz < C(nnz, size) * size`, but it requires heavy benchmarking. + // We choose `nnz < size`, which measures theoretical complexity + // and does not rely on runtime performance. + // TODO: perform this analysis and find better C(nnz, size). + if (nnz <= size) { + std::tie(selected_dim_indices, res_dim_indices) = get_selected_indices_small_nnz_large_size(); + } + else { + std::tie(selected_dim_indices, res_dim_indices) = get_selected_indices_large_nnz_small_size(); + } + return make_output(selected_dim_indices, res_dim_indices); + } + } + // If indexing into dense dimensions + else { + // It is sufficient to just perform `index_select` on values + // if `dim` refers to dense dimensions. + const auto res_values = index_select(values, dim - sparse_dim + 1, index); + + return _sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, indices, res_values, self.options()); } } +Tensor index_select_sparse_cuda(const Tensor& self, int64_t dim, const Tensor& index) { + auto res = index_select_sparse_cpu(self.to(at::kCPU), dim, index.to(at::kCPU)); + return res.to(self.device()); +} + Tensor slice( const Tensor& self, int64_t dim, @@ -1453,21 +2053,9 @@ Tensor slice_backward(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, } std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { - TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); - TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); - int64_t dim_size = self.size(dim); - TORCH_CHECK(split_size > 0 || dim_size == 0, - "split_size can only be 0 if dimension size is 0, " - "but got dimension size of ", dim_size); - // if split_size is 0 and dimension size is 0, there is 1 split. - int64_t num_splits = 1; - if (split_size != 0) { - // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size - // (returns a single split). We might want to error here, but keep it for BC. 
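  // A worked example of the arithmetic this refactor preserves: for dim_size == 10 and
  // split_size == 3, num_splits == max(ceil(10 / 3), 1) == 4 and
  // last_split_size == 3 - (3 * 4 - 10) == 1, i.e. chunks of sizes {3, 3, 3, 1}.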
- num_splits = std::max((dim_size + split_size - 1) / split_size, 1); - } + const auto num_splits = get_num_splits(self, split_size, dim); std::vector splits(num_splits); - int64_t last_split_size = split_size - (split_size * num_splits - dim_size); + int64_t last_split_size = split_size - (split_size * num_splits - self.size(dim)); for (const auto i : c10::irange(num_splits)) { auto length = i < num_splits - 1 ? split_size : last_split_size; @@ -1476,6 +2064,10 @@ std::vector split(const Tensor& self, int64_t split_size, int64_t dim) { return splits; } +std::vector split(const Tensor& self, IntArrayRef sizes, int64_t dim) { + return at::split_with_sizes(self, sizes, dim); +} + std::vector unsafe_split(const Tensor& self, int64_t split_size, int64_t dim) { auto result = at::native::split(self, split_size, dim); for (auto& t : result) { @@ -2111,7 +2703,7 @@ Tensor unsqueeze_sparse(Tensor const &self, int64_t dim) { auto sizes = self.sizes().vec(); sizes.insert(sizes.begin() + dim, 1); if (dim <= sparse_dim) { - auto new_indices = native::cat( + auto new_indices = at::cat( {indices.narrow(0, 0, dim), native::zeros( {1, indices.size(1)}, @@ -2218,7 +2810,7 @@ Tensor flatten(const Tensor& self, DimnameList dims, Dimname out_dim) { } Tensor ravel(const Tensor& self) { - return self.reshape(-1); + return self.contiguous().view(-1); } static inline void handle_unflatten_exception(const std::runtime_error &e, @@ -2545,7 +3137,7 @@ Tensor diag(const Tensor& self, int64_t dimension) { } Tensor& diag_cpu_out(const Tensor& self, int64_t dimension, Tensor &result) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, self.scalar_type(), "diag", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kBool, self.scalar_type(), "diag", [&] { apply_diag(result, self, dimension); }); return result; @@ -2736,5 +3328,265 @@ at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64 return output; } +// The default implementation of lift is a no-op. +// If TLS is set appropriately (for wrapper-tensor keys like Functionalize or functorch transforms), +// then we'll dispatch to one of their implementations, which will properly lift the tensor into a wrapper. 
+at::Tensor lift(const at::Tensor& self) { + return self; +} + +at::Tensor& _fw_primal_copy_out(const at::Tensor & self, int64_t level, at::Tensor & out) { + auto tmp = self._fw_primal(level); + out.copy_(tmp); + return out; +} + + +at::Tensor& _make_dual_copy_out(const at::Tensor & primal, const at::Tensor & tangent, int64_t level, at::Tensor & out) { + auto tmp = at::_make_dual(primal, tangent, level); + out.copy_(tmp); + return out; +} + + +at::Tensor& view_as_real_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = at::view_as_real(self); + out.copy_(tmp); + return out; +} + + +at::Tensor& view_as_complex_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = at::view_as_complex(self); + out.copy_(tmp); + return out; +} + + +at::Tensor& _conj_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._conj(); + out.copy_(tmp); + return out; +} + + +at::Tensor& _neg_view_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._neg_view(); + out.copy_(tmp); + return out; +} + + +at::Tensor& as_strided_copy_out(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset, at::Tensor & out) { + auto tmp = self.as_strided(size, stride, storage_offset); + out.copy_(tmp); + return out; +} + + +at::Tensor& _sparse_broadcast_to_copy_out(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) { + auto tmp = at::_sparse_broadcast_to(self, size); + out.copy_(tmp); + return out; +} + + +at::Tensor& diagonal_copy_out(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) { + auto tmp = self.diagonal(offset, dim1, dim2); + out.copy_(tmp); + return out; +} + + +at::Tensor& expand_copy_SymInt_out(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out) { + auto tmp = self.expand(size, implicit); + out.copy_(tmp); + return out; +} + + +at::Tensor& expand_copy_out(const at::Tensor & self, at::IntArrayRef size, bool implicit, at::Tensor & out) { + auto tmp = self.expand(size, implicit); + out.copy_(tmp); + return out; +} + + +at::Tensor& narrow_copy_out(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) { + auto tmp = self.narrow(dim, start, length); + out.copy_(tmp); + return out; +} + + +at::Tensor& permute_copy_out(const at::Tensor & self, at::IntArrayRef dims, at::Tensor & out) { + auto tmp = self.permute(dims); + out.copy_(tmp); + return out; +} + + +at::Tensor& _reshape_alias_copy_out(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) { + auto tmp = self._reshape_alias(size, stride); + out.copy_(tmp); + return out; +} + + +at::Tensor& select_copy_int_out(const at::Tensor & self, int64_t dim, int64_t index, at::Tensor & out) { + auto tmp = self.select(dim, index); + out.copy_(tmp); + return out; +} + + +at::Tensor& detach_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.detach(); + out.copy_(tmp); + return out; +} + + +at::Tensor& slice_copy_Tensor_out(const at::Tensor & self, int64_t dim, c10::optional start, c10::optional end, int64_t step, at::Tensor & out) { + auto tmp = self.slice(dim, start, end, step); + out.copy_(tmp); + return out; +} + + +void split_copy_Tensor_out(const at::Tensor & self, int64_t split_size, int64_t dim, at::TensorList out) { + auto tmp = self.split(split_size, dim); + + TORCH_CHECK(out.size() == tmp.size(), "split_copy_Tensor_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); + for 
(const auto i : c10::irange(out.size())) { + out[i].copy_(tmp[i]); + } +} + + +void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) { + auto tmp = self.split_with_sizes(split_sizes, dim); + + TORCH_CHECK(out.size() == tmp.size(), "split_with_sizes_copy_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); + for (const auto i : c10::irange(out.size())) { + out[i].copy_(tmp[i]); + } +} + + +at::Tensor& squeeze_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.squeeze(); + out.copy_(tmp); + return out; +} + + +at::Tensor& squeeze_copy_dim_out(const at::Tensor & self, int64_t dim, at::Tensor & out) { + auto tmp = self.squeeze(dim); + out.copy_(tmp); + return out; +} + + +at::Tensor& t_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.t(); + out.copy_(tmp); + return out; +} + + +at::Tensor& transpose_copy_int_out(const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out) { + auto tmp = self.transpose(dim0, dim1); + out.copy_(tmp); + return out; +} + + +at::Tensor& unsqueeze_copy_out(const at::Tensor & self, int64_t dim, at::Tensor & out) { + auto tmp = self.unsqueeze(dim); + out.copy_(tmp); + return out; +} + + +at::Tensor& _indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._indices(); + out.copy_(tmp); + return out; +} + + +at::Tensor& _values_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self._values(); + out.copy_(tmp); + return out; +} + + +at::Tensor& indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.indices(); + out.copy_(tmp); + return out; +} + + +at::Tensor& values_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.values(); + out.copy_(tmp); + return out; +} + + +at::Tensor& crow_indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.crow_indices(); + out.copy_(tmp); + return out; +} + + +at::Tensor& col_indices_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.col_indices(); + out.copy_(tmp); + return out; +} + + +void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList out) { + auto tmp = self.unbind(dim); + + TORCH_CHECK(out.size() == tmp.size(), "unbind_copy_int_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); + for (const auto i : c10::irange(out.size())) { + out[i].copy_(tmp[i]); + } +} + + +at::Tensor& view_copy_out(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) { + auto tmp = self.view(size); + out.copy_(tmp); + return out; +} + + +at::Tensor& view_copy_dtype_out(const at::Tensor & self, at::ScalarType dtype, at::Tensor & out) { + auto tmp = self.view(dtype); + out.copy_(tmp); + return out; +} + + +at::Tensor& unfold_copy_out(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out) { + auto tmp = self.unfold(dimension, size, step); + out.copy_(tmp); + return out; +} + + +at::Tensor& alias_copy_out(const at::Tensor & self, at::Tensor & out) { + auto tmp = self.alias(); + out.copy_(tmp); + return out; +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/TensorShape.h b/aten/src/ATen/native/TensorShape.h index 19245faff368..bb296b5ae5bc 100644 --- a/aten/src/ATen/native/TensorShape.h +++ b/aten/src/ATen/native/TensorShape.h @@ -1,8 +1,12 @@ -#include +#pragma once +#include #include namespace at { namespace native { +inline bool cat_should_skip_tensor(const 
Tensor& t) { + return t.numel() == 0 && t.dim() == 1; +} // Check to see if the shape of tensors is compatible // for being concatenated along a given dimension. @@ -30,4 +34,28 @@ inline void check_cat_no_zero_dim(at::ArrayRef tensors) { } } +inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) { + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + int64_t dim_size = self.size(dim); + TORCH_CHECK(split_size > 0 || dim_size == 0, + "split_size can only be 0 if dimension size is 0, " + "but got dimension size of ", dim_size); + // if split_size is 0 and dimension size is 0, there is 1 split. + int64_t num_splits = 1; + if (split_size != 0) { + // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size + // (returns a single split). We might want to error here, but keep it for BC. + num_splits = std::max((dim_size + split_size - 1) / split_size, 1); + } + return num_splits; +} + +/// +/// For more information, see +/// https://pytorch.org/docs/master/generated/torch.Tensor.unfold.html#torch.Tensor.unfold +/// + +Tensor unfold(const Tensor& self, int64_t dimension, int64_t size, int64_t step); + }} // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 5e5f9c91179e..f0e2c0f02caa 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -1,6 +1,7 @@ #include #include // for flip_stub +#include #include #include #include @@ -211,6 +212,10 @@ std::vector atleast_3d(TensorList tensors) { return result; } +Tensor chalf(const Tensor& self, c10::optional memory_format) { + return self.to(kComplexHalf, false, false, memory_format); +} + DEFINE_DISPATCH(flip_stub); }} // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 03ee31e696aa..4909ebe84bb0 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -1,4 +1,10 @@ -#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif #include diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index 065850261920..9a3a5b10cb26 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -13,7 +13,7 @@ namespace native { /// Else, return a new tensor containing the elementwise sums. 
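[Editor's note, not part of the diff] The split-count rule in get_num_splits() above is a ceiling division clamped to at least one split, so split_size > dim_size still yields a single short chunk instead of an error. A minimal standalone sketch (function name is illustrative only):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Sketch of the ceil-div + clamp rule used by get_num_splits(); split_size == 0
// is only legal when the dimension itself has size 0, giving exactly one empty split.
int64_t num_splits_for(int64_t dim_size, int64_t split_size) {
  if (split_size == 0) {
    return 1;
  }
  return std::max((dim_size + split_size - 1) / split_size, int64_t{1});
}

int main() {
  std::cout << num_splits_for(10, 3) << '\n';  // 4 -> chunks of 3, 3, 3, 1
  std::cout << num_splits_for(10, 12) << '\n'; // 1 -> a single chunk of size 10 (kept for BC)
  std::cout << num_splits_for(0, 0) << '\n';   // 1 -> one empty split
}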
Tensor _test_optional_intlist( const Tensor& values, - c10::optional addends) { + at::OptionalIntArrayRef addends) { if (!addends) { return values; } diff --git a/aten/src/ATen/native/TopKImpl.h b/aten/src/ATen/native/TopKImpl.h new file mode 100644 index 000000000000..69d5c70236b8 --- /dev/null +++ b/aten/src/ATen/native/TopKImpl.h @@ -0,0 +1,95 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +#ifdef CPU_CAPABILITY +inline namespace CPU_CAPABILITY { +#else +inline namespace DEFAULT { +#endif + +// Core topk loop, shared between CPU and QuantizedCPU +template +void topk_impl_loop( + const int64_t mode_values_stride, + const int64_t mode_indices_stride, + const int64_t tmp_values_stride, + const int64_t k, + const int64_t dim_size, + const bool largest, + const bool sorted, + char** data, const int64_t* strides, const int64_t n) { + + using elem_t = std::pair; + std::vector queue(dim_size); + for (const auto i : c10::irange(n)) { + TensorAccessor mode_values( + reinterpret_cast(data[0] + i * strides[0]), + &k, &mode_values_stride); + TensorAccessor mode_indices( + reinterpret_cast(data[1] + i * strides[1]), + &k, &mode_indices_stride); + TensorAccessor tmp_values( + reinterpret_cast(data[2] + i * strides[2]), + &dim_size, &tmp_values_stride); + + auto n = dim_size; + auto use_partial_sort = k * 64 <= n; + + for (const auto j : c10::irange(n)) { + queue[j].first = tmp_values[j]; + queue[j].second = j; + } + + // we want nan to be sorted as top for numpy compatibility + if (use_partial_sort) { + if (largest) { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } else { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } else { + if (largest) { + std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k - 1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } + } else { + std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k -1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } + } + + for (const auto j : c10::irange(k)) { + mode_values[j] = queue[j].first; + mode_indices[j] = queue[j].second; + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/TransposeType.h b/aten/src/ATen/native/TransposeType.h new file mode 100644 index 000000000000..5353394a9dde --- /dev/null +++ b/aten/src/ATen/native/TransposeType.h @@ -0,0 +1,24 @@ +#pragma once +#include + +namespace at { +namespace native { + +// Used as an interface between the different BLAS-like libraries +enum class TransposeType { + NoTranspose, + Transpose, + ConjTranspose, +}; + +// Transforms TransposeType into the BLAS / LAPACK format +static char to_blas(TransposeType trans) { + switch (trans) { + 
case TransposeType::Transpose: return 'T'; + case TransposeType::NoTranspose: return 'N'; + case TransposeType::ConjTranspose: return 'C'; + } + TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); +} + +}} // at::native diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index b6a8a690bd28..b00a4a176918 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace at { @@ -174,7 +175,13 @@ Tensor trace_backward(const Tensor& grad, IntArrayRef sizes) { auto grad_input = at::zeros(sizes[0] * sizes[1], grad.options()); auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong)); - grad_input.index_fill_(0, indices, grad); + // for composite compliance, use out-of-place variant of + // `index_fill` if grad tensor is a Tensor Subclass. + if (isTensorSubclassLike(grad)) { + grad_input = grad_input.index_fill(0, indices, grad); + } else { + grad_input.index_fill_(0, indices, grad); + } return grad_input.view(sizes); } diff --git a/aten/src/ATen/native/TriangularOpsUtils.h b/aten/src/ATen/native/TriangularOpsUtils.h index 13c2d33c6c18..c5bce42ed3fd 100644 --- a/aten/src/ATen/native/TriangularOpsUtils.h +++ b/aten/src/ATen/native/TriangularOpsUtils.h @@ -26,7 +26,7 @@ static inline std::tuple checkTrilTriuBatchContiguous(const Tensor // Complete contiguity is the most desired property, which is why // we return true if the tensor is contiguous if (tensor.is_contiguous()) { - auto default_strides_for_size = contiguous_strides_vec(tensor.sizes()); + auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes()); if (tensor.strides() == default_strides_for_size) { return std::make_tuple(true, tensor); } else { diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp index a49e2a582658..9577e7c9dc58 100644 --- a/aten/src/ATen/native/TypeProperties.cpp +++ b/aten/src/ATen/native/TypeProperties.cpp @@ -133,7 +133,7 @@ ScalarType result_type(const ResultTypeState& in_state) { return combine_categories(in_state.dimResult, combine_categories(in_state.zeroResult, in_state.wrappedResult)); } -ScalarType result_type(TensorList tensors) { +ScalarType result_type(ITensorListRef tensors) { ResultTypeState state = {}; for (const Tensor& tensor : tensors) { state = update_result_type_state(tensor, state); diff --git a/aten/src/ATen/native/TypeProperties.h b/aten/src/ATen/native/TypeProperties.h index 85ffed1ee07f..b0f18c594882 100644 --- a/aten/src/ATen/native/TypeProperties.h +++ b/aten/src/ATen/native/TypeProperties.h @@ -1,6 +1,7 @@ #pragma once -#include +#include +#include namespace at { namespace native { @@ -11,8 +12,9 @@ struct ResultTypeState { }; TORCH_API ResultTypeState update_result_type_state(const Tensor& tensor, const ResultTypeState& in_state); +TORCH_API ResultTypeState update_result_type_state(const Scalar& scalar, const ResultTypeState& in_state); TORCH_API ScalarType result_type(const ResultTypeState& state); -TORCH_API ScalarType result_type(TensorList tensors); +TORCH_API ScalarType result_type(ITensorListRef tensors); }} diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index e8cfeba2df02..085fcbcca975 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -67,6 +67,7 @@ CREATE_UNARY_FLOAT_META_FUNC(special_i0e) CREATE_UNARY_FLOAT_META_FUNC(special_i1) 
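[Editor's note, not part of the diff] As a reference for the selection strategy in topk_impl_loop (TopKImpl.h above): NaN compares as largest for NumPy compatibility, and the k * 64 <= n heuristic switches between partial_sort for small k and nth_element plus an optional prefix sort otherwise. A sketch on a plain std::vector (largest=true, sorted=true; the name topk_largest is illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

// Editor's sketch of the top-k selection used by topk_impl_loop; assumes 1 <= k <= v.size().
std::vector<std::pair<float, int64_t>> topk_largest(const std::vector<float>& v, int64_t k) {
  using elem_t = std::pair<float, int64_t>;
  std::vector<elem_t> queue(v.size());
  for (int64_t j = 0; j < static_cast<int64_t>(v.size()); ++j) {
    queue[j] = {v[j], j};
  }
  // NaN sorts to the top, matching the comparators in the kernel.
  auto gt = [](const elem_t& x, const elem_t& y) {
    return (std::isnan(x.first) && !std::isnan(y.first)) || (x.first > y.first);
  };
  const int64_t n = static_cast<int64_t>(queue.size());
  if (k * 64 <= n) {
    // k is small relative to n: a partial sort already leaves the first k elements ordered.
    std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), gt);
  } else {
    // k is large: nth_element is cheaper; sort only the prefix when ordered output is needed.
    std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), gt);
    std::sort(queue.begin(), queue.begin() + k - 1, gt);
  }
  queue.resize(k);
  return queue;
}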
CREATE_UNARY_FLOAT_META_FUNC(special_i1e) CREATE_UNARY_FLOAT_META_FUNC(special_ndtri) +CREATE_UNARY_FLOAT_META_FUNC(special_log_ndtr) CREATE_UNARY_FLOAT_META_FUNC(sqrt) CREATE_UNARY_FLOAT_META_FUNC(tan) CREATE_UNARY_FLOAT_META_FUNC(tanh) @@ -184,6 +185,7 @@ CREATE_UNARY_TORCH_IMPL_FUNC(special_i0e_out, special_i0e_stub) CREATE_UNARY_TORCH_IMPL_FUNC(special_i1e_out, special_i1e_stub) CREATE_UNARY_TORCH_IMPL_FUNC(special_i1_out, special_i1_stub) CREATE_UNARY_TORCH_IMPL_FUNC(special_ndtri_out, special_ndtri_stub) +CREATE_UNARY_TORCH_IMPL_FUNC(special_log_ndtr_out, special_log_ndtr_stub) CREATE_UNARY_TORCH_IMPL_FUNC(sqrt_out, sqrt_stub) CREATE_UNARY_TORCH_IMPL_FUNC(tan_out, tan_stub) CREATE_UNARY_TORCH_IMPL_FUNC(tanh_out, tanh_stub) @@ -250,7 +252,7 @@ template static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, const Tensor& self, Stub& stub, bool promotes_integer_to_float) { if (self.is_complex() && !result.is_complex()) { // Checks if the corresponding float type can be cast to the desired dtype - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); TORCH_CHECK(canCast(float_type, result.scalar_type()), "result type ", float_type, " can't be cast to the desired output type ", result.scalar_type()); @@ -288,8 +290,8 @@ static inline Tensor unary_op_impl(const Tensor& self, OutImpl& out_impl) { template static inline Tensor unary_op_impl_with_complex_to_float(const Tensor& self, OutImpl& out_impl) { if (self.is_complex()) { - const auto float_type = c10::toValueType(self.scalar_type()); - Tensor result = at::empty({0}, self.options().dtype(float_type)); + const auto float_type = c10::toRealValueType(self.scalar_type()); + Tensor result = at::empty_like(self, self.options().dtype(float_type)); return out_impl(result, self); } @@ -385,7 +387,7 @@ Tensor& angle_out(const Tensor& self, Tensor& result) { } Tensor angle(const Tensor& self) { if (self.is_complex()) { - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); Tensor result = at::empty({0}, self.options().dtype(float_type)); return at::angle_out(result, self); } @@ -538,7 +540,7 @@ Tensor special_sinc(const Tensor& self) { return self.sinc(); } namespace { inline Tensor calc_ndtr(const Tensor& self) { - auto x_sqrt_2 = self / std::sqrt(2.); + auto x_sqrt_2 = self * M_SQRT1_2; return (1 + at::erf(x_sqrt_2)) * 0.5; } @@ -806,8 +808,6 @@ Tensor& special_gammaln_out(const Tensor& self, Tensor& result) { return at::lga DEFINE_DISPATCH(abs_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(angle_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(real_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_DISPATCH(imag_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(conj_physical_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(acos_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(acosh_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) @@ -841,6 +841,7 @@ DEFINE_DISPATCH(log1p_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global- DEFINE_DISPATCH(log2_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(logical_not_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(special_ndtri_stub); 
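[Editor's note, not part of the diff] calc_ndtr above evaluates the standard normal CDF via Phi(x) = 0.5 * (1 + erf(x / sqrt(2))), multiplying by M_SQRT1_2 instead of dividing by sqrt(2). A scalar reference:

#include <cmath>

// Editor's sketch: the kernel above applies the same identity elementwise with at::erf.
inline double ndtr_ref(double x) {
  constexpr double kInvSqrt2 = 0.70710678118654752440;  // 1 / sqrt(2), i.e. M_SQRT1_2
  return 0.5 * (1.0 + std::erf(x * kInvSqrt2));
}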
// NOLINT(cppcoreguidelines-avoid-non-const-global-variables) +DEFINE_DISPATCH(special_log_ndtr_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(neg_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(nan_to_num_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) DEFINE_DISPATCH(polygamma_stub); // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 47224d51fc35..3c205cb9a878 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -6,7 +6,7 @@ namespace at { class Tensor; -struct TensorIterator; +class TensorBase; struct TensorIteratorBase; } @@ -17,8 +17,6 @@ using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a); DECLARE_DISPATCH(unary_fn, abs_stub); DECLARE_DISPATCH(unary_fn, angle_stub); -DECLARE_DISPATCH(unary_fn, real_stub); -DECLARE_DISPATCH(unary_fn, imag_stub); DECLARE_DISPATCH(unary_fn, conj_physical_stub); DECLARE_DISPATCH(unary_fn, acos_stub); DECLARE_DISPATCH(unary_fn, acosh_stub); @@ -52,6 +50,7 @@ DECLARE_DISPATCH(unary_fn, log10_stub); DECLARE_DISPATCH(unary_fn, log1p_stub); DECLARE_DISPATCH(unary_fn, log2_stub); DECLARE_DISPATCH(unary_fn, special_ndtri_stub); +DECLARE_DISPATCH(unary_fn, special_log_ndtr_stub); DECLARE_DISPATCH(unary_fn, neg_stub); DECLARE_DISPATCH(unary_fn, reciprocal_stub); @@ -73,14 +72,14 @@ DECLARE_DISPATCH(unary_fn, trunc_stub); DECLARE_DISPATCH(unary_fn, lgamma_stub); // NB: these are actually defined in Distribution -DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, c10::optional), bernoulli_tensor_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const double, c10::optional), bernoulli_scalar_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional), bernoulli_tensor_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional), bernoulli_scalar_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), cauchy_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), exponential_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), log_normal_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), uniform_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const double, const double, c10::optional), normal_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional), normal_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional), random_from_to_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_full_64_bits_range_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_stub); diff --git a/aten/src/ATen/native/Unfold2d.h b/aten/src/ATen/native/Unfold2d.h index bfee0bc782f4..2ea27e0caded 100644 --- a/aten/src/ATen/native/Unfold2d.h +++ b/aten/src/ATen/native/Unfold2d.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include namespace at { namespace native { @@ -19,7 +20,8 @@ using unfold2d_fn = void (*)( int64_t input_height, int64_t input_width, int64_t output_height, - int64_t output_width + int64_t output_width, + bool is_channels_last ); DECLARE_DISPATCH(unfold2d_fn, unfolded2d_copy_stub); diff --git 
a/aten/src/ATen/native/UnfoldBackward.cpp b/aten/src/ATen/native/UnfoldBackward.cpp index f1509c9dd837..10bee80cea23 100644 --- a/aten/src/ATen/native/UnfoldBackward.cpp +++ b/aten/src/ATen/native/UnfoldBackward.cpp @@ -1,5 +1,14 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { DEFINE_DISPATCH(unfold_backward_stub); diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 8e33d64aa5f6..1f6c8fa1b289 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -1,11 +1,17 @@ #pragma once -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { using unfold_backward_fn = void (*)( @@ -106,8 +112,8 @@ static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_in( Tensor& grad_out, const Tensor& grad_in, int64_t dim, - int64_t size, - int64_t step + int64_t /*size*/, + int64_t /*step*/ ) { dim = maybe_wrap_dim(dim, grad_out.dim()); // last dim stores the folds diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index ce911f1763b6..dc066d99d46d 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -182,7 +182,7 @@ std::tuple _unique_dim_cpu_template( TORCH_CHECK( num_zero_dims == 1, "Number of zero sized dimensions is more than one, so unique cannot be applied ") - Tensor output = at::empty({0}, self.options()); + Tensor output = at::empty(sizes, self.options()); Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong)); Tensor counts = at::empty({0}, self.options().dtype(kLong)); diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp index bcc8891de8dc..db75b7e99fdb 100644 --- a/aten/src/ATen/native/UpSample.cpp +++ b/aten/src/ATen/native/UpSample.cpp @@ -9,7 +9,7 @@ namespace upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { const auto spatial_dimensions = static_cast(input_size.size()) - 2; if (output_size) { diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 5bc3a434f428..6b248352de6a 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include @@ -51,7 +51,7 @@ namespace upsample { TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors); inline c10::optional get_scale_value(c10::optional> scales, int idx) { @@ -328,6 +328,39 @@ static inline int64_t nearest_neighbor_exact_compute_source_index( return src_index; } +static inline int64_t nearest_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + c10::optional scales) { + // This method specificly treats cases: output_size == input_size or + // output_size == 2 * input_size, that we would like to get rid of + // We keep this method for BC and consider as deprecated. 
+ // See nearest_exact_idx as replacement + if (output_size == input_size) { + // scale_factor = 1, simply copy + return output_index; + } else if (output_size == 2 * input_size) { + // scale_factor = 2, shift input index + return output_index >> 1; + } else { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_compute_source_index(scale, output_index, input_size); + } +} + +static inline int64_t nearest_exact_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + c10::optional scales) { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); +} + +// Define a typedef to dispatch to nearest_idx or nearest_exact_idx +typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional); + template static scalar_t upsample_get_value_bounded( scalar_t* data, diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 95d9f91bcb80..7cda89c61264 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -188,7 +188,7 @@ static void upsample_bicubic2d_backward_kernel( auto grad_output = grad_output_.contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bicubic2d_backward", [&] { scalar_t* idata = grad_input.data_ptr(); scalar_t* odata = grad_output.data_ptr(); @@ -264,7 +264,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_bicubic2d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -275,7 +275,7 @@ Tensor upsample_bicubic2d( Tensor upsample_bicubic2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { @@ -287,7 +287,7 @@ Tensor upsample_bicubic2d_backward( Tensor _upsample_bicubic2d_aa( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -298,7 +298,7 @@ Tensor _upsample_bicubic2d_aa( Tensor _upsample_bicubic2d_aa_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp index f73bb50c9ff4..2a228a86ac71 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -145,7 +145,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_bilinear2d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -156,7 +156,7 @@ Tensor upsample_bilinear2d( Tensor upsample_bilinear2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { @@ -168,7 +168,7 @@ Tensor upsample_bilinear2d_backward( 
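[Editor's note, not part of the diff] The legacy nearest_idx rule above keeps two special-cased ratios for backward compatibility and otherwise truncates output_index * scale. A standalone sketch, where scale stands in for compute_scales_value() when no explicit scale factor is passed (input_size / output_size):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Editor's sketch of the deprecated nearest-neighbor index mapping kept for BC.
int64_t nearest_src_idx(int64_t out_idx, int64_t in_size, int64_t out_size) {
  if (out_size == in_size) {
    return out_idx;        // scale_factor == 1: plain copy
  }
  if (out_size == 2 * in_size) {
    return out_idx >> 1;   // scale_factor == 2: shift the output index
  }
  const float scale = static_cast<float>(in_size) / static_cast<float>(out_size);
  const auto src = static_cast<int64_t>(std::floor(out_idx * scale));
  return std::min(src, in_size - 1);
}

int main() {
  // Upsampling 3 -> 5: output indices 0..4 read from inputs 0, 0, 1, 1, 2.
  for (int64_t i = 0; i < 5; ++i) {
    std::cout << nearest_src_idx(i, 3, 5) << ' ';
  }
}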
Tensor _upsample_bilinear2d_aa( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -179,7 +179,7 @@ Tensor _upsample_bilinear2d_aa( Tensor _upsample_bilinear2d_aa_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp index 371a53dc8900..687cad5c879b 100644 --- a/aten/src/ATen/native/UpSampleLinear1d.cpp +++ b/aten/src/ATen/native/UpSampleLinear1d.cpp @@ -79,7 +79,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_linear1d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -89,7 +89,7 @@ Tensor upsample_linear1d( Tensor upsample_linear1d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 52fa7bcc5c9a..b9bc5b3c5b96 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -109,7 +109,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest1d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); @@ -118,7 +118,7 @@ Tensor upsample_nearest1d( Tensor _upsample_nearest_exact1d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_w = get_scale_value(scale_factors, 0); @@ -127,7 +127,7 @@ Tensor _upsample_nearest_exact1d( Tensor upsample_nearest1d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -137,7 +137,7 @@ Tensor upsample_nearest1d_backward( Tensor _upsample_nearest_exact1d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index 864121fb0afa..1f9a9eafd4f6 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -134,7 +134,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest2d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); @@ -144,7 +144,7 @@ Tensor upsample_nearest2d( Tensor _upsample_nearest_exact2d( const Tensor& input, - c10::optional 
output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); @@ -154,7 +154,7 @@ Tensor _upsample_nearest_exact2d( Tensor upsample_nearest2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -165,7 +165,7 @@ Tensor upsample_nearest2d_backward( Tensor _upsample_nearest_exact2d_backward( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index c659a86cd81f..ff559f3e09c0 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -149,7 +149,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_nearest3d_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -160,7 +160,7 @@ Tensor upsample_nearest3d_cpu( Tensor _upsample_nearest_exact3d_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -172,7 +172,7 @@ Tensor _upsample_nearest_exact3d_cpu( // when structured kernels can handle QuantizedCPU, update these overloads to be CompositeExplicitAutograd Tensor upsample_nearest3d_backward_cpu( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -184,7 +184,7 @@ Tensor upsample_nearest3d_backward_cpu( Tensor _upsample_nearest_exact3d_backward_cpu( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index 75a77a76c623..52cb2e00df46 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -51,7 +51,7 @@ TORCH_META_FUNC(upsample_trilinear3d_backward) ( " but got grad_output.size(", i, ") = ", grad_output.size(i)); } - set_output(input_size, grad_output.options()); + set_output(input_size, grad_output.options().memory_format(grad_output.suggest_memory_format())); } } // namespace meta @@ -90,7 +90,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_trilinear3d( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); @@ -102,7 +102,7 @@ Tensor upsample_trilinear3d( Tensor upsample_trilinear3d_backward( const Tensor& grad_output, - c10::optional output_size, + 
at::OptionalIntArrayRef output_size, IntArrayRef input_size, bool align_corners, c10::optional> scale_factors) { diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp index d1bc46809c53..b2229bdbf0d2 100644 --- a/aten/src/ATen/native/WeightNorm.cpp +++ b/aten/src/ATen/native/WeightNorm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -10,6 +11,9 @@ namespace at { namespace native { +DEFINE_DISPATCH(weight_norm_stub); +DEFINE_DISPATCH(weight_norm_backward_stub); + // Staying faithful to the Python for now for clarity, look for optimizations later // (e.g., single return statement for RVO) Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) @@ -32,6 +36,38 @@ Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim) } } +std::tuple weight_norm_cpu( + const Tensor& v, + const Tensor& g, + int64_t dim) { + auto w = at::empty_like(v, at::MemoryFormat::Contiguous); + + // align with cuda behavior, keep norm in 'Float' when g is 'BFloat16' + const auto dtype = g.scalar_type() == at::ScalarType::BFloat16 ? + at::ScalarType::Float : g.scalar_type(); + auto norm = at::empty_strided(g.sizes(), g.strides(), g.options().dtype(dtype)); + weight_norm_stub(kCPU, w, norm, v, g, dim); + + return std::tuple{w, norm}; +} + +std::tuple weight_norm_backward_cpu( + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t dim) { + TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous"); + TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous"); + TORCH_CHECK(saved_norm.is_contiguous(), "saved_norm must be contiguous"); + + auto grad_v = at::empty_like(saved_v, at::MemoryFormat::Contiguous); + auto grad_g = at::empty_like(saved_g, at::MemoryFormat::Contiguous); + weight_norm_backward_stub(kCPU, grad_v, grad_g, grad_w, saved_v, saved_g, saved_norm, dim); + + return std::tuple{grad_v, grad_g}; +} + Tensor _weight_norm (const Tensor & v_in, const Tensor & g_in, @@ -46,12 +82,12 @@ Tensor _weight_norm auto v = v_in.contiguous(); auto g = g_in.contiguous(); - bool can_use_fused = v.is_cuda() && (dim == 0 || dim == v.dim() - 1); + bool can_use_fused = (dim == 0) || (dim == v.dim() - 1); if (can_use_fused) { // weight_norm does not have a derivative defined for it, so this will route back through // VariableType.cpp, and construct a WeightNormFusedBackward object in the autograd graph. - return std::get<0>(at::_weight_norm_cuda_interface(v, g, dim)); + return std::get<0>(at::_weight_norm_interface(v, g, dim)); } else { // Double-differentiable primitive ops // at::native::norm_except_dim would probably be fine as well. @@ -59,7 +95,7 @@ Tensor _weight_norm } } -// Differentiable backward path, an alternative to weight_norm_cuda_backward, to be used +// Differentiable backward path, an alternative to weight_norm_backward, to be used // when backward is itself creating a graph. // The GradMode::is_enabled() check must be performed within Functions.cpp; that's why we // define a separate function here, instead of inlining it in weight_norm_cuda_backward. 
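[Editor's note, not part of the diff] The fused weight_norm_cpu path added above produces the same weight as the composite fallback a few lines earlier: w = v * g / ||v||, with the L2 norm reduced over every dimension except dim. A minimal reference using the double-differentiable composite ops:

#include <ATen/ATen.h>

// Editor's sketch: what the fused kernel computes for the weight output, expressed
// with the composite ops used by the non-fused path.
at::Tensor weight_norm_reference(const at::Tensor& v, const at::Tensor& g, int64_t dim) {
  // norm_except_dim(v, 2, dim) keeps `dim` and reduces the L2 norm over all other dims.
  return v * (g / at::norm_except_dim(v, 2, dim));
}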
diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp index 0212f2688b52..07fc3d245fe2 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp @@ -71,7 +71,7 @@ int register_linear_params() { } namespace { -static auto linear_params = register_linear_params(); +static C10_UNUSED auto linear_params = register_linear_params(); } // namespace }} // namespace ao::sparse diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index e0fb55427a77..187ed4fd1404 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp index a0a389f818c4..ec6e160b16c3 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp deleted file mode 100644 index 599f0f866e2b..000000000000 --- a/aten/src/ATen/native/attention.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include - -namespace at { - -namespace native { - -namespace { - -Tensor gemm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto b_ = b.transpose(1, 0); - auto c_ = at::native::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), b.size(0)}); -} - -// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias -std::tuple transform_bias_rescale_qkv( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto B = qkv.size(0); - auto T = qkv.size(1); - auto _3D = qkv.size(2); - auto D = _3D / 3; - TORCH_CHECK(D % num_head == 0); - const auto dim_per_head = D / num_head; - auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); - - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - qkv.scalar_type(), - "transform_bias_rescale_qkv", - [&] { - scalar_t* qkv_data = qkv.data_ptr(); - scalar_t* qkv_bias_data = qkv_bias.data_ptr(); - scalar_t* q_k_v_data = q_k_v.data_ptr(); - const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(dim_per_head)); - - int64_t grain_size = - std::min(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); - parallel_for( - 0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) { - for (auto i : c10::irange(begin, end)) { - auto t = i % T; - i /= T; - auto nh = i % num_head; - i /= num_head; - auto b = i; - using Vec = vec::Vectorized; - auto V = vec::Vectorized::size(); - // TODO: handle epilogue - for (auto dh = 0; dh < dim_per_head / V; dh += V) { - auto d = nh * dim_per_head + dh; - // load - auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); - auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]); - auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]); - - auto q_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) + - q_bias_data; - auto k_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) + 
- k_bias_data; - auto v_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) + - v_bias_data; - - q_data = q_data / Vec(sqrt_dim_per_head); - - q_data.store(&q_k_v_data - [0 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - num_head * T * dim_per_head + - t * dim_per_head + dh]); - k_data.store(&q_k_v_data - [1 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - num_head * T * dim_per_head + - t * dim_per_head + dh]); - v_data.store(&q_k_v_data - [2 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - num_head * T * dim_per_head + - t * dim_per_head + dh]); - } - } - }); - }); - auto q_k_v_s = - at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); - return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); -} - -Tensor bmm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - auto bt_ = b_.transpose(2, 1); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, bt_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); -} - -void masked_softmax_dropout( - const Tensor& attn_scores, - const c10::optional& attn_mask) { - auto B = attn_scores.size(0); - auto num_heads = attn_scores.size(1); - auto T = attn_scores.size(2); - if (attn_mask) { - TORCH_CHECK(attn_mask->is_contiguous()); - } - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - attn_scores.scalar_type(), - "masked_softmax_dropout", - [&] { - using accscalar_t = acc_type; - // TODO: proper implementation with masking. - scalar_t* attn_scores_data = attn_scores.data_ptr(); - int64_t grain_size = std::min(internal::GRAIN_SIZE / T, (int64_t)1); - parallel_for( - 0, B * num_heads * T, grain_size, [&](int64_t begin, int64_t end) { - for (const auto i : c10::irange(begin, end)) { - using Vec = vec::Vectorized; - auto V = vec::Vectorized::size(); - - scalar_t* input_data = attn_scores_data + i * T; - auto max_input = Vec(std::numeric_limits::lowest()); - // TODO: handle epilogue - for (auto t = 0; t < T; t += V) { - auto v = Vec::loadu(&input_data[t]); - max_input = vec::maximum(max_input, v); - } - - auto hmax = std::numeric_limits::lowest(); - for (auto i = 0; i < V; ++i) { - hmax = std::max(max_input[i], hmax); - } - accscalar_t hsum = 0; - for (auto t = 0; t < T; t += V) { - auto v = Vec::loadu(&input_data[t]); - // TODO: vectorize in accscalar_t? - for (auto i = 0; i < V; ++i) { - hsum += std::exp(static_cast(v[i]) - hmax); - } - } - auto inv_denominator = 1.0 / hsum; - for (auto t = 0; t < T; t += V) { - Vec v = Vec::loadu(&input_data[t]); - - // TODO: vectorize in accscalar_t? - // TODO this faster solution does not work on Android build - /* - for (auto i = 0; i < V; ++i) { - v[i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); - } - v.store(&input_data[t]); - */ - for (auto i = 0; i < V; ++i) { - input_data[t + i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); - } - } - } - }); - }); -} - -Tensor bmm_nn(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - // TODO: are these a single call to cublas batched matmul? 
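[Editor's note, not part of the diff] The file being deleted here fused a standard multi-head self-attention forward. For reference, the same computation can be written with composite ATen ops; shapes follow the comments in this file (query [B, T, D], qkv_weight [3D, D]), and mask/dropout are omitted as in the TODOs above. This is an editor's sketch, not the replacement code:

#include <ATen/ATen.h>
#include <cmath>

// Editor's sketch of the multi-head self-attention forward that the fused path decomposes.
at::Tensor mhsa_reference(const at::Tensor& query,
                          const at::Tensor& qkv_weight,
                          const at::Tensor& qkv_bias,
                          const at::Tensor& proj_weight,
                          const at::Tensor& proj_bias,
                          int64_t num_head) {
  const auto B = query.size(0), T = query.size(1), D = query.size(2);
  const auto dim_per_head = D / num_head;
  // [B, T, 3D] projected, then reshaped to [3, B, num_head, T, dim_per_head].
  auto qkv = at::linear(query, qkv_weight, qkv_bias)
                 .view({B, T, 3, num_head, dim_per_head})
                 .permute({2, 0, 3, 1, 4});
  auto q = qkv[0] / std::sqrt(static_cast<double>(dim_per_head));  // rescale after bias, as above
  auto k = qkv[1];
  auto v = qkv[2];
  auto attn = at::softmax(at::matmul(q, k.transpose(-2, -1)), -1);  // [B, H, T, T]
  auto ctx = at::matmul(attn, v)                                    // [B, H, T, dim_per_head]
                 .transpose(1, 2).contiguous().view({B, T, D});     // [B, T, D]
  return at::linear(ctx, proj_weight, proj_bias);
}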
- auto c_ = at::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); -} - -Tensor transform_0213(const Tensor& a) { - // TODO: check perf vs dedicated kernel. - return a.permute({0, 2, 1, 3}) - .contiguous() - .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); -} - -Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto r_ = at::native::linear(a_, b, c); - return r_.view({a.size(0), a.size(1), r_.size(1)}); -} - -} // namespace - -Tensor multi_head_self_attention_cpu( - const Tensor& query, - const Tensor& qkv_weight, - const Tensor& qkv_bias, - const Tensor& proj_weight, - const Tensor& proj_bias, - const int64_t num_head, - const c10::optional& mask) { - // query shape: [B, T, D] - // qkv_weight shape: [3 * D, D] - - // shape: [B, T, 3 x D] - auto qkv = gemm_nt(query, qkv_weight); - - // shape: 3 x [B, num_head, T, dim_per_head] - auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - auto q = std::get<0>(q_k_v); - auto k = std::get<1>(q_k_v); - auto v = std::get<2>(q_k_v); - - // shape: [B, num_head, T, T] - auto qkt = bmm_nt(q, k); - - // shape: [B, num_head, T, T] - masked_softmax_dropout(qkt, mask); - - // shape: [B, num_head, T, dim_per_head] - auto attn_ctx = bmm_nn(qkt, v); - - // shape: [B, T, D] - auto attn = transform_0213(attn_ctx); - - // shape: [B, T, D] - auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); - - return proj; -} - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/batch_norm.h b/aten/src/ATen/native/batch_norm.h index 4c25b08aa684..b729dfe199b0 100644 --- a/aten/src/ATen/native/batch_norm.h +++ b/aten/src/ATen/native/batch_norm.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace at { diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index b192d0c4d707..14fef621b10f 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -24,41 +24,106 @@ namespace { template inline void _vec_log_sigmoid(TensorBase &output, TensorBase &buffer, const TensorBase &input) { - using Vec = Vectorized; - scalar_t* output_data = output.data_ptr(); - scalar_t* buffer_data = buffer.data_ptr(); - scalar_t* input_data = input.data_ptr(); - parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { - int64_t size = end - begin; - int64_t d = 0; - for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec data_vec = Vec::loadu(input_data + begin+ d); - Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); - Vec buffer_vec = data_vec.abs().neg().exp(); - Vec output_vec = min_vec - buffer_vec.log1p(); - buffer_vec.store(buffer_data + begin + d); - output_vec.store(output_data + begin + d); - } - if (size - d > 0) { - Vec data_vec = Vec::loadu(input_data + begin + d, size - d); - Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); - Vec buffer_vec = data_vec.abs().neg().exp(); - Vec output_vec = min_vec - buffer_vec.log1p(); - buffer_vec.store(buffer_data + begin + d, size - d); - output_vec.store(output_data + begin + d, size - d); - } - }); + if (input.scalar_type() == kBFloat16) { + using Vec = Vectorized; + BFloat16* output_data = output.data_ptr(); + BFloat16* buffer_data = buffer.data_ptr(); + BFloat16* input_data = input.data_ptr(); + parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { + int64_t size = end - begin; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) 
{ + Vec data_vec = Vec::loadu(input_data + begin+ d); + Vectorized data_vec0, data_vec1; + std::tie(data_vec0, data_vec1) = convert_bfloat16_float(data_vec); + Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); + Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); + Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); + min_vec = minimum(data_vec1, Vectorized(float(0))); + Vectorized buffer_vec1 = data_vec1.abs().neg().exp(); + Vectorized output_vec1 = min_vec - buffer_vec1.log1p(); + convert_float_bfloat16(buffer_vec0, buffer_vec1).store(buffer_data + begin + d); + convert_float_bfloat16(output_vec0, output_vec1).store(output_data + begin + d); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(input_data + begin + d, size - d); + Vectorized data_vec0, data_vec1; + std::tie(data_vec0, data_vec1) = convert_bfloat16_float(data_vec); + Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); + Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); + Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); + min_vec = minimum(data_vec1, Vectorized(float(0))); + Vectorized buffer_vec1 = data_vec1.abs().neg().exp(); + Vectorized output_vec1 = min_vec - buffer_vec1.log1p(); + convert_float_bfloat16(buffer_vec0, buffer_vec1).store(buffer_data + begin + d, size - d); + convert_float_bfloat16(output_vec0, output_vec1).store(output_data + begin + d, size - d); + } + }); + } else { + using Vec = Vectorized; + scalar_t* output_data = output.data_ptr(); + scalar_t* buffer_data = buffer.data_ptr(); + scalar_t* input_data = input.data_ptr(); + parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { + int64_t size = end - begin; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(input_data + begin+ d); + Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); + Vec buffer_vec = data_vec.abs().neg().exp(); + Vec output_vec = min_vec - buffer_vec.log1p(); + buffer_vec.store(buffer_data + begin + d); + output_vec.store(output_data + begin + d); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(input_data + begin + d, size - d); + Vec min_vec = vec::minimum(data_vec, Vec(scalar_t(0))); + Vec buffer_vec = data_vec.abs().neg().exp(); + Vec output_vec = min_vec - buffer_vec.log1p(); + buffer_vec.store(buffer_data + begin + d, size - d); + output_vec.store(output_data + begin + d, size - d); + } + }); + } } -static void log_sigmoid_cpu_kernel( - TensorBase &output, TensorBase &buffer, const TensorBase &input) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] { +static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, input.scalar_type(), "log_sigmoid_cpu", [&] { _vec_log_sigmoid(output, buffer, input); }); } static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + using Vec = Vectorized; + auto zero_val = float(0); + auto zero_vec = Vectorized(zero_val); + auto one_val = float(1); + auto one_vec = Vectorized(one_val); + cpu_kernel_vec(iter, + [=](BFloat16 a, BFloat16 b, BFloat16 c) -> BFloat16 { + auto in_negative = float(a) < float(0); + auto max_deriv = in_negative ? float(1) : float(0); + auto sign = in_negative ? 
float(1) : -float(1); + return (max_deriv - sign * (float(b) / (float(1) + b))) * float(c); + }, + [=](Vec a, Vec b, Vec c) -> Vec { + Vectorized a0, a1, b0, b1, c0, c1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + std::tie(c0, c1) = convert_bfloat16_float(c); + auto mask = a0 < zero_vec; + auto max_deriv_vec = Vectorized::blendv(zero_vec, one_vec, mask); + auto sign_vec = Vectorized::blendv(one_vec.neg(), one_vec, mask); + a0 = (max_deriv_vec - sign_vec * (b0 / (one_vec + b0))) * c0; + mask = a1 < zero_vec; + max_deriv_vec = Vectorized::blendv(zero_vec, one_vec, mask); + sign_vec = Vectorized::blendv(one_vec.neg(), one_vec, mask); + a1 = (max_deriv_vec - sign_vec * (b1 / (one_vec + b1))) * c1; + return convert_float_bfloat16(a0, a1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { using Vec = Vectorized; auto zero_val = scalar_t(0); auto zero_vec = Vec(zero_val); @@ -78,6 +143,7 @@ static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { return (max_deriv_vec - sign_vec * (b / (one_vec + b))) * c; }); }); + } } static void threshold_kernel( @@ -102,71 +168,142 @@ static void threshold_kernel( } void elu_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) { - AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_cpu", [&]() { - using Vec = Vectorized; - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); - const Vec negcoef_vec(negcoef); - const Vec negiptcoef_vec(negiptcoef); - const Vec poscoef_vec(poscoef); - const Vec one_vec(static_cast(1)); - const Vec zero_vec(static_cast(0)); + if (it.common_dtype() == kBFloat16) { + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vectorized negcoef_vec(negcoef); + const Vectorized negiptcoef_vec(negiptcoef); + const Vectorized poscoef_vec(poscoef); + const Vectorized one_vec(static_cast(1)); + const Vectorized zero_vec(static_cast(0)); cpu_kernel_vec( - it, - [negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t { - return a <= scalar_t(0) ? (std::exp(a * negiptcoef) - scalar_t(1)) * negcoef : a * poscoef; - }, - [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vec a) -> Vec { - auto cmp = (a > zero_vec); - if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed - return a * poscoef_vec; - } else { - return Vec::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp); - } - }); - }); + it, + [negcoef, negiptcoef, poscoef](BFloat16 a) -> BFloat16 { + return float(a) <= float(0) ? 
(std::exp(float(a) * negiptcoef) - float(1)) * negcoef : float(a) * poscoef; + }, + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vectorized a) -> Vectorized { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + auto cmp0 = (a0 > zero_vec); + auto cmp1 = (a1 > zero_vec); + if (!cmp0.zero_mask() && !cmp1.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return convert_float_bfloat16(a0 * poscoef_vec, a1 * poscoef_vec); + } else { + auto res0 = Vectorized::blendv(((a0 * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a0 * poscoef_vec, cmp0); + auto res1 = Vectorized::blendv(((a1 * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a1 * poscoef_vec, cmp1); + return convert_float_bfloat16(res0, res1); + } + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_cpu", [&]() { + using Vec = Vectorized; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vec negcoef_vec(negcoef); + const Vec negiptcoef_vec(negiptcoef); + const Vec poscoef_vec(poscoef); + const Vec one_vec(static_cast(1)); + const Vec zero_vec(static_cast(0)); + cpu_kernel_vec( + it, + [negcoef, negiptcoef, poscoef](scalar_t a) -> scalar_t { + return a <= scalar_t(0) ? (std::exp(a * negiptcoef) - scalar_t(1)) * negcoef : a * poscoef; + }, + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vec a) -> Vec { + auto cmp = (a > zero_vec); + if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return a * poscoef_vec; + } else { + return Vec::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp); + } + }); + }); + } } void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, bool is_result) { - AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_backward_cpu", [&]() { - using Vec = Vectorized; - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); - const Vec negcoef_vec(negcoef); - const Vec negiptcoef_vec(negiptcoef); - const Vec poscoef_vec(poscoef); - const Vec zero_vec(static_cast(0)); + if (it.common_dtype() == kBFloat16) { + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vectorized negcoef_vec(negcoef); + const Vectorized negiptcoef_vec(negiptcoef); + const Vectorized poscoef_vec(poscoef); + const Vectorized zero_vec(static_cast(0)); cpu_kernel_vec( it, - [negcoef, negiptcoef, poscoef, is_result](scalar_t a, scalar_t b) -> scalar_t { + [negcoef, negiptcoef, poscoef, is_result](BFloat16 a, BFloat16 b) -> BFloat16 { if (is_result) { - return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef; + return float(b) <= float(0) ? float(a) * negiptcoef * (float(b) + negcoef) : float(a) * poscoef; } else { - return b <= scalar_t(0) ? a * negiptcoef * negcoef * std::exp(b * negiptcoef): a * poscoef; + return float(b) <= float(0) ? 
float(a) * negiptcoef * negcoef * std::exp(float(b) * negiptcoef): float(a) * poscoef; } }, - [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vec a, Vec b) -> Vec { - auto cmp = (b > zero_vec); + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vectorized a, Vectorized b) -> Vectorized { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + Vectorized b0, b1; + std::tie(b0, b1) = convert_bfloat16_float(b); + auto cmp0 = (b0 > zero_vec); + auto cmp1 = (b1 > zero_vec); if (is_result) { - if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed - return a * poscoef_vec; + if (!cmp0.zero_mask() && !cmp1.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return convert_float_bfloat16(a0 * poscoef_vec, a1 * poscoef_vec); } else { - return Vec::blendv(a * negiptcoef_vec * (b + negcoef_vec), a * poscoef_vec, cmp); + auto res0 = Vectorized::blendv(a0 * negiptcoef_vec * (b0 + negcoef_vec), a0 * poscoef_vec, cmp0); + auto res1 = Vectorized::blendv(a1 * negiptcoef_vec * (b1 + negcoef_vec), a1 * poscoef_vec, cmp1); + return convert_float_bfloat16(res0, res1); } } else { - return Vec::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp); + auto res0 = Vectorized::blendv(a0 * negiptcoef_vec * negcoef_vec * (b0 * negiptcoef_vec).exp(), a0 * poscoef_vec, cmp0); + auto res1 = Vectorized::blendv(a1 * negiptcoef_vec * negcoef_vec * (b1 * negiptcoef_vec).exp(), a1 * poscoef_vec, cmp1); + return convert_float_bfloat16(res0, res1); } } - ); - }); + ); + } else { + AT_DISPATCH_FLOATING_TYPES(it.dtype(), "elu_backward_cpu", [&]() { + using Vec = Vectorized; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + const Vec negcoef_vec(negcoef); + const Vec negiptcoef_vec(negiptcoef); + const Vec poscoef_vec(poscoef); + const Vec zero_vec(static_cast(0)); + cpu_kernel_vec( + it, + [negcoef, negiptcoef, poscoef, is_result](scalar_t a, scalar_t b) -> scalar_t { + if (is_result) { + return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef; + } else { + return b <= scalar_t(0) ? a * negiptcoef * negcoef * std::exp(b * negiptcoef): a * poscoef; + } + }, + [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vec a, Vec b) -> Vec { + auto cmp = (b > zero_vec); + if (is_result) { + if (!cmp.zero_mask()) { // only a * poscoef (which is very quick) needs to be computed + return a * poscoef_vec; + } else { + return Vec::blendv(a * negiptcoef_vec * (b + negcoef_vec), a * poscoef_vec, cmp); + } + } else { + return Vec::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp); + } + } + ); + }); + } } // TODO(yangxm): Add another fast kernel using formula // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))) // and the fast tanh impl from Eigen. -void GeluKernelImpl(TensorIteratorBase& it) { +void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) { auto grain_size = at::internal::GRAIN_SIZE; // Numbers based on benchmarking. 
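[Editor's note, not part of the diff] The BFloat16 branches added to log_sigmoid, elu, and the other kernels above all follow one pattern: load a Vectorized<BFloat16>, widen it into two Vectorized<float> lanes, do the math in float, and narrow back on store. A minimal sketch of that pattern for a toy elementwise op (the op and function name are illustrative; convert_bfloat16_float / convert_float_bfloat16 are the ATen vec helpers used in the diff):

#include <ATen/cpu/vec/vec.h>
#include <c10/util/BFloat16.h>
#include <tuple>

// Editor's sketch of the widen-compute-narrow pattern, applied to y = x * x + 1.
inline at::vec::Vectorized<c10::BFloat16> toy_op(at::vec::Vectorized<c10::BFloat16> x) {
  using fVec = at::vec::Vectorized<float>;
  fVec x0, x1;
  std::tie(x0, x1) = at::vec::convert_bfloat16_float(x);  // widen: one bf16 vec -> two float vecs
  const fVec one(1.0f);
  fVec y0 = x0 * x0 + one;                                // compute in float for accuracy
  fVec y1 = x1 * x1 + one;
  return at::vec::convert_float_bfloat16(y0, y1);         // narrow back to bf16 on the way out
}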
// Benchmark: benchmarks/operator_benchmarks/pt/gelu_test.py @@ -187,57 +324,165 @@ void GeluKernelImpl(TensorIteratorBase& it) { if (it.numel() > GELU_MIN_ELEMENTS_FOR_MULTI_THREADING) { grain_size = it.numel() / at::get_num_threads(); } - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { - using Vec = vec::Vectorized; - const Vec kAlphaVec(scalar_t(M_SQRT1_2)); - const Vec kOneVec(scalar_t(1)); - const Vec kPointFiveVec(scalar_t(0.5)); - cpu_kernel_vec( - it, - [](scalar_t x) { - const scalar_t kAlpha = scalar_t(M_SQRT1_2); - return x * scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); - }, - [&](Vec x_vec) { - return x_vec * kPointFiveVec * - (kOneVec + (x_vec * kAlphaVec).erf()); - }, - grain_size); - }); + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5)); + const Vec kKappaVec(scalar_t(0.044715)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t x) { + const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const scalar_t kKappa = 0.044715; + auto x_cube = x * x * x; + auto inner = kBeta * (x + kKappa * x_cube); + return scalar_t(0.5) * x * (scalar_t(1) + std::tanh(inner)); + }, + [&](Vec x_vec) { + auto x_cube = x_vec * x_vec * x_vec; + auto inner_vec = kBetaVec * (x_vec + kKappaVec * x_cube); + return kPointFiveVec * x_vec * (kOneVec + inner_vec.tanh()); + }, + grain_size); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kAlphaVec(scalar_t(M_SQRT1_2)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t x) { + const scalar_t kAlpha = scalar_t(M_SQRT1_2); + return x * scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); + }, + [&](Vec x_vec) { + return x_vec * kPointFiveVec * + (kOneVec + (x_vec * kAlphaVec).erf()); + }, + grain_size); + }); + } } -void GeluBackwardKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { - using Vec = vec::Vectorized; - const Vec kAlphaVec(scalar_t(M_SQRT1_2)); - const Vec kBetaVec(scalar_t(M_2_SQRTPI * M_SQRT1_2 * 0.5)); - const Vec kOneVec(scalar_t(1)); - const Vec kPointFiveVec(scalar_t(0.5)); - const Vec kMinusPointFiveVec(scalar_t(-0.5)); - cpu_kernel_vec( - it, - [](scalar_t dy, scalar_t x) { - const scalar_t kAlpha = scalar_t(M_SQRT1_2); - const scalar_t kBeta = M_2_SQRTPI * M_SQRT1_2 * scalar_t(0.5); - const scalar_t cdf = - scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); - const scalar_t pdf = kBeta * std::exp(x * x * scalar_t(-0.5)); - return dy * (cdf + x * pdf); - }, - [&](Vec dy_vec, Vec x_vec) { - const Vec cdf_vec = - kPointFiveVec * (kOneVec + (x_vec * kAlphaVec).erf()); - const Vec pdf_vec = - kBetaVec * (x_vec * x_vec * kMinusPointFiveVec).exp(); - return dy_vec * (cdf_vec + x_vec * pdf_vec); - }); - }); +void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5)); + const Vec kKappaVec(scalar_t(0.044715)); + const Vec kOneVec(scalar_t(1)); + const Vec 
kThreeVec(scalar_t(3)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t dy, scalar_t x) { + const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const scalar_t kKappa = 0.044715; + auto x_sq = x * x; + auto x_cube = x_sq * x; + auto inner = kBeta * (x + kKappa * x_cube); + auto tanh_inner = std::tanh(inner); + + auto left = scalar_t(0.5) * x; + auto right = scalar_t(1) + tanh_inner; + + auto left_derivative = scalar_t(0.5) * right; + + auto tanh_derivative = scalar_t(1) - tanh_inner * tanh_inner; + auto inner_derivative = + kBeta * (scalar_t(1) + scalar_t(3) * kKappa * x_sq); + auto right_derivative = left * tanh_derivative * inner_derivative; + + return dy * (left_derivative + right_derivative); + }, + [&](Vec dy_vec, Vec x_vec) { + auto x_sq = x_vec * x_vec; + auto x_cube = x_vec * x_vec * x_vec; + auto inner_vec = + kBetaVec * (x_vec + kKappaVec * x_cube); + auto tanh_inner_vec = inner_vec.tanh(); + + auto left_vec = kPointFiveVec * x_vec; + auto right_vec = kOneVec + tanh_inner_vec; + + auto left_derivative_vec = kPointFiveVec * right_vec; + + auto tanh_derivative_vec = + kOneVec - tanh_inner_vec * tanh_inner_vec; + auto inner_derivative_vec = + kBetaVec * (kOneVec + kThreeVec * kKappaVec * x_sq); + auto right_derivative_vec = + left_vec * tanh_derivative_vec * inner_derivative_vec; + + return dy_vec * (left_derivative_vec + right_derivative_vec); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kAlphaVec(scalar_t(M_SQRT1_2)); + const Vec kBetaVec(scalar_t(M_2_SQRTPI * M_SQRT1_2 * 0.5)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + const Vec kMinusPointFiveVec(scalar_t(-0.5)); + cpu_kernel_vec( + it, + [](scalar_t dy, scalar_t x) { + const scalar_t kAlpha = scalar_t(M_SQRT1_2); + const scalar_t kBeta = M_2_SQRTPI * M_SQRT1_2 * scalar_t(0.5); + const scalar_t cdf = + scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); + const scalar_t pdf = kBeta * std::exp(x * x * scalar_t(-0.5)); + return dy * (cdf + x * pdf); + }, + [&](Vec dy_vec, Vec x_vec) { + const Vec cdf_vec = + kPointFiveVec * (kOneVec + (x_vec * kAlphaVec).erf()); + const Vec pdf_vec = + kBetaVec * (x_vec * x_vec * kMinusPointFiveVec).exp(); + return dy_vec * (cdf_vec + x_vec * pdf_vec); + }); + }); + } } void hardsigmoid_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_cpu", [&] { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float six(6.0f); + using Vec = vec::Vectorized; + const Vec kZeroVec(zero); + const Vec kThreeVec(three); + const Vec kSixVec(six); + cpu_kernel_vec( + iter, + [&](BFloat16 self_val) -> BFloat16 { + return std::min(std::max(float(self_val) + three, zero), six) / six; + }, + [&](vec::Vectorized self_val) -> vec::Vectorized { + Vectorized self_val0, self_val1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + self_val0 = minimum( + maximum(self_val0 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + self_val1 = minimum( + maximum(self_val1 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + return convert_float_bfloat16(self_val0, self_val1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_cpu", [&] { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t six(6.0f); @@ -257,10 +502,37 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { ) / kSixVec; }); }); + } } void 
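The tanh-GELU backward above splits the product rule into a "left" and "right" derivative; with t = tanh(inner) it evaluates dy * (0.5*(1 + t) + 0.5*x*(1 - t^2)*kBeta*(1 + 3*kKappa*x^2)). A compact scalar restatement (illustrative only):

#include <cmath>

double gelu_tanh_grad_ref(double dy, double x) {
  const double kBeta  = M_SQRT2 * M_2_SQRTPI * 0.5;
  const double kKappa = 0.044715;
  const double inner  = kBeta * (x + kKappa * x * x * x);
  const double t      = std::tanh(inner);
  const double left_derivative  = 0.5 * (1.0 + t);                         // d/dx of 0.5*x times (1 + t)
  const double right_derivative = 0.5 * x * (1.0 - t * t)                  // d/dx of tanh(inner)
                                  * kBeta * (1.0 + 3.0 * kKappa * x * x);  // chain rule through inner
  return dy * (left_derivative + right_derivative);
}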
hardsigmoid_backward_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_backward", [&] { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float neg_three(-3.0f); + const float one_sixth(1.0f / 6.0f); + using Vec = Vectorized; + Vec kZeroVec(0.0f); + Vec kOneSixthVec(1.0f / 6.0f); + cpu_kernel_vec( + iter, + [=](BFloat16 grad_val, BFloat16 self_val) -> BFloat16 { + return (float(self_val) > neg_three && float(self_val) < three) + ? float(grad_val) * one_sixth + : zero; + }, + [=](Vectorized grad_val, Vectorized self_val) -> Vectorized { + Vec self_val0, self_val1, grad_val0, grad_val1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + std::tie(grad_val0, grad_val1) = convert_bfloat16_float(grad_val); + Vec gradNonZeroMask = (self_val0 > neg_three) & (self_val0 < three); + self_val0 = Vec::blendv(kZeroVec, grad_val0 * kOneSixthVec, gradNonZeroMask); + gradNonZeroMask = (self_val1 > neg_three) & (self_val1 < three); + self_val1 = Vec::blendv(kZeroVec, grad_val1 * kOneSixthVec, gradNonZeroMask); + return convert_float_bfloat16(self_val0, self_val1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardsigmoid_backward", [&] { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t neg_three(-3.0f); @@ -280,10 +552,11 @@ void hardsigmoid_backward_kernel(TensorIteratorBase& iter) { return Vec::blendv(kZeroVec, grad_val * kOneSixthVec, gradNonZeroMask); }); }); + } } void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardshrink_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "hardshrink_cpu", [&] { auto lambd_val = lambd.to(); cpu_kernel_vec( iter, @@ -298,16 +571,43 @@ void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { } void softshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softshrink_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + auto lambd_val = lambd.to(); + auto lambdVec = Vectorized(lambd_val); + cpu_kernel_vec( + iter, + [=](BFloat16 a) -> BFloat16 { + return float(a) > lambd_val ? a - lambd_val : (float(a) < -lambd_val ? a + lambd_val : float(0)); + }, + [=](Vectorized self_val) { + Vectorized self_val0, self_val1; + Vectorized self_val_t0, self_val_t1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + self_val_t0 = convert_float_bfloat16((self_val0 > lambdVec) & (self_val0 - lambdVec), (self_val1 > lambdVec) & (self_val1 - lambdVec)); + self_val_t1 = convert_float_bfloat16((self_val0 < -lambd_val) & (self_val0 + lambdVec), (self_val1 < -lambd_val) & (self_val1 + lambdVec)); + return (self_val_t0 | self_val_t1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softshrink_cpu", [&]() { auto lambd_val = lambd.to(); - cpu_kernel(iter, [=](scalar_t a) -> scalar_t { - return a > lambd_val ? a - lambd_val : (a < -lambd_val ? a + lambd_val : scalar_t(0)); - }); + auto lambdVec = Vectorized(lambd_val); + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { + return a > lambd_val ? a - lambd_val : (a < -lambd_val ? 
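The hardsigmoid forward/backward pair handled in the BFloat16 branches above reduces to the following scalar reference (illustrative only); the hardshrink/softshrink kernels further down lift BFloat16 to float in the same way:

#include <algorithm>

float hardsigmoid_ref(float x) {
  return std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;   // clamp(x + 3, 0, 6) / 6
}

float hardsigmoid_grad_ref(float dy, float x) {
  return (x > -3.0f && x < 3.0f) ? dy * (1.0f / 6.0f) : 0.0f;
}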
a + lambd_val : scalar_t(0)); + }, + [=](Vectorized self_val) { + Vectorized self_val_t0, self_val_t1; + self_val_t0 = (self_val > lambdVec) & (self_val - lambdVec); + self_val_t1 = (self_val < -lambd_val) & (self_val + lambdVec); + return (self_val_t0 | self_val_t1); + }); }); + } } void shrink_backward_kernel(TensorIteratorBase& iter, const Scalar& lambd) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "shrink_backward_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "shrink_backward_cpu", [&] { auto lambd_val = lambd.to(); cpu_kernel_vec( iter, @@ -337,7 +637,35 @@ void hardtanh_backward_kernel(TensorIterator& iter, const Scalar& min, const Sca } void hardswish_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float six(6.0f); + using Vec = vec::Vectorized; + const Vec kZeroVec(zero); + const Vec kThreeVec(three); + const Vec kSixVec(six); + cpu_kernel_vec( + iter, + [&](BFloat16 x) -> BFloat16 { + return float(x) * std::min(std::max(float(x) + three, zero), six) / six; + }, + [&](vec::Vectorized x_vec) { + Vectorized x_vec0, x_vec1; + std::tie(x_vec0, x_vec1) = convert_bfloat16_float(x_vec); + x_vec0 = x_vec0 * minimum( + maximum(x_vec0 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + x_vec1 = x_vec1 * minimum( + maximum(x_vec1 + kThreeVec, kZeroVec), + kSixVec + ) / kSixVec; + return convert_float_bfloat16(x_vec0, x_vec1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_cpu", [&]() { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t six(6.0f); @@ -358,10 +686,58 @@ void hardswish_kernel(TensorIterator& iter) { } ); }); + } } void hardswish_backward_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_backward_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + const float zero(0.0f); + const float three(3.0f); + const float neg_three(-3.0f); + const float one_half(0.5f); + using Vec = vec::Vectorized; + const Vec kZeroVec(zero); + const Vec kThreeVec(three); + const Vec kNegThreeVec(neg_three); + const Vec kOneHalfVec(one_half); + cpu_kernel_vec( + iter, + [&](BFloat16 grad_val, BFloat16 self_val) -> BFloat16 { + if (float(self_val) < neg_three) { + return zero; + } else if (float(self_val) <= three) { + return float(grad_val) * ((float(self_val) / three) + one_half); + } else { + return grad_val; + } + }, + [&](vec::Vectorized grad_val, vec::Vectorized self_val) { + Vectorized self_val0, self_val1, grad_val0, grad_val1; + std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val); + std::tie(grad_val0, grad_val1) = convert_bfloat16_float(grad_val); + self_val0 = Vec::blendv( + Vec::blendv( + grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec), + grad_val0, + self_val0 >= kThreeVec + ), + kZeroVec, + self_val0 < kNegThreeVec + ); + self_val1 = Vec::blendv( + Vec::blendv( + grad_val1 * ((self_val1 / kThreeVec) + kOneHalfVec), + grad_val1, + self_val1 >= kThreeVec + ), + kZeroVec, + self_val1 < kNegThreeVec + ); + return convert_float_bfloat16(self_val0, self_val1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardswish_backward_cpu", [&]() { const scalar_t zero(0.0f); const scalar_t three(3.0f); const scalar_t neg_three(-3.0f); @@ -395,6 +771,7 @@ void hardswish_backward_kernel(TensorIterator& iter) { } ); }); + } } static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { @@ -475,7 +852,28 @@ static 
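The blendv cascade in the hardswish forward/backward branches above implements this piecewise function; a scalar reference (illustrative only):

#include <algorithm>

float hardswish_ref(float x) {
  return x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;   // x * relu6(x + 3) / 6
}

float hardswish_grad_ref(float dy, float x) {
  if (x < -3.0f) return 0.0f;                     // flat region
  if (x <= 3.0f) return dy * (x / 3.0f + 0.5f);   // quadratic region of the forward
  return dy;                                      // identity region
}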
void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& n } void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + using Vec = Vectorized; + auto beta = beta_.to(); + auto threshold = threshold_.to(); + const Vec beta_vec(beta); + const Vec threshold_vec(threshold); + cpu_kernel_vec( + iter, + [beta, threshold](BFloat16 a) -> BFloat16 { + return (float(a) * beta) > threshold ? a + : static_cast((std::log1p(std::exp(float(a) * beta))) / beta); + }, + [beta_vec, threshold_vec](Vectorized a) -> Vectorized { + Vectorized a0, a1; + std::tie(a0, a1) = convert_bfloat16_float(a); + a0 = Vec::blendv((a0 * beta_vec).exp().log1p() / beta_vec, a0, (a0 * beta_vec) > threshold_vec); + a1 = Vec::blendv((a1 * beta_vec).exp().log1p() / beta_vec, a1, (a1 * beta_vec) > threshold_vec); + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_cpu", [&]() { using Vec = Vectorized; auto beta = beta_.to(); auto threshold = threshold_.to(); @@ -492,10 +890,36 @@ void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar } ); }); + } } void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar& threshold_) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_backward_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + using Vec = Vectorized; + auto beta = beta_.to(); + auto threshold = threshold_.to(); + const Vec beta_vec(beta); + const Vec threshold_vec(threshold); + const Vec one_vec(static_cast(1.0)); + cpu_kernel_vec( + iter, + [beta, threshold](BFloat16 a, BFloat16 b) -> BFloat16 { + float z = std::exp(float(b) * beta); + return (float(b) * beta) > threshold ? 
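The softplus branches above (forward here, backward continuing just below) implement log1p(exp(beta*x))/beta with a large-argument short-circuit; a scalar reference of both that matches the kernel's threshold logic (illustrative only):

#include <cmath>

float softplus_ref(float x, float beta, float threshold) {
  // Once beta*x exceeds the threshold, log1p(exp(beta*x))/beta == x to float precision.
  return (x * beta) > threshold ? x : std::log1p(std::exp(x * beta)) / beta;
}

float softplus_grad_ref(float dy, float x, float beta, float threshold) {
  const float z = std::exp(x * beta);
  return (x * beta) > threshold ? dy : dy * z / (z + 1.0f);
}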
a : static_cast(float(a) * z / (z + float(1.))); + }, + [beta_vec, one_vec, threshold_vec](Vectorized a, Vectorized b) -> Vectorized { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + Vec z = (b0 * beta_vec).exp(); + a0 = Vec::blendv(a0 * z / (z + one_vec), a0, (b0 * beta_vec) > threshold_vec); + z = (b1 * beta_vec).exp(); + a1 = Vec::blendv(a1 * z / (z + one_vec), a1, (b1 * beta_vec) > threshold_vec); + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "softplus_backward_cpu", [&]() { using Vec = Vectorized; auto beta = beta_.to(); auto threshold = threshold_.to(); @@ -514,10 +938,29 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con } ); }); + } } void glu_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&] { + if (iter.dtype() == kBFloat16) { + const float float_one_val(1); + const Vectorized float_one_vec(float_one_val); + cpu_kernel_vec( + iter, + [float_one_val](BFloat16 a, BFloat16 b) -> BFloat16 { + return float(a) * (float_one_val / (float_one_val + std::exp(- float(b)))); + }, + [float_one_vec](Vectorized a, Vectorized b) -> Vectorized { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + a0 = a0 * (float_one_vec / (float_one_vec + b0.neg().exp())); + a1 = a1 * (float_one_vec / (float_one_vec + b1.neg().exp())); + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_cpu", [&] { using Vec = Vectorized; const scalar_t one_val(1); const Vec one_vec(one_val); @@ -531,25 +974,65 @@ void glu_kernel(TensorIteratorBase& iter) { } ); }); + } } -void glu_backward_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_backward_cpu", [&] { +void glu_jvp_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_jvp_cpu", [&] { using Vec = Vectorized; - const scalar_t one_val(1); - const Vec one_vec(one_val); + const scalar_t one(1); + const Vec ones(one); cpu_kernel_vec( iter, - [one_val](scalar_t a, scalar_t b, scalar_t c) -> scalar_t { - return (one_val - a) * a * b * c; + [one](scalar_t res, scalar_t b, scalar_t da, scalar_t db) -> scalar_t { + const auto sig_b = one / (one + std::exp(-b)); + return da * sig_b + res * (db - sig_b * db); }, - [one_vec](Vec a, Vec b, Vec c) -> Vec { - return (one_vec - a) * a * b * c; + [ones](Vec res, Vec b, Vec da, Vec db) -> Vec { + const auto sig_b = ones / (ones + b.neg().exp()); + return da * sig_b + res * (db - sig_b * db); } ); }); } +void glu_backward_kernel(TensorIterator& iter) { + if (iter.dtype() == kBFloat16) { + const float float_one_val(1); + const Vectorized float_one_vec(float_one_val); + cpu_kernel_vec( + iter, + [float_one_val](BFloat16 a, BFloat16 b, BFloat16 c) -> BFloat16 { + return (float_one_val - float(a)) * float(a) * float(b) * float(c); + }, + [float_one_vec](Vectorized a, Vectorized b, Vectorized c) -> Vectorized { + Vectorized a0, a1, b0, b1, c0, c1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + std::tie(c0, c1) = convert_bfloat16_float(c); + a0 = (float_one_vec - a0) * a0 * b0 * c0; + a1 = (float_one_vec - a1) * a1 * b1 * c1; + return convert_float_bfloat16(a0, a1); + } + ); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "glu_backward_cpu", [&] { + using Vec = Vectorized; + const scalar_t 
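The GLU kernels above all build on the same sigmoid gate; a scalar reference for the forward and for the newly added glu_jvp kernel (argument names mirror the lambdas in the patch; illustrative only):

#include <cmath>

float glu_ref(float a, float b) {
  return a * (1.0f / (1.0f + std::exp(-b)));   // a * sigmoid(b)
}

// Forward-mode derivative used by glu_jvp_kernel: res is the primal output a*sigmoid(b),
// da and db are the tangents of the two input halves.
float glu_jvp_ref(float res, float b, float da, float db) {
  const float sig_b = 1.0f / (1.0f + std::exp(-b));
  return da * sig_b + res * (db - sig_b * db);
}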
one_val(1); + const Vec one_vec(one_val); + cpu_kernel_vec( + iter, + [one_val](scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return (one_val - a) * a * b * c; + }, + [one_vec](Vec a, Vec b, Vec c) -> Vec { + return (one_vec - a) * a * b * c; + } + ); + }); + } +} + void silu_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( kBFloat16, iter.dtype(), "silu_cpu", [&]() { @@ -618,6 +1101,65 @@ void mish_backward_kernel(TensorIterator& iter) { }); } +void prelu_cpu_kernel(TensorIterator& iter) { + if (iter.common_dtype() == kBFloat16) { + auto zero_vec = Vectorized((float)(0)); + auto one_vec = Vectorized((float)(1)); + cpu_kernel_vec( + iter, + [=](BFloat16 input, BFloat16 weight) -> BFloat16 { + return (float(input) > float(0)) ? float(input) : float(weight) * float(input); + }, + [=](Vectorized input, Vectorized weight) -> Vectorized { + Vectorized input0, input1; + Vectorized weight0, weight1; + std::tie(input0, input1) = convert_bfloat16_float(input); + std::tie(weight0, weight1) = convert_bfloat16_float(weight); + + auto res0 = input0 * (Vectorized::blendv(weight0, one_vec, input0 > zero_vec)); + auto res1 = input1 * (Vectorized::blendv(weight1, one_vec, input1 > zero_vec)); + return convert_float_bfloat16(res0, res1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "prelu_cpu", [&]() { + using Vec = Vectorized; + auto zero_vec = Vec((scalar_t)(0)); + auto one_vec = Vec((scalar_t)(1)); + cpu_kernel_vec( + iter, + [=](scalar_t input, scalar_t weight) { + return (input > scalar_t(0)) ? input : weight * input; + }, + [=](Vec input, Vec weight) { + auto r = Vec::blendv(weight, one_vec, input > zero_vec); + return input * r; + }); + }); + } +} + +void prelu_backward_cpu_kernel(TensorIterator& iter) { + if (iter.common_dtype() == kBFloat16) { + cpu_kernel_multiple_outputs( + iter, + [=](BFloat16 input, BFloat16 grad_out, BFloat16 weight) -> std::tuple { + float input_grad = (float(input) > float(0)) ? float(grad_out) : float(weight) * float(grad_out); + float weight_grad_collector = (float(input) > float(0)) ? float(0) : float(input) * float(grad_out); + return std::tuple(input_grad, weight_grad_collector); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "prelu_backward_cpu", [&]() { + cpu_kernel_multiple_outputs( + iter, + [=](scalar_t input, scalar_t grad_out, scalar_t weight) -> std::tuple { + scalar_t input_grad = (input > scalar_t(0)) ? grad_out : weight * grad_out; + scalar_t weight_grad_collector = (input > scalar_t(0)) ? 
scalar_t(0) : input * grad_out; + return std::tuple(input_grad, weight_grad_collector); + }); + }); + } +} + } // namespace REGISTER_DISPATCH(log_sigmoid_cpu_stub, &log_sigmoid_cpu_kernel); @@ -641,10 +1183,13 @@ REGISTER_DISPATCH(softplus_stub, &softplus_kernel); REGISTER_DISPATCH(softplus_backward_stub, &softplus_backward_kernel); REGISTER_DISPATCH(glu_stub, &glu_kernel); REGISTER_DISPATCH(glu_backward_stub, &glu_backward_kernel); +REGISTER_DISPATCH(glu_jvp_stub, &glu_jvp_kernel); REGISTER_DISPATCH(silu_stub, &silu_kernel); REGISTER_DISPATCH(silu_backward_stub, &silu_backward_kernel); REGISTER_DISPATCH(mish_stub, &mish_kernel); REGISTER_DISPATCH(mish_backward_stub, &mish_backward_kernel); +REGISTER_DISPATCH(prelu_cpu_stub, &prelu_cpu_kernel); +REGISTER_DISPATCH(prelu_backward_cpu_stub, &prelu_backward_cpu_kernel); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp index dd131a1e2a89..b121e2390258 100644 --- a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include diff --git a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp index 1de76289bf32..3f4038685da4 100644 --- a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index e0b8551a0a55..df51715e1632 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -1,5 +1,5 @@ -#include - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index f2410947de16..22c82237637f 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -21,27 +21,6 @@ namespace { using namespace vec; -// Note: Undefined behavior when performing addition is intentionally -// ignored. 
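To summarize the PReLU CPU kernels registered above in Activation.cpp: the per-element forward and backward reduce to the following scalar reference (illustrative only; the backward returns both the input gradient and the per-element weight-gradient contribution that is reduced later):

#include <tuple>

float prelu_ref(float x, float w) {
  return x > 0.0f ? x : w * x;
}

std::tuple<float, float> prelu_grad_ref(float x, float dy, float w) {
  const float input_grad            = x > 0.0f ? dy : w * dy;
  const float weight_grad_collector = x > 0.0f ? 0.0f : x * dy;
  return std::make_tuple(input_grad, weight_grad_collector);
}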
-void add_kernel(TensorIteratorBase& iter, const Scalar& alpha_scalar) { - if (iter.dtype() == ScalarType::Bool) { - using scalar_t = bool; - auto alpha = alpha_scalar.to(); - cpu_kernel(iter, - [=](scalar_t a, scalar_t b) __ubsan_ignore_undefined__ -> scalar_t { return a + alpha * b; }); - } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "add_cpu/sub_cpu", [&]() { - auto alpha = alpha_scalar.to(); - auto alpha_vec = Vectorized(alpha); - cpu_kernel_vec(iter, - [=](scalar_t a, scalar_t b) __ubsan_ignore_undefined__ -> scalar_t { return a + alpha * b; }, - [=](Vectorized a, Vectorized b) __ubsan_ignore_undefined__ { - return vec::fmadd(b, alpha_vec, a); - }); - }); - } -} - void add_clamp_kernel(TensorIterator& iter, const Scalar& alpha_scalar, const Scalar& min_val, const Scalar& max_val) { AT_DISPATCH_ALL_TYPES(iter.dtype(), "add_clamp_cpu", [&]() { auto alpha = alpha_scalar.to(); @@ -64,7 +43,7 @@ void add_clamp_kernel(TensorIterator& iter, const Scalar& alpha_scalar, const Sc } void atan2_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "atan2_cpu", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "atan2_cpu", [&]() { cpu_kernel_vec(iter, [=](scalar_t a, scalar_t b) -> scalar_t { return std::atan2(a, b); }, @@ -74,15 +53,17 @@ void atan2_kernel(TensorIteratorBase& iter) { }); } -// Note: Undefined behavior when performing subtraction is intentionally -// ignored. -void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha_scalar) __ubsan_ignore_undefined__ { - add_kernel(iter, -alpha_scalar); -} - void mul_kernel(TensorIteratorBase& iter) { if (iter.dtype() == ScalarType::Bool) { cpu_kernel(iter, [=](bool a, bool b) -> bool { return a && b; }); + } else if (iter.dtype() == kComplexHalf) { + cpu_kernel( + iter, + [=](c10::complex a, + c10::complex b) -> c10::complex { + using comp_t = c10::complex; + return comp_t{a} * comp_t{b}; + }); } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "mul_cpu", [&]() { cpu_kernel_vec(iter, @@ -331,26 +312,12 @@ void bitwise_xor_kernel(TensorIteratorBase& iter) { } void lshift_kernel(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || iter.dtype() == ScalarType::Double) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "lshift_cpu", [&]() { - auto base_vec = Vectorized((scalar_t)(2)); - cpu_kernel_vec( - iter, - [=](scalar_t a, scalar_t b) -> scalar_t { - return a * std::pow((scalar_t)(2), b); - }, - [=](Vectorized a, Vectorized b) { - return a * base_vec.pow(b); - }); - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cpu", [&]() { - cpu_kernel(iter, - [](scalar_t a, scalar_t b) -> scalar_t { - return static_cast>(a) << b; - }); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cpu", [&]() { + cpu_kernel(iter, + [](scalar_t a, scalar_t b) -> scalar_t { + return static_cast>(a) << b; }); - } + }); } void logical_and_kernel(TensorIterator& iter) { @@ -411,26 +378,12 @@ void logical_xor_kernel(TensorIterator& iter) { } void rshift_kernel(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || iter.dtype() == ScalarType::Double) { - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "rshift_cpu", [&]() { - auto base_vec = Vectorized((scalar_t)(2)); - cpu_kernel_vec( - iter, - [=](scalar_t a, scalar_t b) -> scalar_t { - return a / std::pow((scalar_t)(2), b); - }, - [=](Vectorized a, Vectorized b) { - return a / base_vec.pow(b); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cpu", [&]() { + cpu_kernel(iter, + 
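Two behavioural notes on the BinaryOpsKernel changes above: mul_kernel gains a ComplexHalf path that widens to c10::complex<float> for the multiply, and lshift/rshift drop their floating-point emulation so they now dispatch only over integral types. A sketch of the widening multiply, mirroring the lambda in the hunk (the standalone helper name is illustrative):

#include <c10/util/Half.h>
#include <c10/util/complex.h>

c10::complex<c10::Half> mul_complex_half(c10::complex<c10::Half> a,
                                         c10::complex<c10::Half> b) {
  using comp_t = c10::complex<float>;
  return comp_t{a} * comp_t{b};   // compute in complex<float>, narrow on return
}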
[](scalar_t a, scalar_t b) -> scalar_t { + return a >> b; }); - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cpu", [&]() { - cpu_kernel(iter, - [](scalar_t a, scalar_t b) -> scalar_t { - return a >> b; - }); - }); - } + }); } void lt_kernel(TensorIteratorBase& iter) { @@ -528,18 +481,18 @@ void ge_kernel(TensorIteratorBase& iter) { void eq_kernel(TensorIteratorBase& iter) { // See Note [special-case bool outputs] if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kBool, kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a == b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kComplexHalf, kBFloat16, kHalf, iter.common_dtype(), "eq_cpu", [&]() { cpu_kernel_vec( iter, [](scalar_t a, scalar_t b) -> scalar_t { - return a == b; + return static_cast(a == b); }, [](Vectorized a, Vectorized b) -> Vectorized { return a.eq(b); @@ -652,8 +605,33 @@ void fmin_kernel(TensorIteratorBase& iter) { } void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { - AT_DISPATCH_FLOATING_TYPES_AND2( - kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() { + if (iter.dtype() == kBFloat16) { + const float beta_val(beta); + const Vectorized beta_val_vec(beta_val); + const Vectorized point_five_vec(static_cast(0.5)); + cpu_kernel_vec( + iter, + [&beta_val](BFloat16 a, BFloat16 b) -> BFloat16 { + auto z = std::abs(float(a) - float(b)); + return z < beta_val + ? static_cast(0.5) * z * z / beta_val + : z - static_cast(0.5) * beta_val; + }, + [&beta_val_vec, &point_five_vec](Vectorized a, Vectorized b) { + Vectorized a0, a1, b0, b1; + std::tie(a0, a1) = convert_bfloat16_float(a); + std::tie(b0, b1) = convert_bfloat16_float(b); + auto z = (a0 - b0).abs(); + a0 = Vectorized::blendv( + point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); + z = (a1 - b1).abs(); + a1 = Vectorized::blendv( + point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); + return convert_float_bfloat16(a0, a1); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + kHalf, iter.dtype(), "smooth_l1_cpu", [&]() { using Vec = Vectorized; const scalar_t beta_val(beta); const Vec beta_val_vec(beta_val); @@ -672,6 +650,7 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); }); }); + } } void huber_kernel(TensorIterator& iter, double delta) { @@ -836,7 +815,7 @@ void tanh_backward_kernel(TensorIteratorBase& iter) { } } -void mse_kernel(TensorIterator& iter) { +void mse_kernel(TensorIteratorBase& iter) { if (iter.dtype() == ScalarType::Half) { TORCH_WARN_ONCE("Applying the CPU mse kernel on half-type tensors. 
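The smooth-L1 kernel above (and its new BFloat16 fast path) computes the usual beta-smoothed loss; a scalar reference matching the kernel's lambdas (illustrative only):

#include <cmath>

float smooth_l1_ref(float a, float b, float beta) {
  const float z = std::fabs(a - b);
  return z < beta ? 0.5f * z * z / beta   // quadratic region near zero
                  : z - 0.5f * beta;      // linear region
}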
" "This may be slower than using float or double-type tensors."); @@ -864,7 +843,7 @@ void fmod_kernel(TensorIteratorBase& iter) { }); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.common_dtype(), "fmod_cpu", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "fmod_cpu", [&]() { cpu_kernel_vec( iter, [](scalar_t x, scalar_t d) -> scalar_t { @@ -1133,9 +1112,7 @@ void zeta_kernel(TensorIteratorBase& iter) { } // namespace -REGISTER_DISPATCH(add_stub, &add_kernel); REGISTER_DISPATCH(add_clamp_stub, &add_clamp_kernel); -REGISTER_DISPATCH(sub_stub, &sub_kernel); REGISTER_DISPATCH(mul_stub, &mul_kernel); REGISTER_DISPATCH(div_true_stub, &div_true_kernel); REGISTER_DISPATCH(div_trunc_stub, &div_trunc_kernel); diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index c5c938818d0d..7b60e9a45cba 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include @@ -190,19 +191,28 @@ void cpublas_gemm_impl( } void cpublas_axpy_impl(at::ScalarType type, int64_t n, const Scalar& _a, const void *_x, int64_t incx, void *_y, int64_t incy){ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(type, "cpublas_axpy_impl", - [&] { - auto a = _a.to(); - auto x = static_cast(_x); - auto y = static_cast(_y); + if (type == at::kBool) { + auto a = _a.to(); + auto x = static_cast(_x); + auto y = static_cast(_y); int64_t i; for(i = 0; i < n; i++) - y[i*incy] += a*x[i*incx]; - }); + y[i*incy] |= a & x[i*incx]; + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::kHalf, at::kBFloat16, type, "cpublas_axpy_impl", + [&] { + auto a = _a.to(); + auto x = static_cast(_x); + auto y = static_cast(_y); + int64_t i; + for(i = 0; i < n; i++) + y[i*incy] += a*x[i*incx]; + }); + } } void cpublas_copy_impl(at::ScalarType type, int64_t n, const void *_x, int64_t incx, void *_y, int64_t incy){ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(type, "cpublas_copy_impl", + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::kHalf, at::kBFloat16, at::kBool, type, "cpublas_copy_impl", [&] { auto x = static_cast(_x); auto y = static_cast(_y); diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index f9ddc5ef329c..c4fa1bb05405 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include @@ -20,15 +21,15 @@ struct InputMeta { }; template -void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { +void cat_serial_kernel_impl(const Tensor& result, const MaterializedITensorListRef& tensors, int64_t dim) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( dim >= 0 && dim < result.dim(), "dim out of range in cat_serial_kernel_impl"); int64_t outer = result.numel() / (result.sizes()[dim] * result.strides()[dim]); scalar_t* result_data = result.data_ptr(); - int64_t ninputs = tensors.size(); + int64_t ninputs = static_cast(tensors.size()); std::vector inputs; inputs.reserve(ninputs); - for (auto const &tensor : tensors) { + for (const Tensor& tensor : tensors) { inputs.emplace_back(tensor, dim, result.strides()[dim]); } @@ -54,7 +55,7 @@ void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { } } -void cat_serial_kernel(Tensor& result, TensorList tensors, int64_t dim) { +void cat_serial_kernel(const Tensor& result, const MaterializedITensorListRef& tensors, int64_t dim) { 
AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, result.scalar_type(), "cat_serial_kernel", [&]() { cat_serial_kernel_impl(result, tensors, dim); }); diff --git a/aten/src/ATen/native/cpu/CatKernel.h b/aten/src/ATen/native/cpu/CatKernel.h index 6b9d40c6d630..aedb4aec4f57 100644 --- a/aten/src/ATen/native/cpu/CatKernel.h +++ b/aten/src/ATen/native/cpu/CatKernel.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include +#include namespace at { namespace native { -using cat_serial_fn = void(*)(Tensor &, TensorList, int64_t); +using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t); DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub); }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/ComplexKernel.cpp b/aten/src/ATen/native/cpu/ComplexKernel.cpp index 6b78645db1d5..99dc6134537e 100644 --- a/aten/src/ATen/native/cpu/ComplexKernel.cpp +++ b/aten/src/ATen/native/cpu/ComplexKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include @@ -8,7 +9,7 @@ namespace native { namespace { void complex_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(), "complex_cpu", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(), "complex_cpu", [&]() { cpu_kernel(iter, [=](scalar_t a, scalar_t b) -> c10::complex { return c10::complex(a, b); }); diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 0b9992890c67..40a0c20b5ca8 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -72,7 +72,7 @@ void copy_same_dtype(TensorIteratorBase &iter, bool requires_conj, bool requires } } -void copy_kernel(TensorIterator& iter, bool non_blocking) { +void copy_kernel(TensorIterator& iter, bool /*non_blocking*/) { ScalarType dtype = iter.dtype(0); const bool requires_conj = ( isComplexType(dtype) && (iter.tensor_base(0).is_conj() != iter.tensor_base(1).is_conj())); @@ -81,9 +81,9 @@ void copy_kernel(TensorIterator& iter, bool non_blocking) { if (dtype == iter.dtype(1)) { copy_same_dtype(iter, requires_conj, requires_neg); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, dtype, "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(ScalarType::ComplexHalf, ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, dtype, "copy_", [&] { using dest_t = scalar_t; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.dtype(1), "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(ScalarType::ComplexHalf, ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.dtype(1), "copy_", [&] { // Note (@zasdfgbnm): // // The code below can not be simplified as diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index 99a4402d51ee..1511d17fce78 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -5,8 +6,10 @@ #include #include +#include #include #include +#include #include #include namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index 9ab2e860d895..9fb24db673d5 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -1,8 +1,16 @@ +#define 
TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #ifdef __ARM_NEON__ #include #endif diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.h b/aten/src/ATen/native/cpu/DepthwiseConvKernel.h index 7ef848032af3..56956b443386 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.h +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.h @@ -1,6 +1,5 @@ #pragma once -#include #include /* @@ -8,6 +7,8 @@ */ namespace at { +class Tensor; + namespace native { using convolution_depthwise3x3_winograd_fn = diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 2058ca482ea0..98404005c551 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -1,11 +1,12 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include #include +#include #include #include +#include #include #include @@ -91,7 +92,7 @@ struct Dist { struct zdist_calc { static inline data_t map(const data_t& diff, const data_t& p) { return min(ceil(abs(diff)), 1); } static inline data_t red(const data_t& agg, const data_t& up) { return agg + up; } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + static inline scalar_t finish(const scalar_t agg, const scalar_t /*p*/) { return agg; } }; // One norm @@ -99,8 +100,8 @@ struct Dist { struct odist_calc { static inline data_t map(const data_t& diff, const data_t& p) { return diff; } static inline data_t red(const data_t& agg, const data_t& up) { return agg + up; } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } - static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff); } + static inline scalar_t finish(const scalar_t agg, const scalar_t /*p*/) { return agg; } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t /*dist*/, const Vec& /*p*/) { return Vec(grad) * sign(diff); } }; // Special general pnorm derivative if p is less than two diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index f6803e5a3994..4363cc9d62e3 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -1,13 +1,19 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include @@ -25,22 +31,22 @@ static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, templates::cpu::cauchy_kernel(iter, median, sigma, generator); } -void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional gen) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel_default(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel_default(const TensorBase &self, double p, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p, generator); } #if !AT_MKL_ENABLED() -void 
bernoulli_scalar_kernel(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { bernoulli_scalar_kernel_default(self, p, gen); } #else -void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); int64_t seed; @@ -87,7 +93,7 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional ge // copy_ if using buffer and non contiguous if (!contig) { - self.copy_(tmp_int_tensor); + OptionalTensorRef(self)->copy_(tmp_int_tensor); } }); } else { @@ -117,7 +123,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optio templates::cpu::uniform_kernel(iter, from, to, generator); } -void normal_kernel(Tensor& self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::normal_kernel(self, mean, std, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 6c017e15c461..37c799803eaf 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include #include #include #include @@ -105,7 +106,7 @@ static void normal_fill_16_AVX2(float *data, } template -void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG generator) { +void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) { float *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); @@ -148,7 +149,7 @@ static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t s } template -void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG generator) { +void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { scalar_t *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); @@ -172,7 +173,7 @@ void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG gene } template -void normal_kernel(Tensor& self, double mean, double std, RNG generator) { +void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) { auto size = self.numel(); if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) { #ifdef CPU_CAPABILITY_AVX2 @@ -308,25 +309,25 @@ struct ExponentialKernel { // ================================================== Bernoulli ======================================================= template -void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); using self_t = scalar_t; auto p_cpu = p_.to(kCPU); - c10::MaybeOwned p = expand_inplace(self, p_cpu); + auto p = expand_inplace(self, p_cpu); auto iter = 
TensorIteratorConfig() .add_output(self) .add_input(*p) .check_all_same_dtype(false) .build(); - if (p_.scalar_type() == kDouble) { + if (p->scalar_type() == kDouble) { cpu_serial_kernel(iter, [&](const double p_val) -> self_t { at::bernoulli_distribution bernoulli(p_val); return static_cast(bernoulli(generator)); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] { using p_t = scalar_t; cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t { at::bernoulli_distribution bernoulli(p_val); @@ -338,7 +339,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { } template -void bernoulli_kernel(Tensor& self, double p, RNG generator) { +void bernoulli_kernel(const TensorBase &self, double p, RNG generator) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); @@ -352,10 +353,10 @@ void bernoulli_kernel(Tensor& self, double p, RNG generator) { template struct BernoulliKernel { - void operator()(Tensor& self, double p, c10::optional gen) { + void operator()(const TensorBase &self, double p, c10::optional gen) { bernoulli_kernel(self, p, check_generator(gen)); } - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cpu/FillKernel.cpp b/aten/src/ATen/native/cpu/FillKernel.cpp index 3685f2e179ce..c023013052d9 100644 --- a/aten/src/ATen/native/cpu/FillKernel.cpp +++ b/aten/src/ATen/native/cpu/FillKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include @@ -6,6 +7,7 @@ #include #include +#include namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp b/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp index 2e0cc33c3f51..0f4d4b607717 100644 --- a/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp +++ b/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp @@ -1,6 +1,8 @@ +#define TORCH_ASSERT_NO_OPERATORS #include -#include +#include +#include #include #if (defined(_WIN32) || defined(_WIN64)) diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 4e89a499d233..47b20b2ca4c1 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -1,11 +1,12 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include -#include +#include +#include +#include +#include +#include +#include #include #include @@ -664,6 +665,7 @@ struct ApplyGridSample* gInp_slice_ptr, TensorAccessor& gGrid_slice, const TensorAccessor& gOut_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& /*inp_slice*/, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { if (input_requires_grad) { @@ -1146,13 +1148,12 @@ static inline void grid_sample_2d_grid_slice_iterator( // and backward. // See NOTE [ Grid Sample CPU Kernels ] for details. 
-Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, - int64_t padding_mode, bool align_corners) { +void grid_sampler_2d_cpu_kernel_impl( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { auto N = input.size(0); auto H = grid.size(1); auto W = grid.size(2); - auto output = at::empty({N, input.size(1), H, W}, input.options()); auto spatial_size = H * W; auto grain_size = spatial_size == 0 ? (N + 1) : at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/); @@ -1207,18 +1208,18 @@ Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, }); #undef HANDLE_CASE #undef HANDLE_INTERP - - return output; } -std::tuple -grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, - const Tensor& input, - const Tensor& grid, - int64_t interpolation_mode, - int64_t padding_mode, - bool align_corners, - std::array output_mask) { +void grid_sampler_2d_backward_cpu_kernel_impl( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output_, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask) { // grad_output should be contiguous most of time. Ensuring that it is // contiguous can greatly simplify this code. auto grad_output = grad_output_.contiguous(); @@ -1228,11 +1229,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, // is always computed.) auto input_requires_grad = output_mask[0]; - Tensor grad_input; - if (input_requires_grad) { - grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); auto N = input.size(0); auto spatial_size = grid.size(1) * grid.size(2); auto grain_size = spatial_size == 0 ? 
(N + 1) @@ -1315,8 +1311,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, }); #undef HANDLE_CASE #undef HANDLE_INTERP - - return std::make_tuple(grad_input, grad_grid); } } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.h b/aten/src/ATen/native/cpu/GridSamplerKernel.h index aa4a24736dac..b1830fcd3911 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.h +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.h @@ -1,17 +1,33 @@ #pragma once -#include -#include -#include #include -#include -#include +#include +#include + +namespace at { +class TensorBase; +} namespace at { namespace native { -using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t, bool); -using backward_2d_fn = std::tuple(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool, std::array); +using forward_2d_fn = void (*) ( + const TensorBase &output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners); +using backward_2d_fn = void (*) ( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask); DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 583f3679c0aa..6d6b4a749fb2 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -1,16 +1,23 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include -#include #include -#include #include -#include namespace at { namespace native { @@ -219,7 +226,7 @@ void histogramdd_out_cpu_template(const Tensor& self, const c10::optional( hist, bin_edges_contig, reshaped_input, reshaped_weight); }); diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 242241b97988..7b7ab6c72802 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -103,7 +103,7 @@ void cpu_index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef } void index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_cpu", [&] { cpu_index_kernel(iter, index_size, index_stride, [](char* dst, char* src, int64_t offset) { *(scalar_t*)dst = *(scalar_t*)(src + offset); @@ -234,7 +234,7 @@ void take_kernel( void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate) { // NOTE: duplicate indices are only supported if accumulate is true. 
- AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_put", [&] { // See Note [Enabling Deterministic Operations] // Parallel cpu_index_kernel with accumulation is nondeterministic, so we @@ -409,7 +409,7 @@ void cpu_masked_fill_kernel(TensorIterator& iter, scalar_t value) { } void masked_fill_kernel(TensorIterator& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kBool, kBFloat16, kHalf, iter.dtype(), "masked_fill", [&] { scalar_t scalar_val = value.to(); auto mask_dtype = iter.input_dtype(0); diff --git a/aten/src/ATen/native/cpu/IsContiguous.h b/aten/src/ATen/native/cpu/IsContiguous.h index 971717bae4be..192177cc9bcf 100644 --- a/aten/src/ATen/native/cpu/IsContiguous.h +++ b/aten/src/ATen/native/cpu/IsContiguous.h @@ -25,7 +25,7 @@ struct IsContiguous<0, 0, traits, s> { // will be called when there is no output template struct IsContiguous<0, -1, traits, s> { - static bool eval(const int64_t* strides) { + static bool eval(const int64_t* /*strides*/) { return true; } }; diff --git a/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp b/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp index 0bb92a158aa2..d67769dead45 100644 --- a/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp @@ -1,5 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include @@ -20,7 +21,7 @@ void addr_kernel(TensorIterator &iter, // nans and infs in self should not propagate. if (beta_val == false) { cpu_kernel(iter, - [=](scalar_t self_val, + [=](scalar_t /*self_val*/, scalar_t vec1_val, scalar_t vec2_val) __ubsan_ignore_undefined__ -> scalar_t { return alpha_val && vec1_val && vec2_val; @@ -53,12 +54,12 @@ void addr_kernel(TensorIterator &iter, // nans and infs in self should not propagate. if (beta_val == zero_val) { cpu_kernel_vec(iter, - [=](scalar_t self_val, + [=](scalar_t /*self_val*/, scalar_t vec1_val, scalar_t vec2_val) __ubsan_ignore_undefined__ -> scalar_t { return alpha_val * vec1_val * vec2_val; }, - [=](Vec self_vec, + [=](Vec /*self_vec*/, Vec vec1_vec, Vec vec2_vec) __ubsan_ignore_undefined__ { return alpha_vec * vec1_vec * vec2_vec; @@ -82,86 +83,7 @@ void addr_kernel(TensorIterator &iter, ); } -template ::type> -void linalg_vector_norm_kernel_cpu_impl(TensorIterator& iter, Scalar ord) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - double ord_val; - if (ord.isFloatingPoint()) { - ord_val = ord.to(); - } else { - TORCH_CHECK(false, "linalg.vector_norm expects ord to be float"); - } - // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - acc_t init_val = (ord_val == -INFINITY) ? std::numeric_limits::infinity() : static_cast(0); - if (iter.numel() == 0) { - iter.output().fill_((ord_val < 0) ? 
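For context on the addr_kernel hunk above: addr computes out = beta*self + alpha*outer(vec1, vec2), and the beta == 0 branches deliberately never read self so NaNs/Infs in it cannot propagate. A per-element scalar reference (illustrative only):

float addr_elem_ref(float self_val, float v1, float v2, float alpha, float beta) {
  if (beta == 0.0f) {
    return alpha * v1 * v2;                // self is ignored entirely
  }
  return beta * self_val + alpha * v1 * v2;
}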
INFINITY : 0); - return; - } - if (ord_val == 0) { - binary_kernel_reduce(iter, NormZeroOps(), init_val); - } else if (ord_val == 1) { - binary_kernel_reduce(iter, NormOneOps(), init_val); - } else if (ord_val == 2) { - binary_kernel_reduce(iter, NormTwoOps(), init_val); - } else if (ord_val == INFINITY) { - binary_kernel_reduce(iter, AbsMaxOps(), init_val); - // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - } else if (ord_val == -INFINITY) { - binary_kernel_reduce(iter, AbsMinOps(), init_val); - } else { - binary_kernel_reduce(iter, NormOps { static_cast(ord_val) }, init_val); - } - // For complex outputs, the above kernels do not touch the imaginary values, - // so we must zero them out - if (isComplexType(iter.output().scalar_type())) { - at::imag(iter.output()).zero_(); - } -} - -static void linalg_vector_norm_kernel_cpu(TensorIterator& iter, Scalar ord) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "linalg_vector_norm_cpu", [&] { - linalg_vector_norm_kernel_cpu_impl(iter, ord); - }); -} - -void unpack_pivots_cpu_kernel( - TensorIterator& iter, - int64_t dim_size -) { - if (iter.numel() == 0) { - return; - } - - auto loop = [&](char** data, const int64_t* strides, int64_t nelems) { - auto* unpacked_pivots_ptr = data[0]; - const auto* pivots_ptr = data[1]; - - for (const auto elem : c10::irange(nelems)) { - (void)elem; //Suppress unused variable warning - // WARNING: torch.lu returns int32 pivots, - // this behavior could change in the future. - auto* unpacked_pivots_data = reinterpret_cast(unpacked_pivots_ptr); - auto* pivots_data = reinterpret_cast(pivots_ptr); - - for (const auto i : c10::irange(dim_size)) { - std::swap( - unpacked_pivots_data[i], - unpacked_pivots_data[pivots_data[i]] - ); - } - - unpacked_pivots_ptr += strides[0]; - pivots_ptr += strides[1]; - } - }; - - iter.for_each(loop); -} - } // anonymous namespace REGISTER_DISPATCH(addr_stub, &addr_kernel); -REGISTER_DISPATCH(linalg_vector_norm_stub, &linalg_vector_norm_kernel_cpu); -REGISTER_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel); - }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index f704240481fe..2facc434d341 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -231,7 +231,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve template static inline void unroll_contiguous_scalar_checks( - const int64_t* strides, + const int64_t* /*strides*/, std::index_sequence<>, cb_t&& cb) { cb(0); diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index e81601b987e1..a2e7736a4a82 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include diff --git a/aten/src/ATen/native/cpu/MaxPooling.cpp b/aten/src/ATen/native/cpu/MaxPooling.cpp index d70b6ef6e70d..06d0fe501426 100644 --- a/aten/src/ATen/native/cpu/MaxPooling.cpp +++ b/aten/src/ATen/native/cpu/MaxPooling.cpp @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -30,13 +32,13 @@ void max_pool1d_impl( Tensor& output, const Tensor& input, const PoolingParams1D& p) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool1d_impl", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, 
input.scalar_type(), "max_pool1d_impl", [&] { const Tensor in = input.contiguous(); scalar_t* const OP = output.data_ptr(); const scalar_t* const IP = in.data_ptr(); // Value used for padding - constexpr scalar_t FILL = std::numeric_limits::has_infinity + scalar_t FILL = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() : std::numeric_limits::lowest(); diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index d08531ddf32a..566f13591603 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -1,9 +1,9 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include -#include #include #include @@ -233,68 +233,6 @@ void cpu_max_unpool_backward( } } -template -void cpu_max_unpool_backward_channels_last( - Tensor& grad_input_, - const Tensor& grad_output, - const Tensor& indices) { - TORCH_CHECK(grad_output.ndimension() == 4, - "max_unpool2d backward with channels last format supports tensors with 4 dims."); - auto memory_format = at::MemoryFormat::ChannelsLast; - auto grad_input = grad_input_.contiguous(memory_format); - - auto grad_input_data = grad_input.data_ptr(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); - - int64_t nbatch = grad_input.size(0); - int64_t channels = grad_input.size(1); - int64_t input_height = grad_input.size(2); - int64_t input_width = grad_input.size(3); - int64_t output_height = grad_output.size(2); - int64_t output_width = grad_output.size(3); - int64_t input_image_size = input_height * input_width; - int64_t output_image_size = output_height * output_width; - - c10::optional optional_error_index; - - // parallel on dim N, H, W - at::parallel_for(0, nbatch * input_image_size, 0, [&](int64_t begin, int64_t end) { - int64_t n = 0; - int64_t ip = 0; - data_index_init(begin, n, nbatch, ip, input_image_size); - - for (const auto i : c10::irange(begin, end)) { - scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels; - scalar_t* grad_input_ptr = grad_input_data + i * channels; - int64_t* indices_ptr = indices_data + i * channels; - - for (const auto c : c10::irange(channels)) { - int64_t maxp = indices_ptr[c]; - if (maxp < 0 || maxp >= output_image_size) { - optional_error_index = maxp; - std::atomic_thread_fence(std::memory_order_release); - } else { - grad_input_ptr[c] = grad_output_ptr[maxp * channels + c]; - } - } - - // move on to next input index - data_index_step(n, nbatch, ip, input_image_size); - } - }); - - if (optional_error_index) { - AT_ERROR("invalid max index ", optional_error_index.value(), - ", owidth= ", output_width, - ", oheight= ", output_height); - } - - if (!grad_input_.is_contiguous(memory_format)) { - grad_input_.copy_(grad_input); - } -} - void max_unpool2d_kernel_impl( Tensor& output, const Tensor& input, @@ -326,42 +264,9 @@ void max_unpool3d_kernel_impl( }); } -void max_unpool2d_backward_kernel_impl( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& indices) { - switch(grad_output.suggest_memory_format()) { - case at::MemoryFormat::Contiguous: { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward", [&] { - cpu_max_unpool_backward(grad_input, grad_output, indices); - }); - break; - } - case at::MemoryFormat::ChannelsLast: { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool2d_backward_channels_last", [&] { - cpu_max_unpool_backward_channels_last(grad_input, 
grad_output, indices); - }); - break; - } - default: - TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); - } -} - -void max_unpool3d_backward_kernel_impl( - Tensor& grad_input, - const Tensor& grad_output, - const Tensor& indices) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "max_unpool3d_backward", [&] { - cpu_max_unpool_backward(grad_input, grad_output, indices); - }); -} - } // anonymous namespace REGISTER_DISPATCH(max_unpool2d_kernel, &max_unpool2d_kernel_impl); -REGISTER_DISPATCH(max_unpool2d_backward_kernel, &max_unpool2d_backward_kernel_impl); REGISTER_DISPATCH(max_unpool3d_kernel, &max_unpool3d_kernel_impl); -REGISTER_DISPATCH(max_unpool3d_backward_kernel, &max_unpool3d_backward_kernel_impl); }} // at::native diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.h b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h index 00fbeb64213d..1c6507909ca4 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.h +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.h @@ -1,16 +1,14 @@ -#include -#include +#pragma once #include -#pragma once +namespace at { +class Tensor; -namespace at { namespace native { +namespace native { using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&); DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel); -DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_backward_kernel); DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel); -DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_backward_kernel); }} // at::native diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index f181572f51af..feda5fe7b3ba 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -1,13 +1,20 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include -#include -#include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { namespace { @@ -18,7 +25,8 @@ void multinomial_with_replacement_apply( const Tensor& self, const int64_t n_sample, c10::optional generator) { - auto gen = get_generator_or_default(generator, detail::getDefaultCPUGenerator()); + auto gen = get_generator_or_default( + generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); @@ -28,9 +36,9 @@ void multinomial_with_replacement_apply( /* cumulative probability distribution vector */ Tensor cum_dist = at::empty({n_categories}, self.options()); - const scalar_t * const self_ptr = self.data_ptr(); - scalar_t * const cum_dist_ptr = cum_dist.data_ptr(); - int64_t * const result_ptr = result.data_ptr(); + const scalar_t* const self_ptr = self.data_ptr(); + scalar_t* const cum_dist_ptr = cum_dist.data_ptr(); + int64_t* const result_ptr = result.data_ptr(); auto self_stride_0 = self.dim() > 1 ? self.stride(-2) : 0; auto self_stride_1 = self.stride(-1); @@ -47,22 +55,28 @@ void multinomial_with_replacement_apply( scalar_t val; for (const auto j : c10::irange(n_categories)) { val = self_ptr[i * self_stride_0 + j * self_stride_1]; - TORCH_CHECK(val >= 0, "invalid multinomial distribution (encountering probability entry < 0)"); + TORCH_CHECK( + val >= 0, + "invalid multinomial distribution (encountering probability entry < 0)"); // NB: std::isfinite doesn't bode well with libc++ for half datatypes, // so we manually cast it to a double and perform the check. 
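A side note on the max_pool1d change in MaxPooling.cpp above: once BFloat16 is added to the dispatch, the padding value can no longer be constexpr, presumably because BFloat16's numeric_limits members are not usable in a constant expression; the selection logic itself is unchanged. A minimal standalone sketch of that selection (plain C++, not the ATen kernel):

#include <cstdio>
#include <limits>

// Padding value for a max reduction: -inf when the type has one (it can never
// win the max), otherwise the lowest finite value.
template <typename scalar_t>
scalar_t max_pool_fill_value() {
  return std::numeric_limits<scalar_t>::has_infinity
      ? -std::numeric_limits<scalar_t>::infinity()
      : std::numeric_limits<scalar_t>::lowest();
}

int main() {
  std::printf("float fill: %f\n", static_cast<double>(max_pool_fill_value<float>())); // -inf
  std::printf("int   fill: %d\n", max_pool_fill_value<int>());                        // INT_MIN
}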
#if defined(_LIBCPP_VERSION) - TORCH_CHECK(std::isfinite(static_cast(val)), - "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); + TORCH_CHECK( + std::isfinite(static_cast(val)), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); #else - TORCH_CHECK(std::isfinite(val), - "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); + TORCH_CHECK( + std::isfinite(val), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); #endif sum += val; cum_dist_ptr[j * cum_dist_stride_0] = sum; } - TORCH_CHECK(sum > 0, "invalid multinomial distribution (sum of probabilities <= 0)"); + TORCH_CHECK( + sum > 0, + "invalid multinomial distribution (sum of probabilities <= 0)"); /* normalize cumulative probability distribution so that last val is 1 i.e. doesn't assume original self row sums to one */ @@ -89,20 +103,124 @@ void multinomial_with_replacement_apply( /* Make sure the last cumulative distribution bucket sums to 1 */ cum_dist_ptr[(n_categories - 1) * cum_dist_stride_0] = 1; - while(right_pointer - left_pointer > 0) { + while (right_pointer - left_pointer > 0) { mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; cum_prob = cum_dist_ptr[mid_pointer * cum_dist_stride_0]; if (cum_prob < uniform_sample) { left_pointer = mid_pointer + 1; + } else { + right_pointer = mid_pointer; } - else { + } + sample_idx = left_pointer; + + /* store in result tensor (will be incremented for lua compat by wrapper) + */ + result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = + sample_idx; + } + } +} + +template <> +void multinomial_with_replacement_apply( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional generator) { + auto gen = get_generator_or_default( + generator, detail::getDefaultCPUGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + + int64_t n_categories = self.size(-1); + int64_t n_dist = self.dim() > 1 ? self.size(-2) : 1; + + /* cumulative probability distribution vector */ + Tensor cum_dist = at::empty({n_categories}, self.options().dtype(kFloat)); + + const BFloat16* const self_ptr = self.data_ptr(); + float* const cum_dist_ptr = cum_dist.data_ptr(); + int64_t* const result_ptr = result.data_ptr(); + + auto self_stride_0 = self.dim() > 1 ? self.stride(-2) : 0; + auto self_stride_1 = self.stride(-1); + + auto cum_dist_stride_0 = cum_dist.stride(0); + + auto result_dist_stride_0 = result.dim() > 1 ? result.stride(-2) : 0; + auto result_dist_stride_1 = result.stride(-1); + + for (const auto i : c10::irange(n_dist)) { + /* Get normalized cumulative distribution from prob distribution */ + float sum = 0; + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + float val; + for (const auto j : c10::irange(n_categories)) { + val = self_ptr[i * self_stride_0 + j * self_stride_1]; + TORCH_CHECK( + val >= 0, + "invalid multinomial distribution (encountering probability entry < 0)"); +// NB: std::isfinite doesn't bode well with libc++ for half datatypes, +// so we manually cast it to a double and perform the check. 
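The sampling loop in MultinomialKernel.cpp above is inverse-CDF sampling: build a normalized cumulative distribution, draw u ~ Uniform(0, 1), then binary-search for the first bucket whose cumulative mass reaches u. A standalone sketch of that step, using std::mt19937 in place of ATen's generator (the helper name sample_index is hypothetical):

#include <cstdio>
#include <random>
#include <vector>

int sample_index(const std::vector<float>& probs, std::mt19937& rng) {
  // Build and normalize the cumulative distribution so the last bucket is 1.
  std::vector<float> cum(probs.size());
  float sum = 0.f;
  for (size_t i = 0; i < probs.size(); ++i) { sum += probs[i]; cum[i] = sum; }
  for (auto& c : cum) c /= sum;
  cum.back() = 1.f;  // guard against rounding error, as the kernel does

  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  const double u = uniform(rng);

  // Binary search for the first index with cum[idx] >= u.
  int left = 0, right = static_cast<int>(cum.size());
  while (right - left > 0) {
    const int mid = left + (right - left) / 2;
    if (cum[mid] < u) left = mid + 1; else right = mid;
  }
  return left;
}

int main() {
  std::mt19937 rng(0);
  std::vector<int> counts(3, 0);
  for (int i = 0; i < 10000; ++i) counts[sample_index({0.1f, 0.3f, 0.6f}, rng)]++;
  std::printf("%d %d %d\n", counts[0], counts[1], counts[2]);  // roughly 1000 / 3000 / 6000
}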
+#if defined(_LIBCPP_VERSION) + TORCH_CHECK( + std::isfinite(static_cast(val)), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); +#else + TORCH_CHECK( + std::isfinite(val), + "invalid multinomial distribution (encountering probability entry = infinity or NaN)"); +#endif + + sum += val; + cum_dist_ptr[j * cum_dist_stride_0] = sum; + } + + TORCH_CHECK( + sum > 0, + "invalid multinomial distribution (sum of probabilities <= 0)"); + + /* normalize cumulative probability distribution so that last val is 1 + i.e. doesn't assume original self row sums to one */ + if ((sum > 0) || ((sum < 1.00001) && (sum > 0.99999))) { + for (const auto j : c10::irange(n_categories)) { + cum_dist_ptr[j * cum_dist_stride_0] /= sum; + } + } + + for (const auto j : c10::irange(n_sample)) { + /* sample a probability mass from a uniform distribution */ + at::uniform_real_distribution uniform(0, 1); + double uniform_sample = uniform(gen); + /* Do a binary search for the slot in which the prob falls + ie cum_dist[row][slot-1] < uniform_prob < cum_distr[row][slot] */ + int left_pointer = 0; + int right_pointer = n_categories; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int mid_pointer; + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + float cum_prob; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int sample_idx; + /* Make sure the last cumulative distribution bucket sums to 1 */ + cum_dist_ptr[(n_categories - 1) * cum_dist_stride_0] = 1; + + while (right_pointer - left_pointer > 0) { + mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; + cum_prob = cum_dist_ptr[mid_pointer * cum_dist_stride_0]; + if (cum_prob < uniform_sample) { + left_pointer = mid_pointer + 1; + } else { right_pointer = mid_pointer; } } sample_idx = left_pointer; - /* store in result tensor (will be incremented for lua compat by wrapper) */ - result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = sample_idx; + /* store in result tensor (will be incremented for lua compat by wrapper) + */ + result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = + sample_idx; } } } @@ -112,14 +230,16 @@ static void multinomial_with_replacement_kernel_impl( const Tensor& self, const int64_t n_sample, c10::optional gen) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "multinomial", [&] { - multinomial_with_replacement_apply(result, self, n_sample, gen); - }); -} + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, self.scalar_type(), "multinomial", [&] { + multinomial_with_replacement_apply( + result, self, n_sample, gen); + }); } +} // namespace REGISTER_DISPATCH( multinomial_with_replacement_stub, &multinomial_with_replacement_kernel_impl); -} -} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp new file mode 100644 index 000000000000..aedd845fee89 --- /dev/null +++ b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp @@ -0,0 +1,251 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + +template +void cpu_pixel_shuffle( + Tensor& output, + const Tensor& input, + int64_t upscale_factor) { + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + // [(B1...Bn), C, H, W] => [N, C, H, W] + int64_t channels = input.size(-3); + int64_t height = input.size(-2); + int64_t width = input.size(-1); + int64_t sub_channels = channels / (upscale_factor * 
upscale_factor); + int64_t numel = input.numel(); + int64_t nbatch = numel / (channels * height * width); + int64_t S = upscale_factor; + + // input strides + int64_t stride_n = channels * height * width; + int64_t stride_c = S * S * height * width; + int64_t stride_s1 = S * height * width; + int64_t stride_s2 = height * width; + int64_t stride_h = width; + + // input tensor shape of [n, c, s1, s2, h, w] + // output tensor shape of [n, c, h, s1, w, s2] + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t n{0}, c{0}, h{0}, s1{0}, w{0}, s2{0}; + data_index_init(begin, n, nbatch, c, sub_channels, h, height, s1, S, w, width, s2, S); + + for (const auto i : c10::irange(begin, end)) { + int64_t input_offset = n * stride_n + c * stride_c + s1 * stride_s1 + + s2 * stride_s2 + h * stride_h + w; + output_data[i] = input_data[input_offset]; + + data_index_step(n, nbatch, c, sub_channels, h, height, s1, S, w, width, s2, S); + } + }); +} + +template +void cpu_pixel_shuffle_channels_last( + Tensor& output, + const Tensor& input, + int64_t upscale_factor) { + TORCH_CHECK(input.ndimension() == 4, + "pixel shuffle with channels last format supports tensors with 4 dims"); + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t height = input.size(2); + int64_t width = input.size(3); + int64_t sub_channels = channels / (upscale_factor * upscale_factor); + int64_t S = upscale_factor; + + // input tensor shape of [n, h, w, c, s1, s2] + // output tensor shape of [n, h, s1, w, s2, c] + using Vec = vec::Vectorized; + at::parallel_for(0, nbatch * height, 0, [&](int64_t begin, int64_t end) { + // temp buffer holding each channel lane + std::unique_ptr buffer(new scalar_t[channels]); + scalar_t* buffer_ptr = buffer.get(); + + int64_t n{0}, h{0}; + data_index_init(begin, n, nbatch, h, height); + for (const auto i : c10::irange(begin, end)) { + for (const auto w : c10::irange(width)) { + scalar_t* input_ptr = input_data + n * height * width * channels + h * width * channels + w * channels; + + // step 1: transpose each channel lane + // from: [c, s1*s2] + // to: [s1*s2, c] + utils::transpose(sub_channels, S * S, input_ptr, S * S, buffer_ptr, sub_channels); + + // step 2: copy from temp buffer to output + for (const auto s1 : c10::irange(S)) { + scalar_t* x_ptr = buffer_ptr + s1 * S * sub_channels; + scalar_t* y_ptr = output_data + i * width * channels + s1 * width * S * sub_channels + w * S * sub_channels; + + int64_t size = S * sub_channels; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(x_ptr + d); + data_vec.store(y_ptr + d); + } + for (; d < size; d++) { + y_ptr[d] = x_ptr[d]; + } + } + } + + data_index_step(n, nbatch, h, height); + } + }); +} + +template +void cpu_pixel_unshuffle( + Tensor& output, + const Tensor& input, + int64_t downscale_factor) { + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + // [(B1...Bn), C, H, W] => [N, C, H, W] + int64_t sub_channels = input.size(-3); + int64_t height = input.size(-2) / downscale_factor; + int64_t width = input.size(-1) / downscale_factor; + int64_t channels = sub_channels * downscale_factor * downscale_factor; + int64_t numel = input.numel(); + int64_t nbatch = numel / (channels * height * width); + int64_t S = downscale_factor; + + // input strides + int64_t stride_n = channels * height * width; + int64_t stride_c = height * S * width * S; + int64_t 
stride_h = S * width * S; + int64_t stride_s1 = width * S; + int64_t stride_w = S; + int64_t stride_s2 = 1; + + // input tensor shape of [n, c, h, s1, w, s2] + // output tensor shape of [n, c, s1, s2, h, w] + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t n{0}, c{0}, s1{0}, s2{0}, h{0}, w{0}; + data_index_init(begin, n, nbatch, c, sub_channels, s1, S, s2, S, h, height, w, width); + + for (const auto i : c10::irange(begin, end)) { + int64_t input_offset = n * stride_n + c * stride_c + h * stride_h + + s1 * stride_s1 + w * stride_w + s2 * stride_s2; + output_data[i] = input_data[input_offset]; + + data_index_step(n, nbatch, c, sub_channels, s1, S, s2, S, h, height, w, width); + } + }); +} + +template +void cpu_pixel_unshuffle_channels_last( + Tensor& output, + const Tensor& input, + int64_t downscale_factor) { + TORCH_CHECK(input.ndimension() == 4, + "pixel unshuffle with channels last format supports tensors with 4 dims"); + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t sub_channels = input.size(1); + int64_t height = input.size(2) / downscale_factor; + int64_t width = input.size(3) / downscale_factor; + int64_t channels = sub_channels * downscale_factor * downscale_factor; + int64_t numel = input.numel(); + int64_t S = downscale_factor; + + // input strides + int64_t stride_n = height * width * channels; + int64_t stride_h = S * width * S * sub_channels; + int64_t stride_s1 = width * S * sub_channels; + int64_t stride_w = S * sub_channels; + int64_t stride_s2 = sub_channels; + int64_t stride_c = 1; + + // input tensor shape of [n, h, s1, w, s2, c] + // output tensor shape of [n, h, w, c, s1, s2] + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t n{0}, h{0}, w{0}, c{0}, s1{0}, s2{0}; + data_index_init(begin, n, nbatch, h, height, w, width, c, sub_channels, s1, S, s2, S); + + for (const auto i : c10::irange(begin, end)) { + int64_t input_offset = n * stride_n + h * stride_h + s1 * stride_s1 + + w * stride_w + s2 * stride_s2 + c * stride_c; + output_data[i] = input_data[input_offset]; + + data_index_step(n, nbatch, h, height, w, width, c, sub_channels, s1, S, s2, S); + } + }); +} + +void pixel_shuffle_kernel_impl( + Tensor& output, + const Tensor& input, + int64_t upscale_factor) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_shuffle", [&] { + cpu_pixel_shuffle(output, input, upscale_factor); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_shuffle_channels_last", [&] { + cpu_pixel_shuffle_channels_last(output, input, upscale_factor); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } +} + +void pixel_unshuffle_kernel_impl( + Tensor& output, + const Tensor& input, + int64_t downscale_factor) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + // input tensor shape of [N, C, Hr, Wr] + // output tensor shape of [N, Crr, H, W] + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_unshuffle", [&] { + cpu_pixel_unshuffle(output, input, downscale_factor); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + // input tensor shape of [N, Hr, Wr, C] + // output tensor shape of [N, H, W, Crr] + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, + input.scalar_type(), "pixel_unshuffle_channels_last", [&] { + cpu_pixel_unshuffle_channels_last(output, input, downscale_factor); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +} // anonymous namespace + +REGISTER_DISPATCH(pixel_shuffle_kernel, &pixel_shuffle_kernel_impl); +REGISTER_DISPATCH(pixel_unshuffle_kernel, &pixel_unshuffle_kernel_impl); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/PixelShuffleKernel.h b/aten/src/ATen/native/cpu/PixelShuffleKernel.h new file mode 100644 index 000000000000..f7234edf0e60 --- /dev/null +++ b/aten/src/ATen/native/cpu/PixelShuffleKernel.h @@ -0,0 +1,13 @@ +#include +#include +#include + +#pragma once + +namespace at { namespace native { + +using pixel_shuffle_fn = void(*)(Tensor&, const Tensor&, int64_t); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_shuffle_kernel); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_unshuffle_kernel); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 549384055f20..d3be310e2802 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -92,7 +92,55 @@ static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { ScalarType dtype = iter.dtype(0); - AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] { + if (dtype == kBFloat16) { + auto norm_val = norm.to(); + float beta_val(beta); + auto norm_val_vec = Vectorized(norm_val); + auto beta_val_vec = Vectorized(beta_val); + const auto neg_1_vec = Vectorized(-1); + const auto zero_vec = Vectorized(0); + const auto pos_1_vec = Vectorized(1); + cpu_kernel_vec(iter, + [=](BFloat16 input, BFloat16 target, BFloat16 grad_output) -> BFloat16 { + const auto x = float(input) - float(target); + if (x <= -beta){ + return -norm_val * float(grad_output); + }else if (x >= beta){ + return norm_val * float(grad_output); + }else{ + return norm_val * x * float(grad_output) / beta; + } + }, + [norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec]( + Vectorized input, Vectorized target, Vectorized grad_output) -> Vectorized { + // using two blendv calls to simulate the 3 cases + // 1 if x >= beta + // -1 if x <= -beta + // x / beta if |x| < beta + Vectorized input0, input1, target0, target1, grad_output0, grad_output1; + std::tie(input0, input1) = convert_bfloat16_float(input); + std::tie(target0, target1) = convert_bfloat16_float(target); + std::tie(grad_output0, grad_output1) = convert_bfloat16_float(grad_output); + auto x = input0 - target0; + auto pos_or_neg_1_vec = 
Vectorized::blendv( + neg_1_vec, pos_1_vec, x > zero_vec); + auto x_abs = x.abs(); + auto output = Vectorized::blendv( + x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec); + input0 = norm_val_vec * output * grad_output0; + + x = input1 - target1; + pos_or_neg_1_vec = Vectorized::blendv( + neg_1_vec, pos_1_vec, x > zero_vec); + x_abs = x.abs(); + output = Vectorized::blendv( + x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec); + input1 = norm_val_vec * output * grad_output1; + return convert_float_bfloat16(input0, input1); + } + ); + } else { + AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] { auto norm_val = norm.to(); scalar_t beta_val(beta); auto norm_val_vec = Vectorized(norm_val); @@ -126,6 +174,7 @@ static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& no } ); }); + } } static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp b/aten/src/ATen/native/cpu/PowKernel.cpp index a13c4bca88ae..bade9772f697 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -69,7 +69,7 @@ void pow_tensor_scalar_optimized_kernel(TensorIteratorBase& iter, const exp_scal ); } else if (exp == -2.0) { cpu_kernel_vec(iter, - [](scalar_t base) -> scalar_t { + [](scalar_t base) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return static_cast(1.0) / (base * base); }, [](Vec base) -> Vec { return (base * base).reciprocal(); } ); diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 083f9cf19b16..58f39f156677 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -133,7 +133,7 @@ static void set_results(const res_t result, const TensorIteratorBase &iter, cons template static inline typename std::enable_if::type -for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { +for_each_in_tuple(const std::tuple& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) { return i; } diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index 90bac8aab63f..c3d8ba7d136d 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -1,11 +1,12 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include #include #include -#include -#include -#include +#include #include #include @@ -30,7 +31,7 @@ inline void reduce_all_impl_vec( auto input_data = input.data_ptr(); // NOTE: parallel_reduce not support bool type scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, - [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t { + [&](int64_t start, int64_t end, const scalar_t /*ident*/) -> scalar_t { scalar_t partial_out = vec::reduce_all( [=](Vec x, Vec y) { return vop(x, y); }, input_data + start, diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 67d8036f701c..52e18faf737d 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -1,18 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include #include #include -#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include 
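For the new PixelShuffleKernel.cpp above, a plain-loop reference of the contiguous pixel_shuffle mapping may help: the input [N, C*r*r, H, W] is viewed as [N, C, r, r, H, W] and permuted to [N, C, H, r, W, r]. This is only an illustrative sketch (no parallel_for, no vectorization, no channels-last path):

#include <cstdint>
#include <cstdio>
#include <vector>

// output[n][c][h*r + s1][w*r + s2] = input[n][c*r*r + s1*r + s2][h][w]
std::vector<float> pixel_shuffle_ref(const std::vector<float>& in,
                                     int64_t N, int64_t C, int64_t H, int64_t W, int64_t r) {
  std::vector<float> out(in.size());  // [N, C, H*r, W*r] has the same numel
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t h = 0; h < H; ++h)
        for (int64_t s1 = 0; s1 < r; ++s1)
          for (int64_t w = 0; w < W; ++w)
            for (int64_t s2 = 0; s2 < r; ++s2) {
              const int64_t in_idx  = ((n * C * r * r + c * r * r + s1 * r + s2) * H + h) * W + w;
              const int64_t out_idx = ((n * C + c) * (H * r) + h * r + s1) * (W * r) + w * r + s2;
              out[out_idx] = in[in_idx];
            }
  return out;
}

int main() {
  // 1x(1*2*2)x1x1 input: the four channels become one 2x2 spatial block.
  auto out = pixel_shuffle_ref({10.f, 11.f, 12.f, 13.f}, 1, 1, 1, 1, 2);
  for (float v : out) std::printf("%.0f ", v);  // 10 11 12 13
}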
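And a scalar reference for the BFloat16 smooth_l1 backward path added in PointwiseOpsKernel.cpp above: with x = input - target, the gradient is -1 for x <= -beta, +1 for x >= beta, and x/beta in between, scaled by norm and by the incoming grad_output. Sketch only; the vectorized kernel computes the same thing with two blendv selections per float lane:

#include <cstdio>

float smooth_l1_backward_ref(float input, float target, float grad_output,
                             float norm, float beta) {
  const float x = input - target;
  if (x <= -beta) return -norm * grad_output;  // saturated, negative side
  if (x >=  beta) return  norm * grad_output;  // saturated, positive side
  return norm * x * grad_output / beta;        // linear region
}

int main() {
  // With beta = 1 the gradient is linear inside (-1, 1) and saturates outside.
  std::printf("%f\n", smooth_l1_backward_ref(0.25f, 0.0f, 1.0f, 1.0f, 1.0f));  // 0.25
  std::printf("%f\n", smooth_l1_backward_ref(3.0f,  0.0f, 1.0f, 1.0f, 1.0f));  // 1.0
}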
@@ -74,7 +79,7 @@ static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "cumsum_out_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, self.scalar_type(), "cumsum_out_cpu", [&] { cpu_cum_base_kernel(result, self, wrap_dim, [&] ( scalar_t* result_data, auto result_dim_stride, const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { @@ -93,7 +98,7 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "cumprod_out_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, self.scalar_type(), "cumprod_out_cpu", [&] { cpu_cum_base_kernel(result, self, wrap_dim, [&] ( scalar_t* result_data, auto result_dim_stride, const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { @@ -112,18 +117,19 @@ static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "logcumsumexp_out_cpu", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, self.scalar_type(), "logcumsumexp_out_cpu", [&] { cpu_cum_base_kernel(result, self, wrap_dim, [&] ( scalar_t* result_data, auto result_dim_stride, const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { - scalar_t cum_number = (at::acc_type)init_val; + using accscalar_t = at::acc_type; + auto cum_number = (accscalar_t)init_val; for (const auto i : c10::irange(self_dim_size)) { - scalar_t x = self_data[i * self_dim_stride]; + accscalar_t x = self_data[i * self_dim_stride]; // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp - auto log_add_exp = [](scalar_t x, scalar_t y) -> scalar_t { - scalar_t min = std::isnan(y) ? y : std::min(x,y); //std::min returns first arg if one of the args is nan - scalar_t max = std::isnan(y) ? y : std::max(x,y); //std::max returns first arg if one of the args is nan + auto log_add_exp = [](accscalar_t x, accscalar_t y) -> accscalar_t { + accscalar_t min = std::isnan(y) ? y : std::min(x,y); //std::min returns first arg if one of the args is nan + accscalar_t max = std::isnan(y) ? y : std::max(x,y); //std::max returns first arg if one of the args is nan if (min != max || std::isfinite(min)) { // nan will be propagated here return std::log1p(std::exp(min - max)) + max; @@ -218,6 +224,10 @@ static void norm_kernel_tensor_iterator_impl( } else { AT_ERROR("norm_kernel_tensor_iterator_impl expects norm to be integer or float"); } + if (iter.numel() == 0) { + iter.output().fill_((val < 0) ? 
INFINITY : 0); + return; + } bool use_fast_path = is_reduce_lastdim(iter) && iter.dtype(0) == iter.input_dtype() && (iter.input_dtype() == kFloat || iter.input_dtype() == kBFloat16); @@ -297,7 +307,7 @@ static void norm_kernel_tensor_iterator_impl( binary_kernel_reduce( iter, AbsMinOps(), - std::numeric_limits::max() + std::numeric_limits::infinity() ); }); } else { diff --git a/aten/src/ATen/native/cpu/RenormKernel.cpp b/aten/src/ATen/native/cpu/RenormKernel.cpp index 532dea3e59ab..0a9a0d0df352 100644 --- a/aten/src/ATen/native/cpu/RenormKernel.cpp +++ b/aten/src/ATen/native/cpu/RenormKernel.cpp @@ -1,5 +1,6 @@ +#define TORCH_ASSERT_NO_OPERATORS #include -#include +#include #include #include diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index 45c86ebdd181..d43f107a5502 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -1,7 +1,11 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include +#include +#include +#include #include #include @@ -32,6 +36,33 @@ class ReduceAdd { }; static ReduceAdd reduce_add; +class ReduceMean { +public: + template + constexpr void operator() (scalar_t * self_data, scalar_t * src_data) const { + *self_data += *src_data; + } +}; +static ReduceMean reduce_mean; + +class ReduceMaximum { +public: + template + constexpr void operator() (scalar_t * self_data, scalar_t * src_data) const { + *self_data = at::_isnan(*src_data) ? *src_data : std::max(*self_data, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + +class ReduceMinimum { +public: + template + constexpr void operator() (scalar_t * self_data, scalar_t * src_data) const { + *self_data = at::_isnan(*src_data) ? *src_data : std::min(*self_data, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + class TensorAssign { public: template @@ -280,6 +311,273 @@ struct cpu_scatter_gather_base_kernel { } ); } + + void operator()(const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, ReduceMean& kernel_func) { + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .resize_outputs(false) + // NOLINTNEXTLINE(bugprone-argument-comment) + .declare_static_shape(index.sizes(), /*squash_dim=*/dim) + .add_output(self) + .add_input(src) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto index_dim_stride = ensure_nonempty_stride(index, dim); + auto index_dim_size = ensure_nonempty_size(index, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_upper_bound = is_scatter_like ? 
self_dim_size : src_dim_size; + + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + ScalarType::Half, ScalarType::BFloat16, iter.dtype(), + "scatter_gather_tensor_cpu_reduce_mean", [&] { + constexpr auto SELF_ITER_STRIDE_IDX = 0; + constexpr auto INDEX_ITER_STRIDE_IDX = 2; + constexpr auto SRC_ITER_STRIDE_IDX = 1; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX]; + auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX]; + auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX]; + // we change the order of TensorIterator-dim loop + // vs dim-TensorIterator loop order depending on + // whether dim is the last dimension + if (dim== self.dim() - 1) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + // dim loop is a separate code block + // for better performance + _cpu_scatter_gather_dim_loop()( + (scalar_t*)self_data_bytes, self_dim_stride, + (int64_t*)index_data_bytes, index_dim_stride, + (scalar_t*)src_data_bytes, src_dim_stride, + dim, index_dim_size, index_upper_bound, + kernel_func + ); + + self_data_bytes += strides[SELF_ITER_STRIDE_IDX]; + index_data_bytes += strides[INDEX_ITER_STRIDE_IDX]; + src_data_bytes += strides[SRC_ITER_STRIDE_IDX]; + } + } + else { + for (const auto i : c10::irange(index_dim_size)) { + auto* self_data = self_data_bytes; + auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); + auto* src_data = src_data_bytes; + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + int64_t idx_dim = *(int64_t*)index_data; + // we are not putting idx_dim in the error message because it disables + // loop optimization in clang-7 + TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound, + "index ", *(int64_t*)index_data, + " is out of bounds for dimension ", dim, + " with size ", index_upper_bound); + + kernel_func( + (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride, + (scalar_t*)src_data + (is_scatter_like ? i : idx_dim) * src_dim_stride); + + self_data += strides[SELF_ITER_STRIDE_IDX]; + index_data += strides[INDEX_ITER_STRIDE_IDX]; + src_data += strides[SRC_ITER_STRIDE_IDX]; + } + } + } + }; + iter.for_each(loop, grain_size); + } + ); + } + + void operator()(const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, ReduceMaximum& kernel_func) { + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .resize_outputs(false) + // NOLINTNEXTLINE(bugprone-argument-comment) + .declare_static_shape(index.sizes(), /*squash_dim=*/dim) + .add_output(self) + .add_input(src) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto index_dim_stride = ensure_nonempty_stride(index, dim); + auto index_dim_size = ensure_nonempty_size(index, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_upper_bound = is_scatter_like ? 
self_dim_size : src_dim_size; + + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size); + + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(), + "scatter_gather_tensor_cpu_reduce_amax", [&] { + constexpr auto SELF_ITER_STRIDE_IDX = 0; + constexpr auto INDEX_ITER_STRIDE_IDX = 2; + constexpr auto SRC_ITER_STRIDE_IDX = 1; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX]; + auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX]; + auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX]; + // we change the order of TensorIterator-dim loop + // vs dim-TensorIterator loop order depending on + // whether dim is the last dimension + if (dim== self.dim() - 1) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + // dim loop is a separate code block + // for better performance + _cpu_scatter_gather_dim_loop()( + (scalar_t*)self_data_bytes, self_dim_stride, + (int64_t*)index_data_bytes, index_dim_stride, + (scalar_t*)src_data_bytes, src_dim_stride, + dim, index_dim_size, index_upper_bound, + kernel_func + ); + + self_data_bytes += strides[SELF_ITER_STRIDE_IDX]; + index_data_bytes += strides[INDEX_ITER_STRIDE_IDX]; + src_data_bytes += strides[SRC_ITER_STRIDE_IDX]; + } + } + else { + for (const auto i : c10::irange(index_dim_size)) { + auto* self_data = self_data_bytes; + auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); + auto* src_data = src_data_bytes; + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + int64_t idx_dim = *(int64_t*)index_data; + // we are not putting idx_dim in the error message because it disables + // loop optimization in clang-7 + TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound, + "index ", *(int64_t*)index_data, + " is out of bounds for dimension ", dim, + " with size ", index_upper_bound); + + kernel_func( + (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride, + (scalar_t*)src_data + (is_scatter_like ? i : idx_dim) * src_dim_stride); + + self_data += strides[SELF_ITER_STRIDE_IDX]; + index_data += strides[INDEX_ITER_STRIDE_IDX]; + src_data += strides[SRC_ITER_STRIDE_IDX]; + } + } + } + }; + iter.for_each(loop, grain_size); + } + ); + } + + void operator()(const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, ReduceMinimum& kernel_func) { + + auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .resize_outputs(false) + // NOLINTNEXTLINE(bugprone-argument-comment) + .declare_static_shape(index.sizes(), /*squash_dim=*/dim) + .add_output(self) + .add_input(src) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto index_dim_stride = ensure_nonempty_stride(index, dim); + auto index_dim_size = ensure_nonempty_size(index, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_upper_bound = is_scatter_like ? 
self_dim_size : src_dim_size; + + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size); + + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(), + "scatter_gather_tensor_cpu_reduce_amin", [&] { + constexpr auto SELF_ITER_STRIDE_IDX = 0; + constexpr auto INDEX_ITER_STRIDE_IDX = 2; + constexpr auto SRC_ITER_STRIDE_IDX = 1; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX]; + auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX]; + auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX]; + // we change the order of TensorIterator-dim loop + // vs dim-TensorIterator loop order depending on + // whether dim is the last dimension + if (dim== self.dim() - 1) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + // dim loop is a separate code block + // for better performance + _cpu_scatter_gather_dim_loop()( + (scalar_t*)self_data_bytes, self_dim_stride, + (int64_t*)index_data_bytes, index_dim_stride, + (scalar_t*)src_data_bytes, src_dim_stride, + dim, index_dim_size, index_upper_bound, + kernel_func + ); + + self_data_bytes += strides[SELF_ITER_STRIDE_IDX]; + index_data_bytes += strides[INDEX_ITER_STRIDE_IDX]; + src_data_bytes += strides[SRC_ITER_STRIDE_IDX]; + } + } + else { + for (const auto i : c10::irange(index_dim_size)) { + auto* self_data = self_data_bytes; + auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); + auto* src_data = src_data_bytes; + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning + int64_t idx_dim = *(int64_t*)index_data; + // we are not putting idx_dim in the error message because it disables + // loop optimization in clang-7 + TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound, + "index ", *(int64_t*)index_data, + " is out of bounds for dimension ", dim, + " with size ", index_upper_bound); + + kernel_func( + (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride, + (scalar_t*)src_data + (is_scatter_like ? 
i : idx_dim) * src_dim_stride); + + self_data += strides[SELF_ITER_STRIDE_IDX]; + index_data += strides[INDEX_ITER_STRIDE_IDX]; + src_data += strides[SRC_ITER_STRIDE_IDX]; + } + } + } + }; + iter.for_each(loop, grain_size); + } + ); + } }; void gather_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) { @@ -316,6 +614,34 @@ void scatter_reduce_cpu_kernel(const Tensor& self, const int64_t dim, const Tens cpu_scatter_gather_base_kernel<>()(self, dim, index, src, "scatter_reduce_multiply_", reduce_multiply); break; + default : + break; + } +} + +void scatter_reduce_two_cpu_kernel(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const SCATTER_GATHER_OP& reduce) { + switch (reduce) { + case SCATTER_GATHER_OP::REDUCE_ADD : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_sum_", reduce_add); + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_prod_", reduce_multiply); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_amax_", reduce_maximum); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_amin_", reduce_minimum); + break; + case SCATTER_GATHER_OP::REDUCE_MEAN : + cpu_scatter_gather_base_kernel<>()(self, dim, index, src, + "scatter_reduce_mean_", reduce_mean); + break; } } @@ -330,6 +656,8 @@ void scatter_scalar_reduce_cpu_kernel(const Tensor& self, const int64_t dim, con cpu_scatter_gather_base_kernel<>()(self, dim, index, value, "scatter_scalar_reduce_multiply_", reduce_multiply); break; + default: + break; } } @@ -341,5 +669,6 @@ REGISTER_DISPATCH(scatter_fill_stub, &scatter_fill_cpu_kernel); REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cpu_kernel); REGISTER_DISPATCH(scatter_reduce_stub, &scatter_reduce_cpu_kernel); REGISTER_DISPATCH(scatter_scalar_reduce_stub, &scatter_scalar_reduce_cpu_kernel); +REGISTER_DISPATCH(scatter_reduce_two_stub, &scatter_reduce_two_cpu_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/cpu/SerialStackImpl.h b/aten/src/ATen/native/cpu/SerialStackImpl.h index 3f509b0c6306..682161372009 100644 --- a/aten/src/ATen/native/cpu/SerialStackImpl.h +++ b/aten/src/ATen/native/cpu/SerialStackImpl.h @@ -1,10 +1,11 @@ // Copyright 2004-present Facebook. All Rights Reserved. #pragma once -#include +#include #include #include +#include #include #include #include @@ -111,7 +112,7 @@ bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, in // or there is only one thread. Note that we aren't checking result.numel() here because // it may not have been resized and we want to defer that cost till later. 
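The logcumsumexp change in ReduceOpsKernel.cpp above runs the recurrence in the accumulation type (float for BFloat16 inputs) instead of the storage type. A sketch of the underlying numerically stable log-add-exp step, with the same NaN handling as the kernel's lambda:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

double log_add_exp(double x, double y) {
  // std::min/std::max would silently drop a NaN in the second argument.
  const double mn = std::isnan(y) ? y : std::min(x, y);
  const double mx = std::isnan(y) ? y : std::max(x, y);
  if (mn != mx || std::isfinite(mn)) {
    return std::log1p(std::exp(mn - mx)) + mx;  // NaN propagates here
  }
  return x;  // both operands are the same infinity
}

std::vector<double> logcumsumexp_ref(const std::vector<double>& v) {
  std::vector<double> out(v.size());
  double acc = -std::numeric_limits<double>::infinity();
  for (size_t i = 0; i < v.size(); ++i) { acc = log_add_exp(acc, v[i]); out[i] = acc; }
  return out;
}

int main() {
  auto r = logcumsumexp_ref({0.0, 0.0, 0.0});
  std::printf("%f\n", r.back());  // log(3) ~= 1.098612
}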
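The ReduceMaximum/ReduceMinimum functors added in ScatterGatherKernel.cpp above let a NaN coming from src win the comparison instead of being dropped by std::max/std::min. A minimal 1-D sketch of the amax scatter semantics (scatter_amax_1d is a hypothetical helper, not an ATen entry point, and it ignores bounds checking and parallelism):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// self[index[i]] = amax(self[index[i]], src[i]), with NaN in src propagating.
void scatter_amax_1d(std::vector<float>& self,
                     const std::vector<int64_t>& index,
                     const std::vector<float>& src) {
  for (size_t i = 0; i < index.size(); ++i) {
    float& dst = self[index[i]];
    dst = std::isnan(src[i]) ? src[i] : std::max(dst, src[i]);
  }
}

int main() {
  std::vector<float> self = {0.f, 0.f};
  scatter_amax_1d(self, {0, 0, 1}, {3.f, 2.f, -1.f});
  std::printf("%f %f\n", self[0], self[1]);  // 3.0 0.0
}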
int64_t numel_in_stack = first_tensor.numel() * tensors.size(); - return numel_in_stack < at::internal::GRAIN_SIZE && at::get_num_threads() == 1; + return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1; } template diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 50a9b2350b1c..908d4fc60b7b 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -6,12 +7,13 @@ #include #include +#include +#include #include #include #include #include -#include // [Note AVX-SSE transitions] In general we avoid calls into cmath for code // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in // Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280 @@ -41,11 +43,11 @@ inline void _vec_log_softmax_lastdim( outer_size, grain_size, [&](int64_t begin, int64_t end) { + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + scalar_t tmp_sum_scalar[CHUNK_SIZE]; + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + scalar_t max_input_arr[CHUNK_SIZE]; for (int64_t ii = begin; ii < end; ii += CHUNK_SIZE) { - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - scalar_t tmp_sum_scalar[CHUNK_SIZE]; - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - scalar_t max_input_arr[CHUNK_SIZE]; int64_t loop_end = CHUNK_SIZE; if (ii + CHUNK_SIZE > end) loop_end = end - ii; @@ -102,38 +104,97 @@ inline void _vec_softmax_lastdim( scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { - using Vec = vec::Vectorized>; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - if (grain_size < 1) - grain_size = 1; + using Vec = vec::Vectorized; + int64_t grain_size = std::max(internal::GRAIN_SIZE / (16 * dim_size), (int64_t)1); + parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t max_input = vec::reduce_all( + [](Vec& x, Vec& y) { return vec::maximum(x, y); }, + input_data, + dim_size); + vec::map( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + output_data, + input_data, + dim_size); + scalar_t tmp_sum = vec::reduce_all( + [](Vec x, Vec y) { return x + y; }, output_data, dim_size); + tmp_sum = 1 / tmp_sum; + vec::map( + [tmp_sum](Vec x) { return x * Vec(tmp_sum); }, + output_data, + output_data, + dim_size); + } + }); +} - parallel_for( - 0, - outer_size, - grain_size, - [&](int64_t begin, int64_t end) { - for (const auto i : c10::irange(begin, end)) { - scalar_t* input_data = input_data_base + i * dim_size; - scalar_t* output_data = output_data_base + i * dim_size; - scalar_t max_input = vec::reduce_all( - [](Vec& x, Vec& y) { return vec::maximum(x, y); }, - input_data, - dim_size); - vec::map( - [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, - output_data, - input_data, - dim_size); - scalar_t tmp_sum = vec::reduce_all( - [](Vec x, Vec y) { return x + y; }, output_data, dim_size); - tmp_sum = 1 / tmp_sum; - vec::map( - [tmp_sum](Vec x) { return x * Vec(tmp_sum); }, - output_data, - output_data, - dim_size); - } - }); +template <> +inline void _vec_softmax_lastdim( + BFloat16* input_data_base, + BFloat16* output_data_base, + int64_t 
outer_size, + int64_t dim_size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t grain_size = std::max(internal::GRAIN_SIZE / (16 * dim_size), (int64_t)1); + parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // thread local temp buffer. + std::unique_ptr buffer(new float[dim_size]); + float* buffer_data = buffer.get(); + + for (const auto i : c10::irange(begin, end)) { + BFloat16* input_data = input_data_base + i * dim_size; + BFloat16* output_data = output_data_base + i * dim_size; + // reduce to max and cache float input data + fVec max_fvec = fVec(-std::numeric_limits::infinity()); + int64_t d0 = 0; + for (; d0 < dim_size - (dim_size % bVec::size()); d0 += bVec::size()) { + bVec data_bvec = bVec::loadu(input_data + d0); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + max_fvec = vec::maximum(max_fvec, data_fvec0); + max_fvec = vec::maximum(max_fvec, data_fvec1); + data_fvec0.store(buffer_data + d0); + data_fvec1.store(buffer_data + d0 + fVec::size()); + } + float max_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return vec::maximum(x, y); }, max_fvec); + for (; d0 < dim_size; d0++) { + float data_val = input_data[d0]; + max_val = std::max(max_val, data_val); + buffer_data[d0] = data_val; + } + + // map (x - max).exp() and reduce to sum + fVec sum_fvec = fVec(float(0)); + int64_t d1 = 0; + for (; d1 < dim_size - (dim_size % fVec::size()); d1 += fVec::size()) { + fVec data_fvec = (fVec::loadu(buffer_data + d1) - fVec(max_val)).exp(); + sum_fvec += data_fvec; + data_fvec.store(buffer_data + d1); + } + float sum_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, sum_fvec); + for (; d1 < dim_size; d1++) { + float data_val = std::exp(buffer_data[d1] - max_val); + sum_val += data_val; + buffer_data[d1] = data_val; + } + + sum_val = 1 / sum_val; + int64_t d2 = 0; + for (; d2 < dim_size - (dim_size % bVec::size()); d2 += bVec::size()) { + fVec out_fvec0 = fVec::loadu(buffer_data + d2) * fVec(sum_val); + fVec out_fvec1 = fVec::loadu(buffer_data + d2 + fVec::size()) * fVec(sum_val); + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(output_data + d2); + } + for (; d2 < dim_size; d2++) { + output_data[d2] = BFloat16(buffer_data[d2] * sum_val); + } + } + }); } template diff --git a/aten/src/ATen/native/cpu/SoftmaxKernel.h b/aten/src/ATen/native/cpu/SoftmaxKernel.h index a393c08056e2..f9af73903454 100644 --- a/aten/src/ATen/native/cpu/SoftmaxKernel.h +++ b/aten/src/ATen/native/cpu/SoftmaxKernel.h @@ -1,9 +1,11 @@ #pragma once -#include #include +#include namespace at { +class Tensor; + namespace native { using forward_fn = void (*)(const Tensor&, const Tensor&); diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 8eab924407d1..b756c6c46a7e 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -1,42 +1,27 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include #include #include #include -#include +#include #include #include -#include -#include +#include +#include #include namespace at { namespace native { namespace { -void _fill_indices(Tensor& indices, int64_t dim) { - auto dim_size = indices.size(dim); - auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong)); - auto idx_dim_sizes = std::vector(indices.dim(), 1); - auto idx_dim_strides = std::vector(indices.dim(), 0); - idx_dim_sizes[dim] = dim_size; - 
idx_dim_strides[dim] = 1; - auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); - indices.copy_(idx_dim_restrided); -} - template void _dim_apply( - Tensor& values, - Tensor& indices, + const TensorBase &values, + const TensorBase &indices, int64_t dim, const std::string& method_name, const func_t& f) { - dim = maybe_wrap_dim(dim, values.dim()); - TORCH_CHECK( - dim >= 0 && dim < values.dim(), - method_name, "(): invalid dimension parameter ", dim - ); - auto iter = TensorIteratorConfig() .check_all_same_dtype(false) .resize_outputs(false) @@ -56,6 +41,10 @@ void _dim_apply( auto* values_data_bytes = data[0]; auto* indices_data_bytes = data[1]; + if(values_data_bytes==nullptr || indices_data_bytes==nullptr){ + return; + } + for (const auto i : c10::irange(n)) { (void)i; //Suppress unused variable warning f( @@ -95,8 +84,9 @@ struct KeyValueCompDesc { }; static void sort_kernel( - Tensor& values, - Tensor& indices, + const TensorBase& self, + const TensorBase& values, + const TensorBase& indices, int64_t dim, bool descending, bool stable) { @@ -143,9 +133,9 @@ static void sort_kernel( } static void topk_kernel( - const Tensor& values, - const Tensor& indices, - const Tensor& self, + const TensorBase &values, + const TensorBase &indices, + const TensorBase &self, int64_t k, int64_t dim, bool largest, diff --git a/aten/src/ATen/native/cpu/StackKernel.cpp b/aten/src/ATen/native/cpu/StackKernel.cpp index 8a6615c0d277..6e9248149d8a 100644 --- a/aten/src/ATen/native/cpu/StackKernel.cpp +++ b/aten/src/ATen/native/cpu/StackKernel.cpp @@ -1,6 +1,6 @@ // Copyright 2004-present Facebook. All Rights Reserved. - -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include diff --git a/aten/src/ATen/native/cpu/StackKernel.h b/aten/src/ATen/native/cpu/StackKernel.h index abb72f9dba7f..4e9a45e4dd12 100644 --- a/aten/src/ATen/native/cpu/StackKernel.h +++ b/aten/src/ATen/native/cpu/StackKernel.h @@ -1,7 +1,7 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
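The BFloat16 specialization of _vec_softmax_lastdim in SoftMaxKernel.cpp above follows a common pattern: load BFloat16, widen to float into a per-thread buffer, run the max / exp-sum / normalize passes in float, and convert back only on the final store. A scalar sketch of the same three passes, with float standing in for the widened accumulation type:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// Row-wise softmax over the last dimension: subtract the row max for stability,
// exponentiate, then normalize by the sum.
void softmax_lastdim_ref(const std::vector<float>& in, std::vector<float>& out,
                         int64_t rows, int64_t dim_size) {
  for (int64_t r = 0; r < rows; ++r) {
    const float* x = in.data() + r * dim_size;
    float* y = out.data() + r * dim_size;
    float mx = -std::numeric_limits<float>::infinity();
    for (int64_t d = 0; d < dim_size; ++d) mx = std::max(mx, x[d]);                      // pass 1: max
    float sum = 0.f;
    for (int64_t d = 0; d < dim_size; ++d) { y[d] = std::exp(x[d] - mx); sum += y[d]; }  // pass 2: exp + sum
    const float inv = 1.f / sum;
    for (int64_t d = 0; d < dim_size; ++d) y[d] *= inv;                                  // pass 3: normalize
  }
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f}, out(3);
  softmax_lastdim_ref(in, out, 1, 3);
  std::printf("%f %f %f\n", out[0], out[1], out[2]);  // ~0.09 0.24 0.67
}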
#pragma once -#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index 914f03a2d81c..27fa214fba1c 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -1,3 +1,5 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include @@ -10,13 +12,20 @@ #include #include #include +#include +#include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { namespace { template @@ -322,14 +331,18 @@ static void isin_default_kernel_cpu( }); } -static void clamp_kernel_impl(TensorIterator& iter) { +static void clamp_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_cpu", [&]() { cpu_kernel_vec(iter, [](scalar_t a, scalar_t min, scalar_t max) -> scalar_t { - return std::min(std::max(a, min), max); + if (min != min || max != max) { + return std::numeric_limits::quiet_NaN(); + } else { + return std::min(std::max(a, min), max); + } }, [](Vectorized a, Vectorized min, Vectorized max) { - return vec::clamp(a, min, max); + return vec::minimum(vec::maximum(a, min), max); }); }); } @@ -350,18 +363,6 @@ static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min }); } -static void clamp_max_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_max_cpu", [&]() { - cpu_kernel_vec(iter, - [](scalar_t a, scalar_t max) -> scalar_t { - return std::min(a, max); - }, - [](Vectorized a, Vectorized max) { - return vec::clamp_max(a, max); - }); - }); -} - static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_max_scalar_cpu", [&]() { const auto max = max_.to(); @@ -376,18 +377,6 @@ static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) }); } -static void clamp_min_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_min_cpu", [&]() { - cpu_kernel_vec(iter, - [](scalar_t a, scalar_t min) -> scalar_t { - return std::max(a, min); - }, - [](Vectorized a, Vectorized min) { - return vec::clamp_min(a, min); - }); - }); -} - static void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.common_dtype(), "clamp_min_cpu", [&]() { const auto min = min_.to(); @@ -412,8 +401,6 @@ REGISTER_DISPATCH(isposinf_stub, &isposinf_kernel_impl); REGISTER_DISPATCH(isneginf_stub, &isneginf_kernel_impl); REGISTER_DISPATCH(mode_stub, &mode_kernel_impl); REGISTER_DISPATCH(clamp_stub, &clamp_kernel_impl); -REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel_impl); -REGISTER_DISPATCH(clamp_max_stub, &clamp_max_kernel_impl); REGISTER_DISPATCH(clamp_scalar_stub, &clamp_scalar_kernel_impl); REGISTER_DISPATCH(clamp_min_scalar_stub, &clamp_min_scalar_kernel_impl); REGISTER_DISPATCH(clamp_max_scalar_stub, &clamp_max_scalar_kernel_impl); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 8d862615cc5d..5e61823e1d25 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -174,12 +175,19 @@ void logit_kernel(TensorIteratorBase& iter, 
const Scalar& eps_scalar) { } static void abs_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "abs_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return abs_impl(a); }, - [=](Vectorized a) { return a.abs(); }); - }); + auto dtype = iter.dtype(); + if (dtype == kComplexHalf) { + using scalar_t = c10::complex; + using opmath_t = at::opmath_type; + cpu_kernel(iter, [=](scalar_t a) -> scalar_t { return abs_impl(opmath_t{a}); }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "abs_cpu", [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return abs_impl(a); }, + [=](Vectorized a) { return a.abs(); }); + }); + } } static void angle_kernel(TensorIteratorBase& iter) { @@ -191,28 +199,10 @@ static void angle_kernel(TensorIteratorBase& iter) { }); } -static void real_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "real_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return real_impl(a); }, - [=](Vectorized a) { return a.real(); }); - }); -} - -static void imag_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "imag_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return imag_impl(a); }, - [=](Vectorized a) { return a.imag(); }); - }); -} - // NB: Ignores the negative bit on tensors void conj_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kBool, kBFloat16, kHalf, kComplexHalf, iter.common_dtype(), "conj_cpu", [&]() { cpu_kernel_vec( iter, [=](scalar_t a) -> scalar_t { return conj_impl(a); }, @@ -275,7 +265,7 @@ void reciprocal_kernel(TensorIteratorBase& iter) { // NB: Ignores the negative bit on tensors void neg_kernel(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "neg_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kComplexHalf, kBFloat16, kHalf, iter.dtype(), "neg_cpu", [&]() { cpu_kernel_vec( iter, [=](scalar_t a) -> scalar_t { return -a; }, @@ -312,13 +302,21 @@ static void signbit_kernel(TensorIteratorBase& iter){ }); } -static void sgn_kernel(TensorIteratorBase& iter){ - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cpu", [&]() { - cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, - [=](Vectorized a) { return a.sgn(); }); - }); +static void sgn_kernel(TensorIteratorBase& iter) { + auto dtype = iter.dtype(); + if (dtype == kComplexHalf) { + using scalar_t = c10::complex; + using opmath_t = at::opmath_type; + cpu_kernel( + iter, [=](scalar_t a) -> scalar_t { return sgn_impl(opmath_t{a}); }); + } else { + AT_DISPATCH_COMPLEX_TYPES(dtype, "sgn_cpu", [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, + [=](Vectorized a) { return a.sgn(); }); + }); + } } static void sinc_kernel(TensorIteratorBase& iter) { @@ -504,6 +502,13 @@ static void ndtri_kernel(TensorIteratorBase& iter) { }); } +static void log_ndtr_kernel(TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); + AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cpu", [&]() { + cpu_kernel(iter, [](scalar_t x) { return calc_log_ndtr(x); }); + }); +} + static void i0e_kernel(TensorIteratorBase& iter) { TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); 
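On the clamp change in TensorCompareKernel.cpp above: the scalar path now returns NaN whenever either bound is NaN (the min != min || max != max test), and the vector path swaps vec::clamp for minimum(maximum(a, min), max). A scalar sketch of the new bound handling only; input NaN is left to the min/max chain exactly as in the kernel:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

// clamp(a, lo, hi) that returns NaN whenever either bound is NaN.
float clamp_nan_aware(float a, float lo, float hi) {
  if (std::isnan(lo) || std::isnan(hi)) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  return std::min(std::max(a, lo), hi);
}

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::printf("%f\n", clamp_nan_aware(5.f, 0.f, 1.f));  // 1.0
  std::printf("%f\n", clamp_nan_aware(5.f, 0.f, nan));  // nan
}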
AT_DISPATCH_FLOATING_TYPES_AND( @@ -614,8 +619,6 @@ REGISTER_DISPATCH(sigmoid_stub, &CPU_CAPABILITY::sigmoid_kernel); REGISTER_DISPATCH(logit_stub, &CPU_CAPABILITY::logit_kernel); REGISTER_DISPATCH(abs_stub, &CPU_CAPABILITY::abs_kernel); REGISTER_DISPATCH(angle_stub, &CPU_CAPABILITY::angle_kernel); -REGISTER_DISPATCH(real_stub, &CPU_CAPABILITY::real_kernel); -REGISTER_DISPATCH(imag_stub, &CPU_CAPABILITY::imag_kernel); REGISTER_DISPATCH(conj_physical_stub, &CPU_CAPABILITY::conj_kernel); REGISTER_DISPATCH(exp2_stub, &CPU_CAPABILITY::exp2_kernel); REGISTER_DISPATCH(bitwise_not_stub, &CPU_CAPABILITY::bitwise_not_kernel); @@ -641,6 +644,7 @@ REGISTER_DISPATCH(special_entr_stub, &CPU_CAPABILITY::entr_kernel); REGISTER_DISPATCH(frexp_stub, &CPU_CAPABILITY::frexp_kernel); REGISTER_DISPATCH(special_i0e_stub, &CPU_CAPABILITY::i0e_kernel); REGISTER_DISPATCH(special_ndtri_stub, &CPU_CAPABILITY::ndtri_kernel); +REGISTER_DISPATCH(special_log_ndtr_stub, &CPU_CAPABILITY::log_ndtr_kernel); REGISTER_DISPATCH(special_i1_stub, &CPU_CAPABILITY::i1_kernel); REGISTER_DISPATCH(special_i1e_stub, &CPU_CAPABILITY::i1e_kernel); REGISTER_DISPATCH(special_erfcx_stub, &CPU_CAPABILITY::erfcx_kernel); diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index cc3a6b68d43e..9bfa9ac8c6ab 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -1,8 +1,11 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include #include #include #include #include #include +#include #include namespace at { @@ -116,6 +119,61 @@ static void unfolded2d_acc( }); } +template +static void unfolded2d_acc_channels_last( + scalar_t* finput_data, + scalar_t* input_data, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width) { + + for (int64_t y = 0; y < output_height; y++) { + for (int64_t x = 0; x < output_width; x++) { + scalar_t* src = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane; + scalar_t* dst = input_data; + + if (padW > 0 || padH > 0) { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH - padH + kh; + int64_t ix = x * dW - padW + kw; + if (iy < 0 || iy >= input_height || ix < 0 || ix >= input_width) { + } else { + scalar_t* dst_slice = dst + iy * input_width * n_input_plane + ix * n_input_plane; + scalar_t* src_slice = src + kh * kW * n_input_plane + kw * n_input_plane; + cadd(dst_slice, + dst_slice, + src_slice, + n_input_plane); + } + } + } + } else { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH + kh; + int64_t ix = x * dW + kw; + scalar_t* dst_slice = dst + iy * input_width * n_input_plane + ix * n_input_plane; + scalar_t* src_slice = src + kh * kW * n_input_plane + kw * n_input_plane; + cadd(dst_slice, + dst_slice, + src_slice, + n_input_plane); + } + } + } + } + } +} + /* note: due to write issues, this one cannot be parallelized as well as * unfolded2d_copy */ void unfolded2d_acc_kernel( @@ -132,28 +190,41 @@ void unfolded2d_acc_kernel( int64_t input_height, int64_t input_width, int64_t output_height, - int64_t output_width) { + int64_t output_width, + bool is_channels_last) { // This function assumes that // output_height*dH does not overflow a int64_t // output_width*dW does not overflow a int64_t - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, dtype, 
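// [Editorial sketch, not part of the patch] The new unfolded2d_acc_channels_last above
// indexes NHWC memory as (iy * input_width + ix) * n_input_plane, so each kernel tap
// touches a contiguous run of channels, and taps that land in the padding region are
// simply skipped. Minimal stand-in (acc_tap is a hypothetical helper name):
#include <cstdint>
#include <cstdio>
#include <vector>

void acc_tap(std::vector<float>& input, const float* src,
             int64_t iy, int64_t ix,
             int64_t input_height, int64_t input_width, int64_t channels) {
  if (iy < 0 || iy >= input_height || ix < 0 || ix >= input_width) {
    return;  // tap falls in the zero padding: nothing to accumulate
  }
  float* dst = input.data() + (iy * input_width + ix) * channels;
  for (int64_t c = 0; c < channels; ++c) {
    dst[c] += src[c];  // the kernel uses a vectorized cadd() here
  }
}

int main() {
  const int64_t H = 4, W = 4, C = 3;
  std::vector<float> input(H * W * C, 0.f);
  std::vector<float> col(C, 1.f);
  acc_tap(input, col.data(), 1, 2, H, W, C);   // in range: accumulates
  acc_tap(input, col.data(), -1, 2, H, W, C);  // padding: ignored
  std::printf("%g\n", input[(1 * W + 2) * C]);  // 1
}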
"unfolded2d_acc", [&] { - unfolded2d_acc( - static_cast(finput_data), - static_cast(input_data), - kH, - kW, - dH, - dW, - padH, - padW, - n_input_plane, - input_height, - input_width, - output_height, - output_width); + if (is_channels_last) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_acc_channels_last", [&] { + unfolded2d_acc_channels_last( + static_cast(finput_data), + static_cast(input_data), + kH, kW, + dH, dW, + padH, padW, + n_input_plane, + input_height, + input_width, + output_height, + output_width); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_acc", [&] { + unfolded2d_acc( + static_cast(finput_data), + static_cast(input_data), + kH, kW, + dH, dW, + padH, padW, + n_input_plane, + input_height, + input_width, + output_height, + output_width); }); + } } template @@ -263,6 +334,64 @@ static void unfolded2d_copy( }); } +template +static void unfolded2d_copy_channels_last( + scalar_t* input_data, + scalar_t* finput_data, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width) { + at::parallel_for(0, output_height * output_width, 0, [&](int64_t start, int64_t end) { + int64_t y = 0; + int64_t x = 0; + data_index_init(start, y, output_height, x, output_width); + + for (const auto k : c10::irange(start, end)) { + (void)k; // Suppress unused variable warning + scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane; + scalar_t* src = input_data; + + if (padW > 0 || padH > 0) { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH - padH + kh; + int64_t ix = x * dW - padW + kw; + if (iy < 0 || iy >= input_height || ix < 0 || ix >= input_width) { + memset(dst + kh * kW * n_input_plane + kw * n_input_plane, + 0, + sizeof(scalar_t) * n_input_plane); + } else { + memcpy(dst + kh * kW * n_input_plane + kw * n_input_plane, + src + iy * input_width * n_input_plane + ix * n_input_plane, + sizeof(scalar_t) * n_input_plane); + } + } + } + } else { + for (int64_t kh = 0; kh < kH; kh++) { + for (int64_t kw = 0; kw < kW; kw++) { + int64_t iy = y * dH + kh; + int64_t ix = x * dW + kw; + memcpy(dst + kh * kW * n_input_plane + kw * n_input_plane, + src + iy * input_width * n_input_plane + ix * n_input_plane, + sizeof(scalar_t) * n_input_plane); + } + } + } + // move on to next output index + data_index_step(y, output_height, x, output_width); + } + }); +} + void unfolded2d_copy_kernel( ScalarType dtype, void *finput_data, @@ -277,30 +406,43 @@ void unfolded2d_copy_kernel( int64_t input_height, int64_t input_width, int64_t output_height, - int64_t output_width) { + int64_t output_width, + bool is_channels_last) { // This function assumes that // kH*kW does not overflow an int // n_input_plane*kH*kW does not overflow a int64_t // output_height*dH does not overflow a int64_t // output_width*dW does not overflow a int64_t - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::BFloat16, dtype, "unfolded2d_copy", [&] { - unfolded2d_copy( - static_cast(input_data), - static_cast(finput_data), - kH, - kW, - dH, - dW, - padH, - padW, + if (is_channels_last) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_copy_channels_last", [&] { + unfolded2d_copy_channels_last( + static_cast(input_data), + static_cast(finput_data), + kH, kW, + dH, dW, + padH, padW, n_input_plane, 
input_height, input_width, output_height, output_width); - }); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, dtype, "unfolded2d_copy", [&] { + unfolded2d_copy( + static_cast(input_data), + static_cast(finput_data), + kH, kW, + dH, dW, + padH, padW, + n_input_plane, + input_height, + input_width, + output_height, + output_width); + }); + } } } // namespace diff --git a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp index b226b68bbca1..8cfe6674906e 100644 --- a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp +++ b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp @@ -1,3 +1,5 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index 88bdbd71d1ee..cfc931862372 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -1,51 +1,28 @@ -#include - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include -#include #include +#include #include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { namespace { using scale_t = std::vector>; -static inline int64_t nearest_idx( - int64_t output_index, - int64_t input_size, - int64_t output_size, - c10::optional scales) { - // This method specificly treats cases: output_size == input_size or - // output_size == 2 * input_size, that we would like to get rid of - // We keep this method for BC and consider as deprecated. - // See nearest_exact_idx as replacement - if (output_size == input_size) { - // scale_factor = 1, simply copy - return output_index; - } else if (output_size == 2 * input_size) { - // scale_factor = 2, shift input index - return output_index >> 1; - } else { - float scale = compute_scales_value(scales, input_size, output_size); - return nearest_neighbor_compute_source_index(scale, output_index, input_size); - } -} - -static inline int64_t nearest_exact_idx( - int64_t output_index, - int64_t input_size, - int64_t output_size, - c10::optional scales) { - float scale = compute_scales_value(scales, input_size, output_size); - return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); -} - -// Define a typedef to dispatch to nearest_idx or nearest_exact_idx -typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional); - // Helper structs and methods for cpu_upsample_linear // // Interpolation methods that used below are separable, and as such we can compute the interpolation @@ -147,7 +124,6 @@ template static inline scalar_t interpolate_aa_single_dim_zero_strides( char* src, char** data, - int64_t i, const index_t ids_stride) { const index_t ids_min = *(index_t*)&data[0][0]; const index_t ids_size = *(index_t*)&data[1][0]; @@ -259,7 +235,7 @@ struct CheckAlmostAllZeroStrides { template struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, interp_size> { - static inline bool eval(const int64_t* strides) { + static inline bool eval(const int64_t* /*strides*/) { return true; } }; @@ -293,7 +269,7 @@ static inline void basic_loop_aa_single_dim_zero_strides( for (const auto i : c10::irange(n)) { *(scalar_t*)&dst[i * strides[0]] = interpolate_aa_single_dim_zero_strides( - src + i * strides[1], &data[2], i, ids_stride); + src + i * strides[1], &data[2], ids_stride); } } @@ -452,6 +428,16 @@ void 
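// [Editorial sketch, not part of the patch] The nearest_idx helper deleted above (the
// kernels now take it from a shared header) maps an output coordinate back to its
// nearest-neighbor source coordinate, with fast paths for the 1x and 2x cases it keeps
// for backward compatibility. Hedged standalone version of that mapping:
#include <algorithm>
#include <cstdint>
#include <cstdio>

int64_t nearest_idx(int64_t out_i, int64_t in_size, int64_t out_size) {
  if (out_size == in_size) {
    return out_i;                    // scale factor 1: plain copy
  } else if (out_size == 2 * in_size) {
    return out_i >> 1;               // scale factor 2: halve the index
  }
  float scale = static_cast<float>(in_size) / out_size;
  return std::min(static_cast<int64_t>(out_i * scale), in_size - 1);
}

int main() {
  for (int64_t i = 0; i < 6; ++i) {
    std::printf("%lld -> %lld\n", (long long)i, (long long)nearest_idx(i, 4, 6));
  }
}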
cpu_upsample_nearest_channels_last( } } +template +inline VecType interpolate(const scalar_t* t, accscalar_t w) { + return VecType::loadu(t) * VecType(w); +} + +template +inline VecType interpolate(const scalar_t* t, accscalar_t w, Args... args) { + return VecType::loadu(t) * VecType(w) + interpolate(args...); +} + template void cpu_upsample_linear_channels_last( const Tensor& output_, @@ -485,6 +471,7 @@ void cpu_upsample_linear_channels_last( TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels); int64_t output_slice_size = output_depth * output_height * output_width * channels; + using accscalar_t = at::acc_type; using Vec = vec::Vectorized; auto loop2d = [&](int64_t begin, int64_t end) { const scalar_t height_scale = area_pixel_compute_scale( @@ -514,23 +501,19 @@ void cpu_upsample_linear_channels_last( scalar_t* i01 = input_indexr(n, ih0, iw1); scalar_t* i10 = input_indexr(n, ih1, iw0); scalar_t* i11 = input_indexr(n, ih1, iw1); + accscalar_t w00 = h0lambda * w0lambda; + accscalar_t w01 = h0lambda * w1lambda; + accscalar_t w10 = h1lambda * w0lambda; + accscalar_t w11 = h1lambda * w1lambda; int64_t size = channels; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec out_vec = - Vec(h0lambda * w0lambda) * Vec::loadu(i00 + d) + /* h0 * w0 * i00 */ - Vec(h0lambda * w1lambda) * Vec::loadu(i01 + d) + /* h0 * w1 * i01 */ - Vec(h1lambda * w0lambda) * Vec::loadu(i10 + d) + /* h1 * w0 * i10 */ - Vec(h1lambda * w1lambda) * Vec::loadu(i11 + d); /* h1 * w1 * i11 */ + auto out_vec = interpolate(i00 + d, w00, i01 + d, w01, i10 + d, w10, i11 + d, w11); out_vec.store(out + d); } for (; d < size; d++) { - out[d] = - h0lambda * w0lambda * i00[d] + /* h0 * w0 * i00 */ - h0lambda * w1lambda * i01[d] + /* h0 * w1 * i01 */ - h1lambda * w0lambda * i10[d] + /* h1 * w0 * i10 */ - h1lambda * w1lambda * i11[d]; /* h1 * w1 * i11 */ + out[d] = i00[d] * w00 + i01[d] * w01 + i10[d] * w10 + i11[d] * w11; } } } @@ -576,31 +559,27 @@ void cpu_upsample_linear_channels_last( scalar_t* i101 = input_indexr(n, id1, ih0, iw1); scalar_t* i110 = input_indexr(n, id1, ih1, iw0); scalar_t* i111 = input_indexr(n, id1, ih1, iw1); + accscalar_t w000 = d0lambda * h0lambda * w0lambda; + accscalar_t w001 = d0lambda * h0lambda * w1lambda; + accscalar_t w010 = d0lambda * h1lambda * w0lambda; + accscalar_t w011 = d0lambda * h1lambda * w1lambda; + accscalar_t w100 = d1lambda * h0lambda * w0lambda; + accscalar_t w101 = d1lambda * h0lambda * w1lambda; + accscalar_t w110 = d1lambda * h1lambda * w0lambda; + accscalar_t w111 = d1lambda * h1lambda * w1lambda; int64_t size = channels; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { - Vec out_vec = - Vec(d0lambda * h0lambda * w0lambda) * Vec::loadu(i000 + d) + /* d0 * h0 * w0 * i000 */ - Vec(d0lambda * h0lambda * w1lambda) * Vec::loadu(i001 + d) + /* d0 * h0 * w1 * i001 */ - Vec(d0lambda * h1lambda * w0lambda) * Vec::loadu(i010 + d) + /* d0 * h1 * w0 * i010 */ - Vec(d0lambda * h1lambda * w1lambda) * Vec::loadu(i011 + d) + /* d0 * h1 * w1 * i011 */ - Vec(d1lambda * h0lambda * w0lambda) * Vec::loadu(i100 + d) + /* d1 * h0 * w0 * i100 */ - Vec(d1lambda * h0lambda * w1lambda) * Vec::loadu(i101 + d) + /* d1 * h0 * w1 * i101 */ - Vec(d1lambda * h1lambda * w0lambda) * Vec::loadu(i110 + d) + /* d1 * h1 * w0 * i110 */ - Vec(d1lambda * h1lambda * w1lambda) * Vec::loadu(i111 + d); /* d1 * h1 * w1 * i111 */ + auto out_vec = interpolate( + i000 + d, w000, i001 + d, w001, i010 + d, w010, i011 + d, w011, + i100 + 
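// [Editorial sketch, not part of the patch] The new variadic interpolate() above folds
// (pointer, weight) pairs into loadu(t0)*w0 + loadu(t1)*w1 + ..., replacing the long
// hand-written bilinear/trilinear expressions. Scalar stand-in (a plain dereference
// plays the role of the vectorized load):
#include <cstdio>

template <typename T>
T interpolate(const T* t, T w) {
  return *t * w;                          // last tap
}

template <typename T, typename... Args>
T interpolate(const T* t, T w, Args... rest) {
  return *t * w + interpolate(rest...);   // fold the remaining taps
}

int main() {
  float i00 = 1.f, i01 = 2.f, i10 = 3.f, i11 = 4.f;
  // bilinear blend: the four weights come from the h/w lambdas and sum to 1
  float out = interpolate(&i00, 0.25f, &i01, 0.25f, &i10, 0.25f, &i11, 0.25f);
  std::printf("%g\n", out);  // 2.5
}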
d, w100, i101 + d, w101, i110 + d, w110, i111 + d, w111); out_vec.store(out + d); } for (; d < size; d++) { out[d] = - d0lambda * h0lambda * w0lambda * i000[d] + /* d0 * h0 * w0 * i000 */ - d0lambda * h0lambda * w1lambda * i001[d] + /* d0 * h0 * w1 * i001 */ - d0lambda * h1lambda * w0lambda * i010[d] + /* d0 * h1 * w0 * i010 */ - d0lambda * h1lambda * w1lambda * i011[d] + /* d0 * h1 * w1 * i011 */ - d1lambda * h0lambda * w0lambda * i100[d] + /* d1 * h0 * w0 * i100 */ - d1lambda * h0lambda * w1lambda * i101[d] + /* d1 * h0 * w1 * i101 */ - d1lambda * h1lambda * w0lambda * i110[d] + /* d1 * h1 * w0 * i110 */ - d1lambda * h1lambda * w1lambda * i111[d]; /* d1 * h1 * w1 * i111 */ + i000[d] * w000 + i001[d] * w001 + i010[d] * w010 + i011[d] * w011 + + i100[d] * w100 + i101[d] * w101 + i110[d] * w110 + i111[d] * w111; } } } @@ -675,7 +654,7 @@ struct HelperInterpBase { template static inline std::vector _compute_indices_weights_aa( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, - int64_t reshape_dim, bool align_corners, scalar_t scale, + int64_t reshape_dim, scalar_t scale, int interp_size, aa_filter_fn_t aa_filter_fn ) { @@ -786,8 +765,8 @@ struct HelperInterpNearest : public HelperInterpBase { HelperInterpNearest::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_nearest", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, scalar_type, "compute_indices_weights_nearest", [&] { scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -887,8 +866,8 @@ struct HelperInterpLinear : public HelperInterpBase { HelperInterpLinear::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpLinear::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_linear", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, scalar_type, "compute_indices_weights_linear", [&] { scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -956,7 +935,6 @@ struct HelperInterpLinear : public HelperInterpBase { stride, ndims, reshape_dim, - align_corners, scale, interp_size, &HelperInterpLinear::aa_filter); @@ -990,8 +968,8 @@ struct HelperInterpCubic : public HelperInterpBase { HelperInterpCubic::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpCubic::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_cubic", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, scalar_type, "compute_indices_weights_cubic", [&] { scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1068,7 +1046,6 @@ struct HelperInterpCubic : public HelperInterpBase { stride, ndims, reshape_dim, - align_corners, scale, interp_size, &HelperInterpCubic::aa_filter); @@ -1114,7 +1091,7 @@ void upsample_generic_Nd_kernel_impl( constexpr int interp_size = F::interp_size; auto input_scalar_type = input.scalar_type(); - if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) { + if ((interp_size == 1 && input_scalar_type == at::ScalarType::Byte)) { // nearest also supports uint8 tensor, but we have to use float // with compute_indices_weights input_scalar_type = at::ScalarType::Float; @@ -1147,14 +1124,14 @@ void upsample_generic_Nd_kernel_impl( if (interp_size > 1) { // Nearest also supports uint8 tensor, so need to 
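// [Editorial sketch, not part of the patch] The index/weight helpers above start from
// area_pixel_compute_scale, which picks the input/output ratio according to
// align_corners (endpoints mapped onto endpoints) or not (pixel centers aligned).
// Hedged standalone version of that choice, ignoring the optional explicit scale:
#include <cstdint>
#include <cstdio>

double area_pixel_scale(int64_t in_size, int64_t out_size, bool align_corners) {
  if (align_corners) {
    return out_size > 1 ? static_cast<double>(in_size - 1) / (out_size - 1) : 0.0;
  }
  return out_size > 0 ? static_cast<double>(in_size) / out_size : 0.0;
}

int main() {
  std::printf("align_corners: %g, centers: %g\n",
              area_pixel_scale(4, 8, true), area_pixel_scale(4, 8, false));
  // align_corners: 0.428571, centers: 0.5
}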
handle it separately - AT_DISPATCH_FLOATING_TYPES( - iter.dtype(), "upsample_generic_Nd", [&] { + AT_DISPATCH_FLOATING_TYPES_AND( + at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] { // MSVC can not catch constexpr int interp_size here constexpr int mode = F::interp_size; cpu_upsample_generic(iter); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] { constexpr int mode = F::interp_size; cpu_upsample_generic(iter); @@ -1295,7 +1272,8 @@ void upsample_nearest2d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); }); } else { @@ -1326,7 +1304,8 @@ void upsample_nearest3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); }); } else { @@ -1369,7 +1348,7 @@ void upsample_bilinear2d_kernel_impl( // Temporarily dispatch to original channels last implementation if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_h, scales_w}); }); } else { @@ -1397,7 +1376,7 @@ void upsample_trilinear3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_d, scales_h, scales_w}); }); } else { @@ -1427,156 +1406,6 @@ void upsample_bicubic2d_aa_kernel_impl( output, input, align_corners, {scales_h, scales_w}); } -template -void cpu_upsample_nearest_backward( - const Tensor& grad_input_, - const Tensor& grad_output_, - const scale_type& scales) { - TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), - " for `grad_input` but got dtype ", grad_input_.dtype()); - - auto grad_output = grad_output_.contiguous(); - auto grad_input = grad_input_.contiguous(); - - auto grad_output_data = grad_output.data_ptr(); - auto grad_input_data = grad_input.data_ptr(); - auto input_sizes = grad_input.sizes().vec(); - auto output_sizes = grad_output.sizes().vec(); - auto ndim = input_sizes.size(); - - // treat nbatch and channels as one dimension - int64_t channels = input_sizes[0] * input_sizes[1]; - int64_t input_depth = (ndim == 5) ? 
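// [Editorial sketch, not part of the patch] The cpu_upsample_nearest_backward block being
// removed here (it reappears in UpSampleMoreKernel.cpp further down) is a scatter-add:
// each output gradient is accumulated into the one input cell it was copied from in the
// forward pass. Minimal 1-D illustration:
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int64_t nearest_src(int64_t ow, int64_t in_w, int64_t out_w) {
  double scale = static_cast<double>(in_w) / out_w;
  return std::min(static_cast<int64_t>(ow * scale), in_w - 1);
}

int main() {
  const int64_t in_w = 3, out_w = 6;
  std::vector<double> grad_out(out_w, 1.0), grad_in(in_w, 0.0);
  for (int64_t ow = 0; ow < out_w; ++ow) {
    grad_in[nearest_src(ow, in_w, out_w)] += grad_out[ow];  // scatter-add
  }
  std::printf("%g %g %g\n", grad_in[0], grad_in[1], grad_in[2]);  // 2 2 2
}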
input_sizes[2] : 1; - int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; - int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; - int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; - int64_t input_width = input_sizes[ndim - 1]; - int64_t output_width = output_sizes[ndim - 1]; - - int64_t output_slice_size = output_depth * output_height * output_width; - int64_t input_slice_size = input_depth * input_height * input_width; - - auto loop1d = [&](int64_t begin, int64_t end) { - for (const auto c : c10::irange(begin, end)) { - for (const auto ow : c10::irange(output_width)) { - int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[0]); - int64_t output_offset = c * output_slice_size + ow; - int64_t input_offset = c * input_slice_size + iw; - grad_input_data[input_offset] += grad_output_data[output_offset]; - } - } - }; - - auto loop2d = [&](int64_t begin, int64_t end) { - for (const auto c : c10::irange(begin, end)) { - for (const auto oh : c10::irange(output_height)) { - int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); - for (const auto ow : c10::irange(output_width)) { - int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); - int64_t output_offset = c * output_slice_size + oh * output_width + ow; - int64_t input_offset = c * input_slice_size + ih * input_width + iw; - grad_input_data[input_offset] += grad_output_data[output_offset]; - } - } - } - }; - - auto loop3d = [&](int64_t begin, int64_t end) { - for (const auto c : c10::irange(begin, end)) { - for (const auto od : c10::irange(output_depth)) { - int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]); - for (const auto oh : c10::irange(output_height)) { - int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); - for (const auto ow : c10::irange(output_width)) { - int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); - int64_t output_offset = c * output_slice_size + - od * output_height * output_width + oh * output_width + ow; - int64_t input_offset = c * input_slice_size + - id * input_height * input_width + ih * input_width + iw; - grad_input_data[input_offset] += grad_output_data[output_offset]; - } - } - } - } - }; - - if (ndim == 3) { - // upsample nearest 1d - at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop1d); - } else if (ndim == 4) { - // upsample nearest 2d - at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size , loop2d); - } else { - // upsample nearest 3d - TORCH_INTERNAL_ASSERT(ndim == 5); - at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop3d); - } - - if (!grad_input_.is_contiguous()) { - grad_input_.copy_(grad_input); - } -} - -void upsample_nearest1d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); - }); -} - -void _upsample_nearest_exact1d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); - }); -} - -void upsample_nearest2d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { - 
AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); - }); -} - -void _upsample_nearest_exact2d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_h, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); - }); -} - -void upsample_nearest3d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); -} - -void _upsample_nearest_exact3d_backward_kernel_impl( - const Tensor& grad_input, - const Tensor& grad_output, - c10::optional scales_d, - c10::optional scales_h, - c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); -} - template < typename scalar_t, typename scale_type, @@ -1726,12 +1555,6 @@ REGISTER_DISPATCH(upsample_nearest2d_kernel, &upsample_nearest2d_kernel_impl); REGISTER_DISPATCH(_upsample_nearest_exact2d_kernel, &_upsample_nearest_exact2d_kernel_impl); REGISTER_DISPATCH(upsample_nearest3d_kernel, &upsample_nearest3d_kernel_impl); REGISTER_DISPATCH(_upsample_nearest_exact3d_kernel, &_upsample_nearest_exact3d_kernel_impl); -REGISTER_DISPATCH(upsample_nearest1d_backward_kernel, &upsample_nearest1d_backward_kernel_impl); -REGISTER_DISPATCH(_upsample_nearest_exact1d_backward_kernel, &_upsample_nearest_exact1d_backward_kernel_impl); -REGISTER_DISPATCH(upsample_nearest2d_backward_kernel, &upsample_nearest2d_backward_kernel_impl); -REGISTER_DISPATCH(_upsample_nearest_exact2d_backward_kernel, &_upsample_nearest_exact2d_backward_kernel_impl); -REGISTER_DISPATCH(upsample_nearest3d_backward_kernel, &upsample_nearest3d_backward_kernel_impl); -REGISTER_DISPATCH(_upsample_nearest_exact3d_backward_kernel, &_upsample_nearest_exact3d_backward_kernel_impl); REGISTER_DISPATCH(upsample_linear1d_kernel, &upsample_linear1d_kernel_impl); REGISTER_DISPATCH(upsample_bilinear2d_kernel, &upsample_bilinear2d_kernel_impl); diff --git a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp index 22ab12bad12a..a26cef72bb10 100644 --- a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp @@ -1,13 +1,13 @@ -// NOLINTNEXTLINE(modernize-deprecated-headers) -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include #include -#include +#include #include +#include namespace at { namespace native { @@ -15,6 +15,260 @@ namespace { using scale_t = std::vector>; +template +void cpu_upsample_nearest_backward( + const Tensor& grad_input_, + const Tensor& grad_output_, + const scale_type& scales) { + TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), + " for `grad_input` but got dtype ", grad_input_.dtype()); + + auto grad_output = grad_output_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = 
grad_input.data_ptr(); + auto input_sizes = grad_input.sizes().vec(); + auto output_sizes = grad_output.sizes().vec(); + auto ndim = input_sizes.size(); + + // treat nbatch and channels as one dimension + int64_t channels = input_sizes[0] * input_sizes[1]; + int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; + int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; + int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; + int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; + int64_t input_width = input_sizes[ndim - 1]; + int64_t output_width = output_sizes[ndim - 1]; + + int64_t output_slice_size = output_depth * output_height * output_width; + int64_t input_slice_size = input_depth * input_height * input_width; + + auto loop1d = [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[0]); + int64_t output_offset = c * output_slice_size + ow; + int64_t input_offset = c * input_slice_size + iw; + grad_input_data[input_offset] += grad_output_data[output_offset]; + } + } + }; + + auto loop2d = [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); + int64_t output_offset = c * output_slice_size + oh * output_width + ow; + int64_t input_offset = c * input_slice_size + ih * input_width + iw; + grad_input_data[input_offset] += grad_output_data[output_offset]; + } + } + } + }; + + auto loop3d = [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + for (const auto od : c10::irange(output_depth)) { + int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]); + for (const auto oh : c10::irange(output_height)) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); + int64_t output_offset = c * output_slice_size + + od * output_height * output_width + oh * output_width + ow; + int64_t input_offset = c * input_slice_size + + id * input_height * input_width + ih * input_width + iw; + grad_input_data[input_offset] += grad_output_data[output_offset]; + } + } + } + } + }; + + if (ndim == 3) { + // upsample nearest 1d + at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop1d); + } else if (ndim == 4) { + // upsample nearest 2d + at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size , loop2d); + } else { + // upsample nearest 3d + TORCH_INTERNAL_ASSERT(ndim == 5); + at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size, loop3d); + } + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_upsample_nearest_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + const scale_type& scales) { + TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), + " for `grad_input` but got dtype ", grad_input_.dtype()); + + auto ndim = grad_output_.ndimension(); + TORCH_CHECK(ndim >=4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims.") + + auto channels_last_memory_format = ndim == 
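// [Editorial sketch, not part of the patch] The acc lambda above is the usual
// "vector body + scalar tail" accumulate: full vector-width chunks go through
// loadu/add/store, and the leftover elements are handled one by one. Plain-loop
// stand-in (kLanes is a made-up constant standing in for the vector width):
#include <cstdint>
#include <cstdio>

void acc(float* gin, const float* gout, int64_t size) {
  constexpr int64_t kLanes = 8;
  int64_t d = 0;
  for (; d < size - (size % kLanes); d += kLanes) {
    for (int64_t k = 0; k < kLanes; ++k) {  // one vector loadu/add/store in the kernel
      gin[d + k] += gout[d + k];
    }
  }
  for (; d < size; ++d) {                   // scalar tail
    gin[d] += gout[d];
  }
}

int main() {
  float gin[11] = {0.f}, gout[11];
  for (int i = 0; i < 11; ++i) gout[i] = 1.f;
  acc(gin, gout, 11);
  std::printf("%g %g\n", gin[0], gin[10]);  // 1 1
}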
4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d; + auto grad_output = grad_output_.contiguous(channels_last_memory_format); + auto grad_input = grad_input_.contiguous(channels_last_memory_format); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.data_ptr(); + + auto input_sizes = grad_input.sizes().vec(); + auto output_sizes = grad_output.sizes().vec(); + + int64_t num_batches = input_sizes[0]; + int64_t channels = input_sizes[1]; + int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; + int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; + int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; + int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; + int64_t input_width = input_sizes[ndim - 1]; + int64_t output_width = output_sizes[ndim - 1]; + + using Vec = vec::Vectorized; + auto acc = [](scalar_t* gin, scalar_t* gout, int64_t size) { + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += gout[d]; + } + }; + + auto loop2d = [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); + for (const auto ow : c10::irange(output_width)) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); + scalar_t* grad_output_ptr = grad_output_data + + (n * output_height * output_width + oh * output_width + ow) * channels; + scalar_t* grad_input_ptr = grad_input_data + + (n * input_height * input_width + ih * input_width + iw) * channels; + acc(grad_input_ptr, grad_output_ptr, channels); + } + } + } + }; + + auto loop3d = [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + for (int64_t od = 0; od < output_depth; od++) { + int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]); + for (int64_t oh = 0; oh < output_height; oh++) { + int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); + for (int64_t ow = 0; ow < output_width; ow++) { + int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); + scalar_t* grad_output_ptr = grad_output_data + + (n * output_depth * output_height * output_width + + od * output_height * output_width + oh * output_width + ow) * channels; + scalar_t* grad_input_ptr = grad_input_data + + (n * input_depth * input_height * input_width + + id * input_height * input_width + ih * input_width + iw) * channels; + acc(grad_input_ptr, grad_output_ptr, channels); + } + } + } + } + }; + + if (ndim == 4) { + // upsample nearest 2d + at::parallel_for(0, num_batches, 0, loop2d); + } else { + // upsample nearest 3d + TORCH_INTERNAL_ASSERT(ndim == 5); + at::parallel_for(0, num_batches, 0, loop3d); + } + + if (!grad_input_.is_contiguous(channels_last_memory_format)) { + grad_input_.copy_(grad_input); + } +} + +void upsample_nearest1d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_w) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); + }); +} + +void _upsample_nearest_exact1d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_w) { + 
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); + }); +} + +void upsample_nearest2d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_h, + c10::optional scales_w) { + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); + }); + } +} + +void _upsample_nearest_exact2d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_h, + c10::optional scales_w) { + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); + }); + } +} + +void upsample_nearest3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); +} + +void _upsample_nearest_exact3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + c10::optional scales_d, + c10::optional scales_h, + c10::optional scales_w) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); +} template void cpu_upsample_linear_backward( @@ -156,12 +410,143 @@ void cpu_upsample_linear_backward( } } +template +void cpu_upsample_linear_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + bool align_corners, + const scale_type& scales) { + TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(), + " for `grad_input` but got dtype ", grad_input_.dtype()); + + auto ndim = grad_output_.ndimension(); + TORCH_CHECK(ndim >=4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims.") + + auto channels_last_memory_format = ndim == 4 ? 
at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d; + auto grad_output = grad_output_.contiguous(channels_last_memory_format); + auto grad_input = grad_input_.contiguous(channels_last_memory_format); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.data_ptr(); + + auto input_sizes = grad_input.sizes().vec(); + auto output_sizes = grad_output.sizes().vec(); + + int64_t num_batches = input_sizes[0]; + int64_t channels = input_sizes[1]; + int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1; + int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1; + int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1; + int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1; + int64_t input_width = input_sizes[ndim - 1]; + int64_t output_width = output_sizes[ndim - 1]; + + using accscalar_t = at::acc_type; + using Vec = vec::Vectorized; + auto acc = [](scalar_t* gin, scalar_t* gout, accscalar_t w, int64_t size) { + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec(w) * Vec::loadu(gout + d); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += w * gout[d]; + } + }; + + auto loop2d = [&](int64_t begin, int64_t end) { + const scalar_t height_scale = area_pixel_compute_scale( + input_height, output_height, align_corners, scales[0]); + const scalar_t width_scale = area_pixel_compute_scale( + input_width, output_width, align_corners, scales[1]); + + auto input_indexr = [=](int64_t n, int64_t h, int64_t w){ + return grad_input_data + (n * input_height * input_width + h * input_width + w) * channels; + }; + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t ih0, ih1, iw0, iw1; + scalar_t h0lambda, h1lambda, w0lambda, w1lambda; + for (const auto n : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { + compute_source_index_and_lambda( + ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); + for (const auto ow : c10::irange(output_width)) { + compute_source_index_and_lambda( + iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); + scalar_t* grad_output_ptr = grad_output_data + + (n * output_height * output_width + oh * output_width + ow) * channels; + acc(input_indexr(n, ih0, iw0), grad_output_ptr, h0lambda * w0lambda, channels); /* i00 */ + acc(input_indexr(n, ih0, iw1), grad_output_ptr, h0lambda * w1lambda, channels); /* i01 */ + acc(input_indexr(n, ih1, iw0), grad_output_ptr, h1lambda * w0lambda, channels); /* i10 */ + acc(input_indexr(n, ih1, iw1), grad_output_ptr, h1lambda * w1lambda, channels); /* i11 */ + } + } + } + }; + + auto loop3d = [&](int64_t begin, int64_t end) { + const scalar_t depth_scale = area_pixel_compute_scale( + input_depth, output_depth, align_corners, scales[0]); + const scalar_t height_scale = area_pixel_compute_scale( + input_height, output_height, align_corners, scales[1]); + const scalar_t width_scale = area_pixel_compute_scale( + input_width, output_width, align_corners, scales[2]); + + auto input_indexr = [=](int64_t n, int64_t d, int64_t h, int64_t w) { + return grad_input_data + (n * input_depth * input_height * input_width + + d * input_height * input_width + h * input_width + w) * channels; + }; + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + int64_t id0, id1, ih0, ih1, iw0, iw1; + scalar_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda; + for (const auto n : c10::irange(begin, end)) { 
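// [Editorial sketch, not part of the patch] compute_source_index_and_lambda, called per
// output coordinate in the loops above, yields the two neighbouring input indices and
// their blend weights (the lambdas sum to 1). Hedged standalone version, ignoring the
// same-size fast path of the real helper:
#include <algorithm>
#include <cstdint>
#include <cstdio>

void source_index_and_lambda(int64_t& i0, int64_t& i1, double& l0, double& l1,
                             double scale, int64_t out_i, int64_t in_size,
                             bool align_corners) {
  double real = align_corners ? scale * out_i
                              : std::max(scale * (out_i + 0.5) - 0.5, 0.0);
  i0 = static_cast<int64_t>(real);
  i1 = std::min(i0 + 1, in_size - 1);
  l1 = real - i0;   // weight of the upper neighbour
  l0 = 1.0 - l1;    // weight of the lower neighbour
}

int main() {
  int64_t i0 = 0, i1 = 0;
  double l0 = 0.0, l1 = 0.0;
  source_index_and_lambda(i0, i1, l0, l1, /*scale=*/0.5, /*out_i=*/3, /*in_size=*/4, false);
  std::printf("i0=%lld i1=%lld l0=%g l1=%g\n", (long long)i0, (long long)i1, l0, l1);
  // i0=1 i1=2 l0=0.75 l1=0.25
}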
+ for (const auto od : c10::irange(output_depth)) { + compute_source_index_and_lambda( + id0, id1, d0lambda, d1lambda, depth_scale, od, input_depth, output_depth, align_corners); + for (const auto oh : c10::irange(output_height)) { + compute_source_index_and_lambda( + ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); + for (const auto ow : c10::irange(output_width)) { + compute_source_index_and_lambda( + iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); + scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + + od * output_height * output_width + oh * output_width + ow) * channels; + acc(input_indexr(n, id0, ih0, iw0), grad_output_ptr, d0lambda * h0lambda * w0lambda, channels); /* i000 */ + acc(input_indexr(n, id0, ih0, iw1), grad_output_ptr, d0lambda * h0lambda * w1lambda, channels); /* i001 */ + acc(input_indexr(n, id0, ih1, iw0), grad_output_ptr, d0lambda * h1lambda * w0lambda, channels); /* i010 */ + acc(input_indexr(n, id0, ih1, iw1), grad_output_ptr, d0lambda * h1lambda * w1lambda, channels); /* i011 */ + acc(input_indexr(n, id1, ih0, iw0), grad_output_ptr, d1lambda * h0lambda * w0lambda, channels); /* i100 */ + acc(input_indexr(n, id1, ih0, iw1), grad_output_ptr, d1lambda * h0lambda * w1lambda, channels); /* i101 */ + acc(input_indexr(n, id1, ih1, iw0), grad_output_ptr, d1lambda * h1lambda * w0lambda, channels); /* i110 */ + acc(input_indexr(n, id1, ih1, iw1), grad_output_ptr, d1lambda * h1lambda * w1lambda, channels); /* i111 */ + } + } + } + } + }; + + if (ndim == 4) { + // upsample bilinear 2d + at::parallel_for(0, num_batches, 0, loop2d); + } else { + // upsample trilinear 3d + TORCH_INTERNAL_ASSERT(ndim == 5); + at::parallel_for(0, num_batches, 0, loop3d); + } + + if (!grad_input_.is_contiguous(channels_last_memory_format)) { + grad_input_.copy_(grad_input); + } +} + void upsample_linear1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, bool align_corners, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_linear1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_w}); }); } @@ -172,9 +557,15 @@ void upsample_bilinear2d_backward_kernel_impl( bool align_corners, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { - cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { + cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { + cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_h, scales_w}); + }); + } } void upsample_trilinear3d_backward_kernel_impl( @@ -184,13 +575,26 @@ void upsample_trilinear3d_backward_kernel_impl( c10::optional scales_d, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "upsample_trilinear3d_backward", 
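// [Editorial sketch, not part of the patch] weight_norm_first_dim_kernel above fuses the
// norm reduction and the rescale: norm[i] = ||v_i|| over the N trailing elements, then
// w_i = v_i * (g[i] / norm[i]). Scalar reference version of the same math, accumulating
// in a wider type in the spirit of the accscalar_t reduction:
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

void weight_norm_first_dim(std::vector<float>& w, std::vector<float>& norm,
                           const std::vector<float>& v, const std::vector<float>& g,
                           int64_t M, int64_t N) {
  for (int64_t i = 0; i < M; ++i) {
    double sum = 0.0;
    for (int64_t j = 0; j < N; ++j) {
      sum += static_cast<double>(v[i * N + j]) * v[i * N + j];
    }
    norm[i] = static_cast<float>(std::sqrt(sum));
    float a = g[i] / norm[i];                       // g / ||v_i||
    for (int64_t j = 0; j < N; ++j) {
      w[i * N + j] = v[i * N + j] * a;
    }
  }
}

int main() {
  std::vector<float> v{3.f, 4.f}, g{10.f}, w(2), norm(1);
  weight_norm_first_dim(w, norm, v, g, 1, 2);
  std::printf("norm=%g w=(%g, %g)\n", norm[0], w[0], w[1]);  // norm=5 w=(6, 8)
}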
[&] { - cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { + cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward", [&] { + cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); + }); + } } } // anonymous namespace +REGISTER_DISPATCH(upsample_nearest1d_backward_kernel, &upsample_nearest1d_backward_kernel_impl); +REGISTER_DISPATCH(_upsample_nearest_exact1d_backward_kernel, &_upsample_nearest_exact1d_backward_kernel_impl); +REGISTER_DISPATCH(upsample_nearest2d_backward_kernel, &upsample_nearest2d_backward_kernel_impl); +REGISTER_DISPATCH(_upsample_nearest_exact2d_backward_kernel, &_upsample_nearest_exact2d_backward_kernel_impl); +REGISTER_DISPATCH(upsample_nearest3d_backward_kernel, &upsample_nearest3d_backward_kernel_impl); +REGISTER_DISPATCH(_upsample_nearest_exact3d_backward_kernel, &_upsample_nearest_exact3d_backward_kernel_impl); + REGISTER_DISPATCH(upsample_linear1d_backward_kernel, &upsample_linear1d_backward_kernel_impl); REGISTER_DISPATCH(upsample_bilinear2d_backward_kernel, &upsample_bilinear2d_backward_kernel_impl); REGISTER_DISPATCH(upsample_trilinear3d_backward_kernel, &upsample_trilinear3d_backward_kernel_impl); diff --git a/aten/src/ATen/native/cpu/WeightNormKernel.cpp b/aten/src/ATen/native/cpu/WeightNormKernel.cpp new file mode 100644 index 000000000000..dfec0a49aeb1 --- /dev/null +++ b/aten/src/ATen/native/cpu/WeightNormKernel.cpp @@ -0,0 +1,437 @@ +#include + +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + +template +void weight_norm_first_dim_kernel( + Tensor& w, + Tensor& norm, + const Tensor& v, + const Tensor& g, + int64_t M, int64_t N) { + const auto v_data = v.data_ptr(); + const auto g_data = g.data_ptr(); + auto w_data = w.data_ptr(); + auto norm_data = norm.data_ptr(); + + using Vec = vec::Vectorized; + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + accscalar_t norm_val = vec::map_reduce_all( + [](Vec x) { return x * x; }, + [](Vec x, Vec y) { return x + y; }, + v_data + i * N, + N); + norm_val = std::sqrt(norm_val); + norm_data[i] = norm_val; + + accscalar_t a = g_data[i] / norm_val; + vec::map( + [a](Vec x) { return x * Vec(a); }, + w_data + i * N, + v_data + i * N, + N); + } + }); +} + +template +inline void sum_norm_per_row( + scalar_t* out_ptr, + const scalar_t* v_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map2( + [](Vec out, Vec v) { return out + v * v; }, + out_ptr, + out_ptr, + v_ptr, + size); +} + +inline void sum_norm_per_row( + float* out_ptr, + const BFloat16* v_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec out_fvec0 = fVec::loadu(out_ptr + d) + v_fvec0 * v_fvec0; + fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + v_fvec1 * v_fvec1; + 
out_fvec0.store(out_ptr + d); + out_fvec1.store(out_ptr + d + fVec::size()); + } + for(; d < size; ++d) { + float v_val = float(v_ptr[d]); + out_ptr[d] += v_val * v_val; + } +} + +template +inline void apply_norm_per_row( + scalar_t* w_ptr, + const scalar_t* v_ptr, + const scalar_t* a_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map2( + [](Vec v, Vec a) { return v * a; }, + w_ptr, + v_ptr, + a_ptr, + size); +} + +inline void apply_norm_per_row( + BFloat16* w_ptr, + const BFloat16* v_ptr, + const float* a_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec w_fvec0 = fVec::loadu(a_ptr + d) * v_fvec0; + fVec w_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * v_fvec1; + bVec w_bvec = convert_float_bfloat16(w_fvec0, w_fvec1); + w_bvec.store(w_ptr + d); + } + for(; d < size; ++d) { + w_ptr[d] = float(v_ptr[d]) * a_ptr[d]; + } +} + +template +void weight_norm_last_dim_kernel( + Tensor& w, + Tensor& norm, + const Tensor& v, + const Tensor& g, + int64_t M, int64_t N) { + const auto v_data = v.data_ptr(); + const auto g_data = g.data_ptr(); + auto w_data = w.data_ptr(); + auto norm_data = norm.data_ptr(); + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, N}, norm.options()).zero_(); + auto buffer_data = buffer.data_ptr(); + + // vertical parallel reduction + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + auto buffer_ptr = buffer_data + tid * N; + for (const auto i : c10::irange(begin, end)) { + sum_norm_per_row(buffer_ptr, v_data + i * N, N); + } + }); + + for (const auto j : c10::irange(N)) { + accscalar_t sum = 0; + for (const auto t : c10::irange(num_threads)) { + sum += buffer_data[t * N + j]; + } + norm_data[j] = std::sqrt(sum); + } + + // reuse the first row of buffer to store g / norm + vec::convert(g_data, buffer_data, N); + using Vec = vec::Vectorized; + vec::map2( + [](Vec g, Vec norm) { return g / norm; }, + buffer_data, + buffer_data, + norm_data, + N); + + // apply w = v * (g/norm) + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + apply_norm_per_row(w_data + i * N, v_data + i * N, buffer_data, N); + } + }); +} + +template +void weight_norm_backward_first_dim_kernel( + Tensor& grad_v, + Tensor& grad_g, + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t M, int64_t N) { + const auto grad_w_data = grad_w.data_ptr(); + const auto saved_v_data = saved_v.data_ptr(); + const auto saved_g_data = saved_g.data_ptr(); + const auto saved_norm_data = saved_norm.data_ptr(); + auto grad_v_data = grad_v.data_ptr(); + auto grad_g_data = grad_g.data_ptr(); + + using Vec = vec::Vectorized; + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + accscalar_t per_dim_sum_val = vec::map2_reduce_all( + [](Vec grad_w, Vec saved_v) { return grad_w * saved_v; }, + [](Vec x, Vec y) { return x + y; }, + grad_w_data + i * N, + saved_v_data + i * N, + N); + + accscalar_t saved_norm_val = saved_norm_data[i]; + accscalar_t saved_g_val = accscalar_t(saved_g_data[i]); + accscalar_t grad_g_val 
= per_dim_sum_val / saved_norm_val; + + // grad_g = sum / norm + // grad_v = (g / norm) * (grad_w - v * (sum / norm^2)) + // let a = g /norm + // b = a * grad_g / norm + // grad_v = a * grad_w - b * v + grad_g_data[i] = scalar_t(grad_g_val); + accscalar_t a = saved_g_val / saved_norm_val; + accscalar_t b = a * grad_g_val / saved_norm_val; + + vec::map2( + [a, b](Vec grad_w, Vec v) { return Vec(a) * grad_w - Vec(b) * v; }, + grad_v_data + i * N, + grad_w_data + i * N, + saved_v_data + i * N, + N); + } + }); +} + +template +inline void sum_product_per_row( + scalar_t* out_ptr, + const scalar_t* grad_w_ptr, + const scalar_t* v_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map3( + [](Vec out, Vec grad_w, Vec v) { return out + grad_w * v; }, + out_ptr, + out_ptr, + grad_w_ptr, + v_ptr, + size); +} + +inline void sum_product_per_row( + float* out_ptr, + const BFloat16* grad_w_ptr, + const BFloat16* v_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); + fVec grad_w_fvec0, grad_w_fvec1; + std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec out_fvec0 = fVec::loadu(out_ptr + d) + grad_w_fvec0 * v_fvec0; + fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + grad_w_fvec1 * v_fvec1; + out_fvec0.store(out_ptr + d); + out_fvec1.store(out_ptr + d + fVec::size()); + } + for(; d < size; ++d) { + float grad_w_val = float(grad_w_ptr[d]); + float v_val = float(v_ptr[d]); + out_ptr[d] += grad_w_val * v_val; + } +} + +template +inline void apply_per_row_backward( + scalar_t* grad_v_ptr, + const scalar_t* grad_w_ptr, + const scalar_t* v_ptr, + const scalar_t* a_ptr, + const scalar_t* b_ptr, + int64_t size) { + using Vec = vec::Vectorized; + vec::map4( + [](Vec grad_w, Vec v, Vec a, Vec b) { return a * grad_w - b * v; }, + grad_v_ptr, + grad_w_ptr, + v_ptr, + a_ptr, + b_ptr, + size); +} + +inline void apply_per_row_backward( + BFloat16* grad_v_ptr, + const BFloat16* grad_w_ptr, + const BFloat16* v_ptr, + const float* a_ptr, + const float* b_ptr, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); + fVec grad_w_fvec0, grad_w_fvec1; + std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + bVec v_bvec = bVec::loadu(v_ptr + d); + fVec v_fvec0, v_fvec1; + std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + + fVec grad_v_fvec0 = fVec::loadu(a_ptr + d) * grad_w_fvec0 - fVec::loadu(b_ptr + d) * v_fvec0; + fVec grad_v_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * grad_w_fvec1 + - fVec::loadu(b_ptr + d + fVec::size()) * v_fvec1; + bVec grad_v_bvec = convert_float_bfloat16(grad_v_fvec0, grad_v_fvec1); + grad_v_bvec.store(grad_v_ptr + d); + } + for(; d < size; ++d) { + grad_v_ptr[d] = float(grad_w_ptr[d]) * a_ptr[d] - float(v_ptr[d]) * b_ptr[d]; + } +} + +template +void weight_norm_backward_last_dim_kernel( + Tensor& grad_v, + Tensor& grad_g, + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t M, int64_t N) { + const auto grad_w_data = grad_w.data_ptr(); + const auto saved_v_data = saved_v.data_ptr(); + const auto saved_g_data = 
saved_g.data_ptr(); + const auto saved_norm_data = saved_norm.data_ptr(); + auto grad_v_data = grad_v.data_ptr(); + auto grad_g_data = grad_g.data_ptr(); + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, N}, saved_norm.options()).zero_(); + auto buffer_data = buffer.data_ptr(); + + // vertical parallel reduction + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + auto buffer_ptr = buffer_data + tid * N; + for (const auto i : c10::irange(begin, end)) { + sum_product_per_row(buffer_ptr, grad_w_data + i * N, saved_v_data + i * N, N); + } + }); + + // store result on the first row of buffer + for (const auto j : c10::irange(N)) { + accscalar_t sum = 0; + for (const auto t : c10::irange(num_threads)) { + sum += buffer_data[t * N + j]; + } + buffer_data[j] = sum; + } + + accscalar_t* per_dim_sum = buffer_data; + accscalar_t* a = buffer_data + N; + accscalar_t* b = buffer_data + 2 * N; + + // a = g /norm + // b = a * grad_g / norm + for (const auto j : c10::irange(N)) { + accscalar_t saved_norm_val = saved_norm_data[j]; + accscalar_t saved_g_val = accscalar_t(saved_g_data[j]); + accscalar_t grad_g_val = per_dim_sum[j] / saved_norm_val; + grad_g_data[j] = scalar_t(grad_g_val); + + a[j] = saved_g_val / saved_norm_val; + b[j] = a[j] * grad_g_val / saved_norm_val; + } + + // apply grad_v = a * grad_w - b * v + at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + apply_per_row_backward( + grad_v_data + i * N, + grad_w_data + i * N, + saved_v_data + i * N, + a, + b, + N); + } + }); +} + +void weight_norm_kernel( + Tensor& w, + Tensor& norm, + const Tensor& v, + const Tensor& g, + int64_t dim) { + TORCH_INTERNAL_ASSERT(dim == 0 || dim == v.dim() - 1, + "fused kernels can only be applied for first or last dim"); + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, v.scalar_type(), + "weight_norm_kernel", [&]() { + using accscalar_t = vec::vec_scalar_t; + if (dim == 0) { + int64_t M = v.size(0); + int64_t N = v.numel() / M; + weight_norm_first_dim_kernel(w, norm, v, g, M, N); + } else { + int64_t N = v.size(-1); + int64_t M = v.numel() / N; + weight_norm_last_dim_kernel(w, norm, v, g, M, N); + } + }); +} + +void weight_norm_backward_kernel( + Tensor& grad_v, + Tensor& grad_g, + const Tensor& grad_w, + const Tensor& saved_v, + const Tensor& saved_g, + const Tensor& saved_norm, + int64_t dim) { + TORCH_INTERNAL_ASSERT(dim == 0 || dim == saved_v.dim() - 1, + "fused kernels can only be applied for first or last dim"); + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, saved_v.scalar_type(), + "weight_norm_backward_kernel", [&]() { + using accscalar_t = vec::vec_scalar_t; + if (dim == 0) { + int64_t M = saved_v.size(0); + int64_t N = saved_v.numel() / M; + weight_norm_backward_first_dim_kernel(grad_v, grad_g, grad_w, saved_v, saved_g, saved_norm, M, N); + } else { + int64_t N = saved_v.size(-1); + int64_t M = saved_v.numel() / N; + weight_norm_backward_last_dim_kernel(grad_v, grad_g, grad_w, saved_v, saved_g, saved_norm, M, N); + } + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(weight_norm_stub, &weight_norm_kernel); +REGISTER_DISPATCH(weight_norm_backward_stub, &weight_norm_backward_kernel); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/WeightNormKernel.h b/aten/src/ATen/native/cpu/WeightNormKernel.h new file mode 100644 index 
000000000000..1f5ad65b52d9 --- /dev/null +++ b/aten/src/ATen/native/cpu/WeightNormKernel.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +using weight_norm_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, int64_t); +using weight_norm_backward_fn = void(*)( + Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, int64_t); + +DECLARE_DISPATCH(weight_norm_fn, weight_norm_stub); +DECLARE_DISPATCH(weight_norm_backward_fn, weight_norm_backward_stub); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 302edc1e1d0a..ad277a278fa2 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -1,36 +1,44 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include -#include #include #include #include #include #include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { using namespace vec; -template +template void batch_norm_cpu_collect_linear_and_constant_terms( - scalar_t* alpha, scalar_t* beta, int64_t n_channel, + accscalar_t* alpha, accscalar_t* beta, int64_t n_channel, const Tensor& weight /* optional */, const Tensor& bias /* optional */, const Tensor& save_mean, const Tensor& save_invstd, const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { - const scalar_t* weight_data = weight.defined() ? weight.data_ptr() : nullptr; - const scalar_t* bias_data = bias.defined() ? bias.data_ptr() : nullptr; + const param_t* weight_data = weight.defined() ? weight.data_ptr() : nullptr; + const param_t* bias_data = bias.defined() ? bias.data_ptr() : nullptr; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); /// Collect the linear and constant terms regarding the input. /// output(n, c, h, w) @@ -44,16 +52,16 @@ void batch_norm_cpu_collect_linear_and_constant_terms( /// Note that this is only a good idea if (input_size >> c), in degenerate /// cases where image_size == 1 && batch_size == 1, it is slow. for (const auto c : c10::irange(n_channel)) { - scalar_t mean, invstd; + accscalar_t mean, invstd; if (train) { mean = save_mean_a[c]; invstd = save_invstd_a[c]; } else { mean = running_mean_a[c]; - invstd = 1 / std::sqrt(running_var_a[c] + static_cast(eps)); + invstd = 1 / std::sqrt(running_var_a[c] + static_cast(eps)); } - scalar_t weight_v = weight_data ? weight_data[c] : 1; - scalar_t bias_v = bias_data ? bias_data[c] : 0; + param_t weight_v = weight_data ? weight_data[c] : param_t(1); + param_t bias_v = bias_data ? 
bias_data[c] : param_t(0); alpha[c] = invstd * weight_v; beta[c] = bias_v - mean * alpha[c]; } @@ -75,7 +83,7 @@ void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, scalar_t* alpha_data = alpha.data_ptr(); scalar_t* beta_data = beta.data_ptr(); - batch_norm_cpu_collect_linear_and_constant_terms( + batch_norm_cpu_collect_linear_and_constant_terms( alpha_data, beta_data, n_channel, weight, bias, save_mean, save_invstd, running_mean, running_var, train, eps); @@ -84,62 +92,37 @@ void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, // Apply the linear terms to the input, // output(n, c, h, w) = input(n, c, h, w) * alpha(c) + beta(c) - if (image_size != 1) { - const int64_t loop_size = image_size - (image_size % Vec::size()); - at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { - int64_t n = 0; - int64_t c = 0; - data_index_init(begin, n, n_batch, c, n_channel); + const int64_t loop_size = image_size - (image_size % Vec::size()); + at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t c = 0; + data_index_init(begin, n, n_batch, c, n_channel); - for (const auto i : c10::irange(begin, end)) { - const Vec alpha_vec(alpha_data[c]); - const Vec beta_vec(beta_data[c]); - int64_t offset = i * image_size; - int64_t d = 0; - for (; d < loop_size; d += Vec::size()) { - Vec data_vec = Vec::loadu(input_data + offset + d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d); - } - if (image_size - d > 0) { - Vec data_vec = Vec::loadu(input_data + offset + d, image_size - d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d, image_size - d); - } - // move on to next index - data_index_step(n, n_batch, c, n_channel); + for (const auto i : c10::irange(begin, end)) { + const Vec alpha_vec(alpha_data[c]); + const Vec beta_vec(beta_data[c]); + int64_t offset = i * image_size; + int64_t d = 0; + for (; d < loop_size; d += Vec::size()) { + Vec data_vec = Vec::loadu(input_data + offset + d); + Vec output_vec = data_vec * alpha_vec + beta_vec; + output_vec.store(output_data + offset + d); } - }); - } else { - // image_size == 1 - const int64_t loop_size = n_channel - (n_channel % Vec::size()); - at::parallel_for(0, n_batch, 1, [&](int64_t begin, int64_t end) { - for (const auto n : c10::irange(begin, end)) { - int64_t offset = n * n_channel; - int64_t d = 0; - for (; d < loop_size; d += Vec::size()) { - Vec alpha_vec = Vec::loadu(alpha_data + d); - Vec beta_vec = Vec::loadu(beta_data + d); - Vec data_vec = Vec::loadu(input_data + offset + d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d); - } - if (n_channel - d > 0) { - Vec alpha_vec = Vec::loadu(alpha_data + d, n_channel - d); - Vec beta_vec = Vec::loadu(beta_data + d, n_channel - d); - Vec data_vec = Vec::loadu(input_data + offset + d, n_channel - d); - Vec output_vec = data_vec * alpha_vec + beta_vec; - output_vec.store(output_data + offset + d, n_channel - d); - } + if (image_size - d > 0) { + Vec data_vec = Vec::loadu(input_data + offset + d, image_size - d); + Vec output_vec = data_vec * alpha_vec + beta_vec; + output_vec.store(output_data + offset + d, image_size - d); } - }); - } + // move on to next index + data_index_step(n, n_batch, c, n_channel); + } + }); } template void batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& 
save_mean, const Tensor& save_invstd, - const Tensor& running_mean, const Tensor& runnning_var, bool train, double eps) { + const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { using Vec = Vectorized; int64_t n_batch = input.size(0); @@ -151,9 +134,9 @@ void batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, scalar_t* alpha_data = alpha.data_ptr(); scalar_t* beta_data = beta.data_ptr(); - batch_norm_cpu_collect_linear_and_constant_terms( + batch_norm_cpu_collect_linear_and_constant_terms( alpha_data, beta_data, n_channel, weight, bias, - save_mean, save_invstd, running_mean, runnning_var, train, eps); + save_mean, save_invstd, running_mean, running_var, train, eps); scalar_t* output_data = output.data_ptr(); const scalar_t* input_data = input.data_ptr(); @@ -609,16 +592,660 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad } } +/// bfloat16 kernels +template<> +void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, + const Tensor& weight, const Tensor& bias, const Tensor& save_mean, const Tensor& save_invstd, + const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + + // use float as acc type + Tensor alpha = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor beta = at::empty({n_channel}, input.options().dtype(kFloat)); + float* alpha_data = alpha.data_ptr(); + float* beta_data = beta.data_ptr(); + + const bool mixed_type = is_mixed_type(input, weight, bias, save_mean, save_invstd, running_mean, running_var); + if (mixed_type) { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } else { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } + + BFloat16* output_data = output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + const int64_t loop_size = image_size - (image_size % bVec::size()); + at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t c = 0; + data_index_init(begin, n, n_batch, c, n_channel); + + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * image_size; + BFloat16* output_ptr = output_data + i * image_size; + const float alpha_val = alpha_data[c]; + const float beta_val = beta_data[c]; + const fVec alpha_fvec(alpha_val); + const fVec beta_fvec(beta_val); + int64_t d = 0; + for (; d < loop_size; d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + + fVec out_fvec0 = data_fvec0 * alpha_fvec + beta_fvec; + fVec out_fvec1 = data_fvec1 * alpha_fvec + beta_fvec; + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(output_ptr + d); + } + for (; d < image_size; d++) { + output_ptr[d] = BFloat16(float(input_ptr[d]) * alpha_val + beta_val); + } + // move on to next index + data_index_step(n, n_batch, c, n_channel); + } + }); +} + +template <> +void batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, + const Tensor& weight, const Tensor& bias, const Tensor& 
save_mean, const Tensor& save_invstd, + const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + + Tensor alpha = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor beta = at::empty({n_channel}, input.options().dtype(kFloat)); + float* alpha_data = alpha.data_ptr(); + float* beta_data = beta.data_ptr(); + + const bool mixed_type = is_mixed_type(input, weight, bias, save_mean, save_invstd, running_mean, running_var); + if (mixed_type) { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } else { + batch_norm_cpu_collect_linear_and_constant_terms( + alpha_data, beta_data, n_channel, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } + + BFloat16* output_data = output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + const int64_t loop_size = n_channel - (n_channel % bVec::size()); + at::parallel_for(0, n_batch * image_size, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * n_channel; + BFloat16* output_ptr = output_data + i * n_channel; + int64_t d = 0; + for (; d < loop_size; d += bVec::size()) { + fVec alpha_fvec0 = fVec::loadu(alpha_data + d); + fVec alpha_fvec1 = fVec::loadu(alpha_data + d + fVec::size()); + fVec beta_fvec0 = fVec::loadu(beta_data + d); + fVec beta_fvec1 = fVec::loadu(beta_data + d + fVec::size()); + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + + fVec out_fvec0 = data_fvec0 * alpha_fvec0 + beta_fvec0; + fVec out_fvec1 = data_fvec1 * alpha_fvec1 + beta_fvec1; + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(output_ptr + d); + } + for (; d < n_channel; d++) { + output_ptr[d] = BFloat16(float(input_ptr[d]) * alpha_data[d] + beta_data[d]); + } + } + }); +} + +template +inline void batch_norm_cpu_collect_stats_contiguous_internal( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + int64_t N = input.numel() / n_channel; + + const BFloat16* input_data = input.data_ptr(); + param_t* mean_data = mean.data_ptr(); + param_t* var_sum_data = var_sum.data_ptr(); + + at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + float sum_val = float(0); + fVec sum_fvec = fVec(float(0)); + for (int64_t n = 0; n < n_batch; n++) { + const BFloat16* input_ptr = input_data + n * n_channel * image_size + c * image_size; + int64_t d = 0; + for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + sum_fvec += data_fvec0; + sum_fvec += data_fvec1; + } + for (; d < image_size; d++) { + sum_val += float(input_ptr[d]); + } + } + // TODO: use fast version + sum_val += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, sum_fvec, fVec::size()); + float mean_val = sum_val / N; + 
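Throughout these bfloat16 kernels the accumulator never stays in bfloat16: each Vectorized<BFloat16> load is widened into two Vectorized<float> lanes with convert_bfloat16_float, the running sums live in float, and a single horizontal reduce happens only at the end. A minimal scalar sketch of the same numerical idea outside ATen, with bfloat16 modeled as the upper 16 bits of a float32 and all names purely illustrative:

#include <cstdint>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 float32, so widening is a shift.
static float bf16_to_float(uint16_t x) {
  uint32_t bits = static_cast<uint32_t>(x) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Two-pass mean / sum-of-squared-deviations over a bfloat16 buffer, with all
// accumulation done in float (the SIMD widening of the kernel above, minus SIMD).
static void bf16_mean_varsum(const uint16_t* data, int64_t n,
                             float& mean, float& var_sum) {
  float sum = 0.f;
  for (int64_t i = 0; i < n; ++i) sum += bf16_to_float(data[i]);
  mean = sum / static_cast<float>(n);
  var_sum = 0.f;
  for (int64_t i = 0; i < n; ++i) {
    float d = bf16_to_float(data[i]) - mean;
    var_sum += d * d;
  }
}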
mean_data[c] = param_t(mean_val); + + float var_val = float(0); + fVec var_fvec = fVec(float(0)); + fVec mean_fvec = fVec(mean_val); + for (int64_t n = 0; n < n_batch; n++) { + const BFloat16* input_ptr = input_data + n * n_channel * image_size + c * image_size; + int64_t d = 0; + for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + var_fvec += (data_fvec0 - mean_fvec) * (data_fvec0 - mean_fvec); + var_fvec += (data_fvec1 - mean_fvec) * (data_fvec1 - mean_fvec); + } + for (; d < image_size; d++) { + float data_val = input_ptr[d]; + var_val += (data_val - mean_val) * (data_val - mean_val); + } + } + // TODO: use fast version + var_val += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, var_fvec, fVec::size()); + var_sum_data[c] = param_t(var_val); + } + }); +} + +template <> +void batch_norm_cpu_collect_stats_contiguous_impl( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + const bool mixed_type = is_mixed_type(input, mean, var_sum); + if (mixed_type) { + batch_norm_cpu_collect_stats_contiguous_internal(mean, var_sum, input); + } else { + batch_norm_cpu_collect_stats_contiguous_internal(mean, var_sum, input); + } +} + +static inline std::tuple, Vectorized> load2f(const BFloat16* ptr) { + return convert_bfloat16_float(Vectorized::loadu(ptr)); +} + +static inline std::tuple, Vectorized> load2f(const float* ptr) { + using Vec = Vectorized; + return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size())); +} + +template +inline void batch_norm_cpu_collect_stats_channels_last_internal( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_channel = input.size(1); + int64_t N = input.numel() / n_channel; + + const BFloat16* input_data = input.data_ptr(); + param_t* mean_data = mean.data_ptr(); + param_t* var_sum_data = var_sum.data_ptr(); + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, n_channel}, input.options().dtype(kFloat)).zero_(); + float* buffer_data = buffer.data_ptr(); + + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + float* buffer_ptr = buffer_data + tid * n_channel; + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * n_channel; + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + fVec sum_fvec0 = fVec::loadu(buffer_ptr + d) + data_fvec0; + fVec sum_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()) + data_fvec1; + sum_fvec0.store(buffer_ptr + d); + sum_fvec1.store(buffer_ptr + d + fVec::size()); + } + for (; d < n_channel; d++) { + buffer_ptr[d] += input_ptr[d]; + } + } + }); + + for (const auto c : c10::irange(n_channel)) { + float sum = 0; + for (const auto t : c10::irange(num_threads)) { + sum += buffer_data[t * n_channel + c]; + } + mean_data[c] = param_t(sum / N); + } + + buffer.zero_(); + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + float* 
buffer_ptr = buffer_data + tid * n_channel; + for (const auto i : c10::irange(begin, end)) { + const BFloat16* input_ptr = input_data + i * n_channel; + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_ptr + d); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + fVec mean_fvec0, mean_fvec1; + std::tie(mean_fvec0, mean_fvec1) = load2f(mean_data + d); + fVec var_fvec0 = fVec::loadu(buffer_ptr + d); + fVec var_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()); + var_fvec0 += (data_fvec0 - mean_fvec0) * (data_fvec0 - mean_fvec0); + var_fvec1 += (data_fvec1 - mean_fvec1) * (data_fvec1 - mean_fvec1); + var_fvec0.store(buffer_ptr + d); + var_fvec1.store(buffer_ptr + d + fVec::size()); + } + for (; d < n_channel; d++) { + float data_val = float(input_ptr[d]); + float mean_val = float(mean_data[d]); + buffer_ptr[d] += (data_val - mean_val) * (data_val - mean_val); + } + } + }); + + for (const auto c : c10::irange(n_channel)) { + float _var_sum = 0; + for (const auto t : c10::irange(num_threads)) { + _var_sum += buffer_data[t * n_channel + c]; + } + var_sum_data[c] = param_t(_var_sum); + } +} + +template <> +void batch_norm_cpu_collect_stats_channels_last_impl( + Tensor& mean, Tensor& var_sum, const Tensor& input) { + const bool mixed_type = is_mixed_type(input, mean, var_sum); + if (mixed_type) { + batch_norm_cpu_collect_stats_channels_last_internal(mean, var_sum, input); + } else { + batch_norm_cpu_collect_stats_channels_last_internal(mean, var_sum, input); + } +} + +template +void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_batch = input.size(0); + int64_t n_channel = input.size(1); + int64_t image_size = input.numel() / n_batch / n_channel; + int64_t N = input.numel() / n_channel; + + const BFloat16* grad_output_data = grad_output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + BFloat16* grad_input_data = grad_input.defined() ? grad_input.data_ptr() : nullptr; + param_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; + param_t* grad_bias_data = grad_bias.defined() ? grad_bias.data_ptr() : nullptr; + const bool grad_input_null = grad_input_data == nullptr; + const bool grad_weight_null = grad_weight_data == nullptr; + const bool grad_bias_null = grad_bias_data == nullptr; + + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); + + // parallel dim reduce on 'channel' + at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + float w = weight.defined() ? float(weight_a[c]) : 1; + + float mean, invstd; + if (train) { + mean = save_mean_a[c]; + invstd = save_invstd_a[c]; + } else { + mean = running_mean_a[c]; + invstd = 1 / std::sqrt(running_var_a[c] + eps); + } + + // compute 1) sum; 2) dot product of Q(X) and dY. 
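The two per-channel reductions introduced in the comment above, sum = sum(dY) and dotp = sum((x - mean) * dY), are computed just below and are all the backward pass needs: grad_bias = sum, grad_weight = dotp * invstd, and in training mode grad_input folds both terms back in (in eval mode it collapses to dy * invstd * w). A scalar reference for a single channel, with illustrative names and plain float math:

#include <cstdint>
#include <vector>

// Scalar reference for one channel of the training-mode batch-norm backward:
//   sum  = sum(dy)                  -> grad_bias
//   dotp = sum((x - mean) * dy)     -> grad_weight = dotp * invstd
//   k    = dotp * invstd^2 / N
//   dx   = (dy - sum / N - (x - mean) * k) * invstd * w
// x and dy hold the N elements of one channel; w is the channel's weight
// (gamma), taken as 1 when no weight tensor is defined.
void batch_norm_backward_channel_ref(
    const std::vector<float>& x, const std::vector<float>& dy,
    float mean, float invstd, float w,
    std::vector<float>& dx, float& grad_weight, float& grad_bias) {
  const int64_t N = static_cast<int64_t>(x.size());
  float sum = 0.f, dotp = 0.f;
  for (int64_t i = 0; i < N; ++i) {
    sum += dy[i];
    dotp += (x[i] - mean) * dy[i];
  }
  grad_bias = sum;
  grad_weight = dotp * invstd;
  const float k = dotp * invstd * invstd / static_cast<float>(N);
  const float grad_mean = sum / static_cast<float>(N);
  dx.resize(x.size());
  for (int64_t i = 0; i < N; ++i) {
    dx[i] = (dy[i] - grad_mean - (x[i] - mean) * k) * invstd * w;
  }
}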
+ float sum{0}, dotp{0}; + fVec sum_fvec{0}, dotp_fvec{0}; + for (const auto n : c10::irange(n_batch)) { + const BFloat16* x_ptr = input_data + n * n_channel * image_size + c * image_size; + const BFloat16* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; + + int64_t d = 0; + for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + sum_fvec += dy_fvec0; + sum_fvec += dy_fvec1; + + bVec x_bvec = bVec::loadu(x_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + dotp_fvec += (x_fvec0 - fVec(mean)) * dy_fvec0; + dotp_fvec += (x_fvec1 - fVec(mean)) * dy_fvec1; + } + for (; d < image_size; d++) { + sum += float(dy_ptr[d]); + dotp += (float(x_ptr[d]) - mean) * float(dy_ptr[d]); + } + } + // TODO: use fast version + sum += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, sum_fvec, fVec::size()); + dotp += vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, dotp_fvec, fVec::size()); + + if (!grad_input_null) { + if (train) { + float k = (float) dotp * invstd * invstd / N; + float grad_mean = sum / N; + for (const auto n : c10::irange(n_batch)) { + const BFloat16* x_ptr = input_data + n * n_channel * image_size + c * image_size; + BFloat16* dx_ptr = grad_input_data + n * n_channel * image_size + c * image_size; + const BFloat16* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; + vec::map2( + [=](fVec x, fVec dy) { + fVec dx = (x - fVec(mean)) * fVec(k); + return (dy - fVec(grad_mean) - dx) * fVec(invstd) * fVec(w); + }, + dx_ptr, x_ptr, dy_ptr, image_size); + } + } else { // evaluation mode + for (const auto n : c10::irange(n_batch)) { + BFloat16* dx_ptr = grad_input_data + n * n_channel * image_size + c * image_size; + const BFloat16* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; + vec::map( + [=](fVec dy) { return dy * fVec(invstd) * fVec(w); }, + dx_ptr, dy_ptr, image_size); + } + } + } + + if (!grad_weight_null) { + grad_weight_data[c] = param_t(dotp * invstd); + } + + if (!grad_bias_null) { + grad_bias_data[c] = param_t(sum); + } + } + }); +} + +template <> +void batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + const bool mixed_type = is_mixed_type(input, weight, running_mean, running_var, save_mean, save_invstd); + if (mixed_type) { + batch_norm_cpu_backward_contiguous_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } else { + batch_norm_cpu_backward_contiguous_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } +} + +template +void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + + using bVec = Vectorized; + using fVec = Vectorized; + int64_t n_channel = input.size(1); + int64_t N = input.numel() / n_channel; + 
+ const BFloat16* grad_output_data = grad_output.data_ptr(); + const BFloat16* input_data = input.data_ptr(); + + BFloat16* grad_input_data = grad_input.defined() ? grad_input.data_ptr() : nullptr; + param_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; + param_t* grad_bias_data = grad_bias.defined() ? grad_bias.data_ptr() : nullptr; + + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); + + // use float as acc type + bool weight_defined = weight.defined(); + Tensor weight_f = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor mean = at::empty({n_channel}, input.options().dtype(kFloat)); + Tensor invstd = at::empty({n_channel}, input.options().dtype(kFloat)); + float* weight_data = weight_f.data_ptr(); + float* mean_data = mean.data_ptr(); + float* invstd_data = invstd.data_ptr(); + + for (const auto c : c10::irange(n_channel)) { + weight_data[c] = weight_defined ? float(weight_a[c]) : 1; + + if (train) { + mean_data[c] = save_mean_a[c]; + invstd_data[c] = save_invstd_a[c]; + } else { + mean_data[c] = running_mean_a[c]; + invstd_data[c] = 1 / std::sqrt(running_var_a[c] + eps); + } + } + + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({2, num_threads, n_channel}, input.options().dtype(kFloat)).zero_(); + float* sum_data = buffer.data_ptr(); + float* dotp_data = sum_data + num_threads * n_channel; + + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); + float* sum_ptr = sum_data + tid * n_channel; + float* dotp_ptr = dotp_data + tid * n_channel; + for (const auto i : c10::irange(begin, end)) { + const BFloat16* x_ptr = input_data + i * n_channel; + const BFloat16* dy_ptr = grad_output_data + i * n_channel; + + int64_t d = 0; + for(; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + fVec sum_fvec0 = dy_fvec0 + fVec::loadu(sum_ptr + d); + fVec sum_fvec1 = dy_fvec1 + fVec::loadu(sum_ptr + d + fVec::size()); + sum_fvec0.store(sum_ptr + d); + sum_fvec1.store(sum_ptr + d + fVec::size()); + + bVec x_bvec = bVec::loadu(x_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + fVec mean_fvec0 = fVec::loadu(mean_data + d); + fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); + fVec dotp_fvec0 = fVec::loadu(dotp_ptr + d); + fVec dotp_fvec1 = fVec::loadu(dotp_ptr + d + fVec::size()); + dotp_fvec0 += (x_fvec0 - mean_fvec0) * dy_fvec0; + dotp_fvec1 += (x_fvec1 - mean_fvec1) * dy_fvec1; + dotp_fvec0.store(dotp_ptr + d); + dotp_fvec1.store(dotp_ptr + d + fVec::size()); + } + for (; d < n_channel; d++) { + float dy_val = dy_ptr[d]; + float x_val = x_ptr[d]; + float mean_val = mean_data[d]; + sum_ptr[d] += dy_val; + dotp_ptr[d] += (x_val - mean_val) * dy_val; + } + } + }); + + at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + // store the final result of sum and dotp in the 1st lane of immediate buffer, + // so that we won't need to allocate anther buffer to store the temp values. 
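The reduction pattern used here, and again by the channels-last stats kernel and the weight-norm last-dim backward above, is: give every thread its own row of a {num_threads, C} scratch buffer, accumulate partial per-channel sums without atomics, then reduce vertically across threads and keep the totals in row 0. A standalone sketch of that pattern, with plain OpenMP standing in for at::parallel_for purely as an assumption of this sketch:

#include <cstdint>
#include <vector>
#include <omp.h>

// Each thread accumulates per-channel partial sums into its own row of a
// {num_threads, C} scratch buffer (no atomics needed), then the rows are
// reduced vertically and the totals kept in row 0.
std::vector<float> columnwise_sum(const float* x, int64_t rows, int64_t C) {
  const int T = omp_get_max_threads();
  std::vector<float> scratch(static_cast<size_t>(T) * C, 0.f);

  #pragma omp parallel for
  for (int64_t i = 0; i < rows; ++i) {
    float* partial = scratch.data() + static_cast<size_t>(omp_get_thread_num()) * C;
    const float* row = x + i * C;
    for (int64_t c = 0; c < C; ++c) {
      partial[c] += row[c];
    }
  }

  // Vertical reduction across threads; row 0 already holds thread 0's part.
  for (int64_t c = 0; c < C; ++c) {
    for (int t = 1; t < T; ++t) {
      scratch[c] += scratch[static_cast<size_t>(t) * C + c];
    }
  }
  return std::vector<float>(scratch.begin(), scratch.begin() + C);
}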
+ float _sum = 0; + for (const auto t : c10::irange(num_threads)) { + _sum += sum_data[t * n_channel + c]; + } + sum_data[/* 0 * n_channel + */c] = _sum; + + float _dotp = 0; + for (const auto t : c10::irange(num_threads)) { + _dotp += dotp_data[t * n_channel + c]; + } + dotp_data[/* 0 * n_channel + */c] = _dotp; + } + }); + + // compute grad_input + if (grad_input.defined()) { + at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + BFloat16* dx_ptr = grad_input_data + i * n_channel; + const BFloat16* x_ptr = input_data + i * n_channel; + const BFloat16* dy_ptr = grad_output_data + i * n_channel; + if (train) { + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec x_bvec = bVec::loadu(x_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + fVec mean_fvec0 = fVec::loadu(mean_data + d); + fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); + fVec dotp_fvec0 = fVec::loadu(dotp_data + d); + fVec dotp_fvec1 = fVec::loadu(dotp_data + d + fVec::size()); + fVec invstd_fvec0 = fVec::loadu(invstd_data + d); + fVec invstd_fvec1 = fVec::loadu(invstd_data + d + fVec::size()); + fVec k_fvec0 = dotp_fvec0 * invstd_fvec0 * invstd_fvec0 / fVec(N); + fVec k_fvec1 = dotp_fvec1 * invstd_fvec1 * invstd_fvec1 / fVec(N); + fVec dx_fvec0 = (x_fvec0 - mean_fvec0) * k_fvec0; + fVec dx_fvec1 = (x_fvec1 - mean_fvec1) * k_fvec1; + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + fVec grad_mean_fvec0 = fVec::loadu(sum_data + d) / fVec(N); + fVec grad_mean_fvec1 = fVec::loadu(sum_data + d + fVec::size()) / fVec(N); + fVec w_fvec0 = fVec::loadu(weight_data + d); + fVec w_fvec1 = fVec::loadu(weight_data + d + fVec::size()); + dx_fvec0 = (dy_fvec0 - grad_mean_fvec0 - dx_fvec0) * invstd_fvec0 * w_fvec0; + dx_fvec1 = (dy_fvec1 - grad_mean_fvec1 - dx_fvec1) * invstd_fvec1 * w_fvec1; + bVec dx_bvec = convert_float_bfloat16(dx_fvec0, dx_fvec1); + dx_bvec.store(dx_ptr + d); + } + for (; d < n_channel; d++) { + float x_val = x_ptr[d]; + float mean_val = mean_data[d]; + float dotp_val = dotp_data[d]; + float invstd_val = invstd_data[d]; + float k_val = dotp_val * invstd_val * invstd_val / N; + float dx_val = (x_val - mean_val) * k_val; + float dy_val = dy_ptr[d]; + float grad_mean_val = sum_data[d] / N; + float w_val = weight_data[d]; + dx_val = (dy_val - grad_mean_val - dx_val) * invstd_val * w_val; + dx_ptr[d] = BFloat16(dx_val); + } + } else { // evaluation mode + int64_t d = 0; + for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { + bVec dy_bvec = bVec::loadu(dy_ptr + d); + fVec dy_fvec0, dy_fvec1; + std::tie(dy_fvec0, dy_fvec1) = convert_bfloat16_float(dy_bvec); + fVec invstd_fvec0 = fVec::loadu(invstd_data + d); + fVec invstd_fvec1 = fVec::loadu(invstd_data + d + fVec::size()); + fVec w_fvec0 = fVec::loadu(weight_data + d); + fVec w_fvec1 = fVec::loadu(weight_data + d + fVec::size()); + fVec dx_fvec0 = dy_fvec0 * invstd_fvec0 * w_fvec0; + fVec dx_fvec1 = dy_fvec1 * invstd_fvec1 * w_fvec1; + bVec dx_bvec = convert_float_bfloat16(dx_fvec0, dx_fvec1); + dx_bvec.store(dx_ptr + d); + } + for (; d < n_channel; d++) { + float dy_val = dy_ptr[d]; + float invstd_val = invstd_data[d]; + float w_val = weight_data[d]; + float dx_val = dy_val * invstd_val * w_val; + dx_ptr[d] = BFloat16(dx_val); + } + } + } + }); + } + + if (grad_weight.defined()) { + for (const auto c : 
c10::irange(n_channel)) { + grad_weight_data[c] = param_t(dotp_data[c] * invstd_data[c]); + } + } + + if (grad_bias.defined()) { + for (const auto c : c10::irange(n_channel)) { + grad_bias_data[c] = param_t(sum_data[c]); + } + } +} + +template <> +void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad_weight, Tensor& grad_bias, + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, + bool train, double eps) { + const bool mixed_type = is_mixed_type(input, weight, running_mean, running_var, save_mean, save_invstd); + if (mixed_type) { + batch_norm_cpu_backward_channels_last_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } else { + batch_norm_cpu_backward_channels_last_internal(grad_input, grad_weight, grad_bias, + grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); + } +} + void batch_norm_cpu_kernel(Tensor& output, const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& save_mean, const Tensor& save_invstd, const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { - if (input.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_contiguous", [&] { - batch_norm_cpu_contiguous_impl(output, input, weight, bias, - save_mean, save_invstd, running_mean, running_var, train, eps); + int64_t image_size = input.numel() / input.size(0) / input.size(1); + if (input.is_contiguous()) { // NC11 is also channels last + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_contiguous", [&] { + if (image_size == 1) { + batch_norm_cpu_channels_last_impl(output, input, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } else { + batch_norm_cpu_contiguous_impl(output, input, weight, bias, + save_mean, save_invstd, running_mean, running_var, train, eps); + } }); } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_channels_last", [&] { batch_norm_cpu_channels_last_impl(output, input, weight, bias, save_mean, save_invstd, running_mean, running_var, train, eps); }); @@ -631,7 +1258,7 @@ void batch_norm_cpu_collect_stats_kernel( Tensor& mean, Tensor& var_sum, const Tensor& input) { int64_t image_size = input.numel() / input.size(0) / input.size(1); if (input.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_collect_stats_contiguous", [&] { if (image_size == 1) { // NC11 is also channels last batch_norm_cpu_collect_stats_channels_last_impl(mean, var_sum, input); } else { @@ -639,7 +1266,7 @@ void batch_norm_cpu_collect_stats_kernel( } }); } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_collect_stats_channels_last", [&] { batch_norm_cpu_collect_stats_channels_last_impl(mean, var_sum, input); }); } else { @@ -653,7 +1280,7 @@ void 
batch_norm_cpu_backward_kernel(Tensor& grad_input, Tensor& grad_weight, Ten bool train, double eps) { int64_t image_size = input.numel() / input.size(0) / input.size(1); if (input.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_backward_contiguous", [&] { if (image_size == 1) { // NC11 is also channels last batch_norm_cpu_backward_channels_last_impl(grad_input, grad_weight, grad_bias, grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); @@ -663,7 +1290,7 @@ void batch_norm_cpu_backward_kernel(Tensor& grad_input, Tensor& grad_weight, Ten } }); } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, input.scalar_type(), "batch_norm_cpu_backward_channels_last", [&] { batch_norm_cpu_backward_channels_last_impl(grad_input, grad_weight, grad_bias, grad_output, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps); }); diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index 6f98b58a3c0e..ff84f9b60784 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -1,16 +1,24 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include -#include -#include +#include #include #include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -75,6 +83,78 @@ void GroupNormKernelImplInternal( }); } +template +std::tuple ColumnwiseMoments( + const T* X_data, + int64_t HxW, + int64_t C, + int64_t D) { + using Vec = vec::Vectorized; + constexpr int64_t K = Vec::size(); + const int64_t inner_size = D / K * K; + Vec acc0_vec{0}, acc1_vec{0}; + for (const auto m : c10::irange(HxW)) { + const T* X_ptr = X_data + m * C; + int64_t d = 0; + for (; d < inner_size; d += K) { + Vec x_vec = Vec::loadu(X_ptr + d); + acc0_vec += x_vec; + acc1_vec += x_vec * x_vec; + } + if (D - d > 0) { + Vec x_vec = Vec::loadu(X_ptr + d, D - d); + acc0_vec += x_vec; + acc1_vec += x_vec * x_vec; + } + } + // TODO: use fast path + T mean_val = vec::vec_reduce_all([](Vec& x, Vec& y) { return x + y; }, acc0_vec, Vec::size()); + T rstd_val = vec::vec_reduce_all([](Vec& x, Vec& y) { return x + y; }, acc1_vec, Vec::size()); + return std::tuple(mean_val, rstd_val); +} + +template +std::tuple ColumnwiseMoments( + const BFloat16* X_data, + int64_t HxW, + int64_t C, + int64_t D) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + constexpr int64_t K = bVec::size(); + const int64_t inner_size = D / K * K; + fVec acc0_fvec{0}, acc1_fvec{0}, zero{0}; + for (const auto m : c10::irange(HxW)) { + const BFloat16* X_ptr = X_data + m * C; + int64_t d = 0; + for (; d < inner_size; d += K) { + bVec x_bvec = bVec::loadu(X_ptr + d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + acc0_fvec += x_fvec0 + x_fvec1; + acc1_fvec += x_fvec0 * x_fvec0 + x_fvec1 * x_fvec1; + } + if (D - d > 0) { + bVec x_bvec = bVec::loadu(X_ptr + d, D - d); + fVec x_fvec0, x_fvec1; + std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec); + if (D - d > fVec::size()) { + x_fvec1 = fVec::set(zero, x_fvec1, D - d - fVec::size()); + 
acc0_fvec += x_fvec0 + x_fvec1; + acc1_fvec += x_fvec0 * x_fvec0 + x_fvec1 * x_fvec1; + } else { + x_fvec0 = fVec::set(zero, x_fvec0, D - d); + acc0_fvec += x_fvec0; + acc1_fvec += x_fvec0 * x_fvec0; + } + } + } + // TODO: use fast path + float mean_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, acc0_fvec, fVec::size()); + float rstd_val = vec::vec_reduce_all([](fVec& x, fVec& y) { return x + y; }, acc1_fvec, fVec::size()); + return std::tuple(mean_val, rstd_val); +} + template void GroupNormKernelImplChannelsLastInternal( const Tensor& X, @@ -99,110 +179,204 @@ void GroupNormKernelImplChannelsLastInternal( T* Y_data = Y.data_ptr(); T* mean_data = mean.data_ptr(); T* rstd_data = rstd.data_ptr(); - const T s = T(1) / static_cast(D * HxW); + + using T_ACC = vec::vec_scalar_t; + using Vec = vec::Vectorized; + + const T s = T_ACC(1) / static_cast(D * HxW); const bool gamma_null = (gamma_data == nullptr); const bool beta_null = beta_data == nullptr; - // temp buffer holding x and x2 - Tensor buffer = at::empty({N, 2 * C}, X.options()).zero_(); - T* buffer_data = buffer.data_ptr(); + // NB: About algorithm choosen: + // + // On channels last, GroupNorm has a input shape of {N, H, W, GD}, + // Mean and rstd are collected per each n and g, which involves reduction + // on non-adjacent dimensions. We can parallel in the following 2 impls: + // + // impl-1: parallel on N * G. Only need one omp session but memory access + // per thread is non-contiguous. + // + // impl-2: parallel on N * HxW. Memory access per thread is contiguous, + // but requires help of extra temp buffer of size {T, N, 2C}. + // + // Generally impl-2 has better performance when HxW is large enough, so that + // data per thread {NHWC / T} is much larger then temp buffer per thread {2NC} + // + constexpr int64_t feature_map_threshold = 1024; + if (HxW < feature_map_threshold) { + // impl-1: parallel on N * G. + // + // for each plain of HxW, scale and bias is calculated only once + Tensor buffer = at::empty({N * G, 2 * D}, X.options()); + T* buffer_data = buffer.data_ptr(); - using Vec = vec::Vectorized; - at::parallel_for(0, N, 1, [&](int64_t start, int64_t end) { - constexpr int64_t K = Vec::size(); - const int64_t inner_size = C / K * K; - for (const auto n : c10::irange(start, end)) { - T* mean_ptr = buffer_data + n * 2 * C; - T* rstd_ptr = mean_ptr + C; - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - const Vec x_vec = Vec::loadu(X_ptr + j); - Vec mean_vec = Vec::loadu(mean_ptr + j) + x_vec; - Vec rstd_vec = Vec::loadu(rstd_ptr + j) + x_vec * x_vec; - mean_vec.store(mean_ptr + j); - rstd_vec.store(rstd_ptr + j); + at::parallel_for(0, N * G, 1, [&](int64_t begin, int64_t end) { + int64_t n{0}, g{0}; + data_index_init(begin, n, N, g, G); + for (const auto i : c10::irange(begin, end)) { + // step-1: for each n and g, collect sum of x and x2 + // + // Note that using vec::map_reduce_all here is simpler to write + // but it is slower since horizontal reduce from vec to scalar is slow. + // So it is better to reduce with a vec across all HxW plain, + // and do a horizontal add just once for each {n, g}. 
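ColumnwiseMoments only returns the raw (sum, sum of squares) pair for one {n, g} block; the code right below turns that pair into mean and rstd. In isolation the conversion is the E[x^2] - E[x]^2 form of the variance, clamped at zero to guard against round-off, followed by 1 / sqrt(var + eps). A small sketch with illustrative names:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <utility>

// Turn the accumulated (sum(x), sum(x * x)) of one {n, g} block into the
// mean and rstd stored by the kernel: var = E[x^2] - E[x]^2, clamped at 0,
// then rstd = 1 / sqrt(var + eps).
std::pair<float, float> moments_to_mean_rstd(
    float sum, float sum_sq, int64_t count, float eps) {
  const float s = 1.f / static_cast<float>(count);
  const float mean = sum * s;
  const float var = std::max(sum_sq * s - mean * mean, 0.f);
  const float rstd = 1.f / std::sqrt(var + eps);
  return {mean, rstd};
}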
+ // + T_ACC mean_val, rstd_val; + std::tie(mean_val, rstd_val) = ColumnwiseMoments( + X_data + n * HxW * C + g * D, + HxW, + C, + D); + + mean_val *= s; + rstd_val = std::max(rstd_val * s - mean_val * mean_val, T_ACC(0)); + rstd_val = T_ACC(1) / std::sqrt(rstd_val + eps); + mean_data[i] = mean_val; + rstd_data[i] = rstd_val; + + // step-2: calculate scale and bias + T* scale_ptr = buffer_data + i * 2 * D; + T* bias_ptr = scale_ptr + D; + for (const auto d : c10::irange(D)) { + const int64_t c = g * D + d; + scale_ptr[d] = rstd_val * (gamma_null ? T(1) : gamma_data[c]); + bias_ptr[d] = -scale_ptr[d] * mean_val + (beta_null ? T(0) : beta_data[c]); } - for (const auto j : c10::irange(inner_size, C)) { - mean_ptr[j] += X_ptr[j]; - rstd_ptr[j] += X_ptr[j] * X_ptr[j]; + + // step-3: apply scale and bias + for (const auto m : c10::irange(HxW)) { + const T* X_ptr = X_data + n * HxW * C + m * C + g * D; + T* Y_ptr = Y_data + n * HxW * C + m * C + g * D; + vec::map3( + [](Vec x, Vec scale, Vec bias) { return x * scale + bias; }, + Y_ptr, + X_ptr, + scale_ptr, + bias_ptr, + D); } + + data_index_step(n, N, g, G); + } + }); + } else { + // impl-2: parallel on N * HxW. + // + // temp buffer holding x and x2 + int num_threads = at::get_num_threads(); + Tensor buffer = at::empty({num_threads, N, 2 * C}, X.options()).zero_(); + T* buffer_data = buffer.data_ptr(); + + // step-1: accumulate on dimension of C + // + // In order to improve multi-core performance when N=1, + // we parallel on the all the outer dimensions of N and HxW, + // leaving the most inner dimension C for vectorization. + // + // Note that parallel on {N, HxW, G} is not feasible for some common configs, + // e.g. say input shape is {1, 32, h, w} and G = 8, + // this will give D = 4 which is unable to take full SIMD length. + // + // To avoid thread conflict, we make use of a temp buffer of {T, N, 2C}, + // firstly, reduce from {N, HxW, C} to {T, N, 2C} + // + at::parallel_for(0, N * HxW, 1, [&](int64_t begin, int64_t end) { + int tid = at::get_thread_num(); + T* buffer_ptr = buffer_data + tid * N * 2 * C; + + int64_t n{0}, m{0}; + data_index_init(begin, n, N, m, HxW); + for (const auto i : c10::irange(begin, end)) { + T* mean_ptr = buffer_ptr + n * 2 * C; + T* rstd_ptr = mean_ptr + C; + const T* X_ptr = X_data + i * C; + + vec::map2( + [](Vec x, Vec y) { return x + y; }, + mean_ptr, + X_ptr, + mean_ptr, + C); + + vec::map2( + [](Vec x, Vec y) { return x * x + y; }, + rstd_ptr, + X_ptr, + rstd_ptr, + C); + + data_index_step(n, N, m, HxW); } + }); + // step-2: compute mean and rstd + for (const auto n : c10::irange(N)) { for (const auto g : c10::irange(G)) { - T mean_val = T(0); - T rstd_val = T(0); + T_ACC mean_val{0}, rstd_val{0}; for (const auto d : c10::irange(D)) { - mean_val += mean_ptr[g * D + d]; - rstd_val += rstd_ptr[g * D + d]; + for (const auto t : c10::irange(num_threads)) { + T* buffer_ptr = buffer_data + t * N * 2 * C + n * 2 * C; + mean_val += buffer_ptr[g * D + d]; + rstd_val += buffer_ptr[g * D + d + C]; + } } mean_val *= s; - rstd_val = std::max(rstd_val * s - mean_val * mean_val, T(0)); - rstd_val = T(1) / std::sqrt(rstd_val + eps); + rstd_val = std::max(rstd_val * s - mean_val * mean_val, T_ACC(0)); + rstd_val = T_ACC(1) / std::sqrt(rstd_val + eps); + mean_data[n * G + g] = T(mean_val); + rstd_data[n * G + g] = T(rstd_val); + } + } - // continue to use the temp buffer for mean and rstd value, - // so that we can vectorize the following math on entire C dimension. 
+ // step-3: compute scale and bias + // + // mean/rstd have shape of {N, G}, gamma/beta have shape of {G, D}. + // And scale/bias have shape of {N, C} so that we can directly vectorize on + // dimension of C in the final step. + // + // We could fuse step 3 and 4 into a single session but this way is better: + // a. D might be too small for vectorization; + // b. Avoid duplicate caculation of scale/bias, each HxW plain share the same scale/bias + // + for (const auto n : c10::irange(N)) { + for (const auto g : c10::irange(G)) { + T* scale_ptr = buffer_data + n * 2 * C; + T* bias_ptr = scale_ptr + C; + T mean_val = mean_data[n * G + g]; + T rstd_val = rstd_data[n * G + g]; for (const auto d : c10::irange(D)) { - mean_ptr[g * D + d] = mean_val; - rstd_ptr[g * D + d] = rstd_val; + const int64_t c = g * D + d; + scale_ptr[c] = rstd_val * (gamma_null ? T(1) : gamma_data[c]); + bias_ptr[c] = -scale_ptr[c] * mean_val + (beta_null ? T(0) : beta_data[c]); } - - mean_data[n * G + g] = mean_val; - rstd_data[n * G + g] = rstd_val; } + } - // expand gamma_null and beta_null to reduce if-else on critial path. - if (!gamma_null && !beta_null) { - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - T* Y_ptr = Y_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - Vec scale_vec = Vec::loadu(rstd_ptr + j) * Vec::loadu(gamma_data + j); - Vec bias_vec = Vec::loadu(beta_data + j) - scale_vec * Vec::loadu(mean_ptr + j); - Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; - y_vec.store(Y_ptr + j); - } - for (const auto j : c10::irange(inner_size, C)) { - T scale = rstd_ptr[j] * gamma_data[j]; - T bias = -scale * mean_ptr[j] + beta_data[j]; - Y_ptr[j] = scale * X_ptr[j] + bias; - } - } - } else if (gamma_null && beta_null) { - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - T* Y_ptr = Y_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - Vec scale_vec = Vec::loadu(rstd_ptr + j); - Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) - scale_vec * Vec::loadu(mean_ptr + j); - y_vec.store(Y_ptr + j); - } - for (const auto j : c10::irange(inner_size, C)) { - T scale = rstd_ptr[j]; - Y_ptr[j] = scale * X_ptr[j] -scale * mean_ptr[j]; - } - } - } else { - for (const auto i : c10::irange(HxW)) { - const T* X_ptr = X_data + n * HxW * C + i * C; - T* Y_ptr = Y_data + n * HxW * C + i * C; - for (int64_t j = 0; j < inner_size; j += K) { - Vec gamma_vec = gamma_null ? Vec(1) : Vec::loadu(gamma_data + j); - Vec beta_vec = beta_null ? Vec(0) : Vec::loadu(beta_data + j); - Vec scale_vec = Vec::loadu(rstd_ptr + j) * gamma_vec; - Vec bias_vec = beta_vec - scale_vec * Vec::loadu(mean_ptr + j); - Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; - y_vec.store(Y_ptr + j); - } - for (const auto j : c10::irange(inner_size, C)) { - T scale = rstd_ptr[j] * (gamma_null ? T(1) : gamma_data[j]); - T bias = -scale * mean_ptr[j] + (beta_null ? T(0) : beta_data[j]); - Y_ptr[j] = scale * X_ptr[j] + bias; - } - } + // step-4: apply scale and bias + // + // Parallel on on the all the outer dimensions of N and HxW + // and vectorize on C. 
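Steps 3 and 4 boil down to folding per-group mean/rstd and per-channel gamma/beta into one per-channel affine pair, so the apply loop is a single fused multiply-add per element. A scalar sketch for one sample in channels-last layout, with illustrative names; the gamma == nullptr and beta == nullptr defaults of 1 and 0 handled by the kernel are omitted here:

#include <cstdint>
#include <vector>

// Fold per-group mean/rstd and per-channel gamma/beta into scale/bias (step-3),
// then apply y = x * scale + bias at every spatial position (step-4).
void group_norm_apply_ref(
    const float* x, float* y, int64_t HxW, int64_t C, int64_t D,
    const float* mean,    // per group, size G = C / D
    const float* rstd,    // per group, size G
    const float* gamma,   // per channel, size C
    const float* beta) {  // per channel, size C
  std::vector<float> scale(C), bias(C);
  for (int64_t c = 0; c < C; ++c) {
    const int64_t g = c / D;
    scale[c] = rstd[g] * gamma[c];
    bias[c] = -scale[c] * mean[g] + beta[c];
  }
  // Channels-last: element (m, c) lives at x[m * C + c].
  for (int64_t m = 0; m < HxW; ++m) {
    const float* x_row = x + m * C;
    float* y_row = y + m * C;
    for (int64_t c = 0; c < C; ++c) {
      y_row[c] = x_row[c] * scale[c] + bias[c];
    }
  }
}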
+ // + at::parallel_for(0, N * HxW, 1, [&](int64_t begin, int64_t end) { + int64_t n{0}, m{0}; + data_index_init(begin, n, N, m, HxW); + for (const auto i : c10::irange(begin, end)) { + const T* X_ptr = X_data + i * C; + T* Y_ptr = Y_data + i * C; + T* scale_ptr = buffer_data + n * 2 * C; + T* bias_ptr = scale_ptr + C; + vec::map3( + [](Vec x, Vec scale, Vec bias) { return x * scale + bias; }, + Y_ptr, + X_ptr, + scale_ptr, + bias_ptr, + C); + + data_index_step(n, N, m, HxW); } - } - }); + }); + } } void GroupNormKernelImpl( @@ -219,21 +393,22 @@ void GroupNormKernelImpl( Tensor& rstd) { switch (X.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { - AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, X.scalar_type(), "GroupNormKernelImpl", [&]() { GroupNormKernelImplInternal( X, gamma, beta, N, C, HxW, group, static_cast(eps), Y, mean, rstd); }); break; } - case at::MemoryFormat::ChannelsLast: { - AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GroupNormKernelImpl", [&]() { + case at::MemoryFormat::ChannelsLast: + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, X.scalar_type(), "GroupNormKernelImpl", [&]() { GroupNormKernelImplChannelsLastInternal( X, gamma, beta, N, C, HxW, group, static_cast(eps), Y, mean, rstd); }); break; } default: - TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, ChannelsLast3d, Contiguous"); } } @@ -457,8 +632,8 @@ void GroupNormBackwardKernelImpl( Tensor& dX, Tensor& dgamma, Tensor& dbeta) { - AT_DISPATCH_FLOATING_TYPES( - X.scalar_type(), "GroupNormBackwardKernelImpl", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, X.scalar_type(), "GroupNormBackwardKernelImpl", [&]() { GroupNormBackwardKernelImplInternal( dY, X, mean, rstd, gamma, N, C, HxW, group, dX, dgamma, dbeta); }); diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp index 887b7a1dcdc9..e32b930bb592 100644 --- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -1,77 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include -#include +#include #include #include #include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { namespace { -template -void LayerNormKernelImplInternal( - const Tensor& X, - const Tensor& gamma, - const Tensor& beta, - int64_t M, - int64_t N, - T eps, - Tensor* Y, - Tensor* mean, - Tensor* rstd) { - using T_ACC = vec::vec_scalar_t; - using Vec = vec::Vectorized; - DCHECK_EQ(X.numel(), M * N); - DCHECK(!gamma.defined() || gamma.numel() == N); - DCHECK(!beta.defined() || beta.numel() == N); - const T* X_data = X.data_ptr(); - const T* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const T* beta_data = beta.defined() ? 
beta.data_ptr() : nullptr; - T* Y_data = Y->data_ptr(); - T* mean_data = mean->data_ptr(); - T* rstd_data = rstd->data_ptr(); - const bool gamma_null = gamma_data == nullptr; - const bool beta_null = beta_data == nullptr; - at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { - for (const auto i : c10::irange(start, end)) { - const T* X_ptr = X_data + i * N; - T* Y_ptr = Y_data + i * N; - T mean_val; - T rstd_val; - std::tie(mean_val, rstd_val) = utils::RowwiseMoments(X_ptr, N); - rstd_val = T(1) / std::sqrt(rstd_val + eps); - const T_ACC scale = rstd_val; - const T_ACC bias = -rstd_val * mean_val; - if (gamma_null || beta_null) { - for (const auto j : c10::irange(N)) { - const T gamma_v = gamma_null ? T(1) : gamma_data[j]; - const T beta_v = beta_null ? T(0) : beta_data[j]; - Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v; +template +struct LayerNormKernelImplInternal { + constexpr static void apply( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + T eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + using Vec = vec::Vectorized; + const T* X_data = X.data_ptr(); + const T* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + T* Y_data = Y->data_ptr(); + T* mean_data = mean ? mean->data_ptr() : nullptr; + T* rstd_data = rstd ? rstd->data_ptr() : nullptr; + const bool gamma_null = gamma_data == nullptr; + const bool beta_null = beta_data == nullptr; + const bool mean_null = mean_data == nullptr; + const bool rstd_null = rstd_data == nullptr; + at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { + for (const auto i : c10::irange(start, end)) { + const T* X_ptr = X_data + i * N; + T* Y_ptr = Y_data + i * N; + T mean_val; + T rstd_val; + std::tie(mean_val, rstd_val) = utils::RowwiseMoments(X_ptr, N); + rstd_val = T(1) / std::sqrt(rstd_val + eps); + const T scale = rstd_val; + const T bias = -rstd_val * mean_val; + if (gamma_null || beta_null) { + for (const auto j : c10::irange(N)) { + const T gamma_v = gamma_null ? T(1) : gamma_data[j]; + const T beta_v = beta_null ? T(0) : beta_data[j]; + Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v; + } + } else { + vec::map3( + [scale, bias](Vec x, Vec gamma, Vec beta) { + return (x * Vec(scale) + Vec(bias)) * gamma + beta; + }, + Y_ptr, + X_ptr, + gamma_data, + beta_data, + N); + } + if (!mean_null) { + mean_data[i] = mean_val; + } + if (!rstd_null) { + rstd_data[i] = rstd_val; } - } else { - vec::map3( - [scale, bias](Vec x, Vec gamma, Vec beta) { - return (x * Vec(scale) + Vec(bias)) * gamma + beta; - }, - Y_ptr, - X_ptr, - gamma_data, - beta_data, - N); } - mean_data[i] = mean_val; - rstd_data[i] = rstd_val; - } - }); -} + }); + } +}; + +template +struct LayerNormKernelImplInternal { + constexpr static void apply( + const Tensor& X, + const Tensor& gamma, + const Tensor& beta, + int64_t M, + int64_t N, + BFloat16 eps, + Tensor* Y, + Tensor* mean, + Tensor* rstd) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + const BFloat16* X_data = X.data_ptr(); + const T_ACC* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T_ACC* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + BFloat16* Y_data = Y->data_ptr(); + T_ACC* mean_data = mean ? mean->data_ptr() : nullptr; + T_ACC* rstd_data = rstd ? 
rstd->data_ptr() : nullptr; + const bool gamma_null = gamma_data == nullptr; + const bool beta_null = beta_data == nullptr; + const bool mean_null = mean_data == nullptr; + const bool rstd_null = rstd_data == nullptr; + + // pre convert `gamma` and `beta` to float when they are both defined + const bool pre_convert_gamma_beta = !gamma_null && !beta_null; + + at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { + // temp buffer holding input, gamma/beta (if defined) in float + // + // pre convert input slice to float has 2 benefits: + // a. Welford algorithm involves more arithmetic operations, + // this will reduce rounding error and improve performance. + // b. The input slice (float) can be reused when updating + // corresponding output slice. + // + int64_t buffer_size = pre_convert_gamma_beta ? 3 * N : N; + std::unique_ptr buffer(new float[buffer_size]); + float* input_buffer_ptr = buffer.get(); + float* gamma_buffer_ptr = nullptr; + float* beta_buffer_ptr = nullptr; + if (pre_convert_gamma_beta) { + gamma_buffer_ptr = buffer.get() + N; + beta_buffer_ptr = buffer.get() + 2 * N; + vec::convert(gamma_data, gamma_buffer_ptr, N); + vec::convert(beta_data, beta_buffer_ptr, N); + } + + for (const auto i : c10::irange(start, end)) { + const BFloat16* X_ptr = X_data + i * N; + BFloat16* Y_ptr = Y_data + i * N; + vec::convert(X_ptr, input_buffer_ptr, N); + + float mean_val; + float rstd_val; + std::tie(mean_val, rstd_val) = utils::RowwiseMoments(input_buffer_ptr, N); + rstd_val = float(1) / std::sqrt(rstd_val + eps); + const float scale = rstd_val; + const float bias = -rstd_val * mean_val; + if (gamma_null || beta_null) { + for (const auto j : c10::irange(N)) { + const float gamma_v = gamma_null ? float(1) : float(gamma_data[j]); + const float beta_v = beta_null ? 
float(0) : float(beta_data[j]); + Y_ptr[j] = (input_buffer_ptr[j] * scale + bias) * gamma_v + beta_v; + } + } else { + int64_t d = 0; + for (; d < N - (N % bVec::size()); d += bVec::size()) { + fVec x_fvec0 = fVec::loadu(input_buffer_ptr + d); + fVec x_fvec1 = fVec::loadu(input_buffer_ptr + d + fVec::size()); + fVec gamma_fvec0 = fVec::loadu(gamma_buffer_ptr + d); + fVec gamma_fvec1 = fVec::loadu(gamma_buffer_ptr + d + fVec::size()); + fVec beta_fvec0 = fVec::loadu(beta_buffer_ptr + d); + fVec beta_fvec1 = fVec::loadu(beta_buffer_ptr + d + fVec::size()); + fVec y_fvec0 = (x_fvec0 * fVec(scale) + fVec(bias)) * gamma_fvec0 + beta_fvec0; + fVec y_fvec1 = (x_fvec1 * fVec(scale) + fVec(bias)) * gamma_fvec1 + beta_fvec1; + bVec y_bvec = convert_float_bfloat16(y_fvec0, y_fvec1); + y_bvec.store(Y_ptr + d); + } + for (; d < N; d++) { + Y_ptr[d] = (input_buffer_ptr[d] * scale + bias) * gamma_data[d] + beta_data[d]; + } + } + if (!mean_null) { + mean_data[i] = T_ACC(mean_val); + } + if (!rstd_null) { + rstd_data[i] = T_ACC(rstd_val); + } + } + }); + } +}; void LayerNormKernelImpl( const Tensor& X, @@ -83,10 +189,21 @@ void LayerNormKernelImpl( Tensor* Y, Tensor* mean, Tensor* rstd) { + DCHECK_EQ(X.numel(), M * N); + DCHECK(!gamma.defined() || gamma.numel() == N); + DCHECK(!beta.defined() || beta.numel() == N); AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, X.scalar_type(), "LayerNormKernelImpl", [&]() { - LayerNormKernelImplInternal( - X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + using accscalar_t = at::acc_type; + const bool mixed_type = is_mixed_type(X, gamma, beta); + if (mixed_type) { + check_mixed_data_type(X, gamma, beta); + LayerNormKernelImplInternal::apply( + X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + } else { + LayerNormKernelImplInternal::apply( + X, gamma, beta, M, N, static_cast(eps), Y, mean, rstd); + } }); } diff --git a/aten/src/ATen/native/cpu/mixed_data_type.h b/aten/src/ATen/native/cpu/mixed_data_type.h new file mode 100644 index 000000000000..6964dd5fa71d --- /dev/null +++ b/aten/src/ATen/native/cpu/mixed_data_type.h @@ -0,0 +1,41 @@ +#pragma once + +#include + +namespace at { namespace native { + +inline ScalarType first_type() { + return ScalarType::Undefined; +} + +template +inline ScalarType first_type(const Tensor& arg, const Args&... parameters) { + return arg.defined() ? arg.scalar_type() : first_type(parameters...); +} + +template +inline bool is_mixed_type(const Tensor& input, const Args&... parameters) { + const auto parameter_type = first_type(parameters...); + return ((parameter_type != ScalarType::Undefined) && + (parameter_type != input.scalar_type())); +} + +// currently on CPU, mixed data type is only supported +// when input is 'BFloat16' and parameters are 'Float' +inline void check_mixed_data_type(const Tensor& input) { + TORCH_CHECK(input.scalar_type() == ScalarType::BFloat16, + "mixed dtype (CPU): expect input to have scalar type of BFloat16"); +} + +template +inline void check_mixed_data_type(const Tensor& input, const Tensor& parameter, const Args&... parameters) { + TORCH_CHECK(!parameter.defined() || parameter.scalar_type() == ScalarType::Float, + "mixed dtype (CPU): expect parameter to have scalar type of Float"); + check_mixed_data_type(input, parameters...); +} + +inline ScalarType param_scalar_type(const Tensor& t, bool is_mixed_type) { + return is_mixed_type ? 
ScalarType::Float : t.scalar_type(); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h index 136479e2a0d4..5c607f06b3a5 100644 --- a/aten/src/ATen/native/cpu/utils.h +++ b/aten/src/ATen/native/cpu/utils.h @@ -37,6 +37,30 @@ inline bool data_index_step(T& x, const T& X, Args&&... args) { return false; } +// Helper struct for bfloat16 vectorization +// Useful when you need float as immediate dtype or accumulate dtype +using namespace vec; +struct Vec2 { + Vectorized val0, val1; + Vec2(Vectorized v0, Vectorized v1) : val0(v0), val1(v1) {} + Vec2(float v) : val0(v), val1(v) {} + static Vec2 loadu(const BFloat16* ptr) { + Vectorized v0, v1; + std::tie(v0, v1) = convert_bfloat16_float(Vectorized::loadu(ptr)); + return {v0, v1}; + } + void store(BFloat16* ptr) const { + Vectorized val = convert_float_bfloat16(val0, val1); + val.store(ptr); + } +}; +inline Vec2 operator+(const Vec2& a, const Vec2& b) { return {a.val0 + b.val0, a.val1 + b.val1}; } +inline Vec2 operator*(const Vec2& a, const Vec2& b) { return {a.val0 * b.val0, a.val1 * b.val1}; } + +template struct VectorizedType { using type = Vectorized; }; +template <> struct VectorizedType { using type = Vec2; }; +template using VecType = typename VectorizedType::type; + } // namespace namespace utils { diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index 0017b6a16d81..3f3971e6e76e 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -94,7 +94,7 @@ constexpr double real_impl , double> (c10::complex } template -constexpr VALUE_TYPE imag_impl (SCALAR_TYPE z) { +constexpr VALUE_TYPE imag_impl (SCALAR_TYPE /*z*/) { return 0; } @@ -123,6 +123,11 @@ inline TYPE conj_impl (TYPE z) { return z; //No-Op } +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex{z.real(), -z.imag()}; +} + template<> inline c10::complex conj_impl > (c10::complex z) { return c10::complex(z.real(), -z.imag()); diff --git a/aten/src/ATen/native/cuda/AbsKernel.cu b/aten/src/ATen/native/cuda/AbsKernel.cu index 3bfc2621d930..19b12354cc48 100644 --- a/aten/src/ATen/native/cuda/AbsKernel.cu +++ b/aten/src/ATen/native/cuda/AbsKernel.cu @@ -1,6 +1,7 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include #include #include #include @@ -14,12 +15,37 @@ struct AbsFunctor { } }; +const char abs_name[] = "abs_kernel"; void abs_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, iter.dtype(), "abs_cuda", [&]() { - gpu_kernel(iter, AbsFunctor()); - }); + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto abs_string = jiterator_stringify( + template T abs_kernel(T x) { return std::abs(x); }); + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/abs_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, abs_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_cuda", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel(iter, AbsFunctor()); + }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND3( + ScalarType::Half, + ScalarType::BFloat16, + ScalarType::Bool, + iter.dtype(), + "abs_cuda", + [&]() { gpu_kernel(iter, AbsFunctor()); }); + } } -REGISTER_DISPATCH(abs_stub, &abs_kernel_cuda); + REGISTER_DISPATCH(abs_stub, &abs_kernel_cuda); }} // namespace at::native diff --git 
a/aten/src/ATen/native/cuda/Activation.cpp b/aten/src/ATen/native/cuda/Activation.cpp index 2dfe0a862ea4..55b397ca77f4 100644 --- a/aten/src/ATen/native/cuda/Activation.cpp +++ b/aten/src/ATen/native/cuda/Activation.cpp @@ -1,9 +1,27 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include +#include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { // ----------------------------------- @@ -156,15 +174,15 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te } TORCH_IMPL_FUNC(gelu_out_cuda) ( - const Tensor& /*self*/, const Tensor& /*result*/ - ) { - GeluCUDAKernelImpl(*this); + const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*result*/ +) { + GeluCUDAKernelImpl(*this, get_gelutype_enum(approximate)); } TORCH_IMPL_FUNC(gelu_backward_out_cuda) ( - const Tensor& /*grad*/, const Tensor& /*self*/, const Tensor& /*grad_input*/ - ) { - GeluBackwardCUDAKernelImpl(*this); + const Tensor& /*grad*/, const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*grad_input*/ +) { + GeluBackwardCUDAKernelImpl(*this, get_gelutype_enum(approximate)); } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 168e142dd291..4f8e9b005552 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -35,6 +35,31 @@ void glu_kernel(TensorIteratorBase& iter) { }); } +// ----------------------------------- +// glu forward ad +// ----------------------------------- +void glu_jvp_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "glu_cuda", [&]() { + using acc_t = at::acc_type; + gpu_kernel(iter, [] GPU_LAMBDA ( + scalar_t res_, + scalar_t b_, + scalar_t da_, + scalar_t db_) -> scalar_t { + const acc_t res = res_; + const acc_t b = b_; + const acc_t da = da_; + const acc_t db = db_; + const acc_t one = acc_t(1); + + const acc_t sig_b = one / (one + std::exp(-b)); + return ( + da * sig_b + res * (db - sig_b * db) + ); + }); + }); +} + // ----------------------------------- // glu backward // ----------------------------------- @@ -107,11 +132,12 @@ void launch_glu_backward_kernel(const TensorIteratorBase& iter, void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.common_dtype(), "log_sigmoid_forward_cuda", [&] { - using acc_t = acc_type; + using opmath_t = at::opmath_type; + gpu_kernel(iter, [] GPU_LAMBDA (scalar_t in_) -> scalar_t { - const acc_t in = in_; - const auto min = std::min(acc_t(0), in); + const opmath_t in = in_; + const auto min = std::min(opmath_t(0), in); const auto z = std::exp(-std::abs(in)); return min - std::log1p(z); }); @@ -125,17 +151,17 @@ void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter) { void log_sigmoid_backward_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.common_dtype(), "log_sigmoid_backward_cuda", [&] { - using acc_t = acc_type; + using opmath_t = at::opmath_type; gpu_kernel(iter, [] GPU_LAMBDA (scalar_t in_, scalar_t grad_out_) -> scalar_t { - const acc_t in = in_; - const acc_t grad_out = grad_out_; + const opmath_t in = in_; + const opmath_t grad_out = grad_out_; - auto in_negative = in < acc_t(0); - auto max_deriv = in_negative ? acc_t(1) : acc_t(0); - auto sign = in_negative ? 
acc_t(1) : -acc_t(1); + auto in_negative = in < opmath_t(0); + auto max_deriv = in_negative ? opmath_t(1) : opmath_t(0); + auto sign = in_negative ? opmath_t(1) : -opmath_t(1); const auto z = std::exp(-std::abs(in)); - return grad_out * (max_deriv - sign * (z / (acc_t(1) + z))); + return grad_out * (max_deriv - sign * (z / (opmath_t(1) + z))); }); }); } @@ -368,54 +394,101 @@ static void threshold_kernel_cuda(TensorIteratorBase& iter, const Scalar& thresh void elu_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "elu_cuda", [&]() { - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); + using opmath_t = at::opmath_type; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); gpu_kernel(iter, [negcoef, poscoef, negiptcoef]GPU_LAMBDA(scalar_t a) -> scalar_t { - return a > scalar_t(0) ? a * poscoef : (static_cast(std::exp(a * negiptcoef)) - scalar_t(1.)) * negcoef; + opmath_t aop = static_cast(a); + return aop > 0 ? aop * poscoef : std::expm1(aop * negiptcoef) * negcoef; }); }); } void elu_backward_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, bool is_result) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "elu_backward_cuda", [&]() { - auto negcoef = alpha.to() * scale.to(); - auto poscoef = scale.to(); - auto negiptcoef = input_scale.to(); + using opmath_t = at::opmath_type; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); gpu_kernel(iter, [negcoef, poscoef, negiptcoef, is_result]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + opmath_t aop = static_cast(a); + opmath_t bop = static_cast(b); + if (is_result) { - return b <= scalar_t(0) ? a * negiptcoef * (b + negcoef) : a * poscoef; + return bop <= 0 ? aop * negiptcoef * (bop + negcoef) : aop * poscoef; } else { - return b <= scalar_t(0) ? a * negiptcoef * negcoef * (static_cast(std::exp(b * negiptcoef))) : a * poscoef; + return bop <= 0 ? 
aop * negiptcoef * negcoef * std::exp(bop * negiptcoef) : aop * poscoef; } }); }); } -void GeluCUDAKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { - return static_cast(x) * - c10::cuda::compat::normcdf(static_cast(x)); +void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_cube = static_cast(x) * static_cast(x) * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + return opmath_t(0.5) * static_cast(x) * (opmath_t(1) + c10::cuda::compat::tanh(inner)); + }); }); - }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kAlpha = M_SQRT1_2; + return static_cast(x) * opmath_t(0.5) * (opmath_t(1) + ::erf(static_cast(x) * kAlpha)); + }); + }); + } } -void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, - it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - constexpr T_ACC kBeta = M_2_SQRTPI * M_SQRT1_2 * T_ACC(0.5); - const T_ACC cdf = c10::cuda::compat::normcdf(static_cast(x)); - const T_ACC pdf = - c10::cuda::compat::exp( - T_ACC(-0.5) * static_cast(x) * static_cast(x)) * - kBeta; - return static_cast(dy) * (cdf + static_cast(x) * pdf); +void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_sq = static_cast(x) * static_cast(x); + auto x_cube = x_sq * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + auto tanh_inner = c10::cuda::compat::tanh(inner); + + auto left = opmath_t(0.5) * static_cast(x); + auto right = opmath_t(1) + tanh_inner; + + auto left_derivative = 0.5 * right; + + auto tanh_derivative = opmath_t(1) - tanh_inner * tanh_inner; + auto inner_derivative = kBeta * (opmath_t(1) + opmath_t(3) * kKappa * x_sq); + auto right_derivative = left * tanh_derivative * inner_derivative; + + return static_cast(dy) * (left_derivative + right_derivative); }); }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_2_SQRTPI * M_SQRT1_2 * opmath_t(0.5); + constexpr opmath_t kAlpha = M_SQRT1_2; + const opmath_t cdf = + opmath_t(0.5) * (opmath_t(1) + 
::erf(static_cast(x) * kAlpha)); + const opmath_t pdf = + c10::cuda::compat::exp( + opmath_t(-0.5) * static_cast(x) * static_cast(x)) * + kBeta; + return static_cast(dy) * (cdf + static_cast(x) * pdf); + }); + }); + } } namespace { @@ -594,6 +667,7 @@ REGISTER_DISPATCH(shrink_backward_stub, &shrink_backward_kernel); REGISTER_DISPATCH(elu_stub, &elu_kernel); REGISTER_DISPATCH(elu_backward_stub, &elu_backward_kernel); REGISTER_DISPATCH(glu_stub, &glu_kernel); +REGISTER_DISPATCH(glu_jvp_stub, &glu_jvp_kernel); REGISTER_DISPATCH(leaky_relu_stub, &leaky_relu_kernel); REGISTER_DISPATCH(leaky_relu_backward_stub, &leaky_relu_backward_kernel); REGISTER_DISPATCH(hardswish_stub, &hardswish_kernel); diff --git a/aten/src/ATen/native/cuda/Activation.h b/aten/src/ATen/native/cuda/Activation.h index 5e798316c9bc..5fc52ff257ce 100644 --- a/aten/src/ATen/native/cuda/Activation.h +++ b/aten/src/ATen/native/cuda/Activation.h @@ -1,4 +1,5 @@ - +#pragma once +#include #include namespace at { @@ -24,7 +25,7 @@ void launch_prelu_cuda_backward_kernel_multi_weights( const TensorBase &input, const TensorBase &weight, const TensorBase &grad_out, const TensorBase &input_grad, const TensorBase &weight_grad_collector); -void GeluCUDAKernelImpl(TensorIteratorBase& it); -void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it); +void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); +void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index ebb2e1691107..55b0d3322e04 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -1,13 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu index 4581fa3bf7eb..ec71b37015fb 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -1,12 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu index d1d3c079b0ad..5b46fb9c34a5 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu @@ -1,13 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu index c2aa9adeee5b..baafc6c56d46 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu @@ -1,13 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS 
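// --- illustrative sketch ---------------------------------------------------------------
// The include rewrites applied to these pooling kernels all follow one pattern: assert
// that only method operators are used, then pull in per-operator headers when
// AT_PER_OPERATOR_HEADERS is defined and fall back to the monolithic headers otherwise.
// The header names are stripped in this rendering, so the pattern is sketched here; the
// <ATen/ops/...> names are assumptions for illustration and differ per file.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>        // monolithic aggregate headers
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>                        // assumed: only the ops this file calls
#include <ATen/ops/adaptive_max_pool3d_native.h>   // assumed
#endif
// ---------------------------------------------------------------------------------------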
+#include #include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index c89d8a09e8d1..276f320bb199 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -1,8 +1,9 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #define _USE_MATH_DEFINES #include -#include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index 883228ecc45d..55632014a0de 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -1,6 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -8,6 +10,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index 29ba71d6acd5..ce395a4ad044 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -1,5 +1,8 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -9,6 +12,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu deleted file mode 100644 index 56d6b0acd728..000000000000 --- a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu +++ /dev/null @@ -1,37 +0,0 @@ -#define TORCH_ASSERT_NO_OPERATORS -#include -#include -#include -#include -#include -#include - -// NOTE: CUDA on Windows requires that the enclosing function -// of a __device__ lambda not have internal linkage. 
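// --- illustrative sketch ---------------------------------------------------------------
// A recurring change in these kernels is switching the intermediate math type from
// acc_type<scalar_t, true> to at::opmath_type<scalar_t>, which selects float for
// Half/BFloat16 inputs and the input type otherwise: promote on load, compute in opmath
// precision, narrow on store. The deleted add/sub kernel below already has this shape;
// here is a minimal sketch of the pattern with an invented functor and kernel name
// (assumes the usual ATen/OpMathType.h and ATen/native/cuda/Loops.cuh includes).
template <typename scalar_t>
struct ScaleShiftFunctor {
  using opmath_t = at::opmath_type<scalar_t>;
  opmath_t scale_, shift_;
  ScaleShiftFunctor(opmath_t scale, opmath_t shift) : scale_(scale), shift_(shift) {}
  __device__ __forceinline__ scalar_t operator()(scalar_t x) const {
    // promote, do the arithmetic in opmath precision, narrow on the way out
    return static_cast<scalar_t>(static_cast<opmath_t>(x) * scale_ + shift_);
  }
};

void scale_shift_kernel_cuda(TensorIteratorBase& iter, const Scalar& scale, const Scalar& shift) {
  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "scale_shift_cuda", [&]() {
    using opmath_t = at::opmath_type<scalar_t>;
    gpu_kernel(iter, ScaleShiftFunctor<scalar_t>(scale.to<opmath_t>(), shift.to<opmath_t>()));
  });
}
// ---------------------------------------------------------------------------------------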
- -namespace at { namespace native { - -template -struct AddFunctor { - AddFunctor(T alpha) : alpha_(alpha) {} - T alpha_; - __device__ __forceinline__ T operator()(T a, T b) const __ubsan_ignore_undefined__ { - return a + b * alpha_; - } -}; - -void add_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { - using opmath_t = at::opmath_type; - opmath_gpu_kernel_with_scalars(iter, AddFunctor(alpha_scalar.to())); - }); -} - -static void sub_kernel_cuda(TensorIteratorBase& iter, const Scalar& alpha_scalar) { - add_kernel_cuda(iter, -alpha_scalar); -} - -REGISTER_DISPATCH(add_stub, &add_kernel_cuda); -REGISTER_DISPATCH(sub_stub, &sub_kernel_cuda); - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index c1bc53594a20..3a8ab02e3a54 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -10,13 +11,39 @@ namespace at { namespace native { +const char logical_and_name[] = "logical_and_kernel"; void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, - iter.common_dtype(), "logical_and_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a && b; + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto logical_and_string = jiterator_stringify( + template + T logical_and_kernel(T a, T b) { + return a && b; + } + ); // logical_and_string + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_and_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ logical_and_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, logical_and_string); + }); // logical_and_string +#else + AT_DISPATCH_COMPLEX_TYPES(dtype, "logical_and_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return a && b; + }); }); - }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, + dtype, "logical_and_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return a && b; + }); + }); + } } void logical_or_kernel_cuda(TensorIterator& iter) { diff --git a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu index f4b618ec283f..844388e61094 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include @@ -7,6 +8,7 @@ #include #include #include +#include // NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. 
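// --- illustrative sketch ---------------------------------------------------------------
// The complex-dtype paths in this file (sigmoid_backward, tanh_backward), like the
// earlier abs, logical_and, div and mul changes, all use the same jiterator recipe: keep
// the kernel body as a string, JIT-compile it at runtime when AT_USE_JITERATOR() is
// enabled, and fall back to a plain gpu_kernel otherwise. A minimal sketch of the recipe
// with an invented unary "square" kernel (the real kernels follow in the hunks below):
const char square_name[] = "square_kernel";
void square_kernel_cuda(TensorIteratorBase& iter) {
  auto dtype = iter.common_dtype();
#if AT_USE_JITERATOR()
  static const auto square_string = jiterator_stringify(
      template <typename T> T square_kernel(T x) { return x * x; });
  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "square_cuda", [&]() {
    jitted_gpu_kernel<
        /*name=*/square_name,       // must match the function name inside the string
        /*return_dtype=*/scalar_t,
        /*common_dtype=*/scalar_t,
        /*arity=*/1>(iter, square_string);
  });
#else
  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "square_cuda", [&]() {
    gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) -> scalar_t {
      using opmath_t = at::opmath_type<scalar_t>;  // complex<float> for complex<Half>
      const opmath_t xop = static_cast<opmath_t>(x);
      return static_cast<scalar_t>(xop * xop);
    });
  });
#endif
}
// ---------------------------------------------------------------------------------------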
@@ -14,15 +16,37 @@ namespace at { namespace native { +const char sigmoid_backward_name[] = "sigmoid_backward"; void sigmoid_backward_kernel_cuda(TensorIteratorBase& iter) { - if(isComplexType(iter.dtype())) { - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sigmoid_backward_cuda", [&]() { + auto dtype = iter.dtype(); + if(isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto sigmoid_backward_string = jiterator_stringify( + template + T sigmoid_backward(T a, T b) { + return a * std::conj((T{1.} - b) * b); + } + ); // sigmoid_backward_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sigmoid_backward_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ sigmoid_backward_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, sigmoid_backward_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sigmoid_backward_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * std::conj((scalar_t{1.} - b) * b); + using comp_t = at::opmath_type; + const auto one = comp_t{1.}; + const auto comp_b = static_cast(b); + const auto comp_a = static_cast(a); + return static_cast(comp_a * std::conj((one - comp_b) * comp_b)); }); }); +#endif } else { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "sigmoid_backward_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "sigmoid_backward_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { return a * (scalar_t(1.) - b) * b; }); @@ -63,15 +87,37 @@ void logit_backward_kernel_cuda(TensorIteratorBase& iter, const Scalar& eps_scal }); } +const char tanh_backward_name[] = "tanh_backward"; void tanh_backward_kernel_cuda(TensorIteratorBase& iter) { - if(isComplexType(iter.dtype())) { - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "tanh_backward_complex_cuda", [&]() { + auto dtype = iter.dtype(); + if(isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto tanh_backward_string = jiterator_stringify( + template + T tanh_backward(T a, T b) { + return a * std::conj(T{1.} - b * b); + } + ); // tanh_backward_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "tanh_backward_complex_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ tanh_backward_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 2>(iter, tanh_backward_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "tanh_backward_complex_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * std::conj(scalar_t{1.} - b * b); + using comp_t = at::opmath_type; + const auto one = comp_t{1.}; + const auto comp_b = static_cast(b); + const auto comp_a = static_cast(a); + return static_cast(comp_a * std::conj(one - comp_b * comp_b)); }); }); +#endif } else { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "tanh_backward_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, dtype, "tanh_backward_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { return a * (scalar_t{1.} - b * b); }); diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index f72ddfa4bfe2..703436a1d495 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -32,7 +32,7 @@ void 
huber_kernel_cuda(TensorIterator& iter, double delta) { }); } -void mse_kernel_cuda(TensorIterator& iter) { +void mse_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "mse_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { auto diff = a - b; diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu index aef5600c640e..bb34c8f85366 100644 --- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -38,13 +39,30 @@ struct MulFunctor { } }; - +const char div_name[] = "div_kernel"; void div_true_kernel_cuda(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (iter.common_dtype() == kComplexHalf) { + using scalar_t = c10::complex; + #if AT_USE_JITERATOR() + static const auto div_string = jiterator_stringify( + template + T div_kernel(T a, T b) { + return a / b; + } + ); + opmath_jitted_gpu_kernel_with_scalars(iter, div_string); + #else + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, DivFunctor()); + #endif + return; + } if (iter.is_cpu_scalar(2)) { // optimization for floating-point types: if the second operand is a CPU // scalar, compute a * reciprocal(b). Note that this may lose one bit of // precision compared to computing the division. - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, common_dtype, "div_true_cuda", [&]() { using opmath_t = at::opmath_type; auto inv_b = opmath_t(1.0) / iter.scalar_value(2); iter.remove_operand(2); @@ -52,7 +70,7 @@ void div_true_kernel_cuda(TensorIteratorBase& iter) { MulFunctor(), inv_b)); }); } else { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_true_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, common_dtype, "div_true_cuda", [&]() { DivFunctor f; gpu_kernel_with_scalars(iter, f); }); @@ -171,11 +189,29 @@ void div_floor_kernel_cuda(TensorIteratorBase& iter) { } } +const char mul_name[] = "mul_kernel"; void mul_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { - using opmath_t = at::opmath_type; - opmath_gpu_kernel_with_scalars(iter, MulFunctor()); - }); + auto common_dtype = iter.common_dtype(); + if (common_dtype == kComplexHalf) { + using scalar_t = c10::complex; + #if AT_USE_JITERATOR() + static const auto mul_string = jiterator_stringify( + template + T mul_kernel(T a, T b) { + return a * b; + } + ); + opmath_jitted_gpu_kernel_with_scalars(iter, mul_string); + #else + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, MulFunctor()); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, MulFunctor()); + }); + } } REGISTER_DISPATCH(div_true_stub, &div_true_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu index 7f22ace666f2..d6bd145c4f50 100644 --- a/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu @@ -12,47 
+12,21 @@ namespace at { namespace native { void lshift_kernel_cuda(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || - iter.dtype() == ScalarType::Double || - iter.dtype() == ScalarType::Half || - iter.dtype() == ScalarType::BFloat16) { - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "lshift_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * std::pow(static_cast(2), b); - }); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cuda", [&]() { + gpu_kernel_with_scalars(iter, + []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast>(a) << b; }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cuda", [&]() { - gpu_kernel_with_scalars(iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return static_cast>(a) << b; - }); - }); - } + }); } void rshift_kernel_cuda(TensorIteratorBase& iter) { - if (iter.dtype() == ScalarType::Float || - iter.dtype() == ScalarType::Double || - iter.dtype() == ScalarType::Half || - iter.dtype() == ScalarType::BFloat16) { - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "rshift_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a / std::pow(static_cast(2), b); - }); - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cuda", [&]() { - gpu_kernel_with_scalars(iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a >> b; - }); + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cuda", [&]() { + gpu_kernel_with_scalars(iter, + []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a >> b; }); - } + }); } REGISTER_DISPATCH(lshift_stub, &lshift_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 2317f072b8cc..3ca9814175c5 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -1,9 +1,35 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -90,7 +116,29 @@ c10::MaybeOwned prepare_batch_matrix_for_cublas(const Tensor& tensor, bo namespace { -Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) { +enum class Activation { + None, + RELU, + GELU, +}; + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) { + switch (a) { + case Activation::None: + return cuda::blas::GEMMAndBiasActivationEpilogue::None; + case Activation::RELU: + return cuda::blas::GEMMAndBiasActivationEpilogue::RELU; + case Activation::GELU: + return cuda::blas::GEMMAndBiasActivationEpilogue::GELU; + default: + TORCH_CHECK(false); + return cuda::blas::GEMMAndBiasActivationEpilogue::None; + } +} +#endif + +Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None) { // Make sure to keep addmm_cuda below in sync with this code; it // preflights a check to 
try to avoid actually needing to call // expand(). @@ -102,9 +150,36 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma IntArrayRef mat1_sizes = mat1.sizes(); IntArrayRef mat2_sizes = mat2.sizes(); IntArrayRef self__sizes; + bool useLtInterface = false; + at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. + // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] + // is to use lt interface only when self is bias. + // for cuda 11.4, cublasLtMatmul is activated + // the last two conditions is to skip 16b transA and non-trans-B having + // leading dim >> rows when they are sliced from a large tensor + // see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul + useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && + result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && + self.is_contiguous() && + (scalar_type == at::ScalarType::Double || + scalar_type == at::ScalarType::Float || + scalar_type == at::ScalarType::Half || + scalar_type == at::ScalarType::BFloat16) && + mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && + mat2_sizes[0] < 65535 && mat2_sizes[1] < 65535 && + mat1_sizes[0] < 65535 && mat1_sizes[1] < 65535 && + // avoid leaing dim >> rows bugs + ((mat1.strides()[0]==1 && mat1.strides()[1]==mat1_sizes[0]) || (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || (scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16)) && + ((mat2.strides()[0]==1 && mat2.strides()[1]==mat2_sizes[0]) || (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || (scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16)); +#endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } self__sizes = self_->sizes(); } else { self_ = c10::MaybeOwned::borrowed(self); @@ -115,8 +190,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma } if (&result != &self) { - at::native::resize_output(result, self__sizes); - if (beta.toComplexDouble() != 0.0) { + at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); + if (beta.toComplexDouble() != 0.0 && !useLtInterface) { at::native::copy_(result, *self_); } } @@ -147,7 +222,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma int64_t mat1_ld = mat1_->stride((transpose_mat1 == transpose_result) ? 1 : 0); int64_t mat2_ld = mat2_->stride((transpose_mat2 == transpose_result) ? 1 : 0); int64_t result_ld = result_->stride(transpose_result ? 0 : 1); - at::ScalarType scalar_type = self_->scalar_type(); if (mat1.numel() == 0) { // By definition, when beta==0, values in self should be ignored. 
nans and infs @@ -170,24 +244,92 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj()); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] { - using opmath_t = at::opmath_type; - opmath_t alpha_val = alpha.to(); - opmath_t beta_val = beta.to(); - scalar_t* mat1_ptr = mat1_->data_ptr(); - scalar_t* mat2_ptr = mat2_->data_ptr(); - scalar_t* result_ptr = result_->data_ptr(); - at::cuda::blas::gemm( - transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n', - transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n', - m, n, k, - alpha_val, - mat1_ptr, mat1_ld, - mat2_ptr, mat2_ld, - beta_val, - result_ptr, result_ld - ); - }); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + if (useLtInterface) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda_lt", + [&] { + at::cuda::blas::gemm_and_bias( + transpose_mat1, + transpose_mat2, + m, + n, + k, + alpha.to>(), + mat1_->data_ptr(), + mat1_ld, + mat2_->data_ptr(), + mat2_ld, + self.data_ptr(), + result_->data_ptr(), + result_ld, +#if 0 + activation_to_gemm_and_blas_arg(activation) +#else + // GELU is not supported (and does not compile!) prior + // to CUDA 11.4. Have observed accuracy issues with + // GELU epilogue in 11.4; disabling the GELU epilogue + // path until we confirm which version it's working in. + activation != Activation::GELU + ? activation_to_gemm_and_blas_arg(activation) + : cuda::blas::GEMMAndBiasActivationEpilogue::None +#endif + ); + }); + } else +#endif + { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda", + [&] { + using opmath_t = at::opmath_type; + opmath_t alpha_val = alpha.to(); + opmath_t beta_val = beta.to(); + scalar_t* mat1_ptr = mat1_->data_ptr(); + scalar_t* mat2_ptr = mat2_->data_ptr(); + scalar_t* result_ptr = result_->data_ptr(); + at::cuda::blas::gemm( + transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n', + transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n', + m, + n, + k, + alpha_val, + mat1_ptr, + mat1_ld, + mat2_ptr, + mat2_ld, + beta_val, + result_ptr, + result_ld); + }); + switch (activation) { + case Activation::RELU: + at::relu_(const_cast(*result_)); + break; + case Activation::GELU: + at::gelu_(const_cast(*result_)); + break; + default: break; + } + } + +// Preprocessor gate here needs to match the inverse of the check +// gating activation_to_gemm_and_blas_arg above; here we are manually +// performing a post-GELU because we weren't able to use the GELU +// epilogue above. +#if !0 + if (useLtInterface && activation == Activation::GELU) { + at::gelu_(const_cast(*result_)); + } +#endif + if (!result.is_same(*result_)) { result.copy_(*result_); } @@ -271,6 +413,10 @@ TORCH_IMPL_FUNC(addmm_out_cuda)(const Tensor& self, const Tensor& mat1, const Te addmm_out_cuda_impl(const_cast(result), self, mat1, mat2, beta, alpha); } +TORCH_IMPL_FUNC(addmm_activation_out_cuda)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, bool use_gelu, const Tensor& result) { + addmm_out_cuda_impl(const_cast(result), self, mat1, mat2, beta, alpha, use_gelu ? 
Activation::GELU : Activation::RELU); +} + TORCH_IMPL_FUNC(mm_out_cuda)(const Tensor& self, const Tensor& mat2, const Tensor& result) { addmm_out_cuda_impl(const_cast(result), result, self, mat2, 0, 1); } @@ -457,7 +603,8 @@ TORCH_IMPL_FUNC(addmv_out_cuda)(const Tensor &self, const Tensor &mat, const Ten // Check for contiguity of `vec` and update `vec_stride` accordingly const auto vec_contiguous = vec_stride == 0 ? vec.contiguous() : vec; - vec_stride = vec_contiguous.stride(0); + // A vector can be contiguous and have a stride of zero if it has it is of length 1 + vec_stride = std::max(vec_contiguous.stride(0), 1LL); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "addmv_impl_cuda", [&] { auto beta = beta_.to(); diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index 81f81aa315ce..2a3d5730d786 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -1,10 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh index b5b1cd5c63bc..ae0797f38e8c 100644 --- a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh +++ b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh @@ -71,7 +71,8 @@ static inline void launch_jitted_unrolled_kernel( std::tuple extra_args) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - const int64_t grid = (N + block_work_size() - 1) / block_work_size(); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); static std::mutex _jiterator_mutex; static std::vector fns(c10::cuda::device_count()); @@ -114,9 +115,8 @@ static inline void launch_jitted_unrolled_kernel( // since 7 slots are already filled in `args` args[i + 7] = extra_args_array[i]; } - - at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, {grid, 1u, 1u}, + {num_threads(), 1u, 1u}); } template< @@ -129,7 +129,8 @@ template< static inline void launch_jitted_vectorized_kernel(DeviceIndex dev_idx, int64_t N, const std::string& f, array_t data, at::opmath_type scalar_val, std::tuple extra_args) { TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); - const int64_t grid = (N + block_work_size() - 1) / block_work_size(); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); const int vec_size = memory::jitted_can_vectorize_up_to(data); // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) @@ -195,9 +196,7 @@ at::opmath_type scalar_val, std::tuple extra_args) { // since 3 slots are already filled in `args` args[i + 3] = extra_args_array[i]; } - - at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, {grid, 1u, 1u}, {num_threads(), 1u, 1u}); } else { auto ic = TrivialOffsetCalculator(); auto oc = TrivialOffsetCalculator<1>(); @@ -219,14 +218,25 @@ 
at::opmath_type scalar_val, std::tuple extra_args) { // since 7 slots are already filled in `args` args[i + 7] = extra_args_array[i]; } - at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, num_threads()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, {grid, 1u, 1u}, {num_threads(), 1u, 1u}); } } -template -void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, const bool dynamic_casting, compute_type scalar_val, std::tuple extra_args) { +template < + char const* name, + typename result_type, + typename f_inputs_type, + int arity, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + typename... Args> +void jitted_gpu_kernel_impl( + TensorIteratorBase& iter, + const std::string& f, + const bool dynamic_casting, + at::opmath_type scalar_val, + std::tuple extra_args) { TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); TORCH_INTERNAL_ASSERT(iter.ninputs() == arity); TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); @@ -251,7 +261,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons if (!dynamic_casting) { if (contiguous) { // Case 1: no dynamic casting and contiguous - launch_jitted_vectorized_kernel( + launch_jitted_vectorized_kernel( iter.device().index(), numel, f, data, scalar_val, extra_args); return; } @@ -261,7 +271,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons auto output_offset_calculator = make_output_offset_calculator(iter); auto loader = memory::LoadWithoutCast(); auto storer = memory::StoreWithoutCast(); - launch_jitted_unrolled_kernel( + launch_jitted_unrolled_kernel( iter.device().index(), numel, f, data, input_offset_calculator, output_offset_calculator, loader, storer, contiguous, scalar_val, extra_args); return; @@ -284,7 +294,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons // Case 3: dynamic casting and contiguous auto input_offset_calculator = TrivialOffsetCalculator(); auto output_offset_calculator = TrivialOffsetCalculator<1>(); - launch_jitted_unrolled_kernel( + launch_jitted_unrolled_kernel( iter.device().index(), numel, f, data, input_offset_calculator, output_offset_calculator, loader, storer, contiguous, scalar_val, extra_args); return; @@ -293,7 +303,7 @@ void jitted_gpu_kernel_impl(TensorIteratorBase& iter, const std::string& f, cons // Case 4: dynamic casting and noncontiguous auto input_offset_calculator = make_input_offset_calculator(iter); auto output_offset_calculator = make_output_offset_calculator(iter); - launch_jitted_unrolled_kernel( + launch_jitted_unrolled_kernel( iter.device().index(), numel, f, data, input_offset_calculator, output_offset_calculator, loader, storer, contiguous, scalar_val, extra_args); } diff --git a/aten/src/ATen/native/cuda/CUDAScalar.cu b/aten/src/ATen/native/cuda/CUDAScalar.cu index 242778faa14f..4f2b092573e3 100644 --- a/aten/src/ATen/native/cuda/CUDAScalar.cu +++ b/aten/src/ATen/native/cuda/CUDAScalar.cu @@ -1,5 +1,12 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS #include +#else +#include +#endif #include @@ -8,8 +15,8 @@ namespace native { Scalar _local_scalar_dense_cuda(const Tensor& self) { Scalar r; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, 
kHalf, kBool, kBFloat16, self.scalar_type(), "_local_scalar_dense_cuda", [&] { scalar_t value; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); at::cuda::memcpy_and_sync(&value, self.data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); diff --git a/aten/src/ATen/native/cuda/Col2Im.cu b/aten/src/ATen/native/cuda/Col2Im.cu index f7a63428a56f..5cb825a2e70b 100644 --- a/aten/src/ATen/native/cuda/Col2Im.cu +++ b/aten/src/ATen/native/cuda/Col2Im.cu @@ -1,6 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include #include #include @@ -10,6 +11,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/CompareEQKernel.cu b/aten/src/ATen/native/cuda/CompareEQKernel.cu index 9a82205e2e47..88a22f1fc2b5 100644 --- a/aten/src/ATen/native/cuda/CompareEQKernel.cu +++ b/aten/src/ATen/native/cuda/CompareEQKernel.cu @@ -29,7 +29,7 @@ struct CompareEqFunctor{ } void eq_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "eq_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBFloat16, kBool, iter.common_dtype(), "eq_cuda", [&]() { gpu_kernel_with_scalars(iter, CompareEqFunctor(EqOpType::EQ)); }); } diff --git a/aten/src/ATen/native/cuda/ComplexKernel.cu b/aten/src/ATen/native/cuda/ComplexKernel.cu index 32e60b9b2885..8738c0ab4c8e 100644 --- a/aten/src/ATen/native/cuda/ComplexKernel.cu +++ b/aten/src/ATen/native/cuda/ComplexKernel.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include @@ -11,7 +12,7 @@ namespace native { namespace { void complex_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(0), "complex_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.input_dtype(0), "complex_cuda", [&]() { gpu_kernel( iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> c10::complex { return c10::complex(a, b); diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu index c6144546a992..3d76bcfd30a6 100644 --- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu +++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu @@ -1,12 +1,23 @@ -#include - -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index a8720c7c967e..57f04d481fc5 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include @@ -10,6 +11,12 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -24,8 +31,8 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBool, kBFloat16, dtype, "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kHalf, kBool, kBFloat16, kComplexHalf, dtype, "copy_", [&] { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); } diff --git 
a/aten/src/ATen/native/cuda/CrossKernel.cu b/aten/src/ATen/native/cuda/CrossKernel.cu index e573d6594160..62310347799f 100644 --- a/aten/src/ATen/native/cuda/CrossKernel.cu +++ b/aten/src/ATen/native/cuda/CrossKernel.cu @@ -1,7 +1,9 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include +#include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index ad7ca2ac5a14..9897fbeb51e7 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -1,5 +1,5 @@ -#include #include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/CuFFTUtils.h b/aten/src/ATen/native/cuda/CuFFTUtils.h index 09d561736472..4b02f914d7e2 100644 --- a/aten/src/ATen/native/cuda/CuFFTUtils.h +++ b/aten/src/ATen/native/cuda/CuFFTUtils.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index ac32bfafe1a9..8f0f9b99903a 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -7,7 +9,14 @@ #include #include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif namespace at { namespace native { @@ -442,7 +451,7 @@ void conv_depthwise2d_backward_out( int getGradParamsNumThreads(int batchSize) { //warp per item in a batch, up to a maximum constexpr int MAX_BLOCK_SIZE = 256; - return std::min(batchSize * C10_WARP_SIZE, MAX_BLOCK_SIZE); + return std::min(batchSize * at::cuda::warp_size(), MAX_BLOCK_SIZE); } void conv_depthwise2d_grad_weight_out( @@ -498,8 +507,9 @@ void conv_depthwise2d_grad_weight_out( const auto input_a = input.packed_accessor32(); const auto grad_weight_a = grad_weight.packed_accessor32(); using acc_t = at::acc_type; - TORCH_INTERNAL_ASSERT(block.x % C10_WARP_SIZE == 0); - int smem = (block.x / C10_WARP_SIZE) * sizeof(acc_t); + int warp_size = at::cuda::warp_size(); + TORCH_INTERNAL_ASSERT(block.x % warp_size == 0); + int smem = (block.x / warp_size) * sizeof(acc_t); conv_depthwise2d_grad_weight_kernel<<>>( grad_output_a, input_a, grad_weight_a, batchSize, inputChannels, outputChannels, depthwiseMultiplier, width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); diff --git a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu index 8fbe14b797a7..5859be064bed 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu @@ -1,10 +1,20 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + #include #include #include @@ -596,9 +606,10 @@ std::tuple _depthwise_3d_backward_cuda_out( TORCH_CHECK(padding[i] * 2 + input.size(i + 2) <= int_max, "Padded input tensor is too large."); } - TORCH_CHECK(grad_output_.size(0) * grad_output_.size(2) < int_max - block / C10_WARP_SIZE && - grad_output_.size(3) <= int_max - C10_WARP_SIZE && - grad_output_.size(4) <= int_max - C10_WARP_SIZE, + int64_t warp_size = at::cuda::warp_size(); + TORCH_CHECK(grad_output_.size(0) * grad_output_.size(2) < int_max - block / warp_size && 
+ grad_output_.size(3) <= int_max - warp_size && + grad_output_.size(4) <= int_max - warp_size, "Output size is too large."); DWCONV3D_BACKWARD_WEIGHT_DISPATCH_SPECIALIZATION(1, 1) diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index e651ab80f47b..05a201147241 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -1,6 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -12,6 +14,13 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { @@ -128,8 +137,8 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba hstart += dilation_h; while(wstart < 0) wstart += dilation_w; - for (int ih = hstart; ih < hend; ih++) { - for (int iw = wstart; iw < wend; iw++) { + for (int ih = hstart; ih < hend; ih += dilation_h) { + for (int iw = wstart; iw < wend; iw += dilation_w) { int cached_index = threadIdx.x; const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index 67f5f41b9232..12817d5f66ea 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -1,5 +1,8 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -11,6 +14,15 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index b9cd1b31461e..a9130bd3e808 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -6,6 +8,13 @@ #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu index 8c9c59e7861e..a7967122db9c 100644 --- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu +++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu @@ -1,6 +1,5 @@ +#define TORCH_ASSERT_NO_OPERATORS #include -#include -#include #include #include #include @@ -24,12 +23,12 @@ namespace at { namespace native { -void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional gen_) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen_) { auto generator = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { auto iter = TensorIterator::borrowing_nullary_op(self); auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(iter, p, generator); diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu 
b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu index 0b4849d1a449..27f316bc82b4 100644 --- a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu index 3ff39c3907a2..4dac756a2aaf 100644 --- a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void exponential_kernel(TensorIteratorBase& iter, double lambda, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu index 35ddcc65330b..4bfe6cb692b5 100644 --- a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void geometric_kernel(TensorIteratorBase& iter, double p_, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu index 155759b18f57..f7b094ed6252 100644 --- a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu index 025c70c42601..28330dbd69aa 100644 --- a/aten/src/ATen/native/cuda/DistributionNormal.cu +++ b/aten/src/ATen/native/cuda/DistributionNormal.cu @@ -1,30 +1,11 @@ -#include -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include +#include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { -void normal_kernel(Tensor& self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::normal_kernel(self, mean, std, generator); } diff --git 
a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu index 5da90a820a14..0607e4fa804e 100644 --- a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu +++ b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu @@ -1,27 +1,8 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen_) { diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 54324cbbaf54..6a096b42f719 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -231,7 +231,7 @@ __global__ void distribution_binary_elementwise_kernel( } template -void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_args, const func_t &f) { +void distribution_binary_kernel(TensorIteratorBase &iter, PhiloxCudaState philox_args, const func_t &f) { static_assert(std::is_same::template arg<0>::type, curandStatePhilox4_32_10_t&>::value, "the first argument of functor must be curandStatePhilox4_32_10_t"); using input_t_1 = typename function_traits::template arg<1>::type; using input_t_2 = typename function_traits::template arg<2>::type; @@ -430,7 +430,7 @@ void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transfo // ==================================================== Normal ======================================================== template -void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) { +void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { auto iter = TensorIterator::borrowing_nullary_op(self); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] { using accscalar_t = at::acc_type; @@ -446,7 +446,7 @@ void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) { template struct NormalKernel { - void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(const TensorBase &self, double mean, double std, c10::optional gen) { normal_kernel(self, mean, std, check_generator(gen)); } }; @@ -574,7 +574,7 @@ struct CauchyKernel { template void bernoulli_tensor_cuda_kernel( - at::Tensor& ret, const at::Tensor& p, + const TensorBase &ret, const at::TensorBase &p, PhiloxCudaState philox_args) { auto functor = [philox_args] __device__( int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, @@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel( } template -void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] @@ -626,14 +626,10 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { rng_engine_inputs = gen->philox_cuda_state(10); } TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type()); - auto p_CUDA = p_.to(kCUDA); - //cast probabilities tensor to double for double `self` tensor, and to `float` for everything 
else - if (self.dtype() == at::kDouble) { - p_CUDA = p_CUDA.to(at::kDouble); - } else { - p_CUDA = p_CUDA.to(at::kFloat); - } - c10::MaybeOwned p = expand_inplace(self, p_CUDA); + // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else + const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat; + auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type)); + auto p = expand_inplace(self, p_cuda); AT_DISPATCH_ALL_TYPES_AND3( at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] { if (std::is_same::value) { @@ -662,7 +658,7 @@ struct BernoulliKernel { void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { bernoulli_kernel(iter, p, check_generator(gen)); } - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cuda/DistributionUniform.cu b/aten/src/ATen/native/cuda/DistributionUniform.cu index 04bc172ed23d..a848f0fd48f5 100644 --- a/aten/src/ATen/native/cuda/DistributionUniform.cu +++ b/aten/src/ATen/native/cuda/DistributionUniform.cu @@ -1,9 +1,7 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Distributions.cpp b/aten/src/ATen/native/cuda/Distributions.cpp new file mode 100644 index 000000000000..fc885d867445 --- /dev/null +++ b/aten/src/ATen/native/cuda/Distributions.cpp @@ -0,0 +1,84 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at { namespace native { + +Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(lambda.sizes(), lambda.options()); + launch_poisson_cuda_kernel(ret, lambda, gen); + return ret; +} + +Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(count.sizes(), count.options()); + at::TensorIterator iter = at::TensorIteratorConfig() + .add_output(ret) + .add_input(count) + .add_input(prob) + .build(); + launch_binomial_cuda_kernel(iter, gen); + return ret; +} + +Tensor _s_gamma_cuda(const Tensor& alpha, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(alpha.sizes(), alpha.options()); + launch_gamma_kernel(ret, alpha, gen); + return ret; +} + +Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional gen_) { + auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); + Tensor ret = at::empty(alpha.sizes(), alpha.options()); + launch_gamma_kernel(ret, alpha, gen); + auto gamma_sum = ret.sum(/*dim=*/-1, /*keepdim=*/true); + at::TensorIterator iter = at::TensorIteratorConfig() + .add_output(ret) + .add_input(ret) + .add_input(gamma_sum) + .build(); + launch_dirichlet_kernel(iter); + return ret; +} + +Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { + Tensor ret = at::empty(self.sizes(), self.options()); + TensorIterator iter = 
at::TensorIteratorConfig() + .add_output(ret) + .add_input(self) + .add_input(output) + .build(); + launch_standard_gamma_grad_kernel(iter); + return ret; +} + +Tensor _dirichlet_grad_cuda(const Tensor& x, const Tensor& alpha, const Tensor& total) { + Tensor ret = at::empty(x.sizes(), x.options()); + TensorIterator iter = at::TensorIteratorConfig() + .add_output(ret) + .add_input(x) + .add_input(alpha) + .add_input(total) + .build(); + launch_dirichlet_grad_kernel(iter); + return ret; +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index d7ab78c18129..717ad4d985d4 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -1,6 +1,6 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include #include -#include -#include #include #include #include @@ -42,8 +42,8 @@ namespace { template void poisson_cuda_kernel( - at::Tensor& ret, - const at::Tensor& lambda, + const at::TensorBase &ret, + const at::TensorBase &lambda, at::PhiloxCudaState philox_args) { auto functor = [philox_args] __device__( scalar_t & ret_val, const scalar_t& lambda) { @@ -74,19 +74,12 @@ struct curand_uniform_wrapper { template void binomial_cuda_kernel( - at::Tensor& ret, - const at::Tensor& count, - const at::Tensor& prob, + at::TensorIteratorBase &iter, at::PhiloxCudaState philox_args) { using accscalar_t = at::acc_type; - at::TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(count) - .add_input(prob) - .build(); at::native::distribution_binary_kernel(iter, philox_args, - [philox_args] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) { + [] GPU_LAMBDA (curandStatePhilox4_32_10_t& state, scalar_t count, scalar_t prob) { #if defined(__CUDA_ARCH__) || defined(USE_ROCM) auto uniform_lambda = curand_uniform_wrapper(state); BaseSampler standard_uniform(uniform_lambda); @@ -101,8 +94,8 @@ void binomial_cuda_kernel( template void gamma_cuda_kernel( - at::Tensor& ret, - const at::Tensor& alpha, + const at::TensorBase &ret, + const at::TensorBase &alpha, at::PhiloxCudaState philox_args) { using accscalar_t = at::acc_type; auto functor = [philox_args] __device__( @@ -132,18 +125,16 @@ void gamma_cuda_kernel( /*min_blocks_per_sm==*/2>(ret, alpha, functor); } -template -void dirichlet_scalar_cuda_kernel( - at::Tensor& ret, - const at::Tensor& gamma) { - auto gamma_sum = gamma.sum(-1, true); - at::TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(gamma) - .add_input(gamma_sum) - .build(); - at::native::gpu_kernel(iter, - [] GPU_LAMBDA (scalar_t gamma, scalar_t gamma_sum) { +} // namespace + +namespace at { namespace native { + +void launch_dirichlet_kernel(at::TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + iter.input_dtype(), "dirichlet_cuda", [&] { + at::native::gpu_kernel( + iter, + [] GPU_LAMBDA (scalar_t gamma, scalar_t gamma_sum) { auto ret_val = gamma / gamma_sum; auto min_value = std::numeric_limits::min(); auto max_value = 1 - std::numeric_limits::epsilon(); @@ -151,107 +142,66 @@ void dirichlet_scalar_cuda_kernel( ret_val = (max_value < ret_val) ? 
max_value : ret_val; return ret_val; }); + }); } -} // namespace - -namespace at { namespace native { - -Tensor _s_poisson_cuda(const Tensor& lambda, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); +void launch_poisson_cuda_kernel( + const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); rng_engine_inputs = gen->philox_cuda_state(20); } - Tensor ret = at::empty(lambda.sizes(), lambda.options()); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "poisson_cuda", [&] { poisson_cuda_kernel(ret, lambda, rng_engine_inputs); }); - return ret; } -Tensor _s_binomial_cuda(const Tensor& count, const Tensor& prob, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); +void launch_binomial_cuda_kernel( + TensorIteratorBase &iter, CUDAGeneratorImpl *gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); rng_engine_inputs = gen->philox_cuda_state(42); } - Tensor ret = at::empty(count.sizes(), count.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.scalar_type(), "binomial_cuda", [&] { - binomial_cuda_kernel(ret, count, prob, rng_engine_inputs); + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.input_dtype(), "binomial_cuda", [&] { + binomial_cuda_kernel(iter, rng_engine_inputs); }); - return ret; } -Tensor _s_gamma_cuda(const Tensor& alpha, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); +void launch_gamma_kernel( + const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); rng_engine_inputs = gen->philox_cuda_state(10); } - Tensor ret = at::empty(alpha.sizes(), alpha.options()); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "gamma_cuda", [&] { gamma_cuda_kernel(ret, alpha, rng_engine_inputs); }); - return ret; } -Tensor _s_dirichlet_cuda(const Tensor& alpha, c10::optional gen_) { - auto gen = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); - PhiloxCudaState rng_engine_inputs; - { - // See Note [Acquire lock when using random generators] - std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_cuda_state(10); - } - Tensor ret = at::empty(alpha.sizes(), alpha.options()); - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, ret.scalar_type(), "dirichlet", [&] { - Tensor gamma = at::empty(alpha.sizes(), alpha.options()); - gamma_cuda_kernel(gamma, alpha, rng_engine_inputs); - dirichlet_scalar_cuda_kernel(ret, gamma); - }); - return ret; -} - -Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { - Tensor ret = at::empty(self.sizes(), self.options()); - TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(self) - .add_input(output) - .build(); - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "_standard_gamma_grad_cuda", [&] { +void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), 
"_standard_gamma_grad_cuda", [&] { using accscalar_t = at::acc_type; gpu_kernel(iter, [] GPU_LAMBDA (scalar_t self_val, scalar_t output_val) { return standard_gamma_grad_one(self_val, output_val); }); }); - return ret; } -Tensor _dirichlet_grad_cuda(const Tensor& x, const Tensor& alpha, const Tensor& total) { - Tensor ret = at::empty(x.sizes(), x.options()); - TensorIterator iter = at::TensorIteratorConfig() - .add_output(ret) - .add_input(x) - .add_input(alpha) - .add_input(total) - .build(); - AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "_dirichlet_grad_cuda", [&] { +void launch_dirichlet_grad_kernel(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(), "_dirichlet_grad_cuda", [&] { using accscalar_t = at::acc_type; - gpu_kernel(iter, + at::native::gpu_kernel(iter, [] GPU_LAMBDA (scalar_t x_val, scalar_t alpha_val, scalar_t total_val) -> scalar_t { return dirichlet_grad_one(x_val, alpha_val, total_val); }); }); - return ret; } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Distributions.h b/aten/src/ATen/native/cuda/Distributions.h new file mode 100644 index 000000000000..1a34fdfdf314 --- /dev/null +++ b/aten/src/ATen/native/cuda/Distributions.h @@ -0,0 +1,25 @@ +#pragma once + +namespace at { +struct CUDAGeneratorImpl; +struct TensorIteratorBase; +class TensorBase; + +namespace native { + +void launch_poisson_cuda_kernel( + const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen); + +void launch_gamma_kernel( + const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen); + +void launch_binomial_cuda_kernel( + TensorIteratorBase &iter, CUDAGeneratorImpl *gen); + +void launch_dirichlet_kernel(TensorIteratorBase &iter); + +void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter); + +void launch_dirichlet_grad_kernel(TensorIteratorBase &iter); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 528a43646b9b..6ec054aa6050 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -1,6 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include +#include #include #include #include @@ -11,6 +14,17 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at{ namespace native{ diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index edf7e31d5621..8a241cabcd2d 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -1,5 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include @@ -11,11 +13,24 @@ #include #include #include +#include #if CUB_SUPPORTS_SCAN_BY_KEY() #include #endif +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -249,8 +264,9 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice auto indices_contig = indices.contiguous(); auto grad_weight = at::zeros({num_weights, grad_.size(-1)}, grad_.options()); int64_t stride = grad_weight.stride(0); - dim3 grid(ceil_div(stride, (int64_t)C10_WARP_SIZE)); - dim3 block(C10_WARP_SIZE, BLOCKDIMY); + int warp_size = at::cuda::warp_size(); + dim3 grid(ceil_div(stride, (int64_t)warp_size)); + dim3 block(warp_size, BLOCKDIMY); 
AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -263,7 +279,7 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice embedding_backward_feature_kernel <<>> (indices_contig.data_ptr(), grad.data_ptr(), @@ -352,18 +368,18 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, num_indices ); - constexpr int num_threads = 128; - static_assert(num_threads % C10_WARP_SIZE == 0 && - num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, + int warp_size = at::cuda::warp_size(); + TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 && + num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads, "BlockReduceSum requires all warps be active"); int64_t *num_unique_indices_ptr = num_unique_indices.data_ptr(); dim3 grid = unique_indices.numel(); - dim3 block = num_threads; + dim3 block = num_threads(); int dim = self.stride(0); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "embedding_renorm_cuda_", [&] { using accscalar_t = acc_type; - renorm_kernel<<>>( + renorm_kernel<<>>( self.data_ptr(), unique_indices.data_ptr(), static_cast(max_norm), diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index afb2f25cc346..1a2c7627fc73 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -1,15 +1,26 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include +#include +#include #include -#include #include -#include - #include +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { @@ -35,7 +46,8 @@ int64_t ceil_div(int64_t x, int64_t y) { template __global__ void krn_partials_per_segment(index_t *ret, const index_t *segment_offsets, - int64_t num_of_segments, int64_t numel) { + int64_t *num_of_segments_ptr, int64_t numel) { + int64_t num_of_segments = *num_of_segments_ptr; const int id = blockIdx.x * blockDim.x + threadIdx.x; if(id < num_of_segments) { const int64_t idx_start = segment_offsets[id]; @@ -52,7 +64,8 @@ void krn_partial_segment_offset( const index_t *partials_per_segment, const index_t *partials_per_segment_offset, const index_t *segment_offsets, - int64_t num_of_segments) { + int64_t *num_of_segments_ptr) { + int64_t num_of_segments = *num_of_segments_ptr; const int id = blockIdx.x * blockDim.x + threadIdx.x; if(id < num_of_segments) { index_t idx = partials_per_segment_offset[id]; @@ -71,10 +84,11 @@ __global__ void compute_grad_weight_bags( index_t *offset2bag, index_t *count, ptrdiff_t numel, int64_t stride, int mode_mean, const index_t *bag_size, scalar_t* per_sample_weights, int64_t per_sample_weights_stride, - index_t* segment_offsets, int64_t num_of_segments, + index_t* segment_offsets, int64_t *num_of_segments_ptr, acc_type *grad_weight_per_segment, const int64_t stride_warped) { + int64_t num_of_segments = *num_of_segments_ptr; const int gid = blockIdx.x * blockDim.x + threadIdx.x; const int id = gid / stride_warped; const int startFeature = gid % stride_warped; @@ -115,10 +129,11 @@ __global__ void compute_grad_weight( ptrdiff_t numel, int64_t stride, index_t* segment_offsets, - int64_t num_of_segments, + int64_t *num_of_segments_ptr, acc_type *grad_weight_per_segment, const int64_t stride_warped) { + int64_t num_of_segments = *num_of_segments_ptr; using 
accscalar_t = acc_type; const int gid = blockIdx.x * blockDim.x + threadIdx.x; const int id = gid / stride_warped; @@ -145,12 +160,14 @@ __global__ void compute_grad_weight( template __global__ void sum_and_scatter( index_t *input, scalar_t *gradWeight, int64_t stride, - index_t* segment_offsets, int64_t num_of_segments, + index_t* segment_offsets, int64_t *num_of_segments_ptr, const acc_type *grad_weight_per_segment, - const index_t *segment_sizes_offsets, int64_t num_of_partial_segments, + const index_t *segment_sizes_offsets, int64_t *num_of_partial_segments_ptr, const int64_t padding_idx, const int64_t stride_warped) { + int64_t num_of_segments = *num_of_segments_ptr; + int64_t num_of_partial_segments = *num_of_partial_segments_ptr; const int gid = blockIdx.x * blockDim.x + threadIdx.x; const int id = gid / stride_warped; const int startFeature = gid % stride_warped; @@ -173,10 +190,23 @@ __global__ void sum_and_scatter( } } +template +__global__ void compute_num_of_partial_segments(index_t *partials_per_segment, index_t *partials_per_segment_offset, int64_t *num_of_segments_ptr, int64_t *output) { + int64_t num_of_segments = *num_of_segments_ptr; + *output = partials_per_segment[num_of_segments-1] + + partials_per_segment_offset[num_of_segments-1]; +} + +__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) { + *num_of_segments_ptr = num_of_segments; +} + } // anon namespace +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() template int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); +#endif Tensor embedding_backward_cuda_kernel( const Tensor &grad, @@ -200,19 +230,35 @@ Tensor embedding_backward_cuda_kernel( // spawn a warp per index. In this context, a segment is a number of rows that should // be summarized. 
// Unit: index in `sorted_indices` and `orig_indices` + auto segment_offsets = at::empty({numel}, orig_indices.options()); + auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); + int64_t *num_of_segments_ptr = num_of_segments_tensor.data_ptr(); +#if !CUB_SUPPORTS_UNIQUE_BY_KEY() AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { - auto segment_offsets = at::empty({numel}, orig_indices.options()); int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key(sorted_indices, segment_offsets); + write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#else + AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { + auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong)); + cuda::cub::unique_by_key( + sorted_indices.data_ptr(), thrust::make_counting_iterator(0), + nullptr, segment_offsets.data_ptr(), + num_of_segments_ptr, sorted_indices.numel()); + }); +#endif + AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () { // We split the segments up into sizes of `NROWS_PER_THREAD` // Compute the number partial-segments per segment (some partial-segments // may not be the full `NROWS_PER_THREAD` number of rows) - auto partials_per_segment = at::empty({num_of_segments}, orig_indices.options()); + auto partials_per_segment = at::empty({numel}, orig_indices.options()); { - krn_partials_per_segment<<>> ( + krn_partials_per_segment<<>> ( partials_per_segment.data_ptr(), segment_offsets.data_ptr(), - num_of_segments, + num_of_segments_ptr, numel); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -221,32 +267,38 @@ Tensor embedding_backward_cuda_kernel( // of each partial-segment in `sorted_indices`, we need to compute the // start position of each _segment_ in `partial_segment_offset`. 
// Unit: index in `partial_segment_offset` - auto partials_per_segment_offset = at::empty({num_of_segments}, orig_indices.options()); + auto partials_per_segment_offset = at::empty({numel}, orig_indices.options()); cuda::cub::exclusive_sum( partials_per_segment.data_ptr(), partials_per_segment_offset.data_ptr(), - num_of_segments); + numel); // The total number of partial-segments is the sum of `partials_per_segment_offset` - const int num_of_partial_segments = partials_per_segment[num_of_segments-1].item() + - partials_per_segment_offset[num_of_segments-1].item(); + auto num_of_partial_segments_tensor = at::empty({}, grad.options().dtype(kLong)); + int64_t *num_of_partial_segments_ptr = num_of_partial_segments_tensor.data_ptr(); + compute_num_of_partial_segments<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>( + partials_per_segment.data_ptr(), + partials_per_segment_offset.data_ptr(), + num_of_segments_ptr, num_of_partial_segments_ptr); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Now we can compute the start position of each partial-segment // Unit: index in `sorted_indices` and `orig_indices` - auto partial_segment_offset = at::empty({num_of_partial_segments}, orig_indices.options()); + auto partial_segment_offset = at::empty({numel}, orig_indices.options()); { - krn_partial_segment_offset<<>> ( + krn_partial_segment_offset<<>> ( partial_segment_offset.data_ptr(), partials_per_segment.data_ptr(), partials_per_segment_offset.data_ptr(), segment_offsets.data_ptr(), - num_of_segments); + num_of_segments_ptr); C10_CUDA_KERNEL_LAUNCH_CHECK(); } - const int stride_warped = ceil_div(stride, C10_WARP_SIZE)*C10_WARP_SIZE; + const int warp_size = at::cuda::warp_size(); + const int stride_warped = ceil_div(stride, warp_size)*warp_size; const int block = std::min(stride_warped, MAX_BLOCK_SIZE); - const int grid = ceil_div(num_of_partial_segments*stride_warped, block); + const int grid = ceil_div(numel*stride_warped, block); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, grad.scalar_type(), "embedding_bag_backward_cuda_compute_grad_weight", [&] { @@ -259,7 +311,7 @@ Tensor embedding_backward_cuda_kernel( } else { op = grad.options(); } - auto grad_weight_per_segment = at::empty({num_of_partial_segments, stride}, op); + auto grad_weight_per_segment = at::empty({numel, stride}, op); // Compute the sum of each partial-segment and handle bags if (offset2bag.defined()) { compute_grad_weight_bags<<>>( @@ -271,7 +323,7 @@ Tensor embedding_backward_cuda_kernel( per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.stride(0) : 0, partial_segment_offset.data_ptr(), - num_of_partial_segments, grad_weight_per_segment.data_ptr(), + num_of_partial_segments_ptr, grad_weight_per_segment.data_ptr(), stride_warped); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -281,7 +333,7 @@ Tensor embedding_backward_cuda_kernel( count.defined() ? count.data_ptr() : nullptr, numel, stride, partial_segment_offset.data_ptr(), - num_of_partial_segments, + num_of_partial_segments_ptr, grad_weight_per_segment.data_ptr(), stride_warped); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -289,15 +341,15 @@ Tensor embedding_backward_cuda_kernel( // Finally, we sum all the partial-sums and scatter them // into `grad_weight`. 
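[Editor's note] The embedding-backward rewrite above no longer copies num_of_segments or num_of_partial_segments back to the host (the removed .item<index_t>() reads); the counts stay in device memory, each kernel receives an int64_t* and dereferences it, and host-side allocations and grids are sized by the upper bound numel instead. A standalone sketch of that pattern, with illustrative names only (this is not the shipped kernel code):

#include <cuda_runtime.h>
#include <cstdint>

// Single-thread helper: total partial segments = last count + last exclusive prefix sum,
// computed on the device so the host never has to read it.
__global__ void compute_total(const int64_t *per_segment,
                              const int64_t *per_segment_offset,
                              const int64_t *num_segments_ptr,
                              int64_t *total_out) {
  const int64_t n = *num_segments_ptr;
  *total_out = per_segment[n - 1] + per_segment_offset[n - 1];
}

// Worker kernel: the grid is sized for the worst case known on the host;
// threads past the device-resident total simply bail out, so no sync is needed.
__global__ void process_partial_segments(const int64_t *total_ptr, float *out) {
  const int64_t total = *total_ptr;
  const int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < total) {
    out[i] = 0.0f;  // placeholder for per-partial-segment work
  }
}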
- const int grid2 = ceil_div(num_of_segments*stride_warped, block); + const int grid2 = ceil_div(numel*stride_warped, block); sum_and_scatter<<>>( sorted_indices.data_ptr(), grad_weight.data_ptr(), stride, segment_offsets.data_ptr(), - num_of_segments, grad_weight_per_segment.data_ptr(), + num_of_segments_ptr, grad_weight_per_segment.data_ptr(), partials_per_segment_offset.data_ptr(), - num_of_partial_segments, + num_of_partial_segments_ptr, padding_idx, stride_warped); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh index 7b8fc9576e21..0d8d45c1defb 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -1,10 +1,8 @@ -#include +#pragma once +#include #include #include #include -#include - -#pragma once namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 4c842f2c7bcd..7ac3a7151b79 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -1,12 +1,26 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include #include #include #include #include -#include -#include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif #include #include @@ -53,7 +67,7 @@ __global__ void EmbeddingBag_updateOutputKernel_max( index_t *offset2bag, int64_t numIndices, int64_t numBags, int64_t featureSize, int64_t weight_stride0, int64_t weight_stride1, index_t *bag_size, index_t *max_indices, - index_t padding_idx, int64_t vocab_size) { + index_t padding_idx) { // the strategy here is that each bag x feature is handled by a single thread @@ -74,7 +88,6 @@ __global__ void EmbeddingBag_updateOutputKernel_max( int64_t bag_size_ = 0; int64_t maxWord = -1; for (int64_t emb = begin; emb < end; emb++) { - CUDA_KERNEL_ASSERT(input[emb] >= 0 && input[emb] < vocab_size); bool pad = (input[emb] == padding_idx); const int64_t weightRow = input[emb] * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; @@ -104,7 +117,7 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean( int64_t featureSize, int64_t weight_stride0, int64_t weight_stride1, int mode, index_t *bag_size, scalar_t* per_sample_weights, int64_t per_sample_weights_stride, - index_t padding_idx, int64_t vocab_size) { + index_t padding_idx) { // the strategy here is that each bag x feature is handled by a single thread @@ -125,7 +138,6 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean( accscalar_t weightFeatSum = 0; int64_t bag_size_ = 0; for (int64_t emb = begin; emb < end; emb++) { - CUDA_KERNEL_ASSERT(input[emb] >= 0 && input[emb] < vocab_size); bool pad = (input[emb] == padding_idx); const int64_t weightRow = input[emb] * weight_stride0; scalar_t weightValue = weightFeat[weightRow]; @@ -350,7 +362,6 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, numBags -= 1; } int64_t featureSize = weight.size(1); - int64_t vocabSize = weight.size(0); auto bag_size = at::empty(offsets.sizes(), indices.options()); auto offset2bag = @@ -384,7 +395,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, offset2bag.data_ptr(), numIndices, numBags, featureSize, weight.stride(0), weight.stride(1), bag_size.data_ptr(), max_indices.data_ptr(), - padding_idx, vocabSize); + padding_idx); 
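[Editor's note] The EmbeddingBag hunks above drop the vocab_size parameter and the in-kernel CUDA_KERNEL_ASSERT bounds check from the update-output kernels, leaving the padding_idx handling in place. A schematic standalone version of a sum/mean bag accumulation under the documented padding_idx behaviour (padded entries contribute neither to the reduction nor to the bag size); it assumes one block per bag, one thread per feature with feature_size <= blockDim.x, and a contiguous weight matrix — all names are illustrative, this is not the shipped kernel:

#include <cuda_runtime.h>
#include <cstdint>

__global__ void bag_sum_mean_sketch(const int64_t *input, const int64_t *offsets,
                                    const float *weight, float *output,
                                    int64_t *bag_size, int64_t num_bags,
                                    int64_t num_indices, int64_t feature_size,
                                    int64_t padding_idx, bool mean) {
  const int64_t bag = blockIdx.x;      // launch with grid.x == num_bags
  const int64_t feat = threadIdx.x;    // launch with block.x >= feature_size
  if (bag >= num_bags || feat >= feature_size) return;
  const int64_t begin = offsets[bag];
  const int64_t end = (bag + 1 == num_bags) ? num_indices : offsets[bag + 1];
  float acc = 0.0f;
  int64_t count = 0;
  for (int64_t i = begin; i < end; ++i) {
    if (input[i] == padding_idx) continue;            // padded entries are skipped
    acc += weight[input[i] * feature_size + feat];
    ++count;
  }
  if (mean && count > 0) acc /= count;                // mean divides by the real bag size
  output[bag * feature_size + feat] = acc;
  if (feat == 0) bag_size[bag] = count;
}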
C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { EmbeddingBag_updateOutputKernel_sum_mean<<>>( @@ -394,7 +405,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices_, weight.stride(0), weight.stride(1), mode, bag_size.data_ptr(), per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.stride(0) : 0, - padding_idx, vocabSize); + padding_idx); C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); @@ -515,7 +526,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( AT_ASSERT(weight.size(1) == embedding_features); const int threads_per_block = 512; - const int warps_per_block = threads_per_block / C10_WARP_SIZE; + const int warps_per_block = threads_per_block / at::cuda::warp_size(); dim3 block(threads_per_block); dim3 grid((num_samples + warps_per_block - 1) / warps_per_block); diff --git a/aten/src/ATen/native/cuda/Equal.cpp b/aten/src/ATen/native/cuda/Equal.cpp index 401571b2f1f2..ab8c9adef4e4 100644 --- a/aten/src/ATen/native/cuda/Equal.cpp +++ b/aten/src/ATen/native/cuda/Equal.cpp @@ -1,6 +1,14 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS #include #include -#include +#else +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/FillKernel.cu b/aten/src/ATen/native/cuda/FillKernel.cu index 76497ee7188a..facceccf8028 100644 --- a/aten/src/ATen/native/cuda/FillKernel.cu +++ b/aten/src/ATen/native/cuda/FillKernel.cu @@ -1,8 +1,10 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include #include #include +#include namespace at { namespace native { @@ -17,7 +19,7 @@ struct FillFunctor { }; void fill_kernel_cuda(TensorIterator& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "fill_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kBool, kHalf, kBFloat16, iter.dtype(), "fill_cuda", [&]() { gpu_kernel(iter, FillFunctor(value.to())); }); } diff --git a/aten/src/ATen/native/cuda/ForeachReduceOp.cu b/aten/src/ATen/native/cuda/ForeachReduceOp.cu index 0d6848324252..05fb1f6a087d 100644 --- a/aten/src/ATen/native/cuda/ForeachReduceOp.cu +++ b/aten/src/ATen/native/cuda/ForeachReduceOp.cu @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include @@ -24,13 +25,13 @@ namespace native { template struct LpNormFunctor { static_assert(NormType == 1 || NormType == 2, "foreach_norm supports only L1 and L2 norm"); + using opmath_t = typename at::opmath_type; __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata& tl, - T* output_per_tensor, + opmath_t* output_per_tensor, const int max_chunks_per_tensor ) { - using opmath_t = typename at::opmath_type; int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.numel_for_tensor[tensor_loc]; @@ -82,16 +83,15 @@ struct LpNormFunctor { } }; -template +template> __global__ void lpnorm_cleanup( - T* output_per_tensor, + opmath_t* output_per_tensor, T* ret_per_tensor, int max_chunks_per_tensor) { - using opmath_t = typename at::opmath_type; __shared__ opmath_t vals[512]; - T* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor; - T val = 0; + opmath_t* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor; + opmath_t val = 0; for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) { val += 
output_this_tensor[i]; } @@ -134,7 +134,7 @@ std::vector foreach_tensor_norm_cuda(TensorList tensors, const Scalar& o } } const auto options = tensors[0].options(); - auto output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, options); + auto output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, options.dtype(toOpMathType(tensors[0].scalar_type()))); auto ret_per_tensor = at::empty({ntensors}, options); auto tensor_lists = std::vector>{tensors.vec()}; @@ -145,13 +145,13 @@ std::vector foreach_tensor_norm_cuda(TensorList tensors, const Scalar& o multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard(device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); lpnorm_cleanup<<>>( - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), ret_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -163,13 +163,13 @@ std::vector foreach_tensor_norm_cuda(TensorList tensors, const Scalar& o multi_tensor_apply<1>( tensor_lists, LpNormFunctor(), - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard(device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); lpnorm_cleanup<<>>( - output_per_tensor.data_ptr(), + output_per_tensor.data_ptr(), ret_per_tensor.data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index aa898d50a2ce..46ea4eadf1fe 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -1,16 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index 34b238410bb5..92a77dc00af5 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -1,17 +1,27 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu index 0c758c9cc10b..7c04ce4da351 100644 --- a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu +++ b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu @@ -1,5 +1,7 @@ +#define TORCH_ASSERT_NO_OPERATORS #include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/GridSampler.cpp b/aten/src/ATen/native/cuda/GridSampler.cpp new file mode 100644 index 000000000000..aefe6f822270 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.cpp @@ -0,0 +1,83 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include 
+#include +#include +#endif + +namespace at { +namespace native { + +Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); + launch_grid_sampler_2d_forward_kernel( + output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; +} + +Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2], grid_size[3]}, + input.options()); + launch_grid_sampler_3d_forward_kernel( + output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; +} + +std::tuple +grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, + const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + launch_grid_sampler_2d_backward_kernel( + grad_input, grad_grid, grad_output, input, + grid, interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(grad_input, grad_grid); +} + +std::tuple +grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, + const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + auto input_requires_grad = output_mask[0]; + Tensor grad_input = ([&]() { + if (input_requires_grad) { + return at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } else { + return Tensor(); + } + })(); + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + launch_grid_sampler_3d_backward_kernel( + grad_input, grad_grid, grad_output, input, + grid, interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(grad_input, grad_grid); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index b358853c997f..bfc3d86b8ab9 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -1,10 +1,14 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include #include #include #include #include #include #include +#include +#include #include namespace at { namespace native { @@ -509,12 +513,13 @@ namespace { TensorInfo grad_output, TensorInfo input, TensorInfo grid, - TensorInfo grad_input, // initialized to zeros + TensorInfo grad_input, // initialized to zeros (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, bool align_corners, - const index_t grad_input_memory_span) { + const index_t grad_input_memory_span, + const bool input_requires_grad) { index_t C = input.sizes[1]; index_t inp_D = input.sizes[2]; @@ -538,11 +543,19 @@ namespace { index_t gOut_sD = grad_output.strides[2]; index_t gOut_sH = grad_output.strides[3]; index_t 
gOut_sW = grad_output.strides[4]; - index_t gInp_sN = grad_input.strides[0]; - index_t gInp_sC = grad_input.strides[1]; - index_t gInp_sD = grad_input.strides[2]; - index_t gInp_sH = grad_input.strides[3]; - index_t gInp_sW = grad_input.strides[4]; + // gInp_* (and NC_offset below) are not really needed if input_requires_grad is false. + int64_t gInp_sN = 0; + int64_t gInp_sC = 0; + int64_t gInp_sD = 0; + int64_t gInp_sH = 0; + int64_t gInp_sW = 0; + if (input_requires_grad) { + gInp_sN = grad_input.strides[0]; + gInp_sC = grad_input.strides[1]; + gInp_sD = grad_input.strides[2]; + gInp_sH = grad_input.strides[3]; + gInp_sW = grad_input.strides[4]; + } index_t gGrid_sW = grad_grid.strides[3]; CUDA_KERNEL_LOOP_TYPE(index, nthreads, index_t) { @@ -611,30 +624,34 @@ namespace { scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - index_t NC_offset = n * gInp_sN; + index_t NC_offset; + if (input_requires_grad) { + NC_offset = n * gInp_sN; + } scalar_t *inp_ptr_NC = input.data + n * inp_sN; // calculate bilinear weighted pixel value and set output pixel for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCDHW; // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. - safe_add_3d(grad_input.data, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut, - NC_offset, grad_input_memory_span); - safe_add_3d(grad_input.data, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut, - NC_offset, grad_input_memory_span); - + if (input_requires_grad) { + safe_add_3d(grad_input.data, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut, + 
NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut, + NC_offset, grad_input_memory_span); + safe_add_3d(grad_input.data, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut, + NC_offset, grad_input_memory_span); + } // calculate grad_grid if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW]; @@ -695,20 +712,21 @@ namespace { gGrid_ptr_NDHW[1] = giy_mult * giy; gGrid_ptr_NDHW[2] = giz_mult * giz; } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - auto ix_nearest = static_cast(::round(ix)); - auto iy_nearest = static_cast(::round(iy)); - auto iz_nearest = static_cast(::round(iz)); + if (input_requires_grad) { + auto ix_nearest = static_cast(::round(ix)); + auto iy_nearest = static_cast(::round(iy)); + auto iz_nearest = static_cast(::round(iz)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - index_t NC_offset = n * gInp_sN; - for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC) { - // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. - safe_add_3d(grad_input.data, iz_nearest, iy_nearest, ix_nearest, - gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW, - NC_offset, grad_input_memory_span); + // assign nearest neighor pixel value to output pixel + scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + index_t NC_offset = n * gInp_sN; + for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC) { + // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. + safe_add_3d(grad_input.data, iz_nearest, iy_nearest, ix_nearest, + gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW, + NC_offset, grad_input_memory_span); + } } - // assuming grad_grid is contiguous // thus we can // 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW @@ -722,15 +740,17 @@ namespace { } } // namespace -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + auto N = input.size(0); - auto C = input.size(1); auto H = grid.size(1); auto W = grid.size(2); - auto output = at::empty({N, C, H, W}, input.options()); int64_t count = N * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_cuda", [&] { @@ -760,18 +780,20 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, } }); } - return output; } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
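[Editor's note] In the 3d backward kernel above, every write into grad_input (the safe_add_3d atomics and the nearest-neighbour branch) is now gated on an input_requires_grad flag, and the gInp_* strides default to zero when the flag is off, so callers that only need grad_grid can skip allocating grad_input entirely. A schematic standalone kernel showing just that gating — it is not the grid-sampler arithmetic:

#include <cuda_runtime.h>

// grad_grid is always produced; grad_input (which may be undefined/null on the
// caller side) is touched only when input_requires_grad is set.
__global__ void backward_gating_sketch(const float *grad_output, const float *input,
                                       float *grad_input, float *grad_grid,
                                       int n, bool input_requires_grad) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  if (input_requires_grad) {
    atomicAdd(&grad_input[i], grad_output[i]);  // stands in for the safe_add_3d scatters
  }
  grad_grid[i] = grad_output[i] * input[i];     // gradient w.r.t. the grid is unconditional
}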
-Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + auto N = input.size(0); auto D = grid.size(1); auto H = grid.size(2); auto W = grid.size(3); - auto output = at::empty({N, input.size(1), D, H, W}, input.options()); int64_t count = N * D * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_cuda", [&] { @@ -801,15 +823,18 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, } }); } - return output; } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, - int64_t padding_mode, bool align_corners, - std::array output_mask) { +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input, grid); + check_grid_sampler_2d(input, grid); + // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("grid_sampler_2d_backward_cuda"); @@ -822,11 +847,6 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, // is always computed.) auto input_requires_grad = output_mask[0]; - Tensor grad_input; - if (input_requires_grad) { - grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); int64_t count = N * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] { @@ -864,14 +884,18 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, } }); } - return std::make_tuple(grad_input, grad_grid); } -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase& grad_output, const TensorBase& input, + const TensorBase& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
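[Editor's note] These launch_* entry points (declared in the new GridSampler.h and called from the thin wrappers in the new GridSampler.cpp earlier in this diff) take TensorBase and never allocate: the .cpp side creates outputs with at::empty / at::zeros_like / at::empty_like and the .cu side only checks shapes and launches kernels, which is what lets the .cu translation unit build with TORCH_ASSERT_NO_OPERATORS. A minimal sketch of the same split for a hypothetical operator my_op (the example namespace and names are illustrative, not PyTorch APIs):

// --- my_op.h (sketch): the launcher only needs TensorBase, no operator headers ---
#include <ATen/core/TensorBase.h>

namespace example {
void launch_my_op_kernel(const at::TensorBase &out, const at::TensorBase &in);
}

// --- my_op.cpp (sketch): the only translation unit that uses Tensor factories ---
#include <ATen/ATen.h>

namespace example {
at::Tensor my_op_cuda(const at::Tensor &in) {
  at::Tensor out = at::empty(in.sizes(), in.options());  // allocation stays in the .cpp
  launch_my_op_kernel(out, in);                           // the .cu side only fills `out`
  return out;
}
}  // namespace example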
+ check_grid_sampler_common(input, grid); + check_grid_sampler_3d(input, grid, interpolation_mode); + // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("grid_sampler_3d_backward_cuda"); @@ -879,9 +903,8 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, auto D = grid.size(1); auto H = grid.size(2); auto W = grid.size(3); - auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); int64_t count = N * D * H * W; + auto input_requires_grad = output_mask[0]; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] { if (canUse32BitIndexMath(input) && canUse32BitIndexMath(grid) && @@ -892,12 +915,13 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, getTensorInfo(grad_output), getTensorInfo(input), getTensorInfo(grid), - getTensorInfo(grad_input), + input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), static_cast(padding_mode), align_corners, - /*grad_input_memory_span =*/static_cast(grad_input.numel())); + /*grad_input_memory_span =*/input_requires_grad ? static_cast(grad_input.numel()) : 0, + input_requires_grad); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_3d_backward_kernel @@ -906,17 +930,17 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, getTensorInfo(grad_output), getTensorInfo(input), getTensorInfo(grid), - getTensorInfo(grad_input), + input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), static_cast(padding_mode), align_corners, - /*grad_input_memory_span =*/grad_input.numel()); + /*grad_input_memory_span =*/input_requires_grad ? 
grad_input.numel() : 0, + input_requires_grad); C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } - return std::make_tuple(grad_input, grad_grid); } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cuh b/aten/src/ATen/native/cuda/GridSampler.cuh index 2fdf3bd54912..a0e3b16c3a43 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cuh +++ b/aten/src/ATen/native/cuda/GridSampler.cuh @@ -1,16 +1,9 @@ -#include -#include +#pragma once #include +#include namespace at { namespace native { -namespace detail { - - enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -} // namespace detail - using detail::GridSamplerInterpolation; using detail::GridSamplerPadding; diff --git a/aten/src/ATen/native/cuda/GridSampler.h b/aten/src/ATen/native/cuda/GridSampler.h new file mode 100644 index 000000000000..aace9c30b0a7 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { + +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Im2Col.cu b/aten/src/ATen/native/cuda/Im2Col.cu index 053418423adf..89b2a1879b4b 100644 --- a/aten/src/ATen/native/cuda/Im2Col.cu +++ b/aten/src/ATen/native/cuda/Im2Col.cu @@ -1,6 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include #include #include @@ -10,6 +11,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/IndexKernel.cpp b/aten/src/ATen/native/cuda/IndexKernel.cpp index b85baf097559..478c96fa6084 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cpp +++ b/aten/src/ATen/native/cuda/IndexKernel.cpp @@ -1,10 +1,21 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include // For at::native::index_out +#include +#include #include -#include #include #include + +#ifndef AT_PER_OPERATOR_HEADERS +#include #include +#else +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index eac807d0aa9b..a40bf35205e7 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -192,7 +192,7 @@ void index_put_kernel_impl(TensorIterator& iter, IntArrayRef index_size, IntArra } static void index_kernel(TensorIterator& 
iter, IntArrayRef index_size, IntArrayRef index_stride) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "index_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_cuda", [&] { using dtype = OpaqueType; index_kernel_impl(iter, index_size, index_stride); }); @@ -233,7 +233,7 @@ static void index_copy_kernel( static void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate) { TORCH_CHECK(!accumulate, "index_put does not support accumulate=true"); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "index_put", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_put", [&] { using dtype = OpaqueType; index_put_kernel_impl(iter, index_size, index_stride); }); diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 9ea21b2171e9..5fc9e4f5521e 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -1,19 +1,36 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include -#include +#include #include -#include +#include #include #include +#include #include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include @@ -34,7 +51,7 @@ __global__ void indexing_backward_kernel( //stride_before is the stride of the dimension immediately preceding first indexed dimension //if indexing starts from the 0th dimension, stride_before does not matter because blockIdx.z will be 0 in this case //outer_dim is number of elements in the first unindexed dimensions - using accscalar_t = at::acc_type; + using opmath_t = at::opmath_type; // Each warp is responsible for an input into the LookupTable. 
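
The accscalar_t-to-opmath_t switch in this indexing backward kernel boils down to accumulating low-precision values in a wider type. A minimal sketch with a hypothetical helper name; at::opmath_type comes from ATen/OpMathType.h and resolves to float for Half and BFloat16:

    #include <ATen/OpMathType.h>

    template <typename scalar_t>
    __device__ void accumulate_feature(scalar_t* weight_row, const scalar_t* grad_row, int n) {
      using opmath_t = at::opmath_type<scalar_t>;  // float for Half and BFloat16
      for (int i = 0; i < n; ++i) {
        opmath_t acc = static_cast<opmath_t>(weight_row[i]);
        acc += static_cast<opmath_t>(grad_row[i]);   // accumulate in the wide type
        weight_row[i] = static_cast<scalar_t>(acc);  // cast back only on store
      }
    }
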
// If the preceding input has the same destination index as this input, then the warp @@ -61,19 +78,19 @@ __global__ void indexing_backward_kernel( } const int64_t weight_row = ((int64_t) sorted_indices[idx]) * stride + z * stride_before; const int64_t grad_row = ((int64_t) indices[idx]) * stride + z * numel * stride; - const accscalar_t scale = (accscalar_t)1.0; + const opmath_t scale = (opmath_t)1.0; - accscalar_t gradient[SZ]; - accscalar_t weight[SZ]; + opmath_t gradient[SZ]; + opmath_t weight[SZ]; while (start_feature < stride) { #pragma unroll for (int ii = 0; ii < SZ; ii++) { int64_t feature_dim = start_feature + ii * C10_WARP_SIZE; if (feature_dim < stride) { - gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); + gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); if (accumulate) { - weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); + weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); } } } @@ -109,6 +126,49 @@ __global__ void indexing_backward_kernel( namespace at { namespace native { +namespace { + +class ReduceMultiply { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMul(self_data_start + index, *src_data); + } +}; +static ReduceMultiply reduce_multiply; + +class ReduceAdd { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); + } +}; +static ReduceAdd reduce_add; + +class ReduceMinimum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMin(self_data_start + index, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + +class ReduceMaximum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMax(self_data_start + index, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + +} + static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size, bool check_range=true) { //we don't need to check range in backward - if there were out of bounds indices forward should already have errored out if (index.numel() != 0 && check_range) { @@ -209,13 +269,12 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List (size_t)self.dim()) { TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); } - if (!self.is_contiguous()) { - self = self.contiguous(); - } + bool self_contiguous = self.is_contiguous(); + auto self_ = self_contiguous ? 
self : self.contiguous(); Tensor linearIndex, src, expandedValue = value; int64_t nElemBefore, strideBefore, sliceSize; std::vector inversePerm; - std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self, indices, !unsafe); + std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self_, indices, !unsafe); int64_t num_indices = linearIndex.numel(); if (expandedValue.numel() < num_indices * nElemBefore * sliceSize) { @@ -255,7 +314,7 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List(), sorted_indices.data_ptr(), range.data_ptr(), orig_indices.data_ptr(), @@ -268,12 +327,13 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List(at::cuda::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (C10_WARP_SIZE*UNROLL))), + std::min(at::cuda::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (warp_size*UNROLL))), std::min(std::max(1,nElemBefore), at::cuda::getCurrentDeviceProperties()->maxGridSize[2])); - dim3 block(C10_WARP_SIZE, indices_per_block); + dim3 block(warp_size, indices_per_block); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, expandedValue.scalar_type(), "indexing_backward", [&] { indexing_backward_kernel<<>>( sorted_indices.data_ptr(), @@ -290,6 +350,8 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List -__global__ void indexAddSmallIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, - int dstAddDim, - int srcAddDim, - IndexType innerSize, - int64_t dstAddDimSize, - T alpha) { +template +__global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + int64_t dstAddDimSize, + int64_t dstNumel, + const func_t& op, + T alpha) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -385,8 +450,10 @@ __global__ void indexAddSmallIndex(cuda::detail::TensorInfo dst, cuda::detail::IndexToOffset::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcAddDim]; - gpuAtomicAddNoReturn(&dst.data[dstOffset], src.data[srcOffset] * alpha); + T val = src.data[srcOffset] * alpha; + op(dst.data, dstOffset, dstNumel, &val); } + } } @@ -394,19 +461,21 @@ __global__ void indexAddSmallIndex(cuda::detail::TensorInfo dst, // if there are a large number of indices. // This kernel in fact works for all choices of problem size, but if // the number of indices chosen is small, then the -// indexAddSmallIndex kernel is a better choice to reduce memory +// indexFuncSmallIndex kernel is a better choice to reduce memory // accesses. 
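
The ReduceAdd/ReduceMultiply/ReduceMinimum/ReduceMaximum functors defined above are threaded through the renamed indexFuncSmallIndex/indexFuncLargeIndex kernels so one kernel body serves every reduction. A hedged, self-contained sketch of that pattern (simplified names, float only, plain atomicAdd instead of the fastAtomicAdd/gpuAtomic* wrappers used by the real code):

    #include <cstdint>

    struct ReduceAddSketch {
      __device__ void operator()(float* dst, int64_t index, int64_t /*numel*/,
                                 const float* src) const {
        atomicAdd(dst + index, *src);  // the real code routes through fastAtomicAdd / gpuAtomic*
      }
    };

    template <typename func_t>
    __global__ void index_func_sketch(float* dst, const float* src, const int64_t* indices,
                                      int64_t n, int64_t dst_numel, func_t op) {
      int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
      if (i < n) {
        float val = src[i];
        op(dst, indices[i], dst_numel, &val);  // the functor decides how to combine
      }
    }
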
template -__global__ void indexAddLargeIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, - int dstAddDim, - int srcAddDim, - IndexType totalSize, - IndexType innerSize, - int64_t dstAddDimSize, - T alpha) { + bool IndexIsMajor, typename func_t> +__global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstAddDimSize, + int64_t dstNumel, + const func_t& op, + T alpha) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; @@ -435,7 +504,8 @@ __global__ void indexAddLargeIndex(cuda::detail::TensorInfo dst, cuda::detail::IndexToOffset::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcAddDim]; - gpuAtomicAddNoReturn(&dst.data[dstOffset], src.data[srcOffset] * alpha); + T val = src.data[srcOffset] * alpha; + op(dst.data, dstOffset, dstNumel, &val); } } @@ -505,6 +575,7 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c ptrdiff_t sourceTotalSize = source.numel(); int64_t selfAddDimSize = self_.size(dim); ptrdiff_t numIndex = index.numel(); + int64_t selfNumel = self_.numel(); if (sliceSize == 0) { return; @@ -514,22 +585,23 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ - indexAddSmallIndex \ - <<>>( \ - selfInfo, sourceInfo, indexInfo, \ - selfAddDim, sourceAddDim, sliceSize, selfAddDimSize, alpha_value); \ +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ + indexFuncSmallIndex \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sliceSize, selfAddDimSize, \ + selfNumel, reduce_add, alpha_value); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ - indexAddLargeIndex \ + indexFuncLargeIndex \ <<>>( \ selfInfo, sourceInfo, indexInfo, \ selfAddDim, sourceAddDim, sourceTotalSize, \ (IDX_IS_MAJOR) ? sliceSize : numIndex, \ - selfAddDimSize, alpha_value); \ + selfAddDimSize, selfNumel, reduce_add, alpha_value); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); @@ -619,11 +691,211 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c #undef LARGE_INDEX } +template +void index_reduce_func_cuda_impl( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + bool include_self, + const SCATTER_GATHER_OP& reduce, + const func_t& reduce_func, + const Tensor& result) { + globalContext().alertNotDeterministic("index_reduce_cuda"); + + if (!result.is_same(self)) result.copy_(self); + + // Scalars are treated as 1-d tensor + Tensor self_ = (result.dim() == 0) ? result.view(1) : result; + Tensor source_ = (source.dim() == 0) ? 
source.view(1) : source; + + TORCH_CHECK(result.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + TORCH_CHECK(source.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims" ); + TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + + if (!include_self) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + self.scalar_type(), "index_reduce_func_cuda_exclude_input_init", [&] { + scalar_t init_val; + switch (reduce) { + case SCATTER_GATHER_OP::REDUCE_MULTIPLY: + init_val = (scalar_t)1; + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM: + init_val = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM: + init_val = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() + : std::numeric_limits::max(); + break; + default: + init_val = (scalar_t)0; + break; + } + // index_fill_ requires index to be a LongTensor + self_.index_fill_(dim, index.to(at::ScalarType::Long), init_val); + }); + } + + // The `source` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of index we are choosing, which is the total size + // of the tensor `index`. + ptrdiff_t sliceSize = getSliceSize(self_, dim, index, source_); + ptrdiff_t sourceTotalSize = source.numel(); + int64_t selfReduceDimSize = self_.size(dim); + ptrdiff_t numIndex = index.numel(); + int64_t selfNumel = self_.numel(); + + if (sliceSize == 0) { + return; + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + bool indContig = index.is_contiguous(); + + int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ + indexFuncSmallIndex \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfReduceDim, sourceReduceDim, sliceSize, selfReduceDimSize, \ + selfNumel, reduce_func, alpha_value); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); + +#define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ + indexFuncLargeIndex \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfReduceDim, sourceReduceDim, sourceTotalSize, \ + (IDX_IS_MAJOR) ? 
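
The init-value switch above fills the indexed slots with the reduction's identity element when include_self is false. A hedged restatement as a standalone helper (hypothetical name, same case analysis as the switch):

    #include <limits>

    template <typename scalar_t>
    scalar_t reduction_identity(bool is_prod, bool is_amax, bool is_amin) {
      if (is_prod) return scalar_t(1);
      if (is_amax)
        return std::numeric_limits<scalar_t>::has_infinity
            ? -std::numeric_limits<scalar_t>::infinity()
            : std::numeric_limits<scalar_t>::lowest();
      if (is_amin)
        return std::numeric_limits<scalar_t>::has_infinity
            ? std::numeric_limits<scalar_t>::infinity()
            : std::numeric_limits<scalar_t>::max();
      return scalar_t(0);  // mean falls through to 0, matching the default case
    }
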
sliceSize : numIndex, \ + selfReduceDimSize, selfNumel, reduce_func, alpha_value); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + dim3 smallIndexGrid(std::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(std::min(ceil_div(sourceTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(sourceTotalSize, (ptrdiff_t)128)); + + if (cuda::detail::canUse32BitIndexMath(result) && + cuda::detail::canUse32BitIndexMath(source) && + cuda::detail::canUse32BitIndexMath(index)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, result.scalar_type(), "index_reduce", [&] { + cuda::detail::TensorInfo selfInfo = + cuda::detail::getTensorInfo(self_); + int selfReduceDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfReduceDim); + auto alpha_value = (scalar_t) 1; + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { + auto sourceInfo = + cuda::detail::getTensorInfo(source_); + int sourceReduceDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceReduceDim); + + auto indexInfo = + cuda::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // index to choose + if (numIndex <= 16) { + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2); + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = indexShouldBeMajor(selfInfo, selfReduceDim); + + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2, true); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, false); + } + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1, true); + } + } + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_reduce", [&] { + cuda::detail::TensorInfo selfInfo = + cuda::detail::getTensorInfo(self_); + int selfReduceDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfReduceDim); + auto alpha_value = (scalar_t) 1; + + cuda::detail::TensorInfo sourceInfo = + cuda::detail::getTensorInfo(source_); + int sourceReduceDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceReduceDim); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { + cuda::detail::TensorInfo indexInfo = + cuda::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); + }); + }); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + TORCH_IMPL_FUNC(index_add_cuda_out) (const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha, const 
Tensor& result) { index_add_cuda_impl(self, dim, index, source, alpha, result); } +TORCH_IMPL_FUNC(index_reduce_cuda_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const c10::string_view reduce, + bool include_self, + const Tensor& result) { + TORCH_WARN_ONCE("index_reduce() is in beta and the API may change at any time."); + + if (reduce == "prod") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MULTIPLY, reduce_multiply, result); + } else if (reduce == "mean") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MEAN, reduce_add, result); + auto counts = include_self ? at::ones_like(result) : at::zeros_like(result); + counts.index_add_(dim, index, at::ones_like(source)); + counts.masked_fill_(counts == 0, 1); + result.div_(counts); + } else if (reduce == "amax") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MAXIMUM, reduce_maximum, result); + } else if (reduce == "amin") { + index_reduce_func_cuda_impl(self, dim, index, source, include_self, SCATTER_GATHER_OP::REDUCE_MINIMUM, reduce_minimum, result); + } else { + TORCH_CHECK(false, "reduce argument must be either prod, mean, amax or amin, got ", reduce, "."); + } +} + namespace { // We prefer this kernel to avoid reloading index points if the number // of indices is a small number. @@ -905,15 +1177,16 @@ Tensor& index_select_out_cuda( } Tensor index_select_cuda(const Tensor& self, int64_t dim, const Tensor& index) { - Tensor out; - if (self.is_quantized()){ - TORCH_CHECK( - self.qscheme() == kPerTensorAffine, - "Only per_tensor quantized quantized tensors are supported by index_select.") - out = at::empty_quantized({0}, self); - } else { - out = at::empty({0}, self.options()); - } + Tensor out = at::empty({0}, self.options()); + at::native::index_select_out_cuda(self, dim, index, out); + return out; +} + +Tensor index_select_quantized_cuda(const Tensor& self, int64_t dim, const Tensor& index) { + TORCH_CHECK( + self.qscheme() == kPerTensorAffine, + "Only per_tensor quantized quantized tensors are supported by index_select.") + Tensor out = at::empty_quantized({0}, self); at::native::index_select_out_cuda(self, dim, index, out); return out; } @@ -922,8 +1195,8 @@ namespace { template void masked_fill_kernel(TensorIterator& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kBool, kHalf, kBFloat16, iter.common_dtype(), "masked_fill_", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kBool, kHalf, kBFloat16, kComplexHalf, iter.common_dtype(), "masked_fill_", [&]() { const auto value_ = value.to(); gpu_kernel( iter, [value_] GPU_LAMBDA(scalar_t self, mask_t mask) -> scalar_t { diff --git a/aten/src/ATen/native/cuda/JitLoops.cuh b/aten/src/ATen/native/cuda/JitLoops.cuh index 6284feba2d56..bb37a6acc2e1 100644 --- a/aten/src/ATen/native/cuda/JitLoops.cuh +++ b/aten/src/ATen/native/cuda/JitLoops.cuh @@ -132,7 +132,7 @@ void jitted_gpu_kernel( /*f_inputs_type=*/f_inputs_type, arity, at::cuda::jit::BinaryFuncVariant::NoScalar>( - iter, f, needs_dynamic_casting, /*scalar_val=*/0, extra_args); + iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args); } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) { jitted_gpu_kernel_impl< /*name*/ name, diff --git a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu index f8ac9d3ed8f6..b080a6e5eac2 100644 --- 
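
The "mean" branch above is assembled from ordinary ops: sum via index_add_, then divide by per-slot counts, clamping untouched slots to 1 so nothing divides by zero. A hedged sketch of that decomposition with a hypothetical helper name, using only documented ATen calls:

    #include <ATen/ATen.h>

    at::Tensor index_reduce_mean_sketch(const at::Tensor& self, int64_t dim,
                                        const at::Tensor& index, const at::Tensor& source,
                                        bool include_self) {
      auto result = self.clone();
      if (!include_self) {
        result.index_fill_(dim, index.to(at::kLong), 0);  // excluded slots start at the identity
      }
      result.index_add_(dim, index, source);
      auto counts = include_self ? at::ones_like(result) : at::zeros_like(result);
      counts.index_add_(dim, index, at::ones_like(source));
      counts.masked_fill_(counts == 0, 1);  // avoid 0/0 for slots no index touched
      return result.div_(counts);
    }
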
a/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu +++ b/aten/src/ATen/native/cuda/LegacyThrustHelpers.cu @@ -1,7 +1,14 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/Lerp.cu b/aten/src/ATen/native/cuda/Lerp.cu index ed57a2700c48..ac1f2ba379b5 100644 --- a/aten/src/ATen/native/cuda/Lerp.cu +++ b/aten/src/ATen/native/cuda/Lerp.cu @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -13,17 +14,23 @@ void lerp_tensor_kernel(at::TensorIteratorBase& iter) { at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "lerp_cuda", [&] { + using opmath_t = at::opmath_type; at::native::gpu_kernel( iter, [] GPU_LAMBDA( scalar_t self_val, scalar_t end_val, scalar_t weight_val) -> scalar_t { - return (std::abs(weight_val) < 0.5) - ? self_val + weight_val * (end_val - self_val) - : end_val - - (end_val - self_val) * - (static_cast(1) - weight_val); + opmath_t self_val_f = self_val; + opmath_t end_val_f = end_val; + opmath_t weight_val_f = weight_val; + // Conditional for better numeric. This has been discussed in + // https://github.com/pytorch/pytorch/pull/18871 + return (std::abs(weight_val_f) < 0.5) + ? self_val_f + weight_val_f * (end_val_f - self_val_f) + : end_val_f - + (end_val_f - self_val_f) * + (opmath_t{1} - weight_val_f); }); }); } @@ -33,13 +40,18 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const c10::Scalar& weight) at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "lerp_cuda", [&]{ - auto weight_val = weight.to(); + using opmath_t = at::opmath_type; + auto weight_val = weight.to(); at::native::gpu_kernel( iter, [=] GPU_LAMBDA(scalar_t self_val, scalar_t end_val) { + opmath_t self_val_f = self_val; + opmath_t end_val_f = end_val; + // Conditional for better numeric. This has been discussed in + // https://github.com/pytorch/pytorch/pull/18871 return (std::abs(weight_val) < 0.5) - ? self_val + weight_val * (end_val - self_val) - : end_val - - (end_val - self_val) * (static_cast(1) - weight_val); + ? 
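
The two-branch formula used in these lerp kernels is worth restating on its own: for |w| < 0.5 it uses a + w*(b - a), otherwise b - (b - a)*(1 - w), which loses less precision as w approaches 1 (the PR referenced in the comments discusses this). A hedged, standalone float version:

    #include <cmath>

    float lerp_ref(float a, float b, float w) {
      return (std::fabs(w) < 0.5f) ? a + w * (b - a)
                                   : b - (b - a) * (1.0f - w);
    }
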
self_val_f + weight_val * (end_val_f - self_val_f) + : end_val_f - + (end_val_f - self_val_f) * (opmath_t{1} - weight_val); }); }); } diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index f2360261e865..24590e0647b5 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -1,7 +1,7 @@ -#define TORCH_ASSERT_NO_OPERATORS #include #include #include +#include #include #include #include @@ -100,56 +100,38 @@ static void _launch_kernel(int total_n_elems, func_t f) { C10_CUDA_KERNEL_LAUNCH_CHECK(); } -void _unpack_pivots_internal_kernel( - TensorIterator& iter, - int64_t dim_size -) { - if (iter.numel() == 0) { - return; - } - +void unpack_pivots_cuda_kernel(TensorIterator& iter, const int64_t dim_size) { if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { - _unpack_pivots_internal_kernel(sub_iter, dim_size); + unpack_pivots_cuda_kernel(sub_iter, dim_size); } return; } - auto offset_calculator = make_offset_calculator<2>(iter); + const auto offset_calculator = make_offset_calculator<2>(iter); - char* unpacked_pivots_ptr = reinterpret_cast(iter.data_ptr(0)); - const char* const __restrict__ pivots_ptr = reinterpret_cast(iter.data_ptr(1)); + const auto perm_ptr = reinterpret_cast(iter.data_ptr(0)); + const auto pivots_ptr = reinterpret_cast(iter.data_ptr(1)); - auto loop = [=]C10_DEVICE(int i) { - auto offsets = offset_calculator.get(i); + auto loop = [=]C10_DEVICE(const int idx) { + const auto offsets = offset_calculator.get(idx); - auto* unpacked_pivots_data = reinterpret_cast( - unpacked_pivots_ptr + offsets[0]); - const auto* const __restrict__ pivots_data = reinterpret_cast( - pivots_ptr + offsets[1]); + int64_t* const __restrict__ perm_data = reinterpret_cast(perm_ptr + offsets[0]); + const int32_t* const __restrict__ pivots_data = reinterpret_cast(pivots_ptr + offsets[1]); // QUESTION: can we mix 64bit offsets with 32bit Iterator indexing? 
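
The device loop just below applies LAPACK-style pivots, which are 1-based row swaps; applying them in order turns an identity permutation into the LU row permutation. A hedged host-side analogue (plain C++, hypothetical name) of that loop:

    #include <cstdint>
    #include <utility>

    void unpack_pivots_ref(int64_t* perm, const int32_t* pivots, int64_t dim_size) {
      for (int64_t i = 0; i < dim_size; ++i) {
        std::swap(perm[i], perm[pivots[i] - 1]);  // pivots are 1-based, hence the -1
      }
    }
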
for (int64_t i = 0; i < dim_size; ++i) { thrust::swap( - unpacked_pivots_data[i], - unpacked_pivots_data[pivots_data[i]] + perm_data[i], + perm_data[pivots_data[i] - 1] ); } }; _launch_kernel(iter.numel(), loop); } - -void unpack_pivots_cuda_kernel( - TensorIterator& iter, - int64_t dim_size -) { - _unpack_pivots_internal_kernel(iter, dim_size); -} - } // anonymous namespace -REGISTER_DISPATCH(addr_stub, &addr_kernel_cuda); REGISTER_DISPATCH(unpack_pivots_stub, &unpack_pivots_cuda_kernel); - +REGISTER_DISPATCH(addr_stub, &addr_kernel_cuda); }} diff --git a/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp new file mode 100644 index 000000000000..a7606e93047f --- /dev/null +++ b/aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp @@ -0,0 +1,235 @@ +// LinearAlgebraStubs.cpp +// Mostly a no-op unless BUILD_LAZY_CUDA_LINALG is defined +// In that case load library is dynamically loaded when first linalg call is made +// This helps reduce size of GPU memory context if linear algebra functions are not used +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(BUILD_LAZY_CUDA_LINALG) +#include + +#if AT_MAGMA_ENABLED() +#include + +namespace { +struct MagmaInitializer { + MagmaInitializer() { + ::at::cuda::detail::set_magma_init_fn([]{ }); + }; +} initializer; +} // namespace (anonymous) +#endif +#endif +namespace at { +namespace native { +#if defined(BUILD_LAZY_CUDA_LINALG) +namespace { +cuda::detail::LinalgDispatch disp = {_symeig_helper_cuda, + _linalg_qr_helper_cuda, + _cholesky_solve_helper_cuda, + legacy_lstsq_cuda, + _linalg_inv_out_helper_cuda}; + +at::DynamicLibrary& getTorchLinalgLibrary() { + static at::DynamicLibrary lib("libtorch_cuda_linalg.so", nullptr, true); + return lib; +} + +// Lazy dispatches do nothing but load linalg library and call the stub +// Loading the library should override the registration of those with the proper implementation +// getTorchLinalgLibrary() throws an exception if library is not found, +// which makes it unnecessary to have an explicit error checking +// But make sure that this function is called only once, to avoid infinite recursion +void loadLazyTorchLinalgLibrary() { + static int invoke_count = 0; + getTorchLinalgLibrary(); + TORCH_CHECK(invoke_count++ == 0, "lazy wrapper should be called at most once"); +} + +void lazy_cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) { + loadLazyTorchLinalgLibrary(); + cholesky_stub(DeviceType::CUDA, input, info, upper); +} + +Tensor& lazy_cholesky_inverse_kernel(Tensor &result, Tensor& infos, bool upper) { + loadLazyTorchLinalgLibrary(); + return cholesky_inverse_stub(DeviceType::CUDA, result, infos, upper); +} + +void lazy_lu_factor(const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) { + loadLazyTorchLinalgLibrary(); + lu_factor_stub(DeviceType::CUDA, input, pivots, infos, compute_pivots); +} + +void lazy_triangular_solve_kernel(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) { + loadLazyTorchLinalgLibrary(); + triangular_solve_stub(DeviceType::CUDA, A, B, left, upper, transpose, unitriangular); +} + +Tensor& lazy_orgqr_kernel(Tensor& result, const Tensor& tau) { + loadLazyTorchLinalgLibrary(); + return orgqr_stub(DeviceType::CUDA, result, tau); +} + +void lazy_ormqr_kernel(const Tensor& input, const Tensor& tau, const Tensor& other, bool left, bool transpose) { + 
loadLazyTorchLinalgLibrary(); + ormqr_stub(DeviceType::CUDA, input, tau, other, left, transpose); +} + +void lazy_geqrf_kernel(const Tensor& input, const Tensor& tau) { + loadLazyTorchLinalgLibrary(); + geqrf_stub(DeviceType::CUDA, input, tau); +} + +void lazy_linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { + loadLazyTorchLinalgLibrary(); + linalg_eigh_stub(DeviceType::CUDA, eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); +} + +std::tuple lazy_eig_kernel(const Tensor& self, bool& eigenvectors) { + loadLazyTorchLinalgLibrary(); + return eig_stub(DeviceType::CUDA, self, eigenvectors); +} + +void lazy_linalg_eig_kernel(Tensor& eigenvalues, Tensor& eigenvectors, Tensor& infos, const Tensor& input, bool compute_eigenvectors) { + getTorchLinalgLibrary(); + linalg_eig_stub(DeviceType::CUDA, eigenvalues, eigenvectors, infos, input, compute_eigenvectors); +} + +void lazy_svd_kernel(const Tensor& A, + const bool full_matrices, + const bool compute_uv, + const Tensor& U, + const Tensor& S, + const Tensor& Vh, + const Tensor& info) { + getTorchLinalgLibrary(); + svd_stub(DeviceType::CUDA, A, full_matrices, compute_uv, U, S, Vh, info); +} + +void lazy_lu_solve_trans(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType trans) { + getTorchLinalgLibrary(); + lu_solve_trans_stub(DeviceType::CUDA, b, lu, pivots, trans); +} + +void lazy_lu_solve(const Tensor& b, const Tensor& lu, const Tensor& pivots) { + getTorchLinalgLibrary(); + lu_solve_stub(DeviceType::CUDA, b, lu, pivots); +} + +void lazy_lstsq_kernel(const Tensor& a, Tensor& b, Tensor& rank, Tensor& singular_values, Tensor& infos, double rcond, std::string driver_name) { + getTorchLinalgLibrary(); + lstsq_stub(DeviceType::CUDA, a, b, rank, singular_values, infos, rcond, driver_name); +} + +void lazy_ldl_factor( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + loadLazyTorchLinalgLibrary(); + ldl_factor_stub(DeviceType::CUDA, LD, pivots, info, upper, hermitian); +} + +void lazy_ldl_solve( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper, + bool hermitian) { + loadLazyTorchLinalgLibrary(); + ldl_solve_stub(DeviceType::CUDA, LD, pivots, B, upper, hermitian); +} + +REGISTER_CUDA_DISPATCH(cholesky_stub, &lazy_cholesky_kernel) +REGISTER_CUDA_DISPATCH(cholesky_inverse_stub, &lazy_cholesky_inverse_kernel); +REGISTER_CUDA_DISPATCH(lu_factor_stub, &lazy_lu_factor); +REGISTER_CUDA_DISPATCH(ldl_factor_stub, &lazy_ldl_factor); +REGISTER_CUDA_DISPATCH(ldl_solve_stub, &lazy_ldl_solve); +REGISTER_CUDA_DISPATCH(triangular_solve_stub, &lazy_triangular_solve_kernel); +REGISTER_CUDA_DISPATCH(orgqr_stub, &lazy_orgqr_kernel); +REGISTER_CUDA_DISPATCH(ormqr_stub, &lazy_ormqr_kernel); +REGISTER_CUDA_DISPATCH(geqrf_stub, &lazy_geqrf_kernel); +REGISTER_CUDA_DISPATCH(linalg_eigh_stub, &lazy_linalg_eigh_kernel); +REGISTER_CUDA_DISPATCH(eig_stub, &lazy_eig_kernel); +REGISTER_CUDA_DISPATCH(linalg_eig_stub, &lazy_linalg_eig_kernel); +REGISTER_CUDA_DISPATCH(svd_stub, &lazy_svd_kernel) +REGISTER_CUDA_DISPATCH(lu_solve_trans_stub, &lazy_lu_solve_trans); +REGISTER_CUDA_DISPATCH(lu_solve_stub, &lazy_lu_solve); +REGISTER_CUDA_DISPATCH(lstsq_stub, &lazy_lstsq_kernel); +} // anonymous namespace + +// Old style dispatches +// torch_cuda_linalg dynamic library should have a global constructor +// that calls regiserLinaglDispatch so in order ot lazy bind +// old style dispatch all one have to do 
is to load library and call disp.func_name +// Protect from infinite recursion by initializing dispatch to self and checking +// that values are different after linalg library were loaded + +namespace cuda { +namespace detail { +void registerLinalgDispatch(const LinalgDispatch& disp_) { + disp = disp_; +} +}} //namespace cuda::detail + +Tensor& _linalg_inv_out_helper_cuda(Tensor &result, Tensor& infos_lu, Tensor& infos_getri) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.inv_out_helper != _linalg_inv_out_helper_cuda, "Can't find _linalg_inv_out_helper_cuda"); + return disp.inv_out_helper(result, infos_lu, infos_getri); +} + +std::tuple legacy_lstsq_cuda(const Tensor &B, const Tensor &A) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.legacy_lstsq != legacy_lstsq_cuda, "Can't find legacy_lstsq_cuda"); + return disp.legacy_lstsq(B, A); +} + +Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.cholesky_solve_helper != _cholesky_solve_helper_cuda, "Can't find _cholesky_solve_helper_cuda"); + return disp.cholesky_solve_helper(self, A, upper); +} + +std::tuple _linalg_qr_helper_cuda(const Tensor& input, c10::string_view mode) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.qr_helper != _linalg_qr_helper_cuda, "Can't find _linalg_qr_helper_cuda"); + return disp.qr_helper(input, mode); +} + +std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvectors, bool upper) { + getTorchLinalgLibrary(); + TORCH_CHECK(disp.symeig_helper != _symeig_helper_cuda, "Can't find _symeig_helper_cuda"); + return disp.symeig_helper(self, eigenvectors, upper); +} + +#endif /*defined(BUILD_LAZY_CUDA_LINALG)*/ + +std::tuple legacy_lstsq_out_cuda( + const Tensor& B, const Tensor& A, Tensor& B_out, Tensor& A_out) { + const auto dtype = A.scalar_type(); + TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ", + A.scalar_type(), " and ", B.scalar_type()); + TORCH_CHECK(A_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, + " but found", A_out.scalar_type()); + TORCH_CHECK(B_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, + " but found", B_out.scalar_type()); + Tensor A_tmp, B_tmp; + std::tie(B_tmp, A_tmp) = native::legacy_lstsq_cuda(B, A); + resize_output(A_out, A_tmp.sizes()); + A_out.copy_(A_tmp); + resize_output(B_out, B_tmp.sizes()); + B_out.copy_(B_tmp); + return std::tuple(B_out, A_out); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index 6afc89592799..1f885ff6fe0b 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -1,14 +1,28 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include #include -#include +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + constexpr float EPSILON = 1e-12; namespace { diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 65508b1a956b..4e406f7cd4de 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -7,15 +7,32 @@ // Graves et al call the probabilities y, we use log_probs (also calling them inputs) // A few optimizations (similar to those here, but also some I didn't take) are described in // 2. 
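
The recursion guard described here works because the dispatch entry initially points at the lazy wrapper itself; loading the real backend must overwrite it, and the wrapper checks that this actually happened before forwarding, so a broken install fails loudly instead of recursing forever. A hedged, library-free sketch (all names hypothetical; the real code uses at::DynamicLibrary and the stub/REGISTER_CUDA_DISPATCH machinery):

    #include <stdexcept>

    using qr_fn = int (*)(int);

    int lazy_qr(int x);                          // lazy wrapper, defined below
    static qr_fn qr_impl = &lazy_qr;             // dispatch entry starts at the wrapper

    void register_qr(qr_fn f) { qr_impl = f; }   // the real backend calls this on load
    void load_linalg_library() { /* dlopen("libtorch_cuda_linalg.so") in the real code */ }

    int lazy_qr(int x) {
      load_linalg_library();
      if (qr_impl == &lazy_qr)                   // library failed to re-register the kernel
        throw std::runtime_error("linalg backend did not register an implementation");
      return qr_impl(x);
    }
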
Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index e063ec7f42fb..cbd562f542c5 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -7,108 +7,6 @@ namespace at { namespace native { - -// TODO: these functions are unconditionally available because kaiser window depends on them -// TODO: jiterate kaiser window and make them only available when not jiterating -// NOTE: jiterating kaiser window requires extending the jiterator's scalar support -/* - * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". - */ -template -static inline C10_HOST_DEVICE scalar_t -chbevl(scalar_t _x, const scalar_t array[], size_t len) { - static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); - - scalar_t b0, b1, b2; - - b0 = array[0]; - b1 = 0; - - for (size_t i = 1; i < len; ++i) { - b2 = b1; - b1 = b0; - b0 = _x * b1 - b2 + array[i]; - } - - return (0.5 * (b0 - b2)); -} - -/* - * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". - */ -template -C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { - /* Chebyshev coefficients for exp(-x) I0(x) - * in the interval [0,8]. - * - * lim(x->0){ exp(-x) I0(x) } = 1. - */ - static const T coefficients[] = { - -4.41534164647933937950E-18, 3.33079451882223809783E-17, - -2.43127984654795469359E-16, 1.71539128555513303061E-15, - -1.16853328779934516808E-14, 7.67618549860493561688E-14, - -4.85644678311192946090E-13, 2.95505266312963983461E-12, - -1.72682629144155570723E-11, 9.67580903537323691224E-11, - -5.18979560163526290666E-10, 2.65982372468238665035E-9, - -1.30002500998624804212E-8, 6.04699502254191894932E-8, - -2.67079385394061173391E-7, 1.11738753912010371815E-6, - -4.41673835845875056359E-6, 1.64484480707288970893E-5, - -5.75419501008210370398E-5, 1.88502885095841655729E-4, - -5.76375574538582365885E-4, 1.63947561694133579842E-3, - -4.32430999505057594430E-3, 1.05464603945949983183E-2, - -2.37374148058994688156E-2, 4.93052842396707084878E-2, - -9.49010970480476444210E-2, 1.71620901522208775349E-1, - -3.04682672343198398683E-1, 6.76795274409476084995E-1}; - - return std::make_tuple(coefficients, 30); -} - -template -C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { - /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) - * in the inverted interval [8,infinity]. - * - * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
- */ - static const T coefficients[] = { - -7.23318048787475395456E-18, -4.83050448594418207126E-18, - 4.46562142029675999901E-17, 3.46122286769746109310E-17, - -2.82762398051658348494E-16, -3.42548561967721913462E-16, - 1.77256013305652638360E-15, 3.81168066935262242075E-15, - -9.55484669882830764870E-15, -4.15056934728722208663E-14, - 1.54008621752140982691E-14, 3.85277838274214270114E-13, - 7.18012445138366623367E-13, -1.79417853150680611778E-12, - -1.32158118404477131188E-11, -3.14991652796324136454E-11, - 1.18891471078464383424E-11, 4.94060238822496958910E-10, - 3.39623202570838634515E-9, 2.26666899049817806459E-8, - 2.04891858946906374183E-7, 2.89137052083475648297E-6, - 6.88975834691682398426E-5, 3.36911647825569408990E-3, - 8.04490411014108831608E-1}; - - return std::make_tuple(coefficients, 25); -} - -template -static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { - static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); - // Upcast input for numerical accuracy purposes - // Needed for accurate results if input is bfloat16 or float16 - scalar_t x = ::abs(_x); - - if (x <= scalar_t{8.0}) { - auto coeff_pair = chebyshev_coefficients_i0e_A(); - auto A = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; - return (::exp(x) * chbevl(y, A, len)); - } - - auto coeff_pair = chebyshev_coefficients_i0e_B(); - auto B = std::get<0>(coeff_pair); - auto len = std::get<1>(coeff_pair); - return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); -} - // See note [Jiterator] // TODO: elaborate in this comment on the structure of math.cuh #if AT_USE_JITERATOR() @@ -276,6 +174,19 @@ const auto ndtri_string = jiterator_stringify( } ); // ndtri_string +const auto log_ndtr_string = jiterator_stringify( + template + T log_ndtr(T x) { + constexpr T SQRT1_2{0.707106781186547524400844362104849039}; // 1/sqrt(2) + T t = x * SQRT1_2; + if (x < T{-1.0}) { + return log(erfcx(-t) / 2) - t * t; + } else { + return log1p(-erfc(t) / 2); + } + } +); // log_ndtr_string + const auto gcd_string = jiterator_stringify( template T gcd(const T a_in, const T b_in) { @@ -555,6 +466,8 @@ const auto entr_string = jiterator_stringify( } ); // entr_string +// NOTE: `kaiser_window_string` depends on `i0_string` +// for its implementation. 
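
For reference, the kaiser_window helper added further down evaluates w(a) = I0(beta * sqrt(1 - (a/alpha - 1)^2)) / I0(beta), with inv_alpha = 1/alpha and inv_i0_beta = 1/I0(beta) precomputed by the caller. A hedged host-side restatement (hypothetical name; std::cyl_bessel_i stands in for the file's i0 helper and is C++17, not available in every standard library):

    #include <algorithm>
    #include <cmath>

    double kaiser_window_ref(double a, double inv_alpha, double beta, double inv_i0_beta) {
      double x = a * inv_alpha - 1.0;
      double y = std::max(0.0, 1.0 - x * x);           // clamp rounding error at the edges
      return std::cyl_bessel_i(0.0, beta * std::sqrt(y)) * inv_i0_beta;
    }
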
const auto i0_string = jiterator_stringify( template T chbevl(T x, const T array[], const int len) { @@ -629,69 +542,6 @@ const auto i0_string = jiterator_stringify( } ); // i0_string -const auto i0e_string = jiterator_stringify( - template - T chbevl(T x, const T array[], const int len) { - T b0, b1, b2; - - b0 = array[0]; - b1 = 0; - - for (int i = 1; i < len; ++i) { - b2 = b1; - b1 = b0; - b0 = x * b1 - b2 + array[i]; - } - - return T{0.5} * (b0 - b2); - } - - template - T i0e(T _x) { - T x = fabs(_x); - - if (x <= T{8.0}) { - T coefficients[] = { - -4.41534164647933937950E-18, 3.33079451882223809783E-17, - -2.43127984654795469359E-16, 1.71539128555513303061E-15, - -1.16853328779934516808E-14, 7.67618549860493561688E-14, - -4.85644678311192946090E-13, 2.95505266312963983461E-12, - -1.72682629144155570723E-11, 9.67580903537323691224E-11, - -5.18979560163526290666E-10, 2.65982372468238665035E-9, - -1.30002500998624804212E-8, 6.04699502254191894932E-8, - -2.67079385394061173391E-7, 1.11738753912010371815E-6, - -4.41673835845875056359E-6, 1.64484480707288970893E-5, - -5.75419501008210370398E-5, 1.88502885095841655729E-4, - -5.76375574538582365885E-4, 1.63947561694133579842E-3, - -4.32430999505057594430E-3, 1.05464603945949983183E-2, - -2.37374148058994688156E-2, 4.93052842396707084878E-2, - -9.49010970480476444210E-2, 1.71620901522208775349E-1, - -3.04682672343198398683E-1, 6.76795274409476084995E-1}; - - T y = (x / T{2.0}) - T{2.0}; - return chbevl(y, coefficients, int{30}); - } - - // x > 8 - T coefficients[] = { - -7.23318048787475395456E-18, -4.83050448594418207126E-18, - 4.46562142029675999901E-17, 3.46122286769746109310E-17, - -2.82762398051658348494E-16, -3.42548561967721913462E-16, - 1.77256013305652638360E-15, 3.81168066935262242075E-15, - -9.55484669882830764870E-15, -4.15056934728722208663E-14, - 1.54008621752140982691E-14, 3.85277838274214270114E-13, - 7.18012445138366623367E-13, -1.79417853150680611778E-12, - -1.32158118404477131188E-11, -3.14991652796324136454E-11, - 1.18891471078464383424E-11, 4.94060238822496958910E-10, - 3.39623202570838634515E-9, 2.26666899049817806459E-8, - 2.04891858946906374183E-7, 2.89137052083475648297E-6, - 6.88975834691682398426E-5, 3.36911647825569408990E-3, - 8.04490411014108831608E-1}; - - return chbevl(T{32.0} / x - T{2.0}, coefficients, int{25}) / sqrt(x); - } -); // i0e_string - const auto i1_string = jiterator_stringify( template T chbevl(const T x, const T array[], const int len) { @@ -881,6 +731,15 @@ const auto i1e_string = jiterator_stringify( } ); // i1e_string +const auto kaiser_window_string = i0_string + jiterator_stringify( + template + T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) { + T x = a * inv_alpha - T{1}; + T y = max(T{0}, T{1} - x * x); + return i0(beta * sqrt(y)) * inv_i0_beta; + } +); // kaiser_window_string + const auto sinc_string = jiterator_stringify( template T sinc(T a) { @@ -1509,22 +1368,102 @@ static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { return static_cast(sign * result); } +/* + * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". 
+ */ template -static inline C10_HOST_DEVICE scalar_t calc_i0e(scalar_t _x) { +static inline C10_HOST_DEVICE scalar_t +chbevl(scalar_t _x, const scalar_t array[], size_t len) { static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + + scalar_t b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (size_t i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = _x * b1 - b2 + array[i]; + } + + return (0.5 * (b0 - b2)); +} + +/* + * For licensing information and documentation, please refer to the the cpu implementation located in "ATen/native/Math.h". + */ +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + return std::make_tuple(coefficients, 30); +} + +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
+ */ + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + // Upcast input for numerical accuracy purposes + // Needed for accurate results if input is bfloat16 or float16 scalar_t x = ::abs(_x); + if (x <= scalar_t{8.0}) { auto coeff_pair = chebyshev_coefficients_i0e_A(); auto A = std::get<0>(coeff_pair); auto len = std::get<1>(coeff_pair); scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; - return (chbevl(y, A, len)); + return (::exp(x) * chbevl(y, A, len)); } auto coeff_pair = chebyshev_coefficients_i0e_B(); auto B = std::get<0>(coeff_pair); auto len = std::get<1>(coeff_pair); - return (chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); + return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); } template diff --git a/aten/src/ATen/native/cuda/MaxUnpooling.cu b/aten/src/ATen/native/cuda/MaxUnpooling.cu index 73db29deb4aa..bb9fce986d2f 100644 --- a/aten/src/ATen/native/cuda/MaxUnpooling.cu +++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu @@ -1,11 +1,23 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include + +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh index e0b37802e875..17b02346611a 100644 --- a/aten/src/ATen/native/cuda/MemoryAccess.cuh +++ b/aten/src/ATen/native/cuda/MemoryAccess.cuh @@ -116,6 +116,15 @@ struct LoadWithCast { } } + LoadWithCast(const TensorIteratorBase& iter) { + assert(iter.ninputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i + 1); + element_sizes[i] = c10::elementSize(iter.dtype(i + 1)); + } + } + template __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { void *ptr = base_ptr + element_sizes[arg] * offset; diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index 39305f41e641..e616a7d1fcfb 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -4,89 +4,9 @@ #include #include -#if AT_MAGMA_ENABLED() -#include -#include -#endif - namespace at { namespace native { -#if AT_MAGMA_ENABLED() - -// RAII for a MAGMA Queue -struct MAGMAQueue { - - // Default constructor without a device will cause - // destroying a queue which has not been initialized. 
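
The chbevl helper relocated above evaluates a Chebyshev series with the usual three-term, Clenshaw-style recurrence. A hedged restatement as a host-side template (hypothetical name, identical arithmetic):

    template <typename T>
    T chbevl_ref(T x, const T* coeffs, int len) {
      T b0 = coeffs[0], b1 = T(0), b2 = T(0);
      for (int i = 1; i < len; ++i) {
        b2 = b1;
        b1 = b0;
        b0 = x * b1 - b2 + coeffs[i];  // three-term recurrence over the coefficients
      }
      return T(0.5) * (b0 - b2);
    }
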
- MAGMAQueue() = delete; - - // Constructor - explicit MAGMAQueue(int64_t device_id) { - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - // Magma operations is numerically sensitive, so TF32 should be off - // regardless of the global flag. - TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode)); - TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); -#endif - magma_queue_create_from_cuda( - device_id, - at::cuda::getCurrentCUDAStream(), - handle, - at::cuda::getCurrentCUDASparseHandle(), - &magma_queue_); - } - - // Getter - magma_queue_t get_queue() const { return magma_queue_; } - - // Destructor - ~MAGMAQueue() { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - // We've manually set the math mode to CUBLAS_DEFAULT_MATH, now we - // should restore the original math mode back - cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_); - cublasSetMathMode(handle, original_math_mode); -#endif - magma_queue_destroy(magma_queue_); - } - - private: - magma_queue_t magma_queue_; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasMath_t original_math_mode; -#endif -}; - -static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { - auto result = static_cast(value); - if (static_cast(result) != value) { - AT_ERROR("magma: The value of ", varname, "(", (long long)value, - ") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)"); - } - return result; -} - -// MAGMA functions that don't take a magma_queue_t aren't stream safe -// Work around this by synchronizing with the default stream -struct MagmaStreamSyncGuard { - MagmaStreamSyncGuard() { - auto stream = at::cuda::getCurrentCUDAStream(); - if (stream != at::cuda::getDefaultCUDAStream()) { - at::cuda::stream_synchronize(stream); - } - } - - ~MagmaStreamSyncGuard() noexcept(false) { - auto default_stream = at::cuda::getDefaultCUDAStream(); - if (at::cuda::getCurrentCUDAStream() != default_stream) { - at::cuda::stream_synchronize(default_stream); - } - } -}; -#endif - static inline int cuda_int_cast(int64_t value, const char* varname) { auto result = static_cast(value); TORCH_CHECK(static_cast(result) == value, diff --git a/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu b/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu index 88c88ce0ad80..7f61d9a0b5b0 100644 --- a/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu +++ b/aten/src/ATen/native/cuda/MultiLabelMarginCriterion.cu @@ -1,12 +1,22 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/MultiMarginLoss.cu b/aten/src/ATen/native/cuda/MultiMarginLoss.cu index fcf0a6a2356a..15e6d1e9dc0c 100644 --- a/aten/src/ATen/native/cuda/MultiMarginLoss.cu +++ b/aten/src/ATen/native/cuda/MultiMarginLoss.cu @@ -1,9 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -114,7 +126,7 @@ __global__ void MultiMarginLoss_backward_kernel( } } -void multi_margin_loss_shape_check( +void multi_margin_loss_shape_check(int &nframe, const Tensor &input, 
const Tensor &target) { auto in_sizes = input.sizes(); auto dims = in_sizes.size(); @@ -124,7 +136,7 @@ void multi_margin_loss_shape_check( "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", in_sizes); - int64_t nframe = dims <= 1 ? 1 : in_sizes[0]; + nframe = dims <= 1 ? 1 : in_sizes[0]; TORCH_CHECK( target.dim() <= 1 && target.numel() == nframe, "inconsistent target size, expected ", nframe, " but got ", @@ -138,16 +150,16 @@ Tensor& multi_margin_loss_cuda_out( const c10::optional &weights_, int64_t reduction, Tensor& out_) { auto p = p_.toLong(); TORCH_CHECK(p == 1 || p == 2, "multi_margin_loss: Invalid p, expected 1 or 2 but got ", p); - multi_margin_loss_shape_check(input_, target_); - if (reduction == at::Reduction::None) { - resize_output(out_, target_.sizes()); - } else if (input_.dim() == 2) { - resize_output(out_, {input_.sizes()[0]}); + int nframe; + multi_margin_loss_shape_check(nframe, input_, target_); + + // produce a scalar output for 1d input + if (reduction == Reduction::None && target_.dim() > 0) { + resize_output(out_, {nframe}); } else { resize_output(out_, {}); } - if (input_.numel() == 0) { return out_; } @@ -166,7 +178,6 @@ Tensor& multi_margin_loss_cuda_out( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "multi_margin_loss_cuda", [&] { const scalar_t margin = margin_.to(); if (input.dim() <= 1) { - int nframe = 1; TORCH_CHECK(target.dim() <= 1 && target.numel() == nframe, "inconsistent target size"); dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -196,7 +207,6 @@ Tensor& multi_margin_loss_cuda_out( } else { auto in_sizes = input.sizes(); TORCH_INTERNAL_ASSERT(in_sizes.size() == 2); - int nframe = in_sizes[0]; // allow zero-dim target for 2D input. TORCH_CHECK(in_sizes[1] != 0 && target.dim() <= 1 && target.numel() == nframe, "inconsistent target size"); @@ -248,7 +258,7 @@ Tensor& multi_margin_loss_cuda_out( margin); C10_CUDA_KERNEL_LAUNCH_CHECK(); } - at::sum_out(out, tmp_output, /*dims=*/IntArrayRef{}); + at::sum_out(out, tmp_output, IntArrayRef{}); } } }); @@ -262,7 +272,7 @@ Tensor& multi_margin_loss_cuda_out( Tensor multi_margin_loss_cuda( const Tensor &input, const Tensor &target, const Scalar &p, const Scalar &margin, const c10::optional &weights, int64_t reduction) { - auto out = at::empty({}, input.options()); + auto out = at::empty({0}, input.options()); multi_margin_loss_cuda_out(input, target, p, margin, weights, reduction, out); return out; } @@ -274,7 +284,8 @@ Tensor& multi_margin_loss_cuda_backward_out( auto p = p_.toLong(); TORCH_CHECK(p == 1 || p == 2, "multi_margin_loss_backward: Invalid p, expected 1 or 2 but got ", p); - multi_margin_loss_shape_check(input_, target_); + int nframe; + multi_margin_loss_shape_check(nframe, input_, target_); resize_output(grad_input_, input_.sizes()); if (input_.numel() == 0) { @@ -331,7 +342,6 @@ Tensor& multi_margin_loss_cuda_backward_out( } else { auto in_sizes = input.sizes(); TORCH_INTERNAL_ASSERT(in_sizes.size() == 2); - int nframe = in_sizes[0]; TORCH_CHECK((in_sizes[1] != 0) && (target.dim() <= 1) && (target.numel() == nframe), "inconsistent target size"); dim3 blocks(in_sizes[0]); diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index f9404fab0193..de8e8404ac2d 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -1,8 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include 
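
The include churn in this and the surrounding files follows one pattern: with per-operator headers enabled, a translation unit includes only the op headers it actually calls instead of the monolithic ATen/Functions.h, which trims rebuild times. A hedged sketch of its shape; the concrete op headers below are illustrative, not the ones elided in the hunks above:

    #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
    #include <ATen/core/Tensor.h>

    #ifndef AT_PER_OPERATOR_HEADERS
    #include <ATen/Functions.h>
    #include <ATen/NativeFunctions.h>
    #else
    #include <ATen/ops/empty.h>
    #include <ATen/ops/zeros_like.h>
    #endif
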
-#include +#include +#include #include #include #include @@ -11,6 +12,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include #include #include @@ -74,12 +85,13 @@ void renormRows(Tensor& t) { const int64_t maxThreads = std::min( props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads); + int warp_size = at::cuda::warp_size(); dim3 grid(rows < numSM * 4 ? rows : numSM * 4); - dim3 block(std::min(maxThreads, C10_WARP_SIZE * ceil_div(cols, int64_t{C10_WARP_SIZE}))); + dim3 block(std::min(maxThreads, warp_size * ceil_div(cols, int64_t{warp_size}))); AT_DISPATCH_FLOATING_TYPES_AND_HALF(t.scalar_type(), "renormRows_cuda", [&] { renormRowsL1 - <<>>(t.data_ptr(), rows, cols); C10_CUDA_KERNEL_LAUNCH_CHECK(); @@ -335,8 +347,9 @@ void multinomial_with_replacement_kernel_impl( int maxThreads = props->maxThreadsPerBlock; int maxShared = props->sharedMemPerBlock; - int requiredWarps = at::ceil_div(numCategories, C10_WARP_SIZE); - int requiredThreads = std::min(maxThreads, requiredWarps * C10_WARP_SIZE); + int warp_size = at::cuda::warp_size(); + int requiredWarps = at::ceil_div(numCategories, warp_size); + int requiredThreads = std::min(maxThreads, requiredWarps * warp_size); int requiredShared = requiredThreads * sizeof(accscalar_t); if (n_sample == 1 && maxShared >= requiredShared) { diff --git a/aten/src/ATen/native/cuda/NLLLoss2d.cu b/aten/src/ATen/native/cuda/NLLLoss2d.cu index 79cec9f8da3e..2246c836f3dc 100644 --- a/aten/src/ATen/native/cuda/NLLLoss2d.cu +++ b/aten/src/ATen/native/cuda/NLLLoss2d.cu @@ -1,7 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include #include #include #include @@ -12,6 +12,16 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index a04d118b7502..75b4e3357540 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -1,6 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include #include -#include +#include #include #include #include @@ -9,7 +12,16 @@ #include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu index 1198555d144e..d34de0f156bd 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu @@ -1,6 +1,7 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include #include #include @@ -10,6 +11,17 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu index 2c2c11f22467..6c2942b05de3 100644 --- a/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu +++ b/aten/src/ATen/native/cuda/NaiveDilatedConvolution.cu @@ -1,12 +1,25 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include 
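
The C10_WARP_SIZE-to-at::cuda::warp_size() change above queries the warp width at runtime, since it differs between CUDA (32) and ROCm (64) builds. A hedged sketch of the block-sizing arithmetic with a hypothetical helper name:

    #include <algorithm>
    #include <ATen/ceil_div.h>
    #include <ATen/cuda/CUDAContext.h>

    dim3 pick_reduce_block(int64_t cols, int64_t max_threads) {
      const int64_t warp_size = at::cuda::warp_size();          // runtime query, not a constant
      const int64_t warps = at::ceil_div(cols, warp_size);
      return dim3(std::min(max_threads, warps * warp_size));    // whole warps, capped at the limit
    }
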
+#include +#include #include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/cuda/Nonzero.cu b/aten/src/ATen/native/cuda/Nonzero.cu index dcacf98a8007..0e524b7b81fd 100644 --- a/aten/src/ATen/native/cuda/Nonzero.cu +++ b/aten/src/ATen/native/cuda/Nonzero.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include @@ -6,6 +8,13 @@ #include //for MAX_DIMS #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index 2f9484770ad4..e7b2372a18da 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -1,3 +1,4 @@ +// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include @@ -7,6 +8,30 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +// TODO: Doesn't exist in this branch +#if 0 +#include +#else +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index 266d5f19206d..a9b11e76db68 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -1,6 +1,7 @@ #pragma once -#include +#include +#include #include #include #include @@ -9,6 +10,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { // The maximum number of threads in a block @@ -846,9 +855,10 @@ std::tuple batch_norm_backward_reduce_cuda_templ auto feature_size = input_reshaped.size(2); auto stream = at::cuda::getCurrentCUDAStream(); - int block_y = std::min(lastPow2(batch_size), MAX_BLOCK_SIZE/C10_WARP_SIZE); + int warp_size = at::cuda::warp_size(); + int block_y = std::min(lastPow2(batch_size), MAX_BLOCK_SIZE/warp_size); // We want block_x to be at least a warp width - int block_x = std::min(std::max(getNumThreads(feature_size), C10_WARP_SIZE), MAX_BLOCK_SIZE/block_y); + int block_x = std::min(std::max(getNumThreads(feature_size), warp_size), MAX_BLOCK_SIZE/block_y); const dim3 block(block_x, block_y); const dim3 grid(n_input); diff --git a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh index 6fbbe1f3be47..4f308d0847dc 100644 --- a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh +++ b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh @@ -126,7 +126,7 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc if (!is_transformer_mask) { idx += i*element_count; } - if (mask[idx]) { + if (!mask[idx]) { max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? 
max_value[i] : elements[i][it]; is_meaningful_max = true; } @@ -160,13 +160,18 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc idx += i*element_count; } - if (mask[idx]) { + if (!mask[idx]) { if (is_log_softmax) { sum[i] += std::exp(elements[i][it] - max_value[i]); } else { elements[i][it] = std::exp(elements[i][it] - max_value[i]); sum[i] += elements[i][it]; } + } else { + if (!is_log_softmax) { + // Masked values are treated as -infinity, and std::exp(-infinity) is 0. + elements[i][it] = 0; + } } } } @@ -183,18 +188,10 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc for (int it = 0; it < WARP_ITERATIONS; ++it) { int element_index = local_idx + it * WARP_SIZE; if (element_index < element_count) { - if (is_masked) { - int idx = it*WARP_SIZE; - if (!is_transformer_mask) { - idx += i*element_count; - } - if (!mask[idx]) { - dst[i*element_count+it*WARP_SIZE] = 0; - continue; - } - } if (is_log_softmax) { dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i]; + } else if (sum[i] == 0) { + dst[i*element_count+it*WARP_SIZE] = std::numeric_limits::quiet_NaN(); } else { dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i]; } @@ -205,8 +202,8 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc } } -template -__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count) +template +__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr) { // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel. 
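// A host-side reference (sketch only) of the masked-softmax semantics the
// kernel above now implements: mask[i] == true means "drop this element",
// masked positions behave like -infinity (their exp contributes 0), and a row
// that is entirely masked ends up with a zero normalizer, so the non-log
// softmax writes NaN.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

std::vector<float> masked_softmax_row(const std::vector<float>& x,
                                      const std::vector<bool>& mask) {
  float max_val = -std::numeric_limits<float>::infinity();
  for (size_t i = 0; i < x.size(); ++i) {
    if (!mask[i]) max_val = std::max(max_val, x[i]);
  }
  float sum = 0.f;
  std::vector<float> out(x.size(), 0.f);
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = mask[i] ? 0.f : std::exp(x[i] - max_val);  // exp(-inf) == 0 for masked slots
    sum += out[i];
  }
  for (float& v : out) {
    v = (sum == 0.f) ? std::numeric_limits<float>::quiet_NaN() : v / sum;
  }
  return out;
}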
constexpr int next_power_of_two = 1 << log2_elements; @@ -230,6 +227,9 @@ __global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, grad += thread_offset; output += thread_offset; gradInput += thread_offset; + if (is_masked) { + mask += thread_offset; + } // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep @@ -253,13 +253,14 @@ __global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, } } - acc_t sum[WARP_BATCH]; + acc_t sum[WARP_BATCH] { 0.0f }; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { - sum[i] = grad_reg[i][0]; #pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - sum[i] += grad_reg[i][it]; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) { + sum[i] += grad_reg[i][it]; + } } } warp_reduce(sum); @@ -273,8 +274,11 @@ __global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, for (int it = 0; it < WARP_ITERATIONS; ++it) { int element_index = local_idx + it * WARP_SIZE; if (element_index < element_count) { + if (is_masked && mask[i*element_count+it*WARP_SIZE]) { + gradInput[i*element_count+it*WARP_SIZE] = 0; + } // compute gradients - if (is_log_softmax) { + else if (is_log_softmax) { gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]); } else { gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]); @@ -297,7 +301,8 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele const int next_power_of_two = 1 << log2_elements; // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; @@ -335,8 +340,8 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele } } -template -void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count) +template +void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr) { TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); if (softmax_elements == 0) { @@ -346,7 +351,8 @@ void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const const int next_power_of_two = 1 << log2_elements; // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. - int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; @@ -361,10 +367,10 @@ void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ - softmax_warp_backward \ + softmax_warp_backward \ <<>> \ (grad_input, grad, output, batch_count, softmax_elements_stride, \ - softmax_elements); \ + softmax_elements, mask); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); \ break; diff --git a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu index 5e42326056c1..b1c4a2ae4b41 100644 --- a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu +++ b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,28 +11,88 @@ namespace at { namespace native { +const char addcmul_name[] = "addcmul"; void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "addcmul_cuda", [&]() { - // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float - // and do math in fp32 for better accuracy. - using accscalar_t = at::acc_type; - auto alpha = value.to(); - gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { - return a + alpha * (static_cast(b) * static_cast(c)); + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + #if AT_USE_JITERATOR() + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() { + auto alpha = value.to(); + static const auto addcmul_string = jiterator_stringify( + template T addcmul(T a, T b, T c, T alpha) { return a + alpha * (b * c); }); + jitted_gpu_kernel< + /*name=*/addcmul_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/3>( + iter, + addcmul_string, + /*scalar_pos=*/at::cuda::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(alpha)); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcmul_cuda", [&]() { + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * b * c; + }); + }); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "addcmul_cuda", [&]() { + // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float + // and do math in fp32 for better accuracy. + using accscalar_t = at::acc_type; + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * (static_cast(b) * static_cast(c)); + }); }); - }); + } } +// return a + alpha * (b / static_cast(c)); +const char addcdiv_name[] = "addcdiv"; void addcdiv_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "addcdiv_cuda", [&]() { - // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float - // and do math in fp32 for better accuracy. 
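// The note above ("cast scalar to float and do math in fp32 for better
// accuracy") is the usual accumulate-in-a-wider-type trick for fp16/bf16.
// A minimal CUDA sketch of addcmul for __half data, independent of
// TensorIterator; the kernel name and raw-pointer interface are illustrative.
#include <cuda_fp16.h>

__global__ void addcmul_half(const __half* a, const __half* b, const __half* c,
                             __half* out, float alpha, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Promote every operand to float, combine in fp32, round back to half once.
    const float r = __half2float(a[i]) + alpha * (__half2float(b[i]) * __half2float(c[i]));
    out[i] = __float2half(r);
  }
}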
- using accscalar_t = at::acc_type; - auto alpha = value.to(); - gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { - return a + alpha * (b / static_cast(c)); + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + #if AT_USE_JITERATOR() + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_cuda", [&]() { + auto alpha = value.to(); + static const auto addcdiv_string = + jiterator_stringify(template T addcdiv( + T a, T b, T c, T alpha) { return a + alpha * (b / c); }); + jitted_gpu_kernel< + /*name=*/addcdiv_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/3>( + iter, + addcdiv_string, + /*scalar_pos=*/at::cuda::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(alpha)); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(dtype, "addcdiv_cuda", [&]() { + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * (b / c); + }); + }); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "addcdiv_cuda", [&]() { + // note(mkozuki): If scalar_t is fp16 or bfloat16, cast scalar to float + // and do math in fp32 for better accuracy. + using accscalar_t = at::acc_type; + auto alpha = value.to(); + gpu_kernel(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b, scalar_t c) -> scalar_t { + return a + alpha * (b / static_cast(c)); + }); }); - }); + } } void smooth_l1_backward_cuda_kernel(TensorIterator& iter, const Scalar& norm, double beta) { diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index 659ddc28c497..ed34bc78fba2 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -1,11 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -546,7 +559,7 @@ void checkLSTMBackwardSizes(const TensorArg& grad_hy, const TensorArg& grad_cy, checkNumel(c, workspace, exp_size[0] * exp_size[1] * 4); } -std::tuple _thnn_fused_lstm_cell_backward_cuda( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, +std::tuple _thnn_fused_lstm_cell_backward_impl_cuda( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, const Tensor& workspace, bool has_bias) { // See [Note: hacky wrapper removal for optional tensor] @@ -555,7 +568,7 @@ std::tuple _thnn_fused_lstm_cell_backwar const Tensor& grad_cy = c10::value_or_else(grad_cy_opt, [] {return Tensor();}); if (!grad_hy.defined() && !grad_cy.defined()) { - return std::tuple(); + return std::tuple(); } checkLSTMBackwardSizes({grad_hy, "grad_hy", 1}, {grad_cy, "grad_cy", 2}, {cx, "cx", 3}, {cy, "cy", 4}, @@ -572,7 +585,7 @@ std::tuple _thnn_fused_lstm_cell_backwar }); auto grad_bias = has_bias ? 
grad_gates.sum(0, /*keepdim=*/false) : at::Tensor{}; - return std::make_tuple(grad_gates, grad_gates, grad_cx, grad_bias, grad_bias); + return std::make_tuple(grad_gates, grad_cx, grad_bias); } static constexpr int64_t GRU_WORKSPACE_MULTIPLIER = 5; diff --git a/aten/src/ATen/native/cuda/Randperm.cu b/aten/src/ATen/native/cuda/Randperm.cu index f0c41f5be444..b3c679f77724 100644 --- a/aten/src/ATen/native/cuda/Randperm.cu +++ b/aten/src/ATen/native/cuda/Randperm.cu @@ -1,9 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu index 027806ed4216..55981ac1ad8e 100644 --- a/aten/src/ATen/native/cuda/RangeFactories.cu +++ b/aten/src/ATen/native/cuda/RangeFactories.cu @@ -1,6 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include #include @@ -8,20 +8,39 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #define GPU_LAMBDA __device__ __host__ namespace { -constexpr int num_threads = C10_WARP_SIZE * 2; +#if defined(USE_ROCM) +constexpr int num_threads() { + return 128; +} +#else +constexpr int num_threads() { + return C10_WARP_SIZE * 2; +} +#endif constexpr int thread_work_size = 1; -constexpr int block_work_size = thread_work_size * num_threads; +constexpr int block_work_size = thread_work_size * num_threads(); template -C10_LAUNCH_BOUNDS_1(num_threads) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void elementwise_kernel_with_index(index_t N, func_t f, typename function_traits::result_type *data) { #pragma unroll for (int i = 0; i < thread_work_size; i++) { - index_t idx = block_work_size * blockIdx.x + num_threads * i + threadIdx.x; + index_t idx = block_work_size * blockIdx.x + num_threads() * i + threadIdx.x; if (idx < N) { data[idx] = f(idx); } @@ -38,10 +57,10 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) { auto stream = at::cuda::getCurrentCUDAStream(); using scalar_t = typename function_traits::result_type; if (N <= std::numeric_limits::max()) { - elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/RecordStream.cu b/aten/src/ATen/native/cuda/RecordStream.cu index d48561df00e5..c4cb74bdc68f 100644 --- a/aten/src/ATen/native/cuda/RecordStream.cu +++ b/aten/src/ATen/native/cuda/RecordStream.cu @@ -1,5 +1,13 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { void record_stream_cuda(Tensor& self, c10::Stream stream) { c10::cuda::CUDACachingAllocator::recordStream(self.storage().data_ptr(), at::cuda::CUDAStream::unpack(stream.pack())); diff --git a/aten/src/ATen/native/cuda/Reduce.cu b/aten/src/ATen/native/cuda/Reduce.cu index 103a386ff0c9..2de32f6d4a35 100644 --- a/aten/src/ATen/native/cuda/Reduce.cu +++ b/aten/src/ATen/native/cuda/Reduce.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include 
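// RangeFactories above turns the block size into a constexpr function so ROCm
// can use a fixed 128-thread block while CUDA keeps two warps.  A standalone
// sketch of the same pattern around a simple arange-style kernel; everything
// except the USE_ROCM switch is illustrative.
#if defined(USE_ROCM)
constexpr int num_threads() { return 128; }
#else
constexpr int num_threads() { return 2 * 32; }  // two 32-wide warps per block
#endif

template <typename index_t>
__global__ void __launch_bounds__(num_threads())
arange_kernel(index_t n, float start, float step, float* data) {
  const index_t idx = static_cast<index_t>(blockIdx.x) * num_threads() + threadIdx.x;
  if (idx < n) {
    data[idx] = start + static_cast<float>(idx) * step;
  }
}

// Launched as:
//   arange_kernel<<<(n + num_threads() - 1) / num_threads(), num_threads(), 0, stream>>>(
//       n, start, step, data);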
diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 5ee3757d5937..57fa55fbec7d 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -17,6 +18,9 @@ #include #include +#include +#include + namespace at { namespace native { using at::detail::Array; @@ -272,6 +276,65 @@ func_wrapper_t func_wrapper(const func_t& op) { return func_wrapper_t { op }; } +template +struct ReduceJitOp { +//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations +//Maybe we can find a way to unify ReduceOp and ReduceJitOp + using InputCalculator = OffsetCalculator<1, uint32_t>; + using OutputCalculator = OffsetCalculator<2, uint32_t>; + //TODO for now arg_t is always opmath_t of the input, later we'll need to change it + using arg_t = at::opmath_type; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor, + //not just wrapper + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceJitOp( + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } +}; + template struct ReduceOp { using traits = function_traits; @@ -284,8 +347,6 @@ struct ReduceOp { std::is_convertible::value && std::is_convertible::value; - static constexpr float acc_buffer_multiplier = (float)sizeof(arg_t) / sizeof(out_scalar_t); - static constexpr int input_vec_size = ReduceConfig::input_vec_size; ops_t ops; @@ -837,6 +898,47 @@ static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) } } +template +static void launch_jitted_reduce_kernel(DeviceIndex idx, const ReduceConfig& config, +R& reduction, const std::string& func) { + constexpr int max_threads = mnt_wrapper::MAX_NUM_THREADS; + dim3 block = config.block(); + dim3 grid = config.grid(); + + static std::mutex _jiterator_mutex; + static std::vector> fns(c10::cuda::device_count()); + int shared_memory = config.shared_memory_size(); + at::cuda::jit::NvrtcFunction* fn_ptr; + switch(config.output_vec_size) { + case 4: + fn_ptr = &fns[idx][0]; + break; + case 2: + fn_ptr = &fns[idx][1]; + break; + default: + fn_ptr = &fns[idx][2]; + } + if (!fn_ptr->function) { + std::string f_inputs_type_str = at::cuda::jit::typeName(); + std::string accum_type_str = at::cuda::jit::typeName>(); + std::string result_type_str = at::cuda::jit::typeName(); + int max_threads_codegen = max_threads/config.output_vec_size; + auto code = at::cuda::jit::generate_reduction_code(1, 
func, name, vt0, + f_inputs_type_str, accum_type_str, result_type_str, + true, false, config.output_vec_size, max_threads_codegen); + + *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_"+std::string(name)); + + } + constexpr int kernel_args = 1; + void* args[kernel_args]; + args[0] = static_cast(&reduction); + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory); +} + + class AccumulationBuffer { public: AccumulationBuffer() {} @@ -874,7 +976,7 @@ class AccumulationBuffer { }; template -int get_output_vec_size(TensorIterator &iter) { +int get_output_vec_size(const TensorIterator &iter) { int vec_size = 4; auto update_vec_size = [&vec_size](uint64_t n) { while(n % vec_size != 0) { @@ -898,61 +1000,8 @@ int get_output_vec_size(TensorIterator &iter) { return vec_size; } -template -inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, - AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { - AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); - - using traits = function_traits; - using arg_t = typename traits::template arg<0>::type; - static constexpr bool can_accumulate_in_output = - std::is_convertible::value; - - bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); - std::unique_ptr owned_buf_ptr; - - // The acc_buf_ptr is a shared pointer. It is create at the first entrance and - // reused by all recursive function calls. - if (acc_buf_ptr == NULL) { - // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter - // when accumulation in output is not possible. - if (!can_accumulate_in_output && !can_use_32bit_indexing) { - int64_t output_memory_size = iter.element_size(0); - for (int dim = 0; dim < iter.ndim(); dim++) { - output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); - } - output_memory_size /= iter.element_size(0); //iter.strides is in bytes - owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), - sizeof(out_scalar_t), - (char*) iter.data_ptr(0), - output_memory_size * sizeof(arg_t))); - } else { - owned_buf_ptr.reset(new AccumulationBuffer()); - } - acc_buf_ptr = owned_buf_ptr.get(); - } - - if (!can_use_32bit_indexing) { - for (auto& sub_iter : iter.with_32bit_indexing()) { - int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; - - gpu_reduce_kernel(sub_iter, ops, ident, - acc_buf_ptr, sub_iter_base_idx); - } - return; - } - - const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); - char* out_data = (char*)iter.data_ptr(0); - const auto noutputs = iter.noutputs(); - optional out_data_extra; - if (noutputs > 1) { - out_data_extra = (char*)iter.data_ptr(1); - } else { - out_data_extra = nullopt; - } - char* acc_data = acc_buf_ptr->get_acc_slice(out_data); - +template +ReduceConfig setReduceConfig(const TensorIterator& iter){ // Start by assuming that each thread handles a single output and all // the inputs for that output. 
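// get_output_vec_size above (now taking the iterator by const reference) picks
// how wide the vectorized output stores may be.  A simplified standalone
// sketch of the rule: start at 4 and halve until every relevant extent divides
// evenly; the parameter names are illustrative stand-ins for what the real
// function reads from the TensorIterator.
#include <algorithm>
#include <cstdint>

int pick_output_vec_size(uint64_t num_output_elements,
                         uint64_t fastest_output_stride,
                         uint64_t base_offset_in_elements) {
  int vec_size = 4;
  auto shrink_until_divisible = [&vec_size](uint64_t n) {
    while (n % vec_size != 0) {
      vec_size /= 2;
    }
  };
  shrink_until_divisible(num_output_elements);
  shrink_until_divisible(fastest_output_stride);
  shrink_until_divisible(base_offset_in_elements);
  return std::max(vec_size, 1);
}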
int64_t num_outputs = iter.num_output_elements(); @@ -1080,7 +1129,64 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id config.input_mult[2] = config.split_input(config.ctas_per_output); } } + return config; +}; + +template +inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + using traits = function_traits; + using arg_t = typename traits::template arg<0>::type; + static constexpr bool can_accumulate_in_output = + std::is_convertible::value; + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(arg_t))); + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + gpu_reduce_kernel(sub_iter, ops, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + ReduceConfig config = setReduceConfig(iter); at::DataPtr buffer; at::DataPtr semaphores; if (config.should_global_reduce()) { @@ -1115,4 +1221,101 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id launch_reduce_kernel::MAX_NUM_THREADS>(config, reduce); } +//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function +//try unifying with gpu_reduce_kernel +template +inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + //TODO - this will be different for more complicated reductions, but for now reductions using + //func_wrapper all have arg_t = opmath + using arg_t = at::opmath_type; + static constexpr bool can_accumulate_in_output = + std::is_convertible::value; + static_assert(can_accumulate_in_output == true, "unsupported arg_t for jitted reduction"); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. 
+ if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(out_scalar_t))); //TODO + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + jitted_gpu_reduce_kernel(sub_iter, func, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + //TODO - for now we support a single input, we may be able to relax this constraint + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceJitOp( + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + launch_jitted_reduce_kernel(iter.device().index(), + config, reduce, func); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ReduceOps.cpp b/aten/src/ATen/native/cuda/ReduceOps.cpp index ec1cbd3b64fb..ab878f82e3a0 100644 --- a/aten/src/ATen/native/cuda/ReduceOps.cpp +++ b/aten/src/ATen/native/cuda/ReduceOps.cpp @@ -1,13 +1,29 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include #include #include #include #include +#include +#include +#include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -33,12 +49,6 @@ void norm_kernel_cuda(TensorIterator& iter, const Scalar& val) { } -void linalg_vector_norm_kernel_cuda(TensorIterator& iter, Scalar ord) { - TORCH_CHECK(ord.isFloatingPoint(), "linalg.vector_norm expects ord to be float"); - norm_kernel_cuda(iter, ord); -} - - void min_kernel_impl(const Tensor& result, const Tensor& indice, const Tensor& self, int64_t dim, bool keepdim) { auto iter = meta::make_reduction(self, result, indice, dim, keepdim, self.scalar_type(), kLong); min_launch_kernel(iter); 
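// When a reduction needs more than one block per output ("global reduce"), the
// code above allocates a scratch buffer plus per-output int semaphores and
// zeroes the semaphores asynchronously on the launch stream before the kernel
// runs.  A bare CUDA sketch of that setup (the real code goes through the
// CUDA caching allocator); the function name is illustrative.
#include <cuda_runtime.h>

cudaError_t alloc_global_reduce_scratch(void** cta_buffer, int** semaphores,
                                        size_t buffer_bytes, size_t semaphore_bytes,
                                        cudaStream_t stream) {
  cudaError_t err = cudaMalloc(cta_buffer, buffer_bytes);
  if (err != cudaSuccess) return err;
  err = cudaMalloc(reinterpret_cast<void**>(semaphores), semaphore_bytes);
  if (err != cudaSuccess) return err;
  // The reduction kernel assumes every semaphore starts at zero.
  return cudaMemsetAsync(*semaphores, 0, semaphore_bytes, stream);
}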
@@ -86,6 +96,5 @@ REGISTER_CUDA_DISPATCH(aminmax_allreduce_stub, &aminmax_allreduce_kernel_impl); REGISTER_CUDA_DISPATCH(aminmax_stub, &aminmax_kernel_impl); REGISTER_CUDA_DISPATCH(norm_stub, &norm_kernel_cuda); -REGISTER_CUDA_DISPATCH(linalg_vector_norm_stub, &linalg_vector_norm_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu index bf81ed5b7940..be1d7c515a3e 100644 --- a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu @@ -5,6 +5,8 @@ #include #include #include +#include +#include namespace at { namespace native { @@ -18,6 +20,35 @@ struct sum_functor { } }; +// jiterated specialization for `complex` +const char sum_name[] = "sum"; +template <> +struct sum_functor> { +// jiterator reduction fails on windows +// Ref: https://github.com/pytorch/pytorch/issues/77305 +#if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + std::string func = jiterator_stringify( + arg_t combine(arg_t a, arg_t b) { + return a + b; + } + ); + jitted_gpu_reduce_kernel( + iter, func, 0.); + } +#else + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { + return a + b; + }), acc_t{0.}); + } +#endif +}; + template struct nansum_functor { void operator()(TensorIterator& iter) { @@ -26,14 +57,30 @@ struct nansum_functor { } }; +const char prod_name[] = "prod"; + template struct prod_functor { + // jiterator reduction fails on windows + // Ref: https://github.com/pytorch/pytorch/issues/77305 + #if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + std::string func = jiterator_stringify( + arg_t combine(arg_t a, arg_t b) { + return a * b; + } + ); + jitted_gpu_reduce_kernel( + iter, func, 1.); + } + #else void operator()(TensorIterator& iter) { gpu_reduce_kernel( iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a * b; - }), 1); + }), 1.); } + #endif }; // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] @@ -47,6 +94,31 @@ struct prod_functor { } }; +// jiterated specialization for `complex` +template <> +struct prod_functor> { +// jiterator reduction fails on windows +// Ref: https://github.com/pytorch/pytorch/issues/77305 +#if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + std::string func = + jiterator_stringify(arg_t combine(arg_t a, arg_t b) { return a * b; }); + jitted_gpu_reduce_kernel(iter, func, 1.); + } +#else + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, + func_wrapper( + [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a * b; }), + acc_t{1.}); + } +#endif +}; + // The function `reduce_dispatch` below dispatches to the kernel based // on the type of `iter`. It takes care of the common logic // for handling Half-Precision floating types. 
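// sum_functor / prod_functor above describe a reduction purely by a combine
// function and its identity (0 for sum, 1 for prod), whether the combine body
// is jiterated from a string or compiled as a lambda.  A trivial host-side
// sketch of that shape:
#include <vector>

template <typename acc_t, typename combine_t>
acc_t reduce_all(const std::vector<acc_t>& values, combine_t combine, acc_t identity) {
  acc_t acc = identity;
  for (const acc_t& v : values) {
    acc = combine(acc, v);
  }
  return acc;
}

// Usage: float p = reduce_all<float>(xs, [](float a, float b) { return a * b; }, 1.0f);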
@@ -79,8 +151,8 @@ static void reduce_dispatch(TensorIterator& iter, GeneralDispatcher op) { static void sum_kernel_cuda(TensorIterator& iter){ auto general_dispatcher = [](TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND( - ScalarType::Bool, iter.dtype(), "sum_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kBool, kComplexHalf, iter.dtype(), "sum_cuda", [&]() { sum_functor{}(iter); }); }; @@ -100,7 +172,7 @@ static void nansum_kernel_cuda(TensorIterator& iter) { static void prod_kernel_cuda(TensorIterator& iter) { auto general_dispatcher = [](TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(ScalarType::Bool, iter.dtype(), "prod_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kComplexHalf, kBool, iter.dtype(), "prod_cuda", [&]() { prod_functor{}(iter); }); }; diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index e497bae885f0..33f71368ca10 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -1,12 +1,27 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include namespace at { diff --git a/aten/src/ATen/native/cuda/Repeat.cu b/aten/src/ATen/native/cuda/Repeat.cu index 43d6602ea8e2..1b29dac6690f 100644 --- a/aten/src/ATen/native/cuda/Repeat.cu +++ b/aten/src/ATen/native/cuda/Repeat.cu @@ -1,7 +1,15 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + template __global__ static void compute_cuda_kernel( index_t* repeat_ptr, @@ -33,7 +41,7 @@ static void compute_cuda( int64_t size, int64_t result_size) { int64_t block = 512; - int64_t warps_per_block = block / C10_WARP_SIZE; + int64_t warps_per_block = block / at::cuda::warp_size(); int64_t grid = std::min((size + warps_per_block - 1) / warps_per_block, 2048L); diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index 754161c62097..d967ffd0354d 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -1,13 +1,26 @@ -#include +#include #include +#include #include #include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/Resize.cpp b/aten/src/ATen/native/cuda/Resize.cpp index c4167ec56e67..43e1cb951574 100644 --- a/aten/src/ATen/native/cuda/Resize.cpp +++ b/aten/src/ATen/native/cuda/Resize.cpp @@ -1,10 +1,16 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include -#include -#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Resize.h b/aten/src/ATen/native/cuda/Resize.h index 33ab263693dc..569b145fa61d 100644 --- a/aten/src/ATen/native/cuda/Resize.h +++ b/aten/src/ATen/native/cuda/Resize.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -9,19 +9,15 @@ namespace at { namespace native { TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t 
size_bytes); -static inline void maybe_resize_storage_cuda(TensorImpl* self, uint64_t new_size) { +static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) { // It does not make sense to try to resize a storage // to hold 0 elements, and this can break // if storage_offset is positive but // new_size is 0, so just bail in that case // (same comment is in Resize.h) - if (new_size == 0) { + if (self->numel() == 0) { return; } - auto new_size_bytes_i = (new_size + self->storage_offset()) * self->dtype().itemsize(); - TORCH_CHECK(!overflows(new_size_bytes_i), "Requested storage size (", - new_size_bytes_i, ") cannot be represented as a size_t"); - const auto new_size_bytes = static_cast(new_size_bytes_i); const Storage &storage = self->unsafe_storage(); TORCH_CHECK(storage, "Tensor: invalid null storage"); @@ -33,7 +29,7 @@ static inline void maybe_resize_storage_cuda(TensorImpl* self, uint64_t new_size inline TensorImpl* resize_impl_cuda_( TensorImpl* self, IntArrayRef size, - c10::optional stride, + at::OptionalIntArrayRef stride, bool device_guard = true) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; @@ -45,14 +41,17 @@ inline TensorImpl* resize_impl_cuda_( guard.set_index(self->storage().device().index()); } - int64_t storage_size = 1; + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; if (stride) { self->set_sizes_and_strides(size, *stride); - // NB: storage size can be different from numel. - storage_size = storage_size_for(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); } else { self->set_sizes_contiguous(size); - storage_size = self->numel(); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); } maybe_resize_storage_cuda(self, storage_size); diff --git a/aten/src/ATen/native/cuda/RreluWithNoise.cu b/aten/src/ATen/native/cuda/RreluWithNoise.cu index 048118cf7925..3b2435d3dae4 100644 --- a/aten/src/ATen/native/cuda/RreluWithNoise.cu +++ b/aten/src/ATen/native/cuda/RreluWithNoise.cu @@ -1,6 +1,18 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { @@ -132,6 +144,12 @@ Tensor& rrelu_with_noise_out_cuda(const Tensor& self, bool training, c10::optional generator, Tensor& output) { + at::native::resize_output(output, self.sizes()); + + if (self.numel() == 0) { + return output; + } + TensorArg self_arg{self, "self", 1}, noise_arg{noise, "noise", 2}, output_arg{output, "output", 3}; checkAllSameGPU("rrelu_with_noise_out_cuda", {self_arg, noise_arg, output_arg}); diff --git a/aten/src/ATen/native/cuda/ScanKernels.cpp b/aten/src/ATen/native/cuda/ScanKernels.cpp index f88faa1fcac9..8ba8b742af77 100644 --- a/aten/src/ATen/native/cuda/ScanKernels.cpp +++ b/aten/src/ATen/native/cuda/ScanKernels.cpp @@ -1,10 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { static c10::MaybeOwned contiguous_out_arg(const Tensor &tensor) { diff --git a/aten/src/ATen/native/cuda/ScanKernels.h b/aten/src/ATen/native/cuda/ScanKernels.h index a502847f6307..28e65372511b 100644 --- 
a/aten/src/ATen/native/cuda/ScanKernels.h +++ b/aten/src/ATen/native/cuda/ScanKernels.h @@ -1,3 +1,4 @@ +#pragma once #include namespace at { diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 4ec12e166634..8461aa4cd8e3 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -1,6 +1,7 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include @@ -9,6 +10,7 @@ #include #include +#include #include #include #include @@ -19,8 +21,9 @@ namespace at { namespace native { class ReduceMultiply { public: template - constexpr C10_DEVICE void operator() (scalar_t * self_data, const scalar_t * src_data) const { - gpuAtomicMul(self_data, *src_data); + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMul(self_data_start + index, *src_data); } }; static ReduceMultiply reduce_multiply; @@ -28,17 +31,47 @@ static ReduceMultiply reduce_multiply; class ReduceAdd { public: template - constexpr C10_DEVICE void operator() (scalar_t * self_data, const scalar_t * src_data) const { - gpuAtomicAddNoReturn(self_data, *src_data); + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); } }; static ReduceAdd reduce_add; +class ReduceMean { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); + } +}; +static ReduceMean reduce_mean; + +class ReduceMinimum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMin(self_data_start + index, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + +class ReduceMaximum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMax(self_data_start + index, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + class TensorAssign { public: template - constexpr C10_DEVICE void operator() (scalar_t * self_data, const scalar_t * src_data) const { - *self_data = *src_data; + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + *(self_data_start + index) = *src_data; } }; static TensorAssign tensor_assign; @@ -87,12 +120,13 @@ struct _cuda_scatter_gather_internal_kernel { TensorIterator& iter, int64_t index_size, int64_t index_stride, + int64_t numel, // Do not use `const` qualifier here as it may cause issue in cuda 11.6.x. 
See #75434, #75545 const func_t& f ) { if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { _cuda_scatter_gather_internal_kernel()( - sub_iter, index_size, index_stride, f + sub_iter, index_size, index_stride, numel, f ); } return; @@ -110,14 +144,12 @@ struct _cuda_scatter_gather_internal_kernel { CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size && "index out of bounds"); - char* self_data = self_ptr + offsets[0]; - char* src_data = src_ptr + offsets[1]; - f( - (scalar_t*)self_data + (is_scatter_like ? idx_dim * index_stride : 0), - (scalar_t*)src_data + (is_scatter_like ? 0 : idx_dim * index_stride) + (scalar_t*)(self_ptr + offsets[0]), + is_scatter_like ? idx_dim * index_stride : 0, + numel, + (scalar_t*)(src_ptr + offsets[1]) + (is_scatter_like ? 0 : idx_dim * index_stride) ); - }; _launch_scatter_gather_kernel(iter.numel(), loop); @@ -126,12 +158,11 @@ struct _cuda_scatter_gather_internal_kernel { template struct cuda_scatter_gather_base_kernel { - template void operator()( const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src, const std::string& method_name, - const func_t& f + const ReduceAdd& f ) { at::assert_no_internal_overlap(self); @@ -179,7 +210,7 @@ struct cuda_scatter_gather_base_kernel { OpaqueType, scalar_t>::type; _cuda_scatter_gather_internal_kernel()( - iter, index_size, index_stride, f + iter, index_size, index_stride, self.numel(), f ); } ); @@ -189,7 +220,66 @@ struct cuda_scatter_gather_base_kernel { const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src, const std::string& method_name, - const ReduceMultiply& f + const TensorAssign& f + ) { + at::assert_no_internal_overlap(self); + + auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); + auto self_strides = ensure_nonempty_vec(self.strides().vec()); + auto src_strides = ensure_nonempty_vec(src.strides().vec()); + + // restride self and src such that + // self.shape = src.shape = index.shape + // + // restride stride[dim] such that + // if (is_scatter_like) self.stride[dim] = 0 + // else src.stride[dim] = 0 + auto self_restrided = is_scatter_like ? + restride_dim(self, dim, index_sizes) + : self.as_strided(index_sizes, self_strides); + auto src_restrided = is_scatter_like ? + src.as_strided(index_sizes, src_strides) + : restride_dim(src, dim, index_sizes); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self_restrided) + .add_input(src_restrided) + .add_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_size = is_scatter_like ? self_dim_size : src_dim_size; + auto index_stride = is_scatter_like ? 
self_dim_stride : src_dim_stride; + + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + iter.dtype(), + "cuda_scatter_gather_base_kernel_func", [&] { + using dtype = typename std::conditional, scalar_t>::type; + + _cuda_scatter_gather_internal_kernel()( + iter, index_size, index_stride, self.numel(), f + ); + } + ); + } + + template + void operator()( + const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, + const func_t& f ) { at::assert_no_internal_overlap(self); @@ -232,12 +322,12 @@ struct cuda_scatter_gather_base_kernel { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), - "cuda_scatter_gather_base_kernel_reduce_multiply", [&] { + "cuda_scatter_gather_base_kernel_func", [&] { using dtype = typename std::conditional, scalar_t>::type; _cuda_scatter_gather_internal_kernel()( - iter, index_size, index_stride, f + iter, index_size, index_stride, self.numel(), f ); } ); @@ -252,12 +342,13 @@ struct _cuda_scatter_fill_internal_kernel { scalar_t src_val, int64_t index_size, int64_t index_stride, + int64_t numel, // Do not use `const` qualifier here as it may cause issue in cuda 11.6.x. See #75434, #75545 const func_t& f ) { if (!iter.can_use_32bit_indexing()) { for (auto& sub_iter : iter.with_32bit_indexing()) { _cuda_scatter_fill_internal_kernel()( - sub_iter, src_val, index_size, index_stride, f + sub_iter, src_val, index_size, index_stride, numel, f ); } return; @@ -275,13 +366,12 @@ struct _cuda_scatter_fill_internal_kernel { && "index out of bounds" ); - char* self_data = self_ptr + offsets[0]; - f( - (scalar_t*)self_data + idx_dim * index_stride, + (scalar_t*)(self_ptr + offsets[0]), + idx_dim * index_stride, + numel, (scalar_t*)&src_val ); - }; _launch_scatter_gather_kernel(iter.numel(), loop); @@ -328,7 +418,7 @@ struct cuda_scatter_fill_base_kernel { auto src_val = *(dtype*)&src_scalar_val; _cuda_scatter_fill_internal_kernel()( - iter, src_val, index_size, index_stride, f + iter, src_val, index_size, index_stride, self.numel(), f ); } ); @@ -371,7 +461,7 @@ struct cuda_scatter_fill_base_kernel { auto src_val = *(dtype*)&src_scalar_val; _cuda_scatter_fill_internal_kernel()( - iter, src_val, index_size, index_stride, f + iter, src_val, index_size, index_stride, self.numel(), f ); } ); @@ -416,6 +506,35 @@ void scatter_reduce_cuda_kernel(const Tensor& self, const int64_t dim, const Ten cuda_scatter_gather_base_kernel()(self, dim, index, src, "scatter_reduce_cuda_multiply_", reduce_multiply); break; + default : + break; + } +} + +void scatter_reduce_two_cuda_kernel(const Tensor& self, const int64_t dim, const Tensor& index, + const Tensor& src, const SCATTER_GATHER_OP& reduce) { + globalContext().alertNotDeterministic("scatter_reduce_cuda"); + switch (reduce) { + case SCATTER_GATHER_OP::REDUCE_ADD : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_sum_", reduce_add); + break; + case SCATTER_GATHER_OP::REDUCE_MULTIPLY : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_prod_", reduce_multiply); + break; + case SCATTER_GATHER_OP::REDUCE_MAXIMUM : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_amax_", reduce_maximum); + break; + case SCATTER_GATHER_OP::REDUCE_MINIMUM : + cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_amin_", reduce_minimum); + break; + case SCATTER_GATHER_OP::REDUCE_MEAN : + 
cuda_scatter_gather_base_kernel()(self, dim, index, src, + "scatter_reduce_cuda_mean_", reduce_mean); + break; } } @@ -430,6 +549,8 @@ void scatter_scalar_reduce_cuda_kernel(const Tensor& self, const int64_t dim, co cuda_scatter_fill_base_kernel()(self, dim, index, value, "scatter_fill_cuda_multiply_", reduce_multiply); break; + default : + break; } } @@ -440,5 +561,6 @@ REGISTER_DISPATCH(scatter_fill_stub, &scatter_fill_cuda_kernel); REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cuda_kernel); REGISTER_DISPATCH(scatter_reduce_stub, &scatter_reduce_cuda_kernel); REGISTER_DISPATCH(scatter_scalar_reduce_stub, &scatter_scalar_reduce_cuda_kernel); +REGISTER_DISPATCH(scatter_reduce_two_stub, &scatter_reduce_two_cuda_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index 6a5a768ae0d8..862de29c76cb 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -1,12 +1,20 @@ - +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 17eb91973075..08605cf4ed1b 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include @@ -9,14 +10,21 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { -#if defined(USE_ROCM) -constexpr int CAT_ARRAY_BATCH_SIZE = 1024; -#else constexpr int CAT_ARRAY_BATCH_SIZE = 128; -#endif constexpr int CAT_ARRAY_MAX_INPUT_DIMS = 4; namespace { @@ -83,45 +91,6 @@ struct TensorSizeStride { */ -// Use pinned memory and and pass the struct by pointer on ROCm -template -struct CatArrInputTensor { - T* input; - IndexType offset; - IndexType dimSize; - IndexType nElements; -}; - -template -C10_LAUNCH_BOUNDS_1(512) -__global__ void HIP_CatArrayBatchedCopy( - T* output, - CatArrInputTensor* inputs, - TensorSizeStride os, - const int concatDim, - IndexType dimStride) { - - IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; - IndexType nElements = inputs[blockIdx.y].nElements; - - if(tid >= nElements) return; - - T* data = inputs[blockIdx.y].input; - IndexType offset = inputs[blockIdx.y].offset; - IndexType dimSize = inputs[blockIdx.y].dimSize; - IndexType dataOffset = offset * dimStride; - - IndexType stride = gridDim.x * blockDim.x; - - while( tid < nElements){ - IndexType elementOffset = CatArrIndexToOffset::compute( - os.tensorSize, os.tensorStride, dimSize, concatDim, tid); - output[dataOffset + elementOffset] = data[tid]; - - tid += stride; - } -} - // pass meta data directly through kernel argument instead of pin memory // In contiguous case, we will not need stride_size, setting it as 1 as placeholder // to pass compile. @@ -171,129 +140,8 @@ __global__ void CatArrayBatchedCopy( } } -template -void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, - int nDims, c10::MemoryFormat memory_format) { - // First, let's set up our kernel parameters. We start with a raw pointer to - // the storage for the output Tensor. 
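// The new scatter_reduce_two kernel above folds src[i] into out[index[i]] with
// one atomic per reduction kind ("amax" -> gpuAtomicMax, "sum"/"mean" ->
// fastAtomicAdd, and so on).  A bare CUDA sketch of the amax case for int
// data, where the hardware atomicMax applies directly (floating point needs a
// compare-and-swap loop, which the gpuAtomicMax wrapper hides).  Indices are
// assumed to be in range.
#include <cstdint>

__global__ void scatter_amax_int(int* out, const int* src, const int64_t* index, int64_t n) {
  const int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    atomicMax(out + index[i], src[i]);
  }
}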
- scalar_t *data = out.data_ptr(); - - // Kernel Parameter - long tensorMetadataSize = - sizeof(CatArrInputTensor) * CAT_ARRAY_BATCH_SIZE; - auto d_inputs_storage = at::empty( - {tensorMetadataSize}, out.options().dtype(at::kByte)); - auto d_inputs = static_cast *>( - d_inputs_storage.data_ptr()); - - TensorSizeStride outputParam; - - // Next, let's initialize the size, stride arrays for the output Tensor. - if (memory_format == c10::MemoryFormat::Contiguous) { - for (int i = 0; i < nDims; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i); - outputParam.tensorStride[i] = out.stride(i); - } - } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { - // permute the semantics of dims from NCHW to NHWC so that the input - // tensor is now contiguous - outputParam.tensorSize[0] = at::native::size(out, 0); - outputParam.tensorStride[0] = out.stride(0); - for (int i = 1; i < nDims - 1; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i + 1); - outputParam.tensorStride[i] = out.stride(i + 1); - } - outputParam.tensorSize[nDims - 1] = at::native::size(out, 1); - outputParam.tensorStride[nDims - 1] = out.stride(1); - } else { - TORCH_CHECK(false, "unsupported memory format"); - } - - at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); - - // Now we loop - int batchCounter = 0; - int64_t offset = 0; - for (int i = 0; i < inputs.size() ; i += CAT_ARRAY_BATCH_SIZE) { - // Re-allocate stackInputs every iteration to avoid read-after-write hazard - { - auto stackInputs_storage = at::empty({tensorMetadataSize}, - out.options().dtype(at::kByte).device(at::kCPU).pinned_memory(true)); - auto stackInputs = - static_cast *>( - stackInputs_storage.data_ptr()); - for (batchCounter = 0; - batchCounter < CAT_ARRAY_BATCH_SIZE && - (i+batchCounter) < inputs.size(); - ++batchCounter) { - int64_t dimSize = 0; - // There is a legacy case where a 1-D empty tensor can be concat with - // high-dimensional tensor - if (inputs[i+batchCounter].numel() > 0) { - dimSize = at::native::size(inputs[i+batchCounter], dimension); - } - - stackInputs[batchCounter].input = - inputs[i+batchCounter].data_ptr(); - stackInputs[batchCounter].offset = offset; - stackInputs[batchCounter].dimSize = dimSize; - stackInputs[batchCounter].nElements = inputs[i+batchCounter].numel(); - - // update offset - offset += dimSize; - } - at::native::copy_(d_inputs_storage, stackInputs_storage, - /* non_blocking= */ true); - } - - // Next, let's consider how we set our kernel launch parameters. - // We borrow from THCApply, which the kernel's internal indexing - // is based on. - dim3 applyBlock = dim3(32*16); - - //Get grid where x dim fills half gpu and y dim is number of tensors. - //This will have cating two tensors fill the entire grid, but prevent - //many threads from needlessly load meta data if their sizes is small. 
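// The surviving CUDA path (parallel_cat / CatArrayBatchedCopy below) packs
// pointers and offsets for up to CAT_ARRAY_BATCH_SIZE inputs into one struct
// passed by value as a kernel argument, so a single launch copies a whole
// batch and the ROCm-only pinned-memory variant above could be dropped.  A
// simplified sketch for the easy case of concatenating contiguous tensors
// along dim 0, where each input is a contiguous slab of the output; names are
// illustrative, and the struct stays well under the 4 KiB kernel-argument limit.
#include <cstdint>

constexpr int kCatBatch = 128;  // mirrors CAT_ARRAY_BATCH_SIZE

template <typename T>
struct CatBatchMeta {
  const T* input[kCatBatch];
  int64_t offset[kCatBatch];   // start of this input along the cat dimension
  int64_t numel[kCatBatch];    // number of elements to copy from this input
};

template <typename T>
__global__ void cat_batched_copy(T* out, CatBatchMeta<T> meta, int64_t outer_stride) {
  const int b = blockIdx.y;                             // which input of the batch
  const T* in = meta.input[b];
  const int64_t n = meta.numel[b];
  const int64_t base = meta.offset[b] * outer_stride;   // slab start in the output
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n; i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    out[base + i] = in[i];
  }
}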
- dim3 catGrid; - getCatGrid(batchCounter, catGrid); - - if (memory_format != c10::MemoryFormat::Contiguous) { - switch (dimension) { - case 0: - break; - case 1: - dimension = nDims - dimension; - break; - default: - dimension--; - } - } - // Template Declarations for dim = 1, 2, 3, 4 -#define HANDLE_CASE(DIMS) \ - HIP_CatArrayBatchedCopy<<<\ - catGrid, applyBlock, 0, stream.stream()>>>(\ - data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); \ - C10_CUDA_KERNEL_LAUNCH_CHECK(); - switch (nDims) { - case 1: - HANDLE_CASE(1); - break; - case 2: - HANDLE_CASE(2); - break; - case 3: - HANDLE_CASE(3); - break; - case 4: - HANDLE_CASE(4); - break; - } -#undef HANDLE_CASE - } -} - template -void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, +void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, int64_t dimension, int nDims, c10::MemoryFormat memory_format) { // First, let's set up our kernel parameters. We start with a raw pointer to // the storage for the output Tensor. @@ -304,19 +152,19 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, // Next, let's initialize the size, stride arrays for the output Tensor. if (memory_format == c10::MemoryFormat::Contiguous) { for (int i = 0; i < nDims; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i); + outputParam.tensorSize[i] = out.size(i); outputParam.tensorStride[i] = out.stride(i); } } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { // permute the semantics of dims from NCHW to NHWC so that the input // tensor is now contiguous - outputParam.tensorSize[0] = at::native::size(out, 0); + outputParam.tensorSize[0] = out.size(0); outputParam.tensorStride[0] = out.stride(0); for (int i = 1; i < nDims - 1; ++i) { - outputParam.tensorSize[i] = at::native::size(out, i + 1); + outputParam.tensorSize[i] = out.size(i + 1); outputParam.tensorStride[i] = out.stride(i + 1); } - outputParam.tensorSize[nDims - 1] = at::native::size(out, 1); + outputParam.tensorSize[nDims - 1] = out.size(1); outputParam.tensorStride[nDims - 1] = out.stride(1); } else { TORCH_CHECK(false, "unsupported memory format"); @@ -335,16 +183,16 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, int64_t dimSize = 0; // There is a legacy case where a 1-D empty tensor can be concat with // high-dimensional tensor - if (inputs[i+batchCounter].numel() > 0) { - dimSize = at::native::size(inputs[i+batchCounter], dimension); + if (inputs[i+batchCounter].get().numel() > 0) { + dimSize = inputs[i+batchCounter].get().size(dimension); } - catMetaData.input[batchCounter] = inputs[i+batchCounter].data_ptr(); + catMetaData.input[batchCounter] = inputs[i+batchCounter].get().data_ptr(); catMetaData.offset[batchCounter] = offset; catMetaData.dimSize[batchCounter] = dimSize; - catMetaData.nElements[batchCounter] = inputs[i+batchCounter].numel(); + catMetaData.nElements[batchCounter] = inputs[i+batchCounter].get().numel(); if (stride_size > 1) { - auto strides = inputs[i+batchCounter].strides(); - auto sizes = inputs[i+batchCounter].sizes(); + auto strides = inputs[i+batchCounter].get().strides(); + auto sizes = inputs[i+batchCounter].get().sizes(); for(int j = 0; j < nDims; j++){ catMetaData.tensorStride[batchCounter].tensorSize[j] = sizes[j]; catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j]; @@ -403,125 +251,20 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t 
dimension, } } // namespace -Tensor cat_cuda(TensorList inputs, int64_t dimension) { - ScalarType high_type = result_type(inputs); - Tensor out = at::empty({0}, inputs.front().options().dtype(high_type)); - at::native::cat_out_cuda(inputs, dimension, out); - return out; -} - -inline c10::MemoryFormat compute_output_memory_format(const TensorList &inputs) { - c10::optional format = c10::nullopt; - for (auto &t : inputs) { - auto f = t.suggest_memory_format(); - if (!format.has_value()) { - format = f; - continue; - } - if (format.value() == f) { - continue; - } - bool contiguous = (format.value() == c10::MemoryFormat::Contiguous || f == c10::MemoryFormat::Contiguous || format.value() != f); - if (contiguous) { - return c10::MemoryFormat::Contiguous; - } - } - return format.value(); -} - -Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { - check_cat_no_zero_dim(inputs); - dimension = legacy_cat_wrap_dim(dimension, inputs); - - // previously, size [0] tensors were the only possible empty tensors; thus, it - // wasn't possible to cat empty tensors unless all the other tensors were - // 1-dimensional, so we allowed these tensors to be "skipped". We maintain - // this behavior for backwards compatibility, but only for this specific size - // (i.e. other empty sizes are not skipped). - // FIXME: warn if this is the case - auto should_skip = [](const Tensor &t) { - return t.dim() == 1 && at::native::size(t, 0) == 0; - }; - - const Tensor *notSkippedTensor = NULL; // non-owning reference - int nDims = 0; - - // Check for type promotion - TORCH_CHECK(canCast(result_type(inputs), out.scalar_type()), "torch.cat(): input types ", - " can't be cast to the desired output type ", - out.scalar_type()); - - // Inputs cannot alias the output tensor - for (int i = 0; i < inputs.size(); i++) { - auto lap = at::get_overlap_status(out, inputs[i]); - TORCH_CHECK(lap != at::MemOverlapStatus::PARTIAL && - lap != at::MemOverlapStatus::FULL, - "torch.cat(): unsupported operation: the input tensors cannot refer to any " - "of the output memory locations. Found overlap in input " - "tensor ", i); - } - at::assert_no_internal_overlap(out); - - for (int i = 0; i < inputs.size(); i++) { - if (should_skip(inputs[i])) { - continue; - } - nDims = inputs[i].dim(); - notSkippedTensor = &inputs[i]; - break; +TORCH_IMPL_FUNC(cat_out_cuda) +(ITensorListRef tensors, + int64_t dim, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& result) { + if (result.numel() == 0) { + return; } - // If all inputs are empty tensors, return an empty tensor - if (notSkippedTensor == NULL) { - return out; - } - - TORCH_CHECK(inputs.size() > 0, "torch.cat(): invalid number of inputs ", inputs.size()); - TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension); - - for (const Tensor& t: inputs) { - TORCH_CHECK(t.device() == notSkippedTensor->device(), - "torch.cat(): all input tensors must be on the same device. 
Received ", - t.device(), " and ", notSkippedTensor->device()); - } - - TORCH_CHECK( - out.device() == notSkippedTensor->device(), - "torch.cat(): all input tensors and out must be on the same device, but inputs are on ", - notSkippedTensor->device(), " and out is on ", out.device()); - - c10::MemoryFormat memory_format = compute_output_memory_format(inputs); - - std::vector size(notSkippedTensor->sizes().vec()); - - // Compute size of the result in the cat dimension - int64_t cat_dim_size = 0; - for (int i = 0; i < inputs.size(); i++) { - const Tensor &tensor = inputs[i]; - if (should_skip(tensor)) { - continue; - } - check_cat_shape_except_dim(*notSkippedTensor, tensor, dimension, i); - cat_dim_size += at::native::size(tensor, dimension); - } - - // Compute the size of the result - size[dimension] = cat_dim_size; - - // skip resizing if size of result is same as expected - // raise a warning while resizing if output has one or more elements - // See https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 - // for understanding why at::native::resize_output is not called directly. - // if (at::native::resize_output_check(out, size)) { - // TODO: restore the above, see https://github.com/pytorch/pytorch/issues/64709 - - if (out.sizes() != size) { - out.resize_(size, memory_format); - } - - if (out.numel() == 0) { - return out; - } + auto materialized = tensors.materialize(); // We parallelize the copy if all 6 conditions pass: // @@ -531,76 +274,51 @@ Tensor& cat_out_cuda(TensorList inputs, int64_t dimension, Tensor& out) { // 4. All input tensors are contiguous (output tensor may be non-contig) // 5. All input tensors can use 32-bit indexing - const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(), + const bool all32BitIndexable = std::all_of(materialized.begin(), materialized.end(), [] (const Tensor& t) { return at::cuda::detail::canUse32BitIndexMath(t); }); - const bool allContiguous = std::all_of(inputs.begin(), inputs.end(), - [=](const Tensor& t) { - return !t.defined() || t.is_contiguous(memory_format); - }); - ScalarType firstType = inputs[0].scalar_type(); - bool allSameType = std::all_of(inputs.begin(), inputs.end(), - [firstType](const Tensor& t) { - return t.scalar_type() == firstType; - }); - allSameType = allSameType && (out.scalar_type() == firstType); -#if defined(USE_ROCM) - if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && - allContiguous && - all32BitIndexable && - allSameType) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - hip_parallel_cat(out, inputs, dimension, nDims, memory_format); - }); -#else + int nDims = materialized[valid].get().dim(); + // We support the contiguous inputs and non-contiguous input (<=4 dims) in different ways // For contiguous input, we don't need to pass stride meta data to cuda kernel through constant // memory. Therefore, we could pass more inputs to cuda threads. // For non-contiguous, we reduce the number of inputs passed to cuda kernel due to the limitation // of constant memory. 
- if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && - allContiguous && + if (materialized.size() > 1 && + result.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + at::cuda::detail::canUse32BitIndexMath(result) && + all_contiguous && all32BitIndexable && - allSameType) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - parallel_cat(out, inputs, dimension, nDims, memory_format); + all_same_dtype) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, + result.scalar_type(), "cat_cuda", [&]() { + parallel_cat(result, materialized, dim, nDims, memory_format); }); - } else if (inputs.size() > 1 && - out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && - at::cuda::detail::canUse32BitIndexMath(out) && + } else if (materialized.size() > 1 && + result.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + at::cuda::detail::canUse32BitIndexMath(result) && nDims <= CAT_ARRAY_MAX_INPUT_DIMS && all32BitIndexable && - allSameType && + all_same_dtype && memory_format == c10::MemoryFormat::Contiguous) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - out.scalar_type(), "cat_cuda", [&]() { - parallel_cat(out, inputs, dimension, nDims, memory_format); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, kHalf, kBool, kBFloat16, + result.scalar_type(), "cat_cuda", [&]() { + parallel_cat(result, materialized, dim, nDims, memory_format); }); -#endif } else { int64_t offset = 0; - for (int j = 0; j < inputs.size(); j++) - { - if (should_skip(inputs[j])) continue; - int64_t dimSize = at::native::size(inputs[j], dimension); - Tensor nt = at::narrow(out, dimension, offset, dimSize); - copy_(nt, inputs[j]); + for (const Tensor& t : materialized) { + if (cat_should_skip_tensor(t)) continue; + int64_t dimSize = t.size(dim); + Tensor nt = at::narrow(result, dim, offset, dimSize); + copy_(nt, t); offset += dimSize; } } - - return out; } } // namespace native diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index 181fbb994c3f..b16dad4b9156 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -1,7 +1,9 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include -#include +#include #include #include @@ -13,6 +15,19 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { @@ -153,7 +168,7 @@ inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { while (block_size < (max_block_size)) block_size *= 2; // Launch at least a single warp - the kernel assumes that. 
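// Illustrative sketch (not part of this patch): replacing the compile-time C10_WARP_SIZE with
// at::cuda::warp_size() matters on ROCm, where the wavefront width is 64 rather than 32 and is
// only reliably known at run time. Querying it through the plain runtime API, with hypothetical
// helpers (runtime_warp_size, block_size_for; device 0 as the default):

#include <cuda_runtime.h>
#include <algorithm>

int runtime_warp_size(int device = 0) {
  int warp = 32;                                        // sensible fallback
  cudaDeviceGetAttribute(&warp, cudaDevAttrWarpSize, device);
  return warp;
}

// e.g. round a requested block size up to at least one full warp, as the softmax code does:
int block_size_for(int wanted, int device = 0) {
  return std::max(wanted, runtime_warp_size(device));
}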
- block_size = std::max(block_size, static_cast(C10_WARP_SIZE)); + block_size = std::max(block_size, static_cast(at::cuda::warp_size())); return dim3(block_size); } @@ -816,7 +831,7 @@ void host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t d int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { - dispatch_softmax_backward( + dispatch_softmax_backward( gI_ptr, grad_ptr, output_ptr, dim_size, dim_size, std::min(remaining, chunk_size)); gI_ptr += chunk_size * dim_size; grad_ptr += chunk_size * dim_size; @@ -840,7 +855,7 @@ void host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t d int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { - dispatch_softmax_backward( + dispatch_softmax_backward( gI_ptr, grad_ptr, output_ptr, dim_size, dim_size, std::min(remaining, chunk_size)); gI_ptr += chunk_size * dim_size; grad_ptr += chunk_size * dim_size; @@ -914,7 +929,7 @@ TORCH_IMPL_FUNC(log_softmax_backward_cuda_out) ( input_dtype == ScalarType::Half), "expected input and grad types to match, or input to be at::Half and grad to be at::Float"); } - host_softmax_backward(grad, output, dim, half_to_float, grad_input); + host_softmax_backward(grad, output, dim, half_to_float, grad_input); } TORCH_IMPL_FUNC(softmax_cuda_out) ( @@ -939,34 +954,52 @@ TORCH_IMPL_FUNC(softmax_backward_cuda_out) "expected input and grad types to match, or input to be at::Half and grad to be at::Float"); } Tensor tmp = grad * output; - host_softmax_backward(tmp, output, dim, half_to_float, grad_input); + host_softmax_backward(tmp, output, dim, half_to_float, grad_input); } -Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) { - TORCH_CHECK(mask.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); - bool is_transformer_mask = (input.dim() == 4 && mask.dim() == 2 && input.size(0) == mask.size(0) && input.size(2) == mask.size(1) && input.size(3) == mask.size(1)); - TORCH_CHECK(mask.sizes() == input.sizes() || is_transformer_mask, "Mask shape should match input"); - // Always do masked softmax on last dim - int softmax_elements = input.size(input.dim() - 1); - // Persistent softmax only support softmax_elements <= 1024, - // Therefore once softmax_elements > 1024, we need to fallback to vanilla masked_softmax - Tensor output = at::empty_like(input, input.options()); - // Fallback to a slower masked softmax solution - if (softmax_elements > 1024 || softmax_elements * input.element_size() > 4096 || !mask.is_contiguous()) { - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - input.scalar_type(), - "masked_softmax", - [&] { - Tensor mask_not = mask.logical_not(); - output = at::softmax(input.masked_fill(mask_not, -std::numeric_limits::infinity()), -1); - }); - return output; - } - int batch_count = input.numel() / softmax_elements; - int chunk_size = input.numel() / input.size(0); - if (is_transformer_mask) { +Tensor masked_softmax_cuda(const Tensor& input_, const Tensor& mask_, const c10::optional dim_) { + Tensor output = at::empty_like(input_, input_.options()); + TORCH_CHECK(mask_.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + // If input is [B, H, T, T] and mask is [B, T] + // we have special fast kernel + bool is_BxT_mask = (input_.dim() == 4 && mask_.dim() == 2 && input_.size(0) == mask_.size(0) && input_.size(2) == mask_.size(1) && input_.size(3) == mask_.size(1)); + + // If input is [B, H, T, T] and mask is [T, 
T] + // expand mask to [B, H, T, T] and treat it like regular mask + // TODO We should have special fast kernel for TxT mask as well + bool is_TxT_mask = input_.dim() == 4 && mask_.dim() == 2 && input_.size(3) == mask_.size(1) && input_.size(2) == mask_.size(0) && mask_.size(0) == mask_.size(1); + TORCH_CHECK(mask_.sizes() == input_.sizes() || is_BxT_mask || is_TxT_mask, "Mask shape should match input"); + + auto input = input_.dim() == 0 ? input_.view(1) : input_; + auto mask = mask_.dim() == 0 ? mask_.view(1) : mask_; + if (is_TxT_mask) { + mask = mask.expand(input.sizes()); + } + int64_t dim = dim_.has_value() ? dim_.value() : input.dim() - 1; + + int softmax_elements = input.size(dim); + // Persistent softmax is only supported when all of the conditions are held: + // 1) softmax_elements <= 1024 + // 2) softmax_elements * input.element_size() <= 4096 + // 3) mask.is_contiguous() + // 4) dim == input.dim() - 1 + // Otherwise, we fallback to vanilla softmax (where we do not support transformer_mask since converting the mask is expensive) + if (softmax_elements > 1024 || softmax_elements * input.element_size() > 4096 || !mask.is_contiguous() || dim < input.dim()-1) { + TORCH_CHECK(mask.sizes() == input.sizes(), "Mask shape should match input shape; transformer_mask is not supported in the fallback case."); + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + input.scalar_type(), + "masked_softmax", + [&] { + output = at::softmax(input.masked_fill(mask, -std::numeric_limits::infinity()), dim); + }); + return output; + } + int batch_count = input.numel() / softmax_elements; + int chunk_size = input.numel() / input.size(0); + if (is_BxT_mask) { // Only support when num_heads is even in transformer TORCH_CHECK(input.size(1) % 2 == 0, "Only support when num_heads is even in transformer"); AT_DISPATCH_FLOATING_TYPES_AND2( @@ -988,7 +1021,7 @@ Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) { ); }); - } else { + } else { AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, @@ -1005,8 +1038,71 @@ Tensor masked_softmax_cuda(const Tensor& input, const Tensor& mask) { mask.data_ptr() ); }); - } - return output; + } + return output; +} + +Tensor masked_softmax_backward_cuda( + const Tensor& grad_, + const Tensor& output_, + const Tensor& mask_, + const c10::optional dim_) { + Tensor grad_input = at::empty_like(grad_, grad_.options()); + if (grad_.numel() == 0) { + return grad_input; + } + + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + auto mask = mask_.contiguous(); + int64_t dim = dim_.has_value() ? dim_.value() : output.dim() - 1; + + grad = grad.dim() == 0 ? grad.view(1) : grad; + mask = mask.dim() == 0 ? mask.view(1) : mask; + output = output.dim() == 0 ? 
output.view(1) : output; + + TORCH_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + TORCH_CHECK(grad.sizes() == mask.sizes(), "Mask shape should match grad shape"); + TORCH_CHECK(mask.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + int softmax_elements = output.size(dim); + int64_t batch_count = grad.numel() / softmax_elements; + + if (softmax_elements > 1024 || softmax_elements * grad.element_size() > 4096 || dim < grad.dim()-1) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "masked_softmax_backward", + [&] { + grad_input = at::_softmax_backward_data( + grad, + output.masked_fill(mask, 0), + dim, + grad.scalar_type() + ); + }); + } else { + grad = grad * output; + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "masked_softmax_backward", + [&] { + using accscalar_t = acc_type; + dispatch_softmax_backward( + grad_input.data_ptr(), // gI_ptr + grad.data_ptr(), // grad_ptr + output.data_ptr(), // output_ptr + softmax_elements, // softmax_elements + softmax_elements, // softmax_elements_stride + batch_count, // batch_count + mask.data_ptr() /* not masked */ + ); + }); + } + return grad_input; } } } diff --git a/aten/src/ATen/native/cuda/Sort.cpp b/aten/src/ATen/native/cuda/Sort.cpp index 8bb7d93bfdb5..efef65f9f2e1 100644 --- a/aten/src/ATen/native/cuda/Sort.cpp +++ b/aten/src/ATen/native/cuda/Sort.cpp @@ -1,11 +1,24 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include -#include -#include #include +#include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #include namespace at { namespace native { @@ -26,7 +39,7 @@ bool should_use_small_sort(const TensorBase &self, int64_t dim) { std::vector infer_dense_strides_dim_last(const Tensor & self, int64_t dim); -void fillSliceWithIndex(Tensor& t,int dim) { +void fillSliceWithIndex(const Tensor& t, int dim) { if (t.numel()) { auto sizes = DimVector(t.dim(), 1); sizes[dim] = t.sizes()[dim]; @@ -39,18 +52,28 @@ void fillSliceWithIndex(Tensor& t,int dim) { // We perform a segmented sort in cub with inputs that have // more than 1024/2048 elements along the selected dimension. // Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace). -std::tuple sort_out_stable_cuda(const Tensor & self, c10::optional stable, int64_t dim, bool descending, Tensor & values, Tensor & indices) { +void sort_cuda_kernel( + const TensorBase& self_base, + const TensorBase& values_base, + const TensorBase& indices_base, + int64_t dim, + bool descending, + bool stable) { // this algorithm is always stable - TORCH_INTERNAL_ASSERT(stable.has_value(), "sort_out(): c10::optional for stable has to have value."); - TensorArg self_arg{self, "self", 1}, values_arg{values, "values", 2}, indices_arg{indices, "indices", 3}; - checkAllSameGPU(__func__, {self_arg, values_arg, indices_arg}); - bool is_non_overlapping_and_dense = self.is_non_overlapping_and_dense(); - int64_t ndim = self.dim(); - dim = maybe_wrap_dim(dim, ndim); - int64_t nsort = self.sizes()[dim]; + // Macro for converting `TensorBase` -> `Tensor` without + // reference count bumps. +#define TOTENSOR(BASE, VAR) \ + OptionalTensorRef opt_##BASE(BASE); \ + const Tensor& VAR = *opt_##BASE; + + // Converting TensorBase into Tensor. + // We will need Tensor's methods from this point onwards. 
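// Illustrative sketch (not part of this patch): the fallback branch of
// masked_softmax_backward_cuda above is the usual softmax backward formula
//   gI[i] = y[i] * (dy[i] - sum_j dy[j] * y[j])
// evaluated with the masked positions of y zeroed out (output.masked_fill(mask, 0)).
// A single-row CPU reference with a hypothetical name (masked_softmax_backward_row):

#include <cstddef>
#include <vector>

std::vector<float> masked_softmax_backward_row(const std::vector<float>& dy,
                                               const std::vector<float>& y,
                                               const std::vector<bool>& mask) {
  std::vector<float> yz(y.size()), gI(y.size());
  for (size_t i = 0; i < y.size(); ++i) yz[i] = mask[i] ? 0.f : y[i];  // masked_fill(mask, 0)
  float dot = 0.f;
  for (size_t i = 0; i < y.size(); ++i) dot += dy[i] * yz[i];          // sum_j dy[j] * y[j]
  for (size_t i = 0; i < y.size(); ++i) gI[i] = yz[i] * (dy[i] - dot);
  return gI;
}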
+ TOTENSOR(self_base, self); + TOTENSOR(values_base, values); + TOTENSOR(indices_base, indices); - TORCH_CHECK(nsort <= std::numeric_limits::max(), + TORCH_CHECK(self.sizes()[dim] <= std::numeric_limits::max(), "The dimension being sorted can not have more than INT_MAX elements."); const auto self_dtype = self.dtype(); @@ -60,37 +83,9 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt TORCH_CHECK(self_dtype != ScalarType::ComplexFloat && self_dtype != ScalarType::ComplexDouble, "Sort currently does not support complex dtypes on CUDA."); - if (ndim == 0) { - if (!values.defined()) { - values = self.clone(); - } else { - values.resize_as_(self); - values.copy_(self); - } - if (!indices.defined()) { - indices = at::zeros({}, self.options().dtype(kLong)); - } else { - indices.resize_as_(self); - indices.zero_(); - } - return std::forward_as_tuple(values, indices); - } - // use inplace algorithm for smaller input sizes without stable=True - if (should_use_small_sort(self, dim) && !stable.value()) { + if (should_use_small_sort(self, dim) && !stable) { // from thc: sorted->values, indices->indices, input->self - - if (!values.defined()) { - values = at::empty_like(self); - } - if (!indices.defined()) { - indices = at::empty_like(self, self.options().dtype(kLong)); - } - - // Make sure sufficient output space is allocated - auto self_size = self.sizes(); - at::native::resize_output(values, self_size); - at::native::resize_output(indices, self_size); fillSliceWithIndex(indices, dim); // We sort k/v pairs in-place; copy unsorted input to output @@ -99,12 +94,12 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt // Sort using our in-place k/v kernel that supports arbitrary // layout sortKeyValueInplace(values, indices, dim, descending); - return std::forward_as_tuple(values, indices); + return; } Tensor self_; bool newself = false; - if (is_non_overlapping_and_dense && self.stride(dim) == 1) { + if (self.is_non_overlapping_and_dense() && self.stride(dim) == 1) { self_ = self; } else { auto new_strides_unsort = infer_dense_strides_dim_last(self, dim); @@ -114,19 +109,6 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt } c10::MaybeOwned values_tmp, indices_tmp; - if (!values.defined()) { - if (is_non_overlapping_and_dense) { - values = at::empty_strided(self.sizes(), self.strides(), self.options()); - } else { - auto strides = at::infer_dense_strides(self.sizes(), self.strides()); - values = at::empty_strided(self.sizes(), strides, self.options()); - } - } else { - TORCH_CHECK(self_.scalar_type() == values.scalar_type(), - "Unexpected dtype for values, expect ", self_.scalar_type(), ", got ", values.scalar_type()); - values.resize_as_(self); - } - if (values.strides() == self_.strides() && (newself || get_overlap_status(self, values) == MemOverlapStatus::NO)) { values_tmp = c10::MaybeOwned::borrowed(values); } else { @@ -134,18 +116,6 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt at::empty_strided(self_.sizes(), self_.strides(), self_.options())); } - if (!indices.defined()) { - if (is_non_overlapping_and_dense) { - indices = at::empty_strided(self.sizes(), self.strides(), self.options().dtype(kLong)); - } else { - auto strides = at::infer_dense_strides(self.sizes(), self.strides()); - indices = at::empty_strided(self.sizes(), strides, self.options().dtype(kLong)); - } - } else { - TORCH_CHECK(kLong == indices.scalar_type(), - "Unexpected dtype for values, expect torch.long, got ", indices.scalar_type()); - indices.resize_as_(self); - } if 
(indices.strides() != self_.strides()) { indices_tmp = c10::MaybeOwned::owned( at::empty_strided(self_.sizes(), self_.strides(), self_.options().dtype(kLong))); @@ -161,20 +131,11 @@ std::tuple sort_out_stable_cuda(const Tensor & self, c10::opt if (!indices_tmp->is_same(indices)) { indices.copy_(*indices_tmp); } - return std::forward_as_tuple(values, indices); } -std::tuple sort_out_cuda(const Tensor & self, int64_t dim, bool descending, Tensor & values, Tensor & indices) { - return sort_out_stable_cuda(self, /*stable=*/false, dim, descending, values, indices); -} - -std::tuple sort_stable_cuda(const Tensor & self, c10::optional stable, int64_t dim, bool descending) { - Tensor values, indices; - return sort_out_stable_cuda(self, stable, dim, descending, values, indices); -} - -std::tuple sort_cuda(const Tensor & self, int64_t dim, bool descending) { - return sort_stable_cuda(self, /*stable=*/false, dim, descending); -} +// TODO: we should handle this accordingly when we start using REGISTER_HIP_DISPATCH, +// since REGISTER_DISPATCH won't work in this cpp file. +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +REGISTER_CUDA_DISPATCH(sort_stub, &sort_cuda_kernel); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu index 15c89f7b76e2..5c08ddf59782 100644 --- a/aten/src/ATen/native/cuda/Sort.cu +++ b/aten/src/ATen/native/cuda/Sort.cu @@ -11,6 +11,7 @@ #include #include +#include namespace at { namespace native { @@ -231,6 +232,7 @@ __global__ void sort_postprocess_kernel(const scalar_t *in, scalar_t *out, int64 } +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) __global__ void fill_index_and_segment_kernel( int2 *data, int numel, at::cuda::detail::IntDivider nsort_divider) { CUDA_KERNEL_LOOP(idx, numel) { @@ -241,6 +243,7 @@ __global__ void fill_index_and_segment_kernel( } } +C10_LAUNCH_BOUNDS_1(at::cuda::detail::CUDA_NUM_THREADS) __global__ void fill_reverse_indices_kernel( int64_t *data, int numel, at::cuda::detail::IntDivider nsort_divider) { CUDA_KERNEL_LOOP(idx, numel) { @@ -248,6 +251,31 @@ __global__ void fill_reverse_indices_kernel( } } +template +inline void segmented_sort_large_segments( + const int64_t nsegments, const int64_t nsort, const int64_t n, const bool descending, + const scalar_t * self_ptr, scalar_t * values_ptr, int64_t * indices_ptr + ) { + using namespace at::cuda::detail; + auto allocator = at::cuda::getCUDADeviceAllocator(); + auto stream = at::cuda::getCurrentCUDAStream(); + dim3 block = CUDA_NUM_THREADS; + dim3 grid = GET_BLOCKS(nsort); + c10::DeviceArray indices(*allocator, nsort); + at::cuda::detail::IntDivider nsort_divider(nsort); + fill_reverse_indices_kernel<<>>( + indices.get(), nsort, nsort_divider); + const int64_t *initial_indices = indices.get(); + + for (auto i: c10::irange(nsegments)){ + at::cuda::cub::radix_sort_pairs( + self_ptr, values_ptr, initial_indices, indices_ptr, + nsort, descending); + indices_ptr += nsort; + self_ptr += nsort; + values_ptr += nsort; + } +} template inline void segmented_sort_pairs_by_full_sort( @@ -325,14 +353,14 @@ void launch_stable_sort_kernel( TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort); int64_t *indices_ptr = indices.data_ptr(); -#if defined(USE_ROCM) - constexpr bool is_rocm = true; +#if (defined(USE_ROCM) && ROCM_VERSION < 40500) + constexpr bool is_rocm_bf16_sort_unsupported = true; #else - constexpr bool is_rocm = false; + constexpr bool is_rocm_bf16_sort_unsupported = false; #endif 
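// Illustrative sketch (not part of this patch): segmented_sort_large_segments above runs one
// full radix sort per segment, on the theory that a single huge segment already occupies the
// whole GPU. The at::cuda::cub::radix_sort_pairs call is essentially a wrapper over
// cub::DeviceRadixSort; a standalone equivalent for one float-key / int64 index segment, with
// a hypothetical name (sort_one_segment) and simplified workspace handling:

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>

void sort_one_segment(const float* keys_in, float* keys_out,
                      const int64_t* vals_in, int64_t* vals_out,
                      int num_items, cudaStream_t stream) {
  void* tmp = nullptr;
  size_t tmp_bytes = 0;
  // First call with tmp == nullptr only queries the required workspace size.
  cub::DeviceRadixSort::SortPairs(tmp, tmp_bytes, keys_in, keys_out,
                                  vals_in, vals_out, num_items,
                                  /*begin_bit=*/0, /*end_bit=*/int(sizeof(float) * 8), stream);
  cudaMalloc(&tmp, tmp_bytes);
  cub::DeviceRadixSort::SortPairs(tmp, tmp_bytes, keys_in, keys_out,
                                  vals_in, vals_out, num_items,
                                  /*begin_bit=*/0, /*end_bit=*/int(sizeof(float) * 8), stream);
  cudaFree(tmp);
  // Descending order goes through cub::DeviceRadixSort::SortPairsDescending instead.
}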
AT_DISPATCH_ALL_TYPES_AND3(kBool, kHalf, kBFloat16, self.scalar_type(), "sort", [&]{ - c10::guts::if_constexpr::value)>([&](auto _){ + c10::guts::if_constexpr::value)>([&](auto _){ const scalar_t *self_ptr = self.data_ptr(); scalar_t *values_ptr = values.data_ptr(); int64_t remaining = _(numel); @@ -340,7 +368,11 @@ void launch_stable_sort_kernel( int64_t n = std::min(remaining, nbatch); int64_t nsegments = n / nsort; - if (nsegments < 128) { + if (nsegments == 1 || nsort >= 1000000) { //rough heuristics where even a single sort occupies GPU + segmented_sort_large_segments( + nsegments, nsort, n, descending, + self_ptr, values_ptr, indices_ptr); + } else if (nsegments < 128) { segmented_sort_pairs_by_full_sort(nsegments, nsort, n, descending, self_ptr, values_ptr, indices_ptr); } else { @@ -353,7 +385,7 @@ void launch_stable_sort_kernel( values_ptr += n; indices_ptr += n; } - }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm"); }); + }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm < 4.5"); }); }); } diff --git a/aten/src/ATen/native/cuda/SortImpl.cu b/aten/src/ATen/native/cuda/SortImpl.cu index a806c4a13874..c6e29262046e 100644 --- a/aten/src/ATen/native/cuda/SortImpl.cu +++ b/aten/src/ATen/native/cuda/SortImpl.cu @@ -1,4 +1,6 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Sorting.cpp b/aten/src/ATen/native/cuda/Sorting.cpp index fc526497812d..97b8df55416e 100644 --- a/aten/src/ATen/native/cuda/Sorting.cpp +++ b/aten/src/ATen/native/cuda/Sorting.cpp @@ -1,12 +1,27 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include +#include +#include +#include +#include #include +#include #include #include + #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index d72788c1b97c..52fa2710596d 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -189,7 +190,7 @@ struct KthValueLauncher { } dim3 block(std::min( - round_up(slice_size, (int64_t)C10_WARP_SIZE), (int64_t)1024)); + round_up(slice_size, (int64_t)at::cuda::warp_size()), (int64_t)1024)); auto stream = at::cuda::getCurrentCUDAStream(); gatherKthValue<<>>( self_info, @@ -228,7 +229,7 @@ struct MedianLauncher { } dim3 block(std::min( - round_up(slice_size, (int64_t)C10_WARP_SIZE), (int64_t)1024)); + round_up(slice_size, (int64_t)at::cuda::warp_size()), (int64_t)1024)); auto stream = at::cuda::getCurrentCUDAStream(); gatherMedian<<>>( values_info, diff --git a/aten/src/ATen/native/cuda/SparseMM.cu b/aten/src/ATen/native/cuda/SparseMM.cu index 0cc3fe3806a0..922efa5f4fcb 100644 --- a/aten/src/ATen/native/cuda/SparseMM.cu +++ b/aten/src/ATen/native/cuda/SparseMM.cu @@ -1,7 +1,13 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { // sparse, sparse, sparse, dense, real, real -> sparse Tensor& _sspaddmm_out_only_sparse_cuda(const Tensor& self, diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 95fef7d09150..b418e8ffc8ab 100644 --- 
a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -1,19 +1,28 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include -#include -#include -#include -#include +#include +#include #include #include -#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + #include #include @@ -248,7 +257,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ out.resize_(batched_out_sizes, MemoryFormat::Contiguous); // Create the transform plan (either from cache or locally) - const auto value_type = c10::toValueType(input.scalar_type()); + const auto value_type = c10::toRealValueType(input.scalar_type()); auto fft_type = GetCuFFTTransformType(input.is_complex(), out.is_complex()); CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); @@ -445,7 +454,7 @@ Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = lastdim; - auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto output = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); if (use_optimized_cufft_path(dim)) { Tensor temp; diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 4a91f58e61ec..2f5c13006578 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -1,19 +1,11 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include #include -#include -#include #include #include #include -#include -#include #include -#include -#include -#include - #include #include @@ -21,8 +13,6 @@ namespace at { namespace native { -using namespace at::native::detail; - // Offset calculator for indexing in Hermitian mirrored order. 
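// Illustrative sketch (not part of this patch): the Hermitian mirrored-order offset calculator
// around this point maps linear index i to (n - i) % n in mirrored dims, which reflects the
// symmetry of the spectrum of a real signal, X[(n - i) % n] == conj(X[i]). Filling the mirrored
// half of a 1-D spectrum from the computed half looks like this on the CPU; the CUDA kernel
// applies the same per-element mapping in parallel (hypothetical name fill_conjugate_symmetry_1d):

#include <complex>
#include <vector>

void fill_conjugate_symmetry_1d(std::vector<std::complex<float>>& X) {
  const size_t n = X.size();
  // Elements 0..n/2 are assumed already filled by an R2C transform;
  // the remaining entries are their mirrored conjugates.
  for (size_t i = n / 2 + 1; i < n; ++i) {
    X[i] = std::conj(X[(n - i) % n]);
  }
}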
// In mirrored dims, maps linear index i to (n - i) % n template @@ -116,17 +106,17 @@ void _fft_fill_with_conjugate_symmetry_cuda_( signal_half_sizes, out_strides, mirror_dims, element_size); const auto numel = c10::multiply_integers(signal_half_sizes); - AT_DISPATCH_COMPLEX_TYPES(dtype, "_fft_fill_with_conjugate_symmetry", [&] { - using namespace cuda::detail; - _fft_conjugate_copy_kernel<<< - GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( - numel, - static_cast(out_data), - static_cast(in_data), - input_offset_calculator, - output_offset_calculator); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "_fft_fill_with_conjugate_symmetry", [&] { + using namespace cuda::detail; + _fft_conjugate_copy_kernel<<< + GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( + numel, + static_cast(out_data), + static_cast(in_data), + input_offset_calculator, + output_offset_calculator); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); } REGISTER_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cuda_); diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index 958ad88183e8..5476682d7c4d 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -1,10 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include +#include +#include #include #include -#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace cuda { #define THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM 100 @@ -19,16 +32,22 @@ namespace cuda { */ enum class CUDAHistogramMemoryType { SHARED, MULTI_BLOCK, GLOBAL }; namespace { - template - __device__ static IndexType getBin(input_t bVal, input_t minvalue, input_t maxvalue, int64_t nbins) { - IndexType bin = (int)((bVal - minvalue) * nbins / (maxvalue - minvalue)); - // (only applicable for histc) - // while each bin is inclusive at the lower end and exclusive at the higher, i.e. [start, end) - // the last bin is inclusive at both, i.e. [start, end], in order to include maxvalue if exists - // therefore when bin == nbins, adjust bin to the last bin - if (bin == nbins) bin -= 1; - return bin; - } +template +__device__ static IndexType getBin( + input_t bVal, + at::acc_type minvalue, + at::acc_type maxvalue, + int64_t nbins) { + IndexType bin = (int)(((bVal - minvalue)) * nbins / (maxvalue - minvalue)); + // (only applicable for histc) + // while each bin is inclusive at the lower end and exclusive at the higher, + // i.e. [start, end) the last bin is inclusive at both, i.e. 
[start, end], in + // order to include maxvalue if exists therefore when bin == nbins, adjust bin + // to the last bin + if (bin == nbins) + bin -= 1; + return bin; +} } /* @@ -49,8 +68,8 @@ __global__ void kernelHistogram1D( detail::TensorInfo p, /* partial output */ detail::TensorInfo b, /* input */ int64_t nbins, - input_t minvalue, - input_t maxvalue, + at::acc_type minvalue, + at::acc_type maxvalue, IndexType totalElements, Op getOp) { extern __shared__ unsigned char my_smem[]; @@ -72,7 +91,8 @@ __global__ void kernelHistogram1D( const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `smem` - const IndexType bin = getBin(bVal, minvalue, maxvalue, nbins); + const IndexType bin = + getBin(bVal, minvalue, maxvalue, nbins); gpuAtomicAddNoReturn(&smem[bin], getOp(linearIndex)); } } @@ -98,7 +118,8 @@ __global__ void kernelHistogram1D( const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `p` - const IndexType bin = getBin(bVal, minvalue, maxvalue, nbins); + const IndexType bin = + getBin(bVal, minvalue, maxvalue, nbins); const IndexType pIdx = p.strides[0] * blockIdx.x + bin; const IndexType pOffset = detail::IndexToOffset::get(pIdx, p); @@ -129,7 +150,8 @@ __global__ void kernelHistogram1D( const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `a` - const IndexType bin = getBin(bVal, minvalue, maxvalue, nbins); + const IndexType bin = + getBin(bVal, minvalue, maxvalue, nbins); const IndexType aOffset = detail::IndexToOffset::get(bin, a); gpuAtomicAddNoReturn(&a.data[aOffset], getOp(linearIndex)); @@ -138,13 +160,23 @@ __global__ void kernelHistogram1D( } } -#define HANDLE_CASE(MEMORY_TYPE, WEIGHTS_OP, SHARED_MEM) \ - kernelHistogram1D \ - <<>>( \ - aInfo, pInfo, bInfo, nbins, minvalue, maxvalue, totalElements, WEIGHTS_OP); \ +#define HANDLE_CASE(MEMORY_TYPE, WEIGHTS_OP, SHARED_MEM) \ + kernelHistogram1D< \ + output_t, \ + input_t, \ + IndexType, \ + 1, \ + 2, \ + -1, \ + MEMORY_TYPE><<>>( \ + aInfo, \ + pInfo, \ + bInfo, \ + nbins, \ + minvalue, \ + maxvalue, \ + totalElements, \ + WEIGHTS_OP); \ C10_CUDA_KERNEL_LAUNCH_CHECK(); #define HANDLE_SWITCH_CASE(mType, getOp) \ @@ -193,8 +225,8 @@ bool CUDA_tensor_histogram( at::Tensor b, /* input */ at::Tensor c, /* weights(optional) */ int64_t nbins, - input_t minvalue, - input_t maxvalue, + at::acc_type minvalue, + at::acc_type maxvalue, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly, TensorArgType cType = TensorArgType::ReadOnly) { @@ -299,9 +331,14 @@ Tensor _bincount_cuda_template( AT_ERROR("input and weights should have the same length"); } - const int64_t nbins = std::max(*self.max().cpu().data_ptr() + (int64_t)1, minlength); - const input_t minvalue = 0; - const input_t maxvalue = nbins; + const int64_t nbins = + std::max(self.max().item() + (int64_t)1, minlength); + + // we are using acc_type for the bounds, in particular int64_t for integers + // in order to avoid overflows (e.g. 
using 256 bins for dtype uint8) + using bounds_t = at::acc_type; + const bounds_t minvalue = 0; + const bounds_t maxvalue = nbins; // alloc output counter on GPU Tensor output; if (has_weights) { @@ -311,7 +348,7 @@ Tensor _bincount_cuda_template( weights.options().layout_opt(), weights.options().device_opt(), weights.options().pinned_memory_opt()); - auto ret = cuda::CUDA_tensor_histogram( + cuda::CUDA_tensor_histogram( output, self, weights, nbins, minvalue, maxvalue); } else { output = native::zeros( @@ -320,7 +357,7 @@ Tensor _bincount_cuda_template( c10::nullopt /* layout */, DeviceType::CUDA, c10::nullopt /* pin_memory */); - auto ret = cuda::CUDA_tensor_histogram( + cuda::CUDA_tensor_histogram( output, self, weights, nbins, minvalue, maxvalue); } return output; @@ -331,8 +368,8 @@ template Tensor _histc_cuda_template( const Tensor& self, int64_t nbins, - input_t min, - input_t max) { + at::acc_type min, + at::acc_type max) { if (nbins <= 0) { AT_ERROR("bins must be > 0"); } @@ -374,8 +411,8 @@ Tensor _histc_cuda_template( #endif TORCH_CHECK(minvalue < maxvalue, "max must be larger than min"); - auto ret = cuda::CUDA_tensor_histogram( - output, self, Tensor(), nbins, minvalue, maxvalue); + cuda::CUDA_tensor_histogram( + output, self, Tensor(), nbins, minvalue, maxvalue); return output; } } // namespace @@ -412,7 +449,9 @@ Tensor _histc_cuda( // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("_histc_cuda"); return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "histc", [&] { - return _histc_cuda_template(self, nbins, min.to(), max.to()); + using bounds_t = at::acc_type; + return _histc_cuda_template( + self, nbins, min.to(), max.to()); }); } diff --git a/aten/src/ATen/native/cuda/TensorCompare.cpp b/aten/src/ATen/native/cuda/TensorCompare.cpp index 5d2c84fdaca5..b99df69f3b2a 100644 --- a/aten/src/ATen/native/cuda/TensorCompare.cpp +++ b/aten/src/ATen/native/cuda/TensorCompare.cpp @@ -1,4 +1,5 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/TensorCompare.cu b/aten/src/ATen/native/cuda/TensorCompare.cu index a786488cabef..f81c90c56517 100644 --- a/aten/src/ATen/native/cuda/TensorCompare.cu +++ b/aten/src/ATen/native/cuda/TensorCompare.cu @@ -39,12 +39,16 @@ void isneginf_kernel_impl(TensorIteratorBase &iter) { }); } -void clamp_kernel_impl(TensorIterator& iter) { +void clamp_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_cuda", [&] { gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t lower, scalar_t upper) -> scalar_t { // Propagate nan, which doesn't propagate automatically for ROCm if (at::_isnan(v)) { return v; + } if (at::_isnan(lower)) { + return lower; + } if (at::_isnan(upper)) { + return upper; } else { return ::min(::max(v, lower), upper); } @@ -82,50 +86,10 @@ void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min) { launch_clamp_scalar(iter, min, min, at::native::detail::ClampLimits::Min); } -void clamp_min_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_min_cuda", [&] { - if (iter.is_cpu_scalar(2)){ - Scalar min = iter.scalar_value(2); - iter.remove_operand(2); - clamp_min_scalar_kernel_impl(iter, min); - } else { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t lower) -> scalar_t { - // Propagate nan, which doesn't propagate automatically for ROCm - if (_isnan(v)) { - return v; - } else { - return 
::max(v, lower); - } - }); - } - }); -} - - void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max) { launch_clamp_scalar(iter, max, max, at::native::detail::ClampLimits::Max); } -void clamp_max_kernel_impl(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_max_cuda", [&] { - if (iter.is_cpu_scalar(2)){ - Scalar max = iter.scalar_value(2); - iter.remove_operand(2); - clamp_max_scalar_kernel_impl(iter, max); - } else { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t upper) -> scalar_t { - // Propagate nan, which doesn't propagate automatically for ROCm - if (_isnan(v)) { - return v; - } else { - return ::min(v, upper); - } - }); - } - }); -} - - } // anonymous namespace @@ -133,8 +97,6 @@ REGISTER_DISPATCH(where_kernel, &where_kernel_impl); REGISTER_DISPATCH(isposinf_stub, &isposinf_kernel_impl); REGISTER_DISPATCH(isneginf_stub, &isneginf_kernel_impl); REGISTER_DISPATCH(clamp_stub, &clamp_kernel_impl); -REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel_impl); -REGISTER_DISPATCH(clamp_max_stub, &clamp_max_kernel_impl); REGISTER_DISPATCH(clamp_scalar_stub, &clamp_scalar_kernel_impl); REGISTER_DISPATCH(clamp_min_scalar_stub, &clamp_min_scalar_kernel_impl); REGISTER_DISPATCH(clamp_max_scalar_stub, &clamp_max_scalar_kernel_impl); diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 29bd7adce5a0..f442c9c9f4e1 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -1,14 +1,29 @@ -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include +#include #include #include #include -#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #include #include #include diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cpp b/aten/src/ATen/native/cuda/TensorModeKernel.cpp index 73ae5f3199b9..c04693bb72e2 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cpp +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cpp @@ -1,5 +1,5 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include #include #include #include diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cu b/aten/src/ATen/native/cuda/TensorModeKernel.cu index 40a8e19eb445..ce76987e94e0 100644 --- a/aten/src/ATen/native/cuda/TensorModeKernel.cu +++ b/aten/src/ATen/native/cuda/TensorModeKernel.cu @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -142,7 +141,8 @@ void handle_fused_mode( int64_t slice_size, int64_t slices) { constexpr int num_threads = size / 2; - static_assert(num_threads % C10_WARP_SIZE == 0 && + int warp_size = at::cuda::warp_size(); + TORCH_INTERNAL_ASSERT(num_threads % warp_size == 0 && num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, ""); const auto memsize = (sizeof(scalar_t) * size) + (2 * size * sizeof(unsigned int)); @@ -191,15 +191,9 @@ void fused_mode( case 16: case 8: case 4: - case 2: { - if (ceilPowerOf2 > 2 * C10_WARP_SIZE) { - handle_fused_mode<128, scalar_t>( - grid, self, ti_values, ti_indices, slice_size, slices); - } else { - handle_fused_mode<2 * C10_WARP_SIZE, scalar_t>( - grid, self, ti_values, ti_indices, slice_size, slices); - } - } + case 2: + handle_fused_mode<128, scalar_t>( + grid, self, ti_values, ti_indices, slice_size, slices); break; case 1: default: diff --git a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp 
b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp index cc1c523dc1a3..0bb7eb410acf 100644 --- a/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp +++ b/aten/src/ATen/native/cuda/TensorShapeCUDA.cpp @@ -1,9 +1,15 @@ - -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -27,8 +33,8 @@ Tensor& set_storage_cuda_(Tensor& result, Storage storage, int64_t storage_offse checkSetStorage(result, storage, storage_offset, size, stride); result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); - c10::optional stride_opt = stride.data() != nullptr ? - c10::optional(stride) : c10::nullopt; + at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ? + at::OptionalIntArrayRef(stride) : c10::nullopt; at::native::resize_impl_cuda_(result.unsafeGetTensorImpl(), size, stride_opt); return result; } diff --git a/aten/src/ATen/native/cuda/TensorTopK.cpp b/aten/src/ATen/native/cuda/TensorTopK.cpp index 392b3ce25ce2..66cda4f38023 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cpp +++ b/aten/src/ATen/native/cuda/TensorTopK.cpp @@ -1,12 +1,26 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include -#include + +#include +#include +#include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#endif + namespace at { namespace native { +// TODO: remove this when CUDA <11.6 is no longer supported void topk_out_with_sort( const Tensor& self, int64_t k, int64_t dim, bool largest, @@ -14,12 +28,15 @@ void topk_out_with_sort( const Tensor& indices ) { Tensor sorted_values, sorted_indices; - std::tie(sorted_values, sorted_indices) = at::native::sort_cuda(self, dim, largest); + std::tie(sorted_values, sorted_indices) = at::cuda::sort(self, /* stable= */false, dim, largest); values.copy_(sorted_values.narrow(dim, 0, k)); indices.copy_(sorted_indices.narrow(dim, 0, k)); } +// TODO: remove this when CUDA <11.6 is no longer supported +bool disable_sort_for_topk(); bool should_use_sort(const Tensor& self, int64_t dim) { + if (disable_sort_for_topk()) return false; // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632 if (self.dim() == 0) return false; if (self.dtype() == kBool) return false; // Bool is not support by topk @@ -71,7 +88,7 @@ TORCH_IMPL_FUNC(topk_out_cuda) Tensor sortedIndices = at::empty_like(indices); Tensor sortedValues = at::empty_like(values); - sort_out_cuda(values, dim, largest, sortedValues, sortedIndices); + at::cuda::sort_outf(values, /* stable= */ false, dim, largest, sortedValues, sortedIndices); indices.copy_(indices.gather(dim, sortedIndices)); values.copy_(sortedValues); } diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index 7980619a7864..a4763e2d6f0d 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -20,6 +21,12 @@ using namespace at::native; namespace at { namespace native { + +// TODO: remove this when CUDA <11.6 is no longer supported +bool disable_sort_for_topk() { + return CUB_SUPPORTS_SCAN_BY_KEY(); +} + namespace sbtopk { // single_block_topk template @@ -189,7 +196,8 @@ void launch( dim3 grid; TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk"); - dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, 
(int64_t)C10_WARP_SIZE) * (int64_t)C10_WARP_SIZE, (int64_t)1024)); + int warp_size = at::cuda::warp_size(); + dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); gatherTopK<<>>( input, inputSliceSize, @@ -208,6 +216,15 @@ void launch( namespace mbtopk { // multi_block_topk +// Assumptions: +// The number of elements can be larger than UINT32_MAX, but +// the number of total blocks can not be larger than UINT32_MAX. +// So we can not have more than UINT32_MAX slices. The actual limit +// for number of slices could be a few fold smaller than UINT32_MAX, +// because we could be using multiple blocks per slice. +// Further more, the size of each input slice is also assumped to be +// smaller than UINT32_MAX + constexpr int BLOCK_THREADS = 256; // Over what radix we are selecting values @@ -215,6 +232,8 @@ constexpr int RADIX_BITS = 8; constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS constexpr int RADIX_MASK = (RADIX_DIGITS - 1); static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS"); +constexpr int MIN_ITEMS_PER_THREAD = 4; +constexpr int MAX_ITEMS_PER_THREAD = 64; template __global__ void fill(T* x, T value, IndexType size) { @@ -230,42 +249,44 @@ template C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) __global__ void radixFindKthValues( at::cuda::detail::TensorInfo input, - IndexType slice_size, - IndexType* ks_to_find, // size: num_slices + uint32_t slice_size, + uint32_t* ks_to_find, // size: num_slices - IndexType num_slices, + uint32_t num_slices, IndexType withinSliceStride, int current_bit, int items_per_thread, - IndexType blocks_per_slice, + uint32_t blocks_per_slice, Bitwise desiredMask, // outputs uint32_t* semaphores, // size: num_slices Bitwise* desires, // size: num_slices - IndexType* counts, // size: num_slices * blocks_per_slice * radix_digits + short* counts, // size: num_slices * blocks_per_slice * radix_digits T* kthValues // size: num_slices, only write when current_bit reaches 0 ) { int items_per_block = items_per_thread * BLOCK_THREADS; int tidx = threadIdx.x; - IndexType block_idx = getLinearBlockId(); - IndexType slice_idx = block_idx / blocks_per_slice; - IndexType blk_idx_in_slice = block_idx % blocks_per_slice; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + uint32_t blk_idx_in_slice = block_idx % blocks_per_slice; if (slice_idx >= num_slices) { return; } Bitwise desired = desires[slice_idx]; - IndexType k_to_find = ks_to_find[slice_idx]; + uint32_t k_to_find = ks_to_find[slice_idx]; IndexType slice_start_index = at::cuda::detail::IndexToOffset::get(slice_idx, input); T* data = &input.data[slice_start_index]; - typedef cub::BlockScan BlockScan; + typedef cub::BlockScan BlockScan; + static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), + "blockwise counter too large"); union __align__(16) TempStorage { uint32_t digit_counters[RADIX_DIGITS]; - IndexType digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice + uint32_t digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice typename BlockScan::TempStorage scan_storage; }; __shared__ TempStorage temp_storage; @@ -299,18 +320,19 @@ __global__ void radixFindKthValues( // load digit counter to register, one digit per thread static_assert(RADIX_DIGITS <= BLOCK_THREADS, "this kernel requires RADIX_DIGITS <= BLOCK_THREADS"); - IndexType digit_count = 0; + 
uint32_t digit_count = 0; if (tidx < RADIX_DIGITS) { digit_count = temp_storage.digit_counters[tidx]; } + // We always write out counts regardless if blocks_per_slice == 1 because + // it will be used to compute offsets for `gatherTopK`. + if (tidx < RADIX_DIGITS) { + counts[block_idx * RADIX_DIGITS + tidx] = digit_count; + } // if blocks_per_slice == 1, there is no need to do cross-block reduction - // in this case counts saved at registers instead of global memory + // in this case we use counts saved at registers directly if (blocks_per_slice > 1) { - - if (tidx < RADIX_DIGITS) { - counts[block_idx * RADIX_DIGITS + tidx] = digit_count; - } __threadfence(); // make sure writes are globally visible __syncthreads(); // make sure all writes are finished before update semaphores } @@ -341,7 +363,7 @@ __global__ void radixFindKthValues( } // compute the block-wide inclusive prefix sum - IndexType digit_count_cumsum; + uint32_t digit_count_cumsum; BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum); __syncthreads(); // every thread also need the perfix_sum of it's left value for comparison, so save a copy in shared mem @@ -351,14 +373,14 @@ __global__ void radixFindKthValues( __syncthreads(); if (tidx < RADIX_DIGITS) { - IndexType digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1]; + uint32_t digit_count_cumsum_left = (tidx == 0) ? 0 : temp_storage.digit_count_cumsum[tidx - 1]; // if not the last pass: update desired and ks_to_find // if last pass: write out the kth value if (digit_count_cumsum_left < k_to_find && k_to_find <= digit_count_cumsum) { desired = at::cuda::Bitfield::setBitfield(desired, tidx, current_bit, RADIX_BITS); + desires[slice_idx] = desired; if (current_bit > 0) { - desires[slice_idx] = desired; ks_to_find[slice_idx] = k_to_find - digit_count_cumsum_left; } else { kthValues[slice_idx] = TopKTypeConfig::deconvert(desired); @@ -370,7 +392,199 @@ __global__ void radixFindKthValues( if (tidx == 0) { semaphores[slice_idx] = 0; } -}; +} + +#if CUB_SUPPORTS_SCAN_BY_KEY() +// Assumption: k can not be larger than UINT32_MAX +template +C10_LAUNCH_BOUNDS_1(RADIX_DIGITS) // one thread per digit +__global__ void computeBlockwiseWithinKCounts( + Bitwise* desires, // size: num_slices + short* counts, // size: num_slices * blocks_per_slice * radix_digits + uint32_t blocks_per_slice, + int current_bit, + bool largest, + // outputs: + uint32_t* withinKCounts // size: num_slices * blocks_per_slice == num_blocks +) { + // This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel. + int tidx = threadIdx.x; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + + Bitwise desired = doLdg(desires + slice_idx); + Bitwise desired_digit = at::cuda::Bitfield::getBitfield(desired, current_bit, RADIX_BITS); + + // if largest, then only threads that has tidx > desired_digit are active + // if !largest, then only threads that has tidx < desired_digit are active + // each active thread will read the count for its corresponding, and + // do warp reduction followed by shared memory reduction to get the total count + // non-active thread should not load, and non-active warp should not do reduction. 
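// Illustrative sketch (not part of this patch): the reduction described above (warp shuffles,
// one partial per warp stashed in shared memory, then a final reduction by the first warp) is
// the standard two-level block reduction. A self-contained CUDA version, assuming a warp size
// of 32 and blockDim.x a multiple of 32, with hypothetical names (block_sum, partials):

__device__ unsigned int block_sum(unsigned int val) {
  const int lane = threadIdx.x % 32;
  const int warp = threadIdx.x / 32;
  // Level 1: tree reduction inside each warp via shuffles.
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  __shared__ unsigned int partials[32];            // enough for blockDim.x <= 1024
  if (lane == 0) {
    partials[warp] = val;                          // lane 0 now holds its warp's total
  }
  __syncthreads();
  // Level 2: the first warp reduces the per-warp partials.
  const int num_warps = (blockDim.x + 31) / 32;
  val = (threadIdx.x < num_warps) ? partials[threadIdx.x] : 0;
  if (warp == 0) {
    for (int offset = 16; offset > 0; offset /= 2) {
      val += __shfl_down_sync(0xffffffff, val, offset);
    }
  }
  return val;                                      // only thread 0 holds the full block sum
}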
+ bool warp_is_active, thread_is_active; + int warp = tidx / C10_WARP_SIZE; + if (largest) { + int end_of_warp = warp * C10_WARP_SIZE + C10_WARP_SIZE - 1; + warp_is_active = end_of_warp > desired_digit; + thread_is_active = tidx > desired_digit; + } else { + int start_of_warp = warp * C10_WARP_SIZE; + warp_is_active = start_of_warp < desired_digit; + thread_is_active = tidx < desired_digit; + } + uint32_t count = 0; + if (warp_is_active) { + if (thread_is_active) { + count = doLdg(counts + block_idx * RADIX_DIGITS + tidx); + } + for (int offset = C10_WARP_SIZE / 2; offset > 0; offset /= 2) { + count += WARP_SHFL_DOWN(count, offset); + } + } + + constexpr int num_warps = RADIX_DIGITS / C10_WARP_SIZE; + __shared__ uint32_t warp_counts[num_warps]; + if (tidx % C10_WARP_SIZE == 0) { + warp_counts[warp] = count; + } + __syncthreads(); + static_assert(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE, + "Assuming only 1 warp is needed for final reduction"); + if (warp != 0) { + return; + } + count = 0; + if (tidx < num_warps) { + count = warp_counts[tidx]; + } + for (int offset = num_warps / 2; offset > 0; offset /= 2) { + count += WARP_SHFL_DOWN(count, offset); + } + if (tidx == 0) { + withinKCounts[block_idx] += count; + } +} + +// Assumption: slice_size can not be larger than UINT32_MAX +template +__global__ void computeBlockwiseKthCounts( + Bitwise* desires, // size: num_slices + short* counts, // size: num_slices * blocks_per_slice * radix_digits + uint32_t num_blocks, // the number of blocks used by `radixFindKthValues` kernel + uint32_t blocks_per_slice, + // outputs: + uint32_t* kthCounts // size: num_slices * blocks_per_slice == num_blocks +) { + CUDA_KERNEL_LOOP_TYPE(idx, num_blocks, uint32_t) { + uint32_t slice_idx = idx / blocks_per_slice; + Bitwise desired = doLdg(desires + slice_idx); + Bitwise desired_digit = at::cuda::Bitfield::getBitfield(desired, 0, RADIX_BITS); + kthCounts[idx] = doLdg(counts + idx * RADIX_DIGITS + desired_digit); + } +} + +template +C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) +__global__ void gatherTopK(at::cuda::detail::TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + bool largest, + + uint32_t numInputSlices, + IndexType inputWithinSliceStride, + + at::cuda::detail::TensorInfo topK, + IndexType topKWithinSliceStride, + + at::cuda::detail::TensorInfo indices, + IndexType indicesWithinSliceStride, + + uint32_t items_per_thread, + uint32_t blocks_per_slice, + + T *kthValues, + uint32_t* withinKCounts, + uint32_t* kthCounts) { + + uint32_t items_per_block = items_per_thread * BLOCK_THREADS; + uint32_t tidx = threadIdx.x; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + uint32_t blk_idx_in_slice = block_idx % blocks_per_slice; + + items_per_thread = (blk_idx_in_slice + 1 < blocks_per_slice) + ? 
items_per_thread + : at::ceil_div((int64_t)(inputSliceSize - blk_idx_in_slice * items_per_block), (int64_t)BLOCK_THREADS); + + // Find the start offset for our slice + IndexType sliceStartIndex = + at::cuda::detail::IndexToOffset::get(slice_idx, input); + IndexType topKSliceStartIndex = + at::cuda::detail::IndexToOffset::get(slice_idx, topK); + IndexType indicesSliceStartIndex = + at::cuda::detail::IndexToOffset::get(slice_idx, indices); + + T* inputSliceStart = &input.data[sliceStartIndex]; + T* topKSliceStart = &topK.data[topKSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + + // Find the k-th highest element in our input + T kthValue = kthValues[slice_idx]; + const auto kthValueConverted = at::native::TopKTypeConfig::convert(kthValue); + + // Find the start index in output tensor of this block + uint32_t startWithinK = 0; + if (blk_idx_in_slice > 0) { + startWithinK = withinKCounts[block_idx - 1]; + } + uint32_t startKth = withinKCounts[slice_idx * blocks_per_slice + blocks_per_slice - 1]; + if (blk_idx_in_slice > 0) { + startKth += kthCounts[block_idx - 1]; + } + + // Read input, select topk out and write + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + for (int i = 0; i < items_per_thread; ++i) { + // Find the start offset for this slice + IndexType idx = blk_idx_in_slice * items_per_block + i * BLOCK_THREADS + tidx; + T val; + int withinK = 0; + int kth = 0; + if (idx < inputSliceSize) { + val = doLdg(inputSliceStart + idx * inputWithinSliceStride); + const auto valConverted = at::native::TopKTypeConfig::convert(val); + withinK = (largest ? valConverted > kthValueConverted : valConverted < kthValueConverted); + kth = (valConverted == kthValueConverted); + } + + uint32_t withinKIndex; + uint32_t numWithinK; + BlockScan(temp_storage).ExclusiveSum(withinK, withinKIndex, numWithinK); + __syncthreads(); + if (withinK) { + uint32_t offset = withinKIndex + startWithinK; + topKSliceStart[offset * topKWithinSliceStride] = val; + indicesSliceStart[offset * indicesWithinSliceStride] = idx; + } + startWithinK += numWithinK; + + if (startKth < outputSliceSize) { + uint32_t kthIndex; + uint32_t numKth; + BlockScan(temp_storage).ExclusiveSum(kth, kthIndex, numKth); + __syncthreads(); + if (kth) { + uint32_t offset = kthIndex + startKth; + if (offset < outputSliceSize) { + topKSliceStart[offset * topKWithinSliceStride] = val; + indicesSliceStart[offset * indicesWithinSliceStride] = idx; + } + } + startKth += numKth; + } + } +} +#endif int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) { // occupancy of this kernel is limited by registers per threads @@ -391,10 +605,19 @@ int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) { #endif int blocks_per_mp = std::min(regs_per_mp / REGS_PER_BLOCK, max_blocks_per_mp); int64_t items_per_thread = at::ceil_div((int64_t)(slice_size * num_slices), (int64_t)(mpc * blocks_per_mp * BLOCK_THREADS)); - items_per_thread = std::max(4, std::min((int)items_per_thread, 64)); // clamp to (4, 64) + items_per_thread = std::max(MIN_ITEMS_PER_THREAD, std::min((int)items_per_thread, MAX_ITEMS_PER_THREAD)); // clamp to (4, 64) return items_per_thread; } +class BlockIdxToKey { + uint32_t blocks_per_slice; +public: + BlockIdxToKey(uint32_t blocks_per_slice): blocks_per_slice(blocks_per_slice) {} + __device__ __forceinline__ uint32_t operator()(uint32_t blk) const { + return blk / blocks_per_slice; + } +}; + template void launch( at::cuda::detail::TensorInfo input, 
@@ -402,7 +625,7 @@ void launch( IndexType outputSliceSize, // aka `k` bool largest, - IndexType numInputSlices, + uint32_t numInputSlices, IndexType inputWithinSliceStride, at::cuda::detail::TensorInfo topK, @@ -410,14 +633,15 @@ void launch( at::cuda::detail::TensorInfo indices, IndexType indicesWithinSliceStride) { + auto stream = c10::cuda::getCurrentCUDAStream(); // configure items_per_thread based on device architecture and input size int items_per_thread = get_items_per_thread(numInputSlices, inputSliceSize); int items_per_block = items_per_thread * BLOCK_THREADS; using Bitwise = typename TopKTypeConfig::RadixType; - int64_t blocks_per_slice = at::ceil_div((int64_t)inputSliceSize, (int64_t)items_per_block); - int64_t num_blocks = numInputSlices * blocks_per_slice; + uint32_t blocks_per_slice = at::ceil_div((int64_t)inputSliceSize, (int64_t)items_per_block); + uint32_t num_blocks = numInputSlices * blocks_per_slice; // temporary storage auto& allocator = *c10::cuda::CUDACachingAllocator::get(); @@ -428,20 +652,31 @@ void launch( TORCH_CHECK(blocks_per_slice <= std::numeric_limits::max(), "blocks_per_slice larger than uint32 maximum is not supported"); auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t)); uint32_t* semaphores = reinterpret_cast(semaphores_buffer.get()); - AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), c10::cuda::getCurrentCUDAStream())); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream)); - auto ks_to_find_buffer = allocator.allocate(numInputSlices * sizeof(IndexType)); - IndexType* ks_to_find = reinterpret_cast(ks_to_find_buffer.get()); - IndexType k_to_find = largest ? inputSliceSize - outputSliceSize + 1: outputSliceSize; - fill<<>>( + auto ks_to_find_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t)); + uint32_t* ks_to_find = reinterpret_cast(ks_to_find_buffer.get()); + uint32_t k_to_find = largest ? 
inputSliceSize - outputSliceSize + 1: outputSliceSize; + fill<<>>( ks_to_find, k_to_find, numInputSlices); C10_CUDA_KERNEL_LAUNCH_CHECK(); auto desired_buffer = allocator.allocate(numInputSlices * sizeof(Bitwise)); Bitwise* desired = reinterpret_cast(desired_buffer.get()); - auto counts_buffer = allocator.allocate(num_blocks * RADIX_DIGITS * sizeof(IndexType)); - IndexType* counts = reinterpret_cast(counts_buffer.get()); + auto counts_buffer = allocator.allocate(num_blocks * RADIX_DIGITS * sizeof(short)); + short* counts = reinterpret_cast(counts_buffer.get()); + static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), + "blockwise counter too large"); + +#if CUB_SUPPORTS_SCAN_BY_KEY() + auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); + uint32_t* withinKCounts = reinterpret_cast(withinKCounts_buffer.get()); + AT_CUDA_CHECK(cudaMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream)); + + auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); + uint32_t* kthCounts = reinterpret_cast(kthCounts_buffer.get()); +#endif Bitwise desiredMask = 0; dim3 grid; @@ -450,7 +685,7 @@ void launch( // iterate radix bits for multiple passes for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) { - radixFindKthValues<<>>( + radixFindKthValues<<>>( input, inputSliceSize, ks_to_find, @@ -465,15 +700,38 @@ void launch( counts, kthValues); C10_CUDA_KERNEL_LAUNCH_CHECK(); +#if CUB_SUPPORTS_SCAN_BY_KEY() + computeBlockwiseWithinKCounts<<>>( + desired, counts, blocks_per_slice, current_bit, largest, withinKCounts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +#endif desiredMask = at::cuda::Bitfield::setBitfield(desiredMask, RADIX_MASK, current_bit, RADIX_BITS); } +#if CUB_SUPPORTS_SCAN_BY_KEY() + computeBlockwiseKthCounts<<>>( + desired, counts, num_blocks, blocks_per_slice, kthCounts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Do a prefix scan of withinKCounts and kthCounts using slice_idx as keys to get the starting index of each block + using counting_iter_t = cub::CountingInputIterator; + using slice_idx_iter_t = cub::TransformInputIterator; + slice_idx_iter_t slice_idx_iter(counting_iter_t(0), BlockIdxToKey(blocks_per_slice)); + at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, withinKCounts, withinKCounts, num_blocks); + at::cuda::cub::inclusive_sum_by_key(slice_idx_iter, kthCounts, kthCounts, num_blocks); + // copy topk values to output tensor + gatherTopK<<>>( + input, inputSliceSize, outputSliceSize, largest, numInputSlices, inputWithinSliceStride, + topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread, + blocks_per_slice, kthValues, withinKCounts, kthCounts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +#else // Find topk values based on kth values { dim3 grid; TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk"); - dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)C10_WARP_SIZE) * (int64_t)C10_WARP_SIZE, (int64_t)1024)); - sbtopk::gatherTopK<<>>( + int warp_size = at::cuda::warp_size(); + dim3 block(std::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); + sbtopk::gatherTopK<<>>( input, inputSliceSize, outputSliceSize, @@ -487,15 +745,29 @@ void launch( kthValues); C10_CUDA_KERNEL_LAUNCH_CHECK(); } +#endif } } // namespace mbtopk bool should_use_multiblock(int64_t num_slices, int64_t slice_size) { + if (num_slices > std::numeric_limits::max() || + slice_size > 
std::numeric_limits::max()) return false; +#if CUB_SUPPORTS_SCAN_BY_KEY() + // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/74267 + return (num_slices <= 20 && slice_size >= 20000) || + (num_slices > 20 && num_slices <= 40 && slice_size >= 10000) || + (num_slices > 40 && num_slices <= 80 && slice_size >= 8000) || + (num_slices > 80 && num_slices < 200 && slice_size >= 5000) || + (num_slices >= 200 && num_slices < 800 && slice_size >= 3000) || + (num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) || + (num_slices > 4000 && slice_size >= 400); +#else // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/71081 return (num_slices <= 400 && slice_size >= 5000) || - (num_slices >= 400 && num_slices < 4000 && slice_size >= 1000) || + (num_slices > 400 && num_slices < 4000 && slice_size >= 1000) || (num_slices >= 4000 && slice_size >= 300); +#endif } void launch_gather_topk_kernel( diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index d46a5613df78..335d746294d0 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -1,11 +1,20 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include -#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + #include #include diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index 3a0f8fb1e4d1..2d7bf30309dc 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -1,15 +1,20 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include #include -#include #include #ifndef AT_PER_OPERATOR_HEADERS #include +#include #else #include +#include +#include +#include +#include #endif #include diff --git a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu index 07be6bb96556..0589c3ba4f0d 100644 --- a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -20,53 +21,41 @@ __host__ __device__ static inline scalar_t angle_wrapper(scalar_t v) { template __host__ __device__ static inline c10::complex angle_wrapper(c10::complex v) { - return std::arg(v); + return c10::complex{std::arg(v), 0}; } +const char angle_name[] = "angle_kernel"; void angle_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "angle_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return angle_wrapper(a); + auto dtype = iter.common_dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto angle_string = jiterator_stringify( + template + T angle_kernel(T v) { + return T{std::arg(v)}; + } + ); // angle string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "angle_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ angle_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, angle_string); }); - }); -} - -// We manually overload real because std::real does not work types other than c10::complex. 
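// The complex-dtype support added across the unary kernels below follows one
// recurring shape, condensed here with a placeholder identity op. The my_op_*
// names are hypothetical, and the sketch assumes the same ATen/native CUDA headers
// and at::native namespace as the surrounding files: stringify the device functor
// so it can be JIT-compiled, route it through jitted_gpu_kernel when the jiterator
// is available, and otherwise fall back to a plain gpu_kernel lambda.
const char my_op_name[] = "my_op_kernel";
void my_op_kernel_cuda(TensorIteratorBase& iter) {
  auto common_dtype = iter.common_dtype();
  if (at::isComplexType(common_dtype)) {
#if AT_USE_JITERATOR()
    static const auto my_op_string = jiterator_stringify(
        template <typename T> T my_op_kernel(T x) { return x; });
    AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "my_op_cuda", [&]() {
      jitted_gpu_kernel<
          /*name=*/my_op_name,
          /*return_dtype=*/scalar_t,
          /*common_dtype=*/scalar_t,
          /*arity=*/1>(iter, my_op_string);
    });
#else
    AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "my_op_cuda", [&]() {
      gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return a; });
    });
#endif
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16,
                                    common_dtype, "my_op_cuda", [&]() {
      gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return a; });
    });
  }
}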
-template -__host__ __device__ static inline scalar_t real_wrapper(scalar_t v) { - return v; -} - -template -__host__ __device__ static inline c10::complex real_wrapper(c10::complex v) { - return v.real(); -} - -void real_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "real_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return real_wrapper(a); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "angle_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return angle_wrapper(a); + }); }); - }); -} - -// We manually overload imag because std::imag does not work types other than c10::complex. -template -__host__ __device__ static inline scalar_t imag_wrapper(scalar_t v) { - return 0; -} - -template -__host__ __device__ static inline c10::complex imag_wrapper(c10::complex v) { - return v.imag(); -} - -void imag_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "imag_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return imag_wrapper(a); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES(dtype, "angle_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return angle_wrapper(a); + }); }); - }); + } } // We manually overload conj because std::conj does not work types other than c10::complex. @@ -81,18 +70,35 @@ __host__ __device__ static inline c10::complex conj_wrapper(c10::complex v } // NB: Ignores the negative bit on tensors +const char conj_name[] = "conj_kernel"; void conj_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + auto common_dtype = iter.common_dtype(); + if (common_dtype == kComplexHalf) { + using scalar_t = c10::complex; + #if AT_USE_JITERATOR() + static const auto conj_string = jiterator_stringify( + template + T conj_kernel(T z) { + return std::conj(z); + } + ); + jitted_gpu_kernel(iter, conj_string); + #else + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { + return conj_wrapper(a); + }); + #endif + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( kBool, kBFloat16, kHalf, iter.common_dtype(), "conj_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return conj_wrapper(a); }); - }); + }); + } } REGISTER_DISPATCH(angle_stub, &angle_kernel_cuda); -REGISTER_DISPATCH(real_stub, &real_kernel_cuda); -REGISTER_DISPATCH(imag_stub, &imag_kernel_cuda); REGISTER_DISPATCH(conj_physical_stub, &conj_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/UnaryLogKernels.cu b/aten/src/ATen/native/cuda/UnaryLogKernels.cu index 47f88383de42..c0187284b98b 100644 --- a/aten/src/ATen/native/cuda/UnaryLogKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryLogKernels.cu @@ -4,26 +4,73 @@ #include #include #include +#include +#include #include #include #include namespace at { namespace native { +const char log_name[] = "log_kernel"; void log_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::log(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto log_string = jiterator_stringify( + template T log_kernel(T x) { return std::log(x); }); + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "log_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/log_name, + 
/*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, log_string); }); - }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, iter.common_dtype(), "log_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return ::log(static_cast(a)); + }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::log(a); + }); + }); + } } +const char log10_name[] = "log10_kernel"; void log10_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log10_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::log10(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto log10_string = jiterator_stringify( + template T log10_kernel(T x) { return std::log10(x); }); + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "log10_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/log10_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, log10_string); }); - }); +#else + AT_DISPATCH_COMPLEX_TYPES(iter.common_dtype(), "log10_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log10(a); }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log10_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::log10(a); + }); + }); + } } void log1p_kernel_cuda(TensorIteratorBase& iter) { @@ -34,12 +81,33 @@ void log1p_kernel_cuda(TensorIteratorBase& iter) { }); } +const char log2_name[] = "log2_kernel"; void log2_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log2_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::log2(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto log2_string = jiterator_stringify( + template T log2_kernel(T x) { return std::log2(x); }); + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "log2_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/log2_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, log2_string); }); - }); +#else + AT_DISPATCH_COMPLEX_TYPES(iter.common_dtype(), "log2_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::log2(a); }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "log2_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::log2(a); + }); + }); + } } REGISTER_DISPATCH(log_stub, &log_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 671ce1d6cbcd..85c3fb7a1005 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,12 +34,38 @@ void bitwise_not_kernel_cuda(TensorIteratorBase& iter) { } } +const char exp_name[] = "exp_kernel"; void exp_kernel_cuda(TensorIteratorBase& iter) { - 
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "exp_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return std::exp(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto exp_string = jiterator_stringify( + template + T exp_kernel(T x) { + return std::exp(x); + }); // exp_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "exp_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/exp_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, exp_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "exp_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return std::exp(static_cast(a)); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, common_dtype, "exp_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::exp(a); + }); }); - }); + } } void expm1_kernel_cuda(TensorIteratorBase& iter) { @@ -53,19 +81,45 @@ void expm1_kernel_cuda(TensorIteratorBase& iter) { // We manually overload rsqrt because std::rsqrt does not work with complex types. template -__host__ __device__ static inline scalar_t rsqrt_wrapper(scalar_t v) { +C10_HOST_DEVICE static inline scalar_t rsqrt_wrapper(scalar_t v) { return ::rsqrt(v); } template -__host__ __device__ static inline c10::complex rsqrt_wrapper(c10::complex v) { +C10_HOST_DEVICE static inline c10::complex rsqrt_wrapper(c10::complex v) { const c10::complex one = c10::complex(1.0, 0); // std::sqrt for c10::complex is overloaded in c10/util/complex_math.h return one / ::sqrt(v); } +const char rsqrt_name[] = "rsqrt_kernel"; void rsqrt_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto rsqrt_string = jiterator_stringify( + template + T rsqrt_kernel(T x) { + const T one = T{1}; + return one / std::sqrt(x); + }); // rsqrt_string + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "rsqrt_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/rsqrt_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, rsqrt_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "rsqrt_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + // In CUDA, ::rsqrt is overloaded for float and at::Half here is implicitly cast to float. 
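// For the complex path, rsqrt is simply 1 / sqrt(z) (see rsqrt_wrapper and the
// rsqrt_string above). A tiny host-side sanity check of that identity, independent
// of the CUDA code: (1/sqrt(z))^2 * z should come back to 1.
#include <cassert>
#include <cmath>
#include <complex>

int main() {
  std::complex<double> z{3.0, 4.0};
  std::complex<double> r = std::complex<double>{1.0, 0.0} / std::sqrt(z);
  std::complex<double> p = r * r * z;
  assert(std::abs(p.real() - 1.0) < 1e-12);
  assert(std::abs(p.imag()) < 1e-12);
  return 0;
}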
+ return rsqrt_wrapper(a); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, ScalarType::Half, iter.common_dtype(), "rsqrt_cuda", [&]() { @@ -74,14 +128,40 @@ void rsqrt_kernel_cuda(TensorIteratorBase& iter) { return rsqrt_wrapper(a); }); }); + } } +const char sqrt_name[] = "sqrt_kernel"; void sqrt_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "sqrt_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return ::sqrt(a); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto sqrt_string = jiterator_stringify( + template + T sqrt_kernel(T x) { + return std::sqrt(x); + }); // sqrt_string + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "sqrt_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/sqrt_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, sqrt_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES(common_dtype, "sqrt_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::sqrt(a); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, common_dtype, "sqrt_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::sqrt(a); + }); }); - }); + } } void clamp_kernel_cuda(TensorIteratorBase& iter, const Scalar& min_value, const Scalar& max_value) { diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index b88dc6597bdd..170ae6566b75 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -1,12 +1,14 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include #include #include #include #include #include #include +#include #include @@ -23,12 +25,38 @@ void logical_not_kernel_cuda(TensorIteratorBase& iter) { } // NB: Ignores the negative bit on tensors +const char neg_name[] = "neg_kernel"; void neg_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "neg_cuda", [&]() { + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { +#if AT_USE_JITERATOR() + static const auto neg_string = jiterator_stringify( + template + T neg_kernel(T a) { + return -a; + } + ); // neg_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "neg_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ neg_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, neg_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "neg_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return -a; + }); + }); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, dtype, "neg_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return -a; }); }); + } } void sign_kernel_cuda(TensorIteratorBase& iter){ @@ -52,7 +80,7 @@ void signbit_kernel_cuda(TensorIteratorBase& iter){ } template -__host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) { +C10_HOST_DEVICE static inline c10::complex sgn_wrapper(c10::complex z) { if (z == c10::complex(0, 0)) { return c10::complex(0, 0); } else { @@ -60,13 +88,38 @@ __host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) } } +const char sgn_name[] = "sgn_kernel"; void 
sgn_kernel_cuda(TensorIteratorBase& iter){ - AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cuda", [&]() { + auto dtype = iter.dtype(); + #if AT_USE_JITERATOR() + static const auto sgn_string = jiterator_stringify( + template + T sgn_kernel(T z) { + const T zero = T(0); + if (z == zero) { + return zero; + } else { + return z / std::abs(z); + } + } + ); // sgn_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sgn_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/ sgn_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, sgn_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sgn_cuda", [&]() { + using opmath_t = at::opmath_type; gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return sgn_wrapper(a); + return sgn_wrapper(opmath_t{a}); }); }); + #endif } + REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel_cuda); REGISTER_DISPATCH(neg_stub, &neg_kernel_cuda); REGISTER_DISPATCH(sign_stub, &sign_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu index 71a355347022..0cb0d9f238cf 100644 --- a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu @@ -63,7 +63,7 @@ void i0_kernel_cuda(TensorIteratorBase& iter) { } // See note [Jiterator] -const char i0e_name[] = "i0e"; +const char i0e_name[] = "calc_i0e"; void i0e_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i0e_cuda", [&]() { @@ -120,12 +120,41 @@ void i1e_kernel_cuda(TensorIteratorBase& iter) { #endif } +const char sigmoid_name[] = "sigmoid"; void sigmoid_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "sigmoid_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return static_cast(1) / (static_cast(1) + std::exp(-a)); + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + // only jiterate for complex-dtype + #if AT_USE_JITERATOR() + static const auto sigmoid_string = jiterator_stringify( + template + T sigmoid(T x) { + return T{1} / (T{1} + std::exp(-x)); + } + ); // sigmoid_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sigmoid_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/sigmoid_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, sigmoid_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sigmoid_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + const auto one = opmath_t{1}; + return static_cast(one / (one + std::exp(-opmath_t{a}))); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, common_dtype, "sigmoid_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return scalar_t{1} / (scalar_t{1} + std::exp(-a)); + }); }); - }); + } } const char sinc_name[] = "sinc"; @@ -202,6 +231,23 @@ void ndtri_kernel_cuda(TensorIteratorBase& iter) { #endif } +const char log_ndtr_name[] = "log_ndtr"; +void log_ndtr_kernel_cuda(TensorIteratorBase& iter) { + #if AT_USE_JITERATOR() + AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cuda", [&]() { + jitted_gpu_kernel(iter, log_ndtr_string); + }); + #else + 
AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cuda", [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return calc_log_ndtr(a); }); + }); + #endif +} + void erf_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "erf_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { @@ -264,18 +310,38 @@ void erfcx_kernel_cuda(TensorIteratorBase& iter) { #endif } +const char kaiser_window_name[] = "kaiser_window"; void kaiser_window_kernel_cuda(TensorIteratorBase& iter, int64_t window_length, double beta_){ - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ - using opmath_t = at::opmath_type; - const opmath_t inv_alpha = static_cast(2.0 / (window_length - 1)); - const opmath_t beta = static_cast(beta_); - const opmath_t inv_i0_beta = 1.0 / calc_i0(beta); - gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t a) -> scalar_t { - opmath_t x = static_cast(a) * inv_alpha - 1; - opmath_t y = std::max(0, 1 - x * x); - return calc_i0(beta * ::sqrt(y)) * inv_i0_beta; + #if AT_USE_JITERATOR() + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ + using opmath_t = at::opmath_type; + const opmath_t inv_alpha = static_cast(2.0 / (window_length - 1)); + const opmath_t beta = static_cast(beta_); + const opmath_t inv_i0_beta = 1.0 / calc_i0(beta); + jitted_gpu_kernel< + /*name=*/kaiser_window_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>( + iter, + kaiser_window_string, + /*scalar_pos=*/at::cuda::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(inv_alpha, beta, inv_i0_beta)); }); - }); + #else + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ + using opmath_t = at::opmath_type; + const opmath_t inv_alpha = static_cast(2.0 / (window_length - 1)); + const opmath_t beta = static_cast(beta_); + const opmath_t inv_i0_beta = 1.0 / calc_i0(beta); + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t a) -> scalar_t { + opmath_t x = static_cast(a) * inv_alpha - 1; + opmath_t y = std::max(0, 1 - x * x); + return calc_i0(beta * ::sqrt(y)) * inv_i0_beta; + }); + }); + #endif } const char entr_name[] = "entr"; @@ -322,6 +388,7 @@ REGISTER_DISPATCH(erfinv_stub, &erfinv_kernel_cuda); REGISTER_DISPATCH(kaiser_window_stub, &kaiser_window_kernel_cuda); REGISTER_DISPATCH(special_entr_stub, &entr_kernel_cuda); REGISTER_DISPATCH(special_ndtri_stub, &ndtri_kernel_cuda); +REGISTER_DISPATCH(special_log_ndtr_stub, &log_ndtr_kernel_cuda); REGISTER_DISPATCH(special_erfcx_stub, &erfcx_kernel_cuda); } // namespace native diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu index 8b43900e9271..90f5238d0180 100644 --- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index d268ca1c4903..746bba7a66c5 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -1,8 +1,22 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include 
+#include +#include +#include +#endif + #include #include #include @@ -103,7 +117,7 @@ std::tuple unique_dim_cuda_template( TORCH_CHECK( num_zero_dims == 1, "Number of zero sized dimensions is more than one, so unique cannot be applied ") - Tensor output = at::empty({0}, self.options()); + Tensor output = at::empty(sizes, self.options()); Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong)); Tensor counts = at::empty({0}, self.options().dtype(kLong)); diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu index bda84bdda4e1..cc19b96a7797 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cu +++ b/aten/src/ATen/native/cuda/UniqueCub.cu @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -5,6 +6,13 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace internal { diff --git a/aten/src/ATen/native/cuda/UniqueCub.cuh b/aten/src/ATen/native/cuda/UniqueCub.cuh index 1bb96e3f5ebd..6e1cccc2e175 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cuh +++ b/aten/src/ATen/native/cuda/UniqueCub.cuh @@ -1,4 +1,4 @@ -#include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index b609b42a4d9e..09e460640df8 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -1,7 +1,12 @@ -#include -#include +#pragma once +#include #include +#include +#include +#include +#include + #include namespace at { @@ -11,7 +16,7 @@ namespace upsample { // TODO: Remove duplicate declaration. TORCH_API c10::SmallVector compute_output_size( c10::IntArrayRef input_size, // Full input tensor size. 
- c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors); } // namespace upsample diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 29dec1735f23..1214955b06d4 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -1,12 +1,21 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index d5153838139f..d76e2783207f 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -1,9 +1,10 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include +#include #include #include #include @@ -12,6 +13,20 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { @@ -456,7 +471,6 @@ C10_LAUNCH_BOUNDS_1(256) // 256 performs better then 1024 __global__ void upsample_gen2d_aa_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, - const bool align_corners, const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { @@ -550,7 +564,6 @@ C10_LAUNCH_BOUNDS_1(256) // 256 performs better then 1024 __global__ void upsample_gen2d_aa_backward_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, - const bool align_corners, PackedTensorAccessor64 idata, const PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { @@ -672,8 +685,6 @@ static void upsample_gen2d_aa_out_cuda_template( int output_height = output_size[0]; int output_width = output_size[1]; - int nbatch = input.size(0); - int channels = input.size(1); int input_height = input.size(2); int input_width = input.size(3); @@ -735,7 +746,7 @@ static void upsample_gen2d_aa_out_cuda_template( <<>>(height_scale, width_scale, align_corners, idata, odata, interp_filter); + stream>>>(height_scale, width_scale, idata, odata, interp_filter); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); @@ -766,8 +777,6 @@ static void upsample_gen2d_aa_backward_out_cuda_template( int output_height = output_size[0]; int output_width = output_size[1]; - int nbatch = input_size[0]; - int channels = input_size[1]; int input_height = input_size[2]; int input_width = input_size[3]; @@ -819,7 +828,7 @@ static void upsample_gen2d_aa_backward_out_cuda_template( <<>>(height_scale, width_scale, align_corners, idata, odata, interp_filter); + stream>>>(height_scale, width_scale, idata, odata, interp_filter); C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index c23887cb79a6..af9edca2280e 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -1,15 +1,24 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS 
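// The include changes repeated across these UpSample*/Unique*/WeightNorm files all
// follow the same recipe, sketched here for a hypothetical operator "foo" (the
// ATen/ops header names below are assumptions, since the per-operator includes are
// not spelled out in this excerpt): define TORCH_ASSERT_ONLY_METHOD_OPERATORS so
// the translation unit flags any accidental dependence on the aggregated operator
// headers in per-operator-header builds, then include either the monolithic
// Functions/NativeFunctions headers or the narrow per-operator ones depending on
// AT_PER_OPERATOR_HEADERS.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/foo.h>          // assumed per-operator header name
#include <ATen/ops/foo_native.h>   // assumed native-impl header name
#endif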
+#include #include #include -#include +#include #include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 52b7b1d70947..decdfca30d78 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -1,12 +1,23 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index 7b2a58c764bb..8aa4f68aeda6 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -1,7 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include -#include +#include #include #include #include @@ -10,6 +11,17 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index 3b12614c10d5..1a4afa012d78 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -1,11 +1,28 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include #include #include -#include +#include #include #include #include -#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif namespace at { namespace native { @@ -322,7 +339,7 @@ using at::native::upsample_cuda::get_scale_value; Tensor upsample_nearest3d_cuda( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -333,7 +350,7 @@ Tensor upsample_nearest3d_cuda( Tensor _upsample_nearest_exact3d_cuda( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -345,7 +362,7 @@ Tensor _upsample_nearest_exact3d_cuda( // when structured kernels can handle QuantizedCPU, update these overloads to be CompositeExplicitAutograd Tensor upsample_nearest3d_backward_cuda( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); @@ -357,7 +374,7 @@ Tensor upsample_nearest3d_backward_cuda( Tensor _upsample_nearest_exact3d_backward_cuda( const Tensor& grad_output, - c10::optional output_size, + at::OptionalIntArrayRef output_size, IntArrayRef input_size, c10::optional> scale_factors) { auto osize = compute_output_size(input_size, output_size, scale_factors); diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu 
b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index a3623d2eb0f8..b19bf4858ac6 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -1,9 +1,10 @@ // Adapted from interp.cpp from Caffe util by Pauline Luc // Originally developed by George Papandreou -#include -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include -#include +#include +#include #include #include #include @@ -12,6 +13,14 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index e9136ca61388..e25a1b40775d 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -1,11 +1,24 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include +#include #include #include #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + + namespace at { namespace native { namespace { @@ -413,7 +426,7 @@ std::tuple weight_norm_cuda return std::tuple{w, norms}; } -std::tuple weight_norm_cuda_backward +std::tuple weight_norm_backward_cuda (const Tensor & grad_w, const Tensor & saved_v, const Tensor & saved_g, diff --git a/aten/src/ATen/native/cuda/attention.cu b/aten/src/ATen/native/cuda/attention.cu deleted file mode 100644 index 8dad56fac0e6..000000000000 --- a/aten/src/ATen/native/cuda/attention.cu +++ /dev/null @@ -1,253 +0,0 @@ -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace at { - -namespace native { - -namespace { - -Tensor gemm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto b_ = b.transpose(1, 0); - auto c_ = at::native::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), b.size(0)}); -} - -template -__global__ void transform_bias_rescale_qkv_kernel( - // [B, T, 3 * D] - const PackedTensorAccessor64 qkv, - // [3 * D] - const PackedTensorAccessor64 qkv_bias, - // [3, B, NH, T, DH] - PackedTensorAccessor64 q_k_v) { - // warp per DH. - // so launch B * NH * T warps. - auto NH = q_k_v.size(2); - auto T = q_k_v.size(3); - auto DH = q_k_v.size(4); - - auto t = blockIdx.x % T; - auto b = blockIdx.x / T; - - auto D = NH * DH; - constexpr int VEC = 4; - const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(DH)); - using LoadT = memory::aligned_vector; - - // FIXME: assert ((D % VEC) == 0) - - for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { - auto d = d_v * VEC; - auto nh = d / DH; - auto dh = d % DH; - scalar_t qkv_bias_q[VEC]; - scalar_t qkv_bias_k[VEC]; - scalar_t qkv_bias_v[VEC]; - scalar_t qkv_q[VEC]; - scalar_t qkv_k[VEC]; - scalar_t qkv_v[VEC]; - - *reinterpret_cast(&qkv_bias_q) = - *reinterpret_cast(&qkv_bias[d + 0 * D]); - *reinterpret_cast(&qkv_bias_k) = - *reinterpret_cast(&qkv_bias[d + 1 * D]); - *reinterpret_cast(&qkv_bias_v) = - *reinterpret_cast(&qkv_bias[d + 2 * D]); - - *reinterpret_cast(&qkv_q) = - *reinterpret_cast(&qkv[b][t][d + 0 * D]); - *reinterpret_cast(&qkv_k) = - *reinterpret_cast(&qkv[b][t][d + 1 * D]); - *reinterpret_cast(&qkv_v) = - *reinterpret_cast(&qkv[b][t][d + 2 * D]); - -#pragma unroll - // TODO: specialize for float2half2/half2float2? 
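// The loop below (part of the attention.cu file being deleted) adds the biases and
// divides q by sqrt(dim_per_head) in one pass. For reference, scaling q before the
// q.k dot product is equivalent to scaling the attention logit afterwards; a toy
// host-side check of that identity with arbitrary values:
#include <cassert>
#include <cmath>

int main() {
  const double q[2] = {1.0, 2.0}, k[2] = {3.0, 4.0};
  const double s = std::sqrt(2.0);  // sqrt(dim_per_head) with dim_per_head == 2
  double scaled_then_dot = (q[0] / s) * k[0] + (q[1] / s) * k[1];
  double dot_then_scaled = (q[0] * k[0] + q[1] * k[1]) / s;
  assert(std::abs(scaled_then_dot - dot_then_scaled) < 1e-12);
  return 0;
}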
- for (auto ii = 0; ii < VEC; ++ii) { - qkv_q[ii] = static_cast( - (static_cast(qkv_q[ii]) + - static_cast(qkv_bias_q[ii])) / - static_cast(sqrt_dim_per_head)); - qkv_k[ii] = static_cast( - (static_cast(qkv_k[ii]) + - static_cast(qkv_bias_k[ii]))); - qkv_v[ii] = static_cast( - (static_cast(qkv_v[ii]) + - static_cast(qkv_bias_v[ii]))); - } - *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = - *reinterpret_cast(&qkv_q); - *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = - *reinterpret_cast(&qkv_k); - *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = - *reinterpret_cast(&qkv_v); - } -} - -// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias -std::tuple transform_bias_rescale_qkv( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto B = qkv.size(0); - auto T = qkv.size(1); - auto _3D = qkv.size(2); - auto D = _3D / 3; - TORCH_CHECK(D % num_head == 0); - const auto dim_per_head = D / num_head; - auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - qkv.scalar_type(), - "transform_bias_rescale_qkv", - [&] { - using accscalar_t = acc_type; - auto threads = std::min(1024, D / 4); - auto blocks = B * T; - transform_bias_rescale_qkv_kernel - <<>>( - qkv.packed_accessor64(), - qkv_bias.packed_accessor64(), - q_k_v.packed_accessor64()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - auto q_k_v_s = - at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); - return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); -} - -Tensor bmm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - auto bt_ = b_.transpose(2, 1); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, bt_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); -} - -template -__inline__ __device__ T WarpReduceMax(T val) { -#pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val = std::max(val, WARP_SHFL_DOWN(val, offset)); - } - return val; -} - -template -__inline__ __device__ T WarpReduceSum(T val) { -#pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val += WARP_SHFL_DOWN(val, offset); - } - return val; -} - -void masked_softmax_dropout( - const Tensor& attn_scores, - const c10::optional& attn_mask) { - auto B = attn_scores.size(0); - auto num_heads = attn_scores.size(1); - auto T = attn_scores.size(2); - if (attn_mask) { - TORCH_CHECK(attn_mask->is_contiguous()); - } - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - attn_scores.scalar_type(), - "masked_softmax_dropout", - [&] { - using accscalar_t = acc_type; - // TODO: proper implementation with masking. - dispatch_softmax_forward( - attn_scores.data_ptr(), - attn_scores.data_ptr(), - T, - T, - B * num_heads * T - ); - }); -} - -Tensor bmm_nn(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); -} - -Tensor transform_0213(const Tensor& a) { - // TODO: check perf vs dedicated kernel. 
- return a.permute({0, 2, 1, 3}) - .contiguous() - .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); -} - -Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto r_ = at::native::linear(a_, b, c); - return r_.view({a.size(0), a.size(1), r_.size(1)}); -} - -} // namespace - -Tensor multi_head_self_attention_cuda( - const Tensor& query, - const Tensor& qkv_weight, - const Tensor& qkv_bias, - const Tensor& proj_weight, - const Tensor& proj_bias, - const int64_t num_head, - const c10::optional& mask) { - // query shape: [B, T, D] - // qkv_weight shape: [3 * D, D] - - // shape: [B, T, 3 x D] - auto qkv = gemm_nt(query, qkv_weight); - - // shape: 3 x [B, num_head, T, dim_per_head] - auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - auto q = std::get<0>(q_k_v); - auto k = std::get<1>(q_k_v); - auto v = std::get<2>(q_k_v); - - // shape: [B, num_head, T, T] - auto qkt = bmm_nt(q, k); - - // shape: [B, num_head, T, T] - masked_softmax_dropout(qkt, mask); - - // shape: [B, num_head, T, dim_per_head] - auto attn_ctx = bmm_nn(qkt, v); - - // shape: [B, T, D] - auto attn = transform_0213(attn_ctx); - - // shape: [B, T, D] - auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); - - return proj; -} - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/cuda/group_norm_kernel.cu b/aten/src/ATen/native/cuda/group_norm_kernel.cu index 8abbae013a59..53ce77fa37b1 100644 --- a/aten/src/ATen/native/cuda/group_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/group_norm_kernel.cu @@ -1,13 +1,13 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include #include -#include #include #include #include @@ -15,6 +15,12 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + namespace at { namespace native { @@ -573,7 +579,7 @@ void GroupNormKernelImplInternal( cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); const int64_t num_threads = D * HxW < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? at::cuda::warp_size() : cuda_utils::kCUDABlockReduceNumThreads; RowwiseMomentsCUDAKernel<<>>( D * HxW, eps, X_data, mean_data, rstd_data); @@ -694,7 +700,7 @@ void GroupNorm1dBackward( T_ACC* c2_data = c2.data_ptr(); T_ACC* c3_data = c3.data_ptr(); const int64_t num_threads = (C / G) < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? at::cuda::warp_size() : cuda_utils::kCUDABlockReduceNumThreads; Compute1dBackwardFusedParamsCUDAKernel <<>>( @@ -841,8 +847,9 @@ void GroupNormBackwardKernelImplInternal( return; } + int warp_size = at::cuda::warp_size(); int64_t num_threads = HxW < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? warp_size : cuda_utils::kCUDABlockReduceNumThreads; ComputeInternalGradientsCUDAKernel<<>>( HxW, dY_data, X_data, ds_data, db_data); @@ -868,7 +875,7 @@ void GroupNormBackwardKernelImplInternal( } num_threads = (C / G) < cuda_utils::kCUDABlockReduceNumThreads - ? C10_WARP_SIZE + ? 
warp_size : cuda_utils::kCUDABlockReduceNumThreads; ComputeBackwardFusedParamsCUDAKernel <<>>( diff --git a/aten/src/ATen/native/cuda/im2col.cuh b/aten/src/ATen/native/cuda/im2col.cuh index 9c692e1e6c9e..6398230e5d5a 100644 --- a/aten/src/ATen/native/cuda/im2col.cuh +++ b/aten/src/ATen/native/cuda/im2col.cuh @@ -1,9 +1,5 @@ #pragma once -#include -#include -#include - #include #include #include diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index c8010a6e9b0a..0b6dcd3787a4 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1,3 +1,4 @@ +#define TORCH_ASSERT_NO_OPERATORS #include #include #include @@ -6,10 +7,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -82,7 +83,7 @@ const std::string jit_common_types = R"ESCAPE( _(void, QInt32) /* 14 */ \ _(at::BFloat16, BFloat16) /* 15 */ \ - #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(_) \ + #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \ _(uint8_t, Byte) \ _(int8_t, Char) \ _(int16_t, Short) \ @@ -91,6 +92,7 @@ const std::string jit_common_types = R"ESCAPE( _(at::Half, Half) \ _(float, Float) \ _(double, Double) \ + _(std::complex, ComplexHalf) \ _(std::complex, ComplexFloat) \ _(std::complex, ComplexDouble) \ _(bool, Bool) \ @@ -118,11 +120,17 @@ const std::string jit_common_types = R"ESCAPE( Array() = default; Array(const Array&) = default; Array& operator=(const Array&) = default; + __device__ Array(T x) { + for (int i = 0; i < size; i++) { + data[i] = x; + } + } }; ${half_string} ${bfloat16_string} ${complex_body_string} + ${complex_half_body_string} ${complex_math_string} @@ -249,6 +257,29 @@ const std::string dynamic_cast_support_literal = R"ESCAPE( } }; + template <> + struct static_cast_with_inter_type, at::BFloat16> { + static inline std::complex apply(at::BFloat16 src) { + return static_cast>(float{src}); + } + }; + + template <> + struct static_cast_with_inter_type, at::Half> { + static inline std::complex apply(at::Half src) { + return static_cast>(float{src}); + } + }; + + template <> + struct static_cast_with_inter_type< + std::complex, + std::complex> { + static inline std::complex apply(std::complex src) { + return static_cast>(static_cast>(src)); + } + }; + // Fetch a value with dynamic type src_type from ptr, and cast it to static type dest_t. 
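// A self-contained analogue of the fetch_and_cast helper defined just below: a
// runtime type tag selects how the raw bytes are read, and the loaded value is
// then cast to the statically known destination type. The enum and the three
// cases are illustrative; the generated code switches over the full ScalarType
// list, now including std::complex<at::Half>.
#include <cstdint>

enum class Tag : uint8_t { Float, Double, Long };

template <typename dest_t>
dest_t fetch_and_cast_sketch(Tag src_type, const void* ptr) {
  switch (src_type) {
    case Tag::Float:  return static_cast<dest_t>(*static_cast<const float*>(ptr));
    case Tag::Double: return static_cast<dest_t>(*static_cast<const double*>(ptr));
    case Tag::Long:   return static_cast<dest_t>(*static_cast<const int64_t*>(ptr));
  }
  return dest_t{};
}

// e.g.  double d = 2.5;  float f = fetch_and_cast_sketch<float>(Tag::Double, &d);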
#define FETCH_AND_CAST_CASE(type, scalartype) \ case ScalarType::scalartype: \ @@ -256,7 +287,7 @@ const std::string dynamic_cast_support_literal = R"ESCAPE( template __device__ inline dest_t fetch_and_cast(const ScalarType src_type, const void *ptr) { switch (src_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(FETCH_AND_CAST_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(FETCH_AND_CAST_CASE) default: ERROR_UNSUPPORTED_CAST } @@ -271,7 +302,7 @@ const std::string dynamic_cast_support_literal = R"ESCAPE( template __device__ inline void cast_and_store(const ScalarType dest_type, void *ptr, src_t value) { switch (dest_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(CAST_AND_STORE_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(CAST_AND_STORE_CASE) default:; } ERROR_UNSUPPORTED_CAST @@ -322,10 +353,7 @@ const std::string no_dynamic_cast_support_literal = R"ESCAPE( )ESCAPE"; -const std::string jit_code_template = R"ESCAPE( - - ${dynamic_casting_string} - +const std::string offset_calc_template = R"ESCAPE( template struct DivMod { T div; @@ -409,6 +437,14 @@ const std::string jit_code_template = R"ESCAPE( ${index_type} strides_[25][NARGS]; }; + +)ESCAPE"; + +const std::string jit_code_template = R"ESCAPE( + + ${dynamic_casting_string} + + ${functor} // TODO: setup grid-stride loop @@ -709,7 +745,10 @@ std::string generate_code( functor_args << "arg0[j], scalar_val"; } env.s("args", functor_args.str()); - if (f_inputs_type == "at::Half" || result_type == "at::Half" || dynamic_casting) { + if (f_inputs_type == "at::Half" || result_type == "at::Half" || + f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // complex depends on complex and Half dtypes. env.s("half_string", jiterator_half_support_literal); } else { env.s("half_string", ""); @@ -722,7 +761,9 @@ std::string generate_code( // the definition of complex math functions is only needed when the compute type is complex // but the definition of std::complex is needed for dynamic casting even if the compute type is not complex if (f_inputs_type == "std::complex" || result_type == "std::complex" || - f_inputs_type == "std::complex" || result_type == "std::complex") { + f_inputs_type == "std::complex" || result_type == "std::complex" || + f_inputs_type == "std::complex" || result_type == "std::complex") { + // complex depends on complex and Half dtypes. env.s("traits_string", get_traits_string()); env.s("complex_body_string", get_complex_body_string()); env.s("complex_math_string", get_complex_math_string()); @@ -735,6 +776,15 @@ std::string generate_code( env.s("complex_body_string", ""); env.s("complex_math_string", ""); } + if (f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // dynamic_casting requires the definition of all types + // include complex + // Look at the definition of `StoreWithCast` and `LoadWithCast`. 
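// --- Editor's illustration (not part of the patch) ---
// complex<Half> support is only pulled in here because dynamic casting may
// have to load or store ComplexHalf values; the casts themselves never do
// half-precision complex arithmetic. As in the static_cast_with_inter_type
// specializations earlier in this literal, the value is widened to float,
// wrapped in complex<float>, and only then narrowed per component. A host-side
// analogue (to_complex_half is a hypothetical helper; it assumes <complex>
// and the ATen Half header are available and leans on the generic
// std::complex template):
static inline std::complex<at::Half> to_complex_half(at::Half src) {
  const std::complex<float> widened(static_cast<float>(src), 0.0f);  // widen first
  return std::complex<at::Half>(at::Half(widened.real()),            // narrow per component
                                at::Half(widened.imag()));
}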
+ env.s("complex_half_body_string", get_complex_half_body_string()); + } else { + env.s("complex_half_body_string", ""); + } if (!vectorized) { if (!dynamic_casting) { @@ -769,7 +819,7 @@ std::string generate_code( << ">(out[j], data[0], output_offsets[0]);\n"; env.s("store_outputs", store_outputs.str()); - static auto cuda_template = at::jit::CodeTemplate(jit_common_types + jit_code_template); + static auto cuda_template = at::jit::CodeTemplate(jit_common_types + offset_calc_template + jit_code_template); const auto code = cuda_template.format(env); return code; } @@ -808,6 +858,134 @@ std::string generate_code( return code; } +// Creates directories recursively +bool _r_mkdir(const std::string& dir) { + // Check if current dir exists + const char* p_dir = dir.c_str(); + const bool dir_exists = (access(p_dir, F_OK) == 0); + if (dir_exists) { + return true; + } + + // Try to create current directory +#ifdef _WIN32 + int ret = _mkdir(dir.c_str()); +#else + int ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + // Success + if (ret == 0) { + return true; + } + + // Find folder separator and check if we are at the top + auto pos = dir.find_last_of("/\\"); + if (pos == std::string::npos) { + return false; + } + + // Try to create parent directory + if (!(_r_mkdir(dir.substr(0, pos)))) { + return false; + } + + // Try to create complete path again +#ifdef _WIN32 + ret = _mkdir(dir.c_str()); +#else + ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + return ret == 0; +} + +// Creates directories recursively assuming that base exists +bool r_mkdir_with_base(std::string& base, std::string& dir){ + const char* p_base = base.c_str(); + const bool base_exists = (access(p_base, F_OK) == 0); + if (!base_exists) { + return false; + } + + // remove trailing '/' or '\\' + if ((base[base.size()-1]=='/') || base[base.size()-1]=='\\') { + base.pop_back(); + } + if ((dir[dir.size()-1]=='/') || dir[dir.size()-1]=='\\') { + dir.pop_back(); + } + + return _r_mkdir(base+dir); + +} + +std::string load_code_template(const std::string& path) { + std::ifstream ifs{path}; + std::string s{ + std::istreambuf_iterator(ifs), + std::istreambuf_iterator()}; + return s; +} + +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen) { + at::jit::TemplateEnv env; + env.s("index_type", "unsigned int"); + env.s("scalar_type", f_inputs_type); + env.s("result_type", result_type); + env.s("reduction_accum_type", reduction_accum_type); + env.s("vt0", std::to_string(vt0)); + env.s("name", name); + env.s("max_threads_lb", std::to_string(max_threads_codegen)); + // reductions don't support dynamic casting, so the only way to get nonstandard types + // is through input + if (f_inputs_type == "at::Half" || f_inputs_type == "std::complex") { + // complex depends on complex and Half dtypes. + env.s("half_string", jiterator_half_support_literal); + } else { + env.s("half_string", ""); + } + if (f_inputs_type == "at::BFloat16") { + env.s("bfloat16_string", jiterator_bfloat16_support_literal); + } else { + env.s("bfloat16_string", ""); + } + if (f_inputs_type == "std::complex" || + f_inputs_type == "std::complex" || + f_inputs_type == "std::complex" ) { + // complex depends on complex and Half dtypes. 
+ env.s("traits_string", get_traits_string()); + env.s("complex_body_string", get_complex_body_string()); + env.s("complex_math_string", get_complex_math_string()); + env.s("complex", std::to_string(1)); + } else { + env.s("traits_string", ""); + env.s("complex_body_string", ""); + env.s("complex_math_string", ""); + env.s("complex", std::to_string(0)); + } + if (f_inputs_type == "std::complex") { + env.s("complex_half_body_string", get_complex_half_body_string()); + } else { + env.s("complex_half_body_string", ""); + } + env.s("cmath_string", get_cmath_string()); + env.s("functor", func); + env.s("output_vec_size", std::to_string(vec_size)); + static auto cuda_template = at::jit::CodeTemplate( + jit_common_types + offset_calc_template + get_reduction_template()); + const auto code = cuda_template.format(env); + return code; +} // Acquires (possibly creating) the kernel cache directory c10::optional get_cache_dir() { @@ -822,6 +1000,8 @@ c10::optional get_cache_dir() { // Cache path comes from PYTORCH_KERNEL_CACHE_PATH, then TEMP (Windows) or XDG_CACHE_HOME (Linux), then HOME environment variables std::string cache_dir; char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH"); + // Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user + std::string kernels_cache_dir = ""; if (ptkcp != nullptr) { cache_dir = std::string(ptkcp); } else { @@ -832,7 +1012,8 @@ c10::optional get_cache_dir() { ptkcp = std::getenv("XDG_CACHE_HOME"); #endif if (ptkcp != nullptr) { - cache_dir = std::string(ptkcp) + "/torch/kernels"; + kernels_cache_dir = "/torch/kernels"; + cache_dir = std::string(ptkcp) + kernels_cache_dir; } else { // Falls back to HOME/.cache ptkcp = std::getenv("HOME"); @@ -841,7 +1022,8 @@ c10::optional get_cache_dir() { " This disables kernel caching."); return {}; } else { - cache_dir = std::string(ptkcp) + "/.cache/torch/kernels"; + kernels_cache_dir = "/.cache/torch/kernels"; + cache_dir = std::string(ptkcp) + kernels_cache_dir; } } } @@ -850,11 +1032,8 @@ c10::optional get_cache_dir() { const char* p_cache_dir = cache_dir.c_str(); const bool cache_dir_exists = (access(p_cache_dir, F_OK) == 0); if (!cache_dir_exists) { -#ifdef _WIN32 - if (_mkdir(p_cache_dir) != 0) { -#else - if (mkdir(p_cache_dir, S_IRWXU | S_IRWXG | S_IRWXO) != 0) { -#endif + std::string s_ptkcp = std::string(ptkcp); + if (!r_mkdir_with_base(s_ptkcp, kernels_cache_dir)) { TORCH_WARN_ONCE("Specified kernel cache directory could not be created! 
This disables kernel caching.", " Specified directory is ", cache_dir, ".", " This warning will appear only once per process."); @@ -886,9 +1065,7 @@ c10::optional get_cache_dir() { NvrtcFunction jit_pwise_function( const std::string& code, const std::string& kernel_name) { - initializeCudaContext(); - // Acquires CUDA and nvrtc versions and whether we're compiling to ptx or SASS const cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); int cuda_major = 0, cuda_minor = 0, nvrtc_major = 0, nvrtc_minor = 0; @@ -983,7 +1160,7 @@ NvrtcFunction jit_pwise_function( AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLog(program, log.data())); std::stringstream cu; cu << log.data(); - throw std::runtime_error(cu.str() + code); + throw std::runtime_error(code + cu.str()); } size_t ptx_size = 0; @@ -1049,24 +1226,26 @@ NvrtcFunction jit_pwise_function( void launch_jitted_pwise_function( NvrtcFunction function, void* args[], - const int nBlocks, - const int kBlockSize) { + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem) { initializeCudaContext(); const auto& nvrtc = at::globalContext().getNVRTC(); // Launches kernel on current stream auto stream = at::cuda::getCurrentCUDAStream(); AT_CUDA_DRIVER_CHECK(nvrtc.cuLaunchKernel( function.function, - nBlocks, - 1, - 1, - kBlockSize, - 1, - 1, - 0, + nBlocks.x, + nBlocks.y, + nBlocks.z, + kBlockSize.x, + kBlockSize.y, + kBlockSize.z, + smem, stream, args, nullptr)); } + }}} // at::cuda::jit diff --git a/aten/src/ATen/native/cuda/jit_utils.h b/aten/src/ATen/native/cuda/jit_utils.h index 908ffabfea2f..2af015bbb7fe 100644 --- a/aten/src/ATen/native/cuda/jit_utils.h +++ b/aten/src/ATen/native/cuda/jit_utils.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace at { namespace cuda { namespace jit { @@ -32,6 +33,19 @@ std::string generate_code( bool vectorized=false, int vec_size=0); +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + NvrtcFunction jit_pwise_function( const std::string& code, const std::string& kernel_name); @@ -39,8 +53,9 @@ NvrtcFunction jit_pwise_function( void launch_jitted_pwise_function( NvrtcFunction function, void* args[], - const int nBlocks, - const int kBlockSize); + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem=0); template struct delayed_false : std::false_type { @@ -53,7 +68,7 @@ struct delayed_false : std::false_type { template inline std::string typeName() { // we can't use static_assert(false) directly as the - // program will be not compile even if the template is not + // program will be not compiled even if the template is not // instantiated, so we use `delayed_false` // to make sure compiler doesn't eagerly raise // fail this assertion. 
@@ -71,16 +86,18 @@ AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN) // JIT uses std::complex directly, because nvRTC compile programs // with -default-device, so there is no such issue like: // "std::sin(complex) is __host__ only" +template <> inline std::string typeName(){ + return "bool"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} template <> inline std::string typeName>(){ return "std::complex"; } template <> inline std::string typeName>(){ return "std::complex"; } -template <> inline std::string typeName>(){ - TORCH_INTERNAL_ASSERT(false, "torch.complex32 is not supported"); - return "std::complex"; -} template <> inline std::string typeName(){ return "at::Half"; } @@ -88,4 +105,20 @@ template <> inline std::string typeName(){ return "at::BFloat16"; } +#define TYPE_NAME_CASE(ctype, scalartype) \ + case ScalarType::scalartype: return std::string(#ctype); +inline std::string typeName(ScalarType t) { + switch (t) { + AT_FORALL_SCALAR_TYPES(TYPE_NAME_CASE) + case ScalarType::Bool : return "bool"; + case ScalarType::Half : return "at::Half"; + case ScalarType::BFloat16 : return "at::BFloat16"; + case ScalarType::ComplexFloat : return "std::complex"; + case ScalarType::ComplexDouble : return "std::complex"; + default: + TORCH_CHECK(false, "invalid type for jiterator"); + } +} +#undef TYPE_NAME_CASE + }}} // namespace at::cuda::jit diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 940ff7d06819..faa0fd2d4b98 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -1,16 +1,28 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include #include -#include #include #include #include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif #include @@ -636,8 +648,8 @@ void launch_vectorized_layer_norm_kernel( ) { //constexpr int alignment = 16; //currently unused to make sure float and half results are bw accurate auto stream = at::cuda::getCurrentCUDAStream().stream(); - const int num_threads = 128; - const dim3 threads(C10_WARP_SIZE,num_threads/C10_WARP_SIZE,1); + const int warp_size = at::cuda::warp_size(); + const dim3 threads(warp_size, num_threads() / warp_size, 1); const dim3 blocks(M); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(threads.y % 2 == 0 || threads.y == 1); int nshared = threads.y > 1 ? threads.y * 3/2 *sizeof(T_ACC) : 0; @@ -739,10 +751,10 @@ void LayerNormBackwardKernelImplInternal( T* dX_data = dX->defined() ? 
dX->template data_ptr() : nullptr; cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); if (dX_data != nullptr) { - const int num_threads = 128; + const int warp_size = at::cuda::warp_size(); const dim3 blocks(M); - int nshared = (num_threads/C10_WARP_SIZE) * sizeof(T_ACC); - layer_norm_grad_input_kernel<<>>(dY_data, + int nshared = (num_threads()/warp_size) * sizeof(T_ACC); + layer_norm_grad_input_kernel<<>>(dY_data, X_data, mean_data, rstd_data, gamma_data, dX_data, N); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -933,6 +945,7 @@ std::tuple layer_norm_backward_cuda( return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta)); } +REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 9910859d8b86..7eee90a1b227 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -1,7 +1,9 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include #include #include #include -#include #include #include @@ -9,12 +11,30 @@ #include #include -#include #include #include #include +#include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + #if AT_MAGMA_ENABLED() #include #include @@ -25,11 +45,22 @@ const bool use_magma_ = true; namespace { struct MagmaInitializer { MagmaInitializer() { +#if defined(BUILD_LAZY_CUDA_LINALG) + magma_init(); +#else ::at::cuda::detail::set_magma_init_fn([]{ magma_init(); }); - }; +#endif + } } initializer; } // namespace (anonymous) +#define AT_MAGMA_VERSION MAGMA_VERSION_MAJOR*100 + MAGMA_VERSION_MINOR*10 + MAGMA_VERSION_MICRO + +// Check that MAGMA never releases MAGMA_VERSION_MINOR >= 10 or MAGMA_VERSION_MICRO >= 10 +#if MAGMA_VERSION_MINOR >= 10 || MAGMA_VERSION_MICRO >= 10 +#error "MAGMA release minor or micro version >= 10, please correct AT_MAGMA_VERSION" +#endif + #else const bool use_magma_ = false; @@ -37,18 +68,28 @@ const bool use_magma_ = false; namespace at { namespace native { +#if defined(BUILD_LAZY_CUDA_LINALG) +// All registrations with PyTorch runtime should be done dynamically +// so if library is lazy loaded it must not export anything, otherwise +// it can result in symbol clashes +namespace lazy_linalg { +#endif #if AT_MAGMA_ENABLED() -template -void magmaSolve( - magma_int_t n, magma_int_t nrhs, scalar_t* dA, magma_int_t ldda, - magma_int_t* ipiv, scalar_t* dB, magma_int_t lddb, magma_int_t* info); -template -void magmaSolveBatched( - magma_int_t n, magma_int_t nrhs, scalar_t** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, scalar_t** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue); +template +void magmaLdlHermitian( + magma_uplo_t uplo, + magma_int_t n, + scalar_t* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { + TORCH_CHECK( + false, + "LDL decomposition is not available.", + "Please rebuild with MAGMA 2.5.4+."); +} template void magmaLu( @@ -163,85 +204,63 @@ void magmaGels( scalar_t* dA, magma_int_t ldda, scalar_t* dB, magma_int_t lddb, scalar_t* hwork, magma_int_t lwork, magma_int_t* info); -template<> -void magmaSolve( - magma_int_t n, magma_int_t nrhs, double* dA, magma_int_t ldda, - magma_int_t* ipiv, double* dB, magma_int_t lddb, 
magma_int_t* info) { - MagmaStreamSyncGuard guard; - magma_dgesv_gpu(n, nrhs, dA, ldda, ipiv, dB, lddb, info); - AT_CUDA_CHECK(cudaGetLastError()); -} +#if AT_MAGMA_VERSION >= 254 -template<> -void magmaSolve( - magma_int_t n, magma_int_t nrhs, float* dA, magma_int_t ldda, - magma_int_t* ipiv, float* dB, magma_int_t lddb, magma_int_t* info) { +template <> +void magmaLdlHermitian( + magma_uplo_t uplo, + magma_int_t n, + double* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { MagmaStreamSyncGuard guard; - magma_sgesv_gpu(n, nrhs, dA, ldda, ipiv, dB, lddb, info); + magma_dsytrf_gpu(uplo, n, dA, ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolve>( - magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, - magma_int_t* ipiv, c10::complex* dB, magma_int_t lddb, magma_int_t* info) { +template <> +void magmaLdlHermitian( + magma_uplo_t uplo, + magma_int_t n, + float* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { MagmaStreamSyncGuard guard; - magma_zgesv_gpu(n, nrhs, - reinterpret_cast(dA), ldda, ipiv, - reinterpret_cast(dB), lddb, info); + magma_ssytrf_gpu(uplo, n, dA, ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolve>( - magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, - magma_int_t* ipiv, c10::complex* dB, magma_int_t lddb, magma_int_t* info) { +template <> +void magmaLdlHermitian>( + magma_uplo_t uplo, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { MagmaStreamSyncGuard guard; - magma_cgesv_gpu(n, nrhs, - reinterpret_cast(dA), ldda, ipiv, - reinterpret_cast(dB), lddb, info); - AT_CUDA_CHECK(cudaGetLastError()); -} - -template<> -void magmaSolveBatched( - magma_int_t n, magma_int_t nrhs, double** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, double** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_dgesv_batched(n, nrhs, dA_array, ldda, dipiv_array, dB_array, lddb, dinfo_array, batch_count, magma_queue.get_queue()); - AT_CUDA_CHECK(cudaGetLastError()); -} - -template<> -void magmaSolveBatched( - magma_int_t n, magma_int_t nrhs, float** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, float** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_sgesv_batched(n, nrhs, dA_array, ldda, dipiv_array, dB_array, lddb, dinfo_array, batch_count, magma_queue.get_queue()); + magma_zhetrf_gpu( + uplo, n, reinterpret_cast(dA), ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolveBatched>( - magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, c10::complex** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_zgesv_batched(n, nrhs, - reinterpret_cast(dA_array), ldda, dipiv_array, - reinterpret_cast(dB_array), lddb, dinfo_array, batch_count, magma_queue.get_queue()); +template <> +void magmaLdlHermitian>( + magma_uplo_t uplo, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + magma_int_t* ipiv, + magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_chetrf_gpu( + uplo, n, reinterpret_cast(dA), ldda, ipiv, info); AT_CUDA_CHECK(cudaGetLastError()); } -template<> -void magmaSolveBatched>( - magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, - magma_int_t** dipiv_array, 
c10::complex** dB_array, magma_int_t lddb, - magma_int_t* dinfo_array, magma_int_t batch_count, const MAGMAQueue& magma_queue) { - magma_cgesv_batched(n, nrhs, - reinterpret_cast(dA_array), ldda, dipiv_array, - reinterpret_cast(dB_array), lddb, dinfo_array, batch_count, magma_queue.get_queue()); - AT_CUDA_CHECK(cudaGetLastError()); -} +#endif // AT_MAGMA_VERSION >= 254 template<> void magmaLu( @@ -1249,95 +1268,127 @@ magma_trans_t to_magma(TransposeType trans) { auto storage_##name = pin_memory(size); \ name = static_cast(storage_##name.data()); -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ solve ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +namespace { template -static void apply_solve(Tensor& b, Tensor& A, Tensor& infos_out) { +void apply_ldl_factor_magma( + const Tensor& A, + const Tensor& pivots, + const Tensor& info, + bool upper) { #if !AT_MAGMA_ENABLED() -AT_ERROR("solve: MAGMA library not found in " - "compilation. Please rebuild with MAGMA."); + TORCH_CHECK( + false, + "torch.linalg.ldl_factor: MAGMA library not found in " + "compilation. Please rebuild with MAGMA."); #else - auto A_data = A.data_ptr(); - auto b_data = b.data_ptr(); + auto batch_size = batchCount(A); magma_int_t n = magma_int_cast(A.size(-2), "A.size(-2)"); - magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); - magma_int_t lda = std::max(magma_int_t{1}, n); - - if (b.dim() == 2) { - auto ipiv = at::empty({n}, at::kInt); - // magmaSolve requires infos tensor to live on CPU - Tensor infos = at::empty(infos_out.sizes(), infos_out.options().device(kCPU)); - magmaSolve(n, nrhs, A_data, lda, ipiv.data_ptr(), - b_data, lda, infos.data_ptr()); - infos_out.copy_(infos); - } else { - auto infos_data = infos_out.data_ptr(); - auto A_mat_stride = matrixStride(A); - auto b_mat_stride = matrixStride(b); - magma_int_t batch_size = magma_int_cast(batchCount(A), "batchCount"); - - magma_int_t* ipiv_data; - magma_int_t** ipiv_array; - scalar_t** A_array; - scalar_t** b_array; - - ALLOCATE_ARRAY(ipiv_data, magma_int_t, batch_size * n); - ALLOCATE_ARRAY(ipiv_array, magma_int_t*, batch_size); - ALLOCATE_ARRAY(A_array, scalar_t*, batch_size); - ALLOCATE_ARRAY(b_array, scalar_t*, batch_size); - - // Set up the created arrays - for (int64_t i = 0; i < batch_size; i++) { - A_array[i] = &A_data[i * A_mat_stride]; - b_array[i] = &b_data[i * b_mat_stride]; - ipiv_array[i] = &ipiv_data[i * n]; - } - - MAGMAQueue magma_queue(b.get_device()); + magma_int_t leading_dim = magma_int_cast(A.stride(-1), "A.stride(-1)"); + magma_uplo_t uplo = upper ? MagmaUpper : MagmaLower; - constexpr int64_t batch_limit = 65535; - // Compute as many batches of 65535 possible - // The number of "mini"-batches are floor(batch_size / batch_limit) - // and these cover floor(batch_size / batch_limit) * batch_limit matrix solves - int64_t mini_batches = batch_size / batch_limit, mini_idx; - for (mini_idx = 0; mini_idx < mini_batches * batch_limit; mini_idx += batch_limit) { - scalar_t** A_array_cur = &A_array[mini_idx]; - scalar_t** b_array_cur = &b_array[mini_idx]; - magma_int_t** ipiv_array_cur = &ipiv_array[mini_idx]; - magma_int_t* info_array_cur = &infos_data[mini_idx]; + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? 
pivots.stride(-2) : 0; - magmaSolveBatched( - n, nrhs, A_array_cur, lda, ipiv_array_cur, b_array_cur, lda, - info_array_cur, batch_limit, magma_queue); - } + auto a_data = A.data_ptr(); + Tensor pivots_cpu = + at::empty_like(pivots, pivots.options().device(kCPU).pinned_memory(true)); + auto pivots_data = pivots_cpu.data_ptr(); + Tensor info_cpu = + at::empty_like(info, info.options().device(kCPU).pinned_memory(true)); + auto info_data = info_cpu.data_ptr(); + + for (const auto i : c10::irange(batch_size)) { + scalar_t* a_working_ptr = &a_data[i * a_stride]; + magma_int_t* pivots_working_ptr = &pivots_data[i * pivots_stride]; + magma_int_t* info_working_ptr = &info_data[i]; + magmaLdlHermitian( + uplo, + n, + a_working_ptr, + leading_dim, + pivots_working_ptr, + info_working_ptr); + } + pivots.copy_(pivots_cpu); + info.copy_(info_cpu); +#endif +} - // Compute whatever is left = batch_size - floor(batch_size / batch_limit) * batch_limit - // which concisely is equal to batch_size % batch_limit - if (batch_size % batch_limit != 0) { - magmaSolveBatched( - n, nrhs, &A_array[mini_idx], lda, &ipiv_array[mini_idx], &b_array[mini_idx], lda, - &infos_data[mini_idx], batch_size % batch_limit, magma_queue); - } +void ldl_factor_magma( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + if (LD.is_complex()) { + TORCH_CHECK( + hermitian, + "torch.linalg.ldl_factor: complex tensors with hermitian=False flag are not supported with MAGMA backend. ", + "Currently preferred backend is ", + at::globalContext().linalgPreferredBackend(), + ", please set 'default' or 'cusolver' backend with torch.backends.cuda.preferred_linalg_library"); } + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_factor_magma", [&] { + apply_ldl_factor_magma(LD, pivots, info, upper); + }); +} + +void ldl_factor_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + auto preferred_backend = at::globalContext().linalgPreferredBackend(); + switch (preferred_backend) { + case at::LinalgBackend::Cusolver: + return ldl_factor_cusolver( + LD, pivots, info, upper, hermitian); + case at::LinalgBackend::Magma: + return ldl_factor_magma(LD, pivots, info, upper, hermitian); + default: + // By default use cusolver if available and magma otherwise. 
+ // If cusolver and magma 2.5.4+ are both available and hermitian=true, + // call magma for complex inputs +#ifdef USE_CUSOLVER +#if AT_MAGMA_ENABLED() && (AT_MAGMA_VERSION >= 254) + if (LD.is_complex() && hermitian) { + return ldl_factor_magma( + LD, pivots, info, upper, hermitian); + } +#endif + return ldl_factor_cusolver( + LD, pivots, info, upper, hermitian); +#else + return ldl_factor_magma(LD, pivots, info, upper, hermitian); #endif + } } -std::tuple _solve_helper_cuda(const Tensor& self, const Tensor& A) { - auto self_working_copy = cloneBatchedColumnMajor(self); - auto A_working_copy = cloneBatchedColumnMajor(A); - // infos might not get filled for empty inputs therefore at::zeros is used instead of at::empty - auto infos = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt)); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "solve_cuda", [&]{ - apply_solve(self_working_copy, A_working_copy, infos); - }); - if (self.dim() > 2) { - batchCheckErrors(infos, "solve_cuda"); - } else { - singleCheckErrors(infos.item().toInt(), "solve_cuda"); +void ldl_solve_kernel( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper, + bool hermitian) { + // TODO: It should be possible to add the MAGMA backend for this function when using MAGMA 2.6.0 + // https://bitbucket.org/icl/magma/src/c703d112dcf19eb8c73676cef10888aa2ef73457/ReleaseNotes#lines-48 + if (LD.is_complex()) { + TORCH_CHECK( + !hermitian, + "torch.linalg.ldl_solve: complex tensors with hermitian=True flag are not supported on CUDA."); } - return std::tuple(self_working_copy, A_working_copy); + + ldl_solve_cusolver(LD, pivots, B, upper); } +} // anonymous namespace + +REGISTER_CUDA_DISPATCH(ldl_factor_stub, &ldl_factor_kernel) +REGISTER_CUDA_DISPATCH(ldl_solve_stub, &ldl_solve_kernel) + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inverse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /* @@ -2275,7 +2326,7 @@ std::tuple linalg_qr_helper_magma(const Tensor& self, c10::strin std::tie(compute_q, reduced) = _parse_qr_mode(mode); // Setup input geometry and inputs for apply_qr - std::vector q_sizes, q_strides; + DimVector q_sizes, q_strides; int64_t n_columns_q; std::tie(q_sizes, q_strides, n_columns_q) = _compute_geometry_for_Q(self, reduced); Tensor q_working_copy, r_working_copy; @@ -2417,7 +2468,7 @@ std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvec Tensor infos = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt).device(at::kCPU)); auto eigvals_shape = IntArrayRef(self.sizes().data(), self.dim()-1); // self.shape[:-1] - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); // magmaSyevd uses a hybrid CPU-GPU algorithm to compute the eigenvalues and eigenvectors. // The driver routine magma_(d/s)syev_gpu accepts a tensor on the CPU for eigvalenvalues. 
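// --- Editor's illustration (not part of the patch) ---
// The ldl_factor_kernel registered above follows the usual linalg backend
// selection pattern: honour an explicit user preference, otherwise use
// cuSOLVER when it is compiled in and fall back to MAGMA. Schematically
// (dispatch_example, do_with_cusolver and do_with_magma are placeholders):
void do_with_cusolver(const at::Tensor& t);  // placeholder backend entry points
void do_with_magma(const at::Tensor& t);
void dispatch_example(const at::Tensor& t) {
  switch (at::globalContext().linalgPreferredBackend()) {
    case at::LinalgBackend::Cusolver:
      return do_with_cusolver(t);
    case at::LinalgBackend::Magma:
      return do_with_magma(t);
    default:
#ifdef USE_CUSOLVER
      return do_with_cusolver(t);  // preferred default on CUDA builds
#else
      return do_with_magma(t);     // otherwise MAGMA, which errors out if absent
#endif
  }
}
// From Python the same choice can be steered with
// torch.backends.cuda.preferred_linalg_library('default' | 'cusolver' | 'magma').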
@@ -2635,7 +2686,7 @@ TORCH_CHECK(false, "Calling torch.linalg.eig on a CUDA tensor requires compiling Tensor rwork; value_t* rwork_data = nullptr; if (input.is_complex()) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); rwork = at::empty({lda * 2}, input.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -2851,19 +2902,27 @@ static void apply_lu_solve_looped_magma(const Tensor& b, const Tensor& lu, const auto pivots_data = pivots_cpu.data_ptr(); auto b_stride = matrixStride(b); - auto lu_stride = matrixStride(lu); - auto pivots_stride = pivots_cpu.size(-1); + auto lu_stride = lu.dim() > 2 ? lu.stride(-3) : 0; + auto pivots_stride = pivots_cpu.dim() > 1 ? pivots_cpu.stride(-2) : 0; auto batch_size = batchCount(b); magma_int_t n = magma_int_cast(lu.size(-2), "n"); magma_int_t nrhs = magma_int_cast(b.size(-1), "nrhs"); auto leading_dimension = std::max(1, n); + // lu and pivots tensors can be broadcast to b + // here we construct a helper indexing tensor to linearly index into lu and pivots + IntArrayRef lu_batch_shape(lu.sizes().data(), lu.dim() - 2); + IntArrayRef b_batch_shape(b.sizes().data(), b.dim() - 2); + BroadcastLinearIndices lu_index( + batchCount(lu), lu_batch_shape, b_batch_shape); + int info = 0; for (decltype(batch_size) i = 0; i < batch_size; i++) { + int64_t lu_index_i = lu_index(i); scalar_t* b_working_ptr = &b_data[i * b_stride]; - scalar_t* lu_working_ptr = &lu_data[i * lu_stride]; - int* pivots_working_ptr = &pivots_data[i * pivots_stride]; + scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; + int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; magmaLuSolve(n, nrhs, lu_working_ptr, leading_dimension, pivots_working_ptr, b_working_ptr, leading_dimension, &info, trans); @@ -2896,6 +2955,8 @@ static void apply_lu_solve_batched_magma(const Tensor& b, const Tensor& lu, cons "Calling torch.lu_solve on a CUDA tensor requires compiling ", "PyTorch with MAGMA. 
Please rebuild with MAGMA."); #else + TORCH_INTERNAL_ASSERT(batchCount(b) == batchCount(lu), "batch_size of b and lu must be the same"); + TORCH_INTERNAL_ASSERT(batchCount(lu) == batchCount(pivots.unsqueeze(-1)), "batch_size of lu and pivots must be the same"); auto trans = to_magma(transpose); auto b_data = b.data_ptr(); auto lu_data = lu.data_ptr(); @@ -2962,9 +3023,36 @@ static void lu_solve_looped_magma(const Tensor& b, const Tensor& lu, const Tenso }); } +namespace { + +c10::MaybeOwned maybe_expand_lu(const Tensor& b, const Tensor& lu) { + if (batchCount(b) != batchCount(lu)) { + IntArrayRef b_batch_size(b.sizes().data(), b.dim() - 2); + DimVector expand_size(b_batch_size); + expand_size.insert(expand_size.end(), {lu.size(-2), lu.size(-1)}); + return c10::MaybeOwned::owned( + cloneBatchedColumnMajor(lu.expand(expand_size))); + } else { + return c10::MaybeOwned::borrowed(lu); + } +} + +c10::MaybeOwned maybe_expand_pivots(const Tensor& b,const Tensor& pivots) { + if (batchCount(b) != batchCount(pivots.unsqueeze(-1))) { + IntArrayRef b_batch_size(b.sizes().data(), b.dim() - 2); + DimVector expand_size(b_batch_size); + expand_size.insert(expand_size.end(), {pivots.size(-1)}); + return c10::MaybeOwned::owned( + pivots.expand(expand_size).clone(at::MemoryFormat::Contiguous)); + } else { + return c10::MaybeOwned::borrowed(pivots); + } +} + +} // anonymous namespace static void lu_solve_trans_dispatch(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType trans) { - auto batch_size = batchCount(lu); + auto batch_size = batchCount(b); auto m = lu.size(-2); auto b2 = b.size(-1); bool over_magma_dim_limit = b2 > 1024; // magma implementation of LU solve cannot handle a b tensor with last dim > 1024 (https://bitbucket.org/icl/magma/issues/19/dgesv_batched-dgetrs_batched-fails-for) @@ -2980,11 +3068,15 @@ static void lu_solve_trans_dispatch(const Tensor& b, const Tensor& lu, const Ten #endif // ifdef USE_CUSOLVER #ifdef CUDART_VERSION else if ((batch_size > 2 && m <= 128) || (batch_size > 8 && over_magma_dim_limit)) { - lu_solve_batched_cublas(b, lu, pivots, trans); + c10::MaybeOwned lu_ = maybe_expand_lu(b, lu); + c10::MaybeOwned pivots_ = maybe_expand_pivots(b, pivots); + lu_solve_batched_cublas(b, *lu_, *pivots_, trans); } #endif // ifdef CUDART_VERSION else { - lu_solve_batched_magma(b, lu, pivots, trans); + c10::MaybeOwned lu_ = maybe_expand_lu(b, lu); + c10::MaybeOwned pivots_ = maybe_expand_pivots(b, pivots); + lu_solve_batched_magma(b, *lu_, *pivots_, trans); } } @@ -3159,27 +3251,20 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul "Please rebuild with cuSOLVER."); #endif } else { // m >= n -#if !AT_MAGMA_ENABLED() - // MAGMA is not available we can either use cuBLAS or cuSOLVER here +#if !AT_ROCM_ENABLED() + // On CUDA platform we use either cuBLAS or cuSOLVER here // the batched vs looped dispatch is implemented based on the following performance results // https://github.com/pytorch/pytorch/pull/54725#issuecomment-832234456 if (m <= 256 && batchCount(b) >= std::max(2, m / 16)) { - // if CUDART_VERSION is defined then cuBLAS is available - #ifdef CUDART_VERSION gels_batched_cublas(a, b, infos); - #else - // this would either call cuSOLVER or MAGMA, - // if MAGMA is called a runtime error is thrown about not finding MAGMA in compilation - gels_looped(a, b, infos); - #endif // CUDART_VERSION } else { gels_looped(a, b, infos); } #else - // if both MAGMA and cuSOLVER are available this would call cuSOLVER - // MAGMA is called if cuSOLVER is 
not available - gels_looped(a, b, infos); -#endif // AT_MAGMA_ENABLED() + // On ROCm platform we can only use MAGMA here + // If MAGMA is not available, an error will be thrown + gels_magma(a, b, infos); +#endif // !AT_ROCM_ENABLED() } } @@ -3244,25 +3329,21 @@ std::tuple legacy_lstsq_cuda(const Tensor &B, const Tensor &A) { #endif // AT_MAGMA_ENABLED() } -std::tuple legacy_lstsq_out_cuda( - const Tensor& B, const Tensor& A, Tensor& B_out, Tensor& A_out) { - const auto dtype = A.scalar_type(); - TORCH_CHECK(B.scalar_type() == dtype, "exepected A and B dtypes to match but found ", - A.scalar_type(), " and ", B.scalar_type()); - TORCH_CHECK(A_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, - " but found", A_out.scalar_type()); - TORCH_CHECK(B_out.scalar_type() == dtype, "A_out to have scalar type ", dtype, - " but found", B_out.scalar_type()); - Tensor A_tmp, B_tmp; - std::tie(B_tmp, A_tmp) = native::legacy_lstsq_cuda(B, A); - resize_output(A_out, A_tmp.sizes()); - A_out.copy_(A_tmp); - resize_output(B_out, B_tmp.sizes()); - B_out.copy_(B_tmp); - return std::tuple(B_out, A_out); -} +#if defined(BUILD_LAZY_CUDA_LINALG) +struct DispatchInitializer { + DispatchInitializer() { + cuda::detail::LinalgDispatch disp{ _symeig_helper_cuda, + _linalg_qr_helper_cuda, + _cholesky_solve_helper_cuda, + legacy_lstsq_cuda, + _linalg_inv_out_helper_cuda}; + cuda::detail::registerLinalgDispatch(disp); + }; +} initializer; +} // namespace lazy_linalg +#endif }} // namespace at::native #undef ALLOCATE_ARRAY diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index efbd987a3814..c73a14f73b71 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -1,8 +1,8 @@ -#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include -#include +#include #include #include #include @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -95,6 +96,8 @@ static void apply_lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, con #ifndef CUDART_VERSION TORCH_CHECK(false, "lu_solve: cuBLAS backend for lu_solve is not available.") #else + TORCH_INTERNAL_ASSERT(batchCount(b) == batchCount(lu), "batch_size of b and lu must be the same"); + TORCH_INTERNAL_ASSERT(batchCount(lu) == batchCount(pivots.unsqueeze(-1)), "batch_size of lu and pivots must be the same"); const auto trans = to_cublas(transpose); auto pivots_data = pivots.data_ptr(); @@ -122,6 +125,181 @@ void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pi }); } +namespace { + +template +void apply_ldl_factor_cusolver( + const Tensor& A, + const Tensor& pivots, + const Tensor& info, + bool upper) { +#ifndef USE_CUSOLVER + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_factor on a CUDA tensor requires compiling ", + "PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER support."); +#else + auto batch_size = batchCount(A); + auto n = cuda_int_cast(A.size(-2), "A.size(-2)"); + auto lda = cuda_int_cast(A.stride(-1), "A.stride(-1)"); + auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? 
pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto pivots_data = pivots.data_ptr(); + auto info_data = info.data_ptr(); + + auto handle = at::cuda::getCurrentCUDASolverDnHandle(); + + int lwork = 0; + at::cuda::solver::sytrf_bufferSize(handle, n, a_data, lda, &lwork); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto work = allocator.allocate(sizeof(scalar_t) * lwork); + + for (const auto i : c10::irange(batch_size)) { + auto* a_working_ptr = &a_data[i * a_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + auto* info_working_ptr = &info_data[i]; + at::cuda::solver::sytrf( + handle, + uplo, + n, + a_working_ptr, + lda, + pivots_working_ptr, + reinterpret_cast(work.get()), + lwork, + info_working_ptr); + } +#endif +} + +template +void apply_ldl_solve_cusolver( + const Tensor& A, + const Tensor& pivots, + const Tensor& B, + bool upper) { +#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && \ + CUSOLVER_VERSION >= 11102) + TORCH_CHECK( + false, + "Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ", + "PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER 11.1.2+ (CUDA 11.3.1+) support."); +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(pivots.unsqueeze(-1)) > 0); + auto batch_size = batchCount(B); + auto n = A.size(-2); + auto nrhs = B.size(-1); + auto lda = A.stride(-1); + auto ldb = B.stride(-1); + auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + + auto a_stride = A.dim() > 2 ? A.stride(-3) : 0; + auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; + + auto a_data = A.data_ptr(); + auto b_data = B.data_ptr(); + + auto pivots_ = pivots.to(kLong); + auto pivots_data = pivots_.data_ptr(); + + auto handle = at::cuda::getCurrentCUDASolverDnHandle(); + auto datatype = at::cuda::solver::get_cusolver_datatype(); + size_t worksize_device = 0; + size_t worksize_host = 0; + + TORCH_CUSOLVER_CHECK(cusolverDnXsytrs_bufferSize( + handle, + uplo, + n, + nrhs, + datatype, + a_data, + lda, + pivots_data, + datatype, + b_data, + ldb, + &worksize_device, + &worksize_host)); + + // allocate workspace storage + auto& device_allocator = *at::cuda::getCUDADeviceAllocator(); + auto workdata_device = device_allocator.allocate(worksize_device); + void* workdata_device_ptr = workdata_device.get(); + + auto& host_allocator = *at::getCPUAllocator(); + auto workdata_host = host_allocator.allocate(worksize_host); + void* workdata_host_ptr = workdata_host.get(); + + Tensor info = at::zeros({}, A.options().dtype(at::kInt)); + for (const auto i : c10::irange(batch_size)) { + auto* a_working_ptr = &a_data[i * a_stride]; + auto* b_working_ptr = &b_data[i * b_stride]; + auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + TORCH_CUSOLVER_CHECK(cusolverDnXsytrs( + handle, + uplo, + n, + nrhs, + datatype, + a_working_ptr, + lda, + pivots_working_ptr, + datatype, + b_working_ptr, + ldb, + workdata_device_ptr, + worksize_device, + workdata_host_ptr, + worksize_host, + info.data_ptr())); + } + + // info from sytrs only reports if the i-th parameter is wrong + // so we don't need to check it all the time + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.item().toInt() == 0); +#endif +} + +} // anonymous namespace + +void ldl_factor_cusolver( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian) { + if (LD.is_complex()) { + TORCH_CHECK( + !hermitian, + 
"torch.linalg.ldl_factor: complex tensors with hermitian=True flag are not supported with cuSOLVER backend. ", + "Currently preferred backend is ", + at::globalContext().linalgPreferredBackend(), + ", please set 'default' or 'magma' backend with torch.backends.cuda.preferred_linalg_library"); + } + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_factor_looped_cusolver", [&] { + apply_ldl_factor_cusolver(LD, pivots, info, upper); + }); +} + +void ldl_solve_cusolver( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + LD.scalar_type(), "ldl_solve_looped_cusolver", [&] { + apply_ldl_solve_cusolver(LD, pivots, B, upper); + }); +} + template static void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) { cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; @@ -1445,26 +1623,34 @@ void lu_solve_looped_cusolver(const Tensor& b, const Tensor& lu, const Tensor& p const auto trans = to_cublas(transpose); int n = cuda_int_cast(lu.size(-2), "n"); int nrhs = cuda_int_cast(b.size(-1), "nrhs"); - auto batch_size = batchCount(lu); + auto batch_size = batchCount(b); auto info = at::zeros({1}, lu.options().dtype(kInt)); auto info_data = info.data_ptr(); auto b_data = b.data_ptr(); auto lu_data = lu.data_ptr(); auto pivots_data = pivots.data_ptr(); - auto pivots_stride = pivots.size(-1); - auto lu_stride = matrixStride(lu); + auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; + auto lu_stride = lu.dim() > 2 ? lu.stride(-3) : 0; auto b_stride = matrixStride(b); int leading_dimension = cuda_int_cast(std::max(1, n), "leading_dimension"); + // lu and pivots tensors can be broadcast to b + // here we construct a helper indexing tensor to linearly index into lu and pivots + IntArrayRef lu_batch_shape(lu.sizes().data(), lu.dim() - 2); + IntArrayRef b_batch_shape(b.sizes().data(), b.dim() - 2); + BroadcastLinearIndices lu_index( + batchCount(lu), lu_batch_shape, b_batch_shape); + auto handle = at::cuda::getCurrentCUDASolverDnHandle(); for (auto batch = decltype(batch_size){0}; batch < batch_size; ++batch) { + int64_t lu_index_i = lu_index(batch); at::cuda::solver::getrs( handle, n, nrhs, - lu_data + batch * lu_stride, + lu_data + lu_index_i * lu_stride, leading_dimension, - pivots_data + batch * pivots_stride, + pivots_data + lu_index_i * pivots_stride, b_data + batch * b_stride, leading_dimension, info_data, diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h index 14da99f83d36..8979a23580db 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.h @@ -1,10 +1,11 @@ #pragma once +#include #include #include #include -#include +#include #include #if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) @@ -39,6 +40,17 @@ void triangular_solve_cublas(const Tensor& A, const Tensor& B, bool left, bool u void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular); void gels_batched_cublas(const Tensor& a, Tensor& b, Tensor& infos); void lu_solve_batched_cublas(const Tensor& b, const Tensor& lu, const Tensor& pivots, TransposeType transpose); +void ldl_factor_cusolver( + const Tensor& LD, + const Tensor& pivots, + const Tensor& info, + bool upper, + bool hermitian); +void ldl_solve_cusolver( + 
const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool upper); #ifdef USE_CUSOLVER @@ -65,4 +77,19 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T #endif // USE_CUSOLVER +#if defined(BUILD_LAZY_CUDA_LINALG) +namespace cuda { namespace detail { +// This is only used for an old-style dispatches +// Please do not add any new entires to it +struct LinalgDispatch { + std::tuple (*symeig_helper)(const Tensor& self, bool eigenvectors, bool upper); + std::tuple (*qr_helper)(const Tensor& input, c10::string_view mode); + Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper); + std::tuple (*legacy_lstsq)(const Tensor &B, const Tensor &A); + Tensor& (*inv_out_helper)(Tensor &result, Tensor& infos_lu, Tensor& infos_getri); +}; +C10_EXPORT void registerLinalgDispatch(const LinalgDispatch&); +}} // namespace cuda::detail +#endif + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp index 036cdd329e35..85141f820e5f 100644 --- a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp +++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp @@ -10,20 +10,6 @@ namespace at { namespace cuda { namespace solver { -C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { - switch (status) { - case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; - case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; - case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; - case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; - case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; - case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; - case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - default: return "Unknown cusolver error number"; - } -} - template <> void getrf( cusolverDnHandle_t handle, int m, int n, double* dA, int ldda, int* ipiv, int* info) { @@ -162,6 +148,71 @@ void getrs>( info)); } +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(double)) { + TORCH_CUSOLVER_CHECK(cusolverDnDsytrf_bufferSize(handle, n, A, lda, lwork)); +} + +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(float)) { + TORCH_CUSOLVER_CHECK(cusolverDnSsytrf_bufferSize(handle, n, A, lda, lwork)); +} + +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnZsytrf_bufferSize( + handle, n, reinterpret_cast(A), lda, lwork)); +} + +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnCsytrf_bufferSize( + handle, n, reinterpret_cast(A), lda, lwork)); +} + +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(double)) { + TORCH_CUSOLVER_CHECK( + cusolverDnDsytrf(handle, uplo, n, A, lda, ipiv, work, lwork, devInfo)); +} + +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(float)) { + TORCH_CUSOLVER_CHECK( + cusolverDnSsytrf(handle, uplo, n, A, lda, ipiv, work, lwork, devInfo)); +} + +template <> +void sytrf>( + CUDASOLVER_SYTRF_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnZsytrf( + handle, + uplo, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(work), + lwork, + devInfo)); +} + +template <> +void sytrf>( + 
CUDASOLVER_SYTRF_ARGTYPES(c10::complex)) { + TORCH_CUSOLVER_CHECK(cusolverDnCsytrf( + handle, + uplo, + n, + reinterpret_cast(A), + lda, + ipiv, + reinterpret_cast(work), + lwork, + devInfo)); +} template<> void gesvd_buffersize(CUDASOLVER_GESVD_BUFFERSIZE_ARGTYPES()) { diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.h b/aten/src/ATen/native/cuda/linalg/CUDASolver.h index bd8c5cc11064..4a2cd9680c77 100644 --- a/aten/src/ATen/native/cuda/linalg/CUDASolver.h +++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.h @@ -46,6 +46,47 @@ void getrs>(CUDASOLVER_GETRS_ARGTYPES(c10::complex) template<> void getrs>(CUDASOLVER_GETRS_ARGTYPES(c10::complex)); +#define CUDASOLVER_SYTRF_BUFFER_ARGTYPES(Dtype) \ + cusolverDnHandle_t handle, int n, Dtype *A, int lda, int *lwork + +template +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(Dtype)) { + TORCH_CHECK( + false, + "at::cuda::solver::sytrf_bufferSize: not implemented for ", + typeid(Dtype).name()); +} +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(float)); +template <> +void sytrf_bufferSize(CUDASOLVER_SYTRF_BUFFER_ARGTYPES(double)); +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)); +template <> +void sytrf_bufferSize>( + CUDASOLVER_SYTRF_BUFFER_ARGTYPES(c10::complex)); + +#define CUDASOLVER_SYTRF_ARGTYPES(Dtype) \ + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, Dtype *A, int lda, \ + int *ipiv, Dtype *work, int lwork, int *devInfo + +template +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(Dtype)) { + TORCH_CHECK( + false, + "at::cuda::solver::sytrf: not implemented for ", + typeid(Dtype).name()); +} +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(float)); +template <> +void sytrf(CUDASOLVER_SYTRF_ARGTYPES(double)); +template <> +void sytrf>( + CUDASOLVER_SYTRF_ARGTYPES(c10::complex)); +template <> +void sytrf>(CUDASOLVER_SYTRF_ARGTYPES(c10::complex)); #define CUDASOLVER_GESVD_BUFFERSIZE_ARGTYPES() \ cusolverDnHandle_t handle, int m, int n, int *lwork diff --git a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp index e64d8eeb1030..599c86d334d4 100644 --- a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp +++ b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp @@ -11,6 +11,7 @@ void createCusolverDnHandle(cusolverDnHandle_t *handle) { } void destroyCusolverDnHandle(cusolverDnHandle_t handle) { + (void)handle; // Suppress unused variable warning // this is because of something dumb in the ordering of // destruction. Sometimes atexit, the cuda context (or something) // would already be destroyed by the time this gets destroyed. It diff --git a/aten/src/ATen/native/cuda/linalg/MagmaUtils.h b/aten/src/ATen/native/cuda/linalg/MagmaUtils.h new file mode 100644 index 000000000000..a58cfd9bef9f --- /dev/null +++ b/aten/src/ATen/native/cuda/linalg/MagmaUtils.h @@ -0,0 +1,88 @@ +#pragma once +#include + +#if AT_MAGMA_ENABLED() +#include +#include +#endif + +namespace at { +namespace native { + +#if AT_MAGMA_ENABLED() + +// RAII for a MAGMA Queue +struct MAGMAQueue { + + // Default constructor without a device will cause + // destroying a queue which has not been initialized. + MAGMAQueue() = delete; + + // Constructor + explicit MAGMAQueue(int64_t device_id) { + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + // Magma operations is numerically sensitive, so TF32 should be off + // regardless of the global flag. 
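// --- Editor's illustration (not part of the patch) ---
// The constructor/destructor pair of MAGMAQueue implements a save/restore
// guard around the cuBLAS math mode, so TF32 stays disabled only for the
// lifetime of the queue. An equivalent standalone RAII helper
// (CublasMathModeGuard is a hypothetical name):
struct CublasMathModeGuard {
  CublasMathModeGuard(cublasHandle_t handle, cublasMath_t mode) : handle_(handle) {
    TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle_, &saved_));  // remember caller's mode
    TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle_, mode));     // e.g. CUBLAS_DEFAULT_MATH
  }
  ~CublasMathModeGuard() {
    cublasSetMathMode(handle_, saved_);  // restore on scope exit
  }
  cublasHandle_t handle_;
  cublasMath_t saved_;
};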
+ TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode)); + TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); +#endif + magma_queue_create_from_cuda( + device_id, + at::cuda::getCurrentCUDAStream(), + handle, + at::cuda::getCurrentCUDASparseHandle(), + &magma_queue_); + } + + // Getter + magma_queue_t get_queue() const { return magma_queue_; } + + // Destructor + ~MAGMAQueue() { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + // We've manually set the math mode to CUBLAS_DEFAULT_MATH, now we + // should restore the original math mode back + cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_); + cublasSetMathMode(handle, original_math_mode); +#endif + magma_queue_destroy(magma_queue_); + } + + private: + magma_queue_t magma_queue_; +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + cublasMath_t original_math_mode; +#endif +}; + +static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + if (static_cast(result) != value) { + AT_ERROR("magma: The value of ", varname, "(", (long long)value, + ") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)"); + } + return result; +} + +// MAGMA functions that don't take a magma_queue_t aren't stream safe +// Work around this by synchronizing with the default stream +struct MagmaStreamSyncGuard { + MagmaStreamSyncGuard() { + auto stream = at::cuda::getCurrentCUDAStream(); + if (stream != at::cuda::getDefaultCUDAStream()) { + at::cuda::stream_synchronize(stream); + } + } + + ~MagmaStreamSyncGuard() noexcept(false) { + auto default_stream = at::cuda::getDefaultCUDAStream(); + if (at::cuda::getCurrentCUDAStream() != default_stream) { + at::cuda::stream_synchronize(default_stream); + } + } +}; +#endif + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cuda/reduction_template.cuh b/aten/src/ATen/native/cuda/reduction_template.cuh new file mode 100644 index 000000000000..4d9d559d8ec8 --- /dev/null +++ b/aten/src/ATen/native/cuda/reduction_template.cuh @@ -0,0 +1,664 @@ +namespace at { +namespace cuda { +//windows doesn't like large string literals, so split in two +const std::string reduction_template_0 = R"ESCAPE( + #define C10_HOST_DEVICE __host__ __device__ + #define C10_DEVICE __device__ + + template + __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return __shfl_down_sync(mask, value, delta, width); + } + + + #if ${complex} + template + __device__ __forceinline__ std::complex WARP_SHFL_DOWN(std::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return std::complex( + __shfl_down_sync(mask, value.real(), delta, width), + __shfl_down_sync(mask, value.imag(), delta, width)); + } + #endif + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + + C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. 
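// --- Editor's illustration (not part of the generated source) ---
// With C++17 the Euclid loop below collapses to std::gcd from <numeric>;
// for example reduce_fraction(6, 8) leaves numerator = 3 and denominator = 4.
// (reduce_fraction_cxx17 is a hypothetical host-side equivalent.)
void reduce_fraction_cxx17(size_t& numerator, size_t& denominator) {
  const size_t g = std::gcd(numerator, denominator);
  numerator /= g;
  denominator /= g;
}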
+ size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; + } + + + + + struct ReduceConfig { + //has to match host-side ReduceConfig in the eager code + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + + }; + + +//TODO this will need to be different for more generic reduction functions +namespace reducer { + + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + + inline __device__ ${functor} + + inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + + inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + // wrap a normal reduction that ignores the index + inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { + return combine(acc, val); + } +} + + +struct ReduceJitOp { + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + using InputCalculator = OffsetCalculator<1>; + using OutputCalculator = OffsetCalculator<2>; + +// static constexpr bool can_accumulate_in_output = +// std::is_convertible::value +// && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for 
accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + uint32_t output_idx = config.output_idx<${output_vec_size}>(); + uint32_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + + value = thread_reduce<${output_vec_size}>(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce<${output_vec_size}>(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce<${output_vec_size}>(value, shared_memory); + } + + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce<${output_vec_size}>(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output<${output_vec_size}>(out, value); + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + assert(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. 
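    // Illustrative sketch (not part of the patch): the vectorized path below first
    // measures how far (in elements) the input pointer sits past the previous
    // aligned address and peels that many scalars off before doing aligned vector
    // loads. A minimal host-side version of that shift computation, with
    // hypothetical names (head_shift, buf):
    //
    //   #include <cstdint>
    //   #include <cstdio>
    //
    //   int head_shift(const void* p, int align_bytes, int elem_bytes) {
    //     // number of scalar elements between p and the previous aligned address
    //     return static_cast<int>(reinterpret_cast<std::uintptr_t>(p) % align_bytes) / elem_bytes;
    //   }
    //
    //   int main() {
    //     alignas(16) float buf[8] = {};
    //     // buf + 2 is 8 bytes past a 16-byte boundary -> 2 scalar head elements
    //     std::printf("%d\n", head_shift(buf + 2, 16, (int)sizeof(float)));  // prints 2
    //     return 0;
    //   }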
+ return {input_vectorized_thread_reduce_impl(data)}; + } else { + uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](uint32_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](uint32_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + uint32_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = aligned_vector; + + uint32_t idx = config.input_idx(); + const uint32_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + scalar_t values[input_vec_size]; + + load_t *values_vector = reinterpret_cast(&values[0]); + + while (idx * input_vec_size + input_vec_size - 1 < end) { + *values_vector = reinterpret_cast(data)[idx]; + #pragma unroll + for (uint32_t i = 0; i < input_vec_size; i++) { + value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + uint32_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = reducer::combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + uint32_t idx = config.input_idx(); + const uint32_t end = config.num_inputs; + const uint32_t stride = config.step_input; + const int vt0=${vt0}; + + using arg_vec_t = Array; + using load_t = aligned_vector; + const load_t* data = reinterpret_cast(data_); + + // Multiple accumulators to remove dependency between unrolled loops. 
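      // Illustrative sketch (not part of the patch): the loop below keeps vt0
      // independent partial accumulators and folds them together at the end in the
      // "combine accumulators" step. A standalone CPU version of the same idiom,
      // with hypothetical names (strided_sum, kAcc):
      //
      //   #include <cstddef>
      //
      //   float strided_sum(const float* data, std::size_t n) {
      //     constexpr int kAcc = 4;                    // plays the role of vt0
      //     float acc[kAcc] = {0.f, 0.f, 0.f, 0.f};    // independent partial sums
      //     std::size_t i = 0;
      //     for (; i + kAcc <= n; i += kAcc) {
      //       for (int j = 0; j < kAcc; ++j) {
      //         acc[j] += data[i + j];                 // no serial dependency across j
      //       }
      //     }
      //     for (; i < n; ++i) { acc[0] += data[i]; }             // tail
      //     for (int j = 1; j < kAcc; ++j) { acc[0] += acc[j]; }  // combine
      //     return acc[0];
      //   }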
+ arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + values[i] = data[calc(idx + i * stride) / output_vec_size]; + } + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + values[i] = data[calc(idx) / output_vec_size]; + idx += stride; + } + idx = idx_; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + template + C10_DEVICE Array block_x_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = reducer::warp_shfl_down(value[i], offset); + value[i] = reducer::combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE Array block_y_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + )ESCAPE"; + + const std::string reduction_template_1 = R"ESCAPE( + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE Array accumulate_in_output( + Array out, + Array value + ) const { + Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + 
ret[i] = reducer::combine(*(out[i]), value[i]); + } + return ret; + } + + + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value + ) const { + assert(!final_output); + return (out_scalar_t)value; + } + + template + C10_DEVICE void set_results(const T x, const uint32_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + +//TODO - multi-output reduction - we won't be able to use thrust::pair +//just explicitly specify typed output reads/writes +//Currently implemented for max of two outputs +// template +// C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { +// if (noutputs >= 1) { +// auto res0 = (T1*)((char*)dst[0] + base_offset); +// *res0 = x.first; +// } +// if (noutputs >= 2) { +// // base offset is computed assuming element size being sizeof(T1), so we need to make a +// // correction to obtain the correct base offset +// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); +// *res1 = x.second; +// } +// } + + template + C10_DEVICE void set_results_to_output(Array value, Array base_offset) const { + assert(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(reducer::project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE Array global_reduce(Array value, Array *acc, char* shared_memory) const { + using arg_vec_t = Array; + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + uint32_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + uint32_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + value = ident; + if (config.should_block_x_reduce()) { + uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + uint32_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } else { + uint32_t input_offset = threadIdx.y; + uint32_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + 
set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +extern "C" +__launch_bounds__(${max_threads_lb}, 4) +__global__ void reduction_${name}_kernel(ReduceJitOp r){ + r.run(); +} +)ESCAPE"; + +const std::string reduction_template = reduction_template_0 + reduction_template_1; + + +const std::string &get_reduction_template() { + return reduction_template; +} + +}} diff --git a/aten/src/ATen/native/cuda/thread_constants.h b/aten/src/ATen/native/cuda/thread_constants.h index 464c6fe9fe2e..651053d663e4 100644 --- a/aten/src/ATen/native/cuda/thread_constants.h +++ b/aten/src/ATen/native/cuda/thread_constants.h @@ -13,7 +13,7 @@ constexpr int num_threads() { return 256; } #else -constexpr int num_threads() { +constexpr uint32_t num_threads() { return C10_WARP_SIZE * 4; } #endif diff --git a/aten/src/ATen/native/cuda/vol2col.cuh b/aten/src/ATen/native/cuda/vol2col.cuh index 17459f382816..7ab719bc819e 100644 --- a/aten/src/ATen/native/cuda/vol2col.cuh +++ b/aten/src/ATen/native/cuda/vol2col.cuh @@ -1,9 +1,5 @@ #pragma once -#include -#include -#include - #include #include #include diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index de45a3a2dd40..6968548b0e0e 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -152,69 +152,6 @@ std::string repro_from_args(const ConvolutionParams& params) { return ss.str(); } -// --------------------------------------------------------------------- -// -// Checking -// -// --------------------------------------------------------------------- - -// Used on pad, stride and dilation -static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) -{ - TORCH_CHECK(args.size() <= expected_size, - "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - TORCH_CHECK(args.size() >= expected_size, - "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - - auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); - if (num_negative_values > 0){ - std::stringstream ss; - ss << arg_name << " should be greater than zero but got ("; - std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - AT_ERROR(ss.str()); - } -} - - -// NOTE [ Convolution checks ] -// -// NB: For many call sites, it is not strictly necessary to check all of -// these relationships (for example, for forward convolution, we compute -// the size of output ourselves, so we don't actually need to check -// output. However, writing a single function that does everything -// means we get to reuse it for both forwards and all backwards -// variants, even when the set of "real" inputs varies. The magic of -// relational computing! 
-// -// (There is one downside, which is that it is slightly harder to write -// error messages which are able to distinguish between real inputs -// (which the user can change) and computed inputs (which the user can -// only indirectly affect). It would be an interesting exercise to -// come up with a general framework to handle such situations.) -static void convolution_shape_check( - CheckedFrom c, - const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) -{ - check_args(c, padding, input->dim() - 2, "padding"); - check_args(c, stride, padding.size(), "stride"); - check_args(c, dilation, padding.size(), "dilation"); - - // Input - checkDimRange(c, input, 3, 6 /* exclusive */); - checkSize(c, input, input_channels_dim, weight->size(1) * groups); - - // Weight - checkSameDim(c, input, weight); - - // TODO: check that output->size() matches output_sizes - // TODO: check that weight matches output->sizes() - checkSameDim(c, input, output); -} - // --------------------------------------------------------------------- // // Convolution forward / Transposed convolution backward @@ -494,6 +431,7 @@ Tensor cudnn_convolution_relu( } auto& ctx = at::globalContext(); + bool benchmark = ctx.benchmarkCuDNN(); bool allow_tf32 = ctx.allowTF32CuDNN(); auto _bias = bias_t.has_value() ? bias_t.value() @@ -516,7 +454,7 @@ Tensor cudnn_convolution_relu( padding, dilation, groups, - false, // benchmark + benchmark, // benchmark false, // deterministic allow_tf32 // allow_tf32 ); @@ -532,7 +470,7 @@ Tensor cudnn_convolution_relu( padding, dilation, groups, - false, // benchmark + benchmark, // benchmark false, // deterministic allow_tf32 // allow_tf32 ); @@ -554,6 +492,11 @@ Tensor cudnn_convolution_add_relu( auto memory_format = cudnn_conv_suggest_memory_format(input_t, weight_t); const Tensor input = input_t.contiguous(memory_format); const Tensor weight = weight_t.contiguous(memory_format); + Tensor z = z_t; + if (z.suggest_memory_format() != memory_format) { + z = z.to(memory_format); + } + z = z.contiguous(memory_format); // FuseFrozenConvAddRelu performs some tensor shape checking Tensor output_t = at::detail::empty_cuda( @@ -566,6 +509,7 @@ Tensor cudnn_convolution_add_relu( auto& ctx = at::globalContext(); bool allow_tf32 = ctx.allowTF32CuDNN(); + bool benchmark = ctx.benchmarkCuDNN(); auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias_t.has_value() ? bias_t.value() @@ -581,14 +525,14 @@ Tensor cudnn_convolution_add_relu( output_t, input, weight, - z_t, + z, _alpha, _bias, stride, padding, dilation, groups, - false, // benchmark + benchmark, false, // deterministic allow_tf32 // allow_tf32 ); @@ -597,14 +541,14 @@ Tensor cudnn_convolution_add_relu( output_t, input, weight, - z_t, + z, _alpha, _bias, stride, padding, dilation, groups, - false, // benchmark + benchmark, false, // deterministic allow_tf32 // allow_tf32 ); diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h index c3b5ef74ff8f..9ee5bfb3f9e6 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.h +++ b/aten/src/ATen/native/cudnn/ConvShared.h @@ -105,4 +105,48 @@ void raw_cudnn_convolution_add_relu_fallback_out( bool benchmark, bool deterministic, bool allow_tf32); + + +#if AT_CUDNN_ENABLED() +#include + +#if HAS_CUDNN_V8() +// v7 functions are preserved here to allow for runtime switching to v7 +// (e.g., TORCH_CUDNN_V8_API_ENABLED=0). 
+// Note that v7 forward/backward out can have different behavior from the v8 +// versions, as v7 explicitly splits large tensors as a 32-bit indexing +// workaround whereas v8 expects cuDNN to handle large tensors. +void raw_cudnn_convolution_forward_out_v7( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_backward_input_out_v7( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_backward_weight_out_v7( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_add_relu_out_v7( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const Tensor& z, + float alpha, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); +#endif +#endif }} diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 502b32a5b446..a2ff4839a40c 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -619,8 +619,6 @@ if (args.params.dataType == CUDNN_DATA_FLOAT) { // // --------------------------------------------------------------------- -#if !HAS_CUDNN_V8() - void raw_cudnn_convolution_forward_out_32bit( const Tensor& output, const Tensor& input, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, @@ -666,15 +664,18 @@ void raw_cudnn_convolution_forward_out_32bit( ); } + +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_forward_out( +#else +void raw_cudnn_convolution_forward_out_v7( +#endif const Tensor& output, const Tensor& input, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { split_batch_dim_to_32bit_out(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 256, raw_cudnn_convolution_forward_out_32bit); } -#endif // !HAS_CUDNN_V8() - // --------------------------------------------------------------------- // // Convolution backward / Transposed convolution forward @@ -726,7 +727,11 @@ void raw_cudnn_convolution_backward_input_out_32bit( ); } +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_backward_input_out( +#else +void raw_cudnn_convolution_backward_input_out_v7( +#endif const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, @@ -785,7 +790,11 @@ void raw_cudnn_convolution_backward_weight_out_32bit( ); } +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_backward_weight_out( +#else +void raw_cudnn_convolution_backward_weight_out_v7( +#endif const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { @@ -808,6 +817,9 @@ void raw_cudnn_convolution_backward_weight_out( int64_t split_size = std::max(1024 * 1024 * 512 / max_inner_size, 1L); int64_t 
num_splits = (n + split_size - 1) / split_size; if (split_size * max_inner_size < int_max) { + const auto kAccType = (grad_weight.scalar_type() == kHalf || grad_weight.scalar_type() == kBFloat16) + ? kFloat : grad_weight.scalar_type(); + Tensor grad_weight_accumulator = at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); for (const auto i : c10::irange(num_splits)) { int64_t start = split_size * i; int64_t split_size_ = std::min(split_size, n - start); @@ -815,8 +827,9 @@ void raw_cudnn_convolution_backward_weight_out( Tensor grad_output_ = grad_output.narrow(0, start, split_size_); Tensor grad_weight_ = at::empty_like(grad_weight); raw_cudnn_convolution_backward_weight_out_32bit(grad_weight_, grad_output_, input_, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - grad_weight.add_(grad_weight_); + grad_weight_accumulator.add_(grad_weight_); } + grad_weight.copy_(grad_weight_accumulator); return; } // If control flow reaches here, this means even splitting N is not enough, then things starts to become complicated: @@ -833,7 +846,12 @@ void raw_cudnn_convolution_backward_weight_out( TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } +#if !HAS_CUDNN_V8() void raw_cudnn_convolution_add_relu_out( +#else +void raw_cudnn_convolution_add_relu_out_v7( +#endif + const Tensor& output, const Tensor& input, const Tensor& weight, diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp index 9ba1775988b9..24c5f3c2e3d6 100644 --- a/aten/src/ATen/native/cudnn/Conv_v8.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -17,34 +19,57 @@ #include #include +#include +#include +#include + +#include #include -namespace at { namespace native{ +namespace at { namespace native { namespace { +// TODO: remove duplicate code in Conv_v7.cpp +constexpr size_t operator "" _TiB(unsigned long long n) { + return size_t(n) << 40; +} + uint8_t getAlignment(const Tensor &t) { // alignment are in bytes uint8_t alignment = 1; - uint64_t address = reinterpret_cast(t.data_ptr()); - while (address % alignment == 0 && alignment < 16) alignment *= 2; + uintptr_t address = reinterpret_cast(t.data_ptr()); + for (; alignment < 64; alignment *= 2) { + if (address % (alignment * 2)) { + return alignment; + } + } return alignment; } -cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, int64_t id, uint8_t alignment) { - auto shape = t.sizes(); +cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const bool _virtual) { + auto sizes = t.sizes(); auto strides = t.strides(); - return cudnn_frontend::TensorBuilder() - .setDim(shape.size(), shape.data()) + auto r = cudnn_frontend::TensorBuilder() + .setDim(sizes.size(), sizes.data()) .setStrides(strides.size(), strides.data()) .setId(id) .setAlignment(alignment) - .setDataType(getCudnnDataType(t)) + .setDataType(dataType) + .setVirtual(_virtual) .build(); + return r; +} + +cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, const int64_t id, const uint8_t alignment) { + return getTensorDescriptorWithTypeVirtual(t, id, alignment, getCudnnDataType(t), false); } -cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation) { +cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, 
IntArrayRef stride, IntArrayRef dilation, const at::ScalarType scalar_type) { uint64_t convDim = stride.size(); + if (scalar_type == kBFloat16 || scalar_type == kHalf) { + dataType = CUDNN_DATA_FLOAT; + } return cudnn_frontend::ConvDescBuilder() .setDataType(dataType) .setMathMode(CUDNN_CROSS_CORRELATION) @@ -63,11 +88,12 @@ void filterEngineConfigs( { auto filter = [=](cudnnBackendDescriptor_t c) { if (deterministic) { - if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) {return true;} } - if (scalar_type == kFloat || !allow_tf32) { - if (cudnn_frontend::hasNumericalNote(c)) return true; - if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) {return true;} + if (scalar_type == kFloat) { + // TODO: check under which conditions this is OK + if (!allow_tf32 && cudnn_frontend::hasNumericalNote(c)) {return true;} } return false; }; @@ -76,99 +102,545 @@ void filterEngineConfigs( struct CacheKey { ConvolutionParams params; - uint8_t input_alignment; - uint8_t weight_alignment; - uint8_t output_alignment; + cudnnBackendDescriptorType_t operation; + uint8_t x_alignment; + uint8_t w_alignment; + uint8_t y_alignment; }; -// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp -std::unordered_map, ParamsEqual> engine_cache; +struct CacheKeyFused { + ConvolutionParams params; + // No op here because it is assumed to be a forward conv op + uint8_t x_alignment; + uint8_t w_alignment; + uint8_t y_alignment; + uint8_t z_alignment; + uint8_t b_alignment; + // TODO: does it make sense to have this in the key? but alpha is a graph-level param... + float alpha; +}; -} +template +struct BenchmarkCache { +std::mutex mutex; +std::unordered_map, ParamsEqual> engine_cache; -void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - TORCH_CHECK(!benchmark, "not supported yet"); - if (output.numel() == 0) { - return; +// TODO: is this thread safe if cache is updated? is pointer stale? 
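// Illustrative sketch (not part of the patch) of the guarded-cache pattern this
// struct implements; PlanCache and its members are hypothetical names. Every
// lookup and insertion takes the mutex, and the pointer returned by find() stays
// usable as long as entries are only ever inserted, since std::unordered_map
// insertions do not invalidate references to existing elements.
//
//   #include <mutex>
//   #include <string>
//   #include <unordered_map>
//
//   struct PlanCache {
//     std::mutex mutex;
//     std::unordered_map<std::string, int> cache;
//
//     int* find(const std::string& key) {
//       std::lock_guard<std::mutex> guard(mutex);
//       auto it = cache.find(key);
//       return it == cache.end() ? nullptr : &it->second;
//     }
//
//     void emplace(const std::string& key, int value) {
//       std::lock_guard<std::mutex> guard(mutex);
//       cache.emplace(key, value);
//     }
//   };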
+cudnn_frontend::ExecutionPlan* find(const KeyType& key) { + std::lock_guard guard(mutex); + auto it = engine_cache.find(key); + if (it == engine_cache.end()) { + return nullptr; } + // TODO: probably want ExecutionPlan copy constructor or better way to return + return &(it->second); +} - cudnnHandle_t handle = getCudnnHandle(); +void emplace(const KeyType& key, T& results) { + std::lock_guard guard(mutex); + engine_cache.emplace(key, std::move(results)); +} - CacheKey key; - setConvolutionParams(&key.params, input, weight, padding, stride, dilation, groups, deterministic, allow_tf32); - key.input_alignment = getAlignment(input); - key.output_alignment = getAlignment(output); - key.weight_alignment = getAlignment(weight); - - auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor cfg) { - auto plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle) - .setEngineConfig(cfg) - .build(); - - auto workspace_size = plan.getWorkspaceSize(); - auto workspace = at::empty({workspace_size}, input.options().dtype(kByte)); - void *data_ptrs[] = {input.data_ptr(), output.data_ptr(), weight.data_ptr()}; - // std::cout << plan.describe() << " requires workspace " << workspace_size << std::endl; - int64_t uids[] = {'x', 'y', 'w'}; - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setWorkspacePointer(workspace.data_ptr()) - .setDataPointers(3, data_ptrs) - .setUids(3, uids) - .build(); - AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); - }; +}; - auto search = engine_cache.find(key); - if (search != engine_cache.end()) { - run(search->second); - return; - } +BenchmarkCache benchmark_cache; +BenchmarkCache benchmark_cache_fused; + +} // namespace + +// NB: This (and the fused version) can't be a constructor, because then CacheKey +// would not be a POD anymore. 
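// Illustrative sketch (not part of the patch) of why setCacheKey below zeroes the
// whole key before filling its fields: if keys are hashed and compared as raw
// bytes (as a ParamsEqual-style comparator would do), uninitialized padding bytes
// could make two logically identical keys compare or hash differently. DemoKey
// and make_key are hypothetical names.
//
//   #include <cstdint>
//   #include <cstring>
//
//   struct DemoKey {
//     char tag;            // 1 byte, followed by padding
//     std::int64_t size;   // the padding in between is part of the byte image
//   };
//
//   void make_key(DemoKey& key, char tag, std::int64_t size) {
//     std::memset(&key, 0, sizeof(key));  // make padding bytes deterministic
//     key.tag = tag;
//     key.size = size;
//   }
//
//   bool same_bytes(const DemoKey& a, const DemoKey& b) {
//     return std::memcmp(&a, &b, sizeof(DemoKey)) == 0;  // byte-wise equality
//   }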
+void setCacheKey(CacheKey& key, const cudnnBackendDescriptorType_t operation, const Tensor& y, const Tensor& x, const Tensor& w, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + memset(&key, 0, sizeof(key)); + setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + key.operation = operation; + key.x_alignment = getAlignment(x); + key.y_alignment = getAlignment(y); + key.w_alignment = getAlignment(w); +} + +void setCacheKeyFused(CacheKeyFused& key, const Tensor& y, const Tensor& x, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + memset(&key, 0, sizeof(key)); + setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + key.x_alignment = getAlignment(x); + key.y_alignment = getAlignment(y); + key.w_alignment = getAlignment(w); + key.z_alignment = getAlignment(z); + key.b_alignment = getAlignment(b); + key.alpha = alpha; +} - auto op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .setxDesc(getTensorDescriptor(input, 'x', key.input_alignment)) - .setyDesc(getTensorDescriptor(output, 'y', key.output_alignment)) - .setwDesc(getTensorDescriptor(weight, 'w', key.weight_alignment)) - .setcDesc(getConvDescriptor(key.params.dataType, padding, stride, dilation)) +void run_conv_plan(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const cudnn_frontend::ExecutionPlan& plan) { + c10::DeviceGuard g(x.options().device()); + auto workspace_size = plan.getWorkspaceSize(); + auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) .build(); - // std::cout << op.describe() << std::endl; + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); +} - std::array ops = {&op}; +void run_conv_plan_fused(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const cudnn_frontend::ExecutionPlan& plan) { + c10::DeviceGuard g(x.options().device()); + auto workspace_size = plan.getWorkspaceSize(); + auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? 
workspace_ptr.get() : nullptr) + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .build(); + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); +} +auto build_opgraph(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKey& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation) { + auto op = cudnn_frontend::OperationBuilder(desc) + .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment)) + .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment)) + .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment)) + .setcDesc(getConvDescriptor(key.params.dataType, padding, stride, dilation, x.scalar_type())) + .build(); + std::array ops = {&op}; auto opGraph = cudnn_frontend::OperationGraphBuilder() .setHandle(handle) - .setOperationGraph(1, ops.data()) + .setOperationGraph(ops.size(), ops.data()) + .build(); + return opGraph; +} + +auto build_opgraph_fused(const cudnnHandle_t handle, const Tensor & x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFused& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation) { + // need computation to be done in FLOAT type regardless of reduced precision input + const auto precision = CUDNN_DATA_FLOAT; + auto addDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); + auto addBiasDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); + auto actDesc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(precision) + .build(); + auto convDesc = getConvDescriptor(key.params.dataType, padding, stride, dilation, x.scalar_type()); + const float alpha1 = 1.0; + const float alpha2 = alpha; + auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) + .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment)) + // virtual output of conv + .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'C', key.y_alignment, precision, true)) + .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment)) + .setAlpha(alpha1) + .setcDesc(convDesc) + .build(); + auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(conv_op.getOutputTensor()) + .setbDesc(getTensorDescriptor(z, 'z', key.z_alignment)) + // another virtual output (of add) + .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'A', key.y_alignment, precision, true)) + .setpwDesc(addDesc) + .setAlpha(alpha1) + .setAlpha2(alpha2) + .build(); + auto add_bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_op.getOutputTensor()) + .setbDesc(getTensorDescriptor(b, 'b', key.b_alignment)) + // another virtual output (of add bias) + .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'B', key.y_alignment, precision, true)) + .setpwDesc(addBiasDesc) + .build(); + auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_bias_op.getOutputTensor()) + // final output is in original datatype + .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment)) + .setpwDesc(actDesc) + .build(); + std::array ops = {&conv_op, &add_op, &add_bias_op, &act_op}; + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), 
ops.data()) + .build(); + return opGraph; +} + +auto get_generator_sources(const cudnnBackendDescriptorType_t& desc, const Tensor& x, const bool deterministic, const bool allow_tf32, const cudnnBackendHeurMode_t heur_mode) { + // Method for engine config generator based on heuristics + auto heurgen_method = [/*&desc,*/ &x, deterministic, allow_tf32, heur_mode](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(heur_mode) + .build(); + auto &engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + cudnn_frontend::EngineConfigList filtered_configs; + filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, x.scalar_type()); + return filtered_configs; + }; + // Method for engine config generator based on fallback list + auto fallback_method = [&desc, &x, deterministic, allow_tf32](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(desc) + .build(); + auto &fallback_list = fallback.getFallbackList(); + cudnn_frontend::EngineConfigList filtered_configs; + filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, x.scalar_type()); + return filtered_configs; + }; + std::array sources = {heurgen_method, fallback_method}; + return sources; +} + +size_t get_available_workspace() { + int device; + C10_CUDA_CHECK(cudaGetDevice(&device)); + size_t max_block_size = 0; + size_t tmp_bytes = 0; // Only used for filling pointer parameters that aren't used later + c10::cuda::CUDACachingAllocator::cacheInfo(device, &tmp_bytes, &max_block_size); + return max_block_size; +} + +void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::OperationGraph& opGraph, cudnn_frontend::EngineConfigGenerator& generator, const Tensor& x, cudnn_frontend::executionPlans_t& valid_plans, at::DataPtr& workspace_ptr, unsigned int max_plans = 0) { + auto initial_predicate_function = [&](cudnn_frontend::ExecutionPlan const& plan) -> bool { + return false; + }; + auto plans = generator.cudnnGetPlan(handle, opGraph, initial_predicate_function); + size_t max_block_size = get_available_workspace(); + size_t max_workspace_size = 0u; + std::for_each(plans.begin(), plans.end(), [&] (cudnn_frontend::ExecutionPlan& plan) { + size_t curr_workspace_size = plan.getWorkspaceSize(); + if (curr_workspace_size <= max_block_size) { + if (curr_workspace_size > max_workspace_size) { + max_workspace_size = plan.getWorkspaceSize(); + } + valid_plans.emplace_back(std::move(plan)); + } + }); + TORCH_CHECK_WITH(CUDAOutOfMemoryError, max_workspace_size < 1_TiB, "Not enough memory for workspace!"); + bool remove_invalid = false; + while (max_workspace_size) { + try { + workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(max_workspace_size); + break; + } catch (c10::CUDAOutOfMemoryError &e) { + max_workspace_size /= 2; + cudaGetLastError(); // clear CUDA error + remove_invalid = true; + } + } + if (remove_invalid) { + cudnn_frontend::executionPlans_t new_valid_plans; + unsigned int plan_count = 0; + for (auto &plan : valid_plans) { + if (plan.getWorkspaceSize() <= max_workspace_size) { + new_valid_plans.emplace_back(std::move(plan)); + plan_count++; + } + if (max_plans && plan_count >= max_plans) { + break; + } + } + valid_plans = std::move(new_valid_plans); + } +} + +auto 
get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKey& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w'}; + // We don't care about getting the best ordering of algos if we're roing to run all of them + auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT); + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::executionPlans_t valid_plans; + c10::DeviceGuard g(x.options().device()); + at::DataPtr workspace_ptr; + generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) .build(); - // std::cout << opGraph.describe() << std::endl; - auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() - .setOperationGraph(opGraph) - .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack); + + cudnn_frontend::executionPlans_t sorted_plans; + for (auto& plan : plans) { + sorted_plans.emplace_back(std::move(plan)); + } + return sorted_plans; +} + +auto get_plans_from_find_fused(const cudnnHandle_t handle, + const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, + const float alpha, const CacheKeyFused& key, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, + const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); + void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; + int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; + + auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT); + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::executionPlans_t valid_plans; + c10::DeviceGuard g(x.options().device()); + at::DataPtr workspace_ptr; + generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .setWorkspacePointer(workspace_ptr ? 
workspace_ptr.get() : nullptr) .build(); - auto fallback = cudnn_frontend::EngineFallbackListBuilder() - .setOperationGraph(opGraph) - .setOperation(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .build(); - auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); - auto& fallback_list = fallback.getFallbackList(); + auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack); + + cudnn_frontend::executionPlans_t sorted_plans; + for (auto& plan : plans) { + sorted_plans.emplace_back(std::move(plan)); + } + return sorted_plans; +} + + +// We only get configs from this stage to avoid building unnecessary plans that are never executed +auto get_configs_from_heuristics(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKey& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, heuristic_mode); + + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + auto configs = generator.generate_engine_config(opGraph); + return configs; +} + +auto get_configs_from_heuristics_fused(const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFused& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { + auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? 
CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, heuristic_mode); + + cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + auto configs = generator.generate_engine_config(opGraph); + return configs; +} + +void try_plans(cudnn_frontend::executionPlans_t& plans, const CacheKey& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { + for (auto & plan : plans) { + try { + run_conv_plan(handle, x, y, w, plan); + benchmark_cache.emplace(key, plan); + return; + } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); +} + +void try_plans_fused(cudnn_frontend::executionPlans_t& plans, const CacheKeyFused& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { + for (auto & plan : plans) { + try { + run_conv_plan_fused(handle, x, y, w, z, b, plan); + benchmark_cache_fused.emplace(key, plan); + return; + } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); +} - cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, input.scalar_type()); - filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, input.scalar_type()); +void try_configs(cudnn_frontend::EngineConfigList& configs, const CacheKey& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { + for (auto & config : configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(config) + .build(); + run_conv_plan(handle, x, y, w, plan); + benchmark_cache.emplace(key, plan); + return; + } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); +} - for (auto &cfg : filtered_configs) { +void try_configs_fused(cudnn_frontend::EngineConfigList& configs, const CacheKeyFused& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { + for (auto & config : configs) { try { - run(cfg); - engine_cache[key] = cfg; + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(config) + .build(); + run_conv_plan_fused(handle, x, y, w, z, b, plan); + benchmark_cache_fused.emplace(key, plan); return; } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} + catch (c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); +} + +void run_single_conv(const cudnnBackendDescriptorType_t operation, + const Tensor& x, const Tensor& y, const Tensor& w, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) { + cudnnHandle_t handle = 
getCudnnHandle(); + + CacheKey key; + setCacheKey(key, operation, y, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + // TODO: is this thread safe if cache is updated? is pointer stale? + auto search = benchmark_cache.find(key); + if (search) { + try { + run_conv_plan(handle, x, y, w, *search); + return; + } catch(c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + + if (!benchmark) { + cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics(handle, operation, + x, y, w, key, + padding, stride, dilation, + deterministic, allow_tf32); + try_configs(configs, key, handle, x, y, w); + } else { + cudnn_frontend::executionPlans_t plans = get_plans_from_find(handle, operation, + x, y, w, key, + padding, stride, dilation, + deterministic, allow_tf32); + // Replicate v7 behavior: clear cached blocks as benchmark incurs + // significant memory consumptiont that is not needed after this step + c10::cuda::CUDACachingAllocator::emptyCache(); + try_plans(plans, key, handle, x, y, w); + } +} + +void run_fused_conv(const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, + float alpha, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, + int64_t groups, const bool benchmark, const bool deterministic, const bool allow_tf32) { + cudnnHandle_t handle = getCudnnHandle(); + + CacheKeyFused key; + setCacheKeyFused(key, y, x, w, z, b, alpha, padding, stride, dilation, groups, deterministic, allow_tf32); + auto search = benchmark_cache_fused.find(key); + if (search) { + try { + run_conv_plan_fused(handle, x, y, w, z, b, *search); + return; + } catch(c10::CUDAOutOfMemoryError &e) { + cudaGetLastError(); // clear CUDA error + } + } + + if (!benchmark) { + cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics_fused(handle, + x, y, w, z, b, alpha, key, + padding, stride, dilation, + deterministic, allow_tf32); + try_configs_fused(configs, key, handle, x, y, w, z, b); + } else { + cudnn_frontend::executionPlans_t plans = get_plans_from_find_fused(handle, + x, y, w, z, b, alpha, key, + padding, stride, dilation, + deterministic, allow_tf32); + try_plans_fused(plans, key, handle, x, y, w, z, b); + } +} + +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) +{ + if (output.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + input, output, weight, padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } else { + raw_cudnn_convolution_forward_out_v7( + output, input, weight, + padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } +} + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) { + if (grad_input.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, + grad_input, grad_output, weight, padding, stride, dilation, groups, + benchmark, deterministic, 
allow_tf32); + } else { + raw_cudnn_convolution_backward_input_out_v7( + grad_input, + grad_output, + weight, + padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } +} + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, + const bool benchmark, const bool deterministic, const bool allow_tf32) { + if (grad_weight.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, + input, grad_output, grad_weight, padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } else { + raw_cudnn_convolution_backward_weight_out_v7( + grad_weight, grad_output, input, + padding, stride, dilation, groups, + benchmark, deterministic, allow_tf32); + } +} + +void raw_cudnn_convolution_add_relu_out( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const Tensor& z, + float alpha, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + if (output.numel() == 0) { return; } + if (at::native::cudnnv8_enabled_check_debug()) { + auto bias_ = bias.view({1, bias.numel(), 1, 1}); + run_fused_conv(input, output, weight, z, bias_, + alpha, stride, padding, dilation, + groups, benchmark, deterministic, allow_tf32); + } else { + raw_cudnn_convolution_add_relu_out_v7(output, input, weight, z, + alpha, bias, stride, padding, dilation, + groups, benchmark, deterministic, allow_tf32); } - TORCH_CHECK(false, "Unable to find an engine to execute this computation"); } }} // at::native diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 38bde06aa6cc..b22d25cbff97 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() @@ -67,6 +68,13 @@ void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) Tensor cudnn_grid_sampler_forward( const Tensor& input_t, const Tensor& grid_t) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. + check_grid_sampler_common(input_t, grid_t); + TORCH_CHECK( + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_forward"); + auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); TensorArg input{ input_contig, "input", 1 }, @@ -106,6 +114,13 @@ std::tuple cudnn_grid_sampler_backward( const Tensor& input_t, const Tensor& grid_t, const Tensor& grad_output_t) { + // See NOTE [ grid_sampler Native Functions ]. + // Add checks here in case this is called instead of grid_sampler. 
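  // Illustrative sketch (not part of the patch): the checks added here follow a
  // validate-before-dispatch pattern, where a backend entry point that can be
  // called directly re-runs the same argument validation as the generic
  // dispatcher. A minimal version with hypothetical names:
  //
  //   #include <stdexcept>
  //
  //   bool backend_supports(int dim) { return dim == 4; }  // stand-in predicate
  //
  //   void backend_grid_sampler(int dim) {
  //     if (!backend_supports(dim)) {
  //       throw std::invalid_argument("Invalid arguments to backend_grid_sampler");
  //     }
  //     // ... safe to launch the backend kernel here ...
  //   }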
+ check_grid_sampler_common(input_t, grid_t); + TORCH_CHECK( + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_backward"); + auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); auto grad_output_contig = contiguousIfZeroInStrides(grad_output_t); diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index a80fc4fe0335..29430b38e74e 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -753,19 +753,61 @@ namespace { } } - cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input) { + inline bool use_rnn_persist_small_h(const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors, + bool forward) { +#if CUDNN_VERSION >= 8201 // 8.2.1 + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 6) return false; + + if (forward) { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 384; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 192; + } + } else /* backward */ { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 256; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 128; + } + } + + return false; +#else + return false; +#endif + } + + cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input, bool forward) { // LSTM with projections only works with standard algorithm if (rnn.proj_size != 0) { return CUDNN_RNN_ALGO_STANDARD; } - if (getCudnnDataType(input) == CUDNN_DATA_HALF && - !tensors.is_input_packed()) { - if (use_persist_common_heuristics(rnn, tensors) && - use_persist_device_heuristics(rnn, tensors)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC; + // Persistent algos typically don't work for packed inputs with sequence lengths that vary + // across batch elements, and will return CUDNN_STATUS_NOT_SUPPORTED if attempted. 
See + // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions + if (!tensors.is_input_packed()) { + auto cudnnDataType = getCudnnDataType(input); +#if CUDNN_VERSION >= 8201 // 8.2.1 + if (cudnnDataType != CUDNN_DATA_DOUBLE) { + if (use_rnn_persist_small_h(rnn, tensors, forward)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H; + } + } +#endif + if (cudnnDataType == CUDNN_DATA_HALF) { + if (use_persist_common_heuristics(rnn, tensors) && + use_persist_device_heuristics(rnn, tensors)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC; + } } } + return CUDNN_RNN_ALGO_STANDARD; } @@ -970,7 +1012,7 @@ std::tuple _cudnn_rnn( auto y = output; auto handle = getCudnnHandle(); - cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); + cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, true); fn.rnn.set_algo(algo); RNNDescriptors descs(fn, handle, x, y, hx, cx); @@ -1131,7 +1173,7 @@ std::tuple _cudnn_rnn_backward_input( TORCH_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), "Gradients aren't CUDA tensors"); - cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); + cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, false); fn.rnn.set_algo(algo); RNNDescriptors descs(fn, handle, x, y, hx, cx); @@ -1234,7 +1276,7 @@ std::vector _cudnn_rnn_backward_weight( const auto& y = output; auto dw = at::zeros(weight_buf.sizes(), weight_buf.options()); - cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input); + cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, false); fn.rnn.set_algo(algo); RNNDescriptors descs(fn, handle, x, y, hx, cx); diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 5533780a4547..db1d82f84fef 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -16,6 +16,39 @@ namespace at { namespace native { +void check_group_norm_inputs( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + int64_t C, + int64_t num_groups) { + TORCH_CHECK( + num_groups > 0, + "Expected num groups to be greater than 0, got ", num_groups); + TORCH_CHECK( + C % num_groups == 0, + "Expected number of channels in input to be divisible by ", + "num_groups, but got input of shape ", + input.sizes(), + " and " + "num_groups=", + num_groups); + TORCH_CHECK( + !weight.defined() || (weight.dim() == 1 && weight.numel() == C), + "Expected weight to be a vector of size equal to the number of ", + "channels in input, but got weight of shape ", + weight.sizes(), + " and input of shape ", + input.sizes()); + TORCH_CHECK( + !bias.defined() || (bias.dim() == 1 && bias.numel() == C), + "Expected bias to be a vector of size equal to the number of ", + "channels in input, but got bias of shape ", + weight.sizes(), + " and input of shape ", + input.sizes()); +} + std::tuple native_group_norm( const Tensor& X, const c10::optional& gamma_opt /* optional */, @@ -31,6 +64,9 @@ std::tuple native_group_norm( const Tensor& gamma = *gamma_maybe_owned; const Tensor& beta = c10::value_or_else(beta_opt, [] { return Tensor(); }); + // repeated check so expanded weights can call native_group_norm directly but + // save mean and variance from forward + check_group_norm_inputs(X, gamma, beta, C, group); auto memory_format = X.device().is_cpu() ? 
X.suggest_memory_format() : at::MemoryFormat::Contiguous; @@ -128,28 +164,7 @@ Tensor group_norm( const int64_t N = input.size(0); const int64_t C = input.size(1); - TORCH_CHECK( - C % num_groups == 0, - "Expected number of channels in input to be divisible by ", - "num_groups, but got input of shape ", - input.sizes(), - " and " - "num_groups=", - num_groups); - TORCH_CHECK( - !weight.defined() || (weight.dim() == 1 && weight.numel() == C), - "Expected weight to be a vector of size equal to the number of ", - "channels in input, but got weight of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); - TORCH_CHECK( - !bias.defined() || (bias.dim() == 1 && bias.numel() == C), - "Expected bias to be a vector of size equal to the number of ", - "channels in input, but got bias of shape ", - weight.sizes(), - " and input of shape ", - input.sizes()); + check_group_norm_inputs(input, weight, bias, C, num_groups); const auto input_shape = input.sizes(); const int64_t HxW = diff --git a/aten/src/ATen/native/group_norm.h b/aten/src/ATen/native/group_norm.h index 58fc0867b1ac..1673df9253ee 100644 --- a/aten/src/ATen/native/group_norm.h +++ b/aten/src/ATen/native/group_norm.h @@ -1,9 +1,11 @@ #pragma once -#include #include +#include namespace at { +class Tensor; + namespace native { using forward_fn = void (*)( diff --git a/aten/src/ATen/native/im2col.h b/aten/src/ATen/native/im2col.h index 854052145d54..c3daed3d4ffc 100644 --- a/aten/src/ATen/native/im2col.h +++ b/aten/src/ATen/native/im2col.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -26,27 +28,59 @@ static void im2col( const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w, - T* data_col) { + T* data_col, + bool is_channels_last = false) { const int64_t height_col = output_height; const int64_t width_col = output_width; const int64_t channels_col = channels * kernel_h * kernel_w; - for (const auto c_col : c10::irange(channels_col)) { - int64_t w_offset = c_col % kernel_w; - int64_t h_offset = (c_col / kernel_w) % kernel_h; - int64_t c_im = c_col / kernel_h / kernel_w; + if (is_channels_last) { + at::parallel_for(0, height_col * width_col, 0, [&](int64_t begin, int64_t end) { + int64_t h_col{0}, w_col{0}; + data_index_init(begin, h_col, height_col, w_col, width_col); - for (const auto h_col : c10::irange(height_col)) { - int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto i_col : c10::irange(begin, end)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; - for (const auto w_col : c10::irange(width_col)) { - int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; - data_col[(c_col * height_col + h_col) * width_col + w_col] = - (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) - ? 
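The new check_group_norm_inputs above centralizes the group-norm shape constraints: a positive group count, a channel count divisible by the group count, and optional 1-D affine parameters of length C. A tiny standalone illustration of the same constraints with concrete numbers (plain C++, no ATen; names are illustrative):

```cpp
#include <cstdio>
#include <stdexcept>

// The same rules check_group_norm_inputs enforces, reduced to integers.
void check_group_norm_shape(long C, long num_groups, long weight_numel, long bias_numel) {
  if (num_groups <= 0) throw std::invalid_argument("num_groups must be > 0");
  if (C % num_groups != 0) throw std::invalid_argument("C must be divisible by num_groups");
  if (weight_numel != 0 && weight_numel != C) throw std::invalid_argument("weight must have C elements");
  if (bias_numel != 0 && bias_numel != C) throw std::invalid_argument("bias must have C elements");
}

int main() {
  check_group_norm_shape(/*C=*/6, /*num_groups=*/3, 6, 6);      // ok: 2 channels per group
  try {
    check_group_norm_shape(/*C=*/6, /*num_groups=*/4, 6, 6);    // rejected: 6 % 4 != 0
  } catch (const std::exception& e) {
    std::printf("rejected: %s\n", e.what());
  }
}
```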
data_im[(c_im * height + h_im) * width + w_im] - : static_cast<T>(0); + const T* slice_im = data_im + (h_im * width + w_im) * channels; + T* slice_col = data_col + (i_col * kernel_h * kernel_w + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::copy_n(slice_im, channels, slice_col); + } else { + std::fill_n(slice_col, channels, T(0)); + } + } + } + + // move to the next index + data_index_step(h_col, height_col, w_col, width_col); } - } + }); + } else { + at::parallel_for(0, channels_col, 0, [&](int64_t begin, int64_t end) { + int64_t c_im{0}, h_offset{0}, w_offset{0}; + data_index_init(begin, c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + + for (const auto c_col : c10::irange(begin, end)) { + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im[(c_im * height + h_im) * width + w_im] + : static_cast<T>(0); + } + } + + // move to the next index + data_index_step(c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + } + }); } } @@ -66,27 +100,48 @@ static void col2im( const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w, - T* data_im) { + T* data_im, + bool is_channels_last = false) { std::fill_n(data_im, height * width * channels, T(0)); const int64_t height_col = output_height; const int64_t width_col = output_width; const int64_t channels_col = channels * kernel_h * kernel_w; - for (const auto c_col : c10::irange(channels_col)) { - int64_t w_offset = c_col % kernel_w; - int64_t h_offset = (c_col / kernel_w) % kernel_h; - int64_t c_im = c_col / kernel_h / kernel_w; - + if (is_channels_last) { for (const auto h_col : c10::irange(height_col)) { - int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - for (const auto w_col : c10::irange(width_col)) { - int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + T* slice_im = data_im + (h_im * width + w_im) * channels; + const T* slice_col = data_col + ((h_col * width_col + w_col) * kernel_h * kernel_w + + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) { + std::transform(slice_col, slice_col + channels, slice_im, slice_im, std::plus<T>()); + } + } + } + } + } } else { + for (const auto c_col : c10::irange(channels_col)) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; - if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) - data_im[(c_im * height + h_im) * width + w_im] += - data_col[(c_col * height_col +
h_col) * width_col + w_col]; + } } } } diff --git a/aten/src/ATen/native/im2col_shape_check.h b/aten/src/ATen/native/im2col_shape_check.h index 84de7aa4c4f5..45fc96ea8443 100644 --- a/aten/src/ATen/native/im2col_shape_check.h +++ b/aten/src/ATen/native/im2col_shape_check.h @@ -1,4 +1,5 @@ -#include +#pragma once +#include #include namespace at { diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp index c6b9b6d5c26a..16da001d3a16 100644 --- a/aten/src/ATen/native/layer_norm.cpp +++ b/aten/src/ATen/native/layer_norm.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -18,7 +19,7 @@ namespace at { namespace native { -void layer_norm_cpu_out( +void layer_norm_with_mean_rstd_out( at::Tensor& out, at::Tensor& mean, at::Tensor& rstd, @@ -50,6 +51,20 @@ void layer_norm_cpu_out( rstd = rstd.view(stat_shape); } +void layer_norm_cpu_out( + at::Tensor& out, + const at::Tensor& input, + const Tensor& gamma, + const Tensor& beta, + double eps, + int64_t M, + int64_t N) { + if (M <= 0) { + return; + } + LayerNormKernel(kCPU, input, gamma, beta, M, N, eps, &out, /*mean=*/nullptr, /*rstd=*/nullptr); +} + std::tuple layer_norm_cpu( const Tensor& input, IntArrayRef normalized_shape, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, @@ -60,6 +75,7 @@ std::tuple layer_norm_cpu( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; + bool mixed_type = is_mixed_type(input, weight, bias); auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias); auto M = M_N.first; @@ -75,10 +91,11 @@ std::tuple layer_norm_cpu( c10::nullopt /* device */, c10::nullopt /* pin_memory */, at::MemoryFormat::Contiguous); - Tensor mean = at::empty({M}, X->options()); - Tensor rstd = at::empty({M}, X->options()); + const auto dtype = param_scalar_type(input, mixed_type); + Tensor mean = at::empty({M}, X->options().dtype(dtype)); + Tensor rstd = at::empty({M}, X->options().dtype(dtype)); - layer_norm_cpu_out(Y, mean, rstd, *X, normalized_shape, *gamma, *beta, eps, M, N); + layer_norm_with_mean_rstd_out(Y, mean, rstd, *X, normalized_shape, *gamma, *beta, eps, M, N); return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd)); } diff --git a/aten/src/ATen/native/layer_norm.h b/aten/src/ATen/native/layer_norm.h index f4ef0351095a..629bc9ab3906 100644 --- a/aten/src/ATen/native/layer_norm.h +++ b/aten/src/ATen/native/layer_norm.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -65,10 +65,7 @@ C10_ALWAYS_INLINE std::pair _check_layer_norm_inputs( void layer_norm_cpu_out( at::Tensor& out, - at::Tensor& mean, - at::Tensor& rstd, const at::Tensor& input, - IntArrayRef normalized_shape, const Tensor& gamma, const Tensor& beta, double eps, diff --git a/aten/src/ATen/native/metal/MetalContext.h b/aten/src/ATen/native/metal/MetalContext.h index ca58eb9a433a..7954c129dbdb 100644 --- a/aten/src/ATen/native/metal/MetalContext.h +++ b/aten/src/ATen/native/metal/MetalContext.h @@ -3,8 +3,7 @@ #import #include -API_AVAILABLE(ios(10.0), macos(10.13)) -// TODO[T79947194]: Convert this class to C++ +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MetalContext : NSObject @property(nonatomic, strong, readonly) id device; @property(nonatomic, strong, readonly) id commandQueue; diff --git a/aten/src/ATen/native/metal/MetalNeuronType.h b/aten/src/ATen/native/metal/MetalNeuronType.h index 8ae5b3a8b341..b59d163c4ae8 100644 --- 
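Both parallelized im2col paths above walk a flattened index range and recover the loop counters with data_index_init/data_index_step, so each worker thread can start mid-range without replaying the outer loops. Those helpers are ATen internals; the sketch below is a simplified standalone analogue of that mixed-radix decomposition for the (c_im, h_offset, w_offset) case and only approximates their behavior:

```cpp
#include <cstdint>
#include <cstdio>

// Decompose a flat index into (c, kh, kw) with kw the fastest-moving dimension,
// matching the iteration order of the contiguous im2col path above.
void index_init(int64_t flat, int64_t& c, int64_t /*C*/, int64_t& kh, int64_t KH,
                int64_t& kw, int64_t KW) {
  kw = flat % KW; flat /= KW;
  kh = flat % KH; flat /= KH;
  c  = flat;  // slowest dimension absorbs whatever remains
}

// Advance the counters by one, carrying like a mixed-radix odometer.
void index_step(int64_t& c, int64_t /*C*/, int64_t& kh, int64_t KH, int64_t& kw, int64_t KW) {
  if (++kw == KW) { kw = 0; if (++kh == KH) { kh = 0; ++c; } }
}

int main() {
  int64_t C = 2, KH = 2, KW = 3;
  int64_t c, kh, kw;
  index_init(/*flat=*/7, c, C, kh, KH, kw, KW);  // 7 == (c*KH + kh)*KW + kw == (1*2 + 0)*3 + 1
  std::printf("start: c=%lld kh=%lld kw=%lld\n", (long long)c, (long long)kh, (long long)kw);
  for (int i = 0; i < 3; ++i) {
    index_step(c, C, kh, KH, kw, KW);
    std::printf("step:  c=%lld kh=%lld kw=%lld\n", (long long)c, (long long)kh, (long long)kw);
  }
}
```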
a/aten/src/ATen/native/metal/MetalNeuronType.h +++ b/aten/src/ATen/native/metal/MetalNeuronType.h @@ -37,7 +37,7 @@ static inline NeuronType neuronType( } } -static inline MPSCNNNeuron* neuronType(NeuronType type) { +static inline MPSCNNNeuron* neuron(NeuronType type) { if (type == NeuronType::Relu) { return [MPSCNNNeuronOp relu]; } else if (type == NeuronType::Sigmoid) { @@ -45,16 +45,27 @@ static inline MPSCNNNeuron* neuronType(NeuronType type) { } else if (type == NeuronType::Tanh) { return [MPSCNNNeuronOp tanh]; } else if (type == NeuronType::HardSigmoid) { - if (@available(iOS 11.0, *)) { - return [MPSCNNNeuronOp hardSigmoid]; - } else { - return nil; - } + return [MPSCNNNeuronOp hardSigmoid]; } else { return nil; } } +API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0)) +static inline MPSNNNeuronDescriptor* neuronDescriptor(NeuronType type) { + if (type == NeuronType::Relu) { + return [MPSCNNNeuronOpDescriptor reluDescriptor]; + } else if (type == NeuronType::Sigmoid) { + return [MPSCNNNeuronOpDescriptor sigmoidDescriptor]; + } else if (type == NeuronType::Tanh) { + return [MPSCNNNeuronOpDescriptor tanhDescriptor]; + } else if (type == NeuronType::HardSigmoid) { + return [MPSCNNNeuronOpDescriptor hardSigmoidDescriptor]; + } else { + return [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeNone]; + } +} + } } } diff --git a/aten/src/ATen/native/metal/MetalShaders.h b/aten/src/ATen/native/metal/MetalShaders.h index 0ee703f2ee26..edd7ba46d086 100644 --- a/aten/src/ATen/native/metal/MetalShaders.h +++ b/aten/src/ATen/native/metal/MetalShaders.h @@ -464,31 +464,6 @@ kernel void reflection_pad2d(texture2d_array in_arr[[texture } } -constant bool resize_is_arr = (ushort_arg_4 > 1 || ushort_arg_5 > 4); -constant bool resize_is_tex = !resize_is_arr; -kernel void resize_nearest(texture2d_array in_arr[[texture(0), function_constant(resize_is_arr)]], - texture2d in_tex[[texture(0), function_constant(resize_is_tex)]], - texture2d_array out_arr[[texture(1), function_constant(resize_is_arr)]], - texture2d out_tex[[texture(1), function_constant(resize_is_tex)]], - ushort3 gid[[thread_position_in_grid]]) { - const ushort oH = ushort_arg_0; - const ushort oW = ushort_arg_1; - if (gid.x >= oW || gid.y >= oH) { - return; - } - const float height_scale = float(ushort_arg_2) / 10000; - const float width_scale = float(ushort_arg_3) / 10000; - constexpr sampler s(coord::pixel, address::clamp_to_edge, filter::nearest); - const int in_y = (int)(gid.y / height_scale); - const int in_x = (int)(gid.x / width_scale); - if(resize_is_arr) { - out_arr.write(in_arr.sample(s, float2(in_x, in_y), gid.z), gid.xy, gid.z); - } else { - out_tex.write(in_tex.sample(s, float2(in_x, in_y)), gid.xy); - } -} - - constant bool reshape_out_is_arr = (ushort_arg_3 > 1 || ushort_arg_2 > 4); constant bool reshape_out_is_tex = !reshape_out_is_arr; constant bool reshape_in_is_arr = (ushort_arg_7 > 1 || ushort_arg_6 > 4); diff --git a/aten/src/ATen/native/metal/MetalTensorImpl.h b/aten/src/ATen/native/metal/MetalTensorImpl.h index 865e466a8de7..799f7ef3bd11 100644 --- a/aten/src/ATen/native/metal/MetalTensorImpl.h +++ b/aten/src/ATen/native/metal/MetalTensorImpl.h @@ -23,11 +23,11 @@ struct TORCH_API MetalTensorImpl : public OpaqueTensorImpl { opaque_handle, sizes), strides_(strides.vec()) { - TensorImpl::set_has_contiguity_policy( - TensorImpl::HasContiguityPolicy::CustomBehavior); } - IntArrayRef strides() const override { + // TODO: manually storing strides here is dumb + + IntArrayRef strides_custom() const 
override { return strides_; } @@ -35,11 +35,6 @@ struct TORCH_API MetalTensorImpl : public OpaqueTensorImpl { return true; } - int64_t stride(int64_t d) const override { - d = at::maybe_wrap_dim(d, this->dim(), false); - return strides_[d]; - } - private: const char* tensorimpl_type_name() const override { return "MetalTensorImpl"; diff --git a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm index cd73ba4eddb3..f614429eefdd 100644 --- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm +++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm @@ -10,7 +10,7 @@ namespace native { namespace metal { -class API_AVAILABLE(ios(10.0), macos(10.13)) MetalTensorImplStorage::Impl { +class API_AVAILABLE(ios(11.0), macos(10.13)) MetalTensorImplStorage::Impl { public: Impl(const std::vector& sizes, const std::vector& strides) : _sizes(sizes), @@ -93,7 +93,7 @@ void copy_data_to_host(float* host) { impl()->copy_data_to_host(hostData); } -API_AVAILABLE(ios(10.0)) +API_AVAILABLE(ios(11.0)) MPSImageWrapper* MetalTensorImplStorage::texture() const { return impl()->texture(); } diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h index 517b00061f61..d26e358a3523 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.h @@ -3,7 +3,7 @@ #import #import -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MPSCNNConvDataSource : NSObject @property(nonatomic, assign) void* weights; @property(nonatomic, assign) float* bias; @@ -15,7 +15,7 @@ API_AVAILABLE(ios(10.0), macos(10.13)) @end using namespace at::native::metal; -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MPSCNNConvOp : NSObject + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params weights:(float*)w diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm index 83fd0d3c6c6d..adf9e1b75c2d 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm @@ -1,7 +1,7 @@ -#import #import #import #import +#import #include @@ -14,7 +14,7 @@ @implementation MPSCNNConvDataSource { - (id)initWithWeights:(void*)weights Bias:(float*)bias Desc:(MPSCNNConvolutionDescriptor*)desc - API_AVAILABLE(ios(10.0), macos(10.13)) { + API_AVAILABLE(ios(11.0), macos(10.13)) { self = [super init]; if (self) { _weights = (float*)weights; @@ -36,7 +36,7 @@ - (float* _Nullable)biasTerms { return _bias; } -- (MPSDataType)dataType API_AVAILABLE(ios(10.0), macos(10.13)) { +- (MPSDataType)dataType API_AVAILABLE(ios(11.0), macos(10.13)) { return MPSDataTypeFloat32; } @@ -71,7 +71,7 @@ @implementation MPSCNNConvOp { + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params weights:(float*)w bias:(float*)b - neuronFilter:(NeuronType)t API_AVAILABLE(ios(10.0), macos(10.13)) { + neuronFilter:(NeuronType)t API_AVAILABLE(ios(11.0), macos(10.13)) { using namespace at::native::metal::mpscnn; TORCH_CHECK( params.DX == params.DY == 1, "Dilated convolution is not supported yet."); @@ -79,7 +79,7 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params const int64_t iC = params.C; const int64_t kH = params.KH; const int64_t kW = params.KW; - MPSCNNNeuron* neuron = neuronType(t); + MPSCNNNeuron* neuron = at::native::metal::neuron(t); MPSCNNConvolutionDescriptor* desc = nil; if (params.isDepthwise()) { if (@available(iOS 11.0, *)) { @@ -87,9 
+87,14 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params cnnConvolutionDescriptorWithKernelWidth:kW kernelHeight:kH inputFeatureChannels:iC - outputFeatureChannels:oC - neuronFilter:neuron]; + outputFeatureChannels:oC]; + desc.groups = 1; +#if TARGET_OS_MACCATALYST + desc.fusedNeuronDescriptor = at::native::metal::neuronDescriptor(t); +#else + desc.neuron = neuron; +#endif } else { TORCH_CHECK( false, @@ -103,13 +108,23 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params channels in each group to be multiple of 4 for \ group > 1."); } - desc = [MPSCNNConvolutionDescriptor - cnnConvolutionDescriptorWithKernelWidth:kW - kernelHeight:kH - inputFeatureChannels:iC - outputFeatureChannels:oC - neuronFilter:neuron]; - desc.groups = params.G; + if (@available(iOS 11.0, *)) { + desc = [MPSCNNConvolutionDescriptor + cnnConvolutionDescriptorWithKernelWidth:kW + kernelHeight:kH + inputFeatureChannels:iC + outputFeatureChannels:oC]; + desc.groups = params.G; +#if TARGET_OS_MACCATALYST + desc.fusedNeuronDescriptor = at::native::metal::neuronDescriptor(t); +#else + desc.neuron = neuron; +#endif + } else { + TORCH_CHECK( + false, + "MPSCNNConvolutionDescriptor is only available on iOS 11.0 and above"); + } } desc.strideInPixelsX = params.SX; desc.strideInPixelsY = params.SY; @@ -124,15 +139,8 @@ + (MPSCNNConvOp*)conv2d:(const Conv2DParams&)params weights:dataSource]; } else { -#if TARGET_OS_IPHONE - // Fallback on earlier versions - conv = [[MPSCNNConvolution alloc] - initWithDevice:[MetalContext sharedInstance].device - convolutionDescriptor:desc - kernelWeights:w - biasTerms:b - flags:MPSCNNConvolutionFlagsNone]; -#endif + TORCH_CHECK( + false, "MPSCNNConvolution is only available on iOS 11.0 and above"); } [conv setEdgeMode:MPSImageEdgeModeZero]; MPSOffset offset; diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h index 91c01ce227f5..297b180f59c4 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.h @@ -4,7 +4,7 @@ #import using namespace at::native::metal; -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) @interface MPSCNNFullyConnectedOp : NSObject + (MPSCNNFullyConnectedOp*)linear:(const Conv2DParams&)params weights:(float*)w diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm index 7e4d5974bbb8..353095a8f52f 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm @@ -10,14 +10,18 @@ + (MPSCNNFullyConnectedOp*)linear:(const Conv2DParams&)params weights:(float*)w bias:(float*)b neuronFilter:(NeuronType)t - API_AVAILABLE(ios(10.0), macos(10.13)) { - MPSCNNNeuron* neuron = neuronType(t); + API_AVAILABLE(ios(11.0), macos(10.13)) { + MPSCNNNeuron* neuron = at::native::metal::neuron(t); MPSCNNConvolutionDescriptor* desc = [MPSCNNConvolutionDescriptor cnnConvolutionDescriptorWithKernelWidth:params.KW kernelHeight:params.KH inputFeatureChannels:params.IC - outputFeatureChannels:params.OC - neuronFilter:neuron]; + outputFeatureChannels:params.OC]; +#if TARGET_OS_MACCATALYST + desc.fusedNeuronDescriptor = at::native::metal::neuronDescriptor(t); +#else + desc.neuron = neuron; +#endif desc.strideInPixelsX = 1; desc.strideInPixelsY = 1; @@ -31,14 +35,9 @@ + (MPSCNNFullyConnectedOp*)linear:(const Conv2DParams&)params initWithDevice:[MetalContext 
sharedInstance].device weights:ds]; } else { -#if TARGET_OS_IPHONE - fc = [[MPSCNNFullyConnected alloc] - initWithDevice:[MetalContext sharedInstance].device - convolutionDescriptor:desc - kernelWeights:w - biasTerms:b - flags:MPSCNNConvolutionFlagsNone]; -#endif + TORCH_CHECK( + false, + "MPSCNNFullyConnectedOp is only available on iOS 11.0 and above"); } [fc setClipRect:MTLRegionMake3D(0, 0, 0, 1, 1, params.N)]; [fc setOffset:{.x = static_cast(params.W / 2), diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h index 2e2dee8b022c..e1a9b2617bd3 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.h @@ -8,3 +8,13 @@ + (MPSCNNNeuronTanH*)tanh; @end + +API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0)) +@interface MPSCNNNeuronOpDescriptor : NSObject + ++ (MPSNNNeuronDescriptor*)hardSigmoidDescriptor; ++ (MPSNNNeuronDescriptor*)reluDescriptor; ++ (MPSNNNeuronDescriptor*)sigmoidDescriptor; ++ (MPSNNNeuronDescriptor*)tanhDescriptor; + +@end diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm index 5e208731c88c..1b322f9a97e9 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm @@ -4,6 +4,10 @@ @implementation MPSCNNNeuronOp + (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static dispatch_once_t onceToken; static MPSCNNNeuronHardSigmoid* neuron = nil; dispatch_once(&onceToken, ^{ @@ -13,9 +17,14 @@ + (MPSCNNNeuronHardSigmoid*)hardSigmoid API_AVAILABLE(ios(11.0), macos(10.13)) { b:0.5]; }); return neuron; +#endif } + (MPSCNNNeuronReLU*)relu { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static MPSCNNNeuronReLU* relu = nil; static dispatch_once_t onceToken; dispatch_once(&onceToken, ^{ @@ -24,9 +33,14 @@ + (MPSCNNNeuronReLU*)relu { a:0]; }); return relu; +#endif } + (MPSCNNNeuronSigmoid*)sigmoid { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static dispatch_once_t onceToken; static MPSCNNNeuronSigmoid* sigmoid = nil; dispatch_once(&onceToken, ^{ @@ -34,9 +48,14 @@ + (MPSCNNNeuronSigmoid*)sigmoid { initWithDevice:[MetalContext sharedInstance].device]; }); return sigmoid; +#endif } + (MPSCNNNeuronTanH*)tanh { +// Remove this once we support iOS 11.3 +#if TARGET_OS_MACCATALYST + return nil; +#else static dispatch_once_t onceToken; static MPSCNNNeuronTanH* tanh = nil; dispatch_once(&onceToken, ^{ @@ -46,6 +65,57 @@ + (MPSCNNNeuronTanH*)tanh { b:1]; }); return tanh; +#endif +} + +@end + +API_AVAILABLE(ios(11.3), macos(10.13), macCatalyst(13.0)) +@implementation MPSCNNNeuronOpDescriptor + ++ (MPSNNNeuronDescriptor*)hardSigmoidDescriptor { + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = [MPSNNNeuronDescriptor + cnnNeuronDescriptorWithType:MPSCNNNeuronTypeHardSigmoid + a:1.0 / 6.0 + b:0.5]; + }); + return neuronDesc; +} + ++ (MPSNNNeuronDescriptor*)reluDescriptor { + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = + [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeReLU + a:0]; + }); + return neuronDesc; +} + ++ (MPSNNNeuronDescriptor*)sigmoidDescriptor 
{ + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = [MPSNNNeuronDescriptor + cnnNeuronDescriptorWithType:MPSCNNNeuronTypeSigmoid]; + }); + return neuronDesc; +} + ++ (MPSNNNeuronDescriptor*)tanhDescriptor { + static dispatch_once_t onceToken; + static MPSNNNeuronDescriptor* neuronDesc = nil; + dispatch_once(&onceToken, ^{ + neuronDesc = + [MPSNNNeuronDescriptor cnnNeuronDescriptorWithType:MPSCNNNeuronTypeTanH + a:1.0 + b:1.0]; + }); + return neuronDesc; } @end diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h b/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h index 809518ef1a80..13264d097e92 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.h @@ -2,6 +2,24 @@ #import #include +// This is a utility macro that can be used to throw an exception when a Metal +// API function produces a NSError. The exception will contain a message with +// useful info extracted from the NSError. +#define METAL_THROW_IF_ERROR(error, preamble) \ + do { \ + if C10_LIKELY(error) { \ + throw c10::Error( \ + {__func__, __FILE__, static_cast(__LINE__)}, \ + c10::str( \ + preamble, \ + " Error details: ", \ + " Localized_description: ", error.localizedDescription.UTF8String, \ + " Domain: ", error.domain.UTF8String, \ + " Code: ", error.code, \ + " User Info: ", error.userInfo.description.UTF8String)); \ + } \ + } while (false) + namespace at { namespace native { namespace metal { @@ -13,12 +31,12 @@ struct LaunchParams { MTLSize threadsPerGrid; // iOS 11.0 }; -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) LaunchParams spatialPointwiseKernelLaunchParams( id pipeline, MPSImage* im); -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) LaunchParams spatialPointwiseKernelLaunchParams( id pipeline, NSUInteger numberOfImages, @@ -26,7 +44,7 @@ LaunchParams spatialPointwiseKernelLaunchParams( NSUInteger height, NSUInteger width); -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) static inline std::string kernelFor( MPSImage* image, const std::string& arrayKernel, diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h index bba2a525429a..33de62301ef5 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.h @@ -9,7 +9,7 @@ namespace at { namespace native { namespace metal { -class API_AVAILABLE(ios(10.0), macos(10.13)) MPSImageWrapper { +class API_AVAILABLE(ios(11.0), macos(10.13)) MPSImageWrapper { public: MPSImageWrapper(IntArrayRef sizes); ~MPSImageWrapper(); diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm index 287f94dde778..d5a9632d26c9 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm @@ -32,12 +32,13 @@ - (void)beginSynchronization { } - (void)endSynchronization:(NSError*)error { + // if something went wrong during command buffer execution if (error) { if (_imageWrapper) { _imageWrapper->release(); } - // throw exceptions if we failed to flush the command buffer - TORCH_CHECK(error); + // throw an exception with error details + METAL_THROW_IF_ERROR(error, "Command buffer execution failed!"); } } diff --git a/aten/src/ATen/native/metal/ops/MetalAddmm.mm b/aten/src/ATen/native/metal/ops/MetalAddmm.mm index 
94e1add60b30..e0c196ac68b3 100644 --- a/aten/src/ATen/native/metal/ops/MetalAddmm.mm +++ b/aten/src/ATen/native/metal/ops/MetalAddmm.mm @@ -16,7 +16,7 @@ namespace native { namespace metal { -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor addmm( const Tensor& bias, const Tensor& input, diff --git a/aten/src/ATen/native/metal/ops/MetalConcat.mm b/aten/src/ATen/native/metal/ops/MetalConcat.mm index 14b4ce7dbfc1..c43bf055fa2e 100644 --- a/aten/src/ATen/native/metal/ops/MetalConcat.mm +++ b/aten/src/ATen/native/metal/ops/MetalConcat.mm @@ -203,7 +203,7 @@ Tensor cat(const TensorList tensors, int64_t dim) { } TORCH_LIBRARY_IMPL(aten, Metal, m) { - m.impl(TORCH_SELECTIVE_NAME("aten::_cat"), TORCH_FN(cat)); + m.impl(TORCH_SELECTIVE_NAME("aten::cat"), TORCH_FN(cat)); } } diff --git a/aten/src/ATen/native/metal/ops/MetalNeurons.mm b/aten/src/ATen/native/metal/ops/MetalNeurons.mm index 03a5de0851ad..ca925d9b841b 100644 --- a/aten/src/ATen/native/metal/ops/MetalNeurons.mm +++ b/aten/src/ATen/native/metal/ops/MetalNeurons.mm @@ -51,19 +51,19 @@ Tensor neuronKernel(const Tensor& input, MPSCNNNeuron* neuron) { return input; } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor relu(const Tensor& input) { TORCH_CHECK(input.is_metal()); return neuronKernel(input, [MPSCNNNeuronOp relu]); } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor& relu_(Tensor& input) { TORCH_CHECK(input.is_metal()); return neuronKernel_(input, [MPSCNNNeuronOp relu]); } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor sigmoid(const Tensor& input) { return neuronKernel(input, [MPSCNNNeuronOp sigmoid]); } @@ -74,7 +74,7 @@ Tensor sigmoid(const Tensor& input) { return neuronKernel_(input, [MPSCNNNeuronOp hardSigmoid]); } -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor tanh(const Tensor& input) { TORCH_CHECK(input.is_metal()); return neuronKernel(input, [MPSCNNNeuronOp tanh]); @@ -85,9 +85,7 @@ Tensor tanh(const Tensor& input) { m.impl(TORCH_SELECTIVE_NAME("aten::relu"), TORCH_FN(relu)); m.impl(TORCH_SELECTIVE_NAME("aten::relu_"), TORCH_FN(relu_)); m.impl(TORCH_SELECTIVE_NAME("aten::sigmoid"), TORCH_FN(sigmoid)); - if (@available(iOS 11.0, *)) { - m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_)); - } + m.impl(TORCH_SELECTIVE_NAME("aten::hardsigmoid_"), TORCH_FN(hardsigmoid_)); }; } diff --git a/aten/src/ATen/native/metal/ops/MetalPadding.mm b/aten/src/ATen/native/metal/ops/MetalPadding.mm index ca62cfc6de65..4edd4a04bbde 100644 --- a/aten/src/ATen/native/metal/ops/MetalPadding.mm +++ b/aten/src/ATen/native/metal/ops/MetalPadding.mm @@ -13,7 +13,7 @@ namespace native { namespace metal { -// API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor reflection_pad2d(const Tensor& input, IntArrayRef padding) { TORCH_CHECK(input.is_metal()); diff --git a/aten/src/ATen/native/metal/ops/MetalPooling.mm b/aten/src/ATen/native/metal/ops/MetalPooling.mm index 056602d381b8..5e3b9110756e 100644 --- a/aten/src/ATen/native/metal/ops/MetalPooling.mm +++ b/aten/src/ATen/native/metal/ops/MetalPooling.mm @@ -15,7 +15,7 @@ namespace native { namespace metal { -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor max_pool2d( const Tensor& input, IntArrayRef kernel_size, @@ -70,7 +70,7 @@ Tensor max_pool2d( return output; } -API_AVAILABLE(ios(10.0), macos(10.13)) 
+API_AVAILABLE(ios(11.0), macos(10.13)) Tensor adaptive_avg_pool2d(const Tensor& input, IntArrayRef output_size) { // averages across the width and height, and outputs a 1x1xC image. TORCH_CHECK(output_size[0] == 1 && output_size[1] == 1); diff --git a/aten/src/ATen/native/metal/ops/MetalReshape.mm b/aten/src/ATen/native/metal/ops/MetalReshape.mm index ed74014a169e..37842ee3be59 100644 --- a/aten/src/ATen/native/metal/ops/MetalReshape.mm +++ b/aten/src/ATen/native/metal/ops/MetalReshape.mm @@ -15,7 +15,7 @@ namespace native { namespace metal { -API_AVAILABLE(ios(10.0), macos(10.13)) +API_AVAILABLE(ios(11.0), macos(10.13)) Tensor view(const Tensor& input, IntArrayRef size) { TORCH_CHECK(input.is_metal()); auto inferred_size = at::infer_size(size, input.numel()); diff --git a/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm b/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm index b3fb27f7619a..39524569bae5 100644 --- a/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm +++ b/aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm @@ -17,7 +17,7 @@ Tensor upsample_nearest2d_vec( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { TORCH_CHECK(input.is_metal()); auto osize = @@ -58,28 +58,9 @@ Tensor upsample_nearest2d_vec( sourceImage:X destinationImage:Y]; } else { - NSUInteger sh = scale_h.value() * 10000; - NSUInteger sw = scale_w.value() * 10000; - id state = - [[MetalContext sharedInstance] specializedPipelineState:"resize_nearest" - Constants:@[ - @(output_height), - @(output_width), - @(sh), - @(sw), - @(nbatch), - @(channels), - ]]; - id encoder = - [commandBuffer.buffer computeCommandEncoder]; - [encoder setComputePipelineState:state]; - [encoder setTexture:[X texture] atIndex:0]; - [encoder setTexture:[Y texture] atIndex:1]; - const auto& launchParams = - mpscnn::spatialPointwiseKernelLaunchParams(state, Y); - [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid - threadsPerThreadgroup:launchParams.threadsPerThreadgroup]; - [encoder endEncoding]; + TORCH_CHECK( + false, + "MPSCNNUpsamplingNearest is only available on iOS 11.0 and above"); } auto output = makeTensor(std::move(mt), input.options()); return output; diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 8fc00d850113..fc4587db9c34 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -135,52 +135,6 @@ Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { return t.narrow(dim, group_idx * group_size, group_size); } -// --------------------------------------------------------------------- -// -// Checking -// -// --------------------------------------------------------------------- - -// Used on pad, stride and dilation -static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) -{ - TORCH_CHECK(args.size() <= expected_size, - "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - TORCH_CHECK(args.size() >= expected_size, - "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - - auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); - if (num_negative_values > 0){ - std::stringstream ss; - ss << arg_name << " should be greater than zero but got ("; 
- std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - AT_ERROR(ss.str()); - } -} - -// see NOTE [ Convolution checks] in src/Aten/native/cudnn/Conv.cpp -static void convolution_shape_check( - CheckedFrom c, - const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) -{ - check_args(c, padding, input->dim() - 2, "padding"); - check_args(c, stride, padding.size(), "stride"); - check_args(c, dilation, padding.size(), "dilation"); - - // Input - checkDimRange(c, input, 3, 6 /* exclusive */); - checkSize(c, input, input_channels_dim, weight->size(1) * groups); - - // Weight - checkSameDim(c, input, weight); - - checkSameDim(c, input, output); -} - // This POD struct is used to let us easily compute hashes of the // parameters struct ConvolutionParams diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 9f53c1186ab3..b5a63dd803d1 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -352,7 +352,7 @@ std::pair, size_t> get_parameters(miopenHandle_t handle, con param_size /= elem_size; if(linear_id == 0 || linear_id == num_linear_layers / 2) { - const auto size = { static_cast(param_size * num_linear_layers / 2), 1L}; + std::initializer_list size = { static_cast(param_size * num_linear_layers / 2), 1L}; Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); params.emplace_back(std::move(param)); layer_params_count++; @@ -386,7 +386,7 @@ std::pair, size_t> get_parameters(miopenHandle_t handle, con bias_size /= elem_size; if(linear_id == 0 || linear_id == num_linear_layers / 2) { - const auto size = { static_cast(bias_size * num_linear_layers / 2), 1L}; + std::initializer_list size = { static_cast(bias_size * num_linear_layers / 2), 1L}; Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); params.emplace_back(std::move(param)); layer_params_count++; diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.h b/aten/src/ATen/native/mkl/LinearAlgebra.h index d5e4518e70bf..a536c193524e 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.h +++ b/aten/src/ATen/native/mkl/LinearAlgebra.h @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp index 35d583c64733..40557d478b15 100644 --- a/aten/src/ATen/native/mkl/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/mkl/SparseBlasImpl.cpp @@ -206,6 +206,58 @@ void addmm_dense_result( #endif } +/* + Computes a sparse matrix-sparse matrix product with dense result defined as + C <- alpha*(A*B) + beta*C + + Args: + * `A` - Sparse Tensor storing m x k matrix. + * `B` - Sparse Tensor storing k x n matrix. + * `C` - [in] Dense Tensor storing matrix of size m x n. + [out] result of the operation. +*/ +void addmm_sparse_input_dense_result( + const Tensor& A, + const Tensor& B, + const Scalar& beta, + const Scalar& alpha, + const Tensor& C) { +#if !AT_USE_MKL_SPARSE() + TORCH_CHECK( + false, + "Calling addmm on a sparse CPU tensor requires Linux platform. 
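A note on the sparse-times-sparse-into-dense addmm path introduced here: as the sketch below illustrates, the beta == 0 branch a little further on (C.zero_() instead of C.mul_(beta)) exists because scaling by a literal zero would still propagate NaN and Inf already present in C under IEEE 754 arithmetic:

```cpp
#include <cstdio>
#include <limits>

int main() {
  double inf = std::numeric_limits<double>::infinity();
  double nan = std::numeric_limits<double>::quiet_NaN();
  // Multiplying by beta == 0 does NOT erase non-finite values (prints nan; sign may vary)...
  std::printf("0 * inf = %f, 0 * nan = %f\n", 0.0 * inf, 0.0 * nan);
  // ...which is why the kernel discards the old contents outright when beta == 0.
  double c = inf;
  c = 0.0;  // the zero_() equivalent: the stale value is simply overwritten
  std::printf("after zeroing: %f\n", c);
}
```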
", + "Please use PyTorch built with MKL on Linux."); +#else + // MKL function computes C <- A*B + // So we need a temporary matrix to store the result + // and then add it to C + auto C_ = at::empty(C.sizes(), C.options()); + auto order = SPARSE_LAYOUT_ROW_MAJOR; + auto ldc = C_.stride(-2); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + C.scalar_type(), "addmm_sparse_input_dense_result", [&] { + auto mkl_A = at::mkl::sparse::MklSparseCsrDescriptor(A); + auto mkl_B = at::mkl::sparse::MklSparseCsrDescriptor(B); + at::mkl::sparse::spmmd( + SPARSE_OPERATION_NON_TRANSPOSE, + mkl_A.descriptor(), + mkl_B.descriptor(), + order, + C_.data_ptr(), + ldc); + }); + + // If beta is zero NaN and Inf should not be propagated to the result + if (beta.toComplexDouble() == 0.) { + C.zero_(); + } else { + C.mul_(beta); + } + C.add_(C_, alpha); +#endif +} + /* Computes a sparse matrix-sparse matrix product defined as C <- alpha*(A*B) + beta*C @@ -288,14 +340,22 @@ void addmm_out_sparse_csr( const Scalar& alpha, const Tensor& result) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.dim() == 2 && mat2.dim() == 2 && result.dim() == 2); - if (mat2.layout() == kStrided && result.layout() == kStrided) { + if ((mat1.layout() == kSparseCsr || mat1.layout() == kSparseBsr) && + mat2.layout() == kStrided && result.layout() == kStrided) { return addmm_dense_result(mat1, mat2, beta, alpha, result); - } else if (mat2.is_sparse_csr() && result.is_sparse_csr()) { + } + if (mat1.layout() == kStrided && mat2.is_sparse_csr() && result.layout() == kStrided) { + // TODO: We can use MKL's transposition flags once we have CSC support. + return addmm_dense_result(mat2.transpose(0, 1), mat1.transpose(0, 1), beta, alpha, result.transpose(0, 1)); + } + if (mat1.is_sparse_csr() && mat2.is_sparse_csr() && result.layout() == kStrided) { + return addmm_sparse_input_dense_result(mat1, mat2, beta, alpha, result); + } + if (mat1.is_sparse_csr() && mat2.is_sparse_csr() && result.is_sparse_csr()) { return addmm_sparse_result(mat1, mat2, beta, alpha, result); - } else { - TORCH_INTERNAL_ASSERT( - false, "addmm: Received unexpected tensor layouts as input."); } + TORCH_CHECK(false, "addmm: computation on CPU is not implemented for ", + result.layout(), " + ", mat1.layout(), " @ ", mat2.layout()); } /* diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index bcf8afe2a373..470c3a48e5e0 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -250,7 +250,7 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, auto in_sizes = self.sizes(); DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = last_dim_size; - auto out = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); pocketfft::shape_t axes(dim.begin(), dim.end()); if (self.scalar_type() == kComplexFloat) { pocketfft::c2r(shape_from_tensor(out), stride_from_tensor(self), stride_from_tensor(out), axes, false, @@ -347,7 +347,7 @@ static DftiDescriptor _plan_mkl_fft( // precision const DFTI_CONFIG_VALUE prec = [&]{ - switch (c10::toValueType(dtype)) { + switch (c10::toRealValueType(dtype)) { case ScalarType::Float: return DFTI_SINGLE; case ScalarType::Double: return DFTI_DOUBLE; default: TORCH_CHECK(false, "MKL FFT doesn't support tensors of type: ", dtype); @@ -466,7 +466,7 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, 
IntArrayRef out_sizes, batched_out_sizes[i + 1] = out_sizes[dim[i]]; } - const auto value_type = c10::toValueType(input.scalar_type()); + const auto value_type = c10::toRealValueType(input.scalar_type()); out.resize_(batched_out_sizes, MemoryFormat::Contiguous); auto descriptor = _plan_mkl_fft( @@ -523,7 +523,7 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, auto in_sizes = input.sizes(); DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = last_dim_size; - auto out = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); return _exec_fft(out, input, out_sizes, dim, normalization, /*forward=*/false); } diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index fb41dcdd6215..a2489e42e185 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -43,27 +43,78 @@ REGISTER_NO_CPU_DISPATCH(mkldnn_convolution_backward_stub); namespace at { namespace native { -ideep::tensor _mkldnn_convolution( - const ideep::tensor& x, - const ideep::tensor& w, - const c10::optional& b, +#define MKLDNNTensor(itensor, options) \ + new_with_itensor_mkldnn( \ + std::move(itensor), \ + optTypeMetaToScalarType(options.dtype_opt()), \ + options.device_opt()) + +// Note [MKLDNN Convolution Memory Formats] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// MKLDNN has 3 types of memory formats in convolution: +// +// In case memory format passed from PyTorch (aka. user layout) +// differs from the internal layout which MKLDNN used, a `reorder` is needed; +// otherwise when user layout is identical to internal layout, +// MKLDNN uses a memory `view` upon an existing CPU tensor. +// +// 1. NCHW (CPU tensor, contiguous) +// input reorder: NCHW(user) -> Blocked(internal) +// weight reorder: OIHW(user) -> Blocked(internal) +// output reorder: Blocked(internal) -> NCHW(user) +// +// 2. NHWC: (CPU tensor, channels last) +// input view: NHWC(user) -> NHWC(internal) +// weight reorder: OHWI(user) -> Blocked(internal) +// output view: NHWC(internal) -> NHWC(user) +// +// 3. Blocked (MKLDNN tensor): +// By explicitly converting a tensor to mkldnn, e.g. `x.to_mkldnn()`, +// blocked format will propagate between layers. Input, output will be in blocked format. +// +// For inference case, weight can be prepacked into blocked format by +// (so as to save weight reoder overhead): +// model = torch.utils.mkldnn.to_mkldnn(model) +// +// For training case, grad_output can be CPU tensor or MKLDNN tensor, +// but weight/bias and grad_weight/grad_bias are always CPU tensor. 
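The memory-format note above comes down to stride patterns: channels-last data can be handed to oneDNN as an in-place view, while contiguous NCHW data must be reordered. A small standalone sketch of the two stride layouts for an {N, C, H, W} shape (assuming PyTorch's usual contiguous and channels_last conventions; no ATen types):

```cpp
#include <array>
#include <cstdio>

// Strides (in elements) for a 4-D tensor of logical shape {N, C, H, W}.
std::array<long, 4> nchw_strides(long N, long C, long H, long W) {
  // contiguous: W fastest, then H, C, N
  return {C * H * W, H * W, W, 1};
}

std::array<long, 4> nhwc_strides(long N, long C, long H, long W) {
  // channels last: C fastest, then W, H, N -- still reported in NCHW order
  return {H * W * C, 1, W * C, C};
}

int main() {
  long N = 2, C = 3, H = 4, W = 5;
  auto a = nchw_strides(N, C, H, W);
  auto b = nhwc_strides(N, C, H, W);
  std::printf("NCHW strides: %ld %ld %ld %ld\n", a[0], a[1], a[2], a[3]);  // 60 20 5 1
  std::printf("NHWC strides: %ld %ld %ld %ld\n", b[0], b[1], b[2], b[3]);  // 60 1 15 3
}
```

Because the channels-last element order already matches oneDNN's nhwc layout, the kernel can wrap the CPU buffer directly; NCHW input pays a reorder into a blocked layout on the way in and back out.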
+// + +Tensor mkldnn_convolution( + const Tensor& input, + const Tensor& weight, const c10::optional& bias_opt, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); + const Tensor& bias = *bias_maybe_owned; + + if (input.scalar_type() == ScalarType::BFloat16) { + TORCH_CHECK(mkldnn_bf16_device_check(), + "mkldnn_convolution: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); + } + + bool is_channels_last = input.suggest_memory_format() == at::MemoryFormat::ChannelsLast; - auto kernel_size = w.get_dims(); + auto output_sizes = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation); + auto output = at::empty({0}, input.options()); - std::vector input_size = x.get_dims(); - std::vector output_sizes = - conv_output_size(input_size, kernel_size, padding, stride, dilation); + const ideep::tensor x = itensor_from_tensor(input); + const ideep::tensor w = itensor_from_tensor(weight); ideep::tensor y; - if (b.has_value()) { + if (is_channels_last) { + output.resize_(output_sizes, input.suggest_memory_format()); + y = itensor_from_tensor(output); + } + if (bias.defined()) { + const ideep::tensor b = itensor_from_tensor(bias); ideep::convolution_forward::compute( x, w, - b.value(), + b, {output_sizes.cbegin(), output_sizes.cend()}, y, {stride.begin(), stride.end()}, @@ -83,47 +134,14 @@ ideep::tensor _mkldnn_convolution( {padding.begin(), padding.end()}, groups); } - return y; -} - -Tensor mkldnn_convolution( - const Tensor& input, - const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups) { - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); - const Tensor& bias = *bias_maybe_owned; - - if (input.scalar_type() == ScalarType::BFloat16) { - TORCH_CHECK(mkldnn_bf16_device_check(), - "mkldnn_convolution: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); - } - const ideep::tensor mkldnn_input = itensor_from_tensor(input); - const ideep::tensor mkldnn_weight = itensor_from_tensor(weight); - c10::optional mkldnn_bias{c10::nullopt}; - if (bias.defined()) { - mkldnn_bias = itensor_from_tensor(bias); - } - - ideep::tensor mkldnn_output = _mkldnn_convolution( - mkldnn_input, - mkldnn_weight, - mkldnn_bias, - padding, - stride, - dilation, - groups); if (input.is_mkldnn()) { - return new_with_itensor_mkldnn(std::move(mkldnn_output), optTypeMetaToScalarType(input.options().dtype_opt()), - input.options().device_opt()); + return MKLDNNTensor(y, input.options()); + } else if (!is_channels_last) { + return mkldnn_to_dense(MKLDNNTensor(y, input.options())); } else { - return mkldnn_to_dense( - new_with_itensor_mkldnn(std::move(mkldnn_output), optTypeMetaToScalarType(input.options().dtype_opt()), - input.options().device_opt())); + TORCH_INTERNAL_ASSERT(y.get_desc().is_nhwc()); + return output; } } @@ -131,17 +149,22 @@ Tensor mkldnn_convolution_backward_input( IntArrayRef input_size, const Tensor& grad_output, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { - // for training case, grad_output can be cpu tensor or MKLDNN tensor, - // but weight and bias always cpu tensor. 
- auto mkldnn_grad_output = itensor_from_tensor(grad_output); - auto mkldnn_weight = itensor_view_from_dense(weight); + bool is_channels_last = grad_output.suggest_memory_format() == at::MemoryFormat::ChannelsLast; + auto grad_input = at::empty({0}, grad_output.options()); - ideep::tensor mkldnn_grad_input; + auto grad_y = itensor_from_tensor(grad_output); + auto w = itensor_view_from_dense(weight); + + ideep::tensor grad_x; + if (is_channels_last) { + grad_input.resize_(input_size, grad_output.suggest_memory_format()); + grad_x = itensor_from_tensor(grad_input); + } ideep::convolution_backward_data::compute( - mkldnn_grad_output, - mkldnn_weight, + grad_y, + w, input_size.vec(), - mkldnn_grad_input, + grad_x, stride.vec(), dilation.vec(), padding.vec(), @@ -149,14 +172,12 @@ Tensor mkldnn_convolution_backward_input( groups); if (grad_output.is_mkldnn()) { - return new_with_itensor_mkldnn(std::move(mkldnn_grad_input), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt()); - + return MKLDNNTensor(grad_x, grad_output.options()); + } else if (!is_channels_last){ + return mkldnn_to_dense(MKLDNNTensor(grad_x, grad_output.options())); } else { - return mkldnn_to_dense(new_with_itensor_mkldnn(std::move(mkldnn_grad_input), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt())); + TORCH_INTERNAL_ASSERT(grad_x.get_desc().is_nhwc()); + return grad_input; } } @@ -164,19 +185,19 @@ std::tuple mkldnn_convolution_backward_weights( IntArrayRef weight_size, const Tensor& grad_output, const Tensor& input, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { - // for training case, grad_output and input can be cpu tensor or MKLDNN tensor, - // but weight and bias are always cpu tensor. - const ideep::tensor mkldnn_grad_output = itensor_from_tensor(grad_output); - const ideep::tensor mkldnn_input = itensor_from_tensor(input); + bool is_channels_last = grad_output.suggest_memory_format() == at::MemoryFormat::ChannelsLast; - ideep::tensor mkldnn_grad_weight, mkldnn_grad_bias; + const ideep::tensor grad_y = itensor_from_tensor(grad_output); + const ideep::tensor x = itensor_from_tensor(input); + + ideep::tensor grad_w, grad_b; if (bias_defined) { ideep::convolution_backward_weights::compute( - mkldnn_input, - mkldnn_grad_output, + x, + grad_y, weight_size.vec(), - mkldnn_grad_weight, - mkldnn_grad_bias, + grad_w, + grad_b, stride.vec(), dilation.vec(), padding.vec(), @@ -184,10 +205,10 @@ std::tuple mkldnn_convolution_backward_weights( groups); } else { ideep::convolution_backward_weights::compute( - mkldnn_input, - mkldnn_grad_output, + x, + grad_y, weight_size.vec(), - mkldnn_grad_weight, + grad_w, stride.vec(), dilation.vec(), padding.vec(), @@ -195,20 +216,23 @@ std::tuple mkldnn_convolution_backward_weights( groups); } - return std::make_tuple( - mkldnn_to_dense(new_with_itensor_mkldnn(std::move(mkldnn_grad_weight), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt())), - mkldnn_to_dense(new_with_itensor_mkldnn(std::move(mkldnn_grad_bias), - optTypeMetaToScalarType(grad_output.options().dtype_opt()), - grad_output.options().device_opt()))); + if (!is_channels_last) { + return std::make_tuple( + mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())), + bias_defined ? 
mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor()); + } else { + return std::make_tuple( + mkldnn_to_dense(MKLDNNTensor(grad_w, grad_output.options())).to(at::MemoryFormat::ChannelsLast), + bias_defined ? mkldnn_to_dense(MKLDNNTensor(grad_b, grad_output.options())) : Tensor()); + } } std::tuple mkldnn_convolution_backward( const Tensor& input, const Tensor& grad_output_t, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, std::array output_mask) { - Tensor grad_output = grad_output_t.is_mkldnn() ? grad_output_t : grad_output_t.contiguous(); + auto memory_format = input.suggest_memory_format(); + Tensor grad_output = grad_output_t.is_mkldnn() ? grad_output_t : grad_output_t.contiguous(memory_format); Tensor grad_input, grad_weight, grad_bias; if (output_mask[0]) { diff --git a/aten/src/ATen/native/mkldnn/Gelu.cpp b/aten/src/ATen/native/mkldnn/Gelu.cpp index fa78cd1c3a96..1d2a67251513 100644 --- a/aten/src/ATen/native/mkldnn/Gelu.cpp +++ b/aten/src/ATen/native/mkldnn/Gelu.cpp @@ -1,17 +1,17 @@ #include #include #include - +#include #if !AT_MKLDNN_ENABLED() namespace at { namespace native { -Tensor mkldnn_gelu(const Tensor& input) { +Tensor mkldnn_gelu(const Tensor& input, c10::string_view approximate) { TORCH_CHECK(false, "mkldnn_gelu: ATen not compiled with MKLDNN support"); } -Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { +Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input, c10::string_view approximate) { TORCH_CHECK(false, "mkldnn_gelu_backward: ATen not compiled with MKLDNN support"); } @@ -24,11 +24,13 @@ Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { namespace at { namespace native { -Tensor mkldnn_gelu(const Tensor& input) { +Tensor mkldnn_gelu(const Tensor& input, c10::string_view approximate) { if (input.scalar_type() == ScalarType::BFloat16) { TORCH_CHECK(mkldnn_bf16_device_check(), "mkldnn_gelu: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); } + TORCH_CHECK(get_gelutype_enum(approximate) == GeluType::None, + "mkldnn_gelu: fast, approximate gelu is not supported"); const ideep::tensor& x = itensor_from_tensor(input); ideep::tensor y; ideep::eltwise_forward::compute( @@ -37,7 +39,9 @@ Tensor mkldnn_gelu(const Tensor& input) { input.options().device_opt()); } -Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { +Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input, c10::string_view approximate) { + TORCH_CHECK(get_gelutype_enum(approximate) == GeluType::None, + "mkldnn_gelu_backward: fast, approximate gelu is not supported"); const ideep::tensor& x = itensor_from_tensor(input); ideep::tensor grady = itensor_from_tensor(grad_output); ideep::tensor gradx; diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp index cfbbf5c6fa19..fbfb329a5e93 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp @@ -30,7 +30,7 @@ Tensor mkldnn_to_dense(const Tensor& mkldnn_tensor, c10::optional dt : stensor.to_public(cpu_tensor.template data_ptr(), ideep::tensor::data_type::bf16); cpu_tensor.as_strided_(dims, pub_tensor.get_strides()); - return cpu_tensor; + return cpu_tensor.contiguous(); } Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype) { @@ -43,7 +43,7 @@ Tensor dense_to_mkldnn(const Tensor& cpu_tensor, c10::optional dtype 
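The mkldnn_gelu change above threads the new approximate argument through and accepts only the exact ("none") variant. For reference, the two formulas the flag distinguishes are the erf-based definition and the common tanh approximation; the standalone comparison below uses the standard textbook formulas rather than anything taken from this diff:

```cpp
#include <cmath>
#include <cstdio>

// Exact GELU: x * Phi(x), with Phi the standard normal CDF.
double gelu_exact(double x) {
  return 0.5 * x * (1.0 + std::erf(x / std::sqrt(2.0)));
}

// Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
double gelu_tanh(double x) {
  const double pi = std::acos(-1.0);
  const double k = std::sqrt(2.0 / pi);
  return 0.5 * x * (1.0 + std::tanh(k * (x + 0.044715 * x * x * x)));
}

int main() {
  const double xs[] = {-2.0, -0.5, 0.0, 0.5, 2.0};
  for (double x : xs) {
    std::printf("x=%5.2f exact=%.6f tanh=%.6f\n", x, gelu_exact(x), gelu_tanh(x));
  }
}
```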
"dense_to_mkldnn expects float or bfloat16 tensor input"); TORCH_CHECK(cpu_tensor.dim() <= 5, "Can't convert cpu tensor with the number of dimensions > 5"); - // TODO: consider to convert non-contiguous tensor to `ideep::tensor` directly. + // NOTE: forbid direct convert from non-contiguous (or channels last) to `ideep::tensor`. auto cpu_tensor_cont = cpu_tensor.contiguous(); auto data_type = dtype.has_value() ? dtype.value() : cpu_tensor.scalar_type(); TORCH_CHECK(data_type == ScalarType::Float || data_type == ScalarType::BFloat16, diff --git a/aten/src/ATen/native/mkldnn/Prelu.cpp b/aten/src/ATen/native/mkldnn/Prelu.cpp new file mode 100644 index 000000000000..acc78211d83c --- /dev/null +++ b/aten/src/ATen/native/mkldnn/Prelu.cpp @@ -0,0 +1,79 @@ +#include +#include +#include + + +#if !AT_MKLDNN_ENABLED() + +namespace at { namespace native { + +Tensor mkldnn_prelu(const Tensor& input, const Tensor& weight) { + TORCH_CHECK(false, "mkldnn_prelu: ATen not compiled with MKLDNN support"); +} + +std::tuple mkldnn_prelu_backward(const Tensor& grad_output, const Tensor& input, const Tensor& weight) { + TORCH_CHECK(false, "mkldnn_prelu_backward: ATen not compiled with MKLDNN support"); +} + +}} + +#else // AT_MKLDNN_EBABLED + +#include +#include + +namespace at { namespace native { + +Tensor mkldnn_prelu(const Tensor& input, const Tensor& weight) { + if (input.scalar_type() == ScalarType::BFloat16) { + TORCH_CHECK(mkldnn_bf16_device_check(), + "mkldnn_relu: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); + } + + int64_t weight_num = weight.numel(); + if (weight_num != 1) { + int64_t channel_size = input.dim() > 1 ? input.size(1) : 1; + TORCH_CHECK(channel_size == weight_num, + "Mismatch of parameter numbers and input channel size. Found parameter numbers = ", weight_num, + " and channel size = ", channel_size, "."); + } + const ideep::tensor& x = itensor_from_mkldnn(input); + const ideep::tensor& w = itensor_from_tensor(weight); + + ideep::tensor y; + ideep::prelu_forward::compute( + x, w, y, ideep::prop_kind::forward_training); + return new_with_itensor_mkldnn(std::move(y), optTypeMetaToScalarType(input.options().dtype_opt()), + input.options().device_opt()); +} + +std::tuple mkldnn_prelu_backward(const Tensor& grad_output, const Tensor& input, const Tensor& weight) { + const ideep::tensor& x = itensor_from_mkldnn(input); + const ideep::tensor& w = itensor_from_tensor(weight); + const ideep::tensor grady = itensor_from_mkldnn(grad_output); + ideep::tensor gradx; + ideep::tensor gradw; + + ideep::prelu_backward::compute( + x, w, grady, gradx, gradw, ideep::prop_kind::backward); + if (weight.is_mkldnn()) { + return std::make_tuple( + new_with_itensor_mkldnn(std::move(gradx), + optTypeMetaToScalarType(grad_output.options().dtype_opt()), + grad_output.options().device_opt()), + new_with_itensor_mkldnn(std::move(gradw), + optTypeMetaToScalarType(weight.options().dtype_opt()), + weight.options().device_opt())); + } else { + return std::make_tuple( + new_with_itensor_mkldnn(std::move(gradx), + optTypeMetaToScalarType(grad_output.options().dtype_opt()), + grad_output.options().device_opt()), + mkldnn_to_dense(new_with_itensor_mkldnn(std::move(gradw), + optTypeMetaToScalarType(weight.options().dtype_opt()), + weight.options().device_opt()))); + } +} +}} + +#endif // AT_MKLDNN_EBABLED diff --git a/aten/src/ATen/native/mps/Copy.h b/aten/src/ATen/native/mps/Copy.h new file mode 100644 index 000000000000..1a4465e73538 --- /dev/null +++ b/aten/src/ATen/native/mps/Copy.h @@ -0,0 +1,28 @@ +// 
Copyright © 2022 Apple Inc. + +#pragma once +#include + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#endif + +namespace at { +namespace native { +namespace mps { + +at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking); +void copy_blit_mps(void* dst, const void* src, size_t size); + +} // namespace mps +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h new file mode 100644 index 000000000000..26cae7238b70 --- /dev/null +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -0,0 +1,202 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { +namespace mps { + +struct TORCH_CUDA_CPP_API MPSGeneratorImpl : public c10::GeneratorImpl { + MPSGeneratorImpl(DeviceIndex device_index = -1); + ~MPSGeneratorImpl() = default; + + void set_current_seed(uint64_t seed) override; + uint64_t current_seed() const override; + uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; + static DeviceType device_type(); + +private: + MPSGeneratorImpl* clone_impl() const override; + uint64_t seed_ = default_rng_seed_val; +}; + +const Generator& getDefaultMPSGenerator(); + +void runMPSGraph( + MPSStream* mpsStream, + MPSGraph* mpsGraph, + NSDictionary* feeds, + NSDictionary* results); + +MPSDataType getMPSDataType(ScalarType scalar_type); +MPSDataType getMPSScalarType(ScalarType scalar_type); +std::string getMPSTypeString(ScalarType scalar_type); +std::string getMPSShapeString(MPSShape* shape); +std::string getTensorsStringKey(const TensorList& tensors); +double getMPSScalarValue(const Tensor& t); +std::string getArrayRefString(const IntArrayRef s); +std::string getStridedKey(const Tensor& self, const IntArrayRef sz, + const IntArrayRef strides, int64_t offset); +id gatherViewTensor(const at::Tensor& src, id s); + +MPSShape* getMPSShape(const Tensor& t); +MPSShape* getMPSShape(IntArrayRef sizes); +MPSShape* getMPSShape(c10::MaybeOwned t); + +class Placeholder { + public: + Placeholder() : _placeholder(nullptr), _value(nullptr) {} + Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr) {} + Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr); + MPSGraphTensor* getMPSGraphTensor() { + return _placeholder; + } + MPSGraphTensorData* getMPSGraphTensorData() { + return _value; + } + bool isIntermediate() { + return _value == nullptr; + } + + private: + MPSGraphTensor* _placeholder; + MPSGraphTensorData* _value; +}; + +void resize_tensor(Tensor* output); +MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); +MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, + MPSStream* mpsStream, + const Tensor& tensor); + +MPSGraph* make_mps_graph(); +void printTensorNDArray(const Tensor& t); + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor); +MPSGraphTensor* mpsGraphConstantFloatPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape); +MPSGraphTensor* 
mpsGraphConstantPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape, MPSDataType dataType); + +string get_mem_format_string(c10::MemoryFormat memory_format); + +using MPSCacheKey = int64_t; + +// derive this class to cache a graph and its inputs/ouputs +// can be used to store any NSObject +struct MPSCachedGraph +{ + MPSCachedGraph(NSObject *object) : _object([object retain]) {} + virtual ~MPSCachedGraph() { + [_object release]; + _object = nullptr; + } + MPSGraph *graph() const { return (MPSGraph *)_object; } + NSObject *object() const { return _object; } +private: + NSObject *_object = nullptr; +}; + +// TODO: Improve the overall design of MPSGraphCache. +// https://github.com/pytorch/pytorch/issues/77176 +// Cache holding various keys mapped to graphs + +struct MPSGraphCache +{ + typedef MPSCachedGraph * (^CreateCachedGraphBlock)(); + + struct CacheEntry { + CacheEntry(std::string key, MPSCachedGraph *cachedGraph) : cachedGraph_(cachedGraph), key_(key) {} + MPSCachedGraph* cachedGraph_ = nullptr; + std::string key_ = nullptr; + }; + + public: + + static MPSGraphCache* getInstance() { + if(_instance_cache == nullptr) { + _instance_cache = new MPSGraphCache(); + } + return _instance_cache; + } + + ~MPSGraphCache() { + dispatch_release(serialQueue_); + + for (auto i : cache_) { + delete i.second.cachedGraph_; + } + } + + // Disallow the copy constructor and operator= functions + MPSGraphCache(const MPSGraphCache&) = delete; + void operator=(const MPSGraphCache&) = delete; + + MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + + __block MPSCachedGraph * result = nil; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + + // verify the cached entry doesn't already exist + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + result = entry.cachedGraph_; + } + else { + result = createCacheBlock(); + CacheEntry entry(key, result); + cache_.emplace(hash, entry); + } + }); + return result; + } + + MPSCachedGraph* LookUp(const std::string& key) const { + + __block MPSCachedGraph* result = nullptr; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + result = entry.cachedGraph_; + } + }); + return result; + } + private: + MPSGraphCache() { + serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL); + } + + static MPSGraphCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; + +}; + +} // namespace mps +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm new file mode 100644 index 000000000000..ea0d153d0ecc --- /dev/null +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -0,0 +1,447 @@ +// Copyright © 2022 Apple Inc. 
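The MPSGraphCache declared above stores one compiled graph per key string: the key is hashed to an MPSCacheKey, all reads and writes are funneled through dispatch_sync on a private serial queue, and a hash collision is caught by comparing the stored key. A rough standard-C++ analogue of that lookup-or-build flow follows; the types and names below are invented for illustration, the real cache is the Objective-C++ class shown above.

#include <cstdint>
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>

// Stand-in for MPSCachedGraph; the real class retains an NSObject*.
struct CachedGraphStub { std::string key; };

class GraphCacheSketch {
 public:
  // Mirrors CreateCachedGraph: hash the key, reuse on a hit, build on a miss.
  // A mutex plays the role of the serial dispatch queue.
  CachedGraphStub* getOrCreate(const std::string& key,
                               const std::function<CachedGraphStub*()>& build) {
    const uint64_t hash = std::hash<std::string>{}(key);
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = cache_.find(hash);
    if (it != cache_.end()) {
      return it->second;            // cache hit: the compiled graph is reused
    }
    CachedGraphStub* graph = build();
    cache_.emplace(hash, graph);
    return graph;
  }

 private:
  std::mutex mutex_;
  std::unordered_map<uint64_t, CachedGraphStub*> cache_;
};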
+ +#include + +namespace at { +namespace native { +namespace mps { + +uint64_t MPSGeneratorImpl::seed() { + auto random = c10::detail::getNonDeterministicRandom(true); + this->set_current_seed(random); + return random; +} +uint64_t MPSGeneratorImpl::current_seed() const { + return seed_; +} + +void MPSGeneratorImpl::set_current_seed(uint64_t seed) { + seed_ = seed; +} + +MPSGeneratorImpl::MPSGeneratorImpl(DeviceIndex device_index) + : c10::GeneratorImpl{Device(DeviceType::MPS, device_index), + DispatchKeySet(c10::DispatchKey::MPS)} { +} + +const Generator& getDefaultMPSGenerator() { + auto gen = make_generator(0); + gen.seed(); + return gen; +} +DeviceType MPSGeneratorImpl::device_type() { + return DeviceType::MPS; +} +c10::intrusive_ptr MPSGeneratorImpl::get_state() const { + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; + + auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_tensor.data_ptr(); + + return state_tensor.getIntrusivePtr(); +} + +void MPSGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; + + detail::check_rng_state(new_state); + + auto new_state_size = new_state.numel(); + + uint64_t input_seed; + auto new_rng_state = new_state.data(); + memcpy(&input_seed, new_rng_state, seed_size); + this->set_current_seed(input_seed); +} + +MPSGeneratorImpl* MPSGeneratorImpl::clone_impl() const { + auto gen = new MPSGeneratorImpl(0); + gen->set_current_seed(this->seed_); + return gen; +} + +std::string getStridedKey(const Tensor& self, const IntArrayRef sz, + const IntArrayRef strides, int64_t offset) { + // TODO: move storage_offset to a PlaceholderTensor and strides to a + // tensor too, to avoid too many cache entries. 
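// (Annotation, not part of the patch.) The key returned below concatenates the
// raw storage pointer (as a decimal number), the sizes, the strides, and the
// storage offset, e.g. roughly "140211223040:2,3,:1,2,:4" for a 2x3 view with
// strides (1, 2) at offset 4; two views of the same storage therefore only
// share a cached gather graph when their geometry matches exactly.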
+ return std::to_string((uintptr_t)self.storage().data()) + + ":" + mps::getArrayRefString(sz) + + ":" + mps::getArrayRefString(strides) + + ":" + std::to_string(offset); +} + +void runMPSGraph( + MPSStream* mpsStream, + MPSGraph* mpsGraph, + NSDictionary* feeds, + NSDictionary* results) { + + dispatch_sync(mpsStream->queue(), ^() { + @autoreleasepool { + mpsStream->commit(true); + id commandQueue = mpsStream->commandQueue(); + MPSGraphExecutionDescriptor *executionDescriptor = [[MPSGraphExecutionDescriptor new] autorelease]; + + executionDescriptor.completionHandler = ^(NSDictionary * resultsDictionary, + NSError * _Nullable error) { + }; + + [mpsGraph runAsyncWithMTLCommandQueue:commandQueue + feeds:feeds + targetOperations:nil + resultsDictionary:results + executionDescriptor:executionDescriptor]; + + } + }); +} + +MPSDataType getMPSDataType(ScalarType scalar_type) { + switch (scalar_type) { + case ScalarType::Float: + return MPSDataTypeFloat32; + case ScalarType::Half: + return MPSDataTypeFloat16; + case ScalarType::Int: + return MPSDataTypeInt32; + case ScalarType::Long: + return MPSDataTypeInt64; + case ScalarType::Short: + return MPSDataTypeInt16; + case ScalarType::Byte: + return MPSDataTypeInt8; + case ScalarType::Bool: + return MPSDataTypeBool; + default: + TORCH_CHECK_TYPE(false, "Trying to convert ", scalar_type, " to the MPS backend but there is no mapping for it.") + } +} + +MPSDataType getMPSScalarType(ScalarType scalar_type) { + switch (scalar_type) { + // This is an intentional fallthrough supporting Double for Scalar + // types as they are casted to Float32 currently. + case ScalarType::Double: + case ScalarType::Float: + return MPSDataTypeFloat32; + case ScalarType::Half: + return MPSDataTypeFloat16; + case ScalarType::Int: + return MPSDataTypeInt32; + case ScalarType::Long: + return MPSDataTypeInt64; + case ScalarType::Short: + return MPSDataTypeInt16; + case ScalarType::Byte: + return MPSDataTypeInt8; + case ScalarType::Bool: + return MPSDataTypeBool; + default: + TORCH_INTERNAL_ASSERT(false, "Trying to convert ", scalar_type, " to the MPS backend but there is no mapping for it.") + } +} + +std::string getMPSTypeString(ScalarType scalar_type) { + switch (scalar_type) { + case ScalarType::Double: + case ScalarType::Float: + return "MPSDataTypeFloat32"; + case ScalarType::Half: + return "MPSDataTypeFloat16"; + case ScalarType::Int: + return "MPSDataTypeInt32"; + case ScalarType::Long: + return "MPSDataTypeInt64"; + case ScalarType::Short: + return "MPSDataTypeInt16"; + case ScalarType::Byte: + return "MPSDataTypeInt8"; + case ScalarType::Bool: + return "MPSDataTypeBool"; + default: + return "Undefined"; + } +} + +std::string getMPSShapeString(MPSShape* shape) { + std::string str; + for(NSNumber *elem in shape) { + str += std::to_string(elem.unsignedLongValue) + ","; + } + return str; +} + +std::string getArrayRefString(const IntArrayRef s) { + std::stringstream ss; + std::copy(s.begin(), s.end(), std::ostream_iterator(ss, ",")); + return ss.str(); +} + +std::string getTensorsStringKey(const TensorList& tensors) { + std::string str; + // The key format per tensor would look like ":MPSDataTypeFloat32[1,1,1,10]:" + for (const Tensor& tensor: tensors) { + str += ":"; + if (tensor.defined()) { + str += getMPSTypeString(tensor.scalar_type()) + "["; + // if tensor is a scalar + if (tensor.dim() == 0) { + str += std::to_string(getMPSScalarValue(tensor)); + } else { + const NSString* ns_shape_key = [[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","]; 
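// (Annotation, not part of the patch.) The KVC call above maps each NSNumber in
// the MPSShape to its "description" string and joins them with commas, so a
// tensor of shape {1, 1, 1, 10} contributes "1,1,1,10" inside the brackets that
// the surrounding code appends to the cache key.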
+ str += std::string(ns_shape_key.UTF8String); + } + str += "]"; + } else { + str += "Undefined"; + } + } + return str; +} + +double getMPSScalarValue(const Tensor& t) { + assert (t.dim() == 0); // only applicable for scalar types + auto other_value = t.item(); + return other_value.to(); +} + +MPSShape* getMPSShape(const Tensor& t) { + const int sz = t.dim(); + const int sz_ = (sz > 0) ? sz : 1; + + NSNumber* numbers[sz_]; + + for (int i = 0; i < sz_; i++) + { + NSInteger sz_i = (i < sz) ? t.size(i) : 1; + + NSNumber* number = [NSNumber numberWithInt:sz_i]; + numbers[i] = number; + } + return [NSArray arrayWithObjects:numbers count:sz_]; +} + +MPSShape* getMPSShape(c10::MaybeOwned t) { + const Tensor& t_ = *t; + return getMPSShape(t_); +} + +MPSShape* getMPSShape(IntArrayRef sizes) { + const int sz = sizes.size(); + const int sz_ = (sz > 0) ? sz : 1; + + NSNumber* numbers[sz_]; + + for (int i = 0; i < sz_; i++) + { + NSInteger sz_i = (i < sz) ? sizes[i] : 1; + + NSNumber* number = [NSNumber numberWithInt:sz_i]; + numbers[i] = number; + } + return [NSArray arrayWithObjects:numbers count:sz_]; +} + +void printTensorNDArray(const Tensor& t) { + if (!t.is_mps()) return; + if(t.numel() == 0) + { + std::cout << "Empty tensor" << std::endl; + return; + } + // Get shape and data type + auto selfShape = getMPSShape(t); + auto selfDType = getMPSDataType(t.scalar_type()); + + // Initialize data + id selfBuf = __builtin_bit_cast(id, t.storage().data()); + MPSGraphTensorData* tdata = [[MPSGraphTensorData alloc] initWithMTLBuffer:selfBuf + shape:selfShape + dataType:selfDType]; + [tdata printNDArray]; +} + +id gatherViewTensor(const at::Tensor& src, id sourceBuffer) { + assert (!src.is_contiguous()); + id device = MPSDevice::getInstance()->device(); + MPSStream* stream = getCurrentMPSStream(); + @autoreleasepool { + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + IntArrayRef size_; + IntArrayRef stride_; + int64_t storage_offset_; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + string key = getStridedKey(src, src.sizes(), src.strides(), src.storage_offset()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if (cachedGraph) { + @autoreleasepool { + MPSGraphTensor* inputTensor = cachedGraph->inputTensor_; + auto output = at::native::empty_mps( + src.sizes(), + src.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + MPSGraphTensorData* inputTensorData = [[MPSGraphTensorData alloc] initWithMTLBuffer: sourceBuffer + shape: [inputTensor shape] + dataType: [inputTensor dataType]]; + id resultBuffer = __builtin_bit_cast(id, output.storage().data()); + MPSGraphTensorData* outputTensorData = [[MPSGraphTensorData alloc] initWithMTLBuffer: resultBuffer + shape: getMPSShape(src.sizes()) + dataType: getMPSDataType(src.scalar_type())]; + NSDictionary* feeds = @{ + inputTensor : inputTensorData + }; + + NSDictionary* results = @{ + cachedGraph->outputTensor_ : outputTensorData + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); +#if _DEBUG + NSLog(@"%@", [cachedGraph->graph() debugDescription]); + TORCH_WARN("We have a non-contiguous tensor in copy_from_mps with key ", key); + + //// Update the Blit sourceBuffer to the result of this operation + printTensorNDArray(output); +#endif + return resultBuffer; + } + } + } + return nil; +} + + + +Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, 
MPSShape *mpsShape) +{ + TORCH_CHECK(self.is_mps(), "Placeholder storage has not been allocated on MPS device!"); + // extract the pointer to MTLBuffer from the Tensor's storage + id selfBuf = __builtin_bit_cast(id, self.storage().data()); + const size_t buf_size = [selfBuf length]; + + // tensor.numel() could be zero, but tensor is valid as long as the buffer size is non-zero. + // if buf_size is zero in here, it's not a user error. It could be a missing check for + // tensor.numel() == 0 in our internal implementations of ops. + TORCH_INTERNAL_ASSERT(buf_size > 0, "Placeholder tensor is empty!"); + + TORCH_CHECK(self.storage().nbytes() <= buf_size, "Placeholder buffer size (", buf_size, + ") is not large enough to contain the Tensor storage of size ", self.storage().nbytes()); + + const MPSDataType mpsDataType = getMPSDataType(self.scalar_type()); + if (!mpsShape) + mpsShape = getMPSShape(self); + + _value = [[MPSGraphTensorData alloc] initWithMTLBuffer:selfBuf + shape:mpsShape + dataType:mpsDataType]; + TORCH_INTERNAL_ASSERT(_value); + _placeholder = mpsGraphTensor; +} + +MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, + MPSStream* mpsStream, + const Tensor& tensor) { + auto mpsShape = getMPSShape(tensor); + auto dataType = getMPSDataType(tensor.scalar_type()); + + MPSGraphTensorData *result = nil; + if (tensor.numel() > 0) { + id buf = __builtin_bit_cast(id, tensor.storage().data()); + result = [[[MPSGraphTensorData alloc] initWithMTLBuffer:buf + shape:mpsShape + dataType:dataType] + autorelease]; + } else { + // create empty NDArray + MPSNDArrayDescriptor *desc = [MPSNDArrayDescriptor descriptorWithDataType:dataType + shape:mpsShape]; + MPSNDArray *emptyArray = [[[MPSNDArray alloc] + initWithDevice:mpsStream->device() descriptor:desc] autorelease]; + result = [[[MPSGraphTensorData alloc] initWithMPSNDArray:emptyArray] autorelease]; + } + assert(result); + return result; +} + +void resize_tensor(Tensor* output) { + output->resize_(output->sizes()); +} + +MPSGraph* make_mps_graph() { + MPSGraph* mpsGraph = [[MPSGraph new] autorelease]; + mpsGraph.options = MPSGraphOptionsNone; + return mpsGraph; +} + +MPSGraphTensor* mpsGraphConstantFloatPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape) { + // "value" is always double, so is the Placeholder's type (we only support Float32). + return [mpsGraph constantWithScalar:value + shape:mpsShape + dataType:MPSDataTypeFloat32]; +} + +MPSGraphTensor* mpsGraphConstantPlaceHolder(MPSGraph *mpsGraph, const double value, MPSShape* mpsShape, MPSDataType dataType) { + // Bool is not handled by constantWithScalar + MPSGraphTensor* constPlaceHolder = [mpsGraph constantWithScalar:value + shape:mpsShape + dataType:(dataType == MPSDataTypeBool ? 
MPSDataTypeFloat32 : dataType)]; + if (dataType == MPSDataTypeBool) + return [mpsGraph castTensor:constPlaceHolder toType:MPSDataTypeBool name:@"ConstantBoolTensor"]; + + return constPlaceHolder; +} + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType) { + return [mpsGraph placeholderWithShape:nil + dataType:dataType + name:nil]; +} + +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape) { + return [mpsGraph placeholderWithShape:mpsShape + dataType:dataType + name:nil]; +} + +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor) { + return [mpsGraph placeholderWithShape:getMPSShape(tensor) + dataType:getMPSDataType(tensor.scalar_type()) + name:nil]; +} + + +string get_mem_format_string(c10::MemoryFormat memory_format) { + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Invalid memory format\n"); + } + + return mem_format_key; +} + +MPSGraphCache* MPSGraphCache::_instance_cache = nullptr; + +} // namespace mps +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/TensorFactory.cpp b/aten/src/ATen/native/mps/TensorFactory.cpp new file mode 100644 index 000000000000..78899fc8fa3c --- /dev/null +++ b/aten/src/ATen/native/mps/TensorFactory.cpp @@ -0,0 +1,136 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { namespace native { + +static inline void maybe_resize_storage_mps(TensorImpl* self, uint64_t new_size) { + if (new_size == 0) { + return; + } + + auto storage = self->storage().unsafeGetStorageImpl(); + if (!storage) { + TORCH_CHECK(false, "Tensor: invalid null storage"); + } + uint64_t new_size_bytes = (new_size + self->storage_offset()) * self->dtype().itemsize(); + if (new_size_bytes > self->storage().nbytes()) { + if (new_size_bytes == 0) { + storage->set_data_ptr_noswap(at::DataPtr(nullptr, at::Device(at::DeviceType::MPS, 0))); + storage->set_nbytes(0); + } else { + at::DataPtr new_data = storage->allocator()->allocate(new_size_bytes); + size_t copy_capacity = std::min(new_size_bytes, storage->nbytes()); + if (storage->data() && copy_capacity > 0) { + at::native::mps::copy_blit_mps(new_data.get(), storage->data(), copy_capacity); + } + // Destructively overwrite data_ptr + storage->set_data_ptr_noswap(std::move(new_data)); + storage->set_nbytes(new_size_bytes); + } + } +} + +inline TensorImpl* resize_impl_mps_( + TensorImpl* self, + IntArrayRef size, + c10::optional stride, + bool device_guard = true) { + if (self->sizes() == size && (!stride || self->strides() == stride)) { + return self; + } + + int64_t storage_size = 1; + if (stride) { + self->set_sizes_and_strides(size, *stride); + // NB: storage size can be different from numel. 
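// (Annotation, not part of the patch.) With explicit strides the storage
// requirement is governed by the furthest reachable element, roughly
// 1 + sum_i (size[i] - 1) * stride[i], which storage_size_for computes below;
// for overlapping or padded strides this can differ from numel() in either
// direction.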
+ storage_size = storage_size_for(size, *stride); + } else { + self->set_sizes_contiguous(size); + storage_size = self->numel(); + } + maybe_resize_storage_mps(self, storage_size); + + return self; +} + +Tensor empty_mps( + IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { + + return at::detail::empty_mps(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); +} + +Tensor empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { + check_size_nonnegative(size); + // empty memory formatempty + auto t = at::native::empty_mps( + {0}, + dtype_opt, + layout_opt, + device_opt, + pin_memory_opt); + resize_impl_mps_(t.unsafeGetTensorImpl(), size, stride); + return t; +} + +const Tensor& resize_mps_( + const Tensor& self, + IntArrayRef size, + c10::optional optional_memory_format) { + if (self.has_names()) { + return resize_named_tensor_(self, size, optional_memory_format); + } + auto* self_ = self.unsafeGetTensorImpl(); + resize_impl_mps_(self_, size, /*strides=*/c10::nullopt); + if (optional_memory_format.has_value()) { + auto memory_format = + optional_memory_format.value(); + TORCH_CHECK( + memory_format != MemoryFormat::Preserve, + "Unsupported memory format", + memory_format); + self_->empty_tensor_restride(memory_format); + } + return self; +} + +Tensor& set_mps_(Tensor& result) { + caffe2::TypeMeta dtype = result.dtype(); + Storage storage( + Storage::use_byte_size_t(), + 0, + at::mps::GetMPSAllocator(), + true); + result.set_(storage, 0, {0}, {}); + TORCH_INTERNAL_ASSERT(dtype == result.dtype()); + return result; +} + +Tensor& set_storage_mps_(Tensor& result, Storage storage, int64_t storage_offset, IntArrayRef size, IntArrayRef stride) { + checkSetStorage(result, storage, storage_offset, size, stride); + //std::cout << "set storage_mps " << storage_offset << " stride " << stride << std::endl; + result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); + c10::optional stride_opt = stride.data() != nullptr ? + c10::optional(stride) : c10::nullopt; + at::native::resize_impl_mps_(result.unsafeGetTensorImpl(), size, stride_opt); + return result; +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/TensorFactory.h b/aten/src/ATen/native/mps/TensorFactory.h new file mode 100644 index 000000000000..cb7931deb6bc --- /dev/null +++ b/aten/src/ATen/native/mps/TensorFactory.h @@ -0,0 +1,17 @@ +// Copyright © 2022 Apple Inc. + +#define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& the_type = TYPE; \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Half, at::Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ + }() diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm new file mode 100644 index 000000000000..b0a1fe4bbcea --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Activation.mm @@ -0,0 +1,1570 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Tensor relu_mps(const Tensor& self) { + using namespace mps; + Tensor output = at::empty_like(self); + resize_tensor(&output); + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "relu" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + // passing selector of reLUWithTensor on the mpsGraph object + MPSGraphTensor* outputTensor = [mpsGraph reLUWithTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return output; +} + +Tensor & relu_mps_(Tensor & self) { + using namespace mps; + // Inplace relu + Tensor &output = self; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "relu_" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ 
MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + // passing selector of reLUWithTensor on the mpsGraph object + MPSGraphTensor* outputTensor = [mpsGraph reLUWithTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return output; +} + +TORCH_IMPL_FUNC(leaky_relu_out_mps) ( + const Tensor& self, const Scalar& negative_slope, const Tensor& output) { + using namespace mps; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream = getCurrentMPSStream(); + + @autoreleasepool { + + string key = "leaky_relu" + getTensorsStringKey({self}) + ":" + to_string(negative_slope.to()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor* negSlopeTensor = [mpsGraph constantWithScalar:negative_slope.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* negSlopeMulXTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:negSlopeTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph maximumWithPrimaryTensor:negSlopeMulXTensor + secondaryTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(leaky_relu_backward_out_mps) ( + const Tensor& grad_output, + const Tensor& self, + const Scalar& negative_slope, + bool self_is_result, + const Tensor& output ) { + + using namespace mps; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : 
MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream = getCurrentMPSStream(); + + @autoreleasepool { + + string key = "leaky_relu_backward" + getTensorsStringKey({self, grad_output}) + ":" + to_string(negative_slope.to()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor* negSlopeTensor = [mpsGraph constantWithScalar:negative_slope.to() + shape:@[@1] + dataType:getMPSScalarType(self.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSScalarType(self.scalar_type())]; + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + MPSGraphTensor* gradientsMulNegSlopeTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:negSlopeTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:gradOutputTensor + falsePredicateTensor:gradientsMulNegSlopeTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + + +TORCH_IMPL_FUNC(log_softmax_mps_out) ( + const Tensor &self, + const int64_t dim, + const bool half_to_float, + const Tensor &out) { + using namespace mps; + + if (self.numel() == 0) { + return; + } + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "log_softmax_mps_out" + getTensorsStringKey({self}) + ":" + to_string(dim); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + 
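// (Annotation, not part of the patch.) The nodes built below compute
// log_softmax directly as log(softmax(x, dim)): a softMax op over the requested
// axis followed by a logarithm op, constructed once and then reused through the
// graph cache under the key assembled above.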
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor* softmaxTensor = [mpsGraph softMaxWithTensor:inputTensor + axis:dim + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:softmaxTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(log_softmax_backward_mps_out) ( + const Tensor& grad_output, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& out) { + using namespace mps; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "log_softmax_backward_mps_out:" + getMPSTypeString(grad_output.scalar_type()) + ":" + to_string(dim); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(grad_output.scalar_type())); + MPSGraphTensor* outputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type())); + + MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:outputTensor + name:nil]; + MPSGraphTensor* sumTensor = [mpsGraph reductionSumWithTensor:gradOutputTensor + axis:dim + name:nil]; + MPSGraphTensor* multiplicationTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor + secondaryTensor:sumTensor + name:nil]; + MPSGraphTensor* resultTensor = [mpsGraph subtractionWithPrimaryTensor:gradOutputTensor + secondaryTensor:multiplicationTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->gradInputTensor_ = resultTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder resultPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, 
cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(sigmoid_out_mps)( + const Tensor& self, + const Tensor& output) { + using namespace mps; + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "sigmoid_out_mps" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + // Initialize graph + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor* outputTensor = [mpsGraph sigmoidWithTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(sigmoid_backward_out_mps)( + const Tensor& grad_output, + const Tensor& output, + const Tensor& grad_input) { + using namespace mps; + TORCH_CHECK(grad_input.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "sigmoid_backward_out_mps:" + getMPSTypeString(grad_output.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(grad_output.scalar_type())); + MPSGraphTensor* outputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type())); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* oneMinusSigmoidTensor = [mpsGraph subtractionWithPrimaryTensor:unitTensor + secondaryTensor:outputTensor + name:nil]; + MPSGraphTensor* timesTensor = [mpsGraph multiplicationWithPrimaryTensor:oneMinusSigmoidTensor + secondaryTensor:outputTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:timesTensor + name:nil]; + + 
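// (Annotation, not part of the patch.) The three nodes above encode the sigmoid
// derivative in terms of the forward output: grad_input = grad_output * output
// * (1 - output), so the backward graph never needs the original input tensor.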
newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(tanh_backward_out_mps)( + const Tensor& grad_output, + const Tensor& output, + const Tensor& grad_input) { + using namespace mps; + TORCH_CHECK(grad_input.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "tanh_backward_out_mps:" + getMPSTypeString(grad_output.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(grad_output.scalar_type())); + MPSGraphTensor* outputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type())); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* tanh2Tensor = [mpsGraph squareWithTensor:outputTensor + name:nil]; + MPSGraphTensor* oneMinusTanh2Tensor = [mpsGraph subtractionWithPrimaryTensor:unitTensor + secondaryTensor:tanh2Tensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:oneMinusTanh2Tensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : 
gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } +} + +TORCH_IMPL_FUNC(threshold_out_mps)( + const Tensor& self, + const Scalar& threshold, + const Scalar& value, + const Tensor& result) { + using namespace mps; + TORCH_CHECK(self.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "threshold_out_mps" + getTensorsStringKey({self}) + ":" + + to_string(threshold.to()) + ":" + + to_string(value.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + MPSGraphTensor *thresholdTensor = [mpsGraph constantWithScalar: threshold.to() + shape: @[@1] + dataType: getMPSDataType(self.scalar_type())]; + + MPSGraphTensor *valueTensor = [mpsGraph constantWithScalar: value.to() + shape: @[@1] + dataType: getMPSDataType(self.scalar_type())]; + + // x > threshold + MPSGraphTensor *predicateTensor = [mpsGraph greaterThanWithPrimaryTensor: inputTensor + secondaryTensor: thresholdTensor + name: nil]; + + // result = (self > threshold) ? self : value + MPSGraphTensor *outputTensor = [mpsGraph selectWithPredicateTensor: predicateTensor + truePredicateTensor: inputTensor + falsePredicateTensor: valueTensor + name: nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(threshold_backward_out_mps)( + const Tensor& grad, + const Tensor& self, + const Scalar& threshold, + const Tensor& gradInput) { + using namespace mps; + TORCH_CHECK(self.is_mps()); + TORCH_CHECK(grad.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "threshold_backward_out_mps" + getTensorsStringKey({self, grad}) + ":" + + to_string(threshold.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new 
CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *gradTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad); + + MPSGraphTensor *thresholdTensor = [mpsGraph constantWithScalar: threshold.to() + shape: @[@1] + dataType: getMPSDataType(self.scalar_type())]; + + MPSGraphTensor *zeroTensor = [mpsGraph constantWithScalar: 0.0 + dataType: inputTensor.dataType]; + + // x > threshold + MPSGraphTensor *predicateTensor = [mpsGraph greaterThanWithPrimaryTensor: inputTensor + secondaryTensor: thresholdTensor + name: nil]; + + // result = (self > threshold) ? grad : zeroTensor + MPSGraphTensor *gradInputTensor = [mpsGraph selectWithPredicateTensor: predicateTensor + truePredicateTensor: gradTensor + falsePredicateTensor: zeroTensor + name: nil]; + + newCachedGraph->gradTensor_ = gradTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradPlaceholder = Placeholder(cachedGraph->gradTensor_, grad); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, gradInput); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +MPSGraphTensor* normcdf (MPSGraph* mpsGraph, MPSGraphTensor *inputTensor) { + // (1.0f + erf(x*SQRT1_2)) * 0.5f * x; + const float SQRT1_2 = 0.707106781186547524400844362104849039f; + MPSGraphTensor *sqrt1_2 = [mpsGraph constantWithScalar:SQRT1_2 + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *onef = [mpsGraph constantWithScalar:1.0f + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *halff = [mpsGraph constantWithScalar:0.5f + shape:@[@1] + dataType:MPSDataTypeFloat32]; + + MPSGraphTensor *erfTensor = [mpsGraph multiplicationWithPrimaryTensor: inputTensor + secondaryTensor: sqrt1_2 + name : nil]; + erfTensor = [mpsGraph erfWithTensor: erfTensor name : nil]; + erfTensor = [mpsGraph additionWithPrimaryTensor: erfTensor + secondaryTensor: onef + name : nil]; + erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor + secondaryTensor: halff + name : nil]; + + return erfTensor; +} + +TORCH_IMPL_FUNC(gelu_out_mps) ( + const Tensor& self, c10::string_view approximate, const Tensor& output + ) { + using namespace mps; + TORCH_CHECK(output.is_mps()); + + // Empty output + if(output.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "gelu_out_mps" + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new 
CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, + getMPSDataType(self.scalar_type()), + getMPSShape(self)); + + MPSGraphTensor* outputTensor = normcdf(mpsGraph, inputTensor); + outputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor + secondaryTensor:inputTensor + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(gelu_backward_out_mps) ( + const Tensor& grad, const Tensor& self, c10::string_view approximate, const Tensor& grad_input + ) { + using namespace mps; + constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * (0.5); + + // Empty output + if(grad_input.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "gelu_backward_out_mps" + getTensorsStringKey({self, grad}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradTensor = mpsGraphRankedPlaceHolder(mpsGraph, + getMPSDataType(grad.scalar_type()), + getMPSShape(grad)); + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, + getMPSDataType(self.scalar_type()), + getMPSShape(self)); + MPSGraphTensor* cdf = normcdf(mpsGraph, inputTensor); + MPSGraphTensor *halff = [mpsGraph constantWithScalar:-0.5f + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *betaf = [mpsGraph constantWithScalar:kBeta + shape:@[@1] + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *pdfMul = [mpsGraph squareWithTensor : inputTensor + name : nil]; + pdfMul = [mpsGraph multiplicationWithPrimaryTensor : pdfMul + secondaryTensor : halff + name : nil]; + pdfMul = [mpsGraph exponentWithTensor : pdfMul + name : nil]; + MPSGraphTensor* pdf = [mpsGraph multiplicationWithPrimaryTensor : pdfMul + secondaryTensor : betaf + name : nil]; + pdf = [mpsGraph multiplicationWithPrimaryTensor : inputTensor + secondaryTensor : pdf + name : nil]; + pdf = [mpsGraph additionWithPrimaryTensor : pdf + secondaryTensor : cdf + name : nil]; + MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor : gradTensor + secondaryTensor : pdf + name : nil]; + + newCachedGraph->gradTensor_ = gradTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradPlaceholder = 
Placeholder(cachedGraph->gradTensor_, grad); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + +} + +void elu_variants_out_mps ( + const Tensor& self, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale, + const Tensor& result, + string func_name) { + + using namespace mps; + TORCH_CHECK(self.is_mps()); + + // Empty output + if(result.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = func_name + ":" + getTensorsStringKey({self}) + ":" + + to_string(alpha.to()) + ":" + + to_string(scale.to()) + ":" + + to_string(input_scale.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + // scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) )) + + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + + MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + + MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to() + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + + MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:inputScaleTensor + name:nil]; + MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:scaledInputTensor + name:nil]; + MPSGraphTensor* exponentMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:exponentTensor + secondaryTensor:unitTensor + name:nil]; + MPSGraphTensor* alphaTimesTensor = [mpsGraph multiplicationWithPrimaryTensor:exponentMinusOneTensor + secondaryTensor:alphaTensor + name:nil]; + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + MPSGraphTensor* fusedOutput = [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:inputTensor + falsePredicateTensor:alphaTimesTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:fusedOutput + secondaryTensor:scaleTensor + name:nil]; + + newCachedGraph->inputTensor_ = 
inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) )) +TORCH_IMPL_FUNC(elu_out_mps) ( + const Tensor& self, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale, + const Tensor& result) { + + elu_variants_out_mps(self, alpha, scale, input_scale, result, "elu_out_mps"); +} + +TORCH_IMPL_FUNC(elu_backward_out_mps) ( + const Tensor& grad_output, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale, + bool is_result, + const Tensor& self_or_result, + const Tensor& grad_input +) { + + using namespace mps; + TORCH_CHECK(grad_output.is_mps()); + + // Empty output + if(grad_input.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *resultTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" + + to_string(alpha.to()) + ":" + + to_string(scale.to()) + ":" + + to_string(input_scale.to()) + ":" + + to_string(is_result); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor* inputTensor = nil; + MPSGraphTensor* resultTensor = nil; + + MPSGraphTensor* lessThanZeroGradTensor = nil; + + if(is_result) { + resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor + secondaryTensor:alphaTensor + name:nil]; + auto constMul = scale.to() * input_scale.to(); + MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:resultPlusAlphaTensor + secondaryTensor:constMulTensor + name:nil]; + } + else { + inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result); + MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:inputScaleTensor + name:nil]; + 
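// Illustrative note (not part of the original diff): for x <= 0 the ELU
// derivative being assembled in this branch is
//   d/dx [ scale * alpha * (exp(input_scale * x) - 1) ]
//     = scale * alpha * input_scale * exp(input_scale * x),
// so the statements below multiply exp(input_scale * x) by the constant
// scale * input_scale * alpha. The is_result branch above instead derives the
// same term from the saved forward output rather than recomputing the
// exponential from the input.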
MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor + name:nil]; + auto constMul = scale.to() * input_scale.to() * alpha.to(); + MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor + secondaryTensor:constMulTensor + name:nil]; + } + + MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:scaleTensor + falsePredicateTensor:lessThanZeroGradTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor + secondaryTensor:gradOutputTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->resultTensor_ = resultTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder selfPlaceholder = Placeholder(); + Placeholder resultPlaceholder = Placeholder(); + if(is_result) + resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result); + else + selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = nil; + + if(is_result) + feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData() + }; + else + feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(silu_out_mps) ( + const Tensor& self, + const Tensor& result) { + + using namespace mps; + TORCH_CHECK(self.is_mps()); + + // Empty output + if(result.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "silu_out_mps:" + getTensorsStringKey({self}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, 
self); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(self.scalar_type())]; + MPSGraphTensor* negativeInput = [mpsGraph negativeWithTensor:inputTensor + name:nil]; + MPSGraphTensor* expNegativeTensor = [mpsGraph exponentWithTensor:negativeInput + name:nil]; + MPSGraphTensor* expPlusOneTensor = [mpsGraph additionWithPrimaryTensor:expNegativeTensor + secondaryTensor:unitTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph divisionWithPrimaryTensor:inputTensor + secondaryTensor:expPlusOneTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(silu_backward_out_mps) ( + const Tensor& grad_output, + const Tensor& self, + const Tensor& grad_input) { + + using namespace mps; + TORCH_CHECK(grad_output.is_mps()); + + // Empty output + if(grad_input.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "silu_out_backward_mps:" + getTensorsStringKey({grad_output}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* negativeInput = [mpsGraph negativeWithTensor:inputTensor + name:nil]; + MPSGraphTensor* expNegativeTensor = [mpsGraph exponentWithTensor:negativeInput + name:nil]; + MPSGraphTensor* expPlusOneTensor = [mpsGraph additionWithPrimaryTensor:expNegativeTensor + secondaryTensor:unitTensor + name:nil]; + MPSGraphTensor* sigmoidTensor = [mpsGraph reciprocalWithTensor:expPlusOneTensor + name:nil]; + MPSGraphTensor* oneMinusSigmoid = [mpsGraph subtractionWithPrimaryTensor:unitTensor + secondaryTensor:sigmoidTensor + name:nil]; + MPSGraphTensor* inputTimesDiff = [mpsGraph multiplicationWithPrimaryTensor:inputTensor + secondaryTensor:oneMinusSigmoid + name:nil]; + MPSGraphTensor* onePlusTensor = [mpsGraph additionWithPrimaryTensor:unitTensor + secondaryTensor:inputTimesDiff + name:nil]; + MPSGraphTensor* gradTensor = [mpsGraph multiplicationWithPrimaryTensor:sigmoidTensor + secondaryTensor:onePlusTensor + name:nil]; + MPSGraphTensor* 
gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor + secondaryTensor:gradOutputTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +// ------------------------------------------------- +// Hardtanh backward + +Tensor hardtanh_backward_mps + (const Tensor& grad_output, + const Tensor& self, + const Scalar& min, + const Scalar& max) { + + Tensor grad_input = at::native::empty_mps( + grad_output.sizes(), + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + grad_input = hardtanh_backward_out_mps(grad_output, self, min, max, grad_input); + return grad_input; +} + +// Hardtanh backward +Tensor& hardtanh_backward_out_mps + (const Tensor& grad_output, + const Tensor& self, + const Scalar& min, + const Scalar& max, + Tensor& grad_input) { + + using namespace mps; + TORCH_CHECK(grad_output.is_mps()); + + // Empty output + if(grad_input.numel() == 0) + return grad_input; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + string key = "hardtanh_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" + + to_string(min.to()) + ":" + + to_string(max.to()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + + // TODO: Compute gradient + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* minTensor = [mpsGraph constantWithScalar:min.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* maxTensor = [mpsGraph constantWithScalar:max.to() + shape:@[@1] + dataType:getMPSDataType(grad_output.scalar_type())]; + MPSGraphTensor* greaterThanMaxPredicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor + 
secondaryTensor:maxTensor + name:nil]; + MPSGraphTensor* lesserThanMinPredicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor + secondaryTensor:minTensor + name:nil]; + MPSGraphTensor* greaterThanMaxGradTensor = [mpsGraph selectWithPredicateTensor:greaterThanMaxPredicateTensor + truePredicateTensor:zeroTensor + falsePredicateTensor:unitTensor + name:nil]; + MPSGraphTensor* lesserThanMinGradTensor = [mpsGraph selectWithPredicateTensor:lesserThanMinPredicateTensor + truePredicateTensor:zeroTensor + falsePredicateTensor:unitTensor + name:nil]; + MPSGraphTensor* gradTensor = [mpsGraph multiplicationWithPrimaryTensor:greaterThanMaxGradTensor + secondaryTensor:lesserThanMinGradTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor + secondaryTensor:gradOutputTensor + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return grad_input; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/AdaptiveAveragePooling.mm b/aten/src/ATen/native/mps/operations/AdaptiveAveragePooling.mm new file mode 100644 index 000000000000..c82818318e9e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/AdaptiveAveragePooling.mm @@ -0,0 +1,154 @@ +// Copyright © 2022 Apple Inc. 
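// Illustrative sketch (not part of the original diff) of the parameter
// derivation used by set_kernel_params in this file: the adaptive pool is
// lowered to a plain avg_pool2d with
//   stride = isize / osize   and   kernel = isize - (osize - 1) * stride,
// which reproduces adaptive averaging exactly when isize is divisible by
// osize (e.g. 8 -> 4 gives stride 2, kernel 2).
#include <cassert>
#include <cstdint>

static void adaptive_avg_pool_params_example() {
  const int64_t isizeH = 8, osizeH = 4;                     // hypothetical sizes
  const int64_t strideH = isizeH / osizeH;                  // 2
  const int64_t kernelH = isizeH - (osizeH - 1) * strideH;  // 8 - 3*2 = 2
  assert(strideH == 2 && kernelH == 2);
}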
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + + +void set_kernel_params + (int64_t isizeH, int64_t isizeW, + int64_t osizeH, int64_t osizeW, + int64_t &strideH, int64_t &strideW, + int64_t &kernel_sizeH, int64_t &kernel_sizeW) { + + strideH = (int64_t) (isizeH / osizeH); + strideW = (int64_t) (isizeW / osizeW); + + kernel_sizeH = isizeH - (osizeH-1) * strideH; + kernel_sizeW = isizeW - (osizeW-1) * strideW; +} + +Tensor& adaptive_avg_pool2d_out_mps + (const Tensor& input, + IntArrayRef output_size, + Tensor& output) { + + for (int64_t i = 1; i < input.ndimension(); i++) { + TORCH_CHECK(input.size(i) > 0, + "adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, " + "but input has sizes ", input.sizes(), " with dimension ", i, " being " + "empty"); + } + + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); + + int64_t osizeH = output_size[0]; + int64_t osizeW = output_size[1]; + + if(input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) + TORCH_CHECK(input.ndimension() == 4, + "adaptive_avg_pool2d(): Expected 4D tensor, but got ", + input.sizes()) + + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: + case at::MemoryFormat::ChannelsLast: + break; + default: + TORCH_CHECK( + false, + "Unsupported memory format. Supports only ChannelsLast, Contiguous") + } + + int64_t strideH; + int64_t strideW; + int64_t kernel_sizeH; + int64_t kernel_sizeW; + + set_kernel_params(isizeH, isizeW, + osizeH, osizeW, + strideH, strideW, + kernel_sizeH, kernel_sizeW); + + output = at::avg_pool2d(input, + IntArrayRef({kernel_sizeH, kernel_sizeW}), + IntArrayRef({strideH, strideW}), + IntArrayRef({0, 0}), + false, + true, + c10::nullopt); + return output; +} + +Tensor adaptive_avg_pool2d_mps + (at::Tensor const& input, + IntArrayRef output_size) { + + IntArrayRef output_shape; + + auto osizeH = output_size[0]; + auto osizeW = output_size[1]; + + std::vector out_dims = {}; + + if(input.ndimension() == 4) { + auto sizeB = input.size(0); + auto sizeD = input.size(1); + + out_dims.push_back(sizeB); + out_dims.push_back(sizeD); + out_dims.push_back(osizeH); + out_dims.push_back(osizeW); + output_shape = IntArrayRef(out_dims); + } + else { + auto sizeD = input.size(0); + out_dims.push_back(sizeD); + out_dims.push_back(osizeH); + out_dims.push_back(osizeW); + output_shape = IntArrayRef(out_dims); + } + + const auto memory_format = input.suggest_memory_format(); + Tensor output = at::native::empty_mps( + output_shape, + input.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + return adaptive_avg_pool2d_out_mps(input, output_size, output); + +} + +Tensor adaptive_avg_pool2d_backward_mps + (const Tensor& gradOutput, + const Tensor& input) { + + int64_t isizeH = input.size(-2); + int64_t isizeW = input.size(-1); + int64_t osizeH = gradOutput.size(-2); + int64_t osizeW = gradOutput.size(-1); + + int64_t strideH, strideW, kernel_sizeH, kernel_sizeW; + + set_kernel_params(isizeH, isizeW, + osizeH, osizeW, + strideH, strideW, + kernel_sizeH, kernel_sizeW); + auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (gradInput.numel() != 0) + gradInput = at::avg_pool2d_backward(gradOutput, + input, + IntArrayRef({kernel_sizeH, kernel_sizeW}), + IntArrayRef({strideH, strideW}), + IntArrayRef({0, 0}), + false, + true, + c10::nullopt); + + return gradInput; + +} + +} +} diff --git 
a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm new file mode 100644 index 000000000000..1a3e4155dac8 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -0,0 +1,332 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +struct BinaryOpCachedGraph : public MPSCachedGraph +{ + BinaryOpCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *primaryTensor = nil, *secondaryTensor = nil, *outputTensor = nil; +}; + +typedef MPSGraphTensor* (^BinaryOpBlock)(MPSGraph*, MPSGraphTensor*, MPSGraphTensor*); +#define BinaryOpFn() MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* primary, MPSGraphTensor* secondary) + +void binaryOpTensor(const Tensor& self_t, const Tensor& other_t, const Tensor& output, std::string op_name, BinaryOpBlock binaryBlock) +{ + // it's possible to receive empty tensors here + if (self_t.numel() == 0 || other_t.numel() == 0) { + return; + } + + const bool is_self_scalar = self_t.dim() == 0; + const bool is_other_scalar = other_t.dim() == 0; + Tensor self = is_self_scalar ? self_t : self_t.contiguous(at::MemoryFormat::Contiguous); + Tensor other = is_other_scalar ? other_t : other_t.contiguous(at::MemoryFormat::Contiguous); + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + @autoreleasepool { + string key = op_name + getTensorsStringKey({self, other}); + BinaryOpCachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () { + BinaryOpCachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new BinaryOpCachedGraph(mpsGraph); + newCachedGraph->primaryTensor = !is_self_scalar ? mpsGraphRankedPlaceHolder(mpsGraph, self) : + mpsGraphConstantPlaceHolder(mpsGraph, getMPSScalarValue(self), getMPSShape(other), + // if other is scalar too, then use self's data type here and let the other + // have the same data type as self in the secondaryTensor + getMPSDataType((!is_other_scalar ? other : self).scalar_type())); + + newCachedGraph->secondaryTensor = !is_other_scalar ? mpsGraphRankedPlaceHolder(mpsGraph, other) : + mpsGraphConstantPlaceHolder(mpsGraph, getMPSScalarValue(other), getMPSShape(self), + // regardless of self's data type, the scondaryTensor's type must match it. 
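// Illustrative note (not part of the original diff): 0-dim scalar operands
// are folded into the graph as constant placeholders shaped like the other
// operand, which is why the feeds dictionary assembled below only receives
// placeholders for the non-scalar inputs.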
+ getMPSDataType(self.scalar_type())); + newCachedGraph->outputTensor = binaryBlock(mpsGraph, newCachedGraph->primaryTensor, newCachedGraph->secondaryTensor); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + if (!is_self_scalar) { + Placeholder selfPlaceholder = Placeholder(cachedGraph->primaryTensor, self); + feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); + } + if (!is_other_scalar) { + Placeholder otherPlaceholder = Placeholder(cachedGraph->secondaryTensor, other); + feeds[otherPlaceholder.getMPSGraphTensor()] = otherPlaceholder.getMPSGraphTensorData(); + } + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +void binaryOpScalar(const Tensor& self, const Scalar& other, const Tensor& output, std::string op_name, BinaryOpBlock binaryBlock) +{ + binaryOpTensor(self, wrapped_scalar_tensor(other), output, op_name, binaryBlock); +} + +void div_mode_template(const Tensor& self, const Tensor& other, + c10::optional rounding_mode, + const Tensor& output, const string op_name) +{ + BinaryOpBlock div_mode_op_block = ^BinaryOpFn() { + MPSGraphTensor* divTensor = [mpsGraph divisionWithPrimaryTensor:primary + secondaryTensor:secondary + name:nil]; + if (!rounding_mode.has_value()) { + return divTensor; + } else if (*rounding_mode == "trunc") { + return trunc_tensor(mpsGraph, divTensor); + } else if (*rounding_mode == "floor") { + return [mpsGraph floorWithTensor:divTensor name:nil]; + } + assert(0 && "Invalid rounding mode\n"); + return nullptr; + }; + binaryOpTensor(self, other, output, op_name + "_out_mps:" + (rounding_mode.has_value() ? 
c10::str(*rounding_mode) : ""), div_mode_op_block); +} + +void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output, std::string op_name) +{ + BinaryOpBlock add_sub_op_block = ^BinaryOpFn() { + double alpha_val = alpha.toDouble(); + MPSGraphTensor* secondaryTensor = secondary; + + // if alpha is 1.0, then we don't bother adding another multiply to graph + if (alpha_val != 1.0) { + MPSGraphTensor* alphaTensor = mpsGraphConstantPlaceHolder(mpsGraph, alpha_val, getMPSShape(other), getMPSDataType(other.scalar_type())); + secondaryTensor = [mpsGraph multiplicationWithPrimaryTensor:secondary + secondaryTensor:alphaTensor + name:nil]; + } + if (op_name == "add") + return [mpsGraph additionWithPrimaryTensor:primary + secondaryTensor:secondaryTensor + name:nil]; + else + return [mpsGraph subtractionWithPrimaryTensor:primary + secondaryTensor:secondaryTensor + name:nil]; + }; + binaryOpTensor(self, other, output, op_name + "_out_mps:" + std::to_string(alpha.toDouble()), add_sub_op_block); +} + +} // namespace mps + +#define CREATE_MPS_BINARY_OP_FUNC(func_out, func_stub, other_type) \ +TORCH_IMPL_FUNC(func_out) (const Tensor& self, const other_type& other, const Tensor& output) { \ + mps::binaryOp##other_type(self, other, output, #func_stub, \ + ^BinaryOpFn() { \ + return [mpsGraph func_stub##WithPrimaryTensor:primary \ + secondaryTensor:secondary \ + name:nil]; }); \ +} + +// Boolean Ops require casting output to "MPSDataTypeBool" +#define CREATE_MPS_BOOLEAN_OP_FUNC(func_out, func_stub, other_type) \ +TORCH_IMPL_FUNC(func_out) (const Tensor& self, const other_type& other, const Tensor& output) { \ + mps::binaryOp##other_type(self, other, output, #func_stub, \ + ^BinaryOpFn() { \ + MPSGraphTensor* outputTensor = [mpsGraph func_stub##WithPrimaryTensor:primary \ + secondaryTensor:secondary \ + name:nil]; \ + return [mpsGraph castTensor:outputTensor toType:MPSDataTypeBool name:@"boolOut"]; }); \ +} + +// Boolean Binary Ops +CREATE_MPS_BOOLEAN_OP_FUNC(eq_scalar_out_mps, equal, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(eq_tensor_out_mps, equal, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(ne_scalar_out_mps, notEqual, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(ne_tensor_out_mps, notEqual, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(le_scalar_out_mps, lessThanOrEqualTo, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(le_tensor_out_mps, lessThanOrEqualTo, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(lt_scalar_out_mps, lessThan, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(lt_tensor_out_mps, lessThan, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(ge_scalar_out_mps, greaterThanOrEqualTo, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(ge_tensor_out_mps, greaterThanOrEqualTo, Tensor); +CREATE_MPS_BOOLEAN_OP_FUNC(gt_scalar_out_mps, greaterThan, Scalar); +CREATE_MPS_BOOLEAN_OP_FUNC(gt_tensor_out_mps, greaterThan, Tensor); + +// Arithmetic Binary Ops +CREATE_MPS_BINARY_OP_FUNC(minimum_out_mps, minimum, Tensor); +CREATE_MPS_BINARY_OP_FUNC(maximum_out_mps, maximum, Tensor); +CREATE_MPS_BINARY_OP_FUNC(mul_out_mps, multiplication, Tensor); +CREATE_MPS_BINARY_OP_FUNC(pow_tensor_scalar_out_mps, power, Scalar); +CREATE_MPS_BINARY_OP_FUNC(pow_tensor_tensor_out_mps, power, Tensor); +CREATE_MPS_BINARY_OP_FUNC(atan2_mps_out, atan2, Tensor); + + +TORCH_IMPL_FUNC(div_out_mode_mps) (const Tensor& self, const Tensor& other, c10::optional rounding_mode, const Tensor& output) { + mps::div_mode_template(self, other, rounding_mode, output, "div_mode"); +} + +TORCH_IMPL_FUNC(div_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) { + 
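// Illustrative note (not part of the original diff): true division reuses the
// rounding-mode template with c10::nullopt, so the trunc_tensor / floor
// post-processing inside div_mode_template is skipped and the raw quotient is
// returned.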
mps::div_mode_template(self, other, c10::nullopt, output, "div"); +} + +TORCH_IMPL_FUNC(add_out_mps) (const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { + mps::add_sub_template(self, other, alpha, output, "add"); +} + +TORCH_IMPL_FUNC(sub_out_mps) (const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { + mps::add_sub_template(self, other, alpha, output, "sub"); +} + + +TORCH_IMPL_FUNC(logaddexp_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) +{ + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + if (&output != &self) { + output.resize_(self.sizes());; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "log_base_e_out_mps:" + getTensorsStringKey({self, other}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + MPSGraphTensor* xTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* yTensor = mpsGraphRankedPlaceHolder(mpsGraph, other); + MPSGraphTensor* ePowXTensor = [mpsGraph exponentWithTensor:xTensor + name:nil]; + MPSGraphTensor* ePowYTensor = [mpsGraph exponentWithTensor:yTensor + name:nil]; + MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:ePowXTensor + secondaryTensor:ePowYTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:sumTensor + name:nil]; + + newCachedGraph->inputTensor_ = xTensor; + newCachedGraph->otherTensor_ = yTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + } + +TORCH_IMPL_FUNC(logaddexp2_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) +{ + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + if (&output != &self) { + output.resize_(self.sizes());; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "log_base_two_out_mps:" + getTensorsStringKey({self, other}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = 
cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + MPSGraphTensor* xTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* yTensor = mpsGraphRankedPlaceHolder(mpsGraph, other); + MPSGraphTensor* twoPowXTensor = [mpsGraph exponentBase2WithTensor:xTensor + name:nil]; + MPSGraphTensor* twoPowYTensor = [mpsGraph exponentBase2WithTensor:yTensor + name:nil]; + MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:twoPowXTensor + secondaryTensor:twoPowYTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph logarithmBase2WithTensor:sumTensor + name:nil]; + + newCachedGraph->inputTensor_ = xTensor; + newCachedGraph->otherTensor_ = yTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm new file mode 100644 index 000000000000..7ab34ac31401 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -0,0 +1,196 @@ +// Copyright © 2022 Apple Inc. 
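// Hedged reference (not part of the original diff) for the two kernels in
// this file: dot_mps reduces an elementwise product with reductionSum, and
// addmv_out_mps_impl assembles result = beta * self + alpha * (mat @ vec),
// skipping the beta * self term entirely when beta == 0.
#include <cstddef>
#include <vector>

static std::vector<double> addmv_reference(
    const std::vector<std::vector<double>>& mat,  // hypothetical row-major matrix
    const std::vector<double>& vec,
    const std::vector<double>& self,
    double beta, double alpha) {
  std::vector<double> out(mat.size(), 0.0);
  for (std::size_t i = 0; i < mat.size(); ++i) {
    double acc = 0.0;
    for (std::size_t j = 0; j < vec.size(); ++j) {
      acc += mat[i][j] * vec[j];                  // (mat @ vec)[i]
    }
    out[i] = alpha * acc + (beta != 0.0 ? beta * self[i] : 0.0);
  }
  return out;
}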
+ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + + +namespace at { +namespace native { + + +Tensor dot_mps( + const Tensor &self, + const Tensor &other) +{ + using namespace mps; + auto output = at::native::empty_mps({}, self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* selfTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "dot_mps" + getTensorsStringKey({self, other}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); + + MPSGraphTensor *dot = [mpsGraph multiplicationWithPrimaryTensor: selfTensor + secondaryTensor: otherTensor + name: @"multiplication"]; + + MPSGraphTensor *dotProductTensor = [mpsGraph reductionSumWithTensor: dot + axes: nil + name: @"dotProduct"]; + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->outputTensor_ = dotProductTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +Tensor& addmv_out_mps_impl( + const Tensor &self, + const Tensor &mat, + const Tensor &vec, + const Scalar& beta_, + const Scalar& alpha_, + Tensor& result) +{ + using namespace mps; + + TORCH_CHECK(mat.is_mps()); + TORCH_CHECK(vec.is_mps()); + TORCH_CHECK(result.is_mps()); + TORCH_CHECK(self.is_mps()); + + c10::MaybeOwned self_ = expand_size(self, {mat.size(0)}); + auto betaval = beta_.toComplexDouble(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor_ = nil; + MPSGraphTensor *matMulVecTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + MPSStream *stream = at::mps::getCurrentMPSStream(); + Tensor matMulVec = mm(mat, vec.unsqueeze(1)).squeeze(1); + + @autoreleasepool { + string key = "addmv_out_mps_impl" + getTensorsStringKey({self, matMulVec}) + + ":" + to_string(beta_.toDouble()) + + ":" + to_string(alpha_.toDouble()); + CachedGraph* cachedGraph = nil; + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ 
mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *matMulVecTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, matMulVec); + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + + // Intermediates for beta and alpha + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar: alpha_.toDouble() + dataType: getMPSScalarType(mat.scalar_type())]; + + // Intermediates for multiplying by beta and alpha + MPSGraphTensor* productTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor:matMulVecTensor + secondaryTensor:alphaTensor + name:@"MM/alpha*(mat@vec)"]; + newCachedGraph->outputTensor_ = productTimesAlphaTensor; + + if (betaval != 0.0) + { + MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar: beta_.toDouble() + dataType: getMPSScalarType(self.scalar_type())]; + + MPSGraphTensor* selfTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor: selfTensor + secondaryTensor: betaTensor + name: @"MM/beta*input"]; + + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor: productTimesAlphaTensor + secondaryTensor: selfTimesBetaTensor + name: @"MM/beta*input + alpha*(mat@vec)"]; + + newCachedGraph->outputTensor_ = outputTensor; + } + + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->matMulVecTensor_ = matMulVecTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder matMulVecPlaceholder = Placeholder(cachedGraph->matMulVecTensor_, matMulVec); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + NSMutableDictionary* feeds =[NSMutableDictionary dictionary]; + feeds[matMulVecPlaceholder.getMPSGraphTensor()] = matMulVecPlaceholder.getMPSGraphTensorData(); + if (betaval != 0.0) + { + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return result; +} + +TORCH_IMPL_FUNC(addmv_out_mps)(const Tensor &self, const Tensor &mat, const Tensor &vec, const Scalar& beta_, const Scalar& alpha_, const Tensor& result) { + addmv_out_mps_impl(self, mat, vec, beta_, alpha_, const_cast(result)); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/ConstantOps.mm b/aten/src/ATen/native/mps/operations/ConstantOps.mm new file mode 100644 index 000000000000..09e962b94f78 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/ConstantOps.mm @@ -0,0 +1,94 @@ +// Copyright © 2022 Apple Inc. 
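// Illustrative sketch (not part of the original diff): the fill kernels below
// build a graph whose only nodes are a constant of the requested value/shape
// and an identity, then bind the output placeholder to self so the result is
// written straight into self's storage -- the MPS counterpart of this trivial
// CPU loop.
#include <cstddef>

static void fill_reference(float* data, std::size_t n, float value) {
  for (std::size_t i = 0; i < n; ++i) {
    data[i] = value;  // every element takes the scalar fill value
  }
}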
+ +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Tensor& fill_scalar_mps_impl(Tensor& self, const Scalar& value) { + using namespace mps; + + if (self.numel() == 0) { + return self; + } + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "fill_scalar_mps_impl:" + getMPSTypeString(self.scalar_type()) + + ":" + string([ns_shape_key UTF8String]) + + ":" + to_string(value.toDouble()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = [mpsGraph constantWithScalar:value.toDouble() + shape:input_shape + dataType:getMPSScalarType(self.scalar_type())]; + MPSGraphTensor* outputTensor = [mpsGraph identityWithTensor:inputTensor + name:nil]; + + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self); + + NSDictionary* feeds = nil; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return self; +} + +Tensor& zero_mps_(Tensor& self) { + return at::native::fill_scalar_mps_impl(self, 0.0f); +} + +Tensor& fill_scalar_mps(Tensor& self, const Scalar& value) { + return at::native::fill_scalar_mps_impl(self, value); +} + +Tensor& fill_tensor_mps_(Tensor& self, const Tensor& value) { + TORCH_CHECK(value.dim() == 0, "fill_ only supports 0-dimension value tensor but got tensor with ", value.dim(), " dimensions."); + return at::native::fill_scalar_mps_impl(self, value.item()); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm new file mode 100644 index 000000000000..40327536b564 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -0,0 +1,508 @@ +// Copyright © 2022 Apple Inc. 
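// Assumed output-size relation (sketch, not part of the original diff) behind
// the conv_output_size call used by _mps_convolution below; this is the usual
// cross-correlation sizing formula, stated here only for orientation:
//   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
#include <cstdint>

static int64_t conv_output_dim(int64_t in, int64_t kernel, int64_t pad,
                               int64_t stride, int64_t dilation) {
  return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}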
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +// Create convolution descriptor +void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_, + NSUInteger strideInX, NSUInteger strideInY, + NSUInteger dilationRateInX, NSUInteger dilationRateInY, + NSUInteger paddingHorizontal, NSUInteger paddingVertical, + c10::MemoryFormat memory_format, NSUInteger groups) { + descriptor_.strideInX = strideInX; + descriptor_.strideInY = strideInY; + descriptor_.dilationRateInX = dilationRateInX; + descriptor_.dilationRateInY = dilationRateInY; + + // TODO: Program the padding style + descriptor_.paddingStyle = MPSGraphPaddingStyleExplicit; + + descriptor_.paddingLeft = paddingHorizontal; + descriptor_.paddingRight = paddingHorizontal; + descriptor_.paddingTop = paddingVertical; + descriptor_.paddingBottom = paddingVertical; + + descriptor_.dataLayout = (memory_format == at::MemoryFormat::Contiguous) ? + MPSGraphTensorNamedDataLayoutNCHW : MPSGraphTensorNamedDataLayoutNHWC; + descriptor_.weightsLayout = (memory_format == at::MemoryFormat::Contiguous) ? + MPSGraphTensorNamedDataLayoutOIHW : MPSGraphTensorNamedDataLayoutHWIO; + descriptor_.groups = groups; +} + +Tensor _mps_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const c10::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups) { + namespace native_mps = at::native::mps; + CheckedFrom c = "mps_convolution"; + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + checkAllSameType(c, {input, weight}); + checkAllSameGPU(c, {input, weight}); + + bool bias_defined; + + if(bias_opt == c10::nullopt) + bias_defined = false; + else + bias_defined = bias_opt->defined(); + + auto memory_format = input_t.suggest_memory_format(); + auto output_t = at::empty( + conv_output_size(input->sizes(), weight->sizes(), + padding, stride, dilation), + input->scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + if (output_t.numel() == 0) { + return output_t; + } + TensorArg output{ output_t, "result", 0 }; + + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* biasTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + IntArrayRef bias_shape; + if(bias_defined) + bias_shape = bias_opt.value().sizes(); + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string bias_shape_key; + if(bias_defined) + bias_shape_key = to_string(bias_shape[0]); + else + bias_shape_key = "nobias"; + + string key = "mps_convolution:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":" + + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":" + + to_string(padding[0]) + ":" + to_string(padding[1]) + ":" + + to_string(groups) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t, weight_t}) + ":" + + 
to_string(bias_defined) + ":" + bias_shape_key; + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphConvolution2DOpDescriptor *descriptor_ = [MPSGraphConvolution2DOpDescriptor new]; + fill_conv_desc(descriptor_, stride[0], stride[1], + dilation[0], dilation[1], + padding[1], padding[0], + memory_format, groups); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); + MPSGraphTensor* biasTensor = nil; + if(bias_defined) + biasTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType((bias_opt.value()).scalar_type())); + + MPSGraphTensor* outputTensor = [mpsGraph convolution2DWithSourceTensor:inputTensor + weightsTensor:weightTensor + descriptor:descriptor_ + name:nil]; + + if(bias_defined) { + outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor + secondaryTensor:biasTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto weightsPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_t); + auto biasPlaceholder = native_mps::Placeholder(); + // Reshape the bias to be broadcastable with output of conv2d + if(bias_defined) + biasPlaceholder = native_mps::Placeholder(cachedGraph->biasTensor_, (bias_opt.value()).view({1, bias_shape[0], 1, 1})); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, *output); + + NSMutableDictionary* feeds = [[NSMutableDictionary alloc] initWithCapacity: 3]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); + if(bias_defined) { + feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return *output; +} + +Tensor mps_convolution_backward_input( + IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { + namespace native_mps = at::native::mps; + using namespace mps; + CheckedFrom c = "mps_convolution_backward_input"; + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; + checkAllSameType(c, {grad_output, weight}); + checkAllSameGPU(c, {grad_output, weight}); + auto memory_format = grad_output_t.suggest_memory_format(); + auto grad_input_t = at::empty( + input_size, + grad_output->scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + // Avoid "grad_input" when this is being used as transposed convolution + TensorArg grad_input{ grad_input_t, "result", 0 }; + 
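// Descriptive note (not part of the original diff): after the shape check,
// the cached graph below calls
// convolution2DDataGradientWithIncomingGradientTensor with the forward
// descriptor and the requested input_size, so grad_input is produced directly
// at the original input shape from grad_output and the forward weights.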
convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + // Add backward with input + @autoreleasepool { + + MPSStream* stream = getCurrentMPSStream(); + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + MPSShape* mps_input_shape = getMPSShape(input_size); + + NSString* ns_shape_key = [[mps_input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "mps_convolution_backward_input:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":" + + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":" + + to_string(padding[0]) + ":" + to_string(padding[1]) + ":" + + to_string(groups) + ":" + mem_format_key + + getTensorsStringKey({grad_output_t, weight_t}) + ":" + + string([ns_shape_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphConvolution2DOpDescriptor *descriptor_ = [MPSGraphConvolution2DOpDescriptor new]; + fill_conv_desc(descriptor_, stride[0], stride[1], + dilation[0], dilation[1], + padding[1], padding[0], + memory_format, groups); + + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t); + MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t); + + MPSGraphTensor* gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensor + weightsTensor:weightTensor + outputShape:mps_input_shape + forwardConvolutionDescriptor:descriptor_ + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t); + auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); + auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); + + NSDictionary *feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + weightsPlaceholder.getMPSGraphTensor() : weightsPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + return *grad_input; +} + +Tensor mps_convolution_backward_weights( + IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) { + 
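// Descriptive note (not part of the original diff): this helper correlates
// grad_output with the forward input via
// convolution2DWeightsGradientWithIncomingGradientTensor to produce a tensor
// of shape weight_size; bias_defined is accepted in the signature but no bias
// gradient is computed in this function.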
namespace native_mps = at::native::mps; + using namespace mps; + CheckedFrom c = "mps_convolution_backward_weights"; + auto memory_format = input_t.suggest_memory_format(); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + TensorArg input{ input_t, "input", 2}; + + checkAllSameType(c, {grad_output, input}); + checkAllSameGPU(c, {grad_output, input}); + + auto grad_weight_t = at::empty(weight_size, grad_output_t.options(), memory_format); + TensorArg grad_weight{ grad_weight_t, "result", 0 }; + + convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups); + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* gradWeightTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSStream* stream = getCurrentMPSStream(); + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + MPSShape* mps_weight_shape = getMPSShape(weight_size); + + NSString* ns_shape_key = [[mps_weight_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "mps_convolution_backward_weights:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":" + + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":" + + to_string(padding[0]) + ":" + to_string(padding[1]) + ":" + + to_string(groups) + ":" + mem_format_key + + getTensorsStringKey({grad_output_t, input_t}) + ":" + + string([ns_shape_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphConvolution2DOpDescriptor *descriptor_ = [MPSGraphConvolution2DOpDescriptor new]; + fill_conv_desc(descriptor_, stride[0], stride[1], + dilation[0], dilation[1], + padding[1], padding[0], + memory_format, groups); + + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + + MPSGraphTensor* gradWeightTensor = [mpsGraph convolution2DWeightsGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + outputShape:mps_weight_shape + forwardConvolutionDescriptor:descriptor_ + name:nil]; + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradWeightTensor_ = gradWeightTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t); + + NSDictionary *feeds = @{ + 
gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return grad_weight_t; +} + +std::tuple mps_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); + + Tensor grad_input, grad_weight, grad_bias; + if (input.numel() == 0) { + if (output_mask[0]) { + grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[1]) { + grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + } else { + if (output_mask[0]) { + grad_input = mps_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, output_mask[2]); + } + if (output_mask[1]) { + grad_weight = mps_convolution_backward_weights(weight.sizes(), grad_output, input, padding, stride, dilation, groups, output_mask[2]); + } + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +Tensor mps_convolution_transpose_forward( + const Tensor& grad_output, const Tensor& weight, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + auto input_size = conv_input_size(grad_output.sizes(), weight.sizes(), + padding, output_padding, stride, dilation, groups); + return mps_convolution_backward_input(input_size, grad_output, weight, + padding, stride, dilation, groups, false); +} + +Tensor _mps_convolution_transpose( + const Tensor& input_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups) { + + auto output_t = mps_convolution_transpose_forward( + input_t, weight_t, padding, output_padding, stride, dilation, groups); + return output_t; + +} + +Tensor mps_convolution_transpose_backward_input( + const Tensor& grad_output_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups) +{ + return at::_mps_convolution( + grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups); +} + +Tensor mps_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + return mps_convolution_backward_weights( + weight_size, input_t, grad_output_t, + padding, stride, dilation, groups, false); +} + + +std::tuple mps_convolution_transpose_backward( + const Tensor& input, const Tensor& grad_output_t, const Tensor& weight, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); + + Tensor grad_input, grad_weight; + if (output_mask[0]) { + grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups); + } + if (output_mask[1]) { + grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups); + } + + return 
std::tuple{grad_input, grad_weight}; +} + + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm new file mode 100644 index 000000000000..ec7dce287a2e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -0,0 +1,408 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +MPSGraphTensor* chainViewOperation(MPSGraph* mpsGraph, IntArrayRef size, + IntArrayRef stride, int64_t storage_offset, + MPSGraphTensor* inputTensor, const Tensor& self) { + MPSGraphTensor *outputTensor = nil; + @autoreleasepool { + int32_t* sizeArray = new int32_t[size.size()](); + for (int i = 0; i < size.size(); i++) { + sizeArray[i] = size[i]; + } + NSData* shapeData = [NSData dataWithBytes : sizeArray + length : size.size()*sizeof(int32_t)]; + + MPSGraphTensor* shapeTensor = [mpsGraph constantWithData : shapeData + shape : @[[NSNumber numberWithUnsignedInteger: size.size()]] + dataType : MPSDataTypeInt32]; + MPSGraphTensor* storageOffsetTensor = [mpsGraph constantWithScalar : storage_offset + dataType : MPSDataTypeInt32]; + MPSGraphTensor* strideTensor = [mpsGraph constantWithScalar : stride[self.dim()-1] + dataType : MPSDataTypeInt32]; + MPSGraphTensor* rangeTensor = [mpsGraph coordinateAlongAxis:-1 + withShapeTensor : shapeTensor + name : nil]; + MPSGraphTensor* indexTensor = [mpsGraph multiplicationWithPrimaryTensor : rangeTensor + secondaryTensor : strideTensor + name : nil]; + MPSGraphTensor* indicesTensor = indexTensor; + // create stride Tensors for each rank of the input tensor + for (int i = 1; i < self.dim(); i++) { + strideTensor = [mpsGraph constantWithScalar : stride[self.dim() - i - 1] + dataType : MPSDataTypeInt32]; + MPSGraphTensor* rangeTensor = [mpsGraph coordinateAlongAxis: (-i-1) + withShapeTensor : shapeTensor + name : nil]; + MPSGraphTensor* indexTensor = [mpsGraph multiplicationWithPrimaryTensor : rangeTensor + secondaryTensor : strideTensor + name : nil]; + indicesTensor = [mpsGraph additionWithPrimaryTensor : indexTensor + secondaryTensor : indicesTensor + name : nil]; + } + indicesTensor = [mpsGraph additionWithPrimaryTensor : indicesTensor + secondaryTensor : storageOffsetTensor + name : nil]; + MPSGraphTensor *reshapedInputTensor = [mpsGraph reshapeTensor:inputTensor + withShape:@[@-1] + name:nil]; + MPSGraphTensor *reshapedIndicesTensor = [mpsGraph reshapeTensor:indicesTensor + withShape:@[@-1] + name:nil]; + // Call gather to coalesce the needed values. Result will be of same shape as flattened indices tensor + MPSGraphTensor *gatheredTensor = [mpsGraph gatherWithUpdatesTensor:reshapedInputTensor + indicesTensor:reshapedIndicesTensor + axis:0 + batchDimensions:0 + name:nil]; + + delete[] sizeArray; + // Reshape the data to desired size + outputTensor = [mpsGraph reshapeTensor:gatheredTensor + withShapeTensor:shapeTensor + name:nil]; + } + return outputTensor; +} + + +// There are few cases we need to consider: +// Here nodes are the Tensors and the edges are the operations performed on the +// Tensor. As a result of the operation performed we can have result as View +// Tensor (View T) or a Non view tensor (NonView T). The difference is if its +// mapped by the same underlying storage ptr or a new MTLBuffer was allocated. 
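chainViewOperation above lowers an arbitrary strided view into a flat gather: for every output coordinate it accumulates coordinate × stride per dimension, adds the storage offset, and gathers from the flattened input. Below is a minimal CPU sketch of the same indexing rule, assuming a float storage and last-dimension-fastest coordinate order; it is illustrative only, not the MPSGraph path.

```cpp
// CPU reference for the view-gather rule: the element at view coordinates
// (c0, ..., ck-1) lives at flat offset storage_offset + sum_d c_d * stride_d.
#include <cstdint>
#include <vector>

std::vector<float> gather_strided_view(const std::vector<float>& storage,
                                       const std::vector<int64_t>& size,
                                       const std::vector<int64_t>& stride,
                                       int64_t storage_offset) {
  int64_t numel = 1;
  for (int64_t s : size) numel *= s;
  std::vector<float> out(static_cast<size_t>(numel));
  std::vector<int64_t> coord(size.size(), 0);
  for (int64_t linear = 0; linear < numel; ++linear) {
    int64_t offset = storage_offset;
    for (size_t d = 0; d < size.size(); ++d) offset += coord[d] * stride[d];
    out[static_cast<size_t>(linear)] = storage[static_cast<size_t>(offset)];
    // advance the multi-dimensional coordinate (last dimension fastest)
    for (int64_t d = static_cast<int64_t>(size.size()) - 1; d >= 0; --d) {
      if (++coord[d] < size[d]) break;
      coord[d] = 0;
    }
  }
  return out;
}
```

The comment and diagram that follow in the original source describe when several such view graphs end up chained on the same underlying storage.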
+// T = Tensor +// ---------- +// | Orig T | +// ---------- +// / | \ +// View T View T NonView T +// / / \ | +// View T / \ | +// | / \ | +// | / \ | +// | / \ | +// NonView T NonView T +// +// +Tensor as_strided_tensorimpl_mps(const Tensor& self, IntArrayRef size, + IntArrayRef stride, + optional storage_offset_) { + using namespace mps; + // Use the size and stride to create a unique key + auto result = detail::make_tensor( + c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype()); + auto storage_offset = storage_offset_.value_or(self.storage_offset()); + setStrided(result, size, stride, storage_offset); + + // 0 sizes won't result in any change in the shape of the Tensor so we can + // skip it. Also if the memory is contiguous we don't need to do + // gather-scatter operations using graph. + if (size.size() > 0 && (!result.is_contiguous())) { + + // If self itself was a view tensor, that means we need to chain the graphs + // else we will create a new entry in the cache + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + IntArrayRef size_; + IntArrayRef stride_; + int64_t storage_offset_; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string lookup_key = mps::getStridedKey(self, self.sizes(), self.strides(), + self.storage_offset()); +#if _DEBUG + std::cout << "Lookup key " << lookup_key << std::endl; +#endif + CachedGraph* cachedGraph = static_cast(cache_->LookUp(lookup_key)); + + if(!cachedGraph) { + string insert_key = mps::getStridedKey(self,size, stride, storage_offset); +#if _DEBUG + std::cout << "Insert key " << insert_key << std::endl; +#endif + CachedGraph* insertCachedGraph = static_cast(cache_->LookUp(insert_key)); + if (!insertCachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(insert_key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // Self is the input tensor we are creating view of + MPSGraphTensor* inputTensor = [mpsGraph placeholderWithShape : getMPSShape(self) + dataType : getMPSDataType(self.scalar_type()) + name : nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = chainViewOperation(mpsGraph, size, + stride, + storage_offset, + inputTensor, + self); + newCachedGraph->size_ = size; + newCachedGraph->stride_ = stride; + newCachedGraph->storage_offset_ = storage_offset; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + } else { + // Else part takes care of the chaining where multiple view operations + // were implemented on the same underlying data storage ptr + cachedGraph->outputTensor_ = chainViewOperation(cachedGraph->graph(), + size, stride, storage_offset, + cachedGraph->outputTensor_, self); + cachedGraph->size_ = size; + cachedGraph->stride_ = stride; + cachedGraph->storage_offset_ = storage_offset; + } + } + } + return result; +} + +namespace mps { + +void* pageAlignedBlockPtr( + const void* ptr, + NSUInteger size, + NSUInteger* alignedBlockSize) { + uintptr_t address = (uintptr_t)ptr; + uintptr_t alignedAddress = address & ~(PAGE_SIZE - 1); + uintptr_t alignedEnd = ((address + size) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + uint64_t alignedLength = alignedEnd - alignedAddress; + + assert(address >= alignedAddress); + assert(address + size <= 
alignedAddress + alignedLength); + + *alignedBlockSize = alignedLength; + return (void*)alignedAddress; +} + +static at::Tensor& copy_from_mps_(at::Tensor& self, const at::Tensor& src, + bool non_blocking) { + + using namespace mps; + id device = MPSDevice::getInstance()->device(); + MPSStream* stream = getCurrentMPSStream(); + uint64_t size = src.nbytes(); + if (size == 0) return self; + void* host_dst = self.data_ptr(); + + // MTLContext* context = static_cast(device->device_handle); + auto storage_byte_offset = src.storage_offset() * src.itemsize(); + id sourceBuffer = __builtin_bit_cast(id, src.storage().data()); + + if (!src.is_contiguous()) { + id gatherTensor = gatherViewTensor(src, sourceBuffer); + if (gatherTensor) { + sourceBuffer = gatherTensor; + storage_byte_offset = 0; + } + } + + if (sourceBuffer == nil) return self; + NSUInteger destOffset = 0; + + @autoreleasepool { + MTLResourceOptions options = MTLResourceOptionCPUCacheModeDefault | MTLResourceStorageModeShared; + NSUInteger alignedLength = 0; + + void* alignedPtr = pageAlignedBlockPtr(host_dst, (NSUInteger)size, &alignedLength); + id destBuffer = [device newBufferWithBytesNoCopy:alignedPtr + length:alignedLength + options:options + deallocator:nil]; + destOffset = uintptr_t(host_dst) - uintptr_t(alignedPtr); + // 4 bytes alignment required on macos for blits. + TORCH_CHECK(destOffset % 4 == 0, "Unaligned blit request"); + + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + id blitEncoder = + [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:(NSUInteger)storage_byte_offset + toBuffer:destBuffer + destinationOffset:(NSUInteger)destOffset + size:(NSUInteger)size]; + [blitEncoder endEncoding]; + + if (non_blocking) { + stream->commit(true); + } else { + stream->commitAndWait(); + } + [destBuffer release]; + } + }); + } + + return self; +} + +static at::Tensor& copy_to_mps_(at::Tensor& self, const at::Tensor& src, + bool non_blocking) { + MPSStream* stream = getCurrentMPSStream(); + const void* host_src = src.data_ptr(); + uint64_t size = src.nbytes(); + + id device = MPSDevice::getInstance()->device(); + auto dst_byte_offset = self.storage_offset() * self.itemsize(); + id destBuffer = __builtin_bit_cast(id, self.storage().data()); + + NSUInteger sourceOffset = 0; + @autoreleasepool { + MTLResourceOptions options = MTLResourceOptionCPUCacheModeDefault | MTLResourceStorageModeShared; + NSUInteger alignedLength = 0; + + void* alignedPtr = pageAlignedBlockPtr(host_src, (NSUInteger)size, &alignedLength); + id sourceBuffer = [device newBufferWithBytesNoCopy:alignedPtr + length:alignedLength + options:options + deallocator:nil]; + sourceOffset = uintptr_t(host_src) - uintptr_t(alignedPtr); + + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + id blitEncoder = + [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:(NSUInteger)sourceOffset + toBuffer:destBuffer + destinationOffset:(NSUInteger)dst_byte_offset + size:(NSUInteger)size]; + [blitEncoder endEncoding]; + if (non_blocking) { + stream->commit(true); + } else { + stream->commitAndWait(); + } + } + }); + [sourceBuffer release]; + } + + return self; +} + +void copy_blit_mps(void* dst, const void* src, size_t size) { + MPSStream* stream = getCurrentMPSStream(); + id sourceBuffer = (id)(src); + id destBuffer = (id)(dst); + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + 
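+      // Encode a raw buffer-to-buffer blit of `size` bytes on the current MPS stream's
+      // command buffer; endEncoding closes the encoder and commitAndWait blocks until
+      // the GPU has finished, so callers can rely on the bytes being in place on return.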
id commandBuffer = stream->commandBuffer(); + id blitEncoder = + [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:0 + toBuffer:destBuffer + destinationOffset:0 + size:size]; + [blitEncoder endEncoding]; + stream->commitAndWait(); + } + }); +} + + +static at::Tensor& copy_kernel_mps(at::Tensor& dst, const at::Tensor& src, + bool non_blocking) { + MPSStream* stream = getCurrentMPSStream(); + uint64_t size = src.nbytes(); + + auto src_byte_offset = src.storage_offset() * src.itemsize(); + id sourceBuffer = __builtin_bit_cast(id, src.storage().data()); + + auto dst_byte_offset = dst.storage_offset() * dst.itemsize(); + id destBuffer = __builtin_bit_cast(id, dst.storage().data()); + + dispatch_sync(stream->queue(), ^() { + @autoreleasepool { + id commandBuffer = stream->commandBuffer(); + id blitEncoder = [commandBuffer blitCommandEncoder]; + + [blitEncoder copyFromBuffer:sourceBuffer + sourceOffset:src_byte_offset + toBuffer:destBuffer + destinationOffset:dst_byte_offset + size:size]; + [blitEncoder endEncoding]; + if (non_blocking) { + stream->commit(true); + } else { + stream->commitAndWait(); + } + } + }); + return dst; +} + +at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking) +{ + TORCH_CHECK(dst.defined(), "dst is undefined"); + TORCH_CHECK(src.defined(), "src is undefined"); + + if (src.numel() == 0 || dst.is_same(src)) { + return dst; + } + if (dst.numel() == 0) { + dst.resize_as_(src); + } + + if (src.device().type() == at::kMPS && dst.device().type() == at::kCPU) { + return copy_from_mps_(dst, src, non_blocking); + } + if (src.device().type() == at::kCPU && dst.device().type() == at::kMPS) { + return copy_to_mps_(dst, src, non_blocking); + } + + if (src.device().type() == at::kMPS && dst.device().type() == at::kMPS) { + return copy_kernel_mps(dst, src, non_blocking); + } + TORCH_INTERNAL_ASSERT( + src.device().type() == DeviceType::MPS, + "mps_copy_ is implemented only for *->MPS; MPS->*"); + return dst; +} +} // namespace mps + +Tensor _copy_from_and_resize_mps(const at::Tensor& self, const at::Tensor& dst) +{ + return mps::mps_copy_(const_cast(dst), self, false); +} + +Tensor _copy_from_mps(const at::Tensor& self, const at::Tensor& dst, bool non_blocking) +{ + return mps::mps_copy_(const_cast(dst), self, non_blocking); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm new file mode 100644 index 000000000000..0ff2a443d6ab --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Distributions.mm @@ -0,0 +1,459 @@ +// Copyright © 2022 Apple Inc. 
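The host↔device copies in Copy.mm above wrap the CPU pointer in a no-copy MTLBuffer, which requires a page-aligned base address and length; pageAlignedBlockPtr rounds the pointer down and the length up, and the blit later re-applies the pointer's offset inside that block. A self-contained sketch of that arithmetic follows; the names are illustrative and the page size is only an example (commonly 16 KiB on Apple Silicon).

```cpp
// Align a host range [ptr, ptr + size) to page boundaries, assuming page_size
// is a power of two, and remember where the original pointer sits in the block.
#include <cassert>
#include <cstdint>

struct AlignedBlock {
  uintptr_t base;    // page-aligned start address
  uintptr_t length;  // page-aligned length covering the whole range
  uintptr_t offset;  // offset of the original pointer inside the block
};

AlignedBlock page_align(uintptr_t ptr, uintptr_t size, uintptr_t page_size /* e.g. 16384 */) {
  const uintptr_t base = ptr & ~(page_size - 1);                            // round down
  const uintptr_t end  = (ptr + size + page_size - 1) & ~(page_size - 1);   // round up
  AlignedBlock b{base, end - base, ptr - base};
  assert(b.base <= ptr && ptr + size <= b.base + b.length);
  return b;
}
```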
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { +namespace native { +namespace templates { + +} + +Tensor& uniform_mps_(Tensor& input, double from, double to, c10::optional gen_) +{ + using namespace mps; + + if (input.numel() == 0) { + return input; + } + double delta = (to - from); + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "check_uniform_bounds", [&] { + const auto dtype = input.dtype(); + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to); + TORCH_CHECK((to - from) <= std::numeric_limits::max(), + "uniform_ expects to-from <= std::numeric_limits<", toString(input.scalar_type()), + ">::max(), but found to=", to, " and from=", from, + " which result in to-from to exceed the limit"); + from = std::min(std::max(from, min), max); + to = std::max(std::min(to, max), min); + }); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(input); + string key = "uniform_mps_" + getTensorsStringKey(input) + ":" + to_string(from) + ":" + to_string(to) + ":" + to_string(seed_); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // TODO: right now taking the default seed. 
Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomUniformTensorWithShape:input_shape + seed:seed_ + name:nil]; + MPSGraphTensor* deltaTensor = [mpsGraph constantWithScalar:delta + shape:input_shape + dataType:MPSDataTypeFloat32]; + MPSGraphTensor* fromTensor = [mpsGraph constantWithScalar:from + shape:input_shape + dataType:MPSDataTypeFloat32]; + MPSGraphTensor* mulTensor = [mpsGraph multiplicationWithPrimaryTensor:randomTensor + secondaryTensor:deltaTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:mulTensor + secondaryTensor:fromTensor + name:nil]; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, input); + NSDictionary *feeds = nil; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return input; +} + +Tensor& normal_mps_(Tensor& self, double mean, double std, c10::optional gen) { + if (self.numel() == 0) + return self; + TORCH_CHECK(std >= 0.0, "normal_mps_ expects std >= 0.0, but found std=", std); + + Tensor mean_t = empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + mean_t.fill_(mean); + + Tensor std_t = empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + std_t.fill_(std); + + return normal_mps_out(mean_t, std_t, gen, self); +} + +Tensor& normal_mps_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { + TORCH_CHECK(std >= 0.0, "normal_mps_out expects std >= 0.0, but found std=", std); + + Tensor std_t = empty_mps( + output.sizes(), + output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + std_t.fill_(std); + + return normal_mps_out(mean, std_t, gen, output); + +} + +Tensor& normal_mps_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { + TORCH_CHECK( + std.min().ge(0).item(), + "normal expects all elements of std >= 0.0"); + + + Tensor mean_t = empty_mps( + output.sizes(), + output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + mean_t.fill_(mean); + + return normal_mps_out(mean_t, std, gen, output); + +} + +Tensor& normal_mps_out(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { + TORCH_CHECK(!std.is_complex(), "normal expects standard deviation to be non-complex"); + TORCH_CHECK(std.numel() == 0 || std.min().ge(0).item(), "normal expects all elements of std >= 0.0"); + // Check that mean and std have same number of elements + TORCH_CHECK(mean.numel() == std.numel(), "normal_mps_out: mean and std must have same number of elements") + + using namespace mps; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* meanTensor_ = nil; + MPSGraphTensor* stdTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(output); + string key = "normal_mps_out:" + getMPSShapeString(input_shape) + ":" + getMPSTypeString(output.scalar_type()); + CachedGraph* cachedGraph = 
static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphRandomOpDescriptor* desc = [[MPSGraphRandomOpDescriptor new] autorelease]; + desc.distribution = MPSGraphRandomDistributionNormal; + desc.dataType = getMPSDataType(output.scalar_type()); + desc.mean = 0.0; + desc.standardDeviation = 1.0; + + MPSGraphTensor* meanTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type()), input_shape); + MPSGraphTensor* stdTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type()), input_shape); + + // TODO: right now taking the default seed. Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomTensorWithShape:input_shape + descriptor:desc + name:nil]; + MPSGraphTensor* scaleTensor = [mpsGraph multiplicationWithPrimaryTensor:randomTensor + secondaryTensor:stdTensor + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:scaleTensor + secondaryTensor:meanTensor + name:nil]; + newCachedGraph->meanTensor_ = meanTensor; + newCachedGraph->stdTensor_ = stdTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto meanPlaceholder = Placeholder(cachedGraph->meanTensor_, mean); + auto stdPlaceholder = Placeholder(cachedGraph->stdTensor_, std); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + NSDictionary *feeds = @{ + meanPlaceholder.getMPSGraphTensor() : meanPlaceholder.getMPSGraphTensorData(), + stdPlaceholder.getMPSGraphTensor() : stdPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return output; +} + +Tensor& bernoulli_out_mps(const Tensor& p_, c10::optional gen, Tensor& result) { + result.resize_(p_.sizes()); + return bernoulli_mps_(result, p_, gen); +} + +Tensor& bernoulli_mps_(Tensor& self, double p, c10::optional gen) { + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_mps_ expects p to be in [0, 1], but got p=", p); + Tensor p_t = empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + p_t.fill_(p); + + return bernoulli_mps_(self, p_t, gen); +} + +Tensor& bernoulli_mps_(Tensor& self, const Tensor& p_, c10::optional gen) { + TORCH_CHECK(self.is_same_size(p_), "bernoulli_mps_: probability and self tensor should be of the same shape") + + using namespace mps; + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(self); + + auto mps_dtype = getMPSDataType(p_.scalar_type()); + + MPSGraph* mpsGraph = make_mps_graph(); + + MPSGraphTensor* probTensor = mpsGraphRankedPlaceHolder(mpsGraph, mps_dtype, input_shape); + + // TODO: right now taking the default seed. 
Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomUniformTensorWithShape:input_shape + seed:seed_ + name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph lessThanWithPrimaryTensor:randomTensor + secondaryTensor:probTensor + name:nil]; + + auto probPlaceholder = Placeholder(probTensor, p_); + auto outputPlaceholder = Placeholder(outputTensor, self); + NSDictionary *feeds = @{ + probPlaceholder.getMPSGraphTensor() : probPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, mpsGraph, feeds, results); + } + + return self; + +} + +// Taken from ATen/native/DistributionTemplates.h +#define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \ + TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \ + +#define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \ + if (var < -(1LL << digits) || var > (1LL << digits)) { \ + TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \ + "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \ + "This warning will become an error in version 1.7 release, please fix the code in advance"); \ + } + +// Modified from ATen/native/DistributionTemplates.h +static void check_from_to_in_range(int64_t from, int64_t to_inc, ScalarType scalar_type) { + const auto dtype = scalarTypeToTypeMeta(scalar_type); + if (isFloatingType(scalar_type)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + + constexpr auto digits = std::numeric_limits::digits; + WARN_OUT_OF_BOUNDS(from, "from", digits, dtype); + WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype); + }); + } else if (isIntegralType(scalar_type, /*includeBool=*/true)) { + AT_DISPATCH_INTEGRAL_TYPES_AND(at::ScalarType::Bool, scalar_type, "check_random_integral_bounds", [&]() { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + }); + } else { + TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types"); + } +} + + +// random_.from +Tensor& random_mps_ + (Tensor& self, + int64_t from, + optional to_opt, + c10::optional gen) { + + using namespace mps; + + MPSStream* stream = getCurrentMPSStream(); + uint64_t seed_ = c10::detail::getNonDeterministicRandom(true); + + auto input_dtype = self.scalar_type(); + + int64_t to; + + if(to_opt.has_value()) { + // [from, to) + to = *to_opt; + TORCH_CHECK(from < to, "random_mps_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to); + if (isFloatingType(input_dtype)) { + // TODO: what is "random_update_from_to"? 
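+        // Note: "random_update_from_to" is just the label string handed to the AT_DISPATCH_*
+        // macro below (it only appears in dispatch error messages), mirroring the labels used
+        // by the CPU/CUDA paths in ATen's DistributionTemplates.h. update_from/update_to adjust
+        // the integer bounds so they remain valid once cast to the floating-point dtype, after
+        // which the [from, to) check is repeated.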
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input_dtype, "random_update_from_to", [&] { + from = templates::update_from(from); + to = templates::update_to(to); + TORCH_CHECK(from < to, "random_mps_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to); + }); + check_from_to_in_range(from, to - 1, input_dtype); + } + } + else if (from != std::numeric_limits::lowest()) { + // [from, std::numeric_limits::max()] + to = 0; + if(isFloatingType(input_dtype)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input_dtype, "random_from_to_range_calc", [&] { + constexpr int64_t scalar_t_max = static_cast(1) << std::numeric_limits::digits; + to = scalar_t_max > std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(scalar_t_max); + from = templates::update_from(from); + TORCH_CHECK(from < to, "random_mps_ expects 'from' casted to dtype to be less than or equal to 'to' casted to dtype, but got from=", from, " > to=", to); + }); + } + else if(isIntegralType(input_dtype, /*includeBool=*/true)) { + AT_DISPATCH_INTEGRAL_TYPES_AND(at::ScalarType::Bool, input_dtype, "random_from_to_range_calc", [&] { + if (std::is_same::value) { + to = static_cast(true); + } else { + to = static_cast(std::numeric_limits::max()); + } + }); + } + else { + TORCH_CHECK(false, "random_mps_ handles only integral, floating-point and boolean types"); + } + check_from_to_in_range(from, to, input_dtype); + } + else { + // [std::numeric_limits::lowest(), std::numeric_limits::max()] + // range = 2^64 + + // TODO - how to implement this? + TORCH_CHECK(false, "random_mps_ currently does not handle the lowest() -> max() range"); + + } + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(self); + + MPSGraph* mpsGraph = make_mps_graph(); + + MPSGraphRandomOpDescriptor* descriptor = [MPSGraphRandomOpDescriptor descriptorWithDistribution:MPSGraphRandomDistributionUniform + dataType:MPSDataTypeInt32]; + descriptor.minInteger = from; + descriptor.maxInteger = to - 1; + + // TODO: right now taking the default seed. Extend it to be extracted from the + // MPSGenerator + MPSGraphTensor* randomTensor = [mpsGraph randomTensorWithShape:input_shape + descriptor:descriptor + seed:seed_ + name:nil]; + + MPSGraphTensor* outputTensor = nil; + + if(input_dtype != ScalarType::Int) + outputTensor = [mpsGraph castTensor:randomTensor + toType:getMPSDataType(input_dtype) + name:@"outputTensor"]; + else + outputTensor = randomTensor; + + auto outputPlaceholder = Placeholder(outputTensor, self); + NSDictionary *feeds = nil; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, mpsGraph, feeds, results); + } + + return self; + +} + +Tensor& random_mps_ + (Tensor& self, + int64_t to, + c10::optional gen) { + + return random_mps_(self, 0, to, gen); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm new file mode 100644 index 000000000000..c313f0624b98 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -0,0 +1,330 @@ +// Copyright © 2022 Apple Inc. 
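The Distributions.mm kernels above all derive their outputs from MPSGraph's base random tensors with a few elementwise ops: uniform_ scales and shifts a [0, 1) sample by (to − from) and from, normal_ scales and shifts a standard-normal sample by std and mean, and bernoulli_ thresholds a [0, 1) sample against the probability tensor. The CPU stand-ins below illustrate those three constructions only; they use <random> rather than MPSGraph.

```cpp
// Scalar reference versions of the uniform_/normal_/bernoulli_ graph math.
#include <random>

float uniform_from_to(std::mt19937& gen, float from, float to) {
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);
  return from + u01(gen) * (to - from);   // rand * delta + from
}

float normal_mean_std(std::mt19937& gen, float mean, float std_dev) {
  std::normal_distribution<float> n01(0.0f, 1.0f);
  return mean + std_dev * n01(gen);       // randn * std + mean
}

bool bernoulli_p(std::mt19937& gen, float p) {
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);
  return u01(gen) < p;                    // uniform sample < probability
}
```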
+ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +Tensor index_select_mps(const Tensor & self, + int64_t dim, + const Tensor & index) { + IntArrayRef input_shape = self.sizes(); + auto num_input_dims = input_shape.size(); + + IntArrayRef index_shape = index.sizes(); + auto num_indices = index.numel(); + TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector"); + + dim = maybe_wrap_dim(dim, self.dim()); + int64_t* shape_data = (int64_t*)malloc(num_input_dims * sizeof(int64_t)); + + // Calculate new shape + for(int i = 0; i < num_input_dims; i++) { + if(i == dim) + shape_data[i] = num_indices; + else + shape_data[i] = input_shape[i]; + } + + IntArrayRef output_shape = IntArrayRef(shape_data, num_input_dims); + + Tensor result = at::native::empty_mps( + output_shape, + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + free(shape_data); + + index_select_out_mps(self, dim, index, result); + return result; +} + +Tensor& index_select_out_mps(const Tensor & self, + int64_t dim, + const Tensor & index, + Tensor & output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + dim = maybe_wrap_dim(dim, self.dim()); + // Checks + TORCH_CHECK_INDEX(index.dim() <= 1, "index_select(): Index is supposed to be a vector"); + TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index"); + TORCH_CHECK(self.scalar_type() == output.scalar_type(), + "index_select(): self and output must have the same scalar type"); + TORCH_CHECK(dim == 0 || dim < self.dim(), + "index_select(): Indexing dim ", dim, " is out of bounds of tensor"); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* indexTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + string key = "index_select_out_mps" + getTensorsStringKey({self, index}) + ":" + std::to_string(dim); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, index); + + MPSGraphTensor* outputTensor = [mpsGraph gatherWithUpdatesTensor:inputTensor + indicesTensor:indexTensor + axis:dim + batchDimensions:0 + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->indexTensor_ = indexTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : 
selfPlaceholder.getMPSGraphTensorData(), + indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; + +} + +Tensor & masked_fill__mps(Tensor& self, const Tensor & mask, const Scalar& value) { + using namespace mps; + TORCH_CHECK(self.device() == mask.device(), "expected self and mask to be on the same device, but got mask on ", + mask.device(), " and self on ", self.device()); + TORCH_CHECK(mask.scalar_type() == kByte || mask.scalar_type() == kBool, + "expected mask dtype to be Bool but got ", mask.scalar_type()); + auto maybe_outnames = namedinference::broadcast_to_outnames(self, mask, "masked_fill_"); + + c10::MaybeOwned b_mask = expand_inplace(self, mask, "masked_fill_"); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *maskTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + @autoreleasepool { + string key = "masked_fill" + getTensorsStringKey({self, mask}) + ":" + std::to_string(value.toDouble()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, mask); + MPSDataType valueType = getMPSScalarType(value.type()); + + // constantWithScalar doesn't like Bool constants getting created so + // mapping them to int8 + if (valueType == MPSDataTypeBool) { + valueType = MPSDataTypeInt8; + } + MPSGraphTensor* valueTensor = [mpsGraph constantWithScalar:value.to() + dataType:valueType]; + valueTensor = [mpsGraph castTensor:valueTensor + toType:getMPSDataType(self.scalar_type()) + name : @"castTensorEq"]; + + MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:maskTensor + truePredicateTensor:valueTensor + falsePredicateTensor:inputTensor + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->maskTensor_ = maskTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder maskPlaceholder = Placeholder(cachedGraph->maskTensor_, mask); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self); + + // Create dictionary of inputs and outputs + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + maskPlaceholder.getMPSGraphTensor() : maskPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + namedinference::propagate_names_if_nonempty(self, maybe_outnames); + return self; +} + +Tensor embedding_dense_backward_mps( + const Tensor & grad_, const Tensor & indices, int64_t num_weights, + 
int64_t padding_idx, bool scale_grad_by_freq) +{ + // TODO: implement padding_idx & scale_grad_by_freq. + namespace native_mps = at::native::mps; + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *incomingGradTensor_ = nil; + MPSGraphTensor *indicesTensor_ = nil; + MPSGraphTensor *outgoingGradTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + IntArrayRef incoming_gradient_shape = grad_.sizes(); + int64_t num_incoming_gradient_dims = incoming_gradient_shape.size(); + + IntArrayRef indices_shape = indices.sizes(); + int64_t num_indices_dims = indices_shape.size(); + + int64_t* outgoing_gradient_shape = (int64_t *) malloc(sizeof(int64_t) * 2); + int64_t D = incoming_gradient_shape[num_incoming_gradient_dims - 1]; + outgoing_gradient_shape[0] = num_weights; + outgoing_gradient_shape[1] = D; + int64_t num_outgoing_gradient_dims = 2; + Tensor outgoing_gradient = at::native::empty_mps( + IntArrayRef(outgoing_gradient_shape, num_outgoing_gradient_dims), + grad_.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + if (outgoing_gradient.numel() == 0) { + return outgoing_gradient; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "edb_mps:" + native_mps::getMPSTypeString(grad_.scalar_type()) + ":indices" + std::to_string(num_indices_dims) + ":num_weights" + std::to_string(num_weights) + ":padding_idx" + std::to_string(padding_idx) + ":scaled" + std::to_string(scale_grad_by_freq); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + // Initialize once if configuration not found in cache + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* incomingGradTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(grad_.scalar_type())); + + MPSGraphTensor* indicesTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(indices.scalar_type())); + + MPSGraphTensor *reshapedIndicesTensor = [mpsGraph expandDimsOfTensor:indicesTensor + axes:@[@-1] + name:nil]; + + MPSGraphTensor *outgoingGradTensor; + outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor:incomingGradTensor + indicesTensor:reshapedIndicesTensor + shape:native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape, num_outgoing_gradient_dims)) + batchDimensions:0 + mode:MPSGraphScatterModeAdd + name:@"edb"]; + + newCachedGraph->incomingGradTensor_ = incomingGradTensor; + newCachedGraph->indicesTensor_ = indicesTensor; + newCachedGraph->outgoingGradTensor_ = outgoingGradTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + auto incomingGradPlaceholder = native_mps::Placeholder(cachedGraph->incomingGradTensor_, grad_); + auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices); + auto outgoingGradPlaceholder = native_mps::Placeholder(cachedGraph->outgoingGradTensor_, outgoing_gradient); + + NSDictionary *feeds = @{ + incomingGradPlaceholder.getMPSGraphTensor() : incomingGradPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + 
outgoingGradPlaceholder.getMPSGraphTensor() : outgoingGradPlaceholder.getMPSGraphTensorData() + }; + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + free(outgoing_gradient_shape); + return outgoing_gradient; +} + +Tensor & masked_fill__mps(Tensor& self, const Tensor & mask, const Tensor & value) { + TORCH_CHECK(value.dim() == 0, "masked_fill_ only supports a 0-dimensional value tensor, but got tensor " + "with ", value.dim(), " dimension(s)."); + return masked_fill__mps(self, mask, value.item()); +} + +} +} diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm new file mode 100644 index 000000000000..d9cad62ee27c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -0,0 +1,358 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Tensor _mps_linear( + const Tensor& input, + const Tensor& weight, + const c10::optional& bias_opt) { + // wT = transpose(weight); + // y=x*wT+b + + using namespace mps; + + // See [Note: hacky wrapper removal for optional tensor] + auto bias = bias_opt.has_value() + ? c10::MaybeOwned::borrowed(*bias_opt) + : c10::MaybeOwned::owned(c10::in_place); + + auto input_size = input.sizes(); + std::vector output_size(input_size.begin(), input_size.end() - 1); + output_size.push_back(weight.size(0)); + Tensor output = at::native::empty_mps(output_size, + input.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + input.suggest_memory_format()); + + TORCH_CHECK(output.is_mps()); + + MPSStream *stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* biasTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + bool is_bias_defined = bias->defined(); + + @autoreleasepool { + + MPSShape* wt_shape = getMPSShape(weight); + string wt_key = string([[[wt_shape valueForKey:@"description"] componentsJoinedByString:@","] UTF8String]); + MPSShape* bias_shape = nil; + string bias_key = "nobias"; + if(is_bias_defined) { + bias_key = "bias"; + } + + string key = "mps_linear" + getTensorsStringKey({input, weight}) + ":" + bias_key; + + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight); + MPSGraphTensor* biasTensor = nil; + + if(is_bias_defined) { + biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType((*bias).scalar_type())); + } + + MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor + dimension:-1 + withDimension:-2 + name:nil]; + + MPSGraphTensor* outputTensor = nil; + + if (!is_bias_defined) + { + outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputTensor + secondaryTensor:weightTransposeTensor + name:nil]; + } + else + { + MPSGraphTensor* xMulWTTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputTensor + 
secondaryTensor:weightTransposeTensor + name:nil]; + outputTensor = [mpsGraph additionWithPrimaryTensor:xMulWTTensor + secondaryTensor:biasTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + Placeholder biasPlaceholder = Placeholder(); + if(is_bias_defined) + biasPlaceholder = Placeholder(cachedGraph->biasTensor_, *bias); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSMutableDictionary* feeds =[NSMutableDictionary dictionary]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + if (is_bias_defined) + feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +Tensor _mps_linear_backward_input( + IntArrayRef input_size, + const Tensor & grad_output, + const Tensor & weight) +{ + TORCH_CHECK(grad_output.is_mps(), + "mps_linear_backward: grad_output needs to be mps layout"); + TORCH_CHECK(weight.device().is_mps() && weight.scalar_type() == kFloat, + "mps_linear_backward: weight needs to be a dense tensor"); + + const Tensor weight_reshaped = weight.is_contiguous() ? 
weight : weight.contiguous(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *weightTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + Tensor output = at::native::empty_mps(input_size, + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + grad_output.suggest_memory_format()); + TORCH_CHECK(output.is_mps()); + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + + string key = "mps_linear_backward_input" + mps::getTensorsStringKey({grad_output, weight_reshaped}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *weightTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped); + MPSGraphTensor *gradOutputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor *outputTensor = + [mpsGraph matrixMultiplicationWithPrimaryTensor: gradOutputTensor + secondaryTensor: weightTensor + name: nil]; + + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + mps::Placeholder weightPlaceholder = mps::Placeholder(cachedGraph->weightTensor_, weight_reshaped); + mps::Placeholder gradOutputPlaceholder = mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output); + mps::Placeholder outputPlaceholder = mps::Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + return output; + } +} + +std::tuple _mps_linear_backward_weights( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, bool bias_defined) +{ + TORCH_CHECK(grad_output.is_mps() && input.is_mps(), + "_mps_linear_backward: grad_output and input needs to be mps layout"); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *weightTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *biasTensor_ = nil; + }; + + auto grad_output_reshaped = grad_output.dim() > 2 ? + grad_output.reshape({-1, grad_output.size(grad_output.dim() - 1)}) : grad_output; + auto input_reshaped = input.dim() > 2 ? 
input.reshape({-1, input.size(input.dim() - 1)}) : input; + + TORCH_CHECK(grad_output_reshaped.is_mps()); + TORCH_CHECK(input_reshaped.is_mps()); + + Tensor output = at::native::empty_mps({grad_output_reshaped.size(1), input_reshaped.size(1)}, + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + grad_output.suggest_memory_format()); + Tensor bias = at::native::empty_mps({grad_output_reshaped.size(1)}, + grad_output.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + grad_output.suggest_memory_format()); + TORCH_CHECK(output.is_mps()); + TORCH_CHECK(bias.is_mps()); + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + + string key = "mps_linear_backward_weights:" + to_string(bias_defined) + ":" + + mps::getTensorsStringKey({input_reshaped, weight, grad_output_reshaped}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, input_reshaped); + MPSGraphTensor *weightTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, weight); + MPSGraphTensor *gradOutputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_reshaped); + + MPSGraphTensor *gradOutputTransposeTensor = + [mpsGraph transposeTensor: gradOutputTensor + dimension: -1 + withDimension: -2 + name: nil]; + + // grad_weight + MPSGraphTensor *outputTensor = + [mpsGraph matrixMultiplicationWithPrimaryTensor: gradOutputTransposeTensor + secondaryTensor: inputTensor + name: nil]; + MPSGraphTensor *biasTensor = nil; + if (bias_defined) + { + // grad_bias + biasTensor = [mpsGraph reductionSumWithTensor: gradOutputTensor + axis: 0 + name: nil]; + + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->biasTensor_ = biasTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + mps::Placeholder inputPlaceholder = mps::Placeholder(cachedGraph->inputTensor_, input_reshaped); + mps::Placeholder weightPlaceholder = mps::Placeholder(cachedGraph->weightTensor_, weight); + mps::Placeholder gradOutputPlaceholder = mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output_reshaped); + mps::Placeholder outputPlaceholder = mps::Placeholder(cachedGraph->outputTensor_, output); + mps::Placeholder biasPlaceholder = mps::Placeholder(cachedGraph->biasTensor_, bias); + + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() + }; + + NSMutableDictionary* results = [NSMutableDictionary dictionary]; + results[outputPlaceholder.getMPSGraphTensor()] = outputPlaceholder.getMPSGraphTensorData(); + if (bias_defined) + results[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + return std::tuple{ output, bias }; + } +} + + +std::tuple mps_linear_backward( + const Tensor& input, const 
Tensor& grad_output, + const Tensor& weight, std::array output_mask) { + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = at::_mps_linear_backward_input(input.sizes(), grad_output, weight); + } + if (output_mask[1] || output_mask[2]) { + std::tie(grad_weight, grad_bias) = at::_mps_linear_backward_weights(grad_output, input, weight, output_mask[2]); + } + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm new file mode 100644 index 000000000000..3b02567a2236 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -0,0 +1,598 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + + +namespace at { +namespace native { + +/* + * Helper functions to be used for mm/addmm for detecting the Transpositions + * when doing Batched GEMM operations. + */ + +static Tensor prepare_batch_matrix_by_transposing(const Tensor& tensor, + bool& transpose_tensor, + int64_t& ld_tensor, + bool transpose_result, + int64_t m, int64_t n) { + IntArrayRef tensor_strides = tensor.strides(); + Tensor tensor_; + int fast_dim = transpose_result ? 2 : 1; + int leading_dim = transpose_result ? 1 : 2; + + if (tensor_strides[fast_dim] == 1 && + (tensor_strides[leading_dim] >= std::max(1, m))) { + transpose_tensor = false; + tensor_ = tensor; + ld_tensor = tensor_strides[leading_dim]; + } else if ((tensor_strides[leading_dim] == 1) && + (tensor_strides[fast_dim] >= std::max(1, n))) { + transpose_tensor = true; + tensor_ = tensor; + ld_tensor = tensor_strides[fast_dim]; + } else { + transpose_tensor = !transpose_result; + // gemm call requires leading dimension and stride parameters to be non-zero + bool is_stride_non_zero = tensor.stride(1) != 0 && tensor.stride(2) != 0; + if (tensor.is_contiguous() && is_stride_non_zero) { + tensor_ = tensor; + } else { + tensor_ = tensor.clone(at::MemoryFormat::Contiguous); + } + ld_tensor = tensor_.stride(1); + } + + return tensor_; +} + +/* + * Helper functions to be used for mm/addmm for detecting the Transpositions + * when doing GEMM operations. 
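+ * prepare_matrices_for_broadcasting below only decides *whether* a transpose is
+ * needed: if mat1 and mat2 arrive with identical 2-D sizes, mat2 is flagged for
+ * transposition so the inner dimensions line up, and for addmm a bias with
+ * non-zero beta whose shape matches the transposed product flags the beta*bias
+ * term for transposition instead. The resulting flags are consumed by
+ * mm_out_mps_impl / addmm_out_mps_impl when they build the MPSGraph
+ * matrixMultiplication nodes.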
+ */ +void prepare_matrices_for_broadcasting( + const Tensor * bias, + const Tensor & self, + const Tensor & other, + const Scalar * beta, + bool * transpose_mat1_times_mat2, + bool & transpose_mat1, + bool & transpose_mat2) { + TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); + if (bias && beta->toDouble() != 0.0f) { + TORCH_CHECK(bias->dim() == 2, "tensors must be 2-D"); + } + + std::pair mat1_sizes; + std::pair mat2_sizes; + + mat1_sizes = std::make_pair(self.sizes()[0], self.sizes()[1]); + mat2_sizes = std::make_pair(other.sizes()[0], other.sizes()[1]); + + if (mat1_sizes == mat2_sizes) { + transpose_mat2 = true; + std::swap(mat2_sizes.first, mat2_sizes.second); + } + if (bias && beta && transpose_mat1_times_mat2) { + if (beta->toDouble() != 0.0f && mat1_sizes.first == bias->sizes()[1] && mat2_sizes.second == bias->sizes()[0]) + *transpose_mat1_times_mat2 = true; + } +} + +enum LinearAlgebraOpType { + ADDBMM_OP_TYPE, + BADDBMM_OP_TYPE +}; + +Tensor& mm_out_mps_impl( + const Tensor& self, + const Tensor& other, + Tensor& output) { + using namespace mps; + TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); + + TensorArg args[]{{output, "out", 0}, {self, "mat1", 1}, {other, "mat2", 2}}; + checkAllSameGPU("mm", args); + + TORCH_CHECK(output.is_mps()); + + // Transpose inputs if needed + IntArrayRef output_sizes = output.sizes(); + if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + return output; + } + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSStream* stream = getCurrentMPSStream(); + + bool transpose_mat1 = false; + bool transpose_mat2 = false; + + prepare_matrices_for_broadcasting(NULL, self, other, NULL, NULL, transpose_mat1, transpose_mat2); + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + + string key = "mm_out_mps_impl" + getTensorsStringKey({self, other}) + + ":" + to_string(transpose_mat1) + ":" + to_string(transpose_mat2); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); + + MPSGraphTensor* t1 = nil; + MPSGraphTensor* t2 = nil; + + if(transpose_mat1) + t1 = [mpsGraph transposeTensor:selfTensor + dimension:-1 + withDimension:-2 + name:nil]; + else + t1 = selfTensor; + + if(transpose_mat2) + t2 = [mpsGraph transposeTensor:otherTensor + dimension:-1 + withDimension:-2 + name:nil]; + else + t2 = otherTensor; + + MPSGraphTensor* outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:t1 + secondaryTensor:t2 + name:nil]; + + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = 
Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +Tensor& addmm_out_mps_impl( + const Tensor& bias, + const Tensor& self, // input + const Tensor& other, // weight + const Scalar& beta, + const Scalar& alpha, + Tensor& output) { + using namespace mps; + + TORCH_CHECK(output.is_mps()); + TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); + + TensorArg args[]{{output, "out", 0}, {bias, "self", 1}, {self, "mat1", 2}, {other, "mat2", 3}}; + checkAllSameGPU(__func__, args); + + IntArrayRef mat1_sizes = self.sizes(); + IntArrayRef mat2_sizes = other.sizes(); + IntArrayRef bias_sizes; + c10::MaybeOwned bias_; + if (&output != &bias) { + bias_ = expand_size(bias, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + bias_sizes = bias_->sizes(); + } else { + bias_ = c10::MaybeOwned::borrowed(bias); + bias_sizes = bias_->sizes(); + TORCH_CHECK(output.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK(bias_sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0"); + TORCH_CHECK(bias_sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1"); + } + + if (&output != &self) { + output.resize_(bias_sizes); + if (beta.toComplexDouble() != 0.0) { + at::native::copy_(output, *bias_); + } + } + IntArrayRef output_sizes = output.sizes(); + if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + return output; + } + + MPSStream* stream = getCurrentMPSStream(); + + MPSGraph* mpsGraph = make_mps_graph(); + + bool transpose_mat1_times_mat2 = false; + bool transpose_mat1 = false; + bool transpose_mat2 = false; + + prepare_matrices_for_broadcasting(&bias, self, other, &beta, &transpose_mat1_times_mat2, transpose_mat1, transpose_mat2); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *biasTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "addmm_out_mps_impl" + getTensorsStringKey({self, other, bias}) + + ":" + to_string(transpose_mat1) + ":" + to_string(transpose_mat2) + + ":" + to_string(beta.toDouble()) + + ":" + to_string(alpha.toDouble()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor *otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); + MPSGraphTensor *biasTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, bias); + + MPSGraphTensor* t1 = nil; + MPSGraphTensor* t2 = nil; + + if(transpose_mat1) + t1 = [mpsGraph transposeTensor:selfTensor + dimension:-1 + withDimension:-2 + name:nil]; + else + t1 = selfTensor; + + if(transpose_mat2) + t2 = [mpsGraph transposeTensor:otherTensor + dimension:-1 + withDimension:-2 + name:nil]; + 
else + t2 = otherTensor; + + + // TODO: Use alpha and beta here with fill_.Scalar and mul + // Intermediate as placeholder + MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:t1 + secondaryTensor:t2 + name:@"MM/(mat1@mat2)"]; + + // Intermediates for beta and alpha + MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta.toDouble() + dataType:getMPSScalarType(bias.scalar_type())]; + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.toDouble() + dataType:getMPSScalarType(self.scalar_type())]; + + // Intermediates for multiplying by beta and alpha + MPSGraphTensor* productTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor:productTensor + secondaryTensor:alphaTensor + name:@"MM/alpha*(mat1@mat2)"]; + MPSGraphTensor* biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor:biasTensor + secondaryTensor:betaTensor + name:@"MM/beta*input"]; + + if (transpose_mat1_times_mat2) + biasTimesBetaTensor = [mpsGraph transposeTensor: biasTimesBetaTensor + dimension: -1 + withDimension: -2 + name: nil]; + + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:productTimesAlphaTensor + secondaryTensor:biasTimesBetaTensor + name:@"MM/beta*input + alpha*(mat1@mat2)"]; + + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), + biasPlaceholder.getMPSGraphTensor() : biasPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + + +Tensor& bmm_out_mps_impl( + const Tensor & batch1, + const Tensor & batch2, + Tensor & result) { + using namespace mps; + + if (batch1.numel() == 0 || batch2.numel() == 0) { + return result; + } + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *batch1Tensor_ = nil; + MPSGraphTensor *batch2Tensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "bmm_out_mps_impl" + getTensorsStringKey({batch1, batch2}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *batch1Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch1); + MPSGraphTensor *batch2Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch2); + + MPSGraphTensor* productTensor = [mpsGraph 
matrixMultiplicationWithPrimaryTensor:batch1Tensor + secondaryTensor:batch2Tensor + name:@"MM/(batch1@batch2)"]; + + newCachedGraph->batch1Tensor_ = batch1Tensor; + newCachedGraph->batch2Tensor_ = batch2Tensor; + newCachedGraph->outputTensor_ = productTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); + Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + NSDictionary* feeds = @{ + batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), + batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return result; +} + +Tensor& addbmm_or_baddbmm_out_mps_impl( + const Tensor & input, + const Tensor & batch1, + const Tensor & batch2, + const Scalar & beta, + const Scalar & alpha, + Tensor & result, + LinearAlgebraOpType opType) { + using namespace mps; + + TORCH_CHECK(input.is_mps()); + TORCH_CHECK(batch1.is_mps()); + TORCH_CHECK(batch2.is_mps()); + TORCH_CHECK(result.is_mps()); + + TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); + TORCH_CHECK(batch1.size(0) == batch2.size(0), + "batch1 and batch2 must have same number of batches, got ", + batch1.size(0), " and ", batch2.size(0)); + TORCH_CHECK(batch1.size(2) == batch2.size(1), + "Incompatible matrix sizes for bmm (", + batch1.size(1), "x", batch1.size(2), " and ", + batch2.size(1), "x", batch2.size(2), ")"); + + const int64_t dim1 = batch1.size(1); + const int64_t dim2 = batch2.size(2); + TORCH_CHECK(input.size(0) == dim1 && input.size(1) == dim2, + "input tensor does not match matmul output shape"); + + if (opType == ADDBMM_OP_TYPE) + { + result.resize_as_(input); + + const int64_t num_batches = batch1.size(0); + + if (num_batches == 0) { + result.zero_(); + return result; + } + } + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *batch1Tensor_ = nil; + MPSGraphTensor *batch2Tensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = (opType == ADDBMM_OP_TYPE) ? 
("addbmm_out_mps_impl") : ("baddbmm_out_mps_impl"); + key += getTensorsStringKey({batch1, batch2, input}) + + ":" + to_string(beta.toDouble()) + + ":" + to_string(alpha.toDouble()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + + mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool{ + MPSGraph *mpsGraph = mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor *batch1Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch1); + MPSGraphTensor *batch2Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch2); + + // Intermediates for beta and alpha + MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar: beta.toDouble() + dataType: getMPSScalarType(input.scalar_type())]; + MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar: alpha.toDouble() + dataType: getMPSScalarType(batch1.scalar_type())]; + + MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1Tensor + secondaryTensor:batch2Tensor + name:@"(batch1@batch2)"]; + + MPSGraphTensor* reductionSumTensor = productTensor; + if (opType == ADDBMM_OP_TYPE) { + reductionSumTensor = [mpsGraph reductionSumWithTensor: productTensor + axis: 0 + name: @"reductionSum(batch1@batch2)"]; + } + + // Intermediates for multiplying by beta and alpha + MPSGraphTensor* reductionSumTimesAlphaTensor = [mpsGraph multiplicationWithPrimaryTensor: reductionSumTensor + secondaryTensor: alphaTensor + name: @"alpha*(batch1@batch2)"]; + MPSGraphTensor* biasTimesBetaTensor = [mpsGraph multiplicationWithPrimaryTensor: inputTensor + secondaryTensor: betaTensor + name: @"beta*input"]; + + MPSGraphTensor* outputTensor = [mpsGraph additionWithPrimaryTensor:reductionSumTimesAlphaTensor + secondaryTensor:biasTimesBetaTensor + name:@"beta*input + alpha*(batch1@batch2)"]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->batch1Tensor_ = batch1Tensor; + newCachedGraph->batch2Tensor_ = batch2Tensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); + Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), + batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return result; +} + +TORCH_IMPL_FUNC(mm_out_mps)(const Tensor& self, const Tensor& mat2, const Tensor& result) { + mm_out_mps_impl(self, mat2, const_cast(result)); +} + +TORCH_IMPL_FUNC(addmm_out_mps)(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, const Tensor& result) { + addmm_out_mps_impl(self, mat1, mat2, beta, alpha, const_cast(result)); +} + +TORCH_IMPL_FUNC(bmm_out_mps) (const Tensor & batch1, 
const Tensor & batch2, const Tensor & result) { + bmm_out_mps_impl(batch1, batch2, const_cast(result)); +} + +TORCH_IMPL_FUNC(baddbmm_out_mps) (const Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar& beta, const Scalar& alpha, const Tensor& result) { + addbmm_or_baddbmm_out_mps_impl(self, batch1, batch2, beta, alpha, const_cast(result), BADDBMM_OP_TYPE); +} + +Tensor& addbmm_out_mps(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, Tensor& result) { + auto b_self = expand_size(self, {batch1.size(1), batch2.size(2)}, "addbmm_out"); + + addbmm_or_baddbmm_out_mps_impl(*b_self, batch1, batch2, beta, alpha, result, ADDBMM_OP_TYPE); + return result; +} + +Tensor addbmm_mps(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha) { + Tensor result = at::empty({0}, self.options()); + return addbmm_out_mps(self, batch1, batch2, beta, alpha, result); +} + +Tensor &addbmm_mps_(Tensor& self, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha) { + return addbmm_out_mps(self, batch1, batch2, beta, alpha, self); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm new file mode 100644 index 000000000000..35202fd70a5f --- /dev/null +++ b/aten/src/ATen/native/mps/operations/LossOps.mm @@ -0,0 +1,1379 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { +namespace mps { + +string reductionToString(int64_t reduction) +{ + switch(reduction) { + case Reduction::Mean: return "Mean"; + case Reduction::Sum: return "Sum"; + default: return "None"; + } +} + +MPSGraphTensor* reduceTensor(MPSGraphTensor *tensor, int64_t reduction, MPSGraph *mpsGraph, NSUInteger axesCount) +{ + NSMutableArray *axes = [NSMutableArray arrayWithCapacity:axesCount]; + for (NSUInteger i = 0; i < axesCount; i++) axes[i] = @(i); + + switch(reduction) { + case Reduction::Mean: + return [mpsGraph meanOfTensor: tensor axes: axes name: @"reductionMeanTensor"]; + case Reduction::Sum: + return [mpsGraph reductionSumWithTensor: tensor axes: axes name: @"reductionSumTensor"]; + default: + assert(reduction == Reduction::None); + return tensor; + } +} + +// MSELoss +void mse_loss_out_impl(const Tensor& input, const Tensor& target, + int64_t reduction, const Tensor& output, const string op_name) +{ +} + +Tensor& mse_loss_backward_out_impl(const Tensor& grad_output, const Tensor& input, const Tensor& target, + int64_t reduction, Tensor& grad_input, const string op_name) +{ + TORCH_CHECK(target.is_same_size(input), op_name + ": target and input tensors must have identical shapes") + auto norm = reduction == Reduction::Mean ? 2. 
/ static_cast(input.numel()) : 2.; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *targetTensor = nil; + MPSGraphTensor *gradInputTensor = nil, *gradOutputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + reductionToString(reduction) + ":" + + to_string(grad_input.sizes()[1]) + + getTensorsStringKey({input, target, grad_output}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + MPSGraphTensor *normTensor = [mpsGraph constantWithScalar: norm + dataType: MPSDataTypeFloat32]; + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: newCachedGraph->inputTensor + secondaryTensor: newCachedGraph->targetTensor + name: nil]; + MPSGraphTensor *diffGradientTensor = [mpsGraph multiplicationWithPrimaryTensor: diffTensor + secondaryTensor: newCachedGraph->gradOutputTensor + name: nil]; + newCachedGraph->gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor: diffGradientTensor + secondaryTensor: normTensor + name: nil]; + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor, grad_input); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() :gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return grad_input; +} + +// namespace to localize the CachedGraph struct for Binary Cross Entropy +namespace BCELoss +{ + +struct CachedGraph : public MPSCachedGraph +{ + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *targetTensor = nil; + // gradOutput only used on backward pass + MPSGraphTensor *weightTensor = nil, *gradOutputTensor = nil; + // lossTensor used for forward, and gradInputTensor for backward pass + union { MPSGraphTensor *lossTensor = nil; MPSGraphTensor *gradInputTensor; }; +}; + +MPSGraphTensor* bce_forward_mps(CachedGraph *bceGraph) +{ + MPSGraph *mpsGraph = bceGraph->graph(); + + // Forward BCE: L = -w (y ln(x) + (1-y) ln(1-x)) + MPSGraphTensor *one = [mpsGraph constantWithScalar: 1.0 + dataType: MPSDataTypeFloat32]; + // -100 is the hard limit value defined in BCELoss Spec. 
to clamp the log + MPSGraphTensor *neg100 = [mpsGraph constantWithScalar: -100.0 + dataType: MPSDataTypeFloat32]; + // 1 - x + MPSGraphTensor *one_Input = [mpsGraph subtractionWithPrimaryTensor: one + secondaryTensor: bceGraph->inputTensor + name: nil]; + // log(x) + MPSGraphTensor *logInput = [mpsGraph logarithmWithTensor: bceGraph->inputTensor + name: nil]; + // max(log(x), -100) + MPSGraphTensor *clampedLogInput = [mpsGraph maximumWithPrimaryTensor: logInput + secondaryTensor: neg100 + name: nil]; + // log(1 - x) + MPSGraphTensor *log1_Input = [mpsGraph logarithmWithTensor: one_Input + name: nil]; + // max(log(1 - x), -100) + MPSGraphTensor *clampedLog1_Input = [mpsGraph maximumWithPrimaryTensor: log1_Input + secondaryTensor: neg100 + name: nil]; + // (y - 1) resulted from -(1 - y) + MPSGraphTensor *target_1 = [mpsGraph subtractionWithPrimaryTensor: bceGraph->targetTensor + secondaryTensor: one + name: nil]; + // (y - 1) * max(log(1 - x), -100) + MPSGraphTensor *target_1TimesLog1_Input = [mpsGraph multiplicationWithPrimaryTensor: target_1 + secondaryTensor: clampedLog1_Input + name: nil]; + // y * max(log(x), -100) + MPSGraphTensor *targetTimesLogInput = [mpsGraph multiplicationWithPrimaryTensor: bceGraph->targetTensor + secondaryTensor: clampedLogInput + name: nil]; + // ((y - 1) * max(log(1 - x), -100)) - (y * max(log(x), -100)) + MPSGraphTensor *bceLoss = [mpsGraph subtractionWithPrimaryTensor: target_1TimesLog1_Input + secondaryTensor: targetTimesLogInput + name: nil]; + return bceLoss; +} + +MPSGraphTensor* bce_backward_mps(CachedGraph *bceGraph) +{ + MPSGraph *mpsGraph = bceGraph->graph(); + + // Backward BCE: d(L)/d(x) = -w (y - x) / (x - x^2) + MPSGraphTensor *one = [mpsGraph constantWithScalar: 1.0 + dataType: MPSDataTypeFloat32]; + // epsilon used to clamp the grad input denominator + MPSGraphTensor *epsilon = [mpsGraph constantWithScalar: 1e-12 + dataType: MPSDataTypeFloat32]; + // 1 - x + MPSGraphTensor *one_Input = [mpsGraph subtractionWithPrimaryTensor: one + secondaryTensor: bceGraph->inputTensor + name: nil]; + // x * (1 - x) + MPSGraphTensor *inputTimes1_Input = [mpsGraph multiplicationWithPrimaryTensor: bceGraph->inputTensor + secondaryTensor: one_Input + name: nil]; + // max(x * (1 - x), epsilon) + MPSGraphTensor *gradInputDenominator = [mpsGraph maximumWithPrimaryTensor: inputTimes1_Input + secondaryTensor: epsilon + name: nil]; + // (x - y) + MPSGraphTensor *input_target = [mpsGraph subtractionWithPrimaryTensor: bceGraph->inputTensor + secondaryTensor: bceGraph->targetTensor + name: nil]; + // (x - y) / max(x * (1 - x), epsilon) + MPSGraphTensor *inputDivGradInputDenom = [mpsGraph divisionWithPrimaryTensor: input_target + secondaryTensor: gradInputDenominator + name: nil]; + // gradOutput * (((x - y) / max(x * (1 - x), epsilon))) + MPSGraphTensor *gradInput = [mpsGraph multiplicationWithPrimaryTensor: bceGraph->gradOutputTensor + secondaryTensor: inputDivGradInputDenom + name: nil]; + return gradInput; +} + +// Binary Cross Enropy (Forward/Backward BCELoss) +// NOTE: "loss" tensor would be "grad_input" if it's a backward pass +Tensor& bce_loss_out_impl(const Tensor& input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction, Tensor& loss, + const c10::optional& grad_output_opt, const string op_name) +{ + // TODO: add sanity check for the elements of input tensor to be within [0..1] + TORCH_CHECK(target.is_same_size(input), op_name + ": target and input tensors must have identical shapes") + + c10::MaybeOwned weight_maybe_owned = 
at::borrow_from_optional_tensor(weight_opt); + c10::MaybeOwned grad_output_maybe_owned = at::borrow_from_optional_tensor(grad_output_opt); + const Tensor& weight = *weight_maybe_owned; + const Tensor& grad_output = *grad_output_maybe_owned; + + loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({})); + TORCH_CHECK(loss.is_mps()); + + Tensor loss_squeezed = at::squeeze(loss); + Tensor input_squeezed = at::squeeze(input); + Tensor target_squeezed = at::squeeze(target); + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed); + + MPSGraphTensor *bceLossUnweighted = nil; + // if grad_output is defined, then it's a backward pass + if (grad_output.defined()) { + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + bceLossUnweighted = bce_backward_mps(newCachedGraph); + } else { + bceLossUnweighted = bce_forward_mps(newCachedGraph); + } + + MPSGraphTensor *bceLoss = bceLossUnweighted; + if (weight.defined()) { + newCachedGraph->weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight); + bceLoss = [mpsGraph multiplicationWithPrimaryTensor: bceLossUnweighted + secondaryTensor: newCachedGraph->weightTensor + name: nil]; + } + + if (grad_output.defined()) { + if (reduction == at::Reduction::Mean) { + MPSGraphTensor *inputNumel = [mpsGraph constantWithScalar: static_cast(input.numel()) + dataType: MPSDataTypeFloat32]; + newCachedGraph->gradInputTensor = [mpsGraph divisionWithPrimaryTensor: bceLoss + secondaryTensor: inputNumel + name: nil]; + } else { + newCachedGraph->gradInputTensor = bceLoss; + } + } else { + newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size()); + } + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed); + Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[targetPlaceholder.getMPSGraphTensor()] = targetPlaceholder.getMPSGraphTensorData(); + if (weight.defined()) { + Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor, weight); + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + } + if (grad_output.defined()) { + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary* results = @{ + lossPlaceholder.getMPSGraphTensor() : lossPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return loss; 
+} + +} // namespace BCELoss + +// NLLLoss +void nllnd_loss_backward_impl( +Tensor& grad_input, +const Tensor& grad_output, +const Tensor& input, +const Tensor& target, +const Tensor& weight, +int64_t reduction, +int64_t ignore_index, +const Tensor& total_weight, +bool is2D) +{ + // Empty output + if(grad_input.numel() == 0) + return; + + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* targetTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* totalWeightTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + auto numClasses = grad_input.sizes()[1]; + bool isWeightsArrayValid = (weight.numel() > 0); + + MPSShape* input_shape = getMPSShape(input); + MPSShape* target_shape = getMPSShape(target); + MPSShape* weight_shape = getMPSShape(weight); + MPSShape* total_weight_shape = getMPSShape(total_weight); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "nllnd_loss_backward_impl:" + to_string(numClasses) + ":" + + to_string(ignore_index) + ":" + + to_string(isWeightsArrayValid) + ":" + + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()) + ":" + + getMPSTypeString(weight.scalar_type()) + ":" + + getMPSTypeString(total_weight.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type()), target_shape); + MPSGraphTensor* weightTensor = nil; + if(isWeightsArrayValid) + weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(weight.scalar_type()), weight_shape); + MPSGraphTensor* totalWeightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(total_weight.scalar_type()), total_weight_shape); + + MPSGraphTensor *udpatedTargetTensor = targetTensor; + + // Replace ignored_index with length depth + 1 so that oneHotAPI ignores it + if(ignore_index != -100) + { + MPSGraphTensor *mpsGraphIndexTensor = [mpsGraph constantWithScalar: ignore_index + dataType: MPSDataTypeInt64]; + MPSGraphTensor *mpsGraphDepthPlusOneTensor = [mpsGraph constantWithScalar: (numClasses + 1) + dataType: MPSDataTypeInt64]; + + // Equal tensor + MPSGraphTensor* mpsGraphIsEqualTensor = [mpsGraph equalWithPrimaryTensor: targetTensor + secondaryTensor: mpsGraphIndexTensor + name: @"isEqualTensor"]; + + udpatedTargetTensor = [mpsGraph selectWithPredicateTensor: mpsGraphIsEqualTensor + truePredicateTensor: mpsGraphDepthPlusOneTensor + falsePredicateTensor: targetTensor + name: @"predicateTensor"]; + } + + float onValue = -1.0f; + + MPSGraphTensor *oneHotTensor; + + oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor + depth:numClasses + axis:1 + dataType:inputTensor.dataType + onValue:onValue + offValue:0.0f + name:nil]; + + if(isWeightsArrayValid) + { + 
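          // Scale each class's one-hot gradient contribution by its class weight, matching the
          // weighted NLL definition where the per-element loss is -weight[target[n]] * x[n, target[n]].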
oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor + secondaryTensor:weightTensor + name:@"scaleByWeightTensor"]; + } + + if(reduction == Reduction::Mean) + { + oneHotTensor = [mpsGraph divisionNoNaNWithPrimaryTensor:oneHotTensor + secondaryTensor:totalWeightTensor + name:@"divisionTensor"]; + } + + MPSGraphTensor* gradInputTensor = oneHotTensor; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = targetTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->totalWeightTensor_ = totalWeightTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + auto targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); + Placeholder weightPlaceholder = Placeholder(); + if(isWeightsArrayValid) + weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + auto totalWeightPlaceholder = Placeholder(cachedGraph->totalWeightTensor_, total_weight); + auto gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSMutableDictionary* feeds = [[NSMutableDictionary alloc] initWithCapacity: 4]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[targetPlaceholder.getMPSGraphTensor()] = targetPlaceholder.getMPSGraphTensorData(); + feeds[totalWeightPlaceholder.getMPSGraphTensor()] = totalWeightPlaceholder.getMPSGraphTensorData(); + + if(isWeightsArrayValid) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return; +} + +void nllnd_loss_forward_impl +(Tensor& output, + Tensor& total_weight, + const Tensor& input, + const Tensor& target, + const Tensor& weight, + int64_t reduction, + int64_t ignore_index, + bool is2D) +{ + std::vector reshapedTarget(target.sizes().begin(), target.sizes().end()); + reshapedTarget.push_back(1); + + Tensor batchSizeTensor = at::empty_like(input).resize_(IntArrayRef(1)); + float batchVal = 1.0f; + for(size_t i = 0; i < reshapedTarget.size(); ++i) + batchVal *= reshapedTarget[i]; + batchSizeTensor[0] = batchVal; + + if(reduction == Reduction::None) + output.resize_(target.sizes()); + if(reduction == Reduction::Sum) + output.resize_({}); + if(reduction == Reduction::Mean) + output.resize_({}); + + TORCH_CHECK(output.is_mps()); + + // Empty output + if(output.numel() == 0) + return; + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* targetTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* batchSizeTensor_ = nil; + MPSGraphTensor* totalWeightTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + + @autoreleasepool { + + bool isWeightsArrayValid = (weight.numel() > 0); + + MPSShape* input_shape = getMPSShape(input); + MPSShape* target_shape = getMPSShape(target); + MPSShape* weight_shape = getMPSShape(weight); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + // TODO: Make the key + string key = "nllnd_loss_forward_impl:" + 
to_string(ignore_index) + ":" + + to_string(isWeightsArrayValid) + ":" + + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()) + ":" + + getMPSTypeString(weight.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type()), target_shape); + MPSGraphTensor* weightTensor = nil; + if(isWeightsArrayValid) + weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(weight.scalar_type()), weight_shape); + MPSGraphTensor* mps_batchSizeTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batchSizeTensor.scalar_type())); + + MPSGraphTensor* mpsGraphBatchSizeTensor = mps_batchSizeTensor; + + // The transposes are needed to get the class dimension (dim 1) to the inner most dim for gather op. + // The transpose become nop in the 2D case. + MPSGraphTensor* mpsTransposeTensor = inputTensor; + int classDim = 1; + int lastDim = input.sizes().size()-1; + mpsTransposeTensor = [mpsGraph transposeTensor:inputTensor + dimension:classDim + withDimension:lastDim + name:nil]; + for(int i = 0; i < lastDim - 2; ++i) + { + mpsTransposeTensor = [mpsGraph transposeTensor:mpsTransposeTensor + dimension:classDim+i + withDimension:classDim+i+1 name:nil]; + } + + + MPSGraphTensor* mpsGatherTensor = [mpsGraph gatherWithUpdatesTensor:mpsTransposeTensor + indicesTensor:targetTensor + axis:lastDim + batchDimensions:lastDim + name:@"gatherTensor"]; + + bool isIgnoreIndexValid = (ignore_index != -100); + MPSGraphTensor* weightGatherTensor; + + if(isWeightsArrayValid) + { + weightGatherTensor = [mpsGraph gatherWithUpdatesTensor:weightTensor + indicesTensor:targetTensor + axis:0 + batchDimensions:0 + name:@"weightGatherTensor"]; + MPSGraphTensor *mpsGatherCopyTensor = [mpsGraph identityWithTensor:mpsGatherTensor + name:@"identityTensor"]; + mpsGatherTensor = [mpsGraph multiplicationWithPrimaryTensor:weightGatherTensor + secondaryTensor:mpsGatherCopyTensor + name:@"scaledLossTensor"]; + } + + // Both these cases need recomputation of denominator when reductionMode == mean + if(isIgnoreIndexValid || isWeightsArrayValid) + { + // Setup tensors + MPSGraphTensor *mpsGraphZeroTensor = [mpsGraph constantWithScalar:0.0 + dataType:mpsGatherTensor.dataType]; + MPSGraphTensor *mpsGraphOneTensor = [mpsGraph constantWithScalar:1.0 + dataType:mpsGatherTensor.dataType]; + // @TODO: Remove this identity call with ToT StarSky MPSGraph + MPSGraphTensor *mpsGraphOneTensorCopy = [mpsGraph identityWithTensor:mpsGraphOneTensor + name:@"IdentityHackTensor"]; + + MPSGraphTensor *mpsGraphIsEqualTensor; + + if(isIgnoreIndexValid) + { + MPSGraphTensor *mpsGraphIndexTensor = [mpsGraph constantWithScalar:ignore_index + dataType:MPSDataTypeInt64]; + // Equal tensor + mpsGraphIsEqualTensor = [mpsGraph equalWithPrimaryTensor:targetTensor + secondaryTensor:mpsGraphIndexTensor + name:@"isEqualTensor"]; + // Zero out loss + MPSGraphTensor *mpsGatherCopyTensor = [mpsGraph identityWithTensor:mpsGatherTensor + name:@"identityTensor"]; + 
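          // Where target == ignore_index, substitute zero for the gathered loss value so that
          // ignored elements contribute neither to the loss nor to the mean-reduction denominator below.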
mpsGatherTensor = [mpsGraph selectWithPredicateTensor:mpsGraphIsEqualTensor + truePredicateTensor:mpsGraphZeroTensor + falsePredicateTensor:mpsGatherCopyTensor + name:@"predicateTensor"]; + } + + if(isWeightsArrayValid) + { + mpsGraphOneTensorCopy = weightGatherTensor; + if(!isIgnoreIndexValid) + { + mpsGraphIsEqualTensor = [mpsGraph constantWithScalar: 0.0 + shape: targetTensor.shape + dataType: targetTensor.dataType]; + } + } + + // Compute new batch size + MPSGraphTensor* mpsSelectOneTensor = [mpsGraph selectWithPredicateTensor:mpsGraphIsEqualTensor + truePredicateTensor:mpsGraphZeroTensor + falsePredicateTensor:mpsGraphOneTensorCopy + name:@"predicateOneTensor"]; + mpsGraphBatchSizeTensor = [mpsGraph reductionSumWithTensor:mpsSelectOneTensor + axes:nil + name:@"batchSizeReductionTensor"]; + } + + MPSGraphTensor *mpsGraphNegTensor = [mpsGraph negativeWithTensor:mpsGatherTensor + name:@"negativeTensor"]; + + MPSGraphTensor* mpsGraphReducedTensor = mpsGraphNegTensor; + + if(!(reduction == Reduction::None)) + { + mpsGraphReducedTensor = [mpsGraph reductionSumWithTensor:mpsGraphNegTensor + axes:nil + name:@"reductionSumTensor"]; + if(reduction == Reduction::Mean) + { + mpsGraphReducedTensor = [mpsGraph divisionNoNaNWithPrimaryTensor:mpsGraphReducedTensor + secondaryTensor:mpsGraphBatchSizeTensor + name:@"divisionTensor"]; + } + } + + mpsGraphReducedTensor = [mpsGraph reshapeTensor:mpsGraphReducedTensor + withShape:getMPSShape(output) + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = targetTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->batchSizeTensor_ = mps_batchSizeTensor; + newCachedGraph->totalWeightTensor_ = mpsGraphBatchSizeTensor; + newCachedGraph->outputTensor_ = mpsGraphReducedTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); + Placeholder weightPlaceholder = Placeholder(); + if(isWeightsArrayValid) + weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + Placeholder batchSizePlaceholder = Placeholder(cachedGraph->batchSizeTensor_, batchSizeTensor); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder totalWeightsPlaceholder = Placeholder(cachedGraph->totalWeightTensor_, total_weight); + + // Create dictionary of inputs and outputs + NSMutableDictionary* feeds = [[NSMutableDictionary alloc] initWithCapacity: 4]; + feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); + feeds[targetPlaceholder.getMPSGraphTensor()] = targetPlaceholder.getMPSGraphTensorData(); + feeds[batchSizePlaceholder.getMPSGraphTensor()] = batchSizePlaceholder.getMPSGraphTensorData(); + + if(isWeightsArrayValid) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + totalWeightsPlaceholder.getMPSGraphTensor() : totalWeightsPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return; +} + +void smooth_l1_loss_impl( + const Tensor &input, + const Tensor &target, + const int64_t reduction, + double beta, + const Tensor &output, + MPSShape *mpsInputShape, + MPSShape *mpsOutputShape) +{ + struct CachedGraph : public MPSCachedGraph + { + 
CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *targetTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_shape = getMPSShape(input); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "smooth_l1_loss_impl:" + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + to_string(beta) + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + // smooth_l1_loss_mps: + // ln = 0.5 * ( xn - yn ) ^ 2 / beta, if |xn - yn| < beta + // = | xn - yn | - 0.5 * beta, otherwise + + @autoreleasepool { + // Initialize graph + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type())); + MPSGraphTensor *targetTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type())); + + // Setup tensors + MPSGraphTensor *mpsGraphZeroTensor = [mpsGraph constantWithScalar: 0.0 + dataType: inputTensor.dataType]; + MPSGraphTensor *mpsGraphOneTensor = [mpsGraph constantWithScalar: 1.0 + dataType: inputTensor.dataType]; + MPSGraphTensor *mpsGraphHalfTensor = [mpsGraph constantWithScalar: 0.5 + dataType: MPSDataTypeFloat32]; + MPSGraphTensor *betaTensor = [mpsGraph constantWithScalar: beta + dataType: MPSDataTypeFloat32]; + // 0.5 * beta + MPSGraphTensor *halfTensorMulBetaTensor = [mpsGraph constantWithScalar: beta * 0.5 + dataType: MPSDataTypeFloat32]; + // Calculating first part of the equation: + // ln = 0.5(xn - yn)^2/beta, if |xn - yn| < beta + + // xn - yn + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: inputTensor + secondaryTensor: targetTensor + name: nil]; + + // | xn - yn | + MPSGraphTensor *diffAbsTensor = [mpsGraph absoluteWithTensor: diffTensor + name: nil]; + + // | xn - yn | < beta + MPSGraphTensor *diffAbsLessThanBetaTensor = [mpsGraph lessThanWithPrimaryTensor: diffAbsTensor + secondaryTensor: betaTensor + name: nil]; + + // ( xn - yn ) ^ 2 + MPSGraphTensor *diffSquare = [mpsGraph squareWithTensor: diffTensor + name: nil]; + + // 0.5 * ( xn - yn ) ^ 2 + MPSGraphTensor *diffSquareMulHalfTensor = [mpsGraph multiplicationWithPrimaryTensor: diffSquare + secondaryTensor: mpsGraphHalfTensor + name: nil]; + + // 0.5 * ( xn - yn ) ^ 2 / beta + MPSGraphTensor *loss1Temp = [mpsGraph divisionWithPrimaryTensor: diffSquareMulHalfTensor + secondaryTensor: betaTensor + name: nil]; + + // Calculating second part of the equation: + // | xn - yn | - 0.5 * beta, if | xn - yn | >= beta + + // | xn - yn | - 0.5 * beta + MPSGraphTensor *loss2Temp = [mpsGraph subtractionWithPrimaryTensor: diffAbsTensor + secondaryTensor: halfTensorMulBetaTensor + name: nil]; + + MPSGraphTensor *lossTensor = [mpsGraph selectWithPredicateTensor: diffAbsLessThanBetaTensor + truePredicateTensor: loss1Temp + falsePredicateTensor: loss2Temp + name: @"lossTensor"]; + + MPSGraphTensor *outputTensor = reduceTensor(lossTensor, reduction, mpsGraph, 1); + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = 
targetTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input, mpsInputShape); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target, mpsInputShape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, mpsOutputShape); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder .getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +void smooth_l1_loss_template( + const Tensor &input, + const Tensor &target, + const int64_t reduction, + double beta, + const Tensor &output) +{ + TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta."); + TORCH_CHECK(input.is_mps()); + TORCH_CHECK(target.is_mps()); + + MPSShape *mpsInputShape = nil; + MPSShape *mpsOutputShape = nil; + + // Determine the shape of the output + // If the reduction is 'mean' or 'sum', the output shape is a scalar, + // otherwise, the output shape is the same shape as input + if (reduction == Reduction::Mean || reduction == Reduction::Sum) + { + // Output: scalar, if reduction is 'mean' or 'sum' + IntArrayRef input_shape = input.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_input_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + // Output is a single value in case reduction is set to mean or sum + NSMutableArray *apparent_out_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_out_shape[0] = @1; + mpsInputShape = apparent_input_shape; + mpsOutputShape = apparent_out_shape; + } + else + { + // Output: If reduction is 'none', then (N, *); same shape as the input + assert(reduction == Reduction::None); + mpsInputShape = getMPSShape(input); + mpsOutputShape = mpsInputShape; + //resize_tensor(&output); + } + TORCH_CHECK(output.is_mps()); + + smooth_l1_loss_impl( + input, + target, + reduction, + beta, + output, + mpsInputShape, + mpsOutputShape); +} + +void smooth_l1_loss_backward_impl( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) +{ + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *targetTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + MPSStream *stream= getCurrentMPSStream(); + + @autoreleasepool { + + auto numClasses = grad_input.sizes()[1]; + MPSShape* input_shape = getMPSShape(input); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "smooth_l1_loss_backward_impl:" + to_string(numClasses) + ":" + + reductionToString(reduction) + ":" + + [ns_shape_key UTF8String] + ":" + + to_string(beta) + ":" + + getMPSTypeString(input.scalar_type()) + ":" + + getMPSTypeString(target.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + 
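    // The cached graph built below computes the elementwise SmoothL1 derivative, assuming the usual
    // piecewise definition:
    //   d(ln)/d(xn) = (xn - yn) / beta     if |xn - yn| < beta
    //               = sign(xn - yn)        otherwise (realized as (xn - yn) / |xn - yn|)
    // and divides by input.numel() when reduction == Mean.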
if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + auto numElements = input.numel(); + + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type())); + MPSGraphTensor *targetTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(target.scalar_type())); + + MPSGraphTensor *betaTensor = [mpsGraph constantWithScalar: beta + dataType: MPSDataTypeFloat32]; + + MPSGraphTensor *numelTensor = [mpsGraph constantWithScalar: numElements + dataType: MPSDataTypeFloat32]; + + // xn - yn + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: inputTensor + secondaryTensor: targetTensor + name: nil]; + + // | xn - yn | + MPSGraphTensor *diffAbsTensor = [mpsGraph absoluteWithTensor: diffTensor + name: nil]; + + // | xn - yn | < beta + MPSGraphTensor *diffAbsLessThanBetaTensor = [mpsGraph lessThanWithPrimaryTensor: diffAbsTensor + secondaryTensor: betaTensor + name: nil]; + + // ( xn - yn ) / beta + MPSGraphTensor *truePredicateTensor = [mpsGraph divisionWithPrimaryTensor: diffTensor + secondaryTensor: betaTensor + name: nil]; + + // ( x - y ) / | x - y | + MPSGraphTensor *falsePredicateTensor = [mpsGraph divisionWithPrimaryTensor: diffTensor + secondaryTensor: diffAbsTensor + name: nil]; + + MPSGraphTensor *lossTensor = [mpsGraph selectWithPredicateTensor: diffAbsLessThanBetaTensor + truePredicateTensor: truePredicateTensor + falsePredicateTensor: falsePredicateTensor + name: @"lossTensor"]; + + MPSGraphTensor *outputTensor = lossTensor; + if (reduction == Reduction::Mean) + { + outputTensor = [mpsGraph divisionWithPrimaryTensor: lossTensor + secondaryTensor: numelTensor + name: nil]; + } + + MPSGraphTensor *gradInputTensor = outputTensor; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->targetTensor_ = targetTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder .getMPSGraphTensorData() + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +void smooth_l1_loss_backward_template( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) +{ + TORCH_CHECK(beta >= 0, "smooth_l1_loss_backward does not support negative values for beta."); + TORCH_CHECK(input.is_mps()); + TORCH_CHECK(target.is_mps()); + + smooth_l1_loss_backward_impl( + grad_output, input, target, reduction, beta, grad_input + ); +} + +} // namespace mps + +// APIs exposed to at::native scope + +// MSELoss +TORCH_IMPL_FUNC(mse_loss_out_mps) ( + const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& output) { + string op_name = __func__; + using namespace mps; + TORCH_CHECK(target.is_same_size(input), 
op_name + ": target and input tensors must have identical shapes") + TORCH_CHECK(output.is_mps()); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor = nil; + MPSGraphTensor* targetTensor = nil; + MPSGraphTensor* outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); + + MPSGraphTensor *diffTensor = [mpsGraph subtractionWithPrimaryTensor: newCachedGraph->inputTensor + secondaryTensor: newCachedGraph->targetTensor + name: nil]; + MPSGraphTensor *diffSquareTensor = [mpsGraph squareWithTensor: diffTensor + name: nil]; + newCachedGraph->outputTensor = reduceTensor(diffSquareTensor, reduction, mpsGraph, input.sizes().size()); + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +Tensor& mse_loss_backward_out_mps(const Tensor& grad_output, const Tensor& input, + const Tensor& target, int64_t reduction, Tensor& grad_input) +{ + return mps::mse_loss_backward_out_impl(grad_output, input, target, reduction, grad_input, __func__); +} + +Tensor mse_loss_backward_mps(const Tensor& grad_output, const Tensor& input, + const Tensor& target, int64_t reduction) +{ + Tensor grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::mse_loss_backward_out_impl(grad_output, input, target, reduction, grad_input, __func__); +} + +// BCELoss +Tensor& binary_cross_entropy_out_mps(const Tensor& input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction, Tensor& loss) +{ + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, loss, c10::nullopt, __func__); +} + +Tensor binary_cross_entropy_mps(const Tensor& input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction) +{ + Tensor loss = at::empty_like(input); + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, loss, c10::nullopt, __func__); +} + +Tensor& binary_cross_entropy_backward_out_mps(const Tensor& grad_output, const Tensor& input, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, Tensor& grad_input) +{ + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, grad_input, grad_output, __func__); +} + +Tensor binary_cross_entropy_backward_mps(const Tensor& grad_output, const Tensor& 
input, const Tensor& target, + const c10::optional& weight_opt, int64_t reduction) +{ + Tensor grad_input = at::empty_like(input); + return mps::BCELoss::bce_loss_out_impl(input, target, weight_opt, reduction, grad_input, grad_output, __func__); +} + +// SmoothL1Loss +TORCH_IMPL_FUNC(smooth_l1_loss_out_mps)( + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + const Tensor& result) { + mps::smooth_l1_loss_template( + input, target, reduction, beta, result); +} + +Tensor& smooth_l1_loss_backward_out_mps( + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + int64_t reduction, + double beta, + Tensor& grad_input) { + mps::smooth_l1_loss_backward_template( + grad_output, input, target, reduction, beta, grad_input); + return grad_input; +} + +// NLLLoss +TORCH_IMPL_FUNC(nll_loss_backward_out_mps) +(const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight, + const Tensor& grad_input +) +{ + const Tensor& weight = weight_opt.getTensorRef(); + + mps::nllnd_loss_backward_impl((Tensor &)grad_input, + grad_output, + self, + target, + weight, + reduction, + ignore_index, + total_weight, + false); + return; +} + +TORCH_IMPL_FUNC(nll_loss_forward_out_mps) +(const Tensor& self, + const Tensor& target, + const OptionalTensorRef weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& output, + const Tensor& total_weight) { + + const Tensor& weight = weight_opt.getTensorRef(); + + mps::nllnd_loss_forward_impl((Tensor &)output, + (Tensor &)total_weight, + self, + target, + weight, + reduction, + ignore_index, + false); + + return; +} + +inline void check_inputs_nll_loss2d( + const Tensor& input, + const Tensor& target, + const Tensor& weight) { + TORCH_CHECK( + target.dim() == 3, + "only batches of spatial targets supported (3D tensors)" + " but got targets of dimension: ", + target.dim()); + TORCH_CHECK( + input.dim() == 4, + "only batches of spatial inputs supported (4D tensors), " + "but got input of dimension: ", + input.dim()); + TORCH_CHECK( + !weight.defined() || weight.numel() == input.size(1), + "weight tensor should be defined either for all or no classes"); + + const int64_t input0 = input.size(0); + const int64_t input2 = input.size(2); + const int64_t input3 = input.size(3); + const int64_t target0 = target.size(0); + const int64_t target1 = target.size(1); + const int64_t target2 = target.size(2); + TORCH_CHECK( + input0 == target0 && input2 == target1 && input3 == target2, + "size mismatch (got input: ", + input.sizes(), + " , target: ", + target.sizes()); +} + + +void nll_loss2d_forward_out_mps_template( + Tensor& output, + Tensor& total_weight, + const Tensor& input, + const Tensor& target, + const Tensor& weight, + int64_t reduction, + int64_t ignore_index) { + check_inputs_nll_loss2d(input, target, weight); + total_weight.resize_({}); + + mps::nllnd_loss_forward_impl(output, + total_weight, + input, + target, + weight, + reduction, + ignore_index, + true); + + return; +} + +std::tuple nll_loss2d_forward_out_mps(const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index, + Tensor& output, + Tensor& total_weight) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + 
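  // Delegate to the shared 2D template, which validates the 4D input / 3D target shapes and
  // forwards to the generic N-D implementation with is2D = true.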
nll_loss2d_forward_out_mps_template( + output, total_weight, self, target, weight, reduction, ignore_index); + return std::tuple(output, total_weight); +} + +std::tuple nll_loss2d_forward_mps( + const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + auto output = at::empty({0}, self.options()); + auto total_weight = at::empty({0}, self.options()); + at::native::nll_loss2d_forward_out_mps( + self, target, weight, reduction, ignore_index, output, total_weight); + return std::make_tuple(output, total_weight); +} + +void nll_loss2d_backward_out_mps_template( + Tensor& grad_input, + const Tensor& grad_output, + const Tensor& input, + const Tensor& target, + const Tensor& weight, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight) { + check_inputs_nll_loss2d(input, target, weight); + grad_input.resize_as_(input); + grad_input.zero_(); + TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous"); + TORCH_CHECK( + total_weight.numel() == 1, + "expected total_weight to be a single element tensor, got: ", + total_weight.sizes(), + " (", + total_weight.numel(), + " elements)"); + + mps::nllnd_loss_backward_impl(grad_input, + grad_output, + input, + target, + weight, + reduction, + ignore_index, + total_weight, + true); + + return; +} + +Tensor& nll_loss2d_backward_out_mps(const Tensor& grad_output, + const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight, + Tensor& grad_input) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + nll_loss2d_backward_out_mps_template( + grad_input, + grad_output, + self, + target, + weight, + reduction, + ignore_index, + total_weight); + return grad_input; +} + +Tensor nll_loss2d_backward_mps( + const Tensor& grad_output, + const Tensor& self, + const Tensor& target, const c10::optional& weight_opt, + int64_t reduction, + int64_t ignore_index, + const Tensor& total_weight) { + + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + + auto grad_input = at::zeros_like(self); + nll_loss2d_backward_out_mps( + grad_output, + self, + target, + weight, + reduction, + ignore_index, + total_weight, + grad_input); + return grad_input; +} + + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm new file mode 100644 index 000000000000..dc85e13a9f29 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Normalization.mm @@ -0,0 +1,804 @@ +// Copyright © 2022 Apple Inc. 
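// Overview (a hedged summary of the code below, not authoritative): the forward
// kernel returns (output, save_mean, save_var), where save_var holds the *biased*
// batch variance without epsilon — other backends typically save the inverse
// standard deviation instead, related by invstd = 1 / sqrt(var + eps).
// When training with running statistics, the buffers are updated in place as
//
//   unbiased_var = batch_var * N / (N - 1)                   // Bessel correction
//   running_mean = momentum * batch_mean   + (1 - momentum) * running_mean
//   running_var  = momentum * unbiased_var + (1 - momentum) * running_var
//
// with N the number of elements per channel, matching the graph built in
// batch_norm_mps_out below.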
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +void get_shapes(MPSShape* input_shape_readonly, + NSMutableArray* &input_shape, + NSMutableArray* &new_mean_shape, + NSMutableArray* &axes, + int num_input_dims, c10::MemoryFormat memory_format, + bool isBackward) { + // Modify the shape + if(memory_format == at::MemoryFormat::Contiguous) { + for(int i = 0; i < num_input_dims; i++) + input_shape[i] = input_shape_readonly[i]; + } + else { // ChannelsLast + auto num_channels = input_shape_readonly[1]; + input_shape[0] = input_shape_readonly[0]; + for(int i = 1; i < num_input_dims-1; i++) + input_shape[i] = input_shape_readonly[i+1]; + input_shape[num_input_dims-1] = num_channels; + } + + // Mean shape should remain unchanged in backward + if(memory_format == at::MemoryFormat::Contiguous || isBackward) { + new_mean_shape[0] = @1; + new_mean_shape[1] = input_shape_readonly[1]; + for(int i = 2; i < num_input_dims; i++) + new_mean_shape[i] = @1; + } + else if(memory_format == at::MemoryFormat::ChannelsLast) { + for(int i = 0; i < num_input_dims-1; i++) + new_mean_shape[i] = @1; + new_mean_shape[num_input_dims-1] = input_shape[num_input_dims-1]; + } + + // Set axes of reduction + if(memory_format == at::MemoryFormat::Contiguous || isBackward) { + axes[0] = @0; + for(int i = 2; i < num_input_dims; i++) + axes[i-1] = [NSNumber numberWithInt:i]; + } + else { + for(int i = 0; i < num_input_dims-1; i++) + axes[i] = [NSNumber numberWithInt:i]; + } +} + +// Inverse standard deviation now becomes variance (without epsilon) +std::tuple batch_norm_mps_out + (const Tensor& self, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + bool train, double momentum, double epsilon, + Tensor& output, + Tensor& save_mean, + Tensor& save_var) { + + namespace native_mps = at::native::mps; + + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* biasTensor_ = nil; + MPSGraphTensor* runningMeanTensor_ = nil; + MPSGraphTensor* runningVarTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* saveMeanTensor_ = nil; + MPSGraphTensor* saveVarTensor_ = nil; + MPSGraphTensor* runningMeanInplaceUpdate_ = nil; + MPSGraphTensor* runningVarInplaceUpdate_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); + const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); + TORCH_CHECK(has_running_mean == has_running_var); + + const bool has_weight = (weight_opt.has_value() && weight_opt->defined()); + const bool has_bias = (bias_opt.has_value() && bias_opt->defined()); + + const auto memory_format = self.suggest_memory_format(); + + if (output.numel() == 0) { + return std::tuple(output, save_mean, save_var);; + } + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + // Number of elements in one channel, needed for bessel 
correction term + const int64_t N = self.numel() / save_mean.numel(); + MPSShape* input_shape_readonly = mps::getMPSShape(self); + int num_input_dims = [input_shape_readonly count]; + // Input shape changes based on memory format + NSMutableArray* input_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Shape which can be broadcasted with input + NSMutableArray* new_mean_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Reduction axes + NSMutableArray* axes = [NSMutableArray arrayWithCapacity:(num_input_dims-1)]; + + get_shapes(input_shape_readonly, input_shape, new_mean_shape, axes, num_input_dims, memory_format, false); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "batch_norm_mps_out:" + mem_format_key + ":" + std::to_string(epsilon) + ":" + + std::to_string(momentum) + ":" + std::to_string(train) + ":" + + std::to_string(has_running_mean) + ":" + + std::to_string(has_weight) + ":" + std::to_string(has_bias) + ":" + + [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(self.scalar_type()); + auto input_mps_dtype = native_mps::getMPSDataType(self.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + // Dim where channels are located + int channelsDim; + if(memory_format == at::MemoryFormat::Contiguous) + channelsDim = 1; + else + channelsDim = num_input_dims - 1; + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_mps_dtype, input_shape); + MPSGraphTensor* weightTensor = nil; + // Should have shape of mean + if(has_weight) + weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(weight_opt.value().scalar_type()), new_mean_shape); + MPSGraphTensor* biasTensor = nil; + if(has_bias) + biasTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(bias_opt.value().scalar_type()), new_mean_shape); + MPSGraphTensor* runningMeanTensor = nil; + MPSGraphTensor* runningVarTensor = nil; + if(has_running_mean) { + runningMeanTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_mean_opt.value().scalar_type()), new_mean_shape); + runningVarTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_var_opt.value().scalar_type()), new_mean_shape); + } + + // Mean and inv std tensors to be saved and returned + MPSGraphTensor* saveMeanTensor = nil; + MPSGraphTensor* saveVarTensor = nil; + + // Running stats inplace update + MPSGraphTensor* runningMeanInplaceUpdate = nil; + MPSGraphTensor* runningVarInplaceUpdate = nil; + + MPSGraphTensor* updatedRunningMeanTensor = nil; + MPSGraphTensor* updatedRunningVarTensor = nil; + + /* + If train: + If has_running_mean: + Update the running stats to be stored into save_mean and save_var, + AND to be used in current batchnorm computation + Else: + Just calculate the var using batch variance + If not train: + Check if running mean exists (maybe do this check before making graph) + Copy the running mean into the mean to be saved + Calculate the save_var directly from the running variance + + Compute the batch norm output and stats to be saved + */ + + if(train) { + // 
Compute mean and variance of the current batch + MPSGraphTensor* batchMeanTensor = [mpsGraph meanOfTensor:inputTensor + axes:axes + name:nil]; + MPSGraphTensor* batchVarianceTensor = [mpsGraph varianceOfTensor:inputTensor + axes:axes + name:nil]; + if(has_running_mean) { + // TODO: This is not the formula used in PyTorch, is this OK? Seems more robust + // float besselCorrectionTerm = float(N) / std::max(N - 1.0f, 1.0f); + float besselCorrectionTerm = float(N) / float(N - 1.0f); + MPSGraphTensor* besselConstantTensor = [mpsGraph constantWithScalar:(double)besselCorrectionTerm + shape:@[@1] + dataType:input_mps_dtype]; + MPSGraphTensor* unbiasedVarianceTensor = [mpsGraph multiplicationWithPrimaryTensor:batchVarianceTensor + secondaryTensor:besselConstantTensor + name:nil]; + MPSGraphTensor* momentumTensor = [mpsGraph constantWithScalar:(double)momentum + shape:@[@1] + dataType:input_mps_dtype]; + MPSGraphTensor* oneMinusMomentum = [mpsGraph constantWithScalar:(double)(1.0 - momentum) + shape:@[@1] + dataType:input_mps_dtype]; + // Compute updated running mean + MPSGraphTensor* scaledBatchMean = [mpsGraph multiplicationWithPrimaryTensor:batchMeanTensor + secondaryTensor:momentumTensor + name:nil]; + MPSGraphTensor* scaledRunningMean = [mpsGraph multiplicationWithPrimaryTensor:runningMeanTensor + secondaryTensor:oneMinusMomentum + name:nil]; + updatedRunningMeanTensor = [mpsGraph additionWithPrimaryTensor:scaledBatchMean + secondaryTensor:scaledRunningMean + name:nil]; + // Compute updated running var + MPSGraphTensor* scaledCorrectedBatchVar = [mpsGraph multiplicationWithPrimaryTensor:unbiasedVarianceTensor + secondaryTensor:momentumTensor + name:nil]; + MPSGraphTensor* scaledRunningVar = [mpsGraph multiplicationWithPrimaryTensor:runningVarTensor + secondaryTensor:oneMinusMomentum + name:nil]; + updatedRunningVarTensor = [mpsGraph additionWithPrimaryTensor:scaledCorrectedBatchVar + secondaryTensor:scaledRunningVar + name:nil]; + // Update saved mean and inverse std tensor + saveMeanTensor = batchMeanTensor; + saveVarTensor = batchVarianceTensor; + } + else { + saveMeanTensor = batchMeanTensor; + saveVarTensor = batchVarianceTensor; + } + } + else { // Test + TORCH_CHECK(has_running_mean); + saveMeanTensor = [mpsGraph identityWithTensor:runningMeanTensor + name:nil]; + saveVarTensor = [mpsGraph identityWithTensor:runningVarTensor + name:nil]; + } + + // Compute output of batch norm + MPSGraphTensor* outputTensor = [mpsGraph normalizationWithTensor:inputTensor + meanTensor:saveMeanTensor + varianceTensor:saveVarTensor + gammaTensor:weightTensor + betaTensor:biasTensor + epsilon:(float)epsilon + name:nil]; + + // Reshape saved mean and var to fit output + saveMeanTensor = [mpsGraph reshapeTensor:saveMeanTensor + withShape:@[new_mean_shape[channelsDim]] + name:nil]; + saveVarTensor = [mpsGraph reshapeTensor:saveVarTensor + withShape:@[new_mean_shape[channelsDim]] + name:nil]; + + if(train && has_running_mean) { + // Running stats inplace update + runningMeanInplaceUpdate = [mpsGraph reshapeTensor:updatedRunningMeanTensor + withShape:@[input_shape[channelsDim]] + name:nil]; + runningVarInplaceUpdate = [mpsGraph reshapeTensor:updatedRunningVarTensor + withShape:@[input_shape[channelsDim]] + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->biasTensor_ = biasTensor; + newCachedGraph->runningMeanTensor_ = runningMeanTensor; + newCachedGraph->runningVarTensor_ = runningVarTensor; + newCachedGraph->outputTensor_ = 
outputTensor; + newCachedGraph->saveMeanTensor_ = saveMeanTensor; + newCachedGraph->saveVarTensor_ = saveVarTensor; + newCachedGraph->runningMeanInplaceUpdate_ = runningMeanInplaceUpdate; + newCachedGraph->runningVarInplaceUpdate_ = runningVarInplaceUpdate; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, self, input_shape); + auto weightPlaceholder = native_mps::Placeholder(); + if(has_weight) + weightPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_opt.value(), new_mean_shape); + auto biasPlaceholder = native_mps::Placeholder(); + if(has_bias) + biasPlaceholder = native_mps::Placeholder(cachedGraph->biasTensor_, bias_opt.value(), new_mean_shape); + auto runningMeanPlaceholder = native_mps::Placeholder(); + auto runningVarPlaceholder = native_mps::Placeholder(); + if(has_running_mean) { + runningMeanPlaceholder = native_mps::Placeholder(cachedGraph->runningMeanTensor_, running_mean_opt.value(), new_mean_shape); + runningVarPlaceholder = native_mps::Placeholder(cachedGraph->runningVarTensor_, running_var_opt.value(), new_mean_shape); + } + + auto runningMeanInplaceUpdatePlaceholder = native_mps::Placeholder(); + auto runningVarInplaceUpdatePlaceholder = native_mps::Placeholder(); + + if(train && has_running_mean) { + runningMeanInplaceUpdatePlaceholder = native_mps::Placeholder(cachedGraph->runningMeanInplaceUpdate_, running_mean_opt.value()); + runningVarInplaceUpdatePlaceholder = native_mps::Placeholder(cachedGraph->runningVarInplaceUpdate_, running_var_opt.value()); + } + + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output, input_shape); + auto saveMeanPlaceholder = native_mps::Placeholder(cachedGraph->saveMeanTensor_, save_mean); + auto saveVarPlaceholder = native_mps::Placeholder(cachedGraph->saveVarTensor_, save_var); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + if(has_weight) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + if(has_bias) + feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); + if(has_running_mean) { + feeds[runningMeanPlaceholder.getMPSGraphTensor()] = runningMeanPlaceholder.getMPSGraphTensorData(); + feeds[runningVarPlaceholder.getMPSGraphTensor()] = runningVarPlaceholder.getMPSGraphTensorData(); + } + + NSMutableDictionary *results = [[NSMutableDictionary new] autorelease]; + results[outputPlaceholder.getMPSGraphTensor()] = outputPlaceholder.getMPSGraphTensorData(); + results[saveMeanPlaceholder.getMPSGraphTensor()] = saveMeanPlaceholder.getMPSGraphTensorData(); + results[saveVarPlaceholder.getMPSGraphTensor()] = saveVarPlaceholder.getMPSGraphTensorData(); + + // If train and has_running_mean, add updated running mean to the output + if(train && has_running_mean) { + results[runningMeanInplaceUpdatePlaceholder.getMPSGraphTensor()] = runningMeanInplaceUpdatePlaceholder.getMPSGraphTensorData(); + results[runningVarInplaceUpdatePlaceholder.getMPSGraphTensor()] = runningVarInplaceUpdatePlaceholder.getMPSGraphTensorData(); + } + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return std::tuple(output, save_mean, save_var); +} + +std::tuple batch_norm_mps + (const Tensor& self, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + const c10::optional& 
running_mean_opt, + const c10::optional& running_var_opt, + bool train, + double momentum, + double epsilon) { + + const auto memory_format = self.suggest_memory_format(); + + auto output = at::native::empty_mps( + self.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + int64_t n_input = self.size(1); + + auto save_mean = at::native::empty_mps( + {n_input}, + self.scalar_type(), + // TODO: Accumulate type? + // at::toAccumulateType(self.scalar_type(), /*is_cuda=*/false), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + auto save_var = at::native::empty_mps( + {n_input}, + self.scalar_type(), + // TODO: Accumulate type? + // at::toAccumulateType(self.scalar_type(), /*is_cuda=*/false), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + at::native::batch_norm_mps_out( + self, + weight_opt, + bias_opt, + running_mean_opt, + running_var_opt, + train, + momentum, + epsilon, + output, + save_mean, + save_var); + return std::make_tuple(output, save_mean, save_var); +} + +string get_mem_string(c10::MemoryFormat memory_format) { + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Invalid memory format\n"); + } + + return mem_format_key; +} + +// Batch norm backward +std::tuple batch_norm_backward_mps + (const Tensor& grad_out, + const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, + const c10::optional& save_var_opt, + bool train, + double epsilon, + std::array grad_input_mask) { + + Tensor grad_input; + Tensor grad_weight; + Tensor grad_bias; + + const auto memory_format = input.suggest_memory_format(); + + if (grad_input_mask[0]) { + grad_input = at::native::empty_mps(input.sizes(), + input.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + } + // Assuming that if grad_input_mask of weight is 1, then the weight is available + if (grad_input_mask[1]) { + grad_weight = at::native::empty_mps(weight_opt.value().sizes(), + weight_opt.value().scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + at::MemoryFormat::Contiguous); + } + if (grad_input_mask[2]) { + grad_bias = at::native::empty_mps(weight_opt.value().sizes(), + weight_opt.value().scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + at::MemoryFormat::Contiguous); + } + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* runningMeanTensor_ = nil; + MPSGraphTensor* runningVarTensor_ = nil; + MPSGraphTensor* saveMeanTensor_ = nil; + MPSGraphTensor* saveVarTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + MPSGraphTensor* gradWeightTensor_ = nil; + MPSGraphTensor* gradBiasTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + const bool has_running_mean = (running_mean_opt.has_value() && running_mean_opt->defined()); + const bool has_running_var = (running_var_opt.has_value() && running_var_opt->defined()); + TORCH_CHECK(has_running_mean == has_running_var); + const bool 
has_save_mean = (save_mean_opt.has_value() && save_mean_opt->defined()); + const bool has_save_var = (save_var_opt.has_value() && save_var_opt->defined()); + TORCH_CHECK(has_save_mean == has_save_var); + + const bool has_weight = (weight_opt.has_value() && weight_opt->defined()); + + if (grad_input.numel() == 0) { + return std::make_tuple(grad_input, grad_weight, grad_bias); + } + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + MPSShape* input_shape_readonly = mps::getMPSShape(input); + int num_input_dims = [input_shape_readonly count]; + NSMutableArray* input_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Broadcast with input + NSMutableArray* new_mean_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Reduction axes + NSMutableArray* axes = [NSMutableArray arrayWithCapacity:(num_input_dims-1)]; + + get_shapes(input_shape_readonly, input_shape, new_mean_shape, axes, num_input_dims, memory_format, true); + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "batch_norm_backward_mps:" + mem_format_key + ":" + std::to_string(epsilon) + ":" + + std::to_string(train) + ":" + + std::to_string(has_running_mean) + ":" + + std::to_string(has_weight) + ":" + + [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(input.scalar_type()); + auto input_mps_dtype = native_mps::getMPSDataType(input.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // NCHW - Channels dim is 1 + int channelsDim = 1; + + MPSGraphTensor* inputTensorOriginal = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_mps_dtype, input_shape); + // Shape is the ORIGINAL NCHW shape + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(grad_out.scalar_type()), input_shape_readonly); + MPSGraphTensor* weightTensor = nil; + if(has_weight) + weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(weight_opt.value().scalar_type()), new_mean_shape); + MPSGraphTensor* runningMeanTensor = nil; + MPSGraphTensor* runningVarTensor = nil; + if(has_running_mean) { + runningMeanTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_mean_opt.value().scalar_type()), new_mean_shape); + runningVarTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(running_var_opt.value().scalar_type()), new_mean_shape); + } + + // Mean and inv std tensors to be saved and returned + MPSGraphTensor* saveMeanTensor = nil; + MPSGraphTensor* saveVarTensor = nil; + if(has_save_mean) { + saveMeanTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(save_mean_opt.value().scalar_type()), new_mean_shape); + saveVarTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(save_var_opt.value().scalar_type()), new_mean_shape); + } + + MPSGraphTensor* gradInputTensor = nil; + 
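          // The eval-mode (train == false) branch below hand-rolls the gradients
          // from the running statistics; as a hedged reference, the formulas it
          // implements are the standard fixed-statistics batch-norm gradients:
          //
          //   x_hat       = (x - running_mean) * rsqrt(running_var + eps)
          //   grad_bias   = sum(grad_out)           over the reduction axes
          //   grad_weight = sum(grad_out * x_hat)   over the reduction axes
          //   grad_input  = grad_out * weight * rsqrt(running_var + eps)
          //                 (the weight factor is dropped when no weight is given)
          //
          // The training branch instead delegates to MPSGraph's normalization
          // gradient ops using the saved batch mean and variance.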
MPSGraphTensor* gradWeightTensor = nil; + MPSGraphTensor* gradBiasTensor = nil; + MPSGraphTensor* inputTensor = nil; + + if(memory_format == at::MemoryFormat::Contiguous) + inputTensor = inputTensorOriginal; + else { + // Reshape/transpose the input as needed + auto N = input_shape[0]; + auto H = input_shape[1]; + auto W = input_shape[2]; + auto C = input_shape[3]; + + inputTensor = [mpsGraph reshapeTensor:inputTensorOriginal + withShape:@[N, ([NSNumber numberWithInt:[H intValue]* [W intValue]]), C] + name:nil]; + inputTensor = [mpsGraph transposeTensor:inputTensor + dimension:1 + withDimension:2 + name:nil]; + inputTensor = [mpsGraph reshapeTensor:inputTensor + withShape:@[N, C, H, W] + name:nil]; + } + + if(train) { + // Use save_mean and save_var + if(grad_input_mask[1]) { + gradWeightTensor = [mpsGraph normalizationGammaGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + meanTensor:saveMeanTensor + varianceTensor:saveVarTensor + reductionAxes:axes + epsilon:(float)epsilon + name:nil]; + } + if(grad_input_mask[2]) { + gradBiasTensor = [mpsGraph normalizationBetaGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + reductionAxes:axes + name:nil]; + } + if(grad_input_mask[0]) { + gradInputTensor = [mpsGraph normalizationGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + meanTensor:saveMeanTensor + varianceTensor:saveVarTensor + gammaTensor:weightTensor + gammaGradientTensor:gradWeightTensor + betaGradientTensor:gradBiasTensor + reductionAxes:axes + epsilon:(float) epsilon + name:nil]; + } + } + else { + // Use running mean and running var + MPSGraphTensor* rsqrtTensor = nil; + MPSGraphTensor* epsilonTensor = nil; + if(grad_input_mask[1]) { + epsilonTensor = [mpsGraph constantWithScalar:(float)epsilon + shape:@[@1] + dataType:input_mps_dtype]; + MPSGraphTensor* xMinusMean = [mpsGraph subtractionWithPrimaryTensor:inputTensor + secondaryTensor:runningMeanTensor + name:nil]; + MPSGraphTensor* varianceEpsTensor = [mpsGraph additionWithPrimaryTensor:runningVarTensor + secondaryTensor:epsilonTensor + name:nil]; + rsqrtTensor = [mpsGraph reverseSquareRootWithTensor:varianceEpsTensor + name:nil]; + MPSGraphTensor* bnForwardTensor = [mpsGraph multiplicationWithPrimaryTensor:xMinusMean + secondaryTensor:rsqrtTensor + name:nil]; + MPSGraphTensor* gradBnMulTensor = [mpsGraph multiplicationWithPrimaryTensor:bnForwardTensor + secondaryTensor:gradOutputTensor + name:nil]; + gradWeightTensor = [mpsGraph reductionSumWithTensor:gradBnMulTensor + axes:axes + name:nil]; + } + if(grad_input_mask[2]) { + gradBiasTensor = [mpsGraph normalizationBetaGradientWithIncomingGradientTensor:gradOutputTensor + sourceTensor:inputTensor + reductionAxes:axes + name:nil]; + } + if(grad_input_mask[0]) { + + MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0 + shape:input_shape_readonly + dataType:input_mps_dtype]; + if(!epsilonTensor) + epsilonTensor = [mpsGraph constantWithScalar:(float)epsilon + shape:@[@1] + dataType:input_mps_dtype]; + if(!rsqrtTensor) { + MPSGraphTensor* varianceEpsTensor = [mpsGraph additionWithPrimaryTensor:runningVarTensor + secondaryTensor:epsilonTensor + name:nil]; + rsqrtTensor = [mpsGraph reverseSquareRootWithTensor:varianceEpsTensor + name:nil]; + } + + gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:unitTensor + secondaryTensor:rsqrtTensor + name:nil]; + if(has_weight) + gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradInputTensor + secondaryTensor:weightTensor + name:nil]; 
+ gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradInputTensor + secondaryTensor:gradOutputTensor + name:nil]; + } + } + + if(grad_input_mask[1]) { + gradWeightTensor = [mpsGraph reshapeTensor:gradWeightTensor + withShape:@[input_shape_readonly[channelsDim]] + name:nil]; + } + if(grad_input_mask[2]) { + gradBiasTensor = [mpsGraph reshapeTensor:gradBiasTensor + withShape:@[input_shape_readonly[channelsDim]] + name:nil]; + } + + MPSGraphTensor* gradInputTensorFinal = nil; + + if(memory_format == at::MemoryFormat::Contiguous) + gradInputTensorFinal = gradInputTensor; + else { + // Reshape/transpose the input as needed + auto N = input_shape[0]; + auto H = input_shape[1]; + auto W = input_shape[2]; + auto C = input_shape[3]; + + gradInputTensorFinal = [mpsGraph reshapeTensor:gradInputTensor + withShape:@[N, C, ([NSNumber numberWithInt:[H intValue]* [W intValue]])] + name:nil]; + gradInputTensorFinal = [mpsGraph transposeTensor:gradInputTensorFinal + dimension:1 + withDimension:2 + name:nil]; + gradInputTensorFinal = [mpsGraph reshapeTensor:gradInputTensorFinal + withShape:@[N, H, W, C] + name:nil]; + } + + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->inputTensor_ = inputTensorOriginal; + newCachedGraph->weightTensor_ = weightTensor; + newCachedGraph->runningMeanTensor_ = runningMeanTensor; + newCachedGraph->runningVarTensor_ = runningVarTensor; + newCachedGraph->saveMeanTensor_ = saveMeanTensor; + newCachedGraph->saveVarTensor_ = saveVarTensor; + newCachedGraph->gradInputTensor_ = gradInputTensorFinal; + newCachedGraph->gradWeightTensor_ = gradWeightTensor; + newCachedGraph->gradBiasTensor_ = gradBiasTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input, input_shape); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_out, input_shape_readonly); + auto weightPlaceholder = native_mps::Placeholder(); + if(has_weight) + weightPlaceholder = native_mps::Placeholder(cachedGraph->weightTensor_, weight_opt.value(), new_mean_shape); + auto runningMeanPlaceholder = native_mps::Placeholder(); + auto runningVarPlaceholder = native_mps::Placeholder(); + if(has_running_mean) { + runningMeanPlaceholder = native_mps::Placeholder(cachedGraph->runningMeanTensor_, running_mean_opt.value(), new_mean_shape); + runningVarPlaceholder = native_mps::Placeholder(cachedGraph->runningVarTensor_, running_var_opt.value(), new_mean_shape); + } + auto saveMeanPlaceholder = native_mps::Placeholder(); + auto saveVarPlaceholder = native_mps::Placeholder(); + if(has_save_mean) { + saveMeanPlaceholder = native_mps::Placeholder(cachedGraph->saveMeanTensor_, save_mean_opt.value(), new_mean_shape); + saveVarPlaceholder = native_mps::Placeholder(cachedGraph->saveVarTensor_, save_var_opt.value(), new_mean_shape); + } + + auto gradInputPlaceholder = native_mps::Placeholder(); + if(grad_input_mask[0]) + gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input, input_shape); + auto gradWeightPlaceholder = native_mps::Placeholder(); + if(grad_input_mask[1]) + gradWeightPlaceholder = native_mps::Placeholder(cachedGraph->gradWeightTensor_, grad_weight); + auto gradBiasPlaceholder = native_mps::Placeholder();; + if(grad_input_mask[2]) + gradBiasPlaceholder = native_mps::Placeholder(cachedGraph->gradBiasTensor_, grad_bias); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + 
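    // Descriptive note: feeds are bound only for the tensors that actually exist
    // (weight, running stats, saved stats), and results below are bound only for
    // the gradients requested via grad_input_mask, so the graph is asked to
    // produce just the outputs the caller needs.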
feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); + if(has_weight) + feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); + if(has_running_mean) { + feeds[runningMeanPlaceholder.getMPSGraphTensor()] = runningMeanPlaceholder.getMPSGraphTensorData(); + feeds[runningVarPlaceholder.getMPSGraphTensor()] = runningVarPlaceholder.getMPSGraphTensorData(); + } + if(has_save_mean) { + feeds[saveMeanPlaceholder.getMPSGraphTensor()] = saveMeanPlaceholder.getMPSGraphTensorData(); + feeds[saveVarPlaceholder.getMPSGraphTensor()] = saveVarPlaceholder.getMPSGraphTensorData(); + } + + NSMutableDictionary *results = [[NSMutableDictionary new] autorelease]; + if(grad_input_mask[0]) + results[gradInputPlaceholder.getMPSGraphTensor()] = gradInputPlaceholder.getMPSGraphTensorData(); + if(grad_input_mask[1]) + results[gradWeightPlaceholder.getMPSGraphTensor()] = gradWeightPlaceholder.getMPSGraphTensorData(); + if(grad_input_mask[2]) + results[gradBiasPlaceholder.getMPSGraphTensor()] = gradBiasPlaceholder.getMPSGraphTensorData(); + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return std::make_tuple(grad_input, grad_weight, grad_bias); + +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm new file mode 100644 index 000000000000..569cad0fbfb0 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm @@ -0,0 +1,123 @@ +// Copyright © 2022 Apple Inc. + +#include + +namespace at { +namespace native { +// scope the MPS's internal methods to not expose them to at::native +namespace mps { + +Tensor& addc_mul_div_out_mps(const Tensor& self, + const Tensor& tensor1, + const Tensor& tensor2, + const Scalar& value_opt, // default value = 1.0 + Tensor& output, + const bool is_div, + const string op_name) +{ + using scalar_t = double; + scalar_t value_scalar = value_opt.to(); + if (&output != &self) { + output.resize_(output.sizes()); + } + TORCH_CHECK(output.is_mps()); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + MPSGraphTensor *firstTensor = nil, *secondTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + to_string(value_scalar) + + getTensorsStringKey({self, tensor1, tensor2})+ ":" + + getMPSTypeString(value_opt.type()); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph* newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + newCachedGraph->firstTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor1); + newCachedGraph->secondTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor2); + + // the tensor to be optionally multiplied by value_scalar + MPSGraphTensor *multiplicandTensor = nil; + if (is_div) { + multiplicandTensor = [mpsGraph divisionWithPrimaryTensor:newCachedGraph->firstTensor + secondaryTensor:newCachedGraph->secondTensor + name:nil]; + } else { + multiplicandTensor = 
[mpsGraph multiplicationWithPrimaryTensor:newCachedGraph->firstTensor + secondaryTensor:newCachedGraph->secondTensor + name:nil]; + } + // the tensor to be added to input_tensor + MPSGraphTensor *addendTensor = multiplicandTensor; + // if value_scalar is 1.0, then we don't bother adding another multiply to graph + if (value_scalar != 1.0) { + MPSGraphTensor* valueTensor = [mpsGraph constantWithScalar:value_scalar + dataType:getMPSScalarType(value_opt.type())]; + addendTensor = [mpsGraph multiplicationWithPrimaryTensor:multiplicandTensor + secondaryTensor:valueTensor + name:nil]; + } + newCachedGraph->outputTensor = [mpsGraph additionWithPrimaryTensor:newCachedGraph->inputTensor + secondaryTensor:addendTensor + name:nil]; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + // Inputs as placeholders + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self); + Placeholder tensor1Placeholder = Placeholder(cachedGraph->firstTensor, tensor1); + Placeholder tensor2Placeholder = Placeholder(cachedGraph->secondTensor, tensor2); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + // Create dictionary of inputs and outputs + // Utility to dump out graph : [mpsGraph dump]; + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + tensor1Placeholder.getMPSGraphTensor() : tensor1Placeholder.getMPSGraphTensorData(), + tensor2Placeholder.getMPSGraphTensor() : tensor2Placeholder.getMPSGraphTensorData() + }; + + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return output; +} + +} // namespace mps + +// APIs exposed to at::native scope +TORCH_IMPL_FUNC(addcmul_out_mps) +(const Tensor& self, const Tensor& tensor1, const Tensor& tensor2, const Scalar& value, const Tensor& output) +{ + mps::addc_mul_div_out_mps(self, tensor1, tensor2, value, const_cast(output), false, "addcmul_out_mps"); +} + +TORCH_IMPL_FUNC(addcdiv_out_mps) +(const Tensor& self, const Tensor& tensor1, const Tensor& tensor2, const Scalar& value, const Tensor& output) +{ + mps::addc_mul_div_out_mps(self, tensor1, tensor2, value, const_cast(output), true, "addcdiv_out_mps"); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm new file mode 100644 index 000000000000..77a284963d6e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Pooling.mm @@ -0,0 +1,891 @@ +// Copyright © 2022 Apple Inc. 
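// Shape reference (a hedged summary, matching pooling_output_shape as used below):
//
//   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
//
// with ceil() in place of floor() when ceil_mode is set, subject to the usual
// constraint that the last pooling window must start inside the (padded) input.
// Worked example: in = 5, kernel = 3, pad = 0, stride = 2, dilation = 1 gives
// out = floor((5 - 2 - 1) / 2) + 1 = 2, i.e. windows at offsets 0 and 2.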
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +// Create pooling descriptor +void fill_pool_desc(MPSGraphPooling2DOpDescriptor* desc, + NSUInteger kW, NSUInteger kH, + NSUInteger dW, NSUInteger dH, + NSUInteger dilationW, NSUInteger dilationH, + NSUInteger padW, NSUInteger padH, + bool ceil_mode, c10::MemoryFormat memory_format) { + desc.kernelWidth = kW; + desc.kernelHeight = kH; + desc.strideInX = dW; + desc.strideInY = dH; + desc.dilationRateInX = dilationW; + desc.dilationRateInY = dilationH; + desc.paddingLeft = padW; + desc.paddingRight = padW; + desc.paddingTop = padH; + desc.paddingBottom = padH; + desc.ceilMode = ceil_mode; + desc.paddingStyle = MPSGraphPaddingStyleExplicit; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + desc.dataLayout = MPSGraphTensorNamedDataLayoutNCHW; + break; + case at::MemoryFormat::ChannelsLast: + desc.dataLayout = MPSGraphTensorNamedDataLayoutNHWC; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } +} + +Tensor _mps_max_pool2d( + const Tensor& input_t, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + + // #20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } + + /* sizes */ + const int64_t nbatch = input_t.ndimension() == 4 ? 
input_t.size(-4) : 1; + const int64_t nInputPlane = input_t.size(-3); + const int64_t inputHeight = input_t.size(-2); + const int64_t inputWidth = input_t.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + pool2d_shape_check( + input_t, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, memory_format); + + namespace native_mps = at::native::mps; + CheckedFrom c = "mps_max_pool2d"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + Tensor output_t; + + if (input_t.ndimension() == 3) { + output_t = at::native::empty_mps( + {nInputPlane, outputHeight, outputWidth}, + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + } else { + output_t = at::native::empty_mps( + {nbatch, nInputPlane, outputHeight, outputWidth}, + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + } + + if (output_t.numel() == 0) { + return output_t; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "mps_max_pool2d:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* outputTensor = [mpsGraph maxPooling2DWithSourceTensor:inputTensor + descriptor:desc + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output_t; +} + +Tensor 
mps_max_pool2d_backward( + const Tensor& grad_output, + const Tensor& input_t, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + + // #20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } + + namespace native_mps = at::native::mps; + CheckedFrom c = "mps_max_pool2d_backward"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + Tensor grad_input; + grad_input = at::native::empty_mps( + input_t.sizes(), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + memory_format); + + if (grad_input.numel() == 0) { + return grad_input; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "mps_max_pool2d_backward:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t, grad_output}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + MPSGraphTensor* gradInputTensor = [mpsGraph maxPooling2DGradientWithGradientTensor:gradOutputTensor + sourceTensor:inputTensor + descriptor:desc + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output); + auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return grad_input; +} + +TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)( + const Tensor& input_t, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode, + const Tensor& output_t, + const Tensor& indices) { + + // 
#20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } + + /* sizes */ + const int64_t nbatch = input_t.ndimension() == 4 ? 
input_t.size(-4) : 1; + const int64_t nInputPlane = input_t.size(-3); + const int64_t inputHeight = input_t.size(-2); + const int64_t inputWidth = input_t.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode); + + pool2d_shape_check( + input_t, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, + inputHeight, inputWidth, + outputHeight, outputWidth, memory_format); + + namespace native_mps = at::native::mps; + CheckedFrom c = "max_pool2d_with_indices_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* indicesTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + if (output_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "max_pool2d_with_indices_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t}) + ":" + + native_mps::getMPSTypeString(indices.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + desc.returnIndicesMode = MPSGraphPoolingReturnIndicesGlobalFlatten2D; + desc.returnIndicesDataType = MPSDataTypeInt32; + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + NSArray* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:inputTensor + descriptor:desc + name:nil]; + + MPSGraphTensor* indicesTensor = poolOutputs[1]; + if(mps::getMPSDataType(indices.scalar_type()) == MPSDataTypeInt64) { + indicesTensor = [mpsGraph castTensor:indicesTensor + toType:MPSDataTypeInt64 + name:@"castToI64"]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = poolOutputs[0]; + newCachedGraph->indicesTensor_ = indicesTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + 
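    // Descriptive note: the graph above requests flattened Int32 indices from
    // MPSGraph (MPSGraphPoolingReturnIndicesGlobalFlatten2D) and casts them to
    // Int64 when the ATen `indices` output is int64, which is the index dtype
    // max_pool2d_with_indices exposes to callers.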
NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps) +(const Tensor& grad_output, +const Tensor& input_t, +IntArrayRef kernel_size, +IntArrayRef stride, +IntArrayRef padding, +IntArrayRef dilation, +bool ceil_mode, +const Tensor& indices, +const Tensor& grad_input) { + + // #20866, #22032: Guarantee this for the official C++ API? + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "max_pool2d: kernel_size must either be a single int, or a tuple of two ints") + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + // NB: stride default is not expressible as an integer constant, so we accept + // empty stride for this case + TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, + "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "max_pool2d: padding must be either be a single int, or a tuple of two ints"); + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "max_pool2d: dilation must be either a single int, or a tuple of two ints"); + const int dilationH = safe_downcast(dilation[0]); + const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); + + const auto memory_format = input_t.suggest_memory_format(); + if (memory_format == at::MemoryFormat::ChannelsLast) { + TORCH_CHECK(input_t.ndimension() == 4, + "non-empty 4D (batch mode) tensor expected for input with channels_last layout"); + } else if (memory_format == at::MemoryFormat::Contiguous) { + TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4), + "non-empty 3D or 4D (batch mode) tensor expected for input"); + } else { + TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } + + namespace native_mps = at::native::mps; + CheckedFrom c = "max_pool2d_with_indices_backward_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + if (grad_input.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "max_pool2d_with_indices_backward_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(dilationW) + ":" + to_string(dilationH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input_t, grad_output}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t); + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + MPSGraphTensor* gradInputTensor = [mpsGraph maxPooling2DGradientWithGradientTensor:gradOutputTensor + sourceTensor:inputTensor + descriptor:desc + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output); + auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(avg_pool2d_out_mps) ( + const Tensor& input_, + int64_t kH_, + int64_t kW_, + int64_t dH_, + int64_t dW_, + int64_t padH_, + int64_t padW_, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override, + const Tensor& output) { + namespace native_mps = at::native::mps; + + TensorArg output_arg{ output, "output", 1 }; + TensorArg input_arg{ input_, "input_", 2 }; + + 
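// For reference, a sketch of the operator semantics (backend-independent): each
// avg_pool2d output element is the window sum divided by
//   divisor_override.value()            when divisor_override is set,
//   kH * kW                             when count_include_pad is true,
//   the count of in-bounds positions    otherwise.
// divisor_override_value is also folded into the graph cache key below so that
// differently-configured calls do not reuse the same cached graph.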
checkAllSameGPU("avg_pool2d_out_cuda", {output_arg, input_arg}); + + const int kH = safe_downcast(kH_); + const int kW = safe_downcast(kW_); + + const int dH = safe_downcast(dH_); + const int dW = safe_downcast(dW_); + + const int padH = safe_downcast(padH_); + const int padW = safe_downcast(padW_); + + /* sizes */ + const int64_t nbatch = input_.ndimension() == 4 ? input_.size(-4) : 1; + const int64_t nInputPlane = input_.size(-3); + const int64_t inputHeight = input_.size(-2); + const int64_t inputWidth = input_.size(-1); + + int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); + const auto memory_format = input_.suggest_memory_format(); + + Tensor input = input_.contiguous(memory_format); + + const int32_t count = safe_downcast(output.numel()); + + bool use_divisor = divisor_override.has_value(); + const auto divisor_override_value = use_divisor ? divisor_override.value() : 0; + + if (count != 0) { + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + MPSGraphTensor* indicesTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "mps_avg_pool2d:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + ":" + + to_string(divisor_override_value) + + mps::getTensorsStringKey({input}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, 1, 1, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor* outputTensor = [mpsGraph avgPooling2DWithSourceTensor:inputTensor + descriptor:desc + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + } +} + +TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps) ( + const Tensor& gradOutput_, + 
const Tensor& input_, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override, + const Tensor& gradInput +) { + TensorArg gradInput_arg{ gradInput, "gradInput", 1 }; + TensorArg gradOutput_arg{ gradOutput_, "gradOutput_", 2 }; + TensorArg input_arg{ input_, "input_", 3 }; + + checkAllSameGPU("avg_pool2d_backward_out_cuda", + {gradInput_arg, gradOutput_arg, input_arg}); + namespace native_mps = at::native::mps; + + const int kH = safe_downcast(kernel_size[0]); + const int kW = kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); + + const int dH = stride.empty() ? kH : safe_downcast(stride[0]); + const int dW = stride.empty() ? kW : + stride.size() == 1 ? dH : safe_downcast(stride[1]); + + const int padH = safe_downcast(padding[0]); + const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); + + const auto memory_format = input_.suggest_memory_format(); + const Tensor input = input_.contiguous(memory_format); + const Tensor gradOutput = gradOutput_.contiguous(memory_format); + + const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; + const int64_t nInputPlane = input.size(-3); + const int64_t inputHeight = input.size(-2); + const int64_t inputWidth = input.size(-1); + + const int64_t outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode); + const int64_t outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode); + + + const int32_t count = safe_downcast(input.numel()); + if (count == 0) { + return; + } + bool use_divisor = divisor_override.has_value(); + const auto divisor_override_value = use_divisor ? divisor_override.value() : 0; + + namespace native_mps = at::native::mps; + CheckedFrom c = "avg_pool2d_backward_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + if (gradInput.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + string mem_format_key; + switch(memory_format) { + case at::MemoryFormat::Contiguous: + mem_format_key = "Contiguous"; + break; + case at::MemoryFormat::ChannelsLast: + mem_format_key = "ChannelsLast"; + break; + default: + assert(0 && "Check should have been done earlier\n"); + } + + string key = "avg_pool2d_backward_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" + + to_string(dW) + ":" + to_string(dH) + ":" + + to_string(outputWidth) + ":" + to_string(outputHeight) + ":" + + to_string(padW) + ":" + to_string(padH) + ":" + + to_string(ceil_mode) + ":" + mem_format_key + + mps::getTensorsStringKey({input, gradOutput}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease]; + fill_pool_desc(desc, kW, kH, dW, dH, 1, 1, padW, padH, ceil_mode, memory_format); + + MPSGraphTensor* inputTensor = 
native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input); + MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, gradOutput); + MPSGraphTensor *gradInputTensor = [mpsGraph avgPooling2DGradientWithGradientTensor:gradOutputTensor + sourceTensor:inputTensor + descriptor : desc + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input); + auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, gradOutput); + auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, gradInput); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + + NSDictionary *results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/RangeFactories.mm b/aten/src/ATen/native/mps/operations/RangeFactories.mm new file mode 100644 index 000000000000..d7307b9b39e9 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/RangeFactories.mm @@ -0,0 +1,66 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + + +Tensor& arange_mps_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) { + AT_DISPATCH_MPS_TYPES(result.scalar_type(), "arange_mps", [&]() { + using accscalar_t = at::acc_type; + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); + + double size_d; + if (std::is_same::value) { + size_d = std::ceil(static_cast(end.to() - start.to()) + / step.to()); + } else { + size_d = std::ceil(static_cast(end.to() - start.to()) + / step.to()); + } + + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK(std::isfinite(static_cast(xstart)) && + std::isfinite(static_cast(xend)), + "unsupported range: ", xstart, " -> ", xend); + TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + "upper bound and larger bound inconsistent with step sign"); + + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + "invalid size, possible overflow?"); + int64_t size = static_cast(size_d); + int64_t numel = result.numel(); + + if (numel != size) { + if(numel > 0){ + TORCH_WARN("The number of elements in the out tensor of shape ", result.sizes(), + " is ", numel, " which does not match the computed number of elements ", size, + ". Note that this may occur as a result of rounding error. " + "The out tensor will be resized to a tensor of shape (", size, ",)."); + } + result.resize_({size}); + } + bool is_contiguous = result.is_contiguous(); + Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result; + + //TODO: Add arange Metal kernel. 
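// For reference, a sketch of what the missing kernel would have to produce,
// mirroring the size computation above:
//   size = ceil((end - start) / step);   r[i] = xstart + i * xstep,  0 <= i < size
// The copy below only handles the non-contiguous layout case.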
+ + if(!is_contiguous) { + result.copy_(r); + } + }); + + return result; +} +}} // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm new file mode 100644 index 000000000000..7e840c3f4dd9 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -0,0 +1,1587 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +using namespace std; + +enum StdVarType { + STANDARD_VARIANCE, + STANDARD_DEVIATION +}; + +void set_apparent_shapes(NSMutableArray * &apparent_out_shape, + NSMutableArray * &apparent_in_shape, + int64_t num_reduce_dims, + int64_t num_input_dims, + int64_t num_output_dims, + IntArrayRef& input_shape, + NSMutableArray * &axes) { + + if(num_reduce_dims == 0) { + /* Output shape becomes a one + * Input shape becomes flattened + * Because 0 reduce dims means all dims are reduced + */ + apparent_in_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_in_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + apparent_out_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_out_shape[0] = @1; + } + + else { + // num_output_dims in this case is number of input dims + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_output_dims]; + for(int i = 0; i < num_output_dims; i++) { + int64_t current_input_dim = input_shape[i]; + + // If the current dim is to be reduced + bool is_reduce_dim = false; + + for(int j = 0; j < num_reduce_dims; j++) { + if(i == [axes[j] intValue]) { + is_reduce_dim = true; + break; + } + } + + if(is_reduce_dim) { + apparent_out_shape[i] = @1; + } + else { + apparent_out_shape[i] = [NSNumber numberWithInt:current_input_dim]; + } + } + } + +} + +// Helper function to set the axes of reduction +void set_axes(NSMutableArray * &axes, + int64_t num_reduce_dims, + IntArrayRef& dim, + int64_t num_input_dims) { + if(num_reduce_dims == 0) { + axes = [NSMutableArray arrayWithCapacity:1]; + axes[0] = @0; + } + else { + axes = [NSMutableArray arrayWithCapacity:num_reduce_dims]; + for(int i = 0; i < num_reduce_dims; i++) { + axes[i] = [NSNumber numberWithInt:maybe_wrap_dim(dim[i], num_input_dims)]; + } + } +} + +void reduction_out_mps + (const Tensor& input_t, + IntArrayRef dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t, + string reduction_type, + string func_name) { + + IntArrayRef input_shape = input_t.sizes(); + + for(int i = 0; i < dim.size(); i++) { + auto wrap_dim = maybe_wrap_dim(dim[i], input_shape.size()); + TORCH_CHECK(wrap_dim < input_shape.size(), + func_name+": reduction dim must be in the range of input shape") + } + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t num_input_dims = input_shape.size(); + int64_t num_reduce_dims = dim.size(); + int64_t num_output_dims; + + // For output shape calculation, assume that keepdim is true + num_output_dims = num_input_dims; + NSMutableArray *apparent_output_shape = nil; + NSMutableArray *apparent_input_shape = nil; + + // Reduction axes + NSMutableArray 
*axes; + set_axes(axes, num_reduce_dims, dim, input_shape.size()); + + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + // TODO: Make this key proper + NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","]; + string key = func_name+":" + string([ns_key UTF8String]) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":" + native_mps::getMPSTypeString(output_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor* castInputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float && input_t.scalar_type() != ScalarType::Int) + castInputTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeFloat32 + name:@"castInputTensor"]; + else + castInputTensor = inputTensor; + + MPSGraphTensor* castOutputTensor = nil; + + if(reduction_type == "sum") + castOutputTensor = [mpsGraph reductionSumWithTensor:castInputTensor + axes:axes + name:nil]; + else if(reduction_type == "prod") + castOutputTensor = [mpsGraph reductionProductWithTensor:castInputTensor + axes:axes + name:nil]; + else if(reduction_type == "mean") + castOutputTensor = [mpsGraph meanOfTensor:inputTensor + axes:axes + name:nil]; + + MPSGraphTensor* outputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float) + outputTensor = [mpsGraph castTensor:castOutputTensor + toType:(native_mps::getMPSDataType(output_t.scalar_type())) + name:@"outputTensor"]; + else + outputTensor = castOutputTensor; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(); + + if(apparent_input_shape) + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + else + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(sum_out_mps) + (const Tensor& input_t, + IntArrayRef dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t) { + + reduction_out_mps(input_t, dim, keepdim, dtype, output_t, "sum", "sum_out_mps"); +} + +TORCH_IMPL_FUNC(prod_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t) { + + int64_t dims[1] = {dim}; + + reduction_out_mps(input_t, IntArrayRef(dims, 1), keepdim, dtype, output_t, "prod", "prod_out_mps"); +} + +// Taken 
from ReduceOps.cpp +inline ScalarType get_dtype_from_self( + const Tensor& self, + const optional& dtype, + bool promote_integers) { + if (dtype.has_value()) { + return dtype.value(); + } + ScalarType src_type = self.scalar_type(); + if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) { + return kLong; + } + return src_type; +} + +Tensor prod_mps(const Tensor &self, c10::optional opt_dtype) { + + auto num_dims = self.dim(); + + int64_t dims[num_dims]; + + for(int i = 0; i < num_dims; i++) + dims[i] = i; + + Tensor output_t = at::native::empty_mps( + {}, + get_dtype_from_self(self, opt_dtype, true), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + reduction_out_mps(self, IntArrayRef(dims, num_dims), false, opt_dtype, const_cast(output_t), "prod", "prod_mps"); + + return output_t; +} + +TORCH_IMPL_FUNC(mean_out_mps) + (const Tensor& input_t, + IntArrayRef dim, + bool keepdim, + c10::optional dtype, + const Tensor& output_t) { + + reduction_out_mps(input_t, dim, keepdim, dtype, output_t, "mean", "mean_out_mps"); +} + +TORCH_IMPL_FUNC(argmax_out_mps) + (const Tensor& input_t, + c10::optional dim, + bool keepdim, + const Tensor& output_t) { + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t dim_; + + if (dim.has_value()) { + dim_ = maybe_wrap_dim(dim.value(), input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "argmax()"); + } else { + TORCH_CHECK_INDEX( + input_t.numel() != 0, + "argmax()", ": Expected reduction dim to be specified for input.numel() == 0."); + // Since input will be flattened, take argmax along 0'th dimension + dim_ = 0; + } + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_in_shape = nil; + NSMutableArray *apparent_out_shape = nil; + + if(dim.has_value()) { + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + } + else { + apparent_in_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_in_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + apparent_out_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_out_shape[0] = @1; + } + + if (output_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = "argmax_out_mps:" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, 
native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor* castInputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float && + input_t.scalar_type() != ScalarType::Int && + input_t.scalar_type() != ScalarType::Half) + castInputTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeFloat32 + name:@"castInputTensor"]; + else + castInputTensor = inputTensor; + + MPSGraphTensor* argmaxOutTensor = [mpsGraph reductionArgMaximumWithTensor:castInputTensor + axis:(NSInteger)dim_ + name:@"argmax_out"]; + MPSGraphTensor* outputTensor = [mpsGraph castTensor:argmaxOutTensor + toType:MPSDataTypeInt64 + name:@"cast_out"]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + native_mps::Placeholder inputPlaceholder = native_mps::Placeholder(); + if(apparent_in_shape) + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_in_shape); + else + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +TORCH_IMPL_FUNC(norm_out_mps) +(const Tensor& input_t, + const OptionalScalarRef opt_p, + IntArrayRef dim, + bool keepdim, + const Tensor& output_t) +{ + if (input_t.numel() == 0) + return; + IntArrayRef input_shape = input_t.sizes(); + + for(int i = 0; i < dim.size(); i++) { + auto wrap_dim = maybe_wrap_dim(dim[i], input_shape.size()); + TORCH_CHECK(wrap_dim < input_shape.size(), + "norm_out_mps: reduction dim must be in the range of input shape") + } + namespace native_mps = at::native::mps; + CheckedFrom c = "norm_out_mps"; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto p = opt_p.has_value() ? opt_p.get().to() : Scalar(2.0).to(); + auto reciprocal_p = 1 / p; + bool pIsZero = (p == 0.0); + bool pIsPosInf = (p == numeric_limits::infinity()); + bool pIsNegInf = (p == -numeric_limits::infinity()); + + int64_t num_input_dims = input_shape.size(); + int64_t num_reduce_dims = dim.size(); + int64_t num_output_dims; + + // For output shape calculation, assume that keepdim is true + num_output_dims = num_input_dims; + NSMutableArray *apparent_output_shape = nil; + NSMutableArray *apparent_input_shape = nil; + + // Reduction axes + NSMutableArray *axes; + set_axes(axes, num_reduce_dims, dim, input_shape.size()); + + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + if (output_t.numel() == 0) { + return; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","]; + string keepdim_info = (keepdim) ? 
"keepdim=1" : "keepdim=0"; + string key = string("norm_out_mps:") + [ns_key UTF8String] + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":p" + to_string(p) + ":" + keepdim_info; + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor *outputTensor; + + if (pIsZero) + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + MPSGraphTensor *powerValTensor = [mpsGraph constantWithScalar:p + dataType:native_mps::getMPSDataType(input_t.scalar_type())]; + MPSGraphTensor *powerTensor = [mpsGraph powerWithPrimaryTensor:absoluteTensor + secondaryTensor:powerValTensor + name:nil]; + outputTensor = [mpsGraph reductionSumWithTensor:powerTensor + axes:axes + name:nil]; + } + else if (pIsPosInf) + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + outputTensor = [mpsGraph reductionMaximumWithTensor:absoluteTensor + axes:axes + name:nil]; + } + else if (pIsNegInf) + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + outputTensor = [mpsGraph reductionMinimumWithTensor:absoluteTensor + axes:axes + name:nil]; + } + else + { + MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor + name:nil]; + + MPSGraphTensor *powerValTensor = [mpsGraph constantWithScalar:p + dataType:native_mps::getMPSDataType(input_t.scalar_type())]; + + MPSGraphTensor *reciprocalPowerValTensor = [mpsGraph constantWithScalar:reciprocal_p + dataType:native_mps::getMPSDataType(input_t.scalar_type())]; + + MPSGraphTensor *powerTensor = [mpsGraph powerWithPrimaryTensor:absoluteTensor + secondaryTensor:powerValTensor + name:nil]; + + MPSGraphTensor *reductionSumTensor = [mpsGraph reductionSumWithTensor:powerTensor + axes:axes + name:nil]; + + outputTensor = [mpsGraph powerWithPrimaryTensor:reductionSumTensor + secondaryTensor:reciprocalPowerValTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(); + + if(apparent_input_shape) + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + else + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); + + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } +} + +Tensor std_var_common_impl_mps( + const Tensor & input_t, + at::OptionalIntArrayRef dim, + c10::optional correction, + bool keepdim, + StdVarType stdVarType) +{ + namespace native_mps = at::native::mps; + + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); 
+ + bool use_dim = dim.has_value(); + IntArrayRef dim_value = use_dim ? dim.value() : NULL; + + if (use_dim) + { + string errMessage = (stdVarType == STANDARD_DEVIATION) ? "std_mps" : "var_mps"; + errMessage += ": reduction dim must be in the range of input shape"; + for(int i = 0; i < dim_value.size(); i++) { + auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size()); + TORCH_CHECK(wrap_dim < input_shape.size(), errMessage.c_str()) + } + } + + bool use_correction = correction.has_value(); + const auto correction_value = use_correction ? correction.value() : false; + int64_t correction_n = 1; + + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t num_output_dims = 0; + NSMutableArray *axes = nil; + NSMutableArray *apparent_output_shape = nil; + NSMutableArray *apparent_input_shape = nil; + int64_t* output_shape = nil; + + if ((!keepdim && !use_dim) || (!keepdim && use_dim && dim_value.size() <= 0)) + { + // Flatten the input tensor to reduce it to one value + apparent_input_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + // Output is a single value + apparent_output_shape = [NSMutableArray arrayWithCapacity:1]; + apparent_output_shape[0] = @1; + + num_output_dims = 0; + + correction_n = num_in_elements; + + // Reduction axes + axes = [NSMutableArray arrayWithCapacity:1]; + axes[0] = @0; + + } + else if (!keepdim && use_dim && dim_value.size() > 0) + { + int64_t num_reduce_dims = dim_value.size(); + num_output_dims = num_input_dims; + + set_axes(axes, num_reduce_dims, dim_value, num_input_dims); + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + + num_output_dims = (num_input_dims >= num_reduce_dims) ? 
(num_input_dims - num_reduce_dims) : 0; //num_input_dims; + output_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + + unsigned int curr_i = 0; + for (int i = 0; i < num_input_dims; i++) + { + bool found = false; + for (int j = 0; j < num_reduce_dims; j++) + { + if (i == dim_value[j]) + { + found = true; + break; + } + } + if (found) continue; + output_shape[curr_i] = input_shape[i]; + curr_i += 1; + } + + for(int i = 0; i < num_reduce_dims; i++) + { + correction_n *= input_shape[dim_value[i]]; + } + // (3, 4, 5) --> (3, 5) + } + else if ((keepdim && !use_dim) || (keepdim && use_dim && dim_value.size() <= 0)) + { + num_output_dims = 0; + int64_t num_reduce_dims = 0; + set_axes(axes, num_reduce_dims, dim_value, input_shape.size()); + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + num_output_dims = num_input_dims; + output_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + for (int i = 0; i < num_input_dims; i++) + { + output_shape[i] = (int64_t) 1; + correction_n *= input_shape[i]; + } + // scalar --> vector case [[1.0034567]] + } + else if (keepdim && use_dim && dim_value.size() > 0) + { + int64_t num_reduce_dims = dim_value.size(); + num_output_dims = num_input_dims; + + set_axes(axes, num_reduce_dims, dim_value, num_input_dims); + set_apparent_shapes(apparent_output_shape, + apparent_input_shape, + num_reduce_dims, + num_input_dims, + num_output_dims, + input_shape, + axes); + + num_output_dims = num_input_dims;//(num_input_dims >= num_reduce_dims) ? (num_input_dims - num_reduce_dims) : 0; + output_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + + for(int i = 0; i < num_reduce_dims; i++) + { + correction_n *= input_shape[dim_value[i]]; + } + + for (int i = 0; i < num_input_dims; i++) + { + output_shape[i] = [apparent_output_shape[i] longValue]; + } + } + + Tensor output_t = at::native::empty_mps( + IntArrayRef(output_shape, num_output_dims), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + if (output_t.numel() == 0 || input_t.numel() == 0) + { + return output_t; + } + + double bessel_correction = ((double) correction_n) / ((double) (correction_n-1)); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string op_key = (stdVarType == STANDARD_DEVIATION) ? "std_mps" : "var_mps"; + NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","]; + string bessel_corrected = (use_correction && correction_value) ? "unbiased " : "biased "; + string use_dim_info = (use_dim) ? "use_dim=1:" + to_string(dim_value.size()) : "use_dim=0"; + string keepdim_info = (keepdim) ? 
"keepdim=1" : "keepdim=0"; + string key = op_key + use_dim_info + ":" + keepdim_info + ":" + string([ns_key UTF8String]) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":" + bessel_corrected; + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + // Initialize once if configuration not found in cache + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor *inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + MPSGraphTensor *outputVarTensor = [mpsGraph varianceOfTensor:inputTensor + axes:axes + name:nil]; + MPSGraphTensor *outputTensor; + + if (use_correction && correction_value) + { + MPSGraphTensor *besselTensor= [mpsGraph constantWithScalar:bessel_correction + dataType:MPSDataTypeFloat32]; + MPSGraphTensor *correctedTensor = [mpsGraph multiplicationWithPrimaryTensor: outputVarTensor + secondaryTensor: besselTensor + name: nil]; + outputTensor = (stdVarType == STANDARD_DEVIATION) ? + [mpsGraph squareRootWithTensor:correctedTensor name:nil] : correctedTensor; + } + else + { + outputTensor = (stdVarType == STANDARD_DEVIATION) ? + [mpsGraph squareRootWithTensor:outputVarTensor name:nil] : outputVarTensor; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + auto inputPlaceholder = native_mps::Placeholder(); + + if(apparent_input_shape) + { + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + } + else + { + inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + } + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + free(output_shape); + return output_t; +} + +Tensor var_mps( + const Tensor & input_t, + at::OptionalIntArrayRef dim, + c10::optional correction, + bool keepdim) +{ + return std_var_common_impl_mps(input_t, dim, correction, keepdim, STANDARD_VARIANCE); +} + +Tensor std_mps( + const Tensor & input_t, + at::OptionalIntArrayRef dim, + c10::optional correction, + bool keepdim) +{ + return std_var_common_impl_mps(input_t, dim, correction, keepdim, STANDARD_DEVIATION); +} + +TORCH_IMPL_FUNC(any_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "any()"); + + // Calculate the output shape according to 
keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("any_out_mps:") + native_mps::getMPSShapeString(input_t_shape) + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"any_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionOrWithTensor:inputCastedTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"any"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionOrWithTensor:inputTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"any"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); +} +} + +TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("any_all_out_mps:") + native_mps::getMPSShapeString(input_t_shape) +":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = 
static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"any_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionOrWithTensor:inputCastedTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"any_all"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionOrWithTensor:inputTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"any_all"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(all_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "all()"); + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("all_out_mps:") + native_mps::getMPSShapeString(input_t_shape) + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ 
native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"all_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionAndWithTensor:inputCastedTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"all"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionAndWithTensor:inputTensor + axis:dim_ + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"all"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t) +{ + namespace native_mps = at::native::mps; + if (output_t.numel() == 0 || input_t.numel() == 0) { + return; + } + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + MPSShape* input_t_shape = native_mps::getMPSShape(input_t); + string key = string("all_all_out_mps:") + native_mps::getMPSShapeString(input_t_shape) +":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* outputTensor; + MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type()); + MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_type, input_t_shape); + + if (input_type != MPSDataTypeInt32 && + input_type != MPSDataTypeFloat32 && + input_type != MPSDataTypeFloat16 ) + { + MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeInt32 + name:@"all_all"]; + MPSGraphTensor* outputCastedTensor = [mpsGraph reductionAndWithTensor:inputCastedTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph 
castTensor:outputCastedTensor + toType:MPSDataTypeBool + name:@"all_all"]; + } + else + { + MPSGraphTensor* outputUncastedTensor = [mpsGraph reductionAndWithTensor:inputTensor + axes:nil + name:nil]; + outputTensor = [mpsGraph castTensor:outputUncastedTensor + toType:MPSDataTypeBool + name:@"all_all"]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t); + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +//----------------------------------------------------------------------- +// Min and max functions + +Tensor min_max_mps + (const Tensor& input_t, + string reduction_type, + string func_name) { + + namespace native_mps = at::native::mps; + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + + // Flatten the input tensor to reduce it to one value + NSMutableArray *apparent_input_shape = [NSMutableArray arrayWithCapacity:1]; + int64_t num_in_elements = 1; + for(int i = 0; i < num_input_dims; i++) { + num_in_elements *= input_shape[i]; + } + apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements]; + + Tensor output_t = at::native::empty_mps({}, input_t.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + + if (output_t.numel() == 0 || num_in_elements == 0) { + return output_t; + } + + @autoreleasepool { + string key = func_name + mps::getTensorsStringKey(input_t); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + // Initialize once if configuration not found in cache + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + + MPSGraphTensor* outputTensor = nil; + + if(reduction_type == "max") + outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor + axes:@[@0] + name:nil]; + else if(reduction_type == "min") + outputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor + axes:@[@0] + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, @[@1]); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : 
inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + + return output_t; +} + +// Max entire tensor into scalar result +Tensor max_mps(const Tensor& input_t) { + + return min_max_mps(input_t, "max", "max_mps"); +} + +// Min entire tensor into scalar result +Tensor min_mps(const Tensor& input_t) { + + return min_max_mps(input_t, "min", "min_mps"); +} + +void min_max_out_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t, + const Tensor& indices_t, + string reduction_type, + string func_name) { + + namespace native_mps = at::native::mps; + + if (output_t.numel() == 0) { + return; + } + if (input_t.numel() == 1 && input_t.dim() == 0) { + output_t.fill_(input_t); + indices_t.fill_(0); + return; + } + + + // Derive from MPSCachedGraph + struct CachedGraph : public native_mps::MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + MPSGraphTensor *indicesTensor_ = nil; + }; + + native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance(); + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) + apparent_out_shape[i] = @1; + else + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + } + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = native_mps::make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type())); + MPSGraphTensor* outputTensor = nil; + if(reduction_type == "max") + outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor + axis:(NSInteger)dim_ + name:nil]; + else if(reduction_type == "min") + outputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor + axis:(NSInteger)dim_ + name:nil]; + + MPSGraphTensor* castInputTensor = nil; + + if(input_t.scalar_type() != ScalarType::Float && + input_t.scalar_type() != ScalarType::Int && + input_t.scalar_type() != ScalarType::Half) + castInputTensor = [mpsGraph castTensor:inputTensor + toType:MPSDataTypeFloat32 + name:@"castInputTensor"]; + else + castInputTensor = inputTensor; + + MPSGraphTensor* argreduceOutTensor = nil; + if(reduction_type == "max") + argreduceOutTensor = [mpsGraph reductionArgMaximumWithTensor:castInputTensor + axis:(NSInteger)dim_ + name:@"argmax_out"]; + else if(reduction_type == "min") + argreduceOutTensor = [mpsGraph reductionArgMinimumWithTensor:castInputTensor + axis:(NSInteger)dim_ + name:@"argmax_out"]; + + 
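// For reference, a hypothetical end-to-end use of this reduction path from the
// C++ API (the snippet's tensor shapes and names are illustrative only):
//   at::Tensor x = at::rand({3, 4}, at::device(at::kMPS));
//   auto [values, indices] = at::max(x, /*dim=*/1, /*keepdim=*/false);
//   // values keeps x's dtype; indices is kLong, hence the Int64 cast right below.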
MPSGraphTensor *indicesTensor = [mpsGraph castTensor:argreduceOutTensor + toType:MPSDataTypeInt64 + name:@"cast_out"]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + newCachedGraph->indicesTensor_ = indicesTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t); + auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); + auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() + }; + + native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + +} + +// Max out with dim +TORCH_IMPL_FUNC(max_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t, + const Tensor& indices_t) { + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "max()"); + + min_max_out_mps(input_t, dim, keepdim, output_t, indices_t, "max", "max_out_mps"); +} + +// Min out with dim +TORCH_IMPL_FUNC(min_out_mps) + (const Tensor& input_t, + int64_t dim, + bool keepdim, + const Tensor& output_t, + const Tensor& indices_t) { + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "min()"); + + min_max_out_mps(input_t, dim, keepdim, output_t, indices_t, "min", "min_out_mps"); +} + +// Min/Max with dim +std::tuple min_max_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim, + string reduction_type, + string func_name) { + + namespace native_mps = at::native::mps; + + int64_t dim_ = maybe_wrap_dim(dim, input_t.dim()); + native::zero_numel_check_dims(input_t, dim_, "max()"); + + // Calculate the output shape according to keepdim=True + // If there is no dim argument, the input shape is flattened + IntArrayRef input_shape = input_t.sizes(); + int64_t num_input_dims = input_shape.size(); + NSMutableArray *apparent_out_shape = nil; + // Use this if keepdim is false + int64_t num_output_dims = num_input_dims - 1; + + int64_t* malloc_apparent_out_shape = (int64_t *)malloc(num_input_dims * sizeof(int64_t)); + int64_t* malloc_out_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t)); + + apparent_out_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + // Counter for shape when keepdim is false + int out_i = 0; + for(int i = 0; i < num_input_dims; i++) { + if(dim_ == i) { + apparent_out_shape[i] = @1; + malloc_apparent_out_shape[i] = 1; + } + else { + apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]]; + malloc_apparent_out_shape[i] = input_shape[i]; + malloc_out_shape[out_i] = input_shape[i]; + out_i++; + } + } + + Tensor output_t; + Tensor indices_t; + if(!keepdim) { + output_t = at::native::empty_mps( + IntArrayRef(malloc_out_shape, num_output_dims), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + indices_t = at::native::empty_mps( + IntArrayRef(malloc_out_shape, num_output_dims), + ScalarType::Long, + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + } + else { + output_t = at::native::empty_mps( + 
IntArrayRef(malloc_apparent_out_shape, num_input_dims), + input_t.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + indices_t = at::native::empty_mps( + IntArrayRef(malloc_apparent_out_shape, num_input_dims), + ScalarType::Long, + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + } + + if (output_t.numel() == 0 || input_t.numel() == 0) { + free(malloc_out_shape); + free(malloc_apparent_out_shape); + return std::tuple{output_t, indices_t}; + } + + min_max_out_mps(input_t, dim, keepdim, output_t, indices_t, reduction_type, func_name); + + free(malloc_out_shape); + free(malloc_apparent_out_shape); + return std::tuple{output_t, indices_t}; +} + +// Max with dim +std::tuple max_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim) { + + return min_max_mps(input_t, dim, keepdim, "max", "max_mps"); +} + +// Min with dim +std::tuple min_mps + (const Tensor& input_t, + int64_t dim, + bool keepdim) { + + return min_max_mps(input_t, dim, keepdim, "min", "min_mps"); +} + +} + +} diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm new file mode 100644 index 000000000000..a7708f1a327c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -0,0 +1,173 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +Tensor permute_mps(const Tensor& self, IntArrayRef dims) { + auto nDims = self.dim(); + TORCH_CHECK(dims.size() == (size_t)nDims, + "number of dims don't match in permute"); + auto oldSizes = self.sizes(); + auto oldStrides = self.strides(); + DimVector newSizes(nDims); + DimVector newStrides(nDims); + std::vector seen(nDims); + for (const auto i : c10::irange(nDims)) { + auto dim = maybe_wrap_dim(dims[i], nDims); + TORCH_CHECK(!seen[dim], + "repeated dim in permute"); + seen[dim] = true; + newSizes[i] = oldSizes[dim]; + newStrides[i] = oldStrides[dim]; + } + return self.as_strided(newSizes, newStrides); +} + +void set_apparent_shapes(NSMutableArray * input_shape, + NSMutableArray * &apparent_input_shape, + int64_t num_input_dims, + IntArrayRef repeats, + NSMutableArray * &repeats_shape, + int64_t num_repeat_dims) { + + + // Set repeats_shape + + repeats_shape = [NSMutableArray arrayWithCapacity:num_repeat_dims]; + + for(int i = 0; i < num_repeat_dims; i++) + repeats_shape[i] = [NSNumber numberWithInt:repeats[i]]; + + // If no extension of the shape is needed + if(num_repeat_dims == num_input_dims) { + apparent_input_shape = input_shape; + } + // num_repeat_dims > num_input_dims + else { + apparent_input_shape = [NSMutableArray arrayWithCapacity:num_repeat_dims]; + + for(int i = 0; i < num_repeat_dims - num_input_dims; i++) + apparent_input_shape[i] = @1; + + for(int i = num_repeat_dims - num_input_dims; i < num_repeat_dims; i++) + apparent_input_shape[i] = input_shape[i + num_input_dims - num_repeat_dims]; + } + +} + +Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { + + using namespace mps; + + TORCH_CHECK(repeats.size() >= (size_t)self.dim(), + "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor"); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + NSMutableArray *apparent_input_shape = nil; + NSMutableArray 
*repeats_shape = nil; + + auto input_shape = getMPSShape(self); + auto num_input_dims = [input_shape count]; + auto num_repeat_dims = repeats.size(); + + set_apparent_shapes(input_shape, + apparent_input_shape, + num_input_dims, + repeats, + repeats_shape, + num_repeat_dims); + + // Set output shape + int64_t output_shape[num_repeat_dims]; + bool zero_tensor = false; + for(int i = 0; i < num_repeat_dims; i++) { + output_shape[i] = repeats[i] * [apparent_input_shape[i] intValue]; + if(output_shape[i] == 0) + zero_tensor = true; + } + + Tensor output = at::native::empty_mps( + IntArrayRef(output_shape, num_repeat_dims), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + c10::nullopt); + + // Empty output + if(zero_tensor) + return output; + + auto stream = at::mps::getCurrentMPSStream(); + + @autoreleasepool { + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + NSString* ns_repeats_key = [[repeats_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "repeat_mps:" + getMPSTypeString(self.scalar_type()) + + ":" + string([ns_shape_key UTF8String]) + + ":" + string([ns_repeats_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), apparent_input_shape); + MPSGraphTensor* outputTensor = [mpsGraph tileTensor:inputTensor + withMultiplier:repeats_shape + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparent_input_shape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; + +} + +} +} diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm new file mode 100644 index 000000000000..a219d3f8172c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/RnnOps.mm @@ -0,0 +1,510 @@ +// Copyright © 2022 Apple Inc. 
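Before the RNN kernels, one note on the repeat_mps kernel in Repeat.mm above: its output shape is the element-wise product of the repeat counts with the input shape left-padded by 1s to the same rank. That shape arithmetic can be restated as a standalone sketch (illustrative only; `repeat_output_shape` is a hypothetical helper, not part of this diff):

#include <cstdint>
#include <vector>

std::vector<int64_t> repeat_output_shape(std::vector<int64_t> sizes,
                                         const std::vector<int64_t>& repeats) {
  // Precondition (checked by repeat_mps): repeats has at least as many
  // dimensions as the input tensor.
  sizes.insert(sizes.begin(), repeats.size() - sizes.size(), 1);
  std::vector<int64_t> out(repeats.size());
  for (size_t i = 0; i < repeats.size(); ++i) {
    out[i] = sizes[i] * repeats[i];   // a zero repeat produces an empty output
  }
  return out;
}
// e.g. sizes = {2, 3}, repeats = {4, 2, 3}  ->  output shape {4, 4, 9}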
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#import +#include + +namespace at { +namespace native { + +std::vector getTensorShape(MPSGraphTensor* mpsTensor) { + std::vector output_dimensions = {}; + auto dims = mpsTensor.shape; + for (int i = 0; i<[dims count];i++){ + output_dimensions.push_back([dims[i] intValue]); + } + return output_dimensions; +} + +std::tuple _lstm_mps(const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + using namespace mps; + std::vector kernel_weights; + std::vector recurrent_kernel_weights; + std::vector biases; + std::vector recurrent_biases; + for (size_t i = 0; i < num_layers; i+=1) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + std::vector inputTensors_; + std::vector outputTensors_; + NSMutableArray *kernelWeightsList_ = nil; + NSMutableArray *recurrentKernelWeightsList_ = nil; + NSMutableArray *biasList_ = nil; + NSMutableArray *recurrentBiasList_ = nil; + std::vector outputCellStateFwdVector_; + std::vector outputZStateVector_; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + MPSStream* stream = getCurrentMPSStream(); + int timesteps = (batch_first ? input.size(1) : input.size(0)); + + @autoreleasepool { + string key = "lstm_" + getTensorsStringKey({input, hx[0], hx[1]}) + getMPSTypeString(input.scalar_type()) + "_num_layers_" + std::to_string(num_layers); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + NSMutableArray *kernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + + for (size_t i = 0; i < num_layers; i += 1) { + [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; + [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } + + MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; + opDesc.training = true; + opDesc.bidirectional = bidirectional; + opDesc.produceCell = true; + + MPSShape* inputShape = getMPSShape(input); + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input)); + MPSGraphTensor* stateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[0])); + MPSGraphTensor* cellStateTensor = 
mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[1])); + std::vector inputTensors = {inputTensor, stateTensor, cellStateTensor,}; + + if(batch_first) { + inputTensor = [mpsGraph transposeTensor:inputTensor + dimension:0 + withDimension:1 + name:nil]; + } + + MPSGraphTensor* inputTensor_ = inputTensor; + MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor + dimension:0 + start:0 + length:1 + name:nil]; + MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor + dimension:0 + start:0 + length:1 + name:nil]; + NSArray* outputs = nil; + NSMutableArray* outputStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* outputCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + for(int i = 0; i < num_layers; i++) { + MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_ + recurrentWeight:recurrentKernelWeightsList[i] + inputWeight:kernelWeightsList[i] + bias:biasTensor + initState:stateTensor_ + initCell:cellStateTensor_ + descriptor:opDesc + name:nil]; + + + stateTensor_ = [mpsGraph sliceTensor:stateTensor + dimension:0 + start:i + length:1 + name:nil]; + cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor + dimension:0 + start:i + length:1 + name:nil]; + inputTensor_ = [outputs objectAtIndex:0]; + if(dropout_p>0.0 && train && (i!=num_layers-1)) { + inputTensor_ = [mpsGraph dropoutTensor:inputTensor_ + rate:dropout_p + name:nil]; + + } + + [outputStateArray addObject:[mpsGraph sliceTensor:[outputs objectAtIndex:0] dimension:0 start:-1 length:1 name:nil]]; + [outputCellStateArray addObject:[mpsGraph sliceTensor:[outputs objectAtIndex:1] dimension:0 start:-1 length:1 name:nil]]; + [outputCellStateFwdArray addObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:1] + axis:0 + name:nil]]; + [outputZStateArray addObject: [mpsGraph expandDimsOfTensor:[outputs objectAtIndex:2] + axis:0 + name:nil]]; + } + + MPSGraphTensor* outputStates = [mpsGraph concatTensors:outputStateArray + dimension:0 + name:nil]; + MPSGraphTensor* outputCellStates = [mpsGraph concatTensors:outputCellStateArray + dimension:0 + name:nil]; + MPSGraphTensor* outputZStates = [mpsGraph concatTensors:outputZStateArray + dimension:0 + name:nil]; + MPSGraphTensor* outputCellStatesFwd = [mpsGraph concatTensors:outputCellStateFwdArray + dimension:0 + name:nil]; + + std::vector outputTensors = {[outputs objectAtIndex:0], outputStates, outputCellStates, outputZStates, outputCellStatesFwd}; + newCachedGraph->inputTensors_ = inputTensors; + newCachedGraph->outputTensors_ = outputTensors; + newCachedGraph->kernelWeightsList_ = kernelWeightsList; + newCachedGraph->recurrentKernelWeightsList_ = recurrentKernelWeightsList; + newCachedGraph->biasList_ = kernelBiasList; + newCachedGraph->recurrentBiasList_ = recurrentBiasList; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + NSMutableArray *kernelWeightsList = cachedGraph->kernelWeightsList_; + NSMutableArray *recurrentKernelWeightsList = cachedGraph->recurrentKernelWeightsList_; + NSMutableArray *biasList = cachedGraph->biasList_; + NSMutableArray *recurrentBiasList = cachedGraph->recurrentBiasList_; + + Placeholder 
kernelWeight; + Placeholder recurrentKernelWeight; + Placeholder bias; + Placeholder recurrentBias; + NSMutableDictionary *feeds = [[NSMutableDictionary alloc] init]; + for (size_t i = 0; i < num_layers; i+=1) { + kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); + recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; + [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + + } + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensors_[0], input); + Placeholder selfState = Placeholder(cachedGraph->inputTensors_[1], hx[0]); + Placeholder selfCellState = Placeholder(cachedGraph->inputTensors_[2], hx[1]); + [feeds setObject:selfPlaceholder.getMPSGraphTensorData() forKey:selfPlaceholder.getMPSGraphTensor()]; + [feeds setObject:selfState.getMPSGraphTensorData() forKey:selfState.getMPSGraphTensor()]; + [feeds setObject:selfCellState.getMPSGraphTensorData() forKey:selfCellState.getMPSGraphTensor()]; + + + auto dims = getTensorShape(cachedGraph->outputTensors_[0]); + Tensor output = at::empty(IntArrayRef(dims), input.options()); + Tensor hy = at::empty_like(hx[0], input.options()); + Tensor cy = at::empty_like(hx[1], input.options()); + Tensor zState = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[3])), input.options()); + Tensor cellStateFwd = at::empty(IntArrayRef(getTensorShape(cachedGraph->outputTensors_[4])), input.options()); + + Placeholder outputPlaceholder0 = Placeholder(cachedGraph->outputTensors_[0], output); + Placeholder outputPlaceholder1 = Placeholder(cachedGraph->outputTensors_[1], hy); + Placeholder outputPlaceholder2 = Placeholder(cachedGraph->outputTensors_[2], cy); + Placeholder outputPlaceholder3 = Placeholder(cachedGraph->outputTensors_[3], zState); + Placeholder outputPlaceholder4 = Placeholder(cachedGraph->outputTensors_[4], cellStateFwd); + + NSDictionary* results = @{ + outputPlaceholder0.getMPSGraphTensor() : outputPlaceholder0.getMPSGraphTensorData(), + outputPlaceholder1.getMPSGraphTensor() : outputPlaceholder1.getMPSGraphTensorData(), + outputPlaceholder2.getMPSGraphTensor() : outputPlaceholder2.getMPSGraphTensorData(), + outputPlaceholder3.getMPSGraphTensor() : outputPlaceholder3.getMPSGraphTensorData(), + outputPlaceholder4.getMPSGraphTensor() : outputPlaceholder4.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + return std::make_tuple(output, hy, cy, zState, cellStateFwd); + } +} + +std::tuple, std::vector> lstm_mps_backward(const Tensor& grad_y, const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& z_state, const Tensor& cell_state_fwd, const Tensor& input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { + using namespace mps; + const Tensor& grad_hy_r = c10::value_or_else(grad_hy_opt, [] {return Tensor();}); + const Tensor& grad_cy_r = c10::value_or_else(grad_cy_opt, [] {return Tensor();}); + auto 
grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx[0], input.options()); + auto grad_cy = grad_cy_r.defined() ? grad_cy_r : at::zeros_like(hx[1], input.options()); + + std::vector kernel_weights; + std::vector recurrent_kernel_weights; + std::vector biases; + std::vector recurrent_biases; + for (size_t i = 0; i < num_layers; i+=1) { + kernel_weights.push_back(params[i*4]); + recurrent_kernel_weights.push_back(params[i*4+1]); + biases.push_back(params[i*4+2]); + recurrent_biases.push_back(params[i*4+3]); + } + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + std::vector inputTensors_; + std::vector outputTensors_; + NSMutableArray *kernelWeightsList_ = nil; + NSMutableArray *recurrentKernelWeightsList_ = nil; + NSMutableArray *biasList_ = nil; + NSMutableArray *recurrentBiasList_ = nil; + NSMutableArray *gradOutput_ = nil; + NSMutableArray *gradRecWeights_ = nil; + NSMutableArray *gradWeights_ = nil; + NSMutableArray *gradBias_ = nil; + NSMutableArray *gradState_ = nil; + NSMutableArray *gradCellState_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + // Get stream + MPSStream* stream = getCurrentMPSStream(); + @autoreleasepool { + + string key = "lstm_backward_" + getTensorsStringKey({input, z_state, cell_state_fwd, grad_y, grad_cy, grad_hy})+ getMPSTypeString(input.scalar_type()) + "_num_layers_" + std::to_string(num_layers); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + NSMutableArray *kernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentKernelWeightsList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *kernelBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + NSMutableArray *recurrentBiasList = [[NSMutableArray alloc] initWithCapacity:params.size()]; + + for (size_t i = 0; i < num_layers; i += 1) { + [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))]; + [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))]; + [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))]; + [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))]; + } + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input)); + MPSGraphTensor* stateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[0])); + MPSGraphTensor* cellStateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(hx[1])); + MPSGraphTensor* zStateTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(z_state)); + MPSGraphTensor* gradientTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_y.scalar_type()), getMPSShape(grad_y)); + MPSGraphTensor* gradientCyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_cy.scalar_type()), getMPSShape(grad_cy)); + 
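Both the forward and backward LSTM builders above unpack `params` with the same stride-4 layout per layer (input weight, recurrent weight, input bias, recurrent bias). A minimal sketch of that indexing, assuming biases are present (`layer_param_offsets` is a hypothetical name, not part of this diff):

#include <array>
#include <cstdint>

// Flat indices of {W_ih, W_hh, b_ih, b_hh} within `params` for a given layer.
std::array<int64_t, 4> layer_param_offsets(int64_t layer) {
  return { layer * 4 + 0,    // kernel (input-to-hidden) weight
           layer * 4 + 1,    // recurrent (hidden-to-hidden) weight
           layer * 4 + 2,    // kernel bias
           layer * 4 + 3 };  // recurrent bias
}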
MPSGraphTensor* gradientHyTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_hy.scalar_type()), getMPSShape(grad_hy)); + MPSGraphTensor* cellStateFwdTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(cell_state_fwd.scalar_type()), getMPSShape(cell_state_fwd)); + + std::vector inputs = {inputTensor, stateTensor, cellStateTensor, gradientTensor, zStateTensor, cellStateFwdTensor, gradientHyTensor, gradientCyTensor}; + newCachedGraph->recurrentKernelWeightsList_ = recurrentKernelWeightsList; + newCachedGraph->kernelWeightsList_ = kernelWeightsList; + newCachedGraph->biasList_ = kernelBiasList; + newCachedGraph->recurrentBiasList_ = recurrentBiasList; + newCachedGraph->inputTensors_ = inputs; + + MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor]; + opDesc.training = true; //train; + opDesc.bidirectional = bidirectional; + opDesc.produceCell = true; + + MPSGraphTensor* gradientTensor_ = gradientTensor; + + NSArray* outputs = nil; + + NSMutableArray* gradOutputArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradRecWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradWeightsArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradBiasArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradRecBiasArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + NSMutableArray* gradCellStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers]; + + for (int i = num_layers - 1; i >= 0; i--) { + MPSGraphTensor* zState = [mpsGraph sliceTensor:zStateTensor + dimension:0 + start:i + length:1 + name:nil]; + zState = [mpsGraph squeezeTensor:zState + axis:0 + name:nil]; + MPSGraphTensor* cellStateFwd = [mpsGraph sliceTensor:cellStateFwdTensor + dimension:0 + start:i + length:1 + name:nil]; + cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd + axis:0 + name:nil]; + MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i] + secondaryTensor:recurrentBiasList[i] + name:nil]; + + MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor + dimension:0 + start:i + length:1 + name:nil]; + MPSGraphTensor* cellStateTensor_ = [mpsGraph sliceTensor:cellStateTensor + dimension:0 + start:i + length:1 + name:nil]; + MPSGraphTensor* gradientHyTensor_ = [mpsGraph sliceTensor:gradientHyTensor + dimension:0 + start:i + length:1 + name:nil]; + + MPSGraphTensor* gradientCyTensor_ = [mpsGraph sliceTensor:gradientCyTensor + dimension:0 + start:i + length:1 + name:nil]; + + outputs = [mpsGraph LSTMGradientsWithSourceTensor: inputTensor + recurrentWeight: recurrentKernelWeightsList[i] + sourceGradient: gradientTensor_ + zState: zState + cellOutputFwd: cellStateFwd + stateGradient: gradientHyTensor_ + cellGradient: gradientCyTensor_ + inputWeight: kernelWeightsList[i] + bias: biasTensor + initState: stateTensor_ + initCell: cellStateTensor_ + mask: nil + peephole: nil + descriptor: opDesc + name: nil]; + + + gradientTensor_ = [outputs objectAtIndex:0]; + [gradOutputArray addObject:[outputs objectAtIndex:0]]; + [gradRecWeightsArray addObject:[outputs objectAtIndex:1]]; + [gradWeightsArray addObject:[outputs objectAtIndex:2]]; + [gradBiasArray addObject:[outputs objectAtIndex:3]]; + [gradStateArray addObject:[outputs objectAtIndex:4]]; + [gradCellStateArray addObject:[outputs objectAtIndex:5]]; + } + std::vector outputTensors = {[outputs 
objectAtIndex:0],[outputs objectAtIndex:1],[outputs objectAtIndex:2],[outputs objectAtIndex:3], [outputs objectAtIndex:4], [outputs objectAtIndex:5]}; + newCachedGraph->outputTensors_ = outputTensors; + newCachedGraph->gradOutput_ = gradOutputArray; + newCachedGraph->gradRecWeights_ = gradRecWeightsArray; + newCachedGraph->gradWeights_ = gradWeightsArray; + newCachedGraph->gradBias_ = gradBiasArray; + newCachedGraph->gradState_ = gradStateArray; + newCachedGraph->gradCellState_ = gradCellStateArray; + + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensors_[0], input); + Placeholder statePlaceholder = Placeholder(cachedGraph->inputTensors_[1], hx[0]); + Placeholder cellStatePlaceholder = Placeholder(cachedGraph->inputTensors_[2], hx[1]); + Placeholder gradientPlaceholder = Placeholder(cachedGraph->inputTensors_[3], grad_y); + Placeholder zStatePlaceholder = Placeholder(cachedGraph->inputTensors_[4], z_state); + Placeholder cellStateFwdPlaceholder = Placeholder(cachedGraph->inputTensors_[5], cell_state_fwd); + Placeholder gradientHyPlaceholder = Placeholder(cachedGraph->inputTensors_[6], grad_hy); + Placeholder gradientCyPlaceholder = Placeholder(cachedGraph->inputTensors_[7], grad_cy); + + NSMutableDictionary *feeds = [[NSMutableDictionary alloc] init]; + [feeds setObject:gradientPlaceholder.getMPSGraphTensorData() forKey:gradientPlaceholder.getMPSGraphTensor()]; + [feeds setObject:gradientHyPlaceholder.getMPSGraphTensorData() forKey:gradientHyPlaceholder.getMPSGraphTensor()]; + [feeds setObject:gradientCyPlaceholder.getMPSGraphTensorData() forKey:gradientCyPlaceholder.getMPSGraphTensor()]; + [feeds setObject:inputPlaceholder.getMPSGraphTensorData() forKey:inputPlaceholder.getMPSGraphTensor()]; + [feeds setObject:statePlaceholder.getMPSGraphTensorData() forKey: statePlaceholder.getMPSGraphTensor()]; + [feeds setObject:cellStatePlaceholder.getMPSGraphTensorData() forKey:cellStatePlaceholder.getMPSGraphTensor()]; + [feeds setObject:zStatePlaceholder.getMPSGraphTensorData() forKey:zStatePlaceholder.getMPSGraphTensor()]; + [feeds setObject:cellStateFwdPlaceholder.getMPSGraphTensorData() forKey:cellStateFwdPlaceholder.getMPSGraphTensor()]; + + NSMutableArray *kernelWeightsList = cachedGraph->kernelWeightsList_; + NSMutableArray *recurrentKernelWeightsList = cachedGraph->recurrentKernelWeightsList_; + NSMutableArray *biasList = cachedGraph->biasList_; + NSMutableArray *recurrentBiasList = cachedGraph->recurrentBiasList_; + Placeholder kernelWeight; + Placeholder recurrentKernelWeight; + Placeholder bias; + Placeholder recurrentBias; + for (size_t i = 0; i < num_layers; i+=1) { + kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]); + recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]); + bias = Placeholder([biasList objectAtIndex:i], biases[i]); + recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]); + [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()]; + [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()]; + [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()]; + [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()]; + } + + Tensor output = at::empty_like(input); + Tensor grad_rec_weights = 
at::empty_like(recurrent_kernel_weights[0]); + Tensor grad_weights = at::empty_like(kernel_weights[0]); + Tensor grad_bias = at::empty_like(biases[0]); + Tensor grad_state = at::empty_like(hx[0]); + Tensor grad_cell_state = at::empty_like(hx[1]); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensors_[0], output); + Placeholder gradRecWeightsPlaceholder = Placeholder(cachedGraph->outputTensors_[1], grad_rec_weights); + Placeholder gradWeightsPlaceholder = Placeholder(cachedGraph->outputTensors_[2], grad_weights); + Placeholder gradBiasPlaceholder = Placeholder(cachedGraph->outputTensors_[3], grad_bias); + Placeholder gradStatePlaceholder = Placeholder(cachedGraph->outputTensors_[4], grad_state); + Placeholder gradCellStatePlaceholder = Placeholder(cachedGraph->outputTensors_[5], grad_cell_state); + + std::vector grad_hx = {grad_state, grad_cell_state}; + + NSMutableDictionary *results = [[NSMutableDictionary alloc] init]; + NSMutableArray *gradOutputArray = cachedGraph->gradOutput_; + NSMutableArray *gradRecWeightsArray = cachedGraph->gradRecWeights_; + NSMutableArray *gradWeightsArray = cachedGraph->gradWeights_; + NSMutableArray *gradBiasArray = cachedGraph->gradBias_; + NSMutableArray *gradStateArray = cachedGraph->gradState_; + NSMutableArray *gradCellStateArray = cachedGraph->gradCellState_; + Placeholder gradOutPlaceholder; + + std::vector weights; + for (int i = 0; i < num_layers; i++) { + Tensor output = at::empty_like(input); + Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]); + Tensor grad_weights = at::empty_like(kernel_weights[i]); + Tensor grad_bias = at::empty_like(biases[i]); + Tensor grad_state = at::empty_like(hx[0]); + Tensor grad_cell_state = at::empty_like(hx[1]); + weights.push_back(grad_weights); + weights.push_back(grad_rec_weights); + weights.push_back(grad_bias); + weights.push_back(grad_bias); + gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output); + gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights); + gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights); + gradBiasPlaceholder = Placeholder([gradBiasArray objectAtIndex:i], grad_bias); + gradStatePlaceholder = Placeholder([gradStateArray objectAtIndex:i], grad_state); + gradCellStatePlaceholder = Placeholder([gradCellStateArray objectAtIndex:i], grad_cell_state); + + [results setObject:gradOutPlaceholder.getMPSGraphTensorData() forKey:gradOutPlaceholder.getMPSGraphTensor()]; + [results setObject:gradRecWeightsPlaceholder.getMPSGraphTensorData() forKey:gradRecWeightsPlaceholder.getMPSGraphTensor()]; + [results setObject:gradBiasPlaceholder.getMPSGraphTensorData() forKey:gradBiasPlaceholder.getMPSGraphTensor()]; + [results setObject:gradStatePlaceholder.getMPSGraphTensorData() forKey:gradStatePlaceholder.getMPSGraphTensor()]; + [results setObject:gradCellStatePlaceholder.getMPSGraphTensorData() forKey:gradCellStatePlaceholder.getMPSGraphTensor()]; + [results setObject:gradWeightsPlaceholder.getMPSGraphTensorData() forKey:gradWeightsPlaceholder.getMPSGraphTensor()]; + } + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + return std::tuple, std::vector> (output, grad_hx, weights); + + } +} +}}//at::native diff --git a/aten/src/ATen/native/mps/operations/Scalar.mm b/aten/src/ATen/native/mps/operations/Scalar.mm new file mode 100644 index 000000000000..2a5d7fd700c4 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Scalar.mm @@ -0,0 +1,39 @@ +// Copyright © 
2022 Apple Inc. + +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +using namespace at::mps; + +namespace at { +namespace native { + +Scalar _local_scalar_dense_mps(const Tensor& self) { + Scalar r; + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_local_scalar_dense_mps", [&] { + Tensor output = at::empty_like(self, kCPU); + + Tensor cpu_output = mps::mps_copy_(output, self, false); + scalar_t value = *cpu_output.data_ptr(); + r = Scalar(value); + }); + + return r; +} + + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm new file mode 100644 index 000000000000..a8d73d5fc42a --- /dev/null +++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm @@ -0,0 +1,500 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +TORCH_IMPL_FUNC(gather_out_mps) +(const Tensor & self, + int64_t dim, + const Tensor & index, + bool sparse_grad, + const Tensor & output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + dim = at::maybe_wrap_dim(dim, self.dim()); + + TORCH_CHECK(!sparse_grad, "sparse_grad not supported in MPS yet") + + TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index"); + TORCH_CHECK(self.scalar_type() == output.scalar_type(), + "gather(): self and output must have the same scalar type"); + TORCH_CHECK(dim >= 0 && dim < self.dim(), + "gather(): Indexing dim ", dim, " is out of bounds of tensor"); + + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* indexTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + NSString* ns_input_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + MPSShape* index_shape = getMPSShape(index); + NSString* ns_index_shape_key = [[index_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + int num_input_dims = [input_shape count]; + int num_index_dims = [index_shape count]; + + TORCH_CHECK(num_input_dims == num_index_dims, "Input and index must have same rank") + + // Determine if we need to slice into the input tensor + bool needSlice = false; + + for(int i = 0; i < num_input_dims; i++) { + TORCH_CHECK(i == dim || [index_shape[i] intValue] <= [input_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis") + if(i != dim && [index_shape[i] intValue] < [input_shape[i] intValue]) + needSlice = true; + } + + string key = "gather_out_mps:" + getMPSTypeString(self.scalar_type()) + ":" + + getMPSTypeString(index.scalar_type()) + ":" + + std::to_string(dim) + ":" + + [ns_input_shape_key UTF8String] + ":" + + [ns_index_shape_key UTF8String]; + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = 
make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape); + MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(index.scalar_type()), index_shape); + + MPSGraphTensor* getInput = nil; + + // Slice into the input tensor IF NEEDED + if(needSlice) { + NSMutableArray *starts = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *ends = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *strides = [NSMutableArray arrayWithCapacity:num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + // All strides are 1 + strides[i] = @1; + // All starts are 0 + starts[i] = @0; + if(i != dim) + ends[i] = index_shape[i]; + else + ends[i] = input_shape[i]; + } + + getInput = [mpsGraph sliceTensor:inputTensor + starts:starts + ends:ends + strides:strides + name:nil]; + + } + else + getInput = inputTensor; + + MPSGraphTensor* castIndexTensor = [mpsGraph castTensor:indexTensor + toType:getMPSDataType(ScalarType::Int) + name:(NSString * _Nonnull)nil]; + + MPSGraphTensor* outputTensor = [mpsGraph gatherAlongAxisWithUpdatesTensor:getInput + indicesTensor:castIndexTensor + axis:(NSInteger)dim + name:nil]; + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->indexTensor_ = indexTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape); + Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +void scatter_mps_general +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const Tensor& output, + string func_name, + const c10::string_view reduce) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + dim = at::maybe_wrap_dim(dim, self.dim()); + + TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index"); + TORCH_CHECK(self.scalar_type() == output.scalar_type() && output.scalar_type() == src.scalar_type(), + "scatter(): self, src and output must have the same scalar type"); + TORCH_CHECK(dim >= 0 && dim < self.dim(), + "scatter(): Indexing dim ", dim, " is out of bounds of tensor"); + + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* indexTensor_ = nil; + MPSGraphTensor* srcTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + NSString* ns_input_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + MPSShape* index_shape = getMPSShape(index); + NSString* ns_index_shape_key = [[index_shape 
valueForKey:@"description"] componentsJoinedByString:@","]; + MPSShape* src_shape = getMPSShape(src); + NSString* ns_src_shape_key = [[src_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + int num_input_dims = [input_shape count]; + int num_index_dims = [index_shape count]; + int num_src_dims = [src_shape count]; + + TORCH_CHECK(num_input_dims == num_index_dims && num_index_dims == num_src_dims, "Input, index and src must have same rank") + + // Do we need to slice into the src tensor? + bool needSlice = false; + bool inputNeedSlice = false; + + for(int i = 0; i < num_input_dims; i++) { + TORCH_CHECK(i == dim || [index_shape[i] intValue] <= [input_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis") + TORCH_CHECK([index_shape[i] intValue] <= [src_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis") + if([index_shape[i] intValue] < [src_shape[i] intValue]) + needSlice = true; + if(i != dim && [index_shape[i] intValue] < [input_shape[i] intValue]) + inputNeedSlice = true; + } + + TORCH_CHECK(reduce != "mean", "Scatter reduce mean mode not yet supported in MPS") + + string reduce_key; + + if(reduce == "set") + reduce_key = "set"; + else if(reduce == "sum") + reduce_key = "sum"; + else if(reduce == "add") + reduce_key = "add"; + else if(reduce == "prod") + reduce_key = "prod"; + else if(reduce == "multiply") + reduce_key = "multiply"; + else if(reduce == "amax") + reduce_key = "amax"; + else if(reduce == "amin") + reduce_key = "amin"; + + string key = func_name + ":" + getMPSTypeString(self.scalar_type()) + ":" + + getMPSTypeString(index.scalar_type()) + ":" + + std::to_string(dim) + ":" + + [ns_input_shape_key UTF8String] + ":" + + [ns_index_shape_key UTF8String] + ":" + + [ns_src_shape_key UTF8String] + ":" + + reduce_key; + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape); + MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(index.scalar_type()), index_shape); + MPSGraphTensor* srcTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(src.scalar_type()), src_shape); + + MPSGraphTensor* getSrc = nil; + MPSGraphTensor* getInput = nil; + + // Slice into the src tensor IF NEEDED + if(needSlice) { + NSMutableArray *starts = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *ends = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *strides = [NSMutableArray arrayWithCapacity:num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + // All strides are 1 + strides[i] = @1; + // All starts are 0 + starts[i] = @0; + ends[i] = index_shape[i]; + } + + getSrc = [mpsGraph sliceTensor:srcTensor + starts:starts + ends:ends + strides:strides + name:nil]; + + } + else + getSrc = srcTensor; + + // Use in case input needs to be smaller to get scatter + NSMutableArray* scatterInputShape = nil; + + // Slice into the input tensor IF NEEDED + if(inputNeedSlice) { + NSMutableArray *starts = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *ends = [NSMutableArray arrayWithCapacity:num_input_dims]; + NSMutableArray *strides = [NSMutableArray 
arrayWithCapacity:num_input_dims]; + + scatterInputShape = [NSMutableArray arrayWithCapacity:num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + // All strides are 1 + strides[i] = @1; + // All starts are 0 + starts[i] = @0; + if(i != dim) { + ends[i] = index_shape[i]; + scatterInputShape[i] = index_shape[i]; + } + else { + ends[i] = input_shape[i]; + scatterInputShape[i] = input_shape[i]; + } + } + + getInput = [mpsGraph sliceTensor:inputTensor + starts:starts + ends:ends + strides:strides + name:nil]; + + } + else { + getInput = inputTensor; + scatterInputShape = input_shape; + } + + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* castIndexTensor = [mpsGraph castTensor:indexTensor + toType:getMPSDataType(ScalarType::Int) + name:(NSString * _Nonnull)nil]; + + MPSGraphScatterMode scatter_mode; + + if(reduce_key == "set") + scatter_mode = MPSGraphScatterModeSet; + else if(reduce_key == "sum" || reduce_key == "add") + scatter_mode = MPSGraphScatterModeAdd; + else if(reduce_key == "prod" || reduce_key == "multiply") + scatter_mode = MPSGraphScatterModeMul; + else if(reduce_key == "amax") + scatter_mode = MPSGraphScatterModeMax; + else if(reduce_key == "amin") + scatter_mode = MPSGraphScatterModeMin; + + if(!inputNeedSlice) { + outputTensor = [mpsGraph scatterAlongAxisWithDataTensor:getInput + updatesTensor:getSrc + indicesTensor:castIndexTensor + axis:(NSInteger)dim + mode:scatter_mode + name:nil]; + } + else { + // Scatter this into the input with set mode + MPSGraphTensor* scatterTensor = [mpsGraph scatterAlongAxisWithDataTensor:getInput + updatesTensor:getSrc + indicesTensor:castIndexTensor + axis:(NSInteger)dim + mode:scatter_mode + name:nil]; + + // Make an array of scatter indices tensors + NSMutableArray* indicesTensors = [NSMutableArray arrayWithCapacity:num_input_dims]; + + // 1. Concatenate the coord tensors + // 2. Flatten the values + // 3. 
Scatter into input with add mode + + int shape_data[num_input_dims]; + + for(int i = 0; i < num_input_dims; i++) { + shape_data[i] = {[scatterInputShape[i] intValue]}; + } + + MPSGraphTensor* scatterInputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:num_input_dims * sizeof(int)] + shape:@[[NSNumber numberWithInt:num_input_dims]] + dataType:MPSDataTypeInt32]; + + for(int i = 0; i < num_input_dims; i++) { + MPSGraphTensor* axisTensor = [mpsGraph constantWithScalar:i + dataType:MPSDataTypeInt32]; + MPSGraphTensor* scatter_currentIndexTensor = [mpsGraph getCoordinateValueWithShapeTensor:scatterInputShapeTensor + axisTensor:axisTensor + name:nil]; + scatter_currentIndexTensor = [mpsGraph reshapeTensor:scatter_currentIndexTensor + withShape:@[@-1, @1] + name:nil]; + indicesTensors[i] = scatter_currentIndexTensor; + } + + MPSGraphTensor* scatter_fullIndexTensor = [mpsGraph concatTensors:indicesTensors + dimension:(NSInteger)1 + name:nil]; + + MPSGraphTensor* flatValuesTensor = [mpsGraph reshapeTensor:scatterTensor + withShape:@[@-1] + name:nil]; + + outputTensor = [mpsGraph scatterNDWithDataTensor:inputTensor + updatesTensor:flatValuesTensor + indicesTensor:scatter_fullIndexTensor + batchDimensions:0 + mode:MPSGraphScatterModeSet + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->srcTensor_ = srcTensor; + newCachedGraph->indexTensor_ = indexTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape); + Placeholder srcPlaceholder = Placeholder(cachedGraph->srcTensor_, src, src_shape); + Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + srcPlaceholder.getMPSGraphTensor() : srcPlaceholder.getMPSGraphTensorData(), + indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(scatter_src_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const Tensor& output) { + + scatter_mps_general(self, dim, index, src, output, "scatter_src_out_mps", "set"); + +} + +TORCH_IMPL_FUNC(scatter_value_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& value, + const Tensor& output) { + + Tensor src = at::native::empty_mps(index.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + self.suggest_memory_format()); + src.fill_(value); + scatter_mps_general(self, dim, index, const_cast(src), output, "scatter_value_out_mps", "set"); + +} + +TORCH_IMPL_FUNC(scatter_reduce_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const c10::string_view reduce, + const Tensor& output) { + + scatter_mps_general(self, dim, index, src, output, "scatter_reduce_out_mps", reduce); + +} + +TORCH_IMPL_FUNC(scatter_value_reduce_out_mps) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& value, + const c10::string_view reduce, + const Tensor& output) { + + Tensor src = 
at::native::empty_mps(index.sizes(), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + self.suggest_memory_format()); + src.fill_(value); + scatter_mps_general(self, dim, index, const_cast(src), output, "scatter_value_reduce_out_mps", reduce); + +} + +TORCH_IMPL_FUNC(scatter_add_mps_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src, + const Tensor& output) { + + scatter_mps_general(self, dim, index, src, output, "scatter_add_mps_out", "add"); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm new file mode 100644 index 000000000000..edef852027fb --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Shape.mm @@ -0,0 +1,918 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +// Pad operations (1D/2D/3D forward and backward) +Tensor& pad_out_template(Tensor &output, const Tensor &input_, IntArrayRef padding, + const c10::optional& grad_output_opt, + MPSGraphPaddingMode mode, const string op_name) +{ + const int padding_size = (int) padding.size(); + const int padding_dim = padding_size / 2; // either 1D, 2D, or 3D + + TORCH_CHECK(padding_size == 2 || padding_size == 4 || padding_size == 6, + "invalid padding argument of size ", padding_size); + + const Tensor& grad_output_ = *(at::borrow_from_optional_tensor(grad_output_opt)); + const bool is_backward_pass = grad_output_.defined(); + + int dim_w = padding_dim, dim_h = padding_dim - 1, dim_d = padding_dim - 2, dim_slices = 0; + int64_t nbatch = 1, ndims = input_.ndimension(); + + if (!is_backward_pass) { + bool valid_dims = input_.size(1) != 0 && input_.size(padding_dim) != 0; + TORCH_CHECK((ndims == 1 + padding_dim && valid_dims) || + (ndims == 2 + padding_dim && valid_dims && input_.size(1 + padding_dim) != 0), + "3D or 4D (batch mode) tensor expected for input, but got: ", input_); + } + + if (ndims == 2 + padding_dim) { + nbatch = input_.size(0); + dim_w++; + dim_h++; + dim_d++; + dim_slices++; + } + + int64_t pad_l = padding[0]; + int64_t pad_r = padding[1]; + int64_t pad_t = padding_dim > 1 ? padding[2] : 0; + int64_t pad_b = padding_dim > 1 ? padding[3] : 0; + int64_t pad_front = padding_dim > 2 ? padding[4] : 0; + int64_t pad_back = padding_dim > 2 ? padding[5] : 0; + + int64_t nplane = input_.size(dim_slices); + int64_t input_w = input_.size(dim_w); + int64_t output_w = input_w + pad_l + pad_r; + int64_t input_h = padding_dim > 1 ? input_.size(dim_h) : 0; + int64_t output_h = padding_dim > 1 ? input_h + pad_t + pad_b : 0; + int64_t input_d = padding_dim > 2 ? input_.size(dim_d) : 0; + int64_t output_d = padding_dim > 2 ? 
input_d + pad_front + pad_back : 0; + + Tensor grad_output, input = input_; + + if (!is_backward_pass) { + TORCH_CHECK(pad_l < input_w && pad_r < input_w, + "Argument #4: Padding size should be less than the corresponding " + "input dimension, but got: padding (", pad_l, ", ", pad_r, + ") at dimension ", dim_w, " of input ", ndims); + + if (padding_dim > 1) { + TORCH_CHECK(pad_t < input_h && pad_b < input_h, + "Argument #6: Padding size should be less than the corresponding " + "input dimension, but got: padding (", pad_t, ", ", pad_b, + ") at dimension ", dim_h, " of input ", ndims); + } + TORCH_CHECK(output_w >= 1 || output_h >= padding_dim - 1, + "input (H: ", input_h, ", W: ", input_w, ") is too small. Calculated " + "output H: ", output_h, " W: ", output_w); + + if (ndims == 1 + padding_dim) { + if (padding_dim == 3) + output.resize_({nplane, output_d, output_h, output_w}); + else if (padding_dim == 2) + output.resize_({nplane, output_h, output_w}); + else + output.resize_({nplane, output_w}); + } else { + if (padding_dim == 3) + output.resize_({nbatch, nplane, output_d, output_h, output_w}); + else if (padding_dim == 2) + output.resize_({nbatch, nplane, output_h, output_w}); + else + output.resize_({nbatch, nplane, output_w}); + } + if (output.numel() == 0 || input_.numel() == 0) + return output; + input = input_.contiguous(); + } else { + TORCH_CHECK(output_w == grad_output_.size(dim_w), + "gradOutput width unexpected. Expected: ", output_w, ", Got: ", grad_output_.size(dim_w)); + if (padding_dim > 1) { + TORCH_CHECK(output_h == grad_output_.size(dim_h), + "gradOutput height unexpected. Expected: ", output_h, ", Got: ", grad_output_.size(dim_h)); + } + grad_output = grad_output_.contiguous(); + } + + const int64_t input_dim = input.dim(); + MPSShape *leftPadding = nullptr, *rightPadding = nullptr; + if (padding_dim == 3) { + leftPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_front), @(pad_t), @(pad_l) } count:input_dim]; + rightPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_back), @(pad_b), @(pad_r) } count:input_dim]; + } else if (padding_dim == 2) { + leftPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_t), @(pad_l) } count:input_dim]; + rightPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_b), @(pad_r) } count:input_dim]; + } else if (padding_dim == 1) { + leftPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_l) } count:input_dim]; + rightPadding = [NSArray arrayWithObjects:(const NSNumber*[]){ @(0), @(0), @(pad_r) } count:input_dim]; + } + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) { } + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + MPSGraphTensor *gradOutputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = op_name + getTensorsStringKey({input, grad_output}) + + ":L" + to_string(pad_l) + ":R" + to_string(pad_r) + + ":T" + to_string(pad_t) + ":B" + to_string(pad_b) + + ":F" + to_string(pad_front) + ":K" + to_string(pad_back); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input); + if 
(!is_backward_pass) { + newCachedGraph->outputTensor = [mpsGraph padTensor:newCachedGraph->inputTensor + withPaddingMode:mode + leftPadding:leftPadding + rightPadding:rightPadding + constantValue:0 + name:nil]; + } else { + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + newCachedGraph->outputTensor = [mpsGraph padGradientWithIncomingGradientTensor:newCachedGraph->gradOutputTensor + sourceTensor:newCachedGraph->inputTensor + paddingMode:mode + leftPadding:leftPadding + rightPadding:rightPadding + name:nil]; + } + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + if (is_backward_pass) { + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); + } + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } + return output; +} +} // namespace mps + +// 1D Reflection and Replication Padding +TORCH_IMPL_FUNC(reflection_pad1d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeReflect, "reflection_pad1d_out_mps"); +} + +TORCH_IMPL_FUNC(reflection_pad1d_backward_out_mps) +(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, const Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + mps::pad_out_template(const_cast(grad_input), input, padding, grad_output, MPSGraphPaddingModeReflect, "reflection_pad1d_backward_out_mps"); +} + +TORCH_IMPL_FUNC(replication_pad1d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeClampToEdge, "replication_pad1d_out_mps"); +} + +TORCH_IMPL_FUNC(replication_pad1d_backward_out_mps) +(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, const Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + mps::pad_out_template(const_cast(grad_input), input, padding, grad_output, MPSGraphPaddingModeClampToEdge, "replication_pad1d_backward_out_mps"); +} + +// 2D Reflection and Replication Padding +Tensor& reflection_pad2d_out_mps(const Tensor& input, IntArrayRef padding, Tensor& output) +{ + return mps::pad_out_template(output, input, padding, c10::nullopt, MPSGraphPaddingModeReflect, __func__); +} + +Tensor reflection_pad2d_mps(const Tensor& input, IntArrayRef padding) +{ + Tensor output = at::empty({0}, input.options()); + return mps::pad_out_template(output, input, padding, c10::nullopt, MPSGraphPaddingModeReflect, __func__); +} + +Tensor& reflection_pad2d_backward_out_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeReflect, __func__); +} + +Tensor reflection_pad2d_backward_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding) +{ + auto grad_input = at::zeros_like(input, 
LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeReflect, __func__); +} + +TORCH_IMPL_FUNC(replication_pad2d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeClampToEdge, "replication_pad2d_out_mps"); +} + +Tensor& replication_pad2d_backward_out_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +Tensor replication_pad2d_backward_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding) +{ + auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +// 3D Reflection and Replication Padding +TORCH_IMPL_FUNC(reflection_pad3d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeReflect, "reflection_pad3d_out_mps"); +} + +TORCH_IMPL_FUNC(reflection_pad3d_backward_out_mps) +(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, const Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + mps::pad_out_template(const_cast(grad_input), input, padding, grad_output, MPSGraphPaddingModeReflect, "reflection_pad3d_backward_out_mps"); +} + +TORCH_IMPL_FUNC(replication_pad3d_out_mps) +(const Tensor& input, IntArrayRef padding, const Tensor& output) +{ + mps::pad_out_template(const_cast(output), input, padding, c10::nullopt, MPSGraphPaddingModeClampToEdge, "replication_pad3d_out_mps"); +} + +Tensor& replication_pad3d_backward_out_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding, Tensor& grad_input) +{ + grad_input.resize_as_(input).zero_(); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +Tensor replication_pad3d_backward_mps(const Tensor& grad_output, const Tensor& input, IntArrayRef padding) +{ + auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, __func__); +} + +// topk +TORCH_IMPL_FUNC(topk_out_mps) + (const Tensor& self, + int64_t k, + int64_t dim_, + bool largest, + bool sorted, + const Tensor& values, + const Tensor& indices) +{ + using namespace mps; + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + TORCH_CHECK( + k >= 0 && k <= (self.dim() > 0 ? self.size(dim) : 1), + "selected index k out of range"); + + if (self.dim() == 0 && self.numel() == 1) + { + values.copy_(self); + indices.zero_(); + return; + } + MPSStream* stream = getCurrentMPSStream(); + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + // MPSGraph topK is always sorted. 
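The branches below cover what MPSGraph's topK does not handle directly: it returns the largest elements, sorted, along the last axis only, so a non-trailing dim is transposed in and out of the graph, and largest=False is emulated by negating the input before and the values after. A hedged usage sketch from the Python side (not part of the patch); it assumes a build where the MPS backend is available and otherwise simply runs on CPU:

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.tensor([[3., 1., 4.],
                  [1., 5., 9.]], device=device)

# largest=True along the last dim maps directly onto MPSGraph topK.
vals, idx = torch.topk(x, k=2, dim=-1, largest=True)

# largest=False and/or a non-trailing dim exercise the negate/transpose
# branches built below.
small_vals, small_idx = torch.topk(x, k=2, dim=0, largest=False)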
+ @autoreleasepool + { + // Input as placeholders + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + string key = string("topk:") + [ns_shape_key UTF8String] + ":" + + getMPSTypeString(self.scalar_type()) + + ":k" + to_string(k) + ":dim" + to_string(dim_) + + ":largest" + to_string(largest); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) + { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool + { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape); + if ((dim_ != -1 && dim_ != self.dim() - 1) && (!largest)) + { + // transpose and negate + MPSGraphTensor *transposedInput = [mpsGraph transposeTensor: newCachedGraph->selfTensor + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + MPSGraphTensor * identity = [mpsGraph identityWithTensor: transposedInput + name: nil]; + MPSGraphTensor * negatedTransposedInput = [mpsGraph negativeWithTensor:identity + name: nil]; + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:negatedTransposedInput + k:((NSUInteger) k) + name:nil]; + MPSGraphTensor *valuesNegatedTransposed = outputMPSGraphTensors[0]; + MPSGraphTensor *indicesTransposed = outputMPSGraphTensors[1]; + MPSGraphTensor *valuesNegated = [mpsGraph transposeTensor: valuesNegatedTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + newCachedGraph->valuesTensor = [mpsGraph negativeWithTensor:valuesNegated + name: nil]; + newCachedGraph->indicesTensor = [mpsGraph transposeTensor: indicesTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + } + else if (dim_ != -1 && dim_ != self.dim() - 1) + { + MPSGraphTensor *transposedInput = [mpsGraph transposeTensor: newCachedGraph->selfTensor + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + MPSGraphTensor * identity = [mpsGraph identityWithTensor: transposedInput + name: nil]; + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:identity + k:((NSUInteger) k) + name:nil]; + MPSGraphTensor *valuesTransposed = outputMPSGraphTensors[0]; + MPSGraphTensor *indicesTransposed = outputMPSGraphTensors[1]; + newCachedGraph->valuesTensor = [mpsGraph transposeTensor:valuesTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + newCachedGraph->indicesTensor = [mpsGraph transposeTensor: indicesTransposed + dimension: (NSUInteger)self.dim()-1 + withDimension: (NSUInteger)dim_ + name: nil]; + } + else if (!largest) + { + // only negate + MPSGraphTensor *negatedInput = [mpsGraph negativeWithTensor:newCachedGraph->selfTensor + name: nil]; + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:negatedInput + k:((NSUInteger) k) + name:nil]; + MPSGraphTensor *valuesNegated = outputMPSGraphTensors[0]; + newCachedGraph->valuesTensor = [mpsGraph negativeWithTensor:valuesNegated + name: nil]; + newCachedGraph->indicesTensor = outputMPSGraphTensors[1]; + } + else + { + NSArray * outputMPSGraphTensors = [mpsGraph + topKWithSourceTensor:newCachedGraph->selfTensor + k:((NSUInteger) k) + name:nil]; + newCachedGraph->valuesTensor = outputMPSGraphTensors[0]; + 
newCachedGraph->indicesTensor = outputMPSGraphTensors[1]; + } + + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self); + // Outputs as placeholders + Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); + Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); + // Create dictionary of inputs and outputs + NSDictionary* feeds = nil; + feeds = @{ + inputPlaceholder.getMPSGraphTensor() : + inputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + valuesPlaceholder.getMPSGraphTensor() : + valuesPlaceholder.getMPSGraphTensorData(), + indicesPlaceholder.getMPSGraphTensor() : + indicesPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} + +void check_shape_except_dim(const Tensor &first, const Tensor &second, + int dimension, int index) +{ + int first_dims = first.dim(); + int second_dims = second.dim(); + TORCH_CHECK(first_dims == second_dims, + "Tensors must have same number of dimensions: got ", first_dims, + " and ", second_dims); + for (int dim = 0; dim < first_dims; dim++) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = at::native::size(first, dim); + int64_t second_dim_size = at::native::size(second, dim); + TORCH_CHECK(first_dim_size == second_dim_size, + "Sizes of tensors must match except in dimension ", dim, ". Got ", + static_cast(first_dim_size), " and ", + static_cast(second_dim_size), " (The offending index is ", + index, ")"); + } +} + +inline c10::MemoryFormat compute_output_memory_format(const TensorList &inputs) { + c10::optional format = c10::nullopt; + for (auto &t : inputs) { + auto f = t.suggest_memory_format(); + if (!format.has_value()) { + format = f; + continue; + } + if (format.value() == f) { + continue; + } + bool contiguous = (format.value() == c10::MemoryFormat::Contiguous || f == c10::MemoryFormat::Contiguous || format.value() != f); + if (contiguous) { + return c10::MemoryFormat::Contiguous; + } + } + return format.value(); +} + +//Tensor cat_mps(TensorList inputs, int64_t dimension) { + //ScalarType high_type = result_type(inputs); + //Tensor out = at::empty({0}, inputs.front().options().dtype(high_type)); + //at::native::cat_out_mps(inputs, dimension, out); + //return out; +//} + +TORCH_IMPL_FUNC(cat_out_mps) + (ITensorListRef inputs, + int64_t dimension, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& out) { + using namespace mps; + if (out.numel() == 0) { + return; + } + + auto materialized_inputs = inputs.materialize(); + + int idx = 0; + for(const Tensor& t : materialized_inputs) { + TORCH_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", idx, ") cannot be concatenated"); + idx++; + } + + dimension = legacy_cat_wrap_dim(dimension, inputs); + + // previously, size [0] tensors were the only possible empty tensors; thus, it + // wasn't possible to cat empty tensors unless all the other tensors were + // 1-dimensional, so we allowed these tensors to be "skipped". We maintain + // this behavior for backwards compatibility, but only for this specific size + // (i.e. other empty sizes are not skipped). 
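To make the legacy skip described in the comment above concrete, here is an illustrative sketch (not part of the patch): a 1-D size-[0] tensor is ignored, while any other empty shape still has to match in the non-concatenated dimensions. The MPS device is assumed to be available; otherwise the example falls back to CPU:

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
a = torch.randn(2, 3, device=device)
legacy_empty = torch.empty(0, device=device)   # 1-D, size [0]: skipped

out = torch.cat([a, legacy_empty, a], dim=0)   # OK, shape (4, 3)

# Other empty shapes are not skipped and must match outside the cat dim:
# torch.cat([a, torch.empty(0, 4, device=device)], dim=0)  # raises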
+ // FIXME: warn if this is the case + auto should_skip = [](const Tensor& t) { + return t.dim() == 1 && at::native::size(t, 0) == 0; + }; + + const Tensor* notSkippedTensor = NULL; // non-owning reference + int nDims = 0; + + // Check for type promotion + TORCH_CHECK( + canCast(result_type(inputs), out.scalar_type()), + "torch.cat(): input types ", + " can't be cast to the desired output type ", + out.scalar_type()); + + // Inputs cannot alias the output tensor + idx = 0; + for(const Tensor& t : materialized_inputs) { + auto lap = at::get_overlap_status(out, t); + TORCH_CHECK( + lap != at::MemOverlapStatus::PARTIAL && + lap != at::MemOverlapStatus::FULL, + "torch.cat(): unsupported operation: the input tensors cannot refer to any " + "of the output memory locations. Found overlap in input " + "tensor ", + idx); + idx++; + } + at::assert_no_internal_overlap(out); + + for(const Tensor& t : materialized_inputs) { + if (should_skip(t)) { + continue; + } + nDims = t.dim(); + // TODO: Is this OK? + notSkippedTensor = &t; + } + + // If all inputs are empty tensors, return an empty tensor + if (notSkippedTensor == NULL) { + return; + } + + TORCH_CHECK( + inputs.size() > 0, + "torch.cat(): invalid number of inputs ", + inputs.size()); + TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension); + + for (const Tensor& t : inputs) { + TORCH_CHECK( + t.device() == notSkippedTensor->device(), + "torch.cat(): all input tensors must be on the same device. Received ", + t.device(), + " and ", + notSkippedTensor->device()); + } + + TORCH_CHECK( + out.device() == notSkippedTensor->device(), + "torch.cat(): all input tensors and out must be on the same device, but inputs are on ", + notSkippedTensor->device(), + " and out is on ", + out.device()); + + // TODO: memory_format is now an argument? 
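The dtype and device checks above mirror the usual torch.cat error behaviour; a brief, illustrative-only sketch (MPS assumed available, otherwise the tensors live on CPU):

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
f16 = torch.ones(2, device=device, dtype=torch.float16)
f32 = torch.ones(2, device=device, dtype=torch.float32)

print(torch.cat([f16, f32]).dtype)   # inputs are promoted -> torch.float32

# An `out` tensor the promoted type cannot be cast to trips the canCast() check:
bad_out = torch.empty(4, device=device, dtype=torch.int64)
# torch.cat([f16, f32], out=bad_out)   # -> RuntimeError

# Mixing devices trips the same-device check when the inputs live on "mps":
# torch.cat([f32, torch.ones(2)])      # mps input + cpu input -> RuntimeError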
+ // // TODO: Factor out `compute_output_memory_format` + // c10::MemoryFormat memory_format = compute_output_memory_format(inputs); + + std::vector size(notSkippedTensor->sizes().vec()); + + // Compute size of the result in the cat dimension + int64_t cat_dim_size = 0; + idx = 0; + for(const Tensor& tensor : materialized_inputs) { + if (should_skip(tensor)) { + continue; + } + // TODO: Factor out `check_shape_except_dim` + check_shape_except_dim(*notSkippedTensor, tensor, dimension, idx); + cat_dim_size += at::native::size(tensor, dimension); + idx++; + } + + // Compute the size of the result + size[dimension] = cat_dim_size; + + // skip resizing if size of result is same as expected + if (out.sizes() != size) { + out.resize_(size, memory_format); + } + + if (out.numel() == 0) { + return; + } + + // Get stream + MPSStream* stream = getCurrentMPSStream(); + + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + // TODO: Free this when no longer needed globally + MPSGraphTensor** inputMPSGraphTensors_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache *cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "cat_out_mps:" + getMPSTypeString(result_type(inputs)) + + ":" + to_string(inputs.size()) + + ":" + to_string(dimension); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + // Initialize graph + MPSGraph *mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // Create placeholders + MPSGraphTensor* inputMPSGraphTensors[inputs.size()]; + + for(int i = 0; i < inputs.size(); i++) + inputMPSGraphTensors[i] = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(result_type(inputs))); + + auto inputTensorsArray = [NSArray arrayWithObjects:inputMPSGraphTensors + count:inputs.size()]; + // Use concatTensors to concatenate + MPSGraphTensor* outputTensor = [mpsGraph concatTensors:inputTensorsArray + dimension:dimension // Maybe convert this from int64_t -> int32 + name:nil]; + + newCachedGraph->inputMPSGraphTensors_ = (MPSGraphTensor**)malloc(inputs.size() * sizeof(MPSGraphTensor*)); + + for(int i = 0; i < inputs.size(); i++) + newCachedGraph->inputMPSGraphTensors_[i] = inputMPSGraphTensors[i]; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + std::vector inputPlaceholders; + int i = 0; + for(const Tensor& tensor : materialized_inputs) { + Placeholder currentInputPlaceholder = Placeholder(cachedGraph->inputMPSGraphTensors_[i], tensor); + inputPlaceholders.push_back(currentInputPlaceholder); + i++; + } + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + for (int i = 0; i < inputs.size(); i++) { + feeds[(inputPlaceholders[i]).getMPSGraphTensor()] = (inputPlaceholders[i]).getMPSGraphTensorData(); + } + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +void upsample_backward_out_mps(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& grad_input, + MPSGraphResizeMode 
requested_mode, + bool requested_align_corners + ) +{ + using namespace mps; + int64_t input_dims = input_size.size(); + + TORCH_CHECK((input_dims == 4), + "NCHW tensor expected for input"); + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradInputTensor = nil, *gradOutputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + /* sizes */ + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + + int64_t input_n = input_size[0]; + int64_t input_c = input_size[1]; + int64_t input_height = input_size[2]; + int64_t input_width = input_size[3]; + + @autoreleasepool { + MPSShape* output_shape = getMPSShape(grad_output); + string key = string("upsample_backward:") + mps::getMPSShapeString(output_shape) + ":" + + getMPSTypeString(grad_output.scalar_type()) + + ":oh" + to_string(output_height) + ":ow" + to_string(output_width) + + ":ih" + to_string(input_height) + ":iw" + to_string(input_width) + + ":mode" + to_string(requested_mode); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_input.scalar_type()), output_shape); + MPSGraphTensor * shapeTensor = [mpsGraph constantWithScalar:0 + shape:@[[NSNumber numberWithLong: input_n], + [NSNumber numberWithLong: input_c], + [NSNumber numberWithLong:input_height], + [NSNumber numberWithLong:input_width]] + dataType:getMPSDataType(grad_output.scalar_type())]; + + newCachedGraph->gradInputTensor = [mpsGraph resizeWithGradientTensor: newCachedGraph->gradOutputTensor + input: shapeTensor + mode: requested_mode + centerResult: true + alignCorners: requested_align_corners + layout: MPSGraphTensorNamedDataLayoutNCHW + name: nil]; + + } + return newCachedGraph; + })); + } + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor, grad_input); + + NSDictionary* feeds = @{ + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_mps) ( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& grad_input) +{ + upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeNearest, false); +} + +TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_mps) ( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& grad_input) +{ + upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeNearest, false); +} + +TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_mps) ( + const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + c10::optional scales_h, + 
c10::optional scales_w, + const Tensor& grad_input) +{ + upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeBilinear, align_corners); +} + +void upsample_out_mps(const Tensor& input, + IntArrayRef output_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output, + MPSGraphResizeMode requested_mode, + bool requested_align_corners) +{ + // Get stream + using namespace mps; + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + /* sizes */ + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + @autoreleasepool { + MPSShape* input_shape = getMPSShape(input); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + string key = string("upsample_2d:") + mps::getMPSShapeString(input_shape) + ":" + + getMPSTypeString(input.scalar_type()) + + ":h" + to_string(output_height) + ":w" + to_string(output_width) + + ":mode" + to_string(requested_mode); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + if(!cachedGraph) { + cachedGraph = static_cast(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + newCachedGraph->outputTensor = [mpsGraph resizeTensor:newCachedGraph->inputTensor + size:@[ @(output_height), @(output_width)] + mode:requested_mode + centerResult: true + alignCorners: requested_align_corners + layout: MPSGraphTensorNamedDataLayoutNCHW + name:nil]; + } + return newCachedGraph; + })); + } + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_mps) ( + const Tensor& input, + IntArrayRef output_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output) +{ + // Note: this differs from the CPU implementation in the way + // ties are resolved wrt to nearest mostly in cases where the scale + // is not an integer. 
+ // Example: + // For upsampling from (2, 5) to (2, 16) + // MPS: + // tensor([[[[0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + // CPU: + // tensor([[[[0., 0., 0., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 6., 6., 6., 7., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + using namespace mps; + upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeNearest, false); +} + + +TORCH_IMPL_FUNC(upsample_nearest2d_out_mps) ( + const Tensor& input, + IntArrayRef output_size, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output) +{ + // Note: this differs from the CPU implementation in the way + // ties are resolved wrt to nearest mostly in cases where the scale + // is not an integer. + // Example: + // For upsampling from (2, 5) to (2, 16) + // MPS: + // tensor([[[[0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + // CPU: + // tensor([[[[0., 0., 0., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 4., 4., 4.], + // [5., 5., 5., 6., 6., 6., 7., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]]) + using namespace mps; + upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeNearest, false); +} + +TORCH_IMPL_FUNC(upsample_bilinear2d_out_mps) ( + const Tensor& input, + IntArrayRef output_size, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w, + const Tensor& output) +{ + using namespace mps; + upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeBilinear, align_corners); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/SoftMax.mm b/aten/src/ATen/native/mps/operations/SoftMax.mm new file mode 100644 index 000000000000..4246a37671e9 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/SoftMax.mm @@ -0,0 +1,278 @@ +// Copyright © 2022 Apple Inc. 
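Rounding out Shape.mm before the softmax kernels: the tie-breaking difference documented in the nearest-upsample comments above can be reproduced from Python. A hedged sketch (not part of the patch); the MPS call only runs on an MPS-enabled build and may pick different source indices for the tied positions, exactly as the comment shows:

import torch
import torch.nn.functional as F

x = torch.arange(10.).reshape(1, 1, 2, 5)
print(F.interpolate(x, size=(2, 16), mode="nearest"))            # CPU reference

if torch.backends.mps.is_available():
    print(F.interpolate(x.to("mps"), size=(2, 16), mode="nearest").cpu())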
+ +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +void get_shapes(MPSShape* input_shape_readonly, + NSMutableArray* &input_shape, + int num_input_dims, c10::MemoryFormat memory_format) { + // Modify the shape + if(memory_format == at::MemoryFormat::Contiguous) { + for(int i = 0; i < num_input_dims; i++) + input_shape[i] = input_shape_readonly[i]; + } + else { // ChannelsLast + auto num_channels = input_shape_readonly[1]; + input_shape[0] = input_shape_readonly[0]; + for(int i = 1; i < num_input_dims-1; i++) + input_shape[i] = input_shape_readonly[i+1]; + input_shape[num_input_dims-1] = num_channels; + } +} + +// Note - Currently only supported for 4D image tensors + +TORCH_IMPL_FUNC(softmax_mps_out) +(const Tensor& input_, + const int64_t dim, + const bool half_to_float, + const Tensor& output) { + + TORCH_CHECK(!half_to_float, "softmax with half to float conversion is not supported on MPS"); + + if (input_.numel() == 0) { + return; + } + + Tensor input; + if (input_.dim() == 0) { + input = input_.view(1); + } + else + input = input_; + + int64_t dim_ = maybe_wrap_dim(dim, input.dim()); + TORCH_CHECK( + dim_ >= 0 && dim_ < input.dim(), + "Softmax:dim must be non-negative and less than input dimensions"); + + const auto memory_format = input.suggest_memory_format(); + // TORCH_CHECK(input.suggest_memory_format() == output.suggest_memory_format(), "Input and output memory format should match") + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + string mem_format_key = get_mem_format_string(memory_format); + MPSShape* input_shape_readonly = mps::getMPSShape(input); + int num_input_dims = [input_shape_readonly count]; + // Check - Channels last implies 4d + TORCH_CHECK(memory_format != at::MemoryFormat::ChannelsLast || num_input_dims == 4, "ChannelsLast implies 4d tensor") + // Input shape changes based on memory format + NSMutableArray* input_shape = [NSMutableArray arrayWithCapacity:num_input_dims]; + + get_shapes(input_shape_readonly, input_shape, num_input_dims, memory_format); + + // Change dim + if(memory_format == at::MemoryFormat::ChannelsLast && dim_ > 0) { + switch(dim_) { + case 1: + dim_ = 3; + break; + case 2: + dim_ = 1; + break; + case 3: + dim_ = 2; + break; + default: + assert(0 && "Invalid dim\n"); + } + } + + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "softmax_mps_out:" + mem_format_key + ":" + getMPSTypeString(input.scalar_type()) + ":" + + [ns_shape_key UTF8String] + ":" + std::to_string(dim_); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape); + + // passing selector of softMaxWithTensor on the mpsGraph object + MPSGraphTensor* outputTensor = [mpsGraph softMaxWithTensor:inputTensor + 
axis:(NSInteger)dim_ + name:nil]; + + // Output needs to be contiguous format + if(memory_format == at::MemoryFormat::ChannelsLast) { + auto N = input_shape[0]; + auto H = input_shape[1]; + auto W = input_shape[2]; + auto C = input_shape[3]; + + outputTensor = [mpsGraph reshapeTensor:outputTensor + withShape:@[N, ([NSNumber numberWithInt:[H intValue]* [W intValue]]), C] + name:nil]; + outputTensor = [mpsGraph transposeTensor:outputTensor + dimension:1 + withDimension:2 + name:nil]; + outputTensor = [mpsGraph reshapeTensor:outputTensor + withShape:@[N, C, H, W] + name:nil]; + + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input, input_shape); + // This must be the Contiguous shape + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(softmax_backward_mps_out) +(const Tensor& grad_, + const Tensor& output_, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { + + if (output_.numel() == 0) { + return; + } + + Tensor grad; + if (grad_.dim() == 0) { + grad = grad_.view(1); + } + else + grad = grad_; + + Tensor output; + if (output_.dim() == 0) { + output = output_.view(1); + } + else + output = output_; + + int64_t dim_ = maybe_wrap_dim(dim, grad.dim()); + TORCH_CHECK( + dim_ >= 0 && dim_ < grad.dim(), + "Grad:dim must be non-negative and less than input dimensions"); + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* softmaxTensor_ = nil; + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* grad_shape = mps::getMPSShape(grad); + int num_grad_dims = [grad_shape count]; + + NSString* ns_shape_key = [[grad_shape valueForKey:@"description"] componentsJoinedByString:@","]; + + string key = "softmax_backward_mps_out:" + getMPSTypeString(output.scalar_type()) + ":" + + [ns_shape_key UTF8String] + ":" + std::to_string(dim_); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* softmaxTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(output.scalar_type()), grad_shape); + MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad.scalar_type()), grad_shape); + + MPSGraphTensor* mulTensor = [mpsGraph multiplicationWithPrimaryTensor:softmaxTensor + secondaryTensor:gradOutputTensor + name:nil]; + MPSGraphTensor* mulSumTensor = [mpsGraph reductionSumWithTensor:mulTensor + axis:(NSInteger)dim_ + name:nil]; + MPSGraphTensor* gradSubTensor = [mpsGraph subtractionWithPrimaryTensor:gradOutputTensor + 
secondaryTensor:mulSumTensor + name:nil]; + MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:softmaxTensor + secondaryTensor:gradSubTensor + name:nil]; + + newCachedGraph->softmaxTensor_ = softmaxTensor; + newCachedGraph->gradOutputTensor_ = gradOutputTensor; + newCachedGraph->gradInputTensor_ = gradInputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder softmaxPlaceholder = Placeholder(cachedGraph->softmaxTensor_, output, grad_shape); + Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad, grad_shape); + Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + + NSDictionary* feeds = @{ + softmaxPlaceholder.getMPSGraphTensor() : softmaxPlaceholder.getMPSGraphTensorData(), + gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm new file mode 100644 index 000000000000..a6c267290312 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -0,0 +1,344 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +struct CachedGraph : public MPSCachedGraph +{ + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + MPSGraphTensor *minTensor = nil, *maxTensor = nil; +}; + +void clamp_mps_graph(CachedGraph* cachedGraph, const Tensor& input_tensor) +{ + MPSGraph *mpsGraph = cachedGraph->graph(); + + cachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_tensor); + + if (cachedGraph->minTensor && cachedGraph->maxTensor) { + cachedGraph->outputTensor = [mpsGraph clampWithTensor:cachedGraph->inputTensor + minValueTensor:cachedGraph->minTensor + maxValueTensor:cachedGraph->maxTensor + name:nil]; + } else if (cachedGraph->maxTensor) { + cachedGraph->outputTensor = [mpsGraph minimumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:cachedGraph->maxTensor + name:nil]; + } else if (cachedGraph->minTensor) { + cachedGraph->outputTensor = [mpsGraph maximumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:cachedGraph->minTensor + name:nil]; + } +} + +void clamp_tensor_out_mps(const Tensor& input_t, + const OptionalTensorRef min_opt, + const OptionalTensorRef max_opt, + const Tensor& output_t, + string op_name) +{ + const bool has_min = (min_opt.has_value() && min_opt->defined()); + const bool has_max = (max_opt.has_value() && max_opt->defined()); + + TORCH_CHECK(has_min || has_max, op_name + ": either min, max or both tensors must be defined") + if (has_min) + TORCH_CHECK(min_opt->is_same_size(input_t), op_name + ": min and input tensors must be of the same shape") + if (has_max) + TORCH_CHECK(max_opt->is_same_size(input_t), op_name + ": max and input tensors must be of the same shape") + + if (output_t.numel() == 0) + return; + + @autoreleasepool { + // the optional min/max refs could affect how we build the cached graph + string key = op_name + (has_min ? "_min" : "") + (has_max ? 
"_max" : "") + + "_tensor" + getTensorsStringKey({input_t}); + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if (!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + if (has_min) + newCachedGraph->minTensor = mpsGraphRankedPlaceHolder(mpsGraph, *min_opt); + if (has_max) + newCachedGraph->maxTensor = mpsGraphRankedPlaceHolder(mpsGraph, *max_opt); + + clamp_mps_graph(newCachedGraph, input_t); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_t); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, output_t); + + NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease]; + feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); + if (has_min) { + auto minPlaceholder = Placeholder(cachedGraph->minTensor, *min_opt); + feeds[minPlaceholder.getMPSGraphTensor()] = minPlaceholder.getMPSGraphTensorData(); + } + if (has_max) { + auto maxPlaceholder = Placeholder(cachedGraph->maxTensor, *max_opt); + feeds[maxPlaceholder.getMPSGraphTensor()] = maxPlaceholder.getMPSGraphTensorData(); + } + + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +void clamp_scalar_out_mps(const Tensor& input_t, + const OptionalScalarRef min_opt, + const OptionalScalarRef max_opt, + const Tensor& output_t, + string op_name) +{ + using scalar_t = double; + + const bool has_min = (min_opt.has_value()); + const bool has_max = (max_opt.has_value()); + TORCH_CHECK(has_min || has_max, op_name + ": either min, max or both scalars must be defined") + + scalar_t min_scalar = std::numeric_limits::infinity(); + scalar_t max_scalar = -std::numeric_limits::infinity(); + + if (has_min) + min_scalar = min_opt.get().to(); + if (has_max) + max_scalar = max_opt.get().to(); + + if (output_t.numel() == 0) + return ; + + @autoreleasepool { + // the optional min/max refs could affect how we build the cached graph + string key = op_name + (has_min ? ("_min:" + to_string(min_scalar)) : "") + (has_max ? 
("_max:" + to_string(max_scalar)) : "") + + "_scalar:" + getTensorsStringKey({input_t}); + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if (!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + if (has_min) + newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar + shape:(mps::getMPSShape(input_t)) + dataType:(mps::getMPSScalarType(input_t.scalar_type())) ]; + if (has_max) + newCachedGraph->maxTensor = [mpsGraph constantWithScalar:max_scalar + shape:(mps::getMPSShape(input_t)) + dataType:(mps::getMPSScalarType(input_t.scalar_type())) ]; + + clamp_mps_graph(newCachedGraph, input_t); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor , input_t); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, output_t); + + NSDictionary *feeds = @{ + inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), + }; + NSDictionary *results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +} // namespace mps + +// APIs exposed to at::native scope +TORCH_IMPL_FUNC(clamp_Tensor_out_mps) +(const Tensor& input_t, const OptionalTensorRef min, const OptionalTensorRef max, const Tensor& output_t) +{ + mps::clamp_tensor_out_mps(input_t, min, max, output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_out_mps) +(const Tensor& input_t, const OptionalScalarRef min, const OptionalScalarRef max, const Tensor& output_t) +{ + mps::clamp_scalar_out_mps(input_t, min, max, const_cast(output_t), "clamp_out_mps"); +} + +TORCH_IMPL_FUNC(clamp_min_Tensor_out_mps) +(const Tensor& input_t, const Tensor& min, const Tensor& output_t) +{ + mps::clamp_tensor_out_mps(input_t, min, at::OptionalTensorRef(), output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_min_out_mps) +(const Tensor& input_t, const Scalar& min, const Tensor& output_t) +{ + mps::clamp_scalar_out_mps(input_t, min, at::OptionalScalarRef(), output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_max_Tensor_out_mps) +(const Tensor& input_t, const Tensor& max, const Tensor& output_t) +{ + mps::clamp_tensor_out_mps(input_t, at::OptionalTensorRef(), max, output_t, __func__); +} + +TORCH_IMPL_FUNC(clamp_max_out_mps) +(const Tensor& input_t, const Scalar& max, const Tensor& output_t) +{ + mps::clamp_scalar_out_mps(input_t, at::OptionalScalarRef(), max, output_t, __func__); +} + +Tensor& where_self_out_mps(const Tensor& condition, + const Tensor& self, + const Tensor& other, + Tensor& out) { + TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype()); + + if (condition.scalar_type() == ScalarType::Byte) { + TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); + } else { + TORCH_CHECK(condition.scalar_type() == ScalarType::Bool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition.scalar_type()); + } + Tensor cond_bool = condition.scalar_type() == ScalarType::Byte ? 
condition.to(ScalarType::Bool) : condition; + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Empty output + if(out.numel() == 0) + return out; + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* conditionTensor_ = nil; + MPSGraphTensor* selfTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + + string key = "where_self_out_mps:" + getTensorsStringKey({cond_bool, self, other}); + + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* conditionTensor = mpsGraphRankedPlaceHolder(mpsGraph, cond_bool); + MPSGraphTensor* selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* otherTensor = mpsGraphRankedPlaceHolder(mpsGraph, other); + + MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:conditionTensor + truePredicateTensor:selfTensor + falsePredicateTensor:otherTensor + name:nil]; + + newCachedGraph->conditionTensor_ = conditionTensor; + newCachedGraph->selfTensor_ = selfTensor; + newCachedGraph->otherTensor_ = otherTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder conditionPlaceholder = Placeholder(cachedGraph->conditionTensor_, cond_bool); + Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + NSDictionary* feeds = @{ + conditionPlaceholder.getMPSGraphTensor() : conditionPlaceholder.getMPSGraphTensorData(), + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), + otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + + } + + return out; +} + +Tensor where_mps(const Tensor& condition, + const Tensor& self, + const Tensor& other) { + + auto cond_shape = condition.sizes(); + auto self_shape = self.sizes(); + auto other_shape = other.sizes(); + + bool cond_zero_shape = (condition.dim() == 0); + bool self_zero_shape = (self.dim() == 0); + bool other_zero_shape = (other.dim() == 0); + + auto max_dim = std::max(condition.dim(), std::max(self.dim(), other.dim())); + + auto sum_dims = condition.dim() + self.dim() + other.dim(); + + TORCH_CHECK(max_dim == 0 || !(sum_dims % max_dim), "All inputs of where should have same/compatible number of dims") + + int64_t out_arr[max_dim]; + + // Broadcasted output shape + for(int i = 0; i < max_dim; i++) { + + int64_t cond_num = cond_zero_shape ? 0 : condition.size(i); + int64_t self_num = self_zero_shape ? 0 : self.size(i); + int64_t other_num = other_zero_shape ? 
0 : other.size(i); + + out_arr[i] = std::max(cond_num, std::max(self_num, other_num)); + } + + Tensor ret = empty_mps(IntArrayRef(out_arr, max_dim), + self.scalar_type(), + c10::nullopt, + kMPS, + c10::nullopt, + self.suggest_memory_format()); + return where_self_out_mps(condition, self, other, ret); + +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm new file mode 100644 index 000000000000..6a29d080cb6c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm @@ -0,0 +1,370 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#endif + +namespace at { +namespace native { + +TORCH_IMPL_FUNC(triu_mps_out) +(const Tensor& self, + int64_t k, + const Tensor &output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "triu_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* minusOneTensor = [mpsGraph constantWithScalar:-1 + dataType:MPSDataTypeInt32]; + + if(k > 0) { + MPSGraphTensor* diagMinusOneTensor = [mpsGraph constantWithScalar:(k-1) + dataType:MPSDataTypeInt32]; + MPSGraphTensor* complementTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:minusOneTensor + numUpperTensor:diagMinusOneTensor + name:nil]; + outputTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor + secondaryTensor:complementTensor + name:nil]; + } + else { + MPSGraphTensor* minusDiagTensor = [mpsGraph constantWithScalar:(-k) + dataType:MPSDataTypeInt32]; + outputTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:minusDiagTensor + numUpperTensor:minusOneTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +TORCH_IMPL_FUNC(tril_mps_out) +(const Tensor& self, + int64_t k, + const Tensor &output) { + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor 
*outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + string key = "tril_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* minusOneTensor = [mpsGraph constantWithScalar:-1 + dataType:MPSDataTypeInt32]; + + if(k >= 0) { + MPSGraphTensor* diagTensor = [mpsGraph constantWithScalar:k + dataType:MPSDataTypeInt32]; + outputTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:minusOneTensor + numUpperTensor:diagTensor + name:nil]; + } + else { + MPSGraphTensor* negDiagMinusOneTensor = [mpsGraph constantWithScalar:(-k-1) + dataType:MPSDataTypeInt32]; + MPSGraphTensor* complementTensor = [mpsGraph bandPartWithTensor:inputTensor + numLowerTensor:negDiagMinusOneTensor + numUpperTensor:minusOneTensor + name:nil]; + outputTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor + secondaryTensor:complementTensor + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + +} + +Tensor& diag_mps_out(const Tensor& self, + int64_t diagonal, + Tensor &output) { + + // Do checks, resize output + IntArrayRef input_size = self.sizes(); + auto num_input_dims = input_size.size(); + // Input can only be 1D or 2D + TORCH_CHECK(num_input_dims == 1 || num_input_dims == 2, + "diag_mps_out: Input tensor must be 1D or 2D") + + if(num_input_dims == 1) { + auto n = input_size[0]; + if(diagonal > 0) + n += diagonal; + else if(diagonal < 0) + n -= diagonal; + + output.resize_({n, n}); + } + else if(num_input_dims == 2) { + auto num_diag_elements = std::min(input_size[0], input_size[1]); + if(diagonal > 0) { + TORCH_CHECK(input_size[1] - diagonal > 0, "Matrix not big enough for requested diagonal") + num_diag_elements = std::min(input_size[0], input_size[1] - diagonal); + } + else if(diagonal < 0) { + TORCH_CHECK(input_size[0] + diagonal > 0, "Matrix not big enough for requested diagonal") + num_diag_elements = std::min(input_size[0] + diagonal, input_size[1]); + } + + output.resize_({num_diag_elements}); + } + + using namespace mps; + MPSStream* stream = getCurrentMPSStream(); + + // Derive from MPSCachedGraph + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; + }; + + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + + @autoreleasepool { + + MPSShape* input_shape = getMPSShape(self); + MPSShape* output_shape = getMPSShape(output); + NSNumber* 
num_input_cols = nil; + NSNumber* num_output_cols = nil; + NSMutableArray* flat_input_shape = nil; + NSMutableArray* flat_output_shape = nil; + if(num_input_dims == 1) { + num_output_cols = output_shape[1]; + flat_output_shape = [NSMutableArray arrayWithCapacity:1]; + flat_output_shape[0] = [NSNumber numberWithInt:[output_shape[0] intValue] * [output_shape[1] intValue]]; + } + else if(num_input_dims == 2) { + num_input_cols = input_shape[1]; + flat_input_shape = [NSMutableArray arrayWithCapacity:1]; + flat_input_shape[0] = [NSNumber numberWithInt:[input_shape[0] intValue] * [input_shape[1] intValue]]; + } + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + string key = "diag_mps_out:" + getMPSTypeString(self.scalar_type()) + ":" + std::to_string(diagonal) + + ":" + string([ns_shape_key UTF8String]); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () { + CachedGraph *newCachedGraph = nil; + + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + + // TODO: Accept this as the flat version in 2D case + MPSGraphTensor* inputTensor = nil; + if(num_input_dims == 1) + inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type())); + else + inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), flat_input_shape); + + MPSGraphTensor* outputTensor = nil; + + MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0 + dataType:MPSDataTypeInt32]; + MPSGraphTensor* numDiagElementsRange = nil; + MPSGraphTensor* diagOffset = nil; + MPSGraphTensor* rowMultiplier = nil; + MPSGraphTensor* rowIndices = nil; + MPSGraphTensor* colIndices = nil; + MPSGraphTensor* indicesTensor = nil; + + if(num_input_dims == 1) { + int shape_data[1] = {[input_shape[0] intValue]}; + MPSGraphTensor* inputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)] + shape:@[@1] + dataType:MPSDataTypeInt32]; + numDiagElementsRange = [mpsGraph getCoordinateValueWithShapeTensor:inputShapeTensor + axisTensor:zeroTensor + name:nil]; + diagOffset = [mpsGraph constantWithScalar:diagonal + dataType:MPSDataTypeInt32]; + rowMultiplier = [mpsGraph constantWithScalar:[num_output_cols intValue] + dataType:MPSDataTypeInt32]; + } + else { + int shape_data[1] = {[output_shape[0] intValue]}; + MPSGraphTensor* outputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)] + shape:@[@1] + dataType:MPSDataTypeInt32]; + numDiagElementsRange = [mpsGraph getCoordinateValueWithShapeTensor:outputShapeTensor + axisTensor:zeroTensor + name:nil]; + diagOffset = [mpsGraph constantWithScalar:diagonal + dataType:MPSDataTypeInt32]; + rowMultiplier = [mpsGraph constantWithScalar:[num_input_cols intValue] + dataType:MPSDataTypeInt32]; + } + + if(diagonal >= 0) { + rowIndices = numDiagElementsRange; + colIndices = [mpsGraph additionWithPrimaryTensor:numDiagElementsRange + secondaryTensor:diagOffset + name:nil]; + } + else { + rowIndices = [mpsGraph subtractionWithPrimaryTensor:numDiagElementsRange + secondaryTensor:diagOffset + name:nil];; + colIndices = numDiagElementsRange; + } + + indicesTensor = [mpsGraph multiplicationWithPrimaryTensor:rowIndices + secondaryTensor:rowMultiplier + name:nil]; + indicesTensor = [mpsGraph additionWithPrimaryTensor:indicesTensor + secondaryTensor:colIndices + name:nil]; + + 
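+ // For a 1-D input, the diagonal values are scattered into the flattened
+ // (n x n) output at linear indices rowIndices * num_output_cols + colIndices
+ // and the result is reshaped back to (n, n); for a 2-D input, the requested
+ // diagonal is gathered from the flattened input at linear indices
+ // rowIndices * num_input_cols + colIndices.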
if(num_input_dims == 1) { + // TODO: Scatter mode doesn't matter, so what should I set it to be? + outputTensor = [mpsGraph scatterWithUpdatesTensor:inputTensor + indicesTensor:indicesTensor + shape:flat_output_shape + axis:0 + mode:MPSGraphScatterModeAdd + name:nil]; + outputTensor = [mpsGraph reshapeTensor:outputTensor + withShape:output_shape + name:nil]; + } + else if(num_input_dims == 2) { + outputTensor = [mpsGraph gatherWithUpdatesTensor:inputTensor + indicesTensor:indicesTensor + axis:0 + batchDimensions:0 + name:nil]; + } + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(); + if(num_input_dims == 1) + selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + else + selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, flat_input_shape); + + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } + + return output; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm new file mode 100644 index 000000000000..528b1643ff6c --- /dev/null +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -0,0 +1,174 @@ +// Copyright © 2022 Apple Inc. + +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace mps { + +typedef MPSGraphTensor* (^UnaryOpBlock)(MPSGraph*, MPSGraphTensor*); + +void unary_op(const Tensor& self_t, const Tensor& output, std::string op_name, UnaryOpBlock unaryBlock) +{ + Tensor self = self_t.contiguous(at::MemoryFormat::Contiguous); + if (!output.is_same_size(self)) { + output.resize_(self.sizes()); + } + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, *outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + @autoreleasepool { + string key = op_name + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + newCachedGraph->outputTensor = unaryBlock(mpsGraph, newCachedGraph->inputTensor); + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) +{ + 
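The tril_mps_out kernel at the top of this hunk lowers torch.tril(x, k) onto MPSGraph's bandPartWithTensor: for k >= 0 it keeps every sub-diagonal plus k super-diagonals directly, and for k < 0 it builds the complementary band (numLower = -k-1, numUpper = -1, i.e. unbounded above) and subtracts it from the input. Below is a minimal eager-mode sketch of that identity; the hand-rolled band_part helper and its "negative bound means no limit" convention are my reading of the MPSGraph op, not taken from the kernel.

```python
# Sketch of the band-part identity used by tril_mps_out above.
import torch

def band_part(x, num_lower, num_upper):
    # Keep entries with col - row in [-num_lower, num_upper];
    # a negative bound means "no limit on that side" (assumed bandPart semantics).
    rows = torch.arange(x.size(-2)).unsqueeze(-1)
    cols = torch.arange(x.size(-1)).unsqueeze(0)
    diff = cols - rows
    keep = torch.ones_like(diff, dtype=torch.bool)
    if num_lower >= 0:
        keep &= diff >= -num_lower
    if num_upper >= 0:
        keep &= diff <= num_upper
    return torch.where(keep, x, torch.zeros_like(x))

x = torch.randn(5, 5)
for k in range(-4, 5):
    if k >= 0:
        ref = band_part(x, -1, k)            # everything below, k diagonals above
    else:
        ref = x - band_part(x, -k - 1, -1)   # subtract the complementary band
    assert torch.equal(ref, torch.tril(x, diagonal=k)), k
```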
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 + dataType:inputTensor.dataType]; + MPSGraphTensor* predicateTensor = [mpsGraph lessThanWithPrimaryTensor:inputTensor + secondaryTensor:zeroTensor + name:nil]; + return [mpsGraph selectWithPredicateTensor:predicateTensor + truePredicateTensor:[mpsGraph ceilWithTensor :inputTensor name:nil] + falsePredicateTensor:[mpsGraph floorWithTensor:inputTensor name:nil] + name:nil]; +}; + +} // namespace mps + +TORCH_IMPL_FUNC(trunc_out_mps) (const Tensor& self, const Tensor& output) { + mps::unary_op(self, output, "trunc_out_mps", + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) + { return mps::trunc_tensor(mpsGraph, inputTensor); }); +} + +#define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ +TORCH_IMPL_FUNC(func_out) (const Tensor& self, const Tensor& output) { \ + mps::unary_op(self, output, #func_out, \ + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) \ + { return [mpsGraph func_stub##WithTensor:inputTensor name:nil]; }); \ +} + +#define CREATE_MPS_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ +Tensor& func_out(const Tensor& self, Tensor& output) { \ + mps::unary_op(self, output, #func_out, \ + ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) \ + { return [mpsGraph func_stub##WithTensor:inputTensor name:nil]; }); \ + return output; \ +} + + +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp_out_mps, exponent) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp2_out_mps, exponentBase2) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(reciprocal_out_mps, reciprocal) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sqrt_out_mps, squareRoot) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(rsqrt_out_mps, reverseSquareRoot) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sign_out_mps, sign) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(neg_out_mps, negative) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(log_out_mps, logarithm) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(log10_out_mps, logarithmBase10) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(log2_out_mps, logarithmBase2) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(ceil_out_mps, ceil) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(floor_out_mps, floor) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(round_out_mps, round) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(erf_out_mps, erf) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sin_out_mps, sin) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(cos_out_mps, cos) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(tan_out_mps, tan) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(asin_out_mps, asin) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(acos_out_mps, acos) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(atan_out_mps, atan) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(sinh_out_mps, sinh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(cosh_out_mps, cosh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(tanh_out_mps, tanh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(asinh_out_mps, asinh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(acosh_out_mps, acosh) +CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(atanh_out_mps, atanh) + +CREATE_MPS_UNARY_TORCH_IMPL_FUNC(abs_out_mps, absolute) + +TORCH_IMPL_FUNC(log1p_out_mps) (const Tensor& self, const Tensor& output) +{ + using namespace mps; + if (!output.is_same_size(self)) { + output.resize_(self.sizes()); + } + struct CachedGraph : public MPSCachedGraph + { + CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor = nil, 
*outputTensor = nil; + }; + MPSGraphCache* cache_ = MPSGraphCache::getInstance(); + @autoreleasepool { + string key = string("log1p_out_mps") + getTensorsStringKey({self}); + CachedGraph* cachedGraph = static_cast(cache_->LookUp(key)); + + if(!cachedGraph) { + MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () { + CachedGraph *newCachedGraph = nil; + @autoreleasepool { + MPSGraph* mpsGraph = make_mps_graph(); + newCachedGraph = new CachedGraph(mpsGraph); + newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0 + shape:getMPSShape(self) + dataType:mps::getMPSDataType(self.scalar_type())]; + MPSGraphTensor* addedTensor = [mpsGraph additionWithPrimaryTensor:newCachedGraph->inputTensor + secondaryTensor:oneTensor + name:nil]; + newCachedGraph->outputTensor = [mpsGraph logarithmWithTensor:addedTensor + name:nil]; + } + return newCachedGraph; + }); + cachedGraph = static_cast(tmpCachedGraph); + } + + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + NSDictionary* feeds = @{ + selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() + }; + NSDictionary* results = @{ + outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() + }; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + } +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8c333efd3bf7..d6b5adf593a6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -145,6 +145,7 @@ - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) variants: method + tags: inplace_view - func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a) variants: method @@ -274,6 +275,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: abs_out + MPS: abs_out_mps SparseCPU, SparseCUDA: abs_sparse_out SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out @@ -328,12 +330,12 @@ - func: view_as_real(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA: view_as_real + CPU, CUDA, MPS, Meta: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA: view_as_complex + CPU, CUDA, Meta: view_as_complex - func: sgn(Tensor self) -> Tensor variants: function, method @@ -357,6 +359,9 @@ SparseCPU, SparseCUDA: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out +- func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor + variants: method + - func: real(Tensor(a) self) -> Tensor(a) device_check: NoCheck # TensorIterator variants: function @@ -422,6 +427,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: acos_out + MPS: acos_out_mps # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -448,6 +454,7 @@ SparseCsrCPU, SparseCsrCUDA: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
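Most kernels in the new UnaryOps.mm above map one-to-one onto MPSGraph ops through the CREATE_MPS_*_TORCH_IMPL_FUNC macros; the two hand-written cases are trunc, composed as a select between ceil (negative inputs) and floor (everything else), and log1p, composed as log(1 + x) with a constant one-tensor. A quick eager-mode check of both compositions (plain Python, not the Metal path):

```python
# Check the two compositions used in UnaryOps.mm:
# trunc(x) == where(x < 0, ceil(x), floor(x)) and log1p(x) ~= log(1 + x).
import torch

x = torch.randn(10) * 5
trunc_ref = torch.where(x < 0, torch.ceil(x), torch.floor(x))
assert torch.equal(trunc_ref, torch.trunc(x))

y = torch.rand(10) + 0.1          # keep inputs away from zero for this check
log1p_ref = torch.log(1 + y)
assert torch.allclose(log1p_ref, torch.log1p(y))
```

Worth keeping in mind: composing log(1 + x) gives up the extra accuracy torch.log1p normally provides for very small inputs.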
device_check: NoCheck # TensorIterator @@ -457,18 +464,22 @@ SparseCPU, SparseCUDA: add_sparse_ SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_ MkldnnCPU: mkldnn_add_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True structured_inherits: TensorIteratorBase + ufunc_inner_loop: + Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) + ScalarOnly: add (Bool) dispatch: - CPU, CUDA: add_out SparseCPU: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda SparseCsrCPU: add_out_sparse_csr_cpu SparseCsrCUDA: add_out_sparse_csr_cuda MkldnnCPU: mkldnn_add_out + MPS: add_out_mps - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor variants: function @@ -521,6 +532,7 @@ dispatch: CPU: addmv_out_cpu CUDA: addmv_out_cuda + MPS: addmv_out_mps SparseCsrCPU: addmv_out_sparse_csr SparseCsrCUDA: addmv_out_sparse_csr_cuda @@ -560,6 +572,7 @@ - dim -> int dim dispatch: CPU, CUDA: all_out + MPS: all_out_mps - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator @@ -583,6 +596,7 @@ - dim -> int dim dispatch: CPU, CUDA: any_out + MPS: any_out_mps - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor device_check: NoCheck # TensorIterator @@ -603,6 +617,7 @@ dispatch: CPU, Meta: arange_out CUDA: arange_cuda_out + MPS: arange_mps_out # This function is a temporary hack to allow tracing of arange like constructs with dynamic # bounds on arange. Normal arange is not traceable because it does not take any tensor inputs; @@ -620,6 +635,7 @@ structured: True dispatch: CPU, CUDA: argmax_out + MPS: argmax_out_mps - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor structured_delegate: argmin.out @@ -644,6 +660,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: acosh_out + MPS: acosh_out_mps # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -673,6 +690,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asinh_out + MPS: asinh_out_mps SparseCPU, SparseCUDA: asinh_sparse_out SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out @@ -705,6 +723,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atanh_out + MPS: atanh_out_mps SparseCPU, SparseCUDA: atanh_sparse_out SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out @@ -721,6 +740,7 @@ variants: function, method dispatch: ZeroTensor, CPU, CUDA, Meta: as_strided_tensorimpl + MPS: as_strided_tensorimpl_mps QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl device_check: NoCheck device_guard: False @@ -756,6 +776,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: asin_out + MPS: asin_out_mps SparseCPU, SparseCUDA: asin_sparse_out SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out @@ -790,6 +811,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan_out + MPS: atan_out_mps SparseCPU, SparseCUDA: atan_sparse_out SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out @@ -833,6 +855,7 @@ dispatch: CPU: baddbmm_out_cpu CUDA: baddbmm_out_cuda + MPS: baddbmm_out_mps SparseCsrCUDA: baddbmm_out_sparse_csr_cuda - func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -861,18 +884,21 @@ variants: function dispatch: CPU, CUDA: bernoulli_out + MPS: bernoulli_out_mps - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? 
generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CPU, CUDA: bernoulli_ + MPS: bernoulli_mps_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -890,6 +916,7 @@ dispatch: CPU: binary_cross_entropy_cpu CUDA: binary_cross_entropy_cuda + MPS: binary_cross_entropy_mps - func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -898,6 +925,7 @@ dispatch: CPU: binary_cross_entropy_out_cpu CUDA: binary_cross_entropy_out_cuda + MPS: binary_cross_entropy_out_mps - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor python_module: nn @@ -905,6 +933,7 @@ dispatch: CPU: binary_cross_entropy_backward_cpu CUDA: binary_cross_entropy_backward_cuda + MPS: binary_cross_entropy_backward_mps - func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -912,6 +941,7 @@ dispatch: CPU: binary_cross_entropy_backward_out_cpu CUDA: binary_cross_entropy_backward_out_cuda + MPS: binary_cross_entropy_backward_out_mps - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor device_check: NoCheck # TensorIterator @@ -1061,6 +1091,7 @@ dispatch: CPU: bmm_out_cpu CUDA: bmm_out_cuda + MPS: bmm_out_mps SparseCPU: bmm_out_sparse_cpu SparseCUDA: bmm_out_sparse_cuda SparseCsrCUDA: bmm_out_sparse_csr_cuda @@ -1078,12 +1109,20 @@ SparseCPU, SparseCUDA: sparse_broadcast_to - func: cat(Tensor[] tensors, int dim=0) -> Tensor + structured_delegate: cat.out dispatch: - CompositeExplicitAutograd: cat + SparseCPU, SparseCUDA: cat_sparse + QuantizedCPU: cat_quantized_cpu - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) + structured: True + precomputed: + - dim -> int dim, int valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, MemoryFormat memory_format dispatch: - CompositeExplicitAutograd: cat_out + CPU: cat_out_cpu + CUDA: cat_out_cuda + MPS: cat_out_mps + QuantizedCPU: cat_out_quantized_cpu - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor @@ -1125,6 +1164,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: ceil_out + MPS: ceil_out_mps SparseCPU, SparseCUDA: ceil_sparse_out SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out @@ -1164,8 +1204,7 @@ - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor variants: function, method - dispatch: - CPU, CUDA: clamp + structured_delegate: clamp.Tensor_out - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -1177,8 +1216,7 @@ - func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_ + structured_delegate: clamp.Tensor_out - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) 
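The clamp, clamp_min and clamp_max entries in this hunk move from per-variant dispatch to structured kernels, so the functional, in-place and out= spellings of each op are all served by one .out kernel, which is also where the new MPS registrations (clamp_out_mps, clamp_Tensor_out_mps, and friends) plug in. A small usage sketch of the three spellings that now share a kernel (run on CPU here; on an MPS-enabled build the same calls route to the *_mps kernels):

```python
# The functional, in-place and out= spellings below all funnel into the
# single structured clamp.Tensor_out kernel declared in this hunk.
import torch

x = torch.linspace(-2, 2, steps=9)
lo, hi = torch.tensor(-1.0), torch.tensor(1.0)

a = torch.clamp(x, min=lo, max=hi)            # functional -> clamp.Tensor
b = x.clone()
b.clamp_(min=lo, max=hi)                      # in-place   -> clamp_.Tensor
c = torch.empty_like(x)
torch.clamp(x, min=lo, max=hi, out=c)         # explicit   -> clamp.Tensor_out

assert torch.equal(a, b) and torch.equal(a, c)
```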
device_check: NoCheck # TensorIterator @@ -1187,73 +1225,83 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_out + MPS: clamp_out_mps - func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_out + CPU, CUDA: clamp_Tensor_out + MPS: clamp_Tensor_out_mps - func: clamp_max(Tensor self, Scalar max) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max + structured_delegate: clamp_max.out - func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max + structured_delegate: clamp_max.Tensor_out - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max_ + structured_delegate: clamp_max.out - func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_max_ + structured_delegate: clamp_max.Tensor_out - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_max_out + MPS: clamp_max_out_mps - func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_max_out + CPU, CUDA: clamp_max_Tensor_out + MPS: clamp_max_Tensor_out_mps - func: clamp_min(Tensor self, Scalar min) -> Tensor device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min + structured_delegate: clamp_min.out - func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min + structured_delegate: clamp_min.Tensor_out - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min_ + structured_delegate: clamp_min.out - func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!) variants: function, method - dispatch: - CompositeExplicitAutograd: clamp_min_ + structured_delegate: clamp_min.Tensor_out - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: clamp_min_out + MPS: clamp_min_out_mps - func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!) + device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_min_out + CPU, CUDA: clamp_min_Tensor_out + MPS: clamp_min_Tensor_out_mps # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor @@ -1360,23 +1408,28 @@ - func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor +- func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor + variants: function + - func: copy_(Tensor(a!) 
self, Tensor src, bool non_blocking=False) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: MkldnnCPU: copy_mkldnn_ - SparseCPU, SparseCUDA, SparseHIP: copy_sparse_wrapper_ + SparseCPU, SparseCUDA: copy_sparse_wrapper_ CompositeExplicitAutograd: copy_ - SparseCsrCPU, SparseCsrCUDA: copy_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_ - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - dispatch: {} + dispatch: + MPS: _copy_from_mps # We need this to be able to properly copy from a CPU to an XLA tensor with different sizes. # See https://github.com/pytorch/xla/issues/2881 - func: _copy_from_and_resize(Tensor self, Tensor dst) -> Tensor - dispatch: {} + dispatch: + MPS: _copy_from_and_resize_mps - func: cos(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -1394,6 +1447,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: cos_out + MPS: cos_out_mps - func: cosh(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -1411,6 +1465,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: cosh_out + MPS: cosh_out_mps - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor @@ -1457,6 +1512,14 @@ dispatch: CUDA: cudnn_convolution_transpose +- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution_transpose + +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor) + dispatch: + MPS: mps_convolution_transpose_backward + - func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor dispatch: CUDA: cudnn_convolution_relu @@ -1679,6 +1742,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: div_out + MPS: div_out_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim - func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor @@ -1701,6 +1765,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: div_out_mode + MPS: div_out_mode_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor @@ -1780,6 +1845,7 @@ dispatch: CPU: dot CUDA: dot_cuda + MPS: dot_mps - func: dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -1800,6 +1866,7 @@ - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor dispatch: CompositeExplicitAutograd: embedding + NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor @@ -1807,6 +1874,7 @@ dispatch: CPU: embedding_dense_backward_cpu CUDA: embedding_dense_backward_cuda + MPS: embedding_dense_backward_mps - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) 
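_copy_from and _copy_from_and_resize previously had empty dispatch tables and existed only as hooks for out-of-tree backends; registering MPS kernels for them here is what lets ordinary cross-device copies work against the new "mps" device. A guarded sketch of the round trip they end up servicing (only runs on a macOS build where the backend is available):

```python
# Round-tripping a tensor between CPU and the MPS device exercises the
# _copy_from / _copy_from_and_resize kernels registered in this hunk.
import torch

if torch.backends.mps.is_available():        # only on an MPS-enabled build
    x = torch.randn(4, 4)
    y = x.to("mps")                          # CPU -> MPS copy
    z = y.cpu()                              # MPS -> CPU copy
    assert torch.equal(x, z)
else:
    print("MPS backend not available; skipping the device round trip")
```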
dispatch: @@ -1872,10 +1940,12 @@ dispatch: CPU: empty_cpu CUDA: empty_cuda + MPS: empty_mps Meta: empty_meta MkldnnCPU: empty_mkldnn SparseCPU, SparseCUDA: empty_sparse - SparseCsrCPU, SparseCsrCUDA: empty_sparse_csr + SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed + QuantizedCPU, QuantizedCUDA: empty_unknown_quantized # We do not make new_empty a composite that calls into new_empty_strided, as the strided version # is significantly more difficult to implement by different backends @@ -1920,9 +1990,19 @@ dispatch: CPU, Meta: resize_ CUDA: resize_cuda_ + MPS: resize_mps_ QuantizedCPU: quantized_resize_cpu_ SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_ +# This is a utility function to enable users to resize out tensor while registering kernels for out variants. +# Eventually, we can consider exposing `resize_output` as a public API to ship it with python op registration +# to make it easy to register out variants for ops. +- func: _resize_output_(Tensor(a!) self, int[] size, Device device) -> Tensor(a!) + use_const_ref_for_mutable_tensors: True + variants: function + dispatch: + Meta: _resize_output_ + - func: empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor category_override: factory variants: function @@ -1946,7 +2026,9 @@ dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda + MPS: empty_strided_mps Meta: empty_strided_meta + QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized - func: erf(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -1970,6 +2052,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: erf_out + MPS: erf_out_mps SparseCPU, SparseCUDA: erf_sparse_out SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out @@ -2006,6 +2089,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp_out + MPS: exp_out_mps - func: exp2(Tensor self) -> Tensor structured_delegate: exp2.out @@ -2020,6 +2104,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: exp2_out + MPS: exp2_out_mps - func: expm1(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2046,6 +2131,13 @@ SparseCPU, SparseCUDA: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out +- func: expand.SymInt(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) + variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: expand_symint + - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_check: NoCheck @@ -2090,19 +2182,32 @@ - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) variants: method +- func: fill.Scalar(Tensor self, Scalar value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + +- func: fill.Tensor(Tensor self, Tensor value) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: fill + - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) 
device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: fill_ + MPS: fill_scalar_mps QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ + SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_ - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: fill_ + MPS: fill_tensor_mps_ QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ @@ -2130,6 +2235,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: floor_out + MPS: floor_out_mps SparseCPU, SparseCUDA: floor_sparse_out SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out @@ -2221,10 +2327,12 @@ variants: function, method # NOTE [ grid_sampler Native Functions ] -# `grid_sampler` does all the shape checking and then dispatches to one of -# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which -# has the corresponding backward defined as native functions as well. Therefore, -# in these functions and their backwards, no more shape checking is done. +# `grid_sampler` is _supposed to_ do all the shape checking and then dispatch to +# one of `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of +# which has the corresponding backward defined as native functions as well. +# However, we do shape checking everywhere for now since each of the mentioned +# functions can be called directly, which will lead to crashes otherwise. +# See https://github.com/pytorch/pytorch/issues/73187 for more information. # # There is also _grid_sampler_2d_backward_cpu_fallback which is an # implementation detail of grid_sampler_2d and is only exposed here for testing @@ -2262,7 +2370,10 @@ CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda -- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor) +# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for +# the case where `input` doesn't require gradient. Gradient for `grid` is always +# computed (only `output_mask[0]` is checked by the implementations). +- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor) dispatch: CPU: grid_sampler_3d_backward_cpu CUDA: grid_sampler_3d_backward_cuda @@ -2451,7 +2562,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: isnan + CPU, CUDA, MPS: isnan SparseCPU, SparseCUDA: isnan_sparse SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr @@ -2549,11 +2660,6 @@ CUDA: layer_norm_cuda CompositeImplicitAutograd: math_native_layer_norm -- func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, int num_head, Tensor? mask=None) -> Tensor - dispatch: - CPU: multi_head_self_attention_cpu - CUDA: multi_head_self_attention_cuda - - func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CPU: layer_norm_backward_cpu @@ -2582,6 +2688,14 @@ - func: linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!) 
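grid_sampler_3d_backward now takes an output_mask so the kernel can skip the input gradient when only grid requires grad; per the comment in the hunk, the grid gradient is always computed and only output_mask[0] is consulted. An illustrative autograd setup for the case the mask is meant to optimize (the actual mask value is decided by the derivative formula, not by user code):

```python
# When only `grid` requires grad, the backward can skip the input gradient.
import torch
import torch.nn.functional as F

inp = torch.randn(1, 2, 4, 4, 4)                         # no grad needed
grid = (torch.rand(1, 3, 3, 3, 3) * 2 - 1).requires_grad_()

out = F.grid_sample(inp, grid, align_corners=False)      # 5-D -> grid_sampler_3d
out.sum().backward()

assert inp.grad is None and grid.grad is not None
```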
python_module: nn +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor + python_module: nn + dispatch: + MPS: _mps_linear + - func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor python_module: nn dispatch: @@ -2599,6 +2713,18 @@ dispatch: MkldnnCPU: mkldnn_linear_backward +- func: _mps_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor + dispatch: + MPS: _mps_linear_backward_input + +- func: _mps_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) + dispatch: + MPS: _mps_linear_backward_weights + +- func: mps_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_linear_backward + - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor @@ -2646,6 +2772,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log_out + MPS: log_out_mps - func: log10(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2665,6 +2792,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log10_out + MPS: log10_out_mps - func: log1p(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -2688,6 +2816,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log1p_out + MPS: log1p_out_mps SparseCPU, SparseCUDA: log1p_sparse_out SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out @@ -2707,12 +2836,14 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: log2_out + MPS: log2_out_mps - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logaddexp_out + MPS: logaddexp_out_mps - func: logaddexp(Tensor self, Tensor other) -> Tensor variants: method, function @@ -2725,6 +2856,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: logaddexp2_out + MPS: logaddexp2_out_mps - func: logaddexp2(Tensor self, Tensor other) -> Tensor variants: method, function @@ -2798,6 +2930,11 @@ - func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor variants: function, method +- func: log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: log_softmax_out + - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor variants: function, method @@ -2809,6 +2946,7 @@ dispatch: CPU: log_softmax_cpu_out CUDA: log_softmax_cuda_out + MPS: log_softmax_mps_out - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _log_softmax_backward_data.out @@ -2818,6 +2956,7 @@ dispatch: CPU: log_softmax_backward_cpu_out CUDA: log_softmax_backward_cuda_out + MPS: log_softmax_backward_mps_out - func: _logcumsumexp(Tensor self, int dim) -> Tensor dispatch: @@ -2929,6 +3068,7 @@ - dim -> int dim dispatch: CPU, CUDA: max_out + MPS: max_out_mps - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator @@ -2944,10 +3084,10 @@ - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: amax + structured_delegate: amax.out - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: CPU, CUDA: amax_out @@ -2958,6 +3098,17 @@ - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: _mps_max_pool2d + +- func: mps_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + dispatch: + MPS: mps_max_pool2d_backward + - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor dispatch: MkldnnCPU: mkldnn_max_pool2d @@ -2981,6 +3132,7 @@ - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor dispatch: QuantizedCPU: quantized_max_pool2d + QuantizedCUDA: quantized_max_pool2d_cudnn - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor @@ -3004,6 +3156,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: mean_out + MPS: mean_out_mps QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -3076,6 +3229,7 @@ - dim -> int dim dispatch: CPU, CUDA: min_out + MPS: min_out_mps - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator @@ -3086,13 +3240,24 @@ - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: amin + structured_delegate: amin.out - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: CPU, CUDA: amin_out +# TODO: Add this function to MPS dispatch key so that we avoid declaring it in +# native_functions.yaml +# https://github.com/pytorch/pytorch/issues/77394 +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? 
bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + dispatch: + MPS: _mps_convolution + +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + dispatch: + MPS: mps_convolution_backward + - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor dispatch: CompositeExplicitAutograd: mkldnn_convolution @@ -3137,10 +3302,12 @@ dispatch: CPU: mm_out_cpu CUDA: mm_out_cuda + MPS: mm_out_mps SparseCPU, SparseCUDA: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor + python_module: sparse - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor dispatch: @@ -3172,8 +3339,10 @@ variants: function, method dispatch: SparseCPU, SparseCUDA: mul_sparse + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr MkldnnCPU: mkldnn_mul ZeroTensor: mul_zerotensor + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -3181,7 +3350,9 @@ variants: method dispatch: SparseCPU, SparseCUDA: mul_sparse_ + SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_ MkldnnCPU: mkldnn_mul_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -3189,8 +3360,10 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: mul_out + MPS: mul_out_mps SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda + SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr MkldnnCPU: mkldnn_mul_out # For C++ only, until we have conversion from C++ numbers to Tensor @@ -3199,12 +3372,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: mul + SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: CompositeExplicitAutograd: mul_ + SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr # multiply, alias for mul - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor @@ -3253,6 +3428,12 @@ CPU: narrow_copy_dense_cpu SparseCPU, SparseCUDA: narrow_copy_sparse CompositeExplicitAutograd: narrow_copy_dense + tags: view_copy + +- func: narrow_copy.SymInt(Tensor self, int dim, int start, SymInt length) -> Tensor + variants: function, method + dispatch: + CompositeExplicitAutograd: narrow_copy_symint - func: narrow_copy.out(Tensor self, int dim, int start, int length, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -3272,11 +3453,13 @@ dispatch: CPU: batch_norm_cpu CUDA: batch_norm_cuda + MPS: batch_norm_mps MkldnnCPU: mkldnn_batch_norm - func: native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) 
save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!)) dispatch: CUDA: batch_norm_cuda_out + MPS: batch_norm_mps_out - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) dispatch: @@ -3303,6 +3486,7 @@ dispatch: CPU: batch_norm_backward_cpu CUDA: batch_norm_backward_cuda + MPS: batch_norm_backward_mps MkldnnCPU: mkldnn_batch_norm_backward - func: batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor) @@ -3370,6 +3554,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: permute + MPS: permute_mps - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) variants: function, method @@ -3410,8 +3595,14 @@ variants: function, method - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + dispatch: + CPU: pixel_shuffle_cpu + CompositeExplicitAutograd: math_pixel_shuffle - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + dispatch: + CPU: pixel_unshuffle_cpu + CompositeExplicitAutograd: math_pixel_unshuffle - func: channel_shuffle(Tensor self, int groups) -> Tensor dispatch: @@ -3427,6 +3618,7 @@ variants: method dispatch: CUDA: is_pinned_cuda + MPS: is_pinned_mps CompositeExplicitAutograd: is_pinned_default # TODO: add a copy kwarg that guarantees that the tensor is put into fresh @@ -3438,6 +3630,7 @@ - func: _pin_memory(Tensor self, Device? device=None) -> Tensor dispatch: CUDA: _pin_memory_cuda + MPS: _pin_memory_mps - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor variants: function, method @@ -3573,6 +3766,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: reciprocal_out + MPS: reciprocal_out_mps - func: neg(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -3596,6 +3790,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: neg_out + MPS: neg_out_mps SparseCPU, SparseCUDA: neg_out_sparse SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out @@ -3612,6 +3807,7 @@ variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. dispatch: CompositeExplicitAutograd: repeat + MPS: repeat_mps - func: repeat_interleave.Tensor(Tensor repeats, *, int? output_size=None) -> Tensor variants: function @@ -3638,7 +3834,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor: _reshape_alias + CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias # We don't need to support mkldnn since this is handled explicitly by the reshape operator. - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor @@ -3675,6 +3871,7 @@ dispatch: CPU: round_out CUDA: round_out + MPS: round_out_mps SparseCPU, SparseCUDA: round_sparse_out SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out @@ -3707,16 +3904,20 @@ variants: function, method dispatch: CPU, CUDA: relu + MPS: relu_mps MkldnnCPU: mkldnn_relu QuantizedCPU: relu_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu - func: relu_(Tensor(a!) self) -> Tensor(a!) 
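pixel_shuffle and pixel_unshuffle gain dedicated CPU kernels above, with math_pixel_shuffle / math_pixel_unshuffle kept as the composite fallback for other backends. One standard way to spell that fallback decomposition (my own phrasing, checked against the built-in op, not copied from math_pixel_shuffle):

```python
# Reference decomposition of pixel_shuffle: reshape -> permute -> reshape.
import torch

def pixel_shuffle_ref(x, r):
    n, c, h, w = x.shape
    assert c % (r * r) == 0
    x = x.reshape(n, c // (r * r), r, r, h, w)
    x = x.permute(0, 1, 4, 2, 5, 3)            # (n, c', h, r, w, r)
    return x.reshape(n, c // (r * r), h * r, w * r)

x = torch.randn(2, 8, 3, 5)
assert torch.equal(pixel_shuffle_ref(x, 2), torch.pixel_shuffle(x, 2))
```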
device_check: NoCheck # TensorIterator variants: function, method dispatch: CPU, CUDA: relu_ + MPS: relu_mps_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: relu_quantized_cpu_ + NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu_ - func: relu6(Tensor self) -> Tensor python_module: nn @@ -3727,16 +3928,18 @@ - func: prelu(Tensor self, Tensor weight) -> Tensor variants: function, method dispatch: + MkldnnCPU: mkldnn_prelu CPU: prelu_cpu CUDA: prelu_cuda - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) variants: function, method dispatch: + MkldnnCPU: mkldnn_prelu_backward CPU: prelu_backward_cpu CUDA: prelu_backward_cuda -- func: gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator @@ -3744,24 +3947,34 @@ dispatch: CPU: gelu_out_cpu CUDA: gelu_out_cuda + MPS: gelu_out_mps + +- func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!) + structured_delegate: gelu.out + device_check: NoCheck # TensorIterator + python_module: nn + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_ -- func: gelu(Tensor self) -> Tensor +- func: gelu(Tensor self, *, str approximate='none') -> Tensor structured_delegate: gelu.out device_check: NoCheck # TensorIterator python_module: nn dispatch: MkldnnCPU: mkldnn_gelu QuantizedCPU: gelu_quantized_cpu + NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu -- func: gelu_backward.grad_input(Tensor grad, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU: gelu_backward_out_cpu CUDA: gelu_backward_out_cuda + MPS: gelu_backward_out_mps -- func: gelu_backward(Tensor grad, Tensor self) -> Tensor +- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor structured_delegate: gelu_backward.grad_input python_module: nn dispatch: @@ -3811,6 +4024,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: rsqrt_out + MPS: rsqrt_out_mps - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) variants: function, method @@ -3823,6 +4037,7 @@ device_guard: False dispatch: CompositeExplicitAutograd: select + SparseCsrCPU, SparseCsrCUDA: select_sparse_csr - func: select_backward(Tensor grad_output, int[] input_sizes, int dim, int index) -> Tensor variants: function @@ -3865,6 +4080,7 @@ python_module: nn dispatch: CPU, CUDA: silu_out + MPS: silu_out_mps - func: silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True @@ -3872,6 +4088,7 @@ python_module: nn dispatch: CPU, CUDA: silu_backward_out + MPS: silu_backward_out_mps - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor structured_delegate: silu_backward.grad_input @@ -3925,6 +4142,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_out + MPS: sigmoid_out_mps - func: logit(Tensor self, float? 
eps=None) -> Tensor variants: function, method @@ -3962,6 +4180,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sin_out + MPS: sin_out_mps SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out SparseCPU, SparseCUDA: sin_sparse_out @@ -4001,6 +4220,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sinh_out + MPS: sinh_out_mps SparseCPU, SparseCUDA: sinh_sparse_out SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_out @@ -4087,6 +4307,11 @@ - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor variants: function, method +- func: softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: softmax_out + - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -4100,6 +4325,7 @@ dispatch: CPU: softmax_cpu_out CUDA: softmax_cuda_out + MPS: softmax_mps_out - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor structured_delegate: _softmax_backward_data.out @@ -4109,6 +4335,7 @@ dispatch: CPU: softmax_backward_cpu_out CUDA: softmax_backward_cuda_out + MPS: softmax_backward_mps_out - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] variants: function, method @@ -4124,6 +4351,10 @@ dispatch: CompositeExplicitAutograd: split +- func: split.sizes(Tensor(a -> *) self, int[] split_size, int dim=0) -> Tensor(a)[] + variants: function, method + device_guard: False + - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] variants: function, method device_check: NoCheck @@ -4161,7 +4392,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: squeeze + CompositeExplicitAutograd: squeeze QuantizedCPU, QuantizedCUDA: squeeze_quantized - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) @@ -4169,7 +4400,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: squeeze + CompositeExplicitAutograd: squeeze QuantizedCPU, QuantizedCUDA: squeeze_quantized - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) @@ -4239,12 +4470,13 @@ - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) -# The signature is designed to be consistent with librosa except that it is -# missing the `pad_mode` and `center` arguments, which are taken care of at -# `torch.functional.py`. They shall be moved here once we have mapping between -# Python strings and C++ Enum in codegen. +# Overload without center & pad mode, needed for forward-compatibility - func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor variants: function, method + cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized'] + +- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + variants: function, method - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? 
length=None, bool return_complex=False) -> Tensor variants: function, method @@ -4265,6 +4497,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: sum + SparseCsrCPU, SparseCsrCUDA: sum_csr - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: sum.IntList_out @@ -4280,21 +4513,17 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: sum_out + MPS: sum_out_mps - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator -- func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor +- func: nansum(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method dispatch: CPU, CUDA: nansum -- func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - variants: function, method - dispatch: - CPU, CUDA: nansum - -- func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +- func: nansum.out(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: nansum_out @@ -4325,6 +4554,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sqrt_out + MPS: sqrt_out_mps SparseCPU, SparseCUDA: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out @@ -4337,8 +4567,6 @@ variants: function, method - func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: square_out - func: std(Tensor self, bool unbiased=True) -> Tensor device_check: NoCheck # TensorIterator @@ -4353,6 +4581,7 @@ variants: function, method dispatch: CPU, CUDA: std + MPS: std_mps - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) device_check: NoCheck # TensorIterator @@ -4404,6 +4633,7 @@ variants: function, method dispatch: CPU, CUDA: prod + MPS: prod_mps - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor structured_delegate: prod.int_out @@ -4415,6 +4645,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: prod_out + MPS: prod_out_mps - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor device_check: NoCheck # TensorIterator @@ -4460,6 +4691,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tan_out + MPS: tan_out_mps SparseCPU, SparseCUDA: tan_sparse_out SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out @@ -4488,6 +4720,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_out + MPS: tanh_out_mps SparseCPU, SparseCUDA: tanh_sparse_out SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out @@ -4518,12 +4751,14 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: threshold_out + MPS: threshold_out_mps - func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: threshold_backward_out + MPS: threshold_backward_out_mps - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor variants: function @@ -4602,6 +4837,28 @@ - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor +# Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads). 
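The comment that follows describes _transform_bias_rescale_qkv as a fused transformer helper: add the in-projection bias to the packed QKV tensor and divide Q by sqrt(D / num_heads). Below is a rough eager-mode reading of that description; the (batch, seq, 3 * embed) layout and the head split are assumptions made for illustration, not the kernel's documented contract.

```python
# Rough eager-mode reading of the fused QKV transform described below:
# add the in-projection bias, then scale Q by 1/sqrt(head_dim).
# Shapes and the head split are illustrative assumptions.
import math
import torch

def transform_bias_rescale_qkv_ref(qkv, qkv_bias, num_heads):
    b, t, three_e = qkv.shape                  # assumed (batch, seq, 3 * embed)
    e = three_e // 3
    head_dim = e // num_heads
    q, k, v = (qkv + qkv_bias).split(e, dim=-1)
    q = q / math.sqrt(head_dim)                # divide Q by sqrt(D / num_heads)
    def heads(x):                              # (b, t, e) -> (b, heads, t, head_dim)
        return x.reshape(b, t, num_heads, head_dim).transpose(1, 2)
    return heads(q), heads(k), heads(v)

q, k, v = transform_bias_rescale_qkv_ref(torch.randn(2, 5, 3 * 8),
                                          torch.zeros(3 * 8), num_heads=2)
assert q.shape == (2, 2, 5, 4)
```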
+- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor) + dispatch: + CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu + CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda + +- func: _nested_tensor_from_mask(Tensor t, Tensor mask) -> Tensor + dispatch: + CPU, CUDA: NestedTensor_nested_tensor_from_mask + +- func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor + device_check: NoCheck # cpu_nested_shape_example will always be on CPU + dispatch: + CPU: nested_from_padded_generic + CUDA: nested_from_padded_cuda + +# _nested_from_padded is not usable from Python, so +# _nested_from_padded_and_nested_example is available for testing. +- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_from_padded_and_nested_example + - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor dispatch: CompositeExplicitAutograd: _trilinear @@ -4632,6 +4889,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: trunc_out + MPS: trunc_out_mps SparseCPU, SparseCUDA: trunc_sparse_out SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out @@ -4693,7 +4951,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: unsqueeze + CompositeExplicitAutograd: unsqueeze SparseCPU, SparseCUDA: unsqueeze_sparse QuantizedCPU, QuantizedCUDA: unsqueeze_quantized @@ -4720,6 +4978,7 @@ variants: function, method dispatch: CPU, CUDA: var + MPS: var_mps - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -4771,12 +5030,18 @@ device_check: NoCheck device_guard: False -# we define both of these because 'where' does the broadcast and '_s_where' doesn't; -# this allows us to implicitly calculate the broadcast derivative, while only dealing with the -# _s_where derivative. - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function, method + dispatch: + CPU, CUDA: where + MPS: where_mps + +- func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
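where.self now dispatches straight to CPU/CUDA/MPS kernels, and the _s_where helper that used to receive pre-broadcast inputs is deleted a little further below, so broadcasting is handled in one place. A small reminder of the broadcasting behaviour that kernel now has to cover on its own:

```python
# torch.where broadcasts condition, input and other against each other.
import torch

cond = torch.tensor([[True], [False]])        # (2, 1)
a = torch.arange(3.0)                         # (3,)
b = torch.full((2, 3), -1.0)                  # (2, 3)

out = torch.where(cond, a, b)                 # broadcast to (2, 3)
assert out.shape == (2, 3)
assert torch.equal(out[0], a) and torch.equal(out[1], b[1])
```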
+ device_check: NoCheck # TensorIterator + dispatch: + CPU, CUDA: where_self_out + MPS: where_self_out_mps - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor variants: function @@ -4791,11 +5056,6 @@ device_check: NoCheck # TensorIterator variants: function -- func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor - variants: function - dispatch: - CPU, CUDA: _s_where - - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor variants: function @@ -4804,15 +5064,17 @@ - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor variants: function -- func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) +- func: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) variants: function dispatch: + CPU: weight_norm_cpu CUDA: weight_norm_cuda -- func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) +- func: _weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function dispatch: - CUDA: weight_norm_cuda_backward + CPU: weight_norm_backward_cpu + CUDA: weight_norm_backward_cuda - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) variants: function @@ -4894,6 +5156,16 @@ SparseCPU: _sparse_sum_backward_cpu SparseCUDA: _sparse_sum_backward_cuda +- func: _sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_sum_cpu + SparseCsrCUDA: _sparse_csr_sum_cuda + +- func: _sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCPU: _sparse_csr_prod_cpu + SparseCsrCUDA: _sparse_csr_prod_cuda + - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor python_module: sparse variants: function @@ -4969,6 +5241,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: norm_out + MPS: norm_out_mps # These four redispatch in their implementation, so OK to be CompositeImplicitAutograd - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor @@ -4994,24 +5267,31 @@ dispatch: CPU, CUDA: frexp_out +# Deprecated (v.1.12) - func: frobenius_norm(Tensor self) -> Tensor variants: function +# Deprecated (v.1.12) - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function +# Deprecated (v.1.12) - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function +# Deprecated (v.1.12) - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor variants: function +# Deprecated (v.1.12) - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
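_weight_norm_cuda_interface and its backward are renamed to _weight_norm_interface above and pick up CPU kernels. The op behind them is the standard weight-norm reparametrization, w = g * v / ||v||, with the norm taken over every dimension except dim; a quick sketch of that identity against the public torch._weight_norm entry point (shapes chosen purely for illustration):

```python
# Weight-norm reparametrization: w = g * v / ||v||, norm over all dims but `dim`.
import torch

v = torch.randn(4, 3)
g = torch.rand(4, 1) + 0.5
dim = 0

norm = v.norm(p=2, dim=1, keepdim=True)       # norm over every dim except 0
w_ref = g * v / norm
assert torch.allclose(w_ref, torch._weight_norm(v, g, dim))
```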
variants: function @@ -5020,7 +5300,7 @@ dispatch: CompositeExplicitAutograd: clone SparseCPU, SparseCUDA: clone_sparse - SparseCsrCPU, SparseCsrCUDA: clone_sparse_csr + SparseCsrCPU, SparseCsrCUDA: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone @@ -5035,7 +5315,7 @@ - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!) use_const_ref_for_mutable_tensors: True - variants: function + variants: function, method dispatch: SparseCPU, SparseCUDA: resize_as_sparse_ SparseCsrCPU, SparseCsrCUDA: resize_as_sparse_csr_ @@ -5045,8 +5325,10 @@ variants: method, function dispatch: CPU, CUDA: zero_ + MPS: zero_mps_ Meta: zero_meta_ SparseCPU, SparseCUDA: zero_sparse_ + SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ - func: sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -5055,6 +5337,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sub_out + MPS: sub_out_mps SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor @@ -5063,6 +5346,7 @@ structured_delegate: sub.out dispatch: SparseCPU, SparseCUDA: sub_sparse + ZeroTensor: sub_zerotensor - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -5132,7 +5416,7 @@ # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. -- func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor +- func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor python_module: sparse dispatch: CompositeExplicitAutograd: _sparse_addmm @@ -5141,17 +5425,20 @@ python_module: sparse dispatch: SparseCsrCUDA: sparse_sampled_addmm_out_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_out_sparse_csr_cpu - func: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor python_module: sparse dispatch: SparseCsrCUDA: sparse_sampled_addmm_sparse_csr_cuda + SparseCsrCPU: sparse_sampled_addmm_sparse_csr_cpu - func: addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: CPU: addmm_out_cpu CUDA: addmm_out_cuda + MPS: addmm_out_mps SparseCPU: addmm_out_sparse_dense_cpu SparseCUDA: addmm_out_sparse_dense_cuda SparseCsrCPU: addmm_out_sparse_csr_cpu @@ -5174,6 +5461,16 @@ SparseCPU: s_addmm_sparse_dense_cpu_ SparseCUDA: s_addmm_sparse_dense_cuda_ +- func: _addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!) + structured: True + dispatch: + CPU: addmm_activation_out_cpu + CUDA: addmm_activation_out_cuda + +- func: _addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor + structured_delegate: _addmm_activation.out + variants: function, method + # NOTE [ Sparse: autograd and API ] # # @@ -5285,11 +5582,23 @@ # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. +- func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
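The sparse_sampled_addmm entries above gain a CPU kernel alongside the existing CUDA one. The public wrapper, torch.sparse.sampled_addmm, evaluates mat1 @ mat2 only at the sparsity pattern of a CSR input; a rough sketch (not taken from this diff):

    import torch
    pattern = torch.eye(3).to_sparse_csr()        # CSR tensor supplying the sparsity pattern
    mat1 = torch.randn(3, 4)
    mat2 = torch.randn(4, 3)
    out = torch.sparse.sampled_addmm(pattern, mat1, mat2)   # result is sparse CSR
    out.to_dense()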
pin_memory=False) -> Tensor - func: sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - func: sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +- func: _sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: _sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +- func: _sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - func: sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor @@ -5301,7 +5610,11 @@ - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () +- func: _validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> () - func: _validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> () +- func: _validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> () - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor dispatch: @@ -5328,14 +5641,20 @@ dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda + SparseCsrCPU, SparseCsrCUDA: sparse_mask_sparse_csr - func: _to_cpu(Tensor[] tensors) -> Tensor[] variants: function - func: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor variants: method + +# Special case of to_dense with custom derivative +- func: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor + variants: method dispatch: - SparseCPU, SparseCUDA, SparseCsrCPU, SparseCsrCUDA: sparse_to_dense + SparseCPU, SparseCUDA: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor @@ -5451,6 +5770,20 @@ device_check: NoCheck device_guard: False +- func: ccol_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr + device_check: NoCheck + device_guard: False + +- func: row_indices(Tensor(a) self) -> Tensor(a) + variants: method + dispatch: + SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr + device_check: NoCheck + device_guard: False + - func: hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
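Together with the CSC/BSR/BSC constructors and the ccol_indices/row_indices accessors registered above, the compressed-sparse layouts can be built and inspected from Python roughly as follows (a sketch using the public sparse_csc_tensor / to_dense API):

    import torch
    ccol = torch.tensor([0, 2, 3])        # column pointers
    row = torch.tensor([0, 1, 1])         # row index of each stored value
    vals = torch.tensor([1., 2., 3.])
    a = torch.sparse_csc_tensor(ccol, row, vals, (2, 2))
    a.ccol_indices(), a.row_indices(), a.values()
    a.to_dense()                          # tensor([[1., 0.], [2., 3.]])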
dispatch: SparseCPU: hspmm_out_sparse_cpu @@ -5471,6 +5804,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: unbind + NestedTensorCPU, NestedTensorCUDA: NestedTensor_unbind - func: unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[] variants: function, method @@ -5479,11 +5813,41 @@ variants: method dispatch: CPU, CUDA: dense_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse - func: to_sparse(Tensor self) -> Tensor variants: method dispatch: CPU, CUDA: dense_to_sparse + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse + +- func: to_sparse_csr(Tensor self) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csr + SparseCPU, SparseCUDA: coo_to_sparse_csr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csr + +- func: to_sparse_csc(Tensor self) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_csc + SparseCPU, SparseCUDA: coo_to_sparse_csc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_csc + +- func: to_sparse_bsr(Tensor self, int[2] blocksize) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsr + SparseCPU, SparseCUDA: coo_to_sparse_bsr + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsr + +- func: to_sparse_bsc(Tensor self, int[2] blocksize) -> Tensor + variants: method + dispatch: + CPU, CUDA: dense_to_sparse_bsc + SparseCPU, SparseCUDA: coo_to_sparse_bsc + SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse_bsc - func: to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor variants: method @@ -5729,16 +6093,33 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda + MPS: _local_scalar_dense_mps variants: function +# MPS LSTM implementation + +- func: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + MPS: _lstm_mps + +- func: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) + dispatch: + MPS: lstm_mps_backward + + # Fused RNN kernels - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor) dispatch: CUDA: _thnn_fused_lstm_cell_cuda -- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) +# NB: The composite version of this function below is a simple wrapper that duplicates some of the outputs +# It is necessary to avoid triggering TensorImpl use count checks in debug mode +# NB: this is function is NOT differentiable +- func: _thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor) dispatch: - CUDA: _thnn_fused_lstm_cell_backward_cuda + CUDA: _thnn_fused_lstm_cell_backward_impl_cuda + +- func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - func: _thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? 
hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor) @@ -5819,36 +6200,51 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: set_ + CPU, CUDA, Meta, MPS: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: - CPU: set_storage_cpu_ + CPU, Meta: set_storage_cpu_ CUDA: set_storage_cuda_ + MPS: set_storage_mps_ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ +- func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) + variants: method + device_check: NoCheck + device_guard: False + - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: set_tensor_ + CPU, CUDA, Meta, MPS: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) variants: method dispatch: CPU: set_cpu_ CUDA: set_cuda_ + Meta: set_meta_ + MPS: set_mps_ + +- func: lift(Tensor self) -> Tensor + variants: method + dispatch: + # Not making it CompositeImplicitAutograd because lift + # should be a primitive w.r.t. functorch + CompositeExplicitAutograd: lift - func: is_set_to(Tensor self, Tensor tensor) -> bool variants: method device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: is_set_to + CPU, CUDA, MPS: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -5856,6 +6252,7 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + MPS: masked_fill__mps - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor device_check: NoCheck # TensorIterator @@ -5869,6 +6266,7 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + MPS: masked_fill__mps - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor device_check: NoCheck # TensorIterator @@ -5887,17 +6285,22 @@ dispatch: CompositeExplicitAutograd: masked_scatter -- func: _masked_softmax(Tensor self, Tensor mask) -> Tensor +- func: _masked_softmax(Tensor self, Tensor mask, int? dim=None) -> Tensor dispatch: CUDA: masked_softmax_cuda CPU: masked_softmax_cpu +- func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor + dispatch: + CUDA: masked_softmax_backward_cuda + CPU: masked_softmax_backward_cpu + - func: view(Tensor(a) self, int[] size) -> Tensor(a) variants: method device_check: NoCheck device_guard: False dispatch: - ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA: view + ZeroTensor, CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, MPS: view MkldnnCPU: mkldnn_view # Warning: If you want to change the name or overload name of this @@ -5916,7 +6319,7 @@ - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) variants: method dispatch: - CPU, CUDA: put_ + CPU, CUDA, MPS: put_ - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor variants: function, method @@ -5941,6 +6344,23 @@ - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor variants: function, method +- func: index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) 
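The _masked_softmax entries above grow an optional dim argument and get an explicit backward. The private op backs a fused masked softmax; the equivalent pattern with public ops is roughly:

    import torch
    scores = torch.randn(2, 4, 4)
    mask = torch.zeros(2, 4, 4, dtype=torch.bool)
    mask[..., 2:] = True                                  # positions to ignore
    attn = scores.masked_fill(mask, float("-inf")).softmax(dim=-1)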
+ structured: True + variants: function + precomputed: + - dim -> int dim + dispatch: + CPU: index_reduce_cpu_out + CUDA: index_reduce_cuda_out + +- func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: index_reduce.out + variants: method + +- func: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: index_reduce.out + variants: function, method + - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method @@ -5995,6 +6415,7 @@ variants: function dispatch: CPU, CUDA: scatter_src_out + MPS: scatter_src_out_mps - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor structured_delegate: scatter.value_out @@ -6009,6 +6430,7 @@ variants: function dispatch: CPU, CUDA: scatter_value_out + MPS: scatter_value_out_mps - func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor structured_delegate: scatter.reduce_out @@ -6023,6 +6445,7 @@ variants: function dispatch: CPU, CUDA: scatter_reduce_out + MPS: scatter_reduce_out_mps - func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor structured_delegate: scatter.value_reduce_out @@ -6037,6 +6460,7 @@ variants: function dispatch: CPU, CUDA: scatter_value_reduce_out + MPS: scatter_value_reduce_out_mps - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method @@ -6057,14 +6481,24 @@ variants: function dispatch: CPU, CUDA: scatter_add + MPS: scatter_add_mps_out - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method -- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor +- func: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor + structured_delegate: scatter_reduce.two_out variants: function, method + +- func: scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!) + structured_delegate: scatter_reduce.two_out + variants: method + +- func: scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!) + structured: True + variants: function dispatch: - CPU: scatter_reduce_two_cpu + CPU, CUDA: scatter_reduce_two - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
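The scatter_reduce.two overload above now takes an explicit src tensor plus an include_self flag (and becomes a structured kernel), and index_reduce is added with the same flag. A usage sketch of the corresponding public methods:

    import torch
    x = torch.zeros(3)
    idx = torch.tensor([0, 1, 1, 2])
    src = torch.tensor([1., 2., 3., 4.])
    x.scatter_reduce_(0, idx, src, reduce="sum", include_self=True)   # tensor([1., 5., 4.])

    y = torch.ones(3, 2)
    y.index_reduce_(0, torch.tensor([0, 0, 1]),
                    torch.arange(6.).reshape(3, 2), "amax", include_self=False)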
structured_delegate: eq.Scalar_out @@ -6100,6 +6534,12 @@ dispatch: CompositeExplicitAutograd: bitwise_and +- func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_and + - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -6148,6 +6588,12 @@ device_check: NoCheck # TensorIterator variants: method, function +- func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_or + - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -6196,6 +6642,12 @@ device_check: NoCheck # TensorIterator variants: method, function +- func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor + device_check: NoCheck # TensorIterator + variants: function + dispatch: + CompositeExplicitAutograd: bitwise_xor + - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: method, function @@ -6271,25 +6723,25 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: bitwise_left_shift + CompositeExplicitAutograd: bitwise_left_shift - func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: bitwise_left_shift_ + CompositeExplicitAutograd: bitwise_left_shift_ - func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_left_shift_out + CompositeExplicitAutograd: bitwise_left_shift_out - func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_left_shift + CompositeExplicitAutograd: bitwise_left_shift - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor device_check: NoCheck # TensorIterator @@ -6336,25 +6788,25 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: bitwise_right_shift + CompositeExplicitAutograd: bitwise_right_shift - func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: bitwise_right_shift_ + CompositeExplicitAutograd: bitwise_right_shift_ - func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_right_shift_out + CompositeExplicitAutograd: bitwise_right_shift_out - func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor device_check: NoCheck # TensorIterator variants: function dispatch: - CPU, CUDA: bitwise_right_shift + CompositeExplicitAutograd: bitwise_right_shift - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) structured_delegate: tril.out @@ -6383,15 +6835,18 @@ variants: method dispatch: CPU, CUDA: addbmm_ + MPS: addbmm_mps_ - func: addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
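The bitwise shift entries above move from per-backend CPU/CUDA kernels to CompositeExplicitAutograd, and Scalar_Tensor overloads are added for bitwise_and/or/xor. The Python-visible behaviour is unchanged; for reference:

    import torch
    a = torch.tensor([1, 2, 4], dtype=torch.int32)
    torch.bitwise_left_shift(a, 1)    # tensor([2, 4, 8], dtype=torch.int32)
    a << 1                            # operator form of the same call
    torch.bitwise_right_shift(a, 2)   # tensor([0, 0, 1], dtype=torch.int32)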
dispatch: CPU, CUDA: addbmm_out + MPS: addbmm_out_mps - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: method, function dispatch: CPU, CUDA: addbmm + MPS: addbmm_mps - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -6399,6 +6854,7 @@ dispatch: CPU, CUDA: random_ Meta: random_meta_ + MPS: random_mps_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -6406,6 +6862,7 @@ dispatch: CPU, CUDA: random_ Meta: random_meta_ + MPS: random_mps_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -6419,6 +6876,7 @@ variants: method dispatch: CPU, CUDA: uniform_ + MPS: uniform_mps_ Meta: uniform_meta_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) @@ -6451,6 +6909,7 @@ dispatch: CPU: diag_cpu_out CUDA: diag_cuda_out + MPS: diag_mps_out - func: diag(Tensor self, int diagonal=0) -> Tensor variants: method, function @@ -6472,6 +6931,7 @@ dispatch: CPU: triu_cpu CUDA: triu_cuda + MPS: triu_mps_out - func: triu(Tensor self, int diagonal=0) -> Tensor structured_delegate: triu.out @@ -6482,6 +6942,7 @@ dispatch: CPU: tril_cpu CUDA: tril_cuda + MPS: tril_mps_out - func: tril(Tensor self, int diagonal=0) -> Tensor structured_delegate: tril.out @@ -6514,6 +6975,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ne_Scalar_out + MPS: ne_scalar_out_mps QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor @@ -6529,6 +6991,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ne_Tensor_out + MPS: ne_tensor_out_mps QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor @@ -6575,6 +7038,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: eq_Scalar_out + MPS: eq_scalar_out_mps QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor @@ -6590,6 +7054,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: eq_Tensor_out + MPS: eq_tensor_out_mps QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor @@ -6605,6 +7070,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ge_Scalar_out + MPS: ge_scalar_out_mps QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor @@ -6620,6 +7086,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: ge_Tensor_out + MPS: ge_tensor_out_mps QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor @@ -6666,6 +7133,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: le_Scalar_out + MPS: le_scalar_out_mps QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor @@ -6681,6 +7149,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: le_Tensor_out + MPS: le_tensor_out_mps QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor @@ -6727,6 +7196,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: gt_Scalar_out + MPS: gt_scalar_out_mps QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor @@ -6742,6 +7212,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: gt_Tensor_out + MPS: gt_tensor_out_mps QuantizedCPU: 
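Most of the additions in this stretch register MPS kernels for the elementwise comparison ops (ne/eq/ge/le/gt/lt). On an MPS-capable macOS build they are exercised simply by placing tensors on the "mps" device (sketch):

    import torch
    if torch.backends.mps.is_available():
        x = torch.rand(4, device="mps")
        y = torch.rand(4, device="mps")
        x > y, torch.eq(x, y), torch.le(x, y)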
gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor @@ -6788,6 +7259,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: lt_Scalar_out + MPS: lt_scalar_out_mps QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor @@ -6803,6 +7275,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: lt_Tensor_out + MPS: lt_tensor_out_mps QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor @@ -6861,15 +7334,18 @@ dispatch: CPU, QuantizedCPU: index_select_out_cpu_ CUDA, QuantizedCUDA: index_select_out_cuda + MPS: index_select_out_mps - func: index_select(Tensor self, int dim, Tensor index) -> Tensor variants: method, function dispatch: CPU: index_select_cpu_ QuantizedCPU: index_select_quantized_cpu_ - CUDA, QuantizedCUDA: index_select_cuda - SparseCPU: index_select_sparse - SparseCUDA: index_select_sparse + CUDA: index_select_cuda + QuantizedCUDA: index_select_quantized_cuda + SparseCPU: index_select_sparse_cpu + SparseCUDA: index_select_sparse_cuda + MPS: index_select_mps - func: index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!) @@ -6918,6 +7394,7 @@ structured: True dispatch: CPU, CUDA: gather_out + MPS: gather_out_mps - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor variants: method, function @@ -6941,6 +7418,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcmul_out + MPS: addcmul_out_mps - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor structured_delegate: addcmul.out @@ -6958,6 +7436,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: addcdiv_out + MPS: addcdiv_out_mps - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor structured_delegate: addcdiv.out @@ -7005,10 +7484,13 @@ - func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor python_module: linalg - variants: method, function + variants: function dispatch: CPU, CUDA: linalg_solve_triangular +- func: linalg_vander(Tensor x, *, int? N=None) -> Tensor + python_module: linalg + - func: symeig.e(Tensor self, bool eigenvectors=False, bool upper=True, *, Tensor(a!) e, Tensor(b!) V) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) dispatch: CompositeExplicitAutograd: symeig_out @@ -7086,21 +7568,6 @@ CPU: _cholesky_solve_helper_cpu CUDA: _cholesky_solve_helper_cuda -- func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - variants: function, method - dispatch: - CompositeExplicitAutograd: solve - -- func: solve.solution(Tensor self, Tensor A, *, Tensor(a!) solution, Tensor(b!) lu) -> (Tensor(a!) solution, Tensor(b!) LU) - dispatch: - CompositeExplicitAutograd: solve_out - -- func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: _solve_helper_cpu - CUDA: _solve_helper_cuda - - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor variants: method, function dispatch: @@ -7151,13 +7618,14 @@ dispatch: CPU, CUDA: lu_solve +# lu_unpack - func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) + structured_delegate: lu_unpack.out variants: function - dispatch: - CPU, CUDA: lu_unpack - func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) 
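This hunk removes the long-deprecated solve/_solve_helper entries and adds linalg_vander. The replacements live under torch.linalg; note that torch.linalg.solve takes A first, unlike the removed torch.solve (sketch):

    import torch
    A = torch.randn(3, 3)
    b = torch.randn(3, 1)
    x = torch.linalg.solve(A, b)                                # replaces torch.solve(b, A)[0]
    V = torch.linalg.vander(torch.tensor([1., 2., 3.]), N=4)    # Vandermonde matrix, shape (3, 4)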
U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) variants: function + structured: True dispatch: CPU, CUDA: lu_unpack_out @@ -7281,6 +7749,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sign_out + MPS: sign_out_mps SparseCPU, SparseCUDA: sign_sparse_out SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out @@ -7312,6 +7781,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: atan2_out + MPS: atan2_mps_out - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -7398,6 +7868,12 @@ dispatch: CPU: histogramdd_cpu +- func: histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + +- func: histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges) + - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: @@ -7535,6 +8011,7 @@ variants: method, function dispatch: CPU, CUDA: min + MPS: min_mps QuantizedCPU: min_quantized_cpu - func: fmin(Tensor self, Tensor other) -> Tensor @@ -7554,6 +8031,7 @@ variants: method, function dispatch: CPU, CUDA: max + MPS: max_mps QuantizedCPU: max_quantized_cpu - func: fmax(Tensor self, Tensor other) -> Tensor @@ -7579,6 +8057,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: maximum_out + MPS: maximum_out_mps # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max @@ -7600,6 +8079,7 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA: minimum_out + MPS: minimum_out_mps # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min @@ -7633,27 +8113,23 @@ - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) device_check: NoCheck # TensorIterator dispatch: - CPU: sort_out_cpu - CUDA: sort_out_cuda + CompositeExplicitAutograd: sort_out - func: sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) + structured: True dispatch: - CPU: sort_out_cpu_stable - CUDA: sort_out_stable_cuda + CPU, CUDA: sort_stable_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU: sort_cpu - CUDA: sort_cuda - QuantizedCPU: sort_quantized_cpu + CompositeExplicitAutograd: sort - func: sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) + structured_delegate: sort.values_stable variants: method, function dispatch: - CPU: sort_cpu_stable - CUDA: sort_stable_cuda QuantizedCPU: sort_quantized_cpu_stable - func: sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
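The histogramdd overloads above (TensorList, int, and int[] bins) expose the multi-dimensional histogram; per the dispatch shown, the underlying kernel is CPU-only at this point. Typical use:

    import torch
    pts = torch.rand(100, 3)                        # 100 points in 3-D
    hist, edges = torch.histogramdd(pts, bins=[4, 5, 6])
    hist.shape                                      # torch.Size([4, 5, 6])
    len(edges)                                      # 3 bin-edge tensors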
indices) @@ -7683,6 +8159,7 @@ dispatch: CPU: topk_out_cpu CUDA: topk_out_cuda + MPS: topk_out_mps - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) variants: method, function @@ -7700,6 +8177,7 @@ structured: True dispatch: CPU, CUDA: all_all_out + MPS: all_all_out_mps - func: any(Tensor self) -> Tensor device_check: NoCheck # TensorIterator @@ -7713,6 +8191,7 @@ structured: True dispatch: CPU, CUDA: any_all_out + MPS: any_all_out_mps - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator @@ -7735,7 +8214,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA: unfold + CPU, CUDA, Meta: unfold QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor @@ -7756,6 +8235,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: pow_Tensor_Tensor_out + MPS: pow_tensor_tensor_out_mps - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor device_check: NoCheck # TensorIterator @@ -7779,6 +8259,7 @@ dispatch: CPU, CUDA: pow_Tensor_Scalar_out SparseCPU, SparseCUDA: pow_out_sparse_scalar + MPS: pow_tensor_scalar_out_mps - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor device_check: NoCheck # TensorIterator @@ -7822,32 +8303,45 @@ variants: method dispatch: CPU, CUDA: normal_ + MPS: normal_mps_ Meta: normal_meta_ SparseCsrCPU, SparseCsrCUDA: normal_sparse_csr_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + MPS: normal_mps_out + Meta: normal_out_meta - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + #MPS: normal_mps + Meta: normal_meta - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + Meta: normal_meta + #MPS: normal_mps - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA: normal_out + Meta: normal_out_meta + MPS: normal_mps_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor dispatch: CPU, CUDA: normal + Meta: normal_meta + #MPS: normal_mps - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -7868,17 +8362,18 @@ dispatch: CUDA: _amp_update_scale_cuda_ -- func: _cat(Tensor[] tensors, int dim=0) -> Tensor - dispatch: - CPU: _cat_cpu - CUDA: cat_cuda - QuantizedCPU: cat_quantized_cpu +#- func: _cat(Tensor[] tensors, int dim=0) -> Tensor + #dispatch: + #CPU: _cat_cpu + #CUDA: cat_cuda + #MPS: cat_mps + #QuantizedCPU: cat_quantized_cpu -- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: _cat_out_cpu - CUDA: cat_out_cuda - QuantizedCPU: cat_out_quantized_cpu +#- func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!) 
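The normal.* entries above gain Meta kernels and (partially commented-out) MPS kernels for each mean/std overload. For reference, the three Tensor/float combinations they correspond to:

    import torch
    mean = torch.arange(1., 4.)
    std = torch.tensor([0.1, 0.2, 0.3])
    torch.normal(mean, 1.0)     # Tensor_float overload
    torch.normal(0.0, std)      # float_Tensor overload
    torch.normal(mean, std)     # Tensor_Tensor overload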
+ #dispatch: + #CPU: _cat_out_cpu + #CUDA: cat_out_cuda + #QuantizedCPU: cat_out_quantized_cpu - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -8586,25 +9081,29 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: mse_loss_out + MPS: mse_loss_out_mps - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: mse_loss.out python_module: nn - dispatch: - CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU, CUDA: mse_loss_backward_out + MPS: mse_loss_backward_out_mps - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor python_module: nn dispatch: CPU, CUDA: mse_loss_backward + MPS: mse_loss_backward_mps - func: l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -8695,6 +9194,7 @@ dispatch: CPU: nll_loss_forward_out_cpu CUDA: nll_loss_forward_out_cuda + MPS: nll_loss_forward_out_mps - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn @@ -8706,6 +9206,7 @@ dispatch: CPU: nll_loss_backward_out_cpu CUDA: nll_loss_backward_out_cuda + MPS: nll_loss_backward_out_mps - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn @@ -8722,24 +9223,28 @@ dispatch: CPU: nll_loss2d_forward_out_cpu CUDA: nll_loss2d_forward_out_cuda + MPS: nll_loss2d_forward_out_mps - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) python_module: nn dispatch: CPU: nll_loss2d_forward_cpu CUDA: nll_loss2d_forward_cuda + MPS: nll_loss2d_forward_mps - func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: nll_loss2d_backward_out_cpu CUDA: nll_loss2d_backward_out_cuda + MPS: nll_loss2d_backward_out_mps - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index, Tensor total_weight) -> Tensor python_module: nn dispatch: CPU: nll_loss2d_backward_cpu CUDA: nll_loss2d_backward_cuda + MPS: nll_loss2d_backward_mps - func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator @@ -8748,6 +9253,7 @@ python_module: nn dispatch: CPU, CUDA: smooth_l1_loss_out + MPS: smooth_l1_loss_out_mps - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor device_check: NoCheck # TensorIterator @@ -8759,6 +9265,7 @@ dispatch: CPU: smooth_l1_loss_backward_out CUDA: smooth_l1_loss_backward_out + MPS: smooth_l1_loss_backward_out_mps - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor python_module: nn @@ -8812,6 +9319,7 @@ python_module: nn dispatch: CPU, CUDA: elu_out + MPS: elu_out_mps - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor structured_delegate: elu.out @@ -8824,6 +9332,7 @@ python_module: nn dispatch: CPU, CUDA: elu_backward_out + MPS: elu_backward_out_mps - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor structured_delegate: elu_backward.grad_input @@ -8860,6 +9369,16 @@ CPU: glu_backward_cpu CUDA: glu_backward_cuda +- func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_jvp + +- func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor + python_module: nn + dispatch: + CPU, CUDA: glu_backward_jvp + - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase @@ -8896,31 +9415,33 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh_out + CPU, CUDA, MPS: hardtanh_out QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh + CPU, CUDA, MPS: hardtanh QuantizedCPU: hardtanh_quantized_cpu - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU, CUDA: hardtanh_backward_out + MPS: hardtanh_backward_out_mps - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor python_module: nn dispatch: CPU, CUDA: hardtanh_backward + MPS: hardtanh_backward_mps - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardtanh_ + CPU, CUDA, MPS: hardtanh_ QuantizedCPU: hardtanh_quantized_cpu_ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
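The new glu_jvp / glu_backward_jvp kernels above back forward-mode differentiation of glu. A hedged sketch of driving that through the public forward-AD API:

    import torch
    import torch.nn.functional as F
    import torch.autograd.forward_ad as fwAD

    x = torch.randn(4, 6)
    t = torch.randn(4, 6)                        # tangent direction for the JVP
    with fwAD.dual_level():
        dual = fwAD.make_dual(x, t)
        out = F.glu(dual, dim=-1)
        jvp = fwAD.unpack_dual(out).tangent      # forward-mode derivative of glu at x along t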
@@ -8953,6 +9474,7 @@ python_module: nn dispatch: CPU, CUDA: leaky_relu_out + MPS: leaky_relu_out_mps QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor @@ -8968,6 +9490,7 @@ python_module: nn dispatch: CPU, CUDA: leaky_relu_backward_out + MPS: leaky_relu_backward_out_mps - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor structured_delegate: leaky_relu_backward.grad_input @@ -9090,6 +9613,7 @@ dispatch: CPU: adaptive_avg_pool2d_out_cpu CUDA: adaptive_avg_pool2d_out_cuda + MPS: adaptive_avg_pool2d_out_mps MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor @@ -9107,13 +9631,16 @@ dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda + MPS: adaptive_avg_pool2d_mps QuantizedCPU: adaptive_avg_pool2d_quantized_cpu + QuantizedCUDA: adaptive_avg_pool2d_quantized_cuda - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn dispatch: CPU: adaptive_avg_pool2d_backward_cpu CUDA: adaptive_avg_pool2d_backward_cuda + MPS: adaptive_avg_pool2d_backward_mps - func: adaptive_avg_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -9201,6 +9728,7 @@ dispatch: CPU: avg_pool2d_out_cpu CUDA: avg_pool2d_out_cuda + MPS: avg_pool2d_out_mps MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor @@ -9216,6 +9744,7 @@ dispatch: CPU: avg_pool2d_backward_out_cpu CUDA: avg_pool2d_backward_out_cuda + MPS: avg_pool2d_backward_out_mps MkldnnCPU: mkldnn_avg_pool2d_backward_out - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor @@ -9313,6 +9842,7 @@ dispatch: CPU: max_pool2d_with_indices_out_cpu CUDA: max_pool2d_with_indices_out_cuda + MPS: max_pool2d_with_indices_out_mps # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) @@ -9325,6 +9855,7 @@ dispatch: CPU: max_pool2d_with_indices_backward_out_cpu CUDA: max_pool2d_with_indices_backward_out_cuda + MPS: max_pool2d_with_indices_backward_out_mps - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor python_module: nn @@ -9368,18 +9899,6 @@ CPU: max_unpooling2d_forward_cpu CUDA: max_unpooling2d_forward_cuda -- func: max_unpool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn - dispatch: - CPU: max_unpooling2d_backward_out_cpu - CUDA: max_unpooling2d_backward_out_cuda - -- func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - python_module: nn - dispatch: - CPU: max_unpooling2d_backward_cpu - CUDA: max_unpooling2d_backward_cuda - - func: max_unpool3d.out(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!) 
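The explicit max_unpool2d_backward kernels are deleted in this hunk (and the 3-D counterparts just below); the forward unpooling ops remain, with their gradients presumably expressed through regular derivative formulas instead. Public usage is unchanged:

    import torch
    import torch.nn.functional as F
    x = torch.randn(1, 1, 4, 4, requires_grad=True)
    y, idx = F.max_pool2d(x, kernel_size=2, return_indices=True)
    z = F.max_unpool2d(y, idx, kernel_size=2)
    z.sum().backward()                     # still differentiable after the kernel removal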
python_module: nn dispatch: @@ -9392,30 +9911,18 @@ CPU: max_unpooling3d_forward_cpu CUDA: max_unpooling3d_forward_cuda -- func: max_unpool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) grad_input) -> Tensor(a!) - python_module: nn - dispatch: - CPU: max_unpooling3d_backward_out_cpu - CUDA: max_unpooling3d_backward_out_cuda - -- func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - python_module: nn - dispatch: - CPU: max_unpooling3d_backward_cpu - CUDA: max_unpooling3d_backward_cuda - - func: reflection_pad1d.out(Tensor self, int[2] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: - CPU, QuantizedCPU: reflection_pad1d_out_cpu + CPU: reflection_pad1d_out_cpu + QuantizedCPU: reflection_pad1d_out_quantized_cpu CUDA: reflection_pad1d_out_cuda + MPS: reflection_pad1d_out_mps - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor python_module: nn structured_delegate: reflection_pad1d.out - dispatch: - QuantizedCPU: reflection_pad1d_cpu - func: reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, int[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -9423,6 +9930,7 @@ dispatch: CPU: reflection_pad1d_backward_out_cpu CUDA: reflection_pad1d_backward_out_cuda + MPS: reflection_pad1d_backward_out_mps - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor python_module: nn @@ -9433,24 +9941,29 @@ dispatch: CPU, QuantizedCPU: reflection_pad2d_out_cpu CUDA: reflection_pad2d_out_cuda + MPS: reflection_pad2d_out_mps - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: - CPU, QuantizedCPU: reflection_pad2d_cpu + CPU: reflection_pad2d_cpu + QuantizedCPU: reflection_pad2d_quantized_cpu CUDA: reflection_pad2d_cuda + MPS: reflection_pad2d_mps - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: reflection_pad2d_backward_out_cpu CUDA: reflection_pad2d_backward_out_cuda + MPS: reflection_pad2d_backward_out_mps - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu CUDA: reflection_pad2d_backward_cuda + MPS: reflection_pad2d_backward_mps - func: reflection_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!) 
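The reflection_pad* / replication_pad* kernels above (now with MPS variants and a separate quantized CPU path) sit behind the non-constant modes of torch.nn.functional.pad, which the entries just below also promote to a native function. A usage sketch:

    import torch
    import torch.nn.functional as F
    x = torch.arange(9.).reshape(1, 1, 3, 3)
    F.pad(x, (1, 1, 1, 1), mode="reflect")      # reflection_pad2d
    F.pad(x, (1, 1, 1, 1), mode="replicate")    # replication_pad2d
    F.pad(x, (1, 1, 1, 1), mode="circular")     # _pad_circular
    F.pad(x, (1, 1, 1, 1), value=0.0)           # default constant mode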
python_module: nn @@ -9458,6 +9971,7 @@ dispatch: CPU: reflection_pad3d_out_cpu CUDA: reflection_pad3d_out_cuda + MPS: reflection_pad3d_out_mps - func: reflection_pad3d(Tensor self, int[6] padding) -> Tensor python_module: nn @@ -9469,6 +9983,7 @@ dispatch: CPU: reflection_pad3d_backward_out_cpu CUDA: reflection_pad3d_backward_out_cuda + MPS: reflection_pad3d_backward_out_mps - func: reflection_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor python_module: nn @@ -9480,6 +9995,7 @@ dispatch: CPU: replication_pad1d_out_cpu CUDA: replication_pad1d_out_cuda + MPS: replication_pad1d_out_mps - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor python_module: nn @@ -9491,6 +10007,7 @@ dispatch: CPU: replication_pad1d_backward_out_cpu CUDA: replication_pad1d_backward_out_cuda + MPS: replication_pad1d_backward_out_mps - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor python_module: nn @@ -9502,6 +10019,7 @@ dispatch: CPU: replication_pad2d_out_cpu CUDA: replication_pad2d_out_cuda + MPS: replication_pad2d_out_mps - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn @@ -9512,12 +10030,14 @@ dispatch: CPU: replication_pad2d_backward_out_cpu CUDA: replication_pad2d_backward_out_cuda + MPS: replication_pad2d_backward_out_mps - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: CPU: replication_pad2d_backward_cpu CUDA: replication_pad2d_backward_cuda + MPS: replication_pad2d_backward_mps - func: replication_pad3d.out(Tensor self, int[6] padding, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -9525,6 +10045,7 @@ dispatch: CPU: replication_pad3d_out_cpu CUDA: replication_pad3d_out_cuda + MPS: replication_pad3d_out_mps - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor python_module: nn @@ -9535,19 +10056,30 @@ dispatch: CPU: replication_pad3d_backward_out_cpu CUDA: replication_pad3d_backward_out_cuda + MPS: replication_pad3d_backward_out_mps - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor python_module: nn dispatch: CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda + MPS: replication_pad3d_backward_mps -- func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor +- func: _pad_circular(Tensor self, int[] pad) -> Tensor python_module: nn - dispatch: - CompositeExplicitAutograd: upsample_linear1d -- func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor +- func: _pad_enum(Tensor self, int[] pad, int mode, float? value=None) -> Tensor + python_module: nn + +- func: pad(Tensor self, int[] pad, str mode="constant", float? value=None) -> Tensor + python_module: nn + +- func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor + python_module: nn + dispatch: + CompositeExplicitAutograd: upsample_linear1d + +- func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? 
scale_factors) -> Tensor python_module: nn dispatch: CompositeExplicitAutograd: upsample_linear1d_backward @@ -9697,6 +10229,7 @@ dispatch: CPU: upsample_bilinear2d_out_cpu CUDA: upsample_bilinear2d_out_cuda + MPS: upsample_bilinear2d_out_mps - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9710,6 +10243,7 @@ dispatch: CPU: upsample_bilinear2d_backward_out_cpu CUDA: upsample_bilinear2d_backward_out_cuda + MPS: upsample_bilinear2d_backward_out_mps - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9853,6 +10387,7 @@ dispatch: CPU: upsample_nearest2d_out_cpu CUDA: upsample_nearest2d_out_cuda + MPS: upsample_nearest2d_out_mps - func: _upsample_nearest_exact2d.out(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -9860,6 +10395,7 @@ dispatch: CPU: _upsample_nearest_exact2d_out_cpu CUDA: _upsample_nearest_exact2d_out_cuda + MPS: _upsample_nearest_exact2d_out_mps - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9879,6 +10415,7 @@ dispatch: CPU: upsample_nearest2d_backward_out_cpu CUDA: upsample_nearest2d_backward_out_cuda + MPS: upsample_nearest2d_backward_out_mps - func: _upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -9886,6 +10423,7 @@ dispatch: CPU: _upsample_nearest_exact2d_backward_out_cpu CUDA: _upsample_nearest_exact2d_backward_out_cuda + MPS: _upsample_nearest_exact2d_backward_out_mps - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor python_module: nn @@ -9949,6 +10487,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: sigmoid_backward_out + MPS: sigmoid_backward_out_mps - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn @@ -9971,6 +10510,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: tanh_backward_out + MPS: tanh_backward_out_mps - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor python_module: nn @@ -10236,6 +10776,19 @@ dispatch: CPU, CUDA: special_ndtri_out +- func: special_log_ndtr(Tensor self) -> Tensor + structured_delegate: special_log_ndtr.out + python_module: special + variants: function + +- func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase + python_module: special + variants: function + dispatch: + CPU, CUDA: special_log_ndtr_out + - func: special_expm1(Tensor self) -> Tensor python_module: special variants: function @@ -10489,7 +11042,7 @@ - func: special_polygamma(int n, Tensor self) -> Tensor python_module: special - variants: function, method + variants: function - func: special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: special @@ -10786,6 +11339,8 @@ python_module: linalg variants: function structured_delegate: linalg_cross.out + dispatch: + ZeroTensor: linalg_cross_zerotensor - func: linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!) 
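special_log_ndtr above adds a numerically stable log of the standard normal CDF to torch.special. Compared with taking the log of ndtr directly (sketch):

    import torch
    x = torch.tensor([-40., 0., 5.], dtype=torch.float64)
    torch.special.log_ndtr(x)           # finite everywhere
    torch.special.ndtr(x).log()         # underflows to -inf at x = -40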
python_module: linalg @@ -10816,6 +11371,20 @@ dispatch: CPU, CUDA: linalg_lu_factor_ex_out +# linalg.lu +- func: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U) + python_module: linalg + structured_delegate: linalg_lu.out + variants: function + +- func: linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) + python_module: linalg + variants: function + structured: True + dispatch: + CPU, CUDA: linalg_lu_out + +# linalg.det - func: linalg_det(Tensor self) -> Tensor python_module: linalg variants: function @@ -10837,6 +11406,38 @@ dispatch: CPU, CUDA: _det_lu_based_helper_backward_helper +- func: linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info) + structured_delegate: linalg_ldl_factor_ex.out + python_module: linalg + variants: function + +- func: linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_factor_ex_out + +- func: linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) + python_module: linalg + variants: function + +- func: linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor + structured_delegate: linalg_ldl_solve.out + python_module: linalg + variants: function + +- func: linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!) + structured: True + python_module: linalg + variants: function + dispatch: + CPU, CUDA: linalg_ldl_solve_out + - func: linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values) python_module: linalg variants: function @@ -10983,11 +11584,11 @@ - func: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor python_module: linalg variants: function - dispatch: - CPU, CUDA: linalg_vector_norm + structured_delegate: linalg_vector_norm.out - func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) python_module: linalg + structured: True dispatch: CPU, CUDA: linalg_vector_norm_out @@ -11111,13 +11712,13 @@ python_module: linalg variants: function -- func: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R) +- func: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) python_module: linalg variants: function dispatch: CompositeExplicitAutograd: linalg_qr -- func: linalg_qr.out(Tensor self, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) +- func: linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) python_module: linalg variants: function dispatch: @@ -11237,3 +11838,447 @@ - func: unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[] variants: function python_module: nn + +- func: nested_tensor(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? 
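This hunk adds a structured linalg_lu and the linalg_ldl_factor / linalg_ldl_solve family, makes linalg_vector_norm structured, and renames linalg_qr's argument to A. A hedged sketch of the new factorizations via the torch.linalg namespace, assuming they are exposed there in this release:

    import torch
    A = torch.randn(3, 3, dtype=torch.float64)
    P, L, U = torch.linalg.lu(A)
    torch.dist(P @ L @ U, A)                                 # ~0

    S = A @ A.mT + 3 * torch.eye(3, dtype=torch.float64)     # symmetric positive definite
    LD, pivots = torch.linalg.ldl_factor(S)
    B = torch.randn(3, 2, dtype=torch.float64)
    X = torch.linalg.ldl_solve(LD, pivots, B)
    torch.dist(S @ X, B)                                     # ~0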
device=None, bool? pin_memory=None) -> Tensor + variants: function + +- func: _fw_primal_copy(Tensor self, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _fw_primal_copy + tags: view_copy + +- func: _make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _make_dual_copy + tags: view_copy + +- func: view_as_real_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_as_real_copy + tags: view_copy + +- func: view_as_complex_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_as_complex_copy + tags: view_copy + +- func: _conj_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _conj_copy + tags: view_copy + +- func: _neg_view_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _neg_view_copy + tags: view_copy + +- func: as_strided_copy(Tensor self, int[] size, int[] stride, int? storage_offset=None) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: as_strided_copy + tags: view_copy + +- func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _sparse_broadcast_to_copy + tags: view_copy + +- func: diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: diagonal_copy + tags: view_copy + +- func: expand_copy(Tensor self, int[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy + tags: view_copy + +- func: expand_copy.SymInt(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_SymInt + tags: view_copy + +- func: permute_copy(Tensor self, int[] dims) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: permute_copy + tags: view_copy + +- func: _reshape_alias_copy(Tensor self, int[] size, int[] stride) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _reshape_alias_copy + tags: view_copy + +- func: select_copy.int(Tensor self, int dim, int index) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: select_copy_int + tags: view_copy + +- func: detach_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: detach_copy + tags: view_copy + +- func: slice_copy.Tensor(Tensor self, int dim=0, int? start=None, int? 
end=None, int step=1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: slice_copy_Tensor + tags: view_copy + +- func: split_copy.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor + tags: view_copy + +- func: split_with_sizes_copy(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy + tags: view_copy + +- func: squeeze_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy + tags: view_copy + +- func: squeeze_copy.dim(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_dim + tags: view_copy + +- func: t_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: t_copy + tags: view_copy + +- func: transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: transpose_copy_int + tags: view_copy + +- func: unsqueeze_copy(Tensor self, int dim) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: unsqueeze_copy + tags: view_copy + +- func: _indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _indices_copy + tags: view_copy + +- func: _values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _values_copy + tags: view_copy + +- func: indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: indices_copy + tags: view_copy + +- func: values_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: values_copy + tags: view_copy + +- func: crow_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: crow_indices_copy + tags: view_copy + +- func: col_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: col_indices_copy + tags: view_copy + +- func: ccol_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: ccol_indices_copy + tags: view_copy + +- func: row_indices_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: row_indices_copy + tags: view_copy + +- func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[] + variants: function + dispatch: + CompositeExplicitAutograd: unbind_copy_int + tags: view_copy + +- func: view_copy(Tensor self, int[] size) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_copy + tags: view_copy + +- func: view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_dtype + tags: view_copy + +- func: unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: unfold_copy + tags: view_copy + +- func: alias_copy(Tensor self) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: alias_copy + tags: view_copy + +- func: _fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _fw_primal_copy_out + + +- func: _make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!) 
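
The linalg_lu and linalg_ldl_* schemas registered earlier in this hunk correspond to torch.linalg.lu, torch.linalg.ldl_factor and torch.linalg.ldl_solve on the Python side. A rough usage sketch, assuming a build that contains these entries:

import torch

A = torch.randn(4, 4, dtype=torch.float64)
P, L, U = torch.linalg.lu(A)                  # pivoted LU: A = P @ L @ U
print(torch.dist(P @ L @ U, A))               # ~0

S = A @ A.mT + 4 * torch.eye(4, dtype=torch.float64)   # symmetric positive definite
LD, pivots = torch.linalg.ldl_factor(S)
B = torch.randn(4, 2, dtype=torch.float64)
X = torch.linalg.ldl_solve(LD, pivots, B)
print(torch.dist(S @ X, B))                   # ~0
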
+ variants: function + dispatch: + CompositeExplicitAutograd: _make_dual_copy_out + + +- func: view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_as_real_copy_out + + +- func: view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_as_complex_copy_out + + +- func: _conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _conj_copy_out + + +- func: _neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _neg_view_copy_out + + +- func: as_strided_copy.out(Tensor self, int[] size, int[] stride, int? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: as_strided_copy_out + + +- func: _sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _sparse_broadcast_to_copy_out + + +- func: diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: diagonal_copy_out + + +- func: expand_copy.SymInt_out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_SymInt_out + + +- func: expand_copy.out(Tensor self, int[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: expand_copy_out + + +- func: permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: permute_copy_out + + +- func: _reshape_alias_copy.out(Tensor self, int[] size, int[] stride, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _reshape_alias_copy_out + + +- func: select_copy.int_out(Tensor self, int dim, int index, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: select_copy_int_out + + +- func: detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: detach_copy_out + + +- func: slice_copy.Tensor_out(Tensor self, int dim=0, int? start=None, int? end=None, int step=1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: slice_copy_Tensor_out + + +- func: split_copy.Tensor_out(Tensor self, int split_size, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_copy_Tensor_out + + +- func: split_with_sizes_copy.out(Tensor self, int[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: split_with_sizes_copy_out + + +- func: squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_out + + +- func: squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: squeeze_copy_dim_out + + +- func: t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
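
The *_copy functions in this block are non-aliasing counterparts of the corresponding view ops: same values, independent storage. Assuming the function variants are exposed under the torch namespace (for example torch.transpose_copy), the difference is observable like this:

import torch

x = torch.zeros(2, 3)
view = x.transpose(0, 1)               # aliases x's storage
copy = torch.transpose_copy(x, 0, 1)   # assumed binding name; owns its own storage
x[0, 0] = 1.0
print(view[0, 0].item())               # 1.0, the view sees the write
print(copy[0, 0].item())               # 0.0, the copy does not
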
+ variants: function + dispatch: + CompositeExplicitAutograd: t_copy_out + + +- func: transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: transpose_copy_int_out + + +- func: unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: unsqueeze_copy_out + + +- func: _indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _indices_copy_out + + +- func: _values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: _values_copy_out + + +- func: indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: indices_copy_out + + +- func: values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: values_copy_out + + +- func: crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: crow_indices_copy_out + + +- func: col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: col_indices_copy_out + + +- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> () + variants: function + dispatch: + CompositeExplicitAutograd: unbind_copy_int_out + + +- func: view_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_out + + +- func: view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: view_copy_dtype_out + + +- func: unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: unfold_copy_out + + +- func: alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + variants: function + dispatch: + CompositeExplicitAutograd: alias_copy_out + +- func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor + variants: method + dispatch: + NestedTensorCPU: NestedTensor_to_padded_tensor_generic + NestedTensorCUDA: NestedTensor_to_padded_tensor_cuda + +- func: _nested_tensor_layer_norm(Tensor self, Tensor? weight, Tensor? bias, float eps) -> Tensor + variants: method + dispatch: + NestedTensorCPU, NestedTensorCUDA: NestedTensor_layer_norm + +# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is. +- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None) -> Tensor + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward + +- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? 
mask=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor) + variants: function + dispatch: + CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_multi_head_attention diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp new file mode 100644 index 000000000000..d4f3338fb4cc --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp @@ -0,0 +1,551 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +namespace { +template +Tensor map_nt(const Tensor& nt, Func f) { + auto* nt_impl = get_nested_tensor_impl(nt); + const auto& sizes = nt_impl->get_nested_size_tensor(); + return at::detail::make_tensor(f(nt_impl->get_buffer()), sizes); +} + +c10::optional maybe_get_consistent_last_dim_of_nested_tensor( + const NestedTensorImpl& nt) { + const auto& sizes = nt.get_nested_size_tensor(); + // The last entry in every row of sizes must be the same. + const auto& last_dims = sizes.select(1, -1); + const auto last_dims_accessor = last_dims.packed_accessor64(); + // REVIEW: this can't be the most efficient and concise way to + // write this check, can it? + const auto last_dim_value = last_dims_accessor[0]; + for (const auto i : c10::irange(1, last_dims.numel())) { + if (last_dims_accessor[i] != last_dim_value) { + return c10::nullopt; + } + } + return last_dim_value; +} + +int64_t num_bytes(IntArrayRef sizes) { + // 0-dim Tensors have torch.Size of .size() 0, but carry 1 memory. + // Empty 1-dim Tensors (torch.tensor([])) have torch.Size of .size() 1, + // but carry 0 memory. + int64_t result = 1; + int64_t stride = 1; + for (int ii = sizes.size() - 1; ii >= 0; --ii) { + result += (sizes[ii] - 1) * stride; + // TODO: accept strides as input when we support them instead of + // assuming contiguous. 
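
All helpers in this file operate on the packed NestedTensor layout: one flat contiguous buffer plus an (ntensors x ndim) size matrix, with each component occupying a contiguous slice whose length is the product of its row of sizes. A rough Python illustration of that layout and of what NestedTensor_unbind recovers (variable names here are illustrative, not the real internals):

import torch

t0 = torch.arange(6.0).reshape(2, 3)
t1 = torch.arange(3.0).reshape(1, 3)
buffer = torch.cat([t0.reshape(-1), t1.reshape(-1)])   # flat storage, shape (9,)
nested_sizes = torch.tensor([[2, 3], [1, 3]])          # one row of sizes per component
splits = nested_sizes.prod(dim=1).tolist()             # [6, 3] elements per component
components = [chunk.view(*size.tolist())
              for chunk, size in zip(buffer.split(splits), nested_sizes)]
print([c.shape for c in components])                   # [torch.Size([2, 3]), torch.Size([1, 3])]
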
+ stride *= sizes[ii]; + } + return result; +} + +std::vector NestedTensor_get_max_size_from_size_tensor(const Tensor& sizes) { + if (sizes.dim() == 0) { + return {}; + } + const auto sizes_ptr = sizes.data_ptr(); + const auto sizes_size_0 = sizes.sizes()[0]; + const auto sizes_size_1 = sizes.sizes()[1]; + TORCH_INTERNAL_ASSERT(sizes_size_1 > 0); + std::vector results(sizes_size_1, 0); + for (const auto ii : c10::irange(sizes_size_0)) { + for (const auto jj : c10::irange(sizes_size_1)) { + auto val = sizes_ptr[ii * sizes_size_1 + jj]; + if (results[jj] < val) { + results[jj] = val; + } + } + } + return results; +} + +Tensor pad_tensor_to_shape( + const Tensor& t, + IntArrayRef goal_shape, + double value = 0) { + std::vector padd; + auto tup = t.sizes(); + TORCH_CHECK( + t.dim() == (int64_t)(goal_shape.size()), + "dimension ", + t.dim(), + " doesn't match length ", + goal_shape.size(), + " of goal shape."); + for (int64_t i = tup.size() - 1; i >= 0; i--) { + padd.push_back(0); + padd.push_back(goal_shape[i] - tup[i]); + } + Tensor new_tensor = at::constant_pad_nd(t, IntArrayRef(padd), value); + new_tensor = new_tensor.reshape(goal_shape); + return new_tensor; +} +} // namespace + +at::Tensor wrap_buffer(at::Tensor buffer, at::Tensor nested_size_tensor) { + TORCH_CHECK(buffer.is_contiguous(), "Given buffer must be contiguous."); + return at::detail::make_tensor( + std::move(buffer), std::move(nested_size_tensor)); +} + +inline const at::Tensor& get_buffer(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_buffer(); +} + +inline const at::Tensor& get_nested_size_tensor(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_nested_size_tensor(); +} + +// CPU only! +// TODO: The algorithm here can be optimized, right now it involves a lot of +// small tensor manipulations +std::vector NestedTensor_unbind( + const at::Tensor& self, + int64_t dim) { + TORCH_CHECK( + dim == 0, + "NestedTensor can only be unbound along dimension 0 ", + "got dimension ", + dim, + " instead."); + auto esizes = get_nested_size_tensor(self); + std::vector result_tensors; + if (esizes.dim() == 0) { + return result_tensors; + } + auto esizes_chunks = esizes.unbind(0); + std::vector splits; + for (const auto i : c10::irange(esizes_chunks.size())) { + splits.push_back(esizes_chunks[i].prod().item()); + } + auto buffer_chunks = at::split_with_sizes(get_buffer(self), splits); + for (const auto i : c10::irange(buffer_chunks.size())) { + const auto& esize_chunk = esizes_chunks[i]; + result_tensors.push_back(buffer_chunks[i].view(IntArrayRef( + esize_chunk.data_ptr(), + esize_chunk.data_ptr() + esize_chunk.numel()))); + } + return result_tensors; +} + +Tensor& NestedTensor_relu_(Tensor& self) { + at::relu_(const_cast(get_nested_tensor_impl(self)->get_buffer())); + return self; +} + +Tensor NestedTensor_relu(const Tensor& self) { + return map_nt(self, at::relu); +} + +Tensor& NestedTensor_gelu_(Tensor& self, c10::string_view approximate) { + at::gelu_(const_cast(get_nested_tensor_impl(self)->get_buffer()), approximate); + return self; +} + +Tensor NestedTensor_gelu(const Tensor& self, c10::string_view approximate) { + return map_nt( + self, + [approximate](const Tensor& buffer) { + return at::gelu(buffer, approximate); + }); +} + +Tensor NestedTensor_nested_tensor_from_mask(const Tensor& t, const Tensor& mask) { + TORCH_CHECK(mask.scalar_type() == at::ScalarType::Bool, "Expected mask to be of ScalarType Bool, but got ", mask.scalar_type(), " instead."); + TORCH_CHECK(mask.dim() == 2, 
"Padding mask should be 2D"); + TORCH_CHECK(t.dim() == 3, "Input should be a 3D tensor, N * L * D"); + auto N = t.size(0), L = t.size(1), D = t.size(2); + auto NN = mask.size(0), LL = mask.size(1); + TORCH_CHECK(N == NN && L == LL, "Mask size should match input size"); + + // N * L + Tensor sizes = mask; + Tensor tmp_pad = at::zeros({N, 1}, mask.options()); + // Make sure padding is only added at the end of mask + Tensor nums = at::cat({sizes, tmp_pad}, 1).to(kInt).argmin(1); + + // N, ([size1, size2, ... sizeN]) + sizes = sizes.cumsum(1).select(1, L - 1); + nums = nums.to(sizes.options()); + + TORCH_CHECK(sizes.equal(nums), "Mask must be left-aligned without gaps"); + + sizes = sizes.reshape({N, 1}); + // N, ([d1=D, d2=D, ... dN=D]) + Tensor d = at::full_like(sizes, D); + + // N * 2, ([[size1, D], [size2, D], ..., [sizeN, D]]) + sizes = at::cat({sizes, d}, 1); + + return at::_nested_from_padded(t, sizes, false); +} + +Tensor nested_tensor( + TensorList list, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + TensorOptions options_ = + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); + + if (list.size() == 0) { + return wrap_buffer(ones({0}, dtype, layout, device), ones({})); + } + std::vector sizes; + std::vector flat_tensors; + for (const auto i : c10::irange(list.size())) { + if (i > 0) { + int64_t dim_i = list[i].dim(); + int64_t dim_prev = list[i - 1].dim(); + TORCH_CHECK( + dim_i == dim_prev, + "All Tensors given to nested_tensor must have the same dimension. ", + "Found dimension ", + dim_i, + " for Tensor at index ", + i, + " and dimension ", + dim_prev, + " for Tensor at index ", + i - 1, + "."); + } + // TODO: Remove call to contiguous once we support strides. 
+ flat_tensors.push_back(list[i].reshape(-1).contiguous()); + sizes.push_back(tensor(c10::IntArrayRef(list[i].sizes()))); + } + + TensorOptions options = flat_tensors[0].options().merge_in(options_); + + return wrap_buffer( + at::cat(flat_tensors).to(options), at::native::stack(sizes)); +} + +int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt) { + auto result = maybe_get_consistent_last_dim_of_nested_tensor(nt); + TORCH_CHECK( + result.has_value(), + "all tensors in NestedTensor must have the same trailing dim for Matmul but got ", + nt.get_nested_size_tensor().select(1, -1)); + return *result; +} + +std::vector NestedTensor_get_max_size(const NestedTensorImpl& nt) { + return NestedTensor_get_max_size_from_size_tensor(nt.get_nested_size_tensor()); +} + +Tensor NestedTensor_layer_norm( + const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + double eps) { + TORCH_CHECK(weight_opt && bias_opt, "NestedTensor layer_norm requires weight and bias"); + const auto& weight = *weight_opt; + const auto& bias = *bias_opt; + TORCH_CHECK(!weight.is_nested(), "NestedTensor weight not supported for layer_norm"); + TORCH_CHECK(!bias.is_nested(), "NestedTensor bias not supported for layer_norm"); + auto* nt_input = get_nested_tensor_impl(input); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_input)); + const auto& input_buffer = nt_input->get_buffer(); + const auto last_dim = get_consistent_last_dim_of_nested_tensor(*nt_input); + const auto valid_word_num = input_buffer.numel() / last_dim; + const auto weight_contig = weight.expect_contiguous(); + const auto bias_contig = bias.expect_contiguous(); + auto output_buffer = at::native::empty_like( + input_buffer, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + at::MemoryFormat::Contiguous); + auto options = input_buffer.options(); + if (input_buffer.is_cuda()) { + auto acc_type = at::toAccumulateType(input_buffer.scalar_type(), true); + options = options.dtype(acc_type); + } + Tensor mean = at::empty({valid_word_num}, options); + Tensor rstd = at::empty({valid_word_num}, options); + LayerNormKernel( + input_buffer.is_cuda() ? 
kCUDA : kCPU, + input_buffer, + *weight_contig, + *bias_contig, + valid_word_num, + last_dim, + eps, + &output_buffer, + &mean, + &rstd); + return at::detail::make_tensor( + std::move(output_buffer), nt_input->get_nested_size_tensor()); +} + +Tensor NestedTensor_from_padded_and_nested_example( + const Tensor& padded, + const Tensor& nt_example) { + return _nested_from_padded(padded, get_nested_tensor_impl(nt_example)->get_nested_size_tensor()); +} + +Tensor nested_from_padded_generic( + const Tensor& padded, + const Tensor& sizes, + const bool do_transform_0213) { + // Check and do transform 0213 + auto padded_transformed = padded; + if (do_transform_0213) { + padded_transformed = padded.permute({0, 2, 1, 3}) + .contiguous() + .view( + {padded.size(0), + padded.size(2), + padded.size(1) * padded.size(3)}); + } + const auto target_size = NestedTensor_get_max_size_from_size_tensor(sizes); + IntArrayRef target_size_arr(target_size); + std::vector masks; + std::vector all_sizes = sizes.unbind(); + for (const auto& size : all_sizes) { + IntArrayRef sizes_i( + size.data_ptr(), size.data_ptr() + size.numel()); + at::Tensor mask_i = padded_transformed.new_full( + sizes_i, true, kBool, c10::nullopt, c10::nullopt, c10::nullopt); + masks.push_back(pad_tensor_to_shape(mask_i, target_size_arr)); + } + at::Tensor final_mask = at::stack(masks); + at::Tensor new_buffer = padded_transformed.masked_select(final_mask); + return at::detail::make_tensor( + std::move(new_buffer), sizes); +} + +Tensor NestedTensor_to_padded_tensor_generic( + const Tensor& t, + double padding, + OptionalIntArrayRef output_size) { + // TODO: skipped optimization for case of all 1x1 tensors + auto& nt = *get_nested_tensor_impl(t); + auto max_size = NestedTensor_get_max_size(nt); + auto sizes = nt.get_nested_size_tensor(); + + if (sizes.numel() == 0 || sizes.dim() == 0) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(nt.get_buffer().numel() == 0); + return nt.get_buffer(); + } + + // TODO: doesn't handle empty/scalar entries because we don't need + // it for transformers; see to_padded_tensor in + // pytorch/nestedtensor's masking.cpp. 
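
to_padded_tensor materializes a NestedTensor as a regular dense tensor, padding every component up to the per-dimension maximum (or up to an explicit output_size). With the bindings added in this diff (torch.nested_tensor and the Tensor.to_padded_tensor method; later releases expose the same functionality under torch.nested), the behaviour looks roughly like:

import torch

nt = torch.nested_tensor([torch.ones(2, 3), torch.ones(4, 3)])   # binding name per this diff
padded = nt.to_padded_tensor(0.0)
print(padded.shape)            # torch.Size([2, 4, 3]); the short component is zero-padded
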
+ + const auto sizes_num_rows = sizes.sizes()[0]; + const auto sizes_num_columns = sizes.sizes()[1]; + const auto sizes_data_start = sizes.data_ptr(); + const auto sizes_data_end = sizes_data_start + sizes.numel(); + std::vector split_sizes; + split_sizes.reserve(sizes_num_rows); + for (auto sizes_data = sizes_data_start; sizes_data != sizes_data_end; + sizes_data += sizes_num_columns) { + split_sizes.push_back( + num_bytes(IntArrayRef(sizes_data, sizes_num_columns))); + } + std::vector nonzero_split_sizes; + for (const auto split_size : split_sizes) { + if (split_size > 0) { + nonzero_split_sizes.push_back(split_size); + } + } + const auto buffer = nt.get_buffer(); + std::vector buffers_; + if (!nonzero_split_sizes.empty()) { + buffers_ = at::split_with_sizes(buffer, nonzero_split_sizes, 0); + } + + std::vector buffers; + buffers.reserve(split_sizes.size()); + int64_t next_buffer = 0; + auto sizes_ptr = sizes_data_start; + for (const auto split_size : split_sizes) { + Tensor to_pad; + IntArrayRef tensor_sizes(sizes_ptr, sizes_num_columns); + if (split_size > 0) { + to_pad = buffers_[next_buffer++].reshape(tensor_sizes); + } else { + to_pad = at::empty(tensor_sizes, buffer.options()); + } + buffers.push_back(pad_tensor_to_shape(to_pad, max_size, padding)); + sizes_ptr += sizes_num_columns; + } + auto ret_val = at::stack(buffers); + + // Pad output tensor to output_size if provided + if (output_size.has_value()) { + auto output_size_ = output_size.value(); + TORCH_CHECK( + (int64_t)output_size_.size() == ret_val.dim(), + "Length of output_size does not match NestedTensor dims. Broadcasting is not supported."); + for (int64_t i = 0; i < (int64_t)ret_val.dim(); i++) { + TORCH_CHECK( + output_size_[i] >= ret_val.size(i), + "Value in output_size is less than NestedTensor padded size. 
Truncation is not supported."); + } + return pad_tensor_to_shape(ret_val, output_size_, padding); + } + return ret_val; +} + +Tensor NestedTensor_embedding( + const Tensor& weight, + const Tensor& indices, + int64_t padding_idx, + bool scale_grad_by_freq, + bool sparse) { + const auto* nt_indices = get_nested_tensor_impl(indices); + TORCH_CHECK( + !weight.is_nested(), "NestedTensor weight not supported for embedding"); + TORCH_CHECK(indices.dim() < 3); + TORCH_CHECK(indices.dim() > 0, "NestedTensor embedding doesn't support empty indices.") + TORCH_CHECK(weight.dim() == 2); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_indices)); + TORCH_CHECK(weight.is_contiguous()); + + const auto& indices_buffer = nt_indices->get_buffer(); + auto result_buffer = at::embedding( + weight, indices_buffer, padding_idx, scale_grad_by_freq, sparse); + const auto& sizes = nt_indices->get_nested_size_tensor(); + auto new_sizes = at::empty({sizes.size(0)}, sizes.options()); + new_sizes.fill_(weight.sizes()[1]); + new_sizes = new_sizes.reshape({new_sizes.size(0), 1}); + new_sizes = at::cat({sizes, new_sizes}, 1); + return at::detail::make_tensor( + result_buffer.reshape({-1}), std::move(new_sizes)); +} + +std::pair +get_elementwise_nested_tensor_impl( + const Tensor& self, + const Tensor& other, + const std::string& op_name) { + if (self.is_nested() && !(other.is_nested())) { + TORCH_CHECK( + false, + "Expected both self and other to be nested, but got a nested self and non-nested other"); + } else if (!(self.is_nested()) && other.is_nested()) { + TORCH_CHECK( + false, + "Expected both self and other to be nested, but got a non-nested self and nested other"); + } else if (!(self.is_nested()) || !(other.is_nested())) { + TORCH_CHECK( + false, + "Expected both self and other to be nested, but got a non-nested self and non-nested other"); + } + + auto self_ptr = get_nested_tensor_impl(self); + auto other_ptr = get_nested_tensor_impl(other); + + TORCH_CHECK( + self.dim() == other.dim(), + op_name, + " does not support broadcasting when given a NestedTensor"); + TORCH_CHECK( + at::equal( + self_ptr->get_nested_size_tensor(), + other_ptr->get_nested_size_tensor()), + op_name, + " does not support broadcasting when given a NestedTensor"); + TORCH_CHECK( + nested_tensor_impl_is_contiguous(self_ptr) && + nested_tensor_impl_is_contiguous(other_ptr), + op_name, + " does not support non-contiguous NestedTensor inputs"); + return std::make_pair(self_ptr, other_ptr); +} + +template +Tensor NestedTensor_elementwise_Tensor( + const Tensor& self, + const Tensor& other, + const std::string& op_name, + Func f) { + NestedTensorImpl* self_impl = nullptr; + NestedTensorImpl* other_impl = nullptr; + std::tie(self_impl, other_impl) = + get_elementwise_nested_tensor_impl(self, other, op_name); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl); + const auto& nt_self = *self_impl; + const auto& nt_other = *other_impl; + const auto& self_sizes = nt_self.get_nested_size_tensor(); + return wrap_buffer( + f(nt_self.get_buffer().reshape({-1}), + nt_other.get_buffer().reshape({-1})), + self_sizes); +} + +Tensor NestedTensor_add_Tensor( + const Tensor& self, + const Tensor& other, + const Scalar& alpha) { + return NestedTensor_elementwise_Tensor( + self, other, "add", [alpha](const Tensor& b1, const Tensor& b2) { + return at::add(b1, b2, alpha); + }); +} + +Tensor NestedTensor_mul_Tensor(const Tensor& self, const Tensor& other) { + return NestedTensor_elementwise_Tensor( + self, other, "mul", 
[](const Tensor& b1, const Tensor& b2) { + return at::mul(b1, b2); + }); +} + +template +Tensor& NestedTensor_elementwise__Tensor( + Tensor& self, + const Tensor& other, + const std::string& op_name, + Func f) { + NestedTensorImpl* self_impl = nullptr; + NestedTensorImpl* other_impl = nullptr; + std::tie(self_impl, other_impl) = + get_elementwise_nested_tensor_impl(self, other, op_name); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl); + const auto& nt_self = *self_impl; + const auto& nt_other = *other_impl; + f(nt_self.get_buffer().view({-1}), nt_other.get_buffer().view({-1})); + return self; +} + +Tensor& NestedTensor_add__Tensor( + Tensor& self, + const Tensor& other, + const Scalar& alpha) { + return NestedTensor_elementwise__Tensor( + self, other, "add_", [alpha](const Tensor& b1, const Tensor& b2) { + return b1.add_(b2, alpha); + }); +} + +Tensor& NestedTensor_mul__Tensor(Tensor& self, const Tensor& other) { + return NestedTensor_elementwise__Tensor( + self, other, "mul_", [](const Tensor& b1, const Tensor& b2) { + return b1.mul_(b2); + }); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorMath.h b/aten/src/ATen/native/nested/NestedTensorMath.h new file mode 100644 index 000000000000..8f2919fc35b8 --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorMath.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +#include + +namespace at { +namespace native { +struct NestedTensorImpl; + +// TODO: cache this and only do it once per NestedTensor +int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt); + +TORCH_API std::vector NestedTensor_get_max_size(const NestedTensorImpl& nt); + +TORCH_API Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding, OptionalIntArrayRef output_size); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp new file mode 100644 index 000000000000..6ca4ff7e22a7 --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp @@ -0,0 +1,142 @@ +#include + +#include +#include +#include +#include + +namespace at { +namespace native { + +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { + auto* nt_self = get_nested_tensor_impl_or_null(self); + TORCH_CHECK(nt_self != nullptr); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_self)); + TORCH_CHECK(self.dim() == 3 && other.dim() == 2); + const auto last_dim = get_consistent_last_dim_of_nested_tensor(*nt_self); + TORCH_CHECK( + last_dim == other.sizes()[0], + "shape mismatch for NestedTensor matmul. NestedTensor last_dim: ", + last_dim, + " vs. first dim of rhs: ", + other.sizes()[0]); + const Tensor& self_buffer = nt_self->get_buffer(); + Tensor result_buffer = + at::mm(self_buffer.reshape({-1, other.sizes()[0]}), other); + result_buffer = result_buffer.reshape({-1}); + int64_t other_size_1 = other.sizes()[1]; + Tensor new_sizes = nt_self->get_nested_size_tensor().clone(); + // Now the last entry in every row of new_sizes should be other_size_1. 
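
NestedTensor_matmul handles a nested (B x var x D) lhs times a dense (D x E) rhs with a single mm over the packed buffer, then rewrites the last column of the size table to E. Assuming matmul with a nested lhs and dense rhs is dispatched to this kernel, usage looks roughly like:

import torch

nt = torch.nested_tensor([torch.randn(2, 4), torch.randn(5, 4)])   # consistent last dim
w = torch.randn(4, 8)
out = torch.matmul(nt, w)                    # still nested; components are (2, 8) and (5, 8)
print([c.shape for c in out.unbind()])
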
+ new_sizes.index_put_({at::indexing::Slice(), -1}, other_size_1); + return at::detail::make_tensor( + std::move(result_buffer), std::move(new_sizes)); +} + +Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + c10::optional use_gelu) { + // Interesting case: alpha * NT * T + beta * T + const auto* nt_mat1 = get_nested_tensor_impl_or_null(mat1); + TORCH_INTERNAL_ASSERT(nt_mat1 != nullptr); + TORCH_INTERNAL_ASSERT(!mat2.is_nested()); + TORCH_INTERNAL_ASSERT(!self.is_nested()); + TORCH_INTERNAL_ASSERT(nested_tensor_impl_is_contiguous(nt_mat1)); + TORCH_INTERNAL_ASSERT(mat1.dim() == 3 && mat2.dim() == 2); + TORCH_INTERNAL_ASSERT( + get_consistent_last_dim_of_nested_tensor(*nt_mat1) == mat2.sizes()[0]); + const Tensor& mat1_buffer = nt_mat1->get_buffer(); + Tensor result_buffer = !use_gelu.has_value() + ? at::addmm( + self, mat1_buffer.reshape({-1, mat2.sizes()[0]}), mat2, beta, alpha) + : at::_addmm_activation( + self, + mat1_buffer.reshape({-1, mat2.sizes()[0]}), + mat2, + beta, + alpha, + *use_gelu); + result_buffer = result_buffer.reshape({-1}); + int64_t other_size_1 = mat2.sizes()[1]; + Tensor new_sizes = nt_mat1->get_nested_size_tensor().clone(); + new_sizes.index_put_({at::indexing::Slice(), -1}, other_size_1); + return at::detail::make_tensor( + std::move(result_buffer), std::move(new_sizes)); +} + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other) { + TORCH_INTERNAL_ASSERT(self.is_nested() && other.is_nested()); + const auto& nt_self = *get_nested_tensor_impl(self); + const auto& nt_other = *get_nested_tensor_impl(other); + + const auto& self_sizes = nt_self.get_nested_size_tensor(); + const auto& other_sizes = nt_other.get_nested_size_tensor(); + + TORCH_CHECK(at::equal(self_sizes, other_sizes)); + TORCH_INTERNAL_ASSERT( + nested_tensor_impl_is_contiguous(&nt_self) && + nested_tensor_impl_is_contiguous(&nt_other)); + nt_self.get_buffer().view({-1}).add_(nt_other.get_buffer().view({-1})); + return self; +} + +Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements) { + int64_t* const sizes_ptr = sizes.data_ptr(); + Tensor offsets = at::empty({1 + sizes.size(0) + extra_elements}, at::kInt); + int32_t* const offsets_ptr = offsets.data_ptr(); + offsets_ptr[0] = 0; + const auto sizes_size_1 = sizes.size(1); + const auto sizes_size_0 = sizes.size(0); + for (const auto i : c10::irange(sizes_size_0)) { + int64_t prod = 1; + for (const auto j : c10::irange(sizes_size_1)) { + prod *= sizes_ptr[i * sizes_size_1 + j]; + } + offsets_ptr[i + 1] = offsets_ptr[i] + prod; + } + return offsets; +} + +Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim) { + auto* nt_impl = get_nested_tensor_impl(nt); + TORCH_CHECK( + !mask_dim || *mask_dim < nt.dim(), + "Requested mask dimension ", + *mask_dim, + " is bigger than dimension ", + nt.dim(), + " of given NestedTensor."); + + // TODO: port optimization for 1x1 tensors from + // pytorch/nestedtensor's version. + + TORCH_CHECK( + mask_dim && *mask_dim == 2 && nt.dim() == 3, + "Only the special case of mask_dim == 2 on a 3-D NestedTensor is supported right now.") + const auto& sizes = nt_impl->get_nested_size_tensor(); + // Shape: # of tensors in our NestedTensor by max size along first dim + // TODO: calculate this without allocating a std::vector. 
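
The key-padding mask built here starts out all True (everything treated as padding) and then clears the first length positions of each row. A small CPU mirror of that logic, for illustration only:

import torch

def to_mask(lengths, max_len):
    mask = torch.ones(len(lengths), max_len, dtype=torch.bool)
    for row, length in enumerate(lengths):
        mask[row, :length] = False       # real (non-padding) positions become False
    return mask

print(to_mask([2, 3], 3))
# tensor([[False, False,  True],
#         [False, False, False]])
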
+ const auto result_size_1 = NestedTensor_get_max_size(*nt_impl)[0]; + auto result = at::ones({sizes.sizes()[0], result_size_1}, at::kBool); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(sizes.dim() == 2); + auto* result_data = result.data_ptr(); + auto* sizes_ptr = sizes.data_ptr(); + const auto sizes_size_1 = sizes.sizes()[1]; + for (const auto ii : c10::irange(sizes.sizes()[0])) { + auto length = sizes_ptr[ii * sizes_size_1]; + for (const auto jj : c10::irange(length)) { + result_data[ii * result_size_1 + jj] = false; + } + } + return result; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h new file mode 100644 index 000000000000..a4b70d954c3f --- /dev/null +++ b/aten/src/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -0,0 +1,87 @@ +/** + * Transformer-specific NestedTensor utility functions. + * + * Not co-located with NestedTensor core code yet because they only + * support specific cases needed in transformers. + */ +#pragma once + +#include + +#include +#include + +namespace c10 { +class Scalar; +} // namespace c10 + +namespace at { +class Tensor; +namespace native { +struct NestedTensorImpl; + +// Requires that self is a contiguous NestedTensor, other is not a +// NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self +// must have a consistent last dimension across its included Tensors +// and that dimension must match other.size(0). +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other); + +// Requires that mat1 is a contiguous NestedTensor, self & mat2 are +// not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1 +// has a consistent last dimension across its included Tensors that +// matches mat2.size(0). 
+Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + c10::optional use_gelu = c10::nullopt); + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other); + +TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements); + +Tensor NestedTensor_from_padded_tensor_cpu( + const Tensor& padded, + const NestedTensorImpl& nt); + +Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim); + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void add_padding_kernelLauncher( + T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp new file mode 100644 index 000000000000..f1cf67676ced --- /dev/null +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -0,0 +1,206 @@ +#include + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +#include + +namespace at { +namespace native { +namespace { +int64_t padded_tensor_numel(const Tensor& sizes) { + const auto sizes_num_rows = sizes.sizes()[0]; + const auto sizes_row_length = sizes.sizes()[1]; + const auto* sizes_data = sizes.data_ptr(); + int64_t numel = 0; + for (const auto row_num : c10::irange(sizes_num_rows)) { + const auto* row_ptr = sizes_data + row_num * sizes_row_length; + int64_t prod = 1; + for (const auto idx : c10::irange(sizes_row_length)) { + prod *= row_ptr[idx]; + } + numel += prod; + } + return numel; +} +} // namespace +Tensor nested_from_padded_cuda( + const Tensor& padded, + const Tensor& sizes, + bool do_transform_0213) { + if (padded.dim() > 1 && padded.dim() < 5) { + if (padded.dtype() != kFloat && padded.dtype() != kHalf) { + TORCH_WARN_ONCE( + "nested_from_padded CUDA kernels only support fp32/fp16; falling " + "back to slower generic kernel"); + return at::native::nested_from_padded_generic(padded, sizes, do_transform_0213); + } + TORCH_CHECK( + (padded.dim() == 4 && do_transform_0213) || + (padded.dim() == 3 && !do_transform_0213), + "padded tensor size error"); + Tensor target_offsets = + NestedTensor_batch_offsets_from_size_tensor(sizes, 0); + Tensor padded_sizes_tensor = at::tensor(padded.sizes()); + Tensor output = at::empty({padded_tensor_numel(sizes)}, padded.options()); + Tensor target_size_sizes = sizes.reshape(-1); + + Tensor metadata = + at::cat({target_size_sizes, padded_sizes_tensor, target_offsets}); + metadata = metadata.to(at::Device(kCUDA), kInt, true, true); + + auto output_size_ptr = metadata.data_ptr(); + auto input_size_ptr = output_size_ptr + target_size_sizes.numel(); + auto offsets_ptr = input_size_ptr + padded_sizes_tensor.numel(); + + if (padded.dtype() == kFloat) { + if (do_transform_0213) { + 
remove_padding_transform0213_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 2, + padded.sizes()[0]); + } else { + remove_padding_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 1, + padded.sizes()[0]); + } + } else if (padded.dtype() == kHalf) { + if (do_transform_0213) { + remove_padding_transform0213_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 2, + padded.sizes()[0]); + } else { + remove_padding_kernelLauncher( + padded.data_ptr(), + output.data_ptr(), + offsets_ptr, + input_size_ptr, + output_size_ptr, + padded.dim() - 1, + padded.sizes()[0]); + } + } else { + AT_ERROR("Only support fp32/fp16 for padded input"); + } + return at::detail::make_tensor(std::move(output), sizes); + } else { + return at::native::nested_from_padded_generic(padded, sizes); + } +} + +Tensor batch_offsets_from_efficient_size(const Tensor& ef_sizes) { + int64_t* nt_sizes_ptr = ef_sizes.data_ptr(); + int64_t ef_sizes_size_0 = ef_sizes.sizes()[0]; + Tensor offsets = at::empty({1 + ef_sizes_size_0}, at::kLong); + int64_t* offsets_ptr = offsets.data_ptr(); + offsets_ptr[0] = 0; + int64_t ef_sizes_size_1 = ef_sizes.sizes()[1]; + for (const auto i : c10::irange(ef_sizes_size_0)) { + int64_t prod = 1; + for (const auto j : c10::irange(ef_sizes_size_1)) { + prod = prod * nt_sizes_ptr[i * ef_sizes_size_1 + j]; + } + offsets_ptr[i + 1] = offsets_ptr[i] + prod; + } + return offsets; +} + +Tensor NestedTensor_to_padded_tensor_cuda( + const Tensor& t, + double padding, + OptionalIntArrayRef output_size) { + int64_t t_dim = t.dim(); + if (t_dim >= 2 && t_dim <= 4 && + (t.dtype() == at::kFloat || t.dtype() == at::kDouble || + t.dtype() == at::kHalf)) { + auto* nt_input = get_nested_tensor_impl(t); + TORCH_CHECK(nested_tensor_impl_is_contiguous(nt_input)); + const auto& nt_buffer = nt_input->get_buffer(); + + if (t_dim == 3 && nt_input->opt_size(2) && (*nt_input->opt_size(2) > 0) && + !(output_size.has_value())) { + Tensor nt_sizes = nt_input->get_nested_size_tensor(); + Tensor sizes_dim1 = at::native::narrow(nt_sizes, 1, 0, 1); + Tensor sizes_dim2 = at::native::narrow(nt_sizes, 1, 1, 1); + Tensor result = at::detail::make_tensor( + nt_input->get_buffer(), sizes_dim1 * sizes_dim2[0]); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.dim() == 2); + result = + NestedTensor_to_padded_tensor_cuda(result, padding, output_size); + return result.reshape({result.sizes()[0], -1, *nt_input->opt_size(2)}); + } + + Tensor nt_sizes = nt_input->get_nested_size_tensor(); + Tensor offsets = batch_offsets_from_efficient_size(nt_sizes); + auto new_size = NestedTensor_get_max_size(*nt_input); + new_size.insert(new_size.begin(), nt_sizes.sizes()[0]); + + // Pad output tensor to output_size if provided + if (output_size.has_value()) { + auto output_size_ = output_size.value(); + TORCH_CHECK( + output_size_.size() == new_size.size(), + "Length of output_size does not match NestedTensor dims. Broadcasting is not supported."); + for (uint64_t i = 0; i < new_size.size(); i++) { + TORCH_CHECK( + output_size_[i] >= new_size[i], + "Value in output_size is less than NestedTensor padded size. 
Truncation is not supported."); + new_size[i] = output_size_[i]; + } + } + + Tensor output = at::empty(IntArrayRef(new_size), nt_buffer.options()); + + int64_t input_dim = nt_sizes.sizes()[1]; + int64_t batch_size = nt_sizes.sizes()[0]; + int64_t output_batch_size = new_size[0]; + // TODO: Remove need for cat here + at::Tensor metadata = at::cat({offsets, nt_sizes.reshape(-1)}); + metadata = metadata.to(at::Device(kCUDA), at::kInt); + + std::vector split = + at::split_with_sizes(metadata, {offsets.numel(), nt_sizes.numel()}, 0); + + offsets = split[0]; + nt_sizes = split[1]; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + nt_buffer.scalar_type(), "NestedTensor_to_padded_tensor_cuda", [&]() { + add_padding_kernelLauncher( + nt_buffer.data_ptr(), + output.data_ptr(), + (scalar_t)(padding), + offsets.data_ptr(), + nt_sizes.data_ptr(), + input_dim, + new_size, + batch_size, + output_batch_size); + }); + return output; + } + return NestedTensor_to_padded_tensor_generic(t, padding, output_size); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu new file mode 100644 index 000000000000..7e9f95aad747 --- /dev/null +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu @@ -0,0 +1,449 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +namespace at { +namespace native { + +template +__global__ void remove_padding_transform0213_2( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int offset = offsets[batch_id]; + const int* sizes_i = output_sizes + batch_id * output_dim; + const int numel_i = sizes_i[0] * sizes_i[1]; + int input_offset = + batch_id * input_sizes[1] * input_sizes[2] * input_sizes[3]; + for (int ii = 0; ii < (numel_i / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i2 = i / sizes_i[1]; + const int i13 = i % sizes_i[1]; + const int i1 = i13 / (sizes_i[1] / input_sizes[1]); + const int i3 = i13 % (sizes_i[1] / input_sizes[1]); + + output[offset + i] = input + [input_offset + i1 * input_sizes[2] * input_sizes[3] + + i2 * input_sizes[3] + i3]; + } + const int i = (numel_i / grainsize) * grainsize + tid; + if (i < numel_i) { + const int i2 = i / sizes_i[1]; + const int i13 = i % sizes_i[1]; + const int i1 = i13 / (sizes_i[1] / input_sizes[1]); + const int i3 = i13 % (sizes_i[1] / input_sizes[1]); + output[offset + i] = input + [input_offset + i1 * input_sizes[2] * input_sizes[3] + + i2 * input_sizes[3] + i3]; + } +} + +template +__global__ void remove_padding_2( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int offset = offsets[batch_id]; + const int* sizes_i = output_sizes + batch_id * output_dim; + const int numel_i = sizes_i[0] * sizes_i[1]; + int input_offset = batch_id * input_sizes[1] * input_sizes[2]; + for (int ii = 0; ii < (numel_i / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int 
i0 = i / sizes_i[1]; + const int i1 = i % sizes_i[1]; + const int i0_offset = i0 * input_sizes[2]; + output[offset + i] = input[input_offset + i0_offset + i1]; + } + const int i = (numel_i / grainsize) * grainsize + tid; + if (i < numel_i) { + const int i0 = i / sizes_i[1]; + const int i1 = i % sizes_i[1]; + const int i0_offset = i0 * input_sizes[2]; + output[offset + i] = input[input_offset + i0_offset + i1]; + } +} + +template +__global__ void remove_padding( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int offset = offsets[batch_id]; + const int* sizes_i = output_sizes + batch_id * output_dim; + const int numel_i = sizes_i[0] * sizes_i[1] * sizes_i[2]; + int input_offset = + batch_id * input_sizes[1] * input_sizes[2] * input_sizes[3]; + for (int ii = 0; ii < (numel_i / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i0 = i / (sizes_i[1] * sizes_i[2]); + const int i1 = (i % (sizes_i[1] * sizes_i[2])) / sizes_i[2]; + const int i2 = i % sizes_i[2]; + const int i0_offset = i0 * input_sizes[2] * input_sizes[3]; + const int i1_offset = i1 * input_sizes[3]; + output[offset + i] = input[input_offset + i0_offset + i1_offset + i2]; + } + const int i = (numel_i / grainsize) * grainsize + tid; + if (i < numel_i) { + const int i0 = i / (sizes_i[1] * sizes_i[2]); + const int i1 = (i % (sizes_i[1] * sizes_i[2])) / sizes_i[2]; + const int i2 = i % sizes_i[2]; + const int i0_offset = i0 * input_sizes[2] * input_sizes[3]; + const int i1_offset = i1 * input_sizes[3]; + output[offset + i] = input[input_offset + i0_offset + i1_offset + i2]; + } +} + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + dim3 grid; + grid.x = batch_size; + grid.y = 16; + at::cuda::CUDAStream stream = at::cuda::getDefaultCUDAStream(); + if (output_dim == 2) { + remove_padding_2<<>>( + input, + output, + offsets, + input_sizes, + output_sizes, + output_dim, + batch_size); + } else { + remove_padding<<>>( + input, + output, + offsets, + input_sizes, + output_sizes, + output_dim, + batch_size); + } +} + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size) { + dim3 grid; + grid.x = batch_size; + grid.y = 16; + at::cuda::CUDAStream stream = at::cuda::getDefaultCUDAStream(); + TORCH_CHECK( + output_dim == 2, + "remove padding transform0213 only support output dim == 2"); + + remove_padding_transform0213_2<<>>( + input, + output, + offsets, + input_sizes, + output_sizes, + output_dim, + batch_size); +} + +template void remove_padding_kernelLauncher( + const float* input, + float* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template void remove_padding_kernelLauncher( + const c10::Half* input, + c10::Half* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template void remove_padding_transform0213_kernelLauncher( + const float* input, + float* output, + const int* offsets, + const int* input_sizes, 
+ const int* output_sizes, + int output_dim, + const int batch_size); + +template void remove_padding_transform0213_kernelLauncher( + const c10::Half* input, + c10::Half* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +__global__ void add_padding_1( + const T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + int output_sizes_1, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int* sizes_i = input_sizes + batch_id * input_dim; + const int batch_output_offset = batch_id * output_sizes_1; + for (int ii = 0; ii < (output_sizes_1 / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int output_offset = batch_output_offset + i; + if (batch_id < batch_size && i < sizes_i[0]) { + const int batch_input_offset = offsets[batch_id]; + output[output_offset] = input[batch_input_offset + i]; + } else { + output[output_offset] = padding_value; + } + } + const int i = (output_sizes_1 / grainsize) * grainsize + tid; + if (i < output_sizes_1) { + const int output_offset = batch_output_offset + i; + if (batch_id < batch_size && (i < sizes_i[0])) { + const int batch_input_offset = offsets[batch_id]; + output[output_offset] = input[batch_input_offset + i]; + } else { + output[output_offset] = padding_value; + } + } +} + +template +__global__ void add_padding_2( + const T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + int output_sizes_1, + int output_sizes_2, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int* sizes_i = input_sizes + batch_id * input_dim; + const int output_offset = batch_id * output_sizes_1 * output_sizes_2; + const int output_numel = output_sizes_1 * output_sizes_2; + for (int ii = 0; ii < (output_numel / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i0 = i / (output_sizes_2); + const int i1 = i - i0 * output_sizes_2; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1]) { + const int offset = offsets[batch_id]; + const int input_offset = offset + i0 * sizes_i[1] + i1; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } + const int i = (output_numel / grainsize) * grainsize + tid; + if (i < output_numel) { + const int i0 = i / (output_sizes_2); + const int i1 = i - i0 * output_sizes_2; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1]) { + const int offset = offsets[batch_id]; + const int input_offset = offset + i0 * sizes_i[1] + i1; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } +} + +template +__global__ void add_padding_3( + const T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + int output_sizes_1, + int output_sizes_2, + int output_sizes_3, + const int batch_size) { + const int batch_id = blockIdx.x; + const int grid_id = blockIdx.y; + const int tid = threadIdx.x + grid_id * 256; + const int grainsize = 16 * 256; + const int* sizes_i = input_sizes + batch_id * input_dim; + const int output_offset = + batch_id * output_sizes_1 * output_sizes_2 * output_sizes_3; + 
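
The add_padding_* kernels share one pattern: each block handles a single batch entry, copies that entry's values out of the packed buffer (located via offsets) into its slot of the dense output, and writes padding_value everywhere else. A CPU reference for the 2-D case, for illustration only:

import torch

def add_padding_2d(buffer, sizes, out_rows, out_cols, padding_value=0.0):
    # buffer: packed values of all components; sizes: (batch, 2) true shapes per component.
    out = buffer.new_full((sizes.size(0), out_rows, out_cols), padding_value)
    offset = 0
    for i in range(sizes.size(0)):
        r, c = int(sizes[i, 0]), int(sizes[i, 1])
        out[i, :r, :c] = buffer[offset:offset + r * c].view(r, c)
        offset += r * c
    return out
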
const int output_numel = output_sizes_1 * output_sizes_2 * output_sizes_3; + for (int ii = 0; ii < (output_numel / grainsize); ii++) { + const int i = ii * grainsize + tid; + const int i0 = i / (output_sizes_2 * output_sizes_3); + const int i1 = (i % (output_sizes_2 * output_sizes_3)) / output_sizes_3; + const int i2 = i % output_sizes_3; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1] && i2 < sizes_i[2]) { + const int offset = offsets[batch_id]; + const int input_offset = + offset + i0 * (sizes_i[1] * sizes_i[2]) + i1 * sizes_i[2] + i2; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } + const int i = (output_numel / grainsize) * grainsize + tid; + if (i < output_numel) { + const int i0 = i / (output_sizes_2 * output_sizes_3); + const int i1 = (i % (output_sizes_2 * output_sizes_3)) / output_sizes_3; + const int i2 = i % output_sizes_3; + if (batch_id < batch_size && i0 < sizes_i[0] && i1 < sizes_i[1] && i2 < sizes_i[2]) { + const int offset = offsets[batch_id]; + const int input_offset = + offset + i0 * (sizes_i[1] * sizes_i[2]) + i1 * sizes_i[2] + i2; + output[output_offset + i] = input[input_offset]; + } else { + output[output_offset + i] = padding_value; + } + } +} + +template +void add_padding_kernelLauncher( + T* input, // [batch_size x None] + T* output, // [batch_size x max(input.nested_size(1)) x inner_size] + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size) { + at::cuda::CUDAStream stream = at::cuda::getDefaultCUDAStream(); + dim3 grid; + grid.x = output_batch_size; + grid.y = 16; + if (input_dim == 1) { + add_padding_1<<>>( + input, + output, + padding_value, + offsets, + input_sizes, + input_dim, + output_sizes[1], + batch_size); + } + if (input_dim == 2) { + add_padding_2<<>>( + input, + output, + padding_value, + offsets, + input_sizes, + input_dim, + output_sizes[1], + output_sizes[2], + batch_size); + } + if (input_dim == 3) { + add_padding_3<<>>( + input, + output, + padding_value, + offsets, + input_sizes, + input_dim, + output_sizes[1], + output_sizes[2], + output_sizes[3], + batch_size); + } +} + +template void add_padding_kernelLauncher( + double* input, + double* output, + double padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +template void add_padding_kernelLauncher( + float* input, + float* output, + float padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +template void add_padding_kernelLauncher( + c10::Half* input, + c10::Half* output, + c10::Half padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/Copy.cpp b/aten/src/ATen/native/quantized/Copy.cpp index ac3f5e9783d0..e3b6bd8cd669 100644 --- a/aten/src/ATen/native/quantized/Copy.cpp +++ b/aten/src/ATen/native/quantized/Copy.cpp @@ -13,7 +13,7 @@ namespace native { // This means that assignment of a non-contiguous quantized subtensor is currently not supported in pytorch // e.g., Consider a 2x2 quantized tensor qt1 and a non-quantized tensor t2. 
The operation // `qt1[:, 0] = t2[:, 0]` would trigger the exception b/c neither the LHS nor RHS is contiguous -Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src) { +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src) { TORCH_CHECK( src.scalar_type() == at::kFloat, "Quantized copy only works with kFloat as source Tensor"); @@ -23,21 +23,14 @@ Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src) { TORCH_CHECK( self.sizes().equals(src.sizes()), "Quantized copy only works with Tensors with the same shape"); - TORCH_CHECK( - self.device().type() == kCPU, - "Quantized copy only works with QuantizedCPU Tensors"); AT_DISPATCH_QINT_TYPES(self.scalar_type(), "Copy", [&]() { - if (self.qscheme() == kPerChannelAffine) { + if (self.qscheme() == kPerChannelAffine || self.qscheme() == kPerChannelAffineFloatQParams + || self.qscheme() == kPerChannelSymmetric) { quantize_tensor_per_channel_affine(src, self, self.q_per_channel_scales(), self.q_per_channel_zero_points(), self.q_per_channel_axis()); } else { - float* src_data = src.data_ptr(); - scalar_t* self_data = self.data_ptr(); - for (const auto i : c10::irange(self.numel())) { - self_data[i] = quantize_val( - self.q_scale(), self.q_zero_point(), src_data[i]); - } + quantize_tensor_per_tensor_affine(src, self, self.q_scale(), self.q_zero_point()); } }); return self; diff --git a/aten/src/ATen/native/quantized/Copy.h b/aten/src/ATen/native/quantized/Copy.h index 65dabd24f1f3..d52c8ff0fb2c 100644 --- a/aten/src/ATen/native/quantized/Copy.h +++ b/aten/src/ATen/native/quantized/Copy.h @@ -5,6 +5,6 @@ namespace at { namespace native { -Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src); +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src); } } // namespace at diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 5fefa3557f4b..6e858a3b5c25 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -15,8 +15,11 @@ Tensor quantize_per_tensor_dynamic( const Tensor& self, ScalarType dtype, bool reduce_range) { - TORCH_CHECK( (dtype == ScalarType::QInt8 || dtype == ScalarType::QUInt8), "dtype ", dtype, "not supported"); + TORCH_CHECK( (dtype == ScalarType::QInt8 || dtype == ScalarType::QUInt8 || dtype == ScalarType::Half), "dtype ", dtype, "not supported"); auto input_contig = self.contiguous(); + if (dtype == ScalarType::Half) { + return input_contig.to(ScalarType::Half); + } float x_min = input_contig.min().item(); float x_max = input_contig.max().item(); diff --git a/aten/src/ATen/native/quantized/TensorCompare.cpp b/aten/src/ATen/native/quantized/TensorCompare.cpp index 839068b28ec2..08a104257f4e 100644 --- a/aten/src/ATen/native/quantized/TensorCompare.cpp +++ b/aten/src/ATen/native/quantized/TensorCompare.cpp @@ -35,12 +35,5 @@ std::tuple sort_quantized_cpu_stable( sort_indicies); } -std::tuple sort_quantized_cpu( - const Tensor& self, - int64_t dim, - bool descending) { - return sort_quantized_cpu_stable(self, /*stable=*/false, dim, descending); -} - } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/TensorFactories.cpp b/aten/src/ATen/native/quantized/TensorFactories.cpp index 08a972eacc38..aa0fef5df9dc 100644 --- a/aten/src/ATen/native/quantized/TensorFactories.cpp +++ b/aten/src/ATen/native/quantized/TensorFactories.cpp @@ -66,6 +66,40 @@ Tensor empty_per_channel_affine_quantized( quantizer); } +Tensor empty_unknown_quantized( + 
IntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory, + c10::optional optional_memory_format) { + // See [Note: hacky wrapper removal for TensorOptions] + TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + + TORCH_CHECK( + !(options_.has_memory_format() && optional_memory_format.has_value()), + "Cannot set memory_format both in TensorOptions and explicit argument; please delete " + "the redundant setter."); + auto options = options_.merge_memory_format(optional_memory_format); + TORCH_CHECK( + options.has_dtype(), + "Must provide data type for Tensor creation functions."); + QuantizerPtr quantizer = make_unknown_quantizer(typeMetaToScalarType(options.dtype())); + return new_qtensor(size, options, quantizer); +} + +Tensor empty_strided_unknown_quantized( + IntArrayRef size, + IntArrayRef strided, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + + TORCH_CHECK(false, "empty_strided not supported on quantized tensors yet see https://github.com/pytorch/pytorch/issues/74540") + +} + // Provide better error message if dtype is wrong Tensor empty_affine_quantized_other_backends_stub( IntArrayRef, diff --git a/aten/src/ATen/native/quantized/affine_quantizer_base.cpp b/aten/src/ATen/native/quantized/affine_quantizer_base.cpp index dc58f609f7a7..c99c81226ff5 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer_base.cpp +++ b/aten/src/ATen/native/quantized/affine_quantizer_base.cpp @@ -115,12 +115,13 @@ T quantize_val(double scale, int64_t zero_point, float value) { return static_cast(qvalue); } -uint8_t quantize_val_arm( +template +T quantize_val_arm( const float scale, const int32_t zero_point, const float value) { - constexpr int32_t qmin = std::numeric_limits::min(); - constexpr int32_t qmax = std::numeric_limits::max(); + constexpr int32_t qmin = std::numeric_limits::min(); + constexpr int32_t qmax = std::numeric_limits::max(); float inv_scale = 1.0f / scale; #ifndef _MSC_VER auto r = static_cast(Round(value * inv_scale)); @@ -135,7 +136,7 @@ uint8_t quantize_val_arm( #endif r = std::max(r, qmin); r = std::min(r, qmax); - return static_cast(r); + return static_cast(r); } template @@ -151,6 +152,14 @@ void quantize_vec( } } +template uint8_t quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); +template int8_t quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); template TORCH_API float dequantize_val(double scale, int64_t zero_point, T value) { return static_cast(scale) * (value.val_ - static_cast(zero_point)); diff --git a/aten/src/ATen/native/quantized/affine_quantizer_base.h b/aten/src/ATen/native/quantized/affine_quantizer_base.h index 9e6a9ff58d24..31526c3ec3c5 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer_base.h +++ b/aten/src/ATen/native/quantized/affine_quantizer_base.h @@ -10,7 +10,8 @@ template TORCH_API T quantize_val(double scale, int64_t zero_point, float value); // TODO combine this with quantize_val once the numerics for ARM are aligned // with it -uint8_t quantize_val_arm( +template +T quantize_val_arm( const float scale, const int32_t zero_point, const float value); diff --git a/aten/src/ATen/native/quantized/cpu/conv_packed_params.h b/aten/src/ATen/native/quantized/cpu/conv_packed_params.h deleted file mode 100644 index 130be6a0724d..000000000000 --- a/aten/src/ATen/native/quantized/cpu/conv_packed_params.h 
+++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include -#include - -template -struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { - virtual at::Tensor apply( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) = 0; - virtual at::Tensor apply_relu( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) = 0; - virtual at::Tensor apply_dynamic( - const at::Tensor& input, - bool reduce_range) = 0; - - virtual std::tuple> unpack() = 0; - - virtual torch::List stride() const = 0; - virtual torch::List padding() const = 0; - virtual torch::List output_padding() const = 0; - virtual torch::List dilation() const = 0; - virtual int64_t groups() const = 0; - virtual bool transpose() const = 0; -}; diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h index cf5c04977b6a..369f54b43961 100644 --- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h +++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -358,6 +359,20 @@ c10::intrusive_ptr> deserialize_conv( ); } #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // AT_MKLDNN_ENABLED() TORCH_CHECK( false, "Didn't find engine for when deserializing ConvPackedParams: ", diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index ab6df06f7b73..da6064f9ddbc 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -1,10 +1,10 @@ #include -#include +#include #include #include #include -#include #include +#include #include #include #include @@ -160,9 +160,10 @@ Tensor MakeStridedQTensorCPU( allocator->allocate(size_bytes), allocator, /* resizable = */ true); + constexpr auto quantized_cpu_ks = at::DispatchKeySet(at::DispatchKey::QuantizedCPU); auto tensor = detail::make_tensor( storage, - at::DispatchKeySet(at::DispatchKey::QuantizedCPU), + quantized_cpu_ks, dtype, quantizer); get_qtensorimpl(tensor)->set_sizes_and_strides(sizes, strides); @@ -471,6 +472,16 @@ int register_linear_params() { std::move(weight), std::move(bias)); } #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (at::globalContext().qEngine() == at::QEngine::ONEDNN) { + TORCH_CHECK( + weight.scalar_type() == at::kQInt8, + "ONEDNN only supports INT8 bit width currently. 
Got ", + c10::toString(weight.scalar_type())); + return PackedLinearWeightsOnednn::prepack( + std::move(weight), std::move(bias)); + } +#endif // #if AT_MKLDNN_ENABLED() TORCH_CHECK(false, "Unknown qengine"); }) .def("bias", [](const c10::intrusive_ptr& self) { @@ -543,9 +554,9 @@ int register_embedding_params() { namespace { -static auto conv2d_params = register_conv_params<2>(); -static auto conv3d_params = register_conv_params<3>(); -static auto linear_params = register_linear_params(); -static auto embedding_params = register_embedding_params(); +static C10_UNUSED auto conv2d_params = register_conv_params<2>(); +static C10_UNUSED auto conv3d_params = register_conv_params<3>(); +static C10_UNUSED auto linear_params = register_linear_params(); +static C10_UNUSED auto embedding_params = register_embedding_params(); } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index 4282bb34dd43..c98ef18ec85c 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -1,9 +1,8 @@ #pragma once #include -#include +#include #include -#include #include #include @@ -100,15 +99,15 @@ struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { c10::optional bias_; at::Tensor apply( - at::Tensor input, - double output_scale, - int64_t output_zero_point) override { + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { TORCH_INTERNAL_ASSERT(false); } at::Tensor apply_relu( - at::Tensor input, - double output_scale, - int64_t output_zero_point) override { + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { TORCH_INTERNAL_ASSERT(false); } diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 23afea3e52ce..a42eeeac2234 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -1,13 +1,15 @@ #include #include #include -#include +#include +#include #include #include #include #include #include #include +#include #include #include @@ -104,69 +106,93 @@ Tensor qcat_nhwc_kernel( // which causes an internal compiler error if they're not AT_DISPATCH_QINT_TYPES(output.scalar_type(), "qcat_nhwc", [&, N, H, W]() { using Vec = Vectorized; - for (const auto batch : c10::irange(N)) { - for (const auto row : c10::irange(H)) { - for (const auto col : c10::irange(W)) { - // loop over input tensors - for (const auto tidx : c10::irange(Cs_in.size())) { - scalar_t::underlying* optr = - reinterpret_cast(output.data_ptr()) + - batch * H * W * C_out + row * W * C_out + col * C_out + - Cs_sum[tidx]; - - auto curr_C = Cs_in[tidx]; - float curr_scale = scales[tidx]; - int64_t curr_zero_pt = zero_pts[tidx]; - - scalar_t::underlying* iptr = - reinterpret_cast(data_ptrs[tidx]) + - batch * H * W * curr_C + row * W * curr_C + col * curr_C; - - constexpr int64_t VLEN = Vec::size(); - int64_t c = 0; + at::parallel_for(0, N * H * W, 0, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + // loop over input tensors + for (const auto tidx : c10::irange(Cs_in.size())) { + scalar_t::underlying* optr = + reinterpret_cast(output.data_ptr()) + + i * C_out + Cs_sum[tidx]; + + auto curr_C = Cs_in[tidx]; + float curr_scale = scales[tidx]; + int64_t curr_zero_pt = zero_pts[tidx]; + + 
scalar_t::underlying* iptr = + reinterpret_cast(data_ptrs[tidx]) + + i * curr_C; + + constexpr int64_t VLEN = Vec::size(); + int64_t c = 0; - // Vectorized loop - if (c + VLEN <= curr_C) { - auto curr_scale_vec = Vectorized(curr_scale); - auto curr_zero_pt_vec = Vectorized((float)curr_zero_pt); - auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg(); - for (; c + VLEN <= curr_C; c += VLEN) { - auto inp_vec = Vec::loadu(iptr + c); - auto float_values = inp_vec.dequantize( - curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul); - Vec::float_vec_return_type retvals; - for (int i = 0; i < Vec::float_num_vecs(); ++i) { - if (ReLUFused) { - retvals[i] = - vec::maximum(float_values[i], Vectorized(0.0f)); - } else { - retvals[i] = float_values[i]; - } + // Vectorized loop + if (c + VLEN <= curr_C) { + auto curr_scale_vec = Vectorized(curr_scale); + auto curr_zero_pt_vec = Vectorized((float)curr_zero_pt); + auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg(); + for (; c + VLEN <= curr_C; c += VLEN) { + auto inp_vec = Vec::loadu(iptr + c); + auto float_values = inp_vec.dequantize( + curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul); + Vec::float_vec_return_type retvals; + for (int i = 0; i < Vec::float_num_vecs(); ++i) { + if (ReLUFused) { + retvals[i] = + vec::maximum(float_values[i], Vectorized(0.0f)); + } else { + retvals[i] = float_values[i]; } - auto quantized = - Vec::quantize(retvals, scale, zero_point, inv_scale); - quantized.store(optr + c); } + auto quantized = + Vec::quantize(retvals, scale, zero_point, inv_scale); + quantized.store(optr + c); } + } - // Scalar loop - for (; c < curr_C; ++c) { - auto float_val = at::native::dequantize_val( - curr_scale, - curr_zero_pt, - reinterpret_cast(iptr)[c]); + // Vectorized loop for channel between 8 and 32 (avx2) + constexpr int kVLEN = Vectorized::size(); + int64_t elem_size = curr_C - c; + if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) { + auto curr_scale_vec = Vectorized(curr_scale); + auto curr_zero_pt_vec = Vectorized((float)curr_zero_pt); + auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg(); + int64_t vec_num = elem_size / kVLEN; + std::array buf_in; + memcpy(buf_in.data(), iptr + c, vec_num * kVLEN); + auto inp_vec = Vec::loadu(buf_in.data()); + auto float_values = inp_vec.dequantize( + curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul); + Vec::float_vec_return_type retvals; + for (int i = 0; i < vec_num; ++i) { if (ReLUFused) { - float_val = std::max(0.0f, float_val); + retvals[i] = + vec::maximum(float_values[i], Vectorized(0.0f)); + } else { + retvals[i] = float_values[i]; } - optr[c] = at::native::quantize_val( - scale, zero_point, float_val) - .val_; - } // for c - - } // for tidx - } // for col - } // for row - } // for b + } + auto quantized = + Vec::quantize(retvals, scale, zero_point, inv_scale); + quantized.store(optr + c, vec_num * kVLEN); + c += vec_num * kVLEN; + } + + // Scalar loop + for (; c < curr_C; ++c) { + auto float_val = at::native::dequantize_val( + curr_scale, + curr_zero_pt, + reinterpret_cast(iptr)[c]); + if (ReLUFused) { + float_val = std::max(0.0f, float_val); + } + optr[c] = at::native::quantize_val( + scale, zero_point, float_val) + .val_; + } // for c + } // for tidx + } // for i + }); }); return output; @@ -615,7 +641,7 @@ static void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx, }); } -void qgelu_kernel(const Tensor& qx, Tensor& qy) { +void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) { int64_t zero_point = qx.q_zero_point(); 
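The new GeluType::Tanh branch below applies the tanh approximation of GELU between the dequantize and requantize steps. As a minimal sketch of just the per-element math (the quantization steps are omitted; the constant 0.7978845608... equals M_SQRT2 * M_2_SQRTPI * 0.5 = sqrt(2/pi), matching the kernel's kBeta):

    #include <cmath>

    // Tanh-approximated GELU, as used by the quantized kernel's Tanh branch.
    inline float gelu_tanh_approx(float x) {
      const float kBeta = 0.7978845608028654f;  // sqrt(2/pi)
      const float kKappa = 0.044715f;
      const float inner = kBeta * (x + kKappa * x * x * x);
      return 0.5f * x * (1.0f + std::tanh(inner));
    }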
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) float scale = qx.q_scale(); @@ -626,40 +652,83 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy) { float output_scale = scale; float inv_output_scale = 1.0 / output_scale; const auto kAlphaVec = Vectorized(M_SQRT1_2); + const auto kBetaVec = Vectorized(M_SQRT2 * M_2_SQRTPI * 0.5); + const auto kKappaVec = Vectorized(0.044715); const auto kOneVec = Vectorized(1); const auto kPointFiveVec = Vectorized(0.5); - AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { - qy = at::_empty_affine_quantized( - qx.sizes(), - // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) - at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), - output_scale, - output_zero_point, - c10::nullopt); - auto iter = TensorIterator::unary_op(qy, qx); - - using Vec = Vectorized; - cpu_kernel_vec( - iter, - [&](scalar_t value_qx) -> scalar_t { - const auto value_dx = - at::native::dequantize_val(scale, zero_point, value_qx); - const auto value_dy = - value_dx * 0.5 * (1 + std::erf(value_dx * M_SQRT1_2)); - return at::native::quantize_val( - output_scale, output_zero_point, value_dy); - }, - [&](Vec value_qx) -> Vec { - auto value_dx = value_qx.dequantize( - scale_vec, zero_point_vec, scale_neg_zp_premul_vec); - for (auto & value : value_dx) { - value = value * kPointFiveVec * (kOneVec + (value * kAlphaVec).erf()); - } - return Vec::quantize( - value_dx, output_scale, output_zero_point, inv_output_scale); - }); - }); + if (approximate == GeluType::Tanh) { + AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { + qy = at::_empty_affine_quantized( + qx.sizes(), + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + auto iter = TensorIterator::unary_op(qy, qx); + + using Vec = Vectorized; + cpu_kernel_vec( + iter, + [&](scalar_t value_qx) -> scalar_t { + const auto value_dx = + at::native::dequantize_val(scale, zero_point, value_qx); + + const auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const auto kKappa = 0.044715; + const auto x_cube = value_dx * value_dx * value_dx; + const auto inner = kBeta * (value_dx + kKappa * x_cube); + const auto value_dy = 0.5 * value_dx * (1.0 + std::tanh(inner)); + + return at::native::quantize_val( + output_scale, output_zero_point, value_dy); + }, + [&](Vec value_qx) -> Vec { + auto value_dx = value_qx.dequantize( + scale_vec, zero_point_vec, scale_neg_zp_premul_vec); + for (auto & value : value_dx) { + auto value_cube = value * value * value; + auto inner = kBetaVec * (value + kKappaVec * value_cube); + value = kPointFiveVec * value * (kOneVec + inner.tanh()); + } + return Vec::quantize( + value_dx, output_scale, output_zero_point, inv_output_scale); + }); + }); + } else { + AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { + qy = at::_empty_affine_quantized( + qx.sizes(), + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + auto iter = TensorIterator::unary_op(qy, qx); + + using Vec = Vectorized; + cpu_kernel_vec( + iter, + [&](scalar_t value_qx) -> scalar_t { + const auto value_dx = + at::native::dequantize_val(scale, zero_point, value_qx); + const auto value_dy = + value_dx * 0.5 * (1 + std::erf(value_dx * M_SQRT1_2)); + return at::native::quantize_val( + output_scale, output_zero_point, 
value_dy); + }, + [&](Vec value_qx) -> Vec { + auto value_dx = value_qx.dequantize( + scale_vec, zero_point_vec, scale_neg_zp_premul_vec); + for (auto & value : value_dx) { + value = value * kPointFiveVec * (kOneVec + (value * kAlphaVec).erf()); + } + return Vec::quantize( + value_dx, output_scale, output_zero_point, inv_output_scale); + }); + }); + } } @@ -1314,87 +1383,85 @@ void qmaxpool_2d_nhwc_kernel( scalar_t* idata = static_cast(qx.data_ptr()); scalar_t* odata = static_cast(qy.data_ptr()); - // Loop over N - for (const auto b : c10::irange(qx.size(0))) { - // Loop over H - auto* i_p = - reinterpret_cast(idata + b * iW * iH * iC); - for (const auto row : c10::irange(oH)) { - // Loop over W - for (const auto col : c10::irange(oW)) { - // Pointer to output data for this specific N,H,W position - auto* o_p = reinterpret_cast( - odata + b * oH * oW * iC + row * oW * iC + col * iC); - - // Loop over reduction block - int64_t h_start = row * sH - pH; - int64_t w_start = col * sW - pW; - int64_t h_end = std::min(h_start + (kH - 1) * dH + 1, iH); - int64_t w_end = std::min(w_start + (kW - 1) * dW + 1, iW); - while (h_start < 0) - h_start += dH; - while (w_start < 0) - w_start += dW; - - int64_t c = 0; - - // Interleaved vector loop 4x - constexpr auto vec_width = Vectorized::size(); - for (; c + 4 * vec_width <= iC; c += 4 * vec_width) { - Vectorized acc{ - scalar_t(std::numeric_limits::lowest())}; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) - Vectorized accs[4] = {acc, acc, acc, acc}; - int64_t tcntr = 0; - int64_t x, y; - for (y = h_start; y < h_end; y += dH) { - for (x = w_start; x < w_end; x += dW) { - for (const auto i : c10::irange(4)) { - tcntr = y * iW + x; - auto vals = Vectorized::loadu( - i_p + tcntr * iC + c + Vectorized::size() * i); - accs[i] = vec::maximum(accs[i], vals); - } - } // for x - } // for y - for (const auto i : c10::irange(4)) { - accs[i].store(o_p + c + Vectorized::size() * i); - } - } // for c - - // Vector loop - for (; c + vec_width <= iC; c += vec_width) { - Vectorized acc{ - scalar_t(std::numeric_limits::lowest())}; - int64_t tcntr = 0; - int64_t x, y; - for (y = h_start; y < h_end; y += dH) { - for (x = w_start; x < w_end; x += dW) { + int64_t nBatch = qx.size(0); + at::parallel_for(0, nBatch * oH * oW, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, row{0}, col{0}; + data_index_init(begin, b, nBatch, row, oH, col, oW); + + for (const auto i : c10::irange(begin, end)) { + auto* i_p = reinterpret_cast(idata + b * iW * iH * iC); + auto* o_p = reinterpret_cast(odata + i * iC); + + // Loop over reduction block + int64_t h_start = row * sH - pH; + int64_t w_start = col * sW - pW; + int64_t h_end = std::min(h_start + (kH - 1) * dH + 1, iH); + int64_t w_end = std::min(w_start + (kW - 1) * dW + 1, iW); + while (h_start < 0) + h_start += dH; + while (w_start < 0) + w_start += dW; + + int64_t c = 0; + + // Interleaved vector loop 4x + constexpr auto vec_width = Vectorized::size(); + for (; c + 4 * vec_width <= iC; c += 4 * vec_width) { + Vectorized acc{ + scalar_t(std::numeric_limits::lowest())}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) + Vectorized accs[4] = {acc, acc, acc, acc}; + int64_t tcntr = 0; + int64_t x, y; + for (y = h_start; y < h_end; y += dH) { + for (x = w_start; x < w_end; x += dW) { + for (const auto i : c10::irange(4)) { tcntr = y * iW + x; - auto vals = Vectorized::loadu(i_p + tcntr * iC + c); - acc = vec::maximum(acc, vals); - } // for x - } // for y - acc.store(o_p + c); 
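The maxpool refactor above flattens the (batch, output row, output column) loops into a single range so at::parallel_for can split it into chunks; data_index_init recovers the starting coordinates for a chunk and data_index_step advances them. This standalone stand-in (assumed behavior, not the ATen helpers themselves) shows the bookkeeping involved:

    #include <cstdint>

    // Decompose a flat index over [0, B*H*W) into (b, h, w) once per chunk...
    inline void index_init(int64_t flat, int64_t& b, int64_t B,
                           int64_t& h, int64_t H, int64_t& w, int64_t W) {
      w = flat % W; flat /= W;
      h = flat % H; flat /= H;
      b = flat % B;
    }

    // ...then advance the coordinates like an odometer on each iteration.
    inline void index_step(int64_t& b, int64_t B,
                           int64_t& h, int64_t H, int64_t& w, int64_t W) {
      if (++w == W) {
        w = 0;
        if (++h == H) {
          h = 0;
          if (++b == B) b = 0;
        }
      }
    }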
- } // for c - - for (; c < iC; ++c) { - auto max_val = std::numeric_limits::lowest(); - int64_t tcntr = 0; - int64_t x, y; - for (y = h_start; y < h_end; y += dH) { - for (x = w_start; x < w_end; x += dW) { - tcntr = y * iW + x; - auto val = *(i_p + tcntr * iC + c); - max_val = std::max(max_val, val); - } // for x - } // for y - - o_p[c] = max_val; - } // for c - } // for col - } // for row - } // for b + auto vals = Vectorized::loadu( + i_p + tcntr * iC + c + Vectorized::size() * i); + accs[i] = vec::maximum(accs[i], vals); + } + } // for x + } // for y + for (const auto i : c10::irange(4)) { + accs[i].store(o_p + c + Vectorized::size() * i); + } + } // for c + + // Vector loop + for (; c + vec_width <= iC; c += vec_width) { + Vectorized acc{ + scalar_t(std::numeric_limits::lowest())}; + int64_t tcntr = 0; + int64_t x, y; + for (y = h_start; y < h_end; y += dH) { + for (x = w_start; x < w_end; x += dW) { + tcntr = y * iW + x; + auto vals = Vectorized::loadu(i_p + tcntr * iC + c); + acc = vec::maximum(acc, vals); + } // for x + } // for y + acc.store(o_p + c); + } // for c + + for (; c < iC; ++c) { + auto max_val = std::numeric_limits::lowest(); + int64_t tcntr = 0; + int64_t x, y; + for (y = h_start; y < h_end; y += dH) { + for (x = w_start; x < w_end; x += dW) { + tcntr = y * iW + x; + auto val = *(i_p + tcntr * iC + c); + max_val = std::max(max_val, val); + } // for x + } // for y + + o_p[c] = max_val; + } // for c + + data_index_step(b, nBatch, row, oH, col, oW); + } + }); }); } @@ -1751,9 +1818,6 @@ void _qavg_pool_nhwc_kernel( int istrideH = strideW * inputWidth; int istrideD = istrideH * inputHeight; int istrideB = istrideD * inputDepth; - int ostrideH = strideW * outputWidth; - int ostrideD = ostrideH * outputHeight; - int ostrideB = ostrideD * outputDepth; // lift these operations outside the loop to reduce access overheads float input_scale = qx.q_scale(); @@ -1763,85 +1827,81 @@ void _qavg_pool_nhwc_kernel( int64_t divisor_override_factor = divisor_override.has_value() ? divisor_override.value() : 0; - at::parallel_for(0, nBatch, 0, [&](int64_t batch_start, int64_t batch_end) { - for (int64_t b = batch_start; b < batch_end; ++b) { - auto* i_p = - reinterpret_cast(idata + b * istrideB); - for (int od = 0; od < outputDepth; od++) { - for (int oh = 0; oh < outputHeight; oh++) { - for (int ow = 0; ow < outputWidth; ow++) { - auto* o_p = reinterpret_cast( - odata + b * ostrideB + od * ostrideD + oh * ostrideH + - ow * strideW); - int dstart = od * dD - padD; - int hstart = oh * dH - padH; - int wstart = ow * dW - padW; - - int dend = std::min(dstart + kD, (int)inputDepth + padD); - int hend = std::min(hstart + kH, (int)inputHeight + padH); - int wend = std::min(wstart + kW, (int)inputWidth + padW); - int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, (int)inputDepth); - hend = std::min(hend, (int)inputHeight); - wend = std::min(wend, (int)inputWidth); - - int size = (dend - dstart) * (hend - hstart) * (wend - wstart); - int divide_size = count_include_pad ? pool_size : size; - int divide_factor = - divisor_override_factor ? 
divisor_override_factor : divide_size; - float multiplier = input_scale / output_scale / divide_factor; - int input_zero_point_m_size = -input_zero_point * size; - - int c_start = 0; - - // For int8 quantization, we implicitly use int32 as accumulation - // Or else, it will go to the slow path - // TODO: support 16bit, 32bit, and etc. - do_avg_pool_nhwc_on_AVX_n( - i_p, - o_p, - c_start, - input_zero_point_m_size, - output_zero_point, - multiplier, - dstart, - dend, - hstart, - hend, - wstart, - wend, - inputDepth, - inputHeight, - inputWidth, - nInputPlane); - - // 1) The following loop handles the remaining channels - // 2) It also handles the Non-AVX2 path - for (int c = c_start; c < nInputPlane; ++c) { - int32_t acc_int32 = input_zero_point_m_size; - for (int64_t id = dstart; id < dend; id++) { - for (int64_t ih = hstart; ih < hend; ih++) { - for (int64_t iw = wstart; iw < wend; iw++) { - auto val = - *(i_p + id * istrideD + ih * istrideH + iw * strideW + - c * strideC); - acc_int32 += val; - } - } - } - double acc_fp = acc_int32 * 1.0; - // clamp - o_p[c] = at::native::quantize_val( - 1.0f / multiplier, output_zero_point, acc_fp) - .val_; - } // c - } // ow - } // oh - } // od + at::parallel_for(0, nBatch * outputDepth * outputHeight * outputWidth, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, od{0}, oh{0}, ow{0}; + data_index_init(begin, b, nBatch, od, outputDepth, oh, outputHeight, ow, outputWidth); + + for (const auto i : c10::irange(begin, end)) { + auto* i_p = reinterpret_cast(idata + b * istrideB); + auto* o_p = reinterpret_cast(odata + i * strideW); + int dstart = od * dD - padD; + int hstart = oh * dH - padH; + int wstart = ow * dW - padW; + + int dend = std::min(dstart + kD, (int)inputDepth + padD); + int hend = std::min(hstart + kH, (int)inputHeight + padH); + int wend = std::min(wstart + kW, (int)inputWidth + padW); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, (int)inputDepth); + hend = std::min(hend, (int)inputHeight); + wend = std::min(wend, (int)inputWidth); + + int size = (dend - dstart) * (hend - hstart) * (wend - wstart); + int divide_size = count_include_pad ? pool_size : size; + int divide_factor = + divisor_override_factor ? divisor_override_factor : divide_size; + float multiplier = input_scale / output_scale / divide_factor; + int input_zero_point_m_size = -input_zero_point * size; + + int c_start = 0; + + // For int8 quantization, we implicitly use int32 as accumulation + // Or else, it will go to the slow path + // TODO: support 16bit, 32bit, and etc. 
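The average-pool accumulator is seeded with -input_zero_point * size, so after adding `size` raw values it holds sum(x_i - in_zp), and the division by the pool size is folded into the requantization multiplier. An illustrative restatement (rounding approximates quantize_val; names are local to this sketch):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // acc must already equal sum(x_i - in_zp) for the pooled window.
    inline uint8_t requantize_avg_pool(int32_t acc,
                                       int size,
                                       float input_scale,
                                       float output_scale,
                                       int32_t output_zero_point) {
      const float multiplier = input_scale / output_scale / size;
      const long r = output_zero_point + std::lround(acc * multiplier);
      return static_cast<uint8_t>(std::min<long>(255, std::max<long>(0, r)));
    }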
+ do_avg_pool_nhwc_on_AVX_n( + i_p, + o_p, + c_start, + input_zero_point_m_size, + output_zero_point, + multiplier, + dstart, + dend, + hstart, + hend, + wstart, + wend, + inputDepth, + inputHeight, + inputWidth, + nInputPlane); + + // 1) The following loop handles the remaining channels + // 2) It also handles the Non-AVX2 path + for (const auto c: c10::irange(c_start, nInputPlane)) { + int32_t acc_int32 = input_zero_point_m_size; + for (const auto id : c10::irange(dstart, dend)) { + for (const auto ih : c10::irange(hstart, hend)) { + for (const auto iw : c10::irange(wstart, wend)) { + auto val = + *(i_p + id * istrideD + ih * istrideH + iw * strideW + + c * strideC); + acc_int32 += val; + } + } + } + double acc_fp = acc_int32 * 1.0; + // clamp + o_p[c] = at::native::quantize_val( + 1.0f / multiplier, output_zero_point, acc_fp) + .val_; + } // c + + data_index_step(b, nBatch, od, outputDepth, oh, outputHeight, ow, outputWidth); } }); } @@ -2019,88 +2079,90 @@ void qupsample_bilinear2d_nhwc_kernel( bool align_corners, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_QINT_TYPES( - input.scalar_type(), "upsample_bilinear2d_nhwc", [&]() { - auto* idata = static_cast(input.data_ptr()); - auto* odata = static_cast(output.data_ptr()); - float inverse_scale = output.q_scale() / input.q_scale(); - const auto rheight = area_pixel_compute_scale( - input_height, output_height, align_corners, scales_h); - const auto rwidth = area_pixel_compute_scale( - input_width, output_width, align_corners, scales_w); - - const int64_t input_q_zero_point = input.q_zero_point(); - const int64_t output_q_zero_point = output.q_zero_point(); - - for (const auto b : c10::irange(nbatch)) { - auto* i_p = reinterpret_cast( - idata + b * input_height * input_width * channels); - auto* o_p = reinterpret_cast( - odata + b * output_height * output_width * channels); - - for (const auto h2 : c10::irange(output_height)) { - const auto h1r = area_pixel_compute_source_index( - rheight, h2, align_corners, /*cubic=*/false); - - const int64_t h1 = h1r; - const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; - const float h1lambda = h1r - h1; - const float h0lambda = static_cast(1.) - h1lambda; - - for (const auto w2 : c10::irange(output_width)) { - const auto w1r = area_pixel_compute_source_index( - rwidth, w2, align_corners, /*cubic=*/false); - const int64_t w1 = w1r; - const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; - - const float w1lambda = w1r - w1; - const float w0lambda = static_cast(1.) - w1lambda; - - int64_t c = 0; - // We use float32 to do the computation - const typename scalar_t::underlying* pos1 = - i_p + (h1 * input_width + w1) * channels; - typename scalar_t::underlying* pos2 = - o_p + (h2 * output_width + w2) * channels; - // We have to isolate this function out because the VS does not - // expand the macro correctly. 
- c = do_quantized_bilinear_on_AVX_n( - pos1, - pos2, - input_height, - input_width, - output_height, - output_width, - channels, - output_q_zero_point, - input_q_zero_point, - inverse_scale, - h0lambda, - h1lambda, - w0lambda, - w1lambda, - h1p, - w1p); - // 1) The following loop handles the remaining channels - // 2) It also handles the Non-AVX2 path - for (; c < channels; ++c) { - float result = h0lambda * - (w0lambda * pos1[0] + w1lambda * pos1[w1p * channels]) + - h1lambda * - (w0lambda * pos1[h1p * input_width * channels] + - w1lambda * pos1[(h1p * input_width + w1p) * channels]); - pos2[0] = at::native::quantize_val( - inverse_scale, - output_q_zero_point, - result - input_q_zero_point) - .val_; - pos1 += 1; - pos2 += 1; - } // c - } // w2 - } // h2 - } // b - }); + AT_DISPATCH_QINT_TYPES(input.scalar_type(), "upsample_bilinear2d_nhwc", [&]() { + auto* idata = static_cast(input.data_ptr()); + auto* odata = static_cast(output.data_ptr()); + float inverse_scale = output.q_scale() / input.q_scale(); + const auto rheight = area_pixel_compute_scale( + input_height, output_height, align_corners, scales_h); + const auto rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + + auto input_q_zero_point = input.q_zero_point(); + auto output_q_zero_point = output.q_zero_point(); + at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, h2{0}, w2{0}; + data_index_init(begin, b, nbatch, h2, output_height, w2, output_width); + + for (const auto i : c10::irange(begin, end)) { + (void)i; //Suppress unused variable warning + auto* i_p = reinterpret_cast( + idata + b * input_height * input_width * channels); + auto* o_p = reinterpret_cast( + odata + b * output_height * output_width * channels); + + const auto h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); + + const int64_t h1 = h1r; + const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; + const float h1lambda = h1r - h1; + const float h0lambda = static_cast(1.) - h1lambda; + + const auto w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + const int64_t w1 = w1r; + const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; + + const float w1lambda = w1r - w1; + const float w0lambda = static_cast(1.) - w1lambda; + + int64_t c = 0; + // We use float32 to do the computation + const typename scalar_t::underlying* pos1 = + i_p + (h1 * input_width + w1) * channels; + typename scalar_t::underlying* pos2 = + o_p + (h2 * output_width + w2) * channels; + // We have to isolate this function out because the VS does not + // expand the macro correctly. 
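The bilinear weights used above come from a fractional source coordinate: its integer part picks the top-left tap, its fractional part becomes the blend weight of the opposite tap, and the tap offset is zeroed at the last row or column so the second tap never reads past the input. A hedged sketch of the row case (the column case is identical):

    #include <cstdint>

    // Derive the row taps and lambda weights from the fractional source row h1r.
    inline void bilinear_row_weights(float h1r, int64_t input_height,
                                     int64_t& h1, int64_t& h1p,
                                     float& h1lambda, float& h0lambda) {
      h1 = static_cast<int64_t>(h1r);            // top tap row
      h1p = (h1 < input_height - 1) ? 1 : 0;     // offset to the bottom tap
      h1lambda = h1r - h1;                       // weight of the bottom tap
      h0lambda = 1.0f - h1lambda;                // weight of the top tap
    }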
+ c = do_quantized_bilinear_on_AVX_n( + pos1, + pos2, + input_height, + input_width, + output_height, + output_width, + channels, + output_q_zero_point, + input_q_zero_point, + inverse_scale, + h0lambda, + h1lambda, + w0lambda, + w1lambda, + h1p, + w1p); + // 1) The following loop handles the remaining channels + // 2) It also handles the Non-AVX2 path + for (; c < channels; ++c) { + float result = h0lambda * + (w0lambda * pos1[0] + w1lambda * pos1[w1p * channels]) + + h1lambda * + (w0lambda * pos1[h1p * input_width * channels] + + w1lambda * pos1[(h1p * input_width + w1p) * channels]); + pos2[0] = at::native::quantize_val( + inverse_scale, + output_q_zero_point, + result - input_q_zero_point) + .val_; + pos1 += 1; + pos2 += 1; + } // c + + data_index_step(b, nbatch, h2, output_height, w2, output_width); + } + }); + }); } void qtopk_kernel(Tensor& values, @@ -2201,65 +2263,66 @@ void q_batch_norm_kernel( auto scale_neg_zp_premul = fake_scale * in_zp_vec.neg(); auto out_zero_point_v = Vec(scalar_t(out_zero_point)); const auto lanes = static_cast(Vec::float_num_vecs() * kVLen); - for (const auto i : c10::irange(outer_size)) { - auto* X_ptr = reinterpret_cast(X + i * C); - auto* Y_ptr = reinterpret_cast(Y + i * C); - int64_t ch = 0; - - for(; ch + lanes <= C; ch += lanes ) { - do_bn_compute( - X_ptr + ch, - Y_ptr + ch, - fake_scale, - in_zp_vec, - scale_neg_zp_premul, - out_zero_point, - out_zero_point_v, - alpha + ch, - beta + ch, - Vec::float_num_vecs(), - ReluFused, - kVLen - ); - } + at::parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { + for (const auto i : c10::irange(begin, end)) { + auto* X_ptr = reinterpret_cast(X + i * C); + auto* Y_ptr = reinterpret_cast(Y + i * C); + int64_t ch = 0; + + for(; ch + lanes <= C; ch += lanes) { + do_bn_compute( + X_ptr + ch, + Y_ptr + ch, + fake_scale, + in_zp_vec, + scale_neg_zp_premul, + out_zero_point, + out_zero_point_v, + alpha + ch, + beta + ch, + Vec::float_num_vecs(), + ReluFused, + kVLen + ); + } - // for channel between 8 and 32, still use 32 width for performance - // Benchmark shows it is faster than doing 8 channels each time - int64_t elem_size = C - ch; - if ((lanes == 32) && elem_size >= kVLen) { - int64_t vec_num = elem_size / kVLen; - std::vector buf_in(lanes); - memcpy(buf_in.data(), X_ptr + ch, vec_num * kVLen); // 3 cycles - do_bn_compute( - buf_in.data(), - Y_ptr + ch, - fake_scale, - in_zp_vec, - scale_neg_zp_premul, - out_zero_point, - out_zero_point_v, - alpha + ch, - beta + ch, - vec_num, - ReluFused, - kVLen - ); - ch += vec_num * kVLen; - } - // for channels less than 8 - for (; ch < C; ++ch) { - long quantized_down = out_zero_point + - lrintf(alpha[ch] * (X_ptr[ch] - in_zero_point) + - beta[ch]); - if (ReluFused) { // static if - quantized_down = std::max(quantized_down, out_zero_point); + // for channel between 8 and 32, still use 32 width for performance + // Benchmark shows it is faster than doing 8 channels each time + int64_t elem_size = C - ch; + if ((lanes == 32) && elem_size >= kVLen) { + int64_t vec_num = elem_size / kVLen; + std::vector buf_in(lanes); + memcpy(buf_in.data(), X_ptr + ch, vec_num * kVLen); // 3 cycles + do_bn_compute( + buf_in.data(), + Y_ptr + ch, + fake_scale, + in_zp_vec, + scale_neg_zp_premul, + out_zero_point, + out_zero_point_v, + alpha + ch, + beta + ch, + vec_num, + ReluFused, + kVLen + ); + ch += vec_num * kVLen; + } + // for channels less than 8 + for (; ch < C; ++ch) { + long quantized_down = out_zero_point + + lrintf(alpha[ch] * (X_ptr[ch] - in_zero_point) + + beta[ch]); 
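In the batch-norm scalar tail above and below, alpha and beta already fold the input and output scales, so each remaining channel needs one multiply-add, an optional ReLU against the output zero point, and a clamp to the quantized range. An illustrative scalar restatement (names local to this sketch):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    inline uint8_t qbatchnorm_scalar(uint8_t x, float alpha_c, float beta_c,
                                     int64_t in_zp, int64_t out_zp,
                                     int64_t qmin, int64_t qmax, bool relu_fused) {
      int64_t y = out_zp + std::lrintf(alpha_c * (x - in_zp) + beta_c);
      if (relu_fused) {
        y = std::max(y, out_zp);  // real 0 maps to the output zero point
      }
      return static_cast<uint8_t>(std::min(qmax, std::max(qmin, y)));
    }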
+ if (ReluFused) { // static if + quantized_down = std::max(quantized_down, out_zero_point); + } + Y_ptr[ch] = std::min( + std::max(quantized_down, minimum), maximum); } - Y_ptr[ch] = std::min( - std::max(quantized_down, minimum), maximum); } - } -}); - + }); + }); } void _fake_quantize_tensor_helper( @@ -2740,22 +2803,50 @@ void quantize_tensor_arm( } } +namespace quantize_tensor_arm_intrinsics { +template +C10_ALWAYS_INLINE Tx8 vqmov(int16x8_t vraw); + +template <> +C10_ALWAYS_INLINE uint8x8_t vqmov(int16x8_t vraw) { + return vqmovun_s16(vraw); +} + +template <> +C10_ALWAYS_INLINE int8x8_t vqmov(int16x8_t vraw) { + return vqmovn_s16(vraw); +} + +template +C10_ALWAYS_INLINE void vst1(T* out, Tx8 vout); + +template <> +C10_ALWAYS_INLINE void vst1(uint8_t* out, uint8x8_t vout) { + vst1_u8(out, vout); +} + +template <> +C10_ALWAYS_INLINE void vst1(int8_t* out, int8x8_t vout) { + vst1_s8(out, vout); +} +} // namespace quantize_tensor_arm_intrinsics + // Specialized implementation from caffe2::Int8Quantize. // There may be slight accuracy difference between this and implementation of // quantize_val // TODO Update quantize_tensor_arm implementation to follow quantize_val, // i.e. f = Round(value/scale + zero_point) -// TODO Make quantize_tensor_arm work for other datatypes too (int8, int32). -template <> -void quantize_tensor_arm( +// TODO Make quantize_tensor_arm work for int32 datatype too. +template +void quantize_tensor_arm_q8( const float* __restrict__ in, - c10::quint8* __restrict__ out, + scalar_t* __restrict__ out, const int64_t N, const float scale, const int32_t zero_point) { const float inv_scale = 1.0f / scale; uint32_t i = 0; - uint8_t* out_underlying = reinterpret_cast(out); + underlying_t* out_underlying = reinterpret_cast(out); const float32x4_t vinv_scale = vdupq_n_f32(inv_scale); #if defined(__ARM_NEON__) // magic float and magic int to take care of rounding @@ -2786,12 +2877,15 @@ void quantize_tensor_arm( vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale)))); const int16x8_t vraw01234567 = vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567)); - const uint8x8_t vout01234567 = vqmovun_s16(vraw01234567); - vst1_u8(out_underlying, vout01234567); + const underlying_x8_t vout01234567 = + quantize_tensor_arm_intrinsics::vqmov(vraw01234567); + quantize_tensor_arm_intrinsics::vst1( + out_underlying, vout01234567); out_underlying += 8; } for (; i < N; ++i) { - (*out_underlying++) = at::native::quantize_val_arm(scale, zero_point, (*in++)); + (*out_underlying++) = + at::native::quantize_val_arm(scale, zero_point, (*in++)); } #else const int16x8_t vzero_point = vdupq_n_s16((int16_t)(uint16_t)zero_point); @@ -2804,16 +2898,42 @@ void quantize_tensor_arm( const int32x4_t v4567_rounded = vcvtnq_s32_f32(vmulq_f32(vin4567, vinv_scale)); const int16x8_t v01234567_packed = vqaddq_s16( vqmovn_high_s32(vqmovn_s32(v0123_rounded), v4567_rounded), vzero_point); - const uint8x8_t vout01234567 = vqmovun_s16(v01234567_packed); - vst1_u8(out_underlying, vout01234567); + const underlying_x8_t vout01234567 = + quantize_tensor_arm_intrinsics::vqmov( + v01234567_packed); + quantize_tensor_arm_intrinsics::vst1( + out_underlying, vout01234567); out_underlying += 8; } for (; i < N; ++i) { - (*out_underlying++) = at::native::quantize_val_arm(scale, zero_point, (*in++)); + (*out_underlying++) = + at::native::quantize_val_arm(scale, zero_point, (*in++)); } #endif } +template <> +void quantize_tensor_arm( + const float* __restrict__ in, + c10::quint8* __restrict__ out, + const int64_t N, + const float 
scale, + const int32_t zero_point) { + quantize_tensor_arm_q8( + in, out, N, scale, zero_point); +} + +template <> +void quantize_tensor_arm( + const float* __restrict__ in, + c10::qint8* __restrict__ out, + const int64_t N, + const float scale, + const int32_t zero_point) { + quantize_tensor_arm_q8( + in, out, N, scale, zero_point); +} + #if defined(__aarch64__) #define VMOVL_HIGH_U8(x) vmovl_high_u8(x) #define VMOVL_HIGH_S8(x) vmovl_high_s8(x) @@ -3132,8 +3252,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; c < channels; ++c) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } @@ -3163,8 +3283,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; e < elements_per_channel; ++e) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } @@ -3210,8 +3330,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; c < channels; ++c) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } @@ -3238,8 +3358,8 @@ void quantize_tensor_per_channel_impl( out += 8; } for (; e < elements_per_channel; ++e) { - (*out++) = - at::native::quantize_val_arm(scales_data[c], zero_points_data[c], (*in++)); + (*out++) = at::native::quantize_val_arm( + scales_data[c], zero_points_data[c], (*in++)); } } } diff --git a/aten/src/ATen/native/quantized/cpu/onednn_utils.h b/aten/src/ATen/native/quantized/cpu/onednn_utils.h new file mode 100644 index 000000000000..4ee8e8737fb2 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/onednn_utils.h @@ -0,0 +1,151 @@ +#pragma once + +#include +#if AT_MKLDNN_ENABLED() +#include +#include +#include +#include + +struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { + PackedLinearWeightsOnednn( + std::unique_ptr weight, + c10::optional bias, + at::Tensor orig_weight, + c10::optional orig_bias) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)) {} + std::unique_ptr weight_; + c10::optional bias_; + at::Tensor orig_weight_; + c10::optional orig_bias_; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + std::tuple> unpack() override; + + c10::optional bias() override { + return orig_bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + private: + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false); +}; + +template +struct PackedConvWeightsOnednn : public ConvPackedParamsBase { + PackedConvWeightsOnednn( + std::unique_ptr weight, + c10::optional bias, + at::Tensor orig_weight, + c10::optional orig_bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose) + : 
weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose) {} + + std::unique_ptr weight_; + c10::optional bias_; + at::Tensor orig_weight_; + c10::optional orig_bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +}; + +#endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/packed_params.h b/aten/src/ATen/native/quantized/cpu/packed_params.h deleted file mode 100644 index 49bd26de5f55..000000000000 --- a/aten/src/ATen/native/quantized/cpu/packed_params.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include - -struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { - virtual at::Tensor apply( - at::Tensor input, - double output_scale, - int64_t output_zero_point) = 0; - virtual at::Tensor apply_relu( - at::Tensor input, - double output_scale, - int64_t output_zero_point) = 0; - - // out variant of LinearPackedParamsBase::apply - virtual at::Tensor& apply_out( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point, - at::Tensor& output) { - throw std::runtime_error( - "apply_out is not implemented for this packed " - "parameter type"); - return output; - } - - virtual at::Tensor& apply_relu_out( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point, - at::Tensor& output) { - throw std::runtime_error( - "apply_relu_out is not implemented for this packed " - "parameter type"); - return output; - } - - virtual at::Tensor apply_dynamic( - at::Tensor input, - bool reduce_range = false) = 0; - virtual at::Tensor apply_dynamic_relu( - at::Tensor input, - bool reduce_range = false) = 0; - - virtual at::Tensor& apply_dynamic_out( - const at::Tensor& /* input */, - at::Tensor& output, - bool /* reduce_range */) { - throw std::runtime_error( - "apply_dynamic_out is not implemented for this packed " - "parameter type"); - return output; - } - virtual at::Tensor& apply_dynamic_relu_out( - const at::Tensor& /* input */, - at::Tensor& output, - bool /* reduce_range */) { - throw std::runtime_error( - "apply_dynamic_relu_out is not implemented for this packed " - "parameter type"); - 
return output; - } - - virtual std::tuple> unpack() = 0; - - virtual c10::optional bias() = 0; - - virtual void set_bias(c10::optional bias) { - throw std::runtime_error( - "set_bias is not implemented for this packed " - "parameter type"); - } -}; diff --git a/aten/src/ATen/native/quantized/cpu/qadd.cpp b/aten/src/ATen/native/quantized/cpu/qadd.cpp index 6aaffff79a22..cbca3ba58ef7 100644 --- a/aten/src/ATen/native/quantized/cpu/qadd.cpp +++ b/aten/src/ATen/native/quantized/cpu/qadd.cpp @@ -7,10 +7,9 @@ #include #include #include +#include #include -#include - namespace at { namespace native { @@ -217,18 +216,170 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) { return qy; } -#endif +#endif // USE_PYTORCH_QNNPACK + +#ifdef USE_XNNPACK +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_add_nd( + int8_t azp, + float ascale, + int8_t bzp, + float bscale, + int8_t czp, + float cscale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* op) { + return xnn_create_add_nd_qs8( + azp, /* int8_t input1_zero_point */ + ascale, /* float input1_scale */ + bzp, /* int8_t input2_zero_point */ + bscale, /* float input2_scale */ + czp, /* int8_t output_zero_point */ + cscale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* add_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_add_nd( + xnn_operator_t op, + const std::vector& a_shape, + const std::vector& b_shape, + const int8_t* da, + const int8_t* db, + int8_t* dc, + pthreadpool_t pt_pool) { + return xnn_setup_add_nd_qs8( + op, /* xnn_operator_t add_op */ + a_shape.size(), /* size_t num_input1_dims */ + a_shape.data(), /* const size_t* input1_shape */ + b_shape.size(), /* size_t num_input2_dims */ + b_shape.data(), /* const size_t* input2_shape */ + da, /* const int8_t* input1 */ + db, /* const int8_t* input2 */ + dc, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ +} + +template +Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) { + using underlying_t = typename scalar_t::underlying; + const string func_name = "xnnp_add()"; + TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor."); + TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available") + + // using qa memory format for qb to allow xnnpack kernel to flatten all the + // dims + auto qa_mem_format = qa.suggest_memory_format(); + Tensor qa_contig = qa.contiguous(qa_mem_format); + Tensor qb_contig = qb.contiguous(qa_mem_format); + + const auto a_zero_point = qa_contig.q_zero_point(); + const auto b_zero_point = qb_contig.q_zero_point(); + const auto a_scale = qa_contig.q_scale(); + const auto b_scale = qb_contig.q_scale(); + + Tensor qy = at::native::empty_affine_quantized( + at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()), + qa.scalar_type(), + c10::nullopt /* layout */, + kCPU, + c10::nullopt /* pin_memory */, + scale, + zero_point, + qa_mem_format); + + if (qa_contig.size(0) == 0) { + return qy; + } + + xnn_operator_t xnnp_op = nullptr; + xnnpack_operator xnnp_add_operator; + + auto output_max = std::numeric_limits::max(); + auto output_min = std::numeric_limits::min(); + if (ReLUFused) { + /* + * FIXME: use acticationLimits() + * With , MSVC runs into "error C3862: indetifier activationLimits not found". 
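The fused-ReLU clamp set up just below relies on the fact that a real value of 0 maps to `zero_point` in the output's quantized domain, so fusing ReLU into the XNNPACK add only requires raising output_min to the range-clamped output zero point. A small sketch of that derivation for the qs8 case (not part of the patch):

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <utility>

    // Returns {output_min, output_max} for an int8 add with ReLU fused.
    inline std::pair<int8_t, int8_t> relu_fused_limits_qs8(int64_t zero_point) {
      constexpr int64_t qmin = std::numeric_limits<int8_t>::min();  // -128
      constexpr int64_t qmax = std::numeric_limits<int8_t>::max();  //  127
      const int64_t lo = std::min(qmax, std::max(qmin, zero_point));
      return {static_cast<int8_t>(lo), static_cast<int8_t>(qmax)};
    }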
+ */ + constexpr int64_t qmin = std::numeric_limits::min(); + constexpr int64_t qmax = std::numeric_limits::max(); + int64_t qvalue = static_cast(zero_point); + qvalue = std::max(qvalue, qmin); + output_min = static_cast(std::min(qvalue, qmax)); + } + + // Create an operator + auto status = xnnp_create_add_nd( + a_zero_point, + a_scale, + b_zero_point, + b_scale, + static_cast(zero_point), + static_cast(scale), + output_min, + output_max, + 0, + &xnnp_op); + xnnp_add_operator = xnnpack_operator(xnnp_op); + TORCH_CHECK( + status == xnn_status_success, + func_name, ": xnn create operator failed(", status,")!"); + + const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig); + const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig); + + // Setup the operator + status = xnnp_setup_add_nd( + xnnp_add_operator.get(), + qa_shape, + qb_shape, + reinterpret_cast(qa_contig.data_ptr()), + reinterpret_cast(qb_contig.data_ptr()), + reinterpret_cast(qy.data_ptr()), + caffe2::pthreadpool_()); + TORCH_CHECK( + status == xnn_status_success, + func_name, ": xnn setup operator failed(", status,")!"); + + // Run the operator + status = xnn_run_operator( + xnnp_add_operator.get(), /* xnn_operator_t op */ + caffe2::pthreadpool_()); /* pthreadpool_t threadpool */ + TORCH_CHECK( + status == xnn_status_success, + func_name, ": xnn run operator failed(", status,")"); + return qy; +} +#endif // USE_XNNPACK template Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) { check_inputs(qa, qb); + + if (at::globalContext().qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + qa.scalar_type() == qb.scalar_type(), + "Both inputs to qadd must have same type"); + +#ifdef USE_XNNPACK + if (qa.scalar_type() == kQInt8) { + return xnnp_add(qa, qb, scale, zero_point); + } +#endif // USE_XNNPACK + #ifdef USE_PYTORCH_QNNPACK - if (at::globalContext().qEngine() == at::QEngine::QNNPACK && - qa.sizes() == qb.sizes() && /* qnnpack does not support boradcasting */ - qa.scalar_type() == kQUInt8 && qb.scalar_type() == kQUInt8) { + if(qa.sizes() == qb.sizes() && /* qnnpack does not support boradcasting */ + qa.scalar_type() == kQUInt8) { return qnnpack_add(qa, qb, scale, zero_point); + } +#endif // USE_PYTORCH_QNNPACK } -#endif auto qc = at::_empty_affine_quantized( qa.sizes(), at::device(kCPU) diff --git a/aten/src/ATen/native/quantized/cpu/qconcat.cpp b/aten/src/ATen/native/quantized/cpu/qconcat.cpp index 8e09e32c4203..4322b3558f5c 100644 --- a/aten/src/ATen/native/quantized/cpu/qconcat.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconcat.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index c32daf362516..aa77489f7419 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -5,9 +5,12 @@ #include #include #include -#include +#include #include #include +#include +#include +#include #include #include #include @@ -160,7 +163,7 @@ std::array MakeInputShape( int64_t W); template <> -std::array MakeInputShape(int64_t _, int64_t H, int64_t W) { +std::array MakeInputShape(int64_t /*D*/, int64_t H, int64_t W) { return {H, W}; } template <> @@ -442,6 +445,21 @@ at::Tensor PackedConvWeight::apply_impl( padding(), output_padding(), dilation()); + + // if use direct convolution implementation, compute the col_offsets + // of the weight matrix at model initialization stage. 
+ // We need to know the shape of output matrix + // to compute col_offsets for direct convolution. + // Hence it cannot be called from inside weight packing function + // like other quantized conv implementation + if (pack_w->getPackedWForDirectconv().get() && + pack_w->getPackedWForDirectconv().get()->is_first_call()) { + pack_w->getPackedWForDirectconv().get()->col_offsets_with_zero_pt_s8acc32_DirectConvT( + conv_p, + w_zp.data(), + col_offsets, + M); + } } else { output_shape = MakeConvOutputShape(N, M, conv_p.OUT_DIM); } @@ -573,22 +591,262 @@ template at::Tensor PackedConvWeight<3>::apply_impl( #ifdef USE_PYTORCH_QNNPACK +#ifdef USE_XNNPACK template -at::Tensor PackedConvWeightsQnnp::apply( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) { - return apply_impl(input, output_scale, output_zero_point); -} +template +at::Tensor PackedConvWeightsQnnp::apply_impl_xnnp( + const at::Tensor& act, double output_scale, int64_t output_zero_point) { + using underlying_t = typename scalar_t::underlying; -template -at::Tensor PackedConvWeightsQnnp::apply_relu( - const at::Tensor& input, - double output_scale, - int64_t output_zero_point) { - return apply_impl(input, output_scale, output_zero_point); + std::lock_guard lock(qnnp_mutex_); + + const std::string func_name = transpose() + ? "quantized::conv_transpose (xnnpack)" + : "quantized::conv (xnnpack)"; + TORCH_CHECK( + kSpatialDim == 2, + func_name, ": xnnpack does not currently support 3d convolution."); + + /* + * NB: + * [de]conv_prepack prepares weights (values, scale, and zero_points) ahead of + * time during prepack() call assuming the activation will be uint8_t. But it + * may not always be the case. A solution may involve making prepack routine + * aware of the input qdtype. But currently all the pieces are not ready to + * pass that model level info to the prepack function. So, for now, here in + * this function we have to massage weights if we learn the input qdtype is + * not uint8_t. This involves copying and converting uint8_t to int8_t + * whenever necessary. To add to that, since XNNPACK, as of writing this, + * doesn't support per_channel weights for quint8_t, we add following assert + * makes sure we don't run into that case. Also take shortcuts when processing + * weights, which means we have to revisit and fix some weight massging logic + * when we enable the missing feature in XNNPACK. + * + * Table below summarizes how the weights are handled, + * + * .-------------------------------------------------------------------------. + * | input_qdtype | uint8_t | int8_t | + * | per_channel | yes | no | yes | no | + * |-------------------------------------------------------------------------| + * | zero_points | at::zeros()* | orig_zp + 128 | at:zeros()** | orig_zp | + * | scale | dtype = float, no changes needed | + * | values | always processed before passing to XNNPACK | + * .-------------------------------------------------------------------------. + * + * Notes: * - zero_points for uint8_t + per_channel: no support in xnnpack, need + * to fix when support is added. ** - zero_points for int8_t: symmetric + * quantization means XNNPACK will ignore kernel zero point(s). + */ + + if ((std::is_same::value )) { + TORCH_CHECK(!per_channel(), + func_name, ": xnnpack does not currently have per_channel support with activation dtype of c10::quint8." 
+ ); + } + + // More checks + ConvDimChecks( + act.ndimension(), + stride().size(), + padding().size(), + output_padding().size(), + dilation().size(), + func_name, + transpose()); + + const int64_t N = act.size(0); + const int64_t H = act.size(2); + const int64_t W = act.size(3); + const int64_t D = 1; + const int64_t M = bias.size(0); + + const auto act_nhwc = act.contiguous(c10::MemoryFormat::ChannelsLast); + const auto act_input_scale = act_nhwc.q_scale(); + + auto status = xnn_status_invalid_state; + + // Create an operator iff necessary + if (!xnnp_convolution_op || + (!input_scale.has_value() || input_scale.value() != act_input_scale)) { + xnn_operator_t xnnp_op = nullptr; + + // Update the input scale so we may cache the op + input_scale = act_input_scale; + + // create an empty tensor for packing the weights + const at::Tensor weight_contig = + orig_weight.contiguous(c10::MemoryFormat::ChannelsLast); + const float* w_scales_data = w_scales.data_ptr(); + underlying_t w_zp = 0; + at::Tensor weight_tensor; + + if (!per_channel()) { + w_zp = static_cast( + weight_contig.q_zero_point() + + (std::is_same::value ? 128 : 0)); + + weight_tensor = at::native::empty_affine_quantized( + weight_contig.sizes(), + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + w_scales_data[0], + w_zp, + c10::MemoryFormat::ChannelsLast); + } else { /* per_channel */ + weight_tensor = at::native::empty_per_channel_affine_quantized( + weight_contig.sizes(), + w_scales, + at::zeros(w_scales.sizes(), at::kInt), /* see comment above about w_zp */ + weight_contig.q_per_channel_axis(), + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + c10::MemoryFormat::ChannelsLast); + } + + // copy from the original weight and take care of dtype change if necessary + at::native::xnnp_utils::q8_copy_int8_weight_and_add_offset( + weight_contig, weight_tensor); + const at::Tensor xnnp_weight = + at::native::xnnp_utils::convert_conv_weights_to_channel_last_tensor< + kSpatialDim>(weight_tensor, groups(), transpose()); + + auto output_min = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).first + : std::numeric_limits::min(); + auto output_max = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).second + : std::numeric_limits::max(); + + + // Original bias was float, so we requantize it here. + at::Tensor qbias; + if (per_channel()) { + auto bias_quant_scales = + weight_contig.q_per_channel_scales() * act_input_scale; + auto bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); + qbias = at::native::quantize_per_channel( + bias, bias_quant_scales, bias_zp, 0, c10::kQInt32); + } else { + qbias = at::native::quantize_per_tensor( + bias, weight_contig.q_scale() * act_input_scale, 0, c10::kQInt32); + } + + status = at::native::xnnp_utils::xnnp_create_convolution2d_nhwc( + padding()[0], + padding()[1], + padding()[0], + padding()[1], + kernel_[0], + kernel_[1], + stride()[0], + stride()[1], + dilation()[0], + dilation()[1], + groups(), + !transpose() ? orig_weight.size(1) : orig_weight.size(0) / groups(), + !transpose() ? orig_weight.size(0) / groups() : orig_weight.size(1), + !transpose() ? orig_weight.size(1) * groups() : orig_weight.size(0), + !transpose() ? 
orig_weight.size(0) : orig_weight.size(1) * groups(), + act_nhwc.q_zero_point(), + act_input_scale, + w_zp, /* will be ignored for Q[SC]8, see comment + above about w_zp*/ + w_scales_data, + reinterpret_cast( + xnnp_weight.template data_ptr()), + reinterpret_cast(qbias.template data_ptr()), + output_zero_point, + output_scale, + output_min, + output_max, + 0, + &xnnp_op, + per_channel(), + transpose()); + + xnnp_convolution_op = xnnpack_operator(xnnp_op); + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn create operator failed(", + status, + ")"); + } + + at::SmallVector output_shape; + const auto input_shape = MakeInputShape(D, H, W); + if (transpose()) { + output_shape = MakeDeConvOutputShape( + N, M, {H, W}, kernel_, stride(), padding(), output_padding(), dilation()); + } else { + output_shape = MakeConvOutputShape( + N, M, input_shape, kernel_, stride(), padding(), dilation()); + } + + if (act_nhwc.numel() > 0) { + TORCH_CHECK( + std::all_of( + output_shape.begin(), + output_shape.end(), + [](int64_t i) { return i > 0; }), + func_name, ": ", kSpatialDim, "d (xnnpack): each dimension of output tensor should be greater than 0.") + } + + // Allocate output Tensor and a buffer for XNNPACK to use + at::Tensor output = at::native::empty_affine_quantized( + output_shape, + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + output_scale, + output_zero_point, + c10::MemoryFormat::ChannelsLast); + + // Setup the operator + status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc( + xnnp_convolution_op.get(), + N, + H, + W, + reinterpret_cast(act_nhwc.template data_ptr()), + reinterpret_cast(output.template data_ptr()), + caffe2::pthreadpool_(), + per_channel(), + transpose(), + output_padding()[0], + output_padding()[1]); + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn setup operator failed(", + status, + ")"); + + // Run the operator + status = xnn_run_operator( + xnnp_convolution_op.get(), /* xnn_operator_t op */ + caffe2::pthreadpool_()); /* pthreadpool_t threadpool */ + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn run operator failed(", + status, + ")"); + + return output; } +#endif // USE_XNNPACK + template template at::Tensor PackedConvWeightsQnnp::apply_impl( @@ -607,7 +865,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( func_name, "(qnnpack): Expected activation data type ", toString(c10::kQUInt8), - "but got ", + " but got ", toString(act.scalar_type())); ConvDimChecks( act.ndimension(), stride().size(), padding().size(), @@ -805,6 +1063,61 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( return output; } +#ifdef USE_XNNPACK +bool can_use_xnnp( + c10::ScalarType dtype, + int kSpatialDim, + bool per_channel, + bool transpose) { + if (!at::native::xnnpack::available()) { + return false; + } + bool supported_dtypes = dtype == c10::kQInt8; + bool invalid_config = + (kSpatialDim != 2 /* No support for 3d convolution */ + || (dtype == c10::kQInt8 && transpose && + per_channel)); /* int8_t deconv does not support per-channel */ + if (supported_dtypes && invalid_config) { + /* don't want this to fall through to QNNPACK */ + const std::string func_name = + transpose ? 
"quantized::conv_transpose" : "quantized::conv"; + TORCH_CHECK( + false, + func_name, + " (xnnpack): Unsupported conv config for dtype KQInt8"); + } + return supported_dtypes && !invalid_config; +} +#endif // USE_XNNPACK + +template +at::Tensor PackedConvWeightsQnnp::apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), kSpatialDim, per_channel(), transpose())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK + return apply_impl(input, output_scale, output_zero_point); +} + +template +at::Tensor PackedConvWeightsQnnp::apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), kSpatialDim, per_channel(), transpose())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK + return apply_impl(input, output_scale, output_zero_point); +} + template at::Tensor PackedConvWeightsQnnp<2>::apply( const at::Tensor& act, double output_scale, @@ -837,6 +1150,177 @@ template at::Tensor PackedConvWeightsQnnp<3>::apply_impl( #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() +template +at::Tensor PackedConvWeightsOnednn::apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +template +at::Tensor PackedConvWeightsOnednn::apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +template +template +at::Tensor PackedConvWeightsOnednn::apply_impl( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point) { + std::string func_name = "quantized::conv"; + if (transpose()) { + func_name += "_transpose"; + } + func_name += std::to_string(kSpatialDim) + "d"; + if (kReluFused) { + func_name += "_relu"; + } + ConvDimChecks( + act.ndimension(), stride().size(), padding().size(), + output_padding().size(), dilation().size(), func_name, transpose()); + TORCH_CHECK(act.scalar_type() == c10::ScalarType::QUInt8, + func_name, " (ONEDNN): data type of input should be QUint8."); + + // src + auto act_contig = act.contiguous(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d); + auto src_dims = act_contig.sizes().vec(); + auto src_data_type = dnnl::memory::data_type::u8; + auto src_desc = ideep::tensor::desc(src_dims, src_data_type, + kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc); + ideep::tensor src; + src.init(src_desc, act_contig.data_ptr()); + // weights & bias + ideep::tensor& weights = *(weight_.get()); + bool with_bias = bias_.has_value(); + const auto& kernel_size = weights.get_dims(); + // dst + const std::vector& input_size = src.get_dims(); + std::vector output_sizes; + if (transpose()) { + // Prepacked weight format: [o, i, ...] + const int N = act.size(0); // batch size + const int C = act.size(1); // input channels + const int M = weights.get_dim(0); // output channels + const int D = kSpatialDim == 2 ? 
1 : act.size(2); // input depth + const int H = act.size(kSpatialDim); // input height + const int W = act.size(kSpatialDim + 1); // input width + const int KH = weights.get_dim(kSpatialDim); // kernel height + const int KW = weights.get_dim(kSpatialDim + 1); // kernel width + const int KD = kSpatialDim == 2 ? 1 : weights.get_dim(2); // kernel depth + TORCH_CHECK(C == groups() * weights.get_dim(1), // weight: [o, i, ...] + func_name, " (ONEDNN): input channel number should be ", + groups() * weights.get_dim(1), ", but got ", C); + auto output_shape = MakeDeConvOutputShape( + N, + M, + kSpatialDim == 2 ? std::vector{H, W} : std::vector{D, H, W}, + kSpatialDim == 2 ? std::vector{KH, KW} : std::vector{KD, KH, KW}, + stride(), + padding(), + output_padding(), + dilation()); + output_sizes = c10::IntArrayRef(output_shape).vec(); + } else { + output_sizes = at::native::conv_output_size(input_size, kernel_size, padding().vec(), stride().vec(), dilation().vec()); + } + ideep::dims dst_dims = ideep::dims({output_sizes.cbegin(), output_sizes.cend()}); + at::Tensor output = at::_empty_affine_quantized( + dst_dims, + device(c10::kCPU) + .dtype(c10::kQUInt8) + .memory_format(kSpatialDim == 2 ? + c10::MemoryFormat::ChannelsLast : + c10::MemoryFormat::ChannelsLast3d), + output_scale, + output_zero_point, + c10::nullopt); + if (output.numel() == 0) { + return output; + } + ideep::tensor dst({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}}, + output.data_ptr()); + // Parameters + const ideep::dims& strides = stride().vec(); + const ideep::dims& dilates = dilation().vec(); + const ideep::dims& padding_l = padding().vec(); + const ideep::dims& padding_r = padding().vec(); + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/act.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal + const ideep::scale_t& weights_scales = weights.get_scale(); + const ideep::scale_t& dst_scales = ideep::scale_t(weights_scales.size(), 1.0/output_scale); // Scales of ONEDNN and PyTorch are reciprocal + const ideep::zero_point_t src_zero_points = ideep::zero_point_t(1, act.q_zero_point()); + const ideep::zero_point_t dst_zero_points = ideep::zero_point_t(1, output_zero_point); + ideep::attr_t op_attr = kReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t(); + op_attr.set_zero_points(DNNL_ARG_SRC, ideep::utils::tensor_zp_mask(1), {DNNL_RUNTIME_S32_VAL}); // runtime src zero point + if (with_bias) { + // Bias might be modified outside (e.g. by quantization bias correction). + // If so, update the prepacked bias as well. 
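
The ONEDNN path above sizes the destination via at::native::conv_output_size for regular convolution and MakeDeConvOutputShape for transposed convolution; both are expected to follow the standard PyTorch size formulas. A per-dimension sketch under that assumption (hypothetical helper names, not the ATen internals):

    #include <cstdint>

    // Standard conv output size per spatial dimension (floor division).
    int64_t conv_out_dim(int64_t in, int64_t kernel, int64_t stride,
                         int64_t padding, int64_t dilation) {
      return (in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1;
    }

    // Standard transposed-conv (deconv) output size per spatial dimension.
    int64_t deconv_out_dim(int64_t in, int64_t kernel, int64_t stride,
                           int64_t padding, int64_t output_padding,
                           int64_t dilation) {
      return (in - 1) * stride - 2 * padding + dilation * (kernel - 1) +
             output_padding + 1;
    }
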
+ if (bias_.value().get_data_handle() != orig_bias_.value().data_ptr()) { + bias_.value().init(bias_.value().get_desc(), orig_bias_.value().data_ptr()); + } + const auto& b = bias_.value(); + if (transpose()) { + ideep::convolution_transpose_forward::compute_v2( + src, weights, b, dst_dims, dst, + strides, padding_l, padding_r, dilates, + groups(), src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::deconvolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } else { + ideep::convolution_forward::compute_v2( + src, weights, b, dst_dims, dst, + strides, dilates, padding_l, padding_r, groups(), + src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } + } else { + if (transpose()) { + ideep::convolution_transpose_forward::compute_v2( + src, weights, dst_dims, dst, + strides, padding_l, padding_r, dilates, + groups(), src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::deconvolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } else { + ideep::convolution_forward::compute_v2( + src, weights, dst_dims, dst, + strides, dilates, padding_l, padding_r, groups(), + src_scales, weights_scales, dst_scales, src_zero_points, dst_zero_points, + op_attr, dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, + ideep::u8s8, ideep::engine::cpu_engine()); + } + } + return output; +} + +template at::Tensor PackedConvWeightsOnednn<2>::apply( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightsOnednn<2>::apply_relu( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightsOnednn<3>::apply( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightsOnednn<3>::apply_relu( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { @@ -914,10 +1398,10 @@ class QConvInt8ForBC final { static Tensor run( Tensor act, const c10::intrusive_ptr>& packed_weight, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, + torch::List /*stride*/, + torch::List /*padding*/, + torch::List /*dilation*/, + int64_t /*groups*/, double output_scale, int64_t output_zero_point) { if (kReluFused) { diff --git a/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp index ec95748cd42b..2f3a6ed8f3cd 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp @@ -5,9 +5,10 @@ #include #include #include -#include +#include #include #include +#include #include #include #include @@ -118,6 +119,57 @@ template at::Tensor PackedConvWeightsQnnp<3>::apply_dynamic( #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + +template +at::Tensor PackedConvWeightsOnednn::apply_dynamic( + const at::Tensor& input, + bool reduce_range) { + + // Find min/max of input + float x_max = 0, x_min = 0; + if (input.numel() > 0) { + x_min = input.min().item(); + x_max = input.max().item(); + } + + // Input tensor is quantized as 8-bit unsigned values + static constexpr int 
precision = 8; + static constexpr bool is_signed = false; + + // Calculate scale and zero point for quantization of input tensor + auto q_params = quant_utils::ChooseQuantizationParams( + /*min=*/x_min, + /*max=*/x_max, + /*qmin=*/is_signed ? -(1 << (precision - 1)) : 0, + /*qmax=*/ + is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1, + /*preserve_sparsity=*/false, + /*force_scale_power_of_two=*/false, + /*reduce_range=*/reduce_range); + + // Quantize input + at::Tensor q_input = at::quantize_per_tensor( + input, q_params.scale, q_params.zero_point, c10::kQUInt8); + + at::Tensor out = + apply_impl(q_input, q_params.scale, q_params.zero_point); + + // TODO: Modify ideep to allow fp32 input & output + // to avoid explicit `quantize - dequantize` + return at::dequantize(out); +} + +template at::Tensor PackedConvWeightsOnednn<2>::apply_dynamic( + const at::Tensor& input, + bool reduce_range); + +template at::Tensor PackedConvWeightsOnednn<3>::apply_dynamic( + const at::Tensor& input, + bool reduce_range); + +#endif // AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 3cb5d9ef1a18..85edffef25b9 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -2,10 +2,11 @@ #include #include -#include +#include #include #include #include +#include #include #include #include @@ -314,6 +315,165 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< bool transpose); #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() +template +c10::intrusive_ptr> PackedConvWeightsOnednn< + kSpatialDim>:: + prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + TORCH_CHECK( + weight.ndimension() == kSpatialDim + 2, + "Weights are expected to have ", kSpatialDim + 2, " dimensions"); + TORCH_CHECK( + stride.size() == kSpatialDim, + "stride should contain ", kSpatialDim, " elements for ", + kSpatialDim, "D convolution."); + TORCH_CHECK( + padding.size() == kSpatialDim, + "Specify front/top/left padding only. " + "end/bottom/right padding assumed to be equal to front/top/left"); + TORCH_CHECK( + !transpose || output_padding.size() == kSpatialDim, + "quantized::conv_prepack: Specify top/left output padding " + "only. bottom/right padding assumed to be equal to top/left"); + TORCH_CHECK( + dilation.size() == kSpatialDim, + "dilation should contain ", kSpatialDim, " elements for ", + kSpatialDim, "D convolution."); + TORCH_CHECK( + !transpose || std::all_of(output_padding.begin(), output_padding.end(), [](int i) { return i==0; }), + "quantized::conv_prepack: ONEDNN only supports zero output_padding."); + + // Weight + // Format: [OC IC//group KH KW] for conv; [IC OC//group KH KW] for deconv + auto dims = weight.sizes().vec(); + auto strides = stride.vec(); + auto padding_l = padding.vec(); + auto padding_r = padding.vec(); + auto dilates = dilation.vec(); + auto op_attr = ideep::attr_t(); + std::vector wgt_zero_points; + ideep::scale_t wgt_scales; + const int output_channels = transpose ? 
weight.size(1) * groups + : weight.size(0); + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + TORCH_CHECK( + weight.q_zero_point()==0, + "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0."); + wgt_zero_points = std::vector(1, weight.q_zero_point()); + wgt_scales = ideep::scale_t(1, 1.0/weight.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal + } else if (qtype == c10::kPerChannelAffine) { + TORCH_CHECK( + !transpose, + "Per Channel Quantization is currently disabled for transposed conv"); + wgt_zero_points.resize(output_channels); + wgt_scales.resize(output_channels); + for (int i = 0; i < output_channels; ++i) { + wgt_zero_points[i] = weight.q_per_channel_zero_points()[i].item(); + TORCH_CHECK( + wgt_zero_points[i]==0, + "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0."); + wgt_scales[i] = 1.0f / weight.q_per_channel_scales()[i].item(); // Scales of ONEDNN and PyTorch are reciprocal + } + } else { + TORCH_CHECK(false, "Unsupported qscheme: ", toString(qtype)); + } + + // Set runtime src zero point + auto src_zero_point = {DNNL_RUNTIME_S32_VAL}; + op_attr.set_zero_points(DNNL_ARG_SRC, + ideep::utils::tensor_zp_mask(src_zero_point.size()), + src_zero_point); + at::Tensor weight_copy; + ideep::tensor::desc w_desc; + ideep::dims dims_iohw, dims_giohw; + ideep::tag w_tag = ideep::tag::any; + const bool with_groups = groups > 1; + if (transpose) { + w_desc = ideep::convolution_transpose_forward::expected_weights_desc( + dims, dnnl::memory::data_type::s8, + strides, padding_l, padding_r, dilates, groups, + dnnl::algorithm::deconvolution_direct, dnnl::prop_kind::forward_inference, + ideep::dims(), op_attr); + // convolution_transpose_forward::expected_weights_desc() gives format [i, o, ...], + // but ONEDNN requires [o, i, ...] for computation + dims_iohw = w_desc.get_dims(); + dims_giohw = with_groups ? ideep::utils::group_dims(dims_iohw, groups) : dims_iohw; + std::vector perms(dims_giohw.size(), 0); // for permutation of weight + std::iota(perms.begin(), perms.end(), 0); + w_desc = w_desc.transpose(with_groups, with_groups + 1); + std::swap(perms[with_groups], perms[with_groups + 1]); + weight_copy = weight.reshape(dims_giohw).permute(c10::IntArrayRef(perms)).clone(); + } else { + w_desc = ideep::convolution_forward::expected_weights_desc( + dims, dnnl::memory::data_type::s8, + strides, padding_l, padding_r, dilates, groups, + dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, + dnnl::memory::data_type::u8, ideep::dims(), op_attr); + weight_copy = weight.clone(); + } + if (with_groups) { + w_tag = kSpatialDim == 2 ? ideep::tag::goihw : ideep::tag::goidhw; + } else { + w_tag = kSpatialDim == 2 ? ideep::tag::oihw : ideep::tag::oidhw; + } + ideep::dims w_dims = with_groups ? ideep::utils::group_dims(w_desc.get_dims(), groups) + : w_desc.get_dims(); + ideep::tensor wgt = ideep::tensor( + ideep::tensor::desc({w_dims, dnnl::memory::data_type::s8, w_tag}, groups), + weight_copy.data_ptr()); + wgt.set_scale(wgt_scales); // Scales are needed for feed_from(). 
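
As the prepack code above enforces, ONEDNN consumes symmetric int8 weights (every zero point must be 0) and stores scales as the reciprocal of PyTorch's, since ONEDNN multiplies by its scale where PyTorch divides. A small sketch of that per-channel conversion (hypothetical helper, outside the ideep API):

    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // PyTorch: real = (q - zero_point) * scale; ONEDNN expects 1/scale and zero_point == 0.
    std::vector<float> to_onednn_scales(const std::vector<double>& torch_scales,
                                        const std::vector<int64_t>& torch_zero_points) {
      std::vector<float> onednn_scales(torch_scales.size());
      for (std::size_t i = 0; i < torch_scales.size(); ++i) {
        if (torch_zero_points[i] != 0) {
          throw std::runtime_error("ONEDNN supports only symmetric weight quantization");
        }
        onednn_scales[i] = 1.0f / static_cast<float>(torch_scales[i]);
      }
      return onednn_scales;
    }
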
+ ideep::tensor exp_wgt; + exp_wgt.init(w_desc); + exp_wgt.set_scale(wgt_scales); // Also for feed_from() + exp_wgt.feed_from(wgt, transpose); // expect wgt to be in [OC IC KH KW] format + ideep::tensor * packed_weight_p = new ideep::tensor(exp_wgt); + packed_weight_p->set_scale(wgt_scales); + packed_weight_p->set_zero_point(wgt_zero_points); + std::unique_ptr weight_ptr(packed_weight_p); + // Bias + c10::optional onednn_bias{c10::nullopt}; + if (bias.has_value()) { + at::Tensor bias_vec = bias.value(); + TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); + TORCH_CHECK( + bias_vec.size(0) == output_channels, + "bias should have K elements: " + std::to_string(output_channels)); + auto bias_desc = ideep::tensor::desc(bias.value().sizes().vec(), dnnl::memory::data_type::f32); + ideep::tensor packed_bias; + packed_bias.init(bias_desc, bias.value().data_ptr()); + onednn_bias = c10::optional(packed_bias); + } + auto ret_ptr = c10::make_intrusive>( + PackedConvWeightsOnednn{ + std::move(weight_ptr), + onednn_bias, + weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + }); + return ret_ptr; +} + +template struct PackedConvWeightsOnednn<2>; +template struct PackedConvWeightsOnednn<3>; +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { @@ -377,6 +537,14 @@ class QConvPackWeightInt8 final { } #endif +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +#endif + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv2d_prepack ", @@ -438,8 +606,6 @@ class QConv1dPackWeightInt8 final { } #endif - - #ifdef USE_PYTORCH_QNNPACK if (ctx.qEngine() == at::QEngine::QNNPACK) { return PackedConvWeightsQnnp<2>::prepack( @@ -447,6 +613,15 @@ class QConv1dPackWeightInt8 final { transpose); } #endif + +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn<2>::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +#endif + TORCH_CHECK( false, "Didn't find engine for operation quantized::conv1d_prepack ", diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp deleted file mode 100644 index e4855062e360..000000000000 --- a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp +++ /dev/null @@ -1,312 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include -#include - -#ifdef USE_FBGEMM -template -std::tuple> PackedConvWeight< - kSpatialDim>::unpack() { - auto* packed_weights_p = w.get(); - // output channels - const int output_channels = packed_weights_p->outputChannels(); - const int input_channels = packed_weights_p->inputChannels(); - const int groups = packed_weights_p->groups(); - - const int kernel_d = kSpatialDim == 2 ? 1 : kernel[0]; - // R (kernel height) - const int kernel_h = kernel[kSpatialDim - 2]; - // S (kernel width) - const int kernel_w = kernel[kSpatialDim - 1]; - - const int C_per_G = input_channels / groups; - - // Tensor for unpacked weights - // Unpacked format would be physical KRS(C/G) but logical KCRS (channels - // first) because that's how - // ChannelsLast3d is not available now.FBGEMM stores the weights - // TODO: Unify 2d and 3d when ChannelsLast3d is ready. 
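
The conv and linear prepack entry points above now dispatch on the active quantized engine, so the ONEDNN branch is reachable only when the build has MKLDNN enabled and that engine is selected at runtime. A minimal usage sketch; at::globalContext() and qEngine() appear in this file, while the setter below is assumed to be the matching Context API (on the Python side this corresponds to torch.backends.quantized.engine):

    #include <ATen/Context.h>

    // Select the ONEDNN quantized engine before packing weights so the
    // PackedConvWeightsOnednn / PackedLinearWeightsOnednn branches are taken.
    // Only meaningful when PyTorch was built with AT_MKLDNN_ENABLED().
    void use_onednn_qengine() {
      at::globalContext().setQEngine(at::QEngine::ONEDNN);
    }
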
- at::Tensor unpacked_weights; - if (q_scheme == c10::kPerTensorAffine) { - unpacked_weights = kSpatialDim == 2 - ? at::_empty_affine_quantized( - {output_channels, C_per_G, kernel_h, kernel_w}, - device(c10::kCPU) - .dtype(c10::kQInt8) - .memory_format(c10::MemoryFormat::ChannelsLast), - w_scale[0], - w_zp[0], - c10::nullopt) - : at::native::fbgemm_utils:: - MakeEmptyAffineQuantizedChannelsLast3dTensor( - output_channels, - C_per_G, - kernel_d, - kernel_h, - kernel_w, - device(c10::kCPU).dtype(c10::kQInt8), - w_scale[0], - w_zp[0]); - } else if (q_scheme == c10::kPerChannelAffine) { - TORCH_CHECK( - !transpose(), - "Per Channel Quantization is currently disabled for transposed conv"); - auto scales = at::from_blob( - w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); - auto zero_points = at::from_blob( - w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); - unpacked_weights = kSpatialDim == 2 - ? at::_empty_per_channel_affine_quantized( - {output_channels, C_per_G, kernel_h, kernel_w}, - scales.toType(c10::kDouble), - zero_points.toType(c10::kLong), - 0, /* The output channel axis is 0 */ - device(c10::kCPU).dtype(c10::kQInt8), - c10::MemoryFormat::ChannelsLast) - : at::native::fbgemm_utils:: - MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( - output_channels, - C_per_G, - kernel_d, - kernel_h, - kernel_w, - device(c10::kCPU).dtype(c10::kQInt8), - scales.toType(c10::kDouble), - zero_points.toType(c10::kLong)); - } else { - TORCH_CHECK(false, "Unsupported qscheme: ", toString(q_scheme)); - } - int8_t* unpacked_weights_p = - reinterpret_cast(unpacked_weights.data_ptr()); - packed_weights_p->unpack(unpacked_weights_p); - if(transpose()){ - unpacked_weights = - at::native::fbgemm_utils::TransposeConvTensorUnpackConversion< - kSpatialDim>(unpacked_weights, groups); - } - return std::tuple>( - unpacked_weights, bias); -} - -template std::tuple> PackedConvWeight< - 2>::unpack(); -template std::tuple> PackedConvWeight< - 3>::unpack(); -#endif // USE_FBGEMM - -#ifdef USE_PYTORCH_QNNPACK -template -std::tuple> PackedConvWeightsQnnp< - kSpatialDim>::unpack() { - TORCH_CHECK( - kSpatialDim == 2, - "QNNPACK only supports conv2d_unpack right " - "now."); - TORCH_CHECK( - orig_weight.defined(), - "Cannot unpack weights. " - "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); - return std::tuple>(orig_weight, bias); -} - -template std::tuple> PackedConvWeightsQnnp< - 2>::unpack(); -template std::tuple> PackedConvWeightsQnnp< - 3>::unpack(); -#endif // USE_PYTORCH_QNNPACK - -namespace at { -namespace native { -namespace { - -/* - * QConvPackWeightInt8 expects its input tensor to be in shape - * [output_channels, kernel_height, kernel_width, input_channels/Groups] - * Therefore, the unpacking of packed weight tensor using QConvUnpackWeightsInt8 - * results in a tensor of the same shape. 
- */ - -template -class QConvUnpackWeightsInt8 final { - public: - static std::tuple> run( - const c10::intrusive_ptr>& packed_weight) { - auto& ctx = at::globalContext(); - -#ifdef USE_FBGEMM - if (ctx.qEngine() == at::QEngine::FBGEMM) { - return packed_weight->unpack(); - } -#endif - -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - TORCH_CHECK( - kSpatialDim == 2, - "quantized::conv2d_unpack (qnnpack): QNNPACK only supports Conv2d " - "now."); - return packed_weight->unpack(); - } -#endif - - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::conv2d_unpack ", - toString(ctx.qEngine())); - } -}; - -class QConv1dUnpackWeightsInt8 final { - public: - static std::tuple> run( - const c10::intrusive_ptr>& packed_weight) { - auto& ctx = at::globalContext(); - at::Tensor weight; - c10::optional bias; -#ifdef USE_FBGEMM - if (ctx.qEngine() == at::QEngine::FBGEMM) { - std::tie(weight, bias) = packed_weight->unpack(); - weight = weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(weight, bias); - } -#endif - -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - std::tie(weight, bias) = packed_weight->unpack(); - at::Tensor new_weight = weight.clone(); - new_weight = new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); - return std::tuple>(new_weight, bias); - } -#endif - - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::conv1d_unpack ", - toString(ctx.qEngine())); - } -}; - -template -class QConvStride final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->stride(); - } -}; - -template -class QConvPadding final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->padding(); - } -}; - -template -class QConvOutputPadding final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->output_padding(); - } -}; - -template -class QConvDilation final { - public: - static torch::List run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->dilation(); - } -}; - -template -class QConvGroups final { - public: - static int64_t run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->groups(); - } -}; - -template -class QConvTranspose final { - public: - static int64_t run( - const c10::intrusive_ptr>& packed_weight) { - return packed_weight->transpose(); - } -}; - -IValue -unpack_quantized_prepacked_sizes_conv2d(const IValue& ivalue) { - auto params = ivalue.toCustomClass>(); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); - c10::optional bias_sizes = c10::nullopt; - if (bias && bias->defined()) { - bias_sizes = bias->sizes(); - } - return IValue(std::make_tuple( - weight.sizes(), - bias_sizes, - params->stride(), - params->padding(), - params->dilation(), - params->groups())); -} - -TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { - // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. 
- m.impl(TORCH_SELECTIVE_NAME("quantized::conv_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - // We use conv2d_unpack to be consistent with conv3d_unpack - m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack_sizes"), TORCH_FN(unpack_quantized_prepacked_sizes_conv2d)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); - - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_stride"), TORCH_FN(QConvStride<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_padding"), TORCH_FN(QConvPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_dilation"), TORCH_FN(QConvDilation<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_groups"), TORCH_FN(QConvGroups<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); - - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_stride"), TORCH_FN(QConvStride<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_padding"), TORCH_FN(QConvPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_dilation"), TORCH_FN(QConvDilation<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_groups"), TORCH_FN(QConvGroups<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); - - // ConvTranspose is the same, however, we want to have different name. 
- m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); - - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_stride"), TORCH_FN(QConvStride<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_padding"), TORCH_FN(QConvPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_dilation"), TORCH_FN(QConvDilation<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_groups"), TORCH_FN(QConvGroups<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_stride"), TORCH_FN(QConvStride<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_padding"), TORCH_FN(QConvPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_dilation"), TORCH_FN(QConvDilation<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_groups"), TORCH_FN(QConvGroups<3>::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); -} - -} // namespace -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp new file mode 100644 index 000000000000..693e093b1209 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp @@ -0,0 +1,136 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_FBGEMM +template +std::tuple> PackedConvWeight< + kSpatialDim>::unpack() { + auto* packed_weights_p = w.get(); + // output channels + const int output_channels = packed_weights_p->outputChannels(); + const int input_channels = packed_weights_p->inputChannels(); + const int groups = packed_weights_p->groups(); + + const int kernel_d = kSpatialDim == 2 ? 1 : kernel[0]; + // R (kernel height) + const int kernel_h = kernel[kSpatialDim - 2]; + // S (kernel width) + const int kernel_w = kernel[kSpatialDim - 1]; + + const int C_per_G = input_channels / groups; + + // Tensor for unpacked weights + // Unpacked format would be physical KRS(C/G) but logical KCRS (channels + // first) because that's how + // ChannelsLast3d is not available now.FBGEMM stores the weights + // TODO: Unify 2d and 3d when ChannelsLast3d is ready. + at::Tensor unpacked_weights; + if (q_scheme == c10::kPerTensorAffine) { + unpacked_weights = kSpatialDim == 2 + ? 
at::_empty_affine_quantized( + {output_channels, C_per_G, kernel_h, kernel_w}, + device(c10::kCPU) + .dtype(c10::kQInt8) + .memory_format(c10::MemoryFormat::ChannelsLast), + w_scale[0], + w_zp[0], + c10::nullopt) + : at::native::fbgemm_utils:: + MakeEmptyAffineQuantizedChannelsLast3dTensor( + output_channels, + C_per_G, + kernel_d, + kernel_h, + kernel_w, + device(c10::kCPU).dtype(c10::kQInt8), + w_scale[0], + w_zp[0]); + } else if (q_scheme == c10::kPerChannelAffine) { + TORCH_CHECK( + !transpose(), + "Per Channel Quantization is currently disabled for transposed conv"); + auto scales = at::from_blob( + w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); + auto zero_points = at::from_blob( + w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); + unpacked_weights = kSpatialDim == 2 + ? at::_empty_per_channel_affine_quantized( + {output_channels, C_per_G, kernel_h, kernel_w}, + scales.toType(c10::kDouble), + zero_points.toType(c10::kLong), + 0, /* The output channel axis is 0 */ + device(c10::kCPU).dtype(c10::kQInt8), + c10::MemoryFormat::ChannelsLast) + : at::native::fbgemm_utils:: + MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( + output_channels, + C_per_G, + kernel_d, + kernel_h, + kernel_w, + device(c10::kCPU).dtype(c10::kQInt8), + scales.toType(c10::kDouble), + zero_points.toType(c10::kLong)); + } else { + TORCH_CHECK(false, "Unsupported qscheme: ", toString(q_scheme)); + } + int8_t* unpacked_weights_p = + reinterpret_cast(unpacked_weights.data_ptr()); + packed_weights_p->unpack(unpacked_weights_p); + if(transpose()){ + unpacked_weights = + at::native::fbgemm_utils::TransposeConvTensorUnpackConversion< + kSpatialDim>(unpacked_weights, groups); + } + return std::tuple>( + unpacked_weights, bias); +} + +template std::tuple> PackedConvWeight< + 2>::unpack(); +template std::tuple> PackedConvWeight< + 3>::unpack(); +#endif // USE_FBGEMM + +#ifdef USE_PYTORCH_QNNPACK +template +std::tuple> PackedConvWeightsQnnp< + kSpatialDim>::unpack() { + TORCH_CHECK( + kSpatialDim == 2, + "QNNPACK only supports conv2d_unpack right " + "now."); + TORCH_CHECK( + orig_weight.defined(), + "Cannot unpack weights. 
" + "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); + return std::tuple>(orig_weight, bias); +} + +template std::tuple> PackedConvWeightsQnnp< + 2>::unpack(); +template std::tuple> PackedConvWeightsQnnp< + 3>::unpack(); +#endif // USE_PYTORCH_QNNPACK + +#if AT_MKLDNN_ENABLED() +template +std::tuple> PackedConvWeightsOnednn< + kSpatialDim>::unpack() { + return std::tuple>( + orig_weight_, orig_bias_); +} + +template std::tuple> PackedConvWeightsOnednn< + 2>::unpack(); +template std::tuple> PackedConvWeightsOnednn< + 3>::unpack(); +#endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index cf18da771e5c..7579e3185174 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -405,7 +405,7 @@ at::Tensor& embedding_bag_byte_impl( if (!pruned_weights || fallback_to_no_sparse) { auto kernel_i8 = - fbgemm::GenerateEmbeddingSpMDM( + fbgemm::GenerateEmbeddingSpMDM( /*block_size=*/D, /*has_weight=*/per_sample_weights_.has_value(), /*normalize_by_lengths=*/false, diff --git a/aten/src/ATen/native/quantized/cpu/qgelu.cpp b/aten/src/ATen/native/quantized/cpu/qgelu.cpp index 7c0ee3cd784f..c07796f608d4 100644 --- a/aten/src/ATen/native/quantized/cpu/qgelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qgelu.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -15,9 +16,9 @@ namespace native { DEFINE_DISPATCH(qgelu_stub); -Tensor gelu_quantized_cpu(const Tensor& qx) { +Tensor gelu_quantized_cpu(const Tensor& qx, c10::string_view approximate) { Tensor qy; - qgelu_stub(qx.device().type(), qx, qy); + qgelu_stub(qx.device().type(), qx, qy, get_gelutype_enum(approximate)); return qy; } }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index ac055bf74a6e..d358f23c6af3 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -2,8 +2,10 @@ #include #include #include -#include +#include #include +#include +#include #include #include #include @@ -270,6 +272,161 @@ at::Tensor& PackedLinearWeight::apply_relu_out( #endif // USE_FBGEMM #ifdef USE_PYTORCH_QNNPACK + +#ifdef USE_XNNPACK +// TODO: add per_channel support in the future when xnnp supports it +template +at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + using underlying_t = typename scalar_t::underlying; + + std::lock_guard lock(qnnp_mutex_); + + const std::string func_name = kReluFused ? 
"quantized::linear_relu (xnnpack)" + : "quantized::linear (xnnpack)"; + TORCH_CHECK( + input.dim() >= 2, func_name, ": Input tensor rank should be >= 2."); + TORCH_CHECK( + !per_channel(), + func_name, + ": xnnpack does not currently have per_channel support."); + + const auto input_contig = input.contiguous(); + const auto input_scale = input_contig.q_scale(); + + const size_t rows_w = bias_.size(0); + const size_t cols_w = input_contig.size(input_contig.dim() - 1); + + auto status = xnn_status_invalid_state; + + // Create an operator iff not already created + if (!xnnp_linear_op || + (!this->input_scale.has_value() || + this->input_scale.value() != input_scale)) { + // Update the input scale so we may cache the op + this->input_scale = input_scale; + + xnn_operator_t xnnp_op = nullptr; + + const float* weight_scales_data = w_scales.data_ptr(); + + // prepare weights + underlying_t w_zp = static_cast( + orig_weight.q_zero_point() + + (std::is_same::value ? 128 : 0)); + + at::Tensor xnnp_weight = at::_empty_affine_quantized( + orig_weight.sizes(), + c10::CppTypeToScalarType::value, + weight_scales_data[0], + w_zp); + + // copy from the original weight and take care of dtype change if necessary + at::native::xnnp_utils::q8_copy_int8_weight_and_add_offset( + orig_weight, xnnp_weight); + + // Original bias was float, so we requantize it here. + at::Tensor qbias = at::native::quantize_per_tensor( + bias_, orig_weight.q_scale() * input_scale, 0, c10::kQInt32); + + // output limits + auto output_min = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).first + : std::numeric_limits::min(); + auto output_max = kReluFused + // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + ? activationLimits(output_scale, output_zero_point, Activation::RELU).second + : std::numeric_limits::max(); + + // Create an operator + status = at::native::xnnp_utils::xnnp_create_fully_connected_nc( + cols_w, /* input_channels */ + rows_w, /* output_channels */ + cols_w, /* input_stride */ + rows_w, /* output_stride */ + input_contig.q_zero_point(), + input_contig.q_scale(), + w_zp, + weight_scales_data[0], + reinterpret_cast( + xnnp_weight.template data_ptr()), + reinterpret_cast(qbias.data_ptr()), + output_zero_point, + output_scale, + output_min, + output_max, + 0, /* flags */ + &xnnp_op); + xnnp_linear_op = xnnpack_operator(xnnp_op); + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn create operator failed(", + status, + ")"); + } + + /* + * Allocate output Tensor and a buffer for XNNPACK to use + * The resulting matrix here is 2-D, let's view it with the original + * left hand dimensions of the input. Here are two examples: + * 1. If the input tensor is {M, K}, the output tensor is {M, N}. + * 2. If the input tensor is {b, M, K}, the output tensor is {b, M, N}. 
+ */ + std::vector out_sizes = input.sizes().vec(); + out_sizes.back() = static_cast(rows_w); + at::Tensor output = at::native::empty_affine_quantized( + out_sizes, + c10::CppTypeToScalarType::value, + c10::nullopt /* layout */, + c10::kCPU, + c10::nullopt /* pin_memory */, + output_scale, + output_zero_point, + input.suggest_memory_format()); + + // calculate batch_size + size_t rows_input = 1; + for (const auto i : c10::irange(input_contig.dim() - 1)) { + rows_input *= input_contig.size(i); + } + + // Setup the operator + status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc( + xnnp_linear_op.get(), + rows_input, /* batch_size */ + reinterpret_cast( + input_contig.template data_ptr()), + reinterpret_cast(output.template data_ptr()), + caffe2::pthreadpool_()); + + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn setup operator failed(", + status, + ")"); + + // Run the opeator + status = xnn_run_operator( + xnnp_linear_op.get(), // Linear op + caffe2::pthreadpool_() // threadpool + ); + TORCH_CHECK( + status == xnn_status_success, + func_name, + ": xnn run operator failed(", + status, + ")"); + + return output; +} +#endif // USE_XNNPACK + template at::Tensor PackedLinearWeightsQnnp::apply_impl( at::Tensor input, @@ -414,10 +571,35 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl( return output; } +#ifdef USE_XNNPACK +bool can_use_xnnp(c10::ScalarType dtype, bool per_channel) { + if(!at::native::xnnpack::available()) { + return false; + } + + bool supported_dtypes = dtype == c10::kQInt8; + bool invalid_config = per_channel; /* xnnp does not currently support + per-channel fully connected op */ + if (supported_dtypes && invalid_config) { + /* don't want this to fall through to QNNPACK */ + TORCH_CHECK( + false, + "quantized::linear (xnnpack): Unsupported config for dtype KQInt8"); + } + return supported_dtypes && !invalid_config; +} +#endif // USE_XNNPACK + at::Tensor PackedLinearWeightsQnnp::apply( at::Tensor input, double output_scale, int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), per_channel())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK return apply_impl(std::move(input), output_scale, output_zero_point); } @@ -425,11 +607,92 @@ at::Tensor PackedLinearWeightsQnnp::apply_relu( at::Tensor input, double output_scale, int64_t output_zero_point) { +#ifdef USE_XNNPACK + if (can_use_xnnp(input.scalar_type(), per_channel())) { + return apply_impl_xnnp( + input, output_scale, output_zero_point); + } /* fall through for unsupported types, configs, or shapes */ +#endif // USE_XNNPACK return apply_impl(std::move(input), output_scale, output_zero_point); } #endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() +template +at::Tensor PackedLinearWeightsOnednn::apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + const int64_t dim = input.dim(); + TORCH_CHECK( + dim != 0, + "qlinear (ONEDNN): input dim should be at least 1, but got 0"); + TORCH_CHECK(input.scalar_type() == c10::ScalarType::QUInt8, + "qlinear (ONEDNN): data type of input should be QUint8."); + + auto input_contig = input.expect_contiguous(); + auto& w = *(weight_.get()); + auto K = input.size(dim - 1), M = input.numel() / K, N = w.get_dim(1); + auto input_dims = {M, K}; + auto input_data_type = dnnl::memory::data_type::u8; + auto input_desc = ideep::tensor::desc(input_dims, input_data_type); + ideep::attr_t op_attr = 
ReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t(); + ideep::tensor x(input_desc, input_contig->data_ptr()); + auto dst_dims = {M, N}; + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/input.q_scale()); + const ideep::scale_t& weights_scales = w.get_scale(); + const ideep::scale_t& dst_scales = ideep::scale_t(1, 1.0/output_scale); // Scales of ONEDNN and PyTorch are reciprocal + const ideep::zero_point_t& src_zero_point = ideep::zero_point_t(1, input.q_zero_point()); + const ideep::zero_point_t& dst_zero_point = ideep::zero_point_t(1, output_zero_point); + // Compute: Use ideep::matmul_forward to support asymmetric quantization + // Allocate output Tensor + at::Tensor output = at::_empty_affine_quantized( + dst_dims, + at::device(c10::kCPU).dtype(c10::kQUInt8), + output_scale, + output_zero_point); + if (output.numel() == 0) { + return output; + } + ideep::tensor y({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}}, + output.data_ptr()); + if (bias_.has_value()) { + // Bias might be modified outside (e.g. by quantization bias correction). + // If so, update the prepacked bias as well. + if (bias_.value().get_data_handle() != orig_bias_.value().data_ptr()) { + bias_.value().init(bias_.value().get_desc(), orig_bias_.value().data_ptr()); + } + const auto& b = bias_.value(); + ideep::matmul_forward::compute_v2(x, w, b, y, 1.0f, 1.0f, src_scales, weights_scales, dst_scales, + src_zero_point, dst_zero_point, op_attr); + } else { + ideep::matmul_forward::compute_v2(x, w, y, 1.0f, 1.0f, src_scales, weights_scales, dst_scales, + src_zero_point, dst_zero_point, op_attr); + } + auto out_sizes = input.sizes().vec(); + out_sizes.back() = N; + if (output.sizes().vec() == out_sizes) + return output; + return output.reshape(out_sizes); +} + +at::Tensor PackedLinearWeightsOnednn::apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(std::move(input), output_scale, output_zero_point); +} + +at::Tensor PackedLinearWeightsOnednn::apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(std::move(input), output_scale, output_zero_point); +} + +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 676b2f1ce649..111255726dcf 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -2,8 +2,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -463,6 +464,99 @@ void PackedLinearWeightFp16::set_bias(c10::optional bias) { #endif // USE_FBGEMM +#if AT_MKLDNN_ENABLED() +template +at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl( + at::Tensor input, + bool reduce_range) { + // Dynamic: fp32 * int8 -> fp32 + using at::Tensor; + + TORCH_CHECK( + input.dim() >= 2, + "The dimension of input tensor should be larger than or equal to 2"); + TORCH_CHECK(input.scalar_type() == c10::ScalarType::Float, + "qlinear_dynamic (ONEDNN): data type of input should be float."); + + // Input -> uint8 + auto input_contig = input.contiguous(); + const int64_t dim = input.dim(); + auto input_reshaped = + dim == 2 ? 
input : input.reshape({-1, input.size(input.dim() - 1)}); + auto input_dims = input_reshaped.sizes().vec(); + auto input_data_type = dnnl::memory::data_type::f32; + auto input_desc = ideep::tensor::desc(input_dims, input_data_type); + ideep::attr_t op_attr = ReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t(); + ideep::tensor x; + x.init(input_desc, input_contig.data_ptr()); + // Find quantization parameters + float x_max = 0, x_min = 0; + if (input.numel() > 0) { + x_min = input_contig.min().item(); + x_max = input_contig.max().item(); + } + const int precision = 8; + auto q_params = quant_utils::ChooseQuantizationParams( + /*min=*/x_min, + /*max=*/x_max, + /*qmin=*/0, + /*qmax=*/(1 << precision) - 1, + /*preserve_sparsity=*/false, + /*force_scale_power_of_two=*/false, + /*reduce_range=*/reduce_range); + const std::vector& src_zero_point = std::vector(1, q_params.zero_point); + // weights, dst + auto w = *(weight_.get()); + auto dst_dims = {x.get_dim(0), w.get_dim(1)}; + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0/q_params.scale); + const ideep::scale_t& weights_scales = w.get_scale(); + // Compute -> f32 + // Use ideep::matmul_forward instead of ideep::inner_product_forward, + // since the latter does not support asymmetric quantization + // Allocate output Tensor + at::Tensor output = at::empty(dst_dims, input.options().dtype(at::kFloat)); + if (output.numel() == 0) return output; + ideep::tensor y({dst_dims, ideep::tensor::data_type::f32, + {output.strides().cbegin(), output.strides().cend()}}, + output.data_ptr()); + if (bias_.has_value()) { + // Bias might be modified outside (e.g. by quantization bias correction). + // If so, update the prepacked bias as well. + if (bias_.value().get_data_handle() != orig_bias_.value().data_ptr()) { + bias_.value().init(bias_.value().get_desc(), orig_bias_.value().data_ptr()); + } + const ideep::tensor b = bias_.value(); + ideep::matmul_forward::compute_v2(x, w, b, y, 1.0f, 1.0f, + src_scales, weights_scales, ideep::scale_t(), + src_zero_point, ideep::zero_point_t(), op_attr); + } else { + ideep::matmul_forward::compute_v2(x, w, y, 1.0f, 1.0f, + src_scales, weights_scales, ideep::scale_t(), + src_zero_point, ideep::zero_point_t(), op_attr); + } + auto out_sizes = input.sizes().vec(); + out_sizes.back() = w.get_dim(1); + if (output.sizes().vec() == out_sizes) + return output; + return output.reshape(out_sizes); +} + +at::Tensor PackedLinearWeightsOnednn::apply_dynamic( + at::Tensor input, + bool reduce_range) { + return apply_dynamic_impl( + std::move(input), reduce_range); +} + +at::Tensor PackedLinearWeightsOnednn::apply_dynamic_relu( + at::Tensor input, + bool reduce_range) { + return apply_dynamic_impl( + std::move(input), reduce_range); +} + +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { namespace { diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 93c54dc10889..6ca6905119f4 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -1,9 +1,9 @@ #include -#include #include #include -#include +#include #include +#include #include #include #include @@ -194,6 +194,80 @@ c10::intrusive_ptr PackedLinearWeightFp16::prepack( } #endif // USE_FBGEMM +#if AT_MKLDNN_ENABLED() +c10::intrusive_ptr PackedLinearWeightsOnednn::prepack( + at::Tensor weight, + c10::optional bias) { + TORCH_CHECK( + weight.dim() == 2, + "The weight tensor for quantized::linear_prepack 
(onednn) should" + " be 2-dimensional."); + // Weight + std::vector dims = weight.sizes().vec(); + auto N = weight.size(0); + std::vector wgt_zero_points; + ideep::scale_t wgt_scales; + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + TORCH_CHECK( + weight.q_zero_point() == 0, + "quantized::linear_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0, but got ", weight.q_zero_point()); + wgt_zero_points = std::vector(1, weight.q_zero_point()); + wgt_scales = ideep::scale_t(1, 1.0/weight.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal + } else if (qtype == c10::kPerChannelAffine) { + wgt_zero_points.resize(N); + wgt_scales.resize(N); + for (int i = 0; i < N; ++i) { + wgt_zero_points[i] = weight.q_per_channel_zero_points()[i].item(); + TORCH_CHECK( + wgt_zero_points[i] == 0, + "quantized::linear_prepack: ONEDNN only supports symmetric quantization of weight," + " whose zero point must be 0, but got ", wgt_zero_points[i], ", at index ", i); + wgt_scales[i] = 1.0f / weight.q_per_channel_scales()[i].item(); // Scales of ONEDNN and PyTorch are reciprocal + } + } else { + TORCH_CHECK(false, "Unsupported qscheme: ", toString(qtype)); + } + + // Prepack weight + auto weight_copy = weight.clone(); + ideep::tensor wgt = ideep::tensor({dims, dnnl::memory::data_type::s8}, weight_copy.data_ptr()); + wgt.transpose_(0, 1); // ONEDNN requires transposed weight + auto w_desc = ideep::matmul_forward::expected_weights_desc(wgt.get_dims(), dnnl::memory::data_type::s8, + dnnl::memory::data_type::u8); + ideep::tensor exp_wgt(w_desc); + exp_wgt.feed_from(wgt); + ideep::tensor * packed_weight_p = new ideep::tensor(exp_wgt); + packed_weight_p->set_scale(wgt_scales); + packed_weight_p->set_zero_point(wgt_zero_points); + std::unique_ptr weight_ptr(packed_weight_p); + // Bias + c10::optional onednn_bias{c10::nullopt}; + if (bias.has_value()) { + auto& b = bias.value(); + auto bias_size = b.sizes().vec(); + bias_size.insert(bias_size.begin(), 1); + TORCH_CHECK( + bias_size[1] == weight_ptr->get_dim(1), + "bias should have N elements: ", + std::to_string(weight_ptr->get_dim(1)), + ", but got ", bias_size[1]); + auto bias_desc = ideep::tensor::desc(bias_size, dnnl::memory::data_type::f32); + ideep::tensor packed_bias; + packed_bias.init(bias_desc, b.data_ptr()); + onednn_bias = c10::optional(packed_bias); + } + auto ret_ptr = c10::make_intrusive( + PackedLinearWeightsOnednn{ + std::move(weight_ptr), + onednn_bias, + weight, + bias}); + return ret_ptr; +} +#endif // #if AT_MKLDNN_ENABLED() + namespace at { namespace native { @@ -224,6 +298,11 @@ class QLinearPackWeightInt8 final { std::move(weight), std::move(bias)); } #endif +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedLinearWeightsOnednn::prepack(std::move(weight), std::move(bias)); + } +#endif // #if AT_MKLDNN_ENABLED() TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_prepack ", @@ -238,6 +317,9 @@ class QLinearPackWeightFp16 final { c10::optional bias) { auto& ctx = at::globalContext(); #ifdef USE_FBGEMM + // temporarily convert weight back to fp32, needs to be fixed + // after fbgemm fixes the interface for their prepacking op (take fp16 input0 + weight = weight.to(ScalarType::Float); if (ctx.qEngine() == at::QEngine::FBGEMM) { return PackedLinearWeightFp16::prepack( std::move(weight), std::move(bias)); @@ -251,6 +333,14 @@ class QLinearPackWeightFp16 final { "not supported by QNNPACK"); } #endif // USE_PYTORCH_QNNPACK 
+#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + TORCH_CHECK( + false, + "quantized::linear_prepack_fp16 is currently " + "not supported by ONEDNN"); + } +#endif // #if AT_MKLDNN_ENABLED() TORCH_CHECK( false, "Didn't find engine for operation quantized::linear_prepack_fp16 ", @@ -261,63 +351,18 @@ class QLinearPackWeightFp16 final { class QLinearPackWeightInt8Legacy final { public: static Tensor run(at::Tensor weight, c10::optional bias) { - auto& ctx = at::globalContext(); - auto options = weight.options(); - -#ifdef USE_FBGEMM - if (ctx.qEngine() == at::QEngine::FBGEMM) { - auto prepacked = - PackedLinearWeight::prepack(std::move(weight), std::move(bias)); - auto wrapped = - std::make_unique>( - std::move(prepacked)); - return cpp_custom_type_hack::create(std::move(wrapped), options); - } -#endif // USE_FBGEMM -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - auto prepacked = - PackedLinearWeightsQnnp::prepack(std::move(weight), std::move(bias)); - auto wrapped = - std::make_unique>( - std::move(prepacked)); - return cpp_custom_type_hack::create(std::move(wrapped), options); - } -#endif // USE_PYTORCH_QNNPACK - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::linear_prepack ", - toString(ctx.qEngine())); + TORCH_CHECK(false, + "This model uses an outdated version of quantized.linear_prepack. " + "Please re-export your model using the newer definitions in torch.jit.quantized"); } }; class QLinearPackWeightFp16Legacy final { public: static Tensor run(at::Tensor weight, c10::optional bias) { - auto& ctx = at::globalContext(); -#ifdef USE_FBGEMM - auto options = weight.options(); - if (ctx.qEngine() == at::QEngine::FBGEMM) { - auto prepacked = - PackedLinearWeightFp16::prepack(std::move(weight), std::move(bias)); - auto wrapped = - std::make_unique>( - std::move(prepacked)); - return cpp_custom_type_hack::create(std::move(wrapped), options); - } -#endif // USE_FBGEMM -#ifdef USE_PYTORCH_QNNPACK - if (ctx.qEngine() == at::QEngine::QNNPACK) { - TORCH_CHECK( - false, - "quantized::linear_prepack_fp16 is currently " - "not supported by QNNPACK"); - } -#endif // USE_PYTORCH_QNNPACK - TORCH_CHECK( - false, - "Didn't find engine for operation quantized::linear_prepack_fp16 ", - toString(ctx.qEngine())); + TORCH_CHECK(false, + "This model uses an outdated version of quantized.linear_prepack_fp16. 
" + "Please re-export your model using the newer definitions in torch.jit.quantized"); } }; diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp deleted file mode 100644 index 2a34e6748eb4..000000000000 --- a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp +++ /dev/null @@ -1,151 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -int register_linear_params(); - -#ifdef USE_FBGEMM -std::tuple> PackedLinearWeight::unpack() { - auto packB = w.get(); - - int64_t N = static_cast(packB->numCols()); - int64_t K = static_cast(packB->numRows()); - - at::Tensor weight_origin; - if (q_scheme == c10::kPerTensorAffine) { - weight_origin = at::_empty_affine_quantized( - {N, K}, at::device(c10::kCPU).dtype(c10::kQInt8), w_scale[0], w_zp[0]); - } else if (q_scheme == c10::kPerChannelAffine) { - auto scales = at::from_blob( - w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); - auto zero_points = at::from_blob( - w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); - - weight_origin = at::_empty_per_channel_affine_quantized( - {N, K}, - scales.toType(c10::kDouble), - zero_points.toType(c10::kLong), - 0, // The output channel axis is 0 - device(c10::kCPU).dtype(c10::kQInt8)); - } - - int8_t* weight_ptr_int8 = - reinterpret_cast(weight_origin.data_ptr()); - - // packB->printPackedMatrix("packedB inside fbgemm_unpack - // (QLinearUnpackWeightInt8): "); - packB->unpack(weight_ptr_int8); - - return std::tuple>( - weight_origin, bias_); -} -#endif // USE_FBGEMM - -#ifdef USE_PYTORCH_QNNPACK -std::tuple> PackedLinearWeightsQnnp:: - unpack() { - TORCH_CHECK( - orig_weight.defined(), - "Cannot unpack weights. " - "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); - return std::tuple>(orig_weight, bias_); -} -#endif // USE_PYTORCH_QNNPACK - -#ifdef USE_FBGEMM -std::tuple> PackedLinearWeightFp16:: - unpack() { - auto& packed_weight_ptr = w; - - auto nrows = packed_weight_ptr->numRows(); - auto ncols = packed_weight_ptr->numCols(); - - at::Tensor unpacked_weight = - at::empty({ncols, nrows}, at::kHalf, c10::MemoryFormat::Contiguous); - packed_weight_ptr->unpack( - static_cast(unpacked_weight.data_ptr()), - fbgemm::matrix_op_t::Transpose); - - return std::make_tuple(unpacked_weight.to(at::kFloat), bias_); -} -#endif // USE_FBGEMM - -namespace at { -namespace native { -namespace { - -class QLinearUnpackWeightInt8 final { - public: - static std::tuple> run( - const c10::intrusive_ptr& packed_weight) { - return packed_weight->unpack(); - } -}; - -class QLinearUnpackWeightFp16 final { - public: - static std::tuple> run( - const c10::intrusive_ptr& packed_weight) { - auto& ctx = at::globalContext(); - - TORCH_CHECK( - ctx.qEngine() != at::QEngine::QNNPACK, - "quantized::linear_unpack_fp16 is currently " - "not supported by QNNPACK"); - - return packed_weight->unpack(); - } -}; - -class QLinearUnpackWeightInt8Legacy final { - public: - static std::tuple> run( - const at::Tensor& packed_weight) { - TORCH_WARN_ONCE( - "quantized.linear_unpack(Tensor) is deprecated! 
Please " - "upgrade your model to use the newer quantized.linear_" - "unpack(LinearPackedParamsBase) overload"); - return cpp_custom_type_hack::cast< - c10::intrusive_ptr>(packed_weight) - ->unpack(); - } -}; - -class QLinearUnpackWeightFp16Legacy final { - public: - static std::tuple> run( - const at::Tensor& packed_weight) { - TORCH_WARN_ONCE( - "quantized.linear_unpack(Tensor) is deprecated! Please " - "upgrade your model to use the newer quantized.linear_" - "unpack(LinearPackedParamsBase) overload"); - auto& ctx = at::globalContext(); - - TORCH_CHECK( - ctx.qEngine() != at::QEngine::QNNPACK, - "quantized::linear_unpack_fp16 is currently " - "not supported by QNNPACK"); - - return cpp_custom_type_hack::cast< - c10::intrusive_ptr>(packed_weight) - ->unpack(); - } -}; - -TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack.legacy"), TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16.legacy"), TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); -} - -TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack"), TORCH_FN(QLinearUnpackWeightInt8::run)); - m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16"), TORCH_FN(QLinearUnpackWeightFp16::run)); -} - -} // namespace -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp new file mode 100644 index 000000000000..b7182bf0fa47 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +int register_linear_params(); + +#ifdef USE_FBGEMM +std::tuple> PackedLinearWeight::unpack() { + auto packB = w.get(); + + int64_t N = static_cast(packB->numCols()); + int64_t K = static_cast(packB->numRows()); + + at::Tensor weight_origin; + if (q_scheme == c10::kPerTensorAffine) { + weight_origin = at::_empty_affine_quantized( + {N, K}, at::device(c10::kCPU).dtype(c10::kQInt8), w_scale[0], w_zp[0]); + } else if (q_scheme == c10::kPerChannelAffine) { + auto scales = at::from_blob( + w_scale.data(), w_scale.size(), device(c10::kCPU).dtype(c10::kFloat)); + auto zero_points = at::from_blob( + w_zp.data(), w_zp.size(), device(c10::kCPU).dtype(c10::kInt)); + + weight_origin = at::_empty_per_channel_affine_quantized( + {N, K}, + scales.toType(c10::kDouble), + zero_points.toType(c10::kLong), + 0, // The output channel axis is 0 + device(c10::kCPU).dtype(c10::kQInt8)); + } + + int8_t* weight_ptr_int8 = + reinterpret_cast(weight_origin.data_ptr()); + + // packB->printPackedMatrix("packedB inside fbgemm_unpack + // (QLinearUnpackWeightInt8): "); + packB->unpack(weight_ptr_int8); + + return std::tuple>( + weight_origin, bias_); +} +#endif // USE_FBGEMM + +#ifdef USE_PYTORCH_QNNPACK +std::tuple> PackedLinearWeightsQnnp:: + unpack() { + TORCH_CHECK( + orig_weight.defined(), + "Cannot unpack weights. 
" + "Call at::globalContext()::setReleaseOriginalWeights(false) before packing or loading to enable unpacking."); + return std::tuple>(orig_weight, bias_); +} +#endif // USE_PYTORCH_QNNPACK + +#ifdef USE_FBGEMM +std::tuple> PackedLinearWeightFp16:: + unpack() { + auto& packed_weight_ptr = w; + + auto nrows = packed_weight_ptr->numRows(); + auto ncols = packed_weight_ptr->numCols(); + + at::Tensor unpacked_weight = + at::empty({ncols, nrows}, at::kHalf, c10::MemoryFormat::Contiguous); + packed_weight_ptr->unpack( + static_cast(unpacked_weight.data_ptr()), + fbgemm::matrix_op_t::Transpose); + + return std::make_tuple(unpacked_weight.to(at::kFloat), bias_); +} +#endif // USE_FBGEMM + +#if AT_MKLDNN_ENABLED() +std::tuple> PackedLinearWeightsOnednn::unpack() { + return std::tuple>( + orig_weight_, orig_bias_); +} +#endif // #if AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/quantized/cpu/qmatmul.cpp b/aten/src/ATen/native/quantized/cpu/qmatmul.cpp index 013966a52510..e42941fd0a35 100644 --- a/aten/src/ATen/native/quantized/cpu/qmatmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmatmul.cpp @@ -1,6 +1,12 @@ #include #include +#ifdef USE_RUY_QMATMUL +#include +#include +#include +#endif + namespace at { namespace native { @@ -21,6 +27,142 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) { "Both inputs to Matmul must have the same quantization scheme."); } +#ifdef USE_RUY_QMATMUL + +Tensor qmatmul( + const Tensor& qa, + const Tensor& qb, + const double output_scale, + const int64_t output_zero_point) { + check_inputs(qa, qb); + + const int64_t num_dims = qa.dim(); + const int64_t b_num_dims = qb.dim(); + + TORCH_CHECK( + num_dims == b_num_dims, + "MatMul operands should have the same dimensionality. (", num_dims, + " and ", b_num_dims, " provided)"); + TORCH_CHECK( + num_dims >= 2, + "Quantized Matmul currently only suports operands which are at least 2-dimensional. 
(", + num_dims, " provided)"); + + const int64_t m = qa.size(num_dims - 2); + const int64_t k = qa.size(num_dims - 1); + const int64_t b_k = qb.size(num_dims - 2); + const int64_t n = qb.size(num_dims - 1); + + TORCH_CHECK( + b_k == k, + "For Quantized Matmul, the size of tensor a (", k, + ") at dimension ", num_dims - 1, " must match the size of tensor b (", + b_k, ") at dimension ", num_dims - 2, "."); + + std::vector out_size_vec(num_dims); + size_t num_matmuls = 1; + for (int64_t i = 0; i < num_dims - 2; i++) { + const int64_t dim = qa.size(i); + const int64_t qb_dim = qb.size(i); + + TORCH_CHECK( + dim == qb_dim, + "For Quantized Matmul, the size of tensor a (", dim, + ") must match the size of tensor b (", qb_dim, + ") at dimension ", i); + + out_size_vec[i] = dim; + num_matmuls *= dim; + } + out_size_vec[num_dims - 2] = m; + out_size_vec[num_dims - 1] = n; + + Tensor out = at::_empty_affine_quantized( + IntArrayRef(out_size_vec), + at::device(kCPU) + .dtype(qa.scalar_type()) + .memory_format(qa.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + + const Tensor& qa_contig = qa.contiguous(); + const Tensor& qb_contig = qb.contiguous(); + + AT_DISPATCH_QINT_BYTE_TYPES(qa.scalar_type(), "qmatmul", [&] { + using underlying_t = typename scalar_t::underlying; + + const underlying_t* qa_data = reinterpret_cast( + qa_contig.data_ptr()); + const underlying_t* qb_data = reinterpret_cast( + qb_contig.data_ptr()); + underlying_t* out_data = + reinterpret_cast(out.data_ptr()); + + const size_t qa_stride = m * k; + const size_t qb_stride = k * n; + const size_t out_stride = m * n; + + auto matmuls = [&](int64_t begin, int64_t end) { + + ruy::Matrix qa_matrix; + ruy::MakeSimpleLayout( + m, k, ruy::Order::kRowMajor, qa_matrix.mutable_layout()); + qa_matrix.set_zero_point(qa.q_zero_point()); + + ruy::Matrix qb_matrix; + ruy::MakeSimpleLayout( + k, n, ruy::Order::kRowMajor, qb_matrix.mutable_layout()); + qb_matrix.set_zero_point(qb.q_zero_point()); + + ruy::Matrix out_matrix; + ruy::MakeSimpleLayout( + m, n, ruy::Order::kRowMajor, out_matrix.mutable_layout()); + out_matrix.set_zero_point(output_zero_point); + + // Requantization explanation: + // https://github.com/google/gemmlowp/blob/e844ffd17118c1e17d94e1ba4354c075a4577b88/doc/quantization.md + const double requantization_scale_inv = + (qa.q_scale() * qb.q_scale()) / output_scale; + + ruy::MulParams mul_params; + + int multiplier_fixedpoint; + int multiplier_exponent; + ruy_utils::quantize_multiplier(requantization_scale_inv, + &multiplier_fixedpoint, + &multiplier_exponent); + mul_params.set_multiplier_fixedpoint(multiplier_fixedpoint); + mul_params.set_multiplier_exponent(multiplier_exponent); + + const underlying_t* qa_subtensor = qa_data + begin * qa_stride; + const underlying_t* qb_subtensor = qb_data + begin * qb_stride; + underlying_t* out_subtensor = out_data + begin * out_stride; + + for (int64_t i = begin; i < end; i++) { + qa_matrix.set_data(qa_subtensor); + qb_matrix.set_data(qb_subtensor); + out_matrix.set_data(out_subtensor); + ruy::Mul(qa_matrix, + qb_matrix, + mul_params, + ruy_utils::get_ruy_context(), + &out_matrix); + + qa_subtensor += qa_stride; + qb_subtensor += qb_stride; + out_subtensor += out_stride; + } + }; + + at::parallel_for(0, num_matmuls, 1, matmuls); + }); + + return out; +} + +#else // ifdef USE_RUY_QMATMUL + Tensor qmatmul( const Tensor& qa, const Tensor& qb, @@ -34,6 +176,8 @@ Tensor qmatmul( rc, output_scale, output_zero_point, qa.scalar_type()); } +#endif // ifdef USE_RUY_QMATMUL + 
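A note on the requantization step in the ruy path above: the int32 accumulator is rescaled by requantization_scale_inv = (qa.q_scale() * qb.q_scale()) / output_scale, and ruy consumes that factor as a Q31 fixed-point multiplier plus a base-2 exponent (produced in-tree by ruy_utils::quantize_multiplier, which appears later in this diff). The standalone sketch below restates that decomposition and adds an illustrative, non-saturating application to a single accumulator; the real kernels use ruy's rounding and saturating arithmetic.

#include <cmath>
#include <cstdint>

// Split a positive real multiplier into a Q31 fixed-point value and a
// base-2 exponent, i.e. multiplier ~= fixedpoint * 2^(exponent - 31).
void quantize_multiplier_sketch(double multiplier,
                                std::int32_t* fixedpoint,
                                int* exponent) {
  const double q = std::frexp(multiplier, exponent);  // q in [0.5, 1)
  auto q_fixed = static_cast<std::int64_t>(std::round(q * (1ll << 31)));
  if (q_fixed == (1ll << 31)) {  // rounding overflow: 1.0*2^e == 0.5*2^(e+1)
    q_fixed /= 2;
    ++*exponent;
  }
  *fixedpoint = static_cast<std::int32_t>(q_fixed);
}

// Illustrative (non-saturating) application to an int32 accumulator:
// result ~= acc * multiplier, shifted into the output zero-point range.
// Valid while exponent < 31, which holds for typical requantization
// multipliers well below 2^31.
std::int32_t requantize_sketch(std::int32_t acc, std::int32_t fixedpoint,
                               int exponent, std::int32_t output_zero_point) {
  const std::int64_t prod = static_cast<std::int64_t>(acc) * fixedpoint;
  const int total_shift = 31 - exponent;
  const std::int64_t rounded =
      (prod + (1ll << (total_shift - 1))) >> total_shift;
  return static_cast<std::int32_t>(rounded) + output_zero_point;
}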
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::matmul"), TORCH_FN(qmatmul)); } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/BUILD.buck b/aten/src/ATen/native/quantized/cpu/qnnpack/BUILD.buck new file mode 100644 index 000000000000..85abc6a60916 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/BUILD.buck @@ -0,0 +1,143 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") + +cxx_library( + name = "pytorch_qnnpack", + srcs = ['src/add.c', 'src/average-pooling.c', 'src/channel-shuffle.c', 'src/clamp.c', 'src/conv-prepack.cc', 'src/conv-run.cc', 'src/convolution.c', 'src/deconv-run.cc', 'src/deconvolution.c', 'src/fc-dynamic-run.cc', 'src/fc-prepack.cc', 'src/fc-run.cc', 'src/fully-connected.c', 'src/fully-connected-sparse.c', 'src/global-average-pooling.c', 'src/hardsigmoid.c', 'src/hardswish.c', 'src/indirection.c', 'src/init.c', 'src/leaky-relu.c', 'src/max-pooling.c', 'src/operator-delete.c', 'src/operator-run.c', 'src/pack_block_sparse.cc', 'src/sigmoid.c', 'src/softargmax.c', 'src/tanh.c'], + deps = [':qnnp_interface', ':ukernels_asm', ':ukernels_neon', ':ukernels_psimd', ':ukernels_scalar', ':ukernels_sse2', ':ukernels_sse41', ':ukernels_ssse3', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = ['//third_party:cpuinfo'], + compiler_flags = ['-O2', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION', '-Wno-deprecated-declarations'], + preferred_linkage = "static", + exported_headers = subdir_glob([("src", "qnnpack/*.h"),("include", "*.h"),]), + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['armv7', ['-mfpu=neon']], ['^android-armv7$', ['-marm', '-mfloat-abi=softfp']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_ssse3", + srcs = ['wrappers/requantization/gemmlowp-ssse3.c', 'wrappers/requantization/precise-ssse3.c', 'wrappers/requantization/q31-ssse3.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['86', ['-mssse3', '-mno-sse4']], ['osmeta', ['-mosmeta-no-restrict-sse']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_psimd", + srcs = ['src/requantization/fp32-psimd.c', 'src/requantization/precise-psimd.c', 'src/sgemm/6x8-psimd.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv', '//third_party:psimd'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), 
("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['armv7', ['-mfpu=neon']], ['^android-armv7$', ['-marm', '-mfloat-abi=softfp']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_scalar", + srcs = ['src/requantization/fp32-scalar.c', 'src/requantization/gemmlowp-scalar.c', 'src/requantization/precise-scalar.c', 'src/requantization/q31-scalar.c', 'src/u8lut32norm/scalar.c', 'src/x8lut/scalar.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O2', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_asm", + srcs = ['wrappers/dummy.c', 'wrappers/hgemm/8x8-aarch32-neonfp16arith.S', 'wrappers/q8conv/4x8-aarch32-neon.S', 'wrappers/q8dwconv/up8x9-aarch32-neon.S', 'wrappers/q8dwconv/up8x9-aarch32-neon-per-channel.S', 'wrappers/q8gemm/4x8-aarch32-neon.S', 'wrappers/q8gemm/4x8-dq-aarch32-neon.S', 'wrappers/q8gemm/4x8c2-xzp-aarch32-neon.S', 'wrappers/q8gemm_sparse/4x4-packA-aarch32-neon.S', 'wrappers/q8gemm_sparse/4x8c1x4-dq-packedA-aarch32-neon.S', 'wrappers/q8gemm_sparse/4x8c8x1-dq-packedA-aarch32-neon.S', 'wrappers/q8gemm_sparse/8x4-packA-aarch64-neon.S', 'wrappers/q8gemm_sparse/8x8c1x4-dq-packedA-aarch64-neon.S', 'wrappers/q8gemm_sparse/8x8c8x1-dq-packedA-aarch64-neon.S', 'wrappers/q8conv/8x8-aarch64-neon.S', 'wrappers/q8gemm/8x8-aarch64-neon.S', 'wrappers/q8gemm/8x8-dq-aarch64-neon.S'], + deps = [], + exported_deps = [], + compiler_flags = ['-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['^iphoneos-armv7$', ['-mfpu=neon-vfpv4']], ['osmeta', ['-mfpu=neon-vfpv4']]], + platform_preprocessor_flags = [['android', ['-D__ELF__=1']], ['tizen', ['-D__ELF__=1']], ['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_sse41", + srcs = ['wrappers/requantization/gemmlowp-sse4.c', 'wrappers/requantization/precise-sse4.c', 'wrappers/requantization/q31-sse4.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['86', ['-msse4.1', '-mno-sse4.2']], ['osmeta', ['-mosmeta-no-restrict-sse']]], + 
platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_neon", + srcs = ['wrappers/q8avgpool/mp8x9p8q-neon.c', 'wrappers/q8avgpool/up8x9-neon.c', 'wrappers/q8avgpool/up8xm-neon.c', 'wrappers/q8conv/4x8-neon.c', 'wrappers/q8conv/8x8-neon.c', 'wrappers/q8dwconv/mp8x25-neon.c', 'wrappers/q8dwconv/mp8x25-neon-per-channel.c', 'wrappers/q8dwconv/mp8x27-neon.c', 'wrappers/q8dwconv/up8x9-neon.c', 'wrappers/q8dwconv/up8x9-neon-per-channel.c', 'wrappers/q8gavgpool/mp8x7p7q-neon.c', 'wrappers/q8gavgpool/up8x7-neon.c', 'wrappers/q8gavgpool/up8xm-neon.c', 'wrappers/q8gemm/4x-sumrows-neon.c', 'wrappers/q8gemm/4x8-dq-neon.c', 'wrappers/q8gemm/4x8-neon.c', 'wrappers/q8gemm/4x8c2-xzp-neon.c', 'wrappers/q8gemm/6x4-neon.c', 'wrappers/q8gemm/8x8-neon.c', 'wrappers/q8vadd/neon.c', 'wrappers/requantization/fp32-neon.c', 'wrappers/requantization/gemmlowp-neon.c', 'wrappers/requantization/precise-neon.c', 'wrappers/requantization/q31-neon.c', 'wrappers/sgemm/5x8-neon.c', 'wrappers/sgemm/6x8-neon.c', 'wrappers/u8clamp/neon.c', 'wrappers/u8maxpool/16x9p8q-neon.c', 'wrappers/u8maxpool/sub16-neon.c', 'wrappers/u8rmax/neon.c', 'wrappers/x8zip/x2-neon.c', 'wrappers/x8zip/x3-neon.c', 'wrappers/x8zip/x4-neon.c', 'wrappers/x8zip/xm-neon.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['armv7', ['-mfpu=neon']], ['^android-armv7$', ['-marm', '-mfloat-abi=softfp']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "ukernels_sse2", + srcs = ['wrappers/q8avgpool/mp8x9p8q-sse2.c', 'wrappers/q8avgpool/up8x9-sse2.c', 'wrappers/q8avgpool/up8xm-sse2.c', 'wrappers/q8conv/4x4c2-sse2.c', 'wrappers/q8dwconv/mp8x25-sse2.c', 'wrappers/q8dwconv/mp8x25-sse2-per-channel.c', 'wrappers/q8dwconv/mp8x27-sse2.c', 'wrappers/q8dwconv/up8x9-sse2.c', 'wrappers/q8dwconv/up8x9-sse2-per-channel.c', 'wrappers/q8gavgpool/mp8x7p7q-sse2.c', 'wrappers/q8gavgpool/up8x7-sse2.c', 'wrappers/q8gavgpool/up8xm-sse2.c', 'wrappers/q8gemm/2x4c8-sse2.c', 'wrappers/q8gemm/4x4c2-dq-sse2.c', 'wrappers/q8gemm/4x4c2-sse2.c', 'wrappers/q8gemm_sparse/8x4c1x4-packed-sse2.c', 'wrappers/q8vadd/sse2.c', 'wrappers/requantization/fp32-sse2.c', 'wrappers/requantization/gemmlowp-sse2.c', 'wrappers/requantization/precise-sse2.c', 'wrappers/requantization/q31-sse2.c', 'wrappers/u8clamp/sse2.c', 'wrappers/u8maxpool/16x9p8q-sse2.c', 'wrappers/u8maxpool/sub16-sse2.c', 'wrappers/u8rmax/sse2.c', 'wrappers/x8zip/x2-sse2.c', 'wrappers/x8zip/x3-sse2.c', 'wrappers/x8zip/x4-sse2.c', 'wrappers/x8zip/xm-sse2.c'], + deps = [':qnnp_interface', '//third_party:cpuinfo', '//third_party:FP16', '//third_party:FXdiv'], + exported_deps = [], + compiler_flags = ['-O3', '-ffast-math', '-Wno-error=unused-variable', '-Wno-shadow', '-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + 
preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_compiler_flags = [['86', ['-msse2', '-mno-sse3']]], + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) + + +cxx_library( + name = "qnnp_interface", + srcs = [], + deps = ['//third_party:pthreadpool_header'], + exported_deps = [], + compiler_flags = ['-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION'], + preferred_linkage = "static", + header_namespace = "", + headers = subdir_glob([("src", "**/*.c"), ("src", "q8gemm_sparse/*.h"), ("src", "qnnpack/*.h"), ("src", "requantization/*.h")]), + link_whole = False, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + visibility = ['PUBLIC'], +) diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake b/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake index 30cc61dc17fb..4a86d641e412 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/cmake/DownloadGoogleTest.cmake @@ -10,7 +10,7 @@ project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.8.0.zip + URL https://github.com/google/googletest/archive/release-1.10.0.zip URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest" diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake index 30cc61dc17fb..4a86d641e412 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/deps/clog/cmake/DownloadGoogleTest.cmake @@ -10,7 +10,7 @@ project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.8.0.zip + URL https://github.com/google/googletest/archive/release-1.10.0.zip URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest" diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h index 1f6d6f1d9105..60ea7822a760 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h @@ -6,8 +6,8 @@ #include #include -#include -#include +#include +#include #include #include @@ -40,6 +40,7 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { orig_weight(std::move(orig_weight)), bias_(at::native::mobile::allocate_padded_contiguous_if_needed( bias, bias.suggest_memory_format())), + per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), input_scale(std::move(input_scale)), w_scales(w_scales), w_zero_points(std::move(w_zps)) {} @@ -47,6 +48,7 @@ struct PackedLinearWeightsQnnp : public 
LinearPackedParamsBase { std::unique_ptr w; at::Tensor orig_weight; at::Tensor bias_; + bool per_channel_; c10::optional input_scale; at::Tensor w_scales; std::vector w_zero_points; @@ -74,8 +76,23 @@ struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { at::Tensor weight, c10::optional bias); + bool per_channel() const { + return per_channel_; + } + private: std::mutex qnnp_mutex_; + +#ifdef USE_XNNPACK + xnnpack_operator xnnp_linear_op; + + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK + template at::Tensor apply_impl( at::Tensor input, @@ -112,6 +129,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { dilation_(std::move(dilation)), groups_(groups), transpose_(transpose), + is_per_channel_(is_per_channel), input_scale(input_scale), kernel_(std::move(kernel)), w_scales(w_scale), @@ -200,7 +218,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { convolution->input_padding_height = padding_[kSpatialDim - 2]; convolution->input_padding_width = padding_[kSpatialDim - 1]; convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0; - convolution->per_channel = is_per_channel; + convolution->per_channel = is_per_channel_; convolution->transpose = transpose_; const uint32_t kr = pytorch_qnnp_params.q8conv.kr; @@ -260,6 +278,9 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { } std::unique_ptr convolution_op; + #ifdef USE_XNNPACK + xnnpack_operator xnnp_convolution_op; + #endif // USE_XNNPACK std::unique_ptr w; at::Tensor orig_weight; at::Tensor bias; @@ -269,6 +290,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { torch::List dilation_; int64_t groups_; bool transpose_; + bool is_per_channel_; c10::optional input_scale; std::vector kernel_; at::Tensor w_scales; @@ -326,6 +348,10 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { return transpose_; } + bool per_channel() const { + return is_per_channel_; + } + private: std::mutex qnnp_mutex_; template @@ -333,6 +359,14 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase { const at::Tensor& input, double output_scale, int64_t output_zero_point); + +#ifdef USE_XNNPACK + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK }; enum class Activation : uint8_t { NONE = 0, RELU = 1 }; diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp new file mode 100644 index 000000000000..cfe326aed421 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp @@ -0,0 +1,148 @@ +#include +#include + +#ifdef USE_PYTORCH_QNNPACK +#include +#include +#include +#include +#endif // USE_PYTORCH_QNNPACK + +namespace at { +namespace native { + +namespace { + +#ifdef USE_PYTORCH_QNNPACK + +const static float qnnpack_softmax_output_scale = 0x1.0p-8f; +const static int qnnpack_softmax_output_zero_point = 0; + +bool is_qnnpack_compatible( + const Tensor& qx, + const double output_scale, + const int64_t output_zero_point) { + return ( + (qx.qscheme() == kPerTensorAffine || + qx.qscheme() == kPerTensorSymmetric) && + qx.scalar_type() == c10::kQUInt8 && qx.ndimension() > 0 && + output_scale == qnnpack_softmax_output_scale && + output_zero_point == qnnpack_softmax_output_zero_point); +} + +Tensor qsoftmax_qnnpack(const Tensor& qx, const int64_t dim) { + /* + Cases for contiguity/dimensionality + 1) stride along target dim is 1 + 
requires no change to qx + 2) dim is the last dimension (but qx is not contiguous) + requires using qx.contiguous() + 3) other + requires permuting qx.contiguous() + */ + + const int64_t last_dim = qx.dim() - 1; + c10::optional> permuted_dims = c10::nullopt; + c10::optional qx_contig = c10::nullopt; + const at::Tensor* qx_contig_ptr = nullptr; + + if (qx.stride(dim) == 1) { + qx_contig_ptr = &qx; + } else if (dim == last_dim) { + qx_contig = qx.contiguous(); + qx_contig_ptr = &qx_contig.value(); + } else { + permuted_dims = std::vector(qx.dim()); + std::iota(permuted_dims->begin(), permuted_dims->end(), 0); + permuted_dims->at(last_dim) = dim; + permuted_dims->at(dim) = last_dim; + qx_contig = qx.permute(permuted_dims.value()).contiguous(); + qx_contig_ptr = &qx_contig.value(); + } + + at::Tensor qy = at::_empty_affine_quantized( + qx_contig_ptr->sizes(), + at::device(kCPU) + .dtype(qx.scalar_type()) + .memory_format(qx_contig_ptr->suggest_memory_format()), + qnnpack_softmax_output_scale, + qnnpack_softmax_output_zero_point, + c10::nullopt); + + const size_t channels = qx.size(dim); + const float input_scale = static_cast(qx.q_scale()); + const uint32_t flags = 0; + const size_t batch_size = qx.numel() / channels; + const uint8_t* input = + reinterpret_cast(qx_contig_ptr->data_ptr()); + const size_t input_stride = channels; + uint8_t* output = reinterpret_cast(qy.data_ptr()); + const size_t output_stride = channels; + + initQNNPACK(); + pytorch_qnnp_operator_t softargmax = nullptr; + std::unique_ptr softmax_op( + softargmax); + + pytorch_qnnp_status status = pytorch_qnnp_create_softargmax_nc_q8( + channels, + input_scale, + qnnpack_softmax_output_zero_point, + qnnpack_softmax_output_scale, + flags, + &softargmax); + TORCH_CHECK( + status == pytorch_qnnp_status_success, + "failed to create QNNPACK Softmax operator"); + CHECK_NOTNULL(softargmax); + + status = pytorch_qnnp_setup_softargmax_nc_q8( + softargmax, batch_size, input, input_stride, output, output_stride); + TORCH_CHECK( + status == pytorch_qnnp_status_success, + "failed to setup QNNPACK Softmax operator"); + + pthreadpool_t threadpool = caffe2::pthreadpool_(); + status = pytorch_qnnp_run_operator(softargmax, threadpool); + TORCH_CHECK( + status == pytorch_qnnp_status_success, + "failed to run QNNPACK Softmax operator"); + + return permuted_dims.has_value() ? 
qy.permute(permuted_dims.value()) : qy; +} + +#endif // USE_PYTORCH_QNNPACK + +Tensor qsoftmax_naive( + const Tensor& qx, + const int64_t dim, + const double output_scale, + const int64_t output_zero_point) { + Tensor rx = at::dequantize(qx); + Tensor ry = at::softmax(rx, dim); + return at::quantize_per_tensor( + ry, output_scale, output_zero_point, qx.scalar_type()); +} + +Tensor qsoftmax( + const Tensor& qx, + const int64_t dim, + const double output_scale, + const int64_t output_zero_point) { +#ifdef USE_PYTORCH_QNNPACK + if (at::globalContext().qEngine() == at::QEngine::QNNPACK && + is_qnnpack_compatible(qx, output_scale, output_zero_point)) { + return qsoftmax_qnnpack(qx, dim); + } +#endif // USE_PYTORCH_QNNPACK + return qsoftmax_naive(qx, dim, output_scale, output_zero_point); +} + +TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::softmax"), TORCH_FN(qsoftmax)); +} + +} // namespace + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/quantized_ops.h b/aten/src/ATen/native/quantized/cpu/quantized_ops.h index a1766380fe53..bfa1f1f77562 100644 --- a/aten/src/ATen/native/quantized/cpu/quantized_ops.h +++ b/aten/src/ATen/native/quantized/cpu/quantized_ops.h @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,7 +9,7 @@ namespace native { using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, const Scalar& /*negval_*/); -using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point); using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); using qclamp_fn = void (*)( diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp index ab30cd7d3810..2fcd308cfd82 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp @@ -1,7 +1,9 @@ #include +#include #include #include #include +#include #include #include @@ -13,6 +15,18 @@ namespace at { namespace native { namespace { +// pre calcuate interpolation params on width +struct UpsampleBilinearParamW { + int64_t w1, w1p; + float w0lambda, w1lambda; + + UpsampleBilinearParamW(int64_t w1, int64_t w1p, float w0lambda, float w1lambda) + : w1(w1) + , w1p(w1p) + , w0lambda(w0lambda) + , w1lambda(w1lambda) {} +}; + // at::native functions for the native_functions.yaml template static void upsample_bilinear2d_out_frame( @@ -50,51 +64,73 @@ static void upsample_bilinear2d_out_frame( const auto rheight = area_pixel_compute_scale( input_height, output_height, align_corners, scales_h); - const auto rwidth = - area_pixel_compute_scale(input_width, output_width, align_corners, scales_w); + const auto rwidth = area_pixel_compute_scale( + input_width, output_width, align_corners, scales_w); + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) float output_scale = output.q_scale() / input.q_scale(); const int64_t input_q_zero_point = input.q_zero_point(); const int64_t output_q_zero_point = output.q_zero_point(); - for (const auto h2 : c10::irange(output_height)) { - const auto h1r = area_pixel_compute_source_index( - rheight, h2, align_corners, 
/*cubic=*/false); + std::vector params_w; + params_w.reserve(output_width); + for (const auto w2 : c10::irange(output_width)) { + const auto w1r = area_pixel_compute_source_index( + rwidth, w2, align_corners, /*cubic=*/false); + + const int64_t w1 = w1r; + const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; + + const float w1lambda = w1r - w1; + const float w0lambda = static_cast(1.) - w1lambda; + + params_w.emplace_back(w1, w1p, w0lambda, w1lambda); + } + + // compared to 'nearest', each requires 4 points and takes additional * and + + // set the scale to be 16. + int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, output_width) / 16; + at::parallel_for(0, channels * output_height, grain_size, [&](int64_t begin, int64_t end) { + int64_t nc{0}, h2{0}; + data_index_init(begin, nc, channels, h2, output_height); + + for (const auto i : c10::irange(begin, end)) { + const auto h1r = area_pixel_compute_source_index( + rheight, h2, align_corners, /*cubic=*/false); - const int64_t h1 = h1r; - const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; + const int64_t h1 = h1r; + const int64_t h1p = (h1 < input_height - 1) ? 1 : 0; - const float h1lambda = h1r - h1; - const float h0lambda = static_cast(1.) - h1lambda; + const float h1lambda = h1r - h1; + const float h0lambda = static_cast(1.) - h1lambda; - for (const auto w2 : c10::irange(output_width)) { - const auto w1r = area_pixel_compute_source_index( - rwidth, w2, align_corners, /*cubic=*/false); + const auto* i_ptr = &i_p[nc * input_height * input_width]; + auto* pos2 = &o_p[i * output_width]; - const int64_t w1 = w1r; - const int64_t w1p = (w1 < input_width - 1) ? 1 : 0; + for (const auto w2 : c10::irange(output_width)) { + const auto& param_w = params_w[w2]; + const int64_t w1 = param_w.w1; + const int64_t w1p = param_w.w1p; + const float w0lambda = param_w.w0lambda; + const float w1lambda = param_w.w1lambda; - const float w1lambda = w1r - w1; - const float w0lambda = static_cast(1.) 
- w1lambda; - const typename scalar_t::underlying* pos1 = i_p + h1 * input_width + w1; - typename scalar_t::underlying* pos2 = o_p + h2 * output_width + w2; + const auto* pos1 = i_ptr + h1 * input_width + w1; - for (const auto c : c10::irange(channels)) { - (void)c; //Suppress unused variable warning float result = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + h1lambda * (w0lambda * pos1[h1p * input_width] + w1lambda * pos1[h1p * input_width + w1p]) - input_q_zero_point; // requantization - pos2[0] = at::native::quantize_val( + pos2[w2] = at::native::quantize_val( output_scale, output_q_zero_point, result) .val_; - pos1 += input_width * input_height; - pos2 += output_width * output_height; } + + data_index_step(nc, channels, h2, output_height); } - } + }); + } } // namespace @@ -178,7 +214,7 @@ using at::native::upsample::get_scale_value; Tensor upsample_bilinear2d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp index 377ef15790b1..c4f8e452c95c 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp @@ -1,8 +1,10 @@ #include +#include #include #include #include #include +#include #include @@ -44,25 +46,32 @@ static void upsample_nearest2d_out_frame( return; } - for (const auto h2 : c10::irange(output_height)) { - const int64_t h1 = - nn_compute_source_index_fn(height_scale, h2, input_height); + std::unique_ptr input_offset_arr(new int64_t[output_width]); + int64_t* input_offset = input_offset_arr.get(); - for (const auto w2 : c10::irange(output_width)) { - const int64_t w1 = - nn_compute_source_index_fn(width_scale, w2, input_width); + for (const auto w2 : c10::irange(output_width)) { + const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width); + input_offset[w2] = w1; + } + + int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, output_width); + at::parallel_for(0, channels * output_height, grain_size, [&](int64_t begin, int64_t end) { + int64_t nc{0}, h2{0}; + data_index_init(begin, nc, channels, h2, output_height); - const auto* pos1 = &i_p[h1 * input_width + w1]; - auto* pos2 = &o_p[h2 * output_width + w2]; + for (const auto i : c10::irange(begin, end)) { + const int64_t h1 = nn_compute_source_index_fn(height_scale, h2, input_height); + const auto* pos1 = &i_p[nc * input_height * input_width + h1 * input_width]; + auto* pos2 = &o_p[i * output_width]; - for (const auto c : c10::irange(channels)) { - (void)c; //Suppress unused variable warning - pos2[0] = pos1[0]; - pos1 += input_height * input_width; - pos2 += output_height * output_width; + for (const auto w2 : c10::irange(output_width)) { + const int64_t w1 = input_offset[w2]; + pos2[w2] = pos1[w1]; } + + data_index_step(nc, channels, h2, output_height); } - } + }); } template @@ -80,29 +89,24 @@ static void upsample_nearest2d_out_frame_nhwc( float height_scale = compute_scales_value(scales_h, input_height, output_height); float width_scale = compute_scales_value(scales_w, input_width, output_width); - for (const auto b : c10::irange(nbatch)) { - auto* i_p = reinterpret_cast(idata + b * input_height * input_width * channels); - auto* o_p = reinterpret_cast(odata + b * output_height * output_width * 
channels); - // special case: just copy - if (input_height == output_height && input_width == output_width) { - std::memcpy(o_p, i_p, channels * input_height * input_width * sizeof(typename scalar_t::underlying)); - return; - } + at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t b{0}, h2{0}, w2{0}; + data_index_init(begin, b, nbatch, h2, output_height, w2, output_width); - for (const auto h2 : c10::irange(output_height)) { - const int64_t h1 = - nn_compute_source_index_fn(height_scale, h2, input_height); + for (const auto i : c10::irange(begin, end)) { + auto* i_p = reinterpret_cast(idata + b * input_height * input_width * channels); + auto* o_p = reinterpret_cast(odata + i * channels); - for (const auto w2 : c10::irange(output_width)) { - const int64_t w1 = - nn_compute_source_index_fn(width_scale, w2, input_width); + const int64_t h1 = nn_compute_source_index_fn(height_scale, h2, input_height); + const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width); - const auto* pos1 = &i_p[(h1 * input_width + w1)*channels]; - auto* pos2 = &o_p[(h2 * output_width + w2)*channels]; - std::memcpy(pos2, pos1, channels * sizeof(typename scalar_t::underlying)); - } + const auto* pos1 = &i_p[(h1 * input_width + w1)*channels]; + auto* pos2 = &o_p[0]; + std::memcpy(pos2, pos1, channels * sizeof(typename scalar_t::underlying)); + + data_index_step(b, nbatch, h2, output_height, w2, output_width); } - } + }); } template @@ -137,6 +141,12 @@ Tensor _upsample_nearest2d_quantized_cpu( input.q_zero_point(), c10::nullopt); + // special case: just copy + if (input_height == output_height && input_width == output_width) { + output.copy_(input); + return output; + } + AT_DISPATCH_QINT_TYPES(input.scalar_type(), "upsample_nearest2d", [&] { auto* idata = static_cast(input.data_ptr()); auto* odata = static_cast(output.data_ptr()); @@ -202,7 +212,7 @@ Tensor _upsample_nearest_exact2d_quantized_cpu( Tensor upsample_nearest2d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); @@ -212,7 +222,7 @@ Tensor upsample_nearest2d_quantized_cpu( Tensor _upsample_nearest_exact2d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_h = get_scale_value(scale_factors, 0); diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp index db4077ef4328..d2e835421336 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp @@ -232,7 +232,7 @@ Tensor _upsample_nearest_exact3d_quantized_cpu( Tensor upsample_nearest3d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); @@ -243,7 +243,7 @@ Tensor upsample_nearest3d_quantized_cpu( Tensor _upsample_nearest_exact3d_quantized_cpu( const Tensor& input, - c10::optional output_size, + at::OptionalIntArrayRef output_size, c10::optional> scale_factors) { auto osize = 
compute_output_size(input.sizes(), output_size, scale_factors); auto scale_d = get_scale_value(scale_factors, 0); diff --git a/aten/src/ATen/native/quantized/cpu/ruy_utils.cpp b/aten/src/ATen/native/quantized/cpu/ruy_utils.cpp new file mode 100644 index 000000000000..d0164f736352 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/ruy_utils.cpp @@ -0,0 +1,37 @@ +#ifdef USE_RUY_QMATMUL + +#include +#include + +namespace at { +namespace native { +namespace ruy_utils { + +static thread_local ruy::Context context; + +ruy::Context* get_ruy_context() { + return &context; +} + +// Adopted from Ruy: +// https://github.com/google/ruy/blob/2d950b3bfa7ebfbe7a97ecb44b1cc4da5ac1d6f0/ruy/test.h#L1602 +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent) { + TORCH_CHECK(scale > 0, "Quantization scale (", scale, ") must be positive."); + const double q = std::frexp(scale, multiplier_exponent); + auto q_fixed = static_cast(std::round(q * (1ll << 31))); + TORCH_CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + ++*multiplier_exponent; + } + TORCH_CHECK(q_fixed <= std::numeric_limits::max()); + *multiplier_fixedpoint = static_cast(q_fixed); +} + +} // namespace ruy_utils +} // namespace native +} // namespace at + +#endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/ruy_utils.h b/aten/src/ATen/native/quantized/cpu/ruy_utils.h new file mode 100644 index 000000000000..aeb332af4eca --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/ruy_utils.h @@ -0,0 +1,21 @@ +#pragma once + +#ifdef USE_RUY_QMATMUL + +#include + +namespace at { +namespace native { +namespace ruy_utils { + +ruy::Context* get_ruy_context(); + +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent); + +} // namespace ruy_utils +} // namespace native +} // namespace at + +#endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp new file mode 100644 index 000000000000..8f81c8ea8d5e --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp @@ -0,0 +1,89 @@ +#ifdef USE_XNNPACK + +#include +#include +#include +#include + +namespace at { +namespace native { +namespace xnnp_utils { + +std::vector get_mem_format_aware_shape(const at::Tensor& in) { + const auto mem_format = in.suggest_memory_format(); + const auto& sizes = in.sizes(); + std::vector ret(sizes.begin(), sizes.end()); + if (mem_format == c10::MemoryFormat::ChannelsLast) { + // NCHW -> NHWC + // 0123 -> 0231 + ret[1] = sizes[2]; /* H */ + ret[2] = sizes[3]; /* W */ + ret[3] = sizes[1]; /* C */ + } else if (mem_format == c10::MemoryFormat::ChannelsLast3d) { + // NCDHW -> NDHWC + // 01234 -> 02341 + ret[1] = sizes[2]; /* D */ + ret[2] = sizes[3]; /* H */ + ret[3] = sizes[4]; /* W */ + ret[4] = sizes[1]; /* C */ + } + return ret; +} + +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out) { + using T = typename PT::underlying; + static constexpr auto offset = std::is_same::value ? 
128 : 0; + TORCH_CHECK( + in.scalar_type() == c10::kQInt8, + "q8_copy_int8_weight_and_add_offset: Expected input weight data type ", + toString(c10::kQInt8), + " but got ", + toString(in.scalar_type())) + const int8_t* in_ptr = + reinterpret_cast(in.data_ptr()); + T* out_ptr = reinterpret_cast(out.data_ptr()); + + for (const auto i : c10::irange(in.numel())) { + out_ptr[i] = static_cast(static_cast(in_ptr[i]) + offset); + } +} + +template void q8_copy_int8_weight_and_add_offset( + const at::Tensor& in, + at::Tensor& out); +template void q8_copy_int8_weight_and_add_offset( + const at::Tensor& in, + at::Tensor& out); + +/* + * Stolen from fbgemm_utils::ConvertConvWeightsToChannelLastTensor to avoid + * dependence on USE_FBGEMM. Reorder weights to the format xnnpack expects. + * TODO: add a 3d variant. + */ +template <> +Tensor convert_conv_weights_to_channel_last_tensor<2>( + const at::Tensor& src, + int groups, + bool transpose) { + return transpose ? + // 2D conv transpose weight transform + // IC OC/G KH KW -> G OC/G KH KW IC/G + [&]() { + auto ic_g_oc_g_hw_tensors = src.chunk(groups); + for (auto& tensor : ic_g_oc_g_hw_tensors) { + tensor = tensor.unsqueeze(0); + } + auto fused_tensor = at::cat(ic_g_oc_g_hw_tensors); + set_quantizer_(fused_tensor, src.quantizer()); + return fused_tensor.permute({0, 2, 3, 4, 1}) + .contiguous(c10::MemoryFormat::Contiguous); + }() + // 2d conv weight transform + : src.contiguous(c10::MemoryFormat::ChannelsLast); +} +} // namespace xnnp_utils +} // namespace native +} // namespace at + +#endif // USE_XNNPACK diff --git a/aten/src/ATen/native/quantized/cpu/xnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.h new file mode 100644 index 000000000000..78f325263f4f --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/xnnpack_utils.h @@ -0,0 +1,279 @@ +#pragma once + +#ifdef USE_XNNPACK +#include + +#include +#include + +using xnnpack_operator = at::native::xnnpack::Operator; + +namespace at { +namespace native { +namespace xnnp_utils { + +/* + * Return shape in the same order as the memory format + * e.g. channels_last will return NHWC instead of NCHW + */ +std::vector get_mem_format_aware_shape(const at::Tensor& in); + +/* + * Input is always int8_t, output can be [int8_t, uint8_t]. + * input + offset = output + * int8_t + 128 = uint8_t + * int8_t + 0 = int8_t + */ +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out); + +template +Tensor convert_conv_weights_to_channel_last_tensor( + const at::Tensor& src, + int groups, + bool transpose); + +/* + * Series of create wrapper functions to call xnn_create_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_convolution2d_nhwc( + uint32_t pad_top, + uint32_t pad_right, + uint32_t pad_bottom, + uint32_t pad_left, + uint32_t kernel_h, + uint32_t kernel_w, + uint32_t stride_h, + uint32_t stride_w, + uint32_t dilation_h, + uint32_t dilation_w, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t ip_chan_stride, + size_t op_chan_stride, + int8_t izp, + float ip_scale, + int8_t kzp, + const float* k_scales, + const int8_t* kernel, + const int32_t* bias, + int8_t ozp, + float op_scale, + int8_t op_min, + int8_t op_max, + uint32_t flags, + xnn_operator_t* op, + bool per_channel, + bool transpose) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kzp, "XNNPACK Q[SC]8 conv kernels expects kernel zero point to be zero." 
+ "But got: ", kzp); + + if (transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_create_deconvolution2d_nhwc_qs8( + pad_top, /* uint32_t output_padding_top */ + pad_right, /* uint32_t output_padding_right */ + pad_bottom, /* uint32_t output_padding_bottom */ + pad_left, /* uint32_t output_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t stride_height */ + stride_w, /* uint32_t stride_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels */ + ip_chan_stride, /* size_t input_pixel_stride */ + op_chan_stride, /* size_t output_pixel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* deconvolution_op_out */ + + } + + if (!per_channel) { + return xnn_create_convolution2d_nhwc_qs8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* convolution_op_out */ + } else { /* per_channel */ + return xnn_create_convolution2d_nhwc_qc8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales, /* const float* kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t 
output_max */ + flags, /* uint32_t flags */ + op); /* xnn_operator_t* convolution_op_out */ + } +} + +/* + * Series of setup wrapper functions to call xnn_setup_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_convolution2d_nhwc( + xnn_operator_t op, + size_t batch, + size_t in_h, + size_t in_w, + const int8_t* inp, + int8_t* outp, + pthreadpool_t pt_pool, + bool per_channel = false, + bool transpose = false, + uint32_t adj_h = 0, + uint32_t adj_w = 0) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_setup_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + adj_h, /* uint32_t adjustment_height */ + adj_w, /* uint32_t adjustment_width */ + inp, /* const int8_t* input */ + outp, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ + } + + if (!per_channel) { + return xnn_setup_convolution2d_nhwc_qs8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + inp, /* const int8_t* input */ + outp, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ + } else { /* per_channel */ + return xnn_setup_convolution2d_nhwc_qc8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + inp, /* const int8_t* input */ + outp, /* int8_t* output */ + pt_pool); /* pthreadpool_t threadpool */ + } +} + + +/* + * Series of wrapper functions to call xnn_create* and xnn_setup* + * functions for linear + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_fully_connected_nc( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + int8_t kernel_zero_point, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* fully_connected_op_out) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kernel_zero_point, "XNNPACK QS8 linear kernel expects kernel zero point to be zero." 
+ "But got: ", kernel_zero_point); + return xnn_create_fully_connected_nc_qs8( + input_channels, /* size_t input_channels */ + output_channels, /* size_t output_channels */ + input_stride, /* size_t input_stride */ + output_stride, /* size_t output_stride */ + input_zero_point, /* int8_t input_zero_point */ + input_scale, /* float input_scale */ + kernel_scale, /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + output_zero_point, /* int8_t output_zero_point */ + output_scale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_fully_connected_nc( + xnn_operator_t fully_connected_op, + size_t batch_size, + const int8_t* input, + int8_t* output, + pthreadpool_t threadpool) { + return xnn_setup_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + batch_size, /* size_t batch_size */ + input, /* const int8_t* input */ + output, /* int8_t* output */ + threadpool); /* pthreadpool_t threadpool */ +} + +} // namespace xnnp_utils +} // namespace native +} // namespace at + +#endif // USE_XNNPACK diff --git a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp new file mode 100644 index 000000000000..fed5600c8369 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp @@ -0,0 +1,258 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() +#include +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { +constexpr uint8_t max_num_input_dim = 5; +struct AddParams { + c10::DeviceIndex device_id; + int input_a_size[max_num_input_dim]; + int input_b_size[max_num_input_dim]; + uint8_t input_dim; // we currently assume both inputs are given as the same size (i.e., no broadcasting) + at::MemoryFormat memory_format; + bool deterministic; + bool allow_tf32; +}; +struct CacheKey { + AddParams params; + uint8_t input_a_alignment; + uint8_t input_b_alignment; + uint8_t output_alignment; + bool kReluFused; +}; +void setAddParams( + AddParams* params, const at::Tensor& input_a, const at::Tensor& input_b, + bool deterministic, bool allow_tf32) { + memset(params, 0, sizeof(AddParams)); + params->device_id = at::cuda::current_device(); + params->input_dim = input_a.dim(); + params->memory_format = input_a.suggest_memory_format(); + for (int i = 0; i < params->input_dim; ++i) { + params->input_a_size[i] = input_a.sizes()[i]; + params->input_b_size[i] = input_b.sizes()[i]; + } + params->deterministic = deterministic; + params->allow_tf32 = allow_tf32; +} +// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp +// we currently set the maximum number of input dimensions to 5 +// this can be increased, if necessary +std::unordered_map, at::native::ParamsEqual> execution_plan_cache; + +// TODO: this is also in qadd.cpp and some other cpp files in quantized/cpu/. I think we should +// move everything into a utilities file in quantized/ directory later. 
+inline void check_inputs(const Tensor& qa, const Tensor& qb) { + TORCH_CHECK( + qa.qscheme() == kPerTensorAffine, + "Only per tensor quantization is supported in Add."); + TORCH_CHECK( + qa.qscheme() == qb.qscheme(), + "Both inputs to Add must have the same quantization scheme."); + TORCH_CHECK( + qa.scalar_type() == qb.scalar_type(), + "Add operands should have same data type."); +} + +// currently we only support int8 symmetric (zero_point = 0 for inputs and output) quantized add +// We implement relu ( (a_int8 + b_int8 * ( b_scale/a_scale) ) ) * ( a_scale / out_scale ) +// which requires 4 cudnn ops (2 multiplication, 1 addition, and 1 relu ops) +// Multiplication ops: rhs_mult_op, requant_op +// Addition op: add_op +// Relu op: relu_op +template <bool kReluFused> +Tensor add(Tensor qa, Tensor qb, double output_scale, int64_t output_zero_point) { + if (qa.numel() == 0) { + return Tensor{}; + } + // TODO: add shape checking when broadcasted add is supported. For now we assume the input tensors are the same shape + TORCH_CHECK(qa.sizes() == qb.sizes(), "Quantized cudnn add currently expects both input tensors to be the same shape"); + + check_inputs(qa, qb); + + // cudnn expects tensors to be at least 3D. So we will prepend dummy dimensions if the input tensors are not at least 3D + auto orig_sizes = qa.sizes().vec(); + if (qa.dim() < 3) { + std::vector<int64_t> new_sizes(3, 1); + // cudnn expects leading dimensions to be the dummy dimensions + new_sizes.back() = qa.sizes().back(); + if (qa.dim() == 2) { + new_sizes[1] = qa.size(0); + } + qa = qa.view(new_sizes); + qb = qb.view(new_sizes); + } else if (qa.dim() == 4) { + qa = qa.contiguous(c10::MemoryFormat::ChannelsLast); + qb = qb.contiguous(c10::MemoryFormat::ChannelsLast); + } + + auto memory_format = qa.dim() == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + at::Tensor add_output = at::empty(qa.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format); + at::Tensor quantized_output = at::_empty_affine_quantized(qa.sizes(), at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + output_scale, output_zero_point, memory_format); + // TODO: When cudnn enables support for broadcasting, we can remove this tensor + at::Tensor requantize_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format); + requantize_multiplier_tensor.fill_(qa.q_scale() / output_scale); + at::Tensor rhs_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), memory_format); + rhs_multiplier_tensor.fill_(qb.q_scale() / qa.q_scale()); + + cudnnHandle_t handle = at::native::getCudnnHandle(); + CacheKey key; + // memset is needed here because there is implicit packing added for CacheKey, and this can result in uninitialized padded values that are + // used for hashing (see how at::native::ParamsHash is defined). Without memset, we can potentially come across a situation where two + // CacheKey objects have the same user defined parameters, but + // different padded values, resulting in different hash outputs.
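Before the cache key below is populated, it is worth making the numerics comment at the top of this function concrete: with symmetric int8 quantization (all zero points 0), out_int8 = relu(a_int8 + b_int8 * (b_scale/a_scale)) * (a_scale/out_scale). A scalar reference of that formula can be handy for sanity-checking the four-op cudnn graph; the function below is only an illustration, and the rounding/saturation at the end is an assumption about typical int8 requantization rather than a statement of what cudnn does internally:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of the fused add(+relu) requantization described above.
    // a_q and b_q are symmetric int8 values; the scales are per-tensor q_scale()s.
    int8_t quantized_add_relu_ref(int8_t a_q, int8_t b_q,
                                  float a_scale, float b_scale,
                                  float out_scale, bool relu_fused) {
      // rhs_mult_op: bring b into a's scale
      float acc = static_cast<float>(a_q) + static_cast<float>(b_q) * (b_scale / a_scale);
      // relu_op (only when fused)
      if (relu_fused) {
        acc = std::max(acc, 0.0f);
      }
      // requant_op: rescale from a's scale to the output scale
      float out = acc * (a_scale / out_scale);
      // saturate to the int8 range
      long r = std::lround(out);
      r = std::min<long>(127, std::max<long>(-128, r));
      return static_cast<int8_t>(r);
    }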
+ memset(&key, 0, sizeof(key)); + bool deterministic{true}; + bool allow_tf32{false}; + setAddParams(&key.params, qa, qb, deterministic, allow_tf32); + key.kReluFused = kReluFused; + key.input_a_alignment = cudnn_utils::getAlignment(qa); + key.input_b_alignment = cudnn_utils::getAlignment(qb); + key.output_alignment = cudnn_utils::getAlignment(add_output); + + auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor plan_desc) { + auto workspace_size = 0; + auto workspace = at::empty({workspace_size}, qa.options().dtype(at::kByte)); + std::vector data_ptrs; + std::vector uids; + data_ptrs.reserve(8); + uids.reserve(8); + data_ptrs = {qb.data_ptr(), rhs_multiplier_tensor.data_ptr(), add_output.data_ptr(), + qa.data_ptr(), add_output.data_ptr(), requantize_multiplier_tensor.data_ptr(), + quantized_output.data_ptr()}; + uids = {'b', 'm', 'c', 'a', 'p', 'r', 'q'}; + if (kReluFused) { + data_ptrs.emplace_back(add_output.data_ptr()), + uids.emplace_back('f'); + } + + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace.data_ptr()) + .setDataPointers(uids.size(), data_ptrs.data()) + .setUids(uids.size(), uids.data()) + .build(); + auto variant_pack_desc = variantPack.get_raw_desc(); + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan_desc->get_backend_descriptor(), variant_pack_desc)); + }; + + auto search = execution_plan_cache.find(key); + if (search != execution_plan_cache.end()) { + cudnn_frontend::ManagedOpaqueDescriptor plan_desc = search->second; + run(plan_desc); + return quantized_output.view(orig_sizes); + } + + // computes qb_int8 * ( qb_scale/qa_scale ) + auto rhs_mult_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(cudnn_utils::getTensorDescriptor(qb.sizes(), qb.strides(), CUDNN_DATA_INT8, 'b', key.input_b_alignment)) + .setbDesc(cudnn_utils::getTensorDescriptor(rhs_multiplier_tensor, 'm', cudnn_utils::getAlignment(rhs_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(add_output, 'c', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(add_output))) + .build(); + + // add_op computes (qa_int8 + qb_int8 * ( qb_scale/qa_scale ) ) + // add_output is a fp32 tensor for accumulation purposes + auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(rhs_mult_op.getOutputTensor()) + .setbDesc(cudnn_utils::getTensorDescriptor(qa.sizes(), qa.strides(), CUDNN_DATA_INT8, 'a', key.input_a_alignment)) + .setyDesc(cudnn_utils::getTensorDescriptor(add_output, 'p', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseAddDescriptor(at::native::getCudnnDataType(add_output))) + .build(); + + // relu_op computes + // relu( (qa_int8 + qb_int8 * ( qb_scale/qa_scale ) ) ) + // output is a fp32 tensor + c10::optional relu_op; + if (kReluFused) { + // we use inplace operation here where the output is assigned to the input + relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_op.getOutputTensor()) + .setyDesc(cudnn_utils::getTensorDescriptor(add_output, 'f', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseReluDescriptor(at::native::getCudnnDataType(add_output))) + .build()); + } + + // requant_op computes + // (a_int8 + b_int8 * ( b_scale/a_scale) ) * a_scale / out_scale + auto requant_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(kReluFused ? 
relu_op.value().getOutputTensor() : add_op.getOutputTensor()) + .setbDesc(cudnn_utils::getTensorDescriptor(requantize_multiplier_tensor, 'r', cudnn_utils::getAlignment(requantize_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(), CUDNN_DATA_INT8, 'q', cudnn_utils::getAlignment(quantized_output))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(requantize_multiplier_tensor))) + .build(); + + std::vector ops{&rhs_mult_op, &add_op}; + if (kReluFused) { + ops.emplace_back(&(relu_op.value())); + } + ops.emplace_back(&requant_op); + + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); + // std::cout << "opGraph: " << opGraph.describe() << std::endl; + + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + .build(); + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .build(); + + auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + auto& fallback_list = fallback.getFallbackList(); + + cudnn_frontend::EngineConfigList filtered_configs; + cudnn_utils::filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, at::kChar); + cudnn_utils::filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, at::kChar); + for (auto &cfg : engine_configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(cfg) + .build(); + auto plan_desc = plan.get_desc(); + run(plan_desc); + execution_plan_cache[key] = plan_desc; + return quantized_output.view(orig_sizes); + } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(c10::CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} + } + + TORCH_CHECK(false, "Unable to find an engine to execute this computation in Quantized Add Cudnn"); +} + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::add"), TORCH_FN(add)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu"), TORCH_FN(add)); +} + +} // namespace +} // namespace native +} // namespace at + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index bae4b9e2cb9d..6fd5be129c84 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -4,50 +4,29 @@ #if AT_CUDNN_ENABLED() #include +#include #if HAS_CUDNN_V8() -#include #include -#include #include +#include #include #include +#include +#include #include -#include #include +#include #include -#include #include +#include +#include -namespace at { namespace native{ - -namespace { - -uint8_t getAlignment(const Tensor &t) { - // alignment are in bytes - uint8_t alignment = 1; - uintptr_t address = reinterpret_cast(t.data_ptr()); - while (address % alignment == 0 && alignment < 16) alignment *= 2; - return alignment; -} - -cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, int64_t id, uint8_t alignment) { - auto shape = t.sizes(); - auto strides = t.strides(); - return cudnn_frontend::TensorBuilder() - .setDim(shape.size(), shape.data()) - .setStrides(strides.size(), strides.data()) - 
.setId(id) - .setAlignment(alignment) - .setDataType(getCudnnDataType(t)) - .build(); -} - -// TODO: there is a table from input dtype and weight dtype to operator dtype, +// TODO: there is a table from input dtype and weight dtype to operator qdtype, // we can derive the operator dtype based on input dtype -cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation) { +cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, c10::IntArrayRef padding, c10::IntArrayRef stride, c10::IntArrayRef dilation) { uint64_t convDim = stride.size(); return cudnn_frontend::ConvDescBuilder() .setDataType(dataType) @@ -60,79 +39,19 @@ cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArray .build(); } -// TODO: there is a table from input dtype to operator dtype, we can derive -// the operator dtype based on input dtype -cudnn_frontend::PointWiseDesc_v8 getPointWiseMulDescriptor(cudnnDataType_t dataType) { - return cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_MUL) - .setMathPrecision(dataType) - .build(); -} - -void filterEngineConfigs( - cudnn_frontend::EngineConfigList &from, - cudnn_frontend::EngineConfigList &to, - bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) -{ - auto filter = [=](cudnnBackendDescriptor_t c) { - if (deterministic) { - if (cudnn_frontend::hasNumericalNote(c)) return true; - } - if (scalar_type == kFloat || scalar_type == kChar || !allow_tf32) { - if (cudnn_frontend::hasNumericalNote(c)) return true; - if (cudnn_frontend::hasNumericalNote(c)) return true; - } - return false; - }; - cudnn_frontend::filter(from, to, filter); -} - -cudnn_frontend::ExecutionPlan -get_execplan_from_heuristics_else_fall_back(cudnn_frontend::OperationGraph&& opGraph, cudnnHandle_t handle_) { - auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() - .setOperationGraph(opGraph) - .setHeurMode(CUDNN_HEUR_MODE_INSTANT) - .build(); - - // std::cout << "Heuristic has " << heuristics.getEngineConfigCount() << " configurations " << std::endl; - auto& engine_config = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); - - // Try engine configs returned by the heuristics and pick up the first one that works. - for (auto& ecfg : engine_config) { - try { - auto plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle_) - .setEngineConfig(ecfg, opGraph.getTag()) - .build(); - return plan; - } catch (cudnn_frontend::cudnnException& e) { - continue; - } - } - - { - auto total_engines = opGraph.getEngineCount(); - // std::cout << opGraph.describe() << " has " << total_engines << " engines." 
<< std::endl; - auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build(); - // std::cout << engine.describe() << std::endl; - - auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build(); - // std::cout << engine_config.describe() << std::endl; - - return cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build(); - } -} - +// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp +namespace { struct CacheKey { - ConvolutionParams params; + at::native::ConvolutionParams params; uint8_t input_alignment; uint8_t weight_alignment; uint8_t output_alignment; + // default to -1 when no bias + int8_t bias_alignment; + bool kReluFused; }; - -// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp -std::unordered_map, ParamsEqual> execution_plan_cache; - +std::unordered_map, at::native::ParamsEqual> execution_plan_cache; +} // TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching // multiple operators // reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293 @@ -144,9 +63,9 @@ at::SmallVector MakeConvOutputShape( int M, // output channels const std::array& input_image_shape, const std::vector& kernel, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation); + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation); template <> at::SmallVector MakeConvOutputShape<2>( @@ -154,9 +73,9 @@ at::SmallVector MakeConvOutputShape<2>( int M, // output channels const std::array& input_image_shape, const std::vector& kernel, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation) { + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation) { const int H = input_image_shape[0]; const int W = input_image_shape[1]; const int64_t Y_H = @@ -166,48 +85,93 @@ at::SmallVector MakeConvOutputShape<2>( return {N, M, Y_H, Y_W}; } -void raw_cudnn_convolution_forward_out( - const Tensor& output, - const Tensor& input, - const Tensor& weight, - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups, - bool benchmark, - bool deterministic, - bool allow_tf32, - float requantize_multiplier -) { - TORCH_CHECK(!benchmark, "not supported yet"); - if (output.numel() == 0) { + +// the parameter quantized_output is a quantized tensor +template +template +void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& quantized_output, const at::Tensor& input, double output_scale) { + if (quantized_output.numel() == 0) { return; } - - Tensor conv_output = at::empty_like(output, output.options().dtype(at::kFloat)); - Tensor requantize_multiplier_tensor = at::empty_like(output, output.options().dtype(at::kFloat)); + at::Tensor conv_output = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); + // TODO: combine empty & fill_ using full_like or full + at::Tensor requantize_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); + auto act_scale = input.q_scale(); + auto weight_scale = maybe_padded_weight_.q_scale(); + auto requantize_multiplier = act_scale * weight_scale / output_scale; requantize_multiplier_tensor.fill_(requantize_multiplier); - cudnnHandle_t handle = getCudnnHandle(); + c10::optional bias_multiplier_tensor; + c10::optional broadcasted_bias; + if (bias_.has_value()) { + 
// the input bias is a 1-D tensor whose size is the same as the size of the second dimension of quantized_output. + // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. + // the number of trailling dimensions is quantized_output.dim() - 2, so the new size of the broadcast_bias + // becomes quantized_output.dim() - 2 + 1. nothing needs to be done for the leading dimensions + std::vector new_size(quantized_output.dim() - 1, 1); + new_size[0] = bias_.value().size(0); + broadcasted_bias = bias_.value().reshape(new_size); + broadcasted_bias.value() = broadcasted_bias.value().broadcast_to(quantized_output.sizes()); + broadcasted_bias.value() = broadcasted_bias.value().to(c10::MemoryFormat::ChannelsLast); + bias_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); + auto bias_multiplier = 1.0 / (act_scale * weight_scale); + bias_multiplier_tensor.value().fill_(bias_multiplier); + } + cudnnHandle_t handle = at::native::getCudnnHandle(); CacheKey key; - setConvolutionParams(&key.params, input, weight, padding, stride, dilation, groups, deterministic, allow_tf32); + // memset is needed here because there is implicit packing added for CacheKey, and this can result in uninitialized padded values that are + // used for hashing (see how at::native::ParamsHash is defined). without memset, we can potentially come across a situation where two + // CacheKey objects have the same user defined parameters, but + // different padded values, resulting in different hash outputs. + memset(&key, 0, sizeof(key)); + bool deterministic{true}; + bool allow_tf32{false}; + auto padding_vec = padding_.vec(); + auto stride_vec = stride_.vec(); + auto dilation_vec = dilation_.vec(); + setConvolutionParams(&key.params, input, maybe_padded_weight_, padding_vec, stride_vec, dilation_vec, groups_, deterministic, allow_tf32); + // operator datatype needs to be int32 for int8 convolution, but we can // set the datatype for output tensor to int32 or fp32 key.params.dataType = CUDNN_DATA_INT32; - key.input_alignment = getAlignment(input); - key.output_alignment = getAlignment(conv_output); - key.weight_alignment = getAlignment(weight); + key.input_alignment = cudnn_utils::getAlignment(input); + key.output_alignment = cudnn_utils::getAlignment(conv_output); + key.weight_alignment = cudnn_utils::getAlignment(maybe_padded_weight_); + if (bias_.has_value()) { + key.bias_alignment = cudnn_utils::getAlignment(broadcasted_bias.value()); + } else { + key.bias_alignment = -1; + } + key.kReluFused = kReluFused; auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor plan_desc) { auto workspace_size = 0; - auto workspace = at::empty({workspace_size}, input.options().dtype(kByte)); - void *data_ptrs[] = {reinterpret_cast(input.data_ptr()), conv_output.data_ptr(), reinterpret_cast(weight.data_ptr()), requantize_multiplier_tensor.data_ptr(), output.data_ptr()}; - // std::cout << plan.describe() << " requires workspace " << workspace_size << std::endl; - int64_t uids[] = {'x', 'y', 'w', 's', 'r'}; + auto workspace = at::empty({workspace_size}, input.options().dtype(at::kByte)); + std::vector data_ptrs; + std::vector uids; + data_ptrs.reserve(10); + uids.reserve(10); + data_ptrs = {input.data_ptr(), conv_output.data_ptr(), maybe_padded_weight_.data_ptr(), + requantize_multiplier_tensor.data_ptr(), quantized_output.data_ptr()}; + uids = {'x', 'y', 'w', 's', 'r'}; + if (bias_.has_value()) { + 
data_ptrs.insert(data_ptrs.end(), {broadcasted_bias.value().data_ptr(), bias_multiplier_tensor.value().data_ptr(), + broadcasted_bias.value().data_ptr(), conv_output.data_ptr()}); + uids.insert(uids.end(), {'b', 'c', 'd', 'e'}); + if (kReluFused) { + data_ptrs.emplace_back(conv_output.data_ptr()), + uids.emplace_back('f'); + } + } else { + if (kReluFused) { + data_ptrs.emplace_back(conv_output.data_ptr()); + uids.emplace_back('f'); + } + } auto variantPack = cudnn_frontend::VariantPackBuilder() .setWorkspacePointer(workspace.data_ptr()) - .setDataPointers(5, data_ptrs) - .setUids(5, uids) + .setDataPointers(uids.size(), data_ptrs.data()) + .setUids(uids.size(), uids.data()) .build(); auto variant_pack_desc = variantPack.get_raw_desc(); AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan_desc->get_backend_descriptor(), variant_pack_desc)); @@ -219,25 +183,81 @@ void raw_cudnn_convolution_forward_out( run(plan_desc); return; } - + // conv_op computes act_fp32 * w_fp32 (matrix multiplication) + // where act_fp32 and w_fp32 are the input and weight variables, resp. + // output is a fp32 tensor auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .setxDesc(getTensorDescriptor(input, 'x', key.input_alignment)) - .setyDesc(getTensorDescriptor(conv_output, 'y', key.output_alignment)) - .setwDesc(getTensorDescriptor(weight, 'w', key.weight_alignment)) - .setcDesc(getConvDescriptor(key.params.dataType, padding, stride, dilation)) + .setxDesc(cudnn_utils::getTensorDescriptor(input.sizes(), input.strides(), CUDNN_DATA_INT8, 'x', key.input_alignment)) + .setyDesc(cudnn_utils::getTensorDescriptor(conv_output, 'y', key.output_alignment)) + .setwDesc(cudnn_utils::getTensorDescriptor(maybe_padded_weight_.sizes(), maybe_padded_weight_.strides(), CUDNN_DATA_INT8, 'w', key.weight_alignment)) + .setcDesc(getConvDescriptor(key.params.dataType, padding_vec, stride_vec, dilation_vec)) .build(); // std::cout << "operator:" << conv_op.describe() << std::endl; - // TODO: add support for bias + c10::optional bias_mult_op; + c10::optional sum_conv_bias_op; + if (bias_.has_value()) { + // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops + // but here, we chose to do it statically. c10::optional::emplace() enables this approach + + // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) + // where bias_multiplier = (1 / (act_scale * w_scale)) + // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + bias_mult_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'b', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setbDesc(cudnn_utils::getTensorDescriptor(bias_multiplier_tensor.value(), 'c', cudnn_utils::getAlignment(bias_multiplier_tensor.value()))) + .setyDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'd', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(bias_multiplier_tensor.value()))) + .build()); + + // computes (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) + // where the 1st and 2nd summands is conv_output and broadcasted_bias, resp. 
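The division by (act_scale * w_scale) described just above is what lets a fp32 bias ride along with the int8 accumulator: once the requantize step later multiplies by act_scale * w_scale / out_scale, the bias comes out in output units. A small stand-alone check of that identity, with made-up example values and illustrative names:

    #include <cassert>
    #include <cmath>

    // Checks: (acc + bias_fp32 / (act_scale * w_scale)) * (act_scale * w_scale / out_scale)
    //      == (acc * act_scale * w_scale + bias_fp32) / out_scale
    int main() {
      const double acc = 1234.0;       // int32 accumulator of act_int8 * w_int8 products
      const double bias_fp32 = 0.75;   // original fp32 bias entry
      const double act_scale = 0.02, w_scale = 0.005, out_scale = 0.1;

      const double bias_multiplier = 1.0 / (act_scale * w_scale);        // applied by bias_mult_op
      const double requant_multiplier = act_scale * w_scale / out_scale; // applied by requant_op

      const double fused = (acc + bias_fp32 * bias_multiplier) * requant_multiplier;
      const double naive = (acc * act_scale * w_scale + bias_fp32) / out_scale;

      assert(std::abs(fused - naive) < 1e-9);
      return 0;
    }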
+ // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + sum_conv_bias_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(conv_op.getOutputTensor()) + .setbDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'd', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setyDesc(cudnn_utils::getTensorDescriptor(conv_output, 'e', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseAddDescriptor(at::native::getCudnnDataType(broadcasted_bias.value()))) + .build()); + } + + // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] + // or relu(act_int8 * w_int8) if bias is not present. + // output is a fp32 tensor + c10::optional relu_op; + std::shared_ptr tensor2requant_ptr = bias_.has_value() ? sum_conv_bias_op.value().getOutputTensor() : conv_op.getOutputTensor(); + if (kReluFused) { + // we use inplace operation here where the output is assigned to the input + relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(tensor2requant_ptr) + .setyDesc(cudnn_utils::getTensorDescriptor(conv_output, 'f', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseReluDescriptor(at::native::getCudnnDataType(conv_output))) + .build()); + } + + // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) / (out_scale / (act_scale * w_scale)) + // or relu(act_int8 * w_int8) / (out_scale / (act_scale * w_scale))) if bias is not present. + // output is a fp32 tensor auto requant_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(conv_op.getOutputTensor()) - .setbDesc(getTensorDescriptor(requantize_multiplier_tensor, 's', getAlignment(requantize_multiplier_tensor))) - .setyDesc(getTensorDescriptor(output, 'r', getAlignment(output))) - .setpwDesc(getPointWiseMulDescriptor(getCudnnDataType(requantize_multiplier_tensor))) + .setxDesc(kReluFused ? 
relu_op.value().getOutputTensor() : tensor2requant_ptr) + .setbDesc(cudnn_utils::getTensorDescriptor(requantize_multiplier_tensor, 's', cudnn_utils::getAlignment(requantize_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(), CUDNN_DATA_INT8, 'r', cudnn_utils::getAlignment(quantized_output))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(requantize_multiplier_tensor))) .build(); // std::cout << "operator:" << requant_op.describe() << std::endl; - std::array ops = {&conv_op, &requant_op}; + std::vector ops{&conv_op}; + if (bias_.has_value()) { + ops.emplace_back(&(bias_mult_op.value())); + ops.emplace_back(&(sum_conv_bias_op.value())); + } + if (kReluFused) { + ops.emplace_back(&(relu_op.value())); + } + ops.emplace_back(&requant_op); auto opGraph = cudnn_frontend::OperationGraphBuilder() .setHandle(handle) @@ -258,8 +278,8 @@ void raw_cudnn_convolution_forward_out( auto& fallback_list = fallback.getFallbackList(); cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, input.scalar_type()); - filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, input.scalar_type()); + cudnn_utils::filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, at::kChar); + cudnn_utils::filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, at::kChar); for (auto &cfg : engine_configs) { try { @@ -271,101 +291,159 @@ void raw_cudnn_convolution_forward_out( run(plan_desc); execution_plan_cache[key] = plan_desc; return; - } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} + } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(c10::CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} } - TORCH_CHECK(false, "Unable to find an engine to execute this computation"); + + TORCH_CHECK(false, "Unable to find an engine to execute this computation in Quantized Conv2D Cudnn"); } // // output Tensor will be a clampped int8 Tensor // both act and weight will be int8 Tensor -// +/* +Numerics: +out_fp32 = conv_fp32(act_fp32, w_fp32, …) + = act_fp32 * w_fp32 + bias_fp32 +act_int8 = act_fp32 / act_scale + act_zero_point +w_int8 = w_fp32 / w_scale + w_zero_point +out_int8 = out_fp32 / out_scale + out_zero_point +out_int8 = (act_fp32 * w_fp32 + [bias_fp32]) / out_scale + out_zero_point + = (act_int8 - act_zero_point) * act_scale * (w_int8 - w_zero_point) * w_scale / out_scale + out_zero_point + [bias_fp32 / out_scale] + = (act_int8 * w_int8 - act_int8 * w_zero_point - act_zero_point * w_int8 + act_zero_point * w_zero_point) * act_scale * w_scale / out_scale + out_zero_point + [bias_fp32 / out_scale] + = (if both act and weight are symmetrically quantized, int8, then act_zero_point = w_zero_point = 0) + = (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) * act_scale * w_scale / out_scale + = (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) / (out_scale / (act_scale * w_scale)) + = requantize((act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]), out_scale / (act_scale * w_scale)) +*/ template -Tensor raw_cudnn_convolution_forward( - const Tensor& act, - const Tensor& weight, - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups, - bool benchmark, 
- bool deterministic, - bool allow_tf32, - float requantize_multiplier) { - // TODO: add dimension validations for input/weight/bias - const int N = act.size(0); - const int C = act.size(1); - const int D = kSpatialDim == 3 ? act.size(2) : 1; - const int H = act.size(kSpatialDim); - const int W = act.size(kSpatialDim + 1); - const int M = weight.size(0); // output channels - std::vector kernel_size = {weight.size(2), weight.size(3)}; - at::SmallVector output_shape; - output_shape = MakeConvOutputShape(N, M, {H, W}, kernel_size, stride, padding, dilation); - Tensor output_int8 = at::empty( +template +at::Tensor PackedConvWeightCudnn::apply_impl( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point) { + const auto batch_size = kSpatialDim == 2 ? act.size(0) : 1; + const auto num_input_channels = act.size(kSpatialDim - 1); + const auto H = act.size(kSpatialDim); + const auto W = act.size(kSpatialDim + 1); + const auto num_output_channels = maybe_padded_weight_.size(0); // output channels + std::vector kernel_size = {maybe_padded_weight_.size(2), maybe_padded_weight_.size(3)}; + at::SmallVector output_shape = MakeConvOutputShape(batch_size, num_output_channels, {H, W}, + kernel_size, stride_, padding_, dilation_); + at::Tensor quantized_output = at::_empty_affine_quantized( output_shape, - at::device(at::kCUDA).dtype(at::kChar), - at::MemoryFormat::ChannelsLast - ); - - raw_cudnn_convolution_forward_out( - output_int8, act, weight, - padding, stride, dilation, groups, - benchmark, - deterministic, - allow_tf32, - requantize_multiplier); - return output_int8; + at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + output_scale, + output_zero_point, + at::MemoryFormat::ChannelsLast); + + // cudnn v8.4.0 expects conv2d's int8 activation tensor's input channels to be a multiple of 4. if it is not + // we need to explicitly pad it to a multiple of 4 ourselves as cudnn does not currently support padding. 
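For the multiple-of-4 channel requirement just described, the pad amount and the argument layout handed to at::pad come out as in the sketch below; pad pairs are specified starting from the last dimension, so for an NCHW activation the sixth entry grows the channel dimension. This shows only the arithmetic in isolation (channel_pad_spec_nchw is an illustrative helper, not a function in the kernel):

    #include <cstdint>
    #include <vector>

    // Pad spec for an NCHW int8 activation whose channel count is not a
    // multiple of 4: one (low, high) pair per dimension, last dimension first,
    // i.e. {W_lo, W_hi, H_lo, H_hi, C_lo, C_hi, N_lo, N_hi}.
    std::vector<int64_t> channel_pad_spec_nchw(int64_t num_input_channels) {
      const int64_t remainder = num_input_channels % 4;
      const int64_t num_slices = remainder == 0 ? 0 : 4 - remainder;
      return {0, 0, 0, 0, 0, num_slices, 0, 0};  // pad only the high side of C
    }

With num_input_channels = 3 this yields {0, 0, 0, 0, 0, 1, 0, 0}, matching the {0, 0, 0, 0, 0, num_slices, 0, 0} literal used in the kernel above.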
+ // TODO: when and if cudnn enables padding in their operators, we can remove padding on our end; + // currently, limit padding support to groups=1 (ungrouped conv) + // TODO: implement this for groups > 1; should be straightforward since we're only padding a single dimension + auto act_maybe_padded = act; + if (num_input_channels % 4 != 0) { + int8_t num_slices = 4 - num_input_channels % 4; // number of slices we need to pad + act_maybe_padded = at::pad(act, {0, 0, 0, 0, 0, num_slices, 0, 0}, "constant", 0); + } + apply_impl_helper( + quantized_output, act_maybe_padded.to(c10::MemoryFormat::ChannelsLast), output_scale); + + // need to return sliced tensor if output_channels was padded + if (num_unpadded_output_channels_ != maybe_padded_weight_.size(0)) { + return quantized_output.slice(1, 0, num_unpadded_output_channels_); + } + return quantized_output; } +template +at::Tensor PackedConvWeightCudnn::apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} -template -class QConvInt8 final { +template +at::Tensor PackedConvWeightCudnn::apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +template at::Tensor PackedConvWeightCudnn<2>::apply( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +template at::Tensor PackedConvWeightCudnn<2>::apply_relu( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point); + +namespace at { +namespace native { +namespace { + +template +class QConv1dInt8 final { public: static Tensor run( Tensor act, - Tensor weight, - c10::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, + const c10::intrusive_ptr>& packed_weight, double output_scale, int64_t output_zero_point) { - TORCH_CHECK(!kReluFused, "conv relu not supported yet"); - TORCH_CHECK(!bias.has_value(), "bias is not supported yet"); - act = act.contiguous(c10::MemoryFormat::ChannelsLast); - weight = weight.contiguous(c10::MemoryFormat::ChannelsLast); - // requantization - // out_int8 = act_int8 * weight_int8 * act_scale * w_scale / output_scale - auto act_scale = act.q_scale(); - auto weight_scale = weight.q_scale(); - auto requantize_multiplier = act_scale * weight_scale / output_scale; + at::Tensor output; + // we currently use conv2d kernel for conv1d by making the input and weight tensors + // 4D rather than 3D. 
we add a dummy width dimension of size 1 + // N, C, L -> N, C, 1, L + act = act.unsqueeze(-2); + if (kReluFused) { + output = packed_weight->apply_relu(act, output_scale, output_zero_point); + } else { + output = packed_weight->apply(act, output_scale, output_zero_point); + } + // N, C, 1, L -> N, C, L + return output.squeeze_(-2); + } +}; +template +class QConvInt8 final { + public: + static at::Tensor run( + at::Tensor act, + const c10::intrusive_ptr>& packed_weight, + double output_scale, + int64_t output_zero_point) { + TORCH_CHECK(kSpatialDim == 1 || kSpatialDim == 2, "Error in quantized cudnn conv2d operator: " + "Expected kSpatialDim == 1 || kSpatialDim == 2; received kSpatialDim=", kSpatialDim); // TODO: check all zero_points are zero/all tensors are symmetrically quantized - Tensor output_int8_requant = raw_cudnn_convolution_forward( - act.int_repr(), weight.int_repr(), - IntArrayRef(padding.vec()), IntArrayRef(stride.vec()), IntArrayRef(dilation.vec()), groups, - false /* benchmark */, - true /* deterministic */, - false /* allow_tf32 */, - requantize_multiplier - ); - - // clamping is done in cudnn kernels, which probably defaults to -128, 127 - // for int8 dtype, we may need to add new operators to the graph if - // we want to change the clamping - Tensor quantized_output = at::_make_per_tensor_quantized_tensor(output_int8_requant, output_scale, output_zero_point); - return quantized_output; + if (kReluFused) { + return packed_weight->apply_relu(act, output_scale, output_zero_point); + } else { + return packed_weight->apply(act, output_scale, output_zero_point); + } } }; TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { - m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_cudnn"), QConvInt8<2, false>::run); + // the cpu conv1d doesn't use the quantized::conv1d*.new variant for packed weights. instead it just uses + // quantized::conv1d for packed weights (see quantized/library.cpp). + // this is inconsistent with what has been done for conv2d where new variants use packed weights, and + // old variant does not. 
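QConv1dInt8 above reuses the 2D kernel by giving the activation a dummy spatial dimension of size 1. The round trip is just two reshapes; a minimal shape-only sketch with illustrative helper names, detached from the packed-weight plumbing:

    #include <ATen/ATen.h>

    // Shape-only illustration of the conv1d-as-conv2d trick:
    // (N, C, L) -> (N, C, 1, L) before the 2D kernel, then drop the dummy dim.
    at::Tensor to_conv2d_layout(const at::Tensor& act_ncl) {
      return act_ncl.unsqueeze(-2);   // N, C, L    -> N, C, 1, L
    }

    at::Tensor to_conv1d_layout(at::Tensor out_nc1l) {
      return out_nc1l.squeeze_(-2);   // N, C, 1, L -> N, C, L
    }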
we adopt this inconsistency for now to be consistent with QuantizedCPU's conv1d + // and will eventually deprecate the old variants + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_relu"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d.new"), QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu.new"), QConvInt8<2, true>::run); } } // namespace -}} // at::native +} // namespace native +} // namespace at + #endif // HAS_CUDNN_V8 #endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/quantized/cudnn/Linear.cpp b/aten/src/ATen/native/quantized/cudnn/Linear.cpp new file mode 100644 index 000000000000..9314d9ee9293 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/Linear.cpp @@ -0,0 +1,374 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// TODO: there is a table from input dtype and weight dtype to operator dtype, +// we can derive the operator dtype based on input dtype +cudnn_frontend::MatMulDesc_v8 getLinearDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::MatMulDescBuilder() + .setMathPrecision(dataType) + .build(); +} + +// FIXME: make this thread-safe by reusing the benchmark cache in Conv_v7.cpp +namespace { +// we currently set the maximum number of input dimensions to 5 +// this can be increased, if necessary +constexpr uint8_t max_num_input_dim = 5; +struct LinearParams { + c10::DeviceIndex device_id; + cudnnDataType_t dataType; + int input_size[max_num_input_dim]; + uint8_t input_dim; + at::MemoryFormat memory_format; + int64_t weight_size[2]; + bool deterministic; + bool allow_tf32; +}; +struct CacheKey { + LinearParams params; + uint8_t input_alignment; + uint8_t weight_alignment; + uint8_t output_alignment; + // default to -1 when no bias + int8_t bias_alignment; + bool kReluFused; +}; +void setLinearParams( + LinearParams* params, const at::Tensor& input, const at::Tensor& weight, + bool deterministic, bool allow_tf32) { + // operator datatype needs to be int32 for int8 matmul, but we can + // set the datatype for output tensor to int32 or fp32 + memset(params, 0, sizeof(LinearParams)); + params->device_id = at::cuda::current_device(); + params->dataType = CUDNN_DATA_INT32; + params->input_dim = input.dim(); + params->memory_format = input.suggest_memory_format(); + for (int i = 0; i < params->input_dim; ++i) { + params->input_size[i] = input.sizes()[i]; + } + for (int i = 0; i < 2; ++i) { + params->weight_size[i] = weight.sizes()[i]; + } + params->deterministic = deterministic; + params->allow_tf32 = allow_tf32; +} +std::unordered_map, at::native::ParamsEqual> execution_plan_cache; +} +// TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching +// multiple operators +// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293 +//static cudnn_frontend::ExecutionPlanCache plan_cache("sample_cache"); + +// currently we only support int8 symmetric (zero_point = 0 for inputs and output) quantized linear op +// We implement relu(act_int8 * transpose(w_int8) + [bias_fp32/(act_scale * w_scale] ) * ( act_scale * w_scale / out_scale ) +// which requires 5 cudnn ops (1 matmul, 2 multiplication, 1 add, and 1 relu ops) +// matmul op: linear_op +// 
Multiplication ops: rhs_mult_op, requant_op +// Addition op: add_op +// Relu op: relu_op +template +void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_output, const at::Tensor& input, double output_scale) { + if (quantized_output.numel() == 0) { + return; + } + at::Tensor linear_output = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat)); + auto act_scale = input.q_scale(); + auto weight_scale = orig_weight.q_scale(); + auto requantize_multiplier = act_scale * weight_scale / output_scale; + at::Tensor requantize_multiplier_tensor = at::full(quantized_output.sizes(), requantize_multiplier, at::device(at::kCUDA).dtype(at::kFloat)); + requantize_multiplier_tensor.fill_(requantize_multiplier); + c10::optional bias_multiplier_tensor; + c10::optional broadcasted_bias; + if (bias_.has_value()) { + // the input bias is a 1-D tensor whose size is the same as the size of the last dimension of quantized_output + // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. + // the number of trailling dimensions is quantized_output.dim() - 2. We also prepend a leading dimension for clarity + std::vector new_size(quantized_output.dim(), 1); + new_size.back() = bias_.value().size(0); + broadcasted_bias = bias_.value().clone().reshape(new_size); + broadcasted_bias.value() = broadcasted_bias.value().broadcast_to(quantized_output.sizes()).contiguous(); + bias_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat)); + auto bias_multiplier = 1.0 / (act_scale * weight_scale); + bias_multiplier_tensor.value().fill_(bias_multiplier); + } + + cudnnHandle_t handle = at::native::getCudnnHandle(); + CacheKey key; + // memset is needed here because there is implicit packing added for CacheKey, and this can result in uninitialized padded values that are + // used for hashing (see how at::native::ParamsHash is defined). without memset, we can potentially come across a situation where two + // CacheKey objects have the same user defined parameters, but + // different padded values, resulting in different hash outputs. + memset(&key, 0, sizeof(key)); + bool deterministic{true}; + bool allow_tf32{false}; + setLinearParams(&key.params, input, orig_weight, deterministic, allow_tf32); + + key.input_alignment = cudnn_utils::getAlignment(input); + key.output_alignment = cudnn_utils::getAlignment(linear_output); + key.weight_alignment = cudnn_utils::getAlignment(orig_weight); + if (bias_.has_value()) { + key.bias_alignment = cudnn_utils::getAlignment(broadcasted_bias.value()); + } else { + key.bias_alignment = -1; + } + key.kReluFused = kReluFused; + // the matmul operation is input * transpose(weight), so we will work with the transposed weight + auto weight_transposed = transpose(orig_weight, 0, 1); + // cudnn expects tensors to be at least 3D. weight_transposed is currently 2D. 
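The bias path above turns a 1-D [out_features] bias into something broadcastable against the full output by first reshaping it to a shape of all ones except the last dimension. In isolation, that shape manipulation looks roughly like the sketch below (broadcast_bias_like_output is an illustrative name; the real code also scales the result by the bias multiplier and keeps everything on CUDA):

    #include <ATen/ATen.h>
    #include <vector>

    // Expand a 1-D bias of size [out_features] to the full output shape,
    // e.g. [1, batch, out_features], by inserting leading 1s before broadcasting.
    at::Tensor broadcast_bias_like_output(const at::Tensor& bias_1d,
                                          const at::Tensor& output) {
      std::vector<int64_t> new_size(output.dim(), 1);
      new_size.back() = bias_1d.size(0);  // bias stays on the last dimension
      return bias_1d.reshape(new_size)
                    .broadcast_to(output.sizes())
                    .contiguous();
    }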
we will create a 3D view + // by prepending a leading dummy dimension (cudnn expects leading dimensions to be the dummy dimensions) + std::vector new_sizes(3, 1); + new_sizes.back() = weight_transposed.size(1); + new_sizes[1] = weight_transposed.size(0); + weight_transposed = weight_transposed.view(new_sizes); + + auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor plan_desc) { + auto workspace_size = 0; + auto workspace = at::empty({workspace_size}, input.options().dtype(at::kByte)); + std::vector data_ptrs; + std::vector uids; + data_ptrs.reserve(9); + uids.reserve(9); + data_ptrs = {input.data_ptr(), weight_transposed.data_ptr(), + requantize_multiplier_tensor.data_ptr(), quantized_output.data_ptr()}; + uids = {'x', 'w', 's', 'r'}; + if (bias_.has_value()) { + data_ptrs.insert(data_ptrs.end(), {broadcasted_bias.value().data_ptr(), bias_multiplier_tensor.value().data_ptr(), + broadcasted_bias.value().data_ptr(), broadcasted_bias.value().data_ptr(), linear_output.data_ptr()}); + uids.insert(uids.end(), {'b', 'c', 'd', 'n', 'e'}); + } + auto variantPack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace.data_ptr()) + .setDataPointers(uids.size(), data_ptrs.data()) + .setUids(uids.size(), uids.data()) + .build(); + auto variant_pack_desc = variantPack.get_raw_desc(); + AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan_desc->get_backend_descriptor(), variant_pack_desc)); + }; + + auto search = execution_plan_cache.find(key); + if (search != execution_plan_cache.end()) { + cudnn_frontend::ManagedOpaqueDescriptor plan_desc = search->second; + run(plan_desc); + return; + } + + // linear_op computes act_int8 * tranpose(w_int8) (matrix multiplication) + // where act_int8 and w_int8 are the input and weight variables, resp. + // output is a fp32 tensor + auto linear_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .setaMatDesc(cudnn_utils::getTensorDescriptor(input.sizes(), input.strides(), CUDNN_DATA_INT8, 'x', key.input_alignment)) + .setbMatDesc(cudnn_utils::getTensorDescriptor(weight_transposed.sizes(), weight_transposed.strides(), CUDNN_DATA_INT8, 'w', key.weight_alignment)) + .setcMatDesc(cudnn_utils::getTensorDescriptor(linear_output, 'y', key.output_alignment, true)) + .setmatmulDesc(getLinearDescriptor(key.params.dataType)) + .build(); + // std::cout << "operator:" << linear_op.describe() << std::endl; + + c10::optional bias_mult_op; + c10::optional sum_linear_bias_op; + if (bias_.has_value()) { + // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops + // but here, we chose to do it statically. 
c10::optional::emplace() enables this approach + + // bias_mult_op computes bias_fp32 / (act_scale * w_scale) or bias_fp32 * (1 / (act_scale * w_scale)) + // where bias_multiplier = (1 / (act_scale * w_scale)) + // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + bias_mult_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'b', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setbDesc(cudnn_utils::getTensorDescriptor(bias_multiplier_tensor.value(), 'c', cudnn_utils::getAlignment(bias_multiplier_tensor.value()))) + // TODO: I think we should be able to make this a virtual tensor, but we would need cudnn to support + // setbdesc(ManagedOpaqueDescriptor const &raw_tensor) first + .setyDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'd', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(bias_multiplier_tensor.value()))) + .build()); + + // computes (act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) + // where the 1st and 2nd summands is linear_output and broadcasted_bias, resp. + // output is a fp32 tensor + // we use inplace operation here where the output is assigned to the input + sum_linear_bias_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(linear_op.getOutputTensor()) + // TODO: An additional entry for broadcasted_bias in the uid-data_ptr pairing + // appears to be needed in the current version of cudnn (8.4.0). Without it, some + // test cases are failing. NVIDIA is currently investigating this issue. + // When this issue is fixed, we can change 'n' back to 'd' and remove the additional entry in uid and data_ptrs in variant pack above + .setbDesc(cudnn_utils::getTensorDescriptor(broadcasted_bias.value(), 'n', cudnn_utils::getAlignment(broadcasted_bias.value()))) + .setyDesc(cudnn_utils::getTensorDescriptor(linear_output, 'e', key.output_alignment)) + .setpwDesc(cudnn_utils::getPointWiseAddDescriptor(at::native::getCudnnDataType(broadcasted_bias.value()))) + .build()); + } + + // relu_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)] + // or relu(act_int8 * w_int8) if bias is not present. + // output is a fp32 tensor + c10::optional relu_op; + std::shared_ptr tensor2requant_ptr = bias_.has_value() ? sum_linear_bias_op.value().getOutputTensor() : linear_op.getOutputTensor(); + if (kReluFused) { + // we use inplace operation here where the output is assigned to the input + relu_op.emplace(cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(tensor2requant_ptr) + .setyDesc(cudnn_utils::getTensorDescriptor(linear_output, 'f', key.output_alignment, true)) + .setpwDesc(cudnn_utils::getPointWiseReluDescriptor(at::native::getCudnnDataType(linear_output))) + .build()); + } + + // requant_op computes relu(act_int8 * w_int8 + [bias_fp32/(act_scale * w_scale)]) / (out_scale / (act_scale * w_scale)) + // or relu(act_int8 * w_int8) / (out_scale / (act_scale * w_scale))) if bias is not present. + // output is a fp32 tensor + auto requant_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(kReluFused ? 
relu_op.value().getOutputTensor() : tensor2requant_ptr) + .setbDesc(cudnn_utils::getTensorDescriptor(requantize_multiplier_tensor, 's', cudnn_utils::getAlignment(requantize_multiplier_tensor))) + .setyDesc(cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(), CUDNN_DATA_INT8, 'r', cudnn_utils::getAlignment(quantized_output))) + .setpwDesc(cudnn_utils::getPointWiseMulDescriptor(at::native::getCudnnDataType(requantize_multiplier_tensor))) + .build(); + // // std::cout << "operator:" << requant_op.describe() << std::endl; + + std::vector ops{&linear_op}; + if (bias_.has_value()) { + ops.emplace_back(&(bias_mult_op.value())); + ops.emplace_back(&(sum_linear_bias_op.value())); + } + if (kReluFused) { + ops.emplace_back(&(relu_op.value())); + } + ops.emplace_back(&requant_op); + + auto opGraph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); + // std::cout << "opGraph: " << opGraph.describe() << std::endl; + + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + .build(); + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(opGraph) + .setOperation(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR) + .build(); + + auto& engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + auto& fallback_list = fallback.getFallbackList(); + + cudnn_frontend::EngineConfigList filtered_configs; + cudnn_utils::filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, at::kChar); + cudnn_utils::filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, at::kChar); + + for (auto &cfg : engine_configs) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(cfg) + .build(); + auto plan_desc = plan.get_desc(); + run(plan_desc); + execution_plan_cache[key] = plan_desc; + return; + } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(c10::CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} + } + + TORCH_CHECK(false, "Unable to find an engine to execute this computation Quantized Linear Cudnn"); +} + +// output Tensor will be a clampped int8 Tensor +// both act and weight will be int8 Tensor +// Numerics are the same as conv (see aten/src/ATen/native/quantized/Conv.cpp): +template +at::Tensor PackedLinearWeightCudnn::apply_impl( + const at::Tensor& act, + double output_scale, + int64_t output_zero_point) { + std::vector original_output_shape{act.sizes().vec()}; // 2D + original_output_shape.back() = orig_weight.size(0); // output channels + // cudnn expects tensors to be at least 3D. we will prepend a dummy dimension for quantized_output + std::vector output_shape(3, 1); + output_shape[1] = original_output_shape[0]; + output_shape[2] = original_output_shape[1]; + at::Tensor quantized_output = at::_empty_affine_quantized( + output_shape, + at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + output_scale, + output_zero_point); + // cudnn expects tensors to be at least 3D. act is currently 2D. 
we will create a 3D view + std::vector new_sizes(3, 1); + // cudnn expects leading dimensions to be the dummy dimensions + new_sizes.back() = act.sizes().back(); + new_sizes[1] = act.size(0); + apply_impl_helper( + quantized_output, act.view(new_sizes), output_scale); + return quantized_output.view(original_output_shape); +} + +at::Tensor PackedLinearWeightCudnn::apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +at::Tensor PackedLinearWeightCudnn::apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) { + return apply_impl(input, output_scale, output_zero_point); +} + +namespace at { +namespace native { +namespace { + +template +class QLinearInt8 final { + public: + static at::Tensor run( + at::Tensor act, + const c10::intrusive_ptr& packed_weight, + double output_scale, + int64_t output_zero_point) { + // TODO: check all zero_points are zero/all tensors are symmetrically quantized + if (kReluFused) { + return packed_weight->apply_relu(act, output_scale, output_zero_point); + } else { + return packed_weight->apply(act, output_scale, output_zero_point); + } + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear"), QLinearInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu"), QLinearInt8::run); +} + +} // namespace +} // namespace native +} // namespace at + + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/Pooling.cpp b/aten/src/ATen/native/quantized/cudnn/Pooling.cpp new file mode 100644 index 000000000000..8335eeeca2ff --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/Pooling.cpp @@ -0,0 +1,248 @@ +#include +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() +#include +#include +#include +#include +#include +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { +// TODO: This function is the same as that of qpool.cpp. We should refactor this into quantized directory +// so that we don't need to duplicate the function +void check_maxpool2d_params( + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation) { + TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, + "Expected 1d or 2d kernel size, got ", kernel_size.size()); + TORCH_CHECK(stride.empty() || stride.size() == 2, + "Expected no strides or 2d strides, got", stride.size()); + TORCH_CHECK(padding.size() == 1 || padding.size() == 2, + "Expected 1d or 2d padding, got ", padding.size()); + TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, + "Expected 1d or 2d dilation, got ", dilation.size()); +} +} + +// The current implementation of quantized cuda adaptive average pooling uses the following: +// dequant -> fp32 adaptive average pooling -> quant. This is the same numerically as +// quantized adaptive average pooling. This is not the ideal implementation, as we desire to +// operate on the quantized values directly. +// However, we are currently blocked on this as we are waiting for cudnn's 8.5.0 release, which is anticipated +// to support adaptive average pooling. When that support is made available, we will use it directly. 
TODO +Tensor adaptive_avg_pool2d_quantized_cuda( + const at::Tensor& input, + IntArrayRef output_size) { +// TODO: re-enable these cudnn preprocessors like quantized_max_pool2d_cudnn below when we implement this function with cudnn +#ifdef USE_CUDA +// #if AT_CUDNN_ENABLED() +// #if HAS_CUDNN_V8() + // TODO: limit this to per tensor quantized tensors for now, though should be easy to adapt + // to per channel quantized tensors + TORCH_CHECK(input.qscheme() == at::kPerTensorAffine, "adaptive_avg_pool2d_quantized_cuda only supports per tensor quantized tensors"); + auto input_fp32 = at::dequantize(input); + auto result_fp32 = at::adaptive_avg_pool2d(input_fp32, output_size); + return at::quantize_per_tensor(result_fp32, input.q_scale(), input.q_zero_point(), input.scalar_type()); +#else // USE_CUDA + AT_ERROR("at::native::adaptive_avg_pool2d_quantized_cuda: ATen not compiled with USE_CUDA support"); + return Tensor{}; // never reached, placates the compiler +#endif +} + +// Currently we support 4D and 3D input (qx) tensors, the latter of which is supported for +// legacy reasons. The first dimension of a 4D input tensor is the batch size. +// For a 3D tensor, there is no batch size dimension -- it can be viewed as a single batch. +// cudnn's 2D pooling operation requires the input and output to be 4D tensors, so we must cast +// any 3D tensors to 4D prior to using cudnn +// This implementation currently uses the v7 cudnn APIs as v8 cudnn APIs are not yet available for +// pooling operations. +// Consult https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingForward for +// documentation on the APIs +// Currently, it appears there is no cudnn support for dilated pooling -- we will +// submit a feature request for this with cudnn +// TODO: ideally, we would like to use structured kernel support here so we do not have to repeat +// the input checks, however, that would require us to implement max_pool2d_with_indices_out_quantized_cuda +// based on how the dispatch table is currently constructed in native_functions.yaml. Currently, +// there is no support for producing indices with cudnn max pooling, so until that becomes available, this cannot be done.
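Before the kernel itself, it may help to spell out the output-size arithmetic that the shape checks below rely on. The following is a minimal sketch of what at::native::pooling_output_shape computes for a single spatial dimension; the helper name pooled_size_1d and the sample numbers are illustrative only and not part of this change.

    // Simplified per-dimension pooling output size (symmetric padding assumed).
    // With ceil_mode the division rounds up, but the last window must still
    // start inside the input or its left padding.
    int64_t pooled_size_1d(int64_t in, int64_t kernel, int64_t pad,
                           int64_t stride, int64_t dilation, bool ceil_mode) {
      int64_t numerator = in + 2 * pad - dilation * (kernel - 1) - 1 +
                          (ceil_mode ? stride - 1 : 0);
      int64_t out = numerator / stride + 1;
      if (ceil_mode && (out - 1) * stride >= in + pad) {
        --out;  // last window would start past the padded input; drop it
      }
      return out;
    }
    // e.g. in = 7, kernel = 2, pad = 0, stride = 2, dilation = 1 gives
    // out = 3 with ceil_mode = false and out = 4 with ceil_mode = true.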
+Tensor quantized_max_pool2d_cudnn( + const Tensor& qx, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { +#ifdef USE_CUDA +#if AT_CUDNN_ENABLED() +#if HAS_CUDNN_V8() + check_maxpool2d_params( + kernel_size, + stride, + padding, + dilation); + if (stride.empty()) { + stride = kernel_size; + } + auto ndim = qx.dim(); + TORCH_CHECK( + ndim == 3 || ndim == 4, "Expecting the input tensor of rank 3 or 4."); + TORCH_CHECK( + kernel_size.size() == 2, + "quantized_max_pool2d_cudnn(): Expected kernel_size to be 2-dimensional: got ", + kernel_size.size()); + TORCH_CHECK( + stride.size() == 2, + "quantized_max_pool2d_cudnn(): Expected stride to be 2-dimensional: got ", + stride.size()); + TORCH_CHECK( + dilation.size() == 2, + "quantized_max_pool2d_cudnn(): Expected dilation to be 2-dimensional: got ", + dilation.size()); + TORCH_CHECK( + dilation[0] == 1 && dilation[1] == 1, + "quantized_max_pool2d_cudnn(): Expected dilation=[1, 1] (cudnn does not currently support dilation[i] != 1), got", + dilation); + TORCH_CHECK( + padding.size() == 2, + "quantized_max_pool2d_cudnn(): Expected padding to be 2-dimensional: got ", + padding.size()); + + auto input = qx; + if (ndim == 4) { + input = qx.to(MemoryFormat::ChannelsLast); + } else { // 3D + std::vector new_sizes{1, qx.size(0), qx.size(1), qx.size(2)}; + input = qx.view(new_sizes); + } + int batch_size = input.size(0); + int64_t inC = input.size(1); + int64_t inH = input.size(2); + int64_t inW = input.size(3); + // Check output dimensions. + int64_t padH = padding[0]; + int64_t padW = padding[1]; + int64_t kH = kernel_size[0]; + int64_t kW = kernel_size[1]; + int64_t strideH = stride[0]; + int64_t strideW = stride[1]; + TORCH_CHECK( + kH > 0 && kW > 0, + "qnnpack_maxpool2d(): kernel_size should be greater than zero."); + TORCH_CHECK( + strideH > 0 && strideW > 0, + "qnnpack_maxpool2d(): strides should be greater than zero."); + int64_t dilationH = dilation[0]; + int64_t dilationW = dilation[1]; + int64_t outC = inC; + int64_t outH = pooling_output_shape(inH, kH, padH, strideH, dilationH, ceil_mode); + int64_t outW = pooling_output_shape(inW, kW, padW, strideW, dilationW, ceil_mode); + TORCH_CHECK(outH > 0 && outW > 0, + "Given input size: (", + inC, "x", inH, "x", inW, + "). Calculated output size: (", + outC, "x", outH, "x", outW, + "). Output size is too small."); + + std::vector output_shape; + if (ndim == 3) { + // cudnn requires 4D input and output for 2D pooling, so we prepend a dummy dimension + // whose size represents the batch size (1) + output_shape = {1, outC, outH, outW}; + } else { + output_shape = {batch_size, outC, outH, outW}; + } + auto qy = at::_empty_affine_quantized( + output_shape, + at::device(at::kCUDA).dtype(at::ScalarType::QInt8), + input.q_scale(), + input.q_zero_point(), + (ndim == 4 ? MemoryFormat::ChannelsLast : MemoryFormat::Contiguous)); + + cudnnHandle_t handle = getCudnnHandle(); + cudnnPoolingDescriptor_t poolingDesc; + AT_CUDNN_CHECK_WITH_SHAPES(cudnnCreatePoolingDescriptor(&poolingDesc)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetPooling2dDescriptor( + poolingDesc, + CUDNN_POOLING_MAX_DETERMINISTIC, + CUDNN_NOT_PROPAGATE_NAN, + kernel_size[0], // kernel height + kernel_size[1], // kernel width + padding[0], // vertical padding + padding[1], // horizontal padding + stride[0], // vertical stride + stride[1])); // horizontal stride + + float one{1}; + float zero{0.0}; + TensorDescriptor xDesc; + at::MemoryFormat memory_format = (ndim == 4 ? 
at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous); + xDesc.set(input, memory_format); + TensorDescriptor yDesc; + yDesc.set(qy, memory_format); + cudnnPoolingForward(handle, + poolingDesc, + &one, + xDesc.desc(), + input.data_ptr(), + &zero, + yDesc.desc(), + qy.data_ptr()); + + // recall we casted our input and output to 4D if qx was 3D, so we recast it back to 3D prior to returning + return (ndim == 3 ? qy.view(std::vector(output_shape.begin() + 1, output_shape.end())) : qy); +#else // HAS_CUDNN_V8() + AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with cuDNN v8 support"); + return Tensor{}; // never reached, placates the compiler +#endif // HAS_CUDNN_V8() +#else // AT_CUDNN_ENABLED() + AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with cuDNN support"); + return Tensor{}; // never reached, placates the compiler +#endif // AT_CUDNN_ENABLED() +#else // USE_CUDA + AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with USE_CUDA support"); + return Tensor{}; // never reached, placates the compiler +#endif +} + +// Keep the registry in the anonymous namespace. +namespace { +template +class QMaxPool_arr_args final { + public: + static Tensor run( + Tensor qx, + std::vector kernel_size, + std::vector stride, + std::vector padding, + std::vector dilation, + bool ceil_mode) { + TORCH_CHECK(kSpatialDim == 2, "quantized max pool is only valid for 2D") + return quantized_max_pool2d_cudnn(qx, kernel_size, stride, padding, + dilation, ceil_mode); + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool2d"), TORCH_FN(QMaxPool_arr_args<2>::run)); +} + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/cudnn/conv_prepack.cpp b/aten/src/ATen/native/quantized/cudnn/conv_prepack.cpp new file mode 100644 index 000000000000..7db1f7092f51 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/conv_prepack.cpp @@ -0,0 +1,217 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +template +c10::intrusive_ptr> PackedConvWeightCudnn< + kSpatialDim>:: + prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + // TODO: need to check out to implement groups for conv operator in Conv.cpp + TORCH_CHECK(groups == 1, "Quantized cudnn conv2d is currenty limited to groups = 1; received groups =", groups); + TORCH_CHECK(weight.qscheme() == c10::kPerTensorAffine, "Unsupported qscheme: ", toString(weight.qscheme())); + TORCH_CHECK( + kSpatialDim == 2, // 1D is packed as 2d, hence we don't need other checks + "cuDNN packing only supports 2D convolution."); + TORCH_CHECK( + weight.ndimension() == kSpatialDim + 2, + "Weights are expected to have ", + kSpatialDim + 2, + " dimensions"); + TORCH_CHECK( + stride.size() == kSpatialDim, + "stride should contain ", + kSpatialDim, + " elements for ", + kSpatialDim, + "D convolution."); + TORCH_CHECK( + padding.size() == kSpatialDim, + "quantized::conv_prepack (cudnn): Specify front/top/left padding only. 
" + "end/bottom/right padding assumed to be equal to front/top/left"); + TORCH_CHECK( + !transpose || output_padding.size() == kSpatialDim, + "quantized::conv_prepack: Specify top/left output padding " + "only. bottom/right padding assumed to be equal to top/left"); + TORCH_CHECK( + dilation.size() == kSpatialDim, + "quantized::conv_prepack (cudnn): dilation should contain ", + kSpatialDim, + " elements for ", + kSpatialDim, + "D convolution."); + TORCH_CHECK(!transpose, "cudNN quantized conv prepack expects transpose = false") + const int num_unpadded_output_channels = weight.size(0); + const auto qtype = weight.qscheme(); + if (bias.has_value()) { + TORCH_CHECK(bias.value().dim() == 1, "bias should be a vector (1D Tensor)"); + TORCH_CHECK( + bias.value().size(0) == num_unpadded_output_channels, + "bias should have K elements: " + std::to_string(num_unpadded_output_channels)); + // TODO: we create a broadcasted_bias tensor later so I think we don't need to make this contiguous here. + // we will revisit this when nvidia adds proper support for broadcasting + // bias_contig = bias->contiguous(); + } + + // cudnn v8.4.0 expects conv2d's int8 weight tensor's input and output channels to be a multiple of 4. if it is not + // we need to explicitly pad it to a multiple of 4 ourselves as cudnn does not currently support padding. + // TODO: when and if cudnn enables padding in their operators, we can remove padding on our end; + // currently, limit padding support to groups=1 (ungrouped conv) + // TODO: implement this for groups > 1 + auto num_input_channels = weight.size(1); + int8_t num_output_slices2pad = (4 - num_unpadded_output_channels % 4) % 4; + int8_t num_input_slices2pad = (4 - num_input_channels % 4) % 4; + if (num_output_slices2pad != 0 || num_input_slices2pad != 0) { + // the second argument is an initializer list of padded values. there are 2 values for each dimension. + // refer to https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html for more details + weight = at::pad(weight, {0, 0, 0, 0, 0, num_input_slices2pad, 0, num_output_slices2pad}, "constant", 0); + if (bias.has_value()) { + bias.value() = at::pad(bias.value(), {0, num_output_slices2pad}, "constant", 0); + } + } + + auto ret_ptr = c10::make_intrusive>( + weight.to(c10::MemoryFormat::ChannelsLast), // TODO: this assumes 2D I think. make it more general? 
+ bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose, + qtype, + num_unpadded_output_channels); + return ret_ptr; +} + +template +c10::intrusive_ptr> PackedConvWeightCudnn< + 2>:: + prepack( + at::Tensor weight, + c10::optional bias_in, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + +namespace at { +namespace native { +namespace { + +template +class QConvPackWeightInt8Cudnn final { + public: + static c10::intrusive_ptr> run_conv( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups) { + torch::List output_padding; + output_padding.reserve(kSpatialDim); + for (const auto idx : c10::irange(kSpatialDim)) { + (void)idx; //Suppress unused variable warning + output_padding.push_back((int64_t)0); + } + return _run(weight, bias, stride, padding, output_padding, dilation, groups, + /*transpose=*/false); + } + + private: + static c10::intrusive_ptr> _run( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + return PackedConvWeightCudnn::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +}; + +class QConv1dPackWeightInt8Cudnn final { + public: + static c10::intrusive_ptr> run_conv( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups) { + const torch::List output_padding({0}); + return _run(weight, bias, stride, padding, output_padding, dilation, groups, + /*transpose=*/false); + } + + private: + static c10::intrusive_ptr> _run( + Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose) { + if (weight.dim() == 3) { + // we currently use conv2d kernel for conv1d by making the input and weight tensors + // 4D rather than 3D. 
we add a dummy width dimension of size 1 + // out channels, in channels / groups, L -> out channels, in channels / groups, 1, L + weight = weight.unsqueeze(-2); + } + stride = quant_utils::MakeArgForConv1d(stride, 1); + padding = quant_utils::MakeArgForConv1d(padding, 0); + output_padding = quant_utils::MakeArgForConv1d(output_padding, 0); + dilation = quant_utils::MakeArgForConv1d(dilation, 1); + + return PackedConvWeightCudnn<2>::prepack( + weight, bias, stride, padding, output_padding, dilation, groups, + transpose); + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_prepack"), TORCH_FN(QConv1dPackWeightInt8Cudnn::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_prepack"), TORCH_FN(QConvPackWeightInt8Cudnn<2>::run_conv)); +} + +} // namespace +} // namespace native +} // namespace at + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/conv_unpack_impl.cpp b/aten/src/ATen/native/quantized/cudnn/conv_unpack_impl.cpp new file mode 100644 index 000000000000..e18c6ce4d888 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/conv_unpack_impl.cpp @@ -0,0 +1,28 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include + +#include + +template +std::tuple> PackedConvWeightCudnn< + kSpatialDim>::unpack() { + return std::tuple>{maybe_padded_weight_, bias_}; +} + +template std::tuple> PackedConvWeightCudnn< + 2>::unpack(); + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/linear_prepack.cpp b/aten/src/ATen/native/quantized/cudnn/linear_prepack.cpp new file mode 100644 index 000000000000..3541ce9b7d80 --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/linear_prepack.cpp @@ -0,0 +1,63 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include +#include +#include + +c10::intrusive_ptr PackedLinearWeightCudnn::prepack( + at::Tensor weight, + c10::optional bias) { + TORCH_CHECK(weight.qscheme() == c10::kPerTensorAffine, "Unsupported qscheme: ", toString(weight.qscheme())); + const int output_channels = weight.size(0); + const auto qtype = weight.qscheme(); + if (bias.has_value()) { + TORCH_CHECK(bias.value().dim() == 1, "bias should be a vector (1D Tensor)"); + TORCH_CHECK( + bias.value().size(0) == output_channels, + "bias should have K elements: " + std::to_string(output_channels)); + } + + auto ret_ptr = c10::make_intrusive( + weight, + bias, + qtype); + return ret_ptr; +} + +namespace at { +namespace native { +namespace { + +class QLinearPackWeightInt8Cudnn final { + public: + static c10::intrusive_ptr run( + at::Tensor weight, + c10::optional bias) { + return PackedLinearWeightCudnn::prepack(std::move(weight), std::move(bias)); + } +}; + +TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8Cudnn::run)); +} + + +} // namespace +} // namespace native +} // namespace at + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/linear_unpack_impl.cpp b/aten/src/ATen/native/quantized/cudnn/linear_unpack_impl.cpp new file mode 100644 index 000000000000..ebf77b0294d8 --- /dev/null +++ 
b/aten/src/ATen/native/quantized/cudnn/linear_unpack_impl.cpp @@ -0,0 +1,23 @@ +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include + +#include + +std::tuple> PackedLinearWeightCudnn::unpack() { + return std::tuple>{orig_weight, bias_}; +} + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/cudnn/utils.h b/aten/src/ATen/native/quantized/cudnn/utils.h new file mode 100644 index 000000000000..3eba354bd20c --- /dev/null +++ b/aten/src/ATen/native/quantized/cudnn/utils.h @@ -0,0 +1,335 @@ +#pragma once +/* +This file contains some of the auxiliary functions used by both Conv.cpp & Linear.cpp (introduced in a later PR) +*/ + +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +#if HAS_CUDNN_V8() + +#include +#include +#include +#include +#include +#include + +struct PackedLinearWeightCudnn : public LinearPackedParamsBase { + PackedLinearWeightCudnn( + at::Tensor orig_weight, + c10::optional bias, + c10::QScheme q_scheme) + : orig_weight(std::move(orig_weight)), + bias_(std::move(bias)), + q_scheme(std::move(q_scheme)) {} + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) override { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + } + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) override { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + } + + std::tuple> unpack() override; + + c10::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + private: + at::Tensor orig_weight; + c10::optional bias_; + c10::QScheme q_scheme; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +template +struct PackedConvWeightCudnn : public ConvPackedParamsBase { + PackedConvWeightCudnn( + at::Tensor orig_weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + c10::QScheme q_scheme, + int64_t output_channels) + : maybe_padded_weight_(std::move(orig_weight)), + bias_(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + q_scheme_(q_scheme), + num_unpadded_output_channels_(output_channels) {} // output channels needs to be stored when we have to pad this dimension + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) { + TORCH_CHECK(false, "apply_dynamic is currently not reported"); + } + + at::Tensor apply_dynamic_relu( + const 
at::Tensor& input, + bool reduce_range) { + TORCH_CHECK(false, "apply_dynamic_relu is currently not reported"); + } + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + private: + // cudnn v8.4.0 expects conv2d's int8 weight tensor's input and output channels to be a multiple of 4. if it is not + // we need to explicitly pad it to a multiple of 4 ourselves as cudnn does not currently support padding, hence the naming + // convention "maybe"_padded_weight. + // TODO: when and if cudnn enables padding in their operators, we can remove padding on our end and rename this to orig_weight_ + at::Tensor maybe_padded_weight_; + c10::optional bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + c10::QScheme q_scheme_; + int64_t num_unpadded_output_channels_; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +namespace cudnn_utils { +namespace { + +uint8_t getAlignment(const at::Tensor &t) { + // alignment are in bytes + uint8_t alignment = 1; + uintptr_t address = reinterpret_cast(t.data_ptr()); + while (address % alignment == 0 && alignment < 16) alignment *= 2; + return alignment; +} + +// For the two getTensorDescriptor functions, there is a is_virtual parameter. This parameter is used to set the cudnn +// tensor as virtual or not. Setting the tensor as virtual is expected to have some performance benefits as the cudnn +// backend cudnn will no longer directly save to the tensor, allowing us to omit this tensor from the variant pack. 
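// For instance, the quantized linear kernel (Linear.cpp above) marks the fused
// intermediate as virtual while the real quantized output is a regular tensor;
// a simplified sketch of those two calls:
//
//   // intermediate of the fused graph: never materialized, omitted from the variant pack
//   cudnn_utils::getTensorDescriptor(linear_output, 'f', key.output_alignment, /*is_virtual=*/true);
//   // real output: must be bound to a data pointer in the variant pack
//   cudnn_utils::getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(),
//                                    CUDNN_DATA_INT8, 'r', cudnn_utils::getAlignment(quantized_output));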
+// See third_party/cudnn_frontend/samples/fusion_sample.cpp for other examples + +cudnn_frontend::Tensor getTensorDescriptor(const at::Tensor &t, int64_t id, uint8_t alignment, bool is_virtual = false) { + auto shape = t.sizes(); + auto strides = t.strides(); + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(at::native::getCudnnDataType(t)) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(at::native::getCudnnDataType(t)) + .build(); +} + +cudnn_frontend::Tensor getTensorDescriptor(const c10::IntArrayRef& shape, const c10::IntArrayRef& strides, cudnnDataType_t cudnn_dtype, int64_t id, uint8_t alignment, bool is_virtual = false) { + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(cudnn_dtype) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(cudnn_dtype) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +cudnn_frontend::PointWiseDesc_v8 getPointWiseMulDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_MUL) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +cudnn_frontend::PointWiseDesc_v8 getPointWiseAddDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_ADD) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +cudnn_frontend::PointWiseDesc_v8 getPointWiseReluDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(dataType) + .build(); +} + + +void filterEngineConfigs( + cudnn_frontend::EngineConfigList &from, + cudnn_frontend::EngineConfigList &to, + bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) +{ + auto filter = [=](cudnnBackendDescriptor_t c) { + if (deterministic) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + if (scalar_type == at::kFloat || scalar_type == at::kChar || !allow_tf32) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + return false; + }; + cudnn_frontend::filter(from, to, filter); +} + + +cudnn_frontend::ExecutionPlan get_execplan_from_heuristics_else_fall_back(cudnn_frontend::OperationGraph&& opGraph, cudnnHandle_t handle_) { + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + .build(); + + // std::cout << "Heuristic has " << heuristics.getEngineConfigCount() << " configurations " << std::endl; + auto& engine_config = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + + 
// Try engine configs returned by the heuristics and pick up the first one that works. + for (auto& ecfg : engine_config) { + try { + auto plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle_) + .setEngineConfig(ecfg, opGraph.getTag()) + .build(); + return plan; + } catch (cudnn_frontend::cudnnException& e) { + continue; + } + } + + { + // std::cout << opGraph.describe() << " has " << total_engines << " engines." << std::endl; + auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build(); + // std::cout << engine.describe() << std::endl; + + auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build(); + // std::cout << engine_config.describe() << std::endl; + + return cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build(); + } +} +} // anonymous +} // cudnn_utils + +#endif // HAS_CUDNN_V8 +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 783c847dff0a..047a126e79a1 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -1,7 +1,6 @@ #include -#include -#include +#include #include #include @@ -189,10 +188,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::relu6(Tensor qx, bool inplace=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::leaky_relu(Tensor qx, Scalar negative_slope, bool inplace, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::sigmoid(Tensor qx, float output_scale, int output_zero_point) -> Tensor")); - - // quantized ops implemented in cudnn, with QuantizedCUDA dispatch - // TODO: use the same signature as quantized::conv2d - m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_cudnn(Tensor act, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::softmax(Tensor qx, int dim, float output_scale, int output_zero_point) -> Tensor")); } // According to #33294: The "_" prefix registration will be diff --git a/aten/src/ATen/native/quantized/packed_params.h b/aten/src/ATen/native/quantized/packed_params.h new file mode 100644 index 000000000000..64d8ec840c46 --- /dev/null +++ b/aten/src/ATen/native/quantized/packed_params.h @@ -0,0 +1,98 @@ +#pragma once + +#include +#include + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + + // out variant of LinearPackedParamsBase::apply + virtual at::Tensor& apply_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual at::Tensor& apply_relu_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual at::Tensor apply_dynamic( + at::Tensor input, + bool reduce_range = false) = 0; + virtual at::Tensor apply_dynamic_relu( + at::Tensor input, + bool reduce_range = false) = 0; + + virtual at::Tensor& apply_dynamic_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_out is not implemented for this packed " + "parameter type"); + return output; + } + virtual at::Tensor& apply_dynamic_relu_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual std::tuple> unpack() = 0; + + virtual c10::optional bias() = 0; + + virtual void set_bias(c10::optional /*bias*/) { + throw std::runtime_error( + "set_bias is not implemented for this packed " + "parameter type"); + } +}; + +template +struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) = 0; + + virtual std::tuple> unpack() = 0; + + virtual torch::List stride() const = 0; + virtual torch::List padding() const = 0; + virtual torch::List output_padding() const = 0; + virtual torch::List dilation() const = 0; + virtual int64_t groups() const = 0; + virtual bool transpose() const = 0; +}; diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp new file mode 100644 index 000000000000..062fc8a0522a --- /dev/null +++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp @@ -0,0 +1,224 @@ +/* +The dispatch registrations at the end of this file applies to fbgemm, qnnpack, and cudnn backends. 
+The correct unpack backend function is determined using runtime polymorphism through the packed_weight pointer, +which is of type intrusive_ptr> and points to either a PackedConvWeightsQnnp, +PackedConvWeights (Fbgemm), or PackedConvWeightsCudnn at runtime, which all inherit from ConvPackedParamsBase. +The implementations for the unpack functions can be found in /cpu/qconv_unpack_impl.cpp, for fbgemm&qnnpack +and /cudnn/conv_unpack_impl.cpp, for cudnn. +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +/* + * QConvPackWeightInt8 expects its input tensor to be in shape + * [output_channels, kernel_height, kernel_width, input_channels/Groups] + * Therefore, the unpacking of packed weight tensor using QConvUnpackWeightsInt8 + * results in a tensor of the same shape. + */ + +template +class QConvUnpackWeightsInt8 final { + public: + static std::tuple> run( + const c10::intrusive_ptr>& packed_weight) { + auto& ctx = at::globalContext(); + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + return packed_weight->unpack(); + } +#endif + +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + kSpatialDim == 2, + "quantized::conv2d_unpack (qnnpack): QNNPACK only supports Conv2d " + "now."); + return packed_weight->unpack(); + } +#endif + +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return packed_weight->unpack(); + } +#endif + + TORCH_CHECK( + false, + "Didn't find engine for operation quantized::conv2d_unpack ", + toString(ctx.qEngine())); + } +}; + +class QConv1dUnpackWeightsInt8 final { + public: + static std::tuple> run( + const c10::intrusive_ptr>& packed_weight) { + auto& ctx = at::globalContext(); + at::Tensor weight; + c10::optional bias; +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + std::tie(weight, bias) = packed_weight->unpack(); + weight = weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); + return std::tuple>(weight, bias); + } +#endif + +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + std::tie(weight, bias) = packed_weight->unpack(); + at::Tensor new_weight = weight.clone(); + new_weight = new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); + return std::tuple>(new_weight, bias); + } +#endif + +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + std::tie(weight, bias) = packed_weight->unpack(); + at::Tensor new_weight = weight.clone(); + new_weight.squeeze_(quant_utils::kConv1dSqueezeDim + 2); + return std::tuple>(new_weight, bias); + } +#endif + + TORCH_CHECK( + false, + "Didn't find engine for operation quantized::conv1d_unpack ", + toString(ctx.qEngine())); + } +}; + +template +class QConvStride final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->stride(); + } +}; + +template +class QConvPadding final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->padding(); + } +}; + +template +class QConvOutputPadding final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->output_padding(); + } +}; + +template +class QConvDilation final { + public: + static torch::List run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->dilation(); + } +}; + +template +class QConvGroups final { + public: + static int64_t run( + const c10::intrusive_ptr>& packed_weight) { 
+ return packed_weight->groups(); + } +}; + +template +class QConvTranspose final { + public: + static int64_t run( + const c10::intrusive_ptr>& packed_weight) { + return packed_weight->transpose(); + } +}; + +IValue +unpack_quantized_prepacked_sizes_conv2d(const IValue& ivalue) { + auto params = ivalue.toCustomClass>(); + at::Tensor weight; + c10::optional bias; + std::tie(weight, bias) = params->unpack(); + at::OptionalIntArrayRef bias_sizes = c10::nullopt; + if (bias && bias->defined()) { + bias_sizes = bias->sizes(); + } + return IValue(std::make_tuple( + weight.sizes(), + bias_sizes, + params->stride(), + params->padding(), + params->dilation(), + params->groups())); +} + +TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { + // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + // We use conv2d_unpack to be consistent with conv3d_unpack + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack_sizes"), TORCH_FN(unpack_quantized_prepacked_sizes_conv2d)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_stride"), TORCH_FN(QConvStride<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_padding"), TORCH_FN(QConvPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_dilation"), TORCH_FN(QConvDilation<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_groups"), TORCH_FN(QConvGroups<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); + + // ConvTranspose is the same, however, we want to have different name. 
+ m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_stride"), TORCH_FN(QConvStride<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_padding"), TORCH_FN(QConvPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_dilation"), TORCH_FN(QConvDilation<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_groups"), TORCH_FN(QConvGroups<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); +} + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/quantized/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/qlinear_unpack.cpp new file mode 100644 index 000000000000..cfcd0589f03c --- /dev/null +++ b/aten/src/ATen/native/quantized/qlinear_unpack.cpp @@ -0,0 +1,77 @@ +/* +The dispatch registrations at the end of this file applies to fbgemm, qnnpack, and cudnn backends. +The correct unpack backend function is determined using runtime polymorphism through the packed_weight pointer, +which is of type intrusive_ptr and points to either a PackedLinearWeightsQnnp, +PackedLinearWeights (Fbgemm), or PackedLinearWeightsCudnn at runtime, which all inherit from LinearPackedParamsBase. +The implementations for the unpack functions can be found in /cpu/qlinear_unpack_impl.cpp, for fbgemm&qnnpack +and /cudnn/linear_unpack_impl.cpp, for cudnn. +*/ +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +class QLinearUnpackWeightInt8 final { + public: + static std::tuple> run( + const c10::intrusive_ptr& packed_weight) { + return packed_weight->unpack(); + } +}; + +class QLinearUnpackWeightFp16 final { + public: + static std::tuple> run( + const c10::intrusive_ptr& packed_weight) { + auto& ctx = at::globalContext(); + + TORCH_CHECK( + ctx.qEngine() != at::QEngine::QNNPACK, + "quantized::linear_unpack_fp16 is currently " + "not supported by QNNPACK"); + + return packed_weight->unpack(); + } +}; + +class QLinearUnpackWeightInt8Legacy final { + public: + static std::tuple> run( + const at::Tensor& packed_weight) { + TORCH_CHECK(false, + "quantized.linear_unpack(Tensor) is unsupported! 
Please " + "upgrade your model to use the newer quantized.linear_" + "unpack(LinearPackedParamsBase) overload"); + } +}; + +class QLinearUnpackWeightFp16Legacy final { + public: + static std::tuple> run( + const at::Tensor& packed_weight) { + TORCH_CHECK(false, + "quantized.linear_unpack(Tensor) is unsupported! Please " + "upgrade your model to use the newer quantized.linear_" + "unpack(LinearPackedParamsBase) overload"); + } +}; + +TORCH_LIBRARY_IMPL(quantized, CPU, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack.legacy"), TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16.legacy"), TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); +} + +TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack"), TORCH_FN(QLinearUnpackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16"), TORCH_FN(QLinearUnpackWeightFp16::run)); +} + +} // namespace +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SparseBlas.cpp b/aten/src/ATen/native/sparse/SparseBlas.cpp index 50bd6a8d863c..9d5e6e163794 100644 --- a/aten/src/ATen/native/sparse/SparseBlas.cpp +++ b/aten/src/ATen/native/sparse/SparseBlas.cpp @@ -1,7 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -12,6 +14,10 @@ #include #include #include +#include +#include +#include +#include #endif #include @@ -26,7 +32,7 @@ Tensor& addmv_out_sparse_csr( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.layout() == kSparseCsr || mat.layout() == kSparseBsr); TORCH_CHECK(mat.dim() == 2, "addmv: Expected mat to be 2-D"); TORCH_CHECK(vec.dim() == 1, "addmv: Expected vec to be 1-D"); @@ -89,5 +95,148 @@ std::tuple triangular_solve_out_sparse_csr_cpu( return std::tuple(X, clone_A); } +/* + Computes `result` <- α*(A @ B) * spy(C) + β*C, where spy(C) is the sparsity pattern matrix of C. + + Args: + * `mat1` - [in] dense Tensor A of size m × k. + * `mat2` - [in] dense Tensor B of size k × n. + * `self` - [in] sparse Tensor C of size m × n. + * `result` - [out] sparse Tensor of size m × n. 
+*/ +Tensor& sparse_sampled_addmm_out_sparse_csr_cpu( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + Tensor& result) { + at::native::sparse::sparse_sampled_addmm_check_inputs(self, mat1, mat2, beta, alpha, result); + // Allow only same types as for the CUDA path + auto t = self.scalar_type(); + TORCH_CHECK(t == ScalarType::Double || t == ScalarType::Float || + t == ScalarType::ComplexFloat || t == ScalarType::ComplexDouble, + "sparse_sampled_addmm: Expected self to be a floating-point or complex tensor, but got ", t); + if (&result != &self) { + // We allow self to be a single matrix when mat1 and mat2 are batched + auto result_sizes = DimVector(mat1.sizes().slice(0, mat1.dim() - 2)); + result_sizes.push_back(self.size(-2)); + result_sizes.push_back(self.size(-1)); + at::sparse_csr::get_sparse_csr_impl(result)->resize_(self._nnz(), result_sizes); + } + result.copy_((self.to_dense().mul(beta).add(mat1.matmul(mat2), alpha)).sparse_mask(self)); + return result; +} + +Tensor sparse_sampled_addmm_sparse_csr_cpu( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha) { + auto result = at::empty({0, 0}, self.options()); + at::native::sparse_sampled_addmm_out_sparse_csr_cpu(self, mat1, mat2, beta, alpha, result); + return result; +} + +namespace sparse { + +void sparse_sampled_addmm_check_inputs( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + const Tensor& result) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.is_sparse_csr()); + + TORCH_CHECK( + mat1.layout() == kStrided, + "sampled_addmm: Expected mat1 to have strided layout, but got ", + mat1.layout()); + TORCH_CHECK( + mat2.layout() == kStrided, + "sampled_addmm: Expected mat2 to have strided layout, but got ", + mat2.layout()); + + TORCH_CHECK( + result.layout() == kSparseCsr, + "sampled_addmm: Expected result to have sparse csr layout, but got ", + result.layout()); + + TORCH_CHECK( + mat1.scalar_type() == mat2.scalar_type(), + "sampled_addmm: Expected mat1 and mat2 to have the same dtype, but got ", + mat1.scalar_type(), + " and ", + mat2.scalar_type()); + TORCH_CHECK( + mat1.scalar_type() == self.scalar_type(), + "sampled_addmm: Expected mat1 and self to have the same dtype, but got ", + mat1.scalar_type(), + " and ", + self.scalar_type()); + TORCH_CHECK( + result.scalar_type() == self.scalar_type(), + "sampled_addmm: Expected result and self to have the same dtype, but got ", + result.scalar_type(), + " and ", + self.scalar_type()); + + TORCH_CHECK( + mat1.dim() >= 2, + "sampled_addmm: Expected mat1 to be a matrix, got ", + mat1.dim(), + "-D tensor"); + TORCH_CHECK( + mat2.dim() >= 2, + "sampled_addmm: Expected mat2 to be a matrix, got ", + mat2.dim(), + "-D tensor"); + TORCH_CHECK( + result.dim() >= 2, + "sampled_addmm: Expected result to be a matrix, got ", + result.dim(), + "-D tensor"); + + TORCH_CHECK( + mat1.sizes().slice(0, mat1.dim() - 2) == mat2.sizes().slice(0, mat2.dim() - 2), + "sampled_addmm: Expected mat1 and mat2 to have the same batch size, but got ", + mat1.sizes().slice(0, mat1.dim() - 2), + " and ", + mat2.sizes().slice(0, mat2.dim() - 2)); + + TORCH_CHECK( + !(self.dim() > 2 && self.sizes().slice(0, self.dim() - 2) != mat1.sizes().slice(0, mat1.dim() - 2)), + "sampled_addmm: Expected self and mat1 to have the same batch size, but got ", + self.sizes().slice(0, self.dim() - 2), + " and ", + mat1.sizes().slice(0, mat1.dim() - 2)); + + 
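  // As a concrete illustration of the shapes accepted here (sizes chosen arbitrarily):
  // mat1 of shape (b, m, k) and mat2 of shape (b, k, n) may be combined with a sparse CSR
  // self of shape (m, n) or (b, m, n); the result is then a sparse CSR tensor of shape
  // (b, m, n) whose values are computed only at the positions stored in self, i.e.
  // result <- alpha * (mat1 @ mat2) * spy(self) + beta * self. A call through the
  // generated wrapper would look roughly like the sketch below (assuming the schema's
  // default beta/alpha ordering):
  //
  //   auto out = at::sparse_sampled_addmm(self_csr, mat1, mat2, /*beta=*/1.0, /*alpha=*/1.0);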
IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + TORCH_CHECK( + mat1_sizes[mat1.dim() - 1] == mat2_sizes[mat2.dim() - 2], + "sampled_addmm: mat1 and mat2 shapes cannot be multiplied (", + mat1_sizes[mat1.dim() - 2], + "x", + mat1_sizes[mat1.dim() - 1], + " and ", + mat2_sizes[mat2.dim() - 2], + "x", + mat2_sizes[mat2.dim() - 1], + ")"); + + IntArrayRef self_sizes = self.sizes(); + TORCH_CHECK( + self_sizes[self.dim() - 2] == mat1_sizes[mat1.dim() - 2], + "sampled_addmm: self.shape[-2] must match mat1.shape[-2]"); + TORCH_CHECK( + self_sizes[self.dim() - 1] == mat2_sizes[mat2.dim() - 1], + "sampled_addmm: self.shape[-1] must match mat2.shape[-1]"); +} + +} // namespace sparse + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/sparse/SparseBlas.h b/aten/src/ATen/native/sparse/SparseBlas.h new file mode 100644 index 000000000000..337308a2dddf --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseBlas.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#include +#include + +namespace at { +namespace native { +namespace sparse { + +TORCH_API void sparse_sampled_addmm_check_inputs( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + const Tensor& result); + +} // namespace sparse +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp index 6b133d3b6325..4ad0d55c6891 100644 --- a/aten/src/ATen/native/sparse/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/sparse/SparseBlasImpl.cpp @@ -73,7 +73,7 @@ void triangular_solve_out_sparse_csr( "Calling triangular_solve on a sparse CPU tensor requires compiling PyTorch with MKL. ", "Please use PyTorch built MKL support."); #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == kSparseCsr || A.layout() == kSparseBsr); sparse::impl::mkl::triangular_solve_out_sparse_csr(A, B, X, upper, transpose, unitriangular); #endif } diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp index f91d9648e7db..62d600dc0926 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp @@ -9,14 +9,26 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include #include #else +#include #include +#include #include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include #include #include #include @@ -26,7 +38,13 @@ #include #include #include +#include +#include +#include #include +#include +#include +#include #include #endif @@ -40,15 +58,23 @@ namespace { } // end anonymous namespace -void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, IntArrayRef size) { +void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, const IntArrayRef size, const Layout& layout) { + + // Layout must be Sparse Compressed + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", [&]{}); + + const std::string layout_name = layoutToString(layout, /*upper=*/ true); + const std::string compressed_indices_name = compressedIndicesName(layout); + const std::string plain_indices_name = plainIndicesName(layout); + // Layout Invariants TORCH_CHECK( - col_indices.layout() == kStrided && 
col_indices.is_contiguous(), - "expected col_indices to be a strided and contiguous tensor"); + plain_indices.layout() == kStrided && plain_indices.is_contiguous(), + "expected ", plain_indices_name, " to be a strided and contiguous tensor"); TORCH_CHECK( - crow_indices.layout() == kStrided && crow_indices.is_contiguous(), - "expected crow_indices to be a strided and contiguous tensor"); + compressed_indices.layout() == kStrided && compressed_indices.is_contiguous(), + "expected ", compressed_indices_name ," to be a strided and contiguous tensor"); TORCH_CHECK( values.layout() == kStrided && values.is_contiguous(), @@ -56,78 +82,141 @@ void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& // Shape and Strides invariants TORCH_CHECK( - size.size() == 2, - "size of a CSR tensor must be of length 2, but got: ", - size.size()); + size.size() >= 2, + "size of a batched ", layout_name, " tensor must have length >= 2, but got: ", + size.size()); TORCH_CHECK( - crow_indices.dim() == 1, - "crow_indices must have dim=1 but got crow_indices.dim()=", - crow_indices.dim()); + compressed_indices.dim() >= 1, + compressed_indices_name, " must have dim >= 1 but got ", compressed_indices_name, ".dim() = ", + compressed_indices.dim()); TORCH_CHECK( - col_indices.dim() == 1, - "col_indices must have dim=1 but got col_indices.dim()=", - col_indices.dim()); + plain_indices.dim() >= 1, + plain_indices_name, " must have dim >= 1 but got ", plain_indices_name, ".dim() = ", + plain_indices.dim()); TORCH_CHECK( - values.dim() == 1, - "values must have dim=1 but got values.dim()=", - values.dim()); - // Note, this check also enforces `crow_indices.numel() >= 1` + values.dim() >= 1, + "values must have dim >= 1 but got values.dim() = ", + values.dim()); + TORCH_CHECK( - crow_indices.numel() == (size[0] + 1), - "crow_indices.numel() must be size(0) + 1, but got: ", - crow_indices.numel()); + compressed_indices.dim() == plain_indices.dim(), + "number of dimensions of ", compressed_indices_name, " and ", plain_indices_name, " must be the same but got ", + compressed_indices.dim(), " and ", plain_indices.dim(), ", respectively"); + + AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS( + layout, "validate_sparse_compressed_tensor_args", + [&] { + TORCH_CHECK( + compressed_indices.dim() == values.dim(), + "number of dimensions of indices and values must be the same but got ", + compressed_indices.dim(), " and ", values.dim(), ", respectively"); + }, + [&] { + TORCH_CHECK( + compressed_indices.dim() + 2 == values.dim(), + "number of dimensions of indices must be two less than the number of dimensions of the values but got ", + compressed_indices.dim(), " + 2 not equal to ", values.dim()); + }); + + TORCH_CHECK( + static_cast(compressed_indices.dim()) == size.size() - 1, + "number of dimensions of indices must be one less than the number of dimensions of the provided size but got ", + compressed_indices.dim(), " not equal to ", size.size(), " - 1"); + + int block_ndim = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", [&]{ return 0; }, [&]{ return 2; }); + IntArrayRef block_size = values.sizes().slice(values.dim() - block_ndim, block_ndim); + int64_t numel_per_block = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", + [&]() -> int64_t { return 1; }, [&]() -> int64_t { return block_size[0] * block_size[1]; }); + int compressed_dim = compressedDimension(layout, size); + int plain_dim = plainDimension(layout, size); + + 
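  // For intuition, a worked example of the invariants enforced below (sizes chosen
  // arbitrarily): a plain CSR tensor of size (m, n) with nnz stored elements has
  // block_ndim = 0 and numel_per_block = 1, crow_indices with m + 1 entries, and
  // col_indices and values with nnz entries each. For the blocked layouts the block
  // shape (p, q) is read off the last two dimensions of values, so numel_per_block = p * q
  // and values.numel() must equal plain_indices.numel() * p * q. Any batch dimensions
  // (checked next) prepend the same leading sizes to all three member tensors.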
// All batch sizes must be the same + auto batch_size = size.slice(0, size.size() - 2); + auto compressed_indices_batch_size = compressed_indices.sizes().slice(0, compressed_indices.dim() - 1); + auto plain_indices_batch_size = plain_indices.sizes().slice(0, plain_indices.dim() - 1); + auto values_batch_size = values.sizes().slice(0, values.dim() - 1 - block_ndim); TORCH_CHECK( - col_indices.numel() == values.numel(), - "col_indices and values must have equal sizes, but got col_indices.numel(): ", - col_indices.numel(), - ", values.numel(): ", - values.numel()); + batch_size == compressed_indices_batch_size && + batch_size == plain_indices_batch_size && + batch_size == values_batch_size, + "all batch dimensions of the provided size (", batch_size, "), indices (", + compressed_indices_batch_size,", ", plain_indices_batch_size, "), and values (", + values_batch_size,") must be the same."); + + // Note, this check also enforces `compressed_indices.size(-1) >= 1` + TORCH_CHECK( + compressed_indices.size(-1) == (size[compressed_dim] + 1), + compressed_indices_name, ".size(-1) must be equal to size[-", (size.size() - compressed_dim), "] + 1 (that is ", + size[compressed_dim] + 1, "), but got: ", compressed_indices.size(-1)); + + AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", + [&] { + TORCH_CHECK( + plain_indices.numel() == values.numel(), + plain_indices_name, " and values must have the same number of elements, but got ", plain_indices_name, ".numel(): ", + plain_indices.numel(), ", values.numel(): ", values.numel()); + }, + [&] { + TORCH_CHECK( + plain_indices.numel() * numel_per_block == values.numel(), + "number of ", plain_indices_name, " elements must be the same as the number of blocks in values, but got ", + plain_indices_name, ".numel() * numel_per_block: ", plain_indices.numel() * numel_per_block, + ", values.numel(): ", values.numel(),", numel_per_block: ", numel_per_block); + }); // Indices invariants - AT_DISPATCH_INDEX_TYPES(crow_indices.scalar_type(), "csr_construct_check", [&] { - Tensor crow_indices_cpu = crow_indices.to(kCPU); - auto crow_indices_accessor = crow_indices_cpu.accessor(); - TORCH_CHECK( - crow_indices_accessor[0] == 0, "0th value of crow_indices must be 0."); - - TORCH_CHECK( - crow_indices_accessor[crow_indices.numel() - 1] == col_indices.numel(), - "last value of crow_indices should be equal to the length of col_indices."); - - for (int i = 1; i <= size[0]; i++) { - TORCH_CHECK( - crow_indices_accessor[i - 1] <= crow_indices_accessor[i], - "at position i = ", i, ", this condition crow_indices[i - 1] <= crow_indices[i] fails"); - } - if (col_indices.numel() > 0) { - TORCH_CHECK(0 <= col_indices.min().item(), "col_indices.min() should be greater or equal to zero"); - TORCH_CHECK(size[1] > col_indices.max().item(), "size(1) should be greater than col_indices.max()"); - } - }); - - // CSR Type Invariants - auto crow_indices_type = crow_indices.scalar_type(); - auto col_indices_type = col_indices.scalar_type(); + AT_DISPATCH_INDEX_TYPES(compressed_indices.scalar_type(), "validate_sparse_compressed_tensor_args", + [&] { + Tensor compressed_indices_cpu = compressed_indices.to(kCPU); + auto compressed_indices_data_ptr = compressed_indices_cpu.data_ptr(); + auto batch_stride = compressed_indices_cpu.dim() >= 2 ? 
compressed_indices_cpu.stride(-2) : 0; + auto compressed_dims = size[compressedDimension(layout, size)]; + for (const auto batch_id : c10::irange(batchCount(compressed_indices_cpu))) { + TORCH_CHECK( + compressed_indices_data_ptr[batch_id*batch_stride] == 0, + "(Batch element ", batch_id, ") ", + ": 0th value of ", compressed_indices_name, " must be 0, but it is ", compressed_indices_data_ptr[batch_id*batch_stride]); + TORCH_CHECK( + compressed_indices_data_ptr[batch_id*batch_stride + compressed_indices.size(-1) - 1] == plain_indices.size(-1), + "(Batch element ", batch_id, ") ", + "last value of ", compressed_indices_name, " should be equal to the length of ", plain_indices_name, "."); + for (int i = 1; i <= compressed_dims; i++) { + TORCH_CHECK( + compressed_indices_data_ptr[batch_id*batch_stride + i - 1] <= compressed_indices_data_ptr[batch_id*batch_stride + i], + "(Batch element ", batch_id, ") ", + "at position i = ", i, ", the condition ", compressed_indices_name, "[i - 1] <= ", compressed_indices_name, "[i] fails"); + } + } + if (plain_indices.numel() > 0) { + TORCH_CHECK(0 <= plain_indices.min().item(), plain_indices_name, ".min() should be greater or equal to zero"); + TORCH_CHECK(size[plain_dim] > plain_indices.max().item(), "size[-", (size.size() - plain_dim),"] should be greater than ", plain_indices_name, ".max()"); + } + }); + + // Type Invariants + auto compressed_indices_type = compressed_indices.scalar_type(); + auto plain_indices_type = plain_indices.scalar_type(); TORCH_CHECK( - crow_indices_type == col_indices_type, - "both crow_indices and col_indices should have the same type."); + compressed_indices_type == plain_indices_type, + "both ", compressed_indices_name, " and ", plain_indices_name, " should have the same type, bot got ", + compressed_indices_type, " and ", plain_indices_type, ", respectively"); TORCH_CHECK( - crow_indices_type == kInt || crow_indices_type == kLong, - "crow_indices and col_indices must be an int32 or int64 type, but got: ", - crow_indices_type); + compressed_indices_type == kInt || compressed_indices_type == kLong, + compressed_indices_name, " and ", plain_indices_name, " must be an int32 or int64 type, but got: ", + compressed_indices_type); - // CSR Device Invariants + // Device Invariants TORCH_CHECK( - col_indices.get_device() == crow_indices.get_device(), - "crow_indices and col_indices devices (", - crow_indices.get_device(), + plain_indices.get_device() == compressed_indices.get_device(), + compressed_indices_name, " and ", plain_indices_name, " devices (", + compressed_indices.get_device(), ", ", - col_indices.get_device(), + plain_indices.get_device(), ") must match"); TORCH_CHECK( - crow_indices.get_device() == values.get_device(), - "device of crow_indices (", - crow_indices.get_device(), + compressed_indices.get_device() == values.get_device(), + "device of ", compressed_indices_name, " (", + compressed_indices.get_device(), ") must match device of values (", values.get_device(), ")"); @@ -136,19 +225,46 @@ void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& "device type of values (", values.device().type(), ") must be CPU or CUDA"); + +} + +void _validate_sparse_compressed_tensor_args(const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, IntArrayRef size, Layout layout) { + _validate_sparse_compressed_tensor_args_worker(compressed_indices, plain_indices, values, size, layout); +} + +void _validate_sparse_csr_tensor_args(const Tensor& crow_indices, const Tensor& 
col_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(crow_indices, col_indices, values, size, kSparseCsr); +} + +void _validate_sparse_csc_tensor_args(const Tensor& ccol_indices, const Tensor& row_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(ccol_indices, row_indices, values, size, kSparseCsc); +} + +void _validate_sparse_bsr_tensor_args(const Tensor& crow_indices, const Tensor& col_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(crow_indices, col_indices, values, size, kSparseBsr); +} + +void _validate_sparse_bsc_tensor_args(const Tensor& ccol_indices, const Tensor& row_indices, const Tensor& values, IntArrayRef size) { + _validate_sparse_compressed_tensor_args_worker(ccol_indices, row_indices, values, size, kSparseBsc); } -// Construction of CSR tensors. -SparseCsrTensor new_csr_tensor(const TensorOptions& options) { +// Construction of CSR, CSC, BSR, and BSC tensors. + +// Note: The usage of "Csr" in names like SparseCsrTensor, +// SparseCsrCPU, SparseCsrCUDA, and SparseCsrTensorImpl exists because +// of historical reasons (that ought to be removed in future) and does +// not mean that the corresponding functionality would be CSR layout +// only specific. +SparseCsrTensor new_compressed_tensor(const TensorOptions& options) { // TODO: remove this comment after enabling autograd support for CSR tensor // constructor. // TORCH_INTERNAL_ASSERT(impl::variable_excluded_from_dispatch()); - TORCH_INTERNAL_ASSERT(options.layout() == kSparseCsr); + Layout layout = AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(options.layout(), "new_compressed_tensor", [&] { return the_layout; }); DispatchKey dispatch_key; TORCH_CHECK_NOT_IMPLEMENTED( options.device().type() == kCPU || options.device().type() == kCUDA, - "Could not run '", "sparse_csr_tensor", "' from the '", options.device(), "' device.)"); + "Could not run 'new_compressed_tensor' from the '", options.device(), "' device.)"); if (options.device().is_cuda()) { dispatch_key = DispatchKey::SparseCsrCUDA; @@ -157,44 +273,117 @@ SparseCsrTensor new_csr_tensor(const TensorOptions& options) { } return detail::make_tensor( - DispatchKeySet(dispatch_key), options.dtype()); + DispatchKeySet(dispatch_key), layout, options.dtype()); } -Tensor _sparse_csr_tensor_unsafe(const Tensor& crow_indices, const Tensor& col_indices, - const Tensor& values, - IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); +Tensor _sparse_compressed_tensor_unsafe(const Tensor& compressed_indices, + const Tensor& plain_indices, + const Tensor& values, + IntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + if (!layout) { + AT_ERROR("sparse_compressed_tensor_unsafe expected sparse compressed tensor layout but got none"); + } + Layout layout_ = layout.value(); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout_, "sparse_compressed_tensor_unsafe", [&]{}); + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); + SparseCsrTensor self = new_compressed_tensor(options); + get_sparse_csr_impl(self)->set_member_tensors(compressed_indices, plain_indices, values, size); + return self; +} - SparseCsrTensor self = new_csr_tensor(options); - 
get_sparse_csr_impl(self)->set_member_tensors(crow_indices, col_indices, values, size); +template +Tensor _sparse_compressed_tensor_unsafe_template(const Tensor& compressed_indices, + const Tensor& plain_indices, + const Tensor& values, + IntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + Layout layout_ = layout.value_or(required_layout); + TORCH_CHECK(layout_ == required_layout, "sparse compressed layout must be ",required_layout, " but got ", layout_); + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); + SparseCsrTensor self = new_compressed_tensor(options); + get_sparse_csr_impl(self)->set_member_tensors(compressed_indices, plain_indices, values, size); return self; } +#define SPARSE_COMPRESSED_TENSOR_UNSAFE(KIND, REQUIRED_LAYOUT) \ + Tensor _sparse_##KIND##_tensor_unsafe(const Tensor& compressed_indices, \ + const Tensor& plain_indices, \ + const Tensor& values, \ + IntArrayRef size, \ + c10::optional dtype, \ + c10::optional layout, \ + c10::optional device, \ + c10::optional pin_memory) { \ + return _sparse_compressed_tensor_unsafe_template(compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory); \ + } + +SPARSE_COMPRESSED_TENSOR_UNSAFE(csr, kSparseCsr); +SPARSE_COMPRESSED_TENSOR_UNSAFE(csc, kSparseCsc); +SPARSE_COMPRESSED_TENSOR_UNSAFE(bsr, kSparseBsr); +SPARSE_COMPRESSED_TENSOR_UNSAFE(bsc, kSparseBsc); + +DimVector _estimate_sparse_compressed_tensor_size( + const Tensor& compressed_indices, + const Tensor& plain_indices, + const Tensor& values, + Layout layout) { + DimVector size = DimVector(IntArrayRef(plain_indices.sizes().data(), plain_indices.dim() - 1)); + int64_t compressed_dim = (plain_indices.size(-1) > 0 ? compressed_indices.size(-1) - 1 : 0); + int64_t plain_dim = AT_DISPATCH_INTEGRAL_TYPES(plain_indices.scalar_type(), "estimate_sparse_compressed_tensor_size", + [&]() -> int64_t { + if (plain_indices.numel() > 0) { + return plain_indices.max().item() + 1; + } else { + return 0; + } + }); + AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "estimate_sparse_compressed_tensor_size", + [&]{ + size.push_back(compressed_dim); + size.push_back(plain_dim); + }, + [&]{ + size.push_back(plain_dim); + size.push_back(compressed_dim); + }); + return size; +} + // TODO: This constructor should probably use an ATen abstract method in order // to make autograd dispatch available for the CSR constructor. See the relevant // note in native_functions.yaml. 
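/*
  Editor's note, an illustrative sketch that is not part of this patch:
  _estimate_sparse_compressed_tensor_size above infers the 2D shape when the
  caller omits `size`; for CSR the row count comes from the length of the
  compressed indices and the column count from the largest plain index seen.
  The helper name `estimate_csr_size` and its std::pair return type are
  assumptions made only for this sketch.

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    std::pair<int64_t, int64_t> estimate_csr_size(
        const std::vector<int64_t>& crow_indices,
        const std::vector<int64_t>& col_indices) {
      // With no specified elements both dimensions collapse to 0, mirroring
      // the (plain_indices.size(-1) > 0 ? ... : 0) guard above.
      int64_t nrows = col_indices.empty()
          ? 0
          : static_cast<int64_t>(crow_indices.size()) - 1;
      int64_t ncols = col_indices.empty()
          ? 0
          : *std::max_element(col_indices.begin(), col_indices.end()) + 1;
      return {nrows, ncols};
    }

  For the column-compressed layouts (CSC/BSC) the two results swap places,
  which is what the AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS branch above
  encodes.
*/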
-Tensor sparse_csr_tensor( - const Tensor& crow_indices, - const Tensor& col_indices, +Tensor sparse_compressed_tensor( + const Tensor& compressed_indices, + const Tensor& plain_indices, const Tensor& values, IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { + + if (!layout) { + AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor layout but got none"); + } + Layout layout_ = layout.value(); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout_, "sparse_compressed_tensor", [&]{}); + // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); - at::native::_validate_sparse_csr_tensor_args(crow_indices, col_indices, values, size); + _validate_sparse_compressed_tensor_args_worker(compressed_indices, plain_indices, values, size, layout_); - return at::native::_sparse_csr_tensor_unsafe( - crow_indices, - col_indices, + return at::native::_sparse_compressed_tensor_unsafe( + compressed_indices, + plain_indices, values, size, optTypeMetaToScalarType(options.dtype_opt()), @@ -203,29 +392,31 @@ Tensor sparse_csr_tensor( options.pinned_memory_opt()); } -Tensor sparse_csr_tensor( - const Tensor& crow_indices, - const Tensor& col_indices, +Tensor sparse_compressed_tensor( + const Tensor& compressed_indices, + const Tensor& plain_indices, const Tensor& values, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { - // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); - std::array size = {0, 0}; - if (col_indices.numel() > 0) { - AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "csr_construct_check", [&] { - size[0] = crow_indices.numel() - 1; - size[1] = col_indices.max().item() + 1; - }); + + if (!layout) { + AT_ERROR("sparse_compressed_tensor expected sparse compressed tensor layout but got none"); } + Layout layout_ = layout.value(); + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout_, "sparse_compressed_tensor", [&]{}); - at::native::_validate_sparse_csr_tensor_args(crow_indices, col_indices, values, size); + DimVector size = _estimate_sparse_compressed_tensor_size(compressed_indices, plain_indices, values, layout_); + + // See [Note: hacky wrapper removal for TensorOptions] + TensorOptions options = TensorOptions().dtype(dtype).layout(layout_).device(device).pinned_memory(pin_memory); - return at::native::_sparse_csr_tensor_unsafe( - crow_indices, - col_indices, + _validate_sparse_compressed_tensor_args_worker(compressed_indices, plain_indices, values, size, layout_); + + return at::native::_sparse_compressed_tensor_unsafe( + compressed_indices, + plain_indices, values, size, optTypeMetaToScalarType(options.dtype_opt()), @@ -234,7 +425,41 @@ Tensor sparse_csr_tensor( options.pinned_memory_opt()); } -Tensor empty_sparse_csr( +#define SPARSE_COMPRESSED_TENSOR(KIND, REQUIRED_LAYOUT) \ + Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ + const Tensor& plain_indices, \ + const Tensor& values, \ + c10::optional dtype, \ + c10::optional layout, \ + c10::optional device, \ + c10::optional pin_memory) { \ + if (layout) { \ + TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", 
layout.value()); \ + } \ + c10::optional layout_(REQUIRED_LAYOUT); \ + return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, dtype, layout_, device, pin_memory); \ + } \ + Tensor sparse_##KIND##_tensor(const Tensor& compressed_indices, \ + const Tensor& plain_indices, \ + const Tensor& values, \ + IntArrayRef size, \ + c10::optional dtype, \ + c10::optional layout, \ + c10::optional device, \ + c10::optional pin_memory) { \ + if (layout) { \ + TORCH_CHECK(layout.value() == REQUIRED_LAYOUT, "sparse " # KIND " layout must be ", REQUIRED_LAYOUT, " but got ", layout.value()); \ + } \ + c10::optional layout_(REQUIRED_LAYOUT); \ + return at::native::sparse_compressed_tensor(compressed_indices, plain_indices, values, size, dtype, layout_, device, pin_memory); \ + } + +SPARSE_COMPRESSED_TENSOR(csr, kSparseCsr) +SPARSE_COMPRESSED_TENSOR(csc, kSparseCsc) +SPARSE_COMPRESSED_TENSOR(bsr, kSparseBsr) +SPARSE_COMPRESSED_TENSOR(bsc, kSparseBsc) + +Tensor empty_sparse_compressed( IntArrayRef size, c10::optional dtype, c10::optional layout, @@ -242,27 +467,34 @@ Tensor empty_sparse_csr( c10::optional pin_memory, c10::optional optional_memory_format) { check_size_nonnegative(size); + TORCH_CHECK(size.size() >= 2, "torch.empty: Only batched sparse compressed (non-block) tensors are supported, but got size ", size); - TORCH_CHECK(size.size() == 2, "torch.empty: Only 2D sparse CSR tensors are supported."); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout == Layout::SparseCsr); + // Strided is the default layout for torch.empty. + Layout layout_ = layout.value_or(Layout::Strided); + + // torch.empty cannot be used to create blocked tensors because its + // API lacks a method to specify the block size. + AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(layout_, "empty_sparse_compressed", [&]{}); - auto rows = size[0]; int64_t nnz = 0; + auto compressed_indices_size = DimVector(size.slice(0, size.size() - 2)); + auto plain_indices_and_values_size = DimVector(size.slice(0, size.size() - 2)); + compressed_indices_size.push_back(size[compressedDimension(layout_, size)] + 1); + plain_indices_and_values_size.push_back(nnz); TensorOptions options = TensorOptions().dtype(ScalarType::Long).layout(Layout::Strided).device(device).pinned_memory(pin_memory); - auto crow_indices = at::empty({rows + 1}, options); - auto col_indices = at::empty({nnz}, options); - auto values = at::empty({nnz}, options.dtype(dtype)); - - return at::native::_sparse_csr_tensor_unsafe( - crow_indices, - col_indices, - values, - size, - dtype, - layout, - device, - pin_memory); + auto compressed_indices = at::empty(compressed_indices_size, options); + auto plain_indices = at::empty(plain_indices_and_values_size, options); + auto values = at::empty(plain_indices_and_values_size, options.dtype(dtype)); + + return at::native::_sparse_compressed_tensor_unsafe(compressed_indices, + plain_indices, + values, + size, + dtype, + layout, + device, + pin_memory); } const Tensor& resize_sparse_csr_( @@ -270,32 +502,63 @@ const Tensor& resize_sparse_csr_( IntArrayRef size, c10::optional optional_memory_format) { check_size_nonnegative(size); - TORCH_CHECK(size.size() == 2, "torch.resize_: Only 2D sparse CSR tensors are supported."); + TORCH_CHECK(size.size() >= 2, "torch.resize_: Only batched sparse CSR matrices are supported, but got size ", size); TORCH_CHECK( - self.size(1) <= size[1], + self.size(-1) <= size[size.size() - 1], "torch.resize_: Resizing columns of sparse CSR tensors to a smaller value is not supported. 
", "The original number of columns is ", - self.size(1), - " while the requested new number of columns is ", size[1], "."); + self.size(-1), + " while the requested new number of columns is ", size[size.size() - 1], "."); get_sparse_csr_impl(self)->resize_(self._nnz(), size); return self; } -Tensor& copy_sparse_csr_(Tensor& self, const Tensor& src, bool non_blocking) { +Tensor& copy_sparse_compressed_(Tensor& self, const Tensor& src, bool non_blocking) { + AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_", [&]{}); TORCH_CHECK( - self.sizes() == src.sizes(), - "copy_sparse_csr_: only same size tensors are supported."); + self.layout() == src.layout(), + "torch.copy_: copy of sparse compressed tensors having different layouts is not supported.", + " self layout is ", self.layout(), " and src layout is ", src.layout()); TORCH_CHECK( - self.is_sparse_csr() && src.is_sparse_csr(), - "copy_sparse_csr_: copy between different layouts is not supported. Found self type = ", - self.toString(), - " and src type = ", - src.toString()); - TORCH_CHECK( - self._nnz() == src._nnz(), - "copy_sparse_csr_: only tensors with the same number of specified elements are supported."); - self.crow_indices().copy_(src.crow_indices(), non_blocking); - self.col_indices().copy_(src.col_indices(), non_blocking); + self._nnz() == src._nnz(), // actually, values copy allows different shapes as long as operands are broadcastable + "torch.copy_: only sparse compressed tensors with the same number of specified elements are supported."); + auto self_compressed_dim = compressedDimension(self.layout(), self.sizes()); + auto src_compressed_dim = compressedDimension(src.layout(), src.sizes()); + auto self_compressed_dims = self.size(self_compressed_dim); + auto src_compressed_dims = src.size(compressedDimension(src.layout(), src.sizes())); + if (self_compressed_dim == src_compressed_dim) { + TORCH_CHECK(self_compressed_dims == src_compressed_dims, + "torch.copy_: expected shapes of self and src to match along dimension ", + self_compressed_dim, " for ", + self.layout(), " layout but the corresponding dimensions of self and src are ", + self_compressed_dims, " and ", src_compressed_dims, ", respecitvely."); + } else { + TORCH_CHECK(self_compressed_dims == src_compressed_dims, + "torch.copy_: expected shapes of self and src to match along dimensions ", + self_compressed_dim, " and ", src_compressed_dim, ", respectively, for ", + self.layout(), " layout but the corresponding dimensions of self and src are ", + self_compressed_dims, " and ", src_compressed_dims, ", respecitvely."); + } + AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_", + [&]{}, + [&]{ + auto self_values = self.values(); + auto src_values = src.values(); + auto self_block_size = DimVector(self_values.sizes().slice(self_values.dim()-2, 2)); + auto src_block_size = DimVector(src_values.sizes().slice(src_values.dim()-2, 2)); + TORCH_CHECK(self_block_size == src_block_size, + "torch.copy_: copy of sparse compressed tensors having different block sizes is not supported.", + " self and src block sizes are ", self_block_size, " and ", src_block_size, ", respectivly."); + }); + AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_", + [&]{ + self.crow_indices().copy_(src.crow_indices(), non_blocking); + self.col_indices().copy_(src.col_indices(), non_blocking); + }, + [&]{ + self.ccol_indices().copy_(src.ccol_indices(), non_blocking); + self.row_indices().copy_(src.row_indices(), 
non_blocking); + }); self.values().copy_(src.values(), non_blocking); return self; } @@ -310,11 +573,27 @@ Tensor values_sparse_csr(const Tensor& self) { } Tensor crow_indices_sparse_csr(const Tensor& self) { - return get_sparse_csr_impl(self)->crow_indices().alias(); + return AT_DISPATCH_SPARSE_ROW_COMPRESSED_LAYOUTS(self.layout(), + "crow_indices", + [&]{ return get_sparse_csr_impl(self)->compressed_indices().alias(); }); } Tensor col_indices_sparse_csr(const Tensor& self) { - return get_sparse_csr_impl(self)->col_indices().alias(); + return AT_DISPATCH_SPARSE_ROW_COMPRESSED_LAYOUTS(self.layout(), + "col_indices", + [&]{ return get_sparse_csr_impl(self)->plain_indices().alias(); }); +} + +Tensor ccol_indices_sparse_csr(const Tensor& self) { + return AT_DISPATCH_SPARSE_COL_COMPRESSED_LAYOUTS(self.layout(), + "ccol_indices", + [&]{ return get_sparse_csr_impl(self)->compressed_indices().alias(); }); +} + +Tensor row_indices_sparse_csr(const Tensor& self) { + return AT_DISPATCH_SPARSE_COL_COMPRESSED_LAYOUTS(self.layout(), + "row_indices", + [&]{ return get_sparse_csr_impl(self)->plain_indices().alias(); }); } bool _is_same_size_as_sparse_csr( @@ -339,23 +618,31 @@ const SparseCsrTensor& resize_as_sparse_csr_( return self; } -SparseCsrTensor clone_sparse_csr( - const SparseCsrTensor& self, - c10::optional optional_memory_format) { +SparseCsrTensor clone_sparse_compressed( + const SparseCsrTensor& self, + c10::optional optional_memory_format) { TORCH_CHECK( !optional_memory_format.has_value(), "unsupported memory format option ", optional_memory_format.value()); TensorOptions options = self.options(); - return at::native::_sparse_csr_tensor_unsafe( - self.crow_indices().clone(), - self.col_indices().clone(), - self.values().clone(), - self.sizes(), - optTypeMetaToScalarType(options.dtype_opt()), - options.layout_opt(), - options.device_opt(), - options.pinned_memory_opt()); + auto compressed_indices = AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), + "clone_sparse_compressed", + [&]{ return self.crow_indices(); }, + [&]{ return self.ccol_indices(); }); + auto plain_indices = AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), + "clone_sparse_compressed", + [&]{ return self.col_indices(); }, + [&]{ return self.row_indices(); }); + return at::native::_sparse_compressed_tensor_unsafe( + compressed_indices.clone(), + plain_indices.clone(), + self.values().clone(), + self.sizes(), + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); } Tensor empty_like_sparse_csr( @@ -377,9 +664,9 @@ Tensor empty_like_sparse_csr( self.col_indices().clone(), at::empty(self.values().sizes(), options.layout(kStrided)), self.sizes(), - dtype, + optTypeMetaToScalarType(options.dtype()), self.layout(), - device); + options.device()); return result; } else if (options.layout() == kStrided) { return at::native::empty_like(self, dtype, layout, device, pin_memory, optional_memory_format); @@ -388,5 +675,43 @@ Tensor empty_like_sparse_csr( } } +Tensor select_sparse_csr(const Tensor& self, int64_t dim, int64_t index) { + TORCH_INTERNAL_ASSERT(self.is_sparse_csr()); + TORCH_CHECK_INDEX(self.dim() != 0, "select() cannot be applied to a 0-dim tensor."); + dim = maybe_wrap_dim(dim, self.dim()); + auto size = self.size(dim); + if (index < -size || index >= size) { + TORCH_CHECK_INDEX(false, "select(): index ", index, " out of range for tensor of size ", + self.sizes(), " at dimension ", dim); + } + if (index < 0) { + index += size; + } + + 
TORCH_INTERNAL_ASSERT(dim >= 0 && dim < self.dim()); + + auto new_sizes = DimVector(self.sizes()); + new_sizes.erase(new_sizes.begin() + dim); + auto options = self.options(); + + // Selecting batch dimension + if (dim < self.dim() - 2) { + return at::native::_sparse_csr_tensor_unsafe( + self.crow_indices().select(dim, index), + self.col_indices().select(dim, index), + self.values().select(dim, index), + new_sizes, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); + } else { + TORCH_CHECK(self.dim() == 2, "select(): selecting rows or columns is not implemented for batched sparse CSR tensors.") + // Converting to COO and calling select is slighly slower than operating on the CSR indices directly + // for constructing a COO vector, however current version is more readable and easier to understand. + return self.to_sparse().select(dim, index); + } +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp index 8d17356ea5a1..5d520142cf0b 100644 --- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp @@ -1,16 +1,17 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include #include #include #include #include +#include #include #include #include #include #include #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -21,7 +22,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -50,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -63,9 +67,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -85,12 +91,16 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include +#include #include #endif @@ -99,19 +109,22 @@ namespace at { namespace meta { -TORCH_META_FUNC(_convert_indices_from_coo_to_csr) ( - const Tensor& self, const int64_t size, const bool out_int32 -) { +TORCH_META_FUNC(_convert_indices_from_coo_to_csr) +(const Tensor& self, const int64_t size, const bool out_int32) { TORCH_CHECK(self.dim() <= 1, "Input is supposed to be a vector"); ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long; - c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type); + c10::TensorOptions options = + TensorOptions().device(self.options().device()).dtype(scalar_type); set_output(size + 1, options); } -TORCH_META_FUNC(_convert_indices_from_csr_to_coo) ( - const Tensor& crow_indices, const Tensor& col_indices, const bool out_int32, const bool transpose -) { - TORCH_CHECK(crow_indices.dim() == 1, "crow_indices is supposed to be a vector"); +TORCH_META_FUNC(_convert_indices_from_csr_to_coo) +(const Tensor& crow_indices, + const Tensor& col_indices, + const bool out_int32, + const bool transpose) { + TORCH_CHECK( + crow_indices.dim() == 1, "crow_indices is supposed to be a vector"); TORCH_CHECK(col_indices.dim() == 1, "col_indices is supposed to be a vector"); ScalarType scalar_type = out_int32 ? 
ScalarType::Int : ScalarType::Long; c10::TensorOptions options = crow_indices.options().dtype(scalar_type); @@ -124,33 +137,6 @@ namespace { constexpr int64_t GRAIN_SIZE = at::internal::GRAIN_SIZE; -template -void convert_indices_from_coo_to_csr_cpu(const Tensor& result, const Tensor& input, const int64_t size) { - int64_t numel = input.numel(); - const input_t* data_in = input.data_ptr(); - output_t* data_out = result.data_ptr(); - - if (numel == 0) { - result.zero_(); - return; - } - - for (int64_t i = 0; i <= data_in[0]; i++) - data_out[i] = static_cast(0); - - at::parallel_for(0, numel - 1, GRAIN_SIZE, [&](int64_t start, int64_t end) { - input_t curr_value = data_in[start], next_value; - for (const auto i : c10::irange(start, end)) { - next_value = data_in[i + 1]; - for (; curr_value < next_value; curr_value++) - data_out[curr_value + 1] = static_cast(i + 1); - } - }); - - for (int64_t i = data_in[numel - 1] + 1; i < size + 1; i++) - data_out[i] = static_cast(numel); -} - template Tensor& unary_op_out(F op_out, const Tensor& self, Tensor& result) { TORCH_INTERNAL_ASSERT(self.is_sparse_csr()); @@ -162,9 +148,9 @@ Tensor& unary_op_out(F op_out, const Tensor& self, Tensor& result) { if (result.numel() == 0) { at::native::resize_as_sparse_csr_(result, self); } - // copy_sparse_csr_ internally checks the sizes of result and self tensors + // copy_sparse_compressed_ internally checks the sizes of result and self tensors // Hence no external size check required - at::native::copy_sparse_csr_(result, self); + at::native::copy_sparse_compressed_(result, self); } auto self_values = self.values(); @@ -174,7 +160,7 @@ Tensor& unary_op_out(F op_out, const Tensor& self, Tensor& result) { return result; } -template +template Tensor& unary_op_inplace(Tensor& self, const F& op_inplace, Args&&... args) { TORCH_INTERNAL_ASSERT(self.is_sparse_csr()); @@ -184,7 +170,11 @@ Tensor& unary_op_inplace(Tensor& self, const F& op_inplace, Args&&... args) { } template -void convert_indices_from_csr_to_coo_cpu(const Tensor& indices, const Tensor& crow_indices, const Tensor& col_indices, const bool transpose=false) { +void convert_indices_from_csr_to_coo_cpu( + const Tensor& indices, + const Tensor& crow_indices, + const Tensor& col_indices, + const bool transpose = false) { int64_t nrows = crow_indices.numel() - 1; if (nrows == 0) { indices.zero_(); @@ -193,16 +183,18 @@ void convert_indices_from_csr_to_coo_cpu(const Tensor& indices, const Tensor& cr auto crow_indices_ = crow_indices.expect_contiguous(); const input_t* crow_indices_data_in = crow_indices_->data_ptr(); TORCH_INTERNAL_ASSERT(indices.is_contiguous()); - auto row0 = indices.select(0, transpose?1:0); - auto row1 = indices.select(0, transpose?0:1); + auto row0 = indices.select(0, transpose ? 1 : 0); + auto row1 = indices.select(0, transpose ? 
0 : 1); output_t* data_out = row0.data_ptr(); row1.copy_(*col_indices.expect_contiguous()); at::parallel_for(0, nrows, GRAIN_SIZE, [&](int64_t start, int64_t end) { for (const auto i : c10::irange(start, end)) { - std::fill(&data_out[crow_indices_data_in[i]], &data_out[crow_indices_data_in[i + 1]], static_cast(i)); + std::fill( + &data_out[crow_indices_data_in[i]], + &data_out[crow_indices_data_in[i + 1]], + static_cast(i)); } }); - } } // end anonymous namespace @@ -221,26 +213,27 @@ inline Tensor get_result_tensor_for_unary_op(F op, const Tensor& input) { // To handle type promotion for inputs to unary ops, // we first get the result from the underlined op, and use the result - // to create a sparse CSR tensor, which is used as the input to the out= variant + // to create a sparse CSR tensor, which is used as the input to the out= + // variant auto result_values = op(values); auto result = at::native::_sparse_csr_tensor_unsafe( - input.crow_indices().clone(), - input.col_indices().clone(), - result_values, - input.sizes(), - result_values.scalar_type(), - input.layout(), - result_values.device()); + input.crow_indices().clone(), + input.col_indices().clone(), + result_values, + input.sizes(), + result_values.scalar_type(), + input.layout(), + result_values.device()); return result; } -} +} // namespace static constexpr bool is_mkl_supported() { #ifdef _MSC_VER return false; -#elif __APPLE__ || __MACH__ +#elif __APPLE__ || __MACH__ return false; #else return true; @@ -248,41 +241,79 @@ static constexpr bool is_mkl_supported() { } // Only accept squares sparse matrices or dense input as a vector -// TODO: Check what happens with MKL, the output error reported with non square matrices tends to be high -// See: https://github.com/pytorch/pytorch/issues/58770 +// TODO: Check what happens with MKL, the output error reported with non square +// matrices tends to be high See: +// https://github.com/pytorch/pytorch/issues/58770 bool is_square_or_vec(int64_t dim_i, int64_t dim_j, int64_t dim_k) { - return (dim_i == dim_k && dim_k == dim_j) || (dim_i == dim_j && dim_k == 1); + return (dim_i == dim_k && dim_k == dim_j) || (dim_i == dim_j && dim_k == 1); } -Tensor& normal_sparse_csr_(Tensor& self, double mean, double std, c10::optional gen) { +Tensor& normal_sparse_csr_( + Tensor& self, + double mean, + double std, + c10::optional gen) { return unary_op_inplace(self, &Tensor::normal_, mean, std, gen); } +Tensor& fill_sparse_csr_(Tensor& self, const Scalar& value) { + return unary_op_inplace(self, &TensorBase::fill_, value); +} + +Tensor sparse_mask_sparse_csr( + const Tensor& self, + const Tensor& sparse_mask) { + TORCH_CHECK(sparse_mask.is_sparse_csr(), "sparse_mask_sparse_csr expects mask to be sparse csr"); + TORCH_CHECK(self.dim() == 2, "sparse_mask_sparse_csr expects self to be 2D"); + TORCH_CHECK(sparse_mask.dim() == 2, "sparse_mask_sparse_csr expects mask to be 2D"); + + // We are computing self.mul(at::ones_like(sparse_mask)) + // But mul(dense, sparse_csr) is not implemented yet + if (self.layout() == sparse_mask.layout()) { + // Both inputs are CSR + return self.mul(at::ones_like(sparse_mask)); + } else { + return self.sparse_mask(sparse_mask.to_sparse()).to_sparse_csr(); + } +} + +Tensor mul_scalar_sparse_csr(const Tensor& self, const Scalar& other) { + auto result_values = self.values().mul(other); + return at::native::_sparse_csr_tensor_unsafe( + self.crow_indices().clone(), + self.col_indices().clone(), + result_values, + self.sizes(), + result_values.scalar_type(), + self.layout(), + 
result_values.device()); +} + /* Implementation of Unary Ufuncs, those supported for Sparse CSR Layout * Only simple funcs, with 0->0 correspondence are currently supported. */ -#define CREATE_UNARY_UFUNC_OUT(op_name) \ - Tensor& op_name##_sparse_csr_out(const Tensor& self, Tensor& result) { \ - return unary_op_out(&at::op_name##_outf, self, result); \ +#define CREATE_UNARY_UFUNC_OUT(op_name) \ + Tensor& op_name##_sparse_csr_out(const Tensor& self, Tensor& result) { \ + return unary_op_out(&at::op_name##_outf, self, result); \ } -#define CREATE_UNARY_UFUNC_FUNCTIONAL(op_name) \ - Tensor op_name##_sparse_csr(const Tensor& self) { \ - return get_result_tensor_for_unary_op(&at::op_name, self); \ +#define CREATE_UNARY_UFUNC_FUNCTIONAL(op_name) \ + Tensor op_name##_sparse_csr(const Tensor& self) { \ + return get_result_tensor_for_unary_op(&at::op_name, self); \ } -#define CREATE_UNARY_UFUNC_INPLACE(op_name) \ - Tensor& op_name##_sparse_csr_(Tensor& self) { \ - return unary_op_inplace(self, &Tensor::op_name##_); \ +#define CREATE_UNARY_UFUNC_INPLACE(op_name) \ + Tensor& op_name##_sparse_csr_(Tensor& self) { \ + return unary_op_inplace(self, &Tensor::op_name##_); \ } -#define CREATE_UNARY_UFUNC(op_name) \ - CREATE_UNARY_UFUNC_OUT(op_name); \ - CREATE_UNARY_UFUNC_FUNCTIONAL(op_name); \ +#define CREATE_UNARY_UFUNC(op_name) \ + CREATE_UNARY_UFUNC_OUT(op_name); \ + CREATE_UNARY_UFUNC_FUNCTIONAL(op_name); \ CREATE_UNARY_UFUNC_INPLACE(op_name); -#define CREATE_UNARY_UFUNC_NO_INPLACE(op_name) \ - CREATE_UNARY_UFUNC_OUT(op_name); \ +#define CREATE_UNARY_UFUNC_NO_INPLACE(op_name) \ + CREATE_UNARY_UFUNC_OUT(op_name); \ CREATE_UNARY_UFUNC_FUNCTIONAL(op_name); // Exhaustive list of the unary ufuncs supported by sparse CSR @@ -309,6 +340,8 @@ CREATE_UNARY_UFUNC(tanh); CREATE_UNARY_UFUNC(trunc); CREATE_UNARY_UFUNC(conj_physical); +CREATE_UNARY_UFUNC_INPLACE(zero); + // With addition of `round.decimals` overload, using CREATE_UNARY_UFUNC leads // to unresolved overload. 
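/*
  Editor's note, an illustrative sketch that is not part of this patch: the
  CREATE_UNARY_UFUNC* macros above all rely on the same observation, namely
  that an elementwise op with a 0 -> 0 correspondence cannot turn an
  unspecified element into a specified one, so it only has to map `values`
  and can reuse the indices untouched. The struct name `CsrMatrix` and the
  helper `apply_unary` are assumptions made only for this sketch.

    #include <cmath>
    #include <cstdint>
    #include <vector>

    struct CsrMatrix {
      std::vector<int64_t> crow_indices;
      std::vector<int64_t> col_indices;
      std::vector<double> values;
    };

    template <class UnaryOp>
    CsrMatrix apply_unary(const CsrMatrix& a, UnaryOp op) {
      CsrMatrix out = a;    // sparsity pattern (indices) is reused as-is
      for (double& v : out.values) {
        v = op(v);          // only the specified elements are transformed
      }
      return out;
    }

    // e.g. apply_unary(a, [](double v) { return std::sqrt(v); });

  The in-place and out= variants generated above follow the same pattern,
  writing into self.values() or result.values() respectively.
*/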
Tensor& round_sparse_csr_out(const Tensor& self, Tensor& result) { @@ -336,8 +369,12 @@ CREATE_UNARY_UFUNC_FUNCTIONAL(isnan); CREATE_UNARY_UFUNC_FUNCTIONAL(isinf); template -void addmm_out_sparse_csr_native_cpu(const Tensor& sparse, const Tensor& dense, const Tensor& r, Scalar alpha, Scalar beta) { - +void addmm_out_sparse_csr_native_cpu( + const Tensor& sparse, + const Tensor& dense, + const Tensor& r, + Scalar alpha, + Scalar beta) { auto dim_i = sparse.size(0); auto dim_k = dense.size(1); @@ -347,41 +384,46 @@ void addmm_out_sparse_csr_native_cpu(const Tensor& sparse, const Tensor& dense, scalar_t cast_alpha = alpha.to(); r.mul_(beta); - AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "csr_mm_crow_indices", [&]() { - auto csr_accessor = csr.accessor(); - auto col_indices_accessor = col_indices.accessor(); - - auto values_accessor = values.accessor(); - scalar_t* dense_ptr = dense.data_ptr(); - scalar_t* r_ptr = r.data_ptr(); - - int64_t dense_stride0 = dense.stride(0); - int64_t dense_stride1 = dense.stride(1); - int64_t r_stride0 = r.stride(0); - int64_t r_stride1 = r.stride(1); - - at::parallel_for( - 0, - dim_i, - internal::GRAIN_SIZE, - [&](int64_t irow_start, int64_t irow_end) { - for (index_t h = irow_start; h < irow_end; ++h) { - index_t i_start = csr_accessor[h]; - index_t i_end = csr_accessor[h+1]; - for (index_t i = i_start; i < i_end; i++) { - scalar_t val = values_accessor[i]; - index_t col = col_indices_accessor[i]; - at::native::cpublas::axpy(dim_k, - cast_alpha * val, - dense_ptr + col * dense_stride0, dense_stride1, - r_ptr + h * r_stride0, r_stride1); + AT_DISPATCH_INDEX_TYPES( + col_indices.scalar_type(), "csr_mm_crow_indices", [&]() { + auto csr_accessor = csr.accessor(); + auto col_indices_accessor = col_indices.accessor(); + + auto values_accessor = values.accessor(); + scalar_t* dense_ptr = dense.data_ptr(); + scalar_t* r_ptr = r.data_ptr(); + + int64_t dense_stride0 = dense.stride(0); + int64_t dense_stride1 = dense.stride(1); + int64_t r_stride0 = r.stride(0); + int64_t r_stride1 = r.stride(1); + + at::parallel_for( + 0, + dim_i, + internal::GRAIN_SIZE, + [&](int64_t irow_start, int64_t irow_end) { + for (index_t h = irow_start; h < irow_end; ++h) { + index_t i_start = csr_accessor[h]; + index_t i_end = csr_accessor[h + 1]; + for (index_t i = i_start; i < i_end; i++) { + scalar_t val = values_accessor[i]; + index_t col = col_indices_accessor[i]; + at::native::cpublas::axpy( + dim_k, + cast_alpha * val, + dense_ptr + col * dense_stride0, + dense_stride1, + r_ptr + h * r_stride0, + r_stride1); + } } - } - }); - }); + }); + }); } // Functions for matrix multiplication. +// result = beta * self + alpha (mat1 @ mat2) Tensor& addmm_out_sparse_csr_cpu( const Tensor& self, const Tensor& mat1, @@ -389,62 +431,61 @@ Tensor& addmm_out_sparse_csr_cpu( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT(mat1.is_sparse_csr()); - // TODO: remove this, there are no codegenerated checks for devices yet - TORCH_CHECK( - !self.is_cuda(), - "Expected all tensors to be on the same device. addmm expected 't' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK( - !result.is_cuda(), - "Expected all tensors to be on the same device. addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK( - !mat1.is_cuda(), - "Expected all tensors to be on the same device. addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - TORCH_CHECK( - !mat2.is_cuda(), - "Expected all tensors to be on the same device. 
addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + sparse::impl::_check_is_cpu(self, "self"); + sparse::impl::_check_is_cpu(mat1, "mat1"); + sparse::impl::_check_is_cpu(mat2, "mat2"); + sparse::impl::_check_is_cpu(result, "result"); - // All the checks are from addmm_out_cuda_impl (ATen/native/cuda/Blas.cpp) and TORCH_META_FUNC(addmm) (ATen/native/LinearAlgebra.cpp) + // All the checks are from addmm_out_cuda_impl (ATen/native/cuda/Blas.cpp) and + // TORCH_META_FUNC(addmm) (ATen/native/LinearAlgebra.cpp) // TODO: remove code duplication and unify code - TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + sparse::impl::_check_dim(mat1, 2, "mat1"); + sparse::impl::_check_dim(mat2, 2, "mat2"); + TORCH_CHECK( - mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", - mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); - - IntArrayRef mat1_sizes = mat1.sizes(); - IntArrayRef mat2_sizes = mat2.sizes(); - IntArrayRef self__sizes; - c10::MaybeOwned self_; - if (&result != &self && self.layout() == kStrided) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - self__sizes = self_->sizes(); + mat1.size(1) == mat2.size(0), "mat1 and mat2 shapes cannot be multiplied (", + mat1.size(0), "x", mat1.size(1), " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + + c10::MaybeOwned self_; + // Don't expand self if this is an in-place operation + if (&result == &self) { + self_ = c10::MaybeOwned::borrowed(self); } else { - self_ = c10::MaybeOwned::borrowed(self); - self__sizes = self_->sizes(); + self_ = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm"); } - TORCH_CHECK(((self_->dim() == 2) && (self_->sizes()[0] == mat1.sizes()[0]) && (self_->sizes()[1] == mat2.sizes()[1])), - "The input tensor must be a matrix with size ", mat1.sizes()[0], "x", mat2.sizes()[1], ", but got a ", self_->dim(), - "-D tensor with size ", self__sizes[0], "x", self__sizes[1]); + + TORCH_CHECK(((self_->dim() == 2) && + (self_->size(0) == mat1.size(0)) && + (self_->size(1) == mat2.size(1))), + "The input tensor must be a matrix with size ", + mat1.size(0), + "x", + mat2.size(1), + ", but got a ", + self_->dim(), + "-D tensor with size ", + self_->size(0), + "x", + self_->size(1)); if (&result != &self) { if (result.layout() == kStrided) { - at::native::resize_output(result, self__sizes); + at::native::resize_output(result, self_->sizes()); } else { - at::native::resize_as_sparse_csr_(result, *self_); + result.resize_as_sparse_(*self_); } result.copy_(*self_); } - IntArrayRef result_sizes = result.sizes(); - if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + if (result.numel() == 0) { return result; } - if (mat1._nnz() == 0 && mat2.layout() == kStrided) { - // According to docs, when beta==0 values in self should be ignored. nans and infs should not propagate + if (sparse::impl::_is_sparse_and_zero(mat1) || sparse::impl::_is_sparse_and_zero(mat2)) { + // According to docs, when beta==0 values in self should be ignored. + // nans and infs should not propagate if (beta.toComplexDouble() == 0.) { result.zero_(); } else { @@ -453,26 +494,19 @@ Tensor& addmm_out_sparse_csr_cpu( return result; } - if (mat2.is_sparse_csr() && (mat1._nnz() == 0 || mat2._nnz() == 0)) { - if (beta.toComplexDouble() == 0.) 
{ - result.values().zero_(); - } else { - result.values().mul_(beta); - } - return result; - } - #if !AT_USE_MKL_SPARSE() - if (mat2.is_sparse_csr() && result.is_sparse_csr()) { - TORCH_CHECK( - false, - "Calling addmm on sparse CPU tensors requires Linux platform. ", - "Please use PyTorch built with MKL on Linux."); - } - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.layout() == kStrided); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(result.scalar_type(), "addmm_sparse_dense", [&] { - addmm_out_sparse_csr_native_cpu(mat1, mat2, result, alpha, beta); - }); + TORCH_CHECK( + (mat1.is_sparse_csr() || + (mat2.is_sparse_csr() && result.is_sparse_csr())), + false, + "Calling addmm on sparse CPU tensors requires Linux platform. ", + "Please use PyTorch built with MKL on Linux."); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.layout() == kStrided); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + result.scalar_type(), "addmm_sparse_dense", [&] { + addmm_out_sparse_csr_native_cpu( + mat1, mat2, result, alpha, beta); + }); #else sparse::impl::mkl::addmm_out_sparse_csr(mat1, mat2, beta, alpha, result); #endif @@ -504,17 +538,36 @@ Tensor& _sparse_csr_mm_out( return at::addmm_out(result, zero, mat1, mat2, 0.0, 1.0); } -Tensor _sparse_csr_mm( - const Tensor& mat1, - const Tensor& mat2) { - Tensor zero; +Tensor _sparse_csr_mm(const Tensor& mat1, const Tensor& mat2) { if (mat1.is_sparse_csr() && mat2.is_sparse_csr()) { + // Return sparse // TODO: replace with at::zeros when it's implemented for sparse csr - zero = at::empty({mat1.size(0), mat2.size(1)}, mat2.options()); - } else { - zero = at::zeros({mat1.size(0), mat2.size(1)}, mat2.options()); + return at::addmm( + at::empty({mat1.size(0), mat2.size(1)}, mat2.options()), + mat1, + mat2, + 0.0, + 1.0); + } + if (mat1.is_sparse_csr() && mat2.layout() == c10::kStrided) { + // Return dense + return at::addmm( + at::zeros({mat1.size(0), mat2.size(1)}, mat2.options()), + mat1, + mat2, + 0.0, + 1.0); + } + if (mat1.layout() == c10::kStrided && mat2.is_sparse_csr()) { + // Return dense + return at::addmm( + at::zeros({mat1.size(0), mat2.size(1)}, mat1.options()), + mat1, + mat2, + 0.0, + 1.0); } - return at::addmm(zero, mat1, mat2, 0.0, 1.0); + TORCH_INTERNAL_ASSERT(false, "Shouldn't get here. Please open an issue."); } Tensor _sparse_csr_addmm( @@ -530,14 +583,20 @@ Tensor _sparse_csr_addmm( } // Functions for element-wise addition. -Tensor add_sparse_csr(const Tensor& self, const Tensor& other, const Scalar& alpha) { +Tensor add_sparse_csr( + const Tensor& self, + const Tensor& other, + const Scalar& alpha) { auto commonDtype = at::result_type(self, other); alpha_check(commonDtype, alpha); Tensor result = at::empty({0, 0}, self.options().dtype(commonDtype)); return at::add_out(result, self, other, alpha); // redispatch! } -Tensor& add_sparse_csr_(Tensor& self, const Tensor& other, const Scalar& alpha) { +Tensor& add_sparse_csr_( + Tensor& self, + const Tensor& other, + const Scalar& alpha) { return at::add_out(self, self, other, alpha); // redispatch! 
} @@ -581,13 +640,10 @@ void add_out_dense_sparse_csr_cpu( " in add operation"); auto src_values = src.values(); - auto src_crow_indices = src.crow_indices(); - auto src_col_indices = src.col_indices(); resize_output(out, dense.sizes()); Tensor resultBuffer = out; - Tensor valuesBuffer = src_values.to(commonDtype); if (out.scalar_type() != commonDtype) { resultBuffer = dense.to(commonDtype); @@ -595,36 +651,54 @@ void add_out_dense_sparse_csr_cpu( resultBuffer.copy_(dense); } + if (src._nnz() == 0) { + return; + } + + auto valuesBuffer = src_values.to(commonDtype).view({-1, src_values.size(-1)}); + resultBuffer = resultBuffer.view({-1, out.size(-2), out.size(-1)}); + auto src_crow_indices = src.crow_indices().view({-1, src.crow_indices().size(-1)}); + auto src_col_indices = src.col_indices().view({-1, src.col_indices().size(-1)}); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( - kHalf, kBool, kBFloat16, + kHalf, + kBool, + kBFloat16, commonDtype, "add_out_op2_sparse_csr", - [&valuesBuffer, &resultBuffer, &alpha, &src_crow_indices, &src_col_indices]() { + [&valuesBuffer, + &resultBuffer, + &alpha, + &src_crow_indices, + &src_col_indices]() { AT_DISPATCH_INDEX_TYPES( src_crow_indices.scalar_type(), "csr_add_out_crow_indices", - [&valuesBuffer, &resultBuffer, &alpha, &src_crow_indices, &src_col_indices]() { - auto values_accessor = valuesBuffer.accessor(); + [&valuesBuffer, + &resultBuffer, + &alpha, + &src_crow_indices, + &src_col_indices]() { + auto batch_count = resultBuffer.dim() > 2 ? resultBuffer.size(-3) : 1; + auto values_accessor = valuesBuffer.accessor(); scalar_t* out_ptr = resultBuffer.data_ptr(); scalar_t cast_value = alpha.to(); auto crow_indices_accessor = - src_crow_indices.accessor(); + src_crow_indices.accessor(); auto col_indices_accessor = - src_col_indices.accessor(); - auto out_strides0 = resultBuffer.strides()[0]; - auto out_strides1 = resultBuffer.strides()[1]; - - for (index_t irow = 0; irow < src_crow_indices.size(0) - 1; - ++irow) { - index_t start_index = crow_indices_accessor[irow]; - index_t end_index = crow_indices_accessor[irow + 1]; - - for (index_t i = start_index; i < end_index; ++i) { - auto icol = col_indices_accessor[i]; - auto index = resultBuffer.storage_offset() + irow * out_strides0 + - icol * out_strides1; - out_ptr[index] += cast_value * values_accessor[i]; + src_col_indices.accessor(); + auto out_strides = resultBuffer.strides(); + + for (const auto batch_idx : c10::irange(batch_count)) { + for (const auto irow : c10::irange(src_crow_indices.size(-1) - 1)) { + index_t start_index = crow_indices_accessor[batch_idx][irow]; + index_t end_index = crow_indices_accessor[batch_idx][irow + 1]; + for (const auto i : c10::irange(start_index, end_index)) { + auto icol = col_indices_accessor[batch_idx][i]; + auto index = batch_idx * out_strides[0] + irow * out_strides[1] + icol * out_strides[2]; + out_ptr[index] += cast_value * values_accessor[batch_idx][i]; + } } } }); @@ -654,32 +728,358 @@ Tensor& add_out_sparse_csr_cpu( return out; } -TORCH_IMPL_FUNC(_convert_indices_from_coo_to_csr_structured_cpu) ( - const Tensor& input, const int64_t size, const bool out_int32, const Tensor& result -) { - if (out_int32) { - AT_DISPATCH_INTEGRAL_TYPES(input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { - convert_indices_from_coo_to_csr_cpu(result, input, size); - }); +/* + Reductions on sparse CSR tensors using masked semantics. + + - A CSR tensor is a 2D tensor that is specified by a 3-tuple + (crow_indices, col_indices, values). 
+ + - To support a reduction operator on a CSR tensor, define: + +template +struct Reduction...Op { + inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { + return a ... b; + } + inline scalar_t identity() const { return ...; } +}; + +Tensor _sparse_csr_..._cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ... + result = reduce_sparse_csr_cpu_template(input_, dims_to_sum, keepdim, Reduction...Op()); + ... + return result; +} + + and add the following + + - func: _sparse_csr_op.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCUDA: _sparse_csr_..._cpu + + to native_functions.yaml + + Use ReductionAddOp and _sparse_csr_sum implementation as an example. + + - Since a CSR tensor dimensionality is always 2, only reductions + with keepdim=True can be supported. + +*/ + +namespace { + +template +Tensor reduce_sparse_csr_dim0_cpu_template(const Tensor& sparse, ReductionOp rop) { + /* + Consider the following sparse tensor: + + 1 * * * * + * * * 2 * + * * 3 * * + * * * * * + 4 * 5 * * + + that has CSR representation + + crow_indices = [0, 1, 2, 3, 3, 5] + col_indices = [0, 3, 2, 0, 2] + values = [1, 2, 3, 4, 5] + + Reduction with dim=0 results: + + rop(1,4) * rop(3,5) 2 * + + that has CSR representation + + new_crow_indices = [0, 3] + new_col_indices = [0, 2, 3] + new_values = [rop(1, 4], rop(3, 5), 2] + + In general, the CSR representation data can be computed as follows: + + new_col_indices, col_map = col_indices.unique(sorted=True, return_inverse=True) + nnz = new_col_indices.numel() + new_crow_indices = [0, nnz] + new_values.resize(nnz); new_values.fill_(identity) + for i in range(col_indices.numel()): + new_values[col_map[i]] = rop(new_values[col_map[i], values[i]) + */ + + Tensor col_indices = sparse.col_indices(); + Tensor values = sparse.values(); + auto numel = values.numel(); + Tensor new_col_indices; + Tensor columns_map; + + /* + Calling at::_unique constitutes the main bottleneck of this + function. However, it is still about 5x faster than using the + invariant: + csr.sum(dim=0) == csr.transpose(0, 1).sum(dim=1) + */ + std::tie(new_col_indices, columns_map) = at::_unique(col_indices, true, true); + auto nnz = new_col_indices.numel(); + + Tensor new_crow_indices = at::empty({2}, col_indices.options()); + new_crow_indices[0] = 0; + new_crow_indices[1] = nnz; + + Tensor new_values = at::empty({nnz}, values.options()); + new_values.fill_(rop.identity()); + + AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "reduce_sparse_csr_dim0_cpu_indices", + [&]() { + index_t* columns_map_ptr = columns_map.data_ptr(); + scalar_t* values_ptr = values.data_ptr(); + scalar_t* new_values_ptr = new_values.data_ptr(); + + // There is no point in parallelizing the following for-loop + // because about 99.3% of the computation time is spent in the + // at::_unique call above. 
+ for (int64_t i=0; i +Tensor reduce_sparse_csr_dim1_cpu_template(const Tensor& sparse, ReductionOp rop) { + /* + Consider the following sparse tensor: + + 1 * * * * + * * * 2 * + * * 3 * * + * * * * * + 4 * 5 * * + + that has CSR representation + + crow_indices = [0, 1, 2, 3, 3, 5] + col_indices = [0, 3, 2, 0, 2] + values = [1, 2, 3, 4, 5] + + Reduction with dim=1 results: + + 1 + 2 + 3 + * + rop(4, 5) + + that has CSR representation + + new_crow_indices = [0, 1, 2, 3, 3, 4] + new_col_indices = [0, 0, 0, 0] + new_values = [1, 2, 3, rop(4, 5)] + + In general, the result CSR data can be computed as follows: + + new_crow_indices = [0] + for i in range(1, nrows+1): + new_crow_indices[i] = new_crow_indices[i-1] + (crow_indices[i] == crow_indices[i-1]) + nnz = new_crow_indices[-1] + new_col_indices = zeros(nnz) + new_values.resize(nnz) + j = -1 + for i in range(1, nrows+1): + if crow_indices[i] == crow_indices[i-1]: + continue + j += 1 + new_values[j] = rop(values[crow_indices[i] : crow_indices[i-1]]) + */ + + Tensor crow_indices = sparse.crow_indices(); + auto ioptions = crow_indices.options(); + Tensor values = sparse.values(); + auto nrows = sparse.size(0); + + Tensor new_crow_indices = at::empty({crow_indices.numel()}, ioptions); + Tensor new_col_indices = at::empty({}, ioptions); + Tensor new_values = at::empty({}, values.options()); + Tensor row_map = at::empty({nrows}, ioptions); + + AT_DISPATCH_INDEX_TYPES(crow_indices.scalar_type(), "reduce_sparse_csr_dim1_cpu_indices", + [&]() { + index_t* crow_indices_ptr = crow_indices.data_ptr(); + index_t* new_crow_indices_ptr = new_crow_indices.data_ptr(); + index_t* row_map_ptr = row_map.data_ptr(); + int64_t nnz = 0; + new_crow_indices_ptr[0] = 0; + for(int64_t i=0; i(); + scalar_t* new_values_ptr = new_values.data_ptr(); + + at::parallel_for( + 0, + nrows, + internal::GRAIN_SIZE, + [&](int64_t irow_start, int64_t irow_end) { + index_t i_end = crow_indices_ptr[irow_start]; + for (index_t h = irow_start; h < irow_end; ++h) { + index_t i_start = i_end; + i_end = crow_indices_ptr[h+1]; + if (i_start != i_end) { + scalar_t res = values_ptr[i_start]; + for (index_t i = i_start + 1; i < i_end; i++) { + res = rop(res, values_ptr[i]); + } + new_values_ptr[row_map_ptr[h]] = res; + } + } + }); + }); + + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {sparse.size(0), 1}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_dim01_cpu_template(const Tensor& sparse, ReductionOp rop) { + + auto ioptions = sparse.col_indices().options(); + Tensor values = sparse.values(); + auto numel = values.numel(); + auto nnz = std::min(1, numel); + + /* TODO: we can likely do about 3x better than parallel_reduce: + +In [2]: t=torch.randn(5000, 5000).to_sparse_csr() + +In [3]: %timeit torch._sparse_csr_sum(t, dim=(0, 1), keepdim=True) +3.39 ms ± 898 ns per loop (mean ± std. dev. of 7 runs, 100 loops each) + +In [4]: %timeit torch.sum(t.values()) +1.07 ms ± 291 ns per loop (mean ± std. dev. 
of 7 runs, 1000 loops each) + */ + scalar_t* values_ptr = values.data_ptr(); + scalar_t value = at::parallel_reduce( + 0, + numel, + internal::GRAIN_SIZE, + rop.identity(), + [&](int64_t i_start, int64_t i_end, scalar_t identity) { + scalar_t res = identity; + for (int64_t i=i_start; i{0, nnz}, ioptions); + Tensor new_values; + if (numel > 0) { + new_values = at::empty({1}, values.options()); + new_values.fill_(value); } else { - AT_DISPATCH_INTEGRAL_TYPES(input.scalar_type(), "convert_indices_from_coo_to_csr_cpu", [&] { - convert_indices_from_coo_to_csr_cpu(result, input, size); - }); + new_values = at::empty({}, values.options()); + } + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {1, std::min(1, sparse.size(1))}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, std::vector dims, ReductionOp rop) { + if (dims.size() == 1) { + if (dims[0] == 0) { + return reduce_sparse_csr_dim0_cpu_template(sparse, rop); + } else { + TORCH_INTERNAL_ASSERT(dims[0] == 1); + return reduce_sparse_csr_dim1_cpu_template(sparse, rop); + } + } else if (dims.size() == 2) { + TORCH_INTERNAL_ASSERT(((dims[0] == 0 && dims[1] == 1) || (dims[0] == 1 && dims[1] == 0))); + return reduce_sparse_csr_dim01_cpu_template(sparse, rop); + } + TORCH_INTERNAL_ASSERT(dims.size() == 0); + // effective after gh-29137 has been resolved + return sparse.clone(); +} + +template +Tensor reduce_sparse_csr_cpu_template(const Tensor& sparse, IntArrayRef dims_to_sum, bool keepdim, ReductionOp rop) { + TORCH_INTERNAL_ASSERT(sparse.is_sparse_csr()); + TORCH_CHECK(keepdim, "reduction operations on CSR tensors with keepdim=False is unsupported"); + TORCH_INTERNAL_ASSERT(sparse.device() == kCPU); + + const int64_t input_dim = sparse.dim(); + TORCH_INTERNAL_ASSERT(input_dim == 2); + auto dims = dims_to_sum.vec(); + maybe_wrap_dims(dims, input_dim); + if (dims.size() == 0) { + // after gh-29137 is resolved, delete this if-block + dims.emplace_back(0); + dims.emplace_back(1); } + return reduce_sparse_csr_cpu_template(sparse, dims, rop); } -TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu) ( - const Tensor& crow_indices, const Tensor& col_indices, const bool out_int32, const bool transpose, const Tensor& result -) { - if (out_int32) { - AT_DISPATCH_INTEGRAL_TYPES(crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { - convert_indices_from_csr_to_coo_cpu(result, crow_indices, col_indices, transpose); +template +struct ReductionAddOp { + inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { + return a + b; + } + inline scalar_t identity() const { return 0; } +}; + +template +struct ReductionMulOp { + inline scalar_t operator()(const scalar_t& a, const scalar_t& b) const { + return a * b; + } + inline scalar_t identity() const { return 1; } +}; + +} // namespace + +Tensor _sparse_csr_sum_cpu(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_sum_cpu", + [&] { + result = reduce_sparse_csr_cpu_template(input_, dims_to_sum, keepdim, ReductionAddOp()); }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(crow_indices.scalar_type(), "convert_indices_from_csr_to_coo_cpu", [&] { - convert_indices_from_csr_to_coo_cpu(result, 
crow_indices, col_indices, transpose); + return result; +} + +Tensor _sparse_csr_prod_cpu(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_prod_cpu", + [&] { + result = reduce_sparse_csr_cpu_template(input_, dims_to_reduce, keepdim, ReductionMulOp()); }); - } + return result; } } // namespace native diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.h b/aten/src/ATen/native/sparse/SparseCsrTensorMath.h new file mode 100644 index 000000000000..a92added5f01 --- /dev/null +++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { +namespace sparse { +namespace impl { + +// Returns true if all entries of self are zero +// TODO: This has potential to be a generic helper +inline bool _is_sparse_and_zero(const Tensor& self) { + if (self.layout() == kSparse || self.layout() == kSparseCsr || + self.layout() == kSparseCsc || self.layout() == kSparseBsr || + self.layout() == kSparseBsc) { + if (self._nnz() == 0) { + return true; + } + } + return false; +} + +inline void _check_is_cpu(const Tensor& self, c10::string_view name) { + TORCH_CHECK( + self.is_cpu(), + "Expected all tensors to be on the same device. addmm expected '", + name, + "' to be CPU tensor, but got ", + self.device(), + " tensor"); +} + +inline void _check_is_cuda(const Tensor& self, c10::string_view name) { + TORCH_CHECK( + self.is_cuda(), + "Expected all tensors to be on the same device. addmm expected '", + name, + "' to be CUDA tensor, but got ", + self.device(), + " tensor"); +} + +inline void _check_dim(const Tensor& self, int64_t target_dim, c10::string_view name) { + if (target_dim == 2) { + TORCH_CHECK( + self.dim() == target_dim, + name, " must be a matrix, ", + "got ", self.dim(), "-D tensor"); + } + TORCH_CHECK( + self.dim() == target_dim, + "Expected ", + name, + " to be of dimension ", + target_dim, + " but got ", + self.dim(), + " instead."); +} + +} +} +} +} diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 814acad4c7f6..784aa4f4a64d 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -344,7 +345,7 @@ void _validate_sparse_coo_tensor_args( Tensor max_indices = std::get(indices.max(/* dim */ 1, /* keepdim */ false)); Tensor cpu_min_indices, cpu_max_indices; - if (indices.is_cuda()) { + if (!indices.is_cpu()) { cpu_min_indices = min_indices.to(at::DeviceType::CPU); cpu_max_indices = max_indices.to(at::DeviceType::CPU); } else { @@ -545,15 +546,6 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim) { // NB: Dropped the resizeNd variants -Tensor sparse_to_dense( - const SparseTensor& self, - c10::optional dtype) { - TORCH_CHECK( - !dtype.has_value(), "dtype argument is not supported by sparse_to_dense"); - Tensor dst = at::zeros(self.sizes(), self.options().layout(kStrided)); - return dst.add_(self); -} - SparseTensor& copy_sparse_wrapper_( Tensor& self, const Tensor& src, @@ -640,7 +632,8 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) { auto indicesBufferAccessor = indicesBuffer.accessor(); int64_t i = -1; - 
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(values.scalar_type(), "coalesce", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::BFloat16, at::ScalarType::Half, at::ScalarType::Bool, values.scalar_type(), + "coalesce", [&] { int64_t prev = -1; int64_t blockSize = values.stride(0); scalar_t* values_ptr = values.data_ptr(); @@ -769,7 +762,7 @@ SparseTensor& sparse_mask_out_cpu( // TODO: Re-audit this; it used to be an indexSelect directly into r_values at::index_select_out(r_values, t_view, 0, indices); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(r_values.scalar_type(), "sparse_mask", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, r_values.scalar_type(), "sparse_mask", [&] { sparse_mask_out_cpu_kernel( r_values, t, r_nnz, sparse_dim, mask_indices); }); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 611154fdee20..6963f60eaa22 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -676,7 +677,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen dstBuffer.add_(srcBuffer, value); } } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, commonDtype, "add_dense_sparse", [&] { add_dense_sparse_worker_cpu(resultBuffer, value, sparse, indices, valuesBuffer); }); @@ -706,6 +707,42 @@ Tensor& mul_sparse_(Tensor& self, const Tensor& other) { return at::mul_out(self, self, other); // redispatch! } +Tensor& mul_out_sparse_csr(const Tensor& t_, const Tensor& src_, Tensor& r) { + // // TODO: Use a specialized CSR kernel for performance if needed + if (t_.is_sparse_csr() && src_.layout() == kStrided) { + return mul_out_sparse_csr(t_, src_.sparse_mask(t_), r); + } + if (t_.layout() == kStrided && src_.is_sparse_csr()) { + return mul_out_sparse_csr(t_.sparse_mask(src_), src_, r); + } + TORCH_CHECK(r.is_sparse_csr(), "Expected result Tensor to be of format CSR"); + Tensor t = t_.to_sparse(); + Tensor src = src_.to_sparse(); + Tensor tmp_result = t.mul(src); + auto r_sparse_csr = tmp_result.to_sparse_csr(); + r.resize_as_sparse_(r_sparse_csr); + r.copy_(r_sparse_csr); + return r; +} + +Tensor mul_sparse_csr(const Tensor& self, const Tensor& other) { + auto commonDtype = at::result_type(self, other); + if (self.is_sparse_csr() && other.layout() == kStrided) { + return mul_sparse_csr(self, other.sparse_mask(self)); + } + if (self.layout() == kStrided && other.is_sparse_csr()) { + return mul_sparse_csr(self.sparse_mask(other), other); + } + auto result_options = self.options().dtype(commonDtype); + // CSR is 2d! + Tensor result = at::empty({0, 0}, result_options); + return at::mul_out(result, self, other); // redispatch! +} + +Tensor& mul_sparse_csr_(Tensor& self, const Tensor& other) { + return at::mul_out(self, self, other); // redispatch! 
+} + SparseTensor& mul_out_sparse_cpu(const Tensor& t_, const Tensor& src_, SparseTensor& r) { if (src_.dim() == 0) { return mul_out_sparse_zerodim(r, t_, src_); @@ -781,7 +818,7 @@ SparseTensor& mul_out_sparse_cpu(const Tensor& t_, const Tensor& src_, SparseTen s_i++; } } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, commonDtype, "mul_out_sparse", [&] { auto r_accessor = r_buffer.accessor(); auto t_accessor = t_values.accessor(); @@ -866,10 +903,22 @@ Tensor& s_addmm_out_sparse_dense_cpu( const Scalar& alpha ) { // TODO: This error message seems awfully opaque - TORCH_CHECK(!t.is_cuda(), "Expected all tensors to be on the same device. addmm expected 't' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK(!r.is_cuda(), "Expected all tensors to be on the same device. addmm: expected 'out' to be CPU tensor, but got CUDA tensor"); - TORCH_CHECK(!sparse_.is_cuda(), "Expected all tensors to be on the same device. addmm: expected 'mat1' to be a CPU tensor, but got a CUDA tensor"); - TORCH_CHECK(!dense.is_cuda(), "Expected all tensors to be on the same device. addmm: expected 'mat2' to be a CPU tensor, but got a CUDA tensor"); + TORCH_CHECK( + t.is_cpu(), + "Expected all tensors to be on the same device. addmm expected 't' to be CPU tensor, but got tensor on ", + t.device()); + TORCH_CHECK( + r.is_cpu(), + "Expected all tensors to be on the same device. addmm: expected 'out' to be CPU tensor, but got tensor on ", + t.device()); + TORCH_CHECK( + sparse_.is_cpu(), + "Expected all tensors to be on the same device. addmm: expected 'mat1' to be a CPU tensor, but got tensor on ", + t.device()); + TORCH_CHECK( + dense.is_cpu(), + "Expected all tensors to be on the same device. addmm: expected 'mat2' to be a CPU tensor, but got tensor on ", + t.device()); TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: matrices expected, got ", sparse_.sparse_dim(), "D tensor"); TORCH_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); @@ -969,11 +1018,14 @@ Tensor _sparse_addmm( } Tensor _sparse_mm( - const SparseTensor& sparse, - const Tensor& dense + const Tensor& mat1, + const Tensor& mat2 ) { - Tensor t = at::zeros({}, dense.options()); - return at::_sparse_addmm(t, sparse, dense, 0, 1); // redispatch! 
+ if (mat1.is_sparse() && mat2.is_sparse()) { + return at::_sparse_sparse_matmul(mat1, mat2); + } + Tensor t = at::zeros({mat1.size(-2), mat2.size(-1)}, mat2.options()); + return at::_sparse_addmm(t, mat1, mat2, 0, 1); } // NB: Despite its suggestive name, this actually only exists so that @@ -1492,11 +1544,14 @@ scalar_t binary_search_strided_rightmost(scalar_t search_val, TensorAccessor::max(); bool done_searching = false; while (!done_searching) { - mid_ind = (left_ind+right_ind) >> 1; + mid_ind = left_ind + (right_ind - left_ind) / 2; scalar_t mid_val = sorted_arr_accessor[sorted_arr_begin_idx + mid_ind]; if (mid_val > search_val) { diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp index 6a8b7253fbfc..0bfde528cb0e 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseBlas.cpp @@ -1,8 +1,11 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include +#include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -40,46 +43,15 @@ Tensor& sparse_sampled_addmm_out_sparse_csr_cuda( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.is_sparse_csr()); - - TORCH_CHECK(mat1.layout() == kStrided, "sampled_addmm: Expected mat1 to have strided layout, but got ", mat1.layout()); - TORCH_CHECK(mat2.layout() == kStrided, "sampled_addmm: Expected mat2 to have strided layout, but got ", mat2.layout()); - - TORCH_CHECK(result.layout() == kSparseCsr, "sampled_addmm: Expected result to have sparse csr layout, but got ", result.layout()); - - TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "sampled_addmm: Expected mat1 and mat2 to have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); - TORCH_CHECK(mat1.scalar_type() == self.scalar_type(), "sampled_addmm: Expected mat1 and self to have the same dtype, but got ", mat1.scalar_type(), " and ", self.scalar_type()); - TORCH_CHECK(result.scalar_type() == self.scalar_type(), "sampled_addmm: Expected result and self to have the same dtype, but got ", result.scalar_type(), " and ", self.scalar_type()); - - TORCH_CHECK( - mat1.dim() == 2, "sampled_addmm: Expected mat1 to be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK( - mat2.dim() == 2, "sampled_addmm: Expected mat2 to be a matrix, got ", mat2.dim(), "-D tensor"); - TORCH_CHECK( - result.dim() == 2, "sampled_addmm: Expected result to be a matrix, got ", result.dim(), "-D tensor"); - - IntArrayRef mat1_sizes = mat1.sizes(); - IntArrayRef mat2_sizes = mat2.sizes(); - TORCH_CHECK( - mat1_sizes[1] == mat2_sizes[0], - "sampled_addmm: mat1 and mat2 shapes cannot be multiplied (", - mat1_sizes[0], - "x", - mat1_sizes[1], - " and ", - mat2_sizes[0], - "x", - mat2_sizes[1], - ")"); - - IntArrayRef self_sizes = self.sizes(); - TORCH_CHECK( - self_sizes[0] == mat1_sizes[0], "sampled_addmm: self dim 0 must match mat1 dim 0"); - TORCH_CHECK( - self_sizes[1] == mat2_sizes[1], "sampled_addmm: self dim 1 must match mat2 dim 1"); + at::native::sparse::sparse_sampled_addmm_check_inputs( + self, mat1, mat2, beta, alpha, result); if (&result != &self) { - at::native::resize_as_sparse_csr_(result, self); + // We allow self to be a single matrix when mat1 and mat2 are batched + auto result_sizes = DimVector(mat1.sizes().slice(0, mat1.dim() - 2)); + result_sizes.push_back(self.size(-2)); + result_sizes.push_back(self.size(-1)); + at::sparse_csr::get_sparse_csr_impl(result)->resize_(self._nnz(), 
result_sizes); result.copy_(self); } @@ -103,6 +75,7 @@ Tensor sparse_sampled_addmm_sparse_csr_cuda( return result; } +// result = beta * self + alpha * (mat1 @ mat2) Tensor& addmm_out_sparse_csr_cuda( const Tensor& self, const Tensor& mat1, @@ -110,65 +83,63 @@ Tensor& addmm_out_sparse_csr_cuda( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.is_sparse_csr()); + sparse::impl::_check_is_cuda(self, "self"); + sparse::impl::_check_is_cuda(mat1, "mat1"); + sparse::impl::_check_is_cuda(mat2, "mat2"); + sparse::impl::_check_is_cuda(result, "result"); // Same checks as in TORCH_META_FUNC(addmm) at // aten/src/ATen/native/LinearAlgebra.cpp - TORCH_CHECK( - mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); - TORCH_CHECK( - mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + sparse::impl::_check_dim(mat1, 2, "mat1"); + sparse::impl::_check_dim(mat2, 2, "mat2"); - IntArrayRef mat1_sizes = mat1.sizes(); - IntArrayRef mat2_sizes = mat2.sizes(); TORCH_CHECK( - mat1_sizes[1] == mat2_sizes[0], - "mat1 and mat2 shapes cannot be multiplied (", - mat1_sizes[0], - "x", - mat1_sizes[1], - " and ", - mat2_sizes[0], - "x", - mat2_sizes[1], - ")"); + mat1.size(1) == mat2.size(0), "mat1 and mat2 shapes cannot be multiplied (", + mat1.size(0), "x", mat1.size(1), " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); // From addmm_out_cuda_impl at ATen/native/cuda/Blas.cpp // TODO: remove code duplication and unify code // There were undefined symbol problems, // when using the same function for CUDA and SparseCsrCUDA dispatch keys // Also structured kernels do not support sparse output - IntArrayRef self__sizes; - c10::MaybeOwned self_; - if (&result != &self && self.layout() == kStrided) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); - self__sizes = self_->sizes(); + c10::MaybeOwned self_; + // Don't expand self if this is an in-place operation + if (&result == &self) { + self_ = c10::MaybeOwned::borrowed(self); } else { - self_ = c10::MaybeOwned::borrowed(self); - self__sizes = self_->sizes(); - TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - self__sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0"); - TORCH_CHECK( - self__sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1"); + self_ = expand_size(self, {mat1.size(0), mat2.size(1)}, "addmm"); } + sparse::impl::_check_dim(*self_, 2, "self"); + TORCH_CHECK(((self_->dim() == 2) && + (self_->size(0) == mat1.size(0)) && + (self_->size(1) == mat2.size(1))), + "The input tensor must be a matrix with size ", + mat1.size(0), + "x", + mat2.size(1), + ", but got a ", + self_->dim(), + "-D tensor with size ", + self_->size(0), + "x", + self_->size(1)); + if (&result != &self) { if (result.layout() == kStrided) { - at::native::resize_output(result, self__sizes); + at::native::resize_output(result, self_->sizes()); } else { - at::native::resize_as_sparse_csr_(result, *self_); + result.resize_as_sparse_(*self_); } result.copy_(*self_); } - IntArrayRef result_sizes = result.sizes(); - if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + if (result.numel() == 0) { return result; } - if (mat1._nnz() == 0 && mat2.layout() == kStrided) { - // According to docs, when beta==0 values in self should be ignored + if (sparse::impl::_is_sparse_and_zero(mat1) || sparse::impl::_is_sparse_and_zero(mat2)) { + // According to docs, when beta==0 values in self should be ignored. 
// nans and infs should not propagate if (beta.toComplexDouble() == 0.) { result.zero_(); @@ -178,15 +149,6 @@ Tensor& addmm_out_sparse_csr_cuda( return result; } - if (mat2.is_sparse_csr() && (mat1._nnz() == 0 || mat2._nnz() == 0)) { - if (beta.toComplexDouble() == 0.) { - result.values().zero_(); - } else { - result.values().mul_(beta); - } - return result; - } - sparse::impl::cuda::addmm_out_sparse_csr(mat1, mat2, beta, alpha, result); return result; } @@ -240,7 +202,7 @@ Tensor& addmv_out_sparse_csr_cuda( const Scalar& beta, const Scalar& alpha, Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.layout() == kSparseCsr || mat.layout() == kSparseBsr); TORCH_CHECK(mat.dim() == 2, "addmv: Expected mat to be 2-D"); TORCH_CHECK(vec.dim() == 1, "addmv: Expected vec to be 1-D"); diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp index 7eab11060e83..2dace2717403 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp @@ -120,6 +120,15 @@ void inline col_indices_and_values_resize_(const Tensor& input, int64_t nnz) { input.sizes()); } +void inline bsrsv2_bsrsm2_may_need_to_sync() { +#if defined(CUSPARSE_VERSION) && CUSPARSE_VERSION < 11703 + // cusparse bsrsv2 and bsrsm2 have a synchronization issue that may cause illegal memory access in cuda <= 11.6.x + // See https://github.com/pytorch/pytorch/issues/71297 + ::c10::cuda::device_synchronize(); +#endif + // else: do nothing! +} + void block_sparse_triangular_solve_vec( const at::sparse_csr::SparseCsrTensor& A, const Tensor& B, @@ -134,7 +143,7 @@ void block_sparse_triangular_solve_vec( "PyTorch with ROCm 4.5.0+. ", "Please use PyTorch built with newer ROCm version."); #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.values().dim() == 3); // blocks are expected to be square @@ -213,6 +222,15 @@ void block_sparse_triangular_solve_vec( CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + if (!unitriangular) { + int first_zero_diag_idx = -1; + cusparseStatus_t status = cusparseXbsrsv2_zeroPivot(handle, info.descriptor(), &first_zero_diag_idx); + if (status == CUSPARSE_STATUS_ZERO_PIVOT) { + X_->fill_(NAN); + return; + } + } + at::cuda::sparse::bsrsv2_solve( handle, block_layout, @@ -230,6 +248,8 @@ void block_sparse_triangular_solve_vec( X_->data_ptr(), CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + + bsrsv2_bsrsm2_may_need_to_sync(); }); if (!X.is_same(*X_)) { X.copy_(*X_); @@ -251,7 +271,7 @@ void block_sparse_triangular_solve_mat( "PyTorch with ROCm 4.5.0+. 
", "Please use PyTorch built with newer ROCm version."); #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT_DEBUG_ONLY(A.values().dim() == 3); // blocks are expected to be square @@ -339,6 +359,15 @@ void block_sparse_triangular_solve_mat( CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + if (!unitriangular) { + int first_zero_diag_idx = -1; + cusparseStatus_t status = cusparseXbsrsm2_zeroPivot(handle, info.descriptor(), &first_zero_diag_idx); + if (status == CUSPARSE_STATUS_ZERO_PIVOT) { + X_->fill_(NAN); + return; + } + } + at::cuda::sparse::bsrsm2_solve( handle, block_layout, @@ -360,6 +389,8 @@ void block_sparse_triangular_solve_mat( ldx, CUSPARSE_SOLVE_POLICY_NO_LEVEL, work_data.get()); + + bsrsv2_bsrsm2_may_need_to_sync(); }); if (!X.is_same(*X_)) { X.copy_(*X_); @@ -373,7 +404,7 @@ void block_sparse_mv( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat.values().dim() == 3); // blocks are expected to be square @@ -437,7 +468,7 @@ void block_sparse_mm( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.is_sparse_csr()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(mat1.layout() == kSparseBsr); // values is expected to be a blocks of sparse matrix TORCH_INTERNAL_ASSERT(mat1.values().dim() == 3); // blocks are expected to be square @@ -531,9 +562,6 @@ void spmm( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - if (mat1.values().dim() >= 3 && mat1.values().size(-1) > 1) { - return block_sparse_mm(mat1, mat2, beta, alpha, result); - } #if !AT_USE_CUSPARSE_GENERIC_API() addmm_out_legacy(mat1, mat2, beta, alpha, result); #else @@ -793,18 +821,26 @@ void spgemm( } // anonymous namespace void addmm_out_sparse_csr( - const at::sparse_csr::SparseCsrTensor& mat1, + const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, const Tensor& result) { - if (mat2.layout() == kStrided && result.layout() == kStrided) { + if (mat1.layout() == kSparseBsr && mat2.layout() == kStrided && result.layout() == kStrided) { + return block_sparse_mm(mat1, mat2, beta, alpha, result); + } + if (mat1.is_sparse_csr() && mat2.layout() == kStrided && result.layout() == kStrided) { return spmm(mat1, mat2, beta, alpha, result); - } else if (mat2.is_sparse_csr() && result.is_sparse_csr()) { + } + if (mat1.layout() == kStrided && mat2.is_sparse_csr() && result.layout() == kStrided) { + // TODO: We can use cuSPARSE's transposition flags once we have CSC support. 
+ return spmm(mat2.transpose(0, 1), mat1.transpose(0, 1), beta, alpha, result.transpose(0, 1)); + } + if (mat1.is_sparse_csr() && mat2.is_sparse_csr() && result.is_sparse_csr()) { return spgemm(mat1, mat2, beta, alpha, result); - } else { - TORCH_INTERNAL_ASSERT(false, "Received unexpected tensor layouts as input."); } + TORCH_CHECK(false, "addmm: computation on CUDA is not implemented for ", + result.layout(), " + ", mat1.layout(), " @ ", mat2.layout()); } /* @@ -823,7 +859,7 @@ void addmv_out_sparse_csr( const Scalar& beta, const Scalar& alpha, const Tensor& result) { - if (mat.values().dim() == 3 && mat.values().size(-1) > 1) { + if (mat.layout() == kSparseBsr) { return block_sparse_mv(mat, vec, beta, alpha, result); } #if !AT_USE_CUSPARSE_GENERIC_API() @@ -964,6 +1000,24 @@ void add_out_sparse_csr( auto B_col_indices_ptr = B_col_indices.data_ptr(); auto C_col_indices_ptr = C_col_indices.data_ptr(); + // Windows compilers don't support nested macros + // so we need this lambda outside of the + // AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES + auto fix_nnz = [ +#if AT_ROCM_ENABLED() + &C_crow_indices, + &m +#endif + ](int nnz) -> int { +// For some reason POINTER_MODE_HOST is not working here +// Let's extract manually the nnz from the C_crow_indices +#if AT_ROCM_ENABLED() + return std::max({nnz, C_crow_indices.narrow(-1, m, 1).item()}); +#else + return nnz; +#endif + }; + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( C.scalar_type(), "add_out_sparse_csr_cuda_impl", [&] { auto beta_ = beta.to(); @@ -1024,6 +1078,8 @@ void add_out_sparse_csr( &nnzC, work_data.get()); + nnzC = fix_nnz(nnzC); + // Resize result using nnz information from cusparse col_indices_and_values_resize_(C, nnzC); C_col_indices = C.col_indices(); @@ -1080,7 +1136,7 @@ void triangular_solve_out_sparse_csr( X.fill_(NAN); return; } - if (A.values().dim() == 3 && A.values().size(-1) > 1) { + if (A.layout() == kSparseBsr) { if (B.size(-1) == 1) { return block_sparse_triangular_solve_vec(A, B, X, upper, transpose, unitriangular); } else { @@ -1244,64 +1300,75 @@ void sampled_addmm_out_sparse_csr( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(B.layout() == Layout::Strided); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(C.is_sparse_csr()); - auto descA = at::cuda::sparse::CuSparseDnMatDescriptor(A); - auto descB = at::cuda::sparse::CuSparseDnMatDescriptor(B); - auto descC = at::cuda::sparse::CuSparseSpMatCsrDescriptor(C); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) == batchCount(B)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) == batchCount(C)); cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; + c10::MaybeOwned A_ = prepare_dense_matrix_for_cusparse(A); + c10::MaybeOwned B_ = prepare_dense_matrix_for_cusparse(B); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES( C.scalar_type(), "sampled_addmm_out_sparse_csr", [&] { - auto beta_ = beta.to(); - auto alpha_ = alpha.to(); - auto compute_type = at::cuda::getCudaDataType(); - auto handle = at::cuda::getCurrentCUDASparseHandle(); - size_t buffer_size = 0; - TORCH_CUDASPARSE_CHECK(cusparseSDDMM_bufferSize( - handle, - opA, - opB, - &alpha_, - descA.descriptor(), - descB.descriptor(), - &beta_, - descC.descriptor(), - compute_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - &buffer_size // output - )); + // CUDA 11.6 doesn't support batched inputs, it raises an error: + // ** On entry to cusparseSDDMM_bufferSize(): batched SDDMM is not supported + // So we need to resort to the for loop + for (const auto i : c10::irange(batchCount(A))) { + auto descA = 
at::cuda::sparse::CuSparseDnMatDescriptor(*A_, /*batch_offset=*/i); + auto descB = at::cuda::sparse::CuSparseDnMatDescriptor(*B_, /*batch_offset=*/i); + auto descC = at::cuda::sparse::CuSparseSpMatCsrDescriptor(C, /*batch_offset=*/i); + + auto beta_ = beta.to(); + auto alpha_ = alpha.to(); + auto compute_type = at::cuda::getCudaDataType(); + auto handle = at::cuda::getCurrentCUDASparseHandle(); + size_t buffer_size = 0; + TORCH_CUDASPARSE_CHECK(cusparseSDDMM_bufferSize( + handle, + opA, + opB, + &alpha_, + descA.descriptor(), + descB.descriptor(), + &beta_, + descC.descriptor(), + compute_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + &buffer_size // output + )); - auto& allocator = *c10::cuda::CUDACachingAllocator::get(); - auto buffer = allocator.allocate(buffer_size); + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + auto buffer = allocator.allocate(buffer_size); - TORCH_CUDASPARSE_CHECK(cusparseSDDMM_preprocess( - handle, - opA, - opB, - &alpha_, - descA.descriptor(), - descB.descriptor(), - &beta_, - descC.descriptor(), - compute_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - buffer.get())); + TORCH_CUDASPARSE_CHECK(cusparseSDDMM_preprocess( + handle, + opA, + opB, + &alpha_, + descA.descriptor(), + descB.descriptor(), + &beta_, + descC.descriptor(), + compute_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + buffer.get())); - TORCH_CUDASPARSE_CHECK(cusparseSDDMM( - handle, - opA, - opB, - &alpha_, - descA.descriptor(), - descB.descriptor(), - &beta_, - descC.descriptor(), - compute_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - buffer.get())); + TORCH_CUDASPARSE_CHECK(cusparseSDDMM( + handle, + opA, + opB, + &alpha_, + descA.descriptor(), + descB.descriptor(), + &beta_, + descC.descriptor(), + compute_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + buffer.get())); + } }); #endif } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh index c83592335511..2a266319212a 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh @@ -2,6 +2,7 @@ #include #include +#include #include namespace at { namespace native { @@ -209,6 +210,13 @@ __global__ void valueSparseIntersectionKernel( int64_t match, d; int64_t nDimI = r_indices.sizes[0]; IndexType valueSize = r_values.strides[0]; + // reset valueSize if a dense dimension is zero: + for (d=0; d -C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) +C10_LAUNCH_BOUNDS_1(num_threads()) __global__ void coalesceValuesKernel( int64_t *segment_offsets, int64_t *value_indices, Dtype *values, Dtype *newValues, @@ -321,7 +329,6 @@ __global__ void coalesceValuesKernel( for (int row = begin; row < end; row++) { const int valueRow = ((int) value_indices[row]) * stride; - #pragma unroll for (int ii = 0; ii < SZ; ii++) { @@ -344,6 +351,56 @@ __global__ void coalesceValuesKernel( } } +// coalesceValuesKernel when Dtype/Acctype is bool. Can be eliminated using +// `if constexpr` when CUDA codes will be compiled under C++-17, see +// gh-56055 for blockers. +template +C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4) +__global__ void coalesceValuesKernel( + int64_t *segment_offsets, int64_t *value_indices, + bool *values, bool *newValues, + int64_t nnz, int64_t newNnz, int64_t stride) { + + int seg = blockIdx.x * 4 + threadIdx.y; + + // Number of values processed by each thread (grain size) + const int SZ = 4; + + if (seg < newNnz) { + const int newValueRow = seg * stride; + const int begin = segment_offsets[seg]; + const int end = (seg < newNnz - 1) ? 
segment_offsets[seg + 1] : nnz; + const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + bool tmp[SZ]; + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + tmp[ii] = 0; + } + for (int row = begin; row < end; row++) { + const int valueRow = ((int) value_indices[row]) * stride; + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * C10_WARP_SIZE; + if (featureDim < stride) + { + tmp[ii] |= values[valueRow + featureDim]; + } + } + } + #pragma unroll + for (int ii = 0; ii < SZ; ii++) + { + int featureDim = startFeature + ii * C10_WARP_SIZE; + if (featureDim < stride) + { + newValues[newValueRow + featureDim] = tmp[ii]; + } + } + } +} + } // namespace apply }} // namespace at::native diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index 30e7d873b39c..dc5a2acf2da1 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -142,10 +142,11 @@ SparseTensor _coalesce_sparse_cuda(const SparseTensor& self) { const int SZ = 4; values = values.contiguous(); int64_t stride = c10::multiply_integers(values.sizes().slice(1)); - dim3 grid(ceil_div(newNnz, (int64_t) SZ), ceil_div(stride, (int64_t) C10_WARP_SIZE*SZ)); - dim3 block(C10_WARP_SIZE, SZ); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, values.scalar_type(), "coalesce_sparse_cuda", [&] { + int warp_size = at::cuda::warp_size(); + dim3 grid(ceil_div(newNnz, (int64_t) SZ), ceil_div(stride, (int64_t) warp_size*SZ)); + dim3 block(warp_size, SZ); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, values.scalar_type(), "coalesce_sparse_cuda", [&] { using cuda_accscalar_t = acc_type; apply::coalesceValuesKernel<<>>( uniqueOffsets.data_ptr(), diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 0d99e298ec9d..9dbf562300f3 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -503,8 +503,8 @@ SparseTensor& mul_out_sparse_cuda(const SparseTensor& t_, const SparseTensor& sr TORCH_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); Tensor resultNnz = at::empty({1}, CUDA(kLong)); - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, commonDtype, "mul_out_sparse_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "mul_out_sparse_cuda", [&] { apply::valueSparseIntersectionKernel<<>>( TensorMulOp(), I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu index c13984f2d92f..6bdd4b40f8f4 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu @@ -16,8 +16,12 @@ #else #include #include +#include +#include #include #include +#include +#include #endif #include @@ -29,6 +33,7 @@ #include #include +#include #include #include #include @@ -159,18 +164,26 @@ Tensor& add_out_dense_sparse_csr_cuda( " in add operation"); Tensor src_values = src.values(); - Tensor src_crow_indices = src.crow_indices(); - Tensor src_col_indices = src.col_indices(); resize_output(output, dense.sizes()); 
Tensor resultBuffer = output; - Tensor valuesBuffer = src_values.to(commonDtype); + if (output.scalar_type() != commonDtype) { resultBuffer = dense.to(commonDtype); } else if (!is_same_tensor(output, dense)) { resultBuffer.copy_(dense); } + + if (src._nnz() == 0) { + return output; + } + + auto valuesBuffer = src_values.to(commonDtype).view({-1, src_values.size(-1)}); + resultBuffer = resultBuffer.view({-1, output.size(-2), output.size(-1)}); + auto src_crow_indices = src.crow_indices().view({-1, src.crow_indices().size(-1)}); + auto src_col_indices = src.col_indices().view({-1, src.col_indices().size(-1)}); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( kHalf, kBool, kBFloat16, commonDtype, @@ -180,6 +193,7 @@ Tensor& add_out_dense_sparse_csr_cuda( src_crow_indices.scalar_type(), "csr_add_out_crow_indices", [&valuesBuffer, &resultBuffer, &alpha, &src_crow_indices, &src_col_indices]() { + auto batch_count = resultBuffer.dim() > 2 ? resultBuffer.size(-3) : 1; scalar_t* values_accessor = valuesBuffer.data_ptr(); scalar_t* out_ptr = resultBuffer.data_ptr(); scalar_t cast_value = alpha.to(); @@ -189,8 +203,11 @@ Tensor& add_out_dense_sparse_csr_cuda( int64_t out_storage_offset = resultBuffer.storage_offset(); auto out_strides = resultBuffer.strides(); - int64_t out_strides0 = out_strides[0]; - int64_t out_strides1 = out_strides[1]; + auto out_strides0 = out_strides[0]; + auto out_strides1 = out_strides[1]; + auto crow_stride0 = src_crow_indices.stride(0); + auto col_stride0 = src_col_indices.stride(0); + auto val_stride0 = valuesBuffer.stride(0); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); at::cuda::ThrustAllocator allocator; @@ -200,24 +217,29 @@ Tensor& add_out_dense_sparse_csr_cuda( thrust::for_each( policy, thrust::make_counting_iterator(int64_t(0)), - thrust::make_counting_iterator(int64_t(src_crow_indices.size(0) - 1)), + thrust::make_counting_iterator(int64_t(src_crow_indices.size(-1) - 1)), [values_accessor, crow_indices_accessor, col_indices_accessor, out_ptr, - out_storage_offset, - out_strides0, cast_value, - out_strides1 + out_strides0, + out_strides1, + crow_stride0, + col_stride0, + val_stride0, + batch_count ]__device__(int64_t irow) { - index_t start_index = crow_indices_accessor[irow]; - index_t end_index = crow_indices_accessor[irow + 1]; + for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) { + index_t start_index = crow_indices_accessor[batch_idx*crow_stride0 + irow]; + index_t end_index = crow_indices_accessor[batch_idx*crow_stride0 + irow + 1]; for (index_t i = start_index; i < end_index; ++i) { - auto icol = col_indices_accessor[i]; - auto index = out_storage_offset + irow * out_strides0 + icol * out_strides1; - out_ptr[index] += cast_value * values_accessor[i]; + auto icol = col_indices_accessor[batch_idx*col_stride0 + i]; + auto index = batch_idx * out_strides0 + irow * out_strides1 + icol; + out_ptr[index] += cast_value * values_accessor[batch_idx*val_stride0 + i]; } + } }); }); }); @@ -275,5 +297,342 @@ TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cuda) ( } } + /* + Reductions on sparse CSR tensors using masked semantics. + + - To support a reduction operator on a CSR tensor with CUDA storage, define + +template +struct Reduction...Op { + __device__ __forceinline__ scalar_t operator()(const scalar_t a, const scalar_t b) const { + return a ... 
b; + } + __device__ __forceinline__ scalar_t identity() const { return ...; } + __forceinline__ scalar_t identity_cpu() const { return ...; } +}; + + +Tensor _sparse_csr_..._cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ... + result = reduce_sparse_csr_cuda_template(input_, dims_to_sum, keepdim, Reduction...Op()); + ... + return result; +} + + and add the following + + - func: _sparse_csr_op.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor + dispatch: + SparseCsrCUDA: _sparse_csr_..._cuda + + to native_functions.yaml + */ + +namespace { + +template +__global__ void reduce_sparse_csr_dim0_cuda_kernel(scalar_t* new_values, + const index_t* new_col_indices, + const int64_t new_nnz, + const scalar_t* values, + const index_t* col_indices, + const int64_t nnz, + ReductionOp rop + ) { + int64_t tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < new_nnz) { + index_t col = new_col_indices[tid]; + scalar_t v = rop.identity(); + for (int64_t j=0; j < nnz; j++) { + if (col == col_indices[j]) { + v = rop(v, values[j]); + } + } + new_values[tid] = v; + } +} + +template +Tensor reduce_sparse_csr_dim0_cuda_template(const Tensor& sparse, ReductionOp rop) { + /* + Consider the following sparse tensor: + + 1 * * * * + * * * 2 * + * * 3 * * + * * * * * + 4 * 5 * * + + that has CSR representation + + crow_indices = [0, 1, 2, 3, 3, 5] + col_indices = [0, 3, 2, 0, 2] + values = [1, 2, 3, 4, 5] + + Reduction with dim=0 results: + + rop(1,4) * rop(3,5) 2 * + + that has CSR representation + + new_crow_indices = [0, 3] + new_col_indices = [0, 2, 3] + new_values = [rop(1, 4], rop(3, 5), 2] + + In general, the CSR representation data can be computed as follows: + + nnz = col_indices.numel() + new_col_indices = col_indices.unique(sorted=True, return_inverse=False) + new_nnz = new_col_indices.numel() + new_crow_indices = [0, new_nnz] + new_values.resize(new_nnz) + + for i in range(new_nnz): + v = identity + col = new_col_indices[i] + for j in range(nnz): + if col == col_indices[j]: + v = rop(v, values[j]) + new_values[i] = v + + Notice this algorithm is different from the one used on CPU data. 
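+    Unlike the CPU path, each CUDA thread owns one output column: it scans all
+    nnz entries for matching col_indices, so no inverse map from _unique is needed.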
+ */ + + Tensor col_indices = sparse.col_indices(); + Tensor values = sparse.values(); + auto ncols = sparse.size(1); + auto nnz = col_indices.numel(); + Tensor new_col_indices; + + std::tie(new_col_indices, std::ignore) = at::_unique(col_indices, true, false); + auto new_nnz = new_col_indices.numel(); + Tensor new_crow_indices = at::tensor(ArrayRef{0, new_nnz}, col_indices.options()); + Tensor new_values = at::empty({new_nnz}, values.options()); + + scalar_t* values_ptr = values.data_ptr(); + scalar_t* new_values_ptr = new_values.data_ptr(); + int64_t THREADS = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + int64_t BLOCKS = (new_nnz + THREADS) / THREADS; + at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_INDEX_TYPES(col_indices.scalar_type(), "reduce_sparse_csr_dim0_cuda_indices", + [&]() { + index_t* col_indices_ptr = col_indices.data_ptr(); + index_t* new_col_indices_ptr = new_col_indices.data_ptr(); + reduce_sparse_csr_dim0_cuda_kernel<<>>(new_values_ptr, + new_col_indices_ptr, + new_nnz, + values_ptr, + col_indices_ptr, + nnz, + rop + ); + }); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {1, ncols}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +__global__ void reduce_crow_indices_dim1_cuda_kernel(index_t* new_crow_indices, + index_t* row_map, + const index_t* crow_indices, + const int64_t nrows + ) { + int64_t nnz = 0; + new_crow_indices[0] = 0; + for(int64_t i=0; i +__global__ void reduce_sparse_csr_dim1_cuda_kernel(scalar_t* new_values, + const scalar_t* values, + const index_t* crow_indices, + const index_t* row_map, + const int64_t nrows, + ReductionOp rop + ) { + int64_t tid = blockDim.x * blockIdx.x + threadIdx.x; + if (tid < nrows) { + index_t i_start = crow_indices[tid]; + index_t i_end = crow_indices[tid+1]; + if (i_start != i_end) { + scalar_t acc = rop.identity(); + for (index_t i = i_start; i < i_end; i++) { + acc = rop(acc, values[i]); + } + new_values[row_map[tid]] = acc; + } + } +} + +template +Tensor reduce_sparse_csr_dim1_cuda_template(const Tensor& sparse, ReductionOp rop) { + /* + The algorithm of computing reduce of a CSR tensor along the last + dimension is explained in the comment of the + reduce_sparse_csr_dim1_cpu_template function. 
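+    On CUDA the row-compression pass (reduce_crow_indices_dim1_cuda_kernel) runs
+    on a single thread to build new_crow_indices and row_map; a second kernel then
+    reduces each non-empty row in parallel.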
+ */ + Tensor crow_indices = sparse.crow_indices(); + auto ioptions = crow_indices.options(); + Tensor values = sparse.values(); + auto nrows = sparse.size(0); + auto numel = values.numel(); + + Tensor new_crow_indices = at::empty({crow_indices.numel()}, ioptions); + Tensor new_col_indices = at::empty({}, ioptions); + Tensor new_values = at::empty({}, values.options()); + Tensor row_map = at::empty({nrows}, ioptions); + + at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); + int64_t THREADS = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock; + int64_t BLOCKS = (nrows + THREADS) / THREADS; + + AT_DISPATCH_INDEX_TYPES(crow_indices.scalar_type(), "reduce_sparse_csr_dim1_cuda_indices", + [&]() { + index_t* crow_indices_ptr = crow_indices.data_ptr(); + index_t* new_crow_indices_ptr = new_crow_indices.data_ptr(); + index_t* row_map_ptr = row_map.data_ptr(); + reduce_crow_indices_dim1_cuda_kernel<<<1, 1, 0, stream>>>(new_crow_indices_ptr, + row_map_ptr, + crow_indices_ptr, + nrows); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + index_t new_nnz = new_crow_indices[-1].item(); + new_col_indices.resize_(new_nnz); + new_col_indices.fill_(index_t(0)); + new_values.resize_(new_nnz); + + scalar_t* values_ptr = values.data_ptr(); + scalar_t* new_values_ptr = new_values.data_ptr(); + reduce_sparse_csr_dim1_cuda_kernel<<>>(new_values_ptr, + values_ptr, + crow_indices_ptr, + row_map_ptr, + nrows, + rop); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {sparse.size(0), 1}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_dim01_cuda_template(const Tensor& sparse, ReductionOp rop) { + + auto ioptions = sparse.col_indices().options(); + Tensor values = sparse.values(); + auto numel = values.numel(); + auto nnz = std::min(1, numel); + + Tensor new_values; + if (numel > 0) { + new_values = at::empty({1}, values.options()); + auto iter = TensorIterator::reduce_op(new_values, values); + gpu_reduce_kernel(iter, func_wrapper(rop), rop.identity_cpu()); + } else { + new_values = at::empty({}, values.options()); + } + Tensor new_col_indices = at::zeros({nnz}, ioptions); + Tensor new_crow_indices = at::tensor(ArrayRef{0, nnz}, ioptions); + return at::native::_sparse_csr_tensor_unsafe(new_crow_indices, new_col_indices, new_values, + {1, std::min(1, sparse.size(1))}, + new_values.scalar_type(), + sparse.layout(), + new_values.device()); +} + +template +Tensor reduce_sparse_csr_cuda_template(const Tensor& sparse, std::vector dims, ReductionOp rop) { + if (dims.size() == 1) { + if (dims[0] == 0) { + return reduce_sparse_csr_dim0_cuda_template(sparse, rop); + } else { + TORCH_INTERNAL_ASSERT(dims[0] == 1); + return reduce_sparse_csr_dim1_cuda_template(sparse, rop); + } + } else if (dims.size() == 2) { + TORCH_INTERNAL_ASSERT(((dims[0] == 0 && dims[1] == 1) || (dims[0] == 1 && dims[1] == 0))); + return reduce_sparse_csr_dim01_cuda_template(sparse, rop); + } + TORCH_INTERNAL_ASSERT(dims.size() == 0); + // effective after gh-29137 has been resolved + return sparse.clone(); +} + +template +Tensor reduce_sparse_csr_cuda_template(const Tensor& sparse, IntArrayRef dims_to_sum, bool keepdim, ReductionOp rop) { + TORCH_INTERNAL_ASSERT(sparse.is_sparse_csr()); + TORCH_CHECK(keepdim, "reduction operations on CSR tensors with keepdim=False is unsupported"); + TORCH_INTERNAL_ASSERT(sparse.is_cuda()); + + const int64_t input_dim = sparse.dim(); + TORCH_INTERNAL_ASSERT(input_dim 
== 2); + auto dims = dims_to_sum.vec(); + maybe_wrap_dims(dims, input_dim); + if (dims.size() == 0) { + // after gh-29137 is resolved, delete this if-block + dims.emplace_back(0); + dims.emplace_back(1); + } + return reduce_sparse_csr_cuda_template(sparse, dims, rop); +} + +template +struct ReductionAddOp { + __device__ __forceinline__ scalar_t operator()(const scalar_t a, const scalar_t b) const { + return a + b; + } + __device__ __forceinline__ scalar_t identity() const { return 0; } + __forceinline__ scalar_t identity_cpu() const { return 0; } +}; + +template +struct ReductionMulOp { + __device__ __forceinline__ scalar_t operator()(const scalar_t a, const scalar_t b) const { + return a * b; + } + __device__ __forceinline__ scalar_t identity() const { return 1; } + __forceinline__ scalar_t identity_cpu() const { return 1; } +}; + +} // namespace + +Tensor _sparse_csr_sum_cuda(const Tensor& input, IntArrayRef dims_to_sum, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_sum_cuda", + [&] { + result = reduce_sparse_csr_cuda_template(input_, dims_to_sum, keepdim, ReductionAddOp()); + }); + return result; +} + +Tensor _sparse_csr_prod_cuda(const Tensor& input, IntArrayRef dims_to_reduce, bool keepdim, c10::optional dtype) { + ScalarType dtype_ = dtype.value_or(input.scalar_type()); + Tensor input_ = input.to(dtype_); + Tensor result; + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, input_.scalar_type(), "_sparse_csr_prod_cuda", + [&] { + result = reduce_sparse_csr_cuda_template(input_, dims_to_reduce, keepdim, ReductionMulOp()); + }); + return result; +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/tags.yaml b/aten/src/ATen/native/tags.yaml new file mode 100644 index 000000000000..d79b13adae84 --- /dev/null +++ b/aten/src/ATen/native/tags.yaml @@ -0,0 +1,10 @@ +# This yaml file contains all the possible tags that can be defined in `tags` in `native_functions.yaml` + +- tag: inplace_view + desc: | + This tag indicates if an operator *only* modifies the tensor metadata +- tag: view_copy + desc: | + This tag indicates operators that are *_copy* variants + of view/aliasing operators. If an operator has a view_copy tag, + then it should have the name {op}_copy, where {op} is a view operator. 
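A minimal usage sketch of the CSR reductions added above (Python; the tensor values are illustrative and assume the CPU/CUDA dispatch entries are registered as described). Both entry points require keepdim=True because a CSR tensor is always 2-D, and they reduce with masked semantics, i.e. only over the stored elements:

import torch

a = torch.tensor([[1., 0., 2.],
                  [0., 0., 3.]]).to_sparse_csr()

# dim=0: per-column reduction; columns without stored elements stay unstored
col_sums = torch._sparse_csr_sum(a, dim=0, keepdim=True)
print(col_sums.to_dense())   # tensor([[1., 0., 5.]])

# dim=1: per-row reduction over stored elements only (masked semantics)
row_prods = torch._sparse_csr_prod(a, dim=1, keepdim=True)
print(row_prods.to_dense())  # tensor([[2.], [3.]])

Passing keepdim=False raises an error, matching the TORCH_CHECK in the templates above.
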
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp new file mode 100644 index 000000000000..697aabb46009 --- /dev/null +++ b/aten/src/ATen/native/transformers/attention.cpp @@ -0,0 +1,482 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include + +namespace at { + +namespace native { + +namespace { + +Tensor gemm_nt(const Tensor& self, const Tensor& other) { + if (self.is_nested()) { + return NestedTensor_matmul(self, other.t()); + } else { + return at::native::matmul(self, other.t()); + } +} + +template +void transform_bias_rescale_qkv_inner_loop( + int64_t B, + int64_t T, + int64_t _3D, + int64_t D, + int64_t num_head, + int64_t dim_per_head, + scalar_t* qkv_data, + scalar_t* qkv_bias_data, + scalar_t* q_k_v_data, + scalar_t inv_sqrt_dim_per_head, + int64_t begin, + int64_t end) { + for (auto i : c10::irange(begin, end)) { + auto t = i % T; + i /= T; + auto nh = i % num_head; + i /= num_head; + auto b = i; + using Vec = vec::Vectorized; + auto V = vec::Vectorized::size(); + auto dh = 0; + auto d = nh * dim_per_head; + for (; dh + V <= dim_per_head; dh += V, d += V) { + // load + auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); + auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]); + auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]); + + auto q_data = Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) + + q_bias_data; + auto k_data = Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) + + k_bias_data; + auto v_data = Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) + + v_bias_data; + + q_data = q_data * Vec(inv_sqrt_dim_per_head); + + q_data.store(&q_k_v_data + [0 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); + k_data.store(&q_k_v_data + [1 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); + v_data.store(&q_k_v_data + [2 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); + } + for (; dh < dim_per_head; dh++) { + auto d = nh * dim_per_head + dh; + auto q_bias = qkv_bias_data[d + 0 * D]; + auto k_bias = qkv_bias_data[d + 1 * D]; + auto v_bias = qkv_bias_data[d + 2 * D]; + auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias; + auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias; + auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias; + q_data = q_data * inv_sqrt_dim_per_head; + q_k_v_data + [0 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + nh * T * dim_per_head + + t * dim_per_head + dh] = q_data; + q_k_v_data + [1 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + nh * T * dim_per_head + + t * dim_per_head + dh] = k_data; + q_k_v_data + [2 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + nh * T * dim_per_head + + t * dim_per_head + dh] = v_data; + } + } +} + +Tensor bmm_nt(const Tensor& a, const Tensor& b) { + auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); + auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); + auto bt_ = b_.transpose(2, 1); + auto c_ = at::bmm(a_, bt_); + return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); +} + +Tensor masked_softmax( + Tensor& attn_scores, + c10::optional attn_mask, + const 
Tensor& query) { + if (query.is_nested() && !attn_mask) { + // TODO: maybe we could do better than generating a mask every time? + + attn_mask = NestedTensor_to_mask(query, 2); + // TODO: CPU path does not support transformer mask yet. + if (attn_scores.is_cpu()) { + attn_mask = attn_mask->view({-1, 1, 1, attn_scores.sizes()[3]}); + // 1 means skip, 0 means keep. + // want: + // 0,0 -> 0 + // 0,1 -> 1 + // 1,1 -> 1 + // so that's logical OR. + *attn_mask = *attn_mask | attn_mask->transpose(2, 3); + attn_mask = at::expand_inplace(attn_scores, *attn_mask)->contiguous(); + } + attn_mask = attn_mask->to(query.device(), /*non-blocking=*/true); + } + if (attn_mask && attn_mask->dtype() != at::kBool) { + TORCH_WARN( + "Converting mask without torch.bool dtype to bool; this will " + "negatively affect performance. Prefer to use a boolean mask directly."); + attn_mask = attn_mask->to(at::kBool); + } + if (attn_scores.is_cpu() && attn_mask && attn_mask->dim() == 2) { + // TODO: CPU path does not support transformer mask yet. + const auto batch_size = attn_scores.sizes()[0]; + const auto seq_len = attn_scores.sizes()[3]; + TORCH_CHECK(attn_mask->sizes()[0] == batch_size); + TORCH_CHECK(attn_mask->sizes()[1] == seq_len); + attn_mask = attn_mask->view({batch_size, 1, 1, seq_len}); + attn_mask = at::expand_inplace(attn_scores, *attn_mask)->contiguous(); + } + if (attn_mask) { + return _masked_softmax(attn_scores, *attn_mask); + } else { + return _softmax_out(attn_scores, attn_scores, attn_scores.dim() - 1, false); + } +} + +Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b) { + const std::array newAShape = { + a.sizes()[0] * a.sizes()[1], a.sizes()[2], a.sizes()[3]}; + auto a_ = a.view(newAShape); + const std::array newBShape = { + b.sizes()[0] * b.sizes()[1], b.sizes()[2], b.sizes()[3]}; + auto b_ = b.view(newBShape); + auto out_ = out.reshape({newAShape[0], newAShape[1], newBShape[2]}); + auto c_ = at::bmm_out(out_, a_, b_); + return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); +} + +Tensor transform_0213(const Tensor& a) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3)); + return a.permute({0, 2, 1, 3}) + .contiguous() + .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); +} + +Tensor transform0213_gemm_nt_bias( + const Tensor& a, + const Tensor& b, + const Tensor& c, + const Tensor& query) { + if (query.is_nested()) { + at::Tensor nested_a = _nested_from_padded( + a, get_nested_tensor_impl(query)->get_nested_size_tensor(), true); + return NestedTensor_times_Tensor_plus_Tensor_addmm( + c, nested_a, b.t(), 1, 1); + } else { + const Tensor a_0213 = transform_0213(a); + auto a_ = a_0213.view({a_0213.size(0) * a_0213.size(1), a_0213.size(2)}); + auto r_ = at::native::linear(a_, b, c); + return r_.view({a_0213.size(0), a_0213.size(1), r_.size(1)}); + } +} + +void debug_assert_shape(int line, const Tensor& t, c10::IntArrayRef shape) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + (size_t)t.dim() == shape.size(), + "(called from line ", + line, + ") ", + "expected ", + shape.size(), + "-D tensor but got ", + t.dim()); + if (t.is_nested()) { + return; + } + for (auto idx : c10::irange(shape.size())) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + shape[idx] == 0 || t.sizes()[idx] == shape[idx], + "(called from line ", + line, + ") ", + "expected dim ", + idx, + " to be ", + shape[idx], + " but got ", + t.sizes()[idx]); + } +} +} // namespace + +// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias +std::tuple 
transform_bias_rescale_qkv_cpu( + const Tensor& qkv, + const Tensor& qkv_bias, + const int64_t num_head) { + auto qkv_ = qkv.is_nested() + ? c10::MaybeOwned::owned(qkv.to_padded_tensor(0)) + : c10::MaybeOwned::borrowed(qkv); + auto B = qkv_->size(0); + auto T = qkv_->size(1); + auto _3D = qkv_->size(2); + auto D = _3D / 3; + TORCH_CHECK(D % num_head == 0); + TORCH_CHECK(_3D % 3 == 0); + const auto dim_per_head = D / num_head; + auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv_->options()); + + const auto qkv_contig = qkv_->expect_contiguous(); + const auto qkv_bias_contig = qkv_bias.expect_contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + qkv_->scalar_type(), + "transform_bias_rescale_qkv", + [&] { + scalar_t* qkv_data = qkv_contig->data_ptr(); + scalar_t* qkv_bias_data = qkv_bias_contig->data_ptr(); + scalar_t* q_k_v_data = q_k_v.data_ptr(); + const scalar_t inv_sqrt_dim_per_head = + 1.0 / std::sqrt(static_cast(dim_per_head)); + + int64_t grain_size = + std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); + parallel_for( + 0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) { + transform_bias_rescale_qkv_inner_loop( + B, + T, + _3D, + D, + num_head, + dim_per_head, + qkv_data, + qkv_bias_data, + q_k_v_data, + inv_sqrt_dim_per_head, + begin, + end); + }); + }); + auto q_k_v_s = + at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v_s.size() == 3); + return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); +} + +std::tuple native_multi_head_attention( + const Tensor& query, + const Tensor& key, + const Tensor& value, + const int64_t embed_dim, + const int64_t num_head, + const Tensor& qkv_weight, + const Tensor& qkv_bias, + const Tensor& proj_weight, + const Tensor& proj_bias, + const c10::optional& mask, + bool need_weights, + bool average_attn_weights) { + // query shape: [B, T, D] + // qkv_weight shape: [3 * D, D] + + TORCH_CHECK( + !mask || !query.is_nested(), + "NestedTensor with mask is not supported yet"); + const auto D = embed_dim; + TORCH_CHECK( + query.dim() == 3, + "expected 3-D `query`, got ", + query.dim(), + "-D tensor"); + TORCH_CHECK( + query.is_nested() || query.sizes()[2] == embed_dim, + "passed-in embed_dim ", + embed_dim, + " didn't match last dim of query ", + query.sizes()[2]); + TORCH_CHECK( + key.dim() == 3, + "expected 3-D `key`, got ", + key.dim(), + "-D tensor"); + TORCH_CHECK( + value.dim() == 3, + "expected 3-D `value`, got ", + value.dim(), + "-D tensor"); + TORCH_CHECK( + query.is_nested() || key.is_nested() || value.is_nested() || + (query.sizes() == key.sizes() && key.sizes() == value.sizes()), + "expected `query`/`key`/`value` shapes to match"); + TORCH_CHECK( + qkv_weight.dim() == 2, + "expected 2-D `qkv_weight`, got ", + qkv_weight.dim(), + "-D tensor"); + TORCH_CHECK( + D * 3 == qkv_weight.sizes()[0], + "expected `qkv_weight` first dim to be 3x embed_dim"); + TORCH_CHECK( + D == qkv_weight.sizes()[1], + "expected `qkv_weight` second dim to be embed_Dim"); + TORCH_CHECK( + qkv_bias.dim() == 1, + "expected 2-D `qkv_bias`, got ", + qkv_bias.dim(), + "-D tensor"); + TORCH_CHECK( + qkv_bias.sizes()[0] == 3 * D, + "expected `qkv_bias` first dim and first dim of query to be equal"); + TORCH_CHECK(D % num_head == 0, "`embed_dim` must divide evenly by `num_heads`"); + +#ifndef NDEBUG + const auto B = query.is_nested() + ? 
get_nested_tensor_impl(query)->get_nested_size_tensor().size(0) + : query.sizes()[0]; + auto T = query.is_nested() ? 0 : query.sizes()[1]; + const auto dim_per_head = D / num_head; +#endif + + // shape: [B, T, 3 x D] + Tensor qkv; + + if (key.is_same(value)) { + if (query.is_same(key)) { + // self-attention + qkv = gemm_nt(query, qkv_weight); + } else { + // encoder-decoder attention + // TODO: is there a more efficient way to set this up? + // TODO: can we stay nested insted of using cat? Probably just make a + // NestedTensor out of the matmul results or something? + auto q_kv_weight_s = + at::native::split_with_sizes(qkv_weight, {D, D * 2}, 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + q_kv_weight_s.size() == 2, + "expected split to produce 2 tensors but it produced ", + q_kv_weight_s.size()); + auto q = gemm_nt(query, q_kv_weight_s[0]); + auto kv = gemm_nt(key, q_kv_weight_s[1]); + qkv = at::cat({q, kv}, 2); + } + } else { + auto q_k_v_weight_s = at::native::chunk(qkv_weight, 3, 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + q_k_v_weight_s.size() == 3, + "expected chunk to produce 3 tensors but it produced ", + q_k_v_weight_s.size()); + // TODO: can we stay nested instead of using cat? + auto q = gemm_nt(query, q_k_v_weight_s[0]); + auto k = gemm_nt(key, q_k_v_weight_s[1]); + auto v = gemm_nt(value, q_k_v_weight_s[2]); + qkv = at::cat({q, k, v}, 2); + } + + if (!qkv.is_nested() && qkv.numel() == 0) { + if (query.is_nested()) { + return std::make_tuple(Tensor(), Tensor()); + } + return std::make_tuple(at::empty_like(query), Tensor()); + } + +#ifndef NDEBUG + if (!query.is_nested() || !qkv.is_nested()) { + if (query.is_nested()) { + T = qkv.size(1); + } + debug_assert_shape(__LINE__, qkv, {B, T, 3 * D}); + } +#endif + +#ifdef DEBUG_PRINT_EACH_STEP + if (!qkv.is_nested()) { + std::cerr << "qkv: " << qkv << std::endl; + } +#endif + // shape: 3 x [B, num_head, T, dim_per_head] + auto q_k_v = _transform_bias_rescale_qkv(qkv, qkv_bias, num_head); + qkv = Tensor(); // Not used any more, allow free + auto& q = std::get<0>(q_k_v); + const auto& k = std::get<1>(q_k_v); + const auto& v = std::get<2>(q_k_v); +#ifndef NDEBUG + debug_assert_shape(__LINE__, q, {B, num_head, T, dim_per_head}); + debug_assert_shape(__LINE__, k, {B, num_head, T, dim_per_head}); + debug_assert_shape(__LINE__, v, {B, num_head, T, dim_per_head}); +#endif +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "q: " << q << std::endl; + std::cerr << "k: " << k << std::endl; + std::cerr << "v: " << v << std::endl; +#endif + + // shape: [B, num_head, T, T] + auto qkt = bmm_nt(q, k); + // q & k are dead but cannot be freed because they were packed with v +#ifndef NDEBUG + debug_assert_shape(__LINE__, qkt, {B, num_head, T, T}); +#endif +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "qkt: " << qkt << std::endl; +#endif + + // shape: [B, num_head, T, T] + // TODO: long-term, have a kernel that works with + // NestedTensor directly if there is no mask passed + qkt = masked_softmax(qkt, mask, query); +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "qkt after softmax: " << qkt << std::endl; +#endif + + // shape: [B, num_head, T, dim_per_head] + // reuse storage for q; we're done with it + auto attn_ctx = bmm_nn(q, qkt, v); + // qkv is not dead; we just reused storage for q! 
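// bmm_nt (the scores above) and bmm_nn (the context here) fold the batch and
// head dimensions into one leading dimension so a single batched GEMM covers
// every (batch, head) pair. A minimal standalone sketch of that folding over
// plain row-major float buffers -- an illustration of the idea, not code from
// this patch:
//
//   #include <cstddef>
//   #include <vector>
//
//   // Naive [BH, T, D] x [BH, S, D]^T -> [BH, T, S] ("nt": the second operand
//   // is read transposed), which is what at::bmm_out sees after the
//   // 4-D -> 3-D views inside bmm_nt / bmm_nn.
//   std::vector<float> bmm_nt_ref(const std::vector<float>& a,
//                                 const std::vector<float>& b,
//                                 std::size_t BH, std::size_t T,
//                                 std::size_t S, std::size_t D) {
//     std::vector<float> out(BH * T * S, 0.f);
//     for (std::size_t n = 0; n < BH; ++n)
//       for (std::size_t t = 0; t < T; ++t)
//         for (std::size_t s = 0; s < S; ++s) {
//           float acc = 0.f;
//           for (std::size_t d = 0; d < D; ++d)
//             acc += a[(n * T + t) * D + d] * b[(n * S + s) * D + d];
//           out[(n * T + t) * S + s] = acc;
//         }
//     return out;
//   }
//
// A contiguous [B, num_head, T, dim_per_head] tensor and its
// [B * num_head, T, dim_per_head] view share the same element order, so
// calling bmm_nt_ref with BH = B * num_head reproduces the per-head q @ k^T
// scores; bmm_nn is the same folding with the second operand read untransposed.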
+ if (!need_weights) { + qkt = Tensor(); + } +#ifndef NDEBUG + debug_assert_shape(__LINE__, attn_ctx, {B, num_head, T, dim_per_head}); +#endif +#ifdef DEBUG_PRINT_EACH_STEP + std::cerr << "attn_ctx: " << attn_ctx << std::endl; +#endif + + // shape: [B, T, D] + // Fuse transform_0213 inside + auto proj = transform0213_gemm_nt_bias( + attn_ctx, proj_weight, proj_bias, query); +#ifndef NDEBUG + debug_assert_shape(__LINE__, proj, {B, T, D}); +#endif + if (need_weights && average_attn_weights) { + // weights are not needed for full transformer, so don't worry too + // much about performance -- we implement this just to make use + // cases that don't disable need_weights still get some speedup. + qkt = qkt.sum(1); + qkt /= num_head; + } + return std::make_tuple(std::move(proj), std::move(qkt)); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu new file mode 100644 index 000000000000..fc9a83266a20 --- /dev/null +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -0,0 +1,400 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +namespace at { + +namespace native { + +namespace { + +static constexpr int TRANSFORM_BIAS_RESCALE_VEC = 4; + +template +__global__ void transform_bias_rescale_qkv_kernel( + // [B, T, 3 * D] + const PackedTensorAccessor64 qkv, + // [3 * D] + const PackedTensorAccessor64 qkv_bias, + // [3, B, NH, T, DH] + PackedTensorAccessor64 q_k_v, + const scalar_t inv_sqrt_dim_per_head) { + // warp per DH. + // so launch B * NH * T warps. + auto NH = q_k_v.size(2); + auto T = q_k_v.size(3); + auto DH = q_k_v.size(4); + + auto t = blockIdx.x % T; + auto b = blockIdx.x / T; + + auto D = NH * DH; + + if (assume_aligned) { + constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC; + using LoadT = memory::aligned_vector; + for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { + auto d = d_v * VEC; + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q[VEC]; + scalar_t qkv_bias_k[VEC]; + scalar_t qkv_bias_v[VEC]; + scalar_t qkv_q[VEC]; + scalar_t qkv_k[VEC]; + scalar_t qkv_v[VEC]; + + // Here we require D % VEC == 0 for these vectorized loads. + *reinterpret_cast(&qkv_bias_q) = + *reinterpret_cast(&qkv_bias[d + 0 * D]); + *reinterpret_cast(&qkv_bias_k) = + *reinterpret_cast(&qkv_bias[d + 1 * D]); + *reinterpret_cast(&qkv_bias_v) = + *reinterpret_cast(&qkv_bias[d + 2 * D]); + + *reinterpret_cast(&qkv_q) = + *reinterpret_cast(&qkv[b][t][d + 0 * D]); + *reinterpret_cast(&qkv_k) = + *reinterpret_cast(&qkv[b][t][d + 1 * D]); + *reinterpret_cast(&qkv_v) = + *reinterpret_cast(&qkv[b][t][d + 2 * D]); + +#pragma unroll + // TODO: specialize for float2half2/half2float2? + for (auto ii = 0; ii < VEC; ++ii) { + qkv_q[ii] = static_cast( + (static_cast(qkv_q[ii]) + + static_cast(qkv_bias_q[ii])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[ii] = static_cast( + (static_cast(qkv_k[ii]) + + static_cast(qkv_bias_k[ii]))); + qkv_v[ii] = static_cast( + (static_cast(qkv_v[ii]) + + static_cast(qkv_bias_v[ii]))); + } + + // Here we require DH % VEC == 0 for these vectorized stores. 
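// In scalar terms, the unrolled loop above applies, per element of the packed
// qkv row,
//   q = (q + q_bias) * (1 / sqrt(dim_per_head)),  k = k + k_bias,  v = v + v_bias,
// and the stores below write the VEC results of one head's row back in a
// single aligned transaction -- which is why dh, dh+1, ..., dh+VEC-1 must not
// straddle heads (the D % VEC == 0 / DH % VEC == 0 requirements noted above).
// A minimal host-side sketch of the per-element math over plain floats, kept
// here only as an illustration rather than as part of this kernel:
//
//   inline void transform_one_element_ref(
//       float& q, float& k, float& v,
//       float q_bias, float k_bias, float v_bias,
//       float inv_sqrt_dim_per_head) {
//     q = (q + q_bias) * inv_sqrt_dim_per_head;  // query is rescaled up front
//     k = k + k_bias;                            // key and value only add bias
//     v = v + v_bias;
//   }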
+ *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = + *reinterpret_cast(&qkv_q); + *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = + *reinterpret_cast(&qkv_k); + *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = + *reinterpret_cast(&qkv_v); + } + } else { + // Same as above, but we can't vectorize memory access. + for (int32_t d = threadIdx.x; d < D; d += blockDim.x) { + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q = qkv_bias[d + 0 * D]; + scalar_t qkv_bias_k = qkv_bias[d + 1 * D]; + scalar_t qkv_bias_v = qkv_bias[d + 2 * D]; + scalar_t qkv_q = qkv[b][t][d + 0 * D]; + scalar_t qkv_k = qkv[b][t][d + 1 * D]; + scalar_t qkv_v = qkv[b][t][d + 2 * D]; + qkv_q = static_cast( + (static_cast(qkv_q) + + static_cast(qkv_bias_q)) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k = static_cast( + (static_cast(qkv_k) + + static_cast(qkv_bias_k))); + qkv_v = static_cast( + (static_cast(qkv_v) + + static_cast(qkv_bias_v))); + + q_k_v[0][b][nh][t][dh] = qkv_q; + q_k_v[1][b][nh][t][dh] = qkv_k; + q_k_v[2][b][nh][t][dh] = qkv_v; + } + } +} + +template +__global__ void transform_bias_rescale_qkv_add_padding_kernel( + // [B, T, 3 * D], but it's a NestedTensor buffer + const PackedTensorAccessor64 qkv, + // [3 * D] + const PackedTensorAccessor64 qkv_bias, + const int* offsets, + const int* input_sizes, + // [3, B, NH, T, DH] + PackedTensorAccessor64 q_k_v, + const scalar_t inv_sqrt_dim_per_head) { + // warp per DH. + // so launch B * NH * T warps. + const auto NH = q_k_v.size(2); + const auto T = q_k_v.size(3); + const auto DH = q_k_v.size(4); + + const auto t = blockIdx.x % T; + const auto b = blockIdx.x / T; + + const auto D = NH * DH; + const auto _3D = 3 * D; + + const auto offset_for_batch = offsets[b]; + const auto input_dim = 1; + const auto* sizes_i = input_sizes + b * input_dim; + if (assume_aligned) { + constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC; + using LoadT = memory::aligned_vector; + for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { + auto d = d_v * VEC; + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q[VEC]; + scalar_t qkv_bias_k[VEC]; + scalar_t qkv_bias_v[VEC]; + scalar_t qkv_q[VEC]; + scalar_t qkv_k[VEC]; + scalar_t qkv_v[VEC]; + + const auto first_item_offset = t * _3D + d; + const auto last_item_offset = first_item_offset + VEC - 1; + const bool first_item_in_bounds = first_item_offset < sizes_i[0]; + const bool entire_vec_in_bounds = last_item_offset < sizes_i[0]; + + // Here we require D % VEC == 0 for these vectorized loads. + *reinterpret_cast(&qkv_bias_q) = + *reinterpret_cast(&qkv_bias[d + 0 * D]); + *reinterpret_cast(&qkv_bias_k) = + *reinterpret_cast(&qkv_bias[d + 1 * D]); + *reinterpret_cast(&qkv_bias_v) = + *reinterpret_cast(&qkv_bias[d + 2 * D]); + + if (entire_vec_in_bounds) { + const auto offset = offset_for_batch + first_item_offset; + *reinterpret_cast(&qkv_q) = + *reinterpret_cast(&qkv[offset + 0 * D]); + *reinterpret_cast(&qkv_k) = + *reinterpret_cast(&qkv[offset + 1 * D]); + *reinterpret_cast(&qkv_v) = + *reinterpret_cast(&qkv[offset + 2 * D]); +#pragma unroll + // TODO: specialize for float2half2/half2float2? 
+ for (auto ii = 0; ii < VEC; ++ii) { + qkv_q[ii] = static_cast( + (static_cast(qkv_q[ii]) + + static_cast(qkv_bias_q[ii])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[ii] = static_cast( + (static_cast(qkv_k[ii]) + + static_cast(qkv_bias_k[ii]))); + qkv_v[ii] = static_cast( + (static_cast(qkv_v[ii]) + + static_cast(qkv_bias_v[ii]))); + } + } else if (first_item_in_bounds) { + const auto offset = offset_for_batch + first_item_offset; + qkv_q[0] = qkv[offset + 0 * D]; + qkv_k[0] = qkv[offset + 1 * D]; + qkv_v[0] = qkv[offset + 2 * D]; + qkv_q[0] = static_cast( + (static_cast(qkv_q[0]) + + static_cast(qkv_bias_q[0])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[0] = static_cast( + (static_cast(qkv_k[0]) + + static_cast(qkv_bias_k[0]))); + qkv_v[0] = static_cast( + (static_cast(qkv_v[0]) + + static_cast(qkv_bias_v[0]))); +#pragma unroll + for (auto ii = 1; ii < VEC; ++ii) { + const auto loop_offset = offset + ii; + if (loop_offset < sizes_i[0]) { + qkv_q[ii] = qkv[loop_offset + 0 * D]; + qkv_k[ii] = qkv[loop_offset + 1 * D]; + qkv_v[ii] = qkv[loop_offset + 2 * D]; + qkv_q[ii] = static_cast( + (static_cast(qkv_q[ii]) + + static_cast(qkv_bias_q[ii])) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k[ii] = static_cast( + (static_cast(qkv_k[ii]) + + static_cast(qkv_bias_k[ii]))); + qkv_v[ii] = static_cast( + (static_cast(qkv_v[ii]) + + static_cast(qkv_bias_v[ii]))); + } else { + qkv_q[ii] = 0; + qkv_k[ii] = 0; + qkv_v[ii] = 0; + } + } + } else { +#pragma unroll + for (auto ii = 0; ii < VEC; ++ii) { + qkv_q[ii] = 0; + qkv_k[ii] = 0; + qkv_v[ii] = 0; + } + } + + // Here we require DH % VEC == 0 for these vectorized stores. + *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = + *reinterpret_cast(&qkv_q); + *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = + *reinterpret_cast(&qkv_k); + *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = + *reinterpret_cast(&qkv_v); + } + } else { + for (int32_t d = threadIdx.x; d < D; d += blockDim.x) { + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q = qkv_bias[d + 0 * D]; + scalar_t qkv_bias_k = qkv_bias[d + 1 * D]; + scalar_t qkv_bias_v = qkv_bias[d + 2 * D]; + + const auto item_offset = t * _3D + d; + const bool in_bounds = item_offset < sizes_i[0]; + scalar_t qkv_q, qkv_k, qkv_v; + if (in_bounds) { + const auto qkv_offset = offset_for_batch + item_offset; + qkv_q = qkv[qkv_offset + 0 * D]; + qkv_k = qkv[qkv_offset + 1 * D]; + qkv_v = qkv[qkv_offset + 2 * D]; + qkv_q = static_cast( + (static_cast(qkv_q) + + static_cast(qkv_bias_q)) * + static_cast(inv_sqrt_dim_per_head)); + qkv_k = static_cast( + (static_cast(qkv_k) + + static_cast(qkv_bias_k))); + qkv_v = static_cast( + (static_cast(qkv_v) + + static_cast(qkv_bias_v))); + } else { + qkv_q = 0; + qkv_k = 0; + qkv_v = 0; + } + + q_k_v[0][b][nh][t][dh] = qkv_q; + q_k_v[1][b][nh][t][dh] = qkv_k; + q_k_v[2][b][nh][t][dh] = qkv_v; + } + } +} + +Tensor collapse_dims_1_and_2(const Tensor& sizes) { + auto sizes_dim1 = at::native::narrow(sizes, 1, 0, 1); + auto sizes_dim2 = at::native::narrow(sizes, 1, 1, 1); + + return (sizes_dim1 * sizes_dim2).contiguous(); +} + +} // namespace +// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias +__host__ std::tuple transform_bias_rescale_qkv_cuda( + const Tensor& qkv, + const Tensor& qkv_bias, + const int64_t num_head) { + auto B = qkv.is_nested() + ? get_nested_tensor_impl(qkv)->get_nested_size_tensor().size(0) + : qkv.size(0); + // TODO: calculate this without the std::vector -- NestedTensor_to_mask wants + // this too + auto T = qkv.is_nested() + ? 
NestedTensor_get_max_size(*get_nested_tensor_impl(qkv))[0] + : qkv.size(1); + auto _3D = qkv_bias.size(0); + auto D = _3D / 3; + TORCH_CHECK(D % num_head == 0); + const auto dim_per_head = D / num_head; + auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv_bias.options()); +#define CALL_KERNEL(assume_aligned) \ + transform_bias_rescale_qkv_kernel \ + <<>>( \ + qkv.packed_accessor64(), \ + qkv_bias.packed_accessor64(), \ + q_k_v.packed_accessor64(), \ + 1.0 / std::sqrt(static_cast(dim_per_head))) +#define CALL_ADD_PADDING_KERNEL(assume_aligned) \ + transform_bias_rescale_qkv_add_padding_kernel< \ + scalar_t, \ + accscalar_t, \ + assume_aligned> \ + <<>>( \ + nt_qkv->get_buffer() \ + .packed_accessor64(), \ + qkv_bias.packed_accessor64(), \ + offsets_ptr, \ + sizes_ptr, \ + q_k_v.packed_accessor64(), \ + 1.0 / std::sqrt(static_cast(dim_per_head))) + + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + qkv.scalar_type(), + "transform_bias_rescale_qkv", + [&] { + using accscalar_t = acc_type; + auto threads = std::max( + std::min(1024, D / TRANSFORM_BIAS_RESCALE_VEC), 1); + auto blocks = B * T; + const bool aligned = + ((dim_per_head % TRANSFORM_BIAS_RESCALE_VEC) == 0) && + ((reinterpret_cast(qkv_bias.data_ptr()) % + TRANSFORM_BIAS_RESCALE_VEC) == 0); + if (aligned) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + D % TRANSFORM_BIAS_RESCALE_VEC == 0, + "D = num_heads * dim_per_head, so we should have dim_per_head % " + "TRANSFORM_BIAS_RESCALE_VEC == 0 => " + "D % TRANSFORM_BIAS_RESCALE_VEC == 0"); + } + if (qkv.is_nested()) { + auto* nt_qkv = get_nested_tensor_impl(qkv); + auto sizes = collapse_dims_1_and_2(nt_qkv->get_nested_size_tensor()); + auto offsets = + NestedTensor_batch_offsets_from_size_tensor(sizes, sizes.numel()); + at::native::narrow(offsets, 0, sizes.numel() + 1, sizes.numel()) + .copy_(sizes.reshape({-1})); + auto metadata = offsets.to(at::Device(kCUDA), at::kInt, true, true); + const auto offsets_ptr = metadata.data_ptr(); + const auto sizes_ptr = offsets_ptr + sizes.numel() + 1; + const auto input_dim = sizes.sizes()[1]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input_dim == 1); + if (aligned && + ((reinterpret_cast(nt_qkv->get_buffer().data_ptr()) % + TRANSFORM_BIAS_RESCALE_VEC) == 0)) { + CALL_ADD_PADDING_KERNEL(true); + } else { + CALL_ADD_PADDING_KERNEL(false); + } + } else if (aligned) { + CALL_KERNEL(true); + } else { + CALL_KERNEL(false); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); +#undef CALL_ADD_PADDING_KERNEL +#undef CALL_KERNEL + auto q_k_v_s = + at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); + return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/transformers/transformer.cpp b/aten/src/ATen/native/transformers/transformer.cpp new file mode 100644 index 000000000000..a789aab18d6c --- /dev/null +++ b/aten/src/ATen/native/transformers/transformer.cpp @@ -0,0 +1,137 @@ +#include +#include +#include +#include + +#include + +#include + +namespace at { + +namespace native { + +namespace { +Tensor linear_for_ffn( + const Tensor& bias, + const Tensor& mat1, + const Tensor& mat2, + c10::optional use_gelu) { + if (mat1.is_nested()) { + return NestedTensor_times_Tensor_plus_Tensor_addmm( + bias, mat1, mat2.t(), 1, 1, use_gelu); + } + + auto mat1_ = mat1.view({mat1.sizes()[0] * mat1.sizes()[1], mat1.sizes()[2]}); + Tensor result; + if (use_gelu.has_value()) { + result = at::_addmm_activation(bias, mat1_, mat2.t(), 1, 1, *use_gelu); + } else { + 
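// Plain GEMM path: addmm(bias, x, W^T) returns bias + x @ W^T over the
// flattened [B * T, in_features] input, i.e. an ordinary linear layer. The
// branch above computes the same product but asks _addmm_activation to fuse
// the GELU (otherwise ReLU) epilogue into the GEMM, saving a separate
// elementwise pass over the feed-forward hidden activation.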
result = at::addmm(bias, mat1_, mat2.t()); + } + return result.view({mat1.sizes()[0], mat1.sizes()[1], -1}); +} + +Tensor ffn( + const Tensor& input, + const Tensor& w1, + const Tensor& b1, + const Tensor& w2, + const Tensor& b2, + bool use_gelu, + bool add_norm) { + TORCH_CHECK(add_norm == false, "TODO add_norm to be supported in FFN"); + TORCH_CHECK(input.dim() == 3, "batched input size should be 3"); + TORCH_CHECK(w1.dim() == 2, "2d weights expected"); + TORCH_CHECK(w2.dim() == 2, "2d weights expected"); + Tensor res = linear_for_ffn(b1, input, w1, use_gelu); + res = linear_for_ffn(b2, res, w2, c10::nullopt); + return res; +} +} // namespace + +Tensor transformer_encoder_layer_forward( + const Tensor& src, + const int64_t embed_dim, + const int64_t num_heads, + const Tensor& qkv_weight, + const Tensor& qkv_bias, + const Tensor& proj_weight, + const Tensor& proj_bias, + const bool use_gelu, + const bool norm_first, + const double layer_norm_eps, + const Tensor& layer_norm_weight_1, + const Tensor& layer_norm_bias_1, + const Tensor& layer_norm_weight_2, + const Tensor& layer_norm_bias_2, + const Tensor& ffn_weight_1, + const Tensor& ffn_bias_1, + const Tensor& ffn_weight_2, + const Tensor& ffn_bias_2, + const c10::optional& mask) { + { + const Tensor& check_for_empty = src.is_nested() ? get_nested_tensor_impl(src)->get_buffer() : src; + if (check_for_empty.numel() == 0) { + return src.is_nested() + ? at::detail::make_tensor(check_for_empty, get_nested_tensor_impl(src)->get_nested_size_tensor()) + : src.clone(); + } + } + TORCH_CHECK(!norm_first, "norm_first is not supported yet"); + const bool use_nested_tensor = src.is_nested(); + auto x = std::get<0>(native_multi_head_attention( + src, + src, + src, + embed_dim, + num_heads, + qkv_weight, + qkv_bias, + proj_weight, + proj_bias, + mask, + false /* need_weights */)); + if (use_nested_tensor) { + NestedTensor_add_NestedTensor_in_place(x, src); + x = NestedTensor_layer_norm( + x, layer_norm_weight_1, layer_norm_bias_1, layer_norm_eps); + } else { + x.add_(src); + x = at::layer_norm( + x, + {embed_dim}, + layer_norm_weight_1, + layer_norm_bias_1, + layer_norm_eps, + true); + } + + auto pre_ffn_res = x; + x = ffn( + x, + ffn_weight_1, + ffn_bias_1, + ffn_weight_2, + ffn_bias_2, + use_gelu, + /* add_norm* */ false); + if (use_nested_tensor) { + NestedTensor_add_NestedTensor_in_place(x, pre_ffn_res); + x = NestedTensor_layer_norm( + x, layer_norm_weight_2, layer_norm_bias_2, layer_norm_eps); + } else { + x.add_(pre_ffn_res); + x = at::layer_norm( + x, + {embed_dim}, + layer_norm_weight_2, + layer_norm_bias_2, + layer_norm_eps, + true); + } + return x; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/ts_native_functions.yaml b/aten/src/ATen/native/ts_native_functions.yaml new file mode 100644 index 000000000000..80febbd039fc --- /dev/null +++ b/aten/src/ATen/native/ts_native_functions.yaml @@ -0,0 +1,181 @@ +backend: Lazy +cpp_namespace: torch::lazy +full_codegen: + - _adaptive_avg_pool2d + - _adaptive_avg_pool2d_backward + - _log_softmax + - _log_softmax_backward_data + - _softmax + - _softmax_backward_data + - abs + - add.Tensor + - addcdiv + - addcmul + - addmm + - arange.start_out + - all + - any + - avg_pool2d + - avg_pool2d_backward + - baddbmm + - bernoulli + - bernoulli_.float + - binary_cross_entropy + - binary_cross_entropy_backward + - bitwise_and.Tensor + - bitwise_or.Tensor + - bmm + - cat + - clamp + - clamp_min + - constant_pad_nd + - convolution + - convolution_backward + - cos + - cumsum + - 
div.Tensor + - div.Tensor_mode + - elu + - elu_backward + - embedding + - embedding_dense_backward + - eq.Scalar + - eq.Tensor + - exp + - flip + - floor + - frac + - gather + - ge.Scalar + - ge.Tensor + - gelu + - gelu_backward + - glu + - glu_backward + - glu_jvp + - grid_sampler_2d + - grid_sampler_2d_backward + - gt.Scalar + - gt.Tensor + - hardsigmoid + - index_select + - kl_div_backward + - l1_loss_backward + - le.Scalar + - le.Tensor + - leaky_relu + - leaky_relu_backward + - log + - log2 + - logdet + - log_sigmoid_backward + - log_sigmoid_forward + - lt.Scalar + - lt.Tensor + - masked_fill_.Scalar + - masked_fill_.Tensor + - max + - max.dim + - max_pool2d_with_indices + - max_pool2d_with_indices_backward + - maximum + - mean + - mean.dim + - min + - minimum + - mm + - mul.Tensor + - mv + - native_dropout + - native_dropout_backward + - native_layer_norm + - native_layer_norm_backward + - ne.Scalar + - ne.Tensor + - neg + - nll_loss_backward + - nll_loss_forward + - nll_loss2d_backward + - nll_loss2d_forward + - nonzero + - norm.ScalarOpt_dim + - pow.Tensor_Scalar + - pow.Tensor_Tensor + - random_ + - random_.from + - random_.to + - reciprocal + - relu + - relu_ + - remainder.Tensor + - repeat + - rsqrt + - scatter_add + - sgn + - sigmoid + - sigmoid_backward + - silu + - smooth_l1_loss + - smooth_l1_loss_backward + - softplus + - softplus_backward + - sort + - sqrt + - stack + - std + - std.dim + - std.correction + - sub.Tensor + - sum + - sum.dim_IntList + - tanh + - tanh_backward + - threshold + - threshold_backward + - topk + - trace + - tril + - triu + - trunc + - upsample_bilinear2d + - upsample_bilinear2d_backward + - upsample_nearest2d + - upsample_nearest2d_backward + - zero_ + - narrow_copy.SymInt +supported: + - as_strided + - as_strided_ + - clone + - _copy_from + - _copy_from_and_resize + - diagonal + - empty.memory_format + - empty_strided + - expand + - fill_.Scalar + - narrow + - native_batch_norm + - native_batch_norm_backward + - normal_ + - max_pool3d_with_indices + - max_pool3d_with_indices_backward + - permute + - select.int + - slice.Tensor + - squeeze + - squeeze.dim + - squeeze_ + - squeeze_.dim + - t + - t_ + - _to_copy + - transpose.int + - transpose_ + - unsqueeze + - unsqueeze_ + - view + - alias + - _unsafe_view +autograd: + - max_pool3d diff --git a/aten/src/ATen/native/ufunc/add.h b/aten/src/ATen/native/ufunc/add.h new file mode 100644 index 000000000000..94a776728ead --- /dev/null +++ b/aten/src/ATen/native/ufunc/add.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#if !defined(__CUDACC__) && !defined(__HIPCC__) +#include +#include +#endif + +namespace at { +namespace native { +namespace ufunc { + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE T add(T self, T other, T alpha) __ubsan_ignore_undefined__ { + return self + alpha * other; +} + +#if !defined(__CUDACC__) && !defined(__HIPCC__) +using vec::Vectorized; +template +C10_ALWAYS_INLINE Vectorized add(Vectorized self, Vectorized other, Vectorized alpha) __ubsan_ignore_undefined__ { + return vec::fmadd(other, alpha, self); +} +#endif + +}}} // namespace at::native::ufunc diff --git a/aten/src/ATen/native/vulkan/Vulkan.cpp b/aten/src/ATen/native/vulkan/Vulkan.cpp deleted file mode 100644 index 6d253206bafd..000000000000 --- a/aten/src/ATen/native/vulkan/Vulkan.cpp +++ /dev/null @@ -1,1420 +0,0 @@ -#include -#include -#include -#include -#include - -#ifdef USE_VULKAN_WRAPPER -#include -#else -#include -#endif - -#include -#include - -#ifdef USE_VULKAN_SHADERC_RUNTIME -#include -#include -#else -#include 
-#endif - -#include -#include -#include -#include -#include -#include - - -#define VK_CHECK(f) \ - { \ - VkResult res = (f); \ - TORCH_CHECK(res == VK_SUCCESS, "Vulkan error VkResult:", res); \ - } - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -VContext::VContext(const bool enableValidationLayers) - : enableValidationLayers_(enableValidationLayers) { - createInstance(); - findPhysicalDevice(); - createDevice(); - - computeUnitFactory_ = std::make_unique(device_); -} - -VContext::~VContext() { - if (enableValidationLayers_) { - const auto func = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - if (func) { - func(instance_, debugReportCallback_, nullptr); - } - } - - // ComputeUnitFactory_ owns ComputeUnits and VkPipelineCache, need valid - // VkDevice for destructing, destructing before vkDestroyDevice - computeUnitFactory_.reset(); - - vkDestroyCommandPool(device_, commandPool_, nullptr); - vkDestroyDevice(device_, nullptr); - vkDestroyInstance(instance_, nullptr); -} - -static VKAPI_ATTR VkBool32 VKAPI_CALL debugReportCallbackFn( - const VkDebugReportFlagsEXT msgFlags, - const VkDebugReportObjectTypeEXT objectType, - const uint64_t object, - const size_t location, - const int32_t msgCode, - const char* const pLayerPrefix, - const char* const pMsg, - void* const pUserData) { - std::stringstream s; - s << pLayerPrefix << " " << msgCode << " " << pMsg << std::endl; - if (msgFlags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { - LOG(ERROR) << s.str(); - } else if (msgFlags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { - LOG(WARNING) << "WARN:" << s.str(); - } else if (msgFlags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { - LOG(WARNING) << "PERF_WARN:" << s.str(); - } else if (msgFlags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { - LOG(INFO) << s.str(); - } - return VK_FALSE; -} - -void VContext::createInstance() { - std::vector enabledExtensions; - if (enableValidationLayers_) { - uint32_t layerPresentCount = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties(&layerPresentCount, nullptr)); - std::vector layerProps(layerPresentCount); - VK_CHECK(vkEnumerateInstanceLayerProperties(&layerPresentCount, layerProps.data())); - std::array instanceLayers{ - "VK_LAYER_GOOGLE_unique_objects", - "VK_LAYER_GOOGLE_threading", - "VK_LAYER_LUNARG_object_tracker", - "VK_LAYER_LUNARG_core_validation", - "VK_LAYER_LUNARG_parameter_validation", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& wantedLayer : instanceLayers) { - for (const auto& presentLayer : layerProps) { - if (strcmp(wantedLayer, presentLayer.layerName) == 0) { - enabledValidationLayers_.push_back(wantedLayer); - break; - } - } - } - - uint32_t extCount = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties(nullptr, &extCount, nullptr)); - std::vector extProps(extCount); - VK_CHECK(vkEnumerateInstanceExtensionProperties(nullptr, &extCount, extProps.data())); - bool foundExt = false; - for (VkExtensionProperties p : extProps) { - if (strcmp(VK_EXT_DEBUG_REPORT_EXTENSION_NAME, p.extensionName) == 0) { - foundExt = true; - break; - } - } - if (foundExt) { - enabledExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); - } - } - - VkApplicationInfo applicationInfo{}; - applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; - applicationInfo.pApplicationName = "PyTorch"; - applicationInfo.applicationVersion = 0; - applicationInfo.pEngineName = "PyTorch"; - applicationInfo.engineVersion = 0; - applicationInfo.apiVersion = VK_API_VERSION_1_0; - - 
VkInstanceCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; - createInfo.flags = 0; - createInfo.pApplicationInfo = &applicationInfo; - createInfo.enabledLayerCount = enabledValidationLayers_.size(); - createInfo.ppEnabledLayerNames = enabledValidationLayers_.data(); - createInfo.enabledExtensionCount = enabledExtensions.size(); - createInfo.ppEnabledExtensionNames = enabledExtensions.data(); - - VK_CHECK(vkCreateInstance(&createInfo, nullptr, &instance_)); - - if (enableValidationLayers_) { - VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{}; - debugReportCallbackCreateInfo.sType = - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT; - debugReportCallbackCreateInfo.flags = VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT; - debugReportCallbackCreateInfo.pfnCallback = &debugReportCallbackFn; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkCreateDebugReportCallbackEXT"); - TORCH_CHECK( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance_, - &debugReportCallbackCreateInfo, - nullptr, - &debugReportCallback_)); - } -} - -void VContext::findPhysicalDevice() { - uint32_t deviceCount = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance_, &deviceCount, nullptr)); - TORCH_CHECK( - deviceCount > 0, "Vulkan: Could not find a device with vulkan support"); - std::vector devices(deviceCount); - VK_CHECK(vkEnumeratePhysicalDevices(instance_, &deviceCount, devices.data())); - physicalDevice_ = devices[0]; -} - -uint32_t VContext::getComputeQueueFamilyIndex() { - uint32_t queueFamilyCount = 0; - - vkGetPhysicalDeviceQueueFamilyProperties( - physicalDevice_, &queueFamilyCount, nullptr); - TORCH_CHECK( - queueFamilyCount > 0, "Vulkan: Invalid number of queue families"); - std::vector queueFamilies(queueFamilyCount); - vkGetPhysicalDeviceQueueFamilyProperties( - physicalDevice_, &queueFamilyCount, queueFamilies.data()); - - for (const auto i : c10::irange(queueFamilies.size())) { - VkQueueFamilyProperties props = queueFamilies[i]; - if (props.queueCount > 0 && (props.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; - } - } - - TORCH_CHECK( - false, "Vulkan: Could not find a queue family that supports operations"); -} - -void VContext::createDevice() { - VkDeviceQueueCreateInfo queueCreateInfo{}; - queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queueFamilyIndex_ = getComputeQueueFamilyIndex(); - queueCreateInfo.queueFamilyIndex = queueFamilyIndex_; - queueCreateInfo.queueCount = 1; - const float queuePriorities = 1.0f; - queueCreateInfo.pQueuePriorities = &queuePriorities; - VkDeviceCreateInfo deviceCreateInfo{}; - VkPhysicalDeviceFeatures deviceFeatures{}; - - deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - deviceCreateInfo.enabledLayerCount = enabledValidationLayers_.size(); - deviceCreateInfo.ppEnabledLayerNames = enabledValidationLayers_.data(); - deviceCreateInfo.pQueueCreateInfos = &queueCreateInfo; - - deviceCreateInfo.queueCreateInfoCount = 1; - deviceCreateInfo.pEnabledFeatures = &deviceFeatures; - - VK_CHECK( - vkCreateDevice(physicalDevice_, &deviceCreateInfo, nullptr, &device_)); - queue_ = {}; - vkGetDeviceQueue(device_, queueFamilyIndex_, 0, &queue_); - - VkPhysicalDeviceProperties physicalDeviceProperties{}; - vkGetPhysicalDeviceProperties(physicalDevice_, 
&physicalDeviceProperties); - - VkCommandPoolCreateInfo commandPoolCreateInfo{}; - commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - commandPoolCreateInfo.flags = 0; - commandPoolCreateInfo.queueFamilyIndex = queueFamilyIndex_; - VK_CHECK(vkCreateCommandPool( - device_, &commandPoolCreateInfo, nullptr, &commandPool_)); - physicalDeviceLimits_ = physicalDeviceProperties.limits; -} - -static std::unique_ptr gContext; -const VContext& context() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(gContext); - return *gContext; -} - -bool initVulkanContextOnce() { - static const int once = []() { -#ifdef USE_VULKAN_WRAPPER - if (!InitVulkan()) { - TORCH_WARN("Vulkan Wrapper Failed to InitVulkan"); - return 1; - } -#endif - gContext = std::make_unique(kEnableValidationLayers); - if (!gContext) { - TORCH_WARN("Vulkan Failed to create Vulkan Context"); - return 2; - } - return 0; - }(); - ((void)once); - return static_cast(gContext); -} - -bool is_available() { - return initVulkanContextOnce(); -} - -uint32_t findMemoryType( - const VkPhysicalDevice physicalDevice, - const uint32_t memoryTypeBits, - const VkMemoryPropertyFlags properties) { - VkPhysicalDeviceMemoryProperties memoryProperties{}; - vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memoryProperties); - for (const auto i : c10::irange(memoryProperties.memoryTypeCount)) { - if ((memoryTypeBits & (1 << i)) && - ((memoryProperties.memoryTypes[i].propertyFlags & properties) == - properties)) { - return i; - } - } - return -1; -} - -void VBuffer::MapMemory::flushWriteToDevice() { - VkMappedMemoryRange range{}; - range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - range.memory = deviceMemory_; - range.offset = offset_; - range.size = size_; - range.pNext = nullptr; - - VK_CHECK(vkFlushMappedMemoryRanges(context().device(), 1, &range)); -} - -void VBuffer::MapMemory::flushWriteToHost() { - VkMappedMemoryRange range{}; - range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - range.memory = deviceMemory_; - range.offset = offset_; - range.size = size_; - range.pNext = nullptr; - - VK_CHECK(vkInvalidateMappedMemoryRanges(context().device(), 1, &range)); -} - -VBuffer::VBuffer( - const VkDeviceSize bufferSizeBytes, - const VkBufferUsageFlags bufferUsageFlags, - const VkDescriptorType descriptorType) - : bufferSizeBytes_(bufferSizeBytes), descriptorType_(descriptorType) { - const auto device = context().device(); - const auto physicalDevice = context().physicalDevice(); - VkBufferCreateInfo bufferCreateInfo{}; - bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferCreateInfo.size = bufferSizeBytes_; - bufferCreateInfo.usage = bufferUsageFlags; - bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - VK_CHECK(vkCreateBuffer(device, &bufferCreateInfo, nullptr, &buffer_)); - VkMemoryRequirements memoryRequirements; - vkGetBufferMemoryRequirements(device, buffer_, &memoryRequirements); - VkMemoryAllocateInfo allocateInfo{}; - allocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocateInfo.allocationSize = memoryRequirements.size; - allocateInfo.memoryTypeIndex = findMemoryType( - physicalDevice, - memoryRequirements.memoryTypeBits, - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - VK_CHECK(vkAllocateMemory(device, &allocateInfo, nullptr, &bufferMemory_)); - VK_CHECK(vkBindBufferMemory(device, buffer_, bufferMemory_, 0)); -} - -VBuffer::~VBuffer() { - vkFreeMemory(context().device(), bufferMemory_, nullptr); - vkDestroyBuffer(context().device(), buffer_, nullptr); -} 
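// findMemoryType above scans the adapter's memory types for the first index
// that is both allowed by the resource's memoryTypeBits mask and carries all
// of the requested property flags (host-visible/coherent for VBuffer above,
// device-local for VImage below). A small standalone sketch of that selection
// rule over plain integers, kept here only as an illustration of the bit tests:
//
//   #include <cstdint>
//
//   int find_memory_type_ref(uint32_t memory_type_bits,
//                            const uint32_t* type_property_flags,
//                            uint32_t type_count,
//                            uint32_t wanted_properties) {
//     for (uint32_t i = 0; i < type_count; ++i) {
//       const bool allowed = (memory_type_bits & (1u << i)) != 0;
//       const bool has_all = (type_property_flags[i] & wanted_properties) ==
//           wanted_properties;
//       if (allowed && has_all) {
//         return static_cast<int>(i);
//       }
//     }
//     return -1;  // same "not found" sentinel as findMemoryType above
//   }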
- -void VBuffer::copy_from_device_to_host( - void* const outputData, const int64_t size) const { - auto mm = map(); - TORCH_INTERNAL_ASSERT(mm.ptr(), "Vulkan: Failed to map Vulkan Buffer memory"); - ::memcpy(outputData, mm.ptr(), size); - mm.flushWriteToHost(); -} - -void VBuffer::copy_from_host_to_device( - const void* const data, const int64_t size) { - auto mm = map(); - TORCH_INTERNAL_ASSERT(mm.ptr(), "Vulkan: Failed to map Vulkan Buffer memory"); - ::memcpy(mm.ptr(), data, size); - mm.flushWriteToDevice(); -} - -void VBuffer::set_zeros() { - auto mm = map(); - TORCH_INTERNAL_ASSERT(mm.ptr(), "Vulkan: Failed to map Vulkan Buffer memory"); - ::memset(mm.ptr(), 0, bufferSizeBytes_); -} - -VkDescriptorBufferInfo VBuffer::makeDescriptorBufferInfo() const { - VkDescriptorBufferInfo info{}; - info.buffer = buffer_; - info.offset = 0; - info.range = bufferSizeBytes_; - return info; -} - -VkWriteDescriptorSet VBuffer::makeWriteDescriptorSet( - const VkDescriptorSet descriptorSet, - const uint32_t binding, - const VkDescriptorBufferInfo* const bufferInfo) const { - VkWriteDescriptorSet writeSet{}; - writeSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeSet.pNext = nullptr; - writeSet.dstSet = descriptorSet; - writeSet.dstBinding = binding; - writeSet.dstArrayElement = 0; - writeSet.descriptorCount = 1; - writeSet.descriptorType = descriptorType_; - writeSet.pImageInfo = nullptr; - writeSet.pBufferInfo = bufferInfo; - writeSet.pTexelBufferView = nullptr; - return writeSet; -} - -void VBuffer::bind(const VkDescriptorSet descriptorSet, const uint32_t binding) const { - const auto descrBufferInfo = makeDescriptorBufferInfo(); - const auto writeDescrSet = - makeWriteDescriptorSet(descriptorSet, binding, &descrBufferInfo); - vkUpdateDescriptorSets(context().device(), 1, &writeDescrSet, 0, nullptr); -} - -void VBuffer::addBufferMemoryBarrier( - const VkCommandBuffer commandBuffer, - const VkDeviceSize offset, - const VkDeviceSize size) const { - VkBufferMemoryBarrier barrier{}; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.buffer = buffer_; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.offset = offset; - barrier.pNext = nullptr; - barrier.size = size; - barrier.srcAccessMask = - VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = - VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT; - - vkCmdPipelineBarrier( - commandBuffer, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, - 0, - nullptr, - 1, - &barrier, - 0, - nullptr); -} - -VImage::VImage(const ImageSize imageSize, const ImageSize dataSize) - : imageSize_(imageSize), dataSize_(dataSize) { - const auto device = context().device(); - const auto physicalDevice = context().physicalDevice(); - - VkImageCreateInfo imageInfo{}; - imageInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - imageInfo.imageType = kImageType; - imageInfo.extent.width = imageSize_[0]; - imageInfo.extent.height = imageSize_[1]; - imageInfo.extent.depth = imageSize_[2]; - - imageInfo.mipLevels = 1; - imageInfo.arrayLayers = 1; - imageInfo.format = kFormat; - imageInfo.tiling = VK_IMAGE_TILING_OPTIMAL; - imageInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageInfo.usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; - imageInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - imageInfo.samples = VK_SAMPLE_COUNT_1_BIT; - 
imageInfo.pNext = nullptr; - imageInfo.flags = 0; - imageLayout_ = VK_IMAGE_LAYOUT_UNDEFINED; - - VK_CHECK(vkCreateImage(device, &imageInfo, nullptr, &image_)); - - VkMemoryRequirements memReqs{}; - vkGetImageMemoryRequirements(device, image_, &memReqs); - VkMemoryAllocateInfo allocInfo{}; - allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - allocInfo.allocationSize = memReqs.size; - allocInfo.memoryTypeIndex = findMemoryType( - physicalDevice, - memReqs.memoryTypeBits, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); - - VK_CHECK(vkAllocateMemory(device, &allocInfo, nullptr, &imageMemory_)); - VK_CHECK(vkBindImageMemory(device, image_, imageMemory_, 0)); - - const VkImageViewCreateInfo imageViewCreateInfo = makeImageViewCreateInfo(); - VK_CHECK( - vkCreateImageView(device, &imageViewCreateInfo, nullptr, &imageView_)); - - const VkSamplerCreateInfo samplerCreateInfo = makeSamplerCreateInfo(); - VK_CHECK(vkCreateSampler(device, &samplerCreateInfo, nullptr, &sampler_)); -} - -VImage::~VImage() { - vkFreeMemory(context().device(), imageMemory_, nullptr); - vkDestroySampler(context().device(), sampler_, nullptr); - vkDestroyImageView(context().device(), imageView_, nullptr); - vkDestroyImage(context().device(), image_, nullptr); -} - -VkImageViewCreateInfo VImage::makeImageViewCreateInfo() const { - VkImageViewCreateInfo info{}; - info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; - info.image = image_; - info.viewType = kImageViewType; - info.format = kFormat; - info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - info.subresourceRange.baseMipLevel = 0; - info.subresourceRange.levelCount = 1; - info.subresourceRange.baseArrayLayer = 0; - info.subresourceRange.layerCount = 1; - return info; -} - -VkSamplerCreateInfo VImage::makeSamplerCreateInfo() const { - VkSamplerCreateInfo info{}; - info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - info.magFilter = kFilter; - info.minFilter = kFilter; - info.addressModeU = kSamplerAddressMode; - info.addressModeV = kSamplerAddressMode; - info.addressModeW = kSamplerAddressMode; - info.anisotropyEnable = VK_FALSE; - info.maxAnisotropy = 1.0f; - info.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; - info.compareEnable = VK_FALSE; - info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; - info.mipLodBias = 0.0f; - info.minLod = 0.0f; - info.maxLod = 0.0f; - return info; -} - -VkDescriptorImageInfo VImage::makeDescriptorImageInfo( - const VkImageLayout imageLayout) const { - VkDescriptorImageInfo info{}; - info.sampler = sampler_; - info.imageView = imageView_; - info.imageLayout = imageLayout; - return info; -} - -VkWriteDescriptorSet VImage::makeWriteDescriptorSet( - const VkDescriptorSet descriptorSet, - const uint32_t binding, - const VkDescriptorType descriptorType, - const VkDescriptorImageInfo* const imageInfo) const { - VkWriteDescriptorSet writeSet{}; - writeSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeSet.pNext = nullptr; - writeSet.dstSet = descriptorSet; - writeSet.dstBinding = binding; - writeSet.dstArrayElement = 0; - writeSet.descriptorCount = 1; - writeSet.descriptorType = descriptorType, writeSet.pImageInfo = imageInfo; - writeSet.pBufferInfo = nullptr; - writeSet.pTexelBufferView = nullptr; - return writeSet; -} - -void VImage::bind( - const VkDescriptorSet descriptorSet, - const uint32_t binding, - const VkDescriptorType descriptorType, - const VkImageLayout imageLayout) const { - const auto descrImageInfo = makeDescriptorImageInfo(imageLayout); - const auto writeDescrSet = makeWriteDescriptorSet( - 
descriptorSet, binding, descriptorType, &descrImageInfo); - vkUpdateDescriptorSets(context().device(), 1, &writeDescrSet, 0, nullptr); -} - -void VImage::bindShaderRead( - const VkDescriptorSet descriptorSet, const uint32_t binding) const { - bind( - descriptorSet, - binding, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); -} - -void VImage::bindStorageImage( - const VkDescriptorSet descriptorSet, const uint32_t binding) const { - bind( - descriptorSet, - binding, - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_IMAGE_LAYOUT_GENERAL); -} - -void VImage::addImageMemoryBarrier( - const VkCommandBuffer commandBuffer, - const VkImageLayout newLayout) const { - const VkImageLayout oldLayout = imageLayout_; - if (oldLayout == newLayout) { - return; - } - - VkImageMemoryBarrier barrier{}; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image_; - barrier.newLayout = newLayout; - barrier.oldLayout = oldLayout; - barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.layerCount = 1; - - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - if (oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_GENERAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } else if ( - oldLayout == VK_IMAGE_LAYOUT_UNDEFINED && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = 0; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - } else if ( - oldLayout == VK_IMAGE_LAYOUT_GENERAL && - newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) { - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - } else if ( - oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL && - newLayout == VK_IMAGE_LAYOUT_GENERAL) { - barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - } else { - TORCH_INTERNAL_ASSERT( - false, "Vulkan: Unsupported Vulkan Image Layout transition"); - } - vkCmdPipelineBarrier( - commandBuffer, - srcStageMask, - dstStageMask, - 0, - 0, - nullptr, - 0, - nullptr, - 1, - &barrier); - imageLayout_ = newLayout; -} - -void VImage::addImageMemoryBarrierToGeneral( - const VkCommandBuffer commandBuffer) const { - addImageMemoryBarrier(commandBuffer, VK_IMAGE_LAYOUT_GENERAL); -} - -void VImage::addImageMemoryBarrierToShaderRead( - const VkCommandBuffer commandBuffer) const { - addImageMemoryBarrier( - commandBuffer, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); -} - -VkDescriptorSetLayoutBinding descriptorSetLayoutBinding( - const uint32_t binding, - const VkDescriptorType descriptorType) { - return {binding, descriptorType, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}; -} - -void createDescriptorSetLayout( - const VkDevice device, - const VkDescriptorSetLayoutBinding* const bindings, - const uint32_t bindingCount, - VkDescriptorSetLayout* const setLayout) { - VkDescriptorSetLayoutCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; - createInfo.pNext = nullptr; - 
createInfo.flags = 0; - createInfo.bindingCount = bindingCount; - createInfo.pBindings = bindings; - VK_CHECK( - vkCreateDescriptorSetLayout(device, &createInfo, nullptr, setLayout)); -} - -void createDescriptorPool( - const VkDevice device, - const VkDescriptorPoolSize* poolSizes, - const uint32_t poolSizeCount, - const uint32_t maxSets, - VkDescriptorPool* const descriptorPool) { - VkDescriptorPoolCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - createInfo.pNext = nullptr; - createInfo.flags = 0; - createInfo.maxSets = maxSets; - createInfo.poolSizeCount = poolSizeCount; - createInfo.pPoolSizes = poolSizes; - VK_CHECK( - vkCreateDescriptorPool(device, &createInfo, nullptr, descriptorPool)); -} - -void allocateDescriptorSet( - const VkDevice device, - const VkDescriptorPool descriptorPool, - const VkDescriptorSetLayout* const descriptorSetLayout, - VkDescriptorSet* const descriptorSet) { - VkDescriptorSetAllocateInfo allocateInfo{}; - allocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - allocateInfo.pNext = nullptr; - allocateInfo.descriptorPool = descriptorPool; - allocateInfo.descriptorSetCount = 1; - allocateInfo.pSetLayouts = descriptorSetLayout; - VK_CHECK(vkAllocateDescriptorSets(device, &allocateInfo, descriptorSet)); -} - -void createDescriptorSetLayoutSinglePool( - const VkDevice device, - const std::vector& descrTypes, - VkDescriptorSetLayout* const descrSetLayout, - VkDescriptorPool* const descrPool, - VkDescriptorSet* const descrSet) { - const auto size = descrTypes.size(); - std::vector bindings; - std::vector poolSizes; - uint32_t i = 0; - for (const auto& descrType : descrTypes) { - bindings.push_back(descriptorSetLayoutBinding(i, descrType)); - poolSizes.push_back(VkDescriptorPoolSize{descrType, 1}); - i++; - } - createDescriptorSetLayout(device, bindings.data(), size, descrSetLayout); - createDescriptorPool( - device, poolSizes.data(), size, 1 /* maxSets */, descrPool); - allocateDescriptorSet(device, *descrPool, descrSetLayout, descrSet); -} - -void allocateCommandBuffer(VkDevice device, VkCommandBuffer* commandBuffer) { - VkCommandBufferAllocateInfo commandBufferAllocateInfo{}; - commandBufferAllocateInfo.sType = - VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - commandBufferAllocateInfo.commandPool = context().commandPool(); - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - - VK_CHECK(vkAllocateCommandBuffers( - device, &commandBufferAllocateInfo, commandBuffer)); -} - -void beginCommandBuffer(VkCommandBuffer commandBuffer) { - VkCommandBufferBeginInfo beginInfo{}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkBeginCommandBuffer(commandBuffer, &beginInfo)); -} - -void endCommandBuffer(VkCommandBuffer commandBuffer) { - VK_CHECK(vkEndCommandBuffer(commandBuffer)); -} - -void submitAndWaitCommandBuffer( - VkDevice device, - VkQueue queue, - VkCommandBuffer commandBuffer) { - VkSubmitInfo submitInfo{}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; - - VkFence fence; - VkFenceCreateInfo fenceCreateInfo{}; - fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceCreateInfo.flags = 0; - VK_CHECK(vkCreateFence(device, &fenceCreateInfo, NULL, &fence)) - - VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, fence)); - vkWaitForFences(device, 1, 
&fence, VK_TRUE, ComputeUnit::kFenceTimeoutNanos); - - vkDestroyFence(device, fence, NULL); -} - -ComputeUnit::~ComputeUnit() { - vkDestroyShaderModule(context().device(), computeShaderModule_, nullptr); - vkDestroyPipelineLayout(context().device(), pipelineLayout_, nullptr); - vkDestroyPipeline(context().device(), pipeline_, nullptr); -} - -void ComputeUnit::createComputePipeline( - const uint32_t* const code, - const uint32_t codeSize, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - const auto device = context().device(); - VkShaderModuleCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; - createInfo.pCode = code; - createInfo.codeSize = codeSize; - - VK_CHECK(vkCreateShaderModule( - device, &createInfo, nullptr, &computeShaderModule_)); - - VkSpecializationMapEntry spMapEntries[3]; - { - uint32_t offset = 0; - size_t size = sizeof(WorkGroupSize::x); - spMapEntries[0].constantID = 0; - spMapEntries[0].offset = offset; - spMapEntries[0].size = size; - offset += size; - size = sizeof(WorkGroupSize::y); - spMapEntries[1].constantID = 1; - spMapEntries[1].offset = offset; - spMapEntries[1].size = size; - offset += size; - size = sizeof(WorkGroupSize::z); - spMapEntries[2].constantID = 2; - spMapEntries[2].offset = offset; - spMapEntries[2].size = size; - } - VkSpecializationInfo spInfo; - spInfo.mapEntryCount = 3; - spInfo.pMapEntries = spMapEntries; - spInfo.dataSize = sizeof(workGroupSize); - spInfo.pData = &workGroupSize; - - VkPipelineShaderStageCreateInfo shaderStageCreateInfo{}; - shaderStageCreateInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - shaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; - shaderStageCreateInfo.module = computeShaderModule_; - shaderStageCreateInfo.pName = "main"; - shaderStageCreateInfo.pSpecializationInfo = &spInfo; - - VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{}; - pipelineLayoutCreateInfo.sType = - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; - pipelineLayoutCreateInfo.setLayoutCount = 1; - pipelineLayoutCreateInfo.pSetLayouts = &descrSetLayout; - - VK_CHECK(vkCreatePipelineLayout( - device, &pipelineLayoutCreateInfo, nullptr, &pipelineLayout_)); - - VkComputePipelineCreateInfo pipelineCreateInfo{}; - pipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; - pipelineCreateInfo.stage = shaderStageCreateInfo; - pipelineCreateInfo.layout = pipelineLayout_; - - VK_CHECK(vkCreateComputePipelines( - device, pipelineCache, 1, &pipelineCreateInfo, nullptr, &pipeline_)); -} - -#ifdef USE_VULKAN_SHADERC_RUNTIME -void ComputeUnit::createComputePipelineCompile( - const std::string& glslSrc, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - shaderc::Compiler compiler{}; - shaderc::CompileOptions options{}; -#ifdef DEBUG - options.SetGenerateDebugInfo(); -#endif - options.SetTargetEnvironment( - shaderc_target_env_vulkan, shaderc_env_version_vulkan_1_0); - options.SetForcedVersionProfile(450, shaderc_profile_core); - const shaderc::SpvCompilationResult compilationResult = compiler.CompileGlslToSpv( - glslSrc.c_str(), - glslSrc.size(), - shaderc_compute_shader, - "vulkan_shader.comp", - "main", - options); - const auto compilationStatus = compilationResult.GetCompilationStatus(); - TORCH_INTERNAL_ASSERT( - compilationStatus == shaderc_compilation_status_success, - "Shader compilation error: status:", - compilationStatus, - 
compilationResult.GetErrorMessage()); - const std::vector shaderSpvCode( - compilationResult.cbegin(), compilationResult.cend()); - const auto codeSizeBytes = 4 * shaderSpvCode.size(); - createComputePipeline( - shaderSpvCode.data(), - codeSizeBytes, - pipelineCache, - descrSetLayout, - workGroupSize); -} -#endif - -void ComputeUnit::createCommandBuffer(VkDescriptorSet& descriptorSet) { - const auto device = context().device(); - VkCommandBufferAllocateInfo commandBufferAllocateInfo{}; - commandBufferAllocateInfo.sType = - VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - commandBufferAllocateInfo.commandPool = context().commandPool(); - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - - VK_CHECK(vkAllocateCommandBuffers( - device, &commandBufferAllocateInfo, &commandBuffer_)); - - VkCommandBufferBeginInfo beginInfo{}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - VK_CHECK(vkBeginCommandBuffer(commandBuffer_, &beginInfo)); - - vkCmdBindPipeline(commandBuffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_); - vkCmdBindDescriptorSets( - commandBuffer_, - VK_PIPELINE_BIND_POINT_COMPUTE, - pipelineLayout_, - 0, - 1, - &descriptorSet, - 0, - nullptr); -} - -void ComputeUnit::addMemoryBarrier( - const VkPipelineStageFlags srcStageMask, - const VkAccessFlags srcAccessMask, - const VkPipelineStageFlags dstStageMask, - const VkAccessFlags dstAccessMask) { - VkMemoryBarrier barrier{}; - barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = srcAccessMask; - barrier.dstAccessMask = dstAccessMask; - vkCmdPipelineBarrier( - commandBuffer_, - srcStageMask, - dstStageMask, - 0, - 1, - &barrier, - 0, - nullptr, - 0, - nullptr); -} - -void ComputeUnit::dispatchCommandBuffer( - const uint32_t groupCountX, - const uint32_t groupCountY, - const uint32_t groupCountZ) { - vkCmdDispatch(commandBuffer_, groupCountX, groupCountY, groupCountZ); -} - -void ComputeUnit::endCommandBuffer() { - at::native::vulkan::detail::endCommandBuffer(commandBuffer_); -} - -void ComputeUnit::dispatchCommandBuffer( - const uint32_t gridX, - const uint32_t gridY, - const uint32_t gridZ, - const WorkGroupSize workGroupSize) { - dispatchCommandBuffer( - UP_DIV(gridX, workGroupSize.x), - UP_DIV(gridY, workGroupSize.y), - UP_DIV(gridZ, workGroupSize.z)); -} - -void ComputeUnit::submitAndWaitCommandBuffer() { - VkSubmitInfo submitInfo{}; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer_; - - VkFence fence{}; - VkFenceCreateInfo fenceCreateInfo{}; - fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceCreateInfo.flags = 0; - VK_CHECK(vkCreateFence(context().device(), &fenceCreateInfo, NULL, &fence)) - - VK_CHECK(vkQueueSubmit(context().queue(), 1, &submitInfo, fence)); - vkWaitForFences(context().device(), 1, &fence, VK_TRUE, kFenceTimeoutNanos); - - vkDestroyFence(context().device(), fence, NULL); -} - -VBuffer makeUniformConstBuffer(const void* const ptr, const VkDeviceSize size) { - VBuffer constBuffer = VBuffer::makeUniformBuffer(size); - constBuffer.copy_from_host_to_device(ptr, size); - return constBuffer; -} - -ComputeUnitFactory::ComputeUnitFactory(const VkDevice device) - : device_(device) { - VkPipelineCacheCreateInfo createInfo{}; - createInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; - createInfo.pNext = nullptr; - 
createInfo.flags = 0; - createInfo.initialDataSize = 0; - createInfo.pInitialData = nullptr; - VK_CHECK(vkCreatePipelineCache( - device_, &createInfo, nullptr /* allocator */, &pipelineCache_)); -} - -ComputeUnitFactory::~ComputeUnitFactory() { - vkDestroyPipelineCache(device_, pipelineCache_, nullptr /* allocator */); -} - -std::string ComputeUnitFactory::getCacheKey( - const char* const key, - const WorkGroupSize workGroupSize) { - std::stringstream ss; - ss << key << ':' << workGroupSize.x << ':' << workGroupSize.y << ':' - << workGroupSize.z; - return ss.str(); -} - -ComputeUnit& ComputeUnitFactory::get( - const std::string& cacheKey, - const std::function()> factoryFn) { - const auto it = computeUnits_.find(cacheKey); - if (it != computeUnits_.end()) { - return *(it->second.get()); - } - auto computeUnit = factoryFn(); - computeUnits_.insert(std::make_pair(cacheKey, computeUnit)); - return *(computeUnit.get()); -} - -#ifdef USE_VULKAN_SHADERC_RUNTIME -ComputeUnit& ComputeUnitFactory::get( - const char* const key, - const char* const glslSrc, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - return get( - getCacheKey(key, workGroupSize), - [glslSrc, - pipelineCache = pipelineCache_, - descrSetLayout, - workGroupSize]() { - return std::make_shared( - glslSrc, pipelineCache, descrSetLayout, workGroupSize); - }); -} -#else -ComputeUnit& ComputeUnitFactory::get( - const char* const key, - const uint32_t* const code, - const uint32_t codeSize, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - return get( - getCacheKey(key, workGroupSize), - [code, - codeSize, - pipelineCache = pipelineCache_, - descrSetLayout, - workGroupSize]() { - return std::make_shared( - code, codeSize, pipelineCache, descrSetLayout, workGroupSize); - }); -} -#endif - -// VBuffer <-> VImage -void copy_buffer_to_image(const VBuffer& buffer, VImage& image) { - const auto device = context().device(); - - VkDescriptorSetLayout descrSetLayout{}; - VkDescriptorSetLayoutBinding bindings[] = { - descriptorSetLayoutBinding(0, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE), - descriptorSetLayoutBinding(1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER), - descriptorSetLayoutBinding(2, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)}; - createDescriptorSetLayout( - device, bindings, 3 /* bindingsCount */, &descrSetLayout); - - VkDescriptorPool descrPool{}; - VkDescriptorPoolSize poolSizes[] = {{VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1}, - {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1}, - {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1}}; - createDescriptorPool( - device, poolSizes, 3 /* poolSizeCount */, 1 /* maxSets */, &descrPool); - - VkDescriptorSet descrSet{}; - allocateDescriptorSet(device, descrPool, &descrSetLayout, &descrSet); - - image.bindStorageImage(descrSet, 0); - buffer.bind(descrSet, 1); - WorkGroupSize workGroupSize{8, 8, 1}; - - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(nchw_to_image), descrSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descrSet); - - image.addImageMemoryBarrierToGeneral(computeUnit.commandBuffer()); - buffer.addBufferMemoryBarrier( - computeUnit.commandBuffer(), 0, buffer.sizeBytes()); - computeUnit.addMemoryBarrier( - VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_WRITE_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT); - computeUnit.dispatchCommandBuffer( - image.w(), image.h(), image.d(), workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, 
descrPool, nullptr); - vkDestroyDescriptorSetLayout(device, descrSetLayout, nullptr); -} - -void copy_image_to_buffer( - const VImage& image, - VBuffer& buffer, - bool addBufferMemoryBarrierForHost) { - const auto device = context().device(); - TORCH_INTERNAL_ASSERT( - buffer.sizeBytes() >= image.capacityBytes(), - "VulkanBuffer's capacity is less than VulkanImage capacity to copy from"); - - VkDescriptorSetLayout descrSetLayout{}; - const VkDescriptorSetLayoutBinding bindings[] = { - descriptorSetLayoutBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER), - descriptorSetLayoutBinding(1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER), - descriptorSetLayoutBinding(2, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)}; - createDescriptorSetLayout( - device, bindings, 3 /* bindingsCount */, &descrSetLayout); - - VkDescriptorPool descrPool{}; - const VkDescriptorPoolSize poolSizes[] = { - {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1}, - {VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1}, - {VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1}}; - createDescriptorPool( - device, poolSizes, 3 /* poolSizeCount */, 1 /* maxSets */, &descrPool); - - VkDescriptorSet descrSet{}; - allocateDescriptorSet(device, descrPool, &descrSetLayout, &descrSet); - - image.bindShaderRead(descrSet, 0); - buffer.bind(descrSet, 1); - - const WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(image_to_nchw), descrSetLayout, workGroupSize); - - computeUnit.createCommandBuffer(descrSet); - image.addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer( - image.w(), image.h(), image.d(), workGroupSize); - - if (addBufferMemoryBarrierForHost) { - computeUnit.addMemoryBarrier( - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_READ_BIT); - } - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, descrPool, nullptr); - vkDestroyDescriptorSetLayout(device, descrSetLayout, nullptr); -} // VBuffer <-> VImage - -void copy_buffer_to_buffer( - const VBuffer& srcBuffer, - VBuffer& dstBuffer, - VkDeviceSize size, - VkDeviceSize srcOffset, - VkDeviceSize dstOffset) { - auto device = context().device(); - VkCommandBuffer commandBuffer{}; - allocateCommandBuffer(device, &commandBuffer); - beginCommandBuffer(commandBuffer); - - VkBufferCopy copyRegion{}; - copyRegion.srcOffset = srcOffset; - copyRegion.dstOffset = dstOffset; - copyRegion.size = size; - vkCmdCopyBuffer( - commandBuffer, - srcBuffer.vkbuffer(), - dstBuffer.vkbuffer(), - 1, - ©Region); - - endCommandBuffer(commandBuffer); - submitAndWaitCommandBuffer(device, context().queue(), commandBuffer); -} - -// VulkanTensor - -class VulkanTensor::Impl final { - public: - explicit Impl(std::vector sizes) - : sizes_(std::move(sizes)), - strides_(std::vector(sizes_.size())), - numel_(c10::multiply_integers(sizes_)) { - TORCH_CHECK( - initVulkanContextOnce(), "Vulkan Failed to create Vulkan Context"); - } - - std::vector sizes() const { - return sizes_; - } - - std::vector strides() const { - return strides_; - } - - inline int64_t dim() const { - return sizes_.size(); - } - - inline int64_t numel() const { - return numel_; - } - - inline bool has_buffer() const { - return static_cast(buffer_); - } - - inline VBuffer* buffer() { - if (!has_buffer()) { - buffer_ = std::make_unique(buffer_size_for_sizes(sizes_)); - } - return buffer_.get(); - } - - const VBuffer* buffer() const { - if (!has_buffer()) { - buffer_ = 
std::make_unique(buffer_size_for_sizes(sizes_)); - } - return buffer_.get(); - } - - inline bool can_be_image() const { - return dim() <= 4; - } - - inline bool has_image() const { - return static_cast(image_); - } - - inline bool has_storage() { - return has_buffer(); - } - - ImageSizes imageSizes_W_H_NC4() { - TORCH_INTERNAL_ASSERT( - can_be_image(), - "Vulkan: Only Tensors with dim <= 4 can be represented as Vulkam Image"); - auto d = dim(); - int64_t _wd = 1; - int64_t _hd = 1; - int64_t _dd = 1; - if (d == 4) { - _wd = sizes_[3]; - _hd = sizes_[2]; - _dd = sizes_[1] * sizes_[0]; - } else if (d == 3) { - _wd = sizes_[2]; - _hd = sizes_[1]; - _dd = sizes_[0]; - } else if (d == 2) { - _wd = sizes_[1]; - _hd = sizes_[0]; - } else if (d == 1) { - _wd = sizes_[0]; - } - int32_t wd = safe_downcast(_wd); - int32_t hd = safe_downcast(_hd); - int32_t dd = safe_downcast(_dd); - return {{wd, hd, UP_DIV(dd, 4)}, {wd, hd, dd}}; - } - - VImage* image(const c10::optional imageSizes = c10::nullopt) { - if (image_) { - return image_.get(); - } - - if (imageSizes.has_value()) { - image_ = std::make_unique(*imageSizes); - return image_.get(); - } - - image_ = std::make_unique(imageSizes_W_H_NC4()); - if (buffer_) { - copy_buffer_to_image(*buffer_, *image_); - } - return image_.get(); - } - - const VImage* image( - c10::optional imageSizes = c10::nullopt) const { - return const_cast(this)->image(imageSizes); - } - - VkDeviceSize buffer_size_for_sizes(std::vector sizes) const { - const auto d = sizes.size(); - const auto numel = c10::multiply_integers(sizes); - VkDeviceSize bufferSize{sizeof(float) * numel}; - // alignment to be able to copy between image and buffer - if (d == 4) { - bufferSize = - sizeof(float) * ALIGN_UP4(sizes[0] * sizes[1]) * sizes[2] * sizes[3]; - } else if (d == 3) { - bufferSize = sizeof(float) * ALIGN_UP4(sizes[0]) * sizes[1] * sizes[2]; - } else if (d == 2) { - bufferSize = sizeof(float) * 4 * sizes[0] * sizes[1]; - } else if (d == 1) { - bufferSize = sizeof(float) * 4 * sizes[0]; - } - return bufferSize; - } - - void allocate_storage() { - buffer_ = std::make_unique(buffer_size_for_sizes(sizes_)); - } - - void set_data_from_host(const float* const inputData) { - buffer()->copy_from_host_to_device( - (const void*)inputData, sizeof(float) * numel_); - } - - void copy_data_to_host(float* const outputData) const { - sync_image_to_buffer(); - buffer()->copy_from_device_to_host(outputData, sizeof(float) * numel_); - } - - void sync_image_to_buffer() const { - if (has_image()) { - copy_image_to_buffer( - *image(), - *(const_cast(buffer())), - true /* memory barrier for host memory map */); - } - } - - private: - std::vector sizes_; - std::vector strides_; - int64_t numel_; - mutable std::unique_ptr buffer_; - std::unique_ptr image_; -}; - -std::shared_ptr VulkanTensor::impl() { - return impl_; -} - -std::shared_ptr VulkanTensor::impl() const { - return impl_; -} - -std::vector VulkanTensor::sizes() const { - return impl()->sizes(); -} - -void VulkanTensor::sync_image_to_buffer() const { - return impl()->sync_image_to_buffer(); -} - -std::vector VulkanTensor::strides() const { - return impl()->strides(); -} - -int64_t VulkanTensor::dim() const { - return impl()->dim(); -} - -int64_t VulkanTensor::numel() const { - return impl()->numel(); -} - -bool VulkanTensor::has_storage() const { - return impl()->has_buffer(); -} - -void VulkanTensor::allocate_storage() { - impl()->allocate_storage(); -} - -void VulkanTensor::set_data_from_host(const float* const inputData) { - 
impl()->set_data_from_host(inputData); -} - -void VulkanTensor::copy_data_to_host(float* const outputData) const { - impl()->copy_data_to_host(outputData); -} - -bool VulkanTensor::has_buffer() const { - return impl()->has_buffer(); -} - -VBuffer* VulkanTensor::buffer() { - return impl()->buffer(); -} - -const VBuffer* VulkanTensor::buffer() const { - return impl()->buffer(); -} - -bool VulkanTensor::can_be_image() const { - return impl()->can_be_image(); -} - -bool VulkanTensor::has_image() const { - return impl()->has_image(); -} - -VImage* VulkanTensor::image(const c10::optional imageSizes) { - return impl()->image(imageSizes); -} - -const VImage* VulkanTensor::image(const c10::optional imageSizes) const { - return impl()->image(imageSizes); -} - -VulkanTensor::VulkanTensor(std::vector sizes) - : impl_(std::make_shared(std::move(sizes))) {} - -std::ostream& operator<<(std::ostream& s, const ImageSize& imageSize) { - s << "ImageSize{" << imageSize[0] << ", " << imageSize[1] << ", " - << imageSize[2] << "}"; - return s; -} -std::ostream& operator<<(std::ostream& s, const ImageSizes& imageSizes) { - s << "ImageSizes{imageSize:" << imageSizes.imageSize - << ", dataSize:" << imageSizes.dataSize << "}"; - return s; -} - -std::ostream& operator<<(std::ostream& s, const WorkGroupSize& workGroupSize) { - s << "WorkGroupSize{" << workGroupSize.x << " " << workGroupSize.y << " " - << workGroupSize.z << "}"; - return s; -} - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/Vulkan.h b/aten/src/ATen/native/vulkan/Vulkan.h deleted file mode 100644 index c2b1775e8f0a..000000000000 --- a/aten/src/ATen/native/vulkan/Vulkan.h +++ /dev/null @@ -1,532 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -#ifdef USE_VULKAN_WRAPPER -#include -#else -#include -#endif - -#ifdef USE_VULKAN_SHADERC_RUNTIME -#include -#define GLSL_SPV(name) #name, name##_glsl -#else -#include -#define GLSL_SPV(name) #name, name##_spv, name##_spv_len -#endif -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -#ifdef DEBUG -static constexpr bool kEnableValidationLayers = true; -#else -static constexpr bool kEnableValidationLayers = false; -#endif - -bool is_available(); - -class VContext; -const VContext& context(); - -// VulkanTensor is a handle that holds shared pointer to VulkanTensor:Impl, -// that owns Tensor representation on GPU. -// VulkanTensor is copyable and moveable (copying and moving pointer to Impl). -// -// VulkanTensor::Impl is moveable only, owns Vulkan device memory for Tensor -// data. Tensor can be represented in several formats. -// -// 0. VBuffer - (wrapper on vulkan VkBuffer), supports all tensor dimensions, -// data is in Contiguous format (NCHW), in plan to preserve at::Tensor memory -// format (3d or 4d tensors can be in NHWC ChannelsLast format). It is located -// in host visible memory that can be memory mapped to CPU memory. -// -// 1. VImage(TexC4) - (wrapper on vulkan VkImage), optional representation of -// tensors with dimension <= 4 as VkImage, used in shaders as texture or storage -// image. It is 3-dimensional image (x, y, z) with 4 component * 16 bit for each -// triple (x, y, z). 
-// For NCHW, NHWC: -// -// For dim==4: image.x - W sizes[3]; image.y - H sizes[2]; image.z - (N -// sizes[0] * C sizes[1]) / 4; -// -// For dim==3: image.x - W sizes[2]; image.y - H sizes[1]; image.z - (C -// sizes[0]) / 4 -// -// For dim==2: image.x - W sizes[1]; image.y - H sizes[0]; image.z : 1 -// -// For dim==1: image.x - W sizes[0]; image.y : 1; image.z : 1 -// -// -// 2. VImage (other format) - Currently not added, but for some operations -// another texture packing format can be beneficial for performance. -// -// Contract about synchronization between representations: -// 1.VImage(TexC4) representation is allocated lazily with calling image(), -// fails for dimensions > 4. -// -// Tensor data can be in 0.VBuffer and/or 1.VImage(TexC4), -// If Tensor can be represented as image - VulkanTensor::Impl::can_be_image() -// returns true. Image representation created lazily by call -// VulkanTensor::Impl::image(), if it is called on Tensor with !can_be_image() - -// it fails. -// -// If image allocated - image data has priority. -// VulkanTensor::copy_data_to_host checks if image allocated - -// copy_image_to_buffer first. -class VBuffer; -class VImage; - -using ImageSize = std::array; -struct ImageSizes { - ImageSize imageSize; - ImageSize dataSize; -}; - -class VulkanTensor final { - class Impl; - - public: - VulkanTensor() = default; - explicit VulkanTensor(std::vector sizes); - ~VulkanTensor() = default; - - VulkanTensor(VulkanTensor&&) = default; - VulkanTensor& operator=(VulkanTensor&&) = default; - - VulkanTensor(const VulkanTensor&) = default; - VulkanTensor& operator=(const VulkanTensor&) = default; - - bool defined() const { - return static_cast(impl_); - } - - std::vector sizes() const; - std::vector strides() const; - int64_t dim() const; - int64_t numel() const; - - bool has_storage() const; - void allocate_storage(); - void set_data_from_host(const float* inputData); - void copy_data_to_host(float* outputData) const; - - bool has_buffer() const; - VBuffer* buffer(); - const VBuffer* buffer() const; - - bool can_be_image() const; - bool has_image() const; - - void sync_image_to_buffer() const; - - // if imageSizes argument is not specified: - // Allocates VImage of sizes{W,H,NC4} and fills it from tensor VBuffer if it - // exists, see comment for VulkanTensor. - // - // if imageSizes argument is specified: - // Only allocates VImage of specified sizes, that will be returned on - // subsequent image() calls. Can be used when user wants to store tensor image - // not in default{W, H, NC4} format (For performance or other reasons). 
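  // [Editorial sketch; not part of the original file] Minimal illustration of
  // the default {W, H, NC4} mapping described above for a 4-d NCHW tensor,
  // assuming UP_DIV from VulkanCommon.h is in scope. The helper name is
  // invented for this example; the original computes the same values in
  // Impl::imageSizes_W_H_NC4() (the safe_downcast range checks are omitted here).
  // For sizes {N=2, C=3, H=8, W=8}: imageSize = {8, 8, UP_DIV(6, 4)} = {8, 8, 2},
  // dataSize = {8, 8, 6}.
  static ImageSizes example_image_sizes_nchw(const std::vector<int64_t>& sizes) {
    const auto w = static_cast<int32_t>(sizes[3]);
    const auto h = static_cast<int32_t>(sizes[2]);
    const auto d = static_cast<int32_t>(sizes[0] * sizes[1]);
    return {{w, h, UP_DIV(d, 4)}, {w, h, d}};
  }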
- VImage* image(c10::optional imageSizes = c10::nullopt); - const VImage* image( - c10::optional imageSizes = c10::nullopt) const; - - private: - std::shared_ptr impl(); - std::shared_ptr impl() const; - std::shared_ptr impl_; -}; - -class ComputeUnitFactory; -class VContext final { - public: - explicit VContext(bool enableValidationLayers); - ~VContext(); - VContext(const VContext&) = delete; - VContext& operator=(const VContext&) = delete; - VContext(VContext&&) = default; - VContext& operator=(VContext&&) = default; - - inline VkDevice device() const { - return device_; - } - inline VkPhysicalDevice physicalDevice() const { - return physicalDevice_; - } - inline VkPhysicalDeviceLimits limits() const { - return physicalDeviceLimits_; - } - inline VkCommandPool commandPool() const { - return commandPool_; - } - inline VkQueue queue() const { - return queue_; - } - ComputeUnitFactory& computeUnitFactory() const { - return *(computeUnitFactory_.get()); - } - - private: - void createInstance(); - void findPhysicalDevice(); - void createDevice(); - uint32_t getComputeQueueFamilyIndex(); - - VkInstance instance_; - VkDebugReportCallbackEXT debugReportCallback_; - VkDevice device_; - VkPhysicalDevice physicalDevice_; - VkPhysicalDeviceLimits physicalDeviceLimits_; - std::vector enabledValidationLayers_; - VkQueue queue_; - uint32_t queueFamilyIndex_; - bool enableValidationLayers_; - VkCommandPool commandPool_; - std::unique_ptr computeUnitFactory_; -}; - -class VBuffer final { - public: - class MapMemory final { - public: - MapMemory( - const VkDevice device, - const VkDeviceMemory deviceMemory, - const VkDeviceSize offset, - const VkDeviceSize size) - : device_(device), - deviceMemory_(deviceMemory), - offset_(offset), - size_(size) { - vkMapMemory(device_, deviceMemory_, 0, size, 0, &mappedMemory_); - } - ~MapMemory() { - vkUnmapMemory(device_, deviceMemory_); - } - MapMemory(const MapMemory&) = delete; - MapMemory& operator=(const MapMemory&) = delete; - MapMemory(MapMemory&&) = default; - MapMemory& operator=(MapMemory&&) = default; - inline const void* ptr() const { - return mappedMemory_; - } - inline void* ptr() { - return mappedMemory_; - } - void flushWriteToHost(); - void flushWriteToDevice(); - - private: - VkDevice device_; - VkDeviceMemory deviceMemory_; - VkDeviceSize offset_; - VkDeviceSize size_; - void* mappedMemory_; - }; - - explicit VBuffer( - VkDeviceSize bufferSizeBytes, - VkBufferUsageFlags bufferUsageFlags = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - - ~VBuffer(); - - VBuffer(const VBuffer&) = delete; - VBuffer& operator=(const VBuffer&) = delete; - VBuffer(VBuffer&&) = default; - VBuffer& operator=(VBuffer&&) = default; - - static inline VBuffer makeUniformBuffer(const VkDeviceSize bufferSize) { - return VBuffer{bufferSize, - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - } - - MapMemory map() const { - return MapMemory{context().device(), bufferMemory_, 0, bufferSizeBytes_}; - } - - void copy_from_device_to_host(void* outputData, int64_t size) const; - void copy_from_host_to_device(const void* data, int64_t size); - void set_zeros(); - - VkDescriptorBufferInfo makeDescriptorBufferInfo() const; - VkWriteDescriptorSet makeWriteDescriptorSet( - VkDescriptorSet descriptorSet, - uint32_t binding, - const VkDescriptorBufferInfo* bufferInfo) const; - - void bind(VkDescriptorSet descriptorSet, uint32_t binding) 
const; - - inline VkDeviceSize sizeBytes() const { - return bufferSizeBytes_; - } - - void addBufferMemoryBarrier( - VkCommandBuffer commandBuffer, - VkDeviceSize offset, - VkDeviceSize size) const; - - inline VkBuffer vkbuffer() const { - return buffer_; - } - - private: - VkDeviceSize bufferSizeBytes_; - VkDescriptorType descriptorType_; - VkBuffer buffer_; - VkDeviceMemory bufferMemory_; -}; - -VBuffer makeUniformConstBuffer(const void* ptr, VkDeviceSize size); - -class VImage final { - public: - static constexpr VkImageType kImageType = VK_IMAGE_TYPE_3D; - static constexpr VkFilter kFilter = VK_FILTER_NEAREST; - static constexpr VkFormat kFormat = VK_FORMAT_R16G16B16A16_SFLOAT; - static constexpr VkSamplerAddressMode kSamplerAddressMode = - VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - static constexpr VkImageViewType kImageViewType = VK_IMAGE_VIEW_TYPE_3D; - - explicit VImage(ImageSize imageSize, ImageSize dataSize); - explicit VImage(ImageSizes imageSizes) - : VImage(imageSizes.imageSize, imageSizes.dataSize) {} - ~VImage(); - VImage(const VImage&) = delete; - VImage& operator=(const VImage&) = delete; - VImage(VImage&&) = default; - VImage& operator=(VImage&&) = default; - - inline auto w() const { - return imageSize_[0]; - } - inline auto h() const { - return imageSize_[1]; - } - inline auto d() const { - return imageSize_[2]; - } - - VkImageViewCreateInfo makeImageViewCreateInfo() const; - VkSamplerCreateInfo makeSamplerCreateInfo() const; - VkDescriptorImageInfo makeDescriptorImageInfo( - VkImageLayout imageLayout) const; - VkWriteDescriptorSet makeWriteDescriptorSet( - VkDescriptorSet descriptorSet, - uint32_t binding, - VkDescriptorType descriptorType, - const VkDescriptorImageInfo* imageInfo) const; - void bind( - VkDescriptorSet descriptorSet, - uint32_t binding, - VkDescriptorType descriptorType, - VkImageLayout imageLayout) const; - void bindShaderRead(VkDescriptorSet descriptorSet, uint32_t binding) const; - void bindStorageImage(VkDescriptorSet descriptorSet, uint32_t binding) const; - inline VkDeviceSize sizeBytes() const { - return sizeof(float) * dataSize_[0] * dataSize_[1] * dataSize_[2]; - } - - inline VkDeviceSize capacityBytes() const { - // Every VImage pixel(texel) contains 4 float elements - return sizeof(float) * 4 * imageSize_[0] * imageSize_[1] * imageSize_[2]; - } - - ImageSize sizes() const { - return imageSize_; - } - - void addImageMemoryBarrier( - VkCommandBuffer commandBuffer, - VkImageLayout newLayout) const; - void addImageMemoryBarrierToGeneral(VkCommandBuffer commandBuffer) const; - void addImageMemoryBarrierToShaderRead(VkCommandBuffer commandBuffer) const; - - private: - ImageSize imageSize_; - ImageSize dataSize_; - VkImage image_; - VkDeviceMemory imageMemory_; - VkImageView imageView_; - VkSampler sampler_; - // Holds current image layout that will be used in - // addImageMemoryBarrier as the previous layout. Need to be mutable to - // use addImageMemoryBarrier() for const VImage. 
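  // [Editorial note; not part of the original file] Worked example of the two
  // size queries above, assuming imageSize_ = {8, 8, 2} and dataSize_ = {8, 8, 6}
  // (the mapping of a {2, 3, 8, 8} NCHW tensor):
  //   sizeBytes()     = sizeof(float) * 8 * 8 * 6     = 1536 bytes (logical data)
  //   capacityBytes() = sizeof(float) * 4 * 8 * 8 * 2 = 2048 bytes (4 floats/texel)
  // capacityBytes() is never smaller than sizeBytes(), which is why
  // copy_image_to_buffer() asserts that the destination VBuffer holds at least
  // capacityBytes() bytes.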
- mutable VkImageLayout imageLayout_; -}; - -void copy_buffer_to_image(const VBuffer& buffer, VImage& image); - -void copy_image_to_buffer( - const VImage& image, - VBuffer& buffer, - bool addBufferMemoryBarrierForHost = false); - -void copy_buffer_to_buffer( - const VBuffer& srcBuffer, - VBuffer& dstBuffer, - VkDeviceSize size, - VkDeviceSize srcOffset = 0, - VkDeviceSize dstOffset = 0); - -VkDescriptorSetLayoutBinding descriptorSetLayoutBinding( - uint32_t binding, - VkDescriptorType descriptorType); - -void createDescriptorSetLayout( - VkDevice device, - const VkDescriptorSetLayoutBinding* bindings, - uint32_t bindingCount, - VkDescriptorSetLayout* setLayout); - -void allocateDescriptorSet( - VkDevice device, - VkDescriptorPool descriptorPool, - const VkDescriptorSetLayout* descriptorSetLayout, - VkDescriptorSet* descriptorSet); - -void createDescriptorSetLayoutSinglePool( - VkDevice device, - const std::vector& descrTypes, - VkDescriptorSetLayout* descrSetLayout, - VkDescriptorPool* descrPool, - VkDescriptorSet* descrSet); - -void allocateCommandBuffer(VkDevice device, VkCommandBuffer* commandBuffer); -void beginCommandBuffer(VkCommandBuffer commandBuffer); -void endCommandBuffer(VkCommandBuffer commandBuffer); -void submitAndWaitCommandBuffer(VkDevice device, VkCommandBuffer commandBuffer); - -struct WorkGroupSize { - uint32_t x; - uint32_t y; - uint32_t z; -}; - -class ComputeUnit final { - public: - static constexpr uint64_t kFenceTimeoutNanos = 100000000000; -#ifdef USE_VULKAN_SHADERC_RUNTIME - ComputeUnit( - const char* const glslSrc, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize) { - createComputePipelineCompile( - glslSrc, pipelineCache, descrSetLayout, workGroupSize); - } -#else - ComputeUnit( - const uint32_t* const spvCode, - const unsigned int spvCodeSize, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout& descrSetLayout, - const WorkGroupSize workGroupSize) { - const auto codeSize = spvCodeSize; - createComputePipeline( - spvCode, codeSize, pipelineCache, descrSetLayout, workGroupSize); - } -#endif - - ~ComputeUnit(); - ComputeUnit(const ComputeUnit&) = delete; - ComputeUnit& operator=(const ComputeUnit&) = delete; - ComputeUnit(ComputeUnit&&) = default; - ComputeUnit& operator=(ComputeUnit&&) = default; - - void createComputePipeline( - const uint32_t* code, - const uint32_t codeSize, - VkPipelineCache pipelineCache, - VkDescriptorSetLayout descrSetLayout, - WorkGroupSize workGroupSize); - -#ifdef USE_VULKAN_SHADERC_RUNTIME - void createComputePipelineCompile( - const std::string& glslSrc, - const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout descrSetLayout, - const WorkGroupSize workGroupSize); -#endif - - void createCommandBuffer(VkDescriptorSet& descriptorSet); - void addMemoryBarrier( - VkPipelineStageFlags srcStageMask, - VkAccessFlags srcAccessMask, - VkPipelineStageFlags dstStageMask, - VkAccessFlags dstAccessMask); - void dispatchCommandBuffer( - uint32_t groupCountX, - uint32_t groupCountY, - uint32_t groupCountZ); - void dispatchCommandBuffer( - uint32_t gridX, - uint32_t gridY, - uint32_t gridZ, - WorkGroupSize workGroupSize); - void submitAndWaitCommandBuffer(); - void endCommandBuffer(); - inline VkCommandBuffer commandBuffer() { - return commandBuffer_; - } - - private: - VkCommandBuffer commandBuffer_; - VkPipeline pipeline_; - VkPipelineLayout pipelineLayout_; - VkShaderModule computeShaderModule_; -}; - -class ComputeUnitFactory { - public: - 
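  // [Editorial note; not part of the original file] The factory caches one
  // ComputeUnit per (shader name, work-group size) pair; getCacheKey() below
  // joins them with ':'. For example, a request for GLSL_SPV(nchw_to_image)
  // with a WorkGroupSize of {8, 8, 1} is stored under the key
  // "nchw_to_image:8:8:1", so subsequent ops reuse the already-created
  // pipeline instead of building it again.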
explicit ComputeUnitFactory(const VkDevice device); - ~ComputeUnitFactory(); - ComputeUnitFactory(const ComputeUnitFactory&) = default; - ComputeUnitFactory& operator=(const ComputeUnitFactory&) = default; - ComputeUnitFactory(ComputeUnitFactory&&) = default; - ComputeUnitFactory& operator=(ComputeUnitFactory&&) = default; - -#ifdef USE_VULKAN_SHADERC_RUNTIME - ComputeUnit& get( - const char* key, - const char* glslSrc, - VkDescriptorSetLayout descrSetLayout, - WorkGroupSize workGroupSize); -#else - ComputeUnit& get( - const char* key, - const uint32_t* code, - const uint32_t codeSize, - VkDescriptorSetLayout descrSetLayout, - WorkGroupSize workGroupSize); -#endif - private: - std::string getCacheKey(const char* key, WorkGroupSize workGroupSize); - ComputeUnit& get( - const std::string& cacheKey, - std::function()> factoryFn); - - VkDevice device_; - VkPipelineCache pipelineCache_; - std::unordered_map> computeUnits_; -}; - -std::ostream& operator<<(std::ostream& s, const WorkGroupSize& workGroupSize); -std::ostream& operator<<(std::ostream& s, const ImageSize& imageSize); -std::ostream& operator<<(std::ostream& s, const ImageSizes& imageSizes); - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp deleted file mode 100644 index 768ce081b353..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ /dev/null @@ -1,711 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace aten { -using at::native::vulkan::detail::VulkanTensor; -using VulkanTensorImpl = VulkanOpaqueTensorImpl; - -namespace { -int64_t normalize_dim(int64_t d, int64_t n) { - return (d % n + n) % n; -} -} // namespace - -Tensor new_with_vtensor_vulkan( - VulkanTensor&& vt, - const TensorOptions& options) { - auto sizes = vt.sizes(); - auto strides = vt.strides(); - return at::detail::make_tensor( - DispatchKeySet(DispatchKey::Vulkan), - options.dtype(), - at::Device(at::kVulkan), - std::move(vt), - std::vector(sizes.begin(), sizes.end()), - std::vector(strides.begin(), strides.end())); -} - -VulkanTensor& vtensor_from_vulkan(const Tensor& tensor) { - TORCH_INTERNAL_ASSERT( - tensor.is_vulkan(), "vtensor_from_vulkan expects Vulkan tensor input"); - VulkanTensorImpl* const impl = - static_cast(tensor.unsafeGetTensorImpl()); - return impl->unsafe_opaque_handle(); -} - -Tensor empty( - IntArrayRef size, - optional dtype, - optional layout, - optional device, - optional pin_memory, - const optional memory_format) { - TORCH_CHECK( - !pin_memory.has_value(), - "'pin_memory' argument is incompatible with Vulkan tensor"); - TORCH_CHECK( - !memory_format.has_value(), - "'memory_format' argument is incompatible with Vulkan tensor"); - VulkanTensor vt{size.vec()}; - return new_with_vtensor_vulkan( - std::move(vt), at::device(at::kVulkan).dtype(dtype)); -} - -Tensor empty_strided( - IntArrayRef size, - IntArrayRef stride, - optional dtype, - optional layout, - optional device, - optional pin_memory) { - return vulkan::aten::empty( - size, dtype, layout, device, pin_memory, c10::nullopt); -} - -Tensor upsample_nearest2d( - const Tensor& input, - const IntArrayRef outputSizes, - const c10::optional scales_h, - const c10::optional scales_w) { - const auto& x = vtensor_from_vulkan(input); - const auto inputSizes = input.sizes(); - const auto in = inputSizes[0]; 
- const auto ic = inputSizes[1]; - const auto ih = inputSizes[2]; - const auto iw = inputSizes[3]; - - const auto oh = outputSizes[0]; - const auto ow = outputSizes[1]; - const float height_scale = compute_scales_value(scales_h, ih, oh); - const float width_scale = compute_scales_value(scales_w, iw, ow); - VulkanTensor output{{in, ic, oh, ow}}; - vulkan::detail::upsample_nearest2d( - output, x, ih, iw, oh, ow, in, ic, height_scale, width_scale); - return new_with_vtensor_vulkan(std::move(output), input.options()); -} - -Tensor adaptive_avg_pool2d(const at::Tensor& input, IntArrayRef outputSize) { - TORCH_INTERNAL_ASSERT( - input.dim() == 4, - "vulkan_adaptive_avg_pool2d expects 4-dimensional input"); - const auto& x = vtensor_from_vulkan(input); - const auto inputSize = input.sizes(); - const auto in = inputSize[0]; - const auto ic = inputSize[1]; - const auto ih = inputSize[2]; - const auto iw = inputSize[3]; - - const auto oh = outputSize[0]; - const auto ow = outputSize[1]; - VulkanTensor output{{in, ic, oh, ow}}; - vulkan::detail::adaptive_avg_pool2d(output, x, ih, iw, oh, ow, in, ic); - return new_with_vtensor_vulkan(std::move(output), input.options()); -} - -Tensor avg_pool2d( - const Tensor& self, - IntArrayRef kernel_size, - IntArrayRef stride, - IntArrayRef padding, - bool ceil_mode, - bool count_include_pad, - c10::optional divisor_override) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints"); - const int kH = safe_downcast(kernel_size[0]); - const int kW = - kernel_size.size() == 1 ? kH : safe_downcast(kernel_size[1]); - - TORCH_CHECK( - stride.empty() || stride.size() == 1 || stride.size() == 2, - "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints"); - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() - ? kW - : stride.size() == 1 ? dH : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "avg_pool2d: padding must either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); - - const auto& x = vtensor_from_vulkan(self); - auto inputSize = self.sizes(); - const int64_t iN = inputSize[0]; - const int64_t iC = inputSize[1]; - const int64_t iH = inputSize[2]; - const int64_t iW = inputSize[3]; - - const int64_t oH = - pooling_output_shape(iH, kH, padH, dH, 1, ceil_mode); - const int64_t oW = - pooling_output_shape(iW, kW, padW, dW, 1, ceil_mode); - - pool2d_shape_check( - self, kH, kW, dH, dW, padH, padW, 1, 1, iC, iH, iW, oH, oW, self.suggest_memory_format()); - - VulkanTensor y{{iN, iC, oH, oW}}; - vulkan::detail::avg_pool2d( - y, x, iH, iW, oH, oW, iN, iC, kH, kW, dH, dW, padH, padW); - return new_with_vtensor_vulkan(std::move(y), self.options()); -} - -Tensor max_pool2d( - const at::Tensor& self, - const IntArrayRef kernel_size, - const IntArrayRef stride, - const IntArrayRef padding, - const IntArrayRef dilation, - bool ceil_mode) { - TORCH_CHECK( - kernel_size.size() == 1 || kernel_size.size() == 2, - "Vulkan max_pool2d: kernel_size must either be a single int, or a tuple of two ints") - const int kH = safe_downcast(kernel_size[0]); - const int kW = - kernel_size.size() == 1 ? 
kH : safe_downcast(kernel_size[1]); - TORCH_CHECK( - stride.size() == 0 || stride.size() == 1 || stride.size() == 2, - "Vulkan max_pool2d: stride must either be omitted, a single int, or a tuple of two ints") - const int dH = stride.empty() ? kH : safe_downcast(stride[0]); - const int dW = stride.empty() - ? kW - : stride.size() == 1 ? dH : safe_downcast(stride[1]); - - TORCH_CHECK( - padding.size() == 1 || padding.size() == 2, - "Vulkan max_pool2d: padding must be either be a single int, or a tuple of two ints"); - const int padH = safe_downcast(padding[0]); - const int padW = padding.size() == 1 ? padH : safe_downcast(padding[1]); - - TORCH_CHECK( - dilation.size() == 1 || dilation.size() == 2, - "Vulkan max_pool2d: dilation must be either a single int, or a tuple of two ints"); - const int dilationH = safe_downcast(dilation[0]); - const int dilationW = - dilation.size() == 1 ? dilationH : safe_downcast(dilation[1]); - TORCH_CHECK( - self.dim() == 4, "Vulkan max_pool2d is implemented for 4-dim input"); - - const auto& x = vtensor_from_vulkan(self); - const auto inputSize = self.sizes(); - const int64_t iN = inputSize[0]; - const int64_t iC = inputSize[1]; - const int64_t iH = inputSize[2]; - const int64_t iW = inputSize[3]; - - const int64_t oH = - pooling_output_shape(iH, kH, padH, dH, dilationH, ceil_mode); - const int64_t oW = - pooling_output_shape(iW, kW, padW, dW, dilationW, ceil_mode); - - pool2d_shape_check( - self, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW, - iC, - iH, - iW, - oH, - oW, - self.suggest_memory_format()); - - VulkanTensor y{{iN, iC, oH, oW}}; - vulkan::detail::max_pool2d( - y, - x, - iH, - iW, - oH, - oW, - iN, - iC, - kH, - kW, - dH, - dW, - padH, - padW, - dilationH, - dilationW); - return new_with_vtensor_vulkan(std::move(y), self.options()); -} - -Tensor reshape(at::Tensor const& input, IntArrayRef shape) { - return new_with_vtensor_vulkan( - vulkan::detail::reshape_copy(vtensor_from_vulkan(input), shape.vec()), - input.options()); -} - -Tensor cat(const TensorList tensors, int64_t dim) { - const auto norm_dim = normalize_dim(dim, 4); - TORCH_INTERNAL_ASSERT( - norm_dim == 0 || norm_dim == 1, - "Vulkan cat is implemented only for batch and channels dimensions"); - at::Tensor tensor = tensors[0]; - int64_t cat_dim_size = 0; - - std::vector vTensors{}; - for (const auto i : c10::irange(tensors.size())) { - const auto& t = tensors[i]; - TORCH_INTERNAL_ASSERT( - t.dim() == 4, "Vulkan cat expects 4 dimensional inputs"); - TORCH_INTERNAL_ASSERT(t.is_vulkan(), "Vulkan cat expects Vulkan inputs"); - - for (const auto d : c10::irange(4)) { - if (d == dim) { - continue; - } - TORCH_INTERNAL_ASSERT( - t.size(d) == tensor.size(d), - "Vulkan cat inputs must have matching sizes except concatenated dimension"); - } - vTensors.push_back(vtensor_from_vulkan(t)); - cat_dim_size += t.size(dim); - } - - auto result_size = tensor.sizes().vec(); - result_size[dim] = cat_dim_size; - - VulkanTensor output{result_size}; - - vulkan::detail::cat(output, vTensors, dim); - return new_with_vtensor_vulkan(std::move(output), tensor.options()); -} - -Tensor transpose(const Tensor& self, int64_t dim0, int64_t dim1) { - return new_with_vtensor_vulkan( - vulkan::detail::transpose(vtensor_from_vulkan(self), dim0, dim1), - self.options()); -} - -Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) { - auto& x = vtensor_from_vulkan(self); - x = vulkan::detail::transpose(x, dim0, dim1); - return self; -} - -Tensor view(const Tensor& self, IntArrayRef size) { - return 
new_with_vtensor_vulkan( - vulkan::detail::reshape_copy( - vtensor_from_vulkan(self), at::infer_size(size, self.numel())), - self.options()); -} - -Tensor contiguous(const Tensor& self, MemoryFormat memory_format) { - return self; -} - -Tensor slice( - const Tensor& self, - int64_t dim, - int64_t start, - int64_t end, - int64_t step) { - return new_with_vtensor_vulkan( - vulkan::detail::slice(vtensor_from_vulkan(self), dim, start, end, step), - self.options()); -} - -Tensor add(const Tensor& self, const Tensor& other, const Scalar& alpha) { - auto xt = self.is_vulkan() ? self : self.vulkan(); - const auto& x = vtensor_from_vulkan(xt); - auto yt = other.is_vulkan() ? other : other.vulkan(); - const auto& y = vtensor_from_vulkan(yt); - const float a = alpha.to(); - - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::add(output, x, y, a); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -VulkanTensor& vtensor(Tensor& t) { - if (t.is_vulkan()) { - return vtensor_from_vulkan(t); - } - auto tv = t.vulkan(); - return vtensor_from_vulkan(tv); -} - -const VulkanTensor& vtensor(const Tensor& t) { - if (t.is_vulkan()) { - return vtensor_from_vulkan(t); - } - const auto tv = t.vulkan(); - return vtensor_from_vulkan(tv); -} - -Tensor& add_(Tensor& self, const Tensor& other, const Scalar& alpha) { - auto& x = vtensor(self); - const auto& y = vtensor(other); - float a = alpha.to(); - - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::add(output, x, y, a); - x = std::move(output); - return self; -} - -Tensor add_scalar(const Tensor& self, const Scalar& other, const Scalar& alpha) { - const auto& x = vtensor_from_vulkan(self); - const float s = other.to(); - const float a = alpha.to(); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::add(output, x, s * a); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor mul_scalar(const Tensor& self, const Scalar& other) { - const auto& x = vtensor_from_vulkan(self); - const float s = other.to(); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::mul(output, x, s); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor select(const Tensor& self, int64_t dim, int64_t index) { - auto sliced = vulkan::aten::slice(self, dim, index, index + 1, 1); - auto sizes = self.sizes().vec(); - sizes.erase(sizes.begin() + dim); - return vulkan::aten::reshape(sliced, sizes); -} - -Tensor unsqueeze(const Tensor& self, int64_t dim) { - auto sizes = self.sizes().vec(); - sizes.insert(sizes.begin() + dim, 1); - return vulkan::aten::reshape(self, sizes); -} - -Tensor convolution( - const Tensor& input, // Vulkan - const Tensor& weight, // CPU - const c10::optional& bias, // CPU - const IntArrayRef stride, - const IntArrayRef padding, - const IntArrayRef dilation, - const bool transposed, - const IntArrayRef output_padding, - const int64_t groups) { - const vulkan::Conv2DParams params{ - input.sizes(), weight.sizes(), padding, stride, dilation, groups}; - TORCH_INTERNAL_ASSERT( - input.dim() == 4, "convolution: Expected 4-dimensional input"); - TORCH_INTERNAL_ASSERT( - weight.dim() == 4, "convolution: Expected 4-dimensional weight"); - TORCH_INTERNAL_ASSERT( - groups == 1 || groups == params.C, - "convolution: only nogroup or depthwise convolutions supported"); - TORCH_INTERNAL_ASSERT(!transposed, "convolution: transposed not supported"); - - const VulkanTensor& vinput = vtensor_from_vulkan(input); - VulkanTensor voutput = VulkanTensor{params.output_sizes()}; - - 
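  // [Editorial note; not part of the original file] Worked example of
  // params.output_sizes() using the Conv2DParams formulas from
  // VulkanConvolution.h: for input {1, 3, 224, 224}, weight {16, 3, 3, 3},
  // stride {2, 2}, padding {1, 1}, dilation {1, 1}, groups 1:
  //   KHE = (3 - 1) * 1 + 1 = 3
  //   OH  = ((224 - 3 + 2 * 1) / 2) + 1 = 112   (and OW likewise)
  // so output_sizes() returns {1, 16, 112, 112} and voutput is allocated with
  // that shape before conv2d() is dispatched.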
vulkan::detail::conv2d( - voutput, - vinput, - weight.data_ptr(), - (bias.has_value() && bias->defined()) - ? c10::make_optional(bias->data_ptr()) - : c10::nullopt, - params); - return new_with_vtensor_vulkan(std::move(voutput), input.options()); -} - -Tensor addmm( - const Tensor& self, - const Tensor& mat1, - const Tensor& mat2, - const Scalar& beta, - const Scalar& alpha) { - const VulkanTensor t = - vtensor_from_vulkan(self.is_vulkan() ? self : self.vulkan()); - const VulkanTensor m1 = - vtensor_from_vulkan(mat1.is_vulkan() ? mat1 : mat1.vulkan()); - const VulkanTensor m2 = - vtensor_from_vulkan(mat2.is_vulkan() ? mat2 : mat2.vulkan()); - const float b = beta.to(); - const float a = alpha.to(); - - VulkanTensor output = VulkanTensor{self.sizes().vec()}; - vulkan::detail::addmm(output, t, m1, m2, b, a); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor mm(const Tensor& self, const Tensor& mat2) { - TORCH_INTERNAL_ASSERT( - self.dim() == 2 && mat2.dim() == 2, - "vulkan_mm expects 2-dimensional tensors"); - const auto m1Sizes = self.sizes(); - const auto m2Sizes = mat2.sizes(); - TORCH_INTERNAL_ASSERT( - m1Sizes[1] == m2Sizes[0], - "vulkan_mm expects self.sizes[1] equal mat2.sizes[0]"); - - const auto& m1 = vtensor_from_vulkan(self.is_vulkan() ? self : self.vulkan()); - const auto& m2 = vtensor_from_vulkan(mat2.is_vulkan() ? mat2 : mat2.vulkan()); - - VulkanTensor output{{m1Sizes[0], m2Sizes[1]}}; - vulkan::detail::addmm(output, c10::nullopt, m1, m2, 0.f, 1.f); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -Tensor clamp( - const Tensor& self, - const c10::optional& min, - const c10::optional& max) { - const auto& x = vtensor_from_vulkan(self); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::clamp( - output, - x, - min ? min.value().to() : -std::numeric_limits::infinity(), - max ? max.value().to() : std::numeric_limits::infinity()); - return vulkan::aten::new_with_vtensor_vulkan( - std::move(output), self.options()); -} - -Tensor& clamp_( - Tensor& self, - const c10::optional& min, - const c10::optional& max) { - auto& x = vtensor_from_vulkan(self); - VulkanTensor output{self.sizes().vec()}; - vulkan::detail::clamp( - output, - x, - min ? min.value().to() : -std::numeric_limits::infinity(), - max ? 
max.value().to() : std::numeric_limits::infinity()); - x = std::move(output); - return self; -} - -Tensor hardtanh(const Tensor& self, const Scalar& min, const Scalar& max) { - return vulkan::aten::clamp(self, min, max); -} - -Tensor& hardtanh_(Tensor& self, const Scalar& min, const Scalar& max) { - return vulkan::aten::clamp_(self, min, max); -} - -Tensor& relu_(Tensor& self) { - return vulkan::aten::clamp_(self, 0, nullopt); -} - -Tensor mean( - const Tensor& self, - const IntArrayRef dim, - const bool keepdim, - const optional dtype) { - TORCH_INTERNAL_ASSERT(!keepdim, "keepdim not implemented for Vulkan mean"); - TORCH_INTERNAL_ASSERT(self.is_vulkan(), "mean expects Vulkan tensor input"); - - // Mean is implemented only for HW dimensions of 4-d tensor - TORCH_INTERNAL_ASSERT(self.dim() == 4); - static const std::unordered_set expected_dims_set({2, 3}); - std::unordered_set dims_set; - for (const auto& d : dim) { - dims_set.insert(normalize_dim(d, 4)); - } - TORCH_INTERNAL_ASSERT(expected_dims_set == dims_set); - - const auto& x = vtensor_from_vulkan(self); - const auto sizes = self.sizes(); - VulkanTensor output{std::vector{sizes[0], sizes[1]}}; - vulkan::detail::mean(output, x); - return new_with_vtensor_vulkan(std::move(output), self.options()); -} - -#ifndef USE_VULKAN_API - -TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl("slice.Tensor", TORCH_FN(at::native::vulkan::aten::slice)); - m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape)); - m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select)); - m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose)); - m.impl("transpose_", at::native::vulkan::aten::transpose_); - m.impl("view", TORCH_FN(at::native::vulkan::aten::view)); - m.impl("unsqueeze", TORCH_FN(at::native::vulkan::aten::unsqueeze)); - m.impl("empty.memory_format", at::native::vulkan::aten::empty); - m.impl("empty_strided", TORCH_FN(at::native::vulkan::aten::empty_strided)); - m.impl("add.Tensor", TORCH_FN(at::native::vulkan::aten::add)); - m.impl("clamp", TORCH_FN(at::native::vulkan::aten::clamp)); - m.impl("mean.dim", TORCH_FN(at::native::vulkan::aten::mean)); - m.impl("mm", TORCH_FN(at::native::vulkan::aten::mm)); - m.impl("addmm", TORCH_FN(at::native::vulkan::aten::addmm)); - m.impl( - "upsample_nearest2d", - TORCH_FN(at::native::vulkan::aten::upsample_nearest2d)); - m.impl( - "_adaptive_avg_pool2d", - TORCH_FN(at::native::vulkan::aten::adaptive_avg_pool2d)); - m.impl("avg_pool2d", TORCH_FN(at::native::vulkan::aten::avg_pool2d)); - m.impl("max_pool2d", TORCH_FN(at::native::vulkan::aten::max_pool2d)); - m.impl("_cat", TORCH_FN(at::native::vulkan::aten::cat)); - m.impl("mul.Scalar", TORCH_FN(at::native::vulkan::aten::mul_scalar)); - m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar)); - m.impl( - "convolution_overrideable", at::native::vulkan::aten::convolution); - m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_); - m.impl("relu_", at::native::vulkan::aten::relu_); - m.impl("add_.Tensor", at::native::vulkan::aten::add_); -} - -#endif /* USE_VULKAN_API */ - -Tensor& copy_from_vulkan_(Tensor& self, const Tensor& src) { - TORCH_INTERNAL_ASSERT( - src.device().type() == DeviceType::Vulkan, - "copy_from_vulkan input tensor's device is not Vulkan"); - TORCH_INTERNAL_ASSERT( - self.device().is_cpu(), - "copy_from_vulkan is implemented only for CPU device output"); - TORCH_INTERNAL_ASSERT( - self.layout() == Layout::Strided, - "copy_from_vulkan is implemented only for Strided layout output"); - TORCH_INTERNAL_ASSERT( - 
self.scalar_type() == ScalarType::Float, - "copy_from_vulkan is implemented only for float dtype output, got:", - self.scalar_type()); - TORCH_INTERNAL_ASSERT( - self.is_contiguous(), - "copy_from_vulkan is implemented only for contiguous output tensor"); - - const auto& vtensor = vtensor_from_vulkan(src); - vtensor.copy_data_to_host(self.data_ptr()); - return self; -} - -Tensor& copy_to_vulkan_(Tensor& self, const Tensor& src) { - TORCH_INTERNAL_ASSERT( - self.device().type() == DeviceType::Vulkan, - "copy_to_vulkan output tensor's device is not Vulkan"); - TORCH_INTERNAL_ASSERT( - src.device().is_cpu(), - "copy_to_vulkan is implemented only for CPU device input"); - TORCH_INTERNAL_ASSERT( - src.layout() == Layout::Strided, - "copy_to_vulkan is implemented only for Strided layout input"); - TORCH_INTERNAL_ASSERT( - src.scalar_type() == ScalarType::Float, - "copy_to_vulkan is implemented only for float dtype"); - - auto cpu_tensor_contiguous = src.contiguous(); - VulkanTensor& vtensor = vtensor_from_vulkan(self); - vtensor.set_data_from_host(cpu_tensor_contiguous.data_ptr()); - return self; -} - -Tensor& vulkan_copy_impl_(Tensor& self, const Tensor& src) { - if (src.device().type() == at::kVulkan && self.device().type() == at::kCPU) { - return copy_from_vulkan_(self, src); - } - if (src.device().type() == at::kCPU && self.device().type() == at::kVulkan) { - return copy_to_vulkan_(self, src); - } - TORCH_INTERNAL_ASSERT( - src.device().type() == DeviceType::Vulkan, - "vulkan_copy_ is implemented only for CPU,Strided,float->Vulkan; Vulkan->CPU,Strided,float"); - return self; -} - -struct VulkanImpl final : public at::vulkan::VulkanImplInterface { - bool is_vulkan_available() const override { - return at::native::vulkan::detail::is_available(); - } - - Tensor& vulkan_copy_(Tensor& self, const Tensor& src) const override { - return vulkan_copy_impl_(self, src); - } -}; -static at::vulkan::VulkanImplRegistrar g_vulkan_impl(new VulkanImpl()); - -} // namespace aten - -using detail::VulkanTensor; -Tensor convolution_prepack_weights(const Tensor& weight) { - const auto wsizes = weight.sizes(); - TORCH_INTERNAL_ASSERT( - wsizes.size() == 4, - "convolution_prepack_weights: Expected 4-dimensional weight"); - - const int64_t OC = wsizes[0]; - const int64_t C = wsizes[1]; - const int64_t KH = wsizes[2]; - const int64_t KW = wsizes[3]; - VulkanTensor voutput = - VulkanTensor{{UP_DIV(OC, 4), UP_DIV(C, 4), KH * KW, 16}}; - - vulkan::detail::conv2d_prepack_weights( - voutput, weight.data_ptr(), OC, C, KH, KW); - return aten::new_with_vtensor_vulkan( - std::move(voutput), at::device(at::kVulkan).dtype(at::kFloat)); -} - -Tensor convolution_prepacked( - const Tensor& input, // Vulkan - const IntArrayRef weightSizes, - const Tensor& weight_prepacked_vulkan, // Vulkan - const c10::optional& bias, // Vulkan|CPU - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - int64_t groups, - const float output_min, - const float output_max) { - TORCH_INTERNAL_ASSERT( - input.dim() == 4, "Vulkan convolution: Expected 4-dimensional input"); - TORCH_INTERNAL_ASSERT( - weight_prepacked_vulkan.dim() == 4, - "Vulkan convolution: Expected 4-dimensional weight"); - vulkan::Conv2DParams params{ - input.sizes(), weightSizes, padding, stride, dilation, groups}; - TORCH_INTERNAL_ASSERT( - groups == 1 || groups == params.C, - "Vulkan convolution: only nogroup or depthwise convolutions supported"); - const VulkanTensor& vinput = aten::vtensor_from_vulkan(input); - const VulkanTensor& vweight = - 
aten::vtensor_from_vulkan(weight_prepacked_vulkan); - VulkanTensor voutput = - VulkanTensor{{params.N, params.OC, params.OH, params.OW}}; - const bool hasBias = bias.has_value() && bias->defined(); - if (hasBias && bias->is_vulkan()) { - const VulkanTensor& vbias = aten::vtensor_from_vulkan(*bias); - vulkan::detail::conv2d( - voutput, vinput, vweight, vbias, params, output_min, output_max); - } else { - vulkan::detail::conv2d( - voutput, - vinput, - vweight, - hasBias ? c10::make_optional((*bias).data_ptr()) - : c10::nullopt, - params, - output_min, - output_max); - } - return aten::new_with_vtensor_vulkan(std::move(voutput), input.options()); -} - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanAten.h b/aten/src/ATen/native/vulkan/VulkanAten.h deleted file mode 100644 index 8345ff6ac065..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanAten.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include - -namespace at { -namespace native { -namespace vulkan { - -Tensor convolution_prepack_weights(const at::Tensor& weight); - -Tensor convolution_prepacked( - const at::Tensor& input, // Vulkan - IntArrayRef weightSizes, - const at::Tensor& weight_prepacked_vulkan, // Vulkan - const c10::optional& bias, // Vulkan|CPU - IntArrayRef padding, - IntArrayRef stride, - IntArrayRef dilation, - int64_t groups, - const float output_min, - const float output_max); - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanCommon.h b/aten/src/ATen/native/vulkan/VulkanCommon.h deleted file mode 100644 index 39d9c3bc129c..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanCommon.h +++ /dev/null @@ -1,92 +0,0 @@ -#pragma once - -#include - -#include - -#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ROUND_UP(x, y) (((x) + (y) - (1)) / (y) * (y)) -#define ALIGN_UP4(x) ROUND_UP((x), 4) - -namespace at { -namespace native { -namespace vulkan { - -struct ContextConv2D final { - at::Tensor weight_prepacked_vulkan_; - c10::optional bias_vulkan_; - std::array weight_size_; - std::array padding_; - std::array stride_; - std::array dilation_; - int64_t groups_; - float output_min_; - float output_max_; - - ContextConv2D() = delete; - - ContextConv2D( - at::Tensor&& weight_prepacked_vulkan, - c10::optional&& bias_vulkan, - std::array weight_size, - std::array padding, - std::array stride, - std::array dilation, - int64_t groups, - float output_min, - float output_max) - : weight_prepacked_vulkan_(std::move(weight_prepacked_vulkan)), - bias_vulkan_(std::move(bias_vulkan)), - weight_size_(weight_size), - padding_(padding), - stride_(stride), - dilation_(dilation), - groups_(groups), - output_min_(output_min), - output_max_(output_max) {} - - ContextConv2D(ContextConv2D&&) = default; - ContextConv2D& operator=(ContextConv2D&&) = default; - - ~ContextConv2D() {} - - static constexpr float kMin = -std::numeric_limits::infinity(); - static constexpr float kMax = std::numeric_limits::infinity(); -}; - -namespace detail { -template -inline constexpr To safe_downcast_internal(const From v) { - typedef std::common_type_t Type; - constexpr Type min{static_cast(std::numeric_limits::lowest())}; - constexpr Type max{static_cast(std::numeric_limits::max())}; - TORCH_CHECK(min <= v && v <= max, "Cast failed: out of range"); - return static_cast(v); -} - -template -inline constexpr bool is_signed_to_unsigned() { - return std::is_signed::value && std::is_unsigned::value; -} - -template < - typename To, - 
typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From v) { - TORCH_CHECK(v >= From{}, "Cast failed: negative signed to unsigned"); - return safe_downcast_internal(v); -} - -template < - typename To, - typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From v) { - return safe_downcast_internal(v); -} - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanConvolution.cpp b/aten/src/ATen/native/vulkan/VulkanConvolution.cpp deleted file mode 100644 index d9ef04c14036..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanConvolution.cpp +++ /dev/null @@ -1,106 +0,0 @@ -#include - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { -namespace convolution2d { - -namespace { -// TODO: This function is not used. -bool available( - const Tensor& weight, - const c10::optional& bias, - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, - const float output_min, - const float output_max) { - return at::native::is_vulkan_available() && (4 == weight.ndimension()) && - (at::Backend::CPU == weight.options().backend()) && - (kFloat == weight.scalar_type()); -} - -} // namespace - -c10::intrusive_ptr createConv2dClampPrePackOpContext( - Tensor&& weight, - c10::optional&& bias, - std::vector&& stride, - std::vector&& padding, - std::vector&& dilation, - const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { - return vulkan::VulkanConv2dOpContext::create_context( - std::move(weight), - std::move(bias), - std::move(padding), - std::move(stride), - std::move(dilation), - groups, - output_min, - output_max); -} - -Tensor conv2d_clamp_run( - const Tensor& input, - const c10::intrusive_ptr& op_context) { - return op_context->run(input); -} - -ContextConv2D create( - const Tensor& weight, - const c10::optional& bias, - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, - const float output_min, - const float output_max) { - const auto padding_expanded = expand_param_if_needed(padding, "padding", 2); - const auto stride_expanded = expand_param_if_needed(stride, "stride", 2); - const auto dilation_expanded = - expand_param_if_needed(dilation, "dilation", 2); - const Tensor weight_nchw = weight.contiguous(); - const auto ws = weight_nchw.sizes(); - return ContextConv2D{ - groups == 1 ? at::native::vulkan::convolution_prepack_weights(weight_nchw) - : weight_nchw.vulkan(), - bias.has_value() ? c10::make_optional((*bias).vulkan()) : c10::nullopt, - // TODO: Are we sure these tensors will always come into this fucntion with the - // the dimensions expected below? What if they don't? This may trigger a segfault. - // TODO: If we need TORCH_CHECK(available()) calls here as a sanity check, add it. 
- {{ws[0], ws[1], ws[2], ws[3]}}, - {padding_expanded[0], padding_expanded[1]}, - {stride_expanded[0], stride_expanded[1]}, - {dilation_expanded[0], dilation_expanded[1]}, - groups, - output_min, - output_max}; -} - -Tensor run(const ContextConv2D& context, const Tensor& input) { - return at::native::vulkan::convolution_prepacked( - input, - context.weight_size_, - context.weight_prepacked_vulkan_, - context.bias_vulkan_, - context.padding_, - context.stride_, - context.dilation_, - context.groups_, - context.output_min_, - context.output_max_); -} - -} // namespace convolution2d -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanConvolution.h b/aten/src/ATen/native/vulkan/VulkanConvolution.h deleted file mode 100644 index e956d133a155..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanConvolution.h +++ /dev/null @@ -1,132 +0,0 @@ -#pragma once - -#include - -#include -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -struct Conv2DParams final { - int64_t N; // batch size - int64_t C; // channels - int64_t H; // input height - int64_t W; // input width - int64_t OC; // output channels - int64_t KH; // kernel height - int64_t KW; // kernel width - int64_t SY; // stride y (height) - int64_t SX; // stride x (width) - int64_t PY; // padding y (height) - int64_t PX; // padding x (width) - int64_t DY; // dilation y (height) - int64_t DX; // dilation x (width) - int64_t G; // groups - int64_t OW; // output width - int64_t OH; // output height - int64_t OC_4; - int64_t C_4; - - Conv2DParams() = delete; - Conv2DParams( - c10::IntArrayRef inputSizes, - int64_t OC, - int64_t KH, - int64_t KW, - int64_t SY, - int64_t SX, - int64_t PY, - int64_t PX, - int64_t DY, - int64_t DX, - int64_t G) - // TODO: What if inputSizes is not of the expected dimensionality? - // Should check prior to indexing. - : N(inputSizes[0]), - C(inputSizes[1]), - H(inputSizes[2]), - W(inputSizes[3]), - OC(OC), - KH(KH), - KW(KW), - SY(SY), - SX(SX), - PY(PY), - PX(PX), - DY(DY), - DX(DX), - G(G) { - OC_4 = UP_DIV(OC, 4); - C_4 = UP_DIV(C, 4); - const int64_t KWE = (KW - 1) * DX + 1; - const int64_t KHE = (KH - 1) * DY + 1; - OW = ((W - KWE + 2 * PX) / SX) + 1; - OH = ((H - KHE + 2 * PY) / SY) + 1; - } - - Conv2DParams( - c10::IntArrayRef inputSizes, - c10::IntArrayRef weightSizes, - c10::IntArrayRef padding, - c10::IntArrayRef stride, - c10::IntArrayRef dilation, - int64_t groups) - // TODO: What if these parameters are not of the correct dimensionality? - // Should check prior to indexing. 
- : Conv2DParams( - inputSizes, - weightSizes[0], - weightSizes[2], - weightSizes[3], - stride[0], - stride[1], - padding[0], - padding[1], - dilation[0], - dilation[1], - groups) {} - - std::vector output_sizes() const { - return {N, OC, OH, OW}; - } -}; - -namespace detail { -namespace convolution2d { - -c10::intrusive_ptr -createConv2dClampPrePackOpContext( - Tensor&& weight, - c10::optional&& bias, - std::vector&& stride, - std::vector&& padding, - std::vector&& dilation, - int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); - -Tensor conv2d_clamp_run( - const Tensor& input, - const c10::intrusive_ptr& op_context); - -ContextConv2D create( - const Tensor& weight, - const c10::optional& bias, - const IntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, - const float output_min, - const float output_max); - -Tensor run(const ContextConv2D& context, const Tensor& input); - -} // namespace convolution2d -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOpContext.cpp b/aten/src/ATen/native/vulkan/VulkanOpContext.cpp deleted file mode 100644 index c5e613f4827d..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOpContext.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -c10::intrusive_ptr VulkanConv2dOpContext::create_context( - at::Tensor&& weight, - c10::optional&& bias, - std::vector&& padding, - std::vector&& stride, - std::vector&& dilation, - const int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max) { - auto op_context = vulkan::detail::convolution2d::create( - weight, - bias, - padding, - stride, - dilation, - groups, - output_min ? output_min->to() : vulkan::ContextConv2D::kMin, - output_max ? 
output_max->to() : vulkan::ContextConv2D::kMax); - return c10::make_intrusive( - std::move(weight), - std::move(bias), - std::move(padding), - std::move(stride), - std::move(dilation), - groups, - output_min, - output_max, - std::move(op_context)); -} - -Tensor VulkanConv2dOpContext::run(const Tensor& input) { - return vulkan::detail::convolution2d::run(op_context_, input); -} - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOpContext.h b/aten/src/ATen/native/vulkan/VulkanOpContext.h deleted file mode 100644 index 970b4edca39c..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOpContext.h +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -using SerializationTypeConv2dPrePack = std::tuple< - Tensor, - c10::optional, - std::vector, - std::vector, - std::vector, - int64_t, - c10::optional, - c10::optional>; - -class Conv2dOpContext : public torch::jit::CustomClassHolder { - protected: - Tensor orig_weight_; - c10::optional orig_bias_; - std::vector stride_; - std::vector padding_; - std::vector dilation_; - int64_t groups_; - c10::optional output_min_; - c10::optional output_max_; - - public: - SerializationTypeConv2dPrePack unpack() { - return std::make_tuple( - orig_weight_, - orig_bias_, - stride_, - padding_, - dilation_, - groups_, - output_min_, - output_max_); - } - - virtual Tensor run(const Tensor& input) = 0; -}; - -class VulkanConv2dOpContext final : public Conv2dOpContext { - private: - ContextConv2D op_context_; - - public: - VulkanConv2dOpContext( - Tensor&& weight, - c10::optional&& bias, - std::vector&& padding, - std::vector&& stride, - std::vector&& dilation, - uint64_t groups, - const c10::optional& min, - const c10::optional& max, - ContextConv2D&& op_context) - : op_context_(std::move(op_context)) { - orig_weight_ = std::move(weight); - orig_bias_ = std::move(bias); - padding_ = std::move(padding); - stride_ = std::move(stride); - dilation_ = std::move(dilation); - groups_ = groups; - output_min_ = min; - output_max_ = max; - } - - Tensor run(const Tensor& input) override; - - static c10::intrusive_ptr create_context( - Tensor&& weight, - c10::optional&& bias, - std::vector&& padding, - std::vector&& stride, - std::vector&& dilation, - int64_t groups, - const c10::optional& output_min, - const c10::optional& output_max); -}; - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h b/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h index 9e48de07094a..05c5ce977cd1 100644 --- a/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h +++ b/aten/src/ATen/native/vulkan/VulkanOpaqueTensorImpl.h @@ -24,10 +24,9 @@ struct VulkanOpaqueTensorImpl : public OpaqueTensorImpl { sizes, false), strides_(strides.vec()) { - TensorImpl::set_has_contiguity_policy(TensorImpl::HasContiguityPolicy::CustomBehavior); } - IntArrayRef strides() const override { + IntArrayRef strides_custom() const override { return strides_; } @@ -35,16 +34,13 @@ struct VulkanOpaqueTensorImpl : public OpaqueTensorImpl { return true; } - int64_t stride(int64_t d) const override { - d = at::maybe_wrap_dim(d, this->dim(), false); - return strides_[d]; - } - private: const char* tensorimpl_type_name() const override { return "VulkanOpaqueTensorImpl"; } + // TODO: storing strides separately is unnecessary, the base TensorImpl + // has space for them SmallVector strides_; }; diff --git 
a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp deleted file mode 100644 index 7cbd7479e256..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ /dev/null @@ -1,1307 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -void upsample_nearest2d( - VulkanTensor& output, - const VulkanTensor& input, - int64_t IH, - int64_t IW, - int64_t OH, - int64_t OW, - int64_t IN, - int64_t IC, - float scaleH, - float scaleW) { - auto device = context().device(); - int64_t C = IN * IC; - struct ConstBlock { - float scaleX; - float scaleY; - }; - ConstBlock cb{scaleW, - scaleH}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(upsample_nearest2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(OW, OH, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -VulkanTensor reshape_copy( - const VulkanTensor& input, - std::vector shape) { - input.sync_image_to_buffer(); - VulkanTensor output{infer_size(shape, input.numel())}; - copy_buffer_to_buffer( - *(input.buffer()), *(output.buffer()), input.buffer()->sizeBytes()); - return output; -} - -VulkanTensor cat( - VulkanTensor& output, - ArrayRef inputs, - int64_t dim) { - VkDeviceSize outputOffset = 0; - for (const auto& input : inputs) { - input.sync_image_to_buffer(); - const auto sizeBytes = sizeof(float) * input.numel(); - copy_buffer_to_buffer( - *(input.buffer()), *(output.buffer()), sizeBytes, 0, outputOffset); - outputOffset += sizeBytes; - } - return output; -} - -void adaptive_avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int64_t IH, - const int64_t IW, - const int64_t OH, - const int64_t OW, - const int64_t IN, - const int64_t IC) { - auto device = context().device(); - int64_t C = IN * IC; - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - 
GLSL_SPV(adaptive_avg_pool2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(OW, OH, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void max_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW, - const int dilationH, - const int dilationW) { - auto device = context().device(); - const auto c = _n * _c; - struct ConstBlock { - int32_t inputSize[4]; - int32_t outputSize[4]; - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t padding[2]; - int32_t dilate[2]; - }; - ConstBlock cb{ - {iW, iH, c, 0}, - {oW, oH, c, 0}, - {kW, kH}, - {dW, dH}, - {padW, padH}, - {dilationW, dilationH}, - }; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(max_pool2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(oW, oH, c, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW) { - auto device = context().device(); - const auto c = _n * _c; - struct ConstBlock { - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t padding[2]; - }; - ConstBlock cb{ - {kW, kH}, - {dW, dH}, - {padW, padH}, - }; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - 
GLSL_SPV(avg_pool2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.image()->addImageMemoryBarrierToShaderRead(computeUnit.commandBuffer()); - computeUnit.dispatchCommandBuffer(oW, oH, c, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -VulkanTensor transpose( - const VulkanTensor& input, - const int64_t dim0, - const int64_t dim1) { - const auto idim = input.dim(); - TORCH_INTERNAL_ASSERT( - idim <= 6, "Vulkan transpose is implemented only for dim <= 6"); - auto device = context().device(); - struct ConstBlock { - int32_t istrides[8]; - int32_t ostrides[8]; - int32_t odims[8]; - int32_t storageOffset; - }; - - auto isizes = input.sizes(); - auto osizes = isizes; - std::swap(osizes[dim0], osizes[dim1]); - VulkanTensor output{osizes}; - output.allocate_storage(); - - std::array idims8; - idims8.fill(1); - std::array odims8; - odims8.fill(1); - std::copy(isizes.cbegin(), isizes.cend(), idims8.end() - idim); - std::copy(osizes.cbegin(), osizes.cend(), odims8.end() - idim); - std::array istrides8; - istrides8.fill(1); - std::array ostrides8; - ostrides8.fill(1); - for (int i = 6; i >= 0; --i) { - istrides8[i] = idims8[i + 1] * istrides8[i + 1]; - ostrides8[i] = odims8[i + 1] * ostrides8[i + 1]; - } - std::swap(istrides8[8 - idim + dim0], istrides8[8 - idim + dim1]); - - ConstBlock cb{}; - std::copy(istrides8.cbegin(), istrides8.cend(), std::begin(cb.istrides)); - std::copy(ostrides8.cbegin(), ostrides8.cend(), std::begin(cb.ostrides)); - std::copy(odims8.cbegin(), odims8.cend(), std::begin(cb.odims)); - cb.storageOffset = 0; - - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.buffer()->bind(descriptorSet, 0); - input.buffer()->bind(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(permute), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.buffer()->addBufferMemoryBarrier( - computeUnit.commandBuffer(), 0, input.buffer()->sizeBytes()); - computeUnit.dispatchCommandBuffer( - odims8[6] * odims8[7], - odims8[4] * odims8[5], - odims8[2] * odims8[3], - workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); - return output; -} - -VulkanTensor slice( - const VulkanTensor& input, - const int64_t dim, - const int64_t _start, - const int64_t _end, - const int64_t step) { - const auto isizes = input.sizes(); - auto osizes = isizes; - auto start = _start; - auto end = _end; - if (start < 0) { - start += isizes[dim]; - } - if (end < 0) { - end += isizes[dim]; - } - if (start < 0) { - start = 0; - } else if (start >= isizes[dim]) { - start = isizes[dim]; - } - if (end < start) { - end = start; - } else if (end >= isizes[dim]) { - end = 
isizes[dim]; - } - const auto len = end - start; - osizes[dim] = (len + step - 1) / step; - - VulkanTensor output{osizes}; - output.allocate_storage(); - - auto idim = input.dim(); - std::array idims8; - idims8.fill(1); - std::copy(isizes.cbegin(), isizes.cend(), idims8.end() - idim); - std::array istrides8; - istrides8.fill(1); - for (int i = 6; i >= 0; --i) { - istrides8[i] = idims8[i + 1] * istrides8[i + 1]; - } - - std::array odims8 = idims8; - std::array ostrides8 = istrides8; - - ostrides8[8 - idim + dim] *= step; - auto storage_offset = start * istrides8[8 - idim + dim]; - - auto device = context().device(); - struct ConstBlock { - int32_t istrides[8]; - int32_t ostrides[8]; - int32_t odims[8]; - int32_t storageOffset; - }; - - ConstBlock cb{}; - std::copy(istrides8.cbegin(), istrides8.cend(), std::begin(cb.istrides)); - std::copy(ostrides8.cbegin(), ostrides8.cend(), std::begin(cb.ostrides)); - std::copy(odims8.cbegin(), odims8.cend(), std::begin(cb.odims)); - cb.storageOffset = storage_offset; - - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.buffer()->bind(descriptorSet, 0); - input.buffer()->bind(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(permute), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - input.buffer()->addBufferMemoryBarrier( - computeUnit.commandBuffer(), 0, input.buffer()->sizeBytes()); - computeUnit.dispatchCommandBuffer( - odims8[6] * odims8[7], - odims8[4] * odims8[5], - odims8[2] * odims8[3], - workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); - return output; -} - -void add( - VulkanTensor& output, - const VulkanTensor& input0, - const VulkanTensor& input1, - float alpha) { - auto odim = output.dim(); - TORCH_INTERNAL_ASSERT( - odim <= 4, "Vulkan add is implemented for dim <= 4, output dim > 4"); - auto i0dim = input0.dim(); - TORCH_INTERNAL_ASSERT( - i0dim <= 4, "Vulkan add is implemented for dim <= 4, input0 dim > 4"); - auto i1dim = input1.dim(); - TORCH_INTERNAL_ASSERT( - i1dim <= 4, "Vulkan add is implemented for dim <= 4, input1 dim > 4"); - - auto os = output.sizes(); - auto i0s = input0.sizes(); - auto i1s = input1.sizes(); - - std::array os4 = {1, 1, 1, 1}; - std::copy(os.begin(), os.end(), os4.end() - odim); - std::array i0s4 = {1, 1, 1, 1}; - std::copy(i0s.cbegin(), i0s.cend(), i0s4.end() - i0dim); - std::array i1s4 = {1, 1, 1, 1}; - std::copy(i1s.cbegin(), i1s.cend(), i1s4.end() - i1dim); - - TORCH_INTERNAL_ASSERT( - (os4 == i0s4) && (i0s4 == i1s4), - "Vulkan add expects the same dimensions for all operands"); - - auto C = os4[0] * os4[1]; - auto H = os4[2]; - auto W = os4[3]; - - auto device = context().device(); - struct ConstBlock { - float alpha; - }; - ConstBlock cb{alpha}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; 
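// Note on the size handling above: each operand's sizes are right-aligned into
// a 4-element array, so (taking illustrative shapes that are not from this
// diff) a 3-dim tensor of sizes {2, 8, 16} becomes {1, 2, 8, 16}, giving
// C = 1 * 2 = 2, H = 8, W = 16. All three operands must resolve to the same
// padded shape, since this kernel performs no broadcasting.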
- VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input0.image()->bindShaderRead(descriptorSet, 1); - input1.image()->bindShaderRead(descriptorSet, 2); - constBuffer.bind(descriptorSet, 3); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(add), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input0.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - input1.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void add(VulkanTensor& output, const VulkanTensor& input, const float s) { - const auto sizes = input.sizes(); - - const auto C = c10::multiply_integers(sizes.cbegin(), sizes.cend() - 2); - const auto C_4 = UP_DIV(C, 4); - const auto H = sizes[2]; - const auto W = sizes[3]; - - auto device = context().device(); - struct ConstBlock { - float s; - }; - ConstBlock cb{s}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(add_scalar), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void mul(VulkanTensor& output, const VulkanTensor& input, const float s) { - const auto sizes = input.sizes(); - - const auto C = c10::multiply_integers(sizes.cbegin(), sizes.cend() - 2); - const auto C_4 = UP_DIV(C, 4); - const auto H = sizes[2]; - const auto W = sizes[3]; - - auto device = context().device(); - struct ConstBlock { - float s; - }; - ConstBlock cb{s}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - 
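// The C_4 computed above reflects the 4-channel texel packing used by the
// image-backed tensors: for an illustrative input of sizes {1, 10, 8, 8}
// (not from this diff), C = 1 * 10 = 10 and C_4 = UP_DIV(10, 4) = 3, so the
// dispatch below covers an 8 x 8 x 3 grid of texels.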
std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mul_scalar), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -VBuffer kernelNCHW_OCHW_repack_O4C4HWi4o4( - const float* weights, - const int OC, - const int C, - const int KH, - const int KW) { - const auto C_4 = UP_DIV(C, 4); - const auto kBufSizeNumel = ALIGN_UP4(OC) * ALIGN_UP4(C) * KH * KW; - auto size = sizeof(float) * kBufSizeNumel; - VBuffer kernelBuffer{size}; - const int oc_4SizeNumel = KW * KH * C_4 * 16; - auto mappedMemory = kernelBuffer.map(); - if (mappedMemory.ptr()) { - float* basePtr = (float*)mappedMemory.ptr(); - memset(basePtr, 0, size); - const float* src = weights; - int ridx = 0; - for (const auto oc : c10::irange(OC)) { - int oc_4 = oc / 4; - int oc_4_i = oc % 4; - float* dst_oc = basePtr + oc_4 * oc_4SizeNumel; - for (const auto ic : c10::irange(C)) { - int ic_4 = ic / 4; - int ic_4_i = ic % 4; - float* dst_ic = dst_oc + ic_4 * KW * KH * 16; - for (const auto ky : c10::irange(KH)) { - float* dst_ky = dst_ic + ky * KW * 16; - for (const auto kx : c10::irange(KW)) { - float* dst_kx = dst_ky + kx * 16; - dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; - } - } - } - } - } - mappedMemory.flushWriteToDevice(); - return kernelBuffer; -} - -VBuffer bufferFromOptionalHostData( - c10::optional data, - const uint32_t dataSize, - const uint32_t bufferSize) { - TORCH_INTERNAL_ASSERT( - dataSize <= bufferSize, - "buffer size(", - bufferSize, - ") is not enough for data(", - dataSize, - ")"); - const auto sizeAligned = - ROUND_UP(bufferSize, context().limits().minStorageBufferOffsetAlignment); - VBuffer buffer{sizeAligned}; - if (data.has_value()) { - buffer.copy_from_host_to_device(*data, dataSize); - } else { - buffer.set_zeros(); - } - return buffer; -} - -VBuffer bufferZeros(const uint32_t size) { - VBuffer buffer{size}; - buffer.set_zeros(); - return buffer; -} - -void conv2d_depthwise( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight, - const VBuffer& biasBuffer, - const Conv2DParams& params, - c10::optional output_min, - c10::optional output_max) { - TORCH_INTERNAL_ASSERT(params.G == params.C); - auto osizes = output.sizes(); - TORCH_INTERNAL_ASSERT(osizes[2] == params.OH); - TORCH_INTERNAL_ASSERT(osizes[3] == params.OW); - struct ConstBlock { - int32_t padding[2]; - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t dilate[2]; - int32_t inputSize[4]; - int32_t outputSize[4]; - float outputMin; - float outputMax; - }; - ConstBlock cb{ - {safe_downcast(params.PX), safe_downcast(params.PY)}, - {safe_downcast(params.KW), 
safe_downcast(params.KH)}, - {safe_downcast(params.SX), safe_downcast(params.SY)}, - {safe_downcast(params.DX), safe_downcast(params.DY)}, - {safe_downcast(params.OW), - safe_downcast(params.OH), - safe_downcast(params.OC_4), - 0}, - {safe_downcast(params.W), - safe_downcast(params.H), - safe_downcast(params.C_4), - 0}, - output_min ? *output_min : -std::numeric_limits::infinity(), - output_max ? *output_max : std::numeric_limits::infinity()}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - auto device = context().device(); - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - weight.image()->bindShaderRead(descriptorSet, 2); - biasBuffer.bind(descriptorSet, 3); - constBuffer.bind(descriptorSet, 4); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(conv2d_dw_clamp), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - weight.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer( - params.OW, params.OH, params.OC_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void conv2d_depthwise( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - conv2d_depthwise( - output, - input, - weight, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); -} - -void conv2d_depthwise( - VulkanTensor& output, - const VulkanTensor& input, - const float* weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - VulkanTensor weightTensor{{params.OC, params.KH, params.KW}}; - weightTensor.set_data_from_host(weight); - conv2d_depthwise( - output, - input, - weightTensor, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); -} - -ImageSizes conv2d_prepack_weights_image_sizes( - int64_t argOC, - int64_t argC, - int64_t KH, - int64_t KW) { - const int32_t C = safe_downcast(argC); - const int32_t OC = safe_downcast(argOC); - const int32_t Cup4 = ALIGN_UP4(C); - const int32_t OC_4 = UP_DIV(OC, 4); - const int32_t Z = safe_downcast(KH) * safe_downcast(KW); - return {{Cup4, OC_4, Z}, {Cup4, OC_4, Z}}; -} - -void conv2d_prepack_weights_to_image( - VImage& image, - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW) { - auto kernelBuffer = 
kernelNCHW_OCHW_repack_O4C4HWi4o4(weight, OC, C, KH, KW); - auto OC_4 = UP_DIV(OC, 4); - auto C_4 = UP_DIV(C, 4); - - auto expectedSizes = conv2d_prepack_weights_image_sizes(OC, C, KH, KW); - TORCH_INTERNAL_ASSERT( - image.sizes() == expectedSizes.imageSize, - "Out VImage sizes do not match expected"); - - struct ConstBlock { - int32_t KWxKH; - int32_t C_4; - }; - ConstBlock cb{safe_downcast(KW * KH), safe_downcast(C_4)}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - context().device(), - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - image.bindStorageImage(descriptorSet, 0); - kernelBuffer.bind(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{1, 1, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(KO4C4HW_to_image), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - image.addImageMemoryBarrierToGeneral(commandBuffer); - kernelBuffer.addBufferMemoryBarrier( - commandBuffer, 0, kernelBuffer.sizeBytes()); - computeUnit.addMemoryBarrier( - VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_WRITE_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT); - computeUnit.dispatchCommandBuffer(C_4, OC_4, KH * KW, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(context().device(), descriptorPool, nullptr); - vkDestroyDescriptorSetLayout( - context().device(), descriptorSetLayout, nullptr); -} - -VImage conv2d_prepack_weights_image( - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW) { - VImage image{conv2d_prepack_weights_image_sizes(OC, C, KH, KW)}; - conv2d_prepack_weights_to_image(image, weight, OC, C, KH, KW); - return image; -} - -void conv2d_prepack_weights( - VulkanTensor& output, - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW) { - auto imageSizes = conv2d_prepack_weights_image_sizes(OC, C, KH, KW); - conv2d_prepack_weights_to_image( - *(output.image(imageSizes)), weight, OC, C, KH, KW); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VImage& kernelImage, - const VBuffer& biasBuffer, - const Conv2DParams& params, - c10::optional output_min, - c10::optional output_max) { - TORCH_INTERNAL_ASSERT( - params.G == 1, "Prepacked kernel VImage for non-group conv2d only"); - auto osizes = output.sizes(); - TORCH_INTERNAL_ASSERT( - osizes[2] == params.OH, - "Output tensor dims do not match specified conv2d params"); - TORCH_INTERNAL_ASSERT( - osizes[3] == params.OW, - "Output tensor dims do not match specified conv2d params"); - - struct ConstBlock { - int32_t padding[2]; - int32_t kernelSize[2]; - int32_t stride[2]; - int32_t dilate[2]; - int32_t inputSize[4]; - int32_t outputSize[4]; - float outputMin; - float outputMax; - }; - float outputMin = - output_min ? *output_min : -std::numeric_limits::infinity(); - float outputMax = - output_max ? 
*output_max : std::numeric_limits::infinity(); - ConstBlock cb{ - {safe_downcast(params.PX), safe_downcast(params.PY)}, - {safe_downcast(params.KW), safe_downcast(params.KH)}, - {safe_downcast(params.SX), safe_downcast(params.SY)}, - {safe_downcast(params.DX), safe_downcast(params.DY)}, - {safe_downcast(params.OW), - safe_downcast(params.OH), - safe_downcast(params.OC_4), - safe_downcast(params.OC)}, - {safe_downcast(params.W), - safe_downcast(params.H), - safe_downcast(params.C_4), - safe_downcast(params.C)}, - outputMin, - outputMax}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - auto device = context().device(); - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - kernelImage.bindShaderRead(descriptorSet, 2); - biasBuffer.bind(descriptorSet, 3); - constBuffer.bind(descriptorSet, 4); - - WorkGroupSize workGroupSize{1, 1, safe_downcast(params.OC_4)}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(conv2d_nogroup_clamp), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - kernelImage.addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer( - UP_DIV(params.OW, 4 * workGroupSize.x), - UP_DIV(params.OH, workGroupSize.y), - UP_DIV(params.OC_4, workGroupSize.z)); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VImage& kernelImage, - const c10::optional bias, - const Conv2DParams& params, - c10::optional output_min, - c10::optional output_max) { - TORCH_INTERNAL_ASSERT( - params.G == 1, "Prepacked kernel VImage for non-group conv2d only"); - conv2d( - output, - input, - kernelImage, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - if (params.G > 1) { - conv2d_depthwise( - output, - input, - weight_prepacked, - bufferFromOptionalHostData( - bias, - sizeof(float) * params.OC, - sizeof(float) * ALIGN_UP4(params.OC)), - params, - output_min, - output_max); - return; - } - - conv2d( - output, - input, - *(weight_prepacked.image()), - bias, - params, - output_min, - output_max); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - const VulkanTensor& bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - if (params.G > 1) { - 
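// Grouped convolutions are dispatched to the depthwise kernel, which itself
// asserts G == C; non-grouped calls fall through to the prepacked-image path
// below.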
conv2d_depthwise( - output, - input, - weight_prepacked, - *(bias.buffer()), - params, - output_min, - output_max); - return; - } - - conv2d( - output, - input, - *(weight_prepacked.image()), - *(bias.buffer()), - params, - output_min, - output_max); -} - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const float* weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min, - c10::optional output_max) { - if (params.G > 1) { - TORCH_INTERNAL_ASSERT( - params.G == params.C, - "Vulkan conv2d supports only no-group and depthwise"); - conv2d_depthwise( - output, input, weight, bias, params, output_min, output_max); - return; - } - - conv2d( - output, - input, - conv2d_prepack_weights_image( - weight, params.OC, params.C, params.KH, params.KW), - bias, - params, - output_min, - output_max); -} - -void clamp( - VulkanTensor& output, - const VulkanTensor& input, - float min, - float max) { - auto sizes = output.sizes(); - auto C = sizes[0] * sizes[1]; - auto H = sizes[2]; - auto W = sizes[3]; - - auto device = context().device(); - struct ConstBlock { - float min; - float max; - }; - ConstBlock cb{min, max}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{8, 8, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(clamp), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(W, H, C, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void addmm( - VulkanTensor& output, - c10::optional t, - const VulkanTensor& m1, - const VulkanTensor& m2, - float beta, - float alpha) { - bool hasT = t.has_value(); - const auto m1Sizes = m1.sizes(); - const auto m2Sizes = m2.sizes(); - TORCH_INTERNAL_ASSERT(m1Sizes.size() == 2); - TORCH_INTERNAL_ASSERT(m2Sizes.size() == 2); - const auto m1W = m1Sizes[1]; - const auto m1C = 1; - const auto m2H = m2Sizes[0]; - const auto m2C = 1; - const auto OH = m1Sizes[0]; - const auto OW = m2Sizes[1]; - - TORCH_INTERNAL_ASSERT(m1W == m2H); - TORCH_INTERNAL_ASSERT(m1C == m2C); - - const auto C = m1C; - const auto C_4 = UP_DIV(C, 4); - - auto device = context().device(); - - struct ConstBlock { - float alpha; - float beta; - }; - ConstBlock cb{alpha, beta}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{}; - if (hasT) { - descriptorTypes = { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - 
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }; - } else { - descriptorTypes = { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - }; - } - - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - m1.image()->bindShaderRead(descriptorSet, 1); - m2.image()->bindShaderRead(descriptorSet, 2); - if (hasT) { - (*t).image()->bindShaderRead(descriptorSet, 3); - constBuffer.bind(descriptorSet, 4); - } - - WorkGroupSize workGroupSize{8, 8, 1}; - if (hasT) { - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(addmm), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - m1.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - m2.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - (*t).image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(OW, OH, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - } else { - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mm), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - m1.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - m2.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(OW, OH, C_4, workGroupSize); - computeUnit.endCommandBuffer(); - computeUnit.submitAndWaitCommandBuffer(); - } - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -void mean(VulkanTensor& output, const VulkanTensor& input) { - auto isizes = input.sizes(); - int32_t N = safe_downcast(isizes[0]); - int32_t C = safe_downcast(isizes[1]); - int32_t H = safe_downcast(isizes[2]); - int32_t W = safe_downcast(isizes[3]); - - auto device = context().device(); - struct ConstBlock { - int32_t W; - int32_t H; - }; - ConstBlock cb{W, H}; - VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); - - VkDescriptorSetLayout descriptorSetLayout{}; - VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSet{}; - std::vector descriptorTypes{ - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - createDescriptorSetLayoutSinglePool( - device, - descriptorTypes, - &descriptorSetLayout, - &descriptorPool, - &descriptorSet); - - output.image()->bindStorageImage(descriptorSet, 0); - input.image()->bindShaderRead(descriptorSet, 1); - constBuffer.bind(descriptorSet, 2); - - WorkGroupSize workGroupSize{1, 1, 1}; - auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mean2d), descriptorSetLayout, workGroupSize); - computeUnit.createCommandBuffer(descriptorSet); - auto commandBuffer = computeUnit.commandBuffer(); - output.image()->addImageMemoryBarrierToGeneral(commandBuffer); - input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(C, N, 1, workGroupSize); - computeUnit.endCommandBuffer(); - 
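// The (C, N, 1) dispatch recorded above launches one invocation per
// (channel, batch) pair; the mean2d shader reduces over the W x H plane whose
// extents are passed in through the const block.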
computeUnit.submitAndWaitCommandBuffer(); - vkDestroyDescriptorPool(device, descriptorPool, nullptr); - vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); -} - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/VulkanOps.h b/aten/src/ATen/native/vulkan/VulkanOps.h deleted file mode 100644 index b1064df9e2c1..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanOps.h +++ /dev/null @@ -1,153 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace at { -namespace native { -namespace vulkan { -namespace detail { - -void upsample_nearest2d( - VulkanTensor& output, - const VulkanTensor& input, - int64_t IH, - int64_t IW, - int64_t OH, - int64_t OW, - int64_t N, - int64_t C, - float scaleH, - float scaleW); - -void adaptive_avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int64_t IH, - const int64_t IW, - const int64_t OH, - const int64_t OW, - const int64_t IN, - const int64_t IC); - -void max_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW, - const int dilationH, - const int dilationW); - -void avg_pool2d( - VulkanTensor& output, - const VulkanTensor& input, - const int iH, - const int iW, - const int oH, - const int oW, - const int _n, - const int _c, - const int kH, - const int kW, - const int dH, - const int dW, - const int padH, - const int padW); - -VulkanTensor transpose( - const VulkanTensor& input, - const int64_t dim0, - const int64_t dim1); - -VulkanTensor slice( - const VulkanTensor& input, - const int64_t dim, - const int64_t start, - const int64_t end, - const int64_t step); - -VulkanTensor reshape_copy( - const VulkanTensor& input, - std::vector shape); - -VulkanTensor cat( - VulkanTensor& output, - ArrayRef inputs, - int64_t dim); - -void add( - VulkanTensor& output, - const VulkanTensor& input0, - const VulkanTensor& input1, - float alpha); - -void mul(VulkanTensor& output, const VulkanTensor& input, const float s); - -void add(VulkanTensor& output, const VulkanTensor& input, const float s); - -void conv2d_prepack_weights( - VulkanTensor& output, - const float* weight, - int64_t OC, - int64_t C, - int64_t KH, - int64_t KW); - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const float* weight, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min = c10::nullopt, - c10::optional output_max = c10::nullopt); - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - const c10::optional bias, - const Conv2DParams params, - c10::optional output_min = c10::nullopt, - c10::optional output_max = c10::nullopt); - -void conv2d( - VulkanTensor& output, - const VulkanTensor& input, - const VulkanTensor& weight_prepacked, - const VulkanTensor& bias, - const Conv2DParams params, - c10::optional output_min = c10::nullopt, - c10::optional output_max = c10::nullopt); - -void clamp( - VulkanTensor& output, - const VulkanTensor& input, - float min, - float max); - -void addmm( - VulkanTensor& output, - c10::optional t, - const VulkanTensor& m1, - const VulkanTensor& m2, - float beta, - float alpha); - -void mean(VulkanTensor& output, const VulkanTensor& input); - -} // namespace detail -} // namespace vulkan -} // namespace native -} // namespace at diff --git 
a/aten/src/ATen/native/vulkan/VulkanRegisterOpContextClass.cpp b/aten/src/ATen/native/vulkan/VulkanRegisterOpContextClass.cpp deleted file mode 100644 index 0a1c5fcea72d..000000000000 --- a/aten/src/ATen/native/vulkan/VulkanRegisterOpContextClass.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include - -#include -#include - -namespace at { -namespace native { -namespace vulkan { - -#ifndef USE_VULKAN_API - -using detail::convolution2d::createConv2dClampPrePackOpContext; - -TORCH_LIBRARY(vulkan, m) { - m.class_("Conv2dOpContext") - .def_pickle( - [](const c10::intrusive_ptr& op_context) - -> SerializationTypeConv2dPrePack { // __getstate__ - return op_context->unpack(); - }, - [](SerializationTypeConv2dPrePack state) - -> c10::intrusive_ptr { // __setstate__ - return createConv2dClampPrePackOpContext( - std::move(std::get<0>(state)), - std::move(std::get<1>(state)), - std::move(std::get<2>(state)), - std::move(std::get<3>(state)), - std::move(std::get<4>(state)), - std::move(std::get<5>(state)), - std::move(std::get<6>(state)), - std::move(std::get<7>(state))); - }); -} - -TORCH_LIBRARY(vulkan_prepack, m) { - m.def( - "conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, " - "int[2] padding, int[2] dilation, int groups, " - "Scalar? output_min=None, Scalar? output_max=None) " - "-> __torch__.torch.classes.vulkan.Conv2dOpContext"); - m.def( - "conv2d_clamp_run(Tensor X, " - "__torch__.torch.classes.vulkan.Conv2dOpContext W_prepack) -> Tensor Y"); -} - -TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) { - m.impl("conv2d_clamp_prepack", TORCH_FN(createConv2dClampPrePackOpContext)); -} - -TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) { - m.impl("conv2d_clamp_run", detail::convolution2d::conv2d_clamp_run); -} - -#endif /* USE_VULKAN_API */ - -} // namespace vulkan -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Adapter.cpp b/aten/src/ATen/native/vulkan/api/Adapter.cpp new file mode 100644 index 000000000000..461e1ec92dcc --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp @@ -0,0 +1,398 @@ +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +namespace { + +void find_requested_device_extensions( + VkPhysicalDevice physical_device, + std::vector& enabled_extensions, + const std::vector& requested_extensions) { + uint32_t device_extension_properties_count = 0; + VK_CHECK(vkEnumerateDeviceExtensionProperties( + physical_device, nullptr, &device_extension_properties_count, nullptr)); + std::vector device_extension_properties( + device_extension_properties_count); + VK_CHECK(vkEnumerateDeviceExtensionProperties( + physical_device, + nullptr, + &device_extension_properties_count, + device_extension_properties.data())); + + std::vector enabled_device_extensions; + + for (const auto& requested_extension : requested_extensions) { + for (const auto& extension : device_extension_properties) { + if (strcmp(requested_extension, extension.extensionName) == 0) { + enabled_extensions.push_back(requested_extension); + break; + } + } + } +} + +// +// Print utils +// + +std::string get_device_type_str(const VkPhysicalDeviceType type) { + switch(type) { + case VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU: + return "INTEGRATED_GPU"; + case VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU: + return "DISCRETE_GPU"; + case VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU: + return "VIRTUAL_GPU"; + case VK_PHYSICAL_DEVICE_TYPE_CPU: + return "CPU"; + default: + return "UNKOWN"; + } +} + +std::string get_memory_properties_str(const 
VkMemoryPropertyFlags flags) { + std::bitset<10> values(flags); + std::stringstream ss("|"); + if (values[0]) { + ss << " DEVICE_LOCAL |"; + } + if (values[1]) { + ss << " HOST_VISIBLE |"; + } + if (values[2]) { + ss << " HOST_COHERENT |"; + } + if (values[3]) { + ss << " HOST_CACHED |"; + } + if (values[4]) { + ss << " LAZILY_ALLOCATED |"; + } + + return ss.str(); +} + +std::string get_queue_family_properties_str(const VkQueueFlags flags) { + std::bitset<10> values(flags); + std::stringstream ss("|"); + if (values[0]) { + ss << " GRAPHICS |"; + } + if (values[1]) { + ss << " COMPUTE |"; + } + if (values[2]) { + ss << " TRANSFER |"; + } + + return ss.str(); +} + +} // namespace + +Adapter::Adapter(const VkPhysicalDevice handle, const uint32_t num_queues) + : physical_handle_(handle), + properties_{}, + memory_properties_{}, + queue_families_{}, + num_requested_queues_{num_queues}, + queue_usage_{}, + handle_(VK_NULL_HANDLE), + queues_{}, + num_compute_queues_{}, + has_unified_memory_{false}, + timestamp_compute_and_graphics_{false}, + timestamp_period_{0.f} { + // This should never happen, but double check to be safe + TORCH_CHECK( + VK_NULL_HANDLE != physical_handle_, + "Pytorch Vulkan Adapter: VK_NULL_HANDLE passed to Adapter constructor!") + + vkGetPhysicalDeviceProperties(physical_handle_, &properties_); + vkGetPhysicalDeviceMemoryProperties(physical_handle_, &memory_properties_); + + timestamp_compute_and_graphics_ = properties_.limits.timestampComputeAndGraphics; + timestamp_period_ = properties_.limits.timestampPeriod; + + // Check if there are any memory types have both the HOST_VISIBLE and the + // DEVICE_LOCAL property flags + const VkMemoryPropertyFlags unified_memory_flags = + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + for (const uint32_t i : c10::irange(memory_properties_.memoryTypeCount)) { + if (memory_properties_.memoryTypes[i].propertyFlags | unified_memory_flags) { + has_unified_memory_ = true; + break; + } + } + + uint32_t queue_family_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties( + physical_handle_, &queue_family_count, nullptr); + + queue_families_.resize(queue_family_count); + vkGetPhysicalDeviceQueueFamilyProperties( + physical_handle_, &queue_family_count, queue_families_.data()); + + // Find the total number of compute queues + for (const uint32_t family_i : c10::irange(queue_families_.size())) { + const VkQueueFamilyProperties& properties = queue_families_[family_i]; + // Check if this family has compute capability + if (properties.queueFlags & VK_QUEUE_COMPUTE_BIT) { + num_compute_queues_ += properties.queueCount; + } + } + + queue_usage_.reserve(num_requested_queues_); + queues_.reserve(num_requested_queues_); +} + +Adapter::Adapter(Adapter&& other) noexcept + : physical_handle_(other.physical_handle_), + properties_(other.properties_), + memory_properties_(other.memory_properties_), + queue_families_(std::move(other.queue_families_)), + num_requested_queues_(other.num_requested_queues_), + queue_usage_(std::move(other.queue_usage_)), + handle_(other.handle_), + queues_(std::move(other.queues_)), + num_compute_queues_(other.num_compute_queues_), + has_unified_memory_(other.has_unified_memory_), + timestamp_compute_and_graphics_(other.timestamp_compute_and_graphics_), + timestamp_period_(other.timestamp_period_) { + other.physical_handle_ = VK_NULL_HANDLE; + other.handle_ = VK_NULL_HANDLE; +} + +Adapter::~Adapter() { + if C10_LIKELY(VK_NULL_HANDLE == handle_) { + return; + } + vkDestroyDevice(handle_, nullptr); + handle_ 
= VK_NULL_HANDLE; +} + +void Adapter::init_device() { + // It is possible that multiple threads will attempt to initialize the device + // simultaneously, so lock the mutex before initializing + std::lock_guard lock(mutex_); + + // Do not initialize the device if there are no compute queues available + TORCH_CHECK( + num_compute_queues_ > 0, + "Pytorch Vulkan Adapter: Cannot initialize Adapter as this device does not " + "have any queues that support compute!") + + // This device has already been initialized, no-op + if C10_LIKELY(VK_NULL_HANDLE != handle_) { + return; + } + + // + // Find compute queues up to the requested number of queues + // + + std::vector queue_create_infos; + queue_create_infos.reserve(num_requested_queues_); + + std::vector> queues_to_get; + queues_to_get.reserve(num_requested_queues_); + + uint32_t remaining_queues = num_requested_queues_; + for (const uint32_t family_i : c10::irange(queue_families_.size())) { + const VkQueueFamilyProperties& properties = queue_families_[family_i]; + // Check if this family has compute capability + if (properties.queueFlags & VK_QUEUE_COMPUTE_BIT) { + const uint32_t queues_to_init = std::min( + remaining_queues, properties.queueCount); + + const std::vector queue_priorities(queues_to_init, 1.0f); + queue_create_infos.push_back({ + VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // sType + nullptr, // pNext + 0u, // flags + family_i, // queueFamilyIndex + queues_to_init, // queueCount + queue_priorities.data(), // pQueuePriorities + }); + + for (const uint32_t queue_i : c10::irange(queues_to_init)) { + // Use this to get the queue handle once device is created + queues_to_get.emplace_back(family_i, queue_i); + } + remaining_queues -= queues_to_init; + } + if (remaining_queues == 0) { + break; + } + } + + // + // Create the VkDevice + // + + std::vector requested_device_extensions { + #ifdef VK_KHR_portability_subset + VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, + #endif + }; + + std::vector enabled_device_extensions; + find_requested_device_extensions( + physical_handle_, enabled_device_extensions, requested_device_extensions); + + const VkDeviceCreateInfo device_create_info{ + VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType + nullptr, // pNext + 0u, // flags + static_cast(queue_create_infos.size()), // queueCreateInfoCount + queue_create_infos.data(), // pQueueCreateInfos + 0u, // enabledLayerCount + nullptr, // ppEnabledLayerNames + static_cast(enabled_device_extensions.size()), // enabledExtensionCount + enabled_device_extensions.data(), // ppEnabledExtensionNames + nullptr, // pEnabledFeatures + }; + + const VkResult device_create_res = vkCreateDevice( + physical_handle_, &device_create_info, nullptr, &handle_); + // If device was not created successfully, ensure handle_ is invalid and throw + if (VK_SUCCESS != device_create_res) { + handle_ = VK_NULL_HANDLE; + VK_CHECK(device_create_res); + } + +#ifdef USE_VULKAN_VOLK + volkLoadDevice(handle_); +#endif + + // + // Obtain handles for the created queues and initialize queue usage heuristic + // + + for (const std::pair& queue_idx : queues_to_get) { + VkQueue queue_handle = VK_NULL_HANDLE; + VkQueueFlags flags = queue_families_[queue_idx.first].queueFlags; + vkGetDeviceQueue( + handle_, queue_idx.first, queue_idx.second, &queue_handle); + queues_.push_back({queue_idx.first, queue_idx.second, flags, queue_handle}); + // Initial usage value + queue_usage_.push_back(0); + } +} + +Adapter::Queue Adapter::request_queue() { + // Lock the mutex as multiple threads can request a queue at the 
same time + std::lock_guard lock(mutex_); + + uint32_t min_usage = UINT32_MAX; + uint32_t min_used_i = 0; + for (const uint32_t i : c10::irange(queues_.size())) { + if (queue_usage_[i] < min_usage) { + min_used_i = i; + min_usage = queue_usage_[i]; + } + } + queue_usage_[min_used_i] += 1; + + return queues_[min_used_i]; +} + +void Adapter::return_queue(Adapter::Queue& compute_queue) { + for (const uint32_t i : c10::irange(queues_.size())) { + if ((queues_[i].family_index == compute_queue.family_index) && + (queues_[i].queue_index == compute_queue.queue_index)) { + std::lock_guard lock(mutex_); + queue_usage_[i] -= 1; + break; + } + } +} + +std::string Adapter::stringize() const { + std::stringstream ss; + + uint32_t v_major = VK_VERSION_MAJOR(properties_.apiVersion); + uint32_t v_minor = VK_VERSION_MINOR(properties_.apiVersion); + std::string device_type = get_device_type_str(properties_.deviceType); + VkPhysicalDeviceLimits limits = properties_.limits; + + ss << "{" << std::endl; + ss << " Physical Device Info {" << std::endl; + ss << " apiVersion: " << v_major << "." << v_minor << std::endl; + ss << " driverversion: " << properties_.driverVersion << std::endl; + ss << " deviceType: " << device_type << std::endl; + ss << " deviceName: " << properties_.deviceName << std::endl; + +#define PRINT_LIMIT_PROP(name) \ + ss << " " << std::left << std::setw(36) << #name << limits.name << std::endl; + +#define PRINT_LIMIT_PROP_VEC3(name) \ + ss << " " << std::left << std::setw(36) << #name \ + << limits.name[0] << "," \ + << limits.name[1] << "," \ + << limits.name[2] << std::endl; + + ss << " Physical Device Limits {" << std::endl; + PRINT_LIMIT_PROP(maxImageDimension1D); + PRINT_LIMIT_PROP(maxImageDimension2D); + PRINT_LIMIT_PROP(maxImageDimension3D); + PRINT_LIMIT_PROP(maxTexelBufferElements); + PRINT_LIMIT_PROP(maxPushConstantsSize); + PRINT_LIMIT_PROP(maxMemoryAllocationCount); + PRINT_LIMIT_PROP(maxSamplerAllocationCount); + PRINT_LIMIT_PROP(maxComputeSharedMemorySize); + PRINT_LIMIT_PROP_VEC3(maxComputeWorkGroupCount); + PRINT_LIMIT_PROP(maxComputeWorkGroupInvocations); + PRINT_LIMIT_PROP_VEC3(maxComputeWorkGroupSize); + ss << " }" << std::endl; + ss << " }" << std::endl;; + + const VkPhysicalDeviceMemoryProperties& mem_props = memory_properties_; + ss << " Memory Info {" << std::endl; + ss << " Memory Types [" << std::endl; + for (int i = 0; i < mem_props.memoryTypeCount; ++i) { + ss << " " << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] " + << get_memory_properties_str(mem_props.memoryTypes[i].propertyFlags) + << std::endl; + } + ss << " ]" << std::endl; + ss << " Memory Heaps [" << std::endl; + for (int i = 0; i < mem_props.memoryHeapCount; ++i) { + ss << " " << mem_props.memoryHeaps[i].size << std::endl; + } + ss << " ]" << std::endl; + ss << " }" << std::endl; + + ss << " Queue Families {" << std::endl; + for (const VkQueueFamilyProperties& queue_family_props : queue_families_) { + ss << " (" << queue_family_props.queueCount << " Queues) " + << get_queue_family_properties_str(queue_family_props.queueFlags) << std::endl; + } + ss << " }" << std::endl; + ss << " VkDevice: " << handle_ << std::endl; + ss << " Compute Queues [" << std::endl; + for (const Adapter::Queue& compute_queue : queues_) { + ss << " Family " << compute_queue.family_index + << ", Queue " << compute_queue.queue_index + << ": " << compute_queue.handle << std::endl;; + } + ss << " ]" << std::endl; + ss << "}"; + + return ss.str(); +} + +std::ostream& operator<<(std::ostream& os, const Adapter& adapter) { + os << 
adapter.stringize() << std::endl; + return os; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h index b4203530f635..a7aa29cc5baa 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.h +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include namespace at { namespace native { @@ -12,29 +14,103 @@ namespace vulkan { namespace api { // -// A Vulkan Adapter represents a physical device and its properties. Adapters -// are enumerated through the Runtime and are used in creation of Contexts. -// Each tensor in PyTorch is associated with a Context to make the -// device <-> tensor affinity explicit. +// A Vulkan Adapter represents a logical device and all its properties. It +// manages the relevant properties of the underlying physical device, a +// handle to the logical device, and a number of compute queues available to +// the device. It is primarily responsible for managing the VkDevice handle, +// which points to the logical device object on the GPU. // +// This class is used primarily by the Runtime class, which holds one Adapter +// instance for each physical device visible to the VkInstance. Upon construction, +// this class will populate the physical device properties, but will not create +// the logical device until specifically requested via the init_device() function. +// +// init_device() will create the logical device and obtain the VkDevice handle +// for it. It will also create a number of compute queues, up to the amount +// requested when the Adapter instance was constructed. +// +// Contexts (each of which represents one thread of execution) will request a compute +// queue from an Adapter. The Adapter will then select a compute queue to +// assign to the Context, attempting to balance load between all available +// queues. This allows different Contexts (which typically execute on +// separate threads) to run concurrently; a brief usage sketch follows below. 
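To make the lifecycle described in the comment above concrete, here is a minimal sketch of how a caller might drive an Adapter. It is illustrative only and not part of this patch; it assumes a valid VkPhysicalDevice named physical_device enumerated from the VkInstance, and it uses only the member functions declared in the class below.

#include <ATen/native/vulkan/api/Adapter.h>

void adapter_lifecycle_sketch(VkPhysicalDevice physical_device) {
  using namespace at::native::vulkan::api;

  // Construction only records physical device properties; no VkDevice yet.
  Adapter adapter(physical_device, /*num_queues=*/2);

  // Creates the logical device and the requested number of compute queues.
  adapter.init_device();

  // A Context would borrow the least-used queue for its own submissions...
  Adapter::Queue queue = adapter.request_queue();
  // ... record and submit work against queue.handle here ...

  // ...and hand it back so the per-queue usage counters stay balanced.
  adapter.return_queue(queue);
}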
+// + +class Adapter final { + public: + explicit Adapter(const VkPhysicalDevice handle, const uint32_t num_queues); + + Adapter(const Adapter&) = delete; + Adapter& operator=(const Adapter&) = delete; + + Adapter(Adapter&&) noexcept; + Adapter& operator=(Adapter&&) = delete; + + ~Adapter(); + + struct Queue { + uint32_t family_index; + uint32_t queue_index; + VkQueueFlags capabilities; + VkQueue handle; + }; + + private: + // Use a mutex to manage resources held by this class since + // it can be accessed from multiple threads + std::mutex mutex_; + // Physical Device Properties + VkPhysicalDevice physical_handle_; + VkPhysicalDeviceProperties properties_; + VkPhysicalDeviceMemoryProperties memory_properties_; + std::vector queue_families_; + // Queue Management + uint32_t num_requested_queues_; + std::vector queue_usage_; + // Handles + VkDevice handle_; + std::vector queues_; + // Metadata + uint32_t num_compute_queues_; + bool has_unified_memory_; + bool timestamp_compute_and_graphics_; + float timestamp_period_; + + public: + inline VkPhysicalDevice physical_handle() const { + return physical_handle_; + } -struct Adapter final { - Runtime* runtime; - VkPhysicalDevice handle; - VkPhysicalDeviceProperties properties; - VkPhysicalDeviceMemoryProperties memory_properties; - uint32_t compute_queue_family_index; + inline VkDevice device_handle() const { + return handle_; + } inline bool has_unified_memory() const { - // Ideally iterate over all memory types to see if there is a pool that - // is both host-visible, and device-local. This should be a good proxy - // for now. - return VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU == properties.deviceType; + return has_unified_memory_; + } + + inline uint32_t num_compute_queues() const { + return num_compute_queues_; + } + + inline bool timestamp_compute_and_graphics() const { + return timestamp_compute_and_graphics_; + } + + inline float timestamp_period() const { + return timestamp_period_; } + void init_device(); + Queue request_queue(); + void return_queue(Queue& compute_queue); + inline Shader::WorkGroup local_work_group_size() const { return { 4u, 4u, 4u, }; } + + std::string stringize() const; + friend std::ostream& operator<<(std::ostream& os, const Adapter& adapter); }; } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 692796a736ab..7f32681a6f5b 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -333,7 +333,7 @@ inline void Command::Buffer::Barrier::reset() { Command::Pool::Pool(const GPU& gpu) : device_(gpu.device), command_pool_( - create_command_pool(gpu.device, gpu.adapter->compute_queue_family_index), + create_command_pool(gpu.device, gpu.queue_family_index), VK_DELETER(CommandPool)(device_)), buffer_{} { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index 827ee85cb788..84bbeaa6f0e1 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -32,7 +32,11 @@ #define VK_CHECK(function) \ do { \ const VkResult result = (function); \ - TORCH_CHECK(VK_SUCCESS == result, "VkResult:", result); \ + TORCH_CHECK( \ + VK_SUCCESS == result, \ + C10_STRINGIZE(__FILE__), " [", \ + C10_STRINGIZE(__LINE__), "] " \ + "VkResult:", result); \ } while (false) #define VK_CHECK_RELAXED(function) \ @@ -61,7 +65,7 @@ namespace native { namespace vulkan { namespace api { -struct Adapter; +class 
Adapter; struct Command; class Context; struct Descriptor; @@ -71,8 +75,10 @@ class Runtime; struct Shader; struct GPU final { + VkInstance instance; const Adapter* adapter; VkDevice device; + uint32_t queue_family_index; VkQueue queue; }; diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index d65a89895714..260d10dbe686 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -1,6 +1,7 @@ #include -#include +#include #include +#include #include @@ -103,23 +104,19 @@ VkQueue acquire_queue( } // namespace -Context::Context(const Adapter& adapter) - : adapter_(adapter), - device_( - create_device( - adapter.handle, - adapter.compute_queue_family_index), - &VK_DELETER(Device)), - queue_(acquire_queue(device(), adapter.compute_queue_family_index)), +Context::Context(const VkInstance instance, size_t adapter_i) + : instance_(instance), + adapter_i_(adapter_i), + device_(runtime()->get_adapter(adapter_i).device_handle()), + queue_(runtime()->get_adapter(adapter_i).request_queue()), shader_(gpu()), pipeline_(gpu()), threadcontext_(gpu()) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, - "Invalid Vulkan device!"); } Context::~Context() { + // Let the device know the context is done with the queue + runtime()->get_adapter(adapter_i_).return_queue(queue_); // Do not call flush() since all per-thread objects will be destroyed as each thread exits } @@ -155,12 +152,7 @@ bool available() { Context* context() { static const std::unique_ptr context([]() -> Context* { try { - const Adapter adapter = runtime()->select([](const Adapter& adapter) { - // Select the first adapter. - return true; - }); - - return new Context(adapter); + return new Context(runtime()->instance(), runtime()->default_adapter_i()); } catch (const std::exception& e) { TORCH_CHECK(false, "Vulkan: Failed to initialize context! Error: ", e.what()); @@ -196,7 +188,6 @@ Descriptor::Set dispatch_prologue( const Shader::Descriptor& shader_descriptor, const Shader::WorkGroup& local_work_group_size) { Context* const context = api::context(); - const GPU gpu = context->gpu(); Descriptor& descriptor = context->descriptor(); Pipeline& pipeline = context->pipeline(); Shader& shader = context->shader(); diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index e38c4e59227a..7b1bb85f9230 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -27,16 +27,20 @@ namespace api { class Context final { public: - explicit Context(const Adapter& adapter); + explicit Context(const VkInstance instance, size_t adapter_i); + Context(const Context&) = delete; - Context(Context&&) = default; Context& operator=(const Context&) = delete; + + Context(Context&&) = default; Context& operator=(Context&&) = default; + ~Context(); GPU gpu(); Command& command(); Shader& shader(); + QueryPool& querypool(); Pipeline& pipeline(); Descriptor& descriptor(); Resource& resource(); @@ -67,15 +71,19 @@ class Context final { private: // Construction and destruction order matters. Do not move members around. - Adapter adapter_; - Handle device_; - VkQueue queue_; + VkInstance instance_; + size_t adapter_i_; + VkDevice device_; + Adapter::Queue queue_; Shader shader_; Pipeline pipeline_; ThreadContext threadcontext_; }; bool available(); + +// The global runtime is retrieved using this function, where it is declared as +// a static local variable. 
Context* context(); // @@ -84,10 +92,13 @@ Context* context(); inline GPU Context::gpu() { // A GPU is simply a (physical device, logical device, device queue) trio. + const Adapter* p_adapter = runtime()->get_adapter_p(adapter_i_); return { - &adapter_, - device(), - queue(), + instance_, + p_adapter, + device_, + queue_.family_index, + queue_.handle, }; } @@ -111,14 +122,16 @@ inline Resource& Context::resource() { return threadcontext_.resource(); } +inline QueryPool& Context::querypool() { + return threadcontext_.querypool(); +} + inline VkDevice Context::device() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_); - return device_.get(); + return device_; } inline VkQueue Context::queue() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(queue_); - return queue_; + return queue_.handle; } namespace detail { diff --git a/aten/src/ATen/native/vulkan/api/OpProfiler.h b/aten/src/ATen/native/vulkan/api/OpProfiler.h new file mode 100644 index 000000000000..b38b5dc95729 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/OpProfiler.h @@ -0,0 +1,38 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +class OpProfiler final { + public: + explicit OpProfiler(Command::Buffer& buffer, QueryPool& querypool, const std::string& query_name) + : buffer_(buffer), + querypool_(querypool) { + query_index_ = querypool.begin(buffer_.handle(), query_name); + } + OpProfiler(const OpProfiler&) = delete; + OpProfiler(OpProfiler&&) = delete; + OpProfiler& operator=(const OpProfiler&) = delete; + OpProfiler& operator=(OpProfiler&&) = delete; + ~OpProfiler() { + querypool_.end(buffer_.handle(), query_index_); + } + +private: + Command::Buffer& buffer_; + QueryPool& querypool_; + int query_index_; +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.cpp b/aten/src/ATen/native/vulkan/api/QueryPool.cpp new file mode 100644 index 000000000000..9e12e3be3e3f --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp @@ -0,0 +1,120 @@ +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { +namespace { + +VkQueryPool create_query_pool(const VkDevice& device, const uint32_t queryCount) { + VkQueryPool queryPool{}; + VkQueryPoolCreateInfo info{}; + info.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; + info.queryType = VK_QUERY_TYPE_TIMESTAMP; + info.queryCount = queryCount; + VK_CHECK(vkCreateQueryPool(device, &info, nullptr, &queryPool)); + return queryPool; +}; + +void destroy_query_pool(const VkDevice& device, const VkQueryPool& querypool) { + if (VK_NULL_HANDLE != device && VK_NULL_HANDLE != querypool) { + vkDestroyQueryPool(device, querypool, nullptr); + } +} + +} // namespace + +QueryPool::QueryPool(const VkDevice& device, const bool is_timestamps_supported, const float timestamp_period_us) + : device_(device), + is_timestamps_supported_(is_timestamps_supported), + timestamp_period_us_(timestamp_period_us), + querypool_(VK_NULL_HANDLE) { +} + +QueryPool::~QueryPool() { + destroy_query_pool(device_, querypool_); + querypool_ = VK_NULL_HANDLE; + query_names_.clear(); +} + +bool QueryPool::is_enabled() const { + return VK_NULL_HANDLE != querypool_; +} + +bool QueryPool::enable() { + TORCH_CHECK(VK_NULL_HANDLE == querypool_, "The query pool already exists."); + TORCH_CHECK(is_timestamps_supported_, "The device doesn't support for timestamps on all graphics and compute queues."); + 
querypool_ = create_query_pool(device_, Configuration::kMaxQueryCount); + return is_enabled(); +} + +std::vector QueryPool::disable(const bool waitfor_allqueries/* = true*/) { + auto out = result(waitfor_allqueries); + destroy_query_pool(device_, querypool_); + querypool_ = VK_NULL_HANDLE; + query_names_.clear(); + return out; +} + +int QueryPool::begin(const VkCommandBuffer& commandBuffer, const std::string& query_name) { + if (VK_NULL_HANDLE == querypool_ || VK_NULL_HANDLE == commandBuffer) { + return -1; + } + auto newQueryIndex = static_cast(query_names_.size()); + TORCH_CHECK(newQueryIndex < Configuration::kMaxQueryCount, "The query index cannot exceed Configuration::kMaxQueryCount."); + query_names_.push_back(query_name); + + vkCmdWriteTimestamp( + commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, querypool_, newQueryIndex * Configuration::kTimestampsPerQuery); + return static_cast(newQueryIndex); +} + +void QueryPool::end(const VkCommandBuffer& commandBuffer, const int queryIndex) { + if (VK_NULL_HANDLE == querypool_ || VK_NULL_HANDLE == commandBuffer) { + return; + } + vkCmdWriteTimestamp( + commandBuffer, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, querypool_, static_cast(queryIndex) * Configuration::kTimestampsPerQuery + 1u); +} + +std::vector QueryPool::result(const bool waitfor_allqueries) const { + if (VK_NULL_HANDLE == querypool_) { + return std::vector {}; + } + + std::vector perfInfo; + const VkQueryResultFlags flags = waitfor_allqueries ? (VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT) : VK_QUERY_RESULT_64_BIT; + std::array counter_data{}; + for (uint32_t queryIndex = 0u; queryIndex < query_names_.size(); ++queryIndex) { + const auto& query_name = query_names_[queryIndex]; + + // Grab the gpu timings (nanoseconds) + auto ret = vkGetQueryPoolResults(device_, querypool_, queryIndex * Configuration::kTimestampsPerQuery, Configuration::kTimestampsPerQuery, + sizeof(uint64_t) * counter_data.size(), counter_data.data(), sizeof(uint64_t), + flags); + if (ret != VK_SUCCESS) { + std::stringstream msg; + msg << "vkGetQueryPoolResults() for \"" << query_name << "\"" << " returned an error code " << ret << "."; + TORCH_WARN(msg.str()); + continue; + } + + // Tally up GPU time + int64_t gpu_time_us = static_cast( + (static_cast(counter_data[1] - counter_data[0]) * + timestamp_period_us_) / 1'000.f); // convert ns to us + + perfInfo.emplace_back(QueryPool::PerfInfo { + query_name, + static_cast(static_cast(counter_data[0]) * timestamp_period_us_ / 1'000.f), + static_cast(static_cast(counter_data[1]) * timestamp_period_us_ / 1'000.f), + gpu_time_us }); + } + return perfInfo; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.h b/aten/src/ATen/native/vulkan/api/QueryPool.h new file mode 100644 index 000000000000..edabba7fa705 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/QueryPool.h @@ -0,0 +1,57 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +class QueryPool final { + public: + explicit QueryPool(const VkDevice& device, const bool is_timestamps_supported, const float timestamp_period_us); + QueryPool(const QueryPool&) = delete; + QueryPool(QueryPool&&) = default; + QueryPool& operator=(const QueryPool&) = delete; + QueryPool& operator=(QueryPool&&) = default; + ~QueryPool(); + +public: + struct PerfInfo final { + std::string query_name; + int64_t start_time_us; + int64_t end_time_us; + 
int64_t execution_time_us; + }; + + struct Configuration final { + static constexpr uint32_t kTimestampsPerQuery = 2u; + static constexpr uint32_t kMaxQueryCount = 65536u; + }; + +public: + bool is_enabled() const; + bool enable(); + std::vector disable(const bool waitfor_allqueries = true); + int begin(const VkCommandBuffer& commandBuffer, const std::string& query_name); + void end(const VkCommandBuffer& commandBuffer, const int queryIndex); + std::vector result(const bool waitfor_allqueries) const; + +private: + VkDevice device_; + bool is_timestamps_supported_; + float timestamp_period_us_; + VkQueryPool querypool_; + std::vector query_names_; +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index a6cd3c62da9b..520ccc87d533 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -364,8 +364,8 @@ Resource::Pool::Pool( : device_(gpu.device), allocator_( create_allocator( - gpu.adapter->runtime->instance(), - gpu.adapter->handle, + gpu.instance, + gpu.adapter->physical_handle(), device_), vmaDestroyAllocator), memory_{ diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp index b90c81eb6435..c925a0226f6a 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.cpp +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -1,22 +1,132 @@ #include #include -#include - -#include namespace at { namespace native { namespace vulkan { namespace api { + namespace { -struct Configuration final { -#ifdef DEBUG - static constexpr Runtime::Type kRuntime = Runtime::Type::Debug; -#else - static constexpr Runtime::Type kRuntime = Runtime::Type::Release; + +void find_requested_layers_and_extensions( + std::vector& enabled_layers, + std::vector& enabled_extensions, + const std::vector& requested_layers, + const std::vector& requested_extensions) { + + // Get supported instance layers + uint32_t layer_count = 0; + VK_CHECK(vkEnumerateInstanceLayerProperties(&layer_count, nullptr)); + + std::vector layer_properties(layer_count); + VK_CHECK(vkEnumerateInstanceLayerProperties( + &layer_count, layer_properties.data())); + + // Search for requested layers + for (const auto& requested_layer : requested_layers) { + for (const auto& layer : layer_properties) { + if (strcmp(requested_layer, layer.layerName) == 0) { + enabled_layers.push_back(requested_layer); + break; + } + } + } + + // Get supported instance extensions + uint32_t extension_count = 0; + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &extension_count, nullptr)); + + std::vector extension_properties(extension_count); + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &extension_count, extension_properties.data())); + + // Search for requested extensions + for (const auto& requested_extension : requested_extensions) { + for (const auto& extension : extension_properties) { + if (strcmp(requested_extension, extension.extensionName) == 0) { + enabled_extensions.push_back(requested_extension); + break; + } + } + } +} + +VkInstance create_instance(const RuntimeConfiguration& config) { + const VkApplicationInfo application_info{ + VK_STRUCTURE_TYPE_APPLICATION_INFO, // sType + nullptr, // pNext + "PyTorch Vulkan Backend", // pApplicationName + 0, // applicationVersion + nullptr, // pEngineName + 0, // engineVersion + VK_API_VERSION_1_0, // apiVersion + }; + + 
std::vector enabled_layers; + std::vector enabled_extensions; + + if (config.enableValidationMessages) { + std::vector requested_layers { + // "VK_LAYER_LUNARG_api_dump", + "VK_LAYER_KHRONOS_validation", + }; + std::vector requested_extensions { + #ifdef VK_EXT_debug_report + VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + #endif + }; + + find_requested_layers_and_extensions( + enabled_layers, + enabled_extensions, + requested_layers, + requested_extensions); + } + + const VkInstanceCreateInfo instance_create_info{ + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, // sType + nullptr, // pNext + 0u, // flags + &application_info, // pApplicationInfo + static_cast(enabled_layers.size()), // enabledLayerCount + enabled_layers.data(), // ppEnabledLayerNames + static_cast(enabled_extensions.size()), // enabledExtensionCount + enabled_extensions.data(), // ppEnabledExtensionNames + }; + + VkInstance instance{}; + VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); + +#ifdef USE_VULKAN_VOLK + volkLoadInstance(instance); #endif -}; + + return instance; +} + +std::vector create_adapters(const VkInstance instance, + const uint32_t num_queues) { + if (VK_NULL_HANDLE == instance) { + return std::vector(); + } + + uint32_t device_count = 0; + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); + + std::vector devices(device_count); + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + + std::vector adapters; + adapters.reserve(device_count); + for (const VkPhysicalDevice physical_device : devices) { + adapters.emplace_back(physical_device, num_queues); + } + + return adapters; +} VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( const VkDebugReportFlagsEXT flags, @@ -46,113 +156,22 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( return VK_FALSE; } -VkInstance create_instance(const Runtime::Type type) { - std::vector enabled_instance_layers; - std::vector enabled_instance_extensions; - - if (Runtime::Type::Debug == type) { - uint32_t instance_layers_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, nullptr)); - - std::vector instance_layer_properties( - instance_layers_count); - - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, - instance_layer_properties.data())); - - constexpr const char* const requested_instance_layers[]{ - // "VK_LAYER_LUNARG_api_dump", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& requested_instance_layer : requested_instance_layers) { - for (const auto& layer : instance_layer_properties) { - if (strcmp(requested_instance_layer, layer.layerName) == 0) { - enabled_instance_layers.push_back(requested_instance_layer); - break; - } - } - } - - uint32_t instance_extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, nullptr)); - - std::vector instance_extension_properties( - instance_extension_count); - - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, instance_extension_properties.data())); - - constexpr const char* const requested_instance_extensions[]{ - #ifdef VK_EXT_debug_report - VK_EXT_DEBUG_REPORT_EXTENSION_NAME, - #endif - }; - - for (const auto& requested_instance_extension : requested_instance_extensions) { - for (const auto& extension : instance_extension_properties) { - if (strcmp(requested_instance_extension, extension.extensionName) == 0) { - 
enabled_instance_extensions.push_back(requested_instance_extension); - break; - } - } - } - } - - constexpr VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, - nullptr, - "PyTorch", - 0, - "PyTorch", - 0, - VK_API_VERSION_1_0, - }; - -const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, - nullptr, - 0u, - &application_info, - static_cast(enabled_instance_layers.size()), - enabled_instance_layers.data(), - static_cast(enabled_instance_extensions.size()), - enabled_instance_extensions.data(), - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - TORCH_CHECK(instance, "Invalid Vulkan instance!"); - -#ifdef USE_VULKAN_WRAPPER -#ifdef USE_VULKAN_VOLK - volkLoadInstance(instance); -#endif -#endif - - return instance; -} - VkDebugReportCallbackEXT create_debug_report_callback( - const VkInstance instance, - const Runtime::Type type) { - if (Runtime::Type::Debug != type) { + const VkInstance instance, const RuntimeConfiguration config) { + if (VK_NULL_HANDLE == instance || !config.enableValidationMessages) { return VkDebugReportCallbackEXT{}; } const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, - nullptr, + VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, // sType + nullptr, // pNext VK_DEBUG_REPORT_INFORMATION_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT | VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, - debug_report_callback_fn, - nullptr, + VK_DEBUG_REPORT_DEBUG_BIT_EXT, // flags + debug_report_callback_fn, // pfnCallback + nullptr, // pUserData }; const auto vkCreateDebugReportCallbackEXT = @@ -177,179 +196,177 @@ VkDebugReportCallbackEXT create_debug_report_callback( return debug_report_callback; } -std::vector acquire_physical_devices( - const VkInstance instance) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - instance, - "Invalid Vulkan instance!"); +// +// Adapter selection methods +// - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - - TORCH_CHECK( - device_count > 0, - "Vulkan: Could not find a device with Vulkan support!"); +uint32_t select_first(const std::vector& adapters) { + if (adapters.size() == 0) { + TORCH_WARN("Pytorch Vulkan Runtime: no device adapters are available for selection!"); + return adapters.size() + 1; // return out of range to signal invalidity + } - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + // Select the first adapter that has compute capability + for (const uint32_t i : c10::irange(adapters.size())) { + if (adapters[i].num_compute_queues() > 0) { + return i; + } + } - return devices; + TORCH_WARN("Pytorch Vulkan Runtime: no device adapters support compute!"); + return adapters.size() + 1; } -VkPhysicalDeviceProperties query_physical_device_properties( - const VkPhysicalDevice physical_device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - physical_device, - "Invalid Vulkan physical device!"); - - VkPhysicalDeviceProperties physical_device_properties{}; - vkGetPhysicalDeviceProperties( - physical_device, - &physical_device_properties); - - return physical_device_properties; -} +// +// Global runtime initialization +// + +std::unique_ptr init_global_vulkan_runtime() { + // Load Vulkan drivers +#if defined(USE_VULKAN_VOLK) + if (VK_SUCCESS != volkInitialize()) { + TORCH_WARN( + "Pytorch 
Vulkan Runtime: Failed to load Vulkan driver using volkInitialize()! " + "The global vulkan runtime is invalid."); + return std::unique_ptr(nullptr); + } +#elif defined(USE_VULKAN_WRAPPER) + if (!InitVulkan()) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Failed to load Vulkan driver using initVulkan()! " + "The global vulkan runtime is invalid."); + return std::unique_ptr(nullptr); + } +#endif /* USE_VULKAN_VOLK, USE_VULKAN_WRAPPER */ -VkPhysicalDeviceMemoryProperties query_physical_device_memory_properties( - const VkPhysicalDevice physical_device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - physical_device, - "Invalid Vulkan physical device!"); + const bool enableValidationMessages = +#if defined(DEBUG) + true; +#else + false; +#endif /* DEBUG */ + const bool initDefaultDevice = true; + const uint32_t numRequestedQueues = 1; // TODO: raise this value + + const RuntimeConfiguration default_config { + enableValidationMessages, + initDefaultDevice, + AdapterSelector::First, + numRequestedQueues, + }; - VkPhysicalDeviceMemoryProperties physical_device_memory_properties{}; - vkGetPhysicalDeviceMemoryProperties( - physical_device, - &physical_device_memory_properties); + try { + return std::make_unique(Runtime(default_config)); + } + catch (const std::exception& e) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Failed to initialize the global vulkan runtime! " + "The global vulkan runtime is invalid. Error: ", + e.what()); + } + catch (...) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Failed to initialize the global vulkan runtime! " + "The global vulkan runtime is invalid. " + "Error: Unknown"); + } - return physical_device_memory_properties; + return std::unique_ptr(nullptr); } -uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - physical_device, - "Invalid Vulkan physical device!"); - - uint32_t queue_family_count = 0; - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, nullptr); - - TORCH_CHECK( - queue_family_count > 0, - "Vulkan: Invalid number of queue families!"); - - std::vector - queue_families_properties(queue_family_count); - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, - &queue_family_count, - queue_families_properties.data()); +} // namespace - for (const auto i : c10::irange(queue_families_properties.size())) { - const VkQueueFamilyProperties& properties = queue_families_properties[i]; - if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; +Runtime::Runtime(const RuntimeConfiguration config) + : instance_(create_instance(config)), + adapters_(create_adapters(instance_, config.numRequestedQueues)), + default_adapter_i_{}, + debug_report_callback_(create_debug_report_callback(instance_, config)) { + if (config.initDefaultDevice) { + try { + switch(config.defaultSelector) { + case AdapterSelector::First: + default_adapter_i_ = init_adapter(select_first); + } + } + catch (const std::exception& e) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Could not initialize default device! Error: ", + e.what()); + } + catch (...) { + TORCH_WARN( + "Pytorch Vulkan Runtime: Could not initialize default device! 
Error: " + "Unknown."); } } - - TORCH_CHECK( - false, - "Vulkan: Could not find a queue family that supports compute operations!"); } -} // namespace +Runtime::~Runtime() { + if C10_LIKELY(VK_NULL_HANDLE == instance_) { + return; + } -Runtime::Debug::Debug(const VkInstance instance) - : instance_(instance) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - instance, - "Invalid Vulkan instance!"); -} + // Clear adapters list to trigger device destruction before destroying VkInstance + adapters_.clear(); -void Runtime::Debug::operator()( - const VkDebugReportCallbackEXT debug_report_callback) const { - if (debug_report_callback) { + // Instance must be destroyed last as its used to destroy the debug report callback. + if (debug_report_callback_) { const auto vkDestroyDebugReportCallbackEXT = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( instance_, "vkDestroyDebugReportCallbackEXT"); TORCH_CHECK( vkDestroyDebugReportCallbackEXT, - "Could not load vkDestroyDebugReportCallbackEXT"); + "Pytorch Vulkan Runtime: Could not load vkDestroyDebugReportCallbackEXT " + "when destroying debug_report_callback_"); vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback, nullptr); + instance_, debug_report_callback_, nullptr); + + debug_report_callback_ = {}; } -} -Runtime::Runtime(const Type type) - : instance_(create_instance(type), &VK_DELETER(Instance)), - debug_report_callback_( - create_debug_report_callback(instance(), type), - Debug(instance())) { + vkDestroyInstance(instance_, nullptr); + instance_ = VK_NULL_HANDLE; } -Adapter Runtime::select(const Selector& selector) { - const std::vector physical_devices = - acquire_physical_devices(instance()); - - for (const VkPhysicalDevice physical_device : physical_devices) { - const Adapter adapter{ - this, - physical_device, - query_physical_device_properties(physical_device), - query_physical_device_memory_properties(physical_device), - query_compute_queue_family_index(physical_device), - }; - - if (selector(adapter)) { - return adapter; - } - } - - TORCH_CHECK( - false, - "Vulkan: no adapter was selected as part of device enumeration!"); +Runtime::Runtime(Runtime&& other) noexcept + : instance_(other.instance_), + adapters_(std::move(other.adapters_)), + default_adapter_i_(other.default_adapter_i_), + debug_report_callback_(other.debug_report_callback_) { + other.instance_ = VK_NULL_HANDLE; + other.debug_report_callback_ = {}; } -Runtime* runtime() { - static const std::unique_ptr runtime([]() -> Runtime* { -#ifdef USE_VULKAN_WRAPPER -#ifdef USE_VULKAN_VOLK - if (VK_SUCCESS != volkInitialize()) { - TORCH_WARN("Vulkan: Failed to initialize Volk!"); - return nullptr; - } -#else - if (!InitVulkan()) { - TORCH_WARN("Vulkan: Failed to initialize Vulkan Wrapper!"); - return nullptr; - } -#endif /* USE_VULKAN_VOLK */ -#endif /* USE_VULKAN_WRAPPER */ +uint32_t Runtime::init_adapter(const Selector& selector) { + TORCH_CHECK( + adapters_.size() > 0, + "Pytorch Vulkan Runtime: Could not initialize adapter because no " + "devices were found by the Vulkan instance."); - try { - return new Runtime(Configuration::kRuntime); - } - catch (const std::exception& e) { - TORCH_WARN( - "Vulkan: Failed to initialize runtime! Error: ", - e.what()); - } - catch (...) { - TORCH_WARN( - "Vulkan: Failed to initialize runtime! " - "Error: Unknown"); - } + uint32_t i = selector(adapters_); + TORCH_CHECK( + i < adapters_.size(), + "Pytorch Vulkan Runtime: no suitable device adapter was selected! 
" + "Device could not be initialized"); - return nullptr; - }()); + adapters_[i].init_device(); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - runtime, - "Invalid Vulkan runtime!"); + return i; +} - return runtime.get(); +Runtime* runtime() { + // The global vulkan runtime is declared as a static local variable within a + // non-static function to ensure it has external linkage. If it were a global + // static variable there would be one copy per translation unit that includes + // Runtime.h as it would have internal linkage. + static const std::unique_ptr p_runtime = init_global_vulkan_runtime(); + TORCH_CHECK( + p_runtime, + "Pytorch Vulkan Runtime: The global runtime could not be retrieved " + "because it failed to initialize."); + return p_runtime.get(); } } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Runtime.h b/aten/src/ATen/native/vulkan/api/Runtime.h index 55eae70f8723..140c0869d627 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.h +++ b/aten/src/ATen/native/vulkan/api/Runtime.h @@ -19,52 +19,76 @@ namespace api { // are associated with a Context to make tensor <-> device affinity explicit. // +enum AdapterSelector { + First, +}; + +struct RuntimeConfiguration final { + bool enableValidationMessages; + bool initDefaultDevice; + AdapterSelector defaultSelector; + uint32_t numRequestedQueues; +}; + class Runtime final { public: - enum class Type { - Debug, - Release, - }; + explicit Runtime(const RuntimeConfiguration config); - explicit Runtime(Type type); + // Do not allow copying. There should be only one global instance of this class. Runtime(const Runtime&) = delete; Runtime& operator=(const Runtime&) = delete; - Runtime(Runtime&&) = default; - Runtime& operator=(Runtime&&) = default; - ~Runtime() = default; - VkInstance instance() const; + Runtime(Runtime&&) noexcept; + Runtime& operator=(Runtime&&) = delete; - typedef std::function Selector; - Adapter select(const Selector& selector); + ~Runtime(); private: - class Debug final { - public: - explicit Debug(VkInstance); - void operator()(VkDebugReportCallbackEXT) const; + VkInstance instance_; + std::vector adapters_; + uint32_t default_adapter_i_; - private: - VkInstance instance_; - }; + VkDebugReportCallbackEXT debug_report_callback_; - private: - // Construction and destruction order matters. Do not move members around. - Handle instance_; - Handle debug_report_callback_; + public: + inline VkInstance instance() const { + return instance_; + } + + inline Adapter* get_adapter_p() { + TORCH_CHECK( + default_adapter_i_ >= 0 && default_adapter_i_ < adapters_.size(), + "Pytorch Vulkan Runtime: Default device adapter is not set correctly!"); + return &adapters_[default_adapter_i_]; + } + + inline Adapter& get_adapter() { + TORCH_CHECK( + default_adapter_i_ >= 0 && default_adapter_i_ < adapters_.size(), + "Pytorch Vulkan Runtime: Default device adapter is not set correctly!"); + return adapters_[default_adapter_i_]; + } + + inline Adapter* get_adapter_p(uint32_t i) { + return &adapters_[i]; + } + + inline Adapter& get_adapter(uint32_t i) { + return adapters_[i]; + } + + inline uint32_t default_adapter_i() const { + return default_adapter_i_; + } + + using Selector = std::function&)>; + uint32_t init_adapter(const Selector& selector); }; +// The global runtime is retrieved using this function, where it is declared as +// a static local variable. 
Runtime* runtime(); -// -// Impl -// - -inline VkInstance Runtime::instance() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_); - return instance_.get(); -} - } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/api/ThreadContext.cpp b/aten/src/ATen/native/vulkan/api/ThreadContext.cpp index 039218fe2d2a..d230d97ecda7 100644 --- a/aten/src/ATen/native/vulkan/api/ThreadContext.cpp +++ b/aten/src/ATen/native/vulkan/api/ThreadContext.cpp @@ -35,6 +35,13 @@ ThreadContext::SingletonThreadLocalObject::SingletonThreadLocalObject( : object_(gpu) { } +template<> +ThreadContext::SingletonThreadLocalObject::SingletonThreadLocalObject(const GPU& gpu) + : object_(gpu.device, + gpu.adapter->timestamp_compute_and_graphics(), + gpu.adapter->timestamp_period()) { +} + } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/api/ThreadContext.h b/aten/src/ATen/native/vulkan/api/ThreadContext.h index 6f0360359e5e..0145e345f8d7 100644 --- a/aten/src/ATen/native/vulkan/api/ThreadContext.h +++ b/aten/src/ATen/native/vulkan/api/ThreadContext.h @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace at { @@ -29,6 +30,7 @@ class ThreadContext final { Command& command(); Descriptor& descriptor(); Resource& resource(); + QueryPool& querypool(); private: GPU gpu_; @@ -67,6 +69,10 @@ inline Resource& ThreadContext::resource() { return SingletonThreadLocalObject::get(gpu_); } +inline QueryPool& ThreadContext::querypool() { + return SingletonThreadLocalObject::get(gpu_); +} + } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl deleted file mode 100644 index 2c02e034603e..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl +++ /dev/null @@ -1,31 +0,0 @@ -#version 450 core -#define PRECISION $precision -layout(std430) buffer; -layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) readonly buffer kernel { - vec4 data[]; -} -uKernel; -layout(set = 0, binding = 2) uniform constBlock { - int KWxKH; - int C_4; -} -uConstBlock; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID) * ivec3(4, 1, 1); - int KWxKH = uConstBlock.KWxKH; - int C_4 = uConstBlock.C_4; - int bufferIdx = pos.x * KWxKH + 4 * pos.y * C_4 * KWxKH + 4 * pos.z; - vec4 v0 = uKernel.data[bufferIdx + 0]; - vec4 v1 = uKernel.data[bufferIdx + 1]; - vec4 v2 = uKernel.data[bufferIdx + 2]; - vec4 v3 = uKernel.data[bufferIdx + 3]; - - imageStore(uOutput, ivec3(pos.x + 0, pos.y, pos.z), v0); - imageStore(uOutput, ivec3(pos.x + 1, pos.y, pos.z), v1); - imageStore(uOutput, ivec3(pos.x + 2, pos.y, pos.z), v2); - imageStore(uOutput, ivec3(pos.x + 3, pos.y, pos.z), v3); -} diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl deleted file mode 100644 index 06af09e0b655..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl +++ /dev/null @@ -1,59 +0,0 @@ -#version 450 core -#define PRECISION $precision -#define FORMAT $format - -layout(std430) buffer; -layout(set = 0, binding = 0, FORMAT) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; -layout(set = 0, 
binding = 3) readonly buffer bias { - vec4 data[]; -} -uBias; -layout(set = 0, binding = 4) uniform constBlock { - ivec2 padding; - ivec2 kernelSize; - ivec2 stride; - ivec2 dilate; - ivec4 outputSize; - ivec4 inputSize; - float outputMin; - float outputMax; -} -uConstBlock; - -#define UP_DIV(x, y) (((x) + (y)-1) / (y)) - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - ivec4 outputSize = uConstBlock.outputSize; - if (all(lessThan(ivec3(gl_GlobalInvocationID), outputSize.xyz))) { - int KW = uConstBlock.kernelSize.x; - int KH = uConstBlock.kernelSize.y; - ivec4 inputSize = uConstBlock.inputSize; - ivec2 dilate = uConstBlock.dilate; - ivec2 padding = uConstBlock.padding; - ivec2 stride = uConstBlock.stride; - - ivec2 s0 = pos.xy * stride - padding; - ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, dilate))); - ivec2 efxy = min(uConstBlock.kernelSize, UP_DIV(inputSize.xy - s0, dilate)); - - vec4 acc = uBias.data[pos.z]; - int sx, kxi, kyi; - for (kyi = sfxy.y; kyi < efxy.y; ++kyi) { - int sy = kyi * dilate.y + s0.y; - for (kxi = 0; kxi < KW; ++kxi) { - sx = kxi * dilate.x + s0.x; - vec4 iv = texelFetch(uInput, ivec3(sx, sy, pos.z), 0); - vec4 kv = texelFetch(uKernel, ivec3(kxi, kyi, pos.z), 0); - acc += kv * iv; - } - } - vec4 outputMin = vec4(uConstBlock.outputMin); - vec4 outputMax = vec4(uConstBlock.outputMax); - imageStore(uOutput, pos, clamp(acc, outputMin, outputMax)); - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl deleted file mode 100644 index 89411284fed4..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl +++ /dev/null @@ -1,82 +0,0 @@ -#version 450 core -#define PRECISION $precision -layout(std430) buffer; -layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; -layout(set = 0, binding = 3) readonly buffer bias { - vec4 data[]; -} -uBias; -layout(set = 0, binding = 4) uniform constBlock { - ivec2 padding; - ivec2 kernelSize; - ivec2 stride; - ivec2 dilate; - ivec4 outputSize; - ivec4 inputSize; - float outputMin; - float outputMax; -} -uConstBlock; - -#define UP_DIV(x, y) (((x) + (y)-1) / (y)) - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 gpos = ivec3(gl_GlobalInvocationID); - if (all(lessThan(gpos, uConstBlock.outputSize.xyz))) { - ivec3 pos = gpos * ivec3(4, 1, 1); - int kernelX = uConstBlock.kernelSize.x; - int kernelY = uConstBlock.kernelSize.y; - ivec3 inputSize = uConstBlock.inputSize.xyz; - ivec2 s0 = pos.xy * uConstBlock.stride - uConstBlock.padding; - int fx, fy, fz; - ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, uConstBlock.dilate))); - ivec2 efxy = - min(uConstBlock.kernelSize, - UP_DIV(uConstBlock.inputSize.xy - s0, uConstBlock.dilate)); - vec4 color = uBias.data[pos.z]; - vec4 color2 = color; - vec4 color3 = color; - vec4 color4 = color; - int kY = pos.z; - int strideX = uConstBlock.stride.x; - for (fy = sfxy.y; fy < efxy.y; ++fy) { - int sy = fy * uConstBlock.dilate.y + s0.y; - for (fx = 0; fx < kernelX; ++fx) { - int kZ = fx + fy * kernelX; - int sx1 = fx * uConstBlock.dilate.x + s0.x; - int sx2 = sx1 + strideX; - int sx3 = sx1 + strideX * 2; - int sx4 = sx1 + strideX * 3; - float m1 = sx1 >= 0 && sx1 < inputSize.x ? 
1.0 : 0.0; - float m2 = sx2 >= 0 && sx2 < inputSize.x ? 1.0 : 0.0; - float m3 = sx3 >= 0 && sx3 < inputSize.x ? 1.0 : 0.0; - float m4 = sx4 >= 0 && sx4 < inputSize.x ? 1.0 : 0.0; - fz = 0; - for (; fz < inputSize.z; ++fz) { - int kX = 4 * fz; - vec4 k0 = texelFetch(uKernel, ivec3(kX + 0, kY, kZ), 0); - vec4 k1 = texelFetch(uKernel, ivec3(kX + 1, kY, kZ), 0); - vec4 k2 = texelFetch(uKernel, ivec3(kX + 2, kY, kZ), 0); - vec4 k3 = texelFetch(uKernel, ivec3(kX + 3, kY, kZ), 0); - - mat4 k = mat4(k0, k1, k2, k3); - - color += k * texelFetch(uInput, ivec3(sx1, sy, fz), 0) * m1; - color2 += k * texelFetch(uInput, ivec3(sx2, sy, fz), 0) * m2; - color3 += k * texelFetch(uInput, ivec3(sx3, sy, fz), 0) * m3; - color4 += k * texelFetch(uInput, ivec3(sx4, sy, fz), 0) * m4; - } - } - } - vec4 outputMin = vec4(uConstBlock.outputMin); - vec4 outputMax = vec4(uConstBlock.outputMax); - imageStore(uOutput, ivec3(pos.x + 0, pos.y, pos.z), clamp(color, outputMin, outputMax)); - imageStore(uOutput, ivec3(pos.x + 1, pos.y, pos.z), clamp(color2, outputMin, outputMax)); - imageStore(uOutput, ivec3(pos.x + 2, pos.y, pos.z), clamp(color3, outputMin, outputMax)); - imageStore(uOutput, ivec3(pos.x + 3, pos.y, pos.z), clamp(color4, outputMin, outputMax)); - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl deleted file mode 100644 index 8baae9b5fcd5..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl +++ /dev/null @@ -1,65 +0,0 @@ -#version 450 core -#define PRECISION $precision -layout(std430) buffer; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; -layout(set = 0, binding = 3) readonly buffer bias { - vec4 data[]; -} -uBias; -layout(set = 0, binding = 4) uniform constBlock { - ivec2 padding; - ivec2 kernelSize; - ivec2 stride; - ivec2 dilate; - ivec4 outputSize; - ivec4 inputSize; - float outputMin; - float outputMax; -} -uConstBlock; - -#define UP_DIV(x, y) (((x) + (y)-1) / (y)) - -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (all(lessThan(pos, uConstBlock.outputSize.xyz))) { - int kernelX = uConstBlock.kernelSize.x; - int kernelY = uConstBlock.kernelSize.y; - ivec3 inputSize = uConstBlock.inputSize.xyz; - ivec2 s0 = pos.xy * uConstBlock.stride - uConstBlock.padding; - int fx, fy, fz; - ivec2 sfxy = max(ivec2(0), (UP_DIV(-s0, uConstBlock.dilate))); - ivec2 efxy = - min(uConstBlock.kernelSize, - UP_DIV(uConstBlock.inputSize.xy - s0, uConstBlock.dilate)); - vec4 color = uBias.data[pos.z]; - int kY = pos.z; - int strideX = uConstBlock.stride.x; - for (fy = sfxy.y; fy < efxy.y; ++fy) { - int sy = fy * uConstBlock.dilate.y + s0.y; - for (fx = 0; fx < kernelX; ++fx) { - int kZ = fx + fy * kernelX; - int sx = fx * uConstBlock.dilate.x + s0.x; - fz = 0; - for (; fz < inputSize.z; ++fz) { - int kX = 4 * fz; - vec4 k0 = texelFetch(uKernel, ivec3(kX + 0, kY, kZ), 0); - vec4 k1 = texelFetch(uKernel, ivec3(kX + 1, kY, kZ), 0); - vec4 k2 = texelFetch(uKernel, ivec3(kX + 2, kY, kZ), 0); - vec4 k3 = texelFetch(uKernel, ivec3(kX + 3, kY, kZ), 0); - - mat4 k = mat4(k0, k1, k2, k3); - - color += k * texelFetch(uInput, ivec3(sx, sy, fz), 0); - } - } - } - vec4 outputMin = vec4(uConstBlock.outputMin); - vec4 outputMax = vec4(uConstBlock.outputMax); - 
imageStore(uOutput, ivec3(pos.x, pos.y, pos.z), clamp(color, outputMin, outputMax)); - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp.glsl b/aten/src/ATen/native/vulkan/glsl/lerp.glsl new file mode 100644 index 000000000000..433877a8efe8 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp.glsl @@ -0,0 +1,36 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; +layout(set = 0, binding = 3) uniform PRECISION sampler3D uInput2; +layout(set = 0, binding = 4) uniform PRECISION restrict Block { + ivec4 size; + ivec4 isize0; + ivec4 isize1; + ivec4 isize2; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + const ivec3 input1_pos = pos % uBlock.isize1.xyz; + const ivec3 input2_pos = pos % uBlock.isize2.xyz; + imageStore( + uOutput, + pos, + texelFetch(uInput0, input0_pos, 0) + + texelFetch(uInput2, input2_pos, 0) + * (texelFetch(uInput1, input1_pos, 0) - texelFetch(uInput0, input0_pos, 0))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp_.glsl b/aten/src/ATen/native/vulkan/glsl/lerp_.glsl new file mode 100644 index 000000000000..b727f7bf51a7 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp_.glsl @@ -0,0 +1,33 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; +layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec4 size; + ivec4 isize0; + ivec4 isize1; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + const ivec3 input1_pos = pos % uBlock.isize1.xyz; + imageStore( + uOutput, + pos, + imageLoad(uOutput, pos) + + texelFetch(uInput1, input1_pos, 0) + * (texelFetch(uInput0, input0_pos, 0) - imageLoad(uOutput, pos))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/lerp_scalar.glsl new file mode 100644 index 000000000000..2978f0922f3d --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp_scalar.glsl @@ -0,0 +1,34 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; +layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec4 size; + ivec4 isize0; + ivec3 isize1; + float weight; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 
pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + const ivec3 input1_pos = pos % uBlock.isize1.xyz; + imageStore( + uOutput, + pos, + texelFetch(uInput0, input0_pos, 0) + + uBlock.weight + * (texelFetch(uInput1, input1_pos, 0) - texelFetch(uInput0, input0_pos, 0))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/lerp_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/lerp_scalar_.glsl new file mode 100644 index 000000000000..fa32b8b13667 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/lerp_scalar_.glsl @@ -0,0 +1,31 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec3 isize0; + float weight; +} uBlock; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + const ivec3 input0_pos = pos % uBlock.isize0.xyz; + imageStore( + uOutput, + pos, + imageLoad(uOutput, pos) + + uBlock.weight + * (texelFetch(uInput0, input0_pos, 0) - imageLoad(uOutput, pos))); + } +} diff --git a/aten/src/ATen/native/vulkan/glsl/permute.glsl b/aten/src/ATen/native/vulkan/glsl/permute.glsl deleted file mode 100644 index 3d1191ff6eea..000000000000 --- a/aten/src/ATen/native/vulkan/glsl/permute.glsl +++ /dev/null @@ -1,57 +0,0 @@ -#version 450 core -layout(std430) buffer; -layout(set = 0, binding = 0) writeonly buffer outputBuffer { - float data[]; -} -uOutput; -layout(set = 0, binding = 1) readonly buffer inputBuffer { - float data[]; -} -uInput; -layout(set = 0, binding = 2) uniform constBlock { - ivec4 inStrides[2]; - ivec4 outStrides[2]; - ivec4 outDims[2]; - int inOffset; -} -uConst; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - ivec4 outIdx[2]; - - int d1 = uConst.outDims[0][3]; - int d3 = uConst.outDims[1][1]; - int d5 = uConst.outDims[1][3]; - - int oi0 = pos.z / d1; - int oi1 = pos.z - d1 * oi0; - - int oi2 = pos.y / d3; - int oi3 = pos.y - d3 * oi2; - - int oi4 = pos.x / d5; - int oi5 = pos.x - d5 * oi4; - - ivec4 oIdx0 = ivec4(0, 0, oi0, oi1); - ivec4 oIdx1 = ivec4(oi2, oi3, oi4, oi5); - if (all(lessThan(oIdx0, uConst.outDims[0])) && - all(lessThan(oIdx1, uConst.outDims[1]))) { - ivec4 ins0 = uConst.inStrides[0]; - ivec4 ins1 = uConst.inStrides[1]; - int inIdxInt = oIdx0.x * ins0.x + oIdx0.y * ins0.y + oIdx0.z * ins0.z + - oIdx0.w * ins0.w; - inIdxInt += oIdx1.x * ins1.x + oIdx1.y * ins1.y + oIdx1.z * ins1.z + - oIdx1.w * ins1.w; - ivec4 outs0 = uConst.outStrides[0]; - ivec4 outs1 = uConst.outStrides[1]; - int outIdxInt = oIdx0.x * outs0.x + oIdx0.y * outs0.y + oIdx0.z * outs0.z + - oIdx0.w * outs0.w; - outIdxInt += oIdx1.x * outs1.x + oIdx1.y * outs1.y + oIdx1.z * outs1.z + - oIdx1.w * outs1.w; - - uOutput.data[outIdxInt] = uInput.data[uConst.inOffset + inIdxInt]; - } -} diff --git a/aten/src/ATen/native/vulkan/glsl/tanh.glsl b/aten/src/ATen/native/vulkan/glsl/tanh.glsl index 8d611630cf74..70315def6342 100644 --- a/aten/src/ATen/native/vulkan/glsl/tanh.glsl +++ b/aten/src/ATen/native/vulkan/glsl/tanh.glsl @@ -18,6 +18,10 @@ void main() { const ivec3 
pos = ivec3(gl_GlobalInvocationID); if (all(lessThan(pos, uBlock.size.xyz))) { - imageStore(uOutput, pos, tanh(texelFetch(uInput, pos, 0))); + const vec4 intex = texelFetch(uInput, pos, 0); + imageStore( + uOutput, + pos, + tanh(clamp(intex, -15.0, 15.0))); } } diff --git a/aten/src/ATen/native/vulkan/glsl/tanh_.glsl b/aten/src/ATen/native/vulkan/glsl/tanh_.glsl index 59649da65180..ef8fd35fc588 100644 --- a/aten/src/ATen/native/vulkan/glsl/tanh_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/tanh_.glsl @@ -17,6 +17,10 @@ void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); if (all(lessThan(pos, uBlock.size.xyz))) { - imageStore(uOutput, pos, tanh(imageLoad(uOutput, pos))); + const vec4 intex = imageLoad(uOutput, pos); + imageStore( + uOutput, + pos, + tanh(clamp(intex, -15.0, 15.0))); } } diff --git a/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp b/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp index 42e941f00a77..268487e10c1c 100644 --- a/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp +++ b/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -54,7 +55,8 @@ Tensor arithmetic_scalar( const Tensor& self_arg, const Scalar& other, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -69,6 +71,8 @@ Tensor arithmetic_scalar( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY (v_output.has_image() && v_self.has_image()) { const float other_val = alpha_arg ? other.to() * alpha_arg->to() @@ -114,7 +118,8 @@ Tensor& arithmetic_scalar_( Tensor& self, const Scalar& other, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -126,6 +131,8 @@ Tensor& arithmetic_scalar_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY (v_self.has_image()) { const float other_val = alpha_arg ? other.to() * alpha_arg->to() @@ -169,7 +176,8 @@ Tensor arithmetic_tensor( const Tensor& self_arg, const Tensor& other_arg, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { check_inputs(self_arg, other_arg); api::Context* const context = api::context(); @@ -188,6 +196,8 @@ Tensor arithmetic_tensor( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY (v_self.has_image() && v_other.has_image()) { const float alpha = alpha_arg ? 
alpha_arg->to() : 1.0; const struct Block final { @@ -243,7 +253,8 @@ Tensor& arithmetic_tensor_( Tensor& self, const Tensor& other_arg, const c10::optional& alpha_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { check_inputs(self, other_arg); api::Context* const context = api::context(); @@ -259,6 +270,8 @@ Tensor& arithmetic_tensor_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY ( v_self.has_image() && v_other.has_image() && !self.is_same(other)) { const float alpha = alpha_arg ? alpha_arg->to() : 1.0; @@ -310,25 +323,33 @@ Tensor add_scalar( const Scalar& other, const Scalar& alpha) { return arithmetic_scalar( - self_arg, other, c10::optional(alpha), VK_KERNEL(add_scalar)); + self_arg, other, c10::optional(alpha), VK_KERNEL(add_scalar), "aten::add.Scalar"); } Tensor& add_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { return arithmetic_scalar_( - self, other, c10::optional(alpha), VK_KERNEL(add_scalar_)); + self, other, c10::optional(alpha), VK_KERNEL(add_scalar_), "aten::add_.Scalar"); } Tensor add_tensor( const Tensor& self_arg, const Tensor& other_arg, const Scalar& alpha) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(alpha.to()), + VK_KERNEL(add_scalar), + "aten::add.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(add)); + self_arg, other_arg, c10::optional(alpha), VK_KERNEL(add), "aten::add.Tensor"); } Tensor& add_tensor_(Tensor& self, const Tensor& other_arg, const Scalar& alpha) { return arithmetic_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(add_)); + self, other_arg, c10::optional(alpha), VK_KERNEL(add_), "aten::add_.Tensor"); } Tensor sub_scalar( @@ -339,7 +360,8 @@ Tensor sub_scalar( self_arg, other, c10::optional(-1 * alpha.to()), - VK_KERNEL(add_scalar)); + VK_KERNEL(add_scalar), + "aten::sub.Scalar"); } Tensor& sub_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { @@ -347,40 +369,57 @@ Tensor& sub_scalar_(Tensor& self, const Scalar& other, const Scalar& alpha) { self, other, c10::optional(-1 * alpha.to()), - VK_KERNEL(add_scalar_)); + VK_KERNEL(add_scalar_), + "aten::sub_.Scalar"); } Tensor sub_tensor( const Tensor& self_arg, const Tensor& other_arg, const Scalar& alpha) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(-1 * alpha.to()), + VK_KERNEL(add_scalar), + "aten::sub.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(alpha), VK_KERNEL(sub)); + self_arg, other_arg, c10::optional(alpha), VK_KERNEL(sub), "aten::sub.Tensor"); } Tensor& sub_tensor_(Tensor& self, const Tensor& other_arg, const Scalar& alpha) { return arithmetic_tensor_( - self, other_arg, c10::optional(alpha), VK_KERNEL(sub_)); + self, other_arg, c10::optional(alpha), VK_KERNEL(sub_), "aten::sub_.Tensor"); } Tensor mul_scalar(const Tensor& self_arg, const Scalar& other) { return arithmetic_scalar( - self_arg, other, c10::optional(), VK_KERNEL(mul_scalar)); + self_arg, other, c10::optional(), VK_KERNEL(mul_scalar), "aten::mul.Scalar"); } Tensor& mul_scalar_(Tensor& self, const Scalar& other) { return arithmetic_scalar_( - self, other, c10::optional(), VK_KERNEL(mul_scalar_)); + self, other, 
c10::optional(), VK_KERNEL(mul_scalar_), "aten::mul_.Scalar"); } Tensor mul_tensor(const Tensor& self_arg, const Tensor& other_arg) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(), + VK_KERNEL(mul_scalar), + "aten::mul.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(mul)); + self_arg, other_arg, c10::optional(), VK_KERNEL(mul), "aten::mul.Tensor"); } Tensor& mul_tensor_(Tensor& self, const Tensor& other_arg) { return arithmetic_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(mul_)); + self, other_arg, c10::optional(), VK_KERNEL(mul_), "aten::mul_.Tensor"); } Tensor div_scalar(const Tensor& self_arg, const Scalar& other) { @@ -388,7 +427,8 @@ Tensor div_scalar(const Tensor& self_arg, const Scalar& other) { self_arg, 1.0 / other.to(), c10::optional(), - VK_KERNEL(mul_scalar)); + VK_KERNEL(mul_scalar), + "aten::div.Scalar"); } Tensor& div_scalar_(Tensor& self, const Scalar& other) { @@ -396,17 +436,26 @@ Tensor& div_scalar_(Tensor& self, const Scalar& other) { self, 1.0 / other.to(), c10::optional(), - VK_KERNEL(mul_scalar_)); + VK_KERNEL(mul_scalar_), + "aten::div_.Scalar"); } Tensor div_tensor(const Tensor& self_arg, const Tensor& other_arg) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + 1.0 / other_arg.item(), + c10::optional(), + VK_KERNEL(mul_scalar), + "aten::div.Tensor"); + } return arithmetic_tensor( - self_arg, other_arg, c10::optional(), VK_KERNEL(div)); + self_arg, other_arg, c10::optional(), VK_KERNEL(div), "aten::div.Tensor"); } Tensor& div_tensor_(Tensor& self, const Tensor& other_arg) { return arithmetic_tensor_( - self, other_arg, c10::optional(), VK_KERNEL(div_)); + self, other_arg, c10::optional(), VK_KERNEL(div_), "aten::div_.Tensor"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index a6e65607fb07..3f5cb3d2afb9 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -9,10 +10,11 @@ namespace { using namespace api::utils; -Tensor clamp( +Tensor _clamp( const Tensor& self_arg, const c10::optional& min, - const c10::optional& max) { + const c10::optional& max, + const std::string& op_name) { TORCH_CHECK( min || max, "At least one of 'min' or 'max' must not be None"); @@ -31,6 +33,8 @@ Tensor clamp( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; @@ -79,10 +83,18 @@ Tensor clamp( return convert(v_output); } -Tensor& clamp_( - Tensor& self, +Tensor clamp( + const Tensor& self_arg, const c10::optional& min, const c10::optional& max) { + return _clamp(self_arg, min, max, "aten::clamp"); +} + +Tensor& _clamp_( + Tensor& self, + const c10::optional& min, + const c10::optional& max, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -98,6 +110,8 @@ Tensor& clamp_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -140,9 +154,17 @@ Tensor& 
clamp_( return self; } +Tensor& clamp_( + Tensor& self, + const c10::optional& min, + const c10::optional& max) { + return _clamp_(self, min, max, "aten::clamp_"); +} + Tensor activation( const Tensor& self_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -157,6 +179,8 @@ Tensor activation( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; @@ -202,7 +226,8 @@ Tensor activation( Tensor& activation_( Tensor& self, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -214,6 +239,8 @@ Tensor& activation_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -255,44 +282,45 @@ Tensor hardtanh( const Tensor& self, const Scalar& min, const Scalar& max) { - return ops::clamp(self, min, max); + return ops::_clamp(self, min, max, "aten::hardtanh"); } Tensor& hardtanh_( Tensor& self, const Scalar& min, const Scalar& max) { - return ops::clamp_(self, min, max); + return ops::_clamp_(self, min, max, "aten::hardtanh_"); } Tensor relu(const Tensor& self) { - return ops::clamp(self, 0, c10::nullopt); + return ops::_clamp(self, 0, c10::nullopt, "aten::relu"); } Tensor& relu_(Tensor& self) { - return ops::clamp_(self, 0, c10::nullopt); + return ops::_clamp_(self, 0, c10::nullopt, "aten::relu_"); } Tensor hardswish(const Tensor& self) { - return ops::activation(self, VK_KERNEL(hardswish)); + return ops::activation(self, VK_KERNEL(hardswish), "aten::hardswish"); } Tensor& hardswish_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(hardswish_)); + return ops::activation_(self, VK_KERNEL(hardswish_), "aten::hardswish_"); } Tensor hardsigmoid(const Tensor& self) { - return ops::activation(self, VK_KERNEL(hardsigmoid)); + return ops::activation(self, VK_KERNEL(hardsigmoid), "aten::hardsigmoid"); } Tensor& hardsigmoid_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(hardsigmoid_)); + return ops::activation_(self, VK_KERNEL(hardsigmoid_), "aten::hardsigmoid_"); } Tensor activation_scalar( const Tensor& self_arg, const Scalar& scalar_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? 
self_arg : self_arg.vulkan(); @@ -307,6 +335,8 @@ Tensor activation_scalar( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; @@ -355,7 +385,8 @@ Tensor activation_scalar( Tensor& activation_scalar_( Tensor& self, const Scalar& scalar_arg, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { api::Context* const context = api::context(); TORCH_CHECK( @@ -367,6 +398,8 @@ Tensor& activation_scalar_( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -409,41 +442,41 @@ Tensor& activation_scalar_( Tensor hardshrink( const Tensor& self_arg, const Scalar& lambd) { - return ops::activation_scalar(self_arg, lambd, VK_KERNEL(hardshrink)); + return ops::activation_scalar(self_arg, lambd, VK_KERNEL(hardshrink), "aten::hardshrink"); } Tensor& hardshrink_( Tensor& self, const Scalar& lambd) { - return ops::activation_scalar_(self, lambd, VK_KERNEL(hardshrink_)); + return ops::activation_scalar_(self, lambd, VK_KERNEL(hardshrink_), "aten::hardshrink_"); } Tensor leaky_relu( const Tensor& self_arg, const Scalar& negative_slope) { - return ops::activation_scalar(self_arg, negative_slope, VK_KERNEL(leaky_relu)); + return ops::activation_scalar(self_arg, negative_slope, VK_KERNEL(leaky_relu), "aten::leaky_relu"); } Tensor& leaky_relu_( Tensor& self, const Scalar& negative_slope) { - return ops::activation_scalar_(self, negative_slope, VK_KERNEL(leaky_relu_)); + return ops::activation_scalar_(self, negative_slope, VK_KERNEL(leaky_relu_), "aten::leaky_relu_"); } Tensor sigmoid(const Tensor& self) { - return ops::activation(self, VK_KERNEL(sigmoid)); + return ops::activation(self, VK_KERNEL(sigmoid), "aten::sigmoid"); } Tensor& sigmoid_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(sigmoid_)); + return ops::activation_(self, VK_KERNEL(sigmoid_), "aten::sigmoid_"); } Tensor tanh(const Tensor& self) { - return ops::activation(self, VK_KERNEL(tanh)); + return ops::activation(self, VK_KERNEL(tanh), "aten::tanh"); } Tensor& tanh_(Tensor& self) { - return ops::activation_(self, VK_KERNEL(tanh_)); + return ops::activation_(self, VK_KERNEL(tanh_), "aten::tanh_"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Concat.cpp b/aten/src/ATen/native/vulkan/ops/Concat.cpp index 3d587864ad2a..eefa365bc478 100644 --- a/aten/src/ATen/native/vulkan/ops/Concat.cpp +++ b/aten/src/ATen/native/vulkan/ops/Concat.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -25,73 +26,75 @@ Tensor cat_feature(const TensorList tensors, vTensor& v_output) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_cat (cat_batch)"); - int64_t ch_size_allprior = 0; - int64_t ch_interval = 0; - for (const auto& tensor : tensors) { - ch_interval += tensor.sizes()[1]; - } + int64_t ch_size_allprior = 0; + int64_t ch_interval = 0; + for (const auto& tensor : 
tensors) { + ch_interval += tensor.sizes()[1]; + } - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Read | vTensor::Access::Write); - - for (const auto& tensor : tensors) { - const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Compute); - - const struct Block final { - uvec3 size; // output texture size - uint32_t fill_0; // dummy - uvec3 isize; // input texture size - uint32_t fill_1; // dummy - uint32_t batch_size; // input tensor's batch size - uint32_t ch_size; // input tensor's channel size - uint32_t ch_interval; // channel interval (total # of channels for all tensors) - uint32_t ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor - } block { - v_output.extents(), - 0u, - v_self.extents(), - 0u, - safe_downcast(v_self.sizes()[0]), - safe_downcast(v_self.sizes()[1]), - safe_downcast(ch_interval), - safe_downcast(ch_size_allprior), - }; - - ch_size_allprior += v_self.sizes()[1]; - - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(cat_feature), + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write); + + for (const auto& tensor : tensors) { + const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Compute); + + const struct Block final { + uvec3 size; // output texture size + uint32_t fill_0; // dummy + uvec3 isize; // input texture size + uint32_t fill_1; // dummy + uint32_t batch_size; // input tensor's batch size + uint32_t ch_size; // input tensor's channel size + uint32_t ch_interval; // channel interval (total # of channels for all tensors) + uint32_t ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor + } block { + v_output.extents(), + 0u, v_self.extents(), - context->gpu().adapter->local_work_group_size(), - // Read/Write access bypasses synchronization but inserts appropriate - // barriers if necessary. - dst_image, - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - src_image, - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); + 0u, + safe_downcast(v_self.sizes()[0]), + safe_downcast(v_self.sizes()[1]), + safe_downcast(ch_interval), + safe_downcast(ch_size_allprior), + }; + + ch_size_allprior += v_self.sizes()[1]; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(cat_feature), + v_self.extents(), + context->gpu().adapter->local_work_group_size(), + // Read/Write access bypasses synchronization but inserts appropriate + // barriers if necessary. + dst_image, + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + src_image, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
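
Across the Arithmetic.cpp, Clamp.cpp and Concat.cpp hunks above, the change follows one instrumentation pattern: the block that records into the command buffer is wrapped in an extra scope whose first statement constructs `api::OpProfiler profiler(command_buffer, context->querypool(), op_name)`, so the profiler's lifetime brackets exactly the recorded work. The sketch below illustrates that RAII idea with a hypothetical `ScopeTimer` measuring host time; it is only an analogy, since the real `OpProfiler` is constructed with the Vulkan query pool and presumably records GPU timestamps rather than CPU durations.

    #include <chrono>
    #include <iostream>
    #include <string>
    #include <utility>

    // Hypothetical RAII scope timer, illustrating the shape of the OpProfiler
    // usage above: construction marks the start of the profiled region and the
    // destructor fires when the enclosing scope (the command-recording block)
    // closes. Unlike the real OpProfiler, this sketch measures host time only.
    class ScopeTimer final {
     public:
      explicit ScopeTimer(std::string op_name)
          : op_name_(std::move(op_name)),
            start_(std::chrono::steady_clock::now()) {}

      ~ScopeTimer() {
        const auto end = std::chrono::steady_clock::now();
        const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
            end - start_).count();
        std::cout << op_name_ << ": " << us << " us\n";
      }

     private:
      std::string op_name_;
      std::chrono::steady_clock::time_point start_;
    };

    int main() {
      {
        ScopeTimer profiler("aten::add.Tensor");  // analogous to constructing OpProfiler
        // ... record and dispatch the op's work here ...
      }  // profiler destroyed here; the whole recording block is attributed to the label
      return 0;
    }
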
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); @@ -101,52 +104,54 @@ Tensor cat_feature_mult4ch(const TensorList tensors, vTensor& v_output) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_cat (cat_feature_mult4ch)"); - int64_t depth_size_allprior = 0; - int64_t ch_interval = 0; - for (const auto& tensor : tensors) { - ch_interval += tensor.sizes()[1]; - } - const int64_t depth_interval = ch_interval / 4; - - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); - uvec3 src_offset{}; - uvec3 dst_offset{}; - - for (const auto& tensor : tensors) { - const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - - const uint32_t depth_slice = safe_downcast(tensor.sizes()[1] / 4); - uvec3 copy_extents {v_self.extents().data[0u], - v_self.extents().data[1u], - depth_slice}; - - for (const auto b : c10::irange(tensor.sizes()[0])) { - src_offset.data[2u] = safe_downcast(depth_slice * b); - dst_offset.data[2u] = depth_size_allprior + safe_downcast(depth_interval * b); - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - copy_extents, - src_offset, - dst_offset); - } - - depth_size_allprior += depth_slice; + int64_t depth_size_allprior = 0; + int64_t ch_interval = 0; + for (const auto& tensor : tensors) { + ch_interval += tensor.sizes()[1]; } - else { - TORCH_CHECK(false, "Not implemented!"); + const int64_t depth_interval = ch_interval / 4; + + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); + uvec3 src_offset{}; + uvec3 dst_offset{}; + + for (const auto& tensor : tensors) { + const Tensor self = tensor.is_vulkan() ? 
tensor : tensor.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); + + const uint32_t depth_slice = safe_downcast(tensor.sizes()[1] / 4); + uvec3 copy_extents {v_self.extents().data[0u], + v_self.extents().data[1u], + depth_slice}; + + for (const auto b : c10::irange(tensor.sizes()[0])) { + src_offset.data[2u] = safe_downcast(depth_slice * b); + dst_offset.data[2u] = depth_size_allprior + safe_downcast(depth_interval * b); + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + copy_extents, + src_offset, + dst_offset); + } + + depth_size_allprior += depth_slice; + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); @@ -160,37 +165,39 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_cat (cat_width)"); + + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); + + uvec3 src_offset{}; + uvec3 dst_offset{}; + for (const auto& tensor : tensors) { + const Tensor self = tensor.is_vulkan() ? tensor : tensor.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); - - uvec3 src_offset{}; - uvec3 dst_offset{}; - for (const auto& tensor : tensors) { - const Tensor self = tensor.is_vulkan() ? 
tensor : tensor.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - v_self.extents(), - src_offset, - dst_offset); - - // Increment by height - dst_offset.data[1u] += v_self.extents().data[1u]; - } - else { - TORCH_CHECK(false, "Not implemented!"); + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + v_self.extents(), + src_offset, + dst_offset); + + // Increment by height + dst_offset.data[1u] += v_self.extents().data[1u]; + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); @@ -199,7 +206,6 @@ Tensor cat_height(const TensorList tensors, vTensor& v_output) { Tensor cat( const at::TensorList tensors, const int64_t dim) { - const auto norm_dim = normalize_dim(dim, 4); TORCH_CHECK( tensors.size() > 0, "Vulkan cat expects at least one tensor"); diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index c7d629cae96f..94799208c7c8 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -1,8 +1,9 @@ -#include +#include +#include #include #include #include -#include +#include #include namespace at { @@ -290,7 +291,7 @@ vTensor pack_weights( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor weight = weight_arg.contiguous(); @@ -322,7 +323,7 @@ vTensor pack_biases( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const int64_t src_w = weight.size(Layout::Filter::output); const int64_t packed_w = div_up(src_w, INT64_C(4)); @@ -549,14 +550,15 @@ Conv2dOpContext Conv2dOpContext::create( groups, method, output_min, - output_max, + output_max }; } void Conv2dOpContext::conv2d_sliding_window( const api::Shader::Descriptor& shader, vTensor& v_output, - const vTensor& v_input) const { + const vTensor& v_input, + const std::string& op_name) const { bool valid = C10_LIKELY(v_output.has_image() && v_input.has_image() && packed_.v_weight.has_image()); TORCH_CHECK(valid, "Not Implemented!") @@ -564,6 +566,8 @@ void Conv2dOpContext::conv2d_sliding_window( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + const struct Block final { uvec3 extents; int32_t ic4; @@ -667,103 +671,106 @@ void Conv2dOpContext::conv2d_winograd_2_3( api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); - - vTensor v_input_winograd{ - context, - { - v_input.sizes()[Layout::Activation4D::batch], - v_input.sizes()[Layout::Activation4D::channels], - out_h_units*4, - out_w_units*4, - }, - v_output.options(), - }; - { - const struct TransformBlock final { - 
uvec3 extents; - uint32_t fill; - ivec2 limits; - ivec2 padding; - } transform_block { - v_input_winograd.extents(), - 0u, + api::OpProfiler profiler(command_buffer, context->querypool(), "prepacked::conv2d_clamp_run (conv2d_winograd_2_3)"); + + vTensor v_input_winograd{ + context, { - safe_downcast(v_input.sizes()[Layout::Activation4D::width]), - safe_downcast(v_input.sizes()[Layout::Activation4D::height]), - }, - { - safe_downcast(packed_.padding[Layout::Parameter::width]), - safe_downcast(packed_.padding[Layout::Parameter::height]), + v_input.sizes()[Layout::Activation4D::batch], + v_input.sizes()[Layout::Activation4D::channels], + out_h_units*4, + out_w_units*4, }, + v_output.options(), }; - context->dispatch( - command_buffer, + { + const struct TransformBlock final { + uvec3 extents; + uint32_t fill; + ivec2 limits; + ivec2 padding; + } transform_block { + v_input_winograd.extents(), + 0u, { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + safe_downcast(v_input.sizes()[Layout::Activation4D::width]), + safe_downcast(v_input.sizes()[Layout::Activation4D::height]), }, - VK_KERNEL(transform_winograd_2_3_sh), - v_input_winograd.extents(), - adaptive_work_group_size(v_input_winograd.extents()), - v_input_winograd.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Write), - v_input.image( - command_buffer, - vTensor::Stage::Compute), - context->resource().pool.uniform(transform_block).object); - - } - { - const struct Block final { - uvec3 extents; - int32_t ic4; - vec2 clamp; - } block { - v_output.extents(), - safe_downcast(packed_.filter[Layout::Filter::input] / 4), - { - packed_.output_min, - packed_.output_max, - }, - }; + { + safe_downcast(packed_.padding[Layout::Parameter::width]), + safe_downcast(packed_.padding[Layout::Parameter::height]), + }, + }; - uvec3 global_size = { - safe_downcast(out_w_units), - safe_downcast(out_h_units), - v_output.extents().data[2u], - }; + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(transform_winograd_2_3_sh), + v_input_winograd.extents(), + adaptive_work_group_size(v_input_winograd.extents()), + v_input_winograd.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + v_input.image( + command_buffer, + vTensor::Stage::Compute), + context->resource().pool.uniform(transform_block).object); - context->dispatch( - command_buffer, + } + { + const struct Block final { + uvec3 extents; + int32_t ic4; + vec2 clamp; + } block { + v_output.extents(), + safe_downcast(packed_.filter[Layout::Filter::input] / 4), { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + packed_.output_min, + packed_.output_max, }, - VK_KERNEL(conv2d_winograd_2_3), - global_size, - adaptive_work_group_size(global_size), - v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Write), - v_input_winograd.image( - command_buffer, - vTensor::Stage::Compute), - packed_.v_weight.image( - command_buffer, - vTensor::Stage::Compute), - packed_.v_bias.buffer( - command_buffer, - vTensor::Stage::Compute), - context->resource().pool.uniform(block).object); + }; + + uvec3 global_size = { + safe_downcast(out_w_units), + safe_downcast(out_h_units), + v_output.extents().data[2u], 
+ }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(conv2d_winograd_2_3), + global_size, + adaptive_work_group_size(global_size), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + v_input_winograd.image( + command_buffer, + vTensor::Stage::Compute), + packed_.v_weight.image( + command_buffer, + vTensor::Stage::Compute), + packed_.v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), + context->resource().pool.uniform(block).object); + } } command_pool.submit(context->gpu().queue, command_buffer); } @@ -797,19 +804,22 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { conv2d_sliding_window( VK_KERNEL(conv2d_dw), v_output, - v_input); + v_input, + "prepacked::conv2d_clamp_run (conv2d_sliding_window::conv2d_dw)"); break; case Conv2dPointwise: conv2d_sliding_window( VK_KERNEL(conv2d_pw_2x2), v_output, - v_input); + v_input, + "prepacked::conv2d_clamp_run (conv2d_sliding_window::conv2d_pw_2x2)"); break; default: conv2d_sliding_window( VK_KERNEL(conv2d), v_output, - v_input); + v_input, + "prepacked::conv2d_clamp_run (conv2d_sliding_window::conv2d)"); break; } diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.h b/aten/src/ATen/native/vulkan/ops/Convolution.h index 78eef9111d3b..c87d86c585b0 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.h +++ b/aten/src/ATen/native/vulkan/ops/Convolution.h @@ -61,7 +61,8 @@ class Conv2dOpContext final : public torch::jit::CustomClassHolder { void conv2d_sliding_window( const api::Shader::Descriptor& shader, vTensor& v_output, - const vTensor& v_input) const; + const vTensor& v_input, + const std::string& op_name) const; void conv2d_winograd_2_3( vTensor& v_output, diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp index 1cf6b1ad6aa9..b7fbea07d9e6 100644 --- a/aten/src/ATen/native/vulkan/ops/Copy.cpp +++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp @@ -1,3 +1,4 @@ +#include #include namespace at { @@ -9,7 +10,6 @@ Tensor& copy_(Tensor& self, const Tensor& src) { api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; - api::Command::Buffer& command_buffer = command_pool.stream(); { // X -> Vulkan if (at::kVulkan == self.device().type()) { @@ -17,28 +17,33 @@ Tensor& copy_(Tensor& self, const Tensor& src) { // Vulkan -> Vulkan if (at::kVulkan == src.device().type()) { - command_buffer.copy( - // - Read-only access is implied on const tensors. Memory barriers - // are automatically inserted if a RAW hazard is detected. - // - Recording any potential pending sync operations into the same - // command buffer prevents an expensive queue submission. - convert(src).buffer( - command_buffer, - vTensor::Stage::Transfer), - // - Write-only access never triggers a sync as the contents will be - // overwritten regardless. Having said that, appropriate barriers - // are inserted automatically if WAR or WAW hazards are detected. - // - Recording pending sync operations into the same command buffer - // prevents an expensive queue submission. 
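
One note on the conv2d_winograd_2_3 refactor above: the body was re-indented so that both the input-transform dispatch and the main Winograd dispatch sit inside a single profiler scope labeled "prepacked::conv2d_clamp_run (conv2d_winograd_2_3)". The intermediate v_input_winograd texture keeps its (out_h_units * 4) x (out_w_units * 4) extent, which matches the standard F(2x2, 3x3) Winograd layout where every 2x2 tile of output pixels is produced from a 4x4 tile in the transformed domain. The arithmetic below only illustrates that bookkeeping; the ceil-by-2 unit counts are an assumption, since out_h_units and out_w_units are computed outside the hunks shown here.

    #include <cstdint>
    #include <iostream>

    // Tile bookkeeping sketch for Winograd F(2x2, 3x3): each 2x2 output tile is
    // produced from a 4x4 tile of the transformed input, so the transformed
    // texture is allocated as (out_h_units * 4) x (out_w_units * 4). The ceil
    // division by 2 below is an assumption for illustration; the real unit
    // counts come from code outside this diff.
    int64_t div_up(int64_t n, int64_t d) {
      return (n + d - 1) / d;
    }

    int main() {
      const int64_t out_h = 57;   // hypothetical conv output height
      const int64_t out_w = 120;  // hypothetical conv output width

      const int64_t out_h_units = div_up(out_h, 2);  // 2x2 output tiles along height
      const int64_t out_w_units = div_up(out_w, 2);  // 2x2 output tiles along width

      std::cout << "transformed input extent: " << out_h_units * 4 << " x "
                << out_w_units * 4 << std::endl;  // 116 x 240
      return 0;
    }
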
- v_self.buffer( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write)); - + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "copy_"); + + command_buffer.copy( + // - Read-only access is implied on const tensors. Memory barriers + // are automatically inserted if a RAW hazard is detected. + // - Recording any potential pending sync operations into the same + // command buffer prevents an expensive queue submission. + convert(src).buffer( + command_buffer, + vTensor::Stage::Transfer), + // - Write-only access never triggers a sync as the contents will be + // overwritten regardless. Having said that, appropriate barriers + // are inserted automatically if WAR or WAW hazards are detected. + // - Recording pending sync operations into the same command buffer + // prevents an expensive queue submission. + v_self.buffer( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write)); + } command_pool.submit(context->gpu().queue, command_buffer); } // CPU -> Vulkan else { + api::Command::Buffer& command_buffer = command_pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor cpu_src = src.device().is_cpu() ? src : src.cpu(); // Requesting write-only host access to the tensor never triggers a sync @@ -75,6 +80,7 @@ Tensor& copy_(Tensor& self, const Tensor& src) { } // Vulkan -> X else if (at::kVulkan == src.device().type()) { + api::Command::Buffer& command_buffer = command_pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const vTensor& v_src = convert(src); // Vulkan -> CPU diff --git a/aten/src/ATen/native/vulkan/ops/Gru.cpp b/aten/src/ATen/native/vulkan/ops/Gru.cpp new file mode 100644 index 000000000000..8395dc8bebde --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Gru.cpp @@ -0,0 +1,246 @@ +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { +// +// input_vk: input tensor of shape (L, N, H_in) when batch_first=False +// (N, L, H_in) when batch_first=True containing the features of the input sequence +// hx_vk: initial hidden state for each element in the batch. 
tensor of shape (D * num_layers, N, H_out) +// output: tensor of shape (N, L, D * H_out)) when batch_first=True +// h_n: tensor of shape (D * num_layers, N, H_out) +// +// where +// L = sequence length +// N = batch size +// D = 2 if bidirectional=True otherwise 1 +// H_in = input_size (# of expected features in the input x) +// H_out = hidden_size (# of features in the hidden state h) +// +std::tuple gru_input( + const Tensor & input_vk, // input sequence (vulkan) + const Tensor & hx_vk, // initial hidden state (vulkan) + TensorList params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) { + TORCH_CHECK(params_cpu.size() == 4 * num_layers, "Vulkan gru expects 'params_cpu' size to be 4 * 'num_layers'."); + TORCH_INTERNAL_ASSERT(input_vk.sizes().size() == 3, "Vulkan gru expects 'input_vk' dims to be 3."); + TORCH_INTERNAL_ASSERT(hx_vk.sizes().size() == 3, "Vulkan gru expects 'hx_vk' dims to be 3."); + TORCH_INTERNAL_ASSERT(has_biases, "Vulkan gru expects 'has_biases' to be true."); + TORCH_INTERNAL_ASSERT(!train, "Vulkan gru expects 'train' to be false."); + TORCH_INTERNAL_ASSERT(!bidirectional, "Vulkan gru expects 'bidirectional' to be false."); + TORCH_INTERNAL_ASSERT(batch_first, "Vulkan gru expects 'batch_first' to be true."); + TORCH_INTERNAL_ASSERT(dropout < std::numeric_limits::epsilon()*1000, "Vulkan gru expects 'dropout' to be 0.0."); + + const auto h_in = input_vk.size(2); + std::vector h_n_list; // hidden output + + // reshape to 2D due to Vulkan at::mm op accepts only 2D + auto x = input_vk.reshape({input_vk.size(0) * input_vk.size(1), input_vk.size(2)}); + + for (int64_t i = 0; i < num_layers; ++i) { + // extract each hidden state and squeeze into 2D dim + auto h = at::slice(hx_vk, 0, i, i + 1, 1); + h = h.reshape({h.size(0) * h.size(1), h.size(2)}); + + const auto& w_ih = params_cpu[i * 4]; + const auto& w_hh = params_cpu[i * 4 + 1]; + const auto& b_ih = params_cpu[i * 4 + 2]; + const auto& b_hh = params_cpu[i * 4 + 3]; + + const auto& w_i_rzn = w_ih.split(h_in); + const auto& w_h_rzn = w_hh.split(h_in); + const auto& b_i_rzn = b_ih.split(h_in); + const auto& b_h_rzn = b_hh.split(h_in); + + const auto& w_ir = w_i_rzn[0]; + const auto& w_iz = w_i_rzn[1]; + const auto& w_in = w_i_rzn[2]; + const auto& w_hr = w_h_rzn[0]; + const auto& w_hz = w_h_rzn[1]; + const auto& w_hn = w_h_rzn[2]; + const auto& b_ir = b_i_rzn[0]; + const auto& b_iz = b_i_rzn[1]; + const auto& b_in = b_i_rzn[2]; + const auto& b_hr = b_h_rzn[0]; + const auto& b_hz = b_h_rzn[1]; + const auto& b_hn = b_h_rzn[2]; + + const auto& r = at::sigmoid(at::addmm(b_ir, x, w_ir.t()) + at::addmm(b_hr, h, w_hr.t())); + const auto& z = at::sigmoid(at::addmm(b_iz, x, w_iz.t()) + at::addmm(b_hz, h, w_hz.t())); + const auto& n = at::tanh(at::addmm(b_in, x, w_in.t()) + r * (at::addmm(b_hn, h, w_hn.t()))); + h = (z * (-1) + 1) * n + z * h; + x = h; // next input + h_n_list.emplace_back(h.reshape({1, 1, h.size(0), h.size(1)})); // 2D to 4D for cat op + } + + auto h_n = at::cat(h_n_list, 1); + h_n = h_n.reshape({h_n.size(0) * h_n.size(1), h_n.size(2), h_n.size(3)}); + return std::tuple(x, h_n); +} + +#ifdef USE_VULKAN_API + +TORCH_LIBRARY_IMPL(aten, Vulkan, m) { + m.impl(TORCH_SELECTIVE_NAME("aten::gru.input"), TORCH_FN(gru_input)); +} + +#endif /* USE_VULKAN_API */ + +} // namespace + +std::vector pack_linear_op_contexts( + const std::vector& params_cpu, + int64_t num_layers) { + TORCH_CHECK(params_cpu.size() == 4 * num_layers, "Vulkan 
gru expects 'params_cpu' size to be 4 * 'num_layers'."); + std::vector linear_op_contexts; + for (int64_t i = 0; i < num_layers; ++i) { + const auto& w_ih = params_cpu.at(i * 4); + const auto& w_hh = params_cpu.at(i * 4 + 1); + const auto& b_ih = params_cpu.at(i * 4 + 2); + const auto& b_hh = params_cpu.at(i * 4 + 3); + const auto& h_in = w_ih.size(0) / 3; + + const auto& w_i_rzn = w_ih.split(h_in); + const auto& w_h_rzn = w_hh.split(h_in); + const auto& b_i_rzn = b_ih.split(h_in); + const auto& b_h_rzn = b_hh.split(h_in); + + const auto& w_ir = w_i_rzn[0]; + const auto& w_iz = w_i_rzn[1]; + const auto& w_in = w_i_rzn[2]; + const auto& w_hr = w_h_rzn[0]; + const auto& w_hz = w_h_rzn[1]; + const auto& w_hn = w_h_rzn[2]; + const auto& b_ir = b_i_rzn[0]; + const auto& b_iz = b_i_rzn[1]; + const auto& b_in = b_i_rzn[2]; + const auto& b_hr = b_h_rzn[0]; + const auto& b_hz = b_h_rzn[1]; + const auto& b_hn = b_h_rzn[2]; + + linear_op_contexts.emplace_back(LinearOpContext::create(w_ir.t(), b_ir)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_hr.t(), b_hr)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_iz.t(), b_iz)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_hz.t(), b_hz)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_in.t(), b_in)); + linear_op_contexts.emplace_back(LinearOpContext::create(w_hn.t(), b_hn)); + } + return linear_op_contexts; +} + +GruOpContext::GruOpContext( + const std::vector& params_cpu, + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) + : packed_{pack_linear_op_contexts(params_cpu, num_layers), has_biases, num_layers, dropout, train, bidirectional, batch_first}, + unpacked_{params_cpu, has_biases, num_layers, dropout, train, bidirectional, batch_first} { + TORCH_INTERNAL_ASSERT(packed_.has_biases, "Vulkan gru expects 'has_biases' to be true."); + TORCH_INTERNAL_ASSERT(!packed_.train, "Vulkan gru expects 'train' to be false."); + TORCH_INTERNAL_ASSERT(!packed_.bidirectional, "Vulkan gru expects 'bidirectional' to be false."); + TORCH_INTERNAL_ASSERT(packed_.batch_first, "Vulkan gru expects 'batch_first' to be true."); + TORCH_INTERNAL_ASSERT(packed_.dropout < std::numeric_limits::epsilon()*1000, "Vulkan gru expects 'dropout' to be 0.0."); +} + +GruOpContext GruOpContext::create( + const std::vector& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) { + return GruOpContext{ + params_cpu, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first + }; +} + +std::tuple GruOpContext::run( + const Tensor & input_vk, // input sequence (vulkan) + const Tensor & hx_vk) const { // initial hidden state (vulkan) + TORCH_INTERNAL_ASSERT(input_vk.sizes().size() == 3, "Vulkan gru expects 'input_vk' dims to be 3."); + TORCH_INTERNAL_ASSERT(hx_vk.sizes().size() == 3, "Vulkan gru expects 'hx_vk' dims to be 3."); + + const int64_t linear_op_contexts_per_layer = 6; // (b_ir, w_ir), (b_hr, w_hr), (b_iz, w_iz), (b_hz, w_hz), (b_in, w_in), (b_hn, w_hn) + std::vector h_n_list; // hidden output + + // reshape to 2D due to Vulkan at::mm op accepts only 2D + auto x = input_vk.reshape({input_vk.size(0) * input_vk.size(1), input_vk.size(2)}); + + for (int64_t i = 0; i < packed_.num_layers; ++i) { + // extract each hidden state and squeeze into 2D dim + auto h = at::slice(hx_vk, 0, i, i + 1, 1); + h = h.reshape({h.size(0) * h.size(1), h.size(2)}); + + const 
auto& cxt_ir = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 0]; + const auto& cxt_hr = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 1]; + const auto& cxt_iz = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 2]; + const auto& cxt_hz = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 3]; + const auto& cxt_in = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 4]; + const auto& cxt_hn = packed_.linear_op_contexts[i * linear_op_contexts_per_layer + 5]; + + const auto& r = at::sigmoid(cxt_ir.run(x, 1.0f, 1.0f, "aten::addmm") + cxt_hr.run(h, 1.0f, 1.0f, "aten::addmm")); + const auto& z = at::sigmoid(cxt_iz.run(x, 1.0f, 1.0f, "aten::addmm") + cxt_hz.run(h, 1.0f, 1.0f, "aten::addmm")); + const auto& n = at::tanh(cxt_in.run(x, 1.0f, 1.0f, "aten::addmm") + r * (cxt_hn.run(h, 1.0f, 1.0f, "aten::addmm"))); + h = (z * (-1) + 1) * n + z * h; + x = h; // next input + h_n_list.emplace_back(h.reshape({1, 1, h.size(0), h.size(1)})); // 2D to 4D for cat op + } + + auto h_n = at::cat(h_n_list, 1); + h_n = h_n.reshape({h_n.size(0) * h_n.size(1), h_n.size(2), h_n.size(3)}); + return std::tuple(x, h_n); +} + +GruOpContext::State GruOpContext::unpack() const { + return GruOpContext::State{ + unpacked_.params_cpu, + unpacked_.has_biases, + unpacked_.num_layers, + unpacked_.dropout, + unpacked_.train, + unpacked_.bidirectional, + unpacked_.batch_first, + }; +} + +c10::intrusive_ptr gru_prepack( + std::vector&& params_cpu, + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first) { + return c10::make_intrusive(GruOpContext::create( + params_cpu, has_biases, num_layers, dropout, train, bidirectional, batch_first)); +} + +std::tuple gru_run( + const Tensor& input_vk, + const Tensor& hx_vk, + const c10::intrusive_ptr& context) { + return context->run(input_vk, hx_vk); +} + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/ops/Gru.h b/aten/src/ATen/native/vulkan/ops/Gru.h new file mode 100644 index 000000000000..8000aa449ca4 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Gru.h @@ -0,0 +1,85 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { + +class GruOpContext final : public torch::jit::CustomClassHolder { + public: + static GruOpContext create( + const std::vector& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first); + + using State = std::tuple, bool, int64_t, double, bool, bool, bool>; + + std::tuple run( + const Tensor& input_vk, + const Tensor & hx_vk) const; + State unpack() const; + + private: + GruOpContext( + const std::vector& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first); + + private: + struct { + std::vector linear_op_contexts; // {{ op context for b_ir, w_ir, op context for b_hr, w_hr, + // op context for b_iz, w_iz, op context for b_hz, w_hz, + // op context for b_in, w_in, op context for b_hn, w_hn,}, ...} + bool has_biases{}; + int64_t num_layers{}; + double dropout{}; + bool train{}; + bool bidirectional{}; + bool batch_first{}; + } packed_; + + struct { + std::vector params_cpu; // weights/biases (cpu) + bool has_biases{}; + int64_t num_layers{}; + double dropout{}; + bool train{}; + bool 
bidirectional{}; + bool batch_first{}; + } unpacked_; +}; + +c10::intrusive_ptr gru_prepack( + std::vector&& params_cpu, // weights/biases (cpu) + bool has_biases, + int64_t num_layers, + double dropout, + bool train, + bool bidirectional, + bool batch_first); + +std::tuple gru_run( + const Tensor& input_vk, + const Tensor & hx_vk, + const c10::intrusive_ptr& context); + +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Lerp.cpp b/aten/src/ATen/native/vulkan/ops/Lerp.cpp new file mode 100644 index 000000000000..4a1a351919c5 --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Lerp.cpp @@ -0,0 +1,400 @@ +#include +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace { + +using namespace api::utils; + +void check_inputs_elementwise_op(const Tensor& input1, const Tensor& input2) { + TORCH_CHECK( + channels_size(input1) == channels_size(input2), + "Vulkan elementwise ops require channel dimension to be equal!"); + if (batch_size(input1) != batch_size(input2)) { + TORCH_CHECK( + channels_size(input1) % 4 == 0, + "Vulkan elementwise ops require channel to be a multiple of 4 to broadcast along batch dimension!") + } + + const uint32_t input1_h = height_size(input1); + const uint32_t input1_w = width_size(input1); + const uint32_t input2_h = height_size(input2); + const uint32_t input2_w = width_size(input2); + + const std::string broadcast_error_msg = + "Incompatible input dimensions for broadcasting for Vulkan elementwise op!"; + if (input1_h != input2_h) { + if (input1_h > input2_h) { + TORCH_CHECK(input2_h == 1, broadcast_error_msg); + TORCH_CHECK(input2_w == input1_w || input2_w == 1, broadcast_error_msg); + } else if (input2_h > input1_h) { + TORCH_CHECK(input1_h == 1, broadcast_error_msg); + TORCH_CHECK(input1_w == input2_w || input1_w == 1, broadcast_error_msg); + } + } else if (input1_w != input2_w) { + if (input1_w > input2_w) { + TORCH_CHECK(input2_w == 1, broadcast_error_msg); + } else if (input2_w > input1_w) { + TORCH_CHECK(input1_h == 1, broadcast_error_msg); + } + } +} + +Tensor _lerp_scalar( + const Tensor& start_arg, + const Tensor& end_arg, + const Scalar& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(start_arg, end_arg); + api::Context* const context = api::context(); + + const Tensor start = start_arg.is_vulkan() ? start_arg : start_arg.vulkan(); + const vTensor& v_start = convert(start); + + const Tensor end = end_arg.is_vulkan() ? 
end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + vTensor v_output{ + context, + v_start.sizes(), + v_start.options(), + }; + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY (v_start.has_image() && v_end.has_image()) { + const float weight = weight_arg.to(); + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input1_extents; + uint32_t fill_1; + uvec3 input2_extents; + float weight; + } block{ + v_output.extents(), + 0u, + v_start.extents(), + 0u, + v_end.extents(), + weight, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp_scalar), + v_output.extents(), + adaptive_work_group_size(v_output.extents()), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, vTensor::Stage::Compute, vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_start.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return convert(v_output); +} + +Tensor& _lerp_scalar_( + Tensor& self, + const Tensor& end_arg, + const Scalar& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(self, end_arg); + api::Context* const context = api::context(); + + TORCH_CHECK( + self.is_vulkan(), + "Vulkan: In-place lerp is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self); + + const Tensor end = end_arg.is_vulkan() ? end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY ( + v_self.has_image() && v_end.has_image() && !self.is_same(end)) { + const float weight = weight_arg.to(); + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input_extents; + float alpha; + } block{ + v_self.extents(), + 0u, + v_end.extents(), + weight, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp_scalar_), + v_self.extents(), + adaptive_work_group_size(v_self.extents()), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. 
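
For reference, the lerp kernels at the top of this diff and the `_lerp_scalar` / `_lerp_scalar_` ops here all compute the standard linear interpolation out = start + weight * (end - start); the in-place variant simply reads the output image itself as the start operand. A minimal CPU reference for that formula follows, as a sanity check only and not tied to the Vulkan path.

    #include <cstdio>

    // CPU reference for the interpolation computed by the lerp shaders and ops
    // above: out = start + weight * (end - start). Sanity check only.
    float lerp_ref(float start, float end, float weight) {
      return start + weight * (end - start);
    }

    int main() {
      std::printf("%f\n", lerp_ref(1.0f, 5.0f, 0.25f));  // 2.000000
      std::printf("%f\n", lerp_ref(1.0f, 5.0f, 1.0f));   // 5.000000 (weight = 1 returns end)
      return 0;
    }
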
+ // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return self; +} + +Tensor _lerp_tensor( + const Tensor& start_arg, + const Tensor& end_arg, + const Tensor& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(start_arg, end_arg); + check_inputs_elementwise_op(start_arg, weight_arg); + api::Context* const context = api::context(); + + const Tensor start = start_arg.is_vulkan() ? start_arg : start_arg.vulkan(); + const vTensor& v_start = convert(start); + + const Tensor end = end_arg.is_vulkan() ? end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + const Tensor weight = weight_arg.is_vulkan() ? weight_arg : weight_arg.vulkan(); + const vTensor& v_weight = convert(weight); + + vTensor v_output{ + context, + v_start.sizes(), + v_start.options(), + }; + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY (v_start.has_image() && v_end.has_image() && v_weight.has_image()) { + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input1_extents; + uint32_t fill_1; + uvec3 input2_extents; + uint32_t fill_2; + uvec3 input3_extents; + uint32_t fill_3; + } block{ + v_output.extents(), + 0u, + v_start.extents(), + 0u, + v_end.extents(), + 0u, + v_weight.extents(), + 0u, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp), + v_output.extents(), + adaptive_work_group_size(v_output.extents()), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, vTensor::Stage::Compute, vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_start.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_weight.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return convert(v_output); +} + +Tensor& _lerp_tensor_( + Tensor& self, + const Tensor& end_arg, + const Tensor& weight_arg, + const std::string& op_name) { + check_inputs_elementwise_op(self, end_arg); + check_inputs_elementwise_op(self, weight_arg); + api::Context* const context = api::context(); + + TORCH_CHECK( + self.is_vulkan(), + "Vulkan: In-place lerp is only supported on Vulkan tensors."); + + vTensor& v_self = convert(self); + + const Tensor end = end_arg.is_vulkan() ? end_arg : end_arg.vulkan(); + const vTensor& v_end = convert(end); + + const Tensor weight = weight_arg.is_vulkan() ? 
weight_arg : weight_arg.vulkan(); + const vTensor& v_weight = convert(weight); + + api::Command::Pool& command_pool = context->command().pool; + api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + + if C10_LIKELY ( + v_self.has_image() && v_end.has_image() && v_weight.has_image() && !self.is_same(end)) { + const struct Block final { + uvec3 extents; + uint32_t fill_0; + uvec3 input1_extents; + uint32_t fill_1; + uvec3 input2_extents; + uint32_t fill_2; + } block{ + v_self.extents(), + 0u, + v_end.extents(), + 0u, + v_weight.extents(), + 0u, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(lerp_), + v_self.extents(), + adaptive_work_group_size(v_self.extents()), + // Read-Write access triggers an async synchronization if necessory + // and inserts appropriate barriers if hazards are detected. + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_end.image(command_buffer, vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_weight.image(command_buffer, vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } else { + TORCH_CHECK(false, "Not implemented!"); + } + } + command_pool.submit(context->gpu().queue, command_buffer); + + return self; +} + +Tensor lerp_scalar(const Tensor& start, const Tensor& end, const Scalar& weight) { + return _lerp_scalar( + start, end, weight, "aten::lerp.Scalar"); +} + +Tensor& lerp_scalar_(Tensor& self, const Tensor& end, const Scalar& weight) { + return _lerp_scalar_( + self, end, weight, "aten::lerp_.Scalar"); +} + +Tensor lerp_tensor(const Tensor& start, const Tensor& end, const Tensor& weight) { + if (weight.sizes().size() == 0) { + return _lerp_scalar( + start, end, weight.item(), "aten::lerp.Tensor"); + } + return _lerp_tensor( + start, end, weight, "aten::lerp.Tensor"); +} + +Tensor& lerp_tensor_(Tensor& self, const Tensor& end, const Tensor& weight) { + if (weight.sizes().size() == 0) { + return _lerp_scalar_( + self, end, weight.item(), "aten::lerp_.Tensor"); + } + return _lerp_tensor_( + self, end, weight, "aten::lerp_.Tensor"); +} + +#ifdef USE_VULKAN_API + +TORCH_LIBRARY_IMPL(aten, Vulkan, m) { + m.impl(TORCH_SELECTIVE_NAME("aten::lerp.Scalar"), TORCH_FN(lerp_scalar)); + m.impl(TORCH_SELECTIVE_NAME("aten::lerp_.Scalar"), TORCH_FN(lerp_scalar_)); + m.impl(TORCH_SELECTIVE_NAME("aten::lerp.Tensor"), TORCH_FN(lerp_tensor)); + m.impl(TORCH_SELECTIVE_NAME("aten::lerp_.Tensor"), TORCH_FN(lerp_tensor_)); +} + +#endif /* USE_VULKAN_API */ + +} // namespace +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/ops/Mean.cpp b/aten/src/ATen/native/vulkan/ops/Mean.cpp index 947cb2c5e39d..3e678056fc3b 100644 --- a/aten/src/ATen/native/vulkan/ops/Mean.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mean.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -55,6 +56,8 @@ Tensor mean( api::Command::Pool& command_pool = context->command().pool; 
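
The `lerp_tensor` / `lerp_tensor_` wrappers registered just above route a zero-dimensional `weight` to the scalar kernels via `weight.item()`, the same fallback added to the `add`/`sub`/`mul`/`div` tensor wrappers earlier in this diff. Below is a small stand-alone sketch of that dispatch-on-rank idea; the `FakeTensor` struct is a stand-in for illustration, not the ATen `Tensor` API.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Stand-in type for illustration only; not the ATen Tensor class.
    struct FakeTensor {
      std::vector<int64_t> sizes;
      float scalar_value = 0.0f;  // meaningful only when sizes is empty (0-dim)
    };

    void run_scalar_kernel(float w) { std::cout << "scalar kernel, w = " << w << "\n"; }
    void run_tensor_kernel(const FakeTensor&) { std::cout << "tensor kernel\n"; }

    // Mirrors the dispatch in lerp_tensor above: a zero-dimensional weight is
    // semantically a scalar, so it is routed to the cheaper scalar shader
    // instead of broadcasting a one-element texture.
    void lerp_dispatch(const FakeTensor& weight) {
      if (weight.sizes.size() == 0) {
        run_scalar_kernel(weight.scalar_value);
      } else {
        run_tensor_kernel(weight);
      }
    }

    int main() {
      lerp_dispatch(FakeTensor{{}, 0.25f});     // -> scalar kernel
      lerp_dispatch(FakeTensor{{1, 4, 8, 8}});  // -> tensor kernel
      return 0;
    }
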
api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::mean.dim"); + if C10_LIKELY(v_input.has_image()) { const struct Block final { uvec3 extents; diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index b19f02af0b7e..04c65677c962 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -16,7 +17,7 @@ vTensor pack_weights( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor weight = weight_arg.contiguous(); const IntArrayRef w_sizes = weight.sizes(); @@ -70,7 +71,7 @@ vTensor pack_biases( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything using Future = vTensor::Future; if (bias_arg) { @@ -193,7 +194,8 @@ Tensor addmm( bias).run( input, alpha.to(), - beta.to()); + beta.to(), + "aten::addmm"); } Tensor mm( @@ -204,7 +206,8 @@ Tensor mm( c10::optional()).run( mat1_arg, 1.0f, - 1.0f); + 1.0f, + "aten::mm"); } #ifdef USE_VULKAN_API @@ -250,7 +253,8 @@ LinearOpContext LinearOpContext::create( Tensor LinearOpContext::run( const Tensor& input_arg, const float alpha, - const float beta) const { + const float beta, + const std::string& op_name) const { api::Context* const context = api::context(); const Tensor input = input_arg.is_vulkan() ? 
input_arg : input_arg.vulkan(); @@ -278,9 +282,10 @@ Tensor LinearOpContext::run( }; api::Command::Pool& command_pool = context->command().pool; - api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if (v_input.has_image() && packed_.v_weight.has_image() && packed_.v_bias.has_image()) { @@ -412,7 +417,7 @@ c10::intrusive_ptr linear_prepack( Tensor linear_run( const Tensor& input, const c10::intrusive_ptr& context) { - return context->run(input, 1.0, 1.0); + return context->run(input, 1.0, 1.0, "prepacked::linear_clamp_run"); } } // namespace ops diff --git a/aten/src/ATen/native/vulkan/ops/Mm.h b/aten/src/ATen/native/vulkan/ops/Mm.h index 1dfef32ba9a7..5603f5e51821 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.h +++ b/aten/src/ATen/native/vulkan/ops/Mm.h @@ -18,7 +18,7 @@ class LinearOpContext final : public torch::jit::CustomClassHolder { using State = std::tuple>; - Tensor run(const Tensor& input, float beta, float alpha) const; + Tensor run(const Tensor& input, float beta, float alpha, const std::string& op_name) const; State unpack() const; private: diff --git a/aten/src/ATen/native/vulkan/ops/Padding.cpp b/aten/src/ATen/native/vulkan/ops/Padding.cpp index 8d16093bd384..dcbd3a326fea 100644 --- a/aten/src/ATen/native/vulkan/ops/Padding.cpp +++ b/aten/src/ATen/native/vulkan/ops/Padding.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -55,6 +56,8 @@ Tensor reflection_pad2d(const Tensor& self_arg, IntArrayRef padding) { api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::reflection_pad2d"); + if C10_LIKELY (v_output.has_image() && v_self.has_image()) { const struct Block final { uvec3 extents; diff --git a/aten/src/ATen/native/vulkan/ops/Permute.cpp b/aten/src/ATen/native/vulkan/ops/Permute.cpp index 29fed363d115..557c99592af0 100644 --- a/aten/src/ATen/native/vulkan/ops/Permute.cpp +++ b/aten/src/ATen/native/vulkan/ops/Permute.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -13,63 +14,65 @@ Tensor permute_4d(const Tensor& input, const uvec4& in_size, const uvec4& out_si api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); - - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Read | vTensor::Access::Write); - - const Tensor self = input.is_vulkan() ? 
input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Compute); - - const struct Block final { - uvec3 size; // output texture size - uint32_t fill_0; // dummy - uvec3 isize; // input texture size - uint32_t fill_1; // dummy - uvec4 tensor_size; // output tensor size - uvec4 itensor_size; // input tensor size - uvec4 dims; // output dims - } block { - v_output.extents(), - 0u, - v_self.extents(), - 0u, - out_size, - in_size, - out_dims, - }; - - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(permute_4d), - // build up shader operations from the output texture point of view - // to avoid the nondeterministic order of GPU shader operations between texels + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::permute (permute_4d)"); + + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write); + + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Compute); + + const struct Block final { + uvec3 size; // output texture size + uint32_t fill_0; // dummy + uvec3 isize; // input texture size + uint32_t fill_1; // dummy + uvec4 tensor_size; // output tensor size + uvec4 itensor_size; // input tensor size + uvec4 dims; // output dims + } block { v_output.extents(), - context->gpu().adapter->local_work_group_size(), - // Read/Write access bypasses synchronization but inserts appropriate - // barriers if necessary. - dst_image, - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - src_image, - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); + 0u, + v_self.extents(), + 0u, + out_size, + in_size, + out_dims, + }; + + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(permute_4d), + // build up shader operations from the output texture point of view + // to avoid the nondeterministic order of GPU shader operations between texels + v_output.extents(), + context->gpu().adapter->local_work_group_size(), + // Read/Write access bypasses synchronization but inserts appropriate + // barriers if necessary. + dst_image, + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + src_image, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
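Most hunks in this diff wrap the command-buffer recording in a new braced scope whose first statement constructs an api::OpProfiler, so that the profiler's begin/end query-pool timestamps bracket exactly the commands recorded for that op. The snippet below is a standalone sketch of that RAII scoping idea only; ScopedTimer is hypothetical and measures wall-clock time, not the GPU timestamps the real OpProfiler collects.

#include <chrono>
#include <iostream>
#include <string>

// Hypothetical stand-in for api::OpProfiler: construction marks the start of
// the measured region, destruction (at the closing brace) marks the end.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::steady_clock::now() - start_).count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  {
    ScopedTimer profiler("aten::lerp.Tensor");  // mirrors the op_name argument
    // ... record and dispatch work inside the scope ...
  }  // measurement ends here, when the scope closes
  return 0;
}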
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp index 6c67ada1d747..7a2fe98ba7d4 100644 --- a/aten/src/ATen/native/vulkan/ops/Pool.cpp +++ b/aten/src/ATen/native/vulkan/ops/Pool.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -36,6 +37,8 @@ Tensor adaptive_avg_pool2d( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::_adaptive_avg_pool2d"); + if C10_LIKELY(v_self.has_image()) { const uvec3 v_output_size = v_output.extents(); const uvec3 v_self_size = v_self.extents(); @@ -101,7 +104,8 @@ Tensor pool2d( const IntArrayRef padding_arg, const IntArrayRef dilation_arg, const bool ceil_mode, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { if (stride_arg.empty()) { stride_arg = kernel_arg; } @@ -175,6 +179,8 @@ Tensor pool2d( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_self.has_image()) { const struct Block final { uvec3 extents; @@ -257,7 +263,8 @@ Tensor avg_pool2d( padding_arg, {1,1}, ceil_mode, - VK_KERNEL(avg_pool2d) + VK_KERNEL(avg_pool2d), + "aten::avg_pool2d" ); } @@ -275,7 +282,8 @@ Tensor max_pool2d( padding_arg, dilation_arg, ceil_mode, - VK_KERNEL(max_pool2d) + VK_KERNEL(max_pool2d), + "aten::max_pool2d" ); } diff --git a/aten/src/ATen/native/vulkan/ops/Register.cpp b/aten/src/ATen/native/vulkan/ops/Register.cpp index 4b90fc8696e1..942836cf6838 100644 --- a/aten/src/ATen/native/vulkan/ops/Register.cpp +++ b/aten/src/ATen/native/vulkan/ops/Register.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -28,9 +29,9 @@ TORCH_LIBRARY(vulkan, m) { std::move(std::get<2>(state)), std::move(std::get<3>(state)), std::move(std::get<4>(state)), - std::move(std::get<5>(state)), - std::move(std::get<6>(state)), - std::move(std::get<7>(state))); + std::get<5>(state), + std::get<6>(state), + std::get<7>(state)); }); m.class_("TransposeConv2dOpContext") .def_pickle( @@ -47,9 +48,9 @@ TORCH_LIBRARY(vulkan, m) { std::move(std::get<3>(state)), std::move(std::get<4>(state)), std::move(std::get<5>(state)), - std::move(std::get<6>(state)), - std::move(std::get<7>(state)), - std::move(std::get<8>(state))); + std::get<6>(state), + std::get<7>(state), + std::get<8>(state)); }); m.class_("LinearOpContext") .def_pickle( @@ -62,6 +63,23 @@ TORCH_LIBRARY(vulkan, m) { return linear_prepack( std::move(std::get<0>(state)), std::move(std::get<1>(state))); }); + m.class_("GruOpContext") + .def_pickle( + // __getstate__ + [](const c10::intrusive_ptr& context) { + return context->unpack(); + }, + // __setstate__ + [](GruOpContext::State state) { + return gru_prepack( + std::move(std::get<0>(state)), + std::get<1>(state), + std::get<2>(state), + std::get<3>(state), + std::get<4>(state), + std::get<5>(state), + std::get<6>(state)); + }); } TORCH_LIBRARY(vulkan_prepack, m) { @@ -87,18 +105,33 @@ TORCH_LIBRARY(vulkan_prepack, m) { m.def(TORCH_SELECTIVE_SCHEMA( "vulkan_prepack::linear_run(Tensor X, " 
"__torch__.torch.classes.vulkan.LinearOpContext BW_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA( + "vulkan_prepack::gru_prepack(Tensor[] params_cpu, " + "bool has_biases, " + "int num_layers, " + "float dropout, " + "bool train, " + "bool bidirectional, " + "bool batch_first) " + "-> __torch__.torch.classes.vulkan.GruOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA( + "vulkan_prepack::gru_run(Tensor input_vk, " + "Tensor hx_vk, " + "__torch__.torch.classes.vulkan.GruOpContext G_prepack) -> (Tensor next_input, Tensor hidden_layer)")); } TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) { m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_clamp_prepack"), TORCH_FN(conv2d_clamp_prepack)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_transpose_clamp_prepack"), TORCH_FN(conv2d_transpose_clamp_prepack)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::linear_prepack"), TORCH_FN(linear_prepack)); + m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::gru_prepack"), TORCH_FN(gru_prepack)); } TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) { m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_clamp_run"), TORCH_FN(conv2d_clamp_run)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::conv2d_transpose_clamp_run"), TORCH_FN(conv2d_transpose_clamp_run)); m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::linear_run"), TORCH_FN(linear_run)); + m.impl(TORCH_SELECTIVE_NAME("vulkan_prepack::gru_run"), TORCH_FN(gru_run)); } Tensor convolution( diff --git a/aten/src/ATen/native/vulkan/ops/Shape.cpp b/aten/src/ATen/native/vulkan/ops/Shape.cpp index 160099f3754d..86a466942052 100644 --- a/aten/src/ATen/native/vulkan/ops/Shape.cpp +++ b/aten/src/ATen/native/vulkan/ops/Shape.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -7,9 +8,10 @@ namespace vulkan { namespace ops { namespace { -Tensor view( +Tensor view_internal( const Tensor& self_arg, - const IntArrayRef shape) { + const IntArrayRef shape, + const std::string& op_name) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -24,6 +26,8 @@ Tensor view( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + command_buffer.copy( // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. @@ -42,11 +46,17 @@ Tensor view( return convert(v_output); } +inline Tensor view( + const Tensor& self_arg, + const IntArrayRef shape) { + return view_internal(self_arg, shape, "aten::view"); +} + Tensor _reshape_alias( const Tensor& self_arg, const IntArrayRef shape, const IntArrayRef strides) { - return view(self_arg, shape); + return view_internal(self_arg, shape, "aten::_reshape_alias"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Slice.cpp b/aten/src/ATen/native/vulkan/ops/Slice.cpp index 36f3a713b468..1d454c7ff709 100644 --- a/aten/src/ATen/native/vulkan/ops/Slice.cpp +++ b/aten/src/ATen/native/vulkan/ops/Slice.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -16,65 +17,67 @@ Tensor slice_4d(const Tensor& input, const int64_t dim, const int64_t start, con api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::slice.Tensor (slice_4d)"); - const Tensor self = input.is_vulkan() ? 
input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Compute); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Compute, - vTensor::Access::Write); - - const struct Block final { - uvec3 size; // output texture size - uint32_t fill_0; // dummy - uvec3 isize; // input texture size - uint32_t fill_1; // dummy - uvec4 tensor_size; // output tensor size - uvec4 itensor_size; // input tensor size - uvec4 args; // input arguments (dim, start, end, step) - } block { - v_output.extents(), - 0u, - v_self.extents(), - 0u, - out_tsize, - in_tsize, - { safe_downcast(dim), - safe_downcast(start), - safe_downcast(end), - safe_downcast(step) }, - }; - - context->dispatch( + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Compute); + auto dst_image = v_output.image( command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(slice_4d), - // build up shader operations from the output texture point of view - // to avoid the nondeterministic order of GPU shader operations between texels + vTensor::Stage::Compute, + vTensor::Access::Write); + + const struct Block final { + uvec3 size; // output texture size + uint32_t fill_0; // dummy + uvec3 isize; // input texture size + uint32_t fill_1; // dummy + uvec4 tensor_size; // output tensor size + uvec4 itensor_size; // input tensor size + uvec4 args; // input arguments (dim, start, end, step) + } block { v_output.extents(), - context->gpu().adapter->local_work_group_size(), - // Write-only access bypasses synchronization but inserts appropriate - // barriers if necessary. - dst_image, - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - src_image, - // Object lifetime is managed by the resource pool. - // It is OK not to keep track of the handle. - context->resource().pool.uniform(block).object); - } - else { - TORCH_CHECK(false, "Not implemented!"); - } + 0u, + v_self.extents(), + 0u, + out_tsize, + in_tsize, + { safe_downcast(dim), + safe_downcast(start), + safe_downcast(end), + safe_downcast(step) }, + }; + context->dispatch( + command_buffer, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(slice_4d), + // build up shader operations from the output texture point of view + // to avoid the nondeterministic order of GPU shader operations between texels + v_output.extents(), + context->gpu().adapter->local_work_group_size(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + dst_image, + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + src_image, + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); + } + } command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); } @@ -83,56 +86,58 @@ Tensor slice_width(const Tensor& input, const int64_t start, const int64_t end, api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::slice.Tensor (slice_width)"); - const Tensor self = input.is_vulkan() ? input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); - uvec3 src_offset{}; - uvec3 dst_offset{}; + uvec3 src_offset{}; + uvec3 dst_offset{}; - if (step == 1) { - src_offset.data[0u] = start; - uvec3 copy_extents {safe_downcast(end - start), - v_self.extents().data[1u], - v_self.extents().data[2u]}; - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - copy_extents, - src_offset, - dst_offset); - } else { - uvec3 copy_extents {1u, - v_self.extents().data[1u], - v_self.extents().data[2u]}; - const auto x_max = v_self.extents().data[0u]; - for (int64_t x = start, x_new = 0; x < end; x += step, ++x_new) { - if (x >= x_max) { // out of range - continue; - } - src_offset.data[0u] = x; - dst_offset.data[0u] = x_new; + if (step == 1) { + src_offset.data[0u] = start; + uvec3 copy_extents {safe_downcast(end - start), + v_self.extents().data[1u], + v_self.extents().data[2u]}; api::helper::copy_texture_to_texture(command_buffer, src_image, dst_image, copy_extents, src_offset, dst_offset); + } else { + uvec3 copy_extents {1u, + v_self.extents().data[1u], + v_self.extents().data[2u]}; + const auto x_max = v_self.extents().data[0u]; + for (int64_t x = start, x_new = 0; x < end; x += step, ++x_new) { + if (x >= x_max) { // out of range + continue; + } + src_offset.data[0u] = x; + dst_offset.data[0u] = x_new; + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + copy_extents, + src_offset, + dst_offset); + } } } + else { + TORCH_CHECK(false, "Not implemented!"); + } } - else { - TORCH_CHECK(false, "Not implemented!"); - } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); } @@ -141,56 +146,58 @@ Tensor slice_height(const Tensor& input, const int64_t start, const int64_t end, api::Context* const context = api::context(); api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); + { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::slice.Tensor (slice_height)"); - const Tensor self = input.is_vulkan() ? 
input : input.vulkan(); - const vTensor& v_self = convert(self); - if C10_LIKELY(v_output.has_image() && v_self.has_image()) { - auto src_image = v_self.image( - command_buffer, - vTensor::Stage::Transfer); - auto dst_image = v_output.image( - command_buffer, - vTensor::Stage::Transfer, - vTensor::Access::Write); + const Tensor self = input.is_vulkan() ? input : input.vulkan(); + const vTensor& v_self = convert(self); + if C10_LIKELY(v_output.has_image() && v_self.has_image()) { + auto src_image = v_self.image( + command_buffer, + vTensor::Stage::Transfer); + auto dst_image = v_output.image( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write); - uvec3 src_offset{}; - uvec3 dst_offset{}; + uvec3 src_offset{}; + uvec3 dst_offset{}; - if (step == 1) { - src_offset.data[1u] = start; - uvec3 copy_extents {v_self.extents().data[0u], - safe_downcast(end - start), - v_self.extents().data[2u]}; - api::helper::copy_texture_to_texture(command_buffer, - src_image, - dst_image, - copy_extents, - src_offset, - dst_offset); - } else { - uvec3 copy_extents {v_self.extents().data[0u], - 1u, - v_self.extents().data[2u]}; - const auto y_max = v_self.extents().data[1u]; - for (int64_t y = start, y_new = 0; y < end; y += step, ++y_new) { - if (y >= y_max) { // out of range - continue; - } - src_offset.data[1u] = y; - dst_offset.data[1u] = y_new; + if (step == 1) { + src_offset.data[1u] = start; + uvec3 copy_extents {v_self.extents().data[0u], + safe_downcast(end - start), + v_self.extents().data[2u]}; api::helper::copy_texture_to_texture(command_buffer, src_image, dst_image, copy_extents, src_offset, dst_offset); + } else { + uvec3 copy_extents {v_self.extents().data[0u], + 1u, + v_self.extents().data[2u]}; + const auto y_max = v_self.extents().data[1u]; + for (int64_t y = start, y_new = 0; y < end; y += step, ++y_new) { + if (y >= y_max) { // out of range + continue; + } + src_offset.data[1u] = y; + dst_offset.data[1u] = y_new; + api::helper::copy_texture_to_texture(command_buffer, + src_image, + dst_image, + copy_extents, + src_offset, + dst_offset); + } } } + else { + TORCH_CHECK(false, "Not implemented!"); + } } - else { - TORCH_CHECK(false, "Not implemented!"); - } - command_pool.submit(context->gpu().queue, command_buffer); return convert(v_output); } diff --git a/aten/src/ATen/native/vulkan/ops/Softmax.cpp b/aten/src/ATen/native/vulkan/ops/Softmax.cpp index 9a3cce1fa224..f36a5fc54540 100644 --- a/aten/src/ATen/native/vulkan/ops/Softmax.cpp +++ b/aten/src/ATen/native/vulkan/ops/Softmax.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -14,7 +15,8 @@ Tensor softmax_internal( const at::Tensor& input_arg, const int64_t dim, const bool half_to_float, - const api::Shader::Descriptor& shader_descriptor) { + const api::Shader::Descriptor& shader_descriptor, + const std::string& op_name) { TORCH_CHECK( input_arg.dim() == 4, "Vulkan softmax expects 4-dimensional input!"); @@ -56,6 +58,8 @@ Tensor softmax_internal( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), op_name); + if C10_LIKELY(v_input.has_image()) { const struct Block final { uvec3 iextents; @@ -105,14 +109,14 @@ Tensor softmax( const at::Tensor& input_arg, const int64_t dim, const bool half_to_float) { - return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(softmax)); + return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(softmax), "_softmax"); } Tensor 
log_softmax( const at::Tensor& input_arg, const int64_t dim, const bool half_to_float) { - return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(log_softmax)); + return softmax_internal(input_arg, dim, half_to_float, VK_KERNEL(log_softmax), "_log_softmax"); } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp index 0a99a916f4f5..0de253447d2d 100644 --- a/aten/src/ATen/native/vulkan/ops/Tensor.cpp +++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -761,6 +762,8 @@ void vTensor::View::CMD::copy_buffer_to_image( return; } + api::OpProfiler profiler(command_buffer_, view_.context_->querypool(), "copy_buffer_to_image"); + barrier( state.transition({ // Staging @@ -819,6 +822,8 @@ void vTensor::View::CMD::copy_image_to_buffer( return; } + api::OpProfiler profiler(command_buffer_, view_.context_->querypool(), "copy_image_to_buffer"); + barrier( state.transition({ // Staging diff --git a/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp b/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp index d459e5d9d74a..0c12e930f05c 100644 --- a/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp +++ b/aten/src/ATen/native/vulkan/ops/TransposeConvolution2d.cpp @@ -1,8 +1,9 @@ -#include #include #include -#include +#include #include +#include +#include #include namespace at { @@ -86,7 +87,7 @@ vTensor pack_weights(const Tensor& weight_arg) { } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const Tensor weight = at::permute(weight_arg, {1, 0, 2, 3}).contiguous(); @@ -105,7 +106,7 @@ vTensor pack_biases( } api::Context* const context = api::context(); - api::Command::Buffer& command_buffer = context->command().pool.stream(); + api::Command::Buffer& command_buffer = context->command().pool.stream(); // Don't collect the timestamp since the command buffer doesn't record anything const int64_t src_w = weight.size(Layout::TransposedFilter::output); const int64_t packed_w = div_up(src_w, INT64_C(4)); @@ -353,6 +354,8 @@ void TransposeConv2dOpContext::conv2d_transpose_sliding_window( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "prepacked::conv2d_transpose_clamp_run (conv2d_transpose_sliding_window)"); + const struct Block final { uvec3 extents; int32_t ic4; diff --git a/aten/src/ATen/native/vulkan/ops/Upsample.cpp b/aten/src/ATen/native/vulkan/ops/Upsample.cpp index e6aa594ec6eb..20516bb387a0 100644 --- a/aten/src/ATen/native/vulkan/ops/Upsample.cpp +++ b/aten/src/ATen/native/vulkan/ops/Upsample.cpp @@ -1,5 +1,6 @@ -#include #include +#include +#include #include namespace at { @@ -39,6 +40,8 @@ Tensor upsample_nearest2d( api::Command::Pool& command_pool = context->command().pool; api::Command::Buffer& command_buffer = command_pool.stream(); { + api::OpProfiler profiler(command_buffer, context->querypool(), "aten::upsample_nearest2d"); + if C10_LIKELY(v_input.has_image()) { const struct Block final { uvec3 extents; diff --git a/aten/src/ATen/native/xnnpack/Activation.cpp b/aten/src/ATen/native/xnnpack/Activation.cpp index 33215771fe55..5ccf4aad40e7 100644 --- 
a/aten/src/ATen/native/xnnpack/Activation.cpp +++ b/aten/src/ATen/native/xnnpack/Activation.cpp @@ -10,7 +10,7 @@ namespace xnnpack { bool use_hardswish( const Tensor& input) { - return xnnpack::internal::available() && + return xnnpack::available() && (1 <= input.ndimension()) && (input.device().is_cpu()) && (kFloat == input.scalar_type()) && diff --git a/aten/src/ATen/native/xnnpack/AveragePooling.cpp b/aten/src/ATen/native/xnnpack/AveragePooling.cpp index 4379741e6a05..7359836bb953 100644 --- a/aten/src/ATen/native/xnnpack/AveragePooling.cpp +++ b/aten/src/ATen/native/xnnpack/AveragePooling.cpp @@ -10,7 +10,7 @@ namespace xnnpack { bool use_global_average_pool( const Tensor& input) { - return xnnpack::internal::available() && + return xnnpack::available() && (1 <= input.ndimension()) && (input.device().is_cpu()) && (kFloat == input.scalar_type()) && diff --git a/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp b/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp index a17d7bb2daac..34cab01d0507 100644 --- a/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp +++ b/aten/src/ATen/native/xnnpack/ChannelShuffle.cpp @@ -17,7 +17,7 @@ bool use_channel_shuffle( // and all dimensions must be positive. // * The number of groups must be larger than 1 and // the number of channels must be divisible by the number of groups. - return xnnpack::internal::available() && + return xnnpack::available() && // Input (4 == input.dim()) && (input.device().is_cpu()) && diff --git a/aten/src/ATen/native/xnnpack/Common.h b/aten/src/ATen/native/xnnpack/Common.h index 5a0b68baffe7..b000ffada157 100644 --- a/aten/src/ATen/native/xnnpack/Common.h +++ b/aten/src/ATen/native/xnnpack/Common.h @@ -67,6 +67,9 @@ struct ContextConv2D final { static constexpr float kMax = std::numeric_limits::infinity(); }; + +bool available(); + namespace internal { struct Layout final { @@ -121,9 +124,6 @@ struct Layout final { static constexpr size_t width = 1u; }; }; - -bool available(); - } // namespace internal } // namespace xnnpack } // namespace native diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp index 8278bfa19d9d..278e35280c40 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.cpp +++ b/aten/src/ATen/native/xnnpack/Convolution.cpp @@ -27,7 +27,7 @@ namespace { // TODO: Decouple and improve error handling and messages. bool available( const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, @@ -36,7 +36,7 @@ bool available( const float output_min, const float output_max) { // XNNPACK - return xnnpack::internal::available() && + return xnnpack::available() && // Weight (4 == weight.ndimension()) && (weight.size(Layout::Filter::height) > 0) && @@ -189,7 +189,7 @@ ContextConv2D create( TORCH_CHECK( available( weight_nhwc, - (bias.has_value() && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt, + (bias.has_value() && bias->defined()) ? at::OptionalIntArrayRef(bias->sizes()) : c10::nullopt, padding_expanded, stride_expanded, dilation_expanded, @@ -433,7 +433,7 @@ unpack_prepacked_sizes_conv2d(const IValue& ivalue) { const auto& bias = std::get<1>(tuple); return IValue(std::make_tuple( std::get<0>(tuple).sizes(), - (bias && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt, + (bias && bias->defined()) ? 
at::OptionalIntArrayRef(bias->sizes()) : c10::nullopt, std::get<2>(tuple), std::get<3>(tuple), std::get<4>(tuple), @@ -452,7 +452,7 @@ Tensor conv2d_transpose_clamp_run( bool use_convolution2d( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, diff --git a/aten/src/ATen/native/xnnpack/Engine.h b/aten/src/ATen/native/xnnpack/Engine.h index 71ed262297b3..9d5c0e4594ac 100644 --- a/aten/src/ATen/native/xnnpack/Engine.h +++ b/aten/src/ATen/native/xnnpack/Engine.h @@ -13,7 +13,7 @@ namespace xnnpack { bool use_convolution2d( const Tensor& input, const Tensor& weight, - const c10::optional bias_sizes_opt, + const at::OptionalIntArrayRef bias_sizes_opt, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, diff --git a/aten/src/ATen/native/xnnpack/Init.cpp b/aten/src/ATen/native/xnnpack/Init.cpp index 8f69ec02c2ca..e7365bea2a61 100644 --- a/aten/src/ATen/native/xnnpack/Init.cpp +++ b/aten/src/ATen/native/xnnpack/Init.cpp @@ -49,13 +49,13 @@ bool C10_UNUSED deinitialize() { } } // namespace +} // namespace internal bool available() { // Add extra conditions here that should disable mobile CPU impl at runtime in its totality. return internal::initialize(); } -} // namespace internal } // namespace xnnpack } // namespace native } // namespace at diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp index 7911256b1f21..3f7ae681f955 100644 --- a/aten/src/ATen/native/xnnpack/Linear.cpp +++ b/aten/src/ATen/native/xnnpack/Linear.cpp @@ -21,7 +21,7 @@ bool available( const float output_min, const float output_max) { // XNNPACK - return xnnpack::internal::available() && + return xnnpack::available() && // Weight (2 == weight.ndimension()) && (weight.device().is_cpu()) && @@ -187,7 +187,7 @@ unpack_prepacked_sizes_linear(const IValue& ivalue) { const auto& bias = std::get<1>(tuple); return IValue(std::make_tuple( std::get<0>(tuple).sizes(), - (bias && bias->defined()) ? c10::optional(bias->sizes()) : c10::nullopt)); + (bias && bias->defined()) ? 
at::OptionalIntArrayRef(bias->sizes()) : c10::nullopt)); } } // namespace linear diff --git a/aten/src/ATen/native/xnnpack/MaxPooling.cpp b/aten/src/ATen/native/xnnpack/MaxPooling.cpp index 7c101f9117ac..871959080821 100644 --- a/aten/src/ATen/native/xnnpack/MaxPooling.cpp +++ b/aten/src/ATen/native/xnnpack/MaxPooling.cpp @@ -88,7 +88,7 @@ bool use_max_pool2d( const bool output_size_eq = (pt_outputHeight == xnnpack_outputHeight) && (pt_outputWidth == xnnpack_outputWidth); - return xnnpack::internal::available() && + return xnnpack::available() && // Input (4 == input.dim()) && (input.device().is_cpu()) && diff --git a/aten/src/ATen/native/xnnpack/Shim.cpp b/aten/src/ATen/native/xnnpack/Shim.cpp index 89fffa024aef..32ddfb4b8525 100644 --- a/aten/src/ATen/native/xnnpack/Shim.cpp +++ b/aten/src/ATen/native/xnnpack/Shim.cpp @@ -31,7 +31,7 @@ bool available() { bool use_convolution2d( const Tensor&, const Tensor&, - const c10::optional, + const at::OptionalIntArrayRef, const IntArrayRef, const IntArrayRef, const IntArrayRef, diff --git a/aten/src/ATen/nnapi/nnapi_model_loader.cpp b/aten/src/ATen/nnapi/nnapi_model_loader.cpp index 8553d974a8de..7966c0d17b19 100644 --- a/aten/src/ATen/nnapi/nnapi_model_loader.cpp +++ b/aten/src/ATen/nnapi/nnapi_model_loader.cpp @@ -97,9 +97,9 @@ int load_nnapi_model( size_t num_buffers, const void** buffer_ptrs, int32_t* buffer_sizes, - size_t num_memories, - ANeuralNetworksMemory** memories, - int32_t* memory_sizes, + size_t /*num_memories*/, + ANeuralNetworksMemory** /*memories*/, + int32_t* /*memory_sizes*/, int32_t* out_input_count, int32_t* out_output_count, size_t* out_bytes_consumed) { diff --git a/aten/src/ATen/nnapi/nnapi_wrapper.cpp b/aten/src/ATen/nnapi/nnapi_wrapper.cpp index aa81bf942488..90122df15ef8 100644 --- a/aten/src/ATen/nnapi/nnapi_wrapper.cpp +++ b/aten/src/ATen/nnapi/nnapi_wrapper.cpp @@ -336,8 +336,6 @@ int check_Execution_getOutputOperandDimensions(ANeuralNetworksExecution* executi void nnapi_wrapper_load(struct nnapi_wrapper** nnapi, struct nnapi_wrapper** check_nnapi) { #ifdef _WIN32 TORCH_CHECK(false, "Running NNAPI models is not supported on Windows."); -#elif __XROS__ - TORCH_CHECK(false, "Running NNAPI models is not supported on XROS."); #else if (!loaded) { // Clear error flag. 
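The xnnpack changes above switch the bias-size parameters from c10::optional<IntArrayRef> to at::OptionalIntArrayRef. The standalone sketch below shows how such a parameter is declared and how call sites construct it, mirroring the pattern used in Convolution.cpp; the helper bias_is_1d and the wrapper check_bias are hypothetical and not part of this patch.

#include <ATen/ATen.h>

// Hypothetical helper mirroring the updated xnnpack signatures: optional bias
// sizes are taken as at::OptionalIntArrayRef.
bool bias_is_1d(at::OptionalIntArrayRef bias_sizes_opt) {
  return bias_sizes_opt.has_value() && bias_sizes_opt->size() == 1;
}

// Call sites keep the same shape as before: wrap Tensor::sizes() when a
// defined bias is present, otherwise pass c10::nullopt.
void check_bias(const c10::optional<at::Tensor>& bias) {
  const bool ok = bias_is_1d(
      (bias.has_value() && bias->defined())
          ? at::OptionalIntArrayRef(bias->sizes())
          : c10::nullopt);
  TORCH_CHECK(!bias.has_value() || !bias->defined() || ok,
              "expected a 1-D bias");
}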
diff --git a/aten/src/ATen/ops/from_blob.h b/aten/src/ATen/ops/from_blob.h index 558ab57e900f..f7599e70ea05 100644 --- a/aten/src/ATen/ops/from_blob.h +++ b/aten/src/ATen/ops/from_blob.h @@ -26,7 +26,7 @@ class TORCH_API TensorMaker { public: using ContextDeleter = DeleterFnPtr; - TensorMaker& strides(optional value) noexcept { + TensorMaker& strides(OptionalIntArrayRef value) noexcept { strides_ = value; return *this; @@ -79,7 +79,7 @@ class TORCH_API TensorMaker { void* data_; IntArrayRef sizes_; - optional strides_{}; + OptionalIntArrayRef strides_{}; optional storage_offset_{}; std::function deleter_{}; std::unique_ptr ctx_{nullptr, detail::noopDelete}; diff --git a/aten/src/ATen/ops/tensor.h b/aten/src/ATen/ops/tensor.h index 3369eaf2502c..2f72b7ef0263 100644 --- a/aten/src/ATen/ops/tensor.h +++ b/aten/src/ATen/ops/tensor.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include namespace at { diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index aa5898194356..4a1bac8bc4c1 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -417,4 +417,23 @@ Tensor from_blob_quantized_per_channel_affine( return qtensor; } +Tensor UnknownQuantizer::quantize(const Tensor& tensor) { + TORCH_INTERNAL_ASSERT(false, "cannot call quantize on UnknownQuantizer"); +} +Tensor UnknownQuantizer::dequantize(const Tensor& qtensor) { + TORCH_INTERNAL_ASSERT(false, "cannot call dequantize on UnknownQuantizer"); +} +Tensor& UnknownQuantizer::dequantize_out(Tensor& rtensor, const Tensor& qtensor) { + TORCH_INTERNAL_ASSERT(false, "cannot call dequantize_out on UnknownQuantizer"); +} +QScheme UnknownQuantizer::qscheme() const { + TORCH_INTERNAL_ASSERT(false, "cannot call qscheme on UnknownQuantizer"); +} +bool UnknownQuantizer::equalTo(QuantizerPtr other) const{ + TORCH_INTERNAL_ASSERT(false, "cannot call equalTo on UnknownQuantizer"); +} +QuantizerPtr make_unknown_quantizer(ScalarType scalar_type) { + return c10::make_intrusive(scalar_type); +} + } // namespace at diff --git a/aten/src/ATen/quantized/Quantizer.h b/aten/src/ATen/quantized/Quantizer.h index 5d9c7111f19e..05bd39b71223 100644 --- a/aten/src/ATen/quantized/Quantizer.h +++ b/aten/src/ATen/quantized/Quantizer.h @@ -18,6 +18,23 @@ namespace at { +/** + * UnknownQuantizer is a placeholder quantizer for functions that implement + * quantization in a two step process. First a tensor is allocated but with + * unknown quantizer, and then the quantization kernel decides what the final + * quantizer will be. + */ +struct TORCH_API UnknownQuantizer : public Quantizer { + explicit UnknownQuantizer(ScalarType scalar_type) + : Quantizer(scalar_type) {} + + Tensor quantize(const Tensor& tensor) override; + Tensor dequantize(const Tensor& qtensor) override; + Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; + QScheme qscheme() const override; + bool equalTo(QuantizerPtr other) const override; +}; + /** * UniformQuantizer is the parent class for all uniform quantizers. 
* These quantization scheme will map float value uniformly to @@ -80,7 +97,7 @@ struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer { return zero_point_; } - bool equalTo(QuantizerPtr other) override { + bool equalTo(QuantizerPtr other) const override { if (!other.get() || other->qscheme() != kPerTensorAffine) { return false; } @@ -139,7 +156,7 @@ struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer { Tensor dequantize(const Tensor& qtensor) override; Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; - bool equalTo(QuantizerPtr other) override { + bool equalTo(QuantizerPtr other) const override { if (!other.get() || other->qscheme() != kPerChannelAffine) { return false; } @@ -190,7 +207,7 @@ struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffine Tensor dequantize(const Tensor& qtensor) override; Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; - bool equalTo(QuantizerPtr other) override { + bool equalTo(QuantizerPtr other) const override { if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) { return false; } @@ -222,6 +239,8 @@ TORCH_API QuantizerPtr make_per_channel_affine_quantizer( int64_t axis, ScalarType scalar_type); +TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type); + // Create a Quantized Tensor given arguments for normal Tensor and a quantizer TORCH_API Tensor new_qtensor( IntArrayRef sizes, diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index efe773f9f03b..8d9160135cc1 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -22,16 +23,6 @@ RecordFunctionHandle next_unique_record_function_handle() { return RecordFunctionHandle(unique_rf_id++); } -RecordFunctionTLS& rf_tls() { -#if defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static c10::ThreadLocal rf_tls_; - return rf_tls_.get(); -#else // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static thread_local RecordFunctionTLS rf_tls_; - return rf_tls_; -#endif // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) -} - std::atomic defaultNodeId(-1); // Enumerates thread ids logically; @@ -40,534 +31,632 @@ std::atomic defaultNodeId(-1); std::atomic next_thread_id_ {0}; thread_local uint64_t current_thread_id_ = 0; -// Low probability constant -static constexpr double kLowProb = 0.001; -struct CoinflipTLS { - int tries_left_; - std::mt19937 genGeo_; - std::mt19937 genZeroOne_; - std::geometric_distribution distGeo_; - std::uniform_real_distribution distZeroOne_; - CoinflipTLS(); +static constexpr size_t NumRecordScopes = + static_cast(RecordScope::NUM_SCOPES); + +RecordFunctionCallbacks::iterator findCallback( + RecordFunctionCallbacks& entries, + CallbackHandle handle) { + auto match_handle = [handle](const auto& el) { return el.handle_ == handle; }; + return std::find_if(entries.begin(), entries.end(), match_handle); +} + +c10::optional extractCallback( + RecordFunctionCallbacks& entries, + CallbackHandle handle) { + auto it = findCallback(entries, handle); + if (it == entries.end()) { + return c10::nullopt; + } + auto out = it->callback_; + entries.erase(it); + return out; +} + +// ============================================================================ +// == Callback manager ======================================================== +// ============================================================================ +// The high level idea of the 
RecordFunction callback machinery is based on the +// observation that the set of callbacks to be run changes infrequently. +// However, in order to reuse the active set we have to be able to invalidate +// when the active set changes. There are three events that can change which +// callbacks should be run: +// 1) The set of global callbacks changes +// 2) The set of local callbacks changes +// 3) A sampling callback is present, and should run on this iteration +// +// Global callbacks rely on thread local replication and an atomic version +// counter to maintain consistency. Whenever we change the set of active global +// callbacks (add / remove / enable / disable) the `GlobalCallbackManager` +// increments the version number and updates the global state while holding +// a mutex. The local callback manager snapshots the global callbacks and +// lazily rebuilds by comparing`GlobalCallbackManager::version()` (which is +// a simple atomic read) to the version of the last rebuild. In the +// overwhelmingly common case that they match it can reuse the existing +// snapshot. Otherwise it must call the much more expensive (and locked) +// `GlobalCallbackManager::getSnapshot()`. +// +// Handling changes to the thread local callbacks is trivial; functions that +// change them can simply force a cache rebuild for that thread after the +// changes are made. +// +// Sampling is by far the most challenging to handle efficiently. In general +// sampling callbacks are expected to have very low frequency. (e.g. 1 per +// million) Random number generation is rather expensive, so flipping a coin on +// every call for every sampling callback is wasteful. We can significantly +// reduce this cost by noting that the number of failures of a Bernoulli random +// variable is a geometric distribution, and thus we can sample the geometric +// distribution to determine the next time a callback should run. This reduces +// the cost from a random sample to a simple integer decrement. +// +// We can further note that Bernoulli samples are independent. (In contrast to, +// say, sampling without replacement.) This means that we can generate a +// counter for each scope that a given callback supports and then decrement the +// counter corresponding to the RecordScope being called. Conceptually, this is +// analogous to flipping different coins with the same probability. By sharding +// on RecordScope, we can consolidate the decrement to a single shared counter +// and update individual counters during rebuild. + +class GlobalCallbackManager { + public: + static GlobalCallbackManager& get(); // Singleton + + private: + GlobalCallbackManager() = default; + + public: + static constexpr size_t NoVersion = 0; + using snapshot_t = std::pair; + + // Locking? + size_t version() const; // No + snapshot_t getSnapshot() const; // Yes + CallbackHandle addCallback(RecordFunctionCallback cb); // Yes + void setCallbackEnabled(CallbackHandle handle, bool enabled); // Yes + void removeCallback(CallbackHandle handle); // Yes + void clearCallbacks(); // Yes + + private: + std::atomic version_{NoVersion + 1}; + RecordFunctionCallbacks global_callbacks_; // Source of truth. 
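The design note above replaces a per-call Bernoulli draw for sampled callbacks with a geometric countdown, since the number of failures before the first success of a Bernoulli variable is geometrically distributed. The snippet below is a standalone illustration of that equivalence only; it is not the RecordFunction code, and the probability and call count are made up.

#include <iostream>
#include <random>

int main() {
  constexpr double p = 1e-6;  // hypothetical sampling probability (~1 per million)
  std::mt19937 gen(std::random_device{}());

  // std::geometric_distribution yields the number of failures before the first
  // success, so add one to get "calls until the callback fires" (the same +1
  // appears in CacheEntry::sampleTries below).
  auto next_countdown = [&] {
    return std::geometric_distribution<int>(p)(gen) + 1;
  };

  int countdown = next_countdown();
  long fired = 0;
  constexpr long calls = 10'000'000;
  for (long call = 0; call < calls; ++call) {
    if (--countdown == 0) {          // cheap integer decrement on the hot path
      ++fired;                       // the sampled callback would run here
      countdown = next_countdown();  // re-arm for the next sampling event
    }
  }
  // Expected count is roughly p * calls (about 10 for these numbers).
  std::cout << "sampled callback fired " << fired << " times\n";
  return 0;
}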
+ mutable std::mutex update_mutex_; }; -CoinflipTLS::CoinflipTLS() - : tries_left_(0), genGeo_(std::random_device()()), genZeroOne_(std::random_device()()), distGeo_(kLowProb), distZeroOne_(0.0, 1.0) {} +class CacheEntry { + public: + CacheEntry() = default; + CacheEntry(std::mt19937* generator, RecordScope scope); -CoinflipTLS& coinflip_tls() { -#if defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static c10::ThreadLocal coinflip_tls_; - return coinflip_tls_.get(); -#else // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) - static thread_local CoinflipTLS coinflip_tls_; - return coinflip_tls_; -#endif // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) -} + // The caller is expected to check `GlobalCallbackManager::get().version()' + // and call CacheEntry::update() if necessary. + StepCallbacks getActiveCallbacks(); -int sample_geometric() { - return coinflip_tls().distGeo_(coinflip_tls().genGeo_); -} + // Full rebuild. (E.g. during registration) + void update(const std::vector& callbacks); -double sample_zero_one() { - return coinflip_tls().distZeroOne_(coinflip_tls().genZeroOne_); -} + private: + struct CallbackAndCounter { + RecordFunctionCallback callback_; + + // `-1` indicates that a callback is not sampled. + int tries_left_{-1}; + }; + + void rebuildActiveCallbacks(); + int sampleTries(double p) const; + + // std::mt19937 is quite large, so all scopes share the same generator. + std::mt19937* generator_{nullptr}; + + // Includes sampling callbacks which are waiting to run. + c10::SmallVector callbacks_; + RecordScope scope_; + + StepCallbacks active_callbacks_; + + // For managing sampling callbacks + int sampling_countdown_{0}; + int steps_for_this_update_{0}; +}; + +class LocalCallbackManager { + public: + static LocalCallbackManager& get(); // Singleton -struct GlobalRecordFunctionCallbacksEntry { - RecordFunctionCallback callback; private: - std::atomic enabled; + LocalCallbackManager(); + public: - CallbackHandle handle; - - GlobalRecordFunctionCallbacksEntry(RecordFunctionCallback&& cb, CallbackHandle h) - : callback(std::move(cb)), enabled(true), handle(h) {} - - // Copying is fine despite std::atomic not being supposed to - // have a copy/move constructor: adding & removing callbacks is - // already not thread-safe. 
- GlobalRecordFunctionCallbacksEntry( - const GlobalRecordFunctionCallbacksEntry& rhs) - : callback(rhs.callback), enabled(rhs.enabled.load()), handle(rhs.handle) {} - - GlobalRecordFunctionCallbacksEntry& operator=(const GlobalRecordFunctionCallbacksEntry& rhs) { - callback = rhs.callback; - enabled = rhs.enabled.load(); - handle = rhs.handle; - return *this; - } + const RecordFunctionTLS& getTLS() const; + StepCallbacks getActiveCallbacks(const RecordScope scope); - GlobalRecordFunctionCallbacksEntry( - GlobalRecordFunctionCallbacksEntry&& rhs) noexcept - : callback(std::move(rhs.callback)), enabled(rhs.enabled.load()), handle(rhs.handle) {} + void setTLS(const RecordFunctionTLS& tls); + void seed(uint32_t seed); + CallbackHandle addCallback(RecordFunctionCallback callback); + bool setCallbackEnabled(CallbackHandle handle, bool enabled); + bool removeCallback(CallbackHandle handle); + void clearCallbacks(); - GlobalRecordFunctionCallbacksEntry& operator=(GlobalRecordFunctionCallbacksEntry&& rhs) noexcept { - callback = std::move(rhs.callback); - enabled = rhs.enabled.load(); - handle = rhs.handle; - return *this; - } + private: + void rebuild_all(const GlobalCallbackManager::snapshot_t& global_snapshot); - // Returns true if the status changed, false otherwise. - bool disable() { - bool expected = true; - // NOTE: we use sequentially consistent access here and in - // enable() because updating further atomic flags depends on this - // operation. - return enabled.compare_exchange_strong(expected, false); - } + void rebuild_callback_scopes( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordFunctionCallback& callback); - // Returns true if the status changed, false otherwise. - bool enable() { - bool expected = false; - return enabled.compare_exchange_strong(expected, true); - } + void rebuild_scope( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordScope scope); - // Read the flag. Note that it is neither necessary nor correct to - // check this before calling enable() or disable(). - bool isEnabled() const { - return enabled.load(std::memory_order_relaxed); - } -}; + // Source of truth. + RecordFunctionTLS registered_callbacks_; -using GlobalRecordFunctionCallbacks = - c10::SmallVector; + // Runtime cache. 
+ size_t global_version_{GlobalCallbackManager::NoVersion}; + std::array active_callbacks_; + std::mt19937 generator_{}; +}; -} // namespace +// ============================================================================ +// == GlobalCallbackManager: Implementation =================================== +// ============================================================================ +GlobalCallbackManager& GlobalCallbackManager::get() { + static GlobalCallbackManager manager; + return manager; +} -const RecordFunctionTLS& get_record_function_tls_() { - return rf_tls(); +size_t GlobalCallbackManager::version() const { + return version_.load(std::memory_order_relaxed); } -void set_record_function_tls_(const RecordFunctionTLS& tls) { - rf_tls() = tls; +std::pair GlobalCallbackManager::getSnapshot() const { + std::lock_guard guard(update_mutex_); + return {version_.load(std::memory_order_seq_cst), global_callbacks_}; } -enum class ToggledCallbackResult { - NotFound, - FoundButNotToggled, - FoundAndToggled, -}; +CallbackHandle GlobalCallbackManager::addCallback(RecordFunctionCallback cb) { + std::lock_guard guard(update_mutex_); + ++version_; + auto handle = next_unique_callback_handle(); + global_callbacks_.emplace_back(std::move(cb), handle); + return handle; +} -template -static ToggledCallbackResult findAndToggleCallback( - RecordFunctionCallbacks& cbs, CallbackHandle handle, bool enabled) { - auto it = std::find_if( - cbs.begin(), cbs.end(), - [handle]( - const auto& el) { - return el.handle == handle; - }); - if (it != cbs.end()) { - bool changed = enabled ? it->enable() : it->disable(); - if (!changed) { - return ToggledCallbackResult::FoundButNotToggled; - } - if (it->callback.samplingProb() > kLowProb) { - // try to disable/restore pre-sampling of RecordFunction - if (enabled) { - at::bumpRecordAllFunctions(); - } else { - at::releaseRecordAllFunctions(); - } +void GlobalCallbackManager::setCallbackEnabled( + CallbackHandle handle, + bool enabled) { + std::lock_guard guard(update_mutex_); + auto it = findCallback(global_callbacks_, handle); + if (it != global_callbacks_.end()) { + if (it->enabled_ != enabled) { + ++version_; + it->enabled_ = enabled; } - return ToggledCallbackResult::FoundAndToggled; + } else { + LOG(WARNING) << "Requested callback is not found"; } - return ToggledCallbackResult::NotFound; -} - -template -static bool findAndRemoveCallback( - RecordFunctionCallbacks& cbs, CallbackHandle handle) { - auto it = std::find_if( - cbs.begin(), cbs.end(), - [handle]( - const auto& el) { - return el.handle == handle; - }); - if (it != cbs.end()) { - // We do not need to try to call releaseRecordAllFunctions here - // because findAndRemoveCallback is used only as a helper in - // removeCallback. removeCallback calls disableCallback, which - // calls findAndToggleCallback, which already will do a - // releaseRecordAllFunctions for us. 
- cbs.erase(it); - return true; +} + +void GlobalCallbackManager::removeCallback(CallbackHandle handle) { + std::lock_guard guard(update_mutex_); + if (extractCallback(global_callbacks_, handle).has_value()) { + ++version_; + } else { + LOG(WARNING) << "Requested callback is not found"; } - return false; } -class CallbackManager { - public: - CallbackManager() : num_enabled_global_callbacks_(0) {} +void GlobalCallbackManager::clearCallbacks() { + std::lock_guard guard(update_mutex_); + ++version_; + global_callbacks_.clear(); +} - CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb) { - if (cb.samplingProb() > kLowProb) { - // pre-sampling of RecordFunction with prob. kLowProb cannot be used - at::bumpRecordAllFunctions(); - } - // note: monotonically increasing callbacks_unique_id keeps - // sorted_tls_callbacks_ sorted - auto handle = next_unique_callback_handle(); - rf_tls().sorted_tls_callbacks_.emplace_back(std::move(cb), handle); - return handle; - } +// ============================================================================ +// == CacheEntry: Implementation ============================================== +// ============================================================================ +CacheEntry::CacheEntry(std::mt19937* generator, RecordScope scope) + : generator_{generator}, scope_{scope} { + rebuildActiveCallbacks(); +} - CallbackHandle addGlobalCallback(RecordFunctionCallback cb) { - if (cb.samplingProb() > kLowProb) { - // pre-sampling of RecordFunction with prob. kLowProb cannot be used - at::bumpRecordAllFunctions(); - } - auto handle = next_unique_callback_handle(); - // NOLINTNEXTLINE(performance-move-const-arg) - sorted_global_callbacks_.emplace_back(std::move(cb), handle); - num_enabled_global_callbacks_.fetch_add(1, std::memory_order_relaxed); - return handle; +void CacheEntry::update(const std::vector& callbacks) { + callbacks_.clear(); + callbacks_.reserve(callbacks.size()); + for (const auto& callback : callbacks) { + const auto p = callback.samplingProb(); + callbacks_.push_back({callback, p < 1.0 ? sampleTries(p) : -1}); } - void removeCallback(CallbackHandle handle) { - // This could be implemented more efficiently, but callback - // addition/removal is not intended to run in performance-critical - // paths (it's not thread-safe and should be done during - // initialization). - disableCallback(handle); - auto found = findAndRemoveCallback(rf_tls().sorted_tls_callbacks_, handle); - if (!found) { - found = findAndRemoveCallback(sorted_global_callbacks_, handle); - } - if (!found) { - LOG(WARNING) << "Requested callback is not found"; - } - } + rebuildActiveCallbacks(); +} + +StepCallbacks CacheEntry::getActiveCallbacks() { + // We rebuild the active set when `sampling_countdown_` reaches zero, so if it + // reaches zero at the start of this function something has gone wrong. + TORCH_INTERNAL_ASSERT(sampling_countdown_ > 0, sampling_countdown_); - void disableCallback(CallbackHandle handle) { - auto found = findAndToggleCallback( - rf_tls().sorted_tls_callbacks_, handle, false); - if (found == ToggledCallbackResult::NotFound) { - found = findAndToggleCallback( - sorted_global_callbacks_, handle, false); - if (found == ToggledCallbackResult::FoundAndToggled) { - const auto previousCount = num_enabled_global_callbacks_.fetch_sub(1, std::memory_order_relaxed); - TORCH_CHECK(previousCount > 0, previousCount); + if (C10_UNLIKELY(!(--sampling_countdown_))) { + // Use inferred steps to update sampled callbacks. 
+ for (auto& i : callbacks_) { + if (i.tries_left_ > 0) { + TORCH_INTERNAL_ASSERT(i.tries_left_ >= steps_for_this_update_); + i.tries_left_ -= steps_for_this_update_; } } - if (found == ToggledCallbackResult::NotFound) { - LOG(WARNING) << "Requested callback is not found"; - } - } - void reenableCallback(CallbackHandle handle) { - auto found = findAndToggleCallback( - rf_tls().sorted_tls_callbacks_, handle, true); - if (found == ToggledCallbackResult::NotFound) { - found = findAndToggleCallback( - sorted_global_callbacks_, handle, true); - if (found == ToggledCallbackResult::FoundAndToggled) { - num_enabled_global_callbacks_.fetch_add(1, std::memory_order_relaxed); + // Determine which callbacks to run and for how long. + rebuildActiveCallbacks(); + + // Resample any sampled callbacks that ran this call. + for (auto& i : callbacks_) { + if (!i.tries_left_) { + i.tries_left_ = sampleTries(i.callback_.samplingProb()); } } - if (found == ToggledCallbackResult::NotFound) { - LOG(WARNING) << "Requested callback is not found"; + } + + return active_callbacks_; +} + +void CacheEntry::rebuildActiveCallbacks() { + // We could store thread ID in CacheEntry, but rebuilds are infrequent and + // this saves us from having to plumb it through. + const auto thread_id = RecordFunction::currentThreadId(); + active_callbacks_ = StepCallbacks(thread_id, scope_); + + sampling_countdown_ = std::numeric_limits::max(); + for (const auto& i : callbacks_) { + if (i.tries_left_ < 0) { + // Callback is not sampled. Unconditionally push. + active_callbacks_.callbacks_.push_back( + {i.callback_.start(), i.callback_.end()}); + + } else if (i.tries_left_ == 0) { + // Callback is sampled and we have reached a sampling event. Push and + // set `sampling_countdown_` to one so we trigger a rebuild after one call. + active_callbacks_.callbacks_.push_back( + {i.callback_.start(), i.callback_.end()}); + sampling_countdown_ = 1; + + } else { + // Callback is sampled and we have not reached sampling event. Set + // `sampling_countdown_` to rebuild when it is time for this callback to + // execute. + sampling_countdown_ = std::min(sampling_countdown_, i.tries_left_); } + active_callbacks_.needs_inputs_ |= i.callback_.needsInputs(); + active_callbacks_.needs_outputs_ |= i.callback_.needsOutputs(); + active_callbacks_.needs_ids_ |= i.callback_.needsIds(); } + steps_for_this_update_ = sampling_countdown_; +} + +int CacheEntry::sampleTries(double p) const { + TORCH_INTERNAL_ASSERT(generator_ != nullptr); + TORCH_INTERNAL_ASSERT(p > 0.0 && p <= 1.0); - void clearGlobalCallbacks() { - sorted_global_callbacks_.clear(); - num_enabled_global_callbacks_ = 0; + // The geometric distribution returns the number of failures. We add one to + // also account for the call where we succeed. 
+ return std::geometric_distribution(p)(*generator_) + 1; +} + +// ============================================================================ +// == LocalCallbackManager: Implementation ==================================== +// ============================================================================ +LocalCallbackManager& LocalCallbackManager::get() { +#if defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) + static c10::ThreadLocal manager; + return manager.get(); +#else // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) + static thread_local LocalCallbackManager manager; + return manager; +#endif // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) +} + +LocalCallbackManager::LocalCallbackManager() { + for (auto i : c10::irange(NumRecordScopes)) { + active_callbacks_[i] = CacheEntry(&generator_, static_cast(i)); } + rebuild_all(GlobalCallbackManager::get().getSnapshot()); +} + +const RecordFunctionTLS& LocalCallbackManager::getTLS() const { + return registered_callbacks_; +} - void clearThreadLocalCallbacks() { - rf_tls().sorted_tls_callbacks_.clear(); +StepCallbacks LocalCallbackManager::getActiveCallbacks( + const RecordScope scope) { + const auto global_version = GlobalCallbackManager::get().version(); + if (C10_UNLIKELY(global_version != global_version_)) { + rebuild_all(GlobalCallbackManager::get().getSnapshot()); } + return active_callbacks_[static_cast(scope)].getActiveCallbacks(); +} + +void LocalCallbackManager::setTLS(const RecordFunctionTLS& tls) { + registered_callbacks_ = tls; + rebuild_all(GlobalCallbackManager::get().getSnapshot()); +} + +void LocalCallbackManager::seed(uint32_t seed) { + generator_.seed(seed); +} + +CallbackHandle LocalCallbackManager::addCallback( + RecordFunctionCallback callback) { + auto handle = next_unique_callback_handle(); + auto& callbacks = registered_callbacks_.sorted_tls_callbacks_; + callbacks.emplace_back(std::move(callback), handle); + rebuild_callback_scopes( + GlobalCallbackManager::get().getSnapshot(), callbacks.back().callback_); + return handle; +} - inline bool hasGlobalCallbacks() const { - return num_enabled_global_callbacks_.load(std::memory_order_relaxed) > 0; +bool LocalCallbackManager::setCallbackEnabled( + CallbackHandle handle, + bool enabled) { + auto it = findCallback(registered_callbacks_.sorted_tls_callbacks_, handle); + auto found = (it != registered_callbacks_.sorted_tls_callbacks_.end()); + if (found && it->enabled_ != enabled) { + it->enabled_ = enabled; + rebuild_callback_scopes( + GlobalCallbackManager::get().getSnapshot(), it->callback_); } + return found; +} - inline bool hasThreadLocalCallbacks() const { - return !rf_tls().sorted_tls_callbacks_.empty(); +bool LocalCallbackManager::removeCallback(CallbackHandle handle) { + auto& callbacks = registered_callbacks_.sorted_tls_callbacks_; + auto callback = extractCallback(callbacks, handle); + if (callback.has_value()) { + rebuild_callback_scopes( + GlobalCallbackManager::get().getSnapshot(), *callback); } + return callback.has_value(); +} - // We need this function to be inlined: init() is a hot path and - // callbackShouldRun is even hotter because it's called multiple - // times per init(). Profiling shows that the function prologue is - // taking up a significant fraction of the time. 
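As background for the `sampleTries` / `tries_left_` machinery introduced above: rather than flipping a coin on every call, the cache draws the number of calls until the next sampled hit once and then only decrements a counter on the hot path. Below is a minimal standalone sketch of that idea; it is not part of the patch and the function and variable names are illustrative.

#include <iostream>
#include <random>

// Number of calls until (and including) the next sampled call for probability p.
// std::geometric_distribution counts failures before the first success, so add
// one for the successful call itself (matching the comment in sampleTries above).
int sample_tries(std::mt19937& gen, double p) {
  return std::geometric_distribution<int>(p)(gen) + 1;
}

int main() {
  std::mt19937 gen(42);
  const double p = 0.01;  // e.g. a callback registered with samplingProb(0.01)
  int tries_left = sample_tries(gen, p);
  int fired = 0;
  for (int call = 0; call < 100000; ++call) {
    if (--tries_left == 0) {              // cheap decrement on the hot path
      ++fired;                            // the sampled callback would run here
      tries_left = sample_tries(gen, p);  // re-draw for the next hit
    }
  }
  std::cout << "fired " << fired << " of 100000 calls\n";  // roughly p * 100000
}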
- static bool C10_ALWAYS_INLINE callbackShouldRun( - const RecordFunctionCallback& cb, RecordScope scope, bool pre_sampled) { - TORCH_INTERNAL_ASSERT( - !pre_sampled || (cb.sampling_prob_ <= kLowProb), - "Incorrect usage of a pre-sampled RecordFunction with a high-frequency " - " or non-sampled callback"); - - // first check whether this callback is interested in - // the given scope type - if (!cb.checkScope(scope)) { - return false; - } +void LocalCallbackManager::clearCallbacks() { + registered_callbacks_.sorted_tls_callbacks_.clear(); + rebuild_all(GlobalCallbackManager::get().getSnapshot()); +} - // otherwise potentially do the sampling - double sampling_prob = cb.sampling_prob_; - constexpr double kLowProbInv = 1 / kLowProb; - if (pre_sampled) { - // adjust the sampling rate to account for kLowProb pre-sampling of - // the RecordFunction - sampling_prob *= kLowProbInv; - } +void LocalCallbackManager::rebuild_all(const GlobalCallbackManager::snapshot_t& global_snapshot) { + global_version_ = global_snapshot.first; + for (auto i : c10::irange(NumRecordScopes)) { + rebuild_scope(global_snapshot, static_cast(i)); + } +} - if (sampling_prob < 1.0) { - // model the low probability events as events happening - // with probability kLowProb followed by another sampling with - // probability (sampling_prob / kLowProb), then replace the coin - // flip for kLowProb with a thread local number of tries tries_left_ - // sampled from the geometric distribution. - if (sampling_prob < kLowProb) { - if (coinflip_tls().tries_left_ == 0) { - coinflip_tls().tries_left_ = sample_geometric(); - return (sample_zero_one() < sampling_prob * kLowProbInv); - } else { - --coinflip_tls().tries_left_; - return false; - } - } else { - return (sample_zero_one() < sampling_prob); +void LocalCallbackManager::rebuild_callback_scopes( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordFunctionCallback& callback) { + if (global_snapshot.first == global_version_) { + // Only rebuild scopes associated with `callback` + for (auto i : c10::irange(NumRecordScopes)) { + if (callback.checkScope(static_cast(i))) { + rebuild_scope(global_snapshot, static_cast(i)); } } + } else { + rebuild_all(global_snapshot); + } +} + +void LocalCallbackManager::rebuild_scope( + const GlobalCallbackManager::snapshot_t& global_snapshot, + const RecordScope scope) { + std::vector callbacks; + if (registered_callbacks_.tls_record_function_enabled_) { + auto populate_callbacks = + [&](const RecordFunctionCallbacks& raw_callbacks) { + for (const auto& i : raw_callbacks) { + if (i.enabled_ && i.callback_.checkScope(scope) && + i.callback_.samplingProb() > 0) { + callbacks.push_back(i.callback_); + } + } + }; + populate_callbacks(global_snapshot.second); + populate_callbacks(registered_callbacks_.sorted_tls_callbacks_); + } + active_callbacks_[static_cast(scope)].update(callbacks); +} + +// ============================================================================ +// == Callback execution ====================================================== +// ============================================================================ +void logTryRunCallbackError(const char* what, const char* name) { + LOG(WARNING) << "Exception in RecordFunction callback: " << what + << " , for the range " << name; +} + +template +C10_ALWAYS_INLINE bool tryRunCallback( + const StepCallbacks::StartEndPair callback_ptrs, + const RecordFunction& rf, + std::unique_ptr& ctx) { + try { + if (is_start && callback_ptrs.start_) { + ctx = callback_ptrs.start_(rf); + 
} + + if (!is_start && callback_ptrs.end_) { + callback_ptrs.end_(rf, ctx.get()); + } return true; + } catch (const std::exception& e) { + logTryRunCallbackError(e.what(), rf.name()); + return false; + } catch (...) { + logTryRunCallbackError("unknown", rf.name()); + return false; } +} - // init is called by RecordFunction in constructor to - // determine which thread local and global callbacks are going - // to be executed and whether any of them need inputs - inline void init(RecordFunction& rec_fn, RecordScope scope, bool pre_sampled) { - bool found_needs_inputs = false; - bool found_needs_outputs = false; - bool found_needs_ids = false; - - for (const auto& cb: rf_tls().sorted_tls_callbacks_) { - if (cb.isEnabled() && callbackShouldRun(cb.callback, scope, pre_sampled)) { - if (cb.callback.needsInputs()) { - found_needs_inputs = true; - } - if (cb.callback.needsOutputs()) { - found_needs_outputs = true; - } - if (cb.callback.needsIds()) { - found_needs_ids = true; - } - if (!rec_fn.state_) { - rec_fn.state_.emplace(scope); - } - rec_fn.state_->sorted_active_tls_handles_.push_back(cb.handle); - } - } +} // namespace - for (const auto& cb: sorted_global_callbacks_) { - if (cb.isEnabled() && callbackShouldRun(cb.callback, scope, pre_sampled)) { - if (cb.callback.needsInputs()) { - found_needs_inputs = true; - } - if (cb.callback.needsOutputs()) { - found_needs_outputs = true; - } - if (cb.callback.needsIds()) { - found_needs_ids = true; - } - if (!rec_fn.state_) { - rec_fn.state_.emplace(scope); - } - rec_fn.state_->sorted_active_global_handles_.push_back(cb.handle); - } - } +RecordFunction::RecordFunction(RecordScope scope) + : RecordFunction(getStepCallbacks(scope)) {} - if (!rec_fn.state_) { - return; - } +RecordFunction::RecordFunction(StepCallbacks&& step_callbacks) + : step_callbacks_{std::move(step_callbacks)} { + ctx_.resize(step_callbacks_.callbacks_.size()); + if (step_callbacks_.needs_ids_) { + setHandle(next_unique_record_function_handle()); + } +} - // Pre-allocate observer context list with nullptr. 
- rec_fn.state_->tls_ctx_.resize(rec_fn.state_->sorted_active_tls_handles_.size()); - rec_fn.state_->global_ctx_.resize(rec_fn.state_->sorted_active_global_handles_.size()); +void RecordFunction::runStartCallbacks() { + for (const auto i : c10::irange(step_callbacks_.callbacks_.size())) { + tryRunCallback( + step_callbacks_.callbacks_[i], *this, ctx_[i]); + } + called_start_callbacks_ = true; +} - rec_fn.state_->needs_inputs = found_needs_inputs; - rec_fn.state_->needs_outputs = found_needs_outputs; - if (found_needs_ids) { - rec_fn.setHandle(next_unique_record_function_handle()); +void RecordFunction::end() { + if (called_start_callbacks_) { + for (const auto i : c10::irange(step_callbacks_.callbacks_.size())) { + tryRunCallback( + step_callbacks_.callbacks_[i], *this, ctx_[i]); } + step_callbacks_.callbacks_.clear(); } +} - void runStartCallbacks(RecordFunction& rf) { - mergeRunCallbacks( - sorted_global_callbacks_, - rf.state_->sorted_active_global_handles_, - rf.state_->global_ctx_, - /* is_start */ true, - rf); - mergeRunCallbacks( - rf_tls().sorted_tls_callbacks_, - rf.state_->sorted_active_tls_handles_, - rf.state_->tls_ctx_, - /* is_start */ true, - rf); - rf.state_->called_start_callbacks_ = true; - } +const char* RecordFunction::name() const { + return c10::visit( + c10::overloaded( + [](const std::string& name) { return name.c_str(); }, + [](const schema_ref_t schema) { + return schema.get().name().c_str(); + }), + fn_); +} - void runEndCallbacks(RecordFunction& rf) { - mergeRunCallbacks( - sorted_global_callbacks_, - rf.state_->sorted_active_global_handles_, - rf.state_->global_ctx_, - /* is_start */ false, - rf); - mergeRunCallbacks( - rf_tls().sorted_tls_callbacks_, - rf.state_->sorted_active_tls_handles_, - rf.state_->tls_ctx_, - /* is_start */ false, - rf); - } +size_t RecordFunction::num_inputs() const { + return c10::visit( + c10::overloaded( + [&](const std::string&) { return inputs_.size(); }, + [](const schema_ref_t schema) { + return schema.get().arguments().size(); + }), + fn_); +} - // Global callbacks; must be sorted in increasing handle order - GlobalRecordFunctionCallbacks sorted_global_callbacks_; - std::atomic num_enabled_global_callbacks_; +size_t RecordFunction::num_outputs() const { + return c10::visit( + c10::overloaded( + [&](const std::string&) { return outputs_.size(); }, + [](const schema_ref_t schema) { + return schema.get().returns().size(); + }), + fn_); +} - private: - static void logTryRunCallbackError(const char* what, const RecordFunction& rf) { - LOG(WARNING) << "Exception in RecordFunction callback: " << what << " , for the range " << rf.name(); - } +c10::optional RecordFunction::operator_name() const { + return c10::visit( + c10::overloaded( + [&](const std::string&) -> c10::optional { + return c10::nullopt; + }, + [](const schema_ref_t schema) -> c10::optional { + return schema.get().operator_name(); + }), + fn_); +} - C10_ALWAYS_INLINE static bool tryRunCallback( - const RecordFunctionCallback& rfcb, - RecordFunction& rf, - std::unique_ptr& ctx, - bool is_start) { - try { - if (is_start) { - ctx = rfcb.start() ? rfcb.start()(rf) : nullptr; - } - else { - if (rfcb.end()) { - rfcb.end()(rf, ctx.get()); - } - } - return true; - } catch (const std::exception &e) { - logTryRunCallbackError(e.what(), rf); - return false; - } catch (...) 
{ - logTryRunCallbackError("unknown", rf); - return false; - } - } +StepCallbacks getStepCallbacks(RecordScope scope) { + return LocalCallbackManager::get().getActiveCallbacks(scope); +} - template - static void mergeRunCallbacks( - const RecordFunctionCallbacks& sorted_callbacks, - const CallbackHandles& sorted_handles, - ObserverContextList& ctx_list, - bool is_start, - RecordFunction& rf) { - size_t num_executed = 0; - size_t idx_c = 0; - const auto sorted_handles_size = sorted_handles.size(); - const auto ctx_list_size = ctx_list.size(); - const auto sorted_callbacks_size = sorted_callbacks.size(); - for (size_t idx_h = 0; idx_h < sorted_handles_size && idx_h < ctx_list_size; ++idx_h) { - while (idx_c < sorted_callbacks_size && - sorted_callbacks[idx_c].handle < sorted_handles[idx_h]) { - ++idx_c; - } - if (idx_c >= sorted_callbacks_size) { - break; - } - if (sorted_callbacks[idx_c].handle == sorted_handles[idx_h]) { - tryRunCallback(sorted_callbacks[idx_c].callback, rf, ctx_list[idx_h], is_start); - ++num_executed; - } - } +const RecordFunctionTLS& get_record_function_tls_() { + return LocalCallbackManager::get().getTLS(); +} - if (num_executed != sorted_handles.size()) { - C10_LOG_EVERY_MS(WARNING, 1000) - << "Could not match some of the start callbacks with the corresponding end callbacks, " - << "callbacks changed during RecordFunction lifetime; you might be trying to profile " - << "the code after profiler is finished"; - } - } -}; +void set_record_function_tls_(const RecordFunctionTLS& tls) { + LocalCallbackManager::get().setTLS(tls); +} namespace { - // Keeping this static manager local. - CallbackManager& manager() { - static CallbackManager _manager; - return _manager; - } +bool anyEnabled(const RecordFunctionCallbacks& callbacks) { + return std::any_of(callbacks.begin(), callbacks.end(), [](const auto& cb) { + return cb.enabled_; + }); +} } // namespace bool hasCallbacks() { - auto& m = manager(); - return m.hasGlobalCallbacks() || m.hasThreadLocalCallbacks(); + return hasThreadLocalCallbacks() || hasGlobalCallbacks(); } bool hasGlobalCallbacks() { - return manager().hasGlobalCallbacks(); + return anyEnabled(GlobalCallbackManager::get().getSnapshot().second); } bool hasThreadLocalCallbacks() { - return manager().hasThreadLocalCallbacks(); + return anyEnabled(get_record_function_tls_().sorted_tls_callbacks_); } CallbackHandle addThreadLocalCallback( RecordFunctionCallback cb) { // NOLINTNEXTLINE(performance-move-const-arg) - return manager().addThreadLocalCallback(std::move(cb)); + return LocalCallbackManager::get().addCallback(std::move(cb)); } CallbackHandle addGlobalCallback( RecordFunctionCallback cb) { // NOLINTNEXTLINE(performance-move-const-arg) - return manager().addGlobalCallback(std::move(cb)); + return GlobalCallbackManager::get().addCallback(std::move(cb)); } void removeCallback(CallbackHandle handle) { - manager().removeCallback(handle); + if (!LocalCallbackManager::get().removeCallback(handle)) { + GlobalCallbackManager::get().removeCallback(handle); + } } void disableCallback(CallbackHandle handle) { - manager().disableCallback(handle); + if (!LocalCallbackManager::get().setCallbackEnabled(handle, false)) { + GlobalCallbackManager::get().setCallbackEnabled(handle, false); + } } void reenableCallback(CallbackHandle handle) { - manager().reenableCallback(handle); + if (!LocalCallbackManager::get().setCallbackEnabled(handle, true)) { + GlobalCallbackManager::get().setCallbackEnabled(handle, true); + } } void clearGlobalCallbacks() { - 
manager().clearGlobalCallbacks(); + GlobalCallbackManager::get().clearCallbacks(); } void clearThreadLocalCallbacks() { - manager().clearThreadLocalCallbacks(); + LocalCallbackManager::get().clearCallbacks(); } void clearCallbacks() { - auto& m = manager(); - m.clearGlobalCallbacks(); - m.clearThreadLocalCallbacks(); + clearGlobalCallbacks(); + clearThreadLocalCallbacks(); } bool isRecordFunctionEnabled() { - return rf_tls().tls_record_function_enabled_; + return LocalCallbackManager::get().getTLS().tls_record_function_enabled_; } void enableRecordFunction(bool enable) { - rf_tls().tls_record_function_enabled_ = enable; + auto tls = LocalCallbackManager::get().getTLS(); + if (tls.tls_record_function_enabled_ != enable) { + tls.tls_record_function_enabled_ = enable; + LocalCallbackManager::get().setTLS(tls); + } } -RecordFunction::RecordFunction(RecordScope scope, bool pre_sampled) { - auto* rf_tls_ptr = &rf_tls(); - if (rf_tls_ptr->tls_record_function_enabled_) { - auto& m = manager(); - if (!m.sorted_global_callbacks_.empty() || !rf_tls_ptr->sorted_tls_callbacks_.empty()) { - m.init(*this, scope, pre_sampled); - } - } +void set_record_function_seed_for_testing(uint32_t seed) { + LocalCallbackManager::get().seed(seed); } /* static */ @@ -580,45 +669,29 @@ uint64_t RecordFunction::currentThreadId() { } void RecordFunction::before(const char* name, int64_t sequence_nr) { - if (!isActive()) { - return; - } - state_->op_input_size = state_->inputs_.size(); - state_->name_ = name; - state_->sequence_nr_ = sequence_nr; - state_->thread_id_ = currentThreadId(); - state_->operator_name_.reset(); + fn_ = name; + sequence_nr_ = sequence_nr; - manager().runStartCallbacks(*this); + runStartCallbacks(); + invalidateInputs(); } void RecordFunction::before(std::string name, int64_t sequence_nr) { - if (!isActive()) { - return; - } - state_->op_input_size = state_->inputs_.size(); - state_->name_ = std::move(name); - state_->sequence_nr_ = sequence_nr; - state_->thread_id_ = currentThreadId(); - state_->operator_name_.reset(); + fn_ = std::move(name); + sequence_nr_ = sequence_nr; - manager().runStartCallbacks(*this); + runStartCallbacks(); + invalidateInputs(); } void RecordFunction::before( - c10::OperatorHandle const& op, + RecordFunction::schema_ref_t schema, int64_t sequence_nr) { - if (!isActive()) { - return; - } - state_->sequence_nr_ = sequence_nr; - state_->thread_id_ = currentThreadId(); - state_->operator_name_ = op.operator_name(); - state_->op_input_size = op.schema().arguments().size(); - state_->op_output_size = op.schema().returns().size(); - state_->name_ = op.schema().name(); + sequence_nr_ = sequence_nr; + fn_ = schema; - manager().runStartCallbacks(*this); + runStartCallbacks(); + invalidateInputs(); } /* static */ void RecordFunction::setDefaultNodeId(int64_t newDefaultNodeId) { @@ -634,69 +707,24 @@ RecordFunction::~RecordFunction() { end(); } -void RecordFunction::end() { - if (isActive() && state_->called_start_callbacks_) { - manager().runEndCallbacks(*this); - state_.reset(); - } -} - void RecordFunction::_setAsync() { - if (isActive()) { - state_->is_async_ = true; - } + is_async_ = true; } bool RecordFunction::isAsync() const { - if (isActive()) { - return state_->is_async_; - } - return false; -} - -// RecordFunction pre-sampling -namespace { -// Whether to try to create RecordFunction on each call (>0) or -// use pre-sampling (=0) -std::atomic global_record_all_functions_ {0}; -} - -void bumpRecordAllFunctions() { - global_record_all_functions_.fetch_add(1, 
std::memory_order_relaxed); + return is_async_; } -void releaseRecordAllFunctions() { - TORCH_CHECK(global_record_all_functions_.fetch_sub(1, std::memory_order_relaxed) > 0); -} - -bool checkRecordAllFunctions() { - return (global_record_all_functions_.load(std::memory_order_relaxed) > 0); -} - -bool shouldRunRecordFunction(bool* pre_sampled) { - auto* rf_tls_ptr = &rf_tls(); - if (rf_tls_ptr->sorted_tls_callbacks_.empty() && !manager().hasGlobalCallbacks()) { - *pre_sampled = false; - return false; - } - if (global_record_all_functions_.load(std::memory_order_relaxed) > 0) { - *pre_sampled = false; - return true; - } - if (!rf_tls_ptr->tls_record_function_enabled_) { - *pre_sampled = false; - return false; +void RecordFunction::_setStaticRuntimeOutVariant() { + if (isActive()) { + is_static_runtime_out_variant_ = true; } +} - *pre_sampled = true; - auto* coinflip_tls_ptr = &coinflip_tls(); - if (coinflip_tls_ptr->tries_left_ == 0) { - coinflip_tls_ptr->tries_left_ = sample_geometric(); - return true; - } else { - --coinflip_tls_ptr->tries_left_; - return false; +bool RecordFunction::isStaticRuntimeOutVariant() const { + if (isActive()) { + return is_static_runtime_out_variant_; } + return false; } - } // namespace at diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index f6688726fcaf..af594f47e789 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -99,80 +100,250 @@ struct ObserverContext { typedef c10::SmallVector CallbackHandles; typedef c10::SmallVector, kSoftLimitCallbacks> ObserverContextList; typedef uint64_t RecordFunctionHandle; +struct RecordFunction; + +// +// PyTorch callbacks/observers API: +// + +/** + * RecordFunctionCallback represents a pair of callbacks to be used with + * RecordFunction, members: + * start, end - the callbacks to run when entering and exiting the scope; + * optionally, the start callback may return an ObserverContext which will + * be passed to the end callback, use appropriate constructor accordingly. + * needs_inputs - whether the callbacks need the inputs passed from the observed + * function/range; NOTE: passing the inputs incurs an additional overhead; + * sampling_probability - if not 1.0, then the callback is probabilistically sampled + * to run; NOTE: start and end callbacks always run as a pair and are sampled + * together; + * scopes - types of scopes to execute the callbacks on (see RecordScope); + * passing empty set means the callbacks will be executed for all possible + * scope types + * should_run - optional function that returns whether this callback should run; + * overwrites the effect of setting sampling_probability + */ +class TORCH_API RecordFunctionCallback { + public: + using StartCallback = std::unique_ptr(*)(const RecordFunction&); + using EndCallback = void (*)(const RecordFunction&, ObserverContext*); + + // This interface supports observers that require passing an ObserverContext + // between start and end callbacks. 
+ explicit RecordFunctionCallback( + StartCallback start, + EndCallback end = nullptr) : + start_(start), + end_(end) { + scopes_.fill(true); + } + + RecordFunctionCallback& needsInputs(bool needs_inputs) { + needs_inputs_ = needs_inputs; + return *this; + } + + RecordFunctionCallback& needsOutputs(bool needs_outputs) { + needs_outputs_ = needs_outputs; + return *this; + } + + RecordFunctionCallback& needsIds(bool needs_ids) { + needs_ids_ = needs_ids; + return *this; + } + + RecordFunctionCallback& samplingProb(double sampling_prob) { + TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob <= 1.0, + "Invalid sampling probability"); + sampling_prob_ = sampling_prob; + return *this; + } + + RecordFunctionCallback& scopes( + const std::unordered_set>& scopes) { + if (!scopes.empty()) { + scopes_.fill(false); + for (auto sc : scopes) { + scopes_[static_cast(sc)] = true; + } + } else { + scopes_.fill(true); + } + return *this; + } + + bool needsInputs() const { + return needs_inputs_; + } + + bool needsOutputs() const { + return needs_outputs_; + } + + bool needsIds() const { + return needs_ids_; + } + + double samplingProb() const { + return sampling_prob_; + } + + bool checkScope(RecordScope sc) const { + return scopes_[(size_t)sc]; + } + + StartCallback start() const { + return start_; + } + + EndCallback end() const { + return end_; + } + + private: + StartCallback start_; + EndCallback end_; + double sampling_prob_ = 1.0; + std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; + bool needs_inputs_ = false; + bool needs_outputs_ = false; + bool needs_ids_ = false; +}; + +// Notes: +// - two types of callbacks are provided: thread local and global +// - thread local callbacks are added/removed only for the given thread +// and are stored locally for each thread and separately from the list +// of the global callbacks +// - global callbacks are stored in a single per process list and are +// invoked by every RecordFunction, in addition to the thread local +// callbacks specific to the given thread +// - we allow the added callbacks to be sampled, by specifying a sampling +// probability for each callback pair, if the start callback is +// not picked to run, the corresponding end callback won't be called +// - a typical use case for the global callbacks is passive monitoring +// in the background (e.g. fleet-wide monitoring), without focusing on +// the specific piece of code +// - in contrast, thread local callbacks are enabled locally, on demand, +// for the specific piece of code (range) and are not sampled +// - a typical use case for thread local callbacks is profiler and code +// execution tracer +// - note, thread local callbacks are automatically propagated with +// ThreadLocalState across JIT continuations and async tasks (at::launch) + +typedef uint64_t CallbackHandle; + +// It is unnecessary to use atomic operations for enabling +// thread-local function callbacks. Moreover, it prevents saving to +// ThreadLocalState because std::atomic is non-copyable. +struct RecordFunctionCallbacksEntry { + RecordFunctionCallbacksEntry(RecordFunctionCallback&& cb, CallbackHandle h) + : callback_(cb), handle_(h) {} + + RecordFunctionCallback callback_; + bool enabled_{true}; + CallbackHandle handle_; +}; + +// Holds pairs (callbacks, unique_id) +using RecordFunctionCallbacks = std::vector; + +// Generated by the callback managers to determine which functions to run. 
+struct StepCallbacks { + StepCallbacks() = default; + StepCallbacks(uint64_t thread_id, RecordScope scope) + : thread_id_{thread_id}, scope_{scope} {} + + bool empty() const { + return callbacks_.empty(); + } + + struct StartEndPair { + RecordFunctionCallback::StartCallback start_; + RecordFunctionCallback::EndCallback end_; + }; + + using StartEndPairs = c10::SmallVector; + + StartEndPairs callbacks_; + uint64_t thread_id_{0}; + RecordScope scope_{RecordScope::FUNCTION}; + bool needs_inputs_{false}; + bool needs_outputs_{false}; + bool needs_ids_{false}; +}; struct TORCH_API RecordFunction { // Default constructor is used with before function called afterwards: // scope - record scope that this function tracks // pre_sampled - whether this RecordFunction was already pre-sampled with // kLowProb probability - RecordFunction( - RecordScope scope = RecordScope::FUNCTION, - bool pre_sampled = false); + explicit RecordFunction(RecordScope scope = RecordScope::FUNCTION); + explicit RecordFunction(StepCallbacks&& step_callbacks); template void before( F fn, - const std::vector* args, + c10::ArrayRef args, int64_t current_sequence_nr = -1) { if (!isActive()) { return; } - state_->inputs_ = *args; + inputs_ = args; +#ifndef NDEBUG + inputs_valid_ = true; +#endif before(fn, current_sequence_nr); } + template + void before( + F fn, + const std::vector* args, + int64_t current_sequence_nr = -1) { + before(std::move(fn), c10::ArrayRef(args->data(), args->size()), current_sequence_nr); + } + // Destructor calls end callbacks virtual ~RecordFunction(); RecordFunction(const RecordFunction&) = delete; RecordFunction& operator=(const RecordFunction&) = delete; - const char* name() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called name() on inactive RecordFunction"); - return state_->name_.c_str(); - } + const char* name() const; int64_t seqNr() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called seqNr() on inactive RecordFunction"); - return state_->sequence_nr_; + return sequence_nr_; } - const std::vector& inputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called inputs() on inactive RecordFunction"); - return state_->inputs_; + c10::ArrayRef inputs() const { +#ifndef NDEBUG + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inputs_valid_, "Called inputs() outside RecordFunction start callback"); +#endif + return inputs_; } const std::vector& outputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called outputs() on inactive RecordFunction"); - return state_->outputs_; + return outputs_; } void setOutputs(std::vector&& outputs) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setOutputs() on inactive RecordFunction"); - state_->outputs_ = std::move(outputs); + outputs_ = std::move(outputs); } void setOutputs(c10::ArrayRef outputs) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setOutputs() on inactive RecordFunction"); - state_->outputs_ = outputs.vec(); - } - - size_t num_inputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called num_inputs() on inactive RecordFunction"); - return state_->op_input_size; + outputs_ = outputs.vec(); } - size_t num_outputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called num_outputs() on inactive RecordFunction"); - return state_->op_output_size; - } + size_t num_inputs() const; + size_t num_outputs() const; // Retrieves the thread_id that this RecordFunction ran start callbacks with. 
// Useful for writing thread safe end callbacks that may be potentially // executed in a different thread (async ops) uint64_t threadId() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called threadId() on inactive RecordFunction"); - return state_->thread_id_; + return step_callbacks_.thread_id_; } // For backward functions - thread id of the corresponding forward function, @@ -180,18 +351,15 @@ struct TORCH_API RecordFunction { // used alongside with sequence number to correlate backward functions with // the forward ones uint64_t forwardThreadId() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called forwardThreadId() on inactive RecordFunction"); - return state_->fwd_thread_id_; + return fwd_thread_id_; } void setForwardThreadId(uint64_t thread_id) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setForwardThreadId() on inactive RecordFunction"); - state_->fwd_thread_id_ = thread_id; + fwd_thread_id_ = thread_id; } RecordScope scope() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called scope() on inactive RecordFunction"); - return state_->scope_; + return step_callbacks_.scope_; } // Returns logical thread_id for the current thread @@ -202,39 +370,16 @@ struct TORCH_API RecordFunction { // before functions initialize RecordFunction members and call // start callbacks + using schema_ref_t = std::reference_wrapper; void before(const char* name, int64_t sequence_nr = -1); void before(std::string name, int64_t sequence_nr = -1); - void before(c10::OperatorHandle const& op, int64_t sequence_nr = -1); + void before(schema_ref_t schema, int64_t sequence_nr = -1); // Sets node ID for distributed profiling static void setDefaultNodeId(int64_t defaultNodeId); // Gets node ID for distributed profiling static int64_t getDefaultNodeId(); - template - void before( - F fn, - c10::ArrayRef args, - int64_t current_sequence_nr = -1) { - if (!isActive()) { - return; - } - state_->inputs_ = args.vec(); - before(fn, current_sequence_nr); - } - - template - void before( - F fn, - std::vector&& args, - int64_t current_sequence_nr = -1) { - if (!isActive()) { - return; - } - state_->inputs_ = std::move(args); - before(fn, current_sequence_nr); - } - // Calls end callbacks. After end(), accessors will no longer provide useful results. void end(); @@ -244,238 +389,132 @@ struct TORCH_API RecordFunction { // Returns whether this RecordFunction corresponds to an async event or not. bool isAsync() const; + // Internal-only, used to denote out variant used for Static Runtime execution + void _setStaticRuntimeOutVariant(); + bool isStaticRuntimeOutVariant() const; + RecordFunctionHandle handle() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called handle() on inactive RecordFunction"); - return state_->handle_; + return handle_; } - c10::optional operator_name() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called operator_name() on inactive RecordFunction"); - return state_->operator_name_; - } + c10::optional operator_name() const; void setHandle(RecordFunctionHandle handle) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setHandle() on inactive RecordFunction"); - state_->handle_ = handle; + handle_ = handle; } // Whether this RecordFunction runs any callbacks.
bool isActive() const { - return state_.has_value(); + return !step_callbacks_.empty(); } bool needsInputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called needsInputs() on inactive RecordFunction"); - return state_->needs_inputs; + return step_callbacks_.needs_inputs_; } bool needsOutputs() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called needsOutputs() on inactive RecordFunction"); - return state_->needs_outputs; + return step_callbacks_.needs_outputs_; } int64_t debugHandle() const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called debugHandle() on inactive RecordFunction"); - return state_->debug_handle_; + return debug_handle_; } void setDebugHandle(int64_t debug_handle) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(state_, "Called setDebugHandle() on inactive RecordFunction"); - state_->debug_handle_ = debug_handle; + debug_handle_ = debug_handle; } - private: - - // Allows the modification of some internal states for callbacks. - friend class CallbackManager; - - struct State { - explicit State(RecordScope scope) : scope_(scope) {} - - // Whether any of the picked callbacks require inputs - bool needs_inputs = false; - - // Whether any of the picked callbacks require outputs - bool needs_outputs = false; - - // In cases when RecordFunction might be active but we chose not to - // use the observers (e.g. operator is not observed), this boolean - // flag is used to check whether the start callbacks were called - bool called_start_callbacks_ = false; - - // Whether the RecordFunction is pre-sampled - bool pre_sampled_ = false; + void invalidateInputs() { +#ifndef NDEBUG + inputs_valid_ = false; +#endif + } - // Used internally to keep track of thread local and global callbacks - // that were picked to run; must be sorted; - CallbackHandles sorted_active_tls_handles_; - CallbackHandles sorted_active_global_handles_; + private: + void runStartCallbacks(); - // Stores various ObserverContext objects with event metadata for thread local - // callbacks. - ObserverContextList tls_ctx_; + StepCallbacks step_callbacks_; - // Stores various ObserverContext objects with event metadata for global - // callbacks. - ObserverContextList global_ctx_; + // In cases when RecordFunction might be active but we chose not to + // use the observers (e.g. operator is not observed), this boolean + // flag is used to check whether the start callbacks were called + bool called_start_callbacks_ = false; - std::string name_; - int64_t sequence_nr_ = -1; - std::vector inputs_; - std::vector outputs_; +#ifndef NDEBUG + bool inputs_valid_ = false; +#endif - c10::optional operator_name_; - size_t op_input_size{0}; - size_t op_output_size{0}; + // Stores various ObserverContext objects with event metadata for callbacks. 
+ ObserverContextList ctx_; - // Kind of scope this RecordFunction is observing - const RecordScope scope_; + c10::variant fn_; - // The logical thread_id that this RecordFunction was created with - uint64_t thread_id_ = 0; + int64_t sequence_nr_ = -1; + c10::ArrayRef inputs_; + std::vector outputs_; - // For backward functions - thread id of the the forward function - uint64_t fwd_thread_id_ = 0; + // For backward functions - thread id of the forward function + uint64_t fwd_thread_id_ = 0; - // Unique id for this RecordFunction, used in callbacks to track start - // and end of ranges - RecordFunctionHandle handle_ {0}; + // Unique id for this RecordFunction, used in callbacks to track start + // and end of ranges + RecordFunctionHandle handle_ {0}; - // Whether this record_function corresponds to an async event or not. Async - // events can complete in different threads or follow a future-like pattern - // of use. - bool is_async_{false}; + // Whether this record_function corresponds to an async event or not. Async + // events can complete in different threads or follow a future-like pattern + // of use. + bool is_async_{false}; - // Debug handles are used for lazy annotation of module hierarchy - // and callstack. - // This is specifically is useful for mobile runtime, where generated - // debug handles can be lazily symbolicated using debug information - int64_t debug_handle_{-1}; - }; + // Debug handles are used for lazy annotation of module hierarchy + // and callstack. + // This is specifically useful for mobile runtime, where generated + // debug handles can be lazily symbolicated using debug information + int64_t debug_handle_{-1}; - c10::optional state_; + // Whether this RecordFunction is used for an out variant run with + // Static Runtime + bool is_static_runtime_out_variant_{false}; }; -// -// PyTorch callbacks/observers API: -// +TORCH_API StepCallbacks getStepCallbacks(RecordScope scope); -/** - * RecordFunctionCallback represents a pair of callbacks to be used with - * RecordFunction, members: - * start, end - the callbacks to run when entering and exiting the scope; - * optionally, the start callback may return an ObserverContext which will - * be passed to the end callback, use appropriate constructor accordingly. - * needs_inputs - whether the callbacks need the inputs passed from the observed - * function/range; NOTE: passing the inputs incurs an additional overhead; - * sampling_probability - if not 1.0, then the callback is probabilistically sampled - * to run; NOTE: start and end callbacks always run as a pair and are sampled - * together; - * scopes - types of scopes to execute the callbacks on (see RecordScope); - * passing empty set means the callbacks will be executed for all possible - * scope types - * should_run - optional function that returns whether this callback should run; - * overwrites the effect of setting sampling_probability - */ -class TORCH_API RecordFunctionCallback { - public: - using StartCallback = std::unique_ptr(*)(const RecordFunction&); - using EndCallback = void (*)(const RecordFunction&, ObserverContext*); - - // This interface supports observers that require passing an ObserverContext - // between start and end callbacks.
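A short sketch of the construction path declared just above: fetch the per-scope StepCallbacks snapshot and hand it to RecordFunction. This is illustrative only and not part of the patch; the operator name is made up.

#include <ATen/record_function.h>

void run_my_op_observed() {
  // Ask the thread-local cache which callbacks are active for this scope.
  at::StepCallbacks step_callbacks =
      at::getStepCallbacks(at::RecordScope::FUNCTION);
  at::RecordFunction guard(std::move(step_callbacks));
  if (guard.isActive()) {
    guard.before("my_op");  // runs the start callbacks
  }
  // ... do the actual work here ...
  // End callbacks run from ~RecordFunction when `guard` goes out of scope.
}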
- explicit RecordFunctionCallback( - StartCallback start, - EndCallback end = nullptr) : - start_(start), - end_(end) { - scopes_.fill(true); - } - - RecordFunctionCallback& needsInputs(bool needs_inputs) { - needs_inputs_ = needs_inputs; - return *this; - } - - RecordFunctionCallback& needsOutputs(bool needs_outputs) { - needs_outputs_ = needs_outputs; - return *this; - } - - RecordFunctionCallback& needsIds(bool needs_ids) { - needs_ids_ = needs_ids; - return *this; - } - - RecordFunctionCallback& samplingProb(double sampling_prob) { - TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob <= 1.0, - "Invalid sampling probability"); - sampling_prob_ = sampling_prob; - return *this; - } - - RecordFunctionCallback& scopes( - const std::unordered_set>& scopes) { - if (!scopes.empty()) { - scopes_.fill(false); - for (auto sc : scopes) { - scopes_[static_cast(sc)] = true; - } - } else { - scopes_.fill(true); - } - return *this; - } - - bool needsInputs() const { - return needs_inputs_; - } - - bool needsOutputs() const { - return needs_outputs_; - } - - bool needsIds() const { - return needs_ids_; - } - - double samplingProb() const { - return sampling_prob_; +namespace detail { +template +void record_function_with_scope(RecordFunction& guard, F fn, const Inputs& inputs, Args&&... args) { + if (guard.needsInputs()) { + guard.before(fn, c10::ArrayRef(inputs.data(), inputs.size()), std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); } +} - bool checkScope(RecordScope sc) const { - return scopes_[(size_t)sc]; +template +void record_function_with_scope_and_debug_handle(RecordFunction& guard, F fn, int64_t debug_handle, const Inputs& inputs, Args&&... args) { + guard.setDebugHandle(debug_handle); + if (guard.needsInputs()) { + guard.before(fn, c10::ArrayRef(inputs.data(), inputs.size()), std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); } +} - StartCallback start() const { - return start_; - } +template +void record_function_with_scope(RecordFunction& guard, F fn, c10::ArrayRef inputs, Args&&... args) { + return record_function_with_scope, F, Args...>(guard, std::move(fn), inputs, std::forward(args)...); +} - EndCallback end() const { - return end_; - } +template +void record_function_with_scope_and_debug_handle(RecordFunction& guard, F fn, int64_t debug_handle, c10::ArrayRef inputs, Args&&... args) { + return record_function_with_scope_and_debug_handle, F, Args...>(guard, std::move(fn), debug_handle, inputs, std::forward(args)...); +} - private: - friend class CallbackManager; - StartCallback start_; - EndCallback end_; - double sampling_prob_ = 1.0; - std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; - bool needs_inputs_ = false; - bool needs_outputs_ = false; - bool needs_ids_ = false; -}; +} // namespace detail -// Using macro to minimize inputs copies, // optional argument - function's seq_no #define RECORD_FUNCTION_WITH_SCOPE(scope, fn, inputs, ...) \ at::RecordFunction guard(scope); \ - if (guard.isActive()) { \ - if (guard.needsInputs()) { \ - guard.before(fn, inputs, ##__VA_ARGS__); \ - } else { \ - guard.before(fn, ##__VA_ARGS__); \ - } \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope(guard, fn, inputs, ##__VA_ARGS__); \ } #define RECORD_FUNCTION(fn, inputs, ...) 
\ @@ -490,7 +529,7 @@ class TORCH_API RecordFunctionCallback { // Custom user scopes in C++; similar to Python's 'with record_function("..."):' #define RECORD_USER_SCOPE(fn) \ RECORD_FUNCTION_WITH_SCOPE( \ - at::RecordScope::USER_SCOPE, fn, {}) + at::RecordScope::USER_SCOPE, fn, c10::ArrayRef{}) // RECORD_USER_SCOPE with inputs #define RECORD_USER_SCOPE_WITH_INPUTS(fn, inputs) \ @@ -501,15 +540,10 @@ class TORCH_API RecordFunctionCallback { // post process events #define RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ scope, fn, debug_handle, inputs, ...) \ - at::RecordFunction guard(scope); \ - if (guard.isActive()) { \ - guard.setDebugHandle(debug_handle); \ - if (guard.needsInputs()) { \ - guard.before(fn, inputs, ##__VA_ARGS__); \ - } else { \ - guard.before(fn, ##__VA_ARGS__); \ - } \ - } + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope_and_debug_handle(guard, fn, debug_handle, inputs, ##__VA_ARGS__); \ + } // Helper macros to record LITE INTERPETER scope events with debug handles #define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \ @@ -517,63 +551,6 @@ class TORCH_API RecordFunctionCallback { RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs) -// Notes: -// - two types of callbacks are provided: thread local and global -// - thread local callbacks are added/removed only for the given thread -// and are stored locally for each thread and separately from the list -// of the global callbacks -// - global callbacks are stored in a single per process list and are -// invoked by every RecordFunction, in addition to the thread local -// callbacks specific to the given thread -// - we allow the added callbacks to be sampled, by specifying a sampling -// probability for each callback pair, if the start callback is -// not picked to run, the corresponding end callback won't be called -// - a typical use case for the global callbacks is passive monitoring -// in the background (e.g. fleet-wide monitoring), without focusing on -// the specific peice of code -// - in contrast, thread local callbacks are enabled locally, on demand, -// for the specific piece of code (range) and are not sampled -// - a typical use case for thread local callbacks is profiler and code -// execution tracer -// - note, thread local callbacks are automatically propagated with -// ThreadLocalState across JIT continuations and async tasks (at::launch) -// - adding/removing global callbacks is not thread safe and should be done -// only when no other code is running, e.g. during the initialization - -typedef uint64_t CallbackHandle; - -// It is unnecessary to use atomic operations for enabling -// thread-local function callbacks. Moreover, it prevents saving to -// ThreadLocalState because std::atomic is non-copyable. 
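For reference, a hedged usage sketch of the macros above; it is not part of the patch, and the function, range and input names are illustrative.

#include <ATen/ATen.h>
#include <ATen/record_function.h>
#include <vector>

at::Tensor my_helper(const at::Tensor& x) {
  // Opens a USER_SCOPE range named "my_helper"; the end callbacks run when the
  // guard declared by the macro goes out of scope, mirroring Python's
  // `with record_function("my_helper"):`.
  RECORD_USER_SCOPE("my_helper");
  return x * 2;
}

at::Tensor my_helper_with_inputs(const at::Tensor& x) {
  // Also forwards the inputs, which are only passed to the RecordFunction if
  // some active callback was registered with needsInputs(true)
  // (see at::detail::record_function_with_scope above).
  RECORD_USER_SCOPE_WITH_INPUTS("my_helper_with_inputs",
                                std::vector<c10::IValue>({x}));
  return x + 1;
}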
-struct ThreadLocalRecordFunctionCallbacksEntry { - RecordFunctionCallback callback; - bool enabled = true; - CallbackHandle handle; - - ThreadLocalRecordFunctionCallbacksEntry(RecordFunctionCallback&& cb, CallbackHandle h) - : callback(std::move(cb)), handle(h) {} - - bool disable() { - auto old = enabled; - enabled = false; - return old != enabled; - } - - bool enable() { - auto old = enabled; - enabled = true; - return old != enabled; - } - - bool isEnabled() const { - return enabled; - } -}; - -// Holds pairs (callbacks, unique_id) -using ThreadLocalRecordFunctionCallbacks = - std::vector; - /** * addThreadLocalCallback adds a thread local callback to run with RecordFunction, * returns handle to use with removeThreadLocalCallback @@ -595,7 +572,6 @@ TORCH_API void clearThreadLocalCallbacks(); /** * addGlobalCallback adds a global callback to run with RecordFunction: * - * WARNING: not thread safe, typically addGlobalCallback can be called * only during the program initialization */ TORCH_API CallbackHandle addGlobalCallback( @@ -605,7 +581,6 @@ TORCH_API CallbackHandle addGlobalCallback( * removeCallback removes a callback given the handle returned by * addThreadLocalCallback or addGlobalCallback; * - * WARNING: removing a global callback is not thread safe, * no other code can run simultaneously */ TORCH_API void removeCallback(CallbackHandle handle); @@ -630,13 +605,12 @@ TORCH_API bool hasGlobalCallbacks(); /** * clearGlobalCallbacks removes all global callbacks - * WARNING: not thread safe */ TORCH_API void clearGlobalCallbacks(); // for both thread local and global callbacks TORCH_API bool hasCallbacks(); -TORCH_API void clearCallbacks(); // not thread safe +TORCH_API void clearCallbacks(); /** * enableRecordFunction enables RecordFunction thread locally @@ -673,30 +647,15 @@ class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard { struct TORCH_API RecordFunctionTLS { // Thread local vector of callbacks, holds pairs (callbacks, unique_id); // must be sorted in increasing handles order - ThreadLocalRecordFunctionCallbacks sorted_tls_callbacks_; + RecordFunctionCallbacks sorted_tls_callbacks_; bool tls_record_function_enabled_ = true; - - // Stores the number of coin flips before the next successful coin flip - int tries_left_ = 0; }; TORCH_API const RecordFunctionTLS& get_record_function_tls_(); TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); -// Checks whether RecordFunction should be called, -// sets boolean pointed by the argument to whether pre-sampling was used -TORCH_API bool shouldRunRecordFunction(bool*); - -// The following functions are used to disable/enable pre-sampling of RecordFunction -// when high-frequency/non-sampled callbacks are added/removed. -// Note: every call to bumpRecordAllFunctions() is supposed to be matched with -// the corresponding releaseRecordAllFunctions() call. -// Note: disabling pre-sampling of RecordFunction incurs an extra overhead, since -// RecordFunction will be created for each operator call. 
-TORCH_API void bumpRecordAllFunctions(); -TORCH_API void releaseRecordAllFunctions(); -TORCH_API bool checkRecordAllFunctions(); +TORCH_API void set_record_function_seed_for_testing(uint32_t seed); } // namespace at diff --git a/aten/src/ATen/templates/CompositeViewCopyKernels.cpp b/aten/src/ATen/templates/CompositeViewCopyKernels.cpp new file mode 100644 index 000000000000..558802a7b7e8 --- /dev/null +++ b/aten/src/ATen/templates/CompositeViewCopyKernels.cpp @@ -0,0 +1,20 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +// ${generated_comment} + +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +$ops_headers +#endif + +namespace at { +namespace native { + + +${CompositeViewCopyKernel_Definitions} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/templates/DispatchKeyFunction.h b/aten/src/ATen/templates/DispatchKeyFunction.h index 7cad9b73c6e1..c92d5eb3898e 100644 --- a/aten/src/ATen/templates/DispatchKeyFunction.h +++ b/aten/src/ATen/templates/DispatchKeyFunction.h @@ -11,32 +11,10 @@ // Forward declarations of any types needed in the operator signatures. // We can't directly include these classes because it will cause circular include dependencies. // This file is included by TensorBody.h, which defines the Tensor class. -namespace c10 { - -template -class optional; -template -class List; -class Stream; -class Scalar; -struct Storage; -struct TensorOptions; - -} +#include namespace at { -class Tensor; -struct Dimname; -struct Generator; -using TensorList = c10::ArrayRef; -using DimnameList = c10::ArrayRef; -using c10::Stream; -using c10::Storage; -using c10::QScheme; -using c10::Scalar; -using c10::TensorOptions; - namespace ${dispatch_namespace} { ${dispatch_namespaced_declarations} diff --git a/aten/src/ATen/templates/DispatchKeyFunctions.h b/aten/src/ATen/templates/DispatchKeyFunctions.h index 1718b4be8274..ffae71319137 100644 --- a/aten/src/ATen/templates/DispatchKeyFunctions.h +++ b/aten/src/ATen/templates/DispatchKeyFunctions.h @@ -1,4 +1,10 @@ #include + +// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch] +// Code introduced to avoid cyclic dependency in static dispatch is no longer +// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place, +// to Operators.cpp for supporting multiple backends with multiple kernels. +// // Note [Avoiding Include Cycles In Static Dispatch] // In order to avoid #include cycles in the static dispatch build, we've carefully split out // the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h. @@ -20,4 +26,4 @@ // - All other files that want the cpu fastpath functions can include CPUFunctions.h directly. // - This also means that static dispatch build, CPUFunctions.h only needs to // #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h. 
-${inline_headers_for_nonstatic_build} +${inline_headers} diff --git a/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp b/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp new file mode 100644 index 000000000000..1a5b4a452592 --- /dev/null +++ b/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp @@ -0,0 +1,9 @@ +// ${generated_comment} +${includes} +${native_functions_include} + +${namespace_prologue} + +${native_function_definitions} + +${namespace_epilogue} diff --git a/aten/src/ATen/templates/DispatchKeyNativeFunctions.h b/aten/src/ATen/templates/DispatchKeyNativeFunctions.h index abc3df27b93a..b45a17b5922f 100644 --- a/aten/src/ATen/templates/DispatchKeyNativeFunctions.h +++ b/aten/src/ATen/templates/DispatchKeyNativeFunctions.h @@ -1,13 +1,19 @@ #pragma once + +// an external backend might generate file within its code tree +// and check all the source files within the tree with clang-format. +// so, disable it since the backend might have a different config. +// clang-format off + // ${generated_comment} #include -namespace ${cpp_namespace} { +${namespace_prologue} struct ${class_name} { ${dispatch_declarations} }; -} // namespace ${cpp_namespace} +${namespace_epilogue} diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index 3313b90d51b0..fb531363f53e 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -62,14 +62,14 @@ #include #include #include +#include #include #include #include #include #include #include - -${static_dispatch_extra_headers} +#include #include #include diff --git a/aten/src/ATen/templates/LazyIr.h b/aten/src/ATen/templates/LazyIr.h new file mode 100644 index 000000000000..1ee90e66cc6c --- /dev/null +++ b/aten/src/ATen/templates/LazyIr.h @@ -0,0 +1,19 @@ +#pragma once + +// This file contains autogenerated LazyTensor IR nodes +${lazy_ir_sysinc} +${lazy_ir_inc} + +${namespace_prologue} +using at::operator<<; + +// kNullValue is used to contribute a static hash value any time +// a node has an Optional input that is nullopt. It is important +// to differentiate between HASH(nullopt, something) and HASH(something, nullopt), +// and using kNullValue in the hash function in the order of arguments +// serves this purpose. +static const torch::lazy::Value kNullValue = torch::lazy::Value(); + +${ir_declarations} + +${namespace_epilogue} diff --git a/aten/src/ATen/templates/MethodOperators.h b/aten/src/ATen/templates/MethodOperators.h index 4671efe519be..0e192cd05ef3 100644 --- a/aten/src/ATen/templates/MethodOperators.h +++ b/aten/src/ATen/templates/MethodOperators.h @@ -13,33 +13,7 @@ // Forward declarations of any types needed in the operator signatures. // We can't directly include these classes because it will cause circular include dependencies. // This file is included by TensorBody.h, which defines the Tensor class. 
-namespace c10 { - -template -class optional; -template -class List; -class Stream; -class Scalar; -struct Storage; -struct TensorOptions; - -} - -namespace at { - -class Tensor; -struct Dimname; -struct Generator; -using TensorList = c10::ArrayRef; -using DimnameList = c10::ArrayRef; -using c10::Stream; -using c10::Storage; -using c10::QScheme; -using c10::Scalar; -using c10::TensorOptions; - -} +#include ${MethodOperators_includes} diff --git a/aten/src/ATen/templates/NativeMetaFunctions.h b/aten/src/ATen/templates/NativeMetaFunctions.h index c83830f1eb10..89989e2121c9 100644 --- a/aten/src/ATen/templates/NativeMetaFunctions.h +++ b/aten/src/ATen/templates/NativeMetaFunctions.h @@ -3,6 +3,7 @@ // ${generated_comment} #include +#include #include #include diff --git a/aten/src/ATen/templates/Operator.h b/aten/src/ATen/templates/Operator.h index 15434af15bae..8b3989b66deb 100644 --- a/aten/src/ATen/templates/Operator.h +++ b/aten/src/ATen/templates/Operator.h @@ -2,40 +2,15 @@ // ${generated_comment} -#include #include #include - // Forward declarations of any types needed in the operator signatures. // We can't directly include these classes because it will cause circular include dependencies. // This file is included by TensorBody.h, which defines the Tensor class. -namespace c10 { - -template -class optional; -template -class List; -class Stream; -class Scalar; -struct Storage; -struct TensorOptions; - -} +#include namespace at { - -class Tensor; -struct Dimname; -struct Generator; -using TensorList = c10::ArrayRef; -using DimnameList = c10::ArrayRef; -using c10::Stream; -using c10::Storage; -using c10::QScheme; -using c10::Scalar; -using c10::TensorOptions; - namespace _ops { ${declarations} diff --git a/aten/src/ATen/templates/Operators.cpp b/aten/src/ATen/templates/Operators.cpp index e390de90d27a..082bb67c3e20 100644 --- a/aten/src/ATen/templates/Operators.cpp +++ b/aten/src/ATen/templates/Operators.cpp @@ -10,6 +10,8 @@ ${operator_headers} #endif +${static_dispatch_extra_headers} + namespace at { namespace _ops { ${definitions} diff --git a/aten/src/ATen/templates/Operators.h b/aten/src/ATen/templates/Operators.h index 3dc55a677106..e74b96ef3d5c 100644 --- a/aten/src/ATen/templates/Operators.h +++ b/aten/src/ATen/templates/Operators.h @@ -17,9 +17,12 @@ and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. #endif +#include +#include #include #include #include +#include #include #include diff --git a/aten/src/ATen/templates/RedispatchFunctions.cpp b/aten/src/ATen/templates/RedispatchFunctions.cpp index e8d502dbdde5..58102bd97fca 100644 --- a/aten/src/ATen/templates/RedispatchFunctions.cpp +++ b/aten/src/ATen/templates/RedispatchFunctions.cpp @@ -6,8 +6,6 @@ #include #include -${static_dispatch_extra_headers} - namespace at { namespace redispatch { diff --git a/aten/src/ATen/templates/RegisterCodegenUnboxedKernels.cpp b/aten/src/ATen/templates/RegisterCodegenUnboxedKernels.cpp new file mode 100644 index 000000000000..279f987c66a2 --- /dev/null +++ b/aten/src/ATen/templates/RegisterCodegenUnboxedKernels.cpp @@ -0,0 +1,41 @@ +#include +#include +#include + +#include + +// ${generated_comment} + +// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up +// incremental rebuilds. See the comment at the top of +// templates/VariableType.cpp for an analogous, in-depth discussion. +// +// Generated by tools/jit/gen_unboxing.py. This file registers all ATen ops into JIT op registry instead of c10 +// dispatcher. 
JIT op registry only takes boxed kernels, so we are calling unboxing functions in UnboxingFunctions.h +// to cast arguments into C++ types (instead of IValue) and delegate to unboxed kernels. + +namespace torch { namespace jit { + +using autograd::Variable; +using autograd::variable_list; +using at::Scalar; +using at::ScalarType; +using at::Tensor; +using at::TensorOptions; +using at::DeviceGuard; + +using ::c10::fmap; +using ::c10::filter; + +namespace { + +RegisterOperators reg({ + + // Generated operators + ${unboxed_ops} +}); + +} // anon namespace + + +}} // namespace torch::jit diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index 63a7a1a1a6c6..df00c0d0e4a3 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -5,6 +5,11 @@ #define __STDC_FORMAT_MACROS #endif +// an external backend might generate file within its code tree +// and check all the source files within the tree with clang-format. +// so, disable it since the backend might have a different config. +// clang-format off + // NOTE: This condition is true for all PyTorch internal libraries, it // just excludes external projects such as torch_xla which // re-use some of the PyTorch codegen machinery. @@ -57,12 +62,12 @@ namespace { ${dispatch_anonymous_definitions} -TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) { - ${dispatch_registrations} -} +${static_init_dispatch_registrations} } // anonymous namespace +${deferred_dispatch_registrations} + namespace ${dispatch_namespace} { ${dispatch_namespaced_definitions} diff --git a/aten/src/ATen/templates/RegisterFunctionalization.cpp b/aten/src/ATen/templates/RegisterFunctionalization.cpp index 412d6f582e4a..3f08b1da436e 100644 --- a/aten/src/ATen/templates/RegisterFunctionalization.cpp +++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp @@ -10,7 +10,13 @@ #include #include #else +// needed for the meta tensor calls to get stride info in functionalization #include +// needed for special handling of copy_(). +// See Note [functionalizating copy_() and not preserving strides] +#include +#include + $ops_headers #endif @@ -19,7 +25,8 @@ namespace functionalization { ${func_definitions} -} // namespace func + +} // namespace functionalization namespace { diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 7cbffad063d6..6d09d68deb1f 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,8 +32,11 @@ #include #include #include +#include +#include #include + #include namespace c10{ @@ -340,6 +343,10 @@ class TORCH_API Tensor: public TensorBase { return to(options().device(DeviceType::Metal), /*non_blocking*/ false, /*copy*/ false); } + Tensor meta() const { + return to(options().device(DeviceType::Meta), /*non_blocking*/ false, /*copy*/ false); + } + // ~~~~~ Autograd API ~~~~~ /// \fn bool is_leaf() const; @@ -628,8 +635,7 @@ Tensor make_tensor(Args&&... 
args) { } // namespace at -// See Note [Avoiding Include Cycles In Static Dispatch] -${static_dispatch_ops_headers} + namespace at { ${tensor_method_definitions} } // namespace at @@ -674,7 +680,7 @@ struct MaybeOwnedTraits { return &borrow; } - static bool debugBorrowIsValid(const borrow_type& borrow) { + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { return true; } }; diff --git a/aten/src/ATen/templates/TensorMethods.cpp b/aten/src/ATen/templates/TensorMethods.cpp index 29a43a657bb3..dd8f3c384176 100644 --- a/aten/src/ATen/templates/TensorMethods.cpp +++ b/aten/src/ATen/templates/TensorMethods.cpp @@ -7,7 +7,9 @@ namespace at { template <> \ TORCH_API T* TensorBase::data_ptr() const { \ TORCH_CHECK( \ - scalar_type() == ScalarType::name, \ + scalar_type() == ScalarType::name \ + || (isQIntType(scalar_type()) \ + && toUnderlying(scalar_type()) == ScalarType::name), \ "expected scalar type " \ #name \ " but found ", \ @@ -15,7 +17,7 @@ namespace at { return this->unsafeGetTensorImpl()->data_ptr_impl(); \ } - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CAST) AT_FORALL_QINT_TYPES(DEFINE_CAST) #undef DEFINE_CAST @@ -25,7 +27,7 @@ namespace at { return item().to##name(); \ } - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_ITEM) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ITEM) #undef DEFINE_ITEM } //namespace at diff --git a/aten/src/ATen/templates/UfuncCPU.cpp b/aten/src/ATen/templates/UfuncCPU.cpp new file mode 100644 index 000000000000..6b363a508907 --- /dev/null +++ b/aten/src/ATen/templates/UfuncCPU.cpp @@ -0,0 +1,19 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include + +namespace at { + +// NB: this is explicitly copied here (via codegen) rather than +// included via NativeFunctions.h to avoid recompiling this file when +// NativeFunctions.h changes +namespace meta { +${meta_declaration} +} + +namespace native { +${native_declaration} +${native_definitions} +}} // namespace at::native diff --git a/aten/src/ATen/templates/UfuncCPUKernel.cpp b/aten/src/ATen/templates/UfuncCPUKernel.cpp new file mode 100644 index 000000000000..0cac55664d61 --- /dev/null +++ b/aten/src/ATen/templates/UfuncCPUKernel.cpp @@ -0,0 +1,14 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +${native_definitions} +}} // namespace at::native diff --git a/aten/src/ATen/templates/UfuncCUDA.cu b/aten/src/ATen/templates/UfuncCUDA.cu new file mode 100644 index 000000000000..e75d82d9cc84 --- /dev/null +++ b/aten/src/ATen/templates/UfuncCUDA.cu @@ -0,0 +1,21 @@ +#define TORCH_ASSERT_NO_OPERATORS + +#include +#include +#include +#include +${cuda_headers} + +namespace at { + +// NB: this is explicitly copied here (via codegen) rather than +// included via NativeFunctions.h to avoid recompiling this file when +// NativeFunctions.h changes +namespace meta { +${meta_declaration} +} + +namespace native { +${native_declaration} +${native_definitions} +}} // namespace at::native diff --git a/aten/src/ATen/templates/UnboxingFunctions.cpp b/aten/src/ATen/templates/UnboxingFunctions.cpp new file mode 100644 index 000000000000..86c13235d862 --- /dev/null +++ b/aten/src/ATen/templates/UnboxingFunctions.cpp @@ -0,0 +1,35 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { 
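UnboxingFunctions.cpp and RegisterCodegenUnboxedKernels.cpp above exist only to bridge the JIT's boxed calling convention (a stack of IValues) to ordinary typed C++ kernels. A self-contained mock of that bridge; BoxedValue, Stack, add_unboxed and add_boxed below are stand-ins for illustration, not the real c10::IValue or torch::jit::Stack API:

#include <cassert>
#include <variant>
#include <vector>

using BoxedValue = std::variant<long, double>;  // stand-in for c10::IValue
using Stack = std::vector<BoxedValue>;          // stand-in for torch::jit::Stack

long add_unboxed(long a, long b) { return a + b; }  // stand-in for a typed kernel

// Generated-style wrapper: pop type-erased arguments, convert to concrete
// C++ types, call the unboxed kernel, push the result back onto the stack.
void add_boxed(Stack& stack) {
  long b = std::get<long>(stack.back()); stack.pop_back();  // args were pushed
  long a = std::get<long>(stack.back()); stack.pop_back();  // left-to-right
  stack.push_back(BoxedValue{add_unboxed(a, b)});
}

int main() {
  Stack s{BoxedValue{2L}, BoxedValue{3L}};
  add_boxed(s);
  assert(std::get<long>(s.back()) == 5);
}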
+namespace unboxing { + +using ::c10::fmap; +using ::c10::filter; +using torch::jit::peek; +using torch::jit::drop; +using torch::jit::pack; +using torch::jit::pop; + +// Generated function declaration +${definitions} + +} // namespace unboxing +} // namespace at diff --git a/aten/src/ATen/templates/UnboxingFunctions.h b/aten/src/ATen/templates/UnboxingFunctions.h new file mode 100644 index 000000000000..a65469a9b012 --- /dev/null +++ b/aten/src/ATen/templates/UnboxingFunctions.h @@ -0,0 +1,32 @@ +// ${generated_comment} + +// Generated by tools/jit/gen_unboxing.py. This file declares code generated boxed C++ functions for operators, +// base off of native_functions.yaml (or similar yaml file with the same syntax). The definition of such a boxed +// function will pop out IValues from the stack then convert them into the correct C++ types based on given schema. This +// unboxing logic is an alternative to template-based metaprogramming unboxing. + +#pragma once + +#include +namespace at { +namespace unboxing { +namespace { + +template +std::array as_array(const c10::List& list) { + std::array res; + AT_ASSERT(list.size() == N); + std::vector vec; + for (c10::IValue elem : list) { + vec.push_back(elem.to()); + } + std::copy(vec.begin(), vec.end(), res.begin()); + return res; +} +} // namespace +using Stack = std::vector; +// Generated function declaration +${declarations} + +} // namespace unboxing +} // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index cda1262f6eb1..ce83898a50eb 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -84,7 +84,6 @@ list(APPEND ATen_HIP_TEST_SRCS # ${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_stream_test.cpp list(APPEND ATen_VULKAN_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 6c2c977abd7e..d14e7cd36ab9 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -41,7 +41,9 @@ void TestOnesAndDot(DeprecatedTypeProperties& type) { Tensor b = ones({3, 4}, type); ASSERT_EQ_RESOLVED((b + b).sum().item(), 24); ASSERT_EQ_RESOLVED(b.numel(), 12); - ASSERT_EQ_RESOLVED(b.view(-1).dot(b.view(-1)).item(), 12); + if (type.backend() != Backend::CPU || type.scalarType() != kHalf) { + ASSERT_EQ_RESOLVED(b.view(-1).dot(b.view(-1)).item(), 12); + } } void TestSort(DeprecatedTypeProperties& type) { diff --git a/aten/src/ATen/test/cuda_atomic_ops_test.cu b/aten/src/ATen/test/cuda_atomic_ops_test.cu index 54d43ffec019..d5d261440064 100644 --- a/aten/src/ATen/test/cuda_atomic_ops_test.cu +++ b/aten/src/ATen/test/cuda_atomic_ops_test.cu @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,24 @@ __global__ void mul_test_kernel(T * a, T * sum) { gpuAtomicMul(&sum[idx], a[idx]); } +template +__global__ void max_test_kernel(T * a, T * max) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int a_idx = (tid) % (arraysize * factor); + int idx = a_idx / factor; + + gpuAtomicMax(&max[idx], a[a_idx]); +} + +template +__global__ void min_test_kernel(T * a, T * min) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int a_idx = (tid) % (arraysize * factor); + int idx = a_idx / factor; + + gpuAtomicMin(&min[idx], a[a_idx]); +} + template void test_atomic_add() { dim3 dimBlock(blocksize, 1); @@ -75,7 +94,7 @@ void test_atomic_mul() { for (int i = 0; i < arraysize; ++i) { a[i] = 2; sum[i] = 
2; - answer[i] = pow(sum[i], static_cast(factor)); + answer[i] = pow(sum[i], static_cast(factor + 1)); } cudaMalloc((void**)&ad, arraysize * sizeof(T)); @@ -97,7 +116,88 @@ void test_atomic_mul() { cudaFree(sumd); } +template +void test_atomic_max() { + dim3 dimBlock(blocksize, 1); + dim3 dimGrid(1, 1); + + T *ad, *sumd; + + std::vector a(arraysize * factor); + std::vector sum(arraysize); + std::vector answer(arraysize); + + int j; + for (int i = 0; i < arraysize * factor; ++i) { + a[i] = i; + if (i % factor == 0) { + j = i / factor; + sum[j] = std::numeric_limits::lowest(); + answer[j] = (j + 1) * factor - 1; + } + } + + cudaMalloc((void**)&ad, arraysize * factor * sizeof(T)); + cudaMalloc((void**)&sumd, arraysize * sizeof(T)); + + cudaMemcpy(ad, a.data(), arraysize * factor * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice); + + max_test_kernel<<>>(ad, sumd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost); + + for (int i = 0; i < arraysize; ++i) { + ASSERT_EQ(sum[i], answer[i]) << typeid(T).name(); + } + + cudaFree(ad); + cudaFree(sumd); +} + +template +void test_atomic_min() { + dim3 dimBlock(blocksize, 1); + dim3 dimGrid(1, 1); + + T *ad, *sumd; + + std::vector a(arraysize * factor); + std::vector sum(arraysize); + std::vector answer(arraysize); + + int j; + for (int i = 0; i < arraysize * factor; ++i) { + a[i] = i; + if (i % factor == 0) { + j = i / factor; + sum[j] = std::numeric_limits::max(); + answer[j] = j * factor; + } + } + + cudaMalloc((void**)&ad, arraysize * factor * sizeof(T)); + cudaMalloc((void**)&sumd, arraysize * sizeof(T)); + + cudaMemcpy(ad, a.data(), arraysize * factor * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpy(sumd, sum.data(), arraysize * sizeof(T), cudaMemcpyHostToDevice); + + min_test_kernel<<>>(ad, sumd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + cudaMemcpy(sum.data(), sumd, arraysize * sizeof(T), cudaMemcpyDeviceToHost); + + for (int i = 0; i < arraysize; ++i) { + ASSERT_EQ(sum[i], answer[i]) << typeid(T).name(); + } + + cudaFree(ad); + cudaFree(sumd); +} + TEST(TestAtomicOps, TestAtomicAdd) { + if (!at::cuda::is_available()) return; test_atomic_add(); test_atomic_add(); test_atomic_add(); @@ -113,8 +213,25 @@ TEST(TestAtomicOps, TestAtomicAdd) { } TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMul)) { + if (!at::cuda::is_available()) return; test_atomic_mul(); test_atomic_mul(); test_atomic_mul(); test_atomic_mul(); } + +TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMax)) { + if (!at::cuda::is_available()) return; + test_atomic_max(); + test_atomic_max(); + test_atomic_max(); + test_atomic_max(); +} + +TEST(TestAtomicOps, DISABLED_ON_WINDOWS(TestAtomicMin)) { + if (!at::cuda::is_available()) return; + test_atomic_min(); + test_atomic_min(); + test_atomic_min(); + test_atomic_min(); +} diff --git a/aten/src/ATen/test/cuda_half_test.cu b/aten/src/ATen/test/cuda_half_test.cu index a55d9458e851..aa1644c94b76 100644 --- a/aten/src/ATen/test/cuda_half_test.cu +++ b/aten/src/ATen/test/cuda_half_test.cu @@ -76,6 +76,13 @@ __device__ void test(){ assert(::abs(::isnan(Half(0.0)) - ::isnan(0.0f)) <= threshold); assert(::abs(::isinf(Half(0.0)) - ::isinf(0.0f)) <= threshold); #endif + + // test complex<32> + Half real = 3.0f; + Half imag = -10.0f; + auto complex = c10::complex(real, imag); + assert(complex.real() == real); + assert(complex.imag() == imag); } __global__ void kernel(){ diff --git a/aten/src/ATen/test/half_test.cpp 
b/aten/src/ATen/test/half_test.cpp index 652823e8e9b1..02ccb8b6ce5d 100644 --- a/aten/src/ATen/test/half_test.cpp +++ b/aten/src/ATen/test/half_test.cpp @@ -164,3 +164,11 @@ TEST(TestHalf, CommonMath) { assert(std::abs(std::isinf(Half(0.0)) - std::isinf(0.0f)) <= threshold); #endif } + +TEST(TestHalf, ComplexHalf) { + Half real = 3.0f; + Half imag = -10.0f; + auto complex = c10::complex(real, imag); + assert(complex.real() == real); + assert(complex.imag() == imag); +} diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index f86bcec92d03..08312305975c 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -401,7 +401,6 @@ TEST(IValueTest, FutureSetError) { } } - TEST(IValueTest, ValueEquality) { EXPECT_EQ(IValue("asdf"), IValue("asdf")); EXPECT_NE(IValue("asdf"), IValue("ASDF")); @@ -804,6 +803,23 @@ TEST(IValueTest, ToWeakAndBack) { } } +// Storage and Generator did not set is_intrusive_ptr if they were +// undefined, which led use_count to return 1 instead of 0 for these +// cases. +TEST(IValueTest, UseCountCornerCases) { + at::Storage undefinedStorage; + at::Generator undefinedGenerator; + at::Tensor undefinedTensor; + + IValue ivEmptyStorage(undefinedStorage); + IValue ivEmptyGenerator(undefinedGenerator); + IValue ivEmptyTensor(undefinedTensor); + + ASSERT_EQ(1, ivEmptyStorage.use_count()); + ASSERT_EQ(1, ivEmptyGenerator.use_count()); + ASSERT_EQ(0, ivEmptyTensor.use_count()); +} + // TODO(gmagogsfm): Add type conversion test? using ivalue::TupleElements; diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 6a3253c50548..ebe35b9a6a6b 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -295,3 +295,11 @@ TEST(TestScalarTensor, TestScalarTensorCUDA) { test(CUDA(kFloat)); } } + +TEST(TestScalarTensor, TestScalarTensorMPS) { + manual_seed(123); + + if (at::hasMPS()) { + test(MPS(kFloat)); + } +} diff --git a/aten/src/ATen/test/stride_properties_test.cpp b/aten/src/ATen/test/stride_properties_test.cpp index 09c13139fc4c..e37e7c13da42 100644 --- a/aten/src/ATen/test/stride_properties_test.cpp +++ b/aten/src/ATen/test/stride_properties_test.cpp @@ -69,10 +69,25 @@ TEST(StridePropertiesTest, ZeroStrideIndicesEagerConsistencyTest) { } TEST(StridePropertiesTest, ExpandedStrideIndicesTest) { - // NOLINTNEXTLINE(performance-for-range-copy) Tensor t = at::rand({1}); // note: expand with dimension of size 1 is tricky as stride is different // depending on the order of the unsqueezed dimension. 
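Backing up to the max/min kernels added to cuda_atomic_ops_test.cu above: each output slot j reduces the `factor` consecutive inputs j*factor .. (j+1)*factor - 1, so the expected atomic max is (j+1)*factor - 1 and the expected min is j*factor. A CPU-only sketch of that arithmetic (plain std::max_element / std::min_element, no CUDA assumed):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  const int arraysize = 4, factor = 3;
  std::vector<int> a(arraysize * factor);
  for (int i = 0; i < arraysize * factor; ++i) a[i] = i;  // same fill as the test

  for (int j = 0; j < arraysize; ++j) {
    const auto first = a.begin() + j * factor;            // group j
    const auto last  = first + factor;
    assert(*std::max_element(first, last) == (j + 1) * factor - 1);
    assert(*std::min_element(first, last) == j * factor);
  }
}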
t = t.expand({4, 4, 4}); EXPECT_TRUE(CheckStrideIndices(t, at::MemoryFormat::Contiguous)); } + +TEST(StridePropertiesTest, SlicedStrideIndicesTest) { + // Sliced tensor shouldn't have changed stride order + Tensor t = at::rand({16, 4}).slice(1, 0, 4, 4); + + auto temp = TensorType::create(c10::nullopt, c10::nullopt, t.sizes(), t.strides(), c10::nullopt); + TORCH_INTERNAL_ASSERT(temp->stride_properties().isComplete() && + temp->stride_properties().isComplete(), "complete stride properties is needed for the test"); + std::vector stride_indices(2); + std::iota(stride_indices.rbegin(), stride_indices.rend(), 0); + + auto index_iter = stride_indices.begin(); + for (const auto& opt_stride : *temp->stride_properties().sizes()) { + EXPECT_TRUE(*index_iter++ == opt_stride->stride_index_.value()); + } +} diff --git a/aten/src/ATen/test/vmap_test.cpp b/aten/src/ATen/test/vmap_test.cpp index 0d325906325a..1feafaa59f3a 100644 --- a/aten/src/ATen/test/vmap_test.cpp +++ b/aten/src/ATen/test/vmap_test.cpp @@ -728,7 +728,7 @@ TEST(VmapTest, TestBatchedTensorExpand) { // logical dim is 0, expand size has same dimensionality as logical dim auto tensor = at::randn({2, 3}); auto batched = makeBatched(tensor, {{0, 0}, {1, 1}}); - auto batched_out = batched.expand({}); + auto batched_out = batched.expand(c10::IntArrayRef({})); const auto& out = maybeGetBatchedImpl(batched_out)->value(); ASSERT_EQ(out.data_ptr(), tensor.data_ptr()); ASSERT_TRUE(at::allclose(out, tensor)); diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index ec2b1bd12526..f13f673cb2b9 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -2,6 +2,9 @@ #include #include +#include +#include +#include #include // TODO: These functions should move to a common place. @@ -64,7 +67,7 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) { } -static void gen_allpermutations(std::vector>& out, std::vector in, int i) { +static void gen_allpermutations(std::vector>& out, std::vector in, unsigned i) { // generate all permutations of a given dims if (i == in.size()) { out.push_back(in); @@ -137,11 +140,49 @@ static void clone_test(const std::vector& size, c10::optional +inline std::vector makeStack(Inputs&&... inputs) { + return {std::forward(inputs)...}; +} + +template +inline std::vector callOpByHandle( + const c10::OperatorHandle& op, + Args... args) { + auto stack = makeStack(std::forward(args)...); + c10::Dispatcher::singleton().callBoxed(op, &stack); + return stack; +} + +template +inline std::vector callOpByName( + const char* func_name, + const char* overload_name, + Args... 
args) { + const c10::optional op_handle = + c10::Dispatcher::singleton().findSchema({func_name, overload_name}); + assert(op_handle.has_value()); + return callOpByHandle(op_handle.value(), std::forward(args)...); +} + } // namespace namespace { -TEST(VulkanAPITest, adaptive_avg_pool2d) { +class VulkanAPITest : public ::testing::Test { +public: +#if defined (__ANDROID__) // to avoid `Undefined symbols for architecture arm64` error + static void SetUpTestSuite() { + at::native::vulkan::api::context()->querypool().enable(); + } + + static void TearDownTestSuite() { + at::native::vulkan::api::context()->querypool().disable(false); + } +#endif +}; + +TEST_F(VulkanAPITest, adaptive_avg_pool2d) { if (!at::is_vulkan_available()) { return; } @@ -159,7 +200,7 @@ TEST(VulkanAPITest, adaptive_avg_pool2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add) { +TEST_F(VulkanAPITest, add) { if (!at::is_vulkan_available()) { return; } @@ -181,7 +222,7 @@ TEST(VulkanAPITest, add) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast0) { +TEST_F(VulkanAPITest, add_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -203,7 +244,7 @@ TEST(VulkanAPITest, add_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast1) { +TEST_F(VulkanAPITest, add_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -225,7 +266,7 @@ TEST(VulkanAPITest, add_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast2) { +TEST_F(VulkanAPITest, add_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -247,7 +288,7 @@ TEST(VulkanAPITest, add_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_) { +TEST_F(VulkanAPITest, add_) { if (!at::is_vulkan_available()) { return; } @@ -269,7 +310,7 @@ TEST(VulkanAPITest, add_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast0_) { +TEST_F(VulkanAPITest, add_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -291,7 +332,7 @@ TEST(VulkanAPITest, add_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_broadcast1_) { +TEST_F(VulkanAPITest, add_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -313,7 +354,7 @@ TEST(VulkanAPITest, add_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_scalar) { +TEST_F(VulkanAPITest, add_scalar) { if (!at::is_vulkan_available()) { return; } @@ -334,7 +375,7 @@ TEST(VulkanAPITest, add_scalar) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, add_scalar_) { +TEST_F(VulkanAPITest, add_scalar_) { if (!at::is_vulkan_available()) { return; } @@ -355,7 +396,7 @@ TEST(VulkanAPITest, add_scalar_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, addmm) { +TEST_F(VulkanAPITest, addmm) { if (!at::is_vulkan_available()) { return; } @@ -379,7 +420,7 @@ TEST(VulkanAPITest, addmm) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, addmm_expand) { +TEST_F(VulkanAPITest, addmm_expand) { if (!at::is_vulkan_available()) { return; } @@ -403,7 +444,7 @@ TEST(VulkanAPITest, addmm_expand) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, avg_pool2d) { +TEST_F(VulkanAPITest, avg_pool2d) { if (!at::is_vulkan_available()) { return; } @@ -420,7 +461,7 @@ TEST(VulkanAPITest, avg_pool2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, clamp) { +TEST_F(VulkanAPITest, clamp) { if (!at::is_vulkan_available()) { return; } @@ -442,7 +483,7 @@ TEST(VulkanAPITest, clamp) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, clamp_) { +TEST_F(VulkanAPITest, clamp_) { if (!at::is_vulkan_available()) { return; } @@ -464,7 +505,7 @@ TEST(VulkanAPITest, clamp_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d) { 
+TEST_F(VulkanAPITest, conv2d) { if (!at::is_vulkan_available()) { return; } @@ -537,7 +578,7 @@ TEST(VulkanAPITest, conv2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d_dw) { +TEST_F(VulkanAPITest, conv2d_dw) { if (!at::is_vulkan_available()) { return; } @@ -609,7 +650,7 @@ TEST(VulkanAPITest, conv2d_dw) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d_pw) { +TEST_F(VulkanAPITest, conv2d_pw) { if (!at::is_vulkan_available()) { return; } @@ -681,7 +722,7 @@ TEST(VulkanAPITest, conv2d_pw) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, conv2d_winograd) { +TEST_F(VulkanAPITest, conv2d_winograd) { if (!at::is_vulkan_available()) { return; } @@ -753,7 +794,7 @@ TEST(VulkanAPITest, conv2d_winograd) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, copy) { +TEST_F(VulkanAPITest, copy) { if (!at::is_vulkan_available()) { return; } @@ -769,7 +810,7 @@ TEST(VulkanAPITest, copy) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div) { +TEST_F(VulkanAPITest, div) { if (!at::is_vulkan_available()) { return; } @@ -791,7 +832,7 @@ TEST(VulkanAPITest, div) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast0) { +TEST_F(VulkanAPITest, div_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -813,7 +854,7 @@ TEST(VulkanAPITest, div_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast1) { +TEST_F(VulkanAPITest, div_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -835,7 +876,7 @@ TEST(VulkanAPITest, div_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast2) { +TEST_F(VulkanAPITest, div_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -857,7 +898,7 @@ TEST(VulkanAPITest, div_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_) { +TEST_F(VulkanAPITest, div_) { if (!at::is_vulkan_available()) { return; } @@ -879,7 +920,7 @@ TEST(VulkanAPITest, div_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast0_) { +TEST_F(VulkanAPITest, div_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -901,7 +942,7 @@ TEST(VulkanAPITest, div_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_broadcast1_) { +TEST_F(VulkanAPITest, div_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -923,7 +964,7 @@ TEST(VulkanAPITest, div_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_scalar) { +TEST_F(VulkanAPITest, div_scalar) { if (!at::is_vulkan_available()) { return; } @@ -944,7 +985,7 @@ TEST(VulkanAPITest, div_scalar) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, div_scalar_) { +TEST_F(VulkanAPITest, div_scalar_) { if (!at::is_vulkan_available()) { return; } @@ -965,7 +1006,7 @@ TEST(VulkanAPITest, div_scalar_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, empty) { +TEST_F(VulkanAPITest, empty) { if (!at::is_vulkan_available()) { return; } @@ -973,7 +1014,7 @@ TEST(VulkanAPITest, empty) { ASSERT_NO_THROW(at::empty({1, 17, 41, 53}, at::device(at::kVulkan).dtype(at::kFloat))); } -TEST(VulkanAPITest, hardsigmoid) { +TEST_F(VulkanAPITest, hardsigmoid) { if (!at::is_vulkan_available()) { return; } @@ -992,7 +1033,7 @@ TEST(VulkanAPITest, hardsigmoid) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, hardsigmoid_) { +TEST_F(VulkanAPITest, hardsigmoid_) { if (!at::is_vulkan_available()) { return; } @@ -1011,7 +1052,7 @@ TEST(VulkanAPITest, hardsigmoid_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, hardshrink) { +TEST_F(VulkanAPITest, hardshrink) { if (!at::is_vulkan_available()) { return; } @@ -1033,7 +1074,7 @@ TEST(VulkanAPITest, hardshrink) { } } -TEST(VulkanAPITest, hardshrink_) { +TEST_F(VulkanAPITest, 
hardshrink_) { if (!at::is_vulkan_available()) { return; } @@ -1054,7 +1095,7 @@ TEST(VulkanAPITest, hardshrink_) { } } -TEST(VulkanAPITest, leaky_relu) { +TEST_F(VulkanAPITest, leaky_relu) { if (!at::is_vulkan_available()) { return; } @@ -1076,7 +1117,7 @@ TEST(VulkanAPITest, leaky_relu) { } } -TEST(VulkanAPITest, leaky_relu_) { +TEST_F(VulkanAPITest, leaky_relu_) { if (!at::is_vulkan_available()) { return; } @@ -1097,7 +1138,205 @@ TEST(VulkanAPITest, leaky_relu_) { } } -TEST(VulkanAPITest, hardswish) { +TEST_F(VulkanAPITest, lerp) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({11, 7, 139, 109}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast0) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({3, 5, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({3, 5, 1, 1}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({3, 5, 1, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast1) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({3, 4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_cpu); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_vulkan); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({61, 17, 29, 83}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + a_cpu.lerp_(b_cpu, w_cpu); + a_vulkan.lerp_(b_vulkan, w_vulkan); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast0_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({3, 5, 179, 221}, 
at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({3, 5, 1, 1}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({3, 5, 1, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + a_cpu.lerp_(b_cpu, w_cpu); + a_vulkan.lerp_(b_vulkan, w_vulkan); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_broadcast1_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({3, 4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const auto w_cpu = at::rand({4, 179, 221}, at::device(at::kCPU).dtype(at::kFloat)); + const auto w_vulkan = w_cpu.vulkan(); + + a_cpu.lerp_(b_cpu, w_cpu); + a_vulkan.lerp_(b_vulkan, w_vulkan); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_scalar) { + if (!at::is_vulkan_available()) { + return; + } + + const auto a_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat)); + const auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({13, 23, 59, 73}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const float w_scalar = 3.1415f; + + const auto c_cpu = at::lerp(a_cpu, b_cpu, w_scalar); + const auto c_vulkan = at::lerp(a_vulkan, b_vulkan, w_scalar); + + const auto check = almostEqual(c_cpu, c_vulkan.cpu()); + if (!check) { + showRtol(c_cpu, c_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, lerp_scalar_) { + if (!at::is_vulkan_available()) { + return; + } + + auto a_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat)); + auto a_vulkan = a_cpu.vulkan(); + + const auto b_cpu = at::rand({47, 2, 23, 97}, at::device(at::kCPU).dtype(at::kFloat)); + const auto b_vulkan = b_cpu.vulkan(); + + const float w_scalar = 3.1415f; + + a_cpu.lerp_(b_cpu, w_scalar); + a_vulkan.lerp_(b_vulkan, w_scalar); + + const auto check = almostEqual(a_cpu, a_vulkan.cpu()); + if (!check) { + showRtol(a_cpu, a_vulkan.cpu()); + } + + ASSERT_TRUE(check); +} + +TEST_F(VulkanAPITest, hardswish) { if (!at::is_vulkan_available()) { return; } @@ -1116,7 +1355,7 @@ TEST(VulkanAPITest, hardswish) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, hardswish_) { +TEST_F(VulkanAPITest, hardswish_) { if (!at::is_vulkan_available()) { return; } @@ -1135,7 +1374,7 @@ TEST(VulkanAPITest, hardswish_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, max_pool2d) { +TEST_F(VulkanAPITest, max_pool2d) { if (!at::is_vulkan_available()) { return; } @@ -1153,7 +1392,7 @@ TEST(VulkanAPITest, max_pool2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mean) { +TEST_F(VulkanAPITest, mean) { const auto in_cpu = at::rand({17, 3, 79, 53}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); const auto out_cpu = at::mean(in_cpu, {-1, -2}, true); @@ -1168,7 +1407,7 @@ TEST(VulkanAPITest, mean) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mean2d) { +TEST_F(VulkanAPITest, mean2d) { const auto in_cpu = at::rand({11, 7, 173, 37}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); const auto out_cpu = at::mean(in_cpu, {-1, -2}, false); @@ -1183,7 +1422,7 @@ TEST(VulkanAPITest, mean2d) { 
ASSERT_TRUE(check); } -TEST(VulkanAPITest, mm) { +TEST_F(VulkanAPITest, mm) { if (!at::is_vulkan_available()) { return; } @@ -1203,7 +1442,7 @@ TEST(VulkanAPITest, mm) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul) { +TEST_F(VulkanAPITest, mul) { if (!at::is_vulkan_available()) { return; } @@ -1225,7 +1464,7 @@ TEST(VulkanAPITest, mul) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast0) { +TEST_F(VulkanAPITest, mul_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -1247,7 +1486,7 @@ TEST(VulkanAPITest, mul_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast1) { +TEST_F(VulkanAPITest, mul_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -1269,7 +1508,7 @@ TEST(VulkanAPITest, mul_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast2) { +TEST_F(VulkanAPITest, mul_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -1291,7 +1530,7 @@ TEST(VulkanAPITest, mul_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_) { +TEST_F(VulkanAPITest, mul_) { if (!at::is_vulkan_available()) { return; } @@ -1313,7 +1552,7 @@ TEST(VulkanAPITest, mul_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast0_) { +TEST_F(VulkanAPITest, mul_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -1335,7 +1574,7 @@ TEST(VulkanAPITest, mul_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_broadcast1_) { +TEST_F(VulkanAPITest, mul_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -1357,7 +1596,7 @@ TEST(VulkanAPITest, mul_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_scalar) { +TEST_F(VulkanAPITest, mul_scalar) { if (!at::is_vulkan_available()) { return; } @@ -1378,7 +1617,7 @@ TEST(VulkanAPITest, mul_scalar) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mul_scalar_) { +TEST_F(VulkanAPITest, mul_scalar_) { if (!at::is_vulkan_available()) { return; } @@ -1399,7 +1638,7 @@ TEST(VulkanAPITest, mul_scalar_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, reflection_pad2d) { +TEST_F(VulkanAPITest, reflection_pad2d) { if (!at::is_vulkan_available()) { return; } @@ -1418,7 +1657,7 @@ TEST(VulkanAPITest, reflection_pad2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, reshape) { +TEST_F(VulkanAPITest, reshape) { if (!at::is_vulkan_available()) { return; } @@ -1440,7 +1679,7 @@ TEST(VulkanAPITest, reshape) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, reshape_) { +TEST_F(VulkanAPITest, reshape_) { if (!at::is_vulkan_available()) { return; } @@ -1462,7 +1701,7 @@ TEST(VulkanAPITest, reshape_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sigmoid) { +TEST_F(VulkanAPITest, sigmoid) { if (!at::is_vulkan_available()) { return; } @@ -1481,7 +1720,7 @@ TEST(VulkanAPITest, sigmoid) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sigmoid_) { +TEST_F(VulkanAPITest, sigmoid_) { if (!at::is_vulkan_available()) { return; } @@ -1500,7 +1739,7 @@ TEST(VulkanAPITest, sigmoid_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, softmax) { +TEST_F(VulkanAPITest, softmax) { at::Tensor test_in[] = { at::rand({1, 196, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), at::rand({1, 197, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), @@ -1523,7 +1762,7 @@ TEST(VulkanAPITest, softmax) { } } -TEST(VulkanAPITest, log_softmax) { +TEST_F(VulkanAPITest, log_softmax) { at::Tensor test_in[] = { at::rand({1, 196, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), at::rand({1, 197, 302, 5}, at::TensorOptions(at::kCPU).dtype(at::kFloat)), @@ -1546,12 +1785,12 @@ TEST(VulkanAPITest, log_softmax) { } } 
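The blanket TEST -> TEST_F rename running through this file binds every case to the VulkanAPITest fixture introduced above, so gtest runs the fixture's static SetUpTestSuite()/TearDownTestSuite() hooks (used on Android to enable and later disable the query pool) once around the whole suite. A toy fixture showing the same mechanism, unrelated to Vulkan:

#include <gtest/gtest.h>

class ToyFixture : public ::testing::Test {
 public:
  static void SetUpTestSuite()    { ++suite_setups; }  // runs once per suite
  static void TearDownTestSuite() { /* release shared state here */ }
  static int suite_setups;
};
int ToyFixture::suite_setups = 0;

TEST_F(ToyFixture, UsesSharedSetup) {
  EXPECT_EQ(ToyFixture::suite_setups, 1);  // hook ran before the first TEST_F
}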
-TEST(VulkanAPITest, tanh) { +TEST_F(VulkanAPITest, tanh) { if (!at::is_vulkan_available()) { return; } - const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30; const auto in_vulkan = in_cpu.vulkan(); const auto out_cpu = at::tanh(in_cpu); @@ -1565,12 +1804,12 @@ TEST(VulkanAPITest, tanh) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, tanh_) { +TEST_F(VulkanAPITest, tanh_) { if (!at::is_vulkan_available()) { return; } - auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30; auto vulkan = cpu.vulkan(); at::tanh_(cpu); @@ -1584,7 +1823,7 @@ TEST(VulkanAPITest, tanh_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub) { +TEST_F(VulkanAPITest, sub) { if (!at::is_vulkan_available()) { return; } @@ -1606,7 +1845,7 @@ TEST(VulkanAPITest, sub) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast0) { +TEST_F(VulkanAPITest, sub_broadcast0) { if (!at::is_vulkan_available()) { return; } @@ -1628,7 +1867,7 @@ TEST(VulkanAPITest, sub_broadcast0) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast1) { +TEST_F(VulkanAPITest, sub_broadcast1) { if (!at::is_vulkan_available()) { return; } @@ -1650,7 +1889,7 @@ TEST(VulkanAPITest, sub_broadcast1) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast2) { +TEST_F(VulkanAPITest, sub_broadcast2) { if (!at::is_vulkan_available()) { return; } @@ -1672,7 +1911,7 @@ TEST(VulkanAPITest, sub_broadcast2) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_) { +TEST_F(VulkanAPITest, sub_) { if (!at::is_vulkan_available()) { return; } @@ -1694,7 +1933,7 @@ TEST(VulkanAPITest, sub_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast0_) { +TEST_F(VulkanAPITest, sub_broadcast0_) { if (!at::is_vulkan_available()) { return; } @@ -1716,7 +1955,7 @@ TEST(VulkanAPITest, sub_broadcast0_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, sub_broadcast1_) { +TEST_F(VulkanAPITest, sub_broadcast1_) { if (!at::is_vulkan_available()) { return; } @@ -1738,7 +1977,7 @@ TEST(VulkanAPITest, sub_broadcast1_) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, transposed_conv2d) { +TEST_F(VulkanAPITest, transposed_conv2d) { // Guard if (!at::is_vulkan_available()) { return; @@ -1818,7 +2057,7 @@ TEST(VulkanAPITest, transposed_conv2d) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, upsample_nearest2d) { +TEST_F(VulkanAPITest, upsample_nearest2d) { if (!at::is_vulkan_available()) { return; } @@ -1838,7 +2077,7 @@ TEST(VulkanAPITest, upsample_nearest2d) { } #if !defined(__APPLE__) -TEST(VulkanAPITest, cat_dim1_samefeature_success) { +TEST_F(VulkanAPITest, cat_dim1_samefeature_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1862,7 +2101,7 @@ TEST(VulkanAPITest, cat_dim1_samefeature_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_difffeature_success) { +TEST_F(VulkanAPITest, cat_dim1_difffeature_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1886,7 +2125,7 @@ TEST(VulkanAPITest, cat_dim1_difffeature_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_texture2d_success) { +TEST_F(VulkanAPITest, cat_dim1_texture2d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1911,7 +2150,7 @@ TEST(VulkanAPITest, cat_dim1_texture2d_success) { } #endif /* !defined(__APPLE__) */ -TEST(VulkanAPITest, cat_dim1_singledepth_success) { +TEST_F(VulkanAPITest, cat_dim1_singledepth_success) { // Guard 
if (!at::is_vulkan_available()) { return; @@ -1935,7 +2174,7 @@ TEST(VulkanAPITest, cat_dim1_singledepth_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_singletensor_success) { +TEST_F(VulkanAPITest, cat_dim1_singletensor_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1957,7 +2196,7 @@ TEST(VulkanAPITest, cat_dim1_singletensor_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_twotensors_success) { +TEST_F(VulkanAPITest, cat_dim1_twotensors_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -1980,7 +2219,7 @@ TEST(VulkanAPITest, cat_dim1_twotensors_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_bat1_mult4ch_success) { +TEST_F(VulkanAPITest, cat_dim1_bat1_mult4ch_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2004,7 +2243,7 @@ TEST(VulkanAPITest, cat_dim1_bat1_mult4ch_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_bat2_mult4ch_success) { +TEST_F(VulkanAPITest, cat_dim1_bat2_mult4ch_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2028,7 +2267,7 @@ TEST(VulkanAPITest, cat_dim1_bat2_mult4ch_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_mult4ch_mixed_success) { +TEST_F(VulkanAPITest, cat_dim1_mult4ch_mixed_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2052,7 +2291,7 @@ TEST(VulkanAPITest, cat_dim1_mult4ch_mixed_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim1_mult4ch_nonmult4ch_success) { +TEST_F(VulkanAPITest, cat_dim1_mult4ch_nonmult4ch_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2077,7 +2316,7 @@ TEST(VulkanAPITest, cat_dim1_mult4ch_nonmult4ch_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_sameheight_success) { +TEST_F(VulkanAPITest, cat_dim2_sameheight_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2101,7 +2340,7 @@ TEST(VulkanAPITest, cat_dim2_sameheight_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_diffheight_success) { +TEST_F(VulkanAPITest, cat_dim2_diffheight_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2125,7 +2364,7 @@ TEST(VulkanAPITest, cat_dim2_diffheight_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_singledepth_success) { +TEST_F(VulkanAPITest, cat_dim2_singledepth_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2149,7 +2388,7 @@ TEST(VulkanAPITest, cat_dim2_singledepth_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, cat_dim2_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, cat_dim2_invalidinputs_exceptions) { // Guard if (!at::is_vulkan_available()) { return; @@ -2198,7 +2437,7 @@ TEST(VulkanAPITest, cat_dim2_invalidinputs_exceptions) { } } -TEST(VulkanAPITest, permute_2d_success) { +TEST_F(VulkanAPITest, permute_2d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2220,7 +2459,7 @@ TEST(VulkanAPITest, permute_2d_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_3d_success) { +TEST_F(VulkanAPITest, permute_3d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2247,7 +2486,7 @@ TEST(VulkanAPITest, permute_3d_success) { } } -TEST(VulkanAPITest, permute_4d_success) { +TEST_F(VulkanAPITest, permute_4d_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2274,7 +2513,7 @@ TEST(VulkanAPITest, permute_4d_success) { } } -TEST(VulkanAPITest, permute_4dmclaren_success) { +TEST_F(VulkanAPITest, permute_4dmclaren_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2296,7 +2535,7 @@ TEST(VulkanAPITest, 
permute_4dmclaren_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_4dbig_success) { +TEST_F(VulkanAPITest, permute_4dbig_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2323,7 +2562,7 @@ TEST(VulkanAPITest, permute_4dbig_success) { } } -TEST(VulkanAPITest, permute_negativedims_success) { +TEST_F(VulkanAPITest, permute_negativedims_success) { // Guard if (!at::is_vulkan_available()) { return; @@ -2345,7 +2584,7 @@ TEST(VulkanAPITest, permute_negativedims_success) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_1d_nochange) { +TEST_F(VulkanAPITest, permute_1d_nochange) { // Guard if (!at::is_vulkan_available()) { return; @@ -2367,7 +2606,7 @@ TEST(VulkanAPITest, permute_1d_nochange) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_sameDims_nochange) { +TEST_F(VulkanAPITest, permute_sameDims_nochange) { // Guard if (!at::is_vulkan_available()) { return; @@ -2389,7 +2628,7 @@ TEST(VulkanAPITest, permute_sameDims_nochange) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, permute_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, permute_invalidinputs_exceptions) { // Guard if (!at::is_vulkan_available()) { return; @@ -2449,7 +2688,7 @@ TEST(VulkanAPITest, permute_invalidinputs_exceptions) { }, ::c10::Error); } -TEST(VulkanAPITest, slice_width_success) { +TEST_F(VulkanAPITest, slice_width_success) { // Arrange std::unordered_map> dim2sizes { {3, {2, 3, 40, 50}}, // 4D tensors with dim=width @@ -2462,7 +2701,7 @@ TEST(VulkanAPITest, slice_width_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_height_success) { +TEST_F(VulkanAPITest, slice_height_success) { // Arrange std::unordered_map> dim2sizes { {2, {2, 3, 40, 50}}, // 4D tensors with dim=height @@ -2475,7 +2714,7 @@ TEST(VulkanAPITest, slice_height_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_feature_success) { +TEST_F(VulkanAPITest, slice_feature_success) { // Arrange std::unordered_map> dim2sizes { {1, {2, 40, 13, 14}}, // 4D tensors with dim=feature(channel) @@ -2487,7 +2726,7 @@ TEST(VulkanAPITest, slice_feature_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_batch_success) { +TEST_F(VulkanAPITest, slice_batch_success) { // Arrange std::unordered_map> dim2sizes { {0, {40, 3, 13, 14}}, // 4D tensors with dim=batch @@ -2498,7 +2737,7 @@ TEST(VulkanAPITest, slice_batch_success) { slice_tests(dim2sizes); } -TEST(VulkanAPITest, slice_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, slice_invalidinputs_exceptions) { // Act: slice step must be positive EXPECT_THROW({ slice_test({2, 3, 4, 5}, 3, 0, 3, 0); @@ -2515,7 +2754,7 @@ TEST(VulkanAPITest, slice_invalidinputs_exceptions) { }, ::c10::Error); } -TEST(VulkanAPITest, clone_success) { +TEST_F(VulkanAPITest, clone_success) { // Arrange std::multimap, std::vector> mem2sizes { {c10::MemoryFormat::Preserve, {2, 3, 5, 161}}, // 4D tensors with MemoryFormat::Preserve @@ -2538,7 +2777,7 @@ TEST(VulkanAPITest, clone_success) { } } -TEST(VulkanAPITest, clone_invalidinputs_exceptions) { +TEST_F(VulkanAPITest, clone_invalidinputs_exceptions) { // Act: Vulkan supports Preserve and Contiguous memory foramts EXPECT_THROW({ clone_test({2, 3, 5, 161}, c10::MemoryFormat::ChannelsLast); @@ -2786,7 +3025,7 @@ class MobileNetV2 final : public OpsList { } }; -TEST(VulkanAPITest, mobilenetv2) { +TEST_F(VulkanAPITest, mobilenetv2) { if (!at::is_vulkan_available()) { return; } @@ -2805,6 +3044,453 @@ TEST(VulkanAPITest, mobilenetv2) { ASSERT_TRUE(check); } +TEST_F(VulkanAPITest, gru_mclareninputs_success) { + // Guard + if 
(!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + + // Act + const auto out_cpu = at::gru(in_cpu, h0_cpu, + { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0], weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + // weights/biases should be always on CPU. 
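For reference, the flat parameter list handed to at::gru in these tests follows the usual per-layer grouping: with has_biases == true, layer l contributes {weight_ih_l[l], weight_hh_l[l], bias_ih_l[l], bias_hh_l[l]}, in that order. A small helper sketching that flattening (illustrative only, not part of the patch):

#include <ATen/ATen.h>
#include <vector>

std::vector<at::Tensor> flatten_gru_params(
    const std::vector<at::Tensor>& w_ih,   // per layer: (3 * H_out, input width of that layer)
    const std::vector<at::Tensor>& w_hh,   // per layer: (3 * H_out, H_out)
    const std::vector<at::Tensor>& b_ih,   // per layer: (3 * H_out)
    const std::vector<at::Tensor>& b_hh) { // per layer: (3 * H_out)
  std::vector<at::Tensor> flat;
  for (size_t l = 0; l < w_ih.size(); ++l) {
    flat.push_back(w_ih[l]);  // order matters: ih weight, hh weight,
    flat.push_back(w_hh[l]);  // then ih bias, hh bias, layer by layer
    flat.push_back(b_ih[l]);
    flat.push_back(b_hh[l]);
  }
  return flat;
}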
+ const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + auto cpu_output = std::get<0>(out_cpu); + auto cpu_hidden = std::get<1>(out_cpu); + auto vulkan_output = std::get<0>(out_vulkan); + auto vulkan_hidden = std::get<1>(out_vulkan); + + // Assert + const auto check_output = almostEqual(cpu_output, vulkan_output.cpu()); + if (!check_output) { + showRtol(cpu_output, vulkan_output.cpu()); + } + ASSERT_TRUE(check_output); + + const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu()); + if (!check_hidden) { + showRtol(cpu_hidden, vulkan_hidden.cpu()); + } + ASSERT_TRUE(check_hidden); +} + +TEST_F(VulkanAPITest, gru_invalidinputs_exceptions) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. 
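The recurring comment above boils down to one line: wrap the inference calls in a c10::InferenceMode guard so the dispatcher takes the backend kernel path instead of tripping the CompositeImplicitAutograd/AutogradOther ambiguity error quoted there. A minimal standalone use of the guard (the tensor math here is arbitrary):

#include <torch/torch.h>

int main() {
  torch::Tensor x = torch::rand({1, 1, 4});
  {
    c10::InferenceMode guard;   // inference-only dispatch inside this scope
    torch::Tensor y = x * 2;    // y is created as an inference tensor
    TORCH_CHECK(y.is_inference());
  }                             // normal autograd behavior resumes here
}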
+ c10::InferenceMode mode; + + // Act: incorrect # of weights/biases + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: non-3D input tensor + EXPECT_THROW({ + const auto in_cpu_2d = at::rand({1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + at::gru(in_cpu_2d.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: non-3D hidden tensor + EXPECT_THROW({ + const auto h0_cpu_2d = at::rand({num_layers, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + at::gru(in_cpu.vulkan(), h0_cpu_2d.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: has_biases should be true + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + false, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: train should be false + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, true, bidirectional, batch_first); + }, ::c10::Error); + + // Act: bidirectional should be false + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, true, batch_first); + }, ::c10::Error); + + // Act: batch_first should be true + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, false); + }, ::c10::Error); + + // Act: dropout should be 0.0 + EXPECT_THROW({ + at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, 1.0, train, bidirectional, batch_first); + }, ::c10::Error); +} + +TEST_F(VulkanAPITest, gru_prepack_success) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + 
c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + + // Act + const auto out_cpu = at::gru(in_cpu, h0_cpu, + { weight_ih_l[0], weight_hh_l[0], bias_ih_l[0], bias_hh_l[0], weight_ih_l[1], weight_hh_l[1], bias_ih_l[1], bias_hh_l[1] }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + auto out_vulkan = callOpByName( + "vulkan_prepack::gru_run", + "", + in_cpu.vulkan(), h0_cpu.vulkan(), prepack[0]); + + auto cpu_output = std::get<0>(out_cpu); + auto cpu_hidden = std::get<1>(out_cpu); + auto vulkan_output = out_vulkan[0].toTensor(); + auto vulkan_hidden = out_vulkan[1].toTensor(); + + // Assert + const auto check_output = almostEqual(cpu_output, vulkan_output.cpu()); + if (!check_output) { + showRtol(cpu_output, vulkan_output.cpu()); + } + ASSERT_TRUE(check_output); + + const auto check_hidden = almostEqual(cpu_hidden, vulkan_hidden.cpu()); + if (!check_hidden) { + showRtol(cpu_hidden, vulkan_hidden.cpu()); + } + ASSERT_TRUE(check_hidden); +} + +TEST_F(VulkanAPITest, gru_prepack_invalidinputs_exceptions) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = 384; // input_size + const int H_out = 384; // hidden_size + const int num_layers = 2; + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + 
c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + + // Act: incorrect # of weights/biases + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: non-3D input tensor + EXPECT_THROW({ + const auto in_cpu_2d = at::rand({1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + auto out_vulkan = callOpByName( + "vulkan_prepack::gru_run", + "", + in_cpu_2d.vulkan(), h0_cpu.vulkan(), prepack[0]); + }, ::c10::Error); + + // Act: non-3D hidden tensor + EXPECT_THROW({ + const auto h0_cpu_2d = at::rand({num_layers, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + auto out_vulkan = callOpByName( + "vulkan_prepack::gru_run", + "", + in_cpu.vulkan(), h0_cpu_2d.vulkan(), prepack[0]); + }, ::c10::Error); + + // Act: has_biases should be true + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + false, num_layers, gru_dropout, train, bidirectional, batch_first); + }, ::c10::Error); + + // Act: train should be false + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), 
weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, true, bidirectional, batch_first); + }, ::c10::Error); + + // Act: bidirectional should be false + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, true, batch_first); + }, ::c10::Error); + + // Act: batch_first should be true + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, gru_dropout, train, bidirectional, false); + }, ::c10::Error); + + // Act: dropout should be 0.0 + EXPECT_THROW({ + auto prepack = callOpByName( + "vulkan_prepack::gru_prepack", + "", + std::vector<at::Tensor>({ weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }), + has_biases, num_layers, 1.0, train, bidirectional, batch_first); + }, ::c10::Error); +} + +#if defined (__ANDROID__) // to avoid `Undefined symbols for architecture arm64` error +TEST_F(VulkanAPITest, profiling_invalidinputs_exceptions) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Act: The device doesn't support timestamps on all graphics and compute queues. + EXPECT_THROW({ + const bool is_timestamps_supported_ = false; + const float timestamp_period = 1.f; + at::native::vulkan::api::QueryPool querypool(at::native::vulkan::api::context()->gpu().device, is_timestamps_supported_, timestamp_period); + querypool.enable(); + }, ::c10::Error); + + // Act: The query pool already exists. + EXPECT_THROW({ + auto context = at::native::vulkan::api::context(); + at::native::vulkan::api::QueryPool querypool( + context->gpu().device, + context->gpu().adapter->timestamp_compute_and_graphics(), + context->gpu().adapter->timestamp_period()); + querypool.enable(); + querypool.enable(); // already enabled + }, ::c10::Error); + + // Act: The query index cannot exceed Configuration::kMaxQueryCount.
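+  // (Editorial note: the loop below profiles kMaxQueryCount + 1 ops, so the
+  // pool runs out of timestamp query slots and QueryPool is expected to
+  // raise a c10::Error.)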
+ EXPECT_THROW({ + auto context = at::native::vulkan::api::context(); + at::native::vulkan::api::QueryPool querypool( + context->gpu().device, + context->gpu().adapter->timestamp_compute_and_graphics(), + context->gpu().adapter->timestamp_period()); + querypool.enable(); + for (uint32_t i = 0u; i < at::native::vulkan::api::QueryPool::Configuration::kMaxQueryCount + 1u; ++i) { + at::native::vulkan::api::Command::Buffer& command_buffer = context->command().pool.stream(); + { + at::native::vulkan::api::OpProfiler profiler(command_buffer, querypool, "test"); + } + context->command().pool.submit(context->gpu().queue, command_buffer); + } + }, ::c10::Error); +} + +// NOTE: Keep the following test at the end of file +// so that it can print out the op execution time for all prior tests +TEST_F(VulkanAPITest, profiling_result_success) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + auto is_enabled = at::native::vulkan::api::context()->querypool().is_enabled(); + if (is_enabled) { + auto perf_info = at::native::vulkan::api::context()->querypool().disable(false); + std::cout + << "-----------------------------------------------------------------------------------------" << std::endl + << "Query Name Execution Start End" << std::endl + << "-----------------------------------------------------------------------------------------" << std::endl; + for (size_t i = 0; i < perf_info.size(); i++) { + std::cout << std::left << std::setw(35) << perf_info[i].query_name.c_str() + << std::right << std::setw(15) << perf_info[i].execution_time_us << " us" + << std::setw(15) << perf_info[i].start_time_us << " us" + << std::setw(15) << perf_info[i].end_time_us << " us" << std::left << std::endl; + } + } + at::native::vulkan::api::context()->querypool().enable(); + const auto in_cpu1 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu2 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu3 = at::rand({2, 4, 221, 193}, at::device(at::kCPU).dtype(at::kFloat)); + const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 1); + out_vulkan.cpu(); // to make sure all GPU operations are done + + // Act + auto perf_info = at::native::vulkan::api::context()->querypool().disable(true); + for (size_t i = 0; i < perf_info.size(); i++) { + std::cout << std::left << std::setw(35) << perf_info[i].query_name.c_str() + << std::right << std::setw(15) << perf_info[i].execution_time_us << " us" + << std::setw(15) << perf_info[i].start_time_us << " us" + << std::setw(15) << perf_info[i].end_time_us << " us" << std::left << std::endl; + } + + // Assert + ASSERT_TRUE(perf_info.size() == 5u); + ASSERT_TRUE(perf_info[0].query_name == "aten::_cat (cat_feature_mult4ch)"); + + if (is_enabled) { + at::native::vulkan::api::context()->querypool().enable(); + } +} +#endif + } // namespace #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/vulkan_perf_test.cpp b/aten/src/ATen/test/vulkan_perf_test.cpp index fa6b303eead5..230484a0f915 100644 --- a/aten/src/ATen/test/vulkan_perf_test.cpp +++ b/aten/src/ATen/test/vulkan_perf_test.cpp @@ -7,7 +7,8 @@ namespace { -static void cat_op_channel_perf(benchmark::State& state) { +// using Vulkan Timestamp Queries for the pure GPU execution time only +static void cat_op_channel_perf_gpu_only(benchmark::State& state) { // Guard if (!at::is_vulkan_available()) { return; @@ -25,12 +26,64 @@ static void cat_op_channel_perf(benchmark::State& state) { const auto in_vulkan2 = 
in_cpu2.vulkan(); const auto in_vulkan3 = in_cpu3.vulkan(); + // Act + for (auto _ : state) { + at::native::vulkan::api::context()->querypool().enable(); + const auto vulkan_out = at::cat({in_vulkan1, in_vulkan2, in_vulkan3}, 1); + vulkan_out.cpu(); + auto perf_info = at::native::vulkan::api::context()->querypool().disable(true); + state.SetIterationTime(perf_info[0].execution_time_us / 1'000'000.); // us to sec + } +} + +static void gru_op_perf(benchmark::State& state) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = static_cast<int>(state.range(0)); // input_size + const int H_out = static_cast<int>(state.range(1)); // hidden_size + const int num_layers = static_cast<int>(state.range(2)); + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List<at::Tensor> weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List<at::Tensor> weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List<at::Tensor> bias_ih_l; // shape (3 * hidden_size) + c10::List<at::Tensor> bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference instead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + // Act while (state.KeepRunning()) { - const auto out_vulkan = at::cat({in_vulkan1, in_vulkan2, in_vulkan3}, 1); + // weights/biases should be always on CPU.
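+    // (Editorial note: only the input and hidden state are moved to Vulkan
+    // below; the per-layer weights/biases are passed as CPU tensors, matching
+    // the VulkanAPITest GRU tests above.)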
+ const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + auto vulkan_output = std::get<0>(out_vulkan); + auto vulkan_hidden = std::get<1>(out_vulkan); // to avoid out-of-memory issues, release resources by waiting and flushing all GPU operations - at::native::vulkan::api::context()->wait(out_vulkan); + at::native::vulkan::api::context()->wait(vulkan_output); + at::native::vulkan::api::context()->wait(vulkan_hidden); at::native::vulkan::api::context()->flush(); } } @@ -42,12 +95,14 @@ static void CommonBenchmarkSettings(benchmark::internal::Benchmark* b) { } // namespace -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({3, 40, 221, 193}); // big multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({3, 20, 221, 193}); // big multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({3, 39, 221, 193}); // big non-multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 4, 221, 193}); // small multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 3, 221, 193}); // small non-multiple of 4 channels -BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(3)->Iterations(1000)->Args({3, 40, 221, 193}); // big multiple of 4 channels (multi-thread) +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 40, 221, 193}); // big multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 20, 221, 193}); // big multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 39, 221, 193}); // big non-multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 4, 221, 193}); // small multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(1)->Iterations(100)->Args({3, 3, 221, 193}); // small non-multiple of 4 channels +BENCHMARK(cat_op_channel_perf_gpu_only)->Apply(CommonBenchmarkSettings)->UseManualTime()->Threads(3)->Iterations(100)->Args({3, 40, 221, 193}); // big multiple of 4 channels (multi-thread) +BENCHMARK(gru_op_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(100)->Args({384, 384, 2}); // McLaren Model inputs + BENCHMARK_MAIN(); #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp deleted file mode 100644 index 09c98fa214c1..000000000000 --- a/aten/src/ATen/test/vulkan_test.cpp +++ /dev/null @@ -1,950 +0,0 @@ -#ifndef USE_VULKAN_API - -#include - -#include -#include -#include -#include - -bool checkRtol(const at::Tensor& diff, const std::vector inputs) { - double maxValue = 0.0; - for (auto& tensor : inputs) { - maxValue = fmax(tensor.abs().max().item(), maxValue); - } - return diff.abs().max().item() < (0.01 + 2e-2 * maxValue); -} -bool 
almostEqual(const at::Tensor& a, const at::Tensor& b) { - return checkRtol(a - b, {a, b}); -} - -bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) { - return (a - b).abs().max().item() == 0.f; -} - -TEST(VulkanTest, ToVulkanToCpu) { - if (!at::is_vulkan_available()) - return; - auto t = - at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto tv = t.vulkan(); - ASSERT_TRUE(tv.options().device().type() == at::kVulkan); - auto t2 = tv.cpu(); - ASSERT_TRUE(t2.options().device().type() == at::kCPU); - ASSERT_TRUE(almostEqual(t2, t)); -} - -TEST(VulkanTest, upsampleNearest2D) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::upsample_nearest2d(t_in, {4, 6}); - auto tv_in = - t_in.to(at::TensorOptions{at::Device{at::kVulkan}}.dtype(at::kFloat)); - - auto tv_out = at::upsample_nearest2d(tv_in, {4, 6}); - auto t_out = - tv_out.to(at::TensorOptions{at::Device{at::kCPU}}.dtype(at::kFloat)); - - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, add) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({1, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = at::rand({1, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::add(t_in0, t_in1, 2); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - auto tv_out = at::add(tv_in0, tv_in1, 2); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, add_not4dim) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({1, 1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = at::rand({1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::add(t_in0, t_in1, 2); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - auto tv_out = at::add(tv_in0, tv_in1, 2); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, add_cpu_vulkan) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({2, 96, 1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = - at::rand({1, 2, 96, 1000}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::add(t_in0, t_in1, 2); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - - auto tv_out1 = at::add(tv_in0, t_in1, 2); - auto t_out1 = tv_out1.cpu(); - ASSERT_TRUE(almostEqual(t_out1, t_out_expected)); - - auto tv_out2 = at::add(t_in0, tv_in1, 2); - auto t_out2 = tv_out2.cpu(); - ASSERT_TRUE(almostEqual(t_out2, t_out_expected)); -} - -TEST(VulkanTest, add_) { - if (!at::is_vulkan_available()) - return; - auto t_in0 = at::rand({1, 2, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in1 = at::rand({1, 2, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto tv_in0 = t_in0.vulkan(); - auto tv_in1 = t_in1.vulkan(); - - t_in0.add_(t_in1, 2); - tv_in0.add_(tv_in1, 2); - auto t_out = tv_in0.cpu(); - bool check = almostEqual(t_out, t_in0); - if (!check) { - std::cout << "expected:\n" << t_in0 << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, mulScalar) { - if (!at::is_vulkan_available()) - return; - auto t_in = at::rand({3, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - const float other = 3.14; 
- auto t_out_expected = t_in.mul(other); - auto tv_in = t_in.vulkan(); - auto tv_out = tv_in.mul(other); - auto t_out = tv_out.cpu(); - - bool check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, addScalar) { - if (!at::is_vulkan_available()) - return; - auto t_in = at::rand({3, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - const float other = 3.14; - const float alpha = 2; - auto t_out_expected = t_in.add(other, alpha); - auto tv_in = t_in.vulkan(); - auto tv_out = tv_in.add(other, alpha); - auto t_out = tv_out.cpu(); - - bool check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, conv2d) { - if (!at::is_vulkan_available()) - return; - auto OC = 2; - auto C = 3; - int64_t H = 3; - int64_t W = 3; - int64_t KH = 2; - int64_t KW = 2; - auto t_in = at::rand({1, C, H, W}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_w = at::rand({OC, C, KH, KW}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::zeros({OC}, at::device(at::kCPU).dtype(at::kFloat)); - int64_t groups = 1; - std::vector stride{1, 1}; - std::vector padding{0, 0}; - std::vector dilation{1, 1}; - - auto t_out_expected = - at::conv2d(t_in, t_w, t_b, stride, padding, dilation, groups); - auto tv_in = t_in.vulkan(); - auto tv_out = at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, conv2dDWWeightsOnCPU) { - if (!at::is_vulkan_available()) - return; - auto C = 3; - int64_t groups = C; - int64_t H = 3; - int64_t W = 3; - int64_t KH = 2; - int64_t KW = 2; - auto t_in = at::rand({1, C, H, W}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_w = - at::rand({groups, 1, KH, KW}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::zeros({groups}, at::device(at::kCPU).dtype(at::kFloat)); - std::vector stride{1, 1}; - std::vector padding{0, 0}; - std::vector dilation{1, 1}; - auto t_out_expected = - at::conv2d(t_in, t_w, t_b, stride, padding, dilation, groups); - auto tv_in = t_in.vulkan(); - auto tv_out = at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, addmm) { - if (!at::is_vulkan_available()) - return; - auto t_m1 = at::rand({2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_m2 = at::rand({2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::rand({2, 3}, at::device(at::kCPU).dtype(at::kFloat)); - - float beta = 100; - float alpha = 2; - auto t_out_expected = at::addmm(t_b, t_m1, t_m2, beta, alpha); - - auto tv_m1 = t_m1.vulkan(); - auto tv_m2 = t_m2.vulkan(); - auto tv_b = t_b.vulkan(); - auto tv_out = at::addmm(tv_b, tv_m1, tv_m2, beta, 
alpha); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, mm) { - if (!at::is_vulkan_available()) - return; - auto t_m1 = at::rand({10, 20}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_m2 = at::rand({20, 30}, at::device(at::kCPU).dtype(at::kFloat)); - - auto t_out_expected = t_m1.mm(t_m2); - - auto tv_m1 = t_m1.vulkan(); - auto tv_m2 = t_m2.vulkan(); - auto tv_out = tv_m1.mm(tv_m2); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, clamp) { - if (!at::is_vulkan_available()) - return; - float min = -0.5; - float max = 0.5; - auto t_in = at::rand({1, 3, 16, 16}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::clamp(t_in, min, max); - - auto tv_in = t_in.vulkan(); - auto tv_out = at::clamp(tv_in, min, max); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, hardtanh_) { - if (!at::is_vulkan_available()) - return; - float min = -0.5; - float max = 0.5; - auto t_in = at::rand({1, 3, 16, 16}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::hardtanh_(t_in, min, max); - - auto tv_in = t_in.vulkan(); - auto tv_out = at::hardtanh_(tv_in, min, max); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -TEST(VulkanTest, relu_) { - if (!at::is_vulkan_available()) - return; - auto t = at::empty({1, 2, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_in = t.uniform_(-1, 1); - auto tv_in = t_in.vulkan(); - - t_in.relu_(); - tv_in.relu_(); - auto tv_out = tv_in.cpu(); - bool check = almostEqual(t_in, tv_out); - if (!check) { - std::cout << "expected:\n" << t_in << std::endl; - std::cout << "got:\n" << tv_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, mean) { - if (!at::is_vulkan_available()) - return; - auto t_in = at::rand({2, 3, 3, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::mean(t_in, {2, 3}, false); - auto tv_in = t_in.vulkan(); - auto tv_out = at::mean(tv_in, {2, 3}, false); - auto t_out = tv_out.cpu(); - bool check = almostEqual(t_out_expected, t_out); - if (!check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -enum class OpType { conv2d, hardtanh_, mean, addmm }; - -class BaseOp { - public: - BaseOp(OpType t) : type(t) {} - virtual ~BaseOp() = default; - virtual at::Tensor run(at::Tensor&) = 0; - virtual std::string toString() = 0; - OpType type; -}; - -class Hardtanh_ : public BaseOp { - public: - Hardtanh_() : BaseOp(OpType::hardtanh_) {} - at::Tensor run(at::Tensor& t) override { - return at::hardtanh_(t, 0, 6); - } - std::string toString() override { - return "hardtanh_"; - } -}; - -class Mean : public BaseOp { - public: - Mean() : BaseOp(OpType::mean) {} - at::Tensor run(at::Tensor& t) override { - return at::mean(t, {2, 3}, false); - } - std::string toString() override { - return "mean"; - } -}; - -class Addmm : public BaseOp { - public: - Addmm(int64_t m1H, int64_t m1W, int64_t m2W, float _beta, float _alpha) - : BaseOp(OpType::addmm), beta(_beta), alpha(_alpha) { - m2 = at::rand( - c10::IntArrayRef({m1W, 
m2W}), at::device(at::kCPU).dtype(at::kFloat)); - m2v = m2.vulkan(); - b = at::rand( - c10::IntArrayRef({m1H, m2W}), at::device(at::kCPU).dtype(at::kFloat)); - bv = b.vulkan(); - } - - at::Tensor run(at::Tensor& t) override { - if (t.is_vulkan()) { - return at::addmm(bv, t, m2v, beta, alpha); - } - return at::addmm(b, t, m2, beta, alpha); - } - - std::string toString() override { - return "addmm"; - } - - at::Tensor m2; - at::Tensor m2v; - at::Tensor b; - at::Tensor bv; - float beta; - float alpha; -}; - -class Conv2d : public BaseOp { - public: - Conv2d(c10::IntArrayRef wsizes, int64_t g, int64_t s, int64_t p) - : BaseOp(OpType::conv2d), stride(s), padding(p), groups(g) { - w = at::rand(wsizes, at::device(at::kCPU).dtype(at::kFloat)); - b = at::zeros(wsizes[0], at::device(at::kCPU).dtype(at::kFloat)); - }; - - at::Tensor run(at::Tensor& t) override { - return at::conv2d(t, w, b, {stride}, {padding}, {1}, groups); - } - std::string toString() override { - return "conv2d"; - } - - int64_t stride; - int64_t padding; - int64_t groups; - at::Tensor w; - at::Tensor b; -}; - -class OpsList { - public: - OpsList() = default; - OpsList(std::vector>& _ops) : ops(std::move(_ops)) {} - - auto runDual(at::Tensor& in, at::Tensor& vin) { - at::Tensor t = in; - at::Tensor tv = vin; - int i = 0; - for (const auto& op : ops) { - t = op->run(t); - tv = op->run(tv); - auto tv_cpu = t.cpu(); - TORCH_INTERNAL_ASSERT( - almostEqual(t, tv_cpu), - "Not almost equal cpu vs vulkan op i:", - i, - " ", - op->toString()); - i++; - } - return std::make_pair(t, tv); - } - - auto run(at::Tensor& in) { - at::Tensor t = in; - int i = 0; - for (const auto& op : ops) { - t = op->run(t); - i++; - } - return t; - } - - std::vector> ops; -}; - -class MobileNetV2 : public OpsList { - public: - MobileNetV2() { - ops.emplace_back(new Conv2d({32, 3, 3, 3}, 1, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 1, 3, 3}, 32, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({16, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({96, 16, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 1, 3, 3}, 96, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({24, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({24, 144, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 144, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 192, 1, 1}, 1, 1, 0)); - 
ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 384, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({160, 576, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({320, 960, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({1280, 320, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Mean()); - ops.emplace_back(new Addmm(1, 1280, 1000, 0, 1)); - } -}; - -TEST(VulkanTest, DISABLED_mobilenetv2) { - if (!at::is_vulkan_available()) - return; - - MobileNetV2 mn2{}; - auto t_in = - at::rand({1, 3, 224, 224}, at::device(at::kCPU).dtype(at::kFloat)); - auto tv_in = t_in.vulkan(); - mn2.runDual(t_in, tv_in); -} - -TEST(VulkanTest, OpsList) { - if (!at::is_vulkan_available()) - return; - - std::vector> ops; - ops.emplace_back(new Conv2d({32, 3, 3, 3}, 1, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({32, 1, 3, 3}, 32, 1, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({16, 32, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Conv2d({96, 16, 1, 1}, 1, 1, 0)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({96, 1, 3, 3}, 96, 2, 1)); - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Conv2d({24, 96, 1, 1}, 1, 
1, 0)); - ops.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); // 1, 144, 56, 56 - ops.emplace_back(new Hardtanh_()); - ops.emplace_back(new Mean()); - ops.emplace_back(new Addmm(1, 144, 1000, 0, 1)); - OpsList opsList(ops); - auto t_in = - at::rand({1, 3, 224, 224}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = opsList.run(t_in); - - auto tv_in = t_in.vulkan(); - - auto tv_out = opsList.run(t_in); - auto t_out = tv_out.cpu(); - - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); -} - -template -inline std::vector makeStack(Inputs&&... inputs) { - return {std::forward(inputs)...}; -} - -template -inline std::vector callOpByHandle( - const c10::OperatorHandle& op, - Args... args) { - auto stack = makeStack(std::forward(args)...); - c10::Dispatcher::singleton().callBoxed(op, &stack); - return stack; -} - -template -inline std::vector callOpByName( - const char* func_name, - const char* overload_name, - Args... args) { - const c10::optional op_handle = - c10::Dispatcher::singleton().findSchema({func_name, overload_name}); - assert(op_handle.has_value()); - return callOpByHandle(op_handle.value(), std::forward(args)...); -} - -TEST(VulkanTest, conv2dPrepack) { - if (!at::is_vulkan_available()) - return; - auto OC = 2; - auto C = 3; - int64_t groups = 1; - auto t_in = at::rand({1, C, 3, 3}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_w = at::rand({OC, C, 2, 2}, at::device(at::kCPU).dtype(at::kFloat)); - auto t_b = at::zeros({OC}, at::device(at::kCPU).dtype(at::kFloat)); - - std::vector stride{1, 1}; - std::vector padding{0, 0}; - std::vector dilation{1, 1}; - float output_min = 0.25; - float output_max = 1.0; - - auto t_out_conv2d = - at::conv2d(t_in, t_w, t_b, stride, padding, dilation, groups); - auto t_out_expected = at::clamp(t_out_conv2d, output_min, output_max); - - auto tv_in = t_in.vulkan(); - auto tv_out_conv2d = - at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); - auto tv_out = at::clamp(tv_out_conv2d, output_min, output_max); - - auto t_out = tv_out.cpu(); - bool no_prepack_check = almostEqual(t_out, t_out_expected); - if (!no_prepack_check) { - std::cout << "t_out_expected:\n" << t_out_expected << std::endl; - std::cout << "t_out:\n" << t_out << std::endl; - } - ASSERT_TRUE(no_prepack_check); - - auto prepack = callOpByName( - "vulkan_prepack::conv2d_clamp_prepack", - "", - t_w, - t_b, - stride, - padding, - dilation, - groups, - output_min, - output_max); - auto tv_out_prepack_ivalues = - callOpByName("vulkan_prepack::conv2d_clamp_run", "", tv_in, prepack[0]); - auto tv_out_prepack = tv_out_prepack_ivalues[0].toTensor(); - auto t_out_prepack = tv_out_prepack.cpu(); - const auto prepack_check = almostEqual(t_out_prepack, t_out_expected); - if (!prepack_check) { - std::cout << "expected:\n" << t_out_expected << std::endl; - std::cout << "got:\n" << t_out_prepack << std::endl; - } - ASSERT_TRUE(prepack_check); -} - -TEST(VulkanTest, adaptive_avg_pool2d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 2, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::adaptive_avg_pool2d(t_in, {3, 3}); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::adaptive_avg_pool2d(tv_in, {3, 3}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -// TODO: Enable when view operator for Vulkan landed -TEST(VulkanTest, 
DISABLED_adaptive_avg_pool2d_2) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 1280, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::adaptive_avg_pool2d(t_in, {1, 1}); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::adaptive_avg_pool2d(tv_in, {1, 1}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, reshape) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 8, 1, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::reshape(t_in, {1, 8}); - auto tv_in = t_in.vulkan(); - auto tv_out = at::reshape(tv_in, {1, 8}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, reshape2) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 3, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::reshape(t_in, {2, 3, 1, 2}); - - auto tv_in = t_in.vulkan(); - auto tv_out = at::reshape(tv_in, {2, 3, 1, 2}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, tensor5d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({2, 2, 2, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto tv_in = t_in.vulkan(); -} - -TEST(VulkanTest, tensor5d_transpose) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 2, 3, 2, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.transpose(1, 2); - auto t_out = tv_in.transpose(1, 2).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, view) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({2, 4, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = t_in.view({2, 2, 2, 3, 3}); - auto tv_in = t_in.vulkan(); - auto tv_out = tv_in.view({2, 2, 2, 3, 3}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, slice) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.slice(1, 2, 4, 1); - auto t_out = 
tv_in.slice(1, 2, 4, 1).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, select) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.slice(1, 1); - auto t_out = tv_in.slice(1, 1).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, unsqueeze) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::empty({1, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - float* data = t_in.data_ptr(); - auto numel = t_in.numel(); - for (const auto i : c10::irange(numel)) { - // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) - data[i] = i; - } - - auto tv_in = t_in.vulkan(); - - auto t_out_expected = t_in.unsqueeze(1); - auto t_out = tv_in.unsqueeze(1).cpu(); - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, cat) { - if (!at::is_vulkan_available()) - return; - - auto t_in0 = - at::rand({1, 1, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_in1 = - at::rand({1, 2, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_in2 = - at::rand({1, 5, 3, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - - auto t_out_expected = at::cat({t_in0, t_in1, t_in2}, 1); - auto tv_out = at::cat({t_in0.vulkan(), t_in1.vulkan(), t_in2.vulkan()}, 1); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, DISABLED_max_pool2d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 3, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::max_pool2d(t_in, {2, 2}, {1}, {0}, {1}); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::max_pool2d(tv_in, {2, 2}, {1}, {0}, {1}); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -TEST(VulkanTest, avg_pool2d) { - if (!at::is_vulkan_available()) - return; - - auto t_in = - at::rand({1, 3, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::avg_pool2d(t_in, {2, 2}, {1}, {0}, true); - auto tv_in = t_in.vulkan(); - - auto tv_out = at::avg_pool2d(tv_in, {2, 2}, {1}, {0}, true); - auto t_out = tv_out.cpu(); - - const auto check = almostEqual(t_out, t_out_expected); - if (!check) { - std::cout << "expected:" << t_out_expected << std::endl; - std::cout << "got:" << t_out << std::endl; - } - ASSERT_TRUE(check); -} - -#endif /* USE_VULKAN_API */ diff --git 
a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh index 4a724fa94008..5b0c02c2846a 100755 --- a/aten/tools/run_tests.sh +++ b/aten/tools/run_tests.sh @@ -64,6 +64,9 @@ fi if [[ -x ./cuda_cub_test ]]; then ./cuda_cub_test fi +if [[ -x ./cuda_atomic_ops_test ]]; then + ./cuda_atomic_ops_test +fi if [ "$VALGRIND" == "ON" ]; then valgrind --suppressions="$VALGRIND_SUP" --error-exitcode=1 ./basic --gtest_filter='-*CUDA' if [[ -x ./tensor_interop_test ]]; then diff --git a/benchmarks/cpp/nvfuser/CMakeLists.txt b/benchmarks/cpp/nvfuser/CMakeLists.txt index b566e6a359e9..24809c9ed18a 100644 --- a/benchmarks/cpp/nvfuser/CMakeLists.txt +++ b/benchmarks/cpp/nvfuser/CMakeLists.txt @@ -1,7 +1,9 @@ if(USE_CUDA) add_executable(nvfuser_bench - batch_norm.cpp - batch_norm_backward.cpp + batch_norm_channels_first.cpp + batch_norm_channels_first_backward.cpp + batch_norm_channels_last.cpp + batch_norm_channels_last_backward.cpp bert.cpp broadcast.cpp gelu_backward.cpp @@ -10,11 +12,15 @@ if(USE_CUDA) instance_norm.cpp layer_norm.cpp layer_norm_backward.cpp + rms_norm.cpp + rms_norm_backward.cpp lstm_cell.cpp reduction.cpp softmax.cpp softmax_backward.cpp scale_bias_relu.cpp + transpose.cpp + timm.cpp utils.cpp main.cpp) diff --git a/benchmarks/cpp/nvfuser/batch_norm.cpp b/benchmarks/cpp/nvfuser/batch_norm.cpp deleted file mode 100644 index ef6bdd667d66..000000000000 --- a/benchmarks/cpp/nvfuser/batch_norm.cpp +++ /dev/null @@ -1,252 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include "utils.h" - -using namespace torch::jit::fuser::cuda; - -//------------------------------------------------------------------------------ - -static void setupBatchNorm(Fusion* fusion, DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - FusionGuard fg(fusion); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - // setup fusion - auto input = makeContigTensor(4, dtype); - auto weight = makeContigTensor(1, dtype); - auto bias = makeContigTensor(1, dtype); - auto running_mean = makeContigTensor(1, DataType::Float); - auto running_var = makeContigTensor(1, DataType::Float); - - fusion->addInput(input); - fusion->addInput(weight); - fusion->addInput(bias); - fusion->addInput(running_mean); - fusion->addInput(running_var); - - if (dtype == DataType::Half) { - input = castOp(DataType::Float, input); - weight = castOp(DataType::Float, weight); - bias = castOp(DataType::Float, bias); - } - - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); - - auto result = batch_norm( - input, - weight, - bias, - running_mean, - running_var, - kTraining, - momentum_ptr, - eps_ptr); - - auto output = result.output; - - if (dtype == DataType::Half) { - output = castOp(DataType::Half, output); - } - - fusion->addOutput(output); -} - -static void NvFuserScheduler_BatchNorm( - benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - // inputs - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); - std::vector aten_inputs( - {at_x, at_weight, at_bias, at_run_mean, at_run_var}); - - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); - - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * - int64_t(dataTypeSize(dtype)) + - (2 * (at_run_mean.numel() + at_run_var.numel()) * - int64_t(dataTypeSize(DataType::Float))))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm( - benchmark::State& benchmark_state, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const float kMomentum = 0.1; - const float kEps = 1e-5; - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - // inputs - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); - - auto ato_weight = c10::optional(at_weight); - auto ato_bias = c10::optional(at_bias); - auto ato_run_mean = c10::optional(at_run_mean); - auto ato_run_var = c10::optional(at_run_var); - - auto output = at::batch_norm( - at_x, - ato_weight, - ato_bias, - ato_run_mean, - ato_run_var, - true, - kMomentum, - kEps, - true); - - clearL2Cache(); - cudaDeviceSynchronize(); - for (auto _ : benchmark_state) { - CudaKernelTimer timer; - auto output = at::batch_norm( - at_x, - ato_weight, - ato_bias, - ato_run_mean, - ato_run_var, - true, - kMomentum, - kEps, - true); - benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); - cudaDeviceSynchronize(); - clearL2Cache(); - cudaDeviceSynchronize(); - } - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * - int64_t(dataTypeSize(dtype)) + - (2 * (at_run_mean.numel() + at_run_var.numel()) * - int64_t(dataTypeSize(DataType::Float))))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm_cuDNN_fp32(benchmark::State& benchmark_state) { - Baseline_BatchNorm(benchmark_state, DataType::Float); -} - -static void Baseline_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { - Baseline_BatchNorm(benchmark_state, DataType::Half); -} - -//------------------------------------------------------------------------------ - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_fp32, - setupBatchNorm, - NvFuserScheduler_BatchNorm, - DataType::Float); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - 
-NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_fp16, - setupBatchNorm, - NvFuserScheduler_BatchNorm, - DataType::Half); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -//------------------------------------------------------------------------------ - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) - // ->RangeMultiplier(2) - // cuDNN didn't make it to 1024 - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_backward.cpp deleted file mode 100644 index e4a9fdcb0340..000000000000 --- a/benchmarks/cpp/nvfuser/batch_norm_backward.cpp +++ /dev/null @@ -1,276 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include - -#include "utils.h" - -using namespace torch::jit::fuser::cuda; - -//------------------------------------------------------------------------------ - -static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - FusionGuard fg(fusion); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - // setup fusion - auto input = makeContigTensor(4, dtype); - auto grad_output = makeContigTensor(4, dtype); - auto weight = makeContigTensor(1, DataType::Float); - auto running_mean = makeContigTensor(1, DataType::Float); - auto running_var = makeContigTensor(1, DataType::Float); - auto save_mean = makeContigTensor(1, DataType::Float); - auto save_var = makeContigTensor(1, DataType::Float); - - fusion->addInput(input); - fusion->addInput(grad_output); - fusion->addInput(weight); - fusion->addInput(running_mean); - fusion->addInput(running_var); - fusion->addInput(save_mean); - fusion->addInput(save_var); - - if (dtype == DataType::Half) { - input = castOp(DataType::Float, input); - grad_output = castOp(DataType::Float, grad_output); - } - - auto eps_ptr = new Double(kEps); - - auto result = batch_norm_backward( - input, - grad_output, - weight, - running_mean, - running_var, - save_mean, - save_var, - kTraining, - eps_ptr, - std::vector(3, true)); - - auto grad_input = result.grad_input; - auto grad_weight = result.grad_weight; - auto grad_bias = result.grad_bias; - - if (dtype == DataType::Half) { - grad_input = castOp(DataType::Half, grad_input); - grad_weight = castOp(DataType::Half, grad_weight); - grad_bias = castOp(DataType::Half, grad_bias); - } - - fusion->addOutput(grad_input); - 
fusion->addOutput(grad_weight); - fusion->addOutput(grad_bias); -} - -static void NvFuserScheduler_BatchNorm_BWD( - benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const bool kTraining = true; - const float kEps = 1e-5; - - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn(input_shape, options); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor weight = at::ones({input_shape[1]}, fp32_options); - at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); - at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); - - std::vector aten_inputs( - {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); - - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); - - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + - (run_mean.numel() + run_var.numel() + save_mean.numel() + - save_var.numel() + weight.numel()) * - int64_t(dataTypeSize(DataType::Float)))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm_BWD( - benchmark::State& benchmark_state, - DataType dtype) { - TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); - - const float kMomentum = 0.1; - const float kEps = 1e-5; - std::vector input_shape{ - benchmark_state.range(0), - benchmark_state.range(1), - benchmark_state.range(2), - benchmark_state.range(2)}; - - at::manual_seed(0); - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - auto fp32_options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn(input_shape, options); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor weight = at::ones({input_shape[1]}, fp32_options); - at::Tensor bias = at::zeros({input_shape[1]}, fp32_options); - at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); - at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); - - auto ato_weight = c10::optional(weight); - auto ato_bias = c10::optional(bias); - auto ato_run_mean = c10::optional(run_mean); - auto ato_run_var = c10::optional(run_var); - auto ato_save_mean = c10::optional(save_mean); - auto ato_save_var = c10::optional(save_var); - - auto fwd_result = at::_ops::_batch_norm_impl_index::call( - input, - ato_weight, - ato_bias, - ato_run_mean, - ato_run_var, - true, - kMomentum, - kEps, - true); - cudaDeviceSynchronize(); - - // Sync everything up before we start - clearL2Cache(); - cudaDeviceSynchronize(); - for (auto _ : benchmark_state) { - CudaKernelTimer timer; - - at::_ops::cudnn_batch_norm_backward::call( - input, - grad_out, - weight, - ato_run_mean, - ato_run_var, - save_mean, - save_var, - kEps, - std::get<3>(fwd_result)); - - 
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); - cudaDeviceSynchronize(); - clearL2Cache(); - cudaDeviceSynchronize(); - } - - benchmark_state.SetBytesProcessed( - int64_t(benchmark_state.iterations()) * - (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + - (run_mean.numel() + run_var.numel() + save_mean.numel() + - save_var.numel() + weight.numel()) * - int64_t(dataTypeSize(DataType::Float)))); -} - -//------------------------------------------------------------------------------ - -static void Baseline_BatchNorm_BWD_cuDNN_fp32( - benchmark::State& benchmark_state) { - Baseline_BatchNorm_BWD(benchmark_state, DataType::Float); -} - -static void Baseline_BatchNorm_BWD_cuDNN_fp16( - benchmark::State& benchmark_state) { - Baseline_BatchNorm_BWD(benchmark_state, DataType::Half); -} - -//------------------------------------------------------------------------------ - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_BWD_fp32, - setupBatchNorm_BWD, - NvFuserScheduler_BatchNorm_BWD, - DataType::Float); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_DEFINE( - NvFuserScheduler_BatchNorm_BWD_fp16, - setupBatchNorm_BWD, - NvFuserScheduler_BatchNorm_BWD, - DataType::Half); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -//------------------------------------------------------------------------------ - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32) - // ->RangeMultiplier(2) - // cuDNN didn't make it to 1024 - ->Ranges({{64, 512}, {32, 128}, {2, 64}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{64, 512}, {32, 128}, {2, 128}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); - -BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16) - // ->RangeMultiplier(2) - ->Ranges({{2, 64}, {2, 32}, {2, 256}}) - ->Unit(benchmark::kMicrosecond) - ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp new file mode 100644 index 000000000000..723d222516df --- /dev/null +++ b/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp @@ -0,0 +1,339 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupBatchNorm(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + FusionGuard fg(fusion); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + // setup fusion + auto input = makeContigTensor(4, 
dtype); + auto weight = makeContigTensor(1, dtype); + auto bias = makeContigTensor(1, dtype); + auto running_mean = makeContigTensor(1, DataType::Float); + auto running_var = makeContigTensor(1, DataType::Float); + + fusion->addInput(input); + fusion->addInput(weight); + fusion->addInput(bias); + fusion->addInput(running_mean); + fusion->addInput(running_var); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + bias = castOp(DataType::Float, bias); + } + + auto momentum_ptr = IrBuilder::create(kMomentum); + auto eps_ptr = IrBuilder::create(kEps); + + auto result = batch_norm( + input, + weight, + bias, + running_mean, + running_var, + kTraining, + momentum_ptr, + eps_ptr); + + auto output = result.output; + + if (dtype == DataType::Half) { + output = castOp(DataType::Half, output); + } + + fusion->addOutput(output); +} + +static void NvFuserScheduler_BatchNorm( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + at::Tensor at_weight = at::ones({input_shape[1]}, options); + at::Tensor at_bias = at::zeros({input_shape[1]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); + std::vector aten_inputs( + {at_x, at_weight, at_bias, at_run_mean, at_run_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + at::Tensor at_weight = at::ones({input_shape[1]}, options); + at::Tensor at_bias = at::zeros({input_shape[1]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = c10::optional(at_weight); + auto ato_bias = c10::optional(at_bias); + auto ato_run_mean = c10::optional(at_run_mean); + auto ato_run_var = c10::optional(at_run_var); + + auto output = at::batch_norm( + at_x, + ato_weight, + 
ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + + clearL2Cache(); + cudaDeviceSynchronize(); + for (auto _ : benchmark_state) { + CudaKernelTimer timer; + auto output = at::batch_norm( + at_x, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + cudaDeviceSynchronize(); + clearL2Cache(); + cudaDeviceSynchronize(); + } + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_cuDNN_fp32(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Float); +} + +static void Baseline_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Half); +} + +// Simple aliases just for names in the printed output +static void Baseline_ResNet_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Half); +} + +static void Baseline_ResNext_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm(benchmark_state, DataType::Half); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_fp32, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_fp16, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) + // ->RangeMultiplier(2) + // cuDNN didn't make it to 1024 + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ +// RESNET and REXNEXT benchmarks + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNet_BatchNorm_fp16, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + 
DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNext_BatchNorm_fp16, + setupBatchNorm, + NvFuserScheduler_BatchNorm, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_ResNet_BatchNorm_cuDNN_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_ResNext_BatchNorm_cuDNN_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp new file mode 100644 index 000000000000..af2b4d145fc8 --- /dev/null +++ b/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp @@ -0,0 +1,362 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + FusionGuard fg(fusion); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + // setup fusion + auto input = makeContigTensor(4, dtype); + auto grad_output = makeContigTensor(4, dtype); + auto weight = makeContigTensor(1, DataType::Float); + auto running_mean = makeContigTensor(1, DataType::Float); + auto running_var = makeContigTensor(1, DataType::Float); + auto save_mean = makeContigTensor(1, DataType::Float); + auto save_var = makeContigTensor(1, DataType::Float); + + fusion->addInput(input); + fusion->addInput(grad_output); + fusion->addInput(weight); + fusion->addInput(running_mean); + fusion->addInput(running_var); + fusion->addInput(save_mean); + fusion->addInput(save_var); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, input); + grad_output = castOp(DataType::Float, grad_output); + } + + auto eps_ptr = IrBuilder::create(kEps); + + auto result = batch_norm_backward( + input, + grad_output, + weight, + running_mean, + running_var, + save_mean, + save_var, + kTraining, + 
eps_ptr, + std::vector(3, true)); + + auto grad_input = result.grad_input; + auto grad_weight = result.grad_weight; + auto grad_bias = result.grad_bias; + + if (dtype == DataType::Half) { + grad_input = castOp(DataType::Half, grad_input); + grad_weight = castOp(DataType::Half, grad_weight); + grad_bias = castOp(DataType::Half, grad_bias); + } + + fusion->addOutput(grad_input); + fusion->addOutput(grad_weight); + fusion->addOutput(grad_bias); +} + +static void NvFuserScheduler_BatchNorm_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor weight = at::ones({input_shape[1]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); + + std::vector aten_inputs( + {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + + (run_mean.numel() + run_var.numel() + save_mean.numel() + + save_var.numel() + weight.numel()) * + int64_t(dataTypeSize(DataType::Float)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_BWD( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor weight = at::ones({input_shape[1]}, fp32_options); + at::Tensor bias = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = c10::optional(weight); + auto ato_bias = c10::optional(bias); + auto ato_run_mean = c10::optional(run_mean); + auto ato_run_var = c10::optional(run_var); + auto ato_save_mean = c10::optional(save_mean); + auto ato_save_var = c10::optional(save_var); + + auto fwd_result = at::_ops::_batch_norm_impl_index::call( + input, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + 
true,
+      kMomentum,
+      kEps,
+      true);
+  cudaDeviceSynchronize();
+
+  // Sync everything up before we start
+  clearL2Cache();
+  cudaDeviceSynchronize();
+  for (auto _ : benchmark_state) {
+    CudaKernelTimer timer;
+
+    at::_ops::cudnn_batch_norm_backward::call(
+        input,
+        grad_out,
+        weight,
+        ato_run_mean,
+        ato_run_var,
+        save_mean,
+        save_var,
+        kEps,
+        std::get<3>(fwd_result));
+
+    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
+    cudaDeviceSynchronize();
+    clearL2Cache();
+    cudaDeviceSynchronize();
+  }
+
+  benchmark_state.SetBytesProcessed(
+      int64_t(benchmark_state.iterations()) *
+      (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
+       (run_mean.numel() + run_var.numel() + save_mean.numel() +
+        save_var.numel() + weight.numel()) *
+           int64_t(dataTypeSize(DataType::Float))));
+}
+
+//------------------------------------------------------------------------------
+
+static void Baseline_BatchNorm_BWD_cuDNN_fp32(
+    benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Float);
+}
+
+static void Baseline_BatchNorm_BWD_cuDNN_fp16(
+    benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
+}
+
+// Simple aliases just for names in the printed output
+static void Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
+}
+
+static void Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
+  Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
+}
+//------------------------------------------------------------------------------
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_BatchNorm_BWD_fp32,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Float);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{64, 512}, {32, 128}, {2, 64}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_BatchNorm_BWD_fp16,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Half);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{64, 512}, {32, 128}, {2, 128}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+//------------------------------------------------------------------------------
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
+    // ->RangeMultiplier(2)
+    // cuDNN didn't make it to 1024
+    ->Ranges({{64, 512}, {32, 128}, {2, 64}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{64, 512}, {32, 128}, {2, 128}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
+    // ->RangeMultiplier(2)
+    ->Ranges({{2, 64}, {2, 32}, {2, 256}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
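The SetBytesProcessed() expressions above encode a simple traffic model for the backward benchmarks: three activation-sized tensors (input, grad_output, grad_input) in the benchmark dtype plus five per-channel fp32 vectors (weight, running mean/var, saved mean/var). The standalone sketch below is not part of the patch; the file, the {N, C, HW} reading of the benchmark arguments, and the chosen argument triple are assumptions taken from the Ranges()/Args() calls in these files. It reproduces the per-iteration byte count for one fp16 backward case so the bytes/s column printed by Google Benchmark can be read as effective bandwidth.

#include <cstdint>
#include <cstdio>

int main() {
  // Example triple from the ResNet Args() lists below: N=256, C=64, H=W=112.
  const int64_t N = 256, C = 64, HW = 112;
  const int64_t elems = N * C * HW * HW;

  const int64_t half_bytes = 2;  // DataType::Half
  const int64_t float_bytes = 4; // DataType::Float

  // input + grad_output read, grad_input written: three tensors in the
  // benchmark dtype, matching the (3 * input.numel()) term above.
  const int64_t tensor_traffic = 3 * elems * half_bytes;
  // weight, running_mean/var, save_mean/var: five per-channel fp32 vectors.
  const int64_t stats_traffic = 5 * C * float_bytes;

  const int64_t bytes_per_iteration = tensor_traffic + stats_traffic;
  std::printf("bytes per iteration: %lld\n",
              static_cast<long long>(bytes_per_iteration));
  return 0;
}

Dividing that figure by the manually reported iteration time gives the effective bandwidth these *_BWD benchmarks are designed to expose.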
+//------------------------------------------------------------------------------
+// RESNET and RESNEXT benchmarks
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_ResNet_BatchNorm_BWD_fp16,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Half);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_BWD_fp16)
+    ->Args({256, 64, 112})
+    ->Args({256, 64, 56})
+    ->Args({256, 256, 56})
+    ->Args({256, 128, 56})
+    ->Args({256, 128, 28})
+    ->Args({256, 512, 28})
+    ->Args({256, 256, 28})
+    ->Args({256, 256, 14})
+    ->Args({256, 1024, 14})
+    ->Args({256, 512, 14})
+    ->Args({256, 512, 7})
+    ->Args({256, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+NVFUSER_BENCHMARK_DEFINE(
+    NvFuserScheduler_ResNext_BatchNorm_BWD_fp16,
+    setupBatchNorm_BWD,
+    NvFuserScheduler_BatchNorm_BWD,
+    DataType::Half);
+
+NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_BWD_fp16)
+    ->Args({128, 64, 112})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 56})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 28})
+    ->Args({128, 512, 28})
+    ->Args({128, 512, 14})
+    ->Args({128, 1024, 14})
+    ->Args({128, 1024, 7})
+    ->Args({128, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+//------------------------------------------------------------------------------
+
+BENCHMARK(Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16)
+    ->Args({256, 64, 112})
+    ->Args({256, 64, 56})
+    ->Args({256, 256, 56})
+    ->Args({256, 128, 56})
+    ->Args({256, 128, 28})
+    ->Args({256, 512, 28})
+    ->Args({256, 256, 28})
+    ->Args({256, 256, 14})
+    ->Args({256, 1024, 14})
+    ->Args({256, 512, 14})
+    ->Args({256, 512, 7})
+    ->Args({256, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+BENCHMARK(Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16)
+    ->Args({128, 64, 112})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 56})
+    ->Args({128, 128, 56})
+    ->Args({128, 256, 28})
+    ->Args({128, 512, 28})
+    ->Args({128, 512, 14})
+    ->Args({128, 1024, 14})
+    ->Args({128, 1024, 7})
+    ->Args({128, 2048, 7})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
new file mode 100644
index 000000000000..14fde631aec0
--- /dev/null
+++ b/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
@@ -0,0 +1,367 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+using namespace torch::jit::fuser::cuda;
+
+//------------------------------------------------------------------------------
+
+static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {
+  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
+
+  FusionGuard fg(fusion);
+
+  const bool kTraining = true;
+  const float kMomentum = 0.1;
+  const float kEps = 1e-5;
+
+  // setup fusion
+  auto input = makeContigTensor(4, dtype);
+  auto weight = makeContigTensor(1, dtype);
+  auto bias = makeContigTensor(1, dtype);
+  auto running_mean = makeContigTensor(1, DataType::Float);
+  auto running_var = makeContigTensor(1, DataType::Float);
+
+  fusion->addInput(input);
+  fusion->addInput(weight);
+  fusion->addInput(bias);
+  fusion->addInput(running_mean);
+  fusion->addInput(running_var);
+
+  if (dtype == DataType::Half) {
+    input = castOp(DataType::Float, input);
+    weight = castOp(DataType::Float, weight);
+    bias = castOp(DataType::Float, bias);
+  }
+
+  auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
+  auto eps_ptr = IrBuilder::create<Double>(kEps);
+
+  auto result = batch_norm(
input, + weight, + bias, + running_mean, + running_var, + kTraining, + momentum_ptr, + eps_ptr, + true); + + auto output = result.output; + + if (dtype == DataType::Half) { + output = castOp(DataType::Half, output); + } + + fusion->addOutput(output); +} + +static void NvFuserScheduler_BatchNorm_nhwc( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(2), + benchmark_state.range(1)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + at::Tensor at_weight = at::ones({input_shape[3]}, options); + at::Tensor at_bias = at::zeros({input_shape[3]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[3]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[3]}, fp32_options); + std::vector aten_inputs( + {at_x, at_weight, at_bias, at_run_mean, at_run_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options) + .contiguous(c10::MemoryFormat::ChannelsLast); + at::Tensor at_weight = at::ones({input_shape[1]}, options); + at::Tensor at_bias = at::zeros({input_shape[1]}, options); + at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = c10::optional(at_weight); + auto ato_bias = c10::optional(at_bias); + auto ato_run_mean = c10::optional(at_run_mean); + auto ato_run_var = c10::optional(at_run_var); + + auto output = at::batch_norm( + at_x, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + + clearL2Cache(); + cudaDeviceSynchronize(); + for (auto _ : benchmark_state) { + CudaKernelTimer timer; + at::_ops::_batch_norm_impl_index::call( + at_x, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + cudaDeviceSynchronize(); + clearL2Cache(); + cudaDeviceSynchronize(); + } + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((2 * (at_x.numel() + 
at_weight.numel() + at_bias.numel())) * + int64_t(dataTypeSize(dtype)) + + (2 * (at_run_mean.numel() + at_run_var.numel()) * + int64_t(dataTypeSize(DataType::Float))))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc_cuDNN_fp32( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Float); +} + +static void Baseline_BatchNorm_nhwc_cuDNN_fp16( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half); +} + +// Simple aliases just for names in the printed output +static void Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half); +} + +static void Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_fp32, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32) + // ->RangeMultiplier(2) + // cuDNN didn't make it to 1024 + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ +// RESNET and REXNEXT benchmarks + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + 
->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// Permutation of TIMM sizes +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16, + setupBatchNorm_nhwc, + NvFuserScheduler_BatchNorm_nhwc, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16) + ->ArgsProduct( + {{8, 16, 32, 64, 128, 256}, + {24, 40, 48, 56, 72, 152, 184, 200, 368}, + {7, 14, 28, 56, 112}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16) + ->ArgsProduct( + {{128, 256, 512, 1024, 2048}, + {24, 40, 48, 56, 72, 152}, + {7, 14, 28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp new file mode 100644 index 000000000000..0660b75e3942 --- /dev/null +++ b/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp @@ -0,0 +1,387 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + FusionGuard fg(fusion); + + const bool kTraining = true; + const float kMomentum = 0.1; + const float kEps = 1e-5; + + // setup fusion + auto input = makeContigTensor(4, dtype); + auto grad_output = makeContigTensor(4, dtype); + auto weight = makeContigTensor(1, DataType::Float); + auto running_mean = makeContigTensor(1, DataType::Float); + auto running_var = makeContigTensor(1, DataType::Float); + auto save_mean = makeContigTensor(1, DataType::Float); + auto save_var = makeContigTensor(1, DataType::Float); + + fusion->addInput(input); + fusion->addInput(grad_output); + fusion->addInput(weight); + fusion->addInput(running_mean); + fusion->addInput(running_var); + fusion->addInput(save_mean); + fusion->addInput(save_var); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, 
input); + grad_output = castOp(DataType::Float, grad_output); + } + + auto eps_ptr = IrBuilder::create(kEps); + + auto result = batch_norm_backward( + input, + grad_output, + weight, + running_mean, + running_var, + save_mean, + save_var, + kTraining, + eps_ptr, + std::vector(3, true), + true); + + auto grad_input = result.grad_input; + auto grad_weight = result.grad_weight; + auto grad_bias = result.grad_bias; + + if (dtype == DataType::Half) { + grad_input = castOp(DataType::Half, grad_input); + grad_weight = castOp(DataType::Half, grad_weight); + grad_bias = castOp(DataType::Half, grad_bias); + } + + fusion->addOutput(grad_input); + fusion->addOutput(grad_weight); + fusion->addOutput(grad_bias); +} + +static void NvFuserScheduler_BatchNorm_nhwc_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const bool kTraining = true; + const float kEps = 1e-5; + + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(2), + benchmark_state.range(1)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor weight = at::ones({input_shape[3]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[3]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[3]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[3]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[3]}, fp32_options); + + std::vector aten_inputs( + {input, grad_out, weight, run_mean, run_var, save_mean, save_var}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + + (run_mean.numel() + run_var.numel() + save_mean.numel() + + save_var.numel() + weight.numel()) * + int64_t(dataTypeSize(DataType::Float)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc_BWD( + benchmark::State& benchmark_state, + DataType dtype) { + TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); + + const float kMomentum = 0.1; + const float kEps = 1e-5; + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options) + .contiguous(c10::MemoryFormat::ChannelsLast); + at::Tensor grad_out = at::randn(input_shape, options) + .contiguous(c10::MemoryFormat::ChannelsLast); + at::Tensor weight = at::ones({input_shape[1]}, fp32_options); + at::Tensor bias = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor run_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options); + at::Tensor save_var = at::ones({input_shape[1]}, fp32_options); + + auto ato_weight = 
c10::optional(weight); + auto ato_bias = c10::optional(bias); + auto ato_run_mean = c10::optional(run_mean); + auto ato_run_var = c10::optional(run_var); + auto ato_save_mean = c10::optional(save_mean); + auto ato_save_var = c10::optional(save_var); + + auto fwd_result = at::_ops::_batch_norm_impl_index::call( + input, + ato_weight, + ato_bias, + ato_run_mean, + ato_run_var, + true, + kMomentum, + kEps, + true); + cudaDeviceSynchronize(); + + // Sync everything up before we start + clearL2Cache(); + cudaDeviceSynchronize(); + for (auto _ : benchmark_state) { + CudaKernelTimer timer; + + at::_ops::cudnn_batch_norm_backward::call( + input, + grad_out, + weight, + ato_run_mean, + ato_run_var, + save_mean, + save_var, + kEps, + std::get<3>(fwd_result)); + + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + cudaDeviceSynchronize(); + clearL2Cache(); + cudaDeviceSynchronize(); + } + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (((3 * input.numel()) * int64_t(dataTypeSize(dtype))) + + (run_mean.numel() + run_var.numel() + save_mean.numel() + + save_var.numel() + weight.numel()) * + int64_t(dataTypeSize(DataType::Float)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Float); +} + +static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16( + benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half); +} + +// Simple aliases just for names in the printed output +static void Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half); +} + +static void Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) { + Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_BWD_fp32, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + 
->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{64, 512}, {32, 128}, {2, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 64}, {2, 32}, {2, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ +// RESNET and REXNEXT benchmarks + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// Permutation of TIMM sizes +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16, + setupBatchNorm_nhwc_BWD, + NvFuserScheduler_BatchNorm_nhwc_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16) + ->ArgsProduct( + {{8, 16, 32, 64, 128, 256}, + {24, 40, 48, 56, 72, 152, 184, 200, 368}, + {7, 14, 28, 56, 112}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16) + ->ArgsProduct( + {{128, 256, 512, 1024, 2048}, + {24, 40, 48, 56, 72, 152}, + {7, 14, 28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16) + ->Args({256, 64, 112}) + ->Args({256, 64, 56}) + ->Args({256, 256, 56}) + ->Args({256, 128, 56}) + ->Args({256, 128, 28}) + ->Args({256, 512, 28}) + ->Args({256, 256, 28}) + ->Args({256, 256, 14}) + ->Args({256, 1024, 14}) + ->Args({256, 512, 14}) + ->Args({256, 512, 7}) + ->Args({256, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16) + ->Args({128, 64, 112}) + ->Args({128, 128, 56}) + ->Args({128, 256, 56}) + ->Args({128, 128, 56}) + ->Args({128, 256, 28}) + ->Args({128, 512, 28}) + ->Args({128, 512, 14}) + ->Args({128, 1024, 14}) + ->Args({128, 1024, 7}) + ->Args({128, 2048, 7}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/bert.cpp b/benchmarks/cpp/nvfuser/bert.cpp index f8a389331ee3..f105cfe4a4e3 100644 --- a/benchmarks/cpp/nvfuser/bert.cpp +++ b/benchmarks/cpp/nvfuser/bert.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -14,7 +15,7 @@ #include -#include "utils.h" +#include using namespace 
torch::jit::fuser::cuda; @@ -36,7 +37,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { fusion->addInput(tv1); // TODO: should be input - auto d16 = new Double(1.0); + auto d16 = IrBuilder::create(1.0); if (is_fp16) { tv0 = castOp(DataType::Float, tv0); @@ -47,7 +48,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { auto tv3 = add(tv2, tv0); auto tv10 = softmax(tv3, 3); - auto dropout_tvs = dropout(tv10, new Double(0.9)); + auto dropout_tvs = dropout(tv10, IrBuilder::create(0.9)); auto tv12 = dropout_tvs.mask; auto tv14 = dropout_tvs.output; @@ -83,9 +84,9 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) { } // TODO: should be inputs - auto d32 = new Double(1.0); + auto d32 = IrBuilder::create(1.0); // fusion->addInput(d32); - auto d33 = new Double(2.0); + auto d33 = IrBuilder::create(2.0); // fusion->addInput(d33); auto tv4 = mul(tv2, tv3); @@ -252,14 +253,15 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) { auto tv5 = broadcast(tv4, {true, true, false}); auto tv6 = add(tv3, tv5); - auto dropout_outs = dropout(tv6, new Double(0.9)); + auto dropout_outs = dropout(tv6, IrBuilder::create(0.9)); auto tv8 = dropout_outs.output; auto tv10 = dropout_outs.mask; auto tv11 = add(tv10, tv2); - auto layer_norm_outs = layer_norm(tv11, 1, tv0, tv1, new Double(1e-5)); + auto layer_norm_outs = + layer_norm(tv11, 1, tv0, tv1, IrBuilder::create(1e-5)); auto tv14 = layer_norm_outs.output; auto tv21 = layer_norm_outs.mean; auto tv26 = layer_norm_outs.invstd; @@ -481,7 +483,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) { tv1 = castOp(DataType::Float, tv1); tv8 = castOp(DataType::Float, tv8); } - auto d36 = mul(new Double(1.0), tv1->axis(2)->extent()); + auto d36 = mul(IrBuilder::create(1.0), tv1->axis(2)->extent()); auto d47 = unaryOp(UnaryOpType::Reciprocal, d36); auto tv9 = broadcast(tv5, {true, true, false}); @@ -583,7 +585,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) { } // Uncertain this is the right value, but going for it anyways - auto d34 = div(new Double(1.0), tv0->axis(2)->extent()); + auto d34 = div(IrBuilder::create(1.0), tv0->axis(2)->extent()); auto tv25 = mul(tv21, tv0); auto tv26 = mul(tv25, d34); diff --git a/benchmarks/cpp/nvfuser/broadcast.cpp b/benchmarks/cpp/nvfuser/broadcast.cpp index d693ff68bf85..8411444ca96a 100644 --- a/benchmarks/cpp/nvfuser/broadcast.cpp +++ b/benchmarks/cpp/nvfuser/broadcast.cpp @@ -12,7 +12,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -70,9 +70,8 @@ static void NvFuserScheduler_Broadcast( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto params = toString(compile_log.pointwise_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(params + lparams); diff --git a/benchmarks/cpp/nvfuser/gelu_backward.cpp b/benchmarks/cpp/nvfuser/gelu_backward.cpp index 9d53d9c27593..6632ba58a236 100644 --- a/benchmarks/cpp/nvfuser/gelu_backward.cpp +++ b/benchmarks/cpp/nvfuser/gelu_backward.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -11,7 +12,7 @@ #include -#include 
"utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -41,23 +42,23 @@ static void setupFusion(Fusion* fusion) { auto t5 = castOp(DataType::Float, t4); auto t6 = broadcast(t3, {true, true, false}); auto t7 = add(t6, t5); - auto t8 = mul(t7, new Double(k_079)); - auto t9 = mul(t7, new Double(k_004)); + auto t8 = mul(t7, IrBuilder::create(k_079)); + auto t9 = mul(t7, IrBuilder::create(k_004)); auto t10 = mul(t9, t7); - auto t11 = add(t10, new Int(1)); + auto t11 = add(t10, IrBuilder::create(1)); auto t12 = mul(t8, t11); auto t13 = unaryOp(UnaryOpType::Tanh, t12); - auto t14 = mul(t7, new Double(0.5)); + auto t14 = mul(t7, IrBuilder::create(0.5)); auto t15 = mul(t13, t13); auto t16 = unaryOp(UnaryOpType::Neg, t15); - auto t17 = add(t16, new Int(1)); - auto t18 = mul(t7, new Double(k_010)); + auto t17 = add(t16, IrBuilder::create(1)); + auto t18 = mul(t7, IrBuilder::create(k_010)); auto t19 = mul(t18, t7); - auto t20 = add(t19, new Double(k_079)); + auto t20 = add(t19, IrBuilder::create(k_079)); auto t21 = mul(t17, t20); auto t22 = mul(t14, t21); - auto t23 = add(t13, new Int(1)); - auto t24 = mul(t23, new Double(0.5)); + auto t23 = add(t13, IrBuilder::create(1)); + auto t24 = mul(t23, IrBuilder::create(0.5)); auto t25 = add(t22, t24); auto t26 = mul(t25, t1); diff --git a/benchmarks/cpp/nvfuser/heuristic_cache.cpp b/benchmarks/cpp/nvfuser/heuristic_cache.cpp index 22b8ec4ce972..64b1ecfb756d 100644 --- a/benchmarks/cpp/nvfuser/heuristic_cache.cpp +++ b/benchmarks/cpp/nvfuser/heuristic_cache.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,23 +11,10 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, @@ -129,7 +117,7 @@ static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/heuristic_lookup.cpp b/benchmarks/cpp/nvfuser/heuristic_lookup.cpp index 22b8ec4ce972..64b1ecfb756d 100644 --- a/benchmarks/cpp/nvfuser/heuristic_lookup.cpp +++ b/benchmarks/cpp/nvfuser/heuristic_lookup.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,23 +11,10 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, @@ -129,7 +117,7 @@ 
static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/instance_norm.cpp b/benchmarks/cpp/nvfuser/instance_norm.cpp index 395ac6c8c9cd..a7139c113a43 100644 --- a/benchmarks/cpp/nvfuser/instance_norm.cpp +++ b/benchmarks/cpp/nvfuser/instance_norm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -9,16 +10,22 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -static void setupInstanceNorm(Fusion* fusion, DataType dtype) { +static void setupInstanceNorm( + Fusion* fusion, + DataType dtype, + bool channels_last_3d = false) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); FusionGuard fg(fusion); auto input = makeContigTensor(4, dtype); + if (channels_last_3d) { + input = makeContigTensor(5, dtype); + } auto weight = makeContigTensor(1, dtype); auto bias = makeContigTensor(1, dtype); auto running_mean = makeContigTensor(1, DataType::Float); @@ -39,8 +46,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) { const bool kTraining = true; const float kMomentum = 0.1; const float kEps = 1e-5; - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); + auto momentum_ptr = IrBuilder::create(kMomentum); + auto eps_ptr = IrBuilder::create(kEps); auto norm = instance_norm( input, @@ -50,7 +57,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) { running_var, kTraining, momentum_ptr, - eps_ptr); + eps_ptr, + channels_last_3d); auto output = unaryOp(UnaryOpType::Relu, norm.output); @@ -66,7 +74,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_InstanceNorm( benchmark::State& benchmark_state, FusionExecutorCache* fusion_executor_cache, - DataType dtype) { + DataType dtype, + bool channels_last_3d = false) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); std::vector input_shape{ @@ -75,17 +84,25 @@ static void NvFuserScheduler_InstanceNorm( benchmark_state.range(1), benchmark_state.range(1)}; + std::vector input_shape_3d{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(1), + benchmark_state.range(1), + benchmark_state.range(2)}; + // inputs at::manual_seed(0); auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto fp32_options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_var = at::ones({input_shape[1]}, fp32_options); + at::Tensor at_x = + at::randn(channels_last_3d ? 
input_shape_3d : input_shape, options); + at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options); + at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options); + at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options); + at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options); std::vector aten_inputs = { at_x, at_weight, at_bias, at_mean, at_var}; @@ -93,21 +110,20 @@ static void NvFuserScheduler_InstanceNorm( runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); - const size_t kSize = - input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; - const size_t kChannels = input_shape[1]; + const size_t kChannels = benchmark_state.range(2); // Read: x, weight, bias, running_mean, running_var // Write: y, running_mean, running_var benchmark_state.SetBytesProcessed( benchmark_state.iterations() * - ((kChannels * 2 + kSize * 2) * dataTypeSize(dtype) + + ((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) + (kChannels * 2 * 2) * dataTypeSize(DataType::Float))); } static void Baseline_InstanceNorm( benchmark::State& benchmark_state, - DataType dtype) { + DataType dtype, + bool channels_last_3d = false) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); std::vector input_shape{ @@ -115,6 +131,14 @@ static void Baseline_InstanceNorm( benchmark_state.range(2), benchmark_state.range(1), benchmark_state.range(1)}; + std::vector input_shape_3d{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(1), + benchmark_state.range(1), + benchmark_state.range(1), + }; + const float kMomentum = 0.1; const float kEps = 1e-5; const auto aten_dtype = data_type_to_aten(dtype); @@ -125,10 +149,15 @@ static void Baseline_InstanceNorm( at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[1]}, options); - at::Tensor at_bias = at::zeros({input_shape[1]}, options); - at::Tensor at_mean = at::zeros({input_shape[1]}, fp32_options); - at::Tensor at_var = at::ones({input_shape[1]}, fp32_options); + if (channels_last_3d) { + at_x = at::randn( + input_shape_3d, + options.memory_format(c10::MemoryFormat::ChannelsLast3d)); + } + at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options); + at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options); + at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options); + at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options); auto ato_weight = c10::optional(at_weight); auto ato_bias = c10::optional(at_bias); @@ -158,15 +187,13 @@ static void Baseline_InstanceNorm( cudaDeviceSynchronize(); } - const size_t kSize = - input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; - const size_t kChannels = input_shape[1]; + const size_t kChannels = benchmark_state.range(2); // Read: x, weight, bias, running_mean, running_var // Write: y, running_mean, running_var benchmark_state.SetBytesProcessed( benchmark_state.iterations() * - ((kChannels * 2 + kSize * 2) * dataTypeSize(dtype) + + ((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) + (kChannels * 2 * 2) * dataTypeSize(DataType::Float))); } @@ -180,6 +207,11 @@ static void Baseline_InstanceNorm_fp16(benchmark::State& benchmark_state) { Baseline_InstanceNorm(benchmark_state, DataType::Half); } +static void Baseline_InstanceNorm_fp32_channels_last_3d( + benchmark::State& benchmark_state) { + Baseline_InstanceNorm(benchmark_state, 
DataType::Float, true); +} + //------------------------------------------------------------------------------ NVFUSER_BENCHMARK_DEFINE( @@ -205,6 +237,44 @@ NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm_fp16) ->Ranges({{8, 8}, {640, 640}, {64, 256}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_InstanceNorm3d_channels_last_fp32, + setupInstanceNorm, + NvFuserScheduler_InstanceNorm, + DataType::Float, + true); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {128, 128}, {32, 32}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {64, 64}, {64, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {32, 32}, {128, 128}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {16, 16}, {256, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32) + ->RangeMultiplier(2) + ->Ranges({{1, 8}, {4, 8}, {320, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + //------------------------------------------------------------------------------ BENCHMARK(Baseline_InstanceNorm_fp32) @@ -219,4 +289,28 @@ BENCHMARK(Baseline_InstanceNorm_fp16) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {128, 128}, {32, 32}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {64, 64}, {64, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {16, 16}, {256, 256}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d) + ->RangeMultiplier(2) + ->Ranges({{2, 8}, {4, 8}, {320, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + //------------------------------------------------------------------------------ diff --git a/benchmarks/cpp/nvfuser/layer_norm.cpp b/benchmarks/cpp/nvfuser/layer_norm.cpp index c4f79b2b668b..d793a45caa3c 100644 --- a/benchmarks/cpp/nvfuser/layer_norm.cpp +++ b/benchmarks/cpp/nvfuser/layer_norm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -24,7 +25,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { const int kReductionAxis = 1; const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); // setup fusion auto input = makeContigTensor(2, dtype); @@ -45,8 +46,8 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { auto output = layer_norm_results.output; - if (dtype == DataType::Half) { - output = castOp(DataType::Half, output); + if (dtype != DataType::Float) { + output = castOp(dtype, output); } fusion->addOutput(output); @@ -89,9 +90,9 @@ static void Baseline_LayerNorm( std::vector input_shape{ benchmark_state.range(0), 
benchmark_state.range(1)}; - const int kReductionAxis = 1; + const size_t kReductionAxis = 1; std::vector norm_shape; - for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) { + for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) { norm_shape.push_back(input_shape[idx]); } diff --git a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp index 43eafcc42fb1..9e6ac1c207d1 100644 --- a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp +++ b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -22,7 +23,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); const int kReductionAxis = 1; - Double* eps_ptr = new Double(1e-5); + Double* eps_ptr = IrBuilder::create(1e-5); // setup fusion auto grad_out = makeContigTensor(2, dtype); @@ -33,12 +34,12 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { auto mean = TensorViewBuilder() .contiguity({false, false}) .shape({-1, 1}) - .dtype(dtype) + .dtype(DataType::Float) .build(); auto rstd = TensorViewBuilder() .contiguity({false, false}) .shape({-1, 1}) - .dtype(dtype) + .dtype(DataType::Float) .build(); fusion->addInput(grad_out); @@ -53,20 +54,17 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { input = castOp(DataType::Float, input); weight = castOp(DataType::Float, weight); bias = castOp(DataType::Float, bias); - mean = castOp(DataType::Float, mean); - rstd = castOp(DataType::Float, rstd); } auto layer_norm_results = layer_norm_backward( grad_out, input, {1}, mean, rstd, weight, bias, {true, true, true}); - if (dtype == DataType::Half) { + if (dtype != DataType::Float) { layer_norm_results.grad_input = - castOp(DataType::Half, layer_norm_results.grad_input); - layer_norm_results.grad_bias = - castOp(DataType::Half, layer_norm_results.grad_bias); + castOp(dtype, layer_norm_results.grad_input); + layer_norm_results.grad_bias = castOp(dtype, layer_norm_results.grad_bias); layer_norm_results.grad_weight = - castOp(DataType::Half, layer_norm_results.grad_weight); + castOp(dtype, layer_norm_results.grad_weight); } fusion->addOutput(layer_norm_results.grad_input); @@ -85,14 +83,16 @@ static void NvFuserScheduler_LayerNorm_BWD( // inputs at::manual_seed(0); - auto options = + auto maybe_fp16_options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor input = at::randn(input_shape, options); - at::Tensor weight = at::randn({input_shape[1]}, options); - at::Tensor bias = at::randn({input_shape[1]}, options); - at::Tensor mean = at::randn({input_shape[0], 1}, options); - at::Tensor rstd = at::randn({input_shape[0], 1}, options); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options); + at::Tensor input = at::randn(input_shape, maybe_fp16_options); + at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options); + at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options); std::vector aten_inputs( {grad_out, input, weight, bias, mean, rstd}); @@ -115,22 +115,24 @@ static void 
Baseline_LayerNorm_BWD( std::vector input_shape{ benchmark_state.range(0), benchmark_state.range(1)}; - const int kReductionAxis = 1; + const size_t kReductionAxis = 1; std::vector norm_shape; - for (int idx = kReductionAxis; idx < input_shape.size(); ++idx) { + for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) { norm_shape.push_back(input_shape[idx]); } // inputs at::manual_seed(0); - auto options = + auto maybe_fp16_options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - at::Tensor grad_out = at::randn(input_shape, options); - at::Tensor input = at::randn(input_shape, options); - at::Tensor weight = at::randn({input_shape[1]}, options); - at::Tensor bias = at::randn({input_shape[1]}, options); - at::Tensor mean = at::randn({input_shape[0], 1}, options); - at::Tensor rstd = at::randn({input_shape[0], 1}, options); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options); + at::Tensor input = at::randn(input_shape, maybe_fp16_options); + at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options); + at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options); + at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options); std::array output_mask = {true, true, true}; clearL2Cache(); diff --git a/benchmarks/cpp/nvfuser/lstm_cell.cpp b/benchmarks/cpp/nvfuser/lstm_cell.cpp index 65f869fac4ad..20ec7c8f4700 100644 --- a/benchmarks/cpp/nvfuser/lstm_cell.cpp +++ b/benchmarks/cpp/nvfuser/lstm_cell.cpp @@ -9,7 +9,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; diff --git a/benchmarks/cpp/nvfuser/reduction.cpp b/benchmarks/cpp/nvfuser/reduction.cpp index c25097963dbc..3fd1bcb59dfc 100644 --- a/benchmarks/cpp/nvfuser/reduction.cpp +++ b/benchmarks/cpp/nvfuser/reduction.cpp @@ -12,7 +12,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -66,9 +66,8 @@ static void NvFuserScheduler_Reduction( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.reduction_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto rparams = toString(compile_log.reduction_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(rparams + lparams); @@ -191,6 +190,18 @@ NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32) + // ->RangeMultiplier(2) + ->Ranges({{1024, 1024 * 512}, {2, 4 * 1024}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32) + // ->RangeMultiplier(2) + ->Ranges({{2, 4 * 1024}, {1024, 1024 * 512}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) // ->RangeMultiplier(2) ->Ranges({{1, 1024 * 1024}, {160, 320}}) @@ -215,6 +226,18 @@ NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) + // ->RangeMultiplier(2) + ->Ranges({{1024, 1024 * 1024}, {2, 4 * 1024}}) + 
->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16) + // ->RangeMultiplier(2) + ->Ranges({{2, 4 * 1024}, {1024, 1024 * 1024}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32) // ->RangeMultiplier(2) ->Ranges({{1, 1024 * 1024}, {160, 320}}) diff --git a/benchmarks/cpp/nvfuser/rms_norm.cpp b/benchmarks/cpp/nvfuser/rms_norm.cpp new file mode 100644 index 000000000000..81fdf46cf818 --- /dev/null +++ b/benchmarks/cpp/nvfuser/rms_norm.cpp @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupRMSNorm(Fusion* fusion, DataType dtype) { + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + FusionGuard fg(fusion); + + const int kReductionAxis = 2; + const float kEps = 1e-6; + + Double* eps_ptr = IrBuilder::create(kEps); + + // setup fusion + auto input = makeContigTensor(3, dtype); + auto weight = makeContigTensor(1, dtype); + + fusion->addInput(input); + fusion->addInput(weight); + + if (dtype == DataType::Half) { + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + } + + auto rms_norm_results = rms_norm(input, 1, weight, eps_ptr); + + auto output = rms_norm_results.output; + + if (dtype != DataType::Float) { + output = castOp(dtype, output); + } + + fusion->addOutput(output); +} + +static void NvFuserScheduler_RMSNorm( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + std::vector input_shape{8, benchmark_state.range(0), 1024}; + const float kEps = 1e-6; + + // inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + at::Tensor input = at::randn(input_shape, options); + at::Tensor weight = at::randn({input_shape[2]}, options); + + std::vector aten_inputs({input, weight}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (2 * input.numel() + weight.numel()) * int64_t(dataTypeSize(dtype))); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_fp32, + setupRMSNorm, + NvFuserScheduler_RMSNorm, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{18, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{22, 44}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_fp16, + setupRMSNorm, + NvFuserScheduler_RMSNorm, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + 
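For reference, the op exercised by these RMSNorm benchmarks normalizes by the root mean square of the innermost dimension instead of by mean and variance; assuming the conventional formulation:

  // rms = sqrt(mean(x * x) + eps)        (reduction over the normalized axis)
  // y   = (x / rms) * weight             i.e. layer norm without mean subtraction
  //                                      and without a bias term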
->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + ->RangeMultiplier(2) + ->Ranges({{18, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + ->RangeMultiplier(2) + ->Ranges({{22, 44}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// TODO: Automatically disable/enable if bf16 is supported +// NVFUSER_BENCHMARK_DEFINE( +// NvFuserScheduler_RMSNorm_bf16, +// setupRMSNorm, +// NvFuserScheduler_RMSNorm, +// DataType::BFloat16); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{16, 64}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{18, 56}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{22, 44}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{24, 48}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/rms_norm_backward.cpp b/benchmarks/cpp/nvfuser/rms_norm_backward.cpp new file mode 100644 index 000000000000..b4c6ac413c75 --- /dev/null +++ b/benchmarks/cpp/nvfuser/rms_norm_backward.cpp @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +//------------------------------------------------------------------------------ + +static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) { + FusionGuard fg(fusion); + + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + const int kReductionAxis = 2; + Double* eps_ptr = IrBuilder::create(1e-6); + + // setup fusion + auto grad_out = makeContigTensor(3, dtype); + auto input = makeContigTensor(3, dtype); + auto weight = makeContigTensor(1, dtype); + auto rstd = TensorViewBuilder() + .contiguity({false, false, false}) + .shape({-1, -1, 1}) + .dtype(dtype) + .build(); + + fusion->addInput(grad_out); + fusion->addInput(input); + fusion->addInput(weight); + fusion->addInput(rstd); + + if (dtype == DataType::Half) { + grad_out = castOp(DataType::Float, grad_out); + input = castOp(DataType::Float, input); + weight = castOp(DataType::Float, weight); + rstd = castOp(DataType::Float, rstd); + } + + auto rms_norm_results = + rms_norm_backward(grad_out, input, {1}, rstd, weight, {true, true, true}); + + if (dtype != DataType::Float) { + rms_norm_results.grad_input = castOp(dtype, rms_norm_results.grad_input); + rms_norm_results.grad_weight = castOp(dtype, rms_norm_results.grad_weight); + } + + fusion->addOutput(rms_norm_results.grad_input); + fusion->addOutput(rms_norm_results.grad_weight); +} + +static void NvFuserScheduler_RMSNorm_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype) { + TORCH_INTERNAL_ASSERT( + dtype == DataType::Float || dtype == DataType::Half || + dtype == DataType::BFloat16); + + std::vector input_shape{8, benchmark_state.range(0), 1024}; + + // 
inputs + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + at::Tensor grad_out = at::randn(input_shape, options); + at::Tensor input = at::randn(input_shape, options); + at::Tensor weight = at::randn({input_shape[2]}, options); + at::Tensor rstd = at::randn({input_shape[0], input_shape[1], 1}, options); + + std::vector aten_inputs({grad_out, input, weight, rstd}); + + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (3 * input.numel() + weight.numel() + rstd.numel()) * + int64_t(dataTypeSize(dtype))); +} + +//------------------------------------------------------------------------------ + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_BWD_fp32, + setupRMSNorm_BWD, + NvFuserScheduler_RMSNorm_BWD, + DataType::Float); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32) + ->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32) + ->RangeMultiplier(2) + ->Ranges({{28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_RMSNorm_BWD_fp16, + setupRMSNorm_BWD, + NvFuserScheduler_RMSNorm_BWD, + DataType::Half); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16) + ->RangeMultiplier(2) + ->Ranges({{16, 64}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16) + ->RangeMultiplier(2) + ->Ranges({{28, 56}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16) + ->RangeMultiplier(2) + ->Ranges({{24, 48}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// TODO: Automatically disable/enable if bf16 is supported +// NVFUSER_BENCHMARK_DEFINE( +// NvFuserScheduler_RMSNorm_BWD_bf16, +// setupRMSNorm_BWD, +// NvFuserScheduler_RMSNorm_BWD, +// DataType::BFloat16); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{16, 64}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{28, 56}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); + +// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16) +// ->RangeMultiplier(2) +// ->Ranges({{24, 48}}) +// ->Unit(benchmark::kMicrosecond) +// ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/scale_bias_relu.cpp b/benchmarks/cpp/nvfuser/scale_bias_relu.cpp index 47ed9047f159..6bb7fc18aa0b 100644 --- a/benchmarks/cpp/nvfuser/scale_bias_relu.cpp +++ b/benchmarks/cpp/nvfuser/scale_bias_relu.cpp @@ -8,7 +8,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -136,9 +136,8 @@ static void NvFuserScheduler_SBR( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto params = toString(compile_log.pointwise_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = 
toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(params + lparams); benchmark_state.SetLabel(lparams); @@ -240,9 +239,8 @@ static void NvFuserScheduler_SBR_Norm( auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); auto params = toString(compile_log.pointwise_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); benchmark_state.SetLabel(params + lparams); diff --git a/benchmarks/cpp/nvfuser/shape_inference.cpp b/benchmarks/cpp/nvfuser/shape_inference.cpp index 33a9404b0739..2e5e23ed7442 100644 --- a/benchmarks/cpp/nvfuser/shape_inference.cpp +++ b/benchmarks/cpp/nvfuser/shape_inference.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,27 +11,10 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; -namespace { - -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - -} // namespace - static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, std::unique_ptr& fec, @@ -151,7 +135,7 @@ static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/softmax.cpp b/benchmarks/cpp/nvfuser/softmax.cpp index 3964e03671fa..439e426220f8 100644 --- a/benchmarks/cpp/nvfuser/softmax.cpp +++ b/benchmarks/cpp/nvfuser/softmax.cpp @@ -11,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -87,7 +87,7 @@ static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) { std::vector aten_inputs({aten_input}); // Schedule through magic scheduler: - auto runtime_info = SchedulerRuntimeInfo(fusion, aten_inputs, true); + SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true); TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule( ScheduleHeuristic::Persistent, fusion, runtime_info)); auto scheduler = SchedulerEntry::makeEntry( @@ -132,7 +132,7 @@ static void Softmax_WarpReduce(benchmark::State& benchmark_state) { std::vector aten_inputs({aten_input}); // Schedule through magic scheduler: - auto runtime_info = SchedulerRuntimeInfo(fusion, aten_inputs, true); + SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true); TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule( ScheduleHeuristic::Persistent, fusion, runtime_info)); auto scheduler = SchedulerEntry::makeEntry( diff --git a/benchmarks/cpp/nvfuser/softmax_backward.cpp b/benchmarks/cpp/nvfuser/softmax_backward.cpp index 1bf2e623291a..8fb35083c6dc 100644 --- a/benchmarks/cpp/nvfuser/softmax_backward.cpp +++ b/benchmarks/cpp/nvfuser/softmax_backward.cpp @@ -11,7 +11,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; diff 
--git a/benchmarks/cpp/nvfuser/softmax_dropout.cpp b/benchmarks/cpp/nvfuser/softmax_dropout.cpp index b4890eaf8d8a..48950373731c 100644 --- a/benchmarks/cpp/nvfuser/softmax_dropout.cpp +++ b/benchmarks/cpp/nvfuser/softmax_dropout.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -11,7 +12,7 @@ #include -#include "utils.h" +#include using namespace torch::jit::fuser::cuda; @@ -35,7 +36,7 @@ static void setupSoftmaxDropout( auto attention_scores = makeContigTensor(4, dtype); auto attention_mask = makeContigTensor(4, dtype); - Double* divisor = new Double(); + Double* divisor = IrBuilder::create(); fusion->addInput(attention_scores); fusion->addInput(attention_mask); @@ -49,8 +50,8 @@ static void setupSoftmaxDropout( attention_scores = div(attention_scores, divisor); attention_scores = add(attention_scores, attention_mask); auto attention_probs = softmax(attention_scores, kReductionAxis); - auto prob = new Double(kDropoutProbability); - auto scale = new Double(kScale); + auto prob = IrBuilder::create(kDropoutProbability); + auto scale = IrBuilder::create(kScale); auto dropout_results = dropout(attention_probs, prob, scale); auto output = dropout_results.output; diff --git a/benchmarks/cpp/nvfuser/timm.cpp b/benchmarks/cpp/nvfuser/timm.cpp new file mode 100644 index 000000000000..e7e9d22e8c95 --- /dev/null +++ b/benchmarks/cpp/nvfuser/timm.cpp @@ -0,0 +1,741 @@ +#include +#include +#include +#include + +#include + +#include + +#include + +using namespace torch::jit::fuser::cuda; + +static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t2 = makeContigTensor(3, DataType::Float); + auto t3 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + auto t4 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + auto t7 = makeContigTensor(3, DataType::Half); + + fusion->addInput(t2); + fusion->addInput(t3); + fusion->addInput(t4); + fusion->addInput(t7); + + auto t8 = castOp(DataType::Float, t7); + auto t9 = set(t8); + auto t10 = sub(t2, t3); + auto t11 = mul(t10, t4); + auto t25 = mul(t9, t11); + auto t26 = sum(t25, {0, 1}); + auto t36 = set(t26); + auto t27 = sum(t9, {0, 1}); + auto t37 = set(t27); + auto t39 = castOp(DataType::Half, t11); + + fusion->addOutput(t36); + fusion->addOutput(t37); + fusion->addOutput(t39); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t2 = at::randn(input_shape, fp32_options); + auto t3 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t4 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t7 = at::randn(input_shape, fp16_options); + + std::vector aten_inputs({t2, t3, t4, t7}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // full tensor - float + halfx2 - t2, t7, t39 + // Inner most dimension only - floatx2 - t36, t37 + // Outer two dimensions only - floatx2 - t3, t4 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + // t2 + 
t7 t3 + t4 t36 + t37 + t2.numel() * (4 + 2) + t3.numel() * 4 * 2 + input_shape[2] * (4 * 2) + + // T39 + t2.numel() * 2); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7, + setup_vit_base_patch16_224_bcast7, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7, + nullptr); + +// pwise case, broadcasting both sides +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7) + ->Args({64, 197, 768}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t2 = makeContigTensor(3, DataType::Float); + auto t5 = makeContigTensor(1, DataType::Float); + auto t3 = makeContigTensor(3, DataType::Half); + auto t0 = makeContigTensor(1, DataType::Float); + auto t1 = makeContigTensor(1, DataType::Float); + + fusion->addInput(t2); + fusion->addInput(t5); + fusion->addInput(t3); + fusion->addInput(t0); + fusion->addInput(t1); + + std::vector bcast_pattern0({true, true, false}); + std::vector bcast_pattern1({false, false, true}); + + auto t4 = castOp(DataType::Float, t3); + auto t6 = set(t5); + auto t7 = broadcast(t6, bcast_pattern0); + auto t8 = add(t4, t7); + auto t9 = randlike(t8); + auto d34 = + sub(IrBuilder::create(1.0), IrBuilder::create(0.0)); + auto t10 = lt(t9, d34); + auto t11 = castOp(DataType::Float, t10); + auto t12 = mul(t8, t11); + auto b36 = eq(d34, IrBuilder::create(0.0)); + auto d37 = castOp(DataType::Double, b36); + auto d38 = add(d37, d34); + auto d40 = div(IrBuilder::create(1.0), d38); + auto t13 = mul(t12, d40); + auto t14 = set(t13); + auto t15 = add(t2, t14); + auto t16 = set(t15); + auto t36 = sum(t16, {2}); + auto d151 = castOp(DataType::Double, t2->axis(2)->extent()); + auto d152 = mul(IrBuilder::create(1.0), d151); + auto t19 = div(t36, d152); + auto t22 = broadcast(t19, bcast_pattern1); + auto t23 = sub(t16, t22); + auto t37 = mul(t23, t23); + auto t20 = sum(t37, {2}); + auto t24 = broadcast(t20, bcast_pattern1); + auto d95 = castOp(DataType::Double, t2->axis(2)->extent()); + auto d96 = mul(IrBuilder::create(1.0), d95); + auto d105 = reciprocal(d95); + auto t25 = mul(t24, d105); + auto t26 = add(t25, IrBuilder::create(1e-6)); + auto t27 = rsqrt(t26); + auto t28 = mul(t23, t27); + auto t17 = set(t1); + auto t29 = broadcast(t17, bcast_pattern0); + auto t30 = mul(t28, t29); + auto t18 = set(t0); + auto t31 = broadcast(t18, bcast_pattern0); + auto t32 = add(t30, t31); + auto t33 = set(t32); + auto t34 = castOp(DataType::Half, t33); + + fusion->addOutput(t16); // full 3d float + fusion->addOutput(t10); // full 3d bool + fusion->addOutput(t22); // bcast last dim float + fusion->addOutput(t27); // bcast last dim float + fusion->addOutput(t18); // passthrough t0 float + fusion->addOutput(t17); // passthrough t1 float + fusion->addOutput(t34); // full 3d half +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t2 = at::randn(input_shape, fp32_options); + auto t5 = at::randn({input_shape[2]}, fp32_options); + auto t3 = at::randn(input_shape, fp16_options); + auto t0 = at::randn({input_shape[2]}, 
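The bcast5 fusion above is easier to follow as the transformer-style epilogue it encodes; the role assigned to each input below is an inference from the shapes and broadcast patterns, not something stated in the diff:

  // t3 (half, 3D)  incoming activation      t5 (float, 1D)  bias, broadcast over rows
  // t2 (float, 3D) residual branch          t1, t0 (1D)     layer-norm weight and bias
  // t8  = t3 + t5                       bias add
  // t13 = dropout(t8) / keep_prob       inverted dropout (keep prob here is 1 - 0.0)
  // t16 = t2 + t13                      residual add
  // t34 = ((t16 - mean) * rsqrt(var + 1e-6)) * t1 + t0, cast back to half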
fp32_options); + auto t1 = at::randn({input_shape[2]}, fp32_options); + + std::vector aten_inputs({t2, t5, t3, t0, t1}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensor - floatx2, halfx2, bool - t2, t16, t3, t34, t16 + // Inner most dim only - floatx5 - t5, t0, t1, t7, t17 + // Outer two dims only - floatx2 - t22, t27 + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + t2.numel() * (2 * 4 + 2 * 2 + 1) + t5.numel() * 5 * 4 + + input_shape[0] * input_shape[1] * 2 * 4); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW, + setup_vit_base_patch16_224_bcast5, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5, + nullptr); + +// Broadcast on both sides +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW) + ->Args({64, 197, 768}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_bcast_outer2( + Fusion* fusion, + void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Half); + auto t2 = makeContigTensor(1, DataType::Float); + + fusion->addInput(t0); + fusion->addInput(t2); + + auto t1 = castOp(DataType::Float, t0); + auto t3 = set(t2); + auto t4 = broadcast(t3, {true, true, false}); + auto t5 = add(t1, t4); + auto t6 = castOp(DataType::Half, t5); + auto t7 = castOp(DataType::Half, t3); + + fusion->addOutput(t6); + fusion->addOutput(t7); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + auto t2 = at::randn({input_shape[2]}, fp32_options); + + std::vector aten_inputs({t0, t2}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // full tensor - halfx2 - t0, t6 + // inner dimension only - halfx2 - t2, t7 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * (2 + 2) + + input_shape[2] * (2 + 4)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2, + setup_vit_base_patch16_224_bcast_outer2, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2, + nullptr); + +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2) + ->Args({64, 197, 2304}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(4, DataType::Half); + fusion->addInput(t0); + auto d13 = IrBuilder::create(); + fusion->addInput(d13); + + auto t1 = castOp(DataType::Float, t0); + auto t2 = set(t1); + auto t3 = mul(t2, d13); + auto t4 = set(t3); + auto t5 = max(t4, {3}); + auto t6 = broadcast(t5, {false, false, false, true}); + auto t7 = sub(t4, t6); + auto t8 = exp(t7); + auto t9 = sum(t8, {3}); + auto t10 = broadcast(t9, {false, false, false, true}); + auto t11 = reciprocal(t10); + auto t12 = mul(t8, t11); + auto t13 = randlike(t12); + auto d79 = sub(IrBuilder::create(1), IrBuilder::create(0)); + auto t14 = lt(t13, d79); + auto t15 = castOp(DataType::Float, t14); + auto 
b81 = eq(d79, IrBuilder::create(0)); + auto d82 = castOp(DataType::Double, b81); + auto d83 = add(d82, d79); + auto d85 = div(IrBuilder::create(1), d83); + auto t16 = mul(t12, t15); + auto t17 = mul(t16, d85); + auto t18 = set(t17); + auto t19 = castOp(DataType::Half, t18); + + fusion->addOutput(t19); + fusion->addOutput(t14); + fusion->addOutput(t12); + fusion->addOutput(t4); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + + std::vector aten_inputs({t0, 0.125}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensors - floatx2, half x2, bool - t12, t4, t0, t19, t14 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * 13); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3, + setup_vit_base_patch16_224_norm_inner3, + NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3, + nullptr); + +// Norm inner dim +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3) + ->Args({64, 12, 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_bcast_outer6( + Fusion* fusion, + void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Half); + auto t2 = makeContigTensor(1, DataType::Float); + + fusion->addInput(t0); + fusion->addInput(t2); + + auto t1 = castOp(DataType::Float, t0); + auto t3 = set(t2); + auto t4 = broadcast(t3, {true, true, false}); + auto t5 = add(t1, t4); + auto t6 = set(t5); + auto t7 = mul(t6, IrBuilder::create(0.707106)); + auto t8 = erf(t7); + auto t9 = add(IrBuilder::create(1), t8); + auto t10 = mul(IrBuilder::create(0.5), t9); + auto t11 = mul(t6, t10); + auto t12 = randlike(t11); + auto d66 = sub(IrBuilder::create(1), IrBuilder::create(0)); + auto t13 = lt(t12, d66); + auto t14 = castOp(DataType::Float, t13); + auto t15 = mul(t11, t14); + auto b68 = eq(d66, IrBuilder::create(0)); + auto d69 = castOp(DataType::Double, b68); + auto d70 = add(d69, d66); + auto d72 = div(IrBuilder::create(1), d70); + auto t16 = mul(t15, d72); + auto t17 = set(t16); + auto t18 = castOp(DataType::Half, t17); + auto t19 = castOp(DataType::Half, t3); + + fusion->addOutput(t18); + fusion->addOutput(t13); + fusion->addOutput(t6); + fusion->addOutput(t19); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + auto t2 = at::randn({input_shape[2]}, fp32_options); + + std::vector aten_inputs({t0, t2}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + // full tensors - float, halfx2, bool 
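The bcast_outer6 fusion above (and its bcast_inner6 mirror further down) bolts an erf-based GELU and an inverted dropout onto the broadcast add; read as a formula, with the interpretation inferred from the arithmetic rather than stated in the diff:

  // t6  = x + broadcast(bias)
  // t11 = 0.5 * t6 * (1 + erf(t6 * 0.707106))      erf form of GELU
  // t18 = (t11 * mask) / keep_prob, cast to half   inverted dropout, keep prob 1 - 0.0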
- t6, t0, t18, t13 + // inner dimension only - float, half - t2, t19 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * (2 + 2 + 1 + 4) + + input_shape[2] * (4 + 2)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6, + setup_vit_base_patch16_224_bcast_outer6, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6, + nullptr); + +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6) + // First size is original, the rest are variations to check perf + // reliability. + ->Args({64, 197, 3 * 1024}) + ->Args({64, 197, 2 * 1024}) + ->Args({64, 197, 1024}) + ->Args({64, 197, 512}) + ->Args({3, 1024, 64 * 197}) + ->Args({2, 1024, 64 * 197}) + ->Args({1, 1024, 64 * 197}) + ->Args({2, 256, 64 * 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +// Reverse the broadcast dimensions to check for consistency in scheduling. +static void setup_vit_base_patch16_224_bcast_inner6( + Fusion* fusion, + void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Half); + auto t2 = makeContigTensor(2, DataType::Float); + + fusion->addInput(t0); + fusion->addInput(t2); + + auto t1 = castOp(DataType::Float, t0); + auto t3 = set(t2); + auto t4 = broadcast(t3, {false, false, true}); + auto t5 = add(t1, t4); + auto t6 = set(t5); + auto t7 = mul(t6, IrBuilder::create(0.707106)); + auto t8 = erf(t7); + auto t9 = add(IrBuilder::create(1), t8); + auto t10 = mul(IrBuilder::create(0.5), t9); + auto t11 = mul(t6, t10); + auto t12 = randlike(t11); + auto d66 = sub(IrBuilder::create(1), IrBuilder::create(0)); + auto t13 = lt(t12, d66); + auto t14 = castOp(DataType::Float, t13); + auto t15 = mul(t11, t14); + auto b68 = eq(d66, IrBuilder::create(0)); + auto d69 = castOp(DataType::Double, b68); + auto d70 = add(d69, d66); + auto d72 = div(IrBuilder::create(1), d70); + auto t16 = mul(t15, d72); + auto t17 = set(t16); + auto t18 = castOp(DataType::Half, t17); + auto t19 = castOp(DataType::Half, t3); + + fusion->addOutput(t18); + fusion->addOutput(t13); + fusion->addOutput(t6); + fusion->addOutput(t19); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options); + auto t2 = at::randn({input_shape[0], input_shape[1]}, fp32_options); + + std::vector aten_inputs({t0, t2}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // full tensors - float, halfx2, bool - t6, t0, t18, t13 + // outer two dimensions only - float, half - t2, t19 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t0.numel() * (2 + 2 + 1 + 4) + + input_shape[0] * input_shape[1] * (4 + 2)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6, + setup_vit_base_patch16_224_bcast_inner6, + NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6, + nullptr); + +NVFUSER_BENCHMARK_RUN( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6) + ->Args({64, 197, 3 * 1024}) + ->Args({64, 197, 2 * 1024}) + ->Args({64, 197, 1024}) + ->Args({64, 197, 512}) 
+ ->Args({3, 1024, 64 * 197}) + ->Args({2, 1024, 64 * 197}) + ->Args({1, 1024, 64 * 197}) + ->Args({2, 256, 64 * 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t0 = makeContigTensor(3, DataType::Bool); + fusion->addInput(t0); + + auto t1 = makeContigTensor(3, DataType::Half); + fusion->addInput(t1); + + auto t2 = castOp(DataType::Float, t1); + + auto t3 = makeContigTensor(3, DataType::Half); + fusion->addInput(t3); + + auto t4 = castOp(DataType::Float, t3); + + auto d35 = t3->axis(2)->extent(); + + auto t5 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + fusion->addInput(t5); + + auto t6 = TensorViewBuilder() + .shape({-1, -1, 1}) + .dtype(DataType::Float) + .contiguity({true, true, false}) + .build(); + fusion->addInput(t6); + + auto t7 = makeContigTensor(1, DataType::Half); + fusion->addInput(t7); + + auto t8 = castOp(DataType::Float, t7); + + auto t9 = makeContigTensor(1, DataType::Half); + fusion->addInput(t9); + + auto t11 = sub(t4, t5); + auto t12 = mul(t11, t6); + + auto t13 = broadcast(t8, {true, true, false}); + auto t14 = mul(t2, t13); + auto t15 = mul(d35, t14); + auto t16 = sum(t14, {2}); + auto t17 = broadcast(t16, {false, false, true}); + auto t18 = mul(t14, t12); + auto t19 = sum(t18, {2}); + auto t20 = broadcast(t19, {false, false, true}); + + auto t40 = castOp(DataType::Half, t12); + auto t41 = castOp(DataType::Float, t40); + auto t42 = castOp(DataType::Half, t20); + auto t43 = castOp(DataType::Float, t42); + auto t21 = mul(t42, t43); + + auto t38 = castOp(DataType::Half, t15); + auto t39 = castOp(DataType::Float, t38); + auto t44 = castOp(DataType::Half, t17); + auto t45 = castOp(DataType::Float, t44); + auto t22 = sub(t39, t45); + + auto t23 = sub(t22, t21); + + auto d87 = reciprocal(d35); + auto t24 = mul(d87, t6); + + auto t25 = mul(t24, t23); + auto t26 = mul(t2, t41); + auto t27 = sum(t26, {0, 1}); + auto t28 = sum(t2, {0, 1}); + + auto t29 = castOp(DataType::Float, t0); + auto t30 = mul(t25, t29); + + auto d33 = IrBuilder::create(); + fusion->addInput(d33); + auto t31 = mul(t30, d33); + auto t32 = sum(t31, {0, 1}); + auto t33 = castOp(DataType::Half, t32); + auto t34 = castOp(DataType::Half, t31); + auto t35 = castOp(DataType::Half, t25); + auto t36 = castOp(DataType::Half, t27); + auto t37 = castOp(DataType::Half, t28); + + fusion->addOutput(t33); + fusion->addOutput(t34); + fusion->addOutput(t35); + fusion->addOutput(t36); + fusion->addOutput(t37); +} + +static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(1), + benchmark_state.range(2)}; + + at::manual_seed(0); + // auto bool_options = at::TensorOptions().dtype(at::kBool).device(at::kCUDA, + // 0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + auto fp32_options = + at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(input_shape, fp16_options).to(at::kBool); + auto t1 = at::randn(input_shape, fp16_options); + auto t3 = at::randn(input_shape, fp16_options); + auto t5 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t6 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options); + auto t7 = at::randn({input_shape[2]}, fp16_options); + auto t9 = 
at::randn({input_shape[2]}, fp16_options); + + std::vector aten_inputs({t0, t1, t3, t5, t6, t7, t9, 1.0}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensors - bool, halfx4 - t0, t1, t3, t34, t35 + // Outer two dimensions - floatx2 - t5, t6 + // Inner dimension - halfx5 - t7, t9, t33, t36, t37 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * ((t0.numel() * (4 * 2 + 1))) + + (t5.numel() * 4 * 2) + (t7.numel() * 5 * 2)); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD, + setup_vit_base_patch16_224_LN_BWD, + NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD, + nullptr); + +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD) + ->Args({128, 197, 768}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +static void nhwc_seresnet152d_transpose65(Fusion* fusion, void* null) { + FusionGuard fg(fusion); + + auto t2 = makeContigTensor(4, DataType::Half); + auto t5 = makeContigTensor(4, DataType::Half); + auto t7 = makeContigTensor(4, DataType::Half); + auto t9 = makeContigTensor(4, DataType::Half); + auto t4 = makeConcreteTensor({}, DataType::Half); + + fusion->addInput(t2); + fusion->addInput(t5); + fusion->addInput(t7); + fusion->addInput(t9); + fusion->addInput(t4); + + auto d86 = IrBuilder::create(0); + + auto t3 = castOp(DataType::Float, t2); + auto t6 = castOp(DataType::Float, t5); + auto t8 = castOp(DataType::Float, t7); + auto t10 = castOp(DataType::Float, t9); + auto t11 = add(t8, t10); + auto t12 = set(t11); + auto t13 = set(t6); + auto t14 = lt(t13, d86); + auto t15 = broadcast(t4, {true, true, true, true}); + auto t16 = where(t14, t15, t12); + auto t17 = set(t16); + auto t29 = castOp(DataType::Half, t17); + auto t18 = mul(t17, t3); + auto t19 = transpose(t18, {{0, 0}, {1, 3}, {2, 1}, {3, 2}}); + auto t30 = castOp(DataType::Half, t19); + + fusion->addOutput(t29); + fusion->addOutput(t30); +} + +static void NvFuserScheduler_nhwc_seresnet152d_transpose65( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + void* null) { + std::vector input_shape{ + benchmark_state.range(0), + benchmark_state.range(2), + benchmark_state.range(2), + benchmark_state.range(1)}; + + at::manual_seed(0); + auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + + auto t2 = at::randn(input_shape, fp16_options); + auto t5 = at::randn(input_shape, fp16_options); + auto t7 = at::randn(input_shape, fp16_options); + auto t9 = at::randn(input_shape, fp16_options); + // Need zero dim tensor don't know how to do that, so just going to reduce a + // 1D tensor + auto t4 = at::randn({2}, fp16_options).sum(); + + std::vector aten_inputs({t2, t5, t7, t9, t4}); + runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + + // Full tensors - halfx6 - t2, t5, t7, t9, t29, t30 + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * t2.numel() * 6 * 2); +} + +NVFUSER_BENCHMARK_DEFINE( + NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65, + nhwc_seresnet152d_transpose65, + NvFuserScheduler_nhwc_seresnet152d_transpose65, + nullptr); + +// Norm inner dim Half version of vit_base_patch16_224_norm_inner3 +NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65) + ->Args({128, 12, 197}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); diff --git a/benchmarks/cpp/nvfuser/transpose.cpp b/benchmarks/cpp/nvfuser/transpose.cpp new file mode 100644 index 
000000000000..39ee0452c160 --- /dev/null +++ b/benchmarks/cpp/nvfuser/transpose.cpp @@ -0,0 +1,483 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#define TRANSPOSE_CONFIG {true, false, false, false} + +using namespace torch::jit::fuser::cuda; + +struct TransposeConfig { + bool input1_transpose_axes = false; + bool input2_transpose_axes = false; + bool intermediate_transpose_axes = false; + bool output_transpose_axes = false; +}; + +std::vector generateInputs( + DataType dtype, + int num_dims, + std::pair axes, + int perm_size, + int innerdim_size, + bool input1_transpose_axes, + bool input2_transpose_axes, + bool non_vectorize_offset = false, + int iter_size = 32) { + at::manual_seed(0); + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + + std::vector transpose_shape(num_dims, iter_size); + transpose_shape[axes.second] = innerdim_size; + transpose_shape[axes.first] = perm_size; + + std::vector non_transpose_shape(num_dims, iter_size); + non_transpose_shape[axes.first] = innerdim_size; + non_transpose_shape[axes.second] = perm_size; + + // TensorType: Concrete, Contig, Symbolic + // Vectorization | Unroll - Add 1 to sizes + // Shift axis by 1 to disable vectorize loads + if (non_vectorize_offset) { + for (auto idx : c10::irange(transpose_shape.size())) { + transpose_shape[idx] += 1; + } + for (auto idx : c10::irange(non_transpose_shape.size())) { + non_transpose_shape[idx] += 1; + } + } + + auto optionalTransposeSize = + [&transpose_shape, &non_transpose_shape](bool transpose_tensor) { + return (transpose_tensor) ? transpose_shape : non_transpose_shape; + }; + + at::Tensor aten_input1 = + at::randn(optionalTransposeSize(input1_transpose_axes), options); + at::Tensor aten_input2 = + at::randn(optionalTransposeSize(input2_transpose_axes), options); + return {aten_input1, aten_input2}; +} + +//------------------------------------------------------------------------------ + +static void setupTranspose( + Fusion* fusion, + DataType dtype, + int num_dims, + std::pair axes, + TransposeConfig tc) { + FusionGuard fg(fusion); + typedef std::pair transpose_axes; + + auto getTransposeMap = + [](const transpose_axes& axes) -> std::unordered_map { + return {{axes.first, axes.second}, {axes.second, axes.first}}; + }; + + auto optionalTranspose = [&getTransposeMap, axes]( + TensorView* tv, bool is_transpose) { + return (is_transpose) ? 
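The four flags in TransposeConfig select which tensors in the fused pattern receive the axis swap, so TRANSPOSE_CONFIG as defined above transposes only the first input. A sketch of the computation each benchmark instance runs, as built by setupTranspose just below (maybeT meaning "transpose if the corresponding flag is set"):

  // out = maybeT_output( relu( maybeT_intermediate( maybeT_in1(in1) + maybeT_in2(in2) ) ) )
  // with TRANSPOSE_CONFIG = {true, false, false, false}:
  // out = relu( transpose(in1, axis1, axis2) + in2 )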
transpose(tv, getTransposeMap(axes)) : tv; + }; + + auto input1 = makeContigTensor(num_dims); + auto input2 = makeContigTensor(num_dims); + fusion->addInput(input1); + fusion->addInput(input2); + + auto ot_input1 = optionalTranspose(input1, tc.input1_transpose_axes); + auto ot_input2 = optionalTranspose(input2, tc.input2_transpose_axes); + auto intermediate = add(ot_input1, ot_input2); + auto ot_intermediate = + optionalTranspose(intermediate, tc.intermediate_transpose_axes); + auto output = relu(ot_intermediate); + auto ot_output = optionalTranspose(output, tc.output_transpose_axes); + fusion->addOutput(ot_output); +} + +static void NvFuserScheduler_Transpose( + benchmark::State& benchmark_state, + FusionExecutorCache* fusion_executor_cache, + DataType dtype, + int num_dims, + std::pair axes, + TransposeConfig tc) { + auto aten_inputs = generateInputs( + dtype, + num_dims, + axes, + benchmark_state.range(0), + benchmark_state.range(1), + tc.input1_transpose_axes, + tc.input2_transpose_axes); + auto at_input1 = aten_inputs[0]; + auto at_input2 = aten_inputs[1]; + + std::vector fuser_inputs = {at_input1, at_input2}; + runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + ((at_input1.numel() * 3) * int64_t(dataTypeSize(dtype)))); +} + +//------------------------------------------------------------------------------ + +#define NVFUSER_TRANSPOSE_SQUARE_RUN( \ + TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \ + NVFUSER_BENCHMARK_DEFINE( \ + TITLE, \ + setupTranspose, \ + NvFuserScheduler_Transpose, \ + DTYPE, \ + NUM_DIMS, \ + {AXIS1, AXIS2}, \ + CONFIG); \ + \ + NVFUSER_BENCHMARK_RUN(TITLE) \ + ->RangeMultiplier(8) \ + ->Args({9, 2408}) \ + ->Args({16, 512}) \ + ->Args({18, 96}) \ + ->Args({24, 96}) \ + ->Args({24, 256}) \ + ->Args({24, 512}) \ + ->Args({32, 27}) \ + ->Args({32, 96}) \ + ->Args({32, 288}) \ + ->Args({32, 864}) \ + ->Args({40, 120}) \ + ->Args({48, 128}) \ + ->Args({48, 256}) \ + ->Args({49, 512}) \ + ->Args({49, 1024}) \ + ->Args({49, 2048}) \ + ->Args({49, 4608}) \ + ->Args({64, 64}) \ + ->Args({64, 96}) \ + ->Args({64, 128}) \ + ->Args({64, 147}) \ + ->Args({64, 192}) \ + ->Args({64, 256}) \ + ->Args({64, 288}) \ + ->Args({64, 512}) \ + ->Args({80, 64}) \ + ->Args({81, 1728}) \ + ->Args({83, 1728}) \ + ->Args({96, 864}) \ + ->Args({100, 1280}) \ + ->Args({100, 4032}) \ + ->Args({120, 40}) \ + ->Args({128, 128}) \ + ->Args({128, 512}) \ + ->Args({128, 1152}) \ + ->Args({192, 128}) \ + ->Args({192, 256}) \ + ->Args({192, 720}) \ + ->Args({192, 768}) \ + ->Args({192, 1120}) \ + ->Args({192, 1728}) \ + ->Args({196, 256}) \ + ->Args({196, 512}) \ + ->Args({196, 1024}) \ + ->Args({196, 2304}) \ + ->Args({256, 256}) \ + ->Args({256, 1024}) \ + ->Args({256, 2304}) \ + ->Args({284, 512}) \ + ->Args({320, 1280}) \ + ->Args({320, 1728}) \ + ->Args({324, 2592}) \ + ->Args({361, 768}) \ + ->Args({361, 1120}) \ + ->Args({384, 2}) \ + ->Args({384, 32}) \ + ->Args({384, 128}) \ + ->Args({384, 256}) \ + ->Args({384, 512}) \ + ->Args({384, 1280}) \ + ->Args({384, 2592}) \ + ->Args({384, 4032}) \ + ->Args({448, 1280}) \ + ->Args({480, 16}) \ + ->Args({480, 256}) \ + ->Args({512, 2}) \ + ->Args({512, 16}) \ + ->Args({512, 128}) \ + ->Args({512, 256}) \ + ->Args({512, 1024}) \ + ->Args({512, 2048}) \ + ->Args({512, 3072}) \ + ->Args({512, 4608}) \ + ->Args({784, 40}) \ + ->Args({784, 120}) \ + ->Args({784, 128}) \ + ->Args({784, 1152}) \ + ->Args({1001, 2408}) \ + ->Args({1024, 16}) \ + 
->Args({1024, 256}) \ + ->Args({1024, 512}) \ + ->Args({1024, 1024}) \ + ->Args({1024, 3072}) \ + ->Args({1369, 192}) \ + ->Args({1369, 256}) \ + ->Args({1369, 288}) \ + ->Args({2048, 512}) \ + ->Args({2048, 1024}) \ + ->Args({2250, 27}) \ + ->Args({3072, 512}) \ + ->Args({3072, 1024}) \ + ->Args({3136, 64}) \ + ->Args({5329, 720}) \ + ->Args({5625, 64}) \ + ->Args({12544, 147}) \ + ->Args({22201, 288}) \ + ->Unit(benchmark::kMicrosecond) + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Inner_2D_01_Axis, + DataType::Float, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Inner_3D_02_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Inner_3D_12_Axis, + DataType::Float, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp32_Outer_3D_01_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Inner_2D_01_Axis, + DataType::Half, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Inner_3D_02_Axis, + DataType::Half, + 3 /* num_dims */, + 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Inner_3D_12_Axis, + DataType::Half, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_SQUARE_RUN( + NF_Transpose_Random_fp16_Outer_3D_01_Axis, + DataType::Half, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + + +#define NVFUSER_TRANSPOSE_RUN(TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \ + NVFUSER_BENCHMARK_DEFINE( \ + TITLE, \ + setupTranspose, \ + NvFuserScheduler_Transpose, \ + DTYPE, \ + NUM_DIMS, \ + {AXIS1, AXIS2}, \ + CONFIG); \ + \ + NVFUSER_BENCHMARK_RUN(TITLE) \ + ->RangeMultiplier(8) \ + ->Ranges({{2, 256 * 256}, {160, 320}}) \ + ->Unit(benchmark::kMicrosecond) \ + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Inner_2D_01_Axis, + DataType::Float, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Inner_3D_02_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Inner_3D_12_Axis, + DataType::Float, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp32_Outer_3D_01_Axis, + DataType::Float, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Inner_2D_01_Axis, + DataType::Half, + 2 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Inner_3D_02_Axis, + DataType::Half, + 3 /* num_dims */, 
+ 0 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Inner_3D_12_Axis, + DataType::Half, + 3 /* num_dims */, + 1 /* axis1 */, + 2 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +NVFUSER_TRANSPOSE_RUN( + NF_Transpose_fp16_Outer_3D_01_Axis, + DataType::Half, + 3 /* num_dims */, + 0 /* axis1 */, + 1 /* axis2 */, + TransposeConfig(TRANSPOSE_CONFIG)); + +//------------------------------------------------------------------------------ + +static void Baseline_Transpose( + benchmark::State& benchmark_state, + DataType dtype, + int num_dims, + std::pair axes, + TransposeConfig tc) { + auto aten_inputs = generateInputs( + dtype, + num_dims, + axes, + benchmark_state.range(0), + benchmark_state.range(1), + tc.input1_transpose_axes, + tc.input2_transpose_axes); + auto at_input1 = aten_inputs[0]; + auto at_input2 = aten_inputs[1]; + + auto optionalTransposeAten = [&axes](at::Tensor at, bool is_transpose) { + return (is_transpose) ? at::transpose(at, axes.first, axes.second) : at; + }; + + for (auto _ : benchmark_state) { + clearL2Cache(); + CudaKernelTimer timer; + + auto at_ot_input1 = + optionalTransposeAten(at_input1, tc.input1_transpose_axes); + auto at_ot_input2 = + optionalTransposeAten(at_input2, tc.input2_transpose_axes); + auto at_intermediate = add(at_ot_input1, at_ot_input2); + auto at_ot_intermediate = + optionalTransposeAten(at_intermediate, tc.intermediate_transpose_axes); + auto at_output = relu(at_ot_intermediate); + auto at_ot_output = + optionalTransposeAten(at_output, tc.output_transpose_axes); + + benchmark_state.SetIterationTime(timer.elapsed() / 1000.0); + } + // Sync everything up before we're finished, don't want to run ahead on the + // cpu while benchmarking. + cudaDeviceSynchronize(); + + benchmark_state.SetBytesProcessed( + int64_t(benchmark_state.iterations()) * + (at_input1.numel() * 3 * int64_t(dataTypeSize(dtype)))); +} + +//------------------------------------------------------------------------------ + +static void Baseline_Transpose_fp32_Inner_2D_01_Axis( + benchmark::State& benchmark_state) { + Baseline_Transpose( + benchmark_state, + DataType::Float, + 2 /* num_dims */, + {0, 1} /* axes */, + TRANSPOSE_CONFIG); +} + +static void Baseline_Transpose_fp16_Inner_2D_01_Axis( + benchmark::State& benchmark_state) { + Baseline_Transpose( + benchmark_state, + DataType::Half, + 2 /* num_dims */, + {0, 1} /* axes */, + TRANSPOSE_CONFIG); +} + +//------------------------------------------------------------------------------ + +BENCHMARK(Baseline_Transpose_fp32_Inner_2D_01_Axis) + // ->RangeMultiplier(2) + ->Ranges({{2, 1024 * 1024}, {160, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +BENCHMARK(Baseline_Transpose_fp16_Inner_2D_01_Axis) + // ->RangeMultiplier(2) + ->Ranges({{2, 1024 * 1024}, {160, 320}}) + ->Unit(benchmark::kMicrosecond) + ->UseManualTime(); + +//------------------------------------------------------------------------------ diff --git a/benchmarks/cpp/nvfuser/utils.cpp b/benchmarks/cpp/nvfuser/utils.cpp index 053fc6939082..c15248bce71d 100644 --- a/benchmarks/cpp/nvfuser/utils.cpp +++ b/benchmarks/cpp/nvfuser/utils.cpp @@ -1,4 +1,4 @@ -#include "utils.h" +#include #include @@ -16,8 +16,8 @@ std::string toString(ReductionParams rparams) { if (rparams.schedule_3D) { ss << "3D Schedule // " << "Outer Reduction: " - << (rparams.cross_block_outer_reduce ? "cross block / " : "") - << (rparams.cross_grid_outer_reduce ? 
"cross grid / " : "") + << (rparams.cross_block_outer_reduction ? "cross block / " : "") + << (rparams.cross_grid_outer_reduction ? "cross grid / " : "") << (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : ""); if (rparams.batches_per_block_outer_reduction > 1 || rparams.persistent_kernel) { @@ -31,16 +31,17 @@ std::string toString(ReductionParams rparams) { : "") << (rparams.split_grid_dim_iter_dom ? "split grid dimension / " : "") << (rparams.vectorize_iter_dom ? "vectorize / " : "") - << (rparams.unroll_iter_dom && !rparams.vectorize_iter_dom ? "unroll / " - : ""); - if (rparams.unroll_iter_dom || rparams.vectorize_iter_dom) { + << (rparams.unroll_factor_iter_dom > 1 && !rparams.vectorize_iter_dom + ? "unroll / " + : ""); + if (rparams.unroll_factor_iter_dom > 1 || rparams.vectorize_iter_dom) { ss << "factor " << rparams.unroll_factor_iter_dom; } ss << " // Inner Reduction Domain: " - << (rparams.cross_block_inner_reduce ? "cross block reduction / " : "") + << (rparams.cross_block_inner_reduction ? "cross block reduction / " : "") << (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "") - << (rparams.cross_grid_inner_reduce ? "cross grid reduction / " : ""); + << (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : ""); if (rparams.batches_per_block_inner_reduction > 1 || rparams.persistent_kernel) { @@ -48,15 +49,17 @@ std::string toString(ReductionParams rparams) { << " / "; } - ss << (rparams.cross_grid_inner_reduce && + ss << (rparams.cross_grid_inner_reduction && rparams.split_grid_dim_inner_reduction ? "split grid dimension / " : "") << (rparams.vectorize_inner_reduction ? "vectorize / " : "") - << (rparams.unroll_inner_reduction && !rparams.vectorize_inner_reduction + << (rparams.unroll_factor_inner_reduction > 1 && + !rparams.vectorize_inner_reduction ? 
"unroll / " : ""); - if (rparams.unroll_inner_reduction || rparams.vectorize_inner_reduction) { + if (rparams.unroll_factor_inner_reduction > 1 || + rparams.vectorize_inner_reduction) { ss << "factor " << rparams.unroll_factor_inner_reduction; } return ss.str(); @@ -76,11 +79,11 @@ std::string toString(PointwiseParams params) { ss << "1D" << "/"; } - if (params.inner_factor > 1) { + if (params.unroll_factor > 1) { if (params.vectorize) { - ss << "Vectorize, Factor: " << params.inner_factor; + ss << "Vectorize, Factor: " << params.unroll_factor; } else { - ss << "Unroll, Factor: " << params.inner_factor; + ss << "Unroll, Factor: " << params.unroll_factor; } } return ss.str(); @@ -108,6 +111,10 @@ void clearL2Cache() { torch::Tensor t1 = torch::clone(t0); }; +TensorView* makeSymbolicTensor(size_t ndims, DataType dtype) { + return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); +} + TensorView* makeContigTensor(size_t ndims, DataType dtype) { return TensorViewBuilder() .ndims(ndims) @@ -116,24 +123,50 @@ TensorView* makeContigTensor(size_t ndims, DataType dtype) { .build(); } +TensorView* makeConcreteTensor( + std::vector shape, + DataType dtype) { + return TensorViewBuilder().shape(shape).dtype(dtype).build(); +} + +TensorView* makeContigConcreteTensor( + std::vector shape, + DataType dtype) { + return TensorViewBuilder() + .shape(shape) + .dtype(dtype) + .contiguity(std::vector(shape.size(), true)) + .build(); +} + void runBenchmarkIterations( benchmark::State& benchmark_state, FusionExecutorCache* fusion_executor_cache, std::vector& aten_inputs) { fusion_executor_cache->runFusionWithInputs(aten_inputs); bool segmented = - fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented(); + fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented() && + fusion_executor_cache->getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size() > 1; if (!segmented) { fusion_executor_cache->profile(true); fusion_executor_cache->runFusionWithInputs(aten_inputs); auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); auto executor_instance = compile_log.fusion_executor; - TORCH_INTERNAL_ASSERT(compile_log.reduction_params.has_value()); - TORCH_INTERNAL_ASSERT(compile_log.launch_constraints.has_value()); - auto rparams = toString(compile_log.reduction_params.value()); - auto lparams = toString(compile_log.launch_constraints.value()); - benchmark_state.SetLabel(rparams + lparams); + + if (compile_log.reduction_params.has_value()) { + auto rparams = toString(compile_log.reduction_params.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); + benchmark_state.SetLabel(rparams + lparams); + } else if (compile_log.pointwise_params.has_value()){ + auto pparams = toString(compile_log.pointwise_params.value()); + auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); + benchmark_state.SetLabel(pparams + lparams); + } + executor_instance->setMeasureKernelTimeFlag(true); // Sync everything up before we start diff --git a/benchmarks/cpp/nvfuser/utils.h b/benchmarks/cpp/nvfuser/utils.h index b4a2f3a7a916..176290fd76f3 100644 --- a/benchmarks/cpp/nvfuser/utils.h +++ b/benchmarks/cpp/nvfuser/utils.h @@ -18,6 +18,24 @@ using namespace torch::jit::fuser::cuda; +// Make a tensor that is known to be non-contiguous of dimensionality=ndims, +// but unknown sizes +TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float); + +// Make a tensor that is known to be fully contiguous of dimensionality=ndims, +// but 
unknown sizes. Taken from test_gpu.cpp +TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float); + +// Make a non-contiguous tensor of compile-time known sizes +TensorView* makeConcreteTensor( + std::vector shape, + DataType dtype = DataType::Float); + +// Make a contiguous tensor of compile-time known sizes +TensorView* makeContigConcreteTensor( + std::vector shape, + DataType dtype = DataType::Float); + std::string toString(ReductionParams rparams); std::string toString(PointwiseParams params); std::string toString(LaunchParams lparams); @@ -32,10 +50,6 @@ void runBenchmarkIterations( void clearL2Cache(); -// Make a tensor that is known to be fully contiguous of dimensionality=ndims, -// but unknown sizes. Taken from test_gpu.cpp -TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float); - class CudaKernelTimer { public: CudaKernelTimer() { diff --git a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp index eddac0a46394..77e86020f28a 100644 --- a/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_batchnorm.cpp @@ -82,10 +82,8 @@ BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor output = Compute( - "output", - {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, - [&](axis n, axis c, axis h, axis w) { + Tensor output = + Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) { // Compute affine terms. auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps); auto weight_v = weight.load(c); @@ -143,10 +141,8 @@ BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) { VarHandle eps("eps", kFloat); using axis = const VarHandle&; - Tensor output = Compute( - "output", - {{N_, "N"}, {C_, "C"}, {H_, "H"}, {W_, "W"}}, - [&](axis n, axis c, axis h, axis w) { + Tensor output = + Compute("output", {N_, C_, H_, W_}, [&](axis n, axis c, axis h, axis w) { // Compute affine terms. 
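For readers outside NNC, the "affine terms" being set up in the batch-norm kernels here are just the per-channel inference factors. Assuming the standard batch-norm inference formula (only the inv_var and weight loads are visible in this hunk), the kernel evaluates, per element,

    output[n, c, h, w] = weight[c] * (input[n, c, h, w] - mean[c]) / sqrt(var[c] + eps) + bias[c]

with the NNCRelu variant additionally clamping the result at zero.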
auto inv_var = FloatImm::make(1.0f) / sqrt(var.load(c) + eps); auto weight_v = weight.load(c); diff --git a/benchmarks/cpp/tensorexpr/bench_compile.cpp b/benchmarks/cpp/tensorexpr/bench_compile.cpp index 13a02ee7723d..be60f9cd599b 100644 --- a/benchmarks/cpp/tensorexpr/bench_compile.cpp +++ b/benchmarks/cpp/tensorexpr/bench_compile.cpp @@ -12,26 +12,21 @@ static void BM_CompileSwish(benchmark::State& state) { constexpr int N = 512; te::VarHandle n("n", te::kInt); te::BufHandle A("A", {N}, te::kFloat); - te::Tensor relu = - te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Max::make(A.load(i), 0.f, false); - }); - te::Tensor min6 = - te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu.load(i), 6.f, false); - }); - te::Tensor plus3 = - te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6.load(i) + 3.f; - }); - te::Tensor times = - te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3.load(i); - }); - te::Tensor sixth = - te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times.load(i) * 1.f / 6.f; - }); + te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) { + return te::Max::make(A.load(i), 0.f, false); + }); + te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); + }); + te::Tensor plus3 = te::Compute("plus3", {n}, [&](const te::VarHandle& i) { + return min6.load(i) + 3.f; + }); + te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); + }); + te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) { + return times.load(i) * 1.f / 6.f; + }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { nest.computeInline(tensor.buf()); @@ -46,26 +41,20 @@ static void BM_CompileSwishLLVMOnly(benchmark::State& state) { constexpr int N = 512; te::VarHandle n("n", te::kInt); te::BufHandle A("A", {N}, te::kFloat); - te::Tensor relu = - te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Max::make(A.load(i), 0.f, false); - }); - te::Tensor min6 = - te::Compute("min6", {{n, "n"}}, [&](const te::VarHandle& i) { - return te::Min::make(relu.load(i), 6.f, false); - }); - te::Tensor plus3 = - te::Compute("plus3", {{n, "n"}}, [&](const te::VarHandle& i) { - return min6.load(i) + 3.f; - }); - te::Tensor times = - te::Compute("times", {{n, "n"}}, [&](const te::VarHandle& i) { - return A.load(i) * plus3.load(i); - }); - te::Tensor sixth = - te::Compute("sixth", {{n, "n"}}, [&](const te::VarHandle& i) { - return times.load(i) * 1.f / 6.f; - }); + te::Tensor relu = te::Compute("relu", {n}, [&](const te::VarHandle& i) { + return te::Max::make(A.load(i), 0.f, false); + }); + te::Tensor min6 = te::Compute("min6", {n}, [&](const te::VarHandle& i) { + return te::Min::make(relu.load(i), 6.f, false); + }); + te::Tensor plus3 = te::Compute( + "plus3", {n}, [&](const te::VarHandle& i) { return min6.load(i) + 3.f; }); + te::Tensor times = te::Compute("times", {n}, [&](const te::VarHandle& i) { + return A.load(i) * plus3.load(i); + }); + te::Tensor sixth = te::Compute("sixth", {n}, [&](const te::VarHandle& i) { + return times.load(i) * 1.f / 6.f; + }); te::LoopNest nest({sixth}, {relu, min6, plus3, times, sixth}); for (auto tensor : {relu, min6, plus3, times}) { nest.computeInline(tensor.buf()); diff --git 
a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp index 854092139aba..b7b97d02e3a8 100644 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp @@ -61,7 +61,7 @@ class ConcatBench : public benchmark::Fixture { Tensor output = Compute( "aten_cat", - {{output_size_[0], "M"}, {output_size_[1], "N"}}, + {output_size_[0], output_size_[1]}, [&](const VarHandle& m, const VarHandle& n) { int d = 0; std::vector cumulative_concat_dim_sizes(num_inputs); diff --git a/benchmarks/cpp/tensorexpr/bench_gemm.cpp b/benchmarks/cpp/tensorexpr/bench_gemm.cpp index 6d452368fc7a..403746578dff 100644 --- a/benchmarks/cpp/tensorexpr/bench_gemm.cpp +++ b/benchmarks/cpp/tensorexpr/bench_gemm.cpp @@ -44,12 +44,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); loop.prepareForCodegen(); te::StmtPtr s = loop.root_stmt(); @@ -66,12 +66,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { @@ -124,12 +124,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { @@ -182,12 +182,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { @@ -248,12 +248,12 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) { te::BufHandle BP("B", {K, N}, te::kFloat); te::Tensor CT = te::Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, te::Sum(), [&](const te::ExprHandle& m, const te::ExprHandle& n, const te::ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); te::LoopNest loop({CT}); { diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp index abc8c3de3f33..8d77a459c603 100644 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp @@ -38,7 +38,7 @@ class ParallelAdd : public benchmark::Fixture { BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor c_tensor = Compute("c", {{M, "m"}}, [&](const VarHandle& m) { + Tensor c_tensor = Compute("c", {M}, [&](const VarHandle& m) { return a_buf.load(m) + b_buf.load(m); }); LoopNest loop_nest({c_tensor}); diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index 
085505b52fe5..bf0fe21ca0b1 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -235,12 +235,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { te::BufHandle AP("A", {M}, te::kFloat); te::Tensor BT = te::Reduce( "reduce_full", - {{1, "N"}}, + {1}, te::Sum(), [&](const te::ExprHandle& n, const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); loop.prepareForCodegen(); @@ -266,12 +266,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { te::BufHandle AP("A", {M}, te::kFloat); te::Tensor BT = te::Reduce( "reduce_full", - {{1, "N"}}, + {1}, te::Sum(), [&](const te::ExprHandle& n, const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); const int kChunkSize = 8; @@ -305,12 +305,12 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { te::BufHandle AP("A", {M}, te::kFloat); te::Tensor BT = te::Reduce( "reduce_full", - {{1, "N"}}, + {1}, te::Sum(), [&](const te::ExprHandle& n, const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); const int kChunkSize = 8; @@ -349,7 +349,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) { {}, te::Sum(), [&](const te::ExprHandle& m) { return AP.load(m); }, - {{M, "M"}}); + {M}); te::LoopNest loop({BT}); te::BufPtr rfac_buf; @@ -392,8 +392,8 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) { const int kChunkSize = 8; te::BufHandle a("A", {M}, te::kFloat); - te::Tensor b = - te::computeSum({a, te::IntList({0}), false}, {}, at::kFloat, at::kCPU); + te::Tensor b = te::computeSum( + {a, te::IntList({0}), false}, {}, {}, at::kFloat, at::kCPU); te::LoopNest nest({b}); auto loops = nest.getLoopStmtsFor(b); @@ -456,8 +456,8 @@ BENCHMARK_REGISTER_F(Reduce2DCol, Torch) BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) { constexpr int kCacheSize = 1 << 12; te::BufHandle a("A", {M, N}, te::kFloat); - te::Tensor b = - te::computeSum({a, te::IntList({0}), false}, {N}, at::kFloat, at::kCPU); + te::Tensor b = te::computeSum( + {a, te::IntList({0}), false}, {N}, {1}, at::kFloat, at::kCPU); te::LoopNest nest({b}); auto sch = state.range(2); @@ -565,8 +565,8 @@ BENCHMARK_REGISTER_F(Reduce2DRow, Hand)->Args({1 << 18, 1 << 6}); BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) { constexpr int kChunkSize = 8; te::BufHandle a("A", {M, N}, te::kFloat); - te::Tensor b = - te::computeSum({a, te::IntList({1}), false}, {M}, at::kFloat, at::kCPU); + te::Tensor b = te::computeSum( + {a, te::IntList({1}), false}, {M}, {1}, at::kFloat, at::kCPU); te::LoopNest nest({b}); auto sch = state.range(2); diff --git a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp index 0454530f880f..568905acd7c4 100644 --- a/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp +++ b/benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp @@ -46,13 +46,13 @@ class SignedLog1pBench : public benchmark::Fixture { "input", {input_size_int_[0], input_size_int_[1]}, kFloat); Tensor abs_result = Compute( "aten_abs", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return abs(input_ph.load(m, n)); }); Tensor log1p_result = Compute( "aten_log1p", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& 
n) { return log1p(abs_result.load(m, n)); }); @@ -60,7 +60,7 @@ class SignedLog1pBench : public benchmark::Fixture { computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]}); Tensor output = Compute( "aten_mul", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return sign_result.load(m, n) * log1p_result.load(m, n); }); @@ -94,13 +94,13 @@ class SignedLog1pBench : public benchmark::Fixture { "input", {input_size_int_[0], input_size_int_[1]}, kFloat); Tensor abs_result = Compute( "aten_abs", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return abs(input_ph.load(m, n)); }); Tensor log_vml_result = Compute( "aten_log1p", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return log_vml(abs_result.load(m, n) + ExprHandle(1)); }); @@ -108,7 +108,7 @@ class SignedLog1pBench : public benchmark::Fixture { computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]}); Tensor output = Compute( "aten_mul", - {{input_size_int_[0], "M"}, {input_size_int_[1], "N"}}, + {input_size_int_[0], input_size_int_[1]}, [&](const VarHandle& m, const VarHandle& n) { return sign_result.load(m, n) * log_vml_result.load(m, n); }); diff --git a/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py b/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py index 6d4f0c689401..fd582ddd7781 100644 --- a/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py +++ b/benchmarks/distributed/rpc/parameter_server/trainer/ddp_models.py @@ -4,7 +4,7 @@ def basic_ddp_model(self, rank, model, process_group, hook_state, hook): r""" A function that creates a ddp_model and hook_state objects. - The ddp model is is initialized with a single device id and + The ddp model is initialized with a single device id and the process group. The ddp_model also registers the communication hook. 
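As a rough illustration of the wrapper the docstring above describes (not the benchmark's exact code), here is a minimal sketch in which `rank`, `model`, `process_group`, `hook_state`, and `hook` are assumed to be supplied by the trainer and a process group has already been initialized:

```python
from torch.nn.parallel import DistributedDataParallel as DDP

def build_basic_ddp_model(rank, model, process_group, hook_state, hook):
    # Wrap the module with a single device id and the given process group.
    ddp_model = DDP(model, device_ids=[rank], process_group=process_group)
    # Register the gradient-communication hook; DDP invokes it per bucket.
    ddp_model.register_comm_hook(hook_state, hook)
    return ddp_model, hook_state
```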
Args: diff --git a/benchmarks/fastrnns/bench.py b/benchmarks/fastrnns/bench.py index b7c315b27fef..8b4569a9d56b 100644 --- a/benchmarks/fastrnns/bench.py +++ b/benchmarks/fastrnns/bench.py @@ -6,6 +6,7 @@ import json import copy import time +from torch.autograd.profiler import record_function from .fuser import set_fuser from .runner import get_nn_runners @@ -73,7 +74,8 @@ def train_batch(modeldef): gc.collect() fwd_start_event.record() - forward_output = modeldef.forward(*modeldef.inputs) + with record_function("## forward ##"): + forward_output = modeldef.forward(*modeldef.inputs) fwd_end_event.record() # XXX: Use if need to print something diff --git a/benchmarks/fastrnns/fuser.py b/benchmarks/fastrnns/fuser.py index e1daab594c50..29d395055296 100644 --- a/benchmarks/fastrnns/fuser.py +++ b/benchmarks/fastrnns/fuser.py @@ -4,18 +4,18 @@ def set_fuser(fuser_name, executor_name): assert fuser_name in ['te', 'old', 'none', 'default'] if fuser_name == 'te': torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_override_can_fuse_on_gpu(True) torch._C._jit_set_texpr_fuser_enabled(True) elif fuser_name == 'old': torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(False) torch._C._jit_override_can_fuse_on_gpu(True) torch._C._jit_set_texpr_fuser_enabled(False) elif fuser_name == 'none': torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(False) torch._C._jit_override_can_fuse_on_gpu(False) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_set_texpr_fuser_enabled(False) @@ -25,12 +25,11 @@ def set_fuser(fuser_name, executor_name): # --executor overrides settings of --fuser if executor_name == 'profiling': torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) elif executor_name == 'simple': - torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(False) elif executor_name == 'legacy': torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) + torch._C._get_graph_executor_optimize(True) elif executor_name == 'default': pass diff --git a/benchmarks/functional_autograd_benchmark/README.md b/benchmarks/functional_autograd_benchmark/README.md index a5f106fec67d..32d194b5da52 100644 --- a/benchmarks/functional_autograd_benchmark/README.md +++ b/benchmarks/functional_autograd_benchmark/README.md @@ -20,6 +20,10 @@ export OMP_NUM_THREADS=10 git checkout master python setup.py develop +# Install dependencies: +# Scipy is required by detr +pip install scipy + # Run the benchmark for the base # This will use the GPU if available. pushd benchmarks/functional_autograd_benchmark @@ -46,3 +50,18 @@ popd - `compare.py` is the entry point to run the comparison script that generates a markdown table. - `torchaudio_models.py` and `torchvision_models.py` contains code extracted from torchaudio and torchvision to be able to run the models without having a specific version of these libraries installed. - `ppl_models.py`, `vision_models.py` and `audio_text_models.py` contain all the getter functions used for the benchmark. 
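The set_fuser changes in benchmarks/fastrnns/fuser.py above swap torch._C._jit_set_profiling_mode for torch._C._get_graph_executor_optimize. A minimal sketch of the 'te' configuration, using only the private torch._C hooks that appear in the diff (these are internal and may change between releases):

```python
import torch

def enable_te_fuser():
    # Use the profiling executor so shapes/dtypes are recorded for fusion.
    torch._C._jit_set_profiling_executor(True)
    # Despite the name, passing a value here sets the graph-executor optimize
    # flag; this is what replaces the old _jit_set_profiling_mode(True) call.
    torch._C._get_graph_executor_optimize(True)
    # Route fusion to the TensorExpr fuser on GPU; disable legacy CPU fusion.
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(True)
    torch._C._jit_set_texpr_fuser_enabled(True)
```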
+ + +### Benchmarking against `functorch` + +```bash +# Install stable functorch: +pip install functorch +# or install from source: +pip install git+https://github.com/pytorch/functorch + +# Run the benchmark for the base +# This will use the GPU if available. +pushd benchmarks/functional_autograd_benchmark +python functional_autograd_benchmark.py --output bench-with-functorch.txt +``` diff --git a/benchmarks/functional_autograd_benchmark/audio_text_models.py b/benchmarks/functional_autograd_benchmark/audio_text_models.py index 938e677ac38a..e731568afe7b 100644 --- a/benchmarks/functional_autograd_benchmark/audio_text_models.py +++ b/benchmarks/functional_autograd_benchmark/audio_text_models.py @@ -3,7 +3,11 @@ import torchaudio_models as models -from utils import extract_weights, load_weights, GetterReturnType +from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType + + +has_functorch = check_for_functorch() + def get_wav2letter(device: torch.device) -> GetterReturnType: N = 10 @@ -50,6 +54,12 @@ def get_deepspeech(device: torch.device) -> GetterReturnType: model = models.DeepSpeech(rnn_type=nn.LSTM, labels=labels, rnn_hidden_size=1024, nb_layers=5, audio_conf=audio_conf, bidirectional=True) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + model = model.to(device) criterion = nn.CTCLoss() params, names = extract_weights(model) @@ -71,6 +81,11 @@ def get_transformer(device: torch.device) -> GetterReturnType: ntoken = 50 model = models.TransformerModel(ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2) model.to(device) + + if has_functorch: + # disable dropout for consistency checking + model.eval() + criterion = nn.NLLLoss() params, names = extract_weights(model) diff --git a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py b/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py index aceb59e25b0d..1b0ef20902da 100644 --- a/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py +++ b/benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py @@ -6,6 +6,13 @@ from collections import defaultdict from typing import NamedTuple, Callable, List, Any +try: + import functorch as ft + has_functorch = True + print(f"Found functorch: {ft.__version__}") +except ImportError: + has_functorch = False + import ppl_models import vision_models import audio_text_models @@ -36,6 +43,65 @@ def jacrev(model, inp, strict=None): else: return getattr(functional, task) +def get_task_functorch(task: str) -> Callable: + + @torch.no_grad() + def vjp(model, inp, v=None, strict=None): + assert v is not None + out, vjpfunc = ft.vjp(model, *inp) + return out, vjpfunc(v) + + @torch.no_grad() + def jvp(model, inp, v=None, strict=None): + assert v is not None + return ft.jvp(model, inp, v) + + @torch.no_grad() + def vhp(model, inp, v=None, strict=None): + assert v is not None + argnums = tuple(range(len(inp))) + _, vjpfunc, aux = ft.vjp(ft.grad_and_value(model, argnums), *inp, has_aux=True) + return aux, vjpfunc(v) + + @torch.no_grad() + def hvp(model, inp, v=None, strict=None): + assert v is not None + argnums = tuple(range(len(inp))) + _, hvp_out, aux = ft.jvp(ft.grad_and_value(model, argnums), inp, v, has_aux=True) + return aux, hvp_out + + @torch.no_grad() + def jacfwd(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacfwd(model, argnums)(*inp) + + @torch.no_grad() + def jacrev(model, inp, 
v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacrev(model, argnums)(*inp) + + @torch.no_grad() + def hessian(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.hessian(model, argnums=argnums)(*inp) + + @torch.no_grad() + def hessian_fwdrev(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacfwd(ft.jacrev(model, argnums=argnums), argnums=argnums)(*inp) + + @torch.no_grad() + def hessian_revrev(model, inp, v=None, strict=None): + argnums = tuple(range(len(inp))) + return ft.jacrev(ft.jacrev(model, argnums=argnums), argnums=argnums)(*inp) + + if task in locals(): + return locals()[task] + elif task == "jacobian": + raise RuntimeError("functorch has no equivalent of autograd.functional.jacobian with vectorize=False yet") + else: + raise RuntimeError(f"Unsupported task: {task}") + # Listing of the different tasks FAST_TASKS_NO_DOUBLE_BACK = [ "vjp", @@ -99,7 +165,7 @@ def get_v_for(model: Callable, inp: InputsType, task: str) -> VType: return v -def run_once(model: Callable, inp: InputsType, task: str, v: VType) -> None: +def run_once(model: Callable, inp: InputsType, task: str, v: VType, **kwargs) -> None: func = get_task_func(task) if v is not None: @@ -107,7 +173,24 @@ def run_once(model: Callable, inp: InputsType, task: str, v: VType) -> None: else: res = func(model, inp, strict=True) -def run_model(model_getter: GetterType, args: Any, task: str) -> List[float]: +def run_once_functorch(model: Callable, inp: InputsType, task: str, v: VType, maybe_check_consistency=False) -> None: + func = get_task_functorch(task) + + if v is not None: + res = func(model, inp, v=v, strict=True) + else: + res = func(model, inp, strict=True) + + if maybe_check_consistency: + af_func = get_task_func(task) + if v is not None: + expected = af_func(model, inp, v=v, strict=True) + else: + expected = af_func(model, inp, strict=True) + atol = 1e-2 if task == "vhp" else 5e-3 + torch.testing.assert_close(res, expected, rtol=1e-5, atol=atol, msg=f"Consistency fail for task '{task}'") + +def run_model(model_getter: GetterType, args: Any, task: str, run_once_fn: Callable = run_once) -> List[float]: if args.gpu == -1: device = torch.device("cpu") @@ -121,14 +204,17 @@ def noop(): model, inp = model_getter(device) v = get_v_for(model, inp, task) + # Warmup - run_once(model, inp, task, v) + # maybe_check_consistency=True checks for consistency between + # functorch vs autograd.functional and is done in run_once_functorch only + run_once_fn(model, inp, task, v, maybe_check_consistency=True) elapsed = [] for it in range(args.num_iters): do_sync() start = time.time() - run_once(model, inp, task, v) + run_once_fn(model, inp, task, v) do_sync() elapsed.append(time.time() - start) @@ -173,6 +259,18 @@ def main(): results[name][task] = (mean.item(), var.item()) print("Results for model {} on task {}: {}s (var: {})".format(name, task, mean, var)) + if has_functorch: + try: + runtimes = run_model(model_getter, args, task, run_once_fn=run_once_functorch) + except RuntimeError as e: + print(f"Failed model using Functorch: {name}, task: {task}, Error message: \n\t", e) + continue + + runtimes = torch.tensor(runtimes) + mean, var = runtimes.mean(), runtimes.var() + results[name][f"functorch {task}"] = (mean.item(), var.item()) + print("Results for model {} on task {} using Functorch: {}s (var: {})".format(name, task, mean, var)) + if args.output: with open(args.output, "w") as f: f.write(to_markdown_table(results)) diff --git 
a/benchmarks/functional_autograd_benchmark/utils.py b/benchmarks/functional_autograd_benchmark/utils.py index c7aeb29d157b..dcf03e7a28d0 100644 --- a/benchmarks/functional_autograd_benchmark/utils.py +++ b/benchmarks/functional_autograd_benchmark/utils.py @@ -101,3 +101,10 @@ def from_markdown_table(data: str) -> TimingResultType: res[model][task] = (float(mean), float(var)) return res + +def check_for_functorch(): + try: + import functorch # noqa: F401 + return True + except ImportError: + return False diff --git a/benchmarks/functional_autograd_benchmark/vision_models.py b/benchmarks/functional_autograd_benchmark/vision_models.py index cd2f84e638a1..4c7c9d5bdd53 100644 --- a/benchmarks/functional_autograd_benchmark/vision_models.py +++ b/benchmarks/functional_autograd_benchmark/vision_models.py @@ -2,13 +2,22 @@ from torch import Tensor import torchvision_models as models -from utils import extract_weights, load_weights, GetterReturnType +from utils import check_for_functorch, extract_weights, load_weights, GetterReturnType from typing import cast +has_functorch = check_for_functorch() + + def get_resnet18(device: torch.device) -> GetterReturnType: N = 32 model = models.resnet18(pretrained=False) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + criterion = torch.nn.CrossEntropyLoss() model.to(device) params, names = extract_weights(model) @@ -29,6 +38,14 @@ def get_fcn_resnet(device: torch.device) -> GetterReturnType: N = 8 criterion = torch.nn.MSELoss() model = models.fcn_resnet50(pretrained=False, pretrained_backbone=False) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + # disable dropout for consistency checking + model.eval() + model.to(device) params, names = extract_weights(model) @@ -56,6 +73,12 @@ def get_detr(device: torch.device) -> GetterReturnType: model = models.DETR(num_classes=num_classes, hidden_dim=hidden_dim, nheads=nheads, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers) + + if has_functorch: + from functorch.experimental import replace_all_batch_norm_modules_ + + replace_all_batch_norm_modules_(model) + losses = ['labels', 'boxes', 'cardinality'] eos_coef = 0.1 bbox_loss_coef = 5 @@ -74,9 +97,9 @@ def get_detr(device: torch.device) -> GetterReturnType: for idx in range(N): targets = {} n_targets: int = int(torch.randint(5, 10, size=tuple()).item()) - label = torch.randint(5, 10, size=(n_targets,)) + label = torch.randint(5, 10, size=(n_targets,), device=device) targets["labels"] = label - boxes = torch.randint(100, 800, size=(n_targets, 4)) + boxes = torch.randint(100, 800, size=(n_targets, 4), device=device) for t in range(n_targets): if boxes[t, 0] > boxes[t, 2]: boxes[t, 0], boxes[t, 2] = boxes[t, 2], boxes[t, 0] diff --git a/benchmarks/instruction_counts/core/expand.py b/benchmarks/instruction_counts/core/expand.py index 6e882f3a52cb..f6713ee65cb9 100644 --- a/benchmarks/instruction_counts/core/expand.py +++ b/benchmarks/instruction_counts/core/expand.py @@ -8,7 +8,7 @@ import os import re import textwrap -from typing import cast, List, Optional, Tuple, TYPE_CHECKING +from typing import List, Optional, Tuple, TYPE_CHECKING import uuid import torch @@ -63,15 +63,12 @@ def _generate_torchscript_file(model_src: str, name: str) -> Optional[str]: # Import magic to actually load our function. 
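For orientation, the functorch task wrappers added in functional_autograd_benchmark.py above are thin adapters over ft.vjp, ft.jvp, and related transforms. A small self-contained sketch of that calling pattern (it assumes functorch is installed; the toy model is illustrative, not one of the benchmark models):

```python
import torch
import functorch as ft

def model(x):
    return (x.sin() * 3.0).sum()

x = torch.randn(5)

# vjp: forward pass plus a function mapping an output cotangent to input grads.
out, vjp_fn = ft.vjp(model, x)
(grad_x,) = vjp_fn(torch.ones_like(out))

# jvp: forward pass plus the directional derivative along a tangent.
tangent = torch.randn(5)
out2, jvp_out = ft.jvp(model, (x,), (tangent,))
```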
module_spec = importlib.util.spec_from_file_location(f"torchscript__{name}", module_path) + assert module_spec is not None module = importlib.util.module_from_spec(module_spec) loader = module_spec.loader assert loader is not None - # Module.loader has type Optional[_Loader]. Even when we assert loader is - # not None and MyPy narrows it to type _Loader, it will not pass type - # checks. So we have to use a cast to tell MyPy that _Loader implements - # importlib.abc.Loader. - cast(importlib.abc.Loader, loader).exec_module(module) + loader.exec_module(module) # And again, the type checker has no way of knowing that this line is valid. jit_model = module.jit_model # type: ignore[attr-defined] diff --git a/benchmarks/operator_benchmark/README.md b/benchmarks/operator_benchmark/README.md index 9efa4a8c22bc..59918f6fab3c 100644 --- a/benchmarks/operator_benchmark/README.md +++ b/benchmarks/operator_benchmark/README.md @@ -136,7 +136,7 @@ $ python -m benchmark_all_test --list_tests Filter and run an operator (use add as an example): ``` -$ python -m benchmark_all_test --operator add --omp_num_threads 1 --mkl_num_threads 1 +$ python -m benchmark_all_test --operators add --omp_num_threads 1 --mkl_num_threads 1 ``` Note: this filter is based on the operator name rather than the file name. diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py index 4248e4776f22..16a66d5cf92b 100644 --- a/benchmarks/operator_benchmark/benchmark_core.py +++ b/benchmarks/operator_benchmark/benchmark_core.py @@ -200,8 +200,8 @@ def _print_header(self): print("# {}".format(self.args.operators)) def _print_perf_result(self, reported_run_time_us, test_case): - if self.args.ai_pep_format: - # Output for AI-PEP + if self.args.report_aibench: + # Output for AIBench # Print out per iteration execution time instead of avg time return test_name = '_'.join([test_case.framework, test_case.test_config.test_name]) @@ -288,7 +288,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): report_run_time = 1e6 * run_time_sec / iters time_trace.append(report_run_time) # Print out the time spent in each epoch in ms - if self.args.ai_pep_format: + if self.args.report_aibench: mode = "JIT" if self.use_jit else "Eager" test_name = '_'.join([test_case.framework, test_case.test_config.test_name, mode]) print("PyTorchObserver " + json.dumps( diff --git a/benchmarks/operator_benchmark/benchmark_runner.py b/benchmarks/operator_benchmark/benchmark_runner.py index b9347364428e..3e998e6ceb4e 100644 --- a/benchmarks/operator_benchmark/benchmark_runner.py +++ b/benchmarks/operator_benchmark/benchmark_runner.py @@ -89,12 +89,12 @@ def parse_args(): ) parser.add_argument( - "--ai_pep_format", + "--report_aibench", type=benchmark_utils.str2bool, nargs='?', const=True, default=False, - help="Print result when running on AI-PEP" + help="Print result when running on AIBench" ) parser.add_argument( diff --git a/benchmarks/operator_benchmark/pt/qinterpolate_test.py b/benchmarks/operator_benchmark/pt/qinterpolate_test.py index ec58e6e6a7dd..764274f92581 100644 --- a/benchmarks/operator_benchmark/pt/qinterpolate_test.py +++ b/benchmarks/operator_benchmark/pt/qinterpolate_test.py @@ -44,7 +44,7 @@ def init(self, M, N, K, dtype, mode, scale, contig): zero_point=zero_point, dtype=dtype) if not contig: - permute_dims = list(range(q_input.ndim))[::-1] + permute_dims = list(range(self.q_input.ndim))[::-1] self.q_input = self.q_input.permute(permute_dims) self.inputs = { diff --git 
a/benchmarks/static_runtime/CMakeLists.txt b/benchmarks/static_runtime/CMakeLists.txt index d248fe2a5573..1fba02566771 100644 --- a/benchmarks/static_runtime/CMakeLists.txt +++ b/benchmarks/static_runtime/CMakeLists.txt @@ -6,4 +6,5 @@ list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.cc) list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_runtime.cc) list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_module.cc) +list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_generated_ops.cc) set(STATIC_RUNTIME_TEST_SRCS ${STATIC_RUNTIME_TEST_SRCS} PARENT_SCOPE) diff --git a/benchmarks/static_runtime/deep_wide_pt.h b/benchmarks/static_runtime/deep_wide_pt.h index 73a943146f24..5b18c96364ba 100644 --- a/benchmarks/static_runtime/deep_wide_pt.h +++ b/benchmarks/static_runtime/deep_wide_pt.h @@ -60,7 +60,7 @@ struct DeepAndWideFast : torch::nn::Module { auto dp_unflatten = at::cpu::bmm(ad_emb_packed, user_emb_t); // auto dp = at::native::flatten(dp_unflatten, 1); auto dp = dp_unflatten.view({dp_unflatten.size(0), 1}); - auto input = at::native::_cat_cpu({dp, wide_preproc}, 1); + auto input = at::cpu::cat({dp, wide_preproc}, 1); // fc1 = torch::nn::functional::linear(input, fc_w_, fc_b_); fc_w_t_ = torch::t(fc_w_); @@ -114,7 +114,7 @@ struct DeepAndWideFast : torch::nn::Module { // Potential optimization: we can replace cat with carefully constructed // tensor views on the output that are passed to the _out ops above. - at::native::_cat_out_cpu( + at::cpu::cat_outf( {prealloc_tensors[5], prealloc_tensors[2]}, 1, prealloc_tensors[6]); at::cpu::addmm_out( prealloc_tensors[7], fc_b_, prealloc_tensors[6], fc_w_t_, 1, 1); diff --git a/benchmarks/static_runtime/test_cpu_fusion.cc b/benchmarks/static_runtime/test_cpu_fusion.cc new file mode 100644 index 000000000000..82f11a9ec5db --- /dev/null +++ b/benchmarks/static_runtime/test_cpu_fusion.cc @@ -0,0 +1,138 @@ +#include +#include +#include +#include + +#include "test_utils.h" + +using namespace torch; +using namespace torch::jit; +using namespace torch::jit::test; + +TEST(CpuFusion, Simple) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. + opts.enable_tensorexpr_fusion = true; + + auto input1 = at::randn({2, 3}); + auto input2 = at::ones({2, 3}); + + auto smodule = StaticModule(m, /* is_frozen */ false, opts, {input1, input2}); + StaticRuntime runtime(smodule); + + // Test with sample inputs + { + auto actual = runtime({input1, input2}, {}); + auto expect = at::tanh(at::relu(input1 + input2)); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + + // Test with different inputs + { + auto new_input1 = at::randn({5, 14}); + auto new_input2 = at::randn({5, 14}); + auto actual = runtime({new_input1, new_input2}, {}); + auto expect = at::tanh(at::relu(new_input1 + new_input2)); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } +} + +TEST(CpuFusion, FallbackGraph) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. 
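The CpuFusion tests above all exercise the same tiny TorchScript graph, (a + b).relu().tanh(), through StaticRuntime with TensorExpr fusion enabled and compare against ATen. A Python-side sketch of just that reference computation (the static-runtime and fusion plumbing itself is C++-only):

```python
import torch

class M(torch.nn.Module):
    def forward(self, a, b):
        return (a + b).relu().tanh()

scripted = torch.jit.script(M())
a, b = torch.randn(2, 3), torch.ones(2, 3)
expect = torch.tanh(torch.relu(a + b))
assert torch.allclose(scripted(a, b), expect)
```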
+ opts.enable_tensorexpr_fusion = true; + + auto sample_input1 = at::randn({2, 3}); + auto sample_input2 = at::ones({2, 3}); + auto smodule = StaticModule( + m, /* is_frozen */ false, opts, {sample_input1, sample_input2}); + + StaticRuntime runtime(smodule); + + // The sample inputs above were contiguous. Now, use a strided input + // to trigger running the fallback graph. + { + auto input1 = at::narrow(at::randn({2, 6}), 1, 0, 3); + auto input2 = at::ones({2, 3}); + auto expect = at::tanh(at::relu(input1 + input2)); + auto actual = runtime({input1, input2}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + + // Test with strided inputs of different size. + { + auto input1 = at::narrow(at::randn({10, 30}), 1, 0, 25); + auto input2 = at::randn({10, 25}); + auto expect = at::tanh(at::relu(input1 + input2)); + auto actual = runtime({input1, input2}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } +} + +TEST(CpuFusion, ParallelRuntimes) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. + opts.enable_tensorexpr_fusion = true; + + auto sample_input1 = at::randn({2, 3}); + auto sample_input2 = at::ones({2, 3}); + auto smodule = StaticModule( + m, /* is_frozen */ false, opts, {sample_input1, sample_input2}); + + constexpr size_t kNumThreads = 2; + std::vector>> all_inputs; + for (size_t id = 0; id < kNumThreads; ++id) { + std::vector> thread_input = { + {id, id + 1}, + {id + 10, id + 11}, + {id + 20, id + 21}, + {id + 30, id + 31}, + {id + 40, id + 41}, + {id + 50, id + 51}, + {id + 60, id + 61}, + {id + 70, id + 71}}; + all_inputs.emplace_back(std::move(thread_input)); + } + + auto exec_runtime = [&](size_t tid) { + const auto& inputs = all_inputs[tid]; + StaticRuntime runtime(smodule); + for (const auto& inp : inputs) { + auto a = at::randn({inp.first, inp.second}); + auto b = at::randn({inp.first, inp.second}); + auto expect = at::tanh(at::relu(a + b)); + auto actual = runtime({a, b}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + }; + + std::vector threads; + for (size_t id = 0; id < kNumThreads; ++id) { + threads.emplace_back(exec_runtime, id); + } + + for (auto& t : threads) { + t.join(); + } +} diff --git a/benchmarks/static_runtime/test_generated_ops.cc b/benchmarks/static_runtime/test_generated_ops.cc new file mode 100644 index 000000000000..3011a3abbe05 --- /dev/null +++ b/benchmarks/static_runtime/test_generated_ops.cc @@ -0,0 +1,7864 @@ +// @lint-ignore-every CLANGTIDY HOWTOEVEN +#include +#include +#include + +#include "test_utils.h" + +using namespace caffe2; +using namespace torch; +using namespace torch::jit; +using namespace torch::jit::test; +using c10::IValue; + +TEST(StaticRuntime, autogen_absolute) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::absolute(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_angle) { + const std::string script = R"IR( + graph(%self: Tensor): + 
%bias: None = prim::Constant() + %ret = aten::angle(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sgn) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sgn(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_acos) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::acos(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arccos) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arccos(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__add_relu_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::_add_relu(%self, %other, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + std::vector args{self0, other0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + std::vector args2{self1, other1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addmv) { + const std::string script = R"IR( + graph(%self: Tensor, %mat: Tensor, %vec: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::addmv(%self, %mat, %vec, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2}); + auto mat0 = at::rand({2, 2}); + auto vec0 = at::rand({2}); + 
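Each of these generated tests follows the same pattern: build a one-op IR graph that clones its result, then run it through testStaticRuntime twice, once with small inputs and once with larger ones so the output-resizing path is exercised. As an eager-mode sketch of what the aten::addmv graph in the surrounding test computes (illustrative only, not the test harness):

```python
import torch

def addmv_graph(self_t, mat, vec, beta, alpha):
    # Mirrors the IR: ret = aten::addmv(...); cloned = aten::clone(ret)
    ret = torch.addmv(self_t, mat, vec, beta=beta, alpha=alpha)
    return ret.clone()

out_small = addmv_graph(torch.rand(2), torch.rand(2, 2), torch.rand(2), 2, 2)
out_large = addmv_graph(torch.rand(35), torch.rand(35, 35), torch.rand(35), 2, 2)
```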
auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, mat0, vec0, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({35}); + auto mat1 = at::rand({35, 35}); + auto vec1 = at::rand({35}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, mat1, vec1, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addr) { + const std::string script = R"IR( + graph(%self: Tensor, %vec1: Tensor, %vec2: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::addr(%self, %vec1, %vec2, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto vec10 = at::rand({6}); + auto vec20 = at::rand({6}); + auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, vec10, vec20, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22}); + auto vec11 = at::rand({22}); + auto vec21 = at::rand({22}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, vec11, vec21, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_argmax) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int?, %keepdim: bool): + %bias: None = prim::Constant() + %ret = aten::argmax(%self, %dim, %keepdim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto keepdim0 = false; + std::vector args{self0, dim0, keepdim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto keepdim1 = false; + std::vector args2{self1, dim1, keepdim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_acosh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::acosh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2, 2, 2}) + at::ones({2, 2, 2}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({5, 5, 5}) + at::ones({5, 5, 5}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_asinh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::asinh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arcsinh) { + const 
std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arcsinh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_atanh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::atanh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arctanh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arctanh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_asin) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::asin(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arcsin) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arcsin(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_atan) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::atan(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + 
/*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arctan) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::arctan(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_baddbmm) { + const std::string script = R"IR( + graph(%self: Tensor, %batch1: Tensor, %batch2: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::baddbmm(%self, %batch1, %batch2, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto batch10 = at::rand({6, 6, 6}); + auto batch20 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, batch10, batch20, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto batch11 = at::rand({22, 22, 22}); + auto batch21 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, batch11, batch21, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_not) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_not(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_copysign_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::copysign(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_not) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_not(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 
22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_xor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_xor(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_and) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_and(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logical_or) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logical_or(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ceil) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::ceil(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_clamp_max) { + const std::string script = R"IR( + graph(%self: Tensor, %max: int): + %bias: None = prim::Constant() + %ret = aten::clamp_max(%self, %max) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto max0 = 2; + std::vector args{self0, max0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto max1 = 2; + std::vector args2{self1, max1}; + 
testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_clip) { + const std::string script = R"IR( + graph(%self: Tensor, %min: int?, %max: int?): + %bias: None = prim::Constant() + %ret = aten::clip(%self, %min, %max) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto min0 = 2; + auto max0 = 2; + std::vector args{self0, min0, max0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto min1 = 2; + auto max1 = 2; + std::vector args2{self1, min1, max1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_complex) { + const std::string script = R"IR( + graph(%real: Tensor, %imag: Tensor): + %bias: None = prim::Constant() + %ret = aten::complex(%real, %imag) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto real0 = at::rand({6, 6, 6}); + auto imag0 = at::rand({6, 6, 6}); + std::vector args{real0, imag0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto real1 = at::rand({22, 22, 22}); + auto imag1 = at::rand({22, 22, 22}); + std::vector args2{real1, imag1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_polar) { + const std::string script = R"IR( + graph(%abs: Tensor, %angle: Tensor): + %bias: None = prim::Constant() + %ret = aten::polar(%abs, %angle) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto abs0 = at::rand({6, 6, 6}); + auto angle0 = at::rand({6, 6, 6}); + std::vector args{abs0, angle0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto abs1 = at::rand({22, 22, 22}); + auto angle1 = at::rand({22, 22, 22}); + std::vector args2{abs1, angle1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cos) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::cos(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cosh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::cosh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, 
autogen_cumprod) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %dtype: int?): + %bias: None = prim::Constant() + %ret = aten::cumprod(%self, %dim, %dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto dtype0 = at::ScalarType::Float; + std::vector args{self0, dim0, dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto dtype1 = at::ScalarType::Float; + std::vector args2{self1, dim1, dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_diff) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int, %dim: int, %prepend: Tensor?, %append: Tensor?): + %bias: None = prim::Constant() + %ret = aten::diff(%self, %n, %dim, %prepend, %append) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto prepend0 = at::rand({6, 6, 6}); + auto append0 = at::rand({6, 6, 6}); + std::vector args{self0, n0, dim0, prepend0, append0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto prepend1 = at::rand({22, 22, 22}); + auto append1 = at::rand({22, 22, 22}); + std::vector args2{self1, n1, dim1, prepend1, append1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_divide_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::divide(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_true_divide_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::true_divide(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_dot) { + const std::string script = R"IR( + graph(%self: Tensor, %tensor: Tensor): + %bias: None = prim::Constant() + %ret = aten::dot(%self, %tensor) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({16}); + auto tensor0 = at::rand({16}); + std::vector 
args{self0, tensor0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({64}); + auto tensor1 = at::rand({64}); + std::vector args2{self1, tensor1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_vdot) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::vdot(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({16}); + auto other0 = at::rand({16}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({64}); + auto other1 = at::rand({64}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_erf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::erf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_erfc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::erfc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_exp) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::exp(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_exp2) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::exp2(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_expm1) { + 
const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::expm1(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_floor) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::floor(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_frac) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::frac(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gcd) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::gcd(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lcm) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::lcm(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_index_copy) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %source: Tensor): + 
%bias: None = prim::Constant() + %ret = aten::index_copy(%self, %dim, %index, %source) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2}); + auto dim0 = 0; + auto index0 = at::randint(0, 1, {2}, at::kLong); + auto source0 = at::rand({2}); + std::vector args{self0, dim0, index0, source0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32}); + auto dim1 = 0; + auto index1 = at::randint(0, 10, {32}, at::kLong); + auto source1 = at::rand({32}); + std::vector args2{self1, dim1, index1, source1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_inverse) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::inverse(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isin_Tensor_Tensor) { + const std::string script = R"IR( + graph(%elements: Tensor, %test_elements: Tensor, %assume_unique: bool, %invert: bool): + %bias: None = prim::Constant() + %ret = aten::isin(%elements, %test_elements, %assume_unique, %invert) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto elements0 = at::rand({6, 6, 6}); + auto test_elements0 = at::rand({6, 6, 6}); + auto assume_unique0 = false; + auto invert0 = false; + std::vector args{elements0, test_elements0, assume_unique0, invert0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto elements1 = at::rand({22, 22, 22}); + auto test_elements1 = at::rand({22, 22, 22}); + auto assume_unique1 = false; + auto invert1 = false; + std::vector args2{elements1, test_elements1, assume_unique1, invert1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isin_Tensor_Scalar) { + const std::string script = R"IR( + graph(%elements: Tensor, %test_element: int, %assume_unique: bool, %invert: bool): + %bias: None = prim::Constant() + %ret = aten::isin(%elements, %test_element, %assume_unique, %invert) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto elements0 = at::rand({6, 6, 6}); + auto test_element0 = 2; + auto assume_unique0 = false; + auto invert0 = false; + std::vector args{elements0, test_element0, assume_unique0, invert0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto elements1 = at::rand({22, 22, 22}); + auto test_element1 = 2; + auto assume_unique1 = false; + auto invert1 = false; + std::vector args2{elements1, test_element1, assume_unique1, invert1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isin_Scalar_Tensor) { + const std::string script = R"IR( + graph(%element: int, %test_elements: Tensor, %assume_unique: 
bool, %invert: bool): + %bias: None = prim::Constant() + %ret = aten::isin(%element, %test_elements, %assume_unique, %invert) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto element0 = 2; + auto test_elements0 = at::rand({6, 6, 6}); + auto assume_unique0 = false; + auto invert0 = false; + std::vector args{element0, test_elements0, assume_unique0, invert0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto element1 = 2; + auto test_elements1 = at::rand({22, 22, 22}); + auto assume_unique1 = false; + auto invert1 = false; + std::vector args2{element1, test_elements1, assume_unique1, invert1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_kron) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::kron(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ldexp_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::ldexp(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log10) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log10(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log1p) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log1p(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log2) { 
+ const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log2(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logaddexp) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logaddexp(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logaddexp2) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::logaddexp2(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_xlogy_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::xlogy(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__log_softmax) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %half_to_float: bool): + %bias: None = prim::Constant() + %ret = aten::_log_softmax(%self, %dim, %half_to_float) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto half_to_float0 = false; + std::vector args{self0, dim0, half_to_float0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto half_to_float1 = false; + std::vector args2{self1, dim1, half_to_float1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__log_softmax_backward_data) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor, %dim: int, %input_dtype: int): + %bias: None = prim::Constant() + %ret = aten::_log_softmax_backward_data(%grad_output, %output, %dim, %input_dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto input_dtype0 = at::ScalarType::Float; + std::vector args{grad_output0, output0, dim0, input_dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto input_dtype1 = at::ScalarType::Float; + std::vector args2{grad_output1, output1, dim1, input_dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__logcumsumexp) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::_logcumsumexp(%self, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + std::vector args{self0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + std::vector args2{self1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_logcumsumexp) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::logcumsumexp(%self, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + std::vector args{self0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + std::vector args2{self1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_matrix_power) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int): + %bias: None = prim::Constant() + %ret = aten::matrix_power(%self, %n) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + std::vector args{self0, n0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + std::vector args2{self1, n1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mm) { + const std::string script = R"IR( + graph(%self: Tensor, %mat2: Tensor): + %bias: None = prim::Constant() + %ret = aten::mm(%self, %mat2) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto mat20 = at::rand({8, 8}); + std::vector args{self0, mat20}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32, 32}); + auto mat21 = at::rand({32, 32}); + std::vector args2{self1, mat21}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_multiply_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::multiply(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mv) { + const std::string script = R"IR( + graph(%self: Tensor, %vec: Tensor): + %bias: None = prim::Constant() + %ret = aten::mv(%self, %vec) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto vec0 = at::rand({6}); + std::vector args{self0, vec0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22}); + auto vec1 = at::rand({22}); + std::vector args2{self1, vec1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mvlgamma) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int): + %bias: None = prim::Constant() + %ret = aten::mvlgamma(%self, %p) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 1; + std::vector args{self0, p0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 1; + std::vector args2{self1, p1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_rad2deg) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::rad2deg(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_deg2rad) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::deg2rad(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, 
+ /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_reciprocal) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::reciprocal(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_neg) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::neg(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_negative) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::negative(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_round) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::round(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_round_decimals) { + const std::string script = R"IR( + graph(%self: Tensor, %decimals: int): + %bias: None = prim::Constant() + %ret = aten::round(%self, %decimals) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto decimals0 = 1; + std::vector args{self0, decimals0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto decimals1 = 1; + std::vector args2{self1, decimals1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gelu) { + const std::string script = R"IR( + graph(%self: Tensor, %approximate: str): + %bias: None = prim::Constant() + %ret = aten::gelu(%self, %approximate) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto approximate0 = "tanh"; + 
std::vector args{self0, approximate0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto approximate1 = "tanh"; + std::vector args2{self1, approximate1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gelu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %approximate: str): + %bias: None = prim::Constant() + %ret = aten::gelu_backward(%grad_output, %self, %approximate) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto approximate0 = "tanh"; + std::vector args{grad_output0, self0, approximate0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto approximate1 = "tanh"; + std::vector args2{grad_output1, self1, approximate1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardshrink) { + const std::string script = R"IR( + graph(%self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::hardshrink(%self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardshrink_backward) { + const std::string script = R"IR( + graph(%grad_out: Tensor, %self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::hardshrink_backward(%grad_out, %self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_out0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{grad_out0, self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_out1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{grad_out1, self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_rsqrt) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::rsqrt(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_silu) { + const std::string script = R"IR( + graph(%self: Tensor): 
+ %bias: None = prim::Constant() + %ret = aten::silu(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_silu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::silu_backward(%grad_output, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mish) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::mish(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sin) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sin(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sinc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sinc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sinh) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sinh(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 
22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__softmax) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %half_to_float: bool): + %bias: None = prim::Constant() + %ret = aten::_softmax(%self, %dim, %half_to_float) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto half_to_float0 = false; + std::vector args{self0, dim0, half_to_float0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto half_to_float1 = false; + std::vector args2{self1, dim1, half_to_float1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__softmax_backward_data) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor, %dim: int, %input_dtype: int): + %bias: None = prim::Constant() + %ret = aten::_softmax_backward_data(%grad_output, %output, %dim, %input_dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto input_dtype0 = at::ScalarType::Float; + std::vector args{grad_output0, output0, dim0, input_dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto input_dtype1 = at::ScalarType::Float; + std::vector args2{grad_output1, output1, dim1, input_dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sqrt) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::sqrt(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_square) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::square(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_prod_dim_int) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %keepdim: bool, %dtype: int?): + %bias: None = prim::Constant() + %ret = aten::prod(%self, %dim, %keepdim, %dtype) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); 
+ auto dim0 = 1; + auto keepdim0 = false; + auto dtype0 = at::ScalarType::Float; + std::vector args{self0, dim0, keepdim0, dtype0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto keepdim1 = false; + auto dtype1 = at::ScalarType::Float; + std::vector args2{self1, dim1, keepdim1, dtype1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_tan) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::tan(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_threshold) { + const std::string script = R"IR( + graph(%self: Tensor, %threshold: int, %value: int): + %bias: None = prim::Constant() + %ret = aten::threshold(%self, %threshold, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto threshold0 = 2; + auto value0 = 2; + std::vector args{self0, threshold0, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto threshold1 = 2; + auto value1 = 2; + std::vector args2{self1, threshold1, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_threshold_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %threshold: int): + %bias: None = prim::Constant() + %ret = aten::threshold_backward(%grad_output, %self, %threshold) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto threshold0 = 2; + std::vector args{grad_output0, self0, threshold0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto threshold1 = 2; + std::vector args2{grad_output1, self1, threshold1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_trunc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::trunc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fix) { + const std::string script = R"IR( + graph(%self: Tensor): + 
%bias: None = prim::Constant() + %ret = aten::fix(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nuclear_norm) { + const std::string script = R"IR( + graph(%self: Tensor, %keepdim: bool): + %bias: None = prim::Constant() + %ret = aten::nuclear_norm(%self, %keepdim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto keepdim0 = false; + std::vector args{self0, keepdim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({32, 32}); + auto keepdim1 = false; + std::vector args2{self1, keepdim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_subtract_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::subtract(%self, %other, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + std::vector args{self0, other0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + std::vector args2{self1, other1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_heaviside) { + const std::string script = R"IR( + graph(%self: Tensor, %values: Tensor): + %bias: None = prim::Constant() + %ret = aten::heaviside(%self, %values) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto values0 = at::rand({6, 6, 6}); + std::vector args{self0, values0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto values1 = at::rand({22, 22, 22}); + std::vector args2{self1, values1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__addmm_activation) { + const std::string script = R"IR( + graph(%self: Tensor, %mat1: Tensor, %mat2: Tensor, %beta: int, %alpha: int, %use_gelu: bool): + %bias: None = prim::Constant() + %ret = aten::_addmm_activation(%self, %mat1, %mat2, %beta, %alpha, %use_gelu) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto mat10 = at::rand({8, 8}); + auto mat20 = at::rand({8, 8}); + auto beta0 = 2; + auto alpha0 = 2; + auto use_gelu0 = false; + std::vector args{self0, mat10, mat20, beta0, alpha0, use_gelu0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32, 32}); + auto mat11 = 
at::rand({32, 32}); + auto mat21 = at::rand({32, 32}); + auto beta1 = 2; + auto alpha1 = 2; + auto use_gelu1 = false; + std::vector args2{self1, mat11, mat21, beta1, alpha1, use_gelu1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_index_add) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %source: Tensor, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::index_add(%self, %dim, %index, %source, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2}); + auto dim0 = 0; + auto index0 = at::randint(0, 1, {2}, at::kInt); + auto source0 = at::rand({2}); + auto alpha0 = 2; + std::vector args{self0, dim0, index0, source0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({16}); + auto dim1 = 0; + auto index1 = at::randint(0, 10, {16}, at::kInt); + auto source1 = at::rand({16}); + auto alpha1 = 2; + std::vector args2{self1, dim1, index1, source1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_scatter_src) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %src: Tensor): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %src) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, src0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, src1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_value) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %value: int): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto value0 = 2; + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto value1 = 2; + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_reduce) { + const std::string script = R"IR( + graph(%self: Tensor, 
%dim: int, %index: Tensor, %src: Tensor, %reduce: str): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %src, %reduce) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto reduce0 = "add"; + std::vector args{self0, dim0, index0, src0, reduce0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto reduce1 = "add"; + std::vector args2{self1, dim1, index1, src1, reduce1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_value_reduce) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %value: int, %reduce: str): + %bias: None = prim::Constant() + %ret = aten::scatter(%self, %dim, %index, %value, %reduce) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto value0 = 2; + auto reduce0 = "add"; + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, value0, reduce0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto value1 = 2; + auto reduce1 = "add"; + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, value1, reduce1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_add) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %src: Tensor): + %bias: None = prim::Constant() + %ret = aten::scatter_add(%self, %dim, %index, %src) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto src0 = at::randint(1, 100, {2, 2, 2}, torch::kInt64); + std::vector args{self0, dim0, index0, src0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + auto dim1 = 1; + auto index1 = at::randint(0, 1, {5, 5, 5}, torch::kInt64); + auto src1 = at::randint(1, 100, {5, 5, 5}, torch::kInt64); + std::vector args2{self1, dim1, index1, src1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_scatter_reduce_two) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %src: Tensor, %reduce: str, %include_self: bool): + %bias: None = prim::Constant() + %ret = aten::scatter_reduce(%self, %dim, %index, %src, %reduce, 
%include_self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + auto index0 = at::randint(6, {6, 6, 6}, torch::kInt64); + auto src0 = at::rand({6, 6, 6}); + auto reduce0 = "mean"; + auto include_self0 = false; + std::vector args{self0, dim0, index0, src0, reduce0, include_self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + auto index1 = at::randint(22, {22, 22, 22}, torch::kInt64); + auto src1 = at::rand({22, 22, 22}); + auto reduce1 = "mean"; + auto include_self1 = false; + std::vector args2{self1, dim1, index1, src1, reduce1, include_self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_eq_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::eq(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_eq_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::eq(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_and_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_and(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_or_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_or(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + 
testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_xor_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_xor(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + auto other0 = at::randint(1, 100, {6, 6, 6}, at::kInt); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + auto other1 = at::randint(1, 100, {22, 22, 22}, at::kInt); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_left_shift_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_left_shift(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_bitwise_right_shift_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::bitwise_right_shift(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_tril) { + const std::string script = R"IR( + graph(%self: Tensor, %diagonal: int): + %bias: None = prim::Constant() + %ret = aten::tril(%self, %diagonal) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto diagonal0 = 1; + std::vector args{self0, diagonal0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto diagonal1 = 1; + std::vector args2{self1, diagonal1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_triu) { + const std::string script = R"IR( + graph(%self: 
Tensor, %diagonal: int): + %bias: None = prim::Constant() + %ret = aten::triu(%self, %diagonal) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto diagonal0 = 1; + std::vector args{self0, diagonal0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto diagonal1 = 1; + std::vector args2{self1, diagonal1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_digamma) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::digamma(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lerp_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %end: Tensor, %weight: int): + %bias: None = prim::Constant() + %ret = aten::lerp(%self, %end, %weight) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto end0 = at::rand({6, 6, 6}); + auto weight0 = 2; + std::vector args{self0, end0, weight0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto end1 = at::rand({22, 22, 22}); + auto weight1 = 2; + std::vector args2{self1, end1, weight1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lerp_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %end: Tensor, %weight: Tensor): + %bias: None = prim::Constant() + %ret = aten::lerp(%self, %end, %weight) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto end0 = at::rand({6, 6, 6}); + auto weight0 = at::rand({6, 6, 6}); + std::vector args{self0, end0, weight0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto end1 = at::rand({22, 22, 22}); + auto weight1 = at::rand({22, 22, 22}); + std::vector args2{self1, end1, weight1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addbmm) { + const std::string script = R"IR( + graph(%self: Tensor, %batch1: Tensor, %batch2: Tensor, %beta: int, %alpha: int): + %bias: None = prim::Constant() + %ret = aten::addbmm(%self, %batch1, %batch2, %beta, %alpha) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto batch10 = at::rand({6, 6, 6}); + auto batch20 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto alpha0 = 2; + std::vector args{self0, batch10, batch20, beta0, alpha0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 
22}); + auto batch11 = at::rand({22, 22, 22}); + auto batch21 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto alpha1 = 2; + std::vector args2{self1, batch11, batch21, beta1, alpha1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_diag) { + const std::string script = R"IR( + graph(%self: Tensor, %diagonal: int): + %bias: None = prim::Constant() + %ret = aten::diag(%self, %diagonal) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({8, 8}); + auto diagonal0 = 1; + std::vector args{self0, diagonal0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({32, 32}); + auto diagonal1 = 1; + std::vector args2{self1, diagonal1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cross) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %dim: int?): + %bias: None = prim::Constant() + %ret = aten::cross(%self, %other, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({3, 3, 3}); + auto other0 = at::rand({3, 3, 3}); + auto dim0 = 1; + std::vector args{self0, other0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 3, 22}); + auto other1 = at::rand({22, 3, 22}); + auto dim1 = 1; + std::vector args2{self1, other1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ne_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::ne(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ne_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::ne(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ge_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::ge(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + 
testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ge_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::ge(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_le_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::le(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_le_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::le(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gt_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::gt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gt_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::gt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( 
+ script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lt_Scalar) { + const std::string script = R"IR( + graph(%self: Tensor, %other: int): + %bias: None = prim::Constant() + %ret = aten::lt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 2; + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = 2; + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lt_Tensor) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::lt(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_take) { + const std::string script = R"IR( + graph(%self: Tensor, %index: Tensor): + %bias: None = prim::Constant() + %ret = aten::take(%self, %index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto index0 = at::randint(0, 216, {20}, torch::kInt64); + std::vector args{self0, index0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto index1 = at::randint(0, 1000, {100}, torch::kInt64); + std::vector args2{self1, index1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_take_along_dim) { + const std::string script = R"IR( + graph(%self: Tensor, %indices: Tensor, %dim: int?): + %bias: None = prim::Constant() + %ret = aten::take_along_dim(%self, %indices, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto indices0 = at::argsort(self0, 1); + auto dim0 = 1; + std::vector args{self0, indices0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto indices1 = at::argsort(self1, 1); + auto dim1 = 1; + std::vector args2{self1, indices1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_masked_select) { + const std::string script = R"IR( + graph(%self: Tensor, %mask: Tensor): + %bias: None = prim::Constant() + %ret = aten::masked_select(%self, %mask) + 
%cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto mask0 = at::randn({6, 6, 6}) > 0.5; + std::vector args{self0, mask0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto mask1 = at::rand({22, 22, 22}) > 0.5; + std::vector args2{self1, mask1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_gather) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor, %sparse_grad: bool): + %bias: None = prim::Constant() + %ret = aten::gather(%self, %dim, %index, %sparse_grad) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(1, 100, {2, 2, 2}, at::kInt); + auto dim0 = 1; + auto index0 = at::randint(0, 1, {2, 2, 2}, torch::kInt64); + auto sparse_grad0 = false; + std::vector args{self0, dim0, index0, sparse_grad0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(1, 100, {5, 5, 5}, at::kInt); + auto dim1 = 1; + auto index1 = at::randint(0, 4, {5, 5, 5}, torch::kInt64); + auto sparse_grad1 = false; + std::vector args2{self1, dim1, index1, sparse_grad1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addcmul) { + const std::string script = R"IR( + graph(%self: Tensor, %tensor1: Tensor, %tensor2: Tensor, %value: int): + %bias: None = prim::Constant() + %ret = aten::addcmul(%self, %tensor1, %tensor2, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto tensor10 = at::rand({6, 6, 6}); + auto tensor20 = at::rand({6, 6, 6}); + auto value0 = 2; + std::vector args{self0, tensor10, tensor20, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto tensor11 = at::rand({22, 22, 22}); + auto tensor21 = at::rand({22, 22, 22}); + auto value1 = 2; + std::vector args2{self1, tensor11, tensor21, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_addcdiv) { + const std::string script = R"IR( + graph(%self: Tensor, %tensor1: Tensor, %tensor2: Tensor, %value: int): + %bias: None = prim::Constant() + %ret = aten::addcdiv(%self, %tensor1, %tensor2, %value) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto tensor10 = at::rand({6, 6, 6}); + auto tensor20 = at::rand({6, 6, 6}); + auto value0 = 2; + std::vector args{self0, tensor10, tensor20, value0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto tensor11 = at::rand({22, 22, 22}); + auto tensor21 = at::rand({22, 22, 22}); + auto value1 = 2; + std::vector args2{self1, tensor11, tensor21, value1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_solve_triangular) { + const std::string script = R"IR( + graph(%self: 
Tensor, %B: Tensor, %upper: bool, %left: bool, %unitriangular: bool): + %bias: None = prim::Constant() + %ret = aten::linalg_solve_triangular(%self, %B, %upper, %left, %unitriangular) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto B0 = at::rand({6, 6, 6}); + auto upper0 = false; + auto left0 = false; + auto unitriangular0 = false; + std::vector args{self0, B0, upper0, left0, unitriangular0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto B1 = at::rand({22, 22, 22}); + auto upper1 = false; + auto left1 = false; + auto unitriangular1 = false; + std::vector args2{self1, B1, upper1, left1, unitriangular1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cholesky_solve) { + const std::string script = R"IR( + graph(%self: Tensor, %input2: Tensor, %upper: bool): + %bias: None = prim::Constant() + %ret = aten::cholesky_solve(%self, %input2, %upper) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto input20 = at::rand({6, 6, 6}); + auto upper0 = false; + std::vector args{self0, input20, upper0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto input21 = at::rand({22, 22, 22}); + auto upper1 = false; + std::vector args2{self1, input21, upper1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_cholesky_inverse) { + const std::string script = R"IR( + graph(%self: Tensor, %upper: bool): + %bias: None = prim::Constant() + %ret = aten::cholesky_inverse(%self, %upper) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto upper0 = false; + std::vector args{self0, upper0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto upper1 = false; + std::vector args2{self1, upper1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_orgqr) { + const std::string script = R"IR( + graph(%self: Tensor, %input2: Tensor): + %bias: None = prim::Constant() + %ret = aten::orgqr(%self, %input2) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto input20 = at::rand({6, 6}); + std::vector args{self0, input20}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto input21 = at::rand({22, 22}); + std::vector args2{self1, input21}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_ormqr) { + const std::string script = R"IR( + graph(%self: Tensor, %input2: Tensor, %input3: Tensor, %left: bool, %transpose: bool): + %bias: None = prim::Constant() + %ret = aten::ormqr(%self, %input2, %input3, %left, %transpose) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = 
at::rand({6, 6, 6}); + auto input20 = at::rand({6, 6}); + auto input30 = at::rand({6, 6, 6}); + auto left0 = false; + auto transpose0 = false; + std::vector args{self0, input20, input30, left0, transpose0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto input21 = at::rand({22, 22}); + auto input31 = at::rand({22, 22, 22}); + auto left1 = false; + auto transpose1 = false; + std::vector args2{self1, input21, input31, left1, transpose1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_lgamma) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::lgamma(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_polygamma) { + const std::string script = R"IR( + graph(%n: int, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::polygamma(%n, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto n0 = 1; + auto self0 = at::rand({6, 6, 6}); + std::vector args{n0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto n1 = 1; + auto self1 = at::rand({22, 22, 22}); + std::vector args2{n1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_erfinv) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::erfinv(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_i0) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::i0(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_signbit) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::signbit(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_atan2) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::atan2(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_arctan2) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::arctan2(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_histc) { + const std::string script = R"IR( + graph(%self: Tensor, %bins: int, %min: int, %max: int): + %bias: None = prim::Constant() + %ret = aten::histc(%self, %bins, %min, %max) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto bins0 = 1; + auto min0 = 2; + auto max0 = 2; + std::vector args{self0, bins0, min0, max0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22, 22}); + auto bins1 = 1; + auto min1 = 2; + auto max1 = 2; + std::vector args2{self1, bins1, min1, max1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_hypot) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::hypot(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_igamma) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::igamma(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = 
at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_igammac) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::igammac(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nextafter) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::nextafter(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fmin) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::fmin(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fmax) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::fmax(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_maximum) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::maximum(%self, %other) + %cloned = 
aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_max_other) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::max(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_minimum) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::minimum(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_min_other) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::min(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_quantile) { + const std::string script = R"IR( + graph(%self: Tensor, %q: Tensor, %dim: int?, %keepdim: bool, %interpolation: str): + %bias: None = prim::Constant() + %ret = aten::quantile(%self, %q, %dim, %keepdim, %interpolation) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto q0 = at::rand({6}); + auto dim0 = 1; + auto keepdim0 = false; + auto interpolation0 = "linear"; + std::vector args{self0, q0, dim0, keepdim0, interpolation0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto q1 = at::rand({22}); + auto dim1 = 1; + auto keepdim1 = false; + auto interpolation1 = "linear"; + std::vector args2{self1, q1, 
dim1, keepdim1, interpolation1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nanquantile) { + const std::string script = R"IR( + graph(%self: Tensor, %q: Tensor, %dim: int?, %keepdim: bool, %interpolation: str): + %bias: None = prim::Constant() + %ret = aten::nanquantile(%self, %q, %dim, %keepdim, %interpolation) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto q0 = at::rand({6}); + auto dim0 = 1; + auto keepdim0 = false; + auto interpolation0 = "linear"; + std::vector args{self0, q0, dim0, keepdim0, interpolation0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto q1 = at::rand({22}); + auto dim1 = 1; + auto keepdim1 = false; + auto interpolation1 = "linear"; + std::vector args2{self1, q1, dim1, keepdim1, interpolation1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_msort) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::msort(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_renorm) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int, %dim: int, %maxnorm: int): + %bias: None = prim::Constant() + %ret = aten::renorm(%self, %p, %dim, %maxnorm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 2; + auto dim0 = 1; + auto maxnorm0 = 2; + std::vector args{self0, p0, dim0, maxnorm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 2; + auto dim1 = 1; + auto maxnorm1 = 2; + std::vector args2{self1, p1, dim1, maxnorm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__convert_indices_from_coo_to_csr) { + const std::string script = R"IR( + graph(%self: Tensor, %size: int, %out_int32: bool): + %bias: None = prim::Constant() + %ret = aten::_convert_indices_from_coo_to_csr(%self, %size, %out_int32) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::randint(0, 3, {2}, at::kInt); + auto size0 = 10; + auto out_int320 = false; + std::vector args{self0, size0, out_int320}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::randint(0, 3, {12}, at::kInt); + auto size1 = 24; + auto out_int321 = false; + std::vector args2{self1, size1, out_int321}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen__convert_indices_from_csr_to_coo) { + const std::string script = R"IR( + 
graph(%crow_indices: Tensor, %col_indices: Tensor, %out_int32: bool, %transpose: bool): + %bias: None = prim::Constant() + %ret = aten::_convert_indices_from_csr_to_coo(%crow_indices, %col_indices, %out_int32, %transpose) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto crow_indices0 = torch::tensor({1}, torch::kInt32); + auto col_indices0 = torch::tensor({0, 1, 0}, torch::kInt32); + auto out_int320 = false; + auto transpose0 = false; + std::vector args{crow_indices0, col_indices0, out_int320, transpose0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto crow_indices1 = torch::tensor({0}, torch::kInt32); + auto col_indices1 = + torch::tensor({0, 1, 0, 2, 1, 2, 0, 1, 0, 2, 1, 2}, torch::kInt32); + auto out_int321 = false; + auto transpose1 = false; + std::vector args2{ + crow_indices1, col_indices1, out_int321, transpose1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_mse_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::mse_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto target0 = at::rand({6, 6, 6}); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto target1 = at::rand({22, 22, 22}); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_l1_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::l1_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto target0 = at::rand({6, 6, 6}); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22, 22}); + auto target1 = at::rand({22, 22, 22}); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_multi_margin_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %p: int, %margin: int, %weight: Tensor?, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::multi_margin_loss(%self, %target, %p, %margin, %weight, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto target0 = at::randint(6, {6}, torch::kInt64); + auto p0 = 2; + auto margin0 = 2; + auto weight0 = at::rand({6}); + auto reduction0 = 1; + std::vector args{self0, target0, p0, margin0, weight0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22}); + auto target1 = at::randint(22, 
{22}, torch::kInt64); + auto p1 = 2; + auto margin1 = 2; + auto weight1 = at::rand({22}); + auto reduction1 = 1; + std::vector args2{self1, target1, p1, margin1, weight1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_multilabel_margin_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::multilabel_margin_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto target0 = at::randint(6, {6, 6}, torch::kInt64); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22}); + auto target1 = at::randint(22, {22, 22}, torch::kInt64); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_nll_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int): + %bias: None = prim::Constant() + %ret = aten::nll_loss(%self, %target, %weight, %reduction, %ignore_index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6}); + auto target0 = at::randint(6, {6}, torch::kInt64); + auto weight0 = at::rand({6}); + auto reduction0 = 1; + auto ignore_index0 = 1; + std::vector args{self0, target0, weight0, reduction0, ignore_index0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22}); + auto target1 = at::randint(22, {22}, torch::kInt64); + auto weight1 = at::rand({22}); + auto reduction1 = 1; + auto ignore_index1 = 1; + std::vector args2{self1, target1, weight1, reduction1, ignore_index1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_nll_loss_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int, %total_weight: Tensor): + %bias: None = prim::Constant() + %ret = aten::nll_loss_backward(%grad_output, %self, %target, %weight, %reduction, %ignore_index, %total_weight) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({}); + auto self0 = at::rand({6}); + auto target0 = at::randint(0, 5, {6}, torch::kInt64); + auto weight0 = at::rand({6}); + auto reduction0 = 1; + auto ignore_index0 = 1; + auto total_weight0 = at::rand({}); + std::vector args{ + grad_output0, + self0, + target0, + weight0, + reduction0, + ignore_index0, + total_weight0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({}); + auto self1 = at::rand({36}); + auto target1 = at::randint(0, 11, {36}, torch::kInt64); + auto weight1 = at::rand({36}); + auto reduction1 = 1; + auto ignore_index1 = 1; + auto total_weight1 = at::rand({}); + std::vector args2{ + grad_output1, + self1, + target1, + weight1, + reduction1, + 
ignore_index1, + total_weight1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_nll_loss2d) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int): + %bias: None = prim::Constant() + %ret = aten::nll_loss2d(%self, %target, %weight, %reduction, %ignore_index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6, 6}); + auto target0 = at::randint(6, {6, 6, 6}, torch::kInt64); + auto weight0 = at::rand({6}); + auto reduction0 = 1; + auto ignore_index0 = 1; + std::vector args{self0, target0, weight0, reduction0, ignore_index0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + + auto self1 = at::rand({22, 22, 22, 22}); + auto target1 = at::randint(22, {22, 22, 22}, torch::kInt64); + auto weight1 = at::rand({22}); + auto reduction1 = 1; + auto ignore_index1 = 1; + std::vector args2{self1, target1, weight1, reduction1, ignore_index1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); +} + +TEST(StaticRuntime, autogen_soft_margin_loss) { + const std::string script = R"IR( + graph(%self: Tensor, %target: Tensor, %reduction: int): + %bias: None = prim::Constant() + %ret = aten::soft_margin_loss(%self, %target, %reduction) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto target0 = at::rand({6, 6, 6}); + auto reduction0 = 1; + std::vector args{self0, target0, reduction0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto target1 = at::rand({22, 22, 22}); + auto reduction1 = 1; + std::vector args2{self1, target1, reduction1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_elu) { + const std::string script = R"IR( + graph(%self: Tensor, %alpha: int, %scale: int, %input_scale: int): + %bias: None = prim::Constant() + %ret = aten::elu(%self, %alpha, %scale, %input_scale) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + auto scale0 = 2; + auto input_scale0 = 2; + std::vector args{self0, alpha0, scale0, input_scale0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + auto scale1 = 2; + auto input_scale1 = 2; + std::vector args2{self1, alpha1, scale1, input_scale1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_elu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %alpha: int, %scale: int, %input_scale: int, %is_result: bool, %self_or_result: Tensor): + %bias: None = prim::Constant() + %ret = aten::elu_backward(%grad_output, %alpha, %scale, %input_scale, %is_result, %self_or_result) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto alpha0 = 2; + auto scale0 = 2; + auto input_scale0 = 2; + auto is_result0 = false; + auto 
self_or_result0 = at::rand({6, 6, 6}); + std::vector args{ + grad_output0, alpha0, scale0, input_scale0, is_result0, self_or_result0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto alpha1 = 2; + auto scale1 = 2; + auto input_scale1 = 2; + auto is_result1 = false; + auto self_or_result1 = at::rand({22, 22, 22}); + std::vector args2{ + grad_output1, alpha1, scale1, input_scale1, is_result1, self_or_result1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_glu) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::glu(%self, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto dim0 = 1; + std::vector args{self0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto dim1 = 1; + std::vector args2{self1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardsigmoid) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::hardsigmoid(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardsigmoid_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::hardsigmoid_backward(%grad_output, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_hardtanh) { + const std::string script = R"IR( + graph(%self: Tensor, %min_val: int, %max_val: int): + %bias: None = prim::Constant() + %ret = aten::hardtanh(%self, %min_val, %max_val) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto min_val0 = 2; + auto max_val0 = 2; + std::vector args{self0, min_val0, max_val0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto min_val1 = 2; + auto max_val1 = 2; + std::vector args2{self1, min_val1, max_val1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + 
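+// Illustrative sketch (not generated code): each autogen test above and below
+// follows the same two-phase pattern. It builds a one-op graph whose result is
+// cloned, presumably so the value checked is a fresh tensor rather than a
+// runtime-managed buffer, runs it once with small inputs, and then re-runs it
+// with larger inputs so the memory planner has to resize its managed storage.
+// The sketch assumes the testStaticRuntime helper declared in
+// benchmarks/static_runtime/test_utils.h and argument vectors of c10::IValue;
+// aten::relu is used here only as a stand-in op.
+//
+//   TEST(StaticRuntime, autogen_example_relu) {
+//     const std::string script = R"IR(
+//       graph(%self: Tensor):
+//           %bias: None = prim::Constant()
+//           %ret = aten::relu(%self)
+//           %cloned = aten::clone(%ret, %bias)
+//           return (%cloned)
+//     )IR";
+//
+//     // First pass: small inputs exercise the out variant of the op.
+//     std::vector<c10::IValue> args{at::rand({6, 6, 6})};
+//     testStaticRuntime(
+//         script,
+//         args,
+//         /*args2=*/{},
+//         /*use_allclose=*/false,
+//         /*use_equalnan=*/false,
+//         /*check_resize=*/true);
+//
+//     // Second pass: larger inputs force the preallocated output to resize.
+//     std::vector<c10::IValue> args2{at::rand({22, 22, 22})};
+//     testStaticRuntime(
+//         script,
+//         args,
+//         args2,
+//         /*use_allclose=*/false,
+//         /*use_equalnan=*/false,
+//         /*check_resize=*/true);
+//   }
+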
+TEST(StaticRuntime, autogen_hardswish) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::hardswish(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_leaky_relu_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %negative_slope: int, %self_is_result: bool): + %bias: None = prim::Constant() + %ret = aten::leaky_relu_backward(%grad_output, %self, %negative_slope, %self_is_result) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto negative_slope0 = 2; + auto self_is_result0 = false; + std::vector args{ + grad_output0, self0, negative_slope0, self_is_result0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto negative_slope1 = 2; + auto self_is_result1 = false; + std::vector args2{ + grad_output1, self1, negative_slope1, self_is_result1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_log_sigmoid) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::log_sigmoid(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softplus) { + const std::string script = R"IR( + graph(%self: Tensor, %beta: int, %threshold: int): + %bias: None = prim::Constant() + %ret = aten::softplus(%self, %beta, %threshold) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto threshold0 = 2; + std::vector args{self0, beta0, threshold0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto threshold1 = 2; + std::vector args2{self1, beta1, threshold1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softplus_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %beta: int, %threshold: int): + %bias: None = prim::Constant() + %ret = aten::softplus_backward(%grad_output, %self, %beta, %threshold) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto beta0 = 2; + auto threshold0 = 2; + std::vector 
args{grad_output0, self0, beta0, threshold0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto beta1 = 2; + auto threshold1 = 2; + std::vector args2{grad_output1, self1, beta1, threshold1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softshrink) { + const std::string script = R"IR( + graph(%self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::softshrink(%self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_softshrink_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %lambd: int): + %bias: None = prim::Constant() + %ret = aten::softshrink_backward(%grad_output, %self, %lambd) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto self0 = at::rand({6, 6, 6}); + auto lambd0 = 2; + std::vector args{grad_output0, self0, lambd0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto self1 = at::rand({22, 22, 22}); + auto lambd1 = 2; + std::vector args2{grad_output1, self1, lambd1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_adaptive_max_pool2d_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %indices: Tensor): + %bias: None = prim::Constant() + %ret = aten::adaptive_max_pool2d_backward(%grad_output, %self, %indices) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::randint(-3, 2, {2, 2, 2}); + auto self0 = at::randint(-3, 2, {2, 2, 2}); + auto indices0 = at::randint(0, 1, {2, 2, 2}, at::kLong); + std::vector args{grad_output0, self0, indices0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::randint(-3, 3, {3, 3, 3}); + auto self1 = at::randint(-3, 2, {3, 3, 3}); + auto indices1 = at::randint(0, 1, {3, 3, 3}, at::kLong); + std::vector args2{grad_output1, self1, indices1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_adaptive_max_pool3d_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %self: Tensor, %indices: Tensor): + %bias: None = prim::Constant() + %ret = aten::adaptive_max_pool3d_backward(%grad_output, %self, %indices) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::randint(-3, 2, {2, 2, 2, 2}); + auto self0 = at::randint(-3, 2, {2, 2, 2, 2}); + auto indices0 = at::randint(0, 1, {2, 2, 2, 2}, 
at::kLong); + std::vector args{grad_output0, self0, indices0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::randint(-3, 3, {3, 3, 3, 3}); + auto self1 = at::randint(-3, 2, {3, 3, 3, 3}); + auto indices1 = at::randint(0, 1, {3, 3, 3, 3}, at::kLong); + std::vector args2{grad_output1, self1, indices1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_sigmoid_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor): + %bias: None = prim::Constant() + %ret = aten::sigmoid_backward(%grad_output, %output) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, output0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, output1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_tanh_backward) { + const std::string script = R"IR( + graph(%grad_output: Tensor, %output: Tensor): + %bias: None = prim::Constant() + %ret = aten::tanh_backward(%grad_output, %output) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto grad_output0 = at::rand({6, 6, 6}); + auto output0 = at::rand({6, 6, 6}); + std::vector args{grad_output0, output0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto grad_output1 = at::rand({22, 22, 22}); + auto output1 = at::rand({22, 22, 22}); + std::vector args2{grad_output1, output1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isposinf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::isposinf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_isneginf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::isneginf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_entr) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_entr(%self) + %cloned = 
aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_ndtri) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_ndtri(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_log_ndtr) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_log_ndtr(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_expm1) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_expm1(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_exp2) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_exp2(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_psi) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_psi(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + 
/*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_digamma) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_digamma(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_gammaln) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_gammaln(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erf) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erf(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erfc) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erfc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erfcx) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erfcx(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_erfinv) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_erfinv(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + 
/*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_ndtr) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_ndtr(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_xlog1py) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_xlog1py(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_xlogy) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_xlogy(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_zeta) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_zeta(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({2, 2, 2}, at::kDouble) + at::ones({2, 2, 2}); + auto other0 = at::rand({2, 2, 2}, at::kDouble) + at::ones({2, 2, 2}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({5, 5, 5}, at::kDouble) + at::ones({5, 5, 5}); + auto other1 = at::rand({5, 5, 5}, at::kDouble) + at::ones({5, 5, 5}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i0) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i0(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + 
testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i0e) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i0e(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i1) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i1(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_i1e) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_i1e(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_polygamma) { + const std::string script = R"IR( + graph(%n: int, %self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_polygamma(%n, %self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto n0 = 1; + auto self0 = at::rand({6, 6, 6}); + std::vector args{n0, self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto n1 = 1; + auto self1 = at::rand({22, 22, 22}); + std::vector args2{n1, self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_expit) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_expit(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_sinc) { + const 
std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_sinc(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_round) { + const std::string script = R"IR( + graph(%self: Tensor, %decimals: int): + %bias: None = prim::Constant() + %ret = aten::special_round(%self, %decimals) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto decimals0 = 1; + std::vector args{self0, decimals0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto decimals1 = 1; + std::vector args2{self1, decimals1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_log1p) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_log1p(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_gammainc) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_gammainc(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_gammaincc) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::special_gammaincc(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_special_multigammaln) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int): + %bias: None = 
prim::Constant() + %ret = aten::special_multigammaln(%self, %p) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 1; + std::vector args{self0, p0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 1; + std::vector args2{self1, p1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_fft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_fft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_ifft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_ifft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_rfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_rfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_irfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_irfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + 
auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_hfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_hfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_fft_ihfft) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int?, %dim: int, %norm: str?): + %bias: None = prim::Constant() + %ret = aten::fft_ihfft(%self, %n, %dim, %norm) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + auto dim0 = 1; + auto norm0 = "forward"; + std::vector args{self0, n0, dim0, norm0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + auto dim1 = 1; + auto norm1 = "forward"; + std::vector args2{self1, n1, dim1, norm1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_cross) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor, %dim: int): + %bias: None = prim::Constant() + %ret = aten::linalg_cross(%self, %other, %dim) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 3, 6}); + auto other0 = at::rand({6, 3, 6}); + auto dim0 = 1; + std::vector args{self0, other0, dim0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 3, 22}); + auto other1 = at::rand({22, 3, 22}); + auto dim1 = 1; + std::vector args2{self1, other1, dim1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_det) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_det(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_matmul) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_matmul(%self, %other) + %cloned = 
aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_eigvals) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_eigvals(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_inv) { + const std::string script = R"IR( + graph(%self: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_inv(%self) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + std::vector args{self0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + std::vector args2{self1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_inner) { + const std::string script = R"IR( + graph(%self: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::inner(%self, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{self0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{self1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_outer) { + const std::string script = R"IR( + graph(%self: Tensor, %vec2: Tensor): + %bias: None = prim::Constant() + %ret = aten::outer(%self, %vec2) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({16}); + auto vec20 = at::rand({16}); + std::vector args{self0, vec20}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({64}); + auto vec21 = at::rand({64}); + std::vector args2{self1, vec21}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_svdvals) { + const std::string script = R"IR( + graph(%A: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_svdvals(%A) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto A0 = at::rand({6, 6, 6}); + std::vector args{A0}; + testStaticRuntime( + script, + args, + {}, + 
/*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto A1 = at::rand({22, 22, 22}); + std::vector args2{A1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_cond) { + const std::string script = R"IR( + graph(%self: Tensor, %p: int?): + %bias: None = prim::Constant() + %ret = aten::linalg_cond(%self, %p) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto p0 = 2; + std::vector args{self0, p0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto p1 = 2; + std::vector args2{self1, p1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_solve) { + const std::string script = R"IR( + graph(%input: Tensor, %other: Tensor): + %bias: None = prim::Constant() + %ret = aten::linalg_solve(%input, %other) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto input0 = at::rand({6, 6, 6}); + auto other0 = at::rand({6, 6, 6}); + std::vector args{input0, other0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto input1 = at::rand({22, 22, 22}); + auto other1 = at::rand({22, 22, 22}); + std::vector args2{input1, other1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_tensorinv) { + const std::string script = R"IR( + graph(%self: Tensor, %ind: int): + %bias: None = prim::Constant() + %ret = aten::linalg_tensorinv(%self, %ind) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6, 6}); + auto ind0 = 2; + std::vector args{self0, ind0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22, 22}); + auto ind1 = 2; + std::vector args2{self1, ind1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} + +TEST(StaticRuntime, autogen_linalg_matrix_power) { + const std::string script = R"IR( + graph(%self: Tensor, %n: int): + %bias: None = prim::Constant() + %ret = aten::linalg_matrix_power(%self, %n) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6, 6, 6}); + auto n0 = 1; + std::vector args{self0, n0}; + testStaticRuntime( + script, + args, + {}, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); + + auto self1 = at::rand({22, 22, 22}); + auto n1 = 1; + std::vector args2{self1, n1}; + testStaticRuntime( + script, + args, + args2, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/true); +} diff --git a/benchmarks/static_runtime/test_static_module.cc b/benchmarks/static_runtime/test_static_module.cc index 353ce93bb651..41758ec9f2f3 100644 --- a/benchmarks/static_runtime/test_static_module.cc +++ b/benchmarks/static_runtime/test_static_module.cc @@ -243,6 +243,14 @@ TEST(StaticRuntime, ReplaceWithCopy_replaces_reshape) { c = inp.reshape(shape) return (a, b, c) )JIT"); + ExpectToReplaceWithCopy(R"JIT( + def forward(self, cond: bool, x): + if cond: + y = 
x.reshape(x.shape) + else: + y = x.clone() + return y.clone() + )JIT"); } TEST( @@ -289,7 +297,6 @@ TEST( return (d) )JIT"); ExpectNotToReplaceWithCopy(reshape_inplace_script); - ExpectNotToReplaceWithCopy(reshape_inplace_script_1); } TEST(StaticRuntime, CanEnableStaticRuntime) { @@ -432,7 +439,8 @@ TEST(StaticRuntime, LongModel) { torch::jit::StaticModule smod(mod); at::Tensor output_2 = smod(input_tensors, {}).toTensor(); smod.runtime().check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } TEST(StaticRuntime, TrivialModel) { @@ -450,7 +458,8 @@ TEST(StaticRuntime, TrivialModel) { torch::jit::StaticModule smod(mod); at::Tensor output_2 = smod(input_tensors, {}).toTensor(); smod.runtime().check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } TEST(StaticRuntime, DeepWide) { @@ -475,7 +484,8 @@ TEST(StaticRuntime, DeepWide) { ASSERT_TRUE(outputs.size() > 0); at::Tensor output_2 = outputs[0].toTensor(); smod.runtime().check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } } } @@ -502,7 +512,8 @@ TEST(StaticRuntime, KWargsAPI_1) { smod.runtime().check_for_memory_leak(); at::Tensor output_2 = getTensor(output_ivalue); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); // check for output aliasing EXPECT_EQ(output_ivalue.use_count(), 1); @@ -546,7 +557,8 @@ TEST(StaticRuntime, KWargsAPI_2) { smod.runtime().check_for_memory_leak(); at::Tensor output_2 = getTensor(output_ivalue); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); // check for output aliasing EXPECT_EQ(output_ivalue.use_count(), 1); @@ -562,6 +574,24 @@ TEST(StaticRuntime, KWargsAPI_2) { } } +TEST(StaticRuntime, KWargsAPI_Optional) { + const auto src = R"JIT( + def forward(self, x, y, z: Optional[Tensor] = None): + return x + y + )JIT"; + + torch::jit::Module mod("mod"); + mod.define(src); + torch::jit::StaticModule smod(mod); + const auto kwargs = std::unordered_map{ + {"x", at::randn({1})}, {"y", at::randn({1})}}; + + auto expected = mod.forward({}, kwargs).toTensor(); + auto actual = smod({}, kwargs).toTensor(); + + EXPECT_TRUE(expected.equal(actual)); +} + TEST(StaticRuntime, CleanUpMemory) { const int embedding_size = 32; const int num_features = 50; @@ -605,7 +635,8 @@ TEST(StaticRuntime, CleanUpMemory) { ASSERT_TRUE(outputs.size() > 0); auto output_2 = outputs[0].toTensor(); runtime.check_for_memory_leak(); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE(torch::allclose( + output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); if (manage_output_tensors) { runtime.deallocateOutputTensors(); runtime.checkOutputTensorMemoryLeaks(); @@ -850,7 +881,8 @@ TEST(StaticRuntime, FusionPass) { } EXPECT_TRUE(hit); auto output_2 = getTensor(module.forward(inputs)); - EXPECT_TRUE(torch::allclose(output_1, output_2, 1e-6)); + EXPECT_TRUE( + torch::allclose(output_1, output_2, /*rtol=*/1e-5, /*atol=*/1e-7)); } } } @@ -882,8 +914,9 @@ TEST( sigmoid_node, /*enable_out_variant=*/true, /*check_memory_overlap=*/false); - ProcessedNode pnode(sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); - 
pnode.set_values(values.data()); + StaticNodeInfo static_node_info( + sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); + ProcessedNode pnode(static_node_info, values.data()); EXPECT_TRUE(pnode.verify_no_memory_overlap(/* force_check*/ true)); pnode.Output(0) = values[0]; @@ -901,8 +934,9 @@ TEST(ProcessedNode, VerifyNoMemoryOverlapWithImmutableInputsWithInplaceOps) { sigmoid_node, /*enable_out_variant=*/true, /*check_memory_overlap=*/false); - ProcessedNode pnode(sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); - pnode.set_values(values.data()); + StaticNodeInfo static_node_info( + sigmoid_node, &fn, createProcessedNodeInputs({0}), 1); + ProcessedNode pnode(static_node_info, values.data()); ASSERT_EQ(&pnode.Output(0), &values[1]); EXPECT_TRUE(pnode.verify_no_memory_overlap()); @@ -928,9 +962,10 @@ TEST(ProcessedNode, VerifyNoMemoryOverlapWithOverlappingOutputs) { list_unpack_node, /*enable_out_variant=*/true, /*check_memory_overlap */ false); - ProcessedNode list_unpack_pnode( + StaticNodeInfo list_unpack_static_node_info( list_unpack_node, &fn, createProcessedNodeInputs({0}), 1); - list_unpack_pnode.set_values(values.data()); + ProcessedNode list_unpack_pnode( + list_unpack_static_node_info, values.data()); ASSERT_EQ(list_unpack_pnode.outputs().size(), 2); EXPECT_TRUE( list_unpack_pnode.verify_no_memory_overlap(/* force_check*/ true)); @@ -942,9 +977,10 @@ TEST(ProcessedNode, VerifyNoMemoryOverlapWithOverlappingOutputs) { list_unpack_node, /*enable_out_variant=*/true, /*check_memory_overlap */ false); - ProcessedNode list_unpack_pnode( + StaticNodeInfo list_unpack_static_node_info( list_unpack_node, &fn, createProcessedNodeInputs({0}), 1); - list_unpack_pnode.set_values(values.data()); + ProcessedNode list_unpack_pnode( + list_unpack_static_node_info, values.data()); auto b = at::randn({2, 3}); list_unpack_pnode.Output(0) = b; list_unpack_pnode.Output(1) = b; @@ -1500,3 +1536,231 @@ TEST(ForceNonEmptyOutputs, TwoSubBlocks) { } } } + +TEST(EliminateExtraPermuteOps, FusesSumCorrectly) { + const auto src = R"JIT( + def forward(self, x): + y = torch.permute(x, (0, 2, 1)) + z = torch.sum(y, dim=-1) + return z + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + + auto graph = mod.get_method("forward").graph(); + // turn the ListConstruct(%constant) into proper constant lists + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + + EXPECT_FALSE(hasNodeWithKind(graph, "aten::permute")); + auto* sum = getNodeWithKind(graph, "aten::sum"); + ASSERT_NE(sum, nullptr); + auto dim = toIValue(sum->input(1)); + ASSERT_TRUE(dim.has_value() && dim->isIntList()); + EXPECT_EQ(dim->toIntList(), c10::List{1}); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSumWrongDim) { + const auto src = R"JIT( + def forward(self, x): + y = torch.permute(x, (0, 2, 1)) + z = torch.sum(y, dim=1) + return z + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + + auto graph = mod.get_method("forward").graph(); + // turn the ListConstruct(%constant) into proper constant lists + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSumNonConstantDim) { + const auto src = R"JIT( + def forward(self, x, dim: int): + y = torch.permute(x, (0, 2, 1)) + z = torch.sum(y, dim=dim) + return z + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + + auto graph = mod.get_method("forward").graph(); + // turn the ListConstruct(%constant) into proper constant lists + 
ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(EliminateExtraPermuteOps, FusesSoftmaxCorrectly) { + const auto src = R"JIT( + def forward(self, x): + a = torch.permute(x, [0, 2, 1]) + b = torch.softmax(a, 2) + c = torch.permute(b, [0, 2, 1]) + return c.clone() + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + graph->dump(); + + EXPECT_FALSE(hasNodeWithKind(graph, "aten::permute")); + auto* softmax = getNodeWithKind(graph, "aten::softmax"); + ASSERT_NE(softmax, nullptr); + auto dim = toIValue(softmax->input(1)); + ASSERT_TRUE(dim.has_value() && dim->isInt()); + EXPECT_EQ(dim->toInt(), 1); + + std::vector args{at::randn({3, 4, 5})}; + testStaticRuntime(src, args, /*args2=*/{}, /*use_allclose=*/true); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSoftmaxWrongPermuteDim) { + const auto src = R"JIT( + def forward(self, x): + a = torch.permute(x, [0, 1, 2]) + b = torch.softmax(a, 2) + c = torch.permute(b, [0, 1, 2]) + return c.clone() + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(EliminateExtraPermuteOps, DoesNotFuseSoftmaxWrongSoftmaxDim) { + const auto src = R"JIT( + def forward(self, x): + a = torch.permute(x, [0, 2, 1]) + b = torch.softmax(a, 0) + c = torch.permute(b, [0, 2, 1]) + return c.clone() + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + ConstantPropagation(graph); + EliminateExtraPermuteOps(graph); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::permute")); +} + +TEST(UseSplitAndSqueeze, Fusion) { + const auto src = R"IR( + graph(%x: Tensor): + %dim: int = prim::Constant[value=1]() + %split_size: int = prim::Constant[value=1]() + %split: Tensor[] = aten::split(%x, %split_size, %dim) + %a: Tensor, %b: Tensor = prim::ListUnpack(%split) + %c: Tensor = aten::squeeze(%a, %dim) + %d: Tensor = aten::squeeze(%b, %dim) + return (%c, %d) + )IR"; + auto graph = getGraphFromIR(src); + UseSplitAndSqueeze(graph); + EXPECT_TRUE( + hasNodeWithKind(graph, "static_runtime::fused_split_and_squeeze_copy")); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::split")); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::squeeze")); + EXPECT_FALSE(hasNodeWithKind(graph, "prim::ListUnpack")); +} + +TEST(EliminateNoOpSlice, IntegerStart) { + const auto src = R"JIT( + def forward(self, x: List[int]) -> List[int]: + return x[0:] + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::slice")); + EliminateNoOpSlice(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::slice")); +} + +TEST(EliminateNoOpSlice, NoneStart) { + const auto src = R"JIT( + def forward(self, x: List[int]) -> List[int]: + return x[:] + )JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + EliminateNoOpSlice(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::slice")); +} + +#ifdef FBCODE_CAFFE2 +// FuseClampNaNToNum pass is disabled externally to avoid MSVC errors in CI +TEST(FuseClampNaNToNum, FusionHappens) { + const auto src = R"JIT( + def forward(self, x): + y = torch.clamp(x, min=0.0, max=1.0) + z = y.nan_to_num() + return z.clone() + 
)JIT"; + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + FuseClampNaNToNum(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::clamp")); + EXPECT_FALSE(hasNodeWithKind(graph, "aten::nan_to_num")); + EXPECT_TRUE(hasNodeWithKind(graph, "static_runtime::clamp_nan_to_num")); + // Correctness of the op is exercised in StaticRuntime.clamp_nan_to_num +} + +TEST(FuseClampNaNToNum, NoFusion) { + const auto src1 = R"JIT( + def forward(self, x, a: float, b: float): + y = torch.clamp(x, a, b) + z = y.nan_to_num() + return z.clone() + )JIT"; + + const auto src2 = R"JIT( + def forward(self, x): + y = torch.clamp(x, min=0.0) + z = y.nan_to_num() + return z.clone() + )JIT"; + + const auto src3 = R"JIT( + def forward(self, x): + y = torch.clamp(x, max=0.0) + z = y.nan_to_num() + return z.clone() + )JIT"; + + const auto src4 = R"JIT( + def forward(self, x): + y = torch.clamp(x) + z = y.nan_to_num() + return z.clone() + )JIT"; + + + auto checkScript = [](const char* src) { + torch::jit::Module mod("m"); + mod.define(src); + auto graph = mod.get_method("forward").graph(); + FuseClampNaNToNum(graph); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::clamp")); + EXPECT_TRUE(hasNodeWithKind(graph, "aten::nan_to_num")); + EXPECT_FALSE(hasNodeWithKind(graph, "static_runtime::clamp_nan_to_num")); + }; + + checkScript(src1); + checkScript(src2); + checkScript(src3); + checkScript(src4); +} +#endif diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index bc923e707e1d..f6d4b0efd58e 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "deep_wide_pt.h" @@ -172,6 +174,146 @@ TEST(StaticRuntime, Clamp) { testStaticRuntime(clamp_script_2, {a, min_t, max_t}, {b, max_t1, min_t1}); } +TEST(StaticRuntime, ClampMinOnly) { + const auto src = R"JIT( + def forward(self, inp: Tensor, min: float): + a = torch.clamp(inp, min, None).clone() + return (a) + )JIT"; + auto a = at::randn({2, 3}); + auto b = at::randn({4, 3, 2}); + testStaticRuntime(src, {a, 0.5}); + testStaticRuntime(src, {a, 0.5}, {b, 0.25}); +} + +TEST(StaticRuntime, ClampMaxOnly) { + const auto src = R"JIT( + def forward(self, inp: Tensor, max: float): + a = torch.clamp(inp, None, max).clone() + return (a) + )JIT"; + auto a = at::randn({2, 3}); + auto b = at::randn({4, 3, 2}); + testStaticRuntime(src, {a, 0.5}); + testStaticRuntime(src, {a, 0.5}, {b, 0.25}); +} + +TEST(StaticRuntime, ClampIntTensor) { + const auto src = R"JIT( + def forward(self, inp: Tensor, min: float, max: float): + a = torch.clamp(inp, min, max).clone() + return (a) + )JIT"; + auto a = at::randint(0, 20, {2, 3}); + auto b = at::randint(0, 20, {4, 3, 2}); + auto min = 5.0f; + auto max = 5.0f; + testStaticRuntime(src, {a, min, max}); + testStaticRuntime(src, {a, min, max}, {b, min, max}); +} + +TEST(StaticRuntime, LenWithTuple) { + const auto src = R"IR( + graph(%input : int[]): + %res : int = aten::len(%input) + return (%res) + )IR"; + + testStaticRuntime(src, {c10::List(4)}); +} + +TEST(StaticRuntime, LenWithTensor) { + const auto src = R"IR( + graph(%input : Tensor): + %res : int = aten::len(%input) + return (%res) + )IR"; + + testStaticRuntime(src, {at::randn({2, 2, 2})}); +} + +TEST(StaticRuntime, LenWithStr) { + const auto src = R"IR( + graph(%input : str): + %res : int = aten::len(%input) + return (%res) + )IR"; + + testStaticRuntime(src, 
{"static_runtime"}); +} + +TEST(StaticRuntime, LenWithDict_str) { + const auto script = R"JIT( + def forward(self, input: Dict[str, str]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert("abc", "123"); + dict.insert("def", "456"); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_int) { + const auto script = R"JIT( + def forward(self, input: Dict[int, int]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(0, 1); + dict.insert(2, 3); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_bool) { + const auto script = R"JIT( + def forward(self, input: Dict[bool, bool]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(true, false); + dict.insert(false, true); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_float) { + const auto script = R"JIT( + def forward(self, input: Dict[float, float]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(0.1, 0.9); + dict.insert(0.8, 0.18); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_complex) { + const auto script = R"JIT( + def forward(self, input: Dict[complex, complex]): + return len(input) + )JIT"; + + c10::Dict, c10::complex> dict; + dict.insert(0.1, 0.4); + dict.insert(0.9, 0.45); + testStaticRuntime(script, {dict}); +} + +TEST(StaticRuntime, LenWithDict_Tensor) { + const auto script = R"JIT( + def forward(self, input: Dict[Tensor, Tensor]): + return len(input) + )JIT"; + + c10::Dict dict; + dict.insert(at::randn({1, 2}), at::randn({1, 2})); + dict.insert(at::randn({1, 2}), at::randn({1, 2})); + testStaticRuntime(script, {dict}); +} + TEST(StaticRuntime, Logit) { // no nnc const auto logit_script_1 = R"JIT( @@ -293,6 +435,99 @@ TEST(StaticRuntime, EmbeddingBagWithManagedOutput) { testStaticRuntime(embedding_bag_managed_output, args, args2); } +TEST(StaticRuntime, EmbeddingBagWithExtraneousOutput) { + const std::string embedding_bag_default_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=0]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=0]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res : Tensor = aten::clone(%y0, %none) + return (%res) + )IR"; + auto graph = getGraphFromIR(embedding_bag_default_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check("static_runtime::embedding_bag") + ->run(*graph); + + const std::string embedding_bag_mean_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=1]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=0]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res : Tensor = aten::clone(%y0, %none) + return (%res) + )IR"; + graph = getGraphFromIR(embedding_bag_mean_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check("static_runtime::embedding_bag") + 
->run(*graph); + + const std::string embedding_bag_max_last_offset_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=2]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=1]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res : Tensor = aten::clone(%y0, %none) + return (%res) + )IR"; + graph = getGraphFromIR(embedding_bag_max_last_offset_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check("static_runtime::embedding_bag") + ->run(*graph); + + const std::string embedding_bag_normal_ir = R"IR( + graph(%weight, %indices, %offsets): + %scale_grad_by_freq : bool = prim::Constant[value=0]() + %mode : int = prim::Constant[value=0]() + %sparse : bool = prim::Constant[value=0]() + %per_sample_weights : NoneType = prim::Constant() + %include_last_offset : bool = prim::Constant[value=0]() + %y0 : Tensor, %y1 : Tensor, %y2 : Tensor, %y3 : Tensor = aten::embedding_bag(%weight, %indices, %offsets, %scale_grad_by_freq, %mode, %sparse, %per_sample_weights, %include_last_offset) + %none : NoneType = prim::Constant() + %res0 : Tensor = aten::clone(%y0, %none) + %res1 : Tensor = aten::clone(%y1, %none) + %res2 : Tensor = aten::clone(%y2, %none) + %res3 : Tensor = aten::clone(%y3, %none) + return (%res0, %res1, %res2, %res3) + )IR"; + graph = getGraphFromIR(embedding_bag_normal_ir); + RemoveUnnecessaryOutputs(graph); + torch::jit::testing::FileCheck() + .check_not("static_runtime::embedding_bag") + ->run(*graph); + + at::Tensor weight = torch::randn({3, 11}, at::ScalarType::Float); + at::Tensor input = torch::tensor({0, 1, 0, 2}); + at::Tensor offset = torch::tensor({0, 2, 4}); + std::vector args{weight, input, offset}; + testStaticRuntime(embedding_bag_default_ir, args); + testStaticRuntime(embedding_bag_mean_ir, args); + testStaticRuntime(embedding_bag_max_last_offset_ir, args); + + at::Tensor weight2 = torch::randn({10, 11}, at::ScalarType::Float); + at::Tensor input2 = torch::tensor({0, 1, 0, 2, 1}); + at::Tensor offset2 = torch::tensor({0, 1, 2, 3, 4, 5}); + std::vector args2{weight2, input2, offset2}; + testStaticRuntime(embedding_bag_default_ir, args, args2); + testStaticRuntime(embedding_bag_mean_ir, args, args2); + testStaticRuntime(embedding_bag_max_last_offset_ir, args, args2); +} + TEST(StaticRuntime, LayerNorm) { const std::string layer_norm_with_weights = R"JIT( def forward(self, input: Tensor, normalized_shape: List[int], weight: Tensor, bias: Tensor): @@ -304,13 +539,6 @@ TEST(StaticRuntime, LayerNorm) { return torch.layer_norm(input, normalized_shape, None, None, 1e-05, False).clone() )JIT"; -#ifdef FBCODE_CAFFE2 - script::Module module("module"); - module.define(layer_norm_with_weights); - torch::jit::StaticModule smodule(module); - ASSERT_EQ(getNodeWithKind(smodule, "aten::layer_norm"), nullptr); - ASSERT_NE(getNodeWithKind(smodule, "static_runtime::layer_norm"), nullptr); -#endif const auto a = torch::rand({1, 2, 2, 2}); const auto b = torch::rand({3, 2, 2, 2}); for (int normalized_size : {2, 3}) { @@ -1170,13 +1398,23 @@ TEST(StaticRuntime, Full) { return (a.clone()) )JIT"; - auto dtype = at::ScalarType::Int; auto cpu = at::Device(DeviceType::CPU); c10::List size0{2, 5}; - std::vector 
args{size0, 4, dtype, at::kStrided, cpu, false}; + std::vector args{ + size0, 4, at::ScalarType::Int, at::kStrided, cpu, false}; + std::vector args1{ + size0, 4, at::ScalarType::Float, at::kStrided, cpu, false}; c10::List size1{5, 6}; - std::vector args2{size1, 5, dtype, at::kStrided, cpu, false}; + std::vector args2{ + size1, 5, at::ScalarType::Float, at::kStrided, cpu, false}; testStaticRuntime(full_script, args); + testStaticRuntime( + full_script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); testStaticRuntime(full_script, args, args2); } @@ -1202,16 +1440,157 @@ TEST(StaticRuntime, FullLike) { auto a = at::randn({2, 3}); auto b = at::randn({3, 4, 2}); - auto dtype = at::ScalarType::Int; auto cpu = at::Device(DeviceType::CPU); std::vector args{ - a, 4, dtype, at::kStrided, cpu, false, c10::MemoryFormat::Contiguous}; + a, + 4, + at::ScalarType::Int, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + std::vector args1{ + a, + 4, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; std::vector args2{ - b, 4, dtype, at::kStrided, cpu, false, c10::MemoryFormat::Contiguous}; + b, + 4, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; testStaticRuntime(full_like_script, args); + testStaticRuntime( + full_like_script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); testStaticRuntime(full_like_script, args, args2); } +TEST(StaticRuntime, Ones) { + const auto script = R"JIT( + def forward(self, + size: List[int], + dtype: Optional[int], + layout: Optional[int], + device: Optional[Device], + pin_memory: Optional[bool]): + a = torch.ones(size, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory) + return (a.clone()) + )JIT"; + + auto dtype = at::ScalarType::Int; + auto cpu = at::Device(DeviceType::CPU); + c10::List size0{2, 5}; + std::vector args{size0, dtype, at::kStrided, cpu, false}; + c10::List size1{5, 6}; + std::vector args2{size1, dtype, at::kStrided, cpu, false}; + testStaticRuntime(script, args); + testStaticRuntime(script, args, args2); +} + +TEST(StaticRuntime, OnesLike) { + const auto script = R"JIT( + def forward(self, + input: Tensor, + dtype: Optional[int], + layout: Optional[int], + device: Optional[Device], + pin_memory: Optional[bool], + memory_format: Optional[int]): + a = torch.ones_like(input, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + memory_format=memory_format) + return (a.clone()) + )JIT"; + + auto cpu = at::Device(DeviceType::CPU); + auto input0 = at::randn({2, 5}); + std::vector args{ + input0, + at::ScalarType::Int, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + std::vector args1{ + input0, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + auto input1 = at::randn({5, 6}); + std::vector args2{ + input1, + at::ScalarType::Float, + at::kStrided, + cpu, + false, + c10::MemoryFormat::Contiguous}; + testStaticRuntime(script, args); + testStaticRuntime( + script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + testStaticRuntime(script, args, args2); +} + +TEST(StaticRuntime, Zeros) { + const auto script = R"JIT( + def forward(self, + size: List[int], + dtype: Optional[int], + layout: Optional[int], + device: Optional[Device], + pin_memory: Optional[bool]): + a = torch.zeros(size, + dtype=dtype, + layout=layout, + 
device=device, + pin_memory=pin_memory) + return (a.clone()) + )JIT"; + + auto cpu = at::Device(DeviceType::CPU); + c10::List size0{2, 5}; + std::vector args{ + size0, at::ScalarType::Int, at::kStrided, cpu, false}; + std::vector args1{ + size0, at::ScalarType::Float, at::kStrided, cpu, false}; + c10::List size1{5, 6}; + std::vector args2{ + size1, at::ScalarType::Float, at::kStrided, cpu, false}; + testStaticRuntime(script, args); + testStaticRuntime( + script, + args, + args1, + /*use_allclose=*/false, + /*use_equalnan=*/false, + /*check_resize=*/false); + testStaticRuntime(script, args, args2); +} + TEST(StaticRuntime, Linear) { const auto linear_script = R"JIT( def forward(self, inp: Tensor, weights: Tensor, bias: Optional[Tensor]) -> Tensor: @@ -1442,6 +1821,28 @@ TEST(StaticRuntime, Index) { testStaticRuntime(index_with_two_tensors_script, args_c, args_d); } +TEST(StaticRuntime, IndexSelect) { + const std::string script = R"IR( + graph(%self: Tensor, %dim: int, %index: Tensor): + %bias: None = prim::Constant() + %ret = aten::index_select(%self, %dim, %index) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + auto self0 = at::rand({6}); + auto dim0 = 0; + auto index0 = at::randint(0, 5, {6}, torch::kInt32); + std::vector args{self0, dim0, index0}; + testStaticRuntime(script, args); + + auto self1 = at::rand({128}); + auto dim1 = 0; + auto index1 = at::randint(0, 127, {127}, torch::kInt32); + std::vector args2{self1, dim1, index1}; + testStaticRuntime(script, args, args2); +} + TEST(StaticRuntime, ClampMin) { const auto clamp_min_int_script = R"JIT( def forward(self, a: Tensor, b: int): @@ -1770,7 +2171,7 @@ TEST(StaticRuntime, QuantizedLinearDynamicFp16) { %packed_params = quantized::linear_prepack_fp16(%weights, %bias) %output = quantized::linear_dynamic_fp16(%input, %packed_params) %ret = aten::clone(%output, %bias) - return (%output) + return (%ret) )IR"; at::Tensor weight = torch::randn({3, 2}, torch::kFloat); at::Tensor input = torch::randn({3, 2}, torch::kFloat); @@ -1784,6 +2185,27 @@ TEST(StaticRuntime, QuantizedLinearDynamicFp16) { {input_2, weight_2}); } +TEST(StaticRuntime, QuantizedLinearReluDynamicFp16) { + const std::string quantized_linear_relu_dynamic_fp16_script = R"IR( + graph(%input: Tensor, %weights: Tensor): + %bias: None = prim::Constant() + %packed_params = quantized::linear_prepack_fp16(%weights, %bias) + %output = quantized::linear_relu_dynamic_fp16(%input, %packed_params) + %ret = aten::clone(%output, %bias) + return (%ret) + )IR"; + at::Tensor weight = torch::randn({3, 2}, torch::kFloat); + at::Tensor input = torch::randn({3, 2}, torch::kFloat); + + at::Tensor weight_2 = torch::randn({4, 3}, torch::kFloat); + at::Tensor input_2 = torch::randn({5, 3}, torch::kFloat); + + testStaticRuntime( + quantized_linear_relu_dynamic_fp16_script, + {input, weight}, + {input_2, weight_2}); +} + TEST(StaticRuntime, VarStack) { const auto var_stack_script = R"JIT( def forward(self, inp1: Tensor, inp2: Tensor, dim: int): @@ -2159,21 +2581,30 @@ TEST(StaticRuntime, Where) { return torch.where(x > 0, x, y).clone() )JIT"; - std::vector args1_fallback = {at::randn({2, 2}), at::randn({2, 2})}; - std::vector args2_fallback = {at::randn({3, 6}), at::randn({3, 6})}; + std::vector args1 = {at::randn({2, 2}), at::randn({2, 2})}; + std::vector args2 = {at::randn({8, 10}), at::randn({8, 10})}; - std::vector args1_nnc = { - at::randint(-10, 10, {2, 2}, at::kLong), - at::randint(-10, 10, {2, 2}, at::kLong)}; - std::vector args2_nnc = { - at::randint(-10, 10, {3, 6}, 
at::kLong), - at::randint(-10, 10, {3, 6}, at::kLong)}; + testStaticRuntime(where_script, args1); + testStaticRuntime(where_script, args1, args2); +} - testStaticRuntime(where_script, args1_fallback); - testStaticRuntime(where_script, args1_fallback, args2_fallback); +TEST(StaticRuntime, WhereBroadcast) { + const auto where_script = R"JIT( + def forward(self, cond_1d, x, y): + shape = [-1] + [1] * (x.dim() - 1) + cond = cond_1d.view(shape) + return torch.where(cond, x, y).clone() + )JIT"; - testStaticRuntime(where_script, args1_nnc); - testStaticRuntime(where_script, args1_nnc, args2_nnc); + std::vector args1 = { + at::tensor({0, 1}).to(at::kBool), at::randn({2, 2}), at::randn({2, 2})}; + std::vector args2 = { + at::tensor({1, 0, 0}).to(at::kBool), + at::randn({3, 6}), + at::randn({3, 6})}; + + testStaticRuntime(where_script, args1); + testStaticRuntime(where_script, args1, args2); } TEST(StaticRuntime, View) { @@ -2720,3 +3151,229 @@ TEST(StaticRuntime, ToList) { )JIT"; testStaticRuntime(src, {at::randn({2, 2})}); } + +TEST(StaticRuntime, IfThenElse) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %none: NoneType = prim::Constant() + %c: Tensor = prim::IfThenElse(%cond, %a, %b) + %d: Tensor = aten::clone(%c, %none) + return (%d) + )IR"; + + std::vector args1{true, at::randn({1}), at::randn({1})}; + std::vector args2{false, at::randn({1}), at::randn({1})}; + + testStaticRuntime(src, args1); + testStaticRuntime(src, args2); +} + +TEST(StaticRuntime, EmptyIfBlock) { + const auto src = + R"JIT( + def forward(self, cond: bool, a: Tensor, b: Tensor): + l = [] + if cond: + l.append((a + b).clone()) + return l + )JIT"; + + testStaticRuntime(src, {true, at::rand(1), at::rand({1, 2})}); + testStaticRuntime(src, {false, at::rand(1), at::rand({1, 2})}); +} + +TEST(StaticRuntime, EmptyNestedIfBlock) { + const auto src = + R"JIT( + def forward(self, cond: bool, a: Tensor, b: Tensor): + l = [] + if cond: + if cond: + l.append((a + b).clone()) + return l + )JIT"; + + testStaticRuntime(src, {true, at::rand(1), at::rand({1, 2})}); + testStaticRuntime(src, {false, at::rand(1), at::rand({1, 2})}); +} + +TEST(StaticRuntime, StackEmpty) { + const auto src = R"JIT( + def forward(self): + x = torch.stack([]) + return x + )JIT"; + + torch::jit::Module mod("mod"); + mod.define(src); + + torch::jit::StaticModule smod(mod); + EXPECT_THROW(smod({}), c10::Error); +} + +TEST(StaticRuntime, ConcatEmpty) { + const auto src = R"JIT( + def forward(self): + x = torch.concat([]) + return x + )JIT"; + + torch::jit::Module mod("mod"); + mod.define(src); + + torch::jit::StaticModule smod(mod); + EXPECT_THROW(smod({}), c10::Error); +} + +TEST(StaticRuntime, IntImplicit) { + const auto src = R"IR( + graph(%a: Tensor): + %y: int = aten::IntImplicit(%a) + return (%y) + )IR"; + testStaticRuntime(src, {at::tensor({1}, at::kInt).squeeze()}); +} + +TEST(StaticRuntime, IntImplicit_ThrowOnBadInputs) { + const auto src = R"IR( + graph(%a: Tensor): + %y: int = aten::IntImplicit(%a) + return (%y) + )IR"; + auto graph = getGraphFromIR(src); + torch::jit::StaticModule smod(graph); + // Not 0D tensor + EXPECT_THROW(smod({at::tensor({1, 2}, at::kInt)}), std::runtime_error); + // Wrong dtype + EXPECT_THROW( + smod({at::tensor({1}, at::kFloat).squeeze()}), std::runtime_error); +} + +TEST(StaticRuntime, Select) { + const auto src = R"IR( + graph(%a: Tensor, %dim: int, %index: int): + %none: NoneType = prim::Constant() + %b: Tensor = aten::select(%a, %dim, %index) + %c: Tensor = aten::clone(%b, %none) + return (%c) + 
)IR"; + testStaticRuntime(src, {at::randn({2, 2}), 0, 1}); +} + +TEST(StaticRuntime, ReshapeAs) { + const auto src = R"JIT( + def forward(self, a, b): + return a.reshape_as(b).clone() + )JIT"; + testStaticRuntime(src, {at::randn({2, 2}), at::randn({4})}); +} + +TEST(StaticRuntime, MoveCtor) { + auto mod = getDeepAndWideSciptModel(); + std::vector args{ + at::randn({1, 1, 32}), at::randn({1, 1, 32}), at::randn({1, 50})}; + + torch::jit::StaticModule smod(mod); + + torch::jit::StaticRuntime runtime(smod); + auto expected = runtime(args); + + torch::jit::StaticRuntime new_runtime(std::move(runtime)); + auto actual = new_runtime(args); + compareResults(expected, actual); +} + +TEST(StaticRuntime, SingleBlockIfReturnList) { + const auto src = R"JIT( + def forward(self, a, b, cond: bool): + lst = [] + if cond: + lst.append(a + b) + return lst + )JIT"; + std::vector args1{at::randn({1}), at::randn({1}), true}; + std::vector args2{at::randn({42, 42}), at::randn({42, 42}), false}; + testStaticRuntime(src, args1, args2); +} + +TEST(StaticRuntime, NestedBlockIfReturnList) { + const auto src = R"JIT( + def forward(self, a, b, cond1: bool, cond2: bool): + if cond1: + lst = [] + if cond2: + lst.append(a + b) + lst.append(a * b) + return lst + return [] + )JIT"; + std::vector args1{at::randn({1}), at::randn({1}), true, true}; + std::vector args2{ + at::randn({42, 42}), at::randn({42, 42}), true, false}; + testStaticRuntime(src, args1, args2); +} + +TEST(StaticRuntime, QuantizedLinearDynamicFp16ReluFusion) { + const auto src = R"IR( + graph(%input: Tensor, %weights: Tensor): + %bias: None = prim::Constant() + %packed_params = quantized::linear_prepack_fp16(%weights, %bias) + %x = quantized::linear_dynamic_fp16(%input, %packed_params) + %y = aten::relu(%x) + %ret = aten::clone(%y, %bias) + return (%ret) + )IR"; + at::Tensor weight = torch::randn({3, 2}, torch::kFloat); + at::Tensor input = torch::randn({3, 2}, torch::kFloat); + + at::Tensor weight_2 = torch::randn({4, 3}, torch::kFloat); + at::Tensor input_2 = torch::randn({5, 3}, torch::kFloat); + + testStaticRuntime(src, {input, weight}, {input_2, weight_2}); + + auto graph = getGraphFromIR(src); + QuantizedLinearReluFusion(graph); + EXPECT_FALSE(hasNodeWithKind(graph, "quantized::linear_dynamic_fp16")); + EXPECT_TRUE(hasNodeWithKind(graph, "quantized::linear_relu_dynamic_fp16")); +} + +TEST(StaticRuntime, ClampNaNToNum) { + const auto src1 = R"JIT( + def forward(self, a): + return torch.clamp(a, min=1.0, max=2.0).nan_to_num().clone() + )JIT"; + + const auto src2 = R"JIT( + def forward(self, a, nan: float): + return torch.clamp(a, min=-1.0, max=2.0).nan_to_num(nan=nan).clone() + )JIT"; + + const auto src3 = R"JIT( + def forward(self, a): + return torch.clamp(a, min=1.0, max=-1.0).nan_to_num().clone() + )JIT"; + + auto a = at::tensor({ + std::numeric_limits::quiet_NaN(), + std::numeric_limits::infinity(), + -std::numeric_limits::infinity(), + 0.0f, + 3.0f + }); + auto b = a.repeat({10, 5}); + + // Have to use_allclose even though all NaNs will be replaced - testStaticRuntime + // also checks inputs at the end to make sure they're not changed + testStaticRuntime(src1, {a}, {}, /*use_allclose=*/true, /*use_equalnan=*/true); + testStaticRuntime(src1, {a}, {b}, /*use_allclose=*/true, /*use_equalnan=*/true); + + testStaticRuntime(src2, {a, 42.0}, {}, /*use_allclose=*/true, /*use_equalnan=*/true); + testStaticRuntime(src2, {a, 2.0}, {b, 1.0}, /*use_allclose=*/true, /*use_equalnan=*/true); + + testStaticRuntime(src3, {a}, {}, /*use_allclose=*/true, 
/*use_equalnan=*/true); + testStaticRuntime(src3, {a}, {b}, /*use_allclose=*/true, /*use_equalnan=*/true); + + // Non-NNC path + testStaticRuntime(src1, {a.to(at::kDouble)}, {}, /*use_allclose=*/true, /*use_equalnan=*/true); + testStaticRuntime(src1, {a.to(at::kDouble)}, {b.to(at::kDouble)}, /*use_allclose=*/true, /*use_equalnan=*/true); +} diff --git a/benchmarks/static_runtime/test_utils.cc b/benchmarks/static_runtime/test_utils.cc index 276d0a023ff0..7e0733fbc8af 100644 --- a/benchmarks/static_runtime/test_utils.cc +++ b/benchmarks/static_runtime/test_utils.cc @@ -146,11 +146,13 @@ void compareTensorLists( } } +} // namespace + void compareResults( const IValue& expect, const IValue& actual, - const bool use_allclose = false, - const bool use_equalnan = false) { + const bool use_allclose, + const bool use_equalnan) { if (expect.isTensor()) { VLOG(2) << "expect " << expect.toTensor() << std::endl; VLOG(2) << "output " << actual.toTensor() << std::endl; @@ -198,8 +200,6 @@ void compareResults( } } -} // namespace - at::Tensor getTensor(const at::IValue& ival) { if (ival.isTensor()) { return ival.toTensor(); @@ -290,100 +290,104 @@ void testStaticRuntime( for (bool enable_out_variant : {true, false}) { for (bool manage_output_tensors : {true, false}) { - if (!enable_out_variant && manage_output_tensors) { - continue; - } - // run static runtime three times - // 1st run: collect allocation profiles (args) - // 2nd run: exercise memory planner and resizing with args2 - // 3rd run: run with args again - StaticModuleOptions opts{ - .enable_out_variant = enable_out_variant, - .optimize_memory = enable_out_variant, - .manage_output_tensors = manage_output_tensors}; - auto smodule = test_context->makeStaticModule(opts); - StaticRuntime runtime(smodule); - auto actual = runtime(args, {}); - if (actual.isTensor()) { - EXPECT_GE(smodule.num_nodes(), 2) - << "If we only have one node, the output of the op we are testing is " - << "not being managed by the memory planner! A failure here " - << "can typically be fixed by clone()ing the output of the test script."; - } - runtime.check_for_memory_leak(); - // first run - VLOG(2) << "enable_out_variant: " << enable_out_variant; - VLOG(2) << "manage_output_tensors: " << manage_output_tensors; - VLOG(2) << "args: " << args; - VLOG(2) << "args2: " << args2; - VLOG(2) << "expect: " << expect; - VLOG(2) << "actual: " << actual; - compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "first run comparison done"; - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); - } - - if (!args2.empty()) { - auto* memory_planner = runtime.get_memory_planner(); - size_t managed_bytes = - memory_planner ? memory_planner->total_managed() : 0; - - // Run static runtime again with inputs of a different shape. - expect = test_context->getExpected(args2); - actual = runtime(args2, {}); - runtime.check_for_memory_leak(); - VLOG(2) << "comparing with args2"; - compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "second run comparison done"; - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); + for (bool enable_tensorexpr_fusion : {true, false}) { + if (!enable_out_variant && manage_output_tensors) { + continue; } - - size_t new_managed_bytes = - memory_planner ? 
memory_planner->total_managed() : 0; - if (check_resize && new_managed_bytes > 0) { - EXPECT_GT(new_managed_bytes, managed_bytes); - } - - // Run static runtime again with an input of the shape observed during - // the profile run. - expect = test_context->getExpected(args); - actual = runtime(args, {}); - runtime.check_for_memory_leak(); - // third run - VLOG(2) << "comparing third run"; - compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "third run comparison done"; - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); + // run static runtime three times + // 1st run: collect allocation profiles (args) + // 2nd run: exercise memory planner and resizing with args2 + // 3rd run: run with args again + StaticModuleOptions opts{ + .enable_out_variant = enable_out_variant, + .optimize_memory = enable_out_variant, + .manage_output_tensors = manage_output_tensors, + .enable_tensorexpr_fusion = enable_tensorexpr_fusion}; + auto smodule = test_context->makeStaticModule(opts); + StaticRuntime runtime(smodule); + auto actual = runtime(args, {}); + if (actual.isTensor()) { + EXPECT_GE(smodule.num_nodes(), 2) + << "If we only have one node, the output of the op we are testing is " + << "not being managed by the memory planner! A failure here " + << "can typically be fixed by clone()ing the output of the test script."; } - } else { - // run static runtime again to exercise the memory planner - // and allocate managed tensors. - actual = runtime(args, {}); runtime.check_for_memory_leak(); - VLOG(2) << "comparing second run with same args"; + // first run + VLOG(2) << "enable_out_variant: " << enable_out_variant; + VLOG(2) << "manage_output_tensors: " << manage_output_tensors; + VLOG(2) << "enable_tensorexpr_fusion: " << enable_tensorexpr_fusion; + VLOG(2) << "args: " << args; + VLOG(2) << "args2: " << args2; + VLOG(2) << "expect: " << expect; + VLOG(2) << "actual: " << actual; compareResults(expect, actual, use_allclose, use_equalnan); - VLOG(2) << "second run comparison done"; + VLOG(2) << "first run comparison done"; if (manage_output_tensors) { actual = IValue(); runtime.deallocateOutputTensors(); runtime.checkOutputTensorMemoryLeaks(); } - // third run to use the allocated managed tensors. - actual = runtime(args, {}); - runtime.check_for_memory_leak(); - if (manage_output_tensors) { - actual = IValue(); - runtime.deallocateOutputTensors(); - runtime.checkOutputTensorMemoryLeaks(); + + if (!args2.empty()) { + auto* memory_planner = runtime.get_memory_planner(); + size_t managed_bytes = + memory_planner ? memory_planner->total_managed() : 0; + + // Run static runtime again with inputs of a different shape. + expect = test_context->getExpected(args2); + actual = runtime(args2, {}); + runtime.check_for_memory_leak(); + VLOG(2) << "comparing with args2"; + compareResults(expect, actual, use_allclose, use_equalnan); + VLOG(2) << "second run comparison done"; + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } + + size_t new_managed_bytes = + memory_planner ? memory_planner->total_managed() : 0; + if (check_resize && new_managed_bytes > 0) { + EXPECT_GT(new_managed_bytes, managed_bytes); + } + + // Run static runtime again with an input of the shape observed during + // the profile run. 
+ expect = test_context->getExpected(args); + actual = runtime(args, {}); + runtime.check_for_memory_leak(); + // third run + VLOG(2) << "comparing third run"; + compareResults(expect, actual, use_allclose, use_equalnan); + VLOG(2) << "third run comparison done"; + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } + } else { + // run static runtime again to exercise the memory planner + // and allocate managed tensors. + actual = runtime(args, {}); + runtime.check_for_memory_leak(); + VLOG(2) << "comparing second run with same args"; + compareResults(expect, actual, use_allclose, use_equalnan); + VLOG(2) << "second run comparison done"; + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } + // third run to use the allocated managed tensors. + actual = runtime(args, {}); + runtime.check_for_memory_leak(); + if (manage_output_tensors) { + actual = IValue(); + runtime.deallocateOutputTensors(); + runtime.checkOutputTensorMemoryLeaks(); + } } } } diff --git a/benchmarks/static_runtime/test_utils.h b/benchmarks/static_runtime/test_utils.h index cb0a5a4a8c2e..27efd4d7d42e 100644 --- a/benchmarks/static_runtime/test_utils.h +++ b/benchmarks/static_runtime/test_utils.h @@ -53,6 +53,12 @@ void compareResultsWithJIT( const bool use_allclose = false, const bool use_equalnan = false); +void compareResults( + const IValue& expect, + const IValue& actual, + const bool use_allclose = false, + const bool use_equalnan = false); + } // namespace test } // namespace jit } // namespace torch diff --git a/benchmarks/tensorexpr/__main__.py b/benchmarks/tensorexpr/__main__.py index f243ff5b6105..f984dbccd02d 100644 --- a/benchmarks/tensorexpr/__main__.py +++ b/benchmarks/tensorexpr/__main__.py @@ -56,7 +56,7 @@ def main(): "--input-iter", type=str, default=None, - help="a comma separated list of of Tensor dimensions that includes a start, \ + help="a comma separated list of Tensor dimensions that includes a start, \ stop, and increment that can be constant or a power of 2 \ {start:stop:inc,start:stop:pow2}", ) @@ -137,7 +137,7 @@ def main(): torch._C._jit_set_profiling_executor(True) torch._C._jit_set_texpr_fuser_enabled(True) torch._C._jit_override_can_fuse_on_gpu(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) elif args.cuda_fuser == "old": import torch torch._C._jit_set_profiling_executor(False) @@ -148,7 +148,7 @@ def main(): torch._C._jit_set_profiling_executor(True) torch._C._jit_set_texpr_fuser_enabled(False) torch._C._jit_set_nvfuser_enabled(True) - torch._C._jit_set_profiling_mode(True) + torch._C._get_graph_executor_optimize(True) else : raise ValueError("Undefined fuser: {}".format(args.cuda_fuser)) diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt index a98754eea2c3..b683ee002280 100644 --- a/binaries/CMakeLists.txt +++ b/binaries/CMakeLists.txt @@ -4,6 +4,7 @@ if(INTERN_BUILD_MOBILE) caffe2_binary_target("speed_benchmark.cc") else() caffe2_binary_target("speed_benchmark_torch.cc") + caffe2_binary_target("load_benchmark_torch.cc") if(NOT BUILD_LITE_INTERPRETER) caffe2_binary_target("compare_models_torch.cc") endif() diff --git a/binaries/aot_model_compiler.cc b/binaries/aot_model_compiler.cc index b9d1d24c08ea..7d2d68a61f17 100644 --- a/binaries/aot_model_compiler.cc +++ b/binaries/aot_model_compiler.cc @@ -30,6 +30,16 @@ C10_DEFINE_string( "If multiple inputs needed, use semicolon 
to separate " "the dtype of different tensors." "Supported dtypes: float, int64, uint8"); +C10_DEFINE_string( + input_memory_formats, + "", + "Input memory format." + "If multiple inputs needed, use semicolon to separate." + "Supported values: contiguous, channels_last"); +C10_DEFINE_string( + dynamic_dims, + "", + "Comma separated dimensions of input tensors that can be dynamic"); C10_DEFINE_string(method_name, "forward", "The name of the method."); C10_DEFINE_string( output_llvm, @@ -61,6 +71,8 @@ c10::Dict createCompileSpec() { c10::StringType::get(), c10::AnyType::get()); method_spec.insert("sizes", FLAGS_input_dims); method_spec.insert("types", FLAGS_input_types); + method_spec.insert("memory_formats", FLAGS_input_memory_formats); + method_spec.insert("dynamic_sizes", FLAGS_dynamic_dims); method_spec.insert("asmfile", FLAGS_output_llvm); method_spec.insert("model_name", FLAGS_model_name); method_spec.insert("model_version", FLAGS_model_version); @@ -79,6 +91,7 @@ int main(int argc, char** argv) { " --model_version=" " --input_dims=" " --input_types=" + " --input_memory_formats=" " [--method_name=]" " [--output_llvm=]" " [--output_model=]"); @@ -93,10 +106,18 @@ int main(int argc, char** argv) { CAFFE_ENFORCE(!FLAGS_model_name.empty(), c10::UsageMessage()); CAFFE_ENFORCE(!FLAGS_model_version.empty(), c10::UsageMessage()); CAFFE_ENFORCE(!FLAGS_input_dims.empty(), c10::UsageMessage()); + const auto dims_size = split(';', FLAGS_input_dims).size(); CAFFE_ENFORCE( - split(';', FLAGS_input_dims).size() == - split(';', FLAGS_input_types).size(), + dims_size == split(';', FLAGS_input_types).size(), "Number of input_dims and input_types should be the same"); + const auto mem_formats_size = split(';', FLAGS_input_memory_formats).size(); + CAFFE_ENFORCE( + mem_formats_size == 0 || mem_formats_size == dims_size, + "Number of input_memory_formats should be 0 (default contiguous) or the same as number of input_dims"); + if (FLAGS_output_llvm.empty()) { + FLAGS_output_llvm = + FLAGS_model.substr(0, FLAGS_model.find('.')) + ".compiled.ll"; + } std::string output_model_name = FLAGS_output_model; if (output_model_name.empty()) { diff --git a/binaries/bench_gen/bench_gen.py b/binaries/bench_gen/bench_gen.py index 2b344c1f5947..8684e07ee4fd 100755 --- a/binaries/bench_gen/bench_gen.py +++ b/binaries/bench_gen/bench_gen.py @@ -59,7 +59,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Utilitity to generate Caffe2 benchmark models.") + description="Utility to generate Caffe2 benchmark models.") parser.add_argument("operator", help="Caffe2 operator to benchmark.") parser.add_argument("-b", "--blob", help="Instantiate a blob --blob name=dim1,dim2,dim3", diff --git a/binaries/load_benchmark_torch.cc b/binaries/load_benchmark_torch.cc new file mode 100644 index 000000000000..330955657ece --- /dev/null +++ b/binaries/load_benchmark_torch.cc @@ -0,0 +1,93 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include "caffe2/core/timer.h" +#include "caffe2/utils/string_utils.h" +#include +#include +#include +#include +#include + +#include + +#include +using namespace std::chrono; + +C10_DEFINE_string(model, "", "The given torch script model to benchmark."); +C10_DEFINE_int(iter, 10, "The number of iterations to run."); +C10_DEFINE_bool( + report_pep, + true, + "Whether to print performance stats for AI-PEP."); + +int main(int argc, char** argv) { + c10::SetUsageMessage( + "Run model load time benchmark for pytorch model.\n" + "Example usage:\n" + "./load_benchmark_torch" + " --model=" + " --iter=20"); + if (!c10::ParseCommandLineFlags(&argc, &argv)) { + std::cerr << "Failed to parse command line flags!" << std::endl; + return 1; + } + + std::cout << "Starting benchmark." << std::endl; + CAFFE_ENFORCE( + FLAGS_iter >= 0, + "Number of main runs should be non negative, provided ", + FLAGS_iter, + "."); + + caffe2::Timer timer; + std::vector times; + + for (int i = 0; i < FLAGS_iter; ++i) { + auto start = high_resolution_clock::now(); + +#if BUILD_LITE_INTERPRETER + auto module = torch::jit::_load_for_mobile(FLAGS_model); +#else + auto module = torch::jit::load(FLAGS_model); +#endif + + auto stop = high_resolution_clock::now(); + auto duration = duration_cast(stop - start); + times.push_back(duration.count()); + } + + const double micros = static_cast(timer.MicroSeconds()); + if (FLAGS_report_pep) { + for (auto t : times) { + std::cout << R"(PyTorchObserver {"type": "NET", "unit": "us", )" + << R"("metric": "latency", "value": ")" + << t << R"("})" << std::endl; + } + } + + const double iters = static_cast(FLAGS_iter); + std::cout << "Main run finished. Microseconds per iter: " + << micros / iters + << ". Iters per second: " << 1000.0 * 1000 * iters / micros + << std::endl; + + return 0; +} diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index c80f46d75652..8d53007bc8ef 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -49,12 +49,10 @@ float runPureRecordFunctionBench(int iter) { typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); for (auto idx = 0; idx < iter; ++idx) { - bool pre_sampled = false; - if (at::shouldRunRecordFunction(&pre_sampled)) { - at::RecordFunction guard(at::RecordScope::USER_SCOPE, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - guard.before("Test", -1); - } + auto step_callbacks = at::getStepCallbacks(at::RecordScope::USER_SCOPE); + if (!step_callbacks.empty()) { + at::RecordFunction guard(std::move(step_callbacks)); + guard.before("Test", -1); } } auto duration = static_cast( diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index e4eee10636e3..ea523898b51e 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -180,35 +180,48 @@ class vkRunner final : public Runner { virtual c10::IValue run( T& module, const std::vector& inputs) override { - // Upload the input tensor(s) to GPU memory. 
- inputs_.clear(); - inputs_.reserve(inputs.size()); - for (const auto& input : inputs) { - if (input.isTensor()) { - inputs_.emplace_back(input.toTensor().vulkan()); - } - else if (input.isList()) { - const c10::List input_as_list = input.toList(); - c10::List input_vk_list; - input_vk_list.reserve(input_as_list.size()); - for (int i=0; i < input_as_list.size(); ++i) { - const c10::IValue element = input_as_list.get(i); - if (element.isTensor()) { - input_vk_list.emplace_back(element.toTensor().vulkan()); - } - else { - CAFFE_THROW("Input of type c10::List must only contain Tensors!"); + + if (inputs_.size() == 0) { + // Upload the input tensor(s) to GPU memory. + inputs_.clear(); + inputs_.reserve(inputs.size()); + for (const auto& input : inputs) { + if (input.isTensor()) { + inputs_.emplace_back(at::rand(input.toTensor().sizes()).vulkan()); + } + else if (input.isTensorList()) { + const c10::List input_as_list = input.toTensorList(); + c10::List input_vk_list; + input_vk_list.reserve(input_as_list.size()); + for (int i=0; i < input_as_list.size(); ++i) { + const at::Tensor element = input_as_list.get(i); + input_vk_list.emplace_back(at::rand(element.sizes()).vulkan()); } + inputs_.emplace_back(c10::IValue(input_vk_list)); + } + else { + CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::TensorList!"); } - inputs_.emplace_back(c10::IValue(input_vk_list)); - } - else { - CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::List!"); } } // Run, and download the output tensor to system memory. - return module.forward(inputs_).toTensor().cpu(); + c10::IValue output = module.forward(inputs_); + if (output.isTensor()) { + return output.toTensor().cpu(); + } + else if (output.isTensorList()) { + return output.toTensorList().get(0).cpu(); + } + else if (output.isList()) { + return output.toList().get(0).toTensor().cpu(); + } + else if (output.isTuple()) { + return output.toTuple()->elements()[0].toTensor().cpu(); + } + else { + CAFFE_THROW("Outputs must only be either c10::Tensor or c10::TensorList!"); + }; } private: diff --git a/build.bzl b/build.bzl new file mode 100644 index 000000000000..a1566377e844 --- /dev/null +++ b/build.bzl @@ -0,0 +1,127 @@ +def define_targets(rules): + rules.cc_library( + name = "caffe2_serialize", + srcs = [ + "caffe2/serialize/file_adapter.cc", + "caffe2/serialize/inline_container.cc", + "caffe2/serialize/istream_adapter.cc", + "caffe2/serialize/read_adapter_interface.cc", + ], + tags = [ + "supermodule:android/default/pytorch", + "supermodule:ios/default/public.pytorch", + "-fbcode", + "xplat", + ], + visibility = ["//visibility:public"], + deps = [ + ":caffe2_headers", + "@com_github_glog//:glog", + "//c10", + "//third_party/miniz-2.0.8:miniz", + ], + ) + + rules.genrule( + name = "generate-code", + srcs = [ + ":DispatchKeyNativeFunctions.cpp", + ":DispatchKeyNativeFunctions.h", + ":LazyIr.h", + ":RegisterDispatchKey.cpp", + ":native_functions.yaml", + ":shape_inference.h", + ":tags.yaml", + ":ts_native_functions.cpp", + ":ts_native_functions.yaml", + ], + tools = ["//tools/setup_helpers:generate_code"], + outs = GENERATED_AUTOGRAD_CPP + GENERATED_AUTOGRAD_PYTHON + GENERATED_TESTING_PY, + cmd = "$(location //tools/setup_helpers:generate_code) " + + "--gen-dir=$(RULEDIR) " + + "--native-functions-path $(location :native_functions.yaml) " + + "--tags-path=$(location :tags.yaml) " + + "--gen_lazy_ts_backend", + ) + + rules.genrule( + name = "version_h", + srcs = [ + ":torch/csrc/api/include/torch/version.h.in", + 
":version.txt", + ], + outs = ["torch/csrc/api/include/torch/version.h"], + cmd = "$(location //tools/setup_helpers:gen_version_header) " + + "--template-path $(location :torch/csrc/api/include/torch/version.h.in) " + + "--version-path $(location :version.txt) --output-path $@ ", + tools = ["//tools/setup_helpers:gen_version_header"], + ) + +# These lists are temporarily living in and exported from the shared +# structure so that an internal build that lives under a different +# root can access them. These could technically live in a separate +# file in the same directory but that would require extra work to +# ensure that file is synced to both Meta internal repositories and +# GitHub. This problem will go away when the targets downstream of +# generate-code that use these lists are moved into the shared +# structure as well. + +_GENERATED_AUTOGRAD_PYTHON_HEADERS = [ + "torch/csrc/autograd/generated/python_functions.h", +] + +_GENERATED_AUTOGRAD_CPP_HEADERS = [ + "torch/csrc/autograd/generated/Functions.h", + "torch/csrc/autograd/generated/VariableType.h", + "torch/csrc/autograd/generated/variable_factories.h", +] + +GENERATED_AUTOGRAD_H = _GENERATED_AUTOGRAD_CPP_HEADERS + _GENERATED_AUTOGRAD_PYTHON_HEADERS + +GENERATED_TESTING_PY = [ + "torch/testing/_internal/generated/annotated_fn_args.py", +] + +GENERATED_LAZY_H = [ + "torch/csrc/lazy/generated/LazyIr.h", + "torch/csrc/lazy/generated/LazyNativeFunctions.h", +] + +_GENERATED_AUTOGRAD_PYTHON_CPP = [ + "torch/csrc/autograd/generated/python_functions_0.cpp", + "torch/csrc/autograd/generated/python_functions_1.cpp", + "torch/csrc/autograd/generated/python_functions_2.cpp", + "torch/csrc/autograd/generated/python_functions_3.cpp", + "torch/csrc/autograd/generated/python_functions_4.cpp", + "torch/csrc/autograd/generated/python_nn_functions.cpp", + "torch/csrc/autograd/generated/python_fft_functions.cpp", + "torch/csrc/autograd/generated/python_linalg_functions.cpp", + "torch/csrc/autograd/generated/python_return_types.cpp", + "torch/csrc/autograd/generated/python_sparse_functions.cpp", + "torch/csrc/autograd/generated/python_special_functions.cpp", + "torch/csrc/autograd/generated/python_torch_functions_0.cpp", + "torch/csrc/autograd/generated/python_torch_functions_1.cpp", + "torch/csrc/autograd/generated/python_torch_functions_2.cpp", + "torch/csrc/autograd/generated/python_variable_methods.cpp", +] + +GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP + +GENERATED_AUTOGRAD_CPP = [ + "torch/csrc/autograd/generated/Functions.cpp", + "torch/csrc/autograd/generated/VariableType_0.cpp", + "torch/csrc/autograd/generated/VariableType_1.cpp", + "torch/csrc/autograd/generated/VariableType_2.cpp", + "torch/csrc/autograd/generated/VariableType_3.cpp", + "torch/csrc/autograd/generated/VariableType_4.cpp", + "torch/csrc/autograd/generated/TraceType_0.cpp", + "torch/csrc/autograd/generated/TraceType_1.cpp", + "torch/csrc/autograd/generated/TraceType_2.cpp", + "torch/csrc/autograd/generated/TraceType_3.cpp", + "torch/csrc/autograd/generated/TraceType_4.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp", + "torch/csrc/lazy/generated/LazyNativeFunctions.cpp", + "torch/csrc/lazy/generated/RegisterAutogradLazy.cpp", + "torch/csrc/lazy/generated/RegisterLazy.cpp", +] + _GENERATED_AUTOGRAD_CPP_HEADERS + GENERATED_LAZY_H diff --git a/c10/BUILD.bazel b/c10/BUILD.bazel index f4a43cf93013..5e6ed8297e5e 100644 --- a/c10/BUILD.bazel +++ 
b/c10/BUILD.bazel @@ -1,6 +1,9 @@ load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") load("@rules_cc//cc:defs.bzl", "cc_library") -load("//tools/config:defs.bzl", "if_cuda") +load("//:tools/bazel.bzl", "rules") +load(":build.bzl", "define_targets") + +define_targets(rules = rules) # The bool_flag targets allow configuring the build from the # command-line, e.g. --//c10:use_gflags or --no//c10:use_gflags to @@ -47,27 +50,3 @@ cc_library( }), visibility = ["//:__pkg__"], ) - -cc_library( - name = "c10", - deps = [ - "//c10/core:CPUAllocator", - "//c10/core:ScalarType", - "//c10/core:alignment", - "//c10/core:alloc_cpu", - "//c10/core:base", - "//c10/macros", - "//c10/mobile:CPUCachingAllocator", - "//c10/mobile:CPUProfilingAllocator", - "//c10/util:TypeCast", - "//c10/util:base", - "//c10/util:typeid", - ] + if_cuda( - [ - "//c10/cuda", - "//c10/cuda:Macros", - ], - [], - ), - visibility = ["//:__pkg__"], -) diff --git a/c10/BUILD.buck b/c10/BUILD.buck new file mode 100644 index 000000000000..b70b780302a8 --- /dev/null +++ b/c10/BUILD.buck @@ -0,0 +1,50 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") + +cxx_library( + name = "c10", + srcs = glob( + ["**/*.cpp"], + exclude = [ + "test/**/*.cpp", + "benchmark/**/*.cpp", + "cuda/**/*.cpp", + ], + ), + deps = [ + "//third_party:fmt", + "//third_party:glog", + ], + exported_deps = [], + compiler_flags = [ + "-Werror", + "-Wno-global-constructors", + "-DDISABLE_NAMEDTENSOR", + "-DSUPPORTS_BACKTRACE=0" + ], + exported_headers = subdir_glob( + [ + ("", "**/*.h"), + ], + exclude = [ + "test/**/*.h", + "benchmark/**/*.h", + "cuda/**/*.h", + ], + ), + exported_linker_flags = [], + exported_preprocessor_flags = [ + '-DC10_USING_CUSTOM_GENERATED_MACROS', + '-DC10_USE_GLOG', + '-DC10_USE_MINIMAL_GLOG', + '-DC10_DISABLE_NUMA', + '-DC10_MOBILE', + '-fexceptions', + '-Wno-global-constructors' + ], + header_namespace = "c10", + link_whole = True, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + preprocessor_flags = ['-DC10_BUILD_MAIN_LIB'], + reexport_all_header_dependencies = True, + visibility = ['PUBLIC'], +) diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 23a0e024d35e..41b1a1a0bc9b 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -50,6 +50,9 @@ target_compile_options(c10 PRIVATE "-DC10_BUILD_MAIN_LIB") if(${COMPILER_SUPPORTS_HIDDEN_VISIBILITY}) target_compile_options(c10 PRIVATE "-fvisibility=hidden") endif() +if(HAS_WERROR_SIGN_COMPARE AND WERROR) + target_compile_options(c10 PRIVATE "-Werror=sign-compare") +endif() # ---[ Dependency of c10 if(${USE_GFLAGS}) diff --git a/c10/benchmark/BUILD.bazel b/c10/benchmark/BUILD.bazel new file mode 100644 index 000000000000..d1a0db360d23 --- /dev/null +++ b/c10/benchmark/BUILD.bazel @@ -0,0 +1,4 @@ +load("//:tools/bazel.bzl", "rules") +load(":build.bzl", "define_targets") + +define_targets(rules = rules) diff --git a/c10/benchmark/build.bzl b/c10/benchmark/build.bzl new file mode 100644 index 000000000000..f9838e00cb4e --- /dev/null +++ b/c10/benchmark/build.bzl @@ -0,0 +1,10 @@ +def define_targets(rules): + rules.cc_binary( + name = "intrusive_ptr", + srcs = ["intrusive_ptr_benchmark.cpp"], + tags = ["benchmark"], + deps = [ + "//c10/util:base", + "@google_benchmark//:benchmark", + ], + ) diff --git a/c10/build.bzl b/c10/build.bzl new file mode 100644 index 000000000000..21107eb8b992 --- /dev/null +++ b/c10/build.bzl @@ -0,0 +1,24 @@ 
+def define_targets(rules): + rules.cc_library( + name = "c10", + deps = [ + "//c10/core:CPUAllocator", + "//c10/core:ScalarType", + "//c10/core:alignment", + "//c10/core:alloc_cpu", + "//c10/core:base", + "//c10/macros", + "//c10/mobile:CPUCachingAllocator", + "//c10/mobile:CPUProfilingAllocator", + "//c10/util:TypeCast", + "//c10/util:base", + "//c10/util:typeid", + ] + rules.if_cuda( + [ + "//c10/cuda", + "//c10/cuda:Macros", + ], + [], + ), + visibility = ["//visibility:public"], + ) diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h index c05139a93f00..4f571fd91511 100644 --- a/c10/core/Allocator.h +++ b/c10/core/Allocator.h @@ -217,9 +217,9 @@ struct AllocatorRegisterer { } }; -#define REGISTER_ALLOCATOR(t, f) \ - namespace { \ - static AllocatorRegisterer g_allocator_d(f); \ +#define REGISTER_ALLOCATOR(t, f) \ + namespace { \ + static c10::AllocatorRegisterer g_allocator_d(f); \ } // An interface for reporting thread local memory usage diff --git a/c10/core/Backend.h b/c10/core/Backend.h index e17a1bc4226c..a8ad60f8c913 100644 --- a/c10/core/Backend.h +++ b/c10/core/Backend.h @@ -32,6 +32,7 @@ enum class Backend { HIP, VE, FPGA, + IPU, XPU, SparseCPU, SparseCUDA, @@ -49,9 +50,10 @@ enum class Backend { QuantizedXPU, Undefined, MkldnnCPU, - MLC, + MPS, HPU, Lazy, + PrivateUse1, NumOptions }; @@ -72,8 +74,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) { return Backend::XLA; } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) { return Backend::Lazy; - } else if (t == DispatchKey::MLC || t == DispatchKey::AutogradMLC) { - return Backend::MLC; + } else if (t == DispatchKey::MPS || t == DispatchKey::AutogradMPS) { + return Backend::MPS; } else if (t == DispatchKey::Vulkan) { return Backend::Vulkan; } else if (t == DispatchKey::Metal) { @@ -96,6 +98,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) { return Backend::QuantizedCPU; } else if (t == DispatchKey::QuantizedCUDA) { return Backend::QuantizedCUDA; + } else if (t == DispatchKey::IPU || t == DispatchKey::AutogradIPU) { + return Backend::IPU; } else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) { return Backend::XPU; } else if (t == DispatchKey::SparseXPU) { @@ -104,6 +108,8 @@ static inline Backend dispatchKeyToBackend(DispatchKey t) { return Backend::QuantizedXPU; } else if (t == DispatchKey::HPU || t == DispatchKey::AutogradHPU) { return Backend::HPU; + } else if (t == DispatchKey::PrivateUse1) { + return Backend::PrivateUse1; } else if (t == DispatchKey::Undefined) { return Backend::Undefined; } else { @@ -129,6 +135,8 @@ static inline DispatchKey backendToDispatchKey(Backend b) { return DispatchKey::XLA; case Backend::Lazy: return DispatchKey::Lazy; + case Backend::IPU: + return DispatchKey::IPU; case Backend::XPU: return DispatchKey::XPU; case Backend::SparseXPU: @@ -157,10 +165,12 @@ static inline DispatchKey backendToDispatchKey(Backend b) { return DispatchKey::QuantizedCUDA; case Backend::Undefined: return DispatchKey::Undefined; - case Backend::MLC: - return DispatchKey::MLC; + case Backend::MPS: + return DispatchKey::MPS; case Backend::HPU: return DispatchKey::HPU; + case Backend::PrivateUse1: + return DispatchKey::PrivateUse1; default: throw std::runtime_error("Unknown backend"); } @@ -196,6 +206,8 @@ static inline DeviceType backendToDeviceType(Backend b) { return DeviceType::CPU; case Backend::SparseCsrCUDA: return DeviceType::CUDA; + case Backend::IPU: + return DeviceType::IPU; case Backend::XPU: case Backend::SparseXPU: case Backend::QuantizedXPU: @@ 
-209,10 +221,12 @@ static inline DeviceType backendToDeviceType(Backend b) { return DeviceType::Vulkan; case Backend::Metal: return DeviceType::Metal; - case Backend::MLC: - return DeviceType::MLC; + case Backend::MPS: + return DeviceType::MPS; case Backend::HPU: return DeviceType::HPU; + case Backend::PrivateUse1: + return DeviceType::PrivateUse1; case Backend::Undefined: TORCH_CHECK(false, "Undefined backend is not a valid device type"); default: @@ -235,14 +249,16 @@ static inline const char* toString(Backend b) { return "FPGA"; case Backend::XPU: return "XPU"; + case Backend::IPU: + return "IPU"; case Backend::ORT: return "ORT"; case Backend::XLA: return "XLA"; case Backend::Lazy: return "Lazy"; - case Backend::MLC: - return "MLC"; + case Backend::MPS: + return "MPS"; case Backend::SparseCPU: return "SparseCPU"; case Backend::SparseCUDA: @@ -271,6 +287,8 @@ static inline const char* toString(Backend b) { return "QuantizedXPU"; case Backend::HPU: return "HPU"; + case Backend::PrivateUse1: + return "PrivateUseOne"; default: return "UNKNOWN_BACKEND"; } diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp index 2531e3942271..5cd474774c9e 100644 --- a/c10/core/Device.cpp +++ b/c10/core/Device.cpp @@ -20,6 +20,7 @@ DeviceType parse_type(const std::string& device_string) { types = {{ {"cpu", DeviceType::CPU}, {"cuda", DeviceType::CUDA}, + {"ipu", DeviceType::IPU}, {"xpu", DeviceType::XPU}, {"mkldnn", DeviceType::MKLDNN}, {"opengl", DeviceType::OPENGL}, @@ -32,9 +33,10 @@ DeviceType parse_type(const std::string& device_string) { {"xla", DeviceType::XLA}, {"lazy", DeviceType::Lazy}, {"vulkan", DeviceType::Vulkan}, - {"mlc", DeviceType::MLC}, + {"mps", DeviceType::MPS}, {"meta", DeviceType::Meta}, {"hpu", DeviceType::HPU}, + {"privateuseone", DeviceType::PrivateUse1}, }}; auto device = std::find_if( types.begin(), @@ -47,7 +49,7 @@ DeviceType parse_type(const std::string& device_string) { } TORCH_CHECK( false, - "Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, ort, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: ", + "Expected one of cpu, cuda, ipu, xpu, mkldnn, opengl, opencl, ideep, hip, ve, ort, mps, xla, lazy, vulkan, meta, hpu, privateuseone device type at start of device string: ", device_string); } enum DeviceStringParsingState { START, INDEX_START, INDEX_REST, ERROR }; diff --git a/c10/core/Device.h b/c10/core/Device.h index b935eed6a656..774cf404da29 100644 --- a/c10/core/Device.h +++ b/c10/core/Device.h @@ -81,6 +81,11 @@ struct C10_API Device final { return type_ == DeviceType::CUDA; } + /// Return true if the device is of MPS type. + bool is_mps() const noexcept { + return type_ == DeviceType::MPS; + } + /// Return true if the device is of HIP type. bool is_hip() const noexcept { return type_ == DeviceType::HIP; @@ -96,11 +101,21 @@ struct C10_API Device final { return type_ == DeviceType::XPU; } + /// Return true if the device is of IPU type. + bool is_ipu() const noexcept { + return type_ == DeviceType::IPU; + } + /// Return true if the device is of HPU type. bool is_hpu() const noexcept { return type_ == DeviceType::HPU; } + /// Return true if the device is of META type. + bool is_meta() const noexcept { + return type_ == DeviceType::Meta; + } + /// Return true if the device is of CPU type. 
bool is_cpu() const noexcept { return type_ == DeviceType::CPU; diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp index 4635acdb148c..ac4c1f653efb 100644 --- a/c10/core/DeviceType.cpp +++ b/c10/core/DeviceType.cpp @@ -31,8 +31,8 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) { return lower_case ? "xla" : "XLA"; case DeviceType::Lazy: return lower_case ? "lazy" : "LAZY"; - case DeviceType::MLC: - return lower_case ? "mlc" : "MLC"; + case DeviceType::MPS: + return lower_case ? "mps" : "MPS"; case DeviceType::Vulkan: return lower_case ? "vulkan" : "VULKAN"; case DeviceType::Metal: @@ -43,6 +43,10 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) { return lower_case ? "meta" : "META"; case DeviceType::HPU: return lower_case ? "hpu" : "HPU"; + case DeviceType::IPU: + return lower_case ? "ipu" : "IPU"; + case DeviceType::PrivateUse1: + return lower_case ? "privateuseone" : "PRIVATEUSEONE"; default: TORCH_CHECK( false, @@ -78,12 +82,14 @@ bool isValidDeviceType(DeviceType d) { case DeviceType::ORT: case DeviceType::XLA: case DeviceType::Lazy: - case DeviceType::MLC: + case DeviceType::MPS: case DeviceType::Vulkan: case DeviceType::Metal: case DeviceType::XPU: case DeviceType::Meta: case DeviceType::HPU: + case DeviceType::IPU: + case DeviceType::PrivateUse1: return true; default: return false; diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h index c6bd56914d6d..ca995bc9d9ab 100644 --- a/c10/core/DeviceType.h +++ b/c10/core/DeviceType.h @@ -26,16 +26,18 @@ enum class DeviceType : int8_t { Vulkan = 10, // Vulkan Metal = 11, // Metal XPU = 12, // XPU - MLC = 13, // ML Compute / Apple + MPS = 13, // MPS Meta = 14, // Meta (tensors with no data) HPU = 15, // HPU / HABANA VE = 16, // SX-Aurora / NEC Lazy = 17, // Lazy Tensors + IPU = 18, // Graphcore IPU + PrivateUse1 = 19, // PrivateUse1 device // NB: If you add more devices: // - Change the implementations of DeviceTypeName and isValidDeviceType // in DeviceType.cpp // - Change the number below - COMPILE_TIME_MAX_DEVICE_TYPES = 18, + COMPILE_TIME_MAX_DEVICE_TYPES = 20, }; constexpr DeviceType kCPU = DeviceType::CPU; @@ -44,7 +46,7 @@ constexpr DeviceType kHIP = DeviceType::HIP; constexpr DeviceType kFPGA = DeviceType::FPGA; constexpr DeviceType kORT = DeviceType::ORT; constexpr DeviceType kXLA = DeviceType::XLA; -constexpr DeviceType kMLC = DeviceType::MLC; +constexpr DeviceType kMPS = DeviceType::MPS; constexpr DeviceType kMeta = DeviceType::Meta; constexpr DeviceType kVulkan = DeviceType::Vulkan; constexpr DeviceType kMetal = DeviceType::Metal; @@ -52,18 +54,20 @@ constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kHPU = DeviceType::HPU; constexpr DeviceType kVE = DeviceType::VE; constexpr DeviceType kLazy = DeviceType::Lazy; +constexpr DeviceType kIPU = DeviceType::IPU; +constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1; // define explicit int constant constexpr int COMPILE_TIME_MAX_DEVICE_TYPES = static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); static_assert( - COMPILE_TIME_MAX_DEVICE_TYPES <= 18, + COMPILE_TIME_MAX_DEVICE_TYPES <= 20, "Hey! You seem to be adding a lot of new DeviceTypes. The intent was " "for this constant to reflect the actual number of DeviceTypes we support " "in PyTorch; it's important that this number is not too large as we " "use this to allocate stack arrays in some places in our code. 
If you " - "are indeed just adding the 18th device type, feel free to change " + "are indeed just adding the 20th device type, feel free to change " "the check to 32; but if you are adding some sort of extensible device " "types registration, please be aware that you are affecting code that " "this number is small. Try auditing uses of this constant."); diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index 7d2f9e7fcb6c..f06603dc4bc1 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -1,14 +1,49 @@ #include +#include #include namespace c10 { +const char* toString(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return "CPUBit"; + case BackendComponent::CUDABit: + return "CUDABit"; + case BackendComponent::HIPBit: + return "HIPBit"; + case BackendComponent::XLABit: + return "XLABit"; + case BackendComponent::LazyBit: + return "LazyBit"; + case BackendComponent::XPUBit: + return "XPUBit"; + case BackendComponent::IPUBit: + return "IPUBit"; + case BackendComponent::MPSBit: + return "MPSBit"; + case BackendComponent::HPUBit: + return "HPUBit"; + case BackendComponent::VEBit: + return "VEBit"; + case BackendComponent::PrivateUse1Bit: + return "PrivateUse1Bit"; + case BackendComponent::PrivateUse2Bit: + return "PrivateUse2Bit"; + case BackendComponent::PrivateUse3Bit: + return "PrivateUse3Bit"; + case BackendComponent::InvalidBit: + return "InvalidBit"; + default: + return "UNKNOWN_BACKEND_BIT"; + } +} + const char* toString(DispatchKey t) { switch (t) { case DispatchKey::Undefined: return "Undefined"; - case DispatchKey::CPU: return "CPU"; case DispatchKey::CUDA: @@ -21,14 +56,16 @@ const char* toString(DispatchKey t) { return "FPGA"; case DispatchKey::XPU: return "XPU"; + case DispatchKey::IPU: + return "IPU"; case DispatchKey::ORT: return "ORT"; case DispatchKey::XLA: return "XLA"; case DispatchKey::Lazy: return "Lazy"; - case DispatchKey::MLC: - return "MLC"; + case DispatchKey::MPS: + return "MPS"; case DispatchKey::HPU: return "HPU"; case DispatchKey::Vulkan: @@ -64,9 +101,15 @@ const char* toString(DispatchKey t) { case DispatchKey::NestedTensor: return "NestedTensor"; + case DispatchKey::NestedTensorCPU: + return "NestedTensorCPU"; + case DispatchKey::NestedTensorCUDA: + return "NestedTensorCUDA"; case DispatchKey::Python: return "Python"; + case DispatchKey::PythonTLSSnapshot: + return "PythonTLSSnapshot"; case DispatchKey::PrivateUse1: return "PrivateUse1"; @@ -89,6 +132,8 @@ const char* toString(DispatchKey t) { return "Autograd"; case DispatchKey::AutogradCPU: return "AutogradCPU"; + case DispatchKey::AutogradIPU: + return "AutogradIPU"; case DispatchKey::AutogradXPU: return "AutogradXPU"; case DispatchKey::AutogradCUDA: @@ -97,12 +142,10 @@ const char* toString(DispatchKey t) { return "AutogradXLA"; case DispatchKey::AutogradLazy: return "AutogradLazy"; - case DispatchKey::AutogradMLC: - return "AutogradMLC"; + case DispatchKey::AutogradMPS: + return "AutogradMPS"; case DispatchKey::AutogradHPU: return "AutogradHPU"; - case DispatchKey::AutogradNestedTensor: - return "AutogradNestedTensor"; case DispatchKey::AutogradPrivateUse1: return "AutogradPrivateUse1"; case DispatchKey::AutogradPrivateUse2: @@ -111,6 +154,8 @@ const char* toString(DispatchKey t) { return "AutogradPrivateUse3"; case DispatchKey::AutogradOther: return "AutogradOther"; + case DispatchKey::AutogradNestedTensor: + return "AutogradNestedTensor"; case DispatchKey::ZeroTensor: return "ZeroTensor"; @@ -133,6 +178,9 @@ const char* toString(DispatchKey t) { case 
DispatchKey::AutocastCPU: return "AutocastCPU"; + case DispatchKey::AutocastXPU: + return "AutocastXPU"; + case DispatchKey::Batched: return "Batched"; @@ -168,6 +216,21 @@ const char* toString(DispatchKey t) { case DispatchKey::FuncTorchBatched: return "FuncTorchBatched"; + // Out-of-core torchdistX dispatch keys + case DispatchKey::Fake: + return "Fake"; + case DispatchKey::DeferredInit: + return "DeferredInit"; + + case DispatchKey::Dense: + return "Dense"; + case DispatchKey::Quantized: + return "Quantized"; + case DispatchKey::Sparse: + return "Sparse"; + case DispatchKey::AutogradFunctionality: + return "AutogradFunctionality"; + default: return "UNKNOWN_TENSOR_TYPE_ID"; } @@ -176,78 +239,42 @@ const char* toString(DispatchKey t) { std::ostream& operator<<(std::ostream& str, DispatchKey rhs) { return str << toString(rhs); } +std::ostream& operator<<(std::ostream& str, BackendComponent rhs) { + return str << toString(rhs); +} -// for a given backend key, return the associated autograd key. -// for non-backend keys, return AutogradOther as a default. -// Note: it's convenient and fast to return a default here rather than (say) -// returning an optional, or throwing. But it makes callers -// responsible for either a) enforcing the invariant that only backend keys -// be passed as arguments, or b) interpreting our return value carefully. -// -DispatchKey getAutogradKeyFromBackend(DispatchKey t) { - switch (t) { - case DispatchKey::CPU: - return DispatchKey::AutogradCPU; - case DispatchKey::XPU: - return DispatchKey::AutogradXPU; - case DispatchKey::CUDA: - return DispatchKey::AutogradCUDA; - case DispatchKey::XLA: - return DispatchKey::AutogradXLA; - case DispatchKey::Lazy: - return DispatchKey::AutogradLazy; - case DispatchKey::MLC: - return DispatchKey::AutogradMLC; - case DispatchKey::HPU: - return DispatchKey::AutogradHPU; - case DispatchKey::NestedTensor: - return DispatchKey::AutogradNestedTensor; - case DispatchKey::PrivateUse1: - return DispatchKey::AutogradPrivateUse1; - case DispatchKey::PrivateUse2: - return DispatchKey::AutogradPrivateUse2; - case DispatchKey::PrivateUse3: - return DispatchKey::AutogradPrivateUse3; - default: - return DispatchKey::AutogradOther; - } +DispatchKey getAutogradKeyFromBackend(BackendComponent k) { + // We want this to return an autograd key. We're relying on the fact that + // getAutogradRelatedKeySetFromBackend returns an autograd key + + // ADInplaceOrView, and autograd has higher precedence. The core mapping from + // backend -> autograd key lives in `getAutogradRelatedKeySetFromBackend` + // instead of here for performance. `getAutogradRelatedKeySetFromBackend` is a + // hotpath function, and we want to make sure that it doesn't have to + // construct any DispatchKeySets at runtime. 
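// Illustrative sketch (not part of the diff), assuming the constexpr
// getAutogradRelatedKeySetFromBackend() declared in DispatchKeySet.h behaves as
// the comment above describes (per-backend autograd key + ADInplaceOrView, with
// the autograd key at higher priority). The net mapping is then per-backend:
//
//   getAutogradKeyFromBackend(BackendComponent::CPUBit)   // -> DispatchKey::AutogradCPU
//   getAutogradKeyFromBackend(BackendComponent::CUDABit)  // -> DispatchKey::AutogradCUDA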
+ return getAutogradRelatedKeySetFromBackend(k).highestPriorityTypeId(); } c10::DispatchKey parseDispatchKey(const std::string& k) { static std::unordered_map key_map = { {"Undefined", c10::DispatchKey::Undefined}, - {"CPU", c10::DispatchKey::CPU}, - {"CUDA", c10::DispatchKey::CUDA}, - {"HIP", c10::DispatchKey::HIP}, + {"Dense", c10::DispatchKey::Dense}, {"FPGA", c10::DispatchKey::FPGA}, {"ORT", c10::DispatchKey::ORT}, - {"XLA", c10::DispatchKey::XLA}, - {"MLC", c10::DispatchKey::MLC}, + {"MPS", c10::DispatchKey::MPS}, {"Vulkan", c10::DispatchKey::Vulkan}, {"Metal", c10::DispatchKey::Metal}, - {"XPU", c10::DispatchKey::XPU}, - {"HPU", c10::DispatchKey::HPU}, {"VE", c10::DispatchKey::VE}, - {"Lazy", c10::DispatchKey::Lazy}, {"Meta", c10::DispatchKey::Meta}, - {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, - {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, - {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, + {"Quantized", c10::DispatchKey::Quantized}, {"CustomRNGKeyId", c10::DispatchKey::CustomRNGKeyId}, {"MkldnnCPU", c10::DispatchKey::MkldnnCPU}, - {"SparseCPU", c10::DispatchKey::SparseCPU}, - {"SparseCUDA", c10::DispatchKey::SparseCUDA}, - {"SparseHIP", c10::DispatchKey::SparseHIP}, - {"SparseXPU", c10::DispatchKey::SparseXPU}, - {"SparseVE", c10::DispatchKey::SparseVE}, + {"Sparse", c10::DispatchKey::Sparse}, {"SparseCsrCPU", c10::DispatchKey::SparseCsrCPU}, {"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA}, - {"NestedTensor", c10::DispatchKey::NestedTensor}, - {"PrivateUse1", c10::DispatchKey::PrivateUse1}, - {"PrivateUse2", c10::DispatchKey::PrivateUse2}, - {"PrivateUse3", c10::DispatchKey::PrivateUse3}, {"BackendSelect", c10::DispatchKey::BackendSelect}, {"Python", c10::DispatchKey::Python}, + {"PythonTLSSnapshot", c10::DispatchKey::PythonTLSSnapshot}, + {"Fake", c10::DispatchKey::Fake}, {"Named", c10::DispatchKey::Named}, {"Conjugate", c10::DispatchKey::Conjugate}, {"Negative", c10::DispatchKey::Negative}, @@ -256,30 +283,62 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { c10::DispatchKey::FuncTorchDynamicLayerBackMode}, {"ADInplaceOrView", c10::DispatchKey::ADInplaceOrView}, {"AutogradOther", c10::DispatchKey::AutogradOther}, - {"AutogradCPU", c10::DispatchKey::AutogradCPU}, - {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, - {"AutogradXLA", c10::DispatchKey::AutogradXLA}, - {"AutogradLazy", c10::DispatchKey::AutogradLazy}, - {"AutogradXPU", c10::DispatchKey::AutogradXPU}, - {"AutogradMLC", c10::DispatchKey::AutogradMLC}, - {"AutogradHPU", c10::DispatchKey::AutogradHPU}, + {"AutogradFunctionality", c10::DispatchKey::AutogradFunctionality}, {"AutogradNestedTensor", c10::DispatchKey::AutogradNestedTensor}, - {"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, - {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, - {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, {"Tracer", c10::DispatchKey::Tracer}, {"AutocastCPU", c10::DispatchKey::AutocastCPU}, + {"AutocastXPU", c10::DispatchKey::AutocastXPU}, {"AutocastCUDA", c10::DispatchKey::AutocastCUDA}, {"FuncTorchBatched", c10::DispatchKey::FuncTorchBatched}, {"FuncTorchVmapMode", c10::DispatchKey::FuncTorchVmapMode}, {"Batched", c10::DispatchKey::Batched}, {"VmapMode", c10::DispatchKey::VmapMode}, + {"DeferredInit", c10::DispatchKey::DeferredInit}, {"FuncTorchGradWrapper", c10::DispatchKey::FuncTorchGradWrapper}, {"FuncTorchDynamicLayerFrontMode", c10::DispatchKey::FuncTorchDynamicLayerFrontMode}, {"TESTING_ONLY_GenericWrapper", c10::DispatchKey::TESTING_ONLY_GenericWrapper}, 
{"TESTING_ONLY_GenericMode", c10::DispatchKey::TESTING_ONLY_GenericMode}, + + {"CPU", c10::DispatchKey::CPU}, + {"CUDA", c10::DispatchKey::CUDA}, + {"HIP", c10::DispatchKey::HIP}, + {"XLA", c10::DispatchKey::XLA}, + {"MPS", c10::DispatchKey::MPS}, + {"XPU", c10::DispatchKey::XPU}, + {"IPU", c10::DispatchKey::IPU}, + {"HPU", c10::DispatchKey::HPU}, + {"Lazy", c10::DispatchKey::Lazy}, + {"NestedTensor", c10::DispatchKey::NestedTensor}, + {"NestedTensorCPU", c10::DispatchKey::NestedTensorCPU}, + {"NestedTensorCUDA", c10::DispatchKey::NestedTensorCUDA}, + {"PrivateUse1", c10::DispatchKey::PrivateUse1}, + {"PrivateUse2", c10::DispatchKey::PrivateUse2}, + {"PrivateUse3", c10::DispatchKey::PrivateUse3}, + + {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, + {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, + {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, + + {"SparseCPU", c10::DispatchKey::SparseCPU}, + {"SparseCUDA", c10::DispatchKey::SparseCUDA}, + {"SparseHIP", c10::DispatchKey::SparseHIP}, + {"SparseXPU", c10::DispatchKey::SparseXPU}, + {"SparseVE", c10::DispatchKey::SparseVE}, + + {"AutogradCPU", c10::DispatchKey::AutogradCPU}, + {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, + {"AutogradXLA", c10::DispatchKey::AutogradXLA}, + {"AutogradLazy", c10::DispatchKey::AutogradLazy}, + {"AutogradIPU", c10::DispatchKey::AutogradIPU}, + {"AutogradXPU", c10::DispatchKey::AutogradXPU}, + {"AutogradMPS", c10::DispatchKey::AutogradMPS}, + {"AutogradHPU", c10::DispatchKey::AutogradHPU}, + {"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, + {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, + {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, + {"Autograd", c10::DispatchKey::Autograd}, {"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd}, diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 1bb8268e2bd0..5380bfb319b7 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -9,20 +9,99 @@ namespace c10 { +// Semantically, each value of BackendComponent identifies a "backend" for our +// dispatch. Some functionalities that we may dispatch to are allowed to +// register different handlers for each backend. The BackendComponent is then +// used to figure out which backend implementation to dispatch to. + +// In implementation terms, the backend component identifies a specific "bit" in +// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom +// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to +// functionalities. When we encounter a functionality bit that is known to be +// customizeable per-backend, then we also look at the lower BackendComponent +// bits and take the highest bit to determine which backend's implementation to +// use. + +enum class BackendComponent : uint8_t { + + // A "backend" is colloquially used to refer to handlers for dispatch + // which actually implement the numerics of an operation in question. + // + // Due to the nature of the enum, these backends are specified in + // an ordered way, but for most backends this order is not semantically + // meaningful (e.g., it's valid to reorder these backends without changing + // semantics). The only situation when backend ordering is meaningful + // is when the backend participates in multiple dispatch with another + // backend; e.g., CPU and CUDA (cuda must have higher priority). + + // These keys don't correspond to individual kernels. 
+ // Instead, they represent the backends that are allowed to override specific + // pieces of functionality: + // - dense kernels (e.g. DispatchKey::CPU) + // - sparse kernels (e.g. DispatchKey::SparseCPU) + // - quantized kernels (e.g. DispatchKey::QuantizedCPU) + // - autograd kernels (e.g. DispatchKey::AutogradCPU) + // We reserve space in the runtime operator table for this full cross product + // of + // [backends in this enum] x [keys below that are explicitly marked as having + // per-backend functionality] + + InvalidBit = 0, + CPUBit, + CUDABit, + HIPBit, + XLABit, + MPSBit, + IPUBit, + XPUBit, + HPUBit, + VEBit, + LazyBit, + PrivateUse1Bit, + PrivateUse2Bit, + PrivateUse3Bit, + // Define an alias to represent end of backend dispatch keys. + // If you add new backend keys after PrivateUse3, please also update it here. + // (But you shouldn't: private use keys should have higher precedence than + // all built-in keys) + EndOfBackendKeys = PrivateUse3Bit, +}; + // Semantically, a dispatch key identifies a possible "level" in our -// dispatch, for which a handler may be registered. Traditional -// backends like CPU and CUDA get dispatch keys; however, so do -// "wrapping" layers like Variable (for autograd handling). +// dispatch, for which a handler may be registered. Each handler corresponds +// to a type of functionality. // // In implementation terms, the dispatch key identifies a specific "bit" in a // DispatchKeySet. Higher bit indexes get handled by dispatching first (because // we "count leading zeros" when we extract the highest priority dispatch // key.) // -// NOTE: Keep the list in sync with `DispatchKey` in tools/codegen/model.py -enum class DispatchKey : uint8_t { +// Note [DispatchKey Classification] +// This enum actually contains several types of keys, which are explained +// in more detail further down: +// (1) non-customizable backends (e.g. FPGA) +// (2) non-customizable functionalities (e.g. Functionalize) +// (3) functionalized that are customizable per backend (e.g. Dense, Sparse, +// AutogradFunctionality) (4) per-backend instances of customizable +// functionalities (e.g. CPU, SparseCPU, AutogradCPU) (5) alias keys (e.g. +// CompositeImplicitAutograd) +// +// Of the categories above, it's important to note: +// (a) which keys are assigned individual bits in a DispatchKeySet +// (b) which keys are assigned individual slots in the runtime operator table +// ("Runtime keys") +// +// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet. +// (1), (2) and (4) all get their own dedicated slots in the runtime operator +// table. + +// See Note [DispatchKeySet Internal Representation] for more details. +// +// NOTE: Keep the list in sync with `DispatchKey` in torchgen/model.py +enum class DispatchKey : uint16_t { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // This is not a "real" tensor id, but it exists to give us a "nullopt" + // This is not a "real" functionality, but it exists to give us a "nullopt" // element we can return for cases when a DispatchKeySet contains no elements. // You can think a more semantically accurate definition of DispatchKey is: // @@ -38,24 +117,31 @@ enum class DispatchKey : uint8_t { // this will get eliminated, but for now it's convenient) CatchAll = Undefined, - // ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // A "backend" is colloquially used to refer to handlers for dispatch - // which actually implement the numerics of an operation in question. 
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ // + // Every value in the enum (up to EndOfFunctionalityKeys) + // corresponds to an individual "functionality" that can be dispatched to. + // This is represented in the DispatchKeySet by assigning each of these enum + // values + // to each of the remaining (64 - len(BackendComponent)) bits. // - // Due to the nature of the enum, these backends are specified in - // an ordered way, but for most backends this order is not semantically - // meaningful (e.g., it's valid to reorder these backends without changing - // semantics). The only situation when backend ordering is meaningful - // is when the backend participates in multiple dispatch with another - // backend; e.g., CPU and SparseCPU (sparse must have - // higher priority). + // Most of these functionalities have a single handler assigned to them, + // making them "runtime keys". + // That map to a single slot in the runtime operator table. + // + // A few functionalities are allowed to be customizable per backend. + // See [Note: Per-Backend Functionality Dispatch Keys] for details. + + // See [Note: Per-Backend Functionality Dispatch Keys] + Dense, + + // Below are non-extensible backends. + // These are backends that currently don't have their own overrides for + // Autograd/Sparse/Quantized kernels, + // and we therefore don't waste space in the runtime operator table allocating + // space for them. + // If any of these backends ever need to customize, e.g., Autograd, then we'll + // need to add a DispatchKey::*Bit for them. - // Here are backends which you think of as traditionally specifying - // how to implement operations on some device. - CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp - CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp - HIP, // NB: I think this is not actually used, due to Note [Masquerading as - // CUDA] FPGA, // Xilinx support lives out of tree at // https://gitlab.com/pytorch-complex/vitis_kernels @@ -67,14 +153,8 @@ enum class DispatchKey : uint8_t { // - aten/src/ATen/test/extension_backend_test.cpp ORT, - XLA, // lives out of tree at https://github.com/pytorch/xla - MLC, // lives out of tree at https://github.com/pytorch/MLCompute Vulkan, Metal, - XPU, // For out of tree Intel's heterogeneous computing plug-in - HPU, // For out of tree & closed source integration of HPU / Habana - VE, // For out of tree & closed source integration of SX-Aurora / NEC - Lazy, // For lazy tensor backends // A meta tensor is a tensor without any data associated with it. (They // have also colloquially been referred to as tensors on the "null" device). @@ -83,11 +163,8 @@ enum class DispatchKey : uint8_t { // tensor with the output shape and dtype, but wouldn't actually add anything. Meta, - // Here are backends which specify more specialized operators - // based on the dtype of the tensor. - QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp - QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp - QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in + // See [Note: Per-Backend Functionality Dispatch Keys] + Quantized, // This backend is to support custom RNGs; it lets you go // to a different kernel if you pass in a generator that is not a @@ -106,30 +183,28 @@ enum class DispatchKey : uint8_t { // the corresponding dense tensors, and must be handled before them. 
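// Illustrative contrast (not part of the diff), assuming the DispatchKeySet
// constructor and has() added later in this patch: a non-customizable backend
// such as FPGA maps to a single functionality-style bit and a single runtime
// slot, whereas a per-backend runtime key (declared further down) decomposes
// into a functionality bit plus a backend bit:
//
//   auto fpga_ks = DispatchKeySet(DispatchKey::FPGA);         // one bit, one slot
//   auto qcpu_ks = DispatchKeySet(DispatchKey::QuantizedCPU); // Quantized bit + CPUBit
//   qcpu_ks.has(DispatchKey::Quantized);                      // true
//   qcpu_ks.has(DispatchKey::QuantizedCPU);                   // true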
MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp // NB: not to be confused with MKLDNN, which is Caffe2 only - SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp - SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp - SparseHIP, // TODO: I think this is not actually used, due to Note - // [Masquerading as CUDA] - SparseXPU, // For out of tree Intel's heterogeneous computing plug-in - SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC + + // See [Note: Per-Backend Functionality Dispatch Keys] + Sparse, SparseCsrCPU, SparseCsrCUDA, - NestedTensor, // lives out of tree at https://github.com/pytorch/nestedtensor - - // Here are reserved backends for user-defined backends, see Note [Private use - // DispatchKey] - // To see some example about how to use this, check out ORT - PrivateUse1, - PrivateUse2, - PrivateUse3, + // Note [Non-Customizable Backend Keys] + // Every key above here is considered a "non-customizable backend". + // These are backends that will work correctly with autograd, but + // but currently don't require separate implementations + // for autograd sparse or quantized kernels. + // Any new backends that don't need to be customized should go above here. + // If an existing backend needs to e.g. override autograd, then we can + // consider promoting it into the "BackendComponent" enum + // + // For all intents and purposes from the perspective of DispatchKeySet, + // "non-customizable backend" keys are treated the same way + // as other functionality keys + EndOfNonCustomizableBackends = SparseCsrCUDA, - // Define an alias key to represent end of backend dispatch keys. - // If you add new backend keys after PrivateUse3, please also update it here. - // (But you shouldn't: private use keys should have higher precedence than - // all built-in keys) - EndOfBackendKeys = PrivateUse3, + NestedTensor, // In some situations, it is not immediately obvious what the correct // backend for function is, because the function in question doesn't @@ -140,6 +215,10 @@ enum class DispatchKey : uint8_t { Python, + // Out-of-core key for Fake Tensor in torchdistx. + // See https://pytorch.org/torchdistx/latest/fake_tensor.html + Fake, + // The named dispatch key is set for any tensors with named dimensions. // Although we have a dispatch key for named tensors, for historical reasons, // this dispatch key doesn't do any of the substantive functionality for named @@ -233,26 +312,25 @@ enum class DispatchKey : uint8_t { // AutogradOther key. We can add specific autograd key for those backends // upon request. AutogradOther, - AutogradCPU, - AutogradCUDA, - AutogradXLA, - AutogradLazy, - AutogradXPU, - AutogradMLC, - AutogradHPU, - AutogradNestedTensor, // lives out of tree at + + // See [Note: Per-Backend Functionality Dispatch Keys] + AutogradFunctionality, + + // NestedTensor is an example of something that isn't a "real backend" + // (because it mostly consists of redispatching kernels) + // but it would like to override autograd functionality in C++. + // We can handle cases like this by adding an extra functionality key + // exclusively for handling autograd for NestedTensor. 
+ // lives out of tree at // https://github.com/pytorch/nestedtensor - // Here are some reserved pre-autograd keys for user-defined backends, see - // Note [Private use DispatchKey] - AutogradPrivateUse1, - AutogradPrivateUse2, - AutogradPrivateUse3, + AutogradNestedTensor, Tracer, // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed // and inputs are saved for backward in the post-autocast type. AutocastCPU, + AutocastXPU, // Naughtily, AutocastCUDA is also being used for XLA. In the terminal state, // it probably should get its own Autocast key AutocastCUDA, @@ -274,12 +352,25 @@ enum class DispatchKey : uint8_t { VmapMode, FuncTorchGradWrapper, // See Note [Out-of-tree vmap+grad prototype] + // Alias and mutation removal. // If some backends want to opt into only alias removal or only mutation // removal, // we can consider adding separate keys dedicated to those individual passes. // See Note [Functionalization Pass In Core] for details. Functionalize, + + // Out-of-core key for Deferred Module Initialization in torchdistx. + // See https://pytorch.org/torchdistx/latest/deferred_init.html + DeferredInit, + + // Used by Python key logic to know the set of tls on entry to the dispatcher + // This kernel assumes it is the top-most non-functorch-related DispatchKey. + // If you add a key above, make sure to update the fallback implementation for + // this. + PythonTLSSnapshot, + + // This key should be at the very top of the dispatcher FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] // TESTING: This is intended to be a generic testing tensor type id. @@ -299,9 +390,128 @@ enum class DispatchKey : uint8_t { TESTING_ONLY_GenericMode, // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - NumDispatchKeys, // Sentinel, end of runtime keys. + EndOfFunctionalityKeys, // End of functionality keys. + + // ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ // + // Here are backends which you think of as traditionally specifying + // how to implement operations on some device. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfDenseBackends, + CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp + CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp + HIP, // NB: I think this is not actually used, due to Note [Masquerading as + // CUDA] + XLA, // lives out of tree at https://github.com/pytorch/xla + MPS, // registered at build/aten/src/ATen/RegisterMPS.cpp + IPU, // lives out of tree at https://github.com/graphcore/poptorch + XPU, // For out of tree Intel's heterogeneous computing plug-in + HPU, // For out of tree & closed source integration of HPU / Habana + VE, // For out of tree & closed source integration of SX-Aurora / NEC + Lazy, // For lazy tensor backends + // Here are reserved backends for user-defined backends, see Note [Private use + // DispatchKey] + // To see some example about how to use this, check out ORT + PrivateUse1, + PrivateUse2, + PrivateUse3, + EndOfDenseBackends = PrivateUse3, + + // ~~~~~~~~~~~~~~ "Quantized" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] 
+ StartOfQuantizedBackends, + QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp + QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp + _QuantizedHIP, + _QuantizedXLA, + _QuantizedMPS, + _QuantizedIPU, + QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in + _QuantizedHPU, + _QuantizedVE, + _QuantizedLazy, + _QuantizedPrivateUse1, + _QuantizedPrivateUse2, + _QuantizedPrivateUse3, + EndOfQuantizedBackends = _QuantizedPrivateUse3, + + // ~~~~~~~~~~~~~~ "Sparse" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfSparseBackends, + SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp + SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp + SparseHIP, // TODO: I think this is not actually used, due to Note + // [Masquerading as CUDA] + _SparseXLA, + _SparseMPS, + _SparseIPU, + SparseXPU, // For out of tree Intel's heterogeneous computing plug-in + _SparseHPU, + SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC + _SparseLazy, + _SparsePrivateUse1, + _SparsePrivateUse2, + _SparsePrivateUse3, + EndOfSparseBackends = _SparsePrivateUse3, + + // ~~~~~~~~~~~~~~ "NestedTensor" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ + // // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfNestedTensorBackends, + // registered at build/aten/src/ATen/RegisterNestedTensorCPU.cpp + NestedTensorCPU, + // registered at build/aten/src/ATen/RegisterNestedTensorCUDA.cpp + NestedTensorCUDA, + _NestedTensorHIP, + _NestedTensorXLA, + _NestedTensorMPS, + _NestedTensorIPU, + _NestedTensorXPU, + _NestedTensorHPU, + _NestedTensorVE, + _NestedTensorLazy, + _NestedTensorPrivateUse1, + _NestedTensorPrivateUse2, + _NestedTensorPrivateUse3, + EndOfNestedTensorBackends = _NestedTensorPrivateUse3, + + // ~~~~~~~~~~~~~~ "Autograd" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfAutogradBackends, + AutogradCPU, + AutogradCUDA, + _AutogradHIP, + AutogradXLA, + AutogradMPS, + AutogradIPU, + AutogradXPU, + AutogradHPU, + _AutogradVE, + AutogradLazy, + // Here are some reserved pre-autograd keys for user-defined backends, see + // Note [Private use DispatchKey] + AutogradPrivateUse1, + AutogradPrivateUse2, + AutogradPrivateUse3, + EndOfAutogradBackends = AutogradPrivateUse3, + // If we add a new per-backend functionality key that has higher priority + // than Autograd, then this key should be updated. + EndOfRuntimeBackendKeys = EndOfAutogradBackends, // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // Note [Alias Dispatch Keys] // Alias dispatch keys are synthetic dispatch keys which map to multiple // runtime dispatch keys. Alisa keys have precedence, but they are always // lower precedence than runtime keys. You can register a kernel to an @@ -321,6 +531,7 @@ enum class DispatchKey : uint8_t { // Define an alias key to represent end of alias dispatch keys. // If you add new alias keys after Autograd, please also update it here. 
+ StartOfAliasKeys = Autograd, EndOfAliasKeys = CompositeExplicitAutograd, // // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // @@ -360,54 +571,84 @@ enum class DispatchKey : uint8_t { // built-in autograd formulas for operators are not appropriate. static_assert( - static_cast(DispatchKey::NumDispatchKeys) < 64, - "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries"); + (static_cast(BackendComponent::EndOfBackendKeys) + + static_cast(DispatchKey::EndOfFunctionalityKeys)) <= 64, + "The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)" + " both map to backend and functionality bits" + " into a 64-bit bitmask; you must have less than 64 total entries between them"); -#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) -/** - * The method below maps the dispatch key in the enum DispatchKey to an - * integer index in the dispatchTable_ array in OperatorEntry. The array - * is trimmed for mobile to reduce peak memory usage since it's - * unnecessary to reserve additional space for dispatch keys that will - * never be used on mobile. - */ -C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { - switch (dk) { - case DispatchKey::Undefined: - return 0; - case DispatchKey::CPU: - return 1; - case DispatchKey::QuantizedCPU: - return 2; - case DispatchKey::SparseCPU: - return 3; - case DispatchKey::BackendSelect: - return 4; - case DispatchKey::ADInplaceOrView: - return 5; - case DispatchKey::AutogradOther: - return 6; - case DispatchKey::AutogradCPU: - return 7; - case DispatchKey::NumDispatchKeys: // Sentinel, end of runtime keys. - return 8; - default: - return -1; +// Check if a DispatchKey is an alias mapping to other runtime keys. +constexpr bool isAliasDispatchKey(DispatchKey k) { + return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys; +} + +// [Note: Per-Backend Functionality Dispatch Keys] +// Check if a DispatchKey is a per-backend functionality key +// Any functionalities that can be customized per-backend should be added here. +// These keys correspond to functionalities that can be customized indivually +// per backend. While they only take up one bit in the `DispatchKeySet` bitset, +// they map to (# backends) slots in the operator table. +// Each of these keys also has a separate set of "runtime keys" in the dispatch +// key enum, per backend, which *do* map to the individual operator table slots. +// For example, the "Sparse" key maps to an individual bit in the +// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual +// slots in the runtime operator table. + +constexpr bool isPerBackendFunctionalityKey(DispatchKey k) { + if (k == DispatchKey::Dense || k == DispatchKey::Quantized || + k == DispatchKey::Sparse || k == DispatchKey::AutogradFunctionality || + k == DispatchKey::NestedTensor) { + return true; + } else { + return false; } } -#else -/** - * For the server use-case, make this a simple pass-through. - */ -C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { - return static_cast(dk); + +// Note that this includes Undefined in the total count. +// BUT EndOfFunctionalityKeys is its own (placeholder) key. +// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3. +// In the above example, there are 3 total functionality keys. 
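// Worked numbers for the constants below (derived from the enums above, not
// stated in the diff itself): isPerBackendFunctionalityKey() names five keys
// (Dense, Quantized, Sparse, NestedTensor, AutogradFunctionality) and
// BackendComponent defines 13 backend bits, so on non-mobile builds
//
//   num_runtime_entries = num_functionality_keys + 5 * (13 - 1)
//
// i.e. each per-backend functionality trades its single slot for one runtime
// operator-table slot per backend.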
+constexpr uint8_t num_functionality_keys = + static_cast(DispatchKey::EndOfFunctionalityKeys); + +constexpr uint8_t num_backends = + static_cast(BackendComponent::EndOfBackendKeys); + +// Note [No More Than 16 Backends] +// Search for this note to find places in the code where the "no more than 16 +// backends" invariant is baked in. +static_assert( + static_cast(BackendComponent::EndOfBackendKeys) <= 16, + "BackendComponent currently only supports <= 16 backends. If we really need to extend this, \ +there are a few places where this invariant is baked in"); + +constexpr uint8_t numPerBackendFunctionalityKeys() { + uint8_t count = 0; + for (uint8_t k = 0; k <= num_functionality_keys; ++k) { + if (isPerBackendFunctionalityKey(static_cast(k))) + ++count; + } + return count; } + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) +// See [Note: Trimmed Mobile Dispatch Keys] +constexpr uint16_t num_runtime_entries = 8; +#else +constexpr uint16_t num_runtime_entries = num_functionality_keys + + (numPerBackendFunctionalityKeys() * (num_backends - 1)); #endif +// See Note [No More Than 16 Backends] +constexpr uint16_t full_backend_mask = + (static_cast(1) << num_backends) - 1; + C10_API const char* toString(DispatchKey); +C10_API const char* toString(BackendComponent); C10_API std::ostream& operator<<(std::ostream&, DispatchKey); +C10_API std::ostream& operator<<(std::ostream&, BackendComponent); -C10_API DispatchKey getAutogradKeyFromBackend(DispatchKey t); +C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k); // Parses a string into a dispatch key. // If the string cannot be correctly parsed, throws an exception. @@ -420,10 +661,100 @@ C10_API c10::DispatchKey parseDispatchKey(const std::string& k); // torch::dispatch(torch::kCPU, ...) is also valid. constexpr DispatchKey kAutograd = DispatchKey::Autograd; -// Check if a DispatchKey is an alias mapping to other runtime keys. -inline bool isAliasDispatchKey(DispatchKey k) { - return k > DispatchKey::NumDispatchKeys && k <= DispatchKey::EndOfAliasKeys; +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. 
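// Concrete examples of the round trip that invariant guarantees (illustrative,
// using the enum values laid out above; toFunctionalityKey and
// toRuntimePerBackendFunctionalityKey are defined just below):
//
//   toBackendComponent(DispatchKey::SparseCUDA)  == BackendComponent::CUDABit
//   toFunctionalityKey(DispatchKey::SparseCUDA)  == DispatchKey::Sparse
//   toRuntimePerBackendFunctionalityKey(
//       DispatchKey::Sparse, BackendComponent::CUDABit) == DispatchKey::SparseCUDA
//
// because SparseCUDA sits at StartOfSparseBackends + 2 and CUDABit == 2.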
+constexpr BackendComponent toBackendComponent(DispatchKey k) { + if (k >= DispatchKey::StartOfDenseBackends && + k <= DispatchKey::EndOfDenseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfDenseBackends)); + } else if ( + k >= DispatchKey::StartOfQuantizedBackends && + k <= DispatchKey::EndOfQuantizedBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfQuantizedBackends)); + } else if ( + k >= DispatchKey::StartOfSparseBackends && + k <= DispatchKey::EndOfSparseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfSparseBackends)); + } else if ( + k >= DispatchKey::StartOfNestedTensorBackends && + k <= DispatchKey::EndOfNestedTensorBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfNestedTensorBackends)); + } else if ( + k >= DispatchKey::StartOfAutogradBackends && + k <= DispatchKey::EndOfAutogradBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfAutogradBackends)); + } else { + return BackendComponent::InvalidBit; + } +} + +constexpr DispatchKey toFunctionalityKey(DispatchKey k) { + if (k <= DispatchKey::EndOfFunctionalityKeys) { + return k; + } else if (k <= DispatchKey::EndOfDenseBackends) { + return DispatchKey::Dense; + } else if (k <= DispatchKey::EndOfQuantizedBackends) { + return DispatchKey::Quantized; + } else if (k <= DispatchKey::EndOfSparseBackends) { + return DispatchKey::Sparse; + } else if (k <= DispatchKey::EndOfNestedTensorBackends) { + return DispatchKey::NestedTensor; + } else if (k <= DispatchKey::EndOfAutogradBackends) { + return DispatchKey::AutogradFunctionality; + } else { + return DispatchKey::Undefined; + } } + +// Given (DispatchKey::Dense, BackendComponent::CUDABit), returns +// DispatchKey::CUDA. +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. +constexpr DispatchKey toRuntimePerBackendFunctionalityKey( + DispatchKey functionality_k, + BackendComponent backend_k) { + if (functionality_k == DispatchKey::Dense) { + return static_cast( + static_cast(DispatchKey::StartOfDenseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Sparse) { + return static_cast( + static_cast(DispatchKey::StartOfSparseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Quantized) { + return static_cast( + static_cast(DispatchKey::StartOfQuantizedBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::NestedTensor) { + return static_cast( + static_cast(DispatchKey::StartOfNestedTensorBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::AutogradFunctionality) { + return static_cast( + static_cast(DispatchKey::StartOfAutogradBackends) + + static_cast(backend_k)); + } + return DispatchKey::Undefined; +} + } // namespace c10 namespace torch { diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 7f85567f886f..3127e7bb43e3 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -1,37 +1,30 @@ #include +#include +#include namespace c10 { -// backend_dispatch_keyset should include all runtime backend keys. +// backend_dispatch_keyset includes all dispatch keys that map to backends. 
// Alias key DispatchKey::CompositeExplicitAutograd maps to -// backend_dispatch_keyset NestedTensor has been explicitly removed due to -// incompatibility with some kernels, such as structured kernels, that use the -// DefaultBackend key. -constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends | - DispatchKeySet({ - DispatchKey::CPU, - DispatchKey::CUDA, - DispatchKey::XLA, - DispatchKey::Lazy, - DispatchKey::XPU, - DispatchKey::PrivateUse1, - DispatchKey::PrivateUse2, - DispatchKey::PrivateUse3, - DispatchKey::MLC, - DispatchKey::HPU, - DispatchKey::ORT, - DispatchKey::Meta, - }); +// backend_dispatch_keyset +constexpr DispatchKeySet backend_dispatch_keyset = + autogradother_backends | DispatchKeySet(DispatchKey::Dense); bool isBackendDispatchKey(DispatchKey t) { return t != DispatchKey::Undefined // See Note [No Alias Keys in DispatchKeySet] - && !isAliasDispatchKey(t) && backend_dispatch_keyset.has(t); + && !isAliasDispatchKey(t) + // Note [NestedTensor Not Included in Backend Keys] + // NestedTensor has been explicitly removed from the "backend keyset" due + // to incompatibility with some kernels, so we don't want it to be + // included in CompositeImplicitAutograd or CompositeExplicitAutograd + // kernels. + && t != DispatchKey::NestedTensor && backend_dispatch_keyset.has(t); } // math_dispatch_keyset contains all keys in backend_dispatch_keyset and // autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd -// maps to math_dispatch_keyset. +// maps to [math_dispatch_keyset x full_backend_mask] constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset | autograd_dispatch_keyset; @@ -39,7 +32,12 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - return autograd_dispatch_keyset; + // See Note [autograd_dispatch_keyset Does Not Include Backend Bits] + // That's why we OR it with a mask of the backend bits here. + // getRuntimeDispatchKeySet() expects to return a keyset of runtime + // dispatch keys, like AutogradCPU, but that requires having backend bits. 
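// For example (illustrative, assuming autograd_dispatch_keyset carries the
// AutogradFunctionality bit as the Note above implies): the returned set then
// satisfies has(DispatchKey::AutogradCPU), because the AutogradFunctionality
// bit combines with the CPU backend bit contributed by full_backend_mask.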
+ return autograd_dispatch_keyset | + DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); case DispatchKey::CompositeImplicitAutograd: return math_dispatch_keyset; case DispatchKey::CompositeExplicitAutograd: @@ -53,11 +51,13 @@ bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - return autograd_dispatch_keyset.has(k); + return autograd_dispatch_keyset.has(toFunctionalityKey(k)); case DispatchKey::CompositeImplicitAutograd: - return math_dispatch_keyset.has(k); + // See Note [NestedTensor Not Included in Backend Keys] + return k != DispatchKey::NestedTensor && math_dispatch_keyset.has(k); case DispatchKey::CompositeExplicitAutograd: - return backend_dispatch_keyset.has(k); + // See Note [NestedTensor Not Included in Backend Keys] + return k != DispatchKey::NestedTensor && backend_dispatch_keyset.has(k); default: return t == k; } @@ -75,12 +75,12 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { return DispatchKeySet(DispatchKey::XLA); case DispatchKey::AutogradLazy: return DispatchKeySet(DispatchKey::Lazy); - case DispatchKey::AutogradMLC: - return DispatchKeySet(DispatchKey::MLC); + case DispatchKey::AutogradMPS: + return DispatchKeySet(DispatchKey::MPS); case DispatchKey::AutogradHPU: return DispatchKeySet(DispatchKey::HPU); - case DispatchKey::AutogradNestedTensor: - return DispatchKeySet(DispatchKey::NestedTensor); + case DispatchKey::AutogradIPU: + return DispatchKeySet(DispatchKey::IPU); case DispatchKey::AutogradXPU: return DispatchKeySet(DispatchKey::XPU); case DispatchKey::AutogradPrivateUse1: @@ -96,23 +96,6 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { } } -DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t) { - switch (t) { - case DispatchKey::CPU: - return DispatchKeySet(DispatchKey::AutocastCPU); - case DispatchKey::CUDA: - case DispatchKey::XLA: - return DispatchKeySet(DispatchKey::AutocastCUDA); - default: - return DispatchKeySet(); - } -} - -DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t) { - return DispatchKeySet( - {DispatchKey::ADInplaceOrView, getAutogradKeyFromBackend(t)}); -} - bool isIncludedInAlias(DispatchKey k, DispatchKey alias) { return k != DispatchKey::Undefined && runtimeDispatchKeySetHas(alias, k); } @@ -129,18 +112,135 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { return os; } os << "DispatchKeySet("; - DispatchKey tid; bool first = true; - while ((tid = ts.highestPriorityTypeId()) != DispatchKey::Undefined) { + for (auto k : ts) { if (!first) { os << ", "; } - os << tid; - ts = ts.remove(tid); + os << k; first = false; } os << ")"; return os; } +DispatchKeySet::iterator& DispatchKeySet::iterator::operator++() { + TORCH_INTERNAL_ASSERT(next_functionality_ <= iterator::end_iter_mask_val); + TORCH_INTERNAL_ASSERT(next_backend_ <= num_backends, next_backend_); + + // Create a masked version of the set representation to ignore previous + // keys that we've iterated through. 
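// Illustrative note on the two llvm:: helpers used here (vendored under
// c10/util; behavior assumed from LLVM's MathExtras): maskTrailingZeros<uint64_t>(n)
// produces a mask whose n lowest bits are cleared, e.g.
// maskTrailingZeros<uint64_t>(3) == 0xFFFF'FFFF'FFFF'FFF8, so AND-ing it with
// *data_ptr_ drops the keys already visited; findFirstSet() returns the index
// of the lowest set bit, or numeric_limits<uint64_t>::max() when no bit is set,
// which is what the "end of iteration" checks below rely on.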
+ uint64_t masked_functionality_bits = + llvm::maskTrailingZeros(next_functionality_) & *data_ptr_; + uint64_t masked_backend_bits = + llvm::maskTrailingZeros(next_backend_) & full_backend_mask & + *data_ptr_; + + uint64_t first_functionality_idx = + llvm::findFirstSet(masked_functionality_bits); + uint64_t first_backendcomponent_idx = llvm::findFirstSet(masked_backend_bits); + + // If there are no keys, set to end iterator value + if (first_functionality_idx == std::numeric_limits::max() || + next_functionality_ == iterator::end_iter_mask_val) { + // Set up state to be the same as end() + next_functionality_ = iterator::end_iter_mask_val; + current_dispatchkey_idx_ = iterator::end_iter_key_val; + next_backend_ = 0; + current_backendcomponent_idx_ = iterator::end_iter_key_val; + return *this; + } + + // The +1 is because of DispatchKey::Undefined and + // BackendComponent::InvalidBit + auto new_next_functionality = first_functionality_idx + 1; + auto new_backendcomponent_idx = first_backendcomponent_idx + 1; + // and the -num_backends is because the first bits in the + // keyset are not Dispatch Keys. + auto next_dispatchkey_idx = new_next_functionality - num_backends; + + // If the current functionality bit is a per-backend bit, we need special + // handling + if (isPerBackendFunctionalityKey( + static_cast(next_dispatchkey_idx))) { + // case 1: if the current backend is undefined, then there is no valid + // backend instance of this functionality key so we can skip it. + if (first_backendcomponent_idx == std::numeric_limits::max()) { + // increment the functionality mask so we skip the current functionality + // bit on the next increment. + next_functionality_ = new_next_functionality; + ++(*this); + return *this; + } + + // Otherwise, at this point we know what the current backend and + // functionality bits are. + current_dispatchkey_idx_ = next_dispatchkey_idx; + current_backendcomponent_idx_ = new_backendcomponent_idx; + + // Next, we need to set up the masks for the next increment. + uint64_t next_backendcomponent_bits = + llvm::maskTrailingZeros(first_backendcomponent_idx + 1) & + full_backend_mask & *data_ptr_; + uint64_t next_backendcomponent_idx = + llvm::findFirstSet(next_backendcomponent_bits); + if (next_backendcomponent_idx == std::numeric_limits::max()) { + // case 2: the current backend is valid, but there is not another backend + // in the keyset. In this case, we need to bump the functionality mask and + // reset the backend mask for the next increment + next_functionality_ = new_next_functionality; + next_backend_ = 0; + } else { + // case 3: we have another backend to iterate over. We want to iterate + // over the same functionality bit next time, but a different backend bit. + next_backend_ = first_backendcomponent_idx + 1; + } + } else { + // Functionality bits that aren't per backend are simpler to handle. We can + // ignore the backend bits. + TORCH_INTERNAL_ASSERT(next_backend_ == 0); + current_dispatchkey_idx_ = next_dispatchkey_idx; + next_functionality_ = new_next_functionality; + } + return *this; +} + +std::array +initializeFunctionalityOffsetsAndMasks() { + std::array + offsets_and_masks; + // manualy set the first entry, which corresponds to Undefined. + offsets_and_masks[0] = FunctionalityOffsetAndMask(0, 0); + // loop through every functionality key (aside from Undefined). + for (const auto functionality_idx : c10::irange(1, num_functionality_keys)) { + // functionality_idx should be Dense -> 1, ... 
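// Worked example of the table this loop builds (illustrative, using the enums
// above, where num_backends == 13):
//   Undefined -> offset 0, mask 0
//   Dense     -> offset 1, mask full_backend_mask  (per-backend: reserves
//                offsets 1..13, one runtime slot per backend)
//   FPGA      -> offset 1 + num_backends == 14, mask 0
// and so on: each per-backend functionality advances the running offset by
// num_backends, every other functionality advances it by 1.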
+ auto prev_offset_and_mask = offsets_and_masks[functionality_idx - 1]; + auto k = static_cast(functionality_idx); + + // If the previous functionality was not per-backend, then we can just + // increment the previous offset. Otherwise, the next offset = + // previous_offset + num_backends. + auto next_offset = prev_offset_and_mask.offset + + (prev_offset_and_mask.mask == 0 ? 1 : num_backends); + // the mask is used in the runtime index calculation to find the offset of + // the backend. For non-per-backend functionalities, this offset should + // always be 0. Otherwise, we need to get the index of the backend (which we + // can do using a backend mask). + auto next_mask = isPerBackendFunctionalityKey(k) ? full_backend_mask : 0; + offsets_and_masks[functionality_idx] = + FunctionalityOffsetAndMask(next_offset, next_mask); + } + // Sanity check that the computed offset index of the last functionality key + // is correct. This assumes that the highest priority functionality key is not + // per backend. + TORCH_INTERNAL_ASSERT( + offsets_and_masks[num_functionality_keys - 1].offset == + (num_runtime_entries - 1), + "num_runtime_entries: ", + num_runtime_entries, + "last_offset: ", + offsets_and_masks[num_functionality_keys - 1].offset); + return offsets_and_masks; +} + } // namespace c10 diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 79d39652219b..d6241be9701e 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -1,5 +1,4 @@ #pragma once - #include #include #include @@ -8,29 +7,147 @@ namespace c10 { +struct FunctionalityOffsetAndMask { + // empty constructor shouldn't be used; only needed to initialize + // the array before populating it. + FunctionalityOffsetAndMask() {} + FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask) + : offset(offset), mask(mask) {} + // This needs to big enough to cover the size of the operator table. + uint16_t offset; + // See Note [No More Than 16 Backends] + // This mask needs to be big enough to mask all of the backend bits. + // We probably don't ever want to have more than 16 backend bits, so uint16_t + // should be enough. + uint16_t mask; +}; +static_assert( + c10::num_runtime_entries < 65536, + "The dispatcher currently only supports up to 2^16 runtime entries"); + +C10_API std::array +initializeFunctionalityOffsetsAndMasks(); + +C10_ALWAYS_INLINE static const std:: + array& + offsetsAndMasks() { + static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks(); + return offsets_and_masks_; +} + +// A representation of a set of DispatchKeys. A DispatchKeySet contains both +// "functionality" bits and "backend bits", and every tensor holds its own +// DispatchKeySet. The Dispatcher implements multiple dispatch by grabbing the +// keyset on every input tensor, or’ing them together, and dispatching to a +// specific piece of functionality. The functionality bits are *ordered*. When +// multiple functionality bits are set, we use the highest priority +// functionality. Similarly, multiple backend bits can theoretically be set if +// you call an operator with multiple tensors from difference devices (e.g. CPU +// and CUDA), although support for mixed device dispatch is limited (the only +// kernels that gracefully handle mixed device inputs for now are cuda kernels +// that take in a scalar cpu tensor). + // A representation of a set of DispatchKeys. A tensor may have multiple // tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the // DispatchKeySet specifies what type ids apply. 
The internal representation is // as a 64-bit bit set (this means only 64 tensor type ids are supported). // -// Note that DispatchKeys are ordered; thus, we can ask questions like "what is -// the highest priority DispatchKey in the set"? (The set itself is not -// ordered; two sets with the same ids will always have the ids ordered in the -// same way.) +// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like +// "what is the highest priority DispatchKey in the set"? (The set itself is +// not ordered; two sets with the same ids will always have the ids ordered in +// the same way.) +// +// Note [DispatchKeySet Internal Representation] +// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects +// that get passed around at runtime. +// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset +// and individual dispatch keys. +// +// First: why do we have this distinction, and why not map every dispatch key +// directly to a bit? This is mostly because we have several types of +// functionalities that different backends would like to customize. For example, +// we have: +// - "Dense": CPU, CUDA, XLA, ... (~12 keys) +// - "Sparse": SparseCPU, SparseCUDA, ... +// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ... +// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ... +// The problem is that total number of keys grows quadratically with [# +// backends] x [# functionalities], making it very difficult to map each key +// directly to a bit in a bitset without dramatically increasing the size of the +// bitset over time. +// +// The two enums (BackendComponent and DispatchKey) can be divided roughly into +// 5 categories. +// +// (1) "Building block" keys +// (a) backends: jEverything in the BackendComponent enum (e.g. CPUBit, +// CUDABIt) (b) functionalities: (per-backend) functionality-bit DispatchKeys +// (e.g. AutogradFunctionality, Sparse, Dense) +// (2) "Runtime" keys +// (a) "non-customizable backends" (e.g. FPGA) +// (b) "non-customizable functionalities" (e.g. Functionalize) +// (c) "per-backend instances of customizable functionalities" (e.g. CPU, +// SparseCPU, AutogradCPU) +// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys]) +// +// (1) Building block keys always correspond to individual bits in a +// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual +// runtime keys. e.g. +// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit, +// DispatchKey::Dense}); +// // The keyset has the runtime dense-cpu key. +// dense_cpu_ks.has(DispatchKey::CPU); +// // And it contains the building block keys too. +// dense_cpu_ks.has(DispatchKey::CPUBit); +// dense_cpu_ks.has(DispatchKey::Dense); +// +// Not every backend and not every functionality counts as a "building block +// key". This is mostly to give us more levers to pull in the design space. +// Backend keys and functionality keys that count as "building blocks" will +// contribute to a full cross product of functionality that can be overriden. // -// At the moment, there are no nontrivial uses of this set; tensors are always -// singletons. In the near future, this set will represent variable? + tensor -// type id. In the far future, it will be requires grad? + profiling? + -// tracing? + lazy? + tensor type id. +// For example, right now we have at least 12 "backend" building blocks (CPU, +// CUDA, XLA, ...) and at least 4 "functionality" building blocks (Dense, +// Sparse, Quantized, AutogradFunctionality, ...). 
These keys together allow +// every dispatcher operator to be customized in up to 12*4 different ways. Each +// of those requires a slot in the operator table of every dispatcher operator. +// Not every piece of functionality necessarily needs to be customizeable +// per-backend, and not every backend necessarily needs to be able to customize +// every type of functionality. // -// (The difference between variable and requires grad, is that -// there are currently three states a tensor can be: -// 1. Not a variable -// 2. Variable with requires_grad=False -// 3. Variable with requires_grad=True -// Eventually, we want to kill state (1), and only dispatch to autograd -// handling code if one of the inputs requires grad.) // +// (2) Every runtime key corresponds directly to a slot in an operator's runtime +// dispatch table, and you can directly register kernels to a runtime dispatch +// key. +// +// For per-backend functionalities like "Dense" or "AutogradFunctionality", +// you can think of the corresponding runtime dispatch keys as "instances" of +// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all +// runtime instances of the "Dense" building block key. + +// (2a) and (2b) are represented identically in the DispatchKeySet logic: +// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT +// customizeable per backend. +// In order to do so, we'd need to promote it to a per-backend functionality +// "building block" key. +// - non-customizeable backends (e.g. FPGA) can NOT customize existing +// functionality like Sparse, Autograd, etc. +// In order to do so, we'd need to promote it to a backend "building block" +// key. +// +// In both cases, these keys directly correspond to runtime slots in the +// operator table. +// +// +// (3) "Alias" keys +// See Note [Alias Dispatch Keys] +// +// Final note: for anyone making future changes to the Dispatcher + +// DispatchKeySet internals, there's a closed PR with a basic +// python-implementation of the Dispatcher that might be useful in quickly +// testing out and validating changes. See it at +// https://github.com/pytorch/pytorch/pull/68743 + // An undefined tensor is one with an empty tensor type set. class DispatchKeySet final { public: @@ -41,29 +158,146 @@ class DispatchKeySet final { // NB: default constructor representation as zero is MANDATORY as // use of DispatchKeySet in TLS requires this. constexpr DispatchKeySet() : repr_(0) {} + constexpr DispatchKeySet(Full) - : repr_(std::numeric_limits::max()) {} + : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {} + constexpr DispatchKeySet(FullAfter, DispatchKey t) // LSB after t are OK, but not t itself. - : repr_((1ULL << (static_cast(t) - 1)) - 1) {} + // "functionalities" have a notion of ordering (e.g. Autograd > Sparse > + // Quantized > Dense). But backends don't really have an ordering. + // Therefore, we're enforcing that FullAfter can only be used on + // "functionality" keys. + : repr_( + (1ULL + << (num_backends + static_cast(toFunctionalityKey(t)) - + 1)) - + 1) {} + // Public version of DispatchKeySet(uint64_t) API; external users // must be explicit when they do this! constexpr DispatchKeySet(Raw, uint64_t x) : repr_(x) {} - explicit constexpr DispatchKeySet(DispatchKey t) - : repr_( - t == DispatchKey::Undefined - ? 
0 - : 1ULL << (static_cast(t) - 1)) {} - explicit constexpr DispatchKeySet(std::initializer_list ks) - : repr_(0) { + + constexpr explicit DispatchKeySet(BackendComponent k) { + if (k == BackendComponent::InvalidBit) { + repr_ = 0; + } else { + repr_ = 1ULL << (static_cast(k) - 1); + } + } + + constexpr explicit DispatchKeySet(DispatchKey k) { + if (k == DispatchKey::Undefined) { + // Case 1: handle Undefined specifically + repr_ = 0; + } else if (k <= DispatchKey::EndOfFunctionalityKeys) { + // Case 2: handle "functionality-only" keys + // These keys have a functionality bit set, but no backend bits + // These can technically be either: + // - valid runtime keys (e.g. DispatchKey::AutogradOther, + // DispatchKey::FuncTorchBatched, etc) + // - "building block" keys that aren't actual runtime keys (e.g. + // DispatchKey::Dense or Sparse) + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(k) - 1); + repr_ = functionality_val; + } else if (k <= DispatchKey::EndOfRuntimeBackendKeys) { + // Case 3: "runtime" keys that have a functionality bit AND a backend bit. + // First compute which bit to flip for the functionality. + auto functionality_k = toFunctionalityKey(k); + // The - 1 is because Undefined is technically a "functionality" that + // doesn't show up in the bitset. So e.g. Dense is technically the second + // functionality, but the lowest functionality bit. + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(functionality_k) - 1); + + // then compute which bit to flip for the backend + // Case 4a: handle the runtime instances of "per-backend functionality" + // keys For example, given DispatchKey::CPU, we should set: + // - the Dense functionality bit + // - the CPUBit backend bit + // first compute which bit to flip for the backend + auto backend_k = toBackendComponent(k); + uint64_t backend_val = backend_k == BackendComponent::InvalidBit + ? 0 + : 1ULL << (static_cast(backend_k) - 1); + repr_ = functionality_val + backend_val; + } else { + // At this point, we should have covered every case except for alias keys. + // Technically it would be possible to add alias dispatch keys to a + // DispatchKeySet, but the semantics are a little confusing and this + // currently isn't needed anywhere. + repr_ = 0; + } + } + + constexpr uint64_t keys_to_repr(std::initializer_list ks) { + uint64_t repr = 0; for (auto k : ks) { - repr_ |= DispatchKeySet(k).repr_; + repr |= DispatchKeySet(k).repr_; } + return repr; } + + constexpr uint64_t backend_bits_to_repr( + std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + explicit constexpr DispatchKeySet(std::initializer_list ks) + : repr_(keys_to_repr(ks)) {} + + explicit constexpr DispatchKeySet(std::initializer_list ks) + // Note: for some reason, putting this logic directly in the constructor + // appears to fail to compile on CUDA 10.1. 
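// A small sketch of what Case 3 of the constructor above produces for a
// per-backend runtime key; only members declared in this header are used and
// the free function name is illustrative.
#include <c10/core/DispatchKeySet.h>

void runtime_key_decomposition_example() {
  using namespace c10;
  DispatchKeySet ks(DispatchKey::AutogradCPU);
  // The runtime key decomposes into one functionality bit and one backend bit.
  bool has_functionality_bit = ks.has(DispatchKey::AutogradFunctionality);
  bool has_backend_bit = ks.has_backend(BackendComponent::CPUBit);
  (void)has_functionality_bit;
  (void)has_backend_bit;
}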
+ // See an example internal failure at + // https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr + : repr_(backend_bits_to_repr(ks)) {} + // Test if a DispatchKey is in the set - bool inline has(DispatchKey t) const { + inline bool has(DispatchKey t) const { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined); - return static_cast(repr_ & DispatchKeySet(t).repr_); + return has_all(DispatchKeySet(t)); + } + constexpr bool has_backend(BackendComponent t) const { + return has_all(DispatchKeySet(t)); + } + + // Test if a DispatchKey is in the set + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if all of them are in the current set. + constexpr bool has_all(DispatchKeySet ks) const { + return static_cast((repr_ & ks.repr_) == ks.repr_); + } + + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if any of them are in the current set. This could technically + // be pretty easily implemented using has(). It is strictly a perf + // optimization though. There are many places in the code base where we want + // to test for multiple functionality keys together. HOWEVER, runtime + // per-backend functionality keys aren't allowed to be used with this + // function, because you can end up with weird results. e.g. + // DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU)) + // would return true. + inline bool has_any(DispatchKeySet ks) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + // Either there are no backend bits in the input keyset + ((ks.repr_ & full_backend_mask) == 0) || + // or there are no per-backend-functionality bits + // See [Note: Per-Backend Functionality Dispatch Keys] + ((ks & + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + DispatchKey::AutogradFunctionality, + }) + .repr_) == 0)); + return static_cast((repr_ & ks.repr_) != 0); } // Test if DispatchKeySet is a superset of ks. bool isSupersetOf(DispatchKeySet ks) const { @@ -74,31 +308,64 @@ class DispatchKeySet final { return DispatchKeySet(repr_ | other.repr_); } // Perform set intersection - DispatchKeySet operator&(DispatchKeySet other) const { + constexpr DispatchKeySet operator&(DispatchKeySet other) const { return DispatchKeySet(repr_ & other.repr_); } - // Compute the set difference self - other + // Compute the set difference self - other, + // but ONLY for the functionality keys. + // Any backend bits set on self will remain unchanged. + // See Note [Removing keys from DispatchKeySet Only Affects Functionality + // Keys] DispatchKeySet operator-(DispatchKeySet other) const { - return DispatchKeySet(repr_ & ~other.repr_); + return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_)); } + // Compute self ^ other constexpr DispatchKeySet operator^(DispatchKeySet other) const { return DispatchKeySet(repr_ ^ other.repr_); } - // Perform set equality bool operator==(DispatchKeySet other) const { return repr_ == other.repr_; } + bool operator!=(DispatchKeySet other) const { + return repr_ != other.repr_; + } // Add a DispatchKey to the DispatchKey set. Does NOT mutate, // returns the extended DispatchKeySet! C10_NODISCARD DispatchKeySet add(DispatchKey t) const { return *this | DispatchKeySet(t); } - // Remove a DispatchKey from the DispatchKey set. 
This is - // generally not an operation you should be doing (it's - // used to implement operator<<) - C10_NODISCARD constexpr DispatchKeySet remove(DispatchKey t) const { - return DispatchKeySet(repr_ & ~DispatchKeySet(t).repr_); + C10_NODISCARD DispatchKeySet add(DispatchKeySet ks) const { + return *this | ks; + } + + // Remove a DispatchKey from the DispatchKey set. + // This is generally not an operation you should be doing + // (it's used to implement the printing overload, operator<<) + // + // Note [Removing keys from DispatchKeySet Only Affects Functionality Keys] + // Only functionality bits are allowed to be removed from a keyset. + // For now, we're only allowing removal of "functionality bits" from the + // keyset, which is specifically needed by the fallthrough key calculation + // logic. Why is removing backend bits problematic? Consider this example: + // + // DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA, + // DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA) + // DispatchKeySet([DispatchKey.CPU, + // DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA) + // + // What do we want to happen? + // Technically, we'd like it to be true that after removal, + // the first keyset still has the CUDA dispatch key while the second doesn't. + // Unfortunately there's no way to represent that, because the two keysets are + // represented the same way internally: functionality bits: Autograd, Dense + // backend bits: CPU, CUDA + // + // Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd" + // bit from the bitset. + constexpr DispatchKeySet remove(DispatchKey t) const { + return DispatchKeySet( + repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask)); } // Is the set empty? (AKA undefined tensor) bool empty() const { @@ -107,22 +374,112 @@ class DispatchKeySet final { uint64_t raw_repr() { return repr_; } - // Return the type id in this set with the highest priority (i.e., - // is the largest in the DispatchKey enum). Intuitively, this - // type id is the one that should handle dispatch (assuming there - // aren't any further exclusions or inclusions). + + DispatchKey highestFunctionalityKey() const { + auto functionality_idx = indexOfHighestBit(); + // This means that none of the functionality bits were set. + if (functionality_idx < num_backends) + return DispatchKey::Undefined; + // The first num_backend bits in the keyset don't correspond to real + // dispatch keys. + return static_cast(functionality_idx - num_backends); + } + + // This is similar like toBackendComponent(DispatchKey), but less restrictive. + // toBackendComponent() errors out if the key that it was passed has no + // backend bits, which is useful for error checking. We need a version of that + // here that can also handle "fake" backends like FPGA, because they need to + // map to the AutogradOther key. For those backends, we return + // BackendComponent::InvalidBit. + BackendComponent highestBackendKey() const { + // mask to mask out functionality bits + auto backend_idx = + DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit(); + // all zeros across the backend bits means that no backend bits are set. + if (backend_idx == 0) + return BackendComponent::InvalidBit; + return static_cast(backend_idx); + } + + // returns the DispatchKey of highest priority in the set. DispatchKey highestPriorityTypeId() const { - // TODO: If I put Undefined as entry 64 and then adjust the - // singleton constructor to shift from the right, we can get rid of the - // subtraction here. 
It's modestly more complicated to get right so I - // didn't do it for now. - return static_cast(64 - llvm::countLeadingZeros(repr_)); + auto functionality_k = highestFunctionalityKey(); + if (isPerBackendFunctionalityKey(functionality_k)) { + return toRuntimePerBackendFunctionalityKey( + functionality_k, highestBackendKey()); + } + return functionality_k; + } + + // Returns the index of the most-significant bit in the keyset. + // This is used to as part of the calculation into the operator table to get: + // - the highest "functionality" bit in the keyset. + // - the highest "backend" bit in the keyset. + uint8_t indexOfHighestBit() const { + return 64 - llvm::countLeadingZeros(repr_); } - DispatchKey highestPriorityBackendTypeId() const { - return (*this & - ((1ULL << static_cast(DispatchKey::EndOfBackendKeys)) - 1)) - .highestPriorityTypeId(); +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) + // [Note: Trimmed Mobile Dispatch Keys] + /** + * The method below maps the dispatch key in the enum DispatchKey to an + * integer index in the dispatchTable_ array in OperatorEntry. The array + * is trimmed for mobile to reduce peak memory usage since it's + * unnecessary to reserve additional space for dispatch keys that will + * never be used on mobile. + */ + int getDispatchTableIndexForDispatchKeySet() const { + auto dk = highestPriorityTypeId(); + switch (dk) { + case DispatchKey::Undefined: + return 0; + case DispatchKey::CPU: + return 1; + case DispatchKey::QuantizedCPU: + return 2; + case DispatchKey::SparseCPU: + return 3; + case DispatchKey::BackendSelect: + return 4; + case DispatchKey::ADInplaceOrView: + return 5; + case DispatchKey::AutogradOther: + return 6; + case DispatchKey::AutogradCPU: + return 7; + default: + return -1; + } + } +#else + // returns the index in the operator table of highest priority key in the the + // keyset Note that we could in theory implement this using + // highestPriorityTypeId(), but this code is very hotpath and we can do it + // faster without it. + int getDispatchTableIndexForDispatchKeySet() const { + auto functionality_idx = + DispatchKeySet(repr_ >> num_backends).indexOfHighestBit(); + auto offset_and_mask = offsetsAndMasks()[functionality_idx]; + // Mask the functionality bits out first, then right-shift by 1. + // right-shifting by 1 because everything is zero-indexed. + // E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should + // give us an offset of 1, etc. + auto backend_idx = + DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit(); + return offset_and_mask.offset + backend_idx; + } +#endif + + // returns the "index" of the highest priority backend in the keyset. + // This is pretty similar to getBackendKey(), but: + // - It's hotpath code (part of the runtime bitset calculation) + // - I's returns an integer index, not an enum value + // - Everything is shifted to the right by 1. + // BackendComponent::InvalidBit is technically the lowest enum value, + // but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2, + // etc. + uint64_t getBackendIndex() const { + return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit(); } private: @@ -130,42 +487,53 @@ class DispatchKeySet final { uint64_t repr_ = 0; public: - // STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the - // set. The iterator is only invalidated by the destruction of the underlying - // DispatchKeySet as the iterator stores a pointer to the raw representation - // of the DispatchKeySet. 
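// A self-contained toy model of the offset/mask bookkeeping used by
// initializeFunctionalityOffsetsAndMasks() and the non-mobile
// getDispatchTableIndexForDispatchKeySet() above. The constants here
// (3 backends, 3 functionalities) are made up for illustration and are NOT
// the real c10 layout.
#include <array>
#include <cstdint>

// Toy layout: 3 backend bits (CPU = bit 0, CUDA = bit 1, XLA = bit 2) with the
// functionality bits stacked above them (Dense = bit 3, Autograd = bit 4,
// Batched = bit 5). Dense and Autograd are per-backend; Batched is not.
constexpr int kNumBackends = 3;
constexpr uint16_t kBackendMask = (1u << kNumBackends) - 1;

struct ToyOffsetAndMask {
  uint16_t offset;
  uint16_t mask;
};

// Index 0 is Undefined. Per-backend functionalities reserve kNumBackends
// consecutive slots; everything else reserves one slot, exactly like the
// initialization loop above:
//   Undefined -> slot 0, Dense -> slots 1..3, Autograd -> slots 4..6,
//   Batched -> slot 7.
constexpr std::array<ToyOffsetAndMask, 4> kToyOffsets = {{
    {0, 0}, // Undefined
    {1, kBackendMask}, // Dense (per-backend)
    {4, kBackendMask}, // Autograd (per-backend)
    {7, 0}, // Batched (single slot)
}};

// 1-based index of the highest set bit, 0 if no bit is set (mirrors
// indexOfHighestBit()).
int toy_highest_bit(uint64_t x) {
  int i = 0;
  while (x) {
    ++i;
    x >>= 1;
  }
  return i;
}

// Same arithmetic as the non-mobile getDispatchTableIndexForDispatchKeySet():
// functionality offset + zero-based backend index.
int toy_table_index(uint64_t repr) {
  int functionality_idx = toy_highest_bit(repr >> kNumBackends);
  ToyOffsetAndMask entry = kToyOffsets[functionality_idx];
  int backend_idx = toy_highest_bit((repr & entry.mask) >> 1);
  return entry.offset + backend_idx;
}

// e.g. "AutogradCUDA" = Autograd bit | CUDA bit = (1 << 4) | (1 << 1), so
// toy_table_index((1 << 4) | (1 << 1)) == 5: the Autograd block occupies
// slots 4..6 and CUDA is the second backend within that block.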
+ // STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys + // in the set. The iterator is only invalidated by the destruction of the + // underlying DispatchKeySet as the iterator stores a pointer to the raw + // representation of the DispatchKeySet. Note: When we encounter a per-backend + // functionality (e.g. Dense or Sparse), we will iterate through EVERY backend + // in the keyset, for that functionality. For example, if the next + // functionality key to iterate over is Autograd, and the backend bits in the + // keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit], + // then the next two keys we return will be DispatchKey::AutogradCPU, + // DispatchKey::AutogradCUDA (CPU first because it has lower precedence than + // CUDA in DispatchKey.h). class iterator { public: using self_type = iterator; using iterator_category = std::input_iterator_tag; using value_type = DispatchKey; using difference_type = ptrdiff_t; - - explicit iterator(const uint64_t* data_ptr, uint8_t i = 0) - : data_ptr_(data_ptr), i_(i) { + // final mask value should mask out the entire keyset + static const uint8_t end_iter_mask_val = + num_backends + num_functionality_keys; + // final key value should be the last DispatchKey + static const uint8_t end_iter_key_val = num_functionality_keys; + + // current_dispatchkey_idx_ will iterate through all functionality bits. + // current_backendcomponent_idx_ will iterate through all backend bits. + explicit iterator( + const uint64_t* data_ptr, + uint8_t next_functionality = num_backends, + uint8_t next_backend = 0) + : data_ptr_(data_ptr), + next_functionality_(next_functionality), + next_backend_(next_backend), + // These are in an invalid state at construction time, and set by the + // first increment call + current_dispatchkey_idx_(end_iter_key_val), + current_backendcomponent_idx_(end_iter_key_val) { // Go to the first key in the set + TORCH_INTERNAL_ASSERT( + next_functionality_ >= num_backends, + "num_backends=", + static_cast(num_backends), + "next_functionality_=", + static_cast(next_functionality_)); ++(*this); } - self_type& operator++() { - TORCH_INTERNAL_ASSERT( - i_ <= static_cast(DispatchKey::NumDispatchKeys)); - - // Create a masked version of the set representation to ignore previous - // keys that we've iterated through. 
- uint64_t masked_data = llvm::maskTrailingZeros(i_) & *data_ptr_; - uint64_t firstKeyIndex = llvm::findFirstSet(masked_data); - - // If there are no keys, set to end iterator value - if (firstKeyIndex == std::numeric_limits::max() || - i_ == static_cast(DispatchKey::NumDispatchKeys)) { - i_ = static_cast(DispatchKey::NumDispatchKeys); - return *this; - } - - i_ = static_cast(firstKeyIndex) + 1; - return *this; - } + C10_API self_type& operator++(); self_type operator++(int) { self_type previous_iterator = *this; @@ -174,18 +542,50 @@ class DispatchKeySet final { } bool operator==(const self_type& rhs) const { - return i_ == rhs.i_; + return next_functionality_ == rhs.next_functionality_ && + current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ && + next_backend_ == rhs.next_backend_ && + current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_; } bool operator!=(const self_type& rhs) const { - return i_ != rhs.i_; + return next_functionality_ != rhs.next_functionality_ || + current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ || + next_backend_ != rhs.next_backend_ || + current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_; } DispatchKey operator*() const { - return static_cast(i_); + auto functionality_key = + static_cast(current_dispatchkey_idx_); + if (isPerBackendFunctionalityKey(functionality_key)) { + auto next_key = toRuntimePerBackendFunctionalityKey( + functionality_key, + static_cast(current_backendcomponent_idx_)); + // We expect all of the Dense, Sparse, Quantized, and Autograd keys to + // be ordered the same way with respect to their backends + TORCH_INTERNAL_ASSERT( + toBackendComponent(next_key) == + static_cast(current_backendcomponent_idx_), + "Tried to map functionality key ", + toString(functionality_key), + " and backend bit ", + toString( + static_cast(current_backendcomponent_idx_)), + " to a runtime key, but ended up with ", + toString(next_key), + ". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.", + " Please double check that enum for inconsistencies."); + return next_key; + } else { + return functionality_key; + } } private: const uint64_t* data_ptr_; - uint8_t i_; + uint8_t next_functionality_; + uint8_t next_backend_; + uint8_t current_dispatchkey_idx_; + uint8_t current_backendcomponent_idx_; }; public: @@ -195,37 +595,42 @@ class DispatchKeySet final { return iterator(&repr_); } - // We do not need to iterate beyond NumDispatchKeys so we will treat this as - // the end iterator. NumDispatchKeys will always be strictly less than 64. + // We do not need to iterate beyond EndOfFunctionalityKeys so we will treat + // this as the end iterator. iterator end() const { - return iterator(&repr_, static_cast(DispatchKey::NumDispatchKeys)); + return iterator(&repr_, iterator::end_iter_mask_val); } }; C10_API std::string toString(DispatchKeySet); C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet); -// autograd_dispatch_keyset should include all runtime autograd keys. -// Alias key DispatchKey::Autograd maps to autograd_dispatch_keyset. 
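// A brief sketch of the per-backend expansion performed by the iterator
// described above; the free function name is illustrative.
#include <c10/core/DispatchKeySet.h>
#include <vector>

void iterator_expansion_example() {
  using namespace c10;
  // One per-backend functionality bit plus two backend bits...
  DispatchKeySet ks = DispatchKeySet(DispatchKey::AutogradFunctionality) |
      DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit});
  // ...iterates as two runtime keys, expanded per backend:
  // {DispatchKey::AutogradCPU, DispatchKey::AutogradCUDA}.
  std::vector<DispatchKey> runtime_keys(ks.begin(), ks.end());
  (void)runtime_keys;
}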
+C10_API inline int getDispatchTableIndexForDispatchKey(DispatchKey k) { + return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet(); +} + +// Alias key DispatchKey::Autograd maps to +// (autograd_dispatch_keyset x full_backend_mask) // NB: keys in this set also get associated with CompositeImplicitAutograd +// +// Note [autograd_dispatch_keyset Does Not Include Backend Bits] +// We don't want to include any backend bits (BackendComponent::CPUBit, etc) +// directly in autograd_dispatch_keyset. +// Why? keysets like autograd_dispatch_keyset are commonly used to remove +// autograd keys from a DispatchKeySet throughout the code base. However, you +// are only allowed to remove functionality bits from a keyset, not backend +// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality +// Keys] for details. To be consistent and avoid confusion, we're explicitly +// setting up autograd_dispatch_keyset to not have any backend bits. constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({ - DispatchKey::AutogradCPU, - DispatchKey::AutogradCUDA, - DispatchKey::AutogradXLA, - DispatchKey::AutogradLazy, - DispatchKey::AutogradNestedTensor, - DispatchKey::AutogradMLC, - DispatchKey::AutogradHPU, - DispatchKey::AutogradXPU, - DispatchKey::AutogradPrivateUse1, - DispatchKey::AutogradPrivateUse2, - DispatchKey::AutogradPrivateUse3, + DispatchKey::AutogradFunctionality, DispatchKey::AutogradOther, }); constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({ DispatchKey::AutocastCPU, DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, }); // See Note [TLS Initialization] @@ -237,32 +642,48 @@ constexpr DispatchKeySet default_included_set = DispatchKeySet({ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ DispatchKey::AutocastCPU, DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, }); constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr DispatchKeySet python_ks = DispatchKeySet({ + DispatchKey::Python, + DispatchKey::PythonTLSSnapshot, +}); + +constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); + +constexpr DispatchKeySet sparse_csr_ks = + DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA}); + +constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU); + // backend dispatch keys that map to DispatchKey::AutogradOther // NB: keys in this set also get associated with CompositeImplicitAutograd -constexpr DispatchKeySet autogradother_backends = DispatchKeySet( - {DispatchKey::HIP, - DispatchKey::VE, - DispatchKey::FPGA, - DispatchKey::ORT, - DispatchKey::Vulkan, - DispatchKey::Metal, - DispatchKey::QuantizedCPU, - DispatchKey::QuantizedCUDA, - DispatchKey::CustomRNGKeyId, - DispatchKey::MkldnnCPU, - DispatchKey::SparseCPU, - DispatchKey::SparseCUDA, - DispatchKey::SparseHIP, - DispatchKey::SparseVE, - DispatchKey::SparseXPU, - DispatchKey::SparseCsrCPU, - DispatchKey::SparseCsrCUDA, - DispatchKey::Meta}); +constexpr DispatchKeySet autogradother_backends = + DispatchKeySet( + // HIP and VE aren't in this list: they now have their own backend bits + // which means that they can now have their own Autograd keys. + // Technically, HIP will now redispatch to its own custom AutogradHIP + // slot in the runtime table. 
+ {DispatchKey::FPGA, + DispatchKey::ORT, + DispatchKey::Vulkan, + DispatchKey::Metal, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, + DispatchKey::CustomRNGKeyId, + DispatchKey::MkldnnCPU, + DispatchKey::Meta, + // Sparse and Quantized backends also live here. + DispatchKey::Sparse, + DispatchKey::Quantized}) + // Including the backend bits because this keyset is used during op + // registration, which requires looping over all runtime autogradother + // backend keys. + | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); // The set of dispatch keys that come after autograd // n.b. this relies on the fact that AutogradOther is currently the lowest @@ -292,6 +713,57 @@ constexpr DispatchKeySet after_func_keyset = // away with it by explicitly removing the key here. c10::DispatchKey::ADInplaceOrView); +constexpr DispatchKeySet backend_bitset_mask = + DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1); + +constexpr auto inplace_or_view_ks = + DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU); +constexpr auto autograd_ipu_ks = DispatchKeySet(DispatchKey::AutogradIPU); +constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU); +constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA); +constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA); +constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy); +constexpr auto autograd_mps_ks = DispatchKeySet(DispatchKey::AutogradMPS); +constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU); +constexpr auto autograd_privateuse1_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse1); +constexpr auto autograd_privateuse2_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse2); +constexpr auto autograd_privateuse3_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse3); +constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther); + +// keyset correpsonding to functorch keys that have their own dedicated +// TensorImpl subclass. +constexpr auto functorch_transforms_ks = DispatchKeySet( + {DispatchKey::FuncTorchBatched, + DispatchKey::FuncTorchVmapMode, + DispatchKey::Batched, + DispatchKey::VmapMode, + DispatchKey::FuncTorchGradWrapper}); + +// This keyset has: +// (1) the functionality bits corresponding to backends (dense, sparse, +// quantized) (2) all of the backend bits set +constexpr DispatchKeySet backend_functionality_keys = + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + }) | + DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); + +struct OpTableOffsetAndMask { + uint16_t offset; + uint16_t backend_mask; +}; + +static_assert( + num_backends <= 16, + "Right now we expect the number of backends not to exceed 16. In the (unlikely) event" + " that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too."); + // true if t is a backend dispatch key C10_API bool isBackendDispatchKey(DispatchKey t); @@ -307,10 +779,65 @@ C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k); C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); // Returns a DispatchKeySet of autograd related keys mapped to backend. -C10_API DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t); +// for a given backend key, use the associated autograd key. +// for non-backend keys, use AutogradOther as a default. 
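// A usage sketch for the mapping described above; the free function name is
// illustrative, and BackendComponent::InvalidBit stands in for "no
// customizable backend bit".
#include <c10/core/DispatchKeySet.h>

void autograd_keys_from_backend_example() {
  using namespace c10;
  // A customizable backend bit maps to its dedicated autograd key (plus
  // ADInplaceOrView)...
  DispatchKeySet cpu_ks =
      getAutogradRelatedKeySetFromBackend(BackendComponent::CPUBit);
  bool a = cpu_ks.has(DispatchKey::AutogradCPU); // true
  bool b = cpu_ks.has(DispatchKey::ADInplaceOrView); // true
  // ...while everything else falls through to AutogradOther (the default
  // branch below).
  DispatchKeySet other_ks =
      getAutogradRelatedKeySetFromBackend(BackendComponent::InvalidBit);
  bool c = other_ks.has(DispatchKey::AutogradOther); // true
  (void)a;
  (void)b;
  (void)c;
}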
+// Note: it's convenient and fast to return a default here rather than (say) +// returning an optional, or throwing. But it makes callers +// responsible for either a) enforcing the invariant that only backend keys +// be passed as arguments, or b) interpreting our return value carefully. +inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return inplace_or_view_ks | autograd_cpu_ks; + case BackendComponent::IPUBit: + return inplace_or_view_ks | autograd_ipu_ks; + case BackendComponent::XPUBit: + return inplace_or_view_ks | autograd_xpu_ks; + case BackendComponent::CUDABit: + return inplace_or_view_ks | autograd_cuda_ks; + case BackendComponent::XLABit: + return inplace_or_view_ks | autograd_xla_ks; + case BackendComponent::LazyBit: + return inplace_or_view_ks | autograd_lazy_ks; + case BackendComponent::MPSBit: + return inplace_or_view_ks | autograd_mps_ks; + case BackendComponent::HPUBit: + return inplace_or_view_ks | autograd_hpu_ks; + case BackendComponent::PrivateUse1Bit: + return inplace_or_view_ks | autograd_privateuse1_ks; + case BackendComponent::PrivateUse2Bit: + return inplace_or_view_ks | autograd_privateuse2_ks; + case BackendComponent::PrivateUse3Bit: + return inplace_or_view_ks | autograd_privateuse3_ks; + default: + return inplace_or_view_ks | autograd_other_ks; + } +} // Returns a DispatchKeySet of autocast related keys mapped to backend. -C10_API DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t); +inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) { + constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU); + constexpr auto autocast_xpu_ks = DispatchKeySet(DispatchKey::AutocastXPU); + constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA); + switch (t) { + case BackendComponent::CPUBit: + return autocast_cpu_ks; + case BackendComponent::XPUBit: + return autocast_xpu_ks; + case BackendComponent::CUDABit: + case BackendComponent::XLABit: + return autocast_cuda_ks; + default: + return DispatchKeySet(); + } +} + +// returns the "backend" DispatchKey of highest priority in the set. +// This is basically like highestBackendKey(), except that we have some +// "functionality" bits that correspond to backends (Sparse, Quantized) +inline DispatchKey highestPriorityBackendTypeId(DispatchKeySet ks) { + return (ks & backend_functionality_keys).highestPriorityTypeId(); +} // This API exists because we have a use case for checking // getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) @@ -329,7 +856,8 @@ static inline DispatchKey legacyExtractDispatchKey(DispatchKeySet s) { // here. 
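// For example (illustrative): a tensor keyset such as
//   {Dense, AutogradFunctionality, ADInplaceOrView, CPUBit}
// yields DispatchKey::CPU here, because the autograd, ADInplaceOrView,
// autocast, and Python-related functionality bits are stripped before taking
// the highest-priority key.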
At the moment, autograd keys and ADInplaceOrView key need this // treatment; return (s - autograd_dispatch_keyset_with_ADInplaceOrView - - autocast_dispatch_keyset) + autocast_dispatch_keyset - + DispatchKeySet({DispatchKey::PythonTLSSnapshot, DispatchKey::Python})) .highestPriorityTypeId(); } diff --git a/c10/core/GeneratorImpl.cpp b/c10/core/GeneratorImpl.cpp index 4022b150d084..e2876bf9a1cf 100644 --- a/c10/core/GeneratorImpl.cpp +++ b/c10/core/GeneratorImpl.cpp @@ -43,7 +43,7 @@ namespace detail { * Note this is a legacy method (from THRandom.cpp) * FIXME: use std::random_device with entropy information */ -#if !defined(_WIN32) && !defined(__XROS__) +#if !defined(_WIN32) static uint64_t readURandomLong() { int randDev = open("/dev/urandom", O_RDONLY); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -56,7 +56,7 @@ static uint64_t readURandomLong() { close(randDev); return randValue; } -#endif // _WIN32 && __XROS__ +#endif // _WIN32 /** * Gets a non deterministic random number number from either the @@ -82,9 +82,6 @@ uint64_t getNonDeterministicRandom(bool is_cuda) { s = (uint64_t)std::chrono::high_resolution_clock::now() .time_since_epoch() .count(); -#elif defined(__XROS__) - std::random_device rd; - s = ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF; #elif defined(__SGX_ENABLED__) TORCH_CHECK( sgx_read_rand(reinterpret_cast(&s), sizeof(s)) == SGX_SUCCESS, diff --git a/c10/core/Layout.h b/c10/core/Layout.h index f37ceb18a835..0ac72439b7f0 100644 --- a/c10/core/Layout.h +++ b/c10/core/Layout.h @@ -6,12 +6,24 @@ #include namespace c10 { -enum class Layout : int8_t { Strided, Sparse, SparseCsr, Mkldnn, NumOptions }; +enum class Layout : int8_t { + Strided, + Sparse, + SparseCsr, + Mkldnn, + SparseCsc, + SparseBsr, + SparseBsc, + NumOptions +}; constexpr auto kStrided = Layout::Strided; constexpr auto kSparse = Layout::Sparse; constexpr auto kSparseCsr = Layout::SparseCsr; constexpr auto kMkldnn = Layout::Mkldnn; +constexpr auto kSparseCsc = Layout::SparseCsc; +constexpr auto kSparseBsr = Layout::SparseBsr; +constexpr auto kSparseBsc = Layout::SparseBsc; inline Layout layout_from_backend(Backend backend) { switch (backend) { @@ -25,7 +37,9 @@ inline Layout layout_from_backend(Backend backend) { return Layout::Mkldnn; case Backend::SparseCsrCPU: case Backend::SparseCsrCUDA: - return Layout::SparseCsr; + TORCH_CHECK( + false, + "Cannot map Backend SparseCsrCPU|SparseCsrCUDA to a unique layout."); default: return Layout::Strided; } @@ -39,6 +53,12 @@ inline std::ostream& operator<<(std::ostream& stream, at::Layout layout) { return stream << "Sparse"; case at::kSparseCsr: return stream << "SparseCsr"; + case at::kSparseCsc: + return stream << "SparseCsc"; + case at::kSparseBsr: + return stream << "SparseBsr"; + case at::kSparseBsc: + return stream << "SparseBsc"; case at::kMkldnn: return stream << "Mkldnn"; default: diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h index 8cafde1b5c5e..a4dfd1e87ebe 100644 --- a/c10/core/MemoryFormat.h +++ b/c10/core/MemoryFormat.h @@ -29,7 +29,8 @@ enum class MemoryFormat : int8_t { Contiguous, Preserve, ChannelsLast, - ChannelsLast3d + ChannelsLast3d, + NumOptions }; // If you are seeing this, it means that this call site was not checked if @@ -54,7 +55,7 @@ inline std::ostream& operator<<( case MemoryFormat::ChannelsLast3d: return stream << "ChannelsLast3d"; default: - TORCH_CHECK(false, "Unknown memory format"); + TORCH_CHECK(false, "Unknown memory format ", memory_format); } } diff --git a/c10/core/QEngine.h b/c10/core/QEngine.h 
index ac092193d921..60c21361f15f 100644 --- a/c10/core/QEngine.h +++ b/c10/core/QEngine.h @@ -15,11 +15,13 @@ enum class QEngine : uint8_t { NoQEngine = 0, FBGEMM = 1, QNNPACK = 2, + ONEDNN = 3, }; constexpr auto kNoQEngine = QEngine::NoQEngine; constexpr auto kFBGEMM = QEngine::FBGEMM; constexpr auto kQNNPACK = QEngine::QNNPACK; +constexpr auto kONEDNN = QEngine::ONEDNN; inline std::string toString(QEngine qengine) { switch (qengine) { @@ -29,6 +31,8 @@ inline std::string toString(QEngine qengine) { return "FBGEMM"; case kQNNPACK: return "QNNPACK"; + case kONEDNN: + return "ONEDNN"; default: TORCH_CHECK( false, "Unrecognized Quantized Engine: ", static_cast(qengine)); diff --git a/c10/core/SafePyObject.cpp b/c10/core/SafePyObject.cpp new file mode 100644 index 000000000000..d8c3da49ffb1 --- /dev/null +++ b/c10/core/SafePyObject.cpp @@ -0,0 +1,11 @@ +#include +#include + +namespace c10 { + +PyObject* SafePyObject::ptr(const c10::impl::PyInterpreter* interpreter) const { + TORCH_INTERNAL_ASSERT(interpreter == pyinterpreter_); + return data_; +} + +} // namespace c10 diff --git a/c10/core/SafePyObject.h b/c10/core/SafePyObject.h new file mode 100644 index 000000000000..13e32da3dc1d --- /dev/null +++ b/c10/core/SafePyObject.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include + +namespace c10 { + +// This is an safe owning holder for a PyObject, akin to pybind11's +// py::object, with two major differences: +// +// - It is in c10/core; i.e., you can use this type in contexts where +// you do not have a libpython dependency +// +// - It is multi-interpreter safe (ala torchdeploy); when you fetch +// the underlying PyObject* you are required to specify what the current +// interpreter context is and we will check that you match it. +// +// It is INVALID to store a reference to a Tensor object in this way; +// you should just use TensorImpl directly in that case! +struct C10_API SafePyObject { + // Steals a reference to data + SafePyObject(PyObject* data, c10::impl::PyInterpreter* pyinterpreter) + : data_(data), pyinterpreter_(pyinterpreter) {} + + // In principle this could be copyable if we add an incref to PyInterpreter + // but for now it's easier to just disallow it. 
+ SafePyObject(SafePyObject const&) = delete; + SafePyObject& operator=(SafePyObject const&) = delete; + + ~SafePyObject() { + pyinterpreter_->decref(data_, /*is_tensor*/ false); + } + + c10::impl::PyInterpreter* pyinterpreter() const { + return pyinterpreter_; + } + PyObject* ptr(const c10::impl::PyInterpreter*) const; + + private: + PyObject* data_; + c10::impl::PyInterpreter* pyinterpreter_; +}; + +} // namespace c10 diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h index 08bf95e1875d..295d1006ff29 100644 --- a/c10/core/Scalar.h +++ b/c10/core/Scalar.h @@ -67,13 +67,17 @@ class C10_API Scalar { } // TODO: Support ComplexHalf accessor - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_ACCESSOR) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ACCESSOR) // also support scalar.to(); // Deleted for unsupported types, but specialized below for supported types template T to() const = delete; + const void* data_ptr() const { + return static_cast(&v); + } + #undef DEFINE_ACCESSOR bool isFloatingPoint() const { return Tag::HAS_d == tag; @@ -201,7 +205,7 @@ using OptionalScalarRef = c10::OptionalRef; inline T Scalar::to() const { \ return to##name(); \ } -AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_TO) #undef DEFINE_TO } // namespace c10 diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index a32d4aa42151..0728e67ef2f0 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -1,9 +1,8 @@ #pragma once -#include #include +#include #include -#include #include #include #include @@ -63,6 +62,21 @@ namespace c10 { _(bool, Bool) \ _(at::BFloat16, BFloat16) +#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(at::Half, Half) \ + _(float, Float) \ + _(double, Double) \ + _(c10::complex, ComplexHalf) \ + _(c10::complex, ComplexFloat) \ + _(c10::complex, ComplexDouble) \ + _(bool, Bool) \ + _(at::BFloat16, BFloat16) + enum class ScalarType : int8_t { #define DEFINE_ENUM(_1, n) n, AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ENUM) @@ -307,7 +321,7 @@ static inline bool isUnderlying(ScalarType type, ScalarType qtype) { return type == toUnderlying(qtype); } -static inline ScalarType toValueType(ScalarType t) { +static inline ScalarType toRealValueType(ScalarType t) { switch (t) { case ScalarType::ComplexHalf: return ScalarType::Half; @@ -402,28 +416,28 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) { toString(b)); } - // this matrix has to be consistent with AT_FORALL_SCALAR_TYPES_WITH_COMPLEX - // so that's why we have to add undefined as we are not sure what is the - // corrent values for the type promotions in complex type cases. + // this matrix has to be consistent with + // AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS undefined is used where we + // are not sure about the correct value for type promotion. 
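// A quick sketch of what the relaxed promotion matrix below yields once
// ComplexHalf participates in promotion; the free function name is
// illustrative and the expected results are read directly off the new table.
#include <c10/core/ScalarType.h>

void complex_promotion_example() {
  using c10::promoteTypes;
  using c10::ScalarType;
  auto a = promoteTypes(ScalarType::Float, ScalarType::ComplexHalf); // ComplexFloat
  auto b = promoteTypes(ScalarType::BFloat16, ScalarType::ComplexHalf); // ComplexFloat
  auto c = promoteTypes(ScalarType::Bool, ScalarType::ComplexHalf); // ComplexHalf
  (void)a;
  (void)b;
  (void)c;
}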
static constexpr ScalarType _promoteTypesLookup[static_cast( ScalarType::NumOptions)][static_cast(ScalarType::NumOptions)] = { /* u1 i1 i2 i4 i8 f2 f4 f8 c2 c4 c8 b1 q1 q2 q3 bf*/ - /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, ud, c4, c8, u1, ud, ud, ud, bf}, - /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, ud, c4, c8, i1, ud, ud, ud, bf}, - /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, ud, c4, c8, i2, ud, ud, ud, bf}, - /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, ud, c4, c8, i4, ud, ud, ud, bf}, - /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, ud, c4, c8, i8, ud, ud, ud, bf}, - /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, ud, c4, c8, f2, ud, ud, ud, f4}, - /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, ud, c4, c8, f4, ud, ud, ud, f4}, - /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, ud, c8, c8, f8, ud, ud, ud, f8}, - /* c2 */ {ud, ud, ud, ud, ud, ud, ud, ud, c2, c4, c8, ud, ud, ud, ud, ud}, + /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, ud, ud, ud, bf}, + /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, ud, ud, ud, bf}, + /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, ud, ud, ud, bf}, + /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, ud, ud, ud, bf}, + /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, ud, ud, ud, bf}, + /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, ud, ud, ud, f4}, + /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, ud, ud, ud, f4}, + /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, ud, ud, ud, f8}, + /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, ud, ud, ud, c4}, /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, ud, ud, ud, c4}, /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, ud, ud, ud, c8}, - /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, ud, c4, c8, b1, ud, ud, ud, bf}, + /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, ud, ud, ud, bf}, /* q1 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud}, /* q2 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud}, /* q3 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud}, - /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, ud, c4, c8, bf, ud, ud, ud, bf}, + /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, ud, ud, ud, bf}, }; return _promoteTypesLookup[static_cast(a)][static_cast(b)]; } diff --git a/c10/core/ScalarTypeToTypeMeta.h b/c10/core/ScalarTypeToTypeMeta.h index 6d4946b29bc3..910e0d24b0a3 100644 --- a/c10/core/ScalarTypeToTypeMeta.h +++ b/c10/core/ScalarTypeToTypeMeta.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include // these just expose TypeMeta/ScalarType bridge functions in c10 diff --git a/c10/core/Storage.h b/c10/core/Storage.h index 11c7d396fa22..d00b644b10d5 100644 --- a/c10/core/Storage.h +++ b/c10/core/Storage.h @@ -14,7 +14,7 @@ struct C10_API Storage { // Allocates memory buffer using given allocator and creates a storage with it Storage( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, Allocator* allocator = nullptr, bool resizable = false) @@ -28,7 +28,7 @@ struct C10_API Storage { // potential future reallocations, however it can be nullptr if the storage // is non-resizable Storage( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, at::DataPtr data_ptr, at::Allocator* allocator = nullptr, diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index f90eafee5418..cc167927229a 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -35,7 +35,7 @@ struct C10_API StorageImpl : 
public c10::intrusive_ptr_target { struct use_byte_size_t {}; StorageImpl( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, at::DataPtr data_ptr, at::Allocator* allocator, @@ -52,7 +52,7 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { } StorageImpl( - use_byte_size_t use_byte_size, + use_byte_size_t /*use_byte_size*/, size_t size_bytes, at::Allocator* allocator, bool resizable) diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp new file mode 100644 index 000000000000..d09135a3389b --- /dev/null +++ b/c10/core/SymInt.cpp @@ -0,0 +1,18 @@ + +#include +#include + +namespace c10 { + +std::shared_ptr SymInt::toSymbolicIntNode() { + auto& st = getSymIntTable(); + TORCH_CHECK(is_symbolic()); + return st.getNode(SymInt::SYM_TAG_MASK ^ static_cast(data_)); +} + +c10::SymInt SymInt::toSymInt(std::shared_ptr sin_sp) { + auto& sit = getSymIntTable(); + auto data = sit.addNode(sin_sp) | SYM_TAG_MASK; + return c10::SymInt(data); +} +} // namespace c10 diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h new file mode 100644 index 000000000000..d189a98b042d --- /dev/null +++ b/c10/core/SymInt.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include + +namespace c10 { + +class SymbolicIntNode; + +// `SymInt` is a C++ wrapper class around int64_t data_ which and is used to +// represent concrete dimension values. +// +// `SymInt` is also a data type in Pytorch that can be used in function schemas +// to enable tracing. +// +// `SymInt` is introduced to enable tracing arithmetic +// operations on symbolic integers (e.g. sizes). Tracing symbolic sizes will +// allow LTC and AOTAutograd representing dynamic shapes in expression graphs +// faithfully without baking in concrete dimension values. +// +// To trace the operations, SymInt will overload arithmetic operators (e.g. +, +// -, *) and will provide overloads taking SymInt for commonly used math +// functions. +// +// SymInt will be extenteded to represent a union structure Union[int64_t, +// SymbolicIntNode*] which will be implemented as a single packed int64_t field +// named data_. +// +// data_ can be either a plain int64_t or (1 << 63 | `index`). `index` points to +// SymbolicIntNode* that will be responsible for constructing an IR node for +// a traced operation to represent it in LTC or Fx graphs. 
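// A minimal sketch of the concrete (non-symbolic) path through the class
// defined below; the free function name is illustrative.
#include <c10/core/SymInt.h>

void symint_example() {
  c10::SymInt a(2), b(3);
  c10::SymInt c = a + b; // plain integer arithmetic, no tracing involved
  bool sym = c.is_symbolic(); // false: the tag bit is not set
  int64_t v = c.expect_int(); // 5
  (void)sym;
  (void)v;
}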
+class C10_API SymInt { + public: + explicit SymInt(int64_t d) : data_(d){}; + + int64_t expect_int() const { + TORCH_CHECK(!is_symbolic()); + return data_; + } + + bool is_symbolic() const { + return static_cast(SYM_TAG_MASK) & + static_cast(this->data_); + } + + bool operator==(const SymInt& p2) const { + return data_ == p2.data_; + } + + SymInt operator+(SymInt sci) const { + TORCH_CHECK( + !this->is_symbolic() && !sci.is_symbolic(), + "Symbolic Add isn't supported yet"); + return SymInt(data_ + sci.data_); + } + + std::shared_ptr toSymbolicIntNode(); + static c10::SymInt toSymInt(std::shared_ptr sin); + + // This is needed for interoperability with IValue + int64_t data() const { + return data_; + } + + private: + const static int64_t SYM_TAG_MASK = 1LL << 63; + int64_t data_; +}; + +C10_API std::ostream& operator<<(std::ostream& os, SymInt s); +} // namespace c10 diff --git a/c10/core/SymIntArrayRef.cpp b/c10/core/SymIntArrayRef.cpp new file mode 100644 index 000000000000..1ac65c455be0 --- /dev/null +++ b/c10/core/SymIntArrayRef.cpp @@ -0,0 +1,23 @@ +#include +#include + +namespace c10 { + +at::IntArrayRef expectIntArrayRef(c10::SymIntArrayRef ar) { + for (c10::SymInt sci : ar) { + TORCH_CHECK(!sci.is_symbolic()); + } + + return IntArrayRef(reinterpret_cast(ar.data()), ar.size()); +} + +std::ostream& operator<<(std::ostream& os, SymInt s) { + os << "SymInt(" << s.data() << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& out, const c10::SymIntArrayRef& list) { + return out << list.wrapped_symint_array_ref; +} + +} // namespace c10 diff --git a/c10/core/SymIntArrayRef.h b/c10/core/SymIntArrayRef.h new file mode 100644 index 000000000000..f7d3367dbd95 --- /dev/null +++ b/c10/core/SymIntArrayRef.h @@ -0,0 +1,183 @@ +// This file defines `SymIntArrayRef` which serves as the view onto +// std::vector. This class is conceptually and mostly functionally +// equivalent to ArrayRef. +// +// However, ArrayRef can't be used directly as it introduces ambiguity +// in the following cases: +// - a.expand({1, 2, 3}) matches two overloads: +// 1. `at::Tensor Tensor::expand(c10::SymIntArrayRef size, bool implicit)` +// 2. `at::Tensor Tensor::expand(at::IntArrayRef size, bool implicit)` +// Introducing `SymIntArrayRef` allows to have a finer-grained control over +// which overload will be used. + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { +/// SymIntArrayRef - Represent a constant reference to an array (0 or more +/// elements consecutively in memory), i.e. a start pointer and a length. It +/// allows various APIs to take consecutive elements easily and conveniently. +/// +/// This class does not own the underlying data, it is expected to be used in +/// situations where the data resides in some other buffer, whose lifetime +/// extends past that of the SymIntArrayRef. For this reason, it is not in +/// general safe to store an SymIntArrayRef. +/// +/// This is intended to be trivially copyable, so it should be passed by +/// value. + +class SymIntArrayRef final { + public: + using iterator = const c10::SymInt*; + using const_iterator = const c10::SymInt*; + using size_type = size_t; + using value_type = c10::SymInt; + + using reverse_iterator = std::reverse_iterator; + + private: + ArrayRef wrapped_symint_array_ref; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty SymIntArrayRef. 
+ /* implicit */ constexpr SymIntArrayRef() {} + + /* implicit */ SymIntArrayRef(const std::vector& Vec) + : wrapped_symint_array_ref(Vec) {} + + /// Construct an SymIntArrayRef from a pointer and length. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef( + const c10::SymInt* data, + size_t length) + : wrapped_symint_array_ref(data, length) {} + + /// Construct an SymIntArrayRef from a range. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef( + const c10::SymInt* begin, + const c10::SymInt* end) + : wrapped_symint_array_ref(begin, end) {} + + /// Construct an SymIntArrayRef from a C array. + template + /* implicit */ constexpr SymIntArrayRef(const c10::SymInt (&Arr)[N]) + : wrapped_symint_array_ref(Arr) {} + + /// @} + /// @name Simple Operations + /// @{ + + constexpr iterator begin() const { + return wrapped_symint_array_ref.begin(); + } + constexpr iterator end() const { + return wrapped_symint_array_ref.end(); + } + + // These are actually the same as iterator, since SymIntArrayRef only + // gives you const iterators. + constexpr const_iterator cbegin() const { + return wrapped_symint_array_ref.cbegin(); + } + constexpr const_iterator cend() const { + return wrapped_symint_array_ref.cend(); + } + + /// empty - Check if the array is empty. + constexpr bool empty() const { + return size() == 0; + } + + constexpr const c10::SymInt* data() const { + return wrapped_symint_array_ref.data(); + } + + /// size - Get the array size. + constexpr size_t size() const { + return wrapped_symint_array_ref.size(); + } + + /// front - Get the first element. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const c10::SymInt& front() const { + return wrapped_symint_array_ref.front(); + } + + /// back - Get the last element. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const c10::SymInt& back() const { + return wrapped_symint_array_ref.back(); + } + + /// equals - Check for element-wise equality. + constexpr bool equals(SymIntArrayRef RHS) const { + return this->wrapped_symint_array_ref.equals(RHS.wrapped_symint_array_ref); + } + + /// slice(n, m) - Take M elements of the array starting at element N + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef + slice(size_t N, size_t M) const { + return SymIntArrayRef(wrapped_symint_array_ref.data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA SymIntArrayRef slice(size_t N) const { + return slice(N, size() - N); + } + + /// @} + /// @name Operator Overloads + /// @{ + constexpr const c10::SymInt& operator[](size_t Index) const { + return wrapped_symint_array_ref[Index]; + } + + /// Vector compatibility + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const c10::SymInt& at(size_t Index) const { + return wrapped_symint_array_ref.at(Index); + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, SymIntArrayRef>:: + type& + operator=(U&& Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. 
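// A small usage sketch for this view class; the free function name is
// illustrative, and the vector must outlive the view, as documented above.
#include <c10/core/SymIntArrayRef.h>
#include <vector>

void symint_array_ref_example() {
  // A non-owning view over concrete (non-symbolic) sizes...
  std::vector<c10::SymInt> sizes = {c10::SymInt(2), c10::SymInt(3)};
  c10::SymIntArrayRef ref(sizes);
  // ...converts back to a plain IntArrayRef; expectIntArrayRef checks that no
  // element is symbolic before reinterpreting the underlying storage.
  auto concrete = c10::expectIntArrayRef(ref);
  (void)concrete;
}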
+ template + typename std::enable_if::value, SymIntArrayRef>:: + type& + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return wrapped_symint_array_ref.vec(); + } + + friend std::ostream& operator<<( + std::ostream& out, + const SymIntArrayRef& list); + /// @} +}; + +TORCH_API at::IntArrayRef expectIntArrayRef(c10::SymIntArrayRef ar); + +std::ostream& operator<<(std::ostream& out, const c10::SymIntArrayRef& list); + +} // namespace c10 diff --git a/c10/core/SymIntTable.cpp b/c10/core/SymIntTable.cpp new file mode 100644 index 000000000000..c124ed737c25 --- /dev/null +++ b/c10/core/SymIntTable.cpp @@ -0,0 +1,28 @@ +#include + +namespace c10 { + +int64_t SymIntTable::addNode(std::shared_ptr sin) { + std::lock_guard lock(mutex_); + auto index = nodes_.size(); + nodes_.push_back(sin); + return index; +} +std::shared_ptr SymIntTable::getNode(size_t index) { + std::lock_guard lock(mutex_); + TORCH_CHECK(index < nodes_.size()); + return nodes_[index]; +} + +c10::SymInt SymbolicIntNode::toSymInt() { + // We will need to figure out a way + // to dedup nodes + auto sit_sp = this->shared_from_this(); + return SymInt::toSymInt(sit_sp); +} + +SymIntTable& getSymIntTable() { + static SymIntTable sit; + return sit; +} +} // namespace c10 diff --git a/c10/core/SymbolicIntNode.h b/c10/core/SymbolicIntNode.h new file mode 100644 index 000000000000..cf8fb4de6abf --- /dev/null +++ b/c10/core/SymbolicIntNode.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +class C10_API SymbolicIntNode + : public std::enable_shared_from_this { + public: + c10::SymInt toSymInt(); + virtual ~SymbolicIntNode(){}; + virtual std::ostream& operator<<(std::ostream& os) { + return os; + }; +}; + +class C10_API SymIntTable { + public: + int64_t addNode(std::shared_ptr sin); + std::shared_ptr getNode(size_t index); + + private: + std::vector> nodes_; + std::mutex mutex_; +}; + +C10_API SymIntTable& getSymIntTable(); + +} // namespace c10 diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index b83ee395045e..272425d8855e 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -20,43 +20,6 @@ C10_DEFINE_int64( namespace c10 { -namespace impl { - -static std::string noop_name_fn(const PyInterpreter*) { - return ""; -} - -static void noop_decref_fn(const PyInterpreter*, PyObject*, bool) { - // no-op -} - -static c10::intrusive_ptr noop_detach_fn( - const PyInterpreter*, - const TensorImpl*) { - TORCH_INTERNAL_ASSERT( - 0, - "attempted to detach (shallow_copy_and_detach) Tensor with nontrivial PyObject after corresponding interpreter died"); -} - -static void noop_dispatch_fn( - const PyInterpreter*, - const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type) { - TORCH_INTERNAL_ASSERT( - 0, - "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died"); -} - -void PyInterpreter::disarm() noexcept { - name_fn_ = &noop_name_fn; - decref_fn_ = &noop_decref_fn; - detach_fn_ = &noop_detach_fn; - dispatch_fn_ = &noop_dispatch_fn; -} - -} // namespace impl - const char* const TensorImpl::err_msg_tensor_metadata_change_not_allowed = "is not allowed on a Tensor created from .data or .detach().\n" "If your intent is to change the metadata of a Tensor (such as sizes / strides / storage / storage_offset)\n" @@ -120,11 +83,11 @@ TensorImpl::TensorImpl( // [Note: Python key removal] // 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// In most constructors for TensorImpl, you will see Python key is removed from -// the passed in DispatchKeySet. Why? +// In most constructors for TensorImpl, you will see Python and +// PythonTLSSnapshot keys are removed from the passed in DispatchKeySet. Why? // -// INVARIANT: Python dispatch key is set iff PyObject for the Tensor has a -// nontrivial __torch_dispatch__ implementation. +// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject +// for the Tensor has a nontrivial __torch_dispatch__ implementation. // // When a fresh TensorImpl is created, there is *no* PyObject (this only gets // initialized lazily at the first point in time the Tensor passes into Python). @@ -132,8 +95,8 @@ TensorImpl::TensorImpl( // // In practice, what will happen shortly afterwards is that the TensorImpl // will get its PyObject initialized by Tensor._make_subclass; at this point -// the Python dispatch key will be set and all is well. The point is to delay -// the dispatch key setting until that point. +// the Python and PythonTLSSnapshot dispatch keys will be set and all is well. +// The point is to delay the dispatch key setting until that point. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) TensorImpl::TensorImpl( @@ -148,8 +111,7 @@ TensorImpl::TensorImpl( numel_(0), data_type_(data_type), device_opt_(storage_.device()), - key_set_(key_set.remove( - DispatchKey::Python)) { // See [Note: Python key removal] + key_set_(key_set - c10::python_ks) { // See [Note: Python key removal] init_bitfields(); // Inference tensor doesn't have version counter. if (!is_inference()) { @@ -190,12 +152,12 @@ TensorImpl::TensorImpl( // TODO: be more explicit about the full key set at call sites so we // don't have to keep recomputing it here - DispatchKey k = key_set.highestPriorityBackendTypeId(); + auto k = key_set.highestBackendKey(); key_set = key_set | getAutocastRelatedKeySetFromBackend(k); - key_set = - key_set.remove(DispatchKey::Python); // See [Note: Python key removal] + // See [Note: Python key removal] + key_set = key_set - c10::python_ks; // Inference tensor doesn't have autograd related keys. if (inference_mode) { @@ -219,16 +181,6 @@ TensorImpl::TensorImpl( // Caffe2 operators create Storages with default devices. } -#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY -IntArrayRef TensorImpl::sizes() const { - return sizes_and_strides_.sizes_arrayref(); -} -#endif - -IntArrayRef TensorImpl::strides() const { - return sizes_and_strides_.strides_arrayref(); -} - void TensorImpl::HandleResize() { // If needed, we will free the data. the next mutable_data() call // will create the data storage. 
@@ -371,11 +323,11 @@ void TensorImpl::release_resources() { if (storage_) { storage_ = {}; } - if (owns_pyobj_) { + if (owns_pyobj()) { TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr); TORCH_INTERNAL_ASSERT(pyobj_ != nullptr); pyobj_interpreter_.load(std::memory_order_acquire) - ->decref(pyobj_, /*is_tensor*/ true); + ->decref(_unchecked_untagged_pyobj(), /*is_tensor*/ true); // NB: this destructor can only be entered when there are no // references to this C++ object (obviously), NOR any references // to the PyObject (if there are references to the PyObject, @@ -386,22 +338,6 @@ void TensorImpl::release_resources() { } } -#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY -int64_t TensorImpl::dim() const { - return sizes_and_strides_.size(); -} -#endif - -int64_t TensorImpl::size(int64_t d) const { - d = at::maybe_wrap_dim(d, dim(), false); - return sizes_and_strides_.size_at_unchecked(d); -} - -int64_t TensorImpl::stride(int64_t d) const { - d = at::maybe_wrap_dim(d, dim(), false); - return sizes_and_strides_.stride_at_unchecked(d); -} - #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY bool TensorImpl::has_storage() const { return storage_; @@ -413,28 +349,42 @@ void TensorImpl::throw_storage_access_error() const { false, "Cannot access storage of ", tensorimpl_type_name()); } -bool TensorImpl::is_contiguous_nondefault_policy_impl( - at::MemoryFormat memory_format) const { - if (has_contiguity_ == - static_cast(HasContiguityPolicy::ContiguityNotSupported)) { - TORCH_CHECK_NOT_IMPLEMENTED( +bool TensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { + if (is_python_dispatch()) { + auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); + if (interpreter) { + return interpreter->is_contiguous(this); + } + TORCH_CHECK( false, - "Tensors of type ", - tensorimpl_type_name(), - " do not have is_contiguous"); - } else { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - has_contiguity_ == - static_cast(HasContiguityPolicy::CustomBehavior)); - return is_contiguous_custom(memory_format); + "cannot access PyObject for Tensor on interpreter ", + pyobj_interpreter_.load()->name()); } + TORCH_CHECK( + false, + "Tensors of type ", + tensorimpl_type_name(), + " do not have is_contiguous"); } -bool TensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const { - TORCH_INTERNAL_ASSERT( +IntArrayRef TensorImpl::sizes_custom() const { + TORCH_CHECK( + false, "Tensors of type ", tensorimpl_type_name(), " do not have sizes"); +} +IntArrayRef TensorImpl::strides_custom() const { + TORCH_CHECK( false, - "TensorImpl::is_contiguous_custom should never be called; did you " - "set_has_contiguity_policy and forget to override is_contiguous_custom?"); + "Tensors of type ", + tensorimpl_type_name(), + " do not have strides"); +} +int64_t TensorImpl::dim_custom() const { + TORCH_CHECK( + false, "Tensors of type ", tensorimpl_type_name(), " do not have dim"); +} +int64_t TensorImpl::numel_custom() const { + TORCH_CHECK( + false, "Tensors of type ", tensorimpl_type_name(), " do not have numel"); } static void deletePlacementDeleteContext(void* ptr) { @@ -544,18 +494,25 @@ c10::intrusive_ptr TensorImpl::shallow_copy_and_detach( std::move(version_counter), allow_tensor_metadata_change); } -void TensorImpl::copy_tensor_metadata_except_version_counter( +// This function copies all of the metadata from the src tensor except for: +// - key_set_ +// - storage_ +// - storage_access_should_throw_ +// - sizes_strides_policy_ +// - version_counter_ +// - allow_tensor_metadata_change_ +// The idea is that 
if we have a "wrapper tensor" (like in functionalization), +// all of the above are properties that the wrapper will want to customize, +// while everything else should be mirrored between the wrapper and the inner +// tensor. +void TensorImpl::copy_generic_tensor_metadata( const TensorImpl* src_impl, - TensorImpl* dest_impl, - bool allow_tensor_metadata_change) { - dest_impl->storage_ = src_impl->storage_; + TensorImpl* dest_impl) { dest_impl->sizes_and_strides_ = src_impl->sizes_and_strides_; dest_impl->storage_offset_ = src_impl->storage_offset_; dest_impl->data_type_ = src_impl->data_type_; dest_impl->device_opt_ = src_impl->device_opt_; - dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python); dest_impl->is_contiguous_ = src_impl->is_contiguous_; - dest_impl->has_contiguity_ = src_impl->has_contiguity_; dest_impl->is_channels_last_contiguous_ = src_impl->is_channels_last_contiguous_; dest_impl->is_channels_last_3d_contiguous_ = @@ -566,14 +523,32 @@ void TensorImpl::copy_tensor_metadata_except_version_counter( src_impl->is_non_overlapping_and_dense_; dest_impl->is_wrapped_number_ = src_impl->is_wrapped_number_; dest_impl->reserved_ = src_impl->reserved_; - dest_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); - dest_impl->storage_access_should_throw_ = - src_impl->storage_access_should_throw_; if (src_impl->named_tensor_meta_ != nullptr) { dest_impl->named_tensor_meta_ = src_impl->named_tensor_meta_->clone(); } } +void TensorImpl::copy_tensor_metadata_except_version_counter( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + bool allow_tensor_metadata_change) { + // First call the generic copy function + copy_generic_tensor_metadata(src_impl, dest_impl); + // Then copy everything else (see the comment at copy_generic_tensor_metadata + // for the list of metadata that it does not directly copy). 
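As a hedged sketch of how this split might be consumed, a wrapper-style subclass (the class name and method below are hypothetical, not part of this patch) can reuse copy_generic_tensor_metadata() to mirror the inner tensor while keeping its own storage_, key_set_ and policy bits:

#include <c10/core/TensorImpl.h>

// Hypothetical wrapper subclass; only meant to illustrate which metadata the
// new helper copies and which it deliberately leaves alone.
struct WrapperTensorImpl : public c10::TensorImpl {
  using TensorImpl::TensorImpl;

  void refresh_from_inner(const c10::TensorImpl* inner) {
    // Copies sizes/strides, dtype, device, contiguity flags, names, ... but
    // not key_set_, storage_, the version counter, or the sizes/strides
    // policy -- those stay owned by the wrapper.
    copy_generic_tensor_metadata(/*src_impl=*/inner, /*dest_impl=*/this);
  }
};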
+ dest_impl->storage_ = src_impl->storage_; + // Copying tensor metadata doesn't change the PyObject (maybe + // it should), which means that we have to preserve whatever the + // original Python keyset was (as it's associated with the PyObject + // being a tensor subclass or not) + dest_impl->key_set_ = (src_impl->key_set_ - c10::python_ks) | + (dest_impl->key_set_ & c10::python_ks); + dest_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + dest_impl->sizes_strides_policy_ = src_impl->sizes_strides_policy_; + dest_impl->storage_access_should_throw_ = + src_impl->storage_access_should_throw_; +} + void TensorImpl::copy_tensor_metadata( const TensorImpl* src_impl, TensorImpl* dest_impl, @@ -601,21 +576,178 @@ void TensorImpl::copy_tensor_metadata( } } -TorchDispatchTypeObject::TorchDispatchTypeObject( - PyObject* type_object, - c10::impl::PyInterpreter* pyinterpreter) - : data_(type_object), pyinterpreter_(pyinterpreter) {} +// Legacy Caffe2 operations -TorchDispatchTypeObject::~TorchDispatchTypeObject() { - pyinterpreter_->decref(data_, /*is_tensor*/ false); -} - -c10::impl::PyInterpreter* TorchDispatchTypeObject::pyinterpreter() const { - return pyinterpreter_; +void TensorImpl::Extend(int64_t num, float growthPct) { + TORCH_CHECK(sizes_and_strides_.size() >= 1u); + TORCH_CHECK(num >= 0, "`num` must be non-negative for Extend"); + TORCH_CHECK( + is_contiguous_, + "Right now Extend is only supported for contiguous Tensor."); + using SizesVector = SmallVector; + SizesVector newDims( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + newDims[0] += num; + if (!storage_.data()) { + Resize(newDims); + return; + } + const auto newNumel = c10::multiply_integers(newDims.begin(), newDims.end()); + if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { + sizes_and_strides_.set_sizes(newDims); + numel_ = newNumel; + return; + } + SizesVector newCapacity( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + newCapacity[0] = std::max( + newDims[0], + static_cast(std::ceil( + sizes_and_strides_.size_at_unchecked(0) * (1 + growthPct / 100)))); + auto oldData = std::move(storage_.data_ptr()); + auto oldSize = numel_; + Resize(newCapacity); + auto* newData = raw_mutable_data(data_type_); + if (data_type_.copy()) { + TORCH_CHECK( + device_type() == DeviceType::CPU, "non-POD types work only on CPU"); + data_type_.copy()(oldData.get(), newData, oldSize); + } else { + // The following copy uses the current (thread local) stream for copying + // and also takes the GPU id from the device() field passed in. + // + // TODO: Potentially more enforcements are necessary to avoid accidental + // switch to sync copy if the currently set device is wrong. + // + // Specifically, we might need to switch to a different context device + // here explicitly to avoid relying on user synchronizing things + // properly. + CopyBytes( + oldSize * itemsize(), + oldData.get(), + device(), + newData, + device(), + true); // non-blocking + } + reserved_ = true; + sizes_and_strides_.set_sizes(newDims); + numel_ = newNumel; } -PyObject* TorchDispatchTypeObject::ptr() const { - return data_; +void TensorImpl::ReserveSpace(int64_t outer_dim) { + TORCH_CHECK( + is_contiguous_, + "Right now ReserveSpace is only supported for contiguous Tensor."); + TORCH_CHECK(storage_.unique(), "Can't call ReserveSpace on shared storage."); + // TODO: eliminate newCapacity. 
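A small worked example of the Extend() growth rule above (the numbers are illustrative):

// Suppose the outer dimension is currently 100, growthPct = 40, and the
// caller invokes Extend(/*num=*/5):
//   newDims[0]     = 100 + 5                            = 105
//   newCapacity[0] = max(105, ceil(100 * (1 + 40/100))) = 140
// Storage is resized to 140 rows, the old rows are copied over, and the
// visible sizes are then set back to 105 with reserved_ = true, leaving
// headroom so the next few Extend() calls need no reallocation.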
+ SmallVector newCapacity( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + newCapacity[0] = outer_dim; + auto newNumel = c10::multiply_integers(newCapacity); + if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { + return; + } + // Old data is discarded + storage_.data_ptr().clear(); + auto oldSize = numel_; + SmallVector oldDims( + sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); + Resize(newCapacity); + // Allocate new memory but don't copy over the data + raw_mutable_data(data_type_); + sizes_and_strides_.set_sizes(oldDims); + numel_ = oldSize; + reserved_ = true; +} + +void TensorImpl::Reshape(const std::vector& dims) { + TORCH_CHECK( + is_contiguous_, + "Right now Reshape is only supported for contiguous Tensor."); + int64_t new_size = 1; + for (auto d : dims) { + TORCH_CHECK(d >= 0); + new_size *= d; + } + TORCH_CHECK( + new_size == numel_, + "New size and old size are not equal. You cannot use Reshape, " + "but should use Resize." + // TODO(jiayq): remove the following warning after pending diffs + // stabilize. + " The old caffe2 mixes Reshape and Resize but this behavior has " + "been changed. If you find this error, most likely you will need " + "to change corresponding code from Reshape to Resize."); + sizes_and_strides_.set_sizes(dims); + empty_tensor_restride(MemoryFormat::Contiguous); +} + +void TensorImpl::FreeMemory() { + // We'll detach from the old Storage and create a new one + storage_ = Storage::create_legacy(storage_.device()); + storage_offset_ = 0; +} + +void TensorImpl::ShareData(const TensorImpl& src) { + // Right now, we are assuming the device_type are the same, since it is + // inherently the same in the non-templatized code. We should probably add + // an assert here which might affect perf a little bit. + TORCH_CHECK( + src.numel_ == numel_, + "Size mismatch - did you call reshape before sharing the data?"); + // It is possible that the source tensor hasn't called mutable_data() yet, + // in which case ShareData() doesn't make much sense since we don't really + // know what to share yet. + // TODO: Add the assert after all uninitialized states are eliminated + // TORCH_CHECK(src.dtype_initialized(), + // "Source tensor don't have a data type (did you call + // mutable_data on the tensor?)"); + if (!src.dtype_initialized()) { + C10_LOG_EVERY_MS(WARNING, 1000) + << "Source tensor don't have a data type (did you call mutable_data on the tensor?)"; + } + TORCH_CHECK( + src.storage_initialized(), + "Source tensor has no content and has size > 0"); + // Finally, do sharing. 
+ /* Since we create new Storage whenever we need to change data_type/nbytes + * this still keeps the original semantics + */ + storage_ = src.storage(); + data_type_ = src.dtype(); + device_opt_ = src.device_opt(); + storage_offset_ = src.storage_offset(); +} + +void TensorImpl::ShareExternalPointer( + DataPtr&& data_ptr, + const caffe2::TypeMeta data_type, + size_t size_bytes) { + TORCH_CHECK( + data_type != ScalarType::Undefined, + "To share with a raw external pointer you need to pass in an " + "initialized data_type(TypeMeta)."); + if (!size_bytes) { + size_bytes = numel_ * data_type.itemsize(); + } + if (storage_.unique()) { + storage_.UniqueStorageShareExternalPointer(std::move(data_ptr), size_bytes); + data_type_ = data_type; + device_opt_ = storage_.device(); + storage_offset_ = 0; + } else { + // Create a new Storage + storage_ = Storage( + Storage::use_byte_size_t(), + size_bytes, + std::move(data_ptr), + /*allocator=*/nullptr, + /*resizable=*/false); + data_type_ = data_type; + device_opt_ = storage_.device(); + storage_offset_ = 0; + } } namespace impl { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 86aca278c9d3..717f066e4127 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -6,8 +6,11 @@ #include #include #include +#include #include +#include #include +#include #include #include #include @@ -16,9 +19,11 @@ #include #include #include +#include #include #include +#include #include #include @@ -49,17 +54,9 @@ class TensorBase; namespace c10 { class Scalar; -struct IValue; struct Storage; -class OperatorHandle; } // namespace c10 -namespace torch { -namespace jit { -using Stack = std::vector; -} -} // namespace torch - namespace c10 { /** @@ -92,7 +89,7 @@ inline int64_t size_to_dim_(int k, IntArrayRef dims) { // Product of all dims between k and l (not including dims[k] and dims[l]) inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { - TORCH_CHECK((unsigned)l < dims.size()); + TORCH_CHECK((unsigned)l < dims.size() && (unsigned)k < dims.size()); int64_t r = 1; if (k < l) { for (int i = k + 1; i < l; ++i) { @@ -168,9 +165,6 @@ struct C10_API AutogradMetaInterface { virtual ~AutogradMetaInterface(); }; -// forward declared -struct TorchDispatchTypeObject; - namespace impl { // Unfortunately, the definition of AutogradMeta lives in a separate @@ -196,137 +190,6 @@ struct C10_API AutogradMetaFactoryRegisterer { } }; -// Note [Python interpreter tag] -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// We store a PyObject on TensorImpl so that we can efficiently translate -// tensors into the Python representations. However, in some situations -// (torchdeploy) there may be multiple Python interpreters in a single process -// and we must take care not to accidentally mix up PyObjects with the wrong -// interpreters. Thus, we also tag every TensorImpl with the Python interpreter -// it corresponds to. -// -// With torchdeploy, we have these invariants: -// - Any given TensorImpl can be associated with AT MOST one Python -// interpreter. -// We represent the interpreter tag as a memory address to an instance of -// a virtual class that is allocated once per interpreter (this is so that -// we can request the interpreter to perform operations for us, if -// necessary). 
-// - A given TensorImpl's interpreter tag can only go from uninitialized to -// tagged; once tagged, this is a quiescent state (once tagged to an -// interpreter, ALWAYS tagged to that interpreter) -// - A thread may mutate the PyObject field of a TensorImpl if and only if it -// holds the GIL for the interpreter tagged on the TensorImpl. (If the -// TensorImpl is not tagged, it must first atomically claim its tag before it -// can validly write) - -// The PyInterpreter object itself is a class that contains some function -// pointers for interacting with the interpreter. For now this is just for -// debugging, but if a Tensor can own a PyObject, the interpreter can be used to -// free it. -// -// WARNING: This class has to be written very carefully, because it may be -// possible for a Tensor to have a reference an interpreter corresponding to -// a shared library that has ALREADY BEEN UNLOADED. This makes blindly calling -// virtual methods very dangerous, because the vtable may be garbage at that -// point (on a good day, you might get "pure virtual method called"). -// -// The idea to solve this problem is we always leak PyInterpreters (so they -// always stay live even after dlclose), and disarm the "virtual methods" by -// replacing them with function pointers that just no-op. This can't be done -// with a traditional C++ vtable, so we have to roll our own. -// -// NB: The downside with representing PyInterpreter tags as full objects is that -// it takes an extra word on TensorImpl. If tags were instead just integer -// indices, on 64-bit architectures we could pack the tag and PyObject together -// into a single atomic word. On 32-bit architectures we could simply say that -// only one Python interpreter is supported (erroring if a nontrivial -// interpreter tag is attempted to be set). -// -// The difficulty with this scheme is we need to maintain an out-of-line table -// to get at the PyInterpreters so that we can do virtual method calls on them, -// and registration/deregistration to this table must be done in a thread safe -// manner. This can be easily done if the number of possible PyInterpreters is -// small enough (e.g., 8-bit integer) by simply preallocating an array of -// sufficient size to hold all possible interpreters. Surely 128 threads is -// more than enough for anyone! -// -// I didn't decide to do this technique at the moment, because the extra word -// added by the PyInterpreter tag takes us to 24 words, which means that we -// still fit inside three eight word cache lines. If you need to penny pinch -// another word consider doing this! 
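A minimal, generic sketch of the "leak the object, disarm the function pointers" pattern the removed note describes (the PyInterpreter struct itself presumably moves to a dedicated header; the names below are illustrative only):

// Hand-rolled "vtable" of plain function pointers: after the providing
// library is dlclose'd, disarm() swaps in no-ops so stale callers never
// jump through a dangling C++ vtable.
struct LeakyVtable {
  using name_sig = const char*(const LeakyVtable*);

  explicit LeakyVtable(name_sig* name_fn) : name_fn_(name_fn) {}

  const char* name() const {
    return (*name_fn_)(this);
  }

  void disarm() noexcept {
    name_fn_ = &noop_name;
  }

 private:
  static const char* noop_name(const LeakyVtable*) {
    return "<disarmed>";
  }
  name_sig* name_fn_;
};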
- -struct PyInterpreter; -struct C10_API PyInterpreter { - using name_sig = std::string(const PyInterpreter*); - using decref_sig = void(const PyInterpreter*, PyObject*, bool); - using detach_sig = - c10::intrusive_ptr(const PyInterpreter*, const TensorImpl*); - using dispatch_sig = void( - const PyInterpreter*, - const c10::OperatorHandle&, - torch::jit::Stack* stack, - const std::shared_ptr& type); - - PyInterpreter( - name_sig* name_fn, - decref_sig* decref_fn, - detach_sig* detach, - dispatch_sig* dispatch) - : name_fn_(name_fn), - decref_fn_(decref_fn), - detach_fn_(detach), - dispatch_fn_(dispatch) {} - - name_sig* name_fn_; - decref_sig* decref_fn_; - detach_sig* detach_fn_; - dispatch_sig* dispatch_fn_; - - // UBSAN suppression fixes: "call to function - // (anonymous namespace)::concrete_decref_fn(c10::impl::PyInterpreter const*, - // _object*) through pointer to incorrect function type 'void (*)(const - // c10::impl::PyInterpreter *, _object *)'" See - // https://github.com/google/sanitizers/issues/911 - - // Report the name of this interpreter - __ubsan_ignore_function__ std::string name() const { - return (*name_fn_)(this); - } - - // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call - // See NOTE [PyInterpreter::decref takes an `is_tensor` arg] - __ubsan_ignore_function__ void decref(PyObject* pyobj, bool is_tensor) const { - return (*decref_fn_)(this, pyobj, is_tensor); - } - - // Perform a detach by deferring to the __torch_dispatch__ implementation of - // detach, which will also arrange for the PyObject to get copied in this - // situation - __ubsan_ignore_function__ c10::intrusive_ptr detach( - const TensorImpl* self) const { - return (*detach_fn_)(this, self); - } - - // Invoke the Python boxed fallback dispatch to go back into Python - __ubsan_ignore_function__ void dispatch( - const c10::OperatorHandle& op, - torch::jit::Stack* stack, - const std::shared_ptr& type) const { - return (*dispatch_fn_)(this, op, stack, type); - } - - // Disarm this PyInterpreter, making all of its methods noops. - // Because the function pointers are raw pointers (not atomics), - // a disarm() invocation that is concurrent with active destructors - // is not thread safe and will trigger TSAN. My hope is that this - // situations doesn't ever actually happen; tensor destruction should - // quiesce when a dlclose happens, and any long lived tensors whose - // destructors would be disarmed here only begin the destruction process - // on process shutdown (long after the dlclose has occurred). - void disarm() noexcept; -}; - // PyInterpreterStatus describes what the state of its interpreter tag // is, relative to the thread currently holding the GIL. enum class PyInterpreterStatus { @@ -361,30 +224,6 @@ struct C10_API NamedTensorMetaInterface { }; }; -// NOTE [What is TorchDispatchTypeObject?] -// A TorchDispatchTypeObject represents the type of a Tensor subclass that has -// a __torch_dispatch__ classmethod. Concretely, it holds the class as a -// PyObject* and a PyInterpreter* that says which python interpreter the class -// came from. 
-// -// See NOTE [dispatch_fn's type argument] for more details -struct C10_API TorchDispatchTypeObject { - // Steals a reference to type_object - TorchDispatchTypeObject( - PyObject* type_object, - c10::impl::PyInterpreter* pyinterpreter); - - // Releases the stolen reference to type_object - ~TorchDispatchTypeObject(); - - c10::impl::PyInterpreter* pyinterpreter() const; - PyObject* ptr() const; - - private: - PyObject* data_; - c10::impl::PyInterpreter* pyinterpreter_; -}; - // NOTE [ Version Counter Sharing ] // // Every Tensor has a version counter. Version counters are incremented whenever @@ -700,34 +539,153 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Return a reference to the sizes of this tensor. This reference remains * valid as long as the tensor is live and not resized. */ - TENSORIMPL_MAYBE_VIRTUAL IntArrayRef sizes() const -#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY - { - return sizes_and_strides_.sizes_arrayref(); + IntArrayRef sizes() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return sizes_custom(); + } + return sizes_default(); } -#else - ; -#endif /** * Return a reference to the strides of this tensor. This reference remains * valid as long as the tensor is live and not restrided. */ - virtual IntArrayRef strides() const; + IntArrayRef strides() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomStrides))) { + return strides_custom(); + } + return strides_default(); + } + + /** + * Return the size of a tensor at some dimension, wrapping the dimension if + * necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t size(int64_t d) const { + d = maybe_wrap_dim(d, dim(), false); + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return sizes_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + return sizes_and_strides_.size_at_unchecked(d); + } + + /** + * Return the stride of a tensor at some dimension, wrapping the dimension + * if necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t stride(int64_t d) const { + d = maybe_wrap_dim(d, dim(), false); + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomStrides))) { + return strides_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + return sizes_and_strides_.stride_at_unchecked(d); + } /** * Return the number of dimensions of this tensor. Note that 0-dimension * represents a Tensor that is a Scalar, e.g., one that has a single element. */ - TENSORIMPL_MAYBE_VIRTUAL int64_t dim() const -#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY - { + int64_t dim() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return dim_custom(); + } + return dim_default(); + } + + /** + * The number of elements in a tensor. + * + * WARNING: Previously, if you were using the Caffe2 API, you could + * test numel() == -1 to see if a tensor was uninitialized. This + * is no longer true; numel always accurately reports the product + * of sizes of a tensor. + */ + int64_t numel() const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomSizes))) { + return numel_custom(); + } + return numel_default(); + } + + /** + * Whether or not a tensor is laid out in contiguous memory. 
+ * + * Tensors with non-trivial strides are not contiguous. See + * compute_contiguous() for the exact definition of whether or not + * a tensor is contiguous or not. + */ + bool is_contiguous( + at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const { + if (C10_UNLIKELY( + sizes_strides_policy_ >= + static_cast(SizesStridesPolicy::CustomStrides))) { + return is_contiguous_custom(memory_format); + } + return is_contiguous_default(memory_format); + } + + protected: + /** + * Customization points for the functions above. sizes_strides_policy_ + * must be set to enable these. + * + * NB: dim is overrideable separately from sizes because it is possible + * for a tensor to have rank, but not well defined sizes. + */ + // sizes_strides_policy_ >= CustomStrides + virtual IntArrayRef strides_custom() const; + virtual bool is_contiguous_custom(at::MemoryFormat memory_format) const; + // sizes_strides_policy_ >= CustomSizes + virtual IntArrayRef sizes_custom() const; + virtual int64_t dim_custom() const; + virtual int64_t numel_custom() const; + + // These are factored into separate functions in case subclasses + // want to use them + inline IntArrayRef strides_default() const { + return sizes_and_strides_.strides_arrayref(); + } + inline bool is_contiguous_default(at::MemoryFormat memory_format) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(compute_contiguous() == is_contiguous_); + if (memory_format == at::MemoryFormat::ChannelsLast) { + return is_channels_last_contiguous_; + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return is_channels_last_3d_contiguous_; + } + return is_contiguous_; + } + inline IntArrayRef sizes_default() const { + return sizes_and_strides_.sizes_arrayref(); + } + inline int64_t dim_default() const { return sizes_and_strides_.size(); } -#else - ; + inline int64_t numel_default() const { +#ifdef DEBUG + TORCH_INTERNAL_ASSERT(compute_numel() == numel_); #endif + return numel_; + } + public: /** * True if this tensor has storage. See storage() for details. */ @@ -777,164 +735,125 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return storage_; } - /** - * The number of elements in a tensor. - * - * WARNING: Previously, if you were using the Caffe2 API, you could - * test numel() == -1 to see if a tensor was uninitialized. This - * is no longer true; numel always accurately reports the product - * of sizes of a tensor. - */ - TENSORIMPL_MAYBE_VIRTUAL int64_t numel() const { -#ifdef DEBUG - TORCH_INTERNAL_ASSERT(compute_numel() == numel_); -#endif - return numel_; - } - bool unique_version() const { return version_counter_.unique(); } - /** - * Whether or not a tensor is laid out in contiguous memory. - * - * Tensors with non-trivial strides are not contiguous. See - * compute_contiguous() for the exact definition of whether or not - * a tensor is contiguous or not. - * - * NOTE: is_contiguous is only `TENSORIMPL_MAYBE_VIRTUAL` for - * backward compatibility. See `set_has_contiguity_policy` and - * `is_contiguous_custom` for the encouraged customization point. 
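A hedged sketch of a subclass opting into the new customization points (the class name and the hard-coded values are hypothetical; a real implementation would compute them from its own representation):

#include <c10/core/TensorImpl.h>

// Illustrative subclass that routes sizes()/dim()/numel() through the
// *_custom() virtuals by selecting the CustomSizes policy.
struct RaggedTensorImpl : public c10::TensorImpl {
  RaggedTensorImpl(
      c10::DispatchKeySet ks,
      const caffe2::TypeMeta data_type,
      c10::optional<c10::Device> device)
      : TensorImpl(ks, data_type, device) {
    set_sizes_strides_policy(SizesStridesPolicy::CustomSizes);
  }

 protected:
  int64_t dim_custom() const override {
    return 2; // placeholder: derive from the actual ragged representation
  }
  int64_t numel_custom() const override {
    return 0; // placeholder
  }
  // sizes_custom()/strides_custom()/is_contiguous_custom() would be
  // overridden similarly; the base versions above just raise an error.
};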
- */ - TENSORIMPL_MAYBE_VIRTUAL bool is_contiguous( - at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const { - if (C10_UNLIKELY( - has_contiguity_ != - static_cast(HasContiguityPolicy::Default))) { - return is_contiguous_nondefault_policy_impl(memory_format); - } - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(compute_contiguous() == is_contiguous_); - if (memory_format == at::MemoryFormat::ChannelsLast) { - return is_channels_last_contiguous_; - } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { - return is_channels_last_3d_contiguous_; - } - return is_contiguous_; - } - - private: - bool is_contiguous_nondefault_policy_impl(at::MemoryFormat) const; - protected: - /** - * Customization point for is_contiguous; must also - * set_has_contiguity_policy(HasContiguityPolicy::Custom) for this - * to be called. - */ - virtual bool is_contiguous_custom(at::MemoryFormat memory_format) const; + virtual Layout layout_impl() const { + TORCH_CHECK( + false, "layout_impl is only implemented for TensorImpl subclasses."); + } public: + // Whether a tensor is sparse COO or not. bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::SparseCPU) || - key_set_.has(DispatchKey::SparseCUDA) || - key_set_.has(DispatchKey::SparseHIP) || - key_set_.has(DispatchKey::SparseXPU); + return key_set_.has_all(c10::sparse_ks); } - // Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR - // format. + // Whether a tensor is sparse CSR or not. bool is_sparse_csr() const { - return key_set_.has(DispatchKey::SparseCsrCPU) || - key_set_.has(DispatchKey::SparseCsrCUDA); + return layout() == kSparseCsr; } bool is_quantized() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::QuantizedCPU) || - key_set_.has(DispatchKey::QuantizedCUDA) || - key_set_.has(DispatchKey::QuantizedXPU); + constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized); + return key_set_.has_all(quantized_ks); } bool is_meta() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::Meta); + constexpr auto meta_ks = DispatchKeySet(DispatchKey::Meta); + return key_set_.has_all(meta_ks); } bool is_cpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::CPU) || - key_set_.has(DispatchKey::SparseCPU) || - key_set_.has(DispatchKey::SparseCsrCPU) || - key_set_.has(DispatchKey::QuantizedCPU) || - key_set_.has(DispatchKey::MkldnnCPU); + constexpr auto cpu_bits_ks = DispatchKeySet(BackendComponent::CPUBit) | + DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::MkldnnCPU}); + return key_set_.has_any(cpu_bits_ks); } bool is_cuda() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::CUDA) || - key_set_.has(DispatchKey::SparseCUDA) || - key_set_.has(DispatchKey::SparseCsrCUDA) || - key_set_.has(DispatchKey::QuantizedCUDA); + constexpr auto cuda_bits_ks = DispatchKeySet(BackendComponent::CUDABit) | + DispatchKeySet(DispatchKey::SparseCsrCUDA); + return key_set_.has_any(cuda_bits_ks); } bool is_xpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. 
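For reference, a small sketch of the has_all / has_any distinction these predicates now rely on (helper names are illustrative; the keysets mirror the ones used above):

// has_all(): every key in the query set must be present -- used when a
// functionality bit and a backend bit must both be set.
inline bool is_dense_cpu_like(c10::DispatchKeySet ks) {
  constexpr auto dense_cpu = c10::DispatchKeySet(c10::DispatchKey::Dense) |
      c10::DispatchKeySet(c10::BackendComponent::CPUBit);
  return ks.has_all(dense_cpu);
}

// has_any(): one matching key is enough -- used for "any CPU flavor" style
// checks such as is_cpu() above.
inline bool touches_cpu(c10::DispatchKeySet ks) {
  constexpr auto cpu_ish = c10::DispatchKeySet(c10::BackendComponent::CPUBit) |
      c10::DispatchKeySet(
          {c10::DispatchKey::SparseCsrCPU, c10::DispatchKey::MkldnnCPU});
  return ks.has_any(cpu_ish);
}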
- return key_set_.has(DispatchKey::XPU) || - key_set_.has(DispatchKey::SparseXPU) || - key_set_.has(DispatchKey::QuantizedXPU); + constexpr auto xpu_ks = DispatchKeySet(BackendComponent::XPUBit); + return key_set_.has_all(xpu_ks); + } + + bool is_ipu() const { + constexpr auto ipu_ks = DispatchKeySet(BackendComponent::IPUBit); + return key_set_.has_all(ipu_ks); } bool is_xla() const { - return key_set_.has(DispatchKey::XLA); + constexpr auto xla_ks = DispatchKeySet(BackendComponent::XLABit); + return key_set_.has_all(xla_ks); } bool is_hpu() const { - return key_set_.has(DispatchKey::HPU); + constexpr auto hpu_ks = DispatchKeySet(BackendComponent::HPUBit); + return key_set_.has_all(hpu_ks); } bool is_lazy() const { - return key_set_.has(DispatchKey::Lazy); + constexpr auto lazy_ks = DispatchKeySet(BackendComponent::LazyBit); + return key_set_.has_all(lazy_ks); } bool is_hip() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::HIP) || - key_set_.has(DispatchKey::SparseHIP); + constexpr auto hip_ks = DispatchKeySet(BackendComponent::HIPBit); + return key_set_.has_all(hip_ks); } bool is_ve() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::VE) || key_set_.has(DispatchKey::SparseVE); + constexpr auto ve_ks = DispatchKeySet(BackendComponent::VEBit); + return key_set_.has_all(ve_ks); } bool is_mkldnn() const { - return key_set_.has(DispatchKey::MkldnnCPU); + return key_set_.has_all(c10::mkldnn_ks); } bool is_vulkan() const { - return key_set_.has(DispatchKey::Vulkan); + constexpr auto vulkan_ks = DispatchKeySet(DispatchKey::Vulkan); + return key_set_.has_all(vulkan_ks); } bool is_metal() const { - return key_set_.has(DispatchKey::Metal); + constexpr auto metal_ks = DispatchKeySet(DispatchKey::Metal); + return key_set_.has_all(metal_ks); } - bool is_mlc() const { - return key_set_.has(DispatchKey::MLC); + bool is_mps() const { + return key_set_.has(DispatchKey::MPS); } bool is_ort() const { - return key_set_.has(DispatchKey::ORT); + constexpr auto ort_ks = DispatchKeySet(DispatchKey::ORT); + return key_set_.has_all(ort_ks); + } + + bool is_nested() const { + return key_set_.has(DispatchKey::NestedTensor); } // TODO: remove this once we don't automatically enabled Autograd dispatch @@ -950,8 +869,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Invariant: // Inference tensor has version_counter_.enabled() == false bool is_inference() { - bool no_ADInplaceOrView = !key_set_.has(c10::DispatchKey::ADInplaceOrView); - bool no_Autograd = (key_set_ & c10::autograd_dispatch_keyset).empty(); + bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks); + bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( no_ADInplaceOrView == no_Autograd, "ADInplaceOrView and Autograd keys must be on/off at the same time."); @@ -972,14 +891,32 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Layout layout() const { // NB: This method is not virtual and avoid dispatches for perf. - if (is_sparse()) { + // strided is also the most common layout type, so we check for + // strided case first. 
+ // This keyset must also be kept in sync with the logic in + // is_sparse() / is_sparse_csr() / is_mkldnn() + constexpr auto sparse_and_sparsecsr_and_mkldnn_ks = + c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks; + if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) { + return kStrided; + } else if (is_sparse()) { return kSparse; - } else if (is_sparse_csr()) { - return kSparseCsr; - } else if (is_mkldnn()) { - return kMkldnn; + } else if (key_set_.has_any(c10::sparse_csr_ks)) { + // Typically, the tensor dispatch keys define the tensor layout + // uniquely. This allows using non-virtual layout method for + // better performance. However, when tensor's layout depends, + // say, on tensor attributes, one must use this execution path + // where the corresponding tensor impl class overwrites virtual + // layout_impl() method. + // + // TODO: implement layout() as native function/method so that + // __torch_dispatch__ users will be able to redefine the + // layout() method. + return layout_impl(); } else { - return kStrided; + TORCH_INTERNAL_ASSERT( + is_mkldnn(), "There is an error in the layout calculation logic."); + return kMkldnn; } } @@ -1065,7 +1002,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the imaginary part of the tensor should be negated */ inline bool is_conj() const { - return key_set_.has(DispatchKey::Conjugate); + constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate); + return key_set_.has_all(conjugate_ks); } /** @@ -1085,7 +1023,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor is a zerotensor */ inline bool _is_zerotensor() const { - return key_set_.has(DispatchKey::ZeroTensor); + constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor); + return key_set_.has_all(zerotensor_ks); } /** @@ -1105,7 +1044,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor should be negated */ inline bool is_neg() const { - return key_set_.has(DispatchKey::Negative); + constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative); + return key_set_.has_all(negative_ks); } /** @@ -1412,16 +1352,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { refresh_contiguous(); } - /** - * Return the size of a tensor at some dimension. - */ - virtual int64_t size(int64_t d) const; - - /** - * Return the stride of a tensor at some dimension. - */ - virtual int64_t stride(int64_t d) const; - /** * Set whether a tensor allows changes to its metadata (e.g. sizes / strides / * storage / storage_offset). 
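To make the decision order of the reworked layout() above concrete, a few illustrative key-set examples:

// layout() decision sketch (the keysets are illustrative):
//   {Dense, CPUBit}    -> no sparse/csr/mkldnn keys -> kStrided (fast path)
//   {Sparse, CUDABit}  -> is_sparse()               -> kSparse
//   {SparseCsrCPU}     -> sparse_csr_ks             -> layout_impl(), a virtual
//                         hook so the impl can report CSR/CSC/BSR/BSC
//   {MkldnnCPU}        -> is_mkldnn()               -> kMkldnn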
See NOTE [ Metadata Change for a Detached Tensor @@ -1476,14 +1406,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_python_dispatch(bool k) { if (k) { - key_set_ = key_set_.add(DispatchKey::Python); + key_set_ = key_set_.add(c10::python_ks); } else { - key_set_ = key_set_.remove(DispatchKey::Python); + key_set_ = key_set_ - c10::python_ks; } } bool is_python_dispatch() const { - return key_set_.has(DispatchKey::Python); + return key_set_.has_all(c10::python_ks); } /** @@ -1548,13 +1478,23 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ inline bool has_compatible_shallow_copy_type(DispatchKeySet from) { auto is_dense = [](DispatchKeySet ts) { - return ts.has(DispatchKey::CPU) || ts.has(DispatchKey::CUDA) || - ts.has(DispatchKey::HIP) || ts.has(DispatchKey::XPU); + constexpr auto dense_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::MPSBit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense); + return ts.has_any(dense_k) && ts.has_any(dense_backends); }; auto is_sparse = [](DispatchKeySet ts) { - return ts.has(DispatchKey::SparseCPU) || - ts.has(DispatchKey::SparseCUDA) || ts.has(DispatchKey::SparseHIP) || - ts.has(DispatchKey::SparseXPU); + constexpr auto sparse_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); + return ts.has_any(sparse_k) && ts.has_any(sparse_backends); }; return (key_set_ == from) || (is_dense(key_set_) && is_dense(from)) || (is_sparse(key_set_) && is_sparse(from)); @@ -1679,6 +1619,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // we are the ONLY thread that can have gotten to this point. It is not // possible to conflict with another zero interpreter as access is protected // by GIL + // NB: owns_pyobj tag is initially false pyobj_ = pyobj; } @@ -1688,6 +1629,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return pyobj_interpreter_.load(std::memory_order_acquire); } + PyObject* _unchecked_untagged_pyobj() const { + return reinterpret_cast( + reinterpret_cast(pyobj_) & ~0x1ULL); + } + // Test the interpreter tag. If tagged for the current interpreter, return // a non-nullopt (but possibly null) PyObject. If (possibly) untagged, // returns a nullopt. If it is definitely invalid, raises an error. @@ -1707,7 +1653,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return c10::nullopt; } else if (interpreter == self_interpreter) { // NB: pyobj_ could still be null! - return c10::make_optional(pyobj_); + return c10::make_optional(_unchecked_untagged_pyobj()); } else { TORCH_CHECK( false, @@ -1758,63 +1704,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * * This op is auto-asynchronous if the underlying device (CUDA) supports it. 
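The check_pyobj()/_unchecked_untagged_pyobj() machinery above stores the ownership flag in the low bit of pyobj_. A generic, self-contained sketch of that pointer-tagging trick (not the actual TensorImpl code):

#include <cstdint>

// One-bit pointer tag: bit 0 records ownership, the remaining bits hold the
// pointer (valid because the pointee is at least 2-byte aligned).
struct TaggedPtr {
  void set(void* p, bool owns) {
    bits_ = reinterpret_cast<std::uintptr_t>(p) |
        static_cast<std::uintptr_t>(owns);
  }
  void* untagged() const {
    return reinterpret_cast<void*>(bits_ & ~std::uintptr_t(1));
  }
  bool owns() const {
    return (bits_ & 1) != 0;
  }

 private:
  std::uintptr_t bits_ = 0;
};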
*/ - void Extend(int64_t num, float growthPct) { - TORCH_CHECK(sizes_and_strides_.size() >= 1u); - TORCH_CHECK(num >= 0, "`num` must be non-negative for Extend"); - TORCH_CHECK( - is_contiguous_, - "Right now Extend is only supported for contiguous Tensor."); - using SizesVector = SmallVector; - SizesVector newDims( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - newDims[0] += num; - if (!storage_.data()) { - Resize(newDims); - return; - } - const auto newNumel = - c10::multiply_integers(newDims.begin(), newDims.end()); - if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { - sizes_and_strides_.set_sizes(newDims); - numel_ = newNumel; - return; - } - SizesVector newCapacity( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - newCapacity[0] = std::max( - newDims[0], - static_cast(std::ceil( - sizes_and_strides_.size_at_unchecked(0) * (1 + growthPct / 100)))); - auto oldData = std::move(storage_.data_ptr()); - auto oldSize = numel_; - Resize(newCapacity); - auto* newData = raw_mutable_data(data_type_); - if (data_type_.copy()) { - TORCH_CHECK( - device_type() == DeviceType::CPU, "non-POD types work only on CPU"); - data_type_.copy()(oldData.get(), newData, oldSize); - } else { - // The following copy uses the current (thread local) stream for copying - // and also takes the GPU id from the device() field passed in. - // - // TODO: Potentially more enforcements are necessary to avoid accidental - // switch to sync copy if the currently set device is wrong. - // - // Specifically, we might need to switch to a different context device - // here explicitly to avoid relying on user synchronizing things - // properly. - CopyBytes( - oldSize * itemsize(), - oldData.get(), - device(), - newData, - device(), - true); // non-blocking - } - reserved_ = true; - sizes_and_strides_.set_sizes(newDims); - numel_ = newNumel; - } + void Extend(int64_t num, float growthPct); /** * @brief Reserve space for the underlying tensor. @@ -1822,33 +1712,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * This must be called after Resize(), since we only specify the first * dimension This does not copy over the old data to the newly allocated space */ - template - void ReserveSpace(const T& outer_dim) { - TORCH_CHECK( - is_contiguous_, - "Right now ReserveSpace is only supported for contiguous Tensor."); - TORCH_CHECK( - storage_.unique(), "Can't call ReserveSpace on shared storage."); - // TODO: eliminate newCapacity. - SmallVector newCapacity( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - newCapacity[0] = outer_dim; - auto newNumel = c10::multiply_integers(newCapacity); - if (newNumel * data_type_.itemsize() <= storage_.nbytes()) { - return; - } - // Old data is discarded - storage_.data_ptr().clear(); - auto oldSize = numel_; - SmallVector oldDims( - sizes_and_strides_.sizes_begin(), sizes_and_strides_.sizes_end()); - Resize(newCapacity); - // Allocate new memory but don't copy over the data - raw_mutable_data(data_type_); - sizes_and_strides_.set_sizes(oldDims); - numel_ = oldSize; - reserved_ = true; - } + void ReserveSpace(int64_t outer_dim); /** * @brief Resizes a tensor. @@ -1883,38 +1747,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Resizes the tensor without touching underlying storage. * This requires the total size of the tensor to remains constant. 
*/ - inline void Reshape(const std::vector& dims) { - TORCH_CHECK( - is_contiguous_, - "Right now Reshape is only supported for contiguous Tensor."); - int64_t new_size = 1; - for (auto d : dims) { - TORCH_CHECK(d >= 0); - new_size *= d; - } - TORCH_CHECK( - new_size == numel_, - "New size and old size are not equal. You cannot use Reshape, " - "but should use Resize." - // TODO(jiayq): remove the following warning after pending diffs - // stabilize. - " The old caffe2 mixes Reshape and Resize but this behavior has " - "been changed. If you find this error, most likely you will need " - "to change corresponding code from Reshape to Resize."); - sizes_and_strides_.set_sizes(dims); - empty_tensor_restride(MemoryFormat::Contiguous); - } + void Reshape(const std::vector& dims); /** * Release whatever memory the tensor was holding but keep size and type * information. Subsequent call to mutable_data will trigger new memory * allocation. */ - inline void FreeMemory() { - // We'll detach from the old Storage and create a new one - storage_ = Storage::create_legacy(storage_.device()); - storage_offset_ = 0; - } + void FreeMemory(); /** * @brief Shares the data with another tensor. @@ -1929,67 +1769,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * The source tensor should already have its data allocated. */ // To be deprecated - void ShareData(const TensorImpl& src) { - // Right now, we are assuming the device_type are the same, since it is - // inherently the same in the non-templatized code. We should probably add - // an assert here which might affect perf a little bit. - TORCH_CHECK( - src.numel_ == numel_, - "Size mismatch - did you call reshape before sharing the data?"); - // It is possible that the source tensor hasn't called mutable_data() yet, - // in which case ShareData() doesn't make much sense since we don't really - // know what to share yet. - // TODO: Add the assert after all uninitialized states are eliminated - // TORCH_CHECK(src.dtype_initialized(), - // "Source tensor don't have a data type (did you call - // mutable_data on the tensor?)"); - if (!src.dtype_initialized()) { - C10_LOG_EVERY_MS(WARNING, 1000) - << "Source tensor don't have a data type (did you call mutable_data on the tensor?)"; - } - TORCH_CHECK( - src.storage_initialized(), - "Source tensor has no content and has size > 0"); - // Finally, do sharing. 
- /* Since we create new Storage whenever we need to change data_type/nbytes - * this still keeps the original semantics - */ - storage_ = src.storage(); - data_type_ = src.dtype(); - device_opt_ = src.device_opt(); - storage_offset_ = src.storage_offset(); - } + void ShareData(const TensorImpl& src); void ShareExternalPointer( DataPtr&& data_ptr, const caffe2::TypeMeta data_type, - size_t size_bytes) { - TORCH_CHECK( - data_type != ScalarType::Undefined, - "To share with a raw external pointer you need to pass in an " - "initialized data_type(TypeMeta)."); - if (!size_bytes) { - size_bytes = numel_ * data_type.itemsize(); - } - if (storage_.unique()) { - storage_.UniqueStorageShareExternalPointer( - std::move(data_ptr), size_bytes); - data_type_ = data_type; - device_opt_ = storage_.device(); - storage_offset_ = 0; - } else { - // Create a new Storage - storage_ = Storage( - Storage::use_byte_size_t(), - size_bytes, - std::move(data_ptr), - /*allocator=*/nullptr, - /*resizable=*/false); - data_type_ = data_type; - device_opt_ = storage_.device(); - storage_offset_ = 0; - } - } + size_t size_bytes); /** * Returns a mutable raw pointer of the underlying storage. Since we will need @@ -2158,6 +1943,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Cleaning warning messages, no need to break as TORCH_CHECK(false) // terminates flow. // break; + case MemoryFormat::NumOptions: + TORCH_INTERNAL_ASSERT(false, "invalid memory format ", memory_format); } // recompute contiguous flag, as currently NHWC/NCHW flags are not mutually // exclusive see #24090 @@ -2244,11 +2031,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Compute the number of elements based on the sizes of a tensor. */ int64_t compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - n *= s; - } - return n; +#if C10_HAS_BUILTIN_OVERFLOW() && !defined(C10_MOBILE) + // Use overflow checks if supported by the compiler + return safe_compute_numel(); +#else + return c10::multiply_integers(sizes()); +#endif } /** @@ -2257,14 +2045,15 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * using a sparse layout has multiple dimensions with large sizes. */ int64_t safe_compute_numel() const { - int64_t n = 1; - for (auto s : sizes()) { - TORCH_CHECK( - s == 0 || n <= std::numeric_limits::max() / s, - "numel: integer multiplication overflow"); - n *= s; - } - return n; + uint64_t n = 1; + bool overflows = c10::safe_multiplies_u64(sizes(), &n); + constexpr auto numel_max = std::min( + static_cast(std::numeric_limits::max()), + static_cast(std::numeric_limits::max())); + + overflows |= (n > numel_max); + TORCH_CHECK(!overflows, "numel: integer multiplication overflow"); + return static_cast(n); } /** @@ -2392,36 +2181,43 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // See NOTE [ Metadata Change for a Detached Tensor ] for details. static const char* const err_msg_tensor_metadata_change_not_allowed; + static void copy_generic_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl); + public: void set_storage_access_should_throw() { storage_access_should_throw_ = true; } bool owns_pyobj() { - return owns_pyobj_; + return reinterpret_cast(pyobj_) & 1; } void set_owns_pyobj(bool b) { - owns_pyobj_ = b; + pyobj_ = reinterpret_cast( + reinterpret_cast(_unchecked_untagged_pyobj()) | b); } - protected: - // Policy for adjusting the behavior of is_contiguous(). 
Allows - // subclass customization while still being able to inline - // is_contiguous() in the common case. - enum class HasContiguityPolicy : uint8_t { - // Default behavior: check is_contiguous_ and similar bitflags. - Default, - // Throw a generic error message that this tensor type does not - // support is_contiguous. - ContiguityNotSupported, - // Call virtual is_contiguous_custom method to implement custom - // is_contiguous behavior. - CustomBehavior, + public: + enum class SizesStridesPolicy : uint8_t { + // Default behavior, e.g., dense tensor. + // + // Can override: nothing + Default = 0, + // Customizable strides behavior, e.g., sparse tensor, + // mkldnn tensor. + // + // Can override: strides(), is_contiguous() + CustomStrides = 1, + // Customizable sizes behavior, e.g., nested tensor + // + // Can override: strides(), is_contiguous(), sizes(), dim(), numel() + CustomSizes = 2, }; - void set_has_contiguity_policy(HasContiguityPolicy p) { - has_contiguity_ = static_cast(p); + void set_sizes_strides_policy(SizesStridesPolicy policy) { + sizes_strides_policy_ = static_cast(policy); } Storage storage_; @@ -2481,17 +2277,24 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // care) std::atomic pyobj_interpreter_; - // This field contains a weak reference to a PyObject representing - // this Tensor. It MUST NOT be a strong reference, as that would - // create a reference cycle between Tensor and the PyObject. If - // pyobj is nullptr, when we transfer Tensor to Python, we allocate - // a new PyObject for it and set this field. This field does not - // have to be protected by an atomic as it is only allowed to be - // accessed when you hold the GIL. + // This field contains a reference to a PyObject representing this Tensor. + // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new + // PyObject for it and set this field. This field does not have to be + // protected by an atomic as it is only allowed to be accessed when you hold + // the GIL, or during destruction of the tensor. // // When a PyObject dies, you are obligated to clear this field // (otherwise, you will try to use-after-free the pyobj); this currently // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp + // + // NB: Ordinarily, this should not be a strong reference, as if the + // PyObject owns the Tensor, this would create a reference cycle. + // However, sometimes this ownership flips. To track who owns + // who, this has a single pointer tag indicating whether or not the + // C++ object owns the PyObject (the common case, zero, means PyObject + // owns the C++ object); see _unchecked_untagged_pyobj for raw access + // or check_pyobj for checked access. See references to PyObject + // resurrection in torch/csrc/autograd/python_variable.cpp PyObject* pyobj_; c10::impl::SizesAndStrides sizes_and_strides_; @@ -2523,9 +2326,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Tensor is contiguous bool is_contiguous_ : 1; - // gcc doesn't like enum class bitfields; see - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414 - /* HasContiguityPolicy */ uint8_t has_contiguity_ : 2; // Tensor is a subclass that does not permit storage access. 
bool storage_access_should_throw_ : 1; @@ -2534,7 +2334,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // or -std=gnu++2a inline void init_bitfields() { is_contiguous_ = true; - has_contiguity_ = static_cast(HasContiguityPolicy::Default); is_channels_last_ = false; is_channels_last_contiguous_ = false; @@ -2544,7 +2343,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { is_wrapped_number_ = false; allow_tensor_metadata_change_ = true; reserved_ = false; - owns_pyobj_ = false; + sizes_strides_policy_ = static_cast(SizesStridesPolicy::Default); storage_access_should_throw_ = false; } @@ -2598,12 +2397,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // then subsequent Resize()s will not free up Storage. bool reserved_ : 1; - // If pyobj_ is nullptr, this is always false. - // Otherwise, this indicates whether or not TensorImpl owns the pyobj_ - // or vice versa. Ordinarily, pyobj_ owns TensorImpl, but if the - // Python object's refcount goes to zero, we flip the ownership - // direction (to make sure the pyobj stays live). - bool owns_pyobj_ : 1; + // Call _custom() virtual methods for + // strides()/is_contiguous()/sizes()/dim()/numel() + uint8_t sizes_strides_policy_ : 2; // The set of DispatchKeys which describe this tensor. NB: this // does NOT include Autograd (historically, it did, but diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index f7619db0d60f..e906720ba61f 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -643,6 +643,9 @@ inline DispatchKey computeDispatchKey( } return DispatchKey::CUDA; } + case DeviceType::IPU: { + return DispatchKey::IPU; + } case DeviceType::XPU: { if (isQIntType(dtype_)) { return DispatchKey::QuantizedXPU; @@ -670,8 +673,8 @@ inline DispatchKey computeDispatchKey( return DispatchKey::XLA; case DeviceType::Lazy: return DispatchKey::Lazy; - case DeviceType::MLC: - return DispatchKey::MLC; + case DeviceType::MPS: + return DispatchKey::MPS; case DeviceType::Vulkan: return DispatchKey::Vulkan; case DeviceType::Metal: @@ -680,6 +683,9 @@ inline DispatchKey computeDispatchKey( return DispatchKey::Meta; case DeviceType::HPU: return DispatchKey::HPU; + case DeviceType::PrivateUse1: { + return DispatchKey::PrivateUse1; + } default: TORCH_CHECK_NOT_IMPLEMENTED( false, @@ -716,6 +722,9 @@ inline DispatchKey computeDispatchKey( device_.type()); } case Layout::SparseCsr: + case Layout::SparseCsc: + case Layout::SparseBsr: + case Layout::SparseBsc: switch (device_.type()) { case DeviceType::CPU: return DispatchKey::SparseCsrCPU; @@ -723,7 +732,9 @@ inline DispatchKey computeDispatchKey( return DispatchKey::SparseCsrCUDA; default: AT_ERROR( - "Unsupported device type for sparse CSR layout: ", + "Unsupported device type for ", + layout_, + " layout: ", device_.type()); } default: @@ -738,9 +749,14 @@ inline Layout dispatchKeyToLayout(DispatchKey dispatch_key) { case DispatchKey::SparseHIP: case DispatchKey::SparseVE: case DispatchKey::SparseXPU: + return Layout::Sparse; case DispatchKey::SparseCsrCPU: case DispatchKey::SparseCsrCUDA: - return Layout::Sparse; + TORCH_CHECK( + false, + "Cannot map DispatchKey ", + dispatch_key, + " to a unique layout."); case DispatchKey::MkldnnCPU: return Layout::Mkldnn; default: @@ -780,19 +796,24 @@ inline DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) { return DeviceType::Meta; // stuff that people are actively developing + case DispatchKey::IPU: + case DispatchKey::AutogradIPU: + return DeviceType::IPU; case DispatchKey::XPU: 
case DispatchKey::SparseXPU: case DispatchKey::QuantizedXPU: case DispatchKey::AutogradXPU: return DeviceType::XPU; - case DispatchKey::MLC: - case DispatchKey::AutogradMLC: - return DeviceType::MLC; + case DispatchKey::MPS: + case DispatchKey::AutogradMPS: + return DeviceType::MPS; case DispatchKey::HPU: case DispatchKey::AutogradHPU: return DeviceType::HPU; case DispatchKey::ORT: return DeviceType::ORT; + case DispatchKey::PrivateUse1: + return DeviceType::PrivateUse1; default: TORCH_CHECK( false, diff --git a/c10/core/UndefinedTensorImpl.cpp b/c10/core/UndefinedTensorImpl.cpp index 0b8c9c1348b6..1c24c17b53d3 100644 --- a/c10/core/UndefinedTensorImpl.cpp +++ b/c10/core/UndefinedTensorImpl.cpp @@ -7,14 +7,13 @@ namespace c10 { UndefinedTensorImpl::UndefinedTensorImpl() : TensorImpl(DispatchKey::Undefined, caffe2::TypeMeta(), c10::nullopt) { set_storage_access_should_throw(); + // TODO: accessing the sizes on an undefined tensor is not meaningful + // and should error too, but empirically it does not! + set_sizes_strides_policy(SizesStridesPolicy::CustomStrides); } -int64_t UndefinedTensorImpl::size(int64_t d) const { - TORCH_CHECK(false, "size(dim) called on an undefined Tensor"); -} - -int64_t UndefinedTensorImpl::stride(int64_t d) const { - TORCH_CHECK(false, "stride(dim) called on an undefined Tensor"); +bool UndefinedTensorImpl::is_contiguous_custom(MemoryFormat format) const { + return is_contiguous_default(format); } #ifdef DEBUG @@ -29,10 +28,6 @@ void UndefinedTensorImpl::set_storage_offset(int64_t) { TORCH_CHECK(false, "set_storage_offset() called on an undefined Tensor"); } -IntArrayRef UndefinedTensorImpl::strides() const { - TORCH_CHECK(false, "strides() called on undefined Tensor"); -} - const char* UndefinedTensorImpl::tensorimpl_type_name() const { return "UndefinedTensorImpl"; } diff --git a/c10/core/UndefinedTensorImpl.h b/c10/core/UndefinedTensorImpl.h index fc6501850499..ddf688a569c6 100644 --- a/c10/core/UndefinedTensorImpl.h +++ b/c10/core/UndefinedTensorImpl.h @@ -18,14 +18,14 @@ struct C10_API UndefinedTensorImpl final : public TensorImpl { #endif return &_singleton; } - IntArrayRef strides() const override; - int64_t size(int64_t d) const override; - int64_t stride(int64_t d) const override; #ifdef DEBUG bool has_storage() const override; #endif void set_storage_offset(int64_t offset) override; + protected: + bool is_contiguous_custom(MemoryFormat format) const override; + private: UndefinedTensorImpl(); static UndefinedTensorImpl _singleton; diff --git a/c10/core/WrapDimMinimal.cpp b/c10/core/WrapDimMinimal.cpp new file mode 100644 index 000000000000..2dc359fc5d4f --- /dev/null +++ b/c10/core/WrapDimMinimal.cpp @@ -0,0 +1,36 @@ +#include + +namespace c10 { +namespace detail { + +int64_t maybe_wrap_dim_slow( + int64_t dim, + int64_t dim_post_expr, + bool wrap_scalar) { + if (dim_post_expr <= 0) { + TORCH_CHECK_INDEX( + wrap_scalar, + "dimension specified as ", + dim, + " but tensor has no dimensions"); + return c10::maybe_wrap_dim(dim, /*dim_post_expr=*/1, /*wrap_scalar=*/false); + } + + int64_t min = -dim_post_expr; + int64_t max = dim_post_expr - 1; + TORCH_CHECK_INDEX( + min <= dim && dim <= max, + "Dimension out of range (expected to be in range of [", + min, + ", ", + max, + "], but got ", + dim, + ")"); + + TORCH_INTERNAL_ASSERT( + false, "should never reach here as dim should be out-of-bounds"); +} + +} // namespace detail +} // namespace c10 diff --git a/c10/core/WrapDimMinimal.h b/c10/core/WrapDimMinimal.h index 01cb1c641a14..4a6f37514749 100644 --- 
a/c10/core/WrapDimMinimal.h +++ b/c10/core/WrapDimMinimal.h @@ -4,37 +4,22 @@ namespace c10 { +namespace detail { +C10_API int64_t +maybe_wrap_dim_slow(int64_t dim, int64_t dim_post_expr, bool wrap_scalar); +} + static inline int64_t maybe_wrap_dim( int64_t dim, int64_t dim_post_expr, bool wrap_scalar = true) { - if (dim_post_expr <= 0) { - if (!wrap_scalar) { - TORCH_CHECK_INDEX( - false, - "dimension specified as ", - dim, - " but tensor has no dimensions"); - } - dim_post_expr = 1; // this will make range [-1, 0] - } - - int64_t min = -dim_post_expr; - int64_t max = dim_post_expr - 1; - if (dim < min || dim > max) { - TORCH_CHECK_INDEX( - false, - "Dimension out of range (expected to be in range of [", - min, - ", ", - max, - "], but got ", - dim, - ")"); + // Inline the fast paths + if (C10_LIKELY(-dim_post_expr <= dim && dim < dim_post_expr)) { + // Branch-less version of dim + (dim < 0 ? dim_post_expr : 0) + return dim + dim_post_expr * (dim < 0); } - if (dim < 0) - dim += dim_post_expr; - return dim; + // Check edge-cases out-of-line (wrapping scalars and out-of-bounds errors) + return c10::detail::maybe_wrap_dim_slow(dim, dim_post_expr, wrap_scalar); } } // namespace c10 diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index a87f25b60eed..5a409715a622 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -117,6 +117,7 @@ struct C10_API DeviceGuardImplInterface { */ virtual Stream getStreamFromGlobalPool(Device, bool isHighPriority = false) const { + (void)isHighPriority; // Suppress unused varaible warning TORCH_CHECK(false, "Backend doesn't support acquiring a stream from pool.") } @@ -130,7 +131,7 @@ struct C10_API DeviceGuardImplInterface { /** * Destroys the given event. */ - virtual void destroyEvent(void* event, const DeviceIndex device_index) + virtual void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) const noexcept {} /** @@ -140,10 +141,10 @@ struct C10_API DeviceGuardImplInterface { * event to continue and marks that version as recorded. * */ virtual void record( - void** event, - const Stream& stream, - const DeviceIndex device_index, - const c10::EventFlag flag) const { + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const c10::EventFlag /*flag*/) const { TORCH_CHECK(false, "Backend doesn't support events."); } @@ -155,7 +156,7 @@ struct C10_API DeviceGuardImplInterface { * When the stream reaches this command it will stop processing * additional commands until that version of the event is marked as recorded. */ - virtual void block(void* event, const Stream& stream) const { + virtual void block(void* /*event*/, const Stream& /*stream*/) const { TORCH_CHECK(false, "Backend doesn't support events."); } @@ -165,7 +166,7 @@ struct C10_API DeviceGuardImplInterface { * (2) the current version is marked as recorded. * Returns false otherwise. */ - virtual bool queryEvent(void* event) const { + virtual bool queryEvent(void* /*event*/) const { TORCH_CHECK(false, "Backend doesn't support events."); } @@ -180,7 +181,7 @@ struct C10_API DeviceGuardImplInterface { * Return true if all the work previously enqueued on the stream for * asynchronous execution has completed running on the device. 
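For reference, the inlined fast path in WrapDimMinimal.h above only normalizes indices that are already in range; scalar wrapping and out-of-range errors are deferred to maybe_wrap_dim_slow. A minimal standalone restatement of that arithmetic (illustrative names, not the c10 header itself):

#include <cassert>
#include <cstdint>

// Standalone sketch of the fast path: normalize a possibly-negative
// dimension index into [0, ndim). Anything else belongs to the slow path.
inline std::int64_t wrap_dim_fast(std::int64_t dim, std::int64_t ndim) {
  assert(-ndim <= dim && dim < ndim);
  // Branch-less form of: dim < 0 ? dim + ndim : dim
  return dim + ndim * (dim < 0);
}

int main() {
  // For a 4-d tensor, -1 names the last dimension and -4 the first.
  assert(wrap_dim_fast(-1, 4) == 3);
  assert(wrap_dim_fast(-4, 4) == 0);
  assert(wrap_dim_fast(2, 4) == 2);  // in-range non-negative dims pass through unchanged
}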
*/ - virtual bool queryStream(const Stream& stream) const { + virtual bool queryStream(const Stream& /*stream*/) const { TORCH_CHECK(false, "Backend doesn't support querying streams."); } @@ -188,7 +189,7 @@ struct C10_API DeviceGuardImplInterface { * Wait (by blocking the calling thread) until all the work previously * enqueued on the stream has completed running on the device. */ - virtual void synchronizeStream(const Stream& stream) const { + virtual void synchronizeStream(const Stream& /*stream*/) const { TORCH_CHECK(false, "Backend doesn't support synchronizing streams."); } @@ -225,15 +226,15 @@ struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface { void setDevice(Device) const override { // no-op } - void uncheckedSetDevice(Device d) const noexcept override { + void uncheckedSetDevice(Device) const noexcept override { // no-op } - Stream getStream(Device d) const noexcept override { + Stream getStream(Device) const noexcept override { // no-op return Stream(Stream::DEFAULT, Device(D, -1)); } // NB: These do NOT set the current device - Stream exchangeStream(Stream s) const noexcept override { + Stream exchangeStream(Stream) const noexcept override { // no-op return Stream(Stream::DEFAULT, Device(D, -1)); } @@ -243,26 +244,26 @@ struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface { // Event-related functions void record( - void** event, - const Stream& stream, - const DeviceIndex device_index, - const EventFlag flag) const override { + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const EventFlag /*flag*/) const override { TORCH_CHECK(false, D, " backend doesn't support events."); } - void block(void* event, const Stream& stream) const override { + void block(void* /*event*/, const Stream& /*stream*/) const override { TORCH_CHECK(false, D, " backend doesn't support events.") } - bool queryEvent(void* event) const override { + bool queryEvent(void* /*event*/) const override { TORCH_CHECK(false, D, " backend doesn't support events.") } - void destroyEvent(void* event, const DeviceIndex device_index) + void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) const noexcept override {} // Stream-related functions - bool queryStream(const Stream& stream) const override { + bool queryStream(const Stream& /*stream*/) const override { return true; } - void synchronizeStream(const Stream& stream) const override { + void synchronizeStream(const Stream& /*stream*/) const override { // Don't wait for anything. } }; diff --git a/c10/core/impl/FakeGuardImpl.h b/c10/core/impl/FakeGuardImpl.h index 2d47db0fdb18..c86255220c1c 100644 --- a/c10/core/impl/FakeGuardImpl.h +++ b/c10/core/impl/FakeGuardImpl.h @@ -9,7 +9,7 @@ namespace impl { // FakeGuardImpl is hardcoded to have eight devices. Not for // any good reason, just to simplify code. -constexpr size_t kFakeGuardImplMaxDevices = 8; +constexpr DeviceIndex kFakeGuardImplMaxDevices = 8; /** * A fake implementation of DeviceGuardImplInterface suitable for testing. 
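The hunks above mostly swap named-but-unused parameters for commented-out names. A tiny illustrative sketch (not PyTorch code) of the two idioms the diff uses to keep parameters documented while avoiding -Wunused-parameter warnings:

// Idiom 1: comment out the name; the signature stays self-documenting and
// there is no unused variable to warn about.
void record_event(void* /*event*/, int /*device_index*/) {}

// Idiom 2: keep the name but explicitly mark it as intentionally unused.
void get_stream(bool is_high_priority) {
  (void)is_high_priority;
}

int main() {
  record_event(nullptr, 0);
  get_stream(true);
}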
@@ -21,7 +21,7 @@ struct FakeGuardImpl final : public DeviceGuardImplInterface { static constexpr DeviceType static_type = T; // Runtime device type is not used FakeGuardImpl(DeviceType) {} - FakeGuardImpl() {} + FakeGuardImpl() = default; DeviceType type() const override { return T; } diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 050363fc7c11..70af58b95716 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -117,6 +117,20 @@ class C10_API ExcludeDispatchKeyGuard { DispatchKeySet exclude_; }; +struct C10_API ForceDispatchKeyGuard { + public: + ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) + : saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { + c10::impl::_force_tls_local_dispatch_key_set(key_set); + } + ~ForceDispatchKeyGuard() { + c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_); + } + + private: + c10::impl::LocalDispatchKeySet saved_keyset_; +}; + // Non-RAII API for manipulating the thread-local dispatch state. // Please prefer the RAII API. The non-RAII API may be useful when // the included/excluded state of a given DispatchKey must span diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp new file mode 100644 index 000000000000..145fc56bc0d5 --- /dev/null +++ b/c10/core/impl/PyInterpreter.cpp @@ -0,0 +1,48 @@ +#include +#include + +namespace c10 { +namespace impl { + +static std::string noop_name_fn(const PyInterpreter*) { + return ""; +} + +static void noop_decref_fn(const PyInterpreter*, PyObject*, bool) { + // no-op +} + +static c10::intrusive_ptr noop_detach_fn( + const PyInterpreter*, + const TensorImpl*) { + TORCH_INTERNAL_ASSERT( + 0, + "attempted to detach (shallow_copy_and_detach) Tensor with nontrivial PyObject after corresponding interpreter died"); +} + +static void noop_dispatch_fn( + const PyInterpreter*, + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + const std::shared_ptr& type) { + TORCH_INTERNAL_ASSERT( + 0, + "attempted to dispatch (__torch_dispatch__) an operator on Tensor with nontrivial PyObject after corresponding interpreter died"); +} + +static bool noop_is_contiguous_fn(const PyInterpreter*, const TensorImpl*) { + TORCH_INTERNAL_ASSERT( + 0, + "attempted to is_contiguous Tensor with nontrivial PyObject after corresponding interpreter died"); +} + +void PyInterpreter::disarm() noexcept { + name_fn_ = &noop_name_fn; + decref_fn_ = &noop_decref_fn; + detach_fn_ = &noop_detach_fn; + dispatch_fn_ = &noop_dispatch_fn; + is_contiguous_fn_ = &noop_is_contiguous_fn; +} + +} // namespace impl +} // namespace c10 diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h new file mode 100644 index 000000000000..fb432e78c19e --- /dev/null +++ b/c10/core/impl/PyInterpreter.h @@ -0,0 +1,198 @@ +#pragma once + +#include +#include +#include +#include +#include + +// Forward declarations + +namespace c10 { +struct IValue; +class OperatorHandle; +struct TensorImpl; +struct SafePyObject; +} // namespace c10 + +namespace torch { +namespace jit { +using Stack = std::vector; +} +} // namespace torch + +// Actual implementation + +namespace c10 { +namespace impl { + +// Note [Python interpreter tag] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Traditionally, PyTorch is layered such that our Python library +// (libtorch_python) references our pure C++ library (libtorch) as the +// natural order of things. 
However, sometimes this natural order is +// subverted: C++ objects refer to Python objects (for example, we +// store a PyObject* pointer on TensorImpl so that converting from a +// C++ Tensor to a Python Tensor is just a memory dereference). +// +// These unusual orderings must be treated with care. To start, you need to +// virtualize the destructor so that the PyObject can be decref'ed on +// destruction (because the C++ object itself doesn't know anything about +// Python--remember, layering!). This process itself is fraught, since +// acquiring the GIL could lead to deadlocks if someone is blocking on you +// while holding the GIL. Furthermore, if the C++ objects outlive the +// interpreter (which can happen if you stash them in a static global +// variable defined in libtorch), you may attempt to decref the object when +// the Python interpreter has already been shutdown. +// +// BUT WAIT, IT GETS WORSE. With torchdeploy, there may be multiple Python +// interpreters in a single process. If a C++ object is accessible from +// multiple interpreters, we must take care not to accidentally pass a +// PyObject from one interpreter with another interpreter. +// +// To prevent these mixups, we introduce a PyInterpreter "tag" (object with +// a vtable), which specifies a specific Python interpreter. +// +// - Any given object can be associated with AT MOST one Python interpreter. +// We represent the interpreter tag as a memory address to an instance of +// a virtual class that is allocated once per interpreter (this is so that +// we can request the interpreter to perform operations for us, if +// necessary). +// +// - It can be recorded with a PyObject (PyInterpreterObject) so that +// we know what interpreter the object is associated with, and we can +// raise an error if you try to use the PyObject from the wrong +// interpreter context. +// +// - It contains a vtable that can be used to perform various Python +// operations from ordinary C++ code that ordinarily wouldn't be accessible +// from libtorch. +// +// A simple use case is when a C++ object must be associated with a PyObject. +// However, for TensorImpl, we lazily allocate a PyObject the first time the +// object passes into Python. The invariants for this situation are more +// subtle: +// +// - A given TensorImpl's interpreter tag can only go from uninitialized to +// tagged; once tagged, this is a quiescent state (once tagged to an +// interpreter, ALWAYS tagged to that interpreter) +// +// - A thread may mutate the PyObject field of a TensorImpl if and only if it +// holds the GIL for the interpreter tagged on the TensorImpl. (If the +// TensorImpl is not tagged, it must first atomically claim its tag before it +// can validly write) +// +// WARNING: This class has to be written very carefully, because it may be +// possible for a Tensor to have a reference an interpreter corresponding to +// a shared library that has ALREADY BEEN UNLOADED. This makes blindly calling +// virtual methods very dangerous, because the vtable may be garbage at that +// point (on a good day, you might get "pure virtual method called"). +// +// The idea to solve this problem is we always leak PyInterpreters (so they +// always stay live even after dlclose), and disarm the "virtual methods" by +// replacing them with function pointers that just no-op. This can't be done +// with a traditional C++ vtable, so we have to roll our own. 
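As a side note, the "hand-rolled vtable" idea described above (virtual-like dispatch through plain function pointers that can later be swapped for no-ops) can be sketched in isolation. The names below are hypothetical and much simpler than the real PyInterpreter, which carries several hooks and takes the interpreter pointer as its first argument:

#include <cstdio>
#include <string>

// Minimal sketch: a function-pointer "vtable" whose entries can be disarmed
// after the library that provided them is unloaded.
struct Hooks {
  using name_sig = std::string();
  name_sig* name_fn_;

  std::string name() const { return (*name_fn_)(); }
  void disarm() noexcept { name_fn_ = &noop_name; }

 private:
  static std::string noop_name() { return "<dead interpreter>"; }
};

static std::string real_name() { return "interpreter 0"; }

int main() {
  Hooks h{&real_name};
  std::printf("%s\n", h.name().c_str());  // "interpreter 0"
  h.disarm();                             // e.g. before dlclose of the provider
  std::printf("%s\n", h.name().c_str());  // "<dead interpreter>", never a dangling vtable call
}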
+// +// NB: The downside with representing PyInterpreter tags as full objects is that +// it takes an extra word on TensorImpl. If tags were instead just integer +// indices, on 64-bit architectures we could pack the tag and PyObject together +// into a single atomic word. On 32-bit architectures we could simply say that +// only one Python interpreter is supported (erroring if a nontrivial +// interpreter tag is attempted to be set). +// +// The difficulty with this scheme is we need to maintain an out-of-line table +// to get at the PyInterpreters so that we can do virtual method calls on them, +// and registration/deregistration to this table must be done in a thread safe +// manner. This can be easily done if the number of possible PyInterpreters is +// small enough (e.g., 8-bit integer) by simply preallocating an array of +// sufficient size to hold all possible interpreters. Surely 128 threads is +// more than enough for anyone! +// +// I didn't decide to do this technique at the moment, because the extra word +// added by the PyInterpreter tag takes us to 24 words, which means that we +// still fit inside three eight word cache lines. If you need to penny pinch +// another word consider doing this! + +struct C10_API PyInterpreter { + // Feel free to add as much random crap here as you need; each of these + // can be thought of as a "C++ to Python" hook. + using name_sig = std::string(const PyInterpreter*); + using decref_sig = void(const PyInterpreter*, PyObject*, bool); + using detach_sig = + c10::intrusive_ptr(const PyInterpreter*, const TensorImpl*); + using dispatch_sig = void( + const PyInterpreter*, + const c10::OperatorHandle&, + torch::jit::Stack* stack, + // This is a Tensor subclass type object + const std::shared_ptr& type); + using is_contiguous_sig = bool(const PyInterpreter*, const TensorImpl*); + + PyInterpreter( + name_sig* name_fn, + decref_sig* decref_fn, + detach_sig* detach, + dispatch_sig* dispatch, + is_contiguous_sig* is_contiguous) + : name_fn_(name_fn), + decref_fn_(decref_fn), + detach_fn_(detach), + dispatch_fn_(dispatch), + is_contiguous_fn_(is_contiguous) {} + + name_sig* name_fn_; + decref_sig* decref_fn_; + detach_sig* detach_fn_; + dispatch_sig* dispatch_fn_; + is_contiguous_sig* is_contiguous_fn_; + + // UBSAN suppression fixes: "call to function + // (anonymous namespace)::concrete_decref_fn(c10::impl::PyInterpreter const*, + // _object*) through pointer to incorrect function type 'void (*)(const + // c10::impl::PyInterpreter *, _object *)'" See + // https://github.com/google/sanitizers/issues/911 + + // Report the name of this interpreter + __ubsan_ignore_function__ std::string name() const { + return (*name_fn_)(this); + } + + // Run Py_DECREF on a PyObject. 
We DO NOT assume the GIL is held on call + // See NOTE [PyInterpreter::decref takes an `is_tensor` arg] + __ubsan_ignore_function__ void decref(PyObject* pyobj, bool is_tensor) const { + return (*decref_fn_)(this, pyobj, is_tensor); + } + + // Perform a detach by deferring to the __torch_dispatch__ implementation of + // detach, which will also arrange for the PyObject to get copied in this + // situation + __ubsan_ignore_function__ c10::intrusive_ptr detach( + const TensorImpl* self) const { + return (*detach_fn_)(this, self); + } + + // Invoke the Python boxed fallback dispatch to go back into Python + __ubsan_ignore_function__ void dispatch( + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + const std::shared_ptr& type) const { + return (*dispatch_fn_)(this, op, stack, type); + } + + __ubsan_ignore_function__ bool is_contiguous(const TensorImpl* self) const { + return (*is_contiguous_fn_)(this, self); + } + + // Disarm this PyInterpreter, making all of its methods noops. + // Because the function pointers are raw pointers (not atomics), + // a disarm() invocation that is concurrent with active destructors + // is not thread safe and will trigger TSAN. My hope is that this + // situations doesn't ever actually happen; tensor destruction should + // quiesce when a dlclose happens, and any long lived tensors whose + // destructors would be disarmed here only begin the destruction process + // on process shutdown (long after the dlclose has occurred). + void disarm() noexcept; +}; + +} // namespace impl +} // namespace c10 diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index c1ac4bd0ed0c..a098003f9501 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -177,6 +178,8 @@ struct Block { Block* prev; // prev block if split from a larger allocation Block* next; // next block if split from a larger allocation int event_count; // number of outstanding CUDA events + int gc_count; // counter for prioritizing older / less useful blocks for + // garbage collection Block( int device, @@ -193,7 +196,8 @@ struct Block { allocated(0), prev(nullptr), next(nullptr), - event_count(0) {} + event_count(0), + gc_count(0) {} // constructor for search key Block(int device, cudaStream_t stream, size_t size) @@ -206,7 +210,8 @@ struct Block { allocated(0), prev(nullptr), next(nullptr), - event_count(0) {} + event_count(0), + gc_count(0) {} bool is_split() const { return (prev != nullptr) || (next != nullptr); @@ -310,7 +315,7 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) { if (at::cuda::currentStreamCaptureStatusMayInitCtx() == at::cuda::CaptureStatus::None) { #endif - return cudaMalloc(p, size); + return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size)); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } else { // It's ok to capture cudaMallocs, as long as we never cudaFree those @@ -318,7 +323,7 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) { // Capturing cudaMalloc behaves nicely: it gives the graph new VA, // but is ignored (won't leakily allocate new memory) in replays. 
at::cuda::CUDAStreamCaptureModeGuard g{cudaStreamCaptureModeRelaxed}; - return cudaMalloc(p, size); + return C10_CUDA_ERROR_HANDLED(cudaMalloc(p, size)); } #endif } @@ -330,6 +335,17 @@ class CachingAllocatorConfig { static size_t max_split_size() { return instance().m_max_split_size; } + static double garbage_collection_threshold() { + return instance().m_garbage_collection_threshold; + } + + // This is used to round-up allocation size to nearest power of 2 divisions. + // More description below in function roundup_power2_next_division + // As ane example, if we want 4 divisions between 2's power, this can be done + // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4 + static size_t roundup_power2_divisions() { + return instance().m_roundup_power2_divisions; + } private: static CachingAllocatorConfig& instance() { @@ -342,8 +358,12 @@ class CachingAllocatorConfig { } CachingAllocatorConfig() - : m_max_split_size(std::numeric_limits::max()) {} + : m_max_split_size(std::numeric_limits::max()), + m_roundup_power2_divisions(0), + m_garbage_collection_threshold(0) {} size_t m_max_split_size; + size_t m_roundup_power2_divisions; + double m_garbage_collection_threshold; void parseArgs() { const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF"); @@ -373,6 +393,32 @@ class CachingAllocatorConfig { val2 = std::min( val2, (std::numeric_limits::max() / (1024 * 1024))); m_max_split_size = val2 * 1024 * 1024; + } else if (kv[0].compare("roundup_power2_divisions") == 0) { + size_t val2 = stoi(kv[1]); + TORCH_CHECK( + llvm::isPowerOf2_64(val2), + "For roundups, the divisons has to be power of 2 ", + ""); + m_roundup_power2_divisions = val2; + } else if (kv[0].compare("garbage_collection_threshold") == 0) { + /* + * Perform garbage collection of GPU memory blocks to avoid + * triggering expensive sync-and-reclaim-all operation. Upon setting + * the threshold (e.g., 0.8), the allocator will start reclaiming + * blocks if GPU memory capacity usage exceeds the threshold (i.e., + * 80% of total memory). + * Values 0.0 and 1.0 are not allowed as they are less meaningful. + */ + double val2 = stod(kv[1]); + TORCH_CHECK( + val2 > 0, + "garbage_collect_threshold too small, set it 0.0~1.0", + ""); + TORCH_CHECK( + val2 < 1.0, + "garbage_collect_threshold too big, set it 0.0~1.0", + ""); + m_garbage_collection_threshold = val2; } else { TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]); } @@ -469,18 +515,31 @@ class DeviceCachingAllocator { params.stat_types[static_cast(StatType::AGGREGATE)] = true; params.stat_types[static_cast(get_stat_type_for_pool(pool))] = true; + // First, try to get a block from the existing pool. bool block_found = // Search pool get_free_block(params) // Trigger callbacks and retry search - || (trigger_free_memory_callbacks(params) && get_free_block(params)) - // Attempt allocate - || alloc_block(params, false) - // Free enough available cached blocks to satisfy alloc and retry alloc. - || - (release_available_cached_blocks(params) && alloc_block(params, false)) - // Free all non-split cached blocks and retry alloc. - || (release_cached_blocks() && alloc_block(params, true)); + || (trigger_free_memory_callbacks(params) && get_free_block(params)); + + // Can't reuse an existing block; try to get a new one. + if (!block_found) { + // Do garbage collection if the flag is set. 
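The two new options parsed above are driven entirely by the PYTORCH_CUDA_ALLOC_CONF environment variable (key:value pairs, comma-separated), and the garbage-collection path is additionally gated on set_fraction, i.e. a memory fraction having been configured. A sketch of opting in; the values are illustrative, setenv() is POSIX, and the variable has to be set before the allocator first reads its config:

#include <cstdlib>

int main() {
  setenv("PYTORCH_CUDA_ALLOC_CONF",
         "roundup_power2_divisions:4,garbage_collection_threshold:0.8",
         /*overwrite=*/1);
  // ... allocate CUDA tensors afterwards: requests are rounded to power-of-2
  // divisions, and cached blocks start being reclaimed once usage exceeds
  // 80% of the allowed maximum (when a memory fraction is set).
  return 0;
}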
+ if (C10_UNLIKELY( + set_fraction && + CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + garbage_collect_cached_blocks(); + } + // Attempt allocate + block_found = alloc_block(params, false) + // Free enough available cached blocks to satisfy alloc and retry + // alloc. + || (release_available_cached_blocks(params) && + alloc_block(params, false)) + // Free all non-split cached blocks and retry alloc. + || (C10_LIKELY(captures_underway == 0) && release_cached_blocks() && + alloc_block(params, true)); + } if (!block_found) { // For any error code other than cudaErrorMemoryAllocation, @@ -699,9 +758,9 @@ class DeviceCachingAllocator { if (*largest == 0) { // make an initial guess if a zero *largest is passed in size_t tmp_bytes; - cudaMemGetInfo( + C10_CUDA_CHECK(cudaMemGetInfo( largest, // Use free memory as an optimistic initial guess of *largest - &tmp_bytes); + &tmp_bytes)); } cache_info_aux(large_blocks, total, largest); cache_info_aux(small_blocks, total, largest); @@ -808,11 +867,43 @@ class DeviceCachingAllocator { return result; } + // This function takes the size and number of divisions argument and rounds + // up the size argument for the nearest power-of-2 division. + // For example, if we need to round-up 1200 and number of divisions is 4, + // the size 1200 lies between 1024 and 2048 and if we do 4 divisions between + // them, the values are 1024, 1280, 1536, and 1792. So the function will + // return 1280 as the nearest ceiling of power-2 divison. + static size_t roundup_power2_next_division(size_t size, size_t divisions) { + if (C10_UNLIKELY(size <= 4 || divisions <= 1)) { + return size; + } + if (llvm::isPowerOf2_64(size)) { + return size; + } + + // divide the space between these 2's power into equal divisions + // If division is zero, return the power-of-2 ceiling. + size_t power2_floor = llvm::PowerOf2Floor(size); + size_t power2_divison = + power2_floor >> (63 - llvm::countLeadingZeros(divisions)); + if (C10_UNLIKELY(power2_divison == 0)) { + return (power2_floor << 1); + } + size_t round_size_floor = size & (~(power2_divison - 1)); + return (round_size_floor == size) ? size + : round_size_floor + power2_divison; + } + static size_t round_size(size_t size) { if (size < kMinBlockSize) { return kMinBlockSize; } else { - return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + auto divisions = CachingAllocatorConfig::roundup_power2_divisions(); + if (divisions > 0 && size > (kMinBlockSize * divisions)) { + return roundup_power2_next_division(size, divisions); + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } } } @@ -1037,6 +1128,15 @@ class DeviceCachingAllocator { bool get_free_block(AllocParams& p) { BlockPool& pool = *p.pool; + + if (C10_UNLIKELY( + set_fraction && + CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + // Track block reuse interval only when garbage collection is enabled. + for (auto& b : pool.blocks) { + ++b->gc_count; + } + } auto it = pool.blocks.lower_bound(&p.search_key); if (it == pool.blocks.end() || (*it)->stream != p.stream()) return false; @@ -1049,6 +1149,7 @@ class DeviceCachingAllocator { ((*it)->size >= p.size() + kLargeBuffer)) return false; p.block = *it; + (*it)->gc_count = 0; // Denote this block has been used pool.blocks.erase(it); return true; } @@ -1062,6 +1163,62 @@ class DeviceCachingAllocator { return freed_memory; } + void garbage_collect_cached_blocks() { + // Free unused cached blocks to reclaim GPU memory. 
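The worked example in the comment above (1200 with 4 divisions rounds up to 1280) can be checked with a standalone restatement of the rounding scheme. This is a sketch, not the allocator's helper, and it assumes `divisions` is a power of two, which the config parser enforces:

#include <cassert>
#include <cstddef>

static std::size_t roundup_pow2_division(std::size_t size, std::size_t divisions) {
  if (size <= 4 || divisions <= 1) {
    return size;
  }
  std::size_t floor_pow2 = 1;
  while (floor_pow2 * 2 <= size) {  // largest power of two <= size
    floor_pow2 *= 2;
  }
  if (floor_pow2 == size) {
    return size;                    // exact powers of two are left alone
  }
  std::size_t step = floor_pow2 / divisions;  // width of one division
  if (step == 0) {
    return floor_pow2 * 2;          // too many divisions: fall back to the next power of two
  }
  std::size_t down = size & ~(step - 1);      // round down to a division boundary
  return down == size ? size : down + step;
}

int main() {
  // 1200 lies between 1024 and 2048; with 4 divisions the boundaries are
  // 1024, 1280, 1536 and 1792, so 1200 rounds up to 1280.
  assert(roundup_pow2_division(1200, 4) == 1280);
  assert(roundup_pow2_division(1024, 4) == 1024);
}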
+ // Unlike release_cached_blocks(), this does not enforce synchronization and + // therefore should be of less overheads. + + size_t gc_threshold = static_cast( + CachingAllocatorConfig::garbage_collection_threshold() * + allowed_memory_maximum); + // No need to trigger GC yet + if (total_allocated_memory <= gc_threshold) { + return; + } + const auto target_size = total_allocated_memory - gc_threshold; + size_t gc_reclaimed = 0; + + // Calculate the total age of the free-able blocks. We'll use it later to + // get "avg age" threshold. + double total_age = 0.0; + int freeable_block_count = 0; + for (auto& b : large_blocks.blocks) { + if (!b->is_split()) { + total_age += b->gc_count; + ++freeable_block_count; + } + } + // No free-able blocks? + if (freeable_block_count == 0) { + return; + } + + // Repeat GC until we reach reclaim > target size. + bool block_freed = true; + while (gc_reclaimed < target_size && block_freed == true && + freeable_block_count > 0) { + // Free blocks exceeding this age threshold first. + double age_threshold = total_age / freeable_block_count; + // Stop iteration if we can no longer free a block. + block_freed = false; + + // Free blocks of > avg age. Don't stop upon reaching the target_size, + // we don't want this GC to be triggered frequently. + auto it = large_blocks.blocks.begin(); + while (it != large_blocks.blocks.end()) { + Block* block = *it; + ++it; + if (!block->is_split() && block->gc_count >= age_threshold) { + block_freed = true; + gc_reclaimed += block->size; + total_age -= block->gc_count; // Decrement the age + freeable_block_count--; // One less block that can be freed + release_block(block); + } + } + } + } + bool alloc_block(AllocParams& p, bool isRetry) { // Defensively checks for preexisting CUDA error state. C10_CUDA_CHECK(cudaGetLastError()); @@ -1304,7 +1461,7 @@ class DeviceCachingAllocator { cudaEvent_t event = e.first; Block* block = e.second; - cudaError_t err = cudaEventQuery(event); + cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(event)); if (err == cudaErrorNotReady) { // ignore and clear the error if not ready cudaGetLastError(); @@ -1422,9 +1579,9 @@ class THCCachingAllocator { fraction, ". Please set within (0, 1)."); int activated_device; - cudaGetDevice(&activated_device); + C10_CUDA_CHECK(cudaGetDevice(&activated_device)); if (activated_device != device) { - cudaSetDevice(device); + C10_CUDA_CHECK(cudaSetDevice(device)); } device_allocator[device]->setMemoryFraction(fraction); } diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index d3a73943f7bb..9b1a6ecf1590 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -102,6 +102,7 @@ struct DeviceStats { // cudaMalloc).. 
struct BlockInfo { int64_t size = 0; + int32_t gc_counter = 0; bool allocated = false; bool active = false; }; diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h index 77d0d07ac95e..ca441711cbd6 100644 --- a/c10/cuda/CUDAException.h +++ b/c10/cuda/CUDAException.h @@ -63,6 +63,26 @@ class C10_CUDA_API CUDAError : public c10::Error { } \ } while (0) +// Indicates that a CUDA error is handled in a non-standard way +#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR + +// Intentionally ignore a CUDA error +#define C10_CUDA_IGNORE_ERROR(EXPR) \ + do { \ + cudaError_t __err = EXPR; \ + if (__err != cudaSuccess) { \ + cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ + (void)error_unused; \ + } \ + } while (0) + +// Clear the last CUDA error +#define C10_CUDA_CLEAR_ERROR() \ + do { \ + cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ + (void)error_unused; \ + } while (0) + // This should be used directly after every kernel launch to ensure // the launch happened correctly and provide an early, close-to-source // diagnostic if it didn't. diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 255d798d13fb..9ab61aa1f381 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -10,16 +10,13 @@ namespace { // returns -1 on failure int32_t driver_version() { int driver_version = -1; - cudaError_t err = cudaDriverGetVersion(&driver_version); - if (err != cudaSuccess) { - cudaError_t last_err C10_UNUSED = cudaGetLastError(); - } + C10_CUDA_IGNORE_ERROR(cudaDriverGetVersion(&driver_version)); return driver_version; } int device_count_impl(bool fail_if_no_driver) { int count; - auto err = cudaGetDeviceCount(&count); + auto err = C10_CUDA_ERROR_HANDLED(cudaGetDeviceCount(&count)); if (err == cudaSuccess) { return count; } diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index 7bb97e88b991..6d17136341c6 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -111,7 +111,7 @@ class C10_CUDA_API CUDAStream { bool query() const { DeviceGuard guard{stream_.device()}; - cudaError_t err = cudaStreamQuery(stream()); + cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream())); if (err == cudaSuccess) { return true; diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h index 8f5cfdc259d3..583feeec2600 100644 --- a/c10/cuda/impl/CUDAGuardImpl.h +++ b/c10/cuda/impl/CUDAGuardImpl.h @@ -41,7 +41,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { } c10::optional uncheckedGetDevice() const noexcept { int device; - auto err = cudaGetDevice(&device); + const auto err = C10_CUDA_ERROR_HANDLED(cudaGetDevice(&device)); C10_CUDA_CHECK_WARN(err); if (err != cudaSuccess) { return c10::nullopt; @@ -164,7 +164,7 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { if (!event) return true; cudaEvent_t cuda_event = static_cast(event); - const cudaError_t err = cudaEventQuery(cuda_event); + const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event)); if (err != cudaErrorNotReady) { C10_CUDA_CHECK(err); } else { diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 28dc1df9430e..e839d2841f7c 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -1,5 +1,6 @@ #ifndef C10_MACROS_MACROS_H_ #define C10_MACROS_MACROS_H_ +#include /* Main entry for c10/macros. * @@ -331,15 +332,14 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; // CUDA_KERNEL_ASSERT checks the assertion // even when NDEBUG is defined. 
This is useful for important assertions in CUDA // code that would otherwise be suppressed when building Release. -#if defined(__ANDROID__) || defined(__APPLE__) || defined(__XROS__) || \ - (defined(USE_ROCM) && ROCM_VERSION < 40100) +#if defined(__ANDROID__) || defined(__APPLE__) || defined(USE_ROCM) // Those platforms do not support assert() #define CUDA_KERNEL_ASSERT(cond) #elif defined(_MSC_VER) #if defined(NDEBUG) extern "C" { C10_IMPORT -#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) || defined(__HIP__) +#if defined(__CUDA_ARCH__) __host__ __device__ #endif // __CUDA_ARCH__ void @@ -360,8 +360,7 @@ extern SYCL_EXTERNAL void __assert_fail( unsigned int line, const char* func); #else // __SYCL_DEVICE_ONLY__ -#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) || \ - defined(__HIP_ARCH__) || defined(__HIP__) +#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) __host__ __device__ #endif // __CUDA_ARCH__ void @@ -482,8 +481,7 @@ __host__ __device__ #endif #ifndef HAS_DEMANGLE -#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) || \ - defined(__XROS__) +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) #define HAS_DEMANGLE 0 #elif defined(__APPLE__) && \ (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) diff --git a/c10/macros/cmake_configure_file.bzl b/c10/macros/cmake_configure_file.bzl index 53ed0656739b..16d09cc9ee30 100644 --- a/c10/macros/cmake_configure_file.bzl +++ b/c10/macros/cmake_configure_file.bzl @@ -13,7 +13,7 @@ def _cmake_configure_file_impl(ctx): ) # Replace any that remain with /* #undef FOO */. - command.append("| sed --regexp-extended 's@#cmakedefine (\\w+)@/* #undef \\1 */@'") + command.append("| sed -r 's@#cmakedefine ([A-Z0-9_]+)@/* #undef \\1 */@'") command.append("> $2") ctx.actions.run_shell( diff --git a/c10/test/build.bzl b/c10/test/build.bzl index 0c6b2a5486f7..0b3a5a5f3d84 100644 --- a/c10/test/build.bzl +++ b/c10/test/build.bzl @@ -1,24 +1,55 @@ def define_targets(rules): - rules.cc_test( + rules.test_suite( name = "tests", + tests = [ + ":core_tests", + ":typeid_test", + ":util_base_tests", + ], + visibility = ["//:__pkg__"], + ) + + rules.cc_test( + name = "core_tests", size = "small", srcs = rules.glob([ - "util/*.cpp", "core/*.cpp", "core/impl/*.cpp", ]), copts = ["-Wno-deprecated-declarations"], + deps = [ + "@com_google_googletest//:gtest_main", + "//c10/core:base", + "//c10/util:base", + ], + ) + + rules.cc_test( + name = "typeid_test", + size = "small", + srcs = ["util/typeid_test.cpp"], + copts = ["-Wno-deprecated-declarations"], + deps = [ + "@com_google_googletest//:gtest_main", + "//c10/util:typeid", + ], + ) + + rules.cc_test( + name = "util_base_tests", + srcs = rules.glob( + ["util/*.cpp"], + exclude = ["util/typeid_test.cpp"], + ), + copts = ["-Wno-deprecated-declarations"], deps = [ ":Macros", ":complex_math_test_common", ":complex_test_common", "@com_google_googletest//:gtest_main", - "//c10/core:base", "//c10/macros", "//c10/util:base", - "//c10/util:typeid", ], - visibility = ["//:__pkg__"], ) rules.cc_library( diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index 43b06c110e5b..266f45882393 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -3,25 +3,163 @@ #include #include +#include using namespace c10; +// This test exists not to be comprehensive, but to more clearly show +// what the semantics of DispatchKeySet are. 
+TEST(DispatchKeySet, ShowSemantics) { + // the "CPU" dispatch key is an instance of a per-backend-functionality key. + // It corresponds to "dense" functionality, "CPU" backend. + // This means that it gets a dense functionality bit, and a cpu backend bit + // set. + auto undefined_set = DispatchKeySet(); + auto dense_cpu_set = DispatchKeySet(DispatchKey::CPU); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); + + auto dense_lazy_set = DispatchKeySet(DispatchKey::Lazy); + ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_lazy_set.has_backend(BackendComponent::LazyBit)); + ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Lazy)); + + // You can think of "Dense/Sparse", and "CPUBit/CUDABit", as "building block" + // dispatch keys. You are allowed to directly create keysets out of them! + auto dense_cpu_set_from_building_blocks = DispatchKeySet(DispatchKey::Dense) | + DispatchKeySet(BackendComponent::CPUBit); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); + ASSERT_EQ(dense_cpu_set, dense_cpu_set_from_building_blocks); + + // Similarly, the AutogradCUDA key gets 2 bits in the keyset: + // The "Autograd" functionality bit, and the "CUDA" backend bit + auto autograd_cuda = DispatchKeySet(DispatchKey::AutogradCUDA); + ASSERT_TRUE(autograd_cuda.has(DispatchKey::AutogradFunctionality)); + ASSERT_TRUE(autograd_cuda.has_backend(BackendComponent::CUDABit)); + + // Because DispatchKeySet uses a condensed internal representation, you cannot + // use it to represent the FULL cross product of backends and functionalities + // for example: + auto autograd_dense_cpu_cuda = DispatchKeySet( + {DispatchKey::AutogradFunctionality, + DispatchKey::Dense, + DispatchKey::CUDA, + DispatchKey::CPU}); + auto fpga = DispatchKeySet(DispatchKey::FPGA); + auto fpga_and_cpu = DispatchKeySet({DispatchKey::FPGA, DispatchKey::CPU}); + // this keyset has all of the building block keys: + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradFunctionality)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::Dense)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CUDABit)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CPUBit)); + + // and it also has the "runtime" keys that correspond to the full + // cross-product of functionality + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CUDA)); + + // This means that there's no way to represent a keyset with, say, only + // Autograd CUDA + Dense CPU. Instead, you should think of a keyset as + // inheriting the full set of functionalities + backends of its keys. This + // means that the below keysets are all indistinguishable from each other. 
+ ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet( + {DispatchKey::AutogradCUDA, + DispatchKey::AutogradCPU, + DispatchKey::CUDA, + DispatchKey::CPU})); + ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet({DispatchKey::AutogradCUDA, DispatchKey::CPU})); + ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCPU})); + + // ~~~~~~~~~~ DispatchKeySet iterators ~~~~~~~~~~~ + + // Iterators allow you to iterate individually through the DispatchKey's in a + // DispatchKeySet + auto empty_set = DispatchKeySet(); + auto t1 = empty_set.begin(); + auto t2 = empty_set.end(); + ASSERT_EQ(*empty_set.begin(), *empty_set.end()); + + // However, only keys that correspond to actual runtime indices of kernels in + // the operator table show up when you iterate through a keyset. i.e. + // DispatchKey::Dense, and BackendComponent::CPUBit won't show up in an + // iterator. + auto dense_cpu_iter = dense_cpu_set.begin(); + ASSERT_EQ(*dense_cpu_iter++, DispatchKey::CPU); + ASSERT_EQ(*dense_cpu_iter, *dense_cpu_set.end()); + + auto autograd_dense_cpu_cuda_iter = autograd_dense_cpu_cuda.begin(); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CPU); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CUDA); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCPU); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCUDA); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter, *autograd_dense_cpu_cuda.end()); + + // But other "functionality bits" that are not defined per-backend DO get + // their own slots in the operator table. + auto mixed_keyset = DispatchKeySet(BackendComponent::CPUBit) | + DispatchKeySet( + {DispatchKey::FPGA, // runtime key + DispatchKey::Functionalize, // runtime key + DispatchKey::Dense}); // NOT a runtime key + auto mixed_iter = mixed_keyset.begin(); + ASSERT_EQ(*mixed_iter++, DispatchKey::CPU); + ASSERT_EQ(*mixed_iter++, DispatchKey::FPGA); + ASSERT_EQ(*mixed_iter++, DispatchKey::Functionalize); + ASSERT_EQ(*mixed_iter, *mixed_keyset.end()); +} + TEST(DispatchKeySet, Empty) { DispatchKeySet empty_set; - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); + for (uint8_t i = 0; + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); i++) { auto tid = static_cast(i); + if (tid == DispatchKey::Undefined) + continue; ASSERT_FALSE(empty_set.has(tid)); } ASSERT_TRUE(empty_set.empty()); DispatchKeySet empty_set2; ASSERT_TRUE(empty_set == empty_set2); - ASSERT_EQ(empty_set.highestPriorityTypeId(), DispatchKey::Undefined); } -TEST(DispatchKeySet, Singleton) { - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); - i++) { +// This covers all keys that correspond to a single backend bit, e.g. +// BackendComponent::CPUBit. Even though these are NOT runtime keys, we still +// allow adding them directly to a keyset +TEST(DispatchKeySet, SingletonBackendComponent) { + for (const auto i : c10::irange(1, num_backends)) { + auto tid = static_cast(i); + DispatchKeySet sing(tid); + ASSERT_EQ(sing, sing); + ASSERT_EQ(sing, DispatchKeySet().add(tid)); + ASSERT_EQ(sing, sing.add(tid)); + ASSERT_EQ(sing, sing | sing); + ASSERT_FALSE(sing.empty()); + ASSERT_TRUE(sing.has(tid)); + } +} + +// This covers all keys that correspond to a single functionality bit: +// - runtime, not-per-backend functionality keys, e.g. +// DispatchKey::FuncTorchBatched +// - runtime, "fake backend" keys, e.g. DispatchKey::FPGA +// - NOT-runtime, per-backend functionality keys, e.g. 
DispatchKey::Dense +// Even though it's not a runtime key, we still allow adding it directly to a +// keyset. +// DispatchKey:: +TEST(DispatchKeySet, SingletonFunctionalityKeys) { + for (const auto i : c10::irange(1, num_functionality_keys)) { auto tid = static_cast(i); DispatchKeySet sing(tid); ASSERT_EQ(sing, sing); @@ -30,47 +168,147 @@ TEST(DispatchKeySet, Singleton) { ASSERT_EQ(sing, sing | sing); ASSERT_FALSE(sing.empty()); ASSERT_TRUE(sing.has(tid)); - ASSERT_EQ(sing.highestPriorityTypeId(), tid); ASSERT_EQ(sing.remove(tid), DispatchKeySet()); } } -TEST(DispatchKeySet, Doubleton) { - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); +// This covers runtime keys that are per-backend, +// and take up more than one bit in a DispatchKeySet. They take up one +// functionality bit + one backend bit. e.g. CPU, CUDA, SparseCPU, SparseCUDA, +// AutogradCPU, AutogradCUDA +TEST(DispatchKeySet, SingletonPerBackendFunctionalityKeys) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + i++) { + auto tid = static_cast(i); + // Skip these because they aren't real keys. + if (tid == DispatchKey::StartOfDenseBackends || + tid == DispatchKey::StartOfSparseBackends || + tid == DispatchKey::StartOfQuantizedBackends || + tid == DispatchKey::StartOfAutogradBackends) { + continue; + } + DispatchKeySet sing(tid); + ASSERT_EQ(sing, sing); + ASSERT_EQ(sing, DispatchKeySet().add(tid)); + ASSERT_EQ(sing, sing.add(tid)); + ASSERT_EQ(sing, sing | sing); + ASSERT_FALSE(sing.empty()); + ASSERT_TRUE(sing.has(tid)); + + auto functionality_key = toFunctionalityKey(tid); + auto backend_key = toBackendComponent(tid); + // These two sets should be equivalent: + // DispatchKeySet(DispatchKey::CPU) + // DispatchKeySet({DispatchKey::Dense, BackendComponent::CPUBit}) + auto expected_ks = + DispatchKeySet(functionality_key) | DispatchKeySet(backend_key); + ASSERT_EQ(sing, expected_ks); + // These two sets should be equivalent: + // DispatchKeySet(DispatchKey::CPU).remove(DispatchKey::Dense) + // DispatchKeySet(BackendComponent::CPUBit) + expected_ks = DispatchKeySet(toBackendComponent(tid)); + ASSERT_EQ(sing.remove(tid), expected_ks); + } +} + +TEST(DispatchKeySet, DoubletonPerBackend) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); i++) { for (uint8_t j = i + 1; - j < static_cast(DispatchKey::NumDispatchKeys); + j <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); j++) { ASSERT_LT(i, j); auto tid1 = static_cast(i); auto tid2 = static_cast(j); - auto doub = DispatchKeySet(tid1).add(tid2); - ASSERT_EQ(doub, DispatchKeySet(tid1) | DispatchKeySet(tid2)); - ASSERT_TRUE(doub.has(tid1)); - ASSERT_TRUE(doub.has(tid2)); - ASSERT_EQ(doub.highestPriorityTypeId(), tid2); // relies on i < j + + // Skip these because they aren't real keys. 
+ if (tid1 == DispatchKey::StartOfDenseBackends || + tid1 == DispatchKey::StartOfSparseBackends || + tid1 == DispatchKey::StartOfQuantizedBackends || + tid1 == DispatchKey::StartOfNestedTensorBackends || + tid1 == DispatchKey::StartOfAutogradBackends) + continue; + if (tid2 == DispatchKey::StartOfDenseBackends || + tid2 == DispatchKey::StartOfSparseBackends || + tid2 == DispatchKey::StartOfQuantizedBackends || + tid2 == DispatchKey::StartOfNestedTensorBackends || + tid2 == DispatchKey::StartOfAutogradBackends) + continue; + + auto backend1 = toBackendComponent(tid1); + auto backend2 = toBackendComponent(tid2); + auto functionality1 = toFunctionalityKey(tid1); + auto functionality2 = toFunctionalityKey(tid2); + + auto combined = DispatchKeySet({tid1, tid2}); + // The combined set has the backend bits + ASSERT_TRUE(combined.has_backend(backend1)); + ASSERT_TRUE(combined.has_backend(backend2)); + // and it has the backend bits + ASSERT_TRUE(combined.has(functionality1)); + ASSERT_TRUE(combined.has(functionality2)); + // and it has the original two runtime keys + ASSERT_TRUE(combined.has(tid1)); + ASSERT_TRUE(combined.has(tid2)); + + // Add all of the keys in the keyset to a real set + std::unordered_set visited_keys; + auto iter = combined.begin(); + while (*iter != *combined.end()) { + visited_keys.insert(*iter); + ++iter; + } + std::unordered_set expected_keys; + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality1, backend1)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality1, backend2)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality2, backend1)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality2, backend2)); + ASSERT_EQ(expected_keys, visited_keys); + + if (backend1 == backend2 || functionality1 == functionality2) { + // We have two runtime keys, with either the same backend or the same + // per-backend functionalities. E.g. {AutogradCUDA, CUDA} or + // {AutogradCPU, AutogradCUDA} There should be 2 total runtime keys in + // this set. + ASSERT_EQ(2, visited_keys.size()); + } else { + // since i and j are different keys, they should not have the same + // functionality and backend + ASSERT_TRUE(backend1 != backend2 && functionality1 != functionality2); + // We have two runtime keys, that have different backends + per-backend + // functionalities. So we should expect the full cross product of + // runtime keys to be in the set. e.g. 
if i = AutogradCUDA, and j = CPU, + // then combined = {AutogradCUDA, AutogradCPU, CUDA, CPU} + ASSERT_EQ(4, visited_keys.size()); + } } } } TEST(DispatchKeySet, Full) { DispatchKeySet full(DispatchKeySet::FULL); - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); - i++) { + for (const auto i : c10::irange(1, num_functionality_keys)) { auto tid = static_cast(i); ASSERT_TRUE(full.has(tid)); } + ASSERT_FALSE(full.has(DispatchKey::EndOfFunctionalityKeys)); } TEST(DispatchKeySet, IteratorBasicOps) { DispatchKeySet empty_set; DispatchKeySet full_set(DispatchKeySet::FULL); - DispatchKeySet mutated_set = empty_set.add(static_cast(1)); + DispatchKeySet mutated_set = empty_set.add(DispatchKey::CPU); // Constructor + Comparison - ASSERT_EQ(*empty_set.begin(), DispatchKey::NumDispatchKeys); - ASSERT_EQ(*empty_set.end(), DispatchKey::NumDispatchKeys); - ASSERT_EQ(*mutated_set.begin(), static_cast(1)); + ASSERT_EQ(*empty_set.begin(), DispatchKey::EndOfFunctionalityKeys); + ASSERT_EQ(*empty_set.end(), DispatchKey::EndOfFunctionalityKeys); + ASSERT_EQ(*mutated_set.begin(), DispatchKey::CPU); ASSERT_TRUE(empty_set.begin() == empty_set.end()); ASSERT_TRUE(full_set.begin() != full_set.end()); @@ -80,6 +318,25 @@ TEST(DispatchKeySet, IteratorBasicOps) { ASSERT_TRUE(full_set.begin() != ++full_set.begin()); } +TEST(DispatchKeySet, getHighestPriorityBackendTypeId) { + // AutogradCPU isn't a backend key so it is ignored + DispatchKeySet dense_cpu({DispatchKey::AutogradCPU, DispatchKey::CPU}); + ASSERT_EQ(DispatchKey::CPU, c10::highestPriorityBackendTypeId(dense_cpu)); + + // Functionalize isn't a backend key so it is ignored + DispatchKeySet sparse_cuda( + {DispatchKey::Functionalize, DispatchKey::SparseCUDA}); + ASSERT_EQ( + DispatchKey::SparseCUDA, c10::highestPriorityBackendTypeId(sparse_cuda)); + + // quantizedCUDA has higher priority than CUDA + DispatchKeySet quantized_cuda( + {DispatchKey::CUDA, DispatchKey::QuantizedCUDA}); + ASSERT_EQ( + DispatchKey::QuantizedCUDA, + c10::highestPriorityBackendTypeId(quantized_cuda)); +} + TEST(DispatchKeySet, IteratorEmpty) { DispatchKeySet empty_set; uint8_t i = 0; @@ -90,16 +347,37 @@ TEST(DispatchKeySet, IteratorEmpty) { ASSERT_EQ(i, 0); } +TEST(DispatchKeySet, IteratorCrossProduct) { + // The iterator should return all runtime keys in the set, + // including the cross product of {backends} x {functionalities} + auto ks = + DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit}) | + DispatchKeySet( + {DispatchKey::Dense, + DispatchKey::FPGA, + DispatchKey::AutogradFunctionality}); + + auto iter = ks.begin(); + // iterate through dense backends first. + ASSERT_EQ(DispatchKey::CPU, *(iter++)); + ASSERT_EQ(DispatchKey::CUDA, *(iter++)); + // FPGA doesn't have a backend bit, so it isn't included in the cross product. + ASSERT_EQ(DispatchKey::FPGA, *(iter++)); + // iterate through the autograd keys laster. + ASSERT_EQ(DispatchKey::AutogradCPU, *(iter++)); + ASSERT_EQ(DispatchKey::AutogradCUDA, *(iter++)); +} + TEST(DispatchKeySet, IteratorFull) { DispatchKeySet full_set(DispatchKeySet::FULL); uint8_t i = 0; for (const auto& it : full_set) { i++; - ASSERT_TRUE(it == static_cast(i)); - ASSERT_TRUE(it != DispatchKey::NumDispatchKeys); } - ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); + // Total # of runtime entries includes an entry for DispatchKey::Undefined, + // which is not included when iterating through the DispatchKeySet. 
+ ASSERT_EQ(i, num_runtime_entries - 1); } TEST(DispatchKeySet, IteratorRangeFull) { @@ -108,41 +386,61 @@ TEST(DispatchKeySet, IteratorRangeFull) { for (DispatchKey dispatch_key : full_set) { i++; - ASSERT_TRUE(dispatch_key == static_cast(i)); } - ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); -} - -TEST(DispatchKeySet, SpecificKeys) { - DispatchKeySet keyset({ - static_cast(0), // Undefined should be ignored - static_cast(4), - static_cast(10), - static_cast(15), - }); - std::unordered_set visited_keys; - - for (DispatchKey key : keyset) { - visited_keys.insert(key); - } - - ASSERT_EQ(visited_keys.size(), 3); - ASSERT_TRUE( - visited_keys.find(static_cast(4)) != visited_keys.end()); - ASSERT_TRUE( - visited_keys.find(static_cast(10)) != visited_keys.end()); - ASSERT_TRUE( - visited_keys.find(static_cast(15)) != visited_keys.end()); + // Total # of runtime entries includes an entry for DispatchKey::Undefined, + // which is not included when iterating through the DispatchKeySet. + ASSERT_EQ(i, num_runtime_entries - 1); } TEST(DispatchKeySet, FailAtEndIterator) { DispatchKeySet full_set(DispatchKeySet::FULL); uint64_t raw_repr = full_set.raw_repr(); + // doesn't throw + DispatchKeySet::iterator(&raw_repr, num_backends + num_functionality_keys); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) EXPECT_THROW( DispatchKeySet::iterator( - &raw_repr, static_cast(DispatchKey::NumDispatchKeys) + 1), + &raw_repr, num_backends + num_functionality_keys + 1), c10::Error); } + +TEST(DispatchKeySet, TestKeyOrderingInvariants) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + i++) { + auto k = static_cast(i); + // Note [The Ordering of Per-Backend Dispatch Keys Matters!] + // The DispatchKey enum includes all of the runtime keys for + // Dense/Sparse/Quantized/Autograd, (e.g. CPU, CUDA, SparseCPU, SparseCUDA, + // AutogradCPU, AutogradCUDA, etc). And we expect the ordering of those keys + // to be the same as the ordering of the backends in the `BackendComponent` + // enum. This makes several utilities in `DispatchKey.h` and + // `DispatchKeySet.h` significantly easier to implement. The purpose of the + // test is to assert (through CI) that this invariant is maintained. + // + // The only way that we can really check this invariant is by + // comparing the string names of each enum. + // We only really care about the ordering for "real" keys that are actually + // used, which we expect to be able to print properly. This saves us from + // having to enumerate the full set of possible runtime keys in + // DispatchKey::toString(). It also relies on toString() being implemented + // correctly. + auto functionality_str = std::string(toString(k)); + if (functionality_str == "UNKNOWN_TENSOR_TYPE_ID") + continue; + + auto computed_backend_k = toBackendComponent(k); + auto computed_backend_str = std::string(toString(computed_backend_k)); + // Skip, e.g., the "Bit" from "CPUBit" + computed_backend_str = + computed_backend_str.substr(0, computed_backend_str.size() - 3); + + ASSERT_TRUE( + functionality_str.find(computed_backend_str) != std::string::npos) + << "DispatchKey invariant broken! Found a key that is not ordered correctly" + << " with its backend bit. 
key = " << toString(k) << ", " << k + << ", computed backend = " << toString(computed_backend_k); + } +} diff --git a/c10/test/util/DeadlockDetection_test.cpp b/c10/test/util/DeadlockDetection_test.cpp new file mode 100644 index 000000000000..35c4953f6d33 --- /dev/null +++ b/c10/test/util/DeadlockDetection_test.cpp @@ -0,0 +1,31 @@ +#include + +#include + +#include + +using namespace ::testing; +using namespace c10::impl; + +struct DummyPythonGILHooks : public PythonGILHooks { + bool check_python_gil() const override { + return true; + } +}; + +TEST(DeadlockDetection, basic) { + ASSERT_FALSE(check_python_gil()); + DummyPythonGILHooks hooks; + SetPythonGILHooks(&hooks); + ASSERT_TRUE(check_python_gil()); + SetPythonGILHooks(nullptr); +} + +#ifndef _WIN32 +TEST(DeadlockDetection, disable) { + setenv("TORCH_DISABLE_DEADLOCK_DETECTION", "1", 1); + DummyPythonGILHooks hooks; + SetPythonGILHooks(&hooks); + SetPythonGILHooks(&hooks); +} +#endif diff --git a/c10/test/util/Synchronized_test.cpp b/c10/test/util/Synchronized_test.cpp new file mode 100644 index 000000000000..ce781a10cadb --- /dev/null +++ b/c10/test/util/Synchronized_test.cpp @@ -0,0 +1,43 @@ +#include +#include + +#include +#include + +namespace { + +TEST(Synchronized, TestSingleThreadExecution) { + c10::Synchronized iv(0); + const int kMaxValue = 100; + for (int i = 0; i < kMaxValue; ++i) { + auto ret = iv.withLock([](int& iv) { return ++iv; }); + EXPECT_EQ(ret, i + 1); + } + + iv.withLock([kMaxValue](int& iv) { EXPECT_EQ(iv, kMaxValue); }); +} + +TEST(Synchronized, TestMultiThreadedExecution) { + c10::Synchronized iv(0); +#define NUM_LOOP_INCREMENTS 10000 + + auto thread_cb = [&iv]() { + for (int i = 0; i < NUM_LOOP_INCREMENTS; ++i) { + iv.withLock([](int& iv) { ++iv; }); + } + }; + + std::array threads; + for (auto& t : threads) { + t = std::thread(thread_cb); + } + + for (auto& t : threads) { + t.join(); + } + + iv.withLock([](int& iv) { EXPECT_EQ(iv, NUM_LOOP_INCREMENTS * 10); }); +#undef NUM_LOOP_INCREMENTS +} + +} // namespace diff --git a/c10/test/util/ordered_preserving_dict_test.cpp b/c10/test/util/ordered_preserving_dict_test.cpp index 773b2e7a2a35..aa1d7f0f986e 100644 --- a/c10/test/util/ordered_preserving_dict_test.cpp +++ b/c10/test/util/ordered_preserving_dict_test.cpp @@ -48,7 +48,7 @@ dict_int_int test_dict(dict_int_int& dict) { } dict.erase(begin, end); - std::vector order; + std::vector order; for (const auto i : c10::irange(100)) { if (!erase_set.count(i)) { order.push_back(i); @@ -211,12 +211,12 @@ TEST(OrderedPreservingDictTest, test_range_erase) { using HMap = ska_ordered::order_preserving_flat_hash_map; - const std::size_t nb_values = 1000; + const int64_t nb_values = 1000; HMap map; for (const auto i : c10::irange(nb_values)) { map[c10::guts::to_string(i)] = i; auto begin = map.begin(); - for (size_t j = 0; j <= i; ++j, begin++) { + for (int64_t j = 0; j <= i; ++j, begin++) { TORCH_INTERNAL_ASSERT(begin->second == j); } } diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 0602404b5f05..4d45c5e6c413 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -25,7 +25,6 @@ #include namespace c10 { - /// ArrayRef - Represent a constant reference to an array (0 or more elements /// consecutively in memory), i.e. a start pointer and a length. It allows /// various APIs to take consecutive elements easily and conveniently. @@ -92,7 +91,6 @@ class ArrayRef final { debugCheckNullptrInvariant(); } - /// Construct an ArrayRef from a generic Container. 
template < typename Container, typename = std::enable_if_t typename std::enable_if< - !std::is_array::value && !std::is_array::value && + !std::is_array::value && !std::is_array::value && std::is_base_of::value, std::unique_ptr>::type make_unique_base(Args&&... args) { diff --git a/c10/util/DeadlockDetection.cpp b/c10/util/DeadlockDetection.cpp index d95a72f95553..bb95939fc53f 100644 --- a/c10/util/DeadlockDetection.cpp +++ b/c10/util/DeadlockDetection.cpp @@ -1,11 +1,17 @@ #include +#include + namespace c10 { namespace impl { namespace { PythonGILHooks* python_gil_hooks = nullptr; + +bool disable_detection() { + return std::getenv("TORCH_DISABLE_DEADLOCK_DETECTION") != nullptr; } +} // namespace bool check_python_gil() { if (!python_gil_hooks) { @@ -15,6 +21,9 @@ bool check_python_gil() { } void SetPythonGILHooks(PythonGILHooks* hooks) { + if (disable_detection()) { + return; + } TORCH_INTERNAL_ASSERT(!hooks || !python_gil_hooks); python_gil_hooks = hooks; } diff --git a/c10/util/DimVector.h b/c10/util/DimVector.h new file mode 100644 index 000000000000..fea1651a46c0 --- /dev/null +++ b/c10/util/DimVector.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include + +namespace c10 { + +constexpr size_t kDimVectorStaticSize = 5; + +/// A container for sizes or strides +using DimVector = SmallVector; + +} // namespace c10 diff --git a/c10/util/Exception.h b/c10/util/Exception.h index 0eb0c6a80bf1..327e4cbfabd1 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -314,13 +314,13 @@ C10_API std::string GetExceptionString(const std::exception& e); // (unlike assert()). // #ifdef STRIP_ERROR_MESSAGES -#define TORCH_INTERNAL_ASSERT(cond, ...) \ - if (C10_UNLIKELY_OR_CONST(!(cond))) { \ - ::c10::detail::torchCheckFail( \ - __func__, \ - __FILE__, \ - static_cast(__LINE__), \ - #cond "INTERNAL ASSERT FAILED at" C10_STRINGIZE(__FILE__)); \ +#define TORCH_INTERNAL_ASSERT(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__)); \ } #else // It would be nice if we could build a combined string literal out of @@ -328,16 +328,16 @@ C10_API std::string GetExceptionString(const std::exception& e); // as the first argument, but there doesn't seem to be any good way to // do that while still supporting having a first argument that isn't a // string literal. -#define TORCH_INTERNAL_ASSERT(cond, ...) \ - if (C10_UNLIKELY_OR_CONST(!(cond))) { \ - ::c10::detail::torchInternalAssertFail( \ - __func__, \ - __FILE__, \ - static_cast(__LINE__), \ - #cond \ - "INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__) ":" C10_STRINGIZE( \ - __LINE__) ", please report a bug to PyTorch. ", \ - c10::str(__VA_ARGS__)); \ +#define TORCH_INTERNAL_ASSERT(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchInternalAssertFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond \ + " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__) ":" C10_STRINGIZE( \ + __LINE__) ", please report a bug to PyTorch. ", \ + c10::str(__VA_ARGS__)); \ } #endif @@ -375,7 +375,7 @@ C10_API std::string GetExceptionString(const std::exception& e); namespace c10 { namespace detail { template -decltype(auto) torchCheckMsgImpl(const char* msg, const Args&... args) { +decltype(auto) torchCheckMsgImpl(const char* /*msg*/, const Args&... 
args) { return ::c10::str(args...); } inline C10_API const char* torchCheckMsgImpl(const char* msg) { @@ -383,7 +383,7 @@ inline C10_API const char* torchCheckMsgImpl(const char* msg) { } // If there is just 1 user-provided C-string argument, use it. inline C10_API const char* torchCheckMsgImpl( - const char* msg, + const char* /*msg*/, const char* args) { return args; } @@ -433,7 +433,7 @@ namespace detail { const char* file, uint32_t line, const char* condMsg, - ::c10::detail::CompileTimeEmptyString userMsg) { + ::c10::detail::CompileTimeEmptyString /*userMsg*/) { torchCheckFail(func, file, line, condMsg); } [[noreturn]] C10_API void torchInternalAssertFail( diff --git a/c10/util/Half-inl.h b/c10/util/Half-inl.h index 3e2b5071a549..b438f4a01452 100644 --- a/c10/util/Half-inl.h +++ b/c10/util/Half-inl.h @@ -12,7 +12,7 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ +#ifdef SYCL_LANGUAGE_VERSION #include #endif @@ -56,6 +56,15 @@ inline C10_HOST_DEVICE Half::operator __half() const { } #endif +#ifdef SYCL_LANGUAGE_VERSION +inline C10_HOST_DEVICE Half::Half(const sycl::half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator sycl::half() const { + return *reinterpret_cast(&x); +} +#endif + // CUDA intrinsics #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)) || \ @@ -88,6 +97,8 @@ inline C10_HOST_DEVICE Half operator-(const Half& a) { #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ defined(__HIP_DEVICE_COMPILE__) return __hneg(a); +#elif defined(__SYCL_DEVICE_ONLY__) + return -static_cast(a); #else return -static_cast(a); #endif diff --git a/c10/util/Half.h b/c10/util/Half.h index 517f5807d557..dc51f032fdc2 100644 --- a/c10/util/Half.h +++ b/c10/util/Half.h @@ -45,6 +45,10 @@ #include #endif +#ifdef SYCL_LANGUAGE_VERSION +#include +#endif + // Standard check for compiling CUDA with clang #if defined(__clang__) && defined(__CUDA__) && defined(__CUDA_ARCH__) #define C10_DEVICE_HOST_FUNCTION __device__ __host__ @@ -390,29 +394,59 @@ struct alignas(2) Half { inline C10_HOST_DEVICE Half(const __half& value); inline C10_HOST_DEVICE operator __half() const; #endif +#ifdef SYCL_LANGUAGE_VERSION + inline C10_HOST_DEVICE Half(const sycl::half& value); + inline C10_HOST_DEVICE operator sycl::half() const; +#endif }; -// This is just a placeholder for whatever complex representation we -// end up deciding to use for half-precision complex numbers. 
+// TODO : move to complex.h template <> struct alignas(4) complex { - using value_type = Half; Half real_; Half imag_; + + // Constructors complex() = default; - Half real() const { + // Half constructor is not constexpr so the following constructor can't + // be constexpr + C10_HOST_DEVICE explicit inline complex(const Half& real, const Half& imag) + : real_(real), imag_(imag) {} + C10_HOST_DEVICE inline complex(const c10::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + // Conversion operator + inline C10_HOST_DEVICE operator c10::complex() const { + return {real_, imag_}; + } + + constexpr C10_HOST_DEVICE Half real() const { return real_; } - Half imag() const { + constexpr C10_HOST_DEVICE Half imag() const { return imag_; } - explicit inline complex(c10::complex value) - : real_(value.real()), imag_(value.imag()) {} - explicit inline complex(c10::complex value) - : real_(static_cast(value.real())), - imag_(static_cast(value.imag())) {} - inline operator c10::complex() const { - return {real_, imag_}; + + complex& operator+=(const complex& other) { + real_ = static_cast(real_) + static_cast(other.real_); + imag_ = static_cast(imag_) + static_cast(other.imag_); + return *this; + } + + complex& operator-=(const complex& other) { + real_ = static_cast(real_) - static_cast(other.real_); + imag_ = static_cast(imag_) - static_cast(other.imag_); + return *this; + } + + complex& operator*=(const complex& other) { + auto a = static_cast(real_); + auto b = static_cast(imag_); + auto c = static_cast(other.real()); + auto d = static_cast(other.imag()); + real_ = a * c - b * d; + imag_ = a * d + b * c; + return *this; } }; @@ -442,7 +476,7 @@ struct alignas(4) complex { // for `f > limit::max()` below template typename std::enable_if::value, bool>::type overflows( - From f) { + From /*f*/) { return false; } diff --git a/c10/util/LeftRight.h b/c10/util/LeftRight.h index 13529f2ea0c7..e45267cb8f7e 100644 --- a/c10/util/LeftRight.h +++ b/c10/util/LeftRight.h @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -192,13 +193,9 @@ class LeftRight final { // read-write lock to protect T (data). template class RWSafeLeftRightWrapper final { - using mutexType = std::mutex; - using rLockType = std::unique_lock; - using wLockType = std::unique_lock; - public: template - explicit RWSafeLeftRightWrapper(const Args&... args) : _data{args...} {} + explicit RWSafeLeftRightWrapper(const Args&... args) : data_{args...} {} // RWSafeLeftRightWrapper is not copyable or moveable since LeftRight // is not copyable or moveable. 
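// The complex<Half> operator*= added above follows a promote-compute-narrow
// pattern: widen both operands to float, apply (a+bi)(c+di) = (ac-bd) + (ad+bc)i,
// then narrow the result back to Half. A standalone sketch of the same pattern,
// using float as the narrow type and double as the wide type in place of
// Half/float; ComplexLo is an illustrative name, not a c10 type.
#include <iostream>

struct ComplexLo {
  float real_;
  float imag_;

  ComplexLo& operator*=(const ComplexLo& other) {
    // Promote to the wider type before doing the arithmetic.
    const double a = real_, b = imag_;
    const double c = other.real_, d = other.imag_;
    real_ = static_cast<float>(a * c - b * d);
    imag_ = static_cast<float>(a * d + b * c);
    return *this;
  }
};

int main() {
  ComplexLo x{1.5f, -2.0f};
  const ComplexLo y{0.5f, 3.0f};
  x *= y;  // (1.5 - 2i)(0.5 + 3i) = 6.75 + 3.5i
  std::cout << x.real_ << " + " << x.imag_ << "i\n";
}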
@@ -209,19 +206,17 @@ class RWSafeLeftRightWrapper final { template auto read(F&& readFunc) const -> typename std::result_of::type { - rLockType lock(mutex_); - return readFunc(_data); + return data_.withLock( + [&readFunc](T const& data) { return readFunc(data); }); } template auto write(F&& writeFunc) -> typename std::result_of::type { - wLockType lock(mutex_); - return writeFunc(_data); + return data_.withLock([&writeFunc](T& data) { return writeFunc(data); }); } private: - T _data; - mutable mutexType mutex_; + c10::Synchronized data_; }; } // namespace c10 diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index c1ede582ff2a..fe74e4954864 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -145,8 +145,13 @@ bool LogAPIUsageFakeReturn(const std::string& event) try { // static destructor race return true; } -} // namespace detail +namespace { + +void setLogLevelFlagFromEnv(); + +} // namespace +} // namespace detail } // namespace c10 #if defined(C10_USE_GFLAGS) && defined(C10_USE_GLOG) @@ -198,23 +203,39 @@ bool IsGoogleLoggingInitialized(); } // namespace google namespace c10 { -bool InitCaffeLogging(int* argc, char** argv) { - if (*argc == 0) - return true; +namespace { + +void initGoogleLogging(char const* name) { #if !defined(_MSC_VER) // This trick can only be used on UNIX platforms if (!::google::glog_internal_namespace_::IsGoogleLoggingInitialized()) #endif { - ::google::InitGoogleLogging(argv[0]); + ::google::InitGoogleLogging(name); #if !defined(_MSC_VER) // This is never defined on Windows -#if !defined(__XROS__) ::google::InstallFailureSignalHandler(); -#endif #endif } +} + +} // namespace + +void initLogging() { + detail::setLogLevelFlagFromEnv(); + + UpdateLoggingLevelsFromFlags(); +} + +bool InitCaffeLogging(int* argc, char** argv) { + if (*argc == 0) { + return true; + } + + initGoogleLogging(argv[0]); + UpdateLoggingLevelsFromFlags(); + return true; } @@ -254,6 +275,11 @@ C10_DEFINE_int( "The minimum log level that caffe2 will output."); namespace c10 { + +void initLogging() { + detail::setLogLevelFlagFromEnv(); +} + bool InitCaffeLogging(int* argc, char** argv) { // When doing InitCaffeLogging, we will assume that caffe's flag parser has // already finished. @@ -356,3 +382,53 @@ MessageLogger::~MessageLogger() { } // namespace c10 #endif // !C10_USE_GLOG + +namespace c10 { +namespace detail { +namespace { + +void setLogLevelFlagFromEnv() { + const char* level_str = std::getenv("TORCH_CPP_LOG_LEVEL"); + + // Not set, fallback to the default level (i.e. WARNING). + std::string level{level_str != nullptr ? level_str : ""}; + if (level.empty()) { + return; + } + + std::transform( + level.begin(), level.end(), level.begin(), [](unsigned char c) { + return toupper(c); + }); + + if (level == "0" || level == "INFO") { + FLAGS_caffe2_log_level = 0; + + return; + } + if (level == "1" || level == "WARNING") { + FLAGS_caffe2_log_level = 1; + + return; + } + if (level == "2" || level == "ERROR") { + FLAGS_caffe2_log_level = 2; + + return; + } + if (level == "3" || level == "FATAL") { + FLAGS_caffe2_log_level = 3; + + return; + } + + std::cerr + << "`TORCH_CPP_LOG_LEVEL` environment variable cannot be parsed. Valid values are " + "`INFO`, `WARNING`, `ERROR`, and `FATAL` or their numerical equivalents `0`, `1`, " + "`2`, and `3`." 
+ << std::endl; +} + +} // namespace +} // namespace detail +} // namespace c10 diff --git a/c10/util/Logging.h b/c10/util/Logging.h index fd78a21fc594..e2ed61de606f 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -80,7 +80,7 @@ C10_API void UpdateLoggingLevelsFromFlags(); const char* file, const int line, const char* condition, - detail::CompileTimeEmptyString msg, + detail::CompileTimeEmptyString /*msg*/, const void* caller = nullptr) { ThrowEnforceNotMet(file, line, condition, "", caller); } @@ -103,7 +103,7 @@ C10_API void UpdateLoggingLevelsFromFlags(); const char* file, const int line, const char* condition, - detail::CompileTimeEmptyString msg, + detail::CompileTimeEmptyString /*msg*/, const void* caller = nullptr) { ThrowEnforceFiniteNotMet(file, line, condition, "", caller); } @@ -305,6 +305,9 @@ namespace detail { C10_API bool LogAPIUsageFakeReturn(const std::string& context); } // namespace detail +// Initializes the c10 logger. +C10_API void initLogging(); + } // namespace c10 #endif // C10_UTIL_LOGGING_H_ diff --git a/c10/util/MaybeOwned.h b/c10/util/MaybeOwned.h index a3028f22ea18..a698e275c119 100644 --- a/c10/util/MaybeOwned.h +++ b/c10/util/MaybeOwned.h @@ -24,7 +24,7 @@ struct MaybeOwnedTraitsGenericImpl { lhs = rhs; } - static void destroyBorrow(borrow_type& toDestroy) {} + static void destroyBorrow(borrow_type& /*toDestroy*/) {} static const owned_type& referenceFromBorrow(const borrow_type& borrow) { return *borrow; diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index 30f6d7c590a5..1f7fcf363f39 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -398,7 +398,7 @@ template < index::value, int> = 0> decltype(auto) extract_tuple_element_by_index( HeadTuple&& head_tuple, - TailTuples&&... tail_tuples) { + TailTuples&&... /*tail_tuples*/) { // TODO if constexpr instead of enable_if return std::get(std::forward(head_tuple)); } @@ -409,7 +409,7 @@ template < class... TailTuples, std::enable_if_t= std::tuple_size::value, int> = 0> decltype(auto) extract_tuple_element_by_index( - HeadTuple&& head_tuple, + HeadTuple&& /*head_tuple*/, TailTuples&&... tail_tuples) { // TODO if constexpr instead of enable_if return extract_tuple_element_by_index< diff --git a/c10/util/Optional.h b/c10/util/Optional.h index e81911296bc9..17f4d5a8007f 100644 --- a/c10/util/Optional.h +++ b/c10/util/Optional.h @@ -12,7 +12,7 @@ // C10 // - Move file to `c10` namespace. // - Remove macro use in line 478 because the nvcc device compiler cannot handle -// it it. +// it. // - Revise constructor logic so that it is 1) consistent with c++ 17 standard // documented here in (8): // https://en.cppreference.com/w/cpp/utility/optional/optional, and 2) able to diff --git a/c10/util/OptionalArrayRef.h b/c10/util/OptionalArrayRef.h new file mode 100644 index 000000000000..7ca375d7cb78 --- /dev/null +++ b/c10/util/OptionalArrayRef.h @@ -0,0 +1,228 @@ +// This file defines OptionalArrayRef, a class that has almost the same +// exact functionality as c10::optional>, except that its +// converting constructor fixes a dangling pointer issue. +// +// The implicit converting constructor of both c10::optional> and +// std::optional> can cause the underlying ArrayRef to store +// a dangling pointer. OptionalArrayRef prevents this by wrapping +// a c10::optional> and fixing the constructor implementation. +// +// See https://github.com/pytorch/pytorch/issues/63645 for more on this. 
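// A standalone illustration of the dangling-pointer hazard described in the
// OptionalArrayRef header comment above. View is a tiny stand-in for a
// non-owning ArrayRef-like type, not the real c10::ArrayRef: wrapping such a
// view in optional<> and constructing it from a temporary leaves the view
// pointing at freed storage, which is the class of bug OptionalArrayRef's
// constructors are written to avoid.
#include <cstddef>
#include <optional>
#include <vector>

struct View {
  const int* data = nullptr;
  std::size_t size = 0;
  // Implicit converting constructor, analogous to ArrayRef(const std::vector<T>&).
  View(const std::vector<int>& v) : data(v.data()), size(v.size()) {}
};

std::optional<View> make_view() {
  // The temporary vector dies at the end of this full expression, but the View
  // stored inside the optional still points at its (now freed) buffer.
  return std::optional<View>(std::vector<int>{1, 2, 3});
}

int main() {
  auto v = make_view();
  // Reading through v->data here would be undefined behavior (dangling pointer).
  return v.has_value() ? 0 : 1;
}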
+ +#pragma once + +#include +#include + +namespace c10 { + +template +class OptionalArrayRef final { + public: + // Constructors + + constexpr OptionalArrayRef() noexcept {} + + constexpr OptionalArrayRef(nullopt_t) noexcept {} + + OptionalArrayRef(const OptionalArrayRef& other) = default; + + OptionalArrayRef(OptionalArrayRef&& other) = default; + + constexpr OptionalArrayRef(const optional>& other) noexcept + : wrapped_opt_array_ref(other) {} + + constexpr OptionalArrayRef(optional>&& other) noexcept + : wrapped_opt_array_ref(other) {} + + constexpr OptionalArrayRef(const T& value) noexcept + : wrapped_opt_array_ref(value) {} + + template < + typename U = ArrayRef, + std::enable_if_t< + !std::is_same, OptionalArrayRef>::value && + !std::is_same, in_place_t>::value && + std::is_constructible, U&&>::value && + std::is_convertible>::value && + !std::is_convertible::value, + bool> = false> + constexpr OptionalArrayRef(U&& value) noexcept( + std::is_nothrow_constructible, U&&>::value) + : wrapped_opt_array_ref(value) {} + + template < + typename U = ArrayRef, + std::enable_if_t< + !std::is_same, OptionalArrayRef>::value && + !std::is_same, in_place_t>::value && + std::is_constructible, U&&>::value && + !std::is_convertible>::value, + bool> = false> + constexpr explicit OptionalArrayRef(U&& value) noexcept( + std::is_nothrow_constructible, U&&>::value) + : wrapped_opt_array_ref(value) {} + + template + constexpr explicit OptionalArrayRef(in_place_t ip, Args&&... args) noexcept + : wrapped_opt_array_ref(ip, args...) {} + + template + constexpr explicit OptionalArrayRef( + in_place_t ip, + std::initializer_list il, + Args&&... args) + : wrapped_opt_array_ref(ip, il, args...) {} + + // Destructor + + ~OptionalArrayRef() = default; + + // Assignment + + constexpr OptionalArrayRef& operator=(nullopt_t) noexcept { + wrapped_opt_array_ref = c10::nullopt; + return *this; + } + + OptionalArrayRef& operator=(const OptionalArrayRef& other) = default; + + OptionalArrayRef& operator=(OptionalArrayRef&& other) = default; + + constexpr OptionalArrayRef& operator=( + const optional>& other) noexcept { + wrapped_opt_array_ref = other; + return *this; + } + + constexpr OptionalArrayRef& operator=( + optional>&& other) noexcept { + wrapped_opt_array_ref = other; + return *this; + } + + template > + constexpr std::enable_if_t< + !std::is_same, OptionalArrayRef>::value && + std::is_constructible, U&&>::value && + std::is_assignable&, U&&>::value, + OptionalArrayRef&> + operator=(U&& value) noexcept( + std::is_nothrow_constructible, U&&>::value&& + std::is_nothrow_assignable&, U&&>::value) { + wrapped_opt_array_ref = value; + return *this; + } + + // Observers + + constexpr ArrayRef* operator->() noexcept { + return &wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef* operator->() const noexcept { + return &wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef& operator*() & noexcept { + return wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef& operator*() const& noexcept { + return wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef&& operator*() && noexcept { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr const ArrayRef&& operator*() const&& noexcept { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr explicit operator bool() const noexcept { + return wrapped_opt_array_ref.has_value(); + } + + constexpr bool has_value() const noexcept { + return wrapped_opt_array_ref.has_value(); + } + + constexpr ArrayRef& value() & { + return 
wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef& value() const& { + return wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef&& value() && { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr const ArrayRef&& value() const&& { + return std::move(wrapped_opt_array_ref.value()); + } + + template + constexpr std:: + enable_if_t>::value, ArrayRef> + value_or(U&& default_value) const& { + return wrapped_opt_array_ref.value_or(default_value); + } + + template + constexpr std:: + enable_if_t>::value, ArrayRef> + value_or(U&& default_value) && { + return wrapped_opt_array_ref.value_or(default_value); + } + + // Modifiers + + constexpr void swap(OptionalArrayRef& other) noexcept { + std::swap(wrapped_opt_array_ref, other.wrapped_opt_array_ref); + } + + constexpr void reset() noexcept { + wrapped_opt_array_ref.reset(); + } + + template + constexpr std::enable_if_t< + std::is_constructible, Args&&...>::value, + ArrayRef&> + emplace(Args&&... args) noexcept( + std::is_nothrow_constructible, Args&&...>::value) { + return wrapped_opt_array_ref.emplace(args...); + } + + template + constexpr ArrayRef& emplace( + std::initializer_list il, + Args&&... args) noexcept { + return wrapped_opt_array_ref.emplace(il, args...); + } + + private: + optional> wrapped_opt_array_ref; +}; + +using OptionalIntArrayRef = OptionalArrayRef; + +inline bool operator==( + const OptionalIntArrayRef& a1, + const IntArrayRef& other) { + if (!a1.has_value()) { + return false; + } + return a1.value() == other; +} + +inline bool operator==( + const c10::IntArrayRef& a1, + const c10::OptionalIntArrayRef& a2) { + return a2 == a1; +} + +} // namespace c10 diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index 7567ce4add05..1fcc4a1a8f43 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -193,6 +193,8 @@ class SmallVectorTemplateCommon /// Check whether Elt will be invalidated by resizing the vector to NewSize. void assertSafeToReferenceAfterResize(const void* Elt, size_t NewSize) { + (void)Elt; // Suppress unused variable warning + (void)NewSize; // Suppress unused variable warning assert( isSafeToReferenceAfterResize(Elt, NewSize) && "Attempting to reference an element of the vector in an operation " diff --git a/c10/util/Synchronized.h b/c10/util/Synchronized.h new file mode 100644 index 000000000000..1679d7060fe0 --- /dev/null +++ b/c10/util/Synchronized.h @@ -0,0 +1,61 @@ +#pragma once + +#include + +namespace c10 { + +/** + * A very simple Synchronization class for error-free use of data + * in a multi-threaded context. See folly/docs/Synchronized.md for + * the inspiration of this class. + * + * Full URL: + * https://github.com/facebook/folly/blob/main/folly/docs/Synchronized.md + * + * This class implements a small subset of the generic functionality + * implemented by folly:Synchronized. Specifically, only withLock + * is implemeted here since it's the smallest possible API that is + * able to cover a large surface area of functionality offered by + * folly::Synchronized. + */ +template +class Synchronized final { + mutable std::mutex mutex_; + T data_; + + public: + Synchronized() = default; + Synchronized(T const& data) : data_(data) {} + Synchronized(T&& data) : data_(data) {} + + // Don't permit copy construction, move, assignment, or + // move assignment, since the underlying std::mutex + // isn't necessarily copyable/moveable. 
+ Synchronized(Synchronized const&) = delete; + Synchronized(Synchronized&&) = delete; + Synchronized operator=(Synchronized const&) = delete; + Synchronized operator=(Synchronized&&) = delete; + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by reference. Use the protected variable in the + * provided callback safely. + */ + template + typename std::result_of::type withLock(CB cb) { + std::lock_guard guard(this->mutex_); + return cb(this->data_); + } + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by const reference. Use the protected variable in + * the provided callback safely. + */ + template + typename std::result_of::type withLock(CB cb) const { + std::lock_guard guard(this->mutex_); + return cb(this->data_); + } +}; +} // end namespace c10 diff --git a/c10/util/TypeCast.h b/c10/util/TypeCast.h index 86c5c9f62231..1c6a72bab492 100644 --- a/c10/util/TypeCast.h +++ b/c10/util/TypeCast.h @@ -45,7 +45,8 @@ struct static_cast_with_inter_type { C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline dest_t apply( src_t src) { constexpr bool real = needs_real::value; - return static_cast(maybe_real::apply(src)); + auto r = maybe_real::apply(src); + return static_cast(r); } }; @@ -68,6 +69,36 @@ struct static_cast_with_inter_type { } }; +template <> +struct static_cast_with_inter_type, c10::BFloat16> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::BFloat16 src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type, c10::Half> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::Half src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type< + c10::complex, + c10::complex> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::complex src) { + return static_cast>( + static_cast>(src)); + } +}; + // Dynamic type casting utils: // - fetch_and_cast // - cast_and_store @@ -130,7 +161,7 @@ C10_HOST_DEVICE inline dest_t fetch_and_cast( const ScalarType src_type, const void* ptr) { switch (src_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(FETCH_AND_CAST_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(FETCH_AND_CAST_CASE) default: ERROR_UNSUPPORTED_CAST } @@ -149,7 +180,7 @@ C10_HOST_DEVICE inline void cast_and_store( void* ptr, src_t value) { switch (dest_type) { - AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(CAST_AND_STORE_CASE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(CAST_AND_STORE_CASE) default:; } ERROR_UNSUPPORTED_CAST diff --git a/c10/util/TypeSafeSignMath.h b/c10/util/TypeSafeSignMath.h index 155f01f292ba..7eb6d61c122e 100644 --- a/c10/util/TypeSafeSignMath.h +++ b/c10/util/TypeSafeSignMath.h @@ -17,8 +17,8 @@ namespace c10 { /// Returns false since we cannot have x < 0 if x is unsigned. 
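// A minimal usage sketch for the withLock pattern provided by the c10::Synchronized
// class added above. To keep the example self-contained it uses a stripped-down
// local re-implementation (MiniSynchronized, an illustrative name) instead of the
// c10 header; the real class also offers a const overload and deletes copy/move.
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

template <typename T>
class MiniSynchronized {
  mutable std::mutex mutex_;
  T data_;

 public:
  explicit MiniSynchronized(T data) : data_(std::move(data)) {}

  // Run the callback while holding the lock, passing the protected data by reference.
  template <typename CB>
  auto withLock(CB cb) {
    std::lock_guard<std::mutex> guard(mutex_);
    return cb(data_);
  }
};

int main() {
  MiniSynchronized<int> counter(0);
  std::vector<std::thread> threads;
  for (int t = 0; t < 4; ++t) {
    threads.emplace_back([&counter] {
      for (int i = 0; i < 10000; ++i) {
        counter.withLock([](int& v) { ++v; });
      }
    });
  }
  for (auto& t : threads) {
    t.join();
  }
  counter.withLock([](int& v) { std::cout << v << "\n"; });  // prints 40000
}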
template static inline constexpr bool is_negative( - const T& x, - std::true_type is_unsigned) { + const T& /*x*/, + std::true_type /*is_unsigned*/) { return false; } @@ -26,7 +26,7 @@ static inline constexpr bool is_negative( template static inline constexpr bool is_negative( const T& x, - std::false_type is_unsigned) { + std::false_type /*is_unsigned*/) { return x < T(0); } @@ -42,13 +42,15 @@ inline constexpr bool is_negative(const T& x) { /// Returns the sign of an unsigned variable x as 0, 1 template -static inline constexpr int signum(const T& x, std::true_type is_unsigned) { +static inline constexpr int signum(const T& x, std::true_type /*is_unsigned*/) { return T(0) < x; } /// Returns the sign of a signed variable x as -1, 0, 1 template -static inline constexpr int signum(const T& x, std::false_type is_unsigned) { +static inline constexpr int signum( + const T& x, + std::false_type /*is_unsigned*/) { return (T(0) < x) - (x < T(0)); } @@ -68,6 +70,14 @@ inline constexpr bool signs_differ(const T& a, const U& b) { return is_negative(a) != is_negative(b); } +// Suppress sign compare warning when compiling with GCC +// as later does not account for short-circuit rule before +// raising the warning, see https://godbolt.org/z/Tr3Msnz99 +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#endif + /// Returns true if x is greater than the greatest value of the type Limit template inline constexpr bool greater_than_max(const T& x) { @@ -76,12 +86,16 @@ inline constexpr bool greater_than_max(const T& x) { return can_overflow && x > std::numeric_limits::max(); } +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + /// Returns true if x < lowest(Limit). Standard comparison template static inline constexpr bool less_than_lowest( const T& x, - std::false_type limit_is_unsigned, - std::false_type x_is_unsigned) { + std::false_type /*limit_is_unsigned*/, + std::false_type /*x_is_unsigned*/) { return x < std::numeric_limits::lowest(); } @@ -89,9 +103,9 @@ static inline constexpr bool less_than_lowest( /// negative values but x cannot be negative because it is unsigned template static inline constexpr bool less_than_lowest( - const T& x, - std::false_type limit_is_unsigned, - std::true_type x_is_unsigned) { + const T& /*x*/, + std::false_type /*limit_is_unsigned*/, + std::true_type /*x_is_unsigned*/) { return false; } @@ -100,17 +114,17 @@ static inline constexpr bool less_than_lowest( template static inline constexpr bool less_than_lowest( const T& x, - std::true_type limit_is_unsigned, - std::false_type x_is_unsigned) { + std::true_type /*limit_is_unsigned*/, + std::false_type /*x_is_unsigned*/) { return x < T(0); } /// Returns false sign both types are unsigned template static inline constexpr bool less_than_lowest( - const T& x, - std::true_type limit_is_unsigned, - std::true_type x_is_unsigned) { + const T& /*x*/, + std::true_type /*limit_is_unsigned*/, + std::true_type /*x_is_unsigned*/) { return false; } diff --git a/c10/util/accumulate.h b/c10/util/accumulate.h index 086a7977401c..8d0cc49c8ecb 100644 --- a/c10/util/accumulate.h +++ b/c10/util/accumulate.h @@ -82,7 +82,7 @@ template < inline int64_t numelements_from_dim(const int k, const C& dims) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(k >= 0); - if (k > dims.size()) { + if (k > static_cast(dims.size())) { return 1; } else { auto cbegin = dims.cbegin(); diff --git a/c10/util/int128.cpp b/c10/util/int128.cpp index a080e73430b3..f83dba499833 100644 --- a/c10/util/int128.cpp +++ b/c10/util/int128.cpp @@ 
-171,7 +171,7 @@ std::ostream& operator<<(std::ostream& o, const uint128& b) { // Add the requisite padding. std::streamsize width = o.width(0); - if (width > rep.size()) { + if (width > static_cast(rep.size())) { if ((flags & std::ios::adjustfield) == std::ios::left) { rep.append(width - rep.size(), o.fill()); } else { diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index d089ff86eeab..ef256b40ca3b 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -146,14 +146,18 @@ class C10_API intrusive_ptr_target { // intrusive_ptr_target supports copy and move: but refcount and weakcount // don't participate (since they are intrinsic properties of the memory // location) - intrusive_ptr_target(intrusive_ptr_target&& other) noexcept + intrusive_ptr_target(intrusive_ptr_target&& /*other*/) noexcept : intrusive_ptr_target() {} - intrusive_ptr_target& operator=(intrusive_ptr_target&& other) noexcept { + + intrusive_ptr_target& operator=(intrusive_ptr_target&& /*other*/) noexcept { return *this; } - intrusive_ptr_target(const intrusive_ptr_target& other) noexcept + + intrusive_ptr_target(const intrusive_ptr_target& /*other*/) noexcept : intrusive_ptr_target() {} - intrusive_ptr_target& operator=(const intrusive_ptr_target& other) noexcept { + + intrusive_ptr_target& operator=( + const intrusive_ptr_target& /*other*/) noexcept { return *this; } @@ -289,7 +293,6 @@ class intrusive_ptr final { delete target_; } } - target_ = NullType::singleton(); } // raw pointer constructors are not public because we shouldn't make @@ -413,6 +416,7 @@ class intrusive_ptr final { void reset() noexcept { reset_(); + target_ = NullType::singleton(); } void swap(intrusive_ptr& rhs) noexcept { @@ -591,6 +595,20 @@ inline bool operator==( return lhs.get() == rhs.get(); } +template +inline bool operator==( + const intrusive_ptr& lhs, + std::nullptr_t) noexcept { + return lhs.get() == nullptr; +} + +template +inline bool operator==( + std::nullptr_t, + const intrusive_ptr& rhs) noexcept { + return nullptr == rhs.get(); +} + template inline bool operator!=( const intrusive_ptr& lhs, @@ -598,6 +616,19 @@ inline bool operator!=( return !operator==(lhs, rhs); } +template +inline bool operator!=( + const intrusive_ptr& lhs, + std::nullptr_t) noexcept { + return !operator==(lhs, nullptr); +} + +template +inline bool operator!=( + std::nullptr_t, + const intrusive_ptr& rhs) noexcept { + return !operator==(nullptr, rhs); +} template struct MaybeOwnedTraits> { using owned_type = c10::intrusive_ptr; @@ -624,7 +655,7 @@ struct MaybeOwnedTraits> { return &borrow; } - static bool debugBorrowIsValid(const borrow_type& borrow) { + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { return true; } }; diff --git a/c10/util/llvmMathExtras.h b/c10/util/llvmMathExtras.h index 46b3e1e3613d..37b0ab8b6872 100644 --- a/c10/util/llvmMathExtras.h +++ b/c10/util/llvmMathExtras.h @@ -371,7 +371,7 @@ constexpr inline typename std::enable_if<(N < 64), bool>::type isUInt( } template constexpr inline typename std::enable_if= 64, bool>::type isUInt( - uint64_t X) { + uint64_t /*X*/) { return true; } diff --git a/c10/util/safe_numerics.h b/c10/util/safe_numerics.h new file mode 100644 index 000000000000..7eb9ed39395d --- /dev/null +++ b/c10/util/safe_numerics.h @@ -0,0 +1,74 @@ +#pragma once +#include +#include + +#include +#include +#include + +// GCC has __builtin_mul_overflow from before it supported __has_builtin +#ifdef _MSC_VER +#define C10_HAS_BUILTIN_OVERFLOW() (0) +#include +#include +#else +#define 
C10_HAS_BUILTIN_OVERFLOW() (1) +#endif + +namespace c10 { + +C10_ALWAYS_INLINE bool add_overflows(uint64_t a, uint64_t b, uint64_t* out) { +#if C10_HAS_BUILTIN_OVERFLOW() + return __builtin_add_overflow(a, b, out); +#else + unsigned long long tmp; + auto carry = _addcarry_u64(0, a, b, &tmp); + *out = tmp; + return carry; +#endif +} + +C10_ALWAYS_INLINE bool mul_overflows(uint64_t a, uint64_t b, uint64_t* out) { +#if C10_HAS_BUILTIN_OVERFLOW() + return __builtin_mul_overflow(a, b, out); +#else + *out = a * b; + // This test isnt exact, but avoids doing integer division + return ( + (c10::llvm::countLeadingZeros(a) + c10::llvm::countLeadingZeros(b)) < 64); +#endif +} + +template +bool safe_multiplies_u64(It first, It last, uint64_t* out) { +#if C10_HAS_BUILTIN_OVERFLOW() + uint64_t prod = 1; + bool overflow = false; + for (; first != last; ++first) { + overflow |= c10::mul_overflows(prod, *first, &prod); + } + *out = prod; + return overflow; +#else + uint64_t prod = 1; + uint64_t prod_log2 = 0; + bool is_zero = false; + for (; first != last; ++first) { + auto x = static_cast(*first); + prod *= x; + // log2(0) isn't valid, so need to track it specially + is_zero |= (x == 0); + prod_log2 += c10::llvm::Log2_64_Ceil(x); + } + *out = prod; + // This test isnt exact, but avoids doing integer division + return !is_zero && (prod_log2 >= 64); +#endif +} + +template +bool safe_multiplies_u64(const Container& c, uint64_t* out) { + return safe_multiplies_u64(c.begin(), c.end(), out); +} + +} // namespace c10 diff --git a/c10/util/strides.h b/c10/util/strides.h new file mode 100644 index 000000000000..40315a625c61 --- /dev/null +++ b/c10/util/strides.h @@ -0,0 +1,27 @@ +#pragma once +#include +#include + +namespace c10 { + +// Computes the contiguous strides of a tensor, given its sizes. +static inline DimVector contiguous_strides(const IntArrayRef sizes) { + using Int = IntArrayRef::value_type; + const Int dims = static_cast(sizes.size()); + + DimVector strides; + + if (dims > 0) { + strides.assign(dims, 0); + // Start by populating the last dimension: its strides is always 1. + strides[dims - 1] = 1; + for (auto i = dims - 2; i >= 0; --i) { + // Strides can't be 0 even if sizes are 0. 
+ strides[i] = strides[i + 1] * std::max(sizes[i + 1], Int{1}); + } + } + + return strides; +} + +} // namespace c10 diff --git a/c10/util/variant.h b/c10/util/variant.h index 421efdf6f870..6b22116e685f 100644 --- a/c10/util/variant.h +++ b/c10/util/variant.h @@ -280,7 +280,21 @@ namespace std { #define C10_MPARK_BUILTIN_UNREACHABLE #endif -#if __has_builtin(__type_pack_element) +// NOTE [nvcc bug workaround] +// +// The original line `typename Front = lib::type_pack_element_t<0, Ts...>,` +// throws the following compiler error on nvcc: +// ``` +// c10/util/variant.h(2367): error: parameter pack "Ts" was referenced but not +// expanded +// ``` +// As a workaround, we skip defining C10_MPARK_TYPE_PACK_ELEMENT for nvcc +// compiler +// +// See the following issues for more context: +// https://github.com/pytorch/extension-cpp/issues/58 +// https://github.com/mpark/variant/issues/77 +#if __has_builtin(__type_pack_element) && !defined(__CUDACC__) #define C10_MPARK_TYPE_PACK_ELEMENT #endif diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 02949d50960a..4e9a90ef944d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -63,7 +63,7 @@ if(INTERN_BUILD_ATEN_OPS) set(CMAKE_POSITION_INDEPENDENT_CODE ${__caffe2_CMAKE_POSITION_INDEPENDENT_CODE}) # Generate the headers wrapped by our operator - file(GLOB_RECURSE all_python "${PROJECT_SOURCE_DIR}/tools/codegen/*.py") + file(GLOB_RECURSE all_python "${PROJECT_SOURCE_DIR}/torchgen/*.py") add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py @@ -89,8 +89,10 @@ if(INTERN_BUILD_ATEN_OPS) list(APPEND Caffe2_GPU_CU_SRCS ${ATen_CUDA_CU_SRCS}) list(APPEND Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS}) + list(APPEND Caffe2_MPS_SRCS ${ATen_MPS_SRCS}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) + list(APPEND Caffe2_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS}) list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) list(APPEND Caffe2_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS}) list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) @@ -230,6 +232,11 @@ if(PRINT_CMAKE_DEBUG_INFO) message(STATUS " " ${tmp}) endforeach() + message(STATUS "MPS sources: ") + foreach(tmp ${Caffe2_MPS_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + message(STATUS "HIP test sources: ") foreach(tmp ${Caffe2_HIP_TEST_SRCS}) message(STATUS " " ${tmp}) @@ -240,6 +247,11 @@ if(PRINT_CMAKE_DEBUG_INFO) message(STATUS " " ${tmp}) endforeach() + message(STATUS "ATen MPS test sources: ") + foreach(tmp ${ATen_MPS_TEST_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + message(STATUS "ATen CUDA test sources: ") foreach(tmp ${ATen_CUDA_TEST_SRCS}) message(STATUS " " ${tmp}) @@ -350,6 +362,13 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_0.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_1.cpp" ) + if(BUILD_LAZY_TS_BACKEND) + list(APPEND GENERATED_CXX_TORCH + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.cpp" + "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterAutogradLazy.cpp" + "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterLazy.cpp" + ) + endif() endif() set(GENERATED_H_TORCH @@ -360,6 +379,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(NOT INTERN_DISABLE_AUTOGRAD) list(APPEND GENERATED_H_TORCH 
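// A standalone sketch of the overflow-checked product that safe_multiplies_u64
// above computes for numel-style size calculations. It relies on the GCC/Clang
// __builtin_mul_overflow intrinsic (the MSVC branch in the diff uses a
// leading-zero estimate instead); checked_product is an illustrative name.
#include <cstdint>
#include <iostream>
#include <vector>

bool checked_product(const std::vector<std::uint64_t>& xs, std::uint64_t* out) {
  std::uint64_t prod = 1;
  bool overflow = false;
  for (std::uint64_t x : xs) {
    // Returns true if prod * x does not fit in 64 bits; prod still receives the
    // wrapped result, so we accumulate the flag rather than exiting early.
    overflow |= __builtin_mul_overflow(prod, x, &prod);
  }
  *out = prod;
  return overflow;
}

int main() {
  std::uint64_t result = 0;
  const std::vector<std::uint64_t> sizes = {1u << 20, 1u << 20, 1u << 30};  // 2^70
  std::cout << "overflowed: " << checked_product(sizes, &result) << "\n";  // prints 1
}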
"${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyIr.h" + "${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.h" ) endif() @@ -397,18 +418,33 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${GENERATED_TESTING_PYTHON} ) + set(GEN_PER_OPERATOR_FLAG) + if(USE_PER_OPERATOR_HEADERS) + list(APPEND GEN_PER_OPERATOR_FLAG "--per_operator_headers") + endif() + add_custom_command( OUTPUT ${TORCH_GENERATED_CODE} COMMAND "${PYTHON_EXECUTABLE}" tools/setup_helpers/generate_code.py --native-functions-path "aten/src/ATen/native/native_functions.yaml" - --nn-path "aten/src" + --tags-path "aten/src/ATen/native/tags.yaml" $<$:--disable-autograd> $<$:--selected-op-list-path="${SELECTED_OP_LIST}"> --force_schema_registration + --gen_lazy_ts_backend + ${GEN_PER_OPERATOR_FLAG} DEPENDS "${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/tags.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/ts_native_functions.yaml" + "${TORCH_ROOT}/torch/csrc/lazy/core/shape_inference.h" + "${TORCH_ROOT}/torch/csrc/lazy/ts_backend/ts_native_functions.cpp" + "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.h" + "${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp" + "${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h" + "${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp" "${TOOLS_PATH}/autograd/templates/VariableType.h" "${TOOLS_PATH}/autograd/templates/VariableType.cpp" "${TOOLS_PATH}/autograd/templates/ADInplaceOrViewType.cpp" @@ -436,6 +472,10 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TOOLS_PATH}/autograd/gen_variable_type.py" "${TOOLS_PATH}/autograd/gen_inplace_or_view_type.py" "${TOOLS_PATH}/autograd/load_derivatives.py" + "${TORCH_ROOT}/torchgen/gen_backend_stubs.py" + "${TORCH_ROOT}/torchgen/gen_lazy_tensor.py" + "${TORCH_ROOT}/torchgen/api/lazy.py" + "${TORCH_ROOT}/torchgen/dest/lazy_ir.py" WORKING_DIRECTORY "${TORCH_ROOT}") @@ -475,7 +515,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) else() append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS) - + if(BUILD_LAZY_TS_BACKEND) + append_filelist("lazy_tensor_ts_sources" LIBTORCH_CMAKE_SRCS) + endif() if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") # TODO: Delete this line once https://github.com/pytorch/pytorch/pull/55889 lands set_source_files_properties(../torch/csrc/jit/serialization/export.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) @@ -505,6 +547,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/cpp/context.cpp ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm + ${TORCH_SRC_DIR}/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm ) list(APPEND TORCH_SRCS ${COREML_DELEGATE_SRCS}) endif() @@ -568,6 +611,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/serialization/export_bytecode.cpp ${TORCH_SRC_DIR}/csrc/jit/serialization/export_module.cpp ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer.cpp + ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp ${TORCH_SRC_DIR}/csrc/jit/api/module_save.cpp ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp @@ -625,8 +669,20 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) 
set_source_files_properties(${TORCH_SRC_DIR}/csrc/jit/passes/frozen_conv_add_relu_fusion.cpp PROPERTIES COMPILE_FLAGS "-DUSE_CUDA=1") endif() - if(USE_MLCOMPUTE) - include(../mlc/mlc_build.cmake) + if(BUILD_ONEDNN_GRAPH) + list(APPEND Caffe2_CPU_SRCS + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_fuser.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_rewriter.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/graph_helper.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/register_interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/interface.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/kernel.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/defer_size_check.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/layout_propagation.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/prepare_binary.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/onednn/guard_shape.cpp + ) endif() if(USE_ROCM) @@ -709,14 +765,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/api/src/optim/schedulers/step_lr.cpp ${TORCH_SRC_DIR}/csrc/api/src/serialize/input-archive.cpp ${TORCH_SRC_DIR}/csrc/api/src/serialize/output-archive.cpp - ${TORCH_SRC_DIR}/csrc/utils/crash_handler.cpp ) endif() list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS}) endif() -# NOTE [ Linking AVX-n and non-AVX-n files ] +if(USE_MPS) + list(APPEND Caffe2_CPU_SRCS ${Caffe2_MPS_SRCS}) +endif() + +# NOTE [ Linking AVX and non-AVX files ] # # Regardless of the CPU capabilities, we build some files with AVX2, and AVX512 # instruction set. If the host CPU doesn't support those, we simply ignore their @@ -764,6 +823,10 @@ if(HAVE_SOVERSION) VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) endif() torch_compile_options(torch_cpu) # see cmake/public/utils.cmake +if(HAS_WERROR_SIGN_COMPARE AND WERROR) + # target_compile_options(torch_cpu PRIVATE "-Werror=sign-compare") + set_property(SOURCE ${ATen_CORE_SRCS} ${ATen_CPU_SRCS} APPEND PROPERTY COMPILE_OPTIONS "-Werror=sign-compare") +endif() set_property(SOURCE ${ATen_CORE_SRCS} APPEND PROPERTY COMPILE_DEFINITIONS "TORCH_ASSERT_ONLY_METHOD_OPERATORS") @@ -901,6 +964,33 @@ elseif(USE_CUDA) target_link_libraries(torch_cuda PRIVATE __caffe2_nccl) target_compile_definitions(torch_cuda PRIVATE USE_NCCL) endif() + if(BUILD_LAZY_CUDA_LINALG) + add_library(torch_cuda_linalg ${ATen_CUDA_LINALG_SRCS}) + target_compile_definitions(torch_cuda_linalg PRIVATE USE_CUDA BUILD_LAZY_CUDA_LINALG) + # Library order is important during static linking + # `torch::magma` should be mentioned before other CUDA + # to transitively include all symbols present in torch_cuda/torch_cpu + if(USE_MAGMA) + target_link_libraries(torch_cuda_linalg PRIVATE torch::magma) + # CUDAHooks reports version of MAGMA PyTorch was compiled against, i.e. needs to be able to include magma headers + get_target_property(HOOKS_INCLUDE_DIRECTORIES torch_cuda INCLUDE_DIRECTORIES) + if(NOT "${MAGMA_INCLUDE_DIR}" IN_LIST HOOKS_INCLUDE_DIRECTORIES) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/CUDAHooks.cpp PROPERTIES INCLUDE_DIRECTORIES "${MAGMA_INCLUDE_DIR}") + endif() + endif() + target_link_libraries(torch_cuda_linalg PRIVATE + torch_cpu + torch_cuda + ${CUDA_cusolver_LIBRARY} + ) + # NS: TODO, is this really necessary? 
+ if(USE_MAGMA AND CAFFE2_STATIC_LINK_CUDA) + target_link_libraries(torch_cuda_linalg PRIVATE + "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl) + endif() + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp PROPERTIES COMPILE_FLAGS "-DBUILD_LAZY_CUDA_LINALG") + install(TARGETS torch_cuda_linalg DESTINATION "${TORCH_INSTALL_LIB_DIR}") + endif() if(USE_PRECOMPILED_HEADERS) if(BUILD_SPLIT_CUDA) @@ -914,59 +1004,7 @@ elseif(USE_CUDA) endif() if(USE_CUDA OR USE_ROCM) - if(BUILD_SPLIT_CUDA) - set(TORCHLIB_FLAVOR torch_cuda_cu) # chose torch_cuda_cu here since JIT is in torch_cuda_cpp - elseif(USE_CUDA) - set(TORCHLIB_FLAVOR torch_cuda) - elseif(USE_ROCM) - set(TORCHLIB_FLAVOR torch_hip) - endif() - - # The list of NVFUSER runtime files - list(APPEND NVFUSER_RUNTIME_FILES - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_sync.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/index_utils.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu - ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/warp.cu - ${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh - ${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/UnpackRaw.cuh - ) - - file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources") - - # "stringify" NVFUSER runtime sources - # (generate C++ header files embedding the original input as a string literal) - set(NVFUSER_STRINGIFY_TOOL "${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tools/stringify_file.py") - foreach(src ${NVFUSER_RUNTIME_FILES}) - get_filename_component(filename ${src} NAME_WE) - set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h") - add_custom_command( - COMMENT "Stringify NVFUSER runtime source file" - OUTPUT ${dst} - DEPENDS ${src} - COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst} - ) - add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst}) - add_dependencies(${TORCHLIB_FLAVOR} nvfuser_rt_${filename}) - - # also generate the resource headers during the configuration step - # (so tools like clang-tidy can run w/o requiring a real build) - execute_process(COMMAND - ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}) - endforeach() - - target_include_directories(${TORCHLIB_FLAVOR} PRIVATE "${CMAKE_BINARY_DIR}/include") + include(${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/nvfuser.cmake) endif() if(NOT MSVC AND USE_XNNPACK) @@ -1061,7 +1099,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/QuantizedLinear.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/RNN.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) 
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) - set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/quantized/qlinear_unpack.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations) endif() if(USE_TBB) @@ -1069,10 +1107,9 @@ if(USE_TBB) target_link_libraries(torch_cpu PUBLIC TBB::tbb) endif() -if(USE_BREAKPAD) - target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER) - target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party ${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad/src) - target_link_libraries(torch_cpu PRIVATE breakpad) +if(BUILD_CAFFE2 AND BUILD_CAFFE2_OPS AND USE_FBGEMM) + # FIXME: quantization/server/conv_dnnlowp_op.cc depends on fbgemm/src/RefImplementations.h + target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party) endif() target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE}) @@ -1091,10 +1128,10 @@ endif() install(DIRECTORY "${TORCH_SRC_DIR}/csrc" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch - FILES_MATCHING PATTERN "*.h") + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") install(DIRECTORY "${TORCH_SRC_DIR}/csrc/distributed/c10d" DESTINATION ${TORCH_INSTALL_INCLUDE_DIR} - FILES_MATCHING PATTERN "*.hpp") + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp") install(FILES "${TORCH_SRC_DIR}/script.h" "${TORCH_SRC_DIR}/extension.h" @@ -1108,13 +1145,16 @@ endif() DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) endif() - if(BUILD_TEST) if(BUILD_LITE_INTERPRETER) add_subdirectory( ${TORCH_ROOT}/test/cpp/lite_interpreter_runtime ${CMAKE_BINARY_DIR}/test_lite_interpreter_runtime ) + add_subdirectory( + ${TORCH_ROOT}/test/mobile/lightweight_dispatch + ${CMAKE_BINARY_DIR}/test_codegen_unboxing + ) else() add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory( @@ -1296,8 +1336,14 @@ if(USE_DISTRIBUTED) else() if(BUILD_SPLIT_CUDA) target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL) + if(USE_NCCL_WITH_UCC) + target_compile_definitions(torch_cuda_cpp PUBLIC USE_NCCL_WITH_UCC) + endif() else() target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL) + if(USE_NCCL_WITH_UCC) + target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC) + endif() endif() endif() endif() @@ -1526,9 +1572,6 @@ if(USE_CUDA) elseif(USE_ROCM) target_link_libraries(torch PUBLIC torch_hip_library) endif() -if(USE_MLCOMPUTE) - target_link_libraries(torch PUBLIC torch_mlc_library) -endif() if(PRINT_CMAKE_DEBUG_INFO) print_target_properties(torch) @@ -1784,6 +1827,25 @@ if(BUILD_TEST) endif() endforeach() + if(USE_MPS) + foreach(test_src ${Caffe2_MPS_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} torch_library gtest_main) + target_include_directories(${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + add_test(NAME ${test_name} COMMAND $) + if(INSTALL_TEST) + install(TARGETS ${test_name} DESTINATION test) + # Install PDB files for MSVC builds + if(MSVC AND BUILD_SHARED_LIBS) + install(FILES $ DESTINATION test OPTIONAL) + endif() + endif() + endforeach() + endif() + 
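// A minimal sketch of the kind of standalone gtest binary that the
// Caffe2_MPS_TEST_SRCS loop above registers: each test source becomes its own
// executable linked against torch_library and gtest_main, which supplies main().
// The test suite name and assertion below are purely illustrative, not actual
// MPS tests from this changeset.
#include <gtest/gtest.h>

TEST(ExampleMPSSmokeTest, BasicArithmetic) {
  EXPECT_EQ(2 + 2, 4);
}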
if(USE_CUDA) foreach(test_src ${Caffe2_GPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) @@ -1926,6 +1988,8 @@ if(BUILD_PYTHON) # ---[ Python. if(BUILD_CAFFE2) add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS}) + target_compile_definitions(torch PRIVATE BUILD_CAFFE2) + target_compile_definitions(torch_python PRIVATE BUILD_CAFFE2) if(USE_NUMPY) target_compile_options(caffe2_pybind11_state PRIVATE "-DUSE_NUMPY") target_link_libraries(caffe2_pybind11_state PRIVATE numpy::numpy) diff --git a/caffe2/__init__.py b/caffe2/__init__.py index e69de29bb2d1..4096a9828385 100644 --- a/caffe2/__init__.py +++ b/caffe2/__init__.py @@ -0,0 +1,6 @@ +import warnings +from torch.onnx import _CAFFE2_ATEN_FALLBACK + +if not _CAFFE2_ATEN_FALLBACK: + warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. " + "Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.") diff --git a/caffe2/c2_aten_srcs.bzl b/caffe2/c2_aten_srcs.bzl deleted file mode 100644 index 7755de9ccc13..000000000000 --- a/caffe2/c2_aten_srcs.bzl +++ /dev/null @@ -1,12 +0,0 @@ -ATEN_CORE_HEADER_FILES = [ - # "aten/src/" prefix is added later - "ATen/core/ATenGeneral.h", - "ATen/core/blob.h", - "ATen/core/DimVector.h", - "ATen/core/grad_mode.h", - "ATen/core/UndefinedTensorImpl.h", -] - -ATEN_CORE_SRC_FILES = [ - "aten/src/ATen/core/VariableFallbackKernel.cpp", -] diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md index 593079ef1393..79a4276a65f8 100644 --- a/caffe2/contrib/aten/README.md +++ b/caffe2/contrib/aten/README.md @@ -72,7 +72,7 @@ class Add(torch.autograd.Function): @staticmethod def symbolic(g, a, b): - return g.op("ATen", a, b, operator_s = "add") + return g.at("add", a, b) @staticmethod def forward(ctx, a, b): diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index a5d1ea40e27a..b22b840c25ad 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -179,8 +179,9 @@ class ATenOp : public Operator { std::vector attrs; for (const auto i : c10::irange(operator_def.arg_size())) { auto & attr = operator_def.arg(i); - if(attr.name() == "operator" || attr.name() == "type" ) + if (attr.name() == "operator" || attr.name() == "type" || attr.name() == "overload_name") { continue; + } attrs.push_back(attr.name()); } std::sort(attrs.begin(), attrs.end()); diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py index 4a025c3b1802..6574884245f8 100644 --- a/caffe2/contrib/aten/aten_test.py +++ b/caffe2/contrib/aten/aten_test.py @@ -1,9 +1,4 @@ - - - - - -from caffe2.python import core, dyndep +from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/contrib/aten/docs/sample.py b/caffe2/contrib/aten/docs/sample.py index 53ce19b86e89..6896f2379d8c 100644 --- a/caffe2/contrib/aten/docs/sample.py +++ b/caffe2/contrib/aten/docs/sample.py @@ -38,8 +38,8 @@ def forward(self, x, y): # graph(%input : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu), # %y : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu)): # %2 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = onnx::Relu(%input) -# %3 : Tensor = onnx::ATen[operator="mul"](%2, %2) -# %4 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = onnx::ATen[operator="add"](%3, %y) +# %3 : Tensor = aten::ATen[operator="mul"](%2, %2) +# %4 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = 
aten::ATen[operator="add"](%3, %y) # return (%4) graph = onnx.load(f.name) diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 93d4bad29f92..55f1faba2750 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -37,9 +37,9 @@ raise ValueError('aten_root ({}) does not exist'.format( args.aten_root)) sys.path.insert(0, os.path.join(args.aten_root, '..')) - from tools.codegen.code_template import CodeTemplate as CT + from torchgen.code_template import CodeTemplate as CT else: - from tools.codegen.code_template import CodeTemplate as CT + from torchgen.code_template import CodeTemplate as CT OP_TEMPLATE = CT.from_file( os.path.join(args.template_dir, 'aten_op_template.h')) diff --git a/caffe2/contrib/shm_mutex/shm_mutex.h b/caffe2/contrib/shm_mutex/shm_mutex.h index f2dc4ff97e8b..8f0293caf5f3 100644 --- a/caffe2/contrib/shm_mutex/shm_mutex.h +++ b/caffe2/contrib/shm_mutex/shm_mutex.h @@ -58,7 +58,7 @@ class ShmProcessMutexCheck { template struct shm_traits; -using ShmBaseHeader = struct { +struct ShmBaseHeader { std::atomic isInitialized; std::atomic countMapped; std::atomic owner; diff --git a/caffe2/core/blob_test.cc b/caffe2/core/blob_test.cc index 2249c3bcbf2a..a7e3a8d27e23 100644 --- a/caffe2/core/blob_test.cc +++ b/caffe2/core/blob_test.cc @@ -1264,7 +1264,7 @@ void TestDataType( std::string dataTypeName) { LOG(INFO) << dataTypeName; FLAGS_caffe2_serialize_using_bytes_as_holder = true; - size_t numEl = 1000; + int numEl = 1000; // Proto with int32 auto protoInt32 = CreateProtoWithInt32Data(dataType, numEl, false); caffe2::Blob blobInt32; diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h index 66ffdf21a108..82da29a44f4b 100644 --- a/caffe2/core/export_caffe2_op_to_c10.h +++ b/caffe2/core/export_caffe2_op_to_c10.h @@ -4,12 +4,13 @@ #if defined(EXPOSE_C2_OPS) || \ !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) +#include #include #include #include -#include #include #include +#include #include #include @@ -113,7 +114,9 @@ void call_caffe2_op_from_c10( _call_caffe2_op_from_c10(stack, Schema(), &_call_caffe2_op); } -inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { +inline FunctionSchema make_function_schema_for_c10( + const char* schema_str, + c10::optional optional_alias_analysis_kind) { #if !defined(EXPOSE_C2_OPS) && \ (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE)) throw std::logic_error( @@ -127,13 +130,17 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { nullopt, IValue()); - return FunctionSchema( + auto schema = FunctionSchema( parsed_schema.name(), parsed_schema.overload_name(), std::move(arguments), parsed_schema.returns(), parsed_schema.is_vararg(), parsed_schema.is_varret()); + if (optional_alias_analysis_kind) { + schema.setAliasAnalysis(*optional_alias_analysis_kind); + } + return schema; #endif } @@ -169,7 +176,7 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { * caffe2. * - all operators must call C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 and * C10_EXPORT_CAFFE2_OP_TO_C10_CPU . - * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted i f + * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted if * you don't want to expose the operator for CUDA operations. * - caffe2 arguments must come after caffe2 inputs, in other words, any tensor * inputs must precede any non-tensor inputs. 
@@ -178,73 +185,85 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { * - If your operator has a variable number of input tensors, make the first (!) * input an input of type TensorList. There must be no other tensor inputs. */ -#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) \ - namespace caffe2 { \ - namespace _c10_ops { \ +#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) \ + namespace caffe2 { \ + namespace _c10_ops { \ TORCH_API const FunctionSchema& schema_##OperatorName(); \ - } \ + } \ } -#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) \ - /* Register the op schema with the c10 dispatcher */ \ - namespace caffe2 { \ - namespace _c10_ops { \ - C10_EXPORT const FunctionSchema& schema_##OperatorName() { \ - static const FunctionSchema schema = \ - ::caffe2::detail::make_function_schema_for_c10(OperatorSchema); \ - return schema; \ - } \ - TORCH_LIBRARY_FRAGMENT(_caffe2, m) { \ - m.def(::caffe2::detail::make_function_schema_for_c10(OperatorSchema)); \ - } \ - } \ +#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, OptionalAliasAnalysisKind) \ + /* Register the op schema with the c10 dispatcher */ \ + namespace caffe2 { \ + namespace _c10_ops { \ + C10_EXPORT const FunctionSchema& schema_##OperatorName() { \ + static const FunctionSchema schema = \ + ::caffe2::detail::make_function_schema_for_c10( \ + OperatorSchema, OptionalAliasAnalysisKind); \ + return schema; \ + } \ + TORCH_LIBRARY_FRAGMENT(_caffe2, m) { \ + m.def(::caffe2::detail::make_function_schema_for_c10( \ + OperatorSchema, OptionalAliasAnalysisKind)); \ + } \ + } \ } #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY( \ OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ - TORCH_LIBRARY_IMPL(_caffe2, CPU, m) { \ - m.impl("_caffe2::" #OperatorName, \ - torch::CppFunction::makeFromBoxedFunction< \ - ::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>>()); \ - } + TORCH_LIBRARY_IMPL(_caffe2, CPU, m) { \ + m.impl( \ + "_caffe2::" #OperatorName, \ + torch::CppFunction::makeFromBoxedFunction< \ + ::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>>()); \ + } + +#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \ + OperatorName, OperatorSchema, OperatorClass) \ + C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, c10::nullopt) \ + C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass) -#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \ - OperatorName, OperatorSchema, OperatorClass) \ - C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) \ +#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_WITH_ALIAS_ANALYSIS( \ + OperatorName, OperatorSchema, OperatorClass, OptionalAliasAnalysisKind) \ + C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, OptionalAliasAnalysisKind) \ C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass) #define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ - TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) { \ - m.impl("_caffe2::" #OperatorName, \ - torch::CppFunction::makeFromBoxedFunction< \ - ::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>>()); \ - } - + TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) { \ + m.impl( \ + "_caffe2::" 
#OperatorName, \ + torch::CppFunction::makeFromBoxedFunction< \ + ::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>>()); \ + } // You should never manually call the C10_EXPORT_CAFFE2_OP_TO_C10_HIP macro . // The C10_EXPORT_CAFFE2_OP_TO_C10_CUDA macro from above will be automatically // rewritten to C10_EXPORT_CAFFE2_OP_TO_C10_HIP by hipify . #define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass) \ /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \ - TORCH_LIBRARY_IMPL(_caffe2, HIP, m) { \ - m.impl("_caffe2::" #OperatorName, \ - torch::CppFunction::makeFromBoxedFunction< \ - ::caffe2::detail::call_caffe2_op_from_c10< \ - ::caffe2::_c10_ops::schema_##OperatorName, \ - OperatorClass>>()); \ - } - + TORCH_LIBRARY_IMPL(_caffe2, HIP, m) { \ + m.impl( \ + "_caffe2::" #OperatorName, \ + torch::CppFunction::makeFromBoxedFunction< \ + ::caffe2::detail::call_caffe2_op_from_c10< \ + ::caffe2::_c10_ops::schema_##OperatorName, \ + OperatorClass>>()); \ + } #else // Don't use c10 dispatcher on mobile because of binary size #define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) -#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) +#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \ + OperatorName, OperatorSchema, OptionalAliasAnalysisKind) #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass) #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \ OperatorName, OperatorSchema, OperatorClass) diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index a34da6918bcd..7dc9c59f82f6 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -60,8 +60,7 @@ class C10_EXPORT QTensor { void Resize(at::ArrayRef dim_source) { if (dims_ != dim_source) { const auto source_size = c10::multiply_integers(dim_source); - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if ((source_size * (precision_ + signed_)) > capacity_) { + if (static_cast(source_size * (precision_ + signed_)) > capacity_) { data_ptr_.clear(); capacity_ = 0; } @@ -188,7 +187,7 @@ class C10_EXPORT QTensor { * Returns the i-th dimension of the qtensor in int. 
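The `C10_EXPORT_CAFFE2_OP_TO_C10_*` macro block above registers each exported Caffe2 operator with the c10 dispatcher under the `_caffe2` namespace (now with an optional alias-analysis kind on the schema-only variant). A short sketch of how such a registration surfaces in Python, assuming a build with `BUILD_CAFFE2=1` so the `TORCH_LIBRARY_FRAGMENT(_caffe2, ...)` registrations actually run:

```python
import torch

# Exported ops live under the `_caffe2` namespace; attribute access resolves the
# registered schema (here "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor") without
# running the kernel.
copy_gpu_to_cpu = torch.ops._caffe2.CopyGPUToCPU
print(copy_gpu_to_cpu)  # resolved op object for the schema registered above
```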
*/ inline int dim32(const int i) const { - DCHECK_LT(i, dims_.size()) << "Exceeding ndim limit " << dims_.size(); + DCHECK_LT(i, static_cast(dims_.size())) << "Exceeding ndim limit " << dims_.size(); DCHECK_GE(i, 0) << "Cannot have negative index"; CAFFE_ENFORCE_LT(dims_[i], std::numeric_limits::max()); return static_cast(dims_[i]); diff --git a/caffe2/core/serialization_test.cc b/caffe2/core/serialization_test.cc index 1912802d2ac8..902a3e01e677 100644 --- a/caffe2/core/serialization_test.cc +++ b/caffe2/core/serialization_test.cc @@ -69,7 +69,7 @@ TEST(TensorSerialization, TestUnknownDType) { auto* blobTensor = BlobGetMutableTensor(&blob, CPU); blobTensor->Resize(kTestTensorSize, 1); auto *tensorData = blobTensor->mutable_data(); - for (int n = 0; n < kTestTensorSize; ++n) { + for (unsigned n = 0; n < kTestTensorSize; ++n) { tensorData[n] = n; } auto data = SerializeBlob(blob, "test_blob"); @@ -85,7 +85,7 @@ TEST(TensorSerialization, TestUnknownDType) { EXPECT_EQ(kTestTensorSize, tensor.numel()); EXPECT_EQ(TypeMeta::Make(), tensor.dtype()); const auto* tensor_data = tensor.template data(); - for (int i = 0; i < kTestTensorSize; ++i) { + for (unsigned i = 0; i < kTestTensorSize; ++i) { EXPECT_EQ(static_cast(i), tensor_data[i]); } diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index 26fbdbe4a753..f04e37acfbe9 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -348,6 +348,7 @@ _ScopeGuard ScopeGuard(T f) { stats.field.groupName.c_str(), \ __caffe_event_value_, \ ##__VA_ARGS__); \ + (void)__caffe_event_value_; \ } #define CAFFE_DURATION(stats, field, ...) \ diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 7f2f93de53fc..de7d31fd7614 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -4,6 +4,7 @@ #include #include "caffe2/core/storage.h" +#include #include #include #include @@ -428,6 +429,11 @@ class TORCH_API Tensor final { return impl_.get()->sizes(); } + inline c10::SymIntArrayRef sym_sizes() const { + auto sizes = impl_.get()->sizes(); + return c10::SymIntArrayRef(reinterpret_cast(sizes.data()), sizes.size()); + } + inline int64_t size_from_dim(int k) const { return size_from_dim_(k, impl_->sizes()); } diff --git a/caffe2/core/transform_test.cc b/caffe2/core/transform_test.cc index adb7ecae050b..0dc6ba92c7f9 100644 --- a/caffe2/core/transform_test.cc +++ b/caffe2/core/transform_test.cc @@ -55,7 +55,7 @@ class DummyTransform : public Transform { return false; } // which index are we trying to append the new node to? - int pattern_idx = subgraph.size(); + auto pattern_idx = subgraph.size(); // type doesn't match if (g.node(idx).op.type() != pattern_chain[pattern_idx]) { return false; diff --git a/caffe2/ideep/operators/order_switch_ops.cc b/caffe2/ideep/operators/order_switch_ops.cc index 0b682c9af83f..7b8319b6c1bf 100644 --- a/caffe2/ideep/operators/order_switch_ops.cc +++ b/caffe2/ideep/operators/order_switch_ops.cc @@ -22,6 +22,10 @@ class IDEEPNHWC2NCHWOp final : public IDEEPOperator { // Thus, for iDEEP tensor, the shapes of NCHW and NHWC are identical. Y->init({X.get_dims(), X.get_data_type(), iformat::nchw}); Y->feed_from(X); + // NOTE: This ops is only used to quantization path, setting scale + // to distinguish with fp32 path activation(always return NCHW format + // even ideep tensor has NHWC format) when convert to numpy memory. + Y->set_scale({1.0}); return true; } @@ -48,6 +52,10 @@ class IDEEPNCHW2NHWCOp final : public IDEEPOperator { // Thus, for iDEEP tensor, the shapes of NCHW and NHWC are identical. 
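The iDEEP order-switch ops above now call `set_scale({1.0})` to mark quantization-path activations, while plain FP32 activations are handed back to numpy in the default NCHW layout (see the `CopyIDEEPToCPU` and `IDeepFetcher` hunks later in this patch). The reorder itself is just a layout permutation; a tiny numpy illustration with made-up shapes:

```python
import numpy as np

# A 2x8x8x3 activation stored channels-last (NHWC)...
x_nhwc = np.random.rand(2, 8, 8, 3).astype(np.float32)
# ...permuted to the default NCHW layout, which is what the FP32 fetch path now
# produces before exposing the data to numpy.
x_nchw = np.ascontiguousarray(x_nhwc.transpose(0, 3, 1, 2))
assert x_nchw.shape == (2, 3, 8, 8)
```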
Y->init({X.get_dims(), X.get_data_type(), iformat::nhwc}); Y->feed_from(X); + // NOTE: This ops is only used to quantization path, setting scale + // to distinguish with fp32 path activation(always return NCHW format + // even ideep tensor has NHWC format) when convert to numpy memory. + Y->set_scale({1.0}); return true; } diff --git a/caffe2/ideep/operators/utility_ops.cc b/caffe2/ideep/operators/utility_ops.cc index d82fe0aed866..9477666bcc5c 100644 --- a/caffe2/ideep/operators/utility_ops.cc +++ b/caffe2/ideep/operators/utility_ops.cc @@ -1,4 +1,4 @@ -#include "caffe2/operators/utility_ops.h" + #include "caffe2/operators/utility_ops.h" #include "caffe2/core/operator.h" #include "caffe2/ideep/ideep_utils.h" @@ -64,7 +64,10 @@ class CopyIDEEPToCPUOp final : public IDEEPOperator { } auto* Y = OperatorBase::OutputTensor(0, dims, at::dtype().device(CPU)); - X.to_public(Y->template mutable_data()); + itensor temp_ten( + X.get_desc().to_default_format(), + Y->template mutable_data()); + X.reorder_to(temp_ten); } else { CAFFE_THROW("Unsupported ideep type: ", static_cast(X.get_data_type())); diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index 5d72898bfc69..d1c27e0845a8 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -62,7 +62,7 @@ class ImageInputOp final : public PrefetchOperator { bool CopyPrefetched() override; private: - using BoundingBox = struct { + struct BoundingBox { bool valid; int ymin; int xmin; @@ -73,7 +73,7 @@ class ImageInputOp final : public PrefetchOperator { // Structure to store per-image information // This can be modified by the DecodeAnd* so needs // to be privatized per launch. - using PerImageArg = struct { BoundingBox bounding_params; }; + struct PerImageArg { BoundingBox bounding_params; }; bool GetImageAndLabelAndInfoFromDBValue( const string& value, diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h index 8dc71795df89..8c26b111de1c 100644 --- a/caffe2/operators/bisect_percentile_op.h +++ b/caffe2/operators/bisect_percentile_op.h @@ -44,7 +44,7 @@ class BisectPercentileOp final : public Operator { pct_upper_.size(), "Feature (raw) data and upper bound dimension should match."); n_features = pct_lens_.size(); - index.reserve(n_features + 1); + index.resize(n_features + 1); index[0] = 0; for (int i = 1; i <= n_features; ++i) { index[i] = index[i - 1] + pct_lens_[i - 1]; @@ -63,12 +63,12 @@ class BisectPercentileOp final : public Operator { const auto batch_size = raw.size(0); const auto num_features = raw.size(1); CAFFE_ENFORCE_EQ(num_features, pct_lens_.size()); - const float* raw_data = raw.template data(); + const float *const raw_data = raw.template data(); // Output - auto* pct = Output(PCT, raw.sizes(), at::dtype()); - float* pct_output = pct->template mutable_data(); + auto *const pct = Output(PCT, raw.sizes(), at::dtype()); + float *const pct_output = pct->template mutable_data(); // Compute percentile for each raw feature value int feature_start_index = 0; @@ -108,20 +108,17 @@ class BisectPercentileOp final : public Operator { vector index; vector> fast_pct; - const float kEPSILON = 1e-10; + static constexpr float kEPSILON = 1e-10; - int binary_search( + int64_t binary_search( const std::vector::iterator& data, - int lo, - int hi, - float val) { - int mid; - bool low_cond, high_cond; - + int64_t lo, + int64_t hi, + const float val) { while (lo < hi) { - mid = (lo + hi) >> 1; - low_cond = (data[mid] <= val); - high_cond = (val < data[mid + 1]); + const 
auto mid = lo + (hi - lo) / 2; + const bool low_cond = (data[mid] <= val); + const bool high_cond = (val < data[mid + 1]); if (low_cond && high_cond) { return mid; } else if (!low_cond) { @@ -148,20 +145,18 @@ class BisectPercentileOp final : public Operator { return 1.; } - float result; // Interpolation by binary search const auto k = binary_search(pct_raw_it, 0, size - 1, val); if (pct_raw_it[k] == val) { // Exact match - result = pct_mapping_it[k]; + return pct_mapping_it[k]; } else { // interpolation - float w = (val - pct_raw_it[k]) / + const float w = (val - pct_raw_it[k]) / (pct_raw_it[k + 1] - pct_raw_it[k] + kEPSILON); - result = (1 - w) * pct_upper_it[k] + w * pct_lower_it[k + 1]; + return (1 - w) * pct_upper_it[k] + w * pct_lower_it[k + 1]; } - return result; } }; diff --git a/caffe2/operators/boolean_mask_ops.cc b/caffe2/operators/boolean_mask_ops.cc index 511aaee47831..ad7b28331de2 100644 --- a/caffe2/operators/boolean_mask_ops.cc +++ b/caffe2/operators/boolean_mask_ops.cc @@ -286,9 +286,6 @@ NO_GRADIENT(BooleanMaskLengths); } // namespace -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -const float minf = -1.0f * std::numeric_limits::infinity(); - // Template this on a functor object so we can generate different // implementations at compile time and have a better chance of inlining template diff --git a/caffe2/operators/copy_op.cc b/caffe2/operators/copy_op.cc index f2323bbaf06f..c0efef07eeb6 100644 --- a/caffe2/operators/copy_op.cc +++ b/caffe2/operators/copy_op.cc @@ -200,8 +200,10 @@ REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient); C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( CopyGPUToCPU, - "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor"); + "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor", + /*optional_alias_analysis_kind=*/c10::nullopt); C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( CopyCPUToGPU, - "_caffe2::CopyCPUToGPU(Tensor input) -> Tensor"); + "_caffe2::CopyCPUToGPU(Tensor input) -> Tensor", + /*optional_alias_analysis_kind=*/c10::nullopt); diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h index 011b1bf9204b..4acd92267b26 100644 --- a/caffe2/operators/deform_conv_op_impl.h +++ b/caffe2/operators/deform_conv_op_impl.h @@ -156,6 +156,7 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { T* col_buffer_data = col_buffer->template mutable_data(); // Im2col, followed by gemm. for (const auto image_id : c10::irange(N)) { + (void)image_id; // CUDA-10.2 on Windows crashes when C10_UNUSED macro is used for (const auto group_id : c10::irange(group_)) { DeformableIm2col( Xdata + group_id * input_offset, @@ -343,6 +344,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { } for (const auto image_id : c10::irange(N)) { + (void)image_id; // CUDA-10.2 on Windows crashes when C10_UNUSED macro is used for (const auto group_id : c10::irange(group_)) { math::Gemm( CblasTrans, diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc index 6f37407bd40e..bbd1eb1c72c9 100644 --- a/caffe2/operators/dropout_op.cc +++ b/caffe2/operators/dropout_op.cc @@ -15,13 +15,12 @@ bool DropoutOp::RunOnDevice() { return true; } else { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - float scale = 1. / (1. - ratio_); + float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_); // mask=true means keep, and mask=false means not keep, so we will // generate probability depending on 1-ratio. at::bernoulli_distribution dist(1. 
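The `BisectPercentileOp` rewrite above replaces `(lo + hi) >> 1` with `lo + (hi - lo) / 2`, which cannot overflow for large `int64_t` bounds, and returns directly from each interpolation branch. A compact Python re-statement of that inner loop; it assumes `raw`, `mapping`, `lower`, and `upper` are the per-feature slices the operator indexes via `index[]`, and that `val` lies strictly inside the raw range (the C++ handles the boundary cases before calling `binary_search`):

```python
def bisect_percentile(val, raw, mapping, lower, upper, eps=1e-10):
    """Find k with raw[k] <= val < raw[k+1], then interpolate (sketch of the C++ loop)."""
    lo, hi = 0, len(raw) - 1
    while lo < hi:
        mid = lo + (hi - lo) // 2          # overflow-safe midpoint
        if raw[mid] <= val < raw[mid + 1]:
            lo = mid
            break
        elif raw[mid] > val:
            hi = mid - 1
        else:
            lo = mid + 1
    k = lo
    if raw[k] == val:                      # exact match
        return mapping[k]
    w = (val - raw[k]) / (raw[k + 1] - raw[k] + eps)
    return (1 - w) * upper[k] + w * lower[k + 1]
```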
- ratio_); const float* Xdata = X.data(); float* Ydata = Y->template mutable_data(); - auto mask = Output(1, X.sizes(), at::dtype()); bool* mask_data = mask->template mutable_data(); auto* gen = context_.RandGenerator(); @@ -52,7 +51,7 @@ bool DropoutGradientOp::RunOnDevice() { const bool* mask_data = mask.data(); float* dXdata = dX->template mutable_data(); // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - float scale = 1. / (1. - ratio_); + float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_); for (int i = 0; i < dY.numel(); ++i) { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) dXdata[i] = dYdata[i] * mask_data[i] * scale; diff --git a/caffe2/operators/dropout_op.h b/caffe2/operators/dropout_op.h index aff0528c7ffa..ae8f0ff1bba6 100644 --- a/caffe2/operators/dropout_op.h +++ b/caffe2/operators/dropout_op.h @@ -19,7 +19,6 @@ class DropoutOp final : public Operator { is_test_( this->template GetSingleArgument(OpSchema::Arg_IsTest, 0)) { CAFFE_ENFORCE_GE(ratio_, 0); - CAFFE_ENFORCE_LT(ratio_, 1); } bool RunOnDevice() override; @@ -41,7 +40,6 @@ class DropoutGradientOp final : public Operator { is_test_( this->template GetSingleArgument(OpSchema::Arg_IsTest, 0)) { CAFFE_ENFORCE_GE(ratio_, 0); - CAFFE_ENFORCE_LT(ratio_, 1); } bool RunOnDevice() override; diff --git a/caffe2/operators/elementwise_ops.cu b/caffe2/operators/elementwise_ops.cu index 932bd5dafda0..fcbe26f927ae 100644 --- a/caffe2/operators/elementwise_ops.cu +++ b/caffe2/operators/elementwise_ops.cu @@ -119,6 +119,9 @@ void device_reduce( int N, Tensor* buffer, CUDAContext* context) { + (void)N; // Suppress unused variable warning + (void)buffer; // Suppress unused variable warning + (void)context; // Suppress unused variable warning #if TORCH_HIP_VERSION >= 210 auto buffer_size = 1; diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index e2cdab373c97..ee11de3f972d 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -146,6 +146,7 @@ class GatherRangesToDenseOp final : public Operator { auto& key = Input(KEY); auto* key_data = key.template data(); vector> buffer; + buffer.reserve(rangeLength); for (const auto b_i : c10::irange(rangeLength)) { int64_t one_key_item = key_data[rangeStart + b_i]; auto* one_data_item = rawData + (rangeStart + b_i) * itemsize; diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index 598b8d185695..9692c2846e97 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -493,7 +493,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) { 1.53593004e-01f, -8.75087008e-02f, -4.92327996e-02f, -3.32239009e-02f}; // Add angle in bbox deltas - int num_boxes = scores.size(); + auto num_boxes = scores.size(); CHECK_EQ(bbx.size() / 4, num_boxes); vector bbx_with_angle(num_boxes * box_dim); // bbx (deltas) is in shape (A * 4, H, W). Insert angle delta @@ -666,7 +666,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotated) { 1.53593004e-01f, -8.75087008e-02f, -4.92327996e-02f, -3.32239009e-02f}; // Add angle in bbox deltas - int num_boxes = scores.size(); + auto num_boxes = scores.size(); CHECK_EQ(bbx.size() / 4, num_boxes); vector bbx_with_angle(num_boxes * box_dim); // bbx (deltas) is in shape (A * 4, H, W). 
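With the `CAFFE_ENFORCE_LT(ratio_, 1)` checks dropped above, `ratio == 1` is now legal and the scale is clamped to zero instead of dividing by zero. A small numpy reference of the forward pass, mirroring the new C++ expression (and the `reference_dropout_ratio1` test added later in this patch); the helper name is illustrative:

```python
import numpy as np


def dropout_forward(x, ratio, rng=None):
    """Inverted-dropout reference mirroring the updated dropout_op.cc (sketch)."""
    rng = rng or np.random.default_rng()
    # mask=True means "keep"; the scale falls back to 0 when ratio >= 1, matching
    # `float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);`
    scale = 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)
    mask = rng.random(x.shape) < (1.0 - ratio)
    return x * mask * scale, mask
```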
Insert angle delta diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 0a402cdb6a3c..1c38562dbae0 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -2,7 +2,6 @@ #define CAFFE2_OPERATORS_UTILS_BOXES_H_ #include "caffe2/utils/eigen_utils.h" -#include "caffe2/utils/math.h" #include diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 09b10c8e192a..92d2c90a06c0 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -6,7 +6,6 @@ #include "caffe2/core/logging.h" #include "caffe2/core/macros.h" #include "caffe2/utils/eigen_utils.h" -#include "caffe2/utils/math.h" #include @@ -50,8 +49,7 @@ std::vector nms_cpu_upright( std::vector keep; while (order.size() > 0) { // exit if already enough proposals - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } @@ -127,7 +125,7 @@ std::vector soft_nms_cpu_upright( EArrXi pending = AsEArrXt(indices); while (pending.size() > 0) { // Exit if already enough proposals - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } @@ -560,8 +558,7 @@ std::vector nms_cpu_rotated( std::vector keep; while (order.size() > 0) { // exit if already enough proposals - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } @@ -626,7 +623,7 @@ std::vector soft_nms_cpu_rotated( EArrXi pending = AsEArrXt(indices); while (pending.size() > 0) { // Exit if already enough proposals - if (topN >= 0 && keep.size() >= topN) { + if (topN >= 0 && keep.size() >= static_cast(topN)) { break; } diff --git a/caffe2/operators/lpnorm_op.cc b/caffe2/operators/lpnorm_op.cc index cc43135d5c22..3ca2a1db09f4 100644 --- a/caffe2/operators/lpnorm_op.cc +++ b/caffe2/operators/lpnorm_op.cc @@ -13,7 +13,6 @@ bool LpNormOp::RunOnDevice() { auto* norm = Output(0, {1}, at::dtype()); const float* X_data = X.data(); const float size = average_ ? 
(float)X.numel() : 1.0f; - CAFFE_ENFORCE_GT(size, 0); if (p_ == 1) { *(norm->template mutable_data()) = (ConstEigenVectorMap(X_data, X.numel()).array()).abs().sum() / diff --git a/caffe2/operators/piecewise_linear_transform_op.h b/caffe2/operators/piecewise_linear_transform_op.h index 9dcf0021f1c2..8a88f8b834be 100644 --- a/caffe2/operators/piecewise_linear_transform_op.h +++ b/caffe2/operators/piecewise_linear_transform_op.h @@ -63,6 +63,7 @@ class PiecewiseLinearTransformOp final : public Operator { const int64_t num_group) { const T* start = bounds; for (const auto i : c10::irange(num_group)) { + (void)i; // CUDA-10.2 on Windows crashes when C10_UNUSED macro is used if (!std::is_sorted(start, start + num_bounds_per_group)) { return false; } diff --git a/caffe2/operators/quant_decode_op.h b/caffe2/operators/quant_decode_op.h index 1eeb4f2db8ad..5253d9975c39 100644 --- a/caffe2/operators/quant_decode_op.h +++ b/caffe2/operators/quant_decode_op.h @@ -36,7 +36,7 @@ void Decode( } int sz = output->numel(); - for (const auto i : c10::irange(sz)) { + for (C10_UNUSED const auto i : c10::irange(sz)) { DCHECK_LE(*code_ptr, cb_size); *out_ptr++ = cb_ptr[*code_ptr++]; } diff --git a/caffe2/operators/quantized/int8_fc_op.cc b/caffe2/operators/quantized/int8_fc_op.cc index 6f0b3aa8da74..21cd23bb399c 100644 --- a/caffe2/operators/quantized/int8_fc_op.cc +++ b/caffe2/operators/quantized/int8_fc_op.cc @@ -10,7 +10,7 @@ REGISTER_CPU_OPERATOR(Int8FC, int8::Int8FCOp); using namespace std::placeholders; OPERATOR_SCHEMA(Int8FC) - .NumInputs(3, 4) + .NumInputs(3, 5) .NumOutputs(1, 4) // NOLINTNEXTLINE(modernize-avoid-bind) .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false)) @@ -50,6 +50,11 @@ will throw errors. "Qparam", "Optional Qparam blob that contains quant param computed on activation histogram data" "Will overwrite Y_scale and Y_zero_point argument if specified") + .Input( + 4, + "in_Qparam", + "Optional Qparam blob that contains quant param computed on activation histogram data" + "Will overwrite X_scale and X_zero_point argument if specified") .Output(0, "Y", "2D output tensor"); } // namespace caffe2 diff --git a/caffe2/operators/quantized/int8_roi_align_op.h b/caffe2/operators/quantized/int8_roi_align_op.h index 2a722d2dd8fa..360f4a62c089 100644 --- a/caffe2/operators/quantized/int8_roi_align_op.h +++ b/caffe2/operators/quantized/int8_roi_align_op.h @@ -229,8 +229,8 @@ void ROIAlignForward( for (const auto pw : c10::irange(pooled_width)) { vector acc_buffer(channels, 0); - for (const auto iy : c10::irange(roi_bin_grid_h)) { - for (const auto ix : c10::irange(roi_bin_grid_w)) { + for (C10_UNUSED const auto iy : c10::irange(roi_bin_grid_h)) { + for (C10_UNUSED const auto ix : c10::irange(roi_bin_grid_w)) { PreCalc pc = pre_calc[pre_calc_index]; const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1; diff --git a/caffe2/operators/quantized/int8_test.cc b/caffe2/operators/quantized/int8_test.cc index b6d9719d5223..9b14d3eaec1d 100644 --- a/caffe2/operators/quantized/int8_test.cc +++ b/caffe2/operators/quantized/int8_test.cc @@ -341,8 +341,8 @@ TEST(Int8, SumRelu) { } void setq(int8::Int8TensorCPU* dst, const std::vector& vs) { - CHECK_EQ(vs.size(), dst->t.numel()); - for (auto i = 0; i < vs.size(); ++i) { + CHECK_EQ(vs.size(), static_cast(dst->t.numel())); + for (auto i = 0U; i < vs.size(); ++i) { uint8_t vq = std::max( std::numeric_limits::min(), std::min( @@ -354,8 +354,8 @@ void setq(int8::Int8TensorCPU* dst, const std::vector& vs) { } void biassetq(int8::Int8TensorCPU* 
dst, const std::vector& vs) { - CHECK_EQ(vs.size(), dst->t.numel()); - for (auto i = 0; i < vs.size(); ++i) { + CHECK_EQ(vs.size(), static_cast(dst->t.numel())); + for (auto i = 0U; i < vs.size(); ++i) { int32_t vq = std::max( std::numeric_limits::min(), std::min( diff --git a/caffe2/operators/text_file_reader_utils.h b/caffe2/operators/text_file_reader_utils.h index 01b4743a91c1..a4f2d6189860 100644 --- a/caffe2/operators/text_file_reader_utils.h +++ b/caffe2/operators/text_file_reader_utils.h @@ -56,7 +56,7 @@ struct TORCH_API CharRange { struct TORCH_API StringProvider { virtual void operator()(CharRange&) = 0; virtual void reset() = 0; - virtual ~StringProvider() {} + virtual ~StringProvider() = default; }; class TORCH_API BufferedTokenizer { @@ -99,7 +99,7 @@ class TORCH_API BufferedTokenizer { StringProvider* provider_; Tokenizer tokenizer_; TokenizedString tokenized_; - int tokenIndex_; + unsigned tokenIndex_; int numPasses_; int pass_{0}; }; diff --git a/caffe2/operators/variable_length_sequence_padding.cc b/caffe2/operators/variable_length_sequence_padding.cc index dbdb4ac87678..d6904523b7fc 100644 --- a/caffe2/operators/variable_length_sequence_padding.cc +++ b/caffe2/operators/variable_length_sequence_padding.cc @@ -19,7 +19,7 @@ N = maximum sequence length B = batch size M = hidden size -set each element of INPUT to zero if it is is past the end of the +set each element of INPUT to zero if it is past the end of the corresponding sequence (i.e. if LENS[j] > i for an index (i,j,k)). )DOC"); diff --git a/caffe2/opt/bound_shape_inference_test.cc b/caffe2/opt/bound_shape_inference_test.cc index 867142746d82..8224281124e1 100644 --- a/caffe2/opt/bound_shape_inference_test.cc +++ b/caffe2/opt/bound_shape_inference_test.cc @@ -45,7 +45,7 @@ void verifyShapeInfo( EXPECT_EQ(shape_info.getDimType(), t); const auto& shape = shape_info.shape; ASSERT_EQ(shape.dims_size(), dims.size()); - for (int i = 0; i < dims.size(); ++i) { + for (unsigned i = 0; i < dims.size(); ++i) { EXPECT_EQ(dims[i], shape.dims(i)); } EXPECT_EQ(shape.data_type(), dtype); diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index 5803b4efd492..a240750717cd 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -71,7 +71,7 @@ C10_DEFINE_bool( namespace caffe2 { namespace glow { -// The list in in the form of "0-3,5,6-7" which means, we will black list ops +// The list in the form of "0-3,5,6-7" which means, we will black list ops // with net positions in [0,1,2,3,5,6,7] std::unordered_set ParseNetPositionList(const std::string& str) { std::unordered_set net_position_list; diff --git a/caffe2/perfkernels/adagrad_avx2.cc b/caffe2/perfkernels/adagrad_avx2.cc index 0039afa942f1..08c9fd00d9a0 100644 --- a/caffe2/perfkernels/adagrad_avx2.cc +++ b/caffe2/perfkernels/adagrad_avx2.cc @@ -18,7 +18,7 @@ void adagrad_update__avx2_fma( float decay, float lr, float weight_decay = 0.f) { - constexpr size_t kSize = 8; + constexpr int kSize = 8; auto i = 0; for (; i + kSize <= N; i += kSize) { __m256 gi = _mm256_loadu_ps(g + i); diff --git a/caffe2/proto/__init__.py b/caffe2/proto/__init__.py index a753f26c5380..ce54a1aee574 100644 --- a/caffe2/proto/__init__.py +++ b/caffe2/proto/__init__.py @@ -1,3 +1,6 @@ +import warnings + + # NOTE: we have to import python protobuf here **before** we load cpp extension. # Otherwise it breaks under certain build conditions if cpp implementation of # protobuf is used. 
Presumably there's some registry in protobuf library and @@ -8,7 +11,13 @@ # expected caffe2.NetDef got caffe2.NetDef." # # This has to be done for all python targets, so listing them here -from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2 +try: + from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2 +except ImportError: + warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' + 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + raise + try: from caffe2.caffe2.fb.session.proto import session_pb2 except ImportError: diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto index 33b4cbd4b9a9..861a6c5d4374 100644 --- a/caffe2/proto/caffe2.proto +++ b/caffe2/proto/caffe2.proto @@ -220,7 +220,7 @@ enum DeviceTypeProto { PROTO_FPGA = 7; // FPGA PROTO_ORT = 8; // ONNX Runtime PROTO_XLA = 9; // XLA / TPU - PROTO_MLC = 10; // ML Compute + PROTO_MPS = 10; // MPS // Change the following number if you add more devices in the code. PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 11; } diff --git a/caffe2/proto/caffe2_pb2.pyi b/caffe2/proto/caffe2_pb2.pyi index f7f4430d7b76..ed1f4249a43e 100644 --- a/caffe2/proto/caffe2_pb2.pyi +++ b/caffe2/proto/caffe2_pb2.pyi @@ -25,7 +25,7 @@ class _DeviceTypeProto(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapp PROTO_FPGA = DeviceTypeProto.V(7) PROTO_ORT = DeviceTypeProto.V(8) PROTO_XLA = DeviceTypeProto.V(9) - PROTO_MLC = DeviceTypeProto.V(10) + PROTO_MPS = DeviceTypeProto.V(10) PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11) class DeviceTypeProto(metaclass=_DeviceTypeProto): V = typing.NewType('V', int) @@ -39,7 +39,7 @@ PROTO_HIP = DeviceTypeProto.V(6) PROTO_FPGA = DeviceTypeProto.V(7) PROTO_ORT = DeviceTypeProto.V(8) PROTO_XLA = DeviceTypeProto.V(9) -PROTO_MLC = DeviceTypeProto.V(10) +PROTO_MPS = DeviceTypeProto.V(10) PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = DeviceTypeProto.V(11) class TensorProto(google.protobuf.message.Message): diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 6617a62c5a51..83e393e67731 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -1,7 +1,15 @@ - -from caffe2.proto import caffe2_pb2 import os import sys +import warnings + + +try: + from caffe2.proto import caffe2_pb2 +except ImportError: + warnings.warn('Caffe2 support is not enabled in this PyTorch build. ' + 'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.') + raise + # TODO: refactor & remove the following alias caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA diff --git a/caffe2/python/benchmark_generator.py b/caffe2/python/benchmark_generator.py index 5342cb314a5b..c557ebfc9536 100644 --- a/caffe2/python/benchmark_generator.py +++ b/caffe2/python/benchmark_generator.py @@ -106,7 +106,7 @@ def make_blob_on_context(blob_name, blob_data, context): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Utilitity to generate Caffe2 benchmark models.") + description="Utility to generate Caffe2 benchmark models.") parser.add_argument("operator", help="Caffe2 operator to benchmark.") parser.add_argument("-b", "--blob", help="Instantiate a blob --blob name=dim1,dim2,dim3", diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index 872a66c7bd1f..c379211a509d 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -133,7 +133,7 @@ def db_name(epoch, node_name, db_prefix, path_prefix=None): node_name: A string. The name of the node. 
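Both `caffe2/proto/__init__.py` and `caffe2/python/__init__.py` above now warn and then re-raise the `ImportError` when the compiled protobuf modules are missing. Downstream scripts that want to degrade gracefully can catch that; a minimal sketch (the fallback behaviour shown is illustrative, not part of this patch):

```python
try:
    from caffe2.python import core, workspace
except ImportError:
    # The warning from caffe2's __init__ has already been emitted at this point;
    # fall back to a torch-only code path instead of crashing.
    core = None
    workspace = None

CAFFE2_AVAILABLE = core is not None
```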
db_prefix: A string. The prefix used to construct full db name. path_prefix: A string. Optional param used to construct db name or path - where checkpoint files are are stored. + where checkpoint files are stored. Returns: db_name: A string. The absolute path of full_db_name where checkpoint files are saved diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 293eccca0dd4..b8433c644155 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -269,7 +269,7 @@ def testUseInputButInputHasBeenChanged(self): in -> out, with UseInput in -> in - Since we overwrite in in op#1, but in will be needed by the gradient + Since we overwrite in op#1, but in will be needed by the gradient calculation of op#0, the gradient registry should raise an error. """ operators = [ diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 0543233b7c4f..2f143fbae07a 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -308,7 +308,7 @@ def testCreate(self): self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) self.assertEqual(op.device_option.device_id, 1) - self.assertTrue(len(op.arg), 3) + self.assertEqual(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args # to test with diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py index 049a9152878a..d0474ed70022 100644 --- a/caffe2/python/gru_cell.py +++ b/caffe2/python/gru_cell.py @@ -31,7 +31,7 @@ def __init__( # (reset gate -> output_gate) # So, much of the logic to calculate the reset gate output and modified # output gate input is set here, in the graph definition. - # The remaining logic lives in in gru_unit_op.{h,cc}. + # The remaining logic lives in gru_unit_op.{h,cc}. 
def _apply( self, model, diff --git a/caffe2/python/memonger.py b/caffe2/python/memonger.py index 6225781bc429..178ebd8cd302 100644 --- a/caffe2/python/memonger.py +++ b/caffe2/python/memonger.py @@ -798,15 +798,29 @@ def canonical_name(blob): op.output[i] = canonical_name(output) - def apply_recurrent_blob_assignments(op, blob_assignments, canonical_name): log.debug("Applying assignments to recurrent op: {}".format(op.type)) + + # Apply on alias_dst + alias_dst_args = [a for a in op.arg if a.name.endswith("alias_dst")] + for alias_dst in alias_dst_args: + for i, blob in enumerate(alias_dst.strings): + alias_dst.strings[i] = canonical_name(blob.decode()).encode() + + # Apply on link_external + link_external_args = [a for a in op.arg if a.name.endswith("link_external")] + for link_external in link_external_args: + for i, blob in enumerate(link_external.strings): + link_external.strings[i] = canonical_name(blob.decode()).encode() + + # Recurse into step nets step_args = [a for a in op.arg if a.name.endswith("step_net")] for step_arg in step_args: apply_assignments(step_arg.n, blob_assignments) for i, einp in enumerate(step_arg.n.external_input): if einp in blob_assignments: step_arg.n.external_input[i] = canonical_name(einp) + # Store renamings for blob, renamed in viewitems(blob_assignments): if blob in list(op.input) + list(op.output): diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index d92239f5c3c1..d523eb8204ab 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -651,7 +651,13 @@ def optimize_onnx(input, init=False, predict=False): passes.append('split_init') if predict: passes.append('split_predict') - out = onnx.optimizer.optimize(input, passes) + try: + out = onnx.optimizer.optimize(input, passes) + except AttributeError: + warnings.warn("OptimizerWarning: optimizer module not found in ONNX version {}".format(onnx.__version__)) + # ONNX does no ship onnx.optimizer since version 1.9+ + import onnxoptimizer + out = onnxoptimizer.optimize(input, passes) return out @classmethod @@ -881,8 +887,9 @@ def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, include_in try: init_model = cls.optimize_onnx(onnx_model, init=True) pred_model = cls.optimize_onnx(onnx_model, predict=True) - except AttributeError: - warnings.warn("OptimizerWarning: optimizer module not found in ONNX version {}".format(onnx.__version__)) + except ModuleNotFoundError: + warnings.warn("OptimizerWarning: onnxoptimizer module not installed. 
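ONNX no longer ships `onnx.optimizer` as of 1.9; it lives in the standalone `onnxoptimizer` package, which is exactly what the `optimize_onnx` change above falls back to. The same pattern in isolation, assuming the standalone package is installed:

```python
import warnings

import onnx


def optimize(model, passes):
    """Run graph passes via onnx.optimizer when present, else onnxoptimizer (sketch)."""
    try:
        return onnx.optimizer.optimize(model, passes)
    except AttributeError:
        warnings.warn(
            "optimizer module not found in ONNX %s; using the standalone "
            "onnxoptimizer package" % onnx.__version__
        )
        import onnxoptimizer
        return onnxoptimizer.optimize(model, passes)
```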
" + "init_model and pred_model models will not be splitted, which can cause a runtime error") init_model = onnx_model pred_model = onnx_model diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index ebb6018ca76e..42262d269695 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -160,11 +160,19 @@ ')') # Unsupported ops in opset 15 -backend_test.exclude('(test_bernoulli_*' - '|test_castlike_*' - '|test_optional_*' - '|test_shape_end_*' - '|test_shape_start_*' +backend_test.exclude('(test_bernoulli_.*' + '|test_castlike_.*' + '|test_optional_.*' + '|test_shape_end_.*' + '|test_shape_start_.*' + '|test_identity_opt_*' + '|test_loop16_seq_none_*' + '|test_if_opt_*' + ')') + +# Unsupported ops in opset 16 +backend_test.exclude('(test_gridsample_.*' + '|test_spacetodepth_.*' ')') # Skip vgg to speed up CI diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py index 147a41282505..2d22064d5712 100644 --- a/caffe2/python/operator_test/bisect_percentile_op_test.py +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -1,7 +1,4 @@ - - - - +from typing import List import hypothesis.strategies as st @@ -115,7 +112,7 @@ def test_bisect_percentil_op_simple(self): @given( N=st.integers(min_value=20, max_value=100), - lengths=st.lists( + lengths_in=st.lists( elements=st.integers(min_value=2, max_value=10), min_size=2, max_size=5, @@ -126,9 +123,9 @@ def test_bisect_percentil_op_simple(self): **hu.gcs_cpu_only ) def test_bisect_percentil_op_large( - self, N, lengths, max_value, discrete, p, gc, dc + self, N: int, lengths_in: List[int], max_value: int, discrete: bool, p: float, gc, dc ): - lengths = np.array(lengths, dtype=np.int32) + lengths = np.array(lengths_in, dtype=np.int32) D = len(lengths) if discrete: diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index d3a5c831d875..de96554bc5cb 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py @@ -19,7 +19,7 @@ class TestDropout(serial.SerializedTestCase): in_place=st.booleans(), ratio=st.floats(0, 0.999), engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) + **hu.gcs_cpu_only) def test_dropout_is_test(self, X, in_place, ratio, engine, gc, dc): """Test with is_test=True for a deterministic reference impl.""" # TODO(lukeyeager): enable this path when the GPU path is fixed @@ -47,7 +47,7 @@ def reference_dropout_test(x): in_place=st.booleans(), output_mask=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) + **hu.gcs_cpu_only) @settings(deadline=10000) def test_dropout_ratio0(self, X, in_place, output_mask, engine, gc, dc): """Test with ratio=0 for a deterministic reference impl.""" @@ -74,3 +74,35 @@ def reference_dropout_ratio0(x): gc, op, [X], reference_dropout_ratio0, # Don't check the mask with cuDNN because it's packed data outputs_to_check=None if engine != 'CUDNN' else [0]) + + + @given(X=hu.tensor(), + in_place=st.booleans(), + output_mask=st.booleans(), + engine=st.sampled_from(["", "CUDNN"]), + **hu.gcs_cpu_only) + @settings(deadline=10000) + def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc): + """Test with ratio=0 for a deterministic reference impl.""" + if in_place: + # Skip if trying in-place on GPU + assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP}) + # If in-place on CPU, don't 
compare with GPU + dc = dc[:1] + is_test = not output_mask + op = core.CreateOperator("Dropout", ["X"], + ["X" if in_place else "Y"] + + (["mask"] if output_mask else []), + ratio=1.0, engine=engine, + is_test=is_test) + + self.assertDeviceChecks(dc, op, [X], [0]) + if not is_test: + self.assertGradientChecks(gc, op, [X], 0, [0]) + + def reference_dropout_ratio1(x): + return (x,) if is_test else (np.zeros(x.shape, dtype=np.float), np.zeros(x.shape, dtype=np.bool)) + self.assertReferenceChecks( + gc, op, [X], reference_dropout_ratio1, + # Don't check the mask with cuDNN because it's packed data + outputs_to_check=None if engine != 'CUDNN' else [0]) diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index 6e5d4e7efee8..2d8222b59c9f 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -64,7 +64,7 @@ def __test_binary_op( caffe2_op: A string. Name of the caffe operator to test. op_function: an actual python operator (e.g. operator.add) path_prefix: A string. Optional param used to construct db name or path - where checkpoint files are are stored. + where checkpoint files are stored. """ for X, Y, op_args, X_out, Y_out in self.__generate_test_cases(allow_broadcast_fastpath): diff --git a/caffe2/python/operator_test/lpnorm_op_test.py b/caffe2/python/operator_test/lpnorm_op_test.py index e7ab634d0e7c..2899ba929470 100644 --- a/caffe2/python/operator_test/lpnorm_op_test.py +++ b/caffe2/python/operator_test/lpnorm_op_test.py @@ -11,13 +11,7 @@ class LpnormTest(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=1, - min_dim=1, - max_dim=3, - dtype=np.float32), - **hu.gcs) - @settings(deadline=10000) - def test_Lp_Norm(self, inputs, gc, dc): + def _test_Lp_Norm(self, inputs, gc, dc): X = inputs[0] # avoid kinks by moving away from 0 X += 0.02 * np.sign(X) @@ -74,6 +68,21 @@ def test_Lp_Norm(self, inputs, gc, dc): atol=1e-4 ) + @given(inputs=hu.tensors(n=1, + min_dim=1, + max_dim=3, + dtype=np.float32), + **hu.gcs) + @settings(deadline=10000) + def test_Lp_Norm(self, inputs, gc, dc): + self._test_Lp_Norm(inputs, gc, dc) + + def test_Lp_Norm_empty(self): + self._test_Lp_Norm([np.array([], dtype=np.float32)], hu.cpu_do, [hu.cpu_do]) + self.assertEqual(self.ws.blobs["l1_norm"].fetch()[0], 0.0) + self.assertEqual(self.ws.blobs["l2_norm"].fetch()[0], 0.0) + self.assertTrue(np.isnan(self.ws.blobs["l2_averaged_norm"].fetch()[0])) + @given(x=hu.tensor( min_dim=1, max_dim=10, dtype=np.float32, elements=st.integers(min_value=-100, max_value=100)), diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index d7c4e0df4416..0c260d944d81 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -32,7 +32,7 @@ def transform(self, x, bounds, slopes, intercepts): y = slopes[index] * x_ + intercepts[index] return y - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_multi_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) @@ -60,7 +60,7 @@ def piecewise(x, *args, **kw): self.assertReferenceChecks(gc, op, [X], piecewise) self.assertDeviceChecks(dc, op, [X], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), 
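The new `test_Lp_Norm_empty` above pins down what `LpNorm` does once the `CAFFE_ENFORCE_GT(size, 0)` check is gone: the L1 and L2 norms of an empty tensor are 0, while the averaged norm divides by `numel() == 0` and comes out NaN. The same arithmetic in numpy (the op's own L2 scaling is left aside; the point is the 0-versus-NaN behaviour):

```python
import numpy as np

x = np.array([], dtype=np.float32)
print(np.abs(x).sum())        # 0.0  -> l1_norm
print((x ** 2).sum())         # 0.0  -> l2_norm (up to the op's own scaling)
with np.errstate(invalid="ignore", divide="ignore"):
    print((x ** 2).sum() / x.size)   # nan -> l2_averaged_norm divides by numel() == 0
```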
**hu.gcs_cpu_only) @settings(deadline=10000) def test_binary_predictions_params_from_arg(self, n, gc, dc): slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) @@ -87,7 +87,7 @@ def piecewise(x): self.assertReferenceChecks(gc, op, [X], piecewise) self.assertDeviceChecks(dc, op, [X], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_multi_predictions_params_from_input(self, n, gc, dc): slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) @@ -115,7 +115,7 @@ def piecewise(x, bounds, slopes, intercepts): gc, op, [X, bounds, slopes, intercepts], piecewise) self.assertDeviceChecks(dc, op, [X, bounds, slopes, intercepts], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_binary_predictions_params_from_input(self, n, gc, dc): slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) @@ -141,7 +141,7 @@ def piecewise(x, bounds, slopes, intercepts): gc, op, [X, bounds, slopes, intercepts], piecewise) self.assertDeviceChecks(dc, op, [X, bounds, slopes, intercepts], [0]) - @given(n=st.integers(1, 100), **hu.gcs) + @given(n=st.integers(1, 100), **hu.gcs_cpu_only) @settings(deadline=10000) def test_1D_predictions_params_from_input(self, n, gc, dc): slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 6fbc445a7769..cb07a96fa0f7 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -385,7 +385,7 @@ def test_remove_data_blocks(self, data, indices, gc, dc): ["shrunk_data"]) def op_ref(data, indices): - unique_indices = np.unique(indices) + unique_indices = np.unique(indices) if len(indices)>0 else np.array([],dtype=np.int64) sorted_indices = np.sort(unique_indices) shrunk_data = np.delete(data, sorted_indices, axis=0) return (shrunk_data,) diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc index ad04cab82d5a..ccaa0afb6ac9 100644 --- a/caffe2/python/pybind_state.cc +++ b/caffe2/python/pybind_state.cc @@ -300,7 +300,7 @@ class GetPythonGradient : public GradientMakerBase { } if (gradOutputIndices.size() > 0) { // NOLINTNEXTLINE(modernize-loop-convert) - for (int i = 0; i < gradOutputIndices.size(); ++i) { + for (unsigned i = 0; i < gradOutputIndices.size(); ++i) { int GO_i = gradOutputIndices[i]; gradientInputs.push_back(GO(GO_i)); } @@ -312,7 +312,7 @@ class GetPythonGradient : public GradientMakerBase { std::vector gradientOutputs; if (gradInputIndices.size() > 0) { // NOLINTNEXTLINE(modernize-loop-convert) - for (int i = 0; i < gradInputIndices.size(); ++i) { + for (unsigned i = 0; i < gradInputIndices.size(); ++i) { int GI_i = gradInputIndices[i]; gradientOutputs.push_back(GI(GI_i)); } @@ -877,7 +877,7 @@ void addObjectMethods(py::module& m) { std::vector tensors_data; #ifdef USE_NUMPY // NOLINTNEXTLINE(modernize-loop-convert) - for (auto i = 0; i < inputs.size(); ++i) { + for (auto i = 0U; i < inputs.size(); ++i) { auto input = inputs[i]; CAFFE_ENFORCE( PyArray_Check(input.ptr()), @@ -988,7 +988,7 @@ void addObjectMethods(py::module& m) { std::vector tensors_data; #ifdef USE_NUMPY // NOLINTNEXTLINE(modernize-loop-convert) - for (auto i = 0; i < inputs.size(); ++i) { + for (auto i = 0U; i < inputs.size(); ++i) { auto input = inputs[i]; CAFFE_ENFORCE( PyArray_Check(input.ptr()), @@ -1201,7 +1201,7 @@ void 
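The `remove_data_blocks` reference fix above special-cases empty `indices` because `np.unique` of an empty Python list yields a float64 array, which can trip up `np.delete` when used as indices; hence the explicit empty int64 array. Quick check:

```python
import numpy as np

print(np.unique([]).dtype)                       # float64 -- not an integer index dtype
print(np.unique(np.array([], dtype=np.int64)))   # empty int64 array, like the guarded branch returns
```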
addGlobalMethods(py::module& m) { }); m.def("nearby_opnames", [](const std::string& name) { std::vector alternatives; - int editTolerance = 3; + unsigned editTolerance = 3; // NOLINTNEXTLINE(performance-for-range-copy) for (auto it : caffe2::CPUOperatorRegistry()->Keys()) { if (editDistance(it, name, editTolerance) < editTolerance + 1) { diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc index 7fecf195f937..f93524b2f9d3 100644 --- a/caffe2/python/pybind_state_ideep.cc +++ b/caffe2/python/pybind_state_ideep.cc @@ -65,10 +65,19 @@ class IDeepFetcher : public BlobFetcherBase { numpy_type != -1, "Unsupported ideep memory data type? This usually should not happen " "since ideep memory usually only do float and double."); - itensor::dims dims = atensor.get_public_format_dims(); + itensor::dims dims; + bool need_reorder = atensor.need_reorder(); + if (atensor.get_data_type() == idtype::f32 && !atensor.has_scale()) { + // For FP32 path, only support NCHW format input, so if atensor + // has NHWC format, we need reorder it to NCHW format. + dims = atensor.get_dims(); + need_reorder = need_reorder || atensor.get_desc().is_nhwc(); + } else { + dims = atensor.get_public_format_dims(); + } std::vector npy_dims(dims.begin(), dims.end()); - result.copied = force_copy || atensor.need_reorder(); + result.copied = force_copy || need_reorder; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) void* outPtr; if (result.copied) { @@ -87,7 +96,12 @@ class IDeepFetcher : public BlobFetcherBase { } if (result.copied) { - atensor.to_public(outPtr); + if (atensor.get_data_type() == idtype::f32 && !atensor.has_scale()) { + itensor temp_ten(atensor.get_desc().to_default_format(), outPtr); + atensor.reorder_to(temp_ten); + } else { + atensor.to_public(outPtr); + } } return result; diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 1bf7b607e1b7..2e2d284f92e4 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -1,13 +1,16 @@ +import errno import os import shutil import tempfile import unittest from collections import namedtuple +from typing import List import caffe2.python.hypothesis_test_util as htu import hypothesis.strategies as st import numpy as np import torch +from torch import Tensor from caffe2.proto import caffe2_pb2 from caffe2.python import core, test_util, workspace, model_helper, brew from hypothesis import given, settings @@ -783,8 +786,7 @@ def multi_input(self, x: torch.Tensor, y: torch.Tensor, z: int = 2) -> torch.Ten return x + y + z @torch.jit.script_method - def multi_input_tensor_list(self, tensor_list): # pyre-ignore: PT type annotations - # type: (List[Tensor]) -> Tensor + def multi_input_tensor_list(self, tensor_list: List[Tensor]) -> Tensor: return tensor_list[0] + tensor_list[1] + tensor_list[2] @torch.jit.script_method diff --git a/caffe2/quantization/server/concat_dnnlowp_op.h b/caffe2/quantization/server/concat_dnnlowp_op.h index 9f5f5a09de4a..5a4fcead155e 100644 --- a/caffe2/quantization/server/concat_dnnlowp_op.h +++ b/caffe2/quantization/server/concat_dnnlowp_op.h @@ -1,7 +1,7 @@ #pragma once #include "caffe2/operators/concat_split_op.h" -#include "dnnlowp_op.h" +#include "caffe2/quantization/server/dnnlowp_op.h" namespace caffe2 { diff --git a/caffe2/quantization/server/conv_dnnlowp_op.cc b/caffe2/quantization/server/conv_dnnlowp_op.cc index 15b71ceb1f22..aa5a39ccdac3 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op.cc +++ b/caffe2/quantization/server/conv_dnnlowp_op.cc @@ 
-354,8 +354,6 @@ void ConvDNNLowPOp::QuantizeBias_() { this->template Input(FILTER); column_offset_ptr = packed_filter.column_offsets.get(); } else { - vector temp_qparams; - temp_qparams.push_back(in_qparams_[1]); column_offset_temp.resize(M); ComputeColumnOffsets( KernelDim_(), @@ -367,7 +365,7 @@ void ConvDNNLowPOp::QuantizeBias_() { } for (int i = 0; i < M; ++i) { (*b_quantized_)[i] -= - in_qparams_[0].zero_point * (*column_offset_ptr)[i]; + in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i]; } } } @@ -387,8 +385,6 @@ void ConvDNNLowPOp::QuantizeBias_() { this->template Input(FILTER); column_offset_ptr = packed_filter.column_offsets.get(); } else { - vector temp_qparams; - temp_qparams.push_back(in_qparams_[1]); column_offset_temp.resize(M); ComputeColumnOffsets( KernelDim_(), @@ -399,7 +395,7 @@ void ConvDNNLowPOp::QuantizeBias_() { column_offset_ptr = &column_offset_temp; } for (int i = 0; i < M; ++i) { - (*b_quantized_)[i] -= in_qparams_[0].zero_point * (*column_offset_ptr)[i]; + (*b_quantized_)[i] -= in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i]; } } } diff --git a/caffe2/quantization/server/dnnlowp_test_utils.py b/caffe2/quantization/server/dnnlowp_test_utils.py index 0d56ea6ac127..eb006ffe59b0 100644 --- a/caffe2/quantization/server/dnnlowp_test_utils.py +++ b/caffe2/quantization/server/dnnlowp_test_utils.py @@ -385,6 +385,8 @@ def run_conv_or_fc( outputs, scale=None, zero_point=None, + x_scale=None, + x_zero_point=None, ): if order: # Conv @@ -407,6 +409,11 @@ def run_conv_or_fc( dnnlowp_pybind11.CreateInt8QuantParamsBlob( "quant_param", float(scale), int(zero_point) ) + if x_scale is not None and x_zero_point is not None: + with workspace.WorkspaceGuard(test_case.ws): + dnnlowp_pybind11.CreateInt8QuantParamsBlob( + "X_quant_param", float(x_scale), int(x_zero_point) + ) if init_net: test_case.ws.run(init_net) @@ -427,6 +434,10 @@ def run_conv_or_fc( dnnlowp_pybind11.CreateInt8QuantParamsBlob( "quant_param", float(scale), int(zero_point) ) + if x_scale is not None and x_zero_point is not None: + dnnlowp_pybind11.CreateInt8QuantParamsBlob( + "X_quant_param", float(x_scale), int(x_zero_point) + ) if init_net: workspace.RunNetOnce(init_net) diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h index 53a140b30bbf..b8f7538d95f0 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h +++ b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.h @@ -1,6 +1,6 @@ #pragma once -#include "fully_connected_dnnlowp_op.h" +#include "caffe2/quantization/server/fully_connected_dnnlowp_op.h" namespace caffe2 { diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc index 4f83940f6d74..439c738c007b 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op.cc +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.cc @@ -34,6 +34,8 @@ FullyConnectedDNNLowPOp::FullyConnectedDNNLowPOp( : BaseType(operator_def, ws), axis_(this->template GetSingleArgument("axis", 1)), axis_w_(this->template GetSingleArgument("axis_w", 1)), + X_scale_(this->template GetSingleArgument("X_scale", -1.0)), // for fused static int8 not valid if less than 0 + X_zero_point_(this->template GetSingleArgument("X_zero_point", 0)), quantize_channelwise_(this->template GetSingleArgument( "quantize_channelwise", false)), @@ -109,9 +111,22 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { t_very_begin = t_begin; } #endif + float 
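The `QuantizeBias_` cleanup above also switches the zero-point correction to `in_qparams_[INPUT]`. The correction itself comes from a simple identity: with an affine-quantized activation and (as assumed in this sketch) symmetric weights, the activation zero point times the per-row column offset of the quantized weights can be folded into the bias ahead of time. A numpy check of that identity; all names, shapes, and scales below are made up for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
K, M = 16, 4
w_q = rng.integers(-127, 128, size=(M, K)).astype(np.int32)   # symmetric weights, z_w = 0
x_q = rng.integers(0, 256, size=K).astype(np.int32)           # uint8-style activation
s_w, s_x, z_x = 0.02, 0.1, 128

# Real-valued product computed from dequantized operands...
real = (s_w * w_q) @ (s_x * (x_q - z_x))
# ...equals the integer accumulation with the zero-point term folded out, which is
# exactly the `z_x * column_offset[i]` amount QuantizeBias_ pre-subtracts from the bias.
column_offset = w_q.sum(axis=1)
acc = w_q @ x_q - z_x * column_offset
np.testing.assert_allclose(real, s_w * s_x * acc, rtol=1e-6)
```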
X_scale = X_scale_; + int32_t X_zero_point = X_zero_point_; + if (InputSize() == 5) { + // float in float out, two possibilities + // if there are only 3 input (no qparams): dyanmic + // if there are 5 input (+ input qparams): fused int8 static + // output qparams need to be added anyway even it's dummy when dequantize_output=1 + const auto* input_qparam_blob = + this->template Input>(4).get(); + // input_params overwrite input arguments + X_scale = input_qparam_blob->qparam.scale; + X_zero_point = input_qparam_blob->qparam.zero_point; + } // Get quantization parameters - if (!GetQuantizationParameters_()) { + if (!GetQuantizationParameters_(X_scale, X_zero_point)) { return false; } @@ -168,7 +183,6 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { /* if (VLOG_IS_ON(1)) */ { t_begin = chrono::system_clock::now(); } #endif - Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN @@ -181,7 +195,7 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { t_begin = chrono::system_clock::now(); } #endif - } + } #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN /* if (VLOG_IS_ON(1)) */ @@ -295,6 +309,7 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { if (!X.template IsType()) { // Both input and output are float + // the path for dyanmic and fused staic row_offsets_.resize( PackAWithQuantRowOffset::rowOffsetBufferSize()); X_pack_buf_.resize( @@ -628,7 +643,7 @@ bool FullyConnectedDNNLowPOp::RunOnDevice() { } template -bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { +bool FullyConnectedDNNLowPOp::GetQuantizationParameters_(float X_scale, int X_zero_point) { using namespace dnnlowp; #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN @@ -638,7 +653,13 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { #endif // Choose quantization for X - in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get()); + if (X_scale <= 0) { // non-fused static or Dynamic + in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get()); + } + else { // fused int8 static + in_qparams_[0].scale = X_scale; + in_qparams_[0].zero_point = X_zero_point; + } #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN /* if (VLOG_IS_ON(1)) */ @@ -887,8 +908,8 @@ bool FullyConnectedDNNLowPOp::GetQuantizationParameters_() { #endif if (!dequantize_output_ && !requantization_param_selected_) { - CAFFE_ENFORCE(InputSize() <= 4); - if (InputSize() == 4) { + CAFFE_ENFORCE(InputSize() <= 5); + if (InputSize() >= 4) { const auto* input_qparam_blob = this->template Input>( 3) diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op.h b/caffe2/quantization/server/fully_connected_dnnlowp_op.h index 5dd90e1c0935..73d93f5a1362 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op.h +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op.h @@ -17,10 +17,12 @@ class FullyConnectedDNNLowPOp USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FullyConnectedOp); protected: - bool GetQuantizationParameters_(); + bool GetQuantizationParameters_(float X_scale_=-1.0, int X_zero_point_=0); std::size_t axis_{1}; std::size_t axis_w_{1}; + float X_scale_{-1.0}; + int X_zero_point_{0}; vector Y_shape_cache_; std::vector requantization_params_; diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py index 3a8b0c14931e..52209025f294 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py @@ -34,6 +34,7 @@ class 
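The fused static-int8 FC path above takes the activation quantization parameters either from the `X_scale`/`X_zero_point` arguments or, when a fifth input is present, from the `in_Qparam` blob added to the `Int8FC` schema earlier in this patch. A construction-only sketch of the blob-based variant, mirroring what the updated tests do; the `dnnlowp_pybind11` import path is an assumption, the scales are made up, and `X`, `W`, `b` are assumed to be fed before the op runs:

```python
from caffe2.python import core
from caffe2.quantization.server import dnnlowp_pybind11  # import path assumed

# Quantization parameters for the output and (new in this patch) the input activation.
dnnlowp_pybind11.CreateInt8QuantParamsBlob("quant_param", 0.05, 0)
dnnlowp_pybind11.CreateInt8QuantParamsBlob("X_quant_param", 0.1, 128)

fc = core.CreateOperator(
    "Int8FC",
    ["X", "W", "b", "quant_param", "X_quant_param"],  # inputs 4 and 5: output/input qparam blobs
    ["Y"],
    engine="DNNLOWP",
)
```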
DNNLowPFullyConnectedOpTest(hu.HypothesisTestCase): fuse_relu=st.booleans(), output_packed_bias=st.booleans(), use_input_qparam=st.booleans(), + use_output_qparam=st.booleans(), **hu.gcs_cpu_only ) def test_dnnlowp_fully_connected_int( @@ -50,6 +51,7 @@ def test_dnnlowp_fully_connected_int( fuse_relu, output_packed_bias, use_input_qparam, + use_output_qparam, gc, dc, ): @@ -98,22 +100,26 @@ def test_dnnlowp_fully_connected_int( Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) outputs = [] - op_engine_list = [("FC", "")] + op_engine_list = [("FC", "", False, False)] if fuse_relu: - op_engine_list += [("Int8FCRelu", "DNNLOWP")] + op_engine_list += [("Int8FCRelu", "DNNLOWP", False, False)] else: op_engine_list += [ - ("FC", "DNNLOWP"), - ("FC", "DNNLOWP_16"), - ("Int8FC", "DNNLOWP"), + # type, engine, do_fuse, skip_requantization + ("FC", "DNNLOWP", False, False), + ("FC", "DNNLOWP_16", False, False), + ("Int8FC", "DNNLOWP", False, False), + ("Int8FC", "DNNLOWP", True, False), + ("Int8FC", "DNNLOWP", False, True), + ("Int8FC", "DNNLOWP", True, True), ] - for op_type, engine in op_engine_list: + for op_type, engine, do_fuse, skip_requantization in op_engine_list: init_net = core.Net("test_init_net") net = core.Net("test_net") - do_quantize = "DNNLOWP" in engine and in_quantized - do_dequantize = "DNNLOWP" in engine and out_quantized + do_quantize = "DNNLOWP" in engine and in_quantized and not do_fuse + do_dequantize = "DNNLOWP" in engine and out_quantized and not skip_requantization do_quantize_weight = ( engine == "DNNLOWP" and weight_quantized and len(outputs) > 0 ) @@ -167,41 +173,29 @@ def test_dnnlowp_fully_connected_int( ) init_net.Proto().op.extend([pack]) - if use_input_qparam and do_dequantize and op_type != "FC": - fc = core.CreateOperator( - op_type, - [ - "X_q" if do_quantize else "X", - "W_packed" - if do_prepack_weight - else ("W_q" if do_quantize_weight else "W"), - "b_q" if do_quantize_weight else "b", - "quant_param", - ], - ["Y_q" if do_dequantize else "Y"], - dequantize_output=not do_dequantize, - preserve_activation_sparsity=preserve_activation_sparsity, - preserve_weight_sparsity=preserve_weight_sparsity, - engine=engine, - device_option=gc, - ) - else: - fc = core.CreateOperator( - op_type, - [ - "X_q" if do_quantize else "X", - "W_packed" - if do_prepack_weight - else ("W_q" if do_quantize_weight else "W"), - "b_q" if do_quantize_weight else "b", - ], - ["Y_q" if do_dequantize else "Y"], - dequantize_output=not do_dequantize, - preserve_activation_sparsity=preserve_activation_sparsity, - preserve_weight_sparsity=preserve_weight_sparsity, - engine=engine, - device_option=gc, - ) + fc = core.CreateOperator( + op_type, + [ + "X_q" if do_quantize else "X", + "W_packed" + if do_prepack_weight + else ("W_q" if do_quantize_weight else "W"), + "b_q" if do_quantize_weight else "b", + # "quant_param", + ], + ["Y_q" if do_dequantize else "Y"], + dequantize_output=not do_dequantize, + preserve_activation_sparsity=preserve_activation_sparsity, + preserve_weight_sparsity=preserve_weight_sparsity, + engine=engine, + device_option=gc, + ) + if op_type != "FC": + if (do_dequantize and use_output_qparam) or (use_input_qparam and op_type == "Int8FC"): + fc.input.extend(["quant_param"]) + if (use_input_qparam and op_type == "Int8FC"): + fc.input.extend(["X_quant_param"]) + if do_quantize_weight or do_prepack_weight: # When quantized weight is provided, we can't rescale the # output dynamically by looking at the range of output of each @@ -221,7 +215,9 @@ def 
test_dnnlowp_fully_connected_int( ) net.Proto().op.extend([dequantize]) - if use_input_qparam and do_dequantize and op_type != "FC": + + + if use_output_qparam and do_dequantize and op_type != "FC": ref_output = outputs[0][0] ref_output_min = 0 if ref_output.size == 0 else ref_output.min() ref_output_max = 0 if ref_output.size == 0 else ref_output.max() @@ -229,25 +225,37 @@ def test_dnnlowp_fully_connected_int( q_param = dnnlowp_utils.choose_quantization_params( ref_output_min, ref_output_max, preserve_activation_sparsity ) - run_conv_or_fc( - self, - init_net, - net, - X, - W, - b, - op_type, - engine, - None, - gc, - outputs, - q_param.scale, - q_param.zero_point, - ) + q_param_scale = q_param.scale + q_param_zero_point = q_param.zero_point else: - run_conv_or_fc( - self, init_net, net, X, W, b, op_type, engine, None, gc, outputs - ) + q_param_scale = None + q_param_zero_point = None + + if not (use_input_qparam and op_type == "Int8FC"): + x_q_param_scale = None + x_q_param_zero_point = None + else: + x_q_param_scale = x_q_param.scale + x_q_param_zero_point = x_q_param.zero_point + + run_conv_or_fc( + self, + init_net, + net, + X, + W, + b, + op_type, + engine, + None, + gc, + outputs, + q_param_scale, + q_param_zero_point, + x_q_param_scale, + x_q_param_zero_point, + ) + if output_packed_bias and do_prepack_weight and do_dequantize: bias_int32 = self.ws.blobs["B_q32"].fetch() @@ -264,12 +272,14 @@ def test_dnnlowp_fully_connected_int( "W": [output_channels, input_channels], "b": [output_channels], "quant_param": [1], + "X_quant_param": [1], }, blob_types={ "X": core.DataType.FLOAT, "W": core.DataType.FLOAT, "b": core.DataType.FLOAT, "quant_param": core.DataType.FLOAT, + "X_quant_param": core.DataType.FLOAT, }, ) assert ( diff --git a/caffe2/quantization/server/im2col_dnnlowp.h b/caffe2/quantization/server/im2col_dnnlowp.h index dc347142b640..4aca91811da9 100644 --- a/caffe2/quantization/server/im2col_dnnlowp.h +++ b/caffe2/quantization/server/im2col_dnnlowp.h @@ -216,7 +216,7 @@ static void Im2ColNHWC( T* data_col_temp = data_col + h * width_col * kernel_h * kernel_w * channels; int w_pad = -pad_l; - for (const auto w : c10::irange(width_col)) { + for (C10_UNUSED const auto w : c10::irange(width_col)) { int r = 0; for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { int s = 0; diff --git a/caffe2/quantization/server/kl_minimization.h b/caffe2/quantization/server/kl_minimization.h index edf95f5b9a1a..9b43fce4e56f 100644 --- a/caffe2/quantization/server/kl_minimization.h +++ b/caffe2/quantization/server/kl_minimization.h @@ -1,6 +1,6 @@ #pragma once -#include "quantization_error_minimization.h" +#include "caffe2/quantization/server/quantization_error_minimization.h" namespace dnnlowp { diff --git a/caffe2/quantization/server/l2_minimization.h b/caffe2/quantization/server/l2_minimization.h index 5c2173f48267..2ef983b986d6 100644 --- a/caffe2/quantization/server/l2_minimization.h +++ b/caffe2/quantization/server/l2_minimization.h @@ -1,6 +1,6 @@ #pragma once -#include "quantization_error_minimization.h" +#include "caffe2/quantization/server/quantization_error_minimization.h" #include #include diff --git a/caffe2/quantization/server/quantization_error_minimization.h b/caffe2/quantization/server/quantization_error_minimization.h index a315cf1a0977..83725d8b19cf 100644 --- a/caffe2/quantization/server/quantization_error_minimization.h +++ b/caffe2/quantization/server/quantization_error_minimization.h @@ -1,6 +1,6 @@ #pragma once -#include "dnnlowp.h" +#include 
"caffe2/quantization/server/dnnlowp.h" namespace dnnlowp { diff --git a/caffe2/quantization/server/relu_dnnlowp_op.h b/caffe2/quantization/server/relu_dnnlowp_op.h index f308e90e2881..2885f0fda26b 100644 --- a/caffe2/quantization/server/relu_dnnlowp_op.h +++ b/caffe2/quantization/server/relu_dnnlowp_op.h @@ -3,7 +3,7 @@ #include "caffe2/operators/relu_op.h" #include "caffe2/core/tensor_int8.h" -#include "caffe2_dnnlowp_utils.h" +#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" namespace caffe2 { diff --git a/caffe2/quantization/server/sigmoid.h b/caffe2/quantization/server/sigmoid.h index 17722405e6f0..c21303420e6a 100644 --- a/caffe2/quantization/server/sigmoid.h +++ b/caffe2/quantization/server/sigmoid.h @@ -1,6 +1,6 @@ #pragma once -#include "tanh.h" +#include "caffe2/quantization/server/tanh.h" namespace dnnlowp { diff --git a/caffe2/quantization/server/tanh.h b/caffe2/quantization/server/tanh.h index 2950352131d1..823ded42982b 100644 --- a/caffe2/quantization/server/tanh.h +++ b/caffe2/quantization/server/tanh.h @@ -1,6 +1,6 @@ #pragma once -#include "dnnlowp.h" +#include "caffe2/quantization/server/dnnlowp.h" #include #include diff --git a/caffe2/queue/blobs_queue.cc b/caffe2/queue/blobs_queue.cc index 4398cf816481..4c890088fa2d 100644 --- a/caffe2/queue/blobs_queue.cc +++ b/caffe2/queue/blobs_queue.cc @@ -18,16 +18,11 @@ namespace caffe2 { // Constants for user tracepoints -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr int SDT_NONBLOCKING_OP = 0; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr int SDT_BLOCKING_OP = 1; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr uint64_t SDT_ABORT = (uint64_t)-2; -// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) -static constexpr uint64_t SDT_CANCEL = (uint64_t)-3; +C10_UNUSED static constexpr int SDT_NONBLOCKING_OP = 0; +C10_UNUSED static constexpr int SDT_BLOCKING_OP = 1; +C10_UNUSED static constexpr uint64_t SDT_TIMEOUT = (uint64_t)-1; +C10_UNUSED static constexpr uint64_t SDT_ABORT = (uint64_t)-2; +C10_UNUSED static constexpr uint64_t SDT_CANCEL = (uint64_t)-3; BlobsQueue::BlobsQueue( Workspace* ws, @@ -66,8 +61,7 @@ bool BlobsQueue::blockingRead( float timeout_secs) { Timer readTimer; auto keeper = this->shared_from_this(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); CAFFE_SDT(queue_read_start, name, (void*)this, SDT_BLOCKING_OP); std::unique_lock g(mutex_); auto canRead = [this]() { @@ -76,7 +70,6 @@ bool BlobsQueue::blockingRead( }; // Decrease queue balance before reading to indicate queue read pressure // is being increased (-ve queue balance indicates more reads than writes) - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_balance, -1); if (timeout_secs > 0) { std::chrono::milliseconds timeout_ms(int(timeout_secs * 1000)); @@ -99,17 +92,14 @@ bool BlobsQueue::blockingRead( CAFFE_ENFORCE(inputs.size() >= result.size()); for (const auto i : c10::irange(result.size())) { auto bytes = BlobStat::sizeBytes(*result[i]); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_dequeued_bytes, bytes, i); using std::swap; swap(*(inputs[i]), *(result[i])); } CAFFE_SDT(queue_read_end, name, (void*)this, writer_ - reader_); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) 
CAFFE_EVENT(stats_, queue_dequeued_records); ++reader_; cv_.notify_all(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, read_time_ns, readTimer.NanoSeconds()); return true; } @@ -117,8 +107,7 @@ bool BlobsQueue::blockingRead( bool BlobsQueue::tryWrite(const std::vector& inputs) { Timer writeTimer; auto keeper = this->shared_from_this(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); CAFFE_SDT(queue_write_start, name, (void*)this, SDT_NONBLOCKING_OP); std::unique_lock g(mutex_); if (!canWrite()) { @@ -127,11 +116,9 @@ bool BlobsQueue::tryWrite(const std::vector& inputs) { } // Increase queue balance before writing to indicate queue write pressure is // being increased (+ve queue balance indicates more writes than reads) - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_balance, 1); DCHECK(canWrite()); doWrite(inputs); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds()); return true; } @@ -139,13 +126,11 @@ bool BlobsQueue::tryWrite(const std::vector& inputs) { bool BlobsQueue::blockingWrite(const std::vector& inputs) { Timer writeTimer; auto keeper = this->shared_from_this(); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); CAFFE_SDT(queue_write_start, name, (void*)this, SDT_BLOCKING_OP); std::unique_lock g(mutex_); // Increase queue balance before writing to indicate queue write pressure is // being increased (+ve queue balance indicates more writes than reads) - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, queue_balance, 1); cv_.wait(g, [this]() { return closing_ || canWrite(); }); if (!canWrite()) { @@ -154,7 +139,6 @@ bool BlobsQueue::blockingWrite(const std::vector& inputs) { } DCHECK(canWrite()); doWrite(inputs); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) CAFFE_EVENT(stats_, write_time_ns, writeTimer.NanoSeconds()); return true; } @@ -170,7 +154,7 @@ bool BlobsQueue::canWrite() { // writer is always within [reader, reader + size) // we can write if reader is within [reader, reader + size) CAFFE_ENFORCE_LE(reader_, writer_); - CAFFE_ENFORCE_LE(writer_, reader_ + queue_.size()); + CAFFE_ENFORCE_LE(writer_, static_cast(reader_ + queue_.size())); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) return writer_ != reader_ + queue_.size(); } @@ -178,8 +162,7 @@ bool BlobsQueue::canWrite() { void BlobsQueue::doWrite(const std::vector& inputs) { auto& result = queue_[writer_ % queue_.size()]; CAFFE_ENFORCE(inputs.size() >= result.size()); - // NOLINTNEXTLINE(clang-diagnostic-unused-variable) - const auto& name = name_.c_str(); + C10_UNUSED const auto& name = name_.c_str(); for (const auto i : c10::irange(result.size())) { using std::swap; swap(*(inputs[i]), *(result[i])); diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h index 108d998b602f..3299327be430 100644 --- a/caffe2/serialize/crc_alt.h +++ b/caffe2/serialize/crc_alt.h @@ -101,8 +101,11 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo // Windows always little endian #define __BYTE_ORDER __LITTLE_ENDIAN + #if !defined(_M_ARM64) // intrinsics / prefetching #include + #endif + #ifdef __MINGW32__ #define PREFETCH(location) __builtin_prefetch(location) #else diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 
9f0e9ce6194e..9847bc132264 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -5,6 +5,9 @@ #include #include #include +#include +#include + #include #include @@ -49,6 +52,17 @@ static std::string basename(const std::string& name) { return name.substr(start, end - start); } +static std::string parentdir(const std::string& name) { + size_t end = name.find_last_of('/'); + if(end == std::string::npos) + end = name.find_last_of('\\'); + + if(end == std::string::npos) + return ""; + + return name.substr(0, end); +} + size_t PyTorchStreamReader::read(uint64_t pos, char* buf, size_t n) { return in_->read(pos, buf, n, "reading file"); } @@ -129,22 +143,27 @@ void PyTorchStreamReader::init() { } std::string version(static_cast(version_ptr.get()), version_size); version_ = caffe2::stoull(version); - AT_ASSERTM( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - version_ >= kMinSupportedFileFormatVersion, - "Attempted to read a PyTorch file with version ", - c10::to_string(version_), - ", but the minimum supported version for reading is ", - c10::to_string(kMinSupportedFileFormatVersion), - ". Your PyTorch script module file is too old. Please re-export it again."); - AT_ASSERTM( - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - version_ <= kMaxSupportedFileFormatVersion, - "Attempted to read a PyTorch file with version ", - version_, - ", but the maximum supported version for reading is ", - kMaxSupportedFileFormatVersion, - ". Your PyTorch installation may be too old."); + // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + if (version_ < kMinSupportedFileFormatVersion) { + CAFFE_THROW( + "Attempted to read a PyTorch file with version ", + c10::to_string(version_), + ", but the minimum supported version for reading is ", + c10::to_string(kMinSupportedFileFormatVersion), + ". Your PyTorch script module file is too old. Please regenerate it", + " with latest version of PyTorch to mitigate this issue."); + } + + // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + if (version_ > kMaxSupportedFileFormatVersion) { + CAFFE_THROW( + "Attempted to read a PyTorch file with version ", + version_, + ", but the maximum supported version for reading is ", + kMaxSupportedFileFormatVersion, + ". 
The version of your PyTorch installation may be too old, ", + "please upgrade PyTorch to latest version to mitigate this issue."); + } } void PyTorchStreamReader::valid(const char* what, const char* info) { @@ -333,6 +352,13 @@ void PyTorchStreamWriter::setup(const string& file_name) { file_name, std::ofstream::out | std::ofstream::trunc | std::ofstream::binary); valid("opening archive ", file_name.c_str()); + + const std::string dir_name = parentdir(file_name); + if(!dir_name.empty()) { + struct stat st; + bool dir_exists = (stat(dir_name.c_str(), &st) == 0 && (st.st_mode & S_IFDIR)); + TORCH_CHECK(dir_exists, "Parent directory ", dir_name, " does not exist."); + } TORCH_CHECK(file_stream_, "File ", file_name, " cannot be opened."); writer_func_ = [this](const void* buf, size_t nbytes) -> size_t { file_stream_.write(static_cast(buf), nbytes); diff --git a/caffe2/serialize/inline_container_test.cc b/caffe2/serialize/inline_container_test.cc index 5ceb7274b771..18f75dddfaa5 100644 --- a/caffe2/serialize/inline_container_test.cc +++ b/caffe2/serialize/inline_container_test.cc @@ -5,6 +5,7 @@ #include #include "caffe2/serialize/inline_container.h" +#include "c10/util/irange.h" namespace caffe2 { namespace serialize { @@ -22,14 +23,14 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoad) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data1; - for (int i = 0; i < data1.size(); ++i) { + for (auto i: c10::irange( data1.size())) { data1[i] = data1.size() - i; } writer.writeRecord("key1", data1.data(), data1.size()); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data2; - for (int i = 0; i < data2.size(); ++i) { + for (auto i: c10::irange(data2.size())) { data2[i] = data2.size() - i; } writer.writeRecord("key2", data2.data(), data2.size()); @@ -83,14 +84,14 @@ TEST(PytorchStreamWriterAndReader, GetNonexistentRecordThrows) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data1; - for (int i = 0; i < data1.size(); ++i) { + for (auto i: c10::irange(data1.size())) { data1[i] = data1.size() - i; } writer.writeRecord("key1", data1.data(), data1.size()); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init,cppcoreguidelines-avoid-magic-numbers) std::array data2; - for (int i = 0; i < data2.size(); ++i) { + for (auto i: c10::irange(data2.size())) { data2[i] = data2.size() - i; } writer.writeRecord("key2", data2.data(), data2.size()); diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h index 9e89fe9acd64..78a91c64fe84 100644 --- a/caffe2/serialize/versions.h +++ b/caffe2/serialize/versions.h @@ -12,7 +12,7 @@ namespace serialize { constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; #if ENABLE_UPGRADERS -constexpr uint64_t kMaxSupportedFileFormatVersion = 0x9L; +constexpr uint64_t kMaxSupportedFileFormatVersion = 0xAL; #else constexpr uint64_t kMaxSupportedFileFormatVersion = 0x6L; #endif @@ -79,7 +79,11 @@ constexpr uint64_t kMaxSupportedFileFormatVersion = 0x6L; // Bump the version number to 9 to update aten::logspace and // and aten::logspace.out to error out when steps is not // provided. (see: https://github.com/pytorch/pytorch/issues/55951) -constexpr uint64_t kProducedFileFormatVersion = 0x9L; +// 3) [02/11/2022] +// Bump the version number to 10 to update aten::gelu and +// and aten::gelu.out to support the new approximate kwarg. 
+// (see: https://github.com/pytorch/pytorch/pull/61439) +constexpr uint64_t kProducedFileFormatVersion = 0xAL; #else constexpr uint64_t kProducedFileFormatVersion = 0x3L; #endif @@ -106,24 +110,37 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L; // 0x2L: (Comment missing) // 0x3L: (Comment missing) // 0x4L: (update) Added schema to function tuple. Forward-compatible change. -// 0x5L: (update) Update bytecode is sharing constant tensor files from torchscript, and only serialize -// extra tensors that are not in the torchscript constant table. Also update tensor storage schema adapting -// to the unify format, the root key of tensor storage is updated from {index} to -// {the_pointer_value_the_tensor.storage}, for example: `140245072983168.storage` -// Forward-compatibility change. +// 0x5L: (update) Update bytecode is sharing constant tensor files from +// torchscript, and only serialize extra tensors that are not in the +// torchscript constant table. Also update tensor storage schema adapting to +// the unify format, the root key of tensor storage is updated from {index} to +// {the_pointer_value_the_tensor.storage}, for example: +// `140245072983168.storage` Forward-compatibility change. // 0x6L: Implicit opereator versioning using number of specified argument. -// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 -// for details. -// 0x7L: Enable support for operators with default arguments plus out arguments. -constexpr uint64_t kProducedBytecodeVersion = 0x7L; +// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for +// details. +// 0x7L: Enable support for operators with default arguments plus out +// arguments. Refer. See https://github.com/pytorch/pytorch/pull/63651 for +// details. +// 0x8L: Emit promoted operators as instructions. See +// https://github.com/pytorch/pytorch/pull/71662 for details. +// 0x9L: Change serialization format from pickle to format This version is to +// serve migration. v8 pickle and v9 flatbuffer are the same. Refer to the +// summary of https://github.com/pytorch/pytorch/pull/75201 for more details. +constexpr uint64_t kProducedBytecodeVersion = 0x8L; + +// static_assert( +// kProducedBytecodeVersion >= kProducedFileFormatVersion, +// "kProducedBytecodeVersion must be higher or equal to +// kProducedFileFormatVersion."); // Introduce kMinSupportedBytecodeVersion and kMaxSupportedBytecodeVersion // for limited backward/forward compatibility support of bytecode. If -// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion (in loader), -// we should support this model_version. For example, we provide a wrapper to -// handle an updated operator. -constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; -constexpr uint64_t kMaxSupportedBytecodeVersion = 0x8L; +// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion +// (in loader), we should support this model_version. For example, we provide a +// wrapper to handle an updated operator. +constexpr uint64_t kMinSupportedBytecodeVersion = 0x4L; +constexpr uint64_t kMaxSupportedBytecodeVersion = 0x9L; } // namespace serialize } // namespace caffe2 diff --git a/caffe2/sgd/learning_rate_functors.h b/caffe2/sgd/learning_rate_functors.h index c2b9dd976a1f..d733ccc14611 100644 --- a/caffe2/sgd/learning_rate_functors.h +++ b/caffe2/sgd/learning_rate_functors.h @@ -36,7 +36,7 @@ class FixedLearningRate : public LearningRateFunctor { }; // Alter: alternatate learning rate with active_period and inactive_period. 
-// update for for a duration of active_period and then stop for a duration of +// update for a duration of active_period and then stop for a duration of // inactive_period if active_first, and vice versa template class AlternateLearningRate : public LearningRateFunctor { diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc index 879f0d25068b..0f7e90e55b53 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op_test.cc @@ -199,7 +199,7 @@ void runConv( } // unnamed namespace -constexpr size_t kIters = 20; +constexpr int kIters = 20; TEST(DEPTHWISE3x3, Conv) { for (int i = 0; i < kIters; ++i) { diff --git a/caffe2/share/contrib/nnpack/nnpack_test.cc b/caffe2/share/contrib/nnpack/nnpack_test.cc index 398be235f7f1..fe653c4d91ab 100644 --- a/caffe2/share/contrib/nnpack/nnpack_test.cc +++ b/caffe2/share/contrib/nnpack/nnpack_test.cc @@ -236,7 +236,7 @@ void runConv( } // unnamed namespace -constexpr size_t kIters = 20; +constexpr int kIters = 20; TEST(NNPACK, Conv_3x3s1) { for (int i = 0; i < kIters; ++i) { diff --git a/caffe2/utils/threadpool/pthreadpool-cpp.cc b/caffe2/utils/threadpool/pthreadpool-cpp.cc index 38846d5b143d..2c2209f225ca 100644 --- a/caffe2/utils/threadpool/pthreadpool-cpp.cc +++ b/caffe2/utils/threadpool/pthreadpool-cpp.cc @@ -83,7 +83,7 @@ size_t getDefaultNumThreads(); PThreadPool* pthreadpool() { static auto threadpool = std::make_unique(getDefaultNumThreads()); -#if !(defined(WIN32)) && !(defined(__XROS__)) +#if !(defined(WIN32)) static std::once_flag flag; std::call_once(flag, []() { pthread_atfork(nullptr, nullptr, child_atfork); diff --git a/caffe2/utils/threadpool/pthreadpool.h b/caffe2/utils/threadpool/pthreadpool.h index 54b3cb63303c..914ebf40a699 100644 --- a/caffe2/utils/threadpool/pthreadpool.h +++ b/caffe2/utils/threadpool/pthreadpool.h @@ -8,7 +8,7 @@ #include // for size_t #include // for uint32_t -#if defined(USE_PTHREADPOOL) && !(defined(__XROS__)) +#if defined(USE_PTHREADPOOL) // This is a hack. // Mainly introduced here because // 1. NNPACK can be compiled to use internal legacy threadpool implementation because much of C2 depends on that. 
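A note on the DNNLowP bias hunks earlier in this section: the change to `(*b_quantized_)[i] -= in_qparams_[INPUT].zero_point * (*column_offset_ptr)[i]` in the Conv operator, and the fused-static FC path that feeds X_scale/X_zero_point into GetQuantizationParameters_, both rely on folding the activation zero point into the quantized bias so the int8 GEMM never has to subtract the zero point per element. The identity is (X_q - zp_x) * W_q = X_q * W_q - zp_x * colsum(W_q). Below is a minimal NumPy sketch of that identity; the shapes and values are purely illustrative and this is not the caffe2 kernel code.

    import numpy as np

    rng = np.random.default_rng(0)
    N, K, M = 4, 16, 8        # batch, input dim, output channels (illustrative)
    zp_x = 3                  # activation zero point

    X_q = rng.integers(0, 256, size=(N, K), dtype=np.int64)     # quantized activations
    W_q = rng.integers(-128, 128, size=(K, M), dtype=np.int64)  # quantized weights
    b_q = rng.integers(-1000, 1000, size=M, dtype=np.int64)     # quantized bias

    # Reference: subtract the activation zero point element-wise, then matmul.
    ref = (X_q - zp_x) @ W_q + b_q

    # Folded form: plain integer matmul plus a bias that has absorbed
    # zp_x * column_offset, where column_offset[m] = sum_k W_q[k, m].
    column_offset = W_q.sum(axis=0)
    fused = X_q @ W_q + (b_q - zp_x * column_offset)

    assert np.array_equal(ref, fused)

Because the folding happens at bias-quantization time, the same integer GEMM path works whether the activation zero point comes from the operator arguments, from the X_quant_param input added in the tests above, or from dynamic measurement.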
diff --git a/caffe2/utils/threadpool/pthreadpool_impl.cc b/caffe2/utils/threadpool/pthreadpool_impl.cc index 72bee75678ec..ae031ca2ae7e 100644 --- a/caffe2/utils/threadpool/pthreadpool_impl.cc +++ b/caffe2/utils/threadpool/pthreadpool_impl.cc @@ -2,7 +2,7 @@ #include "caffe2/utils/threadpool/pthreadpool-cpp.h" #include "caffe2/utils/threadpool/ThreadPool.h" -#if defined(USE_PTHREADPOOL) && !(defined(__XROS__)) +#if defined(USE_PTHREADPOOL) namespace caffe2 { namespace { static thread_local bool using_new_threadpool{false}; @@ -34,7 +34,7 @@ void legacy_pthreadpool_compute_1d( } return; } -#if defined(USE_PTHREADPOOL) && !(defined(__XROS__)) +#if defined(USE_PTHREADPOOL) if (caffe2::using_new_threadpool) { pthreadpool_parallelize_1d(threadpool, function, argument, range, 0u); } else { diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index ccf4b23cc3c0..8fa42e89411f 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -67,108 +67,18 @@ if(INTERN_BUILD_ATEN_OPS) set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/MapAllocator.cpp PROPERTIES COMPILE_FLAGS "-fno-openmp") endif() - file(GLOB cpu_kernel_cpp_in "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/cpu/*.cpp" "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/quantized/cpu/kernels/*.cpp") - - list(APPEND CPU_CAPABILITY_NAMES "DEFAULT") - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}") - - - if(CXX_AVX512_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX512_CPU_DEFINITION") - list(APPEND CPU_CAPABILITY_NAMES "AVX512") - if(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") - else(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx512f -mavx512bw -mavx512vl -mavx512dq -mfma") - endif(MSVC) - endif(CXX_AVX512_FOUND) - - if(CXX_AVX2_FOUND) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION") - - # Some versions of GCC pessimistically split unaligned load and store - # instructions when using the default tuning. This is a bad choice on - # new Intel and AMD processors so we disable it when compiling with AVX2. 
- # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top - check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT) - if(COMPILER_SUPPORTS_NO_AVX256_SPLIT) - set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") - endif(COMPILER_SUPPORTS_NO_AVX256_SPLIT) - - list(APPEND CPU_CAPABILITY_NAMES "AVX2") - if(DEFINED ENV{ATEN_AVX512_256}) - if($ENV{ATEN_AVX512_256} MATCHES "TRUE") - if(CXX_AVX512_FOUND) - message("-- ATen AVX2 kernels will use 32 ymm registers") - if(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") - else(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -march=native ${CPU_NO_AVX256_SPLIT_FLAGS}") - endif(MSVC) - endif(CXX_AVX512_FOUND) - endif() - else() - if(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2") - else(MSVC) - list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}") - endif(MSVC) - endif() - endif(CXX_AVX2_FOUND) - - if(CXX_VSX_FOUND) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION") - LIST(APPEND CPU_CAPABILITY_NAMES "VSX") - LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}") - endif(CXX_VSX_FOUND) - - if(CXX_ZVECTOR_FOUND) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_ZVECTOR_CPU_DEFINITION") - LIST(APPEND CPU_CAPABILITY_NAMES "ZVECTOR") - LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_ZVECTOR_FLAGS}") - endif(CXX_ZVECTOR_FOUND) - - list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES) - math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1") - - # The sources list might get reordered later based on the capabilites. - # See NOTE [ Linking AVX and non-AVX files ] - foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) - foreach(IMPL ${cpu_kernel_cpp_in}) - file(RELATIVE_PATH NAME "${PROJECT_SOURCE_DIR}/aten/src/ATen/" "${IMPL}") - list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) - set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp) - configure_file("${PROJECT_SOURCE_DIR}/cmake/IncludeSource.cpp.in" ${NEW_IMPL}) - set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp}) # Create list of copies - list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) - if(MSVC) - set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}") - else(MSVC) - set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}") - endif(MSVC) - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT")) - # See https://github.com/pytorch/pytorch/issues/38855 - set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized") - endif() - if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp") - # See https://github.com/pytorch/pytorch/issues/38854 - set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy") - endif() - endif() - set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}") - endforeach() - endforeach() - list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) - - file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") + file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../torchgen/*.py") set(GEN_ROCM_FLAG) if(USE_ROCM) set(GEN_ROCM_FLAG --rocm) endif() + set(GEN_MPS_FLAG) + if(USE_MPS) + set(GEN_MPS_FLAG 
--mps) + endif() + set(CUSTOM_BUILD_FLAGS) if(INTERN_BUILD_MOBILE) if(USE_VULKAN) @@ -193,24 +103,64 @@ if(INTERN_BUILD_ATEN_OPS) endif() if(STATIC_DISPATCH_BACKEND) - message(STATUS "Custom build with static dispatch backend: ${STATIC_DISPATCH_BACKEND}") + message(STATUS "Custom build with static dispatch backends: ${STATIC_DISPATCH_BACKEND}") + list(LENGTH STATIC_DISPATCH_BACKEND len) list(APPEND CUSTOM_BUILD_FLAGS --static_dispatch_backend ${STATIC_DISPATCH_BACKEND}) endif() + # Codegen unboxing + if(USE_LIGHTWEIGHT_DISPATCH) + file(GLOB_RECURSE all_unboxing_script "${CMAKE_CURRENT_LIST_DIR}/../tools/jit/*.py") + list(APPEND CUSTOM_BUILD_FLAGS --skip_dispatcher_op_registration) + set(GEN_UNBOXING_COMMAND + "${PYTHON_EXECUTABLE}" -m tools.jit.gen_unboxing + --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen + --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen + ) + set("GEN_UNBOXING_COMMAND_sources" + ${GEN_UNBOXING_COMMAND} + --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake + ) + message(STATUS "Generating sources for lightweight dispatch") + execute_process( + COMMAND ${GEN_UNBOXING_COMMAND_sources} --dry-run + RESULT_VARIABLE RETURN_VALUE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. + ) + if(NOT RETURN_VALUE EQUAL 0) + message(FATAL_ERROR "Failed to get generated_unboxing_sources list") + endif() + + include("${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake") + add_custom_command( + COMMENT "Generating ATen unboxing sources" + OUTPUT + ${generated_unboxing_sources} + ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake + COMMAND ${GEN_UNBOXING_COMMAND_sources} + DEPENDS ${all_unboxing_script} ${sources_templates} + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/tags.yaml + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. + ) + else() # Otherwise do not generate or include sources into build. 
+ set(generated_unboxing_sources "") + endif() + set(GEN_PER_OPERATOR_FLAG) if(USE_PER_OPERATOR_HEADERS) list(APPEND GEN_PER_OPERATOR_FLAG "--per-operator-headers") endif() set(GEN_COMMAND - "${PYTHON_EXECUTABLE}" -m tools.codegen.gen + "${PYTHON_EXECUTABLE}" -m torchgen.gen --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen ${GEN_PER_OPERATOR_FLAG} ${GEN_ROCM_FLAG} + ${GEN_MPS_FLAG} ${CUSTOM_BUILD_FLAGS} - ${GEN_VULKAN_FLAGS} ) file(GLOB_RECURSE headers_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*\.h") @@ -245,6 +195,7 @@ if(INTERN_BUILD_ATEN_OPS) include("${CMAKE_BINARY_DIR}/aten/src/ATen/generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake") + include("${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/ops_generated_${gen_type}.cmake") @@ -256,14 +207,17 @@ if(INTERN_BUILD_ATEN_OPS) ${generated_${gen_type}} ${cuda_generated_${gen_type}} ${core_generated_${gen_type}} + ${cpu_vec_generated_${gen_type}} ${ops_generated_${gen_type}} ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/ops_generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake + ${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake COMMAND ${GEN_COMMAND_${gen_type}} DEPENDS ${all_python} ${${gen_type}_templates} ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml + ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/tags.yaml WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/.. ) endforeach() @@ -272,9 +226,9 @@ if(INTERN_BUILD_ATEN_OPS) # not tracked correctly in CMake. We make the libATen.so depend explicitly # on building the generated ATen files to workaround. 
add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS - ${generated_headers} ${core_generated_headers} ${ops_generated_headers} - ${generated_sources} ${core_generated_sources} ${ops_generated_sources} - ${generated_declarations_yaml}) + ${generated_headers} ${core_generated_headers} ${cpu_vec_generated_headers} ${ops_generated_headers} + ${generated_sources} ${core_generated_sources} ${cpu_vec_generated_sources} ${ops_generated_sources} + ${generated_declarations_yaml} ${generated_unboxing_sources}) add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_headers} ${cuda_generated_sources}) add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE) @@ -286,6 +240,109 @@ if(INTERN_BUILD_ATEN_OPS) target_compile_definitions(ATEN_CPU_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) target_compile_definitions(ATEN_CUDA_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) endif() + + # Handle source files that need to be compiled multiple times for + # different vectorization options + file(GLOB cpu_kernel_cpp_in "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/cpu/*.cpp" "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/quantized/cpu/kernels/*.cpp") + + list(APPEND CPU_CAPABILITY_NAMES "DEFAULT") + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}") + + if(CXX_AVX512_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX512_CPU_DEFINITION") + list(APPEND CPU_CAPABILITY_NAMES "AVX512") + if(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") + else(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx512f -mavx512bw -mavx512vl -mavx512dq -mfma") + endif(MSVC) + endif(CXX_AVX512_FOUND) + + if(CXX_AVX2_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION") + + # Some versions of GCC pessimistically split unaligned load and store + # instructions when using the default tuning. This is a bad choice on + # new Intel and AMD processors so we disable it when compiling with AVX2. 
+ # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top + check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT) + if(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store") + endif(COMPILER_SUPPORTS_NO_AVX256_SPLIT) + + list(APPEND CPU_CAPABILITY_NAMES "AVX2") + if(DEFINED ENV{ATEN_AVX512_256}) + if($ENV{ATEN_AVX512_256} MATCHES "TRUE") + if(CXX_AVX512_FOUND) + message("-- ATen AVX2 kernels will use 32 ymm registers") + if(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512") + else(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -march=native ${CPU_NO_AVX256_SPLIT_FLAGS}") + endif(MSVC) + endif(CXX_AVX512_FOUND) + endif() + else() + if(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2") + else(MSVC) + list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}") + endif(MSVC) + endif() + endif(CXX_AVX2_FOUND) + + if(CXX_VSX_FOUND) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION") + LIST(APPEND CPU_CAPABILITY_NAMES "VSX") + LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}") + endif(CXX_VSX_FOUND) + + if(CXX_ZVECTOR_FOUND) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_ZVECTOR_CPU_DEFINITION") + LIST(APPEND CPU_CAPABILITY_NAMES "ZVECTOR") + LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_ZVECTOR_FLAGS}") + endif(CXX_ZVECTOR_FOUND) + + list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES) + math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1") + + # The sources list might get reordered later based on the capabilites. + # See NOTE [ Linking AVX and non-AVX files ] + foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) + function(process_vec NAME) + list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) + set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp) + configure_file("${PROJECT_SOURCE_DIR}/cmake/IncludeSource.cpp.in" ${NEW_IMPL}) + set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp} PARENT_SCOPE) # Create list of copies + list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) + if(MSVC) + set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}") + else(MSVC) + set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}") + endif(MSVC) + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT")) + # See https://github.com/pytorch/pytorch/issues/38855 + set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized") + endif() + if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp") + # See https://github.com/pytorch/pytorch/issues/38854 + set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy") + endif() + endif() + set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}") + endfunction() + foreach(IMPL ${cpu_kernel_cpp_in}) + file(RELATIVE_PATH NAME "${PROJECT_SOURCE_DIR}/aten/src/ATen/" "${IMPL}") + process_vec("${NAME}") + endforeach() + foreach(IMPL ${cpu_vec_generated_sources}) + file(RELATIVE_PATH NAME "${CMAKE_BINARY_DIR}/aten/src/ATen/" "${IMPL}") + process_vec("${NAME}") + endforeach() + endforeach() + list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) endif() function(append_filelist name 
outputvar) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 0969055415b9..64fa6304207b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -216,7 +216,7 @@ elseif(BLAS STREQUAL "MKL") set(CAFFE2_USE_MKL ON) set(BLAS_INFO "mkl") set(BLAS_FOUND 1) - set(BLAS_LIBRARIES caffe2::mkl) + set(BLAS_LIBRARIES ${MKL_LIBRARIES}) else() message(WARNING "MKL could not be found. Defaulting to Eigen") set(CAFFE2_USE_EIGEN_FOR_BLAS ON) @@ -816,6 +816,10 @@ if(USE_FBGEMM) set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm PROPERTY POSITION_INDEPENDENT_CODE ON) + if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 13.0.0) + # See https://github.com/pytorch/pytorch/issues/74352 + target_compile_options(asmjit PRIVATE -Wno-deprecated-copy -Wno-unused-but-set-variable) + endif() endif() if(USE_FBGEMM) @@ -1305,7 +1309,7 @@ if(USE_ROCM) hip_include_directories(${Caffe2_HIP_INCLUDE}) set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS - ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${PYTORCH_RCCL_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB}) + ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB}) # Note [rocblas & rocfft cmake bug] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1369,18 +1373,39 @@ if(USE_CUDA) endif() endif() +if(USE_DISTRIBUTED AND USE_TENSORPIPE) + if(MSVC) + message(WARNING "Tensorpipe cannot be used on Windows.") + else() + if(USE_CUDA) + set(TP_USE_CUDA ON CACHE BOOL "" FORCE) + set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE) + endif() + set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE) + set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) + + # Tensorpipe uses cuda_add_library + torch_update_find_cuda_flags() + add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe) + + list(APPEND Caffe2_DEPENDENCY_LIBS tensorpipe) + if(USE_CUDA) + list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda) + elseif(USE_ROCM) + message(WARNING "TensorPipe doesn't yet support ROCm") + # Not yet... 
+ # list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip) + endif() + endif() +endif() + if(USE_GLOO) if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() - if(MSVC) - # Don't install gloo on Windows - # It is already handled in builder scripts - set(GLOO_INSTALL OFF CACHE BOOL "" FORCE) - else() - set(GLOO_INSTALL ON CACHE BOOL "" FORCE) - endif() + # Don't install gloo + set(GLOO_INSTALL OFF CACHE BOOL "" FORCE) set(GLOO_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) # Temporarily override variables to avoid building Gloo tests/benchmarks @@ -1392,6 +1417,10 @@ if(USE_GLOO) set(ENV{GLOO_ROCM_ARCH} "${PYTORCH_ROCM_ARCH}") endif() if(NOT USE_SYSTEM_GLOO) + if(USE_DISTRIBUTED AND USE_TENSORPIPE) + get_target_property(_include_dirs uv_a INCLUDE_DIRECTORIES) + set_target_properties(uv_a PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${_include_dirs}") + endif() # gloo uses cuda_add_library torch_update_find_cuda_flags() add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/gloo) @@ -1429,32 +1458,6 @@ if(USE_GLOO) endif() endif() -if(USE_DISTRIBUTED AND USE_TENSORPIPE) - if(MSVC) - message(WARNING "Tensorpipe cannot be used on Windows.") - else() - if(USE_CUDA) - set(TP_USE_CUDA ON CACHE BOOL "" FORCE) - set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE) - endif() - set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE) - set(TP_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE) - - # Tensorpipe uses cuda_add_library - torch_update_find_cuda_flags() - add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe) - - list(APPEND Caffe2_DEPENDENCY_LIBS tensorpipe) - if(USE_CUDA) - list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda) - elseif(USE_ROCM) - message(WARNING "TensorPipe doesn't yet support ROCm") - # Not yet... - # list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip) - endif() - endif() -endif() - # ---[ profiling if(USE_PROF) find_package(htrace) @@ -1843,10 +1846,6 @@ set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "") list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only) set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) -if(USE_BREAKPAD) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad) -endif() - # ---[ Kineto # edge profiler depends on KinetoProfiler but it only does cpu # profiling. 
Thus we dont need USE_CUDA/USE_ROCM @@ -1906,13 +1905,15 @@ if(USE_KINETO) find_library(CUPTI_LIBRARY_PATH ${CUPTI_LIB_NAME} PATHS ${CUDA_SOURCE_DIR} ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64 - ${CUDA_SOURCE_DIR}/lib64) + ${CUDA_SOURCE_DIR}/lib64 + NO_DEFAULT_PATH) find_path(CUPTI_INCLUDE_DIR cupti.h PATHS + ${CUDA_SOURCE_DIR}/extras/CUPTI/include ${CUDA_INCLUDE_DIRS} ${CUDA_SOURCE_DIR} - ${CUDA_SOURCE_DIR}/extras/CUPTI/include - ${CUDA_SOURCE_DIR}/include) + ${CUDA_SOURCE_DIR}/include + NO_DEFAULT_PATH) if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR) message(STATUS " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}") @@ -1920,6 +1921,32 @@ if(USE_KINETO) message(STATUS " CUDA_cupti_LIBRARY = ${CUDA_cupti_LIBRARY}") message(STATUS "Found CUPTI") set(LIBKINETO_NOCUPTI OFF CACHE STRING "" FORCE) + + # I've only tested this sanity check on Linux; if someone + # runs into this bug on another platform feel free to + # generalize it accordingly + if(NOT USE_CUPTI_SO AND UNIX) + include(CheckCXXSourceRuns) + # rt is handled by the CMAKE_REQUIRED_LIBRARIES set above + if(NOT APPLE) + set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} "dl") + endif() + set(CMAKE_REQUIRED_LINK_OPTIONS "-Wl,--whole-archive,${CUPTI_LIBRARY_PATH},--no-whole-archive") + check_cxx_source_runs("#include + int main() { + try { + throw std::runtime_error(\"error\"); + } catch (...) { + return 0; + } + return 1; + }" EXCEPTIONS_WORK) + set(CMAKE_REQUIRED_LINK_OPTIONS "") + if(NOT EXCEPTIONS_WORK) + message(FATAL_ERROR "Detected that statically linking against CUPTI causes exceptions to stop working. See https://github.com/pytorch/pytorch/issues/57744 for more details. Perhaps try: USE_CUPTI_SO=1 python setup.py develop --cmake") + endif() + endif() + else() message(STATUS "Could not find CUPTI library, using CPU-only Kineto build") set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index b79a87466252..01594a5b66e0 100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -168,6 +168,26 @@ IF (EXISTS ${INTEL_OMP_DIR}) ENDIF() ENDIF() +MACRO(GET_MKL_LIB_NAMES LIBRARIES INTERFACE MKL64) + cmake_parse_arguments("" "" "THREAD" "" ${ARGN}) + SET(${LIBRARIES} mkl_${INTERFACE}${MKL64} mkl_core) + IF(_THREAD) + LIST(INSERT ${LIBRARIES} 1 ${_THREAD}) + IF(UNIX AND ${USE_STATIC_MKL}) + # The thread library defines symbols required by the other MKL libraries so also add it last + LIST(APPEND ${LIBRARIES} ${_THREAD}) + ENDIF() + ENDIF() + IF(${USE_STATIC_MKL}) + IF(UNIX) + list(TRANSFORM ${LIBRARIES} PREPEND "lib") + list(TRANSFORM ${LIBRARIES} APPEND ".a") + ELSE() + message(WARNING "Ignoring USE_STATIC_MKL") + ENDIF() + ENDIF() +ENDMACRO() + # Try linking multiple libs MACRO(CHECK_ALL_LIBRARIES LIBRARIES OPENMP_TYPE OPENMP_LIBRARY _name _list _flags) # This macro checks for the existence of the combination of libraries given by _list. 
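The GET_MKL_LIB_NAMES macro added just above expands an MKL interface name, an ILP64/LP64 suffix, and an optional threading layer into the list handed to CHECK_ALL_LIBRARIES; for static Unix builds it rewrites each entry to lib<name>.a and repeats the threading library after mkl_core so its symbols still resolve in a single linker pass. A rough Python rendering of that name expansion follows; the real logic is the CMake macro, and the library names in the usage line are only illustrative.

    def mkl_lib_names(interface, mkl64="_lp64", thread=None, static=False, unix=True):
        """Sketch of FindMKL.cmake's GET_MKL_LIB_NAMES name expansion."""
        libs = ["mkl_{}{}".format(interface, mkl64), "mkl_core"]
        if thread:
            libs.insert(1, thread)
            if unix and static:
                # The threading layer defines symbols mkl_core needs, so it is
                # appended again after mkl_core for one-pass static linking.
                libs.append(thread)
        if static and unix:
            libs = ["lib{}.a".format(name) for name in libs]
        return libs

    # e.g. a static, GNU-threaded, LP64 layout:
    # ['libmkl_gf_lp64.a', 'libmkl_gnu_thread.a', 'libmkl_core.a', 'libmkl_gnu_thread.a']
    print(mkl_lib_names("gf", "_lp64", "mkl_gnu_thread", static=True))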
@@ -304,8 +324,9 @@ IF (NOT "${MKL_THREADING}" STREQUAL "SEQ") FOREACH(mkl64 ${mkl64s} "") FOREACH(mklthread ${mklthreads}) IF (NOT MKL_LIBRARIES) + GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}") CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};${mkl_pthread};${mkl_m};${mkl_dl}" "") + "${mkl_lib_names};${mklrtl};${mkl_pthread};${mkl_m};${mkl_dl}" "") ENDIF (NOT MKL_LIBRARIES) ENDFOREACH(mklthread) ENDFOREACH(mkl64) @@ -317,8 +338,9 @@ ENDIF (NOT "${MKL_THREADING}" STREQUAL "SEQ") FOREACH(mkliface ${mklifaces}) FOREACH(mkl64 ${mkl64s} "") IF (NOT MKL_LIBRARIES) + GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "mkl_sequential") CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;${mkl_m};${mkl_dl}" "") + "${mkl_lib_names};${mkl_m};${mkl_dl}" "") IF (MKL_LIBRARIES) SET(mklseq "_sequential") ENDIF (MKL_LIBRARIES) @@ -331,8 +353,9 @@ FOREACH(mklrtl ${mklrtls} "") FOREACH(mkliface ${mklifaces}) FOREACH(mkl64 ${mkl64s} "") IF (NOT MKL_LIBRARIES) + GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}") CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m};${mkl_dl}" "") + "${mkl_lib_names};${mklrtl};pthread;${mkl_m};${mkl_dl}" "") ENDIF (NOT MKL_LIBRARIES) ENDFOREACH(mkl64) ENDFOREACH(mkliface) @@ -341,6 +364,9 @@ ENDFOREACH(mklrtl) # Check for older versions IF (NOT MKL_LIBRARIES) SET(MKL_VERSION 900) + if (USE_STATIC_MKL) + message(WARNING "Ignoring USE_STATIC_MKL") + endif() CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm "mkl;guide;pthread;m" "") ENDIF (NOT MKL_LIBRARIES) diff --git a/cmake/Modules/FindMKLDNN.cmake b/cmake/Modules/FindMKLDNN.cmake index 4d3febbdfc49..e2f427be67c8 100644 --- a/cmake/Modules/FindMKLDNN.cmake +++ b/cmake/Modules/FindMKLDNN.cmake @@ -12,86 +12,118 @@ # MKLDNN_USE_NATIVE_ARCH : Whether native CPU instructions should be used in MKLDNN. This should be turned off for # general packaging to avoid incompatible CPU instructions. Default: OFF. 
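Two hunks in this section lean on the same configure-time trick of compiling and running a tiny probe program: the CUPTI change in Dependencies.cmake above (check_cxx_source_runs with the static libcupti pulled in via --whole-archive, aborting configuration if exceptions stop working) and the rocm_version.h detection that LoadHIP.cmake gains further down (try_run printing ROCM_VERSION_MAJOR/MINOR/PATCH). The sketch below reproduces the idea outside CMake; the compiler lookup and the example link flags are assumptions for illustration, not part of the build system.

    import os
    import shutil
    import subprocess
    import tempfile

    PROBE = """
    #include <stdexcept>
    int main() {
      try { throw std::runtime_error("error"); }
      catch (...) { return 0; }
      return 1;
    }
    """

    def probe_runs(extra_args=()):
        """Compile the throw/catch probe with extra link args and run it."""
        cxx = shutil.which("c++") or shutil.which("g++") or shutil.which("clang++")
        if cxx is None:
            raise RuntimeError("no C++ compiler on PATH")
        with tempfile.TemporaryDirectory() as tmp:
            src = os.path.join(tmp, "probe.cpp")
            exe = os.path.join(tmp, "probe")
            with open(src, "w") as f:
                f.write(PROBE)
            subprocess.run([cxx, src, "-o", exe, *list(extra_args)], check=True)
            return subprocess.run([exe]).returncode == 0

    if __name__ == "__main__":
        # With no extra flags this should print True; to mimic the CUPTI check,
        # pass something like
        #   ["-Wl,--whole-archive,/path/to/libcupti_static.a,--no-whole-archive", "-ldl"]
        # (placeholder path) and see whether the probe still exits cleanly.
        print(probe_runs())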
-IF (NOT MKLDNN_FOUND) +IF(NOT MKLDNN_FOUND) + SET(MKLDNN_LIBRARIES) + SET(MKLDNN_INCLUDE_DIR) -SET(MKLDNN_LIBRARIES) -SET(MKLDNN_INCLUDE_DIR) + SET(IDEEP_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep") + SET(MKLDNN_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep/mkl-dnn/third_party/oneDNN") + IF(NOT APPLE AND NOT WIN32 AND NOT BUILD_LITE_INTERPRETER) + MESSAGE("-- Will build oneDNN Graph") + SET(LLGA_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep/mkl-dnn") + SET(BUILD_ONEDNN_GRAPH ON) + ENDIF(NOT APPLE AND NOT WIN32 AND NOT BUILD_LITE_INTERPRETER) -SET(IDEEP_ROOT "${PROJECT_SOURCE_DIR}/third_party/ideep") -SET(MKLDNN_ROOT "${IDEEP_ROOT}/mkl-dnn/third_party/oneDNN") + FIND_PACKAGE(BLAS) + FIND_PATH(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include) + FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + IF(NOT MKLDNN_INCLUDE_DIR) + EXECUTE_PROCESS(COMMAND git${CMAKE_EXECUTABLE_SUFFIX} submodule update --init --jobs 0 mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT}) + FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) + ENDIF(NOT MKLDNN_INCLUDE_DIR) + IF(BUILD_ONEDNN_GRAPH) + FIND_PATH(LLGA_INCLUDE_DIR oneapi/dnnl/dnnl_graph.hpp PATHS ${LLGA_ROOT} PATH_SUFFIXES include) + ENDIF(BUILD_ONEDNN_GRAPH) -FIND_PACKAGE(BLAS) -FIND_PATH(IDEEP_INCLUDE_DIR ideep.hpp PATHS ${IDEEP_ROOT} PATH_SUFFIXES include) -FIND_PATH(MKLDNN_INCLUDE_DIR dnnl.hpp dnnl.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) -IF (NOT MKLDNN_INCLUDE_DIR) - EXECUTE_PROCESS(COMMAND git${CMAKE_EXECUTABLE_SUFFIX} submodule update --init --jobs 0 mkl-dnn WORKING_DIRECTORY ${IDEEP_ROOT}) - FIND_PATH(MKLDNN_INCLUDE_DIR mkldnn.hpp mkldnn.h PATHS ${MKLDNN_ROOT} PATH_SUFFIXES include) -ENDIF(NOT MKLDNN_INCLUDE_DIR) + IF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) + MESSAGE(STATUS "MKLDNN source files not found!") + RETURN() + ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) + LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR}) + IF(BUILD_ONEDNN_GRAPH) + LIST(APPEND MKLDNN_INCLUDE_DIR ${LLGA_INCLUDE_DIR}) + ENDIF(BUILD_ONEDNN_GRAPH) + IF(MKL_FOUND) + ADD_DEFINITIONS(-DIDEEP_USE_MKL) + # Append to mkldnn dependencies + LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES}) + LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR}) + ELSE(MKL_FOUND) + SET(MKLDNN_USE_MKL "NONE" CACHE STRING "" FORCE) + ENDIF(MKL_FOUND) -IF (NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) - MESSAGE(STATUS "MKLDNN source files not found!") - RETURN() -ENDIF(NOT IDEEP_INCLUDE_DIR OR NOT MKLDNN_INCLUDE_DIR) -LIST(APPEND MKLDNN_INCLUDE_DIR ${IDEEP_INCLUDE_DIR}) -IF(MKL_FOUND) - ADD_DEFINITIONS(-DIDEEP_USE_MKL) - # Append to mkldnn dependencies - LIST(APPEND MKLDNN_LIBRARIES ${MKL_LIBRARIES}) - LIST(APPEND MKLDNN_INCLUDE_DIR ${MKL_INCLUDE_DIR}) -ELSE(MKL_FOUND) - SET(MKLDNN_USE_MKL "NONE" CACHE STRING "" FORCE) -ENDIF(MKL_FOUND) + SET(MKL_cmake_included TRUE) + IF(NOT MKLDNN_CPU_RUNTIME) + SET(MKLDNN_CPU_RUNTIME "OMP" CACHE STRING "") + ELSEIF(MKLDNN_CPU_RUNTIME STREQUAL "TBB") + IF(USE_TBB) + MESSAGE(STATUS "MKL-DNN is using TBB") -SET(MKL_cmake_included TRUE) -IF (NOT MKLDNN_CPU_RUNTIME) - SET(MKLDNN_CPU_RUNTIME "OMP" CACHE STRING "") -ELSEIF (MKLDNN_CPU_RUNTIME STREQUAL "TBB") - IF (USE_TBB) - MESSAGE(STATUS "MKL-DNN is using TBB") + SET(TBB_cmake_included TRUE) + SET(Threading_cmake_included TRUE) - SET(TBB_cmake_included TRUE) - SET(Threading_cmake_included TRUE) - - SET(DNNL_CPU_THREADING_RUNTIME ${MKLDNN_CPU_RUNTIME}) - INCLUDE_DIRECTORIES(${TBB_INCLUDE_DIR}) - LIST(APPEND 
EXTRA_SHARED_LIBS TBB::tbb) - ELSE() - MESSAGE(FATAL_ERROR "MKLDNN_CPU_RUNTIME is set to TBB but TBB is not used") + SET(DNNL_CPU_THREADING_RUNTIME ${MKLDNN_CPU_RUNTIME}) + INCLUDE_DIRECTORIES(${TBB_INCLUDE_DIR}) + LIST(APPEND EXTRA_SHARED_LIBS TBB::tbb) + ELSE() + MESSAGE(FATAL_ERROR "MKLDNN_CPU_RUNTIME is set to TBB but TBB is not used") + ENDIF() ENDIF() -ENDIF() -MESSAGE(STATUS "MKLDNN_CPU_RUNTIME = ${MKLDNN_CPU_RUNTIME}") + MESSAGE(STATUS "MKLDNN_CPU_RUNTIME = ${MKLDNN_CPU_RUNTIME}") -SET(MKLDNN_CPU_RUNTIME ${MKLDNN_CPU_RUNTIME} CACHE STRING "" FORCE) -SET(DNNL_BUILD_TESTS FALSE CACHE BOOL "" FORCE) -SET(DNNL_BUILD_EXAMPLES FALSE CACHE BOOL "" FORCE) -SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) -SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "" FORCE) -IF(MKLDNN_USE_NATIVE_ARCH) # Disable HostOpts in MKLDNN unless MKLDNN_USE_NATIVE_ARCH is set. - SET(DNNL_ARCH_OPT_FLAGS "HostOpts" CACHE STRING "" FORCE) -ELSE() - IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - IF(CPU_INTEL) - SET(DNNL_ARCH_OPT_FLAGS "-msse4" CACHE STRING "" FORCE) - ENDIF() + SET(MKLDNN_CPU_RUNTIME ${MKLDNN_CPU_RUNTIME} CACHE STRING "" FORCE) + SET(DNNL_BUILD_TESTS FALSE CACHE BOOL "" FORCE) + SET(DNNL_BUILD_EXAMPLES FALSE CACHE BOOL "" FORCE) + SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) + SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "" FORCE) + IF(BUILD_ONEDNN_GRAPH) + SET(DNNL_GRAPH_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) + ENDIF(BUILD_ONEDNN_GRAPH) + IF(MKLDNN_USE_NATIVE_ARCH) # Disable HostOpts in MKLDNN unless MKLDNN_USE_NATIVE_ARCH is set. + SET(DNNL_ARCH_OPT_FLAGS "HostOpts" CACHE STRING "" FORCE) ELSE() - SET(DNNL_ARCH_OPT_FLAGS "" CACHE STRING "" FORCE) + IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + IF(CPU_INTEL) + SET(DNNL_ARCH_OPT_FLAGS "-msse4" CACHE STRING "" FORCE) + ENDIF() + ELSE() + SET(DNNL_ARCH_OPT_FLAGS "" CACHE STRING "" FORCE) + ENDIF() ENDIF() -ENDIF() -ADD_SUBDIRECTORY(${MKLDNN_ROOT}) -IF(NOT TARGET dnnl) - MESSAGE("Failed to include MKL-DNN target") - RETURN() -ENDIF(NOT TARGET dnnl) + IF(BUILD_ONEDNN_GRAPH) + ADD_SUBDIRECTORY(${LLGA_ROOT}) + IF(NOT TARGET dnnl_graph) + MESSAGE("Failed to include LLGA target") + RETURN() + ENDIF(NOT TARGET dnnl_graph) + + IF(CMAKE_COMPILER_IS_GNUCC) + TARGET_COMPILE_OPTIONS(dnnl_graph PRIVATE -Wno-maybe-uninitialized) + TARGET_COMPILE_OPTIONS(dnnl_graph PRIVATE -Wno-strict-overflow) + TARGET_COMPILE_OPTIONS(dnnl_graph PRIVATE -Wno-error=strict-overflow) + ENDIF(CMAKE_COMPILER_IS_GNUCC) + ELSE(BUILD_ONEDNN_GRAPH) + ADD_SUBDIRECTORY(${MKLDNN_ROOT}) + ENDIF(BUILD_ONEDNN_GRAPH) + + IF(NOT TARGET dnnl) + MESSAGE("Failed to include MKL-DNN target") + RETURN() + ENDIF(NOT TARGET dnnl) -IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) - TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-maybe-uninitialized) - TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-strict-overflow) - TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-error=strict-overflow) -ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) -LIST(APPEND MKLDNN_LIBRARIES dnnl) + IF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) + TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-maybe-uninitialized) + TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-strict-overflow) + TARGET_COMPILE_OPTIONS(dnnl PRIVATE -Wno-error=strict-overflow) + ENDIF(NOT APPLE AND CMAKE_COMPILER_IS_GNUCC) + LIST(APPEND MKLDNN_LIBRARIES ${MKL_OPENMP_LIBRARY}) + IF(BUILD_ONEDNN_GRAPH) + LIST(APPEND MKLDNN_LIBRARIES "$") + ENDIF(BUILD_ONEDNN_GRAPH) + LIST(APPEND MKLDNN_LIBRARIES dnnl) -SET(MKLDNN_FOUND 
TRUE) -MESSAGE(STATUS "Found MKL-DNN: TRUE") + SET(MKLDNN_FOUND TRUE) + MESSAGE(STATUS "Found MKL-DNN: TRUE") ENDIF(NOT MKLDNN_FOUND) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 2040120701f1..1a99d1e567a1 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -138,6 +138,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_METAL : ${USE_METAL}") message(STATUS " USE_PYTORCH_METAL : ${USE_PYTORCH_METAL}") message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}") + message(STATUS " USE_MPS : ${USE_MPS}") message(STATUS " USE_FFTW : ${USE_FFTW}") message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}") message(STATUS " USE_MKLDNN : ${USE_MKLDNN}") @@ -148,6 +149,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_NCCL : ${USE_NCCL}") if(${USE_NCCL}) message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}") + message(STATUS " USE_NCCL_WITH_UCC : ${USE_NCCL_WITH_UCC}") endif() message(STATUS " USE_NNPACK : ${USE_NNPACK}") message(STATUS " USE_NUMPY : ${USE_NUMPY}") @@ -171,6 +173,7 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_PROF : ${USE_PROF}") message(STATUS " USE_QNNPACK : ${USE_QNNPACK}") message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}") + message(STATUS " USE_XNNPACK : ${USE_XNNPACK}") message(STATUS " USE_REDIS : ${USE_REDIS}") message(STATUS " USE_ROCKSDB : ${USE_ROCKSDB}") message(STATUS " USE_ZMQ : ${USE_ZMQ}") @@ -185,9 +188,9 @@ function(caffe2_print_configuration_summary) message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}") endif() message(STATUS " USE_DEPLOY : ${USE_DEPLOY}") - message(STATUS " USE_BREAKPAD : ${USE_BREAKPAD}") message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}") message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") # coreml message(STATUS " USE_COREML_DELEGATE : ${USE_COREML_DELEGATE}") + message(STATUS " BUILD_LAZY_TS_BACKEND : ${BUILD_LAZY_TS_BACKEND}") endfunction() diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 0d8b15bd14a8..a57345f51cd4 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -127,10 +127,6 @@ if(@USE_KINETO@) append_torchlib_if_found(kineto) endif() -if(@USE_DEPLOY@) - append_torchlib_if_found(torch_deploy) -endif() - if(@USE_CUDA@) if(MSVC) if(NOT NVTOOLEXT_HOME) diff --git a/cmake/VulkanCodegen.cmake b/cmake/VulkanCodegen.cmake index 8f6f4b538dd0..c39b54df3af3 100644 --- a/cmake/VulkanCodegen.cmake +++ b/cmake/VulkanCodegen.cmake @@ -7,10 +7,10 @@ set(VULKAN_GEN_OUTPUT_PATH "${CMAKE_BINARY_DIR}/vulkan/ATen/native/vulkan") set(VULKAN_GEN_ARG_ENV "") if(USE_VULKAN_RELAXED_PRECISION) - string(APPEND VULKAN_GEN_ARG_ENV "precision=mediump") + list(APPEND VULKAN_GEN_ARG_ENV "precision=mediump") endif() if(USE_VULKAN_FP16_INFERENCE) - string(APPEND VULKAN_GEN_ARG_ENV "format=rgba16f") + list(APPEND VULKAN_GEN_ARG_ENV "format=rgba16f") endif() if(USE_VULKAN_SHADERC_RUNTIME) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index fa481dda1c53..0202f15270b2 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -5,7 +5,11 @@ if(NOT DEFINED ENV{ROCM_PATH}) else() set(ROCM_PATH $ENV{ROCM_PATH}) endif() - +if(NOT DEFINED ENV{ROCM_INCLUDE_DIRS}) + set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include) +else() + set(ROCM_INCLUDE_DIRS $ENV{ROCM_INCLUDE_DIRS}) +endif() # HIP_PATH if(NOT DEFINED ENV{HIP_PATH}) set(HIP_PATH ${ROCM_PATH}/hip) @@ -151,8 +155,47 @@ if(HIP_FOUND) set(PYTORCH_FOUND_HIP TRUE) # Find ROCM version for checks - 
file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW) - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) + # ROCM 5.0 and later will have header api for version management + if(EXISTS ${ROCM_INCLUDE_DIRS}/rocm_version.h) + + set(PROJECT_RANDOM_BINARY_DIR "${PROJECT_BINARY_DIR}") + set(file "${PROJECT_BINARY_DIR}/detect_rocm_version.cc") + file(WRITE ${file} "" + "#include \n" + "#include \n" + + "#ifndef ROCM_VERSION_PATCH\n" + "#define ROCM_VERSION_PATCH 0\n" + "#endif\n" + "#define STRINGIFYHELPER(x) #x\n" + "#define STRINGIFY(x) STRINGIFYHELPER(x)\n" + "int main() {\n" + " printf(\"%d.%d.%s\", ROCM_VERSION_MAJOR, ROCM_VERSION_MINOR, STRINGIFY(ROCM_VERSION_PATCH));\n" + " return 0;\n" + "}\n" + ) + + try_run(run_result compile_result ${PROJECT_RANDOM_BINARY_DIR} ${file} + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${ROCM_INCLUDE_DIRS}" + RUN_OUTPUT_VARIABLE rocm_version_from_header + COMPILE_OUTPUT_VARIABLE output_var + ) + # We expect the compile to be successful if the include directory exists. + if(NOT compile_result) + message(FATAL_ERROR "Caffe2: Couldn't determine version from header: " ${output_var}) + endif() + message(STATUS "Caffe2: Header version is: " ${rocm_version_from_header}) + set(ROCM_VERSION_DEV_RAW ${rocm_version_from_header}) + message("\n***** ROCm version from rocm_version.h ****\n") + + # ROCM < 4.5, we don't have the header api file, use flat file + else() + file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW) + message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n") + endif() + + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+).*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) + if(ROCM_VERSION_DEV_MATCH) set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1}) set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2}) @@ -160,7 +203,7 @@ if(HIP_FOUND) set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}") math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}") endif() - message("\n***** ROCm version from ${ROCM_PATH}/.info/version-dev ****\n") + message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}") message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}") @@ -187,21 +230,40 @@ if(HIP_FOUND) set(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) ### Remove setting of Flags when FindHIP.CMake PR #558 is accepted.### - set(hip_DIR ${HIP_PATH}/lib/cmake/hip) - set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) - set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) - set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) - set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) - set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) - set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) - set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) - set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) - set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) - set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) - set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) - set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) - set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) - set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + # As of ROCm 5.1.x, all *.cmake files are under /opt/rocm/lib/cmake/ + if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "5.1.0") + set(hip_DIR ${ROCM_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + 
set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCM_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${ROCM_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCM_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${ROCM_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCM_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${ROCM_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${ROCM_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${ROCM_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${ROCM_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust) + else() + set(hip_DIR ${HIP_PATH}/lib/cmake/hip) + set(hsa-runtime64_DIR ${ROCM_PATH}/lib/cmake/hsa-runtime64) + set(AMDDeviceLibs_DIR ${ROCM_PATH}/lib/cmake/AMDDeviceLibs) + set(amd_comgr_DIR ${ROCM_PATH}/lib/cmake/amd_comgr) + set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand) + set(hiprand_DIR ${HIPRAND_PATH}/lib/cmake/hiprand) + set(rocblas_DIR ${ROCBLAS_PATH}/lib/cmake/rocblas) + set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen) + set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft) + set(hipfft_DIR ${HIPFFT_PATH}/lib/cmake/hipfft) + set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse) + set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl) + set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim) + set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub) + set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust) + endif() find_package_and_print_version(hip REQUIRED) find_package_and_print_version(hsa-runtime64 REQUIRED) @@ -221,13 +283,8 @@ if(HIP_FOUND) find_package_and_print_version(hipcub REQUIRED) find_package_and_print_version(rocthrust REQUIRED) - if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0") - message("ROCm version >= 4.1; enabling asserts") - else() - # Disable Asserts In Code (Can't use asserts on HIP stack.) - add_definitions(-DNDEBUG) - message("ROCm version < 4.1; disablng asserts") - endif() + # Disable Asserts In Code (Can't use asserts on HIP stack.) 
+ add_definitions(-DNDEBUG) if(HIP_COMPILER STREQUAL clang) set(hip_library_name amdhip64) diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 30d3b52d4883..7f6272e95a6a 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -38,6 +38,12 @@ endif() # Enable CUDA language support set(CUDAToolkit_ROOT "${CUDA_TOOLKIT_ROOT_DIR}") +# Pass clang as host compiler, which according to the docs +# Must be done before CUDA language is enabled, see mast be done before +# see https://cmake.org/cmake/help/v3.15/variable/CMAKE_CUDA_HOST_COMPILER.html +if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}") +endif() enable_language(CUDA) set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) set(CMAKE_CUDA_STANDARD_REQUIRED ON) @@ -318,15 +324,9 @@ if(CAFFE2_USE_CUDNN) TARGET caffe2::cudnn-private PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_PATH}) if(CUDNN_STATIC AND NOT WIN32) - if(USE_WHOLE_CUDNN) - set_property( - TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES - "-Wl,--whole-archive,\"${CUDNN_LIBRARY_PATH}\" -Wl,--no-whole-archive") - else() - set_property( - TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES - ${CUDNN_LIBRARY_PATH}) - endif() + set_property( + TARGET caffe2::cudnn-private PROPERTY INTERFACE_LINK_LIBRARIES + ${CUDNN_LIBRARY_PATH}) set_property( TARGET caffe2::cudnn-private APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" dl) diff --git a/cmake/public/mkldnn.cmake b/cmake/public/mkldnn.cmake index 87935625f9bf..50404d3b30d0 100644 --- a/cmake/public/mkldnn.cmake +++ b/cmake/public/mkldnn.cmake @@ -16,3 +16,15 @@ set_property( set_property( TARGET caffe2::mkldnn PROPERTY INTERFACE_LINK_LIBRARIES ${MKLDNN_LIBRARIES}) +if(BUILD_ONEDNN_GRAPH) + if(NOT TARGET caffe2::dnnl_graph) + add_library(caffe2::dnnl_graph INTERFACE IMPORTED) + endif() + + set_property( + TARGET caffe2::dnnl_graph PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${MKLDNN_INCLUDE_DIR}) + set_property( + TARGET caffe2::dnnl_graph PROPERTY INTERFACE_LINK_LIBRARIES + ${MKLDNN_LIBRARIES}) +endif() diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake index 3535a5cf7ba7..0daa6b7f6a3e 100644 --- a/cmake/public/utils.cmake +++ b/cmake/public/utils.cmake @@ -449,7 +449,6 @@ function(torch_compile_options libname) -Wall -Wextra -Wno-unused-parameter - -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs @@ -470,8 +469,7 @@ function(torch_compile_options libname) if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") list(APPEND private_compile_options -Wno-range-loop-analysis) - endif() - if(NOT APPLE) + else() list(APPEND private_compile_options # Considered to be flaky. 
See the discussion at # https://github.com/pytorch/pytorch/pull/9608 diff --git a/docker.Makefile b/docker.Makefile index dc7942518f9b..11c438d0fd22 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -8,7 +8,7 @@ $(warning WARNING: No docker user found using results from whoami) DOCKER_ORG = $(shell whoami) endif -CUDA_VERSION = 11.1 +CUDA_VERSION = 11.3 CUDNN_VERSION = 8 BASE_RUNTIME = ubuntu:18.04 BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04 diff --git a/docker/cpu-blis/Dockerfile b/docker/cpu-blis/Dockerfile deleted file mode 100644 index adfce7e3ad7a..000000000000 --- a/docker/cpu-blis/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -# syntax = docker/dockerfile:experimental -# -# NOTE: To build this you will need a docker version > 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ -# -# This Dockerfile will build Docker Image with PyTorch + DNNL + AMD BLIS and Torchvision installed for CPU only -# -# Example commandline to build PyTorch with AMD BLIS: -# sudo DOCKER_BUILDKIT=1 docker build . -t docker-image-repo-name -# Example commandline to run the built docker container: -# sudo docker run --name container-name -it docker-image-repo-name - -ARG BASE_IMAGE=ubuntu:18.04 -ARG PYTHON_VERSION=3.8 - -FROM ${BASE_IMAGE} as dev-base -CMD echo "Welcome to the PyTorch Docker Container!" && \ - echo "Version of PyTorch Installed: " && python -c 'import torch; print(torch.__version__)' && \ - echo "Version of Torchvision Installed: " && python -c 'import torchvision; print(torchvision.__version__)' && \ - echo "LDD output showing successful linking with BLIS: " && ldd /opt/conda/lib/python3.8/site-packages/torch/_C.cpython-38-x86_64-linux-gnu.so && \ - /bin/bash -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ - apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - cmake \ - curl \ - git \ - libjpeg-dev \ - libpng-dev \ - vim \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache -ENV PATH /opt/conda/bin:$PATH - -FROM dev-base as conda -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda-build && \ - /opt/conda/bin/conda install -y nomkl pyyaml numpy ipython ninja setuptools cmake cffi typing future && \ - /opt/conda/bin/conda clean -ya - -RUN conda install typing_extensions - -WORKDIR /root -ARG BLIS_URL=https://github.com/amd/blis.git -# Download, Build BLIS with multithreading support and place necessary library and include files at BLIS_HOME/lib and BLIS_HOME/include respectively -RUN git clone ${BLIS_URL} && cd blis && \ - ./configure --prefix=/root/BLISBuild --enable-cblas --enable-threading=openmp auto && make -j && make install && \ - if [ ! 
-e /root/BLISBuild/lib/libblis.so ] ; then cp /root/BLISBuild/lib/libblis*.so /root/BLISBuild/lib/libblis.so ; fi - -# Build PyTorch with DNNL+BLIS -RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && \ - git submodule update --init --recursive --jobs 0 && \ - export PATH=/root/BLISBuild/include/blis:$PATH LD_LIBRARY_PATH=/root/BLISBuild/lib:$LD_LIBRARY_PATH && \ - export BLIS_HOME=/root/BLISBuild BLAS=BLIS USE_MKLDNN_CBLAS=ON WITH_BLAS=blis && python setup.py install - -# Build Torchvision -RUN git clone https://github.com/pytorch/vision.git && cd vision && \ - python setup.py install diff --git a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile b/docker/pytorch/ubuntu_cpu_gpu/Dockerfile deleted file mode 100644 index f7a1af093027..000000000000 --- a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile +++ /dev/null @@ -1,105 +0,0 @@ -# This is the Dockerfile for an image that is ready to build PyTorch from source. -# PyTorch is not yet downloaded nor installed. -# -# Available BASE_IMAGE options: -# nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 -# nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 -# nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -# -# Available MAGMA_CUDA_VERSION options (for GPU/CUDA builds): -# magma-cuda112 -# magma-cuda111 -# magma-cuda102 -# magma-cuda101 -# -# Available TORCH_CUDA_ARCH_LIST_VAR options (for GPU/CUDA builds): -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0;8.6" for CUDA 11.2/11.1 -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0" for CUDA 11.0 -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5" for CUDA 10.2/10.1 -# -# Build image with CPU or GPU support with the following command: -# nvidia-docker build -t ${CONTAINER_TAG} -# --build-arg BASE_IMAGE=${BASE_IMAGE_VER} \ -# --build-arg PYTHON_VERSION=${PYTHON_VER} \ -# --build-arg MAGMA_CUDA_VERSION=${MAGMA_CUDA_VER} \ #(for GPU/CUDA builds) -# --build-arg TORCH_CUDA_ARCH_LIST_VAR=${TORCH_CUDA_ARCH_LIST} \ #(for GPU/CUDA builds): -# . -# -# For example, for a CPU Ubuntu 18.04 and Python 3.7.6 build: -# docker build -t ubuntu_1804_py_37_cpu_dev \ -# --build-arg BASE_IMAGE=ubuntu:18.04 \ -# --build-arg PYTHON_VERSION=3.7.6 . -# -# For example, for a CUDA 10.2 Ubuntu 18.04 and Python 3.9.1 build: -# nvidia-docker build -t ubuntu_1804_py_39_cuda_102_cudnn_8_dev \ -# --build-arg BASE_IMAGE=nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 \ -# --build-arg PYTHON_VERSION=3.9.1 \ -# --build-arg MAGMA_CUDA_VERSION=magma-cuda102 \ -# --build-arg TORCH_CUDA_ARCH_LIST_VAR="3.7+PTX;5.0;6.0;6.1;7.0;7.5" . 
- -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as dev-base -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - cmake \ - curl \ - git \ - git-lfs \ - libjpeg-dev \ - libpng-dev \ - openmpi-bin \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache -ENV PATH /opt/conda/bin:$PATH - -FROM dev-base as conda -ARG PYTHON_VERSION -ENV PYTHON_VER=$PYTHON_VERSION -RUN curl -fsSL -v -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VER} conda-build pyyaml numpy ipython cython typing typing_extensions mkl mkl-include ninja && \ - /opt/conda/bin/conda clean -ya - -ARG MAGMA_CUDA_VERSION -RUN if [ -z "$MAGMA_CUDA_VERSION" ] ; then \ - echo "Building with CPU support ..."; \ - else \ - echo "Building with GPU/CUDA support ..."; \ - conda install -y -c pytorch ${MAGMA_CUDA_VERSION} && conda clean -ya; \ - fi - -# Necessary step for Azure Pipelines Docker Build -# Docker image is build by root, but the build process -# is running from a non-priveledged user -RUN chmod -R ugo+rw /opt/conda/ - -WORKDIR /opt/pytorch -# Environment variables for PyTorch -ARG TORCH_CUDA_ARCH_LIST_VAR -RUN if [ -z "$TORCH_CUDA_ARCH_LIST_VAR" ] ; then \ - echo "Continuing CPU build ..."; \ - else \ - echo "Setting CUDA env vars and installing openmpi ..."; \ - # Set MPI links to avoid libmpi_cxx.so.1 not found error - ln -s /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.20 /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.1; \ - ln -s /usr/lib/x86_64-linux-gnu/libmpi.so.20.10.1 /usr/lib/x86_64-linux-gnu/libmpi.so.12; \ - fi -# If the build argument TORCH_CUDA_ARCH_LIST_VAR is given, container will be -# set for GPU/CUDA build, else for CPU build. -ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST_VAR:+${TORCH_CUDA_ARCH_LIST_VAR}} -ENV TORCH_NVCC_FLAGS=${TORCH_CUDA_ARCH_LIST_VAR:+"-Xfatbin -compress-all"} -ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" - -# Install Azure CLI and update its site packages -RUN curl -sL https://aka.ms/InstallAzureCLIDeb | bash -RUN pip install --upgrade pip --target /opt/az/lib/python3.6/site-packages/ - -# Install MKL -RUN wget https://raw.githubusercontent.com/pytorch/builder/f121b0919d799b5ea2030c92ca266cf4cddf6656/common/install_mkl.sh -RUN bash ./install_mkl.sh && rm install_mkl.sh diff --git a/docs/Makefile b/docs/Makefile index 28d910a89b49..b9719df7ade5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,6 +15,10 @@ help: figures: @$(PYCMD) source/scripts/build_activation_images.py + @$(PYCMD) source/scripts/build_quantization_configs.py + +onnx_supported_aten_ops: + @$(PYCMD) source/scripts/build_onnx_supported_aten_op_csv_table.py docset: html doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url https://pytorch.org/docs/ --force $(BUILDDIR)/html/ @@ -30,13 +34,13 @@ html-stable: # See conf.py for more details. RELEASE=1 make html -.PHONY: help Makefile docset +.PHONY: help Makefile docset onnx_supported_aten_ops # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: Makefile figures +%: Makefile figures onnx_supported_aten_ops @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) clean: @echo "Removing everything under 'build' and 'source/generated'.." - @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees $(SOURCEDIR)/generated + @rm -rf $(BUILDDIR)/html/ $(BUILDDIR)/doctrees $(SOURCEDIR)/generated $(BUILDDIR)/auto_gen_aten_op_list.csv diff --git a/docs/cpp/requirements.txt b/docs/cpp/requirements.txt index f5d49d2ebe91..ca3eb7da6846 100644 --- a/docs/cpp/requirements.txt +++ b/docs/cpp/requirements.txt @@ -1,4 +1,5 @@ sphinx==3.1.2 +Jinja2==3.0.* breathe==4.25.0 exhale==0.2.3 docutils==0.16 diff --git a/docs/cpp/source/Doxyfile b/docs/cpp/source/Doxyfile index 7785239d1539..a17d742a461e 100644 --- a/docs/cpp/source/Doxyfile +++ b/docs/cpp/source/Doxyfile @@ -44,12 +44,14 @@ INPUT = ../../../aten/src/ATen/ATen.h \ ../../../aten/src/ATen/Scalar.h \ ../../../aten/src/ATen/TensorOptions.h \ ../../../aten/src/ATen/core/Tensor.h \ + ../../../aten/src/ATen/native/TensorShape.h \ ../../../build/aten/src/ATen/Functions.h \ ../../../build/aten/src/ATen/core/TensorBody.h \ ../../../c10/core/Device.h \ ../../../c10/core/DeviceType.h \ ../../../c10/util/Half.h \ ../../../c10/util/ArrayRef.h \ + ../../../c10/util/OptionalArrayRef.h \ ../../../c10/util/Exception.h \ ../../../c10/util/Optional.h \ ../../../c10/cuda/CUDAGuard.h \ diff --git a/docs/cpp/source/check-doxygen.sh b/docs/cpp/source/check-doxygen.sh index 6ff6832cd056..a094af941278 100755 --- a/docs/cpp/source/check-doxygen.sh +++ b/docs/cpp/source/check-doxygen.sh @@ -16,12 +16,11 @@ pushd "$(dirname "$0")/../../.." cp torch/_utils_internal.py tools/shared -python -m tools.codegen.gen +python -m torchgen.gen python tools/setup_helpers/generate_code.py \ --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --nn-path aten/src - + --tags-path aten/src/ATen/native/tags.yaml popd # Run doxygen and log all output. diff --git a/docs/cpp/source/conf.py b/docs/cpp/source/conf.py index 3bc56ed060aa..54cd6acdb8fd 100644 --- a/docs/cpp/source/conf.py +++ b/docs/cpp/source/conf.py @@ -119,8 +119,8 @@ # General information about the project. project = 'PyTorch' -copyright = '2019, Torch Contributors' -author = 'Torch Contributors' +copyright = '2022, PyTorch Contributors' +author = 'PyTorch Contributors' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/docs/requirements.txt b/docs/requirements.txt index 34ec6078225b..57bee508f61b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,5 @@ sphinx==3.5.4 +Jinja2==3.0.* docutils==0.16 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinxcontrib.katex @@ -7,3 +8,4 @@ tensorboard # required to build torch.distributed.elastic.rendezvous.etcd* docs python-etcd>=0.4.5 sphinx_copybutton +sphinx-panels diff --git a/docs/source/amp.rst b/docs/source/amp.rst index 1f70f2c6982e..0785849c579e 100644 --- a/docs/source/amp.rst +++ b/docs/source/amp.rst @@ -1,22 +1,35 @@ .. role:: hidden :class: hidden-section -Automatic Mixed Precision package - torch.cuda.amp -================================================== +Automatic Mixed Precision package - torch.amp +============================================= -.. automodule:: torch.cuda.amp -.. currentmodule:: torch.cuda.amp +.. Both modules below are missing doc entry. Adding them here for now. +.. 
This does not add anything to the rendered page +.. py:module:: torch.cpu +.. py:module:: torch.cpu.amp +.. py:module:: torch.cuda.amp + +.. automodule:: torch.amp +.. currentmodule:: torch.amp -:class:`torch.cuda.amp` and :class:`torch` provide convenience methods for mixed precision, +:class:`torch.amp` provides convenience methods for mixed precision, where some operations use the ``torch.float32`` (``float``) datatype and other operations -use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, -are much faster in ``float16``. Other ops, like reductions, often require the dynamic +use lower precision floating point datatype (``lower_precision_fp``): ``torch.float16`` (``half``) or ``torch.bfloat16``. Some ops, like linear layers and convolutions, +are much faster in ``lower_precision_fp``. Other ops, like reductions, often require the dynamic range of ``float32``. Mixed precision tries to match each op to its appropriate datatype. -Ordinarily, "automatic mixed precision training" uses :class:`torch.autocast` and -:class:`torch.cuda.amp.GradScaler` together, as shown in the :ref:`Automatic Mixed Precision examples` -and `Automatic Mixed Precision recipe `_. -However, :class:`torch.autocast` and :class:`GradScaler` are modular, and may be used separately if desired. +Ordinarily, "automatic mixed precision training" with datatype of ``torch.float16`` uses :class:`torch.autocast` and +:class:`torch.cuda.amp.GradScaler` together, as shown in the :ref:`CUDA Automatic Mixed Precision examples` +and `CUDA Automatic Mixed Precision recipe `_. +However, :class:`torch.autocast` and :class:`torch.cuda.amp.GradScaler` are modular, and may be used separately if desired. +As shown in the CPU example section of :class:`torch.autocast`, "automatic mixed precision training/inference" on CPU with +datatype of ``torch.bfloat16`` only uses :class:`torch.autocast`. + +For CUDA and CPU, APIs are also provided seperately: + +* ``torch.autocast("cuda", args...)`` is equivalent to ``torch.cuda.amp.autocast(args...)``. +* ``torch.autocast("cpu", args...)`` is equivalent to ``torch.cpu.amp.autocast(args...)``. For CPU, only lower precision floating point datatype of ``torch.bfloat16`` is supported for now. .. contents:: :local: @@ -38,6 +51,11 @@ Autocasting .. autofunction:: custom_bwd +.. currentmodule:: torch.cpu.amp + +.. autoclass:: autocast + :members: + .. _gradient-scaling: Gradient Scaling @@ -56,6 +74,8 @@ so they don't flush to zero. Each parameter's gradient (``.grad`` attribute) should be unscaled before the optimizer updates the parameters, so the scale factor does not interfere with the learning rate. +.. currentmodule:: torch.cuda.amp + .. autoclass:: GradScaler :members: @@ -68,8 +88,6 @@ Autocast Op Reference Op Eligibility -------------- -Only CUDA ops are eligible for autocasting. - Ops that run in ``float64`` or non-floating-point dtypes are not eligible, and will run in these types whether or not autocast is enabled. @@ -84,8 +102,10 @@ regions. Ops called with an explicit ``dtype=...`` argument are not eligible, and will produce output that respects the ``dtype`` argument. -Op-Specific Behavior --------------------- +.. _autocast-cuda-op-reference: + +CUDA Op-Specific Behavior +------------------------- The following lists describe the behavior of eligible ops in autocast-enabled regions. These ops always go through autocasting whether they are invoked as part of a :class:`torch.nn.Module`, as a function, or as a :class:`torch.Tensor` method. 
If functions are exposed in multiple namespaces, @@ -99,8 +119,8 @@ If an op is unlisted, we assume it's numerically stable in ``float16``. If you believe an unlisted op is numerically unstable in ``float16``, please file an issue. -Ops that can autocast to ``float16`` -"""""""""""""""""""""""""""""""""""" +CUDA Ops that can autocast to ``float16`` +""""""""""""""""""""""""""""""""""""""""" ``__matmul__``, ``addbmm``, @@ -126,8 +146,8 @@ Ops that can autocast to ``float16`` ``prelu``, ``RNNCell`` -Ops that can autocast to ``float32`` -"""""""""""""""""""""""""""""""""""" +CUDA Ops that can autocast to ``float32`` +""""""""""""""""""""""""""""""""""""""""" ``__pow__``, ``__rdiv__``, @@ -181,8 +201,8 @@ Ops that can autocast to ``float32`` ``tan``, ``triplet_margin_loss`` -Ops that promote to the widest input type -""""""""""""""""""""""""""""""""""""""""" +CUDA Ops that promote to the widest input type +"""""""""""""""""""""""""""""""""""""""""""""" These ops don't require a particular dtype for stability, but take multiple inputs and require that the inputs' dtypes match. If all of the inputs are ``float16``, the op runs in ``float16``. If any of the inputs is ``float32``, @@ -216,3 +236,142 @@ Many models use a sigmoid layer right before the binary cross entropy layer. In this case, combine the two layers using :func:`torch.nn.functional.binary_cross_entropy_with_logits` or :mod:`torch.nn.BCEWithLogitsLoss`. ``binary_cross_entropy_with_logits`` and ``BCEWithLogits`` are safe to autocast. + +.. _autocast-cpu-op-reference: + +CPU Op-Specific Behavior +------------------------ +The following lists describe the behavior of eligible ops in autocast-enabled regions. +These ops always go through autocasting whether they are invoked as part of a :class:`torch.nn.Module`, +as a function, or as a :class:`torch.Tensor` method. If functions are exposed in multiple namespaces, +they go through autocasting regardless of the namespace. + +Ops not listed below do not go through autocasting. They run in the type +defined by their inputs. However, autocasting may still change the type +in which unlisted ops run if they're downstream from autocasted ops. + +If an op is unlisted, we assume it's numerically stable in ``bfloat16``. +If you believe an unlisted op is numerically unstable in ``bfloat16``, +please file an issue. 
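As an illustrative aside (not part of the patch): a minimal sketch of the CPU autocast behavior described above, assuming a toy ``nn.Linear`` model and made-up shapes. Ops on the ``bfloat16`` list that follows, such as ``linear``, run in ``bfloat16``; anything needing full precision can be cast back explicitly.

.. code-block:: python

    import torch

    # Toy model; shapes are illustrative only.
    model = torch.nn.Linear(8, 8)
    x = torch.randn(2, 8)

    # torch.autocast("cpu", ...) is equivalent to torch.cpu.amp.autocast(...).
    with torch.autocast("cpu", dtype=torch.bfloat16):
        y = model(x)            # linear is on the bfloat16 autocast list
        loss = y.float().sum()  # cast back to float32 where full precision is needed

    print(y.dtype)  # torch.bfloat16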
+ +CPU Ops that can autocast to ``bfloat16`` +""""""""""""""""""""""""""""""""""""""""" + +``conv1d``, +``conv2d``, +``conv3d``, +``bmm``, +``mm``, +``baddbmm``, +``addmm``, +``addbmm``, +``linear``, +``matmul``, +``_convolution`` + +CPU Ops that can autocast to ``float32`` +"""""""""""""""""""""""""""""""""""""""" + +``conv_transpose1d``, +``conv_transpose2d``, +``conv_transpose3d``, +``avg_pool3d``, +``binary_cross_entropy``, +``grid_sampler``, +``grid_sampler_2d``, +``_grid_sampler_2d_cpu_fallback``, +``grid_sampler_3d``, +``polar``, +``prod``, +``quantile``, +``nanquantile``, +``stft``, +``cdist``, +``trace``, +``view_as_complex``, +``cholesky``, +``cholesky_inverse``, +``cholesky_solve``, +``inverse``, +``lu_solve``, +``matrix_rank``, +``orgqr``, +``inverse``, +``ormqr``, +``pinverse``, +``max_pool3d``, +``max_unpool2d``, +``max_unpool3d``, +``adaptive_avg_pool3d``, +``reflection_pad1d``, +``reflection_pad2d``, +``replication_pad1d``, +``replication_pad2d``, +``replication_pad3d``, +``mse_loss``, +``ctc_loss``, +``kl_div``, +``multilabel_margin_loss``, +``fft_fft``, +``fft_ifft``, +``fft_fft2``, +``fft_ifft2``, +``fft_fftn``, +``fft_ifftn``, +``fft_rfft``, +``fft_irfft``, +``fft_rfft2``, +``fft_irfft2``, +``fft_rfftn``, +``fft_irfftn``, +``fft_hfft``, +``fft_ihfft``, +``linalg_matrix_norm``, +``linalg_cond``, +``linalg_matrix_rank``, +``linalg_solve``, +``linalg_cholesky``, +``linalg_svdvals``, +``linalg_eigvals``, +``linalg_eigvalsh``, +``linalg_inv``, +``linalg_householder_product``, +``linalg_tensorinv``, +``linalg_tensorsolve``, +``fake_quantize_per_tensor_affine``, +``eig``, +``geqrf``, +``lstsq``, +``_lu_with_info``, +``qr``, +``solve``, +``svd``, +``symeig``, +``triangular_solve``, +``fractional_max_pool2d``, +``fractional_max_pool3d``, +``adaptive_max_pool3d``, +``multilabel_margin_loss_forward``, +``linalg_qr``, +``linalg_cholesky_ex``, +``linalg_svd``, +``linalg_eig``, +``linalg_eigh``, +``linalg_lstsq``, +``linalg_inv_ex`` + +CPU Ops that promote to the widest input type +""""""""""""""""""""""""""""""""""""""""""""" +These ops don't require a particular dtype for stability, but take multiple inputs +and require that the inputs' dtypes match. If all of the inputs are +``bfloat16``, the op runs in ``bfloat16``. If any of the inputs is ``float32``, +autocast casts all inputs to ``float32`` and runs the op in ``float32``. + +``cat``, +``stack``, +``index_copy`` + +Some ops not listed here (e.g., binary ops like ``add``) natively promote +inputs without autocasting's intervention. If inputs are a mixture of ``bfloat16`` +and ``float32``, these ops run in ``float32`` and produce ``float32`` output, +regardless of whether autocast is enabled. diff --git a/docs/source/backends.rst b/docs/source/backends.rst index 45d6fdf2add2..c54cf33fbe15 100644 --- a/docs/source/backends.rst +++ b/docs/source/backends.rst @@ -3,6 +3,7 @@ torch.backends ============== +.. automodule:: torch.backends `torch.backends` controls the behavior of various backends that PyTorch supports. @@ -17,6 +18,7 @@ These backends include: torch.backends.cuda ^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.cuda .. autofunction:: torch.backends.cuda.is_built @@ -50,6 +52,7 @@ torch.backends.cuda torch.backends.cudnn ^^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.cudnn .. autofunction:: torch.backends.cudnn.version @@ -75,20 +78,37 @@ torch.backends.cudnn A :class:`bool` that, if True, causes cuDNN to benchmark multiple convolution algorithms and select the fastest. 
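Illustrative aside (not part of the patch): the backend flags and availability checks documented in this file can be exercised from Python roughly as below; the ``mps`` calls assume a build that includes the newly documented MPS backend.

.. code-block:: python

    import torch

    # Build/runtime availability checks; each returns a plain bool.
    print("MKL:    ", torch.backends.mkl.is_available())
    print("MKL-DNN:", torch.backends.mkldnn.is_available())
    print("OpenMP: ", torch.backends.openmp.is_available())

    # Assumes a PyTorch build with the MPS backend compiled in.
    print("MPS:    ", torch.backends.mps.is_built(), torch.backends.mps.is_available())

    # cuDNN knob described above: benchmark several conv algorithms and keep the fastest.
    if torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True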
+torch.backends.mps +^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.mps + +.. autofunction:: torch.backends.mps.is_available + +.. autofunction:: torch.backends.mps.is_built + torch.backends.mkl ^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.mkl .. autofunction:: torch.backends.mkl.is_available torch.backends.mkldnn ^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.mkldnn .. autofunction:: torch.backends.mkldnn.is_available torch.backends.openmp ^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: torch.backends.openmp .. autofunction:: torch.backends.openmp.is_available + +.. Docs for other backends need to be added here. +.. Automodules are just here to ensure checks run but they don't actually +.. add anything to the rendered page for now. +.. py:module:: torch.backends.quantized +.. py:module:: torch.backends.xnnpack diff --git a/docs/source/benchmark_utils.rst b/docs/source/benchmark_utils.rst index c211dcb7b580..c93fbfd66c3d 100644 --- a/docs/source/benchmark_utils.rst +++ b/docs/source/benchmark_utils.rst @@ -18,3 +18,10 @@ Benchmark Utils - torch.utils.benchmark .. autoclass:: FunctionCounts :members: + +.. These are missing documentation. Adding them here until a better place +.. is made in this file. +.. py:module:: torch.utils.benchmark.examples +.. py:module:: torch.utils.benchmark.op_fuzzers +.. py:module:: torch.utils.benchmark.utils +.. py:module:: torch.utils.benchmark.utils.valgrind_wrapper diff --git a/docs/source/bottleneck.rst b/docs/source/bottleneck.rst index d6ce122234fb..3fa1c99b5061 100644 --- a/docs/source/bottleneck.rst +++ b/docs/source/bottleneck.rst @@ -1,6 +1,7 @@ torch.utils.bottleneck ====================== +.. automodule:: torch.utils.bottleneck .. currentmodule:: torch.utils.bottleneck `torch.utils.bottleneck` is a tool that can be used as an initial step for diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index b1d4954a6576..906d5685984d 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -149,13 +149,13 @@ C10 utils and operator dispatch - Dmytro Dzhulgakov (`dzhulgakov `__) - (emeritus) Sebastian Messmer (`smessmer `__) -ONNX <-> PyTorch -~~~~~~~~~~~~~~~~ -- Negin Raoof (`neginraoof `__) -- Gary Miguel (`garymm `__) +PyTorch -> ONNX +~~~~~~~~~~~~~~~ - Bowen Bao (`BowenBao `__) -- (emeritus) Lu Fang (`houseroad `__) +- Gary Miguel (`garymm `__) - (emeritus) Lara Haidar (`lara-hdr `__) +- (emeritus) Lu Fang (`houseroad `__) +- (emeritus) Negin Raoof (`neginraoof `__) - (emeritus) Spandan Tiwari (`spandantiwari `__) Mobile / Edge diff --git a/docs/source/conf.py b/docs/source/conf.py index 0b1343145bc1..2d5b60e6af82 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,12 +57,16 @@ 'sphinxcontrib.katex', 'sphinx.ext.autosectionlabel', 'sphinx_copybutton', + 'sphinx_panels' ] # build the templated autosummary files autosummary_generate = True numpydoc_show_class_members = False +# Theme has bootstrap already +panels_add_bootstrap_css = False + # autosectionlabel throws warnings if section names are duplicated. # The following tells autosectionlabel to not throw a warning for # duplicated section names that are in different documents. @@ -82,6 +86,8 @@ # TODO: document these and remove them from here. 
coverage_ignore_functions = [ + # torch + "typename", # torch.autograd "register_py_tensor_class_for_device", "variable", @@ -125,9 +131,113 @@ "execWrapper", # torch.onnx "unregister_custom_op_symbolic", + # torch.ao.quantization + "default_eval_fn", + # torch.ao.quantization.backend_config + "validate_backend_config_dict", + # torch.backends + "disable_global_flags", + "flags_frozen", + # torch.distributed.algorithms.ddp_comm_hooks + "register_ddp_comm_hook", + # torch.nn + "factory_kwargs", + # torch.nn.parallel + "DistributedDataParallelCPU", + # torch.utils + "set_module", + # torch.utils.model_dump + "burn_in_info", + "get_info_and_burn_skeleton", + "get_inline_skeleton", + "get_model_info", + "get_storage_info", + "hierarchical_pickle", ] coverage_ignore_classes = [ + # torch + "FatalError", + "QUInt2x4Storage", + "Size", + "Storage", + "Stream", + "Tensor", + "finfo", + "iinfo", + "qscheme", + "AggregationType", + "AliasDb", + "AnyType", + "Argument", + "ArgumentSpec", + "BenchmarkConfig", + "BenchmarkExecutionStats", + "Block", + "BoolType", + "BufferDict", + "CallStack", + "Capsule", + "ClassType", + "Code", + "CompleteArgumentSpec", + "ComplexType", + "ConcreteModuleType", + "ConcreteModuleTypeBuilder", + "DeepCopyMemoTable", + "DeserializationStorageContext", + "DeviceObjType", + "DictType", + "EnumType", + "ExecutionPlan", + "FileCheck", + "FloatType", + "FunctionSchema", + "Gradient", + "Graph", + "GraphExecutorState", + "IODescriptor", + "InferredType", + "IntType", + "InterfaceType", + "ListType", + "LockingLogger", + "MobileOptimizerType", + "ModuleDict", + "Node", + "NoneType", + "NoopLogger", + "NumberType", + "OperatorInfo", + "OptionalType", + "ParameterDict", + "PyObjectType", + "PyTorchFileReader", + "PyTorchFileWriter", + "RRefType", + "ScriptClass", + "ScriptClassFunction", + "ScriptDict", + "ScriptDictIterator", + "ScriptDictKeyIterator", + "ScriptList", + "ScriptListIterator", + "ScriptMethod", + "ScriptModule", + "ScriptModuleSerializer", + "ScriptObject", + "ScriptObjectProperty", + "SerializationStorageContext", + "StaticModule", + "StringType", + "SymIntType", + "ThroughputBenchmark", + "TracingState", + "TupleType", + "Type", + "UnionType", + "Use", + "Value", # torch.cuda "BFloat16Storage", "BFloat16Tensor", @@ -153,7 +263,6 @@ "LongTensor", "ShortStorage", "ShortTensor", - "UntypedStorage", "cudaStatus", # torch.distributed.elastic.multiprocessing.errors "ChildFailedError", @@ -181,6 +290,7 @@ "ReshapeTransform", "SigmoidTransform", "SoftmaxTransform", + "SoftplusTransform", "StackTransform", "StickBreakingTransform", "TanhTransform", @@ -193,110 +303,25 @@ # torch.onnx "CheckerError", "ExportTypes", + # torch.backends + "ContextProp", + "PropModule", + # torch.backends.cuda + "cuBLASModule", + "cuFFTPlanCache", + "cuFFTPlanCacheAttrContextProp", + "cuFFTPlanCacheManager", + # torch.distributed.algorithms.ddp_comm_hooks + "DDPCommHookType", + # torch.jit.mobile + "LiteScriptModule", + # torch.nn.quantized.modules + "DeQuantize", + "Quantize", + # torch.utils.backcompat + "Warning", ] -# List of modules that do not have automodule/py:module in the doc yet -# We should NOT add anything to this list, see the CI failure message -# on how to solve missing automodule issues -coverage_missing_automodule = [ - "torch", - "torch.ao", - "torch.ao.nn", - "torch.ao.nn.sparse", - "torch.ao.nn.sparse.quantized", - "torch.ao.nn.sparse.quantized.dynamic", - "torch.ao.ns", - "torch.ao.ns.fx", - "torch.ao.quantization", - "torch.ao.quantization.fx", - 
"torch.ao.quantization.fx.backend_config", - "torch.ao.sparsity", - "torch.ao.sparsity.experimental", - "torch.ao.sparsity.experimental.pruner", - "torch.ao.sparsity.scheduler", - "torch.ao.sparsity.sparsifier", - "torch.backends", - "torch.backends.cuda", - "torch.backends.cudnn", - "torch.backends.mkl", - "torch.backends.mkldnn", - "torch.backends.openmp", - "torch.backends.quantized", - "torch.backends.xnnpack", - "torch.contrib", - "torch.cpu", - "torch.cpu.amp", - "torch.distributed.algorithms", - "torch.distributed.algorithms.ddp_comm_hooks", - "torch.distributed.algorithms.model_averaging", - "torch.distributed.elastic", - "torch.distributed.elastic.utils", - "torch.distributed.elastic.utils.data", - "torch.distributed.launcher", - "torch.distributed.nn", - "torch.distributed.nn.api", - "torch.distributed.nn.jit", - "torch.distributed.nn.jit.templates", - "torch.distributed.pipeline", - "torch.distributed.pipeline.sync", - "torch.distributed.pipeline.sync.skip", - "torch.fft", - "torch.for_onnx", - "torch.fx.experimental", - "torch.fx.experimental.fx_acc", - "torch.fx.experimental.unification", - "torch.fx.experimental.unification.multipledispatch", - "torch.fx.passes", - "torch.jit.mobile", - "torch.nn", - "torch.nn.backends", - "torch.nn.intrinsic", - "torch.nn.intrinsic.modules", - "torch.nn.intrinsic.qat", - "torch.nn.intrinsic.qat.modules", - "torch.nn.intrinsic.quantized", - "torch.nn.intrinsic.quantized.dynamic", - "torch.nn.intrinsic.quantized.dynamic.modules", - "torch.nn.intrinsic.quantized.modules", - "torch.nn.modules", - "torch.nn.parallel", - "torch.nn.qat", - "torch.nn.qat.modules", - "torch.nn.qat.dynamic", - "torch.nn.qat.dynamic.modules", - "torch.nn.quantizable", - "torch.nn.quantizable.modules", - "torch.nn.quantized", - "torch.nn.quantized.dynamic", - "torch.nn.quantized.dynamic.modules", - "torch.nn.quantized.modules", - "torch.nn.utils", - "torch.package", - "torch.package.analyze", - "torch.quantization", - "torch.quantization.fx", - "torch.sparse", - "torch.special", - "torch.utils", - "torch.utils.backcompat", - "torch.utils.benchmark.examples", - "torch.utils.benchmark.op_fuzzers", - "torch.utils.benchmark.utils", - "torch.utils.benchmark.utils.valgrind_wrapper", - "torch.utils.bottleneck", - "torch.utils.data.communication", - "torch.utils.data.datapipes", - "torch.utils.data.datapipes.dataframe", - "torch.utils.data.datapipes.iter", - "torch.utils.data.datapipes.map", - "torch.utils.data.datapipes.utils", - "torch.utils.ffi", - "torch.utils.hipify", - "torch.utils.model_dump", - "torch.utils.tensorboard", -] - - # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # @@ -308,8 +333,8 @@ # General information about the project. project = 'PyTorch' -copyright = '2019, Torch Contributors' -author = 'Torch Contributors' +copyright = '2022, PyTorch Contributors' +author = 'PyTorch Contributors' torch_version = str(torch.__version__) # The version info for the project you're documenting, acts as replacement for @@ -326,14 +351,11 @@ # Customized html_title here. 
# Default is " ".join(project, release, "documentation") if not set if RELEASE: - # remove hash (start with 'a') from version number if any - version_end = torch_version.find('a') - if version_end == -1: - html_title = " ".join((project, torch_version, "documentation")) - version = torch_version - else: - html_title = " ".join((project, torch_version[:version_end], "documentation")) - version = torch_version[:version_end] + # Turn 1.11.0aHASH into 1.11 + # Note: the release candidates should no longer have the aHASH suffix, but in any + # case we wish to leave only major.minor, even for rc builds. + version = '.'.join(torch_version.split('.')[:2]) + html_title = " ".join((project, version, "documentation")) release = version # The language for content autogenerated by Sphinx. Refer to documentation @@ -417,6 +439,11 @@ def coverage_post_process(app, exception): if not isinstance(app.builder, CoverageBuilder): return + if not torch.distributed.is_available(): + raise RuntimeError("The coverage tool cannot run with a version " + "of PyTorch that was built with USE_DISTRIBUTED=0 " + "as this module's API changes.") + # These are all the modules that have "automodule" in an rst file # These modules are the ones for which coverage is checked # Here, we make sure that no module is missing from that list @@ -443,26 +470,16 @@ def is_not_internal(modname): if modname not in modules: missing.add(modname) - expected = set(coverage_missing_automodule) - output = [] - unexpected_missing = missing - expected - if unexpected_missing: - mods = ", ".join(unexpected_missing) + if missing: + mods = ", ".join(missing) output.append(f"\nYou added the following module(s) to the PyTorch namespace '{mods}' " "but they have no corresponding entry in a doc .rst file. You should " "either make sure that the .rst file that contains the module's documentation " "properly contains either '.. automodule:: mod_name' (if you do not want " - "the paragraph added by the automodule, you can simply use py:module) or " - "make the module private (by appending an '_' at the beginning of its name.") - - unexpected_not_missing = expected - missing - if unexpected_not_missing: - mods = ", ".join(unexpected_not_missing) - output.append(f"\nThank you for adding the missing .rst entries for '{mods}', please update " - "the 'coverage_missing_automodule' in 'torch/docs/source/conf.py' to remove " - "the module(s) you fixed and make sure we do not regress on this in the future.") + "the paragraph added by the automodule, you can simply use '.. py:module:: mod_name') " + " or make the module private (by appending an '_' at the beginning of its name).") # The output file is hard-coded by the coverage tool # Our CI is setup to fail if any line is added to this file diff --git a/docs/source/__config__.rst b/docs/source/config_mod.rst similarity index 100% rename from docs/source/__config__.rst rename to docs/source/config_mod.rst diff --git a/docs/source/cpp_extension.rst b/docs/source/cpp_extension.rst index db718bdacc63..471f55228f3e 100644 --- a/docs/source/cpp_extension.rst +++ b/docs/source/cpp_extension.rst @@ -8,6 +8,6 @@ torch.utils.cpp_extension .. autofunction:: load .. autofunction:: load_inline .. autofunction:: include_paths -.. autofunction:: check_compiler_abi_compatibility +.. autofunction:: get_compiler_abi_compatibility_and_version .. autofunction:: verify_ninja_availability .. 
autofunction:: is_ninja_available diff --git a/docs/source/cuda.rst b/docs/source/cuda.rst index 955feaae8309..7d3998f7fa53 100644 --- a/docs/source/cuda.rst +++ b/docs/source/cuda.rst @@ -80,6 +80,7 @@ Graphs (beta) :toctree: generated :nosignatures: + is_current_stream_capturing graph_pool_handle CUDAGraph graph @@ -123,3 +124,11 @@ NVIDIA Tools Extension (NVTX) nvtx.mark nvtx.range_push nvtx.range_pop + +Jiterator (beta) +----------------------------- +.. autosummary:: + :toctree: generated + :nosignatures: + + jiterator._create_jit_fn diff --git a/docs/source/data.rst b/docs/source/data.rst index 322de88e27d9..646f41436caf 100644 --- a/docs/source/data.rst +++ b/docs/source/data.rst @@ -432,3 +432,15 @@ Example:: .. autoclass:: torch.utils.data.WeightedRandomSampler .. autoclass:: torch.utils.data.BatchSampler .. autoclass:: torch.utils.data.distributed.DistributedSampler + + +.. This module is experimental and should be private, adding it here for now +.. py:module:: torch.utils.data.communication + +.. These modules are documented as part of torch/data listing them here for +.. now until we have a clearer fix +.. py:module:: torch.utils.data.datapipes +.. py:module:: torch.utils.data.datapipes.dataframe +.. py:module:: torch.utils.data.datapipes.iter +.. py:module:: torch.utils.data.datapipes.map +.. py:module:: torch.utils.data.datapipes.utils diff --git a/docs/source/deploy.rst b/docs/source/deploy.rst index 931aed7ab7a9..9311ba8c4ee6 100644 --- a/docs/source/deploy.rst +++ b/docs/source/deploy.rst @@ -29,8 +29,7 @@ When running ``setup.py``, you will need to specify ``USE_DEPLOY=1``, like: export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} export USE_DEPLOY=1 - python setup.py bdist_wheel - python -mpip install dist/*.whl + python setup.py develop Creating a model package in Python @@ -53,28 +52,39 @@ For now, let's create a simple model that we can load and run in ``torch::deploy # Package and export it. with PackageExporter("my_package.pt") as e: e.intern("torchvision.**") + e.extern("numpy.**") e.extern("sys") + e.extern("PIL.*") e.save_pickle("model", "model.pkl", model) +Note that since "numpy", "sys" and "PIL" were marked as "extern", `torch.package` will +look for these dependencies on the system that loads this package. They will not be packaged +with the model. + Now, there should be a file named ``my_package.pt`` in your working directory. -.. note:: - Currently, ``torch::deploy`` supports only the Python standard library and - ``torch`` as ``extern`` modules in ``torch.package``. In the future we plan - to transparently support any Conda environment you point us to. +Loading and running the model in C++ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Set an environment variable (e.g. $PATH_TO_EXTERN_PYTHON_PACKAGES) to indicate to the interpreters +where the external Python dependencies can be found. In the example below, the path to the +site-packages of a conda environment is provided. +.. code-block:: bash + export PATH_TO_EXTERN_PYTHON_PACKAGES= \ + "~/anaconda/envs/deploy-example-env/lib/python3.8/site-packages" -Loading and running the model in C++ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Let's create a minimal C++ program to that loads the model. .. code-block:: cpp - #include + #include + #include #include + #include #include #include @@ -86,7 +96,11 @@ Let's create a minimal C++ program to that loads the model. } // Start an interpreter manager governing 4 embedded interpreters. 
- torch::deploy::InterpreterManager manager(4); + std::shared_ptr env = + std::make_shared( + std::getenv("PATH_TO_EXTERN_PYTHON_PACKAGES") + ); + torch::deploy::InterpreterManager manager(4, env); try { // Load the model from the torch.package. @@ -94,6 +108,7 @@ Let's create a minimal C++ program to that loads the model. torch::deploy::ReplicatedObj model = package.loadPickle("model", "model.pkl"); } catch (const c10::Error& e) { std::cerr << "error loading the model\n"; + std::cerr << e.msg(); return -1; } @@ -105,6 +120,9 @@ This small program introduces many of the core concepts of ``torch::deploy``. An ``InterpreterManager`` abstracts over a collection of independent Python interpreters, allowing you to load balance across them when running your code. +``PathEnvironment`` enables you to specify the location of Python +packages on your system which are external, but necessary, for your model. + Using the ``InterpreterManager::loadPackage`` method, you can load a ``torch.package`` from disk and make it available to all interpreters. @@ -120,20 +138,55 @@ an free interpreter to execute that interaction. Building and running the application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Locate `libtorch_deployinterpreter.o` on your system. This should have been +built when PyTorch was built from source. In the same PyTorch directory, locate +the deploy source files. Set these locations to an environment variable for the build. +An example of where these can be found on a system is shown below. + +.. code-block:: bash + + export DEPLOY_INTERPRETER_PATH="/pytorch/build/torch/csrc/deploy/" + export DEPLOY_SRC_PATH="/pytorch/torch/csrc/deploy/" + +As ``torch::deploy`` is in active development, these manual steps will be removed +soon. + Assuming the above C++ program was stored in a file called, `example-app.cpp`, a minimal CMakeLists.txt file would look like: .. code-block:: cmake - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) + cmake_minimum_required(VERSION 3.19 FATAL_ERROR) project(deploy_tutorial) + find_package(fmt REQUIRED) find_package(Torch REQUIRED) - add_executable(example-app example-app.cpp) - target_link_libraries(example-app "${TORCH_LIBRARIES}") - set_property(TARGET example-app PROPERTY CXX_STANDARD 14) + add_library(torch_deploy_internal STATIC + ${DEPLOY_INTERPRETER_PATH}/libtorch_deployinterpreter.o + ${DEPLOY_DIR}/deploy.cpp + ${DEPLOY_DIR}/loader.cpp + ${DEPLOY_DIR}/path_environment.cpp + ${DEPLOY_DIR}/elf_file.cpp) + + # for python builtins + target_link_libraries(torch_deploy_internal PRIVATE + crypt pthread dl util m z ffi lzma readline nsl ncursesw panelw) + target_link_libraries(torch_deploy_internal PUBLIC + shm torch fmt::fmt-header-only) + caffe2_interface_library(torch_deploy_internal torch_deploy) + + add_executable(example-app example.cpp) + target_link_libraries(example-app PUBLIC + "-Wl,--no-as-needed -rdynamic" dl torch_deploy "${TORCH_LIBRARIES}") + +Currently, it is necessary to build ``torch::deploy`` as a static library. +In order to correctly link to a static library, the utility ``caffe2_interface_library`` +is used to appropriately set and unset ``--whole-archive`` flag. +Furthermore, the ``-rdynamic`` flag is needed when linking to the executable +to ensure that symbols are exported to the dynamic table, making them accessible +to the deploy interpreters (which are dynamically loaded). The last step is configuring and building the project. 
Assuming that our code directory is laid out like this: @@ -152,8 +205,9 @@ We can now run the following commands to build the application from within the mkdir build cd build # Point CMake at the built version of PyTorch we just installed. - SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" - cmake -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" .. + cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" .. \ + -DDEPLOY_INTERPRETER_PATH="$DEPLOY_INTERPRETER_PATH" \ + -DDEPLOY_DIR="$DEPLOY_DIR" cmake --build . --config Release Now we can run our app: diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index 4ddc3d5f3171..58d6d1606431 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -123,14 +123,24 @@ It is imperative that all processes specify the same number of interfaces in thi Other NCCL environment variables """""""""""""""""""""""""""""""" -NCCL has also provided a number of environment variables for fine-tuning purposes. - -Commonly used ones include the following for debugging purposes: - -- ``export NCCL_DEBUG=INFO`` -- ``export NCCL_DEBUG_SUBSYS=ALL`` - -For the full list of NCCL environment variables, please refer to +**Debugging** - in case of NCCL failure, you can set ``NCCL_DEBUG=INFO`` to print an explicit +warning message as well as basic NCCL initialization information. + +You may also use ``NCCL_DEBUG_SUBSYS`` to get more details about a specific +aspect of NCCL. For example, ``NCCL_DEBUG_SUBSYS=COLL`` would print logs of +collective calls, which may be helpful when debugging hangs, especially those +caused by collective type or message size mismatch. In case of topology +detection failure, it would be helpful to set ``NCCL_DEBUG_SUBSYS=GRAPH`` +to inspect the detailed detection result and save as reference if further help +from NCCL team is needed. + +**Performance tuning** - NCCL performs automatic tuning based on its topology detection to save users' +tuning effort. On some socket-based systems, users may still try tuning +``NCCL_SOCKET_NTHREADS`` and ``NCCL_NSOCKS_PERTHREAD`` to increase socket +network bandwidth. These two environment variables have been pre-tuned by NCCL +for some cloud providers, such as AWS or GCP. + +For a full list of NCCL environment variables, please refer to `NVIDIA NCCL's official documentation `_ @@ -575,6 +585,9 @@ Debugging ``torch.distributed`` applications Debugging distributed applications can be challenging due to hard to understand hangs, crashes, or inconsistent behavior across ranks. ``torch.distributed`` provides a suite of tools to help debug training applications in a self-serve fashion: +Monitored Barrier +^^^^^^^^^^^^^^^^^ + As of v1.10, :func:`torch.distributed.monitored_barrier` exists as an alternative to :func:`torch.distributed.barrier` which fails with helpful information about which rank may be faulty when crashing, i.e. not all ranks calling into :func:`torch.distributed.monitored_barrier` within the provided timeout. 
:func:`torch.distributed.monitored_barrier` implements a host-side barrier using ``send``/``recv`` communication primitives in a process similar to acknowledgements, allowing rank 0 to report which rank(s) failed to acknowledge @@ -613,7 +626,10 @@ The following error message is produced on rank 0, allowing the user to determin [gloo/transport/tcp/pair.cc:598] Connection closed by peer [2401:db00:eef0:1100:3560:0:1c05:25d]:8594 -Next, the environment variable ``TORCH_DISTRIBUTED_DEBUG`` can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks +``TORCH_DISTRIBUTED_DEBUG`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +With ``TORCH_CPP_LOG_LEVEL=INFO``, the environment variable ``TORCH_DISTRIBUTED_DEBUG`` can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks are synchronized appropriately. ``TORCH_DISTRIBUTED_DEBUG`` can be set to either ``OFF`` (default), ``INFO``, or ``DETAIL`` depending on the debugging level required. Please note that the most verbose option, ``DETAIL`` may impact the application performance and thus should only be used when debugging issues. @@ -662,6 +678,7 @@ include data such as forward time, backward time, gradient communication time, e if __name__ == "__main__": os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29501" + os.environ["TORCH_CPP_LOG_LEVEL"]="INFO" os.environ[ "TORCH_DISTRIBUTED_DEBUG" ] = "DETAIL" # set to DETAIL for runtime logging. @@ -762,6 +779,7 @@ application crashes, rather than a hang or uninformative error message. As an ex if __name__ == "__main__": os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29501" + os.environ["TORCH_CPP_LOG_LEVEL"]="INFO" os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL" mp.spawn(worker, nprocs=2, args=()) @@ -774,6 +792,49 @@ With the ``NCCL`` backend, such an application would likely result in a hang whi RuntimeError: Error when verifying shape tensors for collective ALLREDUCE on rank 0. This likely indicates that input shapes into the collective are mismatched across ranks. Got shapes: 10 [ torch.LongTensor{1} ] +.. note:: + For fine-grained control of the debug level during runtime the functions :func:`torch.distributed.set_debug_level`, :func:`torch.distributed.set_debug_level_from_env`, and + :func:`torch.distributed.get_debug_level` can also be used. + In addition, `TORCH_DISTRIBUTED_DEBUG=DETAIL` can be used in conjunction with `TORCH_SHOW_CPP_STACKTRACES=1` to log the entire callstack when a collective desynchronization is detected. These collective desynchronization checks will work for all applications that use ``c10d`` collective calls backed by process groups created with the :func:`torch.distributed.init_process_group` and :func:`torch.distributed.new_group` APIs. + +Logging +------- + +In addition to explicit debugging support via :func:`torch.distributed.monitored_barrier` and ``TORCH_DISTRIBUTED_DEBUG``, the underlying C++ library of ``torch.distributed`` also outputs log +messages at various levels. These messages can be helpful to understand the execution state of a distributed training job and to troubleshoot problems such as network connection failures. The +following matrix shows how the log level can be adjusted via the combination of ``TORCH_CPP_LOG_LEVEL`` and ``TORCH_DISTRIBUTED_DEBUG`` environment variables. 
+ ++-------------------------+-----------------------------+------------------------+ +| ``TORCH_CPP_LOG_LEVEL`` | ``TORCH_DISTRIBUTED_DEBUG`` | Effective Log Level | ++=========================+=============================+========================+ +| ``ERROR`` | ignored | Error | ++-------------------------+-----------------------------+------------------------+ +| ``WARNING`` | ignored | Warning | ++-------------------------+-----------------------------+------------------------+ +| ``INFO`` | ignored | Info | ++-------------------------+-----------------------------+------------------------+ +| ``INFO`` | ``INFO`` | Debug | ++-------------------------+-----------------------------+------------------------+ +| ``INFO`` | ``DETAIL`` | Trace (a.k.a. All) | ++-------------------------+-----------------------------+------------------------+ + + +.. Distributed modules that are missing specific entries. +.. Adding them here for tracking purposes until they are more permanently fixed. +.. py:module:: torch.distributed.algorithms +.. py:module:: torch.distributed.algorithms.ddp_comm_hooks +.. py:module:: torch.distributed.algorithms.model_averaging +.. py:module:: torch.distributed.elastic +.. py:module:: torch.distributed.elastic.utils +.. py:module:: torch.distributed.elastic.utils.data +.. py:module:: torch.distributed.launcher +.. py:module:: torch.distributed.nn +.. py:module:: torch.distributed.nn.api +.. py:module:: torch.distributed.nn.jit +.. py:module:: torch.distributed.nn.jit.templates +.. py:module:: torch.distributed.pipeline +.. py:module:: torch.distributed.pipeline.sync +.. py:module:: torch.distributed.pipeline.sync.skip diff --git a/docs/source/fft.rst b/docs/source/fft.rst index 05f6215af513..5406b6610a60 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -7,8 +7,6 @@ torch.fft Discrete Fourier transforms and related functions. .. automodule:: torch.fft - :noindex: - .. currentmodule:: torch.fft Fast Fourier Transforms diff --git a/docs/source/fx.rst b/docs/source/fx.rst index 65689930743d..206b39c656f8 100644 --- a/docs/source/fx.rst +++ b/docs/source/fx.rst @@ -1109,3 +1109,13 @@ API Reference :members: .. autofunction:: torch.fx.replace_pattern + + +.. The experimental and passes submodules are missing docs. +.. Adding it here for coverage but this doesn't add anything to the +.. rendered doc. +.. py:module:: torch.fx.passes +.. py:module:: torch.fx.passes.tests +.. py:module:: torch.fx.experimental +.. py:module:: torch.fx.experimental.unification +.. py:module:: torch.fx.experimental.unification.multipledispatch diff --git a/docs/source/index.rst b/docs/source/index.rst index d307fee48647..f4642d49fd3c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -54,9 +54,9 @@ Features described in this documentation are classified by release status: tensors tensor_attributes tensor_view + torch.amp torch.autograd cuda - torch.cuda.amp torch.backends torch.distributed torch.distributed.algorithms.join @@ -84,6 +84,7 @@ Features described in this documentation are classified by release status: quantization rpc torch.random + nested sparse storage torch.testing @@ -99,16 +100,18 @@ Features described in this documentation are classified by release status: type_info named_tensor name_inference - torch.__config__ <__config__> + torch.__config__ .. toctree:: :maxdepth: 1 :caption: Libraries torchaudio + TorchData + TorchRec + TorchServe torchtext torchvision - TorchServe PyTorch on XLA Devices .. 
toctree:: diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 8a80b6471e1a..70c5f26c2842 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -61,6 +61,10 @@ Creating TorchScript Code ScriptFunction freeze optimize_for_inference + enable_onednn_fusion + onednn_fusion_enabled + set_fusion_strategy + strict_fusion save load ignore @@ -877,3 +881,7 @@ References jit_python_reference jit_unsupported + +.. This package is missing doc. Adding it here for coverage +.. This does not add anything to the rendered page. +.. py:module:: torch.jit.mobile diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index f7b232448fbf..3ac9d211f7dd 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -34,6 +34,7 @@ Decompositions cholesky qr + lu lu_factor eig eigvals @@ -95,6 +96,15 @@ Tensor Operations tensorinv tensorsolve +Misc +---- + +.. autosummary:: + :toctree: generated + :nosignatures: + + vander + Experimental Functions ---------------------- .. autosummary:: @@ -104,3 +114,6 @@ Experimental Functions cholesky_ex inv_ex lu_factor_ex + ldl_factor + ldl_factor_ex + ldl_solve diff --git a/docs/source/nested.rst b/docs/source/nested.rst new file mode 100644 index 000000000000..53a5446b4b52 --- /dev/null +++ b/docs/source/nested.rst @@ -0,0 +1,62 @@ +torch.nested +============ + +.. automodule:: torch.nested + +Introduction +++++++++++++ + +.. warning:: + + The PyTorch API of nested tensors is in prototype stage and will change in the near future. + +.. warning:: + + torch.NestedTensor currently does not support autograd. It needs to be used in the context + of torch.inference_mode(). + +NestedTensor allows the user to pack a list of Tensors into a single, efficient datastructure. + +The only constraint on the input Tensors is that their dimension must match. + +This enables more efficient metadata representations and operator coverage. + +Construction is straightforward and involves passing a list of Tensors to the constructor. + +>>> a, b = torch.arange(3), torch.arange(5) + 3 +>>> a +tensor([0, 1, 2]) +>>> b +tensor([3, 4, 5, 6, 7]) +>>> nt = torch.nested_tensor([a, b]) +>>> nt +nested_tensor([ + tensor([0, 1, 2]), + tensor([3, 4, 5, 6, 7]) + ]) + +Data type and device can be chosen via the usual keyword arguments + +>>> nt = torch.nested_tensor([a, b], dtype=torch.float32, device="cuda") +>>> nt +nested_tensor([ + tensor([0., 1., 2.], device='cuda:0'), + tensor([3., 4., 5., 6., 7.], device='cuda:0') +]) + + +Operator coverage ++++++++++++++++++ + +We are currently on our path to wholesale extend operator coverage guided by specific ML use cases. + +Operator coverage thus is currently very limited and only unbind is supported. + +>>> nt = torch.nested_tensor([a, b], dtype=torch.float32, device="cuda") +>>> nt +nested_tensor([ + tensor([0., 1., 2.], device='cuda:0'), + tensor([3., 4., 5., 6., 7.], device='cuda:0') +]) +>>> nt.unbind() +[tensor([0., 1., 2.], device='cuda:0'), tensor([3., 4., 5., 6., 7.], device='cuda:0')] diff --git a/docs/source/nn.init.rst b/docs/source/nn.init.rst index 56179d30bebf..a980f16f5f6d 100644 --- a/docs/source/nn.init.rst +++ b/docs/source/nn.init.rst @@ -19,5 +19,6 @@ torch.nn.init .. autofunction:: xavier_normal_ .. autofunction:: kaiming_uniform_ .. autofunction:: kaiming_normal_ +.. autofunction:: trunc_normal_ .. autofunction:: orthogonal_ .. 
autofunction:: sparse_ diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 6eca9d4b16b6..571af54818e2 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -3,6 +3,8 @@ torch.nn =================================== +.. automodule:: torch.nn +.. automodule:: torch.nn.modules These are the basic building blocks for graphs: @@ -331,6 +333,8 @@ Shuffle Layers DataParallel Layers (multi-GPU, distributed) -------------------------------------------- +.. automodule:: torch.nn.parallel +.. currentmodule:: torch .. autosummary:: :toctree: generated @@ -342,6 +346,7 @@ DataParallel Layers (multi-GPU, distributed) Utilities --------- +.. automodule:: torch.nn.utils From the ``torch.nn.utils`` module @@ -416,6 +421,14 @@ for more information on how to implement your own parametrizations. parametrize.ParametrizationList +Utility functions to calls a given Module in a stateless manner. + +.. autosummary:: + :toctree: generated + :nosignatures: + + stateless.functional_call + Utility functions in other modules .. currentmodule:: torch @@ -453,3 +466,8 @@ Lazy Modules Initialization :template: classtemplate.rst nn.modules.lazy.LazyModuleMixin + + +.. This module is kept only for backward compatibility +.. py:module:: torch.nn.backends +.. py:module:: torch.nn.utils.stateless diff --git a/docs/source/notes/amp_examples.rst b/docs/source/notes/amp_examples.rst index 90cda473cb29..b6bcc38bc0f3 100644 --- a/docs/source/notes/amp_examples.rst +++ b/docs/source/notes/amp_examples.rst @@ -1,7 +1,7 @@ .. _amp-examples: -Automatic Mixed Precision examples -================================== +CUDA Automatic Mixed Precision examples +======================================= .. currentmodule:: torch.cuda.amp diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst index 936c0f9eddd7..9fe4551806a6 100644 --- a/docs/source/notes/autograd.rst +++ b/docs/source/notes/autograd.rst @@ -87,6 +87,22 @@ subject to change and that users should not rely on. You can control how PyTorch does packing / unpacking with :ref:`saved-tensors-hooks-doc`. +.. _non-differentiable-func-grad: + +Gradients for non-differentiable functions +------------------------------------------ + +The gradient computation using Automatic Differentiation is only valid when each elementary function being used is differentiable. +Unfortunately many of the function we use in practice do not have this property (relu or sqrt at 0 for example). +And even though we cannot always guarantee that the returned gradient will be correct. For example :math:`f(x) = x = \text{relu}(x) - \text{relu}(-x)` will give a 0 gradient at 0 instead of 1 for any value we choose for the gradient of relu at 0. +To try and reduce the impact of this limitation, we define the gradients of the elementary operations by applying the following rules in order: + +#. If the function is differentiable and thus a gradient exists at the current point, use it. +#. If the function is convex (at least locally), use the sub-gradient with minimum norm (as it the steepest descent direction, see Exercise 2.7 from "Convex Optimization Algorithms" by Bertsekas, D. P and "Steepest Descent for Optimization Problems with Nondifferentiable Cost Functionals" by Bertsekas, D. P, and Mitter, S. K., 1971. for details and proofs). +#. If the function is concave (at least locally), use the super-gradient with minimum norm (using a similar argument as above). +#. 
If the function is defined, define the gradient at the current point by continuity (note that :math:`inf` is possible here, for example, :math:`sqrt(0)`). If multiple values are possible, pick one arbitrarily. +#. If the function is not defined (:math:`\sqrt(-1)`, :math:`\log(-1)` or most functions when the input is :math:`nan` for example) then the value used as the gradient is arbitrary (we might also raise an error but that is not guaranteed). Most functions will use :math:`nan` as the gradient, but for performance reasons, some functions will use non-:math:`nan` values (:math:`\log(-1)` for example). + .. _locally-disable-grad-doc: Locally disabling gradient computation @@ -222,7 +238,7 @@ Evaluation Mode (``nn.Module.eval()``) Evaluation mode is not actually a mechanism to locally disable gradient computation. It is included here anyway because it is sometimes confused to be such a mechanism. -Functionally, ``module.eval()`` (or equivalently ``module.train()``) are completely +Functionally, ``module.eval()`` (or equivalently ``module.train(False)``) are completely orthogonal to no-grad mode and inference mode. How ``model.eval()`` affects your model depends entirely on the specific modules used in your model and whether they define any training-mode specific behavior. @@ -278,8 +294,8 @@ Multithreaded Autograd The autograd engine is responsible for running all the backward operations necessary to compute the backward pass. This section will describe all the details -that can help you make the best use of it in a multithreaded environment.(this is -relevant only for PyTorch 1.6+ as the behavior in previous version was different). +that can help you make the best use of it in a multithreaded environment. (This is +relevant only for PyTorch 1.6+ as the behavior in previous version was different.) User could train their model with multithreading code (e.g. Hogwild training), and does not block on the concurrent backward computations, example code could be: @@ -352,9 +368,9 @@ Since Autograd allows the caller thread to drive its backward execution for potential parallelism, it's important that we ensure thread safety on CPU with parallel backwards that share part/whole of the GraphTask. -Custom Python ``autograd.function`` is automatically thread safe because of GIL. -for built-in C++ Autograd Nodes(e.g. AccumulateGrad, CopySlices) and custom -``autograd::Function``, the Autograd Engine uses thread mutex locking to protect +Custom Python ``autograd.Function`` is automatically thread safe because of GIL. +For built-in C++ Autograd Nodes (e.g. AccumulateGrad, CopySlices) and custom +``autograd::Function``\s, the Autograd Engine uses thread mutex locking to ensure thread safety on autograd Nodes that might have state write/read. No thread safety on C++ hooks diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index b2901a6fe336..59eb7d4c72b6 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -364,6 +364,26 @@ Available options: :meth:`~torch.cuda.memory_summary` methods are useful for tuning. This option should be used as a last resort for a workload that is aborting due to 'out of memory' and showing a large amount of inactive split blocks. +* ``roundup_power2_divisions`` helps with rounding the requested allocation + size to nearest power-2 division and making better use of the blocks. In + the current CUDACachingAllocator, the sizes are rounded up in multiple + of blocks size of 512, so this works fine for smaller sizes. 
However, this + can be inefficient for large near-by allocations as each will go to different + size of blocks and re-use of those blocks are minimized. This might create + lots of unused blocks and will waste GPU memory capacity. This option enables + the rounding of allocation size to nearest power-2 division. For example, if + we need to round-up size of 1200 and if number of divisions is 4, + the size 1200 lies between 1024 and 2048 and if we do 4 divisions between + them, the values are 1024, 1280, 1536, and 1792. So, allocation size of 1200 + will be rounded to 1280 as the nearest ceiling of power-2 division. +* ``garbage_collection_threshold`` helps actively reclaiming unused GPU memory to + avoid triggering expensive sync-and-reclaim-all operation (release_cached_blocks), + which can be unfavorable to latency-critical GPU applications (e.g., servers). + Upon setting this threshold (e.g., 0.8), the allocator will start reclaiming + GPU memory blocks if the GPU memory capacity usage exceeds the threshold (i.e., + 80% of the total memory allocated to the GPU application). The algorithm prefers + to free old & unused blocks first to avoid freeing blocks that are actively being + reused. The threshold value should be between greater than 0.0 and less than 1.0. .. _cufft-plan-cache: diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst index ccc76a8a0d55..dbeb135d6e2a 100644 --- a/docs/source/notes/extending.rst +++ b/docs/source/notes/extending.rst @@ -54,7 +54,8 @@ Take the following steps: 1. Subclass :class:`~Function` and implement the :meth:`~Function.forward` and :meth:`~Function.backward` methods. 2. Call the proper methods on the `ctx` argument. -3. Declare whether your function supports double backward. +3. Declare whether your function supports +`double backward `_. 4. Validate whether your gradients are correct using gradcheck. **Step 1:** After subclassing :class:`Function`, you'll need to define 2 methods: @@ -354,7 +355,7 @@ Extending :mod:`torch` with a :class:`Tensor`-like type .. note:: This functionality is inspired by the NumPy ``__array_function__`` protocol. See `the NumPy documentation - `_ + `_ and `NEP-0018 `_ for more details. diff --git a/docs/source/notes/mps.rst b/docs/source/notes/mps.rst new file mode 100644 index 000000000000..6ad44ba97714 --- /dev/null +++ b/docs/source/notes/mps.rst @@ -0,0 +1,40 @@ +.. _MPS-Backend: + +MPS backend +=========== + +:mod:`mps` device enables high-performance +training on GPU for MacOS devices with Metal programming framework. It +introduces a new device to map Machine Learning computational graphs and +primitives on highly efficient Metal Performance Shaders Graph framework and +tuned kernels provided by Metal Performance Shaders framework respectively. + +The new MPS backend extends the PyTorch ecosystem and provides existing scripts +capabilities to setup and run operations on GPU. + +To get started, simply move your Tensor and Module to the ``mps`` device: + +.. 
code:: + + # Make sure the current PyTorch binary was built with MPS enabled + print(torch.backends.mps.is_built()) + # And that the current hardware and MacOS version are sufficient to + # be able to use MPS + print(torch.backends.mps.is_available()) + + mps_device = torch.device("mps") + + # Create a Tensor directly on the mps device + x = torch.ones(5, device=mps_device) + # Or + x = torch.ones(5, device="mps") + + # Any operation happens on the GPU + y = x * 2 + + # Move your model to mps just like any other device + model = YourFavoriteNet() + model.to(mps_device) + + # Now every call runs on the GPU + pred = model(x) diff --git a/docs/source/notes/numerical_accuracy.rst b/docs/source/notes/numerical_accuracy.rst index 49d21c516b96..c952fb1f7c59 100644 --- a/docs/source/notes/numerical_accuracy.rst +++ b/docs/source/notes/numerical_accuracy.rst @@ -10,7 +10,7 @@ In particular, note that floating point provides limited accuracy (about 7 decim for single precision floating point numbers, about 16 decimal digits for double precision floating point numbers) and that floating point addition and multiplication are not associative, so the order of the operations affects the results. -Because of this, pytorch is not guaranteed +Because of this, PyTorch is not guaranteed to produce bitwise identical results for floating point computations that are mathematically identical. Similarly, bitwise identical results are not guaranteed across PyTorch releases, individual commits, or different platforms. In particular, CPU and GPU @@ -20,12 +20,12 @@ the sources of randomness. Batched computations or slice computations ------------------------------------------ -Many operations in pytorch support batched computation, where the same operation is performed +Many operations in PyTorch support batched computation, where the same operation is performed for the elements of the batches of inputs. An example of this is :meth:`torch.mm` and :meth:`torch.bmm`. It is possible to implement batched computation as a loop over batch elements, and apply the necessary math operations to the individual batch elements, for efficiency reasons we are not doing that, and typically perform computation for the whole batch. The mathematical -libraries that we are calling, and pytorch internal implementations of operations can produces +libraries that we are calling, and PyTorch internal implementations of operations can produces slightly different results in this case, compared to non-batched computations. In particular, let ``A`` and ``B`` be 3D tensors with the dimensions suitable for batched matrix multiplication. Then ``(A@B)[0]`` (the first element of the batched result) is not guaranteed to be bitwise @@ -54,7 +54,7 @@ datatype. E.g.: TensorFloat-32(TF32) on Nvidia Ampere devices --------------------------------------------- -On Ampere Nvidia GPUs, pytorch by default uses TensorFloat32 (TF32) to speed up mathematically +On Ampere Nvidia GPUs, PyTorch by default uses TensorFloat32 (TF32) to speed up mathematically intensive operations, in particular matrix multiplications and convolutions. When operation is performed using TF32 tensor cores, only the first 10 bits of the input mantissa are read. 
This leads to less accurate results, and surprising results such as multiplying a matrix by identity matrix produces @@ -72,3 +72,50 @@ If reduced-precision reductions are problematic, they can be turned off with ``torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False`` For more information see :ref:`allow_fp16_reduced_precision_reduction` + +.. _fp16_on_mi200: + +Reduced Precision FP16 and BF16 GEMMs and Convolutions on AMD Instinct MI200 devices +------------------------------------------------------------------------------------ +On AMD Instinct MI200 GPUs, the FP16 and BF16 V_DOT2 and MFMA matrix instructions flush input and output denormal values to zero. FP32 and FP64 MFMA matrix instructions do not flush input and output denormal values to zero. The affected instructions are only used by rocBLAS (GEMM) and MIOpen (convolution) kernels; all other PyTorch operations will not encounter this behavior. All other supported AMD GPUs will not encounter this behavior. + +rocBLAS and MIOpen provide alternate implementations for affected FP16 operations. Alternate implementations for BF16 operations are not provided; BF16 numbers have a larger dynamic range than FP16 numbers and are less likely to encounter denormal values. For the FP16 alternate implementations, FP16 input values are cast to an intermediate BF16 value and then cast back to FP16 output after the accumulate FP32 operations. In this way, the input and output types are unchanged. + +When training using FP16 precision, some models may fail to converge with FP16 denorms flushed to zero. Denormal values more frequently occur in the backward pass of training during gradient calculation. PyTorch by default will use the rocBLAS and MIOpen alternate implementations during the backward pass. The default behavior can be overridden using environment variables, ROCBLAS_INTERNAL_FP16_ALT_IMPL and MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL. The behavior of these environment variables is as follows: + ++---------------+-----------+-----------+ +| | forward | backward | ++===============+===========+===========+ +| Env unset | original | alternate | ++---------------+-----------+-----------+ +| Env set to 1 | alternate | alternate | ++---------------+-----------+-----------+ +| Env set to 0 | original | original | ++---------------+-----------+-----------+ + +The following is the list of operations where rocBLAS may be used: + +* torch.addbmm +* torch.addmm +* torch.baddbmm +* torch.bmm +* torch.mm +* torch.nn.GRUCell +* torch.nn.LSTMCell +* torch.nn.Linear +* torch.sparse.addmm +* the following torch._C._ConvBackend implementations: + + * slowNd + * slowNd_transposed + * slowNd_dilated + * slowNd_dilated_transposed + +The following is the list of operations where MIOpen may be used: + +* torch.nn.Conv[Transpose]Nd +* the following torch._C._ConvBackend implementations: + + * ConvBackend::Miopen + * ConvBackend::MiopenDepthwise + * ConvBackend::MiopenTranspose diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index be78d7d3caa5..5ed8d2aebd0b 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -130,9 +130,9 @@ a :class:`torch.nn.Module`. If the passed-in model is not already a ``ScriptModu of different sizes. To use scripting: * Use :func:`torch.jit.script` to produce a ``ScriptModule``. - * Call ``torch.onnx.export()`` with the ``ScriptModule`` as the model, and set the - ``example_outputs`` arg. This is required so that the types and shapes of the outputs can be - captured without executing the model. 
+ * Call ``torch.onnx.export()`` with the ``ScriptModule`` as the model. The ``args`` are still required, + but they will be used internally only to produce example outputs, so that the types and shapes of the + outputs can be captured. No tracing will be performed. See `Introduction to TorchScript `_ and `TorchScript `_ for more details, including how to compose tracing and scripting to suit the @@ -332,19 +332,32 @@ The process for adding a symbolic function depends on the type of operator. ATen operators ^^^^^^^^^^^^^^ - `ATen `_ is PyTorch’s built-in tensor library. If the operator is an ATen operator (shows up in the TorchScript graph with the prefix -``aten::``): +``aten::``), make sure it is not supported already. + +List of supported operators +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Visit the auto generated :doc:`list of supported ATen operators <../onnx_supported_aten_ops>` +for details on which operator are supported in each ``opset_version``. + +Adding support for an operator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If the operator is not in the list above: * Define the symbolic function in ``torch/onnx/symbolic_opset.py``, for example `torch/onnx/symbolic_opset9.py `_. Make sure the function has the same name as the ATen function, which may be declared in ``torch/_C/_VariableFunctions.pyi`` or ``torch/nn/functional.pyi`` (these files are generated at build time, so will not appear in your checkout until you build PyTorch). -* The first arg is always the ONNX graph that is being built for export. +* By default, the first arg is the ONNX graph. Other arg names must EXACTLY match the names in the ``.pyi`` file, because dispatch is done with keyword arguments. +* A symbolic function that has a first arg (before the Graph object) with the + type annotation of torch.onnx.SymbolicContext will be called with that additional context. + See examples below. * In the symbolic function, if the operator is in the `ONNX standard operator set `_, we only need to create a node to represent the ONNX operator in the graph. @@ -421,8 +434,8 @@ PythonOp Symbolic Alternatively, you can register a custom symbolic function. This gives the symbolic function access to more info through the -TorchScript ``Node`` object for the original operation, which gets passed in as the second -argument (after the ``Graph`` object). +``torch.onnx.SymbolicContext`` object, which gets passed in as the first +argument (before the ``Graph`` object). All autograd ``Function``\ s appear in the TorchScript graph as ``prim::PythonOp`` nodes. In order to differentiate between different ``Function`` subclasses, the @@ -449,7 +462,8 @@ The example below shows how you can access ``requires_grad`` via the ``Node`` ob ctx.save_for_backward(input) return input.clamp(min=0) - def symbolic_python_op(g: torch._C.Graph, n: torch._C.Node, *args, **kwargs): + def symbolic_python_op(ctx: torch.onnx.SymbolicContext, g: torch._C.Graph, *args, **kwargs): + n = ctx.cur_node print("original node: ", n) for i, out in enumerate(n.outputs()): print("original output {}: {}, requires grad: {}".format(i, out, out.requiresGrad())) @@ -583,10 +597,29 @@ Q: Are lists of Tensors exportable to ONNX? Yes, for ``opset_version`` >= 11, since ONNX introduced the Sequence type in opset 11. +Contributing / developing +------------------------- +`Developer docs `_. + Functions --------------------------- +--------- .. autofunction:: export .. autofunction:: export_to_pretty_string .. autofunction:: register_custom_op_symbolic .. autofunction:: select_model_mode_for_export .. 
autofunction:: is_in_onnx_export +.. autofunction:: is_onnx_log_enabled +.. autofunction:: enable_log +.. autofunction:: disable_log +.. autofunction:: set_log_stream +.. autofunction:: log + +Classes +------- + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + SymbolicContext diff --git a/docs/source/onnx_supported_aten_ops.rst b/docs/source/onnx_supported_aten_ops.rst new file mode 100644 index 000000000000..d6bf535e2e7e --- /dev/null +++ b/docs/source/onnx_supported_aten_ops.rst @@ -0,0 +1,14 @@ +:orphan: + +ONNX supported ATen operators +============================= + +This file is automatically generated during the documentation build +by cross referencing ONNX operator symbolics with Torch JIT operators via +``docs/source/scripts/build_onnx_supported_aten_op_csv_table.py``. +Do not modify directly and instead `rebuild the docs `_. + +.. csv-table:: Supported ATen operators + :file: ../build/auto_gen_aten_op_list.csv + :widths: 30, 70 + :header-rows: 1 diff --git a/docs/source/optim.rst b/docs/source/optim.rst index 62a293dec5ec..73c4d742900d 100644 --- a/docs/source/optim.rst +++ b/docs/source/optim.rst @@ -16,15 +16,6 @@ To construct an :class:`Optimizer` you have to give it an iterable containing th parameters (all should be :class:`~torch.autograd.Variable` s) to optimize. Then, you can specify optimizer-specific options such as the learning rate, weight decay, etc. -.. note:: - - If you need to move a model to GPU via ``.cuda()``, please do so before - constructing optimizers for it. Parameters of a model after ``.cuda()`` will - be different objects with those before the call. - - In general, you should make sure that optimized parameters live in - consistent locations when optimizers are constructed and used. - Example:: optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) diff --git a/docs/source/package.rst b/docs/source/package.rst index c7881f196140..9664460ac96a 100644 --- a/docs/source/package.rst +++ b/docs/source/package.rst @@ -1,3 +1,6 @@ +.. automodule:: torch.package +.. py:module:: torch.package.analyze + .. currentmodule:: torch.package torch.package @@ -13,7 +16,7 @@ will help you learn more about ``torch.package`` and how to use it. .. warning:: - This module depends on the ``pickle`` module which is is not secure. Only unpackage data you trust. + This module depends on the ``pickle`` module which is not secure. Only unpackage data you trust. It is possible to construct malicious pickle data which will **execute arbitrary code during unpickling**. Never unpackage data that could have come from an untrusted source, or that could have been tampered with. @@ -376,7 +379,7 @@ API for accessing resources from inside a package. :: with PackageExporter(f) as exporter: - # saves text to one/a.txt in the archive + # saves text to my_resource/a.txt in the archive exporter.save_text("my_resource", "a.txt", "hello world!") # saves the tensor to my_pickle/obj.pkl exporter.save_pickle("my_pickle", "obj.pkl", torch.ones(2, 2)) diff --git a/docs/source/quantization-accuracy-debugging.rst b/docs/source/quantization-accuracy-debugging.rst new file mode 100644 index 000000000000..69bda8706cc6 --- /dev/null +++ b/docs/source/quantization-accuracy-debugging.rst @@ -0,0 +1,98 @@ +Quantization Accuracy Debugging +------------------------------- + +This document provides high level strategies for improving quantization +accuracy. If a quantized model has error compared to the original model, +we can categorize the error into: + +1. 
**data insensitive error** - caused by intrinsic model quantization error, + large portion of input data has large errror +2. **data sensitive error** - caused by outlier input data, small + portion of input data has large error +3. **implementation error** - quantized kernel is not matching reference implementation + +Data insensitive error +~~~~~~~~~~~~~~~~~~~~~~ + +General tips +^^^^^^^^^^^^ + +1. For PTQ, ensure that the data you are calibrating with is representative + of your dataset. For example, for a classification problem a general + guideline is to have multiple samples in every category, and the overall + number of samples should be at least 100. There is no penalty for + calibrating with more data other than calibration time. +2. If your model has Conv-BN or Linear-BN patterns, consider fusing them. + If you are using FX graph mode quantization, this is done automatically + by the workflow. If you are using Eager mode quantization, you can do + this manually with the ``torch.ao.quantization.fuse_modules`` API. +3. Increase the precision of dtype of the problematic ops. Usually, fp32 + will have the highest accuracy, followed by fp16, followed by dynamically + quantized int8, followed by statically quantized int8. + + 1. Note: this is trading off performance for accuracy. + 2. Note: availability of kernels per dtype per op can vary by backend. + 3. Note: dtype conversions add an additional performance cost. For example, + ``fp32_op -> quant -> int8_op -> dequant -> fp32_op -> quant -> int8_op -> dequant`` + will have a performance penalty compared to + ``fp32_op -> fp32_op -> quant -> int8_op -> int8_op -> dequant`` + because of a higher number of required dtype conversions. + +4. If you are using PTQ, consider using QAT to recover some of the accuracy loss + from quantization. + +Int8 quantization tips +^^^^^^^^^^^^^^^^^^^^^^ + +1. If you are using per-tensor weight quantization, consider using per-channel + weight quantization. +2. If you are doing inference on `fbgemm`, ensure that you set the `reduce_range` + argument to `False` if your CPU is Cooperlake or newer, and to `True` otherwise. +3. Audit the input activation distribution variation across different samples. + If this variation is high, the layer may be suitable for dynamic quantization + but not static quantization. + +Data sensitive error +~~~~~~~~~~~~~~~~~~~~ + +If you are using static quantization and a small portion of your input data is +resulting in high quantization error, you can try: + +1. Adjust your calibration dataset to make it more representative of your + inference dataset. +2. Manually inspect (using Numeric Suite) which layers have high quantization + error. For these layers, consider leaving them in floating point or adjusting + the observer settings to choose a better scale and zero_point. + + +Implementation error +~~~~~~~~~~~~~~~~~~~~ + +If you are using PyTorch quantization with your own backend +you may see differences between the reference implementation of an +operation (such as ``dequant -> op_fp32 -> quant``) and the quantized implementation +(such as `op_int8`) of the op on the target hardware. This could mean one of two things: + +1. the differences (usually small) are expected due to specific behavior of + the target kernel on the target hardware compared to fp32/cpu. An example of this + is accumulating in an integer dtype. Unless the kernel guarantees bitwise + equivalency with the reference implementation, this is expected. +2. 
the kernel on the target hardware has an accuracy issue. In this case, reach + out to the kernel developer. + +Numerical Debugging Tooling (prototype) +--------------------------------------- + +.. toctree:: + :hidden: + + torch.ao.ns._numeric_suite + torch.ao.ns._numeric_suite_fx + +.. warning :: + Numerical debugging tooling is early prototype and subject to change. + +* :ref:`torch_ao_ns_numeric_suite` + Eager mode numeric suite +* :ref:`torch_ao_ns_numeric_suite_fx` + FX numeric suite diff --git a/docs/source/quantization-backend-configuration.rst b/docs/source/quantization-backend-configuration.rst new file mode 100644 index 000000000000..07fd875fa9b3 --- /dev/null +++ b/docs/source/quantization-backend-configuration.rst @@ -0,0 +1,20 @@ +Quantization Backend Configuration +---------------------------------- + +FX Graph Mode Quantization allows the user to configure various +quantization behaviors of an op in order to match the expectation +of their backend. + +In the future, this document will contain a detailed spec of +these configurations. + + +Default values for native configurations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Below is the output of the configuration for quantization of ops +in fbgemm and qnnpack (PyTorch's default quantized backends). + +Results: + +.. literalinclude:: scripts/quantization_backend_configs/default_backend_config.txt diff --git a/docs/source/quantization-support.rst b/docs/source/quantization-support.rst index 78c5ea247c48..da6649a2fee3 100644 --- a/docs/source/quantization-support.rst +++ b/docs/source/quantization-support.rst @@ -217,6 +217,8 @@ to configure quantization settings for individual ops. torch.nn.intrinsic ~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic +.. automodule:: torch.nn.intrinsic.modules This module implements the combined (fused) modules conv + relu which can then be quantized. @@ -243,6 +245,9 @@ then be quantized. torch.nn.intrinsic.qat ~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic.qat +.. automodule:: torch.nn.intrinsic.qat.modules + This module implements the versions of those fused operations needed for quantization aware training. @@ -268,6 +273,9 @@ quantization aware training. torch.nn.intrinsic.quantized ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic.quantized +.. automodule:: torch.nn.intrinsic.quantized.modules + This module implements the quantized implementations of fused operations like conv + relu. No BatchNorm variants as it's usually folded into convolution @@ -289,6 +297,8 @@ for inference. torch.nn.intrinsic.quantized.dynamic ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.intrinsic.quantized.dynamic +.. automodule:: torch.nn.intrinsic.quantized.dynamic.modules This module implements the quantized dynamic implementations of fused operations like linear + relu. @@ -304,6 +314,8 @@ like linear + relu. torch.nn.qat ~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.qat +.. automodule:: torch.nn.qat.modules This module implements versions of the key nn modules **Conv2d()** and **Linear()** which run in FP32 but with rounding applied to simulate the @@ -322,6 +334,8 @@ effect of INT8 quantization. torch.nn.qat.dynamic ~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.qat.dynamic +.. automodule:: torch.nn.qat.dynamic.modules This module implements versions of the key nn modules such as **Linear()** which run in FP32 but with rounding applied to simulate the effect of INT8 @@ -338,6 +352,8 @@ quantization and will be dynamically quantized during inference. 
torch.nn.quantized ~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.quantized +.. automodule:: torch.nn.quantized.modules This module implements the quantized versions of the nn layers such as ~`torch.nn.Conv2d` and `torch.nn.ReLU`. @@ -376,6 +392,7 @@ This module implements the quantized versions of the nn layers such as torch.nn.quantized.functional ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.quantized.functional This module implements the quantized versions of the functional layers such as ~`torch.nn.functional.conv2d` and `torch.nn.functional.relu`. Note: @@ -413,6 +430,8 @@ This module implements the quantized versions of the functional layers such as torch.nn.quantized.dynamic ~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: torch.nn.quantized.dynamic +.. automodule:: torch.nn.quantized.dynamic.modules Dynamically quantized :class:`~torch.nn.Linear`, :class:`~torch.nn.LSTM`, :class:`~torch.nn.LSTMCell`, :class:`~torch.nn.GRUCell`, and @@ -492,3 +511,8 @@ the `custom operator mechanism `_ or fall back to eager mode quantization. - -The following table compares the differences between Eager Mode Quantization and FX Graph Mode Quantization: - -+-----------------+-------------------+-------------------+ -| |Eager Mode |FX Graph | -| |Quantization |Mode | -| | |Quantization | -+-----------------+-------------------+-------------------+ -|Release |beta |prototype | -|Status | | | -+-----------------+-------------------+-------------------+ -|Operator |Manual |Automatic | -|Fusion | | | -+-----------------+-------------------+-------------------+ -|Quant/DeQuant |Manual |Automatic | -|Placement | | | -+-----------------+-------------------+-------------------+ -|Quantizing |Supported |Supported | -|Modules | | | -+-----------------+-------------------+-------------------+ -|Quantizing |Manual |Automatic | -|Functionals/Torch| | | -|Ops | | | -+-----------------+-------------------+-------------------+ -|Support for |Limited Support |Fully | -|Customization | |Supported | -+-----------------+-------------------+-------------------+ -|Quantization Mode|Post Training |Post Training | -|Support |Quantization: |Quantization: | -| |Static, Dynamic, |Static, Dynamic, | -| |Weight Only |Weight Only | -| | | | -| |Quantiztion Aware |Quantiztion Aware | -| |Training: |Training: | -| |Static |Static | -+-----------------+-------------------+-------------------+ -|Input/Output |``torch.nn.Module``|``torch.nn.Module``| -|Model Type | |(May need some | -| | |refactors to make | -| | |the model | -| | |compatible with FX | -| | |Graph Mode | -| | |Quantization) | -+-----------------+-------------------+-------------------+ - - -There are three types of quantization supported: - -1. dynamic quantization (weights quantized with activations read/stored in - floating point and quantized for compute.) -2. static quantization (weights quantized, activations quantized, calibration - required post training) -3. static quantization aware training (weights quantized, activations quantized, - quantization numerics modeled during training) - -Please see our `Introduction to Quantization on Pytorch -`_ blog post -for a more comprehensive overview of the tradeoffs between these quantization -types. - -Operator coverage varies between dynamic and static quantization and is captured in the table below. -Note that for FX quantization, the corresponding functionals are also supported. 
- -+---------------------------+-------------------+--------------------+ -| |Static | Dynamic | -| |Quantization | Quantization | -+---------------------------+-------------------+--------------------+ -| | nn.Linear | | Y | | Y | -| | nn.Conv1d/2d/3d | | Y | | N | -+---------------------------+-------------------+--------------------+ -| | nn.LSTM | | N | | Y | -| | nn.GRU | | N | | Y | -+---------------------------+-------------------+--------------------+ -| | nn.RNNCell | | N | | Y | -| | nn.GRUCell | | N | | Y | -| | nn.LSTMCell | | N | | Y | -+---------------------------+-------------------+--------------------+ -|nn.EmbeddingBag | Y (activations | | -| | are in fp32) | Y | -+---------------------------+-------------------+--------------------+ -|nn.Embedding | Y | N | -+---------------------------+-------------------+--------------------+ -|nn.MultiheadAttention |Not Supported | Not supported | -+---------------------------+-------------------+--------------------+ -|Activations |Broadly supported | Un-changed, | -| | | computations | -| | | stay in fp32 | -+---------------------------+-------------------+--------------------+ - +----------------------------- Eager Mode Quantization ^^^^^^^^^^^^^^^^^^^^^^^ +For a general introduction to the quantization flow, including different types of quantization, please take a look at `General Quantization Flow`_. - -Dynamic Quantization -~~~~~~~~~~~~~~~~~~~~ +Post Training Dynamic Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is the simplest to apply form of quantization where the weights are quantized ahead of time but the activations are dynamically quantized during inference. This is used for situations where the model execution time is dominated by loading weights from memory rather than computing the matrix -multiplications. This is true for for LSTM and Transformer type models with +multiplications. This is true for LSTM and Transformer type models with small batch size. Diagram:: @@ -198,16 +98,17 @@ API example:: To learn more about dynamic quantization please see our `dynamic quantization tutorial `_. -Static Quantization -~~~~~~~~~~~~~~~~~~~ +Post Training Static Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Static quantization quantizes the weights and activations of the model. It +Post Training Static Quantization (PTQ static) quantizes the weights and activations of the model. It fuses activations into preceding layers where possible. It requires calibration with a representative dataset to determine optimal quantization -parameters for activations. Post Training Quantization is typically used when +parameters for activations. Post Training Static Quantization is typically used when both memory bandwidth and compute savings are important with CNNs being a -typical use case. Static quantization is also known as Post Training -Quantization or PTQ. +typical use case. + +We may need to modify the model before applying post training static quantization. Please see `Model Preparation for Eager Mode Static Quantization`_. Diagram:: @@ -288,18 +189,19 @@ API Example:: To learn more about static quantization, please see the `static quantization tutorial `_. -Quantization Aware Training -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Quantization Aware Training for Static Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Quantization Aware Training models the effects of quantization during training -allowing for higher accuracy compared to other quantization methods. 
During +Quantization Aware Training (QAT) models the effects of quantization during training +allowing for higher accuracy compared to other quantization methods. We can do QAT for static, dynamic or weight only quantization. During training, all calculations are done in floating point, with fake_quant modules modeling the effects of quantization by clamping and rounding to simulate the effects of INT8. After model conversion, weights and activations are quantized, and activations are fused into the preceding layer where possible. It is commonly used with CNNs and yields a higher accuracy -compared to static quantization. Quantization Aware Training is also known as -QAT. +compared to static quantization. + +We may need to modify the model before applying post training static quantization. Please see `Model Preparation for Eager Mode Static Quantization`_. Diagram:: @@ -383,33 +285,42 @@ To learn more about quantization aware training, please see the `QAT tutorial `_. -(Prototype) FX Graph Mode Quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Quantization types supported by FX Graph Mode can be classified in two ways: - -1. Post Training Quantization (apply quantization after training, quantization parameters are calculated based on sample calibration data) -2. Quantization Aware Training (simulate quantization during training so that the quantization parameters can be learned together with the model using training data) - -And then each of these two may include any or all of the following types: - -- Weight Only Quantization (only weight is statically quantized) -- Dynamic Quantization (weight is statically quantized, activation is dynamically quantized) -- Static Quantization (both weight and activations are statically quantized) +Model Preparation for Eager Mode Static Quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -These two ways of classification are independent, so theoretically we can have 6 different types of quantization. - -The supported quantization types in FX Graph Mode Quantization are: - -- Post Training Quantization +It is necessary to currently make some modifications to the model definition +prior to Eager mode quantization. This is because currently quantization works on a module +by module basis. Specifically, for all quantization techniques, the user needs to: - - Weight Only Quantization - - Dynamic Quantization - - Static Quantization +1. Convert any operations that require output requantization (and thus have + additional parameters) from functionals to module form (for example, + using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``). +2. Specify which parts of the model need to be quantized either by assigning + ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``. + For example, setting ``model.conv1.qconfig = None`` means that the + ``model.conv`` layer will not be quantized, and setting + ``model.linear1.qconfig = custom_qconfig`` means that the quantization + settings for ``model.linear1`` will be using ``custom_qconfig`` instead + of the global qconfig. -- Quantization Aware Training +For static quantization techniques which quantize activations, the user needs +to do the following in addition: - - Static Quantization +1. Specify where activations are quantized and de-quantized. This is done using + :class:`~torch.quantization.QuantStub` and + :class:`~torch.quantization.DeQuantStub` modules. +2. 
Use :class:`torch.nn.quantized.FloatFunctional` to wrap tensor operations + that require special handling for quantization into modules. Examples + are operations like ``add`` and ``cat`` which require special handling to + determine output quantization parameters. +3. Fuse modules: combine operations/modules into a single module to obtain + higher accuracy and performance. This is done using the + :func:`torch.quantization.fuse_modules` API, which takes in lists of modules + to be fused. We currently support the following fusions: + [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu] +(Prototype) FX Graph Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ There are multiple quantization types in post training quantization (weight only, dynamic and static) and the configuration is done through `qconfig_dict` (an argument of the `prepare_fx` function). @@ -472,30 +383,22 @@ Please see the following tutorials for more information about FX Graph Mode Quan - `FX Graph Mode Post Training Static Quantization `_ - `FX Graph Mode Post Training Dynamic Quantization `_ -Quantization API Reference ---------------------------- - -The :doc:`Quantization API Reference ` contains documentation -of quantization APIs, such as quantization passes, quantized tensor operations, -and supported quantized modules and functions. - -.. toctree:: - :hidden: - - quantization-support - torch.ao.ns._numeric_suite - torch.ao.ns._numeric_suite_fx +Quantization Stack +------------------------ +Quantization is the process to convert a floating point model to a quantized model. So at high level the quantization stack can be split into two parts: 1). The building blocks or abstractions for a quantized model 2). The building blocks or abstractions for the quantization flow that converts a floating point model to a quantized model -Quantized Tensors ---------------------------------------- +Quantized Model +^^^^^^^^^^^^^^^^^^^^^^^ +Quantized Tensor +~~~~~~~~~~~~~~~~~ +In order to do quantization in PyTorch, we need to be able to represent +quantized data in Tensors. A Quantized Tensor allows for storing +quantized data (represented as int8/uint8/int32) along with quantization +parameters like scale and zero\_point. Quantized Tensors allow for many +useful operations making quantized arithmetic easy, in addition to +allowing for serialization of data in a quantized format. -PyTorch supports both per tensor and per channel asymmetric linear -quantization. Per tensor means that all the values within the tensor are -scaled the same way. Per channel means that for each dimension, typically -the channel dimension of a tensor, the values -in the tensor are scaled and offset by a different value (effectively -the scale and offset become vectors). This allows for lesser error in converting tensors -to quantized values. +PyTorch supports both per tensor and per channel symmetric and asymmetric quantization. Per tensor means that all the values within the tensor are quantized the same way with the same quantization parameters. Per channel means that for each dimension, typically the channel dimension of a tensor, the values in the tensor are quantized with different quantization parameters. This allows for less error in converting tensors to quantized values since outlier values would only impact the channel it was in, instead of the entire Tensor. 
The mapping is performed by converting the floating point tensors using @@ -506,35 +409,243 @@ Note that, we ensure that zero in floating point is represented with no error after quantization, thereby ensuring that operations like padding do not cause additional quantization error. -In order to do quantization in PyTorch, we need to be able to represent -quantized data in Tensors. A Quantized Tensor allows for storing -quantized data (represented as int8/uint8/int32) along with quantization -parameters like scale and zero\_point. Quantized Tensors allow for many -useful operations making quantized arithmetic easy, in addition to -allowing for serialization of data in a quantized format. +Here are a few key attributes for quantized Tensor: -Natively supported backends ---------------------------- +* QScheme (torch.qscheme): a enum that specifies the way we quantize the Tensor -Today, PyTorch supports the following backends for running quantized operators efficiently: + * torch.per_tensor_affine + * torch.per_tensor_symmetric + * torch.per_channel_affine + * torch.per_channel_symmetric + +* dtype (torch.dtype): data type of the quantized Tensor + + * torch.quint8 + * torch.qint8 + * torch.qint32 + * torch.float16 -* x86 CPUs with AVX2 support or higher (without AVX2 some operations have - inefficient implementations), via `fbgemm` (``_). -* ARM CPUs (typically found in mobile/embedded devices), via - `qnnpack` (``_). +* quantization parameters (varies based on QScheme): parameters for the chosen way of quantization -The corresponding implementation is chosen automatically based on the PyTorch build mode, though users -have the option to override this by setting `torch.backends.quantization.engine` to `fbgemm` or `qnnpack`. + * torch.per_tensor_affine would have quantization parameters of -.. note:: + * scale (float) + * zero_point (int) + * torch.per_tensor_affine would have quantization parameters of - At the moment PyTorch doesn't provide quantized operator implementations on CUDA - - this is the direction for future work. Move the model to CPU in order to test the - quantized functionality. + * per_channel_scales (list of float) + * per_channel_zero_points (list of int) + * axis (int) - Quantization-aware training (through :class:`~torch.quantization.FakeQuantize`, - which emulates quantized numerics in fp32) supports both CPU and CUDA. +Quantize and Dequantize +~~~~~~~~~~~~~~~~~~~~~~~ +The input and output of a model are floating point Tensors, but activations in the quantized model are quantized, so we need operators to convert between floating point and quantized Tensors. + +* Quantize (float -> quantized) + + * torch.quantize_per_tensor(x, scale, zero_point, dtype) + * torch.quantize_per_channel(x, scales, zero_points, axis, dtype) + * torch.quantize_per_tensor_dynamic(x, dtype, reduce_range) + * to(torch.float16) + +* Dequantize (quantized -> float) + + * quantized_tensor.dequantize() - calling dequantize on a torch.float16 Tensor will convert the Tensor back to torch.float + * torch.dequantize(x) + +Quantized Operators/Modules +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* Quantized Operator are the operators that takes quantized Tensor as inputs, and outputs a quantized Tensor. +* Quantized Modules are PyTorch Modules that performs quantized operations. They are typically defined for weighted operations like linear and conv. 
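As a small illustration of the building blocks described above, the following sketch creates a quantized Tensor, dequantizes it back to float, and swaps a float ``nn.Linear`` for its dynamically quantized counterpart. The shapes and quantization parameters are arbitrary example values, and running the quantized module requires a build with a quantized engine (fbgemm or qnnpack) available.

.. code-block:: python

    import torch

    # Quantize a float Tensor per tensor: int8 storage plus scale/zero_point.
    x = torch.randn(2, 3)
    qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.qint8)
    print(qx.dtype, qx.q_scale(), qx.q_zero_point())

    # Dequantize back to a float Tensor.
    print(qx.dequantize())

    # A quantized module: replace nn.Linear with a dynamically quantized version
    # (int8 weights, activations quantized on the fly during inference).
    float_model = torch.nn.Sequential(torch.nn.Linear(3, 3), torch.nn.ReLU())
    qmodel = torch.quantization.quantize_dynamic(
        float_model, {torch.nn.Linear}, dtype=torch.qint8
    )
    print(qmodel(torch.randn(1, 3)))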
+
+Quantized Engine
+~~~~~~~~~~~~~~~~~~~~
+When a quantized model is executed, the qengine (torch.backends.quantized.engine) specifies which backend is to be used for execution. It is important to ensure that the qengine is compatible with the quantized model in terms of the value range of quantized activations and weights.
+
+Quantization Flow
+^^^^^^^^^^^^^^^^^^^^^^^
+Observer and FakeQuantize
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+* Observers are PyTorch Modules used to:
+
+  * collect tensor statistics like the min value and max value of the Tensor passing through the observer
+  * and calculate quantization parameters based on the collected tensor statistics
+* FakeQuantize modules are PyTorch Modules used to:
+
+  * simulate quantization (performing quantize/dequantize) for a Tensor in the network
+  * they can calculate quantization parameters based on the statistics collected by observers, or learn the quantization parameters as well
+
+QConfig
+~~~~~~~~~~~
+* QConfig is a namedtuple of Observer or FakeQuantize Module classes that are configurable with qscheme, dtype etc. It is used to configure how an operator should be observed
+
+  * Quantization configuration for an operator/module
+
+    * different types of Observer/FakeQuantize
+    * dtype
+    * qscheme
+    * quant_min/quant_max: can be used to simulate lower precision Tensors
+  * Currently supports configuration for activation and weight
+  * We insert input/weight/output observers based on the qconfig that is configured for a given operator or module
+
+General Quantization Flow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+In general, the flow is the following:
+
+* prepare
+
+  * insert Observer/FakeQuantize modules based on the user-specified qconfig
+
+* calibrate/train (depending on post training quantization or quantization aware training)
+
+  * allow Observers to collect statistics or FakeQuantize modules to learn the quantization parameters
+
+* convert
+
+  * convert a calibrated/trained model to a quantized model
+
+There are different modes of quantization, and they can be classified in two ways.
+
+In terms of where we apply the quantization flow, we have:
+
+1. Post Training Quantization (apply quantization after training; quantization parameters are calculated based on sample calibration data)
+2. Quantization Aware Training (simulate quantization during training so that the quantization parameters can be learned together with the model using training data)
+
+And in terms of how we quantize the operators, we can have:
+
+- Weight Only Quantization (only the weight is statically quantized)
+- Dynamic Quantization (weight is statically quantized, activation is dynamically quantized)
+- Static Quantization (both weight and activations are statically quantized)
+
+We can mix different ways of quantizing operators in the same quantization flow. For example, we can have post training quantization that has both statically and dynamically quantized operators.
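As a minimal sketch of this prepare/calibrate/convert flow, using eager mode post training static quantization and assuming the default ``fbgemm`` qconfig is available on the machine::

    import torch

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.quant = torch.ao.quantization.QuantStub()
            self.conv = torch.nn.Conv2d(1, 1, 1)
            self.relu = torch.nn.ReLU()
            self.dequant = torch.ao.quantization.DeQuantStub()

        def forward(self, x):
            x = self.quant(x)          # float -> quantized
            x = self.relu(self.conv(x))
            return self.dequant(x)     # quantized -> float

    model_fp32 = M().eval()
    model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')

    # prepare: insert Observers based on the qconfig
    prepared = torch.ao.quantization.prepare(model_fp32)

    # calibrate: run representative data so the Observers can collect statistics
    prepared(torch.randn(4, 1, 8, 8))

    # convert: swap observed modules for quantized modules
    model_int8 = torch.ao.quantization.convert(prepared)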
+
+Quantization Support Matrix
+--------------------------------------
+Quantization Mode Support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
++-----------------------------+------------------------------------------------------+----------------+----------------+------------+-----------------+
+|                             |Quantization                                          |Dataset         | Works Best For | Accuracy   | Notes           |
+|                             |Mode                                                  |Requirement     |                |            |                 |
++-----------------------------+---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|Post Training Quantization   |Dynamic/Weight Only Quantization |activation          |None            |LSTM, MLP,      |good        |Easy to use,     |
+|                             |                                 |dynamically         |                |Embedding,      |            |close to static  |
+|                             |                                 |quantized (fp16,    |                |Transformer     |            |quantization when|
+|                             |                                 |int8) or not        |                |                |            |performance is   |
+|                             |                                 |quantized, weight   |                |                |            |compute or memory|
+|                             |                                 |statically quantized|                |                |            |bound due to     |
+|                             |                                 |(fp16, int8, int4)  |                |                |            |weights          |
+|                             +---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|                             |Static Quantization              |activation and      |calibration     |CNN             |good        |Provides best    |
+|                             |                                 |weights statically  |dataset         |                |            |perf, may have   |
+|                             |                                 |quantized (int8)    |                |                |            |big impact on    |
+|                             |                                 |                    |                |                |            |accuracy, good   |
+|                             |                                 |                    |                |                |            |for hardware     |
+|                             |                                 |                    |                |                |            |that only support|
+|                             |                                 |                    |                |                |            |int8 computation |
++-----------------------------+---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|                             |Dynamic Quantization             |activation and      |fine-tuning     |MLP, Embedding  |best        |Limited support  |
+|                             |                                 |weight are fake     |dataset         |                |            |for now          |
+|                             |                                 |quantized           |                |                |            |                 |
+|                             +---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+|                             |Static Quantization              |activation and      |fine-tuning     |CNN, MLP,       |best        |Typically used   |
+|                             |                                 |weight are fake     |dataset         |Embedding       |            |when static      |
+|                             |                                 |quantized           |                |                |            |quantization     |
+|                             |                                 |                    |                |                |            |leads to bad     |
+|                             |                                 |                    |                |                |            |accuracy, and    |
+|                             |                                 |                    |                |                |            |used to close the|
+|                             |                                 |                    |                |                |            |accuracy gap     |
+|Quantization Aware Training  |                                 |                    |                |                |            |                 |
++-----------------------------+---------------------------------+--------------------+----------------+----------------+------------+-----------------+
+
+Please see our `Introduction to Quantization on Pytorch
+`_ blog post
+for a more comprehensive overview of the tradeoffs between these quantization
+types.
+
+Quantization Flow Support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+PyTorch provides two modes of quantization: Eager Mode Quantization and FX Graph Mode Quantization.
+
+Eager Mode Quantization is a beta feature. Users need to do fusion and specify where quantization and dequantization happen manually; it also only supports modules, not functionals.
+
+FX Graph Mode Quantization is an automated quantization framework in PyTorch, and currently it's a prototype feature. It improves upon Eager Mode Quantization by adding support for functionals and automating the quantization process, although people might need to refactor the model to make it compatible with FX Graph Mode Quantization (symbolically traceable with ``torch.fx``). Note that FX Graph Mode Quantization is not expected to work on arbitrary models since the model might not be symbolically traceable. We will integrate it into domain libraries like torchvision, and users will be able to quantize models similar to the ones in supported domain libraries with FX Graph Mode Quantization.
For arbitrary models we'll provide general guidelines, but to actually make it work, users might need to be familiar with ``torch.fx``, especially on how to make a model symbolically traceable. + +New users of quantization are encouraged to try out FX Graph Mode Quantization first, if it does not work, user may try to follow the guideline of `using FX Graph Mode Quantization `_ or fall back to eager mode quantization. + +The following table compares the differences between Eager Mode Quantization and FX Graph Mode Quantization: + ++-----------------+-------------------+-------------------+ +| |Eager Mode |FX Graph | +| |Quantization |Mode | +| | |Quantization | ++-----------------+-------------------+-------------------+ +|Release |beta |prototype | +|Status | | | ++-----------------+-------------------+-------------------+ +|Operator |Manual |Automatic | +|Fusion | | | ++-----------------+-------------------+-------------------+ +|Quant/DeQuant |Manual |Automatic | +|Placement | | | ++-----------------+-------------------+-------------------+ +|Quantizing |Supported |Supported | +|Modules | | | ++-----------------+-------------------+-------------------+ +|Quantizing |Manual |Automatic | +|Functionals/Torch| | | +|Ops | | | ++-----------------+-------------------+-------------------+ +|Support for |Limited Support |Fully | +|Customization | |Supported | ++-----------------+-------------------+-------------------+ +|Quantization Mode|Post Training |Post Training | +|Support |Quantization: |Quantization: | +| |Static, Dynamic, |Static, Dynamic, | +| |Weight Only |Weight Only | +| | | | +| |Quantization Aware |Quantization Aware | +| |Training: |Training: | +| |Static |Static | ++-----------------+-------------------+-------------------+ +|Input/Output |``torch.nn.Module``|``torch.nn.Module``| +|Model Type | |(May need some | +| | |refactors to make | +| | |the model | +| | |compatible with FX | +| | |Graph Mode | +| | |Quantization) | ++-----------------+-------------------+-------------------+ + +Backend/Hardware Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++-----------------+---------------+------------+------------+------------+ +|Hardware |Kernel Library |Eager Mode |FX Graph |Quantization| +| | |Quantization|Mode |Mode Support| +| | | |Quantization| | ++-----------------+---------------+------------+------------+------------+ +|server CPU |fbgemm |Supported |All | +| | | |Supported | ++-----------------+---------------+ | + +|mobile CPU |qnnpack/xnnpack| | | +| | | | | ++-----------------+---------------+------------+------------+------------+ +|server GPU |TensorRT (early|Not support |Supported |Static | +| |prototype) |this it | |Quantization| +| | |requries a | | | +| | |graph | | | ++-----------------+---------------+------------+------------+------------+ + +Today, PyTorch supports the following backends for running quantized operators efficiently: + +* x86 CPUs with AVX2 support or higher (without AVX2 some operations have inefficient implementations), via `fbgemm `_ +* ARM CPUs (typically found in mobile/embedded devices), via `qnnpack `_ +* (early prototype) support for NVidia GPU via `TensorRT `_ through `fx2trt` (to be open sourced) + + +Note for native CPU backends +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We expose both `fbgemm` and `qnnpack` with the same native pytorch quantized operators, so we need additional flag to distinguish between them. 
The corresponding implementation of `fbgemm` and `qnnpack` is chosen automatically based on the PyTorch build mode, though users have the option to override this by setting `torch.backends.quantization.engine` to `fbgemm` or `qnnpack`. When preparing a quantized model, it is necessary to ensure that qconfig and the engine used for quantized computations match the backend on which @@ -561,6 +672,74 @@ Default settings for qnnpack:: # set the qengine to control weight packing torch.backends.quantized.engine = 'qnnpack' +Operator Support +^^^^^^^^^^^^^^^^^^^^ + +Operator coverage varies between dynamic and static quantization and is captured in the table below. +Note that for FX Graph Mode Quantization, the corresponding functionals are also supported. + ++---------------------------+-------------------+--------------------+ +| |Static | Dynamic | +| |Quantization | Quantization | ++---------------------------+-------------------+--------------------+ +| | nn.Linear | | Y | | Y | +| | nn.Conv1d/2d/3d | | Y | | N | ++---------------------------+-------------------+--------------------+ +| | nn.LSTM | | N | | Y | +| | nn.GRU | | N | | Y | ++---------------------------+-------------------+--------------------+ +| | nn.RNNCell | | N | | Y | +| | nn.GRUCell | | N | | Y | +| | nn.LSTMCell | | N | | Y | ++---------------------------+-------------------+--------------------+ +|nn.EmbeddingBag | Y (activations | | +| | are in fp32) | Y | ++---------------------------+-------------------+--------------------+ +|nn.Embedding | Y | N | ++---------------------------+-------------------+--------------------+ +|nn.MultiheadAttention |Not Supported | Not supported | ++---------------------------+-------------------+--------------------+ +|Activations |Broadly supported | Un-changed, | +| | | computations | +| | | stay in fp32 | ++---------------------------+-------------------+--------------------+ + +Note: this will be updated with some information generated from native backend_config_dict soon. + +Quantization API Reference +--------------------------- + +The :doc:`Quantization API Reference ` contains documentation +of quantization APIs, such as quantization passes, quantized tensor operations, +and supported quantized modules and functions. + +.. toctree:: + :hidden: + + quantization-support + +Quantization Backend Configuration +---------------------------------- + +The :doc:`Quantization Backend Configuration ` contains documentation +on how to configure the quantization workflows for various backends. + +.. toctree:: + :hidden: + + quantization-backend-configuration + +Quantization Accuracy Debugging +------------------------------- + +The :doc:`Quantization Accuracy Debugging ` contains documentation +on how to debug quantization accuracy. + +.. toctree:: + :hidden: + + quantization-accuracy-debugging + Quantization Customizations --------------------------- @@ -710,46 +889,14 @@ Example:: mq = torch.quantization.quantize_fx.convert_fx( mp, convert_custom_config_dict=convert_custom_config_dict) -Model Preparation for Quantization (Eager Mode) ------------------------------------------------ - -It is necessary to currently make some modifications to the model definition -prior to Eager mode quantization. This is because currently quantization works on a module -by module basis. Specifically, for all quantization techniques, the user needs to: - -1. 
Convert any operations that require output requantization (and thus have
-   additional parameters) from functionals to module form (for example,
-   using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``).
-2. Specify which parts of the model need to be quantized either by assigning
-   ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``.
-   For example, setting ``model.conv1.qconfig = None`` means that the
-   ``model.conv`` layer will not be quantized, and setting
-   ``model.linear1.qconfig = custom_qconfig`` means that the quantization
-   settings for ``model.linear1`` will be using ``custom_qconfig`` instead
-   of the global qconfig.
-
-For static quantization techniques which quantize activations, the user needs
-to do the following in addition:
-
-1. Specify where activations are quantized and de-quantized. This is done using
-   :class:`~torch.quantization.QuantStub` and
-   :class:`~torch.quantization.DeQuantStub` modules.
-2. Use :class:`torch.nn.quantized.FloatFunctional` to wrap tensor operations
-   that require special handling for quantization into modules. Examples
-   are operations like ``add`` and ``cat`` which require special handling to
-   determine output quantization parameters.
-3. Fuse modules: combine operations/modules into a single module to obtain
-   higher accuracy and performance. This is done using the
-   :func:`torch.quantization.fuse_modules` API, which takes in lists of modules
-   to be fused. We currently support the following fusions:
-   [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu]
-
 Best Practices
 --------------
-1. Set the ``reduce_range`` argument on observers to `True` if you are using the
-   ``fbgemm`` backend. This argument prevents overflow on some int8 instructions
-   by reducing the range of quantized data type by 1 bit.
+1. If you are using the ``fbgemm`` backend, only 7 of the 8 bits of the quantized data type should be used.
+   Make sure you reduce the range for ``quant_min`` and ``quant_max``, e.g.:
+
+   * if ``dtype`` is ``torch.quint8``, set a custom ``quant_min`` of ``0`` and ``quant_max`` of ``127`` (``255`` / ``2``)
+   * if ``dtype`` is ``torch.qint8``, set a custom ``quant_min`` of ``-64`` (``-128`` / ``2``) and ``quant_max`` of ``63`` (``127`` / ``2``)
+
+   This is already set correctly if you call the `torch.ao.quantization.get_default_qconfig(backend)` or
+   `torch.ao.quantization.get_default_qat_qconfig(backend)` function to get the default ``qconfig`` for the
+   ``fbgemm`` or ``qnnpack`` backend.
 Common Errors
 ---------------------------------------
@@ -873,13 +1020,29 @@ An example:: b.seek(0) scripted_quantized = torch.jit.load(b)
-Numerical Debugging (prototype)
--------------------------------
-
-.. warning ::
-   Numerical debugging tooling is early prototype and subject to change.
- -* :ref:`torch_ao_ns_numeric_suite` - Eager mode numeric suite -* :ref:`torch_ao_ns_numeric_suite_fx` - FX numeric suite +Symbolic Trace Error when using FX Graph Mode Quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Symbolic traceability is a requirement for `(Prototype) FX Graph Mode Quantization`_, so if you pass a PyTorch Model that is not symbolically traceable to `torch.ao.quantization.prepare_fx` or `torch.ao.quantization.prepare_qat_fx`, we might see an error like the following:: + + torch.fx.proxy.TraceError: symbolically traced variables cannot be used as inputs to control flow + +Please take a look at `Limitations of Symbolic Tracing `_ and use - `User Guide on Using FX Graph Mode Quantization `_ to workaround the problem. + + +.. torch.ao is missing documentation. Since part of it is mentioned here, adding them here for now. +.. They are here for tracking purposes until they are more permanently fixed. +.. py:module:: torch.ao +.. py:module:: torch.ao.nn +.. py:module:: torch.ao.nn.sparse +.. py:module:: torch.ao.nn.sparse.quantized +.. py:module:: torch.ao.nn.sparse.quantized.dynamic +.. py:module:: torch.ao.ns +.. py:module:: torch.ao.ns.fx +.. py:module:: torch.ao.quantization +.. py:module:: torch.ao.quantization.fx +.. py:module:: torch.ao.quantization.backend_config +.. py:module:: torch.ao.sparsity +.. py:module:: torch.ao.sparsity.experimental +.. py:module:: torch.ao.sparsity.experimental.pruner +.. py:module:: torch.ao.sparsity.scheduler +.. py:module:: torch.ao.sparsity.sparsifier diff --git a/docs/source/rpc.rst b/docs/source/rpc.rst index 2e801f3b69ce..89f146bfd68e 100644 --- a/docs/source/rpc.rst +++ b/docs/source/rpc.rst @@ -190,6 +190,18 @@ Example:: :members: :inherited-members: +.. note :: + The RPC framework does not automatically retry any + :meth:`~torch.distributed.rpc.rpc_sync`, + :meth:`~torch.distributed.rpc.rpc_async` and + :meth:`~torch.distributed.rpc.remote` calls. The reason being that there is + no way the RPC framework can determine whether an operation is idempotent or + not and whether it is safe to retry. As a result, it is the application's + responsibility to deal with failures and retry if necessary. RPC communication + is based on TCP and as a result failures could happen due to network failures + or intermittent network connectivity issues. In such scenarios, the application + needs to retry appropriately with reasonable backoffs to ensure the network + isn't overwhelmed by aggressive retries. .. _rref: diff --git a/docs/source/scripts/build_onnx_supported_aten_op_csv_table.py b/docs/source/scripts/build_onnx_supported_aten_op_csv_table.py new file mode 100644 index 000000000000..7d12a441c440 --- /dev/null +++ b/docs/source/scripts/build_onnx_supported_aten_op_csv_table.py @@ -0,0 +1,21 @@ +""" +This script generates a CSV table with all ATen operators +supported by `torch.onnx.export`. The generated table is included by +docs/source/onnx_supported_aten_list.rst. 
+""" + +import os +from torch.onnx import onnx_supported_ops + +# Constants +BUILD_DIR = 'build' +AUTO_GEN_ATEN_OPS_CSV_FILE = 'auto_gen_aten_op_list.csv' + +os.makedirs(BUILD_DIR, exist_ok=True) + +aten_list = onnx_supported_ops.onnx_supported_ops() + +with open(os.path.join(BUILD_DIR, AUTO_GEN_ATEN_OPS_CSV_FILE), 'w') as f: + f.write('Operator,opset_version(s)\n') + for name, opset_version in aten_list: + f.write(f'"``{name}``","{opset_version}"\n') diff --git a/docs/source/scripts/build_quantization_configs.py b/docs/source/scripts/build_quantization_configs.py new file mode 100644 index 000000000000..6ab4fd433eff --- /dev/null +++ b/docs/source/scripts/build_quantization_configs.py @@ -0,0 +1,62 @@ +""" +This script will generate default values of quantization configs. +These are for use in the documentation. +""" + +import torch +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from torch.ao.quantization.backend_config.utils import ( + entry_to_pretty_str, + remove_boolean_dispatch_from_name, +) +import os.path + + +# Create a directory for the images, if it doesn't exist +QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH = os.path.join( + os.path.realpath(os.path.join(__file__, "..")), + "quantization_backend_configs" +) + +if not os.path.exists(QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH): + os.mkdir(QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH) + +output_path = os.path.join(QUANTIZATION_BACKEND_CONFIG_IMAGE_PATH, "default_backend_config.txt") + +with open(output_path, "w") as f: + native_backend_config_dict = get_native_backend_config_dict() + + configs = native_backend_config_dict['configs'] + + def _sort_key_func(entry): + pattern = entry['pattern'] + while isinstance(pattern, tuple): + pattern = pattern[-1] + + pattern = remove_boolean_dispatch_from_name(pattern) + if not isinstance(pattern, str): + # methods are already strings + pattern = torch.typename(pattern) + + # we want + # + # torch.nn.modules.pooling.AdaptiveAvgPool1d + # + # and + # + # torch._VariableFunctionsClass.adaptive_avg_pool1d + # + # to be next to each other, so convert to all lower case + # and remove the underscores, and compare the last part + # of the string + pattern_str_normalized = pattern.lower().replace('_', '') + key = pattern_str_normalized.split('.')[-1] + return key + + configs.sort(key=_sort_key_func) + + entries = [] + for entry in configs: + entries.append(entry_to_pretty_str(entry)) + entries = ",\n".join(entries) + f.write(entries) diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst index 178e4cb18603..564df4ef4323 100644 --- a/docs/source/sparse.rst +++ b/docs/source/sparse.rst @@ -1,3 +1,5 @@ +.. automodule:: torch.sparse + .. currentmodule:: torch .. _sparse-docs: diff --git a/docs/source/special.rst b/docs/source/special.rst index 1aa24242fad9..42acd2148a6a 100644 --- a/docs/source/special.rst +++ b/docs/source/special.rst @@ -7,8 +7,6 @@ torch.special The torch.special module, modeled after SciPy's `special `_ module. .. automodule:: torch.special - :noindex: - .. currentmodule:: torch.special Functions @@ -39,6 +37,7 @@ Functions .. autofunction:: multigammaln .. autofunction:: ndtr .. autofunction:: ndtri +.. autofunction:: log_ndtr .. autofunction:: round .. autofunction:: sinc .. 
autofunction:: softmax diff --git a/docs/source/storage.rst b/docs/source/storage.rst index 3aeec082b607..747acf11ed36 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -1,87 +1,96 @@ torch.Storage =================================== -A :class:`torch.Storage` is a contiguous, one-dimensional array of a single -data type. +A :class:`torch._TypedStorage` is a contiguous, one-dimensional array of +elements of a particular :class:`torch.dtype`. It can be given any +:class:`torch.dtype`, and the internal data will be interpretted appropriately. -Every :class:`torch.Tensor` has a corresponding storage of the same data type. +Every strided :class:`torch.Tensor` contains a :class:`torch._TypedStorage`, +which stores all of the data that the :class:`torch.Tensor` views. -.. autoclass:: torch.DoubleStorage +For backward compatibility, there are also :class:`torch.Storage` classes +(like :class:`torch.FloatStorage`, :class:`torch.IntStorage`, etc). These +classes are not actually instantiated, and calling their constructors creates +a :class:`torch._TypedStorage` with the appropriate :class:`torch.dtype`. +:class:`torch.Storage` classes have all of the same class methods that +:class:`torch._TypedStorage` has. + +Also for backward compatibility, :class:`torch.Storage` is an alias for the +storage class that corresponds with the default data type +(:func:`torch.get_default_dtype()`). For instance, if the default data type is +:attr:`torch.float`, :class:`torch.Storage` resolves to +:class:`torch.FloatStorage`. + + +.. autoclass:: torch._TypedStorage :members: :undoc-members: :inherited-members: +.. autoclass:: torch.DoubleStorage + :members: + :undoc-members: + .. autoclass:: torch.FloatStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.HalfStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.LongStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.IntStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ShortStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.CharStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ByteStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.BoolStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.BFloat16Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ComplexDoubleStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.ComplexFloatStorage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QUInt8Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QInt8Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QInt32Storage :members: :undoc-members: - :inherited-members: .. autoclass:: torch.QUInt4x2Storage :members: :undoc-members: - :inherited-members: + +.. autoclass:: torch.QUInt2x4Storage + :members: + :undoc-members: diff --git a/docs/source/tensor_attributes.rst b/docs/source/tensor_attributes.rst index e62eb29f6d4d..aa68e8f805fe 100644 --- a/docs/source/tensor_attributes.rst +++ b/docs/source/tensor_attributes.rst @@ -12,7 +12,7 @@ Each ``torch.Tensor`` has a :class:`torch.dtype`, :class:`torch.device`, and :cl torch.dtype ----------- -.. class:: torch.dtype +.. class:: dtype A :class:`torch.dtype` is an object that represents the data type of a :class:`torch.Tensor`. 
PyTorch has twelve different data types: @@ -134,7 +134,7 @@ Casting Examples:: torch.device ------------ -.. class:: torch.device +.. class:: device A :class:`torch.device` is an object representing the device on which a :class:`torch.Tensor` is or will be allocated. @@ -204,7 +204,7 @@ Via a string and device ordinal: torch.layout ------------ -.. class:: torch.layout +.. class:: layout .. warning:: The ``torch.layout`` class is in beta and subject to change. @@ -236,7 +236,7 @@ For more information on ``torch.sparse_coo`` tensors, see :ref:`sparse-docs`. torch.memory_format ------------------- -.. class:: torch.memory_format +.. class:: memory_format A :class:`torch.memory_format` is an object representing the memory format on which a :class:`torch.Tensor` is or will be allocated. diff --git a/docs/source/tensorboard.rst b/docs/source/tensorboard.rst index d3205e3ba589..8cd138369288 100644 --- a/docs/source/tensorboard.rst +++ b/docs/source/tensorboard.rst @@ -1,5 +1,6 @@ torch.utils.tensorboard =================================== +.. automodule:: torch.utils.tensorboard Before going further, more details on TensorBoard can be found at https://www.tensorflow.org/tensorboard/ diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 090824e0ee3c..e88c382df17e 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -21,8 +21,8 @@ Data type dtype 64-bit floating point ``torch.float64`` or ``torch.double`` :class:`torch.DoubleTensor` :class:`torch.cuda.DoubleTensor` 16-bit floating point [1]_ ``torch.float16`` or ``torch.half`` :class:`torch.HalfTensor` :class:`torch.cuda.HalfTensor` 16-bit floating point [2]_ ``torch.bfloat16`` :class:`torch.BFloat16Tensor` :class:`torch.cuda.BFloat16Tensor` -32-bit complex ``torch.complex32`` -64-bit complex ``torch.complex64`` +32-bit complex ``torch.complex32`` or ``torch.chalf`` +64-bit complex ``torch.complex64`` or ``torch.cfloat`` 128-bit complex ``torch.complex128`` or ``torch.cdouble`` 8-bit integer (unsigned) ``torch.uint8`` :class:`torch.ByteTensor` :class:`torch.cuda.ByteTensor` 8-bit integer (signed) ``torch.int8`` :class:`torch.CharTensor` :class:`torch.cuda.CharTensor` @@ -32,7 +32,7 @@ Data type dtype Boolean ``torch.bool`` :class:`torch.BoolTensor` :class:`torch.cuda.BoolTensor` quantized 8-bit integer (unsigned) ``torch.quint8`` :class:`torch.ByteTensor` / quantized 8-bit integer (signed) ``torch.qint8`` :class:`torch.CharTensor` / -quantized 32-bit integer (signed) ``torch.qfint32`` :class:`torch.IntTensor` / +quantized 32-bit integer (signed) ``torch.qint32`` :class:`torch.IntTensor` / quantized 4-bit integer (unsigned) [3]_ ``torch.quint4x2`` :class:`torch.ByteTensor` / ======================================= =========================================== ============================= ================================ @@ -315,6 +315,9 @@ Tensor class reference Tensor.cumprod_ Tensor.cumsum Tensor.cumsum_ + Tensor.chalf + Tensor.cfloat + Tensor.cdouble Tensor.data_ptr Tensor.deg2rad Tensor.dequantize @@ -416,6 +419,8 @@ Tensor class reference Tensor.index_fill Tensor.index_put_ Tensor.index_put + Tensor.index_reduce_ + Tensor.index_reduce Tensor.index_select Tensor.indices Tensor.inner @@ -593,6 +598,8 @@ Tensor class reference Tensor.scatter_ Tensor.scatter_add_ Tensor.scatter_add + Tensor.scatter_reduce_ + Tensor.scatter_reduce Tensor.select Tensor.select_scatter Tensor.set_ @@ -618,7 +625,6 @@ Tensor class reference Tensor.size Tensor.slogdet Tensor.slice_scatter - Tensor.solve Tensor.sort Tensor.split 
Tensor.sparse_mask diff --git a/docs/source/testing.rst b/docs/source/testing.rst index 213e82b9c4ca..d1a63f645dfc 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -1,11 +1,6 @@ torch.testing ============= -.. warning:: - - This module is a beta release, and its interfaces and functionality may change without warning in future - PyTorch releases. - .. automodule:: torch.testing .. autofunction:: assert_close diff --git a/docs/source/torch.overrides.rst b/docs/source/torch.overrides.rst index 0630b60c4b17..ce3583afa71e 100644 --- a/docs/source/torch.overrides.rst +++ b/docs/source/torch.overrides.rst @@ -14,6 +14,8 @@ Functions .. autofunction:: get_overridable_functions +.. autofunction:: resolve_name + .. autofunction:: get_testing_overrides .. autofunction:: handle_torch_function diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 1d3f3ce85b2c..6c71331440ea 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -1,13 +1,6 @@ torch ===== -The torch package contains data structures for multi-dimensional -tensors and defines mathematical operations over these tensors. -Additionally, it provides many utilities for efficient serializing of -Tensors and arbitrary types, and other useful utilities. - -It has a CUDA counterpart, that enables you to run your tensor computations -on an NVIDIA GPU with compute capability >= 3.0 - +.. automodule:: torch .. currentmodule:: torch Tensors @@ -58,6 +51,7 @@ Creation Ops as_tensor as_strided from_numpy + from_dlpack frombuffer zeros zeros_like @@ -102,6 +96,7 @@ Indexing, Slicing, Joining, Mutating Ops hstack index_add index_copy + index_reduce index_select masked_select movedim @@ -117,6 +112,7 @@ Indexing, Slicing, Joining, Mutating Ops select_scatter slice_scatter scatter_add + scatter_reduce split squeeze stack @@ -582,7 +578,6 @@ BLAS and LAPACK Operations outer pinverse qr - solve svd svd_lowrank pca_lowrank @@ -609,7 +604,24 @@ Utilities is_deterministic_algorithms_warn_only_enabled set_deterministic_debug_mode get_deterministic_debug_mode + set_float32_matmul_precision + get_float32_matmul_precision set_warn_always is_warn_always_enabled vmap _assert + + +.. Empty submodules added only for tracking. +.. py:module:: torch.contrib +.. py:module:: torch.utils.backcompat + +.. This submodule is split manually without a top level page. +.. py:module:: torch.utils + +.. This module is only used internally for ROCm builds. +.. py:module:: torch.utils.hipify + +.. This module needs to be documented. Adding here in the meantime +.. for tracking purposes +.. 
py:module:: torch.utils.model_dump diff --git a/ios/LibTorch-Lite.podspec b/ios/LibTorch-Lite.podspec index f3ccaa43e932..d2d9264e0a62 100644 --- a/ios/LibTorch-Lite.podspec +++ b/ios/LibTorch-Lite.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch-Lite' - s.version = '1.10.0' + s.version = '1.11.0' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index 22aaafac9d12..77bc0537e89e 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.10.0' + s.version = '1.11.0' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/ios/TestApp/TestApp/Base.lproj/Main.storyboard b/ios/TestApp/TestApp/Base.lproj/Main.storyboard index ad8e8f7c874c..86c53ddccf22 100644 --- a/ios/TestApp/TestApp/Base.lproj/Main.storyboard +++ b/ios/TestApp/TestApp/Base.lproj/Main.storyboard @@ -1,38 +1,22 @@ - + - + - - + - - - - - - - - - - - - - - - @@ -59,12 +43,4 @@ - - - - - - - - diff --git a/ios/TestApp/TestApp/ViewController.mm b/ios/TestApp/TestApp/ViewController.mm index 38404ddac3b9..d8ecacda3c83 100644 --- a/ios/TestApp/TestApp/ViewController.mm +++ b/ios/TestApp/TestApp/ViewController.mm @@ -4,4 +4,9 @@ @interface ViewController () @end @implementation ViewController + +- (void)viewDidLoad { + [super viewDidLoad]; +} + @end diff --git a/ios/TestApp/TestAppTests/TestLiteInterpreter.mm b/ios/TestApp/TestAppTests/TestLiteInterpreter.mm index f35642a148e3..37c8692b9980 100644 --- a/ios/TestApp/TestAppTests/TestLiteInterpreter.mm +++ b/ios/TestApp/TestAppTests/TestLiteInterpreter.mm @@ -11,8 +11,8 @@ @interface TestAppTests : XCTestCase @implementation TestAppTests { } -- (void)testLiteInterpreter { - NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"model_lite" +- (void)testCoreML { + NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"model_coreml" ofType:@"ptl"]; auto module = torch::jit::_load_for_mobile(modelPath.UTF8String); c10::InferenceMode mode; @@ -21,14 +21,173 @@ - (void)testLiteInterpreter { XCTAssertTrue(outputTensor.numel() == 1000); } -- (void)testCoreML { - NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"model_coreml" +- (void)testModel:(NSString*)filename { + // model generated using the current pytorch revision + [self runModel:[NSString stringWithFormat:@"%@_temp", filename]]; + // model generated using older pyotrch revision + [self runModel:filename]; +} + +- (void)runModel:(NSString*)filename { + NSString* modelPath = [[NSBundle bundleForClass:[self class]] pathForResource:filename ofType:@"ptl"]; - auto module = torch::jit::_load_for_mobile(modelPath.UTF8String); + XCTAssertNotNil(modelPath); c10::InferenceMode mode; - auto input = torch::ones({1, 3, 224, 224}, at::kFloat); - auto outputTensor = module.forward({input}).toTensor(); - XCTAssertTrue(outputTensor.numel() == 1000); + auto module = torch::jit::_load_for_mobile(modelPath.UTF8String); + auto has_bundled_input = module.find_method("get_all_bundled_inputs"); + if (has_bundled_input) { + c10::IValue bundled_inputs = module.run_method("get_all_bundled_inputs"); + c10::List all_inputs = bundled_inputs.toList(); + std::vector> inputs; + for (at::IValue input : all_inputs) { + inputs.push_back(input.toTupleRef().elements()); + } + // run with the first bundled input + 
XCTAssertNoThrow(module.forward(inputs[0])); + } else { + XCTAssertNoThrow(module.forward({})); + } +} + +// TODO remove this once updated test script +- (void)testLiteInterpreter { + XCTAssertTrue(true); +} + +- (void)testMobileNetV2 { + [self testModel:@"mobilenet_v2"]; +} + +- (void)testPointwiseOps { + [self testModel:@"pointwise_ops"]; +} + +- (void)testReductionOps { + [self testModel:@"reduction_ops"]; +} + +- (void)testComparisonOps { + [self testModel:@"comparison_ops"]; +} + +- (void)testOtherMathOps { + [self testModel:@"other_math_ops"]; +} + +- (void)testSpectralOps { + [self testModel:@"spectral_ops"]; +} + +- (void)testBlasLapackOps { + [self testModel:@"blas_lapack_ops"]; +} + +- (void)testSamplingOps { + [self testModel:@"sampling_ops"]; +} + +- (void)testTensorOps { + [self testModel:@"tensor_general_ops"]; +} + +- (void)testTensorCreationOps { + [self testModel:@"tensor_creation_ops"]; +} + +- (void)testTensorIndexingOps { + [self testModel:@"tensor_indexing_ops"]; +} + +- (void)testTensorTypingOps { + [self testModel:@"tensor_typing_ops"]; +} + +- (void)testTensorViewOps { + [self testModel:@"tensor_view_ops"]; +} + +- (void)testConvolutionOps { + [self testModel:@"convolution_ops"]; +} + +- (void)testPoolingOps { + [self testModel:@"pooling_ops"]; +} + +- (void)testPaddingOps { + [self testModel:@"padding_ops"]; +} + +- (void)testActivationOps { + [self testModel:@"activation_ops"]; +} + +- (void)testNormalizationOps { + [self testModel:@"normalization_ops"]; +} + +- (void)testRecurrentOps { + [self testModel:@"recurrent_ops"]; +} + +- (void)testTransformerOps { + [self testModel:@"transformer_ops"]; +} + +- (void)testLinearOps { + [self testModel:@"linear_ops"]; +} + +- (void)testDropoutOps { + [self testModel:@"dropout_ops"]; +} + +- (void)testSparseOps { + [self testModel:@"sparse_ops"]; +} + +- (void)testDistanceFunctionOps { + [self testModel:@"distance_function_ops"]; +} + +- (void)testLossFunctionOps { + [self testModel:@"loss_function_ops"]; +} + +- (void)testVisionFunctionOps { + [self testModel:@"vision_function_ops"]; +} + +- (void)testShuffleOps { + [self testModel:@"shuffle_ops"]; +} + +- (void)testNNUtilsOps { + [self testModel:@"nn_utils_ops"]; +} + +- (void)testQuantOps { + [self testModel:@"general_quant_ops"]; +} + +- (void)testDynamicQuantOps { + [self testModel:@"dynamic_quant_ops"]; +} + +- (void)testStaticQuantOps { + [self testModel:@"static_quant_ops"]; +} + +- (void)testFusedQuantOps { + [self testModel:@"fused_quant_ops"]; +} + +- (void)testTorchScriptBuiltinQuantOps { + [self testModel:@"torchscript_builtin_ops"]; +} + +- (void)testTorchScriptCollectionQuantOps { + [self testModel:@"torchscript_collection_ops"]; } @end diff --git a/ios/TestApp/models/activation_ops.ptl b/ios/TestApp/models/activation_ops.ptl new file mode 100644 index 000000000000..44673efd446e Binary files /dev/null and b/ios/TestApp/models/activation_ops.ptl differ diff --git a/ios/TestApp/models/android_api_module.ptl b/ios/TestApp/models/android_api_module.ptl new file mode 100644 index 000000000000..df62dd862088 Binary files /dev/null and b/ios/TestApp/models/android_api_module.ptl differ diff --git a/ios/TestApp/models/blas_lapack_ops.ptl b/ios/TestApp/models/blas_lapack_ops.ptl new file mode 100644 index 000000000000..fea933ee644f Binary files /dev/null and b/ios/TestApp/models/blas_lapack_ops.ptl differ diff --git a/ios/TestApp/models/comparison_ops.ptl b/ios/TestApp/models/comparison_ops.ptl new file mode 100644 index 000000000000..01b1c153e751 Binary files /dev/null 
and b/ios/TestApp/models/comparison_ops.ptl differ diff --git a/ios/TestApp/models/convolution_ops.ptl b/ios/TestApp/models/convolution_ops.ptl new file mode 100644 index 000000000000..de776834eb77 Binary files /dev/null and b/ios/TestApp/models/convolution_ops.ptl differ diff --git a/ios/TestApp/models/distance_function_ops.ptl b/ios/TestApp/models/distance_function_ops.ptl new file mode 100644 index 000000000000..cc4d994f440a Binary files /dev/null and b/ios/TestApp/models/distance_function_ops.ptl differ diff --git a/ios/TestApp/models/dropout_ops.ptl b/ios/TestApp/models/dropout_ops.ptl new file mode 100644 index 000000000000..422c2f60e6be Binary files /dev/null and b/ios/TestApp/models/dropout_ops.ptl differ diff --git a/ios/TestApp/models/dynamic_quant_ops.ptl b/ios/TestApp/models/dynamic_quant_ops.ptl new file mode 100644 index 000000000000..573dee91f07b Binary files /dev/null and b/ios/TestApp/models/dynamic_quant_ops.ptl differ diff --git a/ios/TestApp/models/fused_quant_ops.ptl b/ios/TestApp/models/fused_quant_ops.ptl new file mode 100644 index 000000000000..d24e3d8d4caa Binary files /dev/null and b/ios/TestApp/models/fused_quant_ops.ptl differ diff --git a/ios/TestApp/models/general_quant_ops.ptl b/ios/TestApp/models/general_quant_ops.ptl new file mode 100644 index 000000000000..5254d33b4794 Binary files /dev/null and b/ios/TestApp/models/general_quant_ops.ptl differ diff --git a/ios/TestApp/models/linear_ops.ptl b/ios/TestApp/models/linear_ops.ptl new file mode 100644 index 000000000000..36915823843c Binary files /dev/null and b/ios/TestApp/models/linear_ops.ptl differ diff --git a/ios/TestApp/models/loss_function_ops.ptl b/ios/TestApp/models/loss_function_ops.ptl new file mode 100644 index 000000000000..4c0592e5485a Binary files /dev/null and b/ios/TestApp/models/loss_function_ops.ptl differ diff --git a/ios/TestApp/models/mobilenet_v2.ptl b/ios/TestApp/models/mobilenet_v2.ptl new file mode 100644 index 000000000000..b034aaf8c802 Binary files /dev/null and b/ios/TestApp/models/mobilenet_v2.ptl differ diff --git a/ios/TestApp/models/model_coreml.ptl b/ios/TestApp/models/model_coreml.ptl new file mode 100644 index 000000000000..1f2271b365f3 Binary files /dev/null and b/ios/TestApp/models/model_coreml.ptl differ diff --git a/ios/TestApp/models/model_lite.ptl b/ios/TestApp/models/model_lite.ptl new file mode 100644 index 000000000000..9aef3bd6b546 Binary files /dev/null and b/ios/TestApp/models/model_lite.ptl differ diff --git a/ios/TestApp/models/nn_utils_ops.ptl b/ios/TestApp/models/nn_utils_ops.ptl new file mode 100644 index 000000000000..726b200a67d1 Binary files /dev/null and b/ios/TestApp/models/nn_utils_ops.ptl differ diff --git a/ios/TestApp/models/normalization_ops.ptl b/ios/TestApp/models/normalization_ops.ptl new file mode 100644 index 000000000000..1846009a3b72 Binary files /dev/null and b/ios/TestApp/models/normalization_ops.ptl differ diff --git a/ios/TestApp/models/other_math_ops.ptl b/ios/TestApp/models/other_math_ops.ptl new file mode 100644 index 000000000000..7209c3b3bd1f Binary files /dev/null and b/ios/TestApp/models/other_math_ops.ptl differ diff --git a/ios/TestApp/models/padding_ops.ptl b/ios/TestApp/models/padding_ops.ptl new file mode 100644 index 000000000000..4af0418f11a6 Binary files /dev/null and b/ios/TestApp/models/padding_ops.ptl differ diff --git a/ios/TestApp/models/pointwise_ops.ptl b/ios/TestApp/models/pointwise_ops.ptl new file mode 100644 index 000000000000..948ed4832660 Binary files /dev/null and b/ios/TestApp/models/pointwise_ops.ptl differ 
diff --git a/ios/TestApp/models/pooling_ops.ptl b/ios/TestApp/models/pooling_ops.ptl new file mode 100644 index 000000000000..4b98f1971ee5 Binary files /dev/null and b/ios/TestApp/models/pooling_ops.ptl differ diff --git a/ios/TestApp/models/recurrent_ops.ptl b/ios/TestApp/models/recurrent_ops.ptl new file mode 100644 index 000000000000..10804040be84 Binary files /dev/null and b/ios/TestApp/models/recurrent_ops.ptl differ diff --git a/ios/TestApp/models/reduction_ops.ptl b/ios/TestApp/models/reduction_ops.ptl new file mode 100644 index 000000000000..0f1fccea7134 Binary files /dev/null and b/ios/TestApp/models/reduction_ops.ptl differ diff --git a/ios/TestApp/models/sampling_ops.ptl b/ios/TestApp/models/sampling_ops.ptl new file mode 100644 index 000000000000..416be7cb1279 Binary files /dev/null and b/ios/TestApp/models/sampling_ops.ptl differ diff --git a/ios/TestApp/models/shuffle_ops.ptl b/ios/TestApp/models/shuffle_ops.ptl new file mode 100644 index 000000000000..5e5520118764 Binary files /dev/null and b/ios/TestApp/models/shuffle_ops.ptl differ diff --git a/ios/TestApp/models/sparse_ops.ptl b/ios/TestApp/models/sparse_ops.ptl new file mode 100644 index 000000000000..a16f68f8f95f Binary files /dev/null and b/ios/TestApp/models/sparse_ops.ptl differ diff --git a/ios/TestApp/models/spectral_ops.ptl b/ios/TestApp/models/spectral_ops.ptl new file mode 100644 index 000000000000..9828dd2ba901 Binary files /dev/null and b/ios/TestApp/models/spectral_ops.ptl differ diff --git a/ios/TestApp/models/static_quant_ops.ptl b/ios/TestApp/models/static_quant_ops.ptl new file mode 100644 index 000000000000..f0f0a09b832d Binary files /dev/null and b/ios/TestApp/models/static_quant_ops.ptl differ diff --git a/ios/TestApp/models/tensor_creation_ops.ptl b/ios/TestApp/models/tensor_creation_ops.ptl new file mode 100644 index 000000000000..d897b43cd36c Binary files /dev/null and b/ios/TestApp/models/tensor_creation_ops.ptl differ diff --git a/ios/TestApp/models/tensor_general_ops.ptl b/ios/TestApp/models/tensor_general_ops.ptl new file mode 100644 index 000000000000..6f2855ea83ea Binary files /dev/null and b/ios/TestApp/models/tensor_general_ops.ptl differ diff --git a/ios/TestApp/models/tensor_indexing_ops.ptl b/ios/TestApp/models/tensor_indexing_ops.ptl new file mode 100644 index 000000000000..ac9cb8c4b94a Binary files /dev/null and b/ios/TestApp/models/tensor_indexing_ops.ptl differ diff --git a/ios/TestApp/models/tensor_typing_ops.ptl b/ios/TestApp/models/tensor_typing_ops.ptl new file mode 100644 index 000000000000..3e2f4d8cc689 Binary files /dev/null and b/ios/TestApp/models/tensor_typing_ops.ptl differ diff --git a/ios/TestApp/models/tensor_view_ops.ptl b/ios/TestApp/models/tensor_view_ops.ptl new file mode 100644 index 000000000000..5e2dc8294842 Binary files /dev/null and b/ios/TestApp/models/tensor_view_ops.ptl differ diff --git a/ios/TestApp/models/torchscript_builtin_ops.ptl b/ios/TestApp/models/torchscript_builtin_ops.ptl new file mode 100644 index 000000000000..2d2532df2fd2 Binary files /dev/null and b/ios/TestApp/models/torchscript_builtin_ops.ptl differ diff --git a/ios/TestApp/models/torchscript_collection_ops.ptl b/ios/TestApp/models/torchscript_collection_ops.ptl new file mode 100644 index 000000000000..ce434b3b4210 Binary files /dev/null and b/ios/TestApp/models/torchscript_collection_ops.ptl differ diff --git a/ios/TestApp/models/transformer_ops.ptl b/ios/TestApp/models/transformer_ops.ptl new file mode 100644 index 000000000000..4546569cd7fd Binary files /dev/null and 
b/ios/TestApp/models/transformer_ops.ptl differ diff --git a/ios/TestApp/models/vision_function_ops.ptl b/ios/TestApp/models/vision_function_ops.ptl new file mode 100644 index 000000000000..e1f8c39c78ab Binary files /dev/null and b/ios/TestApp/models/vision_function_ops.ptl differ diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc index bdee55daf179..cfd6130f7255 100644 --- a/modules/observers/perf_observer.cc +++ b/modules/observers/perf_observer.cc @@ -195,7 +195,7 @@ void PerfNetObserver::Start() { int skipIters = ObserverConfig::getSkipIters(); int sampleRate = visitCount > 0 ? netFollowupSampleRate : netInitSampleRate; // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (skipIters <= numRuns_ && sampleRate > 0 && rand() % sampleRate == 0) { + if (skipIters <= static_cast(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) { visitCount++; if (visitCount == netFollowupSampleCount) { visitCount = 0; @@ -238,9 +238,9 @@ void PerfNetObserver::Stop() { if (logType_ == PerfNetObserver::OPERATOR_DELAY) { const auto& operators = subject_->GetOperators(); - for (int idx = 0; idx < operators.size(); ++idx) { + for (unsigned idx = 0; idx < operators.size(); ++idx) { const auto* op = operators[idx]; - auto name = getObserverName(op, idx); + auto name = getObserverName(op, static_cast(idx)); PerformanceInformation p; const PerfOperatorObserver* opObserver = static_cast(observerMap_[op]); diff --git a/mypy.ini b/mypy.ini index a3ec144806e4..61442c1a7d69 100644 --- a/mypy.ini +++ b/mypy.ini @@ -41,7 +41,7 @@ files = # # `exclude` is a regex, not a list of paths like `files` (sigh) # -exclude = torch/include/|torch/csrc/|torch/distributed/elastic/agent/server/api.py|torch/testing/_internal +exclude = torch/include/|torch/csrc/|torch/distributed/elastic/agent/server/api.py|torch/testing/_internal|torch/distributed/fsdp/fully_sharded_data_parallel.py # Minimum version supported - variable annotations were introduced # in Python 3.7 diff --git a/mypy_plugins/check_mypy_version.py b/mypy_plugins/check_mypy_version.py index 02a02a60b950..a34b8683c989 100644 --- a/mypy_plugins/check_mypy_version.py +++ b/mypy_plugins/check_mypy_version.py @@ -9,7 +9,7 @@ def get_correct_mypy_version(): # there's probably a more elegant way to do this match, = re.finditer( r'mypy==(\d+(?:\.\d+)*)', - Path('.circleci/docker/common/install_conda.sh').read_text(), + Path('.circleci/docker/requirements-ci.txt').read_text(), ) version, = match.groups() return version diff --git a/pt_defs.oss.bzl b/pt_defs.oss.bzl new file mode 100644 index 000000000000..2219138a9002 --- /dev/null +++ b/pt_defs.oss.bzl @@ -0,0 +1,809 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load( + "//tools:build_variables.bzl", + "aten_native_source_list", +) +load( + "//tools:ufunc_defs.bzl", + "aten_ufunc_generated_cpu_kernel_sources", + "aten_ufunc_generated_cpu_sources", + "aten_ufunc_generated_cuda_sources", +) +load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule") +load("//tools/build_defs:type_defs.bzl", "is_list", "is_string") + +USED_PT_BACKENDS = [ + "CPU", + "QuantizedCPU", + "SparseCPU", # brings ~20 kb size regression +] + +# This needs to be kept in sync with https://github.com/pytorch/pytorch/blob/release/1.9/torchgen/gen.py#L892 +PT_BACKEND_HEADERS = [ + "CPU", + "CUDA", + "CompositeExplicitAutograd", + "CompositeImplicitAutograd", + "Meta", +] + +PT_BASE_OPS = [ + "aten::_coalesced_", + "aten::_copy_from", + "aten::_empty_affine_quantized", + 
"aten::_empty_per_channel_affine_quantized", + "aten::_indices", + "aten::_nnz", + "aten::_values", + "aten::add", + "aten::add_", + "aten::arange", + "aten::as_strided", + "aten::as_strided_", + "aten::cat", + "aten::clone", + "aten::coalesce", + "aten::contiguous", + "aten::copy_", + "aten::copy_sparse_to_sparse_", + "aten::dense_dim", + "aten::dequantize", + "aten::div", + "aten::div_", + "aten::empty", + "aten::empty_like", + "aten::empty_strided", + "aten::empty.memory_format", + "aten::eq", + "aten::equal", + "aten::expand", + "aten::fill_", + "aten::is_coalesced", + "aten::is_complex", + "aten::is_floating_point", + "aten::is_leaf", + "aten::is_nonzero", + "aten::item", + "aten::max", + "aten::min", + "aten::mul", + "aten::mul_", + "aten::narrow", + "aten::ne", + "aten::permute", + "aten::q_per_channel_axis", + "aten::q_per_channel_scales", + "aten::q_per_channel_zero_points", + "aten::q_scale", + "aten::q_zero_point", + "aten::qscheme", + "aten::quantize_per_tensor", + "aten::reshape", + "aten::_reshape_alias", + "aten::resize_", + "aten::resize_as_", + "aten::scalar_tensor", + "aten::select", + "aten::set_", + "aten::size", + "aten::slice", + "aten::sparse_dim", + "aten::sparse_resize_and_clear_", + "aten::squeeze", + "aten::squeeze_", + "aten::stride", + "aten::sub", + "aten::sub_", + "aten::sum", + "aten::t", + "aten::to", + "aten::_to_copy", + "aten::unsqueeze", + "aten::view", + "aten::zero_", + "aten::zeros", + "aten::zeros_like", +] + +def get_aten_compiler_flags(): + return ATEN_COMPILER_FLAGS + +def get_generate_code_bin_outs(): + return { + "autograd/generated/ADInplaceOrViewTypeEverything.cpp": ["autograd/generated/ADInplaceOrViewTypeEverything.cpp"], + "autograd/generated/ADInplaceOrViewType_0.cpp": ["autograd/generated/ADInplaceOrViewType_0.cpp"], + "autograd/generated/ADInplaceOrViewType_1.cpp": ["autograd/generated/ADInplaceOrViewType_1.cpp"], + "autograd/generated/Functions.cpp": ["autograd/generated/Functions.cpp"], + "autograd/generated/Functions.h": ["autograd/generated/Functions.h"], + "autograd/generated/TraceTypeEverything.cpp": ["autograd/generated/TraceTypeEverything.cpp"], + "autograd/generated/TraceType_0.cpp": ["autograd/generated/TraceType_0.cpp"], + "autograd/generated/TraceType_1.cpp": ["autograd/generated/TraceType_1.cpp"], + "autograd/generated/TraceType_2.cpp": ["autograd/generated/TraceType_2.cpp"], + "autograd/generated/TraceType_3.cpp": ["autograd/generated/TraceType_3.cpp"], + "autograd/generated/TraceType_4.cpp": ["autograd/generated/TraceType_4.cpp"], + "autograd/generated/VariableType.h": ["autograd/generated/VariableType.h"], + "autograd/generated/VariableTypeEverything.cpp": ["autograd/generated/VariableTypeEverything.cpp"], + "autograd/generated/VariableType_0.cpp": ["autograd/generated/VariableType_0.cpp"], + "autograd/generated/VariableType_1.cpp": ["autograd/generated/VariableType_1.cpp"], + "autograd/generated/VariableType_2.cpp": ["autograd/generated/VariableType_2.cpp"], + "autograd/generated/VariableType_3.cpp": ["autograd/generated/VariableType_3.cpp"], + "autograd/generated/VariableType_4.cpp": ["autograd/generated/VariableType_4.cpp"], + "autograd/generated/variable_factories.h": ["autograd/generated/variable_factories.h"], + } + +ATEN_COMPILER_FLAGS = [ + "-fexceptions", + "-frtti", + "-fPIC", + "-Os", + "-Wno-absolute-value", + "-Wno-deprecated-declarations", + "-Wno-macro-redefined", + "-Wno-tautological-constant-out-of-range-compare", + "-Wno-unknown-pragmas", + "-Wno-unknown-warning-option", + "-Wno-unused-function", + 
"-Wno-unused-variable", + "-Wno-pass-failed", + "-Wno-shadow", +] + +PT_COMPILER_FLAGS = [ + "-frtti", + "-Os", + "-Wno-unknown-pragmas", + "-Wno-write-strings", + "-Wno-unused-variable", + "-Wno-unused-function", + "-Wno-deprecated-declarations", + "-Wno-shadow", + "-Wno-global-constructors", + "-Wno-missing-prototypes", + "-std=gnu++17", # to accommodate Eigen +] + +def get_template_source_dict(): + ret = {} + for file_path in TEMPLATE_SOURCE_LIST: + path_prefix = paths.dirname(file_path) + if path_prefix not in ret: + ret[path_prefix] = [] + ret[path_prefix].append(file_path) + return ret + +def get_gen_oplist_outs(): + return { + #"SupportedMobileModelsRegistration.cpp": [ + # "SupportedMobileModelsRegistration.cpp", + #], + "selected_mobile_ops.h": [ + "selected_mobile_ops.h", + ], + "selected_operators.yaml": [ + "selected_operators.yaml", + ], + } + +def get_pt_compiler_flags(): + return PT_COMPILER_FLAGS + +def get_aten_preprocessor_flags(): + # read_config is not allowed outside of function in Starlark + ATEN_PREPROCESSOR_FLAGS = [ + "-DC10_MOBILE", + "-DCPU_CAPABILITY_DEFAULT", + "-DCPU_CAPABILITY=DEFAULT", + "-DCAFFE2_USE_LITE_PROTO", + "-DATEN_CUDNN_ENABLED_FBXPLAT=0", + "-DATEN_MKLDNN_ENABLED_FBXPLAT=0", + "-DATEN_NNPACK_ENABLED_FBXPLAT=0", + "-DATEN_MKL_ENABLED_FBXPLAT=0", + "-DATEN_MKL_SEQUENTIAL_FBXPLAT=0", + "-DUSE_PYTORCH_METAL", + "-DUSE_PYTORCH_QNNPACK", + "-DUSE_XNNPACK", + "-DNO_EXPORT", + "-DPYTORCH_QNNPACK_RUNTIME_QUANTIZATION", + "-DAT_PARALLEL_OPENMP_FBXPLAT=0", + "-DAT_PARALLEL_NATIVE_FBXPLAT=1", + "-DAT_PARALLEL_NATIVE_TBB_FBXPLAT=0", + "-DUSE_LAPACK_FBXPLAT=0", + "-DAT_BLAS_F2C_FBXPLAT=0", + "-DAT_BLAS_USE_CBLAS_DOT_FBXPLAT=0", + "-DUSE_RUY_QMATMUL", # need third_party:ruy + ] + + # if get_disable_per_op_profiling(): + ATEN_PREPROCESSOR_FLAGS.append("-DPYTORCH_DISABLE_PER_OP_PROFILING") + return ATEN_PREPROCESSOR_FLAGS + +TEMPLATE_SOURCE_LIST = [ + "torch/csrc/jit/runtime/register_prim_ops.cpp", + "torch/csrc/jit/runtime/register_special_ops.cpp", +] + aten_native_source_list + +# For selective build, we can lump the CPU and CPU kernel sources altogether +# because there is only ever one vectorization variant that is compiled +def aten_ufunc_generated_all_cpu_sources(gencode_pattern = "{}"): + return ( + aten_ufunc_generated_cpu_sources(gencode_pattern) + + aten_ufunc_generated_cpu_kernel_sources(gencode_pattern) + ) + +def get_template_registration_files_outs(): + outs = {} + + for file_path in TEMPLATE_SOURCE_LIST: + outs[file_path] = [file_path] + + for base_name in aten_ufunc_generated_all_cpu_sources(): + file_path = "aten/src/ATen/{}".format(base_name) + outs[file_path] = [file_path] + + return outs + +def get_pt_preprocessor_flags(): + # read_config is not allowed outside of function in Starlark + PT_PREPROCESSOR_FLAGS = [ + "-D_THP_CORE", + "-DC10_MOBILE", + "-DUSE_SCALARS", + "-DNO_CUDNN_DESTROY_HANDLE", + "-DNO_EXPORT", + "-DBUILD_CAFFE2", + ] + return PT_PREPROCESSOR_FLAGS + +def is_arvr_mode(): + return False + +def get_build_from_deps_query(): + build_from_query = native.read_config("pt", "build_from_deps_query", "1") + return bool(int(build_from_query)) + +def get_enable_lightweight_dispatch(): + enable_lightweight_dispatch = native.read_config("pt", "enable_lightweight_dispatch", "0") + return bool(int(enable_lightweight_dispatch)) + +def get_static_dispatch_backend(): + static_dispatch_backend = native.read_config("pt", "static_dispatch_backend", None) + if static_dispatch_backend == None: + return [] + return static_dispatch_backend.split(";") 
+ +def get_aten_codegen_extra_params(backends): + if get_build_from_deps_query(): + extra_params = { + "force_schema_registration": True, + } + static_backends = get_static_dispatch_backend() + if static_backends: + extra_params["static_dispatch_backend"] = static_backends + extra_params["enabled_backends"] = static_backends + else: + extra_params["enabled_backends"] = backends + return extra_params + else: + return {} + +def gen_aten_files( + name, + extra_flags = {}, + visibility = [], + compatible_with = []): + extra_params = [] + force_schema_registration = extra_flags.get("force_schema_registration", False) + op_registration_allowlist = extra_flags.get("op_registration_allowlist", None) + op_selection_yaml_path = extra_flags.get("op_selection_yaml_path", None) + enabled_backends = extra_flags.get("enabled_backends", None) + static_dispatch_backend = extra_flags.get("static_dispatch_backend", None) + + if force_schema_registration: + extra_params.append("--force_schema_registration") + if op_registration_allowlist != None and is_string(op_registration_allowlist): + extra_params.append("--op_registration_whitelist") + extra_params.append(op_registration_allowlist) + if op_selection_yaml_path != None and is_string(op_selection_yaml_path): + extra_params.append("--op_selection_yaml_path") + extra_params.append(op_selection_yaml_path) + if enabled_backends != None and is_list(enabled_backends): + extra_params.append("--backend_whitelist") + extra_params.extend(enabled_backends) + if get_enable_lightweight_dispatch(): + extra_params.append("--skip_dispatcher_op_registration") + if static_dispatch_backend: + extra_params.append("--static_dispatch_backend") + extra_params.extend(static_dispatch_backend) + backends = static_dispatch_backend + else: + backends = enabled_backends + fb_xplat_genrule( + name = name, + default_outs = ["."], + outs = get_aten_generated_files(backends), + cmd = "$(exe //torchgen:gen) " + " ".join([ + "--source-path $(location //:aten_src_path)/aten/src/ATen", + "--install_dir $OUT", + ] + extra_params), + visibility = visibility, + compatible_with = compatible_with, + ) + +def get_aten_generated_files(enabled_backends): + # NB: RegisterMeta counts as an optionally enabled backend, + # and is intentionally omitted from here + src_files = [ + "RegisterBackendSelect.cpp", + "RegisterCompositeImplicitAutograd.cpp", + "RegisterCompositeExplicitAutograd.cpp", + "CompositeViewCopyKernels.cpp", + "RegisterSchema.cpp", + "Declarations.yaml", + "Functions.cpp", + "Functions.h", + "RedispatchFunctions.h", + "NativeFunctions.h", + "NativeMetaFunctions.h", + "MethodOperators.h", + "FunctionalInverses.h", + "Operators.h", + "Operators_0.cpp", + "Operators_1.cpp", + "Operators_2.cpp", + "Operators_3.cpp", + "Operators_4.cpp", + "CompositeImplicitAutogradFunctions.h", + "CompositeImplicitAutogradFunctions_inl.h", + "CompositeExplicitAutogradFunctions.h", + "CompositeExplicitAutogradFunctions_inl.h", + "core/ATenOpList.cpp", + "core/TensorBody.h", + "core/TensorMethods.cpp", + "core/aten_interned_strings.h", + ] + get_aten_derived_type_srcs(enabled_backends) + + # This is tiresome. A better strategy would be to unconditionally + # generate these files, and then only actually COMPILE them depended + # on the generated set. C'est la vie... 
+ if "CPU" in enabled_backends: + src_files.extend(aten_ufunc_generated_cpu_sources()) + src_files.extend(aten_ufunc_generated_cpu_kernel_sources()) + if "CUDA" in enabled_backends: + # Cannot unconditionally include this, because in the Edge selective + # build CUDA is not enabled and thus the ufunc codegen for CUDA gets + # skipped + src_files.extend(aten_ufunc_generated_cuda_sources()) + + res = {} + for file_name in src_files: + res[file_name] = [file_name] + return res + +def get_template_registration_file_rules(rule_name): + rules = [] + for file_path in TEMPLATE_SOURCE_LIST: + rules.append(":{}[{}]".format(rule_name, file_path)) + for file_path in aten_ufunc_generated_all_cpu_sources(): + rules.append(":{}[aten/src/ATen/{}]".format(rule_name, file_path)) + + return rules + +# Originally, there were two sets of sources in caffe2:aten_cpu, native and non-native. +# Now we have only non-native sources in aten_cpu. However, there are some ATen-related +# tests that may require both native and non-native sources. This rule is used to generate +# both aten_cpu and aten_native_cpu, which use the same compilation setup. +def build_aten_cpu(name, srcs, deps = []): + cxx_library( + name = name, + srcs = srcs, + header_namespace = "", + compiler_flags = get_pt_compiler_flags(), + exported_preprocessor_flags = get_aten_preprocessor_flags(), + link_whole = True, + linker_flags = ["-Wl,--no-as-needed", "-ldl"], + visibility = ["PUBLIC"], + deps = [ + "//third_party:cpuinfo", + "//third_party:glog", + "//third_party:XNNPACK", + #"//third_party/linker_lib:omp", + ], + exported_deps = [ + "//third_party:fmt", + "//aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack", + "//c10:c10", + ":aten_header", + ":caffe2_headers", + ":common_core", + ":generated_aten_config_header", + ":generated_aten_headers_cpu", + ":jit_core_headers", + ":pthreadpool", + ":th_header", + "//third_party:ruy_lib", + ], + ) + +######### selective build ######### + +def get_pt_ops_deps(name, deps, train = False, enforce_traced_op_list = False, enable_flatbuffer = False, **kwargs): + if not get_build_from_deps_query(): + return deps + pt_operator_registry( + name, + deps, + train = train, + enforce_traced_op_list = enforce_traced_op_list, + enable_flatbuffer = enable_flatbuffer, + **kwargs + ) + return deps + [":" + name] + +# pt_operator_registry is the method that defines the fb_xplat_cxx_library that contains +# code for all selected PyTorch Operators and kernel functions. This also includes +# operator registration into the dispatcher. +# +# template_select: bool: Indicates if template based selective build is enabled. +# +# enforce_traced_op_list: bool: Enforces that only new-style operator +# lists based on the all_mobile_model_configs.yaml file and tracing based selective +# build are used in this library. +# +# train: bool: Build this library for training (True) or inference only (False). +# If built for training, codegen for VariableType is also included. +# +# pt_allow_forced_schema_registration: Manually disables forced schema registration when set to false. Default is true. +# It only has an effect when train=True and the app requires full JIT, in which case force_schema_registration needs to occur.
+# As Federated Learning migrates to lite interpreter +# we can slowly turn off forced schema registration as it is useless space and floods the compatibility api +# +def pt_operator_registry( + name, + deps = [], + train = False, + labels = [], + env = [], + template_select = True, + enforce_traced_op_list = False, + pt_allow_forced_schema_registration = True, + enable_flatbuffer = False, + **kwargs): + compatible_with = kwargs.get("compatible_with", []) + code_gen_files = pt_operator_query_codegen(name, deps = deps, train = train, enforce_traced_op_list = enforce_traced_op_list, pt_allow_forced_schema_registration = pt_allow_forced_schema_registration, compatible_with = compatible_with) + code_gen_srcs = code_gen_files["srcs"] + + lib_deps = [ + ":aten_cpu", + ":torch_mobile_core", + "//c10:c10", + "//third_party:glog", + ] + + #if train: + # lib_deps = lib_deps + ["fbsource//xplat/caffe2:torch_mobile_train"] + + exported_preprocessor_flags = get_aten_preprocessor_flags() + exported_preprocessor_flags += kwargs.pop("exported_preprocessor_flags", []) + if template_select: + # In addition to the + # original code-gen select, this option further filter more operators based on + # compile-time calculation. Examples include prim ops and any other ops that were + # not filtered out before. The purpose of this option is to reduce the production + # size further. However, it may have less flexibility, especially for tests from + # python, where the used operator list is not explicitly generated. If the tests + # are for functionality but not for size, and it's difficult to maintain an explicit + # operator list, it's suggested to turn this option off. + exported_preprocessor_flags.append("-DTEMPLATE_SELECTIVE_BUILD") + kwargs.pop("exported_headers", []) + cxx_library( + name = name, + srcs = code_gen_srcs, + linker_flags = [ + "-Wl,--no-as-needed", + "-ldl", + ], + link_whole = True, + soname = "libtorch-code-gen.$(ext)", + compiler_flags = get_aten_compiler_flags(), + platform_compiler_flags = get_cpukernel_avx2_flags(), + platform_deps = get_cpukernel_avx2_deps(), + header_namespace = "ATen", + exported_headers = code_gen_files["headers"], + exported_preprocessor_flags = exported_preprocessor_flags, + headers = kwargs.pop("headers", []), + deps = lib_deps + [ + "//third_party:XNNPACK", + ], + **kwargs + ) + +def get_aten_derived_type_src_rules(aten_rule_name, enabled_backends): + return [ + ":{}[{}]".format(aten_rule_name, "Register" + backend + ".cpp") + for backend in enabled_backends + ] + +def get_aten_selective_cpp_rules(aten_rule_name, enabled_backends): + return [ + ":{}[{}]".format(aten_rule_name, f) + for f in ["RegisterCompositeImplicitAutograd.cpp", "RegisterCompositeExplicitAutograd.cpp", "RegisterSchema.cpp", "RegisterBackendSelect.cpp", "CompositeViewCopyKernels.cpp"] + ] + get_aten_derived_type_src_rules(aten_rule_name, enabled_backends) + +def get_aten_derived_type_srcs(enabled_backends): + return [ + "Register" + derived_type + ".cpp" + for derived_type in enabled_backends + ] + [ + derived_type + "Functions.h" + for derived_type in enabled_backends + if derived_type in PT_BACKEND_HEADERS or derived_type in get_static_dispatch_backend() + ] + [ + derived_type + "Functions_inl.h" + for derived_type in enabled_backends + if derived_type in PT_BACKEND_HEADERS or derived_type in get_static_dispatch_backend() + ] + +def pt_operator_query_codegen(name, deps = [], train = False, enforce_traced_op_list = False, pt_allow_forced_schema_registration = True, compatible_with = []): + 
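+ # Selective-build codegen pipeline for this library:
+ #   1) gen_oplist produces selected_operators.yaml and selected_mobile_ops.h from the
+ #      pt_operator_library deps,
+ #   2) gen_aten_files and gen_aten_libtorch_files run the ATen / autograd codegen against
+ #      that yaml,
+ #   3) copy_template_registration_files stages the template registration sources (prim ops, etc.).
+ # Returns a dict with the "srcs" and "headers" that the operator library compiles.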
oplist_dir_name = name + "_pt_oplist" + + # @lint-ignore BUCKLINT + fb_xplat_genrule( + name = oplist_dir_name, + cmd = ("$(exe //:gen_oplist) " + + "--model_file_list_path $(@query_outputs 'attrfilter(labels, pt_operator_library, deps(set({deps})))') " + + ("" if enforce_traced_op_list else "--allow_include_all_overloads ") + + "--output_dir $OUT ").format(deps = " ".join(["\"{}\"".format(d) for d in deps])), + outs = get_gen_oplist_outs(), + default_outs = ["."], + compatible_with = compatible_with, + ) + + # Aten files + aten_genrule = name + "_aten" + extra_flags = { + "enabled_backends": USED_PT_BACKENDS, + "op_selection_yaml_path": "$(location :{}[selected_operators.yaml])".format(oplist_dir_name), + } + + if train and pt_allow_forced_schema_registration: + extra_flags["force_schema_registration"] = True + + # if get_enable_lightweight_dispatch(): + # unboxing_genrule = name + "_unboxing" + # gen_aten_unboxing_files( + # unboxing_genrule, + # extra_flags = extra_flags, + # ) + + static_dispatch_backend = get_static_dispatch_backend() + if static_dispatch_backend: + extra_flags["static_dispatch_backend"] = static_dispatch_backend + + gen_aten_files( + aten_genrule, + extra_flags = extra_flags, + compatible_with = compatible_with, + ) + + # unboxing_wrappers files + extra_params = [ + "--operators_yaml_path", + "$(location :" + oplist_dir_name + "[selected_operators.yaml])", + ] + unboxing_and_autograd_genrule = name + "_unboxing_and_autograd" + gen_aten_libtorch_files(unboxing_and_autograd_genrule, extra_params, compatible_with) + + # Template runtime files (prim ops, etc) + template_registration_genrule = name + "_template_registration" + copy_template_registration_files(template_registration_genrule) + + srcs = get_aten_selective_cpp_rules( + aten_genrule, + static_dispatch_backend if static_dispatch_backend else USED_PT_BACKENDS, + ) + get_template_registration_file_rules( + template_registration_genrule, + ) + ([ + ":{}[autograd/generated/VariableType_0.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_1.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_2.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_3.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/VariableType_4.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/ADInplaceOrViewType_0.cpp]".format(unboxing_and_autograd_genrule), + ":{}[autograd/generated/ADInplaceOrViewType_1.cpp]".format(unboxing_and_autograd_genrule), + ] if train else []) + ([ + #":{}[SupportedMobileModelsRegistration.cpp]".format(oplist_dir_name), + ]) + + headers = { + "selected_mobile_ops.h": ":{}[selected_mobile_ops.h]".format(oplist_dir_name), + } + + # if get_enable_lightweight_dispatch(): + # srcs.extend([ + # ":{}[UnboxingFunctions_0.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_1.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_2.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_3.cpp]".format(unboxing_genrule), + # ":{}[UnboxingFunctions_4.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_0.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_1.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_2.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_3.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_4.cpp]".format(unboxing_genrule), + # 
":{}[RegisterCodegenUnboxedKernels_5.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_6.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_7.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_8.cpp]".format(unboxing_genrule), + # ":{}[RegisterCodegenUnboxedKernels_9.cpp]".format(unboxing_genrule), + # ]) + # headers["UnboxingFunctions.h"] = ":{}[UnboxingFunctions.h]".format(unboxing_genrule) + return {"headers": headers, "srcs": srcs} + +def gen_aten_libtorch_files(name, extra_params = [], compatible_with = []): + fb_xplat_genrule( + name = name, + outs = get_generate_code_bin_outs(), + default_outs = ["."], + cmd = "mkdir -p tools && " + + "$(exe //tools/setup_helpers:generate_code_bin) " + " ".join( + # Mobile build only needs libtorch - skip python bindings for now, except + # for ovrsource, which needs Python bindings. + (["--subset libtorch"] if not is_arvr_mode() else []) + [ + "--native-functions-path $(location :aten_src_path)/aten/src/ATen/native/native_functions.yaml", + "--tags-path $(location :aten_src_path)/aten/src/ATen/native/tags.yaml", # todo D35992309 + "--install_dir $OUT", + ] + extra_params, + ), + cmd_exe = "@powershell -Command New-Item -Path tools -ItemType Directory -Force; " + + "$(exe //tools/setup_helpers:generate_code_bin) " + " ".join( + # Mobile build only needs libtorch - skip python bindings for now, except + # for ovrsource, which needs Python bindings. + (["--subset libtorch"] if not is_arvr_mode() else []) + [ + "--native-functions-path $(location :aten_src_path)/aten/src/ATen/native/native_functions.yaml", + "--tags-path $(location :aten_src_path)/aten/src/ATen/native/tags.yaml", + "--install_dir $OUT", + ] + extra_params, + ), + compatible_with = compatible_with, + ) + +def copy_template_registration_files(name): + cmd = [] + cmd_exe = [] + + template_source_dict = get_template_source_dict() + + # Ideally, we would run one copy command for a single source directory along + # with all its child directories, but it's somewhat hard to know if a directory + # is a child of another just bu looking at the metadata (directory relative + # path) that we currently have since 1 directory could look like a parent of + # another and yet come from a different filegroup() rule. + # + for (path_prefix, file_paths) in template_source_dict.items(): + cmd.append("mkdir -p $OUT/{}".format(path_prefix)) + cmd_exe.append("md $OUT/{}".format(path_prefix)) + + # Adding *.cpp is a workaround to prevent cp from thrown an error when it + # encounters a directory (since -r was not specified). If files with an + # extension other than .cpp need to be copied, then the command below + # will not work and will need to be updated. 
+ # + cmd.append("cp -f {0}/{1}/*.cpp $OUT/{1}/".format("$(location :templated_selective_build_srcs)", path_prefix)) + cmd_exe.append("robocopy /E {0}/{1} $OUT/{1}".format("$(location :templated_selective_build_srcs)", path_prefix)) + + cmd.append("mkdir -p $OUT/aten/src/ATen") + cmd_exe.append("md $OUT/aten/src/ATen") + + # NB: CUDA is skipped here because this is selective build and CUDA is not + # supported for selective build + for ufunc_file in aten_ufunc_generated_all_cpu_sources("$(location :gen_aten[{}])"): + cmd.append("cp -f " + ufunc_file + " $OUT/aten/src/ATen") + cmd_exe.append("copy " + ufunc_file + " $OUT/aten/src/ATen") + + fb_xplat_genrule( + name = name, + cmd = " && ".join(cmd), + cmd_exe = "@powershell -Command " + ("; ".join(cmd_exe)), + outs = get_template_registration_files_outs(), + default_outs = ["."], + ) + +def pt_operator_library( + name, + ops = [], + exported_deps = [], + check_decl = True, + train = False, + model = None, + include_all_operators = False, + **kwargs): + model_name = name + + if get_build_from_deps_query(): + ops = [op.strip() for op in ops] + + # If ops are specified, then we are in static selective build mode, so we append + # base ops to this list to avoid additional special case logic in subsequent code. + if len(ops) > 0: + ops.extend(PT_BASE_OPS) + + visibility = kwargs.pop("visibility", ["PUBLIC"]) + + fb_xplat_genrule( + name = name, + out = "model_operators.yaml", + cmd = ( + "$(exe :gen_operators_yaml) " + + "{optionally_root_ops} " + + "{optionally_training_root_ops} " + + "--rule_name {rule_name} " + + "--output_path \"${{OUT}}\" " + + "--model_name {model_name} " + + "--dep_graph_yaml_path pytorch_op_deps.yaml " + + "--models_yaml_path all_mobile_model_configs.yaml " + + #"{optionally_model_versions} " + + #"{optionally_model_assets} " + + #"{optionally_model_traced_backends} " + + "{optionally_include_all_operators}" + ).format( + rule_name = name, + model_name = model_name, + optionally_root_ops = "--root_ops " + (",".join(ops)) if len(ops) > 0 else "", + optionally_training_root_ops = "--training_root_ops " + (",".join(ops)) if len(ops) > 0 and train else "", + #optionally_model_versions = "--model_versions " + (",".join(model_versions)) if model_versions != None else "", + #optionally_model_assets = "--model_assets " + (",".join(model_assets)) if model_assets != None else "", + #optionally_model_traced_backends = "--model_traced_backends " + (",".join(model_traced_backends)) if model_traced_backends != None else "", + optionally_include_all_operators = "--include_all_operators " if include_all_operators else "", + ), + labels = ["pt_operator_library"], # for pt_operator_query_codegen query + visibility = visibility, + **kwargs + ) + else: + if check_decl: + pass + # ensure_ops_are_declared(ops) + + cxx_library( + name = name, + compiler_flags = get_pt_compiler_flags(), + cxx_platform_compiler_flags = get_cpukernel_avx2_flags(), + exported_deps = exported_deps, + **kwargs + ) + +def compose_platform_setting_list(settings): + """Settings object: + os/cpu pair: should be valid key, or at most one part can be wildcard. 
+ flags: the values added to the compiler flags + """ + result = [] + for setting in settings: + result.append([ + "^{}-{}$".format(setting["os"], setting["cpu"]), + setting["flags"], + ]) + return result + +def get_cpukernel_avx2_flags(): + # flags = compose_platform_setting_list([ + # { + # "cpu": "x86_64", + # "flags": ["-DHAVE_AVX2_CPU_DEFINITION"], + # "os": "macosx", + # }, + # ]) if build_cpukernel_avx2() else [] + return [] + +def build_cpukernel_avx2(): + return not is_arvr_mode() + +def get_cpukernel_avx2_deps(): + # flags = compose_platform_setting_list([ + # { + # "cpu": "x86_64", + # "flags": ["fbsource//xplat/caffe2:cpukernel_avx2"], + # "os": "macosx", + # }, + # ]) if build_cpukernel_avx2() else [] + return [] diff --git a/scripts/buck_setup.sh b/scripts/buck_setup.sh new file mode 100644 index 000000000000..0d094fd98e95 --- /dev/null +++ b/scripts/buck_setup.sh @@ -0,0 +1,29 @@ +#!/bin/bash +printf "\n[Creating .buckconfig]\n" +cp .buckconfig.oss .buckconfig + +cd third_party || return + +printf "\n[Generating wrappers for cpuinfo]\n" +python3 generate-cpuinfo-wrappers.py + +printf "\n[Generating wrappers for xnnpack]\n" +python3 generate-xnnpack-wrappers.py + +# bazel-skylib +printf "\n[Downloading bazel-skylib-1.0.2]\n" +curl -L -o /tmp/bazel-skylib-1.0.2.tar.gz https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz +mkdir bazel-skylib +tar -xf /tmp/bazel-skylib-1.0.2.tar.gz -C bazel-skylib/ + +# glog +printf "\n[Downloading glog-0.4.0]\n" +curl -L -o /tmp/glog-0.4.0.tar.gz https://github.com/google/glog/archive/v0.4.0.tar.gz +tar -xf /tmp/glog-0.4.0.tar.gz -C /tmp/ +mv /tmp/glog-0.4.0/ glog/ + +# ruy +printf "\n[Downloading ruy]\n" +curl -L -o /tmp/ruy.zip https://github.com/google/ruy/archive/a09683b8da7164b9c5704f88aef2dc65aa583e5d.zip +unzip -q /tmp/ruy.zip -d /tmp/ +mv /tmp/ruy-a09683b8da7164b9c5704f88aef2dc65aa583e5d ruy/ diff --git a/scripts/build_android.sh b/scripts/build_android.sh index a2dd690012f2..5913f5e8b768 100755 --- a/scripts/build_android.sh +++ b/scripts/build_android.sh @@ -117,6 +117,13 @@ if [ "${TRACING_BASED}" == 1 ]; then else CMAKE_ARGS+=("-DTRACING_BASED=OFF") fi +if [ "${USE_LIGHTWEIGHT_DISPATCH}" == 1 ]; then + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=ON") + CMAKE_ARGS+=("-DSTATIC_DISPATCH_BACKEND=CPU") +else + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=OFF") +fi + CMAKE_ARGS+=("-DBUILD_MOBILE_BENCHMARK=$BUILD_MOBILE_BENCHMARK") CMAKE_ARGS+=("-DBUILD_MOBILE_TEST=$BUILD_MOBILE_TEST") CMAKE_ARGS+=("-DBUILD_PYTHON=OFF") diff --git a/scripts/build_ios.sh b/scripts/build_ios.sh index b96b8094a606..2bb8763ef17d 100755 --- a/scripts/build_ios.sh +++ b/scripts/build_ios.sh @@ -88,6 +88,12 @@ if [ "${TRACING_BASED}" == 1 ]; then else CMAKE_ARGS+=("-DTRACING_BASED=OFF") fi +if [ "${USE_LIGHTWEIGHT_DISPATCH}" == 1 ]; then + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=ON") + CMAKE_ARGS+=("-DSTATIC_DISPATCH_BACKEND=CPU") +else + CMAKE_ARGS+=("-DUSE_LIGHTWEIGHT_DISPATCH=OFF") +fi CMAKE_ARGS+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF") diff --git a/scripts/jit/log_extract.py b/scripts/jit/log_extract.py new file mode 100644 index 000000000000..61e3172fe0b3 --- /dev/null +++ b/scripts/jit/log_extract.py @@ -0,0 +1,104 @@ +import argparse +import functools +import traceback +from torch.utils.jit.log_extract import extract_ir, load_graph_and_inputs, run_baseline_no_fusion, run_nnc, run_nvfuser +from typing import List, Tuple, Callable, Optional + +''' +Usage: +1.
Run your script and pipe into a log file + PYTORCH_JIT_LOG_LEVEL=">>graph_fuser" python3 my_test.py &> log.txt +2. Run log_extract: + log_extract.py log.txt --nvfuser --nnc-dynamic --nnc-static + +You can also extract the list of extracted IR: + log_extract.py log.txt --output + +Passing in --graphs 0 2 will only run graphs 0 and 2 +''' + + +def test_runners(graphs: List[str], runners: List[Tuple[str, Callable]], graph_set: Optional[List[int]]): + for i, ir in enumerate(graphs): + _, inputs = load_graph_and_inputs(ir) + if graph_set and i not in graph_set: + continue + + print(f"Running Graph {i}") + prev_result = None + prev_runner_name = None + for runner in runners: + runner_name, runner_fn = runner + try: + result = runner_fn(ir, inputs) + if prev_result: + improvement = (prev_result / result - 1) * 100 + print(f"{runner_name} : {result:.6f} ms improvement over {prev_runner_name}: improvement: {improvement:.2f}%") + else: + print(f"{runner_name} : {result:.6f} ms") + prev_result = result + prev_runner_name = runner_name + except RuntimeError: + print(f" Graph {i} failed for {runner_name} :", traceback.format_exc()) + + +def run(): + parser = argparse.ArgumentParser( + description="Extracts torchscript IR from log files and, optionally, benchmarks it or outputs the IR" + ) + parser.add_argument("filename", help="Filename of log file") + parser.add_argument("--nvfuser", dest="nvfuser", action="store_true", help="benchmark nvfuser") + parser.add_argument("--no-nvfuser", dest="nvfuser", action="store_false", help="DON'T benchmark nvfuser") + parser.set_defaults(nvfuser=False) + parser.add_argument("--nnc-static", dest="nnc_static", action="store_true", help="benchmark nnc static") + parser.add_argument("--no-nnc-static", dest="nnc_static", action="store_false", help="DON'T benchmark nnc static") + parser.set_defaults(nnc_static=False) + + parser.add_argument("--nnc-dynamic", dest="nnc_dynamic", action="store_true", help="nnc with dynamic shapes") + parser.add_argument( + "--no-nnc-dynamic", + dest="nnc_dynamic", + action="store_false", + help="DONT't benchmark nnc with dynamic shapes") + parser.set_defaults(nnc_dynamic=False) + + + parser.add_argument("--baseline", dest="baseline", action="store_true", help="benchmark baseline") + parser.add_argument("--no-baseline", dest="baseline", action="store_false", help="DON'T benchmark baseline") + parser.set_defaults(baseline=False) + + parser.add_argument("--output", dest="output", action="store_true", help="Output graph IR") + parser.add_argument("--no-output", dest="output", action="store_false", help="DON'T output graph IR") + parser.set_defaults(output=False) + + parser.add_argument('--graphs', nargs="+", type=int, help="Run only specified graph indices") + + + args = parser.parse_args() + graphs = extract_ir(args.filename) + + graph_set = args.graphs + graph_set = graph_set if graph_set else None + + options = [] + if args.baseline: + options.append(("Baseline no fusion", run_baseline_no_fusion)) + if args.nnc_dynamic: + options.append(("NNC Dynamic", functools.partial(run_nnc, dynamic=True))) + if args.nnc_static: + options.append(("NNC Static", functools.partial(run_nnc, dynamic=False))) + if args.nvfuser: + options.append(("NVFuser", run_nvfuser)) + + test_runners(graphs, options, graph_set) + + if args.output: + quoted = [] + for i, ir in enumerate(graphs): + if graph_set and i not in graph_set: + continue + quoted.append("\"\"\"" + ir + "\"\"\"") + print("[" + ", ".join(quoted) + "]") + +if __name__ == "__main__": + run() diff --git 
a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 3b39f6005876..b8259eea874e 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -51,7 +51,7 @@ fi # onnxruntime only support py3 # "Python.h" not found in py2, needed by TorchScript custom op compilation. -if [[ "$BUILD_ENVIRONMENT" == *ort_test1* || "${SHARD_NUMBER}" == "1" ]]; then +if [[ "${SHARD_NUMBER}" == "1" ]]; then # These exclusions are for tests that take a long time / a lot of GPU # memory to run; they should be passing (and you will test them if you # run them locally @@ -69,18 +69,19 @@ if [[ "$BUILD_ENVIRONMENT" == *ort_test1* || "${SHARD_NUMBER}" == "1" ]]; then pytest "${args[@]}" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset7" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset8" \ - "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime" \ + "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset9" \ "$top_dir/test/onnx/test_custom_ops.py" \ "$top_dir/test/onnx/test_models_onnxruntime.py" \ "$top_dir/test/onnx/test_utility_funs.py" \ "$top_dir/test/onnx/test_pytorch_onnx_caffe2.py" \ "$top_dir/test/onnx/test_pytorch_onnx_caffe2_quantized.py" \ - "$top_dir/test/onnx/test_pytorch_onnx_shape_inference.py" + "$top_dir/test/onnx/test_pytorch_onnx_shape_inference.py" \ + "$top_dir/test/onnx/test_onnx_export.py" fi -if [[ "$BUILD_ENVIRONMENT" == *ort_test2* || "${SHARD_NUMBER}" == "2" ]]; then +if [[ "${SHARD_NUMBER}" == "2" ]]; then # Update the loop for new opsets - for i in $(seq 10 15); do + for i in $(seq 10 16); do pytest "${args[@]}" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset$i" done diff --git a/scripts/release/cut-release-branch.sh b/scripts/release/cut-release-branch.sh new file mode 100644 index 000000000000..468dbfb184d9 --- /dev/null +++ b/scripts/release/cut-release-branch.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +: ' +So you are looking to cut a release branch? Well you came +to the right script. + +This script can be used to cut any branch on any repository + +For `pytorch/pytorch` usage would be like: +> DRY_RUN=disabled cut-release-branch.sh + +For `pytorch/builder` or domains usage would be like: +> DRY_RUN=disabled GIT_BRANCH_TO_CUT_FROM=main RELEASE_VERSION=1.11 cut-release-branch.sh +' + +set -eou pipefail + +GIT_TOP_DIR=$(git rev-parse --show-toplevel) +GIT_REMOTE=${GIT_REMOTE:-origin} +GIT_BRANCH_TO_CUT_FROM=${GIT_BRANCH_TO_CUT_FROM:-viable/strict} + +# should output something like 1.11 +RELEASE_VERSION=${RELEASE_VERSION:-$(cut -d'.' -f1-2 "${GIT_TOP_DIR}/version.txt")} + +DRY_RUN_FLAG="--dry-run" +if [[ ${DRY_RUN:-enabled} == "disabled" ]]; then + DRY_RUN_FLAG="" +fi + + +( + set -x + git fetch --all + git checkout "${GIT_REMOTE}/${GIT_BRANCH_TO_CUT_FROM}" +) + +for branch in "release/${RELEASE_VERSION}" "orig/release/${RELEASE_VERSION}"; do + if git rev-parse --verify "${branch}" >/dev/null 2>/dev/null; then + echo "+ Branch ${branch} already exists, skipping..." 
+ continue + else + ( + set -x + git checkout "${GIT_REMOTE}/${GIT_BRANCH_TO_CUT_FROM}" + git checkout -b "${branch}" + git push "${GIT_REMOTE}" "${branch}" + ) + fi +done diff --git a/scripts/release_notes/commitlist.py b/scripts/release_notes/commitlist.py index 0dd7d0a1692a..4abaffa6fb88 100644 --- a/scripts/release_notes/commitlist.py +++ b/scripts/release_notes/commitlist.py @@ -84,21 +84,35 @@ def keywordInFile(file, keywords): def categorize(commit_hash, title): features = get_features(commit_hash, return_dict=True) title = features['title'] + labels = features['labels'] category = 'Uncategorized' topic = 'Untopiced' + # We ask contributors to label their PR's appropriately + # when they're first landed. + # Check if the labels are there first. + already_categorized = already_topiced = False + for label in labels: + if label.startswith('release notes: '): + category = label.split('release notes: ', 1)[1] + already_categorized = True + if label.startswith('topic: '): + topic = label.split('topic: ', 1)[1] + already_topiced = True + if already_categorized and already_topiced: + return Commit(commit_hash, category, topic, title) + # update this to check if each file starts with caffe2 if 'caffe2' in title: return Commit(commit_hash, 'caffe2', topic, title) if '[codemod]' in title.lower(): return Commit(commit_hash, 'skip', topic, title) - labels = features['labels'] if 'Reverted' in labels: return Commit(commit_hash, 'skip', topic, title) if 'bc_breaking' in labels: topic = 'bc-breaking' if 'module: deprecation' in labels: - topic = 'module: deprecation' + topic = 'deprecation' files_changed = features['files_changed'] for file in files_changed: @@ -128,6 +142,9 @@ def categorize(commit_hash, title): if CommitList.keywordInFile(file, ['torch/fx', 'test_fx']): category = 'fx' break + if CommitList.keywordInFile(file, ['torch/ao', 'test/ao']): + category = 'ao' + break # torch/quantization, test/quantization, aten/src/ATen/native/quantized, torch/nn/{quantized, quantizable} if CommitList.keywordInFile(file, ['torch/quantization', 'test/quantization', 'aten/src/ATen/native/quantized', 'torch/nn/quantiz']): category = 'quantization' @@ -141,15 +158,32 @@ def categorize(commit_hash, title): if CommitList.keywordInFile(file, ['aten/src/ATen/native/LinearAlgebra.cpp', 'test/test_linalg.py', 'torch/linalg']): category = 'linalg_frontend' break - if CommitList.keywordInFile(file, ['torch/sparse']): + if CommitList.keywordInFile(file, ['torch/sparse', 'aten/src/ATen/native/sparse', 'torch/_masked/__init__.py']): category = 'sparse_frontend' break - if CommitList.keywordInFile(file, ['test/test_nn.py', 'test/test_module.py', 'torch/nn/modules']): + if CommitList.keywordInFile(file, ['tools/autograd']): + category = 'autograd_frontend' + break + if CommitList.keywordInFile(file, ['test/test_nn.py', 'test/test_module.py', 'torch/nn/modules', 'torch/nn/functional.py']): category = 'nn_frontend' break - if CommitList.keywordInFile(file, ['torch/csrc/jit']): + if CommitList.keywordInFile(file, ['torch/csrc/jit', 'torch/jit']): category = 'jit' break + else: + # Below are some extra quick checks that aren't necessarily file-path related, + # but I found that to catch a decent number of extra commits. 
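+ # Commits that touch only .cu/.cuh files are almost always CUDA kernel changes.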
+ if len(files_changed) > 0 and all([f_name.endswith('.cu') or f_name.endswith('.cuh') for f_name in files_changed]): + category = 'cuda' + elif '[PyTorch Edge]' in title: + category = 'mobile' + elif len(files_changed) == 1 and 'torch/testing/_internal/common_methods_invocations.py' in files_changed[0]: + # when this is the only file changed, it's almost always an OpInfo change. + category = 'python_frontend' + elif len(files_changed) == 1 and 'torch/_torch_docs.py' in files_changed[0]: + # individual torch_docs changes are usually for python ops + category = 'python_frontend' + return Commit(commit_hash, category, topic, title) @@ -198,6 +232,14 @@ def update_existing(path, new_version): commits.update_to(new_version) commits.write_to_disk() +def rerun_with_new_filters(path): + current_commits = CommitList.from_existing(path) + for i in range(len(current_commits.commits)): + c = current_commits.commits[i] + if 'Uncategorized' in str(c): + current_commits.commits[i] = CommitList.categorize(c.commit_hash, c.title) + current_commits.write_to_disk() + def to_markdown(commit_list, category): def cleanup_title(commit): match = re.match(r'(.*) \(#\d+\)', commit.title) @@ -252,6 +294,11 @@ def main(): group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--create_new', nargs=2) group.add_argument('--update_to') + # I found this flag useful when experimenting with adding new auto-categorizing filters. + # After running commitlist.py the first time, if you add any new filters in this file, + # re-running with "rerun_with_new_filters" will update the existing commitlist.csv file, + # but only affect the rows that were previously marked as "Uncategorized" + group.add_argument('--rerun_with_new_filters', action='store_true') group.add_argument('--stat', action='store_true') group.add_argument('--export_markdown', action='store_true') @@ -264,6 +311,9 @@ def main(): if args.update_to: update_existing(args.path, args.update_to) return + if args.rerun_with_new_filters: + rerun_with_new_filters(args.path) + return if args.stat: commits = CommitList.from_existing(args.path) stats = commits.stat() diff --git a/scripts/release_notes/common.py b/scripts/release_notes/common.py index d09c4ad8ed81..355dee12adaf 100644 --- a/scripts/release_notes/common.py +++ b/scripts/release_notes/common.py @@ -10,6 +10,8 @@ categories = [ 'Uncategorized', 'distributed', + 'lazy', + 'hub', 'mobile', 'jit', 'visualization', @@ -17,7 +19,9 @@ 'caffe2', 'quantization', 'amd', + 'rocm', 'cuda', + 'cudnn', 'benchmark', 'profiler', 'performance_as_product', @@ -28,6 +32,8 @@ 'code_coverage', 'vulkan', 'skip', + 'composability', + 'meta_frontend', 'nn_frontend', 'linalg_frontend', 'cpp_frontend', diff --git a/setup.py b/setup.py index aa1669a10d30..d23603bc90cb 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,9 @@ # MKLDNN_CPU_RUNTIME # MKL-DNN threading mode: TBB or OMP (default) # +# USE_STATIC_MKL +# Prefer to link with MKL statically - Unix only +# # USE_NNPACK=0 # disables NNPACK build # @@ -506,6 +509,10 @@ def run(self): report(' -- USE_MPI={}'.format(cmake_cache_vars['USE_OPENMPI'])) else: report('-- Building without distributed package') + if cmake_cache_vars['STATIC_DISPATCH_BACKEND']: + report('-- Using static dispatch with backend {}'.format(cmake_cache_vars['STATIC_DISPATCH_BACKEND'])) + if cmake_cache_vars['USE_LIGHTWEIGHT_DISPATCH']: + report('-- Using lightweight dispatch') # Do not use clang to compile extensions if `-fstack-clash-protection` is defined # in system CFLAGS @@ -817,7 +824,16 
@@ def make_relative_rpath_args(path): include_dirs=[], library_dirs=library_dirs, extra_link_args=extra_link_args + main_link_args + make_relative_rpath_args('lib')) + C_flatbuffer = Extension("torch._C_flatbuffer", + libraries=main_libraries, + sources=["torch/csrc/stub_with_flatbuffer.c"], + language='c', + extra_compile_args=main_compile_args + extra_compile_args, + include_dirs=[], + library_dirs=library_dirs, + extra_link_args=extra_link_args + main_link_args + make_relative_rpath_args('lib')) extensions.append(C) + extensions.append(C_flatbuffer) if not IS_WINDOWS: DL = Extension("torch._dl", @@ -925,6 +941,7 @@ def print_box(msg): 'bin/*', 'test/*', '_C/*.pyi', + '_C_flatbuffer/*.pyi', 'cuda/*.pyi', 'optim/*.pyi', 'autograd/*.pyi', @@ -932,6 +949,7 @@ def print_box(msg): 'nn/*.pyi', 'nn/modules/*.pyi', 'nn/parallel/*.pyi', + 'utils/data/*.pyi', 'lib/*.so*', 'lib/*.dylib*', 'lib/*.dll', @@ -981,6 +999,7 @@ def print_box(msg): 'include/c10/cuda/impl/*.h', 'include/c10/hip/*.h', 'include/c10/hip/impl/*.h', + 'include/c10d/*.h', 'include/c10d/*.hpp', 'include/caffe2/**/*.h', 'include/torch/*.h', @@ -1010,7 +1029,8 @@ def print_box(msg): 'include/torch/csrc/autograd/utils/*.h', 'include/torch/csrc/cuda/*.h', 'include/torch/csrc/deploy/*.h', - 'include/torch/csrc/deploy/interpreter/interpreter_impl.h', + 'include/torch/csrc/deploy/interpreter/*.h', + 'include/torch/csrc/deploy/interpreter/*.hpp', 'include/torch/csrc/distributed/c10d/exception.h', 'include/torch/csrc/jit/*.h', 'include/torch/csrc/jit/backends/*.h', @@ -1031,7 +1051,9 @@ def print_box(msg): 'include/torch/csrc/profiler/*.h', 'include/torch/csrc/utils/*.h', 'include/torch/csrc/tensor/*.h', + 'include/torch/csrc/lazy/backend/*.h', 'include/torch/csrc/lazy/core/*.h', + 'include/torch/csrc/lazy/core/ops/*.h', 'include/pybind11/*.h', 'include/pybind11/detail/*.h', 'include/TH/*.h*', @@ -1058,6 +1080,7 @@ def print_box(msg): 'utils/model_dump/code.js', 'utils/model_dump/*.mjs', ], + 'torchgen': [], 'caffe2': [ 'python/serialized_test/data/operator_test/*.zip', ], diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json new file mode 100644 index 000000000000..2d6a839339d9 --- /dev/null +++ b/test/allowlist_for_publicAPI.json @@ -0,0 +1,3564 @@ +{ + "torch.amp.autocast_mode": [ + "Any", + "Optional" + ], + "torch.ao.nn.sparse.quantized.dynamic.linear": [ + "LinearBlockSparsePattern", + "Optional", + "hide_packed_params_repr" + ], + "torch.ao.nn.sparse.quantized.linear": [ + "Optional", + "hide_packed_params_repr" + ], + "torch.ao.quantization": [ + "ABC", + "ABCMeta", + "Any", + "Callable", + "Dict", + "List", + "Module", + "Optional", + "OrderedDict", + "Pattern", + "QConfigAny", + "Set", + "Tuple", + "Type", + "Union", + "abstractmethod", + "namedtuple", + "partial", + "type_before_parametrizations", + "wrap_cpp_module" + ], + "torch.ao.quantization.fake_quantize": [ + "ABC", + "Any", + "FixedQParamsObserver", + "HistogramObserver", + "Module", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "Tuple", + "abstractmethod", + "default_fixed_qparams_range_0to1_fake_quant", + "default_fixed_qparams_range_0to1_observer", + "default_affine_fixed_qparams_fake_quant", + "default_affine_fixed_qparams_observer", + "default_dynamic_fake_quant", + "default_embedding_fake_quant", + "default_embedding_fake_quant_4bit", + "default_fake_quant", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_histogram_fake_quant", + 
"default_per_channel_weight_fake_quant", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_fixed_qparams_range_neg1to1_observer", + "default_symmetric_fixed_qparams_fake_quant", + "default_symmetric_fixed_qparams_observer", + "default_weight_fake_quant", + "fused_per_channel_wt_fake_quant_range_neg_127_to_127", + "fused_wt_fake_quant_range_neg_127_to_127" + ], + "torch.ao.quantization.fuse_modules": [ + "List", + "Optional", + "fuse_conv_bn", + "fuse_conv_bn_relu", + "get_fuser_method", + "type_before_parametrizations" + ], + "torch.ao.quantization.fuser_method_mappings": [ + "Callable", + "Dict", + "MatchAllNode", + "Optional", + "Pattern", + "Tuple", + "Type", + "Union", + "get_combined_dict" + ], + "torch.ao.quantization.backend_config.native": [ + "Any", + "Dict", + "FixedQParamsFakeQuantize", + "List", + "ObservationType", + "default_fixed_qparams_range_0to1_observer", + "default_fixed_qparams_range_neg1to1_observer", + "default_affine_fixed_qparams_observer", + "default_symmetric_fixed_qparams_observer", + "fuse_conv_bn", + "fuse_conv_bn_relu", + "fuse_convtranspose_bn", + "fuse_linear_bn", + "namedtuple", + "reverse2", + "reverse3", + "reverse_sequential_wrapper2" + ], + "torch.ao.quantization.backend_config.observation_type": [ + "Enum" + ], + "torch.ao.quantization.backend_config.tensorrt": [ + "ObservationType", + "reverse_sequential_wrapper2" + ], + "torch.ao.quantization.quantization_types": [ + "Any", + "Node", + "NodePattern", + "Pattern", + "QuantizerCls", + "Tuple", + "Union" + ], + "torch.ao.quantization.fx.convert": [ + "Any", + "Argument", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "Node", + "Optional", + "QConfigAny", + "QuantizedGraphModule", + "Set", + "Tuple", + "activation_is_statically_quantized", + "collect_producer_nodes", + "compare_prepare_convert_qconfig_dict", + "convert_dict_to_ordered_dict", + "convert_eq_obs", + "create_getattr_from_value", + "generate_qconfig_map", + "get_custom_module_class_keys", + "get_fused_module_classes", + "get_native_backend_config_dict", + "get_pattern_to_dtype_configs", + "get_qat_module_classes", + "get_qparam_dict", + "get_quantize_node_info", + "get_root_module_to_quantized_reference_module", + "get_swapped_custom_module_class", + "graph_module_from_producer_nodes", + "is_activation_post_process", + "is_observed_module", + "is_observed_standalone_module", + "is_qconfig_supported_by_dtype_configs", + "lower_to_fbgemm", + "qconfig_equals", + "update_obs_for_equalization", + "update_qconfig_for_fusion", + "update_qconfig_for_qat", + "weight_is_quantized" + ], + "torch.ao.quantization.fx.fuse": [ + "ABC", + "Any", + "Callable", + "DefaultFuseHandler", + "Dict", + "FuseHandler", + "FusedGraphModule", + "Graph", + "GraphModule", + "List", + "MatchAllNode", + "Node", + "NodePattern", + "Optional", + "Pattern", + "Tuple", + "Union", + "abstractmethod", + "get_fuser_method_mapping", + "get_fuser_method_new", + "get_fusion_pattern_to_extra_inputs_getter", + "get_fusion_pattern_to_fuse_handler_cls", + "get_fusion_pattern_to_root_node_getter", + "get_native_backend_config_dict", + "is_match", + "map_arg", + "sorted_patterns_dict" + ], + "torch.ao.quantization.fx.fusion_patterns": [ + "ABC", + "Any", + "Callable", + "Dict", + "Graph", + "List", + "MatchAllNode", + "Node", + "NodePattern", + "Optional", + "Pattern", + "Union", + "abstractmethod", + "get_fuser_method_new" + ], + "torch.ao.quantization.fx.graph_module": [ + "Any", + "Dict", + "Graph", + "GraphModule", + "Set", + "Union" + ], + 
"torch.ao.quantization.fx.lower_to_fbgemm": [ + "Dict", + "QConfigAny", + "QuantizedGraphModule", + "Tuple" + ], + "torch.ao.quantization.fx.match_utils": [ + "Any", + "Callable", + "Dict", + "Graph", + "List", + "MatchAllNode", + "MatchResult", + "Node", + "Optional", + "Pattern", + "QConfigAny", + "QuantizeHandler", + "Set", + "Tuple", + "is_observed_standalone_module" + ], + "torch.ao.quantization.fx.pattern_utils": [ + "Any", + "Dict", + "FixedQParamsFakeQuantize", + "List", + "MatchResult", + "Node", + "ObserverBase", + "Optional", + "OrderedDict", + "Pattern", + "QConfigAny", + "QuantizeHandler", + "Tuple" + ], + "torch.ao.quantization.fx.prepare": [ + "Any", + "Argument", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "MatchResult", + "Node", + "NodePattern", + "ObservedGraphModule", + "ObservedStandaloneGraphModule", + "ObserverBase", + "Optional", + "Pattern", + "QConfigAny", + "QuantizeHandler", + "Set", + "Tuple", + "Union", + "activation_is_int8_quantized", + "activation_is_statically_quantized", + "all_node_args_have_no_tensors", + "assert_and_get_unique_device", + "convert", + "convert_dict_to_ordered_dict", + "defaultdict", + "find_matches", + "generate_qconfig_map", + "get_custom_module_class_keys", + "get_flattened_qconfig_dict", + "get_fusion_pattern_to_root_node_getter", + "get_module_to_qat_module", + "get_native_backend_config_dict", + "get_new_attr_name_with_prefix", + "get_non_observable_arg_indexes_and_types", + "get_pattern_to_dtype_configs", + "get_pattern_to_input_type_to_index", + "get_pattern_to_quantize_handlers", + "get_qconfig_dtypes", + "get_standalone_module_configs", + "get_swapped_custom_module_class", + "is_activation_post_process", + "is_equalization_observer", + "is_reuse_input_qconfig", + "node_supports_equalization", + "propagate_qconfig_", + "sorted_patterns_dict", + "update_qconfig_for_fusion", + "update_qconfig_for_qat" + ], + "torch.ao.quantization.fx.qconfig_utils": [ + "Any", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "Optional", + "QConfig", + "QConfigAny", + "Set", + "Tuple", + "add_module_to_qconfig_obs_ctr", + "defaultdict", + "get_object_type_qconfig", + "get_qconfig_dtypes", + "is_activation_post_process", + "maybe_adjust_qconfig_for_module_type_or_name", + "qconfig_equals" + ], + "torch.ao.quantization.fx.quantization_patterns": [ + "ABC", + "Any", + "Callable", + "Dict", + "Node", + "NodePattern", + "Optional", + "Pattern", + "all_node_args_have_no_tensors" + ], + "torch.ao.quantization.fx.quantization_types": [ + "Any", + "Node", + "NodePattern", + "Pattern", + "QuantizerCls", + "Tuple", + "Union" + ], + "torch.ao.quantization.fx.utils": [ + "Any", + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "Node", + "Optional", + "Set", + "Tuple", + "Type", + "Union", + "is_activation_post_process", + "is_per_channel", + "is_per_tensor", + "map_arg", + "namedtuple" + ], + "torch.ao.quantization.fx.backend_config_utils": [ + "Any", + "Callable", + "DefaultFuseHandler", + "Dict", + "NodePattern", + "ObservationType", + "Optional", + "Pattern", + "QuantizeHandler", + "QuantizerCls", + "activation_dtype", + "get_combined_dict", + "get_default_quant_patterns", + "get_native_backend_config_dict", + "sorted_patterns_dict", + "get_quantize_handler_cls" + ], + "torch.ao.quantization.observer": [ + "ABC", + "ABCMeta", + "Any", + "Dict", + "List", + "Optional", + "OrderedDict", + "Tuple", + "Union", + "abstractmethod", + "calculate_qmin_qmax", + "check_min_max_valid", + "partial" + ], + 
"torch.ao.quantization.qconfig": [ + "Any", + "FakeQuantize", + "FakeQuantizeBase", + "FusedMovingAvgObsFakeQuantize", + "HistogramObserver", + "MovingAverageMinMaxObserver", + "NoopObserver", + "Optional", + "PlaceholderObserver", + "QConfigAny", + "ReuseInputObserver", + "default_debug_observer", + "default_dynamic_fake_quant", + "default_dynamic_quant_observer", + "default_embedding_fake_quant", + "default_embedding_fake_quant_4bit", + "default_fake_quant", + "default_float_qparams_observer", + "default_float_qparams_observer_4bit", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_observer", + "default_per_channel_weight_fake_quant", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_reuse_input_observer", + "default_weight_fake_quant", + "default_weight_observer", + "fused_per_channel_wt_fake_quant_range_neg_127_to_127", + "fused_wt_fake_quant_range_neg_127_to_127", + "namedtuple", + "per_channel_weight_observer_range_neg_127_to_127", + "weight_observer_range_neg_127_to_127" + ], + "torch.ao.quantization.qconfig_dict_utils": [ + "Any", + "Callable", + "Dict", + "OrderedDict", + "QConfigAny", + "Union", + "get_combined_dict", + "get_default_qat_module_mappings" + ], + "torch.ao.quantization.quantization_mappings": [ + "Any", + "Callable", + "DeQuantStub", + "Dict", + "Optional", + "QuantStub", + "Set", + "Union", + "default_fixed_qparams_range_0to1_fake_quant", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_affine_fixed_qparams_fake_quant", + "default_symmetric_fixed_qparams_fake_quant", + "get_combined_dict", + "type_before_parametrizations" + ], + "torch.ao.quantization.quantize": [ + "DeQuantStub", + "QuantWrapper", + "activation_is_memoryless", + "add_module_to_qconfig_obs_ctr", + "get_default_dynamic_quant_module_mappings", + "get_default_qat_module_mappings", + "get_default_qconfig_propagation_list", + "get_default_static_quant_module_mappings", + "get_default_static_quant_reference_module_mappings", + "get_qparam_dict", + "has_no_children_ignoring_parametrizations", + "no_observer_set", + "type_before_parametrizations" + ], + "torch.ao.quantization.quantize_jit": [ + "QConfig", + "QuantType", + "wrap_cpp_module" + ], + "torch.ao.quantization.utils": [ + "Any", + "Callable", + "Pattern", + "QuantType", + "Tuple", + "Union", + "is_parametrized", + "quant_type_to_str" + ], + "torch.ao.sparsity.experimental.pruner.base_pruner": [ + "ActivationReconstruction", + "BaseSparsifier", + "BiasHook", + "ModuleDict", + "ModuleList", + "PruningParametrization", + "ZeroesParametrization", + "fqn_to_module", + "module_to_fqn" + ], + "torch.ao.sparsity.experimental.pruner.parametrization": [ + "Any", + "List" + ], + "torch.ao.sparsity.scheduler.base_scheduler": [ + "BaseSparsifier", + "wraps" + ], + "torch.ao.sparsity.scheduler.lambda_scheduler": [ + "BaseScheduler" + ], + "torch.ao.sparsity.sparsifier.base_sparsifier": [ + "Dict", + "FakeSparsity", + "Optional", + "Tuple", + "defaultdict", + "fqn_to_module", + "module_to_fqn" + ], + "torch.ao.sparsity.sparsifier.weight_norm_sparsifier": [ + "BaseSparsifier", + "Tuple", + "reduce" + ], + "torch.autograd": [ + "NestedIOFunction", + "detect_anomaly", + "enable_grad", + "grad", + "gradcheck", + "gradgradcheck", + "inference_mode", + "no_grad", + "set_detect_anomaly", + "set_grad_enabled", + "variable" + ], + "torch.autograd.function": [ + "Any", + "List", + "Optional", + "OrderedDict", + "with_metaclass" + ], + 
"torch.autograd.functional": [ + "List", + "Tuple" + ], + "torch.autograd.graph": [ + "Any", + "Callable" + ], + "torch.autograd.profiler": [ + "Any", + "ContextDecorator", + "DeviceType", + "Dict", + "Future", + "List", + "Optional", + "ProfilerActivity", + "ProfilerConfig", + "ProfilerState", + "kineto_available", + "warn" + ], + "torch.autograd.profiler_legacy": [ + "DeviceType", + "EventList", + "FunctionEvent", + "ProfilerConfig", + "ProfilerState", + "warn" + ], + "torch.autograd.profiler_util": [ + "DeviceType", + "Dict", + "List", + "Optional", + "Tuple", + "attrgetter", + "defaultdict", + "namedtuple" + ], + "torch.autograd.variable": [ + "ImperativeEngine", + "with_metaclass" + ], + "torch.backends": [ + "contextmanager" + ], + "torch.backends.cuda": [ + "Union" + ], + "torch.cpu.amp.autocast_mode": [ + "Any" + ], + "torch.cuda": [ + "Any", + "Device", + "Dict", + "List", + "Optional", + "Tuple", + "Union", + "classproperty" + ], + "torch.cuda.amp.autocast_mode": [ + "Any" + ], + "torch.cuda.amp.common": [ + "find_spec" + ], + "torch.cuda.amp.grad_scaler": [ + "Any", + "Dict", + "Enum", + "List", + "Optional", + "Tuple", + "amp_definitely_not_available", + "defaultdict" + ], + "torch.cuda.nccl": [ + "init_rank", + "is_available", + "unique_id", + "version" + ], + "torch.cuda.profiler": [ + "check_error", + "cudart" + ], + "torch.distributed": [ + "AllToAllOptions", + "AllreduceCoalescedOptions", + "AllreduceOptions", + "BarrierOptions", + "BroadcastOptions", + "BuiltinCommHookType", + "Callable", + "DebugLevel", + "Dict", + "Enum", + "FileStore", + "GatherOptions", + "GradBucket", + "HashStore", + "Logger", + "Optional", + "PrefixStore", + "ProcessGroup", + "ProcessGroupGloo", + "ReduceOp", + "ReduceOptions", + "ReduceScatterOptions", + "Reducer", + "ScatterOptions", + "Store", + "TCPStore", + "Tuple", + "Union", + "get_debug_level", + "set_debug_level", + "set_debug_level_from_env", + "timedelta", + "ProcessGroupMPI", + "ProcessGroupNCCL" + ], + "torch.distributed.algorithms.ddp_comm_hooks": [ + "DistributedDataParallel", + "Enum", + "partial" + ], + "torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks": [ + "Any", + "GradBucket" + ], + "torch.distributed.algorithms.ddp_comm_hooks.default_hooks": [ + "Any", + "Callable" + ], + "torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks": [ + "Any", + "Callable" + ], + "torch.distributed.algorithms.join": [ + "ABC", + "Any", + "List", + "NamedTuple", + "Optional", + "TracebackType", + "Type", + "abstractmethod" + ], + "torch.distributed.algorithms.model_averaging.averagers": [ + "ABC", + "Dict", + "Iterable", + "Union", + "abstractmethod" + ], + "torch.distributed.algorithms.model_averaging.utils": [ + "Dict", + "Iterable", + "Iterator", + "ProcessGroup", + "Union", + "group" + ], + "torch.distributed.autograd": [ + "DistAutogradContext", + "backward", + "get_gradients" + ], + "torch.distributed.distributed_c10d": [ + "AllToAllOptions", + "AllreduceCoalescedOptions", + "AllreduceOptions", + "BarrierOptions", + "BroadcastOptions", + "Callable", + "DebugLevel", + "Dict", + "GatherOptions", + "Optional", + "PrefixStore", + "ProcessGroup", + "ProcessGroupGloo", + "ReduceOp", + "ReduceOptions", + "ReduceScatterOptions", + "ScatterOptions", + "Store", + "Tuple", + "Union", + "get_debug_level", + "register_rendezvous_handler", + "rendezvous", + "timedelta", + "ProcessGroupMPI", + "ProcessGroupNCCL" + ], + "torch.distributed.elastic.agent.server.api": [ + "Any", + "Callable", + "Dict", + "Enum", + "Event", + 
"EventSource", + "List", + "Optional", + "ProcessFailure", + "SignalException", + "Std", + "Store", + "Tuple", + "Union", + "closing", + "dataclass", + "field", + "get_logger", + "prof", + "put_metric", + "record" + ], + "torch.distributed.elastic.events": [ + "Dict", + "Enum", + "EventMetadataValue", + "Optional" + ], + "torch.distributed.elastic.events.api": [ + "Dict", + "Enum", + "EventMetadataValue", + "Optional", + "Union", + "asdict", + "dataclass", + "field" + ], + "torch.distributed.elastic.events.handlers": [ + "Dict" + ], + "torch.distributed.elastic.metrics": [ + "Optional" + ], + "torch.distributed.elastic.metrics.api": [ + "Dict", + "Optional", + "namedtuple", + "wraps" + ], + "torch.distributed.elastic.multiprocessing": [ + "Callable", + "Dict", + "Tuple", + "Union", + "get_logger" + ], + "torch.distributed.elastic.multiprocessing.api": [ + "Any", + "Callable", + "Dict", + "FrameType", + "IntFlag", + "Optional", + "ProcessFailure", + "Set", + "TailLog", + "Tuple", + "Union", + "dataclass", + "field", + "nullcontext", + "record", + "redirect_stderr", + "redirect_stdout" + ], + "torch.distributed.elastic.multiprocessing.errors": [ + "Any", + "Callable", + "Dict", + "GlobalRank", + "JSON", + "List", + "Optional", + "Template", + "Tuple", + "TypeVar", + "dataclass", + "datetime", + "field", + "get_logger", + "wraps" + ], + "torch.distributed.elastic.multiprocessing.errors.error_handler": [ + "Optional" + ], + "torch.distributed.elastic.multiprocessing.errors.handlers": [ + "ErrorHandler" + ], + "torch.distributed.elastic.multiprocessing.redirects": [ + "contextmanager", + "partial", + "redirect_stderr", + "redirect_stdout" + ], + "torch.distributed.elastic.multiprocessing.tail_log": [ + "Dict", + "Event", + "Future", + "List", + "TextIO", + "ThreadPoolExecutor" + ], + "torch.distributed.elastic.rendezvous": [ + "RendezvousHandlerCreator" + ], + "torch.distributed.elastic.rendezvous.api": [ + "ABC", + "Any", + "Callable", + "Dict", + "Optional", + "RendezvousHandlerCreator", + "Store", + "Tuple", + "abstractmethod" + ], + "torch.distributed.elastic.rendezvous.dynamic_rendezvous": [ + "ABC", + "Any", + "Callable", + "Dict", + "Enum", + "List", + "NodeState", + "Optional", + "PrefixStore", + "RendezvousClosedError", + "RendezvousError", + "RendezvousHandler", + "RendezvousParameters", + "RendezvousStateError", + "RendezvousTimeoutError", + "Set", + "Store", + "Token", + "Tuple", + "abstractmethod", + "cast", + "construct_and_record_rdzv_event", + "dataclass", + "datetime", + "timedelta" + ], + "torch.distributed.elastic.rendezvous.registry": [ + "RendezvousHandler", + "RendezvousParameters", + "create_handler" + ], + "torch.distributed.elastic.rendezvous.utils": [ + "Any", + "Callable", + "Dict", + "Event", + "Optional", + "Thread", + "Tuple", + "Union", + "timedelta" + ], + "torch.distributed.elastic.timer.api": [ + "Any", + "Dict", + "List", + "Optional", + "Set", + "contextmanager", + "getframeinfo", + "stack" + ], + "torch.distributed.elastic.timer.local_timer": [ + "Any", + "Dict", + "Empty", + "List", + "RequestQueue", + "Set", + "TimerClient", + "TimerRequest", + "TimerServer", + "Tuple" + ], + "torch.distributed.elastic.utils.api": [ + "Any", + "List", + "Template" + ], + "torch.distributed.elastic.utils.data.elastic_distributed_sampler": [ + "DistributedSampler" + ], + "torch.distributed.elastic.utils.logging": [ + "Optional", + "get_log_level" + ], + "torch.distributed.elastic.utils.store": [ + "List", + "timedelta" + ], + "torch.distributed.fsdp.flatten_params_wrapper": 
[ + "Any", + "Dict", + "Generator", + "Iterator", + "List", + "NamedTuple", + "Optional", + "ParamOffset", + "Sequence", + "SharedParamInfo", + "Tensor", + "Tuple", + "Union", + "accumulate" + ], + "torch.distributed.fsdp.fully_sharded_data_parallel": [ + "Any", + "Callable", + "Dict", + "Enum", + "FlatParameter", + "FlattenParamsWrapper", + "Generator", + "Iterable", + "Iterator", + "List", + "Mapping", + "NamedTuple", + "Optional", + "Parameter", + "ProcessGroup", + "Set", + "Shard", + "ShardedTensor", + "Tuple", + "Union", + "Variable", + "auto", + "cast", + "contextmanager", + "dataclass", + "init_from_local_shards" + ], + "torch.distributed.fsdp.utils": [ + "Any", + "Callable", + "Dict", + "List", + "OrderedDict", + "Set", + "Tuple", + "Union" + ], + "torch.distributed.fsdp.wrap": [ + "Any", + "Callable", + "Dict", + "Generator", + "Optional", + "Set", + "Tuple", + "Type", + "cast" + ], + "torch.distributed.launcher.api": [ + "Any", + "Callable", + "ChildFailedError", + "Dict", + "List", + "LocalElasticAgent", + "Optional", + "RendezvousParameters", + "SignalException", + "Std", + "Tuple", + "Union", + "WorkerSpec", + "dataclass", + "field", + "get_logger", + "parse_rendezvous_endpoint" + ], + "torch.distributed.nn": [ + "Function", + "ReduceOp", + "group" + ], + "torch.distributed.nn.api.remote_module": [ + "Any", + "Callable", + "Dict", + "Iterator", + "List", + "Mapping", + "Module", + "Optional", + "Parameter", + "RemovableHandle", + "Set", + "Tensor", + "Tuple", + "Type", + "TypeVar", + "Union", + "device", + "dtype" + ], + "torch.distributed.nn.functional": [ + "Function", + "ReduceOp", + "group" + ], + "torch.distributed.nn.jit.instantiator": [ + "Optional", + "get_remote_module_template" + ], + "torch.distributed.optim.functional_adadelta": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.functional_adagrad": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.functional_adam": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_adamax": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_adamw": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_rmsprop": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.functional_rprop": [ + "Dict", + "List", + "Optional", + "Tensor", + "Tuple" + ], + "torch.distributed.optim.functional_sgd": [ + "Dict", + "List", + "Optional", + "Tensor" + ], + "torch.distributed.optim.optimizer": [ + "List", + "Lock", + "Optional", + "RRef", + "Tensor", + "defaultdict" + ], + "torch.distributed.optim.utils": [ + "Type" + ], + "torch.distributed.pipeline.sync.checkpoint": [ + "Checkpoint", + "Checkpointing", + "Context", + "Function", + "Recompute", + "ThreadLocal", + "checkpoint", + "enable_checkpointing", + "enable_recomputing", + "restore_rng_states", + "save_rng_states" + ], + "torch.distributed.pipeline.sync.copy": [ + "Context", + "Copy", + "Wait" + ], + "torch.distributed.pipeline.sync.dependency": [ + "Fork", + "Join", + "fork", + "join" + ], + "torch.distributed.pipeline.sync.microbatch": [ + "Batch", + "NoChunk", + "check", + "gather", + "scatter" + ], + "torch.distributed.pipeline.sync.phony": [ + "get_phony" + ], + "torch.distributed.pipeline.sync.pipe": [ + "BalanceError", + "PipeSequential", + "Pipeline", + "WithDevice" + ], + "torch.distributed.pipeline.sync.pipeline": [ + "Pipeline" + ], + 
"torch.distributed.pipeline.sync.skip.layout": [ + "SkipLayout", + "inspect_skip_layout" + ], + "torch.distributed.pipeline.sync.skip.portal": [ + "Context", + "Portal", + "PortalBlue", + "PortalCopy", + "PortalOrange" + ], + "torch.distributed.pipeline.sync.skip.skippable": [ + "Skippable" + ], + "torch.distributed.pipeline.sync.skip.tracker": [ + "SkipTracker", + "SkipTrackerThroughPotals", + "ThreadLocal", + "current_skip_tracker", + "use_skip_tracker" + ], + "torch.distributed.pipeline.sync.stream": [ + "CPUStreamType", + "as_cuda", + "current_stream", + "default_stream", + "get_device", + "is_cuda", + "new_stream", + "record_stream", + "use_device", + "use_stream", + "wait_stream" + ], + "torch.distributed.pipeline.sync.worker": [ + "Task", + "create_workers", + "spawn_workers", + "worker" + ], + "torch.distributed.remote_device": [ + "Optional", + "Union" + ], + "torch.distributed.rendezvous": [ + "Dict", + "FileStore", + "Iterable", + "Optional", + "PrefixStore", + "Store", + "TCPStore", + "Tuple", + "Union", + "cast", + "timedelta", + "urlparse", + "urlunparse" + ], + "torch.distributed.rpc": [ + "Any", + "Dict", + "Future", + "Generator", + "Generic", + "GenericWithOneTypeVar", + "PyRRef", + "RemoteProfilerManager", + "RpcAgent", + "RpcBackendOptions", + "Set", + "Store", + "TensorPipeAgent", + "Tuple", + "TypeVar", + "WorkerInfo", + "enable_gil_profiling", + "get_rpc_timeout", + "method", + "timedelta", + "urlparse" + ], + "torch.distributed.rpc.api": [ + "Any", + "Dict", + "Future", + "Generic", + "GenericWithOneTypeVar", + "PyRRef", + "PythonUDF", + "RPCExecMode", + "RemoteProfilerManager", + "Set", + "TypeVar", + "WorkerInfo", + "get_rpc_timeout", + "method" + ], + "torch.distributed.rpc.backend_registry": [ + "Dict", + "List", + "Set", + "Tuple" + ], + "torch.distributed.rpc.constants": [ + "timedelta" + ], + "torch.distributed.rpc.internal": [ + "Enum" + ], + "torch.distributed.rpc.options": [ + "DeviceType", + "Dict", + "List", + "Optional", + "Union" + ], + "torch.distributed.rpc.server_process_global_profiler": [ + "profile" + ], + "torch.distributions.bernoulli": [ + "ExponentialFamily", + "Number", + "binary_cross_entropy_with_logits", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.beta": [ + "Dirichlet", + "ExponentialFamily", + "Number", + "Real", + "broadcast_all" + ], + "torch.distributions.binomial": [ + "Distribution", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.categorical": [ + "Distribution", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.cauchy": [ + "Distribution", + "Number", + "broadcast_all" + ], + "torch.distributions.chi2": [ + "Gamma" + ], + "torch.distributions.continuous_bernoulli": [ + "ExponentialFamily", + "Number", + "binary_cross_entropy_with_logits", + "broadcast_all", + "clamp_probs", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.dirichlet": [ + "ExponentialFamily", + "Function", + "once_differentiable" + ], + "torch.distributions.distribution": [ + "Any", + "Dict", + "Optional", + "lazy_property" + ], + "torch.distributions.exp_family": [ + "Distribution" + ], + "torch.distributions.exponential": [ + "ExponentialFamily", + "Number", + "broadcast_all" + ], + "torch.distributions.fishersnedecor": [ + "Distribution", + "Gamma", + "Number", + "broadcast_all" + ], + "torch.distributions.gamma": [ + "ExponentialFamily", + "Number", + 
"broadcast_all" + ], + "torch.distributions.geometric": [ + "Distribution", + "Number", + "binary_cross_entropy_with_logits", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.gumbel": [ + "AffineTransform", + "ExpTransform", + "Number", + "TransformedDistribution", + "Uniform", + "broadcast_all" + ], + "torch.distributions.half_cauchy": [ + "AbsTransform", + "Cauchy", + "TransformedDistribution" + ], + "torch.distributions.half_normal": [ + "AbsTransform", + "Normal", + "TransformedDistribution" + ], + "torch.distributions.independent": [ + "Dict", + "Distribution" + ], + "torch.distributions.kl": [ + "Bernoulli", + "Beta", + "Binomial", + "Callable", + "Categorical", + "Cauchy", + "ContinuousBernoulli", + "Dict", + "Dirichlet", + "Distribution", + "Exponential", + "ExponentialFamily", + "Gamma", + "Geometric", + "Gumbel", + "HalfNormal", + "Independent", + "Laplace", + "LowRankMultivariateNormal", + "MultivariateNormal", + "Normal", + "OneHotCategorical", + "Pareto", + "Poisson", + "TransformedDistribution", + "Tuple", + "Type", + "Uniform", + "total_ordering" + ], + "torch.distributions.kumaraswamy": [ + "AffineTransform", + "PowerTransform", + "TransformedDistribution", + "Uniform", + "broadcast_all" + ], + "torch.distributions.laplace": [ + "Distribution", + "Number", + "broadcast_all" + ], + "torch.distributions.lkj_cholesky": [ + "Beta", + "Distribution", + "broadcast_all" + ], + "torch.distributions.log_normal": [ + "ExpTransform", + "Normal", + "TransformedDistribution" + ], + "torch.distributions.logistic_normal": [ + "Normal", + "StickBreakingTransform", + "TransformedDistribution" + ], + "torch.distributions.lowrank_multivariate_normal": [ + "Distribution", + "lazy_property" + ], + "torch.distributions.mixture_same_family": [ + "Categorical", + "Dict", + "Distribution" + ], + "torch.distributions.multinomial": [ + "Binomial", + "Categorical", + "Distribution", + "broadcast_all" + ], + "torch.distributions.multivariate_normal": [ + "Distribution", + "lazy_property" + ], + "torch.distributions.negative_binomial": [ + "Distribution", + "broadcast_all", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.normal": [ + "ExponentialFamily", + "Number", + "Real", + "broadcast_all" + ], + "torch.distributions.one_hot_categorical": [ + "Categorical", + "Distribution" + ], + "torch.distributions.pareto": [ + "AffineTransform", + "ExpTransform", + "Exponential", + "TransformedDistribution", + "broadcast_all" + ], + "torch.distributions.poisson": [ + "ExponentialFamily", + "Number", + "broadcast_all" + ], + "torch.distributions.relaxed_bernoulli": [ + "Distribution", + "Number", + "SigmoidTransform", + "TransformedDistribution", + "broadcast_all", + "clamp_probs", + "lazy_property", + "logits_to_probs", + "probs_to_logits" + ], + "torch.distributions.relaxed_categorical": [ + "Categorical", + "Distribution", + "ExpTransform", + "TransformedDistribution", + "broadcast_all", + "clamp_probs" + ], + "torch.distributions.studentT": [ + "Chi2", + "Distribution", + "broadcast_all" + ], + "torch.distributions.transformed_distribution": [ + "ComposeTransform", + "Dict", + "Distribution", + "Independent", + "Transform" + ], + "torch.distributions.uniform": [ + "Distribution", + "Number", + "broadcast_all" + ], + "torch.distributions.utils": [ + "Any", + "Dict", + "Number", + "is_tensor_like", + "update_wrapper" + ], + "torch.distributions.von_mises": [ + "Distribution", + "broadcast_all", + "lazy_property" + 
], + "torch.distributions.weibull": [ + "AffineTransform", + "Exponential", + "PowerTransform", + "TransformedDistribution", + "broadcast_all" + ], + "torch.distributions.wishart": [ + "ExponentialFamily", + "Number", + "Union", + "lazy_property" + ], + "torch.fft": [ + "Tensor", + "fft", + "fft2", + "fftfreq", + "fftn", + "fftshift", + "hfft", + "ifft", + "ifft2", + "ifftn", + "ifftshift", + "ihfft", + "irfft", + "irfft2", + "irfftn", + "rfft", + "rfft2", + "rfftfreq", + "rfftn" + ], + "torch.functional": [ + "istft", + "pca_lowrank", + "svd_lowrank" + ], + "torch.futures": [ + "Callable", + "Future", + "Generic", + "List", + "Optional", + "Type", + "TypeVar", + "Union", + "cast" + ], + "torch.fx": [ + "ProxyableClassMeta", + "Tracer", + "symbolic_trace", + "wrap" + ], + "torch.fx.experimental.unification.core": [ + "Iterator", + "assoc", + "dispatch", + "isvar", + "partial", + "unify", + "walk" + ], + "torch.fx.experimental.unification.dispatch": [ + "dispatch", + "partial" + ], + "torch.fx.experimental.unification.more": [ + "dispatch", + "reify", + "unify" + ], + "torch.fx.experimental.unification.multipledispatch.conflict": [ + "groupby", + "isvariadic" + ], + "torch.fx.experimental.unification.multipledispatch.core": [ + "Dispatcher", + "MethodDispatcher" + ], + "torch.fx.experimental.unification.multipledispatch.dispatcher": [ + "AmbiguityWarning", + "Variadic", + "ambiguities", + "expand_tuples", + "isvariadic", + "ordering", + "super_signature", + "warn" + ], + "torch.fx.experimental.unification.multipledispatch.utils": [ + "OrderedDict" + ], + "torch.fx.experimental.unification.multipledispatch.variadic": [ + "typename" + ], + "torch.fx.experimental.unification.unification_tools": [ + "first", + "getter", + "groupby" + ], + "torch.fx.experimental.unification.variable": [ + "contextmanager", + "dispatch", + "hashable", + "isvar" + ], + "torch.fx.graph": [ + "Any", + "Argument", + "Callable", + "Dict", + "FrozenSet", + "List", + "NamedTuple", + "Node", + "Optional", + "Set", + "Target", + "TransformCodeFunc", + "Tuple", + "Type", + "compatibility", + "contextmanager", + "dataclass", + "map_arg" + ], + "torch.fx.graph_module": [ + "Any", + "Dict", + "Graph", + "Importer", + "List", + "Optional", + "PackageExporter", + "PackageImporter", + "Path", + "PythonCode", + "Set", + "Type", + "Union", + "compatibility" + ], + "torch.fx.immutable_collections": [ + "Any", + "Context", + "Dict", + "List", + "Tuple", + "compatibility" + ], + "torch.fx.interpreter": [ + "Any", + "Argument", + "Dict", + "Graph", + "GraphModule", + "Iterator", + "List", + "Node", + "Optional", + "Proxy", + "Target", + "Tracer", + "Tuple", + "Union", + "compatibility", + "map_aggregate", + "map_arg" + ], + "torch.fx.node": [ + "Any", + "ArgsKwargsPair", + "Argument", + "BaseArgumentTypes", + "Callable", + "Dict", + "List", + "Optional", + "Set", + "Target", + "Tuple", + "Union", + "compatibility", + "immutable_dict", + "immutable_list", + "normalize_function", + "normalize_module" + ], + "torch.fx.operator_schemas": [ + "Any", + "Callable", + "Dict", + "List", + "NamedTuple", + "OpOverload", + "OpOverloadPacket", + "Optional", + "Tuple", + "cast", + "compatibility" + ], + "torch.fx.passes.graph_drawer": [ + "Any", + "Dict", + "TensorMetadata", + "chain", + "compatibility" + ], + "torch.fx.passes.graph_manipulation": [ + "Any", + "Argument", + "Dict", + "Graph", + "GraphModule", + "List", + "NamedTuple", + "Node", + "Optional", + "ShapeProp", + "Target", + "Tuple", + "compatibility", + "lift_lowering_attrs_to_nodes", 
+ "map_aggregate", + "map_arg" + ], + "torch.fx.passes.net_min_base": [ + "Any", + "Callable", + "Dict", + "FxNetAccFusionsFinder", + "Names", + "NodeList", + "NodeSet", + "Optional", + "ShapeProp", + "TensorOrTensors", + "Tensors", + "Tuple", + "compatibility", + "dataclass", + "map_arg", + "split_by_tags" + ], + "torch.fx.passes.operator_support": [ + "IsNodeSupported", + "SupportDict", + "SupportedArgumentDTypes", + "TargetTypeName", + "TensorMetadata", + "compatibility", + "get_node_target" + ], + "torch.fx.passes.param_fetch": [ + "Any", + "Callable", + "Dict", + "GraphModule", + "List", + "Tuple", + "Type", + "compatibility" + ], + "torch.fx.passes.shape_prop": [ + "Any", + "Dict", + "NamedTuple", + "Node", + "Optional", + "Tuple", + "compatibility", + "map_aggregate" + ], + "torch.fx.passes.split_module": [ + "Any", + "Callable", + "Dict", + "GraphModule", + "List", + "Optional", + "compatibility" + ], + "torch.fx.passes.split_utils": [ + "Dict", + "List", + "NodeList", + "NodeSet", + "Optional", + "compatibility", + "dataclass", + "field", + "map_arg" + ], + "torch.fx.passes.splitter_base": [ + "Any", + "Dict", + "FxGraphDrawer", + "FxNetAccFusionsFinder", + "Iterable", + "List", + "NamedTuple", + "NodeList", + "NodeSet", + "OperatorSupportBase", + "Optional", + "Sequence", + "ShapeProp", + "Tensors", + "Tuple", + "compatibility", + "dataclass", + "defaultdict", + "get_node_target", + "get_size_of_node", + "is_node_output_tensor", + "map_arg", + "split_by_tags" + ], + "torch.fx.passes.tools_common": [ + "Any", + "Dict", + "List", + "Mapping", + "Names", + "NodeList", + "NodeSet", + "Set", + "TensorOrTensors", + "Tensors", + "Tuple", + "Union", + "compatibility", + "dataclass" + ], + "torch.fx.proxy": [ + "Any", + "Argument", + "Callable", + "Dict", + "Graph", + "Iterable", + "Iterator", + "Node", + "Optional", + "Target", + "Tuple", + "check_for_mutable_operation", + "compatibility", + "map_aggregate" + ], + "torch.fx.subgraph_rewriter": [ + "Callable", + "Dict", + "Graph", + "GraphModule", + "List", + "NamedTuple", + "Node", + "Optional", + "Set", + "compatibility", + "symbolic_trace" + ], + "torch.hub": [ + "HTTPError", + "Path", + "Request", + "tqdm", + "urlopen", + "urlparse" + ], + "torch.jit": [ + "Attribute", + "Final", + "Iterator", + "ONNXTracedModule", + "RecursiveScriptClass", + "RecursiveScriptModule", + "ScriptModule", + "ScriptWarning", + "TopLevelTracedModule", + "TracedModule", + "TracerWarning", + "TracingCheckError", + "contextmanager", + "export", + "fork", + "freeze", + "fuser", + "ignore", + "interface", + "is_scripting", + "is_tracing", + "jit_module_from_flatbuffer", + "last_executed_optimized_graph", + "load", + "optimize_for_inference", + "optimized_execution", + "run_frozen_optimizations", + "save", + "save_jit_module_to_flatbuffer", + "script", + "script_method", + "set_fusion_strategy", + "set_module", + "trace", + "trace_module", + "unused", + "wait" + ], + "torch.jit.annotations": [ + "Any", + "AnyType", + "ComplexType", + "Dict", + "DictType", + "EvalEnv", + "FloatType", + "IntType", + "List", + "ListType", + "StringType", + "TensorType", + "Tuple", + "TupleType", + "get_enum_value_type", + "is_dict", + "is_function_or_method", + "is_list", + "is_optional", + "is_tensor", + "is_tuple", + "is_union", + "is_vararg" + ], + "torch.jit.frontend": [ + "Apply", + "Assert", + "Assign", + "Attribute", + "AugAssign", + "BinOp", + "Break", + "ClassDef", + "Const", + "Continue", + "Decl", + "Def", + "Delete", + "DictComp", + "DictLiteral", + "Dots", + 
"EmptyTypeAnnotation", + "ExprStmt", + "FalseLiteral", + "For", + "FunctionModifiers", + "Ident", + "If", + "List", + "ListComp", + "ListLiteral", + "NoneLiteral", + "Param", + "Pass", + "Property", + "Raise", + "Return", + "Select", + "SliceExpr", + "Starred", + "Stmt", + "StringLiteral", + "Subscript", + "TernaryIf", + "TrueLiteral", + "Tuple", + "TupleLiteral", + "UnaryOp", + "Var", + "While", + "With", + "WithItem", + "dedent", + "get_qualified_name", + "get_source_lines_and_file", + "is_static_fn", + "make_source_context", + "namedtuple", + "parse_def", + "should_drop", + "monkeytype_trace" + ], + "torch.linalg": [ + "LinAlgError", + "Tensor", + "cholesky", + "cholesky_ex", + "cond", + "cross", + "det", + "diagonal", + "eig", + "eigh", + "eigvals", + "eigvalsh", + "householder_product", + "inv", + "inv_ex", + "ldl_factor", + "ldl_factor_ex", + "ldl_solve", + "lstsq", + "lu", + "lu_factor", + "lu_factor_ex", + "matmul", + "matrix_exp", + "matrix_norm", + "matrix_power", + "matrix_rank", + "multi_dot", + "norm", + "pinv", + "qr", + "slogdet", + "solve", + "solve_triangular", + "svd", + "svdvals", + "tensorinv", + "tensorsolve", + "vander", + "vector_norm" + ], + "torch.multiprocessing": [ + "Array", + "AuthenticationError", + "Barrier", + "BoundedSemaphore", + "BufferTooShort", + "Condition", + "Event", + "JoinableQueue", + "Lock", + "Manager", + "Pipe", + "Pool", + "Process", + "ProcessContext", + "ProcessError", + "ProcessExitedException", + "ProcessRaisedException", + "Queue", + "RLock", + "RawArray", + "RawValue", + "Semaphore", + "SimpleQueue", + "SpawnContext", + "TimeoutError", + "Value", + "active_children", + "allow_connection_pickling", + "cpu_count", + "current_process", + "freeze_support", + "get_all_start_methods", + "get_context", + "get_logger", + "get_start_method", + "init_reductions", + "log_to_stderr", + "set_executable", + "set_forkserver_preload", + "set_start_method", + "spawn", + "start_processes", + "parent_process" + ], + "torch.multiprocessing.reductions": [ + "ForkingPickler", + "Union", + "check_serializing_named_tensor", + "register_after_fork" + ], + "torch.multiprocessing.spawn": [ + "Optional" + ], + "torch.nn.common_types": [ + "Optional", + "Tensor", + "Tuple", + "TypeVar", + "Union" + ], + "torch.nn.functional": [ + "Callable", + "DType", + "List", + "Optional", + "Tensor", + "Tuple", + "Union", + "adaptive_avg_pool1d", + "avg_pool1d", + "avg_pool2d", + "avg_pool3d", + "bilinear", + "boolean_dispatch", + "celu_", + "channel_shuffle", + "conv1d", + "conv2d", + "conv3d", + "conv_tbc", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + "cosine_similarity", + "elu_", + "gelu", + "handle_torch_function", + "hardshrink", + "hardtanh_", + "has_torch_function", + "has_torch_function_unary", + "has_torch_function_variadic", + "leaky_relu_", + "linear", + "logsigmoid", + "native_channel_shuffle", + "one_hot", + "pairwise_distance", + "pdist", + "pixel_shuffle", + "pixel_unshuffle", + "prelu", + "relu_", + "rrelu_", + "selu_", + "softplus", + "softshrink", + "threshold_" + ], + "torch.nn.init": [ + "Tensor" + ], + "torch.nn.intrinsic.modules": [ + "_FusedModule" + ], + "torch.nn.intrinsic.modules.fused": [ + "BatchNorm1d", + "BatchNorm2d", + "BatchNorm3d", + "Conv1d", + "Conv2d", + "Conv3d", + "Linear", + "ReLU", + "type_before_parametrizations" + ], + "torch.nn.intrinsic.qat.modules.conv_fused": [ + "Parameter", + "TypeVar", + "fuse_conv_bn_weights" + ], + "torch.nn.intrinsic.qat.modules.linear_fused": [ + "Parameter", + "fuse_linear_bn_weights" 
+ ], + "torch.nn.intrinsic.quantized.modules.conv_relu": [ + "fuse_conv_bn_weights" + ], + "torch.nn.modules.activation": [ + "Module", + "NonDynamicallyQuantizableLinear", + "Optional", + "Parameter", + "Tensor", + "Tuple", + "constant_", + "xavier_normal_", + "xavier_uniform_" + ], + "torch.nn.modules.adaptive": [ + "Linear", + "List", + "Module", + "ModuleList", + "Sequence", + "Sequential", + "Tensor", + "log_softmax", + "namedtuple" + ], + "torch.nn.modules.batchnorm": [ + "Any", + "LazyModuleMixin", + "Module", + "Optional", + "Parameter", + "Tensor", + "UninitializedBuffer", + "UninitializedParameter", + "sync_batch_norm" + ], + "torch.nn.modules.channelshuffle": [ + "Module", + "Tensor" + ], + "torch.nn.modules.container": [ + "Any", + "Dict", + "Iterable", + "Iterator", + "Mapping", + "Module", + "Optional", + "OrderedDict", + "Parameter", + "Tuple", + "TypeVar", + "Union", + "chain", + "islice", + "overload" + ], + "torch.nn.modules.conv": [ + "LazyModuleMixin", + "List", + "Module", + "Optional", + "Parameter", + "Tensor", + "Tuple", + "UninitializedParameter", + "Union" + ], + "torch.nn.modules.distance": [ + "Module", + "Tensor" + ], + "torch.nn.modules.dropout": [ + "Module", + "Tensor" + ], + "torch.nn.modules.flatten": [ + "Module", + "Tensor", + "Tuple", + "Union" + ], + "torch.nn.modules.fold": [ + "Module", + "Tensor" + ], + "torch.nn.modules.instancenorm": [ + "Tensor" + ], + "torch.nn.modules.lazy": [ + "Protocol", + "is_lazy" + ], + "torch.nn.modules.linear": [ + "LazyModuleMixin", + "Module", + "NonDynamicallyQuantizableLinear", + "Parameter", + "Tensor", + "UninitializedParameter" + ], + "torch.nn.modules.loss": [ + "Callable", + "Module", + "Optional", + "PairwiseDistance", + "Tensor" + ], + "torch.nn.modules.module": [ + "Any", + "Callable", + "Dict", + "Iterator", + "List", + "Mapping", + "Optional", + "OrderedDict", + "Parameter", + "RemovableHandle", + "Set", + "Tensor", + "Tuple", + "TypeVar", + "Union", + "device", + "dtype", + "namedtuple", + "overload" + ], + "torch.nn.modules.normalization": [ + "List", + "Module", + "Parameter", + "Size", + "Tensor", + "Tuple", + "Union" + ], + "torch.nn.modules.padding": [ + "Module", + "Sequence", + "Tensor", + "Tuple" + ], + "torch.nn.modules.pixelshuffle": [ + "Module", + "Tensor" + ], + "torch.nn.modules.pooling": [ + "List", + "Module", + "Optional", + "Tensor" + ], + "torch.nn.modules.rnn": [ + "List", + "Module", + "Optional", + "PackedSequence", + "Parameter", + "Tensor", + "Tuple", + "overload" + ], + "torch.nn.modules.sparse": [ + "Module", + "Optional", + "Parameter", + "Tensor" + ], + "torch.nn.modules.transformer": [ + "Any", + "Callable", + "Dropout", + "LayerNorm", + "Linear", + "Module", + "ModuleList", + "MultiheadAttention", + "Optional", + "Tensor", + "Union", + "xavier_uniform_" + ], + "torch.nn.modules.upsampling": [ + "Module", + "Optional", + "Tensor" + ], + "torch.nn.modules.utils": [ + "Any", + "Dict", + "List", + "repeat" + ], + "torch.nn.parallel": [ + "DistributedDataParallelCPU" + ], + "torch.nn.parallel.comm": [ + "List" + ], + "torch.nn.parallel.data_parallel": [ + "Module", + "chain", + "gather", + "parallel_apply", + "replicate", + "scatter_kwargs" + ], + "torch.nn.parallel.distributed": [ + "Any", + "Callable", + "Enum", + "Function", + "Join", + "JoinHook", + "Joinable", + "Module", + "RRef", + "ReduceOp", + "Type", + "Variable", + "auto", + "contextmanager", + "dataclass", + "gather", + "is_namedtuple", + "scatter_kwargs", + "tree_flatten", + "tree_unflatten" + ], + 
"torch.nn.parallel.parallel_apply": [ + "ExceptionWrapper", + "autocast" + ], + "torch.nn.parallel.replicate": [ + "OrderedDict" + ], + "torch.nn.parallel.scatter_gather": [ + "Gather", + "Scatter" + ], + "torch.nn.parameter": [ + "OrderedDict" + ], + "torch.nn.qat.dynamic.modules.linear": [ + "activation_is_memoryless" + ], + "torch.nn.qat.modules.conv": [ + "Tuple", + "TypeVar", + "Union" + ], + "torch.nn.qat.modules.embedding_ops": [ + "Tensor" + ], + "torch.nn.qat.modules.linear": [ + "LinearReLU", + "is_parametrized", + "transfer_parametrizations_and_params", + "type_before_parametrizations" + ], + "torch.nn.quantizable.modules.activation": [ + "Optional", + "Tensor", + "Tuple" + ], + "torch.nn.quantizable.modules.rnn": [ + "Optional", + "Tensor", + "Tuple" + ], + "torch.nn.quantized": [ + "MaxPool2d" + ], + "torch.nn.quantized.dynamic.modules.conv": [ + "Tensor" + ], + "torch.nn.quantized.dynamic.modules.rnn": [ + "Dict", + "List", + "Optional", + "PackedSequence", + "Tensor", + "Tuple", + "Union" + ], + "torch.nn.quantized.functional": [ + "List", + "Optional", + "Tensor" + ], + "torch.nn.quantized.modules": [ + "MaxPool2d", + "_ConvNd" + ], + "torch.nn.quantized.modules.batchnorm": [ + "Tensor" + ], + "torch.nn.quantized.modules.conv": [ + "List", + "Optional", + "TypeVar", + "WeightedQuantizedModule", + "fuse_conv_bn_weights" + ], + "torch.nn.quantized.modules.embedding_ops": [ + "List", + "Optional", + "Tensor", + "hide_packed_params_repr" + ], + "torch.nn.quantized.modules.functional_modules": [ + "List", + "Tensor" + ], + "torch.nn.quantized.modules.linear": [ + "Iterable", + "Optional", + "WeightedQuantizedModule", + "fuse_linear_bn_weights", + "hide_packed_params_repr", + "type_before_parametrizations" + ], + "torch.nn.quantized.modules.utils": [ + "repeat" + ], + "torch.nn.utils.clip_grad": [ + "Iterable", + "Union" + ], + "torch.nn.utils.convert_parameters": [ + "Iterable", + "Optional" + ], + "torch.nn.utils.parametrizations": [ + "Enum", + "Module", + "Optional", + "Tensor", + "auto" + ], + "torch.nn.utils.parametrize": [ + "Dict", + "Module", + "ModuleDict", + "ModuleList", + "Optional", + "Parameter", + "Sequence", + "Tensor", + "Tuple", + "Union", + "contextmanager" + ], + "torch.nn.utils.rnn": [ + "Iterable", + "List", + "Optional", + "Tensor", + "Tuple", + "Union", + "namedtuple" + ], + "torch.nn.utils.spectral_norm": [ + "Any", + "Module", + "Optional", + "TypeVar", + "normalize" + ], + "torch.nn.utils.weight_norm": [ + "Any", + "Module", + "Parameter", + "TypeVar", + "UninitializedParameter", + "norm_except_dim" + ], + "torch.onnx": [ + "Dict", + "OperatorExportTypes", + "Optional", + "TensorProtoDataType", + "TrainingMode" + ], + "torch.optim.adadelta": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adagrad": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adam": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adamax": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.adamw": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.asgd": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.lbfgs": [ + "Optimizer", + "reduce" + ], + "torch.optim.lr_scheduler": [ + "Counter", + "Optimizer", + "bisect_right", + "wraps" + ], + "torch.optim.nadam": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.optimizer": [ + "chain", + "deepcopy", + "defaultdict" + ], + "torch.optim.radam": [ + "List", + "Optimizer", + "Optional", 
+ "Tensor" + ], + "torch.optim.rmsprop": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.rprop": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.sgd": [ + "List", + "Optimizer", + "Optional", + "Tensor" + ], + "torch.optim.sparse_adam": [ + "Optimizer" + ], + "torch.optim.swa_utils": [ + "Module", + "deepcopy" + ], + "torch.overrides": [ + "BaseTorchFunctionMode", + "TorchFunctionMode", + "TorchFunctionModeMeta", + "enable_torch_function_mode", + "get_default_nowrap_functions", + "has_torch_function", + "push_torch_function_mode" + ], + "torch.package.analyze.find_first_use_of_broken_modules": [ + "Dict", + "List", + "PackagingError" + ], + "torch.package.analyze.is_from_package": [ + "Any", + "ModuleType", + "is_mangled" + ], + "torch.package.analyze.trace_dependencies": [ + "Any", + "Callable", + "Iterable", + "List", + "Tuple" + ], + "torch.package.file_structure_representation": [ + "Dict", + "GlobGroup", + "GlobPattern", + "List" + ], + "torch.package.find_file_dependencies": [ + "List", + "Optional", + "Tuple" + ], + "torch.package.glob_group": [ + "GlobPattern", + "Iterable", + "Union" + ], + "torch.package.importer": [ + "ABC", + "Any", + "Dict", + "List", + "ModuleType", + "Optional", + "Tuple", + "abstractmethod", + "demangle", + "get_mangle_prefix", + "is_mangled" + ], + "torch.package.package_exporter": [ + "ActionHook", + "Any", + "BinaryIO", + "Callable", + "DefaultDict", + "DiGraph", + "Dict", + "Enum", + "GlobGroup", + "GlobPattern", + "Importer", + "List", + "Optional", + "OrderedDict", + "OrderedImporter", + "Path", + "RemovableHandle", + "Sequence", + "Set", + "Storage", + "Union", + "cast", + "create_pickler", + "dataclass", + "defaultdict", + "demangle", + "find_files_source_depends_on", + "is_mangled", + "is_stdlib_module", + "location_tag", + "normalize_storage_type" + ], + "torch.package.package_importer": [ + "Any", + "BinaryIO", + "Callable", + "Dict", + "Directory", + "DirectoryReader", + "GlobPattern", + "Importer", + "List", + "Optional", + "PackageMangler", + "PackageUnpickler", + "Path", + "Union", + "WeakValueDictionary", + "cast", + "contextmanager", + "demangle" + ], + "torch.profiler": [ + "DeviceType", + "ProfilerActivity", + "kineto_available", + "record_function" + ], + "torch.profiler.profiler": [ + "Any", + "Callable", + "Dict", + "Enum", + "Iterable", + "List", + "Optional", + "ProfilerActivity", + "Tuple", + "kineto_available", + "partial", + "warn" + ], + "torch.quantization": [ + "ABC", + "DeQuantStub", + "FakeQuantize", + "FakeQuantizeBase", + "FixedQParamsFakeQuantize", + "FusedMovingAvgObsFakeQuantize", + "HistogramObserver", + "MinMaxObserver", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "NoopObserver", + "ObserverBase", + "PerChannelMinMaxObserver", + "PlaceholderObserver", + "QConfig", + "QConfigAny", + "QConfigDynamic", + "QuantStub", + "QuantType", + "QuantWrapper", + "RecordingObserver", + "add_module_to_qconfig_obs_ctr", + "add_observer_", + "add_quant_dequant", + "assert_valid_qconfig", + "convert", + "convert_dynamic_jit", + "convert_jit", + "default_fixed_qparams_range_0to1_fake_quant", + "default_affine_fixed_qparams_fake_quant", + "default_debug_observer", + "default_dynamic_quant_observer", + "default_fake_quant", + "default_float_qparams_observer", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_histogram_fake_quant", + "default_histogram_observer", + "default_observer", + 
"default_per_channel_weight_fake_quant", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_symmetric_fixed_qparams_fake_quant", + "default_weight_fake_quant", + "default_weight_observer", + "disable_fake_quant", + "disable_observer", + "enable_fake_quant", + "enable_observer", + "fuse_conv_bn", + "fuse_conv_bn_jit", + "fuse_conv_bn_relu", + "fuse_linear_bn", + "fuse_modules", + "get_default_compare_output_module_list", + "get_default_dynamic_quant_module_mappings", + "get_default_float_to_quantized_operator_mappings", + "get_default_qat_module_mappings", + "get_default_qat_qconfig", + "get_default_qconfig", + "get_default_qconfig_propagation_list", + "get_default_static_quant_module_mappings", + "get_dynamic_quant_module_class", + "get_fuser_method", + "get_observer_dict", + "get_observer_state_dict", + "get_quantized_operator", + "get_static_quant_module_class", + "get_unique_devices_", + "is_activation_post_process", + "load_observer_state_dict", + "no_observer_set", + "prepare", + "prepare_dynamic_jit", + "prepare_jit", + "prepare_qat", + "propagate_qconfig_", + "qconfig_equals", + "quant_type_to_str", + "quantize", + "quantize_dynamic", + "quantize_dynamic_jit", + "quantize_jit", + "quantize_qat", + "register_activation_post_process_hook", + "script_qconfig", + "script_qconfig_dict", + "swap_module" + ], + "torch.quantization.fake_quantize": [ + "FakeQuantize", + "FakeQuantizeBase", + "FixedQParamsFakeQuantize", + "FusedMovingAvgObsFakeQuantize", + "default_fixed_qparams_range_0to1_fake_quant", + "default_affine_fixed_qparams_fake_quant", + "default_fake_quant", + "default_fused_act_fake_quant", + "default_fused_per_channel_wt_fake_quant", + "default_fused_wt_fake_quant", + "default_histogram_fake_quant", + "default_per_channel_weight_fake_quant", + "default_fixed_qparams_range_neg1to1_fake_quant", + "default_symmetric_fixed_qparams_fake_quant", + "default_weight_fake_quant", + "disable_fake_quant", + "disable_observer", + "enable_fake_quant", + "enable_observer" + ], + "torch.quantization.fuse_modules": [ + "fuse_conv_bn", + "fuse_conv_bn_relu", + "fuse_known_modules", + "fuse_modules", + "get_fuser_method" + ], + "torch.quantization.fuser_method_mappings": [ + "fuse_conv_bn", + "fuse_conv_bn_relu", + "fuse_linear_bn", + "get_fuser_method" + ], + "torch.quantization.observer": [ + "ABC", + "HistogramObserver", + "MinMaxObserver", + "MovingAverageMinMaxObserver", + "MovingAveragePerChannelMinMaxObserver", + "NoopObserver", + "ObserverBase", + "PerChannelMinMaxObserver", + "PlaceholderObserver", + "RecordingObserver", + "default_debug_observer", + "default_dynamic_quant_observer", + "default_float_qparams_observer", + "default_histogram_observer", + "default_observer", + "default_per_channel_weight_observer", + "default_placeholder_observer", + "default_weight_observer", + "get_observer_state_dict", + "load_observer_state_dict" + ], + "torch.quantization.qconfig": [ + "QConfig", + "QConfigAny", + "QConfigDynamic", + "add_module_to_qconfig_obs_ctr", + "assert_valid_qconfig", + "get_default_qat_qconfig", + "get_default_qconfig", + "qconfig_equals" + ], + "torch.quantization.quant_type": [ + "QuantType", + "quant_type_to_str" + ], + "torch.quantization.quantization_mappings": [ + "get_default_compare_output_module_list", + "get_default_dynamic_quant_module_mappings", + "get_default_float_to_quantized_operator_mappings", + "get_default_qat_module_mappings", + "get_default_qconfig_propagation_list", + 
"get_default_static_quant_module_mappings", + "get_dynamic_quant_module_class", + "get_quantized_operator", + "get_static_quant_module_class", + "no_observer_set" + ], + "torch.quantization.quantize": [ + "add_observer_", + "add_quant_dequant", + "convert", + "get_observer_dict", + "get_unique_devices_", + "is_activation_post_process", + "prepare", + "prepare_qat", + "propagate_qconfig_", + "quantize", + "quantize_dynamic", + "quantize_qat", + "register_activation_post_process_hook", + "swap_module" + ], + "torch.quantization.quantize_jit": [ + "convert_dynamic_jit", + "convert_jit", + "fuse_conv_bn_jit", + "prepare_dynamic_jit", + "prepare_jit", + "quantize_dynamic_jit", + "quantize_jit", + "script_qconfig", + "script_qconfig_dict" + ], + "torch.quantization.stubs": [ + "DeQuantStub", + "QuantStub", + "QuantWrapper" + ], + "torch.quasirandom": [ + "Optional" + ], + "torch.random": [ + "Generator" + ], + "torch.return_types": [ + "_det_lu_based_helper", + "_fake_quantize_per_tensor_affine_cachemask_tensor_qparams", + "_fused_moving_avg_obs_fq_helper", + "_linalg_svd", + "_linalg_svd_out", + "_lu_with_info", + "_unpack_dual", + "attr", + "pytree_register_structseq" + ], + "torch.serialization": [ + "Any", + "BinaryIO", + "Dict", + "IO", + "Optional", + "Storage", + "Tuple", + "Type", + "Union", + "cast", + "closing", + "contextmanager", + "get_source_lines_and_file" + ], + "torch.sparse": [ + "BFloat16Tensor", + "ByteTensor", + "CharTensor", + "DoubleTensor", + "FloatTensor", + "HalfTensor", + "IntTensor", + "LongTensor", + "ShortTensor", + "addmm", + "log_softmax", + "mm", + "softmax" + ], + "torch.special": [ + "digamma", + "entr", + "erf", + "erfc", + "erfcx", + "erfinv", + "exp2", + "expit", + "expm1", + "gammainc", + "gammaincc", + "gammaln", + "i0", + "i0e", + "i1", + "i1e", + "log1p", + "log_ndtr", + "log_softmax", + "logit", + "logsumexp", + "multigammaln", + "ndtr", + "ndtri", + "polygamma", + "psi", + "round", + "sinc", + "softmax", + "xlog1py", + "xlogy", + "zeta" + ], + "torch.storage": [ + "Any", + "Storage", + "Type", + "TypeVar", + "Union", + "cast", + "lru_cache" + ], + "torch.testing": [ + "FileCheck", + "all_types", + "all_types_and", + "all_types_and_complex", + "all_types_and_complex_and", + "all_types_and_half", + "assert_allclose", + "assert_close", + "complex_types", + "double_types", + "empty_types", + "floating_and_complex_types", + "floating_and_complex_types_and", + "floating_types", + "floating_types_and", + "floating_types_and_half", + "get_all_complex_dtypes", + "get_all_device_types", + "get_all_dtypes", + "get_all_fp_dtypes", + "get_all_int_dtypes", + "get_all_math_dtypes", + "integral_types", + "integral_types_and", + "make_non_contiguous", + "make_tensor", + "rand", + "randn" + ], + "torch.torch_version": [ + "Any", + "Iterable" + ], + "torch.types": [ + "Any", + "Device", + "List", + "Number", + "Sequence", + "Tuple", + "Union" + ], + "torch.utils": [ + "disable_minidumps", + "enable_minidumps", + "enable_minidumps_on_exceptions" + ], + "torch.utils.benchmark.utils.common": [ + "_make_temp_dir", + "ordered_unique", + "select_unit", + "set_torch_threads", + "trim_sigfig", + "unit_to_english" + ], + "torch.utils.benchmark.utils.compare": [ + "Colorize", + "Table", + "optional_min" + ], + "torch.utils.benchmark.utils.cpp_jit": [ + "Any", + "CallgrindModuleType", + "List", + "Optional", + "TimeitModuleType" + ], + "torch.utils.benchmark.utils.fuzzer": [ + "dtype_size", + "prod" + ], + "torch.utils.benchmark.utils.sparse_fuzzer": [ + "FuzzedTensor", + "Number", 
+ "Optional", + "Tuple", + "Union" + ], + "torch.utils.benchmark.utils.timer": [ + "CPPTimer", + "timer" + ], + "torch.utils.benchmark.utils.valgrind_wrapper.timer_interface": [ + "GlobalsBridge", + "Serialization", + "wrapper_singleton" + ], + "torch.utils.cpp_extension": [ + "ExtensionVersioner", + "FileBaton", + "GeneratedFileCleaner", + "List", + "Optional", + "TorchVersion", + "Tuple", + "Union", + "build_ext", + "get_hip_file_path" + ], + "torch.utils.data": [ + "_DatasetKind", + "argument_validation", + "default_collate", + "default_convert", + "functional_datapipe", + "get_worker_info", + "guaranteed_datapipes_determinism", + "non_deterministic", + "runtime_validation", + "runtime_validation_disabled" + ], + "torch.utils.data.dataloader": [ + "default_collate", + "default_convert", + "get_worker_info" + ], + "torch.utils.data.datapipes.dataframe": [ + "DFIterDataPipe" + ], + "torch.utils.dlpack": [ + "Any", + "to_dlpack" + ], + "torch.utils.hipify.hipify_python": [ + "Dict", + "HipifyFinalResult", + "HipifyResult", + "Iterable", + "Iterator", + "List", + "Mapping", + "Optional" + ], + "torch.utils.hooks": [ + "Any", + "OrderedDict" + ], + "torch.utils.show_pickle": [ + "Any", + "BinaryIO", + "IO", + "Union" + ], + "torch.utils.tensorboard.summary": [ + "HistogramProto", + "Optional", + "PrCurvePluginData", + "Summary", + "SummaryMetadata", + "TensorProto", + "TensorShapeProto", + "TextPluginData", + "convert_to_HWC", + "make_np", + "range" + ], + "torch.utils.tensorboard.writer": [ + "Event", + "EventFileWriter", + "ProjectorConfig", + "SessionLog", + "audio", + "custom_scalars", + "figure_to_image", + "get_embedding_info", + "graph", + "histogram", + "histogram_raw", + "hparams", + "image", + "image_boxes", + "load_onnx_graph", + "make_mat", + "make_np", + "make_sprite", + "make_tsv", + "mesh", + "pr_curve", + "pr_curve_raw", + "scalar", + "text", + "video", + "write_pbtxt" + ], + "torch": [ + "BFloat16Storage", + "BFloat16Tensor", + "ComplexDoubleStorage", + "ComplexFloatStorage", + "DisableTorchFunction", + "Generator", + "HalfStorage", + "HalfTensor", + "QInt32Storage", + "QInt8Storage", + "QUInt2x4Storage", + "QUInt4x2Storage", + "QUInt8Storage", + "Storage", + "_TypedStorage", + "_adaptive_avg_pool2d", + "_adaptive_avg_pool3d", + "_add_batch_dim", + "_add_relu", + "_add_relu_", + "_addmm_activation", + "_aminmax", + "_amp_foreach_non_finite_check_and_unscale_", + "_amp_update_scale_", + "_assert_async", + "_batch_norm_impl_index", + "_cast_Byte", + "_cast_Char", + "_cast_Double", + "_cast_Float", + "_cast_Half", + "_cast_Int", + "_cast_Long", + "_cast_Short", + "_choose_qparams_per_tensor", + "_coalesce", + "_compute_linear_combination", + "_conj", + "_conj_copy", + "_conj_physical", + "_convert_indices_from_coo_to_csr", + "_convert_indices_from_csr_to_coo", + "_convolution", + "_convolution_mode", + "_copy_from", + "_copy_from_and_resize", + "_ctc_loss", + "_cudnn_ctc_loss", + "_cudnn_init_dropout_state", + "_cudnn_rnn", + "_cudnn_rnn_flatten_weight", + "_cufft_clear_plan_cache", + "_cufft_get_plan_cache_max_size", + "_cufft_get_plan_cache_size", + "_cufft_set_plan_cache_max_size", + "_cummax_helper", + "_cummin_helper", + "_debug_has_internal_overlap", + "_det_lu_based_helper", + "_det_lu_based_helper_backward_helper", + "_dim_arange", + "_dirichlet_grad", + "_disable_functionalization", + "_efficientzerotensor", + "_embedding_bag", + "_embedding_bag_forward_only", + "_empty_affine_quantized", + "_empty_per_channel_affine_quantized", + "_enable_functionalization", + 
"_euclidean_dist", + "_fake_quantize_learnable_per_channel_affine", + "_fake_quantize_learnable_per_tensor_affine", + "_fake_quantize_per_tensor_affine_cachemask_tensor_qparams", + "_fft_c2c", + "_fft_c2r", + "_fft_r2c", + "_foreach_abs", + "_foreach_abs_", + "_foreach_acos", + "_foreach_acos_", + "_foreach_add", + "_foreach_add_", + "_foreach_addcdiv", + "_foreach_addcdiv_", + "_foreach_addcmul", + "_foreach_addcmul_", + "_foreach_asin", + "_foreach_asin_", + "_foreach_atan", + "_foreach_atan_", + "_foreach_ceil", + "_foreach_ceil_", + "_foreach_cos", + "_foreach_cos_", + "_foreach_cosh", + "_foreach_cosh_", + "_foreach_div", + "_foreach_div_", + "_foreach_erf", + "_foreach_erf_", + "_foreach_erfc", + "_foreach_erfc_", + "_foreach_exp", + "_foreach_exp_", + "_foreach_expm1", + "_foreach_expm1_", + "_foreach_floor", + "_foreach_floor_", + "_foreach_frac", + "_foreach_frac_", + "_foreach_lgamma", + "_foreach_lgamma_", + "_foreach_log", + "_foreach_log10", + "_foreach_log10_", + "_foreach_log1p", + "_foreach_log1p_", + "_foreach_log2", + "_foreach_log2_", + "_foreach_log_", + "_foreach_maximum", + "_foreach_minimum", + "_foreach_mul", + "_foreach_mul_", + "_foreach_neg", + "_foreach_neg_", + "_foreach_norm", + "_foreach_reciprocal", + "_foreach_reciprocal_", + "_foreach_round", + "_foreach_round_", + "_foreach_sigmoid", + "_foreach_sigmoid_", + "_foreach_sin", + "_foreach_sin_", + "_foreach_sinh", + "_foreach_sinh_", + "_foreach_sqrt", + "_foreach_sqrt_", + "_foreach_sub", + "_foreach_sub_", + "_foreach_tan", + "_foreach_tan_", + "_foreach_tanh", + "_foreach_tanh_", + "_foreach_trunc", + "_foreach_trunc_", + "_foreach_zero_", + "_from_functional_tensor", + "_fused_dropout", + "_fused_moving_avg_obs_fq_helper", + "_fw_primal_copy", + "_grid_sampler_2d_cpu_fallback", + "_has_compatible_shallow_copy_type", + "_histogramdd_bin_edges", + "_histogramdd_from_bin_cts", + "_histogramdd_from_bin_tensors", + "_index_put_impl_", + "_indices_copy", + "_is_functional_tensor", + "_is_zerotensor", + "_linalg_check_errors", + "_linalg_inv_out_helper_", + "_linalg_qr_helper", + "_linalg_svd", + "_log_softmax", + "_log_softmax_backward_data", + "_logcumsumexp", + "_lu_with_info", + "_make_dual", + "_make_dual_copy", + "_make_per_channel_quantized_tensor", + "_make_per_tensor_quantized_tensor", + "_masked_scale", + "_masked_softmax", + "_mkldnn_reshape", + "_mkldnn_transpose", + "_mkldnn_transpose_", + "_neg_view", + "_neg_view_copy", + "_nested_from_padded", + "_nested_from_padded_and_nested_example", + "_nnpack_available", + "_nnpack_spatial_convolution", + "_pack_padded_sequence", + "_pad_packed_sequence", + "_pin_memory", + "_remove_batch_dim", + "_reshape_alias_copy", + "_reshape_from_tensor", + "_rowwise_prune", + "_sample_dirichlet", + "_saturate_weight_to_fp16", + "_shape_as_tensor", + "_sobol_engine_draw", + "_sobol_engine_ff_", + "_sobol_engine_initialize_state_", + "_sobol_engine_scramble_", + "_softmax", + "_softmax_backward_data", + "_sparse_broadcast_to", + "_sparse_broadcast_to_copy", + "_sparse_coo_tensor_unsafe", + "_sparse_csr_prod", + "_sparse_csr_sum", + "_sparse_csr_tensor_unsafe", + "_sparse_log_softmax_backward_data", + "_sparse_mask_helper", + "_sparse_softmax_backward_data", + "_sparse_sparse_matmul", + "_sparse_sum", + "_stack", + "_standard_gamma", + "_standard_gamma_grad", + "_sync", + "_test_serialization_subcmul", + "_to_cpu", + "_to_functional_tensor", + "_torch_cuda_cu_linker_symbol_op", + "_trilinear", + "_unique", + "_unique2", + "_unpack_dual", + "_use_cudnn_ctc_loss", + 
"_use_cudnn_rnn_flatten_weight", + "_validate_sparse_compressed_tensor_args", + "_validate_sparse_coo_tensor_args", + "_validate_sparse_csr_tensor_args", + "_values_copy", + "_weight_norm", + "_weight_norm_interface", + "autocast", + "broadcast_shapes", + "candidate", + "compiled_with_cxx11_abi", + "from_dlpack", + "lobpcg", + "lu", + "obj", + "set_default_dtype", + "set_grad_enabled", + "set_printoptions", + "unique" + ] +} diff --git a/test/ao/sparsity/test_composability.py b/test/ao/sparsity/test_composability.py new file mode 100644 index 000000000000..b44c88550774 --- /dev/null +++ b/test/ao/sparsity/test_composability.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +# Owner(s): ["module: unknown"] + + +import logging + +import torch +import torch.ao.quantization as tq +from torch import nn +from torch.ao import sparsity +from torch.testing._internal.common_utils import TestCase + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO +) + +sparse_defaults = { + "sparsity_level": 0.8, + "sparse_block_shape": (1, 4), + "zeros_per_block": 4, +} + +# This series of tests are to check the composability goals for sparsity and quantization. Namely +# that performing quantization and sparsity model manipulations in various orderings +# does not cause problems +class TestComposability(TestCase): + def _get_model_and_sparsifier_and_sparse_config(self, qconfig=None): + model = nn.Sequential( + nn.Linear(4, 4), # 0 + nn.ReLU(), + nn.Linear(4, 4), # 2 + nn.ReLU(), + tq.QuantStub(), + nn.Linear(4, 4), # 5 + nn.ReLU(), + tq.DeQuantStub(), + ) + if qconfig is None: + model[4].qconfig = tq.get_default_qconfig("fbgemm") + model[5].qconfig = tq.get_default_qconfig("fbgemm") + else: + model[4].qconfig = qconfig + model[5].qconfig = qconfig + + sparsifier = sparsity.WeightNormSparsifier(**sparse_defaults) + + sparse_config = [ + { + "module": model[5], + "sparsity_level": 0.7, + "sparse_block_shape": (1, 4), + "zeros_per_block": 4, + }, + model[0], + ] + return model, sparsifier, sparse_config + + def _squash_mask_calibrate_and_convert(self, model, sparsifier, input): + sparsifier.step() + sparsifier.squash_mask() + model(input) + tq.convert(model, inplace=True) + + def _calculate_sparsity(self, tensor): + return ((tensor == 0).sum() / tensor.numel()).item() + + # This test checks whether performing quantization prepare before sparse prepare + # causes any issues and verifies that the correct observers are inserted and that + # the quantized model works as expected + def test_q_prep_before_s_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + + tq.prepare(mod, inplace=True) + sparsifier.prepare(mod, config=sparse_config) + + # check that correct modules had parametrizations added + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + # check that correct observers were inserted + self.assertTrue(hasattr(mod[5], "activation_post_process")) + + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # This test checks whether performing sparsity prepare before quantization prepare + # causes any issues. 
In particular, the previous quantization flow was unable to match + # the post sparse prepare module names (adding parametrizations changes the module class names) + # which would result in those parametrized modules not being quantized. This test verifies that + # the fix for this was successful. + def test_s_prep_before_q_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + + sparsifier.prepare(mod, config=sparse_config) + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # if the sparsified modules have not undergone the final squash mask operation, it's possible + # that the problem outlined in test_s_prep_before_q_prep would occur. This test verifies + # both that the fix to the convert flow avoids this issue and that the resulting quantized + # module uses the sparse version of the weight value. + def test_convert_without_squash_mask(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + + sparsifier.prepare(mod, config=sparse_config) + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + sparsifier.step() + sparsity_level = self._calculate_sparsity(mod[5].weight) + mod(torch.randn(1, 4, 4, 4)) + tq.convert(mod, inplace=True) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level) + self.assertGreaterAlmostEqual( + sparsity_level, sparse_config[0]["sparsity_level"] + ) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) + + # This tests whether performing sparse prepare before fusion causes any issues. The + # worry was that the link created between the sparsifier and the modules that need to + # be sparsified would be broken.
+ def test_s_prep_before_fusion(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + sparsifier.prepare(mod, config=sparse_config) + tq.fuse_modules(mod, [["5", "6"]], inplace=True) + mod[5].qconfig = tq.get_default_qconfig("fbgemm") + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare or fusion + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5][0], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # This tests whether performing fusion before sparse prepare causes any issues. The + # main worry was that the links to the modules in the sparse config would be broken by fusion. + def test_fusion_before_s_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config() + tq.fuse_modules(mod, [["5", "6"]], inplace=True) + sparsifier.prepare(mod, config=sparse_config) + mod[5].qconfig = tq.get_default_qconfig("fbgemm") + tq.prepare(mod, inplace=True) + + # check that correct modules had parametrizations added and + # that none were lost during prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5][0], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + sparsifier.step() + sparsity_level = self._calculate_sparsity(mod[5][0].weight) + mod(torch.randn(1, 4, 4, 4)) + tq.convert(mod, inplace=True) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level) + self.assertGreaterAlmostEqual( + sparsity_level, sparse_config[0]["sparsity_level"] + ) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) + + # This tests whether performing sparse prepare before qat prepare causes issues. + # The primary worries were that qat_prep wouldn't recognize the parametrized + # modules and that the convert step for qat would remove the parametrizations + # from the modules.
+ def test_s_prep_before_qat_prep(self): + ( + mod, + sparsifier, + sparse_config, + ) = self._get_model_and_sparsifier_and_sparse_config( + tq.get_default_qat_qconfig("fbgemm") + ) + sparsifier.prepare(mod, config=sparse_config) + tq.prepare_qat(mod, inplace=True) + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + self.assertTrue(isinstance(mod[5], torch.nn.qat.Linear)) + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) + + # This tests whether performing qat prepare before sparse prepare causes issues. + def test_qat_prep_before_s_prep(self): + mod, sparsifier, _ = self._get_model_and_sparsifier_and_sparse_config( + tq.get_default_qat_qconfig("fbgemm") + ) + tq.prepare_qat(mod, inplace=True) + + # need to set up sparse_config on new modules + sparse_config = [ + { + "module": mod[5], + "sparsity_level": 0.7, + "sparse_block_shape": (1, 4), + "zeros_per_block": 4, + }, + mod[0], + ] + sparsifier.prepare(mod, config=sparse_config) + + # check that correct modules had parametrizations added and + # that none were lost during qat prepare + self.assertTrue(hasattr(mod[0], "parametrizations")) + self.assertTrue(hasattr(mod[5], "parametrizations")) + + # check that correct observers were inserted and that matching + # occurred successfully + self.assertTrue(hasattr(mod[5], "activation_post_process")) + self.assertTrue(isinstance(mod[5], torch.nn.qat.Linear)) + + self._squash_mask_calibrate_and_convert( + mod, sparsifier, torch.randn(1, 4, 4, 4) + ) + + # check that final module is the expected quantized module and that the model runs + self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear)) + self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4])) + + # check that module was actually sparsified + cur_sparsity = self._calculate_sparsity(mod[5]._weight_bias()[0]) + self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"]) diff --git a/test/ao/sparsity/test_kernels.py b/test/ao/sparsity/test_kernels.py index 8deec46b4188..04a934345999 100644 --- a/test/ao/sparsity/test_kernels.py +++ b/test/ao/sparsity/test_kernels.py @@ -22,6 +22,7 @@ override_qengines, qengine_is_qnnpack, qengine_is_fbgemm, + qengine_is_onednn, ) # TODO: Once more test files are created, move the contents to a ao folder. @@ -48,6 +49,9 @@ def test_sparse_qlinear(self): # to other higher priority works.
if qengine_is_qnnpack() and not (row_block_size == 1 and col_block_size == 4): return + # ONEDNN does not support this yet + if qengine_is_onednn(): + return dense_prepack = torch.ops.quantized.linear_prepack dense_qlinear = torch.ops.quantized.linear @@ -215,6 +219,10 @@ def test_sparse_qlinear(self): Y_hat = sqmodel(X_fp32) self.assertEqual(Y_ref, Y_hat) + # ONEDNN does not support this yet + elif qengine_is_onednn(): + return + row_block_size, col_block_size = sqmodel.linear._packed_params._weight_bias()[2:] assert row_block_size == 1 and col_block_size == 4 diff --git a/test/autograd/test_functional.py b/test/autograd/test_functional.py new file mode 100644 index 000000000000..18b8fd07d736 --- /dev/null +++ b/test/autograd/test_functional.py @@ -0,0 +1,1420 @@ +# Owner(s): ["module: autograd"] + +import types +import unittest +import warnings + +import torch +import torch.autograd.functional as autogradF + +from torch.testing._internal.common_cuda import TEST_CUDA +from torch.testing._internal.common_utils import ( + TestCase, run_tests, subtest, gradcheck, gradgradcheck, parametrize, instantiate_parametrized_tests) +from torch.testing._internal.logging_tensor import LoggingTensor + +# Utilities for parametrizing the tensor constructors used in autograd tests +# +# TODO: maybe move somewhere so other tests can also use +# +# NB: Not all factory functions included. A complete(?) list can be found here: +# https://pytorch.org/cppdocs/notes/tensor_creation.html +base_ctors_dict = { + "ones": torch.ones, + "zeros": torch.zeros, + "randn": torch.randn, + "rand": torch.rand, + "tensor": torch.tensor, +} +base_ctors = types.SimpleNamespace(**base_ctors_dict) + +def wrap_with_logging_tensor(ctor): + def wrapper(*args, **kwargs): + requires_grad = kwargs.pop("requires_grad", False) + return LoggingTensor(ctor(*args, **kwargs), requires_grad=requires_grad) + return wrapper + +logging_tensor_ctors_dict = {k: wrap_with_logging_tensor(ctor) for (k, ctor) in base_ctors_dict.items()} +logging_tensor_ctors = types.SimpleNamespace(**logging_tensor_ctors_dict) + +base_and_logging_tensor = parametrize("ctors", [subtest(base_ctors, name="base_tensor"), + subtest(logging_tensor_ctors, name="logging_tensor")]) + +FIXME_base_and_xfail_logging_tensor = parametrize("ctors", [subtest(base_ctors, name="base_tensor"), + subtest(logging_tensor_ctors, name="logging_tensor", + decorators=[unittest.expectedFailure])]) + +# NB: This is equivalent to having both @parmetrize("vectorized", [True, False]) and +# FIXME_base_and_xfail_logging_tensor, except the non-vectorized logging_tensor case is +# actually expected to succeed +FIXME_xfail_vectorized_logging_tensor = ( + parametrize("vectorize,ctors", [subtest((True, base_ctors), name="vectorized_base_tensor"), + subtest((False, base_ctors), name="base_tensor"), + subtest((True, logging_tensor_ctors), name="vectorized_logging_tensor", + decorators=[unittest.expectedFailure]), + subtest((False, logging_tensor_ctors), name="logging_tensor")])) + +vectorized_logging_tensor = ( + parametrize("vectorize,ctors", [subtest((True, base_ctors), name="vectorized_base_tensor"), + subtest((False, base_ctors), name="base_tensor"), + subtest((True, logging_tensor_ctors), name="vectorized_logging_tensor"), + subtest((False, logging_tensor_ctors), name="logging_tensor")])) + + +class TestAutogradFunctional(TestCase): + def _assert_same_struct(self, res, base): + # base and res should be Tensors or tuple of Tensors with the same size + if isinstance(base, torch.Tensor): + 
self.assertTrue(isinstance(res, torch.Tensor)) + self.assertEqual(base.size(), res.size()) + elif isinstance(base, tuple): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(base), len(res)) + for el_base, el_res in zip(base, res): + self.assertTrue(isinstance(el_base, torch.Tensor)) + self.assertTrue(isinstance(el_res, torch.Tensor)) + self.assertEqual(el_base.size(), el_res.size()) + else: + # Wrong base + raise RuntimeError("The base given to `_assert_same_struct` doesn't have" + " the right structure.") + + def _assert_interleaved_struct(self, res, base1, base2): + # base1 and base2 can be Tensors or tuples of Tensors. + # If they are tuples, res should be a tuple as well. + # The indexing works as follows for base1, base2 being + # - tuple, tuple: res[i][j][k][l] = (base1[i][k], base2[j][l]) + # - tuple, Tensor: res[i][k][l] = (base1[i][k], base2[l]) + # - Tensor, tuple: res[i][j][l] = (base1[i], base2[j][l]) + # - Tensor, Tensor: res[k][l] = (base1[k], base2[l]) + if isinstance(base1, torch.Tensor) and isinstance(base2, torch.Tensor): + self.assertTrue(isinstance(res, torch.Tensor)) + self.assertEqual(res.size(), base1.size() + base2.size()) + elif isinstance(base1, tuple) and isinstance(base2, torch.Tensor): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(res), len(base1)) + for el_res, el_base1 in zip(res, base1): + self.assertTrue(isinstance(el_res, torch.Tensor)) + self.assertTrue(isinstance(el_base1, torch.Tensor)) + self.assertEqual(el_res.size(), el_base1.size() + base2.size()) + elif isinstance(base1, torch.Tensor) and isinstance(base2, tuple): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(res), len(base2)) + for el_res, el_base2 in zip(res, base2): + self.assertTrue(isinstance(el_res, torch.Tensor)) + self.assertTrue(isinstance(el_base2, torch.Tensor)) + self.assertEqual(el_res.size(), base1.size() + el_base2.size()) + elif isinstance(base1, tuple) and isinstance(base2, tuple): + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(len(res), len(base1)) + for el_res, el_base1 in zip(res, base1): + self.assertTrue(isinstance(el_res, tuple)) + self.assertEqual(len(res), len(base2)) + for el_el_res, el_base2 in zip(el_res, base2): + self.assertTrue(isinstance(el_el_res, torch.Tensor)) + self.assertTrue(isinstance(el_base2, torch.Tensor)) + self.assertEqual(el_el_res.size(), el_base1.size() + el_base2.size()) + else: + # Wrong bases + raise RuntimeError("The bases given to `_assert_interleaved_struct` don't have" + " the right structure.") + + @base_and_logging_tensor + def test_vjp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3) + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + inp = ctors.rand(4) + v = ctors.ones(3) + with self.assertRaisesRegex(TypeError, "The inputs given to vjp must be either a Tensor"): + res = autogradF.vjp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vjp must"): + res = autogradF.vjp(bar, inp, v) + + with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the user-provided function returns"): + res = autogradF.vjp(foo, inp) + + with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): + res = autogradF.vjp(foo, inp, (torch.ones_like(inp), torch.ones_like(inp))) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): + res = autogradF.vjp(foo, inp, v[:2]) + + res = autogradF.vjp(foo, inp, v)[1] + 
self._assert_same_struct(res, inp) + + @base_and_logging_tensor + def test_vjp_err_check_strict(self, ctors): + def foo(a): + return a.detach() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.vjp(foo, inp, v, strict=True) + res = autogradF.vjp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.vjp(bar, inp, v, strict=True) + res = autogradF.vjp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + # The Jacobian does not depend on the input + def foo(a): + return a.clone() + + inp.requires_grad_() + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): + res = autogradF.vjp(foo, inp, v, create_graph=True, strict=True) + res = autogradF.vjp(foo, inp, v, create_graph=True, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1], v) + + @base_and_logging_tensor + def test_vjp_no_grad(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4) + with torch.no_grad(): + res = autogradF.vjp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + inputs.requires_grad_() + v.requires_grad_() + with torch.no_grad(): + res = autogradF.vjp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_vjp_output(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4) + res = autogradF.vjp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = ctors.ones(2) + out, vjp_val = autogradF.vjp(adder, inputs, v) + self._assert_same_struct(vjp_val, inputs) + self.assertIsNone(out.grad_fn) + self.assertIsNone(vjp_val[0].grad_fn) + self.assertIsNone(vjp_val[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y, x + y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = (ctors.tensor([1., 0.]), ctors.tensor([1., 0.])) + out, vjp_val = autogradF.vjp(adder, inputs, v) + self._assert_same_struct(vjp_val, inputs) + self.assertIsNone(out[0].grad_fn) + self.assertIsNone(out[1].grad_fn) + self.assertIsNone(vjp_val[0].grad_fn) + self.assertIsNone(vjp_val[1].grad_fn) + + @base_and_logging_tensor + def test_vjp_scalar(self, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + v = ctors.ones([]) + res = autogradF.vjp(reducer, inputs, v) + self._assert_same_struct(res[0], v) + self._assert_same_struct(res[1], inputs) + + res = autogradF.vjp(reducer, inputs) + self._assert_same_struct(res[0], v) + self._assert_same_struct(res[1], inputs) + + def expander(x): + return x.unsqueeze(0).repeat(4) + inputs = ctors.rand([]) + v = ctors.ones(4) + res = autogradF.vjp(expander, inputs, v) + self._assert_same_struct(res[0], v) 
+ self._assert_same_struct(res[1], inputs) + + @base_and_logging_tensor + def test_vjp_create_graph(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(2, 2, dtype=torch.double) + v = ctors.ones(2, dtype=torch.double) + + inputs.requires_grad_() + v.requires_grad_() + res = autogradF.vjp(reducer, inputs, v, create_graph=True) + self._assert_same_struct(res[1], inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) + + def adder(x, y): + return 2 * x + 3 * y, x * y + + inputs = (ctors.rand(2, dtype=torch.double, requires_grad=True), + ctors.rand(2, dtype=torch.double, requires_grad=True)) + v = (ctors.tensor([1., 0.], dtype=torch.double, requires_grad=True), + ctors.tensor([1., 0.], dtype=torch.double, requires_grad=True)) + + gradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.vjp(adder, (x, y), v, create_graph=True) + + return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + + @base_and_logging_tensor + def test_jvp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3) + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to jvp must be either a Tensor"): + res = autogradF.jvp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jvp must"): + res = autogradF.jvp(bar, inp, v) + + with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the input to the user-provided function"): + res = autogradF.jvp(foo, inp) + + with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): + res = autogradF.jvp(foo, inp, (v, v)) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): + res = autogradF.jvp(foo, inp, v[:2]) + + res = autogradF.jvp(foo, inp, v)[1] + self._assert_same_struct(res, foo(inp)) + + @base_and_logging_tensor + def test_jvp_err_check_strict(self, ctors): + def foo(a): + return a.detach() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.jvp(foo, inp, v, strict=True) + res = autogradF.jvp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], res[0]) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.jvp(bar, inp, v, strict=True) + res = autogradF.jvp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], res[0]) + self.assertEqual(res[1].abs().sum(), 0.) 
+ + # The Jacobian does not depend on the input + def foo(a): + return a.clone() + + inp.requires_grad_() + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): + res = autogradF.jvp(foo, inp, v, create_graph=True, strict=True) + res = autogradF.jvp(foo, inp, v, create_graph=True, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1], v) + + @base_and_logging_tensor + def test_jvp_no_grad(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + with torch.no_grad(): + res = autogradF.jvp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + inputs.requires_grad_() + v.requires_grad_() + with torch.no_grad(): + res = autogradF.jvp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_jvp_output(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.jvp(reducer, inputs, v) + self._assert_same_struct(res[1], res[0]) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = (ctors.ones(2), ctors.ones(2)) + out, jvp_val = autogradF.jvp(adder, inputs, v) + self._assert_same_struct(jvp_val, out) + self.assertIsNone(out.grad_fn) + self.assertIsNone(jvp_val[0].grad_fn) + self.assertIsNone(jvp_val[1].grad_fn) + + def adder(x, y): + return 2 * x + 3 * y, x + y + + inputs = (ctors.rand(2), ctors.rand(2)) + v = (ctors.tensor([1., 0.]), ctors.tensor([1., 0.])) + out, jvp_val = autogradF.jvp(adder, inputs, v) + self._assert_same_struct(jvp_val, out) + self.assertIsNone(out[0].grad_fn) + self.assertIsNone(out[1].grad_fn) + self.assertIsNone(jvp_val[0].grad_fn) + self.assertIsNone(jvp_val[1].grad_fn) + + @base_and_logging_tensor + def test_jvp_scalar(self, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.jvp(reducer, inputs, v) + self._assert_same_struct(res[0], ctors.zeros([])) + self._assert_same_struct(res[1], res[0]) + + def expander(x): + return x.unsqueeze(0).repeat(4) + inputs = ctors.rand([]) + v = ctors.ones([]) + res = autogradF.jvp(expander, inputs, v) + self._assert_same_struct(res[0], ctors.zeros(4)) + self._assert_same_struct(res[1], res[0]) + + res = autogradF.jvp(expander, inputs) + self._assert_same_struct(res[0], ctors.zeros(4)) + self._assert_same_struct(res[1], res[0]) + + @base_and_logging_tensor + def test_jvp_create_graph(self, ctors): + def reducer(x): + return x.sum(dim=1) + inputs = ctors.rand(2, 2, dtype=torch.double) + v = ctors.ones(2, 2, dtype=torch.double) + + inputs.requires_grad_() + v.requires_grad_() + res = autogradF.jvp(reducer, inputs, v, create_graph=True) + self._assert_same_struct(res[1], res[0]) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) + + def adder(x, y): + return 2 * x + 3 * y, x * y + + inputs = (ctors.rand(2, dtype=torch.double, requires_grad=True), + ctors.rand(2, dtype=torch.double, requires_grad=True)) + v = (ctors.tensor([1., 0.], 
dtype=torch.double, requires_grad=True), + ctors.tensor([1., 0.], dtype=torch.double, requires_grad=True)) + + gradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.jvp(adder, (x, y), v, create_graph=True) + + return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + + def _test_construct_standard_basis_for(self, inputs): + numels = tuple(tensor.numel() for tensor in inputs) + results = autogradF._construct_standard_basis_for(inputs, numels) + for result, inp in zip(results, inputs): + self.assertEqual(result.dtype, inp.dtype) + self.assertEqual(result.device, inp.device) + results = torch.cat([result.to(device='cpu', dtype=torch.float) + for result in results], dim=1) + expected = torch.eye(results[0].shape[0], dtype=torch.float) + self.assertEqual(results, expected) + + @base_and_logging_tensor + def test_construct_standard_basis_for(self, ctors): + test_cases = [ + (ctors.randn(2, 3),), + (ctors.randn(1),), + (ctors.randn([]),), + (ctors.randn(1), ctors.randn([]), ctors.randn([])), + (ctors.randn(2), ctors.randn(3), ctors.randn([])), + (ctors.randn(2), ctors.randn([]), ctors.randn(3)), + (ctors.randn(2, 3), ctors.randn(3), ctors.randn(3, 4, 2)), + (ctors.randn(2, dtype=torch.float64), ctors.randn(3, dtype=torch.float32)), + ] + + for inputs in test_cases: + self._test_construct_standard_basis_for(inputs) + + @unittest.skipIf(not TEST_CUDA, "test requires CUDA") + @base_and_logging_tensor + def test_construct_standard_basis_for_cuda(self, ctors): + test_cases = [ + (ctors.randn(2), ctors.randn(3, device='cuda')), + (ctors.randn(3, device='cuda'), ctors.randn(2)), + ] + + for inputs in test_cases: + self._test_construct_standard_basis_for(inputs) + + def _test_vectorize_raises_no_warnings(self, api, ctors): + # vmap is an experimental prototype. When someone calls torch.vmap, + # it raises a python warning. This test checks that + # autogradF.{jacobian, hessian} don't raise that experimental prototype + # warning; it is not nice for a public-facing API to raise a warning + # no matter how it is called. 
+ def foo(a): + return (a ** 2).sum() + + x = ctors.randn(3) + with warnings.catch_warnings(record=True) as wa: + result = api(foo, x, vectorize=True) + self.assertEqual(len(wa), 0) + + @base_and_logging_tensor + def test_jacobian_vectorize_raises_no_warnings(self, ctors): + return self._test_vectorize_raises_no_warnings(autogradF.jacobian, ctors) + + @base_and_logging_tensor + def test_hessian_vectorize_raises_no_warnings(self, ctors): + return self._test_vectorize_raises_no_warnings(autogradF.hessian, ctors) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_jacobian_err_check(self, vectorize, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3) + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + inp = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to jacobian must be either a Tensor"): + res = autogradF.jacobian(foo, (inp, 2), vectorize=vectorize) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jacobian must"): + res = autogradF.jacobian(bar, inp, vectorize=vectorize) + + res = autogradF.jacobian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, foo(inp), inp) + + def foo(a, b): + return b, 3 * a.narrow(0, 0, 3) + + inp = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.jacobian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, foo(*inp), inp) + + @base_and_logging_tensor + def test_jacobian_err_check_strict(self, ctors): + def foo(a): + return a.detach() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone() + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.jacobian(foo, inp, strict=True) + res = autogradF.jacobian(foo, inp, strict=False) + self._assert_interleaved_struct(res, foo(inp), inp) + self.assertEqual(res.abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function is independent of input 0."): + res = autogradF.jacobian(bar, inp, strict=True) + res = autogradF.jacobian(bar, inp, strict=False) + self._assert_interleaved_struct(res, foo(inp), inp) + self.assertEqual(res.abs().sum(), 0.) 
+ + # The Jacobian does not depend on the input + def foo(a): + return a.clone() + + inp.requires_grad_() + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): + res = autogradF.jacobian(foo, inp, create_graph=True, strict=True) + res = autogradF.jacobian(foo, inp, create_graph=True, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res, torch.eye(4)) + + @base_and_logging_tensor + def test_jacobian_err_check_strict_vectorize(self, ctors): + def foo(x): + return x + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "not supported together"): + res = autogradF.jacobian(foo, inp, strict=True, vectorize=True) + + @base_and_logging_tensor + def test_jacobian_no_grad(self, ctors): + def exp_reducer(x): + return x.exp().sum(dim=1) + + inputs = ctors.rand(4, 4) + with torch.no_grad(): + res = autogradF.jacobian(exp_reducer, inputs) + self.assertIsNone(res.grad_fn) + self.assertNotEqual(res, ctors.zeros(4, 4)) + + with torch.no_grad(): + res = autogradF.jacobian(exp_reducer, inputs, create_graph=True) + self.assertIsNotNone(res.grad_fn) + self.assertNotEqual(res, ctors.zeros(4, 4)) + + @vectorized_logging_tensor + def test_jacobian_output(self, vectorize, ctors): + def exp_reducer(x): + return x.exp().sum(dim=1) + + inputs = ctors.rand(4, 4) + res = autogradF.jacobian(exp_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) + self.assertIsNone(res.grad_fn) + + def identity(x): + return x.clone() + + inputs = ctors.rand(4) + res = autogradF.jacobian(identity, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, identity(inputs), inputs) + self.assertIsNone(res.grad_fn) + self.assertEqual(res, torch.eye(4)) + + def add_exp_reducer(x, y): + return (x + y.exp()).sum(dim=1) + + inputs = (ctors.rand(4, 4), ctors.rand(4, 4)) + res = autogradF.jacobian(add_exp_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + @vectorized_logging_tensor + def test_jacobian_scalar(self, vectorize, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + res = autogradF.jacobian(reducer, inputs, vectorize=vectorize) + self._assert_same_struct(res, inputs) + + def expander(x): + return x.unsqueeze(0).repeat(4) + inputs = ctors.rand([]) + res = autogradF.jacobian(expander, inputs, vectorize=vectorize) + self._assert_same_struct(res, ctors.zeros(4)) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_jacobian_create_graph(self, vectorize, ctors): + def exp_reducer(x): + return x.exp().sum(dim=1) + + inputs = ctors.rand(4, 4, dtype=torch.double, requires_grad=True) + res = autogradF.jacobian(exp_reducer, inputs, create_graph=True, vectorize=vectorize) + self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) + self.assertIsNotNone(res.grad_fn) + + gradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + gradgradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + + def add_exp_reducer(x, y): + return (x + y).exp().sum(dim=1) + + inputs = (ctors.rand(4, 4, dtype=torch.double, requires_grad=True), + ctors.rand(4, 4, dtype=torch.double, requires_grad=True)) + res = autogradF.jacobian(add_exp_reducer, inputs, create_graph=True, vectorize=vectorize) + 
self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + gradgradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + + def foo(x, y): + x = x.cos() + val, jac = autogradF.jacobian(add_exp_reducer, (x, y), create_graph=True, vectorize=vectorize) + + res = val[0].exp().sum() + val[1].exp().sum() + jac[0].exp().sum() + res = res + jac[1].exp().sum() + x.exp().sum() + y.exp().sum() + return res + + gradcheck(foo, inputs) + gradgradcheck(foo, inputs) + + def _check_jacobian_vectorize_correctness(self, f, inputs, test_forward_ad=True): + expected = autogradF.jacobian(f, inputs, vectorize=False) + result_backward_mode = autogradF.jacobian(f, inputs, vectorize=True) + self.assertEqual(result_backward_mode, expected) + + if test_forward_ad: + result_forward_mode = autogradF.jacobian(f, inputs, strategy="forward-mode", vectorize=True) + self.assertEqual(result_forward_mode, expected) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_simple(self, ctors): + def f(x): + return 3 * x ** 2 + + x = ctors.randn(2, 3, 5) + self._check_jacobian_vectorize_correctness(f, x) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_multi_input(self, ctors): + def f(x, y): + return (x.cos() * x) @ y.sin() + + x = ctors.randn(2, 3) + y = ctors.randn(3, 5) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_multi_input_multi_output(self, ctors): + def f(x, y): + return (x * x) @ y, x @ (x.sum(1) * y), y.sum() + + x = ctors.randn(5, 3) + y = ctors.randn(3, 5) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_unrelated_outputs(self, ctors): + def f(x, y): + return x, y, x, y + + x = ctors.randn(2) + y = ctors.randn(3) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_zero_dim(self, ctors): + # zero-dim output + def f(x, y): + return x.sum(), y.sum(), x * y + + x = ctors.randn(3) + y = ctors.randn(3) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + # zero-dim input + def g(x): + return torch.stack([x, x, x]) + + x = ctors.randn([]) + self._check_jacobian_vectorize_correctness(g, x) + + # Mixed zero-dim input / zero-dim output + def h(x, y): + return y.sum(), x * y + + x = ctors.randn([]) + y = ctors.randn(1) + self._check_jacobian_vectorize_correctness(h, (x, y)) + + @unittest.skipIf(not TEST_CUDA, "test requires CUDA") + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_different_devices(self, ctors): + def f(x, y): + return x * y, (x * y).cuda() + + x = ctors.randn(3) + y = ctors.randn(3) + self._check_jacobian_vectorize_correctness(f, (x, y)) + + @base_and_logging_tensor + def test_jacobian_vectorize_correctness_different_dtype(self, ctors): + def f(x, y): + return (x * y).float(), (x * y).double() + + x = ctors.randn(3) + y = ctors.randn(3) + # The Jacobian computed using forward AD has the dtype of the output + # but the Jacobian computed with reverse AD has dtype of input + self._check_jacobian_vectorize_correctness(f, (x, y), test_forward_ad=False) + + def _check_hessian_vectorize_correctness(self, f, inputs): + expected = autogradF.hessian(f, inputs, 
vectorize=False) + result = autogradF.hessian(f, inputs, vectorize=True) + self.assertEqual(result, expected) + + result_forward_mode = autogradF.hessian(f, inputs, outer_jacobian_strategy="forward-mode", vectorize=True) + self.assertEqual(result_forward_mode, expected) + + @base_and_logging_tensor + def test_hessian_vectorize_correctness_simple(self, ctors): + def f(x): + return (3 * x ** 2).sum() + + x = ctors.randn(2, 3, 5) + self._check_hessian_vectorize_correctness(f, x) + + @base_and_logging_tensor + def test_hessian_vectorize_correctness_multi_input(self, ctors): + def f(x, y, z): + return ((x.relu() * x) @ y.sin() @ z).sum() + + x = ctors.randn(2, 3) + y = ctors.randn(3, 5) + z = ctors.randn(5, 5) + self._check_hessian_vectorize_correctness(f, (x, y, z)) + + @base_and_logging_tensor + def test_hessian_vectorize_correctness_unrelated_outputs(self, ctors): + # output unrelated to one input + def f(x, y): + return (x ** 2).sum() + + x = ctors.randn(2) + y = ctors.randn(3) + self._check_hessian_vectorize_correctness(f, (x, y)) + + # output unrelated to all inputs + def f(x, y): + return ctors.ones([]) + + x = ctors.randn(2) + y = ctors.randn(3) + self._check_hessian_vectorize_correctness(f, (x, y)) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_hessian_err_check(self, vectorize, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + def bar2(a): + return 3 * a.narrow(0, 0, 3) + + def bar3(a): + return 3 * a.narrow(0, 0, 3), 3 * a.narrow(0, 0, 3) + + inp = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to hessian must be either a Tensor"): + res = autogradF.hessian(foo, (inp, 2), vectorize=vectorize) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hessian must"): + res = autogradF.hessian(bar, inp, vectorize=vectorize) + + err_msg_out = "The Tensor returned by the function given to hessian should contain a single element" + with self.assertRaisesRegex(RuntimeError, err_msg_out): + res = autogradF.hessian(bar2, inp, vectorize=vectorize) + + with self.assertRaisesRegex(RuntimeError, "The function given to hessian should return a single Tensor"): + res = autogradF.hessian(bar3, inp, vectorize=vectorize) + + res = autogradF.hessian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, inp, inp) + + def foo(a, b): + return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() + + inp = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.hessian(foo, inp, vectorize=vectorize) + self._assert_interleaved_struct(res, inp, inp) + + @base_and_logging_tensor + def test_hessian_err_check_strict(self, ctors): + def foo(a): + return a.detach().sum() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone().sum() + + def bar2(a): + # A Linear function for which the jacobian is independent of the input + return (3 * a).sum() + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.hessian(foo, inp, strict=True) + res = autogradF.hessian(foo, inp, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res.abs().sum(), 0.) 
+ + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0"): + res = autogradF.hessian(bar, inp, strict=True) + res = autogradF.hessian(bar, inp, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res.abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): + res = autogradF.hessian(bar2, inp, strict=True) + res = autogradF.hessian(bar2, inp, strict=False) + self._assert_interleaved_struct(res, inp, inp) + self.assertEqual(res.abs().sum(), 0.) + + @base_and_logging_tensor + def test_hessian_err_check_strict_vectorize(self, ctors): + def foo(x): + return (x ** 3).sum() + + inp = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "not supported together"): + res = autogradF.hessian(foo, inp, strict=True, vectorize=True) + + @base_and_logging_tensor + def test_hessian_no_grad(self, ctors): + def pow_reducer(x): + return x.pow(3).sum() + + inputs = ctors.rand(2, 2) + with torch.no_grad(): + res = autogradF.hessian(pow_reducer, inputs) + self.assertIsNone(res[0][0].grad_fn) + self.assertIsNone(res[0][1].grad_fn) + self.assertIsNone(res[1][0].grad_fn) + self.assertIsNone(res[1][1].grad_fn) + self.assertNotEqual(res, ctors.zeros(2, 2, 2)) + + with torch.no_grad(): + res = autogradF.hessian(pow_reducer, inputs, create_graph=True) + self.assertIsNotNone(res[0][0].grad_fn) + self.assertIsNotNone(res[0][1].grad_fn) + self.assertIsNotNone(res[1][0].grad_fn) + self.assertIsNotNone(res[1][1].grad_fn) + self.assertNotEqual(res, ctors.zeros(2, 2, 2)) + + @vectorized_logging_tensor + def test_hessian_output(self, vectorize, ctors): + def pow_reducer(x): + return x.pow(3).sum() + + inputs = ctors.rand(2, 2) + res = autogradF.hessian(pow_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNone(res.grad_fn) + + def add_pow_reducer(x, y): + return (x + y).pow(3).sum() + + inputs = (ctors.rand(2, 2), ctors.rand(2, 2)) + res = autogradF.hessian(add_pow_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNone(res[0][0].grad_fn) + self.assertIsNone(res[0][1].grad_fn) + self.assertIsNone(res[1][0].grad_fn) + self.assertIsNone(res[1][1].grad_fn) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_hessian_scalar(self, vectorize, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + res = autogradF.hessian(reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + + inputs = ctors.rand([]) + res = autogradF.hessian(reducer, inputs, vectorize=vectorize) + self._assert_same_struct(res, inputs) + + def bad_reducer(x): + return x.sum().view(1, 1, 1) + inputs = ctors.rand(4, 4) + res = autogradF.hessian(bad_reducer, inputs, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + + @parametrize("vectorize", [True, False]) + @base_and_logging_tensor + def test_hessian_create_graph(self, vectorize, ctors): + def pow_reducer(x): + return x.pow(3).sum() + + inputs = ctors.rand(2, 2, dtype=torch.double, requires_grad=True) + res = autogradF.hessian(pow_reducer, inputs, create_graph=True, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNotNone(res.grad_fn) + + gradcheck(lambda inp: autogradF.hessian(pow_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + gradgradcheck(lambda inp: 
autogradF.hessian(pow_reducer, inp, create_graph=True, vectorize=vectorize), inputs) + + def add_pow_reducer(x, y): + return (x + y).pow(3).sum() + + inputs = (ctors.rand(2, 2, dtype=torch.double, requires_grad=True), + ctors.rand(2, 2, dtype=torch.double, requires_grad=True)) + res = autogradF.hessian(add_pow_reducer, inputs, create_graph=True, vectorize=vectorize) + self._assert_interleaved_struct(res, inputs, inputs) + self.assertIsNotNone(res[0][0].grad_fn) + self.assertIsNotNone(res[0][1].grad_fn) + self.assertIsNotNone(res[1][0].grad_fn) + self.assertIsNotNone(res[1][1].grad_fn) + + def flatten(inp): + return tuple(el_lvl2 for el_lvl1 in inp for el_lvl2 in el_lvl1) + + gradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) + gradgradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) + + def foo(x, y): + x = x.cos() + val, hess = autogradF.hessian(add_pow_reducer, (x, y), create_graph=True, vectorize=vectorize) + + res = val[0].cos().sum() + val[1].cos().sum() + hess[0].cos().sum() + res = res + hess[1].cos().sum() + x.cos().sum() + y.cos().sum() + return res + + gradcheck(foo, inputs) + gradgradcheck(foo, inputs) + + @base_and_logging_tensor + def test_vhp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + def bar2(a): + return 3 * a.narrow(0, 0, 3) + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(TypeError, "The inputs given to vhp must be either a Tensor"): + res = autogradF.vhp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vhp must"): + res = autogradF.vhp(bar, inp, v) + + err_msg_out = "The Tensor returned by the function given to vhp should contain a single element" + with self.assertRaisesRegex(RuntimeError, err_msg_out): + res = autogradF.vhp(bar2, inp, v) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): + res = autogradF.vhp(foo, inp, ctors.rand(5)) + + with self.assertRaisesRegex(TypeError, "The v given to vhp must be either a Tensor or a tuple of Tensors"): + res = autogradF.vhp(foo, inp, (v, 2)) + + res = autogradF.vhp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + def foo(a, b): + return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() + + inp = (ctors.rand(4), ctors.rand(5)) + v = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.vhp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + @base_and_logging_tensor + def test_vhp_err_check_strict(self, ctors): + def foo(a): + return a.detach().sum() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone().sum() + + def bar2(a): + # A Linear function for which the jacobian is independent of the input + return (3 * a).sum() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.vhp(foo, inp, v, strict=True) + res = autogradF.vhp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) 
+ + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.vhp(bar, inp, v, strict=True) + res = autogradF.vhp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): + res = autogradF.vhp(bar2, inp, v, strict=True) + res = autogradF.vhp(bar2, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + @base_and_logging_tensor + def test_vhp_no_grad(self, ctors): + def reducer(x): + return x.exp().sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + with torch.no_grad(): + res = autogradF.vhp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + with torch.no_grad(): + res = autogradF.vhp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_vhp_output(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.vhp(foo, inputs, v) + self._assert_same_struct(res[1], inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3), ctors.rand(4)) + v = (ctors.ones(3), ctors.ones(4)) + out, vhp_val = autogradF.vhp(bar, inputs, v) + self._assert_same_struct(vhp_val, inputs) + self.assertIsNone(out.grad_fn) + self.assertIsNone(vhp_val[0].grad_fn) + self.assertIsNone(vhp_val[1].grad_fn) + + @base_and_logging_tensor + def test_vhp_scalar(self, ctors): + def reducer(x): + return x.sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.vhp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + inputs = ctors.rand([]) + v = ctors.rand([]) + res = autogradF.vhp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + res = autogradF.vhp(reducer, inputs) + self._assert_same_struct(res[1], inputs) + + def bad_reducer(x): + return x.sum().view(1, 1, 1) + inputs = ctors.rand(4, 4) + v = ctors.rand(4, 4) + res = autogradF.vhp(bad_reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + @base_and_logging_tensor + def test_vhp_create_graph(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4, dtype=torch.double, requires_grad=True) + v = ctors.ones(4, 4, dtype=torch.double, requires_grad=True) + res = autogradF.vhp(foo, inputs, v, create_graph=True) + self._assert_same_struct(res[1], inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3, dtype=torch.double, requires_grad=True), + ctors.rand(4, dtype=torch.double, requires_grad=True)) + v = (ctors.ones(3, dtype=torch.double, requires_grad=True), + ctors.ones(4, dtype=torch.double, requires_grad=True)) + out, vhp_val = autogradF.vhp(bar, inputs, v, create_graph=True) + self._assert_same_struct(vhp_val, inputs) + 
self.assertIsNotNone(out.grad_fn) + self.assertIsNotNone(vhp_val[0].grad_fn) + self.assertIsNotNone(vhp_val[1].grad_fn) + + gradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.vhp(bar, (x, y), v, create_graph=True) + + return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + + @base_and_logging_tensor + def test_hvp_err_check(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + def bar(a): + return 3 * a.narrow(0, 0, 3), "bar" + + def bar2(a): + return 3 * a.narrow(0, 0, 3) + + inp = ctors.rand(4) + v = ctors.rand(4) + res = autogradF.hvp(foo, inp, v) + with self.assertRaisesRegex(TypeError, "The inputs given to hvp must be either a Tensor"): + res = autogradF.hvp(foo, (inp, 2), v) + + with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hvp must"): + res = autogradF.hvp(bar, inp, v) + + err_msg_out = "The Tensor returned by the function given to hvp should contain a single element" + with self.assertRaisesRegex(RuntimeError, err_msg_out): + res = autogradF.hvp(bar2, inp, v) + + with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): + res = autogradF.hvp(foo, inp, ctors.rand(5)) + + with self.assertRaisesRegex(TypeError, "The v given to hvp must be either a Tensor or a tuple of Tensors"): + res = autogradF.hvp(foo, inp, (v, 2)) + + res = autogradF.hvp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + def foo(a, b): + return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() + + inp = (ctors.rand(4), ctors.rand(5)) + v = (ctors.rand(4), ctors.rand(5)) + + res = autogradF.hvp(foo, inp, v) + self._assert_same_struct(res[1], inp) + + @base_and_logging_tensor + def test_hvp_err_check_strict(self, ctors): + def foo(a): + return a.detach().sum() + + def bar(a): + # Make a non-leaf Tensor that requires_grad but that is not connected to the input + return a.long().float().requires_grad_().clone().sum() + + def bar2(a): + # A Linear function for which the jacobian is independent of the input + return (3 * a).sum() + + inp = ctors.rand(4) + v = ctors.rand(4) + with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): + res = autogradF.hvp(foo, inp, v, strict=True) + res = autogradF.hvp(foo, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): + res = autogradF.hvp(bar, inp, v, strict=True) + res = autogradF.hvp(bar, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) + + with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): + res = autogradF.hvp(bar2, inp, v, strict=True) + res = autogradF.hvp(bar2, inp, v, strict=False) + self._assert_same_struct(res[1], inp) + self.assertEqual(res[1].abs().sum(), 0.) 
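The strict-mode checks above all exercise the same contract: if an output of the user-provided function is independent of an input, strict=True raises a RuntimeError while strict=False silently returns a zero gradient shaped like that input. A minimal standalone sketch of that contract for vjp, assuming only that torch.autograd.functional is importable from this tree (the tensor values are arbitrary):

import torch
import torch.autograd.functional as autogradF

def detached(x):
    return x.detach()  # the output no longer requires grad, so it cannot depend on x

inp = torch.rand(4)
v = torch.rand(4)

# strict=False: the vjp entry is simply zeros with the input's shape.
_, grad = autogradF.vjp(detached, inp, v, strict=False)
assert grad.shape == inp.shape
assert grad.abs().sum().item() == 0.0

# strict=True: the same call raises instead of hiding the disconnection.
raised = False
try:
    autogradF.vjp(detached, inp, v, strict=True)
except RuntimeError as err:
    raised = "does not require gradients" in str(err)
assert raised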
+ + @base_and_logging_tensor + def test_hvp_no_grad(self, ctors): + def reducer(x): + return x.exp().sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + with torch.no_grad(): + res = autogradF.hvp(reducer, inputs, v) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + with torch.no_grad(): + res = autogradF.hvp(reducer, inputs, v, create_graph=True) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + self.assertNotEqual(res[1], ctors.zeros(4, 4)) + + @base_and_logging_tensor + def test_hvp_output(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.hvp(foo, inputs, v) + self._assert_same_struct(res[1], inputs) + self.assertIsNone(res[0].grad_fn) + self.assertIsNone(res[1].grad_fn) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3), ctors.rand(4)) + v = (ctors.ones(3), ctors.ones(4)) + out, hvp_val = autogradF.hvp(bar, inputs, v) + self._assert_same_struct(hvp_val, inputs) + self.assertIsNone(out.grad_fn) + self.assertIsNone(hvp_val[0].grad_fn) + self.assertIsNone(hvp_val[1].grad_fn) + + @base_and_logging_tensor + def test_hvp_scalar(self, ctors): + def reducer(x): + return x.exp().sum() + inputs = ctors.rand(4, 4) + v = ctors.ones(4, 4) + res = autogradF.hvp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + inputs = ctors.rand([]) + v = ctors.rand([]) + res = autogradF.hvp(reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + res = autogradF.hvp(reducer, inputs) + self._assert_same_struct(res[1], inputs) + + def bad_reducer(x): + return x.exp().sum().view(1, 1, 1) + inputs = ctors.rand(4, 4) + v = ctors.rand(4, 4) + res = autogradF.hvp(bad_reducer, inputs, v) + self._assert_same_struct(res[1], inputs) + + @base_and_logging_tensor + def test_hvp_create_graph(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4, 4, dtype=torch.double, requires_grad=True) + v = ctors.ones(4, 4, dtype=torch.double, requires_grad=True) + res = autogradF.hvp(foo, inputs, v, create_graph=True) + self._assert_same_struct(res[1], inputs) + self.assertIsNotNone(res[0].grad_fn) + self.assertIsNotNone(res[1].grad_fn) + + gradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) + gradgradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) + + def bar(a, b): + return (a + 3 * b.narrow(0, 0, 3)).exp().sum() + + inputs = (ctors.rand(3, dtype=torch.double, requires_grad=True), + ctors.rand(4, dtype=torch.double, requires_grad=True)) + v = (ctors.ones(3, dtype=torch.double, requires_grad=True), + ctors.ones(4, dtype=torch.double, requires_grad=True)) + out, hvp_val = autogradF.hvp(bar, inputs, v, create_graph=True) + self._assert_same_struct(hvp_val, inputs) + self.assertIsNotNone(out.grad_fn) + self.assertIsNotNone(hvp_val[0].grad_fn) + self.assertIsNotNone(hvp_val[1].grad_fn) + + gradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + gradgradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) + + def foo(*args): + x, y = args[:2] + v = args[2:] + + x = x.cos() + val, grad = autogradF.hvp(bar, (x, y), v, create_graph=True) + + return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() + + gradcheck(foo, inputs + v) + gradgradcheck(foo, inputs + v) + 
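The two consistency tests that follow, test_jacobian_match_vjp_jvp and test_hessian_match_vhp_hvp, reduce to the identities jvp = J v and vjp = v^T J (plus their Hessian counterparts). A short sketch of the Jacobian case using the same foo as the test; torch.allclose with default tolerances is an assumption here, the test itself compares with assertEqual:

import torch
import torch.autograd.functional as autogradF

def foo(x):
    return x ** 3 + x.sum()

inputs = torch.rand(4)
v = torch.rand(4)

jac = autogradF.jacobian(foo, inputs)   # full 4x4 Jacobian, materialized explicitly
jvp = autogradF.jvp(foo, inputs, v)[1]  # directional derivative J @ v
vjp = autogradF.vjp(foo, inputs, v)[1]  # row-vector product v @ J

assert torch.allclose(jvp, jac @ v)
assert torch.allclose(vjp, v @ jac)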
+ @base_and_logging_tensor + def test_jacobian_match_vjp_jvp(self, ctors): + def foo(x): + return x ** 3 + x.sum() + + inputs = ctors.rand(4) + v = ctors.rand(4) + + jac = autogradF.jacobian(foo, inputs) + jvp = autogradF.jvp(foo, inputs, v)[1] + vjp = autogradF.vjp(foo, inputs, v)[1] + + self.assertEqual(jvp, torch.mm(jac, v.unsqueeze(1)).squeeze(1)) + self.assertEqual(vjp, torch.mm(v.unsqueeze(0), jac).squeeze(0)) + + @base_and_logging_tensor + def test_hessian_match_vhp_hvp(self, ctors): + def foo(a): + return 3 * a.narrow(0, 0, 3).exp().sum() + + inputs = ctors.rand(4) + v = ctors.rand(4) + + hes = autogradF.hessian(foo, inputs) + hvp = autogradF.hvp(foo, inputs, v)[1] + vhp = autogradF.vhp(foo, inputs, v)[1] + + self.assertEqual(hvp, torch.mm(hes, v.unsqueeze(1)).squeeze(1)) + self.assertEqual(vhp, torch.mm(v.unsqueeze(0), hes).squeeze(0)) + +instantiate_parametrized_tests(TestAutogradFunctional) + +if __name__ == '__main__': + run_tests() diff --git a/test/benchmark_utils/test_benchmark_utils.py b/test/benchmark_utils/test_benchmark_utils.py index a98c0ac97b4c..a1e2adaacfa9 100644 --- a/test/benchmark_utils/test_benchmark_utils.py +++ b/test/benchmark_utils/test_benchmark_utils.py @@ -170,6 +170,7 @@ def test_timer(self): @slowTest @unittest.skipIf(IS_SANDCASTLE, "C++ timing is OSS only.") + @unittest.skipIf(True, "Failing on clang, see 74398") def test_timer_tiny_fast_snippet(self): timer = benchmark_utils.Timer( 'auto x = 1;(void)x;', @@ -181,6 +182,7 @@ def test_timer_tiny_fast_snippet(self): @slowTest @unittest.skipIf(IS_SANDCASTLE, "C++ timing is OSS only.") + @unittest.skipIf(True, "Failing on clang, see 74398") def test_cpp_timer(self): timer = benchmark_utils.Timer( """ @@ -547,6 +549,7 @@ def add_one(x): @slowTest @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.") + @unittest.skipIf(True, "Failing on clang, see 74398") def test_collect_cpp_callgrind(self): timer = benchmark_utils.Timer( "x += 1;", diff --git a/test/cpp/api/dataloader.cpp b/test/cpp/api/dataloader.cpp index c0622ba41cbd..9b71b721b3db 100644 --- a/test/cpp/api/dataloader.cpp +++ b/test/cpp/api/dataloader.cpp @@ -1982,7 +1982,7 @@ TEST(DataLoaderTest, ChunkDatasetSave) { for (const auto epoch_index : c10::irange(epoch_count)) { (void)epoch_index; // Suppress unused variable warning - int iteration_count = 0; + unsigned iteration_count = 0; for (auto iterator = data_loader->begin(); iterator != data_loader->end(); ++iterator, ++iteration_count) { if ((iteration_count + 1) % save_interval == 0) { @@ -2316,7 +2316,7 @@ TEST(DataLoaderTest, CustomPreprocessPolicy) { ++iterator) { auto batch_result = *iterator; if (batch_result.size() > chunk_size * cross_chunk_shuffle_count) { - for (int i = 0; i < batch_result.size(); i += chunk_size) { + for (unsigned i = 0; i < batch_result.size(); i += chunk_size) { ASSERT_TRUE(std::is_sorted( batch_result.begin() + i, batch_result.begin() + i + chunk_size)); diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index db0f4d25168f..add7e17c910d 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -917,15 +917,21 @@ TEST_F(FunctionalTest, ELU) { for (const auto alpha : {0.0, 0.42, 1.0, 4.2, 42.42}) { auto x = torch::linspace(-10.0, 10.0, size * size * size); x.resize_({size, size, size}); + auto x_bf16 = torch::linspace(-10.0, 10.0, size * size * size).to(torch::kBFloat16); + x_bf16.resize_({size, size, size}); + auto y_exp = torch::max(torch::zeros_like(x), x) + 
torch::min(torch::zeros_like(x), alpha * (torch::exp(x) - 1.0)); auto y = F::elu(x, F::ELUFuncOptions().alpha(alpha).inplace(inplace)); + auto y_bf16 = F::elu(x_bf16, F::ELUFuncOptions().alpha(alpha).inplace(inplace)); ASSERT_EQ(y.ndimension(), 3); ASSERT_EQ(y.sizes(), std::vector({size, size, size})); ASSERT_TRUE(torch::allclose(y, y_exp)); + ASSERT_TRUE(torch::allclose(y_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); if (inplace) { ASSERT_TRUE(torch::allclose(x, y_exp)); + ASSERT_TRUE(torch::allclose(x_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); } } } @@ -938,15 +944,19 @@ TEST_F(FunctionalTest, SELU) { const double alpha = 1.6732632423543772848170429916717; for (const auto inplace : {false, true}) { auto input = torch::randn({5, 5}); + auto input_bf16 = input.clone().to(torch::kBFloat16); auto expected = scale * (torch::max(torch::zeros_like(input), input) + torch::min( torch::zeros_like(input), alpha * (torch::exp(input) - 1))); auto output = F::selu(input, inplace); + auto output_bf16 = F::selu(input_bf16, inplace); ASSERT_TRUE(output.allclose(expected)); + ASSERT_TRUE(output_bf16.to(torch::kFloat).allclose(output, 1e-2, 1e-2)); if (inplace) { ASSERT_TRUE(input.allclose(expected)); + ASSERT_TRUE(input_bf16.to(torch::kFloat).allclose(output, 1e-2, 1e-2)); } } } @@ -973,10 +983,17 @@ TEST_F(FunctionalTest, GLU) { } TEST_F(FunctionalTest, GELU) { - GELU model; const auto x = torch::linspace(-3.0, 3.0, 100); const auto y_exp = x * 0.5 * (1.0 + torch::erf(x / std::sqrt(2.0))); - const auto y = F::gelu(x); + const auto y = F::gelu(x, F::GELUFuncOptions().approximate("none")); + ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); +} + +TEST_F(FunctionalTest, TanhGELU) { + const auto x = torch::linspace(-3.0, 3.0, 100); + const auto inner = std::sqrt(2 / M_PI) * (x + 0.044715 * x.pow(3.0)); + const auto y_exp = 0.5 * x * (1.0 + inner.tanh()); + const auto y = F::gelu(x, F::GELUFuncOptions().approximate("tanh")); ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); } @@ -1528,15 +1545,19 @@ TEST_F(FunctionalTest, CELU) { for (const auto alpha : {0.42, 1.0, 4.2, 42.42}) { auto x = torch::linspace(-10.0, 10.0, size * size * size); x.resize_({size, size, size}); + auto x_bf16 = x.clone().to(torch::kBFloat16); auto y_exp = torch::max(torch::zeros_like(x), x) + torch::min(torch::zeros_like(x), alpha * (torch::exp(x / alpha) - 1.0)); auto y = F::celu(x, F::CELUFuncOptions().alpha(alpha).inplace(inplace)); + auto y_bf16 = F::celu(x_bf16, F::CELUFuncOptions().alpha(alpha).inplace(inplace)); ASSERT_EQ(y.ndimension(), 3); ASSERT_EQ(y.sizes(), std::vector({size, size, size})); ASSERT_TRUE(torch::allclose(y, y_exp)); + ASSERT_TRUE(torch::allclose(y_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); if (inplace) { ASSERT_TRUE(torch::allclose(x, y_exp)); + ASSERT_TRUE(torch::allclose(x_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); } } } @@ -1548,13 +1569,16 @@ TEST_F(FunctionalTest, CELUDefaultOptions) { const auto alpha = 1.0; auto x = torch::linspace(-10.0, 10.0, size * size * size); x.resize_({size, size, size}); + auto x_bf16 = x.clone().to(torch::kBFloat16); auto y_exp = torch::max(torch::zeros_like(x), x) + torch::min(torch::zeros_like(x), alpha * (torch::exp(x / alpha) - 1.0)); auto y = F::celu(x); + auto y_bf16 = F::celu(x_bf16); ASSERT_EQ(y.ndimension(), 3); ASSERT_EQ(y.sizes(), std::vector({size, size, size})); ASSERT_TRUE(torch::allclose(y, y_exp)); + ASSERT_TRUE(torch::allclose(y_bf16.to(torch::kFloat), y, 1e-2, 1e-2)); } TEST_F(FunctionalTest, PixelShuffle) { @@ -2167,7 +2191,7 @@ TEST_F(FunctionalTest, 
Interpolate) { } } -TEST_F(FunctionalTest, Pad) { +TEST_F(FunctionalTest, Pad1) { { auto input = torch::arange(6, torch::kDouble).reshape({1, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({1, 2}).mode(torch::kCircular)); @@ -2176,6 +2200,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 2, 6})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad2) { { auto input = torch::arange(9, torch::kDouble).reshape({1, 1, 3, 3}); auto output = F::pad(input, F::PadFuncOptions({3, 3, 3, 1}).mode(torch::kCircular)); @@ -2190,6 +2216,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 7, 9})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad3) { { auto input = torch::arange(12, torch::kDouble).reshape({1, 1, 2, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({3, 3, 2, 1, 2, 2}).mode(torch::kCircular)); @@ -2232,6 +2260,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 6, 5, 9})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad4) { { auto input = torch::arange(16, torch::kDouble).reshape({2, 2, 2, 2}); auto output = F::pad(input, F::PadFuncOptions({1, 1, 1, 1}).mode(torch::kReflect)); @@ -2258,6 +2288,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({2, 2, 4, 4})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad5) { { auto input = torch::arange(12, torch::kDouble).reshape({1, 1, 2, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({1, 2, 2, 1, 1, 2}).mode(torch::kReplicate)); @@ -2294,6 +2326,8 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 5, 5, 6})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad6) { { auto input = torch::arange(18, torch::kDouble).reshape({1, 1, 3, 2, 3}); auto output = F::pad(input, F::PadFuncOptions({0, 2, 1, 0, 1, 2}).mode(torch::kReflect)); @@ -2324,12 +2358,16 @@ TEST_F(FunctionalTest, Pad) { ASSERT_EQ(output.sizes(), std::vector({1, 1, 6, 3, 5})); ASSERT_TRUE(output.allclose(expected, 1e-04)); } +} +TEST_F(FunctionalTest, Pad7) { { auto input = torch::ones({1, 1, 1, 1}, torch::kDouble); auto output = F::pad(input, F::PadFuncOptions({1, 1}).mode(torch::kConstant).value(0)); ASSERT_EQ(output.sizes(), std::vector({1, 1, 1, 3})); auto expected = torch::tensor({{{{0., 1., 0.}}}}, torch::kDouble); } +} +TEST_F(FunctionalTest, Pad8) { { auto input = torch::ones({1, 1, 1, 1}, torch::kDouble); auto output = F::pad(input, F::PadFuncOptions({1, 1})); diff --git a/test/cpp/api/init.cpp b/test/cpp/api/init.cpp index 9e2ed422e28b..222d4f1171c4 100644 --- a/test/cpp/api/init.cpp +++ b/test/cpp/api/init.cpp @@ -19,7 +19,7 @@ void check_exact_values( auto layerParameters = parameters[i]; auto expectedLayerParameters = expected_parameters[i]; - if (layerParameters.size(0) != expectedLayerParameters.size()) { + if (static_cast(layerParameters.size(0)) != expectedLayerParameters.size()) { std::cout << "layer #" << i << " layerParameters size: " << layerParameters.size(0) << " != " diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index a8d6320e9533..734cea27e5cc 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -90,3 +90,14 @@ TEST(UtilsTest, AmbiguousOperatorDefaults) { at::_test_ambiguous_defaults(tmp, 1, 1); at::_test_ambiguous_defaults(tmp, 2, "2"); } + +int64_t get_first_element(c10::OptionalIntArrayRef arr) { + return arr.value()[0]; +} + 
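For reference, the TanhGELU functional test above checks F::gelu against the closed form 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). The same check can be sketched in Python; the approximate="tanh" keyword of torch.nn.functional.gelu is assumed to be the Python-side counterpart of GELUFuncOptions().approximate("tanh"), and the tolerances mirror the C++ assertion:

import math
import torch
import torch.nn.functional as F

x = torch.linspace(-3.0, 3.0, 100)
inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3.0))
y_exp = 0.5 * x * (1.0 + inner.tanh())

y = F.gelu(x, approximate="tanh")  # assumed Python analogue of the C++ option above
assert torch.allclose(y, y_exp, rtol=1.4e-6, atol=1e-5)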
+TEST(OptionalArrayRefTest, DanglingPointerFix) { + // Ensure that the converting constructor of `OptionalArrayRef` does not + // create a dangling pointer when given a single value + ASSERT_TRUE(get_first_element(300) == 300); + ASSERT_TRUE(get_first_element({400}) == 400); +} diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 8632f3e195cb..cdf4f0ea0deb 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2860,13 +2860,23 @@ TEST_F(ModulesTest, GLU) { } TEST_F(ModulesTest, GELU) { - GELU model; + GELU model(GELUOptions().approximate("none")); const auto x = torch::linspace(-3.0, 3.0, 100); const auto y_exp = x * 0.5 * (1.0 + torch::erf(x / std::sqrt(2.0))); const auto y = model(x); ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); } +TEST_F(ModulesTest, TanhGELU) { + GELU model(GELUOptions().approximate("tanh")); + const auto x = torch::linspace(-3.0, 3.0, 100); + const auto inner = std::sqrt(2 / M_PI) * (x + 0.044715 * x.pow(3.0)); + const auto y_exp = 0.5 * x * (1.0 + inner.tanh()); + const auto y = model(x); + ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); +} + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) TEST_F(ModulesTest, Mish) { Mish model; auto x = torch::randn(100) * 10; diff --git a/test/cpp/api/nn_utils.cpp b/test/cpp/api/nn_utils.cpp index 451c72e9d776..be371b1ae6d4 100644 --- a/test/cpp/api/nn_utils.cpp +++ b/test/cpp/api/nn_utils.cpp @@ -615,7 +615,7 @@ TEST_F(NNUtilsTest, PackPaddedSequence) { } int64_t offset = 0; std::vector tensors_to_be_cat; - for (int64_t i = 1; i < sorted_lengths.size() + 1; i++) { + for (int64_t i = 1; i < static_cast(sorted_lengths.size() + 1); i++) { int64_t l = sorted_lengths.at(i-1); tensors_to_be_cat.emplace_back(pad(i * 100 + torch::arange(1., 5 * l + 1).view({l, 1, 5}), max_length)); } diff --git a/test/cpp/api/parameterdict.cpp b/test/cpp/api/parameterdict.cpp index 5f2eab5d6b28..21dd1b31d5a8 100644 --- a/test/cpp/api/parameterdict.cpp +++ b/test/cpp/api/parameterdict.cpp @@ -105,7 +105,7 @@ TEST_F(ParameterDictTest, Values) { auto dict = torch::nn::ParameterDict(params); std::vector values = dict->values(); std::vector true_values{ta, tb, tc}; - for (auto i = 0; i < values.size(); i += 1) { + for (auto i = 0U; i < values.size(); i += 1) { ASSERT_TRUE(torch::all(torch::eq(values[i], true_values[i])).item()); } } diff --git a/test/cpp/api/serialize.cpp b/test/cpp/api/serialize.cpp index b422662aa362..ecad2348674b 100644 --- a/test/cpp/api/serialize.cpp +++ b/test/cpp/api/serialize.cpp @@ -129,7 +129,7 @@ void test_serialize_optimizer(DerivedOptimizerOptions options, bool only_has_glo // optim3_2 and optim1 should have param_groups and state of size 1 and state_size respectively ASSERT_TRUE(optim3_2_param_groups.size() == 1); // state_size = 2 for all optimizers except LBFGS as LBFGS only maintains one global state - int state_size = only_has_global_state ? 1 : 2; + unsigned state_size = only_has_global_state ? 1 : 2; ASSERT_TRUE(optim3_2_state.size() == state_size); // optim3_2 and optim1 should have param_groups and state of same size @@ -355,6 +355,7 @@ TEST(SerializeTest, ErrorOnMissingKey) { // We want the errors to contain hierarchy information, too. 
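// Editor's note (not part of the original patch): the first torch::load below
// reads the stringstream to its end, so the added stream.seekg(0, stream.beg)
// rewinds it so that the second torch::load re-reads the archive from the
// beginning instead of hitting an already-exhausted stream.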
ASSERT_THROWS_WITH( torch::load(model2, stream), "No such serialized tensor 'a.b.x'"); + stream.seekg(0, stream.beg); ASSERT_THROWS_WITH( torch::load(model3, stream), "No such serialized submodule: 'a.x'"); } diff --git a/test/cpp/c10d/ProcessGroupNCCLTest.cpp b/test/cpp/c10d/ProcessGroupNCCLTest.cpp index 2a00cb901f4f..6e57e92389f5 100644 --- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp +++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp @@ -508,7 +508,6 @@ void testReduceScatter(const std::string& path, int rank, int size) { void testProcessGroupNCCLHealthCheckFailHelper(const std::string& path, bool timeout) { // simulate world_size > 1 here via threads. const int worldSize = 4; - std::mutex m; std::unordered_set nums; auto runTest = [&](int i) { NCCLTest test(path, worldSize, std::chrono::milliseconds(3000)); diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index cfdbb28a6765..60b43b81fc8b 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -39,6 +39,7 @@ endif() # Build the cpp gtest binary containing the cpp-only tests. set(JIT_TEST_SRCS + ${JIT_TEST_ROOT}/test_add_if_then_else.cpp ${JIT_TEST_ROOT}/test_alias_analysis.cpp ${JIT_TEST_ROOT}/test_argument_spec.cpp ${JIT_TEST_ROOT}/test_autodiff.cpp @@ -89,12 +90,16 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_script_profile.cpp ${JIT_TEST_ROOT}/test_shape_analysis.cpp ${JIT_TEST_ROOT}/test_jit_logging_levels.cpp + ${JIT_TEST_ROOT}/test_file_format.cpp ${JIT_TEST_ROOT}/test_flatbuffer.cpp ) if(USE_CUDA) - list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu.cpp) - list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu_shift.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp) + list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_view.cpp) endif() add_executable(test_jit @@ -138,6 +143,10 @@ if(USE_CUDA) ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PRIVATE USE_CUDA) + # Suppress sign compare checks for NVFUSER JIT tests + if(NOT MSVC) + target_compile_options(test_jit PRIVATE -Wno-sign-compare) + endif() elseif(USE_ROCM) target_link_libraries(test_jit PRIVATE ${ROCM_HIPRTC_LIB} diff --git a/test/cpp/jit/source_range_test.cpp b/test/cpp/jit/source_range_test.cpp new file mode 100644 index 000000000000..16c7f850bf26 --- /dev/null +++ b/test/cpp/jit/source_range_test.cpp @@ -0,0 +1,51 @@ +#include +#include + +using namespace ::testing; +using namespace ::torch::jit; + +TEST(SourceRangeTest, test_find) { + std::vector> strings; + strings.push_back(std::make_shared("hello world")); + strings.push_back(std::make_shared("nihaoma")); + + std::vector pieces{*strings[0], *strings[1]}; + + StringCordView view(pieces, strings); + + auto x = view.find("rldni", 0); + EXPECT_EQ(x, 8); +} + +TEST(SourceRangeTest, test_substr) { + std::vector> strings; + strings.push_back(std::make_shared("hello world")); + strings.push_back(std::make_shared("nihaoma")); + + std::vector pieces{*strings[0], *strings[1]}; + + StringCordView view(pieces, strings); + + auto x = view.substr(4, 10).str(); + EXPECT_EQ(x, view.str().substr(4, 10)); + EXPECT_EQ(view.substr(0, view.size()).str(), view.str()); +} + +TEST(SourceRangeTest, test_iter) { + std::vector> strings; + 
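// Editor's note (not part of the original patch): StringCordView stitches the
// two pieces into one logical string, "hello worldnihaoma", which the find()
// and substr() tests above already rely on. Position 5 is therefore the space
// inside the first piece and position 13 falls inside the second piece, which
// is what the iterator checks below exercise.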
strings.push_back(std::make_shared("hello world")); + strings.push_back(std::make_shared("nihaoma")); + + std::vector pieces{*strings[0], *strings[1]}; + + StringCordView view(pieces, strings); + + auto iter = view.iter_for_pos(5); + EXPECT_EQ(*iter, ' '); + EXPECT_EQ(iter.rest_line(), " world"); + EXPECT_EQ(*iter.next_iter(), 'w'); + EXPECT_EQ(iter.pos(), 5); + + iter = view.iter_for_pos(13); + EXPECT_EQ(iter.pos(), 13); +} diff --git a/test/cpp/jit/test_add_if_then_else.cpp b/test/cpp/jit/test_add_if_then_else.cpp new file mode 100644 index 000000000000..4850e1ab425b --- /dev/null +++ b/test/cpp/jit/test_add_if_then_else.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include +#include + +namespace torch { +namespace jit { + +TEST(AddIfThenElseOpTest, AddIfThenElseOpSimple) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %result: Tensor = prim::If(%cond) + block0(): + -> (%a) + block1(): + -> (%b) + return (%result) + )IR"; + + auto graph = std::make_shared(); + parseIR(src, graph.get()); + EXPECT_TRUE(AddIfThenElseOp(graph)); + + testing::FileCheck() + .check_count("= prim::IfThenElse", 1, /*exactly*/ true) + ->check_count("= prim::If", 0, /*exactly*/ true) + ->run(*graph); +} + +TEST(AddIfThenElseOpTest, NoIfThenElseOpMultipleOutputs) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %result1: Tensor, %result2: Tensor = prim::If(%cond) + block0(): + -> (%a, %b) + block1(): + -> (%b, %a) + return (%result1, %result2) + )IR"; + + auto graph = std::make_shared(); + parseIR(src, graph.get()); + EXPECT_FALSE(AddIfThenElseOp(graph)); + + testing::FileCheck() + .check_count("= prim::IfThenElse", 0, /*exactly*/ true) + ->check_count("= prim::If", 1, /*exactly*/ true) + ->run(*graph); +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index e8bfefe64263..6a087adb63c8 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -289,14 +289,11 @@ class AutodiffRemoveUnusedGradientsTest : public ::testing::Test { void SetUp() override { prev_exec = getExecutorMode(); getExecutorMode() = true; - prev_profiling = getProfilingMode(); - getProfilingMode() = true; prev_inline_autodiff = getAutodiffSubgraphInlining(); debugSetAutodiffSubgraphInlining(false); } void TearDown() override { getExecutorMode() = prev_exec; - getProfilingMode() = prev_profiling; debugSetAutodiffSubgraphInlining(prev_inline_autodiff); } diff --git a/test/cpp/jit/test_backend.cpp b/test/cpp/jit/test_backend.cpp index a6961a2e4030..dd4df40d9c13 100644 --- a/test/cpp/jit/test_backend.cpp +++ b/test/cpp/jit/test_backend.cpp @@ -143,6 +143,38 @@ TEST(BackendTest, TestCompiler) { AT_ASSERT(mres.toTensor().equal(ref.toTensor())); } +TEST(BackendTest, TestCompilerWithStringTable) { + setShouldUseFormatWithStringTable(true); + Module m("m"); + m.define(R"( + def forward(self, x, h): + return x + h + )"); + + std::vector inputs; + inputs.emplace_back(2.0 * torch::ones({})); + inputs.emplace_back(1.0 * torch::ones({})); + auto ref = m.forward(inputs); + + c10::Dict compile_spec(StringType::get(), AnyType::get()); + c10::Dict fake_dict(StringType::get(), AnyType::get()); + fake_dict.insert("", ""); + compile_spec.insert("forward", fake_dict); + auto any_dict_ty = DictType::create(StringType::get(), AnyType::get()); + // lowered module + auto lm = torch::jit::detail::codegen_backend_module( + "backend_with_compiler_demo", m, compile_spec, any_dict_ty); + auto res = lm.forward(inputs); + 
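// Editor's note (not part of the original patch): setShouldUseFormatWithStringTable
// appears to toggle process-wide serialization state, which is why this test
// flips it back to false further down before its final assertion.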
AT_ASSERT(res.toTensor().equal(ref.toTensor())); + + std::stringstream ss; + lm._save_for_mobile(ss); + auto mlm = _load_for_mobile(ss); + auto mres = mlm.forward(inputs); + setShouldUseFormatWithStringTable(false); + AT_ASSERT(mres.toTensor().equal(ref.toTensor())); +} + TEST(BackendTest, TestComposite) { c10::Dict compile_spec(StringType::get(), AnyType::get()); c10::Dict fake_dict(StringType::get(), AnyType::get()); @@ -276,6 +308,7 @@ TEST(BackendTest, TestConsistencyOfCompositeWithSetStates) { c._save_for_mobile(ss); auto mc = _load_for_mobile(ss); auto res_mobile = mc.forward(inputs); + ss.seekg(0, ss.beg); // check if the methods names are always the same // by reloading the script module and saving it back as mobile @@ -383,6 +416,56 @@ Traceback of TorchScript (most recent call last): ASSERT_THROWS_WITH_MESSAGE(mlm.forward(inputs), error_pattern); } +TEST(BackendTestDebugInfo, TestCompilerWithStringTable) { + setShouldUseFormatWithStringTable(true); + Module m("m"); + m.define(R"( + def forward(self, x, h): + return x + h + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({2, 4})); + inputs.emplace_back(torch::rand({13, 9})); + + c10::Dict compile_spec(StringType::get(), AnyType::get()); + c10::Dict fake_dict(StringType::get(), AnyType::get()); + fake_dict.insert("", ""); + compile_spec.insert("forward", fake_dict); + auto any_dict_ty = DictType::create(StringType::get(), AnyType::get()); + // lowered module + auto lm = torch::jit::detail::codegen_backend_module( + "backend_with_compiler_demo", m, compile_spec, any_dict_ty); + + std::stringstream ss; + lm._save_for_mobile(ss, ExtraFilesMap(), true); + auto mlm = _load_for_mobile(ss); + std::string error_pattern = R"( + Module hierarchy:top(m)::.__loweredModule__(m)::forward.aten::add +Traceback of TorchScript (most recent call last): + File "", line 3, in + + def forward(self, x: Tensor, h: Tensor): + return self.__loweredModule__.forward(x, h) + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE + + File "", line 5, in forward + typed_inputs: List[Any] = [x, h, ] + if self.__backend.is_available() : + _0, = self.__backend.execute(self.__handles["forward"], typed_inputs) + ~~~~~~~~~~~~~~~~~~~~~~ <--- HERE + assert isinstance(_0, Tensor) + return _0 + File "", line 3, in + + def forward(self, x, h): + return x + h + ~~~~~ <--- HERE + )"; + setShouldUseFormatWithStringTable(false); + ASSERT_THROWS_WITH_MESSAGE(mlm.forward(inputs), error_pattern); +} + TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithModuleHierarchy) { Module a("A"); a.define(R"( diff --git a/test/cpp/jit/test_backend_compiler_lib.cpp b/test/cpp/jit/test_backend_compiler_lib.cpp index 0db8bd428e9e..372b08a392d1 100644 --- a/test/cpp/jit/test_backend_compiler_lib.cpp +++ b/test/cpp/jit/test_backend_compiler_lib.cpp @@ -2,7 +2,10 @@ #include #include #include + +#ifndef NO_PROFILING #include +#endif namespace torch { namespace jit { @@ -72,7 +75,12 @@ class BackendWithCompiler : public PyTorchBackendInterface { return true; } - // Since the actual compilation is done AOT, + // Since the actual compilation is done AOT for this backend, compile just + // forwards everything along. In a non toy setup this could grab information + // from that runtime that might be relevant to execute, such as build flags + // the resolution of the devices camera, or basically any runtime specific + // information that wouldnt be available server side where preprocess is + // called. 
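// Editor's note (not part of the original patch): for this demo backend the
// returned handles form a dict keyed by method name whose values are lists of
// (instruction, debug_handle) tuples -- exactly the shape execute() unpacks
// below (see the example annotation on its `handle` parameter).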
c10::impl::GenericDict compile( c10::IValue processed, c10::impl::GenericDict method_compile_spec) override { @@ -86,8 +94,14 @@ class BackendWithCompiler : public PyTorchBackendInterface { return c10::impl::toGenericDict(handles); } + // Function that actually executes the model in the backend. Here there is + // nothing to dispatch to, so the backend is implemented locally within + // execute and it only supports add, subtract, and constant. In a non toy + // backend you can imagine how this function could be used to actually + // dispatch the inputs to the relevant backend/device. c10::impl::GenericList execute( - c10::IValue handle, + c10::IValue + handle, // example: [('prim::Constant#1', 14), ('aten::add', 15)] c10::impl::GenericList inputs) override { TORCH_INTERNAL_ASSERT(inputs.size() == 2); c10::IValue val0 = inputs[0]; @@ -98,15 +112,20 @@ class BackendWithCompiler : public PyTorchBackendInterface { op_runtimes_us.reserve(handle.toList().size()); c10::List output_list; +#ifndef NO_PROFILING auto start_us = torch::profiler::impl::getTime() / 1000; +#endif for (const auto& token : handle.toList()) { IValue val = token; auto instruction = val.toTupleRef().elements()[0].toStringRef(); auto debug_handle = val.toTupleRef().elements()[1].toInt(); double const_val = 1.0; +#ifndef NO_PROFILING auto start_time_us = torch::profiler::impl::getTime() / 1000; +#endif try { if (instruction.rfind("prim::Constant", 0) == 0) { + // 15 is the length of 'prim::Constant#' the constant val comes after TORCH_CHECK( instruction.size() > 15, "Constant value is expected in ", @@ -146,10 +165,13 @@ class BackendWithCompiler : public PyTorchBackendInterface { } catch (c10::Error& e) { TORCH_DELEGATED_BACKEND_THROW(false, e.what(), debug_handle); } +#ifndef NO_PROFILING auto end_time_us = torch::profiler::impl::getTime() / 1000; auto duration = end_time_us - start_time_us; op_runtimes_us.emplace_back(duration, debug_handle, instruction); +#endif } +#ifndef NO_PROFILING for (const auto& tup : op_runtimes_us) { RECORD_BACKEND_EVENT_TO_EDGE_PROFILER( start_us, @@ -159,6 +181,7 @@ class BackendWithCompiler : public PyTorchBackendInterface { "test_backend"); start_us = start_us + std::get<0>(tup); } +#endif return c10::impl::toList(output_list); } }; diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp new file mode 100644 index 000000000000..b6b3cbcd6793 --- /dev/null +++ b/test/cpp/jit/test_exception.cpp @@ -0,0 +1,159 @@ +/* + * We have a python unit test for exceptions in test/jit/test_exception.py . + * Add a CPP version here to verify that excepted exception types thrown from + * C++. This is hard to test in python code since C++ exceptions will be + * translated to python exceptions. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +namespace py = pybind11; + +TEST(TestException, TestAssertion) { + std::string pythonCode = R"PY( + def foo(): + raise AssertionError("An assertion failed") + )PY"; + auto cu_ptr = torch::jit::compile(pythonCode); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu_ptr->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + + bool is_jit_exception = false; + std::string message; + c10::optional exception_class; + try { + cu_ptr->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + message = e.what(); + exception_class = e.getPythonClassName(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_FALSE(exception_class); + EXPECT_TRUE( + message.find("RuntimeError: AssertionError: An assertion failed") != + std::string::npos); +} + +struct MyPythonExceptionValue : public torch::jit::SugaredValue { + explicit MyPythonExceptionValue(const py::object& exception_class) { + qualified_name_ = + (py::str(py::getattr(exception_class, "__module__", py::str(""))) + + py::str(".") + + py::str(py::getattr(exception_class, "__name__", py::str("")))) + .cast(); + } + + std::string kind() const override { + return "My Python exception"; + } + + // Simplified from PythonExceptionValue::call + std::shared_ptr call( + const torch::jit::SourceRange& loc, + torch::jit::GraphFunction& caller, + at::ArrayRef args, + at::ArrayRef kwargs, + size_t n_binders) override { + TORCH_CHECK(args.size() == 1); + Value* error_message = args.at(0).value(*caller.graph()); + Value* qualified_class_name = + insertConstant(*caller.graph(), qualified_name_, loc); + return std::make_shared( + error_message, qualified_class_name); + } + + private: + std::string qualified_name_; +}; + +class SimpleResolver : public torch::jit::Resolver { + public: + explicit SimpleResolver() {} + + std::shared_ptr resolveValue( + const std::string& name, + torch::jit::GraphFunction& m, + const torch::jit::SourceRange& loc) override { + // follows toSugaredValue (toSugaredValue is defined in caffe2:_C which is + // a python extension. We can not add that as a cpp_binary's dep) + if (name == "SimpleValueError") { + py::object obj = py::globals()["SimpleValueError"]; + return std::make_shared(obj); + } + TORCH_CHECK(false, "resolveValue: can not resolve '", name, "{}'"); + } + + torch::jit::TypePtr resolveType( + const std::string& name, + const torch::jit::SourceRange& loc) override { + return nullptr; + } +}; + +/* + * - The python source code parsing for TorchScript here is learned from + * torch::jit::compile. + * - The code only parses one Def. If there are multiple in the code, those + * except the first one are skipped. + */ +TEST(TestException, TestCustomException) { + py::scoped_interpreter guard{}; + py::exec(R"PY( + class SimpleValueError(ValueError): + def __init__(self, message): + super(SimpleValueError, self).__init__(message) + )PY"); + + std::string pythonCode = R"PY( + def foo(): + raise SimpleValueError("An assertion failed") + )PY"; + + torch::jit::Parser p( + std::make_shared(pythonCode, "", 1)); + auto def = torch::jit::Def(p.parseFunction(/*is_method=*/false)); + std::cerr << "Def is:\n" << def << std::endl; + auto cu = std::make_shared(); + (void)cu->define( + c10::nullopt, + {}, + {}, + {def}, + // class PythonResolver is defined in + // torch/csrc/jit/python/script_init.cpp. It's not in a header file so I + // can not use it. 
Create a SimpleResolver insteand + {std::make_shared()}, + nullptr); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + bool is_jit_exception = false; + c10::optional exception_class; + std::string message; + try { + cu->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + exception_class = e.getPythonClassName(); + message = e.what(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_EQ("__main__.SimpleValueError", *exception_class); + EXPECT_TRUE( + message.find("__main__.SimpleValueError: An assertion failed") != + std::string::npos); +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_file_format.cpp b/test/cpp/jit/test_file_format.cpp new file mode 100644 index 000000000000..a3571cbf30b4 --- /dev/null +++ b/test/cpp/jit/test_file_format.cpp @@ -0,0 +1,124 @@ +#include + +#include + +#include + +// Tests go in torch::jit +namespace torch { +namespace jit { + +TEST(FileFormatTest, IdentifiesFlatbufferStream) { + // Create data whose initial bytes look like a Flatbuffer stream. + std::stringstream data; + data << "abcd" // First four bytes don't matter. + << "PTMF" // Magic string. + << "efgh"; // Trailing bytes don't matter. + + // The data should be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); +} + +TEST(FileFormatTest, IdentifiesZipStream) { + // Create data whose initial bytes look like a ZIP stream. + std::stringstream data; + data << "PK\x03\x04" // Magic string. + << "abcd" // Trailing bytes don't matter. + << "efgh"; + + // The data should be identified as ZIP. + EXPECT_EQ(getFileFormat(data), FileFormat::ZipFileFormat); +} + +TEST(FileFormatTest, FlatbufferTakesPrecedence) { + // Since the Flatbuffer and ZIP magic bytes are at different offsets, + // the same data could be identified as both. Demonstrate that Flatbuffer + // takes precedence. (See details in file_format.h) + std::stringstream data; + data << "PK\x03\x04" // ZIP magic string. + << "PTMF" // Flatbuffer magic string. + << "abcd"; // Trailing bytes don't matter. + + // The data should be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); +} + +TEST(FileFormatTest, HandlesUnknownStream) { + // Create data that doesn't look like any known format. + std::stringstream data; + data << "abcd" + << "efgh" + << "ijkl"; + + // The data should be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, ShortStreamIsUnknown) { + // Create data with fewer than kFileFormatHeaderSize (8) bytes. + std::stringstream data; + data << "ABCD"; + + // The data should be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, EmptyStreamIsUnknown) { + // Create an empty stream. + std::stringstream data; + + // The data should be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, BadStreamIsUnknown) { + // Create a stream with valid Flatbuffer data. + std::stringstream data; + data << "abcd" + << "PTMF" // Flatbuffer magic string. + << "efgh"; + + // Demonstrate that the data would normally be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); + + // Mark the stream as bad, and demonstrate that it is in an error state. 
+ data.setstate(std::stringstream::badbit); + // Demonstrate that the stream is in an error state. + EXPECT_FALSE(data.good()); + + // The data should now be classified as unknown. + EXPECT_EQ(getFileFormat(data), FileFormat::UnknownFileFormat); +} + +TEST(FileFormatTest, StreamOffsetIsObservedAndRestored) { + // Create data with a Flatbuffer header at a non-zero offset into the stream. + std::stringstream data; + // Add initial padding. + data << "PADDING"; + size_t offset = data.str().size(); + // Add a valid Flatbuffer header. + data << "abcd" + << "PTMF" // Flatbuffer magic string. + << "efgh"; + // Seek just after the padding. + data.seekg(static_cast(offset), data.beg); + // Demonstrate that the stream points to the beginning of the Flatbuffer data, + // not to the padding. + EXPECT_EQ(data.peek(), 'a'); + + // The data should be identified as Flatbuffer. + EXPECT_EQ(getFileFormat(data), FileFormat::FlatbufferFileFormat); + + // The stream position should be where it was before identification. + EXPECT_EQ(offset, data.tellg()); +} + +TEST(FileFormatTest, HandlesMissingFile) { + // A missing file should be classified as unknown. + EXPECT_EQ( + getFileFormat("NON_EXISTENT_FILE_4965c363-44a7-443c-983a-8895eead0277"), + FileFormat::UnknownFileFormat); +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_flatbuffer.cpp b/test/cpp/jit/test_flatbuffer.cpp index 25992fa106e7..07bd42c1b141 100644 --- a/test/cpp/jit/test_flatbuffer.cpp +++ b/test/cpp/jit/test_flatbuffer.cpp @@ -19,19 +19,25 @@ #include #include #include +#include #include #include #include +#include #include #include // Tests go in torch::jit namespace torch { namespace jit { -mobile::Module parse_mobile_module(void* data, size_t) { +mobile::Module parse_mobile_module( + void* data, + size_t, + bool should_copy_tensor_memory = false) { auto* flatbuffer_module = mobile::serialization::GetMutableModule(data); - return initialize_mobile_module(flatbuffer_module); + return initialize_mobile_module( + flatbuffer_module, c10::nullopt, should_copy_tensor_memory); } TEST(FlatbufferTest, UpsampleNearest2d) { @@ -62,6 +68,37 @@ TEST(FlatbufferTest, UpsampleNearest2d) { ASSERT_TRUE(resd2.equal(refd)); } +TEST(FlatbufferTest, UpsampleNearest2dWithCopyTensorMemory) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], float(scale), float(scale)) + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({1, 3, 128, 128})); + inputs.emplace_back(at::Scalar(2.0)); + auto ref = m.forward(inputs); + + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(m, options); + IValue res; + res = bc.forward(inputs); + + auto resd = res.toTensor(); + auto refd = ref.toTensor(); + ASSERT_TRUE(resd.equal(refd)); + + auto buff = save_mobile_module_to_bytes(bc); + mobile::Module bc2 = parse_mobile_module(buff.data(), buff.size(), true); + + buff = flatbuffers::DetachedBuffer(); + + auto res2 = bc2.forward(inputs); + auto resd2 = res2.toTensor(); + ASSERT_TRUE(resd2.equal(refd)); +} + TEST(FlatbufferTest, CheckAttrAccess) { Module m("m"); m.register_attribute("mobile_optimized", BoolType::get(), true); @@ -137,6 +174,67 @@ TEST(FlatbufferTest, MethodInvocation) { // NOLINT (use =delete in gtest) } } +#if defined(ENABLE_FLATBUFFER) && !defined(FB_XPLAT_BUILD) +TEST(FlatbufferTest, FlatbufferBackPortTest) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], 
float(scale), float(scale)) + )"); + std::stringstream ss; + m._save_for_mobile(ss, {}, false, true); + + std::stringstream oss; + bool backPortSuccess = _backport_for_mobile(ss, oss, 5); + ASSERT_TRUE(backPortSuccess); +} +#endif // defined(ENABLE_FLATBUFFER) && !defined(FB_XPLAT_BUILD) + +TEST(FlatbufferTest, ExtraFiles) { + const auto script = R"JIT( + def forward(self): + x = torch.rand(5, 5) + x = x.mm(x) + return x + )JIT"; + + auto module = + std::make_shared("Module", std::make_shared()); + module->define(script); + std::ostringstream oss; + std::unordered_map extra_files; + extra_files["metadata.json"] = "abc"; + extra_files["mobile_info.json"] = "{\"key\": 23}"; + + std::unordered_map loaded_extra_files; +#if defined ENABLE_FLATBUFFER + std::stringstream ss; + module->_save_for_mobile(ss, extra_files, true, /*use_flatbuffer=*/true); + + loaded_extra_files["metadata.json"] = ""; + auto mobile_module = _load_for_mobile(ss, c10::nullopt, loaded_extra_files); + + ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); + ASSERT_EQ(loaded_extra_files["mobile_info.json"], "{\"key\": 23}"); + + // load it twice using the same stream + auto mobile_module2 = _load_for_mobile(ss, c10::nullopt, loaded_extra_files); +#else + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(*module, options); + auto buff = save_mobile_module_to_bytes(bc, extra_files); + + loaded_extra_files["metadata.json"] = ""; + auto* flatbuffer_module = + mobile::serialization::GetMutableModule(buff.data()); + + parseExtraFiles(flatbuffer_module, loaded_extra_files); +#endif + + ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); + ASSERT_EQ(loaded_extra_files["mobile_info.json"], "{\"key\": 23}"); +} + TEST(FlatbufferTest, Conv) { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); if (s && strcmp(s, "1") == 0) @@ -179,6 +277,50 @@ TEST(FlatbufferTest, Conv) { outputref[0][0][0][0].item() == output[0][0][0][0].item()); } +TEST(FlatbufferTest, ConvWithCopyTensorMemory) { + auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); + if (s && strcmp(s, "1") == 0) + return; + + std::vector inputs; + + Module m("m"); + m.register_parameter("weight", torch::ones({20, 1, 5, 5}), false); + m.register_parameter("bias", torch::ones({20}), false); + m.define(R"( + def forward(self, input): + return torch._convolution(input, self.weight, self.bias, [1, 1], [0, 0], [1, 1], False, [0, 0], 1, False, False, True, True) + )"); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,modernize-use-emplace) + inputs.push_back(torch::ones({1, 1, 28, 28})); + + auto outputref = m.forward(inputs).toTensor(); + + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(m, options); + IValue res; + for (int i = 0; i < 3; ++i) { + res = bc.get_method("forward")(inputs); + } + auto output = res.toTensor(); + AT_ASSERT(outputref.dim() == output.dim()); + AT_ASSERT( + outputref[0][0][0][0].item() == output[0][0][0][0].item()); + + auto buff = save_mobile_module_to_bytes(bc); + mobile::Module bc2 = parse_mobile_module(buff.data(), buff.size(), true); + buff = flatbuffers::DetachedBuffer(); + + for (int i = 0; i < 3; ++i) { + res = bc2.get_method("forward")(inputs); + } + output = res.toTensor(); + AT_ASSERT(outputref.dim() == output.dim()); + AT_ASSERT( + outputref[0][0][0][0].item() == output[0][0][0][0].item()); +} + TEST(FlatbufferTest, Inline) { Module m("m"); m.define(R"JIT( @@ -204,6 +346,32 @@ TEST(FlatbufferTest, Inline) { AT_ASSERT(output.toTensor().item() == 7.0); } +TEST(FlatbufferTest, InlineWithCopyTensorMemory) { + 
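// Editor's note (not part of the original patch): like the other
// *WithCopyTensorMemory tests in this file, this variant reloads the module
// with should_copy_tensor_memory=true and then drops the flatbuffer
// (buff = flatbuffers::DetachedBuffer()) before running it, checking that
// tensor storage was copied out of the buffer rather than aliased into it.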
Module m("m"); + m.define(R"JIT( + def foo1(self, x): + return x + 1 + + def foo2(self, x): + return self.foo1(x) + 2 + + def foo3(self, x): + return self.foo2(x) + 3 + )JIT"); + CompilationOptions options; + mobile::Module bc = jitModuleToMobile(m, options); + std::vector inputs({torch::ones({})}); + auto output = bc.get_method("foo3")(inputs); + AT_ASSERT(output.toTensor().item() == 7.0); + + auto buff = save_mobile_module_to_bytes(bc); + mobile::Module bc2 = parse_mobile_module(buff.data(), buff.size(), true); + buff = flatbuffers::DetachedBuffer(); + std::vector inputs2({torch::ones({})}); + output = bc2.get_method("foo3")(inputs2); + AT_ASSERT(output.toTensor().item() == 7.0); +} + TEST(FlatbufferTest, Tuple) { Module m("m"); m.define(R"JIT( @@ -1104,5 +1272,534 @@ TEST(FlatbufferTest, OperatorTest2) { // NOLINT (use =delete in gtest) } } +Module jitModuleFromBuffer(void* data) { + auto* flatbuffer_module = mobile::serialization::GetMutableModule(data); + FlatbufferLoader loader; + mobile::Module mobilem = loader.parseModule(flatbuffer_module); + ExtraFilesMap files; + std::vector constants; + loader.extractJitSourceAndConstants(&files, &constants); + return jitModuleFromSourceAndConstants( + mobilem._ivalue(), files, constants, 8); +} + +#if defined(ENABLE_FLATBUFFER) +TEST(TestSourceFlatbuffer, UpsampleNearest2d) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], float(scale), float(scale)) + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({1, 3, 128, 128})); + inputs.emplace_back(at::Scalar(2.0)); + auto ref = m.forward(inputs); + + std::stringstream ss; + m._save_for_mobile(ss, {}, false, /*use_fatbuffer=*/true); + auto mm = _load_for_mobile(ss); + auto m2 = load(ss); + + auto res = m2.forward(inputs); + auto resm = mm.forward(inputs); + + auto resd = res.toTensor(); + auto refd = ref.toTensor(); + auto resmd = resm.toTensor(); + ASSERT_TRUE(resd.equal(refd)); + ASSERT_TRUE(resmd.equal(refd)); +} + +TEST(TestSourceFlatbuffer, CheckAttrAccess) { + Module m("m"); + m.register_attribute("mobile_optimized", BoolType::get(), true); + auto data = save_jit_module_to_bytes(m); + Module m2 = jitModuleFromBuffer(data.data()); + bool mobile_optimized = m2.attr("mobile_optimized", false).toBool(); + AT_ASSERT(mobile_optimized); + mobile::Module m3 = parse_mobile_module(data.data(), data.size()); + mobile_optimized = m3.attr("mobile_optimized", false).toBool(); + AT_ASSERT(mobile_optimized); +} + +TEST(TestSourceFlatbuffer, + MethodInvocation) { // NOLINT (use =delete in gtest) + const std::vector test_programs{ + // test invoking a method with default parameter + R"( + def test_func(self, x, b : int = 4): + return self.foo + x + b + )", + // inner method call with default parameter (gets inlined) + R"( + def add_with_default_arg(self, x, b : int = 4): + return self.foo + x + b + def test_func(self, x): + return self.add_with_default_arg(x) # invoke method w/ default arg + )", + // simple method call + R"( + def test_func(self, x): + b = 4 + return self.foo + x + b + )", + }; + for (const auto& test_program : test_programs) { + Module m("m"); + m.register_parameter("foo", torch::ones({}), false); + m.define(test_program); + + const int fortyTwo = 42; // (keep linter happy) + auto minput = fortyTwo * torch::ones({}); + auto ref = m.run_method("test_func", minput); + + auto data = save_jit_module_to_bytes(m); + Module m2 = jitModuleFromBuffer(data.data()); + const auto& test_func = 
m2.get_method("test_func"); + IValue res; + for (int i = 0; i < 3; ++i) { + res = test_func({minput}); + } + auto resd = res.toTensor().item(); + auto refd = ref.toTensor().item(); + AT_ASSERT(resd == refd); + + mobile::Module m3 = parse_mobile_module(data.data(), data.size()); + const auto& test_func3 = m3.get_method("test_func"); + for (int i = 0; i < 3; ++i) { + res = test_func3({minput}); + } + resd = res.toTensor().item(); + refd = ref.toTensor().item(); + AT_ASSERT(resd == refd); + } +} +#endif + +#if !defined FB_XPLAT_BUILD +// The following test run in fbcode only +TEST(FlatbufferUpgraderTest, DivTensorV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append("upgrader_models/test_versioned_div_tensor_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('LOAD', 2, 0), + ('LOAD', 3, 0), + ('OP', 0, 0), + ('LOAD', 2, 0), + ('LOAD', 3, 0), + ('OP', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 2, 0), + ('TUPLE_CONSTRUCT', 3, 0), + ('RET', 0, 0))), + ('operators', + (('aten::div', 'Tensor'), + ('aten::div', 'Tensor'), + ('aten::div', 'Tensor'))), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // 3 operators will use upgrader + ASSERT_EQ(number_of_call_instruction, 3); + + std::vector inputs = { + IValue(6 * torch::ones({1})), IValue(3 * torch::ones({1}))}; + auto actual_output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output_list = actual_output.toTuple()->elements(); + ASSERT_TRUE(actual_output_list[0].toTensor().equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivTensorOutV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 4), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('MOVE', 4, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div', 'out'),)), + ('constants', ()), + ('types', ()), + ('register_size', 4))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{ + IValue(6 * torch::ones({1})), + IValue(3 * torch::ones({1})), + IValue(torch::empty({1}))}; + m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = inputs[2].toTensor(); + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivTensorInplaceV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + 
"upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div_', 'Tensor'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{ + IValue(6 * torch::ones({1})), IValue(3 * torch::ones({1}))}; + m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = inputs[0].toTensor(); + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarFloatV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff"); + /* + (('__torch__.MyModuleFloat.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div', 'Scalar'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarReciprocalFloatV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff"); + /* + (('__torch__.MyModuleFloat.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('OP', 0, 0), + ('MOVE', 3, 0), + ('OP', 1, 0), + ('RET', 0, 0))), + ('operators', (('aten::reciprocal', ''), ('aten::mul', 'Scalar'))), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // No operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 0); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 0.5 * torch::ones({1}); + auto actual_output = 
output.toTensor(); + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarReciprocalIntV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff"); + /* + (('__torch__.MyModuleInt.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('OP', 0, 0), + ('MOVE', 3, 0), + ('OP', 1, 0), + ('RET', 0, 0))), + ('operators', (('aten::reciprocal', ''), ('aten::mul', 'Scalar'))), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // No operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 0); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 0.5 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarScalarV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff"); + /* + (('__torch__.MyModule.forward', + (('instructions', + (('STOREN', 1, 5), + ('DROPR', 1, 0), + ('LOAD', 2, 0), + ('LOAD', 3, 0), + ('OP', 0, 0), + ('MOVE', 2, 0), + ('LOAD', 4, 0), + ('OP', 1, 0), + ('LOAD', 3, 0), + ('MOVE', 4, 0), + ('OP', 2, 0), + ('MOVE', 3, 0), + ('MOVE', 5, 0), + ('OP', 3, 0), + ('TUPLE_CONSTRUCT', 4, 0), + ('RET', 0, 0))), + ('operators', + (('aten::div', ''), + ('aten::div', 'float'), + ('aten::div', ''), + ('aten::div', 'int'))), + ('constants', ()), + ('types', ()), + ('register_size', 5))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // No operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 0); + + std::vector inputs{IValue(20.0), IValue(10), IValue(2.0), IValue(5)}; + auto output = m_module.forward(inputs); + auto output_list = output.toTupleRef().elements(); + auto expect_output = std::vector( + {IValue(2.0), IValue(10.0), IValue(5.0), IValue(2.0)}); + // auto actual_output = output.toTensor(); + for (size_t i = 0; i < expect_output.size(); i++) { + ASSERT_EQ(output_list[i], expect_output[i]); + } +} + +TEST(FlatbufferUpgraderTest, DivScalarIntV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff"); + /* + (('__torch__.MyModuleInt.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div', 'Scalar'),)), + 
('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarInplaceFloatV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff"); + /* + (('__torch__.MyModuleFloat.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div_', 'Scalar'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3.0)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +TEST(FlatbufferUpgraderTest, DivScalarInplaceIntV2) { + std::string filePath(__FILE__); + auto test_model_file = filePath.substr(0, filePath.find_last_of("/\\") + 1); + test_model_file.append( + "upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff"); + /* + (('__torch__.MyModuleInt.forward', + (('instructions', + (('STOREN', 1, 3), + ('DROPR', 1, 0), + ('MOVE', 2, 0), + ('MOVE', 3, 0), + ('OP', 0, 0), + ('RET', 0, 0))), + ('operators', (('aten::div_', 'Scalar'),)), + ('constants', ()), + ('types', ()), + ('register_size', 3))),) + */ + + mobile::Module m_module = load_mobile_module_from_file(test_model_file); + + auto intrsuction_list = + m_module.get_method("forward").function().get_code().instructions_; + uint64_t number_of_call_instruction = 0; + for (auto& instruction : intrsuction_list) { + number_of_call_instruction += (instruction.op == OpCode::CALL); + } + // One operator will use upgrader + ASSERT_EQ(number_of_call_instruction, 1); + + std::vector inputs{IValue(6 * torch::ones({1})), IValue(3)}; + auto output = m_module.forward(inputs); + auto expect_output = 2.0 * torch::ones({1}); + auto actual_output = output.toTensor(); + + // The out argument will be overwritten with the output + ASSERT_TRUE(actual_output.equal(expect_output)); +} + +#endif // !defined(FB_XPLAT_BUILD) + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index 18d5b3cc33b3..87261a8b1ce7 100644 --- 
a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,19 @@ namespace torch { namespace jit { -TEST(FuserTest, TestSimple_CUDA) { +class FuserTest : public ::testing::Test { + void SetUp() override { + old_nvfuser_value_ = fuser::cuda::setEnabled(false); + } + void TearDown() override { + fuser::cuda::setEnabled(old_nvfuser_value_); + } + + private: + bool old_nvfuser_value_; +}; + +TEST_F(FuserTest, TestSimple_CUDA) { #if defined(FBCODE_CAFFE2) return; #endif @@ -77,7 +90,7 @@ TEST(FuserTest, TestSimple_CUDA) { ASSERT_EQ(max_diff, 0); } -TEST(FuserTest, TestOne_CUDA) { +TEST_F(FuserTest, TestOne_CUDA) { #if defined(FBCODE_CAFFE2) return; #endif @@ -137,7 +150,7 @@ TEST(FuserTest, TestOne_CUDA) { testOne(0, 2); } -TEST(FuserTest, FusedConcat_CUDA) { +TEST_F(FuserTest, FusedConcat_CUDA) { #if defined(FBCODE_CAFFE2) return; #endif @@ -182,7 +195,7 @@ TEST(FuserTest, FusedConcat_CUDA) { }; } -TEST(FuserTest, FusionAliasing) { +TEST_F(FuserTest, FusionAliasing) { #if defined(FBCODE_CAFFE2) return; #endif @@ -210,7 +223,7 @@ TEST(FuserTest, FusionAliasing) { ->run(*g); } -TEST(FuserTest, KernelCaching) { +TEST_F(FuserTest, KernelCaching) { #if defined(FBCODE_CAFFE2) return; #endif diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp deleted file mode 100644 index f229ac2679e5..000000000000 --- a/test/cpp/jit/test_gpu.cpp +++ /dev/null @@ -1,19630 +0,0 @@ -#if defined(USE_CUDA) -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// fuser and IR parser -#include -#include - -#include "test_gpu_validator.h" - -#include -#include -#include - -#include -#include - -// Tests go in torch::jit -namespace torch { -namespace jit { - -using namespace torch::jit::fuser::cuda; -using namespace at::indexing; - -namespace { - -// Make a tensor that is known to be fully contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder() - .ndims(ndims) - .dtype(dtype) - .contiguity(std::vector(ndims, true)) - .build(); -} - -// Make a tensor that is known to be non-contiguous of dimensionality=ndims, -// but unknown sizes -TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) { - return TensorViewBuilder().ndims(ndims).dtype(dtype).build(); -} - -// Make a non-contiguous tensor of compile-time known sizes -TensorView* makeConcreteTensor( - std::vector shape, - DataType dtype = DataType::Float) { - return TensorViewBuilder().shape(shape).dtype(dtype).build(); -} - -void checkIntValue( - ExpressionEvaluator& evaluator, - Val* val, - Int::ScalarType expected_value) { - TORCH_CHECK(val->isAnInt()); - const auto actual_value = evaluator.evaluate(val); - TORCH_CHECK(actual_value.has_value()); - TORCH_CHECK(actual_value.value() == expected_value); -} - -void checkIntValue( - kir::ExpressionEvaluator& evaluator, - const kir::Val* val, - kir::Int::ScalarType expected_value) { - const auto actual_value = evaluator.evaluate(val); - TORCH_CHECK(actual_value.has_value()); - TORCH_CHECK(actual_value.value() == expected_value); -} - -bool isPredicated(TensorView* tv, GpuLower& gpulw) { - auto parent_scope = gpulw.lowerValue(tv)->definition()->parentScope(); - 
if (parent_scope->isA()) { - return !parent_scope->predicate()->value()->isConst(); - } - return true; -}; - -} // namespace - -// 1. Test cases are void() functions. -// 2. They start with the prefix `test` - -// A few smoke tests for IrGraphGenerator -// (These tests exercise IrGraphGenerator through a non-trivial IR, -// to make sure that it runs w/o crashing. The actual output is not -// validated) -TEST(NVFuserTest, IrGraphGenerator_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Make sure we can handle empty IRs - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::Basic) - .empty()); - - // Construct an interesting IR - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv2 = add(tv0, new Double(3.141)); - TensorView* tv3 = broadcast(tv0, {false, true, false, true}); - TensorView* tv4 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv3); - TensorView* tv5 = clamp(tv4, new Double(0.f), new Double(1.f)); - TensorView* tv6 = add(tv2, tv2); - - // Another checkpoint before adding outputs - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::Explicit) - .empty()); - - fusion.addOutput(tv6); - - tv4->axis(2)->parallelize(ParallelType::BIDy); - tv6->merge(0); - tv6->split(0, 4); - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv5->reorder({{-1, 0}}); - tv2->computeAt(tv6, 1); - - // Another checkpoint with more node types - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::ComputeOnly) - .empty()); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - // Final IR graph - TORCH_CHECK(!IrGraphGenerator::toGraphviz( - &fusion, IrGraphGenerator::DetailLevel::Verbose) - .empty()); -} - -TEST(NVFuserTest, FusionDispatch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* f = new Double{2.f}; - std::stringstream ss1, ss2, ss3; - ss1 << f; - ss2 << static_cast(f); - ss3 << static_cast(f); - TORCH_CHECK( - ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0, - "Error with dispatch system where results differ by passing Double* vs Val* vs Statement*."); -} - -// Evaluate basic scalar operations with constant values -TEST(NVFuserTest, FusionExprEvalConstants_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - ExpressionEvaluator evaluator(&fusion); - - auto* a = new Int(7); - auto* b = new Int(3); - - // Avoid div operation because it casts int operands to float - checkIntValue(evaluator, neg(a), -7); - checkIntValue(evaluator, add(a, b), 10); - checkIntValue(evaluator, neg(mul(sub(a, b), add(a, b))), -40); - checkIntValue(evaluator, mod(a, b), 1); - checkIntValue(evaluator, ceilDiv(a, b), 3); -} - -// Evaluate basic scalar operations with bound values -TEST(NVFuserTest, FusionExprEvalBindings_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - ExpressionEvaluator evaluator(&fusion); - - auto* a = new Int(); - auto* b = new Int(); - auto* c = add(a, b); - auto* d = neg(ceilDiv(c, b)); - auto* e = new Int(0); - - // trying to evaluate before binding should give empty results - TORCH_CHECK(!evaluator.evaluate(a).has_value()); - TORCH_CHECK(!evaluator.evaluate(d).has_value()); - - evaluator.bind(a, 7); - evaluator.bind(b, 3); - - // can't bind to the results of expressions - ASSERT_ANY_THROW(evaluator.bind(c, 100)); - - // can't bind to concrete values - 
ASSERT_ANY_THROW(evaluator.bind(e, 100)); - - checkIntValue(evaluator, c, 10); - checkIntValue(evaluator, sub(a, b), 4); - checkIntValue(evaluator, mod(a, b), 1); - checkIntValue(evaluator, ceilDiv(a, b), 3); - checkIntValue(evaluator, d, -4); - - // Reset evaluation context - evaluator = ExpressionEvaluator(&fusion); - - evaluator.bind(a, 2); - evaluator.bind(b, 5); - - checkIntValue(evaluator, c, 7); - checkIntValue(evaluator, sub(a, b), -3); - checkIntValue(evaluator, mod(a, b), 2); - checkIntValue(evaluator, ceilDiv(a, b), 1); - checkIntValue(evaluator, d, -2); -} - -// Evaluate expressions in a simple IR -TEST(NVFuserTest, FusionExprEvalBasic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Create a non-trivial IR - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - // 1. Create an evaluator - ExpressionEvaluator evaluator(&fusion); - - // 2. Bind values - // - // IMPORTANT: - // a. The bindings are only as stable as the Vals are in the fusion graph - // b. You must use the original (rootDomain) extents - // (ex. `tv0->getRootDomain()[0]->extent()` - // instead of `tv0->axis(0)->extent()`) - // - evaluator.bind(tv0->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv0->getRootDomain()[1]->extent(), 128); - evaluator.bind(tv1->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv1->getRootDomain()[1]->extent(), 128); - - // 3. Evaluate and check result values - TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(evaluator, tv2->axis(0)->extent(), 2); - checkIntValue(evaluator, tv2->axis(1)->extent(), 4); - checkIntValue(evaluator, tv2->axis(2)->extent(), 128); - - TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(evaluator, tv3->axis(0)->extent(), 2); - checkIntValue(evaluator, tv3->axis(1)->extent(), 4); - checkIntValue(evaluator, tv3->axis(2)->extent(), 128); -} - -// Evaluate expressions in a more complex IR -TEST(NVFuserTest, FusionExprEvalComplex_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv0, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - tv5->reorder({{-1, 0}}); - - tv6->split(0, 5); - tv5->merge(0); - - // 1. Create an evaluator - ExpressionEvaluator evaluator(&fusion); - - // 2. 
Bind values - evaluator.bind(tv0->getRootDomain()[0]->extent(), 129); - evaluator.bind(tv0->getRootDomain()[1]->extent(), 127); - - // Evaluate and check extent values - TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(evaluator, tv0->axis(0)->extent(), 129); - checkIntValue(evaluator, tv0->axis(1)->extent(), 127); - - TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(evaluator, tv3->axis(0)->extent(), 129); - checkIntValue(evaluator, tv3->axis(1)->extent(), 127); - - TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(evaluator, tv4->axis(0)->extent(), 129); - checkIntValue(evaluator, tv4->axis(1)->extent(), 127); - - TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(evaluator, tv5->axis(0)->extent(), 16383); - - TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(evaluator, tv6->axis(0)->extent(), 26); - checkIntValue(evaluator, tv6->axis(1)->extent(), 5); - checkIntValue(evaluator, tv6->axis(2)->extent(), 127); -} - -// Evaluate expressions post lowering -TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Create a non-trivial IR - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto* bid_x = add(tv3->axis(0)->extent(), new Int(0)); - auto* tid_x = add(tv3->axis(-1)->extent(), new Int(0)); - - // Lower - GpuLower gpulw(&fusion); - - // 1. Create an evaluation context - ExpressionEvaluator evaluator(&fusion); - - // 2. Bind values - evaluator.bind(tv0->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv0->getRootDomain()[1]->extent(), 128); - evaluator.bind(tv1->getRootDomain()[0]->extent(), 6); - evaluator.bind(tv1->getRootDomain()[1]->extent(), 128); - - // 3. 
Evaluate and check result values - TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(evaluator, tv2->axis(0)->extent(), 2); - checkIntValue(evaluator, tv2->axis(1)->extent(), 4); - checkIntValue(evaluator, tv2->axis(2)->extent(), 128); - - TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(evaluator, tv3->axis(0)->extent(), 2); - checkIntValue(evaluator, tv3->axis(1)->extent(), 4); - checkIntValue(evaluator, tv3->axis(2)->extent(), 128); - - checkIntValue(evaluator, bid_x, 2); - checkIntValue(evaluator, tid_x, 128); -} - -// Kernel IR: Evaluate basic scalar operations with constant values -TEST(NVFuserTest, FusionKernelExprEvalConstants_CUDA) { - kir::Kernel kernel; - kir::IrBuilder ir_builder(&kernel); - - auto a = ir_builder.create(7); - auto b = ir_builder.create(3); - auto c = ir_builder.subExpr(a, b); - auto d = ir_builder.divExpr(a, b); - auto e = ir_builder.mulExpr(c, d); - - kir::ExpressionEvaluator evaluator; - - checkIntValue(evaluator, ir_builder.negExpr(a), -7); - checkIntValue(evaluator, ir_builder.addExpr(a, b), 10); - checkIntValue(evaluator, ir_builder.negExpr(e), -8); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 1); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3); -} - -// Kernel IR: Evaluate basic scalar operations with bound values -TEST(NVFuserTest, FusionKernelExprEvalBindings_CUDA) { - kir::Kernel kernel; - kir::IrBuilder ir_builder(&kernel); - - kir::ExpressionEvaluator evaluator; - - auto a = ir_builder.create(c10::nullopt); - auto b = ir_builder.create(c10::nullopt); - auto c = ir_builder.addExpr(a, b); - auto d = ir_builder.negExpr(ir_builder.ceilDivExpr(c, b)); - auto e = ir_builder.create(0); - - // trying to evaluate before binding should give empty results - TORCH_CHECK(!evaluator.evaluate(a).has_value()); - TORCH_CHECK(!evaluator.evaluate(d).has_value()); - - evaluator.bind(a, 7); - evaluator.bind(b, 3); - - // can't bind to the results of expressions - ASSERT_ANY_THROW(evaluator.bind(c, 100)); - - // can't bind to concrete values - ASSERT_ANY_THROW(evaluator.bind(e, 100)); - - checkIntValue(evaluator, c, 10); - checkIntValue(evaluator, ir_builder.subExpr(a, b), 4); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 1); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3); - checkIntValue(evaluator, d, -4); - - // Reset the evaluation context - evaluator = kir::ExpressionEvaluator(); - - evaluator.bind(a, 2); - evaluator.bind(b, 5); - - checkIntValue(evaluator, c, 7); - checkIntValue(evaluator, ir_builder.subExpr(a, b), -3); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 2); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 1); - checkIntValue(evaluator, d, -2); -} - -TEST(NVFuserTest, FusionClear_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 1. Create a dummy IR - - { - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - } - - // 2. 
Clear the IR - - fusion.clear(); - - TORCH_CHECK(fusion.unordered_exprs().empty()); - TORCH_CHECK(fusion.vals().empty()); - - TORCH_CHECK(fusion.inputs().empty()); - TORCH_CHECK(fusion.outputs().empty()); - - TORCH_CHECK(!fusion.hasReduction()); - - // 3. Rebuild the IR - - { - TensorView* tv0 = makeSymbolicTensor(3); - TensorView* tv1 = makeSymbolicTensor(3); - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - // tv3 [i0, i1, i2] - tv3->reorder({{0, 2}, {2, 0}}); - // tv3 [i2, i1, i0] - tv3->split(-1, 4); - // tv3 [i2, i1, i0outer, i0inner{4}] - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - // tv3 [i0outer, i0inner{4}, i1, i2] - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - tv3->axis(1)->parallelize(ParallelType::BIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({16, 8, 8}, options); - at::Tensor input2 = at::randn_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionCopy_CUDA) { - Fusion original_fusion; - - // Create the test IR - { - FusionGuard fg(&original_fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(3); - auto tv2 = add(tv1, new Double(2.0)); - auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); - - original_fusion.addInput(tv0); - original_fusion.addInput(tv1); - original_fusion.addOutput(tv3); - - tv3->reorder({{0, 2}, {2, 0}}); - tv3->split(-1, 4); - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - } - - // Test copy before lowering - Fusion clone = original_fusion; - - // Compare IR dumps - std::stringstream original_ir; - std::stringstream clone_ir; - original_ir << original_fusion; - clone_ir << clone; - ASSERT_EQ(original_ir.str(), clone_ir.str()); - - // Lower original fusion - std::string original_kernel; - { - // TODO(kir): remove this guard once we implement the cuda codegen visitor - FusionGuard fg(&original_fusion); - original_kernel = - codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); - } - - // Make sure the "before lowering" clone was not mutated - // while lowering the original fusion IR - std::stringstream before_lowering_ir; - before_lowering_ir << clone; - ASSERT_EQ(original_ir.str(), before_lowering_ir.str()); - - // Test copy after lowering (including assignment operator) - Fusion before_lowering = clone; - clone = original_fusion; - - // Compare IR dumps - std::stringstream original_lowered_ir; - std::stringstream clone_lowered_ir; - original_lowered_ir << original_fusion; - clone_lowered_ir << clone; - ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); - - // Lower the "before lowering" and compare kernels - std::string clone_kernel; - { - // TODO(kir): remove this guard once we implement the cuda codegen visitor - FusionGuard fg(&before_lowering); - clone_kernel = - codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); - } - ASSERT_EQ(original_kernel, clone_kernel); -} - -TEST(NVFuserTest, FusionMove_CUDA) { - Fusion fusion; - - // Create the test IR - { - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = 
makeSymbolicTensor(3); - auto tv2 = add(tv1, new Double(2.0)); - auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - tv3->reorder({{0, 2}, {2, 0}}); - tv3->split(-1, 4); - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - } - - std::stringstream original_ir; - original_ir << fusion; - - // Test move before lowering - Fusion another_fusion = std::move(fusion); - - // Check that the original fusion is "empty" - // - // IMPORTANT: these checks assume knowledge of the internal - // implementation of the move operations. General uses - // should only assume that the moved-from object is in - // a valid, but unspecified state. This is similar to the - // standard library containers: - // https://en.cppreference.com/w/cpp/utility/move - // - TORCH_CHECK(fusion.unordered_exprs().empty()); - TORCH_CHECK(fusion.vals().empty()); - TORCH_CHECK(fusion.inputs().empty()); - TORCH_CHECK(fusion.outputs().empty()); - - // clear() has no pre-conditions so it's valid to call on a moved-from object - fusion.clear(); - - // Compare IR dumps - std::stringstream another_ir; - another_ir << another_fusion; - ASSERT_EQ(original_ir.str(), another_ir.str()); - - // Lower the fusion IR - GpuLower lower(&another_fusion); - - std::stringstream lowered_ir; - lowered_ir << another_fusion; - - // Test move assignment after lowering - fusion = std::move(another_fusion); - - // Compare IR dumps - std::stringstream moved_lowered_ir; - moved_lowered_ir << fusion; - ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str()); -} - -TEST(NVFuserTest, FusionSimpleArith_CUDA) { - std::stringstream ss1, ss2; - - Fusion fusion; - FusionGuard fg(&fusion); - - Double* d1 = new Double(1.f); - Double* d2 = new Double{2.f}; - Double* d3 = new Double(); - - // Disrupt the fusion to make sure guard works well - { - Fusion fusion2; - FusionGuard fg(&fusion2); - - Double* d1 = new Double(1.f); - Double* d2 = new Double(2.f); - add(d1, d2); - ss2 << fusion2; - } - - new BinaryOp(BinaryOpType::Add, d3, d1, d2); - ss1 << fusion; - - TORCH_CHECK( - ss1.str().compare(ss2.str()) == 0, - "Error where explicit add nodes don't match implicit add nodes."); -} - -TEST(NVFuserTest, FusionSimpleTypePromote_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* d4 = new Double{4.f}; - Int* i1 = new Int{3}; - auto d5 = add(d4, i1); - - TORCH_CHECK(d5->getDataType() == DataType::Double); -} - -TEST(NVFuserTest, FusionRegister_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - Double* v1 = new Double{1.f}; - Double* v2 = new Double{2.f}; - Val* v3 = binaryOp(BinaryOpType::Add, v1, v2); - Val* v4 = binaryOp(BinaryOpType::Add, v1, v2); - TORCH_CHECK(v1->name() + 1 == v2->name()); - TORCH_CHECK(v2->name() + 1 == v3->name()); - TORCH_CHECK(v3->name() + 1 == v4->name()); - TORCH_CHECK(v3->definition()->name() + 1 == v4->definition()->name()); -} - -// dummy expr with 2 outputs only for toposort test. -struct DummyExpr : public Expr { - ~DummyExpr() = default; - DummyExpr(Val* _outlhs, Val* _outrhs, Val* _lhs, Val* _rhs) - : Expr(ExprType::UnaryOp) // Not terribly safe... 
- { - addOutput(_outlhs); - addOutput(_outrhs); - addInput(_lhs); - addInput(_rhs); - this->name_ = FusionGuard::getCurFusion()->registerExpr(this); - } - DummyExpr(const DummyExpr& other) = delete; - DummyExpr& operator=(const DummyExpr& other) = delete; - DummyExpr(DummyExpr&& other) = delete; - DummyExpr& operator=(DummyExpr&& other) = delete; -}; - -TEST(NVFuserTest, FusionTopoSort_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // e0: v3, v2 = dummy(v1, v0) - // e1: v4 = add(v3, v2) - // e2: v5 = add(v2, v4) - // e3: v6 = add(v5, v5) - Double* v0 = new Double{1.f}; - Double* v1 = new Double{2.f}; - Double* v2 = new Double(); - Double* v3 = new Double(); - Double* v4 = new Double(); - Double* v5 = new Double(); - Double* v6 = new Double(); - - std::vector inputs = {v0, v1}; - for (auto val : inputs) { - fusion.addInput(val); - } - - Expr* e0 = new DummyExpr(v3, v2, v1, v0); - Expr* e1 = new BinaryOp(BinaryOpType::Add, v4, v3, v2); - Expr* e2 = new BinaryOp(BinaryOpType::Add, v5, v2, v4); - Expr* e3 = new BinaryOp(BinaryOpType::Add, v6, v5, v5); - - fusion.addOutput(v2); - fusion.addOutput(v3); - auto exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 1, "Found ", exprs.size(), " but expecting 1"); - TORCH_CHECK(exprs[0] == e0); - - fusion.addOutput(v5); - exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3"); - TORCH_CHECK(exprs[0] == e0); - TORCH_CHECK(exprs[1] == e1); - TORCH_CHECK(exprs[2] == e2); - - fusion.addOutput(v4); - exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3"); - TORCH_CHECK(exprs[0] == e0); - TORCH_CHECK(exprs[1] == e1); - TORCH_CHECK(exprs[2] == e2); - - fusion.addOutput(v6); - exprs = fusion.exprs(); - TORCH_CHECK(exprs.size() == 4, "Found ", exprs.size(), " but expecting 4"); - TORCH_CHECK(exprs[0] == e0); - TORCH_CHECK(exprs[1] == e1); - TORCH_CHECK(exprs[2] == e2); - TORCH_CHECK(exprs[3] == e3); - - TORCH_CHECK(v2->definition()->name() == 0); - TORCH_CHECK(v3->definition()->name() == 0); - TORCH_CHECK(v4->definition()->name() == 1); - TORCH_CHECK(v5->definition()->name() == 2); - TORCH_CHECK(v6->definition()->name() == 3); -} - -TEST(NVFuserTest, FusionTensor_CUDA) { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - Fusion fusion; - FusionGuard fg(&fusion); - - { - auto tensor = at::randn({2, 3, 4, 5}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (const auto i : c10::irange(fuser_tensor->nDims())) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - // check contiguity information; - TORCH_CHECK(fuser_tensor->domain()->contiguity()[i]); - } - } - - // TensorType::create fills stride_properties, which helps us to mark - // IterDomain properly - // Note: implementation could change, depending on how much we want to invest - // in our home-brew contiguity coalescing. For now let's make sure that we - // properly test what we are using. 
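// Aside (hedged; not part of the original test): one reading of the contiguity
// flags asserted in the sliced case below. An IterDomain appears to be marked
// contiguous when its stride equals the next-inner dimension's stride times
// that dimension's size (innermost dimension: stride 1). A minimal ATen-only
// sketch of that arithmetic, under that assumption:
//
//   auto t = at::randn({4, 4, 4}, options);   // sizes {4,4,4}, strides {16,4,1}
//   auto s = t.slice(1, 0, -1, 2);            // sizes {4,2,4}, strides {16,8,1}
//   // dim 2: stride 1                        -> contiguous
//   // dim 1: stride 8 != 1 * 4               -> not contiguous
//   // dim 0: stride 16 == 8 * 2              -> contiguous
//
// which matches the {true, false, true} expectations checked below, and the
// same rule is consistent with the permuted case that follows.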
- { - auto tensor = at::randn({4, 4, 4}, options); - auto sliced_tensor = tensor.slice(1, 0, -1, 2); - - auto tensor_type = TensorType::create(sliced_tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (const auto i : c10::irange(fuser_tensor->nDims())) { - // size 1 dimension are makred as broadcast - TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - } - - { - auto tensor = at::randn({2, 3, 4, 5}, options); - auto permuted_tensor = tensor.permute({0, 3, 1, 2}); - auto tensor_type = TensorType::create(permuted_tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (const auto i : c10::irange(fuser_tensor->nDims())) { - // size 1 dimension are makred as broadcast - TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false); - } - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[3]); - } -} - -TEST(NVFuserTest, FusionFilterVals_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(1); - auto scalar0 = new Double(0); - auto scalar1 = new Int(0); - auto scalar2 = new Int(1); - - const std::vector vals = {tv0, scalar0, tv1, scalar1, scalar2}; - - std::vector tvs( - ir_utils::filterByType(vals).begin(), - ir_utils::filterByType(vals).end()); - TORCH_CHECK(tvs.size() == 2); - TORCH_CHECK(tvs[0] == tv0); - TORCH_CHECK(tvs[1] == tv1); - - std::vector floats( - ir_utils::filterByType(vals).begin(), - ir_utils::filterByType(vals).end()); - TORCH_CHECK(floats.size() == 1); - TORCH_CHECK(floats[0] == scalar0); - - std::vector ints( - ir_utils::filterByType(vals).begin(), - ir_utils::filterByType(vals).end()); - TORCH_CHECK(ints.size() == 2); - TORCH_CHECK(ints[0] == scalar1); - TORCH_CHECK(ints[1] == scalar2); - - TORCH_CHECK( - ir_utils::filterByType(vals).begin() == - ir_utils::filterByType(vals).end(), - "Not expecting any results"); -} - -TEST(NVFuserTest, FusionTVSplit_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv = makeSymbolicTensor(3); - - tv = tv->split(2, 2); - TORCH_CHECK(tv->nDims() == 4); - Expr* outer = tv->axis(2)->extent()->definition(); - - TORCH_CHECK( - outer->getExprType().value() == ExprType::BinaryOp && - static_cast(outer)->getBinaryOpType() == - BinaryOpType::CeilDiv && - static_cast(outer)->lhs()->sameAs( - tv->getRootDomain()[2]->extent()) && - static_cast(static_cast(outer)->rhs()) - ->sameAs(new Int(2))); - - IterDomain* inner = static_cast(tv->axis(3)); - TORCH_CHECK( - inner->extent()->isScalar() && - static_cast(inner->extent())->isConst() && - static_cast(inner->extent())->value().value() == 2); -} - -TEST(NVFuserTest, FusionTVMerge_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv = makeSymbolicTensor(3); - - tv = tv->merge(1); - Expr* axisOp = tv->axis(1)->extent()->definition(); - - TORCH_CHECK( - tv->nDims() == 2 && 
axisOp->getExprType() == ExprType::BinaryOp && - static_cast(axisOp)->getBinaryOpType() == BinaryOpType::Mul && - static_cast(axisOp)->lhs() == - tv->getRootDomain()[1]->extent() && - static_cast(axisOp)->rhs() == - tv->getRootDomain()[2]->extent()); -} - -TEST(NVFuserTest, FusionTVReorder_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::unordered_map shift_right{{-1, 0}}; - - std::unordered_map shift_left{{0, -1}}; - - std::unordered_map shift_left_2{{0, -1}, {1, 0}, {2, 1}}; - - std::unordered_map swap{{0, 2}, {2, 0}}; - - auto tv = makeSymbolicTensor(3); - std::vector ref; - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - - tv->reorder(shift_left); - for (const auto i : c10::irange(tv->nDims())) { - TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); - } - - tv = makeSymbolicTensor(3); - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - - tv->reorder(shift_left); - for (const auto i : c10::irange(tv->nDims())) { - TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1))); - } - - tv = makeSymbolicTensor(3); - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - - tv->reorder(shift_right); - TORCH_CHECK(ref[ref.size() - 1]->sameAs(tv->axis(0))); - for (const auto i : c10::irange(1, tv->nDims())) { - TORCH_CHECK(ref[i - 1]->sameAs(tv->axis(i))); - } - - tv = makeSymbolicTensor(3); - ref = std::vector( - tv->domain()->domain().begin(), tv->domain()->domain().end()); - tv->reorder(swap); - TORCH_CHECK(ref[0]->sameAs(tv->axis(2))); - TORCH_CHECK(ref[2]->sameAs(tv->axis(0))); - TORCH_CHECK(ref[1]->sameAs(tv->axis(1))); -} - -TEST(NVFuserTest, FusionEquality_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* fval1 = new Double(); - Double* fval1_copy = fval1; - Double* fval2 = new Double(); - Double* fone = new Double(1.0); - - TORCH_CHECK(fval1->sameAs(fval1_copy)); - TORCH_CHECK(!fval1->sameAs(fval2)); - TORCH_CHECK(!fone->sameAs(fval1)); - TORCH_CHECK(fone->sameAs(new Double(1.0))); - - Int* ival1 = new Int(); - Int* ival1_copy = ival1; - Int* ival2 = new Int(); - Int* ione = new Int(1); - - TORCH_CHECK(ival1->sameAs(ival1_copy)); - TORCH_CHECK(!ival1->sameAs(ival2)); - TORCH_CHECK(!ione->sameAs(ival1)); - TORCH_CHECK(ione->sameAs(new Int(1))); - - BinaryOp* add1 = new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1); - BinaryOp* add1_copy = - new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1); - BinaryOp* sub1 = new BinaryOp(BinaryOpType::Sub, new Double(), fval1, ival1); - - UnaryOp* neg1 = new UnaryOp(UnaryOpType::Neg, new Double(), fval1); - UnaryOp* neg2 = new UnaryOp(UnaryOpType::Neg, new Double(), fval2); - UnaryOp* neg1_copy = new UnaryOp(UnaryOpType::Neg, new Double(), fval1); - - TORCH_CHECK(add1->sameAs(add1_copy)); - TORCH_CHECK(!add1->sameAs(sub1)); - - TORCH_CHECK(neg1->sameAs(neg1_copy)); - TORCH_CHECK(!static_cast(neg1)->sameAs(add1)); - TORCH_CHECK(!neg1->sameAs(neg2)); -} - -TEST(NVFuserTest, FusionDependency_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Double* d0 = new Double(0.f); - Double* d1 = new Double(1.f); - auto d2 = add(d0, d1); - - auto d3 = add(d2, d2); - - Double* d4 = new Double(4.f); - Double* d5 = new Double(5.f); - auto d6 = add(d4, d5); - - Double* d7 = new Double(7.f); - Double* d8 = new Double(8.f); - auto d9 = add(d7, d8); - - auto d10 = add(d6, d9); - - auto d11 = add(d3, d10); - - TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d1, d11)); - 
TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d3, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d6, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d9, d11)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d2)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d3)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d4, d6)); - TORCH_CHECK(DependencyCheck::isDependencyOf(d8, d10)); - - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d0)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d1)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d2)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d3)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d4)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d5)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d2, d0)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d3, d2)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d6, d4)); - TORCH_CHECK(!DependencyCheck::isDependencyOf(d10, d8)); - - auto dep_chain = DependencyCheck::getSingleDependencyChain(d0, d11); - TORCH_CHECK(dep_chain.back() == d11); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d3); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d2); - dep_chain.pop_back(); - - dep_chain = DependencyCheck::getSingleDependencyChain(d6, d11); - TORCH_CHECK(dep_chain.back() == d11); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d10); - dep_chain.pop_back(); - - dep_chain = DependencyCheck::getSingleDependencyChain(d4, d11); - TORCH_CHECK(dep_chain.back() == d11); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d10); - dep_chain.pop_back(); - TORCH_CHECK(dep_chain.back() == d6); - dep_chain.pop_back(); - - dep_chain = DependencyCheck::getSingleDependencyChain(d11, d2); - TORCH_CHECK(dep_chain.empty()); -} - -TEST(NVFuserTest, FusionParser_CUDA) { - // This test may not pass if using a custom block sync as there may - // be additional calls. Skip the test as it's not specifically - // relevant with block synchronizatin. - if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { - return; - } - auto g = std::make_shared(); - const auto graph0_string = R"IR( - graph(%0 : Float(2, strides=[1]), - %1 : Float(2, strides=[1])): - %c0 : Float(2, strides=[1]) = aten::mul(%0, %1) - %d0 : Float(2, strides=[1]) = aten::mul(%c0, %0) - return (%d0))IR"; - parseIR(graph0_string, g.get()); - - // strides are not yet supported in the irparser. - for (auto val : g->block()->inputs()) { - if (val->isCompleteTensor()) - val->setType(val->type()->castRaw()->contiguous()); - } - for (auto node : g->block()->nodes()) { - for (auto val : node->outputs()) { - if (val->isCompleteTensor()) - val->setType(val->type()->castRaw()->contiguous()); - } - } - - auto fusion = parseJitIR(g); - FusionGuard fg(fusion.get()); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - // Avoid vectorization here as those kernels can't be lowered twice at the - // moment - at::Tensor input1 = at::randn({16}, options); - at::Tensor input2 = at::randn({16}, options); - auto lparams = schedulePointwise(fusion.get(), {input1, input2}); - - // CONSIDER: - // 1. this can be moved to a dedicated "golden" file - // 2. 
use a fuzzy compare (ignore non-significant whitespaces for example) - const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { - if ((((((((((nvfuser_index_t)blockIdx.x) * 1) + 0) * 1) + 0) * 128) + ((nvfuser_index_t)threadIdx.x)) < T0.size[0])) { - constexpr nvfuser_index_t ki183 = 0; - float T5[1]; - constexpr nvfuser_index_t ki217 = 0; - T5[ki217] = 0; - constexpr nvfuser_index_t ki208 = 0; - T5[ki208] - = T1[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki208) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; - float T4[1]; - constexpr nvfuser_index_t ki223 = 0; - T4[ki223] = 0; - constexpr nvfuser_index_t ki203 = 0; - T4[ki203] - = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki203) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; - float T6[1]; - constexpr nvfuser_index_t ki192 = 0; - float T2[1]; - T2[0] - = T4[ki192] - * T5[ki192]; - T6[ki192] - = T2[0] - * T4[ki192]; - constexpr nvfuser_index_t ki185 = 0; - T3[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki185) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] - = T6[ki185]; - } -} -)"; - - const std::string actual_kernel = - "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); - if (expected_kernel.size() != actual_kernel.size() || - expected_kernel.compare(actual_kernel) != 0) { - std::cerr - << " Codegen mismatch, codegen possibly changed, or is incorrect. " - << " \n ========= EXPECTED ========= \n" - << expected_kernel << "\n========= ACTUAL ========== \n" - << actual_kernel << "\n=================" << std::endl; - auto it = std::mismatch( - expected_kernel.begin(), - expected_kernel.end(), - actual_kernel.begin(), - actual_kernel.end()); - std::string actual_mismatched_snippet(it.second, actual_kernel.end()); - actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10); - std::string expected_mismatched_snippet(it.first, expected_kernel.end()); - expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10); - std::cerr << "First mismatch found at: " << actual_mismatched_snippet - << ", expected: " << expected_mismatched_snippet << std::endl; - TORCH_CHECK(false); - } - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1, input2}, lparams); - at::Tensor output_ref = input1 * input2 * input1; - TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionForLoop_CUDA) { -// TODO(kir): re-enable this test -// due to the current "GpuLower guard" approach, we can only create -// kernel IR during GpuLower::lower() -#if 0 - Fusion fusion; - FusionGuard fg(&fusion); - - const auto TV0 = new TensorView( - new TensorDomain({new IterDomain(new Int(0), new Int(16))}), - DataType::Float); - const auto TV1 = new TensorView( - new TensorDomain({new IterDomain(new Int(0), new Int(16))}), - DataType::Float); - - fusion.addInput(TV0); - fusion.addInput(TV1); - - auto ID0 = new kir::IterDomain(new IterDomain(new Int(0), new Int(8))); - - TensorView* TV2 = add(TV0, TV1); - BinaryOp* op = static_cast(TV2->definition(); - fusion.addOutput(TV2); - - auto fl = new kir::ForLoop(new kir::Int(c10::nullopt), ID0, {op}); - - std::stringstream result; - std::stringstream ref; - result << fl; - ref << "for(size_t i3{0}; i3 < iS{8}; ++i3 ) {\nT2[ iS{16} ] = T0[ iS{16} ] + T1[ iS{16} ]\n}"; - - if (result.str().compare(ref.str()) == 0) { - std::stringstream err_msg; - err_msg << "ForLoop printing has changed or something has gone wrong. 
" - << result.str() << "\n does not match reference: " << ref.str() - << std::endl; - TORCH_CHECK(false, err_msg.str()); - } -#endif -} - -TEST(NVFuserTest, FusionOuterSplit_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(3); - - new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); - TensorView* tv1 = add(tv0, new Double(2.0)); - TensorView* tv2 = add(tv1, new Double(3.0)); - fusion.addOutput(tv2); - - //[I0, I1, I2] - tv2->split(-1, 4, false); - //[I0, I1, I2o{4}, I2i] - tv2->merge(0); - tv2->merge(0); - //[I0*I1*I2o{4}, I2i] - tv2->split(0, 2); - //[I0*I1*I2o{4}o, I0*I1*I2o{4}i{2}, I2i] - tv2->reorder({{0, 1}, {1, 0}}); - // I0*I1*I2o{4}i{2}, [I0*I1*I2o{4}o, I2i] - - tv0->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor output = at::empty({2, 6, 32}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({}, {output}); - - at::Tensor output_ref = at::zeros_like(output, options); - output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionCodeGen_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(3); - - new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); - TensorView* tv1 = add(tv0, new Double(2.0)); - TensorView* tv2 = add(tv1, new Double(3.0)); - fusion.addOutput(tv2); - - //[I0, I1, I2] - tv2 = tv2->split(0, 4); - //[I0o, I0i{4}, I1, I2] - tv2 = tv2->merge(1); - //[I0o, I0i{4}*I1, I2] - tv2 = tv2->split(-1, 2); - //[I0o, I0i{4}*I1, I2o, I2i{2}] - tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}}); - //[I0i{4}*I1, I0o, I2i{2}, I2o] - - tv0->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor output = at::empty({16, 8, 8}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({}, {output}); - - at::Tensor output_ref = at::zeros_like(output, options); - output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionCodeGen2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(3); - TensorView* tv1 = makeSymbolicTensor(3); - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - //[I0, I1, I2] - tv3->reorder({{0, 2}, {2, 0}}); - //[I2, I1, I0] - tv3->split(-1, 4); - //[I2, I1, I0o, I0i{4}] - tv3->reorder({{2, 0}, {3, 1}, {0, 3}}); - // I0o, I0i{4}, I1, I2] - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({16, 8, 8}, options); - at::Tensor input2 = at::randn_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionSimplePWise_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - // dimensionality of the problem - int nDims = 3; - - // Set up your input tensor views - TensorView* tv0 = makeContigTensor(nDims); - TensorView* tv1 = makeContigTensor(nDims); - - // Register your inputs - fusion.addInput(tv0); - 
fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - // Do transformations, remember, transformations are outputs to inputs - // This doesn't have to be in this order - tv3->merge(1); - tv3->merge(0); - - // Split by n_threads - tv3->split(0, 128); - tv3->split(0, 4); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - // Parallelize TV3 - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-2)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({64, 2, 128}, options); - at::Tensor input2 = at::rand_like(input1); - at::Tensor output = at::empty_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input1, input2}, {output}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionExecKernel_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->split(0, 128); - tv3->split(0, 4); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - // Parallelize TV3 - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::ones({1, 128}, options); - at::Tensor input2 = at::ones_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - at::Tensor check = at::full({1, 128}, 4, options); - ; - TORCH_CHECK(outputs[0].equal(check)); -} - -int ceilDiv_(int a, int b) { - return (a + b - 1) / b; -} - -TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { - // Case 1 - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 + 3 - // tv4 = tv1 * 2 - // tv5 = tv3 + tv2 - // tv6 = tv5 + tv4 - // tv7 = tv1 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); - TensorView* tv5 = add(tv3, tv2); - - TensorView* tv6 = add(tv5, tv4); - TensorView* tv7 = add(tv1, tv4); - - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - // Lets setup to actually run - tv7->merge(0); - tv7->split(0, 128); - tv7->split(0, 4); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv7, 1); - - 
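// Aside (hedged; not part of the original test): the schedule applied to tv7
// above is the recurring pointwise recipe used throughout this file, flattened
// here into a hypothetical helper for illustration only:
//
//   void schedulePointwiseLike(TensorView* out) {  // hypothetical name
//     out->merge(0);        // [I0*I1]
//     out->split(0, 128);   // [ceilDiv(I0*I1, 128), 128]
//     out->split(0, 4);     // [ceilDiv(ceilDiv(I0*I1, 128), 4), 4, 128]
//     out->axis(0)->parallelize(ParallelType::BIDx);
//   }
//
// tv0->computeAt(tv7, 1) then places every producer inside the outer BIDx
// loop; the checks below read back the resulting compute-at and max-producer
// positions.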
GpuLower gpulw(&fusion); - - // The this-position of the last tensor should be zero. - TORCH_CHECK( - tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && - tv7->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 && - tv6->getMaxProducerPosition() == 1); - // The position of every other tensor should be 1. - for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { - TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0))); - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t1.add({3.0}); - auto t4 = t1.mul({2.0}); - auto t5 = t3.add(t2); - auto t6 = t5.add(t4); - auto t7 = t1.add(t4); - - std::vector aten_outputs = {t6, t7}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { - // Case 2 - // tv1 = tv0 * -1 - // tv2 = tv0 + 3 - // tv3 = tv0 * 2 - // tv4 = tv2 + tv1 - // tv5 = tv4 + tv3 - // tv6 = tv5 + tv3 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv5, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // Lets setup to actually run - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv6, 1); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({129, 127}, options); - - auto t1 = input.mul({-1.0}); - auto t2 = input.add({3.0}); - auto t3 = input.mul({2.0}); - auto t4 = t2.add(t1); - auto t5 = t4.add(t3); - auto t6 = t5.add(t3); - - std::vector aten_outputs = {t5, t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { - // Case 3 - // T2 = T1 * 0.979361 - // T3 = T2 * T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = mul(tv1, new Double(.979361)); - TensorView* tv3 = mul(tv2, tv0); - - fusion.addOutput(tv3); - - // Lets setup to actually run - while (tv3->nDims() > 1) - tv3->merge(0); - tv3->split(0, 128); - 
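// Aside (hedged): together with the remaining split just below, the flattened
// domain becomes
//   [ceilDiv(ceilDiv(I0*I1*I2*I3, 128), 4), 4, 128]
// so for the {129, 127, 63, 65} input used later the outer extent works out to
// ceilDiv(ceilDiv(67088385, 128), 4) = ceilDiv(524129, 4) = 131033.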
tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.mul({0.979361}); - auto aten_output = t2.mul(t0); - - std::vector aten_inputs = {t0, t1}; - - at::Tensor cg_output = at::empty_like(t0, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { - // Case 4 - // T4 = T2 - T3 - // T5 = T1 + T4 - // T6 = T5 - T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = makeSymbolicTensor(4); - fusion.addInput(tv2); - - TensorView* tv3 = makeSymbolicTensor(4); - fusion.addInput(tv3); - - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - - fusion.addOutput(tv6); - - // Lets setup to actually run - while (tv6->nDims() > 1) - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv0->computeAt(tv6, 1); - tv1->computeAt(tv6, 1); - tv2->computeAt(tv6, 1); - tv3->computeAt(tv6, 1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - at::Tensor t2 = at::rand_like(t0, options); - at::Tensor t3 = at::rand_like(t0, options); - - auto t4 = t2.sub(t3); - auto t5 = t1.add(t4); - auto aten_output = t5.sub(t0); - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { - // Case 5 - // tv2 = tv0 + 2.0 - // tv3 = tv1 * tv2 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->split(-1, 8); - tv3->split(-1, 4); - - tv2->computeAt(tv3, 1); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs 
= fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - tv3->merge(0); - tv3->split(-1, 8); - - tv2->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1.0)); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, new Double(3.0)); - - auto tv4 = add(tv1, tv3); - fusion.addOutput(tv4); - - auto tv5 = broadcast(tv1, {false, true}); - - auto tv6 = makeSymbolicTensor(2); - fusion.addInput(tv6); - - auto tv7 = mul(tv5, tv6); - - fusion.addOutput(tv7); - - tv7->split(1, 2); - tv7->merge(0); - tv7->split(0, 4); - tv7->split(0, 128); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv7, 1); - auto tv5_domain = tv5->domain()->domain(); - - // These computeAt transformations should not affect the TV5 domain - tv0->computeAt(tv4, -1); - tv2->computeAt(tv4, -1); - - auto tv5_domain_current = tv5->domain()->domain(); - TORCH_CHECK(tv5_domain == tv5_domain_current, "Invalid TV5 domain"); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto t0 = at::randn({numel_x}, options); - auto t2 = at::randn({numel_x}, options); - auto t6 = at::randn({numel_x, numel_y}, options); - - auto t1 = t0.add(1.0); - auto t3 = t2.add(3.0); - auto t4 = t1.add(t3); - auto t5 = t1.unsqueeze(1); - auto t7 = t5.mul(t6); - - std::vector aten_inputs = {t0, t2, t6}; - std::vector aten_outputs = {t4, t7}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1.0)); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, new Double(3.0)); - - auto tv4 = add(tv1, tv3); - fusion.addOutput(tv4); - - auto tv5 = broadcast(tv1, {false, true}); - - auto tv6 = makeSymbolicTensor(2); - fusion.addInput(tv6); - - auto tv7 = mul(tv5, tv6); - - fusion.addOutput(tv7); - - tv7->split(1, 2); - tv7->merge(0); - tv7->split(0, 128, false); - tv7->split(0, 4, false); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(1)->parallelize(ParallelType::TIDx); - - // Reverse 
computeAt structure from previous test - tv0->computeAt(tv4, -1); - tv2->computeAt(tv4, -1); - tv0->computeAt(tv7, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto t0 = at::randn({numel_x}, options); - auto t2 = at::randn({numel_x}, options); - auto t6 = at::randn({numel_x, numel_y}, options); - - auto t1 = t0.add(1.0); - auto t3 = t2.add(3.0); - auto t4 = t1.add(t3); - auto t5 = t1.unsqueeze(1); - auto t7 = t5.mul(t6); - - std::vector aten_inputs = {t0, t2, t6}; - std::vector aten_outputs = {t4, t7}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { - // Case 1 - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 + 3 - // tv4 = tv1 * 2 - // tv5 = tv3 + tv2 - // tv6 = tv5 + tv4 - // tv7 = tv1 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); - TensorView* tv5 = add(tv3, tv2); - - TensorView* tv6 = add(tv5, tv4); - TensorView* tv7 = add(tv1, tv4); - - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - // Lets setup to actually run - tv0->merge(0); - tv0->split(0, 128); - tv0->split(0, 4); - - tv0->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeWith(tv7, 1); - - GpuLower gpulw(&fusion); - - // The this-position of the last tensor should be zero. - TORCH_CHECK( - tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && - tv7->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 && - tv6->getMaxProducerPosition() == 1); - - // The position of every other tensor should be 1. 
- for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { - TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0))); - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t1.add({3.0}); - auto t4 = t1.mul({2.0}); - auto t5 = t3.add(t2); - auto t6 = t5.add(t4); - auto t7 = t1.add(t4); - - std::vector aten_outputs = {t6, t7}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { - // Case 2 - // tv1 = tv0 * -1 - // tv2 = tv0 + 3 - // tv3 = tv0 * 2 - // tv4 = tv2 + tv1 - // tv5 = tv4 + tv3 - // tv6 = tv5 + tv3 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv5, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // Lets setup to actually run - tv0->merge(0); - tv0->split(0, 128); - tv0->split(0, 4); - - tv0->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeWith(tv6, 1); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({129, 127}, options); - - auto t1 = input.mul({-1.0}); - auto t2 = input.add({3.0}); - auto t3 = input.mul({2.0}); - auto t4 = t2.add(t1); - auto t5 = t4.add(t3); - auto t6 = t5.add(t3); - - std::vector aten_outputs = {t5, t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { - // Case 3 - // T2 = T1 * 0.979361 - // T3 = T2 * T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = mul(tv1, new Double(.979361)); - TensorView* tv3 = mul(tv2, tv0); - - fusion.addOutput(tv3); - - // Lets setup to actually run - while (tv0->nDims() > 1) - tv0->merge(0); - tv0->split(0, 128); - tv0->split(0, 4); - - while (tv1->nDims() > 1) - tv1->merge(0); - tv1->split(0, 128); - tv1->split(0, 4); - - tv0->computeWith(tv3, 1); - tv1->computeWith(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = 
static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.mul({0.979361}); - auto aten_output = t2.mul(t0); - - std::vector aten_inputs = {t0, t1}; - - at::Tensor cg_output = at::empty_like(t0, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith4_CUDA) { - // Case 4 - // T4 = T2 - T3 - // T5 = T1 + T4 - // T6 = T5 - T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - TensorView* tv2 = makeSymbolicTensor(4); - fusion.addInput(tv2); - - TensorView* tv3 = makeSymbolicTensor(4); - fusion.addInput(tv3); - - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - - fusion.addOutput(tv6); - std::vector tvs = {tv0, tv1, tv2}; - for (auto tv : tvs) { - // Lets setup to actually run - while (tv->nDims() > 1) { - tv->merge(0); - } - tv->split(0, 128); - tv->split(0, 4); - tv->computeWith(tv6, 1); - } - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - at::Tensor t2 = at::rand_like(t0, options); - at::Tensor t3 = at::rand_like(t0, options); - - auto t4 = t2.sub(t3); - auto t5 = t1.add(t4); - auto aten_output = t5.sub(t0); - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith5_CUDA) { - // Case 5 - // tv2 = tv0 + 2.0 - // tv3 = tv1 * tv2 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - - tv2->computeWith(tv3, 1); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeWith6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - 
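// Aside (hedged; not part of the original test): the ComputeWith variants in
// this group mirror the ComputeAt cases above; the difference is which tensor
// carries the scheduling. ComputeAt schedules the consumer and pulls producers
// in, e.g.
//
//   tv7->merge(0); tv7->split(0, 128); tv7->split(0, 4);
//   tv0->computeAt(tv7, 1);
//
// while ComputeWith schedules the producer and pushes it toward the consumer:
//
//   tv0->merge(0); tv0->split(0, 128); tv0->split(0, 4);
//   tv0->computeWith(tv7, 1);
//
// The paired tests assert that both variants end up with the same compute-at
// and max-producer positions.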
fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - tv3->merge(0); - tv3->split(-1, 8); - - tv2->computeWith(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t0.add(2.0); - auto aten_output = t1.mul(t2); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -2 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - // This computeAt will affect tv2 as well, even though tv2 is not in - // the data-flow path between tv1 and tv3. The reason is that tv1 is - // now computed at tv3, so tv2 must also be computed at the same - // location. Overall, what will happen is basically we merge - // expressions of all tensors and compute them in a single loop - // nest. - TensorView* computeAtTarget = tv3; - computeAtTarget->split(0, 128); - tv1->computeAt(computeAtTarget, 1); - - TensorView* affected_tensors[] = {tv1, tv2, tv3}; - for (auto tv : affected_tensors) { - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - } - - GpuLower gpulw(&fusion); - - TORCH_CHECK(tv1->getComputeAtPosition() == 1); - TORCH_CHECK( - tv2->getComputeAtPosition() == 0 && tv2->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv3->getComputeAtPosition() == 0 && tv3->getMaxProducerPosition() == 1); - - // Note that tv2 is also computed at tv3. - for (auto tv : {tv1, tv2}) { - TORCH_CHECK( - gpulw.caLoopMap().areMapped(tv->axis(0), computeAtTarget->axis(0))); - } - - TORCH_CHECK(tv3->getComputeAtPosition() == 0); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - for (auto tv : affected_tensors) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({1000}, options); - - auto t1 = aten_input * 0.5; - auto t2 = t1 * -1.0; - auto t3 = t1 * -2.0; - - std::vector aten_outputs = {t2, t3}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -// Similar to ComputeAtMultiConsumers, but with a common consumer. 
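// Aside (hedged): as in the multi-consumer test above, computeAt on one branch
// also drags sibling consumers along, but the indirect propagation is expected
// to stop at the first tensor that consumes all affected branches. Sketching
// the first case below:
//
//   tv1 = tv0 * 0.5;  tv2 = tv1 * -1;  tv3 = tv1 * -2;
//   tv4 = tv2 + tv3;  tv5 = tv4 * 5;
//   tv1->computeAt(tv3, 1);
//   // expected: tv1, tv2, tv3 -> compute-at position 1
//   //           tv4, tv5      -> compute-at position 0 (propagation stops at
//   //                            the common consumer tv4)
//
// which is exactly what the checks in the test assert.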
-TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -2 - // tv4 = tv2 + tv3 - // tv5 = tv4 * 5 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); - TensorView* tv4 = add(tv2, tv3); - TensorView* tv5 = mul(tv4, new Double(5.0)); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - // Computing tv1 at tv3. This will affect tv2 as discussed in - // ComplexComputeAt1. Additionally, in this case, notice that tv4 is - // the common consumer of tv2 and tv3, so they are computed at - // tv4. The indirect propagation of the computeAt should stop at the - // common consumer, and no further change should occur. More - // specifically, the computeAT position of tv4 and tv5 should be zero. - TensorView* computeAtTarget = tv3; - computeAtTarget->split(0, 128); - tv1->computeAt(computeAtTarget, 1); - - TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; - for (auto tv : affected_tensors) { - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - } - - TORCH_CHECK(tv1->getComputeAtPosition() == 1); - TORCH_CHECK(tv2->getComputeAtPosition() == 1); - TORCH_CHECK(tv3->getComputeAtPosition() == 1); - TORCH_CHECK(tv4->getComputeAtPosition() == 0); - TORCH_CHECK(tv5->getComputeAtPosition() == 0); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - for (auto tv : affected_tensors) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - // Transform tv5 to make it look like the rest - tv5->split(0, 128); - tv5->axis(1)->parallelize(ParallelType::TIDx); - tv5->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({1000}, options); - - auto t1 = aten_input * 0.5; - auto t2 = t1 * -1.0; - auto t3 = t1 * -2.0; - auto t4 = t2 + t3; - auto t5 = t4 * 5.0; - - std::vector aten_outputs = {t3, t4, t5}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -1 - // tv4 = tv1 + 4 - // tv5 = tv3 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv2, new Double(-1.0)); - TensorView* tv4 = add(tv1, new Double(4.0)); - TensorView* tv5 = add(tv3, tv4); - - fusion.addOutput(tv5); - - TensorView* computeAtTarget = tv3; - - computeAtTarget->merge(0); - computeAtTarget->split(0, 128); - computeAtTarget->split(0, 4); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - // This computeAt will affect all tensors including tv3, tv4 and - // tv5, even though it appears to impact only tv1 and tv2. The - // reason is that tv1 is now computed at tv3, so tv4 must also be - // computed at the same location. Similarly, the consumer of tv4, - // tv5, must also be computed at the same location. 
Overall, what - // will happen is basically we merge expressions of all tensors and - // compute them in a single loop nest. Internally, this will be - // realized by making all tensors, except for those in the path - // between tv1 and tv3, computed at tv5, which we call the common - // consumer. - tv1->computeAt(computeAtTarget, 1); - - // All tensors should have the same dimenionality as the target - for (Val* val : fusion.vals()) { - if (fusion.hasInput(val) || - val->getValType().value() != ValType::TensorView) { - continue; - } - TensorView* tv = val->as(); - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - if (tv == tv5) { - TORCH_CHECK(tv->getComputeAtPosition() == 0); - } else { - TORCH_CHECK(tv->getComputeAtPosition() == 1); - } - } - - for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (!fusion.hasInput(tv)) { - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t2.mul({-1.0}); - auto t4 = t1.add({4.0}); - auto aten_output = t3 + t4; - - at::Tensor cg_output = at::empty_like(aten_input, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -// Similar to the above common consumer test but adds an additional -// tensor that has no common consumer with the other tensors. -TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv2 * -1 - // tv4 = tv1 + 4 - // tv5 = tv2 + tv3 - // tv6 = tv1 + 6 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv2, new Double(-1.0)); - TensorView* tv4 = add(tv1, new Double(4.0)); - TensorView* tv5 = add(tv3, tv4); - TensorView* tv6 = add(tv1, new Double(6.0)); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - TensorView* computeAtTarget = tv3; - - computeAtTarget->merge(0); - computeAtTarget->split(0, 128); - computeAtTarget->split(0, 4); - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - // This will have the same impact on the tensors except for tv5 and - // tv6. tv6 does not have any common consumer with the computeAt - // target, but since it uses tv1, it must be also computed at the - // same location as the other impacted tensors. We can either make - // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 - // should be computed at tv6 just because the current implementation - // orders the computeAt relationship based on the order in which - // tensors are specified as outputs. 
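// (In loop-nest terms, roughly: tv1 through tv4 end up inlined at position 1
// of the target's three-axis structure produced by the merge and splits above
// ([BIDx, 4, 128]), while the terminal outputs tv5 and tv6 keep a computeAt
// position of 0 and pick up a max producer position of 1. The checks after
// the computeAt below verify exactly this.)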
- - tv1->computeAt(computeAtTarget, 1); - - // All tensors should have the same dimenionality as the target - for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (fusion.hasInput(tv)) { - continue; - } - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - if (tv == tv5 || tv == tv6) { - TORCH_CHECK(tv->getComputeAtPosition() == 0); - TORCH_CHECK(tv->getMaxProducerPosition() == 1); - } else { - TORCH_CHECK(tv->getComputeAtPosition() == 1); - } - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = val->as(); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - auto t1 = aten_input.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t2.mul({-1.0}); - auto t4 = t1.add({4.0}); - auto t5 = t3 + t4; - auto t6 = t1.add({6.0}); - - std::vector aten_outputs = {t5, t6}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor -// that does not have data dependency with the consumer. -TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 * -2 - // tv4 = tv2 + tv3 - // tv5 = tv4 * 5 - // tv6 = tv1 * 6 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); - TensorView* tv4 = add(tv2, tv3); - TensorView* tv5 = mul(tv4, new Double(5.0)); - // Notice that tv6 is not a consumer of tv4. 
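// (Like tv5, tv6 therefore keeps a computeAt position of 0 after the
// computeAt below; it is still transformed to the target's dimensionality
// because it consumes tv1, which is computed at the target.)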
- TensorView* tv6 = mul(tv1, new Double(6.0)); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - TensorView* computeAtTarget = tv3; - computeAtTarget->split(0, 128); - tv1->computeAt(computeAtTarget, 1); - - TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv5, tv6}; - for (auto tv : affected_tensors) { - TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); - if (tv == tv6 || tv == tv5) { - TORCH_CHECK(tv->getComputeAtPosition() == 0); - } else { - TORCH_CHECK(tv->getComputeAtPosition() == 1); - } - } - - computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - - for (auto tv : affected_tensors) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({1000}, options); - - auto t1 = aten_input * 0.5; - auto t2 = t1 * -1.0; - auto t3 = t1 * -2.0; - auto t4 = t2 + t3; - auto t5 = t4 * 5.0; - auto t6 = t1 * 6.0; - - std::vector aten_outputs = {t3, t4, t5, t6}; - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -namespace { - -void checkIdMapped( - ComputeAtRootDomainMap& root_map, - TensorView* v0, - IterDomain* id0, - TensorView* v1, - IterDomain* id1, - bool should_map) { - if (should_map) { - TORCH_CHECK( - root_map.canMap(v0->domain(), id0, v1->domain(), id1), - "Should be mappable: ", - id0, - " of ", - v0, - " and ", - id1, - " of ", - v1); - } else { - TORCH_CHECK( - !root_map.canMap(v0->domain(), id0, v1->domain(), id1), - "Should not be mappable: ", - id0, - " of ", - v0, - " and ", - id1, - " of ", - v1); - } -} - -void checkIdMapped( - TensorView* v0, - const std::vector& root0, - const std::vector should_map0, - TensorView* v1, - const std::vector& root1, - const std::vector should_map1) { - ComputeAtRootDomainMap map; - map.build(); - TORCH_INTERNAL_ASSERT(root0.size() == should_map0.size()); - TORCH_INTERNAL_ASSERT(root1.size() == should_map1.size()); - size_t idx0 = 0; - for (const auto i : c10::irange(root0.size())) { - size_t idx1 = 0; - for (const auto j : c10::irange(root1.size())) { - if (should_map0[i] && should_map1[j] && idx0 == idx1) { - checkIdMapped(map, v0, root0[i], v1, root1[j], true); - } else { - checkIdMapped(map, v0, root0[i], v1, root1[j], false); - } - if (should_map1[j]) - ++idx1; - } - if (should_map0[i]) - ++idx0; - } -} - -void checkIdMapped( - TensorView* v0, - const std::vector& root0, - TensorView* v1, - const std::vector& root1) { - checkIdMapped( - v0, - root0, - std::vector(root0.size(), true), - v1, - root1, - std::vector(root1.size(), true)); -} - -} // namespace - -TEST(NVFuserTest, FusionRootMappingBasic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv3 = broadcast(tv0, {true, false, false}); - auto tv4 = broadcast(tv1, {false, true, false}); - auto tv5 = add(tv3, tv4); - fusion.addOutput(tv5); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {false, true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true}, - tv4, - 
tv4->getRootDomain(), - {true, false, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false, true}, - tv1, - tv1->getRootDomain(), - {false, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv5, - tv5->getRootDomain(), - {false, true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true}, - tv5, - tv5->getRootDomain(), - {true, false, true}); - checkIdMapped(tv3, tv3->getRootDomain(), tv4, tv4->getRootDomain()); - checkIdMapped(tv3, tv3->getRootDomain(), tv5, tv5->getRootDomain()); - checkIdMapped(tv4, tv4->getRootDomain(), tv5, tv5->getRootDomain()); -} - -TEST(NVFuserTest, FusionRootMappingRfactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // [I,I] - TensorView* tv0 = makeSymbolicTensor(2); - // [I,I,I] - TensorView* tv1 = makeSymbolicTensor(3); - - //[I,I,R] - auto tv2 = sum(tv1, {2}); - auto tv3 = add(tv2, tv0); - - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv3); - - // scheduling: - //[B,I,R0,R1=128], root = [B,I,R] - tv2->split(2, 128); - - // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] - auto tv4 = tv2->rFactor({3}); - - checkIdMapped(tv1, tv1->getRootDomain(), tv4, tv4->getRootDomain()); - checkIdMapped( - tv4, - tv4->getRFactorDomain(), - {true, true, true, false}, - tv2, - tv2->getRootDomain(), - {true, true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true, false}, - tv2, - tv2->getRootDomain(), - {true, true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true, false}, - tv3, - tv3->getRootDomain(), - {true, true}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, true, false}, - tv3, - tv3->getRootDomain(), - {true, true}); - checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv1, - tv1->getRootDomain(), - {true, true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv2, - tv2->getRootDomain(), - {true, true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv4, - tv4->getRFactorDomain(), - {true, true, false, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {true, true, false}); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - fusion.addOutput(tv2); - - // The second dimension cannot be mapped as it would require recomputation. 
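// (Concretely: tv1's second root domain is the reduction domain, and tv2's
// second domain is the broadcast that re-introduces that axis. Mapping the
// two would mean producing the fully reduced tv1 inside a loop over tv2's
// second axis, i.e. redoing the whole reduction for every broadcast element,
// so the checks below expect that pair to be unmappable.)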
- checkIdMapped(tv0, tv0->getRootDomain(), tv1, tv1->getRootDomain()); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - auto tv3 = tv1->rFactor({-2}); - - checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv3, - tv3->getMaybeRFactorDomain(), - {true, false, true}, - tv1, - tv1->getRootDomain(), - {true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); -} - -TEST(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - tv1->split(-1, 4); - auto tv4 = tv1->rFactor({-2}); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getMaybeRFactorDomain(), - {true, false, true}, - tv1, - tv1->getRootDomain(), - {true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); -} - -// Reproducer of issue #749 -TEST(NVFuserTest, FusionRootMappingReductionDependency5_CUDA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = add(tv0, tv3); - auto tv5 = add(tv4, tv1); - fusion.addOutput(tv5); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {true, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getRootDomain(), - {true, true}, - tv5, - 
tv5->getRootDomain(), - {true, true}); -} - -// Similar to RootMappingReductionDependency5 but with rFactor -TEST(NVFuserTest, FusionRootMappingReductionDependency6_CUDA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = add(tv0, tv3); - auto tv5 = add(tv4, tv1); - fusion.addOutput(tv5); - - tv2->split(1, 4); - auto tv6 = tv2->rFactor({-1}); - - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv6, - tv6->getRootDomain(), - {true, false}); - checkIdMapped( - tv6, - tv6->getMaybeRFactorDomain(), - {true, true, false}, - tv2, - tv2->getRootDomain(), - {true, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, true}, - tv4, - tv4->getRootDomain(), - {true, true}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getRootDomain(), - {true, true}, - tv5, - tv5->getRootDomain(), - {true, true}); -} - -TEST(NVFuserTest, FusionRootMappingMultipleBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = broadcast(tv0, {true, false}); - auto tv3 = add(tv1, tv2); - fusion.addOutput(tv3); - - // tv0 cannot be mapped with the consumers as it would mean its only - // domain would be mapped to both the first and second domains of - // the two consumers, thus computing tv0 at both corresponding loops. - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false}, - tv1, - tv1->getRootDomain(), - {false, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false}, - tv2, - tv2->getRootDomain(), - {false, false}); - checkIdMapped(tv1, tv1->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {false}, - tv3, - tv3->getRootDomain(), - {false, false}); -} - -TEST(NVFuserTest, FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = broadcast(tv0, {true, false}); - fusion.addOutput(tv1); - fusion.addOutput(tv2); - - // If there is no common consumer, there is no recomputation constraint. 
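// (Unlike the previous test, tv1 and tv2 are separate outputs with no common
// consumer, so mapping tv0's only root domain to tv1's first domain and to
// tv2's second domain does not force tv0 to be computed at two different
// loop positions at once; the checks below therefore expect those pairs to
// be mappable.)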
- checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv2, - tv2->getRootDomain(), - {false, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {false, true}); -} - -TEST(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - auto tv3 = broadcast(tv0, {false, true}); - auto tv4 = add(tv1, tv3); - fusion.addOutput(tv4); - auto tv5 = add(tv2, tv3); - fusion.addOutput(tv5); - - // Broadcast domains can be used with multiple domains with - // different sizes. In this test, the broadcast domain of tv3 has - // two consumers, tv4 and tv5, which may have different sizes. Each - // of the consumers is used with the broadcast domain of tv3, but - // the two consumers may not have the same size, it is not possible - // to map those domains. - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv1, - tv1->getRootDomain(), - {true, false}); - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv2, - tv2->getRootDomain(), - {true, false}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv2, - tv2->getRootDomain(), - {true, false}, - tv3, - tv3->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, false}, - tv4, - tv4->getRootDomain(), - {true, false}); - checkIdMapped( - tv3, - tv3->getRootDomain(), - {true, false}, - tv5, - tv5->getRootDomain(), - {true, false}); - checkIdMapped( - tv4, - tv4->getRootDomain(), - {true, false}, - tv5, - tv5->getRootDomain(), - {true, false}); -} - -TEST(NVFuserTest, FusionRootMappingBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - // tv0[I0] - fusion.addInput(tv0); - auto tv1 = broadcast(tv0, {true, false}); - // tv1[B1, I0] - auto tv2 = broadcast(tv1, {true, false, false}); - // tv2[B2, B1, I0] - fusion.addOutput(tv2); - - // In this case, tv1 and tv2 has one and two broadcast domains, - // respectively. It is the second broadcast domain that is mapped to - // the broadcast of tv1. 
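// (That is, tv1's [B1, I0] lines up with the trailing two domains of tv2's
// [B2, B1, I0]; the new leading broadcast B2 of tv2 corresponds to nothing
// in tv1, which is why the expected mask below is {false, true, true} and
// not {true, false, true}.)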
- checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv1, - tv1->getRootDomain(), - {false, true}); - checkIdMapped( - tv1, - tv1->getRootDomain(), - {true, true}, - tv2, - tv2->getRootDomain(), - {false, true, true}); // Not {true, false, true} - checkIdMapped( - tv0, - tv0->getRootDomain(), - {true}, - tv2, - tv2->getRootDomain(), - {false, false, true}); -} - -// Reproducer of issue #723 -TEST(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = broadcast(tv0, {true, false}); - auto tv3 = sum(tv2, {0}); - auto tv4 = add(tv2, tv1); - - fusion.addOutput(tv3); - fusion.addOutput(tv4); - - ComputeAtRootDomainMap map; - map.build(); - - checkIdMapped( - map, tv2, tv2->getRootDomain()[0], tv4, tv4->getRootDomain()[0], true); - checkIdMapped( - map, tv2, tv2->getRootDomain()[0], tv3, tv3->getRootDomain()[0], true); - - tv2->computeAt(tv4, -1); - - const int x = 11; - const int y = 12; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x}, options); - at::Tensor t1 = at::randn({y, x}, options); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - auto t3 = t0; - auto t4 = t0.unsqueeze(0).expand({y, x}) + t1; - - testValidate(&fusion, outputs, aten_inputs, {t3, t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtFailDueToRootMapping_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = broadcast(tv1, {true, false}); - auto tv3 = broadcast(tv1, {false, true}); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - // computeAt should fail as there is no valid root mapping. 
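// (tv1's only root domain reaches tv4 along two conflicting paths: through
// tv2 = broadcast(tv1, {true, false}) it corresponds to tv4's second domain,
// while through tv3 = broadcast(tv1, {false, true}) it corresponds to tv4's
// first domain. A single domain cannot be mapped to two different domains of
// the same consumer, so the computeAt below is expected to throw.)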
- ASSERT_ANY_THROW(tv1->computeAt(tv4, 1)); -} - -TEST(NVFuserTest, FusionScalarInputs_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - - Double* d0 = new Double(); - fusion.addInput(d0); - Double* d1 = new Double(); - fusion.addInput(d1); - Double* d2 = new Double(); - fusion.addInput(d2); - Double* d3 = new Double(); - fusion.addInput(d3); - Val* d4 = mul(d0, d1); - Val* d5 = sub(d2, d3); - - TensorView* tv2 = sub(tv1, d4); - TensorView* tv3 = add(tv0, d5); - TensorView* tv4 = mul(tv3, tv2); - - fusion.addOutput(tv4); - - // Lets setup to actually run - while (tv4->nDims() > 1) - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 4); - - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - // d4 = d0 * d1 - // d5 = d2 - d3 - // t2 = t1 - d4 - // t3 = t0 + d5 - // t4 = t3 * t2 - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - float fl0 = 0.1; - float fl1 = -0.2; - float fl2 = 0.3; - float fl3 = -0.4; - float fl4 = fl0 * fl1; - float fl5 = fl2 - fl3; - - at::Tensor t0 = at::randn({129, 127}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.sub(fl4); - auto t3 = t0.add(fl5); - auto aten_output = t3.mul(t2); - - at::Tensor cg_output = at::empty_like(t0, options); - - at::Scalar test(fl0); - - std::vector aten_inputs = { - t0, - t1, - at::Scalar(fl0), - at::Scalar(fl1), - at::Scalar(fl2), - at::Scalar(fl3)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionLoopUnroll_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - TensorView* tv1 = makeSymbolicTensor(3); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - int block_size = 16; - - tv3->merge(0, 1); - tv3->merge(0, 1); - - tv3->split(0, block_size); - tv3->split(0, 4); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - // Parallelize - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({129, 13, 3}, options); - at::Tensor input1 = at::randn({129, 13, 3}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input0, input1}); - - TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); -} - -/* - * Helper function for single op testing that generates a codegen operand - */ - -Val* 
gen_jit_operand(std::pair desc) { - if (desc.first == ValType::TensorView) { - return makeSymbolicTensor(2, desc.second); - } else if (desc.first == ValType::Scalar) { - if (desc.second == DataType::Float) { - return new Double(); - } else if (desc.second == DataType::Double) { - return new Double(); - } else if (desc.second == DataType::Int) { - return new Int(); - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - return nullptr; -} - -/* - * Helper function for single op testing that generates an ATen operand - */ - -IValue gen_aten_operand( - std::pair desc, - int blocks, - int threads, - bool rand) { - if (desc.first == ValType::TensorView) { - if (desc.second == DataType::Double || desc.second == DataType::Float || - desc.second == DataType::Half || desc.second == DataType::BFloat16) { - auto options = at::TensorOptions() - .dtype(data_type_to_aten(desc.second)) - .device(at::kCUDA, 0); - if (rand) { - return IValue(at::rand({blocks, threads}, options)); - } else { - return IValue(at::empty({blocks, threads}, options)); - } - } else if (desc.second == DataType::Int || desc.second == DataType::Int32) { - auto dtype = desc.second == DataType::Int32 ? at::kInt : at::kLong; - if (rand) { - auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - return IValue(at::randn({blocks, threads}, options).mul(5).to(dtype)); - } else { - auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); - return IValue(at::empty({blocks, threads}, options)); - } - } else if (desc.second == DataType::Bool) { - if (rand) { - auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - return IValue( - at::rand({blocks, threads}, options).round().to(at::kBool)); - } else { - auto options = - at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0); - return IValue(at::empty({blocks, threads}, options)); - } - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.second) - } - } else if (desc.first == ValType::Scalar) { - // IValue scalars can only be double int64 or bool - if (desc.second == DataType::Double || desc.second == DataType::Float || - desc.second == DataType::Half || desc.second == DataType::BFloat16) { - return IValue(at::Scalar(1.f)); - } else if (desc.second == DataType::Int) { - return IValue(at::Scalar(1)); - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - } else { - TORCH_CHECK(false, "Not currently supported type: ", desc.first); - } - return nullptr; -} - -/* - * Templatized Helper Function To generate single Op comparison between the - * JIT codegen for Cuda and the ATen Library. - */ - -using OutputPair = std::pair; -template < - typename AtenFunc, - typename JitFunc, - typename InputTuple, - size_t... 
NumInputs> -void test_op( - int blocks, - int threads, - std::string op_str, - AtenFunc af, - JitFunc jf, - OutputPair op, - InputTuple it, - std::index_sequence) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Generate Input JIT function Inputs and add them as Inputs to the Fusion - // Graph - std::array jit_inputs = { - gen_jit_operand(std::get(it))...}; - std::for_each(jit_inputs.begin(), jit_inputs.end(), [&fusion](Val* v) { - fusion.addInput(v); - }); - TensorView* out = - static_cast(jf(std::get(jit_inputs)...)); - fusion.addOutput(out); - - std::for_each(jit_inputs.begin(), jit_inputs.end(), [out](Val* v) { - if (v->getValType() == ValType::TensorView) - static_cast(v)->computeAt(out, -1); - }); - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(-1)->parallelize(ParallelType::TIDx); - - std::array aten_inputs = {gen_aten_operand( - std::get(it), blocks, threads, /*rand*/ true)...}; - const at::ArrayRef aten_inputs_ivalues(aten_inputs); - - at::Tensor cg_output = - gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); - std::vector output_vect = {cg_output}; - cudaDeviceSynchronize(); - if (fusion.isStochastic()) - at::manual_seed(0); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs_ivalues, output_vect); - cudaDeviceSynchronize(); - - if (fusion.isStochastic()) - at::manual_seed(0); - at::Tensor aten_output = af(aten_inputs); - cudaDeviceSynchronize(); // This sync shouldn't be necessary; - - std::string op_msg = "Operation " + op_str; - - testValidate( - &fusion, - {cg_output}, - aten_inputs, - {aten_output}, - __LINE__, - __FILE__, - op_msg); -} - -/* - * Templatized Helper Function that uses variadic templates to - * process a variable length Input Tuple of different Operand Type. - */ -template -void test_op( - int blocks, - int threads, - std::string op_str, - AtenFunc af, - JitFunc jf, - OutputPair op, - InputTuple it) { - static constexpr auto size = std::tuple_size::value; - test_op( - blocks, - threads, - op_str, - af, - jf, - op, - it, - std::make_index_sequence{}); -} - -TEST(NVFuserTest, FusionUnaryOps_CUDA) { - using OpTuple = - std::tuple; - - // [Note: explicit tuple type for uniform initialization list] - // Tuple type must be explicitly specified for each uniform initialization - // list within the vector to make this code compatible with some old env - // which we still need to support. eg. gcc 5.4 + cuda 9.2. 
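// (Concretely, each entry below is written as
// OpTuple{at::abs, UnaryOpType::Abs, "abs"} rather than as the bare braced
// list {at::abs, UnaryOpType::Abs, "abs"}, so every initializer carries an
// explicit type that those older toolchains can accept.)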
- std::vector ops{ - OpTuple{at::abs, UnaryOpType::Abs, "abs"}, - OpTuple{at::acos, UnaryOpType::Acos, "acos"}, - OpTuple{at::asin, UnaryOpType::Asin, "asin"}, - OpTuple{at::atan, UnaryOpType::Atan, "atan"}, - // There does not appear to be an appropriate ATen function for atanh - // OpTuple{at::atanh, UnaryOpType::Atanh, "atanh" }, - OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"}, - OpTuple{at::cos, UnaryOpType::Cos, "cos"}, - OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"}, - OpTuple{at::erf, UnaryOpType::Erf, "erf"}, - OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"}, - OpTuple{at::exp, UnaryOpType::Exp, "exp"}, - OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"}, - OpTuple{at::floor, UnaryOpType::Floor, "floor"}, - OpTuple{at::frac, UnaryOpType::Frac, "frac"}, - // OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"}, - OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"}, - OpTuple{at::log, UnaryOpType::Log, "log"}, - OpTuple{at::log10, UnaryOpType::Log10, "log10"}, - OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"}, - OpTuple{at::log2, UnaryOpType::Log2, "log2"}, - OpTuple{at::neg, UnaryOpType::Neg, "neg"}, - OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"}, - OpTuple{at::relu, UnaryOpType::Relu, "relu"}, - OpTuple{at::round, UnaryOpType::Round, "round"}, - OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"}, - OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"}, - OpTuple{at::sin, UnaryOpType::Sin, "sin"}, - OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"}, - OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"}, - OpTuple{at::tan, UnaryOpType::Tan, "tan"}, - OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"}, - OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"}}; - - std::vector dtypes = {DataType::Float, DataType::Double}; - - for (auto dtype : dtypes) { - std::for_each(ops.begin(), ops.end(), [&](OpTuple& op) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ std::get<2>(op), - /*Aten Func */ - [&op](std::array& vals) { - return std::get<0>(op)(vals[0].toTensor()); - }, - /*JIT Func */ - [&op](Val* in1) -> Val* { return unaryOp(std::get<1>(op), in1); }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - }); - - test_op( - /*blocks*/ 128, - /*threads*/ 64, - /*name*/ "rand_like", - /*Aten Func */ - [](std::array& vals) { - return at::rand_like(vals[0].toTensor()); - }, - /*JIT Func */ - [](Val* in1) -> Val* { return unaryOp(UnaryOpType::RandLike, in1); }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - } - - dtypes = {DataType::Int, DataType::Int32, DataType::Bool}; - for (auto dtype : dtypes) { - test_op( - /*blocks*/ 128, - /*threads*/ 64, - /*name*/ "bitwise_not", - /*Aten Func */ - [](std::array& vals) { - return at::bitwise_not(vals[0].toTensor()); - }, - /*JIT Func */ - [](Val* in1) -> Val* { return unaryOp(UnaryOpType::Not, in1); }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - } -} - -TEST(NVFuserTest, FusionBinaryOps_CUDA) { - using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); - using OpTuple = std::tuple; - - // see [Note: explicit tuple type for uniform initialization list] - std::vector logic_ops{ - OpTuple{at::eq, BinaryOpType::Eq, "eq"}, - OpTuple{at::ge, BinaryOpType::GE, "ge"}, - OpTuple{at::gt, BinaryOpType::GT, "gt"}, - OpTuple{at::le, BinaryOpType::LE, "le"}, - OpTuple{at::lt, 
BinaryOpType::LT, "lt"}, - OpTuple{at::ne, BinaryOpType::NE, "ne"}}; - std::vector dtypes = {DataType::Double, DataType::Float}; - - for (auto dtype : dtypes) { - std::for_each(logic_ops.begin(), logic_ops.end(), [&](OpTuple& op) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ std::get<2>(op), - /*Aten Func */ - [&op](std::array& vals) { - return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); - }, - /*JIT Func */ - [&op](Val* in1, Val* in2) -> Val* { - return binaryOp(std::get<1>(op), in1, in2); - }, - /*Output */ std::make_pair(ValType::TensorView, DataType::Bool), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - }); - - // see [Note: explicit tuple type for uniform initialization list] - std::vector math_ops{ - OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"}, - OpTuple{at::div, BinaryOpType::Div, "div"}, - OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"}, - OpTuple{at::max, BinaryOpType::Max, "max"}, - OpTuple{at::min, BinaryOpType::Min, "min"}, - OpTuple{at::mul, BinaryOpType::Mul, "mul"}, - OpTuple{at::pow, BinaryOpType::Pow, "pow"}, - // NOTE: Remainder does not match the Aten impl exactly - // despite using an identical function. - OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"}, - }; - - std::for_each(math_ops.begin(), math_ops.end(), [&](OpTuple& op) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ std::get<2>(op), - /*Aten Func */ - [&op](std::array& vals) { - return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor()); - }, - /*JIT Func */ - [&op](Val* in1, Val* in2) -> Val* { - return binaryOp(std::get<1>(op), in1, in2); - }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - }); - - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "add_alpha", - /*Aten Func */ - [](std::array& vals) { - return at::add( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); - }, - /*JIT Func */ static_cast(&add_alpha), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::Scalar, dtype))); - - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "sub_alpha", - /*Aten Func */ - [](std::array& vals) { - return at::sub( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar()); - }, - /*JIT Func */ static_cast(&sub_alpha), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::Scalar, dtype))); - } -} - -TEST(NVFuserTest, FusionTernaryOps_CUDA) { - std::vector dtypes = {DataType::Double, DataType::Float}; - - for (auto dtype : dtypes) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "clamp", - /*Aten Func */ - [](std::array& vals) { - return at::clamp(vals[0].toTensor(), 0.f, 1.f); - }, - /*JIT Func */ - [&](Val* in1) -> Val* { - if (dtype == DataType::Float) { - return clamp(in1, new Double(0.f), new Double(1.f)); - } else { - return clamp(in1, new Double(0.f), new Double(1.f)); - } - }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ 
"threshold", - /*Aten Func */ - [](std::array& vals) { - return at::threshold(vals[0].toTensor(), 0.f, 1.f); - }, - /*JIT Func */ - [&](Val* in1) -> Val* { - if (dtype == DataType::Float) { - return threshold(in1, new Double(0.f), new Double(1.f)); - } else { - return threshold(in1, new Double(0.f), new Double(1.f)); - } - }, - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple(std::make_pair(ValType::TensorView, dtype))); - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "where", - /*Aten Func */ - [](std::array& vals) { - return at::where( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); - }, - /*JIT Func */ static_cast(&where), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, DataType::Bool), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - } -} - -TEST(NVFuserTest, FusionCompoundOps_CUDA) { - std::vector dtypes = {DataType::Double, DataType::Float}; - - for (auto dtype : dtypes) { - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "lerp", - /*Aten Func */ - [](std::array& vals) { - return at::lerp( - vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor()); - }, - /*JIT Func */ static_cast(&lerp), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype))); - test_op( - /*blocks*/ 640, - /*threads*/ 64, - /*name*/ "addcmul", - /*Aten Func */ - [](std::array& vals) { - return at::addcmul( - vals[0].toTensor(), - vals[1].toTensor(), - vals[2].toTensor(), - vals[3].toScalar()); - }, - /*JIT Func */ - static_cast(&addcmul), - /*Output */ std::make_pair(ValType::TensorView, dtype), - /*Inputs Tuple*/ - std::make_tuple( - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::TensorView, dtype), - std::make_pair(ValType::Scalar, dtype))); - } -} - -TEST(NVFuserTest, FusionCastOps_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2, DataType::Half); - - TensorView* intrm1 = castOp(DataType::Float, tv0); - TensorView* out = castOp(DataType::Half, intrm1); - - fusion.addInput(tv0); - fusion.addOutput(out); - tv0->computeAt(out, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({1, 4}, options); - at::Tensor ref_output = at::empty_like(input1); - - std::array inputs = {input1}; - const at::ArrayRef input_ivalues(inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(input_ivalues); - - ref_output = at::_cast_Half(at::_cast_Double(input1)); - - TORCH_CHECK( - outputs[0].equal(ref_output), - "\nOp Type: -- ", - "cast FP16->FP32->FP16", - " -- had a mismatch.\n", - "\nABS MAX DIFF: ", - outputs[0].sub(ref_output).abs().max(), - "\n"); -} - -// Start off simple, block on the outer dim -// block stride + thread all reduce + unrolling on inner dim -TEST(NVFuserTest, FusionReduction1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, 
new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, 128); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, 4); - // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1] - // tv3[I0, R1oi{4}, Ir1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] - // tv1[I0, R1i{128}] = tv3[I0, R1oi{4}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv3, 1); - tv3->computeAt(tv1, 1); - - // Re do it all at once, because why not. - tv0->computeAt(tv1, 1); - - tv2->axis(2)->parallelize(ParallelType::Unroll); - tv1->axis(0)->parallelize(ParallelType::BIDx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 65000; - int numel_y = 1025; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - // switches to try some different scenarios. maybe we should iterate on all - // permutations. 
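// (Each flag below toggles one binding on the scheduled fusion: bind_bidx
// puts BIDx on tv1's outer iteration axis, bind_tidy puts TIDy on its inner
// split, bind_unroll applies Unroll to tv2's unroll-factor split, and
// bind_tidx puts TIDx on the innermost axis of tv1, tv2 and tv3. All are
// enabled here; disabling any of them exercises a different parallelization
// of the same fusion.)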
- bool bind_bidx = true; - bool bind_tidx = true; - bool bind_tidy = true; - bool bind_unroll = true; - - int numel_x = 1025; // Cannot exceed block dim max size / tidy - int numel_y = 129; - int tidx = 16; - int tidy = 8; - int unroll_factor = 4; - - tv1->split(1, tidx); - // tv1[I0, R1o, R1i{tidx}] = tv0[I0, I1] - - tv1->split(1, unroll_factor); - // tv1[I0, R1oo, R1oi{unroll}, R1i{tidx}] = tv0[I0, I1] - - tv1->split(0, tidy); - - TensorView* tv2 = tv1->rFactor({-3}); - // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] - // tv1[I0o, I0i{tidy}, R1oi{unroll}, R1i{tidx}] - - TensorView* tv3 = tv1->rFactor({-2}); - // tv2[I0, >R1oo<, Ir1oi{unroll}, Ir1i{tidx}] - // tv3[I0, R1oi{unroll}, Ir1i{tidx}] - // tv1[I0o, I0i{tidy}, R1i{tidx}] - - tv0->computeAt(tv1, -2); - - if (bind_unroll) - tv2->axis(-2)->parallelize(ParallelType::Unroll); - if (bind_bidx) - tv1->axis(0)->parallelize(ParallelType::BIDx); - if (bind_tidy) - tv1->axis(1)->parallelize(ParallelType::TIDy); - - if (bind_tidx) { - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction3_CUDA) { - // What if Z participates in the reduction with X? - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - int numel_x = 1025; // Cannot exceed block dim max size / tidy - int numel_y = 129; - int tidx = 16; - int tidz = 8; - - tv1->split(1, tidz); - // tv1[I0, R1o, R1i{tidz}] = tv0[I0, I1] - - tv1->split(1, tidx); - // tv1[I0, R1oo, R1oi{tidx}, R1i{tidz}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({-3}); - // tv2[I0, >R1oo<, Ir1oi{tidx}, Ir1i{tidz}] - // tv1[I0o, R1oi{tidx}, R1i{tidz}] - - tv0->computeAt(tv1, -3); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-2)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDz); - - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDz); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - auto aten_output = aten_input.to(at::kDouble).sum({1}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - TensorView* tv2 = add(tv0, tv1); - // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); - // tv3[I0, R1] = tv2[I0, I1] - - TensorView* tv4 = makeSymbolicTensor(1); - 
fusion.addInput(tv4); - - // tv5[I0] = tv3[I0, R1] * tv4[I0] - TensorView* tv5 = mul(tv3, tv4); - fusion.addOutput(tv5); - - int tidx = 16; - - // RFactor the reduction - tv3->split(1, tidx); - // tv3[I0, R1o, R1i{tidx}] = tv2[I0, I1] - - TensorView* tv6 = tv3->rFactor({-2}); - // tv6[I0, R1o, iR1i{tidx}] = tv2[I0, I1] - // tv3[I0, R1i{tidx}] = tv3[I0, I1] - tv2->computeAt(tv6, 2); - - // Compute at inline with tv5 (only 1D) - tv6->computeAt(tv3, 1); - tv3->computeAt(tv5, 1); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - - // Intermediate tensors only need this, but doesn't hurt to do on inputs - // tv0, 1, 4 - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 1025; - int numel_y = 129; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - at::Tensor t4 = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1, t4}); - - auto t2 = t0.add(t1); - auto t3 = t2.to(at::kDouble).sum({1}); - auto aten_output = t3.mul(t4); - - testValidate( - &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - int bidy = 2; - int tidy = 4; - int tidx = 5; - - int dim1 = 11; - - tv1->split(-2, tidy); - - TensorView* tv2 = tv1->rFactor({-3}); - - tv0->computeAt(tv1, 1); - tv1->axis(0)->parallelize(ParallelType::BIDy); - - for (auto* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - val->as()->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - tv2->axis(-2)->parallelize(ParallelType::TIDy); - tv1->axis(-2)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({bidy, dim1, tidx}, options); - - at::Tensor cg_output = at::empty({bidy, tidx}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduction6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 64; - const int bdimy = 8; - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - fusion.addInput(tv0); - - // tv1[I0, R1, R2] = tv0[I0, I1, I2] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(2, bdimx); - // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] - tv1->split(1, bdimy); - // tv1[I0, R1o, R1i{8}, R2o, R2i{128}] = tv0[I0, I1, I2] - - TensorView* tv2 = tv1->rFactor({3}); - // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] - // tv1[I0, R1o, R1i{8}, R2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2] - // tv3[I0, R1o, I1i{8}, I2i{128}] = tv2[I0, I1o, 
I1i{8}, R2o, I2i{128}] - // tv1[I0, R1i{8}, R2i{128}] = tv3[I0, R1o, I1i{8}, I2i{128}] - - tv3->computeAt(tv1, 1); - tv2->computeAt(tv3, 2); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::TIDy); - tv3->axis(-2)->parallelize(ParallelType::TIDy); - tv2->axis(-3)->parallelize(ParallelType::TIDy); - - int numel_x = 650; - int numel_y = 1000; - int numel_z = 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1, 2}); - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultiGridReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = max(tv0, {0}); - TensorView* tv2 = sum(tv0, {0}); - - fusion.addOutput(tv1); - fusion.addOutput(tv2); - - int numel_x = 4; - int numel_y = 2; - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - std::vector aten_outputs = { - std::get<0>(input.to(at::kDouble).max(0)), input.to(at::kDouble).sum(0)}; - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultiGridReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {0}); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(0)->parallelize(ParallelType::BIDy); - - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionReductionTFT_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - - fusion.addOutput(tv1); - - int numel_x = 1025; - int numel_y = 129; - int tidx = 16; - int tidy = 8; - int tidz = 8; - - tv1->split(1, tidx); - // tv1[I0, R1o, R1i{tidx}] - - tv1->split(1, tidz); - // tv1[I0, R1oo, R1Oi{tidz}, R1R1i{tidx}] - - tv1->split(0, tidy); - // tv1[I0o, I0i, R1oo, R1Oi{tidz}, R1R1i{tidx}] - - TensorView* tv2 = tv1->rFactor({2}); - // tv2[I0o, I0i, R1oo, I1Oi{tidz}, I11i{tidx}] - // tv1[I0o, I0i, R1Oi{tidz}, R1R1i{tidx}] - - tv2->computeAt(tv1, 2); - - tv1->axis(1)->parallelize(ParallelType::TIDy); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::TIDz); - tv2->axis(-2)->parallelize(ParallelType::TIDz); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionOuterSplit_CUDA) { - // based off FusionReduction4 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - TensorView* tv2 = add(tv0, tv1); - // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1] - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); - // tv3[I0, R1] = tv2[I0, I1] - - TensorView* tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - - // tv5[I0] = tv3[I0, R1] * tv4[I0] - TensorView* tv5 = mul(tv3, tv4); - fusion.addOutput(tv5); - - // RFactor the reduction - tv3->split(1, 16, false); - // tv3[I0, R1o{16}, R1i{tidx}] = tv2[I0, I1] - - TensorView* tv6 = tv3->rFactor({-2}); - // tv6[I0, R1o{16}, iR1i{tidx}] = tv2[I0, I1] - // tv3[I0, R1i{tidx}] = tv3[I0, I1] - tv2->computeAt(tv6, 2); - - // Compute at inline with tv5 (only 1D) - tv6->computeAt(tv3, 1); - tv3->computeAt(tv5, 1); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - - // Intermediate tensors only need this, but doesn't hurt to do on inputs - // tv0, 1, 4 - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 1025; - int numel_y = 129; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - at::Tensor t4 = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1, t4}); - - auto t2 = t0.add(t1); - auto t3 = t2.to(at::kDouble).sum({1}); - auto aten_output = t3.mul(t4); - - testValidate( - &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBranches_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - TensorView* tv2 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - - auto tv3 = add(tv0, new Double(1.0)); - auto tv4 = add(tv3, tv1); - auto tv5 = add(tv3, tv2); - auto tv6 = add(tv4, tv5); - - fusion.addOutput(tv6); - - constexpr int x = 63, y = 33; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t1 = at::randn({x, y}, options); - at::Tensor t2 = at::randn({x, y}, options); - - FusionExecutor fe; - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv6, 1); - tv1->computeAt(tv6, 1); - tv2->computeAt(tv6, 1); - - tv3->axis(-2)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::Unroll); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-2)->parallelize(ParallelType::Unroll); - 
tv5->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - std::vector aten_inputs = {t0, t1, t2}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t3 = t0.add(1.0); - auto t4 = t3.add(t1); - auto t5 = t3.add(t2); - auto aten_output = t4.add(t5); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1.5)); - - TensorView* tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - TensorView* tv3 = makeSymbolicTensor(2); - fusion.addInput(tv3); - TensorView* tv4 = sub(tv2, tv3); - - TensorView* tv5 = broadcast(tv1, {false, false, true}); - TensorView* tv6 = broadcast(tv4, {true, false, false}); - - TensorView* tv7 = add(tv5, tv6); - fusion.addOutput(tv7); - - tv7->split(-1, 4); - tv7->split(0, 8); - - tv0->computeAt(tv7, -1); - tv2->computeAt(tv7, -1); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int x = 63, y = 33, z = 15; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t1 = t0.add(1.5); - - at::Tensor t2 = at::randn({y, z}, options); - at::Tensor t3 = at::randn({y, z}, options); - - at::Tensor t4 = t2.sub(t3); - at::Tensor t5 = t1.unsqueeze(-1).expand({x, y, z}); - - at::Tensor t6 = t4.expand({x, y, z}); - - at::Tensor aten_output = t5.add(t6); - - std::vector aten_inputs = {t0, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, tv1); - - TensorView* tv3 = broadcast(tv2, {false, false, true}); - - TensorView* tv4 = makeSymbolicTensor(2); - fusion.addInput(tv4); - - TensorView* tv5 = sub(tv4, new Double(0.1)); - - TensorView* tv6 = broadcast(tv5, {true, false, false}); - - TensorView* tv7 = add(tv3, tv6); - - fusion.addOutput(tv7); - - tv7->merge(0, 1); - - tv0->computeAt(tv7, -1); - tv4->computeAt(tv7, -1); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - tv7->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int x = 63, y = 33, z = 15; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t1 = at::randn({x, y}, options); - at::Tensor t2 = t0.add(t1); - at::Tensor t3 = t2.unsqueeze(-1).expand({x, y, z}); - - at::Tensor t4 = at::randn({y, z}, options); - at::Tensor t5 = t4.sub(0.1); - at::Tensor t6 = t5.expand({x, y, z}); - at::Tensor aten_output = t3.add(t6); - - at::Tensor cg_output = at::empty({x, y, z}, options); - - std::vector aten_inputs = {t0, t1, t4}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your 
input tensor views - std::vector dom; - dom.push_back(new IterDomain(new Int(0), new Int())); - dom.push_back(new IterDomain( - new Int(0), - new Int(1), - ParallelType::Serial, - IterType::BroadcastWithStride)); - - // tv0[I1, B{1}] - TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float); - fusion.addInput(tv0); - - // tv1[I0, I1, I2] - TensorView* tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - - TensorView* tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->merge(0); - - tv0->computeAt(tv3, -1); - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - constexpr int x = 2, y = 3, z = 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({y, 1}, options); - at::Tensor t2 = at::randn({x, y, z}, options); - auto aten_output = t0.add(t2); - - std::vector aten_inputs = {t0, t2}; - at::Tensor cg_output = at::empty({x, y, z}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - std::vector dom; - dom.push_back(new IterDomain( - new Int(0), - new Int(1), - ParallelType::Serial, - IterType::BroadcastWithStride)); - dom.push_back(new IterDomain(new Int(0), new Int())); - TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float); - - TensorView* tv1 = makeSymbolicTensor(3); - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv3 = add(tv0, tv1); - - tv3->merge(0); - tv3->merge(0); - tv3->split(0, 128); - tv3->split(0, 4); - - fusion.addOutput(tv3); - - tv0->computeAt(tv3, -1); - tv1->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-2)->parallelize(ParallelType::Unroll); - - constexpr int x = 63, y = 33, z = 15; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({1, z}, options); - at::Tensor t1 = at::randn({x, y, z}, options); - - auto aten_output = t0.add(t1); - - at::Tensor cg_output = at::empty({x, y, z}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleBCast5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int m = 2, k = 3, n = 4; - - auto zero = new Int(0); - auto M = new IterDomain(zero, new Int(m)); - auto K = new IterDomain(zero, new Int(k)); - auto N = new IterDomain(zero, new Int(n)); - - // Set up your input tensor views - TensorView* tv0 = - new TensorView(new TensorDomain({M, K}, {true, true}), DataType::Float); - // Note: IterDomain must not be reused, so K needs to be cloned. 
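 - // (That is, the same IterDomain object is not shared between the two TensorDomains here;
 - // the common inner extent K is duplicated via K->clone() for the second tensor below
 - // rather than being passed twice.)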
- TensorView* tv1 = new TensorView( - new TensorDomain({K->clone(), N}, {true, true}), DataType::Float); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv2 = broadcast(tv0, {false, false, true}); - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - TensorView* tv4 = add(tv2, tv3); - - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(0); - - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({m, k}, options); - at::Tensor t1 = at::randn({k, n}, options); - - auto t2 = t0.unsqueeze(-1).expand({m, k, n}); - auto t3 = t1.expand({m, k, n}); - auto aten_output = t2.add(t3); - - at::Tensor cg_output = at::empty({m, k, n}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComplexBCast1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int x = 2, y = 3, z = 4; - - auto tv0 = makeConcreteTensor({y}); - auto tv1 = div(tv0, new Double(2.0)); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = makeConcreteTensor({y, z}); - auto tv4 = mul(tv2, tv3); - auto tv5 = broadcast(tv4, {true, false, false}); - auto tv6 = makeConcreteTensor({x, y, z}); - auto tv7 = add(tv5, tv6); - - // tv0[ i1 ] = input - // tv1[ i1 ] = tv0/2.0 - // tv2[ i1, b2] = bcast(tv1) - // tv3[ i1, i2] = input - // tv4[ i1, i2] = tv2 * tv3 - // tv5[b0, i1, i2] = bcast(tv4) - // tv6[i0, i1, i2] = input - // tv7[i0, i1, i2] = tv5 + tv6 - - // tv4 = bcast(tv1) * tv3 - // tv7 = bcast(tv4) + tv6 - - fusion.addInput(tv0); - fusion.addInput(tv3); - fusion.addInput(tv6); - - fusion.addOutput(tv7); - - tv7->merge(0); - tv7->merge(0); - tv0->computeAt(tv7, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({y}, options); - at::Tensor t3 = at::randn({y, z}, options); - at::Tensor t6 = at::randn({x, y, z}, options); - - auto t4 = t0.div(2.0).unsqueeze(-1).expand({y, z}) * t3; - auto aten_output = t4.unsqueeze(0).expand({x, y, z}) + t6; - - std::vector aten_inputs = {t0, t3, t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComplexBCast2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int x = 2, y = 3, z = 4; - - auto tv0 = makeConcreteTensor({y, z}); - auto tv1 = div(tv0, new Double(2.0)); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = makeConcreteTensor({x, y}); - auto tv5 = add(tv3, tv4); - - // tv0[ i1, i2] = input - // tv1[ i1, i2] = tv0/2.0 - // tv2[ i1 ] = sum(tv1, 1) - // tv3[b0, i1 ] = bcast(tv2) - // tv4[i0, i1 ] = input - // tv5[i0, i1 ] = tv3 + tv4 - - // tv2 = sum(tv0/2.0, 1) - // tv5 = bcast(tv2) + tv4 - - fusion.addInput(tv0); - fusion.addInput(tv4); - - fusion.addOutput(tv5); - - tv5->merge(0); - tv0->computeAt(tv5, -1); - tv1->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({y, z}, options); - at::Tensor t4 = at::randn({x, y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t4}); - - auto t1 = t0.div(2.0); - auto t2 = t1.to(at::kDouble).sum(1); - auto t3 = 
t2.unsqueeze(0).expand({x, y}); - auto aten_output = t3.add(t4); - - testValidate( - &fusion, {cg_outputs}, {t0, t4}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1.0)); - auto tv3 = broadcast(tv2, {true, false, false, false}); - auto tv4 = add(tv3, tv1); - - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(0); - tv4->merge(0); - - tv4->split(0, 128); - tv4->split(0, 4); - - tv2->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::Unroll); - tv4->axis(2)->parallelize(ParallelType::TIDx); - - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - auto t3 = t0.add(1.0); - auto aten_output = t3.add(t1); - - std::vector aten_inputs = {t0, t1}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1.0)); - auto tv3 = broadcast(tv2, {true, false, false, false}); - auto tv4 = add(tv3, tv1); - - fusion.addOutput(tv4); - - tv4->merge(-2); - tv4->merge(-2); - tv4->merge(-2); - - tv4->split(0, 128); - tv4->split(0, 4); - - tv2->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::Unroll); - tv4->axis(2)->parallelize(ParallelType::TIDx); - - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - auto t3 = t0.add(1.0); - auto aten_output = t3.add(t1); - - std::vector aten_inputs = {t0, t1}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1.0)); - auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - - auto t2 = t0.add(1.0); - auto aten_output = t2.add(t1); - - std::vector aten_inputs = {t0, t1}; - - auto lparams = 
schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({4, 8}); - fusion.addInput(tv0); - TensorView* tv1 = makeConcreteTensor({4, 4, 8}); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, new Double(1)); - TensorView* tv3 = broadcast(tv2, {true, false, false}); - TensorView* tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({4, 8}, options); - at::Tensor t1 = at::randn({4, 4, 8}, options); - - auto t2 = t0.add(1.0); - auto aten_output = t2.add(t1); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(3); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, new Double(1)); - TensorView* tv3 = broadcast(tv2, {true, false, true}); - TensorView* tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv3->merge(0)->merge(0)->split(0, 2)->split(0, 3); - tv4->merge(0)->merge(0)->split(0, 2)->split(0, 3); - - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({7}, options); - at::Tensor t1 = at::randn({5, 7, 11}, options); - - auto t2 = t0.add(1.0); - auto aten_output = t2.unsqueeze(-1).add(t1); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector tensor0_shape{7, 4, 7}; - std::vector tensor1_shape{4, 7}; - - TensorView* tv0 = makeSymbolicTensor(tensor0_shape.size()); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(tensor1_shape.size()); - fusion.addInput(tv1); - - TensorView* tv2 = add(tv0, tv1); - TensorView* tv3 = sum(tv2, {0, 1}); - fusion.addOutput(tv3); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn(tensor0_shape, options); - at::Tensor input1 = at::randn(tensor1_shape, options); - - std::vector reduction_axes{0, 1}; - auto reduction_params = getReductionHeuristics(&fusion, {input0, input1}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = - fe.runFusion({input0, input1}, reduction_params.value().lparams); - - auto aten_output = input0.add(input1).to(at::kDouble).sum(reduction_axes); - - testValidate( - &fusion, - cg_outputs, - {input0, input1}, - {aten_output}, - __LINE__, - __FILE__, - "", - reduction_params.value().lparams); -} - -TEST(NVFuserTest, FusionAdvancedIndexing7_CUDA) { - // Might 
be able to use this one without 6 as the heuristics in 6 may change - // and this test is to cover the same issue. - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - - auto tv3 = add(tv1, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv4->merge(0, 1); - tv4->split(0, 128); - tv4->split(0, 4); - - auto tv5 = tv4->rFactor({0, 1}); - - tv5->computeAt(tv4, -1); - tv0->computeAt(tv5, -1); - - tv4->axis(0)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_t0 = at::randn({numel_x}, options); - auto at_t1 = at::randn({numel_x, numel_y}, options); - - auto cg_outputs = fe.runFusion({at_t0, at_t1}); - - auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) - .to(at::kDouble) - .sum(); - - testValidate( - &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing8_CUDA) { - // Same as 7 but with outer splits instead of inner - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - - auto tv3 = add(tv1, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv4->merge(0, 1); - tv4->split(0, 128, false); - tv4->split(0, 4, false); - - auto tv5 = tv4->rFactor({0, 1}); - - tv5->computeAt(tv4, -1); - tv0->computeAt(tv5, -1); - - tv4->axis(0)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_t0 = at::randn({numel_x}, options); - auto at_t1 = at::randn({numel_x, numel_y}, options); - - auto cg_outputs = fe.runFusion({at_t0, at_t1}); - - auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) - .to(at::kDouble) - .sum(); - - testValidate( - &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing9_CUDA) { - // Same as 7 but with outer splits instead of inner - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - - auto tv2 = mul(tv1, new Double(2)); - fusion.addOutput(tv2); - - auto tv3 = makeSymbolicTensor(3); - fusion.addInput(tv3); - - auto tv4 = add(tv3, tv2); - fusion.addOutput(tv4); - - const int numel_x = 200; - const int numel_y = 300; - const int numel_z = 400; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_t0 = at::randn({numel_y}, options); - auto at_t3 = at::randn({numel_x, numel_y, numel_z}, options); - std::vector aten_inputs = {at_t0, at_t3}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - auto at_t1 = at_t0.unsqueeze(-1); - auto at_t2 = at_t1.mul(2.0); - - auto at_t4 = at_t3.add(at_t2); - - testValidate( - &fusion, cg_outputs, aten_inputs, {at_t2, at_t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedIndexing10_CUDA) { - Fusion fusion; - FusionGuard 
fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeContigTensor(2); - TensorView* tv1 = makeContigTensor(2); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - auto tv0_cache = tv0->cache_after(); - auto tv1_cache = tv1->cache_after(); - - std::vector tvs = {tv0_cache, tv1_cache, tv2, tv3}; - - for (auto tv : tvs) { - tv->split(1, 2, false); - tv->split(1, 1); - tv->split(-1, 4); - // [I0, 2, 1, I1/2/4, 4] - tv->reorder({{1, 2}, {2, 3}, {3, 1}}); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::TIDx); - } - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv0_cache->axis(-1)->parallelize(ParallelType::Vectorize); - tv1_cache->axis(-1)->parallelize(ParallelType::Vectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({64, 128}, options); - at::Tensor input2 = at::rand_like(input1); - at::Tensor output = at::empty_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input1, input2}, {output}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionAdvancedIndexing11_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto tv0 = makeSymbolicTensor(4); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv1, new Double(1.0)); - auto tv3 = broadcast(tv2, {true, false, true, true}); - auto tv4 = add(tv3, tv0); - - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(1); - - tv4->split(1, 32); - tv4->split(0, 1); - - tv4->reorder({{2, 1}}); - - tv2->computeAt(tv4, 3); - - tv2->setMemoryType(MemoryType::Global); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::BIDy); - tv4->axis(2)->parallelize(ParallelType::Unswitch); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - - at::Tensor t0 = at::randn({w, x, y, z}, options); - at::Tensor t1 = at::randn({x}, options); - - auto t3 = t1.add(1.0).unsqueeze(-1).unsqueeze(-1); - auto aten_output = t3.add(t0); - - std::vector aten_inputs = {t0, t1}; - - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -// Intended to stress the lowering of our code generator -TEST(NVFuserTest, FusionAdvancedLowering1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({9, 5}); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); - TensorView* tv4 = sum(tv3, {1}); - - fusion.addOutput(tv2); - fusion.addOutput(tv4); - - tv4->split(1, 4); - auto tv5 = tv4->rFactor({2}); - - tv1->computeAt(tv5, 2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(1); - at::Tensor 
aten_input = at::randn({9, 5}, options); - - auto t1 = aten_input.add(1.0); - auto t2 = t1.add(2.0); - auto t3 = t1.add(3.0); - auto t4 = t3.sum(1); - - std::vector aten_outputs = {t2, t4}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedLowering2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Progressively broadcast tensors - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - - TensorView* tv3 = add(tv0, new Double(1)); - TensorView* tv4 = broadcast(tv3, {false, true}); - TensorView* tv5 = add(tv4, tv1); - TensorView* tv6 = add(tv5, tv2); - - fusion.addOutput(tv6); - - // Split inner dimension - tv6->split(1, 4); - // Merge middle dims with outer dimensions - tv6->merge(2); - tv6->merge(0); - - // tv6[I0*I1o, I1i*I2] - - // Compute everything inline - tv0->computeAt(tv6, -1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv6->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - int x = 13, y = 9, z = 5; - at::Tensor t0 = at::randn({y}, options); - at::Tensor t1 = at::randn({y, z}, options); - at::Tensor t2 = at::randn({x, y, z}, options); - - auto t3 = t0.add(1.0); - auto t4 = t3.unsqueeze(-1); - auto t5 = t4.add(t1); - auto t6 = t5.add(t2); - - std::vector aten_inputs = {t0, t1, t2}; - std::vector aten_outputs = {t6}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// TODO: Complete test -TEST(NVFuserTest, FusionAdvancedLowering3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeConcreteTensor({1, -1}); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - // [b0, i1] - auto tv2 = add(tv0, new Double(2.0)); - - // [i0, i1] - auto tv3 = add(tv1, new Double(3.0)); - - // [b0, i1] - auto tv4 = add(tv2, new Double(4.0)); - - // [io, i1] - auto tv5 = add(tv2, tv3); - - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - tv0->computeAt(tv4, -1); - - tv3->setMemoryType(MemoryType::Global); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - int x = 13, y = 9; - at::Tensor t0 = at::randn({1, y}, options); - at::Tensor t1 = at::randn({x, y}, options); - - auto t4 = t0 + 2 + 4; - auto t5 = t0 + 2 + t1 + 3; - - std::vector aten_inputs = {t0, t1}; - std::vector aten_outputs = {t4, t5}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// This excercises indexing with broadcast root axes. Non-broadcast -// axes need to be preferred when propagating index exprs to root -// axes. See, e.g., Index::getConsumerIndex_impl. 
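
As an editorial illustration (not part of the original test file) of the pattern the next test exercises: the 1D input is broadcast across two trailing axes, so those root axes have extent 1 and their indices must be taken from the matching non-broadcast axes of the 3D operand. A minimal ATen sketch using the same bx/by/bz constants as the test (assumes <ATen/ATen.h> is available):

// Hypothetical standalone sketch; the fusion test below validates against the same expression.
static void broadcastRootAxesSketch() {
  at::Tensor t0 = at::randn({10});          // [bx]
  at::Tensor t3 = at::randn({10, 20, 30});  // [bx, by, bz]
  // Two broadcast root axes are introduced on t0, then expanded against t3.
  at::Tensor ref =
      t0.unsqueeze(-1).expand({10, 20}).unsqueeze(-1).expand({10, 20, 30}) + t3;
  (void)ref;
}
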
-TEST(NVFuserTest, FusionAdvancedLowering4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = broadcast(tv1, {false, false, true}); - auto tv3 = makeSymbolicTensor(3); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv4->merge(1)->merge(0); - tv4->split(0, 8); - tv0->computeAt(tv4, 1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 10; - const int by = 20; - const int bz = 30; - at::Tensor t0 = at::randn({bx}, options); - at::Tensor t3 = at::randn({bx, by, bz}, options); - std::vector aten_inputs = {t0, t3}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = - t0.unsqueeze(-1).expand({bx, by}).unsqueeze(-1).expand({bx, by, bz}) + t3; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedLowering5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({5, 4, 3}); - fusion.addInput(tv0); - - TensorView* tv1 = makeConcreteTensor({5, 3}); - fusion.addInput(tv1); - - auto tv2 = broadcast(tv1, {false, true, false}); - - auto tv3 = add(tv0, tv2); - - fusion.addOutput(tv3); - - tv2->merge(0); - tv1->computeAt(tv2, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(1); - at::Tensor t0 = at::randn({5, 4, 3}, options); - at::Tensor t1 = at::randn({5, 3}, options); - auto t2 = t1.unsqueeze(1); - auto t3 = t0 + t2; - - std::vector aten_inputs = {t0, t1}; - std::vector aten_outputs = {t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedLowering6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({5, 4, 3}); - fusion.addInput(tv0); - auto tv1 = makeConcreteTensor({4}); - fusion.addInput(tv1); - auto tv2 = unaryOp(UnaryOpType::Set, tv0); - auto tv3 = unaryOp(UnaryOpType::Set, tv1); - - auto tv4 = sum(tv2, {0, 2}); - auto tv5 = add(tv4, tv3); - fusion.addOutput(tv5); - - auto tv6 = broadcast(tv3, {true, false, true}); - auto tv7 = add(tv2, tv6); - fusion.addOutput(tv7); - - tv2->computeAt(tv4, -1, ComputeAtMode::BestEffort); - tv3->computeAt(tv7, -1, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(1); - at::Tensor t0 = at::randn({5, 4, 3}, options); - at::Tensor t1 = at::randn({4}, options); - - auto t2 = t0; - auto t3 = t1; - - std::vector reduction_axes{0, 2}; - auto t4 = t2.sum(reduction_axes); - auto t5 = add(t4, t3); - auto t6 = t3.unsqueeze(0).unsqueeze(-1); - auto t7 = t2.add(t6); - - std::vector aten_inputs = {t0, t1}; - std::vector aten_outputs = {t5, t7}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// Test a simple Gemm but also play around with fusion executor features -TEST(NVFuserTest, FusionSimpleGemm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); // M, K - TensorView* tv1 = makeSymbolicTensor(2); // K, N - fusion.addInput(tv0); - 
fusion.addInput(tv1); - - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // tv2[I0, I1, B] = tv0[I0, I1] - - TensorView* tv3 = broadcast(tv1, {true, false, false}); - // tv3[B, I1, I2] = tv1[I1, I2] - - // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2] - TensorView* tv4 = mul(tv2, tv3); - // tv5[I0, R1, I2] = tv4[I0, I1, I2] - TensorView* tv5 = sum(tv4, {1}); - fusion.addOutput(tv5); - - tv5->split(1, 32); - // tv5[I0, R1o, R1i{32}, I2] - - auto tv6 = tv5->rFactor({1}); - // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2] - // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2] - - tv5->split(0, 4); - tv5->split(-1, 4); - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - - tv0->computeAt(tv5, -1); - tv1->computeAt(tv5, -1); - - // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] - // tv5[I0o, I0i{4}, , R1i{32}, I2o, I2i{4}] - //--> (line symbolizes compute at location) - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv0->computeAt(tv6, -1); - tv1->computeAt(tv6, -1); - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::TIDz); - - tv5->axis(-2)->parallelize(ParallelType::BIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - - tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(2)->parallelize(ParallelType::TIDx); - - constexpr int M = 65, K = 33, N = 17; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); - - // Make sure bad launch params throws - // TODO: Re-enable once we have parallelization validation in. - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); - - // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); - - auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble)); - - testValidate( - &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 1D tensor. Parallelized only with a single thread block. -TEST(NVFuserTest, FusionSoftmax1D_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 128; - const int dimx = 1000; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(1); - fusion.addInput(input_tv0); - - TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0); - TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); - TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. 
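 - // (In this test the corresponding tensors are exp_tv1, exp_tv1_copy, and sum_exp_rf_tv5;
 - // the tv4/tv8 names appear to be carried over from the normalized softmax tests below.)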
- TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0); - - TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); - - fusion.addOutput(output_tv4); - - bcast_sum_tv3->split(0, tidx); - - sum_exp_tv2->split(-1, tidx); - TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); - - output_tv4->split(-1, tidx); - - exp_tv1->computeAt(sum_exp_rf_tv5, -1); - exp_tv1_copy->computeAt(output_tv4, -1); - - TensorView* tensors_to_parallelize[] = { - sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({dimx}, options); - at::Tensor cg_output = at::empty({dimx}, options); - at::Tensor t3_output = at::empty_like(cg_output, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({t0}, {cg_output}); - - auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); - - testValidate(&fusion, {cg_output}, {t0}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 1D tensor with input normalization. -TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 128; - const int dimx = 1000; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(1); - fusion.addInput(input_tv0); - - // Normalize with the max value before computing exp. - TensorView* max_val_tv1 = - reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0); - TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {true}); - TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); - TensorView* sum_exp_tv5 = sum(exp_tv4, {-1}); - TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. - TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy); - - TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6); - - fusion.addOutput(output_tv7); - bcast_max_tv2->split(0, tidx); - bcast_sum_tv6->split(0, tidx); - - max_val_tv1->split(-1, tidx); - TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2}); - - sum_exp_tv5->split(-1, tidx); - TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2}); - - output_tv7->split(-1, tidx); - - sub_tv3->computeAt(sum_exp_rf_tv9, -1); - sub_tv3_copy->computeAt(output_tv7, -1); - - TensorView* tensors_to_parallelize[] = { - max_val_tv1, - bcast_max_tv2, - sum_exp_tv5, - bcast_sum_tv6, - output_tv7, - max_val_rf_tv8, - sum_exp_rf_tv9}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx}, options); - at::Tensor t3_output = at::empty({dimx}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 3D tensor, where the inner-most 3rd dimension is -// normalized. Pallelized with multiple thread blocks. 
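
As an editorial aside (not from the original file): the input normalization used by the *Normalized softmax tests in this file is the standard numerically stable softmax. Subtracting the row max before exp() leaves the result unchanged mathematically while keeping exp() from overflowing. A minimal ATen sketch, assuming <ATen/ATen.h> is available:

static void stableSoftmaxSketch() {
  at::Tensor x = at::randn({1000});
  at::Tensor shifted = x - x.max();  // normalize with the max value before computing exp
  at::Tensor e = shifted.exp();
  at::Tensor ref = e / e.sum();      // matches at::_softmax(x, -1, false) up to rounding
  (void)ref;
}
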
-TEST(NVFuserTest, FusionSoftmax3D_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 32; - const int dimx = 32; - const int dimy = 16; - const int dimz = 130; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - - TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0); - TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); - TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. - TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0); - - TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); - - fusion.addOutput(output_tv4); - - bcast_sum_tv3->split(-1, tidx); - - sum_exp_tv2->split(-1, tidx); - TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); - - output_tv4->split(-1, tidx); - - exp_tv1->computeAt(sum_exp_rf_tv5, -1); - exp_tv1_copy->computeAt(output_tv4, -1); - - TensorView* tensors_to_parallelize[] = { - sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx, dimy, dimz}, options); - - at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Softmax with a 3D tensor with input normalization. -TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 32; - const int dimx = 32; - const int dimy = 16; - const int dimz = 130; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - - // Normalize with the max value before computing exp. - TensorView* max_val_tv1 = - reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0); - TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {false, false, true}); - TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); - TensorView* sum_exp_tv5 = sum(exp_tv4, {-1}); - TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {false, false, true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. 
- TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2); - TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy); - - TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6); - - fusion.addOutput(output_tv7); - - bcast_max_tv2->split(-1, tidx); - bcast_sum_tv6->split(-1, tidx); - - max_val_tv1->split(-1, tidx); - TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2}); - - sum_exp_tv5->split(-1, tidx); - TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2}); - - output_tv7->split(-1, tidx); - - sub_tv3->computeAt(sum_exp_rf_tv9, -1); - sub_tv3_copy->computeAt(output_tv7, -1); - - TensorView* tensors_to_parallelize[] = { - max_val_tv1, - bcast_max_tv2, - sum_exp_tv5, - bcast_sum_tv6, - output_tv7, - max_val_rf_tv8, - sum_exp_rf_tv9}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx, dimy, dimz}, options); - at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - - auto tv3 = add(tv0, new Double(1.0)); - - auto tv4 = mul(tv2, tv3); - - auto tv5 = sum(tv4, {1}); - auto tv6 = broadcast(tv5, {false, true}); - - auto tv7 = sub(tv6, tv4); - fusion.addOutput(tv7); - - tv1->computeAt(tv7, 1); - ASSERT_ANY_THROW(tv1->computeAt(tv7, -1)); -} - -// Similar to FusionReduction but uses grid reduction -TEST(NVFuserTest, FusionGridReduction1_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - const int gdimx = 32; - const int bdimx = 128; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, bdimx); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, gdimx); - // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - // Re do it all at once, because why not. 
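 - // (The single call below redoes the placement in one step; the two incremental computeAt
 - // calls above are kept so intermediate state can be printed for debugging, as noted.)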
- tv0->computeAt(tv1, 1); - - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::BIDx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 10000; - int numel_y = 65000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Same test as the above but uses BIDy and TIDx for reduction -TEST(NVFuserTest, FusionGridReduction2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - const int gdimy = 32; - const int bdimx = 128; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, bdimx); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, gdimy); - // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - // Re do it all at once, because why not. - tv0->computeAt(tv1, 1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(2)->parallelize(ParallelType::BIDy); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 10000; - int numel_y = 65000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1}); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Same test but uses BIDy and BIDz for reduction. No TID used. 
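
Editorial sketch (not part of the original tests) of the scheduling recipe the grid-reduction tests above and below share, using the same nvFuser API with arbitrary split factors: split the reduction axis twice, rFactor the outermost serial chunk, then bind the remaining reduction axes to grid and block dimensions.

static void gridReductionRecipeSketch() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  // tv1[I0, R1] = sum(tv0[I0, I1], {1})
  TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0);
  fusion.addOutput(tv1);

  tv1->split(1, 128);                   // tv1[I0, R1o, R1i{128}]
  tv1->split(1, 32);                    // tv1[I0, R1oo, R1oi{32}, R1i{128}]
  TensorView* tv2 = tv1->rFactor({1});  // tv2 keeps the serial R1oo partial sums

  tv0->computeAt(tv1, 1);

  tv1->axis(0)->parallelize(ParallelType::BIDy);
  tv1->axis(1)->parallelize(ParallelType::BIDx);  // cross-block (grid) reduction axis
  tv2->axis(2)->parallelize(ParallelType::BIDx);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
}

After the rFactor, tv2 carries the serial partial sums over R1oo, so tv1 only has to reduce the BIDx-bound and TIDx-bound axes; the BIDx-bound reduction axis is what requires the cross-block (grid) reduction.
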
-TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { - // Grid reductions when there aren't any threads are serial reductions - // keep these numbers low so our error isn't too high compared to normal cuda - // reductions - const int gdimz = 15; - const int gdimy = 9; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, gdimy); - // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] - tv1->split(1, gdimz); - // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1] - // tv1[I0, R1oi{32}, R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - // Re do it all at once, because why not. - tv0->computeAt(tv1, 1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDz); - tv2->axis(2)->parallelize(ParallelType::BIDz); - tv1->axis(-1)->parallelize(ParallelType::BIDy); - tv2->axis(-1)->parallelize(ParallelType::BIDy); - - int numel_x = 100; - int numel_y = 6500; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0 -TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { - // Grid reductions when there aren't any threads are serial reductions - // keep these numbers low so our error isn't too high compared to normal cuda - // reductions - const int gdimz = 15; - const int gdimy = 9; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[R0, I1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {0}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(0, gdimy); - // tv1[R0o, R0i{128}, I1] = tv0[I0, I1] - tv1->split(0, gdimz); - // tv1[R0oo, R0oi{32}, R0i{128}, I1] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({0}); - // tv2[R0oo, I0oi{32}, I0i{128}, I1] = tv0[I0, I1] - // tv1[ R0oi{32}, R0i{128}, I1] = tv2[R0oo, I0oi{32}, I0i{128}, I1] - - // Note that computeAt isn't going to make anything better as there - // is no dynamically sized dimension. 
- - // Map parallelism as [Serial, BIDz, BIDy, BIDx] - tv1->axis(-1)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::BIDx); - tv1->axis(-2)->parallelize(ParallelType::BIDy); - tv2->axis(-2)->parallelize(ParallelType::BIDy); - tv1->axis(-3)->parallelize(ParallelType::BIDz); - tv2->axis(-3)->parallelize(ParallelType::BIDz); - - int numel_x = 6500; - int numel_y = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({0}); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// This is similar to the FusionReduction, but swaps BIDx and TIDx -TEST(NVFuserTest, FusionGridReduction4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 128; - const int gdimx = 1024; - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, gdimx); - // tv1[I0, R1o, R1i{1024}] = tv0[I0, I1] - tv1->split(1, 4); - // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1] - // tv3[I0, R1oi{4}, Ir1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] - // tv1[I0, R1i{1024}] = tv3[I0, R1oi{4}, Ir1i{1024}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv3, 1); - tv3->computeAt(tv1, 1); - - // Re do it all at once, because why not. 
- tv0->computeAt(tv1, 1); - - tv2->axis(2)->parallelize(ParallelType::Unroll); - tv1->axis(0)->parallelize(ParallelType::TIDx); - - tv1->axis(-1)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::BIDx); - - int numel_x = bdimx; - int numel_y = 65000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Grid reduction with 2D thread blocks but only TIDx and BIDx are -// mapped to a reduction dim -TEST(NVFuserTest, FusionGridReduction5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 64; - const int bdimy = 16; - const int gdimx = 4; - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - tv1->split(1, bdimx); - // tv1[I0, R1o, R1i{64}] = tv0[I0, I1] - tv1->split(1, gdimx); - // tv1[I0, R1oo, R1oi{4}, R1i{64}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{64}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] - - tv0->computeAt(tv1, 1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::BIDx); - - tv1->axis(0)->parallelize(ParallelType::TIDy); - - int numel_x = bdimy; - int numel_y = 6500; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Similar to FusionGridReduction1 but with 3D tensors -TEST(NVFuserTest, FusionGridReduction6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3); - fusion.addInput(tv0); - - // tv1[I0, R1, R2] = tv0[I0, I1, I2] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); - - // Splitting for TID - tv1->split(2, 128); - // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] - - // Splitting for BID - tv1->split(1, 128); - - // tv1[I0, R1o, R1i{128}, R2o, R2i{128}] = tv0[I0, I1, I2] - - TensorView* tv2 = tv1->rFactor({3}); - // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] - // tv1[I0, R1o, R1i{128}, R2i{128}] - - TensorView* tv3 = tv1->rFactor({1}); - // tv2[I0, I1o, I1i{128}, R2o, I2i{128}] - // tv3[I0, R1o, I1i{128}, I2i{128}] - // tv1[I0, R1i{128}, R2i{128}] - - tv3->computeAt(tv1, 1); - tv2->computeAt(tv3, 3); - - tv1->axis(0)->parallelize(ParallelType::BIDy); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - 
tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv2->axis(-3)->parallelize(ParallelType::BIDx); - tv3->axis(-2)->parallelize(ParallelType::BIDx); - - int numel_x = 6500; - int numel_y = 200; - int numel_z = numel_y; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({1, 2}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -// See issue #1049 -TEST(NVFuserTest, FusionGridReduction7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - tv1->split(0, 1000); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - - const int numel_x = 1; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReduction8_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - const int numel_x = 2; - const int numel_y = 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReduction9_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - - tv1->split(1, 2); - - tv1->axis(1)->parallelize(ParallelType::BIDx); - tv1->axis(2)->parallelize(ParallelType::BIDy); - - tv1->computeAt(tv3, 1); - - const int numel_x = 4; - const int numel_y = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t2 = at::randn({numel_x}, options); - - at::ArrayRef aten_inputs = {t0, t2}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_output = fe.runFusion(aten_inputs); - - auto aten_output = t0.sum({1}).add(t2); - - testValidate(&fusion, cg_output, {t0, t2}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReduction10_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {-1}); - auto tv2 = sum(tv1, {-1}); - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - tv1->axis(0)->parallelize(ParallelType::TIDx); - 
tv1->axis(1)->parallelize(ParallelType::BIDx); - tv1->axis(2)->parallelize(ParallelType::TIDy); - tv1->axis(3)->parallelize(ParallelType::TIDz); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDy); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv3, 1); - - const int numel_w = 2; - const int numel_x = 3; - const int numel_y = 4; - const int numel_z = 5; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_output = fe.runFusion({t0}); - - auto aten_output = t0.sum({1, 2, 3}); - - testValidate(&fusion, cg_output, {t0}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { - int bid_x = 3; - int tid_x = 2; - int red_dim = 0; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); - fusion.addOutput(tv1); - - tv1->split(-1, tid_x); - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({16, bid_x * tid_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = input.to(at::kDouble).sum({red_dim}); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSplitBCast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - TensorView* input_tv1 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - fusion.addInput(input_tv1); - - TensorView* sum_tv2 = - reductionOp(BinaryOpType::Add, {2}, new Double(0), input_tv0); - TensorView* bcast_tv3 = broadcast(sum_tv2, {false, false, true}); - TensorView* output_tv4 = div(input_tv1, bcast_tv3); - - sum_tv2->split(-1, 32); - TensorView* sum_rf_tv5 = sum_tv2->rFactor({-2}); - - bcast_tv3->split(-1, 32); - output_tv4->split(-1, 32); - - sum_rf_tv5->axis(0)->parallelize(ParallelType::BIDx); - sum_tv2->axis(0)->parallelize(ParallelType::BIDx); - bcast_tv3->axis(0)->parallelize(ParallelType::BIDx); - output_tv4->axis(0)->parallelize(ParallelType::BIDx); - - sum_rf_tv5->axis(1)->parallelize(ParallelType::BIDy); - sum_tv2->axis(1)->parallelize(ParallelType::BIDy); - bcast_tv3->axis(1)->parallelize(ParallelType::BIDy); - output_tv4->axis(1)->parallelize(ParallelType::BIDy); - - sum_rf_tv5->axis(-1)->parallelize(ParallelType::TIDx); - sum_tv2->axis(-1)->parallelize(ParallelType::TIDx); - bcast_tv3->axis(-1)->parallelize(ParallelType::TIDx); - output_tv4->axis(-1)->parallelize(ParallelType::TIDx); - - fusion.addOutput(output_tv4); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({32, 32, 128}, options); - at::Tensor t1 = at::randn({32, 32, 128}, options); - at::Tensor cg_output = at::empty({32, 32, 128}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({t0, t1}, {cg_output}); -} - -TEST(NVFuserTest, FusionBCastInnerDim_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - 
TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // reduce then broadcast - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {false, true}); - - TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast()); -} - -TEST(NVFuserTest, FusionBCastReduce_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - - auto tv1 = broadcast(tv0, {true, false, false}); - auto tv2 = sum(tv1, {1}); - TORCH_CHECK( - tv2->axis(0)->isBroadcast() && tv2->axis(1)->isReduction() && - !tv2->axis(2)->isBroadcast() && !tv2->axis(2)->isReduction()); -} - -// Multiple consumer reduction with computeAt -// https://github.com/csarofeen/pytorch/issues/110 -TEST(NVFuserTest, FusionReductionMultiConsumer_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = unaryOp(UnaryOpType::Exp, tv0); - auto tv2 = reductionOp(BinaryOpType::Max, {-1}, new Double(0), tv1); - auto tv3 = reductionOp(BinaryOpType::Min, {-1}, new Double(0), tv1); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - tv1->computeAt(tv2, -1, ComputeAtMode::BestEffort); - - TORCH_CHECK(tv1->getComputeAtPosition() == 2); -} - -TEST(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { - for (const auto i : c10::irange(2)) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); - TensorView* tv3 = add(tv1, tv2); - // Set outputs tv2 or tv1 and then tv3 - if (i == 0) { - fusion.addOutput(tv2); - } else { - fusion.addOutput(tv1); - } - fusion.addOutput(tv3); - - if (i == 0) { - tv1->computeAt(tv3, -1); - } else { - tv2->computeAt(tv3, -1); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - std::vector aten_outputs = { - aten_input + 1, (aten_input + 1) * 2}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); - TensorView* tv3 = add(tv1, tv2); - fusion.addOutput(tv3); - - tv3->split(-1, 32); - - tv1->computeAt(tv3, -1); - tv2->computeAt(tv3, -2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100, 100}, options); - auto aten_output = (aten_input + 1) * 2; - - at::Tensor cg_output = at::empty_like(aten_input, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const size_t dimx = 13; - const size_t dimy = 15; - - TensorView* tv0 = makeConcreteTensor({dimx, dimy}); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv2, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - TensorView* tv5 = mul(tv2, 
tv4); - fusion.addOutput(tv5); - - tv1->computeAt(tv2, 2); - tv3->computeAt(tv4, 1); - tv4->computeAt(tv5, 2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - auto t1 = aten_input.add(1.); - auto t2 = t1.add(2.); - auto t3 = t2.add(3.); - auto t4 = t3.add(4.); - auto aten_output = t2.mul(t4); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - TORCH_CHECK(tv2->nDims() == 0); - tv1->computeAt(tv2, 0); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - auto aten_output = aten_input.to(at::kDouble).sum() + 1; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroDimBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(0); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {true, true}); - TORCH_CHECK(tv1->nDims() == 2); - - TensorView* tv2 = makeSymbolicTensor(2); - fusion.addInput(tv2); - - auto tv3 = add(tv1, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv3->computeAt(tv4, -1); - tv3->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({}, options); - at::Tensor t1 = at::randn({10, 10}, options); - - auto aten_output = (t0.unsqueeze(-1).unsqueeze(-1).expand({10, 10}) + t1) - .to(at::kDouble) - .sum(); - - std::vector aten_inputs = {t0, t1}; - at::Tensor cg_output = at::empty({}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_output}); - - testValidate( - &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroDimReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int bdimx = 32; - const int gdimx = 32; - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - tv1->split(0, bdimx); - tv1->split(0, gdimx); - auto tv2 = tv1->rFactor({0}); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({1000}, options); - auto aten_output = aten_input.to(at::kDouble).sum(); - - at::Tensor cg_output = at::empty({}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBCastAfterReduce_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - const int tidx = 128; - - // Set up your input tensor views - TensorView* tv0 = 
makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - - tv1->split(1, tidx); - auto tv3 = tv1->rFactor({-2}); - - TensorView* tv4 = makeSymbolicTensor(2); - fusion.addInput(tv4); - - auto tv5 = add(tv2, tv4); - fusion.addOutput(tv5); - tv5->split(1, tidx); - - tv3->computeAt(tv5, 1); - - tv2->split(1, tidx); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - - int x = 63, y = 200; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t4 = at::randn({x, y}, options); - - auto t3 = t0.to(at::kDouble).sum({1}).unsqueeze(-1).expand({x, y}); - auto aten_output = t3.add(t4); - - std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t4}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionOutputBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({2, 3}); - fusion.addInput(tv0); - - TensorView* tv1 = broadcast(tv0, {true, false, true, false, true}); - - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({2, 3}, options); - auto aten_output = aten_input.unsqueeze(2).unsqueeze(1).unsqueeze(0); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeConcreteTensor({2, 3, 4, 5, 6}); - fusion.addInput(tv0); - - TensorView* tv1 = sum(tv0, {0, 2, -1}, /*keep_dim=*/true); - - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options); - auto aten_output = - aten_input.to(at::kDouble).sum({0, 2, -1}, /*keepdim=*/true); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x}); - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp( - BinaryOpType::Add, {red_dim}, new Double(0), tv0, /*keep_dim=*/true); - - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({bid_x, tid_x}, options); - auto aten_output = - aten_input.to(at::kDouble).sum({red_dim}, /*keepdim=*/true); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto lparams = 
reduction_params.value().lparams; - - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionSumTo_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector tensor_shape{2, 3, 4, 5, 6}; - std::vector sum_to_shape{1, 5, 6}; - - std::vector tensor_shape_ref{2, 3, 4, 5, 6}; - std::vector sum_to_shape_ref{1, 5, 6}; - - std::vector sum_to_symb; - std::transform( - sum_to_shape.begin(), - sum_to_shape.end(), - std::back_inserter(sum_to_symb), - [](int s) -> Int* { return new Int(s); }); - - TensorView* tv0 = makeConcreteTensor(tensor_shape); - fusion.addInput(tv0); - - TensorView* tv1 = sum_to(tv0, sum_to_symb); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn(tensor_shape_ref, options); - auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - TORCH_CHECK( - cg_outputs[0].dim() == sum_to_shape.size(), - "sum_to not keeping the final dimension"); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSumToNoop_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector tensor_shape{4, 5, 6}; - std::vector sum_to_shape{4, 5, 6}; - - std::vector tensor_shape_ref{4, 5, 6}; - std::vector sum_to_shape_ref{4, 5, 6}; - - std::vector sum_to_symb; - std::transform( - sum_to_shape.begin(), - sum_to_shape.end(), - std::back_inserter(sum_to_symb), - [](int s) -> Int* { return new Int(s); }); - - TensorView* tv0 = makeConcreteTensor(tensor_shape); - fusion.addInput(tv0); - - TensorView* tv1 = sum_to(tv0, sum_to_symb); - - // Dummy operator to avoid tv0 both input and output - TensorView* tv2 = add(tv1, new Double(0)); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn(tensor_shape_ref, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); - - TORCH_CHECK( - cg_outputs[0].dim() == sum_to_shape.size(), - "sum_to not keeping the final dimension"); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReductionScheduler_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({bid_x, tid_x}, options); - auto aten_output = aten_input.to(at::kDouble).sum({red_dim}); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last 
optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -// Simple reduction parallelized on a symbolic size. -TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addOutput(tv1); - - // Interface should just be a direct split with a Parallel type. We can - // include the parallelize call if we do this. - tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({1}); - // tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}] = tv0[I0, I1] - // tv1[I0, R1oi{4}, R1i{BIDx}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}] - - // Incrementally, can print in between for debugging - tv0->computeAt(tv2, 1); - tv2->computeAt(tv1, 1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 65000; - int numel_y = 1025; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - auto aten_output = aten_input.to(at::kDouble).sum({1}); - - // How many threads to use for the block reduction - int runtime_threadIdx_dim = 128; - - LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { - const std::vector red_dims = {0, 2}; - // Copy is because CodeGen requires int and Pytorch requires int64_t - // for a vector of reduction dimensions - const std::vector red_dims64 = {0, 2}; - const std::vector tensor_dims_in = {5, 10, 15, 20}; - const std::vector tensor_dims_out = {10, 20}; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(tensor_dims_in, options); - auto aten_output = aten_input.to(at::kDouble).sum(red_dims64); - at::Tensor cg_output = at::empty(tensor_dims_out, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, {cg_output}, lparams); - - testValidate( - &fusion, - {cg_output}, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { - const std::vector red_dims = {1, 3}; - // Copy is because CodeGen requires int and Pytorch requires int64_t - // for a vector of reduction dimensions - 
const std::vector red_dims64 = {1, 3}; - const std::vector tensor_dims_in = {5, 10, 15, 20}; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(tensor_dims_in, options); - auto aten_output = aten_input.to(at::kDouble).sum(red_dims64); - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) { - std::vector dtypes = { - DataType::Double, DataType::Float, DataType::Half}; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - dtypes.insert(dtypes.end(), DataType::BFloat16); - } -#endif - - std::vector red_dims; - - // Tried to cut down the number iterations with just - // doing every other power of 2. - for (int i = 1; i <= 1024 * 1024; i <<= 2) { - red_dims.push_back(i); - } - - for (auto dtype : dtypes) { - at::ScalarType aten_dtype = data_type_to_aten(dtype); - for (auto& rdim : red_dims) { - Fusion fusion; - FusionGuard fg(&fusion); - - bool is_fp16 = dtype == DataType::Half; - bool is_bf16 = dtype == DataType::BFloat16; - - TensorView* tv0 = makeSymbolicTensor(1, dtype); - fusion.addInput(tv0); - - TensorView* tv0_cast = tv0; - if (is_fp16 || is_bf16) { - tv0_cast = castOp(DataType::Float, tv0); - } - - TensorView* tv1 = sum(tv0_cast, {0}); - - TensorView* tv1_cast = tv1; - if (is_fp16) { - tv1_cast = castOp(DataType::Half, tv1); - } - if (is_bf16) { - tv1_cast = castOp(DataType::BFloat16, tv1); - } - - fusion.addOutput(tv1_cast); - - auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({rdim}, options); - auto aten_output = aten_input.to(at::kDouble).sum({0}); - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - } - } -} - -TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { - std::vector dtypes = { - DataType::Double, DataType::Float, DataType::Half}; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - dtypes.insert(dtypes.end(), DataType::BFloat16); - } -#endif - - std::vector red_axis = {1, 0}; - std::vector output_dims = {160, 320}; - std::vector red_dims; - - // Tried to cut down the number iterations with just - // doing every other power of 2. 
- for (int i = 1; i <= 1024 * 1024; i <<= 2) { - red_dims.push_back(i); - } - - for (auto dtype : dtypes) { - at::ScalarType aten_dtype = data_type_to_aten(dtype); - for (auto& axis : red_axis) { - for (auto& odim : output_dims) { - for (auto& rdim : red_dims) { - Fusion fusion; - FusionGuard fg(&fusion); - - bool is_fp16 = dtype == DataType::Half; - bool is_bf16 = dtype == DataType::BFloat16; - - TensorView* tv0 = makeSymbolicTensor(2, dtype); - fusion.addInput(tv0); - - TensorView* tv0_cast = tv0; - if (is_fp16 || is_bf16) { - tv0_cast = castOp(DataType::Float, tv0); - } - - TensorView* tv1 = sum(tv0_cast, {axis}); - - TensorView* tv1_cast = tv1; - if (is_fp16) { - tv1_cast = castOp(DataType::Half, tv1); - } - if (is_bf16) { - tv1_cast = castOp(DataType::BFloat16, tv1); - } - fusion.addOutput(tv1_cast); - - auto options = - at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); - - at::Tensor aten_input = - (axis ? at::randn({odim, rdim}, options) - : at::randn({rdim, odim}, options)); - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({axis}); - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - } - } - } - } -} - -TEST(NVFuserTest, FusionCacheBefore_CUDA) { - // TVM Cache Write - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - // Before: TV2 = TV1 * 3 - // After: TV3 = TV1 * 3; - // TV2 = TV3; - TensorView* tv3 = tv2->cache_before(); - - constexpr int BSX = 32; - tv2->split(-1, BSX); - tv0->computeAt(tv2, -1); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 750; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, N}, options); - at::Tensor aten_output = (aten_input + 1.0) * 3.0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheAfter_CUDA) { - // TVM Cache Read - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - // Before: TV1 = TV0 + 1 - // After: TV3 = TV0; - // TV1 = TV3 + 1 - TensorView* tv3 = tv0->cache_after(); - - constexpr int BSX = 32; - tv2->split(-1, BSX); - tv0->computeAt(tv2, -1); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 457; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, N}, options); - at::Tensor aten_output = (aten_input + 1.0) * 3.0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - 
testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheFork_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); - fusion.addInput(tv0); - fusion.addOutput(tv1); - fusion.addOutput(tv2); - // Before: TV1 = TV0 + 1 - // TV2 = TV1 * 3 - // Output: TV1, TV2 - - // After: TV1 = TV0 + 1 - // TV3 = TV1 - // TV2 = TV1 * 3 - // Output: TV3, TV2 - - // cache_fork !!does not!! automatically apply ComputeAt to the cache - auto tv3 = tv1->cache_fork(); - - constexpr int BSX = 32; - tv2->split(-1, BSX); - tv0->computeAt(tv2, -1); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 457; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, N}, options); - at::Tensor aten_output1 = aten_input + 1.0; - at::Tensor aten_output2 = aten_output1 * 3.0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output1, aten_output2}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionCacheIndirect_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - TensorView* tv2 = makeSymbolicTensor(2); - TensorView* tv3 = makeSymbolicTensor(2); - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - fusion.addInput(tv3); - fusion.addOutput(tv6); - // t6 = ((t1 + (t2 - t3)) - t0) - - tv5->cache_after(); - tv5->cache_before(); - - // cache_after on inputs placed before schedule - constexpr int BSX = 32; - tv6->split(-1, BSX); - tv2->computeAt(tv6, -1); - - // Thread and Block binding - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 32, N = 810; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t1 = at::randn({M, N}, options); - at::Tensor t2 = at::randn({M, N}, options); - at::Tensor t3 = at::randn({M, N}, options); - - std::vector<IValue> aten_inputs = {t0, t1, t2, t3}; - at::Tensor aten_output = (t1 + (t2 - t3)) - t0; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheBcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(1); // (M, 1) - TensorView* tv1 = broadcast(tv0, {false, true}); - TensorView* tv2 = makeSymbolicTensor(1); // (1, N) - TensorView* tv3 = broadcast(tv2, {true, false}); - TensorView* tv4 = mul(tv1, tv3); - fusion.addInput(tv0); - fusion.addInput(tv2); - fusion.addOutput(tv4); - - // Case 1 - tv0->cache_after(); - - // Case 2 - tv1->cache_before(); - - // Case 3 - tv1->cache_after(); - - // Case 4 - TensorView* tv8 = tv4->cache_before(); - - constexpr int BSX = 128; - tv4->split(0, BSX); - tv4->split(-1, BSX); - tv4->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // M/BSX, N/BSX, BSX, BSX - tv0->computeAt(tv4, 2); - tv2->computeAt(tv4, 2); -
// 0, 1 | 2, 3, 4 - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::BIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Replay on TV3 - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv8->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 92, N = 500; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M}, options); - at::Tensor t1 = at::randn({N}, options); - std::vector aten_inputs = {t0, t1}; - at::Tensor aten_output = - t0.to(at::kDouble).unsqueeze(1).matmul(t1.to(at::kDouble).unsqueeze(0)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(1)); - TensorView* tv4 = add(tv3, new Double(2)); - - fusion.addInput(tv0); - fusion.addOutput(tv2); - fusion.addOutput(tv4); - - auto tv5 = tv1->cache_before(); - auto tv6 = tv3->cache_before(); - tv5->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - - tv1->computeAt(tv2, -1); - tv3->computeAt(tv4, -1); - - // Fails because tensor must be recomputed twice - // auto tv7 = tv0->cache_after(); - - constexpr int N = 800; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({N}, options); - auto aten_output = (aten_input + 1) + 2; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output, aten_output}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionSmem_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(2); // (M, N) - TensorView* tv1 = makeSymbolicTensor(2); // (M, N) - TensorView* tv2 = mul(tv0, tv1); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv2); - - // Schedule - TensorView* tv3 = tv0->cache_after(); - TensorView* tv4 = tv1->cache_after(); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - - constexpr int BSY = 32; - constexpr int BSX = 128; - tv2->split(0, BSY); - tv2->split(2, BSX); - // M/BSX, BSX, N/BSX, BSX - tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // M/BSX, N/BSX, BSX, BSX - - tv0->computeAt(tv2, 2); - tv1->computeAt(tv2, 2); - - // Thread and Block binding - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 128, N = 10240; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t1 = at::randn({M, N}, options); - at::Tensor aten_output = mul(t0, t1); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - 
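The FusionSmem test above stages both of its inputs in shared memory: each input gets a cache_after() copy whose memory type is set to MemoryType::Shared, the output is tiled, the caches are inlined at the tile level with computeAt, and the innermost tile axis of both the output and the caches is bound to TIDx. The sketch below condenses just that caching-and-binding skeleton, reusing the same NVFuser test helpers and namespaces as the surrounding tests, with the tile sizes inlined and input allocation plus validation omitted; it is an illustrative sketch, not code from the original file.

  Fusion fusion;
  FusionGuard fg(&fusion);

  // Elementwise product of two (M, N) inputs, as in FusionSmem.
  TensorView* tv0 = makeSymbolicTensor(2);
  TensorView* tv1 = makeSymbolicTensor(2);
  TensorView* tv2 = mul(tv0, tv1);
  fusion.addInput(tv0);
  fusion.addInput(tv1);
  fusion.addOutput(tv2);

  // Stage both inputs through shared-memory copies.
  TensorView* tv3 = tv0->cache_after();
  TensorView* tv4 = tv1->cache_after();
  tv3->setMemoryType(MemoryType::Shared);
  tv4->setMemoryType(MemoryType::Shared);

  // Tile the output to [M/32, N/128, 32, 128] and inline the caches at the
  // tile level, so each block stages one 32x128 tile of each input.
  tv2->split(0, 32);
  tv2->split(2, 128);
  tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
  tv0->computeAt(tv2, 2);
  tv1->computeAt(tv2, 2);

  // One block per tile; threads cover the innermost tile axis, and the
  // shared-memory copies follow the same thread binding.
  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(1)->parallelize(ParallelType::BIDy);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);

The TORCH_CHECK on war_hazard_syncs_count at the end of FusionSmem then asserts that this schedule lowers without any extra write-after-read hazard syncs on those shared buffers.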
-TEST(NVFuserTest, FusionSmemReduce_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(3); // M, K, N - TensorView* tv1 = sum(tv0, {1}); // M, R, N - fusion.addInput(tv0); - fusion.addOutput(tv1); - - TensorView* tv2 = tv0->cache_after(); - tv2->setMemoryType(MemoryType::Shared); - - // Schedule - constexpr int BSX = 32; - tv1->split(2, BSX); - tv1->split(1, 128); - tv1->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); - TensorView* tv3 = tv1->rFactor({-2}); - - tv0->computeAt(tv1, -2); - tv0->computeAt(tv3, -2); - - // Thread and Block binding - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, K, N}, options); - at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(2); // (M, K) - TensorView* tv1 = makeSymbolicTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - constexpr int BSX = 16; - tv5->split(2, BSX); - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv6 = tv5->rFactor({-1}); - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - - tv0->computeAt(tv5, 3); - tv1->computeAt(tv5, 3); - - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-3)->parallelize(ParallelType::TIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-3)->parallelize(ParallelType::TIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-3)->parallelize(ParallelType::TIDy); - tv6->axis(-2)->parallelize(ParallelType::TIDx); - - // Make sure BIDx is marked as exact (see issue #1119) - GpuLower gpulw(&fusion); - TORCH_CHECK(gpulw.parallelDimensionMap().isExact(ParallelType::BIDx)); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - std::vector<IValue> aten_inputs = {t0, t1}; - at::Tensor aten_output =
matmul(t0.to(at::kDouble), t1.to(at::kDouble)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - -TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(2); // (M, K) - TensorView* tv1 = makeSymbolicTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - // Remove reduction axis from tv5 - // tv6 = (M, R, N) - // tv5 = (M, N) - TensorView* tv6 = tv5->cache_before(); - - constexpr int BSX = 16; - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // tv5 = M/BSX, N/BSX, MSX, NSX - - tv6->computeAt(tv5, 2); - tv6->computeAt(tv5, 2); - - tv6->split(-1, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv7 = tv6->rFactor({-1}); - // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr - // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX - - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - - tv0->computeAt(tv7, 3); - tv1->computeAt(tv7, 3); - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv7->setMemoryType(MemoryType::Shared); - // Memory Type - - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-3)->parallelize(ParallelType::TIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-3)->parallelize(ParallelType::TIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - tv7->axis(-3)->parallelize(ParallelType::TIDy); - tv7->axis(-2)->parallelize(ParallelType::TIDx); - - tv6->axis(-2)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble)); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - -TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = makeSymbolicTensor(2); - fusion.addInput(x); - TensorView* max_val = reductionOp( - BinaryOpType::Max, - {-1}, - new Double(std::numeric_limits::lowest()), - x); // (M) - TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B) - TensorView* x_max_sub = sub(x, bcast_max); 
// (M, N) - TensorView* exp = unaryOp(UnaryOpType::Exp, x_max_sub); // (M, N) - TensorView* sum_exp = sum(exp, {-1}); // (M, R) - TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B) - TensorView* softmax = div(exp, bcast_sum); // (M, N) - fusion.addOutput(softmax); - - // Read Input into Shared Memory - // Load Input + Pwise into shared memory - auto cache_x = x->cache_after(); - cache_x->setMemoryType(MemoryType::Shared); - exp->setMemoryType(MemoryType::Shared); - - std::vector<TensorView*> all_tensors( - {x, - cache_x, - max_val, - bcast_max, - x_max_sub, - exp, - sum_exp, - bcast_sum, - softmax}); - - auto tidx = new Int(); - fusion.addInput(tidx); - - for (auto tensor : all_tensors) { - tensor->split(-1, tidx); - } - - auto sum_exp_rf = sum_exp->rFactor({1}); - all_tensors.push_back(sum_exp_rf); - - // computeAt - x->computeAt(x_max_sub, 1); - exp->computeAt(softmax, 1); - x_max_sub->computeAt(exp, 2); - - softmax->axis(0)->parallelize(ParallelType::BIDx); - for (auto tensor : all_tensors) { - tensor->axis(-1)->parallelize(ParallelType::TIDx); - } - - const size_t dimx = 1024; - const size_t dimy = 4096; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input, 128}); - - testValidate( - &fusion, - cg_outputs, - {aten_input, 128}, - {aten_output}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int kReductionAxis = 3; - std::vector<int64_t> input_shape{10, 10, 10, 67}; - TensorView* input = makeSymbolicTensor(input_shape.size()); - fusion.addInput(input); - - auto output = softmax(input, kReductionAxis); - - fusion.addOutput(output); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(input_shape, options); - auto aten_output = - at::_softmax(aten_input.to(at::kDouble), kReductionAxis, false); - - auto reduction_params = getPersistentHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - schedulePersistentKernel(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, TestMaskSoftmax_CUDA) { - // This test exercises the use of all padding tokens - // with softmax, as BERT might see with a fully padded - // sequence.
- Fusion fusion; - FusionGuard fg(&fusion); - - const int kReductionAxis = 3; - std::vector<int64_t> input_shape{256, 16, 128, 128}; - TensorView* input = makeSymbolicTensor(input_shape.size()); - TensorView* mask = makeSymbolicTensor(input_shape.size()); - fusion.addInput(input); - fusion.addInput(mask); - - auto out1 = add(input, mask); - auto output = softmax(out1, kReductionAxis); - - fusion.addOutput(output); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(input_shape, options); - at::Tensor aten_mask = at::ones(input_shape, options); - // -10,000 is used here as a magic number because the padding - // tokens need a value that contributes close to zero - // so as not to influence the softmax. BERT, in particular, does - // not use -Infinity because a sequence made up entirely of - // padding tokens would then make the softmax divide by - // zero and produce a NaN result. - aten_mask = aten_mask * -10000.0; - auto aten_out1 = aten_input + aten_mask; - auto aten_output = at::_softmax(aten_out1, kReductionAxis, false); - - auto reduction_params = - getPersistentHeuristics(&fusion, {aten_input, aten_mask}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - schedulePersistentKernel(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input, aten_mask}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input, aten_mask}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { - std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - std::vector<int64_t> shape{20, 100, 35, 67}; - std::vector<int64_t> norm_shape{67}; - - const size_t kM = shape.size(); - const size_t kN = norm_shape.size(); - const size_t kOuterNumDims = kM - kN; - - std::vector<int64_t> outer_shape; - for (const auto idx : c10::irange(kOuterNumDims)) { - outer_shape.push_back(shape[idx]); - } - for (const auto idx : c10::irange(kOuterNumDims, kM)) { - outer_shape.push_back(1); - } - - auto grad_out = makeSymbolicTensor(shape.size()); - auto input = makeSymbolicTensor(shape.size()); - auto mean = makeConcreteTensor(outer_shape); - auto rstd = makeConcreteTensor(outer_shape); - auto weight = makeSymbolicTensor(norm_shape.size()); - auto bias = makeSymbolicTensor(norm_shape.size()); - fusion.addInput(grad_out); - fusion.addInput(input); - fusion.addInput(mean); - fusion.addInput(rstd); - fusion.addInput(weight); - fusion.addInput(bias); - - auto grads = layer_norm_backward( - grad_out, - input, - norm_shape, - mean, - rstd, - weight, - bias, - {true, true, true}); - - fusion.addOutput(grads.grad_input); - fusion.addOutput(grads.grad_weight); - fusion.addOutput(grads.grad_bias); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_grad_out = at::randn(shape, options); - at::Tensor aten_input = at::randn(shape, options); - at::Tensor aten_weight = at::randn(norm_shape, options); - at::Tensor aten_bias = at::randn(norm_shape, options); - auto at_weight = c10::optional<at::Tensor>(aten_weight); - auto at_bias = c10::optional<at::Tensor>(aten_bias); - - const float kEps = 1e-5; - auto aten_results = - at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps); - auto aten_output = std::get<0>(aten_results); - auto aten_mean =
std::get<1>(aten_results); - auto aten_rstd = std::get<2>(aten_results); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector aten_inputs = { - aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); - - auto aten_gradients = at::native_layer_norm_backward( - aten_grad_out.to(at::kDouble), - aten_input.to(at::kDouble), - norm_shape, - aten_mean.to(at::kDouble), - aten_rstd.to(at::kDouble), - c10::optional(aten_weight.to(at::kDouble)), - c10::optional(aten_bias.to(at::kDouble)), - {true, true, true}); - - testValidate( - &fusion, - cg_outputs, - aten_inputs, - {std::get<0>(aten_gradients), - std::get<1>(aten_gradients), - std::get<2>(aten_gradients)}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); - - std::vector input_shape{20, 100, 35, 67}; - std::vector norm_shape{67}; - - auto input = makeSymbolicTensor(input_shape.size()); - fusion.addInput(input); - - auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr); - - fusion.addOutput(result.output); - fusion.addOutput(result.mean); - fusion.addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(input_shape, options); - c10::optional aten_weight = c10::nullopt; - c10::optional aten_bias = c10::nullopt; - auto aten_outputs = at::native_layer_norm( - aten_input, norm_shape, aten_weight, aten_bias, kEps); - - // Check reduction axis is same for all reductions - // Generate Launch Parameters - auto reduction_params = getPersistentHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - schedulePersistentKernel(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {std::get<0>(aten_outputs), - std::get<1>(aten_outputs), - std::get<2>(aten_outputs)}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - const float kMomentum = 0.1; - const float kEps = 1e-5; - const bool kTraining = true; - std::vector input_shape{20, 100, 35, 45}; - - auto input = makeSymbolicTensor(input_shape.size()); - auto weight = makeSymbolicTensor(1); - auto bias = makeSymbolicTensor(1); - auto running_mean = makeSymbolicTensor(1); - auto running_var = makeSymbolicTensor(1); - fusion->addInput(input); - fusion->addInput(weight); - fusion->addInput(bias); - fusion->addInput(running_mean); - fusion->addInput(running_var); - - Double* momentum = new Double(kMomentum); - Double* eps = new Double(kEps); - - auto result = batch_norm( - input, weight, bias, running_mean, running_var, kTraining, momentum, eps); - - fusion->addOutput(result.output); - fusion->addOutput(result.mean); - fusion->addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto at_input = at::randn(input_shape, options); - auto at_weight = at::ones({input_shape[1]}, 
options); - auto at_bias = at::zeros({input_shape[1]}, options); - auto at_run_mean = at::zeros({input_shape[1]}, options); - auto at_run_var = at::ones({input_shape[1]}, options); - - std::vector aten_inputs = { - at_input, at_weight, at_bias, at_run_mean, at_run_var}; - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - - auto aten_outputs = at::native_batch_norm( - at_input, - c10::optional(at_weight), - c10::optional(at_bias), - c10::optional(at_run_mean), - c10::optional(at_run_var), - kTraining, - kMomentum, - kEps); - - testValidate( - executor_cache.fusion(), - cg_outputs, - aten_inputs, - {at_run_mean, - at_run_var, - std::get<0>(aten_outputs), - std::get<1>(aten_outputs), - std::get<2>(aten_outputs)}, - __LINE__, - __FILE__, - ""); -} - -TEST(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int pixels_per_thread = 64; - const int TIDX = 128; - const int static_size = pixels_per_thread * TIDX; - - TensorView* sx = makeConcreteTensor({-1, static_size}); - TensorView* dx = makeSymbolicTensor(2); - fusion.addInput(sx); - fusion.addInput(dx); - - TensorView* max_sx = reductionOp( - BinaryOpType::Max, - {-1}, - new Double(std::numeric_limits::lowest()), - sx); // (M) - TensorView* max_dx = reductionOp( - BinaryOpType::Max, - {-1}, - new Double(std::numeric_limits::lowest()), - dx); // (M) - - // Reduction => merge local and shared memory TensorViews - TensorView* max_val = binaryOp(BinaryOpType::Max, max_sx, max_dx); - TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B) - - TensorView* sx_max_sub = sub(sx, bcast_max); // (M, N) - TensorView* dx_max_sub = sub(dx, bcast_max); // (M, N) - - TensorView* sx_exp = unaryOp(UnaryOpType::Exp, sx_max_sub); // (M, N) - TensorView* dx_exp = unaryOp(UnaryOpType::Exp, dx_max_sub); // (M, N) - - TensorView* sx_sum_exp = sum(sx_exp, {-1}); // (M, R) - TensorView* dx_sum_exp = sum(dx_exp, {-1}); // (M, R) - - // Reduction => merge local and shared memory TensorViews - TensorView* sum_exp = binaryOp(BinaryOpType::Add, sx_sum_exp, dx_sum_exp); - TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B) - - TensorView* sx_softmax = div(sx_exp, bcast_sum); // (M, N) - TensorView* dx_softmax = div(dx_exp, bcast_sum); // (M, N) - fusion.addOutput(sx_softmax); - fusion.addOutput(dx_softmax); - - auto sx_cache = sx->cache_after(); - auto dx_cache = dx->cache_after(); - dx_cache->setMemoryType(MemoryType::Shared); - dx_exp->setMemoryType(MemoryType::Shared); - - // Reduction and Broadcast Tensors common to both memory TVs - std::vector common_tensors( - {max_val, sum_exp, bcast_max, bcast_sum}); - - // Static Local Memory TVs - std::vector static_tensors( - {sx, sx_cache, max_sx, sx_max_sub, sx_exp, sx_sum_exp, sx_softmax}); - - // Dynamic Local Memory TVs - std::vector dynamic_tensors( - {dx, dx_cache, max_dx, dx_max_sub, dx_exp, dx_sum_exp, dx_softmax}); - - std::vector all_tensors; - all_tensors.insert( - all_tensors.end(), common_tensors.begin(), common_tensors.end()); - all_tensors.insert( - all_tensors.end(), static_tensors.begin(), static_tensors.end()); - all_tensors.insert( - all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end()); - - // M => M - // M, N => M, N/128, 128 - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->split(-1, TIDX); - } - } - - auto sx_sum_exp_rf = sx_sum_exp->rFactor({1}); - auto dx_sum_exp_rf = dx_sum_exp->rFactor({1}); - 
all_tensors.push_back(sx_sum_exp_rf); - all_tensors.push_back(dx_sum_exp_rf); - - // computeAt - sx->computeAt(sx_max_sub, 1); - dx->computeAt(dx_max_sub, 1); - - sx_exp->computeAt(sx_softmax, 1); - dx_exp->computeAt(dx_softmax, 1); - - sx_max_sub->computeAt(sx_exp, 2); - dx_max_sub->computeAt(dx_exp, 2); - - sx_softmax->axis(0)->parallelize(ParallelType::BIDx); - dx_softmax->axis(0)->parallelize(ParallelType::BIDx); - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - const size_t dimx = 1024; - const size_t dimy = 16384; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size); - at::Tensor aten_dynamic_in = - aten_input.narrow(1, static_size, dimy - static_size); - - at::Tensor out = at::zeros({dimx, dimy}, options); - at::Tensor cg_static_out = out.narrow(1, 0, static_size); - at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size); - - std::vector aten_outputs; - - auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); - at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size); - at::Tensor aten_dynamic_out = - aten_output.narrow(1, static_size, dimy - static_size); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion( - {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); - - testValidate( - &fusion, - {cg_static_out, cg_dynamic_out}, - {aten_static_in, aten_dynamic_in}, - {cg_static_out, cg_dynamic_out}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int pixels_per_thread = 64; - const int TIDX = 128; - const int static_size = pixels_per_thread * TIDX; - - TensorView* sx = makeConcreteTensor({-1, static_size}); - TensorView* dx = makeSymbolicTensor(2); - fusion.addInput(sx); - fusion.addInput(dx); - - Double* gamma = new Double(); - Double* beta = new Double(); - Double* eps = new Double(); - Int* N = new Int(); - fusion.addInput(gamma); - fusion.addInput(beta); - fusion.addInput(eps); - fusion.addInput(N); - - // Reduction - auto sx_sum = sum(sx, {-1}); // (M, R) - auto dx_sum = sum(dx, {-1}); // (M, R) - // Reduction => merge local and shared memory TensorViews - auto x_sum = binaryOp(BinaryOpType::Add, sx_sum, dx_sum); - - // Broadcast - auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B) - // Pwise - auto x_mean = div(x_sum_bcast, N); // (M, B) - - auto sx_mean_sub = sub(sx, x_mean); // (M, N) - auto dx_mean_sub = sub(dx, x_mean); // (M, N) - - auto sx_mean_sub_pow = mul(sx_mean_sub, sx_mean_sub); // (M, N) - auto dx_mean_sub_pow = mul(dx_mean_sub, dx_mean_sub); // (M, N) - - // Reduction - auto sx_var_sum = sum(sx_mean_sub_pow, {-1}); // (M, R) - auto dx_var_sum = sum(dx_mean_sub_pow, {-1}); // (M, R) - // Reduction => merge local and shared memory TensorViews - auto var_sum = binaryOp(BinaryOpType::Add, sx_var_sum, dx_var_sum); - - // Broadcast - auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B) - // Pwise - auto var = div(var_sum_bcast, N); // (M, B) - auto var_eps = add(var, eps); // (M, B) - auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B) - - auto sx_norm = mul(sx_mean_sub, rvar); - auto dx_norm = mul(dx_mean_sub, rvar); - - auto sx_norm_gamma = mul(sx_norm, gamma); - auto dx_norm_gamma = mul(dx_norm, gamma); - - auto 
sx_norm_gamma_beta = add(sx_norm_gamma, beta); - auto dx_norm_gamma_beta = add(dx_norm_gamma, beta); - - fusion.addOutput(sx_norm_gamma_beta); - fusion.addOutput(dx_norm_gamma_beta); - - sx_norm_gamma_beta->setContiguity(false); - dx_norm_gamma_beta->setContiguity(false); - - // Read Input into Shared Memory - // Read Input minus Input_Mean into Shared Memory - auto sx_cache = sx->cache_after(); - auto dx_cache = dx->cache_after(); - dx_cache->setMemoryType(MemoryType::Shared); - dx_mean_sub->setMemoryType(MemoryType::Shared); - - std::vector common_tensors( - {x_sum, x_sum_bcast, x_mean, var_sum, var_sum_bcast, var, var_eps, rvar}); - - std::vector static_tensors( - {sx, - sx_cache, - sx_sum, - sx_mean_sub, - sx_mean_sub_pow, - sx_var_sum, - sx_norm, - sx_norm_gamma, - sx_norm_gamma_beta}); - - std::vector dynamic_tensors( - {dx, - dx_cache, - dx_sum, - dx_mean_sub, - dx_mean_sub_pow, - dx_var_sum, - dx_norm, - dx_norm_gamma, - dx_norm_gamma_beta}); - - std::vector all_tensors; - all_tensors.insert( - all_tensors.end(), common_tensors.begin(), common_tensors.end()); - all_tensors.insert( - all_tensors.end(), static_tensors.begin(), static_tensors.end()); - all_tensors.insert( - all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end()); - - // M => M - // M, N => M, N/128, 128 - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->split(-1, TIDX); - } - } - - // Local Sum => Block Broadcast - TensorView* sx_sum_rf = sx_sum->rFactor({1}); - TensorView* sx_var_sum_rf = sx_var_sum->rFactor({1}); - TensorView* dx_sum_rf = dx_sum->rFactor({1}); - TensorView* dx_var_sum_rf = dx_var_sum->rFactor({1}); - all_tensors.push_back(sx_sum_rf); - all_tensors.push_back(sx_var_sum_rf); - all_tensors.push_back(dx_sum_rf); - all_tensors.push_back(dx_var_sum_rf); - - // ComputeAt - sx->computeAt(sx_mean_sub_pow, 1); - dx->computeAt(dx_mean_sub_pow, 1); - - var_sum->computeAt(rvar, 1); - - sx_mean_sub_pow->computeAt(sx_var_sum_rf, 2); - dx_mean_sub_pow->computeAt(dx_var_sum_rf, 2); - - sx_norm->computeAt(sx_norm_gamma_beta, 2); - dx_norm->computeAt(dx_norm_gamma_beta, 2); - - sx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx); - dx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx); - for (auto tensor : all_tensors) { - if (tensor->nDims() > 1) { - tensor->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - const int dimx = 1024; - const int dimy = 16384; - const float kGamma = 1.0f; - const float kBeta = 0.0f; - const float kEps = 1e-5; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({dimx, dimy}, options); - at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size); - at::Tensor aten_dynamic_in = - aten_input.narrow(1, static_size, dimy - static_size); - - at::Tensor out = at::zeros({dimx, dimy}, options); - at::Tensor cg_static_out = out.narrow(1, 0, static_size); - at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size); - - std::vector aten_inputs = { - aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); - - auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); - auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); - auto at_rvar = at::rsqrt(at::add(at_var, kEps)); - auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar); - auto aten_output = at::add(at::mul(at_norm, kGamma), 
kBeta); - at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size); - at::Tensor aten_dynamic_out = - aten_output.narrow(1, static_size, dimy - static_size); - - testValidate( - &fusion, - {cg_static_out, cg_dynamic_out}, - aten_inputs, - {aten_static_out, aten_dynamic_out}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - auto x = makeSymbolicTensor(2); - Double* gamma = new Double(); - Double* beta = new Double(); - Double* eps = new Double(); - Int* N = new Int(); - fusion.addInput(x); - fusion.addInput(gamma); - fusion.addInput(beta); - fusion.addInput(eps); - fusion.addInput(N); - - // Reduction - auto x_sum = sum(x, {-1}); // (M, R) - // Broadcast - auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B) - // Pwise - auto x_mean = div(x_sum_bcast, N); // (M, B) - auto x_mean_sub = sub(x, x_mean); // (M, N) - auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); // (M, N) - // Reduction - auto var_sum = sum(x_mean_sub_pow, {-1}); // (M, R) - // Broadcast - auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B) - // Pwise - auto var = div(var_sum_bcast, N); // (M, B) - auto var_eps = add(var, eps); // (M, B) - auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B) - auto norm = mul(x_mean_sub, rvar); - auto norm_gamma = mul(norm, gamma); - auto norm_gamma_beta = add(norm_gamma, beta); - fusion.addOutput(norm_gamma_beta); - - // Read Input into Shared Memory - // Read Input minus Input_Mean into Shared Memory - auto cache_x = x->cache_after(); - cache_x->setMemoryType(MemoryType::Shared); - x_mean_sub->setMemoryType(MemoryType::Shared); - - std::vector all_tensors( - {x_sum, - x_mean, - cache_x, - x_sum_bcast, - x_mean_sub, - x_mean_sub_pow, - var_sum, - var_sum_bcast, - var, - var_eps, - rvar, - norm, - norm_gamma, - norm_gamma_beta}); - - auto tidx = new Int(); - fusion.addInput(tidx); - - for (auto tensor : all_tensors) { - tensor->split(-1, tidx); - } - - // Local Sum => Block Broadcast - TensorView* x_sum_rf = x_sum->rFactor({1}); - TensorView* var_sum_rf = var_sum->rFactor({1}); - all_tensors.push_back(x_sum_rf); - all_tensors.push_back(var_sum_rf); - - // ComputeAt - x->computeAt(x_mean_sub_pow, 1); - var_sum->computeAt(rvar, 1); - x_mean_sub_pow->computeAt(var_sum_rf, 2); - norm->computeAt(norm_gamma_beta, 2); - - for (auto tv : all_tensors) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - const int dimx = 128; - const int dimy = 2048; - const float kGamma = 1.0f; - const float kBeta = 0.0f; - const float kEps = 1e-5; - const int TIDX = 128; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({dimx, dimy}, options); - auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); - auto at_var = at::var(aten_input.to(at::kDouble), -1).unsqueeze(1); - auto at_rvar = at::rsqrt(at::add(at_var, kEps)); - auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar); - auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta); - - std::vector aten_inputs = { - aten_input, kGamma, kBeta, kEps, dimy, TIDX}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { - Fusion fusion; - 
FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addInput(tv0); - fusion.addOutput(tv1); - // tv1[I0, R1] = tv0[I0, I1] - - // Interface should just be a direct split with a Parallel type. We can - // include the parallelize call if we do this. - tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({2}); - tv2->setMemoryType(MemoryType::Shared); - // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] - // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] - - tv0->computeAt(tv1, 1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::BIDx); - - constexpr int numel_x = 65000, numel_y = 1024; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - auto aten_output = aten_input.to(at::kDouble).sum({1}); - - // How many threads to use for the block reduction - constexpr int runtime_threadIdx_dim = 128; - - LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); -} - -TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - Int* sym_bsx = new Int(); - TensorView* tv0 = makeSymbolicTensor(3); // M, K, N - fusion.addInput(tv0); - fusion.addInput(sym_bsx); - - TensorView* tv1 = sum(tv0, {1}); // M, R, N - fusion.addOutput(tv1); - - TensorView* tv2 = tv0->cache_after(); - tv2->setMemoryType(MemoryType::Shared); - - // Schedule - constexpr int BSX = 32; - tv1->split(2, BSX); - tv1->split(1, sym_bsx); - tv1->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); - TensorView* tv3 = tv1->rFactor({-2}); - - tv0->computeAt(tv1, -2); - tv0->computeAt(tv3, -2); - - // Thread and Block binding - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::BIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - constexpr int M = 154, K = 45, N = 1524; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({M, K, N}, options); - at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}); - - // How many threads to use for the block reduction - constexpr int runtime_threadIdx_dim = 128; - - auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams); - - testValidate( - &fusion, - cg_outputs, - {aten_input, runtime_threadIdx_dim}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Int* sym_bsx = new Int(); - TensorView* tv0 = makeSymbolicTensor(2); // (M, K) - TensorView* tv1 = 
makeSymbolicTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(sym_bsx); - fusion.addOutput(tv4); - // Algorithm - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - constexpr int BSX = 32; - tv4->split(2, BSX); - tv4->split(1, sym_bsx); - tv4->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX - - tv0->computeAt(tv4, 3); - tv1->computeAt(tv4, 3); - // Schedule - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::BIDy); - // Manual Binding - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - // Thread and Block binding - - constexpr int M = 128, K = 457, N = 1024; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); - std::vector aten_inputs = {t0, t1, BSX}; - - LaunchParams lparams(-1, -1, -1, BSX, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, - cg_outputs, - aten_inputs, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z - Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x - Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x - // Compile-time integer for tiling - int n_smem_tile = 8; // bound to threadIdx.y - - // Symbolic 2D tensors TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Turn the K-dimension of tv4 into a reduction dimension - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Register runtime tile dims as inputs - fusion.addInput(symbolic_m_tile_dim); - fusion.addInput(symbolic_split_k_tile_dim); - fusion.addInput(symbolic_block_k_tile_dim); - - // Make a 3D tile, mix of symbolic and constant, do in reverse order because - // dims are inserted - // [M, K, N] - tv5->split(2, n_smem_tile); - tv5->split(1, symbolic_block_k_tile_dim); - tv5->split(1, symbolic_split_k_tile_dim); - tv5->split(0, symbolic_m_tile_dim); - // [Mo, Mi, Koo, Koi, Ki, No, Ni] - - // Reorder so all outer tiles are in the leftmost 3 positions - tv5->reorder({{1, 5}, {5, 1}}); - // [Mo, No, Koo, Koi, Ki, Mi, Ni] - - // Factor out the outer reduction IterDomain, then run the inter-cta - // reduction, and intra-cta reduction - auto tv6 = tv5->rFactor({2}); - // [Mo, No, rKoo, rKoi, rKi, Mi, Ni] - // [Mo, No, rKoi, rKi, Mi, 
Ni] - - // Scope computations - tv6->computeAt(tv5, 2); - // [Mo, No, rKoo, Koi, Ki, Mi, Ni] - // [Mo, No, rKoi, rKi, Mi, Ni] - - // Setup compute at schedule - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - tv4->computeAt(tv6, -1); - // - // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) - // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) - // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] - // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] - // T5[ Mo, No, rKoi, rKii, Mi, Ni] - - // Cache smem tiles - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Local); - tv6->setMemoryType(MemoryType::Local); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; - for (auto tv : tv_list) { - tv->axis(-2)->parallelize(ParallelType::TIDz); - tv->axis(-1)->parallelize(ParallelType::TIDy); - } - tv2->axis(3)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::TIDx); - tv4->axis(3)->parallelize(ParallelType::TIDx); - tv6->axis(3)->parallelize(ParallelType::TIDx); - tv5->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(4)->parallelize(ParallelType::BIDx); - tv3->axis(4)->parallelize(ParallelType::BIDx); - tv4->axis(4)->parallelize(ParallelType::BIDx); - tv6->axis(4)->parallelize(ParallelType::BIDx); - tv5->axis(3)->parallelize(ParallelType::BIDx); - - constexpr int M = 31, K = 65, N = 33; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - FusionExecutor fe; - // Generate CUDA and compile with nvRTC - fe.compileFusion(&fusion); - - // Runtime tiling - int m_tile = 4; // bound to threadIdx.z - int split_k = 7; // bound to blockIdx.x - int intra_cta = 8; // bound to threadIdx.x - - std::vector aten_inputs = {t0, t1, m_tile, split_k, intra_cta}; - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); -} - -TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - fusion.addInput(tv0); - fusion.addOutput(tv1); - // tv1[I0, R1] = tv0[I0, I1] - - // Interface should just be a direct split with a Parallel type. We can - // include the parallelize call if we do this. 
- tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] - - TensorView* tv2 = tv1->rFactor({2}); - tv2->setMemoryType(MemoryType::Global); - // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] - // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] - - tv0->computeAt(tv1, 1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::BIDx); - - constexpr int numel_x = 65000, numel_y = 1024; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - // How many threads to use for the block reduction - constexpr int runtime_threadIdx_dim = 128; - - auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}, lparams); - - auto aten_output = input.to(at::kDouble).sum({1}); - testValidate( - &fusion, - cg_outputs, - {input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - TensorView* tv2 = makeSymbolicTensor(2); - TensorView* tv3 = makeSymbolicTensor(2); - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - fusion.addInput(tv3); - fusion.addOutput(tv6); - // t6 = ((t1 + (t2 - t3)) - t0) - - tv4->setMemoryType(MemoryType::Global); - tv5->setMemoryType(MemoryType::Global); - tv6->setMemoryType(MemoryType::Global); - - constexpr int M = 32, N = 810; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t1 = at::randn({M, N}, options); - at::Tensor t2 = at::randn({M, N}, options); - at::Tensor t3 = at::randn({M, N}, options); - - at::Tensor aten_output = (t1 + (t2 - t3)) - t0; - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0, t1, t2, t3}); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConstCheck_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto one = new Int(1); - TORCH_CHECK(one->isConstScalar()); - - auto one_x2 = mul(one, one); - TORCH_CHECK(one_x2->isConstScalar()); - - auto one_x3 = mul(one_x2, one); - TORCH_CHECK(one_x3->isConstScalar()); - - auto one_x4 = mul(one_x3, one); - TORCH_CHECK(one_x4->isConstScalar()); -} - -TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { - const std::vector tensor_dims_in = {128, 128}; - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(0)); - TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn(tensor_dims_in, options); - at::Tensor cg_output = at::empty({tensor_dims_in[0]}, options); - - // Schedule - tv2->split(1, 32); - tv2->split(1, 4); // unroll - - auto tv2_rf = tv2->rFactor({-3, -2}); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - 
tv2_rf->axis(0)->parallelize(ParallelType::BIDx); - tv2_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv2_rf->axis(-2)->parallelize(ParallelType::Unroll); - - tv1->computeAt(tv2_rf, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto aten_output = (input + 0).to(at::kDouble).sum(1); - - testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); -} - -// Test isZeroInt -TEST(NVFuserTest, FusionIsZeroInt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Int* x = new Int(0); - Int* y = new Int(1); - Val* z = mul(x, y); - TORCH_CHECK(x->isZeroInt()); - TORCH_CHECK(!y->isZeroInt()); - TORCH_CHECK(!z->isZeroInt()); -} - -// Test isOneInt -TEST(NVFuserTest, FusionIsOneInt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - Int* x = new Int(1); - Int* y = new Int(1); - Val* z = mul(x, y); - TORCH_CHECK(x->isOneInt()); - TORCH_CHECK(y->isOneInt()); - TORCH_CHECK(!z->isOneInt()); -} - -// This is to verify no cycle of computeAt is created. A more complex -// variation of this pattern appears in one of the Python tests -// (test_random_topo). -TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - // Common intermediate tensor - auto tv1 = add(tv0, new Double(1)); - // tv1 -> tv2 - auto tv2 = add(tv1, new Double(2)); - // tv1 -> tv3 -> tv4 - auto tv3 = add(tv1, new Double(3)); - auto tv4 = add(tv3, new Double(4)); - - // NOTE: This should no longer occur as of PR #201. - // The order of adding outputs matters. If tv3 is added before tv4, - // it should be fine. However, if tv4 is added before tv3, there - // will be a cycle of tv3->tv4 and tv4->tv3. tv3->tv4 is created - // first, and then tv4->tv3 is created at the final phase of - // computeAt (ComputeAt::setupOutputs). 
- fusion.addOutput(tv2); - fusion.addOutput(tv4); - fusion.addOutput(tv3); - - tv0->computeAt(tv2, -1); - - TORCH_CHECK(tv3->hasComputeAt()); - TORCH_CHECK(!tv4->hasComputeAt()); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn(100, options); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = t1 + 3; - auto t4 = t3 + 4; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - std::vector aten_outputs = {t2, t4, t3}; - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv0, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); - TensorView* tv4 = add(tv1, new Double(4)); - - fusion.addOutput(tv2); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - - tv1->computeAt(tv3, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 10}, options); - - auto t1 = aten_input + 1; - auto t2 = aten_input + 2; - auto t3 = t1 + 3; - auto t4 = t1 + 4; - - std::vector aten_outputs = {t2, t3, t4}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - - TensorView* tv5 = add(tv1, tv3); - - fusion.addOutput(tv2); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - tv1->computeAt(tv5, -1); - tv3->computeAt(tv5, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 10}, options); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto t5 = t1 + t3; - - std::vector aten_outputs = {t2, t4, t5}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { - for (const auto i : c10::irange(2)) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - - TensorView* tv5 = add(tv1, tv3); - - fusion.addOutput(tv2); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - const int tile = 32; - - tv1->split(-1, tile); - tv2->split(-1, tile); - tv3->split(-1, tile); - tv4->split(-1, 
tile); - tv5->split(-1, tile); - - auto compute_at_outer = tv1; - auto compute_at_inner = tv3; - if (i == 1) { - std::swap(compute_at_inner, compute_at_outer); - } - - compute_at_outer->computeAt(tv5, -2); - compute_at_inner->computeAt(tv5, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto t5 = t1 + t3; - - std::vector aten_outputs = {t2, t4, t5}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionTraversalOrder4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // First tree - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - // Second tree - TensorView* tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - TensorView* tv5 = add(tv4, new Double(5)); - TensorView* tv6 = add(tv5, new Double(6)); - TensorView* tv7 = add(tv5, new Double(7)); - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - tv1->computeAt(tv2, -1); - tv5->computeAt(tv6, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({100}, options); - at::Tensor t4 = at::rand_like(t0, options); - - auto t1 = t0 + 1; - auto t2 = t1 + 2; - auto t3 = t1 + 3; - auto t5 = t4 + 5; - auto t6 = t5 + 6; - auto t7 = t5 + 7; - - std::vector aten_outputs = {t2, t3, t6, t7}; - std::vector aten_inputs = {t0, t4}; - std::vector cg_outputs = { - at::empty_like(t0, options), - at::empty_like(t0, options), - at::empty_like(t0, options), - at::empty_like(t0, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion(aten_inputs, cg_outputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - TensorView* tv5 = add(tv2, tv4); - - fusion.addOutput(tv1); - fusion.addOutput(tv3); - fusion.addOutput(tv5); - - tv2->computeAt(tv5, -1); - tv4->computeAt(tv5, -1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - std::vector cg_outputs = { - at::empty_like(aten_input, options), - at::empty_like(aten_input, options), - at::empty_like(aten_input, options)}; - - fe.runFusion({aten_input}, cg_outputs); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto t5 = t2 + t4; - - std::vector aten_outputs = {t1, t3, t5}; - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = 
makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv0, new Double(2)); - TensorView* tv3 = add(tv1, tv2); - TensorView* tv4 = add(tv3, new Double(4)); - - fusion.addOutput(tv4); - - tv1->split(0, 32); - tv2->split(0, 32); - tv3->split(0, 32); - tv4->split(0, 32); - - tv3->computeAt(tv4, -2); - tv1->computeAt(tv3, -1); - tv2->computeAt(tv3, -2); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - - auto t1 = aten_input + 1; - auto t2 = aten_input + 2; - auto t3 = t1 + t2; - auto aten_output = t3 + 4; - - at::Tensor cg_output = at::empty_like(aten_input, options); - - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); - TensorView* tv5 = add(tv2, tv4); - - fusion.addOutput(tv5); - - TensorView* tvs[] = {tv1, tv2, tv3, tv4, tv5}; - for (auto tv : tvs) { - tv->split(0, 2); - tv->split(0, 4); - tv->split(0, 8); - } - - // computeAt into inner loop nests - tv1->computeAt(tv2, -1); - tv3->computeAt(tv4, -2); - - tv2->computeAt(tv5, -4); - tv4->computeAt(tv5, -3); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100}, options); - - auto t1 = aten_input + 1; - auto t2 = t1 + 2; - auto t3 = aten_input + 3; - auto t4 = t3 + 4; - auto aten_output = t2 + t4; - - at::Tensor cg_output = at::empty_like(aten_input, options); - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -// Test predication of grid reduction -TEST(NVFuserTest, FusionThreadPredicate_CUDA) { - const int gdimx = 4; - const int bdimx = 128; - - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); - TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1); - TensorView* tv3 = add(tv0, new Double(2)); - - fusion.addOutput(tv3); - fusion.addOutput(tv2); - - tv1->split(1, bdimx); - tv1->split(1, gdimx); - tv3->split(1, bdimx); - tv3->split(1, gdimx); - - TensorView* tv1_rf = tv1->rFactor({1}); - - tv1->computeAt(tv2, -1); - - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1_rf->axis(0)->parallelize(ParallelType::BIDy); - tv2->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(-2)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-2)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - - tv3->axis(3)->parallelize(ParallelType::TIDx); - tv3->axis(2)->parallelize(ParallelType::BIDx); - tv3->axis(0)->parallelize(ParallelType::BIDy); - - int numel_x = 100; - int numel_y = 1000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - - auto t2 = -aten_input.to(at::kDouble).sum({1}); - auto t3 = aten_input + 2.0; - - 
std::vector aten_outputs = {t3, t2}; - - std::vector cg_outputs = { - at::empty_like(aten_input, options), at::empty({numel_x}, options)}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({aten_input}, cg_outputs); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionLSTMCell_CUDA) { - const int hidden_features = 512; - const int batch_size = 64; - - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tvs[16]; - for (const auto i : c10::irange(16)) { - tvs[i] = makeSymbolicTensor(2); - fusion.addInput(tvs[i]); - } - - auto ingate = unaryOp( - UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); - - auto forgetgate = unaryOp( - UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); - - auto cellgate = unaryOp( - UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); - - auto outgate = unaryOp( - UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); - - auto cx = makeContigTensor(2); - fusion.addInput(cx); - - auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); - - auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); - - fusion.addOutput(cy); - fusion.addOutput(hy); - - std::vector aten_inputs; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor large_tensor0 = - at::randn({batch_size, hidden_features * 4}, options); - at::Tensor large_tensor1 = - at::randn({batch_size, hidden_features * 4}, options); - at::Tensor large_tensor2 = - at::randn({batch_size, hidden_features * 4}, options); - at::Tensor large_tensor3 = - at::randn({batch_size, hidden_features * 4}, options); - - auto chunked0 = large_tensor0.chunk(4, 1); - auto chunked1 = large_tensor1.chunk(4, 1); - auto chunked2 = large_tensor2.chunk(4, 1); - auto chunked3 = large_tensor3.chunk(4, 1); - - aten_inputs.insert(aten_inputs.end(), chunked0.begin(), chunked0.end()); - aten_inputs.insert(aten_inputs.end(), chunked1.begin(), chunked1.end()); - aten_inputs.insert(aten_inputs.end(), chunked2.begin(), chunked2.end()); - aten_inputs.insert(aten_inputs.end(), chunked3.begin(), chunked3.end()); - - auto at_ingate = - chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); - auto at_forgetgate = - chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); - auto at_cellgate = - chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); - auto at_outgate = - chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); - - auto at_cx = at::randn({batch_size, hidden_features}, options); - aten_inputs.push_back(at_cx); - auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); - auto at_hy = at_outgate.mul(at_cy.tanh()); - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = broadcast(tv1, {true, false}); - TensorView* tv3 = broadcast(tv1, {false, true}); - TensorView* tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - // Not possible to do computeAt at position -1 as recomputation - // would be required. 
An exception should be thrown. - ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); -} - -TEST(NVFuserTest, FusionReductionHalf_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = add(tv1, new Double(1.0)); - auto tv3 = sum(tv2, {2}); - auto tv4 = castOp(DataType::Half, tv3); - - fusion.addOutput(tv4); - - const auto options = - at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({8, 8, 16}, options); - - auto reduction_tv = tv3; - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - - auto aten_output = aten_input.add(1.0).to(at::kDouble).sum({2}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReduceSingle_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({100, 1}); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({100, 1}, options); - - // Grab only tensor views, though there shouldn't be any other type - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}); - - auto aten_output = aten_input.to(at::kDouble).sum({1}); - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim, 2}, new Double(0), tv0); - fusion.addOutput(tv1); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard 
fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); - fusion.addInput(tv0); - - TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0); - - TensorView* tv2 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv1); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({1, 2}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) { - constexpr int bid_x = 80; - constexpr int tid_x = 4096; - constexpr int red_dim = 1; - - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); - fusion.addInput(tv0); - - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); - - TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1); - fusion.addOutput(tv2); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options); - - // Apply reduction heuristic - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}, lparams); - auto aten_output = aten_input.to(at::kDouble).sum({2, 1}); - - testValidate( - &fusion, - cg_outputs, - {aten_input}, - {aten_output}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionTrivialReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeConcreteTensor({10, 20, 1}); - fusion.addInput(tv0); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0); - fusion.addOutput(tv1); - - TORCH_CHECK(!fusion.hasReduction(), "Trivial reduction picked up by fusion"); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 20, 1}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - auto aten_output = aten_input.to(at::kDouble).sum({2}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTrivialReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int w = 1, x = 1, y = 7, z = 8; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeConcreteTensor({w, x, y, z}); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = sum(tv1, {0}); - auto tv3 = sum(tv2, {0}); - auto tv4 = 
add(tv3, tv0); - - fusion.addOutput(tv4); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); - auto aten_output = t1.to(at::kDouble).sum({0}).sum({0}).add(t0); - - std::vector aten_inputs = {t0, t1}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTrivialReduction3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int v = 1, w = 1, x = 1, y = 7, z = 8; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeConcreteTensor({v, w, x, y, z}); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = sum(tv1, {0, 1, 2}); - auto tv3 = add(tv2, tv0); - - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({y, z}, options); - at::Tensor t1 = at::randn({v, w, x, y, z}, options); - auto aten_output = t1.sum({0, 1, 2}).add(t0); - - std::vector aten_inputs = {t0, t1}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -// Make sure trivial reductions are correctly detected even with -// scheduling applied. -TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = sum(tv1, {1}); - fusion.addOutput(tv2); - - tv2->split(1, 4); - tv2->split(1, 8); - auto tv3 = tv2->rFactor({-1}); - auto tv4 = tv2->rFactor({-1}); - - auto tv5 = broadcast(tv0, {true, false}); - auto tv6 = add(tv5, new Double(1)); - auto tv7 = sub(tv6, new Double(1)); - auto tv8 = sum(tv7, {0}); - fusion.addOutput(tv8); - - auto tv9 = broadcast(tv0, {false, true, true}); - auto tv10 = sum(tv9, {1}); - auto tv11 = sum(tv10, {1}); - fusion.addOutput(tv11); - - tv8->split(0, 3); - tv10->split(1, 4); - tv11->split(1, 5); - - tv0->computeAt(tv2, -1); - tv0->computeAt(tv8, -1); - tv0->computeAt(tv11, 1); - - // Test indexing to gmem-backed tensors - tv3->setMemoryType(MemoryType::Global); - tv8->setMemoryType(MemoryType::Global); - - GpuLower gpulw(&fusion); - - // No kir::ReductionOp should be generated as all the reduction - // exprs should be replaced with a unary set op. 
-  for (const auto& kir_node : gpulw.kernel()->irNodes()) {
-    TORCH_CHECK(!kir_node->isA<kir::ReductionOp>());
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({100}, options);
-  std::vector<c10::IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {t0, t0, t0}, __LINE__, __FILE__);
-}
-
-// Test detection of partially trivial reduction
-TEST(NVFuserTest, FusionDetectTrivialReduction2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = add(tv1, new Double(1));
-  fusion.addOutput(tv2);
-
-  tv1->split(1, 1);
-  // tv1->axis(1): non-trivial
-  // tv1->axis(2): trivial
-
-  auto tv3 = tv1->rFactor({-1});
-
-  // Just to suppress register-allocation warning
-  tv0->computeAt(tv2, 1);
-  tv3->computeAt(tv1, -1);
-
-  GpuLower gpulw(&fusion);
-
-  // tv3's reduction axis is a trivial reduction. The only
-  // kir::ReductionOp should be for tv1.
-  for (const auto& kir_node : gpulw.kernel()->irNodes()) {
-    if (kir_node->isA<kir::ReductionOp>()) {
-      auto reduction_out =
-          kir_node->as<kir::ReductionOp>()->outputs()[0]->as<kir::TensorView>();
-      TORCH_CHECK(reduction_out->fuserTv() == tv1);
-    }
-  }
-}
-
-TEST(NVFuserTest, FusionInputsIdLookup_CUDA) {
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({16, 8, 8}, options);
-  at::Tensor t1 = at::randn({8, 8}, options);
-  at::Tensor t2 = at::randn({6, 4}, options);
-
-  // create a cache with max size 2
-  torch::jit::fuser::cuda::InputsIdLookup inputs_id_lookup(2);
-
-  // testing basic function, same encoding for identical inputs
-  auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0});
-  auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5});
-  TORCH_CHECK(id_0.id == id_0_lookup.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 1);
-  TORCH_CHECK(id_0.eviction == false);
-
-  // new input (even though the shapes are identical, the signature differs
-  // because of the missing scalar input)
-  auto id_1 = inputs_id_lookup.lookupId({t0, t1});
-  auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1});
-  TORCH_CHECK(id_1.id == id_1_lookup.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 2);
-  TORCH_CHECK(id_1.eviction == false);
-
-  // eviction should happen at this point
-  auto id_2 = inputs_id_lookup.lookupId({t2, t1});
-  TORCH_CHECK(id_2.id != id_0.id);
-  TORCH_CHECK(id_2.id != id_1.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 2);
-  TORCH_CHECK(id_2.eviction == true);
-  TORCH_CHECK(id_2.evict_id == id_0.id);
-
-  // look at input 1 again
-  auto id_1_relook = inputs_id_lookup.lookupId({t0, t1});
-  TORCH_CHECK(id_1_relook.id == id_1.id);
-  TORCH_CHECK(id_1_relook.eviction == false);
-}
-
-TEST(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({64, 8, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // pass with identical shape
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(complyWith(t0, tensor_type));
-
-  // pass with dynamic shape
-  auto t1 = at::randn({16, 16, 8}, options);
-  TORCH_CHECK(complyWith(t1, tensor_type));
-
-  // broadcasting semantic change failure
-  auto t2 = at::randn({16, 1, 8}, options);
-  TORCH_CHECK(!complyWith(t2, tensor_type));
-
-  // contiguity failure via slicing
-  auto t3 = t0.slice(1, 0, 8, 2);
-  TORCH_CHECK(!complyWith(t3, tensor_type));
-
-  // contiguity failure via slicing
-  auto t4 = t0.slice(2, 0, 8, 2);
-  TORCH_CHECK(!complyWith(t4, tensor_type));
-
-  // rank failure
-  auto t5 = at::randn({16, 8, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t5, tensor_type));
-
-  // contiguity on stride 1 dimension with implicit broadcasting
-  auto t = at::randn({4}, options);
-  auto t6 = t.unsqueeze(1).expand({4, 8});
-  TORCH_CHECK(complyWith(t6, TensorType::create(t6)));
-}
-
-TEST(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 1, 8});
-  std::vector<int64_t> strides_vec({8, 8, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // broadcasting semantic change
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t0, tensor_type));
-
-  // dtype failure
-  auto t1 = at::randn({16, 1, 8}, options.dtype(at::kHalf));
-  TORCH_CHECK(!complyWith(t1, tensor_type));
-
-  // matching dtype and shape should pass
-  auto t2 = at::randn({16, 1, 8}, options);
-  TORCH_CHECK(complyWith(t2, tensor_type));
-
-  // device inconsistency shouldn't fail
-  auto t3 = at::randn({16, 1, 8}, options.device(at::kCPU, 0));
-  TORCH_CHECK(complyWith(t3, tensor_type));
-}
-
-TEST(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({64, 1, 8});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // failing permutation
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t0, tensor_type));
-
-  // passing with dynamic shape
-  auto t1 = t0.permute({0, 2, 1});
-  TORCH_CHECK(complyWith(t1, tensor_type));
-}
-
-TEST(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({128, 16, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // contiguity check passes although it differs
-  auto t0 = at::randn({16, 16, 8}, options);
-  TORCH_CHECK(complyWith(t0, tensor_type));
-
-  // passing with dynamic shape
-  auto t1 = t0.slice(1, 0, 16, 2);
-  TORCH_CHECK(complyWith(t1, tensor_type));
-}
-
-TEST(NVFuserTest, FusionDisjointSet_CUDA) {
-  DisjointSet<int> set;
-
-  const std::set<int> group_x({0, 1, 2});
-  const std::set<int> group_y({3, 4, 5});
-  const std::set<int> group_z({6, 7, 8});
-  const std::vector<std::set<int>> groups({group_x, group_y, group_z});
-  std::set<int> group_all;
-  std::for_each(groups.begin(), groups.end(), [&](const auto& g) {
-    group_all.insert(g.begin(), g.end());
-  });
-
-  // Initially, nothing should be considered equivalent
-  for (auto i : group_all) {
-    for (auto j : group_all) {
-      TORCH_CHECK(!set.areEquivalent(i, j));
-    }
-  }
-
-  // Sets values in group_x are equivalent
-  for (auto i : group_x) {
-    for (auto j : group_x) {
-      set.join(i, j);
-      TORCH_CHECK(set.contains(i));
-      TORCH_CHECK(set.contains(j));
-    }
-  }
-
-  // All values in group_x should be equivalent with each other
-  for (auto i : group_x) {
-    for (auto j : group_x) {
-      TORCH_CHECK(set.areEquivalent(i, j));
-    }
-  }
-  // But nothing else should be equivalent
-  for (auto i : group_all) {
-    for (auto j : group_y) {
-      TORCH_CHECK(!set.areEquivalent(i, j));
-    }
-    for (auto j : group_z) {
-      TORCH_CHECK(!set.areEquivalent(i,
j)); - } - } - - // Sets values in group_y are equivalent - for (auto i : group_y) { - for (auto j : group_y) { - set.join(i, j); - TORCH_CHECK(set.contains(i)); - TORCH_CHECK(set.contains(j)); - } - } - - // group_x should be still equivalent - for (auto i : group_x) { - for (auto j : group_x) { - TORCH_CHECK(set.areEquivalent(i, j)); - } - } - // group_y should be now equivalent - for (auto i : group_y) { - for (auto j : group_y) { - TORCH_CHECK(set.areEquivalent(i, j)); - } - } - // But group_z should not be equivalent with anything yet - for (auto i : group_all) { - for (auto j : group_z) { - TORCH_CHECK(!set.areEquivalent(i, j)); - } - } - - // Sets values in group_z are equivalent - for (auto i : group_z) { - for (auto j : group_z) { - set.join(i, j); - TORCH_CHECK(set.contains(i)); - TORCH_CHECK(set.contains(j)); - } - } - - // Now each of the three groups should be equivalent within each - // group - for (const auto gi : c10::irange(groups.size())) { - for (const auto gj : c10::irange(groups.size())) { - for (auto i : groups[gi]) { - for (auto j : groups[gj]) { - TORCH_CHECK( - (gi == gj && set.areEquivalent(i, j)) || - (gi != gj && !set.areEquivalent(i, j))); - } - } - } - } - - auto all_elements = set.getAllElements(); - std::sort(all_elements.begin(), all_elements.end()); - std::vector group_all_vec(group_all.begin(), group_all.end()); - std::sort(group_all_vec.begin(), group_all_vec.end()); - TORCH_CHECK(all_elements == group_all_vec); - - set.clear(); - all_elements = set.getAllElements(); - TORCH_CHECK(all_elements.size() == 0); - - // All cleared. Nothing should be considered equivalent. - for (auto i : group_all) { - for (auto j : group_all) { - TORCH_CHECK(!set.areEquivalent(i, j)); - } - } -} - -TEST(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(2); - auto tv2 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addInput(tv2); - - auto tv3 = broadcast(tv0, {false, true}); - auto tv4 = add(tv3, tv1); - auto tv5 = add(tv3, tv2); - - fusion.addOutput(tv4); - fusion.addOutput(tv5); - - // In order to do this, tv1->axis(1) and tv2->axis(1) must have the - // same size, but we can't prove it, so this should throw an error. 
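  // (To elaborate on the note above: the broadcast axis of tv3 is consumed by
  // both tv4 = tv3 + tv1 and tv5 = tv3 + tv2, so fully inlining tv3 would
  // require tv1->axis(1) and tv2->axis(1) to have a provably identical extent,
  // which these symbolic inputs cannot guarantee.)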
- ASSERT_ANY_THROW(tv3->computeAt(tv4, -1)); -} - -TEST(NVFuserTest, FusionBiasGeluFwd_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const float k_079 = 0.79788456; - const float k_004 = 0.044715; - - // bias vector - auto t0 = makeSymbolicTensor(1, DataType::Half); - fusion.addInput(t0); - auto t1 = castOp(DataType::Float, t0); - // input tensor - auto t2 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(t2); - auto t3 = castOp(DataType::Float, t2); - auto t4 = broadcast(t1, {true, true, false}); - auto t5 = add(t4, t3); - auto t6 = mul(t5, new Double(0.5)); - auto t7 = mul(t5, new Double(k_079)); - auto t8 = mul(t5, new Double(k_004)); - auto t9 = mul(t8, t5); - auto t10 = add(t9, new Int(1)); - auto t11 = mul(t7, t10); - auto t12 = unaryOp(UnaryOpType::Tanh, t11); - auto t13 = add(t12, new Double(1)); - auto t14 = mul(t6, t13); - auto t15 = castOp(DataType::Half, t14); - fusion.addOutput(t15); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::manual_seed(0); - std::vector input_shape{6, 512, 4096}; - std::vector bias_shape{4096}; - - auto at_input = at::randn(input_shape, options); - auto at_bias = at::randn(bias_shape, options); - - auto at_x = - at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float); - auto aten_output_float = - at_x * 0.5 * (1.0 + (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh()); - auto aten_output = aten_output_float.to(c10::ScalarType::Half); - - std::vector aten_inputs = {at_bias, at_input}; - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBiasGeluBwd_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - const float k_079 = 0.79788456; - const float k_004 = 0.044715; - const float k_010 = 0.1070322243; - - // gradient tensor - auto t0 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(t0); - auto t1 = castOp(DataType::Float, t0); - // bias tensor - auto t2 = makeSymbolicTensor(1, DataType::Half); - fusion.addInput(t2); - auto t3 = castOp(DataType::Float, t2); - // input tensor - auto t4 = makeSymbolicTensor(3, DataType::Half); - fusion.addInput(t4); - auto t5 = castOp(DataType::Float, t4); - auto t6 = broadcast(t3, {true, true, false}); - auto t7 = add(t6, t5); - auto t8 = mul(t7, new Double(k_079)); - auto t9 = mul(t7, new Double(k_004)); - auto t10 = mul(t9, t7); - auto t11 = add(t10, new Int(1)); - auto t12 = mul(t8, t11); - auto t13 = unaryOp(UnaryOpType::Tanh, t12); - auto t14 = mul(t7, new Double(0.5)); - auto t15 = mul(t13, t13); - auto t16 = unaryOp(UnaryOpType::Neg, t15); - auto t17 = add(t16, new Int(1)); - auto t18 = mul(t7, new Double(k_010)); - auto t19 = mul(t18, t7); - auto t20 = add(t19, new Double(k_079)); - auto t21 = mul(t17, t20); - auto t22 = mul(t14, t21); - auto t23 = add(t13, new Int(1)); - auto t24 = mul(t23, new Double(0.5)); - auto t25 = add(t22, t24); - auto t26 = mul(t25, t1); - // Save float output for validation - fusion.addOutput(t26); - auto t27 = castOp(DataType::Half, t26); - fusion.addOutput(t27); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::manual_seed(1); - std::vector input_shape{6, 512, 4096}; - std::vector bias_shape{4096}; - auto at_input = at::randn(input_shape, options); - auto at_bias = 
at::randn(bias_shape, options); - auto at_grad = at::randn(input_shape, options); - - auto at_x = - at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float); - auto at_tanh_out = (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh(); - auto at_ff = 0.5 * at_x * - ((1 - at_tanh_out * at_tanh_out) * (k_079 + k_010 * at_x * at_x)) + - 0.5 * (1 + at_tanh_out); - auto at_out = at_ff * at_grad; - auto at_out_half = at_out.to(c10::ScalarType::Half); - - std::vector aten_inputs = {at_grad, at_bias, at_input}; - std::vector aten_outputs = {at_out, at_out_half}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs, lparams); - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -// Reproducer of issue #459 -TEST(NVFuserTest, FusionIssue459_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv1, tv3); - - // Create two outputs from the final arithmetic result - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - auto tv6 = add(tv4, new Double(1)); - fusion.addOutput(tv6); - - // Scheduling - for (auto output : ir_utils::filterByType(fusion.outputs())) { - output->merge(-2, -1); - } - for (auto output : ir_utils::filterByType(fusion.outputs())) { - output->split(0, 128); - } - - tv0->computeAt(tv5, -1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - tv6->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - const int numel_x = 10; - const int numel_y = 20; - auto t0 = at::randn({numel_x}, options); - auto t1 = at::randn({numel_y, numel_x}, options); - auto aten_output = (t0 + 1).unsqueeze(0) + t1 + 1; - - std::vector aten_inputs = {t0, t1}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, - cg_outputs, - aten_inputs, - {aten_output, aten_output}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionSmemIndexingSimple_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Global); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - auto aten_input = at::randn({12, 34}, options); - at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0; - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSmemIndexing_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); - Int* symbolic_split_k_tile_dim = new Int(); - Int* symbolic_block_k_tile_dim = new Int(); - // Compile-time integer for tiling - int n_smem_tile = 32; - - // Symbolic 2D tensors 
TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Sum the K-dim - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Register runtime tile dims as inputs - fusion.addInput(symbolic_m_tile_dim); - fusion.addInput(symbolic_split_k_tile_dim); - fusion.addInput(symbolic_block_k_tile_dim); - - // Make a 3D tile, mix of symbolic and constant, do in reverse order because - // dims are inserted - // [M, rK, N] - tv5->split(2, n_smem_tile); - // [M, rK, No, Ni{32}] - tv5->split(1, symbolic_block_k_tile_dim); - // [M, rKo, rKi{i2}, No, Ni{32}] - tv5->split(1, symbolic_split_k_tile_dim); - // [M, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}] - tv5->split(0, symbolic_m_tile_dim); - // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}] - - // Reorder so all outer tiles are in the leftmost 3 positions - // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}] - // [Mo, No, rKoo, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}] - tv5->reorder({{1, 5}, {5, 1}}); - - // Factor out the outer reduction IterDomain, then run the inter-cta - // reduction, and intra-cta reduction - // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}] - // [Mo, No, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}] - auto tv6 = tv5->rFactor({2}); - - // Scope computations - tv6->computeAt(tv5, 2); - - // [Mo, No, rKoo, Koi{i1}, Ki{i2}, Mi{i0}, Ni{32}] - // [Mo, No, Ki{i2}, Mi{i0}, Ni{32}, rKoo, Koi{i1}] - tv6->reorder({ - {5, -2}, - {6, -1}, - {2, 2}, - {3, 3}, - {4, 4}, - }); - - // Setup compute at schedule - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - tv4->computeAt(tv6, -1); - - // Cache smem tiles - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; - for (auto tv : tv_list) { - tv->axis(-2)->parallelize(ParallelType::TIDz); - tv->axis(-1)->parallelize(ParallelType::TIDy); - } - - constexpr int M = 31, K = 65, N = 32; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - // A, B, m_tile_dim, split_k, intra_cta_tile - std::vector aten_inputs = {t0, t1, 3, 4, 5}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -// Reproducer of issue 408 -TEST(NVFuserTest, FusionCacheBeforeReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - fusion.addOutput(tv2); - - tv2->split(0, 4); - - auto tv3 = tv2->cache_before(); - - tv0->computeAt(tv3, -1); - tv3->computeAt(tv2, -1); - - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - - const int numel_x = 100; - const int numel_y = 200; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_x}, options); - - auto aten_output = (aten_input + 1).to(at::kDouble).sum({1}); - - fe.runFusion({aten_input}, {cg_output}); - - testValidate( - &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(3); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - auto tv4 = tv2->cache_before(); - - tv4->computeAt(tv3, 1); - tv0->computeAt(tv4, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 10; - const int numel_y = 20; - const int numel_z = 30; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options); - auto t2 = (aten_input + 1).to(at::kDouble).sum({1}); - auto t3 = t2 + 1; - std::vector aten_outputs = {t2, t3}; - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue367_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); - Int* symbolic_split_k_tile_dim = new Int(); - Int* symbolic_block_k_tile_dim = new Int(); - // Compile-time integer for tiling - int n_smem_tile = 32; - - // Symbolic 2D tensors TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Sum the K-dim - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Register runtime tile dims as inputs - fusion.addInput(symbolic_m_tile_dim); - fusion.addInput(symbolic_split_k_tile_dim); - fusion.addInput(symbolic_block_k_tile_dim); - - // Make a 3D tile, mix of symbolic and constant, do in reverse order because - // dims are inserted - // [M, K, N] - tv5->split(2, n_smem_tile); - tv5->split(1, symbolic_block_k_tile_dim); - tv5->split(1, symbolic_split_k_tile_dim); - tv5->split(0, symbolic_m_tile_dim); - // [Mo, Mi, Koo, Koi, Ki, No, Ni] - tv5->reorder({{1, 5}, {5, 1}}); - // [Mo, No, Koo, Koi, Ki, Mi, Ni] - - auto tv6 = tv5->rFactor({2}); - auto tv7 = tv5->rFactor({2}); - // [Mo, No, rKoo, Koi, Ki, Mi, Ni] - // [Mo, No, rKoi, rKi, Mi, Ni] - - // Scope computations - tv6->computeAt(tv5, 2); - - tv0->computeAt(tv6, 3); - tv1->computeAt(tv6, 3); - tv4->computeAt(tv6, -1); - - // Cache smem tiles - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - 
tv4->setMemoryType(MemoryType::Local); - tv6->setMemoryType(MemoryType::Local); - tv7->setMemoryType(MemoryType::Local); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - std::vector tv_list = {tv2, tv3, tv4, tv5, tv6, tv7}; - for (auto tv : tv_list) { - tv->axis(-2)->parallelize(ParallelType::TIDz); - tv->axis(-1)->parallelize(ParallelType::TIDy); - } - tv2->axis(3)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::TIDx); - tv4->axis(3)->parallelize(ParallelType::TIDx); - tv6->axis(3)->parallelize(ParallelType::TIDx); - tv7->axis(2)->parallelize(ParallelType::TIDx); - - tv2->axis(4)->parallelize(ParallelType::BIDx); - tv3->axis(4)->parallelize(ParallelType::BIDx); - tv4->axis(4)->parallelize(ParallelType::BIDx); - tv6->axis(4)->parallelize(ParallelType::BIDx); - tv7->axis(3)->parallelize(ParallelType::BIDx); - tv5->axis(2)->parallelize(ParallelType::BIDx); - - constexpr int M = 3, K = 6, N = 16; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - // A, B, m, split_k, block_k - std::vector aten_inputs = {t0, t1, 2, 2, 3}; - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue468_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - tv1->axis(0)->parallelize(ParallelType::TIDy); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv2->axis(0)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({10, 100}, options); - at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0}); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue363_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Symbolic 2D tensors TV0[M, K], TV1[K, N] - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(2); - - // Broadcast tv0 to [M, K, *] - TensorView* tv2 = broadcast(tv0, {false, false, true}); - // Broadcast tv1 to [*, K, N] - TensorView* tv3 = broadcast(tv1, {true, false, false}); - - // Pointwise multiplication resulting in tv3[M, K, N] - TensorView* tv4 = mul(tv2, tv3); - - // Sum the K-dim - TensorView* tv5 = sum(tv4, {1}); - - // Register inputs and outputs - fusion.addInput(tv0); - fusion.addInput(tv1); - fusion.addOutput(tv5); - - tv2->setMemoryType(MemoryType::Global); - tv3->setMemoryType(MemoryType::Global); - tv4->setMemoryType(MemoryType::Global); - - tv0->computeAt(tv5, -1); - tv1->computeAt(tv5, -1); - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::BIDy); - - tv5->axis(2)->parallelize(ParallelType::BIDx); - - constexpr int M = 3, K = 6, N = 16; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = 
at::randn({K, N}, options); - at::Tensor aten_output = - mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - - std::vector aten_inputs = {t0, t1}; - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue484_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = add(tv1, new Double(0)); - fusion.addOutput(tv2); - - tv1->setMemoryType(MemoryType::Global); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - constexpr int M = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({M, M}, options); - at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue329_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - fusion.addOutput(tv2); - auto tv3 = sum(tv1, {1}); - fusion.addOutput(tv3); - - tv1->computeAt(tv2, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - std::vector t0_shape{17, 19}; - auto aten_input = at::randn(t0_shape, options); - auto t2 = (aten_input + 1).to(at::kDouble).sum({1}); - auto t3 = (aten_input + 1).to(at::kDouble).sum({1}); - std::vector aten_outputs = {t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue382_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = broadcast(tv1, {false, false, true}); - auto tv3 = makeSymbolicTensor(3); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv2->merge(1); - tv4->merge(1); - - tv1->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - - tv1->setMemoryType(MemoryType::Global); - tv2->setMemoryType(MemoryType::Global); - - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 12; - const int numel_y = 34; - const int numel_z = 56; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - auto t0 = at::randn({numel_x, numel_y}, options); - auto t3 = at::randn({numel_x, numel_y, numel_z}, options); - - std::vector aten_inputs = {t0, t3}; - auto aten_output = (t0 + 1).unsqueeze(-1) + t3; - - auto cg_outputs = fe.runFusion(aten_inputs); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue507_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->setMemoryType(MemoryType::Shared); - - tv1->axis(1)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - 
tv1->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - std::vector t0_shape{17, 19}; - auto aten_input = at::randn(t0_shape, options); - auto t1 = (aten_input + 1); - auto aten_output = (t1 + 1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion({aten_input}); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue532_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(1)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - const int M_BLOCK = 64; - const int M_THREAD = 4; - - tv2->split(0, M_BLOCK); - // tv2: [M/M_BLOCK, M_BLOCK] - tv1->computeAt(tv2, 1); - // tv1: [M/M_BLOCK, M_BLOCK] - - tv1->split(-1, M_BLOCK / M_THREAD); - // tv1: [M/M_BLOCK, M_THREAD, M_BLOCK / M_THREAD] - - tv2->split(-1, M_THREAD); - // tv2: [M/M_BLOCK, M_BLOCK / M_THREAD, M_THREAD] - - constexpr int M = 1000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0 + 1 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionLoopUnswitch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Algorithm - TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(1)); - fusion.addInput(tv0); - fusion.addOutput(tv2); - - tv2->split(0, 32); - tv1->computeAt(tv2, -1); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - - constexpr int M = 1000; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0 + 1 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue549_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); // M, K - TensorView* tv1 = makeSymbolicTensor(2); // K, N - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - - TensorView* tv3 = broadcast(tv2, {false, false, true}); - // tv3[I0, I1, B] = tv0[I0, I1] - - TensorView* tv4 = broadcast(tv1, {true, false, false}); - // tv4[B, I1, I2] = tv1[I1, I2] - - // tv5[I0, I1, I2] = tv3[I0, I1, B] * tv4[B, I1, I2] - TensorView* tv5 = mul(tv3, tv4); - // tv6[I0, R1, I2] = tv5[I0, I1, I2] - TensorView* tv6 = sum(tv5, {1}); - fusion.addOutput(tv6); - - tv6->split(1, 32); - // tv6[I0, R1o, R1i{32}, I2] - - auto tv7 = tv6->rFactor({1}); - // tv7[I0, R1o, I1i{32}, I2] = tv5[I0, I1, I2] - // tv6[I0, , R1i{32}, I2] = tv7[I0, R1o, I1i{32}, I2] - - tv6->split(0, 4); - tv6->split(-1, 4); - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - - tv0->computeAt(tv6, -1); - tv1->computeAt(tv6, -1); - - // tv7[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] - // tv6[I0o, 
I0i{4}, , R1i{32}, I2o, I2i{4}] - //--> (line symbolizes compute at location) - // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] - // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv0->computeAt(tv7, -1); - tv1->computeAt(tv7, -1); - // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] - // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] - // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv6->axis(0)->parallelize(ParallelType::BIDz); - tv6->axis(1)->parallelize(ParallelType::TIDz); - - tv6->axis(-2)->parallelize(ParallelType::BIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDy); - - tv6->axis(2)->parallelize(ParallelType::TIDx); - tv7->axis(2)->parallelize(ParallelType::TIDx); - - constexpr int M = 65, K = 33, N = 17; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); - - // Make sure bad launch params throws - // TODO: Re-enable once we have parallelization validation in. - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); - - // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); - - auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble)); - - testValidate( - &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, simplecompileRtc_CUDA) { - FusionExecutor fe; - std::string kernel = R"( -__global__ void kernel1(Tensor T0, Tensor T1) { - if(threadIdx.x==0){ - for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) { - T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2; - } - } -} - )"; - fe.compileRtc(kernel, "CudaCodeGen::kernel1"); - LaunchParams lp( - 256, // gdimx - 1, // gdimy - 1, // gdimz - 1, // bdimx - 1, // bdimy - 1 // bdimz - ); - lp.setSmem(0); - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const std::vector tensor_dims = {8}; - auto in0 = at::randn(tensor_dims, options); - auto out0 = at::empty_like(in0); - fe.runRtc(lp, {in0, out0}); - - auto out_ref = in0 * 2; - TORCH_CHECK(out_ref.allclose(out0)); -} - -TEST(NVFuserTest, FusionSerialWelford_CUDA) { - FusionExecutor fe; - int x = 128, y = 64, z = 64; - - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_var, - Tensor out_avg -){ - for(int i0=0;i0 tensor_dims = {x, y, z}; - auto in0 = at::randn(tensor_dims, options); - auto out_var = at::empty({x}, options); - auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_var, out_avg}); - - TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var)); - TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); -} - -TEST(NVFuserTest, FusionBlockWelford_CUDA) { - FusionExecutor fe; - int x = 7, y = 8, z = 9; - - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_avg, - Tensor out_var, - Tensor init_avg, - Tensor init_var, - Tensor init_N -){ - //actual generated kernel will use dynamic shared mem, - // here is just for prototype - __shared__ float mem_avg[512]; - __shared__ float mem_M2[512]; - __shared__ long mem_N[512]; - float in=inp[threadIdx.x*inp.stride[0]+ - threadIdx.y*inp.stride[1]]; - float tmp_avg=0; - float tmp_M2=0; - long tmp_N=0; - blockWelford( - tmp_avg, - tmp_M2, - tmp_N, - in, - 0.f, - (long)1, - threadIdx, - 
blockDim, - (float*)mem_avg, - (float*)mem_M2, - (long*)mem_N, - (bool)(threadIdx.x tensor_dims = {x, y}; - const std::vector init_dims = {x, z}; - - // generate initial values - auto init_in = at::randn(init_dims, options); - auto init_var = init_in.var({1}, false); - auto init_avg = init_in.mean({1}); - auto init_N = - at::tensor(z, at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0)); - - auto in0 = at::randn(tensor_dims, options); - - // run kernel - auto out_var = at::zeros({x}, options); - auto out_avg = at::zeros({x}, options); - fe.runRtc(lp, {in0, out_avg, out_var, init_avg, init_var, init_N}); - - // compare with reference output - auto cat_tensor = at::cat({init_in, in0}, 1); - TORCH_CHECK(cat_tensor.var({1}, false).allclose(out_var)); - TORCH_CHECK( - cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); -} - -TEST(NVFuserTest, FusionBlockWelfordNoInit_CUDA) { - FusionExecutor fe; - int x = 7, y = 8, z = 9; - - // need support IValue for integer input as initial count - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_avg, - Tensor out_var -){ - //actual generated kernel will use dynamic shared mem, - // here is just for prototype - __shared__ float mem_avg[512]; - __shared__ float mem_M2[512]; - __shared__ long mem_N[512]; - float in=inp[threadIdx.x*inp.stride[0]+ - threadIdx.y*inp.stride[1]+ - threadIdx.z*inp.stride[2]]; - float tmp_avg=0; - float tmp_M2=0; - long tmp_N=0; - block_sync::init(); - blockWelford( - tmp_avg, - tmp_M2, - tmp_N, - in, - 0.f, - (long) 1, - threadIdx, - blockDim, - (float*)mem_avg, - (float*)mem_M2, - (long*)mem_N, - (bool)(threadIdx.x tensor_dims = {x, y, z}; - auto in0 = at::randn(tensor_dims, options); - auto out_var = at::empty({x}, options); - auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_avg, out_var}); - - TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var)); - TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); -} - -TEST(NVFuserTest, FusionGridWelfordNoInit_CUDA) { - FusionExecutor fe; - int x = 128, y = 64, z = 128; - - std::string kernel = R"( -__global__ void kernel1( - Tensor inp, - Tensor out_avg, - Tensor out_var, - Tensor work_buf_avg, - Tensor work_buf_M2, - Tensor work_buf_N, - Tensor sync_flag -){ - __shared__ float shared_buf_avg[512]; - __shared__ float shared_buf_M2[512]; - __shared__ long shared_buf_N[512]; - float tmp_avg=0; - float tmp_M2=0; - long tmp_N=0; - float in = inp[ blockIdx.x * inp.stride[0]+ - blockIdx.y * inp.stride[1]+ - threadIdx.x * inp.stride[2]]; - block_sync::init(); - welford::gridWelford< - true,true,false, - true,false,false, - false - >( - tmp_avg, - tmp_M2, - tmp_N, - in, - 0.f, - (long) 1, - &work_buf_avg[0], - &work_buf_M2[0], - &work_buf_N[0], - sync_flag, - (float*)shared_buf_avg, - (float*)shared_buf_M2, - (long*)shared_buf_N, - threadIdx.x tensor_dims = {x, y, z}; - auto in0 = at::randn(tensor_dims, options); - - auto out_avg = at::empty({z}, options); - auto out_var = at::empty({z}, options); - auto work_buf_avg = at::empty({x * y * z}, options); - auto work_buf_var = at::empty({x * y * z}, options); - auto work_buf_N = at::empty({x * y * z}, options_int); - auto sync_flag = at::zeros({1}, options_int); - fe.runRtc( - lp, - {in0, - out_avg, - out_var, - work_buf_avg, - work_buf_var, - work_buf_N, - sync_flag}); - std::vector dims{0, 1}; - - TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); - TORCH_CHECK(in0.var(dims, false).allclose(out_var)); -} - -TEST(NVFuserTest, 
FusionWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->split(1, 32); - tv_avg->split(0, 32); - tv_avg->split(0, 4); - tv_avg->reorder({{-1, -3}, {-3, -1}}); - tv1->computeAt(tv_avg, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->computeAt(tv_avg, -1); - - // - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t_var = at::empty({M}, options); - at::Tensor t_avg = at::empty({M}, options); - at::Tensor t_N = at::empty({M}, options_int); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionGridWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->axis(0)->parallelize(ParallelType::TIDx); - tv_avg->axis(-1)->parallelize(ParallelType::BIDx); - - tv1->computeAt(tv_avg, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t_avg = at::empty({M}, options); - at::Tensor t_var = at::empty({M}, options); - at::Tensor t_N = at::empty({M}, options_int); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - 
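The Welford tests in this file all post-process the second output with outputs[1] /= N (or /= rdim) before validating, because the op reports M2, the running sum of squared differences from the mean, rather than the variance itself. As a point of reference only, below is a minimal standalone sketch of the serial Welford update in plain C++ (independent of the nvfuser kernels and of ATen; the variable names are illustrative), showing why M2 / n matches the population variance computed by var(..., /*unbiased=*/false):

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Maintains (avg, M2, n), the same three running quantities the Welford op
// returns as (tv_avg, tv_M2, tv_N) in the tests above.
int main() {
  const std::vector<double> xs = {1.0, 2.0, 4.0, 8.0};

  double avg = 0.0; // running mean
  double M2 = 0.0;  // running sum of squared differences from the current mean
  long n = 0;       // number of elements folded in so far

  for (const double x : xs) {
    ++n;
    const double delta = x - avg;
    avg += delta / n;
    M2 += delta * (x - avg); // uses the already-updated mean
  }

  // Population variance, i.e. the same normalization as var(..., false).
  const double var = M2 / n;

  // Naive two-pass reference.
  double mean_ref = 0.0;
  for (const double x : xs) {
    mean_ref += x;
  }
  mean_ref /= xs.size();
  double var_ref = 0.0;
  for (const double x : xs) {
    var_ref += (x - mean_ref) * (x - mean_ref);
  }
  var_ref /= xs.size();

  assert(std::fabs(avg - mean_ref) < 1e-12);
  assert(std::fabs(var - var_ref) < 1e-12);
  std::printf("avg=%.4f M2=%.4f n=%ld var=%.4f\n", avg, M2, n, var);
  return 0;
}

Keeping (avg, M2, n) rather than the variance itself is also what makes partial results cheap to merge, which is what the blockWelford/gridWelford runtime functions exercised above rely on when combining per-thread and per-block contributions.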
-TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv_avg->split(1, 4); - auto rtvs = tvs.rFactor({2}); - tv1->computeAt(tv_avg, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - at::Tensor t_avg = at::empty({M}, options); - at::Tensor t_var = at::empty({M}, options); - at::Tensor t_N = at::empty({M}, options_int); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - testValidate( - &fusion, - outputs, - {t0}, - {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionWelfordSchedule_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - int M = 64, N = 128; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); - auto tvs = Welford(tv1, {1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - // TODO: Why do we use launch params from here, but not scheduling??? 
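  // (Regarding the TODO: getReductionHeuristics() below returns a
  // ReductionParams value that carries both the scheduling decisions and the
  // launch parameters; scheduleReduction() consumes the former, and the test
  // then forwards reduction_params.value().lparams to runFusion() explicitly.)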
- auto reduction_params = getReductionHeuristics(&fusion, {t0}); - scheduleReduction(&fusion, reduction_params.value()); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}, reduction_params.value().lparams); - - // by default Welford outputs sum of square diff so need to divide to get var - outputs[1] /= N; - - auto at_avg = t0.mean({1}); - auto at_var = t0.var({1}, false); - auto at_n = at::ones({M}, options_int) * N; - - testValidate( - &fusion, - outputs, - {t0}, - {at_avg, at_var, at_n}, - __LINE__, - __FILE__, - "validate welford", - reduction_params.value().lparams); -} - -namespace { -void testWelford(DataType dtype, int red_axis, int odim, int rdim) { - const int axis = red_axis; - at::ScalarType aten_dtype = data_type_to_aten(dtype); - - Fusion fusion; - FusionGuard fg(&fusion); - TensorView* tv0 = makeSymbolicTensor(2, dtype); - bool is_fp16 = dtype == DataType::Half; - bool is_bf16 = dtype == DataType::BFloat16; - TensorView* tv0_cast = tv0; - if (is_fp16 || is_bf16) { - tv0_cast = castOp(DataType::Float, tv0); - } - fusion.addInput(tv0); - auto tv1 = mul(tv0_cast, new Double(1)); - auto tvs = Welford(tv1, {axis}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - - TensorView* avg_cast = tv_avg; - TensorView* M2_cast = tv_M2; - - if (is_fp16) { - avg_cast = castOp(DataType::Half, tv_avg); - M2_cast = castOp(DataType::Half, tv_M2); - } - if (is_bf16) { - avg_cast = castOp(DataType::BFloat16, tv_avg); - M2_cast = castOp(DataType::BFloat16, tv_M2); - } - - fusion.addOutput(avg_cast); - fusion.addOutput(M2_cast); - fusion.addOutput(tv_N); - - auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - std::vector outputs_of_red; - at::Tensor aten_input = - (axis ? at::randn({odim, rdim}, options) - : at::randn({rdim, odim}, options)); - - if (is_fp16 || is_bf16) { - outputs_of_red.push_back(avg_cast); - outputs_of_red.push_back(M2_cast); - } - - auto reduction_params = getReductionHeuristics(&fusion, {aten_input}); - scheduleReduction(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({aten_input}, reduction_params.value().lparams); - - // by default Welford outputs sum of square diff so need to divide to - // get var - - outputs[1] /= rdim; - - auto at_avg = aten_input.mean({axis}); - auto at_var = aten_input.var({axis}, false); - auto at_n = - (axis ? at::ones({odim, rdim}, options) - : at::ones({rdim, odim}, options)); - at_n = at_n.sum({axis}); - - testValidate( - &fusion, - outputs, - {aten_input}, - {at_avg, at_var, at_n}, - __LINE__, - __FILE__, - "validate welford", - reduction_params.value().lparams); -} -} // namespace - -TEST(NVFuserTest, FusionWelfordShmoo_CUDA) { - std::vector dtypes = { - DataType::Double, DataType::Float, DataType::Half}; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - dtypes.insert(dtypes.end(), DataType::BFloat16); - } -#endif - - std::vector red_axis = {1, 0}; - std::vector output_dims = {160, 320}; - std::vector red_dims; - - // Tried to cut down the number iterations with just - // doing every other power of 2. 
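  // (With i <<= 2 the loop below visits i = 1, 4, 16, ..., i.e. every other
  // power of two, up to 1024 * 1024 = 2^20.)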
- for (int i = 1; i <= 1024 * 1024; i <<= 2) { - red_dims.push_back(i); - } - - for (auto dtype : dtypes) { - for (auto& axis : red_axis) { - for (auto& odim : output_dims) { - for (auto& rdim : red_dims) { - // TODO: original welford algorithm actually keeps a running sum of - // squares, i.e. M_{2n} in the - // cf: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - // algorithm notation, and it can reach inf for large numbers - // with half precision. skipping too large volumes for half for - // nwo might need further numerical experiments to re-design - // this. - if (rdim > 32768 && - (dtype == DataType::Half || dtype == DataType::BFloat16)) { - continue; - } - testWelford(dtype, axis, odim, rdim); - } - } - } - } -} - -TEST(NVFuserTest, FusionTranspose1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addInput(tv0); - fusion.addOutput(tv1); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0.t(); - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTranspose2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addInput(tv0); - fusion.addOutput(tv1); - - tv1->merge(0); - tv1->split(0, 32); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - at::Tensor aten_output = t0.t(); - - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - - TensorView* tv0 = makeSymbolicTensor(2); // K, M - TensorView* tv1 = makeSymbolicTensor(2); // N, K - fusion.addInput(tv0); - fusion.addInput(tv1); - - TensorView* tv0_t = transpose(tv0, {{0, 1}}); - TensorView* tv1_t = transpose(tv1, {{0, 1}}); - - TensorView* tv2 = broadcast(tv0_t, {false, false, true}); - // tv2[I0, I1, B] = tv0[I0, I1] - - TensorView* tv3 = broadcast(tv1_t, {true, false, false}); - // tv3[B, I1, I2] = tv1[I1, I2] - - // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2] - TensorView* tv4 = mul(tv2, tv3); - // tv5[I0, R1, I2] = tv4[I0, I1, I2] - TensorView* tv5 = sum(tv4, {1}); - fusion.addOutput(tv5); - - tv5->split(1, 32); - // tv5[I0, R1o, R1i{32}, I2] - - auto tv6 = tv5->rFactor({1}); - // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2] - // tv5[I0, , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2] - - tv5->split(0, 4); - tv5->split(-1, 4); - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}] - - tv0_t->computeAt(tv5, -1); - tv1_t->computeAt(tv5, -1); - - // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}] - // tv5[I0o, 
I0i{4}, , R1i{32}, I2o, I2i{4}] - //--> (line symbolizes compute at location) - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv0_t->computeAt(tv6, -1); - tv1_t->computeAt(tv6, -1); - // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |] - // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |] - // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|] - - tv5->axis(0)->parallelize(ParallelType::BIDz); - tv5->axis(1)->parallelize(ParallelType::TIDz); - - tv5->axis(-2)->parallelize(ParallelType::BIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - - tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(2)->parallelize(ParallelType::TIDx); - - constexpr int M = 65, K = 33, N = 17; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({K, M}, options); - at::Tensor t1 = at::randn({N, K}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); - - // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); - - auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble)); - - testValidate( - &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int tidx = 32; - const int dimx = 32; - const int dimy = 16; - const int dimz = 130; - - // Set up your input tensor views - TensorView* input_tv0 = makeSymbolicTensor(3); - fusion.addInput(input_tv0); - - TensorView* input_t = transpose(input_tv0, {{1, 2}}); - - TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_t); - TensorView* sum_exp_tv2 = sum(exp_tv1, {-1}); - TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true}); - - // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be - // computed at sum_exp_rf_tv8. 
- TensorView* input_t_copy = transpose(input_tv0, {{1, 2}}); - TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_t_copy); - - TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3); - - fusion.addOutput(output_tv4); - - bcast_sum_tv3->split(-1, tidx); - - sum_exp_tv2->split(-1, tidx); - TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2}); - - output_tv4->split(-1, tidx); - - input_t->computeAt(sum_exp_rf_tv5, -1); - input_t_copy->computeAt(output_tv4, -1); - - TensorView* tensors_to_parallelize[] = { - sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5}; - - for (auto tv : tensors_to_parallelize) { - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({dimx, dimz, dimy}, options); - - at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_input_t = at::transpose(input, 1, 2); - auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { - // Case 1 - // tv1 = tv0 * 0.5 - // tv2 = tv1 * -1 - // tv3 = tv1 + 3 - // tv4 = tv1 * 2 - // tv5 = tv3 + tv2 - // tv6 = tv5 + tv4 - // tv7 = tv1 + tv4 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}}); - - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); - TensorView* tv5 = add(tv3, tv2); - - TensorView* tv6 = add(tv5, tv4); - TensorView* tv7 = add(tv1, tv4); - - fusion.addOutput(tv6); - fusion.addOutput(tv7); - - // Lets setup to actually run - tv7->merge(0); - tv7->split(0, 128); - tv7->split(0, 4); - - tv7->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv7, 1); - - // The this-position of the last tensor should be zero. - TORCH_CHECK( - tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 && - tv7->getMaxProducerPosition() == 1); - TORCH_CHECK( - tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 && - tv6->getMaxProducerPosition() == 1); - // The position of every other tensor should be 1. 
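  // (Position 1 refers to the compute-at position: after tv0->computeAt(tv7, 1)
  // above, these intermediates are produced inside the outermost,
  // BIDx-parallelized loop of tv7, while tv6 and tv7 themselves keep a
  // compute-at position of 0.)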
- for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { - TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - } - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::randn({129, 127}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - at::Tensor aten_input_t = aten_input.t(); - - auto t1 = aten_input_t.mul({0.5}); - auto t2 = t1.mul({-1.0}); - auto t3 = t1.add({3.0}); - auto t4 = t1.mul({2.0}); - auto t5 = t3.add(t2); - auto t6 = t5.add(t4); - auto t7 = t1.add(t4); - - std::vector aten_outputs = {t6, t7}; - - testValidate( - &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { - // Case 2 - // tv1 = tv0 * -1 - // tv2 = tv0 + 3 - // tv3 = tv0 * 2 - // tv4 = tv2 + tv1 - // tv5 = tv4 + tv3 - // tv6 = tv5 + tv3 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}}); - - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); - TensorView* tv4 = add(tv2, tv1); - - TensorView* tv5 = add(tv4, tv3); - TensorView* tv6 = add(tv5, tv3); - - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // Lets setup to actually run - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv6, 1); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({129, 127}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input}); - - auto input_t = input.t(); - auto t1 = input_t.mul({-1.0}); - auto t2 = input_t.add({3.0}); - auto t3 = input_t.mul({2.0}); - auto t4 = t2.add(t1); - auto t5 = t4.add(t3); - auto t6 = t5.add(t3); - - std::vector aten_outputs = {t5, t6}; - - testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { - // Case 3 - // T2 = T1 * 0.979361 - // T3 = T2 * T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - tv1 = transpose(tv1, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv2 = mul(tv1, new Double(.979361)); - TensorView* tv3 = mul(tv2, tv0); - - fusion.addOutput(tv3); - - // Lets setup to actually run - while (tv3->nDims() > 1) - tv3->merge(0); - tv3->split(0, 128); - tv3->split(0, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = 
static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t0_t = t0.permute({3, 0, 1, 2}); - auto t1_t = t1.permute({3, 0, 1, 2}); - auto t2 = t1_t.mul({0.979361}); - auto aten_output = t2.mul(t0_t); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { - // Case 4 - // T4 = T2 - T3 - // T5 = T1 + T4 - // T6 = T5 - T0 - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(4); - fusion.addInput(tv0); - - tv0 = transpose(tv0, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv1 = makeSymbolicTensor(4); - fusion.addInput(tv1); - - tv1 = transpose(tv1, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv2 = makeSymbolicTensor(4); - fusion.addInput(tv2); - - tv2 = transpose(tv2, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv3 = makeSymbolicTensor(4); - fusion.addInput(tv3); - - tv3 = transpose(tv3, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - - TensorView* tv4 = sub(tv2, tv3); - TensorView* tv5 = add(tv1, tv4); - TensorView* tv6 = sub(tv5, tv0); - - fusion.addOutput(tv6); - - // Lets setup to actually run - while (tv6->nDims() > 1) - tv6->merge(0); - tv6->split(0, 128); - tv6->split(0, 4); - - tv0->computeAt(tv6, 1); - tv1->computeAt(tv6, 1); - tv2->computeAt(tv6, 1); - tv3->computeAt(tv6, 1); - - tv6->axis(0)->parallelize(ParallelType::BIDx); - - for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && - val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - - tv->axis(1)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({129, 127, 63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - at::Tensor t2 = at::rand_like(t0, options); - at::Tensor t3 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1, t2, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t0_t = t0.permute({3, 0, 1, 2}); - auto t1_t = t1.permute({3, 0, 1, 2}); - auto t2_t = t2.permute({3, 0, 1, 2}); - auto t3_t = t3.permute({3, 0, 1, 2}); - auto t4 = t2_t.sub(t3_t); - auto t5 = t1_t.add(t4); - auto aten_output = t5.sub(t0_t); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { - // Case 5 - // tv2 = tv0 + 2.0 - // tv3 = tv1 * tv2 - Fusion fusion; - FusionGuard fg(&fusion); - - // Set up your input tensor views - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - tv0 = transpose(tv0, {{0, 1}}); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - tv1 = transpose(tv1, {{0, 1}}); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv3->merge(0); - tv3->split(-1, 8); - tv3->split(-1, 4); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t2 = t0.t().add(2.0); - auto aten_output = t1.t().mul(t2); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - tv0 = transpose(tv0, {{0, 1}}); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - tv1 = transpose(tv1, {{0, 1}}); - TensorView* tv2 = add(tv0, new Double(2.0)); - TensorView* tv3 = mul(tv1, tv2); - fusion.addOutput(tv3); - - tv2->merge(0); - tv2->split(-1, 8); - tv2->split(-1, 4); - tv3->merge(0); - tv3->split(-1, 8); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({63, 65}, options); - at::Tensor t1 = at::rand_like(t0, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t2 = t0.t().add(2.0); - auto aten_output = t1.t().mul(t2); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegmentReducePointwise_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(1); - TensorView* tv2 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - fusion->addInput(tv2); - - TensorView* tv3 = add(tv0, new Double(1)); // Group 0 - TensorView* tv4 = - max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) - TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, - // keeps normalization scheduler away) - TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce) - - fusion->addOutput(tv6); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, 65}, options); - at::Tensor t1 = at::randn({65}, options); - at::Tensor t2 = at::randn({128, 65}, options); - - auto t3 = t0.add(1.0); - auto t4 = std::get<0>(at::max(t3, 0)); - auto t5 = t4.add(t1); - auto t6 = t5.add(t2); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2}); - - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime()->isSegmented(), - "segmentation didn't happen"); - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->groups() - .size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultipleVectorize_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - TensorView* tv0 = makeContigTensor(1); - TensorView* tv1 = makeContigTensor(1); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - TensorView* tv3 = add(tv0, tv1); - fusion->addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({40960}, options); - at::Tensor t1 = 
at::randn({40960}, options); - auto t2 = t0 + t1; - - FusionExecutorCache executor_cache(std::move(fusion)); - executor_cache.profile(true); - - auto outputs = executor_cache.runFusionWithInputs({t0, t1}); - auto runtime1 = executor_cache.getMostRecentKernelRuntime(); - auto log1 = executor_cache.getMostRecentExecutorInfo().pointwise_params; - TORCH_CHECK(log1.has_value()); - TORCH_CHECK(log1->vectorize); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); - - t0 = at::randn({40964}, options); - t1 = at::randn({40964}, options); - t2 = t0 + t1; - - outputs = executor_cache.runFusionWithInputs({t0, t1}); - auto runtime2 = executor_cache.getMostRecentKernelRuntime(); - auto log2 = executor_cache.getMostRecentExecutorInfo().pointwise_params; - TORCH_CHECK(log2.has_value()); - TORCH_CHECK(log2->vectorize); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); - - t0 = at::randn({40962}, options); - t1 = at::randn({40962}, options); - t2 = t0 + t1; - - outputs = executor_cache.runFusionWithInputs({t0, t1}); - auto runtime3 = executor_cache.getMostRecentKernelRuntime(); - auto log3 = executor_cache.getMostRecentExecutorInfo().pointwise_params; - TORCH_CHECK(log3.has_value()); - TORCH_CHECK(log3->vectorize); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__); - - TORCH_CHECK(runtime1 == runtime2); - TORCH_CHECK(runtime1 != runtime3); -} - -TEST(NVFuserTest, FusionVectorizeSimple_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* tv0 = makeContigTensor(3); - - fusion.addInput(tv0); - - auto tv1 = unaryOp(UnaryOpType::Sin, tv0); - - fusion.addOutput(tv1); - - auto tv0_cache = tv0->cache_after(); - - auto tv1_cache = tv1->cache_before(); - - tv1->merge(0); - tv1->merge(0); - tv1->split(0, 4); - tv1->split(0, 128); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv1, 2); - - tv0_cache->axis(2)->parallelize(ParallelType::Vectorize); - tv1->axis(2)->parallelize(ParallelType::Vectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor aten_input = at::empty({2, 6, 32}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({aten_input}); - - at::Tensor aten_output = aten_input.sin(); - - testValidate( - &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - // dimensionality of the problem - int nDims = 3; - - // Set up your input tensor views - TensorView* tv0 = makeContigTensor(nDims); - TensorView* tv1 = makeContigTensor(nDims); - - // Register your inputs - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); - TensorView* tv3 = add(tv0, tv2); - - // Register your outputs - fusion.addOutput(tv3); - - auto tv0_cache = tv0->cache_after(); - auto tv1_cache = tv1->cache_after(); - auto tv3_cache = tv3->cache_before(); - - // Do transformations, remember, transformations are outputs to inputs - // This doesn't have to be in this order - tv3->merge(1); - - // Split by n_threads - tv3->split(1, 2); - tv3->split(0, 3); - tv3->split(0, 1); - - // [bidx, unswitch, unroll{2}, tidx, vectorize{2}] - - // Parallelize TV3 - tv3->axis(0)->parallelize(ParallelType::BIDx); - 
tv3->axis(1)->parallelize(ParallelType::Unswitch); - tv3->axis(2)->parallelize(ParallelType::Unroll); - tv3->axis(3)->parallelize(ParallelType::TIDx); - - tv3->reorder({{4, 2}}); - // [bidx, unswitch, vectorize{2}, unroll{2}, tidx] - - TransformPropagator::from(tv3); - scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); - - tv0_cache->axis(2)->parallelize(ParallelType::Vectorize); - tv1_cache->axis(2)->parallelize(ParallelType::Vectorize); - tv3->axis(2)->parallelize(ParallelType::Vectorize); - - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - tv1->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input1 = at::randn({64, 2, 128}, options); - at::Tensor input2 = at::rand_like(input1); - at::Tensor output = at::empty_like(input1); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input1, input2}, {output}); - - at::Tensor tv2_ref = input2 + 2.0; - at::Tensor output_ref = input1 + tv2_ref; - - TORCH_CHECK(output_ref.equal(output)); -} - -TEST(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - std::vector input_shape{32, 64, 8}; - const int kReductionAxis = 1; - - auto tv0 = TensorViewBuilder() - .ndims(input_shape.size()) - .dtype(DataType::Double) - .build(); - - fusion->addInput(tv0); - - auto tv1 = add(tv0, new Double(1.0)); - auto tv2 = sum(tv1, {2}); // Group 0 - - auto output = softmax(tv2, kReductionAxis); // Group 1 - fusion->addOutput(output); - - auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto outputs = executor_cache.runFusionWithInputs({at_x}); - - auto t1 = at_x.add(1.0); - auto t2 = t1.sum({2}); - auto t3 = at::_softmax(t2.to(at::kDouble), -1, false); - - auto optimized_fusion = executor_cache.getMostRecentKernelRuntime(); - TORCH_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen"); - TORCH_CHECK( - optimized_fusion->fusionSegments()->groups().size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSwizzle1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = mul(tv1, new Double(2)); - fusion.addOutput(tv2); - - tv2->split(0, 7); - tv2->split(0, 9); - - tv0->computeAt(tv2, 1); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - - tv1->setMemoryType(MemoryType::Shared); - tv1->swizzle(SwizzleType::Transpose, {1, 2}); - - tv1->axis(1)->parallelize(ParallelType::TIDx); - tv1->axis(2)->parallelize(ParallelType::TIDy); - - tv2->axis(1)->parallelize(ParallelType::TIDx); - tv2->axis(2)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({100}, options); - - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = (t0 + 1) * 2; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSwizzle2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto 
tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = mul(tv1, new Double(2)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->split(-2, 4); - - tv2->split(-1, 4); - tv2->split(-2, 4); - - tv0->computeAt(tv2, 1); - - tv2->reorder({{-1, -2}}); - - tv1->setMemoryType(MemoryType::Shared); - tv1->swizzle(SwizzleType::Transpose, {-2, -1}); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({123}, options); - - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = (t0 + 1) * 2; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTransposeWithSwizzle_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addOutput(tv1); - - // tv0: [I0, I1] - // tv1: [I1, I0] - - const int BS = 32; - - // CTA tiling by BS*BS - tv1->split(1, BS); - tv1->split(0, BS); - tv1->reorder({{1, 2}}); - // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] - - // Create a smem buffer to cache each tile - auto tv0_cache = tv0->cache_after(); - tv0_cache->setMemoryType(MemoryType::Shared); - - tv0->computeAt(tv1, 2); - // tv0: [I0, I1] - // tv0_cache: [I1/BS, I0/BS, BS(I1), BS(I0)] - // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] - - // Assign each thread block to a tile - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(1)->parallelize(ParallelType::BIDx); - - // Thread mapping for each tile. For both of the input and output - // tiles, map TIDx to the fastest-changing dimension to facilitate - // coalesced gmem accesses. - tv1->axis(2)->parallelize(ParallelType::TIDy); - tv1->axis(3)->parallelize(ParallelType::TIDx); - // Note that the fastest-changing axis is next to the inner-most - // axis since computeAt reorders the axes as the output tensor. 
- tv0_cache->axis(2)->parallelize(ParallelType::TIDx); - tv0_cache->axis(3)->parallelize(ParallelType::TIDy); - - // Swizzles the smem cache to avoid bank conflicts - tv0_cache->swizzle(SwizzleType::Transpose, {3, 2}); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 100; - const int by = 200; - at::Tensor t0 = at::randn({bx, by}, options); - std::vector<IValue> aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.t(); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = transpose(tv0, {{0, 1}}); - fusion.addOutput(tv1); - - // tv0: [I0, I1] - // tv1: [I1, I0] - - const int BS = 32; - const int BDIM = 256; - - // CTA tiling by BS*BS - tv1->split(1, BS); - tv1->split(0, BS); - tv1->reorder({{1, 2}}); - // tv1: [I1/BS, I0/BS, BS(I1), BS(I0)] - - // Create a smem buffer to cache each tile - auto tv0_cache = tv0->cache_after(); - tv0_cache->setMemoryType(MemoryType::Shared); - - tv0->computeAt(tv1, 2); - // tv0: [I0, I1] - // tv0_cache: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - // tv1: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - - // Transform the tile axes for 1D thread mapping - tv1->merge(-2, -1); - tv1->split(-1, BDIM); - // tv1: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - - // Transform the cache similarly but apply swizzle to the 2D tile axes. - tv0_cache->reorder({{-2, -1}}); - tv0_cache->swizzle(SwizzleType::Transpose, {2, 3}); - tv0_cache->merge(-2, -1); - tv0_cache->split(-1, BDIM); - // tv0: [I1/BS, I0/BS, BS*BS/BDIM, BDIM] - - // Assign each thread block to a tile - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(1)->parallelize(ParallelType::BIDx); - - // Thread mapping for each tile.
- tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 100; - const int by = 200; - at::Tensor t0 = at::randn({bx, by}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.t(); - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridPersistence_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {true}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - std::vector tvs = {tv1, tv2, tv3}; - for (auto tv : tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - } - - const int numel_x = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}).unsqueeze(-1).add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridPersistence2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {true, false}); - auto tv3 = add(tv0, tv2); - fusion.addOutput(tv3); - - std::vector tvs = {tv1, tv2, tv3}; - for (auto tv : tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::TIDy); - tv->axis(2)->parallelize(ParallelType::TIDx); - } - - const int numel_x = 10; - const int numel_y = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = input.sum({0}).unsqueeze(0).add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWelfordPersistence_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tvs = Welford(tv0, {0}); - auto tv4 = add(tvs.avg, tvs.var_sum); - auto tv5 = broadcast(tv4, {true}); - auto tv6 = add(tv0, tv5); - fusion.addOutput(tv6); - - std::vector schedule_tvs = { - tvs.avg, tvs.var_sum, tvs.n, tv5, tv6}; - - for (auto tv : schedule_tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::BIDy); - } - - const int numel_x = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) - .unsqueeze(-1) - .add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, 
FusionWelfordPersistence2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tvs = Welford(tv0, {0}); - auto tv4 = add(tvs.avg, tvs.var_sum); - auto tv5 = broadcast(tv4, {true, false}); - auto tv6 = add(tv0, tv5); - fusion.addOutput(tv6); - - std::vector schedule_tvs = { - tvs.avg, tvs.var_sum, tvs.n, tv5, tv6}; - for (auto tv : schedule_tvs) { - tv->split(0, 2); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::TIDy); - tv->axis(2)->parallelize(ParallelType::TIDx); - } - tv4->axis(0)->parallelize(ParallelType::TIDx); - - const int numel_x = 10; - const int numel_y = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto out = fe.runFusion({input}); - - auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) - .unsqueeze(0) - .add(input); - - testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue633_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int dx = 10; - const int dy = 11; - const int dz = 12; - - auto tv0 = makeConcreteTensor({dx, dy, dz}); - fusion.addInput(tv0); - auto tv1 = makeConcreteTensor({dx, dy, 1}); - fusion.addInput(tv1); - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->merge(1); - tv2->merge(0); - tv2->split(-1, 128); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({dx, dy, dz}, options); - at::Tensor t1 = at::randn({dx, dy, 1}, options); - std::vector aten_inputs = {t0, t1}; - - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionKirScoping_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - fusion.addOutput(tv2); - - tv2->merge(0); - tv2->split(0, 4); - tv0->computeAt(tv2, -1); - - GpuLower gpulw(&fusion); - - auto kir_tv1 = gpulw.lowerValue(tv1); - auto tv1_scope = kir_tv1->definition()->scope(); - TORCH_CHECK(tv1_scope != nullptr); - TORCH_CHECK(tv1_scope->owner()->as()); - - auto kir_tv2 = gpulw.lowerValue(tv2); - auto tv2_scope = kir_tv2->definition()->scope(); - TORCH_CHECK(tv2_scope != nullptr); - TORCH_CHECK(tv2_scope->owner()->as()); - - TORCH_CHECK(tv1_scope != tv2_scope); - - // tv1 and tv2 should have the same inner-most ForLoop - auto parent_scope = tv1_scope->owner()->scope(); - TORCH_CHECK(parent_scope == tv2_scope->owner()->scope()); - TORCH_CHECK(parent_scope->owner()->as()); - // There should be one more loop - parent_scope = parent_scope->owner()->scope(); - TORCH_CHECK(parent_scope->owner()->as()); - - // scope() should return nullptr for top-level exprs - auto top_level_scope = parent_scope->owner()->scope(); - TORCH_CHECK(top_level_scope == nullptr); -} - -TEST(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector shape{17, 19}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); 
- auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = add(tv1, tv2); - fusion.addOutput(tv3); - - tv3->split(1, 128); - tv0->computeAt(tv3, 2); - - for (auto tv : {tv2, tv3}) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({shape[0]}, options); - at::Tensor t1 = at::randn(shape, options); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto t3 = t0.unsqueeze(-1).expand(shape) + t1; - - testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - auto tv1 = makeContigTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - const int kTDX = 64; - const int kVecSize = 4; - const int kNumElems = kTDX * kVecSize; - - tv2->split(1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 457; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(4); - auto tv1 = makeContigTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->reorder({{0, 1}, {1, 0}}); - tv2->merge(-2); - - const int kTDX = 64; - const int kVecSize = 2; - const int kNumElems = kTDX * kVecSize; - - tv2->split(-1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - tv2->split(0, 128); - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::BIDy); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int n = 32; - const int c = 127; - const int h = 51; - const int w = 23; - at::Tensor t0 = at::randn({n, c, h, w}, options); - at::Tensor t1 = at::randn({n, c, h, w}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - 
testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int kNumDims = 4; - constexpr int kTDX = 64; - constexpr int kVecSize = 2; - constexpr int kNumElems = kTDX * kVecSize; - - auto tv0 = makeSymbolicTensor(kNumDims); - auto tv1 = makeSymbolicTensor(kNumDims); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - // Create caches for vectorization - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - // Merge all dimensions together except inner-most dim - for (const auto idx : c10::irange(kNumDims - 2)) { - tv2->merge(0); - } - // Split inner-most dim - tv2->split(-1, kNumElems); - tv2->split(-1, kVecSize); - TransformPropagator::from(tv2); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - // Parallelization Strategy - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int n = 5; - const int c = 3; - const int h = 51; - const int w = 257; - at::Tensor t0 = at::randn({n, c, h, w}, options); - at::Tensor t1 = at::randn({n, c, h, w}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int kNumDims = 4; - constexpr int kTDX = 64; - constexpr int kVecSize = 2; - constexpr int kNumElems = kTDX * kVecSize; - std::vector bcast_shape{1, 1, 1, -1}; - - auto tv0 = makeContigTensor(kNumDims); - auto tv1 = TensorViewBuilder().shape(bcast_shape).build(); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - // Create caches for vectorization - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - // Merge all dimensions together - // Backward merge order is necessary for vectorize validation - for (int idx = kNumDims - 1; idx > 0; --idx) { - tv2->merge(idx - 1); - } - tv2->split(-1, kNumElems); - tv2->split(-1, kVecSize); - TransformPropagator::from(tv2); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - // Parallelization Strategy - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int n = 32; - const int c = 128; - const int h = 51; - const int w = 23; - at::Tensor t0 = at::randn({n, c, h, w}, options); - at::Tensor t1 = at::randn({1, 1, 1, w}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - // TODO: throw assertion - cannot merge non-contiguous vectorization axes - // Make sure compilation fails - 
ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - auto tv1 = makeContigTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - - tv3->split(-1, 128 * 4); - tv3->split(-1, 4); - // Reduce outer dim first - auto tv4 = tv3->rFactor({-3, -1}); - // Tv3 will reduce threads - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv4, -2); - tv1->computeAt(tv4, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv4->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - - tv2->computeAt(tv4, -1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2050; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.add(t1).sum(1); - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - auto tv1 = makeContigTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - // Vectorize the wrong dimension - tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize); - } - - FusionExecutor fe; - // Make sure compilation fails - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - const int kTDX = 64; - const int kVecSize = 4; - const int kNumElems = kTDX * kVecSize; - - tv2->split(1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2049; - at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)}); - at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)}); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = 
fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - const int kTDX = 64; - const int kVecSize = 4; - const int kNumElems = kTDX * kVecSize; - - tv2->split(1, kNumElems); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - tv2->split(-1, kVecSize); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2049; - at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)}); - at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)}); - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // Failure because the input + output tensors do not have the same stride - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); -} - -TEST(NVFuserTest, FusionViewOutput_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 10, 4, 10}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - fusion.addOutput(x_view); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_add_bias = at_x + at_bias; - auto at_x_view = at::native::view(at_x_add_bias, output_shape); - - testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionViewFailMismatchSize_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // The number of elements in input and output shapes do not match, - // so this view transformation is invalid. - // 2 * 10 * 40 != 2 * 50 * 4 * 10 - - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 50, 4, 10}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); -} - -TEST(NVFuserTest, FusionViewFailMulitDimInference_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Only one dimension can be inferred in the output shape. - // Otherwise, the size of the dimensions is ambiguous. 
- std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, -1, 4, -1}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); -} - -TEST(NVFuserTest, FusionViewFailReduction_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - // View is only supported by the pointwise scheduler, - // so it should fail with any reduction operations - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 10, 2, 20}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - auto x_sum = sum(x_view, {-1}); - - fusion.addOutput(x_sum); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - ASSERT_ANY_THROW(fusion_executor_cache.runFusionWithInputs({at_x, at_bias})); -} - -TEST(NVFuserTest, FusionViewFailPersistent_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - // View is only supported by the pointwise scheduler, - // so it should fail with any persistent normalization operations - std::vector input_shape{2, 10, 40}; - std::vector output_shape{2, 10, 2, 20}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - auto x_softmax = softmax(x_view, -1); - - fusion.addOutput(x_softmax); - - const auto options = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - ASSERT_ANY_THROW(fusion_executor_cache.runFusionWithInputs({at_x, at_bias})); -} - -void addViewGeluFusion( - std::vector& input_shape, - std::vector& output_shape) { - for (auto hasImplicitBroadcast : {false, true}) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape) - : makeSymbolicTensor(input_shape.size()); - TensorView* bias = (hasImplicitBroadcast) - ? 
makeConcreteTensor(input_shape) - : makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_view = view(x_add_bias, input_shape, output_shape); - auto y = gelu(x_view); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_add_bias = at_x + at_bias; - auto at_x_view = at::native::view(at_x_add_bias, output_shape); - auto at_y = at::gelu(at_x_view); - - testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionViewSplit_CUDA) { - std::vector input_shape{80}; - std::vector output_shape{2, 4, 10}; - addViewGeluFusion(input_shape, output_shape); -} - -TEST(NVFuserTest, FusionViewBroadcast_CUDA) { - std::vector input_shape{80}; - std::vector output_shape{1, 80}; - addViewGeluFusion(input_shape, output_shape); -} - -TEST(NVFuserTest, FusionViewMerge_CUDA) { - std::vector input_shape{2, 40, 7}; - std::vector output_shape{560}; - addViewGeluFusion(input_shape, output_shape); -} - -TEST(NVFuserTest, FusionViewAllShmoo_CUDA) { - typedef std::vector shape; - typedef std::pair view_example; - - std::vector examples = { - {{1, 19, 1, 12, 7, 1, 99}, {1, 19, 1, 3, 2772}}, - {{3, 17, 80, 1}, {51, 1, 2, 4, 10}}, - {{3, 17, 80, 1, 9}, {51, 1, 2, 4, 10, 9}}, - {{2, 3, 4, 5}, {1, 6, 1, 2, 2, 5, 1}}, - {{22, 22, 2}, {22, 11, 1, 1, 4}}, - {{37, 9, 7, 6, 10}, {333, 2, 2, 3, 35}}, - {{1, 1, 333, 1}, {1, 1, 333, 1}}, - {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1, 8}}, - {{1, 333, 1}, {1, 37, 9, 1}}, - {{1, 333}, {1, 1, 1, 111, 1, 3}}, - {{22, 1, 22, 1}, {484}}, - {{1, 333, 1}, {333}}, - {{1, 27454, 1, 2}, {1, 7844, 1, 7}}, - {{1, 7844, 1, 7}, {1, 27454, 2}}}; - - for (auto e : examples) { - addViewGeluFusion(e.first, e.second); - } -} - -TEST(NVFuserTest, FusionViewInferShmoo_CUDA) { - typedef std::vector shape; - typedef std::pair view_example; - - std::vector examples = { - {{1, 19, 1, 12, 7, 1, 99}, {1, 19, -1, 3, 2772}}, - {{3, 17, 80, 1}, {51, 1, 2, 4, -1}}, - {{3, 17, 80, 1, 9}, {-1, 1, 2, 4, 10, 9}}, - {{2, 3, 4, 5}, {1, 6, 1, -1, 2, 5, 1}}, - {{22, 22, 2}, {22, -1, 1, 1, 4}}, - {{37, 9, 7, 6, 10}, {333, 2, -1, 3, 35}}, - {{1, 1, 333, 1}, {1, 1, -1, 1}}, - {{8, 1, 1, 8, 1, 8}, {8, 2, 4, 1, -1}}, - {{1, 333, 1}, {1, 37, -1, 1}}, - {{1, 333}, {1, 1, 1, -1, 1, 3}}, - {{22, 1, 22, 1}, {-1}}, - {{1, 333, 1}, {-1}}, - {{1, 27454, 1, 2}, {1, 7844, 1, -1}}, - {{1, 7844, 1, 7}, {1, -1, 2}}}; - - for (auto e : examples) { - addViewGeluFusion(e.first, e.second); - } -} - -void geluViewAddFusion( - std::vector input_shape, - std::vector output_shape) { - for (auto hasImplicitBroadcast : {false, true}) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape) - : makeSymbolicTensor(input_shape.size()); - TensorView* bias = (hasImplicitBroadcast) - ? 
makeConcreteTensor(output_shape) - : makeSymbolicTensor(output_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_gelu = gelu(x); - auto x_view = view(x_gelu, input_shape, output_shape); - auto y = add(x_view, bias); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(output_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_gelu = at::gelu(at_x); - auto at_x_view = at::native::view(at_x_gelu, output_shape); - auto at_y = at_x_view + at_bias; - - testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionViewStride_CUDA) { - typedef std::vector shape; - typedef std::pair view_example; - - std::vector examples = { - {{1, 27454, 2}, {1, 7844, 7}}, - {{1, 19, 1, 12, 7, 1, 99}, {1, 19, 1, 3, 2772}}, - {{1, 7844, 1, 7}, {1, 27454, 2}}}; - - for (auto e : examples) { - geluViewAddFusion(e.first, e.second); - } -} - -void geluViewBinaryAddFusion( - std::vector input_shape1, - std::vector input_shape2, - std::vector output_shape) { - for (auto hasImplicitBroadcast : {false, true}) { - Fusion fusion; - FusionGuard fg(&fusion); - - TensorView* x = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape1) - : makeSymbolicTensor(input_shape1.size()); - TensorView* bias = (hasImplicitBroadcast) - ? makeConcreteTensor(input_shape2) - : makeSymbolicTensor(input_shape2.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_gelu = gelu(x); - auto x_view = view(x_gelu, input_shape1, output_shape); - auto bias_view = view(bias, input_shape2, output_shape); - auto y = add(x_view, bias_view); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape1, options); - at::Tensor at_bias = at::randn(input_shape2, options); - std::vector aten_inputs = {at_x, at_bias}; - - auto lparams = schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs, lparams); - - auto at_x_gelu = at::gelu(at_x); - auto at_x_view = at::native::view(at_x_gelu, output_shape); - auto at_bias_view = at::native::view(at_bias, output_shape); - auto at_y = at_x_view + at_bias_view; - - testValidate(&fusion, outputs, aten_inputs, {at_y}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionViewBinary_CUDA) { - geluViewBinaryAddFusion({27454, 2}, {54908}, {7844, 7}); -} - -TEST(NVFuserTest, FusionVectorization1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - tv->axis(-1)->parallelize(ParallelType::Vectorize); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 
128; - const int by = 2048; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorization2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - // Vectorize the wrong dimension - tv->axis(-2)->parallelize(ParallelType::Vectorize); - } - - FusionExecutor fe; - // Make sure compilation fails - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionVectorization3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - tv2->split(1, 16); - tv2->split(1, 64); - - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(2)->parallelize(ParallelType::TIDx); - - auto c0 = tv0->cache_after(); - auto c1 = tv1->cache_after(); - auto c2 = tv2->cache_before(); - - c0->computeAt(tv2, -2); - c1->computeAt(tv2, -2); - - std::vector vectorized_tvs = {c0, c1, tv2}; - for (auto tv : vectorized_tvs) { - tv->split(-1, 4); - tv->axis(-1)->parallelize(ParallelType::Vectorize); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2049; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - std::vector aten_inputs = {t0, t1}; - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); - - aten_inputs[0] = t0.index({"...", Slice(1)}); - aten_inputs[1] = t1.index({"...", Slice(1)}); - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); - - t0 = at::randn({bx, 2048}, options).index({"...", Slice(4)}); - t1 = at::randn({bx, 2048}, options).index({"...", Slice(4)}); - aten_inputs = {t0, t1}; - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0 + t1; - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionVectorizationRFactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, tv1); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - tv3->split(-1, 128 * 4); - tv3->split(-1, 4); - // Reduce outer dim first - auto tv4 = tv3->rFactor({-3, -1}); - // Tv3 will reduce threads - - auto tv6 = tv0->cache_after(); - auto tv7 = tv1->cache_after(); - - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - - tv0->computeAt(tv4, -2); - tv1->computeAt(tv4, -2); - - 
tv6->axis(-1)->parallelize(ParallelType::Vectorize); - tv7->axis(-1)->parallelize(ParallelType::Vectorize); - - tv4->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - const int bx = 128; - const int by = 2048; - at::Tensor t0 = at::randn({bx, by}, options); - at::Tensor t1 = at::randn({bx, by}, options); - - std::vector aten_inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - - auto aten_output = t0.add(t1).sum(1); - testValidate( - &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - - auto t3 = t0.add(t1).sum(1); - - testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__); -} - -// Unswitched loops with extent one may omit else clause. -TEST(NVFuserTest, FusionSizeOneLoop1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Progressively broadcast tensors - TensorView* tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - TensorView* tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - TensorView* tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - - TensorView* tv3 = broadcast(tv0, {false, true}); - TensorView* tv4 = add(tv3, tv1); - TensorView* tv5 = add(tv4, tv2); - - fusion.addOutput(tv5); - - // Split inner dimension - tv5->split(1, 8); - // Merge middle dims with outer dimensions - tv5->merge(2); - tv5->merge(0); - - // tv5[I0*I1o, I1i*I2] - // Get a dim of size 1 to unswitch - tv5->split(0, 1, false); - - // Compute everything inline - tv0->computeAt(tv5, -1); - - tv5->axis(0)->parallelize(ParallelType::Unswitch); - tv5->axis(1)->parallelize(ParallelType::BIDx); - tv5->axis(2)->parallelize(ParallelType::TIDx); - - // Make sure the unswitched loop does not have an else clause. - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto fl = dynamic_cast(kir_node.get())) { - if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { - continue; - } - if (auto pred = dynamic_cast(fl->parentScope())) { - TORCH_CHECK(!pred->hasElse()); - } - } - } - - const int x = 11; - const int y = 12; - const int z = 13; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x}, options); - at::Tensor t1 = at::randn({x, y}, options); - at::Tensor t2 = at::randn({z, x, y}, options); - std::vector aten_inputs = {t0, t1, t2}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2; - - testValidate(&fusion, cg_outputs, aten_inputs, {t6}, __LINE__, __FILE__); -} - -// The unswitched loop has extent one but inner loops don't. The else -// part should not be omitted. -TEST(NVFuserTest, FusionSizeOneLoop2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int x = 15; - auto tv0 = makeConcreteTensor({x}); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - tv1->split(-1, 4); - tv1->split(-2, 1); - - tv1->axis(-2)->parallelize(ParallelType::Unswitch); - - // Make sure the size-one unswitched loop does not omit the else clause. 
- GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto fl = dynamic_cast(kir_node.get())) { - if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { - continue; - } - if (auto pred = dynamic_cast(fl->parentScope())) { - TORCH_CHECK(pred->hasElse()); - } - } - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({x}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion(aten_inputs); - auto t1 = t0 + 1; - - testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionValidateParallelize1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - - // Invalid as tv1 and tv2 do have the same ParallelType - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionValidateParallelize2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - tv1->setMemoryType(MemoryType::Shared); - - // tv1 and tv2 do have the same ParallelType, but tv1 is on shared - // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); -} - -TEST(NVFuserTest, FusionValidateParallelize3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->split(-1, 4); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Global); - - // tv1 and tv2 have the same shape and ParallelType - FusionExecutor fe; - fe.compileFusion(&fusion); -} - -TEST(NVFuserTest, FusionValidateParallelize4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->split(-1, 8); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Global); - - // tv1 and tv2 do not have the same shape - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); -} - -TEST(NVFuserTest, FusionValidateParallelize5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv1->split(-1, 4); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->setMemoryType(MemoryType::Shared); - - tv2->split(-1, 8); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - // tv1 and tv2 do not have the same shape, but tv1 is on shared - // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); -} - -// See issue #995 -TEST(NVFuserTest, FusionValidateParallelize6_CUDA) { - Fusion 
fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(4); - fusion.addInput(tv0); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - auto tv3 = broadcast(tv2, {true, false, false, false}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->merge(0); - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 1); - tv4->split(0, 1); - - TransformPropagator::from(tv4); - - tv0->computeAt(tv2, 2); - tv3->computeAt(tv4, 2); - - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - // Validation should throw an exception saying the first axes of tv2 - // and tv3 have incompatible parallelization. See also issue #995. - ASSERT_ANY_THROW(fusion.printKernel()); -} - -TEST(NVFuserTest, FusionDAGMerging_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(5); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv0); - fusion.addInput(tv1); - - // Branch 0 - auto tv2 = sum(tv0, {0}); // 0 - auto tv3 = sum(tv2, {0}); // 1 - auto tv4 = sum(tv3, {0}); // 2 - auto tv5 = sum(tv4, {0}); // 3 - - // Branch 1 - auto tv6 = add(tv1, new Double(1)); // 4 - - // Merge - auto tv7 = add(tv6, tv5); // 5 - - // Maximum expected output groups (can improve overtime): - // {0}, {1}, {2}, {3,4,5} - // without final merge would have been {0}, {1}, {2}, {3,4}, {5} - - fusion.addOutput(tv7); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2, 2, 2}, options); - at::Tensor t1 = at::randn({2}, options); - - auto fusion_segments = fusion.segment({t0, t1}); - TORCH_CHECK(fusion_segments->groups().size() <= 4); -} - -TEST(NVFuserTest, FusionDAGScalarMerging_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - auto i0 = new Double(); - - fusion->addInput(tv0); - fusion->addInput(i0); - - auto i1 = add(i0, new Double(1.0)); - auto i2 = mul(i1, i1); - auto i3 = add(i2, i1); - - // Branch 0 - auto tv1 = sum(tv0, {0}); // 0 - auto tv2 = add(tv1, i2); - // Branch 1 - auto tv3 = sum(tv2, {0}); // 1 - auto tv4 = add(tv3, i3); - - auto tv5 = add(tv4, i0); - - fusion->addOutput(tv5); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({16, 16, 16}, options); - double s0 = 0.5; - - auto s1 = s0 + 1.0; - auto s2 = s1 * s1; - auto s3 = s2 + s1; - auto t1 = t0.sum({0}); - auto t2 = t1 + s2; - auto t3 = sum(t2, {0}); - auto t4 = t3 + s3; - auto t5 = t4 + s0; - - auto outputs = executor_cache.runFusionWithInputs({t0, s0}); - - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime()->isSegmented(), - "segmentation didn't happen"); - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->groups() - .size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - constexpr int K = 20; - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = sum(tv0, {{1, 2}}); - fusion.addInput(tv0); - fusion.addOutput(tv1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - 
tv1->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N, K}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - at::Tensor aten_output = t0.sum({1, 2}); - testValidate( - &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 20; - constexpr int K = 20; - - auto tv0 = makeSymbolicTensor(3); - auto tvs = Welford(tv0, {{1, 2}}); - fusion.addInput(tv0); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - - tv_avg->axis(-1)->parallelize(ParallelType::TIDx); - tv_avg->axis(0)->parallelize(ParallelType::BIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M, N, K}, options); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - at::Tensor aten_avg = t0.mean({1, 2}); - at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; - testValidate( - &fusion, outputs, aten_inputs, {aten_avg, aten_M2}, __LINE__, __FILE__); -} - -// See Issue #716 -TEST(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - constexpr int M = 10; - constexpr int N = 11; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - std::vector reduction_axes = {1}; - std::vector broadcast_mask = {false, true}; - - auto tv0_bcast = broadcast(tv0, broadcast_mask); - auto path1_bcast = add(tv0_bcast, new Double(1.0)); - auto path1 = sum(path1_bcast, reduction_axes); - fusion.addOutput(path1); - - auto p = path1->split(1, 1); - path1->rFactor({1}); - path1->axis(0)->parallelize(ParallelType::BIDx); - tv0->computeAt(path1, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({M}, options); - at::Tensor t0_ref = t0.clone(); - std::vector aten_inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // inplace op, we are adding t0 to itself - auto outputs = fe.runFusion(aten_inputs, {t0}); - - TORCH_CHECK(outputs[0].allclose(t0_ref.add(1))); -} - -TEST(NVFuserTest, FusionReductionPredicate_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - auto tv2 = tv0->cache_after(); - - const int bdimx = 128; - tv1->split(1, bdimx); - tv1->split(1, 4); - tv1->split(1, 1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(2)->parallelize(ParallelType::Unroll); - tv1->split(0, 10); - tv0->computeAt(tv1, 4); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 650; - int numel_y = 102; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({numel_x, numel_y}, options); - at::Tensor cg_output = at::empty({numel_y}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}); - - auto aten_output = input.to(at::kDouble).sum({0}); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, 
FusionIssue728_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addOutput(tv0); - auto tv1 = makeSymbolicTensor(1); - fusion.addOutput(tv1); - auto tv2 = makeSymbolicTensor(1); - fusion.addOutput(tv2); - - auto tv3 = add(tv0, new Double(1)); - auto tv4 = add(tv3, tv1); - auto tv5 = add(tv4, new Double(1)); - auto tv6 = add(tv2, new Double(1)); - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - // tv0 -> tv3 -+ - // tv1 --------+-> tv4 -> tv5 - // - // tv2 -> tv6 - - auto all_vals_under_tv3 = - DependencyCheck::getAllValsBetween({tv3}, fusion.outputs()); - std::unordered_set included_tensors({tv3, tv4, tv5}); - for (auto tv : included_tensors) { - TORCH_CHECK( - std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) != - all_vals_under_tv3.end(), - "TV", - tv->name(), - " not found"); - } - for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (included_tensors.find(tv) == included_tensors.end()) { - TORCH_CHECK( - std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) == - all_vals_under_tv3.end(), - "TV", - tv->name(), - " should not be found"); - } - } - - auto no_dependency = DependencyCheck::getAllValsBetween({}, fusion.outputs()); - TORCH_CHECK(no_dependency.empty(), "No val should be returned"); - - auto no_dep_path = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv6}); - TORCH_CHECK(no_dep_path.empty(), "No val should be returned"); - - auto no_dep_path2 = DependencyCheck::getAllValsBetween({tv2}, {tv5}); - TORCH_CHECK(no_dep_path2.empty(), "No val should be returned"); - - auto just_tv3 = DependencyCheck::getAllValsBetween({tv3}, {tv3}); - TORCH_CHECK( - just_tv3.size() == 1 && *(just_tv3.begin()) == tv3, - "Only tv3 should be included"); -} - -TEST(NVFuserTest, FusionIssue757_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = makeSymbolicTensor(2); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv1->computeAt(tv4, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 650; - int numel_y = 102; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t3 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0.sum({1}); - auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y}); - auto t4 = t2 + t3; - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -// See issue #759 -TEST(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = makeSymbolicTensor(2); - fusion.addInput(tv3); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv4->split(0, 4); - tv1->computeAt(tv4, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::TIDy); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(1)->parallelize(ParallelType::TIDy); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - - int numel_x = 100; - int numel_y = 101; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - at::Tensor t3 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t3}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0.sum({1}); - auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y}); - auto t4 = t2 + t3; - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegmentVerticalMerge_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - // {first kernel} - auto tv1 = sum(tv0, {0}); - auto tv2 = add(tv1, tv0); - auto tv3 = sum(tv2, {0}); - auto tv4 = add(tv3, tv0); - auto tv5 = sum(tv4, {0}); - auto tv6 = sum(tv5, {0}); - // {second kernel} - auto tv7 = add(tv6, tv5); - auto tv8 = add(tv7, tv5); - auto tv9 = sum(tv8, {0}); - - fusion->addOutput(tv9); - - SegmentCandidateFinderOptions segment_options; - segment_options.run_herrmann_merge = false; - segment_options.run_final_merge = false; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2}, options); - - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion.get(), {t0}, segment_options); - - TORCH_CHECK(segmented_fusion->groups().size() == 2); -} - -TEST(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - auto i0 = new Double(); - - fusion->addInput(tv0); - fusion->addInput(i0); - - // Branch 0 {first kernel} - auto tv1 = sum(tv0, {0}); - auto tv2 = add(tv0, i0); - auto tv3 = unaryOp(UnaryOpType::Rsqrt, tv2); - auto tv4 = sum(tv3, {0}); - - // Branch 1 {first kernel} - auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv3); - auto tv6 = sum(tv5, {0}); - - // Incompatible {second kernel} - auto tv7 = sum(tv6, {0}); - - fusion->addOutput(tv1); - fusion->addOutput(tv4); - fusion->addOutput(tv7); - - SegmentCandidateFinderOptions segment_options; - segment_options.run_herrmann_merge = false; - segment_options.run_final_merge = false; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2}, options); - - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion.get(), {t0, 1.0}, segment_options); - - TORCH_CHECK(segmented_fusion->groups().size() == 2); -} - -TEST(NVFuserTest, FusionSegmentMixReduction_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - - // def of tv1 in kernel 1 through horizontal - auto tv1 = sum(tv0, {0, 1}); - // kernel 2 - auto tv2 = sum(tv0, {2}); - auto tv3 = broadcast(tv2, {false, false, true}); - auto tv4 = add(tv0, tv3); - auto tv5 = sum(tv4, {2}); - // end of kernel 2 - // kernel 1 - auto tv6 = unaryOp(UnaryOpType::Rsqrt, tv0); - auto tv7 = sum(tv6, {0, 1}); - auto tv8 = sum(tv6, {0, 1}); - - fusion->addOutput(tv1); - fusion->addOutput(tv5); - fusion->addOutput(tv7); - fusion->addOutput(tv8); - - SegmentCandidateFinderOptions segment_options; - segment_options.run_herrmann_merge = false; - segment_options.run_final_merge = false; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 2, 2}, options); - - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion.get(), {t0}, segment_options); - - 
TORCH_CHECK(segmented_fusion->groups().size() <= 2); -} - -TEST(NVFuserTest, FusionSBAR_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // N, H, W, C format - std::vector input_shape{656, 7, 7, 64}; - - auto x = makeContigTensor(4); - auto y = makeContigTensor(4); - auto weight = makeContigTensor(1); - auto bias = makeContigTensor(1); - - fusion.addInput(x); - fusion.addInput(y); - fusion.addInput(weight); - fusion.addInput(bias); - - const size_t kNumberOfDims = x->nDims(); - std::vector broadcast_mask(kNumberOfDims, false); - for (const auto axis : c10::irange(kNumberOfDims - 1)) { - broadcast_mask[axis] = true; - } - - auto weight_bcast = broadcast(weight, broadcast_mask); - auto scale = mul(x, weight_bcast); - auto bias_bcast = broadcast(bias, broadcast_mask); - auto scale_bias = add(scale, bias_bcast); - auto scale_bias_add = add(scale_bias, y); - auto scale_bias_add_relu = unaryOp(UnaryOpType::Relu, scale_bias_add); - - fusion.addOutput(scale_bias_add_relu); - - // inputs - at::manual_seed(0); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_y = at::randn(input_shape, options); - at::Tensor at_weight = at::ones({input_shape[3]}, options); - at::Tensor at_bias = at::zeros({input_shape[3]}, options); - - // inputs - std::vector inputs = {at_x, at_y, at_weight, at_bias}; - - // outputs - std::vector outputs; - - auto lparams = schedulePointwise(&fusion, c10::ArrayRef(inputs)); - - FusionExecutor executor; - executor.compileFusion(&fusion); - - outputs = executor.runFusion(c10::ArrayRef(inputs), lparams); - - auto at_scale = at::mul(at_x, at_weight); - auto at_scale_bias = at::add(at_scale, at_bias); - auto pwise_add = at::add(at_scale_bias, at_y); - auto output = at::relu(pwise_add); - - testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSingleElement_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(0); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(2.5)); - - auto tv2 = add(tv1, new Double(3.5)); - fusion.addOutput(tv2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({}, options); - - at::Tensor cg_output = at::empty({}, options); - - auto lparams = schedulePointwise(&fusion, {input}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input}, {cg_output}, lparams); - - auto aten_output = input.add(2.5).add(3.5); - - testValidate( - &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBNBackwardRepro_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - int batch = 4; - int c = 4; - int h = 4; - int w = 4; - int numDims = 4; - - auto input = makeSymbolicTensor(numDims); - fusion.addInput(input); - auto weight = makeSymbolicTensor(1); - fusion.addInput(weight); - auto running_mean = makeSymbolicTensor(1); - fusion.addInput(running_mean); - auto running_var = makeSymbolicTensor(1); - fusion.addInput(running_var); - auto save_mean = makeSymbolicTensor(1); - fusion.addInput(save_mean); - auto save_invstd = makeSymbolicTensor(1); - fusion.addInput(save_invstd); - - auto grad_out_prev = makeSymbolicTensor(numDims); - fusion.addInput(grad_out_prev); - auto gt_0 = - makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous. 
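// [editor note] Neither BNBackwardRepro test validates values, so as a hedged aside, this is
// the training-mode reference math the batch_norm_backward call below corresponds to, written
// with plain ATen ops for an NCHW input (illustrative names only, not the nvfuser API):
//   x_hat       = (x - save_mean) * save_invstd
//   grad_bias   = sum(grad_out)          over {N, H, W}
//   grad_weight = sum(grad_out * x_hat)  over {N, H, W}
//   grad_input  = weight * save_invstd / M * (M * grad_out - grad_bias - x_hat * grad_weight)
// with M = N * H * W and the per-channel terms broadcast back to 4-D.
auto bn_backward_reference = [](const at::Tensor& x,
                                const at::Tensor& grad_y,
                                const at::Tensor& w,
                                const at::Tensor& mean,
                                const at::Tensor& invstd) {
  const double m = static_cast<double>(x.numel() / x.size(1));
  auto mean4 = mean.view({1, -1, 1, 1});
  auto invstd4 = invstd.view({1, -1, 1, 1});
  auto x_hat = (x - mean4) * invstd4;
  auto grad_bias = grad_y.sum({0, 2, 3});
  auto grad_weight = (grad_y * x_hat).sum({0, 2, 3});
  auto grad_input = w.view({1, -1, 1, 1}) * invstd4 / m *
      (grad_y * m - grad_bias.view({1, -1, 1, 1}) -
       x_hat * grad_weight.view({1, -1, 1, 1}));
  return std::make_tuple(grad_input, grad_weight, grad_bias);
};
(void)bn_backward_reference; // reference only; the repro below just checks the fusion runs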
- fusion.addInput(gt_0); - - auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); - auto gt_float = castOp(DataType::Float, gt_bool); - - auto grad_out = mul(grad_out_prev, gt_float); - - Val* eps_ptr = new Double(1e-5); - - auto grads = batch_norm_backward( - input, - grad_out, - weight, - running_mean, - running_var, - save_mean, - save_invstd, - true, - eps_ptr, - {true, true, true}); - - fusion.addOutput(grads.grad_input); - fusion.addOutput(grads.grad_weight); - fusion.addOutput(grads.grad_bias); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({batch, c, h, w}, options); - at::Tensor input1 = at::randn({c}, options); - at::Tensor input2 = at::randn_like(input1); - at::Tensor input3 = at::randn_like(input1); - at::Tensor input4 = at::randn_like(input1); - at::Tensor input5 = at::randn_like(input1); - at::Tensor input6 = at::randn_like(input0); - at::Tensor input7 = at::randn_like(input0); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector inputs = { - input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); -} - -// TODO: We only changed inputs, merge this with the test above. -TEST(NVFuserTest, FusionBNBackwardRepro2_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - int batch = 2; - int c = 81; - int h = 1; - int w = 1; - int numDims = 4; - - // auto input = makeSymbolicTensor(numDims); - auto input = makeConcreteTensor({-1, -1, 1, 1}); - fusion.addInput(input); - auto weight = makeSymbolicTensor(1); - fusion.addInput(weight); - auto running_mean = makeSymbolicTensor(1); - fusion.addInput(running_mean); - auto running_var = makeSymbolicTensor(1); - fusion.addInput(running_var); - auto save_mean = makeSymbolicTensor(1); - fusion.addInput(save_mean); - auto save_invstd = makeSymbolicTensor(1); - fusion.addInput(save_invstd); - - // auto grad_out_prev = makeSymbolicTensor(numDims); - auto grad_out_prev = makeConcreteTensor({-1, -1, 1, 1}); - fusion.addInput(grad_out_prev); - // auto gt_0 = - // makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous. 
- auto gt_0 = makeConcreteTensor({-1, -1, 1, 1}); - fusion.addInput(gt_0); - - auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); - auto gt_float = castOp(DataType::Float, gt_bool); - - auto grad_out = mul(grad_out_prev, gt_float); - - Val* eps_ptr = new Double(1e-5); - - auto grads = batch_norm_backward( - input, - grad_out, - weight, - running_mean, - running_var, - save_mean, - save_invstd, - true, - eps_ptr, - {true, true, true}); - - fusion.addOutput(grads.grad_input); - fusion.addOutput(grads.grad_weight); - fusion.addOutput(grads.grad_bias); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({batch, c, h, w}, options); - at::Tensor input1 = at::randn({c}, options); - at::Tensor input2 = at::randn_like(input1); - at::Tensor input3 = at::randn_like(input1); - at::Tensor input4 = at::randn_like(input1); - at::Tensor input5 = at::randn_like(input1); - at::Tensor input6 = at::randn_like(input0); - at::Tensor input7 = at::randn_like(input0); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector inputs = { - input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); -} - -TEST(NVFuserTest, FusionBNRepro_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - const bool kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - int batch = 14; - int c = 65; - int h = 7; - int w = 7; - int numDims = 4; - - auto input = makeSymbolicTensor(numDims); - fusion.addInput(input); - auto weight = makeSymbolicTensor(1); - fusion.addInput(weight); - auto bias = makeSymbolicTensor(1); - fusion.addInput(bias); - auto running_mean = makeSymbolicTensor(1); - fusion.addInput(running_mean); - auto running_var = makeSymbolicTensor(1); - fusion.addInput(running_var); - - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); - - auto result = batch_norm( - input, - weight, - bias, - running_mean, - running_var, - kTraining, - momentum_ptr, - eps_ptr); - - fusion.addOutput(result.output); - fusion.addOutput(result.mean); - fusion.addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({batch, c, h, w}, options); - at::Tensor input2 = at::randn({c}, options); - at::Tensor input3 = at::randn_like(input2); - at::Tensor input4 = at::randn_like(input2); - at::Tensor input5 = at::randn_like(input2); - - auto input1_ref = input1.clone(); - auto input2_ref = input2.clone(); - auto input3_ref = input3.clone(); - auto input4_ref = input4.clone(); - auto input5_ref = input5.clone(); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector aten_inputs = {input1, input2, input3, input4, input5}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); - - auto at_results = at::native_batch_norm( - input1_ref, - input2_ref, - input3_ref, - input4_ref, - input5_ref, - kTraining, - kMomentum, - kEps); - - auto at_output = std::get<0>(at_results); - auto at_mean = std::get<1>(at_results); - auto at_invstd = std::get<2>(at_results); - - std::vector aten_outputs = { - input4_ref, input5_ref, at_output, at_mean, at_invstd}; - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBNRepro2_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - const bool 
kTraining = true; - const float kMomentum = 0.1; - const float kEps = 1e-5; - - int batch = 2; - int c = 4; - int h = 17; - int w = 17; - int numDims = 4; - - auto input = makeSymbolicTensor(numDims); - fusion.addInput(input); - - Val* momentum_ptr = new Double(kMomentum); - Val* eps_ptr = new Double(kEps); - - auto result = batch_norm( - input, - nullptr, - nullptr, - nullptr, - nullptr, - kTraining, - momentum_ptr, - eps_ptr); - - fusion.addOutput(result.output); - fusion.addOutput(result.mean); - fusion.addOutput(result.invstd); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({batch, c, h, w}, options); - - auto input1_ref = input1.clone(); - at::Tensor r_m; - at::Tensor r_v; - at::Tensor weight; - at::Tensor bias; - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector aten_inputs = {input1}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); - - auto at_results = at::native_batch_norm( - input1_ref, r_m, r_v, weight, bias, kTraining, kMomentum, kEps); - - auto at_output = std::get<0>(at_results); - auto at_mean = std::get<1>(at_results); - auto at_invstd = std::get<2>(at_results); - - std::vector aten_outputs = {at_output, at_mean, at_invstd}; - - testValidate( - &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = makeConcreteTensor({0}); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(2.5)); - fusion.addOutput(tv2); - - auto tv3 = makeConcreteTensor({0}); - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({2}, options); - at::Tensor input1 = at::randn({0}, options); - at::Tensor cg_output2 = at::empty({2}, options); - at::Tensor cg_output3 = at::empty({0}, options); - - auto lparams = schedulePointwise(&fusion, {input0, input1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - fe.runFusion({input0, input1}, {cg_output2, cg_output3}, lparams); - - auto aten_output2 = input0.add(2.5); - at::Tensor aten_output3 = at::empty({0}, options); - - testValidate( - &fusion, - {cg_output2, cg_output3}, - {input0, input1}, - {aten_output2, aten_output3}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = makeConcreteTensor({0}); - fusion.addInput(tv1); - - auto tv2 = sum(tv0, {1}); - fusion.addOutput(tv2); - - auto tv3 = makeConcreteTensor({0}); - fusion.addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({2, 4}, options); - at::Tensor input1 = at::randn({0}, options); - at::Tensor cg_output2 = at::empty({2}, options); - at::Tensor cg_output3 = at::empty({0}, options); - - auto reduction_params = getReductionHeuristics(&fusion, {input0, input1}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fusion, reduction_params.value()); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - - auto lparams = reduction_params.value().lparams; - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input0, input1}, lparams); - auto aten_output2 = input0.sum({1}); - at::Tensor aten_output3 = at::empty({0}, options); 
- - testValidate( - &fusion, - cg_outputs, - {input0, input1}, - {aten_output2, aten_output3}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = makeConcreteTensor({0}); - fusion.addInput(tv1); - - auto tv2 = sum(tv0, {0}); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv0, tv3); - fusion.addOutput(tv4); - - auto tv5 = makeConcreteTensor({0}); - fusion.addOutput(tv5); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor input0 = at::randn({2, 4}, options); - at::Tensor input1 = at::randn({0}, options); - at::Tensor cg_output2 = at::empty({2, 4}, options); - at::Tensor cg_output3 = at::empty({0}, options); - - auto reduction_params = getPersistentHeuristics(&fusion, {input0, input1}); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - schedulePersistentKernel(&fusion, reduction_params.value()); - - auto lparams = reduction_params.value().lparams; - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({input0, input1}, lparams); - auto aten_output2 = input0.sum({0}).add(input0); - at::Tensor aten_output3 = at::empty({0}, options); - - testValidate( - &fusion, - cg_outputs, - {input0, input1}, - {aten_output2, aten_output3}, - __LINE__, - __FILE__, - "", - lparams); -} - -TEST(NVFuserTest, FusionSegmentIoAlias_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = makeSymbolicTensor(1); - TensorView* tv2 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - fusion->addInput(tv2); - - TensorView* tv3 = add(tv0, new Double(1)); // Group 0 - TensorView* tv4 = - max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) - TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, - // keeps normalization scheduler away) - TensorView* tv6 = add(tv5, tv2); // Group 1 (Broadcast after reduce) - - fusion->addOutput(tv6); - // Note: test alias; - fusion->aliasOutputToInput(tv6, tv0); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, 65}, options); - at::Tensor t1 = at::randn({65}, options); - at::Tensor t2 = at::randn({128, 65}, options); - - auto t3 = t0.add(1.0); - auto t4 = std::get<0>(at::max(t3, 0)); - auto t5 = t4.add(t1); - auto t6 = t5.add(t2); - - FusionExecutorCache executor_cache(std::move(fusion)); - - auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2}); - - // validating aliasing - TORCH_INTERNAL_ASSERT(outputs[0].data_ptr() == t0.data_ptr()); - - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime()->isSegmented(), - "segmentation didn't happen"); - TORCH_CHECK( - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->groups() - .size() == 2, - "segmentation didn't happen as expected"); - - testValidate( - executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWelford1Output_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs = Welford(tv0, {1}); - fusion->addOutput(tvs.var_sum); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, 65}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - auto t1 = t0.var({1}, false) * 65; - testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTranslate1Welford_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs = Welford(tv0, {1}); - fusion->addOutput(tvs.var_sum); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - // Square sums does not fit well in the testValidate assumptions, - // so we just compare the divided output here. - outputs[0] /= inner_size; - auto t1 = t0.var({1}, false); - testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - // Run a translated welford - auto runtime1 = run_test(64); - // Check it was translated - TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 2); - TORCH_CHECK( - runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Persistent); - - // Run an un-translated welford - auto runtime2 = run_test(65536); - // Check it was not translated - TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 1); - TORCH_CHECK( - runtime2->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Reduction); -} - -TEST(NVFuserTest, FusionTranslate2Welford_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs1 = Welford(tv0, {1}); - auto tvs2 = Welford(tv0, {1}); - - fusion->addOutput(tvs1.var_sum); - fusion->addOutput(tvs2.var_sum); - - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - // Square sums does not fit well in the testValidate assumptions, - // so we just compare the divided output here. 
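// [editor note] Hedged standalone check (plain ATen, not the nvfuser API) of why dividing by
// inner_size is enough here: Welford's var_sum is the running sum of squared deviations,
//   var_sum = sum_i (x_i - mean)^2 = N * var(x, unbiased=false),
// so var_sum / N is directly comparable to t0.var({1}, false) used as the reference.
{
  auto sketch_opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto x = at::randn({4, 8}, sketch_opts);
  auto m2 = (x - x.mean({1}, true)).pow(2).sum({1});
  TORCH_CHECK(m2.div(8.0).allclose(x.var({1}, false)));
}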
- outputs[0] /= inner_size; - outputs[1] /= inner_size; - auto t1 = t0.var({1}, false); - testValidate(fusion, outputs, {t0}, {t1, t1}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - // Run a translated welford - auto runtime1 = run_test(64); - // Check it was translated - TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 4); - TORCH_CHECK( - runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Persistent); - - // Run an un-translated welford - auto runtime2 = run_test(65536); - // // Check it was not translated - TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 2); -} - -TEST(NVFuserTest, FusionLargeWelfordNormalization_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs1 = Welford(tv0, {1}); - auto sum_of_tv0 = sum(tv0, {1}); - - fusion->addOutput(tvs1.var_sum); - fusion->addOutput(sum_of_tv0); - - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - auto t1 = t0.var({1}, false) * inner_size; - auto t2 = t0.sum({1}); - testValidate(fusion, outputs, {t0}, {t1, t2}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - auto runtime = run_test(65536); - TORCH_CHECK(!runtime->isSegmented()); -} - -TEST(NVFuserTest, FusionWelfordOtherPersistence_CUDA) { - auto fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tvs1 = Welford(tv0, {1}); - auto sum_of_tv0 = sum(tv0, {1}); - auto sum_bcasted = broadcast(sum_of_tv0, {false, true}); - auto avg_bcasted = broadcast(tvs1.avg, {false, true}); - auto tv0_plus_sum = add(tv0, sum_bcasted); - auto tv0_plus_avg = add(tv0, avg_bcasted); - - fusion->addOutput(tv0_plus_sum); - fusion->addOutput(tv0_plus_avg); - - FusionExecutorCache executor_cache(std::move(fusion_ptr)); - - auto run_test = [&executor_cache, - fusion](auto inner_size) -> FusionKernelRuntime* { - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({128, inner_size}, options); - auto outputs = executor_cache.runFusionWithInputs({t0}); - - auto t1 = t0.mean({1}).unsqueeze(1) + t0; - auto t2 = t0.sum({1}).unsqueeze(1) + t0; - testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__); - - return executor_cache.getMostRecentKernelRuntime(); - }; - - for (auto inner_size : {4096, 8192, 32768}) { - auto runtime = run_test(4096); - TORCH_CHECK(!runtime->isSegmented()); - } -} - -TEST(NVFuserTest, FusionSegmentIslands_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = sum(tv0, {0}); - auto tv3 = sum(tv1, {1}); - fusion->addOutput(tv2); - fusion->addOutput(tv3); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({16, 16}, options); - at::Tensor t1 = at::randn({16, 16}, options); - - FusionExecutorCache fusion_executor_cache(std::move(fusion)); - 
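// [editor note] tv2 and tv3 above are disconnected "islands" (each depends on a different
// input), so the runtime is free to emit them as independent kernels; the test only checks
// that the executor cache copes with that. As a hedged aside, the ATen references one would
// compare against if values were validated here:
auto ref_island0 = t0.sum({0}); // corresponds to tv2
auto ref_island1 = t1.sum({1}); // corresponds to tv3
(void)ref_island0;
(void)ref_island1;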
fusion_executor_cache.runFusionWithInputs({t0, t1}); -} - -TEST(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - auto tv1 = makeSymbolicTensor(2); - auto tv2 = makeSymbolicTensor(4); - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv3 = broadcast(tv0, {false, true, true, true}); - auto tv4 = broadcast(tv1, {false, false, true, true}); - auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv2); - - auto tv6 = add(tv3, tv5); - auto tv7 = add(tv4, tv5); - auto tv8 = add(tv3, tv4); - - auto tv9 = add(tv6, tv7); - auto tv10 = add(tv9, tv8); - - fusion->addOutput(tv10); - - tv0->computeAt(tv10, -2); - tv1->computeAt(tv10, -2); - tv2->computeAt(tv10, -2); - - TORCH_CHECK(tv3->getComputeAtPosition() == 1); - TORCH_CHECK(tv4->getComputeAtPosition() == 2); - TORCH_CHECK(tv5->getComputeAtPosition() == 3); - - TORCH_CHECK(tv6->getMaxProducerPosition() == 3); - TORCH_CHECK(tv7->getMaxProducerPosition() == 3); - TORCH_CHECK(tv8->getMaxProducerPosition() == 2); -} - -TEST(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(3); - fusion->addInput(tv0); - fusion->addInput(tv1); - auto tv2 = broadcast(tv0, {false, false, true}); - auto tv3 = add(tv2, tv1); - - fusion->addOutput(tv3); - tv3->split(-2, 4); - tv3->reorder({{-1, -2}}); - tv0->computeAt(tv3, -2); - tv1->computeAt(tv3, -2); - TORCH_CHECK(tv2->getComputeAtPosition() == 2); - TORCH_CHECK(tv3->getMaxProducerPosition() == 2); -} - -TEST(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(4); - fusion->addInput(tv0); - fusion->addInput(tv1); - auto tv2 = broadcast(tv0, {false, false, true}); - auto tv3 = broadcast(tv2, {false, true, false, false}); - auto tv4 = add(tv3, tv1); - - fusion->addOutput(tv4); - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - TORCH_CHECK(tv2->getComputeAtPosition() == 2); - TORCH_CHECK(tv3->getMaxProducerPosition() == 3); -} - -TEST(NVFuserTest, FusionSimpleWarp_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - tv1->split(1, 32); - auto tv1_rf = tv1->rFactor({1}); - TransformPropagator::from(tv1_rf); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 128}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSimpleWarpPad_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - - 
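// [editor note] A hedged aside on the padToMultipleOfWarp(32) calls below: the input is
// 16x127, and 127 is not a multiple of the warp size, so the TIDx extent is rounded up to a
// whole number of 32-thread warps (presumably so the warp-level reduction path still applies,
// with the padded lanes predicated off). The round-up itself is just:
auto pad_to_multiple_of_warp = [](int64_t extent, int64_t warp_size) {
  return ((extent + warp_size - 1) / warp_size) * warp_size;
};
TORCH_CHECK(pad_to_multiple_of_warp(127, 32) == 128);
TORCH_CHECK(pad_to_multiple_of_warp(128, 32) == 128);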
fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->split(1, 8, false); - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1_rf->axis(-1)->padToMultipleOfWarp(32); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(32); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0->axis(-1)->padToMultipleOfWarp(32); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->padToMultipleOfWarp(32); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->padToMultipleOfWarp(32); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->padToMultipleOfWarp(32); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 127}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1, 2}); - auto tv2 = broadcast(tv1, {false, true, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->merge(1); - tv1->split(1, 8, false); - - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 17, 128}, options); - - auto at_output = input1.sum({1, 2}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSerialWarpReduction_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1, 2}); - auto tv2 = broadcast(tv1, {false, true, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->merge(1); - tv1->split(1, 8, false); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - TransformPropagator::from(tv1); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1, 
ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 17, 128}, options); - - auto at_output = input1.sum({1, 2}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTrivialWarpReduction_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeConcreteTensor({17, 18, 128, 1}); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1, 2, 3}); - auto tv2 = broadcast(tv1, {false, true, true, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->merge(1); - tv1->split(1, 8, false); - - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-2)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->parallelize(ParallelType::TIDx); - tv1->axis(-2)->padToMultipleOfWarp(); - TransformPropagator::from(tv1_rf); - tv0->axis(-2)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-2)->parallelize(ParallelType::TIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDx); - tv3->axis(-2)->parallelize(ParallelType::TIDx); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({17, 18, 128, 1}, options); - - auto at_output = input1.sum({1, 2, 3}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionMultipleDimBinding_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - auto tv_add = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv_add); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - auto tv4 = add(tv0, tv_add); - - fusion->addOutput(tv3); - fusion->addOutput(tv4); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->split(1, 8, false); - auto tv1_rf = tv1->rFactor({1}); - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1_rf->axis(-1)->padToMultipleOfWarp(32); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(32); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0->axis(-1)->padToMultipleOfWarp(32); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->padToMultipleOfWarp(32); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->padToMultipleOfWarp(32); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->padToMultipleOfWarp(32); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->padToMultipleOfWarp(64); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 128}, options); - at::Tensor input2 = at::randn({16, 128}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1, input2}); - testValidate( - 
fusion.get(), - outputs, - {input1, input2}, - {at_output, input1 + input2}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionPadNoWarpReduce_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->axis(0)->parallelize(ParallelType::TIDy); - tv2->axis(0)->parallelize(ParallelType::TIDy); - tv3->axis(0)->parallelize(ParallelType::TIDy); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 31}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - fusion->addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - fusion->addOutput(tv2); - - tv2->split(1, 8); - auto tv2_rf = tv2->rFactor({-1}); - tv2_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv2_rf->axis(-1)->padToMultipleOfWarp(); - - TransformPropagator::from(tv2_rf); - - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDy); - tv0->computeAt(tv2, 2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 31}, options); - - auto at_output = (input1 + 1).sum({1}); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - - fusion->addOutput(tv3); - - // Schedule a persistent kernel - auto tv0_cache = tv0->cache_after(); - tv1->split(1, 8, false); - tv1->split(0, 4); - auto tv1_rf = tv1->rFactor({2}); - - tv1_rf->axis(0)->parallelize(ParallelType::BIDx); - tv1_rf->axis(1)->parallelize(ParallelType::Unroll); - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->padToMultipleOfWarp(); - tv1->axis(1)->parallelize(ParallelType::Unroll); - TransformPropagator::from(tv1_rf); - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv0->axis(1)->parallelize(ParallelType::Unroll); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::Unroll); - - tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined); - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({16, 128}, options); - - auto at_output = input1.sum({1}, true).add(input1); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegfaultReduction_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - int batch = 2; - int c = 1; - int h = 1; - int w = 1; - int numDims = 4; - - auto input = makeConcreteTensor({-1, 1, 1, 1}); - fusion.addInput(input); - auto bcast_bias = makeConcreteTensor({-1, 1, 1, 1}); - fusion.addInput(bcast_bias); - - std::vector at_sum_axes; - std::vector outer_reduction_axes; - std::vector outer_broadcast_mask(numDims, false); - Val* N = new Double(1); - for (const auto axis : c10::irange(numDims)) { - if (axis != 1) { - outer_reduction_axes.push_back(axis); - at_sum_axes.push_back(axis); - outer_broadcast_mask[axis] = true; - N = mul(N, input->domain()->domain()[axis]->extent()); - } - } - - auto output0 = mul(input, bcast_bias); - fusion.addOutput(output0); - auto output1 = sum(output0, outer_reduction_axes); - fusion.addOutput(output1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({batch, c, h, w}, options); - at::Tensor input1 = at::randn({batch, c, h, w}, options); - - auto at_output0 = input0.mul(input1); - auto at_output1 = at_output0.sum(at_sum_axes); - - FusionExecutorCache fec(std::move(fusion_ptr)); - std::vector inputs = {input0, input1}; - auto outputs = fec.runFusionWithInputs(inputs); - - testValidate( - &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPredicateElimination_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = add(tv2, new Double(3)); - - fusion.addOutput(tv3); - - tv3->split(0, 32); - tv0->computeAt(tv3, 1); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - - { - GpuLower gpulw(&fusion); - TORCH_CHECK(!isPredicated(tv2, gpulw)); - } - - tv2->axis(1)->parallelize(ParallelType::Serial); - tv2->split(1, 5); - - { - GpuLower gpulw(&fusion); - TORCH_CHECK(isPredicated(tv2, gpulw)); - } -} - -TEST(NVFuserTest, FusionForceFp16Simple_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv2 = sum(tv0, {1}); - auto tv3 = broadcast(tv2, {false, true}); - - // Group 2 - auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast - auto tv5 = castOp(DataType::Half, tv4); - - fusion->addOutput(tv5); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{15, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - // Check the segmented edge is fp16 - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - TORCH_CHECK(edge_tv->getDataType() == DataType::Half); - } -} - -TEST(NVFuserTest, 
FusionForceBf16Simple_CUDA) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv2 = sum(tv0, {1}); - auto tv3 = broadcast(tv2, {false, true}); - - // Group 2 - auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast - auto tv5 = castOp(DataType::BFloat16, tv4); - - fusion->addOutput(tv5); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{15, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - // Check the segmented edge is bf16 - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16); - } - } else { - GTEST_SKIP(); - } -#else - GTEST_SKIP(); -#endif -} - -TEST(NVFuserTest, FusionForceFp16NotAllCast_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv3 = sum(tv0, {1}); - auto tv4 = broadcast(tv3, {false, true, false}); - auto tv5 = sum(tv0, {1}); - - // Group 2 - auto tv6 = add(tv4, tv1); // edge tv4, expect cast - auto tv7 = castOp(DataType::Half, tv6); - - // Group 3 - auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast - - fusion->addOutput(tv7); - fusion->addOutput(tv8); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{16, 16, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - auto complete_fusion = segmented_fusion->completeFusion(); - - // Check that the edge that wasn't fp16 is the producer of the - // reduction op, i.e. tv8 = sum(tv5,{1});. 
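// [editor note] The loop below appears to have lost its template arguments in this hunk; as a
// hedged reconstruction (kept in a comment for readability only), the check presumably reads
// roughly:
//
//   for (auto edge : segmented_fusion->edges()) {
//     auto edge_tv = edge->val->as<TensorView>();
//     if (edge_tv->getDataType() == DataType::Float) {
//       auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
//       TORCH_CHECK(consumer->isA<ReductionOp>());
//     }
//   }
//
// i.e. any segment edge left in fp32 must feed a reduction, presumably so the reduction
// accumulates from a full-precision intermediate, while pointwise-consumed edges are cast to
// Half to save bandwidth.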
- for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - if (edge_tv->getDataType() == DataType::Float) { - auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin()); - TORCH_CHECK(consumer->isA()); - } - } -} - -TEST(NVFuserTest, FusionForceBf16NotAllCast_CUDA) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(3); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - // Group 1 - auto tv3 = sum(tv0, {1}); - auto tv4 = broadcast(tv3, {false, true, false}); - auto tv5 = sum(tv0, {1}); - - // Group 2 - auto tv6 = add(tv4, tv1); // edge tv4, expect cast - auto tv7 = castOp(DataType::BFloat16, tv6); - - // Group 3 - auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast - - fusion->addOutput(tv7); - fusion->addOutput(tv8); - - FusionExecutorCache fec(std::move(fusion_ptr)); - - std::vector shape{16, 16, 16}; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); - - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - auto complete_fusion = segmented_fusion->completeFusion(); - - // Check that the edge that wasn't fp16 is the producer of the - // reduction op, i.e. tv8 = sum(tv5,{1});. - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - if (edge_tv->getDataType() == DataType::Float) { - auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin()); - TORCH_CHECK(consumer->isA()); - } - } - } else { - GTEST_SKIP(); - } -#else - GTEST_SKIP(); -#endif -} - -TEST(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = broadcast(tv2, {false, false, true}); - auto tv4 = add(tv3, tv1); - auto tv5 = mul(tv4, new Double(3)); - fusion->addOutput(tv5); - - // t4 cannot inner re-use t2, because there's a broadcast - // between them. 
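// [editor note] A hedged aside on the reuse rule these BufferReuse* tests probe: an
// intermediate's allocation can be recycled only when the two live intervals do not overlap
// and no broadcast changes the buffer's footprint between the producer and the reuse
// candidate (hence "t4 cannot inner re-use t2" above). A standalone sketch of the interval
// part (illustrative names only):
struct SketchLiveInterval {
  int first_def;
  int last_use;
};
auto can_share_allocation = [](const SketchLiveInterval& a, const SketchLiveInterval& b) {
  // Disjoint live ranges -> the underlying buffer could be aliased.
  return a.last_use < b.first_def || b.last_use < a.first_def;
};
TORCH_CHECK(can_share_allocation({0, 2}, {3, 5}));
TORCH_CHECK(!can_share_allocation({0, 4}, {3, 5}));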
- tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort); - tv3->computeAt(tv5, 2, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto in1 = at::randn({2, 2, 2}, options); - - auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3; - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0, in1}); - - testValidate(fusion, outputs, {in0, in1}, {at_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseStressTest_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = mul(tv0, new Double(3)); - auto tv4 = mul(tv2, tv3); - // Broadcast buffer can be reused through outer sharing - auto tv5 = broadcast(tv4, {true, false, false}); - auto tv6 = mul(tv5, new Double(5)); - auto tv7 = mul(tv6, tv1); - auto tv8 = mul(tv7, new Double(7)); - // tv9 shouldn't alias to avoid buffer over-subscription - auto tv9 = broadcast(tv4, {true, false, false}); - auto tv10 = mul(tv9, new Double(9)); - auto tv11 = add(tv5, tv9); - fusion->addOutput(tv7); - fusion->addOutput(tv11); - - tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort); - tv0->computeAt(tv9, 1, ComputeAtMode::BestEffort); - - tv5->computeAt(tv7, 1, ComputeAtMode::BestEffort); - tv5->computeAt(tv11, 1, ComputeAtMode::BestEffort); - tv9->computeAt(tv11, 1, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto in1 = at::randn({2, 2, 2}, options); - auto t2 = in0 * 2; - auto t3 = in0 * 3; - auto t4 = t2 * t3; - auto t5 = t4.unsqueeze(0); - auto t6 = t5 * 5; - auto t7 = t6 * in1; - auto t8 = t7 * 7; - auto t9 = t4.unsqueeze(0); - auto t10 = t9 * 9; - auto t11 = t5 + t9; - FusionExecutor fe; - fe.compileFusion(fusion); - - auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3; - auto outputs = fe.runFusion({in0, in1}); - - testValidate(fusion, outputs, {in0, in1}, {t7, t11}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({256, 512}); - - fusion->addInput(tv0); - - auto tv1 = mul(tv0, new Double(2)); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); - auto tv4 = mul(tv3, new Double(2)); - auto tv5 = mul(tv4, new Double(2)); - auto tv6 = mul(tv5, new Double(2)); - - fusion->addOutput(tv6); - - tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort); - tv6->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({256, 512}, options); - - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0}); - - auto at_out = in0.mul(2).mul(2).mul(2).mul(2).mul(2).mul(2); - - testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - 
auto tv3 = broadcast(tv2, {false, false, true}); - auto tv4 = add(tv3, tv1); // T4 to be inner aliased first, and - // shouldn't outer alias on top - auto tv5 = mul(tv4, new Double(3)); - auto tv6 = mul(tv5, new Double(3)); - fusion->addOutput(tv6); - - tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort); - tv4->computeAt(tv6, 2, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0, in1}); - - auto at_out = (in0.mul(2.0).unsqueeze(2) + in1).mul(3.0).mul(3.0); - - testValidate(fusion, outputs, {in0, in1}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({3, 3, 3}); - - fusion->addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); - - fusion->addOutput(tv3); - - // In this case tv1 "reuses" allocation of tv2 - // due to the switched allocation order - tv1->computeAt(tv2, 1, ComputeAtMode::BestEffort); - - tv0->axis(0)->parallelize(ParallelType::TIDx); - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({3, 3, 3}, options); - - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0}); - - auto at_out = in0.sum(1).mul(2).mul(2); - - testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({16, 16}); - - fusion->addInput(tv0); - - auto tv1 = mul(tv0, new Double(3)); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); - // tv1 used till here, cannot be reused by tv2 or tv3 - auto tv4 = mul(tv3, tv1); - - fusion->addOutput(tv4); - - tv0->computeAt(tv4, 1); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({16, 16}, options); - - FusionExecutor fe; - fe.compileFusion(fusion); - auto cg_outputs = fe.runFusion({in0}); - - auto at_t0 = in0 * 3.0; - auto at_out = at_t0 * 2.0 * 2.0 * at_t0; - - testValidate(fusion, cg_outputs, {in0}, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); - - auto tv0 = makeConcreteTensor({2, 2}); - auto tv1 = makeConcreteTensor({2, 2, 2}); - - fusion->addInput(tv0); - fusion->addInput(tv1); - - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = mul(tv0, new Double(3)); - auto tv4 = mul(tv2, tv3); - auto tv5 = broadcast(tv4, {false, false, true}); - auto tv6 = mul(tv5, tv1); - auto tv7 = mul(tv6, new Double(7)); - fusion->addOutput(tv7); - - // tv6 shouldn't re-use t2 or t3 because of - // the broadcast in between - tv0->computeAt(tv4, 1, ComputeAtMode::BestEffort); - tv4->computeAt(tv7, 2, ComputeAtMode::BestEffort); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn({2, 2}, options); - auto 
in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion); - auto outputs = fe.runFusion({in0, in1}); - - auto t2 = in0 * 2; - auto t3 = in0 * 3; - auto t4 = t2 * t3; - auto t5 = t4.unsqueeze(2); - auto t6 = t5 * in1; - auto t7 = t6 * 7; - testValidate(fusion, outputs, {in0, in1}, {t7}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue970_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int nelm = 10; - - // tv3 = tv0 + sum(tv0) - auto tv0 = makeConcreteTensor({nelm, nelm}); - fusion.addInput(tv0); - auto tv1 = sum(tv0, {1}); - auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv2, tv0); - fusion.addOutput(tv3); - - tv1->split(1, 4); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({nelm, nelm}, options); - - auto outputs = fe.runFusion({t0}); - - auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0; - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Reproducer of #1016 -TEST(NVFuserTest, FusionIssue1016_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - - fusion.addOutput(tv2); - - tv1->setMemoryType(MemoryType::Shared); - - tv2->split(-1, 8); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 10; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0 + 1 + 2; - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Reproducer of #1021 -TEST(NVFuserTest, FusionIssue1021_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = broadcast(tv1, {false, true}); - fusion.addOutput(tv2); - - auto tv3 = tv2->cache_before(); - - tv2->split(0, 2); - - tv1->computeAt(tv2, 1); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(1)->parallelize(ParallelType::Vectorize); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = (t0 + 1).unsqueeze(-1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Reproducer of issue #1053 -TEST(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - auto tv1 = sum(tv0, {0}); - fusion->addOutput(tv1); - - auto tv2 = add(tv0, new Double(1)); - fusion->addOutput(tv2); - - tv1->split(0, 8); - auto tv1_rf = tv1->rFactor({-1}); - - tv1_rf->computeAt(tv1, 1); - - tv1_rf->axis(-1)->parallelize(ParallelType::TIDx); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({32}, options); - - auto at_tv1 = (input1).sum({0}); - auto at_tv2 = input1 + 1; - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = 
fe.runFusion({input1}); - testValidate( - fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionParallelDimensionMap1_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); - fusion->addOutput(tv1); - fusion->addOutput(tv2); - - tv1->split(0, 8, false); - tv1->axis(1)->parallelize(ParallelType::TIDx); - tv2->split(0, 8, false); - tv2->axis(1)->parallelize(ParallelType::TIDx); - - // The extents of tv1 and tv2 axes are equal even though their - // actual values are not statically known - GpuLower gpulw(fusion.get()); - const auto& pdmap = gpulw.parallelDimensionMap(); - auto kir_tv1 = gpulw.lowerValue(tv1)->as(); - auto kir_tv2 = gpulw.lowerValue(tv2)->as(); - for (const auto i : c10::irange(kir_tv1->domain()->domain().size())) { - auto dom1 = kir_tv1->domain()->domain()[i]; - auto dom2 = kir_tv2->domain()->domain()[i]; - TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent())); - } - - TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({32}, options); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - - testValidate( - fusion.get(), - outputs, - {input1}, - {input1 + 1, input1 + 1}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionParallelDimensionMap2_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion->addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = add(tv1, tv2); - fusion->addOutput(tv3); - - tv3->split(-1, 8, false); - tv2->computeAt(tv3, -1); - - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - GpuLower gpulw(fusion.get()); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({11}, options); - at::Tensor input2 = at::randn({11, 13}, options); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1, input2}); - - auto ref = input1.unsqueeze(-1) + input2; - - testValidate( - fusion.get(), outputs, {input1, input2}, {ref}, __LINE__, __FILE__); -} - -// Mix symbolic and concrete tensors -TEST(NVFuserTest, FusionParallelDimensionMap3_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); - - auto tv0 = makeSymbolicTensor(1); - fusion->addInput(tv0); - - auto tv2 = add(tv0, new Double(1)); - fusion->addOutput(tv2); - auto tv3 = add(tv0, new Double(1)); - fusion->addOutput(tv3); - - tv2->split(0, 10); - tv3->split(0, 20); - - auto tv4 = add(tv0, new Double(1)); - fusion->addOutput(tv4); - auto tv5 = add(tv0, new Double(1)); - fusion->addOutput(tv5); - - // Not mapped but equal extent - tv4->split(0, 10); - tv5->split(0, 10); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - 
tv4->axis(-1)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - - GpuLower gpulw(fusion.get()); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDy)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDy)->isConst() && - pdmap.get(ParallelType::TIDy)->as()->value().value() == 10); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({13}, options); - - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({input1}); - - testValidate( - fusion.get(), - outputs, - {input1}, - {input1 + 1, input1 + 1, input1 + 1, input1 + 1}, - __LINE__, - __FILE__); -} - -// Parallelizing merged broadcast domains -TEST(NVFuserTest, FusionParallelDimensionMap4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->split(1, 4); - tv4->reorder({{1, 2}, {2, 1}}); - tv4->merge(0); - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); - - // TIDx is mapped to tv4.axis(0) as well as tv2.axis(0), so it's not - // exact. - tv4->axis(0)->parallelize(ParallelType::TIDx); - - tv2->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - GpuLower gpulw(&fusion); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({13}, options); - at::Tensor input2 = at::randn({15, 13}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - auto ref = (input1 + 1).unsqueeze(0) + input2; - - testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionParallelDimensionMap5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv3 = broadcast(tv0, {false, true}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->split(1, 4); - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::TIDy); - tv3->axis(-2)->parallelize(ParallelType::TIDy); - - GpuLower gpulw(&fusion); - const auto& pdmap = gpulw.parallelDimensionMap(); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); - TORCH_CHECK(pdmap.isExact(ParallelType::TIDy)); - TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isConst() && - pdmap.get(ParallelType::TIDx)->as()->value().value() == 4); - TORCH_CHECK( - pdmap.get(ParallelType::TIDy)->isA() && - pdmap.get(ParallelType::TIDy)->as()->name() == - "blockDim.y"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input1 = at::randn({13}, options); - at::Tensor input2 = at::randn({13, 15}, options); - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input1, input2}); - - auto ref = (input1).unsqueeze(-1) + input2; - - testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) { - auto fusion_ptr = std::make_unique(); - auto& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - auto t0 = makeSymbolicTensor(3, DataType::Float); - auto t1 = makeSymbolicTensor(3, DataType::Half); - auto t3 = makeSymbolicTensor(3, DataType::Half); - auto t5 = makeSymbolicTensor(3, DataType::Half); - auto t7 = makeSymbolicTensor(1, DataType::Half); - auto t11 = makeSymbolicTensor(3, DataType::Half); - auto t13 = makeSymbolicTensor(3, DataType::Half); - auto t15 = makeSymbolicTensor(3, DataType::Half); - auto t17 = makeSymbolicTensor(3, DataType::Half); - auto d56 = new Double(); - - fusion.addInput(t0); - fusion.addInput(t1); - fusion.addInput(t3); - fusion.addInput(t5); - fusion.addInput(t7); - fusion.addInput(t11); - fusion.addInput(t13); - fusion.addInput(t15); - fusion.addInput(t17); - fusion.addInput(d56); - - auto t2 = castOp(DataType::Float, t1); - auto t4 = castOp(DataType::Float, t3); - auto t22 = sub(t2, t4); - auto t6 = castOp(DataType::Float, t5); - auto t23 = mul(t22, t6); - auto t16 = castOp(DataType::Float, t15); - auto t18 = castOp(DataType::Float, t17); - auto t19 = add(t16, t18); - auto t14 = castOp(DataType::Float, t13); - auto t20 = add(t19, t14); - auto t12 = castOp(DataType::Float, t11); - auto t21 = add(t20, t12); - auto t8 = castOp(DataType::Float, t7); - auto t24 = broadcast(t8, {true, true, false}); - auto t25 = mul(t21, t24); - auto t27 = sum(t25, {2}); - auto t28 = broadcast(t27, {false, false, true}); - auto t29 = mul(t25, t23); - auto t30 = sum(t29, {2}); - auto t31 = broadcast(t30, {false, false, true}); - auto d59 = mul(t1->getRootDomain()[2]->extent(), new Double(1)); - auto t26 = mul(d59, t25); - auto txx = mul(t26, new Double(1)); - auto t33 = sub(txx, t28); - auto d70 = unaryOp(UnaryOpType::Reciprocal, d59); - auto t35 = mul(d70, t6); - auto t39 = sum(t21, {0, 1}); - auto t47 = castOp(DataType::Half, t39); - auto t37 = mul(t21, t23); - auto t38 = sum(t37, {0, 1}); - auto t46 = castOp(DataType::Half, t38); - auto t32 = mul(t23, t31); - auto t34 = sub(t33, t32); - auto t36 = mul(t35, t34); - auto t45 = castOp(DataType::Half, t36); - auto t40 = mul(t36, t0); - auto t41 = mul(t40, d56); - auto t44 = castOp(DataType::Half, t41); - auto t42 = sum(t41, {0, 1}); - auto t43 = castOp(DataType::Half, t42); - - fusion.addOutput(t43); - fusion.addOutput(t44); - fusion.addOutput(t45); - fusion.addOutput(t46); - fusion.addOutput(t47); - - auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - auto options_float = - at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float); - at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t7 = at::randn({1024}, options_half); - at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half); - at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half); - double at_d56 = 1.1111; - - std::vector aten_inputs = { - at_t0, - at_t1, - at_t3, - at_t5, - at_t7, - at_t11, - at_t13, - at_t15, - 
at_t17, - at_d56}; - for (auto _ : c10::irange(5)) { - auto segmented_fusion = - SegmentCandidateFinder::segment(fusion_ptr.get(), aten_inputs); - } -} - -TEST(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - auto tv3 = add(tv0, new Double(1)); - auto tv4 = add(tv3, new Double(1)); - fusion.addOutput(tv4); - - auto tv5 = add(tv0, new Double(1)); - auto tv6 = add(tv5, new Double(1)); - fusion.addOutput(tv6); - - // Case 1: local memory tensor computed serially and used by - // parallel threads - tv2->split(-1, 4); - tv1->computeAt(tv2, -2); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - // Case 2: shared memory tensor computed serially and used by BID - tv4->split(-1, 4); - tv3->computeAt(tv4, -2); - tv4->axis(-1)->parallelize(ParallelType::BIDx); - tv3->setMemoryType(MemoryType::Shared); - - // Case 3: shared memory tensor computed by TID and used by BID - tv6->split(-1, 4); - tv5->computeAt(tv6, -2); - tv6->axis(-1)->parallelize(ParallelType::BIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - tv5->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int nx = 11; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({nx}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = t0 + 2; - - testValidate( - &fusion, outputs, aten_inputs, {ref, ref, ref}, __LINE__, __FILE__); -} - -// Repro of issue #1105 -TEST(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - // Make sure a WAR sync is inserted at the end of the outer loop - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) { - if (auto loop = dynamic_cast(kir_node)) { - const auto& body = loop->body().exprs(); - TORCH_CHECK(!body.empty()); - auto last_expr = dynamic_cast(body.back()); - TORCH_CHECK(last_expr != nullptr, "Invalid expr found"); - TORCH_CHECK(last_expr->isWarHazardSync(), "Not a sync for WAR hazard"); - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 3; - - testValidate(&fusion, outputs, aten_inputs, {ref1}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1099_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - auto tv3 = makeSymbolicTensor(1); - fusion.addInput(tv3); - - // Just to make TIDx/y/z non-exact - auto tv4 = add(tv3, new Double(1)); - auto tv5 = add(tv4, new Double(1)); - auto tv6 = 
add(tv5, new Double(1)); - fusion.addOutput(tv6); - - tv2->split(0, 4); - tv0->computeAt(tv2, 1); - - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv1->axis(-1)->parallelize(ParallelType::TIDy); - tv2->axis(-1)->parallelize(ParallelType::TIDz); - tv2->axis(0)->parallelize(ParallelType::BIDx); - - tv1->setMemoryType(MemoryType::Shared); - - tv4->split(0, 5); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->setMemoryType(MemoryType::Shared); - tv5->split(0, 6); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - tv5->setMemoryType(MemoryType::Shared); - tv6->split(0, 7); - tv6->axis(-1)->parallelize(ParallelType::TIDz); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t3 = at::randn({19}, options); - std::vector aten_inputs = {t0, t3}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref_t2 = t0 + 2; - auto ref_t3 = t3 + 3; - - testValidate( - &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__); -} - -// Repro of issue #1080 -TEST(NVFuserTest, FusionUnswitchPredicate_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv2->split(0, 4); - tv0->computeAt(tv2, 2); - - tv2->split(-1, 8); - tv1->split(-1, 8); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-2)->parallelize(ParallelType::TIDy); - - // swap TIDx and TIDy - tv1->axis(-1)->parallelize(ParallelType::TIDy); - tv1->axis(-2)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int nx = 4; - const int ny = 10; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({nx, ny}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = t0 + 2; - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1189_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeConcreteTensor({16, 16}); - auto tv1 = makeConcreteTensor({16, 16}); - - auto tv0b = broadcast(tv0, {false, false, true}); - auto tv1b = broadcast(tv1, {false, false, true}); - - fusion.addInput(tv0b); - fusion.addInput(tv1b); - - auto tv2 = add(tv0b, tv1b); - auto tv3 = sum(tv2, {1}); - fusion.addOutput(tv3); - - auto parallelize = [](auto tv) { - tv->axis(0)->parallelize(ParallelType::TIDx); - tv->axis(1)->parallelize(ParallelType::BIDx); - tv->axis(2)->parallelize(ParallelType::BIDy); - }; - - parallelize(tv0b); - parallelize(tv1b); - parallelize(tv2); - parallelize(tv3); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({16, 16, 1}, options); - at::Tensor t1 = at::randn({16, 16, 1}, options); - auto outputs = fe.runFusion({t0, t1}); - - auto ref = (t0 + t1).sum({1}); - - testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1052_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv1); - - auto tv2 
= add(tv0, new Double(1)); - fusion.addOutput(tv2); - - auto tv3 = add(tv1, new Double(1)); - fusion.addOutput(tv3); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(tv2, {tv0}); - scheduler_utils::parallelizeAllLike(tv3, {tv1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10}, options); - at::Tensor t1 = at::randn({100}, options); - std::vector aten_inputs = {t0, t1}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref_t2 = t0 + 1; - auto ref_t3 = t1 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__); -} - -// Repro of issue #1115 -TEST(NVFuserTest, FusionPointwiseBroadcast_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - std::vector input_shape{3, 17, 80}; - std::vector output_shape{3, 17, 1, 80}; - - TensorView* x = makeSymbolicTensor(input_shape.size()); - TensorView* bias = makeSymbolicTensor(input_shape.size()); - fusion.addInput(x); - fusion.addInput(bias); - - auto x_add_bias = add(x, bias); - auto x_bcast = broadcast(x_add_bias, {false, false, true, false}); - auto y = unaryOp(UnaryOpType::Gelu, x_bcast); - fusion.addOutput(y); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - at::Tensor at_bias = at::randn(input_shape, options); - std::vector aten_inputs = {at_x, at_bias}; - - schedulePointwise(&fusion, aten_inputs); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - - auto at_x_add_bias = at_x + at_bias; - auto at_x_view = at::native::view(at_x_add_bias, output_shape); - auto aten_y = at::gelu(at_x_view); - - testValidate(&fusion, outputs, aten_inputs, {aten_y}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionSmemAliasSerial_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - // Just set the dimension of TIDx - auto tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv5->axis(0)->parallelize(ParallelType::TIDx); - - // tv1 and tv2 are on shared memory and are not parallelized with - // TIDx. They should be predicated as they are redundant and can - // interfere with smem aliasing (issue #1100). 
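  // Expected behavior exercised here: blockDim.x is sized by tv5, so without
  // a predicate every thread in the block would redundantly perform the
  // serial tv1/tv2 writes to the same shared-memory addresses, which could
  // race with a later reuse (alias) of those smem buffers. The lowering
  // should restrict the redundant writes to a single thread.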
- - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10}, options); - - at::Tensor t4 = at::randn({1024}, options); - std::vector aten_inputs = {t0, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 3; - auto ref2 = t4 + 1; - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - auto tv3 = sum(tv2, {0}); - fusion.addOutput(tv3); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t2 = at::randn({19}, options); - std::vector aten_inputs = {t0, t2}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 1; - auto ref2 = sum(t2); - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - auto tv2 = makeSymbolicTensor(1); - fusion.addInput(tv2); - auto tv3 = Welford(tv2, {0}).avg; - fusion.addOutput(tv3); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t2 = at::randn({19}, options); - std::vector aten_inputs = {t0, t2}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 1; - auto ref2 = mean(t2, {0}); - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0, 1}); - fusion.addOutput(tv1); - - auto tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - auto tv4 = makeSymbolicTensor(3); - fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDy); - tv3->axis(2)->parallelize(ParallelType::TIDz); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(2)->parallelize(ParallelType::BIDz); - - // TODO: This needs a fix for issue #1102. - // Also, need to allow predicated grid reductions. 
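  // The block below is kept but compiled out: it is the intended end-to-end
  // check (compile, run, validate against ATen) and can be re-enabled once
  // issue #1102 is fixed and predicated grid reductions are supported.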
-#if 0 - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 3}, options); - at::Tensor t2 = at::randn({5, 6, 7}, options); - at::Tensor t4 = at::randn({8, 9, 10}, options); - std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0.sum(at::IntArrayRef{0, 1}); - auto ref2 = t2 + 1; - auto ref3 = t4 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__); -#endif -} - -TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tvs = Welford(tv0, {0, 1}); - fusion.addOutput(tvs.avg); - - auto tv2 = makeSymbolicTensor(3); - fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - auto tv4 = makeSymbolicTensor(3); - fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - tvs.avg->axis(0)->parallelize(ParallelType::BIDx); - tvs.avg->axis(1)->parallelize(ParallelType::TIDx); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - tv3->axis(1)->parallelize(ParallelType::TIDy); - tv3->axis(2)->parallelize(ParallelType::TIDz); - - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(2)->parallelize(ParallelType::BIDz); - - // TODO: needs a fix for issue #1102 - // Also, need to allow predicated grid reductions. -#if 0 - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({2, 3}, options); - at::Tensor t2 = at::randn({5, 6, 7}, options); - at::Tensor t4 = at::randn({8, 9, 10}, options); - std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0.mean(at::IntArrayRef{0, 1}); - auto ref2 = t2 + 1; - auto ref3 = t4 + 1; - - testValidate( - &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__); -#endif -} - -// Repro of issue #1102 -TEST(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - // Just to make TIDx/y/z non-exact - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - auto tv4 = makeSymbolicTensor(1); - fusion.addInput(tv4); - - auto tv5 = add(tv4, new Double(1)); - auto tv6 = add(tv5, new Double(1)); - auto tv7 = add(tv6, new Double(1)); - auto tv8 = add(tv7, new Double(1)); - auto tv9 = sum(tv8, {0}); - fusion.addOutput(tv9); - - tv1->split(0, 5); - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv1->setMemoryType(MemoryType::Shared); - tv2->split(0, 6); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - tv2->setMemoryType(MemoryType::Shared); - tv3->split(0, 7); - tv3->axis(-1)->parallelize(ParallelType::TIDz); - - tv9->split(0, 4); - tv4->computeAt(tv9, 1); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDz); - tv7->axis(-1)->parallelize(ParallelType::TIDz); - tv8->axis(-1)->parallelize(ParallelType::TIDz); - tv9->axis(-1)->parallelize(ParallelType::TIDz); - tv9->axis(0)->parallelize(ParallelType::BIDx); - - tv5->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t4 = at::randn({19}, options); - std::vector aten_inputs = {t0, t4}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 3; - auto ref2 = sum(t4 + 4); - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -// Repro of #1102 and #1129 -TEST(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(1); - fusion.addInput(tv1); - - auto tv2 = add(tv0, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - auto tv4 = add(tv3, new Double(1)); - auto tv5 = add(tv4, new Double(1)); - fusion.addOutput(tv5); - - // Just to make TIDx/y/z non-exact - auto tvx = add(tv1, new Double(1)); - auto tvy = add(tvx, new Double(1)); - auto tvz = add(tvy, new Double(1)); - fusion.addOutput(tvz); - - tv5->split(0, 4); - tv0->computeAt(tv5, 1); - - tv0->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - tv3->axis(-1)->parallelize(ParallelType::TIDz); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-1)->parallelize(ParallelType::TIDy); - tv5->axis(0)->parallelize(ParallelType::Unswitch); - - tvx->split(0, 5); - tvx->axis(-1)->parallelize(ParallelType::TIDx); - tvy->split(0, 6); - tvy->axis(-1)->parallelize(ParallelType::TIDy); - tvz->split(0, 7); - tvz->axis(-1)->parallelize(ParallelType::TIDz); - - for (auto tv : {tv2, tv3, tv4, tvx, tvy}) { - tv->setMemoryType(MemoryType::Shared); - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({17}, options); - at::Tensor t1 = at::randn({19}, options); - std::vector aten_inputs = {t0, t1}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref1 = t0 + 4; - auto ref2 = t1 + 3; - - testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); -} - -// Repro of issue #1136 -TEST(NVFuserTest, FusionFloatPow_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = binaryOp(BinaryOpType::Pow, tv0, new Int(4)); - // To check if pow(tv0, 2) is replaced with tv0 * tv0 - auto tv2 = binaryOp(BinaryOpType::Pow, tv0, new Int(2)); - // To check if pow(tv0, 2.0) is replaced with tv0 * tv0 - auto tv3 = binaryOp(BinaryOpType::Pow, tv0, new Double(2)); - auto tv4 = binaryOp(BinaryOpType::Pow, tv0, new Int(3)); - auto tv5 = binaryOp(BinaryOpType::Pow, tv0, new Double(3)); - auto s = binaryOp(BinaryOpType::Pow, new Double(3), new Double(3)); - auto tv6 = add(tv0, s); - - fusion.addOutput(tv1); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - fusion.addOutput(tv4); - fusion.addOutput(tv5); - fusion.addOutput(tv6); - - tv1->split(0, 32); - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - TransformPropagator::from(tv1); - scheduler_utils::parallelizeAllLike(tv1, {tv2, tv3, tv4, tv5, tv6}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({1000}, options); - // Negative inputs cause nan in Fuesr as use_fast_math is enabled - t0 = abs(t0); - std::vector aten_inputs = {t0}; - auto 
outputs = fe.runFusion(aten_inputs); - - auto p4 = at::pow(t0, 4); - auto p2 = at::pow(t0, 2); - auto p3 = at::pow(t0, 3); - auto t6 = t0 + std::pow(3, 3); - - testValidate( - &fusion, - outputs, - aten_inputs, - {p4, p2, p2, p3, p3, t6}, - __LINE__, - __FILE__); -} - -TEST(NVFuserTest, FusionIssue1127_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int numel = 4; - - auto tv0 = makeConcreteTensor({numel}); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - auto tv2 = broadcast(tv1, {true}); - - auto tv3 = makeConcreteTensor({numel, numel}); - fusion.addInput(tv3); - - auto tv4 = sum(tv3, {1}); - - auto tv5 = add(tv2, tv4); - fusion.addOutput(tv5); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - tv2->axis(0)->parallelize(ParallelType::TIDx); - tv4->axis(1)->parallelize(ParallelType::TIDx); - tv5->axis(0)->parallelize(ParallelType::TIDx); - - // Lowering should fail since tv5 is predicated and paralellized with TIDx. - ASSERT_ANY_THROW(fusion.printKernel()); -} - -TEST(NVFuserTest, FusionChannelsLastParser_CUDA) { - // This test may not pass if using a custom block sync as there may - // be additional calls. Skip the test as it's not specifically - // relevant with block synchronizatin. - if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { - return; - } - auto g = std::make_shared(); - const auto graph0_string = R"IR( - graph(%0 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]), - %1 : Half(8, 4, 10, 16, strides=[640, 160, 16, 1])): - %o.1 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::mul(%0, %1) # sum_dyn.py:5:6 - %3 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::relu(%o.1) # sum_dyn.py:6:9 - return (%3))IR"; - parseIR(graph0_string, g.get()); - - // strides are not yet supported in the irparser. - { - auto val = g->block()->inputs()[0]; - val->setType(val->type()->castRaw()->withSizesStrides( - {8, 4, 10, 16}, {640, 1, 64, 4})); - } - - { - auto val = g->block()->inputs()[1]; - val->setType(val->type()->castRaw()->withSizesStrides( - {8, 4, 10, 16}, {640, 160, 16, 1})); - } - - for (auto node : g->block()->nodes()) { - for (auto val : node->outputs()) { - if (val->isCompleteTensor()) - val->setType(val->type()->castRaw()->withSizesStrides( - {8, 4, 10, 16}, {640, 1, 64, 4})); - } - } - - auto fusion = parseJitIR(g); - FusionGuard fg(fusion.get()); - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor input0 = - at::randn({2, 2, 2, 16}, options).clone(c10::MemoryFormat::ChannelsLast); - at::Tensor input1 = at::randn({2, 2, 2, 16}, options); - auto lparams = schedulePointwise(fusion.get(), {input0, input1}); - - // CONSIDER: - // 1. this can be moved to a dedicated "golden" file - // 2. 
use a fuzzy compare (ignore non-significant whitespaces for example) - const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, Tensor<__half, 4> T7) { - if ((((((((((nvfuser_index_t)blockIdx.x) * 1) + 0) * 1) + 0) * 128) + ((nvfuser_index_t)threadIdx.x)) < (T0.size[0] * (T0.size[1] * (T0.size[2] * T0.size[3]))))) { - constexpr nvfuser_index_t ki674 = 0; - __half T9[1]; - constexpr nvfuser_index_t ki716 = 0; - T9[ki716] = 0; - constexpr nvfuser_index_t ki707 = 0; - T9[ki707] - = T2[((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * (((1 * T0.size[2]) * T0.size[1]) * T0.size[3])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * ((1 * T0.size[2]) * T0.size[1])) + (((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * (1 * T0.size[2])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3]) * 1)]; - __half T8[1]; - constexpr nvfuser_index_t ki722 = 0; - T8[ki722] = 0; - constexpr nvfuser_index_t ki702 = 0; - T8[ki702] - = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki702) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; - __half T10[1]; - constexpr nvfuser_index_t ki683 = 0; - float T3[1]; - T3[0] - = __half2float(T9[ki683]); - float T4[1]; - T4[0] - = T3[0]; - float T1[1]; - T1[0] - = __half2float(T8[ki683]); - float T5[1]; - T5[0] - = T1[0] - * T4[0]; - float T6[1]; - T6[0] - = relu(T5[0]); - T10[ki683] - = __float2half(T6[0]); - constexpr nvfuser_index_t ki676 = 0; - T7[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki676) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] - = T10[ki676]; - } -} -)"; - - const std::string actual_kernel = - "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); - - if (expected_kernel.size() != actual_kernel.size() || - expected_kernel.compare(actual_kernel) != 0) { - std::cerr - << " Codegen mismatch, codegen possibly changed, or is incorrect. " - << " \n ========= EXPECTED ========= \n" - << expected_kernel << "\n========= ACTUAL ========== \n" - << actual_kernel << "\n=================" << std::endl; - auto it = std::mismatch( - expected_kernel.begin(), - expected_kernel.end(), - actual_kernel.begin(), - actual_kernel.end()); - std::string actual_mismatched_snippet(it.second, actual_kernel.end()); - actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10); - std::string expected_mismatched_snippet(it.first, expected_kernel.end()); - expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10); - std::cerr << "First mismatch found at: " << actual_mismatched_snippet - << ", expected: " << expected_mismatched_snippet << std::endl; - TORCH_CHECK(false); - } - - // TODO: runFusion hits assertion. I'm probably doing something wrong here. 
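  // Note on the check above: it is a byte-for-byte comparison against the
  // expected kernel string, so any change to codegen (index-variable naming
  // such as the kiNNN counters, whitespace, or the channels-last indexing of
  // T2) will trip it. On mismatch, a 10-character snippet around the first
  // differing position is printed to help locate the change.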
- // FusionExecutor fe; - // fe.compileFusion(fusion.get()); - // auto outputs = fe.runFusion({input0, input1}, lparams); - // at::Tensor output_ref = (input0 * input1).relu(); - // TORCH_CHECK(output_ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeConcreteTensor({10, 1024}); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->computeAt(tv3, -1); - tv3->axis(0)->parallelize(ParallelType::Unswitch); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({10, 1024}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = sum(t0, {1}) + 2; - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonContigOutputs_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - fusion.addOutput(tv1); - - tv1->setContiguity(false); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_input = at::randn({10}, options); - at::Tensor at_output = at::empty_strided({10}, {2}, options); - auto returned_outputs = fe.runFusion({at_input}, {at_output}); - - // Returned outputs should only contain one tensor that is the same - // as the output tensor given to runFusion - TORCH_CHECK(returned_outputs.size() == 1); - TORCH_CHECK(returned_outputs[0].is_same(at_output)); - TORCH_CHECK(!returned_outputs[0].is_contiguous()); - - auto at_ref = at_input + 1; - - testValidate(&fusion, {at_output}, {at_input}, {at_ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionTestWarpSoftMax_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Setup softmax fusion - auto input = makeContigTensor(2); - fusion.addInput(input); - auto output = softmax(input, 1); - fusion.addOutput(output); - - // Setup runtime input - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_input = at::randn({8, 16 * 197}, options); - std::vector aten_inputs({aten_input}); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, aten_inputs, true); - TORCH_CHECK(SchedulerEntry::canSchedule( - ScheduleHeuristic::Persistent, &fusion, runtime_info)); - auto scheduler = SchedulerEntry::makeEntry( - ScheduleHeuristic::Persistent, &fusion, runtime_info); - scheduler->schedule(&fusion); - - // Modify the schedule to use warp reduction - auto used_vals = fusion.usedMathVals(); - for (auto tv : ir_utils::filterByType(used_vals)) { - for (IterDomain* id : tv->domain()->domain()) { - if (id->getParallelType() == ParallelType::TIDx) { - id->padToMultipleOfWarp(); - } - } - } - - // Test result - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(aten_inputs); - auto ref_output = at::_softmax(aten_input, 1, false); - testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1133_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - 
auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - tv0->computeAt(tv3, 1); - - const int split_factor = 32; - - tv2->split(-1, split_factor); - tv1->computeAt(tv2, -2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv3->axis(0)->parallelize(ParallelType::Unswitch); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - // Both tv1 and tv2 should be allocated at the top-level scope - GpuLower gpulw(&fusion); - bool tv1_validated = false; - bool tv2_validated = false; - for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) { - if (auto alloc = dynamic_cast(kir_node)) { - auto size = alloc->size(); - if (!(alloc->buffer()->name() == 1 || alloc->buffer()->name() == 2)) { - // There should be no allocation other than those for tv1 and tv2 - TORCH_CHECK(false, "Invalid allocation detected"); - } - TORCH_CHECK(size->isA(), "Invalid allocation size"); - TORCH_CHECK(size->as()->isConst(), "Allocation not constant"); - auto size_int = size->as()->value().value(); - if (alloc->buffer()->name() == 1) { - TORCH_CHECK( - size_int == split_factor, - "Invalid allocation size: ", - size->as()->value().value()); - tv1_validated = true; - } else { - TORCH_CHECK( - size_int == 1, - "Invalid allocation size: ", - size->as()->value().value()); - tv2_validated = true; - } - } - } - - TORCH_CHECK(tv1_validated, "Failed to validate tv1 allocation"); - TORCH_CHECK(tv2_validated, "Failed to validate tv2 allocation"); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({99, 101}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = (t0 + 1).sum({1}) + 1; - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionRfactorContigIDs_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {1}); - fusion.addOutput(tv1); - - tv1->split(1, 32); - - auto tv2 = tv1->rFactor({1}); - - // This merged domain is not contiguous. 
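  // As I read the schedule below: merge(0, 2) combines tv2's outer iteration
  // domain with only the inner (extent 32) piece of the split reduction root
  // domain, skipping the rfactored reduction axis in between. Consecutive
  // values of the merged index therefore do not map to consecutive addresses
  // of tv2's allocation, so indexing must not treat it as one contiguous ID.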
- tv2->merge(0, 2); - - tv2->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({99, 101}, options); - std::vector aten_inputs = {t0}; - auto outputs = fe.runFusion(aten_inputs); - - auto ref = t0.sum({1}); - - testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = set(tv0); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = set(tv1); - auto tv5 = add(tv3, tv4); - fusion.addOutput(tv5); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 1); - TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1); - TORCH_INTERNAL_ASSERT(projectable.size() == 1); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5)); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float)); - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float)); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = sum(tv1, {1}); - auto tv3 = broadcast(tv2, {false, true}); - auto tv4 = set(tv1); - auto tv5 = add(tv3, tv4); - auto tv6 = castOp(DataType::Half, tv5); - fusion.addOutput(tv6); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), 
buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 1); - TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1); - TORCH_INTERNAL_ASSERT(projectable.size() == 1); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1)); - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5)); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float)); - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Half)); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = set(tv1); - auto tv3 = sum(tv2, {1}); - auto tv4 = broadcast(tv3, {false, true}); - - auto tv5 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv5); - - auto tv6 = castOp(DataType::Float, tv5); - - auto tv7 = add(tv6, tv4); - auto tv8 = set(tv1); - auto tv9 = add(tv7, tv8); - auto tv10 = sum(tv9, {1}); - auto tv11 = broadcast(tv10, {false, true}); - auto tv12 = set(tv7); - auto tv13 = add(tv12, tv11); - - fusion.addOutput(tv13); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 2); - TORCH_INTERNAL_ASSERT( - resolution.size() == 2 && resolution[0].size() == 1 && - resolution[1].size() == 1); - TORCH_INTERNAL_ASSERT(projectable.size() == 1); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT( - isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv7)); - TORCH_INTERNAL_ASSERT( - isTvWithinVec(projectable, tv1) && !isTvWithinVec(projectable, tv7)); - - 
TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv9)); - - auto tv7_resolution_it = tvEntryInVecVec(resolution, buffers, tv7); - TORCH_INTERNAL_ASSERT(tv7_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv7_resolution_it, tv13)); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - at::Tensor aten_t5 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0, aten_t5}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float) * 2); - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * - (dataTypeSize(DataType::Half) + dataTypeSize(DataType::Float))); -} - -TEST(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = set(tv1); - auto tv3 = sum(tv2, {1}); - auto tv4 = broadcast(tv3, {false, true}); - auto tv5 = set(tv1); - auto tv6 = add(tv4, tv5); - auto tv7 = set(tv2); - auto tv8 = add(tv7, tv6); - auto tv9 = castOp(DataType::Half, tv8); - - fusion.addOutput(tv9); - - auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion); - - auto isTvWithinVec = [](std::vector& vec, TensorView* tv) { - return std::find(vec.begin(), vec.end(), tv) != vec.end(); - }; - - auto tvEntryInVecVec = [](std::vector>& vec_o_vec, - std::vector& buffer_vec, - TensorView* tv) { - auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv); - return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it); - }; - - auto& buffers = persistent_buffer_info.persistent_buffers; - auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points; - auto& projectable = persistent_buffer_info.projectable_persistent_buffers; - auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs; - - TORCH_INTERNAL_ASSERT(buffers.size() == 2); - TORCH_INTERNAL_ASSERT( - resolution.size() == 2 && resolution[0].size() == 1 && - resolution[1].size() == 1); - - TORCH_INTERNAL_ASSERT(projectable.size() == 2); - TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1); - - TORCH_INTERNAL_ASSERT( - isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv2)); - TORCH_INTERNAL_ASSERT( - isTvWithinVec(projectable, tv1) && isTvWithinVec(projectable, tv2)); - - TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0)); - - auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1); - TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv6)); - - auto tv2_resolution_it = tvEntryInVecVec(resolution, buffers, tv2); - TORCH_INTERNAL_ASSERT(tv2_resolution_it != resolution.end()) - TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv2_resolution_it, tv8)); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - // Schedule through magic scheduler - auto 
runtime_info = SchedulerRuntimeInfo(&fusion, {aten_t0}, true); - auto persistent_buffer_size = - persistentBufferSize(&fusion, runtime_info, persistent_buffer_info); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Float) * 2); - - TORCH_INTERNAL_ASSERT( - persistent_buffer_size.projected_persistent_buffer_size == - aten_t0.size(1) * dataTypeSize(DataType::Half)); -} - -TEST(NVFuserTest, PersistentBufferProjection_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2, DataType::Half); - fusion.addInput(tv0); - - auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = set(tv1); - auto tv3 = sum(tv2, {1}); - auto tv4 = broadcast(tv3, {false, true}); - auto tv5 = set(tv1); - auto tv6 = add(tv4, tv5); - auto tv7 = set(tv2); - auto tv8 = add(tv7, tv6); - auto tv9 = castOp(DataType::Half, tv8); - - fusion.addOutput(tv9); - - reduction_scheduler_utils::projectPersistentBuffers(&fusion); - - auto tv5_producers = ir_utils::producerTvsOf(tv5); - auto tv7_producers = ir_utils::producerTvsOf(tv7); - - // Projection should have broken these dependencies - - TORCH_INTERNAL_ASSERT( - std::find(tv5_producers.begin(), tv5_producers.end(), tv1) == - tv5_producers.end()); - TORCH_INTERNAL_ASSERT( - std::find(tv7_producers.begin(), tv7_producers.end(), tv2) == - tv7_producers.end()); - - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - at::Tensor aten_t0 = at::randn({99, 101}, options); - - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({aten_t0}); - - auto aten_t1 = aten_t0.to(c10::kDouble); - auto aten_t3 = aten_t1.sum({1}); - auto aten_t4 = aten_t3.unsqueeze(1); - auto aten_t7 = aten_t4.add(aten_t1).add(aten_t1); - - testValidate(&fusion, cg_outputs, {aten_t0}, {aten_t7}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionIssue1223_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0, 1}); - fusion.addOutput(tv2); - - auto tv3 = add(tv0, new Double(0)); - fusion.addOutput(tv3); - - tv2->split(0, 4); - tv2->split(1, 1, false); - tv2->split(-1, 4); - - tv2->axis(1)->parallelize(ParallelType::Unswitch); - tv2->axis(-3)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDy); - - tv1->computeAt(tv2, -1); - - // Make TIDx and TIDy non-exact - tv3->split(0, 32); - tv3->split(-1, 32); - tv3->axis(1)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::TIDy); - - // The second axis of both tv1 and tv2 are fully unswitched, so they - // don't need to predicate the parallel type usage of TIDy, whereas - // the first axis is only partially unswitched, i.e., part of its - // split output domains is outside the unswitched axis, so the first - // axis, which uses TIDx, needs to predicate the parallel - // dimension. Previously, as reported in issue #1223, unswitched - // expressions didn't predicate parallel dimensions. It should be - // fixed by PR #1222. 
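  // Concretely for this schedule (a rough sketch, not taken from the test):
  // tv1/tv2 map only 4 threads onto TIDx while tv3 launches 32, so the
  // lowered code is expected to guard those expressions with something like
  // `threadIdx.x < 4`; the TIDy usage is fully covered by the unswitched
  // loop, so no separate thread predicate should be needed there.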
- - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_t0 = at::ones({11, 10}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({at_t0}); - - auto at_t1 = (at_t0 + 1).sum(); - - testValidate( - &fusion, cg_outputs, {at_t0}, {at_t1, at_t0}, __LINE__, __FILE__); -} - -// See #1247 and #1250 -TEST(NVFuserTest, FusionRfactorPredication1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = min(tv1, {0}); - - fusion.addOutput(tv2); - - // Make TIDx non-exact - auto tv3 = makeContigTensor(1); - fusion.addInput(tv3); - - auto tv4 = add(tv3, new Double(1)); - fusion.addOutput(tv4); - - tv2->split(0, 4); - auto tv5 = tv2->rFactor({1}); - - tv0->computeAt(tv2, 1); - - tv2->axis(0)->parallelize(ParallelType::TIDx); - - tv4->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_t0 = at::randn({9}, options); - at_t0 = at::abs(at_t0); - at::Tensor at_t3 = at::randn({128}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); - - auto at_t2 = (at_t0 + 1).min(); - auto at_t4 = at_t3 + 1; - - testValidate( - &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionRfactorPredication2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = min(tv0, {0}); - fusion.addOutput(tv1); - - // Make TIDx non-exact - auto tv2 = makeContigTensor(1); - fusion.addInput(tv2); - - auto tv3 = add(tv2, new Double(1)); - fusion.addOutput(tv3); - - tv1->split(0, 4); - auto tv4 = tv1->rFactor({0}); - - tv1->split(0, 3); - - // tv0->computeAt(tv1, 3); - tv4->reorder({{0, 1}}); - tv4->split(0, 3); - tv4->setMemoryType(MemoryType::Shared); - - // tv0: [I] - // tv4: [4/3, 3, I/4] - // tv1: [4/3, 3] - - tv1->axis(0)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv1, {tv4}); - - tv3->axis(0)->parallelize(ParallelType::TIDx); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_t0 = at::randn({9}, options); - at_t0 = at::abs(at_t0); - at::Tensor at_t3 = at::randn({128}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); - - auto at_t2 = std::get<0>(at_t0.min(0)); - auto at_t4 = at_t3 + 1; - - testValidate( - &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = sum(tv0, {0}); - fusion.addOutput(tv1); - - // [I] - tv1->split(0, 5); - // [ceilDiv(I, 5), 5] - - // This second split is non-divisible. The split domain must be predicated. 
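  // Worked out: the inner domain has extent 5, and splitting it by 3 gives
  // ceilDiv(5, 3) = 2 outer iterations covering 2 * 3 = 6 positions, one more
  // than actually exists, so that extra position must be masked by a
  // predicate.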
- tv1->split(1, 3); - // [ceilDiv(I, 5), 2, 3] - - auto tv2 = sum(tv0, {0}); - fusion.addOutput(tv2); - - // tv2 shouldn't need to have another predicate - tv2->split(0, 4); - tv2->split(1, 2); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, - "Only tv1 should have a non-divisible predicate."); - for (auto tv : {tv1}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = t0.sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref, ref}, __LINE__, __FILE__); -} - -// Repro of issue #1074 -TEST(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - fusion.addOutput(tv2); - - tv2->split(0, 2); - tv2->split(-1, 4); - tv2->reorder({{1, 2}, {2, 1}}); - tv0->computeAt(tv2, 2); - - tv2->split(-1, 3); - - // To make the sanitizer catch the invalid accesses. Not necessary - // to expose the bug. - tv1->setMemoryType(MemoryType::Shared); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, - "Only tv2 should have a non-divisible predicate."); - for (auto tv : {tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({13, 17}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = t0 + 2; - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Similar to FusionNonDivisibleSplit1 but with unswitch -TEST(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - tv2->split(0, 5); - tv2->split(1, 3); - - tv0->computeAt(tv2, -1); - - tv2->axis(0)->parallelize(ParallelType::Unswitch); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, - "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != 
gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Non-divisible split through merge -TEST(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0, 1}); - fusion.addOutput(tv2); - - tv2->split(0, 5); - tv2->merge(1, 2); - tv2->split(1, 3); - - tv0->computeAt(tv2, -1); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, - "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24, 2}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Nested splits -TEST(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {0}); - fusion.addOutput(tv2); - - // [I] - tv2->split(0, 8); - // [I/8, 8] - tv2->split(1, 2); - // [I/8, 4, 2] - tv2->split(1, 3); // non-divisible split of outer output - // [I/8, 2, 3, 2] - - tv0->computeAt(tv2, -1); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(), - "There must be no split to validate"); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, - "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { - auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); - TORCH_CHECK( - it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), - "No info found for ", - tv); - const auto& splits_to_predicate = it->second; - TORCH_CHECK( - splits_to_predicate.size() == 1, - "There must be one split to predicate"); - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -// Vectorized non-divisible split. 
Must be validated at run time -TEST(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = set(tv0); - fusion.addOutput(tv1); - - tv1->split(0, 8, false); - tv1->split(1, 4); - - tv1->axis(-1)->parallelize(ParallelType::Vectorize); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1, - "There should be one split to validate"); - for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) { - const auto& splits_to_predicate = kv.second; - TORCH_CHECK( - splits_to_predicate.empty(), - "There must be no split to predicate, but tensor t", - kv.first->name(), - " has:", - splits_to_predicate); - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - - auto t0 = at::randn({32}, options); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = t0; - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); - - auto t0_non_divisible = at::randn({8}, options); - // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is - // illegal. The run-time validation of vectorization should throw an error. - ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible})); -} - -// If a split is validated at run time, it's not necessary to predicate. -TEST(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - fusion.addInput(tv0); - - auto tv1 = set(tv0); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = sum(tv2, {0}); - fusion.addOutput(tv3); - - tv3->split(0, 8, false); - tv3->split(1, 4); - TransformPropagator::from(tv3); - - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->axis(2)->parallelize(ParallelType::Vectorize); - - GpuLower gpulw(&fusion); - TORCH_CHECK( - gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1, - "There should be one split to validate"); - for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) { - const auto& splits_to_predicate = kv.second; - TORCH_CHECK( - splits_to_predicate.empty(), - "There must be no split to predicate, but tensor t", - kv.first->name(), - " has:", - splits_to_predicate); - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - - auto t0 = at::randn({1024}, options); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = (t0 + 1).sum(); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -} // namespace jit -} // namespace torch -#endif // #if defined(USE_CUDA) diff --git a/test/cpp/jit/test_gpu_shift.cpp b/test/cpp/jit/test_gpu_shift.cpp deleted file mode 100644 index 71fa156c2d24..000000000000 --- a/test/cpp/jit/test_gpu_shift.cpp +++ /dev/null @@ -1,4637 +0,0 @@ -#if defined(USE_CUDA) -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// fuser and IR parser -#include "test_gpu_validator.h" - -#include -#include -#include - -#include -#include - -// Tests go in torch::jit -namespace torch { -namespace jit { - -using namespace torch::jit::fuser::cuda; 
-using namespace at::indexing;
-
-namespace {
-
-// Make a tensor that is known to be fully contiguous of dimensionality=ndims,
-// but unknown sizes
-TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float) {
-  return TensorViewBuilder()
-      .ndims(ndims)
-      .dtype(dtype)
-      .contiguity(std::vector<bool>(ndims, true))
-      .build();
-}
-
-// Make a tensor that is known to be non-contiguous of dimensionality=ndims,
-// but unknown sizes
-TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) {
-  return TensorViewBuilder().ndims(ndims).dtype(dtype).build();
-}
-
-// Make a non-contiguous tensor of compile-time known sizes
-TensorView* makeConcreteTensor(
-    std::vector<int64_t> shape,
-    DataType dtype = DataType::Float) {
-  return TensorViewBuilder().shape(shape).dtype(dtype).build();
-}
-
-void checkIntValue(
-    ExpressionEvaluator& evaluator,
-    Val* val,
-    Int::ScalarType expected_value) {
-  TORCH_CHECK(val->isAnInt());
-  const auto actual_value = evaluator.evaluate(val);
-  TORCH_CHECK(actual_value.has_value());
-  TORCH_CHECK(actual_value.value() == expected_value);
-}
-
-void checkIntValue(
-    kir::ExpressionEvaluator& evaluator,
-    const kir::Val* val,
-    kir::Int::ScalarType expected_value) {
-  const auto actual_value = evaluator.evaluate(val);
-  TORCH_CHECK(actual_value.has_value());
-  TORCH_CHECK(actual_value.value() == expected_value);
-}
-
-// ATen version of tensor shifting
-auto shift(
-    at::Tensor tensor,
-    const std::vector<int>& offsets,
-    std::vector<int> strides = {}) {
-  TORCH_INTERNAL_ASSERT(tensor.ndimension() == offsets.size());
-  if (strides.empty()) {
-    strides = std::vector<int>(tensor.ndimension(), 1);
-  }
-  at::Tensor t = tensor;
-  std::vector<at::indexing::TensorIndex> stride_indices;
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    auto stride = strides[i];
-    stride_indices.push_back(
-        at::indexing::Slice(0, at::indexing::None, stride));
-    const auto offset = offsets[i];
-    if (offset == 0) {
-      continue;
-    }
-    t = t.roll(offsets[i], i);
-    std::vector<at::indexing::TensorIndex> indices(
-        tensor.ndimension(), at::indexing::Slice(0, at::indexing::None));
-    if (offset > 0) {
-      indices[i] = at::indexing::Slice(0, offset);
-    } else {
-      indices[i] = at::indexing::Slice(offset, at::indexing::None);
-    }
-    t.index(indices) = 0;
-  }
-  t = t.index(stride_indices);
-  return t;
-}
-
-// ATen version of tensor gather
-auto gather(
-    at::Tensor tensor,
-    const std::vector<int>& window_shape,
-    const std::vector<std::vector<int>>& pad_width,
-    std::vector<int> strides = {}) {
-  TORCH_CHECK(
-      tensor.ndimension() == window_shape.size(),
-      "Invalid window shape: ",
-      window_shape,
-      ". Size of the window shape is different from the tensor dimension.");
-  TORCH_CHECK(
-      tensor.ndimension() == pad_width.size(),
-      "Invalid pad width: ",
-      pad_width,
-      ". Size of the pad width is different from the tensor dimension.");
-  if (strides.empty()) {
-    strides = std::vector<int>(tensor.ndimension(), 1);
-  } else {
-    TORCH_CHECK(
-        tensor.ndimension() == strides.size(),
-        "Invalid strides: ",
-        strides,
-        ". 
Size of strides is different from the tensor dimension."); - } - at::Tensor t = tensor; - for (size_t i = 0; i < window_shape.size(); ++i) { - const auto w_size = window_shape[i]; - TORCH_CHECK(w_size != 0); - const auto& pad = pad_width[i]; - TORCH_CHECK(pad.size() == 2); - at::Tensor concat_tensor; - for (int w = 0; w < w_size; ++w) { - std::vector shift_offsets(t.ndimension(), 0); - shift_offsets[i] = pad[0] - w; - std::vector shift_strides(t.ndimension(), 1); - shift_strides[i] = strides[i]; - auto shifted = shift(t, shift_offsets, shift_strides); - shifted = shifted.unsqueeze(-1); - if (w == 0) { - concat_tensor = shifted; - } else { - concat_tensor = at::cat({concat_tensor, shifted}, -1); - } - } - t = concat_tensor; - } - return t; -} - -} // namespace - -// Shift an input tensor -TEST(NVFuserTest, FusionShift1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = shift(tv0, {-1, 0}); - fusion.addOutput(tv1); - - auto tv2 = shift(tv0, {0, 1}); - fusion.addOutput(tv2); - - auto tv3 = shift(tv0, {2, 2}); - fusion.addOutput(tv3); - - auto tv4 = shift(tv0, {-2, -2}); - fusion.addOutput(tv4); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = shift(t0, {-1, 0}); - TORCH_CHECK(t1.equal(outputs[0])); - - auto t2 = shift(t0, {0, 1}); - TORCH_CHECK(t2.equal(outputs[1])); - - auto t3 = shift(t0, {2, 2}); - TORCH_CHECK(t3.equal(outputs[2])); - - auto t4 = shift(t0, {-2, -2}); - TORCH_CHECK(t4.equal(outputs[3])); -} - -// Shifts an intermediate tensor -TEST(NVFuserTest, FusionShift2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {-1, 0}); - fusion.addOutput(tv2); - - // make it a little more complex - auto tv3 = add(tv0, new Double(3)); - auto tv4 = add(tv3, new Double(4)); - auto tv5 = shift(tv4, {-1, 0}); - auto tv6 = shift(tv4, {0, -1}); - auto tv7 = shift(tv4, {1, 0}); - auto tv8 = shift(tv4, {0, 0}); - auto tv9 = add(tv5, tv6); - auto tv10 = add(tv9, tv7); - auto tv11 = add(tv10, tv8); - fusion.addOutput(tv11); - - for (auto tv : {tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11}) { - tv->setMemoryType(MemoryType::Global); - } - - // t1 allocation: (t1.size[0] + 1) * (t1.size[1]) - // t3 allocation: (t3.size[0] + 2) * (t3.size[1] + 1) - // t4 allocation: (t3.size[0] + 2) * (t3.size[1] + 1) - GpuLower gpulw(&fusion); - - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 3 || tensor_name == 4) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - if (tensor_name == 1 && i == 1) { - TORCH_CHECK(alloc->shape().at(i)->isA()); - continue; - } - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - TORCH_CHECK(def != nullptr && def->operation() == BinaryOpType::Add); - TORCH_CHECK(def->as()->lhs()->isA()); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - if (tensor_name == 1) { - TORCH_CHECK(i == 0); - TORCH_CHECK(rhs_value == 1); - } else { - if (i == 0) { - 
TORCH_CHECK(rhs_value == 2); - } else { - TORCH_CHECK(rhs_value == 1); - } - } - } - } - } - } - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {-1, 0}); - - auto t3 = t0 + 3; - auto t4 = t3 + 4; - auto t5 = shift(t4, {-1, 0}); - auto t6 = shift(t4, {0, -1}); - auto t7 = shift(t4, {1, 0}); - auto t8 = shift(t4, {0, 0}); - auto t9 = t5 + t6; - auto t10 = t9 + t7; - auto t11 = t10 + t8; - - testValidate(&fusion, outputs, inputs, {t2, t11}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftRightOfCA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - fusion.addOutput(tv2); - - tv0->computeAt(tv2, -2); - - tv1->setMemoryType(MemoryType::Global); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 100; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - - TORCH_CHECK(t2.allclose(outputs[0])); -} - -TEST(NVFuserTest, FusionShiftLeftOfCA_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = shift(tv2, {-1, 0}); - auto tv4 = add(tv3, new Double(1)); - fusion.addOutput(tv4); - - tv0->computeAt(tv4, -1); - - // Lowering should trigger an assertion failure as a shifted axis is - // found inside an allocation position. 
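  // Presumably this is because, with tv0->computeAt(tv4, -1), tv2 is
  // allocated as a single element inside the innermost loop, while the shift
  // producing tv3 needs a neighboring (halo) element of tv2 that was never
  // materialized, so lowering has to reject the schedule.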
- ASSERT_ANY_THROW(fusion.printKernel()); -} - -TEST(NVFuserTest, FusionShiftSplit1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - auto tv3 = shift(tv1, {0, -2}); - fusion.addOutput(tv2); - fusion.addOutput(tv3); - - int split_factor = 4; - tv2->split(-1, split_factor); - tv3->split(-1, split_factor); - - tv0->computeAt(tv2, -2); - tv0->computeAt(tv3, -2); - - // t1 allocation: (4 + 3) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 3); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto t3 = shift(t1, {0, -2}); - - testValidate(&fusion, outputs, inputs, {t2, t3}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftSplit2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = shift(tv2, {0, -1}); - auto tv4 = shift(tv2, {0, 1}); - auto tv5 = add(tv3, tv4); - fusion.addOutput(tv5); - - auto tv6 = add(tv0, new Double(1)); - auto tv7 = shift(tv6, {0, 0}); - auto tv8 = add(tv7, new Double(1)); - fusion.addOutput(tv8); - - int split_factor = 4; - - tv5->split(-1, split_factor); - tv8->split(-1, split_factor); - - tv0->computeAt(tv5, -2); - tv0->computeAt(tv8, -2); - - // t1 and t2 allocation: (4 + 2) - // t4 allocation: (4) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); - } else if (tensor_name == 4) { - TORCH_CHECK(alloc->shape().size() == 1); - auto size = dynamic_cast(alloc->shape().at(0)); - TORCH_CHECK(size != nullptr && size->isConst()); - int size_value = *size->value(); - TORCH_CHECK(size_value == split_factor); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto 
t1 = t0 + 2; - auto t3 = shift(t1, {0, -1}); - auto t4 = shift(t1, {0, 1}); - auto t5 = t3 + t4; - - auto t6 = t0 + 1; - auto t7 = t6; - auto t8 = t7 + 1; - - testValidate(&fusion, outputs, inputs, {t5, t8}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftDoubleSplit_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {0, 1}); - fusion.addOutput(tv3); - - int split_factor1 = 8; - int split_factor2 = 4; - - tv3->split(-1, split_factor1); - - tv0->computeAt(tv3, -2); - - tv1->split(-1, split_factor2); - - // t1: [i1, i2/8, 8/4, 4] - // t2: [i1, i2/8, 8] - // t3: [i1, i2/8, 8] - - // t1 and t2 allocation: (split_factor1 + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 3; - auto ref = shift(t1, {0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift3ptStencil_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 3-pt stencil - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - std::vector> offsets = {{-1}, {1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - int split_factor = 4; - - tv_out->split(0, split_factor); - - // This seems fine but not verified yet - // tv_out->axis(-1)->parallelize(ParallelType::Unswitch); - - auto cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 1); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - // cache allocation: (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == cache->name()) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencil_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 5-pt stencil - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - std::vector split_factor({4, 8}); - - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - auto cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 2); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - // cache allocation: (split_factor + 2) * (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == cache->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift9ptStencil_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 9-pt stencil - std::vector> offsets; - for (int i = -1; i < 2; ++i) { - for (int j = -1; j < 2; ++j) { - if (i == 0 && j == 0) { - continue; - } - offsets.push_back({i, j}); - } - } - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - std::vector split_factor({4, 8}); - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - auto cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 2); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - // This seems fine but not yet verified - // tv_out->axis(-1)->parallelize(ParallelType::Unswitch); - - // cache allocation: (split_factor + 2) * (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& 
kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == cache->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftSmemBlocking_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - fusion.addOutput(tv2); - - int smem_block_factor = 32; - - tv2->split(-1, smem_block_factor); - - tv0->computeAt(tv2, -2); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - // tv1 allocation: (split_factor + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == tv1->name()) { - TORCH_CHECK(alloc->shape().size() == 1); - for (int i = 0; i < 1; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == smem_block_factor && rhs_value == 1); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 100; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto ref = t2; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift3ptStencilParallel_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 3-pt stencil - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - std::vector tvs; - tvs.push_back(shift(tv0, {-1})); - tvs.push_back(shift(tv0, {1})); - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - int smem_block_factor = 32; - - tv_out->split(0, smem_block_factor); - // tv_out->axis(-1)->parallelize(ParallelType::Unswitch); - - auto tv0_cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 1); - - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - 
tv0_cache->setMemoryType(MemoryType::Shared); - tv_out->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencilParallel_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 5-pt stencil - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - int smem_block_factor = 32; - - tv_out->split(-1, smem_block_factor); - tv_out->split(0, smem_block_factor); - - tv_out->reorder({{1, 2}, {2, 1}}); - - auto tv0_cache = tv0->cache_after(); - - tv0->computeAt(tv_out, 2); - - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - tv_out->axis(-1)->parallelize(ParallelType::TIDx); - tv_out->axis(-2)->parallelize(ParallelType::TIDy); - tv_out->axis(-3)->parallelize(ParallelType::BIDx); - tv_out->axis(-4)->parallelize(ParallelType::BIDy); - - tv0_cache->setMemoryType(MemoryType::Shared); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - tv0_cache->axis(-2)->parallelize(ParallelType::TIDy); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftMerge1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {-1, 1}); - fusion.addOutput(tv2); - - int split_factor = 4; - - tv2->split(-1, split_factor); - tv2->split(0, split_factor); - tv2->reorder({{1, 2}, {2, 1}}); - tv2->merge(2, 3); - - tv0->computeAt(tv2, 2); - - // t1 allocation: (split_factor + 1) * (split_factor + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 1); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {-1, 1}); - auto ref = t2; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftMerge2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}); - auto tv3 = shift(tv1, {-1, 1}); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - int split_factor = 4; - - tv4->split(-1, split_factor); - tv4->split(0, split_factor); - tv4->reorder({{1, 2}, {2, 1}}); - tv4->merge(2, 3); - - tv0->computeAt(tv4, -2); - - // t1 allocation: (split_factor + 2) * (split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - - TORCH_CHECK(t4.allclose(outputs[0])); -} - -TEST(NVFuserTest, FusionShiftGlobal_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - auto tv3 = shift(tv1, {-1, 0}); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv1->split(-1, 4); - tv2->split(-1, 8); - tv3->split(-1, 2); - tv4->split(-1, 3); - - tv1->merge(-2, -1); - - tv1->setMemoryType(MemoryType::Global); - tv2->setMemoryType(MemoryType::Global); - tv3->setMemoryType(MemoryType::Global); - - // t1 allocation: (t1.size[0] + 1) * (t1.size[1] + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - TORCH_CHECK(def != nullptr && def->operation() == BinaryOpType::Add); - TORCH_CHECK(def->as()->lhs()->isA()); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(rhs_value == 1); - } - } - } - } - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); 
- - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto t3 = shift(t1, {-1, 0}); - auto t4 = t2 + t3; - auto ref = t4; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {0, 1}); - fusion.addOutput(tv3); - - int split_factor1 = 8; - int split_factor2 = 4; - - tv3->split(-1, split_factor1); - - tv0->computeAt(tv3, -2); - - tv1->split(-1, split_factor2); - tv1->merge(-2, -1); - - // t1 and t2 allocation: (split_factor1 + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 3; - auto ref = shift(t1, {0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {1, 1}); - fusion.addOutput(tv3); - - auto out = tv3; - - int split_factor1 = 32; - int split_factor2 = 4; - - out->split(-1, split_factor1); - out->split(-1, split_factor2); - out->split(0, split_factor1); - out->split(1, split_factor2); - out->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}}); - out->merge(2, 3); - out->merge(2, 3); - out->merge(2, 3); - out->merge(0, 1); - - TransformPropagator::from(out); - - tv0->computeAt(out, 1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {tv1, tv2}); - - for (auto tv : {tv1, tv2}) { - tv->setMemoryType(MemoryType::Shared); - } - - // t1 and t2 allocation: (split_factor1 + 1) * (split_factor1 + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); - } - } - } - } - - FusionExecutor fe; - 
fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = shift(t0 + 1 + 2, {1, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // 5-pt stencil - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - std::vector tvs; - for (const auto& offset : offsets) { - tvs.push_back(shift(tv0, offset)); - } - - auto tv_out = tv0; - - for (auto tv : tvs) { - tv_out = add(tv_out, tv); - } - - tv_out = div(tv_out, new Double(tvs.size() + 1)); - - fusion.addOutput(tv_out); - - std::vector split_factor({4, 32}); - - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - auto tv0_cache = tv0->cache_after(); - - // Merge the inner-most two axes and create - // a 1D thread block of split_factor1*split_factor2 threads - tv_out->merge(-2, -1); - - tv0->computeAt(tv_out, 2); - - // Inline completely except for the cache - for (auto tv : tvs) { - tv->computeAt(tv_out, -1); - } - - tv0_cache->merge(-2, -1); - - tv_out->axis(-1)->parallelize(ParallelType::TIDx); - tv_out->axis(1)->parallelize(ParallelType::BIDx); - tv_out->axis(0)->parallelize(ParallelType::BIDy); - - tv0_cache->setMemoryType(MemoryType::Shared); - tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - - // cache allocation: (split_factor1 + 2) * (split_factor2 + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == tv0_cache->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = t0; - for (const auto& offset : offsets) { - ref = ref + shift(t0, offset); - } - ref = ref / int(offsets.size() + 1); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = shift(tv0, {0, 1}); - auto tv2 = shift(tv1, {0, 1}); - fusion.addOutput(tv2); - - int split_factor = 4; - tv2->split(-1, split_factor); - - tv0->computeAt(tv2, -2); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = 
fe.runFusion(inputs); - - auto ref = shift(shift(t0, {0, 1}), {0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = shift(tv0, {0, 1}); - auto tv2 = shift(tv1, {0, -1}); - fusion.addOutput(tv2); - - tv2->split(-1, 4); - - tv0->computeAt(tv2, -2); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto ref = shift(shift(t0, {0, 1}), {0, -1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {0, 1}); - auto tv3 = shift(tv2, {0, 1}); - fusion.addOutput(tv3); - - int split_factor = 4; - tv3->split(-1, split_factor); - - tv0->computeAt(tv3, -2); - - // Halo size of tv1 is 2 as it needs to account for both of the two - // shift operations , while that of tv2 is still just 1 - - // tv1: (split_factor + 2) - // tv2: (split_factor + 1) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - for (int i = 0; i < 1; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor); - if (tensor_name == 1) { - TORCH_CHECK(rhs_value == 2); - } else if (tensor_name == 2) { - TORCH_CHECK(rhs_value == 1); - } - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {0, 1}); - auto t3 = shift(t2, {0, 1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftChain4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = shift(tv0, {1, -1}); - auto tv2 = shift(tv1, {2, -2}); - auto tv3 = shift(tv2, {3, -3}); - auto tv4 = shift(tv3, {4, -4}); - auto tv_out = tv4; - - fusion.addOutput(tv_out); - - int split_factor = 4; - - tv_out->split(-1, split_factor); - tv_out->split(0, split_factor); - tv_out->reorder({{1, 2}, {2, 1}}); - - tv0->computeAt(tv_out, 2); - - tv1->merge(-2, -1); - tv2->merge(-2, -1); - tv3->merge(-2, -1); - - // tv1: (split_factor + 9) * (split_factor + 9) - // tv2: (split_factor + 7) * (split_factor + 7) - // tv3: (split_factor + 4) * (split_factor + 4) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = 
dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor); - if (tensor_name == 1) { - TORCH_CHECK(rhs_value == 9); - } else if (tensor_name == 2) { - TORCH_CHECK(rhs_value == 7); - } else if (tensor_name == 3) { - TORCH_CHECK(rhs_value == 4); - } - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = shift(t0, {1, -1}); - auto t2 = shift(t1, {2, -2}); - auto t3 = shift(t2, {3, -3}); - auto t4 = shift(t3, {4, -4}); - auto ref = t4; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShift5ptStencilChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - // First stencil: 5pt stencil - // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5 - std::vector tv_stencil1_shifts; - for (const auto& offset : offsets) { - tv_stencil1_shifts.push_back(shift(tv0, offset)); - } - - auto tv_stencil1 = tv0; - for (auto tv : tv_stencil1_shifts) { - tv_stencil1 = add(tv_stencil1, tv); - } - - tv_stencil1 = div(tv_stencil1, new Double(tv_stencil1_shifts.size() + 1)); - - // Second stencil: Same 5pt stencil - std::vector tv_stencil2_shifts; - for (const auto& offset : offsets) { - tv_stencil2_shifts.push_back(shift(tv_stencil1, offset)); - } - - auto tv_stencil2 = tv_stencil1; - for (auto tv : tv_stencil2_shifts) { - tv_stencil2 = add(tv_stencil2, tv); - } - - tv_stencil2 = div(tv_stencil2, new Double(tv_stencil2_shifts.size() + 1)); - - auto tv_out = tv_stencil2; - - fusion.addOutput(tv_out); - - auto tv0_cache = tv0->cache_after(); - - std::vector split_factor({16, 16}); - - tv_out->split(-1, split_factor[1]); - tv_out->split(0, split_factor[0]); - tv_out->reorder({{1, 2}, {2, 1}}); - - tv0->computeAt(tv_out, 2); - - // Inline completely all inputs to the first stencil output, except for the - // tv0 cache - for (auto tv : tv_stencil1_shifts) { - tv->computeAt(tv_stencil1, -1); - } - - // Inline completely all inputs to the second stencil output, except - // for the first stencil output - for (auto tv : tv_stencil2_shifts) { - tv->computeAt(tv_stencil2, -1); - } - - tv_out->axis(1)->parallelize(ParallelType::BIDx); - tv_out->axis(0)->parallelize(ParallelType::BIDy); - - auto all_values = DependencyCheck::getAllValsBetween( - {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs()); - for (auto tv : ir_utils::filterByType(all_values)) { - tv->axis(-1)->parallelize(ParallelType::TIDx); - tv->axis(-2)->parallelize(ParallelType::TIDy); - } - - tv0_cache->setMemoryType(MemoryType::Shared); - tv_stencil1->setMemoryType(MemoryType::Shared); - - // tv0_cache: (split_factor + 4) * (split_factor + 4) - // tv_stencil1: (split_factor + 2) * 
(split_factor + 2) - GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { - auto tensor_name = alloc->buffer()->name(); - if (tensor_name == tv0_cache->name() || - tensor_name == tv_stencil1->name()) { - TORCH_CHECK(alloc->shape().size() == 2); - for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i]); - if (tensor_name == tv0_cache->name()) { - TORCH_CHECK(rhs_value == 4); - } else if (tensor_name == tv_stencil1->name()) { - TORCH_CHECK(rhs_value == 2); - } - } - } - } - } - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto stencil1 = t0; - for (const auto& offset : offsets) { - stencil1 = stencil1 + shift(t0, offset); - } - stencil1 = stencil1 / int(offsets.size() + 1); - auto stencil2 = stencil1; - for (const auto& offset : offsets) { - stencil2 = stencil2 + shift(stencil1, offset); - } - stencil2 = stencil2 / int(offsets.size() + 1); - auto ref = stencil2; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Shift a reduced tensor -TEST(NVFuserTest, FusionShiftReduction1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - tv0->computeAt(tv2, -1); - - const int numel_x = 9; - const int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = sum(t1, {1}); - auto t3 = shift(t2, {1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Parallelized version of FusionShiftReduction1 -TEST(NVFuserTest, FusionShiftReduction2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - - tv2->split(-1, 32); - tv0->computeAt(tv2, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv2->setMemoryType(MemoryType::Shared); - - const int numel_x = 201; - const int numel_y = 301; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = sum(t1, {1}); - auto t3 = shift(t2, {1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftRfactor1_CUDA) { - Fusion fusion; - 
FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = sum(tv1, {1}); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv0->computeAt(tv3, 1); - - tv2->split(-1, 32); - auto rf = tv2->rFactor({-2}); - tv0->computeAt(tv2, -1); - tv0->computeAt(rf, -1); - - tv2->axis(-1)->parallelize(ParallelType::TIDx); - - tv2->setMemoryType(MemoryType::Shared); - - const int numel_x = 201; - const int numel_y = 301; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = sum(t1, {1}); - auto t3 = shift(t2, {1}); - auto ref = t3; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftBcast1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = shift(tv2, {0, 1}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv0->computeAt(tv4, -1); - tv1->computeAt(tv4, -1); - - const int numel_x = 9; - const int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t4 = t0.unsqueeze(-1).expand({numel_x, numel_y}) + t1; - auto ref = t4; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftBcast2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = shift(tv2, {1, 0}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->split(0, 4); - tv0->computeAt(tv4, 1); - - const int numel_x = 9; - const int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y}); - auto t3 = shift(t2, {1, 0}); - auto ref = t3 + t1; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Combine ShiftBcast1 and ShiftBcast2 with parallelization -TEST(NVFuserTest, FusionShiftBcast3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = makeSymbolicTensor(2); - fusion.addInput(tv1); - auto tv2 = broadcast(tv0, {false, true}); - auto tv3 = shift(tv2, {1, 0}); - auto tv4 = shift(tv2, {0, 1}); - auto tv5 = shift(tv2, {-1, -1}); - auto tv6 = add(tv3, tv4); - auto tv7 = add(tv6, tv5); - auto tv8 = add(tv7, tv1); - fusion.addOutput(tv8); - - tv8->split(0, 4); - tv8->split(-1, 4); - tv0->computeAt(tv8, 1); - - tv8->axis(-1)->parallelize(ParallelType::TIDx); - for (auto tv : {tv8, tv7, tv6, tv5, tv4, tv3, tv2}) { - tv->axis(1)->parallelize(ParallelType::TIDy); - } 
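// Note on the reference computation below: an offset along the broadcast
// axis is effectively dropped, so t4 reuses t2 unshifted and t5 applies only
// the {-1, 0} component of the {-1, -1} offset requested for tv5.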
- - tv2->setMemoryType(MemoryType::Shared); - - const int numel_x = 101; - const int numel_y = 201; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - at::Tensor t1 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0, t1}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y}); - auto t3 = shift(t2, {1, 0}); - auto t4 = t2; - auto t5 = shift(t2, {-1, 0}); - auto ref = t3 + t4 + t5 + t1; - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// See issue #893 -TEST(NVFuserTest, FusionShiftSyncPlacement1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(2)); - auto tv3 = add(tv1, tv2); - auto tv4 = shift(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv4->split(1, 8); - tv0->computeAt(tv4, 2); - - tv2->computeAt(tv3, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = t0 + 2; - auto t3 = add(t1, t2); - auto t4 = shift(t3, {0, 1}); - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -// See issue #893. Top-level placement. -TEST(NVFuserTest, FusionShiftSyncPlacement2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(2)); - auto tv3 = add(tv1, tv2); - auto tv4 = shift(tv3, {1}); - fusion.addOutput(tv4); - - tv2->computeAt(tv3, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv3->setMemoryType(MemoryType::Shared); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = t0 + 2; - auto t3 = add(t1, t2); - auto t4 = shift(t3, {1}); - - testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftSyncPlacement3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = shift(tv2, {1}); - fusion.addOutput(tv3); - - // This doesn't work. syncthreads is needed between tv1 and tv2, but - // both the loop extent of both tv1 and tv2 has halo, so the loop is - // not eliminated even though it is parallelized. Moving syncthreads - // out of the loop would make it placed before tv1, which would make - // it meaningless. 
- // Ideally, an exception should be thrown at this computeAt, but at - // this point, the fusion is not yet parallelized, nor memory type - // is set, so this computeAt itself is not an error yet. - tv1->computeAt(tv2, -1); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv1->axis(-1)->parallelize(ParallelType::TIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - - // The error should be detected when the fusion is lowered. - ASSERT_ANY_THROW(fusion.printKernel()); -} - -// Based on original CUDA provided by Vishal Mehta. -// Major differences with the original version: -// - The original version uses additional 2 warps to load the halos -// along the Y dimension. The other 10 warps are used to load a 32x10 -// tile, and all warps will do coalesced loads. No such optimization -// is done in the fuser version. -TEST(NVFuserTest, FusionHdiff_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - auto coeff = makeSymbolicTensor(3); - fusion.addInput(coeff); - - std::vector> offsets{ - {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}}; - - // T2, T3, T4, T5 - std::vector inp_neighbors; - for (const auto& offset : offsets) { - inp_neighbors.push_back(shift(inp, offset, false)); - } - - // T8 - TensorView* sum_of_neighbors = nullptr; - for (auto inp_neighbor : inp_neighbors) { - if (sum_of_neighbors == nullptr) { - sum_of_neighbors = inp_neighbor; - } else { - sum_of_neighbors = add(sum_of_neighbors, inp_neighbor); - } - } - - // T9 = T0 * 4 - // T10 = T9 - T8 - auto lap = sub(mul(inp, new Double(4)), sum_of_neighbors); - - // T11 = shift(T10) - // T12 = T11 - T10 - auto flx = sub(shift(lap, {0, 0, -1}, false), lap); - // T14 = T13 - T0 - // T15 = T12 * T14 - // T16 = T15 > 0 - // T17 = T16 ? 0 : T12 - auto flx_cond = - gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), new Double(0)); - auto flx0 = where(flx_cond, new Double(0), flx); - - // T18 = shift(T10) - // T19 = T18 - T10 - auto fly = sub(shift(lap, {0, -1, 0}, false), lap); - // T20 = shift(T0) - // T21 = T20 - T0 - // T22 = T19 * T21 - // T23 = T22 > 0 - auto fly_cond = - gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), new Double(0)); - // T24 = T23 ? 
0 : T19 - auto fly0 = where(fly_cond, new Double(0), fly); - - // T25 = shift(flx0) - // T26 = T17 - T25 - // T27 = shift(fly0) - // T28 = T24 - T27 - // T29 = T26 + T28 - // T30 = T1 * T29 - // T31 = T0 - T30 - auto out = - sub(inp, - mul(coeff, - add(sub(flx0, shift(flx0, {0, 0, 1}, false)), - sub(fly0, shift(fly0, {0, 1, 0}, false))))); - - fusion.addOutput(out); - - ///////////////////////////////// - // Scheduling - ///////////////////////////////// - - out->setContiguity(false); - - // Step 1: 2D Tiling - - const int tile_x = 32; - const int tile_y = 8; - - out->split(-1, tile_x); - out->split(-3, tile_y); - out->reorder({{-2, -3}}); - inp->computeAt(out, -3); - coeff->computeAt(out, -3); - - // Step 2: Inlining - - // Inline inputs to lap - auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap}); - for (auto val : ir_utils::filterByType(lap_vals)) { - if (val != lap && val != inp) { - val->computeAt(lap, -1); - } - } - - // Inline inputs to flx0 - auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0}); - for (auto val : ir_utils::filterByType(flx0_vals)) { - if (val != lap && val != flx0 && val != inp) { - val->computeAt(flx0, -1); - } - } - - // Inline inputs to fly0 - auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0}); - for (auto val : ir_utils::filterByType(flxy_vals)) { - if (val != lap && val != fly0 && val != inp) { - val->computeAt(fly0, -1); - } - } - - // Inline inputs to out - auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out}); - for (auto val : ir_utils::filterByType(out_vals)) { - if (val != flx0 && val != fly0 && val != out) { - val->computeAt(out, -1); - } - } - - // Step 3: Parallelization - - // Block parallelization - out->axis(0)->parallelize(ParallelType::BIDz); - out->axis(1)->parallelize(ParallelType::BIDy); - out->axis(2)->parallelize(ParallelType::BIDx); - // Thread parallelization - out->axis(3)->parallelize(ParallelType::TIDy); - out->axis(4)->parallelize(ParallelType::TIDx); - // Apply the same parallelization to all other tensors - scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion)); - - // Store intermediate stencil results on smem so that they can be - // accessed by threads - for (auto tv : {flx0, fly0, lap}) { - tv->setMemoryType(MemoryType::Shared); - } - - ///////////////////////////////// - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 101; - int numel_y = 99; - int numel_z = 10; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options); - at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options); - std::vector inputs = {inp_at, coeff_at}; - auto fuser_output = fe.runFusion(inputs)[0]; - // Trim the outer rim - std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(2, -2), - at::indexing::Slice(2, -2)}; - fuser_output = fuser_output.index(indices); - - { - at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options); - auto lap = inp_at * 4 - - (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) + - shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1})); - auto flx = shift(lap, {0, 0, -1}) - lap; - auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0; - auto flx0 = at::where(flx_cond, zeros, flx); - auto fly = shift(lap, {0, -1, 0}) - lap; - auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0; - auto fly0 = at::where(fly_cond, zeros, fly); - - auto ref = inp_at - - coeff_at * - 
((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0}))); - ref = ref.index(indices); - - testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__); - } -} - -TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - auto coeff = makeSymbolicTensor(3); - fusion.addInput(coeff); - - std::vector> offsets{ - {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}}; - - // T2, T3, T4, T5 - std::vector inp_neighbors; - for (const auto& offset : offsets) { - inp_neighbors.push_back(shift(inp, offset, false)); - } - - // T8 - TensorView* sum_of_neighbors = nullptr; - for (auto inp_neighbor : inp_neighbors) { - if (sum_of_neighbors == nullptr) { - sum_of_neighbors = inp_neighbor; - } else { - sum_of_neighbors = add(sum_of_neighbors, inp_neighbor); - } - } - - // T9 = T0 * 4 - // T10 = T9 - T8 - auto lap = sub(mul(inp, new Double(4)), sum_of_neighbors); - - // T11 = shift(T10) - // T12 = T11 - T10 - auto flx = sub(shift(lap, {0, 0, -1}, false), lap); - // T14 = T13 - T0 - // T15 = T12 * T14 - // T16 = T15 > 0 - // T17 = T16 ? 0 : T12 - auto flx_cond = - gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), new Double(0)); - auto flx0 = where(flx_cond, new Double(0), flx); - - // T18 = shift(T10) - // T19 = T18 - T10 - auto fly = sub(shift(lap, {0, -1, 0}, false), lap); - // T20 = shift(T0) - // T21 = T20 - T0 - // T22 = T19 * T21 - // T23 = T22 > 0 - auto fly_cond = - gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), new Double(0)); - // T24 = T23 ? 0 : T19 - auto fly0 = where(fly_cond, new Double(0), fly); - - // T25 = shift(flx0) - // T26 = T17 - T25 - // T27 = shift(fly0) - // T28 = T24 - T27 - // T29 = T26 + T28 - // T30 = T1 * T29 - // T31 = T0 - T30 - auto out = - sub(inp, - mul(coeff, - add(sub(flx0, shift(flx0, {0, 0, 1}, false)), - sub(fly0, shift(fly0, {0, 1, 0}, false))))); - - fusion.addOutput(out); - - out->setContiguity(false); - - ///////////////////////////////// - // Scheduling - ///////////////////////////////// - - const auto all_vals = fusion.usedMathVals(); - const std::vector all_tensors( - {ir_utils::filterByType(all_vals).begin(), - ir_utils::filterByType(all_vals).end()}); - - // Step 1: Blocking - // - Thread block size: (tile_x, tile_y) - // - Each thread computes a vertical column of length tile_z along the Z - // axis. 
- // - Grid dize: (NX / block_x, NY / block_y, NZ / tile_z) - - const int tile_x = 32; - const int tile_y = 8; - const int tile_z = 16; - - out->split(0, tile_z); - out->split(-1, tile_x, true, true); - out->split(-3, tile_y, true, true); - // out: [NZ/tz, tz, NY/by, by, NX/bx, bx] - out->reorder({{1, 3}, {2, 1}, {3, 4}, {4, 2}}); - // out: [NZ/tz, NY/by, NX/bx, tz, by, bx] - - TransformPropagator::from(out); - - inp->computeAt(out, 4); - - // Step 2: Inlining - - // Inline inputs to lap - auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap}); - for (auto val : ir_utils::filterByType(lap_vals)) { - if (val != lap && val != inp) { - val->computeAt(lap, -1); - } - } - - // Inline inputs to flx0 - auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0}); - for (auto val : ir_utils::filterByType(flx0_vals)) { - if (val != lap && val != flx0 && val != inp) { - val->computeAt(flx0, -1); - } - } - - // Inline inputs to fly0 - auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0}); - for (auto val : ir_utils::filterByType(flxy_vals)) { - if (val != lap && val != fly0 && val != inp) { - val->computeAt(fly0, -1); - } - } - - // Inline inputs to out - auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out}); - for (auto val : ir_utils::filterByType(out_vals)) { - if (val != flx0 && val != fly0 && val != out) { - val->computeAt(out, -1); - } - } - - // Step 3: Parallelization - - // Block parallelization - out->axis(0)->parallelize(ParallelType::BIDz); - out->axis(1)->parallelize(ParallelType::BIDy); - out->axis(2)->parallelize(ParallelType::BIDx); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - // Unswitch at the tz axis - out->axis(3)->parallelize(ParallelType::Unswitch); - - scheduler_utils::parallelizeAllLike(out, all_tensors); - - // These need to be on smem - for (auto tv : {flx0, fly0, lap}) { - tv->setMemoryType(MemoryType::Shared); - } - - ///////////////////////////////// - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int halo_extent = 2; - const int numel_x = 64 + halo_extent * 2; - const int numel_y = 64 + halo_extent * 2; - const int numel_z = 32; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options); - at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options); - std::vector inputs = {inp_at, coeff_at}; - auto fuser_output = fe.runFusion(inputs)[0]; - // Trim the outer rim - std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(2, -2), - at::indexing::Slice(2, -2)}; - fuser_output = fuser_output.index(indices); - - { - at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options); - auto lap = inp_at * 4 - - (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) + - shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1})); - auto flx = shift(lap, {0, 0, -1}) - lap; - auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0; - auto flx0 = at::where(flx_cond, zeros, flx); - auto fly = shift(lap, {0, -1, 0}) - lap; - auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0; - auto fly0 = at::where(fly_cond, zeros, fly); - - auto ref = inp_at - - coeff_at * - ((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0}))); - ref = ref.index(indices); - - testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__); - } -} - -// 3x3 max pooling -TEST(NVFuserTest, FusionMaxPooling_CUDA) { - Fusion 
fusion; - FusionGuard fg(&fusion); - - // Format: CHW - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // 3x3 pooling of the HW spatial domain - std::vector> offsets; - for (int i = -1; i <= 1; ++i) { - for (int j = -1; j <= 1; ++j) { - if (i == 0 && j == 0) { - continue; - } - offsets.push_back({i, j}); - } - } - - std::vector inp_tile({inp}); - for (auto offset : offsets) { - offset.insert(offset.begin(), 0); - inp_tile.push_back(shift(inp, offset)); - } - - TensorView* max_tensor = nullptr; - for (auto tv : inp_tile) { - if (max_tensor == nullptr) { - max_tensor = tv; - } else { - max_tensor = binaryOp(BinaryOpType::Max, max_tensor, tv); - } - } - - fusion.addOutput(max_tensor); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Tiling the spatial domain - const int tile_x = 32; - const int tile_y = 8; - - max_tensor->split(-2, tile_y); - max_tensor->axis(-2)->parallelize(ParallelType::TIDy); - max_tensor->split(-1, tile_x); - max_tensor->axis(-1)->parallelize(ParallelType::TIDx); - max_tensor->reorder({{-3, -2}}); - - inp_cache->computeAt(max_tensor, 3); - inp_cache->axis(-2)->parallelize(ParallelType::TIDy); - inp_cache->axis(-1)->parallelize(ParallelType::TIDx); - inp_cache->setMemoryType(MemoryType::Shared); - - auto max_tensor_dep = - DependencyCheck::getAllValsBetween({inp_cache}, {max_tensor}); - for (auto tv : ir_utils::filterByType(max_tensor_dep)) { - if (tv == inp_cache || tv == max_tensor) { - continue; - } - tv->computeAt(max_tensor, -1); - } - - max_tensor->axis(0)->parallelize(ParallelType::BIDx); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int hw = 50; - const int num_channels = 20; - const int pooling_window = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options); - // shift always pads by zero, so if all surrounding values are - // negative, max pooling would pick a padded value, which isn't the - // correct behavior. We need to be able to choose the value of - // padding. In this case, padding by the minimum value would not - // have this problem. For now, avoid the problem by making sure all - // values are not negative. 
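// A standalone, hedged sketch of the zero-padding behavior described above,
// assuming the convention out[i] = in[i - offset] with out-of-range reads
// producing 0. shift1d below is illustrative only; it is not the helper that
// these tests actually call.
#include <ATen/ATen.h>

at::Tensor shift1d(const at::Tensor& in, int64_t offset) {
  auto out = at::zeros_like(in);
  const int64_t n = in.size(0);
  const int64_t len = n - (offset >= 0 ? offset : -offset);
  if (len <= 0) {
    return out; // everything shifted out of range; the result is all padding
  }
  if (offset >= 0) {
    // out[offset:] = in[:n - offset]
    out.narrow(0, offset, len).copy_(in.narrow(0, 0, len));
  } else {
    // out[:n + offset] = in[-offset:]
    out.narrow(0, 0, len).copy_(in.narrow(0, -offset, len));
  }
  return out;
}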
- aten_inp = at::abs(aten_inp); - std::vector inputs = {aten_inp}; - - auto outputs = fe.runFusion(inputs); - - auto ref = at::max_pool2d( - aten_inp, {pooling_window, pooling_window}, {1, 1}, {1, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGatherPadding1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - auto tv1 = gather(tv0, window_shape, padding_width); - - fusion.addOutput(tv1); - - const int s1 = 11; - const int s2 = 13; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - - auto ref = gather(t0, window_shape, padding_width); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - TORCH_CHECK(ref.equal(outputs[0])); -} - -TEST(NVFuserTest, FusionGatherPadding2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - tv3->split(1, 32); - tv0->computeAt(tv3, 2); - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDy); - tv3->axis(1)->parallelize(ParallelType::BIDx); - tv3->axis(2)->parallelize(ParallelType::TIDx); - tv1->axis(2)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 99; - const int s2 = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConv2DStatic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, 3, 3] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - // Gather a neighbor tile of [3, 3] with padding size of 1 for each - // side of the spatial dimensions - auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}); - // inp_tile: [C, H, W, 1, 3, 3] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - // Blocking the channel dimension - const int block_c = 8; - - out->split(2, block_h); - out->split(4, block_w); - out->reorder({{3, 4}}); - // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3] - - out->split(1, block_c); - // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - // 
out_rf: [K, Ci, Ho, Wo, Hi, Wi] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); - inp_cache->computeAt(out_rf, 5); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options); - std::vector inputs = {at_inp, at_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1); - at_out = at_out.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -// Mostly the same as the static conv test, but the shape of the weights, -// 3x3 in this case, is given dynamically -TEST(NVFuserTest, FusionConv2DDynamic_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, S, T] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - auto w_h = new Int(); - fusion.addInput(w_h); - auto w_w = new Int(); - fusion.addInput(w_w); - - auto pad_h = new Int(); - fusion.addInput(pad_h); - auto pad_w = new Int(); - fusion.addInput(pad_w); - - // Gather a neighbor tile of [w_dim_h, w_dim_w] with padding - auto inp_tile = gather( - inp, - {new Int(1), w_h, w_w}, - {{new Int(0), new Int(0)}, {pad_h, pad_h}, {pad_w, pad_w}}); - // inp_tile: [C, 1, H - w_h + 1, W - w_w + 1, w_h, w_w] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - // Blocking the channel dimension - const int block_c = 8; - - out->split(2, block_h); - out->split(4, block_w); - out->reorder({{3, 4}}); - // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3] - - out->split(1, block_c); - // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] - // out_rf: [K, Ci, Ho, Wo, Hi, Wi] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); - inp_cache->computeAt(out_rf, 5); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - 
out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - const int dim_w_h = 3; - const int dim_w_w = 3; - const int dim_pad_h = (dim_w_h - 1) / 2; - const int dim_pad_w = (dim_w_w - 1) / 2; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, dim_w_h, dim_w_w}, options); - std::vector inputs = { - at_inp, at_w, dim_w_h, dim_w_w, dim_pad_h, dim_pad_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1); - at_out = at_out.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -// 5x5 followed by 3x3 -TEST(NVFuserTest, FusionConv2DDynamicChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [K1, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K2, K1, S1, T1] - auto w1 = makeSymbolicTensor(4); - fusion.addInput(w1); - - // Weights: [K3, K2, S2, T2] - auto w2 = makeSymbolicTensor(4); - fusion.addInput(w2); - - auto w1_h = new Int(); - fusion.addInput(w1_h); - auto w1_w = new Int(); - fusion.addInput(w1_w); - - auto w2_h = new Int(); - fusion.addInput(w2_h); - auto w2_w = new Int(); - fusion.addInput(w2_w); - - auto pad_h1 = new Int(); - fusion.addInput(pad_h1); - auto pad_w1 = new Int(); - fusion.addInput(pad_w1); - - auto pad_h2 = new Int(); - fusion.addInput(pad_h2); - auto pad_w2 = new Int(); - fusion.addInput(pad_w2); - - // Gather a neighbor tile of [w1_h, w1_w] with padding - auto inp_tile = gather( - inp, - {new Int(1), w1_h, w1_w}, - {{new Int(0), new Int(0)}, {pad_h1, pad_h1}, {pad_w1, pad_w1}}); - // inp_tile: [C, 1, H - w1_h + 1, W - w1_w + 1, w1_h, w1_w] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w1_bc = broadcast(w1, {false, false, true, true, true, false, false}); - - auto inp_times_w1 = mul(inp_bc, w1_bc); - - // Reduce the channel and neighbor tile dimensions - auto out1 = sum(inp_times_w1, {1, 4, 5, 6}); - - // Second conv - auto out1_tile = gather( - out1, - {new Int(1), w2_h, w2_w}, - {{new Int(0), new Int(0)}, {pad_h2, pad_h2}, {pad_w2, pad_w2}}); - - auto out1_bc = - broadcast(out1_tile, {true, false, false, false, false, false, false}); - auto w2_bc = broadcast(w2, {false, false, true, true, true, false, false}); - - auto out1_times_w2 = mul(out1_bc, w2_bc); - - auto out2 = sum(out1_times_w2, {1, 4, 5, 6}); - - fusion.addOutput(out2); - - //////////////////////////////////// - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - - out2->split(2, block_h); - out2->split(4, block_w); - out2->reorder({{3, 4}}); - // out2: [K3, K2, Ho, Wo, Hi, Wi, 1, 3, 3] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out2, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out1->reorder({{5, 3}, {3, 4}, {4, 5}}); - out1->setMemoryType(MemoryType::Shared); - - 
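// Like the stencil intermediates staged in shared memory earlier, out1 is
// likely placed on smem here so that the gather feeding the second
// convolution can read first-stage results produced by neighboring threads.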
inp_cache->computeAt(out1, 4); - - inp_tile->computeAt(out1, -1); - w1->computeAt(out1, -1); - - out1_tile->computeAt(out2, -1); - w2->computeAt(out2, -1); - - out2->axis(0)->parallelize(ParallelType::BIDx); - out2->axis(4)->parallelize(ParallelType::TIDy); - out2->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out2, {inp_cache, out1}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_k1 = 3; - const int dim_k2 = 5; - const int dim_k3 = 7; - const int dim_w1_h = 5; - const int dim_w1_w = 5; - const int dim_pad1_h = (dim_w1_h - 1) / 2; - const int dim_pad1_w = (dim_w1_w - 1) / 2; - const int dim_w2_h = 3; - const int dim_w2_w = 3; - const int dim_pad2_h = (dim_w2_h - 1) / 2; - const int dim_pad2_w = (dim_w2_w - 1) / 2; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_k1, dim_h, dim_w}, options); - at::Tensor at_w1 = at::randn({dim_k2, dim_k1, dim_w1_h, dim_w1_w}, options); - at::Tensor at_w2 = at::randn({dim_k3, dim_k2, dim_w2_h, dim_w2_w}, options); - std::vector inputs = { - at_inp, - at_w1, - at_w2, - dim_w1_h, - dim_w1_w, - dim_w2_h, - dim_w2_w, - dim_pad1_h, - dim_pad1_w, - dim_pad2_h, - dim_pad2_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out1 = at::conv2d(at_inp, at_w1, {}, 1, 2); - auto at_out2 = at::conv2d(at_out1, at_w2, {}, 1, 1); - at_out2 = at_out2.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out2}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, 2, 2] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - // Gather a neighbor tile of [2, 2] with padding size of 1 only for - // the right side of the spatial dimensions. The left padding is - // zero so that the output axis stays the same. 
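// The padding arithmetic the comments in these gather-based tests rely on,
// as a standalone sketch (the formula is assumed, not quoted from the fuser
// sources): a window of size w with (pad_left, pad_right) padding leaves the
// gathered axis with extent n - w + 1 + pad_left + pad_right.
constexpr int gatheredExtent(int n, int w, int pad_left, int pad_right) {
  return n - w + 1 + pad_left + pad_right;
}
static_assert(
    gatheredExtent(99, 2, 0, 1) == 99,
    "even window, right-only padding keeps the extent");
static_assert(
    gatheredExtent(99, 3, 1, 1) == 99,
    "odd window, symmetric padding keeps the extent");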
- auto inp_tile = gather(inp, {1, 2, 2}, {{0, 0}, {0, 1}, {0, 1}}); - // inp_tile: [C, H, W, 1, 2, 2] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - // Blocking the channel dimension - const int block_c = 8; - - out->split(2, block_h); - out->split(4, block_w); - out->reorder({{3, 4}}); - // out: [K, C, Ho, Wo, Hi, Wi, 1, 2, 2] - - out->split(1, block_c); - // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 2, 2] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 2, 2] - // out_rf: [K, Ci, Ho, Wo, Hi, Wi] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] - inp_cache->setMemoryType(MemoryType::Shared); - - // Move Ci forward - out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); - inp_cache->computeAt(out_rf, 5); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options); - std::vector inputs = {at_inp, at_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1); - at_out = at_out.squeeze(0); // drop the N axis - // The shape of the spatial domain is (dim_h+1)x(dim_w+1), whereas - // the fuser output has dim_h*dim_w. Drop the first elements to make - // it match with the fuser output. 
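// (For stride-1 at::conv2d, H_out = H + pad_top + pad_bottom - K + 1, so the
// 2x2 kernel with symmetric padding of 1 yields dim_h + 1 by dim_w + 1, one
// larger per axis than the right-padded fuser output; hence the trimming
// below.)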
- std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(1, at::indexing::None), - at::indexing::Slice(1, at::indexing::None)}; - at_out = at_out.index(indices); - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -// POC implementation of im2col for 3-by-3 kernels -TEST(NVFuserTest, FusionIm2Col_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [N, C, H, W] - auto inp = makeSymbolicTensor(4); - fusion.addInput(inp); - - // Gather a neighbor tile of [3, 3] with padding size of 1 for each - // side of the spatial dimensions - auto inp_tile = gather(inp, {1, 1, 3, 3}, {{0, 0}, {0, 0}, {1, 1}, {1, 1}}); - // inp_tile: [N, C, H, W, 1, 1, 3, 3] - - auto inp_col = transpose(inp_tile, {{1, 3}, {2, 1}, {3, 2}}); - // inp_col: [N, H, W, C, 1, 1, 3, 3] - - fusion.addOutput(inp_col); - - //////////////////////////////////// - - // Cache the input tensor - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - - auto out = inp_col; - - out->split(1, block_h); - out->split(3, block_w); - out->reorder({{2, 3}}); - // out: [N, Ho, Wo, Hi, Wi, C, 1, 1, 3, 3] - // Move the C axis out of Hi*Wi - out->reorder({{5, 3}, {3, 4}, {4, 5}}); - // out: [N, Ho, Wo, C, Hi, Wi, 1, 1, 3, 3] - - // Create a [block_x, block_y] tile on smem - inp_cache->computeAt(out, 4); - inp_cache->setMemoryType(MemoryType::Shared); - // Fully inline inp_tile - inp_tile->computeAt(out, -1); - - out->axis(0)->parallelize(ParallelType::BIDz); - out->axis(1)->parallelize(ParallelType::BIDy); - out->axis(2)->parallelize(ParallelType::BIDx); - out->axis(4)->parallelize(ParallelType::TIDy); - out->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, inp_tile}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 31; - const int dim_w = 33; - const int dim_c = 5; - const int dim_n = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_n, dim_c, dim_h, dim_w}, options); - std::vector inputs = {at_inp}; - - auto cg_outputs = fe.runFusion(inputs); - - auto at_out = at::im2col(at_inp, {3, 3}, {1, 1}, {1, 1}, {1, 1}); - - // at::im2col outputs [N, C*3*3, N*H] - at_out = at::transpose(at_out, 1, 2); - at_out = at::reshape(at_out, {dim_n, dim_h, dim_w, dim_c, 1, 1, 3, 3}); - - testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftNoPadding1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - auto tv5 = sum(tv4, {0, 1}); - - fusion.addOutput(tv5); - - tv1->setMemoryType(MemoryType::Shared); - - tv5->split(0, 4); - tv5->split(-1, 8); - tv5->reorder({{1, 2}}); - - TransformPropagator::from(tv5); - - tv2->computeAt(tv5, -1); - tv3->computeAt(tv5, -1); - - tv5->axis(-1)->parallelize(ParallelType::TIDx); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector 
inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - t4 = t4.index(indices); - auto ref = t4.sum(at::ArrayRef{0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Split and merge -TEST(NVFuserTest, FusionShiftNoPadding2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - auto tv5 = sum(tv4, {0, 1}); - - fusion.addOutput(tv5); - - tv1->setMemoryType(MemoryType::Shared); - - tv5->split(0, 4); - tv5->split(-1, 8); - tv5->reorder({{1, 2}}); - tv5->merge(-2, -1); - - TransformPropagator::from(tv5); - - tv2->computeAt(tv5, -1); - tv3->computeAt(tv5, -1); - - tv5->axis(-1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - t4 = t4.index(indices); - auto ref = t4.sum(at::ArrayRef{0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Split and merge, then welford -TEST(NVFuserTest, FusionShiftNoPadding3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - auto tvs = Welford(tv4, {0, 1}); - auto tv_avg = tvs.avg; - auto tv_M2 = tvs.var_sum; - auto tv_N = tvs.n; - - fusion.addOutput(tv_avg); - fusion.addOutput(tv_M2); - fusion.addOutput(tv_N); - - tv1->setMemoryType(MemoryType::Shared); - - tv_avg->split(0, 4); - tv_avg->split(-1, 8); - tv_avg->reorder({{1, 2}}); - tv_avg->merge(-2, -1); - - TransformPropagator::from(tv_avg); - - tv2->computeAt(tv_avg, -1); - tv3->computeAt(tv_avg, -1); - - tv_avg->axis(-1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv_avg, ir_utils::allTvs(&fusion)); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - outputs[1] /= (numel_x - 2) * (numel_y - 2); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto t4 = t2 + t3; - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - t4 = t4.index(indices); - auto ref_avg = t4.mean(at::ArrayRef{0, 1}); - auto ref_M2 = t4.var(at::ArrayRef{0, 1}, false); - auto ref_N = at::ones({}, options_int) * (numel_x - 2) * (numel_y - 2); - - testValidate( - &fusion, outputs, 
inputs, {ref_avg, ref_M2, ref_N}, __LINE__, __FILE__); -} - -// Shift indexing and predication with contiguous merge -TEST(NVFuserTest, FusionShiftNoPaddingContigMerge_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, true); - auto tv3 = shift(tv1, {-1, 1}, false); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv2->merge(0); - tv3->merge(0); - tv4->merge(0); - - tv1->setMemoryType(MemoryType::Global); - tv2->setMemoryType(MemoryType::Global); - tv3->setMemoryType(MemoryType::Global); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)}; - - auto fuser_out = outputs[0].index(indices); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t1, {-1, 1}); - auto ref = t2 + t3; - - ref = ref.index(indices); - - testValidate(&fusion, {fuser_out}, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftNoPaddingChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = shift(tv2, {1, -1}, false); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - tv4->split(0, 4); - tv4->split(-1, 8); - tv4->reorder({{1, 2}}); - - tv1->computeAt(tv4, 2); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::TIDy); - - tv4->axis(0)->parallelize(ParallelType::BIDy); - tv4->axis(1)->parallelize(ParallelType::BIDx); - - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - int numel_x = 99; - int numel_y = 101; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = shift(t1, {1, -1}); - auto t3 = shift(t2, {1, -1}); - std::vector indices{ - at::indexing::Slice(2, at::indexing::None), at::indexing::Slice(0, -2)}; - t3 = t3.index(indices); - auto ref = t3.sum(at::ArrayRef{0, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Rfactor is not allowed with partial domains -TEST(NVFuserTest, FusionShiftNoPaddingRfactor_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1, -1}, false); - auto tv3 = sum(tv2, {0, 1}); - fusion.addOutput(tv3); - - tv3->split(0, 4); - tv3->split(-1, 8); - tv3->reorder({{1, 2}}); - - ASSERT_ANY_THROW(tv3->rFactor({-2})); -} - -TEST(NVFuserTest, FusionPartialSplit1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - // [I] - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(0)); - // [I] - auto tv2 = shift(tv1, {1}, false); - // [1:I] - auto tv3 = shift(tv1, {-1}, 
false); - // [0:I-1] - auto tv4 = add(tv2, tv3); - // [1:I-1] - fusion.addOutput(tv4); - - // Partial split of tv4. Split only the valid range, which is - // [1:-1]. - tv4->split(0, 8, true, true); - // [(I-2)/8, 8] - - // Propagates the partial split back to tv1. This means that all of - // the other tensors are also shaped as [(I-2)/8, 8], which appears - // to mean only the sub region of ((I-2)/8 * 8) is - // computed for tv1, tv2 and tv3. It's fine for the tv2 and tv3 - // tensors as only that sub region is used by tv4. It's also fine - // for tv1 since it has halo of size one at each side, so the whole - // region is actually calculated for tv1. - tv1->computeAt(tv4, 1); - - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-2)->parallelize(ParallelType::BIDx); - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3}); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // gridDim.x is ceilDiv(numel_x - 2, 8), not ceilDiv(numel_x, 8), - // so it's going to be just 2 rather than 3. - const int numel_x = 18; - - ExpressionEvaluator evaluator(&fusion); - auto root_extent = tv4->getRootDomain()[0]->extent(); - evaluator.bind(root_extent, numel_x); - auto extent_eval = evaluator.evaluate(tv4->axis(0)->extent()); - TORCH_CHECK( - extent_eval.has_value(), - "Invalid evaluation of outer domain extent of partial split"); - TORCH_CHECK( - extent_eval.value() == (numel_x - 2) / 8, - "Invalid extent of outer domain of partial split"); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{at::indexing::Slice(1, -1)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0, {1}) + shift(t0, {-1})).index(indices); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPartialSplit2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(0)); - auto tv2 = shift(tv1, {1}, false); - auto tv3 = shift(tv1, {-1}, false); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - auto tv5 = add(tv1, new Double(1)); - auto tv6 = add(tv5, new Double(1)); - fusion.addOutput(tv6); - - tv4->split(0, 4, true, true); - - // This causes tv5 and tv6 also to be split with the same partial - // offsets, however, since they need to be calculated entirely, the - // resulting code would be invalid. It should be detected as part of - // initial fusion validation during lowering. - tv1->computeAt(tv4, 1); - - // Validation should throw an error due to tv5 and tv6. 
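// A standalone sketch of the extent arithmetic checked earlier in
// FusionPartialSplit1: with a partial split, the outer domain tiles only the
// valid [1, I-1) range, so its extent is ceilDiv(I - 2, 8), not ceilDiv(I, 8).
constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}
static_assert(ceilDiv(18 - 2, 8) == 2, "partial split tiles only the interior");
static_assert(ceilDiv(18, 8) == 3, "a full split would launch one extra block");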
- ASSERT_ANY_THROW(fusion.printKernel()); -} - -// 2D version of PartialSplit1 -TEST(NVFuserTest, FusionPartialSplit3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(0)); - auto tv2 = shift(tv1, {1, 2}, false); - auto tv3 = shift(tv1, {-2, -1}, false); - auto tv4 = add(tv2, tv3); - fusion.addOutput(tv4); - - tv4->split(1, 8, true, true); - tv4->split(0, 4, true, true); - tv4->reorder({{1, 2}, {2, 1}}); - - tv1->computeAt(tv4, 2); - - tv4->axis(0)->parallelize(ParallelType::BIDy); - tv4->axis(1)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::TIDy); - tv4->axis(3)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3}); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int numel_x = 32 + 3; - const int numel_y = 32 + 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(1, -2), at::indexing::Slice(2, -1)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0, {1, 2}) + shift(t0, {-2, -1})).index(indices); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Almost same fusion with Shift5ptStencilChain but non-padded shift -// and partial split. -TEST(NVFuserTest, FusionPartialSplit4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - std::vector> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; - - // First stencil: 5pt stencil - // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5 - std::vector tv_stencil1_shifts; - for (const auto& offset : offsets) { - tv_stencil1_shifts.push_back(shift(tv0, offset, false)); - } - - auto tv_stencil1 = tv0; - for (auto tv : tv_stencil1_shifts) { - tv_stencil1 = add(tv_stencil1, tv); - } - - tv_stencil1 = div(tv_stencil1, new Double(tv_stencil1_shifts.size() + 1)); - - // Second stencil: Same 5pt stencil - std::vector tv_stencil2_shifts; - for (const auto& offset : offsets) { - tv_stencil2_shifts.push_back(shift(tv_stencil1, offset, false)); - } - - auto tv_stencil2 = tv_stencil1; - for (auto tv : tv_stencil2_shifts) { - tv_stencil2 = add(tv_stencil2, tv); - } - - tv_stencil2 = div(tv_stencil2, new Double(tv_stencil2_shifts.size() + 1)); - - auto tv_out = tv_stencil2; - - fusion.addOutput(tv_out); - - auto tv0_cache = tv0->cache_after(); - - std::vector split_factor({16, 16}); - - tv_out->split(-1, split_factor[1], true, true); - tv_out->split(0, split_factor[0], true, true); - tv_out->reorder({{1, 2}, {2, 1}}); - - tv0->computeAt(tv_out, 2); - - // Inline completely all inputs to the first stencil output, except for the - // tv0 cache - for (auto tv : tv_stencil1_shifts) { - tv->computeAt(tv_stencil1, -1); - } - - // Inline completely all inputs to the second stencil output, except - // for the first stencil output - for (auto tv : tv_stencil2_shifts) { - tv->computeAt(tv_stencil2, -1); - } - - tv_out->axis(0)->parallelize(ParallelType::BIDy); - tv_out->axis(1)->parallelize(ParallelType::BIDx); - tv_out->axis(2)->parallelize(ParallelType::TIDy); - tv_out->axis(3)->parallelize(ParallelType::TIDx); - - auto 
all_values = DependencyCheck::getAllValsBetween( - {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs()); - for (auto tv : ir_utils::filterByType(all_values)) { - scheduler_utils::parallelizeAllLike(tv_out, {tv}); - } - - tv0_cache->setMemoryType(MemoryType::Shared); - tv_stencil1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - // Input matrix size is 68x68, and the output is 64x64. Both - // gridDim.x and gridim.y should be ceilDiv(numel - 4, - // split_factor), which is 4. If full split is used, the grid - // dimension would be 5. - const int numel_x = 64 + 4; - const int numel_y = 64 + 4; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(2, -2), at::indexing::Slice(2, -2)}; - - outputs[0] = outputs[0].index(indices); - - auto stencil1 = t0; - for (const auto& offset : offsets) { - stencil1 = stencil1 + shift(t0, offset); - } - stencil1 = stencil1 / int(offsets.size() + 1); - auto stencil2 = stencil1; - for (const auto& offset : offsets) { - stencil2 = stencil2 + shift(stencil1, offset); - } - stencil2 = stencil2 / int(offsets.size() + 1); - auto ref = stencil2.index(indices); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPartialSplit5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int numel_x = 10; - const int numel_y = 11; - - // auto tv0 = makeSymbolicTensor(2); - auto tv0 = makeConcreteTensor({numel_x, numel_y}); - fusion.addInput(tv0); - - auto tv1 = shift(tv0, {0, 1}, false); - auto tv2 = add(tv1, new Double(1)); - - fusion.addOutput(tv2); - - // Partially split tv2 but not tv1. Producer indexing with tv2 as a consumer - // requires adjustment of the index to account for the difference of split - // offsets. 
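  // (Illustrative note, assuming the concrete sizes above) With numel_y = 11
  // and a split factor of 4, the partial split of tv2 only covers the valid
  // range [1, 11) produced by the non-padded shift, while the plain split of
  // tv1 covers the full [0, 11). The consumer loop therefore starts one
  // element later than the producer's, and producer indexing appears to have
  // to add that start offset back when reading tv1 through tv2's loop nest.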
- tv2->split(1, 4, true, true); - tv1->split(1, 4); - - tv1->computeAt(tv2, 1); - - tv2->axis(1)->parallelize(ParallelType::TIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); - - tv1->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(0, at::indexing::None), - at::indexing::Slice(1, at::indexing::None)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0, {0, 1}) + 1).index(indices); - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionPartialSplit6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const int numel_x = 9; - - auto tv0 = makeConcreteTensor({numel_x}); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {1}, false); - auto tv3 = add(tv2, new Double(1)); - - fusion.addOutput(tv3); - - // Another mix of partial and non-partial split - tv1->split(0, 4); - tv2->split(0, 4, true, true); - tv3->split(0, 4); - - // Just make it easier for compute-sanitizer to flag invalid memory accesses - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = {t0}; - auto outputs = fe.runFusion(inputs); - - std::vector indices{ - at::indexing::Slice(1, at::indexing::None)}; - - outputs[0] = outputs[0].index(indices); - - auto ref = (shift(t0 + 1, {1}) + 1).index(indices); - - testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionShiftUnswitch1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = shift(tv0, {-1, 0}); - fusion.addOutput(tv1); - - auto tv2 = shift(tv0, {0, 1}); - fusion.addOutput(tv2); - - auto tv3 = shift(tv0, {2, 2}); - fusion.addOutput(tv3); - - auto tv4 = shift(tv0, {-2, -2}); - fusion.addOutput(tv4); - - auto tv5 = add(tv0, new Double(1)); - auto tv6 = shift(tv5, {0, -1}); - fusion.addOutput(tv6); - - tv1->axis(1)->parallelize(ParallelType::Unswitch); - tv2->axis(1)->parallelize(ParallelType::Unswitch); - tv3->axis(0)->parallelize(ParallelType::Unswitch); - tv4->axis(0)->parallelize(ParallelType::Unswitch); - - tv5->axis(1)->parallelize(ParallelType::TIDx); - tv6->axis(1)->parallelize(ParallelType::TIDx); - tv5->axis(0)->parallelize(ParallelType::Unswitch); - tv5->setMemoryType(MemoryType::Shared); - - int numel_x = 9; - int numel_y = 11; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x, numel_y}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = shift(t0, {-1, 0}); - TORCH_CHECK(t1.equal(outputs[0])); - - auto t2 = shift(t0, {0, 1}); - TORCH_CHECK(t2.equal(outputs[1])); - - auto t3 = shift(t0, {2, 2}); - TORCH_CHECK(t3.equal(outputs[2])); - - auto t4 = shift(t0, {-2, -2}); - TORCH_CHECK(t4.equal(outputs[3])); - - auto t6 = shift(t0 + 1, {0, -1}); - TORCH_CHECK(t6.equal(outputs[4])); -} - -TEST(NVFuserTest, FusionGatherUnswitch1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto 
tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1_gather_param = new Int(); - fusion.addInput(tv1_gather_param); - auto tv1_gather_pad_param = new Int(); - fusion.addInput(tv1_gather_pad_param); - auto tv1 = gather( - tv0, {tv1_gather_param}, {{tv1_gather_pad_param, tv1_gather_pad_param}}); - fusion.addOutput(tv1); - - auto tv2_gather_param = new Int(); - fusion.addInput(tv2_gather_param); - auto tv2_gather_pad_param = new Int(); - fusion.addInput(tv2_gather_pad_param); - auto tv2 = gather( - tv0, {tv2_gather_param}, {{tv2_gather_pad_param, tv2_gather_pad_param}}); - fusion.addOutput(tv2); - - // Static gather - auto tv3 = gather(tv0, {3}, {{1, 1}}); - fusion.addOutput(tv3); - - // Static gather - auto tv4 = gather(tv0, {5}, {{2, 2}}); - fusion.addOutput(tv4); - - auto tv0_cache = tv0->cache_after(); - tv0_cache->setMemoryType(MemoryType::Shared); - - tv4->split(0, 32); - - tv0->computeAt(tv4, 1); - - tv4->axis(0)->parallelize(ParallelType::Unswitch); - tv4->axis(1)->parallelize(ParallelType::TIDx); - - const int numel_x = 100; - const int tv1_gather = 3; - const int tv1_gather_pad = 1; - const int tv2_gather = 5; - const int tv2_gather_pad = 2; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = { - t0, tv1_gather, tv1_gather_pad, tv2_gather, tv2_gather_pad}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = gather(t0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}}); - TORCH_CHECK(t1.equal(outputs[0])); - - auto t2 = gather(t0, {tv2_gather}, {{tv2_gather_pad, tv2_gather_pad}}); - TORCH_CHECK(t2.equal(outputs[1])); - - auto t3 = gather(t0, {3}, {{1, 1}}); - TORCH_CHECK(t3.equal(outputs[2])); - - auto t4 = gather(t0, {5}, {{2, 2}}); - TORCH_CHECK(t4.equal(outputs[3])); -} - -TEST(NVFuserTest, FusionGatherStrided1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - const std::vector strides = {1, 3}; - - auto tv1 = gather(tv0, window_shape, padding_width, strides); - - fusion.addOutput(tv1); - - const int s1 = 11; - const int s2 = 13; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - // tv1 has a stride dimension, so its number of dimensions should be - // input_ndims + window_ndims + stride. - TORCH_CHECK(tv1->nDims() == tv0->nDims() * 2 + 1); - - // However, the number of dimensions of the Aten tensor should still - // be just the twice of the number of dimensions of the input - // tensor. - auto fuser_out = outputs[0]; - TORCH_CHECK( - fuser_out.ndimension() == tv0->nDims() * 2, - "Invalid dimensionality of output tensor: ", - fuser_out.ndimension()); - - // Each output dimension should be: ceilDiv(input_size + padding_width - - // window, stride). - for (const auto i : c10::irange(window_shape.size())) { - auto valid_dim = ceilDiv( - t0.size(i) + padding_width[i][0] + padding_width[i][1] - - window_shape[i] + 1, - strides[i]); - auto actual_dim = outputs[0].size(i); - TORCH_CHECK( - valid_dim == actual_dim, - "Invalid output size at dimension ", - i, - ". 
Expected: ", - valid_dim, - ", actual: ", - actual_dim); - } - - auto ref = gather(t0, window_shape, padding_width, strides); - - TORCH_CHECK(ref.equal(outputs[0])); -} - -// Split strided domain -TEST(NVFuserTest, FusionGatherStrided2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - // Split the strided domain - tv3->split(0, 4); - - // Propagate the split by 4 of the tv3 domain to pre-stride domains, - // making them split by 4 * 3 - tv0->computeAt(tv3, 1); - - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Outer split -TEST(NVFuserTest, FusionGatherStrided3_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - fusion.addOutput(tv3); - - // Outer split - tv3->split(0, 2, false); - - tv0->computeAt(tv3, 1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - tv2->setMemoryType(MemoryType::Shared); - - const int s1 = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionGatherStrided4_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - // Test propagation of split from one gather output to another - auto tv2 = gather(tv1, window_shape, padding_width, strides); - auto tv3 = gather(tv1, window_shape, padding_width, strides); - - auto tv4 = sum(tv2, {-1}); - fusion.addOutput(tv4); - - auto tv5 = sum(tv3, {-1}); - fusion.addOutput(tv5); - - tv4->split(0, 2); - - // Test forward computeAt propagation from tv1 to tv3 - tv0->computeAt(tv4, 1); - - const int s1 = 101; - - auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref, ref}, __LINE__, __FILE__); -} - -// Same as GatherStrided1 but with stride != window -TEST(NVFuserTest, FusionGatherStrided5_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - const std::vector window_shape = {1, 3}; - const std::vector> padding_width = {{0, 0}, {1, 1}}; - - const std::vector strides = {1, 2}; - - auto tv1 = gather(tv0, window_shape, padding_width, strides); - - fusion.addOutput(tv1); - - const int s1 = 11; - const int s2 = 13; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1, s2}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); - - auto ref = gather(t0, window_shape, padding_width, strides); - - TORCH_CHECK(ref.equal(outputs[0])); -} - -// Same as GatherStrided2 but with stride != window -TEST(NVFuserTest, FusionGatherStrided6_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {2}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - // Split the strided domain - tv3->split(0, 4); - - // Propagate the split by 4 of the tv3 domain to pre-stride domains, - // making them split by 4 * 2 - tv0->computeAt(tv3, 1); - - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 100; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Same as GatherStrided4 but different strides -TEST(NVFuserTest, FusionGatherStrided7_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - // Use different strides - auto tv2 = gather(tv1, window_shape, padding_width, {3}); - auto tv3 = gather(tv1, window_shape, padding_width, {2}); - - auto tv4 = sum(tv2, {-1}); - fusion.addOutput(tv4); - - auto tv5 = sum(tv3, {-1}); - fusion.addOutput(tv5); - - tv4->split(0, 2); - - // Since tv3 has a different stride factor, this should fail. 
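  // (Hedged note, following the stride-propagation pattern spelled out in
  // GatherStrided2 and GatherStrided6 above) Propagating tv4's split by 2
  // back through the stride-3 gather would require tv1 to be split by
  // 2 * 3 = 6, while the stride-2 gather feeding tv3/tv5 would require a
  // split by 2 * 2 = 4. The two requirements cannot both hold, which is why
  // the computeAt below is expected to throw.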
- ASSERT_ANY_THROW(tv0->computeAt(tv4, 1)); -} - -// Same as GatherStrided2 but with unswitch -TEST(NVFuserTest, FusionGatherStrided8_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - - auto tv3 = sum(tv2, {-1}); - - fusion.addOutput(tv3); - - const int tidx = 32; - - // Split the strided domain - tv3->split(0, tidx); - - // Split for unswitch - tv3->split(0, 1); - - tv0->computeAt(tv3, 2); - - tv2->computeAt(tv3, -1); - - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::Unswitch); - tv3->axis(2)->parallelize(ParallelType::TIDx); - scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2}); - - tv1->setMemoryType(MemoryType::Shared); - - const int s1 = 1023; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({s1}, options); - std::vector inputs = {t0}; - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(inputs); - - auto t1 = t0 + 1; - auto t2 = gather(t1, window_shape, padding_width, strides); - auto ref = sum(t2, {-1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -// Chained strided gather. Not supported yet. -TEST(NVFuserTest, FusionGatherStridedChain_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - const std::vector window_shape = {3}; - const std::vector> padding_width = {{1, 1}}; - const std::vector strides = {3}; - // const std::vector strides = {1}; - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - - auto tv2 = gather(tv1, window_shape, padding_width, strides); - // Reduce gathered window - auto tv3 = sum(tv2, {-1}); - - // Repeat - auto tv4 = gather(tv3, window_shape, padding_width, strides); - auto tv5 = sum(tv4, {-1}); - auto out = tv5; - - fusion.addOutput(out); - - // This should throw an error at HaloInfo::build. 
- ASSERT_ANY_THROW(GpuLower gpulw(&fusion)); -} - -TEST(NVFuserTest, FusionMaxPoolingStrided_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: CHW - // Pooling window: 3x3 - // Strides: 3 - // Padding: 1 at each end of the inner 2 dimensions - - // [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // [C, H/3, W/3, 1, 3, 3] - auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3}); - - // [C, H/3, W/3] - auto max_tensor = reductionOp( - BinaryOpType::Max, - {-3, -2, -1}, - new Double(std::numeric_limits::lowest()), - inp_tile); - fusion.addOutput(max_tensor); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Tiling the spatial domain - const int tile_x = 32; - const int tile_y = 8; - - max_tensor->split(1, tile_y); - max_tensor->split(3, tile_x); - max_tensor->reorder({{2, 3}}); - // [C, H/tile_y, W/tile_x, tile_y, tile_x] - max_tensor->split(2, 1); - // [C, H/tile_y, W/tile_x, 1, tile_y, tile_x] - - inp->computeAt(max_tensor, 4); - - max_tensor->axis(0)->parallelize(ParallelType::BIDx); - max_tensor->axis(3)->parallelize(ParallelType::Unswitch); - max_tensor->axis(4)->parallelize(ParallelType::TIDy); - max_tensor->axis(5)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(max_tensor, ir_utils::allTvs(&fusion)); - - inp_cache->setMemoryType(MemoryType::Shared); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int hw = 50; - const int num_channels = 20; - const int pooling_window = 3; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options); - // We always pad inputs by zero, so if all surrounding values are - // negative, max pooling would pick a padded value, which isn't the - // correct behavior. We need to be able to choose the value of - // padding. In this case, padding by the minimum value would not - // have this problem. For now, avoid the problem by making sure all - // values are not negative. 
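  // (Illustrative sketch, not part of the original scheduling) Because the
  // fusion pads with zeros, the value it computes is a zero-padded windowed
  // max, roughly:
  //   auto padded = at::constant_pad_nd(aten_inp, {1, 1, 1, 1});
  //   auto windows = padded.unfold(1, 3, 3).unfold(2, 3, 3); // [C, ~H/3, ~W/3, 3, 3]
  //   auto zero_pad_max = windows.amax({-2, -1});
  // at::max_pool2d ignores its padding instead, so the two only agree when no
  // window's maximum would come from the padding, hence the abs() below.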
- aten_inp = at::abs(aten_inp); - std::vector inputs = {aten_inp}; - - auto outputs = fe.runFusion(inputs); - - auto ref = at::max_pool2d( - aten_inp, {pooling_window, pooling_window}, {3, 3}, {1, 1}); - - testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionConv2DStaticStrided_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } - Fusion fusion; - FusionGuard fg(&fusion); - - // Input: [C, H, W] - auto inp = makeSymbolicTensor(3); - fusion.addInput(inp); - - // Weights: [K, C, 3, 3] - auto w = makeSymbolicTensor(4); - fusion.addInput(w); - - // Gather a neighbor tile of [3, 3] with padding size of 1 for each - // side of the spatial dimensions - auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3}); - // inp_tile: [C, H/3, s3, W/3, s3, 1, 3, 3] - - auto inp_bc = - broadcast(inp_tile, {true, false, false, false, false, false, false}); - auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); - - auto inp_times_w = mul(inp_bc, w_bc); - - // Reduce the channel and neighbor tile dimensions - auto out = sum(inp_times_w, {1, 4, 5, 6}); - - fusion.addOutput(out); - - //////////////////////////////////// - - // Cache the input and weight tensors - auto inp_cache = inp->cache_after(); - - // Blocking the spatial dimensions - const int block_w = 16; - const int block_h = 4; - const int block_c = 2; - - // [K, C, H/s, W/s, 1, 3, 3] - out->split(2, block_h); - // [K, C, H/s/block_h, block_h, W/s, 1, 3, 3] - out->split(4, block_w); - // [K, C, H/s/block_h, block_h, W/s/block_w, block_w, 1, 3, 3] - out->reorder({{3, 4}}); - // [K, C, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3, 3] - out->split(1, block_c); - // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3, - // 3] - out->split(4, 1); - // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1, - // 3, 3] - - auto out_rf = out->rFactor({1, -3, -2, -1}); - // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1, - // 3, 3] - - // out: [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w] - - inp_cache->computeAt(out, 5); - inp_cache->setMemoryType(MemoryType::Shared); - // [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, C/block_c, 1, - // 3, 3] - - // Move C/block_c before block_h/2 and share the domain from - // inp_cache to out_rf - out_rf->reorder({{7, 5}, {5, 6}, {6, 7}}); - inp_cache->computeAt(out_rf, 6); - - inp_tile->computeAt(out_rf, -1); - w->computeAt(out_rf, -1); - - out->axis(0)->parallelize(ParallelType::BIDx); - out->axis(1)->parallelize(ParallelType::TIDz); - out->axis(4)->parallelize(ParallelType::Unswitch); - out->axis(5)->parallelize(ParallelType::TIDy); - out->axis(6)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - - FusionExecutor fe; - fe.compileFusion(&fusion); - - const int dim_h = 99; - const int dim_w = 101; - const int dim_c = 10; - const int dim_f = 20; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::manual_seed(0); - at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options); - std::vector inputs = {at_inp, at_w}; - - auto cg_outputs = fe.runFusion(inputs); - - at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 3, 1); - at_out = at_out.squeeze(0); // drop the N axis - - testValidate(&fusion, cg_outputs, inputs, {at_out}, 
__LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonDivisibleHalo1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - - auto tv1 = add(tv0, new Double(1)); - auto tv2 = shift(tv1, {-1}); - fusion.addOutput(tv2); - - // [I] - tv2->split(0, 8); - // [I/8, 8] - tv2->split(1, 3); - // [I/8, 3, 3] - - tv0->computeAt(tv2, -2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({24}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto ref = shift((t0 + 1), {-1}); - - testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); -} - -TEST(NVFuserTest, FusionNonDivisibleHalo2_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - - auto tv1 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}}); - auto tv2 = sum(tv1, {-2, -1}); - auto tv3 = add(tv0, tv2); - auto tv4 = sum(tv3, {0, 1}); - fusion.addOutput(tv4); - - const int gy = 50; - const int gx = 50; - const int by = 8; - const int bx = 16; - - auto tv5 = tv0->cache_after(); - - // [I, J] - tv4->split(0, gy); - // [I/gy, gy, J] - tv4->split(1, by); - // [I/gy, gy/by, by, J] - tv4->split(-1, gx); - // [I/gy, gy/by, by, J/gx, gx] - tv4->split(-1, bx); - // [I/gy, gy/by, by, J/gx, gx/bx, bx] - tv4->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}}); - // [I/gy, J/gx, gy/by, gx/bx, by, bx] - - auto tv6 = tv4->rFactor({2, 3}); - - tv0->computeAt(tv6, 4); - - tv4->axis(0)->parallelize(ParallelType::BIDy); - tv4->axis(1)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::TIDy); - tv4->axis(3)->parallelize(ParallelType::TIDx); - - scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3, tv5, tv6}); - - tv5->setMemoryType(MemoryType::Shared); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({111, 222}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}); - - auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}}); - auto t2 = t1.sum({-2, -1}); - auto t3 = t0 + t2; - auto t4 = t3.sum({-2, -1}); - - testValidate(&fusion, cg_outputs, {t0}, {t4}, __LINE__, __FILE__); -} - -} // namespace jit -} // namespace torch -#endif // #if defined(USE_CUDA) diff --git a/test/cpp/jit/test_graph_iterator.cpp b/test/cpp/jit/test_graph_iterator.cpp index 75edac875b19..00d1f9a6a28c 100644 --- a/test/cpp/jit/test_graph_iterator.cpp +++ b/test/cpp/jit/test_graph_iterator.cpp @@ -62,7 +62,7 @@ void assert_ordering( ASSERT_EQ(expected.size(), actual.size()) << "Got " << actual.size() << " elements (" << actual << ")" << " expected " << expected.size() << " elements (" << expected << ")"; - for (int i = 0; i < expected.size(); i++) { + for (unsigned i = 0; i < expected.size(); i++) { ASSERT_EQ(expected[i], actual[i]) << "Difference at index " << i << " in " << actual << " (expected " << actual << ")"; diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 0e40e48514d1..d01c611bbaec 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -571,20 +571,35 @@ namespace { void compareModelOutput( c10::ArrayRef actual_result_list, - const std::vector& expect_result_list) { + const std::vector& expect_result_list) { AT_ASSERT(actual_result_list.size() == expect_result_list.size()); - AT_ASSERT(actual_result_list[0].toTensor().equal(expect_result_list[0])); 
AT_ASSERT( - actual_result_list[1].toTensor().dim() == expect_result_list[1].dim()); - AT_ASSERT(actual_result_list[2].toTensor().equal(expect_result_list[2])); - AT_ASSERT(actual_result_list[3].toTensor().equal(expect_result_list[3])); + actual_result_list[0].toTensor().equal(expect_result_list[0].toTensor())); + AT_ASSERT( + actual_result_list[1].toTensor().dim() == + expect_result_list[1].toTensor().dim()); + AT_ASSERT( + actual_result_list[2].toTensor().equal(expect_result_list[2].toTensor())); + AT_ASSERT( + actual_result_list[3].toTensor().equal(expect_result_list[3].toTensor())); + ASSERT_EQ( + actual_result_list[4].toStringRef(), expect_result_list[4].toStringRef()); + ASSERT_EQ(actual_result_list[5].toBool(), expect_result_list[5].toBool()); + ASSERT_EQ(actual_result_list[6].toBool(), expect_result_list[6].toBool()); + ASSERT_EQ(actual_result_list[7].toBool(), expect_result_list[7].toBool()); + AT_ASSERT( + actual_result_list[8].toTensor().equal(expect_result_list[8].toTensor())); + ASSERT_EQ( + actual_result_list[9].toStringRef(), expect_result_list[9].toStringRef()); + ASSERT_EQ(actual_result_list[10].toInt(), expect_result_list[10].toInt()); + ASSERT_EQ(actual_result_list[11].toBool(), expect_result_list[11].toBool()); } void runAndCheckTorchScriptModel( std::stringstream& input_model_stream, const std::vector& input_data, - const std::vector& expect_result_list, - const int64_t expect_version) { + const std::vector& expect_result_list, + const uint64_t expect_version) { auto actual_version = _get_model_bytecode_version(input_model_stream); AT_ASSERT(actual_version == expect_version); @@ -600,8 +615,8 @@ void runAndCheckTorchScriptModel( void runAndCheckBytecodeModel( std::stringstream& input_model_stream, const std::vector& input_data, - const std::vector& expect_result_list, - const int64_t expect_version) { + const std::vector& expect_result_list, + const uint64_t expect_version) { auto actual_version = _get_model_bytecode_version(input_model_stream); AT_ASSERT(actual_version == expect_version); @@ -618,14 +633,15 @@ void runAndCheckBytecodeModel( void backportAllVersionCheck( std::stringstream& test_model_file_stream, std::vector& input_data, - std::vector& expect_result_list, - const int64_t expect_from_version) { + std::vector& expect_result_list, + const uint64_t expect_from_version) { auto from_version = _get_model_bytecode_version(test_model_file_stream); AT_ASSERT(from_version == expect_from_version); + AT_ASSERT(from_version > 0); // Backport script_module_v5.ptl to an older version constexpr int64_t minimum_to_version = 4; - int64_t current_to_version = from_version - 1; + auto current_to_version = from_version - 1; // Verify all candidate to_version work as expected. All backport to version // larger than minimum_to_version should success. 
@@ -641,12 +657,14 @@ void backportAllVersionCheck( // Check backport model version auto backport_version = _get_model_bytecode_version(oss); + backport_version = _get_model_bytecode_version(oss); AT_ASSERT(backport_version == current_to_version); // Load and run the backport model, then compare the result with expect // result runAndCheckBytecodeModel( oss, input_data, expect_result_list, current_to_version); + oss.seekg(0, oss.beg); runAndCheckTorchScriptModel( oss, input_data, expect_result_list, current_to_version); @@ -668,6 +686,9 @@ TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) module.register_parameter("bias", torch::ones({20}), false); module.define(R"( + def fn(self, x:float=1.0): + return x + def forward(self, input): x1 = torch.zeros(2, 2) x2 = torch.empty_like(torch.empty(2, 2)) @@ -677,21 +698,52 @@ TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) { x = 2 * torch.ones(1) h = torch.ones(1) torch.add(x, h, out=x) - return (x1, x2, x3, x) - )"); + device = torch.ones(1, 1).cpu().device.type + is_cuda = x1.is_cuda + bool_val = True + check_is = [] is None + check_is_not = [1] is not None + check_not = not bool_val + num_to_tensor = torch.tensor([self.fn()]) + d = {"a": "abc"} + check_dict_index = d["a"] + check_dim = x1.dim() + return ( + x1, x2, x3, x, device, is_cuda, check_is, + check_is_not, num_to_tensor, check_dict_index, + check_dim, check_not + ) + )"); torch::jit::Module module_freeze = freeze(module); std::stringstream input_model_stream; +#if defined(ENABLE_FLATBUFFER) + module_freeze._save_for_mobile( + input_model_stream, + /*extra_files=*/{}, + /*save_mobile_debug_info=*/false, + /*use_flatbuffer=*/true); +#else module_freeze._save_for_mobile(input_model_stream); +#endif std::vector input_data = std::vector({torch::ones({1, 1, 28, 28})}); - std::vector expect_result_list; + std::vector expect_result_list; expect_result_list.emplace_back(at::ones({2, 2}, ScalarType::Float) * 0); expect_result_list.emplace_back(at::ones({2, 2}, ScalarType::Float)); expect_result_list.emplace_back( at::ones({1, 20, 24, 24}, ScalarType::Float) * 26); expect_result_list.emplace_back(3 * at::ones({1})); + // "cpu" False, False, True, tensor(1), "abc", 2, False) + expect_result_list.emplace_back(c10::IValue("cpu")); + expect_result_list.emplace_back(c10::IValue(false)); + expect_result_list.emplace_back(c10::IValue(false)); + expect_result_list.emplace_back(c10::IValue(true)); + expect_result_list.emplace_back(c10::IValue(at::ones({1}))); + expect_result_list.emplace_back(c10::IValue("abc")); + expect_result_list.emplace_back(c10::IValue(2)); + expect_result_list.emplace_back(c10::IValue(false)); backportAllVersionCheck( input_model_stream, @@ -950,7 +1002,6 @@ TEST(LiteInterpreterTest, ExtraFiles) { module->_save_for_mobile(oss, extra_files); std::istringstream iss(oss.str()); - caffe2::serialize::IStreamAdapter adapter{&iss}; std::unordered_map loaded_extra_files; loaded_extra_files["metadata.json"] = ""; torch::jit::_load_for_mobile(iss, torch::kCPU, loaded_extra_files); @@ -965,7 +1016,7 @@ TEST(LiteInterpreterTest, ExtraFiles) { loaded_extra_files[file_name.substr(6)] = ""; } } - + iss.seekg(0, iss.beg); torch::jit::_load_for_mobile(iss, torch::kCPU, loaded_extra_files); ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); ASSERT_EQ(loaded_extra_files["mobile_info.json"], "{\"key\": 23}"); @@ -1145,7 +1196,6 @@ TEST(RunTimeTest, ParseOperator) { function.get()); parseOperators( 
std::move(*c10::ivalue::Tuple::create(operators)).elements(), - model_version, 1, function.get()); const size_t rsize = 5; @@ -1528,7 +1578,6 @@ TEST(RunTimeTest, RuntimeCall) { foo.get()); parseOperators( std::move(*c10::ivalue::Tuple::create(operatorsFoo)).elements(), - model_version, 1, foo.get()); parseConstants( @@ -1545,7 +1594,6 @@ TEST(RunTimeTest, RuntimeCall) { call.get()); parseOperators( std::move(*c10::ivalue::Tuple::create(operatorsCall)).elements(), - model_version, 1, call.get()); parseConstants( @@ -2043,16 +2091,14 @@ TEST(LiteInterpreterUpgraderTest, Upgrader) { std::vector upgrader_functions; for (auto& byteCodeFunctionWithOperator : getUpgraderBytecodeList()) { + byteCodeFunctionWithOperator.function.initialize_operators(true); ASSERT_EQ( byteCodeFunctionWithOperator.function.get_code().operators_.size(), byteCodeFunctionWithOperator.function.get_code().op_names_.size()); if (byteCodeFunctionWithOperator.function.get_code().operators_.empty()) { for (const auto& op : byteCodeFunctionWithOperator.operators) { byteCodeFunctionWithOperator.function.append_operator( - op.name, - op.overload_name, - op.num_specified_args, - caffe2::serialize::kMaxSupportedFileFormatVersion); + op.name, op.overload_name, op.num_specified_args); } } upgrader_functions.push_back(byteCodeFunctionWithOperator.function); diff --git a/test/cpp/jit/test_lite_trainer.cpp b/test/cpp/jit/test_lite_trainer.cpp index cf3040f4fba4..ede1c3a8355b 100644 --- a/test/cpp/jit/test_lite_trainer.cpp +++ b/test/cpp/jit/test_lite_trainer.cpp @@ -158,6 +158,139 @@ TEST(MobileTest, SaveLoadParametersEmpty) { AT_ASSERT(mobile_params.size() == 0); } +TEST(MobileTest, SaveParametersDefaultsToZip) { + // Save some empty parameters. + std::map empty_parameters; + std::stringstream ss_data; + _save_parameters(empty_parameters, ss_data); + + // Verify that parameters were serialized to a ZIP container. + EXPECT_GE(ss_data.str().size(), 4); + EXPECT_EQ(ss_data.str()[0], 'P'); + EXPECT_EQ(ss_data.str()[1], 'K'); + EXPECT_EQ(ss_data.str()[2], '\x03'); + EXPECT_EQ(ss_data.str()[3], '\x04'); +} + +#if defined(ENABLE_FLATBUFFER) +TEST(MobileTest, SaveParametersCanUseFlatbuffer) { + // Save some empty parameters using flatbuffer. + std::map empty_parameters; + std::stringstream ss_data; + _save_parameters(empty_parameters, ss_data, /*use_flatbuffer=*/true); + + // Verify that parameters were serialized to a flatbuffer. The flatbuffer + // magic bytes should be at offsets 4..7. The first four bytes contain an + // offset to the actual flatbuffer data. + EXPECT_GE(ss_data.str().size(), 8); + EXPECT_EQ(ss_data.str()[4], 'P'); + EXPECT_EQ(ss_data.str()[5], 'T'); + EXPECT_EQ(ss_data.str()[6], 'M'); + EXPECT_EQ(ss_data.str()[7], 'F'); +} +#else // !defined(ENABLE_FLATBUFFER) +TEST(MobileTest, SaveParametersThrowsWithoutFlatbufferSupport) { + // Some empty parameters to try saving. + std::map empty_parameters; + std::stringstream ss_data; + + // Save using flatbuffers should fail when support isn't compiled in. Make + // sure we get the exception that explicitly mentions the lack of flatbuffer + // support. 
+ try { + _save_parameters(empty_parameters, ss_data, /*use_flatbuffer=*/true); + FAIL() << "_save_parameters should have thrown"; + } catch (const ::c10::Error& e) { + static const std::string kExpectedSubstring = + "build hasn't enabled flatbuffer"; + EXPECT_TRUE( + std::string(e.msg()).find(kExpectedSubstring) != std::string::npos) + << "Exception message does not contain expected substring \"" + << kExpectedSubstring << "\": actual message \"" << e.msg() << "\""; + } catch (...) { + FAIL() << "Unexpected exception type"; + } +} +#endif // !defined(ENABLE_FLATBUFFER) + +#if defined(ENABLE_FLATBUFFER) +TEST(MobileTest, SaveLoadParametersUsingFlatbuffers) { + // Create some simple parameters to save. + std::map input_params; + input_params["four_by_ones"] = 4 * torch::ones({}); + input_params["three_by_ones"] = 3 * torch::ones({}); + + // Serialize them using flatbuffers. + std::stringstream data; + _save_parameters(input_params, data, /*use_flatbuffer=*/true); + + // The flatbuffer magic bytes should be at offsets 4..7. + EXPECT_EQ(data.str()[4], 'P'); + EXPECT_EQ(data.str()[5], 'T'); + EXPECT_EQ(data.str()[6], 'M'); + EXPECT_EQ(data.str()[7], 'F'); + + // Read them back and check that they survived the trip. + auto output_params = _load_parameters(data); + EXPECT_EQ(output_params.size(), 2); + { + auto four_by_ones = 4 * torch::ones({}); + EXPECT_EQ( + output_params["four_by_ones"].item(), four_by_ones.item()); + } + { + auto three_by_ones = 3 * torch::ones({}); + EXPECT_EQ( + output_params["three_by_ones"].item(), three_by_ones.item()); + } +} +#else // !defined(ENABLE_FLATBUFFER) +TEST(MobileTest, LoadParametersFailsWithoutFlatbufferSupport) { + // Create some data that looks like a flatbuffer header. + std::stringstream data; + data << "abcd" + << "PTMF" // Flatbuffer magic + << "ijkl"; + + // Loading the "flatbuffer" data should fail. Make sure we see the expected + // exception, not just any exception; since this isn't properly-formed + // flatbuffer data, any attempt to parse it might throw a different error type + // or message, but we don't expect anyone to try parsing it. + try { + _load_parameters(data); + FAIL() << "_load_parameters should have thrown"; + } catch (const ::c10::Error& e) { + static const std::string kExpectedSubstring = + "build hasn't enabled flatbuffer"; + EXPECT_TRUE( + std::string(e.msg()).find(kExpectedSubstring) != std::string::npos) + << "Exception message does not contain expected substring \"" + << kExpectedSubstring << "\": actual message \"" << e.msg() << "\""; + } catch (...) { + FAIL() << "Unexpected exception type"; + } +} +#endif // !defined(ENABLE_FLATBUFFER) + +TEST(MobileTest, LoadParametersUnexpectedFormatShouldThrow) { + // Manually create some data that doesn't look like a ZIP or Flatbuffer file. + // Make sure it's longer than 8 bytes, since getFileFormat() needs that much + // data to detect the type. + std::stringstream bad_data; + bad_data << "abcd" + << "efgh" + << "ijkl"; + + // Loading parameters from it should throw an exception. + EXPECT_ANY_THROW(_load_parameters(bad_data)); +} + +TEST(MobileTest, LoadParametersEmptyDataShouldThrow) { + // Loading parameters from an empty data stream should throw an exception. 
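  // (Hedged aside) Taken together, the checks above amount to a simple format
  // probe over the first 8 bytes of the stream, roughly:
  //   bool looks_like_zip = header.compare(0, 4, "PK\x03\x04") == 0;
  //   bool looks_like_flatbuffer = header.compare(4, 4, "PTMF") == 0;
  // Streams shorter than 8 bytes, or matching neither pattern, are rejected.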
+ std::stringstream empty; + EXPECT_ANY_THROW(_load_parameters(empty)); +} + TEST(LiteTrainerTest, SGD) { Module m("m"); m.register_parameter("foo", torch::ones({1}, at::requires_grad()), false); diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 099588d90d45..88d447fdf2d7 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -42,15 +43,18 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -1379,6 +1383,39 @@ TEST(ThreadLocalDebugInfoTest, Basic) { } } +TEST(TestSymIntArrayRef, BasicConversion) { + const size_t X = 2, Y = 4, Z = 5; + std::vector tgt_size_v{2, 4, 5}; + std::vector tgt_size({SymInt(X), SymInt(Y), SymInt(Z)}); + auto a = at::randn({1, 4, 1}, at::kCPU); + auto b = a.expand(tgt_size); + auto c = a.expand(tgt_size_v); + ASSERT_TRUE(torch::allclose(b, c)); +} + +TEST(TestSymInt, NarrowCopyWithSymbolicInt) { + static const size_t LENGTH = 5; + auto a = at::randn({10}, at::kCPU); + c10::SymInt si(LENGTH); + auto b = a.narrow_copy(0, 0, si); + auto c = a.narrow(0, 0, LENGTH); + ASSERT_TRUE(torch::allclose(b, c)); +} + +TEST(TestSymInt, NarrowCopy) { + static const size_t LENGTH = 5; + auto a = at::randn({10}, at::kCPU); + auto b = a.narrow_copy(0, 0, LENGTH); + auto c = a.narrow(0, 0, LENGTH); + ASSERT_TRUE(torch::allclose(b, c)); +} + +TEST(TestSymInt, AddSymbolicInt) { + c10::SymInt a(5); + c10::SymInt b(3); + ASSERT_TRUE((a + b).expect_int() == 8); +} + TEST(FallbackGraphsTest, Basic) { static const auto nestGraphIntoFallbackGraph = [](const std::shared_ptr& graph) { @@ -2867,6 +2904,33 @@ graph(%x.1 : Tensor): testing::FileCheck().check_not("aten::relu_")->run(*graph); } +TEST(TestRegisterShapeOp, Basic) { + auto graph = std::make_shared(); + std::unordered_map vmap; + parseIR( + R"IR( +graph(): + %2 : int = prim::Constant[value=5]() + %3: int[] = prim::ListConstruct(%2, %2) + return (%3))IR", + &*graph, + vmap); + + auto g2 = std::make_shared(); + parseIR( + R"IR( +graph(): + %2 : Tensor = prim::MakeTestTensor() + return (%2))IR", + &*g2, + vmap); + + const FunctionSchema& schema = g2->nodes().begin()->schema(); + torch::jit::RegisterShapeComputeGraphForSchema(schema, graph); + PropagateShapesOnGraph(g2); + testing::FileCheck().check("5, 5")->run(*g2); +} + TEST(TestFunctionalToInplaceActivation, Basic) { auto graph = std::make_shared(); std::unordered_map vmap; @@ -2884,6 +2948,70 @@ graph(%x.1 : Tensor): testing::FileCheck().check_not("aten::relu(")->run(*graph); } +TEST(TestFunctionExecutor, SimpleExecutorTest) { + auto graph = std::make_shared(); + parseIR( + R"IR( +graph(%x.1 : Tensor): + %2 : int = prim::Constant[value=1]() + %x.3 : Tensor = aten::add(%x.1, %2, %2) + %y : Tensor = aten::relu(%x.3) + return (%y))IR", + &*graph); + { + auto func = torch::make_unique( + "name", graph, [](GraphFunction&) {}, ExecutorExecutionMode::PROFILING); + auto a = at::rand({2, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + Stack stack = {a}; + func->run(stack); + auto g = lastExecutedOptimizedGraph(); + testing::FileCheck() + .check("prim::profile") + ->check("aten::add") + ->check("aten::relu") + ->run(*g); + } + { + auto func = torch::make_unique( + "name", graph, [](GraphFunction&) {}, ExecutorExecutionMode::SIMPLE); + auto a = at::rand({2, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + Stack stack = {a}; + func->run(stack); + auto g = 
func->getDebugState().graph; + testing::FileCheck() + .check_not("prim::profile") + ->check("aten::add") + ->check("aten::relu") + ->run(*g); + } +} + +TEST(TestFunctionExecutor, RunDecompositionTest) { + static auto* func = torch::jit::GetDecompositionExecutor( + "aten::var(Tensor self, bool unbiased=True) -> Tensor"); + for (bool unbiased : {true, false}) { + auto input = at::rand({4, 4}); + Stack stack = {input, unbiased}; + func->run(stack); + at::Tensor out = pop(stack).toTensor(); + ASSERT_TRUE(at::allclose(out, input.var(unbiased))); + } +} + +TEST(TestShapeGraphLinting, Basic) { + auto schemas = RegisteredShapeComputeSchemas(); + for (const auto& schema : schemas) { + // arange does not acually support complex, leave as + // union[int, float] for now + if (schema->name() == "aten::arange") { + continue; + } + auto g = shapeComputeGraphForSchema(*schema); + TORCH_INTERNAL_ASSERT(g); + LintShapeComputeGraph(schema, *g); + } +} + // TODO: move to test_kernel when global settings are explicit // fusion parameters class Composed : public ::testing::Test { diff --git a/test/cpp/jit/test_save_load.cpp b/test/cpp/jit/test_save_load.cpp index 88bff7ea93e8..6ecf67917ec0 100644 --- a/test/cpp/jit/test_save_load.cpp +++ b/test/cpp/jit/test_save_load.cpp @@ -3,7 +3,9 @@ #include #include +#include #include +#include #include #include #include @@ -13,6 +15,32 @@ namespace torch { namespace jit { +namespace { + +Module roundtripThroughMobile(const Module& m) { + ExtraFilesMap files; + std::vector constants; + jitModuleToPythonCodeAndConstants(m, &files, &constants); + CompilationOptions options; + mobile::Module mobilem = jitModuleToMobile(m, options); + return jitModuleFromSourceAndConstants( + mobilem._ivalue(), files, constants, 8); +} + +template +inline void expectThrowsEq(Functor&& functor, const char* expectedMessage) { + try { + std::forward(functor)(); + } catch (const Error& e) { + EXPECT_STREQ(e.what_without_backtrace(), expectedMessage); + return; + } + ADD_FAILURE() << "Expected to throw exception with message \"" + << expectedMessage << "\" but didn't throw"; +} + +} // namespace + TEST(SerializationTest, ExtraFilesHookPreference) { // Tests that an extra file written explicitly has precedence over // extra files written by a hook @@ -149,5 +177,87 @@ TEST(SerializationTest, TestJitStream_CUDA) { // Check if both the output tensors are equal ASSERT_TRUE(op.equal(c)); } + +TEST(TestSourceRoundTrip, UpsampleNearest2d) { + Module m("m"); + m.define(R"( + def forward(self, input: Tensor, scale:float): + return torch.upsample_nearest2d(input, [1, 1], float(scale), float(scale)) + )"); + + std::vector inputs; + inputs.emplace_back(torch::rand({1, 3, 128, 128})); + inputs.emplace_back(at::Scalar(2.0)); + auto ref = m.forward(inputs); + + Module m2 = roundtripThroughMobile(m); + auto res = m2.forward(inputs); + + auto resd = res.toTensor(); + auto refd = ref.toTensor(); + ASSERT_TRUE(resd.equal(refd)); +} + +TEST(TestSourceRoundTrip, CheckAttrAccess) { + Module m("m"); + m.register_attribute("mobile_optimized", BoolType::get(), true); + Module m2 = roundtripThroughMobile(m); + bool mobile_optimized = m2.attr("mobile_optimized", false).toBool(); + AT_ASSERT(mobile_optimized); +} + +TEST(TestSourceRoundTrip, + MethodInvocation) { // NOLINT (use =delete in gtest) + const std::vector test_programs{ + // test invoking a method with default parameter + R"( + def test_func(self, x, b : int = 4): + return self.foo + x + b + )", + // inner method call with default parameter (gets inlined) + R"( + 
def add_with_default_arg(self, x, b : int = 4): + return self.foo + x + b + def test_func(self, x): + return self.add_with_default_arg(x) # invoke method w/ default arg + )", + // simple method call + R"( + def test_func(self, x): + b = 4 + return self.foo + x + b + )", + }; + for (const auto& test_program : test_programs) { + Module m("m"); + m.register_parameter("foo", torch::ones({}), false); + m.define(test_program); + + const int fortyTwo = 42; // (keep linter happy) + auto minput = fortyTwo * torch::ones({}); + auto ref = m.run_method("test_func", minput); + + Module m2 = roundtripThroughMobile(m); + const auto& test_func = m2.get_method("test_func"); + IValue res; + for (int i = 0; i < 3; ++i) { + res = test_func({minput}); + } + + auto resd = res.toTensor().item(); + auto refd = ref.toTensor().item(); + AT_ASSERT(resd == refd); + } +} + +TEST(SerializationTest, ParentDirNotExist) { + expectThrowsEq( + []() { + auto t = torch::nn::Linear(5, 5); + torch::save(t, "./doesnotexist/file.pt"); + }, + "Parent directory ./doesnotexist does not exist."); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_shape_analysis.cpp b/test/cpp/jit/test_shape_analysis.cpp index baf9f16e6e79..15f41da22952 100644 --- a/test/cpp/jit/test_shape_analysis.cpp +++ b/test/cpp/jit/test_shape_analysis.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -30,7 +32,6 @@ Node* findNode(std::shared_ptr& g, Symbol k) { } TORCH_INTERNAL_ASSERT(false, "Couldn't find node"); } - } // namespace TEST(ShapeAnalysisTest, DynamicShapesFusion) { @@ -169,7 +170,7 @@ TEST(ShapeAnalysisTest, DynamicShapesFusion) { /* Test guard behaves correctly at runtime and symbolic shapes are computed - correctly. As we don't have have TE Kernel support for dynamic shapes we're + correctly. 
As we don't have TE Kernel support for dynamic shapes we're going to return all of the computed runtime symbolic dimensions as outputs of the graph on guard success, and return None on guard failure */ @@ -292,5 +293,191 @@ TEST(ShapeAnalysisTest, MovingConstantOutOfFusionGroups) { ->run(*g); } +namespace { + +// NOLINTNEXTLINE(bugprone-easily-swappable-parameters) +void assertShapeEqual(c10::SymbolicShape& a, c10::SymbolicShape& e) { + auto a_canonical = CanonicalizedSymbolicShape(a); + auto e_canonical = CanonicalizedSymbolicShape(e); + EXPECT_EQ(a_canonical, e_canonical); +} + +void assertShapeEqual( + c10::optional>& actual, + std::vector> expected) { + ASSERT_TRUE(actual.has_value()); + ASSERT_EQ(actual->size(), 1); + + auto symb_expected = c10::SymbolicShape(expected); + assertShapeEqual(actual->at(0), symb_expected); +} + +const FunctionSchema* getSchema(const char* name) { + return &(getOperatorForLiteral(name)->schema()); +} +} // namespace + +TEST(ShapeAnalysisTest, SymbolicShapeAPI) { + // Figure out how to fetch a function schema + + // Ask someone else how to create a function schema / operator in C++ + auto schema = getSchema( + "aten::sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"); + + c10::IValue const_size_1 = std::vector{64, 56, 56}; + c10::IValue const_size_2 = std::vector{1, 56, 56}; + + // Check vector initializer list syntax + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss_concrete = + std::vector>{1, 56, 56}; + c10::SymbolicShape ss1 = std::vector>{sym_dim, 56, 56}; + c10::SymbolicShape ss2 = + std::vector>{64, sym_dim, sym_dim}; + c10::SymbolicShape ss3 = + std::vector>{sym_dim, sym_dim, sym_dim, sym_dim}; + + auto res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_1, const_size_1}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_1, const_size_2}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_1, ss1}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{const_size_2, ss1}); + assertShapeEqual(res, {sym_dim, 56, 56}); + + res = calculateSymbolicShapesOnOp( + schema, std::vector{ss_concrete, ss2}); + assertShapeEqual(res, {64, 56, 56}); + + res = calculateSymbolicShapesOnOp(schema, std::vector{ss2, ss3}); + assertShapeEqual(res, {sym_dim, 64, sym_dim, sym_dim}); +} + +TEST(ShapeAnalysisTest, SymbolicShapeCaching) { + clear_shape_cache(); + auto schema = getSchema("aten::mm(Tensor self, Tensor mat2) -> Tensor"); + + c10::IValue const_size_1 = std::vector{64, 56}; + c10::IValue const_size_2 = std::vector{64, 56}; + c10::IValue const_size_3 = std::vector{64, 20}; + + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss1 = c10::SymbolicShape({sym_dim, 64}); + c10::SymbolicShape ss2 = c10::SymbolicShape({sym_dim, 64}); + c10::SymbolicShape ss3 = c10::SymbolicShape({sym_dim, sym_dim}); + + auto res = calculateSymbolicShapesOnOp(schema, {ss1, const_size_1}); + assertShapeEqual(res, {sym_dim, 56}); + auto res1_val = res->at(0); + + // The exact same arguments should return the exact same result + res = calculateSymbolicShapesOnOp(schema, {ss1, const_size_1}); + auto res2_val = res->at(0); + EXPECT_EQ(res1_val, res2_val); + EXPECT_EQ(get_shape_cache_size(), 1); + + // Same shape but different symbols should return same shape + // but different symbolic indicies + res = calculateSymbolicShapesOnOp(schema, {ss2, const_size_2}); + auto 
res3_val = res->at(0); + + assertShapeEqual(res3_val, res2_val); + EXPECT_NE(res3_val, res2_val); + EXPECT_EQ(get_shape_cache_size(), 1); + + // Different concrete shape should be cached separately + res = calculateSymbolicShapesOnOp(schema, {ss1, const_size_3}); + assertShapeEqual(res, {sym_dim, 20}); + EXPECT_EQ(get_shape_cache_size(), 2); + + res = calculateSymbolicShapesOnOp(schema, {ss3, const_size_3}); + assertShapeEqual(res, {sym_dim, 20}); + EXPECT_EQ(get_shape_cache_size(), 3); + + res = calculateSymbolicShapesOnOp(schema, {ss3, ss3}); + assertShapeEqual(res, {sym_dim, sym_dim}); + EXPECT_EQ(get_shape_cache_size(), 4); +} + +TEST(ShapeAnalysisTest, ShapeCacheMultipleFns) { + clear_shape_cache(); + + auto squeeze_op = + getSchema("aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)"); + auto mul_tensor = + getSchema("aten::mul.Tensor(Tensor self, Tensor other) -> Tensor"); + auto mul_scalar = + getSchema("aten::mul.Scalar(Tensor self, Scalar other) -> Tensor"); + auto div_tensor = + getSchema("aten::div.Tensor(Tensor self, Tensor other) -> Tensor"); + auto matmul = getSchema("aten::mm(Tensor self, Tensor mat2) -> Tensor"); + + c10::IValue const_int = 1; + + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss1 = c10::SymbolicShape({sym_dim, 64}); + + auto res = calculateSymbolicShapesOnOp(squeeze_op, {ss1, const_int}); + assertShapeEqual(res, {sym_dim, 64}); + + // Show that cache can handle multiple functions + res = calculateSymbolicShapesOnOp(mul_scalar, {ss1, const_int}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 2); + + res = calculateSymbolicShapesOnOp(mul_tensor, {ss1, ss1}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 3); + + // Even when the expected outcome is the same, should not collide + res = calculateSymbolicShapesOnOp(div_tensor, {ss1, ss1}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 4); + + // Don't lose cached objects + res = calculateSymbolicShapesOnOp(mul_scalar, {ss1, const_int}); + assertShapeEqual(res, {sym_dim, 64}); + EXPECT_EQ(get_shape_cache_size(), 4); + + res = calculateSymbolicShapesOnOp(matmul, {ss1, ss1}); + // SSA can infer that sym_dim is 64 as both tensors + // use the same sym_dim + assertShapeEqual(res, {64, 64}); + EXPECT_EQ(get_shape_cache_size(), 5); +} + +TEST(ShapeAnalysisTest, TestShapeMultipleReturns) { + clear_shape_cache(); + + auto max_dim_op = getSchema( + "aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)"); + c10::IValue const_int = 1; + c10::IValue false_ival = false; + + c10::optional sym_dim = c10::nullopt; + c10::SymbolicShape ss1 = c10::SymbolicShape({sym_dim, 64}); + c10::SymbolicShape ss2 = c10::SymbolicShape({sym_dim, 64}); + + auto res = + calculateSymbolicShapesOnOp(max_dim_op, {ss1, const_int, false_ival}); + c10::SymbolicShape expected_res = c10::SymbolicShape({sym_dim}); + assertShapeEqual(res->at(0), expected_res); + // res0 and res1 should share the same symbolic symbol + EXPECT_EQ(res->at(0), res->at(1)); + + // Also test that the shape cache also returns consistent result shapes + res = calculateSymbolicShapesOnOp(max_dim_op, {ss2, const_int, false_ival}); + assertShapeEqual(res->at(0), expected_res); + EXPECT_EQ(res->at(0), res->at(1)); + EXPECT_EQ(get_shape_cache_size(), 1); +} } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h index 1a1e1b82b10e..a1b1a76c851c 100644 --- a/test/cpp/jit/test_utils.h +++ 
b/test/cpp/jit/test_utils.h @@ -17,37 +17,33 @@ static inline void trim(std::string& s) { [](unsigned char ch) { return !std::isspace(ch); }) .base(), s.end()); - for (int64_t i = 0; i < s.size(); ++i) { - if (s[i] == '\n') { + for (size_t i = 0; i < s.size(); ++i) { + while (i < s.size() && s[i] == '\n') { s.erase(i, 1); - i--; } } - for (int64_t i = 0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { if (s[i] == ' ') { - for (int64_t j = i + 1; j < s.size(); j++) { - if (s[j] == ' ') { - s.erase(j, 1); - j--; - } else { - break; - } + while (i + 1 < s.size() && s[i + 1] == ' ') { + s.erase(i + 1, 1); } } } } } // namespace -#define ASSERT_THROWS_WITH_MESSAGE(statement, substring) \ - try { \ - (void)statement; \ - FAIL(); \ - } catch (const std::exception& e) { \ - std::string substring_s(substring); \ - trim(substring_s); \ - auto exception_string = std::string(e.what()); \ - trim(exception_string); \ - ASSERT_NE(exception_string.find(substring_s), std::string::npos); \ +#define ASSERT_THROWS_WITH_MESSAGE(statement, substring) \ + try { \ + (void)statement; \ + FAIL(); \ + } catch (const std::exception& e) { \ + std::string substring_s(substring); \ + trim(substring_s); \ + auto exception_string = std::string(e.what()); \ + trim(exception_string); \ + ASSERT_NE(exception_string.find(substring_s), std::string::npos) \ + << " Error was: \n" \ + << exception_string; \ } namespace torch { diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl index be67cecf9705..ddee6be4c35a 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff new file mode 100644 index 000000000000..4f62dbfbeb80 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_float_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl index e5663224ac76..cb36f9aeba8b 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff new file mode 100644 index 000000000000..01891bc9e4a9 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_float_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl index 8698001427a9..443074fe7130 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff new file mode 100644 index 000000000000..f932d478d0ab Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_inplace_int_v2.ptl.ff differ diff --git 
a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl index c52d92b29f44..ac8b1b918de7 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff new file mode 100644 index 000000000000..d20ba9bf4820 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_int_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl index 749614fa5309..323aa42dde4e 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff new file mode 100644 index 000000000000..7299062135c9 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_float_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl index b20c456058be..6d06dea6b589 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff new file mode 100644 index 000000000000..700a0e5bae11 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_reciprocal_int_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl index f33f3a8cf8de..4fd551d073ae 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff new file mode 100644 index 000000000000..0b1200312851 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_scalar_scalar_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl index ac7cc7479e79..9680713a83e2 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff new file mode 100644 index 000000000000..ce5daf444635 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_inplace_v2.ptl.ff differ diff --git 
a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl index 0b70614b0936..0381636677b5 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff new file mode 100644 index 000000000000..46b57c83fe78 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_out_v2.ptl.ff differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl index 5f6ae1a90b1e..21792d35b892 100644 Binary files a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl differ diff --git a/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff new file mode 100644 index 000000000000..963070db5149 Binary files /dev/null and b/test/cpp/jit/upgrader_models/test_versioned_div_tensor_v2.ptl.ff differ diff --git a/test/cpp/lazy/CMakeLists.txt b/test/cpp/lazy/CMakeLists.txt index ede4308816cf..4d98400323fb 100644 --- a/test/cpp/lazy/CMakeLists.txt +++ b/test/cpp/lazy/CMakeLists.txt @@ -9,9 +9,16 @@ set(LAZY_TEST_SRCS ${LAZY_TEST_ROOT}/test_misc.cpp ${LAZY_TEST_ROOT}/test_permutation_util.cpp ${LAZY_TEST_ROOT}/test_shape.cpp - ${LAZY_TEST_ROOT}/test_tensor_impl.cpp + ${LAZY_TEST_ROOT}/test_symbolic_shape.cpp + ${LAZY_TEST_ROOT}/test_trie_cache.cpp ${LAZY_TEST_ROOT}/test_util.cpp ) +if(BUILD_LAZY_TS_BACKEND) + list(APPEND LAZY_TEST_SRCS + ${LAZY_TEST_ROOT}/test_lazy_ops.cpp + ${LAZY_TEST_ROOT}/test_lazy_ops_util.cpp + ) +endif() add_executable(test_lazy ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/cpp/lazy/test_backend_device.cpp b/test/cpp/lazy/test_backend_device.cpp index b75f0512d387..f8ce49b9e287 100644 --- a/test/cpp/lazy/test_backend_device.cpp +++ b/test/cpp/lazy/test_backend_device.cpp @@ -74,9 +74,13 @@ TEST(BackendDeviceTest, FromAten) { auto device = c10::Device(c10::kCPU); EXPECT_THROW(atenDeviceToBackendDevice(device), c10::Error); - // TODO(alanwaketan): Update the following test once we have TorchScript backend upstreamed. device = c10::Device(c10::kLazy); +#ifndef FBCODE_CAFFE2 + auto backend_device = atenDeviceToBackendDevice(device); +#else + // Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. 
sizes) in TensorImpl EXPECT_THROW(atenDeviceToBackendDevice(device), c10::Error); +#endif // FBCODE_CAFFE2 } TEST(BackendDeviceTest, ToAten) { diff --git a/test/cpp/lazy/test_cache.cpp b/test/cpp/lazy/test_cache.cpp index 033b6c21b1e7..ddbf6611d36a 100644 --- a/test/cpp/lazy/test_cache.cpp +++ b/test/cpp/lazy/test_cache.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace torch { namespace lazy { @@ -11,7 +13,8 @@ namespace lazy { class CacheNode : public Node { public: explicit CacheNode(const std::string& str) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(str)), + : Node(OpKind(), /* num_outputs */ 1), + hash_(Hash(str)), str_(str) {} ~CacheNode() override = default; @@ -23,7 +26,10 @@ class CacheNode : public Node { TORCH_INTERNAL_ASSERT(false, "Can't access operand[i] of test node"); } + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } private: + hash_t hash_; std::string str_; }; @@ -33,30 +39,57 @@ TEST(CacheTest, BasicTest) { std::shared_ptr c = std::make_shared("c"); Cache cache(2); - cache.Add(a->node_hash(), a); - EXPECT_EQ(cache.Get(a->node_hash()), a); - EXPECT_EQ(cache.Get(b->node_hash()), nullptr); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); + cache.Add(a->hash(), a); + EXPECT_EQ(cache.Get(a->hash()), a); + EXPECT_EQ(cache.Get(b->hash()), nullptr); + EXPECT_EQ(cache.Get(c->hash()), nullptr); - cache.Add(b->node_hash(), b); - EXPECT_EQ(cache.Get(a->node_hash()), a); - EXPECT_EQ(cache.Get(b->node_hash()), b); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); + cache.Add(b->hash(), b); + EXPECT_EQ(cache.Get(a->hash()), a); + EXPECT_EQ(cache.Get(b->hash()), b); + EXPECT_EQ(cache.Get(c->hash()), nullptr); - cache.Add(c->node_hash(), c); - EXPECT_EQ(cache.Get(a->node_hash()), nullptr); // a has been evicted - EXPECT_EQ(cache.Get(b->node_hash()), b); - EXPECT_EQ(cache.Get(c->node_hash()), c); + cache.Add(c->hash(), c); + EXPECT_EQ(cache.Get(a->hash()), nullptr); // a has been evicted + EXPECT_EQ(cache.Get(b->hash()), b); + EXPECT_EQ(cache.Get(c->hash()), c); - cache.Erase(c->node_hash()); - EXPECT_EQ(cache.Get(a->node_hash()), nullptr); - EXPECT_EQ(cache.Get(b->node_hash()), b); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); // c has been removed + cache.Erase(c->hash()); + EXPECT_EQ(cache.Get(a->hash()), nullptr); + EXPECT_EQ(cache.Get(b->hash()), b); + EXPECT_EQ(cache.Get(c->hash()), nullptr); // c has been removed cache.Clear(); - EXPECT_EQ(cache.Get(a->node_hash()), nullptr); - EXPECT_EQ(cache.Get(b->node_hash()), nullptr); - EXPECT_EQ(cache.Get(c->node_hash()), nullptr); + EXPECT_EQ(cache.Get(a->hash()), nullptr); + EXPECT_EQ(cache.Get(b->hash()), nullptr); + EXPECT_EQ(cache.Get(c->hash()), nullptr); +} + +class CacheNodeWithShape : public TsNode { + public: + explicit CacheNodeWithShape(const Shape& shape) + : TsNode(OpKind(), shape, /* num_outputs */ 1, /* seed */ 0){} +}; + +TEST(CacheTest, ShapeCacheTestForDynamicShape) { + // enable dynamic shape + FLAGS_ltc_enable_dynamic_shapes = true; + + CacheNodeWithShape nodes[] = { + CacheNodeWithShape(Shape(c10::kFloat, {2, 4})), + CacheNodeWithShape(Shape(c10::kFloat, {4, 2})) }; + + /* + * Make sure the cached shape for node (2, 4) is not used for node (4, 2) + */ + for (auto& node : nodes) { + EXPECT_EQ(node.shape(), node.computeShape([&]() { + return node.shape(); + })); + } + + // reset the flag + FLAGS_ltc_enable_dynamic_shapes = false; } } // namespace lazy diff --git a/test/cpp/lazy/test_ir.cpp b/test/cpp/lazy/test_ir.cpp index 
78b94618c7fd..1ce666164a64 100644 --- a/test/cpp/lazy/test_ir.cpp +++ b/test/cpp/lazy/test_ir.cpp @@ -1,18 +1,29 @@ #include +#include #include #include #include +#include +#include #include #include +#include +#include +#include namespace torch { namespace lazy { class TestLeafNode : public Node { public: + static OpKind ClassOpKind() { + return OpKind(); + } + explicit TestLeafNode(size_t param) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(param)), + : Node(ClassOpKind(), /* num_outputs */ 1), + hash_(Hash(param)), param_(param) {} ~TestLeafNode() override = default; @@ -24,7 +35,10 @@ class TestLeafNode : public Node { TORCH_INTERNAL_ASSERT(false, "Can't access operand[i] of leaf node"); } + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } private: + hash_t hash_; size_t param_; }; @@ -35,7 +49,7 @@ TEST(IrTest, BasicTest) { EXPECT_EQ(node1->num_outputs(), 1); - const TestLeafNode* leafptr = NodeCast(node1.get(), OpKind()); + const TestLeafNode* leafptr = NodeCast(node1.get()); EXPECT_TRUE(leafptr != nullptr); } @@ -51,22 +65,22 @@ TEST(IrTest, MetaDataTest) { node = MakeNode(1); auto metaWithEmptyDebug = node->metadata(); EXPECT_EQ(metaWithEmptyDebug.scope.size(), 0); - EXPECT_EQ(metaWithEmptyDebug.frame_info.size(), 0); + EXPECT_EQ(metaWithEmptyDebug.frame_info.size(), 1); { ScopePusher scope("TestScope"); node = MakeNode(1); auto metaWithScope = node->metadata(); EXPECT_EQ(metaWithScope.scope, "TestScope.1"); - EXPECT_EQ(metaWithScope.frame_info.size(), 0); + EXPECT_EQ(metaWithScope.frame_info.size(), 1); } SourceLocation dummySourceLocation; dummySourceLocation.file = "file"; dummySourceLocation.function = "function"; dummySourceLocation.line = 10; - RegisterGetFrameInfo( - [&]() -> std::vector { return {dummySourceLocation}; }); + GetPythonFramesFunction() = + [&]() -> std::vector { return {dummySourceLocation}; }; node = MakeNode(1); auto metaWithSourceLoc = node->metadata(); EXPECT_EQ(metaWithSourceLoc.scope.size(), 0); @@ -77,7 +91,7 @@ TEST(IrTest, MetaDataTest) { FLAGS_torch_lazy_ir_debug = restore_FLAGS_torch_lazy_ir_debug; } -TEST(IrTest, TsNode) { +TEST(IrTest, TsNodeTest) { NodePtr node1 = MakeNode( OpKind(at::aten::view), Shape(), @@ -92,9 +106,32 @@ TEST(IrTest, TsNode) { EXPECT_EQ(node1->num_outputs(), 1); - const TsNode* leafptr = NodeCast(node1.get(), OpKind(at::aten::view)); + const TsNode* leafptr = dynamic_cast(node1.get()); EXPECT_TRUE(leafptr != nullptr); } +TEST(IrTest, DimensionNodeTest) { + + const size_t DIM0 = 5; + const size_t DIM1 = 8; + NodePtr node1 = MakeNode( + OpKind(at::aten::view), + Shape(c10::kFloat, {DIM0, DIM1}), + /*num_outputs*/ 1, + /*hash_seed*/ kHashSeed); + + auto size0 = std::dynamic_pointer_cast(MakeNode(Value{node1}, 0)); + auto size1 = std::dynamic_pointer_cast(MakeNode(Value{node1}, 1)); + + ASSERT_EQ(DIM0, size0->getStaticValue()); + ASSERT_EQ(DIM1, size1->getStaticValue()); + + auto add_dim = std::dynamic_pointer_cast(MakeNode(Value{size0}, Value{size1})); + ASSERT_EQ(DIM0 + DIM1, add_dim->getStaticValue()); + + auto mul_dim = std::dynamic_pointer_cast(MakeNode(Value{size0}, Value{size1})); + ASSERT_EQ(DIM0 * DIM1, mul_dim->getStaticValue()); +} + } // namespace lazy } // namespace torch diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp index 5c216258f9ac..ad951956db7d 100644 --- a/test/cpp/lazy/test_ir_util.cpp +++ b/test/cpp/lazy/test_ir_util.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -12,7 +13,7 @@ 
namespace lazy { class IrUtilNode : public Node { public: explicit IrUtilNode() - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(0)) {} + : Node(OpKind(), /* num_outputs */ 1), hash_(Hash(0)) {} ~IrUtilNode() override = default; void AddOperand(Value v) { @@ -23,17 +24,10 @@ class IrUtilNode : public Node { operands_.push_back(std::move(v.node)); } - const std::vector& operands() const override { - return operands_as_outputs_; - } - - const Output& operand(size_t i) const override { - return operands_as_outputs_.at(i); - } - + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } private: - std::vector operands_; - std::vector operands_as_outputs_; + hash_t hash_; }; /* a diff --git a/test/cpp/lazy/test_lazy_ops.cpp b/test/cpp/lazy/test_lazy_ops.cpp new file mode 100644 index 000000000000..f12d357760e6 --- /dev/null +++ b/test/cpp/lazy/test_lazy_ops.cpp @@ -0,0 +1,10773 @@ +#include +#include +#include "c10/core/DeviceType.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace lazy { + + +// Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. sizes) in TensorImpl +#ifndef FBCODE_CAFFE2 + +namespace { + // This registers the torchscript backend, without which lazy device won't work +static bool inline init_backend(){ + torch::lazy::InitTorchScriptBackend(); + return true; +} +static const bool backend_initialized = init_backend(); + +} + +class LazyTsTest : public ::testing::Test { + protected: + void SetUp() override; + + void TearDown() override; + + static void CommonSetup() {} + + void ExpectCounterNotChanged( + const std::string& counter_regex, + const std::unordered_set* ignore_set) {} + + void ExpectCounterChanged(const std::string& counter_regex, + const std::unordered_set* ignore_set) { + } + + void ResetCounters() {} + + private: + void MakeEndSnapshot() {} +}; + +class LazyOpsTestBase : public LazyTsTest { + protected: + static void SetUpTestCase() {} +}; + +void LazyTsTest::SetUp() { + (void)backend_initialized; // avoid unused parameter warning + at::manual_seed(42); + torch::lazy::LazyGraphExecutor::Get()->SetRngSeed(torch::lazy::BackendDevice(), 42); +} + +void LazyTsTest::TearDown() {} + +namespace { +using torch::lazy::DebugUtil; + +class LazyOpsTest : public LazyOpsTestBase {}; + +static inline bool IsCuda() { + return torch::lazy::getBackend()->EagerFallbackDeviceType() == at::kCUDA; +} + +static inline at::DeviceType DefaultDevice() { + return torch::lazy::getBackend()->EagerFallbackDeviceType(); +} + + +} // namespace + +TEST(LazyDynamicOpsTest, NarrowCopy) { + auto x = torch::rand({5, 10, 10}).to(kLazy); + const size_t Y_DIM = 3; + const size_t X_DIM_INDEX = 2; + auto y = torch::rand({Y_DIM}).to(kLazy); + auto ly = torch::lazy::TryGetLtcTensor(y); + auto dim_node = MakeNode(ly->GetIrValue(), 0); + auto lmn = std::make_shared(dim_node); + auto z = x.narrow_copy(X_DIM_INDEX, 0, lmn->toSymInt()); + AllClose(z.cpu(), x.cpu().narrow_copy(X_DIM_INDEX, 0, Y_DIM)); +} + +TEST_F(LazyOpsTest, TestScalarTensor) { + torch::Tensor scalar_tensor = torch::scalar_tensor( + 1., torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_scalar_tensor = torch::scalar_tensor( + 1., torch::TensorOptions(torch::kFloat).device(torch::kLazy)); + AllClose(scalar_tensor, lazy_scalar_tensor); + }); +} + +TEST_F(LazyOpsTest, TestClone) { + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.clone(); + AllClose(a, lazy_b); + lazy_a.add_(1.0); + AllClose(a, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestTo) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestIsFloatingPoint) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + bool is_float = torch::is_floating_point(a); + bool lazy_is_float = torch::is_floating_point(lazy_a); + EXPECT_EQ(is_float, lazy_is_float); + }); +} + +TEST_F(LazyOpsTest, TestIsSigned) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + bool is_signed = torch::is_signed(a); + bool lazy_is_signed = torch::is_signed(lazy_a); + EXPECT_EQ(is_signed, lazy_is_signed); + }); +} + +TEST_F(LazyOpsTest, TestCastByte) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Byte(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Byte(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastChar) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Char(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Char(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastShort) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Short(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Short(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastInt) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Int(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Int(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastLong) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Long(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Long(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCastFloat) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::_cast_Float(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor 
lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::_cast_Float(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestRetainType) { + torch::Tensor lazy_a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kByte).device(torch::kLazy)); + torch::Tensor lazy_b = torch::ones( + {2, 2}, torch::TensorOptions(torch::kByte).device(torch::kLazy)); + torch::Tensor lazy_c = lazy_a + lazy_b; + EXPECT_EQ(lazy_c.scalar_type(), torch::ScalarType::Byte); +} + +TEST_F(LazyOpsTest, TestLogicalTypeWithInterop) { + torch::Tensor query = + torch::rand({2, 12, 20, 64}, + torch::TensorOptions(torch::kFloat).device(torch::kLazy)); + torch::Tensor key = + torch::rand({2, 12, 64, 20}, + torch::TensorOptions(torch::kFloat).device(torch::kLazy)); + torch::Tensor scores = + torch::matmul(query, key) / + torch::scalar_tensor( + 8, torch::TensorOptions(torch::kDouble).device(torch::kLazy)); + torch::Tensor p_attn = torch::softmax(scores, /*dim=*/-1); + EXPECT_EQ(p_attn.scalar_type(), torch::ScalarType::Float); +} + +TEST_F(LazyOpsTest, TestAdd) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddHalf) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kHalf).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kHalf).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddMixedPrecision) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kHalf).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor c = a.add_(b); + torch::Tensor lazy_c = lazy_a.add_(lazy_b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddScalar) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b(1); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::add(lazy_a, b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, 
TestAddScalarInPlace) { + torch::Scalar b(1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.add_(b); + torch::Tensor lazy_c = lazy_a.add_(b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestAddZeroSizeDim) { + torch::Tensor a = torch::rand( + {0, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {1, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::add(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSub) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::sub(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::sub(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSubInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor c = a.sub_(b); + torch::Tensor lazy_c = lazy_a.sub_(lazy_b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSubScalar) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b(1); + torch::Tensor c = torch::sub(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::sub(lazy_a, b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestSubScalarInPlace) { + torch::Scalar b(1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.sub_(b); + torch::Tensor lazy_c = lazy_a.sub_(b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMul) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::mul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::mul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMulInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::rand( + {2, 
2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor c = a.mul_(b); + torch::Tensor lazy_c = lazy_a.mul_(lazy_b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMulScalar) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b(3); + torch::Tensor c = torch::mul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::mul(lazy_a, b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMulScalarInPlace) { + torch::Scalar b(3); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.mul_(b); + torch::Tensor lazy_c = lazy_a.mul_(b); + AllClose(a, lazy_a); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestDiv) { + for (torch::ScalarType scalar_type1 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(0, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + torch::Tensor c = torch::div(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::div(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivWithRoundingMode) { + c10::optional rounding_modes[] = {"trunc", "floor", + c10::nullopt}; + for (const auto& rounding_mode : rounding_modes) { + for (torch::ScalarType scalar_type1 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + int lower_bound = (scalar_type1 == torch::kByte) ? 0 : -100; + torch::Tensor a = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(lower_bound, 50, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, + torch::kInt, torch::kLong}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(51, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + torch::Tensor c = torch::div(a, b, rounding_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::div(lazy_a, lazy_b, rounding_mode); + AllClose(c, lazy_c); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestDivInPlace) { + for (torch::ScalarType scalar_type1 : {torch::kFloat}) { + torch::Tensor a = + isFloatingType(scalar_type1) + ? 
torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(0, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : {torch::kFloat}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.div_(b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.div_(lazy_b); + ; + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivInPlaceWithRoundingMode) { + c10::optional rounding_modes[] = {"trunc", "floor", + c10::nullopt}; + for (const auto& rounding_mode : rounding_modes) { + for (torch::ScalarType scalar_type1 : {torch::kFloat}) { + torch::Tensor a = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(-100, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : {torch::kFloat}) { + torch::Tensor b = + isFloatingType(scalar_type2) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.div_(b, rounding_mode); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.div_(lazy_b, rounding_mode); + AllClose(c, lazy_c); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestDivScalar) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 1, 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool is_float : {true, false}) { + torch::Scalar b = is_float ? torch::Scalar(3.0) : torch::Scalar(3); + torch::Tensor c = torch::div(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = torch::div(lazy_a, b); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivScalarInPlace) { + for (torch::ScalarType scalar_type : {torch::kFloat}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 1, 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool is_float : {true, false}) { + torch::Scalar b = is_float ? 
torch::Scalar(3.0) : torch::Scalar(3); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.div_(b); + torch::Tensor lazy_c = lazy_a.div_(b); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDivOut) { + for (torch::ScalarType scalar_type : {torch::kFloat, torch::kDouble}) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 4}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 4}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::div_out(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::empty({3, 4}, lazy_b.options()); + torch::div_out(lazy_c, lazy_a, lazy_b); + AllClose(c, lazy_c); + }); + } +} + +TEST_F(LazyOpsTest, TestRsubScalar) { + torch::Tensor input = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(1.5); + torch::Scalar alpha(2.5); + torch::Tensor result = torch::rsub(input, other, alpha); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::rsub(lazy_input, other, alpha); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestNe) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::ne(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::ne(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestNeInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = a.clone(); + b[0] += 1; + a.ne_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.ne_(lazy_b); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestEq) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + torch::Tensor c = torch::eq(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::eq(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEqInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + torch::Tensor a_copy = a.clone(); + a.eq_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.eq_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestGe) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + torch::Tensor c = torch::ge(a, b); + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::ge(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestGeInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.ge_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.ge_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestLe) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + torch::Tensor c = torch::le(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::le(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestLeInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.le_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.le_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestGt) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::add(a.clone(), torch::ones_like(a)); + torch::Tensor c = torch::gt(b, a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::gt(lazy_b, lazy_a); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestGtInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.gt_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.gt_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestLt) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::add(a.clone(), torch::ones_like(a)); + torch::Tensor c = torch::lt(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::lt(lazy_a, lazy_b); + AllEqual(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestLtInplace) { + torch::Tensor a = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.clone(); + b[0] += 1; + b[1] -= 1; + torch::Tensor a_copy = a.clone(); + a.lt_(b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + lazy_a.lt_(lazy_b); + AllClose(lazy_a, a); + }); +} + +TEST_F(LazyOpsTest, TestNeScalar) { + torch::Tensor input = torch::ones({2, 3}); + 
torch::Scalar other(float(0)); + torch::Tensor result = torch::ne(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::ne(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEqScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1)); + torch::Tensor result = torch::eq(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::eq(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestGeScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1)); + torch::Tensor result = torch::ge(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::ge(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestGeScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.ge_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.ge_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestLeScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1)); + torch::Tensor result = torch::le(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::le(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestLeScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.le_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.le_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestGtScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(0.5)); + torch::Tensor result = torch::gt(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::gt(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestGtScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.gt_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.gt_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestLtScalar) { + torch::Tensor input = torch::ones({2, 3}); + torch::Scalar other(float(1.5)); + torch::Tensor result = torch::lt(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::lt(lazy_input, other); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, 
TestLtScalarInplace) { + torch::Tensor input = torch::arange( + -1., 1.5, 0.5, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar other(float(0)); + torch::Tensor input_copy = input.clone(); + input.lt_(other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + lazy_input.lt_(other); + AllClose(lazy_input, input); + }); +} + +TEST_F(LazyOpsTest, TestIntegerAdd) { + std::vector types( + {torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong}); + + ForEachDevice([&](const torch::Device& device) { + for (auto type : types) { + torch::Tensor a = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Tensor b = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Scalar one = + isIntegralType(type) ? torch::Scalar(1) : torch::Scalar(1.0); + torch::Tensor c = torch::add(b, one); + + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::add(lazy_b, one); + + AllEqual(c, lazy_c); + } + }); +} + +TEST_F(LazyOpsTest, TestSVD) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (auto n : dims) { + torch::Tensor a = torch::rand( + {m, n}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto b = torch::svd(a, /*some=*/true, /*compute_uv=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::svd(lazy_a, /*some=*/true, /*compute_uv=*/true); + // The U and V matrices might have different sign for column vectors, so + // cannot be compared if not by absolute value. + AllClose(std::get<0>(b).abs(), std::get<0>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + torch::Tensor diag = std::get<1>(b); + torch::Tensor lazy_diag = std::get<1>(lazy_b); + ASSERT_EQ(diag.sizes(), lazy_diag.sizes()); + AllClose(diag, lazy_diag, /*rtol=*/1e-3, + /*atol=*/1e-4); + AllClose(std::get<2>(b).abs(), std::get<2>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + }); + } + } +} + +TEST_F(LazyOpsTest, TestQR) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (auto n : dims) { + torch::Tensor a = torch::rand( + {m, n}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto b = torch::qr(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::qr(lazy_a); + AllClose(std::get<0>(b).abs(), std::get<0>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + AllClose(std::get<1>(b).abs(), std::get<1>(lazy_b).abs(), /*rtol=*/1e-3, + /*atol=*/1e-4); + }); + } + } +} + +TEST_F(LazyOpsTest, TestSymEig) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (bool eigenvectors : {true, false}) { + for (bool upper : {true, false}) { + torch::Tensor a = torch::rand( + {m, m}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor sym_a = a.mm(a.t()); + auto b = torch::symeig(sym_a, eigenvectors, upper); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(sym_a, device); + auto lazy_b = torch::symeig(lazy_a, eigenvectors, upper); + AllClose(std::get<0>(b), std::get<0>(lazy_b), /*rtol=*/3e-2, + /*atol=*/1e-2); + if (eigenvectors) { + AllClose(std::get<1>(b).abs(), std::get<1>(lazy_b).abs(), + /*rtol=*/3e-2, + /*atol=*/1e-2); + } else { + EXPECT_EQ(std::get<1>(b).sizes(), std::get<1>(lazy_b).sizes()); + } + }); + } + } + } +} + +TEST_F(LazyOpsTest, 
TestCholesky) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + for (bool upper : {true, false}) { + torch::Tensor a = torch::rand( + {3, m, m}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor pd_a = + torch::matmul(a, torch::transpose(a, 1, 2)) + + torch::eye( + m, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto b = torch::cholesky(pd_a, upper); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(pd_a, device); + auto lazy_b = torch::cholesky(lazy_a, upper); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } + } +} + +TEST_F(LazyOpsTest, TestLogDet) { + static const int dims[] = {4, 7}; + for (auto m : dims) { + torch::Tensor a = torch::rand( + {3, m, m}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor pd_a = + torch::matmul(a, torch::transpose(a, 1, 2)) + + torch::eye(m, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::logdet(pd_a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(pd_a, device); + torch::Tensor lazy_b = torch::logdet(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + +TEST_F(LazyOpsTest, TestTriangularSolve) { + static const int dims[] = {4, 7}; + for (bool batched_a : {true, false}) { + for (bool batched_b : {true, false}) { + for (auto m : dims) { + for (auto n : dims) { + for (bool upper : {true, false}) { + for (bool transpose : {true, false}) { + for (bool unitriangular : {true, false}) { + torch::Tensor a = + torch::randn({m, m}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + torch::Tensor b = + torch::randn({m, n}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + a = batched_a ? a.expand({3, m, m}).clone() : a; + b = batched_b ? 
b.expand({3, m, n}).clone() : b; + auto result = torch::triangular_solve( + b, a, /*upper=*/upper, /*transpose=*/transpose, + /*unitriangular=*/unitriangular); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + auto lazy_result = torch::triangular_solve( + lazy_b, lazy_a, /*upper=*/upper, /*transpose=*/transpose, + /*unitriangular=*/unitriangular); + AllClose(std::get<0>(result), std::get<0>(lazy_result), + /*rtol=*/1e-3, /*atol=*/1e-4); + AllClose(std::get<1>(result), std::get<1>(lazy_result), + /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } + } + } + } + } + } + } +} + +TEST_F(LazyOpsTest, TestKthValue) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool keepdim : {false, true}) { + auto b = torch::kthvalue(a, k, dim, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::kthvalue(lazy_a, k, dim, keepdim); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllEqual(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestTopK) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool largest : {false, true}) { + auto b = torch::topk(a, k, dim, largest, /*sorted=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::topk(lazy_a, k, dim, largest, /*sorted=*/true); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllEqual(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestSort) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + for (int dim = 0; dim < 3; ++dim) { + for (bool descending : {false, true}) { + auto b = torch::sort(a, dim, descending); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::sort(lazy_a, dim, descending); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllEqual(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestSortDescWithMinValue) { + std::vector values{-128, 100}; + torch::Tensor input = + torch::tensor(values, torch::TensorOptions(torch::kChar)); + auto output = torch::sort(input, /*dim=*/0, /*descending=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_output = torch::sort(lazy_input, /*dim=*/0, /*descending=*/true); + AllEqual(std::get<0>(output), std::get<0>(lazy_output)); + AllEqual(std::get<1>(output), std::get<1>(lazy_output)); + }); +} + +TEST_F(LazyOpsTest, TestArgSort) { + torch::Tensor a = torch::rand( + {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int k = 1; k <= 3; ++k) { + for (int dim = 0; dim < 3; ++dim) { + for (bool descending : {false, true}) { + torch::Tensor b = torch::argsort(a, dim, descending); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argsort(lazy_a, dim, descending); + AllEqual(b, lazy_b); + }); 
+ } + } + } +} + +TEST_F(LazyOpsTest, TestMin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::min(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::min(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMax) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::max(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::max(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestUnaryMin) { + torch::Tensor input = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::min(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::min(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestUnaryMax) { + torch::Tensor input = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::max(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAll) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::all(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::all(lazy_a); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAllDim) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::all(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::all(lazy_a, dim, /*keepdim=*/false); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAllDimKeep) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::all(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::all(lazy_a, dim, /*keepdim=*/true); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAmax) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (bool keepdim : {false, true}) { + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor values = torch::amax(input, {dim}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = + torch::amax(lazy_input, {dim}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + for (int dim1 = -rank; dim1 < rank; ++dim1) { + for (int dim2 = -rank; dim2 < rank; ++dim2) { + if ((dim1 == dim2) || (dim1 == rank + dim2) || (dim2 == rank + dim1)) + continue; + torch::Tensor values = + torch::amax(input, {dim1, dim2}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = + torch::amax(lazy_input, {dim1, dim2}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + } + } + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("xla::amax", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestAmin) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (bool keepdim : {false, true}) { + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor values = torch::amin(input, {dim}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = + torch::amin(lazy_input, {dim}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + for (int dim1 = -rank; dim1 < rank; ++dim1) { + for (int dim2 = -rank; dim2 < rank; ++dim2) { + if ((dim1 == dim2) || (dim1 == rank + dim2) || (dim2 == rank + dim1)) + continue; + torch::Tensor values = + torch::amin(input, {dim1, dim2}, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_values = 
+ torch::amin(lazy_input, {dim1, dim2}, /*keepdim=*/keepdim); + AllClose(values, lazy_values); + }); + } + } + } + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("xla::amin", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestAny) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::any(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::any(lazy_a); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAnyDim) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::any(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::any(lazy_a, dim, /*keepdim=*/false); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAnyDimKeep) { + torch::Tensor a = torch::randint( + 0, 5, {2, 3, 4}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::any(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::any(lazy_a, dim, /*keepdim=*/true); + EqualValues(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestMean) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::mean(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a); + ASSERT_EQ(b.sizes(), lazy_b.sizes()); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestMeanCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::mean(a, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, torch::kDouble); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestMeanInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::mean(a, {dim}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, {dim}); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestMeanInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::mean(a, dims); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, dims); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, 
TestMeanInDimsKeepCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::mean(a, dims, true, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::mean(lazy_a, dims, true, torch::kDouble); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestMeanInDimOut) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::empty( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::mean_out(b, a, {dim}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::empty({4, 4}, lazy_a.options()); + torch::mean_out(lazy_b, lazy_a, {dim}); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestStd) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto unbiased : {true, false}) { + torch::Tensor b = torch::std(a, unbiased); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::std(lazy_a, unbiased); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestStdInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (auto unbiased : {true, false}) { + for (auto keepdim : {true, false}) { + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::std(a, {dim}, unbiased, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::std(lazy_a, {dim}, unbiased, keepdim); + AllClose(b, lazy_b); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestStdWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // int rank = a.dim(); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& correction : corrections) { + for (auto keepdim : {true, false}) { + for (const auto& dim : + std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::std(a, dim, correction, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::std(lazy_a, dim, correction, keepdim); + AllClose(b, lazy_b); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestStdMeanWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // int rank = a.dim(); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& correction : corrections) { + for (auto keepdim : {true, false}) { + for (const auto& dim : + std::vector>{{0, 1}, {-3, -2}}) { + auto b = torch::std_mean(a, dim, correction, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::std_mean(lazy_a, dim, correction, keepdim); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllClose(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestSum) { + torch::Tensor a = torch::rand( + {4, 3, 4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sum(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSumCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sum(a, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, torch::kDouble); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSumU8) { + torch::Tensor a = torch::ones( + {256}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + torch::Tensor b = torch::sum(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSumInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::sum(a, {dim}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, {dim}); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestSumInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::sum(a, dims); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, dims); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestSumInDimsKeep) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::sum(a, dims, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sum(lazy_a, dims, /*keepdim=*/true); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestSumInDimsKeepCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + torch::Tensor b = torch::sum(a, dims, /*keepdim=*/true, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::sum(lazy_a, dims, /*keepdim=*/true, torch::kDouble); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestVar) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (bool unbiased : {true, false}) { + torch::Tensor b = torch::var(a, unbiased); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::var(lazy_a, unbiased); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestVarWithDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{0, 1}, {-3, -2}}) { + for (bool keepDim : {true, false}) { + for (bool unbiased : 
{true, false}) { + torch::Tensor b = torch::var(a, dims, unbiased, keepDim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::var(lazy_a, dims, unbiased, keepDim); + AllClose(b, lazy_b); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestVarWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { + for (bool keepDim : {true, false}) { + for (const auto& correction : corrections) { + torch::Tensor b = torch::var(a, dim, correction, keepDim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::var(lazy_a, dim, correction, keepDim); + AllClose(b, lazy_b); + }); + } + } + } + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::var", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestVarMeanWithCorrection) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + c10::optional corrections[] = {1, 2, c10::nullopt}; + for (const auto& dim : std::vector>{{0, 1}, {-3, -2}}) { + for (const auto& correction : corrections) { + for (auto keepdim : {true, false}) { + auto b = torch::var_mean(a, dim, correction, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_b = torch::var_mean(lazy_a, dim, correction, keepdim); + AllClose(std::get<0>(b), std::get<0>(lazy_b)); + AllClose(std::get<1>(b), std::get<1>(lazy_b)); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxInDim) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool keepdim : {false, true}) { + auto values_indices = torch::max(input, dim, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_values_indices = + torch::max(lazy_input, dim, /*keepdim=*/keepdim); + AllClose(std::get<0>(values_indices), std::get<0>(lazy_values_indices)); + AllEqual(std::get<1>(values_indices), std::get<1>(lazy_values_indices)); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMinInDim) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (bool keepdim : {false, true}) { + auto values_indices = torch::min(input, dim, /*keepdim=*/keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_values_indices = + torch::min(lazy_input, dim, /*keepdim=*/keepdim); + AllClose(std::get<0>(values_indices), std::get<0>(lazy_values_indices)); + AllEqual(std::get<1>(values_indices), std::get<1>(lazy_values_indices)); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNorm) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::norm(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestNormInDim) { + torch::Tensor a = torch::rand( + 
{4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::norm(a, 2, {dim}, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 2, {dim}, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestNormInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{1, 2}, {-2, -1}}) { + torch::Tensor b = torch::norm(a, 2, dims, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 2, dims, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestNormInDimsKeep) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{1, 2}, {-2, -1}}) { + torch::Tensor b = torch::norm(a, 2, dims, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 2, dims, /*keepdim=*/true); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestNormalTwoTensor) { + at::Tensor mean = at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + at::Tensor std = at::ones({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_mean = CopyToDevice(mean, device); + at::Tensor lazy_std = CopyToDevice(std, device); + at::Tensor lazy_normal = at::normal(lazy_mean, lazy_std); + double res_mean = lazy_normal.mean().item().toDouble(); + double res_std = lazy_normal.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestNormalDoubleMean) { + at::Tensor std = at::ones({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_std = CopyToDevice(std, device); + at::Tensor lazy_normal = at::normal(0, lazy_std); + double res_mean = lazy_normal.mean().item().toDouble(); + double res_std = lazy_normal.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestNormalDoubleStd) { + at::Tensor mean = at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_mean = CopyToDevice(mean, device); + at::Tensor lazy_normal = at::normal(lazy_mean, 1); + double res_mean = lazy_normal.mean().item().toDouble(); + double res_std = lazy_normal.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestNormalInPlace) { + at::Tensor a = at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.normal_(/*mean=*/0, /*std=*/1); + double res_mean = lazy_a.mean().item().toDouble(); + double res_std = lazy_a.std().item().toDouble(); + EXPECT_GT(res_mean, -0.06); + EXPECT_LT(res_mean, 0.06); + EXPECT_GT(res_std, 0.94); + EXPECT_LT(res_std, 1.06); + }); +} + +TEST_F(LazyOpsTest, TestUniformInPlace) { + const double eps = 1e-3; + at::Tensor a = 
at::zeros({10, 10, 10}, at::dtype(at::kFloat)); + ForEachDevice([&](const torch::Device& device) { + at::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.uniform_(/*from=*/0, /*to=*/1); + at::Tensor cpu_a = ToCpuTensor(lazy_a); + double res_min = cpu_a.min().item().toDouble(); + double res_max = cpu_a.max().item().toDouble(); + EXPECT_GT(res_min, 0.0 - eps); + EXPECT_LT(res_max, 1.0 + eps); + }); +} + +TEST_F(LazyOpsTest, TestRandomInPlace) { + for (auto dtype : {torch::kFloat, torch::kDouble, torch::kByte, torch::kChar, + torch::kShort, torch::kInt, torch::kLong}) { + const double eps = 0.2; + torch::Tensor a = torch::zeros({10, 10, 10}, torch::TensorOptions(dtype)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.random_(/*from=*/0, /*to=*/10); + double res_mean = lazy_a.sum().item().toDouble() / a.numel(); + double res_min = lazy_a.min().item().toDouble(); + double res_max = lazy_a.max().item().toDouble(); + EXPECT_GT(res_mean, 4.5 - eps); + EXPECT_LT(res_mean, 4.5 + eps); + EXPECT_EQ(res_min, 0.0); + EXPECT_EQ(res_max, 9.0); + }); + } +} + +TEST_F(LazyOpsTest, TestRandomInPlaceDefaultFrom) { + for (auto dtype : {torch::kFloat, torch::kDouble, torch::kByte, torch::kChar, + torch::kShort, torch::kInt, torch::kLong}) { + const double eps = 0.2; + torch::Tensor a = torch::zeros({10, 10, 10}, torch::TensorOptions(dtype)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + lazy_a.random_(/*to=*/10); + double res_mean = lazy_a.sum().item().toDouble() / a.numel(); + double res_min = lazy_a.min().item().toDouble(); + double res_max = lazy_a.max().item().toDouble(); + EXPECT_GT(res_mean, 4.5 - eps); + EXPECT_LT(res_mean, 4.5 + eps); + EXPECT_EQ(res_min, 0.0); + EXPECT_EQ(res_max, 9.0); + }); + } +} + +TEST_F(LazyOpsTest, TestRandomInPlaceDefault) { + for (auto dtype : {torch::kFloat, torch::kDouble, torch::kByte, torch::kChar, + torch::kShort, torch::kInt, torch::kLong}) { + auto input = torch::zeros({10}, torch::TensorOptions(dtype)); + ForEachDevice([&](const torch::Device& device) { + auto lazyInput = CopyToDevice(input, device); + lazyInput.random_(); + auto output = ToCpuTensor(lazyInput); + EXPECT_TRUE(torch::all(output.ne(input)).item()); + }); + } +} + +TEST_F(LazyOpsTest, TestNormGeneral) { + torch::Tensor a = torch::randn( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::norm(a, 3.5); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 3.5); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestNormNuclear) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::norm(a, 1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::norm(lazy_a, 1); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFrobeniusNorm) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::frobenius_norm(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::frobenius_norm(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFrobeniusNormInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::frobenius_norm(a, {dim}, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::frobenius_norm(lazy_a, {dim}, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestFrobeniusNormInDims) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector>{{1, 2}, {-2, -1}}) { + torch::Tensor b = torch::frobenius_norm(a, dims, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::frobenius_norm(lazy_a, dims, /*keepdim=*/false); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestGroupNorm) { + int num_channels = 6; + torch::Tensor input = + torch::rand({20, num_channels, 10, 10}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-05; + for (int num_groups : {3, 6, 1}) { + torch::Tensor output = + torch::group_norm(input, num_groups, weight, bias, eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_output = + torch::group_norm(lazy_input, num_groups, lazy_weight, lazy_bias, eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } +} + +TEST_F(LazyOpsTest, TestGroupNormBackward) { + int num_channels = 6; + torch::Tensor input = + torch::rand({2, num_channels, 5, 5}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + torch::rand({num_channels}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + torch::rand({num_channels}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + double eps = 1e-05; + for (bool undef_weight : {true, false}) { + for (int num_groups : {3, 6, 1}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::group_norm( + /*input=*/inputs[0], num_groups, inputs[1], inputs[2], + /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + torch::Tensor undef; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {input, undef_weight ? undef : weight, undef_weight ? 
undef : bias}, + device, testfn, + /*rtol=*/1e-3, /*atol=*/1e-3, + /*derivative_level=*/2); + }); + } + } +} + +TEST_F(LazyOpsTest, TestInstanceNorm) { + int batch = 5; + int num_channels = 20; + torch::Tensor input = + torch::rand({batch, num_channels, 10, 10}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_mean = + torch::zeros({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = + torch::ones({num_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double momentum = 0.1; + double eps = 1e-05; + torch::Tensor output = torch::instance_norm( + input, weight, bias, running_mean, running_var, + /*use_input_stats=*/true, momentum, eps, /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device); + torch::Tensor lazy_running_var = CopyToDevice(running_var, device); + torch::Tensor lazy_output = torch::instance_norm( + lazy_input, lazy_weight, lazy_bias, lazy_running_mean, lazy_running_var, + /*use_input_stats=*/true, momentum, eps, /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLayerNorm) { + torch::Tensor input = + torch::rand({20, 10, 10, 10}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-05; + torch::Tensor undef; + for (bool undef_weight : {true, false}) { + for (int64_t normalized_size : {2, 3}) { + std::vector normalized_shape(normalized_size, 10); + torch::Tensor weight = torch::rand( + normalized_shape, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = torch::rand( + normalized_shape, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::layer_norm(input, normalized_shape, + undef_weight ? undef : weight, + undef_weight ? undef : bias, eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = + undef_weight ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_bias = + undef_weight ? 
undef : CopyToDevice(bias, device); + torch::Tensor lazy_output = torch::layer_norm( + lazy_input, normalized_shape, lazy_weight, lazy_bias, eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestLayerNormBackward) { + torch::Tensor input = + torch::rand({2, 3, 3, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + double eps = 1e-05; + for (bool undef_weight : {true, false}) { + for (int64_t normalized_size : {2, 3}) { + std::vector normalized_shape(normalized_size, 3); + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::layer_norm( + /*input=*/inputs[0], normalized_shape, inputs[1], inputs[2], + /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + torch::Tensor weight = + torch::rand(normalized_shape, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + torch::rand(normalized_shape, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor undef; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {input, undef_weight ? undef : weight, undef_weight ? undef : bias}, + device, testfn, + /*rtol=*/1e-3, /*atol=*/1e-4, /*derivative_level=*/2); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNuclearNorm) { + torch::Tensor a = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::nuclear_norm(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::nuclear_norm(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestPairwiseDistance) { + torch::Tensor x1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor x2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-6; + for (bool keepdim : {false, true}) { + for (double p : {1, 2, 3, 4}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = + torch::pairwise_distance(x1, x2, p, eps, keepdim); + torch::Tensor lazy_x1 = CopyToDevice(x1, device); + torch::Tensor lazy_x2 = CopyToDevice(x2, device); + torch::Tensor lazy_output = + torch::pairwise_distance(lazy_x1, lazy_x2, p, eps, keepdim); + AllClose(output, lazy_output, /*rtol=*/1e-5, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestCosineSimilarity) { + torch::Tensor x1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor x2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double eps = 1e-8; + int rank = x1.dim(); + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::cosine_similarity(x1, x2, dim, eps); + torch::Tensor lazy_x1 = CopyToDevice(x1, device); + torch::Tensor lazy_x2 = CopyToDevice(x2, device); + torch::Tensor lazy_output = + torch::cosine_similarity(lazy_x1, lazy_x2, dim, eps); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestCosineEmbeddingLoss) { + torch::Tensor input1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (double margin : {0., 0.2}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::cosine_embedding_loss( + input1, input2, target, margin, reduction); + torch::Tensor lazy_input1 = CopyToDevice(input1, device); + torch::Tensor lazy_input2 = CopyToDevice(input2, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = torch::cosine_embedding_loss( + lazy_input1, lazy_input2, lazy_target, margin, reduction); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestHingeEmbeddingLoss) { + torch::Tensor input = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (double margin : {0., 0.2}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = + torch::hinge_embedding_loss(input, target, margin, reduction); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = torch::hinge_embedding_loss( + lazy_input, lazy_target, margin, reduction); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestTripletMarginLoss) { + torch::Tensor anchor = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor positive = torch::abs(torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor negative = torch::neg(torch::abs(torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())))); + double eps = 1e-6; + for (double margin : {0., 0.2}) { + for (double p : {1, 2, 3, 4}) { + for (bool swap : {false, true}) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::triplet_margin_loss( + anchor, positive, negative, margin, p, eps, swap, reduction); + torch::Tensor lazy_anchor = CopyToDevice(anchor, device); + torch::Tensor lazy_positive = CopyToDevice(positive, device); + torch::Tensor lazy_negative = CopyToDevice(negative, device); + torch::Tensor lazy_output = torch::triplet_margin_loss( + lazy_anchor, lazy_positive, lazy_negative, margin, p, eps, swap, + reduction); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestBinaryCrossEntropy) { + int batch = 10; + int classes = 5; + torch::Tensor input = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor undef; + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + for (bool undef_weight : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::binary_cross_entropy( + input, target, undef_weight ? 
undef : weight, reduction); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + undef_weight ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_output = torch::binary_cross_entropy( + lazy_input, lazy_target, lazy_weight, reduction); + AllClose(output, lazy_output, /*rtol=*/1e-4, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMarginRankingLoss) { + torch::Tensor input1 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input2 = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (double margin : {0., 0.2}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::margin_ranking_loss( + input1, input2, target, margin, reduction); + torch::Tensor lazy_input1 = CopyToDevice(input1, device); + torch::Tensor lazy_input2 = CopyToDevice(input2, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = torch::margin_ranking_loss( + lazy_input1, lazy_input2, lazy_target, margin, reduction); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestBCEWithLogits) { + int batch = 10; + int classes = 5; + torch::Tensor input = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = + torch::rand({batch, classes}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {classes}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor pos_weight = torch::rand( + {classes}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor undef; + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + for (bool undef_weight : {false, true}) { + for (bool undef_pos_weight : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::binary_cross_entropy_with_logits( + input, target, undef_weight ? undef : weight, + undef_pos_weight ? undef : pos_weight, reduction); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + undef_weight ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_pos_weight = + undef_pos_weight ? 
undef : CopyToDevice(pos_weight, device); + torch::Tensor lazy_output = torch::binary_cross_entropy_with_logits( + lazy_input, lazy_target, lazy_weight, lazy_pos_weight, reduction); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestKlDiv) { + torch::Tensor input = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (bool log_target : {true, false}) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = + torch::kl_div(input, target, reduction, log_target); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::kl_div(lazy_input, lazy_target, reduction, log_target); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestProd) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::prod(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestProdCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::prod(a, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a, torch::kDouble); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestProdInDim) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::prod(a, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a, dim); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestProdInDimKeepCast) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::prod(a, dim, /*keepdim=*/true, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::prod(lazy_a, dim, /*keepdim=*/true, torch::kDouble); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestProdInDimKeep) { + torch::Tensor a = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor b = torch::prod(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::prod(lazy_a, dim, /*keepdim=*/true); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSum) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = 
CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSumCast) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim, torch::kDouble); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSumLong) { + torch::Tensor input = torch::randint( + 1000, {4, 3, 4}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumSumCastLong) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim, torch::kLong); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim, torch::kLong); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProd) { + torch::Tensor input = torch::rand( + {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumprod(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumprod(lazy_input, dim); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProdCast) { + torch::Tensor input = torch::mul( + torch::rand({4, 3, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())), + 10); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumprod(input, dim, torch::kDouble); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumprod(lazy_input, dim, torch::kDouble); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProdLong) { + torch::Tensor input = torch::randint( + 7, {2, 3}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCumProdCastLong) { + torch::Tensor input = + torch::rand({2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 7; + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cumsum(input, dim, torch::kLong); + ForEachDevice([&](const 
torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cumsum(lazy_input, dim, torch::kLong); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMin) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmin(a, c10::nullopt, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, c10::nullopt, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMinDim) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmin(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMinDimKeep) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmin(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, dim, /*keepdim=*/true); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMinSameValue) { + torch::Tensor a = torch::ones( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmin(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMinWrapper) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmin(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmin(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMax) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmax(a, c10::nullopt, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, c10::nullopt, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMaxDim) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmax(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMaxDimKeep) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmax(a, dim, /*keepdim=*/true); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = 
CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, dim, /*keepdim=*/true); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestArgMaxSameValue) { + torch::Tensor a = torch::ones( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::argmax(a, c10::nullopt, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, c10::nullopt, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestArgMaxWrapper) { + torch::Tensor a = torch::rand( + {4, 4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor b = torch::argmax(a, dim, /*keepdim=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::argmax(lazy_a, dim, /*keepdim=*/false); + AllEqual(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestAsin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::asin(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::asin(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAsinh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::asinh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::asinh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAsinhInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::asinh_(a); + torch::Tensor lazy_b = torch::asinh_(lazy_a); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sin(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sin(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSinh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sinh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sinh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAcos) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::acos(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::acos(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAcosh) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100; + 
torch::Tensor b = torch::acosh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::acosh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAcoshInPlace) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::acosh_(a); + torch::Tensor lazy_b = torch::acosh_(lazy_a); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestCos) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::cos(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::cos(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestCosh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::cosh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::cosh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtan) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::atan(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::atan(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtanh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::atanh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::atanh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtanhInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::atanh_(a); + torch::Tensor lazy_b = torch::atanh_(lazy_a); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestAtan2) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::atan2(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::atan2(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestTan) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::tan(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = 
torch::tan(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestTanh) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::tanh(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::tanh(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestClampMinMax) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + torch::Scalar max_val(0.409); + torch::Tensor b = torch::clamp(a, min_val, max_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp(lazy_a, min_val, max_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMin) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + torch::Tensor b = torch::clamp(a, min_val, c10::nullopt); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp(lazy_a, min_val, c10::nullopt); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMax) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar max_val(0.409); + torch::Tensor b = torch::clamp(a, c10::nullopt, max_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp(lazy_a, c10::nullopt, max_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMinExplicit) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + torch::Tensor b = torch::clamp_min(a, min_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp_min(lazy_a, min_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMaxExplicit) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar max_val(0.409); + torch::Tensor b = torch::clamp_max(a, max_val); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::clamp_max(lazy_a, max_val); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMinExplicitInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar min_val(0.311); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::clamp_min_(a, min_val); + torch::Tensor lazy_b = torch::clamp_min_(lazy_a, min_val); + AllClose(a, lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestClampMaxExplicitInPlace) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar max_val(0.409); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = torch::clamp_max_(a, max_val); + torch::Tensor lazy_b = torch::clamp_max_(lazy_a, max_val); + AllClose(a, lazy_a); + 
AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestCeil) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::ceil(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::ceil(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFloor) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::floor(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::floor(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestRound) { + torch::Tensor a = torch::cat( + {torch::randn( + {8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0, + // Special case: 0.5, -0.5. lazy::Round impl rounds to -1/1 whereas + // lazy::RoundToEven properly implements bankers rounding. + torch::tensor( + {-0.5, 0.5}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice()))}, + 0); + torch::Tensor b = torch::round(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::round(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestTrunc) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::trunc(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::trunc(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFrac) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = torch::frac(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::frac(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestNeg) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::neg(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::neg(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseNot) { + std::vector<torch::ScalarType> types( + {torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong}); + + ForEachDevice([&](const torch::Device& device) { + for (auto type : types) { + torch::Tensor a = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Tensor b = torch::bitwise_not(a); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::bitwise_not(lazy_a); + AllEqual(b, lazy_b); + } + }); +} + +TEST_F(LazyOpsTest, TestBitwiseNotInPlace) { + std::vector<torch::ScalarType> types( + {torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong}); + + ForEachDevice([&](const torch::Device& device) { + for (auto type : types) { + torch::Tensor a = + torch::randint(0, 63, {2, 2}, torch::TensorOptions(type)); + torch::Tensor lazy_a = CopyToDevice(a, device); + a.bitwise_not_(); + lazy_a.bitwise_not_(); + AllEqual(a, lazy_a); + } + }); +} + +TEST_F(LazyOpsTest, TestSign) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + 
torch::Tensor b = torch::sign(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sign(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestSignByte) { + torch::Tensor a = torch::randint( + 256, {2, 2}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + torch::Tensor b = torch::sign(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sign(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestAbs) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::abs(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::abs(lazy_a); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestAbsByte) { + torch::Tensor a = torch::randint( + 256, {2, 2}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + torch::Tensor b = torch::abs(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::abs(lazy_a); + AllEqual(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestEmptyLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::empty_like(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::empty_like(lazy_a); + EXPECT_EQ(b.sizes(), lazy_b.sizes()); + }); +} + +TEST_F(LazyOpsTest, TestEmptyLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::empty_like( + a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::empty_like( + lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + EXPECT_EQ(b.sizes(), lazy_b.sizes()); + }); +} + +TEST_F(LazyOpsTest, TestEmpty) { + torch::Tensor a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = torch::empty( + {2, 2}, torch::TensorOptions(torch::kFloat).device(device)); + EXPECT_EQ(a.sizes(), lazy_a.sizes()); + }); +} + +TEST_F(LazyOpsTest, TestZeroInPlace) { + torch::Tensor input = torch::ones( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazyInput = CopyToDevice(input, device); + auto& output = torch::zero_(input); + auto& lazyOutput = torch::zero_(lazyInput); + AllClose(output, lazyOutput); + }); +} + +TEST_F(LazyOpsTest, TestZerosLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::zeros_like(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::zeros_like(lazy_a); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestZerosLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::zeros_like( + a, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::zeros_like( + lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestZeros) { + torch::Tensor a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = torch::zeros( + {2, 2}, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestOnes) { + torch::Tensor a = torch::ones( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = + torch::ones({2, 2}, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestOnesLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::ones_like(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::ones_like(lazy_a); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestOnesLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::ones_like( + a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::ones_like( + lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFull) { + torch::Tensor a = + torch::full({2, 2}, 3.1165, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = torch::full( + {2, 2}, 3.1165, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFullLike) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::full_like(a, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::full_like(lazy_a, 3.1165); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFullLikeOptions) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::full_like( + a, 3.1165, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::full_like( + lazy_a, 3.1165, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestARange) { + for (auto& ranges : std::vector<std::vector<double>>{{0.0, 100.0, 0.5}, + {0.0, -100.0, -0.5}}) { + torch::Tensor a = torch::arange( + ranges[0], ranges[1], ranges[2], + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = + torch::arange(ranges[0], ranges[1], ranges[2], + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(a, lazy_a); + }); + } +} + 
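The {-0.5, 0.5} special case appended in TestRound above exists because torch::round follows round-half-to-even (bankers rounding), which the in-code comment contrasts with a lazy::Round lowering that sends exact halves to -1/1. A minimal standalone sketch of that halfway-case behaviour is below; it is illustrative only, not part of this change, and the sample values are assumptions rather than values taken from the test:

#include <torch/torch.h>
#include <iostream>

// Standalone sketch: torch::round uses round-half-to-even ("bankers rounding"),
// so exact halves round to the nearest even integer rather than away from zero.
int main() {
  torch::Tensor halves = torch::tensor({-1.5, -0.5, 0.5, 1.5, 2.5});
  // Expected under bankers rounding: {-2, -0, 0, 2, 2};
  // round-half-away-from-zero would instead give {-2, -1, 1, 2, 3},
  // which is the mismatch the TestRound special case guards against.
  std::cout << torch::round(halves) << std::endl;
  return 0;
}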
+TEST_F(LazyOpsTest, TestARangeOut) { + torch::Tensor a = torch::randn( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto& ranges : std::vector<std::vector<double>>{{0.0, 100.0, 0.5}, + {0.0, -100.0, -0.5}}) { + torch::Tensor b = torch::arange_out(a, ranges[0], ranges[1], ranges[2]); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = + torch::arange_out(lazy_a, ranges[0], ranges[1], ranges[2]); + AllClose(b, lazy_b); + }); + } +} + +TEST_F(LazyOpsTest, TestDimARange) { + torch::Tensor like = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a = torch::_dim_arange(like, 1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_like = CopyToDevice(like, device); + torch::Tensor lazy_a = torch::_dim_arange(lazy_like, 1); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestBartlettWindow) { + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::bartlett_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + + torch::Tensor lazy_output = torch::bartlett_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output, /*rtol=*/1e-5, /*atol=*/1e-7); + }); + } +} + +TEST_F(LazyOpsTest, TestBlackmanWindow) { + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::blackman_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_output = torch::blackman_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output, /*rtol=*/1e-5, /*atol=*/1e-7); + }); + } +} + +TEST_F(LazyOpsTest, TestHammingWindow) { + double alpha = 0.54; + double beta = 0.46; + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::hamming_window( + window_length, periodic, alpha, beta, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_output = torch::hamming_window( + window_length, periodic, alpha, beta, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestHannWindow) { + int window_length = 10; + for (bool periodic : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor output = torch::hann_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_output = torch::hann_window( + window_length, periodic, + torch::TensorOptions(torch::kFloat).device(device)); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestLogSigmoid) { + torch::Tensor a = torch::empty( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + a.uniform_(-1.0, 1.0); + torch::Tensor b = torch::log_sigmoid(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log_sigmoid(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLogSigmoidForward) { + torch::Tensor a = torch::empty( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
+ a.uniform_(-1.0, 1.0); + auto tuple = torch::log_sigmoid_forward(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + auto lazy_tuple = torch::log_sigmoid_forward(lazy_a); + AllClose(std::get<0>(tuple), std::get<0>(lazy_tuple), + /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(std::get<1>(tuple), std::get<1>(lazy_tuple), + /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLogsumexp) { + torch::Tensor a = torch::rand( + {3, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (auto dims : std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) { + for (bool keepdim : {false, true}) { + torch::Tensor b = torch::logsumexp(a, dims, keepdim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::logsumexp(lazy_a, dims, keepdim); + AllClose(b, lazy_b); + }); + } + } +} + +TEST_F(LazyOpsTest, TestSiLU) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::silu(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::silu(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + ExpectCounterChanged("lazy::silu_out", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestSigmoid) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::sigmoid(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sigmoid(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_1x1) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_2x1) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_1x2) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMatmul_2x2) { + torch::Tensor a = torch::rand( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 3}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestMatmulBcast) { + torch::Tensor a = + torch::rand({4, 2, 3, 2, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = + torch::rand({2, 1, 4, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::matmul(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::matmul(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestDot) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::dot(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::dot(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestTensorDot) { + torch::Tensor a = torch::rand( + {6, 4, 8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 7, 8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector<int64_t> dims_a = {1, 2}; + std::vector<int64_t> dims_b = {0, 2}; + torch::Tensor c = torch::tensordot(a, b, dims_a, dims_b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::tensordot(lazy_a, lazy_b, dims_a, dims_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestGer) { + torch::Tensor a = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::ger(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::ger(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMv) { + torch::Tensor a = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::mv(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::mv(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestMvOut) { + torch::Tensor a = torch::rand( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::mv_out(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = 
CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::empty({4}, lazy_b.options()); + torch::mv_out(lazy_c, lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestBatchAddBatchMatMul) { + torch::Tensor a = torch::rand( + {3, 6, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 6, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {3, 4, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar beta = 1.5; + torch::Tensor d = torch::baddbmm(a, b, c, beta, alpha); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::baddbmm(lazy_a, lazy_b, lazy_c, beta, alpha); + AllClose(d, lazy_d, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestBatchAddBatchMatMulInPlace) { + torch::Tensor a = torch::rand( + {3, 6, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 6, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {3, 4, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar beta = 1.5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor d = a.baddbmm_(b, c, beta, alpha); + torch::Tensor lazy_d = lazy_a.baddbmm_(lazy_b, lazy_c, beta, alpha); + AllClose(d, lazy_d, /*rtol=*/1e-3, /*atol=*/1e-4); + AllClose(a, lazy_a, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestBatchMatMul) { + torch::Tensor a = torch::rand( + {3, 6, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 4, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::bmm(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::bmm(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestChainMatMul) { + torch::Tensor a = torch::rand( + {5, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {4, 6}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {6, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor d = torch::rand( + {2, 7}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::chain_matmul({a, b, c, d}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = CopyToDevice(d, device); + torch::Tensor lazy_result = + torch::chain_matmul({lazy_a, lazy_b, lazy_c, lazy_d}); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestLinear) { + torch::Tensor input = torch::rand( + {2, 4}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::linear(input, weight); + torch::Tensor result_with_bias = torch::linear(input, weight, bias); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_result = torch::linear(lazy_input, lazy_weight); + torch::Tensor lazy_result_with_bias = + torch::linear(lazy_input, lazy_weight, lazy_bias); + AllClose(result, lazy_result, /*rtol=*/1e-2, /*atol=*/1e-4); + AllClose(result_with_bias, lazy_result_with_bias, /*rtol=*/1e-2, + /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestPinverse) { + torch::Tensor input = torch::rand( + {4, 6}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::pinverse(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::pinverse(lazy_input); + AllClose(result, lazy_result, /*rtol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestEinsumOuter) { + torch::Tensor a = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "i,j->ij"; + torch::Tensor c = torch::einsum(equation, {a, b}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::einsum(equation, {lazy_a, lazy_b}); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEinsumOuterBackward) { + torch::Tensor a = torch::rand({5}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor b = torch::rand({5}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + std::string equation = "i,j->ij"; + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::einsum(equation, inputs); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({a, b}, device, testfn, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestEinsumBatchMatMul) { + torch::Tensor a = torch::rand( + {3, 2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "bij,bjk->bik"; + torch::Tensor c = torch::einsum(equation, {a, b}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::einsum(equation, {lazy_a, lazy_b}); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBilinear) { + torch::Tensor a = torch::rand( + {3, 5, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor l = torch::rand( + {2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor r = torch::rand( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = 
"bn,anm,bm->ba"; + torch::Tensor c = torch::einsum(equation, {l, a, r}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_l = CopyToDevice(l, device); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_r = CopyToDevice(r, device); + torch::Tensor lazy_c = torch::einsum(equation, {lazy_l, lazy_a, lazy_r}); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerDiagonal) { + torch::Tensor input = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "ii->i"; + torch::Tensor result = torch::einsum(equation, {input}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_input}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBatchDiagonal) { + torch::Tensor input = torch::rand( + {4, 3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "...ii->...i"; + torch::Tensor result = torch::einsum(equation, {input}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_input}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBatchPermute) { + torch::Tensor input = + torch::rand({2, 3, 4, 5}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "...ij->...ji"; + torch::Tensor result = torch::einsum(equation, {input}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_input}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestEinsumPyTorchLowerRepeatedAxis) { + torch::Tensor x = torch::rand( + {2, 3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor y = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::string equation = "ijj,k->ik"; + torch::Tensor result = torch::einsum(equation, {x, y}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_x = CopyToDevice(x, device); + torch::Tensor lazy_y = CopyToDevice(y, device); + torch::Tensor lazy_result = torch::einsum(equation, {lazy_x, lazy_y}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBilinear) { + int batch_size = 16; + int in1_features = 4; + int in2_features = 6; + int out_features = 8; + torch::Tensor input1 = + torch::rand({batch_size, in1_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input2 = + torch::rand({batch_size, in2_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({out_features, in1_features, in2_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({out_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input1 = CopyToDevice(input1, device); + torch::Tensor lazy_input2 = CopyToDevice(input2, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor result = torch::bilinear(input1, input2, weight, bias); + torch::Tensor lazy_result = + 
torch::bilinear(lazy_input1, lazy_input2, lazy_weight, lazy_bias); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2D) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + torch::Tensor input = + torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::upsample_nearest2d(input, {uh, uw}); + torch::Tensor lazy_result = torch::upsample_nearest2d(lazy_input, {uh, uw}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2DBackward) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::upsample_nearest2d(inputs[0], {uh, uw}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2DWithScale) { + int batch_size = 2; + int h = 5; + int w = 5; + int chans = 2; + double scale_h = 2.5; + double scale_w = 3.4; + torch::Tensor input = + torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::upsample_nearest2d( + input, c10::nullopt, at::ArrayRef<double>{scale_h, scale_w}); + torch::Tensor lazy_result = torch::upsample_nearest2d( + lazy_input, c10::nullopt, at::ArrayRef<double>{scale_h, scale_w}); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleNearest2DBackwardWithScale) { + int batch_size = 2; + int h = 5; + int w = 5; + int chans = 2; + double scale_h = 2.5; + double scale_w = 3.4; + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::upsample_nearest2d(inputs[0], c10::nullopt, + at::ArrayRef<double>{scale_h, scale_w}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestUpsampleBilinear2D) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + for (bool align_corners : {true, false}) { + torch::Tensor input = torch::rand( + {batch_size, chans, h, w}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = + torch::upsample_bilinear2d(input, {uh, uw}, align_corners); + torch::Tensor lazy_result = + torch::upsample_bilinear2d(lazy_input, {uh, uw}, align_corners); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestUpsampleBilinear2DBackward) { + int batch_size = 2; + int h = 5; + int w = 5; + int uh = 8; + int uw = 8; + int chans = 2; + for (bool align_corners : {true, false}) { + auto testfn = + [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::upsample_bilinear2d(inputs[0], {uh, uw}, align_corners); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({batch_size, chans, h, w}, + 
torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAddCMul) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor d = torch::addcmul(a, b, c, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::addcmul(lazy_a, lazy_b, lazy_c, 3.1165); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestAddCDiv) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = + torch::abs(torch::rand( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice()))) + + 1.0; + torch::Tensor d = torch::addcdiv(a, b, c, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::addcdiv(lazy_a, lazy_b, lazy_c, 3.1165); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestAddCDivWithBroadcast) { + torch::Tensor a = torch::rand( + {1, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = + torch::abs(torch::rand( + {1, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice()))) + + 1.0; + torch::Tensor d = torch::addcdiv(a, b, c, 3.1165); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::addcdiv(lazy_a, lazy_b, lazy_c, 3.1165); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestSize) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + for (int dim = -rank; dim < rank; ++dim) { + EXPECT_EQ(torch::size(input, dim), torch::size(lazy_input, dim)); + } + }); +} + +TEST_F(LazyOpsTest, TestSelect) { + std::vector<int64_t> input_sizes = {14, 24, 8}; + int rank = input_sizes.size(); + for (int dim = -rank; dim < rank; ++dim) { + auto testfn = + [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::select(inputs[0], dim, 0); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand(input_sizes, torch::TensorOptions(torch::kFloat) + .requires_grad(true))}, + device, testfn); + }); + }; +} + +TEST_F(LazyOpsTest, TestBernoulliScalarProb) { + torch::Tensor input = torch::zeros( + 1000, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::bernoulli(lazy_input, 0.1); + double frac = 
lazy_output.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestBernoulliTensorProb) { + std::vector<float> prob_values(1000, 0.1); + torch::Tensor input = torch::tensor( + prob_values, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::bernoulli(lazy_input); + double frac = lazy_output.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestBernoulliScalarProbInPlace) { + torch::Tensor input = torch::zeros( + 1000, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + lazy_input.bernoulli_(0.1); + double frac = lazy_input.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestBernoulliTensorProbInPlace) { + torch::Tensor input = torch::zeros( + 1000, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor prob = torch::scalar_tensor( + 0.1, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_prob = CopyToDevice(prob, device); + lazy_input.bernoulli_(lazy_prob); + double frac = lazy_input.sum().item().toDouble() / input.numel(); + EXPECT_GT(frac, 0.06); + EXPECT_LT(frac, 0.14); + }); +} + +TEST_F(LazyOpsTest, TestDropout) { + torch::Tensor a = torch::rand( + {17, 21}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::dropout(lazy_a, 0.1, /*train=*/true); + double prob = + static_cast<double>(lazy_b.cpu().ne(0.0f).sum().item().toDouble()) / + a.numel(); + EXPECT_GT(prob, 0.86); + EXPECT_LT(prob, 0.94); + }); +} + +TEST_F(LazyOpsTest, TestDropoutInPlace) { + torch::Tensor a = torch::rand( + {17, 21}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::dropout_(lazy_a, 0.1, /*train=*/true); + double prob = + static_cast<double>(lazy_a.cpu().ne(0.0f).sum().item().toDouble()) / + a.numel(); + EXPECT_GT(prob, 0.85); + EXPECT_LT(prob, 0.94); + }); +} + +TEST_F(LazyOpsTest, TestRandperm) { + unsigned n = 5; + torch::Tensor shuffle = torch::randperm( + n, torch::TensorOptions(torch::kLong).device(torch::kLazy)); + torch::Tensor shuffle_cpu = CopyToDevice(shuffle, torch::kCPU); + std::vector<int64_t> shuffle_data(shuffle_cpu.data_ptr<int64_t>(), + shuffle_cpu.data_ptr<int64_t>() + n); + EXPECT_TRUE(shuffle_data.size() == n && + torch::lazy::IsPermutation(shuffle_data)); +} + +TEST_F(LazyOpsTest, TestSlice) { + torch::Tensor a = + torch::rand({32, 24, 16}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::slice(a, 1, 0, 16, 1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::slice(lazy_a, 1, 0, 16, 1); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestTake) { + torch::Tensor a = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randint( + 16, {5}, 
torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor c = torch::take(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::take(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestTakeBackward) { + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::take(inputs[0], inputs[1]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::randint( + 16, {5}, + torch::TensorOptions(torch::kLong).device(DefaultDevice()))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestStack) { + torch::Tensor a = torch::rand( + {2, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {2, 4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = a.dim() + 1; + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor d = torch::stack({a, b, c}, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::stack({lazy_a, lazy_b, lazy_c}, dim); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestCat) { + torch::Tensor a = torch::rand( + {2, 1, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {2, 3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int dim : {1, -2}) { + torch::Tensor d = torch::cat({a, b, c}, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::cat({lazy_a, lazy_b, lazy_c}, dim); + EXPECT_TRUE(d.sizes() == lazy_d.sizes() && d.dtype() == lazy_d.dtype()); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestUnbind) { + torch::Tensor input = torch::rand( + {4, 3, 7}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + std::vector<torch::Tensor> output = torch::unbind(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector<torch::Tensor> lazy_output = torch::unbind(lazy_input, dim); + ASSERT_EQ(output.size(), lazy_output.size()); + for (size_t i = 0; i < output.size(); ++i) { + AllClose(output[i], lazy_output[i]); + } + }); + } +} + +TEST_F(LazyOpsTest, TestRepeat) { + std::vector<std::vector<int64_t>> repeats_list = {{4, 2}, {4, 2, 3}}; + std::vector<std::vector<int64_t>> input_size_list = {{3}, {2, 4}}; + for (const auto& repeats : repeats_list) { + for (const auto& input_size : input_size_list) { + torch::Tensor input = torch::rand( + input_size, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.repeat(repeats); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = lazy_input.repeat(repeats); + 
AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestGather) { + torch::Tensor a = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::empty( + {3, 3}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + b[i][j] = (i + j) % 3; + } + } + for (bool sparse_grad : {false, true}) { + torch::Tensor c = torch::gather(a, 1, b, sparse_grad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::gather(lazy_a, 1, lazy_b, sparse_grad); + AllClose(c, lazy_c); + }); + } +} + +TEST_F(LazyOpsTest, TestScatter) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 5}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int dim = 0; dim < 2; ++dim) { + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 5; j++) { + c[i][j] = (i + j) % c.sizes()[dim]; + } + } + torch::Tensor d = torch::scatter(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterR1) { + torch::Tensor a = torch::rand( + {5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + c[0] = 1; + c[1] = 3; + torch::Tensor d = torch::scatter(a, 0, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, 0, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestScatterR3) { + torch::Tensor a = torch::rand( + {3, 5, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 4, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 2; k++) { + c[i][j][k] = (i + j + k) % 4; + } + } + } + torch::Tensor d = torch::scatter(a, 1, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, 1, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestScatterBiggerSource) { + torch::Tensor a = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {8, 8}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {4, 4}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for 
(int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + c[i][j] = (i + j) % 4; + } + } + for (int dim = 0; dim < 2; ++dim) { + torch::Tensor d = torch::scatter(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterScalar) { + torch::Tensor a = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar b = 1.0f; + torch::Tensor c = torch::empty( + {4, 4}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + c[i][j] = (i + j) % 4; + } + } + for (int dim = 0; dim < 2; ++dim) { + torch::Tensor d = torch::scatter(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterReduceAdd) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 5}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int dim = 0; dim < 2; ++dim) { + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 5; j++) { + c[i][j] = (i + j) % c.sizes()[dim]; + } + } + torch::Tensor d = torch::scatter(a, dim, c, b, "add"); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter(lazy_a, dim, lazy_c, lazy_b, "add"); + AllClose(d, lazy_d); + }); + } + + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::scatter_out", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestScatterAdd) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 5}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int dim = 0; dim < 2; ++dim) { + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 5; j++) { + c[i][j] = (i + j) % c.sizes()[dim]; + } + } + torch::Tensor d = torch::scatter_add(a, dim, c, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::scatter_add(lazy_a, dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + }); + } +} + +TEST_F(LazyOpsTest, TestScatterAddInPlace) { + torch::Tensor b = torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {4, 4}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + c[i][j] = (i + j) % 4; + } + } + for (int dim = 0; dim < 2; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = 
torch::rand( + {4, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor d = a.scatter_add_(dim, c, b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = lazy_a.scatter_add_(dim, lazy_c, lazy_b); + AllClose(d, lazy_d); + AllClose(a, lazy_a); + }); + } +} + +TEST_F(LazyOpsTest, TestIndexSelect) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (torch::ScalarType index_scalar_type : {torch::kInt, torch::kLong}) { + torch::Tensor b = torch::empty( + {2}, torch::TensorOptions(index_scalar_type).device(DefaultDevice())); + b[0] = 0; + b[1] = 2; + for (auto offset : {-2, 0}) { + torch::Tensor c0 = torch::index_select(a, 0 + offset, b); + torch::Tensor c1 = torch::index_select(a, 1 + offset, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c0 = torch::index_select(lazy_a, 0 + offset, lazy_b); + torch::Tensor lazy_c1 = torch::index_select(lazy_a, 1 + offset, lazy_b); + AllEqual(c0, lazy_c0); + AllEqual(c1, lazy_c1); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestIndexSelectRank0) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor a = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor b = torch::scalar_tensor( + 2, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor c0 = torch::index_select(a, 0, b); + torch::Tensor c1 = torch::index_select(a, 1, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c0 = torch::index_select(lazy_a, 0, lazy_b); + torch::Tensor lazy_c1 = torch::index_select(lazy_a, 1, lazy_b); + AllEqual(c0, lazy_c0); + AllEqual(c1, lazy_c1); + }); + } +} + +TEST_F(LazyOpsTest, TestInverse) { + if (IsCuda()) { + // TODO(whc) debug failure on cuda, lazy_b comes back transposed + GTEST_SKIP(); + } + torch::Tensor a = torch::randn( + {5, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::inverse(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::inverse(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestIsnan) { + torch::Tensor a = torch::tensor( + {1.0, 2.0, std::nan("1"), 4.0}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::isnan(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::isnan(lazy_a); + AllEqual(b, lazy_b); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::isnan", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestExpand) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.expand({2, 3, 4}, /*implicit=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.expand({2, 3, 4}, /*implicit=*/false); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestExpandBack) { + torch::Tensor a = torch::rand( + {3, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = a.expand({3, 4}, /*implicit=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.expand({3, 4}, /*implicit=*/false); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestExpandAs) { + torch::Tensor a = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::native::expand_as(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::native::expand_as(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestEye) { + int n = 5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor out = torch::eye( + n, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_out = + torch::eye(n, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(out, lazy_out); + }); +} + +TEST_F(LazyOpsTest, TestEyeWide) { + int lines = 3; + int cols = 5; + ForEachDevice([&](const torch::Device& 
device) { + torch::Tensor out = + torch::eye(lines, cols, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_out = torch::eye( + lines, cols, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(out, lazy_out); + }); +} + +TEST_F(LazyOpsTest, TestEyeNarrow) { + int lines = 5; + int cols = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor out = + torch::eye(lines, cols, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_out = torch::eye( + lines, cols, torch::TensorOptions(torch::kFloat).device(device)); + AllClose(out, lazy_out); + }); +} + +TEST_F(LazyOpsTest, TestBroadcastTensors) { + torch::Tensor a = torch::rand( + {2, 1, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector<torch::Tensor> c = torch::broadcast_tensors({a, b}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + std::vector<torch::Tensor> lazy_c = torch::broadcast_tensors({lazy_a, lazy_b}); + ASSERT_EQ(c.size(), lazy_c.size()); + for (size_t i = 0; i < c.size(); ++i) { + AllClose(c[i], lazy_c[i]); + } + }); +} + +TEST_F(LazyOpsTest, TestOneIndex) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_result = torch::index(lazy_params, {lazy_indices}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestOneIndexTransfer) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_result = torch::index(lazy_params, {indices}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestNonzero) { + torch::Tensor a = torch::zeros( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + a[0][1] = 1.0; + a[1][0] = 2.0; + a[3][1] = 3.0; + torch::Tensor b = torch::nonzero(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::nonzero(lazy_a); + AllClose(b, lazy_b); + + if (DebugUtil::ExperimentEnabled("nonzero")) { + // If the nonzero support is enabled, we must not see any aten:: calls. + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + } + ResetCounters(); + }); +} + +TEST_F(LazyOpsTest, TestMaskedSelect) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randint( + 0, 2, {5}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Tensor c = torch::masked_select(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::masked_select(lazy_a, lazy_b); + AllClose(c, lazy_c); + + if (DebugUtil::ExperimentEnabled("masked_select")) { + // If the masked_select support is enabled, we must not see any aten:: + // calls. + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + } + ResetCounters(); + }); +} + +TEST_F(LazyOpsTest, TestMaskedScatter) { + torch::Tensor a = torch::rand( + {3, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::randint( + 0, 2, {3, 5}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {15}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor d = torch::masked_scatter(a, b, c); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::masked_scatter(lazy_a, lazy_b, lazy_c); + AllClose(d, lazy_d); + + if (DebugUtil::ExperimentEnabled("masked_scatter")) { + // If the masked_scatter support is enabled, we must not see any aten:: + // calls. + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + } + ResetCounters(); + }); +} + +TEST_F(LazyOpsTest, TestMultiIndexHeadNull) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = + torch::index(params, {indices_null, indices_0, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = torch::index( + lazy_params, {indices_null, lazy_indices_0, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexMiddleNull) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = + torch::index(params, {indices_0, indices_null, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = torch::index( + lazy_params, {lazy_indices_0, indices_null, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexTailNull) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = + torch::index(params, {indices_0, indices_1, indices_null}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = torch::index( + lazy_params, {lazy_indices_0, lazy_indices_1, indices_null}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexMiddleBroadcast) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices_0, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = + torch::index(lazy_params, {lazy_indices_0, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMultiIndexTailBroadcast) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices_0, indices_1}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_result = + torch::index(lazy_params, {lazy_indices_0, lazy_indices_1}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestMaskIndex) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + 0, 2, {2, 2}, + torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Tensor result = torch::index(params, {indices}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_result = torch::index(lazy_params, {lazy_indices}); + AllEqual(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestOneIndexPut) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor values = + isFloatingType(scalar_type) + ? torch::rand( + {3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = + torch::index_put(lazy_params, {lazy_indices}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestOneIndexPutInPlace) { + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor values = + torch::ones({3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + ForEachDevice([&](const torch::Device& device) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_params = CopyToDevice(params.clone(), device); + torch::Tensor result = + torch::index_put_(params, {indices}, values, accumulate); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put_(lazy_params, {lazy_indices}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + AllEqual(params, lazy_params); + }); + } + } +} + +TEST_F(LazyOpsTest, TestOneIndexPutTransfer) { + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = + torch::ones({3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = + torch::index_put(lazy_params, {indices}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPut) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {5, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutHeadNull) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {3, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = torch::index_put( + params, {indices_null, indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {indices_null, lazy_indices_0, lazy_indices_1}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutMiddleNull) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {3, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = torch::index_put( + params, {indices_0, indices_null, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, indices_null, lazy_indices_1}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutTailNull) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_null; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 3, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {3, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = torch::index_put( + params, {indices_0, indices_1, indices_null}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1, indices_null}, + lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutMiddleBroadcast) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {5, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMultiIndexPutTailBroadcast) { + torch::Tensor indices_0 = torch::randint( + -3, 3, {2, 1, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor indices_1 = torch::randint( + -3, 3, {2, 1}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {5, 6, 7}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + torch::Tensor result = + torch::index_put(params, {indices_0, indices_1}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices_0 = CopyToDevice(indices_0, device); + torch::Tensor lazy_indices_1 = CopyToDevice(indices_1, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::index_put( + lazy_params, {lazy_indices_0, lazy_indices_1}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestMaskIndexPut) { + torch::Tensor indices = + torch::tensor({0, 1}, + torch::TensorOptions(torch::kByte).device(DefaultDevice())) + .to(torch::kBool); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor params = + isFloatingType(scalar_type) + ? 
torch::rand( + {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {2, 2}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor values = torch::ones( + {2}, torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + torch::Tensor result = + torch::index_put(params, {indices}, values, accumulate); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_params = CopyToDevice(params, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = + torch::index_put(lazy_params, {lazy_indices}, lazy_values, accumulate); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexPutImpl) { + torch::Tensor indices = torch::randint( + -3, 3, {2, 4, 3}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor values = + torch::ones({3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + for (bool accumulate : {false, true}) { + if (accumulate && IsCuda()) { + GTEST_SKIP(); + } + ForEachDevice([&](const torch::Device& device) { + torch::Tensor params = + isFloatingType(scalar_type) + ? torch::rand( + {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {4, 3, 5, 6, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_params = CopyToDevice(params.clone(), device); + torch::Tensor result = torch::_index_put_impl_( + params, {indices}, values, accumulate, /*unsafe=*/true); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + torch::Tensor lazy_values = CopyToDevice(values, device); + torch::Tensor lazy_result = torch::_index_put_impl_( + lazy_params, {lazy_indices}, lazy_values, accumulate, /*unsafe=*/true); + AllEqual(result, lazy_result); + AllEqual(params, lazy_params); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithScalar) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Scalar value = 42; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::index_fill(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_result = + torch::index_fill(lazy_base, dim, lazy_index, value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithScalarInPlace) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Scalar value = 42; + int rank = 3; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {3, 4, 5}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_fill_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_result = lazy_base.index_fill_(dim, lazy_index, value); + AllEqual(result, lazy_result); + AllEqual(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithTensor) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor value = torch::scalar_tensor( + 42, torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::index_fill(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_fill(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillWithTensorInPlace) { + torch::Tensor index = torch::tensor( + {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor value = torch::scalar_tensor( + 42, torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = 3; + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? 
torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {3, 4, 5}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_fill_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + lazy_base.index_fill_(dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + AllEqual(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexFillRank0) { + torch::Tensor index = torch::scalar_tensor( + 2, torch::TensorOptions(torch::kLong).device(DefaultDevice())); + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {3, 4, 5}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor value = torch::scalar_tensor( + 42, torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::index_fill(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_fill(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexAdd) { + int index_size = 10; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + for (torch::ScalarType index_scalar_type : {torch::kInt, torch::kLong}) { + torch::Tensor index = torch::randint( + 0, base.size(dim), {index_size}, + torch::TensorOptions(index_scalar_type).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = index_size; + torch::Tensor value = + isFloatingType(scalar_type) + ? 
torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, value_sizes, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor result = torch::index_add(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_add(lazy_base, dim, lazy_index, lazy_value); + AllClose(result, lazy_result); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestIndexAddInPlace) { + int index_size = 10; + int rank = 3; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {5, 3, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor index = torch::randint( + 0, base.size(dim), {index_size}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = index_size; + torch::Tensor value = + isFloatingType(scalar_type) + ? torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, value_sizes, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_add_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + lazy_base.index_add_(dim, lazy_index, lazy_value); + AllClose(result, lazy_result); + AllClose(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexAddRank0) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor index = torch::randint( + 0, base.size(dim), at::IntArrayRef{}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = 1; + torch::Tensor value = + isFloatingType(scalar_type) + ? 
torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor result = torch::index_add(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_add(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexCopy) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor index = torch::randperm( + base.size(dim), + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor value = + isFloatingType(scalar_type) + ? torch::rand( + base.sizes(), + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, base.sizes(), + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor result = torch::index_copy(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_copy(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexCopyInPlace) { + if (IsCuda()) { + GTEST_SKIP(); + } + int index_size = 10; + int rank = 3; + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, {5, 3, 7}, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor index = torch::randint( + 0, base.size(dim), {index_size}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = index_size; + torch::Tensor value = + isFloatingType(scalar_type) + ? 
torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint(100, value_sizes, + torch::TensorOptions(scalar_type) + .device(DefaultDevice())); + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.index_copy_(dim, index, value); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + lazy_base.index_copy_(dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + AllEqual(base, lazy_base); + }); + } + } +} + +TEST_F(LazyOpsTest, TestIndexCopyRank0) { + for (torch::ScalarType scalar_type : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor base = + isFloatingType(scalar_type) + ? torch::rand( + {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, {5, 3, 7}, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + int rank = base.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor index = torch::randint( + 0, base.size(dim), at::IntArrayRef{}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + std::vector<int64_t> value_sizes(base.sizes().begin(), + base.sizes().end()); + int canonical_dim = dim < 0 ? dim + rank : dim; + value_sizes[canonical_dim] = 1; + torch::Tensor value = + isFloatingType(scalar_type) + ? torch::rand( + value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())) + : torch::randint( + 100, value_sizes, + torch::TensorOptions(scalar_type).device(DefaultDevice())); + torch::Tensor result = torch::index_copy(base, dim, index, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_index = CopyToDevice(index, device); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = + torch::index_copy(lazy_base, dim, lazy_index, lazy_value); + AllEqual(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestRelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::relu(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::relu(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::relu_(input); + torch::Tensor lazy_output = torch::relu_(lazy_input); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestHardshrink) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::hardshrink(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::hardshrink(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardSigmoid) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::hardsigmoid(input); + ForEachDevice([&](const 
torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::hardsigmoid(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardSigmoidInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::hardsigmoid_(input); + torch::Tensor lazy_output = torch::hardsigmoid_(lazy_input); + AllClose(input, lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardSigmoidBackward) { + auto testfn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::hardsigmoid(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({10}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestSoftshrink) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::softshrink(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::softshrink(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardtanh) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::hardtanh(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::hardtanh(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestHardtanhInPlace) { + torch::Tensor input = torch::randn( + {10}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::hardtanh_(input); + torch::Tensor lazy_output = torch::hardtanh_(lazy_input); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestLeakyRelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double negative_slope = 0.01; + torch::Tensor output = torch::leaky_relu(input, negative_slope); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::leaky_relu(lazy_input, negative_slope); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestLeakyReluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double negative_slope = 0.01; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::leaky_relu_(input, negative_slope); + torch::Tensor lazy_output = torch::leaky_relu_(lazy_input, negative_slope); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestExp) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::exp(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + 
torch::Tensor lazy_b = torch::exp(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestExpm1) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::expm1(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::expm1(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog2) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log2(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log2(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog10) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log10(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log10(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLog1p) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::log1p(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::log1p(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestErf) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::erf(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::erf(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestErfc) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::erfc(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::erfc(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestErfinv) { + torch::Tensor a = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::erfinv(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::erfinv(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSqrt) { + torch::Tensor a = torch::abs(torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor b = torch::sqrt(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::sqrt(lazy_a); + AllClose(b, lazy_b, 
/*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestRsqrt) { + torch::Tensor a = torch::abs(torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor b = torch::rsqrt(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::rsqrt(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestReciprocal) { + torch::Tensor a = torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::reciprocal(a); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::reciprocal(lazy_a); + AllClose(b, lazy_b, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorScalar) { + torch::Tensor base = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar exponent = 4.09; + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_result = torch::pow(lazy_base, exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorScalarInPlace) { + torch::Tensor base = torch::rand( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar exponent = 4.09; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.pow_(exponent); + torch::Tensor lazy_result = lazy_base.pow_(exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(base, lazy_base, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorTensor) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor exponent = torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = torch::pow(lazy_base, lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorTensorInPlace) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor exponent = torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base.clone(), device); + torch::Tensor result = base.pow_(exponent); + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = lazy_base.pow_(lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + AllClose(base, lazy_base, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowTensorTensorBroadcast) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Tensor exponent = torch::rand( + {4, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::pow(base, exponent); + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = torch::pow(lazy_base, lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowScalarTensor) { + torch::Scalar base = 3.5; + torch::Tensor exponent = torch::rand({4, 2}); + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_exponent = CopyToDevice(exponent, device); + torch::Tensor lazy_result = torch::pow(base, lazy_exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestPowIntExponent) { + torch::Tensor base = torch::abs(torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))); + torch::Scalar exponent = 3; + torch::Tensor result = torch::pow(base, exponent); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_base = CopyToDevice(base, device); + torch::Tensor lazy_result = torch::pow(lazy_base, exponent); + AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestFmodScalar) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Scalar divisor = 2.0; + torch::Tensor b = torch::fmod(a, divisor); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = torch::fmod(lazy_a, divisor); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestFmodScalarInPlace) { + torch::Scalar divisor = 2.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::rand( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = a.fmod_(divisor); + torch::Tensor lazy_b = lazy_a.fmod_(divisor); + AllClose(b, lazy_b); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestFmodTensor) { + torch::Tensor a = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + torch::Tensor c = torch::fmod(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::fmod(lazy_a, lazy_b); + AllClose(c, lazy_c); + }); +} + +TEST_F(LazyOpsTest, TestFmodTensorInPlace) { + torch::Tensor b = + torch::rand({2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::rand( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.fmod_(b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.fmod_(lazy_b); + AllClose(c, lazy_c); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestRemainderScalar) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Scalar divisor = -2.0; + torch::Tensor b = torch::remainder(a, divisor); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + 
torch::Tensor lazy_b = torch::remainder(lazy_a, divisor); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestRemainderScalarInPlace) { + torch::Scalar divisor = -2.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::randn( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor b = a.remainder_(divisor); + torch::Tensor lazy_b = lazy_a.remainder_(divisor); + AllClose(b, lazy_b); + AllClose(a, lazy_a); + }); +} + +TEST_F(LazyOpsTest, TestRemainderTensor) { + torch::Tensor a = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor b = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + torch::Tensor c = torch::remainder(a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = torch::remainder(lazy_a, lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-4, /*atol=*/1e-6); + }); +} + +TEST_F(LazyOpsTest, TestRemainderTensorInPlace) { + torch::Tensor b = + torch::randn( + {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 10.0; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor a = + torch::randn( + {2, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())) * + 100.0; + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor c = a.remainder_(b); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.remainder_(lazy_b); + AllClose(c, lazy_c, /*rtol=*/1e-4, /*atol=*/1e-6); + AllClose(a, lazy_a, /*rtol=*/1e-4, /*atol=*/1e-6); + }); +} + +TEST_F(LazyOpsTest, TestWhere) { + torch::Tensor a = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 3}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + c[i][j] = i == j; + } + } + torch::Tensor d = torch::where(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::where(lazy_c, lazy_a, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestWhereBroadcast) { + torch::Tensor a = torch::rand( + {3, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::zeros( + {}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::empty( + {3, 3}, torch::TensorOptions(torch::kByte).device(DefaultDevice())); + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + c[i][j] = i == j; + } + } + torch::Tensor d = torch::where(c, a, b); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = torch::where(lazy_c, lazy_a, lazy_b); + AllClose(d, lazy_d); + }); +} + +TEST_F(LazyOpsTest, TestThreshold) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + float threshold = 0.4; + float value 
= 20; + torch::Tensor output = torch::threshold(input, threshold, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::threshold(lazy_input, threshold, value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestThresholdBackward) { + float threshold = 0.4; + float value = 20; + + auto testFunction = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor { + return torch::threshold(inputs[0], threshold, value); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testFunction); + }); +} + +TEST_F(LazyOpsTest, TestThresholdInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.clone(); + float threshold = 0.4; + float value = 20; + torch::threshold_(output, threshold, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = CopyToDevice(input, device); + torch::threshold_(lazy_output, threshold, value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestElu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar scale = 2.5; + torch::Scalar input_scale = 1.5; + torch::Tensor output = torch::elu(input, alpha, scale, input_scale); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::elu(lazy_input, alpha, scale, input_scale); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestEluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 0.5; + torch::Scalar scale = 2.5; + torch::Scalar input_scale = 1.5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::elu_(input, alpha, scale, input_scale); + torch::Tensor lazy_output = + torch::elu_(lazy_input, alpha, scale, input_scale); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestSelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::selu(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::selu(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSeluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::selu_(input); + torch::Tensor lazy_output = torch::selu_(lazy_input); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestCelu) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 2.5; + torch::Tensor output = torch::celu(input, alpha); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = 
CopyToDevice(input, device); + torch::Tensor lazy_output = torch::celu(lazy_input, alpha); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestCeluInPlace) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar alpha = 2.5; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::celu_(input, alpha); + torch::Tensor lazy_output = torch::celu_(lazy_input, alpha); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestGelu) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::gelu(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::gelu(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAddMatMul) { + int in_channels = 32; + int out_channels = 320; + int labels = 50; + torch::Tensor input = + torch::rand({in_channels, out_channels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({out_channels, labels}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = torch::rand( + {labels}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test beta != 1. through the CPU interop. + for (double beta : {1., 2.}) { + torch::Tensor output = torch::addmm(bias, input, weight, /*beta=*/beta); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = CopyToDevice(bias, device); + torch::Tensor lazy_output = + torch::addmm(lazy_bias, lazy_input, lazy_weight, /*beta=*/beta); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestEmbedding) { + torch::Tensor a = torch::rand( + {32, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor i = torch::randint( + 0, 31, {3, 4}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor b = + torch::embedding(a, i, /*padding_idx=*/0, /*scale_grad_by_freq=*/false, + /*sparse=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_i = CopyToDevice(i, device); + torch::Tensor lazy_b = torch::embedding(lazy_a, lazy_i, /*padding_idx=*/0, + /*scale_grad_by_freq=*/false, + /*sparse=*/false); + AllClose(b, lazy_b); + }); +} + +TEST_F(LazyOpsTest, TestOneHot) { + int num_classes = 5; + torch::Tensor input = torch::randint( + 0, num_classes, {10}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor output = torch::one_hot(input, num_classes); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::one_hot(lazy_input, num_classes); + AllEqual(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTranspose) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::t(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::t(lazy_input); + AllClose(output, 
lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTransposeInPlace) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.t_(); + torch::Tensor lazy_output = lazy_input.t_(); + EXPECT_EQ(lazy_output.sizes(), output.sizes()); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestReshape) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::reshape(input, {-1, 320}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::reshape(lazy_input, {-1, 320}); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestResize) { + // Testing a resize_() with target size bigger than original size is not + // possible, as we fill with zeros, while pytorch fills with random garbage. + torch::Tensor input = torch::rand( + {2, 2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor saved_input = input.clone(); + input.resize_({3, 3}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(saved_input, device); + lazy_input.resize_({3, 3}); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestViewResize) { + torch::Tensor input = torch::zeros( + {8, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor saved_input = input.clone(); + torch::Tensor output = input.view({4, 4}); + output.resize_({3, 3}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(saved_input, device); + torch::Tensor lazy_output = lazy_input.view({4, 4}); + lazy_output.resize_({3, 3}); + AllClose(input, lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestView) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.view({-1, 320}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = lazy_input.view({-1, 320}); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestViewMod) { + torch::Tensor input = + torch::zeros({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.view({-1, 320}); + output.add_(one, 1.0); + input.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + {32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = lazy_input.view({-1, 320}); + lazy_output.add_(lazy_one, 1.0); + lazy_input.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestViewModComplex) { + torch::Tensor input = + torch::zeros({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); 
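+ // Note: output1 and output2 below are two views of the same base tensor, so each in-place add must be reflected through the shared storage in both views.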
+ torch::Tensor output1 = input.view({-1, 320}); + output1.add_(one, 1.0); + torch::Tensor output2 = input.view({-1, 160}); + output2.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + {32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output1 = lazy_input.view({-1, 320}); + lazy_output1.add_(lazy_one, 1.0); + torch::Tensor lazy_output2 = lazy_input.view({-1, 160}); + lazy_output2.add_(lazy_one, 1.0); + AllClose(output1, lazy_output1); + AllClose(output2, lazy_output2); + }); +} + +TEST_F(LazyOpsTest, TestViewOfViewMod) { + torch::Tensor input = + torch::zeros({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output1 = input.view({-1, 320}); + output1.add_(one, 1.0); + torch::Tensor output2 = output1.view({-1, 160}); + output2.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + {32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output1 = lazy_input.view({-1, 320}); + lazy_output1.add_(lazy_one, 1.0); + torch::Tensor lazy_output2 = lazy_output1.view({-1, 160}); + lazy_output2.add_(lazy_one, 1.0); + AllClose(output1, lazy_output1); + AllClose(output2, lazy_output2); + }); +} + +TEST_F(LazyOpsTest, TestViewSqueezeAddInPlace) { + torch::Tensor input = torch::zeros( + {2, 3, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector<int64_t> view_size = {2, 3, 1, 1}; + int squeeze_dim = 2; + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.view(view_size); + output.squeeze_(squeeze_dim); + output.add_(one, 1.0); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = lazy_input.view(view_size); + lazy_output.squeeze_(squeeze_dim); + lazy_output.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestUnsafeView) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::_unsafe_view(input, {-1, 320}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::_unsafe_view(lazy_input, {-1, 320}); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestNarrow) { + torch::Tensor a = + torch::rand({8, 10, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t dim : {1, -3}) { + for (int64_t start : {2, -8}) { + torch::Tensor b = a.narrow(dim, start, 6); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = lazy_a.narrow(dim, start, 6); + AllClose(b, lazy_b); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdate) { + for (int64_t dim : {1, -2}) { + for (int64_t start : {2, -6}) { + torch::Tensor a = torch::rand( + {3, 8, 3}, + 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::rand( + {3, 4, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start, 4); + c.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start, 4); + lazy_c.add_(lazy_b, 1.0); + AllClose(c, lazy_c); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdateBaseCheck) { + for (int64_t dim : {0, -2}) { + for (int64_t start : {2, -6}) { + torch::Tensor a = torch::zeros( + {8, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::ones( + {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start, 4); + c.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start, 4); + lazy_c.add_(lazy_b, 1.0); + AllClose(a, lazy_a); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdateTwoSlices) { + for (int64_t dim : {0, -2}) { + for (int64_t start0 : {2, -6}) { + for (int64_t start1 : {6, -2}) { + torch::Tensor a = torch::zeros( + {8, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::ones( + {2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = b + 1; + torch::Tensor d = a.narrow(dim, start0, 2); + torch::Tensor e = a.narrow(dim, start1, 2); + d.add_(b, 1.0); + e.add_(c, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + torch::Tensor lazy_d = lazy_a.narrow(dim, start0, 2); + torch::Tensor lazy_e = lazy_a.narrow(dim, start1, 2); + lazy_d.add_(lazy_b, 1.0); + lazy_e.add_(lazy_c, 1.0); + AllClose(d, lazy_d); + AllClose(e, lazy_e); + AllClose(a, lazy_a); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestNarrowUpdateView) { + for (int64_t dim : {0, -3}) { + for (int64_t start : {2, -6}) { + torch::Tensor a = torch::rand( + {8, 2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::rand( + {4, 6}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start, 4); + torch::Tensor d = c.view({4, 6}); + d.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start, 4); + torch::Tensor lazy_d = lazy_c.view({4, 6}); + lazy_d.add_(lazy_b, 1.0); + AllClose(d, lazy_d); + }); + } + } +} + +TEST_F(LazyOpsTest, TestNarrowInNarrowUpdate) { + for (int64_t dim : {1, -2}) { + for (int64_t start0 : {1, -7}) { + for (int64_t start1 : {1, -5}) { + torch::Tensor a = torch::rand( + {3, 8, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor a_copy = a.clone(); + torch::Tensor b = torch::rand( + {3, 2, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = a.narrow(dim, start0, 6); + torch::Tensor d = 
c.narrow(dim, start1, 2); + d.add_(b, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a_copy, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = lazy_a.narrow(dim, start0, 6); + torch::Tensor lazy_d = lazy_c.narrow(dim, start1, 2); + lazy_d.add_(lazy_b, 1.0); + AllClose(a, lazy_a); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestNarrowCopy) { + for (int64_t dim : {1, -3}) { + for (int64_t start : {2, -8}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {8, 10, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = input.narrow_copy(dim, start, 6); + input.add_(1); + torch::Tensor lazy_result = lazy_input.narrow_copy(dim, start, 6); + lazy_input.add_(1); + AllClose(result, lazy_result); + }); + } + } +} + +TEST_F(LazyOpsTest, TestViewAs) { + torch::Tensor input = + torch::rand({32, 20, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor empty = torch::empty({32, 320}); + torch::Tensor output = input.view_as(empty); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_empty = CopyToDevice(empty, device); + torch::Tensor lazy_output = lazy_input.view_as(lazy_empty); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmax) { + torch::Tensor input = + torch::rand({5, 3, 4, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::log_softmax(input, dim); + torch::Tensor lazy_output = torch::log_softmax(lazy_input, dim); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmaxCast) { + torch::Tensor input = + torch::rand({5, 3, 4, 2}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::log_softmax(input, dim, torch::kDouble); + torch::Tensor lazy_output = + torch::log_softmax(lazy_input, dim, torch::kDouble); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmaxWrapper) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = + torch::_log_softmax(input, dim, /*half_to_float=*/false); + torch::Tensor lazy_output = + torch::_log_softmax(lazy_input, dim, /*half_to_float=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftmax) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::softmax(input, dim); + torch::Tensor lazy_output = 
torch::softmax(lazy_input, dim); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftmaxCast) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::softmax(input, dim, torch::kDouble); + torch::Tensor lazy_output = torch::softmax(lazy_input, dim, torch::kDouble); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftmaxWrapper) { + torch::Tensor input = + torch::rand({10, 2, 6, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = + torch::_softmax(input, dim, /*half_to_float=*/false); + torch::Tensor lazy_output = + torch::_softmax(lazy_input, dim, /*half_to_float=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3); + } + }); +} + +TEST_F(LazyOpsTest, TestSoftplus) { + torch::Tensor input = + torch::rand({2, 1, 4, 6}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::softplus(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::softplus(lazy_input); + AllClose(output, lazy_output, /*rtol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestMaxPool1D) { + torch::Tensor input = torch::rand( + {1, 16, 56}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = + torch::max_pool1d(input, /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, /*dilation=*/{dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::max_pool1d(lazy_input, + /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, + /*dilation=*/{dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2D) { + torch::Tensor input = + torch::rand({1, 4, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. 
+ for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::max_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DWithIndices) { + torch::Tensor input = + torch::rand({1, 4, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + auto outputs = torch::max_pool2d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_outputs = torch::max_pool2d_with_indices( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(std::get<0>(outputs), std::get<0>(lazy_outputs)); + AllClose(std::get<1>(outputs), std::get<1>(lazy_outputs)); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DNonSquare) { + torch::Tensor input = + torch::rand({1, 4, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool2d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3D) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. 
+ for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DWithIndices) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + auto outputs = torch::max_pool3d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + auto lazy_outputs = torch::max_pool3d_with_indices( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + AllClose(std::get<0>(outputs), std::get<0>(lazy_outputs)); + AllClose(std::get<1>(outputs), std::get<1>(lazy_outputs)); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DIncompleteAttributes) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DNonSquare) { + torch::Tensor input = + torch::rand({1, 1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DNoBatch) { + torch::Tensor input = torch::rand( + {4, 14, 14}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::max_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DNoBatch) { + torch::Tensor input = + torch::rand({1, 8, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output = torch::max_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::max_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool1D) { + torch::Tensor input = torch::rand( + {4, 1, 28}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + torch::Tensor output = + torch::avg_pool1d(input, /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::avg_pool1d(lazy_input, + /*kernel_size=*/{kernel_size}, + /*stride=*/{stride}, + /*padding=*/{padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2D) { + torch::Tensor input = + torch::rand({2, 1, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + // torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::avg_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output.to(torch::kCPU)); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2DNonSquare) { + torch::Tensor input = + torch::rand({2, 1, 14, 14}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool2d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1}, + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3D) { + torch::Tensor input = + torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DIncompleteAttributes) { + torch::Tensor input = + torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding, padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DNonSquare) { + torch::Tensor input = + torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 4; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size}, + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2DNoBatch) { + torch::Tensor input = torch::rand( + {1, 7, 7}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool2d( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::avg_pool2d(lazy_input, + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DNoBatch) { + torch::Tensor input = + torch::rand({1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + torch::Tensor output = torch::avg_pool3d( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::avg_pool3d( + lazy_input, + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool2D) { + torch::Tensor input = + torch::rand({4, 1, 28, 28}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 4}) { + torch::Tensor output = + torch::adaptive_avg_pool2d(input, {output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::adaptive_avg_pool2d(lazy_input, {output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3D) { + torch::Tensor input = + torch::rand({9, 4, 56, 28, 28}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 4}) { + torch::Tensor output = torch::adaptive_avg_pool3d( + input, {output_size, output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::adaptive_avg_pool3d( + lazy_input, {output_size, output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DNoBatch) { + torch::Tensor input = + torch::rand({3, 56, 28, 28}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 4}) { + torch::Tensor output = torch::adaptive_avg_pool3d( + input, {output_size, output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::adaptive_avg_pool3d( + lazy_input, {output_size, output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, 
TestAdaptiveAvgPool2DNoBatch) { + torch::Tensor input = torch::rand( + {1, 56, 56}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int64_t output_size : {7, 8}) { + torch::Tensor output = + torch::adaptive_avg_pool2d(input, {output_size, output_size}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::adaptive_avg_pool2d(lazy_input, {output_size, output_size}); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool2D) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({2, 2, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool2d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector<int64_t> output_size({input.size(2), input.size(3)}); + at::Tensor utensor = + torch::max_unpool2d(output, indices, output_size); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = CopyToDevice(output, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + at::Tensor lazy_utensor = + torch::max_unpool2d(lazy_output, lazy_indices, output_size); + AllClose(utensor, lazy_utensor); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool3D) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({1, 1, 4, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + // Test dilation through the CPU interop. + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool3d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector<int64_t> output_size( + {input.size(2), input.size(3), input.size(4)}); + at::Tensor utensor = torch::max_unpool3d( + output, indices, output_size, /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = CopyToDevice(output, device); + torch::Tensor lazy_indices = CopyToDevice(indices, device); + at::Tensor lazy_utensor = + torch::max_unpool3d(lazy_output, lazy_indices, output_size, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}); + AllClose(utensor, lazy_utensor); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLoss) { + + // TODO(whc) debug divide-by-zero failure under ASAN + GTEST_SKIP(); + + int batch = 6; + int classes = 2; + // TODO(asuhan): Fix the torch::kDouble case. 
+ for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes}, + torch::TensorOptions(dtype).device(DefaultDevice())); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + torch::Tensor output = + torch::nll_loss(/*self=*/input, /*target=*/target, + /*weight=*/weight, + /*reduction=*/reduction, + /*ignore_index=*/ignore_index); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + def_weight ? CopyToDevice(weight, device) : torch::Tensor(); + torch::Tensor lazy_output = torch::nll_loss( + /*self=*/lazy_input, /*target=*/lazy_target, + /*weight=*/lazy_weight, + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLoss2d) { + int batch = 6; + int classes = 2; + int height = 3; + int width = 3; + // TODO(asuhan): Fix the torch::kDouble case. + for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes, height, width}, + torch::TensorOptions(dtype).device(DefaultDevice())); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch, height, width}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + torch::Tensor output = + torch::nll_loss2d(/*self=*/input, /*target=*/target, + /*weight=*/weight, + /*reduction=*/reduction, + /*ignore_index=*/ignore_index); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_weight = + def_weight ? 
CopyToDevice(weight, device) : torch::Tensor(); + torch::Tensor lazy_output = torch::nll_loss2d( + /*self=*/lazy_input, /*target=*/lazy_target, + /*weight=*/lazy_weight, + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestSmoothL1Loss) { + torch::Tensor input = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + for (double beta : {0.25, 1.}) { + torch::Tensor output = + torch::smooth_l1_loss(input, target, reduction, beta); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::smooth_l1_loss(lazy_input, lazy_target, reduction, beta); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestL1Loss) { + torch::Tensor input = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + torch::Tensor output = torch::l1_loss(input, target, reduction); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::l1_loss(lazy_input, lazy_target, reduction); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestL1LossBackward) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::l1_loss(inputs[0], inputs[1], reduction); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestMseLoss) { + torch::Tensor input = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + torch::Tensor output = torch::mse_loss(input, target, reduction); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_target = CopyToDevice(target, device); + torch::Tensor lazy_output = + torch::mse_loss(lazy_input, lazy_target, reduction); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestMseLossBackward) { + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::mse_loss(inputs[0], inputs[1], reduction); + }; + ForEachDevice([&](const torch::Device& 
device) { + TestBackward({torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestBatchNorm1D) { + int num_features = 3; + torch::Tensor input = + torch::rand({2, num_features, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_mean = + torch::zeros({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = + torch::ones({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double momentum = 0.1; + double eps = 0.5; + torch::Tensor undef; + for (bool training : {true, false}) { + for (bool undef_weight_bias : {false, true}) { + torch::Tensor output = torch::batch_norm( + /*input=*/input, /*weight=*/undef_weight_bias ? undef : weight, + /*bias=*/undef_weight_bias ? undef : bias, + /*running_mean=*/running_mean, /*running_var=*/running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = + undef_weight_bias ? undef : CopyToDevice(weight, device); + torch::Tensor lazy_bias = + undef_weight_bias ? undef : CopyToDevice(bias, device); + torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device); + torch::Tensor lazy_running_var = CopyToDevice(running_var, device); + torch::Tensor lazy_output = torch::batch_norm( + /*input=*/lazy_input, /*weight=*/lazy_weight, /*bias=*/lazy_bias, + /*running_mean=*/lazy_running_mean, /*running_var=*/lazy_running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestBatchNorm2D) { + int num_features = 3; + torch::Tensor input = + torch::rand({2, num_features, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor bias = + torch::rand({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_mean = + torch::zeros({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = + torch::ones({num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + double momentum = 0.1; + double eps = 0.5; + torch::Tensor undef; + for (bool training : {true, false}) { + for (bool undef_weight_bias : {false, true}) { + torch::Tensor output = torch::batch_norm( + /*input=*/input, /*weight=*/undef_weight_bias ? undef : weight, + /*bias=*/undef_weight_bias ? undef : bias, + /*running_mean=*/running_mean, /*running_var=*/running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = + undef_weight_bias ? 
undef : CopyToDevice(weight, device); + torch::Tensor lazy_bias = + undef_weight_bias ? undef : CopyToDevice(bias, device); + torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device); + torch::Tensor lazy_running_var = CopyToDevice(running_var, device); + torch::Tensor lazy_output = torch::batch_norm( + /*input=*/lazy_input, /*weight=*/lazy_weight, /*bias=*/lazy_bias, + /*running_mean=*/lazy_running_mean, /*running_var=*/lazy_running_var, + /*training=*/training, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + AllClose(output, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } +} + +TEST_F(LazyOpsTest, TestDim) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + EXPECT_EQ(input.dim(), lazy_input.dim()); + }); +} + +TEST_F(LazyOpsTest, TestContiguous) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::native::contiguous(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::native::contiguous(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSqueezeAll) { + torch::Tensor input = + torch::rand({2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::squeeze(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::squeeze(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSqueezeAllInPlace) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.squeeze_(); + torch::Tensor lazy_output = lazy_input.squeeze_(); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + ASSERT_EQ(input.dim(), lazy_input.dim()); + for (int64_t dim_idx = 0; dim_idx < input.dim(); ++dim_idx) { + ASSERT_EQ(input.size(dim_idx), lazy_input.size(dim_idx)); + } + }); +} + +TEST_F(LazyOpsTest, TestSqueezeOne) { + torch::Tensor input = + torch::rand({2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::squeeze(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::squeeze(lazy_input, dim); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestSqueezeOneInPlace) { + int rank = 4; + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {2, 1, 3, 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.squeeze_(dim); + torch::Tensor lazy_output = lazy_input.squeeze_(dim); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + ASSERT_EQ(input.dim(), lazy_input.dim()); + for (int64_t dim_idx = 0; dim_idx < input.dim(); ++dim_idx) { + ASSERT_EQ(input.size(dim_idx), lazy_input.size(dim_idx)); 
+ } + }); + } +} + +TEST_F(LazyOpsTest, TestUnsqueeze) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim() + 1; + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor output = torch::unsqueeze(input, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::unsqueeze(lazy_input, dim); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestUnsqueezeInPlace) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim() + 1; + for (int dim = -rank; dim < rank; ++dim) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.unsqueeze_(dim); + torch::Tensor lazy_output = lazy_input.unsqueeze_(dim); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + ASSERT_EQ(input.dim(), lazy_input.dim()); + for (int64_t dim_idx = 0; dim_idx < input.dim(); ++dim_idx) { + ASSERT_EQ(input.size(dim_idx), lazy_input.size(dim_idx)); + } + }); + } +} + +TEST_F(LazyOpsTest, TestMaskedFill) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor mask = torch::randint( + 0, 2, {2, 3}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Scalar value(42); + torch::Tensor result = torch::masked_fill(input, mask, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_mask = CopyToDevice(mask, device); + torch::Tensor lazy_result = torch::masked_fill(lazy_input, lazy_mask, value); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestMaskedFillInPlace) { + torch::Scalar value(42); + torch::Tensor mask = torch::randint( + 0, 2, {2, 3}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_mask = CopyToDevice(mask, device); + torch::Tensor result = input.masked_fill_(mask, value); + torch::Tensor lazy_result = lazy_input.masked_fill_(lazy_mask, value); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestMaskedFillBroadcast) { + torch::Tensor input = + torch::rand({2, 5, 4, 3}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor mask = torch::randint( + 0, 2, {4, 1}, torch::TensorOptions(torch::kBool).device(DefaultDevice())); + torch::Scalar value(42); + torch::Tensor result = torch::masked_fill(input, mask, value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_mask = CopyToDevice(mask, device); + torch::Tensor lazy_result = torch::masked_fill(lazy_input, lazy_mask, value); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestFill) { + torch::Scalar value(42); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::empty( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::fill_(input, value); + torch::Tensor lazy_result = 
torch::fill_(lazy_input, value); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestFillWithRank0) { + torch::Tensor value = torch::scalar_tensor(42); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::empty( + {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = torch::fill_(input, value); + torch::Tensor lazy_value = CopyToDevice(value, device); + torch::Tensor lazy_result = torch::fill_(lazy_input, value); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestPermute) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector> dims_permutations = { + {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}}; + int rank = input.dim(); + for (std::vector dims_permutation : dims_permutations) { + for (bool negative_dims : {false, true}) { + if (negative_dims) { + std::for_each(dims_permutation.begin(), dims_permutation.end(), + [rank](int64_t& dim) { dim -= rank; }); + } + torch::Tensor output = input.permute(dims_permutation); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = lazy_input.permute(dims_permutation); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestPermuteMod) { + std::vector> dims_permutations = { + {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}}; + std::vector input_sizes = {2, 3, 4}; + int rank = input_sizes.size(); + for (std::vector dims_permutation : dims_permutations) { + for (bool negative_dims : {false, true}) { + if (negative_dims) { + std::for_each(dims_permutation.begin(), dims_permutation.end(), + [rank](int64_t& dim) { dim -= rank; }); + } + torch::Tensor input = torch::zeros( + input_sizes, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = input.permute(dims_permutation); + output.add_(one, 1.0); + input.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + input_sizes, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = lazy_input.permute(dims_permutation); + lazy_output.add_(lazy_one, 1.0); + lazy_input.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } + } +} + +TEST_F(LazyOpsTest, TestFlip) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector> dim_powerset = { + {0}, {1}, {2}, {0, 1}, {1, 2}, {2, 0}, {0, 1, 2}}; + for (std::vector flip_dims : dim_powerset) { + for (bool negative_dims : {false, true}) { + if (negative_dims) { + std::for_each(flip_dims.begin(), flip_dims.end(), + [](int64_t& dim) { dim -= 3; }); + } + torch::Tensor output = torch::flip(input, flip_dims); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::flip(lazy_input, flip_dims); + AllClose(output, lazy_output); + }); + } + } +} + +TEST_F(LazyOpsTest, TestPixelShuffle) { + torch::Tensor input = + 
torch::rand({5, 18, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int upscale_factor = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = torch::pixel_shuffle(input, upscale_factor); + torch::Tensor lazy_output = torch::pixel_shuffle(lazy_input, upscale_factor); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestSumToSize) { + torch::Tensor input = + torch::rand({4, 6, 3, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector out_size = {4, 1, 1, 7}; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.sum_to_size(out_size); + torch::Tensor lazy_output = lazy_input.sum_to_size(out_size); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTransposeDims) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int dim0 = 0; + int dim1 = 2; + torch::Tensor output = torch::transpose(input, dim0, dim1); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::transpose(lazy_input, dim0, dim1); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTransposeDimsMod) { + std::vector input_sizes = {2, 3, 4}; + int dim0 = 0; + int dim1 = 2; + torch::Tensor input = torch::zeros( + input_sizes, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor one = torch::tensor( + 1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::transpose(input, dim0, dim1); + output.add_(one, 1.0); + input.add_(one, 1.0); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor xinput = torch::zeros( + input_sizes, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(xinput, device); + torch::Tensor lazy_one = CopyToDevice(one, device); + torch::Tensor lazy_output = torch::transpose(lazy_input, dim0, dim1); + lazy_output.add_(lazy_one, 1.0); + lazy_input.add_(lazy_one, 1.0); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestTransposeDimsInPlace) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int dim0 = 0; + int dim1 = 2; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.transpose_(dim0, dim1); + torch::Tensor lazy_output = lazy_input.transpose_(dim0, dim1); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestSplit) { + torch::Tensor input = torch::rand( + {7, 8, 9}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int split_size : {2, 3}) { + for (int dim = -rank; dim < rank; ++dim) { + std::vector outputs = torch::split(input, split_size, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector lazy_outputs = + torch::split(lazy_input, split_size, dim); + ASSERT_EQ(outputs.size(), lazy_outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) { + AllClose(outputs[i], lazy_outputs[i]); + } + }); + } + } +} + +TEST_F(LazyOpsTest, TestSplitEmpty) { + torch::Tensor input = torch::rand( + {0}, 
torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int split_size = 0; + int dim = 0; + std::vector outputs = torch::split(input, split_size, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector lazy_outputs = + torch::split(lazy_input, split_size, dim); + ASSERT_EQ(outputs.size(), lazy_outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) { + AllClose(outputs[i], lazy_outputs[i]); + } + }); +} + +TEST_F(LazyOpsTest, TestSplitWithSizes) { + torch::Tensor input = + torch::rand({15, 15, 15}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = input.dim(); + for (int dim = -rank; dim < rank; ++dim) { + std::vector outputs = + torch::split_with_sizes(input, {4, 5, 6}, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + std::vector lazy_outputs = + torch::split_with_sizes(lazy_input, {4, 5, 6}, dim); + ASSERT_EQ(outputs.size(), lazy_outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) { + AllClose(outputs[i], lazy_outputs[i]); + } + }); + } +} + +TEST_F(LazyOpsTest, TestCrossImplicitDim) { + std::vector> dim_sizes = { + {4, 5, 3}, {4, 3, 5}, {3, 4, 5}}; + for (auto dim_size : dim_sizes) { + torch::Tensor input = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor other = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::cross(input, other); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_other = CopyToDevice(other, device); + torch::Tensor lazy_result = torch::cross(lazy_input, lazy_other); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCrossExplicitDim) { + std::vector dim_size = {3, 3}; + torch::Tensor input = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor other = torch::rand( + dim_size, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + int rank = dim_size.size(); + for (int dim = -rank; dim < rank; ++dim) { + torch::Tensor result = torch::cross(input, other, dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_other = CopyToDevice(other, device); + torch::Tensor lazy_result = torch::cross(lazy_input, lazy_other, dim); + AllClose(result, lazy_result); + }); + } +} + +TEST_F(LazyOpsTest, TestCrossZeroDim) { + torch::Tensor input = + torch::rand({0, 1, 3, 0}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor result = torch::cross(input, input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::cross(lazy_input, lazy_input); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestTriu) { + int size = 5; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::triu(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::triu(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTriuNonSquare) { + int size = 5; + torch::Tensor input = + torch::rand({size, size + 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::triu(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::triu(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTriuBatch) { + int size = 5; + int batch_size = 3; + torch::Tensor input = + torch::rand({batch_size, size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::triu(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::triu(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTril) { + int size = 5; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::tril(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::tril(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTrilNonSquare) { + int size = 5; + torch::Tensor input = + torch::rand({size, size + 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::tril(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::tril(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTrilBatch) { + int size = 5; + int batch_size = 3; + torch::Tensor input = + torch::rand({batch_size, size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::tril(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::tril(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestTriuInPlace) { + int size = 5; + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.triu_(diagonal); + torch::Tensor lazy_output = lazy_input.triu_(diagonal); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } +} + +TEST_F(LazyOpsTest, TestTrilInPlace) { + int size = 5; + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = input.tril_(diagonal); + torch::Tensor lazy_output = lazy_input.tril_(diagonal); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } +} + +TEST_F(LazyOpsTest, TestTrace) { + int n = 5; + torch::Tensor input = torch::rand( + {n, n}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::trace(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::trace(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTraceWide) { + int lines = 3; + int cols = 5; + torch::Tensor input = + torch::rand({lines, cols}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::trace(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::trace(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestTraceNarrow) { + int lines = 5; + int cols = 3; + torch::Tensor input = + torch::rand({lines, cols}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor output = torch::trace(input); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::trace(lazy_input); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestDiagRank1) { + int size = 7; + torch::Tensor input = torch::rand( + {size}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -2 * size; diagonal <= 2 * size; ++diagonal) { + torch::Tensor output = torch::diag(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diag(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagRank2) { + int size = 7; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::diag(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diag(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagFlat) { + torch::Tensor input = + torch::rand({4, 3, 6, 7}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int diagonal = -10; diagonal < 10; ++diagonal) { + torch::Tensor output = torch::diagflat(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diagflat(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonal) { + int size = 5; + torch::Tensor input = + torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::diagonal(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diagonal(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonalUpdate) { + int size = 5; + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + auto input = torch::rand({size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto input_clone = input.clone(); + auto output = torch::diagonal(input, diagonal); + output.add_(1); + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_clone, device); + torch::Tensor lazy_output = torch::diagonal(lazy_input, diagonal); + lazy_output.add_(1); + + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonalNonSquare) { + int size = 5; + torch::Tensor input = + torch::rand({size, size + 1}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). + for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = torch::diagonal(input, diagonal); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::diagonal(lazy_input, diagonal); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestDiagonalBatch) { + int size = 5; + int batch_size = 3; + int dim1 = 1; + int dim2 = 2; + torch::Tensor input = + torch::rand({batch_size, size, size}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + // Test all diagonals and out of bounds (must be no-op). 
+ for (int diagonal = -size; diagonal <= size; ++diagonal) { + torch::Tensor output = + torch::diagonal(input, diagonal, /*dim1=*/dim1, /*dim1=*/dim2); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::diagonal(lazy_input, diagonal, /*dim1=*/dim1, /*dim1=*/dim2); + AllClose(output, lazy_output); + }); + } +} + +TEST_F(LazyOpsTest, TestFlatten) { + torch::Tensor input = torch::rand({4, 7, 5, 3}); + int rank = input.dim(); + for (int pos_start_dim = 0; pos_start_dim < rank; ++pos_start_dim) { + for (int pos_end_dim = pos_start_dim; pos_end_dim < rank; ++pos_end_dim) { + for (bool negative_start_dim : {false, true}) { + for (bool negative_end_dim : {false, true}) { + int start_dim = + negative_start_dim ? pos_start_dim - rank : pos_start_dim; + int end_dim = negative_end_dim ? pos_end_dim - rank : pos_end_dim; + torch::Tensor output = torch::flatten(input, start_dim, end_dim); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::flatten(lazy_input, start_dim, end_dim); + AllClose(output, lazy_output); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestLogicalAnd) { + for (torch::ScalarType scalar_type1 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor lhs = + isFloatingType(scalar_type1) + ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1)) + : torch::randint(0, 100, {3, 4}, + torch::TensorOptions(scalar_type1)); + for (torch::ScalarType scalar_type2 : + {torch::kFloat, torch::kByte, torch::kChar, torch::kShort, torch::kInt, + torch::kLong}) { + torch::Tensor rhs = + isFloatingType(scalar_type2) + ? 
torch::rand({3, 4}, torch::TensorOptions(scalar_type2)) + : torch::randint(1, 100, {3, 4}, + torch::TensorOptions(scalar_type2)); + torch::Tensor result = torch::logical_and(lhs, rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = torch::logical_and(lazy_lhs, lazy_rhs); + AllEqual(result, lazy_result); + }); + } + } + + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("xla::logical_and_out", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestBitwiseAnd) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor result = lhs.__and__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__and__(lazy_rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__iand__(rhs); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__iand__(lazy_rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndScalar) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + torch::Tensor result = lhs.__and__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_result = lazy_lhs.__and__(rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndScalarInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__iand__(rhs); + torch::Tensor lazy_result = lazy_lhs.__iand__(rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseAndPromotion) { + torch::Tensor input = torch::rand( + {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor view = input.reshape(-1); + torch::Tensor result = torch::__and__(view.gt(0), view.ne(0)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_view = lazy_input.reshape(-1); + torch::Tensor lazy_result = torch::__and__(lazy_view.gt(0), lazy_view.ne(0)); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOr) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor result = lhs.__or__(rhs); + 
ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__or__(lazy_rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOrInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ior__(rhs); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__ior__(lazy_rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOrScalar) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + torch::Tensor result = lhs.__or__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_result = lazy_lhs.__or__(rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseOrScalarInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ior__(rhs); + torch::Tensor lazy_result = lazy_lhs.__ior__(rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXor) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor result = lhs.__xor__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__xor__(lazy_rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXorInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Tensor rhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ixor__(rhs); + torch::Tensor lazy_rhs = CopyToDevice(rhs, device); + torch::Tensor lazy_result = lazy_lhs.__ixor__(lazy_rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXorScalar) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + torch::Scalar rhs(123456789); + torch::Tensor result = lhs.__xor__(rhs); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor lazy_result = lazy_lhs.__xor__(rhs); + AllEqual(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestBitwiseXorScalarInPlace) { + torch::Tensor lhs = torch::randint(0, std::numeric_limits::max(), + {4, 2}, torch::TensorOptions(torch::kInt)); + 
torch::Scalar rhs(123456789); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_lhs = CopyToDevice(lhs, device); + torch::Tensor result = lhs.__ixor__(rhs); + torch::Tensor lazy_result = lazy_lhs.__ixor__(rhs); + AllEqual(result, lazy_result); + AllEqual(lhs, lazy_lhs); + }); +} + +TEST_F(LazyOpsTest, TestLshift) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor shift_amount = torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = torch::__lshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = + torch::__lshift__(lazy_input, lazy_shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestLshiftInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor shift_amount = torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = input.__ilshift__(shift_amount); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = lazy_input.__ilshift__(lazy_shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestLshiftScalar) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + torch::Tensor result = torch::__lshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::__lshift__(lazy_input, shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestLshiftScalarInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = input.__ilshift__(shift_amount); + torch::Tensor lazy_result = lazy_input.__ilshift__(shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestRshift) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor shift_amount = torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = torch::__rshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = + torch::__rshift__(lazy_input, lazy_shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestRshiftInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor shift_amount = 
torch::randint( + 16, + input.sizes(), + torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor result = input.__irshift__(shift_amount); + torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device); + torch::Tensor lazy_result = lazy_input.__irshift__(lazy_shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestRshiftScalar) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + torch::Tensor result = torch::__rshift__(input, shift_amount); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_result = torch::__rshift__(lazy_input, shift_amount); + AllClose(result, lazy_result); + }); +} + +TEST_F(LazyOpsTest, TestRshiftScalarInPlace) { + torch::Tensor input = torch::ones( + {4, 2}, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Scalar shift_amount = 3; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor result = input.__irshift__(shift_amount); + torch::Tensor lazy_result = lazy_input.__irshift__(shift_amount); + AllClose(result, lazy_result); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestMeshgrid) { + torch::Tensor a = torch::rand( + {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor b = torch::rand( + {2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor c = torch::rand( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + auto d = torch::meshgrid({a, b, c}); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_a = CopyToDevice(a, device); + torch::Tensor lazy_b = CopyToDevice(b, device); + torch::Tensor lazy_c = CopyToDevice(c, device); + auto lazy_d = torch::meshgrid({lazy_a, lazy_b, lazy_c}); + EXPECT_EQ(d.size(), lazy_d.size()); + for (size_t i = 0; i < d.size(); ++i) { + AllClose(d[i], lazy_d[i]); + } + }); +} + +TEST_F(LazyOpsTest, TestConstantPad) { + torch::Tensor input = torch::rand( + {4, 2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2, 3, 4, 5, 6}; + float pad_value = 5; + torch::Tensor output = torch::constant_pad_nd(input, pad, pad_value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::constant_pad_nd(lazy_input, pad, pad_value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestConstantPadIncomplete) { + torch::Tensor input = torch::rand( + {4, 2, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2}; + float pad_value = 5; + torch::Tensor output = torch::constant_pad_nd(input, pad, pad_value); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::constant_pad_nd(lazy_input, pad, pad_value); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReflectionPad2dRank3) { + torch::Tensor input = torch::rand( + {2, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{2, 2, 2, 2}; + torch::Tensor output = torch::reflection_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor 
lazy_output = torch::reflection_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReflectionPad2dRank4) { + torch::Tensor input = + torch::rand({2, 2, 3, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{2, 2, 2, 2}; + torch::Tensor output = torch::reflection_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::reflection_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReflectionPad2dBackward) { + std::vector pad{2, 3, 1, 2}; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::reflection_pad2d(inputs[0], pad); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({1, 2, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad1d) { + torch::Tensor input = torch::rand( + {1, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2}; + torch::Tensor output = torch::replication_pad1d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad1d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad1dZeroPad) { + torch::Tensor input = torch::rand( + {1, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 0}; + torch::Tensor output = torch::replication_pad1d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad1d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad1dBackward) { + std::vector pad{2, 3}; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::replication_pad1d(inputs[0], pad); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad2d) { + torch::Tensor input = torch::rand( + {1, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 2, 2, 1}; + torch::Tensor output = torch::replication_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad2dZeroPad) { + torch::Tensor input = torch::rand( + {1, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector pad{1, 0, 0, 1}; + torch::Tensor output = torch::replication_pad2d(input, pad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = torch::replication_pad2d(lazy_input, pad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestReplicationPad2dBackward) { + std::vector pad{2, 3, 1, 1}; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::replication_pad2d(inputs[0], pad); + }; + ForEachDevice([&](const 
torch::Device& device) { + TestBackward({torch::rand({2, 3, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestAsStrided) { + torch::Tensor input = torch::rand( + {128, 320}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {128, 20, 4, 4}; + std::vector stride = {320, 16, 4, 1}; + torch::Tensor output = + torch::as_strided(input, /*size=*/size, /*stride=*/stride); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::as_strided(lazy_input, /*size=*/size, /*stride=*/stride); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAsStridedInPlace) { + torch::Tensor input = torch::rand( + {128, 320}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {128, 20, 4, 4}; + std::vector stride = {320, 16, 4, 1}; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor output = + torch::as_strided_(input, /*size=*/size, /*stride=*/stride); + torch::Tensor lazy_output = + torch::as_strided_(lazy_input, /*size=*/size, /*stride=*/stride); + AllClose(output, lazy_output); + AllClose(input, lazy_input); + }); +} + +TEST_F(LazyOpsTest, TestAsStridedWithOffset) { + torch::Tensor input = torch::rand( + {4, 8, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {4, 4, 2}; + std::vector stride = {8, 2, 1}; + int64_t storage_offset = 4; + torch::Tensor output = + torch::as_strided(input, /*size=*/size, /*stride=*/stride, + /*storage_offset=*/storage_offset); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_output = + torch::as_strided(lazy_input, /*size=*/size, /*stride=*/stride, + /*storage_offset=*/storage_offset); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestAsStridedWithInplaceCopy) { + torch::Tensor grad = torch::ones( + {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + std::vector size = {4}; + std::vector stride = {1}; + torch::Tensor output = torch::zeros({4}, grad.options()); + output.as_strided(size, stride).copy_(grad); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_grad = CopyToDevice(grad, device); + torch::Tensor lazy_output = torch::zeros({4}, lazy_grad.options()); + lazy_output.as_strided(size, stride).copy_(lazy_grad); + AllClose(output, lazy_output); + }); +} + +TEST_F(LazyOpsTest, TestEmptyStrided) { + std::vector size = {4, 4, 2}; + std::vector stride = {8, 2, 1}; + torch::Tensor output = torch::empty_strided(/*size=*/size, /*stride=*/stride); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_output = + torch::empty_strided(/*size=*/size, /*stride=*/stride); + EXPECT_EQ(output.sizes(), lazy_output.sizes()); + EXPECT_EQ(output.strides(), lazy_output.strides()); + }); +} + +TEST_F(LazyOpsTest, TestAvgPool2DBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool2d(inputs[0], + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 1, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({1, 1, 7, 7, 7}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool2DNoBatchBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool2d(inputs[0], + /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAvgPool3DNoBatchBackward) { + int kernel_size = 2; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (bool count_include_pad : {true, false}) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::avg_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*ceil_mode=*/ceil_mode, + /*count_include_pad=*/count_include_pad); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 7, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DNoBatchBackward) { + if (IsCuda()) { + GTEST_SKIP(); + } + for (int64_t output_size : {7, 4}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool3d( + inputs[0], {output_size, output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 56, 28, 28}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DBackward) { + if (IsCuda()) { + GTEST_SKIP(); + } + for (int64_t output_size : {7, 4}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool3d( + inputs[0], {output_size, output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({4, 1, 56, 28, 28}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool2DBackward) { + for (int64_t output_size : {7, 8}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool2d(inputs[0], {output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({4, 1, 56, 56}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestAdaptiveAvgPool2DNoBatchBackward) { + for (int64_t output_size : {7, 8}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::adaptive_avg_pool2d(inputs[0], {output_size, output_size}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({1, 56, 56}, torch::TensorOptions(torch::kFloat) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestConv2D) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 3; ++stride) { + for (int padding = 0; padding <= 2; ++padding) { + for (bool with_bias : {true, false}) { + for (int dilation = 1; dilation <= 3; ++dilation) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {1, in_channels, 7, 7}, + torch::TensorOptions(torch::kDouble).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {out_channels, in_channels / groups, kernel_size, + kernel_size}, + torch::TensorOptions(torch::kDouble).device(DefaultDevice())); + torch::Tensor bias = + with_bias ? 
torch::rand({out_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice())) + : torch::Tensor(); + + torch::Tensor lazy_input = CopyToDevice(input, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_bias = + with_bias ? CopyToDevice(bias, device) : torch::Tensor(); + + torch::Tensor output = + torch::conv2d(input, weight, bias, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, groups); + torch::Tensor lazy_output = + torch::conv2d(lazy_input, lazy_weight, lazy_bias, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, groups); + AllClose(output, lazy_output); + }); + } + } + } + } + } +} + +TEST_F(LazyOpsTest, TestConv2DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 3; ++stride) { + for (int padding = 0; padding <= 2; ++padding) { + for (bool with_bias : {true, false}) { + for (int dilation = 1; dilation <= 3; ++dilation) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::conv2d(inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, + /*dilation=*/{dilation, dilation}, groups); + }; + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor bias = + with_bias ? torch::rand({out_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice())) + : torch::Tensor(); + TestBackward({torch::rand({1, in_channels, 7, 7}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + bias}, + device, testfn); + }); + } + }; + } + } + } +} + +TEST_F(LazyOpsTest, TestTransposedConv2DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (int dilation = 1; dilation <= 2; ++dilation) { + for (int output_padding = 0; + output_padding < std::max(stride, dilation); ++output_padding) { + for (bool with_bias : {true, false}) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = [&](const std::vector& inputs) + -> torch::Tensor { + return torch::conv_transpose2d( + inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride + 1}, + /*padding=*/{padding, padding + 1}, + /*output_padding=*/output_padding, + /*groups=*/groups, + /*dilation=*/{dilation, dilation + 1}); + }; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand( + {4, out_channels, 7, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + with_bias ? 
torch::rand({in_channels}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)) + : torch::Tensor(); + TestBackward({input, weight, bias}, device, testfn, + /*rtol=*/1e-5, /*atol=*/1e-5); + }); + } + }; + } + } + } + } +} + +TEST_F(LazyOpsTest, TestConv3DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 3; ++stride) { + for (int padding = 1; padding <= 2; ++padding) { + for (bool with_bias : {true, false}) { + for (int dilation = 1; dilation <= 2; ++dilation) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::conv3d(inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + groups); + }; + + ForEachDevice([&](const torch::Device& device) { + torch::Tensor bias = + with_bias ? torch::rand({out_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice())) + : torch::Tensor(); + TestBackward({torch::rand({4, in_channels, 7, 7, 7}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size, kernel_size}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)), + bias}, + device, testfn); + }); + } + }; + } + } + } +} + +TEST_F(LazyOpsTest, TestTransposedConv3DBackward) { + int in_channels = 4; + int out_channels = 4; + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + for (int dilation = 1; dilation <= 2; ++dilation) { + for (int output_padding = 0; + output_padding < std::max(stride, dilation); ++output_padding) { + for (bool with_bias : {true, false}) { + for (int groups : + {1, 2, 4}) { // covers normal, grouped, depthwise conv. + auto testfn = [&](const std::vector& inputs) + -> torch::Tensor { + return torch::conv_transpose3d( + inputs[0], inputs[1], inputs[2], + /*stride=*/{stride, stride + 1, stride}, + /*padding=*/{padding, padding + 1, stride}, + /*output_padding=*/output_padding, + /*groups=*/groups, + /*dilation=*/{dilation, dilation + 1, dilation}); + }; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = + torch::rand({4, out_channels, 7, 7, 7}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + torch::rand({out_channels, in_channels / groups, + kernel_size, kernel_size, kernel_size}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + with_bias ? torch::rand({in_channels}, + torch::TensorOptions(torch::kDouble) + .device(DefaultDevice()) + .requires_grad(true)) + : torch::Tensor(); + TestBackward({input, weight, bias}, device, testfn); + }); + } + }; + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. 
+ for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool2d( + inputs[0], /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 2, 8, 8}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*dilation=*/{1, 1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({1, 2, 4, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool2DNoBatchBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool2d( + inputs[0], /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({2, 8, 8}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxPool3DNoBatchBackward) { + int kernel_size = 3; + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_pool3d( + inputs[0], + /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, /*dilation=*/{1, 1, 1}, + /*ceil_mode=*/ceil_mode); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({2, 4, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool2DBackward) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({2, 2, 8, 8}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. 
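+        // max_unpool2d consumes the indices produced by max_pool2d_with_indices
+        // below; output_size restores the spatial dimensions of the original input.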
+ for (bool ceil_mode : {false, true}) { + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool2d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size}, + /*stride=*/{stride, stride}, + /*padding=*/{padding, padding}, /*dilation=*/{dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector output_size({input.size(2), input.size(3)}); + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_unpool2d(inputs[0], inputs[1], output_size); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({output.requires_grad_(true), indices}, device, + testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestMaxUnpool3DBackward) { + int kernel_size = 2; + torch::Tensor input = + torch::rand({1, 1, 4, 4, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (int stride = 1; stride <= 2; ++stride) { + for (int padding = 0; padding <= 1; ++padding) { + // Test ceil_mode=true through the CPU interop. + for (bool ceil_mode : {false, true}) { + for (int dilation = 1; dilation <= 2; ++dilation) { + torch::Tensor output; + torch::Tensor indices; + std::tie(output, indices) = torch::max_pool3d_with_indices( + input, /*kernel_size=*/{kernel_size, kernel_size, kernel_size}, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}, + /*dilation=*/{dilation, dilation, dilation}, + /*ceil_mode=*/ceil_mode); + + std::vector output_size( + {input.size(2), input.size(3), input.size(4)}); + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::max_unpool3d(inputs[0], inputs[1], output_size, + /*stride=*/{stride, stride, stride}, + /*padding=*/{padding, padding, padding}); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward({output.requires_grad_(true), indices}, device, + testfn); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestTanhBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::tanh(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestSigmoidBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::sigmoid(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestLogSigmoidBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::log_sigmoid(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-5); + }); +} + +TEST_F(LazyOpsTest, TestLogSoftmaxBackward) { + for (int dim = -4; dim < 4; ++dim) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::log_softmax(inputs[0], dim); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({5, 3, 4, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + 
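+// The *Backward tests in this file use TestBackward (see test_lazy_ops_util.cpp
+// later in this patch): it runs testfn on an eager copy and a lazy copy of the
+// inputs and compares both the outputs and the autograd gradients via AllClose.
+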
+TEST_F(LazyOpsTest, TestSoftmaxBackward) { + for (int dim = -4; dim < 4; ++dim) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::softmax(inputs[0], dim); + }; + + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({5, 3, 4, 2}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + +TEST_F(LazyOpsTest, TestSoftplusBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::softplus(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn, /*rtol=*/1e-4); + }); +} + +TEST_F(LazyOpsTest, TestReluBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::relu(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestRreluBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::rrelu(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestHardshrinkBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::hardshrink(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({100}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestSoftshrinkBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::softshrink(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({100}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestHardtanhBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::hardtanh(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::randn({100}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestEluBackward) { + torch::Scalar alpha = 0.5; + torch::Scalar scale = 2.5; + torch::Scalar input_scale = 1.5; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::elu(inputs[0], alpha, scale, input_scale); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestGeluBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::gelu(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + ExpectCounterChanged("lazy::gelu_backward", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, 
TestLeakyReluBackward) { + double negative_slope = 0.01; + auto testfn = [=](const std::vector& inputs) -> torch::Tensor { + return torch::leaky_relu(inputs[0], negative_slope); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 1, 4, 6}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestTransposeBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::t(inputs[0]); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({2, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestAddMatMulBackward) { + int in_channels = 32; + int out_channels = 320; + int labels = 50; + // Test beta != 1. through the CPU interop. + for (double beta : {1., 2.}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::addmm(inputs[0], inputs[1], inputs[2], /*beta=*/beta); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({torch::rand({labels}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({in_channels, out_channels}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)), + torch::rand({out_channels, labels}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); + } +} + +TEST_F(LazyOpsTest, TestBinaryCrossEntropyBackward) { + int batch = 6; + int classes = 2; + // TODO(asuhan): Fix the torch::kDouble case. + for (auto dtype : {torch::kFloat}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = torch::rand( + {batch, classes}, torch::TensorOptions(dtype).requires_grad(true)); + torch::Tensor target = + torch::rand({batch, classes}, torch::TensorOptions(dtype)); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand({batch, classes}, torch::TensorOptions(dtype)); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::binary_cross_entropy( + /*self=*/inputs[0], /*target=*/inputs[1], + /*weight=*/inputs[2], + /*reduction=*/reduction); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight}, device, testfn, /*rtol=*/1e-4, + /*atol=*/1e-7); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLossBackward) { + // TODO(whc) debug divide-by-zero failure under ASAN + GTEST_SKIP(); + + int batch = 6; + int classes = 2; + // TODO(asuhan): Fix the torch::kDouble case. 
+ for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes}, torch::TensorOptions(dtype) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::nll_loss( + /*self=*/inputs[0], /*target=*/inputs[1], + /*weight=*/inputs[2], + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestNllLoss2dBackward) { + int batch = 6; + int classes = 2; + int height = 3; + int width = 3; + // TODO(asuhan): Fix the torch::kDouble case. + for (auto dtype : {torch::kFloat}) { + for (int ignore_index : {-1, 0, 1, 5}) { + for (bool def_weight : {false, true}) { + torch::Tensor input = torch::rand({batch, classes, height, width}, + torch::TensorOptions(dtype) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::randint( + std::min(ignore_index, 0), classes, {batch, height, width}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + torch::Tensor weight; + if (def_weight) { + weight = torch::rand( + {classes}, torch::TensorOptions(dtype).device(DefaultDevice())); + } + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::nll_loss2d( + /*self=*/inputs[0], /*target=*/inputs[1], + /*weight=*/inputs[2], + /*reduction=*/reduction, /*ignore_index=*/ignore_index); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } + } + } +} + +TEST_F(LazyOpsTest, TestSmoothL1LossBackward) { + torch::Tensor input = torch::randn({2, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::randn( + {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + for (double beta : {0.25, 1.}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::smooth_l1_loss(/*input=*/inputs[0], /*target=*/inputs[1], + /*reduction=*/reduction, /*beta=*/beta); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } +} + +TEST_F(LazyOpsTest, TestViewBackward) { + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return inputs[0].view({-1, 320}); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward( + {torch::rand({32, 20, 4, 4}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true))}, + device, testfn); + }); +} + +TEST_F(LazyOpsTest, TestBatchNorm2DBackward) { + double 
momentum = 0.1; + double eps = 0.5; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::batch_norm( + /*input=*/inputs[0], /*weight=*/inputs[1], /*bias=*/inputs[2], + /*running_mean=*/inputs[3], /*running_var=*/inputs[4], + /*training=*/true, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + int num_features = 3; + torch::Tensor undef; + for (bool undef_weight_bias : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand({2, num_features, 4, 4}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + undef_weight_bias + ? undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + undef_weight_bias + ? undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor running_mean = torch::zeros( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = torch::ones( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + TestBackward({input, weight, bias, running_mean, running_var}, device, + testfn, + /*rtol=*/1e-3, /*atol=*/1e-4); + }); + } +} + +TEST_F(LazyOpsTest, TestBatchNorm3DBackward) { + double momentum = 0.1; + double eps = 0.5; + auto testfn = [&](const std::vector& inputs) -> torch::Tensor { + return torch::batch_norm( + /*input=*/inputs[0], /*weight=*/inputs[1], /*bias=*/inputs[2], + /*running_mean=*/inputs[3], /*running_var=*/inputs[4], + /*training=*/true, /*momentum=*/momentum, /*eps=*/eps, + /*cudnn_enabled=*/false); + }; + int num_features = 3; + torch::Tensor undef; + for (bool undef_weight_bias : {false, true}) { + ForEachDevice([&](const torch::Device& device) { + torch::Tensor input = torch::rand({2, num_features, 4, 4, 2}, + torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + undef_weight_bias + ? undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor bias = + undef_weight_bias + ? 
undef + : torch::rand({num_features}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor running_mean = torch::zeros( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor running_var = torch::ones( + {num_features}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + TestBackward({input, weight, bias, running_mean, running_var}, device, + testfn, + /*rtol=*/1e-3, /*atol=*/1e-3); + }); + } +} + +TEST_F(LazyOpsTest, TestBCEWithLogitsBackward) { + int batch = 10; + int classes = 5; + torch::Tensor undef; + for (torch::Reduction::Reduction reduction : + {torch::Reduction::None, torch::Reduction::Mean, + torch::Reduction::Sum}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::binary_cross_entropy_with_logits( + /*input=*/inputs[0], /*target=*/inputs[1], /*weight=*/inputs[2], + /*pos_weight=*/inputs[3], + /*reduction=*/reduction); + }; + for (bool undef_weight : {false, true}) { + for (bool undef_pos_weight : {false, true}) { + torch::Tensor input = + torch::rand({batch, classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = + torch::rand({batch, classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor weight = + undef_weight + ? undef + : torch::rand({classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + torch::Tensor pos_weight = + undef_pos_weight + ? undef + : torch::rand({classes}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target, weight, pos_weight}, device, testfn, + /*rtol=*/1e-3, /*atol=*/1e-5); + }); + } + } + } +} + +TEST_F(LazyOpsTest, TestKlDivBackward) { + torch::Tensor input = torch::rand({4, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor target = torch::rand({4, 3}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + for (torch::Reduction::Reduction reduction : + {torch::Reduction::Mean, torch::Reduction::Sum, + torch::Reduction::None}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::kl_div(/*self=*/inputs[0], /*target=*/inputs[1], reduction); + }; + ForEachDevice([&](const torch::Device& device) { + TestBackward({input, target}, device, testfn, /*rtol=*/1e-4, + /*atol=*/1e-5); + }); + } +} + +TEST_F(LazyOpsTest, TestEmbeddingBackward) { + int num_weights = 32; + for (int padding_idx = -1; padding_idx < num_weights; ++padding_idx) { + for (bool scale_grad_by_freq : {false, true}) { + auto testfn = + [&](const std::vector& inputs) -> torch::Tensor { + return torch::embedding(inputs[0], inputs[1], + /*padding_idx=*/padding_idx, + /*scale_grad_by_freq=*/scale_grad_by_freq, + /*sparse=*/false); + }; + ForEachDevice([&](const torch::Device& device) { + torch::Tensor weight = + torch::rand({num_weights, 7}, torch::TensorOptions(torch::kFloat) + .device(DefaultDevice()) + .requires_grad(true)); + torch::Tensor indices = torch::randint( + num_weights, {3, 9, 4}, + torch::TensorOptions(torch::kLong).device(DefaultDevice())); + TestBackward({weight, indices}, device, testfn, /*rtol=*/1e-5, + /*atol=*/1e-8); + }); + } + } +} + +TEST_F(LazyOpsTest, TestAmpForeachNonFiniteCheckAndUnscale) { + if (IsCuda()) { + // TODO(whc) debug failure on cuda + GTEST_SKIP(); + } + + 
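+  // As exercised below, the in-place op scales each grad by inv_scale (compare
+  // against grads_output0) and raises found_inf to 1.0 once a non-finite value,
+  // such as the NaN in grads1, is encountered.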
torch::Tensor grads0 = torch::tensor( + {1, 2, 3, 4}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor grads1 = torch::tensor( + {1.0, 2.0, std::nan("1"), 4.0}, + torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor inv_scale = torch::scalar_tensor( + 0.2, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor found_inf = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor grads_output0 = grads0 * inv_scale; + torch::Tensor found_inf_output0 = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor found_inf_output1 = torch::scalar_tensor( + 1, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ForEachDevice([&](const torch::Device& device) { + if (grads0.device() == at::kCPU) { + GTEST_SKIP(); + } + torch::Tensor lazy_grads0 = CopyToDevice(grads0, device); + torch::Tensor lazy_inv_scale = CopyToDevice(inv_scale, device); + torch::Tensor lazy_found_inf = CopyToDevice(found_inf, device); + torch::_amp_foreach_non_finite_check_and_unscale_(lazy_grads0, lazy_found_inf, + lazy_inv_scale); + AllClose(grads_output0, lazy_grads0, /*rtol=*/1e-2, /*atol=*/1e-4); + AllEqual(found_inf_output0, lazy_found_inf); + + torch::Tensor lazy_grads1 = CopyToDevice(grads1, device); + torch::_amp_foreach_non_finite_check_and_unscale_(lazy_grads1, lazy_found_inf, + lazy_inv_scale); + AllEqual(found_inf_output1, lazy_found_inf); + }); +} + +TEST_F(LazyOpsTest, TestAmpUpdateScale) { + torch::Tensor growth_tracker = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor found_inf = torch::scalar_tensor( + 1, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor not_found_inf = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + float scale_growth_factor = 2.0; + float scale_backoff_factor = 0.5; + int growth_interval = 3; + + torch::Tensor growth_tracker_result0 = torch::scalar_tensor( + 1, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result0 = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor growth_tracker_result1 = torch::scalar_tensor( + 2, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result1 = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor growth_tracker_result2 = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result2 = torch::scalar_tensor( + 8, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor growth_tracker_result3 = torch::scalar_tensor( + 0, torch::TensorOptions(torch::kInt32).device(DefaultDevice())); + torch::Tensor current_scale_result3 = torch::scalar_tensor( + 4, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + + ForEachDevice([&](const torch::Device& device) { + if (growth_tracker.device() == at::kCPU) { + GTEST_SKIP(); + } + torch::Tensor lazy_growth_tracker = CopyToDevice(growth_tracker, device); + torch::Tensor lazy_current_scale = CopyToDevice(current_scale, device); + torch::Tensor lazy_found_inf = CopyToDevice(found_inf, device); + torch::Tensor 
lazy_not_found_inf = CopyToDevice(not_found_inf, device); + + torch::_amp_update_scale_(lazy_current_scale, lazy_growth_tracker, + lazy_not_found_inf, scale_growth_factor, + scale_backoff_factor, growth_interval); + AllClose(current_scale_result0, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result0, lazy_growth_tracker); + + torch::_amp_update_scale_(lazy_current_scale, lazy_growth_tracker, + lazy_not_found_inf, scale_growth_factor, + scale_backoff_factor, growth_interval); + AllClose(current_scale_result1, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result1, lazy_growth_tracker); + + // torch::_amp_update_scale_ returns the reference of current_scale + lazy_current_scale = torch::_amp_update_scale_( + lazy_current_scale, lazy_growth_tracker, lazy_not_found_inf, + scale_growth_factor, scale_backoff_factor, growth_interval); + AllClose(current_scale_result2, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result2, lazy_growth_tracker); + + lazy_current_scale = torch::_amp_update_scale_( + lazy_current_scale, lazy_growth_tracker, lazy_found_inf, + scale_growth_factor, scale_backoff_factor, growth_interval); + AllClose(current_scale_result3, lazy_current_scale, /*rtol=*/1e-2, + /*atol=*/1e-4); + AllEqual(growth_tracker_result3, lazy_growth_tracker); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::_amp_update_scale_", + GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestEarlySyncLiveTensors) { + torch::Tensor scalar_tensor = torch::scalar_tensor( + 1., torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar scalar1 = scalar_tensor.item(); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_scalar_tensor = CopyToDevice(scalar_tensor, device); + torch::Scalar scalar2 = lazy_scalar_tensor.item(); + ASSERT_EQ(scalar1.to(), scalar2.to()); + }); + if (DebugUtil::ExperimentEnabled("early_sync")) { + ExpectCounterChanged("EarlySyncLiveTensorsCount", + GetIgnoredCounters()); + } else { + ExpectCounterNotChanged("EarlySyncLiveTensorsCount", + GetIgnoredCounters()); + } + ExpectCounterChanged("aten::_local_scalar_dense", + GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerp) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor res = torch::lerp(start, end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_start = CopyToDevice(start, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_res = torch::lerp(lazy_start, lazy_end, lazy_weight); + AllClose(res, lazy_res); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpScalar) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar weight = torch::Scalar(3.0); + torch::Tensor res = torch::lerp(start, end, weight); + ForEachDevice([&](const torch::Device& device) { + 
torch::Tensor lazy_start = CopyToDevice(start, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_res = torch::lerp(lazy_start, lazy_end, weight); + AllClose(res, lazy_res); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpInplace) { + torch::Tensor input = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor input_copy = input.clone(); + input.lerp_(end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + lazy_input.lerp_(lazy_end, lazy_weight); + AllClose(lazy_input, input); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpScalarInplace) { + torch::Tensor input = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar weight = torch::Scalar(3.0); + torch::Tensor input_copy = input.clone(); + input.lerp_(end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_input = CopyToDevice(input_copy, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + lazy_input.lerp_(lazy_end, weight); + AllClose(lazy_input, input); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpOut) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor weight = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor res = torch::empty( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + ; + torch::lerp_out(res, start, end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_start = CopyToDevice(start, device); + torch::Tensor lazy_end = CopyToDevice(end, device); + torch::Tensor lazy_weight = CopyToDevice(weight, device); + torch::Tensor lazy_res = torch::empty({3, 4}, lazy_start.options()); + torch::lerp_out(lazy_res, lazy_start, lazy_end, lazy_weight); + AllClose(res, lazy_res); + }); + ExpectCounterNotChanged("aten::.*", GetIgnoredCounters()); + ExpectCounterChanged("lazy::lerp", GetIgnoredCounters()); +} + +TEST_F(LazyOpsTest, TestLerpScalarOut) { + torch::Tensor start = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Tensor end = torch::rand( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::Scalar weight = torch::Scalar(3.0); + torch::Tensor res = torch::empty( + {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())); + torch::lerp_out(res, start, end, weight); + ForEachDevice([&](const torch::Device& device) { + torch::Tensor lazy_start = CopyToDevice(start, 
device);
+    torch::Tensor lazy_end = CopyToDevice(end, device);
+    torch::Tensor lazy_res = torch::empty({3, 4}, lazy_start.options());
+    torch::lerp_out(lazy_res, lazy_start, lazy_end, weight);
+    AllClose(res, lazy_res);
+  });
+  ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
+  ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
+}
+
+TEST_F(LazyOpsTest, IsAliasOf) {
+  auto a = torch::empty(4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
+  auto b = torch::empty(4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
+
+  ForEachDevice([&](const torch::Device& device) {
+    auto lazy_a = CopyToDevice(a, device);
+    auto lazy_b = CopyToDevice(b, device);
+    EXPECT_EQ(!a.is_alias_of(b), !lazy_a.is_alias_of(lazy_b));
+
+    auto c = a.view({2, 2});
+    auto lazy_c = lazy_a.view({2, 2});
+    EXPECT_EQ(a.is_alias_of(c), lazy_a.is_alias_of(lazy_c));
+
+    auto d = c.view({1, 4});
+    auto lazy_d = lazy_c.view({1, 4});
+    EXPECT_EQ(d.is_alias_of(c), lazy_d.is_alias_of(lazy_c));
+    EXPECT_EQ(d.is_alias_of(a), lazy_d.is_alias_of(lazy_a));
+  });
+}
+
+#endif // FBCODE_CAFFE2
+
+} // namespace lazy
+} // namespace torch
diff --git a/test/cpp/lazy/test_lazy_ops_util.cpp b/test/cpp/lazy/test_lazy_ops_util.cpp
new file mode 100644
index 000000000000..91c9b653e041
--- /dev/null
+++ b/test/cpp/lazy/test_lazy_ops_util.cpp
@@ -0,0 +1,194 @@
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+namespace torch {
+namespace lazy {
+namespace {
+
+bool IsLtcTensor(const at::Tensor& tensor) {
+  return dynamic_cast<LTCTensorImpl*>(tensor.unsafeGetTensorImpl());
+}
+
+std::unordered_set<std::string>* CreateIgnoredCounters() {
+  std::unordered_set<std::string>* icounters =
+      new std::unordered_set<std::string>();
+  // Add below the counters whose names need to be ignored when doing
+  // is-any-counter-changed assertions.
+  icounters->insert("aten::rand");
+  return icounters;
+}
+
+} // namespace
+
+const std::unordered_set<std::string>* GetIgnoredCounters() {
+  static const std::unordered_set<std::string>* icounters =
+      CreateIgnoredCounters();
+  return icounters;
+}
+
+at::Tensor ToCpuTensor(const at::Tensor& tensor) {
+  // tensor.to() implicitly triggers a sync if t.device=torch::kLazy.
+ return tensor.to(torch::kCPU); +} + +torch::Tensor CopyToDevice(const torch::Tensor& tensor, + const torch::Device& device) { + return tensor.clone().to(device, /*non_blocking=*/false, /*copy=*/true); +} + +bool EqualValues(at::Tensor tensor1, at::Tensor tensor2) { + tensor1 = ToCpuTensor(tensor1); + tensor2 = ToCpuTensor(tensor2); + if (torch::isnan(tensor1).any().item()) { + EXPECT_TRUE(EqualValues(torch::isnan(tensor1), torch::isnan(tensor2))); + tensor1.nan_to_num_(); + tensor2.nan_to_num_(); + } + if (tensor1.sizes() != tensor2.sizes() || + tensor1.dtype() != tensor2.dtype()) { + std::cerr << "Different shape:\n" + << tensor1.dtype() << " " << tensor1.sizes() << "\n-vs-\n" + << tensor2.dtype() << " " << tensor2.sizes() << "\n"; + return false; + } + at::ScalarType type1 = tensor1.scalar_type(); + at::ScalarType type2 = tensor2.scalar_type(); + if (type1 != type2) { + tensor1 = tensor1.toType(type2); + } + bool equal = tensor1.equal(tensor2); + return equal; +} + +bool EqualValuesNoElementTypeCheck(at::Tensor tensor1, at::Tensor tensor2) { + tensor1 = ToCpuTensor(tensor1); + tensor2 = ToCpuTensor(tensor2); + if (tensor1.sizes() != tensor2.sizes()) { + std::cerr << "Different shape:\n" + << tensor1.dtype() << " " << tensor1.sizes() << "\n-vs-\n" + << tensor2.dtype() << " " << tensor2.sizes() << "\n"; + return false; + } + at::ScalarType type1 = tensor1.scalar_type(); + at::ScalarType type2 = tensor2.scalar_type(); + if (type1 != type2) { + tensor1 = tensor1.toType(type2); + } + bool equal = tensor1.equal(tensor2); + return equal; +} + +void ForEachDevice(const std::function& devfn) { + // Currently TorchScript backend only supports one type of hardware per process, + // which is set by env. And the ordinal is always 0 given distributed training/ + // multi-device is not supported yet. 
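+  // Hence a default-constructed BackendDevice denotes the single available
+  // device; it is converted to the corresponding ATen device for the callback.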
+ auto device = torch::lazy::BackendDevice(); + torch::Device torch_device = torch::lazy::backendDeviceToAtenDevice(device); + devfn(torch_device); +} + +bool CloseValues(at::Tensor tensor1, at::Tensor tensor2, double rtol, + double atol) { + tensor1 = ToCpuTensor(tensor1); + tensor2 = ToCpuTensor(tensor2); + if (torch::isnan(tensor1).any().item()) { + EXPECT_TRUE(EqualValues(torch::isnan(tensor1), torch::isnan(tensor2))); + tensor1.nan_to_num_(); + tensor2.nan_to_num_(); + } + if (tensor1.sizes() != tensor2.sizes() || + tensor1.dtype() != tensor2.dtype()) { + std::cerr << "Different shape:\n" + << tensor1.dtype() << " " << tensor1.sizes() << "\n-vs-\n" + << tensor2.dtype() << " " << tensor2.sizes() << "\n"; + return false; + } + bool equal = tensor1.allclose(tensor2, rtol, atol); + return equal; +} + +std::string GetTensorTextGraph(at::Tensor tensor) { + torch::lazy::LazyTensorPtr lazy_tensor = torch::lazy::TryGetLtcTensor(tensor); + return torch::lazy::DumpUtil::ToText({lazy_tensor->GetIrValue().node.get()}); +} + +std::string GetTensorDotGraph(at::Tensor tensor) { + torch::lazy::LazyTensorPtr lazy_tensor = torch::lazy::TryGetLtcTensor(tensor); + return torch::lazy::DumpUtil::ToDot({lazy_tensor->GetIrValue().node.get()}); +} + +void TestBackward( + const std::vector& inputs, const torch::Device& device, + const std::function&)>& + testfn, + double rtol, double atol, int derivative_level) { + std::vector input_vars; + std::vector xinput_vars; + std::vector inputs_w_grad; + std::vector xinputs_w_grad; + for (size_t i = 0; i < inputs.size(); ++i) { + const torch::Tensor& input = inputs[i]; + if (input.defined()) { + torch::Tensor oinput = + input.clone().detach().set_requires_grad(input.requires_grad()); + input_vars.push_back(oinput); + + torch::Tensor xinput = CopyToDevice(input, device) + .detach() + .set_requires_grad(input.requires_grad()); + xinput_vars.push_back(xinput); + if (input.requires_grad()) { + inputs_w_grad.push_back(oinput); + xinputs_w_grad.push_back(xinput); + } + } else { + input_vars.emplace_back(); + xinput_vars.emplace_back(); + } + } + + torch::Tensor output = testfn(input_vars); + torch::Tensor xoutput = testfn(xinput_vars); + torch::lazy::AllClose(output, xoutput, rtol, atol); + + std::vector outs = {output}; + std::vector xouts = {xoutput}; + for (int d = 1; d <= derivative_level; ++d) { + // Check grad of sum(outs) w.r.t inputs_w_grad. 
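+    // Sum all differentiable outputs into one scalar so a single
+    // torch::autograd::grad call yields the gradients; create_graph stays true
+    // for every level except the last so higher-order derivatives can be taken.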
+ torch::Tensor sum = torch::zeros_like(outs[0]).sum(); + torch::Tensor xsum = torch::zeros_like(xouts[0]).sum(); + for (size_t i = 0; i < outs.size(); ++i) { + if (outs[i].requires_grad()) { + sum += outs[i].sum(); + xsum += xouts[i].sum(); + } + } + // Calculating higher order derivative requires create_graph=true + bool create_graph = d != derivative_level; + outs = torch::autograd::grad({sum}, inputs_w_grad, /*grad_outputs=*/{}, + /*retain_graph=*/c10::nullopt, + /*create_graph=*/create_graph, + /*allow_unused=*/true); + xouts = torch::autograd::grad({xsum}, xinputs_w_grad, /*grad_outputs=*/{}, + /*retain_graph=*/c10::nullopt, + /*create_graph=*/create_graph, + /*allow_unused=*/true); + for (size_t i = 0; i < outs.size(); ++i) { + ASSERT_EQ(outs[i].defined(), xouts[i].defined()); + if (outs[i].defined()) { + AllClose(outs[i], xouts[i], rtol, atol); + } + } + } +} + +} // namespace lazy +} // namespace torch diff --git a/test/cpp/lazy/test_lazy_ops_util.h b/test/cpp/lazy/test_lazy_ops_util.h new file mode 100644 index 000000000000..6dc26b48be95 --- /dev/null +++ b/test/cpp/lazy/test_lazy_ops_util.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace torch { +namespace lazy { + +const std::unordered_set* GetIgnoredCounters(); + +// Converts an at::Tensor(device=torch::kLazy) to at::Tensor(device=torch::kCPU) +// This at::Tensor can be torch::Tensor which is a Variable, or at::Tensor which +// know nothing about autograd. If the input tensor is already a CPU tensor, it +// will be returned. Needed because EqualValues and AllClose require CPU tensors +// on both sides. +at::Tensor ToCpuTensor(const at::Tensor& tensor); + +// Helper function to copy a tensor to device. 
+torch::Tensor CopyToDevice(const torch::Tensor& tensor, + const torch::Device& device); + +bool EqualValues(at::Tensor tensor1, at::Tensor tensor2); + +bool EqualValuesNoElementTypeCheck(at::Tensor tensor1, at::Tensor tensor2); + +bool CloseValues(at::Tensor tensor1, at::Tensor tensor2, double rtol = 1e-5, + double atol = 1e-8); + +static inline void AllClose(at::Tensor tensor, at::Tensor xla_tensor, + double rtol = 1e-5, double atol = 1e-8) { + EXPECT_TRUE(CloseValues(tensor, xla_tensor, rtol, atol)); +} + +static inline void AllClose(at::Tensor tensor, torch::lazy::LazyTensor& xla_tensor, + double rtol = 1e-5, double atol = 1e-8) { + EXPECT_TRUE( + CloseValues(tensor, xla_tensor.ToTensor(/*detached=*/false), rtol, atol)); +} + +static inline void AllEqual(at::Tensor tensor, at::Tensor xla_tensor) { + EXPECT_TRUE(EqualValues(tensor, xla_tensor)); +} + +void ForEachDevice(const std::function& devfn); + +std::string GetTensorTextGraph(at::Tensor tensor); + +std::string GetTensorDotGraph(at::Tensor tensor); + +std::string GetTensorHloGraph(at::Tensor tensor); + +void TestBackward( + const std::vector& inputs, const torch::Device& device, + const std::function&)>& + testfn, + double rtol = 1e-5, double atol = 1e-8, int derivative_level = 1); + +} // namespace lazy +} // namespace torch diff --git a/test/cpp/lazy/test_misc.cpp b/test/cpp/lazy/test_misc.cpp index 45b54fd2824b..b2f941c42dd6 100644 --- a/test/cpp/lazy/test_misc.cpp +++ b/test/cpp/lazy/test_misc.cpp @@ -71,6 +71,11 @@ TEST(HashTest, Sanity) { auto b = std::vector({1, 1, 2, 3, 5, 8, 12}); test_hash_repeatable_sensitive(a, b); test_hash_repeatable_sensitive(c10::ArrayRef(a), c10::ArrayRef(b)); + + // vector is a special case bc it is implemented as vector + auto bool_a = std::vector({true, false, false, true}); + auto bool_b = std::vector({true, true, false, true}); + test_hash_repeatable_sensitive(bool_a, bool_b); } } // namespace lazy diff --git a/test/cpp/lazy/test_symbolic_shape.cpp b/test/cpp/lazy/test_symbolic_shape.cpp new file mode 100644 index 000000000000..b2224aec0d1c --- /dev/null +++ b/test/cpp/lazy/test_symbolic_shape.cpp @@ -0,0 +1,159 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace lazy { + +// Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. 
+// sizes) in TensorImpl +#ifndef FBCODE_CAFFE2 + +namespace { +// This registers the torchscript backend, without which lazy device won't work +torch::lazy::BackendRegistrar g_registrar(GetTSBackendImpl()); + +static inline at::DeviceType DefaultDevice() { + return torch::lazy::getBackend()->EagerFallbackDeviceType(); +} + +std::vector getIsSymbolic(at::Tensor& lazy_tensor) { + auto ltc_tensor = GetLtcTensor(lazy_tensor); + Value ir_val = ltc_tensor->GetIrValue(); + const Shape& shape = ir_val->shape(); + return shape.is_symbolic().value(); +} + +class LazyShapeTest : public ::testing::Test { + protected: + static void SetUpTestCase() {} + void SetUp() override { + at::manual_seed(42); + torch::lazy::LazyGraphExecutor::Get()->SetRngSeed( + torch::lazy::BackendDevice(), 42); + FLAGS_ltc_enable_symbolic_shapes = true; + } + void TearDown() override { + FLAGS_ltc_enable_symbolic_shapes = false; + } +}; + +class DynamicInputShapeNode : public Node { + public: + explicit DynamicInputShapeNode(Shape& shape) + : Node(OpKind(), /* num_outputs */ 1), + hash_(0), + shape_(shape) {} + ~DynamicInputShapeNode() override = default; + + const std::vector& operands() const override { + TORCH_INTERNAL_ASSERT(false, "Can't access operands of test node"); + } + + const Output& operand(size_t i) const override { + TORCH_INTERNAL_ASSERT(false, "Can't access operand[i] of test node"); + } + const Shape& shape(size_t i) const override { + return shape_; + } + c10::ArrayRef shapes() const override { + return {shape_}; + } + + hash_t hash() const override { return hash_; } + hash_t shapeHash() const override { return hash_; } + + private: + hash_t hash_; + Shape shape_; +}; + +} // namespace + +Tensor tensorWithSymbolicShape( + const std::vector& sizes, + const std::vector& is_symbolic) { + Shape shape = Shape(torch::kFloat32, sizes); + Shape shape_with_symbolic = shape.with_symbolic_dims(is_symbolic); + auto n = torch::lazy::MakeNode(shape_with_symbolic); + auto device = BackendDevice(); + auto lt = torch::lazy::LazyTensor::Create(n, device); + return torch::lazy::CreateAtenFromLtcTensor(lt); +} + +TEST_F(LazyShapeTest, TestMulBasic) { + // Basic propagation + torch::Tensor a = tensorWithSymbolicShape({2, 2}, {true, false}); + torch::Tensor b = tensorWithSymbolicShape({2, 2}, {true, false}); + torch::Tensor res = torch::mul(a, b); + + std::vector expected = {true, false}; + EXPECT_EQ(getIsSymbolic(res), expected); + + // Test when some inputs are symbolic + a = tensorWithSymbolicShape({2, 2}, {true, true}); + b = tensorWithSymbolicShape({2, 2}, {true, false}); + res = torch::mul(a, b); + + // This is not {true, false}, as the SSA shape propagation + // is not able to simplify + // expandedSizes.append(sizeB if sizeA == 1 else sizeA) + // in broadcast() in shape_functions_1.h + // due to sizeA being symbolic + expected = {true, true}; + EXPECT_EQ(getIsSymbolic(res), expected); + + // Test correct handling of broadcasting dim + a = tensorWithSymbolicShape({2, 2}, {false, true}); + b = tensorWithSymbolicShape({2, 1}, {true, false}); + res = torch::mul(a, b); + + expected = {false, true}; + EXPECT_EQ(getIsSymbolic(res), expected); + + // Test correct handling of scalar values + a = tensorWithSymbolicShape({2, 2}, {false, true}); + res = torch::mul(a, 3); + expected = {false, true}; + EXPECT_EQ(getIsSymbolic(res), expected); +}; + +TEST_F(LazyShapeTest, TestCatBasic) { + // Basic propagation + torch::Tensor a = tensorWithSymbolicShape({2, 2}, {true, false}); + torch::Tensor b = tensorWithSymbolicShape({2, 2}, {true, 
+  torch::Tensor c = tensorWithSymbolicShape({2, 2}, {true, false});
+
+  auto res = torch::cat({a, b, c}, 1);
+  std::vector<bool> expected = {true, false};
+  EXPECT_EQ(getIsSymbolic(res), expected);
+
+  torch::Tensor d = tensorWithSymbolicShape({2, 2}, {false, true});
+  res = torch::cat({a, d}, 0);
+  expected = {true, false};
+  EXPECT_EQ(getIsSymbolic(res), expected);
+
+  // Test handling of symbolic dims of unequal sizes. This currently crashes,
+  // as we can't handle cases where the upper-bound dims are not equal.
+  /*
+  torch::Tensor e = tensorWithSymbolicShape({2, 2}, {true, false});
+  torch::Tensor f = tensorWithSymbolicShape({2, 3}, {false, true});
+  res = torch::cat({e, f}, 0);
+  expected = {true, false};
+  EXPECT_EQ(getIsSymbolic(res), expected);
+  */
+}
+#endif // FBCODE_CAFFE2
+} // namespace lazy
+} // namespace torch
diff --git a/test/cpp/lazy/test_tensor_impl.cpp b/test/cpp/lazy/test_tensor_impl.cpp
index 2a7f2893c724..8d968f620b6b 100644
--- a/test/cpp/lazy/test_tensor_impl.cpp
+++ b/test/cpp/lazy/test_tensor_impl.cpp
@@ -6,12 +6,14 @@ namespace torch {
 namespace lazy {
-// TODO(alanwaketan): Update the following unit tests once the TorchScript backend is merged.
+#ifdef FBCODE_CAFFE2
+// Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g. sizes) in TensorImpl
 TEST(LazyTensorImplTest, BasicThrow) {
   EXPECT_THROW({
     auto input = torch::rand(
         {0, 1, 3, 0}, torch::TensorOptions(torch::kFloat).device("lazy"));
   }, ::c10::Error);
 }
+#endif // FBCODE_CAFFE2
 } // namespace lazy
 } // namespace torch
diff --git a/test/cpp/lazy/test_trie_cache.cpp b/test/cpp/lazy/test_trie_cache.cpp
new file mode 100644
index 000000000000..df7d578b94b4
--- /dev/null
+++ b/test/cpp/lazy/test_trie_cache.cpp
@@ -0,0 +1,92 @@
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace torch {
+namespace lazy {
+
+class TrieCacheNode : public Node {
+ public:
+  static OpKind ClassOpKind() {
+    return OpKind();
+  }
+
+  explicit TrieCacheNode(size_t id)
+      : Node(ClassOpKind(), /* num_outputs */ 1), id_(id), hash_(Hash(id_)) {}
+  ~TrieCacheNode() override = default;
+
+  bool CanBeReused(size_t id) const {
+    return (id_ == id);
+  }
+
+  void AddOperand(Value v) {
+    if (!v.node) {
+      return;
+    }
+    operands_as_outputs_.emplace_back(v.node.get(), v.index);
+    operands_.push_back(std::move(v.node));
+  }
+
+  hash_t hash() const override { return hash_; }
+  hash_t shapeHash() const override { return hash_; }
+ private:
+  size_t id_;
+  hash_t hash_;
+};
+
+TEST(TrieCacheTest, TestSinglePath) {
+  FLAGS_torch_lazy_reuse_ir = true;
+  TrieCache::Get()->Clear();
+
+  NodePtr a = ReuseOrMakeNode<TrieCacheNode>(0);
+  NodePtr b = ReuseOrMakeNode<TrieCacheNode>(1);
+  NodePtr c = ReuseOrMakeNode<TrieCacheNode>(2);
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(0).get(), a.get());
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(1).get(), b.get());
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(2).get(), c.get());
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+}
+
+/*
+*    0
+*    |
+*    1
+*   / \
+*  2   3
+*/
+TEST(TrieCacheTest, TestTwoPaths) {
+  FLAGS_torch_lazy_reuse_ir = true;
+  TrieCache::Get()->Clear();
+
+  NodePtr a = ReuseOrMakeNode<TrieCacheNode>(0);
+  NodePtr b = ReuseOrMakeNode<TrieCacheNode>(1);
+  NodePtr c = ReuseOrMakeNode<TrieCacheNode>(2);
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(0).get(), a.get());
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(1).get(), b.get());
+  NodePtr d = ReuseOrMakeNode<TrieCacheNode>(3);
+  EXPECT_NE(d.get(), c.get());
+  TrieCache::Get()->ResetCurrent(); // MarkStep
+
+  EXPECT_EQ(ReuseOrMakeNode<TrieCacheNode>(0).get(), a.get());
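+  // Replaying the 0 -> 1 -> 3 path should now hit the branch cached in the
+  // previous step, so `b` and `d` are reused rather than recreated.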
EXPECT_EQ(ReuseOrMakeNode(1).get(), b.get()); + EXPECT_EQ(ReuseOrMakeNode(3).get(), d.get()); + TrieCache::Get()->ResetCurrent(); // MarkStep + + EXPECT_EQ(ReuseOrMakeNode(0).get(), a.get()); + EXPECT_EQ(ReuseOrMakeNode(1).get(), b.get()); + EXPECT_EQ(ReuseOrMakeNode(2).get(), c.get()); + TrieCache::Get()->ResetCurrent(); // MarkStep +} + +} // namespace lazy +} // namespace torch diff --git a/test/cpp/lite_interpreter_runtime/CMakeLists.txt b/test/cpp/lite_interpreter_runtime/CMakeLists.txt index 503203d7be08..6a2e6db6eaa9 100644 --- a/test/cpp/lite_interpreter_runtime/CMakeLists.txt +++ b/test/cpp/lite_interpreter_runtime/CMakeLists.txt @@ -23,6 +23,10 @@ target_include_directories( target_link_libraries(test_lite_interpreter_runtime PRIVATE torch gtest backend_with_compiler_runtime) +if(LINUX) + target_link_libraries(test_lite_interpreter_runtime PRIVATE "-Wl,--no-as-needed,$,--as-needed") +endif() + if(INSTALL_TEST) install(TARGETS test_lite_interpreter_runtime DESTINATION bin) # Install PDB files for MSVC builds diff --git a/test/cpp/profiler/containers.cpp b/test/cpp/profiler/containers.cpp new file mode 100644 index 000000000000..60e6d0f238b1 --- /dev/null +++ b/test/cpp/profiler/containers.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include + +TEST(ProfilerTest, AppendOnlyList) { + const int n = 4096; + torch::profiler::impl::AppendOnlyList list; + for (const auto i : c10::irange(n)) { + list.emplace_back(i); + ASSERT_EQ(list.size(), i + 1); + } + + int expected = 0; + for (const auto i : list) { + ASSERT_EQ(i, expected++); + } + ASSERT_EQ(expected, n); + + list.clear(); + ASSERT_EQ(list.size(), 0); +} + +TEST(ProfilerTest, AppendOnlyList_ref) { + const int n = 512; + torch::profiler::impl::AppendOnlyList, 64> list; + std::vector*> refs; + for (const auto _ : c10::irange(n)) { + refs.push_back(list.emplace_back()); + } + + for (const auto i : c10::irange(n)) { + *refs.at(i) = {i, 0}; + } + + int expected = 0; + for (const auto& i : list) { + ASSERT_EQ(i.first, expected++); + } +} + +// Test that we can convert TSC measurements back to wall clock time. +TEST(ProfilerTest, clock_converter) { + const int n = 10001; + torch::profiler::impl::ApproximateClockToUnixTimeConverter converter; + std::vector pairs; + for (const auto i : c10::irange(n)) { + pairs.push_back(torch::profiler::impl::ApproximateClockToUnixTimeConverter::measurePair()); + } + auto count_to_ns = converter.makeConverter(); + std::vector deltas; + for (const auto& i : pairs) { + deltas.push_back(i.t_ - count_to_ns(i.approx_t_)); + } + std::sort(deltas.begin(), deltas.end()); + + // In general it's not a good idea to put clocks in unit tests as it leads + // to flakiness. We mitigate this by: + // 1) Testing the clock itself. While the time to complete a task may + // vary, two clocks measuring the same time should be much more + // consistent. + // 2) Only testing the interquartile range. Context switches between + // calls to the two timers do occur and can result in hundreds of + // nanoseconds of noise, but such switches are only a few percent + // of cases. + // 3) We're willing to accept a somewhat large bias which can emerge from + // differences in the cost of calling each clock. 
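+  // `deltas` is sorted above, so deltas[n / 2] is the median error and the
+  // [n / 4, 3 * n / 4] span checked below is the interquartile range; both
+  // thresholds are expressed in nanoseconds.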
+ EXPECT_LT(std::abs(deltas[n / 2]), 200); + EXPECT_LT(deltas[n * 3 / 4] - deltas[n / 4], 50); +} diff --git a/test/cpp/profiler/record_function.cpp b/test/cpp/profiler/record_function.cpp new file mode 100644 index 000000000000..ba76c5af5888 --- /dev/null +++ b/test/cpp/profiler/record_function.cpp @@ -0,0 +1,307 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +// Test that we can add and remove callbacks (both global and thread local.) +TEST(RecordFunctionTest, AddRemove) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + auto start_callback = + [](const at::RecordFunction& fn) -> std::unique_ptr { + return nullptr; + }; + auto end_callback = [](const at::RecordFunction& fn, at::ObserverContext*) {}; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback(start_callback, end_callback)); + + ASSERT_TRUE(at::hasCallbacks()); + ASSERT_TRUE(at::hasThreadLocalCallbacks()); + ASSERT_FALSE(at::hasGlobalCallbacks()); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); + + handle = at::addGlobalCallback( + at::RecordFunctionCallback(start_callback, end_callback)); + + ASSERT_TRUE(at::hasCallbacks()); + ASSERT_FALSE(at::hasThreadLocalCallbacks()); + ASSERT_TRUE(at::hasGlobalCallbacks()); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Test that the callbacks that we register are actually run. +TEST(RecordFunctionTest, ThreadLocalState) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int tls_test_start_counter; + static int tls_test_end_counter; + tls_test_start_counter = 0; + tls_test_end_counter = 0; + + auto start_callback = + [](const at::RecordFunction&) -> std::unique_ptr { + ++tls_test_start_counter; + return nullptr; + }; + auto end_callback = [](const at::RecordFunction&, at::ObserverContext*) { + ++tls_test_end_counter; + }; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback(start_callback, end_callback)); + + { + at::RecordFunction guard(at::RecordScope::USER_SCOPE); + guard.before("Test"); + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 0); + } + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 1); + + { + tls_test_start_counter = 0; + tls_test_end_counter = 0; + at::DisableRecordFunctionGuard no_profile_guard; + at::RecordFunction guard(at::RecordScope::USER_SCOPE); + guard.before("Test"); + EXPECT_EQ(tls_test_start_counter, 0); + EXPECT_EQ(tls_test_end_counter, 0); + } + EXPECT_EQ(tls_test_start_counter, 0); + EXPECT_EQ(tls_test_end_counter, 0); + + { + tls_test_start_counter = 0; + tls_test_end_counter = 0; + RECORD_FUNCTION("Test", {}); + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 0); + } + EXPECT_EQ(tls_test_start_counter, 1); + EXPECT_EQ(tls_test_end_counter, 1); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Test that callbacks are run in the order that they are registered. 
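+// Nested RecordFunction guards end in LIFO order, so the expected sequence
+// below is: starts in registration order (outer, then inner), then the inner
+// ends fire before the outer ends.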
+TEST(RecordFunctionTest, CallOrder) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int current_index; + current_index = 0; + + static std::array expected_order = { + "Start Callback 0 Outer", + "Start Callback 1 Outer", + "Start Callback 0 Inner", + "Start Callback 1 Inner", + "End Callback 0 Inner", + "End Callback 1 Inner", + "End Callback 0 Outer", + "End Callback 1 Outer", + }; + +#define REGISTER_CALLBACK(index) \ + at::addThreadLocalCallback( \ + at::RecordFunctionCallback( \ + [](const at::RecordFunction& fn) \ + -> std::unique_ptr { \ + EXPECT_EQ( \ + fmt::format("Start Callback {} {}", index, fn.name()), \ + expected_order[current_index++]); \ + return nullptr; \ + }, \ + [](const at::RecordFunction& fn, at::ObserverContext*) { \ + EXPECT_EQ( \ + fmt::format("End Callback {} {}", index, fn.name()), \ + expected_order[current_index++]); \ + }) \ + .scopes({at::RecordScope::FUNCTION})) + + REGISTER_CALLBACK(0); + REGISTER_CALLBACK(1); +#undef REGISTER_CALLBACK + + RECORD_FUNCTION("Outer", {}); + { RECORD_FUNCTION("Inner", {}); } + + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Make sure TLS migrates when tasks are launched. +TEST(RecordFunctionTest, ThreadMigration) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int call_count; + call_count = 0; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback( + [](const at::RecordFunction&) + -> std::unique_ptr { return nullptr; }, + [](const at::RecordFunction&, at::ObserverContext*) { + ++call_count; + }) + .scopes({at::RecordScope::FUNCTION})); + + EXPECT_EQ(call_count, 0); + + std::condition_variable cv; + std::mutex lock; + at::launch([&cv]() { + RECORD_FUNCTION("Test", {}); + cv.notify_all(); + }); + auto guard = std::unique_lock(lock); + cv.wait(guard, []{ return call_count > 0; }); + + EXPECT_EQ(call_count, 1); + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Test sampling logic and validate that callbacks fire at the correct times. +TEST(RecordFunctionTest, Sampling) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + static int sample_test_counter; + sample_test_counter = 0; + + uint32_t seed = 12345; + double p = 0.25; + + at::set_record_function_seed_for_testing(seed); + std::mt19937 generator; + generator.seed(seed); + auto dist = std::geometric_distribution(p); + + // Make sure we know which steps should fire. 
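+  // The hard-coded draws below depend on the standard library's
+  // std::geometric_distribution implementation; the ASSERT_EQ loop that
+  // follows verifies them before expected_counts is built from them.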
+ auto outcomes = std::array{7, 0, 0, 6, 2}; + for (const auto i : c10::irange(outcomes.size())) { + ASSERT_EQ(dist(generator), outcomes[i]); + } + + std::vector expected_counts; + int running_count = 0; + for (const auto i : c10::irange(outcomes.size())) { + for (const auto j : c10::irange(outcomes[i])) { + expected_counts.push_back(running_count); + } + expected_counts.push_back(++running_count); + } + + auto start_callback = + [](const at::RecordFunction& fn) -> std::unique_ptr { + ++sample_test_counter; + return nullptr; + }; + auto end_callback = [](const at::RecordFunction& fn, at::ObserverContext*) {}; + + auto handle = at::addThreadLocalCallback( + at::RecordFunctionCallback(start_callback, end_callback) + .samplingProb(p) + .scopes({at::RecordScope::FUNCTION})); + + for (const auto i : c10::irange(expected_counts.size())) { + RECORD_FUNCTION("Test", {}); + EXPECT_EQ(sample_test_counter, expected_counts[i]); + } + + at::removeCallback(handle); + ASSERT_FALSE(at::hasCallbacks()); +} + +// Validate sampling against a simple reference implementation for a complex set +// of registered callbacks. +TEST(RecordFunctionTest, MultipleCallbacks) { + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); + + uint32_t seed = 54321; + + std::mt19937 generator; + generator.seed(seed); + + auto sample = [&](double p) { + return (p < 1.0 ? std::geometric_distribution(p)(generator) : 0) + 1; + }; + + std::array probabilities{0.1, 1.0, 1.0, 0.3}; + std::array next_call; + std::array counts; + static std::array counts_from_rec_fn; + counts_from_rec_fn.fill(0); + + auto start_callback_0 = + [](const at::RecordFunction& fn) -> std::unique_ptr { + ++counts_from_rec_fn[0]; + return nullptr; + }; + + auto end_callback = [](const at::RecordFunction& fn, at::ObserverContext*) {}; + +#define REGISTER_CALLBACK(register_fn, index) \ + register_fn(at::RecordFunctionCallback( \ + [](const at::RecordFunction& fn) \ + -> std::unique_ptr { \ + ++counts_from_rec_fn[index]; \ + return nullptr; \ + }, \ + end_callback) \ + .samplingProb(probabilities[index]) \ + .scopes({at::RecordScope::FUNCTION})) + + REGISTER_CALLBACK(at::addGlobalCallback, 0); + REGISTER_CALLBACK(at::addGlobalCallback, 1); + REGISTER_CALLBACK(at::addThreadLocalCallback, 2); + + // The RecordFunction machinery will rebuild callbacks whenever a new observer + // is registered, so we need to wait until the last callback to seed the + // random number generator. 
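+  // Seeding with the same `seed` as the reference `generator` above keeps the
+  // sampler and the reference draws in lockstep, so counts[j] can be compared
+  // exactly with counts_from_rec_fn[j] in the loop below.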
+ at::set_record_function_seed_for_testing(seed); + REGISTER_CALLBACK(at::addThreadLocalCallback, 3); +#undef REGISTER_CALLBACK + + for (const auto i : c10::irange(probabilities.size())) { + next_call[i] = sample(probabilities[i]); + } + + for (const auto i : c10::irange(50)) { + RECORD_FUNCTION("Test", {}); + for (const auto j : c10::irange(next_call.size())) { + if (!(--next_call[j])) { + ++counts[j]; + next_call[j] = sample(probabilities[j]); + } + EXPECT_EQ(counts[j], counts_from_rec_fn[j]); + } + } + + at::clearCallbacks(); + ASSERT_FALSE(at::hasCallbacks()); +} diff --git a/test/cpp/tensorexpr/CMakeLists.txt b/test/cpp/tensorexpr/CMakeLists.txt index 8fc5a0a18331..7dff70630d3e 100644 --- a/test/cpp/tensorexpr/CMakeLists.txt +++ b/test/cpp/tensorexpr/CMakeLists.txt @@ -23,6 +23,7 @@ set(TENSOREXPR_TEST_SRCS ${TENSOREXPR_TEST_ROOT}/test_simplify.cpp ${TENSOREXPR_TEST_ROOT}/test_te_fuser_pass.cpp ${TENSOREXPR_TEST_ROOT}/test_type.cpp + ${TENSOREXPR_TEST_ROOT}/test_type_specializations.cpp ) if(USE_CUDA) diff --git a/test/cpp/tensorexpr/test_base.h b/test/cpp/tensorexpr/test_base.h index 4a8e667de3ac..510cad450012 100644 --- a/test/cpp/tensorexpr/test_base.h +++ b/test/cpp/tensorexpr/test_base.h @@ -78,7 +78,7 @@ static void assertAllEqual(const std::vector& vec, const T& val) { template static void assertAllEqual(const std::vector& v1, const std::vector& v2) { ASSERT_EQ(v1.size(), v2.size()); - for (int i = 0; i < v1.size(); i++) { + for (size_t i = 0; i < v1.size(); ++i) { ASSERT_EQ(v1[i], v2[i]); } } diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index 7cabee0ce55e..a7df88b8ab99 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -49,8 +49,7 @@ TEST(BoundsInference, _1) { // {{b, kStore, 0, 99}, {a, kLoad, 0, 99}} ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -73,8 +72,7 @@ TEST(BoundsInference, _2) { // {{b, kStore, 0, n-1}, {a, kLoad, 0, n-1}} VarHandle n("n", kInt); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -97,9 +95,8 @@ TEST(BoundsInference, _3) { // {{b, kStore, 0, 99}, {a, kLoad, 0, 109}} ExprHandle n(100); BufHandle a("a", {n + 10}, kFloat); - Tensor b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { - return a.load(i) * a.load(i + 10); - }); + Tensor b = Compute( + "b", {n}, [&](const VarHandle& i) { return a.load(i) * a.load(i + 10); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -126,14 +123,12 @@ TEST(BoundsInference, _4) { ExprHandle W(320); ExprHandle H(200); BufHandle a("a", {H, W}, kFloat); - Tensor b = Compute( - "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return x * y; - }); - Tensor c = Compute( - "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a.load(y, x) * b.load(y, x); - }); + Tensor b = Compute("b", {H, W}, [&](const VarHandle& y, const VarHandle& x) { + return x * y; + }); + Tensor c = Compute("c", {H, W}, [&](const VarHandle& y, const VarHandle& x) { + return a.load(y, x) * b.load(y, x); + }); 
LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); StmtPtr body = l.getLoopBodyFor(c); @@ -204,8 +199,7 @@ TEST(BoundsInference, _5) { // b[i_tail + (100/16)*16] = a[i_tail + (100/16)*16]; ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -258,12 +252,11 @@ TEST(BoundsInference, _6) { ExprHandle CW(32); ExprHandle CH(20); BufHandle a("a", {H, W}, kFloat); - Tensor b = Compute( - "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return x * y; - }); - Tensor c = Compute( - "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) { + Tensor b = Compute("b", {H, W}, [&](const VarHandle& y, const VarHandle& x) { + return x * y; + }); + Tensor c = + Compute("c", {CH, CW}, [&](const VarHandle& y, const VarHandle& x) { return a.load(y + 100, x + 100) * b.load(y * 2, x * 5); }); LoopNest l({c}); @@ -325,10 +318,9 @@ TEST(BoundsInference, _6) { TEST(BoundsInference, Adjacent) { ExprHandle H(6); BufHandle a("a", {20}, kFloat); - Tensor b = - Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor c = Compute( - "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); }); + Tensor b = Compute("b", {H}, [&](const VarHandle& x) { return a.load(x); }); + Tensor c = + Compute("c", {H}, [&](const VarHandle& x) { return a.load(x + H); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -383,12 +375,11 @@ TEST(BoundsInference, Adjacent) { TEST(BoundsInference, MultipleTopLoopLoad) { BufHandle a("a", {100}, kFloat); - Tensor b = - Compute("b", {{64, "x"}}, [&](const VarHandle& x) { return a.load(x); }); - Tensor c = Compute( - "c", {{32, "x"}}, [&](const VarHandle& x) { return a.load(x + 10); }); - Tensor d = Compute( - "d", {{96, "x"}}, [&](const VarHandle& x) { return a.load(x + 2); }); + Tensor b = Compute("b", {64}, [&](const VarHandle& x) { return a.load(x); }); + Tensor c = + Compute("c", {32}, [&](const VarHandle& x) { return a.load(x + 10); }); + Tensor d = + Compute("d", {96}, [&](const VarHandle& x) { return a.load(x + 2); }); LoopNest l({b, c, d}); auto bounds_info = inferBounds(l.root_stmt()); @@ -496,16 +487,15 @@ TEST(BoundsInference, MultipleTopLoopStore) { } TEST(BoundsInference, CacheReads) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 3); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -562,7 +552,7 @@ TEST(BoundsInference, CacheReads) { TEST(BoundsInference, Flattened) { Tensor b = Compute( "b", - {{3, "z"}, {4, "y"}, {5, "x"}}, + {3, 4, 5}, [&](const VarHandle& z, const VarHandle& y, const VarHandle& x) { return x * y + z; }); @@ -637,14 +627,12 @@ TEST(BoundsInference, GetPotentialHazards) { } TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { - Tensor A = 
Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return (i + 1) * (j + 1); - }); + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = Compute("B", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return (i + 1) * (j + 1); + }); LoopNest l({A, B}); @@ -663,12 +651,11 @@ TEST(BoundsInference, GetPotentialHazardsLoopNoHazard) { } TEST(BoundsInference, GetPotentialHazardsLoopCall) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{64, "i"}, {64, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {64, 64}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i, j) + 5; }); @@ -688,10 +675,9 @@ TEST(BoundsInference, GetPotentialHazardsLoopCall) { } TEST(BoundsInference, GetPotentialHazardsLoopSplit) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); LoopNest l({A}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) diff --git a/test/cpp/tensorexpr/test_conv.cpp b/test/cpp/tensorexpr/test_conv.cpp index 4f43e4f8621c..cf458af02095 100644 --- a/test/cpp/tensorexpr/test_conv.cpp +++ b/test/cpp/tensorexpr/test_conv.cpp @@ -191,7 +191,7 @@ TEST(Conv, Conv2D) { te::Tensor conv = te::Reduce( "conv", - {{N, "n"}, {K, "k"}, {OH, "oh"}, {OW, "ow"}}, + {N, K, OH, OW}, te::Sum(), // FIXME: We have to use a `std::vector` parameter here and then unpack // it, because we don't have an overload allowing for an arbitrary number @@ -211,7 +211,7 @@ TEST(Conv, Conv2D) { }, // FIXME: If you forget one of the reduction dims, you get a segfault. // Could that be caught by a verifier? - {{C, "c"}, {R, "r"}, {S, "s"}}); + {C, R, S}); // FIXME: It'd be nice to have a single header that pulls in things like // LoopNest, IRSimplifier, etc. 
diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index feca646a657c..cc945834d7a5 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -37,9 +37,9 @@ static void testCudaTestVectorAdd01_impl() { Tensor c = Compute( "c", { - {num_iter, "n"}, - {block_count, "b_id"}, - {block_size, "t_id"}, + num_iter, + block_count, + block_size, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { return a_buf.load(n, b_id, t_id) + b_buf.load(n, b_id, t_id); @@ -101,9 +101,9 @@ TEST(Cuda, Sigmoid_CUDA) { Tensor c = Compute( "c", { - {num_iter, "n"}, - {block_count, "b_id"}, - {block_size, "t_id"}, + num_iter, + block_count, + block_size, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { return sigmoid(sigmoid(a_buf.load(n, b_id, t_id))); @@ -163,12 +163,9 @@ TEST(Cuda, TestVectorAdd01_CUDA) { static void testCudaTestVectorAdd02_impl(int64_t N, int64_t block_size) { BufHandle a_buf("a", {N}, kFloat); BufHandle b_buf("b", {N}, kFloat); - Tensor c = Compute( - "c", - { - {N, "N"}, - }, - [&](const VarHandle& n) { return a_buf.load(n) + b_buf.load(n); }); + Tensor c = Compute("c", {N}, [&](const VarHandle& n) { + return a_buf.load(n) + b_buf.load(n); + }); LoopNest l({c}); ForPtr n_inner; std::vector loops = l.getLoopStmtsFor(c); @@ -222,7 +219,7 @@ TEST(Cuda, TestVectorAdd02_CUDA) { TEST(Cuda, HalfCast_CUDA) { auto half = ToDtype(); BufHandle a("a", {4}, half); - Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {4}, [&](const VarHandle& i) { return Cast::make(kFloat, a.load(i)); }); @@ -263,8 +260,8 @@ TEST(Cuda, DynamicShape2D_CUDA) { VarHandle n("n", kInt); BufHandle a("a", {m, n}, kFloat); BufHandle b("b", {m, n}, kFloat); - Tensor c = Compute( - "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor c = + Compute("c", {m, n}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); @@ -326,9 +323,9 @@ TEST(Cuda, TestRand01_CUDA) { Tensor c = Compute( "c", { - {num_iter, "n"}, - {block_count, "b_id"}, - {block_size, "t_id"}, + num_iter, + block_count, + block_size, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { return Intrinsics::make(IntrinsicsOp::kRand, kFloat); @@ -381,8 +378,8 @@ TEST(Cuda, DynamicShapeSplit_CUDA) { constexpr int64_t N = 4096; VarHandle n("n", kLong); BufHandle a("a", {n}, kFloat); - Tensor b = Compute( - "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); + Tensor b = + Compute("b", {n}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); LoopNest l({b}); ForPtr inner; std::vector loops = l.getLoopStmtsFor(b); @@ -914,15 +911,15 @@ TEST(Cuda, LocalMemReduce_1_CUDA) { TEST(Cuda, HalfSupport_CUDA) { auto half = ToDtype(); BufHandle a("a", {4}, half); - Tensor b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { + Tensor b = Compute("b", {4}, [&](const VarHandle& i) { return Cast::make(half, ExprHandle(2.0f) * a.load(i)); }); - Tensor c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { + Tensor c = Compute("c", {4}, [&](const VarHandle& i) { return Cast::make(kFloat, Cast::make(half, ExprHandle(42)) + b.load(i)); }); - Tensor d = Compute("d", {{4, "n"}}, [&](const VarHandle& i) { + Tensor d = Compute("d", {4}, [&](const VarHandle& i) { return Cast::make(half, c.load(i)); }); @@ -971,7 +968,7 @@ TEST(Cuda, HalfSupport_CUDA) { TEST(Cuda, HalfPropagation_CUDA) { auto half = ToDtype(); BufHandle a("a", {4}, 
half); - Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {4}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -987,8 +984,8 @@ TEST(Cuda, HalfPropagation_CUDA) { const std::string& verification_pattern = R"IR( # CHECK: for ( -# CHECK: float v = float(a[n]); -# CHECK: relu[n] = half(Max(v, 0.f +# CHECK: float v = float(a[i]); +# CHECK: relu[i] = half(Max(v, 0.f # CHECK: })IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -1020,7 +1017,7 @@ TEST(Cuda, UnusedHalfArgument_CUDA) { BufHandle a("a", {4}, kFloat); auto half = ToDtype(); BufHandle b("b", {4}, half); - Tensor relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + Tensor relu = Compute("relu", {4}, [&](const VarHandle& i) { return Max::make(a.load(i), ExprHandle(alloc(0)), true); }); @@ -1036,8 +1033,8 @@ TEST(Cuda, UnusedHalfArgument_CUDA) { const std::string& verification_pattern = R"IR( # CHECK: for ( -# CHECK: float v = a[n]; -# CHECK: relu[n] = Max(v, 0.f +# CHECK: float v = a[i]; +# CHECK: relu[i] = Max(v, 0.f # CHECK: })IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -1150,10 +1147,9 @@ TEST(Cuda, MaskBlockDim_CUDA) { int B_SIZE = 50; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1242,10 +1238,9 @@ TEST(Cuda, MaskThreadDim_CUDA) { int B_SIZE = 100; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i / 2) + b_buf.load(i); }); @@ -1336,10 +1331,9 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) { int B_SIZE = 50; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1429,10 +1423,9 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) { int B_SIZE = 50; BufHandle a_buf("a", {A_SIZE}, kFloat); BufHandle b_buf("b", {B_SIZE}, kFloat); - Tensor c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i) + 10; - }); - Tensor d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { + Tensor c = Compute( + "c", {A_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + 10; }); + Tensor d = Compute("d", {B_SIZE}, [&](const VarHandle& i) { return a_buf.load(i) + b_buf.load(i); }); @@ -1522,15 +1515,11 @@ TEST(Cuda, MaskMultiDim_CUDA) { BufHandle a_buf("a", {OUTER_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, - 
[&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); @@ -1651,15 +1640,11 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) { BufHandle a_buf("a", {OUTER_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); @@ -2062,15 +2047,11 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) { BufHandle a_buf("a", {OUTER_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); @@ -2192,15 +2173,11 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) { BufHandle a_buf("a", {OUTER_A_SIZE, A_SIZE}, kFloat); BufHandle b_buf("b", {OUTER_B_SIZE, B_SIZE}, kFloat); Tensor c = Compute( - "C", - {{OUTER_A_SIZE, "i"}, {A_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "C", {OUTER_A_SIZE, A_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return ExprHandle(2) * a_buf.load(i, j); }); Tensor d = Compute( - "D", - {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + "D", {OUTER_B_SIZE, B_SIZE}, [&](const VarHandle& i, const VarHandle& j) { return c.load(i, j * 2) + b_buf.load(i, j); }); diff --git a/test/cpp/tensorexpr/test_dynamic_shapes.cpp b/test/cpp/tensorexpr/test_dynamic_shapes.cpp index 46b55272ddf7..07b9872fb832 100644 --- a/test/cpp/tensorexpr/test_dynamic_shapes.cpp +++ b/test/cpp/tensorexpr/test_dynamic_shapes.cpp @@ -1,5 +1,7 @@ #include +#include +#include #include #include #include @@ -10,6 +12,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -626,5 +629,73 @@ TEST(DynamicShapes, GraphFromModel) { #endif } +TEST(DynamicShapes, MultiThreadedExecution) { +#ifdef TORCH_ENABLE_LLVM + const auto graph_template = R"IR( + graph(%x : Float(SS(-2), SS(-3), requires_grad=0, device=${device}), + %y : Float(SS(-2), SS(-3), requires_grad=0, device=${device}), + %SS_2 : int, + %SS_3 : int): + %3 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::tanh(%x) + %4 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::erf(%3) + %5 : Float(SS(-2), SS(-3), requires_grad=0, device=${device}) = aten::mul(%4, %y) + return (%5))IR"; + for (bool use_cuda : {false, true}) { + if (!torch::cuda::is_available() && use_cuda) { + continue; + } + auto device = use_cuda ? at::kCUDA : at::kCPU; + at::jit::TemplateEnv env; + env.s("device", use_cuda ? 
"cuda:0" : "cpu"); + const auto graph_string = format(graph_template, env); + std::shared_ptr graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + auto run_kernel = [&](int dim1, int dim2) { + auto a = + at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat)); + auto b = + at::rand({dim1, dim2}, at::TensorOptions(device).dtype(at::kFloat)); + + auto ref = at::mul(at::erf(at::tanh(a)), b); + + std::vector stack = fmap(std::vector({a, b})); + stack.emplace_back(dim1); + stack.emplace_back(dim2); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + // Run the kernel in parallel to ensure that the run() method calls in + // TensorExprKernel are not changing any state. + constexpr size_t kNumThreads = 4; + std::vector threads; + for (size_t id = 0; id < kNumThreads; ++id) { + threads.emplace_back(run_kernel, id + 5, id + 20); + } + for (auto& t : threads) { + t.join(); + } + } +#endif +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index 1b2a393fea7b..6a7a7e75704d 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ b/test/cpp/tensorexpr/test_expr.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,133 @@ TEST(Expr, BasicValueTest02) { ASSERT_EQ(eval.value(), -4.0f); } +TEST(Expr, IsChannelsLastContiguous) { + std::vector vars = { + VarHandle("var1", kLong), + VarHandle("var2", kLong), + VarHandle("var3", kLong), + VarHandle("var4", kLong), + VarHandle("var5", kLong)}; + + // { + // key: ndims, + // value: [ + // ... + // [dim_2, dim_1, ..., dim_n] + // ] + // } + using shapGenInfo = std::unordered_map>>; + + // { + // size: [ExprHandle_1, ExprHandle_2, ..., ExprHandle_n], + // strides: [ + // ... 
+ // [ExprHandle_x, ExprHandle_y, ..., ExprHandle_z] + // ] + // } + using shapeInfo = + std::pair, std::vector>>; + + std::vector dims = {3, 4, 5}; + + std::unordered_map> dims_expr_vec_conf = { + {3, std::vector(vars.begin(), vars.begin() + 2)}, + {4, std::vector(vars.begin(), vars.begin() + 3)}, + {5, std::vector(vars.begin(), vars.begin() + 4)}, + }; + + shapGenInfo channels_last_cont_shape_conf = { + {3, {{1, 2, 0}}}, {4, {{1, 3, 2, 0}}}, {5, {{1, 4, 3, 2, 0}}}}; + shapGenInfo channels_last_non_cont_shape_conf = { + {3, {{2, 1, 0}, {1, 0, 2}}}, + {4, {{3, 1, 2, 0}, {1, 2, 3, 0}, {1, 0, 2, 3}}}, + {5, {{4, 3, 2, 1, 0}, {1, 3, 2, 4, 0}, {1, 4, 3, 2, 0}}}}; + + shapGenInfo cont_shape_conf = { + {3, {{0, 1, 2}}}, {4, {{0, 1, 2, 3}}}, {5, {{0, 1, 2, 3, 4}}}}; + + auto shape_gen_fn = [dims_expr_vec_conf]( + int ndims, shapGenInfo shape_gen_info) -> shapeInfo { + auto dims_expr_vec = dims_expr_vec_conf.at(ndims); + std::vector> strides_expr_vec; + for (size_t i = 0; i < strides_expr_vec.size(); i++) { + strides_expr_vec[i].resize(ndims); + } + + auto stride_gen_fn = [](int indicator, ExprHandle a, ExprHandle b) { + if (indicator % 2 == 0) { + return a * b; + } else { + return b * a; + } + }; + + auto stride_order_vec = shape_gen_info.at(ndims); + for (size_t i = 0; i < strides_expr_vec.size(); i++) { + auto stride_order = stride_order_vec[i]; + + strides_expr_vec[i][stride_order[0]] = 1; + for (size_t j = 1; j < stride_order.size(); j++) { + auto cur_dim_idx = stride_order[j]; + auto adjacent_dim_idx = stride_order[j - 1]; + + strides_expr_vec[i][cur_dim_idx] = stride_gen_fn( + i, + dims_expr_vec[adjacent_dim_idx], + strides_expr_vec[i][adjacent_dim_idx]); + } + } + + return {dims_expr_vec, strides_expr_vec}; + }; + + auto check_channels_last_fn = [](int ndims, BufHandle buf_handle) -> bool { + if (ndims == 3) { + return buf_handle.is_channels_last_1d_contiguous(); + } else if (ndims == 4) { + return buf_handle.is_contiguous(at::MemoryFormat::ChannelsLast); + } else { + return buf_handle.is_contiguous(at::MemoryFormat::ChannelsLast3d); + } + }; + + // channels-last contigous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], channels_last_cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(check_channels_last_fn(dims[i], buf_handle), true); + } + } + + // channels-last non-contigous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], channels_last_non_cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(check_channels_last_fn(dims[i], buf_handle), false); + } + } + + // contiguous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(buf_handle.is_contiguous(), true); + } + } + + // non-contiguous + for (size_t i = 0; i < dims.size(); i++) { + auto shape_info = shape_gen_fn(dims[i], channels_last_cont_shape_conf); + for (size_t j = 0; j < shape_info.second.size(); j++) { + BufHandle buf_handle("a", shape_info.first, shape_info.second[j], kFloat); + ASSERT_EQ(buf_handle.is_contiguous(), false); + } + } +} + TEST(Expr, LetTest01) { VarHandle x("x", kFloat); ExprHandle body = ExprHandle(2.f) + (x * 
ExprHandle(3.f) + ExprHandle(4.f)); diff --git a/test/cpp/tensorexpr/test_external_calls.cpp b/test/cpp/tensorexpr/test_external_calls.cpp index b814ae344df3..88b75667b654 100644 --- a/test/cpp/tensorexpr/test_external_calls.cpp +++ b/test/cpp/tensorexpr/test_external_calls.cpp @@ -2,8 +2,17 @@ #include +#include +#include +#include +#include +#include + #include +#include +#include #include +#include #include #include #include @@ -11,6 +20,9 @@ #include #include +#include +#include + #include #include #include @@ -777,14 +789,14 @@ TEST(ExternalCall, ComputeInterop) { Tensor Input = Compute( "Input", - {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, + {1, 16, 32, 32}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, const VarHandle& w) { return FloatImm::make(5.0f); }); Tensor Weight = Compute( "Weight", - {{16, "n"}, {16, "c"}, {1, "kh"}, {1, "kw"}}, + {16, 16, 1, 1}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, @@ -806,7 +818,7 @@ TEST(ExternalCall, ComputeInterop) { {})); Tensor Result = Compute( "Result", - {{1, "n"}, {16, "c"}, {32, "h"}, {32, "w"}}, + {1, 16, 32, 32}, [&](const VarHandle& n, const VarHandle& c, const VarHandle& h, @@ -866,14 +878,12 @@ TEST(ExternalCall, Inlining) { BufHandle MatmulResultBuf("MatmulResult", {8, 8}, kFloat); - Tensor A = Compute( - "A", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return FloatImm::make(5.0f); - }); - Tensor B = Compute( - "B", {{8, "i"}, {8, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return FloatImm::make(4.0f); - }); + Tensor A = Compute("A", {8, 8}, [&](const VarHandle& i, const VarHandle& j) { + return FloatImm::make(5.0f); + }); + Tensor B = Compute("B", {8, 8}, [&](const VarHandle& i, const VarHandle& j) { + return FloatImm::make(4.0f); + }); Tensor MatmulResult = Tensor( MatmulResultBuf.node(), ExternalCall::make( @@ -881,14 +891,12 @@ TEST(ExternalCall, Inlining) { "nnc_aten_matmul", {BufHandle(A.buf()), BufHandle(B.buf())}, {})); - Tensor Result = Compute( - "Result", - {{8, "i"}, {8, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + Tensor Result = + Compute("Result", {8, 8}, [&](const VarHandle& i, const VarHandle& j) { return MatmulResult.load(i, j) + FloatImm::make(3.0f); }); - StmtPtr root_stmt = alloc(std::vector( + StmtPtr root_stmt = alloc(std::vector( {A.stmt(), B.stmt(), MatmulResult.stmt(), Result.stmt()})); LoopNest l(root_stmt, {Result.buf()}); @@ -927,5 +935,131 @@ TEST(ExternalCall, Inlining) { ASSERT_TRUE(at::allclose(nnc_result, ref)); } +TEST(ExternalCall, JitCustomFusionOp) { + const char* custom_op_schema_literal = + "nnc_custom::add_mul(Tensor a, Tensor b, Tensor c) -> Tensor"; + const char* external_func_name = "nnc_add_mul"; + + auto add_mul_lowering_func = + [external_func_name]( + const std::vector& inputs, + const std::vector& output_shape, + const std::vector& output_strides, + const c10::optional& output_type, + at::Device device) { + auto output_dtype = Dtype(*output_type); + torch::jit::tensorexpr::BufHandle result_buf( + "nnc_add_mul_res_buf", output_shape, output_dtype); + const torch::jit::tensorexpr::BufHandle& a = + c10::get(inputs[0]); + const torch::jit::tensorexpr::BufHandle& b = + c10::get(inputs[1]); + const torch::jit::tensorexpr::BufHandle& c = + c10::get(inputs[1]); + torch::jit::tensorexpr::StmtPtr s = + torch::jit::tensorexpr::ExternalCall::make( + result_buf, external_func_name, {a, b, c}, {}); + return Tensor(result_buf.node(), s); + }; + + auto add_mul_external_func = [](int64_t bufs_num, + void** 
buf_data, + int64_t* buf_ranks, + int64_t* buf_dims, + int64_t* buf_strides, + int8_t* buf_dtypes, + int64_t args_num, + int64_t* extra_args) {}; + + torch::jit::RegisterOperators reg({Operator( + custom_op_schema_literal, + [](const Node* node) -> Operation { + return [](Stack& _stack) { + auto a = std::move(peek(_stack, 0, 3)).toTensor(); + auto b = std::move(peek(_stack, 1, 3)).toTensor(); + auto c = std::move(peek(_stack, 2, 3)).toTensor(); + drop(_stack, 3); + auto result = (a + b) * c; + pack(_stack, std::move(result)); + return 0; + }; + }, + c10::AliasAnalysisKind::FROM_SCHEMA)}); + + auto& custom_operator_set = torch::jit::tensorexpr::getCustomOperatorSet(); + custom_operator_set.insert({custom_op_schema_literal}); + + auto& te_lowering_registry = torch::jit::tensorexpr::getNNCLoweringRegistry(); + te_lowering_registry.insert( + parseSchema(custom_op_schema_literal), add_mul_lowering_func); + + auto& te_nnc_func_registry = torch::jit::tensorexpr::getNNCFunctionRegistry(); + te_nnc_func_registry[external_func_name] = add_mul_external_func; + + std::string graph_string = R"IR( + graph(%a : Float(10, 20, strides=[20, 1], device=cpu), + %b : Float(10, 20, strides=[20, 1], device=cpu), + %c : Float(10, 20, strides=[20, 1], device=cpu)): + %res : Float(10, 20, strides=[20, 1], device=cpu) = nnc_custom::add_mul(%a, %b, %c) + return (%res))IR"; + + auto graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::string shape_compute_python_string = R"PY( + def computOutput(a: List[int], b: List[int], c: List[int]): + expandedSizes: List[int] = [] + dimsA = len(a) + dimsB = len(b) + dimsC = len(c) + ndim = max(dimsA, dimsB, dimsC) + for i in range(ndim): + offset = ndim - 1 - i + dimA = dimsA - 1 - offset + dimB = dimsB - 1 - offset + dimC = dimsC - 1 - offset + sizeA = a[dimA] if (dimA >= 0) else 1 + sizeB = b[dimB] if (dimB >= 0) else 1 + sizeC = a[dimC] if (dimC >= 0) else 1 + + if sizeA != sizeB and sizeB != sizeC and sizeA != 1 and sizeB != 1 and sizeC != 1: + # TODO: only assertion error is bound in C++ compilation right now + raise AssertionError( + "The size of tensor a {} must match the size of tensor b (" + "{} and c {}) at non-singleton dimension {}".format(sizeA, sizeB, sizeC, i) + ) + + expandedSizes.append(max(sizeA, sizeB, sizeC)) + + return expandedSizes + )PY"; + auto cu_ptr = torch::jit::compile(shape_compute_python_string); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu_ptr->get_function("computOutput"); + ASSERT_TRUE(gf); + +#ifdef TORCH_ENABLE_LLVM + auto static_graph_case = graph->copy(); + FuseTensorExprs(static_graph_case, 1); + torch::jit::testing::FileCheck() + .check("prim::TensorExprGroup_") + ->check("nnc_custom::add_mul") + ->run(*static_graph_case); + + auto dynamic_graph_case = graph->copy(); + auto custom_op = torch::jit::getOperatorForLiteral(custom_op_schema_literal); + ASSERT_TRUE(custom_op); + torch::jit::RegisterShapeComputeGraphForSchema( + custom_op->schema(), gf->graph()); + FuseTensorExprs(dynamic_graph_case, 1, false, true); + torch::jit::testing::FileCheck() + .check("prim::TensorExprGroup_") + ->check("nnc_custom::add_mul") + ->run(*dynamic_graph_case); +#else + torch::jit::testing::FileCheck().check("nnc_custom::add_mul")->run(*graph); +#endif +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_ir_printer.cpp b/test/cpp/tensorexpr/test_ir_printer.cpp index 820f12689acc..2c98e093afcc 100644 --- a/test/cpp/tensorexpr/test_ir_printer.cpp +++ 
b/test/cpp/tensorexpr/test_ir_printer.cpp @@ -53,42 +53,36 @@ TEST(IRPrinter, FunctionName) { int N = 20; Tensor producer = Compute( - "producer", - {{M, "m"}, {N, "n"}}, - [&](const ExprHandle& m, const ExprHandle& n) { return m * n; }); + "producer", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return m * n; + }); Tensor chunk_0 = Compute( - "chunk", - {{M, "m"}, {N / 2, "n"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + "chunk_0", {M, N / 2}, [&](const ExprHandle& m, const ExprHandle& n) { return producer.load(m, n); }); Tensor chunk_1 = Compute( - "chunk", - {{M, "m"}, {N / 2, "n"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + "chunk_1", {M, N / 2}, [&](const ExprHandle& m, const ExprHandle& n) { return producer.load(m, n + ExprHandle(N / 2)); }); Tensor consumer = Compute( - "consumer", - {{M, "i"}, {N / 2, "j"}}, - [&](const ExprHandle& i, const ExprHandle& j) { + "consumer", {M, N / 2}, [&](const ExprHandle& i, const ExprHandle& j) { return i * chunk_1.load(i, j); }); LoopNest l({chunk_0, chunk_1, consumer}); - auto body = l.root_stmt(); + auto body = LoopNest::sanitizeNames(l.root_stmt()); std::stringstream ss; ss << *body; const std::string& verification_pattern = R"IR( - # CHECK: for (int i - # CHECK: for (int j - # CHECK: consumer[i, j] = i * (chunk_1[i, j])IR"; + # CHECK: for (int i_2 + # CHECK: for (int j_2 + # CHECK: consumer[i_2, j_2] = i_2 * (chunk_1[i_2, j_2])IR"; torch::jit::testing::FileCheck().run(verification_pattern, ss.str()); } diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 2ae99ef58b22..fc755ed6caae 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -647,6 +647,7 @@ TEST_F(Kernel, CatWithEmptyInputs) { } TEST_F(Kernel, CatWoConditionals) { + bool old_cat_wo_conditionals = getCatWoConditionals(); getCatWoConditionals() = true; const auto graph_string = R"IR( graph(%a : Float(5, 3, 2, strides=[6, 2, 1], device=cpu), @@ -702,7 +703,7 @@ TEST_F(Kernel, CatWoConditionals) { for (const auto i : c10::irange(num_el)) { CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); } - getCatWoConditionals() = false; + getCatWoConditionals() = old_cat_wo_conditionals; } TEST_F(Kernel, OptimizeConditionals) { @@ -1597,12 +1598,14 @@ TEST_F(Kernel, CodegenInspection) { Tensor lowerNanToNum( const std::vector& inputs, const std::vector& outputShape, + const std::vector& outputStrides, const c10::optional& outputType, at::Device device) { auto input_buf = c10::get(inputs[0]); auto e = Compute( "custom_nan_to_num", - c10::fmap(outputShape), + outputShape, + outputStrides, [&](const std::vector& axes) { std::vector indices(axes.begin(), axes.end()); auto load = input_buf.load(indices); @@ -1836,5 +1839,228 @@ graph(%x : int, ASSERT_TRUE(at::equal(stack[3].toTensor(), xt * yt)); } +TEST_F(Kernel, FuseLoopsWithVariableBounds) { +#ifdef TORCH_ENABLE_LLVM + bool old_cat_wo_conditionals = getCatWoConditionals(); + getCatWoConditionals() = true; + const auto graph_string = R"IR( + graph(%a : Float(SS(-2), 3, SS(-3), requires_grad=0, device=cpu), + %b : Float(SS(-2), 7, SS(-3), requires_grad=0, device=cpu), + %c : Float(SS(-2), 9, SS(-3), requires_grad=0, device=cpu), + %SS_2 : int, + %SS_3 : int): + %dim : int = prim::Constant[value=1]() + %inputs : Tensor[] = prim::ListConstruct(%a, %b, %c) + %r : Float(SS(-2), 19, SS(-3), requires_grad=0, device=cpu) = aten::cat(%inputs, %dim) # new size: [5,19,2] + return (%r))IR"; + std::shared_ptr graph = std::make_shared(); + 
torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->inputs().at(2)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + std::ostringstream oss; + oss << *kernel.getCodeGenStmt(); + const std::string& verification_pattern = + R"IR( +# CHECK: for (int64_t i +# CHECK-NEXT: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK-NOT: for (int64_t i + )IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + auto run_kernel = [&](int dim1, int dim2) { + auto a = + at::rand({dim1, 3, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto b = + at::rand({dim1, 7, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto c = + at::rand({dim1, 9, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + + auto ref = at::cat({a, b, c}, 1); + + std::vector stack = + fmap(std::vector({a, b, c})); + stack.emplace_back(dim1); + stack.emplace_back(dim2); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + run_kernel(10, 20); + getCatWoConditionals() = old_cat_wo_conditionals; +#endif +} + +TEST_F(Kernel, FuseLoopsWithVariableConcatDim) { +#ifdef TORCH_ENABLE_LLVM + bool old_cat_wo_conditionals = getCatWoConditionals(); + getCatWoConditionals() = true; + const auto graph_string = R"IR( + graph(%a : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %b : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %c : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %SS_2 : int, + %SS_3 : int, + %SS_4 : int, + %SS_5 : int): + %dim : int = prim::Constant[value=1]() + %inputs : Tensor[] = prim::ListConstruct(%a, %b, %c) + %r : Float(SS(-2), SS(-5), SS(-3), requires_grad=0, device=cpu) = aten::cat(%inputs, %dim) # new size: [5,19,2] + return (%r))IR"; + std::shared_ptr graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3, -4, -5}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->inputs().at(2)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + std::ostringstream oss; + oss << *kernel.getCodeGenStmt(); + const std::string& verification_pattern = + R"IR( +# CHECK: for (int64_t i +# CHECK-NEXT: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK-NOT: for (int64_t i + )IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + auto run_kernel = [&](int dim1, int dim2, int dim3) { + auto a = + at::rand({dim1, dim3, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto b = + at::rand({dim1, dim3, dim2}, 
at::TensorOptions(kCPU).dtype(at::kFloat)); + auto c = + at::rand({dim1, dim3, dim2}, at::TensorOptions(kCPU).dtype(at::kFloat)); + + auto ref = at::cat({a, b, c}, 1); + + std::vector stack = + fmap(std::vector({a, b, c})); + stack.emplace_back(dim1); + stack.emplace_back(dim2); + stack.emplace_back(dim3); + stack.emplace_back(3 * dim3); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + run_kernel(10, 20, 15); + getCatWoConditionals() = old_cat_wo_conditionals; +#endif +} + +TEST_F(Kernel, DoNotFuseLoopsWithMismatchingVariableDims) { +#ifdef TORCH_ENABLE_LLVM + bool old_cat_wo_conditionals = getCatWoConditionals(); + getCatWoConditionals() = true; + const auto graph_string = R"IR( + graph(%a : Float(SS(-2), SS(-4), SS(-3), requires_grad=0, device=cpu), + %b : Float(SS(-2), SS(-5), SS(-3), requires_grad=0, device=cpu), + %SS_2 : int, + %SS_3 : int, + %SS_4 : int, + %SS_5 : int, + %SS_6 : int): + %dim : int = prim::Constant[value=1]() + %inputs : Tensor[] = prim::ListConstruct(%a, %b) + %r : Float(SS(-2), SS(-6), SS(-3), requires_grad=0, device=cpu) = aten::cat(%inputs, %dim) # new size: [5,19,2] + return (%r))IR"; + std::shared_ptr graph = std::make_shared(); + torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3, -4, -5, -6}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + std::ostringstream oss; + oss << *kernel.getCodeGenStmt(); + const std::string& verification_pattern = + R"IR( +# CHECK: for (int64_t i +# CHECK-NEXT: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK: for (int64_t j +# CHECK-NEXT: for (int64_t k +# CHECK-NOT: for (int64_t j +# CHECK-NOT: for (int64_t i + )IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + auto run_kernel = [&](int dim2, int dim3, int dim4, int dim5) { + auto a = + at::rand({dim2, dim4, dim3}, at::TensorOptions(kCPU).dtype(at::kFloat)); + auto b = + at::rand({dim2, dim5, dim3}, at::TensorOptions(kCPU).dtype(at::kFloat)); + + auto ref = at::cat({a, b}, 1); + + std::vector stack = fmap(std::vector({a, b})); + stack.emplace_back(dim2); + stack.emplace_back(dim3); + stack.emplace_back(dim4); + stack.emplace_back(dim5); + stack.emplace_back(dim4 + dim5); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + run_kernel(10, 20, 15, 8); + getCatWoConditionals() = old_cat_wo_conditionals; +#endif +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 52464a6d0afa..520ae6301ceb 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -584,8 +584,7 @@ DOUBLE_INTRINSICS_TEST(lgamma, 4) TEST(LLVM, VectorizerLoadStoreTest) { BufHandle a("A", {1}, kInt); - Tensor c = - Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor c = Compute("c", {4}, [&](const VarHandle& i) { return a.load(i); }); BufHandle c_buf(c.buf()); LoopNest l({c}); @@ -606,7 +605,7 @@ TEST(LLVM, VectorizerLoadStoreTest) { TEST(LLVM, VectorizeBitCast) { BufHandle a("A", {128}, kInt); - Tensor c = Compute("c", {{128, "i"}}, 
[&](const VarHandle& i) { + Tensor c = Compute("c", {128}, [&](const VarHandle& i) { return bitcast(a.load(i)); }); @@ -1186,9 +1185,8 @@ TEST(LLVM, StoreFloat) { TEST(LLVM, SimpleMath01) { const int N = 1024; - Tensor tensor = Compute("f", {{N, "i"}}, [](const VarHandle& i) { - return cast(i * i + 1); - }); + Tensor tensor = Compute( + "f", {N}, [](const VarHandle& i) { return cast(i * i + 1); }); LoopNest l({tensor}); StmtPtr stmt = l.root_stmt(); BufHandle f_buf(tensor.buf()); @@ -1209,9 +1207,8 @@ TEST(LLVM, ComputeMul) { const int N = 1024; BufHandle a("a", {N}, kFloat); BufHandle b("b", {N}, kFloat); - Tensor c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { - return a.load(i) * b.load(i); - }); + Tensor c = Compute( + "c", {N}, [&](const VarHandle& i) { return a.load(i) * b.load(i); }); BufHandle c_buf(c.buf()); LoopNest l({c}); @@ -1232,10 +1229,9 @@ TEST(LLVM, BroadcastAdd) { const int N = 1024; BufHandle a("a", {M, N}, kFloat); BufHandle b("b", {N}, kFloat); - Tensor c = Compute( - "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return a.load(i, j) + b.load(j); - }); + Tensor c = Compute("c", {M, N}, [&](const VarHandle& i, const VarHandle& j) { + return a.load(i, j) + b.load(j); + }); BufHandle c_buf(c.buf()); LoopNest l({c}); @@ -1333,9 +1329,8 @@ TEST(LLVM, TensorDynamicShapeAdd) { VarHandle n("n", kInt); BufHandle a("a", {n}, kFloat); BufHandle b("b", {n}, kFloat); - Tensor c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { - return a.load(i) + b.load(i); - }); + Tensor c = Compute( + "c", {n}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); LoopNest l({c}); StmtPtr s = l.root_stmt(); LLVMCodeGen cg(s, {a, b, c, n}); @@ -1356,8 +1351,8 @@ TEST(LLVM, DynamicShape2D) { VarHandle n("n", kInt); BufHandle a("a", {m, n}, kFloat); BufHandle b("b", {m, n}, kFloat); - Tensor c = Compute( - "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor c = + Compute("c", {m, n}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); @@ -1386,7 +1381,7 @@ TEST(LLVM, EmptyStmt) { TEST(LLVM, EliminatedStmt) { BufHandle a("a", {1}, kFloat); - Tensor c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); + Tensor c = Compute("c", {0}, [&](const VarHandle& m) { return m; }); LoopNest l({c}); l.prepareForCodegen(); @@ -1405,10 +1400,7 @@ TEST(LLVM, SimpleReduction) { BufHandle a("a", {1, M, N}, kFloat); - // TODO: why doesn't implicit vector work? - std::vector axis = {DimArg(1)}; - std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", {1}, Sum(), a, {M, N}); LoopNest loop({b}); loop.prepareForCodegen(); @@ -1442,10 +1434,7 @@ TEST(LLVM, RFactorReduction) { BufHandle a("a", {1, M, N}, kFloat); - // TODO: why doesn't implicit vector work? 
- std::vector axis = {DimArg(1)}; - std::vector reduce_axis = {DimArg(M), DimArg(N)}; - Tensor b = Reduce("sum", axis, Sum(), a, reduce_axis); + Tensor b = Reduce("sum", {1}, Sum(), a, {M, N}); LoopNest loop({b}); std::vector loops = loop.getLoopStmtsFor(b); @@ -1490,7 +1479,7 @@ TEST(LLVM, RFactorVectorizedReduction) { BufHandle a("a", {1, M, N}, kFloat); - Tensor b = Reduce("sum", {{1, "K"}}, Sum(), a, {{M, "M"}, {N, "N"}}); + Tensor b = Reduce("sum", {1}, Sum(), a, {M, N}); LoopNest loopnest({b}); std::vector loops = loopnest.getLoopStmtsFor(b); // Reorder n and m loops @@ -1536,10 +1525,9 @@ static void testSimpleParallel() { // parallel or sequential. const int M = 4; const int N = 6; - Tensor f = Compute( - "f", {{M, "m"}, {N, "n"}}, [](const VarHandle& m, const VarHandle& n) { - return cast(m + n); - }); + Tensor f = Compute("f", {M, N}, [](const VarHandle& m, const VarHandle& n) { + return cast(m + n); + }); LoopNest loop_nest({f}); auto const& loops = loop_nest.getLoopStmtsFor(f); ForPtr m = loops[0]; @@ -1588,20 +1576,14 @@ TEST(LLVM, CompositeParallel) { for (const auto test_cfg : c10::irange(test_count)) { int M = 5; int N = 7; - Tensor t1 = - Compute("t1", {{M, "M"}}, [](const VarHandle& m) { return m + 1.f; }); - Tensor t2 = - Compute("t2", {{N, "N"}}, [](const VarHandle& n) { return n + 2.f; }); - Tensor t3 = Compute( - "t3", - {{M, "M"}, {N, "N"}}, - [=](const VarHandle& m, const VarHandle& n) { + Tensor t1 = Compute("t1", {M}, [](const VarHandle& m) { return m + 1.f; }); + Tensor t2 = Compute("t2", {N}, [](const VarHandle& n) { return n + 2.f; }); + Tensor t3 = + Compute("t3", {M, N}, [=](const VarHandle& m, const VarHandle& n) { return t1.load(m) * t2.load(n); }); - Tensor t4 = Compute( - "t4", - {{M, "M"}, {N, "N"}}, - [=](const VarHandle& m, const VarHandle& n) { + Tensor t4 = + Compute("t4", {M, N}, [=](const VarHandle& m, const VarHandle& n) { return t3.load(m, n) + m + n; }); LoopNest loop_nest({t4}, {t1, t2, t3, t4}); @@ -1657,12 +1639,12 @@ TEST(LLVM, VectorizedGEMM) { BufHandle BP("B", {K, N}, kFloat); Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); LoopNest loop({CT}); { @@ -1735,10 +1717,9 @@ TEST(LLVM, CallRaw) { VarHandle N("N", kInt); BufHandle a("a", {M, N}, kFloat); BufHandle b("b", {N}, kFloat); - Tensor c = Compute( - "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return a.load(i, j) + b.load(j); - }); + Tensor c = Compute("c", {M, N}, [&](const VarHandle& i, const VarHandle& j) { + return a.load(i, j) + b.load(j); + }); LoopNest l({c}); l.prepareForCodegen(); @@ -1776,7 +1757,7 @@ TEST(LLVM, CustomTarget) { BufHandle a("a", {M}, kFloat); BufHandle b("b", {M}, kFloat); BufHandle c("c", {M}, kFloat); - Tensor d = Compute("d", {{M, "m"}}, [&](const VarHandle& m) { + Tensor d = Compute("d", {M}, [&](const VarHandle& m) { return a.load(m) * b.load(m) + c.load(m); }); LoopNest nest({d}); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index becf3bdffbac..f2609b0f4166 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -41,8 +41,8 @@ void checkExprIR(const ExprHandle& e, const std::string& pattern) { } TEST(LoopNest, ExprSimple01) { - Tensor tensor = Compute( - "f", {{16, "X"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {16, 5}, [](const VarHandle& 
x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -53,8 +53,8 @@ TEST(LoopNest, ExprSimple01) { } TEST(LoopNest, ExprLower01) { - Tensor tensor = Compute( - "f", {{16, "x"}, {5, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {16, 5}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -69,7 +69,7 @@ TEST(LoopNest, ExprSimple02) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor tensor = Compute("f", {{26, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {26, 5}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -83,10 +83,10 @@ TEST(LoopNest, ExprSimple02) { { // Compare to a reference loop structure structure. - VarHandle x_outer("x_outer", kInt); - VarHandle x_inner("x_inner", kInt); - VarHandle y("y", kInt); - VarHandle x_tail("x_tail", kInt); + VarHandle x_outer("i_outer", kInt); + VarHandle x_inner("i_inner", kInt); + VarHandle y("i", kInt); + VarHandle x_tail("i_tail", kInt); BufHandle f("f", {26, 5}, kFloat); ExprHandle x_1 = x_outer * 4 + x_inner; ExprHandle x_outer_end = (ExprHandle(26) - 0) / 4; @@ -162,7 +162,7 @@ TEST(LoopNest, ExprSliceHeadWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -185,7 +185,7 @@ TEST(LoopNest, ExprSliceTailWithLoopOptions) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -217,7 +217,7 @@ TEST(LoopNest, ExprSliceHeadWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -237,7 +237,7 @@ TEST(LoopNest, ExprSliceHeadWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -257,7 +257,7 @@ TEST(LoopNest, ExprSliceHead) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -279,7 +279,7 @@ TEST(LoopNest, ExprSliceHeadWithNonZeroStart) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -305,7 +305,7 @@ TEST(LoopNest, ExprSliceTailWhenFactorEqualsSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // 
NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -327,7 +327,7 @@ TEST(LoopNest, ExprSliceTailWhenFactorLargerThanSize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -347,7 +347,7 @@ TEST(LoopNest, ExprSliceTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr head; @@ -372,7 +372,7 @@ TEST(LoopNest, ExprSplitAndSlice) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{100, "x"}}, func); + Tensor tensor = Compute("f", {100}, func); LoopNest l({tensor}); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -424,7 +424,7 @@ TEST(LoopNest, ExprSliceAndNormalize) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{10, "x"}}, func); + Tensor tensor = Compute("f", {10}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -455,7 +455,7 @@ TEST(LoopNest, ExprSliceWithVariableDimension) { const std::vector>& expected_for_ranges) { VarHandle dim("dim", kInt); Tensor tensor = - Compute("f", {{dim, "x"}}, [](const ExprHandle& x) { return x; }); + Compute("f", {dim}, [](const ExprHandle& x) { return x; }); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); @@ -492,7 +492,7 @@ TEST(LoopNest, ExprSplitWithTail) { auto func = [](const ExprHandle& x) { return ExprHandle(1.0f) + cast(x); }; - Tensor tensor = Compute("f", {{199, "x"}}, func); + Tensor tensor = Compute("f", {199}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) @@ -521,7 +521,7 @@ TEST(LoopNest, ExprSplitWithTailNone) { auto func = [](const ExprHandle& x, const ExprHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }; - Tensor tensor = Compute("f", {{24, "x"}, {5, "y"}}, func); + Tensor tensor = Compute("f", {24, 5}, func); LoopNest l({tensor}); std::vector loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::splitWithTail(loops[0], 4); @@ -534,10 +534,10 @@ TEST(LoopNest, ExprSplitWithTailNone) { { // Compare to a reference loop structure structure. 
- VarHandle x_outer("x_outer", kInt); - VarHandle x_inner("x_inner", kInt); - VarHandle y("y", kInt); - VarHandle x_tail("x_tail", kInt); + VarHandle x_outer("i_outer", kInt); + VarHandle x_inner("i_inner", kInt); + VarHandle y("i", kInt); + VarHandle x_tail("i_tail", kInt); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks,cppcoreguidelines-avoid-magic-numbers) BufHandle f("f", {24, 5}, kFloat); ExprHandle x_1 = x_outer * 4 + x_inner; @@ -579,8 +579,8 @@ TEST(LoopNest, ExprSplitWithMask01) { const int N = 5; BufHandle a_buf("a", {M, N}, kFloat); BufHandle b_buf("b", {M, N}, kFloat); - Tensor tensor = Compute( - "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { + Tensor tensor = + Compute("f", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f; }); @@ -613,7 +613,7 @@ TEST(LoopNest, ExprSplitWithMaskRepeatedNoMask) { const int M = 64; BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {M}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); @@ -697,8 +697,8 @@ TEST(LoopNest, TileSimple) { const int M = 64, N = 64; BufHandle a_buf("a", {M, N}, kFloat); BufHandle b_buf("b", {M, N}, kFloat); - Tensor tensor = Compute( - "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { + Tensor tensor = + Compute("f", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); @@ -710,13 +710,13 @@ TEST(LoopNest, TileSimple) { // IR check StmtPtr stmt = IRSimplifier::simplify(l.root_stmt()); checkIR(stmt, R"IR( -# CHECK: for (int m_outer -# CHECK: for (int n_outer -# CHECK: for (int m_inner -# CHECK: for (int n_inner +# CHECK: for (int i_outer +# CHECK: for (int i_outer_1 +# CHECK: for (int i_inner +# CHECK: for (int i_inner_1 # CHECK: f[ -# CHECK-NOT: for (int n_tail -# CHECK-NOT: for (int m_tail)IR"); +# CHECK-NOT: for (int i_tail +# CHECK-NOT: for (int i_tail)IR"); // Correctness check PaddedBuffer a_v(M, N, "a"); @@ -742,8 +742,8 @@ TEST(LoopNest, TileWithTails) { const int M = 64, N = 64; BufHandle a_buf("a", {M, N}, kFloat); BufHandle b_buf("b", {M, N}, kFloat); - Tensor tensor = Compute( - "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { + Tensor tensor = + Compute("f", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return a_buf.load({m, n}) + b_buf.load({m, n}) + 1.0f; }); @@ -755,14 +755,14 @@ TEST(LoopNest, TileWithTails) { // IR check StmtPtr stmt = IRSimplifier::simplify(l.root_stmt()); checkIR(stmt, R"IR( -# CHECK: for (int m_outer -# CHECK: for (int n_outer -# CHECK: for (int m_inner -# CHECK: for (int n_inner +# CHECK: for (int i_outer +# CHECK: for (int i_outer_1 +# CHECK: for (int i_inner +# CHECK: for (int i_inner_1 # CHECK: f[ -# CHECK: for (int m_inner +# CHECK: for (int i_inner # CHECK: f[ -# CHECK: for (int m_tail)IR"); +# CHECK: for (int i_tail)IR"); // Correctness check PaddedBuffer a_v(M, N, "a"); @@ -790,7 +790,7 @@ TEST(LoopNest, TileInMiddle) { BufHandle b_buf("b", {M, N, L, K}, kFloat); Tensor tensor = Compute( "f", - {{M, "m"}, {N, "n"}, {L, "l"}, {K, "k"}}, + {M, N, L, K}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& l, @@ -807,18 +807,18 @@ TEST(LoopNest, TileInMiddle) { // IR check StmtPtr stmt = IRSimplifier::simplify(nest.root_stmt()); checkIR(stmt, R"IR( -# CHECK: for (int m -# CHECK: for (int n_outer -# CHECK: for 
(int l_outer -# CHECK: for (int n_inner -# CHECK: for (int l_inner -# CHECK: for (int k +# CHECK: for (int i +# CHECK: for (int i_outer +# CHECK: for (int i_outer_1 +# CHECK: for (int i_inner +# CHECK: for (int i_inner_1 +# CHECK: for (int i_1 # CHECK: f[ -# CHECK: for (int l_tail -# CHECK: for (int n_inner -# CHECK: for (int k +# CHECK: for (int i_tail_1 +# CHECK: for (int i_inner_1 +# CHECK: for (int i_1 # CHECK: f[ -# CHECK: for (int n_tail)IR"); +# CHECK: for (int i_tail)IR"); // Correctness check PaddedBuffer a_v(M, N, L, K, "a"); @@ -847,7 +847,7 @@ TEST(LoopNest, SplitWithTailWithLoopOptions) { const int M = 21; BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {M}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -877,7 +877,7 @@ TEST(LoopNest, SplitWithMaskWithLoopOptions) { const int M = 21; BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M}, kFloat); - Tensor tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + Tensor tensor = Compute("f", {M}, [&](const ExprHandle& m) { return a_buf.load(m) + b_buf.load(m) + 1.0f; }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -905,7 +905,7 @@ TEST(LoopNest, ScheduleBroadcastAddBuffer) { BufHandle b_buf("b", {N, K}, kFloat); Tensor c = Compute( "broadcast_add", - {{M, "m"}, {N, "n"}, {K, "k"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); @@ -953,13 +953,13 @@ TEST(LoopNest, ScheduleFunctionCall01) { BufHandle b_buf("b", {N, K}, kFloat); Tensor c = Compute( "broadcast_add", - {{M, "m"}, {N, "n"}, {K, "k"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor d = Compute( "d", - {{M, "m"}, {N, "n"}, {K, "k"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c.load(m, n, k) + 1; }); @@ -1012,13 +1012,13 @@ TEST(LoopNest, ScheduleInlineSimple) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); @@ -1092,19 +1092,19 @@ void InlineFunc01Helper(const std::vector& inline_order) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); Tensor z = Compute( "z", - {{M, "m3"}, {N, "n3"}, {K, "k3"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + y.load(m, n, k); }); @@ -1171,7 +1171,7 @@ void InlineFunc01Helper(const std::vector& inline_order) { if (inline_order.size() == 2) { Tensor z2 = Compute( "z", - {{M, "m3"}, {N, "n3"}, {K, "k3"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k) + (c_buf.load(m, n) * d_buf.load(m, k) + 
@@ -1206,13 +1206,13 @@ TEST(LoopNest, ScheduleInlineRandom) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + x.load(m, n, k); }); @@ -1226,11 +1226,11 @@ TEST(LoopNest, ScheduleInlineRandom) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) # CHECK: int x = rand(); -# CHECK: y[m2, n2, k2] = 2 * (x % 5);)IR"); +# CHECK: y[i, i_1, i_2] = 2 * (x % 5);)IR"); } // Make sure we don't cache random vars that are not being inlined. @@ -1241,13 +1241,13 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + Intrinsics::make(kRand, kInt) + Intrinsics::make(kRand, kInt); @@ -1262,10 +1262,10 @@ TEST(LoopNest, ScheduleInlineRandomUnrelated) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) -# CHECK: y[m2, n2, k2] = ((k2 * m2) * n2 + (rand())) + (rand());)IR"); +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) +# CHECK: y[i, i_1, i_2] = ((i * i_1) * i_2 + (rand())) + (rand());)IR"); } // Make sure we generate the right number of random values == the dimensionality @@ -1275,12 +1275,12 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { const int N = 5; const int K = 6; - Tensor x = Compute("x", {{M, "m1"}}, [&](const VarHandle& m) { + Tensor x = Compute("x", {M}, [&](const VarHandle& m) { return Mod::make(Intrinsics::make(kRand, kInt), 5); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m) + x.load(m); }); @@ -1294,11 +1294,11 @@ TEST(LoopNest, ScheduleInlineRandomLowerDimensions) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) +# CHECK: for (int i = 0; i < 4; i++) # CHECK: int x = rand(); -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) -# CHECK: y[m2, n2, k2] = 2 * (x % 5);)IR"); +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) +# CHECK: y[i, i_1, i_2] = 2 * (x % 5);)IR"); } // Make sure we don't screw up intrinsics thinking they're rand. 
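Across these hunks the change is mechanical: Compute (and, further down, Reduce) now takes bare dimension extents such as {M, N, K} instead of {extent, "name"} pairs, so the lowered loops carry compiler-generated index names (i, i_1, i_2, ...) and every FileCheck pattern is updated to match. A minimal sketch of the new call style, written as a hypothetical test that assumes the includes and helpers already present in test_loopnest.cpp; the test name is illustrative and not part of the patch:

// Illustrative only: dims-only Compute, lowered and printed.
TEST(LoopNest, SketchDimsOnlyCompute) {
  const int M = 4, N = 5, K = 6;
  BufHandle a_buf("a", {M, N}, kFloat);
  BufHandle b_buf("b", {N, K}, kFloat);

  // Extents only; no per-axis name strings.
  Tensor x = Compute(
      "x",
      {M, N, K},
      [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) {
        return a_buf.load(m, n) * b_buf.load(n, k);
      });

  LoopNest l({x});
  l.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(l.root_stmt());

  // The printed loops use generated index names (e.g. i, i_1, i_2),
  // which is what the updated CHECK patterns in this file key on.
  std::ostringstream oss;
  oss << *s;
}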
@@ -1311,13 +1311,13 @@ TEST(LoopNest, ScheduleInlineIntrinsics) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kSqrt, x.load(m, n, k)); }); @@ -1369,13 +1369,13 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kRand, kFloat); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return Intrinsics::make(kSqrt, x.load(m, n, k)); }); @@ -1387,20 +1387,18 @@ TEST(LoopNest, ScheduleInlineRandWithIntrinsics) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) # CHECK: float x = rand(); -# CHECK: y[m2, n2, k2] = sqrt(x);)IR"); +# CHECK: y[i, i_1, i_2] = sqrt(x);)IR"); } // Split a Compute then inline it into another compute. TEST(LoopNest, ScheduleSplitAThenInline) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {2}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1410,11 +1408,9 @@ TEST(LoopNest, ScheduleSplitAThenInline) { // Split a Compute then inline another Compute into it. TEST(LoopNest, ScheduleSplitBThenInline) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0); @@ -1434,11 +1430,9 @@ TEST(LoopNest, ScheduleSplitBThenInline) { // Split a Compute twice then inline it. TEST(LoopNest, ScheduleSplitTwiceThenInline) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {2}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) ForPtr i_inner; @@ -1451,11 +1445,9 @@ TEST(LoopNest, ScheduleSplitTwiceThenInline) { // Inline a Compute, then split. 
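That is the recipe the next few Schedule*Inline* tests exercise; condensed into a single hypothetical sketch (assuming the includes already in test_loopnest.cpp, and not part of the patch):

// Illustrative only: inline producer `a` into consumer `b`, then split b's loop.
TEST(LoopNest, SketchInlineThenSplit) {
  Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; });
  Tensor b = Compute(
      "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); });

  LoopNest l({b}, {a, b});    // b is the output, a is an intermediate
  l.computeInline(a.buf());   // substitute a's body into b

  auto loops = l.getAllLoopNestsWritingToBuf(b.buf()).at(0);
  LoopNest::splitWithTail(loops[0], 4);  // 6 iterations = one chunk of 4 plus a tail of 2

  l.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(l.root_stmt());
}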
TEST(LoopNest, ScheduleInlineThenSplit) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); l.computeInline(a.buf()); @@ -1475,11 +1467,9 @@ TEST(LoopNest, ScheduleInlineThenSplit) { // Split a Compute, inline it, then split the result. TEST(LoopNest, ScheduleSplitInlineThenSplit) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{16, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); - }); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {16}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); LoopNest l({b}, {a, b}); auto loops = NodeFinder::find(l.root_stmt()); @@ -1501,12 +1491,11 @@ TEST(LoopNest, ScheduleSplitInlineThenSplit) { // Oversplit a loop that is simplified out after inlining. TEST(LoopNest, ScheduleSplitInlineSimplify) { - Tensor a = Compute("a", {{18, "i"}}, [&](const VarHandle& i) { + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return ExprHandle(4) * i - ExprHandle(2) * i; }); - Tensor b = Compute("b", {{2, "j"}}, [&](const VarHandle& j) { - return a.load(j) - ExprHandle(1); - }); + Tensor b = Compute( + "b", {2}, [&](const VarHandle& j) { return a.load(j) - ExprHandle(1); }); LoopNest l({b}, {a, b}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1516,15 +1505,12 @@ TEST(LoopNest, ScheduleSplitInlineSimplify) { // Inline a Compute with two consumers. TEST(LoopNest, ScheduleInlineThreeMixedOnce) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1545,15 +1531,12 @@ TEST(LoopNest, ScheduleInlineThreeMixedOnce) { // Inline Compute A into B, then inline B into C. 
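The ThreeMixed variants below add a second consumer on top of that setup; a condensed hypothetical sketch of the two-step inline, under the same assumptions as the sketch above:

// Illustrative only: producer a, mid-level tensor b, output c; a feeds both b and c.
TEST(LoopNest, SketchInlineThroughTwoConsumers) {
  Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; });
  Tensor b = Compute(
      "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); });
  Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) {
    return a.load(k) * b.load(l);  // a is consumed by both b and c
  });

  LoopNest l({c}, {a, b, c});
  l.computeInline(a.buf());   // a folds into b and c
  l.computeInline(b.buf());   // then b folds into c
  l.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(l.root_stmt());
}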
TEST(LoopNest, ScheduleInlineThreeMixedTwice) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1575,15 +1558,12 @@ TEST(LoopNest, ScheduleInlineThreeMixedTwice) { // Inline a Compute that is both a producer and consumer. TEST(LoopNest, ScheduleInlineThreeMixedInner) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1604,15 +1584,12 @@ TEST(LoopNest, ScheduleInlineThreeMixedInner) { // Split 3 Computes, then inline the first two into the last. TEST(LoopNest, ScheduleInlineThreeMixedSplit) { - Tensor a = - Compute("a", {{18, "i"}}, [&](const VarHandle& i) { return i * i; }); - Tensor b = Compute("b", {{6, "j"}}, [&](const VarHandle& j) { - return a.load(j + ExprHandle(8)); + Tensor a = Compute("a", {18}, [&](const VarHandle& i) { return i * i; }); + Tensor b = Compute( + "b", {6}, [&](const VarHandle& j) { return a.load(j + ExprHandle(8)); }); + Tensor c = Compute("c", {4, 3}, [&](const VarHandle& k, const VarHandle& l) { + return a.load(k) * b.load(l); }); - Tensor c = Compute( - "c", {{4, "k"}, {3, "l"}}, [&](const VarHandle& k, const VarHandle& l) { - return a.load(k) * b.load(l); - }); LoopNest l({c}, {a, b, c}); std::vector loops = l.getAllLoopNestsWritingToBuf(a.buf()).at(0); @@ -1633,13 +1610,13 @@ TEST(LoopNest, ScheduleInlineOutputTensors) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return m * n * k; }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + m; }); @@ -1653,14 +1630,14 @@ TEST(LoopNest, ScheduleInlineOutputTensors) { // Check the IR we produced checkIR(stmt1, R"IR( -# CHECK: for (int m1 = 0; m1 < 4; m1++) -# CHECK: for (int n1 = 0; n1 < 5; n1++) -# CHECK: for (int k1 = 0; k1 < 6; k1++) -# CHECK: x[m1, n1, k1] = (k1 * m1) * n1; -# CHECK: for (int m2 = 0; m2 < 4; m2++) -# CHECK: for (int n2 = 0; n2 < 5; n2++) -# CHECK: for (int k2 = 0; k2 < 6; k2++) -# CHECK: y[m2, n2, k2] = (k2 * m2) * n2 + m2;)IR"); +# CHECK: for (int i = 0; i < 4; i++) +# CHECK: for (int i_1 = 0; i_1 < 5; i_1++) +# CHECK: for (int i_2 = 0; i_2 < 6; i_2++) +# CHECK: x[i, i_1, i_2] = (i * i_1) * 
i_2; +# CHECK: for (int i_3 = 0; i_3 < 4; i_3++) +# CHECK: for (int i_4 = 0; i_4 < 5; i_4++) +# CHECK: for (int i_5 = 0; i_5 < 6; i_5++) +# CHECK: y[i_3, i_4, i_5] = i_3 + (i_3 * i_4) * i_5;)IR"); } TEST(LoopNest, ScheduleInlineWithCompoundIndices) { @@ -1790,13 +1767,13 @@ TEST(LoopNest, ScheduleFuserStyle) { BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kFloat); - Tensor b = Compute( - "f", {{kTotalSize, "i"}}, [&](const std::vector& axes) { + Tensor b = + Compute("f", {kTotalSize}, [&](const std::vector& axes) { return a_buf.load(axes[0]) + 11.0f; }); - Tensor c = Compute( - "g", {{kTotalSize, "i"}}, [&](const std::vector& axes) { + Tensor c = + Compute("g", {kTotalSize}, [&](const std::vector& axes) { return b.load(axes[0]) + 1.0f; }); @@ -1825,13 +1802,13 @@ TEST(LoopNest, ScheduleFuserThreeArg) { BufHandle c("C", {ExprHandle(kTotalSize)}, kFloat); BufHandle d("D", {ExprHandle(kTotalSize)}, kFloat); - Tensor e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor e = Compute("e", {kTotalSize}, [&](const VarHandle& i) { return a.load(i) + b.load(i); }); - Tensor f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor f = Compute("f", {kTotalSize}, [&](const VarHandle& i) { return e.load(i) + c.load(i); }); - Tensor g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + Tensor g = Compute("g", {kTotalSize}, [&](const VarHandle& i) { return f.load(i) + d.load(i); }); @@ -1859,8 +1836,8 @@ TEST(LoopNest, ScheduleDynamicShape2D) { VarHandle n("n", kInt); BufHandle a("a", {m, n}, kFloat); BufHandle b("b", {m, n}, kFloat); - Tensor c = Compute( - "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor c = + Compute("c", {m, n}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); @@ -1893,10 +1870,9 @@ TEST(LoopNest, LoopNestComputeAt_1) { // should not be inlined into B. Instead, it should be computed into the temp, // and the temp should be used in B. VarHandle N("N", kInt); - Tensor A = Compute( - "A", {{N, "i_a"}}, [&](const VarHandle& i_a) { return i_a * i_a; }); - Tensor B = Compute( - "B", {{N, "i_b"}}, [&](const VarHandle& i_b) { return A.load(i_b); }); + Tensor A = Compute("A", {N}, [&](const VarHandle& i_a) { return i_a * i_a; }); + Tensor B = + Compute("B", {N}, [&](const VarHandle& i_b) { return A.load(i_b); }); LoopNest l({B}, {A, B}); std::vector loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0); LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]); @@ -1906,10 +1882,10 @@ TEST(LoopNest, LoopNestComputeAt_1) { checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1] -# CHECK: for (int i_b = 0; i_b < N; i_b++) +# CHECK: for (int i = 0; i < N; i++) # CHECK: temp[ # CHECK-NOT: A[ -# CHECK: B[i_b] = temp[0] +# CHECK: B[i_1] = temp[0] # CHECK: Free(temp))IR"); // Now check that the loop still produces the correct result. 
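The LoopNestComputeAt hunks that follow all check the same transformation: the producer is computed into a small temp buffer allocated inside the consumer's loop instead of being inlined into its expression. A condensed hypothetical sketch of that call sequence, again assuming the file's existing includes:

// Illustrative only: compute producer A at the consumer loop of B.
TEST(LoopNest, SketchComputeAt) {
  VarHandle N("N", kInt);
  Tensor A = Compute("A", {N}, [&](const VarHandle& i) { return i * i; });
  Tensor B = Compute("B", {N}, [&](const VarHandle& i) { return A.load(i); });

  LoopNest l({B}, {A, B});
  auto loops = l.getAllLoopNestsWritingToBuf(B.buf()).at(0);
  // A's values are now stored to a temp allocated inside B's loop,
  // rather than A being inlined into B's expression.
  LoopNest::computeAt(l.getLoopBodyFor(A), loops[0]);

  l.prepareForCodegen();
  SimpleIREvaluator cg(l.root_stmt(), {B, N});
}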
@@ -1942,13 +1918,11 @@ TEST(LoopNest, LoopNestComputeAt_2) { VarHandle W("W", kInt); VarHandle H("H", kInt); Tensor p = Compute( - "prod", - {{H + 1, "py"}, {W + 1, "px"}}, - [&](const VarHandle& py, const VarHandle& px) { return px * py; }); - Tensor c = Compute( - "cons", - {{H, "cy"}, {W, "cx"}}, - [&](const VarHandle& y, const VarHandle& x) { + "prod", {H + 1, W + 1}, [&](const VarHandle& py, const VarHandle& px) { + return px * py; + }); + Tensor c = + Compute("cons", {H, W}, [&](const VarHandle& y, const VarHandle& x) { return p.load(y, x) + p.load(y + 1, x) + p.load(y, x + 1) + p.load(y + 1, x + 1); }); @@ -1973,10 +1947,10 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, W + 1] -# CHECK: for (int cy = 0; cy < H; cy++) +# CHECK: for (int i_2 = 0; i_2 < H; i_2++) # CHECK: for # CHECK: for -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_3 = 0; i_3 < W; i_3++) # CHECK-NOT: prod[ # CHECK: cons[ # CHECK: Free(temp))IR"); @@ -1999,8 +1973,8 @@ TEST(LoopNest, LoopNestComputeAt_2) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, 2] -# CHECK: for (int cy = 0; cy < H; cy++) -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_2 = 0; i_2 < H; i_2++) +# CHECK: for (int i_3 = 0; i_3 < W; i_3++) # CHECK: for # CHECK: for # CHECK-NOT: prod[ @@ -2029,23 +2003,19 @@ TEST(LoopNest, LoopNestComputeAt_3) { VarHandle W("W", kInt); VarHandle H("H", kInt); Tensor A = Compute( - "A", - {{H + 1, "ay"}, {W + 1, "ax"}}, - [&](const VarHandle& ay, const VarHandle& ax) { return ax * ay; }); + "A", {H + 1, W + 1}, [&](const VarHandle& ay, const VarHandle& ax) { + return ax * ay; + }); Tensor B = Compute( - "B", - {{H + 1, "by"}, {W + 1, "bx"}}, - [&](const VarHandle& by, const VarHandle& bx) { return A.load(by, bx); }); - Tensor C = Compute( - "C", - {{H, "cy"}, {W, "cx"}}, - [&](const VarHandle& cy, const VarHandle& cx) { + "B", {H + 1, W + 1}, [&](const VarHandle& by, const VarHandle& bx) { + return A.load(by, bx); + }); + Tensor C = + Compute("C", {H, W}, [&](const VarHandle& cy, const VarHandle& cx) { return B.load(cy, cx + 1); }); - Tensor D = Compute( - "D", - {{H, "dy"}, {W, "dx"}}, - [&](const VarHandle& dy, const VarHandle& dx) { + Tensor D = + Compute("D", {H, W}, [&](const VarHandle& dy, const VarHandle& dx) { return A.load(dy + 1, dx) + C.load(dy, dx); }); @@ -2069,17 +2039,17 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1, W] -# CHECK: for (int ay = 0; ay < H + 1; ay++) -# CHECK: for (int ax = 0; ax < W + 1; ax++) +# CHECK: for (int i = 0; i < H + 1; i++) +# CHECK: for (int i_1 = 0; i_1 < W + 1; i_1++) # CHECK: A[ -# CHECK: for (int by = 0; by < H + 1; by++) -# CHECK: for (int bx = 0; bx < W + 1; bx++) +# CHECK: for (int i_2 = 0; i_2 < H + 1; i_2++) +# CHECK: for (int i_3 = 0; i_3 < W + 1; i_3++) # CHECK: B[ -# CHECK: for (int cy = 0; cy < H; cy++) -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_4 = 0; i_4 < H; i_4++) +# CHECK: for (int i_5 = 0; i_5 < W; i_5++) # CHECK: C[ -# CHECK: for (int dy = 0; dy < H; dy++) -# CHECK: for (int dx = 0; dx < W; dx++) +# CHECK: for (int i_6 = 0; i_6 < H; i_6++) +# CHECK: for (int i_7 = 0; i_7 < W; i_7++) # CHECK-NOT: A[)IR"); // Now check that the loop still produces the correct result. 
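Reduce gets the matching signature change in the reduction tests a little further down (Reduce2dComputeAt, DISABLED_Conv1d_NH, FlattenReductionLoopNestFromTensor): output extents and reduction extents are now plain lists such as {H, W} and {2, 2}. A minimal hypothetical sketch, assuming the file's existing includes and not part of the patch:

// Illustrative only: row-sum with the dims-only Reduce signature.
TEST(LoopNest, SketchDimsOnlyReduce) {
  const int M = 4;
  const int N = 6;
  BufHandle b("b", {M, N}, kFloat);

  // Output dims {M}, reduction dims {N}; no axis-name strings anywhere.
  Tensor c = Reduce("sum", {M}, Sum(), b, {N});

  LoopNest loop({c});
  loop.prepareForCodegen();
  StmtPtr s = IRSimplifier::simplify(loop.root_stmt());
  SimpleIREvaluator cg(s, {b, c});
}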
@@ -2100,17 +2070,17 @@ TEST(LoopNest, LoopNestComputeAt_3) { // Check the IR we produced checkIR(s, R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[1, 1] -# CHECK: for (int ay = 0; ay < H + 1; ay++) -# CHECK: for (int ax = 0; ax < W + 1; ax++) +# CHECK: for (int i = 0; i < H + 1; i++) +# CHECK: for (int i_1 = 0; i_1 < W + 1; i_1++) # CHECK: A[ -# CHECK: for (int by = 0; by < H + 1; by++) -# CHECK: for (int bx = 0; bx < W + 1; bx++) +# CHECK: for (int i_2 = 0; i_2 < H + 1; i_2++) +# CHECK: for (int i_3 = 0; i_3 < W + 1; i_3++) # CHECK: B[ -# CHECK: for (int cy = 0; cy < H; cy++) -# CHECK: for (int cx = 0; cx < W; cx++) +# CHECK: for (int i_4 = 0; i_4 < H; i_4++) +# CHECK: for (int i_5 = 0; i_5 < W; i_5++) # CHECK: C[ -# CHECK: for (int dy = 0; dy < H; dy++) -# CHECK: for (int dx = 0; dx < W; dx++) +# CHECK: for (int i_6 = 0; i_6 < H; i_6++) +# CHECK: for (int i_7 = 0; i_7 < W; i_7++) # CHECK-NOT: A[)IR"); // Now check that the loop still produces the correct result. @@ -2128,16 +2098,14 @@ TEST(LoopNest, Reduce2dComputeAt) { VarHandle W("W", kInt); VarHandle H("H", kInt); - Tensor p = - Compute("prod", {{H + 1, "py"}, {W + 1, "px"}}, [&](Axis py, Axis px) { - return px * py; - }); + Tensor p = Compute( + "prod", {H + 1, W + 1}, [&](Axis py, Axis px) { return px * py; }); Tensor c = Reduce( "cons", - {{H, "cy"}, {W, "cx"}}, + {H, W}, Sum(), [&](Axis y, Axis x, Axis r, Axis s) { return p.load(y + r, x + s); }, - {{2, "r"}, {2, "s"}}); + {2, 2}); std::vector c_ref(kW * kH, 0); for (int y = 0; y < kH; y++) { @@ -2147,17 +2115,17 @@ TEST(LoopNest, Reduce2dComputeAt) { } LoopNest orig_loopnest({c}, {p, c}); checkIR(orig_loopnest.root_stmt(), R"IR( -# CHECK: for (int py = 0; py < H + 1; py++) { -# CHECK: for (int px = 0; px < W + 1; px++) { -# CHECK: prod[py, px] = px * py; +# CHECK: for (int i = 0; i < H + 1; i++) { +# CHECK: for (int i_1 = 0; i_1 < W + 1; i_1++) { +# CHECK: prod[i, i_1] = i_1 * i; # CHECK: } # CHECK: } -# CHECK: for (int cy = 0; cy < H; cy++) { -# CHECK: for (int cx = 0; cx < W; cx++) { -# CHECK: cons[cy, cx] = int(0); -# CHECK: for (int r = 0; r < 2; r++) { -# CHECK: for (int s = 0; s < 2; s++) { -# CHECK: cons[cy, cx] = ReduceOp((cons[cy, cx]) + (prod[cy + r, cx + s]), reduce_args={r, s}); +# CHECK: for (int i_2 = 0; i_2 < H; i_2++) { +# CHECK: for (int i_3 = 0; i_3 < W; i_3++) { +# CHECK: cons[i_2, i_3] = int(0); +# CHECK: for (int i_4 = 0; i_4 < 2; i_4++) { +# CHECK: for (int i_5 = 0; i_5 < 2; i_5++) { +# CHECK: cons[i_2, i_3] = ReduceOp((cons[i_2, i_3]) + (prod[i_2 + i_4, i_3 + i_5]), reduce_args={i_4, i_5}); # CHECK: } # CHECK: } # CHECK: } @@ -2177,17 +2145,17 @@ TEST(LoopNest, Reduce2dComputeAt) { SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, W + 1] -# CHECK: for (int cy = 0; cy < H; cy++) { +# CHECK: for (int i = 0; i < H; i++) { # CHECK: for (int idx0 = 0; idx0 < 2; idx0++) { # CHECK: for (int idx1 = 0; idx1 < W + 1; idx1++) { -# CHECK: temp[(0 + idx0 * (1 * (W + 1))) + idx1 * 1] = (idx0 + cy) * (idx1 + 0); +# CHECK: temp[(0 + idx0 * (1 * (W + 1))) + idx1 * 1] = (idx0 + i) * (idx1 + 0); # CHECK: } # CHECK: } -# CHECK: for (int cx = 0; cx < W; cx++) { -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = int(0); -# CHECK: for (int r = 0; r < 2; r++) { -# CHECK: for (int s = 0; s < 2; s++) { -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = (cons[(0 + cy * (1 * W)) + cx * 1]) + (temp[(0 + r * (1 * (W + 1))) + (cx + s) * 1]); +# CHECK: for (int i_1 = 0; i_1 < W; i_1++) { +# CHECK: cons[(0 + i * (1 * W)) + 
i_1 * 1] = int(0); +# CHECK: for (int i_2 = 0; i_2 < 2; i_2++) { +# CHECK: for (int i_3 = 0; i_3 < 2; i_3++) { +# CHECK: cons[(0 + i * (1 * W)) + i_1 * 1] = (cons[(0 + i * (1 * W)) + i_1 * 1]) + (temp[(0 + i_2 * (1 * (W + 1))) + (i_1 + i_3) * 1]); # CHECK: } # CHECK: } # CHECK: } @@ -2211,17 +2179,17 @@ TEST(LoopNest, Reduce2dComputeAt) { SimpleIREvaluator cg(l.root_stmt(), {c, W, H}); checkIR(cg.stmt(), R"IR( # CHECK: Allocate(temp); // dtype=int, dims=[2, 2] -# CHECK: for (int cy = 0; cy < H; cy++) { -# CHECK: for (int cx = 0; cx < W; cx++) { +# CHECK: for (int i = 0; i < H; i++) { +# CHECK: for (int i_1 = 0; i_1 < W; i_1++) { # CHECK: for (int idx0 = 0; idx0 < 2; idx0++) { # CHECK: for (int idx1 = 0; idx1 < 2; idx1++) { -# CHECK: temp[(0 + idx0 * (1 * 2)) + idx1 * 1] = (cy + idx0) * (cx + idx1); +# CHECK: temp[(0 + idx0 * (1 * 2)) + idx1 * 1] = (i + idx0) * (i_1 + idx1); # CHECK: } # CHECK: } -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = 0; -# CHECK: for (int r = 0; r < 2; r++) { -# CHECK: for (int s = 0; s < 2; s++) { -# CHECK: cons[(0 + cy * (1 * W)) + cx * 1] = (cons[(0 + cy * (1 * W)) + cx * 1]) + (temp[(0 + r * (1 * 2)) + s * 1]); +# CHECK: cons[(0 + i * (1 * W)) + i_1 * 1] = 0; +# CHECK: for (int i_2 = 0; i_2 < 2; i_2++) { +# CHECK: for (int i_3 = 0; i_3 < 2; i_3++) { +# CHECK: cons[(0 + i * (1 * W)) + i_1 * 1] = (cons[(0 + i * (1 * W)) + i_1 * 1]) + (temp[(0 + i_2 * (1 * 2)) + i_3 * 1]); # CHECK: } # CHECK: } # CHECK: } @@ -2247,18 +2215,17 @@ TEST(LoopNest, DISABLED_Conv1d_NH) { int Pad = 1; BufHandle IP("input", {H}, kFloat); - Tensor A = - Compute("A", {{N, "np"}, {H + 2 * Pad, "hp"}}, [&](Axis n, Axis h) { - auto cond = CompareSelect::make(h, Pad, 1, 0, kLT); - cond = CompareSelect::make(h, H + Pad, 1, cond, kGE); - return ifThenElse(cond, 0.f, IP.load(n, h - Pad)); - }); + Tensor A = Compute("A", {N, H + 2 * Pad}, [&](Axis n, Axis h) { + auto cond = CompareSelect::make(h, Pad, 1, 0, kLT); + cond = CompareSelect::make(h, H + Pad, 1, cond, kGE); + return ifThenElse(cond, 0.f, IP.load(n, h - Pad)); + }); Tensor B = Reduce( "B", - {{N, "n"}, {H, "h"}}, + {N, H}, Sum(), [&](Axis n, Axis h, Axis r) { return A.load(n, h + r); }, - {{R, "r"}}); + {R}); LoopNest l({B}); checkIR(l.root_stmt(), R"IR( # CHECK: for (int np = 0; np < 4; np++) { @@ -2333,12 +2300,12 @@ class LoopOrderHelper : public IRVisitor { }; TEST(LoopNest, LoopNestReorderAxis1) { - Tensor tensor = Compute( - "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {2, 3}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); std::vector stmt1_output(6, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2346,15 +2313,15 @@ TEST(LoopNest, LoopNestReorderAxis1) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); - StmtPtr stmt2 = Stmt::clone(l.root_stmt()); + StmtPtr stmt2 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); ASSERT_NE(stmt1, stmt2); LoopOrderHelper loopOrderHelper; std::string order1 = loopOrderHelper.getOrder(stmt1); std::string order2 = loopOrderHelper.getOrder(stmt2); - ASSERT_EQ(order1, "x,y,"); - ASSERT_EQ(order2, "y,x,"); + ASSERT_EQ(order1, "j,i,"); + ASSERT_EQ(order2, "i,j,"); std::vector stmt2_output(6, 0); SimpleIREvaluator cg2(stmt2, {tensor}); @@ -2383,7 +2350,7 @@ TEST(LoopNest, LoopNestReorderAxis1) { 
TEST(LoopNest, LoopNestReorderPartialAxes) { Tensor tensor = Compute( "f", - {{2, "x"}, {3, "y"}, {4, "z"}}, + {2, 3, 4}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y + cast(z) * z; @@ -2391,8 +2358,8 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { LoopNest l({tensor}); LoopOrderHelper loopOrderHelper; - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); - ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "x,y,z,"); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); + ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "i,j,k,"); std::vector stmt1_output(24, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2400,7 +2367,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[1]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,x,z,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "j,i,k,"); StmtPtr stmt2 = Stmt::clone(l.root_stmt()); @@ -2414,7 +2381,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[1], loops[2]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "y,z,x,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "j,k,i,"); StmtPtr stmt3 = Stmt::clone(l.root_stmt()); @@ -2430,7 +2397,7 @@ TEST(LoopNest, LoopNestReorderPartialAxes) { TEST(LoopNest, LoopNestReorderInternalAxis) { Tensor tensor = Compute( "f", - {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, + {1, 2, 3, 4}, [](const VarHandle& w, const VarHandle& x, const VarHandle& y, @@ -2441,8 +2408,8 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { LoopNest l({tensor}); LoopOrderHelper loopOrderHelper; - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); - ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "w,x,y,z,"); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); + ASSERT_EQ(loopOrderHelper.getOrder(stmt1), "i,j,k,l,"); std::vector stmt1_output(24, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2450,7 +2417,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[2], loops[1]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "w,y,x,z,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "i,k,j,l,"); StmtPtr stmt2 = l.root_stmt(); @@ -2466,7 +2433,7 @@ TEST(LoopNest, LoopNestReorderInternalAxis) { TEST(LoopNest, LoopNestReorderEnclosingAxis) { Tensor tensor = Compute( "f", - {{1, "w"}, {2, "x"}, {3, "y"}, {4, "z"}}, + {1, 2, 3, 4}, [](const VarHandle& w, const VarHandle& x, const VarHandle& y, @@ -2477,7 +2444,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { LoopNest l({tensor}); LoopOrderHelper loopOrderHelper; - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); std::vector stmt1_output(24, 0); SimpleIREvaluator cg(stmt1, {tensor}); @@ -2485,7 +2452,7 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { auto loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[3]); - ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "z,x,y,w,"); + ASSERT_EQ(loopOrderHelper.getOrder(l.root_stmt()), "l,j,k,i,"); StmtPtr stmt2 = l.root_stmt(); @@ -2499,8 +2466,8 @@ TEST(LoopNest, LoopNestReorderEnclosingAxis) { } TEST(LoopNest, LoopNestReorderSameAxis) { - Tensor tensor = Compute( - "f", {{2, "x"}, {3, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + 
Compute("f", {2, 3}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -2518,18 +2485,18 @@ TEST(LoopNest, LoopNestReorderSameAxis) { TEST(LoopNest, LoopNestReorderExtraStatements) { /* We're going for a structure like this: - * for x in ... + * for i in ... * Stmt 1 - * for y in ... + * for j in ... * Stmt 2 - * for z in ... + * for k in ... * Stmt 3 * Stmt 4 */ Tensor tensor = Compute( "f", - {{2, "x"}, {3, "y"}, {4, "z"}}, + {2, 3, 4}, [](const VarHandle& x, const VarHandle& y, const VarHandle& z) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y + cast(z) * z; @@ -2542,15 +2509,15 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { VarHandle i = VarHandle(loops[0]->var()); - StmtPtr store_1 = Store::make(extra, {i, 0}, ExprHandle(1.f)); - StmtPtr store_2 = Store::make(extra, {i, 1}, ExprHandle(2.f)); + StmtPtr store_1 = Store::make(extra, {i, 0}, 1.f); + StmtPtr store_2 = Store::make(extra, {i, 1}, 2.f); // stmt 3 is the Function body. - StmtPtr store_3 = Store::make(extra, {i, 2}, ExprHandle(4.f)); + StmtPtr store_3 = Store::make(extra, {i, 2}, 4.f); loops[0]->body()->prepend_stmt(store_1); loops[1]->body()->prepend_stmt(store_2); loops[1]->body()->append_stmt(store_3); - StmtPtr stmt1 = Stmt::clone(l.root_stmt()); + StmtPtr stmt1 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); std::vector extra1(6, 0); std::vector res1(24, 0); @@ -2559,14 +2526,14 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { /* Then we reorder loop y and z, we want it to look like: * - * for x in ... + * for i in ... * Stmt 1 - * for y in ... + * for j in ... * Stmt 2 - * for z in ... - * for y in ... + * for j_1 in ... + * for k in ... * Stmt 3 - * for y in ... + * for j_2 in ... * Stmt 4 * * We need extra loops because we don't have dependency info about stmt 3 @@ -2575,19 +2542,19 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { */ LoopNest::reorderAxis(loops[1], loops[2]); - StmtPtr stmt2 = Stmt::clone(l.root_stmt()); + StmtPtr stmt2 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); // Check the IR we produced checkIR(stmt2, R"IR( -# CHECK: for (int x -# CHECK: res[x, 0] = 1 -# CHECK: for (int y -# CHECK: res[x, 1] = 2 -# CHECK: for (int z -# CHECK: for (int y +# CHECK: for +# CHECK: res[i, 0] = 1 +# CHECK: for +# CHECK: res[i, 1] = 2 +# CHECK: for +# CHECK: for # CHECK: f[ -# CHECK: for (int y -# CHECK: res[x, 2] = 4 +# CHECK: for +# CHECK: res[i, 2] = 4 )IR"); std::vector extra2(6, 0); @@ -2623,21 +2590,21 @@ TEST(LoopNest, LoopNestReorderExtraStatements) { */ loops = l.getAllLoopNestsWritingToBuf(tensor.buf()).at(0); LoopNest::reorderAxis(loops[0], loops[2]); - StmtPtr stmt3 = Stmt::clone(l.root_stmt()); + StmtPtr stmt3 = LoopNest::sanitizeNames(Stmt::clone(l.root_stmt())); // Check the IR we produced checkIR(stmt3, R"IR( -# CHECK: for (int x -# CHECK: res[x, 0] = 1 -# CHECK: for (int y -# CHECK: res[x, 1] = 2 -# CHECK: for (int y -# CHECK: for (int z -# CHECK: for (int x +# CHECK: for +# CHECK: res[i, 0] = 1 +# CHECK: for +# CHECK: res[i, 1] = 2 +# CHECK: for +# CHECK: for +# CHECK: for # CHECK: f[ -# CHECK: for (int x -# CHECK: for (int y -# CHECK: res[x, 2] = 4 +# CHECK: for +# CHECK: for +# CHECK: res[i_2, 2] = 4 )IR"); std::vector extra3(6, 0); @@ -2659,9 +2626,7 @@ void LoopNestReorderTestHelper( int index1, int index2) { Tensor c = Compute( - "5d", - {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, - [](const std::vector&) { return -1; }); + "5d", {2, 3, 2, 3, 2}, [](const std::vector&) { return -1; 
}); LoopNest l({c}); BufHandle extra("extra", {5}, kInt); @@ -2783,34 +2748,26 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor y = Compute( "y", - {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c_buf.load(m, n) * d_buf.load(m, k) + x.load(m, n, k); }); Tensor z = Compute( "z", - {{M, "m3"}, {N, "n3"}, {K, "k3"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return x.load(m, n, k) + y.load(m, n, k); }); LoopNest l({z}, {x, y, z}); - ForPtr a = nullptr; - ForPtr b = nullptr; - auto fors = NodeFinder::find(l.root_stmt()); - for (auto f : fors) { - if (f->var()->name_hint() == "m2") { - a = f; - } else if (f->var()->name_hint() == "k2") { - b = f; - } - } + ForPtr a = l.getAllLoopNestsWritingToBuf(y.buf())[0][2]; + ForPtr b = l.getAllLoopNestsWritingToBuf(y.buf())[0][0]; LoopNest::reorderAxis(a, b); l.prepareForCodegen(); @@ -2819,15 +2776,15 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { // Check the IR we produced has the 3 nests in the right order, but k and m // swapped in the middle. checkIR(stmt, R"IR( -# CHECK: for (int m1 -# CHECK: for (int n1 -# CHECK: for (int k1 -# CHECK: for (int k2 -# CHECK: for (int n2 -# CHECK: for (int m2 -# CHECK: for (int m3 -# CHECK: for (int n3 -# CHECK: for (int k3)IR"); +# CHECK: < 4 +# CHECK: < 5 +# CHECK: < 6 +# CHECK: < 6 +# CHECK: < 5 +# CHECK: < 4 +# CHECK: < 4 +# CHECK: < 5 +# CHECK: < 6)IR"); { PaddedBuffer a_v(M, N); @@ -2873,8 +2830,8 @@ TEST(LoopNest, LoopNestReorderInternalLoopNest) { } TEST(LoopNest, OuterLoopVectorization) { - Tensor tensor = Compute( - "f", {{8, "X"}, {8, "y"}}, [](const VarHandle& x, const VarHandle& y) { + Tensor tensor = + Compute("f", {8, 8}, [](const VarHandle& x, const VarHandle& y) { return ExprHandle(1.0f) + cast(x) * x + cast(y) * y; }); LoopNest l({tensor}); @@ -2924,8 +2881,8 @@ namespace { std::string constantUpperBoundLoopIR(int upper_bound_val) { ExprHandle upper_bound(upper_bound_val); - Tensor A = Compute( - "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); + Tensor A = + Compute("A", {upper_bound}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; @@ -2953,21 +2910,21 @@ TEST(LoopNest, UnrollOuter) { ExprHandle inner_bound(4); Tensor A = Compute( "A", - {{outer_bound, "x"}, {inner_bound, "y"}}, + {outer_bound, inner_bound}, [&](const VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; LoopNest::fullUnroll(loops[0], &unrolled); checkIR(unrolled, R"IR( -# CHECK: for (int y = 0; y < 4; y++) { -# CHECK: A[0, y] = y; +# CHECK: for (int i = 0; i < 4; i++) { +# CHECK: A[0, i] = i; # CHECK: } -# CHECK: for (int y = 0; y < 4; y++) { -# CHECK: A[1, y] = y + 1; +# CHECK: for (int i = 0; i < 4; i++) { +# CHECK: A[1, i] = i + 1; # CHECK: } -# CHECK: for (int y = 0; y < 4; y++) { -# CHECK: A[2, y] = y + 2; +# CHECK: for (int i = 0; i < 4; i++) { +# CHECK: A[2, i] = i + 2; # CHECK: })IR"); } @@ -2976,7 +2933,7 @@ TEST(LoopNest, UnrollInner) { ExprHandle inner_bound(4); Tensor A = Compute( "A", - {{outer_bound, "x"}, {inner_bound, "y"}}, + {outer_bound, inner_bound}, [&](const 
VarHandle& x, const VarHandle& y) { return x + y; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; @@ -2984,11 +2941,11 @@ TEST(LoopNest, UnrollInner) { LoopNest::fullUnroll( static_to(loops[0]->body()->stmts().front()), &unrolled); checkIR(loops[0], R"IR( -# CHECK: for (int x = 0; x < 3; x++) { -# CHECK: A[x, 0] = x; -# CHECK: A[x, 1] = x + 1; -# CHECK: A[x, 2] = x + 2; -# CHECK: A[x, 3] = x + 3; +# CHECK: for (int i = 0; i < 3; i++) { +# CHECK: A[i, 0] = i; +# CHECK: A[i, 1] = i + 1; +# CHECK: A[i, 2] = i + 2; +# CHECK: A[i, 3] = i + 3; # CHECK: })IR"); } @@ -3174,8 +3131,8 @@ TEST(LoopNest, UnrollEmpty) { TEST(LoopNest, NoUnroll) { VarHandle upper_bound("N", kInt); - Tensor A = Compute( - "A", {{upper_bound, "x"}}, [&](const VarHandle& x) { return x * 2; }); + Tensor A = + Compute("A", {upper_bound}, [&](const VarHandle& x) { return x * 2; }); LoopNest l({A}); std::vector loops = l.getAllLoopNestsWritingToBuf(A.buf())[0]; StmtPtr unrolled = nullptr; @@ -3439,8 +3396,7 @@ TEST(LoopNest, NormalizeAndSplitWithTail) { // Create a dummy tensor to construct LoopNest. ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // Input IR: @@ -3486,8 +3442,7 @@ TEST(LoopNest, NotNormalizeAndSplitWithTail) { // Create a dummy tensor to construct LoopNest. ExprHandle n(100); BufHandle a("a", {n}, kFloat); - Tensor b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); + Tensor b = Compute("b", {n}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // Input IR: @@ -3760,7 +3715,7 @@ TEST(LoopNest, FlattenReductionLoopNestFromTensor) { VarHandle m("m", kInt); VarHandle n("n", kInt); BufHandle b("b", {m, n}, kFloat); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N}); LoopNest loop({c}); HashProvider hasher; auto hash_before = hasher.hash(loop.root_stmt()); @@ -3815,28 +3770,26 @@ TEST(LoopNest, DetectInlineRankMismatch) { const int kTotalSize = 8; BufHandle a_buf("A", {ExprHandle(kTotalSize)}, kFloat); - Tensor a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return a_buf.load(i); - }); + Tensor a = Compute( + "a", {kTotalSize}, [&](const VarHandle& i) { return a_buf.load(i); }); Tensor reshape = Compute( "reshape", - {{kTotalSize / 2, "i"}, {2, "j"}}, + {kTotalSize / 2, 2}, [&](const VarHandle& i, const VarHandle& j) { return a.load(i, j); }); LoopNest l({reshape}, {a, reshape}); ASSERT_FALSE(l.computeInline(l.getLoopBodyFor(a))); } TEST(LoopNest, CacheReadsSimple) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 3); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -3845,7 +3798,8 @@ TEST(LoopNest, CacheReadsSimple) { LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); - StmtPtr 
result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); @@ -3894,16 +3848,15 @@ TEST(LoopNest, CacheReadsSimple) { } TEST(LoopNest, CacheReadsOuter) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -3912,7 +3865,8 @@ TEST(LoopNest, CacheReadsOuter) { LoopNest::cacheAccesses(A.buf(), "A_local", i_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); @@ -3941,16 +3895,15 @@ TEST(LoopNest, CacheReadsOuter) { } TEST(LoopNest, CacheReadsInternal) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -3958,13 +3911,14 @@ TEST(LoopNest, CacheReadsInternal) { StmtPtr j_loop = l.getAllLoopNestsWritingToBuf(B.buf())[0][1]; LoopNest::cacheAccesses(A.buf(), "A_local", j_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[2, 11] -#CHECK: A_local[j_1 + 11 * i_2] = +#CHECK: A_local[k + 11 * j_1] = #CHECK: B[j_2 + 10 * i_1] = (A_local[j_2 + 12]) + (A_local[j_2]); )IR"); @@ -3987,17 +3941,16 @@ TEST(LoopNest, CacheReadsInternal) { } TEST(LoopNest, CacheReadsInner) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); // note im changing the offset of the first arg of the first call to A. 
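// (Shifting that offset changes how much of A a single iteration touches: with
// the loads A.load(i + 34, j + 40) and A.load(i + 30, j + 41) below, one
// (i, j) iteration of B spans 5 rows and 2 columns of A, so the cacheAccesses
// call on B's body allocates A_local with dims=[5, 2], versus the dims=[2, 11]
// cache in CacheReadsInternal, where the whole inner j loop of B is cached.)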
- Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 34, j + 40) + A.load(i + 30, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -4005,13 +3958,14 @@ TEST(LoopNest, CacheReadsInner) { StmtPtr body = l.getLoopBodyFor(B); LoopNest::cacheAccesses(A.buf(), "A_local", body); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); checkIR(result, R"IR( #CHECK: Allocate(A_local); // dtype=int, dims=[5, 2] -#CHECK: A_local[j_2 + 2 * i_2] = +#CHECK: A_local[l + 2 * k] = #CHECK: B[j_1 + 10 * i_1] = (A_local[1]) + (A_local[8]); )IR"); @@ -4034,16 +3988,15 @@ TEST(LoopNest, CacheReadsInner) { } TEST(LoopNest, CacheWritesSimple) { - Tensor A = Compute( - "A", {{64, "i"}, {64, "j"}}, [](const VarHandle& i, const VarHandle& j) { - return i * j; - }); - Tensor B = Compute( - "B", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor A = Compute("A", {64, 64}, [](const VarHandle& i, const VarHandle& j) { + return i * j; + }); + Tensor B = + Compute("B", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 30, j + 40) + A.load(i + 31, j + 41); }); - Tensor C = Compute( - "C", {{20, "i"}, {10, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + Tensor C = + Compute("C", {20, 10}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i + 10, j + 20) + A.load(i + 30, j + 40); }); @@ -4052,7 +4005,8 @@ TEST(LoopNest, CacheWritesSimple) { LoopNest::cacheAccesses(A.buf(), "A_local", a_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {B, C}); result = cg.stmt(); @@ -4212,13 +4166,13 @@ TEST(LoopNest, InlineConstantIndex) { BufHandle x_buf("a", {1, N, 1}, kFloat); Tensor y = Compute( "f", - {{1, "m"}, {N, "n"}, {1, "o"}}, + {1, N, 1}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { return x_buf.load(m, n, o); }); Tensor z = Compute( "f", - {{1, "m"}, {N, "n"}, {1, "o"}}, + {1, N, 1}, [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& o) { return y.load(m, n, o); }); @@ -4244,10 +4198,9 @@ TEST(LoopNest, CompoundTensorUsed) { BlockPtr body = Block::make({outer_for1, outer_for2}); Tensor A = Tensor(a_buf.node(), body); - Tensor B = Compute( - "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return A.load(i, j + 1) + A.load(i, j + 2); - }); + Tensor B = Compute("B", {10, 3}, [&](const VarHandle& i, const VarHandle& j) { + return A.load(i, j + 1) + A.load(i, j + 2); + }); LoopNest l({B}, {A, B}); ASSERT_FALSE(l.computeInline(A.buf())); @@ -4485,7 +4438,7 @@ TEST(LoopNest, OptimizeConditionalsMultipleStoresInOneLoop) { R"IR( # CHECK: for (int i = 0; i < 5 # CHECK-NEXT: A[i] = B[i] -# CHECK-NEXT: B[i] = IfThenElse(i<30 ? 1 : 0, C[i], D[i]) +# CHECK-NEXT: B[i] = C[i] # CHECK: for (int i = 0; i < 45 # CHECK-NEXT: A[i + 5] = C[i] # CHECK-NEXT: B[i + 5] = IfThenElse(i + 5<30 ? 
1 : 0, C[i + 5], D[i + 5]) @@ -4813,11 +4766,11 @@ static std::pair colReduce(int M, int N) { BufHandle a("a", {M, N}, kFloat); Tensor t = Reduce( "b", - {{N, "n"}}, + {N}, Sum(), [&](const VarHandle& n, const VarHandle& m) { return a.load(m, n); }, - {{M, "m"}}); - return {a, t}; + {M}); + return {a, Tensor(t.buf(), LoopNest::sanitizeNames(t.stmt()))}; } static StmtPtr splitTailReorder(Tensor b) { @@ -4827,23 +4780,23 @@ static StmtPtr splitTailReorder(Tensor b) { nest.splitWithTail(loops[0], kVectorWidth); // Now the loopnests will look like: // - // for (int n_outer = 0; ... - // for (int n_inner = 0; ... - // b[n_outer * 8 + n_inner] = float(0); - // for (int m = 0; ... - // b[n_outer * 8 + n_inner] = ReduceOp(...); + // for (int i_outer = 0; ... + // for (int i_inner = 0; ... + // b[i_outer * 8 + i_inner] = float(0); + // for (int j = 0; ... + // b[i_outer * 8 + i_inner] = ReduceOp(...); // - // for (int n_tail = 0; ... - // b[n_tail + ((100 - 0) / 8) * 8] = float(0); - // for (int m = 0; ... - // b[n_tail + ((100 - 0) / 8) * 8] = ReduceOp(...); + // for (int i_tail = 0; ... + // b[i_tail + ((100 - 0) / 8) * 8] = float(0); + // for (int j = 0; ... + // b[i_tail + ((100 - 0) / 8) * 8] = ReduceOp(...); // // Since there are 4 writes to b, we will get 4 loopnests from the // call to `getAllLoopNestsWritingToBuf` below. // - // Write #2: "b[n_outer * 8 + n_inner] = ReduceOp(...)" - // Loopnest #2: {n_outer, n_inner, m}; - // We will have to reorder n_inner and m. + // Write #2: "b[i_outer * 8 + i_inner] = ReduceOp(...)" + // Loopnest #2: {i_outer, i_inner, j}; + // We will have to reorder i_inner and j. auto loopnests = nest.getAllLoopNestsWritingToBuf(b.buf()); LoopNest::reorderAxis(loopnests[1][1], loopnests[1][2]); nest.prepareForCodegen(); @@ -4891,11 +4844,11 @@ TEST(LoopNest, ColReduceSplitTailEvenReorder) { oss << *s; const std::string& verification_pattern = R"IR( -# CHECK: for (int n_outer -# CHECK-NEXT: for (int n_inner +# CHECK: for (int i_outer +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ -# CHECK: for (int m -# CHECK-NEXT: for (int n_inner +# CHECK: for (int j +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ # CHECK-NOT: for ( )IR"; @@ -4913,15 +4866,15 @@ TEST(LoopNest, ColReduceSplitTailUnevenReorder) { oss << *s; const std::string& verification_pattern = R"IR( -# CHECK: for (int n_outer -# CHECK-NEXT: for (int n_inner +# CHECK: for (int i_outer +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ -# CHECK: for (int m -# CHECK-NEXT: for (int n_inner +# CHECK: for (int j +# CHECK-NEXT: for (int i_inner # CHECK-NEXT: b[ -# CHECK: for (int n_tail +# CHECK: for (int i_tail # CHECK-NEXT: b[ -# CHECK-NEXT: for (int m +# CHECK-NEXT: for (int j # CHECK-NEXT: b[ )IR"; torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); @@ -4985,10 +4938,10 @@ TEST(LoopNest, ReorderAxisWithMultipleConds) { TEST(LoopNest, VectorizeUse) { constexpr int N = 8; BufHandle a("a", {N}, kFloat); - Tensor b = Compute( - "b", {{N, "n"}}, [&](const VarHandle& n) { return a.load(n) + 1.0f; }); - Tensor c = Compute( - "c", {{N, "n"}}, [&](const VarHandle& n) { return b.load(n) + 2.0f; }); + Tensor b = + Compute("b", {N}, [&](const VarHandle& n) { return a.load(n) + 1.0f; }); + Tensor c = + Compute("c", {N}, [&](const VarHandle& n) { return b.load(n) + 2.0f; }); LoopNest nest({c}, {b, c}); auto loops = nest.getAllLoopNestsWritingToBuf(b.buf())[0]; ASSERT_TRUE(LoopNest::vectorize(loops[0])); @@ -5007,8 +4960,8 @@ TEST(LoopNest, VectorizeUse) { } const char* int64Loop = R"IR( -# CHECK: for 
(int64_t n = 0ll; n < 12ll; n++) { -# CHECK: b[n] = (a[n]) + 1ll; +# CHECK: for (int64_t i = 0ll; i < 12ll; i++) { +# CHECK: b[i] = (a[i]) + 1ll; # CHECK: } )IR"; @@ -5016,7 +4969,7 @@ TEST(LoopNest, Int64Direct) { constexpr int64_t N = 12; BufHandle a("a", {N}, kLong); BufHandle b("b", {N}, kLong); - VarHandle n("n", kLong); + VarHandle n("i", kLong); StmtPtr s = For::make( n, LongImm::make(0l), N, b.store({n}, a.load({n}) + LongImm::make(1l))); s = IRSimplifier::simplify(s); @@ -5028,7 +4981,7 @@ TEST(LoopNest, Int64Direct) { TEST(LoopNest, Int64Compute) { constexpr int64_t N = 12; BufHandle a("a", {N}, kLong); - Tensor b = Compute("b", {{N, "n"}}, [&](const VarHandle& n) { + Tensor b = Compute("b", {N}, [&](const VarHandle& n) { return a.load(n) + LongImm::make(1l); }); LoopNest nest({b}); @@ -6935,15 +6888,15 @@ TEST(LoopNest, compressMultipleBuffers) { } TEST(LoopNest, sanitizeNames) { - std::vector dim_args; + std::vector dim_args; // Let's pick names that would overlap with default index names if not // sanitized properly: - dim_args.emplace_back(ExprHandle(alloc("i", kInt)), ""); - dim_args.emplace_back(ExprHandle(alloc("N:2", kInt)), ""); + dim_args.emplace_back(ExprHandle(alloc("i", kInt))); + dim_args.emplace_back(ExprHandle(alloc("N:2", kInt))); // Now let's create a many dimensions so that we had to use the same letter // for different loops for (int i = 0; i < 10; i++) { - dim_args.emplace_back(ExprHandle(alloc("N", kInt)), ""); + dim_args.emplace_back(ExprHandle(alloc("N", kInt))); } // Now create two Computes with conflicting after sanitization names: diff --git a/test/cpp/tensorexpr/test_memdependency.cpp b/test/cpp/tensorexpr/test_memdependency.cpp index 7019353937b7..03ea24a87afd 100644 --- a/test/cpp/tensorexpr/test_memdependency.cpp +++ b/test/cpp/tensorexpr/test_memdependency.cpp @@ -76,6 +76,134 @@ TEST(MemDependency, BoundOverlap) { ASSERT_EQ(ContainedOrEqual, boundOverlap(CB(15, 15), CB(2, 15))); } +TEST(MemDependency, BoundComparison) { + using namespace analysis; + + auto CB = [](int s, int e) { + return Bound(alloc(s), alloc(e)); + }; + + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 10), CB(10, 10), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(20, 30), CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kEQ)); + + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 10), CB(10, 10), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kNE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(20, 30), 
CompareSelectOperation::kEQ)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kNE)); + + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kLT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kLT)); + + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kGE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kGE)); + + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kGT)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kGT)); + + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(10, 20), CB(30, 40), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::TRUE, + compareBound(CB(30, 40), CB(40, 50), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(10, 100), CB(10, 100), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::FALSE, + compareBound(CB(30, 40), CB(10, 20), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 40), CB(10, 30), CompareSelectOperation::kLE)); + ASSERT_EQ( + CmpEvalResult::NOT_DETERMINED, + compareBound(CB(30, 45), CB(40, 50), CompareSelectOperation::kLE)); +} + TEST(MemDependency, BoundOverlapSymbolic) { VarHandle x("x", kInt); VarHandle y("y", kInt); @@ -274,7 +402,7 @@ TEST(MemDependency, BoundSubtractMultiDim) { if (x.size() != y.size()) { return false; } - for (auto i = 0; i < x.size(); ++i) { + for (auto i = 0U; i < x.size(); ++i) { if (!indexBoundsEquals(x[i], y[i])) { return false; } @@ -338,7 +466,7 @@ TEST(MemDependency, BoundSubtractMultiDimSymbolic) { if (x.size() != y.size()) { return false; } - for (auto i = 0; i < x.size(); ++i) { + for (auto i = 0U; i < x.size(); ++i) { if (!indexBoundsEquals(x[i], y[i])) { return false; } @@ -543,8 +671,7 @@ TEST(MemDependency, MemDependencyCheckerLoopReduce) { */ StorePtr aInit = Store::make(a, {0}, 0); - 
ExprHandle reduce = - ExprHandle(Sum()(a.node(), ExprHandle(1), {x.node()}, {x.node()})); + ExprHandle reduce = Sum()(a, 1, {x}, {x}); StorePtr aReduce = Store::make(a, {0}, reduce); StmtPtr loop = For::make(x, 0, 10, aReduce); StorePtr bStore = Store::make(b, {0}, Load::make(a, {0})); @@ -2697,13 +2824,13 @@ TEST(MemDependency, MemDependencyCheckerComputeAPI) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor d = Compute( "d", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c.load(m, n, k) + 1; }); @@ -2742,13 +2869,13 @@ TEST(MemDependency, MemDependencyCheckerComputeInline) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor d = Compute( "d", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return c.load(m, n, k) + 1; }); @@ -2777,7 +2904,7 @@ TEST(MemDependency, MemDependencyCheckerComputeSplit) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); @@ -2823,7 +2950,7 @@ TEST(MemDependency, MemDependencyCheckerComputeReorder) { BufHandle b_buf("b", {5, 6}, kFloat); Tensor c = Compute( "broadcast_add", - {{4, "m"}, {5, "n"}, {6, "k"}}, + {4, 5, 6}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n) + b_buf.load(n, k); }); @@ -2889,11 +3016,11 @@ TEST(MemDependency, MemDependencyCheckerComputeReduce) { Tensor c = Compute( "scale", - {{2, "l2"}, {3, "n1"}, {6, "m1"}}, + {2, 3, 6}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {6, "m1"}}); + Tensor d = Reduce("sum", {2}, Sum(), c, {3, 6}); LoopNest l({d}, {c, d}); MemDependencyChecker analyzer({a.node(), b.node()}, {d.buf()}); @@ -2925,12 +3052,12 @@ TEST(MemDependency, MemDependencyCheckerComputeGEMM) { BufHandle BP("B", {K, N}, kFloat); Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); + {K}); LoopNest loop({CT}); { diff --git a/test/cpp/tensorexpr/test_memplanning.cpp b/test/cpp/tensorexpr/test_memplanning.cpp index ec58aa8f6668..f5ee8747650f 100644 --- a/test/cpp/tensorexpr/test_memplanning.cpp +++ b/test/cpp/tensorexpr/test_memplanning.cpp @@ -1,6 +1,8 @@ #include #include +#include +#include #include #include #include @@ -85,6 +87,232 @@ TEST(BufLiveRange, MulRangeLine) { ASSERT_TRUE(std::get<1>(range_b) == 1); } +TEST(MemPlanning, MemReuseWithTypeCast) { + int M = 4; + int N = 4; + int K = 4; + + BufHandle AP("A", {M, K}, kFloat); + BufHandle BP("B", {K, N}, kFloat); + + Tensor CT = Reduce( + "gemm", + {M, N}, + Sum(), + [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { + return AP.load(m, k) * BP.load(k, n); + }, + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return 
CompareSelect::make( + CT.load(m, n), 0.0f, 0.0f, CT.load(m, n), kLT); + }); + Tensor ET = + Compute("E", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return Cast::make(kQUInt8, DT.load(m, n) + DT.load(m, n)); + }); + Tensor FT = + Compute("F", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n); + }); + StmtPtr stmt = + tensorexpr::Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // E [2, 3]. The dimensions of 'gemm' and 'E' are the same but their types are + // different: 'E' type quint8 < 'gemm' type float. We'll reuse 'gemm' for 'E' + // with typecasting. + //{ + // for (int i = 0; i < 4; i++) { + // for (int i_1 = 0; i_1 < 4; i_1++) { + // gemm[i, i_1] = float(0); + // for (int i_2 = 0; i_2 < 4; i_2++) { + // gemm[i, i_1] = ReduceOp((gemm[i, i_1]) + (A[i, i_2]) * (B[i_2, + // i_1]), reduce_args={i_2}); + // } + // } + // } + // for (int i_3 = 0; i_3 < 4; i_3++) { + // for (int i_4 = 0; i_4 < 4; i_4++) { + // relu[i_3, i_4] = (gemm[i_3, i_4])<0.f ? 0.f : (gemm[i_3, i_4]); + // } + // } + // for (int i_5 = 0; i_5 < 4; i_5++) { + // for (int i_6 = 0; i_6 < 4; i_6++) { + // E[i_5, i_6] = quint8((relu[i_5, i_6]) + (relu[i_5, i_6])); + // } + // } + // for (int i_7 = 0; i_7 < 4; i_7++) { + // for (int i_8 = 0; i_8 < 4; i_8++) { + // F[i_7, i_8] = E[i_7, i_8]; + // } + // } + //} + + LoopNest l(stmt, {FT.buf()}); + l.prepareForCodegen(); + SimpleIREvaluator cg(Stmt::clone(l.root_stmt()), {AP, BP, FT}); + + checkIR(cg.stmt(), R"IR( +# CHECK: Allocate(gemm); // dtype=float, dims=[4, 4] +# CHECK: Allocate(relu); // dtype=float, dims=[4, 4] +# CHECK: Alias(E,gemm); +# CHECK: Free(relu); +# CHECK: Free(gemm))IR"); + + PaddedBuffer a_v(M, K, "a"); + PaddedBuffer b_v(K, N, "b"); + PaddedBuffer o1(M, N, "e_before"); + PaddedBuffer o2(M, N, "e_after"); + + for (const auto m : c10::irange(M)) { + for (const auto k : c10::irange(K)) { + a_v(m, k) = at::randn({1}).item().to(); + } + } + + for (const auto k : c10::irange(K)) { + for (const auto n : c10::irange(N)) { + b_v(k, n) = at::randn({1}).item().to(); + } + } + + cg.call({a_v, b_v, o1}); + +#ifdef TORCH_ENABLE_LLVM + LLVMCodeGen cg_llvm(Stmt::clone(l.root_stmt()), {AP, BP, FT}); + + checkIR(cg_llvm.stmt(), R"IR( +# CHECK: Allocate(gemm); // dtype=float, dims=[4, 4] +# CHECK: Allocate(relu); // dtype=float, dims=[4, 4] +# CHECK: Alias(E,gemm); +# CHECK: Free(relu); +# CHECK: Free(gemm))IR"); + + cg_llvm.call({a_v, b_v, o2}); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + ExpectAllNear(o1, o2, 1e-5); +#endif +} + +TEST(MemPlanning, NoMemReuseForLargerType) { + int M = 4; + int N = 4; + int K = 4; + + BufHandle AP("A", {M, K}, kShort); + BufHandle BP("B", {K, N}, kShort); + + Tensor CT = Reduce( + "gemm", + {M, N}, + Sum(), + [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { + return AP.load(m, k) * BP.load(k, n); + }, + {K}); + auto zero = Cast::make(CT.buf()->dtype(), 0); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return CompareSelect::make( + CT.load(m, n), zero, zero, CT.load(m, n), kLT); + }); + Tensor ET = + Compute("E", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return Cast::make(kFloat, DT.load(m, n) + DT.load(m, n)); + }); + Tensor FT = + Compute("F", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { + return ET.load(m, n); + }); + StmtPtr stmt = + tensorexpr::Block::make({CT.stmt(), 
DT.stmt(), ET.stmt(), FT.stmt()}); + + // Constructed stmt: + // Intermediate buffers and their liveness ranges: gemm [0, 1], relu [1, 2], + // E [2, 3]. The dimensions of 'gemm' and 'E' are the same but their types are + // different: 'E' type float > 'gemm' type int16. We won't reuse 'gemm' for + // 'E'. + //{ + // for (int i = 0; i < 4; i++) { + // for (int i_1 = 0; i_1 < 4; i_1++) { + // gemm[i, i_1] = int16_t(0); + // for (int i_2 = 0; i_2 < 4; i_2++) { + // gemm[i, i_1] = ReduceOp((gemm[i, i_1]) + (A[i, i_2]) * (B[i_2, + // i_1]), reduce_args={i_2}); + // } + // } + // } + // for (int i_3 = 0; i_3 < 4; i_3++) { + // for (int i_4 = 0; i_4 < 4; i_4++) { + // relu[i_3, i_4] = (gemm[i_3, i_4]) a_v(M, K, "a"); + PaddedBuffer b_v(K, N, "b"); + PaddedBuffer o1(M, N, "e_before"); + PaddedBuffer o2(M, N, "e_after"); + + for (const auto m : c10::irange(M)) { + for (const auto k : c10::irange(K)) { + a_v(m, k) = at::randn({1}).item().to(); + } + } + + for (const auto k : c10::irange(K)) { + for (const auto n : c10::irange(N)) { + b_v(k, n) = at::randn({1}).item().to(); + } + } + + cg.call({a_v, b_v, o1}); + +#ifdef TORCH_ENABLE_LLVM + LLVMCodeGen cg_llvm(Stmt::clone(l.root_stmt()), {AP, BP, FT}); + + checkIR(cg_llvm.stmt(), R"IR( +# CHECK: Allocate(gemm); // dtype=int16_t, dims=[4, 4] +# CHECK: Allocate(relu); // dtype=int16_t, dims=[4, 4] +# CHECK: Allocate(E); // dtype=float, dims=[4, 4] +# CHECK: Free(E); +# CHECK: Free(relu); +# CHECK: Free(gemm))IR"); + + cg_llvm.call({a_v, b_v, o2}); + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + ExpectAllNear(o1, o2, 1e-5); +#endif +} + TEST(MemPlanning, SameBufSizeMemReuse) { int M = 1024; int N = 1024; @@ -95,30 +323,24 @@ TEST(MemPlanning, SameBufSizeMemReuse) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); - Tensor ET = Compute( - "add", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor ET = + Compute("add", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return DT.load(m, n) + DT.load(m, n); }); - Tensor FT = Compute( - "mul", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor FT = + Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return ET.load(m, n) * ET.load(m, n); }); auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); @@ -188,36 +410,28 @@ TEST(MemPlanning, SameBufSizeMultiMemReuses) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); - Tensor ET = Compute( - "add", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor ET = + Compute("add", {M, N}, 
[&](const ExprHandle& m, const ExprHandle& n) { return DT.load(m, n) + DT.load(m, n); }); - Tensor FT = Compute( - "mul", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor FT = + Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return ET.load(m, n) * ET.load(m, n); }); - Tensor GT = Compute( - "sub", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor GT = + Compute("sub", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return FT.load(m, n) - ET.load(m, n); }); @@ -296,42 +510,32 @@ TEST(MemPlanning, SameBufSizeMultiMemReusesOfOneBuf) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); - Tensor ET = Compute( - "add", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor ET = + Compute("add", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return DT.load(m, n) + DT.load(m, n); }); - Tensor FT = Compute( - "mul", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor FT = + Compute("mul", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return ET.load(m, n) * ET.load(m, n); }); - Tensor GT = Compute( - "sub", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor GT = + Compute("sub", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return FT.load(m, n) - 1; }); - Tensor HT = Compute( - "div", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + Tensor HT = + Compute("div", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { return GT.load(m, n) / 2; }); @@ -418,30 +622,24 @@ TEST(MemPlanning, SmallerBufSizeNonMemReuse) { Tensor CT = Reduce( "gemm", - {{M, "M"}, {N, "N"}}, + {M, N}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return AP.load(m, k) * BP.load(k, n); }, - {{K, "K"}}); - Tensor DT = Compute( - "relu", - {{M, "M"}, {N, "N"}}, - [&](const ExprHandle& m, const ExprHandle& n) { + {K}); + Tensor DT = + Compute("relu", {M, N}, [&](const ExprHandle& m, const ExprHandle& n) { auto zero = Cast::make(CT.buf()->dtype(), 0); return CompareSelect::make( CT.load(m, n), zero, zero, CT.load(m, n), kLT); }); Tensor ET = Compute( - "add", - {{M * 2, "EM"}, {N * 2, "EN"}}, - [&](const ExprHandle& em, const ExprHandle& en) { + "add", {M * 2, N * 2}, [&](const ExprHandle& em, const ExprHandle& en) { return DT.load(em / 2, en / 2) + DT.load(em / 2, en / 2); }); Tensor FT = Compute( - "mul", - {{M * 2, "FM"}, {N * 2, "FN"}}, - [&](const ExprHandle& fm, const ExprHandle& fn) { + "mul", {M * 2, N * 2}, [&](const ExprHandle& fm, const ExprHandle& fn) { return ET.load(fm, fn) * ET.load(fm, fn); }); auto stmt = Block::make({CT.stmt(), DT.stmt(), ET.stmt(), FT.stmt()}); diff --git a/test/cpp/tensorexpr/test_ops.cpp b/test/cpp/tensorexpr/test_ops.cpp index e4c9155ff60c..379c901968d5 100644 --- a/test/cpp/tensorexpr/test_ops.cpp +++ b/test/cpp/tensorexpr/test_ops.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -24,12 +25,15 @@ TEST(Ops, Sum) { constexpr int N = 
16; std::vector testDims = {{0}, {1}, {0, 1}}; std::vector> outputShapes = {{N}, {M}, {}}; - for (int idx = 0; idx < testDims.size(); idx++) { + for (unsigned idx = 0; idx < testDims.size(); idx++) { const auto& dims = testDims[idx]; const auto& outShape = outputShapes[idx]; BufHandle a("a", {M, N}, kFloat); - Tensor b = computeSum({a, dims, false}, outShape, c10::kFloat, at::kCPU); + std::vector outStrides = + c10::fmap(make_contiguous_strides(outShape)); + Tensor b = computeSum( + {a, dims, false}, outShape, outStrides, c10::kFloat, at::kCPU); auto cg = compile({a}, {b}); auto at = at::arange(M * N, at::kFloat).view({M, N}); @@ -41,3 +45,34 @@ TEST(Ops, Sum) { ASSERT_TRUE(at::allclose(bt, ref)); } } + +TEST(Ops, ChannelsLastSum) { + constexpr int A = 2; + constexpr int B = 3; + constexpr int C = 4; + constexpr int D = 5; + constexpr int E = 6; + std::vector testDims = {{0}, {1}, {0, 1}}; + + std::vector> outputShapes = { + {B, C, D, E}, {A, C, D, E}, {C, D, E}}; + for (unsigned idx = 0; idx < testDims.size(); idx++) { + const auto& dims = testDims[idx]; + const auto& outShape = outputShapes[idx]; + + BufHandle a("a", {A, B, C, D, E}, kFloat); + std::vector outStrides = + c10::fmap(make_channels_last_strides(outShape)); + Tensor b = computeSum( + {a, dims, false}, outShape, outStrides, c10::kFloat, at::kCPU); + auto cg = compile({a}, {b}); + + auto at = at::arange(A * B * C * D * E, at::kFloat).view({A, B, C, D, E}); + auto ref = at::sum(at, dims); + auto bt = at::empty_like(ref); + + cg->call({at.data_ptr(), bt.data_ptr()}); + + ASSERT_TRUE(at::allclose(bt, ref)); + } +} diff --git a/test/cpp/tensorexpr/test_quantization.cpp b/test/cpp/tensorexpr/test_quantization.cpp index f6643c86846f..82eb8573cff5 100644 --- a/test/cpp/tensorexpr/test_quantization.cpp +++ b/test/cpp/tensorexpr/test_quantization.cpp @@ -1,6 +1,6 @@ #include -#include +#include #include #include #include @@ -90,6 +90,38 @@ TEST_F(Quantization, QuantDequantUInt8) { CHECK_EQ(check, 1); } +TEST_F(Quantization, QuantDequantUInt8_NLC) { + const auto graph_string = R"IR( + graph(%x.1 : Float(1, 2, 2, strides=[4, 1, 2], device=cpu)): + %2 : int = prim::Constant[value=13]() + %3 : int = prim::Constant[value=122]() + %4 : float = prim::Constant[value=0.1]() + %q.1 : QUInt8(1, 2, 2) = aten::quantize_per_tensor(%x.1, %4, %3, %2) + %6 : Float(1, 2, 2) = aten::dequantize(%q.1) + return (%6))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto x = 2 * at::rand({1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + x.unsafeGetTensorImpl()->set_sizes_and_strides({1, 2, 2}, {4, 1, 2}); + auto q = at::quantize_per_tensor(x, 0.1f, 122, at::kQUInt8); + auto y_expected = at::dequantize(q); + TensorExprKernel k(graph); + std::vector inputs = {x}; + StmtPtr s = k.getCodeGenStmt(); + + std::vector stack = fmap(inputs); + k.run(stack); + auto y = stack[0].toTensor(); + bool check = at::allclose(y_expected, y); + if (!check) { + std::cout << "x:\n" << x << std::endl; + std::cout << "y_expected:\n" << y_expected << std::endl; + std::cout << "y:\n" << y << std::endl; + } + CHECK_EQ(check, 1); +} + at::Tensor quantized_add( at::Tensor x1, at::Tensor x2, @@ -189,7 +221,99 @@ TEST_F(Quantization, QuantAddDequantUInt8) { CHECK_EQ(check, 1); } -TEST_F(Quantization, QuantUpsampleNearest2dDequantUInt8) { +TEST_F(Quantization, QuantSigmoidDequantUInt8) { + const auto graph_string = R"IR( + graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu)): + %2 : int = prim::Constant[value=13]() + %qz1 : int = prim::Constant[value=13]() + 
%qs1 : float = prim::Constant[value=0.1]() + %q1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2) + %qa : QUInt8(2, 2) = aten::sigmoid(%q1) + %6 : Float(2, 2) = aten::dequantize(%qa) + return (%6))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQUInt8); + auto qs = at::sigmoid(q1); + auto y_expected = at::dequantize(qs); + + TensorExprKernel k(graph); + std::vector inputs = {x1}; + StmtPtr s = k.getCodeGenStmt(); + + std::vector stack = fmap(inputs); + k.run(stack); + auto y = stack[0].toTensor(); + bool check = at::allclose(y_expected, y); + if (!check) { + std::cout << "x1:\n" << x1 << std::endl; + std::cout << "q1:\n" << q1 << std::endl; + std::cout << "qs:\n" << qs << std::endl; + std::cout << "y_expected:\n" << y_expected << std::endl; + std::cout << "y:\n" << y << std::endl; + } + CHECK_EQ(check, 1); +} + +at::Tensor quantized_mul( + at::Tensor x1, + at::Tensor x2, + double scale, + int64_t zero) { + const auto op = + c10::Dispatcher::singleton() + .findSchemaOrThrow("quantized::mul", "") + .typed(); + return op.call(x1, x2, scale, zero); +} + +TEST_F(Quantization, QuantMulDequantUInt8) { + const auto graph_string = R"IR( + graph(%x1 : Float(2, 2, strides=[2, 1], device=cpu), %x2 : Float(2, 2, strides=[2, 1], device=cpu)): + %2 : int = prim::Constant[value=13]() + %qz1 : int = prim::Constant[value=13]() + %qs1 : float = prim::Constant[value=0.1]() + %qz2 : int = prim::Constant[value=13]() + %qs2 : float = prim::Constant[value=0.1]() + %qza : int = prim::Constant[value=13]() + %qsa : float = prim::Constant[value=0.1]() + %q1 : QUInt8(2, 2) = aten::quantize_per_tensor(%x1, %qs1, %qz1, %2) + %q2 : QUInt8(2, 2) = aten::quantize_per_tensor(%x2, %qs2, %qz2, %2) + %qa : QUInt8(2, 2) = quantized::mul(%q1, %q2, %qsa, %qza) + %6 : Float(2, 2) = aten::dequantize(%qa) + return (%6))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto x1 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + auto x2 = at::rand({2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + auto q1 = at::quantize_per_tensor(x1, 0.1f, 13, at::kQUInt8); + auto q2 = at::quantize_per_tensor(x2, 0.1f, 13, at::kQUInt8); + auto qa = quantized_mul(q1, q2, 0.1f, 13); + auto y_expected = at::dequantize(qa); + + TensorExprKernel k(graph); + std::vector inputs = {x1, x2}; + StmtPtr s = k.getCodeGenStmt(); + + std::vector stack = fmap(inputs); + k.run(stack); + auto y = stack[0].toTensor(); + bool check = at::allclose(y_expected, y); + if (!check) { + std::cout << "x1:\n" << x1 << std::endl; + std::cout << "q1:\n" << q1 << std::endl; + std::cout << "x2:\n" << x2 << std::endl; + std::cout << "q2:\n" << q2 << std::endl; + std::cout << "y_expected:\n" << y_expected << std::endl; + std::cout << "y:\n" << y << std::endl; + } + CHECK_EQ(check, 1); +} + +TEST_F(Quantization, QuantUpsampleNearst2dDequantUInt8) { const auto graph_string = R"IR( graph(%x : Float(1, 1, 4, 4, strides=[16, 16, 4, 1], device=cpu)): %2 : int = prim::Constant[value=13]() diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index dc0ecceb980e..5d3c44574234 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -35,7 +35,7 @@ TEST(Reductions, ReduceSum0D_1) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {}); + Tensor c = Reduce("sum", {M}, Sum(), b, {}); 
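// As with Compute, Reduce now takes plain extents: the first list gives the
// output (non-reduced) dimensions and the last one the reduction dimensions.
// For example, the 2D case further below changes from
//
//   Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}});
//
// to
//
//   Tensor c = Reduce("sum", {M}, Sum(), b, {N});
//
// In this 0D test the reduction list is empty, so "sum" effectively passes the
// M input values through the Sum reducer unchanged.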
LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -80,7 +80,7 @@ TEST(Reductions, ReduceSum1D) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{10, "m"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {10}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -109,7 +109,7 @@ TEST(Reductions, ReduceSum2D) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -138,7 +138,7 @@ TEST(Reductions, ReduceSum3D) { BufHandle b("b", {2, 3, m}, kFloat); - Tensor c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); + Tensor c = Reduce("sum", {2, 3}, Sum(), b, {m}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -168,7 +168,7 @@ TEST(Reductions, ReduceSum3D) { ASSERT_EQ(cData[i], expected); } - Tensor d = Reduce("sum2", {{2, "l"}}, Sum(), b, {{3, "n"}, {m, "m"}}); + Tensor d = Reduce("sum2", {2}, Sum(), b, {3, m}); LoopNest loop2({d}); loop2.prepareForCodegen(); StmtPtr s2 = loop2.root_stmt(); @@ -186,7 +186,7 @@ TEST(Reductions, ReduceSum3D) { // This is the same as just reducing the original result across that axis. BufHandle c_buf(c.buf()); - Tensor e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); + Tensor e = Reduce("sum3", {2}, Sum(), c_buf, {3}); LoopNest loop3({e}); loop3.prepareForCodegen(); StmtPtr s3 = loop3.root_stmt(); @@ -210,12 +210,7 @@ TEST(Reductions, ReduceSum10D) { std::vector in(InputSize, 1.f); std::vector out(OutputSize, -1.f); - Tensor c = Reduce( - "sum", - {{2, "a"}, {3, "b"}, {2, "c"}, {3, "d"}, {2, "e"}}, - Sum(), - in_, - {{3, "f"}, {2, "g"}, {3, "h"}, {2, "i"}, {3, "j"}}); + Tensor c = Reduce("sum", {2, 3, 2, 3, 2}, Sum(), in_, {3, 2, 3, 2, 3}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -250,7 +245,7 @@ TEST(Reductions, ReduceProduct) { Reducer product( ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; }); - Tensor c = Reduce("product", {{M, "m"}}, product, b, {{N, "n"}}); + Tensor c = Reduce("product", {M}, product, b, {N}); LoopNest loop({c}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -281,7 +276,7 @@ TEST(Reductions, ReduceMax) { in[j] = j; } - Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {{10, "m"}}); + Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {10}); LoopNest loop({dm1}); loop.prepareForCodegen(); @@ -296,7 +291,7 @@ TEST(Reductions, ReduceMax) { BufHandle in2_("b", {2, 5}, kFloat); std::vector out2(2, -1.f); - Tensor m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); + Tensor m2d = Reduce("max", {2}, Maximum(kFloat), in2_, {5}); LoopNest loop2({m2d}); loop2.prepareForCodegen(); @@ -326,7 +321,7 @@ TEST(Reductions, ReduceMinCustomInitializer) { {}, Minimum(ExprHandle(minInit)), [&](ParameterList& v) { return in_.load(v); }, - {{10, "m"}}); + {10}); LoopNest loop({min}); loop.prepareForCodegen(); @@ -357,12 +352,12 @@ TEST(Reductions, ReduceAnyAll) { Tensor any = Reduce( "anyEqual", - {{4, "i"}}, + {4}, anyEqSV, [&](const auto& i, const auto& j) { return CompareSelect::make(b.load(i, j), searchValue, kEQ); }, - {{10, "j"}}); + {10}); LoopNest loop({any}); loop.prepareForCodegen(); @@ -400,12 +395,12 @@ TEST(Reductions, ReduceAnyAll) { Tensor allGreaterThan = Reduce( "allGreaterThan", - {{4, "i"}}, + {4}, allGTSV, [&](const auto& i, const auto& j) { return CompareSelect::make(b.load(i, 
j), searchValue, kGT); }, - {{10, "j"}}); + {10}); LoopNest loop2({allGreaterThan}); loop2.prepareForCodegen(); @@ -448,12 +443,12 @@ TEST(Reductions, ReduceMatmul2D) { Tensor mm = Reduce( "mm", - {{3, "m"}, {3, "n"}}, + {3, 3}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { return tA.load(m, k) * tB.load(k, n); }, - {{2, "k"}}); + {2}); LoopNest loop({mm}); loop.prepareForCodegen(); @@ -480,10 +475,10 @@ TEST(Reductions, ReduceRfactorLike) { std::vector in_rf_(10, -2.f); std::vector out(1, -1.f); - Tensor l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); + Tensor l1 = Reduce("l1", {10}, Sum(), in, {10}); BufHandle in_rf(l1.buf()); - Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); + Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {10}); LoopNest loop({l1, l2}); loop.prepareForCodegen(); @@ -503,11 +498,9 @@ TEST(Reductions, ReduceAsProducer) { BufHandle a("a", {2, 3}, kFloat); BufHandle b("b", {2, 3, m}, kFloat); - Tensor c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); - Tensor d = Compute( - "scale", - {{2, "l2"}, {3, "n1"}}, - [&](const VarHandle& l, const VarHandle& n) { + Tensor c = Reduce("sum", {2, 3}, Sum(), b, {m}); + Tensor d = + Compute("scale", {2, 3}, [&](const VarHandle& l, const VarHandle& n) { return c.load(l, n) * a.load(l, n); }); LoopNest loop({d}, {c, d}); @@ -548,11 +541,11 @@ TEST(Reductions, ReduceAsConsumer) { Tensor c = Compute( "scale", - {{2, "l2"}, {3, "n1"}, {m, "m1"}}, + {2, 3, m}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); + Tensor d = Reduce("sum", {2}, Sum(), c, {3, m}); LoopNest loop({d}, {c, d}); loop.prepareForCodegen(); StmtPtr s = loop.root_stmt(); @@ -599,7 +592,7 @@ TEST(Reductions, SplitReduceAxis) { } std::vector out(16, -1.f); - Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {16}, Sum(), in, {8}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[1], 2); @@ -627,7 +620,7 @@ TEST(Reductions, SplitNonReduceAxis) { } } std::vector out(16, -1.f); - Tensor tensor = Reduce("sum", {{16, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {16}, Sum(), in, {8}); LoopNest l({tensor}); std::vector loops = l.getLoopStmtsFor(tensor); LoopNest::splitWithTail(loops[0], 2); @@ -657,14 +650,14 @@ TEST(Reductions, ReorderedReductionInitializer) { BufHandle in("in", {1, 12, 6}, kFloat); std::vector in_(12 * 6, 1.f); - Tensor tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor_ = Reduce("sum", {1, 12}, Sum(), in, {6}); LoopNest l_({tensor_}); l_.prepareForCodegen(); StmtPtr s_ = Stmt::clone(l_.root_stmt()); s_ = IRSimplifier::simplify(s_); - Tensor tensor = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); + Tensor tensor = Reduce("sum", {1, 12}, Sum(), in, {6}); LoopNest l({tensor}); auto loops = l.getLoopStmtsFor(tensor); @@ -709,7 +702,7 @@ TEST(Reductions, ReduceRfactor) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -742,7 +735,7 @@ TEST(Reductions, Reduce3DRfactorInner) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, 
Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -775,7 +768,7 @@ TEST(Reductions, Reduce3DRfactorOuter) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "m"}, {n, "n"}, {k, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -799,12 +792,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { std::vector out(1, -1.f); std::vector ref(1, -1.f); - Tensor c = Reduce( - "sum", - {}, - Sum(), - in_, - {{2, "a"}, {3, "b"}, {4, "c"}, {5, "d"}, {6, "e"}}); + Tensor c = Reduce("sum", {}, Sum(), in_, {2, 3, 4, 5, 6}); LoopNest orig_loop({c}); // Try rfactoring N outer loops @@ -850,7 +838,7 @@ TEST(Reductions, ReduceSplitTail) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 8); @@ -880,7 +868,7 @@ TEST(Reductions, ReduceSplitNoTail) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 5); @@ -912,7 +900,7 @@ TEST(Reductions, ReduceOverSplitTail) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[i], 16); @@ -943,7 +931,7 @@ TEST(Reductions, ReduceSplitMask) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 8); @@ -973,7 +961,7 @@ TEST(Reductions, ReduceSplitNoMask) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 5); @@ -1004,7 +992,7 @@ TEST(Reductions, ReduceOverSplitMask) { for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithMask(loops[i], 16); @@ -1038,7 +1026,7 @@ TEST(Reductions, ReduceSplitRfactor) { std::vector out(M, -1.f); - Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); LoopNest::splitWithTail(loops[2], SPLIT_FACTOR); @@ -1078,7 +1066,7 @@ TEST(Reductions, ReduceOverSplitRfactor) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{N, "n"}, {K, "k"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {N, K}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ 
-1128,10 +1116,9 @@ TEST(Reductions, ReduceInlineReduction) { BufHandle a_buf("a", {M}, kFloat); BufHandle b_buf("b", {M, N, K}, kFloat); - Tensor x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); - Tensor y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { - return a_buf.load(m) + x.load(m); - }); + Tensor x = Reduce("x", {M}, Sum(), b_buf, {N, K}); + Tensor y = Compute( + "y", {M}, [&](const VarHandle& m) { return a_buf.load(m) + x.load(m); }); PaddedBuffer a_v(M); PaddedBuffer b_v(M, N, K); @@ -1162,11 +1149,11 @@ TEST(Reductions, ReduceInlineConsumer) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); - Tensor y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {M}, Sum(), x, {N, K}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1215,7 +1202,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Tensor x = Compute( "x", - {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + {M, N, K}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); @@ -1223,7 +1210,7 @@ TEST(Reductions, ReduceInlineReducerInternal) { Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) { return Add::make(ExprHandle(1.f), Min::make(a, b, false)); }); - Tensor y = Reduce("y", {{M, "m2"}}, minimum, x, {{N, "n2"}, {K, "k2"}}); + Tensor y = Reduce("y", {M}, minimum, x, {N, K}); PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); @@ -1272,26 +1259,28 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { Tensor c = Compute( "scale", - {{L, "l2"}, {N, "n1"}, {M, "m1"}}, + {L, N, M}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); - Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); LoopNest l({e}, {c, d, e}); LoopNest l_before(l); l_before.prepareForCodegen(); - SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); + SimpleIREvaluator cg_before( + LoopNest::sanitizeNames(l_before.root_stmt()), {a, b, e}); StmtPtr d_loop = l.getLoopStmtsFor(d)[0]; l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg_after(result, {a, b, e}); std::ostringstream oss; @@ -1299,16 +1288,16 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { const std::string& expected_ir = R"IR( #CHECK: Allocate(d_local); // dtype=float, dims=[4] -#CHECK: for (int l1 -#CHECK: d_local[l1] = 0.f -#CHECK: for (int n1 -#CHECK: for (int m1 -#CHECK: d_local[l1] = (d_local[l1]) + (scale[ +#CHECK: for (int i_2 +#CHECK: d_local[i_2] = 0.f +#CHECK: for (int +#CHECK: for (int +#CHECK: d_local[i_2] = (d_local[i_2]) + (scale[ #CHECK: } #CHECK: } #CHECK: } -#CHECK: for (int i -#CHECK: sum[i] = d_local[i] +#CHECK: for (int i_3 +#CHECK: sum[i_3] = d_local[i_3] #CHECK: Free(d_local); #CHECK-NOT: d_local )IR"; @@ -1347,13 +1336,13 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { Tensor c = Compute( "scale", - {{L, "l2"}, {N, "n1"}, {M, "m1"}}, + {L, N, M}, [&](const VarHandle& l, const VarHandle& 
n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); - Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1366,7 +1355,8 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg_after(result, {a, b, e}); std::ostringstream oss; @@ -1374,14 +1364,14 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { const std::string& expected_ir = R"IR( #CHECK: Allocate(d_local); // dtype=float, dims=[1] -#CHECK: sum[l1] = 0 -#CHECK: d_local[0] = sum[l1] -#CHECK: for (int n1 -#CHECK: for (int m1 +#CHECK: sum[i_1] = 0 +#CHECK: d_local[0] = sum[i_1] +#CHECK: for (int j_1 +#CHECK: for (int k_1 #CHECK: d_local[0] = (d_local[0]) + (scale[ #CHECK: } #CHECK: } -#CHECK: sum[l1] = d_local[0] +#CHECK: sum[i_1] = d_local[0] #CHECK: Free(d_local); #CHECK-NOT: d_local )IR"; @@ -1420,13 +1410,13 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { Tensor c = Compute( "scale", - {{L, "l2"}, {N, "n1"}, {M, "m1"}}, + {L, N, M}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{L, "l1"}}, Sum(), c, {{N, "n1"}, {M, "m1"}}); + Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); - Tensor e = Compute("scale", {{L, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1439,7 +1429,8 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { l.cacheAccesses(d.buf(), "d_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg_after(result, {a, b, e}); std::ostringstream oss; @@ -1447,13 +1438,13 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { const std::string& expected_ir = R"IR( #CHECK: Allocate(d_local); // dtype=float, dims=[1] -#CHECK: sum[l1] = 0 -#CHECK: for (int n1 +#CHECK: sum[i_1] = 0 +#CHECK: for (int #CHECK: d_local[0] = 0 -#CHECK: for (int m1 +#CHECK: for (int #CHECK: d_local[0] = (d_local[0]) + (scale[ #CHECK: } -#CHECK: sum[l1] = (sum[l1]) + (d_local[0]) +#CHECK: sum[i_1] = (sum[i_1]) + (d_local[0]) #CHECK: } #CHECK: Free(d_local); #CHECK-NOT: d_local @@ -1489,13 +1480,13 @@ TEST(Reductions, ReductionCacheBodyAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1505,7 +1496,8 @@ TEST(Reductions, ReductionCacheBodyAccess) { l.cacheAccesses(c.buf(), "scale_local", d_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + 
LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); std::ostringstream oss; @@ -1513,11 +1505,11 @@ TEST(Reductions, ReductionCacheBodyAccess) { const std::string& expected_ir = R"IR( #CHECK: Allocate(scale_local); // dtype=float, dims=[1, 32, 12] -#CHECK: for (int j = 0; j < 32; j++) { -#CHECK: for (int k = 0; k < 12; k++) { -#CHECK: scale_local[k + 12 * j] = scale[(k + 12 * j) + 384 * l1]; -#CHECK: sum[l1] = (sum[l1]) + (scale_local[m1_1 + 12 * n1_1]); -#CHECK: scale_1[l] = (b[l]) * (sum[l]); +#CHECK: for (int j_1 = 0; j_1 < 32; j_1++) { +#CHECK: for (int k_1 = 0; k_1 < 12; k_1++) { +#CHECK: scale_local[k_1 + 12 * j_1] = scale[(k_1 + 12 * j_1) + 384 * i_1]; +#CHECK: sum[i_1] = (sum[i_1]) + (scale_local[k_2 + 12 * j_2]); +#CHECK: scale_1[i_2] = (b[i_2]) * (sum[i_2]); #CHECK: Free(scale_local); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); @@ -1529,13 +1521,13 @@ TEST(Reductions, ReductionCacheConsumerAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1547,7 +1539,8 @@ TEST(Reductions, ReductionCacheConsumerAccess) { l.cacheAccesses(d.buf(), "sum_local", e_loop); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); std::ostringstream oss; @@ -1555,10 +1548,10 @@ TEST(Reductions, ReductionCacheConsumerAccess) { const std::string& expected_ir = R"IR( #CHECK: Alias(sum_local,scale); -#CHECK: sum[l1] = (sum[l1]) + (scale[ -#CHECK: for (int i = 0; i < 4 -#CHECK: sum_local[i] = sum[i + 4 * l_outer]; -#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); +#CHECK: sum[i_1] = (sum[i_1]) + (scale[ +#CHECK: for (int j_2 = 0; j_2 < 4 +#CHECK: sum_local[j_2] = sum[j_2 + 4 * i_2]; +#CHECK: scale_1[j_3 + 4 * i_2] = (b[j_3 + 4 * i_2]) * (sum_local[j_3]); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); } @@ -1569,13 +1562,13 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1593,7 +1586,8 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); // reduction changes but cache does not. 
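// That is, relative to ReductionCacheConsumerAccess above, splitting the
// output axis of "sum" shows up in the reduction body as the store
// sum[j_1 + 4 * i_1], while the consumer-side cache created by
// l.cacheAccesses(d.buf(), "sum_local", inner) is untouched: each of the 6
// outer iterations of "scale_1" still stages 4 values of sum into sum_local
// (covering all 24 outputs) before consuming them, as the expected IR in the
// next hunk shows.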
@@ -1602,10 +1596,12 @@ TEST(Reductions, ReductionSplitCacheConsumerAccess) { const std::string& expected_ir = R"IR( #CHECK: Alias(sum_local,scale); -#CHECK: sum[l1_inner + 4 * l1_outer] = (sum[l1_inner + 4 * l1_outer]) + (scale[((m1_1 + 12 * n1_1) + 1536 * l1_outer) + 384 * l1_inner]); -#CHECK: for (int i = 0; i < 4 -#CHECK: sum_local[i] = sum[i + 4 * l_outer]; -#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); +#CHECK: sum[j_1 + 4 * i_1] = (sum[j_1 + 4 * i_1]) + (scale[((l + 12 * k_1) + 1536 * i_1) + 384 * j_1]); +#CHECK: for (int i_2 = 0; i_2 < 6 +#CHECK: for (int j_2 = 0; j_2 < 4 +#CHECK: sum_local[j_2] = sum[j_2 + 4 * i_2]; +#CHECK: for (int j_3 = 0; j_3 < 4 +#CHECK: scale_1[j_3 + 4 * i_2] = (b[j_3 + 4 * i_2]) * (sum_local[j_3]); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); } @@ -1616,13 +1612,13 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { Tensor c = Compute( "scale", - {{24, "l2"}, {32, "n1"}, {12, "m1"}}, + {24, 32, 12}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { return b.load(l, n, m) * a.load(l, n, m); }); - Tensor d = Reduce("sum", {{24, "l1"}}, Sum(), c, {{32, "n1"}, {12, "m1"}}); + Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); - Tensor e = Compute("scale", {{24, "l"}}, [&](const VarHandle& l) { + Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { return b.load(0, 0, l) * d.load(l); }); @@ -1641,7 +1637,8 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { l.cacheAccesses(d.buf(), "sum_local", inner); l.prepareForCodegen(); - StmtPtr result = IRSimplifier::simplify(l.root_stmt()); + StmtPtr result = + LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); SimpleIREvaluator cg(result, {a, b, e}); // neither reduction body not cache changes. 
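For the consumer-cache variants above, the interesting call is cacheAccesses on the reduction buffer: NNC allocates a small local buffer ("sum_local"), fills it from "sum" once per outer iteration, and rewrites the consumer to read the local copy, which is what the updated CHECK lines assert (a fill loop over j_2 followed by a use loop over j_3 once the consumer loop is split). Below is a sketch of that schedule, assuming the same headers as the sketch above; the LoopNest constructor and the way the consumer loop is fetched are assumptions, since that part of the test lies outside these hunks, while the scheduling calls mirror the diff.

void cacheConsumerSketch() {
  BufHandle a("a", {24, 32, 12}, kFloat);
  BufHandle b("b", {24, 32, 12}, kFloat);
  Tensor c = Compute(
      "scale", {24, 32, 12},
      [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) {
        return b.load(l, n, m) * a.load(l, n, m);
      });
  Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12});
  Tensor e = Compute("scale", {24}, [&](const VarHandle& l) {
    return b.load(0, 0, l) * d.load(l);
  });
  // Assumed constructor form: outputs first, then every tensor to compute.
  LoopNest l({e}, {c, d, e});
  // Assumed way to grab the loop that consumes d; the tests pick this loop
  // (or its split inner loop) around the second "scale" computation.
  auto e_loop = l.getLoopStmtsFor(e)[0];
  // Cache reads of the reduction result inside the consumer loop.
  l.cacheAccesses(d.buf(), "sum_local", e_loop);
  l.prepareForCodegen();
  StmtPtr result =
      LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt()));
  std::cout << *result << std::endl;
}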
@@ -1649,10 +1646,12 @@ TEST(Reductions, ReductionReorderCacheConsumerAccess) { oss << *cg.stmt(); const std::string& expected_ir = R"IR( -#CHECK: sum[l1] = (sum[l1]) + (scale[(m1_1 + 12 * n1_1) + 384 * l1]); -#CHECK: for (int i = 0; i < 4 -#CHECK: sum_local[i] = sum[i + 4 * l_outer]; -#CHECK: scale_1[l_inner + 4 * l_outer] = (b[l_inner + 4 * l_outer]) * (sum_local[l_inner]); +#CHECK: sum[j_1] = (sum[j_1]) + (scale[(k_1 + 12 * i_2) + 384 * j_1]); +#CHECK: for (int i_3 = 0; i_3 < 6; +#CHECK: for (int j_2 = 0; j_2 < 4; +#CHECK: sum_local[j_2] = sum[j_2 + 4 * i_3]; +#CHECK: for (int j_3 = 0; j_3 < 4; +#CHECK: scale_1[j_3 + 4 * i_3] = (b[j_3 + 4 * i_3]) * (sum_local[j_3]); )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); } @@ -1673,7 +1672,7 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); @@ -1693,7 +1692,7 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { LoopNest::cacheAccesses(rfac_buf, "tmp", all_loops[1][1]); loop.simplify(); loop.prepareForCodegen(); - StmtPtr s = loop.root_stmt(); + StmtPtr s = LoopNest::sanitizeNames(loop.root_stmt()); SimpleIREvaluator cg(s, {b, c, m, n, k}); std::ostringstream oss; @@ -1702,17 +1701,17 @@ TEST(Reductions, ReductionRfactorCacheTempOuter) { R"IR( #CHECK: Allocate(sum_rfac); // dtype=float, dims=[n] #CHECK: Allocate(tmp); // dtype=float, dims=[n] -#CHECK: for (int a = 0; a < m -#CHECK: for (int i = 0; i < n -#CHECK: tmp[i] = 0 +#CHECK: for (int i_1 = 0; i_1 < m +#CHECK: for (int j = 0; j < n +#CHECK: tmp[j] = 0 #CHECK: } -#CHECK: for (int b = 0; b < n -#CHECK: for (int c -#CHECK: tmp[b] = (tmp[b]) + (B[ +#CHECK: for (int j_1 = 0; j_1 < n +#CHECK: for (int k +#CHECK: tmp[j_1] = (tmp[j_1]) + (B[ #CHECK: } #CHECK: } -#CHECK: for (int i = 0; i < n -#CHECK: sum_rfac[i] = (sum_rfac[i]) + (tmp[i]); +#CHECK: for (int j_2 = 0; j_2 < n +#CHECK: sum_rfac[j_2] = (sum_rfac[j_2]) + (tmp[j_2]); #CHECK: } #CHECK: Free(tmp); #CHECK-NOT: tmp @@ -1739,7 +1738,7 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { std::vector out(1, -1.f); - Tensor c = Reduce("sum", {}, Sum(), b, {{m, "a"}, {n, "b"}, {k, "c"}}); + Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); LoopNest loop({c}); std::vector loops = loop.getLoopStmtsFor(c); auto c_body = loop.getAllWritesToBuf(c.buf())[1]; @@ -1759,7 +1758,7 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { LoopNest::cacheAccesses(rfac_buf, "tmp", all_loops[1][2]); loop.prepareForCodegen(); loop.simplify(); - StmtPtr s = loop.root_stmt(); + StmtPtr s = LoopNest::sanitizeNames(loop.root_stmt()); SimpleIREvaluator cg(s, {b, c, m, n, k}); std::ostringstream oss; @@ -1768,13 +1767,13 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { R"IR( #CHECK: Allocate(sum_rfac); // dtype=float, dims=[n] #CHECK: Allocate(tmp); // dtype=float, dims=[1] -#CHECK: for (int a = 0; a < m -#CHECK: for (int b = 0; b < n +#CHECK: for (int i_1 = 0; i_1 < m +#CHECK: for (int j = 0; j < n #CHECK: tmp[0] = 0 -#CHECK: for (int c +#CHECK: for (int k #CHECK: tmp[0] = (tmp[0]) + (B[ #CHECK: } -#CHECK: sum_rfac[b] = (sum_rfac[b]) + (tmp[0]); +#CHECK: sum_rfac[j] = (sum_rfac[j]) + (tmp[0]); #CHECK: Free(tmp); #CHECK-NOT: tmp )IR"; @@ -1796,7 +1795,7 @@ TEST(Reductions, ReductionVectorize) { BufHandle in("in", {8, 8}, kFloat); - Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = 
Reduce("sum", {8}, Sum(), in, {8}); LoopNest l_before({tensor}); LoopNest l(l_before); l_before.prepareForCodegen(); @@ -1806,15 +1805,15 @@ TEST(Reductions, ReductionVectorize) { ASSERT_TRUE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[0])); StmtPtr s = l.root_stmt(); - s = IRSimplifier::simplify(s); + s = LoopNest::sanitizeNames(IRSimplifier::simplify(s)); std::ostringstream oss; oss << *s; const std::string& expected_ir = R"IR( #CHECK: sum[Ramp(0, 1, 8)] = Broadcast(0.f, 8); -#CHECK: for (int n = 0; n < 8; n++) { -#CHECK: sum[Ramp(0, 1, 8)] = ReduceOp((sum[Ramp(0, 1, 8)]) + (in[Ramp(n, 8, 8)]), reduce_args={n}); +#CHECK: for (int i = 0; i < 8; i++) { +#CHECK: sum[Ramp(0, 1, 8)] = ReduceOp((sum[Ramp(0, 1, 8)]) + (in[Ramp(i, 8, 8)]), reduce_args={i}); #CHECK: } )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); @@ -1832,7 +1831,7 @@ TEST(Reductions, ReductionVectorize) { TEST(Reductions, ReductionVectorizeInner) { BufHandle in("in", {8, 8}, kFloat); - Tensor tensor = Reduce("sum", {{8, "m"}}, Sum(), in, {{8, "n"}}); + Tensor tensor = Reduce("sum", {8}, Sum(), in, {8}); LoopNest l({tensor}); ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1])); @@ -1850,7 +1849,7 @@ TEST(Reductions, ReductionVectorizeRfactor) { BufHandle in("in", {8, 8}, kFloat); - Tensor tensor = Reduce("sum", {}, Sum(), in, {{8, "m"}, {8, "n"}}); + Tensor tensor = Reduce("sum", {}, Sum(), in, {8, 8}); LoopNest l_before({tensor}); LoopNest l(l_before); @@ -1875,21 +1874,21 @@ TEST(Reductions, ReductionVectorizeRfactor) { ASSERT_TRUE(LoopNest::vectorize(rfac_loops[1][0])); l.simplify(); - StmtPtr s = l.root_stmt(); + StmtPtr s = LoopNest::sanitizeNames(l.root_stmt()); std::ostringstream oss; oss << *s; const std::string& expected_ir = R"IR( #CHECK: sum = 0.f; -#CHECK: for (int n = 0; n < 8; n++) { -#CHECK: sum_rfac[n] = 0.f; +#CHECK: for (int i = 0; i < 8; i++) { +#CHECK: sum_rfac[i] = 0.f; #CHECK: } -#CHECK: for (int m = 0; m < 8; m++) { -#CHECK: sum_rfac[Ramp(0, 1, 8)] = ReduceOp((sum_rfac[Ramp(0, 1, 8)]) + (in[Ramp(8 * m, 1, 8)]), reduce_args={m}); +#CHECK: for (int i_1 = 0; i_1 < 8; i_1++) { +#CHECK: sum_rfac[Ramp(0, 1, 8)] = ReduceOp((sum_rfac[Ramp(0, 1, 8)]) + (in[Ramp(8 * i_1, 1, 8)]), reduce_args={i_1}); #CHECK: } -#CHECK: for (int n = 0; n < 8; n++) { -#CHECK: sum = ReduceOp((sum) + (sum_rfac[n]), reduce_args={n}); +#CHECK: for (int i_2 = 0; i_2 < 8; i_2++) { +#CHECK: sum = ReduceOp((sum) + (sum_rfac[i_2]), reduce_args={i_2}); #CHECK: } )IR"; torch::jit::testing::FileCheck().run(expected_ir, oss.str()); @@ -1910,22 +1909,22 @@ TEST(Reductions, InitFunction) { BufHandle B("B", {N}, kFloat); Tensor C = Reduce( "C", - {{N, "n"}}, + {N}, Sum(), [&](const std::vector& v) { return B.load(v[0]); }, [&](const std::vector& v) { return A.load(v[1], v[0]); }, - {{M, "m"}}); + {M}); LoopNest nest({C}); nest.prepareForCodegen(); - StmtPtr s = IRSimplifier::simplify(nest.root_stmt()); + StmtPtr s = LoopNest::sanitizeNames(IRSimplifier::simplify(nest.root_stmt())); std::ostringstream oss; oss << *s << "\n"; const std::string& expected_ir = R"IR( -#CHECK: for (int n = 0; n < 16; n++) { -#CHECK: C[n] = B[n]; -#CHECK: for (int m = 0; m < 32; m++) { -#CHECK: C[n] = (C[n]) + (A[n + 16 * m]); +#CHECK: for (int i = 0; i < 16; i++) { +#CHECK: C[i] = B[i]; +#CHECK: for (int j = 0; j < 32; j++) { +#CHECK: C[i] = (C[i]) + (A[i + 16 * j]); #CHECK: } #CHECK: } )IR"; diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index 21f85ce160af..2a4322a64f9c 100644 --- 
a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3858,26 +3858,25 @@ TEST(Simplify, SimplifyForCleansUp) { BufHandle a("a", {1, 12, 1}, kFloat); VarHandle x("x", kInt); Tensor b = Compute( - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) "x", - {{1, "i"}, {12, "m"}, {1, "n"}}, + {1, 12, 1}, [](const VarHandle& i, const VarHandle& m, const VarHandle& n) { return i + m + n; }); LoopNest l({b}); l.prepareForCodegen(); - StmtPtr body = l.root_stmt(); + StmtPtr body = LoopNest::sanitizeNames(l.root_stmt()); StmtPtr simplified = IRSimplifier::simplify(body); BlockPtr block = to(simplified); IS_NODE_WITH_NAME(For, block->front(), for_); // for is over "m". - IS_VAR_WITH_NAME(for_->var(), "m"); + IS_VAR_WITH_NAME(for_->var(), "j"); // x[m] = m; IS_NODE_WITH_NAME(Store, for_->body()->front(), store); - IS_VAR_WITH_NAME(store->flat_index(), "m"); - IS_VAR_WITH_NAME(store->value(), "m"); + IS_VAR_WITH_NAME(store->flat_index(), "j"); + IS_VAR_WITH_NAME(store->value(), "j"); } } @@ -4118,7 +4117,7 @@ TEST(Simplify, SimplifyReorderForCond) { 0, 4, Cond::make( - CompareSelect::make(i, 10, CompareSelectOperation::kLT), + CompareSelect::make(i, 2, CompareSelectOperation::kEQ), Store::make(c, {i}, Load::make(a, {i})), nullptr)); @@ -4235,7 +4234,7 @@ TEST(Simplify, SimplifyReorderForCond) { CompareSelect::make( Load::make(a, {0}), 10, CompareSelectOperation::kLT), Cond::make( - CompareSelect::make(i, 10, CompareSelectOperation::kEQ), + CompareSelect::make(i, 3, CompareSelectOperation::kEQ), Store::make(c, {0}, Load::make(a, {i})), nullptr), nullptr)); @@ -4825,7 +4824,739 @@ TEST(Simplify, SimplifyBroadcastTermExpander) { } } -TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) { +TEST(Simplify, CompareSelectLoopBounds) { + constexpr int N = 8; + BufHandle b("b", {N}, kFloat); + VarHandle n("n", kInt); + VarHandle m("m", kInt); + VarHandle var_N("var_N", kInt); + VarHandle var_M("var_M", kInt); + + auto test_case_fn = [](const VarHandle& n, + const BufHandle& b, + const ExprHandle& start, + const ExprHandle& stop, + const int& cmp_val, + const CompareSelectOperation& cmp_op, + const std::string& check_string) { + StmtPtr s = For::make( + n, + start, + stop, + b.store({n}, CompareSelect::make(n, cmp_val, 0.f, 1.0f, cmp_op))); + s = IRSimplifier::simplify(s); + std::ostringstream oss; + oss << *s; + std::string target_string = "# CHECK: "; + target_string += check_string; + torch::jit::testing::FileCheck().run(target_string, oss.str()); + }; + + auto test_case_nest_loops_fn = [](const VarHandle& n, + const VarHandle& m, + const BufHandle& b, + const ExprHandle& n_start, + const ExprHandle& n_stop, + const ExprHandle& m_start, + const ExprHandle& m_stop, + const CompareSelectOperation& cmp_op, + const std::string& check_string) { + StmtPtr s = For::make( + m, + m_start, + m_stop, + b.store({n, m}, CompareSelect::make(n, m, 0.f, 1.0f, cmp_op))); + StmtPtr root_s = For::make(n, n_start, n_stop, s); + root_s = IRSimplifier::simplify(root_s); + std::ostringstream oss; + oss << *root_s; + std::string target_string = "# CHECK: "; + target_string += check_string; + torch::jit::testing::FileCheck().run(target_string, oss.str()); + }; + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 1, kLT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 1 ? 
0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kLE, "b[n] = n<=1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 0, kLE, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 0, kLT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N, kLT, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N - 1, kLE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n <= 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N, kLE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n < 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kLT, "b[n] = n<7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, 0, kGT, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kGT, "b[n] = n>1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, 1, kGE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kGT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kGE, "b[n] = n>=7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 5 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 5 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 5, kGT, "b[n] = n>5 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 5 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 5 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 5, kGE, "b[n] = n>=5 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n > 8 ? 
0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N, kGT, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n >= 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N, kGE, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, 2)) { + // b[n] = n == 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, 2)) { + // b[1] = 0.f; + // } + test_case_fn(n, b, 1, 2, 1, kEQ, "b[1] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kEQ, "b[n] = n==1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, 0, kEQ, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kEQ, "b[n] = n==7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n == 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 1.f; + // } + test_case_fn(n, b, 1, N, N, kEQ, "b[n] = 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 1 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 1 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 1, kNE, "b[n] = n!=1 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 7 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 7 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, N - 1, kNE, "b[n] = n!=7 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 5 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 5 ? 0.f : 1.f; + // } + test_case_fn(n, b, 1, N, 5, kNE, "b[n] = n!=5 ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 0 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, 0, kNE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(1, N)) { + // b[n] = n != 8 ? 0.f : 1.f; + // } + // After: + // for (const auto n : c10::irange(1, N)) { + // b[n] = 0.f; + // } + test_case_fn(n, b, 1, N, N, kNE, "b[n] = 0.f;"); + + // Before: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 20, 30, 40, kNE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_N + 30, + var_N + 40, + kNE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_M + 30, + var_M + 40, + kNE, + "b[n, m] = n!=m ? 
0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 20, kNE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 20, + kNE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 20, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + test_case_nest_loops_fn( + n, m, b, 30, 40, 10, 31, kNE, "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 31, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 31, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n != m) ? 0.f : 1.f; + // } + // } + test_case_nest_loops_fn( + n, m, b, 10, 31, 30, 40, kNE, "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_N + 30, + var_N + 40, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_M + 30, + var_M + 40, + kNE, + "b[n, m] = n!=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n < m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 20, 30, 40, kLT, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_N + 30, + var_N + 40, + kLT, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_M + 30, + var_M + 40, + kLT, + "b[n, m] = n m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 20, kGT, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 20, + kGT, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 20, + kGT, + "b[n, m] = n>m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n > m) ? 
0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 1.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 31, 30, 40, kGT, "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_N + 30, + var_N + 40, + kGT, + "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_M + 30, + var_M + 40, + kGT, + "b[n, m] = n>m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = (n >= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 31)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 31, kGE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 31, + kGE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 31, + kGE, + "b[n, m] = n>=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n >= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 20)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 1.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 20, 30, 40, kGE, "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_N + 30, + var_N + 40, + kGE, + "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 20, + var_M + 30, + var_M + 40, + kGE, + "b[n, m] = n>=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = (n <= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(10, 31)) { + // for(const auto m : c10::irange(30, 40)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 10, 31, 30, 40, kLE, "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_N + 30, + var_N + 40, + kLE, + "b[n, m] = 0.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 10, + var_N + 31, + var_M + 30, + var_M + 40, + kLE, + "b[n, m] = n<=m ? 0.f : 1.f;"); + + // Before: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = (n <= m) ? 0.f : 1.f; + // } + // } + // After: + // for (const auto n : c10::irange(30, 40)) { + // for(const auto m : c10::irange(10, 20)) { + // b[n, m] = 0.f; + // } + // } + test_case_nest_loops_fn(n, m, b, 30, 40, 10, 20, kLE, "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_N + 10, + var_N + 20, + kLE, + "b[n, m] = 1.f;"); + test_case_nest_loops_fn( + n, + m, + b, + var_N + 30, + var_N + 40, + var_M + 10, + var_M + 20, + kLE, + "b[n, m] = n<=m ? 0.f : 1.f;"); +} + +TEST(Simplify, CompareSelectCondAlwaysInLoopBounds) { // Before: // for (const auto n : c10::irange(1, N)) { // b[n] = n < 1 ? 0.f : 1.f; @@ -4849,7 +5580,7 @@ TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) { oss.str()); } -TEST(Simplify, DISABLED_IfThenCondAlwaysInLoopBounds) { +TEST(Simplify, IfThenCondAlwaysInLoopBounds) { // Before: // for (const auto n : c10::irange(1, N)) { // b[n] = IfThenElse(n < 1 ? 
1 : 0, 0.f, 1.f); @@ -4873,7 +5604,7 @@ TEST(Simplify, DISABLED_IfThenCondAlwaysInLoopBounds) { oss.str()); } -TEST(Simplify, DISABLED_MultiClauseCondAlwaysInLoopBounds) { +TEST(Simplify, MultiClauseCondAlwaysInLoopBounds) { // This test mimics the unpadded region of a conv2d. We want to remove any // conditional that is provably satisfied (or unsatisfied) by the entire loop // range. @@ -4902,7 +5633,7 @@ TEST(Simplify, DISABLED_MultiClauseCondAlwaysInLoopBounds) { oss << *s; torch::jit::testing::FileCheck().run( R"IR( -# CHECK: b[n] = 1.f; +# CHECK: b[i, j] = 1.f; )IR", oss.str()); } diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index d3e91784fb56..56535de914e4 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -350,5 +351,52 @@ TEST(TEFuserPass, FuserPass_WhereList) { testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g); } +TEST(TEFuserPass, DynamicShapeFusion) { + WithCPUFuser cf; + const auto graph_string = R"IR( + graph(%0 : Float(10, 5, strides=[5, 1], device=cpu), + %1 : Float(10, 5, strides=[5, 1], device=cpu)): + %2 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%0, %1) + %3 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%2, %1) + return (%3))IR"; + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + + g->lint(); + FuseTensorExprs( + g, + /* min_group_size = */ 2, + /* add_composed_op = */ true, + /* fuse_to_dynamic_shapes = */ true); + Code code(g, ""); + + testing::FileCheck() + .check("prim::TensorExprDynamicGroup_") + ->check("prim::TensorExprDynamicGuard") + ->check("prim::TensorExprGroup_") + ->run(*g); + + auto run_and_compare = [&](const std::vector& inputs) { + TORCH_INTERNAL_ASSERT(inputs.size() == 2); + + auto ref = at::mul(at::mul(inputs[0], inputs[1]), inputs[1]); + + InterpreterState interp(code); + Stack stack(inputs.begin(), inputs.end()); + interp.run(stack); + at::Tensor out = pop(stack).toTensor(); + ASSERT_TRUE(at::allclose(out, ref)); + }; + + std::vector inputs = {at::rand({10, 5}), at::rand({10, 5})}; + run_and_compare(inputs); + + std::vector inputs2 = {at::rand({20, 5}), at::rand({20, 5})}; + run_and_compare(inputs2); + + std::vector inputs3 = {at::rand({25, 60}), at::rand({25, 60})}; + run_and_compare(inputs3); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_type_specializations.cpp b/test/cpp/tensorexpr/test_type_specializations.cpp new file mode 100644 index 000000000000..5d2e9462e4aa --- /dev/null +++ b/test/cpp/tensorexpr/test_type_specializations.cpp @@ -0,0 +1,75 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +// Test that tensor type specializations are availabie in +// the custom passes + +namespace torch { +namespace jit { + +namespace { + +bool hasTensorTypeSpecializations(torch::jit::Block* block) { + for (Value* v : block->inputs()) { + if (hasTensorTypeSpecialization(v)) + return true; + } + for (Node* n : block->nodes()) { + for (torch::jit::Block* b : n->blocks()) { + if (hasTensorTypeSpecializations(b)) + return true; + } + for (Value* v : n->outputs()) { + if (hasTensorTypeSpecialization(v)) + return true; + } + } + return false; +} + +static bool hasSpecializations = false; +void detectTTSpecializationPass(std::shared_ptr& graph) { + GRAPH_DUMP("In detectTTSpecialization Custom Post Pass: ", graph); + hasSpecializations = 
hasTensorTypeSpecializations(graph->block()); +} + +} // namespace + +TEST(SpecializationsInCustomPasses, Basic) { + RegisterPass p(detectTTSpecializationPass); + hasSpecializations = false; + std::shared_ptr graph = std::make_shared(); + parseIR( + R"IR( +graph(%a.1 : Tensor, + %b.1 : Tensor): + %c.1 : Tensor = aten::mul(%a.1, %b.1) # misc/test_specializations.py:5:8 + %d.1 : Tensor = aten::mul(%c.1, %b.1) # misc/test_specializations.py:6:8 + return (%d.1) + )IR", + &*graph); + + IValue ival = IValue(torch::randn({22}, at::kCPU)); + std::vector stack = {ival, ival}; + auto run = [&](std::shared_ptr& graph, std::vector stack) { + GraphExecutor executor(graph, ""); + executor.run(stack); + return stack; + }; + run(graph, stack); + + // Priofiling mode will not be run with simple executor + if (!getExecutorMode()) { + EXPECT_TRUE(hasSpecializations); + } +} + +} // namespace jit +} // namespace torch diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index b89fcc3396df..e34d980cf708 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -186,10 +186,10 @@ int main(int argc, char* argv[]) { // structure is simply a pair of a buffer that was created to represent the // result of the computation (BufPtr) and a statement representing the // computation itself (StmtPtr). - Tensor C = Compute( - "C", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { return i * j; }); + Tensor C = + Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { + return i * j; + }); std::cout << "Stmt produced by 'Compute' API: " << std::endl << *C.stmt() << std::endl; // Prints: @@ -209,7 +209,7 @@ int main(int argc, char* argv[]) { {}, Sum(), [&](const VarHandle& i, const VarHandle& j) { return C.load(i, j); }, - {{64, "i"}, {32, "j"}}); + {64, 32}); std::cout << "Stmt produced by 'Reduce' API: " << std::endl << *D.stmt() << std::endl; } @@ -223,15 +223,13 @@ int main(int argc, char* argv[]) { // Let's look at a couple of transformations that are used in NNC. We will // begin with constructing a Block statement like we did before. 
- Tensor C = Compute( - "C", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { return i * (j + 1); }); + Tensor C = + Compute("C", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { + return i * (j + 1); + }); BufHandle c_buf(C.buf()); - Tensor D = Compute( - "D", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + Tensor D = + Compute("D", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { return c_buf.load(i, j) - i; }); StmtPtr block = Block::make({C.stmt(), D.stmt()}); @@ -353,10 +351,8 @@ int main(int argc, char* argv[]) { // Let's start by constructing a simple computation for us to work with: BufHandle A("A", {64, 32}, kInt); BufHandle B("B", {64, 32}, kInt); - Tensor X = Compute( - "X", - {{64, "i"}, {32, "j"}}, - [&](const VarHandle& i, const VarHandle& j) { + Tensor X = + Compute("X", {64, 32}, [&](const VarHandle& i, const VarHandle& j) { return A.load(i, j) + B.load(i, j); }); diff --git a/test/cpp_extensions/setup.py b/test/cpp_extensions/setup.py index 3b25f1e60bb9..df5339417304 100644 --- a/test/cpp_extensions/setup.py +++ b/test/cpp_extensions/setup.py @@ -51,15 +51,18 @@ # todo(mkozuki): Figure out the root cause if (not IS_WINDOWS) and torch.cuda.is_available() and CUDA_HOME is not None: + # malfet: One shoudl not assume that PyTorch re-exports CUDA dependencies cublas_extension = CUDAExtension( name='torch_test_cpp_extension.cublas_extension', - sources=['cublas_extension.cpp'] + sources=['cublas_extension.cpp'], + libraries=['cublas'] if torch.version.hip is None else [], ) ext_modules.append(cublas_extension) cusolver_extension = CUDAExtension( name='torch_test_cpp_extension.cusolver_extension', - sources=['cusolver_extension.cpp'] + sources=['cusolver_extension.cpp'], + libraries=['cusolver'] if torch.version.hip is None else [], ) ext_modules.append(cusolver_extension) diff --git a/test/create_dummy_torchscript_model.py b/test/create_dummy_torchscript_model.py new file mode 100644 index 000000000000..ffd869e27f0b --- /dev/null +++ b/test/create_dummy_torchscript_model.py @@ -0,0 +1,28 @@ +# Usage: python create_dummy_model.py +import sys +import torch +from torch import nn + + +class NeuralNetwork(nn.Module): + + def __init__(self): + super(NeuralNetwork, self).__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28 * 28, 512), + nn.ReLU(), + nn.Linear(512, 512), + nn.ReLU(), + nn.Linear(512, 10), + ) + + def forward(self, x): + x = self.flatten(x) + logits = self.linear_relu_stack(x) + return logits + + +if __name__ == '__main__': + jit_module = torch.jit.script(NeuralNetwork()) + torch.jit.save(jit_module, sys.argv[1]) diff --git a/test/custom_backend/CMakeLists.txt b/test/custom_backend/CMakeLists.txt index 96322e397d63..71f83442e085 100644 --- a/test/custom_backend/CMakeLists.txt +++ b/test/custom_backend/CMakeLists.txt @@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(custom_backend) +if(USE_ROCM) +include(utils) +include(LoadHIP) +endif() find_package(Torch REQUIRED) add_library(custom_backend SHARED custom_backend.cpp) diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt index 883424e36da9..47c1c9d45e81 100644 --- a/test/custom_operator/CMakeLists.txt +++ b/test/custom_operator/CMakeLists.txt @@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(custom_ops) +if(USE_ROCM) +include(utils) +include(LoadHIP) +endif() find_package(Torch REQUIRED) add_library(custom_ops SHARED op.cpp) diff 
--git a/test/distributed/_shard/checkpoint/test_checkpoint.py b/test/distributed/_shard/checkpoint/test_checkpoint.py new file mode 100644 index 000000000000..4816b0c38b34 --- /dev/null +++ b/test/distributed/_shard/checkpoint/test_checkpoint.py @@ -0,0 +1,517 @@ +# Owner(s): ["oncall: distributed"] + +import random +import sys +from typing import Optional, List, Union +from torch.distributed._shard.checkpoint import ( + StorageReader, + StorageWriter, + CheckpointException, + load_state_dict, + save_state_dict, +) + +import torch +import torch.distributed as dist +import torch.nn +import torch.futures +from torch.futures import Future +from torch.testing._internal.common_utils import TestCase + +from torch.distributed._shard.checkpoint.resharding import ( + _prepare_sharded_tensor_write, + _create_storage_key +) + +from torch.distributed._shard import sharded_tensor +from torch.distributed._shard.checkpoint.state_dict_loader import ( + validate_metadata, +) + +from torch.distributed._shard.checkpoint.state_dict_saver import ( + _prepare, +) + +from torch.distributed._shard.checkpoint.metadata import ( + Metadata, + BytesReadRequest, + BytesWriteRequest, + TensorReadRequest, + TensorWriteRequest, +) + +from torch.distributed._shard.sharded_tensor import ( + state_dict_hook, + ShardedTensor, +) +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) + +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + + +class TestModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.sharded: ShardedTensor = sharded_tensor.zeros(self.spec(), 4, 4) + self.regular = torch.nn.Parameter(torch.ones(4, 4)) + self.extra_sharded: Optional[ShardedTensor] = None + self.extra_param: Optional[torch.nn.Parameter] = None + self._register_state_dict_hook(state_dict_hook) + + def spec(self) -> ChunkShardingSpec: + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. 
+ return ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + +class TestDistributedCheckpointing(ShardedTensorTestBase): + @property + def world_size(self) -> int: + return 2 + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_validate_metadata(self) -> None: + module = TestModule() + + metadata, _, _ = _prepare(module.state_dict(), True) + self.assertTrue( + "regular" in metadata.state_dict_metadata, + f"keys: {metadata.state_dict_metadata.keys()}", + ) + + module = TestModule() + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.extra_param = torch.nn.Parameter(torch.zeros(2, 2)) + with self.assertRaisesRegex(ValueError, "Could not find Tensor metadata"): + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.regular = torch.nn.Parameter(torch.zeros(2, 4)) + + with self.assertRaisesRegex(ValueError, "Incompatible tensor size"): + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.extra_sharded = sharded_tensor.zeros(module.spec(), 4, 2) + with self.assertRaisesRegex(ValueError, "Could not find ShardedTensor metadata"): + validate_metadata(module.state_dict(), metadata) + + module = TestModule() + module.sharded = sharded_tensor.zeros(module.spec(), 4, 2) + with self.assertRaisesRegex(ValueError, "Incompatible ShardedTensor size"): + validate_metadata(module.state_dict(), metadata) + + def gen_metadata(self) -> Metadata: + module = TestModule() + # compute the default saved metadata (must pass include_non_replicated_tensors or we'll get incomplete MD) + metadata, _, _ = _prepare(module.state_dict(), True) + + # _prepare only produc + metadata = [metadata] + dist.broadcast_object_list(metadata) + + return metadata[0] + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_checkpoint_has_shard_too_small(self) -> None: + metadata = self.gen_metadata() + + # we make the first stored shard smaller + self.assertTrue( + ".sharded" in metadata.state_dict_metadata, + f"keys: {metadata.state_dict_metadata.keys()}", + ) + + sizes = ( + metadata.state_dict_metadata[".sharded"] + .storage_metadata[0] + .shard_metadata.shard_sizes + ) + for i in range(len(sizes)): + sizes[i] = 1 + + module = TestModule() + with self.assertRaisesRegex(ValueError, "only has 1 available"): + validate_metadata(module.state_dict(), metadata) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_checkpoint_has_shard_overlap(self) -> None: + metadata = self.gen_metadata() + + # we make the first stored shard smaller + self.assertTrue( + ".sharded" in metadata.state_dict_metadata, + f"keys: {metadata.state_dict_metadata.keys()}", + ) + + sizes = ( + metadata.state_dict_metadata[".sharded"] + .storage_metadata[0] + .shard_metadata.shard_sizes + ) + for i in range(len(sizes)): + sizes[i] += 1 + + module = TestModule() + with self.assertRaisesRegex(ValueError, "overlap"): + validate_metadata(module.state_dict(), metadata) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_checkpoint_has_storage_type_mismatch(self) -> None: + module = TestModule() + + metadata = self.gen_metadata() + regular = metadata.state_dict_metadata["regular"] + metadata.state_dict_metadata[".sharded"] = regular + with self.assertRaisesRegex(ValueError, "ShardedTensorStorageMetadata but found"): + validate_metadata(module.state_dict(), metadata) + + metadata = self.gen_metadata() + sharded = 
metadata.state_dict_metadata[".sharded"] + metadata.state_dict_metadata["regular"] = sharded + with self.assertRaisesRegex(ValueError, "TensorStorageMetadata but found"): + validate_metadata(module.state_dict(), metadata) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_tensor_metadata_with_missing_rank_spec(self) -> None: + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:1/cuda:1", + ], + ) + + st = sharded_tensor.zeros(spec, 4, 4, dtype=torch.float64) + mapping = dict() + + (_, md) = _prepare_sharded_tensor_write(st, "tensor", mapping) + + self.assertEqual(1, len(md.storage_metadata)) + self.assertEqual(4 * 4 * 8, md.storage_metadata[0].length) + self.assertEqual(1, len(mapping)) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_storage_key_mapping(self) -> None: + device = f"cuda:{dist.get_rank()}" + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + state_dict = { + 'sharded': sharded_tensor.rand(spec, (10, 10, )), + 'replicated': torch.rand(4, device=device), + 'bytes': [1, 2, 3, 4], + } + + metadata, bytes_reqs, tensor_reqs = _prepare(state_dict, write_replicated_data=self.rank == 0) + + if self.rank == 0: + self.assertEqual(1, len(bytes_reqs)) + self.assertEqual(2, len(tensor_reqs)) + + self.assertTrue('bytes' in metadata.state_dict_metadata) + self.assertEqual(bytes_reqs[0].storage_key, metadata.state_dict_metadata['bytes'].storage_key) + + # tensor ordering is unspecified + if len(tensor_reqs[0].tensor.size()) == 1: + replicated = tensor_reqs[0] + shard = tensor_reqs[1] + else: + replicated = tensor_reqs[1] + shard = tensor_reqs[0] + + self.assertTrue('replicated' in metadata.state_dict_metadata) + self.assertEqual(replicated.storage_key, metadata.state_dict_metadata['replicated'].storage_key) + else: + self.assertEqual(0, len(bytes_reqs)) + self.assertEqual(1, len(tensor_reqs)) + shard = tensor_reqs[0] + + self.assertTrue('sharded' in metadata.state_dict_metadata) + shard_keys = [sm.storage_key for sm in metadata.state_dict_metadata['sharded'].storage_metadata] + self.assertTrue(shard.storage_key in shard_keys) + +class TestStorageKeys(TestCase): + def test_create_key_handles_collision(self): + keys = dict() + key0 = _create_storage_key(keys, "foo") + key1 = _create_storage_key(keys, "foo") + self.assertNotEqual(key0, key1) + + + + +class TestStorageBase: + def __init__( + self, + fail_conf + ): + self.fail_conf = fail_conf + self.rank = 0 if not dist.is_initialized() else dist.get_rank() + + def _get_ranks(self, name): + return self.fail_conf[name] if name in self.fail_conf else None + + def _fail_rank(self, name): + ranks = self._get_ranks(name) + if ranks is not None and self.rank in ranks: + raise ValueError(f"rank fail {self.rank} for {name}") + + def _fail_rank_async(self, name): + ranks = self._get_ranks(name) + fut = Future() + if ranks is not None and self.rank in ranks: + fut.set_exception(ValueError(f"async rank fail {self.rank} for {name}")) + else: + fut.set_result(None) + return fut + + +class FaultyStorageWriter(TestStorageBase, StorageWriter): + def __init__( + self, + fail_conf + ): + super(FaultyStorageWriter, self).__init__(fail_conf) + + def prepare(self) -> None: + self._fail_rank("fail_prepare") + + def write_bytes(self, requests: List[BytesWriteRequest]) -> Future[None]: + self._fail_rank("fail_write_bytes_on_ranks") + return self._fail_rank_async("fail_write_bytes_on_ranks_async") + + def write_tensors(self, requests: 
List[TensorWriteRequest]) -> Future[None]: + self._fail_rank("fail_write_tensors_on_ranks") + return self._fail_rank_async("fail_write_tensors_on_ranks_async") + + def finish(self, metadata: Metadata) -> None: + self._fail_rank("fail_finish") + + def prepare_storage(self, storage_writes: List[Union[TensorWriteRequest, BytesWriteRequest]]) -> None: + self._fail_rank("fail_prepare_storage") + +class FaultyStorageReader(TestStorageBase, StorageReader): + def __init__( + self, + metadata, + fail_conf + ): + super(FaultyStorageReader, self).__init__(fail_conf) + self.metadata = metadata + + def read_bytes(self, requests: List[BytesReadRequest]) -> Future[None]: + self._fail_rank("fail_read_bytes") + bad_ranks = self._get_ranks("fail_deser_bytes") + for r in requests: + if bad_ranks is not None and self.rank in bad_ranks: + # this is not "guaranteed" to fail, but hard to beat + rand = random.Random(1237) + r.bytes.write(rand.randbytes(32)) + else: + torch.save([1, 2, 3], r.bytes) + + return self._fail_rank_async("fail_read_bytes_async") + + def read_tensors(self, requests: List[TensorReadRequest]) -> Future[None]: + self._fail_rank("fail_read_tensors") + return self._fail_rank_async("fail_read_tensors_async") + + def read_metadata(self) -> Metadata: + self._fail_rank("fail_read_metadata") + return self.metadata + +class TestDistributedFailure(ShardedTensorTestBase): + def get_spec(self): + return ChunkShardingSpec( + dim=0, + placements=[ + f"rank:{r}/cuda:{r}" for r in range(dist.get_world_size()) + ] + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_dummy_writer_works(self) -> None: + state_dict = { + 'sharded': sharded_tensor.rand(self.get_spec(), 20, 20), + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + save_state_dict(state_dict, FaultyStorageWriter({})) + + + def _test_dist_failure(self, callback, kwargs): + bad_ranks = list(kwargs.values())[0] if len(kwargs) > 0 else [] + + # Empty bad_ranks means it must work + if len(bad_ranks) == 0: + callback() + else: + with self.assertRaises(CheckpointException) as cm: + callback() + e = cm.exception + for rank, ex in e.failures.items(): + self.assertTrue(rank in bad_ranks, msg=f"{rank} did not fail") + if not kwargs.get("ignore_exception_type", False): + self.assertEqual(ValueError, type(ex), str(ex)) + + failed_ranks = e.failures.keys() + for rank in bad_ranks: + self.assertTrue(rank in failed_ranks, msg=f"{rank} was supposed to fail was fine") + + + def _test_save(self, state_dict, coordinator=0, **kwargs): + no_dist = not dist.is_initialized() + + def _save(): + save_state_dict( + state_dict, + storage_writer=FaultyStorageWriter(kwargs), + coordinator_rank=coordinator, + no_dist=no_dist, + ) + self._test_dist_failure(_save, kwargs) + + def _test_load(self, state_dict, coordinator=0, **kwargs): + no_dist = not dist.is_initialized() + write_replicated = dist.is_initialized() and dist.get_rank() == coordinator + + def _load(): + metadata, _, _ = _prepare(state_dict, write_replicated) + load_state_dict( + state_dict, + storage_reader=FaultyStorageReader(metadata, kwargs), + coordinator_rank=coordinator, + no_dist=no_dist, + ) + + self._test_dist_failure(_load, kwargs) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_save_error_handling(self) -> None: + state_dict = { + 'sharded': sharded_tensor.rand(self.get_spec(), 20, 20), + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + self._test_save(state_dict, fail_prepare=[0]) + 
self._test_save(state_dict, fail_finish=[0]) + + self._test_save(state_dict, fail_prepare_storage=[0]) + self._test_save(state_dict, fail_write_tensors_on_ranks=[1]) + self._test_save(state_dict, fail_write_tensors_on_ranks_async=[2]) + self._test_save(state_dict, fail_write_bytes_on_ranks=[3]) + self._test_save(state_dict, fail_write_bytes_on_ranks_async=[1]) + + self._test_save(state_dict, fail_write_tensors_on_ranks_async=[1, 3]) + + self._test_save(state_dict, coordinator=1, fail_prepare=[1]) + self._test_save(state_dict, coordinator=1, fail_finish=[1]) + + + def test_save_error_handling_no_dist(self) -> None: + state_dict = { + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + self.assertFalse(dist.is_initialized()) + + self._test_save(state_dict, fail_prepare=[0]) + self._test_save(state_dict, fail_finish=[0]) + + self._test_save(state_dict, fail_prepare_storage=[0]) + self._test_save(state_dict, fail_write_tensors_on_ranks=[0]) + self._test_save(state_dict, fail_write_tensors_on_ranks_async=[0]) + self._test_save(state_dict, fail_write_bytes_on_ranks=[0]) + self._test_save(state_dict, fail_write_bytes_on_ranks_async=[0]) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_load_error_handling(self) -> None: + state_dict = { + 'sharded': sharded_tensor.rand(self.get_spec(), 20, 20), + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + + self._test_load(state_dict) + self._test_load(state_dict, fail_read_metadata=[0]) + self._test_load(state_dict, fail_read_bytes=[1]) + self._test_load(state_dict, fail_read_bytes_async=[2]) + self._test_load(state_dict, fail_read_tensors=[3]) + self._test_load(state_dict, fail_read_tensors_async=[1]) + # We don't want to depend on the actual exception raised by pickle + self._test_load(state_dict, fail_deser_bytes=[2], ignore_exception_type=True) + + self._test_load(state_dict, coordinator=1, fail_read_metadata=[3]) + self._test_load(state_dict, coordinator=2, fail_read_bytes=[0]) + self._test_load(state_dict, coordinator=3, fail_read_tensors_async=[2]) + + + def test_load_error_handling_no_dist(self) -> None: + state_dict = { + 'replicated': torch.rand(10, 10), + 'bytes': [1, 2, 3, 4] + } + self._test_load(state_dict) + self._test_load(state_dict, fail_read_metadata=[0]) + self._test_load(state_dict, fail_read_bytes=[0]) + self._test_load(state_dict, fail_read_bytes_async=[0]) + self._test_load(state_dict, fail_read_tensors=[0]) + self._test_load(state_dict, fail_read_tensors_async=[0]) + self._test_load(state_dict, fail_deser_bytes=[0], ignore_exception_type=True) +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py b/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py new file mode 100644 index 000000000000..ca0c121b3638 --- /dev/null +++ b/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py @@ -0,0 +1,466 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import os +import shutil +import tempfile +from typing import Dict, cast + +import torch +import torch.distributed as dist +from torch import Tensor +from torch.distributed._shard import sharded_tensor +from torch.distributed._shard.sharded_tensor import ShardedTensor, state_dict_hook +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, + EnumerableShardingSpec, + ShardingSpec, + ShardMetadata, +) +from torch.testing._internal.common_distributed import requires_nccl, skip_if_lt_x_gpu +from 
torch.testing._internal.common_utils import TestCase +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + MyShardedModel1 +) + + +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +from torch.distributed._shard.checkpoint import ( + FileSystemReader, + FileSystemWriter, + load_state_dict, + save_state_dict, +) + + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +def _sharded_tensor_gather( + self, + dst=0, + out=None, +): + """ + This is a reimplementation of ST:gather using gather instead of gather_object. + The later hangs on CI inside NCCL. + """ + + def shard_size(shard_md): + res = 1 + for s in shard_md.shard_sizes: + res *= s + return res + rank = dist.get_rank(self._process_group) + full_size = self.metadata().size + + world_size = dist.get_world_size(self._process_group) + rank_sizes = [0 for _ in range(world_size)] + max_rank_size = 0 + shard_placement = dict() + local_shards_placement = [] + # collect sizes + for shard_idx, shard_md in enumerate(self.metadata().shards_metadata): + shard_rank = shard_md.placement.rank() + shard_placement[shard_idx] = (shard_rank, rank_sizes[shard_rank]) + if shard_rank == rank: + local_shards_placement.append((shard_md, rank_sizes[shard_rank],)) + + rank_sizes[shard_rank] += shard_size(shard_md) + max_rank_size = max(max_rank_size, rank_sizes[shard_rank]) + + + if rank == dst: + gather_list = [torch.empty((max_rank_size,), device=out.device) for _ in range(world_size)] + else: + gather_list = None + + # FIXME is a rank allowed to not have any data? 
+ with torch.no_grad(): + # XXX we can fastpath this to torch.cat if max_rank_size == rank_sizes[rank] + data = torch.empty(max_rank_size, device=self.local_shards()[0].tensor.device) + for shard in self.local_shards(): + for placement in local_shards_placement: + if placement[0] == shard.metadata: + src = shard.tensor.flatten() + data[placement[1]: placement[1] + src.numel()].copy_(src) + break + + dist.gather( + tensor=data, + gather_list=gather_list, + dst=dst, + group=self._process_group, + ) + if rank != dst: + return + if out is None: + raise ValueError("`out` Tensor must be provided on dst rank!") + + full_size = self.metadata().size + dims = len(full_size) + + + for shard_idx, shard_md in enumerate(self.metadata().shards_metadata): + placement = shard_placement[shard_idx] + tensor = gather_list[placement[0]] + tensor = tensor[placement[1] : placement[1] + shard_size(shard_md)] + tensor = tensor.view(shard_md.shard_sizes) + + out_narrow_view = out + for dim in range(dims): + out_narrow_view = out_narrow_view.narrow( + dim, + shard_md.shard_offsets[dim], + shard_md.shard_sizes[dim], + ) + + out_narrow_view.copy_(tensor) + + +def assert_state_dict_equal( + self: TestCase, + state_dict_1: Dict[str, torch.Tensor], + state_dict_2: Dict[str, torch.Tensor], +) -> bool: + self.assertEqual( + len(state_dict_1), len(state_dict_2), "state_dict must be the same size" + ) + self.assertEqual( + set(state_dict_1.keys()), + set(state_dict_2.keys()), + "state_dict keys do not match", + ) + + for key, value_1 in state_dict_1.items(): + value_2 = state_dict_2[key] + if isinstance(value_1, torch.Tensor): + self.assertTrue( + torch.equal(value_1, value_2), f"Key {key}'s tensor does not match" + ) + elif isinstance(value_1, ShardedTensor): + for local_shard_1, local_shard_2 in zip( + value_1.local_shards(), value_2.local_shards() + ): + self.assertTrue( + torch.equal(local_shard_1.tensor, local_shard_1.tensor), + f"Key {key}'s shard does not match", + ) + + return True + + +class MyTestModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear_1 = torch.nn.Linear(5, 5) + self.linear_2 = torch.nn.Linear(5, 1) + self.emb = torch.nn.EmbeddingBag(5, 10) + + +# The ShardedModels are borrowed from test/distributed/_sharded_tensor/test_sharded_tensor.py +class MyShardedModel3(torch.nn.Module): + def __init__( + self, + spec: ShardingSpec, + ) -> None: + super(MyShardedModel3, self).__init__() + self.sharded_tensor: ShardedTensor = sharded_tensor.rand( + spec, 10, 20, init_rrefs=False + ) + + +class TestDistributedStateDictSaveLoad(TestCase): + def test_read_write_only_tensor(self) -> None: + with tempfile.TemporaryDirectory() as path: + state_dict_to_save = MyTestModule().state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer, no_dist=True) + + state_dict_to_load_to = MyTestModule().state_dict() + + with self.assertRaises(AssertionError): + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + + # Load from file without any resharding + fs_reader = FileSystemReader(path=path) + load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader, no_dist=True) + + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + + +class TestDistributedStateDictSaveLoadWithSharedTensor(ShardedTensorTestBase): + @property + def world_size(self) -> int: + return 2 + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def 
test_read_write_shard_tensor(self) -> None: + paths = [tempfile.mkdtemp()] + dist.broadcast_object_list(paths) + + path = paths[0] + + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + model_to_save = MyShardedModel1(spec, init_rrefs=False) + + # Test save + model_to_save._register_state_dict_hook(state_dict_hook) + state_dict_to_save = model_to_save.state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + dist.barrier() + + # Create a new model + model_to_load = MyShardedModel1(spec, init_rrefs=False) + # This is not the correct hook for loading the state dict + # model_to_load._register_load_state_dict_pre_hook(pre_load_state_dict_hook, True) + model_to_load._register_state_dict_hook(state_dict_hook) + state_dict_to_load_to = model_to_load.state_dict() + + dist.barrier() + + with self.assertRaises(AssertionError): + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + + # Test load. + fs_reader = FileSystemReader(path=path) + load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader) + + assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save) + dist.barrier() + + +class TestDistributedReshardOnLoad(ShardedTensorTestBase): + @property + def world_size(self) -> int: + return 2 + + def get_file_path(self) -> str: + paths = [tempfile.mkdtemp()] if dist.get_rank() == 0 else [None] + dist.broadcast_object_list(paths) + return paths[0] + + def load_tensor(self, tensor: ShardedTensor) -> torch.Tensor: + res = torch.zeros(tensor.shape, device="cuda:0") if dist.get_rank() == 0 else None + _sharded_tensor_gather(tensor, out=res) + return cast(Tensor, res) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_load_with_different_shard_plan(self) -> None: + path = self.get_file_path() + + # We hardcode the assumption of how many shards are around + self.assertEqual(self.world_size, dist.get_world_size()) + + specs = [ + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. + ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ), + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. 
+ ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:1/cuda:1", + "rank:0/cuda:0", + ], + ), + # This requires the tensors to be [10, 20] + EnumerableShardingSpec( + shards=[ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[2, 20], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[2, 0], + shard_sizes=[1, 20], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[3, 0], + shard_sizes=[3, 20], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[6, 0], + shard_sizes=[3, 20], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[9, 0], + shard_sizes=[1, 20], + placement="rank:0/cuda:0", + ), + ] + ), + # This requires the tensors to be [10, 20] + EnumerableShardingSpec( + shards=[ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[8, 20], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[8, 0], + shard_sizes=[2, 20], + placement="rank:0/cuda:0", + ), + ] + ), + ] + + for s0 in specs: + for s1 in specs: + if s0 == s1: + continue + + if dist.get_rank() == 0: + shutil.rmtree(path, ignore_errors=True) + os.makedirs(path) + dist.barrier() + + model_to_save = MyShardedModel3(s0) + model_to_save._register_state_dict_hook(state_dict_hook) + state_dict_to_save = model_to_save.state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + dist.barrier() + + model_to_load = MyShardedModel3(s1) + model_to_load._register_state_dict_hook(state_dict_hook) + state_dict_to_load_to = model_to_load.state_dict() + dist.barrier() + + fs_reader = FileSystemReader(path=path) + load_state_dict( + state_dict=state_dict_to_load_to, storage_reader=fs_reader + ) + + dist.barrier() + store_tensor = self.load_tensor(model_to_save.sharded_tensor) + dist.barrier() + load_tensor = self.load_tensor(model_to_load.sharded_tensor) + + if dist.get_rank() == 0: + self.assertTrue( + torch.allclose(store_tensor, load_tensor), msg=f"{s0} vs {s1}" + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_load_rowwise_to_colwise(self) -> None: + path = self.get_file_path() + self.assertEqual(self.world_size, dist.get_world_size()) + + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. + src_spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + # pyre-fixme [28]: Unexpected keyword argument `dim` to call `dist._sharding_spec.api.ChunkShardingSpec.__init__`. 
+ dst_spec = ChunkShardingSpec( + dim=1, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + + if dist.get_rank() == 0: + shutil.rmtree(path, ignore_errors=True) + os.makedirs(path) + + model_to_save = MyShardedModel3(src_spec).cuda(dist.get_rank()) + model_to_save._register_state_dict_hook(state_dict_hook) + state_dict_to_save = model_to_save.state_dict() + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + model_to_load = MyShardedModel3(dst_spec).cuda(dist.get_rank()) + model_to_load._register_state_dict_hook(state_dict_hook) + state_dict_to_load_to = model_to_load.state_dict() + + fs_reader = FileSystemReader(path=path) + + load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader) + + # We can't use torch.allclose since each ST has a different sharding spec + store_tensor = self.load_tensor(model_to_save.sharded_tensor) + load_tensor = self.load_tensor(model_to_load.sharded_tensor) + + if dist.get_rank() == 0: + self.assertTrue(torch.allclose(store_tensor, load_tensor)) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + def test_save_load_bytes(self) -> None: + path = self.get_file_path() + + state_dict_to_save = { + 'bytes0': [1], + 'bytes1': 'string' + } + + fs_writer = FileSystemWriter(path=path) + save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer) + + state_dict_to_load = { + 'bytes0': [2], + 'bytes1': 'other' + } + + fs_reader = FileSystemReader(path=path) + load_state_dict(state_dict=state_dict_to_load, storage_reader=fs_reader) + + self.assertEqual([1], state_dict_to_load['bytes0']) + self.assertEqual('string', state_dict_to_load['bytes1']) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py index 085c928985eb..d3f1468aea3c 100644 --- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py +++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py @@ -2,7 +2,10 @@ import torch import torch.optim as optim -import torch.distributed._shard.sharded_tensor +from torch.distributed._shard import ( + sharded_tensor, + shard_parameter +) from copy import deepcopy from torch.distributed._shard.sharding_spec import ( @@ -77,8 +80,8 @@ def shard_parameter(self): ], ) - sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec) - sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec) + shard_parameter(self.linear1, "weight", rowwise_sharding_spec) + shard_parameter(self.linear2, "weight", colwise_sharding_spec) def forward(self, inp): return self.linear2(self.gelu(self.linear1(inp))) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py index c20727169523..33fc49f81c0f 100644 --- a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py +++ b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py @@ -106,11 +106,17 @@ def _test_common_failures(self, cmp_op): pg = dist.new_group([1, 0, 3, 2]) st1, st2 = self.get_random_tensors(spec, spec, 10, 10, pg2=pg) - self.assertFalse(cmp_op(st1, st2)) + with self.assertRaisesRegex( + RuntimeError, "All distributed tensors should use the same ProcessGroup" + ): + cmp_op(st1, st2) pg = dist.new_group([0, 1, 2, 3]) st1, st2 = self.get_random_tensors(spec, spec, 10, 10, pg2=pg) - self.assertFalse(cmp_op(st1, st2)) + with 
self.assertRaisesRegex( + RuntimeError, "All distributed tensors should use the same ProcessGroup" + ): + cmp_op(st1, st2) @with_comms @skip_if_lt_x_gpu(4) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_chunk.py b/test/distributed/_shard/sharded_tensor/ops/test_chunk.py new file mode 100644 index 000000000000..f0dcd4d7aad8 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_chunk.py @@ -0,0 +1,90 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +from torch.distributed._shard import sharded_tensor, _shard_tensor +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + generate_chunk_sharding_specs_for_test, + generate_enumerable_sharding_specs_for_test, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestShardedTensorChunkOps(ShardedTensorTestBase): + def _compare_chunk_result(self, chunked_list, chunked_st_list): + self.assertEqual(len(chunked_list), len(chunked_st_list)) + for idx, chunked_st in enumerate(chunked_st_list): + tensor = chunked_list[idx] + st = _shard_tensor(tensor.contiguous(), chunked_st.sharding_spec()) + # _shard_tensor generate sharded tensor with metadata ranked by # of rank. + st._metadata.shards_metadata.sort( + key=lambda x: x.shard_offsets[chunked_st.sharding_spec().dim], + ) + self.assertTrue(torch.allclose(chunked_st, st)) + + def _run_sharded_chunk_test(self, local_tensor_size, shard_spec, chunk_num): + torch.manual_seed(0) + local_tensor = torch.rand(*local_tensor_size).cuda(self.rank) + st_tensor = _shard_tensor(local_tensor.clone().detach(), shard_spec) + local_tensor_chunked = torch.chunk(local_tensor, chunk_num, dim=-1) + chunked_st = torch.chunk(st_tensor, chunk_num, dim=-1) + self._compare_chunk_result(local_tensor_chunked, chunked_st) + chunked_st = st_tensor.chunk(chunk_num, dim=-1) + self._compare_chunk_result(local_tensor_chunked, chunked_st) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_chunk(self): + sharding_dims = [0] + specs = [] + for dim in sharding_dims: + specs.extend(generate_chunk_sharding_specs_for_test(dim)) + for spec in specs: + self._run_sharded_chunk_test([17, 14], spec, 3) + self._run_sharded_chunk_test([17, 15, 20], spec, 5) + self._run_sharded_chunk_test([17, 16], spec, 2) + # Large matrix case. + self._run_sharded_chunk_test([128, 512], spec, 8) + self._run_sharded_chunk_test([1024, 2048], spec, 4) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_chunk_error(self): + chunk_spec = generate_chunk_sharding_specs_for_test(-1) + with self.assertRaisesRegex( + NotImplementedError, "Chunk by sharding dim is not supported." + ): + st = sharded_tensor.rand(chunk_spec[0], [17, 24]) + torch.chunk(st, 5, dim=-1) + enumerable_spec = generate_enumerable_sharding_specs_for_test() + with self.assertRaisesRegex( + NotImplementedError, "Only ChunkShardingSpec is supported for chunk." 
+ ): + st = sharded_tensor.rand(enumerable_spec[0], [10, 10]) + torch.chunk(st, 5, dim=-1) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py index 50f880b55b3a..382af65ab0f5 100644 --- a/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py +++ b/test/distributed/_shard/sharded_tensor/ops/test_elementwise_ops.py @@ -30,14 +30,18 @@ class TestShardedTensorElementWiseOps(ShardedTensorTestBase): - def _run_sharded_elementwise_ops(self, spec, input_size, op): + def _run_sharded_elementwise_ops( + self, spec, input_size, op, reset_seed=None, **kwargs + ): torch.manual_seed(self.rank) st = sharded_tensor.rand(spec, *input_size) - new_st = op(st) + reset_seed() if reset_seed else None + new_st = op(st, **kwargs) local_shard = st.local_tensor() new_st_local_shard = new_st.local_tensor() + reset_seed() if reset_seed else None self.assertEqual( - op(local_shard), + op(local_shard, **kwargs), new_st_local_shard, ) @@ -67,6 +71,37 @@ def test_sharded_relu(self): self._run_sharded_elementwise_ops(spec, [17, 23], torch.nn.functional.relu) self._run_sharded_elementwise_ops(spec, [14, 15], torch.nn.functional.relu) + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_dropout(self): + def _reset_random_seed(): + torch.manual_seed(self.rank + 4) + + specs = generate_chunk_sharding_specs_for_test( + 0 + ) + generate_chunk_sharding_specs_for_test(1) + for spec in specs: + self._run_sharded_elementwise_ops( + spec, + [12, 17], + torch.nn.functional.dropout, + p=0.4, + reset_seed=_reset_random_seed, + ) + self._run_sharded_elementwise_ops( + spec, + [18, 21], + torch.nn.functional.dropout, + p=0.5, + reset_seed=_reset_random_seed, + ) + _reset_random_seed() + dropout = torch.nn.Dropout(p=0.8) + self._run_sharded_elementwise_ops( + spec, [17, 23], dropout, reset_seed=_reset_random_seed + ) + if __name__ == "__main__": run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_linear.py b/test/distributed/_shard/sharded_tensor/ops/test_linear.py index f08797cb7b23..67e7dd2cb774 100644 --- a/test/distributed/_shard/sharded_tensor/ops/test_linear.py +++ b/test/distributed/_shard/sharded_tensor/ops/test_linear.py @@ -5,15 +5,17 @@ import torch import torch.distributed as dist -from torch.distributed._shard import shard_parameter +from torch.distributed._shard.api import ( + shard_parameter, + _collect_local_shard, + _reshard_output, +) from torch.distributed._shard.sharded_optim import ( ShardedOptimizer, named_params_with_sharded_tensor, ) from torch.distributed._shard.sharded_tensor import ( empty, - _collect_local_shard, - _reshard_output, ) from torch.distributed._shard.sharding_spec import ( ChunkShardingSpec, @@ -68,6 +70,7 @@ def _run_sharded_linear( inp = torch.rand(*input_size).cuda(self.rank) reshard_spec = copy.deepcopy(spec) reshard_spec.dim = 0 + reshard_spec.placements.sort(key=lambda placement: placement.rank()) sharded_linear = _collect_local_shard( _reshard_output(sharded_linear, reshard_spec) ) @@ -241,7 +244,10 @@ def test_sharded_linear_errors(self): ]) fc6.weight = empty(enumerable_spec, 10, 10) - with self.assertRaisesRegex(ValueError, 'Only ChunkShardingSpec supported for ShardedTensor ops!'): + # Sharded Tensor metadata has parenthesis imbalance issue when using re.compile + error_msg = r"torch function 'linear', with args: (?s).* " + r"and kwargs: None not supported for 
ShardedTensor!" + with self.assertRaisesRegex(RuntimeError, error_msg): fc6(torch.rand(10, 10).cuda(self.rank)) fc7 = torch.nn.Linear(10, 80).cuda(self.rank) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_math_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_math_ops.py new file mode 100644 index 000000000000..e080a6387515 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_math_ops.py @@ -0,0 +1,186 @@ +# Owner(s): ["oncall: distributed"] + +import torch +from torch.distributed._shard import _shard_tensor +import torch.distributed._shard.sharded_tensor as sharded_tensor +import torch.distributed as dist + +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, + EnumerableShardingSpec, + ShardMetadata +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) + +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) + +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + gen_binary_op_func, + generate_chunk_sharding_specs_for_test, +) + +class TestMathOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_basic_math_ops(self): + ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"] + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + sharded_lhs = sharded_tensor.rand(spec, (12, 3)) + sharded_rhs = sharded_tensor.rand(spec, (12, 3)) + current_rank = dist.get_rank() + global_lhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + global_rhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_lhs.gather(dst=0, out=global_lhs) + sharded_rhs.gather(dst=0, out=global_rhs) + + for op in ops: + binary_op = gen_binary_op_func(op) + binary_op_ = gen_binary_op_func(op, inplace=True) + # test basic math ops between ShardedTensors + sharded_output = binary_op(sharded_lhs, sharded_rhs) + output = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_output.gather(dst=0, out=output) + + if current_rank == 0: + global_output = binary_op(global_lhs, global_rhs) + + self.assertEqual(output, global_output) + + # test basic math ops between ShardedTensor and scalar + scalars = [3, 1.8] + for scalar in scalars: + sharded_output_lhs = binary_op(sharded_lhs, scalar) + + sharded_output_lhs_ = binary_op_(sharded_lhs, scalar) + self.assertTrue(torch.allclose(sharded_output_lhs, sharded_output_lhs_)) + output_lhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_output_lhs.gather(dst=0, out=output_lhs) + + sharded_output_rhs = binary_op(scalar, sharded_lhs) + output_rhs = torch.empty((12, 3), device=current_rank) if current_rank == 0 else None + sharded_output_rhs.gather(dst=0, out=output_rhs) + + if current_rank == 0: + global_output_lhs = binary_op(global_lhs, scalar) + global_output_rhs = binary_op(scalar, global_lhs) + + self.assertEqual(output_lhs, global_output_lhs) + self.assertEqual(output_rhs, global_output_rhs) + + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_math_ops_errors(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + sharded_lhs = 
sharded_tensor.rand(spec, (20, 3)) + sharded_rhs = sharded_tensor.rand(spec, (12, 3)) + + with self.assertRaisesRegex(RuntimeError, 'Implicit broadcasting not supported'): + torch.add(sharded_lhs, sharded_rhs) + + spec = EnumerableShardingSpec([ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[0, 5], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:2/cuda:2", + ), + ShardMetadata( + shard_offsets=[5, 5], + shard_sizes=[5, 5], + placement="rank:3/cuda:3", + ) + ]) + + st = sharded_tensor.rand(spec, 10, 10) + + with self.assertRaisesRegex(RuntimeError, 'not supported'): + torch.add(st, sharded_rhs) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_bmm(self): + for spec in generate_chunk_sharding_specs_for_test(0): + lhs = torch.rand(15, 4, 5).cuda(self.rank) + rhs = torch.rand(15, 5, 6).cuda(self.rank) + tensor = lhs.bmm(rhs) + st_lhs = _shard_tensor(lhs, spec) + st_rhs = _shard_tensor(rhs, spec) + st_expected = _shard_tensor(tensor, spec) + self.assertTrue(torch.allclose(torch.bmm(st_lhs, st_rhs), st_expected)) + self.assertTrue(torch.allclose(st_lhs.bmm(st_rhs), st_expected)) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_bmm_errors(self): + specs = generate_chunk_sharding_specs_for_test(0) + st_lhs = sharded_tensor.rand(specs[0], (15, 5, 6)) + st_rhs = sharded_tensor.rand(specs[1], (15, 5, 6)) + with self.assertRaisesRegex( + NotImplementedError, + 'Both st and st2 need to have same placements for bmm', + ): + torch.bmm(st_lhs, st_rhs) + for spec in specs: + st_lhs = sharded_tensor.rand(spec, (20, 3)) + st_rhs = sharded_tensor.rand(spec, (20, 3)) + with self.assertRaisesRegex( + TypeError, + 'both st and st2 need to be a 3D ShardedTensor', + ): + torch.bmm(st_lhs, st_rhs) + rhs = torch.rand(15, 5, 6).cuda(self.rank) + with self.assertRaisesRegex( + TypeError, + 'st2 needs to be a ShardedTensor for torch.bmm', + ): + torch.bmm(st_lhs, rhs) + spec.dim = 1 + st_lhs = sharded_tensor.rand(spec, (15, 5, 6)) + st_rhs = sharded_tensor.rand(spec, (15, 5, 6)) + with self.assertRaisesRegex( + NotImplementedError, + 'Only support performing bmm on tensors sharded on dim 0 now', + ): + torch.bmm(st_lhs, st_rhs) diff --git a/test/distributed/_shard/sharded_tensor/ops/test_matrix_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_matrix_ops.py new file mode 100644 index 000000000000..dd074f324df4 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_matrix_ops.py @@ -0,0 +1,294 @@ +# Owner(s): ["oncall: distributed"] + +import copy +import itertools +import sys + +import torch +from torch.distributed._shard import sharded_tensor, _shard_tensor +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + generate_enumerable_sharding_specs_for_test, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + _chunk_sharding_specs_list_for_test, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + 
multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestShardedTensorMatrixOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_contiguous(self): + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + for spec in specs: + st = sharded_tensor.rand(spec, 10, 22, 5, init_rrefs=False) + st = st.transpose(1, 0) + st = st.contiguous() + self.assertTrue(st.is_contiguous()) + self.assertTrue(st.local_tensor().is_contiguous()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_type_as(self): + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + for spec in specs: + st = sharded_tensor.rand( + spec, 16, 30, 5, init_rrefs=False, dtype=torch.double + ) + st_2 = sharded_tensor.rand( + spec, 16, 30, 5, init_rrefs=False, dtype=torch.float + ) + st_3 = st.type_as(st_2) + self.assertEqual(torch.float, st_3.dtype) + self.assertEqual(torch.float, st_3.local_tensor().dtype) + st_3 = st.type_as(torch.zeros(10).type(torch.BoolTensor).cuda()) + self.assertEqual(torch.bool, st_3.dtype) + self.assertEqual(torch.bool, st_3.local_tensor().dtype) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_transpose(self): + specs = _chunk_sharding_specs_list_for_test([0, 1, 2], seed=7) + for spec in specs: + tensor = torch.rand(15, 27, 16).cuda(self.rank) + tensor_t = tensor.transpose(0, 1).contiguous() + spec_n = copy.deepcopy(spec) + if spec_n.dim in (0, 1): + spec_n.dim = 1 - spec_n.dim + st_expected = _shard_tensor(tensor_t, spec_n) + self.assertTrue( + torch.allclose( + torch.transpose(_shard_tensor(tensor, spec), 0, 1), st_expected + ) + ) + tensor_t = torch.transpose(tensor, 1, 2).contiguous() + spec_n = copy.deepcopy(spec) + if spec_n.dim in (1, 2): + spec_n.dim = 3 - spec_n.dim + st_expected = _shard_tensor(tensor_t, spec_n) + self.assertTrue( + torch.allclose(_shard_tensor(tensor, spec).transpose(1, 2), st_expected) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_transpose_error(self): + enumerable_spec = generate_enumerable_sharding_specs_for_test()[0] + st = sharded_tensor.rand( + enumerable_spec, 10, 10, init_rrefs=False, dtype=torch.double + ) + with self.assertRaisesRegex( + NotImplementedError, + "Only ChunkShardingSpec supported for 'transpose'", + ): + st.transpose(1, 0) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_softmax(self): + specs = _chunk_sharding_specs_list_for_test([0, 2], seed=17) + for spec in specs: + tensor = torch.rand(15, 27, 16).cuda(self.rank) + tensor_n = torch.nn.functional.softmax(tensor, dim=1, dtype=torch.float32) + st_expected = _shard_tensor(tensor_n, spec) + self.assertTrue( + torch.allclose( + torch.nn.functional.softmax( + _shard_tensor(tensor, spec), dim=1, dtype=torch.float32 + ), + st_expected, + ) + ) + + def _test_masked_fill_with_sizes(self, mask_size, broadcast_style=False): + specs = _chunk_sharding_specs_list_for_test([0, 1, 2], seed=7) + for spec in specs: + tensor = torch.rand(35, 17, 26).cuda(self.rank) + mask = torch.randint(0, 2, mask_size).type(torch.BoolTensor).cuda(self.rank) + if broadcast_style: + mask = mask.unsqueeze(1) + tensor_m = tensor.masked_fill(mask, 25.0) + st_expected = _shard_tensor(tensor_m, spec) + self.assertTrue( + torch.allclose( + _shard_tensor(tensor, 
spec).masked_fill(mask, 25.0), + st_expected, + ) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_masked_fill(self): + self._test_masked_fill_with_sizes((35, 17, 26)) + self._test_masked_fill_with_sizes((17, 26)) + self._test_masked_fill_with_sizes((35, 26), broadcast_style=True) + self._test_masked_fill_with_sizes((26,)) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_masked_fill_error(self): + specs = _chunk_sharding_specs_list_for_test([1, 2], seed=7) + for spec in specs: + st = sharded_tensor.rand( + spec, 35, 17, 26, init_rrefs=False, dtype=torch.double + ) + mask = ( + torch.randint(0, 2, (2, 35, 17, 26)) + .type(torch.BoolTensor) + .cuda(self.rank) + ) + with self.assertRaisesRegex( + ValueError, + "mask dim must not greater than the dim of the sharded tensor.", + ): + st.masked_fill(mask, 25.0) + mask = torch.randint(0, 2, (16, 26)).type(torch.BoolTensor).cuda(self.rank) + with self.assertRaisesRegex( + ValueError, + "The size of mask 0 must match the size of sharded tensor 1 " + "at non-singleton dimension 0", + ): + st.masked_fill(mask, 25.0) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_view(self): + specs = _chunk_sharding_specs_list_for_test([0, 0, -3], seed=10) + for spec in specs: + tensor = torch.rand(16, 35, 26).cuda(self.rank) + tensor_v = tensor.view(16, 35, 26).view(4, 4, 35, 26) + new_spec = copy.deepcopy(spec) + if new_spec.dim < 0: + new_spec.dim -= 1 + st_expected = _shard_tensor(tensor_v, new_spec) + self.assertTrue( + torch.allclose( + _shard_tensor(tensor, spec).view(4, 4, 35, 26), + st_expected, + ) + ) + st_expected = _shard_tensor(tensor, spec) + self.assertTrue( + torch.allclose( + _shard_tensor(tensor_v, new_spec).view(16, 35, 26), + st_expected, + ) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_view_error(self): + for spec in _chunk_sharding_specs_list_for_test([2], seed=7): + st = sharded_tensor.rand( + spec, 35, 17, 26, init_rrefs=False, dtype=torch.double + ) + with self.assertRaisesRegex( + NotImplementedError, + "Shape having dim 2 is not supported " + "for sharded tensor sharded on dim 2.", + ): + st.view(35 * 17, 26) + with self.assertRaisesRegex( + ValueError, + r"Shape '\[5, 7, 35, 17, 26\]' is invalid for sharded tensor size 15470.", + ): + st.view(5, 7, 35, 17, 26) + with self.assertRaisesRegex( + ValueError, + "Only one dimension can be inferred for sharded view op.", + ): + st.view(5, 7, -1, -1) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_layer_norm(self): + specs = _chunk_sharding_specs_list_for_test([1, 2], seed=10) + flags = [True, False] + for spec, flag in itertools.product(specs, flags): + tensor = torch.rand(16, 35, 26).cuda(self.rank) + layer_norm = torch.nn.LayerNorm((35, 26), elementwise_affine=flag).cuda( + self.rank + ) + st = layer_norm(_shard_tensor(tensor, spec)) + with torch.no_grad(): + tensor_normed = layer_norm(tensor) + st_expected = _shard_tensor(tensor_normed, spec) + self.assertEqual( + st.local_tensor(), + st_expected.local_tensor(), + ) + self.assertTrue( + torch.allclose( + st, + st_expected, + atol=1e-6, + ) + ) + st_expected = torch.nn.functional.layer_norm( + _shard_tensor(tensor, spec), + (35, 26), + weight=layer_norm.weight, + bias=layer_norm.bias, + ) + self.assertTrue( + torch.allclose( 
+ st, + st_expected, + atol=1e-6, + ) + ) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_tensor_layer_norm_error(self): + specs = _chunk_sharding_specs_list_for_test([2], seed=10) + for spec in specs: + tensor = torch.rand(16, 35, 26).cuda(self.rank) + with self.assertRaisesRegex( + ValueError, + "normalized_shape dim must not be greater " + "than the dim of the sharded tensor.", + ): + layer_norm = torch.nn.LayerNorm((14, 55, 35, 26)).cuda(self.rank) + layer_norm(_shard_tensor(tensor, spec)) + with self.assertRaisesRegex( + ValueError, + r"Given normalized_shape=\[35\], expected input with shape " + r"\[\*, 35\], but got input of size \[16, 35, 26\].", + ): + layer_norm = torch.nn.LayerNorm((35)).cuda(self.rank) + layer_norm(_shard_tensor(tensor, spec)) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_softmax.py b/test/distributed/_shard/sharded_tensor/ops/test_softmax.py new file mode 100644 index 000000000000..f55ca9391d9f --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_softmax.py @@ -0,0 +1,57 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import torch +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.distributed._shard import _shard_tensor + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestShardedSoftmax(ShardedTensorTestBase): + + def _test_sharded_softmax(self, softmax_dim, sharding_dim): + torch.manual_seed(0) + local_tensor = torch.rand(10, 10, device=self.rank) + local_softmax = torch.nn.functional.softmax(local_tensor, softmax_dim) + + spec = ChunkShardingSpec(dim=sharding_dim, placements=[f'rank:{idx}/cuda:{idx}' for idx in range(self.world_size)]) + st = _shard_tensor(local_tensor, spec) + sharded_softmax = torch.nn.functional.softmax(st, softmax_dim) + + self.assertEqual(local_softmax.chunk(self.world_size, dim=sharding_dim)[self.rank], sharded_softmax.local_tensor()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_softmax_basic(self): + self._test_sharded_softmax(0, 1) + self._test_sharded_softmax(-2, 1) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharded_softmax_on_sharding_dim(self): + self._test_sharded_softmax(1, 1) + self._test_sharded_softmax(-1, 1) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py new file mode 100644 index 000000000000..3f9bec1f38f5 --- /dev/null +++ b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py @@ -0,0 +1,115 @@ +# Owner(s): ["oncall: distributed"] + +import copy + +import torch.distributed._shard.sharded_tensor as sharded_tensor + +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) + +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + 
ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.common_utils import ( + run_tests, +) + +class TestTensorOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_deep_copy(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5)) + copied_st = copy.deepcopy(st) + self.assertTrue(type(copied_st) is type(st)) + self.assertEqual(copied_st.local_tensor(), st.local_tensor()) + self.assertFalse(copied_st is st) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_clone(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5)) + copied_st = st.clone() + self.assertTrue(type(copied_st) is type(st)) + self.assertEqual(copied_st.local_tensor(), st.local_tensor()) + self.assertFalse(copied_st is st) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_detach(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5), requires_grad=True) + local_shards = st.local_shards() + # created with requires_grad=True, so all local shards should require grads + for local_shard in local_shards: + self.assertTrue(local_shard.tensor.requires_grad) + + detached_st = st.detach() + self.assertFalse(detached_st.requires_grad) + + for local_shard in detached_st.local_shards(): + self.assertFalse(local_shard.tensor.requires_grad) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_set_requires_grad(self): + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + st = sharded_tensor.rand(spec, (12, 5)) + local_shards = st.local_shards() + # before set requires_grad, all local shards should not require grads + for local_shard in local_shards: + self.assertFalse(local_shard.tensor.requires_grad) + + st.requires_grad_() + self.assertTrue(st.requires_grad) + + for local_shard in local_shards: + self.assertTrue(local_shard.tensor.requires_grad) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py b/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py index 2b11c49d9589..cff259aad8a9 100644 --- a/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py +++ b/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py @@ -9,12 +9,10 @@ ShardedOptimizer, named_params_with_sharded_tensor, ) -from torch.distributed._shard import ( +from torch.distributed._shard.api import ( shard_parameter, -) -from torch.distributed._shard.sharded_tensor import ( - _collect_local_shard, _reshard_output, + _collect_local_shard ) from torch.testing._internal.common_distributed import ( requires_nccl, @@ -34,6 +32,7 @@ generate_chunk_sharding_specs_for_test, generate_local_weight_sharding_params_for_test, ) +from torch.testing._internal.distributed._shard.test_common import SimpleMegatronLM if TEST_WITH_DEV_DBG_ASAN: print( @@ -44,19 +43,6 @@ class TestShardedTensorMegatronLinear(ShardedTensorTestBase): - class SimpleMegatronLM(torch.nn.Module): - def __init__(self, linear_size, rank=None):
- super().__init__() - self.fc1 = torch.nn.Linear(*linear_size[0]) - self.gelu = torch.nn.GELU() - self.fc2 = torch.nn.Linear(*linear_size[1]) - if rank: - self.fc1.cuda(rank) - self.fc2.cuda(rank) - - def forward(self, inp): - return self.fc2(self.gelu(self.fc1(inp))) - def _run_megatron_linear(self, spec, input_size, linear_size): def _weight_override(module_dst, module_src): module_dst.fc1.weight = clone_module_parameter(module_src.fc1, "weight") @@ -68,30 +54,12 @@ def _shard_parameter(module, spec): shard_parameter(module.fc1, "weight", spec[0]) shard_parameter(module.fc2, "weight", spec[1]) - def _get_weight_grad(module): - return (module.fc1.weight.grad, module.fc2.weight.grad) - - def _get_bias_grad(module): - return (module.fc1.bias.grad, module.fc2.bias.grad) - - def _get_weights(module): - return (module.fc1.weight, module.fc2.weight) - - def _get_bias(module): - return (module.fc1.bias, module.fc2.bias) - - def _get_weight_local_shard(module): - return ( - module.fc1.weight.local_tensor(), - module.fc2.weight.local_tensor(), - ) - # Use same seed. torch.manual_seed(0) - local_megatron_lm = self.SimpleMegatronLM(linear_size, rank=self.rank).cuda( + local_megatron_lm = SimpleMegatronLM(linear_size, rank=self.rank).cuda( self.rank ) - sharded_megatron_lm = self.SimpleMegatronLM(linear_size) + sharded_megatron_lm = SimpleMegatronLM(linear_size) _weight_override(sharded_megatron_lm, local_megatron_lm) # Shard the parameter. First col-wise sharding and then row-wise @@ -121,15 +89,15 @@ def _get_weight_local_shard(module): ( local_weight_grad_fc1, local_weight_grad_fc2, - ) = _get_weight_grad(local_megatron_lm) - local_bias_grad_fc1, local_bias_grad_fc2 = _get_bias_grad(local_megatron_lm) + ) = local_megatron_lm.get_weight_grads() + local_bias_grad_fc1, local_bias_grad_fc2 = local_megatron_lm.get_bias_grads() # Verify that weights in both layers and biases in the sharded linear has non-None grad. ( sharded_weight_fc1, sharded_weight_fc2, - ) = _get_weight_local_shard(sharded_megatron_lm) - bias_grad_fc1, bias_grad_fc2 = _get_bias_grad(sharded_megatron_lm) + ) = sharded_megatron_lm.get_weights() + bias_grad_fc1, bias_grad_fc2 = sharded_megatron_lm.get_bias_grads() self.assertNotEqual(sharded_weight_fc1.grad, None) self.assertNotEqual(sharded_weight_fc2.grad, None) self.assertNotEqual(bias_grad_fc1, None) @@ -140,7 +108,7 @@ def _get_weight_local_shard(module): dist.all_reduce(local_weight_grad_fc2) dist.all_reduce(local_bias_grad_fc1) dist.all_reduce(local_bias_grad_fc2) - local_weight_fc1, local_weight_fc2 = _get_weights(local_megatron_lm) + local_weight_fc1, local_weight_fc2 = local_megatron_lm.get_weights() ( start_pos_fc1, chunk_size_fc1, @@ -167,8 +135,8 @@ def _get_weight_local_shard(module): self.assertEqual(bias_grad_fc2, local_bias_grad_fc2) # Test optimizer. - bias_fc1, bias_fc2 = _get_bias(sharded_megatron_lm) - local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm) + bias_fc1, bias_fc2 = sharded_megatron_lm.get_biases() + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() self.assertEqual(bias_fc1, local_bias_fc1) self.assertEqual(bias_fc2, local_bias_fc2) self.assertEqual(bias_fc1.grad, local_bias_fc1.grad) @@ -201,7 +169,7 @@ def _get_weight_local_shard(module): self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed) # Test bias value after optimizer. 
- local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm) + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() self.assertNotEqual(previous_bias_fc1, bias_fc1) self.assertEqual(bias_fc1, local_bias_fc1) self.assertNotEqual(previous_bias_fc2, bias_fc2) diff --git a/test/distributed/_shard/sharded_tensor/test_partial_tensor.py b/test/distributed/_shard/sharded_tensor/test_partial_tensor.py deleted file mode 100644 index 18418f8fb517..000000000000 --- a/test/distributed/_shard/sharded_tensor/test_partial_tensor.py +++ /dev/null @@ -1,108 +0,0 @@ -# Owner(s): ["oncall: distributed"] - -import sys - -import torch -import torch.distributed as dist -from torch.distributed._shard.sharded_tensor import ( - _PartialTensor, -) -from torch.distributed._shard.sharding_spec import ( - EnumerableShardingSpec, - ShardMetadata, -) -from torch.testing._internal.common_distributed import ( - requires_nccl, - skip_if_lt_x_gpu, -) -from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, -) -from torch.testing._internal.distributed._shard.sharded_tensor import ( - ShardedTensorTestBase, - with_comms, -) -from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( - _chunk_sharding_specs_list_for_test, -) - -if TEST_WITH_DEV_DBG_ASAN: - print( - "Skip dev-asan as torch + multiprocessing spawn have known issues", - file=sys.stderr, - ) - sys.exit(0) - - -class TestPartialTensorReshard(ShardedTensorTestBase): - def _run_partial_tensor_n_reshard( - self, reshard_spec, input_size, world_size, reduce_op, dtype=torch.float - ): - results = [] - results_compare = [] - for _ in range(0, world_size): - tensor = torch.rand(*input_size, dtype=dtype).cuda(self.rank) - results.append(tensor) - results_compare.append(tensor.clone().detach()) - pg = dist.distributed_c10d._get_default_group() - parital_tensor = _PartialTensor(torch.cat(results), pg, reduce_op=reduce_op) - local_sharded_result = parital_tensor.reshard(reshard_spec) - local_shards = local_sharded_result.local_shards() - local_result_compare = torch.empty_like(results_compare[0]) - dist.reduce_scatter(local_result_compare, results_compare, op=reduce_op) - self.assertEqual(1, len(local_shards)) - self.assertEqual(local_shards[0].tensor, local_result_compare) - - @with_comms(init_rpc=False) - @skip_if_lt_x_gpu(4) - @requires_nccl() - def test_partial_tensor_reshard(self): - specs = _chunk_sharding_specs_list_for_test([0], seed=7) - spec = specs[0] - self._run_partial_tensor_n_reshard(spec, [13, 21], 4, dist.ReduceOp.SUM) - self._run_partial_tensor_n_reshard(spec, [12, 22], 4, dist.ReduceOp.MAX) - - @with_comms(init_rpc=False) - @skip_if_lt_x_gpu(4) - @requires_nccl() - def test_partial_tensor_reshard_errors(self): - enumerable_sharding_spec = EnumerableShardingSpec( - [ - ShardMetadata( - shard_offsets=[0, 0], - shard_sizes=[5, 5], - placement="rank:0/cuda:0", - ), - ShardMetadata( - shard_offsets=[5, 0], - shard_sizes=[5, 5], - placement="rank:1/cuda:1", - ), - ] - ) - with self.assertRaisesRegex( - NotImplementedError, "Only ChunkShardingSpec supported for reshard." - ): - self._run_partial_tensor_n_reshard( - enumerable_sharding_spec, [13, 21], 4, dist.ReduceOp.SUM - ) - self._run_partial_tensor_n_reshard( - enumerable_sharding_spec, [12, 22], 4, dist.ReduceOp.MAX - ) - specs = _chunk_sharding_specs_list_for_test([0], seed=7) - spec = specs[0] - with self.assertRaisesRegex( - NotImplementedError, "Only real partial tensor supported for reshard." 
- ): - self._run_partial_tensor_n_reshard( - spec, [13, 21], 4, dist.ReduceOp.SUM, dtype=torch.cfloat - ) - self._run_partial_tensor_n_reshard( - spec, [12, 22], 4, dist.ReduceOp.MAX, dtype=torch.cfloat - ) - with self.assertRaisesRegex( - ValueError, "World size need to divide the length of the dimension." - ): - self._run_partial_tensor_n_reshard( - spec, [13, 21], 3, dist.ReduceOp.SUM, dtype=torch.cfloat - ) diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py index cbad9458ae4f..ae00f47cecff 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py @@ -9,25 +9,29 @@ import torch import torch.distributed as dist from torch.distributed import rpc -from torch.distributed._shard import ( +from torch.distributed import distributed_c10d +from torch.distributed._shard import sharded_tensor +from torch.distributed._shard.api import ( shard_parameter, - sharded_tensor, _shard_tensor, + load_with_process_group, + _collect_local_shard, + _reshard_output, ) from torch.distributed._shard.sharded_tensor import ( sharded_op_impl, - load_with_process_group, pre_load_state_dict_hook, state_dict_hook, ShardedTensor, - _collect_local_shard, - _reshard_output, ) from torch.distributed._shard.sharding_spec import ( ChunkShardingSpec, EnumerableShardingSpec, ShardMetadata, ) +from torch.distributed._shard.sharded_tensor.utils import ( + _parse_and_validate_remote_device +) from torch.distributed._shard.sharded_tensor.api import ( TensorProperties, _create_tensor_from_params, @@ -35,6 +39,7 @@ from torch.testing._internal.common_distributed import ( requires_nccl, skip_if_lt_x_gpu, + tp_transports, ) from torch.testing._internal.common_utils import ( TestCase, @@ -49,32 +54,13 @@ from torch.distributed.remote_device import _remote_device from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( _chunk_sharding_specs_list_for_test, + MyShardedModel1, ) if TEST_WITH_DEV_DBG_ASAN: print("Skip dev-asan as torch + multiprocessing spawn have known issues", file=sys.stderr) sys.exit(0) -class MyShardedModel2(torch.nn.Module): - def __init__(self, spec=None, group=None): - super(MyShardedModel2, self).__init__() - if spec is not None: - self.sharded_tensor2 = sharded_tensor.empty(spec, 10, 20, process_group=group, init_rrefs=True) - else: - self.sharded_tensor2 = None - self.random_tensor2 = torch.nn.Parameter(torch.rand(2, 2)) - - -class MyShardedModel1(torch.nn.Module): - def __init__(self, spec=None, group=None): - super(MyShardedModel1, self).__init__() - if spec is not None: - self.sharded_tensor1 = sharded_tensor.empty(spec, 10, 20, process_group=group, init_rrefs=True) - else: - self.sharded_tensor1 = None - self.random_tensor1 = torch.nn.Parameter(torch.rand(2, 2)) - self.submodule = MyShardedModel2(spec, group) - class TestShardedTensorMetadata(TestCase): def test_serialize_and_deserialize(self): shard_metadatas = [ @@ -188,7 +174,7 @@ def test_shard_parameter_errors(self): with self.assertRaisesRegex(ValueError, 'does not match with src_rank'): shard_parameter(fc, 'weight', spec, src_rank=self.rank) - with self.assertRaisesRegex(ValueError, 'does not have parameter'): + with self.assertRaisesRegex(AttributeError, 'Linear have no attribute'): shard_parameter(fc, 'foo', spec) with self.assertRaisesRegex(ValueError, 'Expected Linear.bias to be a Tensor, but found str'): @@ -224,9 +210,7 @@ def 
test_shard_parameter_errors(self): placement="rank:1/cuda:1", ), ]) - with self.assertRaisesRegex( - NotImplementedError, 'Only ChunkShardingspec is supported.' - ): + with self.assertRaisesRegex(NotImplementedError, 'not implemented yet!'): shard_parameter(fc, 'weight', spec) @@ -301,7 +285,7 @@ def test_shard_tensor_errors(self): ), ]) with self.assertRaisesRegex( - NotImplementedError, 'Only ChunkShardingspec is supported.' + NotImplementedError, 'not implemented yet!' ): _shard_tensor(tensor, spec) @@ -634,7 +618,7 @@ def test_create_sharded_tensor_with_zeros(self): @skip_if_lt_x_gpu(4) @requires_nccl() def test_create_sharded_tensor_with_rand(self): - """ Test sharded_tensor.rand(...) """ + """ Test sharded_tensor.rand(...)/randn(...) """ spec = ChunkShardingSpec( dim=0, @@ -652,6 +636,7 @@ def test_create_sharded_tensor_with_rand(self): expected_device = torch.device(f"cuda:{self.rank}") dtype = torch.double torch.manual_seed(seed) + # Test sharded_tensor.rand creation expected = torch.rand(expected_h, w, device=expected_device, dtype=dtype) # reset seed to ensure the same random numbers are generated torch.manual_seed(seed) @@ -665,6 +650,20 @@ def test_create_sharded_tensor_with_rand(self): self.assertEqual((expected_h, w), local_shard.size()) self.assertEqual(expected, local_shard) + # Test sharded_tensor.randn creation + torch.manual_seed(seed) + expected_randn = torch.randn(expected_h, w, device=expected_device, dtype=dtype) + # reset seed to ensure the same random numbers are generated + torch.manual_seed(seed) + st_randn = sharded_tensor.randn(spec, h, w, dtype=dtype) + + # Validate local shard is initialized with torch.randn + local_shards = st_randn.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(expected_device, local_shard.device) + self.assertEqual((expected_h, w), local_shard.size()) + self.assertEqual(expected_randn, local_shard) @with_comms @skip_if_lt_x_gpu(4) @@ -696,6 +695,52 @@ def test_create_sharded_tensor_with_full(self): self.assertEqual(local_shard, torch.full(size=(expected_h, w), fill_value=fill_value, dtype=torch.int32)) + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_create_sharded_tensor_like(self): + """ Test tensor like methods, i.e. torch.zeros_like(...), torch.full_like, etc. 
""" + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 8, 8 + expected_h = 2 + seed = 1234 + dtype = torch.double + expected_device = torch.device(f"cuda:{self.rank}") + st = sharded_tensor.rand(spec, (h, w), dtype=dtype) + tensor_like_ops = { + torch.zeros_like: torch.zeros, + torch.ones_like: torch.ones, + torch.rand_like: torch.rand, + torch.randn_like: torch.randn, + torch.empty_like: torch.empty, + torch.full_like: torch.full + } + for op, expect_local_op in tensor_like_ops.items(): + if op == torch.full_like: + # special handle full/full_like as it needs to have additional fill_value arg + expect_tensor = expect_local_op((expected_h, w), 8.8, device=expected_device, dtype=dtype) + new_op_st = op(st, 8.8, dtype=dtype) + self.assertEqual(new_op_st.local_tensor(), expect_tensor) + elif op == torch.empty_like: + # empty/empty_like we only compare the shape + expect_tensor = expect_local_op(expected_h, w, device=expected_device, dtype=dtype) + new_op_st = op(st, dtype=dtype) + self.assertEqual(new_op_st.local_tensor().shape, expect_tensor.shape) + else: + torch.manual_seed(seed) + expect_tensor = expect_local_op(expected_h, w, device=expected_device, dtype=dtype) + torch.manual_seed(seed) + new_op_st = op(st, dtype=dtype) + self.assertEqual(new_op_st.local_tensor(), expect_tensor) @with_comms @skip_if_lt_x_gpu(4) @@ -885,8 +930,8 @@ def test_sharding_columns(self): def test_invalid_sharding(self): self.init_pg() - spec = ChunkShardingSpec(dim='H', placements=["rank:1/cuda:1"]) - with self.assertRaisesRegex(ValueError, 'needs to be an integer'): + with self.assertRaisesRegex(NotImplementedError, 'does not support named dimension'): + spec = ChunkShardingSpec(dim='H', placements=["rank:1/cuda:1"]) sharded_tensor.empty(spec, 10, 20) for dim in [2, 3, 4, -3, -4, -5]: @@ -901,7 +946,7 @@ def test_invalid_sharding(self): spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:1"]) st = sharded_tensor.empty(spec, 10, 20) tensor = torch.empty(10, 20) - with self.assertRaisesRegex(RuntimeError, "not supported for ShardedTensor!"): + with self.assertRaisesRegex(RuntimeError, "not supported yet for ShardedTensor!"): torch.add(st, tensor) spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:1"]) @@ -935,7 +980,7 @@ def test_invalid_pg_rpc_ranks(self): self.init_pg() # Init RPC with different ranks. 
- rpc_backend_options = rpc.TensorPipeRpcBackendOptions() + rpc_backend_options = rpc.TensorPipeRpcBackendOptions(_transports=tp_transports()) rpc_backend_options.init_method = f"file://{self.file_name}" rank = (self.rank + 1) % self.world_size rpc.init_rpc( @@ -1025,9 +1070,9 @@ def test_sharded_tensor_sizes(self): # Test with invalid input st = sharded_tensor.empty(spec, (10, 20), init_rrefs=True) - with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[0, 2\\)'): - st.size(-1) - with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[0, 2\\)'): + with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[-2, 2\\)'): + st.size(-3) + with self.assertRaisesRegex(ValueError, 'must be within the range of tensor dimensions \\[-2, 2\\)'): st.size(2) with self.assertRaises(TypeError): @@ -1463,6 +1508,92 @@ def test_gather_uneven(self) -> None: else: self.assertIsNone(full_tensor) + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_sharded_tensor_to_cpu(self): + cpu_spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cpu", + "rank:1/cpu", + "rank:2/cpu", + "rank:3/cpu", + ], + ) + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + h, w = 10, 20 + gloo_pg = dist.new_group(backend="gloo") + + # CPU sharded tensor should return the same instance (no copy) + st_cpu = sharded_tensor.zeros(cpu_spec, h, w, process_group=gloo_pg) + new_st_cpu = st_cpu.cpu() + self.assertEqual(st_cpu, new_st_cpu) + + # GPU sharded tensor to cpu + st = sharded_tensor.zeros(spec, h, w) + # test ability to move st to CPU + spec_before_move = st.sharding_spec() + new_st = st.cpu(process_group=gloo_pg) + # return a copy of orginal st + self.assertNotEqual(st, new_st) + # check the spec is still ChunkShardingSpec + spec_after_move = new_st.sharding_spec() + self.assertIsInstance(spec_after_move, ChunkShardingSpec) + # now it should be ProcessGroupGloo since it's on CPU + self.assertIsInstance(new_st._process_group, distributed_c10d.ProcessGroupGloo) + # test specs before and after the move almost the same except placement device + self.assertEqual(spec_before_move.dim, spec_after_move.dim) + self.assertEqual(len(spec_before_move.placements), len(spec_after_move.placements)) + for i, remote_device_after in enumerate(spec_after_move.placements): + remote_device_before = spec_before_move.placements[i] + self.assertEqual(remote_device_before.rank(), remote_device_after.rank()) + self.assertEqual(str(remote_device_after.device()), "cpu") + + # ensure metdata also get changed to CPU + metas = new_st.metadata().shards_metadata + for meta in metas: + self.assertEqual(str(meta.placement.device()), "cpu") + + # Test if a mixed sharded tensor (ShardedTensor with different devices) to cpu + mixed_spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cpu", + "rank:1/cpu", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st = sharded_tensor.zeros(mixed_spec, h, w, process_group=gloo_pg) + new_st = st.cpu() + # return a copy of orginal st + self.assertNotEqual(st, new_st) + # check the spec is still ChunkShardingSpec + spec_after_move = new_st.sharding_spec() + self.assertIsInstance(spec_after_move, ChunkShardingSpec) + # test specs before and after the move almost the same except placement device + self.assertEqual(mixed_spec.dim, spec_after_move.dim) + self.assertEqual(len(mixed_spec.placements), 
len(spec_after_move.placements)) + for i, remote_device_after in enumerate(spec_after_move.placements): + remote_device_before = mixed_spec.placements[i] + self.assertEqual(remote_device_before.rank(), remote_device_after.rank()) + self.assertEqual(str(remote_device_after.device()), "cpu") + + # ensure metdata also get changed to CPU + metas = new_st.metadata().shards_metadata + for meta in metas: + self.assertEqual(str(meta.placement.device()), "cpu") + @skip_if_lt_x_gpu(4) @requires_nccl() def test_uneven_shards(self): @@ -1781,6 +1912,116 @@ def test_with_rpc_names(self): self.assertEqual((5, 5), shard.tensor.size()) +class TestShardedTensorFromLocalTensor(ShardedTensorTestBase): + def _generate_st_from_chunk_local_tensor(self, st_size, sharding_spec): + tensor_meta = sharding_spec.build_metadata(st_size, TensorProperties()) + pg = dist.distributed_c10d._get_default_group() + + local_tensor = None + local_shard_metadata = None + rank_to_metadata = {} + for shard_metadata in tensor_meta.shards_metadata: + rank, device = _parse_and_validate_remote_device(pg, shard_metadata.placement) + rank_to_metadata[rank] = shard_metadata + if rank == self.rank: + local_tensor = torch.rand(shard_metadata.shard_sizes).cuda(device) + local_shard_metadata = shard_metadata + + # TODO: figure out what the API should behave when some rank have no shard + # see https://github.com/pytorch/pytorch/issues/73133 + assert local_tensor is not None + st = ShardedTensor._init_from_local_tensor( + local_tensor, + sharding_spec, + st_size, + init_rrefs=True, + ) + self.assertEqual(tuple(st_size), st.size()) + self.assertEqual(1, len(st.local_shards())) + + # Verify local shard. + local_shard = st.local_shards()[0] + self.assertEqual(st.local_tensor(), local_tensor) + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.tensor.device) + + # Verify local shard metadata. + self.assertEqual( + local_shard_metadata.shard_offsets, local_shard.metadata.shard_offsets + ) + self.assertEqual( + local_shard_metadata.shard_sizes, local_shard.metadata.shard_sizes + ) + self.assertEqual(local_shard_metadata.placement, local_shard.metadata.placement) + + # Verify global metadata. + st_shards_metadata = st.metadata().shards_metadata + self.assertEqual(self.world_size, len(st_shards_metadata)) + self.assertEqual(tensor_meta.shards_metadata, st_shards_metadata) + + # Validate remote shards. + remote_shards = st.remote_shards() + self.assertEqual(self.world_size - 1, len(remote_shards)) + for rpc_rank, shards in remote_shards.items(): + self.assertEqual(1, len(shards)) + for remote_shard in shards: + self.assertEqual(rpc_rank, remote_shard.owner().id) + # If remote shard does not exist, to_here() will throw exception. 
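+                    # rank_to_metadata (filled in from the sharding spec above) gives the
+                    # expected shard size for each rank, which the fetched shard is checked against.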
+ if tensor_meta.shards_metadata[rpc_rank]: + shard = remote_shard.to_here() + self.assertEqual( + rank_to_metadata[rpc_rank].shard_sizes, shard.tensor.size() + ) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_init_from_local_tensor(self): + chunk_specs = _chunk_sharding_specs_list_for_test([0, 1, 1, 0], seed=31) + for spec in chunk_specs: + self._generate_st_from_chunk_local_tensor([20, 10], spec) + self._generate_st_from_chunk_local_tensor([21, 11], spec) + self._generate_st_from_chunk_local_tensor([23, 16], spec) + self._generate_st_from_chunk_local_tensor([44, 16, 8], spec) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_init_from_local_tensor_errors(self): + enumerable_sharding_spec = EnumerableShardingSpec( + [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ] + ) + st_size = [24, 12] + local_tensor = torch.rand(*st_size).cuda(self.rank) + with self.assertRaisesRegex( + ValueError, "do not cover the entire tensor" + ): + ShardedTensor._init_from_local_tensor( + local_tensor, + enumerable_sharding_spec, + st_size, + ) + chunk_specs = _chunk_sharding_specs_list_for_test([0], seed=31) + with self.assertRaisesRegex( + ValueError, "local_tensor is not a contiguous Tensor." + ): + ShardedTensor._init_from_local_tensor( + local_tensor.t(), + chunk_specs[0], + st_size, + ) + + class TestShardedTensorFromLocalShards(ShardedTensorTestBase): @with_comms(init_rpc=False) @@ -2247,8 +2488,10 @@ def test_custom_op_override(self): t = torch.rand(10, 10).cuda(self.rank) - @sharded_op_impl(torch.nn.functional.linear) - def my_sharded_linear(types, args, kwargs, process_group): + from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op + + @custom_sharding_spec_op(ChunkShardingSpec, torch.nn.functional.linear) + def my_sharded_linear(types, args, kwargs): return t spec = ChunkShardingSpec( diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py index 1a106772e673..ec053c95b47a 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py @@ -18,6 +18,7 @@ ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, + run_tests, ) from torch.testing._internal.distributed._shard.sharded_tensor import ( ShardedTensorTestBase, @@ -44,6 +45,7 @@ def _run_sharded_tensor_reshard(self, sharding_spec, reshard_spec, input_size): st.reshard(reshard_spec) self.assertEqual(1, len(st.local_shards())) self.assertEqual(1, len(st_compare.local_shards())) + st_compare._metadata.shards_metadata.sort(key=lambda metadata: metadata.placement.rank()) self.assertEqual(st._metadata, st_compare._metadata) self.assertEqual(st.local_tensor(), st_compare.local_tensor()) self.assertEqual( @@ -95,3 +97,7 @@ def test_sharded_tensor_reshard_errors(self): NotImplementedError, "Only single local shard supported for reshard." 
): st.reshard(reshard_spec) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharding_plan/test_sharding_plan.py b/test/distributed/_shard/sharding_plan/test_sharding_plan.py new file mode 100644 index 000000000000..e9c907c9d6d4 --- /dev/null +++ b/test/distributed/_shard/sharding_plan/test_sharding_plan.py @@ -0,0 +1,331 @@ + +# Owner(s): ["oncall: distributed"] +import sys +import copy + +import torch +import torch.nn as nn +import torch.distributed as dist +from torch.distributed._shard.sharded_optim import ( + ShardedOptimizer, + named_params_with_sharded_tensor, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard import shard_module +from torch.distributed._shard.sharding_plan import ShardingPlan, ShardingPlanner +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.distributed._shard.sharded_tensor import ShardedTensor + +from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + generate_chunk_sharding_specs_for_test, + generate_local_weight_sharding_params_for_test, +) +from torch.testing._internal.distributed._shard.test_common import SimpleMegatronLM + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +# Example ShardingPlanner that chunks every parameter in the module +# to all available devices defined. +class ChunkAllShardingPlanner(ShardingPlanner): + dim = 0 + devices = [] + + def __init__(self, chunk_dim=0, device_count=0): + self.dim = chunk_dim + self.devices = [f"rank:{i}/cuda:{i}" for i in range(device_count)] + + def build_plan(self, module: nn.Module) -> ShardingPlan: + named_params = module.named_parameters() + plan = {} + for name, param in named_params: + plan[name] = ChunkShardingSpec(self.dim, placements=self.devices) + + return ShardingPlan(plan=plan) + + +class TestShardingPlan(ShardedTensorTestBase): + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharding_plan_simple_megatron(self): + colwise_sharding_spec = generate_chunk_sharding_specs_for_test(0) + rowwise_sharding_spec = generate_chunk_sharding_specs_for_test(1) + for spec in zip(colwise_sharding_spec, rowwise_sharding_spec): + # test each sharding spec pair and see if we can apply sharding + reshard_spec = copy.deepcopy(spec[1]) + reshard_spec.placements.sort(key=lambda placement: placement.rank()) + reshard_spec.dim = 0 + + sharding_plan = ShardingPlan( + plan={ + "fc1.weight": spec[0], + "fc2.weight": spec[1] + }, + output_plan={ + "": reshard_spec + }, + return_local_tensor=[""]) + + # Use same seed. 
+ torch.manual_seed(0) + local_megatron_lm = SimpleMegatronLM([[17, 12], [12, 29]]).cuda(self.rank) + megatron_lm = copy.deepcopy(local_megatron_lm) + + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan) + + # check to make sure the module already been sharded + self.assertTrue(isinstance(megatron_lm.fc1.weight, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc2.weight, ShardedTensor)) + self.assertEqual(megatron_lm.fc1.weight.sharding_spec(), spec[0]) + self.assertEqual(megatron_lm.fc2.weight.sharding_spec(), spec[1]) + + # make sure we can run sharded computation + input = torch.rand(22, 17).cuda(self.rank) + sharded_output = megatron_lm(input) + local_output = local_megatron_lm(input) + + # verify and make sure local and sharded output matches + self.assertEqual(local_output, sharded_output) + + # Compute loss and run backward pass. + local_output.sum().backward() + sharded_output.sum().backward() + ( + local_weight_grad_fc1, + local_weight_grad_fc2, + ) = local_megatron_lm.get_weight_grads() + local_bias_grad_fc1, local_bias_grad_fc2 = local_megatron_lm.get_bias_grads() + + # Verify that weights in both layers and biases in the sharded linear has non-None grad. + ( + sharded_weight_fc1, + sharded_weight_fc2, + ) = megatron_lm.get_weights() + bias_grad_fc1, bias_grad_fc2 = megatron_lm.get_bias_grads() + self.assertNotEqual(sharded_weight_fc1.grad, None) + self.assertNotEqual(sharded_weight_fc2.grad, None) + self.assertNotEqual(bias_grad_fc1, None) + self.assertNotEqual(bias_grad_fc2, None) + + # Shard the local linear's weight grad so that we can compare. + dist.all_reduce(local_weight_grad_fc1) + dist.all_reduce(local_weight_grad_fc2) + dist.all_reduce(local_bias_grad_fc1) + dist.all_reduce(local_bias_grad_fc2) + local_weight_fc1, local_weight_fc2 = local_megatron_lm.get_weights() + ( + start_pos_fc1, + chunk_size_fc1, + ) = generate_local_weight_sharding_params_for_test( + local_weight_fc1, 0, TEST_GPU_NUM, spec[0], self.rank + ) + local_grad_narrowed_fc1 = local_weight_grad_fc1.narrow( + 0, start_pos_fc1, chunk_size_fc1 + ) + ( + start_pos_fc2, + chunk_size_fc2, + ) = generate_local_weight_sharding_params_for_test( + local_weight_fc2, 1, TEST_GPU_NUM, spec[1], self.rank + ) + local_grad_narrowed_fc2 = local_weight_grad_fc2.narrow( + 1, start_pos_fc2, chunk_size_fc2 + ) + + # Test backward gradient calculation. + self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1) + self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2) + self.assertEqual(bias_grad_fc1, local_bias_grad_fc1) + self.assertEqual(bias_grad_fc2, local_bias_grad_fc2) + + # Test optimizer. 
+ bias_fc1, bias_fc2 = megatron_lm.get_biases() + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() + self.assertEqual(bias_fc1, local_bias_fc1) + self.assertEqual(bias_fc2, local_bias_fc2) + self.assertEqual(bias_fc1.grad, local_bias_fc1.grad) + self.assertEqual(bias_fc2.grad, local_bias_fc2.grad) + previous_sharded_weight_fc1 = sharded_weight_fc1.clone() + previous_sharded_weight_fc2 = sharded_weight_fc2.clone() + previous_bias_fc1 = bias_fc1.clone() + previous_bias_fc2 = bias_fc2.clone() + optim = torch.optim.SGD(local_megatron_lm.parameters(), lr=0.1) + optim.step() + sharded_optim = ShardedOptimizer( + dict(named_params_with_sharded_tensor(megatron_lm)), + torch.optim.SGD, + lr=0.1, + ) + sharded_optim.step() + local_weight_fc1_narrowed = local_weight_fc1.narrow( + 0, start_pos_fc1, chunk_size_fc1 + ) + local_weight_fc2_narrowed = local_weight_fc2.narrow( + 1, start_pos_fc2, chunk_size_fc2 + ) + + # Test weight value after optimizer. + self.assertEqual(sharded_weight_fc1.size(), local_weight_fc1_narrowed.size()) + self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size()) + self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1) + self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2) + self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed) + self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed) + + # Test bias value after optimizer. + local_bias_fc1, local_bias_fc2 = local_megatron_lm.get_biases() + self.assertNotEqual(previous_bias_fc1, bias_fc1) + self.assertEqual(bias_fc1, local_bias_fc1) + self.assertNotEqual(previous_bias_fc2, bias_fc2) + self.assertEqual(bias_fc2, local_bias_fc2) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_reshard_to_ddp_sharding_plan(self): + colwise_sharding_spec = generate_chunk_sharding_specs_for_test(0)[0] + rowwise_sharding_spec = generate_chunk_sharding_specs_for_test(1)[0] + + # test each sharding spec pair and see if we can apply sharding + output_spec = copy.deepcopy(rowwise_sharding_spec) + output_spec.placements.sort(key=lambda placement: placement.rank()) + output_spec.dim = 0 + + # new module with megatron as submodule + class MyModule(nn.Module): + def __init__(self, rank=None): + super().__init__() + self.megatron = SimpleMegatronLM([[17, 12], [12, 29]], rank=rank) + self.relu = nn.ReLU() + + def forward(self, input): + return self.relu(self.megatron(input)) + + sharding_plan = ShardingPlan( + plan={ + "megatron.fc1.weight": colwise_sharding_spec, + "megatron.fc2.weight": rowwise_sharding_spec, + }, + output_plan={ + "megatron": output_spec + }, + return_local_tensor=[ + "megatron" + ] + ) + + # Use same seed. 
+ torch.manual_seed(0) + local_module = MyModule().cuda(self.rank) + sharded_module = copy.deepcopy(local_module) + + # shard the module with the provided sharding plan + shard_module(sharded_module, sharding_plan) + + # check to make sure the module already been sharded + self.assertTrue(isinstance(sharded_module.megatron.fc1.weight, ShardedTensor)) + self.assertTrue(isinstance(sharded_module.megatron.fc2.weight, ShardedTensor)) + self.assertEqual(sharded_module.megatron.fc1.weight.sharding_spec(), colwise_sharding_spec) + self.assertEqual(sharded_module.megatron.fc2.weight.sharding_spec(), rowwise_sharding_spec) + + # make sure we can run sharded computation + input = torch.rand(22, 17).cuda(self.rank) + sharded_output = sharded_module(input) + local_output = local_module(input) + + # verify and make sure local and sharded output matches + self.assertEqual(local_output, sharded_output) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_sharding_plan_errors(self): + rowwise_sharding_spec = generate_chunk_sharding_specs_for_test(1)[0] + sharding_plan_wrong_plan = ShardingPlan( + plan={ + "fc1.weight": torch.randn(3, 4), + }, + output_plan={ + "": rowwise_sharding_spec + }, + ) + + megatron_lm = SimpleMegatronLM([[17, 12], [12, 29]]).cuda(self.rank) + + with self.assertRaisesRegex( + TypeError, "Only `ShardingSpec` and `Sharder` are supported to shard" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_plan) + + sharding_plan_wrong_output_plan = ShardingPlan( + plan={ + "fc1.weight": rowwise_sharding_spec, + }, + output_plan={ + "": torch.randn(3, 4) + }, + ) + + with self.assertRaisesRegex( + TypeError, "Only `ShardingSpec` is supported as output_plan" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_output_plan) + + sharding_plan_wrong_module_path = ShardingPlan( + plan={ + "fc3.weight": rowwise_sharding_spec, + }, + ) + with self.assertRaisesRegex( + AttributeError, "has no attribute" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_module_path) + + sharding_plan_wrong_param_path = ShardingPlan( + plan={ + "fc1.biass": rowwise_sharding_spec, + }, + ) + with self.assertRaisesRegex( + AttributeError, "has no attribute" + ): + # shard the module with the provided sharding plan + shard_module(megatron_lm, sharding_plan_wrong_param_path) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_custom_sharding_planner(self): + megatron_lm = SimpleMegatronLM([[17, 12], [12, 29]], rank=self.rank).cuda( + self.rank + ) + planner = ChunkAllShardingPlanner(device_count=TEST_GPU_NUM) + sharding_plan = planner.build_plan(megatron_lm) + + shard_module(megatron_lm, sharding_plan) + + # check to make sure the module already been sharded + self.assertTrue(isinstance(megatron_lm.fc1.weight, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc2.weight, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc1.bias, ShardedTensor)) + self.assertTrue(isinstance(megatron_lm.fc2.bias, ShardedTensor)) diff --git a/test/distributed/_shard/sharding_spec/test_sharding_spec.py b/test/distributed/_shard/sharding_spec/test_sharding_spec.py index d760b5499fd8..a0e13d80d93e 100644 --- a/test/distributed/_shard/sharding_spec/test_sharding_spec.py +++ b/test/distributed/_shard/sharding_spec/test_sharding_spec.py @@ -1,12 +1,27 @@ # Owner(s): ["oncall: 
distributed"] +from typing import List, Union +from dataclasses import dataclass +import copy import torch from torch.testing._internal.common_utils import TestCase +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard import sharded_tensor, _shard_tensor from torch.distributed._shard.sharding_spec import ( + ShardingSpec, ChunkShardingSpec, DevicePlacementSpec, EnumerableShardingSpec, ShardMetadata, + _infer_sharding_spec_from_shards_metadata, +) +from torch.distributed._shard.sharded_tensor import ( + TensorProperties, + ShardedTensor, + ShardedTensorMetadata, ) from torch.distributed._shard.sharding_spec._internals import ( check_tensor, @@ -19,6 +34,13 @@ run_tests, sandcastle_skip_if, ) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + _chunk_sharding_specs_list_for_test, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) class TestShardingSpec(TestCase): @@ -46,18 +68,21 @@ def test_device_placement(self): def test_chunked_sharding_spec(self): # Test valid specs. ChunkShardingSpec(0, [torch.device(0), torch.device(1)]) - # Named dimension. - ChunkShardingSpec("N", ["cuda:0", "cuda:1"]) ChunkShardingSpec(0, [torch.device("cuda:0"), torch.device("cuda:1")]) ChunkShardingSpec(-1, ["cuda:0", "cuda:1"]) ChunkShardingSpec(0, ["rank:0/cuda:0", "rank:0/cuda:1"]) ChunkShardingSpec(0, ["rank:0", "rank:1"]) ChunkShardingSpec(0, ["rank:0/cpu", "rank:1/cpu"]) + # Test unimplemented error + with self.assertRaisesRegex(NotImplementedError, "not support named dimension"): + # Named dimension. + ChunkShardingSpec("N", ["cuda:0", "cuda:1"]) + # Test invalid specs - with self.assertRaisesRegex(ValueError, "int or str"): + with self.assertRaisesRegex(ValueError, "needs to be an integer"): ChunkShardingSpec(None, ["cuda:0", "cuda:1"]) - with self.assertRaisesRegex(ValueError, "int or str"): + with self.assertRaisesRegex(ValueError, "needs to be an integer"): ChunkShardingSpec({}, ["cuda:0", "cuda:1"]) with self.assertRaisesRegex(ValueError, "Could not parse remote_device"): ChunkShardingSpec(0, ["random:0", "cuda:1"]) @@ -276,5 +301,224 @@ def test_get_chunk_sharding_params(self): self.assertEqual(0, result[0]) self.assertEqual(6, result[1]) + def _infer_enum_sharding_spec_case(self): + shards_metadata = [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[10, 5], + placement="cuda:1", + ) + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + shards_metadata = [ + ShardMetadata( + shard_offsets=[0], + shard_sizes=[16], + placement="cuda:0", + ), + ShardMetadata( + shard_offsets=[16], + shard_sizes=[9], + placement="cuda:1", + ) + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + shards_metadata = [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[0, 5], + shard_sizes=[5, 5], + placement="rank:2/cuda:2", + ), + ShardMetadata( + shard_offsets=[5, 5], + shard_sizes=[5, 5], + 
placement="rank:3/cuda:3", + ), + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + def _infer_chunk_sharding_spec_case(self, placements, sharding_dim, st_size): + world_size = len(placements) + split_size = get_split_size(st_size[sharding_dim], world_size) + shards_metadata = [None] * world_size + for idx, placement in enumerate(placements): + shard_size = copy.deepcopy(st_size) + offsets = [0] * len(st_size) + offsets[sharding_dim] = split_size * idx + shard_size[sharding_dim] = get_chunked_dim_size(st_size[sharding_dim], split_size, idx) + shards_metadata[placement.rank()] = ShardMetadata( + shard_offsets=offsets, + shard_sizes=shard_size, + placement=placement, + ) + + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, ChunkShardingSpec)) + self.assertEqual(spec.dim, sharding_dim) + self.assertEqual(spec.placements, placements) + + def test_infer_sharding_spec_from_shards_metadata(self): + self._infer_enum_sharding_spec_case() + chunk_specs = _chunk_sharding_specs_list_for_test([0, 0, 1, 1], seed=31) + for spec in chunk_specs: + self._infer_chunk_sharding_spec_case(spec.placements, 0, [4, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 0, [5, 15, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 1, [12, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 2, [4, 18, 15]) + self._infer_chunk_sharding_spec_case(spec.placements, 3, [7, 12, 16, 37]) + self._infer_chunk_sharding_spec_case(spec.placements, 4, [50, 4, 18, 15, 77]) + +# Custom ShardingSpec, an simple example to do grid sharding +@dataclass +class GridShardingSpec(ShardingSpec): + grid_size: int + placements: List[Union[torch.distributed._remote_device, str]] + + def __post_init__(self): + for i, remote_device in enumerate(self.placements): + if not isinstance(remote_device, torch.distributed._remote_device): + self.placements[i] = torch.distributed._remote_device(remote_device) + + def build_metadata(self, + tensor_sizes: torch.Size, + tensor_properties: TensorProperties, + ) -> ShardedTensorMetadata: + tensor_num_dim = len(tensor_sizes) + assert tensor_num_dim == 2, "only support 2-dim tensor for grid sharding" + shards_metadata = [] + + def chunk_num(dim_size, grid_size): + assert dim_size % grid_size == 0, "only support dim_size mod grid_size == 0" + return dim_size // grid_size + + row_chunks = chunk_num(tensor_sizes[0], self.grid_size) + col_chunks = chunk_num(tensor_sizes[1], self.grid_size) + + assert row_chunks * col_chunks == len(self.placements) + for row_idx in range(row_chunks): + for col_idx in range(col_chunks): + shards_metadata.append( + ShardMetadata( + shard_offsets=[row_idx * self.grid_size, col_idx * self.grid_size], + shard_sizes=[self.grid_size, self.grid_size], + placement=self.placements[row_idx * row_chunks + col_idx] + ) + ) + return ShardedTensorMetadata( + shards_metadata=shards_metadata, + size=tensor_sizes, + tensor_properties=tensor_properties + ) + + + def shard(self, + tensor: torch.Tensor, + src_rank: int = 0, + process_group=None) -> ShardedTensor: + + raise NotImplementedError("GridShardingSpec.shard not implemented yet!") + +class TestCustomShardingSpec(ShardedTensorTestBase): + def test_custom_sharding_spec(self): + ranks = [ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ] + + grid_spec = GridShardingSpec( + grid_size=4, + placements=ranks + ) + + 
tensor_properties = TensorProperties( + dtype=torch.get_default_dtype(), + layout=torch.strided, + requires_grad=False, + memory_format=torch.contiguous_format, + pin_memory=False, + ) + + meta = grid_spec.build_metadata(torch.Size((8, 8)), tensor_properties) + check_tensor(meta.shards_metadata, torch.Size((8, 8))) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_custom_sharding_spec_tensor_ctor(self): + """ Test sharded_tensor.ones(...) with the custom + grid sharding spec. + """ + + ranks = [ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ] + + grid_spec = GridShardingSpec( + grid_size=2, + placements=ranks + ) + + st = sharded_tensor.ones(grid_spec, 4, 4) + + # Validate local shard is initialized with torch.ones + local_shards = st.local_shards() + self.assertEqual(1, len(local_shards)) + local_shard = local_shards[0].tensor + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.device) + self.assertEqual((2, 2), local_shard.size()) + self.assertEqual(local_shard, torch.ones(2, 2)) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_custom_sharding_spec_shard_tensor(self): + """ Test custom spec can be invoked from the + _shard_tensor callsite. + """ + + ranks = [ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ] + + grid_spec = GridShardingSpec( + grid_size=2, + placements=ranks + ) + + with self.assertRaisesRegex(NotImplementedError, 'not implemented'): + _shard_tensor(torch.randn(8, 8), grid_spec) + + if __name__ == '__main__': run_tests() diff --git a/test/distributed/_shard/test_partial_tensor.py b/test/distributed/_shard/test_partial_tensor.py new file mode 100644 index 000000000000..fd0b58a4aabb --- /dev/null +++ b/test/distributed/_shard/test_partial_tensor.py @@ -0,0 +1,174 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +import torch.distributed as dist +from torch.distributed._shard.partial_tensor import ( + _PartialTensor, +) +from torch.distributed._shard.sharding_spec import ( + EnumerableShardingSpec, + ShardMetadata, +) +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, + TEST_GPU_NUM +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + _chunk_sharding_specs_list_for_test, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestPartialTensorReshard(ShardedTensorTestBase): + def _run_partial_tensor_n_reshard( + self, reshard_spec, input_size, world_size, reduce_op, dtype=torch.float + ): + results_compare = [] + local_result = [] + pg = dist.distributed_c10d._get_default_group() + for rank in range(pg.size()): + torch.manual_seed(rank) + results = [] + for _ in range(world_size): + tensor = torch.rand(*input_size, dtype=dtype).cuda(self.rank) + results.append(tensor) + if self.rank == rank: + local_result.append(tensor.clone().detach()) + results_compare.append(torch.cat(results)) + parital_tensor = _PartialTensor( + torch.cat(local_result), pg, reduce_op=reduce_op + ) + local_sharded_result = parital_tensor.reshard(reshard_spec) + local_shards = local_sharded_result.local_shards() + results_compare = torch.stack(results_compare) + if reduce_op == 
dist.ReduceOp.SUM: + results_compare = torch.sum(results_compare, dim=0) + else: + results_compare = torch.max(results_compare, dim=0).values + rank_idx = None + for idx, placement in enumerate(reshard_spec.placements): + if placement.rank() == self.rank: + rank_idx = idx + local_result_compare = results_compare.chunk(pg.size())[rank_idx] + self.assertEqual(1, len(local_shards)) + self.assertEqual(local_shards[0].tensor, local_result_compare) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_partial_tensor_reshard(self): + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + spec = specs[0] + self._run_partial_tensor_n_reshard(spec, [13, 21], 4, dist.ReduceOp.SUM) + self._run_partial_tensor_n_reshard(spec, [12, 22], 4, dist.ReduceOp.MAX) + self._run_partial_tensor_n_reshard(spec, [13, 21], 3, dist.ReduceOp.SUM) + self._run_partial_tensor_n_reshard(spec, [17, 21], 2, dist.ReduceOp.MAX) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_partial_tensor_reshard_errors(self): + enumerable_sharding_spec = EnumerableShardingSpec( + [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ] + ) + with self.assertRaisesRegex( + NotImplementedError, "Only ChunkShardingSpec supported for reshard." + ): + self._run_partial_tensor_n_reshard( + enumerable_sharding_spec, [13, 21], 4, dist.ReduceOp.SUM + ) + self._run_partial_tensor_n_reshard( + enumerable_sharding_spec, [12, 22], 4, dist.ReduceOp.MAX + ) + specs = _chunk_sharding_specs_list_for_test([0], seed=7) + spec = specs[0] + with self.assertRaisesRegex( + NotImplementedError, "Only real partial tensor supported for reshard." 
+ ): + self._run_partial_tensor_n_reshard( + spec, [13, 21], 4, dist.ReduceOp.SUM, dtype=torch.cfloat + ) + self._run_partial_tensor_n_reshard( + spec, [12, 22], 4, dist.ReduceOp.MAX, dtype=torch.cfloat + ) + +class TestPartialTensorOps(ShardedTensorTestBase): + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_transpose(self): + partial_tensor = _PartialTensor(torch.rand(5, 10)) + partial_tensor = partial_tensor.transpose(0, 1) + self.assertEqual(partial_tensor.size(), torch.Size((10, 5))) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_cat(self): + t1 = torch.rand(5, 10) + t2 = torch.rand(3, 10) + t3 = torch.rand(4, 10) + partial_tensors = [_PartialTensor(t1), _PartialTensor(t2), _PartialTensor(t3)] + partial_concat = torch.cat(partial_tensors) + local_concat = torch.cat([t1, t2, t3]) + self.assertEqual(local_concat.size(), partial_concat.size()) + + # Test dim kwarg + t1 = torch.rand(5, 10) + t2 = torch.rand(5, 12) + t3 = torch.rand(5, 11) + partial_tensors = [_PartialTensor(t1), _PartialTensor(t2), _PartialTensor(t3)] + partial_concat = torch.cat(partial_tensors, dim=1) + local_concat = torch.cat([t1, t2, t3], dim=1) + self.assertEqual(local_concat.size(), partial_concat.size()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_cat_errors(self): + with self.assertRaisesRegex( + RuntimeError, 'All inputs need to be an instance of _PartialTensor' + ): + torch.cat([_PartialTensor(torch.rand(10)), torch.rand(10)]) + + with self.assertRaisesRegex( + RuntimeError, 'reduce_ops need to be the same' + ): + torch.cat([_PartialTensor(torch.rand(10)), _PartialTensor(torch.rand(10), reduce_op=dist.ReduceOp.MAX)]) + + with self.assertRaisesRegex( + RuntimeError, '"out" kwarg is not supported' + ): + torch.cat([_PartialTensor(torch.rand(10)), _PartialTensor(torch.rand(10))], out=torch.rand(10)) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/test_replicated_tensor.py b/test/distributed/_shard/test_replicated_tensor.py new file mode 100644 index 000000000000..9dfdd8703588 --- /dev/null +++ b/test/distributed/_shard/test_replicated_tensor.py @@ -0,0 +1,336 @@ +# Owner(s): ["oncall: distributed"] +import io + +import torch +import torch.distributed._shard.sharded_tensor as sharded_tensor + +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +from torch.distributed._shard import _shard_tensor +from torch.distributed._shard.replicated_tensor import ReplicatedTensor +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) + +from torch.testing._internal.distributed._shard.sharded_tensor import ( + ShardedTensorTestBase, + with_comms, +) +from torch.testing._internal.distributed._shard.sharded_tensor._test_ops_common import ( + gen_binary_op_func +) +from torch.testing._internal.distributed._shard.sharded_tensor import TEST_GPU_NUM + + +class TestReplicatedTensor(ShardedTensorTestBase): + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_basics(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4 + replica_tensor = ReplicatedTensor(local_tensor) + # validate it's a replicated tensor by checking values on all rank + validated = replica_tensor.validate() + self.assertEqual(validated, True) + res = 
replica_tensor + 2 + self.assertIsInstance(res, torch.Tensor) + self.assertNotIsInstance(res, ReplicatedTensor) + self.assertEqual(res, torch.ones(3, 3) * 6) + + # modify local tensor on certain rank, and test if validation raise + if self.rank == 2: + local_tensor += 3 + + with self.assertRaisesRegex(ValueError, 'have different values'): + replica_tensor.validate() + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_replicated_tensor(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") + replica_tensor1 = ReplicatedTensor(local_tensor * 4) + replica_tensor2 = ReplicatedTensor(local_tensor * 6) + + new_tensor = replica_tensor1 * replica_tensor2 + self.assertIsInstance(new_tensor, ReplicatedTensor) + self.assertEqual(new_tensor, torch.ones(3, 3) * 24) + + # test replicated tensor inter-op with different pgs + new_pg = dist.new_group(ranks=[1, 2, 3]) + replica_tensor_new_group = ReplicatedTensor(local_tensor * 3, process_group=new_pg) + + with self.assertRaisesRegex(RuntimeError, 'must be in the same'): + replica_tensor_new_group * replica_tensor1 + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_tensor(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4 + replica_tensor = ReplicatedTensor(local_tensor) + + local_rand_tensor = torch.randn(3, 3, device=f"cuda:{self.rank}") + + new_tensor = replica_tensor + local_rand_tensor + self.assertIsInstance(new_tensor, torch.Tensor) + self.assertNotIsInstance(new_tensor, ReplicatedTensor) + + self.assertEqual(new_tensor, local_tensor + local_rand_tensor) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_sharded_tensor(self): + torch.manual_seed(self.rank) + + local_tensor1 = torch.rand(12, 3, device=f"cuda:{self.rank}") * 4 + local_tensor2 = torch.ones(12, 3, device=f"cuda:{self.rank}") * 4 + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st = _shard_tensor(local_tensor1, spec, src_rank=0) + replica_tensor = ReplicatedTensor(local_tensor2) + + ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"] + + for op in ops: + binary_op = gen_binary_op_func(op) + res = binary_op(st, replica_tensor) + self.assertIsInstance(res, sharded_tensor.ShardedTensor) + self.assertNotIsInstance(res, ReplicatedTensor) + output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None + res.gather(dst=0, out=output) + + if self.rank == 0: + local_output = binary_op(local_tensor1, local_tensor2) + self.assertEqual(output, local_output) + + # reflective + reflect_res = binary_op(replica_tensor, st) + self.assertIsInstance(reflect_res, sharded_tensor.ShardedTensor) + self.assertNotIsInstance(reflect_res, ReplicatedTensor) + reflect_output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None + reflect_res.gather(dst=0, out=reflect_output) + + if self.rank == 0: + reflect_local_output = binary_op(local_tensor2, local_tensor1) + self.assertEqual(reflect_output, reflect_local_output) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_implicit_broadcasting(self): + # use same seed + torch.manual_seed(self.rank) + + # test implicit broadcasting + local_tensor1 = torch.rand(12, 3, device=f"cuda:{self.rank}") * 4 + # we use size 
(3) to trigger the implicit broadcasting logic + # and it will fail if implicit broadcasting not happen. + local_tensor2 = torch.ones(3, device=f"cuda:{self.rank}") + + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st = _shard_tensor(local_tensor1, spec, src_rank=0) + replica_tensor = ReplicatedTensor(local_tensor2) + + ops = ["torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*", "/"] + + for op in ops: + binary_op = gen_binary_op_func(op) + # replicated tensor should automatically broadcasted + res = binary_op(st, replica_tensor) + + self.assertIsInstance(res, sharded_tensor.ShardedTensor) + output = torch.empty((12, 3), device=self.rank) if self.rank == 0 else None + res.gather(dst=0, out=output) + + if self.rank == 0: + local_output = binary_op(local_tensor1, local_tensor2) + self.assertEqual(output, local_output) + + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_replicated_tensor_inter_op_sharded_tensor_errors(self): + local_tensor = torch.ones(3, 3, device=f"cuda:{self.rank}") * 4 + replica_tensor = ReplicatedTensor(local_tensor) + + torch.manual_seed(self.rank) + spec = ChunkShardingSpec( + dim=0, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + "rank:2/cuda:2", + "rank:3/cuda:3", + ], + ) + + st1 = sharded_tensor.rand(spec, (20, 3, 3)) + st2 = sharded_tensor.rand(spec, (30, 3, 3)) + + with self.assertRaisesRegex(RuntimeError, 'Implicit broadcasting'): + st1 + st2 + + with self.assertRaisesRegex(RuntimeError, 'not supported for ShardedTensor'): + st1 % replica_tensor + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_with_ddp(self): + # Test Replicated params for DDP + replica_tensor = ReplicatedTensor(torch.rand(4, 8, device=self.rank)) + model = torch.nn.Linear(8, 2).cuda(self.rank) + optim = torch.optim.SGD(model.parameters(), lr=0.1) + ddp = DDP(model) + + # Test module.parameters. + params = list(ddp.parameters()) + self.assertEqual(2, len(params)) + self.assertEqual(ddp.module.weight, params[0]) + self.assertEqual(ddp.module.bias, params[1]) + + params = list(model.parameters()) + self.assertEqual(2, len(params)) + self.assertEqual(model.weight, params[0]) + self.assertEqual(model.bias, params[1]) + + # Validate output + out = ddp(replica_tensor) + self.assertIsInstance(out, ReplicatedTensor) + + # Test backward and optimizer. + + # Validate backward. + out.sum().backward() + self.assertIsNotNone(model.weight.grad) + self.assertIsNotNone(model.bias.grad) + self.assertIsNotNone(ddp.module.weight.grad) + self.assertIsNotNone(ddp.module.bias.grad) + + original_params = [] + for param_group in optim.param_groups: + for original_param in param_group['params']: + self.assertIsNotNone(original_param.grad) + original_params.append(original_param) + + self.assertEqual(model.weight.grad, original_params[0].grad) + self.assertEqual(model.bias.grad, original_params[1].grad) + self.assertEqual(model.weight.grad, ddp.module.weight.grad) + self.assertEqual(model.bias.grad, ddp.module.bias.grad) + + # Validate optimizer. 
+ optim.step() + self.assertEqual(model.weight, ddp.module.weight) + self.assertEqual(model.weight, original_params[0]) + + self.assertEqual(model.bias, ddp.module.bias) + self.assertEqual(model.bias, original_params[1]) + + # Validate zero_grad + optim.zero_grad() + self.assertEqual(model.weight.grad, torch.zeros_like(model.weight.grad)) + self.assertEqual(model.weight.grad, ddp.module.weight.grad) + self.assertEqual(model.weight.grad, original_params[0].grad) + + self.assertEqual(model.bias.grad, torch.zeros_like(model.bias.grad)) + self.assertEqual(model.bias.grad, ddp.module.bias.grad) + self.assertEqual(model.bias.grad, original_params[1].grad) + + # Validate zero_grad set_to_none + optim.zero_grad(set_to_none=True) + self.assertIsNone(model.weight.grad) + self.assertEqual(model.weight.grad, ddp.module.weight.grad) + self.assertEqual(model.weight.grad, original_params[0].grad) + + self.assertIsNone(model.bias.grad) + self.assertEqual(model.bias.grad, ddp.module.bias.grad) + self.assertEqual(model.bias.grad, original_params[1].grad) + + # Multiple forward passes. + for _ in range(5): + out = ddp(replica_tensor) + self.assertIsInstance(out, ReplicatedTensor) + + # Test with context manager. + from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor + with _ddp_replicated_tensor(False): + for _ in range(5): + with _ddp_replicated_tensor(True): + ddp = DDP(model) + out = ddp(replica_tensor) + self.assertIsInstance(out, ReplicatedTensor) + + # Test save and load. + with _ddp_replicated_tensor(False): + ddp = DDP(model) + expected_state_dict = ddp.state_dict() + buffer = io.BytesIO() + torch.save(ddp, buffer) + + buffer.seek(0) + obj = torch.load(buffer) + self.assertEqual(expected_state_dict, obj.state_dict()) + + with _ddp_replicated_tensor(True): + ddp = DDP(model) + buffer = io.BytesIO() + torch.save(ddp, buffer) + + buffer.seek(0) + obj = torch.load(buffer) + self.assertEqual(expected_state_dict, obj.state_dict()) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_unsqueeze(self): + local_tensor = torch.rand(3, 3, device=self.rank) + replicated_tensor = ReplicatedTensor(local_tensor) + + unsqueezed_replicated_tensor = replicated_tensor.unsqueeze(0) + unsqueezed_local_tensor = local_tensor.unsqueeze(0) + + self.assertIsInstance(unsqueezed_replicated_tensor, ReplicatedTensor) + self.assertIsInstance(torch.unsqueeze(replicated_tensor, 0), ReplicatedTensor) + self.assertEqual(unsqueezed_local_tensor, unsqueezed_replicated_tensor) + self.assertEqual(torch.unsqueeze(replicated_tensor, 0), unsqueezed_replicated_tensor) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_getitem(self): + local_tensor = torch.rand(3, 3, device=self.rank) + replicated_tensor = ReplicatedTensor(local_tensor) + + replicated_tensor_view = replicated_tensor[0] + local_tensor_view = local_tensor[0] + + self.assertIsInstance(replicated_tensor_view, ReplicatedTensor) + self.assertEqual(local_tensor_view, replicated_tensor_view) diff --git a/test/distributed/_shard/test_sharder.py b/test/distributed/_shard/test_sharder.py new file mode 100644 index 000000000000..d6e0b799ec6e --- /dev/null +++ b/test/distributed/_shard/test_sharder.py @@ -0,0 +1,165 @@ + +# Owner(s): ["oncall: distributed"] +import sys +import copy + +import torch +import torch.nn as nn +from torch.testing._internal.common_distributed import ( + requires_nccl, + skip_if_lt_x_gpu, +) +from torch.distributed._shard import shard_module +from 
torch.distributed._shard.sharding_plan import ShardingPlan +from torch.distributed._shard.sharder import Sharder +from torch.distributed._shard.sharding_spec import ChunkShardingSpec +from torch.distributed._shard.sharded_tensor import ShardedTensor + +from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.distributed._shard.sharded_tensor import ( + TEST_GPU_NUM, + ShardedTensorTestBase, + with_comms, +) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +# a simple collection of embedding bag implementation +class CustomEmbeddingBagCollection(nn.Module): + def __init__(self, num_bags, num_embeddings_per_bag, num_dims): + super().__init__() + self.num_bags = num_bags + self.embedding_bags: nn.ModuleDict = nn.ModuleDict() + + for i in range(num_bags): + self.embedding_bags[f"embedding_bag_{i}"] = nn.EmbeddingBag( + num_embeddings_per_bag, + num_dims, + mode="sum") + + def forward(self, inputs): + outputs = [] + for bag in self.embedding_bags.values(): + outputs.append(bag(inputs)) + return torch.cat(outputs) + +# a simple sharded version of EBC +class CustomShardedEBC(nn.Module): + def __init__(self, ebc, split_idx, specs): + super().__init__() + self.split_idx = split_idx + row_spec, col_spec = specs + + # create embedding bags base on the spec + self.embedding_bags: nn.ModuleDict = nn.ModuleDict() + + assert self.split_idx < ebc.num_bags + for i in range(ebc.num_bags): + bag_key = f"embedding_bag_{i}" + if i < self.split_idx: + shard_module(ebc, plan=ShardingPlan(plan={f"embedding_bags.{bag_key}.weight": row_spec})) + else: + shard_module(ebc, plan=ShardingPlan(plan={f"embedding_bags.{bag_key}.weight": col_spec})) + + self.embedding_bags[bag_key] = ebc.embedding_bags[bag_key] + + +class CustomSharder(Sharder): + def __init__(self, devices, split_sharding_idx): + self.devices = devices + self.split_sharding_idx = split_sharding_idx + self.rowwise_spec = ChunkShardingSpec(dim=0, placements=devices) + self.colwise_spec = ChunkShardingSpec(dim=1, placements=devices) + + def shard(self, ebc: nn.Module) -> nn.Module: + if not isinstance(ebc, CustomEmbeddingBagCollection): + raise RuntimeError("The custom sharder only supports CustomEmbeddingBagCollection") + + return CustomShardedEBC(ebc, self.split_sharding_idx, (self.rowwise_spec, self.colwise_spec)) + + +class TestCustomSharder(ShardedTensorTestBase): + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_custom_sharder(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.ebc = CustomEmbeddingBagCollection(10, 10, 8) + + def forward(self, inputs): + return self.ebc(inputs) + + custom_sharder = CustomSharder( + devices=[f"rank:{i}/cuda:{i}" for i in range(TEST_GPU_NUM)], + split_sharding_idx=TEST_GPU_NUM // 2 + ) + + sharding_plan = ShardingPlan( + plan={ + "ebc": custom_sharder, + }) + + local_model = MyModule().cuda(self.rank) + sharded_model = copy.deepcopy(local_model) + + # shard the module with the provided sharding plan + shard_module(sharded_model, sharding_plan) + + # check to make sure the module already been sharded + emb_bags = sharded_model.ebc.embedding_bags + self.assertTrue(isinstance(emb_bags["embedding_bag_0"].weight, ShardedTensor)) + self.assertTrue(isinstance(emb_bags["embedding_bag_9"].weight, ShardedTensor)) + self.assertEqual(emb_bags["embedding_bag_0"].weight.sharding_spec(), 
custom_sharder.rowwise_spec) + self.assertEqual(emb_bags["embedding_bag_9"].weight.sharding_spec(), custom_sharder.colwise_spec) + + # make sure we can run sharded computation and compare outputs + # with the local model version + input = torch.arange(8).reshape((2, 4)).cuda(self.rank) + local_output = local_model(input) + sharded_output = sharded_model(input) + + self.assertEqual(local_output, sharded_output) + + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(TEST_GPU_NUM) + @requires_nccl() + def test_custom_sharder_errors(self): + custom_sharder = CustomSharder( + devices=[f"rank:{i}/cuda:{i}" for i in range(TEST_GPU_NUM)], + split_sharding_idx=TEST_GPU_NUM // 2 + ) + + sharding_plan = ShardingPlan( + plan={ + "": custom_sharder, + }) + + sharded_model = CustomEmbeddingBagCollection(10, 10, 8).cuda(self.rank) + + with self.assertRaisesRegex( + KeyError, "path must not be empty for custom sharder!" + ): + # shard the module with the provided sharding plan + shard_module(sharded_model, sharding_plan) + + # test conflicted sharding plan + spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:0", "rank:1/cuda:1"]) + sharding_plan = ShardingPlan( + plan={ + "embedding_bags.embedding_bag_0.weight": spec, + "embedding_bags": custom_sharder, + }) + + with self.assertRaisesRegex( + RuntimeError, "should not conflict with the submodule tree" + ): + # shard the module with the provided sharding plan + shard_module(sharded_model, sharding_plan) diff --git a/test/distributed/elastic/agent/server/test/api_test.py b/test/distributed/elastic/agent/server/test/api_test.py index 7388c55db197..9f382a97f6ab 100644 --- a/test/distributed/elastic/agent/server/test/api_test.py +++ b/test/distributed/elastic/agent/server/test/api_test.py @@ -12,7 +12,7 @@ import unittest import uuid from typing import Any, Dict -from unittest.mock import call, patch +from unittest.mock import call, patch, MagicMock import torch.distributed.elastic.rendezvous.registry as rdzv_registry from torch.distributed.elastic.agent.server.api import ( @@ -497,8 +497,8 @@ def verify_worker_ranks( ) self.assertEqual(expected_role_ranks, [worker.role_rank for worker in workers]) - @patch("torch.distributed.elastic.utils.store.get_all") - def test_share_and_gather(self, store_mock): + @patch("torch.distributed.elastic.utils.store.synchronize") + def test_share_and_gather(self, sync_mock): # when the state is unknown we exit immediately; no retries spec = self._get_worker_spec(max_restarts=100, monitor_interval=0.1) agent = TestAgent(spec) @@ -508,26 +508,15 @@ def test_share_and_gather(self, store_mock): _RoleInstanceInfo("validator", 2, 10), ] - store_mock.return_value = [obj.serialize() for obj in expected_agent_infos] - - class DummyStore: - def __init__(self): - self.key = None - self.value = None - - def set(self, key, value): - self.key = key - self.value = value - - def set_timeout(self, timeout): - pass - - store = DummyStore() - agent._share_and_gather(store, 1, 3, spec) - self.assertEquals("torchelastic/role_info1", store.key) - expected_info = _RoleInstanceInfo(spec.role, 1, spec.local_world_size) - self.assertEquals(expected_info.serialize(), store.value) - store_mock.assert_called_once() + sync_mock.return_value = [obj.serialize() for obj in expected_agent_infos] + result = agent._share_and_gather(MagicMock(), 1, 3, spec) + sync_mock.assert_called_once() + for expected_role_info, actual_role_info in zip(expected_agent_infos, result): + self.assertEqual(expected_role_info.role, actual_role_info.role) + 
self.assertEqual(expected_role_info.rank, actual_role_info.rank) + self.assertEqual( + expected_role_info.local_world_size, actual_role_info.local_world_size + ) def test_get_event(self): spec = self._get_worker_spec(max_restarts=1) diff --git a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py index a931f3ef1d4e..9c5a39505490 100644 --- a/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py +++ b/test/distributed/elastic/agent/server/test/local_elastic_agent_test.py @@ -38,8 +38,8 @@ from torch.distributed.rpc.backend_registry import BackendType from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, - sandcastle_skip_if, TEST_WITH_TSAN, + sandcastle_skip_if, ) @@ -170,11 +170,26 @@ def _check_env_function(): "TORCHELASTIC_MAX_RESTARTS", "TORCHELASTIC_RUN_ID", "TORCHELASTIC_USE_AGENT_STORE", + "NCCL_ASYNC_ERROR_HANDLING", ] for var in env_vars: _ = os.environ[var] +def _check_env_value(key: str, expected: str): + # checks if the env var ``key`` matches ``value`` + # this function is intended to be used as the entrypoint to the elastic run + if key not in os.environ: + raise RuntimeError(f"Environment variable {key} not found in os.environ") + else: + actual = os.getenv(key) + if expected != actual: + raise RuntimeError( + f"os.environ['{key}']={actual}" + f" does not equal the expected value: {expected}" + ) + + def acquire_available_port(): """ Uses sockets to acquire an available port from the os for use. @@ -184,10 +199,7 @@ def acquire_available_port(): the port as quickly as possible. """ addrs = socket.getaddrinfo( - host="localhost", - port=None, - family=socket.AF_UNSPEC, - type=socket.SOCK_STREAM + host="localhost", port=None, family=socket.AF_UNSPEC, type=socket.SOCK_STREAM ) for addr in addrs: @@ -398,7 +410,6 @@ def run_test_with_backend(self, backend: str, test_to_run: Callable): test_to_run() - def dummy_compute(self): res = self.run_agent(Conf(entrypoint=dummy_compute, local_world_size=2)) self.assertFalse(res.is_failed()) @@ -406,21 +417,15 @@ def dummy_compute(self): self.assertIsInstance(return_value, torch.Tensor) self.assertEqual((100, 100), return_value.shape) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_dummy_compute_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.dummy_compute) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_dummy_compute_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.dummy_compute) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_dummy_compute_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.dummy_compute) @@ -430,23 +435,19 @@ def run_happy_function(self): self.assertIsNone(res.return_values[0]) self.assertIsNone(res.return_values[1]) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_happy_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_happy_function) - 
@sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_happy_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_happy_function) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_happy_function_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_happy_function) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_happy_function + ) def check_master_addr_port_override(self): master_addr = "test_host" @@ -463,17 +464,17 @@ def check_master_addr_port_override(self): self.assertFalse(res.is_failed()) self.assertIsNone(res.return_values[0]) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_check_master_addr_port_override_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.check_master_addr_port_override) + self.run_test_with_backend( + backend="etcd", test_to_run=self.check_master_addr_port_override + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_check_master_addr_port_override_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.check_master_addr_port_override) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.check_master_addr_port_override + ) def run_check_env_function(self): # just checks that all env vars that we need to set on the user script @@ -481,11 +482,47 @@ def run_check_env_function(self): res = self.run_agent(Conf(entrypoint=_check_env_function, local_world_size=1)) self.assertFalse(res.is_failed()) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + def run_check_nccl_async_error_handling_env(self): + # make sure NCCL_ASYNC_ERROR_HANDLING set in os.environ is honored + with patch.dict(os.environ, {"NCCL_ASYNC_ERROR_HANDLING": "0"}): + res = self.run_agent( + Conf( + entrypoint=_check_env_value, + local_world_size=1, + args=("NCCL_ASYNC_ERROR_HANDLING", "0"), + ) + ) + self.assertFalse(res.is_failed()) + + def run_check_nccl_async_error_handling_env_default(self): + # if not present in env var it should default to 1 + res = self.run_agent( + Conf( + entrypoint=_check_env_value, + local_world_size=1, + args=("NCCL_ASYNC_ERROR_HANDLING", "1"), + ) + ) + self.assertFalse(res.is_failed()) + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_check_env_function_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_check_env_function) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_check_env_function + ) + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") + def test_run_check_nccl_async_error_handling_env_c10d(self): + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_check_nccl_async_error_handling_env + ) + + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") + def test_run_check_nccl_async_error_handling_env_default_c10d(self): + self.run_test_with_backend( + backend="c10d", + 
test_to_run=self.run_check_nccl_async_error_handling_env_default, + ) def run_function_with_return_value(self): res = self.run_agent(Conf(entrypoint=_echo, args=("foo",), local_world_size=2)) @@ -493,44 +530,38 @@ def run_function_with_return_value(self): self.assertEqual("foo", res.return_values[0]) self.assertEqual("foo", res.return_values[1]) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_function_with_return_value_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_function_with_return_value) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_function_with_return_value + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_function_with_return_value_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_function_with_return_value) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_function_with_return_value + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_function_with_return_value_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_function_with_return_value) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_function_with_return_value + ) def simple_dist_sum(self): res = self.run_agent(Conf(entrypoint=_dist_sum, local_world_size=2)) self.assertFalse(res.is_failed()) # _dist_sum internally checks that the sum computed is valid - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_simple_dist_sum_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.simple_dist_sum) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_simple_dist_sum_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.simple_dist_sum) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_simple_dist_sum_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.simple_dist_sum) @@ -556,21 +587,27 @@ def run_distributed_sum_homogeneous(self): "test incompatible with dev/dbg asan or tsan", ) def test_run_distributed_sum_homogeneous_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_homogeneous) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_distributed_sum_homogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_run_distributed_sum_homogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_homogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_distributed_sum_homogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def 
test_run_distributed_sum_homogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_distributed_sum_homogeneous + ) def run_distributed_sum_heterogeneous(self): # sums all ranks on 3 agents; each running 1, 2, 3 workers respectively @@ -593,23 +630,23 @@ def run_distributed_sum_heterogeneous(self): ranks.update(run_results.return_values.keys()) self.assertSetEqual(set(range(1 + 2 + 3)), ranks) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_distributed_sum_heterogeneous_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_distributed_sum_heterogeneous + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_distributed_sum_heterogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_distributed_sum_heterogeneous + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_distributed_sum_heterogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_distributed_sum_heterogeneous + ) def run_sad_function(self): """ @@ -632,21 +669,15 @@ def run_sad_function(self): self.assertEqual(data["message"], failure_data["message"]) self.assertEqual(int(data["extraInfo"]["timestamp"]), failure.timestamp) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_sad_function_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.run_sad_function) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_sad_function_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.run_sad_function) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_sad_function_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_sad_function) @@ -663,23 +694,23 @@ def run_bipolar_function(self): self.assertEqual(WorkerState.FAILED, agent.get_worker_group().state) self.assertTrue(agent._total_execution_time > 0) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_bipolar_function_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.run_bipolar_function) + self.run_test_with_backend( + backend="c10d", test_to_run=self.run_bipolar_function + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test 
incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_bipolar_function_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.run_bipolar_function) + self.run_test_with_backend( + backend="etcd", test_to_run=self.run_bipolar_function + ) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_run_bipolar_function_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.run_bipolar_function) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.run_bipolar_function + ) def correct_rank_assignment_heterogeneous(self): node_configs = [ @@ -710,14 +741,18 @@ def correct_rank_assignment_heterogeneous(self): "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_heterogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.correct_rank_assignment_heterogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_heterogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.correct_rank_assignment_heterogeneous + ) def correct_rank_assignment_homogeneous(self): node_configs = [ @@ -744,14 +779,18 @@ def correct_rank_assignment_homogeneous(self): "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_homogeneous_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous) + self.run_test_with_backend( + backend="etcd", test_to_run=self.correct_rank_assignment_homogeneous + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_correct_rank_assignment_homogeneous_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.correct_rank_assignment_homogeneous + ) def assert_rank_consistency( self, @@ -853,14 +892,18 @@ def double_agent_fault_tolerance(self): "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_fault_tolerance_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_fault_tolerance) + self.run_test_with_backend( + backend="etcd", test_to_run=self.double_agent_fault_tolerance + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_fault_tolerance_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.double_agent_fault_tolerance + ) def double_agent_elastic(self): """ @@ -907,21 +950,27 @@ def double_agent_elastic(self): "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_elastic_c10d(self): - self.run_test_with_backend(backend="c10d", test_to_run=self.double_agent_elastic) + self.run_test_with_backend( + backend="c10d", test_to_run=self.double_agent_elastic + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or 
TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_elastic_etcd(self): - self.run_test_with_backend(backend="etcd", test_to_run=self.double_agent_elastic) + self.run_test_with_backend( + backend="etcd", test_to_run=self.double_agent_elastic + ) @unittest.skipIf( TEST_WITH_DEV_DBG_ASAN or TEST_WITH_TSAN, "test incompatible with dev/dbg asan or tsan", ) def test_double_agent_elastic_etcd_v2(self): - self.run_test_with_backend(backend="etcd-v2", test_to_run=self.double_agent_elastic) + self.run_test_with_backend( + backend="etcd-v2", test_to_run=self.double_agent_elastic + ) def torch_rpc(self): """ @@ -1056,21 +1105,15 @@ def barrier_failed(self, barrier_mock): self.assertFalse(res.is_failed()) barrier_mock.assert_called_once() - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_barrier_failed_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.barrier_failed) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_barrier_failed_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.barrier_failed) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_barrier_failed_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.barrier_failed) @@ -1089,20 +1132,14 @@ def shutdown_called(self, start_processes_mock): agent.run("worker") pcontext_mock.close.assert_called_once() - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_shutdown_called_c10d(self): self.run_test_with_backend(backend="c10d", test_to_run=self.shutdown_called) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_shutdown_called_etcd(self): self.run_test_with_backend(backend="etcd", test_to_run=self.shutdown_called) - @sandcastle_skip_if( - TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan" - ) + @sandcastle_skip_if(TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan") def test_shutdown_called_etcd_v2(self): self.run_test_with_backend(backend="etcd-v2", test_to_run=self.shutdown_called) diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 6f10dc8e386c..915a848a160a 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -17,7 +17,6 @@ from itertools import product from typing import Callable, Dict, List, Union from unittest import mock -from unittest.mock import patch import torch import torch.multiprocessing as mp @@ -31,7 +30,7 @@ _wrap, to_map, ) -from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error +from torch.distributed.elastic.multiprocessing.errors import ErrorHandler from torch.testing._internal.common_utils import ( IS_IN_CI, IS_MACOS, @@ -40,9 +39,9 @@ TEST_WITH_ASAN, TEST_WITH_DEV_DBG_ASAN, TEST_WITH_TSAN, + TestCase, run_tests, sandcastle_skip_if, - TestCase ) @@ -65,27 +64,29 @@ 
def test_is_failed(self): pr_fail = RunProcsResult(failures={0: fail0}) self.assertTrue(pr_fail.is_failed()) - @patch("torch.distributed.elastic.multiprocessing.errors.log") - def test_get_failures(self, log_mock): - with mock.patch("time.time", side_effect=[3, 2, 1]): - error_file0 = os.path.join(self.test_dir, "error0.json") - error_file1 = os.path.join(self.test_dir, "error1.json") - _write_error(RuntimeError("error 0"), error_file0) - _write_error(RuntimeError("error 1"), error_file1) + def test_get_failures(self): - fail0 = ProcessFailure( - local_rank=0, pid=997, exitcode=1, error_file=error_file0 - ) - fail1 = ProcessFailure( - local_rank=1, pid=998, exitcode=3, error_file=error_file1 - ) - fail2 = ProcessFailure( - local_rank=2, pid=999, exitcode=15, error_file="no_exist.json" - ) + error_file0 = os.path.join(self.test_dir, "error0.json") + error_file1 = os.path.join(self.test_dir, "error1.json") + eh = ErrorHandler() + with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": error_file0}): + eh.record_exception(RuntimeError("error 0")) + + with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": error_file0}): + eh.record_exception(RuntimeError("error 1")) + + fail0 = ProcessFailure( + local_rank=0, pid=997, exitcode=1, error_file=error_file0 + ) + fail1 = ProcessFailure( + local_rank=1, pid=998, exitcode=3, error_file=error_file1 + ) + fail2 = ProcessFailure( + local_rank=2, pid=999, exitcode=15, error_file="no_exist.json" + ) - self.assertEqual(3, fail0.timestamp) - self.assertEqual(2, fail1.timestamp) - self.assertEqual(1, fail2.timestamp) + self.assertLessEqual(fail0.timestamp, fail1.timestamp) + self.assertLessEqual(fail1.timestamp, fail2.timestamp) class StdTest(TestCase): diff --git a/test/distributed/elastic/multiprocessing/errors/api_test.py b/test/distributed/elastic/multiprocessing/errors/api_test.py index 7868624b8603..a9590bea313d 100644 --- a/test/distributed/elastic/multiprocessing/errors/api_test.py +++ b/test/distributed/elastic/multiprocessing/errors/api_test.py @@ -14,7 +14,7 @@ ProcessFailure, record, ) -from torch.distributed.elastic.multiprocessing.errors.error_handler import _write_error +from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler class SentinelError(Exception): @@ -36,7 +36,8 @@ def good_fn(): @record def raise_child_failure_error_fn(name, child_error_file=""): if child_error_file: - _write_error(SentinelError("foobar"), child_error_file) + with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": child_error_file}): + ErrorHandler().record_exception(SentinelError("foobar")) pf = ProcessFailure(local_rank=0, pid=997, exitcode=1, error_file=child_error_file) raise ChildFailedError(name, {0: pf}) @@ -64,7 +65,10 @@ def test_failure_incorrect_reply_file(self): ) def failure_with_error_file(self, exception): - _write_error(exception, self.test_error_file) + with mock.patch.dict( + os.environ, {"TORCHELASTIC_ERROR_FILE": self.test_error_file} + ): + ErrorHandler().record_exception(exception) return ProcessFailure( local_rank=0, pid=997, exitcode=1, error_file=self.test_error_file ) diff --git a/test/distributed/elastic/multiprocessing/errors/error_handler_test.py b/test/distributed/elastic/multiprocessing/errors/error_handler_test.py index 9905859a6aa7..6adf97de9a27 100644 --- a/test/distributed/elastic/multiprocessing/errors/error_handler_test.py +++ b/test/distributed/elastic/multiprocessing/errors/error_handler_test.py @@ -9,7 +9,7 @@ import unittest from unittest.mock import patch -from 
torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler, _write_error +from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler from torch.distributed.elastic.multiprocessing.errors.handlers import get_error_handler @@ -78,15 +78,15 @@ def test_record_exception_no_error_file(self): def test_dump_error_file(self): src_error_file = os.path.join(self.test_dir, "src_error.json") - _write_error(RuntimeError("foobar"), src_error_file) + eh = ErrorHandler() + with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": src_error_file}): + eh.record_exception(RuntimeError("foobar")) with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": self.test_error_file}): - eh = ErrorHandler() eh.dump_error_file(src_error_file) self.assertTrue(filecmp.cmp(src_error_file, self.test_error_file)) with patch.dict(os.environ, {}): - eh = ErrorHandler() eh.dump_error_file(src_error_file) # just validate that dump_error_file works when # my error file is not set @@ -95,10 +95,13 @@ def test_dump_error_file(self): def test_dump_error_file_overwrite_existing(self): dst_error_file = os.path.join(self.test_dir, "dst_error.json") src_error_file = os.path.join(self.test_dir, "src_error.json") - _write_error(RuntimeError("foo"), dst_error_file) - _write_error(RuntimeError("bar"), src_error_file) + eh = ErrorHandler() + with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": dst_error_file}): + eh.record_exception(RuntimeError("foo")) + + with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": src_error_file}): + eh.record_exception(RuntimeError("bar")) with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": dst_error_file}): - eh = ErrorHandler() eh.dump_error_file(src_error_file) self.assertTrue(filecmp.cmp(src_error_file, dst_error_file)) diff --git a/test/distributed/elastic/utils/util_test.py b/test/distributed/elastic/utils/util_test.py index f6f29d7c6438..fefe40537a8f 100644 --- a/test/distributed/elastic/utils/util_test.py +++ b/test/distributed/elastic/utils/util_test.py @@ -7,48 +7,77 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
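The rewritten error-handler tests all follow the same pattern: point TORCHELASTIC_ERROR_FILE at a per-case file and record the exception through the public ErrorHandler API instead of the removed _write_error helper. A minimal sketch of that pattern as a reusable test helper (the helper name is ours; only the calls already shown in the tests above are assumed):

    import os
    from unittest import mock
    from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler

    def write_error_file(exc: BaseException, error_file: str) -> None:
        # ErrorHandler.record_exception() writes to the path named by
        # TORCHELASTIC_ERROR_FILE, so each test patches that variable per file.
        with mock.patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": error_file}):
            ErrorHandler().record_exception(exc)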
+from unittest import mock + import torch.distributed.elastic.utils.store as store_util from torch.distributed.elastic.utils.logging import get_logger from torch.testing._internal.common_utils import run_tests, TestCase -class TestStore: - def get(self, key: str): - return f"retrieved:{key}" - - class StoreUtilTest(TestCase): - def test_get_data(self): - store = TestStore() - data = store_util.get_all(store, "test/store", 10) - for idx in range(0, 10): - self.assertEqual(f"retrieved:test/store{idx}", data[idx]) + def test_get_all_rank_0(self): + store = mock.MagicMock() + world_size = 3 + store_util.get_all(store, 0, "test/store", world_size) + # omit empty kwargs, get only key + actual_set_call_args = [ + call_args[0][0] for call_args in store.set.call_args_list + ] + self.assertListEqual(["test/store0.FIN"], actual_set_call_args) + + actual_get_call_args = [call_args[0] for call_args in store.get.call_args_list] + expected_get_call_args = [ + ("test/store0",), + ("test/store1",), + ("test/store2",), + ("test/store0.FIN",), + ("test/store1.FIN",), + ("test/store2.FIN",), + ] + self.assertListEqual(expected_get_call_args, actual_get_call_args) + + def test_get_all_rank_n(self): + store = mock.MagicMock() + world_size = 3 + store_util.get_all(store, 1, "test/store", world_size) + # omit empty kwargs, get only key + actual_set_call_args = [ + call_args[0][0] for call_args in store.set.call_args_list + ] + self.assertListEqual(["test/store1.FIN"], actual_set_call_args) + + actual_get_call_args = [call_args[0] for call_args in store.get.call_args_list] + expected_get_call_args = [ + ("test/store0",), + ("test/store1",), + ("test/store2",), + ] + self.assertListEqual(expected_get_call_args, actual_get_call_args) def test_synchronize(self): - class DummyStore: - def __init__(self): - self._data = { - "torchelastic/test0": "data0".encode(encoding="UTF-8"), - "torchelastic/test1": "data1".encode(encoding="UTF-8"), - "torchelastic/test2": "data2".encode(encoding="UTF-8"), - } - - def set(self, key, value): - self._data[key] = value - - def get(self, key): - return self._data[key] - - def set_timeout(self, timeout): - pass - + store_mock = mock.MagicMock() data = "data0".encode(encoding="UTF-8") - store = DummyStore() - res = store_util.synchronize(store, data, 0, 3, key_prefix="torchelastic/test") - self.assertEqual(3, len(res)) - for idx, res_data in enumerate(res): - actual_str = res_data.decode(encoding="UTF-8") - self.assertEqual(f"data{idx}", actual_str) + store_util.synchronize(store_mock, data, 0, 3, key_prefix="torchelastic/test") + actual_set_call_args = store_mock.set.call_args_list + # omit empty kwargs + actual_set_call_args = [call_args[0] for call_args in actual_set_call_args] + expected_set_call_args = [ + ("torchelastic/test0", b"data0"), + ("torchelastic/test0.FIN", b"FIN"), + ] + self.assertListEqual(expected_set_call_args, actual_set_call_args) + + expected_get_call_args = [ + ("torchelastic/test0",), + ("torchelastic/test1",), + ("torchelastic/test2",), + ("torchelastic/test0.FIN",), + ("torchelastic/test1.FIN",), + ("torchelastic/test2.FIN",), + ] + actual_get_call_args = store_mock.get.call_args_list + actual_get_call_args = [call_args[0] for call_args in actual_get_call_args] + self.assertListEqual(expected_get_call_args, actual_get_call_args) class UtilTest(TestCase): diff --git a/test/distributed/fsdp/test_distributed_checkpoint.py b/test/distributed/fsdp/test_distributed_checkpoint.py new file mode 100644 index 000000000000..ef95973764c4 --- /dev/null +++ 
b/test/distributed/fsdp/test_distributed_checkpoint.py @@ -0,0 +1,105 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import tempfile + +import torch +from torch import distributed as dist +from torch.distributed._shard.checkpoint import ( + FileSystemReader, + FileSystemWriter, + save_state_dict, + load_state_dict, +) +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + StateDictType, +) +from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel +from torch.distributed.fsdp.wrap import enable_wrap, wrap +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + SkipModel, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +_DISTRIBUTED_STATE_DICT_IMPLS = { + StateDictType.LOCAL_STATE_DICT, + StateDictType.SHARDED_STATE_DICT, +} + + +class TestDistributedCheckpoint(FSDPTest): + @property + def world_size(self): + return 2 + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _DISTRIBUTED_STATE_DICT_IMPLS) + def test_distributed_checkpoint(self, state_dict_type) -> None: + with enable_wrap(wrapper_cls=FSDP): + torch.manual_seed(100) + model = wrap(SkipModel(double_nest=True)) + torch.manual_seed(200) + new_model = wrap(SkipModel(double_nest=True)) + + with FullyShardedDataParallel.summon_full_params( + model + ), FullyShardedDataParallel.summon_full_params(new_model): + params = list(model.parameters()) + new_params = list(new_model.parameters()) + self.assertNotEqual(params, new_params) + + with tempfile.TemporaryDirectory() as path: + paths = [path] + dist.broadcast_object_list(paths) + path = paths[0] + writer = FileSystemWriter(path) + reader = FileSystemReader(path) + with FSDP.state_dict_type( + model, state_dict_type + ), FSDP.state_dict_type(new_model, state_dict_type): + state_dict = model.state_dict() + + save_state_dict(state_dict, writer) + + with FSDP.state_dict_type( + model, state_dict_type + ), FSDP.state_dict_type(new_model, state_dict_type): + state_dict = new_model.state_dict() + load_state_dict(state_dict, reader) + new_model.load_state_dict(state_dict) + + with FullyShardedDataParallel.summon_full_params( + model + ), FullyShardedDataParallel.summon_full_params(new_model): + params = list(model.parameters()) + new_params = list(new_model.parameters()) + self.assertEqual(params, new_params) + + # TODO: add resharding test case. 
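The new checkpoint test saves a sharded FSDP state dict to a temporary directory and reloads it into a second model. Condensed to its essentials, the round trip exercised above looks like the sketch below (names taken from the test; process-group setup, broadcasting of the path, and the assertions are omitted):

    writer = FileSystemWriter(path)
    reader = FileSystemReader(path)

    with FSDP.state_dict_type(model, state_dict_type):
        save_state_dict(model.state_dict(), writer)    # each rank writes its shards

    with FSDP.state_dict_type(new_model, state_dict_type):
        state_dict = new_model.state_dict()
        load_state_dict(state_dict, reader)            # shards are read back in place
        new_model.load_state_dict(state_dict)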
+ + +instantiate_parametrized_tests(TestDistributedCheckpoint) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_flatten_params_wrapper.py b/test/distributed/fsdp/test_flatten_params_wrapper.py index c4a7eb657078..69c78ee6dde7 100644 --- a/test/distributed/fsdp/test_flatten_params_wrapper.py +++ b/test/distributed/fsdp/test_flatten_params_wrapper.py @@ -198,7 +198,7 @@ def _test(kwargs, expected, exception=None, regex=None): expected, msg=f"{flat_p.shard_metadata()}, {expected}", ) - self.assertEqual(flat_p._num_padded, kwargs["num_padded"]) + self.assertEqual(flat_p.num_padded, kwargs["num_padded"]) _test( kwargs={"start": -1, "end": -1, "num_padded": 0}, diff --git a/test/distributed/fsdp/test_fsdp_apply.py b/test/distributed/fsdp/test_fsdp_apply.py new file mode 100644 index 000000000000..7870804d78fc --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_apply.py @@ -0,0 +1,104 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestApply(FSDPTest): + @property + def world_size(self): + return 2 + + @torch.no_grad() + def _init_linear_weights(self, m): + if type(m) == nn.Linear: + m.weight.fill_(1.0) + m.bias.fill_(1.0) + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + def check_weights(self, fsdp, expected_tensor_fn, check): + with fsdp.summon_full_params(fsdp, recurse=True): + linear_modules = [ + module for module in fsdp.modules() if type(module) == nn.Linear + ] + for module in linear_modules: + for param in module.parameters(): + expected = expected_tensor_fn(param) + check(param, expected, f"Got {param} but expected {expected}") + + def _check_apply(self, fsdp): + # Assert linear weights are not all 1.0 + self.check_weights( + fsdp, lambda param: torch.empty_like(param).fill_(1.0), self.assertNotEqual + ) + + fsdp.apply(self._init_linear_weights) + + # Ensure all weights are 1.0 + self.check_weights( + fsdp, lambda param: torch.empty_like(param).fill_(1.0), self.assertEqual + ) + + @skip_if_lt_x_gpu(2) + def test_nested_module_apply(self): + """ + Checks apply() modifies weights appropriately on a nested FSDP instance. + """ + nested_module = NestedWrappedModule( + self.process_group, wrap_fsdp=True, wrap_everything=True + ) + fsdp_module = FSDP(nested_module, self.process_group).cuda(self.rank) + self._check_apply(fsdp_module) + + @skip_if_lt_x_gpu(2) + def test_transformer_module_apply(self): + """ + Checks apply() modifies weights appropriately on a wrapped Transformer + module. + """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + self._check_apply(transformer) + + @skip_if_lt_x_gpu(2) + def test_apply_in_summon_raises_error(self): + """ + Ensures that if user calls apply() on FSDP instance within full param + summon context, appropriate error is raised. 
+ """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + with transformer.summon_full_params(transformer, recurse=True): + with self.assertRaisesRegex(ValueError, "expected to be in states"): + transformer.apply(self._init_linear_weights) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py index e3dd483eaf69..1b3510e2b55e 100644 --- a/test/distributed/fsdp/test_fsdp_checkpoint.py +++ b/test/distributed/fsdp/test_fsdp_checkpoint.py @@ -10,7 +10,7 @@ FullyShardedDataParallel as FSDP, CPUOffload, ) -from torch.distributed.algorithms._checkpoint._checkpoint_wrapper import ( +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( checkpoint_wrapper, ) from torch.testing._internal.common_distributed import ( @@ -115,7 +115,7 @@ def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations): models = [ckpt_sequential_wrapped_fsdp, inner_ckpt, baseline] - offload_to_cpu_event = "Memcpy DtoH" + offload_to_cpu_event = "Memcpy DtoH" if torch.version.cuda else "CopyDeviceToHost" for i in range(2): losses = [] @@ -177,7 +177,7 @@ def test_basic_checkpoint_end_to_end(self, cpu_offload, offload_activations): fsdp_call_checkpoint, ] - offload_to_cpu_event = "Memcpy DtoH" + offload_to_cpu_event = "Memcpy DtoH" if torch.version.cuda else "CopyDeviceToHost" for i in range(6): losses = [] diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py new file mode 100644 index 000000000000..9e39254ec423 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py @@ -0,0 +1,105 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from math import inf + +import torch +from torch import distributed as dist +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + FullyShardedDataParallel as FSDP, + CPUOffload, + _calc_grad_norm, +) +from torch.nn import utils as nn_utils +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + DeterministicModel, + FSDPTest, + _collect_total_grad_norm_fsdp, + _collect_total_grad_norm_local, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, + parametrize, + instantiate_parametrized_tests, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestClipGradNorm(FSDPTest): + def _run_fsdp_one_iteration(self, norm_type, nested_fsdp, cpu_offload): + """Test FSDP with clip grad norm.""" + fsdp_model = DeterministicModel(nested_fsdp, cpu_offload=cpu_offload) + local_model = DeterministicModel(False) + input = torch.rand(14, 2, device=self.rank) + fsdp_model = FSDP(fsdp_model, cpu_offload=cpu_offload) + self.assertTrue(len(input) >= self.world_size) + out = local_model(input[: self.world_size]) + out.sum().backward() + in_data = torch.tensor(input[self.rank], device=self.rank) + out_fsdp = fsdp_model(in_data) + out_fsdp.sum().backward() + total_norms_fsdp = _collect_total_grad_norm_fsdp( + fsdp_model, norm_type, self.rank + ) + total_norms_local = _collect_total_grad_norm_local(local_model, norm_type) + total_norms_local /= self.world_size + norm_cap = total_norms_fsdp / 2.0 + self.assertEqual(total_norms_local, 
total_norms_fsdp) + fsdp_model.clip_grad_norm_(norm_cap, norm_type=norm_type) + nn_utils.clip_grad_norm_( + local_model.parameters(), norm_cap, norm_type=norm_type + ) + total_norms_after_clip_fsdp = _collect_total_grad_norm_fsdp( + fsdp_model, norm_type, self.rank + ) + total_norms_after_clip_local = _collect_total_grad_norm_local( + local_model, norm_type + ) + self.assertTrue(total_norms_after_clip_fsdp <= norm_cap) + self.assertEqual(total_norms_after_clip_local, total_norms_after_clip_fsdp) + + @skip_if_lt_x_gpu(2) + @parametrize("norm_type", [2.0, inf]) + @parametrize("nested_fsdp", [True, False]) + @parametrize( + "cpu_offload", + [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], + ) + def test_fsdp_clip_grad_norm(self, norm_type, nested_fsdp, cpu_offload): + """Test FSDP with clip grad norm.""" + self._run_fsdp_one_iteration(norm_type, nested_fsdp, cpu_offload) + + +class TestCalcuGradNorm(FSDPTest): + @skip_if_lt_x_gpu(2) + @parametrize("norm_type", [2.0, inf, 1.3, 2.5]) + @parametrize("nested_fsdp", [True, False]) + def test_fsdp_calc_grad_norm(self, norm_type, nested_fsdp): + """Test grad norm cal API.""" + model = FSDP(DeterministicModel(nested_fsdp)) + input = torch.rand(15, 2, device=self.rank) + out = model(input) + out.sum().backward() + total_norm = _calc_grad_norm(model.params_with_grad, norm_type) + total_norm_expected = _collect_total_grad_norm_local(model, norm_type) + self.assertEqual(total_norm, total_norm_expected) + + +instantiate_parametrized_tests(TestClipGradNorm) +instantiate_parametrized_tests(TestCalcuGradNorm) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_comm.py b/test/distributed/fsdp/test_fsdp_comm.py new file mode 100644 index 000000000000..c527ca7aebc8 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_comm.py @@ -0,0 +1,253 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from contextlib import suppress +from enum import Enum, auto +from typing import Optional +from unittest.mock import patch + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest, NestedWrappedModule +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class PassType(Enum): + __order__ = "FWD BWD" + FWD = auto() + BWD = auto() + + +class TestCommunication(FSDPTest): + """Tests ``FullyShardedDataParallel``'s collective communication usage.""" + def _init_model( + self, + nested_model: bool, + sharding_strategy: ShardingStrategy, + device: torch.device, + ): + group = dist.distributed_c10d._get_default_group() + if nested_model: + model = NestedWrappedModule( + group, wrap_fsdp=True, sharding_strategy=sharding_strategy, + ) + fsdp_model: FSDP = FSDP( + model, group, sharding_strategy=sharding_strategy, + ).to(device) + else: + fsdp_model: FSDP = self._get_wrapped_model( + group, + cuda_first=False, + config={"sharding_strategy": sharding_strategy}, + ) + return fsdp_model + + def _run_iter(self, 
fsdp_model, batch, use_no_sync: bool): + """Runs an iteration inside or outside the ``no_sync()`` context.""" + context = fsdp_model.no_sync() if use_no_sync else suppress() + with context: + output = fsdp_model(*batch) + loss = fsdp_model.module.get_loss(batch, output) + loss.backward() + + def _get_ref_num_reduce_scatters( + self, + num_fsdp: int, + in_no_sync: bool, + ) -> int: + """Returns the reference number of reduce-scatters for an iteration + in the ``no_sync()`` context.""" + return num_fsdp if not in_no_sync else 0 + + def _get_ref_num_all_gathers( + self, + num_fsdp: int, + sharding_strategy: Optional[ShardingStrategy], + is_first_iter: bool, + is_last_iter_no_sync: bool, + ) -> int: + """Returns the reference number of all-gathers in an iteration, summing + over the forward and backward passes.""" + return sum( + self._get_ref_num_all_gathers_in_pass( + num_fsdp, + sharding_strategy, + pass_type, + is_first_iter, + is_last_iter_no_sync, + ) for pass_type in PassType + ) + + def _get_ref_num_all_gathers_in_pass( + self, + num_fsdp: int, + sharding_strategy: Optional[ShardingStrategy], + pass_type: PassType, + is_first_iter: bool, + is_last_iter_no_sync: bool, + ): + """Returns the reference number of all-gathers for a given setting.""" + if sharding_strategy is None: + sharding_strategy = ShardingStrategy.FULL_SHARD # default + # Forward pass: + if pass_type == PassType.FWD and \ + sharding_strategy == ShardingStrategy.SHARD_GRAD_OP and \ + is_last_iter_no_sync: + # Modules do not free the full parameters in the last + # iteration's backward pass if it was in `no_sync()` + num_all_gathers = 0 + elif pass_type == PassType.FWD: + # Otherwise, all modules all-gather the full parameters in the + # forward pass + num_all_gathers = num_fsdp + # Backward pass: + elif pass_type == PassType.BWD and \ + sharding_strategy == ShardingStrategy.FULL_SHARD: + # Root does not free the full parameters at the end of the + # forward pass + num_all_gathers = num_fsdp - 1 + elif pass_type == PassType.BWD and \ + sharding_strategy == ShardingStrategy.SHARD_GRAD_OP: + # Modules do not free the full parameters at the end of the + # forward pass + num_all_gathers = 0 + else: + assert 0, f"Unsupported: add a branch for pass_type={pass_type} " \ + f"is_first_iter={is_first_iter} " \ + f"is_last_iter_no_sync={is_last_iter_no_sync} " \ + f"sharding_strategy={sharding_strategy}" + if is_first_iter and pass_type == PassType.FWD: + # With execution order validation, on the first iteration, we have + # an additional all-gather before every actual all-gather in the + # forward pass + num_all_gathers *= 2 + return num_all_gathers + + def _print_ref_num_all_gathers_in_pass( + self, + num_fsdp: int, + sharding_strategy: ShardingStrategy, + pass_type: PassType, + is_first_iter: bool, + is_last_iter_no_sync: bool, + ): + """Helper method for printing the number of all-gathers for a specific + setting. 
This may be helpful since the branching is complex.""" + if self.rank != 0: + return # only print on one rank + num_all_gathers = self._get_ref_num_all_gathers_in_pass( + num_fsdp, sharding_strategy, pass_type, is_first_iter, + is_last_iter_no_sync, + ) + print( + f"Pass: {pass_type}\n" + f"Is First Iteration: {is_first_iter}\n" + f"Sharding Strategy: {sharding_strategy}\n" + f"Last iteration in `no_sync()`: {is_last_iter_no_sync}\n" + f"Number of all-gathers: {num_all_gathers}" + ) + + @skip_if_lt_x_gpu(2) + @parametrize("nested_model", [False, True]) + @parametrize("use_no_sync", [False, True]) + @parametrize("sharding_strategy", [ShardingStrategy.SHARD_GRAD_OP, None]) + def test_communication( + self, + nested_model: bool, + use_no_sync: bool, + sharding_strategy: Optional[ShardingStrategy], + ): + """ + Tests FSDP's communication cost in terms of calls to collective + communication primitives (i.e. all-gather and reduce-scatter). + + Arguments: + nested_model (bool): If ``True``, uses ``NestedWrappedModule``, + which has nested FSDP instances; if ``False``, uses the default + model, which does not have nested FSDP instances. + use_no_sync (bool): If ``True``, runs some iterations inside the + ``no_sync()`` context manager to accumulate gradients, followed + by some iterations outside the context manager; if ``False``, + only runs some iterations outside the context manager. + sharding_strategy (Optional[ShardingStrategy]): Configures the + FSDP algorithm. + """ + # Initialize the model and inputs + device = torch.device("cuda") + fsdp_model = self._init_model(nested_model, sharding_strategy, device) + batch = fsdp_model.module.get_input(device) + + # Count the number of FSDP instances that manage parameters since the + # number of collectives are a function of this number + num_fsdp = sum( + (isinstance(m, FSDP) and len(m.params) > 0) + for m in fsdp_model.modules() + ) + + # If `use_no_sync=True`, we run `num_iters` iterations inside + # `no_sync()` followed by `num_iters` iterations outside `no_sync()`, + # and if `use_no_sync=False`, we only run `num_iters` iterations + # outside `no_sync()` + num_iters = 3 + with patch("torch.distributed._all_gather_base") as mock_all_gather, \ + patch("torch.distributed._reduce_scatter_base") as mock_reduce_scatter: + def reset_mocks(): + mock_all_gather.reset_mock() + mock_reduce_scatter.reset_mock() + # Check the communication cost when using `no_sync()` + if use_no_sync: + for i in range(num_iters): + reset_mocks() + self._run_iter(fsdp_model, batch, use_no_sync=True) + num_all_gathers = mock_all_gather.call_count + num_reduce_scatters = mock_reduce_scatter.call_count + ref_num_all_gathers = self._get_ref_num_all_gathers( + num_fsdp, sharding_strategy, is_first_iter=i == 0, + is_last_iter_no_sync=i > 0, + ) + ref_num_reduce_scatters = self._get_ref_num_reduce_scatters( + num_fsdp, in_no_sync=True, + ) + self.assertEqual(num_all_gathers, ref_num_all_gathers) + self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters) + # Check the normal communication cost (when not using `no_sync()`) + for i in range(num_iters): + reset_mocks() + self._run_iter(fsdp_model, batch, use_no_sync=False) + num_all_gathers = mock_all_gather.call_count + num_reduce_scatters = mock_reduce_scatter.call_count + ref_num_all_gathers = self._get_ref_num_all_gathers( + num_fsdp, sharding_strategy, + is_first_iter=not use_no_sync and i == 0, + is_last_iter_no_sync=use_no_sync and i == 0, + ) + ref_num_reduce_scatters = self._get_ref_num_reduce_scatters( + num_fsdp, 
in_no_sync=False, + ) + self.assertEqual(num_all_gathers, ref_num_all_gathers) + self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters) + + +instantiate_parametrized_tests(TestCommunication) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py index ef91d4db0836..38e29fc29e34 100644 --- a/test/distributed/fsdp/test_fsdp_core.py +++ b/test/distributed/fsdp/test_fsdp_core.py @@ -1,6 +1,7 @@ # Owner(s): ["oncall: distributed"] import functools +import itertools import sys from unittest import mock @@ -18,6 +19,7 @@ NestedWrappedModule, NestedWrappedModuleWithDelay, TransformerWithSharedParams, + subtest_name ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, @@ -26,8 +28,8 @@ run_tests, ) -from torch.distributed.fsdp import CPUOffload -from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, ShardingStrategy if not dist.is_available(): @@ -41,6 +43,24 @@ ) sys.exit(0) +params = "cpu_offload,backward_prefetch,sharding_strategy" +cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] +backward_prefetch_config = [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] +sharding_strategy_config = [ShardingStrategy.SHARD_GRAD_OP, None, ShardingStrategy.NO_SHARD] +configs = list(itertools.product(cpu_offload_config, + backward_prefetch_config, + sharding_strategy_config)) +test_name_mapping = { + str(CPUOffload(offload_params=True)): "offload_true", + str(CPUOffload(offload_params=False)): "offload_false", + str(BackwardPrefetch.BACKWARD_PRE): "prefetch_pre", + str(BackwardPrefetch.BACKWARD_POST): "prefetch_post", + str(ShardingStrategy.SHARD_GRAD_OP): "shard_grad_op", + str(ShardingStrategy.NO_SHARD): "no_shard", +} + +subtest_name = functools.partial(subtest_name, test_name_mapping) + class TestParityWithDDP(FSDPTest): """ @@ -63,15 +83,8 @@ def _get_init_modes_for_test(self, cpu_offload): return modes @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_nested_wrapped_model(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_nested_wrapped_model(self, cpu_offload, backward_prefetch, sharding_strategy): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -80,18 +93,43 @@ def test_nested_wrapped_model(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_nested_all_wrapped_model(self, cpu_offload, backward_prefetch): + @parametrize("cpu_offload", cpu_offload_config) + @parametrize("sharding_strategy", sharding_strategy_config) + @parametrize("mixed_precision", [True, False]) + def test_nested_wrapped_model_single_iteration_mixed_precision( + self, + cpu_offload, + 
sharding_strategy, + mixed_precision + ): + init_modes = self._get_init_modes_for_test(cpu_offload) + mixed_precision = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + reduce_dtype=torch.float16, + ) if mixed_precision else None + for fsdp_init_mode in init_modes: + with self.subTest(fsdp_init_mode=fsdp_init_mode): + self._test_identical_outputs( + NestedWrappedModule, + # Only run one step for comparison, as usually grad scaler + # is needed to avoid NaN after first step. + num_steps=1, + fsdp_init_mode=fsdp_init_mode, + cpu_offload=cpu_offload, + sharding_strategy=sharding_strategy, + mixed_precision=mixed_precision, + ) + + + @skip_if_lt_x_gpu(2) + @parametrize(params, configs, subtest_name) + @parametrize("clip_norm_type", [2.0, None]) + def test_nested_all_wrapped_model(self, cpu_offload, backward_prefetch, sharding_strategy, clip_norm_type): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -101,18 +139,14 @@ def test_nested_all_wrapped_model(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + norm_type=clip_norm_type, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_transformer_parameterized(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + @parametrize("clip_norm_type", [2.0, None]) + def test_transformer_parameterized(self, cpu_offload, backward_prefetch, sharding_strategy, clip_norm_type): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -121,18 +155,13 @@ def test_transformer_parameterized(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + norm_type=clip_norm_type, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_delayed_optim_step(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_delayed_optim_step(self, cpu_offload, backward_prefetch, sharding_strategy): # We use a model with a long CUDA delay right before the optimizer step. # This tests our streams logic, and that we don't start the allgather # until after the optimization step completes. 
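These tests now share one combined parametrization instead of stacking separate cpu_offload and backward_prefetch decorators. For orientation, the cross product defined earlier in this file expands each decorated test into 2 x 3 x 3 = 18 subtests; a quick sanity check, assuming the configs list built above:

    # 2 CPUOffload options x 3 BackwardPrefetch options x 3 ShardingStrategy options
    assert len(configs) == 18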
@@ -147,18 +176,12 @@ def test_delayed_optim_step(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_delayed_reduce_scatter(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_delayed_reduce_scatter(self, cpu_offload, backward_prefetch, sharding_strategy): # We insert a delay in the torch.distributed._reduce_scatter_base op, so that # the post_backward_stream takes much longer than the backward pass. # This tests that we properly block at the end of the backward pass for @@ -174,21 +197,16 @@ def test_delayed_reduce_scatter(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) def _dummy_ddp_fn(self, model): return DummyDDP(model) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_mixture_of_experts(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + @parametrize("clip_norm_type", [2.0, None]) + def test_mixture_of_experts(self, cpu_offload, backward_prefetch, sharding_strategy, clip_norm_type): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -200,18 +218,13 @@ def test_mixture_of_experts(self, cpu_offload, backward_prefetch): fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + norm_type=clip_norm_type, + sharding_strategy=sharding_strategy, ) @skip_if_lt_x_gpu(2) - @parametrize( - "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "backward_prefetch", - [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None] - ) - def test_mixture_of_experts_with_delay_before_free(self, cpu_offload, backward_prefetch): + @parametrize(params, configs, subtest_name) + def test_mixture_of_experts_with_delay_before_free(self, cpu_offload, backward_prefetch, sharding_strategy): init_modes = self._get_init_modes_for_test(cpu_offload) for fsdp_init_mode in init_modes: with self.subTest(fsdp_init_mode=fsdp_init_mode): @@ -222,15 +235,21 @@ def test_mixture_of_experts_with_delay_before_free(self, cpu_offload, backward_p fsdp_init_mode=fsdp_init_mode, cpu_offload=cpu_offload, backward_prefetch=backward_prefetch, + sharding_strategy=sharding_strategy, ) class TestParamInit(FSDPTest): @skip_if_lt_x_gpu(2) - def test_param_change_after_init(self): + @parametrize("mixed_precision", [True, False]) + def test_param_change_after_init(self, mixed_precision): group = dist.distributed_c10d._get_default_group() # Establish reference behavior. 
- model = self._get_wrapped_model(group, cuda_first=False) + mixed_precision = MixedPrecision() if mixed_precision else None + config = {"mixed_precision": mixed_precision} + model = self._get_wrapped_model( + group, mixed_precision=mixed_precision, cuda_first=False + ) model.eval() # no dropout for this test input = model.module.get_input(torch.device("cuda")) ref_output = model(*input) @@ -284,10 +303,15 @@ def _test_output_backward_hooks(self, model): @skip_if_lt_x_gpu(2) @parametrize("cuda_first", [False, True]) - def test_register_functions_called(self, cuda_first): + @parametrize("mixed_precision", [True, False]) + def test_register_functions_called(self, cuda_first, mixed_precision): """Tests that _register_{pre|post}_backward_hooks called during forward.""" group = dist.distributed_c10d._get_default_group() - model = self._get_wrapped_model(group, cuda_first=cuda_first) + mixed_precision = MixedPrecision() if mixed_precision else None + config = {"mixed_precision": mixed_precision} + model = self._get_wrapped_model( + group, mixed_precision=mixed_precision, cuda_first=cuda_first + ) input = model.module.get_input(torch.device("cuda")) model._register_post_backward_hooks = mock.MagicMock(return_value=None) model._register_pre_backward_hooks = mock.MagicMock(return_value=None) @@ -300,11 +324,23 @@ def test_register_functions_called(self, cuda_first): class TestNoGrad(FSDPTest): @skip_if_lt_x_gpu(2) - def test_transformer_no_grad(self): + @parametrize("mixed_precision", [True, False]) + def test_transformer_no_grad(self, mixed_precision): group = dist.distributed_c10d._get_default_group() - model = self._get_wrapped_model(group, cuda_first=False) + mixed_precision = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) if mixed_precision else None + config = {"mixed_precision": mixed_precision} + model = self._get_wrapped_model(group, config=config, cuda_first=False) # Train model for a step - self._train_for_several_steps(model, num_steps=1, autocast=False) + self._train_for_several_steps( + model, + num_steps=1, + autocast=False, + mixed_precision=config["mixed_precision"] + ) model.eval() # no dropout for this test @@ -321,6 +357,8 @@ def test_transformer_no_grad(self): instantiate_parametrized_tests(TestHooks) instantiate_parametrized_tests(TestParityWithDDP) +instantiate_parametrized_tests(TestNoGrad) +instantiate_parametrized_tests(TestParamInit) if __name__ == "__main__": run_tests() diff --git a/test/distributed/fsdp/test_fsdp_exec_order.py b/test/distributed/fsdp/test_fsdp_exec_order.py new file mode 100644 index 000000000000..14a704b53f78 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_exec_order.py @@ -0,0 +1,194 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import warnings +from contextlib import suppress + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + 
file=sys.stderr, + ) + sys.exit(0) + + +class Model(torch.nn.Module): + """ + Model that supports two computation paths: `layer0` -> `layer1` and + `layer0` -> `layer2`. Notably, both `layer1` and `layer2` have 36 elements + when flattened, which means that their corresponding all-gathers and + reduce-scatters may be silently matched if we do not perform any checks. + """ + def __init__(self) -> None: + super().__init__() + self.layer0 = torch.nn.Linear(5, 6) + self.layer1 = torch.nn.Linear(6, 6, bias=False) + self.layer2 = torch.nn.Sequential( + torch.nn.Linear(6, 3, bias=False), + torch.nn.ReLU(), + torch.nn.Linear(3, 6, bias=False), + ) + self.relu = torch.nn.ReLU() + self.use_alt_path = False + for param in self.layer2.parameters(): + param.requires_grad = False + + def forward(self, x): + # `layer0` -> `layer1` (normal) + # `layer0` -> `layer2` (alternate) + z = self.relu(self.layer0(x)) + z = self.relu(self.layer2(z)) if self.use_alt_path \ + else self.relu(self.layer1(z)) + return z + + def get_input(self, device: torch.device): + return (torch.randn((8, 5)).to(device),) + + def get_loss(self, input, output): + return output.sum() + + def run_backward(self, loss): + loss.backward() + + def flip_path(self): + params_to_freeze = self.layer2.parameters() if self.use_alt_path \ + else self.layer1.parameters() + params_to_unfreeze = self.layer1.parameters() if self.use_alt_path \ + else self.layer2.parameters() + for param in params_to_freeze: + param.requires_grad = False + for param in params_to_unfreeze: + param.requires_grad = True + self.use_alt_path = not self.use_alt_path + + @staticmethod + def wrap(sharding_strategy: ShardingStrategy, device: torch.device): + model = Model() + model.layer1 = FSDP(model.layer1, sharding_strategy=sharding_strategy) + model.layer2 = FSDP(model.layer2, sharding_strategy=sharding_strategy) + fsdp_model = FSDP(model, sharding_strategy=sharding_strategy) + return fsdp_model.to(device) + + +class TestFSDPExecOrder(FSDPTest): + @property + def device(self): + return torch.device("cuda") + + @skip_if_lt_x_gpu(2) + @parametrize( + "sharding_strategy", + [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP], + ) + def test_invalid_first_iter_order( + self, + sharding_strategy: ShardingStrategy, + ): + """Tests that FSDP errors if the all-gather order differs across ranks + in the first iteration.""" + # Rank 0 runs the forward pass in one order and all other ranks run in + # different order + fsdp_model = Model.wrap(sharding_strategy, self.device) + if self.rank != 0: + fsdp_model.flip_path() + inp = fsdp_model.module.get_input(self.device) + # Match the error message with the following prefix + error_regex = "^(Forward order differs across ranks)" + with self.assertRaisesRegex(RuntimeError, error_regex): + fsdp_model(*inp) + + @skip_if_lt_x_gpu(2) + @parametrize( + "sharding_strategy", + [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP], + ) + @parametrize("iters_before_path_change", [1, 3]) + def test_invalid_later_iter_order( + self, + sharding_strategy: ShardingStrategy, + iters_before_path_change: int, + ): + """Tests that FSDP warns the user if the all-gather order changes after + the first iteration.""" + # On the first iteration, all ranks run the same order, and on the next + # iteration, all but rank 0 run in a different order + fsdp_model = Model.wrap(sharding_strategy, self.device) + for _ in range(iters_before_path_change): + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + loss = 
fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + # Match the warning message with the following prefix + regex = "^(Forward order differs from that of the first iteration " \ + f"on rank {self.rank} -- collectives are unchecked and may give " \ + "incorrect results or hang)" + context = self.assertWarnsRegex( + expected_warning=UserWarning, expected_regex=regex, + ) if self.rank != 0 else suppress() + if self.rank != 0: + fsdp_model.flip_path() + inp = fsdp_model.module.get_input(self.device) + # Expect a warning for the forward pass all-gather + with context: # warning for forward pass all-gather + output = fsdp_model(*inp) + loss = fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + # Run an additional iteration to check that there are no more warnings + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + loss = fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + + @skip_if_lt_x_gpu(2) + @parametrize( + "sharding_strategy", + [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP], + ) + def test_train_eval(self, sharding_strategy: ShardingStrategy): + fsdp_model = Model.wrap(sharding_strategy, self.device) + NUM_ITERS = 3 + NUM_EPOCHS = 2 + with warnings.catch_warnings(record=True) as w: # records warnings to `w` + for _ in range(NUM_EPOCHS): + fsdp_model.train() + for _ in range(NUM_ITERS): + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + loss = fsdp_model.module.get_loss(inp, output).to(self.device) + fsdp_model.module.run_backward(loss) + fsdp_model.eval() + for _ in range(NUM_ITERS): + inp = fsdp_model.module.get_input(self.device) + output = fsdp_model(*inp) + fsdp_model.module.get_loss(inp, output).to(self.device) + # Check that the order validation warning was not issued (errors do not + # need to be checked since they will be directly reported) + warning_prefix = "Forward order differs" + for warning in w: + if str(warning.message).startswith(warning_prefix): + raise AssertionError(f"Warning was incorrectly issued: {warning.message}") + # If we still validate the forward execution order in eval mode, then + # an `AssertionError` will be raised above for both sharding strategies + + +instantiate_parametrized_tests(TestFSDPExecOrder) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_freezing_weights.py b/test/distributed/fsdp/test_fsdp_freezing_weights.py index 6a45ff9039db..9a92d9d0f546 100644 --- a/test/distributed/fsdp/test_fsdp_freezing_weights.py +++ b/test/distributed/fsdp/test_fsdp_freezing_weights.py @@ -143,7 +143,7 @@ def _dist_train( optimizer.step() if with_fsdp: - get_full_params(model) + return get_full_params(model) return list(model.parameters()) diff --git a/test/distributed/fsdp/test_fsdp_grad_acc.py b/test/distributed/fsdp/test_fsdp_grad_acc.py new file mode 100644 index 000000000000..f2569266c347 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_grad_acc.py @@ -0,0 +1,261 @@ +# Owner(s): ["oncall: distributed"] + +import contextlib +import itertools +import sys +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import CPUOffload +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch +from 
torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +@dataclass +class _GradAccConfig: + """ + This configures how gradients are accumulated in :meth:`_test_grad_acc`. + Each instance of this class represents ``num_iters``-many consecutive + iterations, where the ``no_sync()`` context manager is used or not as given + by ``use_no_sync``. + + Attributes: + use_no_sync (bool): Indicates whether to use the ``no_sync()`` context + manager as the way to accumulate gradients. + num_iters (int): Number of iterations to accumulate gradients. + """ + use_no_sync: bool + num_iters: int + + def __repr__(self) -> str: + # Override to remove any spaces in the string to appease the internal + # build's test name parser + return ( + f"(use_no_sync={self.use_no_sync}," + f"num_iters={self.num_iters})" + ) + + +@dataclass +class _GradAccConfigs: + """ + This wraps a :class:`list` of :class:`_GradAccConfig` instances with the + sole purpose of overriding :meth:`__repr__` to remove spaces. + """ + configs: List[_GradAccConfig] + + def __repr__(self) -> str: + # Override to remove any spaces in the string to appease the internal + # build's test name parser + return ( + "[" + ",".join(config.__repr__() for config in self.configs) + "]" + ) + + +class TestGradAcc(FSDPTest): + """Tests ``FullyShardedDataParallel``'s gradient accumulation via both its + ``no_sync()`` context manager and without the context manager.""" + + def _test_grad_acc( + self, + batch_dim: int, + configs: List[_GradAccConfig], + cpu_offload: CPUOffload, + backward_prefetch: Optional[BackwardPrefetch], + ): + """ + Tests gradient accumulation by comparing a run that trains sequentially + through some batches while accumulating gradients with a run that + trains on the concatenation of those batches in a single iteration. + + The last iteration always synchronizes gradients regardless of what is + specified by the last element of ``configs``. + + Arguments: + batch_dim (int): Batch dimension in the input tensor to be passed + into the model for the forward pass. + configs (List[_GradAccConfig]): :class:`list` of configurations + specifying how gradients are accumulated; for example, a list + corresponding to [(False, 2), (True, 2), (False, 2)] indicates + to accumulate over 2 + 2 + 2 = 6 total iterations, where the + first two do not use ``no_sync()``, the middle two do use + ``no_sync()``, and the final two again do not use + ``no_sync()``. + cpu_offload (CPUOffload): Configures CPU offloading. + backward_prefetch (Optional[BackwardPrefetch]): Specifies at which + point to prefetch the next layer's full parameters during the + backward pass, if at all. 
+ """ + # Gradient accumulation outside `no_sync()` is not currently compatible + # with CPU offloading + if cpu_offload.offload_params and \ + any(not config.use_no_sync for config in configs): + return + old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32 + try: + # Disable TF32 to prevent floating point drift + torch.backends.cuda.matmul.allow_tf32 = False + + # Initialize the FSDP model and optimizer + group = dist.distributed_c10d._get_default_group() + fsdp_model: FSDP = self._get_wrapped_model( + group, cuda_first=False, add_bn=False, + config={ + "cpu_offload": cpu_offload, + "backward_prefetch": backward_prefetch, + }, + ) # disable BN since the test uses varying batch sizes + fsdp_model.eval() # disable dropout + device = torch.device("cuda") + optim = torch.optim.SGD( + fsdp_model.parameters(), lr=0.01, momentum=0.9, + ) + + # Generate the sequence of batches, each containing the same data + # but permuted + def permute_tensor(x: torch.Tensor): + return x.view(-1)[torch.randperm(x.numel())].view_as(x) + + batch: Tuple[torch.Tensor, ...] = \ + fsdp_model.module.get_input(device) + batches: List[Tuple[torch.Tensor, ...]] = [batch] + num_iters_to_acc = sum(config.num_iters for config in configs) + for _ in range(num_iters_to_acc - 1): + batches.append(tuple(permute_tensor(t) for t in batch)) + for (batch1, batch2) in itertools.combinations(batches, r=2): + for t1, t2 in zip(batch1, batch2): + assert not torch.all(t1 == t2), \ + "Check the test to make sure that batches are distinct" + + # Concatenate the batches along the given batch dimension + concat_batch: Tuple[torch.Tensor, ...] = tuple( + torch.cat(ts, dim=batch_dim) for ts in zip(*batches) + ) + + # Establish reference gradients using the concatenated batch + fsdp_model.zero_grad() + output = fsdp_model(*concat_batch) + ref_loss = fsdp_model.module.get_loss(concat_batch, output) + ref_loss.backward() + ref_grads = [ + p.grad.detach().clone() for p in fsdp_model.parameters() + ] + + # Compute and accumulate the gradients + fsdp_model.zero_grad() + losses = [] + batch_idx = 0 + for config in configs: + sync_context = fsdp_model.no_sync() if config.use_no_sync \ + else contextlib.suppress() + with sync_context: + for _ in range(config.num_iters): + if batch_idx == num_iters_to_acc - 1: + break # always sync on the last iteration + batch = batches[batch_idx] + batch_idx += 1 + output = fsdp_model(*batch) + loss = fsdp_model.module.get_loss(batch, output) + loss.backward() + losses.append(loss) + output = fsdp_model(*batches[-1]) + loss = fsdp_model.module.get_loss(batches[-1], output) + loss.backward() + losses.append(loss) + acc_loss = sum(losses) + acc_grads = [ + p.grad.detach().clone() for p in fsdp_model.parameters() + ] + + # Compare the losses and gradients + torch.testing.assert_close(ref_loss, acc_loss) + self.assertEqual(len(ref_grads), len(acc_grads)) + for ref_grad, acc_grad in zip(ref_grads, acc_grads): + self.assertEqual(ref_grad.device, acc_grad.device) + self.assertEqual(ref_grad.size(), acc_grad.size()) + self.assertEqual(ref_grad.dtype, acc_grad.dtype) + torch.testing.assert_close(ref_grad, acc_grad) + + # Check that the optimizer step does not error + optim.step() + finally: + torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32 + + @skip_if_lt_x_gpu(2) + @parametrize( + "configs", + [ + _GradAccConfigs([ + _GradAccConfig(use_no_sync=True, num_iters=3), + _GradAccConfig(use_no_sync=False, num_iters=3), + _GradAccConfig(use_no_sync=True, num_iters=3), + ]), + _GradAccConfigs([ + 
_GradAccConfig(use_no_sync=False, num_iters=3), + _GradAccConfig(use_no_sync=True, num_iters=3), + _GradAccConfig(use_no_sync=False, num_iters=3), + ]), + ] + ) + @parametrize( + "cpu_offload", + [CPUOffload(offload_params=False), CPUOffload(offload_params=True)], + ) + @parametrize( + "backward_prefetch", + [BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST, None], + ) + def test_grad_acc( + self, + configs: _GradAccConfigs, + cpu_offload: CPUOffload, + backward_prefetch: Optional[BackwardPrefetch], + ): + """ + Tests gradient accumulation. + + This exercises gradient accumulation inside and outside the + ``no_sync()`` context manager, in particular by interleaving the two. + It tests both interleaving starting with (and ending with, resp.) + inside versus outside ``no_sync()`` to ensure that initial conditions + (and final conditions, resp.) do not affect the correctness. This test + also checks for compatibility with the CPU offload and backward + prefetch options. + + NOTE: Gradient accumulation without using the ``no_sync()`` context + manager is not currently compatible with CPU offloading, so those tests + are vacuous. + """ + self._test_grad_acc( + batch_dim=1, + configs=configs.configs, + cpu_offload=cpu_offload, + backward_prefetch=backward_prefetch, + ) + + +instantiate_parametrized_tests(TestGradAcc) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_ignored_modules.py b/test/distributed/fsdp/test_fsdp_ignored_modules.py new file mode 100644 index 000000000000..6c653b92ece4 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_ignored_modules.py @@ -0,0 +1,136 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class Model(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.layer0 = torch.nn.Linear(3, 5) + self.layer1 = torch.nn.Sequential( + torch.nn.Linear(5, 5), + torch.nn.Linear(5, 4), + torch.nn.Linear(4, 4), + ) + self.layer2 = torch.nn.Linear(4, 1) + + def forward(self, x): + return self.layer2(self.layer1(self.layer0(x))) + + def get_input(self, device): + return (torch.randn((8, 3)).to(device),) + + def get_loss(self, input, output): + return output.sum() + + def run_backward(self, loss): + loss.backward() + +class TestFSDPIgnoredModules(FSDPTest): + @skip_if_lt_x_gpu(2) + def test_ignored_modules_transformer(self): + """Tests that ignored modules' parameters are not flattened for a + transformer model with shared parameters.""" + # Initialize an FSDP-wrapped transformer model that has FSDP ignore + # the `nn.Transformer` module's parameters + group = dist.distributed_c10d._get_default_group() + wrapped_model = self._get_wrapped_model(group, ignore_modules=True) + # Check that the wrapped model's flattened parameter does not include + # the ignored transformer module's parameters + nonwrapped_model = self._get_nonwrapped_model(group) + total_numel = 
sum(p.numel() for p in nonwrapped_model.parameters()) + ignored_numel = sum( + p.numel() for p in nonwrapped_model.transformer.parameters() + ) + nonignored_numel = total_numel - ignored_numel + with FSDP.summon_full_params(wrapped_model): + flat_param_numel = wrapped_model.params[0].numel() + self.assertEqual(flat_param_numel, nonignored_numel) + # Check that we can run a few iterations + device = torch.device("cuda") + optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3) + for _ in range(3): + inp = wrapped_model.module.get_input(device) + output = wrapped_model(*inp) + loss = wrapped_model.module.get_loss(inp, output).to(device) + wrapped_model.module.run_backward(loss) + optim.step() + + @skip_if_lt_x_gpu(2) + def test_ignored_modules_nested(self): + """Tests that passing a module with nested FSDP modules does not + error and still ignores non-FSDP modules' parameters.""" + # Initialize an FSDP-wrapped nested model that first wraps the nested + # sequential's middle linear layer (`layer1[1]`) and then wraps the + # overall model while ignoring the nested sequential (`layer1`) + model = Model().cuda() + model.layer1[1] = FSDP(model.layer1[1]) + wrapped_model = FSDP(model, ignored_modules=[model.layer1]) + # Check that the wrapped model's flattened parameter does not include + # the ignored nested sequential's parameters + nonwrapped_model = Model() + total_numel = sum(p.numel() for p in nonwrapped_model.parameters()) + ignored_numel = sum( + p.numel() for p in nonwrapped_model.layer1.parameters() + ) + nonignored_numel = total_numel - ignored_numel + with FSDP.summon_full_params(wrapped_model): + flat_param_numel = wrapped_model.params[0].numel() + self.assertEqual(flat_param_numel, nonignored_numel) + # Check that we can run a few iterations + device = torch.device("cuda") + optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3) + for _ in range(3): + inp = wrapped_model.get_input(device) + output = wrapped_model(*inp) + loss = wrapped_model.get_loss(inp, output).to(device) + wrapped_model.run_backward(loss) + optim.step() + + @skip_if_lt_x_gpu(2) + def test_ignored_modules_invalid(self): + """Tests that passing an FSDP module as an ignored module or the + top-level module itself errors.""" + model = Model() + model.layer1 = FSDP(model.layer1) + # Passing an FSDP module as an ignored module should error + with self.assertRaises( + ValueError, + msg="`ignored_modules` should not include FSDP modules", + ): + FSDP(model, ignored_modules=[model.layer1]) + with self.assertRaises( + ValueError, + msg="Trying to ignore the top-level module passed into the FSDP " + "constructor itself will result in all parameters being ignored " + "and is not supported", + ): + FSDP(model, ignored_modules=[model]) + + +instantiate_parametrized_tests(TestFSDPIgnoredModules) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_meta.py b/test/distributed/fsdp/test_fsdp_meta.py new file mode 100644 index 000000000000..1aa426800db6 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_meta.py @@ -0,0 +1,328 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.wrap import always_wrap_policy as always_wrap +from torch.distributed.fsdp.wrap import wrap, enable_wrap +from torch.testing._internal.common_fsdp import ( + FSDPTest, +) +from torch.testing._internal.common_utils import ( + 
TEST_WITH_DEV_DBG_ASAN, + run_tests, + parametrize, + instantiate_parametrized_tests, + sandcastle_skip_if, +) +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) + +_TORCHDISTX_AVAIL = True +try: + from torchdistx import deferred_init +except ImportError: + _TORCHDISTX_AVAIL = False + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +def _reset_params_if_meta(is_meta, model): + # For torchdistX init, we don't need to call reset_params, as + # deferred_init(model).materialize() is equivalent to model(). + if is_meta: + model.reset_parameters() + +class MyLinear(nn.Linear): + """ + Linear layer with deterministic reset_parameters for testing. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def reset_parameters(self, *args, **kwargs): + with torch.no_grad(): + self.weight.fill_(1) + +class MyModel(nn.Module): + def __init__(self, device): + super().__init__() + self.lin1 = MyLinear(2, 2, bias=False, device=device) + self.lin2 = MyLinear(2, 2, bias=False, device=device) + + def forward(self, x): + return self.lin2(self.lin1(x)) + + def reset_parameters(self, *args, **kwargs): + for m in [self.lin1, self.lin2]: + if not isinstance(m, FSDP): + m.reset_parameters() + + +class NestedModel(nn.Module): + def __init__(self, device): + super().__init__() + self.lin1 = MyLinear(2, 2, bias=False, device=device) + self.lin1 = wrap(self.lin1) + self.lin2 = MyLinear(2, 2, bias=False, device=device) + self.l3 = MyModel(device=device) + self.l3 = wrap(self.l3) + + def forward(self, x): + return self.l3(self.lin2(self.lin1(x))) + + def reset_parameters(self): + for m in [self.lin1, self.lin2, self.l3]: + if not isinstance(m, FSDP): + m.reset_parameters() + +def _init_with_reset_params(module): + """ + to_empty + reset_parameters() init function example for modules + initialized with device="meta" + """ + is_meta = any(t.is_meta for t in module.parameters()) + if is_meta: + module.to_empty(device=torch.cuda.current_device()) + with torch.no_grad(): + module.reset_parameters() + +def _init_with_torchdistX(module): + """ + torchdistX-based deferred module initialization function example + using ``materialize_module``. + """ + assert _TORCHDISTX_AVAIL + + def check_fn(k): + return not isinstance(k, FSDP) + + deferred_init.materialize_module(module, check_fn=check_fn) + +class TestFSDPWithMetaDevice(FSDPTest): + @property + def world_size(self): + return 2 + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + def _compare_fsdp(self, fsdp1, fsdp2): + with FSDP.summon_full_params(fsdp1): + with FSDP.summon_full_params(fsdp2): + for p1, p2 in zip(fsdp1.parameters(), fsdp2.parameters()): + self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}") + + def _test_simple_model_with_meta_device(self, meta_module_fn, init_fn=None): + # Create model on meta device and wrap with FSDP. + model = meta_module_fn() + is_meta = next(model.parameters()).is_meta + fsdp_meta = FSDP( + model, + auto_wrap_policy=always_wrap, + param_init_fn=init_fn, + ) + + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + + # Test to make sure it is the same model parameters as regular FSDP + # approach.
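# [Editor's note] Illustrative sketch, not part of the original diff: the
# meta-device pattern this test exercises, written as a user would. It assumes
# an initialized process group and reuses `MyModel`, `always_wrap`, and
# `_init_with_reset_params` from this file; the helper name is hypothetical.
def _editor_sketch_meta_device_init() -> FSDP:
    # Constructing on the meta device allocates no real storage; FSDP then
    # materializes and initializes the parameters via `param_init_fn` before
    # sharding them.
    meta_model = MyModel(device="meta")
    return FSDP(
        meta_model,
        auto_wrap_policy=always_wrap,
        param_init_fn=_init_with_reset_params,
    )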
+ regular = MyModel(device="cuda") + _reset_params_if_meta(is_meta, regular) + fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + + self._compare_fsdp(fsdp_meta, fsdp_regular) + inp = torch.randn(10, 2, device='cuda') + fsdp_meta(inp).sum().backward() + fsdp_regular(inp).sum().backward() + meta_opt.step() + regular_opt.step() + self._compare_fsdp(fsdp_meta, fsdp_regular) + + # Test that meta init works if all submodules are contained in only a + # single FSDP unit. + model = meta_module_fn() + fsdp_meta = FSDP(model, param_init_fn=init_fn) + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + regular = MyModel(device="cuda") + _reset_params_if_meta(is_meta, regular) + fsdp_regular = FSDP(regular, auto_wrap_policy=always_wrap) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + + # Run a forward + backward pass + optimizer step + fsdp_meta(inp).sum().backward() + fsdp_regular(inp).sum().backward() + meta_opt.step() + regular_opt.step() + self._compare_fsdp(fsdp_meta, fsdp_regular) + + @skip_if_lt_x_gpu(2) + def test_simple_model_with_meta_device_reset_params(self): + def meta_module_fn(): + return MyModel(device="meta") + self._test_simple_model_with_meta_device( + meta_module_fn, _init_with_reset_params + ) + + @skip_if_lt_x_gpu(2) + def test_simple_model_with_meta_device_default_init(self): + def meta_module_fn(): + return MyModel(device="meta") + self._test_simple_model_with_meta_device(meta_module_fn) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + def test_simple_model_with_torchdistX_default_init(self): + def meta_module_fn(): + return deferred_init.deferred_init(MyModel, device="cuda") + + self._test_simple_model_with_meta_device(meta_module_fn) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + def test_simple_model_with_torchdistX_init_fn(self): + def meta_module_fn(): + return deferred_init.deferred_init(MyModel, device="cuda") + + self._test_simple_model_with_meta_device(meta_module_fn, init_fn=_init_with_torchdistX) + + def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn=None): + if auto_wrap: + module = meta_module_fn() + is_meta = next(module.parameters()).is_meta + fsdp_meta = FSDP( + module, + auto_wrap_policy=always_wrap, + param_init_fn=init_fn, + ) + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + module_regular = NestedModel(device="cuda") + _reset_params_if_meta(is_meta, module_regular) + fsdp_regular = FSDP( + module_regular, + auto_wrap_policy=always_wrap, + ) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + else: + with enable_wrap( + wrapper_cls=FSDP, param_init_fn=init_fn, + ): + module = meta_module_fn() + is_meta = next(module.parameters()).is_meta + # Non FSDP modules will still be initialized because they bubble up + # to be part of a larger FSDP unit. + fsdp_meta = wrap(module) + meta_opt = torch.optim.SGD(fsdp_meta.parameters(), lr=1e-3) + + # Init and reset parameters before wrapping so that reset_params + # matches up with meta device's initialization. 
+ module_regular = NestedModel(device="cuda") + _reset_params_if_meta(is_meta, module_regular) + with enable_wrap(wrapper_cls=FSDP): + module_regular.lin1 = wrap(module_regular.lin1) + module_regular.l3 = wrap(module_regular.l3) + fsdp_regular = wrap(module_regular) + regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3) + + # Compare it before training + self._compare_fsdp(fsdp_meta, fsdp_regular) + inp = torch.randn(10, 2, device='cuda') + fsdp_meta(inp).sum().backward() + fsdp_regular(inp).sum().backward() + meta_opt.step() + regular_opt.step() + self._compare_fsdp(fsdp_meta, fsdp_regular) + + @skip_if_lt_x_gpu(2) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_meta_device_reset_params(self, auto_wrap): + def meta_module_fn(): + return NestedModel(device="meta") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_reset_params + ) + + @skip_if_lt_x_gpu(2) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_meta_device_default_init(self, auto_wrap): + def meta_module_fn(): + return NestedModel(device="meta") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, + ) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_torchdistX_default_init(self, auto_wrap): + def meta_module_fn(): + return deferred_init.deferred_init(NestedModel, device="cuda") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn + ) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + @parametrize("auto_wrap", [True, False]) + def test_nested_model_with_torchdistX_init_fn(self, auto_wrap): + def meta_module_fn(): + return deferred_init.deferred_init(NestedModel, device="cuda") + + self._test_nested_model_with_meta_device( + auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_torchdistX, + ) + + def _test_bad_arg(self, meta_module_fn): + mod = meta_module_fn() + with self.assertRaisesRegex(ValueError, "to be callable"): + FSDP(mod, param_init_fn=42) + + @skip_if_lt_x_gpu(2) + @sandcastle_skip_if( + not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX" + ) + def test_bad_arg_torchdistx(self): + def meta_module_fn(): + return deferred_init.deferred_init(NestedModel, "cuda") + + self._test_bad_arg(meta_module_fn) + + @skip_if_lt_x_gpu(2) + def test_bad_arg_meta(self): + def meta_module_fn(): + return NestedModel(device="meta") + + self._test_bad_arg(meta_module_fn) + + +instantiate_parametrized_tests(TestFSDPWithMetaDevice) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py new file mode 100644 index 000000000000..4d486d7b0407 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_misc.py @@ -0,0 +1,253 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from contextlib import suppress +import functools + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) +from 
torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, + FSDPInitMode, + TransformerWithSharedParams, + _validate, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestFSDPMisc(FSDPTest): + @property + def world_size(self): + return 2 + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + @skip_if_lt_x_gpu(2) + def test_device_id_auto_wrap(self): + """ + Test auto wrapping propagates the device id. + """ + model = TransformerWithSharedParams(group=self.process_group) + my_auto_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer} + ) + wrapped = FSDP( + model, + auto_wrap_policy=my_auto_wrap_policy, + device_id=torch.cuda.current_device() + ) + # All FSDP instances should have device_id set + for m in FSDP.fsdp_modules(wrapped): + self.assertEqual(m.device_id, torch.device("cuda", torch.cuda.current_device())) + + @skip_if_lt_x_gpu(2) + @parametrize("use_index", [True, False]) + def test_fsdp_device_id(self, use_index): + """ + If CPU module is passed into FSDP with device_id + argument, it is moved to the GPU with that device_id. + """ + dev_id = ( + torch.cuda.current_device() if use_index + else torch.device("cuda", torch.cuda.current_device()) + ) + + def _check_device_matches(fsdp, dev_id): + devices = {p.device for p in fsdp.parameters()} + self.assertEqual(1, len(devices)) + found_dev = devices.pop() + if use_index and not isinstance(dev_id, torch.device): + dev_id = torch.device("cuda", dev_id) + self.assertEqual(found_dev, dev_id) + + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_NEVER, + device_id=dev_id + ) + fsdp = FSDP(mod, device_id=dev_id) + # Check FSDP parameters are moved. + _check_device_matches(fsdp, dev_id) + # device_id matching module device before FSDP construction + # should not throw errors. + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + device_id=dev_id + ) + fsdp = FSDP(mod, device_id=dev_id) + _check_device_matches(fsdp, dev_id) + # Passing in torch.device("cuda") should work. + regex = "does not have explicit index" + context = self.assertWarnsRegex( + expected_warning=UserWarning, expected_regex=regex + ) + with context: + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + device_id=torch.device("cuda") + ) + fsdp = FSDP(mod, device_id=torch.device("cuda")) + _check_device_matches(fsdp, torch.device("cuda", torch.cuda.current_device())) + + @skip_if_lt_x_gpu(2) + def test_module_device_mismatches_device_id(self): + """ + FSDP raises errors when module is on a GPU that does + not match device_id. 
+ """ + context = ( + self.assertRaisesRegex( + RuntimeError, + f"on rank {self.rank}.*cuda:0, but is on cuda:{self.rank}" + ) if self.rank != 0 else suppress() + ) + with context: + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + # Would move module to current cuda device before + # wrapping with FSDP + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + # Rank 1 is given device id 0, but model is on cuda:1, + # should throw errors. + device_id=0 + ) + + @skip_if_lt_x_gpu(2) + def test_multi_device_not_supported(self): + """ + FSDP throws appropriate error when we wrap multi-device module. + """ + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.a = nn.Linear(1, 1).cuda() + self.b = nn.Linear(1, 1) + + with self.assertRaisesRegex( + RuntimeError, "FSDP only supports single device modules" + ): + FSDP(MyModule()) + + @skip_if_lt_x_gpu(2) + def test_no_params(self): + """ + Test that device_id and cpu init work if module has no params + (they are effective noops, but ensure FSDP does not assume module + has parameters during init) + """ + # Test CPU + no_params = nn.ReLU() + module = FSDP(no_params) + # Test CUDA + no_params = nn.ReLU().cuda() + module = FSDP(no_params) + # Test CPU + device_id + no_params = nn.ReLU() + module = FSDP(no_params, device_id=torch.cuda.current_device()) + # For modules with no params, wrong device_id will raise error about + # inconsistency between compute_device and device_id, since compute_device + # is computed as torch.cuda.current_device when there are no params. + no_params = nn.ReLU().cuda() + context = ( + self.assertRaisesRegex( + AssertionError, + f"Inconsistent.*cuda:{self.rank} vs cuda:0" + ) + ) if self.rank != 0 else suppress() + with context: + module = FSDP(no_params, device_id=0) + + @skip_if_lt_x_gpu(2) + def test_fsdp_cpu_init_stays_on_cpu(self): + """ + Ensure that CPU model input stays on CPU + after FSDP init even though sharding, flattening + is run on GPU. + """ + torch.cuda.set_device(self.rank) + regex = "Module is input on CPU" + context = self.assertWarnsRegex( + expected_warning=UserWarning, expected_regex=regex + ) + with context: + mod = NestedWrappedModule( + group=self.process_group, + wrap_fsdp=True, + wrap_everything=True, + fsdp_init_mode=FSDPInitMode.CUDA_NEVER, + ) + fsdp = FSDP(mod) + devices = {p.device for p in fsdp.parameters()} + self.assertEqual(1, len(devices)) + self.assertEqual(torch.device("cpu"), devices.pop()) + fsdp = fsdp.cuda() + # Ensure fwd + backward can be performed after moving to CUDA. + # CPU input also tests that input is correctly moved to appropriate + # CUDA device. + inp = mod.get_input(device=torch.device("cpu")) + fsdp(inp[0]).sum().backward() + + @skip_if_lt_x_gpu(2) + def test_fsdp_same_model_across_ranks(self): + """ + FSDP broadcasts model from rank 0 to ensure it starts off with the same + values. + """ + class MyModel(nn.Module): + def __init__(self, rank): + super().__init__() + # Seed via rank to make model different across ranks + torch.manual_seed(rank) + torch.cuda.manual_seed(rank) + self.lin = nn.Linear(10, 10, bias=False) + self.register_buffer("buffer", torch.ones(1) * rank) + + m = MyModel(self.rank).cuda() + _validate(m, process_group=self.process_group, assert_fn=self.assertNotEqual) + # Passing sync_module_states into FSDP makes model the same during init. 
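# [Editor's note] Minimal sketch, not part of the original diff, of how
# `sync_module_states=True` is used outside of this test. It assumes a default
# process group has already been initialized (e.g. via dist.init_process_group);
# the helper name is hypothetical.
def _editor_sketch_sync_module_states(module: nn.Module) -> FSDP:
    # During FSDP construction, rank 0's parameters and buffers are broadcast
    # to all other ranks, so every rank starts from identical module states.
    return FSDP(module, sync_module_states=True)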
+ fsdp = FSDP(m, sync_module_states=True) + with fsdp.summon_full_params(fsdp): + _validate(fsdp, process_group=self.process_group, assert_fn=self.assertEqual) + +instantiate_parametrized_tests(TestFSDPMisc) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py new file mode 100644 index 000000000000..f0bac76fd1d0 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py @@ -0,0 +1,665 @@ +# Owner(s): ["oncall: distributed"] + +import sys +import contextlib +from functools import partial +from itertools import product + +import torch +import torch.cuda.nccl as nccl +import torch.nn as nn +import torch.nn.functional as F +from torch import distributed as dist +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + CPUOffload, + MixedPrecision, + BackwardPrefetch, + ShardingStrategy, +) +from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy +from torch.nn.modules.batchnorm import _BatchNorm +from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + subtest_name, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, + sandcastle_skip_if, +) +from torch.testing._internal.common_cuda import CUDA11OrLater + +try: + import torchvision + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False + +skipIfNoTorchVision = sandcastle_skip_if(not HAS_TORCHVISION, "no torchvision") + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +# Various mixed precision configs to test under. +default_mp = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + reduce_dtype=torch.float16, +) + +# Params and buffers are not cast, comm only happens +# in reduced precision. +mp_only_reduce = MixedPrecision(reduce_dtype=torch.float16) + +# Only parameters are cast (thus comm should happen in the param_dtype precision) +mp_only_param_and_buf = MixedPrecision(param_dtype=torch.float16, buffer_dtype=torch.float16) + +# Nothing is cast (thus param, comm, grad, and buffer should be in the full precision) +mp_no_mixed_precision = MixedPrecision() + +nccl_supports_bf16 = ( + CUDA11OrLater and dist.is_nccl_available() and nccl.version() >= (2, 10) +) + +mp_configs = [default_mp, mp_only_reduce, mp_only_param_and_buf, mp_no_mixed_precision] +if nccl_supports_bf16: + mp_diff_buffer_and_reduce = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.bfloat16, + reduce_dtype=torch.float32 + ) + mp_configs.extend([mp_diff_buffer_and_reduce]) + +# Buffer original dtype, which can differ from model params. 
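# [Editor's note] Illustrative sketch, not part of the original diff: how the
# `MixedPrecision` configs defined above are passed to FSDP, mirroring the
# usage later in this file. The wrapped module here is a hypothetical example,
# and a CUDA device plus an initialized process group are assumed. The
# `_BUFFER_ORIG_DTYPE` constant that the comment above refers to follows
# immediately after this sketch.
def _editor_sketch_mixed_precision_usage() -> FSDP:
    # Compute in fp16, keep buffers in fp16, and reduce gradients in fp16.
    mp = MixedPrecision(
        param_dtype=torch.float16,
        buffer_dtype=torch.float16,
        reduce_dtype=torch.float16,
    )
    return FSDP(nn.Linear(10, 10).cuda(), mixed_precision=mp)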
+_BUFFER_ORIG_DTYPE = torch.float64 + +params = "mp_config,cpu_offload,backward_prefetch,full_precision_param_dtype,sharded_grad_scaler" +cpu_offload_config = [ + CPUOffload(offload_params=True), CPUOffload(offload_params=False) +] +backward_prefetch_config = [ + BackwardPrefetch.BACKWARD_PRE, BackwardPrefetch.BACKWARD_POST +] +full_precision_param_dtype_config = [torch.float32, torch.float64] +sharded_grad_scaler = ["enable_sharded_grad_scaler", None] + +configs = list(product( + mp_configs, + cpu_offload_config, + backward_prefetch_config, + full_precision_param_dtype_config, + sharded_grad_scaler, +)) + +test_name_mapping = { + str(CPUOffload(offload_params=True)): "offload_true", + str(CPUOffload(offload_params=False)): "offload_false", + str(BackwardPrefetch.BACKWARD_PRE): "prefetch_pre", + str(BackwardPrefetch.BACKWARD_POST): "prefetch_post", + str(default_mp): "mp_fp16", + str(mp_only_reduce): "mp_only_reduce", + str(mp_only_param_and_buf): "mp_only_param_and_buf", + str(mp_no_mixed_precision): "mp_no_mp", + str(torch.float32): "fp32", + str(torch.float64): "fp64", + "enable_sharded_grad_scaler": "sharded_grad_scaler" +} + +if nccl_supports_bf16: + test_name_mapping.update({ + str(mp_diff_buffer_and_reduce): "mp_diff_buffer_reduce", + }) + +subtest_name = partial(subtest_name, test_name_mapping) + +_CURRENT_FULL_PRECISION_PARAM_DTYPE = None + +@contextlib.contextmanager +def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype): + """ + Patches dist._reduce_scatter_base with a new reduce_scatter_base and + restores upon exiting. Used for validation of mixed precision + """ + orig_reduce_scatter = dist._reduce_scatter_base + dist._reduce_scatter_base = new_reduce_scatter + global _CURRENT_FULL_PRECISION_PARAM_DTYPE + _CURRENT_FULL_PRECISION_PARAM_DTYPE = full_precision_param_dtype + try: + yield + finally: + dist._reduce_scatter_base = orig_reduce_scatter + _CURRENT_FULL_PRECISION_PARAM_DTYPE = None + +class LinearMixedPrecision(nn.Module): + """ + A linear module with extra checks for mixed precision training. + """ + def __init__(self, param_dtype): + super().__init__() + self.lin = nn.Linear(10, 10, bias=False).to(param_dtype) + self.register_buffer('buffer', torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE)) + self._orig_param_type = param_dtype + self._orig_buffer_dtype = _BUFFER_ORIG_DTYPE + + def forward(self, tup): + # Param and input should be the mixed precision type + inp, cls, fsdp, mp_config, full_precision_param_dtype = tup + expected_param_type = ( + mp_config.param_dtype if mp_config.param_dtype is not None + else self._orig_param_type + ) + expected_buffer_type = ( + mp_config.buffer_dtype if mp_config.buffer_dtype is not None + else self._orig_buffer_dtype + ) + cls.assertEqual(inp.dtype, expected_param_type) + # Buffer should be in specified precision as well. + cls.assertEqual(self.buffer.dtype, expected_buffer_type) + + # In FSDP, self.params should point to the right type. + num_active_fsdp = 0 + for fsdp_module in FSDP.fsdp_modules(fsdp): + fsdp_managed_params = fsdp_module.params + # Single param assumption + cls.assertEqual(1, len(fsdp_managed_params)) + for param in fsdp_managed_params: + # FSDP unit is currently active if it is not using the param + # local shard. This supports both FULL_SHARD and SHARD_GRAD_OP + # cases. In FULL_SHARD, we have the additional property that + # param._full_param_padded has not been freed. 
+ is_fsdp_unit_active = ( + param._is_sharded and + (param.data.data_ptr() != param._local_shard.data_ptr()) + ) + if is_fsdp_unit_active: + num_active_fsdp += 1 + # This FSDP unit is active; verify the param points to the mixed precision dtype + cls.assertEqual(param.dtype, expected_param_type) + # _rebuild_full_param should have also freed the fp16 shard. + # Shard is never allocated if param_dtype mixed precision is not + # enabled. + if mp_config.param_dtype is not None: + cls.assertEqual(0, param._mp_shard.storage().size()) + else: + cls.assertFalse(hasattr(param, '_mp_shard')) + elif param._is_sharded: + # This FSDP unit is not active as full param has been + # freed or not yet allocated. Ensure param points to full + # precision param. + cls.assertEqual(param.dtype, full_precision_param_dtype) + # We should have gotten at least one active FSDP unit for sharded + # (world size > 1) cases. For cases where param is not sharded + # (i.e. world_size == 1) it is a bit hard to check if FSDP unit is active + # as we'd always point to the local shard, so we rely on the forward + # pass self.lin(inp) working well and inp being reduced precision to + # implicitly validate that the param is indeed in the reduced precision. + if cls.world_size > 1: + cls.assertGreater(num_active_fsdp, 0) + + return (self.lin(inp), cls, fsdp, mp_config, full_precision_param_dtype) + + +class TestFSDPMixedPrecision(FSDPTest): + @property + def world_size(self): + raise ValueError("To be implemented by child classes") + + def _get_simple_nested_model(self, param_dtype, *fsdp_args, **fsdp_kwargs): + model = FSDP( + nn.Sequential( + FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs), + LinearMixedPrecision(param_dtype).cuda(), + ), + *fsdp_args, + **fsdp_kwargs, + ) + return model + + def _get_simple_model(self, param_dtype, *fsdp_args, **fsdp_kwargs): + model = FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs) + return model + + def _validate_no_mp_shard(self, fsdp_model): + """ + Validates that there is no mixed precision _mp_shard allocated + when it is not expected to be. + """ + fsdp_units = FSDP.fsdp_modules(fsdp_model) + for fsdp in fsdp_units: + for param in fsdp.params: + self.assertFalse(hasattr(param, '_mp_shard')) + + def _validate_mp_shard_freed(self, fsdp_model): + """ + Ensures that the mixed precision shard is freed for all FSDP units. + """ + fsdp_units = FSDP.fsdp_modules(fsdp_model) + for fsdp in fsdp_units: + for param in fsdp.params: + self.assertEqual(0, param._mp_shard.storage().size()) + + def _reduce_scatter_base_validate_mp( + self, + orig_reduce_scatter, + mp_config, + *args, + **kwargs + ): + """ + Performs dist._reduce_scatter_base but first verifies the mixed precision + settings. This is to test that mixed precision is working as expected + during the backward pass. In particular, it ensures that the gradients were + cast to the right type and that the communication happens in the right type. + """ + tensors = [] + for x in args: + if isinstance(x, torch.Tensor): + tensors.append(x) + for _, x in kwargs.items(): + if isinstance(x, torch.Tensor): + tensors.append(x) + + # reduce_dtype has higher priority than param_dtype, because mixed_precision + # supports overriding param_dtype with reduce_dtype to control the + # reduction precision. In the case where reduce_dtype == param_dtype + # this tests that gradients are in the expected precision as well. + # If reduce_dtype is not specified (is None) we comm. in the param_dtype + # if that is specified, otherwise full precision dtype.
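# [Editor's note] Worked illustration of the precedence described in the
# comment above (an editorial addition, not part of the original diff):
#   MixedPrecision(param_dtype=torch.float16, reduce_dtype=torch.float32)
#       -> gradients are reduced in float32
#   MixedPrecision(param_dtype=torch.float16)
#       -> gradients are reduced in float16 (falls back to param_dtype)
#   MixedPrecision()
#       -> gradients are reduced in the full precision parameter dtype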
+ expected_dtype = ( + mp_config.reduce_dtype if mp_config.reduce_dtype is not None + else ( + mp_config.param_dtype if mp_config.param_dtype is not None + else _CURRENT_FULL_PRECISION_PARAM_DTYPE + ) + ) + + # for t in tensors: + # print(f"tensor type {t.dtype} expected {expected_dtype}") + for t in tensors: + self.assertEqual(expected_dtype, t.dtype) + + return orig_reduce_scatter(*args, **kwargs) + + def _run_test_mixed_precision_e2e( + self, + mp_config, + cpu_offload, + backward_prefetch, + full_precision_param_dtype, + sharding_strategy, + sharded_grad_scaler, + ): + torch.cuda.set_device(self.rank) + fsdp_models = [ + self._get_simple_model( + param_dtype=full_precision_param_dtype, + sharding_strategy=sharding_strategy, + cpu_offload=cpu_offload, + mixed_precision=mp_config, + backward_prefetch=backward_prefetch + ), + self._get_simple_nested_model( + param_dtype=full_precision_param_dtype, + sharding_strategy=sharding_strategy, + cpu_offload=cpu_offload, + mixed_precision=mp_config, + backward_prefetch=backward_prefetch + ), + ] + for model in fsdp_models: + if not cpu_offload.offload_params: + model.cuda() + + # Patch reduce_scatter to add validation for mixed precision types. + orig_reduce_scatter = dist._reduce_scatter_base + test_reduce_scatter = partial( + self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config, + ) + with patch_reduce_scatter(test_reduce_scatter, full_precision_param_dtype): + scaler = ShardedGradScaler(enabled=sharded_grad_scaler) + optim = torch.optim.Adam(model.parameters()) + + for _ in range(3): + inp = torch.randn(3, 10, device='cuda', dtype=full_precision_param_dtype) + # Forward pass of LinearMixedPrecision check casting of + # inputs, params, buffers. + act, *_ = model( + (inp, self, model, mp_config, full_precision_param_dtype) + ) + # Buffers should be casted. + for buf in model.buffers(): + if mp_config.buffer_dtype is not None: + self.assertEqual(buf.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(buf.dtype, _BUFFER_ORIG_DTYPE) + # p._mp_shard should be freed. + if model.params[0]._is_sharded: # i.e. world_size > 1 + # TODO: free the mixed precision shard after forward + # when world_size == 1 as well, currently when + # world_size == 1 it is only freed after backward. + if mp_config.param_dtype is not None: + self._validate_mp_shard_freed(model) + else: + # We never should have allocated an _mp_shard. + self._validate_no_mp_shard(model) + + loss = act.sum() + loss = scaler.scale(loss) + if mp_config.param_dtype is not None: + self.assertEqual(loss.dtype, mp_config.param_dtype) + else: + self.assertEqual(loss.dtype, full_precision_param_dtype) + # Will run patched reduce scatter that validates mixed_precision + # types in backward. + loss.backward() + # Buffers stay casted even after backwards. + for buf in model.buffers(): + if mp_config.buffer_dtype is not None: + self.assertEqual(buf.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(buf.dtype, _BUFFER_ORIG_DTYPE) + # p._mp_shard should be freed. + if mp_config.param_dtype is not None: + self._validate_mp_shard_freed(model) + else: + self._validate_no_mp_shard(model) + + # Ensure params and grads are in full precision, + # as after fwd/backward we maintain full precision shards. 
+ for param in model.parameters(): + self.assertEqual(param.dtype, full_precision_param_dtype) + if param.grad is not None: + self.assertEqual(param.grad.dtype, full_precision_param_dtype) + + # Unscale the gradients and step + scaler.step(optim) + # Update the scale factor + scaler.update() + + # Summon full params should be in full precision + with model.summon_full_params(model): + # It is not expected for summon_full_params to allocate + # a mixed precision shard. + if mp_config.param_dtype is not None: + self._validate_mp_shard_freed(model) + else: + self._validate_no_mp_shard(model) + params = list(model.parameters()) + for p in params: + self.assertEqual(p.dtype, full_precision_param_dtype) + + # Note that buffers are cast only once and only restored + # to the original buffer dtype in state_dict, so + # summon_full_params is not expected to restore buffer + # types to their original. + named_buffers = dict(model.named_buffers()) + for v in named_buffers.values(): + if mp_config.buffer_dtype is not None: + self.assertEqual(v.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(v.dtype, _BUFFER_ORIG_DTYPE) + + # state_dict should be in full precision + state_dict = {k: v.clone() for k, v in model.state_dict().items()} + for name, tensor in state_dict.items(): + # Parameters and buffers are checkpointed in their + # original dtypes, which may be different. + if name in named_buffers.keys(): + self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE) + else: + self.assertEqual( + tensor.dtype, full_precision_param_dtype, + f"{name}: {tensor.dtype} vs {full_precision_param_dtype}" + ) + + # After state_dict, buffer's dtype should have been restored + # to the mixed precision one. + for buf in model.buffers(): + if mp_config.buffer_dtype is not None: + self.assertEqual(buf.dtype, mp_config.buffer_dtype) + else: + self.assertEqual(buf.dtype, _BUFFER_ORIG_DTYPE) + + +class TestFSDPMixedPrecisionSharded(TestFSDPMixedPrecision): + + @property + def world_size(self): + return 2 + + @skip_if_lt_x_gpu(2) + def test_mixed_precision_no_reshard_after_forward(self): + # Note that we don't exercise all possible different configs so as to + # not increase test TTS too much. + mp = default_mp if not nccl_supports_bf16 else mp_diff_buffer_and_reduce + self._run_test_mixed_precision_e2e( + mp_config=mp, + cpu_offload=CPUOffload(offload_params=True), + backward_prefetch=None, + full_precision_param_dtype=torch.float64, + sharding_strategy=ShardingStrategy.SHARD_GRAD_OP, + sharded_grad_scaler=False, + ) + + @skip_if_lt_x_gpu(2) + @parametrize(params, configs, subtest_name) + def test_mixed_precision_e2e_full_shard( + self, + mp_config, + cpu_offload, + backward_prefetch, + full_precision_param_dtype, + sharded_grad_scaler, + ): + self._run_test_mixed_precision_e2e( + mp_config, + cpu_offload, + backward_prefetch, + full_precision_param_dtype, + ShardingStrategy.FULL_SHARD, + sharded_grad_scaler, + ) + + def _test_mixed_precision_embedding_table(self, mp_config): + # Basic test to ensure int inputs are not casted which would break + # modules such as embedding tables. 
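# [Editor's note] Illustrative sketch, not part of the original diff: why the
# integer inputs must be left uncast. `nn.Embedding` requires integral (Long)
# indices, so casting the input batch to `param_dtype` would raise an error.
# The helper below is hypothetical and only demonstrates the constraint.
def _editor_sketch_embedding_needs_long_inputs():
    emb = nn.Embedding(num_embeddings=10, embedding_dim=4)
    idx = torch.randint(0, 10, (3,))  # dtype=torch.int64; must stay integral
    return emb(idx)  # casting `idx` to torch.float16 here would fail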
+ param_dtype = mp_config.param_dtype or torch.float32 + orig_reduce_scatter = dist._reduce_scatter_base + test_reduce_scatter = partial( + self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config, + ) + with patch_reduce_scatter(test_reduce_scatter, param_dtype): + model = self._get_wrapped_model( + group=torch.distributed.distributed_c10d._get_default_group(), + config={"mixed_precision": mp_config} + ) + optim = torch.optim.SGD(model.parameters(), lr=0.1) + for _ in range(6): + inp = model.module.get_input(torch.device("cuda")) + # This would fail if we casted integer module inputs such as for + # embedding tables. + output = model(*inp) + loss = model.module.get_loss(inp, output).cuda() + self.assertEqual(loss.dtype, param_dtype) + model.module.run_backward(loss) + optim.step() + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_reduce(self): + self._test_mixed_precision_embedding_table( + mp_config=MixedPrecision(reduce_dtype=torch.float16) + ) + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_only_params_and_bufs(self): + self._test_mixed_precision_embedding_table( + mp_config=MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + ) + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_default(self): + default_mp_config = MixedPrecision( + param_dtype=torch.float16, + buffer_dtype=torch.float16, + reduce_dtype=torch.float16, + ) + self._test_mixed_precision_embedding_table(mp_config=default_mp_config) + + @skip_if_lt_x_gpu(2) + def test_mp_embedding_params_and_reduce_diff(self): + params_and_reduce_different = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float32, + buffer_dtype=torch.float16 + ) + self._test_mixed_precision_embedding_table(mp_config=params_and_reduce_different) + + @skip_if_lt_x_gpu(2) + @skipIfNoTorchVision + def test_mixed_precision_resnet(self): + """ + End to end test to ensure mixed precision + auto_wrap works + for ResNet model. + """ + resnet_model = torchvision.models.resnet50().cuda() + resnet_model = nn.SyncBatchNorm.convert_sync_batchnorm( + resnet_model, + process_group=dist.distributed_c10d._get_default_group() + ) + n_bn = sum(1 if isinstance(x, _BatchNorm) else 0 for x in resnet_model.modules()) + inp = torch.ones(1, 3, 1000, 1000, device='cuda') + mp_config = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + fsdp = FSDP( + resnet_model, + auto_wrap_policy=size_based_auto_wrap_policy, + mixed_precision=mp_config + ) + # Batchnorm units should be wrapped individually. Validate this by + # ensuring there are equal no. of FSDP units that are BN as BN units + # in original resnet model. + fsdp_bn = 0 + for module in fsdp.fsdp_modules(fsdp): + wrapped_module = module.module.module + if isinstance(wrapped_module, _BatchNorm): + fsdp_bn += 1 + + self.assertEqual(fsdp_bn, n_bn) + # Would throw type mismatch issue without mixed precision autowrapping. 
+ loss = fsdp(inp).sum() + loss.backward() + + @skip_if_lt_x_gpu(2) + @parametrize("convert_sync_bn", [True, False]) + def test_mp_batchnorm(self, convert_sync_bn): + class BatchNormNet(nn.Module): + def __init__(self, affine=True): + super(BatchNormNet, self).__init__() + self.fc1 = nn.Linear(2, 40, bias=False) + self.bn = nn.BatchNorm1d(4, affine=affine) + self.fc2 = nn.Linear(40, 4, bias=False) + + def forward(self, x): + x = torch.reshape(self.fc1(x), (-1, 4, 10)) + x = self.bn(x) + x = torch.reshape(x, (-1, 40)) + x = self.fc2(x) + return F.softmax(x, dim=1) + + def never_wrap_policy(*args, **kwargs): + return False + + net = BatchNormNet().cuda() + if convert_sync_bn: + net = nn.SyncBatchNorm.convert_sync_batchnorm(net) + # FSDP detects that mixed precision + batchnorm will cause issues + # and thus wrap batchnorm in a distinct FSDP unit that does not + # use mixed precision. + mp_config = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + with self.assertWarnsRegex( + expected_warning=UserWarning, + expected_regex="BatchNorm units will be wrapped as a separate" + ): + model = FSDP( + net, + mixed_precision=mp_config, + auto_wrap_policy=never_wrap_policy, + ) + + bn = model.bn + self.assertTrue(isinstance(bn, FSDP)) + # policy should not have wrapped any other submodules + self.assertFalse(isinstance(model.fc1, FSDP)) + self.assertFalse(isinstance(model.fc2, FSDP)) + self.assertEqual(None, bn.mixed_precision) + self.assertNotEqual(None, model.mixed_precision) + + inp = torch.randn((1, 2), device='cuda') + # Without FSDP BN mixed precision fix, this would result in + # RuntimeError: Expected counts to have type Half but got Float + # for syncBN + model(inp).sum().backward() + + +class TestFSDPMixedPrecisionUnsharded(TestFSDPMixedPrecision): + """ + Smaller test suite for unshared param (i.e. world_size == 1) case. + """ + @property + def world_size(self): + return 1 + + @skip_if_lt_x_gpu(1) + def test_mixed_precision_no_reshard_after_forward(self): + # Note that we don't exercise all possible different configs so as to + # not increase test TTS too much. 
+ mp = default_mp if not nccl_supports_bf16 else mp_diff_buffer_and_reduce + self._run_test_mixed_precision_e2e( + mp_config=mp, + cpu_offload=CPUOffload(offload_params=True), + backward_prefetch=None, + full_precision_param_dtype=torch.float64, + sharding_strategy=ShardingStrategy.SHARD_GRAD_OP, + sharded_grad_scaler=False, + ) + + @skip_if_lt_x_gpu(1) + def test_mixed_precision_e2e_full_shard(self): + mp = default_mp if not nccl_supports_bf16 else mp_diff_buffer_and_reduce + self._run_test_mixed_precision_e2e( + mp_config=mp, + cpu_offload=CPUOffload(offload_params=True), + backward_prefetch=None, + full_precision_param_dtype=torch.float64, + sharding_strategy=ShardingStrategy.FULL_SHARD, + sharded_grad_scaler=False, + ) + +instantiate_parametrized_tests(TestFSDPMixedPrecisionSharded) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_multiple_forward.py b/test/distributed/fsdp/test_fsdp_multiple_forward.py index e0a3ccea16c9..c9afbd465f28 100644 --- a/test/distributed/fsdp/test_fsdp_multiple_forward.py +++ b/test/distributed/fsdp/test_fsdp_multiple_forward.py @@ -66,7 +66,7 @@ def _dist_train(self, wrap_fsdp): optim.zero_grad() if wrap_fsdp: - get_full_params(model) + return get_full_params(model) return list(model.parameters()) diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py new file mode 100644 index 000000000000..9a51405cfaeb --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_optim_state.py @@ -0,0 +1,774 @@ +# Owner(s): ["oncall: distributed"] + +import bisect +import sys +from enum import Enum, auto +from typing import Any, Dict, List, Tuple, Type + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + OptimStateKeyType, +) +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class _OSDCommMethod(Enum): + """Method for communicating the optimizer state dict for internal tests.""" + BROADCAST_OBJECT_LIST = auto() + SCATTER_FULL_OSD = auto() + + +class Bias(torch.nn.Module): + """This module applies a 1D additive bias with dimension ``dim``.""" + def __init__(self, dim: int) -> None: + super().__init__() + assert dim > 0 + torch.manual_seed(0) + self.bias = torch.nn.Parameter(torch.randn((dim,))) + + def forward(self, x): + return x + self.bias + + +class BlockA(torch.nn.Module): + """ + Used to define interesting nested structure for FSDP wrapping. 
+ BlockA + Bias0 + bias + weight + Bias1 + bias + """ + def __init__(self, in_dim: int, out_dim: int) -> None: + super().__init__() + assert all(v > 0 for v in (in_dim, out_dim)) + torch.manual_seed(0) + self.bias_module0 = Bias(out_dim) + self.weight = torch.nn.Parameter(torch.randn((in_dim, out_dim))) + self.bias_module1 = Bias(out_dim) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = x @ self.weight + x = self.bias_module0(x) + x = self.relu(x) # ensure biases have different gradients + x = self.bias_module1(x) + return x + +class BlockB(torch.nn.Module): + """ + Used to define interesting nested structure for FSDP wrapping. + BlockB + weight + Bias + bias + Bias + bias + """ + def __init__(self, in_dim: int, out_dim: int) -> None: + super().__init__() + assert all(v > 0 for v in (in_dim, out_dim)) + torch.manual_seed(0) + self.weight = torch.nn.Parameter(torch.randn((in_dim, out_dim))) + self.bias_module0 = Bias(out_dim) + self.bias_module1 = Bias(out_dim) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = x @ self.weight + x = self.bias_module0(x) + x = self.relu(x) # ensure biases have different gradients + x = self.bias_module1(x) + return x + + +class NestedModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.block0 = BlockB(5, 7) + self.block1 = BlockB(7, 7) + self.bias = torch.nn.Parameter(torch.randn((5,))) + self.block2 = torch.nn.Sequential( + BlockA(7, 9), + BlockA(9, 9), + BlockB(9, 5), + ) + self.relu = torch.nn.ReLU() + + def forward(self, x) -> torch.Tensor: + x = self.relu(self.block0(x)) + x = self.relu(self.block1(x)) + x = self.relu(self.block2(x)) + x = x + self.bias + return x + + def get_input(self, device): + BATCH_SIZE = 8 + return (torch.randn((BATCH_SIZE, 5)).to(device),) + + def get_loss(self, inp, output): + return output.sum() + + def run_backward(self, loss): + loss.backward() + + @staticmethod + def wrap(model, group=None) -> torch.nn.Module: + # Flatten Bias0; then flatten weight and Bias1 together into `block1` + model.block1.bias_module0 = FSDP( + model.block1.bias_module0, process_group=group, + ) + model.block1 = FSDP(model.block1, process_group=group) + # Flatten Bias0; flatten Bias1; then flatten weight into `block2[1]` + model.block2[1].bias_module0 = FSDP( + model.block2[1].bias_module0, process_group=group, + ) + model.block2[1].bias_module1 = FSDP( + model.block2[1].bias_module1, process_group=group, + ) + model.block2[1] = FSDP(model.block2[1], process_group=group) + # Flatten weight, Bias, bias into `block2[2]` + model.block2[2] = FSDP(model.block2[2], process_group=group) + return model + + @staticmethod + def wrap_alt(model, group=None) -> torch.nn.Module: + model.block0.bias_module0 = FSDP( + model.block0.bias_module0, process_group=group, + ) + model.block0 = FSDP(model.block0, process_group=group) + return model + + @staticmethod + def wrap_with_unmanaged_params( + model, + add_to_fsdp_module: bool, + group=None, + ) -> Tuple[torch.nn.Module, List[torch.nn.Parameter]]: + """Registers unmanaged parameters before wrapping with :meth:`wrap`.""" + device = next(model.parameters()).device + unmanaged_param = torch.nn.Parameter(torch.randn(5, 5, device=device)) + # Either register the parameter to a module to be wrapped with FSDP + # (`model.block2[2]`) or a module not to be wrapped with FSDP (`model`) + register_module = model.block2[2] if add_to_fsdp_module else model + register_module.register_parameter( + "unmanaged_param", unmanaged_param, + ) + # For simplicity, we only add a single 
unmanaged parameter, but should + # be easy to generalize if needed + return NestedModel.wrap(model, group), [unmanaged_param] + + @staticmethod + def add_unmanaged_param_entry(osd, unmanaged_param, step) -> None: + """Adds an entry for the unmanaged parameter ``unmanaged_param`` + assuming Adam optimizer and a single parameter group.""" + # The unmanaged parameters should be passed to this method in + # `model.parameters()` order since their parameter IDs will be assigned + # in order of the skipped IDs + # Assign a parameter ID to the unmanaged parameter + unmanaged_param_id = -1 + param_ids = osd["param_groups"][0]["params"] + for i in range(1, len(param_ids)): + diff = param_ids[i] - param_ids[i - 1] + if diff != 1: + assert diff > 1, f"Invalid IDs: {param_ids[i - 1]} {param_ids[i]}" + unmanaged_param_id = param_ids[i - 1] + 1 + break + if unmanaged_param_id == -1: + unmanaged_param_id = len(param_ids) # last ID skipped + assert unmanaged_param_id >= 0, "One parameter ID should be skipped" + # Add a state entry for the unmanaged parameter + state_device = next(iter(next(iter(osd["state"].values())).values())).device + osd["state"][unmanaged_param_id] = { + "step": torch.tensor(float(step), device=state_device), + "exp_avg": torch.randn(unmanaged_param.shape, device=state_device), + "exp_avg_sq": torch.randn(unmanaged_param.shape, device=state_device), + } + # Insert the ID into the parameter group in order + bisect.insort(osd["param_groups"][0]["params"], unmanaged_param_id) + + # NOTE: We exclude `self.bias` from either parameter group to test the + # case where the optimizer input does not include all model parameters + def param_group0(self) -> List[torch.nn.Parameter]: + # Use `block1`'s parameters for the first parameter group to deviate + # from the `model.parameters()` order + return list(self.block1.parameters()) + + def param_group1(self) -> List[torch.nn.Parameter]: + # Deviate from the `model.parameters()` order further by rearranging + # `block2`'s parameters to be before `block0`'s parameters + return list(self.block2.parameters()) + \ + list(self.block0.parameters()) + + +class TestFSDPOptimState(FSDPTest): + def _init_nested_model( + self, + wrap: bool, + wrap_alt: bool = False, # ignored if `wrap=False` + device: torch.device = torch.device("cuda"), + group=None, + optim_class: Type[torch.optim.Optimizer] = torch.optim.Adam, + use_multiple_param_groups: bool = False, + ): + model = NestedModel().to(device) + if wrap: + model = NestedModel.wrap_alt(model, group) if wrap_alt \ + else NestedModel.wrap(model, group) + if not use_multiple_param_groups: + optim_input = list(model.parameters()) + else: + optim_input = [ + {"params": model.param_group0()}, + {"params": model.param_group1(), "weight_decay": 0.9} + ] + optim = optim_class(optim_input, lr=0.01) + return model, optim, optim_input + + def _init_transformer_model( + self, + wrap: bool, + device: torch.device = torch.device("cuda"), + group=None, + optim_class: Type[torch.optim.Optimizer] = torch.optim.Adam, + use_multiple_param_groups: bool = False, + ): + assert not use_multiple_param_groups, \ + "Multiple parameter groups for the transformer is not implemented" + if group is None: + group = dist.distributed_c10d._get_default_group() + model = self._get_wrapped_model(group=group).to(device) if wrap \ + else self._get_nonwrapped_model(group=group).to(device) + model.eval() # disable dropout for determinism + optim = optim_class(model.parameters(), lr=0.01) + return model, optim, None + + def _step_model( + self, + 
model: torch.nn.Module, + optim: torch.optim.Optimizer, + device: torch.device = torch.device("cuda"), + num_iters: int = 1, + ) -> List[float]: + """Performs a forward pass, backward pass, and optimizer step + ``num_iters``-many times, and returns the per-iteration losses.""" + torch.manual_seed(0) # set seed for determinism + losses = [] + module = model.module if hasattr(model, "module") else model + for _ in range(num_iters): + inp = module.get_input(device) + output = model(*inp) + loss = module.get_loss(inp, output).to(device) + losses.append(loss.item()) + module.run_backward(loss) + optim.step() + return losses + + def _broadcast_full_osd(self, full_osd: Dict[str, Any], group=None): + """Broadcasts the full optimizer state dict in place of using + ``torch.save()`` and ``torch.load()`` so that all ranks can have it.""" + obj_list = [full_osd] + dist.broadcast_object_list( + obj_list, src=0, group=group, + ) + full_osd = obj_list[0] + return full_osd + + def _are_equal_states( + self, + state1: Dict[str, Any], + state2: Dict[str, Any], + ) -> bool: + """Checks if ``state1`` and ``state2`` contain the same mappings.""" + if set(state1.keys()) != set(state2.keys()): + return False + for state_name, value1 in state1.items(): + value2 = state2[state_name] + if type(value1) != type(value2): + return False + if torch.is_tensor(value1): # tensor state + assert torch.is_tensor(value2) + # Check the values on CPU to be device-agnostic + value1 = value1.cpu() + value2 = value2.cpu() + if value1.shape != value2.shape or \ + not torch.all(torch.isclose(value1, value2)): + return False + else: # non-tensor state + if value1 != value2: + return False + return True + + def _check_same_state( + self, + full_osd, + ref_osd, + check_same_param_keys: bool, + ): + """Checks that ``full_osd`` and ``ref_osd`` have the same "state" part. + If ``check_same_param_keys=True``, then checks that the parameter keys + match (e.g. when both should be parameter names), and does not check + the parameter keys otherwise.""" + assert "state" in ref_osd + self.assertTrue("state" in full_osd) + ref_osd_state = ref_osd["state"] + full_osd_state = full_osd["state"] + if check_same_param_keys: + # Check parameter keys are the same + ref_osd_param_ids = set(ref_osd_state.keys()) + full_osd_param_ids = set(full_osd_state.keys()) + self.assertTrue(ref_osd_param_ids == full_osd_param_ids) + for param_id, param_state in full_osd_state.items(): + for state_name, value in param_state.items(): + ref_value = ref_osd_state[param_id][state_name] + self.assertEqual(value, ref_value) + return + # Otherwise, only require the parameter keys to be isomorphic (e.g. + # between IDs and names) + ref_osd_states = list(ref_osd["state"].values()) + full_osd_states = list(full_osd["state"].values()) + assert len(ref_osd_states) == len(full_osd_states) + # Use brute-force quadratic-time comparison since it is hard to + # hash a tensor by value instead of by object + for full_osd_state in full_osd_states: + # Check for at least one match (may be > 1 in toy edge cases, e.g. + # multiple biases); nonetheless, each having >= 1 match and the two + # lists having equal length imply that the list contents are equal + self.assertTrue(any( + self._are_equal_states(full_osd_state, ref_osd_state) + for ref_osd_state in ref_osd_states + )) + + def _check_same_param_groups( + self, + full_osd, + ref_osd, + check_same_param_keys: bool, + ): + """Checks that ``full_osd`` and ``ref_osd`` have the same + "param_groups" part. 
If ``check_same_param_keys=True`, then checks that + the parameter keys match (e.g. when both should be parameter names), + and does not check the parameter keys otherwise.""" + assert "param_groups" in ref_osd + self.assertTrue("param_groups" in full_osd) + ref_osd_param_groups = ref_osd["param_groups"] + full_osd_param_groups = full_osd["param_groups"] + self.assertTrue(len(full_osd_param_groups), len(ref_osd_param_groups)) + if self.rank == 0: + for full_osd_pg, ref_osd_pg in zip( + full_osd_param_groups, ref_osd_param_groups, + ): + self.assertEqual( + set(full_osd_pg.keys()), set(ref_osd_pg.keys()), + ) + for name, full_osd_value in full_osd_pg.items(): + if name == "params" and not check_same_param_keys: + continue + self.assertEqual(full_osd_value, ref_osd_pg[name]) + + def _check_state_device(self, osd: Dict[str, Any], on_gpu: bool): + """Checks that all tensors in ``osd["state"]`` are on GPU if + ``on_gpu=True`` and on CPU if ``on_gpu=False``.""" + for param_state in osd["state"].values(): + for value in param_state.values(): + if torch.is_tensor(value): + if on_gpu: + self.assertTrue(value.is_cuda) + else: + self.assertFalse(value.is_cuda) + + @skip_if_lt_x_gpu(2) + @parametrize("use_multiple_param_groups", [False, True]) + @parametrize("rank0_only", [False, True]) + def test_full_optim_state_dict_nested( + self, + use_multiple_param_groups: bool, + rank0_only: bool, + ) -> None: + """ + Tests :meth:`full_optim_state_dict` by comparing the returned dict for + an FSDP-wrapped model with that of an equivalent non-wrapped model. + + The parameter groups in the "param_groups" part and the values in the + "state" part should be the same, but the parameter keys may be + different (e.g. the full optimizer state dict uses parameter names + while the non-wrapped equivalent uses parameter IDs). 
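+        Losses are also compared across iterations to rule out model drift as a
+        source of error. A minimal sketch of the call under test, mirroring the
+        test body below (``fsdp_model`` and ``optim`` are placeholder names)::
+
+            full_osd = FSDP.full_optim_state_dict(fsdp_model, optim, optim_input)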
+ """ + NUM_ITERS = 3 + model1, optim1, optim_input = self._init_nested_model( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + losses1 = self._step_model(model1, optim1, num_iters=NUM_ITERS) + full_osd = FSDP.full_optim_state_dict( + model1, optim1, optim_input, rank0_only=rank0_only, + ) + # Non-target ranks get an empty state dict + if rank0_only and self.rank != 0: + self.assertEqual(len(full_osd), 0) + return + model2, optim2, _ = self._init_nested_model( + wrap=False, use_multiple_param_groups=use_multiple_param_groups, + ) + losses2 = self._step_model(model2, optim2, num_iters=NUM_ITERS) + ref_osd = optim2.state_dict() + # Check the losses to eliminate model drift as a source of error + for i, (l1, l2) in enumerate(zip(losses1, losses2)): + assert l1 == l2, f"Losses differ on iter {i}: {l1:.5f} {l2:.5f}" + # Do not check the parameter keys since the full optimizer state dict + # uses parameter names, while the non-wrapped equivalent uses parameter + # IDs + check_same_param_keys = False + self._check_same_param_groups( + full_osd, ref_osd, check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + full_osd, ref_osd, check_same_param_keys=check_same_param_keys, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + @parametrize("use_multiple_param_groups", [False, True]) + @parametrize("wrap_alt", [False, True]) + @parametrize("halve_world_size", [False, True]) + def test_shard_full_optim_state_dict_nested( + self, + use_multiple_param_groups: bool, + wrap_alt: bool, + halve_world_size: bool, + ): + """Tests :meth:`shard_full_optim_state_dict` for a non-FSDP-root model + with nested FSDP instances.""" + self._test_shard_full_optim_state( + model_class="nested", + use_multiple_param_groups=use_multiple_param_groups, + halve_world_size=halve_world_size, + osd_comm_method=_OSDCommMethod.BROADCAST_OBJECT_LIST, + wrap_alt=wrap_alt, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + def test_shard_full_optim_state_dict_transformer(self) -> None: + """Tests :meth:`shard_full_optim_state_dict` for an FSDP-root + transformer model with shared parameters.""" + self._test_shard_full_optim_state( + model_class="transformer", use_multiple_param_groups=False, + halve_world_size=True, + osd_comm_method=_OSDCommMethod.BROADCAST_OBJECT_LIST, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + @parametrize("use_multiple_param_groups", [False, True]) + @parametrize("wrap_alt", [False, True]) + @parametrize("halve_world_size", [False, True]) + def test_scatter_full_optim_state_dict_nested( + self, + use_multiple_param_groups: bool, + wrap_alt: bool, + halve_world_size: bool, + ): + """Tests :meth:`scatter_full_optim_state_dict` for a non-FSDP-root + model with nested FSDP instances.""" + self._test_shard_full_optim_state( + model_class="nested", + use_multiple_param_groups=use_multiple_param_groups, + halve_world_size=halve_world_size, + osd_comm_method=_OSDCommMethod.SCATTER_FULL_OSD, + wrap_alt=wrap_alt, + ) + + # Require 4 GPUs since we test halving the world size + @skip_if_lt_x_gpu(4) + def test_scatter_full_optim_state_dict_transformer(self) -> None: + """Tests :meth:`scatter_full_optim_state_dict` for an FSDP-root + transformer model with shared parameters.""" + self._test_shard_full_optim_state( + model_class="transformer", use_multiple_param_groups=False, + halve_world_size=True, + osd_comm_method=_OSDCommMethod.SCATTER_FULL_OSD, + ) + + def 
_test_shard_full_optim_state( + self, + model_class: str, + use_multiple_param_groups: bool, + halve_world_size: bool, + osd_comm_method: _OSDCommMethod, + **new_model_kwargs, + ): + """ + (1) Runs a model with full world size for K iterations to generate a + full optimizer state dict; + (2) initializes a model with halved world size and possibly different + FSDP wrapping scheme (based on ``new_model_kwargs``); + (3) shards the full optimizer state dict from (1) according to the + halved-world-size model; + (4) runs the halved-world-size model for K iterations; and + (5) checks that the sharded optimizer state dict from (3) matches the + halved-world-size model's local optimizer state dict, meaning that the + former could have equivalently been loaded into the local optimizer. + """ + NUM_ITERS = 3 + initializer = self._init_nested_model if model_class == "nested" \ + else self._init_transformer_model if model_class == "transformer" \ + else None + assert initializer is not None, f"Unsupported model: {model_class}" + # First, run a wrapped model with full world size for a few iterations + model1, optim1, optim_input1 = initializer( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + full_osd1 = FSDP.full_optim_state_dict(model1, optim1, optim_input1) + if halve_world_size: + # Create a new process group with halved world size + new_group_ranks = [r for r in range(self.world_size) if r % 2 == 0] + new_group = dist.new_group(ranks=new_group_ranks) + if self.rank not in new_group_ranks: + return + else: + # Continue using the same group and hence world size + new_group = dist.distributed_c10d._get_default_group() + # Second, run a wrapped model with (possibly) halved world size + model2, optim2, optim_input2 = initializer( + wrap=True, group=new_group, + use_multiple_param_groups=use_multiple_param_groups, + **new_model_kwargs, # specify `wrap_alt` to change wrapping + ) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + full_osd2 = FSDP.full_optim_state_dict(model2, optim2, optim_input2) + # Compute two sharded optim state dicts: (1) for the first model + # according to the second model and (2) for the second model according + # to the second model + if osd_comm_method == _OSDCommMethod.BROADCAST_OBJECT_LIST: + full_osd1 = self._broadcast_full_osd(full_osd1, group=new_group) + sharded_osd1 = FSDP.shard_full_optim_state_dict( + full_osd1, model2, optim_input2, + ) + full_osd2 = self._broadcast_full_osd(full_osd2, group=new_group) + sharded_osd2 = FSDP.shard_full_optim_state_dict( + full_osd2, model2, optim_input2, + ) + elif osd_comm_method == _OSDCommMethod.SCATTER_FULL_OSD: + sharded_osd1 = FSDP.scatter_full_optim_state_dict( + full_osd1 if self.rank == 0 else None, model2, optim_input2, + group=new_group, + ) + sharded_osd2 = FSDP.scatter_full_optim_state_dict( + full_osd2 if self.rank == 0 else None, model2, optim_input2, + group=new_group, + ) + self._check_state_device(sharded_osd1, on_gpu=True) + self._check_state_device(sharded_osd2, on_gpu=True) + # As a sanity check, check that sharding the second model's full + # optimizer state dict according to itself is equivalent to its local + # optimizer's state dict + local_osd2 = optim2.state_dict() + check_same_param_keys = True # should all have matching parameter IDs + self._check_same_param_groups( + sharded_osd2, local_osd2, + check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + sharded_osd2, local_osd2, + 
check_same_param_keys=check_same_param_keys, + ) + # Check that sharding the first model's full optimizer state dict + # according to the second model is equivalent to the second model's + # local optimizer state dict + self._check_same_param_groups( + sharded_osd1, local_osd2, + check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + sharded_osd1, local_osd2, + check_same_param_keys=check_same_param_keys, + ) + # As a sanity check, check that we can load and run a few iterations + optim2.load_state_dict(sharded_osd1) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + + @skip_if_lt_x_gpu(2) + @parametrize("add_to_fsdp_module", [False, True]) + def test_shard_full_optim_state_dict_unmanaged_params( + self, + add_to_fsdp_module: bool, + ): + """ + Tests :meth:`shard_full_optim_state_dict` when there are unmanaged + parameters. + - If ``add_to_fsdp_module=True``, then the unmanaged parameters are + added to a module to be wrapped with FSDP, in which case there should + be an error since we require that all unflattened parameter + comprising a flattened parameter have the same scalar state (e.g. + Adam "step") but the added parameter is missing its entry. + - If ``add_to_fsdp_module=False``, then the unmanaged parameters are + added to a module not to be wrapped with FSDP, in which case there + should be no error (emulating model parallel use cases where some + parameters may be managed externally to FSDP). + We do not separately test unmanaged parameters for + :meth:`scatter_full_optim_state_dict` to save CI cost since it calls + into the same subroutine :meth:`_flatten_full_optim_state_dict`. + """ + NUM_ITERS = 1 + # Create a normal wrapped model + model, optim, optim_input = self._init_nested_model(wrap=True) + self._step_model(model, optim, num_iters=NUM_ITERS) + full_osd = FSDP.full_optim_state_dict( + model, optim, optim_input, rank0_only=False, + ) # save on all ranks to avoid having to broadcast from rank 0 + # Create a new model with the same structure but additional unmanaged + # parameters, representing the model for which we want to load + device = torch.device("cuda") + model = NestedModel().to(device) + model, unmanaged_params = NestedModel.wrap_with_unmanaged_params( + model, add_to_fsdp_module, + ) + optim_input = list(model.parameters()) + if add_to_fsdp_module: + # If we add the unmanaged parameters to a module wrapped with FSDP, + # then the flattened parameter will be comprised of some + # unflattened parameters with zero-dimensional tensor state (i.e. + # Adam "step") and others without (i.e. 
the unmanaged parameters), + # which triggers an error that we have to ensure correctness + error_prefix = "^(All unflattened parameters comprising a " \ + "single flattened parameter must have scalar state with the " \ + "same value and dtype)" + with self.assertRaisesRegex(ValueError, error_prefix): + FSDP.shard_full_optim_state_dict( + full_osd, model, optim_input, + ) + else: + # If we add the unmanaged parameters to a module not wrapped with + # FSDP, then we simply ignore them without erroring to enable + # model parallelism use cases, where some parameters are managed + # externally to FSDP + sharded_osd = FSDP.shard_full_optim_state_dict( + full_osd, model, optim_input, + ) + # Add entries for the unmanaged parameters to be able to load + for unmanaged_param in unmanaged_params: + NestedModel.add_unmanaged_param_entry( + sharded_osd, unmanaged_param, NUM_ITERS, + ) + # Check that we can load the optimizer state dict + optim = torch.optim.Adam(optim_input, lr=1e-3) + optim.load_state_dict(sharded_osd) + + @skip_if_lt_x_gpu(2) + @parametrize("use_multiple_param_groups", [False, True]) + def test_rekey_optim_state_dict_to_ids( + self, + use_multiple_param_groups: bool, + ): + """Tests :meth:`rekey_optim_state_dict` with the new keys being + parameter IDs by checking that a wrapped model (i.e. with FSDP modules) + can rekey its optimizer state dict to match that of an equivalent + non-wrapped model (i.e. without FSDP modules).""" + NUM_ITERS = 3 + # Run a wrapped model for a few iterations + model1, optim1, optim_input1 = self._init_nested_model( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + full_osd = FSDP.full_optim_state_dict(model1, optim1, optim_input1) + # Broadcast instead of `torch.save()`/`torch.load()` so that all ranks + # have the full state dict + full_osd = self._broadcast_full_osd(full_osd) + # Run a non-wrapped model for a few iterations + model2, optim2, optim_input2 = self._init_nested_model( + wrap=False, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + # Re-key the wrapped model's optimizer state dict using parameter IDs + # according to the non-wrapped model + rekeyed_osd = FSDP.rekey_optim_state_dict( + full_osd, OptimStateKeyType.PARAM_ID, model2, optim_input2, + ) + # Check that the re-keyed dict and actual dict are the same + osd = optim2.state_dict() + check_same_param_keys = True + self._check_same_param_groups( + rekeyed_osd, osd, check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + rekeyed_osd, osd, check_same_param_keys=check_same_param_keys, + ) + # As a sanity check, check that we can load and run a few iterations + optim2.load_state_dict(rekeyed_osd) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + + @skip_if_lt_x_gpu(2) + @parametrize("use_multiple_param_groups", [False]) + def test_rekey_optim_state_dict_to_names( + self, + use_multiple_param_groups: bool, + ): + """Tests :meth:`rekey_optim_state_dict` with the new keys being + parameter names by checking that a non-wrapped model (i.e. without FSDP + modules) can rekey its optimizer state dict to match the expected + output of :meth:`full_optim_state_dict`, hence be sharded using + :meth:`shard_full_optim_state_dict`, and finally match the per-rank + optimizer state dict of a wrapped model (i.e. 
with FSDP modules).""" + NUM_ITERS = 3 + # Run a wrapped model for a few iterations + model1, optim1, optim_input1 = self._init_nested_model( + wrap=True, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + # Run a non-wrapped model for a few iterations + model2, optim2, optim_input2 = self._init_nested_model( + wrap=False, use_multiple_param_groups=use_multiple_param_groups, + ) + self._step_model(model2, optim2, num_iters=NUM_ITERS) + # Re-key the non-wrapped model's optimizer state dict using parameter + # names (still according to itself) + osd2 = optim2.state_dict() + rekeyed_osd = FSDP.rekey_optim_state_dict( + osd2, OptimStateKeyType.PARAM_NAME, model2, optim_input2, + ) + # Shard the non-wrapped model's re-keyed optimizer state dict, which + # maps back to (flattened) parameter IDs + sharded_osd = FSDP.shard_full_optim_state_dict( + rekeyed_osd, model1, optim_input1, + ) + # Check that this sharded optimizer state dict matches the wrapped + # model's per-rank optimizer state dict + osd1 = optim1.state_dict() + check_same_param_keys = True + self._check_same_param_groups( + sharded_osd, osd1, check_same_param_keys=check_same_param_keys, + ) + self._check_same_state( + sharded_osd, osd1, check_same_param_keys=check_same_param_keys, + ) + # As a sanity check, check that we can load and run a few iterations + optim1.load_state_dict(sharded_osd) + self._step_model(model1, optim1, num_iters=NUM_ITERS) + + +instantiate_parametrized_tests(TestFSDPOptimState) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py index 4d98fbfa8e2c..82648ea457a8 100644 --- a/test/distributed/fsdp/test_fsdp_pure_fp16.py +++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py @@ -5,13 +5,13 @@ import torch from torch import distributed as dist from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, CPUOffload -from torch.nn import Linear, Module from torch.nn.parallel import DistributedDataParallel from torch.optim import SGD from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( FSDPTest, get_full_params, + DeterministicModel, ) from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, @@ -33,21 +33,6 @@ sys.exit(0) -class Model(Module): - def __init__(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)): - super().__init__() - # keep everything deterministic for model initialization - torch.manual_seed(0) - self.inner = Linear(2, 2).cuda() - if wrap_fsdp: - self.inner = FSDP(self.inner, cpu_offload=cpu_offload) - self.outer = Linear(2, 2).cuda() - - def forward(self, x): - y = self.inner(x) - return self.outer(y) - - # Test pure fp16 training, also testing the case when the parameter's data type is # changed after FSDP wrapping and before training loop starts. 
# Only run one step for comparision, as usually grad scaler is needed to avoid NaN value @@ -57,7 +42,7 @@ def _dist_train(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)): # keep everything deterministic for input data torch.manual_seed(0) - model = Model(wrap_fsdp, cpu_offload) + model = DeterministicModel(wrap_fsdp, cpu_offload) if wrap_fsdp: model = FSDP(model, cpu_offload=cpu_offload) else: @@ -74,7 +59,9 @@ def _dist_train(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)): optim.zero_grad() if wrap_fsdp: - get_full_params(model) + full_params = get_full_params(model) + torch.cuda.synchronize() + return full_params return list(model.parameters()) diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py new file mode 100644 index 000000000000..44b8815a9a4b --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py @@ -0,0 +1,159 @@ +# Owner(s): ["oncall: distributed"] + +import functools +import itertools +import sys +import torch +import unittest + +from torch import distributed as dist +from torch.cuda.amp.common import amp_definitely_not_available +from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy +from torch.distributed.fsdp import MixedPrecision, CPUOffload +from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +from torch.testing._internal.common_fsdp import DummyProcessGroup, subtest_name, FSDPInitMode, NestedWrappedModule, FSDPTest +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_utils import ( + TestCase, run_tests, + instantiate_parametrized_tests, + parametrize, + TEST_WITH_DEV_DBG_ASAN, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +params = "cpu_offload,sharding_strategy,mixed_precision" +cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] +sharding_strategy_config = [ShardingStrategy.SHARD_GRAD_OP, None] +mixed_precision = ["enable_mixed_precision", None] + +configs = list(itertools.product(cpu_offload_config, + sharding_strategy_config, + mixed_precision)) +test_name_mapping = { + str(CPUOffload(offload_params=True)): "offload_true", + str(CPUOffload(offload_params=False)): "offload_false", + str(ShardingStrategy.SHARD_GRAD_OP): "shard_grad_op", + "enable_mixed_precision": "mixed_precision" +} + +subtest_name = functools.partial(subtest_name, test_name_mapping) + + +class TestShardGradScaler(TestCase): + @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + def test_grad_scaling(self): + pg = DummyProcessGroup(0, 1) + scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) + t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cpu") + t1 = torch.full((1,), 8.0, dtype=torch.float32, device="cpu") + outputs = [t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), t1.clone()]] + outputs = scaler.scale(outputs) + self.assertTrue(outputs[0] == 16.0 and outputs[1][0] == 8.0 and outputs[1][1] == 16.0) + self.assertTrue(outputs[2][0] == 8.0 and outputs[2][1] == 16.0) + self.assertTrue(scaler._scale.device == t1.device) + + @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + def 
test_scaling_unscaling_sparse(self): + pg = DummyProcessGroup(0, 1) + scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) + inv_scale = torch.full((1,), 0.5, dtype=torch.float, device="cpu") + found_inf = torch.full((1,), 0, dtype=torch.float, device="cpu") + + i = torch.tensor([[0, 1, 1], + [2, 0, 2]], device="cpu", dtype=torch.int64) + v = torch.tensor([16.0, 32.0, 64.0], dtype=torch.float, device="cpu") + s = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float) + + # unscale sparse tensors + s1 = s.clone() + s1.grad = s.clone() + opt = torch.optim.SGD([s1], lr=1.0) + found_inf.zero_() + found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] + self.assertEqual(found_inf, 0.0) + self.assertEqual(s1.grad.to_dense(), (s / 2).to_dense()) + + # unscale sparse tensor: inf + v = torch.tensor([16.0, 32.0, float('inf')], dtype=torch.float, device="cpu") + s1.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float) + found_inf.zero_() + found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] + self.assertEqual(found_inf, 1.0) + + # unscale sparse tensor: overflow (marked as inf) + i = torch.tensor([[1, 1, 1], + [0, 0, 2]], device="cpu", dtype=torch.int64) + # coalescing sparse tensor here will cause the value to be Inf + v = torch.tensor([2**15, 2**15, 1.0], dtype=torch.float16, device="cpu") + s1 = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float16) + s1.grad = s1.clone() + found_inf.zero_() + found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device] + self.assertEqual(found_inf, 1.0) + + @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found") + def test_inf_gradients_skip_optim_step(self): + pg = DummyProcessGroup(0, 1) + scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True) + loss = torch.full((1,), 4.0, dtype=torch.float32, device="cpu") + t0 = torch.tensor([float('inf')], dtype=torch.float32, device="cpu") + t0.grad = t0.clone() + opt = torch.optim.SGD([t0], lr=1.0) + scaler.scale(loss) + ret_val = scaler.step(opt) + self.assertTrue(ret_val is None) + + +class TestShardedGradScalerParityWithDDP(FSDPTest): + def _get_init_modes_for_test(self, cpu_offload): + modes = [ + FSDPInitMode.CUDA_AFTER, + FSDPInitMode.CUDA_BEFORE + ] + # Note that FSDPInitMode.CUDA_NEVER works currently only with CPU + # offload as we explicitly bring the param back to CUDA device. In + # general, it will not work since we try to all_gather p.data which is + # on CPU but NCCL only supports GPU. 
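+        # Hence CUDA_NEVER is only added to the list when parameters are
+        # CPU-offloaded.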
+ if cpu_offload.offload_params: + modes.append(FSDPInitMode.CUDA_NEVER) + + return modes + + @skip_if_lt_x_gpu(2) + @parametrize(params, configs, subtest_name) + def test_scaler_enabled(self, cpu_offload, sharding_strategy, mixed_precision): + init_modes = self._get_init_modes_for_test(cpu_offload) + mp = MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) if mixed_precision else None + for fsdp_init_mode in init_modes: + self._test_identical_outputs( + NestedWrappedModule, + fsdp_init_mode=fsdp_init_mode, + cpu_offload=cpu_offload, + sharding_strategy=sharding_strategy, + mixed_precision=mp, + enable_sharded_grad_scaler=True, + ) + + +instantiate_parametrized_tests(TestShardGradScaler) +instantiate_parametrized_tests(TestShardedGradScalerParityWithDDP) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py new file mode 100644 index 000000000000..6d8b9959efb5 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_state_dict.py @@ -0,0 +1,734 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from contextlib import suppress +from copy import deepcopy +from functools import partial +from typing import Any, Dict + +import torch +import torch.nn as nn +from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer +from torch import distributed as dist +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper +from torch.distributed.fsdp import ( + FullyShardedDataParallel as FSDP, + StateDictType, + FullStateDictConfig, + LocalStateDictConfig, + CPUOffload, + MixedPrecision, +) +from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel +from torch.distributed.fsdp.shard_utils import _gather_state_dict +from torch.distributed.fsdp.wrap import enable_wrap, wrap, transformer_auto_wrap_policy +from torch.nn import Linear, Module +from torch.nn.parallel import DistributedDataParallel +from torch.optim import SGD +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + get_full_params, + _get_full_detached_param, + _get_state_dict, + SkipModel, + _zero_model, + TransformerWithSharedParams, + _validate, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +INNER_SHAPE = [4, 4] +OUTER_SHAPE = [4, 5] +BUFFER_SHAPE = [5, 5] + +NON_ROOT_FSDP_PREFIX = 'non_fsdp_lin' + +_UNFLATTENED_STATE_DICT_IMPLS = ["state_dict", "sharded_state_dict"] +_FLATTENED_STATE_DICT_IMPLS = ["local_state_dict"] +_SUPPORTED_STATE_DICT_IMPLS = ( + _UNFLATTENED_STATE_DICT_IMPLS + _FLATTENED_STATE_DICT_IMPLS +) + +STATE_DICT_MAPPING = { + "state_dict": StateDictType.FULL_STATE_DICT, + "local_state_dict": StateDictType.LOCAL_STATE_DICT, + "sharded_state_dict": StateDictType.SHARDED_STATE_DICT, +} + + +class Model(Module): + def __init__(self, wrap_fsdp, register_buffers=False): + super().__init__() + self.inner = Linear(*INNER_SHAPE) + if register_buffers: + self.inner.register_buffer("buffer", torch.randn(BUFFER_SHAPE)) + if wrap_fsdp: + self.inner = FSDP(self.inner) + 
self.outer = Linear(*OUTER_SHAPE) + if register_buffers: + self.outer.register_buffer("buffer", torch.randn(BUFFER_SHAPE)) + + def forward(self, x): + # Forward twice. + i = self.inner(x) + j = self.inner(x) + return self.outer(i + j) + + +class TestFSDPStateDict(FSDPTest): + @property + def world_size(self): + return 2 + + def _broadcast_state_dict(self, state_dict): + olist = [state_dict if self.rank == 0 else None] + dist.broadcast_object_list(olist) + return olist[0] + + def _compare_models(self, model, model_new, assert_fn, check_fp16=False): + with FullyShardedDataParallel.summon_full_params(model): + with FullyShardedDataParallel.summon_full_params(model_new): + params = list(model.parameters()) + params_new = list(model_new.parameters()) + assert_fn(params, params_new) + if check_fp16: + for tensor in model_new.parameters(): + self.assertEqual(tensor.dtype, torch.float16) + + def _get_simple_nested_model(self, *fsdp_args, wrap=True, checkpoint_wrap=False, **fsdp_kwargs): + if wrap: + lin1 = nn.Linear(10, 10, bias=False).cuda() + lin2 = nn.Linear(10, 10, bias=False).cuda() + if checkpoint_wrap: + lin1 = checkpoint_wrapper(lin1) + lin2 = checkpoint_wrapper(lin2) + seq = nn.Sequential(FSDP(lin1, *fsdp_args, **fsdp_kwargs), lin2) + if checkpoint_wrap: + seq = checkpoint_wrapper(seq) + model = FSDP(seq, *fsdp_args, **fsdp_kwargs) + else: + model = nn.Sequential( + nn.Linear(10, 10, bias=False).cuda(), nn.Linear(10, 10, bias=False).cuda() + ) + return model + + def _get_simple_model(self, *fsdp_args, checkpoint_wrap=False, **fsdp_kwargs): + lin = nn.Linear(10, 10, bias=False).cuda() + if checkpoint_wrap: + lin = checkpoint_wrapper(lin) + model = FSDP(lin, *fsdp_args, **fsdp_kwargs) + return model + + def _get_non_fsdp_root_module(self, *fsdp_args, wrap=True, **fsdp_kwargs): + class FSDPContainer(nn.Module): + def __init__(self, fsdp_1, fsdp_2): + super().__init__() + self.non_fsdp_lin = nn.Linear(10, 10, bias=False).cuda() + self.fsdp_1 = fsdp_1 + self.fsdp_2 = fsdp_2 + + def forward(self, x): + x = self.non_fsdp_lin(x) + x = self.fsdp_1(x) + x = self.fsdp_2(x) + return x + + return FSDPContainer( + self._get_simple_nested_model(*fsdp_args, wrap=wrap, **fsdp_kwargs), + self._get_simple_nested_model(*fsdp_args, wrap=wrap, **fsdp_kwargs), + ) + + def _get_state_dict_mgr(self, model, state_dict_type, state_dict_rank0_and_offload): + _state_dict_type = STATE_DICT_MAPPING[state_dict_type] + if state_dict_type == "state_dict": + config = FullStateDictConfig( + rank0_only=state_dict_rank0_and_offload, + offload_to_cpu=state_dict_rank0_and_offload, + ) + else: + config = None + return FSDP.state_dict_type(model, _state_dict_type, config) + + def _validate_state_dict_contents( + self, model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=None + ): + if state_dict_rank0_and_offload: + if self.rank == 0: + self.assertNotEqual(fsdp_state_dict, {}) + for key, tensor in fsdp_state_dict.items(): + if ignore_keys and key in ignore_keys: + continue + self.assertEqual( + tensor.device, + torch.device("cpu"), + f"{key} is unexpectedly on device {tensor.device}", + ) + else: + # For non-FSDP roots, the non FSDP portion can still have parameters on rank 0, + # so bypass the check for now. + if isinstance(model, FSDP): + self.assertEqual(fsdp_state_dict, {}) + + @skip_if_lt_x_gpu(2) + def test_load_activation_checkpointed_module(self): + # TODO: move this tests to checkpoint_wrapper tests once there is a dedicated + # test suite for them: https://github.com/pytorch/pytorch/issues/77478. 
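+        # Round-trip a state_dict between a checkpoint-wrapped linear and a
+        # plain one in both directions; this only works if checkpoint_wrapper's
+        # state_dict pre/post hooks handle its wrapper prefix correctly.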
+ lin = nn.Linear(10, 10, bias=False).cuda() + lin = checkpoint_wrapper(lin) + state_dict = deepcopy(lin.state_dict()) + # Load into non-checkpoint wrapped linear module + lin_new = nn.Linear(10, 10, bias=False).cuda() + lin_new.load_state_dict(state_dict) + for p1, p2 in zip(lin.parameters(), lin_new.parameters()): + self.assertEqual(p1, p2) + + # Load non-checkpoint wrapped module into checkpoint wrapped one + # Make params different + for p in lin_new.parameters(): + with torch.no_grad(): + p.add_(0.5) + + state_dict = deepcopy(lin_new.state_dict()) + # Verify checkpoint wrapped linear can load unwrapped linear + lin.load_state_dict(state_dict) + print(type(lin)) + for p1, p2 in zip(lin.parameters(), lin_new.parameters()): + self.assertEqual(p1, p2) + + @skip_if_lt_x_gpu(2) + @parametrize("checkpoint_wrap", ["first", "second", "both"]) + def test_fsdp_state_dict_with_activation_checkpoint(self, checkpoint_wrap): + for model_call in [ + partial(self._get_simple_model), + partial(self._get_simple_nested_model) + ]: + model = model_call(checkpoint_wrap=(checkpoint_wrap in ["first", "both"])) + state_dict = _get_state_dict(model, False, False) + # Possibly wrap new model in activation checkpoint wrapper to test save/ + # load with this wrapper + model_new = model_call(checkpoint_wrap=(checkpoint_wrap in ["second", "both"])) + _zero_model(model_new) + self._compare_models(model, model_new, self.assertNotEqual) + # Would fail if checkpoint_wrapper did not correctly implement state_dict pre/post hooks + model_new.load_state_dict(state_dict) + self._compare_models(model, model_new, self.assertEqual) + + @skip_if_lt_x_gpu(2) + def test_state_dict_rank0_offload_save_load_flow(self): + # Test taking checkpoint on rank 0 only, and reload + # without redundant CPU memories. + model = TransformerWithSharedParams(group=dist.distributed_c10d._get_default_group()) + my_auto_wrap_policy = partial( + transformer_auto_wrap_policy, + transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer} + ) + model = FSDP(model, auto_wrap_policy=my_auto_wrap_policy) + ctx = self._get_state_dict_mgr( + model, "state_dict", True + ) + with ctx: + state_dict = deepcopy(_get_state_dict(model)) + + # All ranks initialize non-FSDP model + grp = dist.distributed_c10d._get_default_group() + model_new = TransformerWithSharedParams(group=grp) + for p in model_new.parameters(): + with torch.no_grad(): + p.zero_() + # Only rank 0 loads the checkpoint + if self.rank == 0: + model_new.load_state_dict(state_dict) + + # TransformerWithSharedParams has a buffer of zeros, so can't pass in + # self.assertNotEqual since the buffers would be equal. So just checking that + # there is some difference in the model across ranks before state_dict is + # broadcasted. + with self.assertRaisesRegex(AssertionError, "Tensor-likes are not close"): + _validate(model_new, process_group=grp, assert_fn=self.assertEqual) + # FSDP with sync_module_states=True broadcasts the checkpointed states. 
+ model_new = FSDP( + model_new, + device_id=torch.cuda.current_device(), + auto_wrap_policy=my_auto_wrap_policy, + sync_module_states=True + ) + # After wrapping with FSDP models are equal across ranks, and have loaded the checkpoint + with FSDP.summon_full_params(model_new): + _validate(model_new, process_group=grp, assert_fn=self.assertEqual) + + with FullyShardedDataParallel.summon_full_params(model): + with FullyShardedDataParallel.summon_full_params(model_new): + params = list(model.parameters()) + params_new = list(model_new.parameters()) + self.assertEqual(params, params_new) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + @parametrize( + "cpu_offload", + [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], + ) + @parametrize("fp16", [True, False]) + @parametrize("state_dict_rank0_and_offload", [True, False]) + def test_basic_save_and_load_state_dict( + self, state_dict_type, cpu_offload, fp16, state_dict_rank0_and_offload + ): + """ + Tests that we can save a state_dict and load it into a blank model + with various configs such as fp16 and cpu offload and parameters + match as expected. + """ + if state_dict_rank0_and_offload and state_dict_type != "state_dict": + return + for model_call in [ + partial(self._get_non_fsdp_root_module, cpu_offload=cpu_offload), + partial(self._get_simple_nested_model, cpu_offload=cpu_offload), + partial(self._get_simple_model, cpu_offload=cpu_offload), + ]: + model = model_call() + + ctx = self._get_state_dict_mgr( + model, state_dict_type, state_dict_rank0_and_offload + ) + with ctx: + fsdp_state_dict = _get_state_dict( + model, cpu_offload.offload_params, fp16 + ) + + # if self.rank == 0: + # print(f"FSDP keys {fsdp_state_dict.keys()}") + + ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k] + + self._validate_state_dict_contents( + model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys, + ) + if fp16: + # Verify fp16 is the type + for tensor in fsdp_state_dict.values(): + self.assertEqual(tensor.dtype, torch.float16) + + model_new = model_call() + if not cpu_offload.offload_params: + model_new = model_new.cuda() + if fp16: + model_new.half() + + # zero the model to ensure parameters are different. + _zero_model(model_new) + self._compare_models(model, model_new, self.assertNotEqual) + + # Verify parameters are the same in the new model. + if state_dict_rank0_and_offload: + # Broadcast the state dict and move it back to GPU in + # preparation for loading. + if not isinstance(model, FSDP): + # Move everything to CPU to avoid running into + # https://github.com/pytorch/pytorch/issues/77113, some params + # will still be on GPU for non FSDP root modules. 
+ for k in fsdp_state_dict.keys(): + fsdp_state_dict[k] = fsdp_state_dict[k].cpu() + fsdp_state_dict = self._broadcast_state_dict(fsdp_state_dict) + for key in fsdp_state_dict.keys(): + fsdp_state_dict[key] = fsdp_state_dict[key].cuda() + with FSDP.state_dict_type(model_new, STATE_DICT_MAPPING[state_dict_type]): + model_new.load_state_dict(fsdp_state_dict) + + self._compare_models(model, model_new, self.assertEqual, check_fp16=fp16) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + @parametrize("mixed_precision", [True, False]) + @parametrize("state_dict_rank0_and_offload", [True, False]) + def test_save_and_load_after_forward_state_dict( + self, state_dict_type, mixed_precision, state_dict_rank0_and_offload + ): + """ + Test that saving after some training results in params being updated as + expected. + """ + if state_dict_rank0_and_offload and state_dict_type != "state_dict": + return + torch.cuda.set_device(self.rank) + mixed_precision = ( + MixedPrecision( + param_dtype=torch.float16, + reduce_dtype=torch.float16, + buffer_dtype=torch.float16, + ) + if mixed_precision + else None + ) + model = self._get_simple_nested_model(mixed_precision=mixed_precision) + optim = torch.optim.SGD(model.parameters(), lr=0.1) + initial_params = _get_full_detached_param(model) + for _ in range(6): + inp = torch.randn(1, 10, device=torch.cuda.current_device()) + output = model(*inp) + loss = output.sum() + expected_dtype = torch.float32 if mixed_precision is None else torch.float16 + self.assertEqual(expected_dtype, loss.dtype) + loss.backward() + optim.step() + + trained_params = _get_full_detached_param(model) + # Ensure some training occured + self.assertNotEqual(initial_params, trained_params) + # Save a copy of the state_dict + fsd_mgr = self._get_state_dict_mgr( + model, state_dict_type, state_dict_rank0_and_offload + ) + with fsd_mgr: + state_dict = model.state_dict() + if state_dict_type == "state_dict": + state_dict = {k: v.clone() for k, v in state_dict.items()} + else: + for sharded_tensor in state_dict.values(): + shard = sharded_tensor._local_shards[0] + shard.tensor = shard.tensor.clone().detach_() + self._validate_state_dict_contents(model, state_dict, state_dict_rank0_and_offload) + _zero_model(model) + + # Ensure checkpointed params have the full param dtype + for tensor in state_dict.values(): + self.assertEqual(tensor.dtype, torch.float32) + + # Load state_dict into zeroed model + if state_dict_rank0_and_offload: + # Broadcast the state dict and move it back to GPU in + # preparation for loading. 
+ state_dict = self._broadcast_state_dict(state_dict) + for key in state_dict.keys(): + state_dict[key] = state_dict[key].cuda() + + with FSDP.state_dict_type(model, STATE_DICT_MAPPING[state_dict_type]): + model.load_state_dict(state_dict) + loaded_params = _get_full_detached_param(model) + self.assertEqual(loaded_params, trained_params) + + def _initialize_model( + self, + wrap_fsdp: bool, + wrap_ddp: bool = True, + register_buffers: bool = False, + ): + # keep everything deterministic for input data + torch.manual_seed(0) + + model = Model(wrap_fsdp, register_buffers=register_buffers).cuda() + if wrap_fsdp: + model = FSDP(model) + elif wrap_ddp: + model = DistributedDataParallel(model, device_ids=[self.rank]) + return model + + @staticmethod + def _state_dict(model: Module, state_dict_type: str): + try: + enum_val = STATE_DICT_MAPPING[state_dict_type] + except KeyError: + raise ValueError(f"No state_dict type for {state_dict_type}") + + with FSDP.state_dict_type(model, enum_val): + return model.state_dict() + + @staticmethod + def _load_state_dict( + model: Module, state_dict_type: str, state_dict: Dict[str, Any] + ): + try: + enum_val = STATE_DICT_MAPPING[state_dict_type] + except KeyError: + raise ValueError(f"No state_dict for {state_dict_type}") + + with FSDP.state_dict_type(model, enum_val): + return model.load_state_dict(state_dict) + + def _dist_train(self, wrap_fsdp: bool, state_dict_type: str = ""): + # TODO: Move this test to common_fsdp. + model = self._initialize_model(wrap_fsdp) + optim = SGD(model.parameters(), lr=0.1) + + in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda")) + for _ in range(3): + out = model(in_data) + out.sum().backward() + optim.step() + optim.zero_grad() + + if wrap_fsdp: + blank_model = FSDP(Model(True).cuda()) + _zero_model(blank_model) + state_dict = self._state_dict(model, state_dict_type) + self._load_state_dict(blank_model, state_dict_type, state_dict) + return get_full_params(blank_model) + else: + return list(model.parameters()) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + def test_state_dict_save_load_flow(self, state_dict_type): + fsdp_params = self._dist_train(wrap_fsdp=True, state_dict_type=state_dict_type) + ddp_params = self._dist_train(wrap_fsdp=False) + self.assertEqual(ddp_params, fsdp_params) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + def test_fsdp_state_dict_keys(self, state_dict_type): + state_dict = self._state_dict(self._initialize_model(True), state_dict_type) + if state_dict_type == "local_state_dict": + self.assertEqual(set(["flat_param", "inner.flat_param"]), state_dict.keys()) + elif state_dict_type in ("state_dict", "sharded_state_dict"): + # Keys should match local model. + local_model = self._initialize_model(wrap_fsdp=False, wrap_ddp=False) + local_keys = local_model.state_dict().keys() + self.assertEqual(state_dict.keys(), local_keys) + else: + raise NotImplementedError(f"No test for {state_dict_type}!") + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS) + @parametrize("state_dict_rank0_and_offload", [True, False]) + @parametrize("fsdp_root", [True, False]) + def test_state_dict_load_into_local_module( + self, state_dict_type, state_dict_rank0_and_offload, fsdp_root, + ): + """ + Tests that FSDP's state_dict can be loaded into a local model. 
+ """ + if state_dict_rank0_and_offload and state_dict_type != "state_dict": + return + if not fsdp_root: + model = self._get_non_fsdp_root_module() + else: + model = self._initialize_model(wrap_fsdp=True, register_buffers=True) + optim = SGD(model.parameters(), lr=0.1) + if not fsdp_root: + in_data = torch.randn(1, 10, requires_grad=True, device=torch.device("cuda")) + else: + in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda")) + for _ in range(3): + out = model(in_data) + out.sum().backward() + optim.step() + optim.zero_grad() + + with FullyShardedDataParallel.summon_full_params(model): + fsdp_params = deepcopy(list(model.parameters())) + + # get FSDP state_dict. Note that by default we return full_state_dict. + sd_mgr = self._get_state_dict_mgr( + model, state_dict_type, state_dict_rank0_and_offload + ) + with sd_mgr: + fsdp_state_dict = model.state_dict() + + ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k] + self._validate_state_dict_contents( + model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys, + ) + # Create zeroed local model + if not fsdp_root: + blank_local_model = self._get_non_fsdp_root_module(wrap=False) + else: + blank_local_model = self._initialize_model( + wrap_fsdp=False, wrap_ddp=False, register_buffers=True + ) + + # Nothing should be FSDP + for mod in blank_local_model.modules(): + self.assertFalse(isinstance(mod, FSDP)) + + for param in blank_local_model.parameters(): + with torch.no_grad(): + param.zero_() + + fsdp_state_dict = _gather_state_dict(fsdp_state_dict) + + # Load fsdp's full state dict into the local and verify params are as + # expected. + if state_dict_rank0_and_offload: + # Broadcast + CUDA state_dict + if not isinstance(model, FSDP): + # Some portions of the model on rank 0 might not be on CPU, + # move everything to CPU to avoid running into + # https://github.com/pytorch/pytorch/issues/77113. + for k, t in fsdp_state_dict.items(): + if t.device != torch.device("cpu"): + fsdp_state_dict[k] = t.cpu() + fsdp_state_dict = self._broadcast_state_dict(fsdp_state_dict) + for key in fsdp_state_dict.keys(): + fsdp_state_dict[key] = fsdp_state_dict[key].cuda() + + # if self.rank == 0: + blank_local_model.load_state_dict(fsdp_state_dict) + local_params = list(blank_local_model.parameters()) + for fsdp_param, local_param in zip(fsdp_params, local_params): + self.assertEqual(fsdp_param, local_param) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS) + @parametrize("double_nest", [True]) + def test_state_dict_skip_module(self, state_dict_type, double_nest): + torch.cuda.set_device(self.rank) + + def _create_module(wrap_fsdp=True): + LINEAR_SKIP = "linear_skip" + ctx = enable_wrap(wrapper_cls=FSDP) if wrap_fsdp else suppress() + with ctx: + module = SkipModel(double_nest=double_nest) + # Full name of linear_skip param tensors in SkipModel, as would be + # stored in checkpoint. 
+ linear_skip_tensor_names = [ + k + for k in dict(module.named_parameters()).keys() + if LINEAR_SKIP in k + ] + # skip SkipModule + linear_skip = getattr(module, LINEAR_SKIP) + delattr(module, LINEAR_SKIP) + # Wrap FSDP + fsdp = wrap(module) + # reattach + setattr(module, LINEAR_SKIP, linear_skip) + return fsdp, linear_skip_tensor_names + + fsdp, linear_skip_tensor_names = _create_module() + # Run a forward pass + inp = torch.randn((1, 10), device=torch.cuda.current_device()) + loss = fsdp(inp) + loss.sum().backward() + + with FSDP.state_dict_type(fsdp, STATE_DICT_MAPPING[state_dict_type]): + state_dict = fsdp.state_dict() + if self.rank == 0 and state_dict_type != "local_state_dict": + sd_keys = list(state_dict.keys()) + expected = list(SkipModel(double_nest=False).state_dict().keys()) + self.assertEqual(sorted(sd_keys), sorted(expected)) + # TODO: parameters in linear_skip_tensor_names should not be handled + # by FSDP.state_dict(). Have a check once this is implemented in + # FSDP.state_dict(). + + # Check that it can be loaded into FSDP. + new_fsdp, _ = _create_module() + _zero_model(new_fsdp) + for (p1, p2) in zip(fsdp.parameters(), new_fsdp.parameters()): + self.assertNotEqual(p1, p2) + with FSDP.state_dict_type(new_fsdp, STATE_DICT_MAPPING[state_dict_type]): + if state_dict_type != "local_state_dict": + # FlatParameter has not supported deepcopy yet. + state_dict = deepcopy(state_dict) + new_fsdp.load_state_dict(state_dict) + for (p1, p2) in zip(fsdp.parameters(), new_fsdp.parameters()): + self.assertEqual(p1, p2) + + # Test that the checkpoint can be loaded into a local model. + local, _ = _create_module(wrap_fsdp=False) + for param in local.parameters(): + with torch.no_grad(): + param.zero_() + + with fsdp.summon_full_params(fsdp): + for (p1, p2) in zip(fsdp.parameters(), local.parameters()): + self.assertNotEqual(p1, p2) + + if state_dict_type == "local_state_dict": + return + state_dict = _gather_state_dict(state_dict) + with fsdp.summon_full_params(fsdp): + if self.rank == 0: + local.load_state_dict(state_dict) + for (p1, p2) in zip(fsdp.parameters(), local.parameters()): + self.assertEqual(p1, p2) + + @skip_if_lt_x_gpu(2) + def test_wrong_state_dict_config(self): + model = FSDP(Model(wrap_fsdp=True).cuda()) + with self.assertRaisesRegex(RuntimeError, "Expected state_dict_config of type"): + with model.state_dict_type( + model, StateDictType.FULL_STATE_DICT, LocalStateDictConfig() + ): + pass + + @skip_if_lt_x_gpu(2) + def test_state_dict_with_ignored_modules(self): + # Initialize an FSDP-wrapped model with an ignored module that includes + # both parameters and a buffer + model = Model(wrap_fsdp=True, register_buffers=True).cuda() + ignored_modules = [model.outer] + ignored_tensor_to_tensor_name = { + model.outer.bias: "outer.bias", + model.outer.weight: "outer.weight", + model.outer.buffer: "outer.buffer", + } + buffer_to_buffer_name = { + model.inner.buffer: "inner.buffer", model.outer.buffer: "outer.buffer", + } + fsdp_model = FSDP(model, ignored_modules=ignored_modules) + with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT): + sd1 = fsdp_model.state_dict() + with FSDP.summon_full_params(fsdp_model): + fsdp_params = deepcopy(list(fsdp_model.parameters())) + # Check that the ignored parameters and all buffers are not cloned + for tensor, tensor_name in { + **ignored_tensor_to_tensor_name, + **buffer_to_buffer_name, + }.items(): + self.assertTrue(tensor_name in sd1) + self.assertEqual(tensor.data_ptr(), sd1[tensor_name].data_ptr()) + # Check that the state 
dict can be loaded into a non-wrapped version of + # the model + nonwrapped_model = Model(wrap_fsdp=False, register_buffers=True).cuda() + for param in nonwrapped_model.parameters(): + with torch.no_grad(): + param.zero_() + nonwrapped_model.load_state_dict(sd1) + local_params = list(nonwrapped_model.parameters()) + for fsdp_param, local_param in zip(fsdp_params, local_params): + self.assertEqual(fsdp_param, local_param) + # Check that if we save a state dict again, the ignored parameters and + # buffer still have the same data pointer + with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT): + sd2 = fsdp_model.state_dict() + for tensor, tensor_name in { + **ignored_tensor_to_tensor_name, + **buffer_to_buffer_name, + }.items(): + self.assertTrue(tensor_name in sd1) # check again just in case + self.assertTrue(tensor_name in sd2) + self.assertEqual(tensor.data_ptr(), sd2[tensor_name].data_ptr()) + self.assertEqual(sd1[tensor_name].data_ptr(), sd2[tensor_name].data_ptr()) + + @skip_if_lt_x_gpu(2) + def test_state_dict_type(self): + module = SkipModel(double_nest=True) + with enable_wrap(wrapper_cls=FSDP): + fsdp = wrap(module) + with FSDP.state_dict_type(fsdp, StateDictType.LOCAL_STATE_DICT): + pass + for module in FSDP.fsdp_modules(fsdp): + self.assertEqual(module._state_dict_type, StateDictType.FULL_STATE_DICT) + + +instantiate_parametrized_tests(TestFSDPStateDict) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index e2dee05178a2..dbacacc1450e 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -1,15 +1,23 @@ # Owner(s): ["oncall: distributed"] -import sys +import itertools import math +import sys +from copy import deepcopy import torch import torch.nn as nn from torch import distributed as dist +from torch.distributed.fsdp import CPUOffload, MixedPrecision +from torch.distributed.fsdp import FlatParameter from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import CPUOffload +from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel +from torch.distributed.fsdp.wrap import wrap, enable_wrap from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( + FSDPInitMode, FSDPTest, + NestedWrappedModule, + DeterministicModel, ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, @@ -31,6 +39,62 @@ sys.exit(0) +def _run_test_summon_full_param_writeback( + cls, writeback, modify_outer, *fsdp_args, **fsdp_kwargs +): + with enable_wrap(wrapper_cls=FSDP, *fsdp_args, **fsdp_kwargs): + lin1 = wrap(nn.Linear(5, 5, bias=False).cuda(cls.rank)) + lin2 = nn.Linear(5, 3, bias=False).cuda(cls.rank) + model = wrap(nn.Sequential(lin1, lin2)) + + # set the value + outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) + p = outer_param if modify_outer else inner_param + + with torch.no_grad(): + # This sets the local shard value + p[0] = cls.rank + 2 + + with model.summon_full_params(model, writeback=writeback): + with torch.no_grad(): + p.copy_(torch.zeros_like(p)) + + if writeback or cls.world_size == 1: + # When world_size = 1, FSDP does not shard and parameter is not set to + # a local shard, so write is always 
reflected. + cls.assertEqual(p.cpu()[0], 0) + else: + cls.assertEqual(p.cpu()[0], cls.rank + 2) + + +class TestSummonFullParamsNoShard(FSDPTest): + @property + def world_size(self): + return 1 # does not shard + + @skip_if_lt_x_gpu(2) + @parametrize("writeback", [True, False]) + @parametrize("modify_outer", [True, False]) + @parametrize("mixed_precision", [True, False]) + # TODO: CPUOffload summon + writeback does not + # work when param is not sharded + # (currently when world_size == 1) + def test_summon_full_param_writeback( + self, writeback, modify_outer, mixed_precision + ): + mixed_precision = MixedPrecision() if mixed_precision else None + return _run_test_summon_full_param_writeback( + self, + writeback, + modify_outer=modify_outer, + cpu_offload=CPUOffload(offload_params=False), + mixed_precision=mixed_precision, + ) + + class TestSummonFullParams(FSDPTest): @property def world_size(self): @@ -44,50 +108,34 @@ def get_expected_sharded_size(self, global_size): return int(math.ceil(global_size / self.world_size)) @skip_if_lt_x_gpu(2) - @parametrize( - "writeback", - [True, False] - ) + @parametrize("writeback", [True, False]) @parametrize( "cpu_offload", - [CPUOffload(offload_params=True), CPUOffload(offload_params=False)] - ) - @parametrize( - "modify_outer", - [True, False] + [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], ) - def test_summon_full_param_writeback(self, writeback, cpu_offload, modify_outer): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 3, bias=False) - )).cuda(self.rank) - - # set the value - outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") - p = outer_param if modify_outer else inner_param - - with torch.no_grad(): - # This sets the local shard value - p[0] = self.rank + 2 - - with model._summon_full_params(writeback=writeback): - with torch.no_grad(): - p.copy_(torch.zeros_like(p)) - - if writeback: - self.assertEqual(p.cpu()[0], 0) - else: - self.assertEqual(p.cpu()[0], self.rank + 2) + @parametrize("mixed_precision", [True, False]) + @parametrize("modify_outer", [True, False]) + def test_summon_full_param_writeback( + self, writeback, cpu_offload, mixed_precision, modify_outer + ): + mixed_precision = MixedPrecision() if mixed_precision else None + return _run_test_summon_full_param_writeback( + self, + writeback, + modify_outer, + cpu_offload=cpu_offload, + mixed_precision=mixed_precision, + ) @skip_if_lt_x_gpu(2) - def test_summon_full_param_shard_value(self): - + @parametrize("mixed_precision", [True, False]) + def test_summon_full_param_shard_value(self, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None raw_model = nn.Linear(10, 11) raw_model_size = self.get_model_param_count(raw_model) expected_shard_size = self.get_expected_sharded_size(raw_model_size) - model = FSDP(raw_model.cuda(self.rank)) + model = FSDP(raw_model.cuda(self.rank), mixed_precision=mixed_precision) self.assertEqual(expected_shard_size, self.get_model_param_count(model)) # we're assuming a single flatenned param @@ -95,29 +143,31 @@ def test_summon_full_param_shard_value(self): my_shard = torch.clone(next(model.parameters())) - with model._summon_full_params(): + with model.summon_full_params(model): self.assertEqual(raw_model_size, self.get_model_param_count(model)) - all_shards = next(model.parameters()) + parameters = list(model.parameters()) + all_shards = 
FlatParameter(parameters, requires_grad=False) my_slice = torch.chunk(all_shards, self.world_size)[self.rank] # shards are padded but the full_param tensor is not - a, b = my_shard[0: my_slice.numel()], my_slice - self.assertTrue(torch.equal(my_shard[0: my_slice.numel()].cpu(), my_slice.cpu())) + a, b = my_shard[0 : my_slice.numel()], my_slice + self.assertTrue( + torch.equal(my_shard[0 : my_slice.numel()].cpu(), my_slice.cpu()) + ) @skip_if_lt_x_gpu(2) - @parametrize( - "recurse", - [True, False] - ) - @parametrize( - "summon_outer", - [True, False] - ) - def test_summon_full_param_recursive(self, recurse, summon_outer): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 3, bias=False) - )).cuda(self.rank) + @parametrize("recurse", [True, False]) + @parametrize("summon_outer", [True, False]) + @parametrize("mixed_precision", [True, False]) + def test_summon_full_param_recursive(self, recurse, summon_outer, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None + model = FSDP( + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision), + nn.Linear(5, 3, bias=False), + ), + mixed_precision=mixed_precision, + ).cuda(self.rank) global_inner_numel = self.get_model_param_count(nn.Linear(5, 5, bias=False)) global_outer_numel = self.get_model_param_count(nn.Linear(5, 3, bias=False)) @@ -126,7 +176,9 @@ def test_summon_full_param_recursive(self, recurse, summon_outer): shard_outer_numel = int(math.ceil(global_outer_numel / self.world_size)) outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) self.assertEqual(shard_outer_numel, outer_param.numel()) self.assertEqual(shard_inner_numel, inner_param.numel()) @@ -135,9 +187,11 @@ def test_summon_full_param_recursive(self, recurse, summon_outer): expected_outer_numel = global_outer_numel if summon_outer else shard_outer_numel # inner is summoned if _summon_full_param is called with recursion or on the inner FSDP module - expected_inner_numel = global_inner_numel if recurse or not summon_outer else shard_inner_numel + expected_inner_numel = ( + global_inner_numel if recurse or not summon_outer else shard_inner_numel + ) - with model_to_summon._summon_full_params(recurse=recurse): + with model_to_summon.summon_full_params(model_to_summon, recurse=recurse): self.assertEqual(expected_outer_numel, outer_param.numel()) self.assertEqual(expected_inner_numel, inner_param.numel()) @@ -149,14 +203,15 @@ def __init__(self): self.a = nn.Parameter(torch.zeros(5)) def forward(self, fsdp_module): - with fsdp_module._summon_full_params(): + with fsdp_module.summon_full_params(fsdp_module): pass model = FSDP(MyModule()).cuda(self.rank) - with self.assertRaisesRegex(ValueError, "current state is TrainingState_.FORWARD"): + with self.assertRaisesRegex( + ValueError, "current state is TrainingState_.FORWARD" + ): model(model) - @skip_if_lt_x_gpu(2) def test_cannot_summon_full_params_from_backward(self): model = FSDP(nn.Linear(2, 1)).cuda(self.rank) @@ -164,39 +219,50 @@ def test_cannot_summon_full_params_from_backward(self): output = model(torch.ones(2).cuda(self.rank)) def bad_backwards_hook(tensor): - with model._summon_full_params(): + with model.summon_full_params(model): pass return None self.assertTrue(output.requires_grad) 
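        # The registered hook calls summon_full_params() during backward(), which should
        # raise because FSDP is then in the BACKWARD_PRE training state (asserted below).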
output.register_hook(bad_backwards_hook) - with self.assertRaisesRegex(ValueError, "current state is TrainingState_.BACKWARD_PRE"): + with self.assertRaisesRegex( + ValueError, "current state is TrainingState_.BACKWARD_PRE" + ): output.backward() - @skip_if_lt_x_gpu(2) - def test_summon_full_params_respects_reshard_after_forward(self): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 3, bias=False) - )).cuda(self.rank) + @parametrize("mixed_precision", [True, False]) + def test_summon_full_params_respects_reshard_after_forward(self, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None + model = FSDP( + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision), + nn.Linear(5, 3, bias=False), + ), + mixed_precision=mixed_precision, + ).cuda(self.rank) outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) outer_full_param_size = outer_param.numel() * self.world_size # trigger lazy init model(torch.zeros(5).cuda(self.rank)) - # the root FSDP module keeps all params around - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) - # similarly _summon_full_params should have the same behavior - with model._summon_full_params(): + # similarly summon_full_params should have the same behavior + with model.summon_full_params(model): pass - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) @skip_if_lt_x_gpu(2) @@ -210,7 +276,7 @@ def test_summon_single_param(self): # This sets the local shard value p[0] = self.rank + 2 - with model._summon_full_params(writeback=True): + with model.summon_full_params(model, writeback=True): self.assertEqual(1, p.numel()) with torch.no_grad(): p.copy_(torch.zeros_like(p)) @@ -222,21 +288,97 @@ def test_summon_single_param(self): self.assertEqual(self.rank + 2, p[0]) @skip_if_lt_x_gpu(2) - def test_reshard_outside_forward_backward_iteration(self): - model = FSDP(nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), - nn.Linear(5, 1, bias=False) - )).cuda(self.rank) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + def test_summon_full_params_equivalence(self, rank0_only, offload_to_cpu): + offload = CPUOffload(offload_params=True) + model = FSDP( + DeterministicModel(wrap_fsdp=True, cpu_offload=offload), cpu_offload=offload + ) + local_model = DeterministicModel(wrap_fsdp=False) + + dev = ( + torch.device("cpu") + if offload_to_cpu + else torch.device("cuda", torch.cuda.current_device()) + ) + + params_to_compare = ( + [p.clone() for p in model.parameters()] + if rank0_only and self.rank != 0 + else list(local_model.parameters()) + ) + + with model.summon_full_params( + model, + recurse=True, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): + # Below sleep causes failures without stream synchronization in + # summon_full_params fix. 
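+                # torch.cuda._sleep busy-waits the GPU stream for the given number of cycles,
+                # giving any missing stream synchronization a chance to surface.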
+ torch.cuda._sleep(1000000) + # FSDP param deepcopy() of params has issues + fsdp_params = [p.clone() for p in model.parameters()] + + self.assertEqual(fsdp_params, params_to_compare) + + @skip_if_lt_x_gpu(2) + def test_summon_from_non_fsdp(self): + class FSDPContainer(nn.Module): + def __init__(self, fsdp_1, fsdp_2, fsdp_3): + super().__init__() + self.fsdp_1 = fsdp_1 + self.fsdp_2 = fsdp_2 + self.fsdp_3 = fsdp_3 + + model_fsdp = FSDPContainer( + FSDP(DeterministicModel(wrap_fsdp=True)), + FSDP(DeterministicModel(wrap_fsdp=True)), + DeterministicModel(wrap_fsdp=False), + ) + model_no_fsdp = FSDPContainer( + DeterministicModel(wrap_fsdp=False), + DeterministicModel(wrap_fsdp=False), + DeterministicModel(wrap_fsdp=False), + ) + + params_to_compare = list(model_no_fsdp.parameters()) + with FullyShardedDataParallel.summon_full_params(model_fsdp): + fsdp_params = [p.clone() for p in model_fsdp.parameters()] + + self.assertEqual(params_to_compare, fsdp_params) + + @skip_if_lt_x_gpu(2) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + @parametrize("mixed_precision", [True, False]) + def test_reshard_outside_forward_backward_iteration( + self, rank0_only, offload_to_cpu, mixed_precision + ): + mixed_precision = MixedPrecision() if mixed_precision else None + model = FSDP( + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision), + nn.Linear(5, 1, bias=False), + ), + mixed_precision=mixed_precision, + ).cuda(self.rank) outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter("_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) outer_full_param_size = outer_param.numel() * self.world_size # First lets validate our assumption about resharding output = model(torch.zeros(5).cuda(self.rank)) # the root FSDP module keeps all params around - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) output.backward() @@ -247,31 +389,171 @@ def test_reshard_outside_forward_backward_iteration(self): # now lets repeat it with summon done in between output = model(torch.zeros(5).cuda(self.rank)) - with model._summon_full_params(): + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) + self.assertEqual(0, inner_param._full_param_padded.storage().size()) + with model.summon_full_params( + model, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): pass - self.assertEqual(outer_full_param_size, outer_param._full_param_padded.storage().size()) + self.assertEqual( + outer_full_param_size, outer_param._full_param_padded.storage().size() + ) self.assertEqual(0, inner_param._full_param_padded.storage().size()) output.backward() - with model._summon_full_params(): + with model.summon_full_params( + model, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): pass self.assertEqual(0, outer_param._full_param_padded.storage().size()) self.assertEqual(0, inner_param._full_param_padded.storage().size()) + @skip_if_lt_x_gpu(2) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + @parametrize("mixed_precision", [True, False]) + 
def test_params_are_unflattenned(self, rank0_only, offload_to_cpu, mixed_precision): + layer_shape = (10, 12) + model = nn.Linear(*layer_shape, bias=False).cuda(self.rank) + mixed_precision = MixedPrecision() if mixed_precision else None + fsdp_model = FSDP(deepcopy(model), mixed_precision=mixed_precision).cuda( + self.rank + ) + + def _get_flat_param(): + return fsdp_model.get_parameter("_fsdp_wrapped_module.flat_param") + + flattened_param = _get_flat_param() + self.assertEqual(layer_shape[0] * layer_shape[1] / 2, flattened_param.numel()) + + with fsdp_model.summon_full_params( + fsdp_model, + rank0_only=rank0_only, + writeback=not rank0_only, + offload_to_cpu=offload_to_cpu, + ): + if self.rank == 0 or not rank0_only: + self.assertEqual(fsdp_model.weight.shape, model.weight.shape) + expected_device = ( + torch.device("cpu") + if offload_to_cpu + else torch.device("cuda", torch.cuda.current_device()) + ) + self.assertTrue(expected_device == fsdp_model.weight.device) + else: + # Nonzero rank with rank0_only maintains original params. + flat_within_ctx = _get_flat_param() + self.assertEqual(flat_within_ctx, flattened_param) + self.assertEqual( + flat_within_ctx.device, torch.device(torch.cuda.current_device()) + ) + + # CPU offload should restore the param device + param = next(fsdp_model.parameters()) + self.assertTrue( + param.device == torch.device("cuda", torch.cuda.current_device()) + ) + + @skip_if_lt_x_gpu(2) + @parametrize("rank0_only", [True, False]) + @parametrize("offload_to_cpu", [True, False]) + @parametrize("mixed_precision", [True, False]) + def test_params_count_and_value(self, rank0_only, offload_to_cpu, mixed_precision): + mixed_precision = MixedPrecision() if mixed_precision else None + fsdp_model = FSDP( + NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + mixed_precision=mixed_precision, + ), + mixed_precision=mixed_precision, + ) + model = NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=False, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + + dev = ( + torch.device("cpu") + if offload_to_cpu + else torch.device("cuda", torch.cuda.current_device()) + ) + + params_to_compare = ( + [p.to(dev) for p in model.module.parameters()] + if not rank0_only or self.rank == 0 + else list(p.clone() for p in fsdp_model.parameters()) + ) + with fsdp_model.summon_full_params( + fsdp_model, rank0_only=rank0_only, writeback=not rank0_only + ): + for p1, p2 in itertools.zip_longest( + fsdp_model.parameters(), params_to_compare + ): + self.assertEqual(p1, p2) + + # CPU offload should restore the param device + param = next(fsdp_model.parameters()) + self.assertTrue( + param.device == torch.device("cuda", torch.cuda.current_device()) + ) @skip_if_lt_x_gpu(2) - def test_params_are_unflatenned(self): - model = FSDP(nn.Linear(self.world_size, 1, bias=False)).cuda(self.rank) + def test_raises_rank0_with_writeback(self): + fsdp_model = FSDP( + NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + ) + + with self.assertRaisesRegex(ValueError, "is not supported"): + with fsdp_model.summon_full_params( + fsdp_model, rank0_only=True, writeback=True + ): + pass - flattened_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - self.assertEqual(1, flattened_param.numel()) + @skip_if_lt_x_gpu(2) + @parametrize("prefix", ["", "test_prefix"]) + @parametrize("recurse", [False, True]) + def 
test_named_parameters_buffers(self, prefix: str, recurse: bool): + fsdp_model = FSDP( + NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=True, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + ) + fsdp_model.register_buffer("buffer", torch.ones(1)) + model = NestedWrappedModule( + group=dist.distributed_c10d._get_default_group(), + wrap_fsdp=False, + fsdp_init_mode=FSDPInitMode.CUDA_BEFORE, + ) + model.register_buffer("buffer", torch.ones(1)) + with fsdp_model.summon_full_params(fsdp_model): + for call in ["named_parameters", "named_buffers"]: + for (n1, p1), (n2, p2) in itertools.zip_longest( + getattr(fsdp_model, call)(prefix=prefix, recurse=recurse), + getattr(model, call)(prefix=prefix, recurse=recurse), + ): + self.assertEqual(n1, n2) + self.assertEqual(p1, p2) - with model._summon_full_params(): - a = model.weight.flatten().detach() - b = flattened_param.detach() - self.assertTrue(torch.equal(a, b)) instantiate_parametrized_tests(TestSummonFullParams) +instantiate_parametrized_tests(TestSummonFullParamsNoShard) if __name__ == "__main__": diff --git a/test/distributed/fsdp/test_fsdp_traversal.py b/test/distributed/fsdp/test_fsdp_traversal.py new file mode 100644 index 000000000000..69ceca082441 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_traversal.py @@ -0,0 +1,57 @@ +# Owner(s): ["oncall: distributed"] + +import sys + +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestTraversal(FSDPTest): + @property + def world_size(self): + return 2 + + @skip_if_lt_x_gpu(2) + def test_fsdp_modules(self): + group = dist.distributed_c10d._get_default_group() + model = NestedWrappedModule(group, wrap_fsdp=True) + modules = FSDP.fsdp_modules(model) + self.assertEquals( + modules, [ + model.module.get_submodule("1"), + model.module.get_submodule("1").get_submodule("0"), + model.module.get_submodule("2"), + ] + ) + modules = FSDP.fsdp_modules(model, root_only=True) + self.assertEqual( + modules, [ + model.module.get_submodule("1"), + model.module.get_submodule("2"), + ] + ) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_uneven.py b/test/distributed/fsdp/test_fsdp_uneven.py index 59b111d6d3cd..93b89f547e1f 100644 --- a/test/distributed/fsdp/test_fsdp_uneven.py +++ b/test/distributed/fsdp/test_fsdp_uneven.py @@ -10,7 +10,6 @@ from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import ( FSDPTest, - get_full_params, ) from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests @@ -61,11 +60,13 @@ def test_one_iteration(self): out.float().sum().backward() optim.step() optim.zero_grad() - get_full_params(model) - weight_out = model.module.weight.T.clone() - self.assertEqual(ref_forward_output_my_rank, out) - self.assertEqual(ref_weight_out, weight_out) + with model.summon_full_params(model): + torch.cuda.synchronize() # TODO: This is here because it was + # 
originally part of get_full_params(), debug why it is needed here. + weight_out = model.module.weight.T.clone() + self.assertEqual(ref_forward_output_my_rank, out) + self.assertEqual(ref_weight_out, weight_out) if __name__ == "__main__": diff --git a/test/distributed/fsdp/test_shard_utils.py b/test/distributed/fsdp/test_shard_utils.py new file mode 100644 index 000000000000..1d24b2e3c681 --- /dev/null +++ b/test/distributed/fsdp/test_shard_utils.py @@ -0,0 +1,187 @@ +# Owner(s): ["oncall: distributed"] + +import torch +from torch.distributed._shard.sharded_tensor import ( + init_from_local_shards, + Shard, + ShardMetadata, +) +from torch.distributed._shard.sharding_spec import ( + ChunkShardingSpec, + EnumerableShardingSpec, +) +from torch.distributed.distributed_c10d import _get_default_group +from torch.distributed.fsdp.shard_utils import ( + _offsets_to_split_sizes, + _reshard_flatten_tensor, +) +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import FSDPTest +from torch.testing._internal.common_utils import TestCase + + +class TestShardUtils(TestCase): + def test_offsets_to_split_sizes(self): + tensor_numel = 40 + + def _get_and_check_split_sizes( + world_size, + in_offsets, + out_offsets, + in_split_sizes, + ): + + for my_rank in range(world_size): + _in_split_sizes = in_split_sizes[my_rank] + _out_split_sizes = [ + in_split_sizes[i][my_rank] for i in range(world_size) + ] + res_in_split_sizes, res_out_split_sizes = _offsets_to_split_sizes( + in_offsets, out_offsets, tensor_numel, world_size, my_rank + ) + self.assertEqual(_in_split_sizes, res_in_split_sizes) + self.assertEqual(_out_split_sizes, res_out_split_sizes) + + # The tensor size can be evenly divided by the world size. + world_size = 4 + in_offsets = [0, 10, 20, 30] + out_offsets = [0, 10, 20, 30] + in_split_sizes = [ + [10, 0, 0, 0], + [0, 10, 0, 0], + [0, 0, 10, 0], + [0, 0, 0, 10], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 4 + in_offsets = [0, 3, 17, 18] + out_offsets = [0, 10, 20, 30] + in_split_sizes = [ + [3, 0, 0, 0], + [7, 7, 0, 0], + [0, 1, 0, 0], + [0, 2, 10, 10], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 4 + in_offsets = [0, 10, 20, 30] + out_offsets = [0, 3, 17, 18] + in_split_sizes = [ + [3, 7, 0, 0], + [0, 7, 1, 2], + [0, 0, 0, 10], + [0, 0, 0, 10], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 4 + in_offsets = [0, 7, 11, 25] + out_offsets = [0, 10, 17, 18] + in_split_sizes = [ + [7, 0, 0, 0], + [3, 1, 0, 0], + [0, 6, 1, 7], + [0, 0, 0, 15], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + # The tensor size cannot be evenly divided by the world size. 
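+        # With tensor_numel = 40 and world_size = 6, ranks 0-4 each hold 7 elements and
+        # rank 5 holds the remaining 5, matching the offsets below.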
+ world_size = 6 + in_offsets = [0, 7, 14, 21, 28, 35] + out_offsets = [0, 7, 14, 21, 28, 35] + in_split_sizes = [ + [7, 0, 0, 0, 0, 0], + [0, 7, 0, 0, 0, 0], + [0, 0, 7, 0, 0, 0], + [0, 0, 0, 7, 0, 0], + [0, 0, 0, 0, 7, 0], + [0, 0, 0, 0, 0, 5], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + world_size = 6 + in_offsets = [0, 0, 10, 11, 28, 40] + out_offsets = [0, 7, 14, 21, 28, 35] + in_split_sizes = [ + [0, 0, 0, 0, 0, 0], + [7, 3, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 3, 7, 7, 0, 0], + [0, 0, 0, 0, 7, 5], + [0, 0, 0, 0, 0, 0], + ] + _get_and_check_split_sizes(world_size, in_offsets, out_offsets, in_split_sizes) + + +class TestShardUtilsDistributed(FSDPTest): + @property + def world_size(self): + return 2 + + def _create_local_chunk(self, tensor): + chunk = tensor.chunk(2)[self.rank] + offsets = [0] if self.rank == 0 else [tensor.shape[0] - chunk.shape[0]] + shard = Shard.from_tensor_and_offsets(chunk, offsets, self.rank) + return init_from_local_shards([shard], tensor.numel()) + + def _create_enumerate_spec(self, tensor): + # Since placement is not used, always set placement to rank0 to mimic + # the actual usage. + metadata = [ + ShardMetadata([0], [101], placement="rank0/cuda:0"), + ShardMetadata([101], [900], placement="rank0/cuda:0"), + ] + return EnumerableShardingSpec(metadata) + + def _create_chunk_spec(self): + return ChunkShardingSpec(dim=0, placements=["rank0/cuda:0"]) + + def _create_tensor(self): + # Keep everything deterministic. + torch.manual_seed(0) + return torch.rand(1001).cuda() + + @skip_if_lt_x_gpu(2) + def test_reshard_flatten_tensor(self): + def get_offsets(tensor, shard): + if self.rank == 0: + return [0] + else: + return [tensor.shape[0] - shard.shape[0]] + + tensor = self._create_tensor() + + shard = _reshard_flatten_tensor( + self._create_local_chunk(tensor), + self._create_enumerate_spec(tensor), + self.world_size, + self.rank, + tensor.device, + _get_default_group(), + ) + offsets = [0] if self.rank == 0 else [tensor.shape[0] - shard.shape[0]] + shard = Shard.from_tensor_and_offsets(shard, offsets, self.rank) + uneven_sharded_tensor = init_from_local_shards([shard], tensor.numel()) + + shard = _reshard_flatten_tensor( + uneven_sharded_tensor, + self._create_chunk_spec(), + self.world_size, + self.rank, + tensor.device, + _get_default_group(), + ) + offsets = [0] if self.rank == 0 else [tensor.shape[0] - shard.shape[0]] + shard = Shard.from_tensor_and_offsets(shard, offsets, self.rank) + even_sharded_tensor = init_from_local_shards([shard], tensor.numel()) + + output = torch.empty(tensor.shape).cuda() if self.rank == 0 else None + even_sharded_tensor.gather(0, output) + if self.rank == 0: + self.assertEqual(tensor, output) + output = torch.empty(tensor.shape).cuda() if self.rank == 0 else None + uneven_sharded_tensor.gather(0, output) + if self.rank == 0: + self.assertEqual(tensor, output) diff --git a/test/distributed/fsdp/test_utils.py b/test/distributed/fsdp/test_utils.py index 2fdede0e4a05..2326c7137c30 100644 --- a/test/distributed/fsdp/test_utils.py +++ b/test/distributed/fsdp/test_utils.py @@ -1,14 +1,17 @@ # Owner(s): ["oncall: distributed"] +from collections import OrderedDict import random import sys import unittest import torch +import torch.nn as nn from torch import distributed as dist -from torch.distributed.fsdp.utils import ( +from torch.distributed.fsdp._utils import ( _apply_to_tensors, ) +from torch.distributed.utils import _replace_by_prefix from torch.testing._internal.common_utils import ( 
TEST_WITH_DEV_DBG_ASAN, instantiate_parametrized_tests, @@ -57,7 +60,7 @@ def get_a_tensor(): data.append({"key1": get_a_tensor(), "key2": {1: get_a_tensor()}, "key3": 3}) data.insert(0, set(["x", get_a_tensor(), get_a_tensor()])) data.append(([1], get_a_tensor(), (1), [get_a_tensor()], set((1, 2)))) - od = dict() + od = OrderedDict() od["k"] = "value" data.append(od) @@ -73,6 +76,39 @@ def fn(t): for i, v in enumerate(data): self.assertEqual(type(new_data[i]), type(v)) + def test_replace_by_prefix(self): + state_dict = { + "layer.a": torch.tensor(1), + "abc.layer.def": torch.tensor(2), + "layer.b": torch.tensor(3), + } + original_state_dict = state_dict.copy() + _replace_by_prefix(state_dict, "layer.", "module.layer.") + assert state_dict == { + "module.layer.a": torch.tensor(1), + "abc.layer.def": torch.tensor(2), + "module.layer.b": torch.tensor(3), + } + _replace_by_prefix(state_dict, "module.layer.", "layer.") + assert state_dict == original_state_dict + + + def test_packed_sequence(self): + """Test to ensure RNN packed sequences are modified correctly.""" + rnn = nn.RNN(5, 5) + + x = torch.rand((5, 1, 5), dtype=torch.float) + seq_length = torch.tensor([4], dtype=torch.int) + + def fill_fn(x): + x.fill_(0) + + x = nn.utils.rnn.pack_padded_sequence(x, seq_length) + x, h = rnn(x) + x = _apply_to_tensors(fill_fn, x) + x, _ = nn.utils.rnn.pad_packed_sequence(x) + self.assertEqual(torch.sum(x), 0) + instantiate_parametrized_tests(TestUtils) diff --git a/test/distributed/fsdp/test_wrap.py b/test/distributed/fsdp/test_wrap.py index 0b4c1f8acc6c..2a72860d1f5b 100644 --- a/test/distributed/fsdp/test_wrap.py +++ b/test/distributed/fsdp/test_wrap.py @@ -5,7 +5,6 @@ import os import tempfile import unittest - import torch import torch.nn as nn import torch.nn.functional as F @@ -15,9 +14,13 @@ BackwardPrefetch, ) from torch.distributed.fsdp.wrap import ( - default_auto_wrap_policy, + always_wrap_policy, + size_based_auto_wrap_policy, enable_wrap, + _or_policy, wrap, + _wrap_batchnorm_individually, + transformer_auto_wrap_policy, ) from torch.testing._internal.common_distributed import ( skip_if_lt_x_gpu, @@ -27,6 +30,7 @@ FSDPTest, FSDPInitMode, _maybe_cuda, + TransformerWithSharedParams, ) from torch.testing._internal.common_utils import ( FILE_SCHEMA, @@ -36,6 +40,16 @@ parametrize, instantiate_parametrized_tests, ) +from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer + +class BatchNormNet(nn.Module): + def __init__(self): + super().__init__() + self.lin = nn.Linear(10, 10, bias=False) + self.bn1 = nn.BatchNorm1d(10) + self.bn2 = nn.BatchNorm2d(10) + self.bn3 = nn.BatchNorm3d(10) + self.sync_bn = nn.SyncBatchNorm(10) class WrapMethod(Enum): FSDP_CTOR = auto() @@ -67,6 +81,15 @@ def get_model(cuda=True): sequential = sequential.cuda() return sequential + @staticmethod + def verify_model_all_wrapped(cls, model): + cls.assertTrue(isinstance(model, FSDP)) + cls.assertTrue(isinstance(model.module[0], FSDP)) + cls.assertTrue(isinstance(model.module[1], FSDP)) + cls.assertTrue(isinstance(model.module[2], FSDP)) + cls.assertTrue(isinstance(model.module[2].module[0], FSDP)) + cls.assertTrue(isinstance(model.module[2].module[1], FSDP)) + @staticmethod def verify_model(cls, model): cls.assertTrue(isinstance(model, FSDP)) @@ -123,7 +146,76 @@ def test_error_already_wrapped(self, nested, fsdp_init_mode): wrapped_fsdp = wrapped_fsdp.cuda() with self.assertRaisesRegex(ValueError, "to NOT be FullyShardedDataParallel"): - mod = FSDP(wrapped_fsdp, fsdp_auto_wrap_policy=default_auto_wrap_policy) 
+ mod = FSDP(wrapped_fsdp, auto_wrap_policy=size_based_auto_wrap_policy) + + @skip_if_lt_x_gpu(2) + @parametrize("use_or_policy", [True, False]) + def test_wrap_batchnorm_individually(self, use_or_policy): + def never_wrap_policy(*args, **kwargs): + return False + + policy = ( + functools.partial( + _or_policy, + policies=[never_wrap_policy, _wrap_batchnorm_individually] + ) if use_or_policy else _wrap_batchnorm_individually + ) + model = BatchNormNet() + fsdp = FSDP(model, auto_wrap_policy=policy) + # Batchnorms should be wrapped + for layer in [fsdp.bn1, fsdp.bn2, fsdp.bn3, fsdp.sync_bn]: + self.assertTrue(isinstance(layer, FSDP)) + + self.assertFalse(isinstance(fsdp.lin, FSDP)) + + @skip_if_lt_x_gpu(2) + def test_bn_always_wrapped_individually(self): + """ + Ensures that by using _or_policy with _wrap_batchnorm_individually, even + if the other policy results in a module containing a BN unit being + wrapped, the contained BN unit will still be individually wrapped. + """ + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.bn_container = BatchNormNet() + + def wrap_bn_container(module, recurse, *args, **kwargs): + if recurse: + return True + return isinstance(module, BatchNormNet) + + my_policy = functools.partial( + _or_policy, + policies=[wrap_bn_container, _wrap_batchnorm_individually] + ) + mod = MyModule() + fsdp = FSDP(mod, auto_wrap_policy=my_policy) + + # Wrapping should be FSDP(FSDP(BatchNormNet(FSDP(BN)))) + # and not FSDP(FSDP(BatchNormNet(BN))) (in the latter the inner + # BN is not individually wrapped.) + + for bn in [ + fsdp.bn_container.bn1, + fsdp.bn_container.bn2, + fsdp.bn_container.bn3, + fsdp.bn_container.sync_bn + ]: + self.assertTrue(isinstance(bn, FSDP)) + + # if we just wrapped BN container, individual batchnorms are not + # wrapped. 
+ mod = MyModule() + fsdp = FSDP(mod, auto_wrap_policy=wrap_bn_container) + self.assertTrue(isinstance(mod.bn_container, FSDP)) + for bn in [ + fsdp.bn_container.bn1, + fsdp.bn_container.bn2, + fsdp.bn_container.bn3, + fsdp.bn_container.sync_bn + ]: + self.assertFalse(isinstance(bn, FSDP)) @skip_if_lt_x_gpu(2) @parametrize( @@ -168,8 +260,8 @@ def forward(self, input): model = MyModel() wrapped_model = FSDP( model, - fsdp_auto_wrap_policy=functools.partial( - default_auto_wrap_policy, + auto_wrap_policy=functools.partial( + size_based_auto_wrap_policy, min_num_params=0, # wrap all modules ), cpu_offload=cpu_offload, @@ -216,6 +308,7 @@ def setUp(self) -> None: # For all the tests here, we use a fake group self.process_group = DummyProcessGroup(rank=0, size=1) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") @parametrize("wrap_method", [WrapMethod.FSDP_CTOR, WrapMethod.WRAP_API]) def test_wrap(self, wrap_method): if wrap_method == WrapMethod.WRAP_API: @@ -226,12 +319,13 @@ def test_wrap(self, wrap_method): layer = FSDP( nn.Linear(5, 5), process_group=self.process_group, - fsdp_auto_wrap_policy=functools.partial(default_auto_wrap_policy, min_num_params=1) + auto_wrap_policy=functools.partial(size_based_auto_wrap_policy, min_num_params=1) ) self.assertTrue(isinstance(layer, FSDP)) self.assertEqual(layer.rank, self.process_group.rank()) self.assertEqual(layer.world_size, self.process_group.size()) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_wrap_disabled_outside_context(self): pg = self.process_group @@ -248,6 +342,7 @@ def __init__(self): self.assertFalse(isinstance(model.lin, FSDP)) self.assertTrue(isinstance(model.lin, nn.Linear)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_wrap_override_defaults(self): new_process_group = DummyProcessGroup(rank=0, size=2) with enable_wrap(wrapper_cls=FSDP, process_group=self.process_group): @@ -257,6 +352,35 @@ def test_wrap_override_defaults(self): self.assertEqual(layer.rank, 0) self.assertEqual(layer.world_size, 2) + @unittest.skipIf(not torch.cuda.is_available(), "Test Requires CUDA") + def test_always_wrap(self): + """ + Test to ensure that if `always_wrap_policy` is + passed into FSDP, all submodules are wrapped. + """ + seq = TestFSDPWrap.NestedSequentialModel.get_model(cuda=True) + model = FSDP(seq, process_group=self.process_group, auto_wrap_policy=always_wrap_policy) + TestFSDPWrap.NestedSequentialModel.verify_model_all_wrapped(self, model) + + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") + def test_transformer_auto_wrap_policy(self): + model = TransformerWithSharedParams(group=self.process_group) + my_auto_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer} + ) + fsdp_model = FSDP( + model, + process_group=self.process_group, + auto_wrap_policy=my_auto_wrap_policy + ) + self.assertTrue(isinstance(fsdp_model, FSDP)) + for layer in fsdp_model.module.module.transformer.encoder.layers: + self.assertTrue(isinstance(layer, FSDP)) + for layer in fsdp_model.module.module.transformer.decoder.layers: + self.assertTrue(isinstance(layer, FSDP)) + + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_api(self): """ Test to ensure with auto wrap, we wrap child modules correctly based on the min_num_params. 
@@ -264,37 +388,38 @@ def test_auto_wrap_api(self): """ sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) model = FSDP( sequential, process_group=self.process_group, - fsdp_auto_wrap_policy=my_auto_wrap_policy + auto_wrap_policy=my_auto_wrap_policy ) TestFSDPWrap.NestedSequentialModel.verify_model(self, model) - + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_exclude_wrap(self): """ Test to ensure excluded modules are not wrapped, regardless if the total param size is greater than the - min_num_params. the default_auto_wrap_policy excludes wrapping for {nn.ModuleList, nn.ModuleDict} + min_num_params. the size_based_auto_wrap_policy excludes wrapping for {nn.ModuleList, nn.ModuleDict} """ sequential = nn.ModuleList([nn.Linear(5, 5), nn.Linear(5, 5)]) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) model = FSDP( sequential, process_group=self.process_group, - fsdp_auto_wrap_policy=my_auto_wrap_policy + auto_wrap_policy=my_auto_wrap_policy ) self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model[0], nn.Linear)) self.assertTrue(isinstance(model[1], nn.Linear)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_exclude_wrap_include_children(self): """ Test to ensure excluded modules are not wrapped, but children are if param size is greater than @@ -302,43 +427,45 @@ def test_auto_wrap_preset_exclude_wrap_include_children(self): """ sequential = nn.ModuleList([nn.Linear(10, 10)]) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) - model = FSDP(sequential, process_group=self.process_group, fsdp_auto_wrap_policy=my_auto_wrap_policy) + model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model[0], FSDP)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_force_leaf(self): """ Test to ensure force-leaf modules are not wrapped, and children are not wrapped. The - default_auto_wrap_policy forces leaf modules of type {nn.MultiheadAttention} to not be wrapped + size_based_auto_wrap_policy forces leaf modules of type {nn.MultiheadAttention} to not be wrapped """ sequential = nn.Sequential(nn.Linear(10, 10), nn.MultiheadAttention(100, 1)) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 ) - model = FSDP(sequential, process_group=self.process_group, fsdp_auto_wrap_policy=my_auto_wrap_policy) + model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) self.assertTrue(isinstance(model.module[0], FSDP)) # Assert children of multihead attention are not wrapped self.assertTrue(isinstance(model.module[1], nn.MultiheadAttention)) self.assertTrue(isinstance(model.module[1].out_proj, nn.Linear)) + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") def test_auto_wrap_preset_force_leaf_custom(self): """ Test to ensure force-leaf modules are not wrapped. 
""" my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, + size_based_auto_wrap_policy, min_num_params=40, - force_leaf_modules=default_auto_wrap_policy.FORCE_LEAF_MODULES.union( + force_leaf_modules=size_based_auto_wrap_policy.FORCE_LEAF_MODULES.union( {nn.Linear} ), ) sequential = nn.Sequential( nn.Linear(10, 10), nn.ModuleList([nn.Linear(10, 10)]) ) - model = FSDP(sequential, process_group=self.process_group, fsdp_auto_wrap_policy=my_auto_wrap_policy) + model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy) # Model was wrapped in FSDP as no inner modules were wrapped. self.assertTrue(isinstance(model, FSDP)) self.assertTrue(isinstance(model.module[0], nn.Linear)) @@ -350,7 +477,8 @@ def test_auto_wrap_preset_force_leaf_custom(self): "cpu_offload", [CPUOffload(offload_params=False), CPUOffload(offload_params=True)] ) - def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): + @parametrize("use_device_id", [True, False]) + def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload, use_device_id): # CPU offload and CUDA after don't work together as expected. if ( cpu_offload.offload_params and fsdp_init_mode == FSDPInitMode.CUDA_AFTER @@ -359,6 +487,9 @@ def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): device = torch.device("cuda") torch.cuda.set_device(0) + device_id = ( + torch.device("cuda", torch.cuda.current_device()) if use_device_id else None + ) # Random port in case the next test run quickly, same port would cause conflict. os.environ["MASTER_ADDR"] = "localhost" @@ -378,9 +509,11 @@ def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): try: sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=(not cuda_after_init)) my_auto_wrap_policy = functools.partial( - default_auto_wrap_policy, min_num_params=40 + size_based_auto_wrap_policy, min_num_params=40 + ) + model = FSDP( + sequential, cpu_offload=cpu_offload, auto_wrap_policy=my_auto_wrap_policy, device_id=device_id ) - model = FSDP(sequential, cpu_offload=cpu_offload, fsdp_auto_wrap_policy=my_auto_wrap_policy) TestFSDPWrap.NestedSequentialModel.verify_model(self, model) if cuda_after_init: model = model.cuda() @@ -396,6 +529,62 @@ def test_auto_wrap_smoke_test(self, fsdp_init_mode, cpu_offload): except FileNotFoundError: pass + @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs") + @parametrize("wrap_method", [WrapMethod.FSDP_CTOR, WrapMethod.WRAP_API]) + def test_always_wrap_with_ignored_modules(self, wrap_method: WrapMethod): + sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False) + ignored_modules = [sequential[1], sequential[2][0]] + fsdp_kwargs = { + "process_group": self.process_group, + "auto_wrap_policy": always_wrap_policy, + "ignored_modules": ignored_modules, + } + if wrap_method == WrapMethod.FSDP_CTOR: + model = FSDP(sequential, **fsdp_kwargs) + elif wrap_method == WrapMethod.WRAP_API: + with enable_wrap(wrapper_cls=FSDP, **fsdp_kwargs): + model = wrap(sequential) + else: + assert 0, f"Unsupported wrap method: {wrap_method}" + # All non-ignored modules should be wrapped with FSDP + self.assertTrue(isinstance(model, FSDP)) + self.assertTrue(isinstance(model.module[0], FSDP)) + self.assertTrue(isinstance(model.module[1], nn.Linear)) + self.assertTrue(isinstance(model.module[2], FSDP)) + self.assertTrue(isinstance(model.module[2].module[0], nn.Linear)) + self.assertTrue(isinstance(model.module[2].module[1], FSDP)) + + @unittest.skipIf(torch.cuda.device_count() < 2, 
"Requires at least 2 GPUs") + @parametrize("wrap_method", [WrapMethod.FSDP_CTOR, WrapMethod.WRAP_API]) + def test_auto_wrap_with_ignored_modules(self, wrap_method: WrapMethod): + sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False) + ignored_modules = [sequential[1], sequential[2][0]] + my_auto_wrap_policy = functools.partial( + size_based_auto_wrap_policy, min_num_params=40, + ) + fsdp_kwargs = { + "process_group": self.process_group, + "auto_wrap_policy": my_auto_wrap_policy, + "ignored_modules": ignored_modules, + } + if wrap_method == WrapMethod.FSDP_CTOR: + model = FSDP(sequential, **fsdp_kwargs) + elif wrap_method == WrapMethod.WRAP_API: + with enable_wrap(wrapper_cls=FSDP, **fsdp_kwargs): + model = wrap(sequential) + else: + assert 0, f"Unsupported wrap method: {wrap_method}" + # Since the 2nd linear (`sequential[1]`) is ignored, the wrapping + # policy does not exceed the parameter threshold before the inner + # sequential (`sequential[2]`) anymore; hence, it flattens + # `sequential[0]` and `sequential[2][0]` into `model` and leaves + # `sequential[1]` and `sequential[2][1]` as-is since they are ignored + self.assertTrue(isinstance(model, FSDP)) + self.assertTrue(isinstance(model.module[0], nn.Linear)) + self.assertTrue(isinstance(model.module[1], nn.Linear)) + self.assertTrue(isinstance(model.module[2], nn.Sequential)) + self.assertTrue(isinstance(model.module[2][0], nn.Linear)) + self.assertTrue(isinstance(model.module[2][1], nn.Linear)) instantiate_parametrized_tests(TestFSDPWrap) diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py index de8ea511b636..ec7db75d49a1 100644 --- a/test/distributed/optim/test_zero_redundancy_optimizer.py +++ b/test/distributed/optim/test_zero_redundancy_optimizer.py @@ -6,19 +6,17 @@ # LICENSE file in the root directory of this source tree. 
import copy -import itertools import os import sys +import unittest from contextlib import suppress -from typing import Any, List, Type, cast +from typing import Any, List, cast import numpy as np import torch import torch.distributed as dist -import unittest - if not dist.is_available(): print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) @@ -33,16 +31,17 @@ from torch.distributed.optim import ZeroRedundancyOptimizer from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object from torch.nn.parallel import DistributedDataParallel as DDP -from torch.optim import SGD -from torch.testing._internal import common_distributed, common_utils +from torch.optim import SGD, AdamW +from torch.testing._internal import common_distributed from torch.testing._internal.common_utils import ( + IS_WINDOWS, TEST_WITH_ASAN, TEST_WITH_DEV_DBG_ASAN, - sandcastle_skip_if, + instantiate_parametrized_tests, + parametrize, + run_tests, ) -from torch.testing._internal.common_utils import IS_WINDOWS - try: import torchvision HAS_TORCHVISION = True @@ -60,30 +59,19 @@ def _get_backend_for_tests(): BACKEND = _get_backend_for_tests() -DEVICE = "cuda" if torch.cuda.is_available() else "cpu" - - -def check_same_model_params(model_a: torch.nn.Module, model_b: torch.nn.Module, message: str = "") -> None: - for p_a, p_b in zip(model_a.parameters(), model_b.parameters()): - assert torch.allclose(p_a, p_b, atol=1e-3), f"Model parameters differ\n{p_a} {p_b}\n" + message - - for b_a, b_b in zip(model_a.buffers(), model_b.buffers()): - assert torch.allclose(b_a, b_b), f"Model buffers differ {b_a} - {b_b}\n" + message - - @unittest.skipIf( - TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN doesnt work." + TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work." ) class TestZeroRedundancyOptimizer(common_distributed.MultiProcessTestCase): def setUp(self): super(TestZeroRedundancyOptimizer, self).setUp() os.environ["WORLD_SIZE"] = str(self.world_size) - self._spawn_processes() @property def device(self): - return torch.device(self.rank) if torch.cuda.is_available() else torch.device("cpu") + return torch.device("cuda") if torch.cuda.is_available() \ + else torch.device("cpu") @property def world_size(self): @@ -94,7 +82,6 @@ def tearDown(self): torch.distributed.destroy_process_group() except AssertionError: pass - try: os.remove(self.file_name) except OSError: @@ -104,75 +91,94 @@ def dist_init(self, rank, world_size=-1, backend=BACKEND): if (world_size < 1): world_size = self.world_size store = dist.FileStore(self.file_name, world_size) - return dist.init_process_group(backend=backend, store=store, rank=rank, world_size=world_size) + return dist.init_process_group( + backend=backend, store=store, rank=rank, world_size=world_size, + ) # TODO: sandcastle_skip_if does not work here. @unittest.skipIf( - TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN doesnt work." + TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work." ) class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer): def test_state_dict(self): - """Check that the ZeroRedundancyOptimizer exposes the expected state dict interface, - irrespective of the sharding. 
- """ + """Check that ZeroRedundancyOptimizer exposes the expected state dict + interface, irrespective of the sharding.""" self.dist_init(self.rank) - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.1, momentum=0.9) + LR1 = 0.1 + LR2 = 0.01 + MOMENTUM = 0.9 + RECIPIENT_RANK = 0 # rank 0 is the only rank since the world size is 1 + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer( + [x], optimizer_class=SGD, lr=LR1, momentum=MOMENTUM, + ) x.backward() o.step() - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) - self.assertEqual(o.optim.state[x]["momentum_buffer"], torch.tensor([1.0], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) + self.assertEqual( + o.optim.state[x]["momentum_buffer"], + torch.tensor([1.0], device=self.device), + ) o.zero_grad() - o.consolidate_state_dict() # Sync state dict in between replicas - even if there are none + o.consolidate_state_dict(to=RECIPIENT_RANK) state_dict = o.state_dict() - # Check that the state dict is pytorch-compliant key wise + # Check that the state dict has keys compliant with PyTorch self.assertIn("param_groups", state_dict.keys()) self.assertIn("state", state_dict.keys()) - # Check that the pulled state is what we expect, and that we have all the expected keys + # Check that the state has the expected keys self.assertEqual(state_dict["param_groups"][0]["lr"], 0.1) self.assertEqual(state_dict["param_groups"][0]["momentum"], 0.9) self.assertFalse(state_dict["param_groups"][0]["nesterov"]) self.assertEqual(state_dict["param_groups"][0]["weight_decay"], 0.0) self.assertEqual(state_dict["param_groups"][0]["dampening"], 0.0) - # Check that the pulled state and the .param_groups attribute are in sync - for k in state_dict["param_groups"][0].keys(): + # Check that the state and the `param_groups` attribute are in sync + for k in state_dict["param_groups"][0]: if k != "params": - self.assertEqual(state_dict["param_groups"][0][k], o.param_groups[0][k]) + self.assertEqual( + state_dict["param_groups"][0][k], + o.param_groups[0][k], + ) - # Check that it's correctly loaded - o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.01) + # Check that the state is reloaded with the correct values and device + o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=LR2) o.load_state_dict(state_dict) + self.assertEqual( + o.optim.state[x]["momentum_buffer"], + torch.tensor([1.0], device=self.device), + ) - # Check that state is correct and on proper device - self.assertEqual(o.optim.state[x]["momentum_buffer"], torch.tensor([1.0], device=DEVICE)) - - # We should now be using a lr of 0.1, both within the optimizer - # and as exposed by the .param_groups attribute - assert o.param_groups[0]["lr"] == 0.1 + # We should we using `LR1` and not `LR2` after reloading, both within + # the optimizer and as exposed by the `param_groups` attribute + self.assertEqual(o.param_groups[0]["lr"], LR1) x.backward() o.step() - self.assertEqual(x, torch.tensor([0.71], device=DEVICE)) - self.assertEqual(o.optim.state[x]["momentum_buffer"], torch.tensor([1.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.71], device=self.device)) + self.assertEqual( + o.optim.state[x]["momentum_buffer"], + torch.tensor([1.9], device=self.device), + ) - # Check that the exposed param_groups are on the proper device + # Check that the exposed `param_groups`` are on the proper device 
self.assertEqual(o.param_groups[0]["params"][0].device, x.device) def test_lr_scheduler(self): - """ Check that a normal torch lr_scheduler is usable with ZeroRedundancyOptimizer""" - + """Check that a normal PyTorch ``lr_scheduler`` is usable with + ZeroRedundancyOptimizer.""" self.dist_init(self.rank) - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - x2 = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.01) - o2 = torch.optim.SGD([x2], lr=0.01) + NUM_ITERS = 5 + LR = 0.01 + x = torch.tensor([1.0], device=self.device, requires_grad=True) + x2 = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=LR) + o2 = torch.optim.SGD([x2], lr=LR) s = torch.optim.lr_scheduler.StepLR(o, 1) s2 = torch.optim.lr_scheduler.StepLR(o2, 1) - for _ in range(5): + for _ in range(NUM_ITERS): x.backward() o.zero_grad() o.step() @@ -184,8 +190,9 @@ def test_lr_scheduler(self): self.assertEqual(x, x2) def test_step_with_kwargs(self): - """ Check that the `step(**kwargs)` interface is properly exposed""" + """Check that the ``step(**kwargs)`` interface is properly exposed.""" self.dist_init(self.rank) + LR = 0.1 class SGDWithStepKWArg(torch.optim.SGD): def step(self, closure=None, kwarg=None): @@ -193,18 +200,21 @@ def step(self, closure=None, kwarg=None): kwarg.append(5) kwarg: List[Any] = [] - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithStepKWArg, lr=0.1) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer( + [x], optimizer_class=SGDWithStepKWArg, lr=LR, + ) x.backward() o.step(0, kwarg=kwarg) self.assertEqual(kwarg, [5]) - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) def test_step_with_extra_inner_key(self): - """Check that an optimizer adding extra keys to the param_groups - is properly handled, in that the new key is exposed to the user - """ + """Check that ZeroRedundancyOptimizer wrapping an optimizer that adds + extra keys to ``param_groups`` exposes those keys through ZeRO's own + ``param_groups``.""" self.dist_init(self.rank) + LR = 0.1 class SGDWithNewKey(torch.optim.SGD): # Dummy optimizer which adds a new key to the param groups @@ -212,33 +222,38 @@ def step(self, closure=None): super().step() self.param_groups[0]["new_key"] = 0.1 - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithNewKey, lr=0.1) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithNewKey, lr=LR) x.backward() o.step() self.assertEqual(o.param_groups[0]["new_key"], 0.1) - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) def test_step_without_closure(self): - """Check that the step() method (without closure) is handlded as expected""" + """Check that the ``step()`` method (without closure) is handled as + expected.""" self.dist_init(self.rank) + LR = 0.1 class SGDWithoutClosure(torch.optim.SGD): def step(self): return super().step() - x = torch.tensor([1.0], device=DEVICE, requires_grad=True) - o = ZeroRedundancyOptimizer([x], optimizer_class=SGDWithoutClosure, lr=0.1) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer( + [x], 
optimizer_class=SGDWithoutClosure, lr=LR, + ) x.backward() o.step() - self.assertEqual(x, torch.tensor([0.9], device=DEVICE)) + self.assertEqual(x, torch.tensor([0.9], device=self.device)) def test_zero_grad(self): - """Check that the zero_grad attribute is properly handled""" + """Check that the ``zero_grad`` method is properly handled.""" self.dist_init(self.rank) + LR = 0.01 x = torch.rand(1) m = torch.nn.Linear(1, 1) - o = ZeroRedundancyOptimizer(m.parameters(), optimizer_class=SGD, lr=0.1) + o = ZeroRedundancyOptimizer(m.parameters(), optimizer_class=SGD, lr=LR) y = m(x) y.backward(x) self.assertNotEqual(m.weight.grad, torch.zeros_like(m.weight)) @@ -249,27 +264,51 @@ def test_zero_grad(self): def test_constructor(self): """Check the robustness of the ZeroRedundancyOptimizer constructor by - passing different values for `params`""" + passing different values for the ``params`` argument.""" self.dist_init(self.rank) - - m = torch.nn.Linear(1, 1) - # (input, expected error) - inputs = [ - ([], ValueError), # empty parameter list - (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` - (1.2, TypeError), # non-iterable: `float` - ([{"params": m.parameters()}], TypeError), # iterable of dict - (list(m.parameters()) + [42], TypeError), # iterable containing non-`torch.Tensor` - (m.parameters(), None), # `params` as a generator - (list(m.parameters()), None) # `params` as a list + LR = 0.01 + m = torch.nn.Sequential( + torch.nn.Linear(5, 10), + torch.nn.Linear(10, 10), + torch.nn.Linear(10, 10), + ) + # Test various constructor inputs in the form: (input, expected error) + ctor_inputs = [ + ([], ValueError), # empty parameter list + (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` + (1.2, TypeError), # non-iterable: `float` + ([ + {"params": [l.weight for l in m]}, + {"params": [l.bias for l in m]}, + ], None), # iterable of dict + (list(m.parameters()) + [42], TypeError), # iterable containing invalid type + (m.parameters(), None), # `params` as a generator + (list(m.parameters()), None) # `params` as a list ] + for ctor_input, error in ctor_inputs: + context = self.assertRaises(error) if error else suppress() + with context: + ZeroRedundancyOptimizer( + ctor_input, optimizer_class=SGD, lr=LR, + ) - for input, error in inputs: - if (error): - with self.assertRaises(error): - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) - else: - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) + # Test constructing with multiple parameter groups more thoroughly + WD = 0.01 + BETAS = (0.9, 0.999) + EPS = 1e-8 + params = [ + {"params": [l.weight for l in m], "weight_decay": 0.}, + {"params": [l.bias for l in m], "weight_decay": WD}, + ] + o = ZeroRedundancyOptimizer( + params, optimizer_class=AdamW, + lr=LR, betas=BETAS, eps=EPS, + ) + assert len(o.param_groups) == 2, \ + f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}" + assert len(o.optim.param_groups) == 2, \ + "Expected 2 local optimizer param groups, but got " \ + f"{len(o.optim.param_groups)}" def test_same_dense_param_type(self): """Check that ZeroRedundancyOptimizer raises an exception if the input @@ -279,7 +318,7 @@ def test_same_dense_param_type(self): and varying parameter types is added. 
""" self.dist_init(self.rank) - + LR = 0.01 inputs = [ [torch.sparse_coo_tensor(size=(2, 3))], [torch.FloatTensor(1), torch.DoubleTensor(1)], @@ -288,37 +327,63 @@ def test_same_dense_param_type(self): ] for input in inputs: with self.assertRaises(ValueError): - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) + ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=LR) class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): + @property + def device(self): + return torch.device(self.rank) if torch.cuda.is_available() \ + else torch.device("cpu") + @property def world_size(self): return min(4, max(2, torch.cuda.device_count())) - @common_distributed.skip_if_rocm - def test_step(self): - """ Check that the ZeroRedundancyOptimizer wrapper properly exposes the `.step()` interface""" + @property + def context(self): + return suppress() if not torch.cuda.is_available() \ + else torch.cuda.device(self.rank) - if self.rank >= self.world_size or (torch.cuda.is_available() and torch.cuda.device_count() < 2): - return + def _check_same_model_params( + self, + model_a: torch.nn.Module, + model_b: torch.nn.Module, + message: str = "", + ) -> None: + # Check that model parameters match + for p_a, p_b in zip(model_a.parameters(), model_b.parameters()): + torch.testing.assert_close( + p_a, p_b, atol=1e-3, rtol=1e-5, + msg=f"Model parameters differ:\n{p_a} {p_b}\n" + message, + ) + # Check that model buffers match + for b_a, b_b in zip(model_a.buffers(), model_b.buffers()): + torch.testing.assert_close( + b_a, b_b, + msg=f"Model buffers differ:\n{b_a} {b_b}\n" + message, + ) + @common_distributed.skip_if_no_gpu + @common_distributed.skip_if_rocm + def test_step(self): + """Check that ZeroRedundancyOptimizer properly exposes the ``step()`` + interface.""" self.dist_init(self.rank, world_size=self.world_size) + LR = 0.01 - context = suppress() if not torch.cuda.is_available() else torch.cuda.device(self.rank) - - with context: + with self.context: x = torch.tensor([float(self.rank + 1)], device=self.device) m = torch.nn.Linear(1, 1) m.weight.data = torch.tensor([[1.0]]) m.bias.data = torch.tensor([2.0]) - m_zero = copy.deepcopy(m) - m.to(self.device) - m_zero.to(self.device) + m = m.to(self.device) + m_zero = copy.deepcopy(m).to(self.device) - lr = 0.1 - o = SGD(m.parameters(), lr=lr) - o_zero = ZeroRedundancyOptimizer(m_zero.parameters(), optimizer_class=SGD, lr=lr) + o = SGD(m.parameters(), lr=LR) + o_zero = ZeroRedundancyOptimizer( + m_zero.parameters(), optimizer_class=SGD, lr=LR, + ) y = m(x) y.backward(x) @@ -337,24 +402,23 @@ def test_step(self): self.assertEqual(m.weight, m_zero.weight) self.assertEqual(m.bias, m_zero.bias) + @common_distributed.skip_if_no_gpu @common_distributed.skip_if_rocm def test_step_with_closure(self): - """ Check that the ZeroRedundancyOptimizer wrapper properly exposes the `.step(closure)` interface""" - - if self.rank >= self.world_size or (torch.cuda.is_available() and torch.cuda.device_count() < 2): - return - + """Check that ZeroRedundancyOptimizer properly exposes the + ``step(closure)`` interface.""" self.dist_init(self.rank, world_size=self.world_size) - context = suppress() if not torch.cuda.is_available() else torch.cuda.device(self.rank) - - with context: + with self.context: for bucket_view in [False, True]: x_val = self.rank + 1 weight = 1.0 bias = 2.0 error = 1.0 - target = torch.tensor([x_val * weight + bias + error], device=self.device) + target = torch.tensor( + [x_val * weight + bias + error], + device=self.device, + ) loss_fn = 
torch.nn.L1Loss() x = torch.tensor([float(x_val)], device=self.device) @@ -389,32 +453,62 @@ def closure(): self.assertEqual(m.weight, torch.tensor([[1.1]])) self.assertEqual(m.bias, torch.tensor([2.1])) + @common_distributed.skip_if_no_gpu + def test_lr_scheduler(self): + """Check that a normal PyTorch ``lr_scheduler`` is usable with + ZeroRedundancyOptimizer.""" + self.dist_init(self.rank) + x = torch.tensor([1.0], device=self.device, requires_grad=True) + x2 = torch.tensor([1.0], device=self.device, requires_grad=True) + o = ZeroRedundancyOptimizer([x], optimizer_class=SGD, lr=0.01) + o2 = torch.optim.SGD([x2], lr=0.01) + s = torch.optim.lr_scheduler.StepLR(o, 1) + s2 = torch.optim.lr_scheduler.StepLR(o2, 1) + for _ in range(5): + x.backward() + o.zero_grad() + o.step() + s.step() + x2.backward() + o2.zero_grad() + o2.step() + s2.step() + self.assertEqual(x, x2) + def test_sharding(self): - """ Check the sharding at construction time + """ + Check ZeroRedundancyOptimizer's parameter sharding at construction + time. NOTE: The correctness of this test depends on the ZeRO implementation using the sorted-greedy partitioning algorithm. For details, see - `ZeroRedundancyOptimizer._partition_parameters()` in - `zero_redundancy_optimizer.py`. + ``ZeroRedundancyOptimizer._partition_parameters()`` in + zero_redundancy_optimizer.py. """ self.dist_init(self.rank) + LR = 0.01 sizes = [9, 7, 5, 3] params = [] for size in sizes * self.world_size: params.append(torch.rand(size, 1)) - o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.1) - self.assertEqual(sum([x.numel() for x in o.optim.param_groups[0]["params"]]), sum(sizes)) + o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=LR) + self.assertEqual( + sum([x.numel() for x in o.optim.param_groups[0]["params"]]), + sum(sizes), + ) def test_add_param_group(self): - """Check that ZeroRedundancyOptimizer properly handles adding a new param_group a posteriori, - and that all ranks get a shard + """Check that ZeroRedundancyOptimizer properly handles adding a new + parameter group a posteriori and that all ranks get a shard of the + contained parameters. NOTE: The correctness of this test depends on the ZeRO implementation using the sorted-greedy partitioning algorithm. For details, see - `ZeroRedundancyOptimizer._partition_parameters()` in - `zero_redundancy_optimizer.py`. + ``ZeroRedundancyOptimizer._partition_parameters()`` in + zero_redundancy_optimizer.py. """ self.dist_init(self.rank) + LR = 0.01 # Test with all parameters trainable to begin with def all_trainable(): @@ -424,19 +518,26 @@ def all_trainable(): for size in sizes_world[:-1]: params.append(torch.rand(size, 1)) - # Make sure that the params are trainable, enforces size-based partitioning + # Make sure that the params are trainable so that they are factored + # into the size-based parameter partitioning for p in params: p.requires_grad = True - o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.1) - - assert len(o.param_groups) == 1 + o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=LR) + self.assertEqual(len(o.param_groups), 1) o.add_param_group({"params": [torch.rand(3, 1)]}) - - assert len(o.param_groups) == 2 - # Verify that added group is added to the correct partition making all have the same elements. 
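As a quick illustration of the sharding invariants that `test_sharding` and `test_add_param_group` check, here is a single-rank sketch (port and tensor sizes are placeholders). With only one rank the greedy partitioning is not visible, but the `o.optim` local view and `add_param_group` behave as in the tests.

import os

import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.optim import SGD

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

sizes = [9, 7, 5, 3]
params = [torch.rand(size, 1, requires_grad=True) for size in sizes]
o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.01)

# `o.param_groups` is the full (unsharded) view; `o.optim` is the wrapped
# local optimizer holding only this rank's shard. With world_size == 1 the
# shard covers everything, so the two element counts match.
full = sum(p.numel() for g in o.param_groups for p in g["params"])
local = sum(p.numel() for g in o.optim.param_groups for p in g["params"])
assert full == local == sum(sizes)

# Groups added after construction are partitioned across ranks as well.
o.add_param_group({"params": [torch.rand(3, 1, requires_grad=True)]})
assert len(o.param_groups) == 2 and len(o.optim.param_groups) == 2

dist.destroy_process_group()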
- assert sum([x.numel() for g in o.optim.param_groups for x in g["params"]]) == sum(sizes) - assert len(o.optim.param_groups) == 2 + # Verify that new group is added to the correct partition, making + # all partitions have the same elements + self.assertEqual(len(o.param_groups), 2) + self.assertEqual( + sum([ + x.numel() + for g in o.optim.param_groups + for x in g["params"] + ]), + sum(sizes), + ) + self.assertEqual(len(o.optim.param_groups), 2) # Test a pathological config with a first big non-trainable param def some_trainable(): @@ -444,40 +545,108 @@ def some_trainable(): for size in [100, 3, 5, 2, 6, 4]: params.append(torch.rand(size, 1)) - # Make sure that the params are trainable, enforces size-based partitioning + # Make sure that all but the first param are trainable so that they + # are factored into the size-based parameter partitioning for p in params[1:]: p.requires_grad = True - o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=0.1) - - assert len(o.param_groups) == 1 + o = ZeroRedundancyOptimizer(params, optimizer_class=SGD, lr=LR) + self.assertEqual(len(o.param_groups), 1) o.add_param_group({"params": [torch.rand(3, 1)]}) - - assert len(o.param_groups) == 2 - assert len(o.optim.param_groups) == 2 + self.assertEqual(len(o.param_groups), 2) + self.assertEqual(len(o.optim.param_groups), 2) all_trainable() some_trainable() - @common_distributed.skip_if_lt_x_gpu(2) - def test_collect_shards(self): - """ Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer""" + @common_distributed.skip_if_no_gpu + def test_multiple_param_groups(self): + """ + Check parity between constructing ZeRO with multiple parameter groups + upfront versus adding parameter groups to ZeRO after construction + versus a non-sharded optimizer. 
+ """ self.dist_init(self.rank) - RECIPIENT_RANK = 0 - - # Run a dummy step so that the optimizer state dict exists - batch, input_width, hidden, target_width = 3, 20, 10, 5 - target = torch.rand((batch, target_width), device=self.device) - inputs = torch.rand((batch, input_width), device=self.device) - - model = torch.nn.Sequential(torch.nn.Linear(input_width, hidden), torch.nn.Linear(hidden, target_width)) - model.to(self.device) + BATCH_SIZE, NUM_ITERS = 8, 3 + INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = 5, 10, 5 + WD, LR = 0.01, 0.01 + model1 = torch.nn.Sequential( + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), + ) + model2 = copy.deepcopy(model1) + model3 = copy.deepcopy(model1) + model1 = model1.to(self.device) + model2 = model2.to(self.device) + model3 = model3.to(self.device) + inputs = [ + torch.randn(BATCH_SIZE, INPUT_DIM).to(self.device) + for _ in range(NUM_ITERS) + ] + # Construct `optim1` with both parameter groups upfront + optim1 = ZeroRedundancyOptimizer( + [ + {"params": [l.weight for l in model1], "weight_decay": 0.}, + {"params": [l.bias for l in model1], "weight_decay": WD}, + ], + optimizer_class=AdamW, lr=LR, + ) + # Construct `optim2` by adding the second parameter after + optim2 = ZeroRedundancyOptimizer( + [l.weight for l in model2], + optimizer_class=AdamW, lr=LR, weight_decay=0., + ) + optim2.add_param_group( + {"params": [l.bias for l in model2], "weight_decay": WD} + ) + # Construct `optim3` as a non-sharded optimizer + optim3 = AdamW( + [ + {"params": [l.weight for l in model3], "weight_decay": 0.}, + {"params": [l.bias for l in model3], "weight_decay": WD}, + ], lr=LR, + ) + # Check parity over a few iterations + for input in inputs: + for model, optim in ( + (model1, optim1), (model2, optim2), (model3, optim3), + ): + optim.zero_grad() + out = model(input) + loss = out.sum() + loss.backward() + optim.step() + for layer1, layer2, layer3 in zip(model1, model2, model3): + torch.testing.assert_close(layer1.weight, layer2.weight) + torch.testing.assert_close(layer1.weight, layer3.weight) + torch.testing.assert_close(layer1.bias, layer2.bias) + torch.testing.assert_close(layer1.bias, layer3.bias) + @common_distributed.skip_if_no_gpu + @common_distributed.skip_if_rocm + def test_collect_shards(self): + """Check the state consolidation mechanism and the state dict exposed + by ZeroRedundancyOptimizer.""" + self.dist_init(self.rank) + LR = 1e-3 + MOMENTUM = 0.99 + BATCH_SIZE, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = 3, 20, 10, 5 + REFERENCE_RANK = 0 + target = torch.rand((BATCH_SIZE, OUTPUT_DIM), device=self.device) + inputs = torch.rand((BATCH_SIZE, INPUT_DIM), device=self.device) + model = torch.nn.Sequential( + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), + ).to(self.device) loss_fn = torch.nn.L1Loss() loss_fn.to(self.device) - - # With SGD, Momentum is required to get a state to shard - optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=SGD, lr=0.1, momentum=0.99) + optimizer = ZeroRedundancyOptimizer( + model.parameters(), + optimizer_class=SGD, + lr=LR, + momentum=MOMENTUM, # ensure there exists state to shard + ) def closure(): optimizer.zero_grad() @@ -486,56 +655,78 @@ def closure(): loss.backward() return loss + # Run a dummy step so that the optimizer state dict exists _ = optimizer.step(closure=closure) - # Update the optimizer state on the reference rank - optimizer.consolidate_state_dict(to=RECIPIENT_RANK) - - # Fetch the state 
on the reference rank - # - check that it has the correct size - # - load it again - if self.rank == RECIPIENT_RANK: + # Get the optimizer state on the reference rank + optimizer.consolidate_state_dict(to=REFERENCE_RANK) + if self.rank == REFERENCE_RANK: + # Check that the state has the correct size optimizer_state_dict = optimizer.state_dict() - self.assertEqual(len(optimizer_state_dict["state"]), len(list(model.parameters()))) + self.assertEqual( + len(optimizer_state_dict["state"]), + len(list(model.parameters())), + ) else: optimizer_state_dict = {} + # Load the optimizer state on all ranks without any exceptions optimizer_state_dict = _broadcast_object( optimizer_state_dict, - src_rank=RECIPIENT_RANK, + src_rank=REFERENCE_RANK, group=dist.group.WORLD, device=self.device, ) - - # Load the optimizer state dict, check that no exception is raised optimizer.load_state_dict(optimizer_state_dict) - @sandcastle_skip_if( - IS_WINDOWS, - "Test is flaky on windows: https://github.com/pytorch/pytorch/issues/66059" - ) - def test_multiple_groups(self): - """ Check that the ZeroRedundancyOptimizer handles working with multiple process groups""" - self.dist_init(self.rank, self.world_size, dist.Backend.GLOO) - - # Only work with the even ranks, to check that the global_rank indexing is properly used - sub_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size))) - process_group = torch.distributed.new_group(ranks=sub_group_ranks, backend="gloo") + def test_nondefault_process_group(self): + """Check that ZeroRedundancyOptimizer works with a non-default process + group consisting only of even ranks.""" + # Skip the test if below the minimum world size since then the test is + # trivial + MIN_WORLD_SIZE = 4 + if self.world_size < MIN_WORLD_SIZE: + common_distributed.logger.info( + "Skipping `test_nondefault_process_group()` since world size " + f"of {self.world_size} is less than {MIN_WORLD_SIZE}" + ) + return + BACKEND = dist.Backend.GLOO + self.dist_init(self.rank, self.world_size, BACKEND) + # Use GPU if enough are available, or fall back to CPU otherwise, which + # is fine since Gloo backend supports both + if torch.cuda.is_available() and \ + torch.cuda.device_count() >= self.world_size: + device = torch.device(self.rank) + else: + device = torch.device("cpu") + # Create a new process group consisting of the even ranks to exercise + # the case where the global and local ranks do not necessarily match + subgroup_ranks = [r for r in range(self.world_size) if r % 2 == 0] + process_group = dist.new_group( + ranks=subgroup_ranks, backend=BACKEND, + ) + # Ranks not participating in the new process group are no longer needed + if self.rank not in subgroup_ranks: + return - # Make sure that all the ranks get different training data - # So that the sync check in between their models is meaningful + # Set different seeds across ranks so that each rank gets different + # training data and hence the model sync check is meaningful torch.manual_seed(self.rank) np.random.seed(self.rank) - # Standard deep learning setup - epochs, batch, input_width, hidden, target_width = 5, 3, 20, 10, 5 - loss_fn = torch.nn.L1Loss().to(self.device) + EPOCHS, BATCH_SIZE, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = 5, 3, 20, 10, 5 + LR = 1e-3 + MOMENTUM = 0.99 + REFERENCE_RANK = 0 + assert REFERENCE_RANK in subgroup_ranks, \ + "Reference rank must be in the new process group" + loss_fn = torch.nn.L1Loss().to(device) def check(optimizer): - # Just run a couple of epochs, check that the model is properly updated - for _ in 
range(epochs): - target = torch.rand((batch, target_width), device=self.device) - inputs = torch.rand((batch, input_width), device=self.device) + for _ in range(EPOCHS): + target = torch.rand((BATCH_SIZE, OUTPUT_DIM), device=device) + inputs = torch.rand((BATCH_SIZE, INPUT_DIM), device=device) def closure(): optimizer.zero_grad() @@ -543,167 +734,189 @@ def closure(): loss = loss_fn(output, target) loss /= self.world_size loss.backward() - dist.all_reduce(loss, group=process_group) # Not strictly needed for the test below - + dist.all_reduce(loss, group=process_group) return loss _ = optimizer.step(closure=closure) - # Check that all the params are the same on all ranks + # Check that the parameters match across ranks after a step for pg in optimizer.param_groups: for p in pg["params"]: - receptacle = [p.clone() for _ in sub_group_ranks] if self.rank == 0 else [] - dist.gather(p, receptacle, dst=0, group=process_group) - if self.rank == 0: - for sync_p in receptacle[1:]: - assert torch.all(torch.eq(receptacle[0], sync_p)), "Models differ in between ranks" - - if self.rank in sub_group_ranks: - # Model fitting in the broadcast bucket - model = torch.nn.Sequential( - torch.nn.Linear(input_width, hidden), - torch.nn.Linear(hidden, target_width), - ).to(self.device) + receptacle = [ + p.clone() for _ in subgroup_ranks + ] if self.rank == REFERENCE_RANK else [] + dist.gather( + p, receptacle, dst=REFERENCE_RANK, + group=process_group, + ) + if self.rank == REFERENCE_RANK: + reference_param = receptacle[0] + for param in receptacle[1:]: + torch.testing.assert_close( + reference_param, + param, + msg="Models differ between ranks", + ) - # With SGD, Momentum is required to get a state to shard - optimizer = ZeroRedundancyOptimizer( - model.parameters(), optimizer_class=SGD, lr=0.1, momentum=0.99, process_group=process_group - ) - check(optimizer) + model = torch.nn.Sequential( + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), + ).to(device) + optimizer = ZeroRedundancyOptimizer( + model.parameters(), + optimizer_class=SGD, + lr=LR, + momentum=MOMENTUM, # ensure there exists state to shard + process_group=process_group, + ) + check(optimizer) - # Model not-fitting in the broadcast bucket + @common_distributed.skip_if_no_gpu + @parametrize( + "optimizer_class_str", + ["Adam", "AdamW", "SGD"], + # Use string to appease the internal test name parser + ) + @parametrize( + "maximize", + [False, True], + ) + def test_local_optimizer_parity( + self, + optimizer_class_str: str, + maximize: bool, + ): + """When combined with DDP, check that a local optimizer gives the same + results as wrapping that optimizer with ZeroRedundancyOptimizer.""" + self.dist_init(self.rank) + BATCHES = 20 + BATCH_SIZE = 64 + LR = 1e-3 + INPUT_DIM = 2 + HIDDEN_DIM = 3 + OUTPUT_DIM = 3 + torch.manual_seed(self.rank) + np.random.seed(self.rank) + if optimizer_class_str == "Adam": + optimizer_class = torch.optim.Adam + elif optimizer_class_str == "AdamW": + optimizer_class = torch.optim.AdamW + elif optimizer_class_str == "SGD": + optimizer_class = torch.optim.SGD + else: + assert 0, f"Unsupported optimizer class: {optimizer_class_str}" + + with self.context: + # Define a base model with a different buffer for each rank model = torch.nn.Sequential( - torch.nn.Linear(input_width, hidden), - torch.nn.Linear(hidden, target_width), + torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), ).to(self.device) - - # With SGD, Momentum is 
required to get a state to shard - optimizer = ZeroRedundancyOptimizer( - model.parameters(), - optimizer_class=SGD, - lr=0.1, - momentum=0.99, - process_group=process_group, + model.register_buffer( + "test_buffer", torch.ones((1), device=self.device) * self.rank, + ) + # Define models/optimizers for DDP with ZeRO and DDP with local + # optimizer + defaults = {"maximize": True} if maximize else {} + sharded_optimizer = ZeroRedundancyOptimizer( + params=model.parameters(), optimizer_class=optimizer_class, + lr=LR, **defaults, + ) + sharded_ddp_model = DDP( + module=model, device_ids=[self.rank], + broadcast_buffers=True, find_unused_parameters=True, + ) + local_model = copy.deepcopy(model).to(self.device) + ddp_optimizer = optimizer_class( + local_model.parameters(), lr=LR, **defaults, + ) + ddp_model = DDP( + local_model, device_ids=[self.rank], + broadcast_buffers=True, find_unused_parameters=True, + ) + # Check that the model is properly synchronized between ranks + # at construction time + self._check_same_model_params( + sharded_ddp_model, ddp_model, + "Models differ from the start", ) - check(optimizer) - - @common_distributed.skip_if_no_gpu - def test_local_optimizer_parity(self): - """When combined with DDP, check that ZeroRedundancyOptimizer(optimizer) and the same monolithic optimizer - give the exact same results - """ - self.dist_init(self.rank) - BATCHS = 20 - - with torch.cuda.device(self.rank): - torch.manual_seed(self.rank) - np.random.seed(self.rank) - - def check_optimizer_equivalence(optimizer: Type[torch.optim.Optimizer], maximize: bool = False): - # Any model works. Add one different buffer per rank - model = torch.nn.Sequential( - torch.nn.Linear(2, 3), - torch.nn.Linear(3, 3), - torch.nn.Linear(3, 3), - ) - model.register_buffer("test_buffer", torch.ones((1)) * self.rank) - model.to(self.device) + def check_step(): + input_tensor = torch.rand((BATCH_SIZE, INPUT_DIM)) - defaults = dict() + def closure_ddp(input_tensor=input_tensor): + ddp_optimizer.zero_grad() + ddp_loss = ddp_model(input_tensor).abs().sum() + ddp_loss.backward() + return ddp_loss - if maximize: - defaults['maximize'] = True + def closure_sharded(input_tensor=input_tensor): + sharded_optimizer.zero_grad() + sharded_loss = sharded_ddp_model(input_tensor).abs().sum() + sharded_loss.backward() + return sharded_loss - sharded_optimizer = ZeroRedundancyOptimizer( - params=model.parameters(), optimizer_class=optimizer, lr=1e-3, **defaults + loss_ddp = cast( + torch.Tensor, ddp_optimizer.step(closure=closure_ddp), ) - sharded_ddp_model = DDP( - module=model, device_ids=[self.rank], broadcast_buffers=True, find_unused_parameters=True + loss_sharded_optim = cast( + torch.Tensor, + sharded_optimizer.step(closure=closure_sharded), ) - - ddp_model_single = copy.deepcopy(model) - ddp_model_single.to(self.device) - - ddp_optimizer = optimizer(ddp_model_single.parameters(), lr=1e-3, **defaults) - ddp_model = DDP( - ddp_model_single, device_ids=[self.rank], broadcast_buffers=True, find_unused_parameters=True + torch.testing.assert_close( + loss_ddp, loss_sharded_optim, + msg="Losses differ between local optimizer and ZeRO", + ) + self._check_same_model_params( + sharded_ddp_model, ddp_model, + "Models differ after a step", ) - # The model should be synchronized in between the ranks at construction time, check that - check_same_model_params(sharded_ddp_model, ddp_model, "Models differ from the start") - - def check_step(): - input_tensor = torch.rand((64, 2)) - - def closure_ddp(input_tensor=input_tensor): - 
ddp_optimizer.zero_grad() - ddp_loss = ddp_model(input_tensor).abs().sum() - ddp_loss.backward() - return ddp_loss - - def closure_sharded(input_tensor=input_tensor): - sharded_optimizer.zero_grad() - sharded_loss = sharded_ddp_model(input_tensor).abs().sum() - sharded_loss.backward() - return sharded_loss - - loss_ddp = cast(torch.Tensor, ddp_optimizer.step(closure=closure_ddp)) - loss_sharded_optim = cast(torch.Tensor, sharded_optimizer.step(closure=closure_sharded)) - - assert torch.allclose( - loss_ddp, loss_sharded_optim - ), "Losses differ in between Pytorch optim and ZeroRedundancyOptimizer" - - check_same_model_params(sharded_ddp_model, ddp_model, "Models differ after a step") - - # The models should stay the same in between the ranks - for i in range(BATCHS): - check_step() - - # Change the models trainability, check that parity is maintained - # only check after a couple of constant batchs to go through both regimes - if i > BATCHS // 2: - next(ddp_model.parameters()).requires_grad = bool(i % 2) - next(sharded_ddp_model.parameters()).requires_grad = bool(i % 2) - - # Check that the checkpoints are compatible - reference_rank = 0 - # - get states - ddp_state_dict = ddp_optimizer.state_dict() - sharded_optimizer.consolidate_state_dict(to=reference_rank) - sharded_optim_state_dict = [sharded_optimizer.state_dict() if self.rank == reference_rank else {}] - dist.broadcast_object_list(sharded_optim_state_dict, src=reference_rank, group=dist.group.WORLD) - sharded_optim_state_dict = sharded_optim_state_dict[0] - - # - cross load the states - # run one step and check that the models are still the same - ddp_state_dict_ref = copy.deepcopy(ddp_state_dict) # OSS will remove some states - ddp_optimizer.load_state_dict(sharded_optim_state_dict) # mixup on purpose ! 
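For reference, the consolidation pattern that `test_collect_shards` and the checkpoint cross-loading above rely on, written as a small helper. It assumes the default process group is already initialized on every rank (for example via torchrun); the model shape, learning rate, and momentum are placeholder values.

import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.optim import SGD

def checkpoint_zero_state(reference_rank: int = 0) -> dict:
    rank = dist.get_rank()
    model = torch.nn.Linear(20, 5)
    opt = ZeroRedundancyOptimizer(
        model.parameters(), optimizer_class=SGD, lr=1e-3, momentum=0.99,
    )
    # Take one step so there is momentum state to shard and consolidate.
    model(torch.randn(3, 20)).sum().backward()
    opt.step()
    # Each rank only holds its own shard, so the full optimizer state must be
    # gathered onto one rank before it can be saved or broadcast.
    opt.consolidate_state_dict(to=reference_rank)
    state = [opt.state_dict() if rank == reference_rank else None]
    dist.broadcast_object_list(state, src=reference_rank)
    # Every rank can now load the same consolidated state dict.
    opt.load_state_dict(state[0])
    return state[0]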
- sharded_optimizer.load_state_dict(ddp_state_dict) - check_step() - - # - self load, rewind, check no problem - # run one step and check that the models are still the same - ddp_optimizer.load_state_dict(ddp_state_dict_ref) - sharded_optimizer.load_state_dict(sharded_optim_state_dict) + # Check that parity is maintained + for i in range(BATCHES): check_step() + # For the second half of batches, change the parameter + # trainability to further test parity + if i > BATCHES // 2: + next(ddp_model.parameters()).requires_grad = bool(i % 2) + next(sharded_ddp_model.parameters()).requires_grad = bool(i % 2) + + # Check that the `state_dict` checkpoints are compatible between + # the local optimizer and ZeRO + REFERENCE_RANK = 0 + # - Get states + ddp_state_dict = ddp_optimizer.state_dict() + sharded_optimizer.consolidate_state_dict(to=REFERENCE_RANK) + sharded_optim_state_dict = [ + sharded_optimizer.state_dict() + if self.rank == REFERENCE_RANK else {} + ] + dist.broadcast_object_list( + sharded_optim_state_dict, src=REFERENCE_RANK, + group=dist.group.WORLD, + ) + sharded_optim_state_dict = sharded_optim_state_dict[0] - for opt in [torch.optim.Adam, torch.optim.AdamW, torch.optim.SGD]: - for maximize in (True, False): - check_optimizer_equivalence(opt, maximize=maximize) + # - Cross-load the states + # Run one step and check that the models are still the same + ddp_state_dict_ref = copy.deepcopy(ddp_state_dict) + ddp_optimizer.load_state_dict(sharded_optim_state_dict) + sharded_optimizer.load_state_dict(ddp_state_dict) + check_step() + # - Reload their respective states + # Run one step and check that the models are still the same + ddp_optimizer.load_state_dict(ddp_state_dict_ref) + sharded_optimizer.load_state_dict(sharded_optim_state_dict) + check_step() def _test_zero_join(self, device): - r""" - Check that the ZeRO join hook allows training with uneven inputs when using the given device. - - Arguments: - device (torch.device): device used to store parameters and perform - collective communications. 
- """ + """Check that the ZeRO join hook allows training with uneven inputs + when using the given device.""" NUM_INPUTS = 3 NUM_EPOCHS = 2 + LR = 0.01 torch.manual_seed(0) torch.cuda.manual_seed(0) @@ -712,8 +925,6 @@ def _test_zero_join(self, device): is_gpu = device.type == "cuda" backend = _get_backend_for_tests() if is_gpu else dist.Backend.GLOO self.dist_init(rank, world_size, backend) - if is_gpu: - torch.cuda.set_device(self.device) model = torch.nn.Sequential( torch.nn.Linear(2, 3), @@ -726,14 +937,18 @@ def _test_zero_join(self, device): # local optimizers on uneven inputs should be equivalent to ZeRO on # uneven inputs with gradients being manually set ddp_model = DDP(model, device_ids=[rank]) if is_gpu else DDP(model) - local_optim = torch.optim.Adam(ddp_model.parameters(), lr=0.01) + local_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR) zero_model = copy.deepcopy(model) zero_model.to(device) - zero_optim = ZeroRedundancyOptimizer(zero_model.parameters(), torch.optim.Adam, lr=0.01) + zero_optim = ZeroRedundancyOptimizer( + zero_model.parameters(), torch.optim.Adam, lr=LR, + ) loss_fn = torch.nn.MSELoss() # Use uneven inputs: rank i has i extra inputs - inputs = [torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank)] + inputs = [ + torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank) + ] labels = torch.randn(20, 3).to(device) # Save the gradients and parameters from DDP as the ground truth; do @@ -760,7 +975,10 @@ def _test_zero_join(self, device): # Broadcast the saved gradients and parameters to all of the other # ranks (which joined early) grads_and_params = [grads_at_each_iter, params_at_each_iter] - grads_and_params = _broadcast_object(grads_and_params, src_rank=world_size - 1, group=dist.group.WORLD, device=device) + grads_and_params = _broadcast_object( + grads_and_params, src_rank=world_size - 1, group=dist.group.WORLD, + device=device, + ) grads_at_each_iter = grads_and_params[0] params_at_each_iter = grads_and_params[1] # TODO: Replace this `_broadcast_object` with `broadcast_object_list` @@ -781,8 +999,9 @@ def __init__(self, zero_optim, grads): super().__init__() def main_hook(self): - grads = self.zero._join_grad_info.grads[self.zero._join_grad_info.index] - self.zero._join_grad_info.index += 1 + join_grad_info = self.zero._join_grad_info + grads = self.zero._join_grad_info.grads[join_grad_info.index] + join_grad_info.index += 1 for p, grad in zip(self.zero._all_params, grads): p.grad = grad.detach().clone().to(device) @@ -809,39 +1028,48 @@ def join_process_group(self): grads = grads_at_each_iter[-num_grads_after_joining:] gradient_setter = _GradientSetter() iter = 0 - with Join([gradient_setter, zero_optim], zero_optim=zero_optim, grads=grads): + with Join( + [gradient_setter, zero_optim], zero_optim=zero_optim, grads=grads, + ): for _ in range(NUM_EPOCHS): for input in inputs: # Notify join context that this process has not joined Join.notify_join_context(gradient_setter) - # Set gradients manually - for p, grad in zip(zero_model.parameters(), grads_at_each_iter[iter]): + for p, grad in zip( + zero_model.parameters(), grads_at_each_iter[iter], + ): p.grad = grad.detach().clone().to(device) - # Perform optimizer step and check parity zero_optim.step() - for p, ddp_p in zip(zero_model.parameters(), params_at_each_iter[iter]): - assert torch.allclose(p, ddp_p), \ - "Parameters differ between using ZeRO and local optimizer" + for p, ddp_p in zip( + zero_model.parameters(), params_at_each_iter[iter], + ): + torch.testing.assert_close( + p, 
ddp_p, + msg="Parameters differ between using ZeRO and " + "local optimizer", + ) iter += 1 @common_distributed.requires_nccl() - @common_distributed.skip_if_lt_x_gpu(2) + @common_distributed.skip_if_no_gpu def test_zero_join_gpu(self): - """Check that the ZeRO join hook allows training with uneven inputs on GPU.""" + """Check that the ZeRO join hook allows training with uneven inputs + on GPU.""" self._test_zero_join(self.device) @common_distributed.requires_gloo() def test_zero_join_cpu(self): - """Check that the ZeRO join hook allows training with uneven inputs on CPU.""" + """Check that the ZeRO join hook allows training with uneven inputs + on CPU.""" self._test_zero_join(torch.device("cpu")) def _test_zero_model_parallel(self, parameters_as_bucket_view: bool): # Use two processes each with two GPUs assert self.rank < 2 - NUM_EPOCHS = 3 - NUM_INPUTS = 5 + NUM_EPOCHS = 2 + NUM_INPUTS = 4 LR = 0.01 torch.manual_seed(0) torch.cuda.manual_seed(0) @@ -871,17 +1099,20 @@ def __init__(self): def forward(self, x): return self.net1(self.relu(self.net0(x))) - dev0 = 2 * self.rank - dev1 = 2 * self.rank + 1 + dev0 = torch.device(2 * self.rank) + dev1 = torch.device(2 * self.rank + 1) mp_model = ModelParallelModel(dev0, dev1) ddp_model = DDP(mp_model) - local_model = LocalModel() - cpu_device = torch.device("cpu") + local_model = LocalModel().to(dev0) + # Ensure the parameters are the same across the two models - local_model.net0.weight = torch.nn.Parameter(mp_model.net0.weight.detach().clone().to(cpu_device)) - local_model.net0.bias = torch.nn.Parameter(mp_model.net0.bias.detach().clone().to(cpu_device)) - local_model.net1.weight = torch.nn.Parameter(mp_model.net1.weight.detach().clone().to(cpu_device)) - local_model.net1.bias = torch.nn.Parameter(mp_model.net1.bias.detach().clone().to(cpu_device)) + def copy_param(p): + return torch.nn.Parameter(p.detach().clone().to(dev0)) + + local_model.net0.weight = copy_param(mp_model.net0.weight) + local_model.net0.bias = copy_param(mp_model.net0.bias) + local_model.net1.weight = copy_param(mp_model.net1.weight) + local_model.net1.bias = copy_param(mp_model.net1.bias) # Compare parity between DDP with model parallelism using ZeRO and # a local model using a local optimizer @@ -889,10 +1120,10 @@ def forward(self, x): ddp_model.parameters(), optimizer_class=torch.optim.Adam, parameters_as_bucket_view=parameters_as_bucket_view, - lr=LR + lr=LR, ) local_optim = torch.optim.Adam(local_model.parameters(), lr=LR) - inputs = [torch.randn(20, 10) for _ in range(NUM_INPUTS)] + inputs = [torch.randn(20, 10).to(dev0) for _ in range(NUM_INPUTS)] for _ in range(NUM_EPOCHS): for input in inputs: @@ -908,40 +1139,42 @@ def closure_ddp(): ddp_loss.backward() return ddp_loss - local_loss = cast(torch.Tensor, local_optim.step(closure=closure_local)) - ddp_loss = cast(torch.Tensor, zero_optim.step(closure=closure_ddp)).to(cpu_device) - - # Increased tolerances are needed to pass test when using TensorFloat32 - # see https://github.com/pytorch/pytorch/issues/67764 - assert torch.allclose( - local_loss, ddp_loss, rtol=1e-03 - ), "Losses differ between local optim and ZeroRedundancyOptimizer" + local_loss = cast( + torch.Tensor, local_optim.step(closure=closure_local) + ) + ddp_loss = cast( + torch.Tensor, zero_optim.step(closure=closure_ddp) + ) - for local_p, ddp_p in zip(local_model.parameters(), ddp_model.parameters()): - ddp_p = ddp_p.to(cpu_device) - assert torch.allclose(local_p, ddp_p, rtol=1e-03, atol=1e-04), "Models differ after a step" + # Increased tolerances are 
needed to pass when using TF32 + # See: https://github.com/pytorch/pytorch/issues/67764 + torch.testing.assert_close( + local_loss.cpu(), ddp_loss.cpu(), rtol=1e-03, atol=1e-08, + ), "Losses differ between local optimizer and ZeRO" - @common_distributed.skip_if_lt_x_gpu(4) - def test_zero_model_parallel_with_bucket_view(self): - """ - Check that ZeRO works with model parallelism where layers are sharded - across devices when ``parameters_as_bucket_view=True``. - """ - if self.rank >= 2: - return - self.dist_init(self.rank, world_size=2) - self._test_zero_model_parallel(parameters_as_bucket_view=True) + for local_p, ddp_p in zip( + local_model.parameters(), + ddp_model.parameters() + ): + torch.testing.assert_close( + local_p.cpu(), ddp_p.cpu(), rtol=1e-03, atol=1e-04, + ), "Models differ after a step" @common_distributed.skip_if_lt_x_gpu(4) - def test_zero_model_parallel_without_bucket_view(self): - """ - Check that ZeRO works with model parallelism where layers are sharded - across devices when ``parameters_as_bucket_view=False``. - """ + @parametrize( + "parameters_as_bucket_view", + [False, True], + ) + def test_zero_model_parallel( + self, + parameters_as_bucket_view: bool, + ): + """Check that ZeRO works with model parallelism where the model's + layers are assigned to different devices.""" if self.rank >= 2: return self.dist_init(self.rank, world_size=2) - self._test_zero_model_parallel(parameters_as_bucket_view=False) + self._test_zero_model_parallel(parameters_as_bucket_view) def _test_ddp_zero_overlap( self, @@ -962,22 +1195,21 @@ def _test_ddp_zero_overlap( is_gpu = device.type == "cuda" if is_gpu: torch.cuda.set_device(device) - models_to_test = [ - ( - torch.nn.Sequential( - torch.nn.Linear(1000, 2000), - torch.nn.Linear(2000, 500) - ), - [torch.randn(1, 1000).to(device) for _ in range(NUM_INPUTS)] + models_to_test = [( + torch.nn.Sequential( + torch.nn.Linear(1000, 2000), + torch.nn.Linear(2000, 500), ), - ] + [torch.randn(1, 1000).to(device) for _ in range(NUM_INPUTS)], + )] if HAS_TORCHVISION: - models_to_test.append( - ( - torchvision.models.resnet50(), - [torch.randn(1, 3, 3, 1000).to(device) for _ in range(NUM_INPUTS)] - ) - ) + models_to_test.append(( + torchvision.models.resnet50(), + [ + torch.randn(1, 3, 3, 1000).to(device) + for _ in range(NUM_INPUTS) + ] + )) for (model, inputs) in models_to_test: # Enable determinism in cudnn operators with torch.backends.cudnn.flags( @@ -1002,7 +1234,10 @@ def _test_ddp_zero_overlap( ) ddp_model_overlap.register_comm_hook( None, - hook_constructor(allreduce_hook, ddp_model_overlap, zero_optim, **kwargs) + hook_constructor( + allreduce_hook, ddp_model_overlap, zero_optim, + **kwargs, + ) ) # Set up the DDP model with local optimizer @@ -1067,120 +1302,73 @@ def _test_ddp_zero_overlap( self.assertEqual(p1, p2) # Check that the parameters were updated - self.assertNotEqual(init_params_overlap, list(ddp_model_overlap.parameters())) + self.assertNotEqual( + init_params_overlap, list(ddp_model_overlap.parameters()), + ) # Ensure that this test runs independently dist.barrier() + # NOTE: The test is skipped if using Windows since functional optimizers + # are not currently supported. @common_distributed.skip_if_win32() @common_distributed.requires_nccl() @common_distributed.skip_if_no_gpu @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using ``hook_with_zero_step()`` - achieves parity with DDP using a local optimizer when running on GPU. 
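To make the join-hook tests above (`_test_zero_join` and friends) concrete, here is a sketch of the user-facing pattern they validate: both DDP and ZeroRedundancyOptimizer are joinable, so wrapping the training loop in `Join` lets ranks with fewer inputs shadow the collectives of the ranks still working. It assumes an initialized default process group (e.g. under torchrun); the model, batch shape, and learning rate are placeholders.

import torch
import torch.distributed as dist
from torch.distributed.algorithms.join import Join
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

def train_with_uneven_inputs(device: torch.device = torch.device("cpu")) -> None:
    rank = dist.get_rank()
    model = torch.nn.Linear(2, 3).to(device)
    # Pass device_ids=[rank] instead when using one GPU per process.
    ddp_model = DDP(model)
    zero_optim = ZeroRedundancyOptimizer(
        ddp_model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01,
    )
    # Rank i deliberately gets i extra batches, as in `_test_zero_join`.
    inputs = [torch.randn(20, 2, device=device) for _ in range(3 + rank)]
    with Join([ddp_model, zero_optim]):
        for inp in inputs:
            zero_optim.zero_grad()
            ddp_model(inp).sum().backward()
            zero_optim.step()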
- - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. + @parametrize( + "use_gpu", + [True], + # Add `False` once the Gloo sync issue causing hangs is fixed + # See: https://github.com/pytorch/pytorch/issues/62300 + ) + @parametrize( + "use_interleaved_hook", + [False, True], + ) + @parametrize( + "gradient_as_bucket_view", + [False, True], + ) + @parametrize( + "static_graph", + [False, True], + ) + @parametrize( + "shard_buckets", + [False, True], + ) + def test_ddp_zero_overlap( + self, + use_gpu: bool, + use_interleaved_hook: bool, + gradient_as_bucket_view: bool, + static_graph: bool, + shard_buckets: bool, + ): """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): - self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step, - gradient_as_bucket_view, - static_graph - ) - # TODO: Add `test_ddp_with_zero_step_parity_cpu()` once the Gloo - # synchronization issue causing hangs is fixed. - - @common_distributed.skip_if_win32() - @common_distributed.requires_nccl() - @common_distributed.skip_if_no_gpu - @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_interleaved_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using - ``hook_with_zero_step_interleaved()`` achieves parity with DDP using a - local optimizer when running on GPU. - - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. + Check that overlapping DDP with ZeRO using the given method determined + by ``hook_constructor`` and ``shard_buckets`` and using the given ZeRO + and DDP arguments achieves parity with DDP using a local optimizer. """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): + device = torch.device(self.rank) if use_gpu else torch.device("cpu") + backend = _get_backend_for_tests() + self.dist_init(self.rank, self.world_size, backend) + hook_constructor = hook_with_zero_step if not use_interleaved_hook \ + else hook_with_zero_step_interleaved + + # Disable DDP + ReplicatedTensor since ZeroRedundancyOptimizer + # modifies the model parameters in place. + from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor + with _ddp_replicated_tensor(False): self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step_interleaved, - gradient_as_bucket_view, - static_graph + device, hook_constructor, gradient_as_bucket_view, static_graph, + shard_buckets=shard_buckets, ) - # TODO: Add `test_ddp_with_zero_step_interleaved_parity_cpu()` once the - # Gloo synchronization issue causing hangs is fixed. - @common_distributed.skip_if_win32() - @common_distributed.requires_nccl() - @common_distributed.skip_if_no_gpu - @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_uniform_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using - ``hook_with_zero_step()`` with ``shard_buckets=True`` - achieves parity with DDP using a local optimizer when running on GPU. - - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. 
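A sketch of the overlap setup that `test_ddp_zero_overlap` parametrizes, pieced together from the hook registration shown above. The `overlap_with_ddp=True` flag is my reading of what these hooks require and is not shown in this hunk, so treat it as an assumption; the layer sizes and learning rate are placeholders, and the full training-loop mechanics follow `_test_ddp_zero_overlap`, which is only partially visible here.

import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks.ddp_zero_hook import (
    hook_with_zero_step,
)
from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import (
    allreduce_hook,
)
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

def build_overlapped_ddp(rank: int):
    # Assumes an NCCL process group is initialized and this process owns
    # GPU `rank` (the tests above require NCCL and a GPU per rank).
    model = torch.nn.Sequential(
        torch.nn.Linear(1000, 2000), torch.nn.Linear(2000, 500),
    ).to(rank)
    ddp_model = DDP(model, device_ids=[rank])
    # Assumption: `overlap_with_ddp=True` defers building the local optimizer
    # so that its step can be fused into the communication hook.
    zero_optim = ZeroRedundancyOptimizer(
        ddp_model.parameters(),
        optimizer_class=torch.optim.SGD,
        overlap_with_ddp=True,
        lr=0.01,
    )
    # Same registration pattern as `_test_ddp_zero_overlap`: the hook runs the
    # usual allreduce and then this rank's shard of the optimizer step as
    # gradient buckets become ready during backward.
    ddp_model.register_comm_hook(
        None, hook_with_zero_step(allreduce_hook, ddp_model, zero_optim),
    )
    return ddp_model, zero_optim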
- """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): - self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step, - gradient_as_bucket_view, - static_graph, - shard_buckets=True, - ) - # TODO: Add `test_ddp_with_zero_step_uniform_parity_cpu()` once the Gloo - # synchronization issue causing hangs is fixed. - @common_distributed.skip_if_win32() - @common_distributed.requires_nccl() - @common_distributed.skip_if_no_gpu - @common_distributed.skip_if_rocm - def test_ddp_with_zero_step_interleaved_uniform_parity_gpu(self): - r""" - Check that overlapping DDP with ZeRO using - ``hook_with_zero_step()`` with ``shard_buckets=True`` - achieves parity with DDP using a local optimizer when running on GPU. - - NOTE: The test is skipped if using Windows since functional optimizers - are not currently supported. - """ - self.dist_init(self.rank, self.world_size, dist.Backend.NCCL) - for gradient_as_bucket_view, static_graph in itertools.product( - [True, False], - [True, False] - ): - self._test_ddp_zero_overlap( - torch.device(self.rank), - hook_with_zero_step_interleaved, - gradient_as_bucket_view, - static_graph, - shard_buckets=True, - ) - # TODO: Add `test_ddp_with_zero_step_interleaved_uniform_parity_cpu()` once - # the Gloo synchronization issue causing hangs is fixed. +instantiate_parametrized_tests(TestZeroRedundancyOptimizerSingleRank) +instantiate_parametrized_tests(TestZeroRedundancyOptimizerDistributed) if __name__ == "__main__": # ! unittest should not be used here, else the tests are not properly registered - common_utils.run_tests() + run_tests() diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py index 822cd3b09d3a..5c29f1fd448d 100644 --- a/test/distributed/test_c10d_common.py +++ b/test/distributed/test_c10d_common.py @@ -9,6 +9,7 @@ from datetime import timedelta from itertools import product from sys import platform +from contextlib import suppress import torch import torch.distributed as dist @@ -18,6 +19,7 @@ sys.exit(0) import torch.distributed.distributed_c10d as c10d +from torch.utils.checkpoint import checkpoint import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD import torch.nn.functional as F import torch.testing._internal.common_utils as common @@ -25,12 +27,16 @@ from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_distributed import ( MultiProcessTestCase, + skip_if_lt_x_gpu, ) + from torch.testing._internal.common_utils import ( TestCase, load_tests, run_tests, TEST_WITH_DEV_DBG_ASAN, + instantiate_parametrized_tests, + parametrize ) if TEST_WITH_DEV_DBG_ASAN: @@ -238,7 +244,7 @@ def forward(self, x): return F.softmax(self.embedding(x), dim=1) -class AbstractDistributedDataParallelTest(object): +class CommonDistributedDataParallelTest(object): def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor # TODO: investigate this test and the test is known to have issues @@ -307,6 +313,363 @@ def _prepare_multi_device_module( return model, ddp_model, input, target + def _get_store(self): + return dist.FileStore(self.file_name, self.world_size) + + def _get_process_group(self): + raise NotImplementedError("To be implemented by child class") + + def _train_model(self, model, input_var, target, loss, run_checkpoint=False, use_reentrant=True): + model.train() + if run_checkpoint: + output = checkpoint(model, 
input_var, use_reentrant=use_reentrant) + else: + output = model(input_var) + l = loss(output, target) + l.backward() + + def _test_ddp_checkpointing( + self, + input_model, + process_group, + use_bucket_view, + find_unused_parameters=False, + static_graph=False, + run_checkpoint=False, + use_reentrant=True, + allow_none_grads=False, + ): + # to reproduce the same training results + torch.cuda.set_device(self.rank) + torch.manual_seed(31415) + model = copy.deepcopy(input_model).cuda() + ddp_model = copy.deepcopy(input_model).cuda() + ddp_model = nn.parallel.DistributedDataParallel( + ddp_model, + bucket_cap_mb=1, + gradient_as_bucket_view=use_bucket_view, + device_ids=[self.rank], + process_group=process_group, + find_unused_parameters=find_unused_parameters, + static_graph=static_graph, + ) + self.assertEqual( + ddp_model._get_ddp_logging_data().get("static_graph", 0), static_graph + ) + input, ddp_input, target, ddp_target = self._prepare_dummy_data() + loss = nn.MSELoss() + n_iters = 5 + for i in range(n_iters): + model.zero_grad(set_to_none=False) + ddp_model.zero_grad(set_to_none=False) + self._train_model(model, input, target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant) + self._train_model( + ddp_model, ddp_input, ddp_target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant + ) + for i, j in zip(model.parameters(), ddp_model.parameters()): + if not allow_none_grads: + self.assertTrue(i.grad is not None) + self.assertTrue(j.grad is not None) + self.assertEqual(i.grad, j.grad, rtol=1.3e-06, atol=5e-5) + + # A list of tests for ddp with activation checkpointing + # when gradient_as_bucket_view=True, False. + # Most of the tests are referred to + # https://github.com/facebookresearch/fairscale/blob/main/tests/nn/pipe/test_checkpoint_ddp.py + class CheckpointOnceModule(nn.Module): + """ + Runs checkpoint for a single layer in the model. + """ + def __init__(self, use_reentrant=True): + super().__init__() + self.l1 = nn.Linear(20, 20) + self.l2 = nn.Linear(20, 20) + self.use_reentrant = use_reentrant + + def forward(self, inp): + x = self.l1(inp) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + return x + + class CheckpointTwiceModule(CheckpointOnceModule): + """ + Runs checkpoint for the same layer twice in a model. This simulates use + cases such as pipeline parallel where the same layer can be checkpointed + more than one time. + """ + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + + def forward(self, inp): + x = self.l1(inp) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + return x + + class CheckpointTwiceModuleWeightSharing(CheckpointTwiceModule): + """ + Similar to CheckpointTwiceModule but the weights are shared. 
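The checkpointing modules above reduce to a small user-facing pattern: call `torch.utils.checkpoint.checkpoint` on a layer inside `forward` and wrap the module in DDP. A minimal CPU sketch using a single-rank gloo group (address/port and layer sizes are placeholders); the non-reentrant implementation is the one the later tests rely on for unused parameters and repeated checkpointing.

import os

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.checkpoint import checkpoint

class CheckpointOnce(nn.Module):
    # Same shape as `CheckpointOnceModule` above: only the second layer is
    # recomputed in backward instead of storing its activations.
    def __init__(self, use_reentrant: bool = False):
        super().__init__()
        self.l1 = nn.Linear(20, 20)
        self.l2 = nn.Linear(20, 20)
        self.use_reentrant = use_reentrant

    def forward(self, x):
        return checkpoint(self.l2, self.l1(x), use_reentrant=self.use_reentrant)

if __name__ == "__main__":
    # Single-rank gloo group so the sketch runs on CPU without torchrun.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29502")
    dist.init_process_group("gloo", rank=0, world_size=1)
    model = DDP(CheckpointOnce(use_reentrant=False))
    loss = model(torch.randn(8, 20)).sum()
    loss.backward()
    dist.destroy_process_group()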
+ """ + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + # Share weights + self.l1.weight = self.l2.weight + + def forward(self, inp): + x = self.l1(inp) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) + return x + + + class DynamicCheckpointTwiceModule(CheckpointTwiceModule): + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + self.count = 0 + + def forward(self, inp): + if self.count % 2: + x = checkpoint(self.l1, inp, use_reentrant=self.use_reentrant) + else: + x = checkpoint(self.l2, inp, use_reentrant=self.use_reentrant) + + self.count += 1 + return x + + class DynamicCheckpointTwiceModuleWeightSharing(DynamicCheckpointTwiceModule): + def __init__(self, use_reentrant=True): + super().__init__(use_reentrant=use_reentrant) + # Share weights + self.l1.weight = self.l2.weight + + + def _prepare_dummy_data(self): + ddp_bs = 16 + bs = ddp_bs * self.world_size + input = torch.rand((bs, 20), device="cuda", requires_grad=True) + target = torch.randn((bs, 20), device="cuda") + offset = self.rank * ddp_bs + ddp_input = input[offset : offset + ddp_bs] + ddp_target = target[offset : offset + ddp_bs] + return input, ddp_input, target, ddp_target + + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_once(self, use_reentrant): + """ + DDP works as expected when layer is checkpointed only once. + """ + process_group = self._get_process_group() + for use_bucket_view, static_graph in product((False, True), (False, True)): + self._test_ddp_checkpointing( + self.CheckpointOnceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=static_graph, + ) + if static_graph: + # find_unused_parameters does not make a difference, since it is + # ignored for static graph. + self._test_ddp_checkpointing( + self.CheckpointOnceModule(), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=static_graph, + find_unused_parameters=True, + ) + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_unused_params(self, use_reentrant): + """ + With reentrant autograd checkpointing impl, DDP will fail when there are + unused params in the model and no static graph training. With + non-reentrant checkpointing implementation, this works as expected. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + err_ctx = ( + suppress() if not use_reentrant else + self.assertRaisesRegex( + RuntimeError, + "Expected to mark a variable ready only once." + ) + ) + with err_ctx: + model = self._test_ddp_checkpointing( + self.CheckpointOnceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + find_unused_parameters=True, + ) + # test passes when static_graph is true + model = self._test_ddp_checkpointing( + self.CheckpointOnceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + find_unused_parameters=True, + static_graph=True, + ) + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_twice(self, use_reentrant): + """ + Checkpoitning twice fails for non-static graph with reentrant checkpoint + implementation, succeeds with non-reentrant checkpoint implementation. 
+ """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + err_ctx = ( + suppress() if not use_reentrant else + self.assertRaisesRegex( + RuntimeError, + "Expected to mark a variable ready only once." + ) + ) + with err_ctx: + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + ) + + with err_ctx: + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + find_unused_parameters=True, + ) + + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_twice_static_graph(self, use_reentrant): + """ + Regardless of reentrant or non-reentrant checkpointing impl, + checkpointing twice works with static graph enabled. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + # Test passes when static_graph=True. + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModule(use_reentrant=use_reentrant), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=True, + ) + + @skip_if_lt_x_gpu(2) + def test_ddp_checkpointing_dynamic_module(self): + """ + Dynamic module can be checkpointed, multiple times, with non-reentrant + checkpointing implementation. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + model = self._test_ddp_checkpointing( + self.DynamicCheckpointTwiceModule(use_reentrant=False), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + find_unused_parameters=True, + # Grads can be none sometimes due to dynamic module not using + # all params. + allow_none_grads=True + ) + + @skip_if_lt_x_gpu(2) + def test_ddp_checkpointing_dynamic_weight_sharing(self): + """ + Dynamic module can be checkpointed multiple times with weight sharing + using non-reentrant checkpointing implementation. + """ + process_group = self._get_process_group() + for use_bucket_view in (True, False): + model = self._test_ddp_checkpointing( + self.DynamicCheckpointTwiceModuleWeightSharing(use_reentrant=False), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=False, + find_unused_parameters=True, + # Grads can be none sometimes due to dynamic module not using + # all params. + allow_none_grads=True + ) + + # DDP works as expected if there is weight sharing among layers + @skip_if_lt_x_gpu(2) + @parametrize("use_reentrant", [True, False]) + def test_ddp_checkpointing_weight_sharing(self, use_reentrant): + """ + Test that checkpointing with weight sharing works. + """ + process_group = self._get_process_group() + torch.cuda.set_device(self.rank) + for use_bucket_view, static_graph in product((False, True), (False, True)): + torch.manual_seed(31415) + l1 = nn.Linear(20, 20) + l2 = nn.Linear(20, 20) + l1.weight = l2.weight + model = nn.Sequential(l1, l2) + # TODO: non-reentrant based checkpointing of DDP module with + # static_graph runs into the below issue, see + # https://github.com/pytorch/pytorch/issues/70865 and + # https://github.com/pytorch/pytorch/issues/58111 for details. 
+ err_ctx = ( + self.assertRaisesRegex( + RuntimeError, + "Your training graph has changed in this iteration" + ) if static_graph and not use_reentrant else suppress() + ) + with err_ctx: + self._test_ddp_checkpointing( + model, + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=static_graph, + run_checkpoint=True, + use_reentrant=use_reentrant, + ) + + @skip_if_lt_x_gpu(2) + def test_ddp_checkpointing_twice_weight_sharing(self): + """ + Checkpointing should work with static graph in the case of checkpointing + same layer twice and having weights shared acrosss layers. + """ + process_group = self._get_process_group() + torch.cuda.set_device(self.rank) + for use_bucket_view in (True, False): + model = self._test_ddp_checkpointing( + self.CheckpointTwiceModuleWeightSharing(), + process_group=process_group, + use_bucket_view=use_bucket_view, + static_graph=True, + ) + + def test_invalid_powerSGD_state(self): + for start_powerSGD_iter, use_error_feedback, warm_start in product( + [0, 1], [True, False], [True, False] + ): + if not use_error_feedback and not warm_start: + continue + with self.assertRaisesRegex( + ValueError, + "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " + "because PowerSGD can only be applied after the first two iterations in DDP.", + ): + state = powerSGD.PowerSGDState( + process_group=None, + matrix_approximation_rank=1, + start_powerSGD_iter=start_powerSGD_iter, + use_error_feedback=use_error_feedback, + warm_start=warm_start, + ) + def _test_ddp_with_process_group( self, process_group, @@ -443,33 +806,101 @@ def fut_then(fut): return fut.then(fut_then) + def _test_not_nan(self, model, x): + y = model(x) + self.assertFalse(y.isnan().any().item()) + y.sum().backward() + for p in model.parameters(): + self.assertFalse(p.grad.isnan().any().item()) + + @skip_if_lt_x_gpu(2) + def test_sync_batch_norm_only_empty_input(self): + pg = self._get_process_group() + + model = torch.nn.Sequential( + nn.BatchNorm2d(2), + ).to(device=self.rank) + model = DistributedDataParallel( + model, + device_ids=[self.rank], + process_group=pg, + ) + model = nn.SyncBatchNorm.convert_sync_batchnorm( + model, + process_group=pg, + ) -class DistributedDataParallelTest( - AbstractDistributedDataParallelTest, MultiProcessTestCase -): - def setUp(self): - super(DistributedDataParallelTest, self).setUp() - self._spawn_processes() + model.train() - def test_invalid_powerSGD_state(self): - for start_powerSGD_iter, use_error_feedback, warm_start in product( - [0, 1], [True, False], [True, False] - ): - if not use_error_feedback and not warm_start: - continue - with self.assertRaisesRegex( - ValueError, - "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, " - "because PowerSGD can only be applied after the first two iterations in DDP.", - ): - state = powerSGD.PowerSGDState( - process_group=None, - matrix_approximation_rank=1, - start_powerSGD_iter=start_powerSGD_iter, - use_error_feedback=use_error_feedback, - warm_start=warm_start, - ) + # only rank 0 receives empty inputs + x = torch.zeros( + (1 if self.rank != 0 else 0, 2, 11, 13), + dtype=torch.float32, + device=self.rank + ) + # input requires grad, this will trigger the collective communication + # in the backward pass + x.requires_grad = True + self._test_not_nan(model, x) + + # input does not requires grad + x.requires_grad = False + self._test_not_nan(model, x) + + # all ranks receive empty inputs + x = torch.zeros( + (0, 2, 11, 13), + 
dtype=torch.float32, + device=self.rank + ) + + # input requires grad, this will trigger the collective communication + # in the backward pass + x.requires_grad = True + self._test_not_nan(model, x) + + # input does not requires grad + x.requires_grad = False + self._test_not_nan(model, x) + + @skip_if_lt_x_gpu(2) + def test_sync_batch_norm_empty_input(self): + pg = self._get_process_group() + + model = torch.nn.Sequential( + nn.Conv2d(2, 2, 3), + nn.BatchNorm2d(2), + nn.Linear(28, 2), + ).to(device=self.rank) + model = DistributedDataParallel( + model, + device_ids=[self.rank], + process_group=pg, + ) + model = nn.SyncBatchNorm.convert_sync_batchnorm( + model, + process_group=pg, + ) + + model.train() + # only rank 0 receives empty inputs + x = torch.zeros( + (3 if self.rank != 0 else 0, 2, 30, 30), + dtype=torch.float32, + device=self.rank + ) + + self._test_not_nan(model, x) + + # all ranks receive empty inputs + x = torch.zeros( + (0, 2, 30, 30), + dtype=torch.float32, + device=self.rank + ) + + self._test_not_nan(model, x) class ComputeBucketAssignmentTest(TestCase): def test_single_limit_single_dtype(self): @@ -698,20 +1129,33 @@ def tearDown(self): except OSError: pass - def test_distributed_debug_mode(self): + def test_debug_level(self): + try: + del os.environ["TORCH_DISTRIBUTED_DEBUG"] + except KeyError: + pass + + dist.set_debug_level_from_env() # Default should be off - default_debug_mode = dist._get_debug_mode() - self.assertEqual(default_debug_mode, dist._DistributedDebugLevel.OFF) + default_debug_mode = dist.get_debug_level() + self.assertEqual(default_debug_mode, dist.DebugLevel.OFF) mapping = { - "OFF": dist._DistributedDebugLevel.OFF, - "INFO": dist._DistributedDebugLevel.INFO, - "DETAIL": dist._DistributedDebugLevel.DETAIL, + "OFF": dist.DebugLevel.OFF, + "off": dist.DebugLevel.OFF, + "oFf": dist.DebugLevel.OFF, + "INFO": dist.DebugLevel.INFO, + "info": dist.DebugLevel.INFO, + "INfO": dist.DebugLevel.INFO, + "DETAIL": dist.DebugLevel.DETAIL, + "detail": dist.DebugLevel.DETAIL, + "DeTaIl": dist.DebugLevel.DETAIL, } invalid_debug_modes = ["foo", 0, 1, -1] for mode in mapping.keys(): os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - set_debug_mode = dist._get_debug_mode() + dist.set_debug_level_from_env() + set_debug_mode = dist.get_debug_level() self.assertEqual( set_debug_mode, mapping[mode], @@ -720,8 +1164,8 @@ def test_distributed_debug_mode(self): for mode in invalid_debug_modes: os.environ["TORCH_DISTRIBUTED_DEBUG"] = str(mode) - with self.assertRaisesRegex(RuntimeError, "to be one of"): - dist._get_debug_mode() + with self.assertRaisesRegex(RuntimeError, "The value of TORCH_DISTRIBUTED_DEBUG must"): + dist.set_debug_level_from_env() class DummyWork(dist._Work): @@ -879,6 +1323,8 @@ def test_send_recv(self): # user applications would explicitly that. 
+instantiate_parametrized_tests(CommonDistributedDataParallelTest) + if __name__ == "__main__": assert ( diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 0594aae287fc..e49d65ea33d2 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -39,7 +39,6 @@ skip_if_win32, create_device, verify_ddp_error_logged, - skip_if_rocm, ) from torch.testing._internal.common_utils import ( TestCase, @@ -538,7 +537,6 @@ def test_allreduce_stress(self): self._test_allreduce_stress(inputs) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_allreduce_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -979,7 +977,7 @@ def _test_gather_basics(self, fn): futures.append(pg.gather([], input, opts).get_future()) # Wait for work to complete - expected = [torch.tensor([rank]) for rank in range(self.world_size)] + expected = [fn(torch.tensor([rank])) for rank in range(self.world_size)] for i in range(self.world_size): futures[i].wait() result = futures[i].value() @@ -995,6 +993,11 @@ def test_gather_basics(self): def test_gather_basics_cuda(self): self._test_gather_basics(lambda t: t.clone().cuda()) + @requires_gloo() + def test_gather_noncontiguous_input(self): + # Take a column of 2D tensor, such that memory is not dense + self._test_gather_basics(lambda t: t.expand(2, 2).contiguous()[:, 0]) + def _test_gather_stress(self, inputs, fn): store = c10d.FileStore(self.file_name, self.world_size) pg = self._create_process_group_gloo( @@ -1037,7 +1040,6 @@ def test_gather_stress(self): self._test_gather_stress(inputs, lambda t: t.clone()) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_gather_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -1103,7 +1105,7 @@ def _test_allgather_basics(self, fn): for _ in range(n) ] expected_output = [ - [torch.tensor([i]) for i in range(n * self.world_size)] + [fn(torch.tensor([i])) for i in range(n * self.world_size)] for _ in range(n) ] fut = pg.allgather(output, input).get_future() @@ -1122,6 +1124,11 @@ def test_allgather_basics(self): def test_allgather_basics_cuda(self): self._test_allgather_basics(lambda t: t.clone().cuda()) + @requires_gloo() + def test_allgather_noncontiguous_input(self): + # Take a column of 2D tensor, such that memory is not dense + self._test_allgather_basics(lambda t: t.expand(2, 2).contiguous()[:, 0]) + def _test_allgather_stress(self, inputs, fn): store = c10d.FileStore(self.file_name, self.world_size) pg = self._create_process_group_gloo( @@ -1136,8 +1143,14 @@ def _test_allgather_stress(self, inputs, fn): [[torch.tensor([i + j]) for j in range(self.world_size)]] for i in range(len(inputs)) ] + input_holder = {} for i in range(len(inputs)): - fut = pg.allgather(outputs[i], [fn(inputs[i])]).get_future() + # Note that this works around the data race discussed in + # https://github.com/pytorch/pytorch/issues/75529, but we should + # actually be able to pass the list directly into allgather when + # that race is fixed. 
+ input_holder[i] = [fn(inputs[i])] + fut = pg.allgather(outputs[i], input_holder[i]).get_future() future_handles.append(fut) for i, future_handle in enumerate(future_handles): @@ -1155,7 +1168,6 @@ def test_allgather_stress(self): self._test_allgather_stress(inputs, lambda t: t.clone()) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_allgather_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -1336,7 +1348,6 @@ def test_reduce_stress(self): self._test_reduce_stress(inputs) @skip_if_lt_x_gpu(2) - @skip_if_rocm @requires_gloo() def test_reduce_stress_cuda(self): inputs = [torch.tensor([i + self.rank]).cuda() for i in range(1000)] @@ -1457,12 +1468,16 @@ def create(num, prefix): class DistributedDataParallelTest( - test_c10d_common.AbstractDistributedDataParallelTest, MultiProcessTestCase + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase ): def setUp(self): super(DistributedDataParallelTest, self).setUp() self._spawn_processes() + def _get_process_group(self): + store = self._get_store() + return c10d.ProcessGroupGloo(store, self.rank, self.world_size) + def _test_gloo_backend( self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False ): @@ -1757,7 +1772,7 @@ def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): # Check that the gradients are sparse and identical vanilla_parameter = next(vanilla_model.parameters()) ddp_parameter = next(ddp_model.parameters()) - self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) + self.assertEqual(vanilla_parameter.grad.coalesce(), ddp_parameter.grad.coalesce()) @requires_gloo() @skip_if_lt_x_gpu(2) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 1c8b224776a0..5ceadc43b265 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -9,7 +9,7 @@ import tempfile import threading import time -from contextlib import contextmanager, suppress +from contextlib import contextmanager from datetime import timedelta from itertools import product from unittest import mock @@ -49,11 +49,8 @@ TEST_WITH_DEV_DBG_ASAN, TEST_WITH_ROCM, sandcastle_skip, - instantiate_parametrized_tests, - parametrize, sandcastle_skip_if, ) -from torch.utils.checkpoint import checkpoint if TEST_WITH_DEV_DBG_ASAN: print( @@ -949,7 +946,7 @@ def allreduce(tensors): class DistributedDataParallelTest( - test_c10d_common.AbstractDistributedDataParallelTest, MultiProcessTestCase + test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase ): def setUp(self): super(DistributedDataParallelTest, self).setUp() @@ -958,6 +955,10 @@ def setUp(self): os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1" self._spawn_processes() + def _get_process_group(self): + store = self._get_store() + return c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + def _test_nccl_backend( self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False ): @@ -1350,7 +1351,7 @@ def test_find_unused_parameters( # Only one such parameter in model.fc3, since bias=False break - if dist._get_debug_mode() != dist._DistributedDebugLevel.OFF: + if dist.get_debug_level() != dist.DebugLevel.OFF: unused_index_str += f" with name {unused_fqn}" self.assertTrue(unused_index_str in str(ex)) @@ -2013,12 +2014,15 @@ def _test_powerSGD_ddp_comm_hook_nccl(self, gradient_as_bucket_view=False): # Get GPU model with the hook registered. # Test the hook with different algorithmic configs. 
- for use_error_feedback, warm_start in product([True, False], [True, False]): + for use_error_feedback, warm_start, batch_tensors_with_same_shape in product( + [True, False], [True, False], [True, False], + ): state = powerSGD.PowerSGDState( process_group=process_group, matrix_approximation_rank=1, use_error_feedback=use_error_feedback, warm_start=warm_start, + batch_tensors_with_same_shape=batch_tensors_with_same_shape, ) for hook in [powerSGD.powerSGD_hook, powerSGD.batched_powerSGD_hook]: gpu_model = self._gpu_model_with_ddp_comm_hook( @@ -2216,349 +2220,6 @@ def test_ddp_weight_sharing(self): ), ) - # A list of tests for ddp with activation checkpointing - # when gradient_as_bucket_view=True, False. - # Most of the tests are referred to - # https://github.com/facebookresearch/fairscale/blob/main/tests/nn/pipe/test_checkpoint_ddp.py - class CheckpointOnceModule(nn.Module): - """ - Runs checkpoint for a single layer in the model. - """ - def __init__(self, use_reentrant=True): - super().__init__() - self.l1 = nn.Linear(20, 20) - self.l2 = nn.Linear(20, 20) - self.use_reentrant = use_reentrant - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - return x - - class CheckpointTwiceModule(CheckpointOnceModule): - """ - Runs checkpoint for the same layer twice in a model. This simulates use - cases such as pipeline parallel where the same layer can be checkpointed - more than one time. - """ - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - return x - - class CheckpointTwiceModuleWeightSharing(CheckpointTwiceModule): - """ - Similar to CheckpointTwiceModule but the weights are shared. 
- """ - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - - def forward(self, inp): - x = self.l1(inp) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - x = checkpoint(self.l2, x, use_reentrant=self.use_reentrant) - return x - - - class DynamicCheckpointTwiceModule(CheckpointTwiceModule): - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - self.count = 0 - - def forward(self, inp): - if self.count % 2: - x = checkpoint(self.l1, inp, use_reentrant=self.use_reentrant) - else: - x = checkpoint(self.l2, inp, use_reentrant=self.use_reentrant) - - self.count += 1 - return x - - class DynamicCheckpointTwiceModuleWeightSharing(DynamicCheckpointTwiceModule): - def __init__(self, use_reentrant=True): - super().__init__(use_reentrant=use_reentrant) - self.l1.weight = self.l2.weight - - - def _prepare_dummy_data(self): - ddp_bs = 16 - bs = ddp_bs * self.world_size - input = torch.rand((bs, 20), device="cuda", requires_grad=True) - target = torch.randn((bs, 20), device="cuda") - offset = self.rank * ddp_bs - ddp_input = input[offset : offset + ddp_bs] - ddp_target = target[offset : offset + ddp_bs] - return input, ddp_input, target, ddp_target - - def _train_model(self, model, input_var, target, loss, run_checkpoint=False, use_reentrant=True): - model.train() - if run_checkpoint: - output = checkpoint(model, input_var, use_reentrant=use_reentrant) - else: - output = model(input_var) - l = loss(output, target) - l.backward() - - def _test_ddp_checkpointing( - self, - input_model, - process_group, - use_bucket_view, - find_unused_parameters=False, - static_graph=False, - run_checkpoint=False, - use_reentrant=True, - allow_none_grads=False, - ): - # to reproduce the same training results - torch.cuda.set_device(self.rank) - torch.manual_seed(31415) - model = copy.deepcopy(input_model).cuda() - ddp_model = copy.deepcopy(input_model).cuda() - ddp_model = nn.parallel.DistributedDataParallel( - ddp_model, - bucket_cap_mb=1, - gradient_as_bucket_view=use_bucket_view, - device_ids=[self.rank], - process_group=process_group, - find_unused_parameters=find_unused_parameters, - static_graph=static_graph, - ) - self.assertEqual( - ddp_model._get_ddp_logging_data().get("static_graph", 0), static_graph - ) - input, ddp_input, target, ddp_target = self._prepare_dummy_data() - loss = nn.MSELoss() - n_iters = 5 - for i in range(n_iters): - model.zero_grad(set_to_none=False) - ddp_model.zero_grad(set_to_none=False) - self._train_model(model, input, target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant) - self._train_model( - ddp_model, ddp_input, ddp_target, loss, run_checkpoint=run_checkpoint, use_reentrant=use_reentrant - ) - for i, j in zip(model.parameters(), ddp_model.parameters()): - if not allow_none_grads: - self.assertTrue(i.grad is not None) - self.assertTrue(j.grad is not None) - self.assertEqual(i.grad, j.grad, rtol=1.3e-06, atol=5e-5) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_once(self, use_reentrant): - """ - DDP works as expected when layer is checkpointed only once. 
- """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view, static_graph in product((False, True), (False, True)): - self._test_ddp_checkpointing( - self.CheckpointOnceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=static_graph, - ) - if static_graph: - # find_unused_parameters does not make a difference, since it is - # ignored for static graph. - self._test_ddp_checkpointing( - self.CheckpointOnceModule(), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=static_graph, - find_unused_parameters=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_unused_params(self, use_reentrant): - """ - With reentrant autograd checkpointing impl, DDP will fail when there are - unused params in the model and no static graph training. With - non-reentrant checkpointing implementation, this works as expected. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - err_ctx = ( - suppress() if not use_reentrant else - self.assertRaisesRegex( - RuntimeError, - "Expected to mark a variable ready only once." - ) - ) - with err_ctx: - model = self._test_ddp_checkpointing( - self.CheckpointOnceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - find_unused_parameters=True, - ) - # test passes when static_graph is true - model = self._test_ddp_checkpointing( - self.CheckpointOnceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - find_unused_parameters=True, - static_graph=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_twice(self, use_reentrant): - """ - Checkpoitning twice fails for non-static graph with reentrant checkpoint - implementation, succeeds with non-reentrant checkpoint implementation. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - err_ctx = ( - suppress() if not use_reentrant else - self.assertRaisesRegex( - RuntimeError, - "Expected to mark a variable ready only once." - ) - ) - with err_ctx: - model = self._test_ddp_checkpointing( - self.CheckpointTwiceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - ) - - with err_ctx: - model = self._test_ddp_checkpointing( - self.CheckpointTwiceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - find_unused_parameters=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_twice_static_graph(self, use_reentrant): - """ - Regardless of reentrant or non-reentrant checkpointing impl, - checkpointing twice works with static graph enabled. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - # Test passes when static_graph=True. 
- model = self._test_ddp_checkpointing( - self.CheckpointTwiceModule(use_reentrant=use_reentrant), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=True, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_dynamic_module(self): - """ - Dynamic module can be checkpointed, multiple times, with non-reentrant - checkpointing implementation. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - model = self._test_ddp_checkpointing( - self.DynamicCheckpointTwiceModule(use_reentrant=False), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - find_unused_parameters=True, - # Grads can be none sometimes due to dynamic module not using - # all params. - allow_none_grads=True - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_dynamic_weight_sharing(self): - """ - Dynamic module can be checkpointed multiple times with weight sharing - using non-reentrant checkpointing implementation. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - for use_bucket_view in (True, False): - model = self._test_ddp_checkpointing( - self.DynamicCheckpointTwiceModuleWeightSharing(use_reentrant=False), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=False, - find_unused_parameters=True, - # Grads can be none sometimes due to dynamic module not using - # all params. - allow_none_grads=True - ) - - # DDP works as expected if there is weight sharing among layers - @requires_nccl() - @skip_if_lt_x_gpu(2) - @parametrize("use_reentrant", [True, False]) - def test_ddp_checkpointing_weight_sharing(self, use_reentrant): - """ - Test that checkpointing with weight sharing works. - """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - torch.cuda.set_device(self.rank) - for use_bucket_view, static_graph in product((False, True), (False, True)): - torch.manual_seed(31415) - l1 = nn.Linear(20, 20) - l2 = nn.Linear(20, 20) - l1.weight = l2.weight - model = nn.Sequential(l1, l2) - # TODO: non-reentrant based checkpointing of DDP module with - # static_graph runs into the below issue, see - # https://github.com/pytorch/pytorch/issues/70865 and - # https://github.com/pytorch/pytorch/issues/58111 for details. - err_ctx = ( - self.assertRaisesRegex( - RuntimeError, - "Your training graph has changed in this iteration" - ) if static_graph and not use_reentrant else suppress() - ) - with err_ctx: - self._test_ddp_checkpointing( - model, - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=static_graph, - run_checkpoint=True, - use_reentrant=use_reentrant, - ) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_checkpointing_twice_weight_sharing(self): - """ - Checkpointing should work with static graph in the case of checkpointing - same layer twice and having weights shared acrosss layers. 
- """ - store = c10d.FileStore(self.file_name, self.world_size) - process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - torch.cuda.set_device(self.rank) - for use_bucket_view in (True, False): - model = self._test_ddp_checkpointing( - self.CheckpointTwiceModuleWeightSharing(), - process_group=process_group, - use_bucket_view=use_bucket_view, - static_graph=True, - ) class NcclErrorHandlingTest(MultiProcessTestCase): @@ -3053,8 +2714,6 @@ def test_nccl_warn_not_in_group_debug_info(self): def test_nccl_warn_not_in_group_debug_off(self): self._test_warn_not_in_group(backend="nccl") -instantiate_parametrized_tests(DistributedDataParallelTest) - if __name__ == "__main__": assert ( not torch.cuda._initialized diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index 92ce8ccc56e5..c1720344e49d 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -383,7 +383,7 @@ def test_data_parallel_sparse(self): self.assertEqual(out.get_device(), dev_id[0]) self.assertEqual(out, expected_out) for expected, param in zip(expected_grads, l.parameters()): - self.assertEqual(param.grad, expected) + self.assertEqual(param.grad.coalesce(), expected.coalesce()) # Check for None device_ids l = l.cuda() diff --git a/test/distributed/test_store.py b/test/distributed/test_store.py index 02484585c68e..bcff510bfe0c 100644 --- a/test/distributed/test_store.py +++ b/test/distributed/test_store.py @@ -1,7 +1,6 @@ # Owner(s): ["oncall: distributed"] import os -import random import sys import tempfile import time @@ -248,7 +247,7 @@ def test_numkeys_delkeys(self): self._test_numkeys_delkeys(self._create_store()) def _create_client(self, index, addr, port, world_size): - client_store = dist.TCPStore(addr, port, world_size, timeout=timedelta(seconds=10)) + client_store = dist.TCPStore(addr, port, world_size=world_size, timeout=timedelta(seconds=10)) self.assertEqual("value".encode(), client_store.get("key")) client_store.set(f"new_key{index}", f"new_value{index}") self.assertEqual(f"next_value{index}".encode(), @@ -259,15 +258,16 @@ def _multi_worker_helper(self, world_size): server_store = create_tcp_store(addr, world_size, wait_for_workers=False) server_store.set("key", "value") port = server_store.port - world_size = random.randint(5, 10) if world_size == -1 else world_size - for i in range(world_size): + + num_indices = world_size if world_size else 1 + for i in range(num_indices): self._create_client(i, addr, port, world_size) def test_multi_worker_with_fixed_world_size(self): self._multi_worker_helper(5) def test_multi_worker_with_nonfixed_world_size(self): - self._multi_worker_helper(-1) + self._multi_worker_helper(None) class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): @@ -404,6 +404,14 @@ def test_common_errors(self): gen = dist.rendezvous("tcp://127.0.0.1:23456?rank=0") next(gen) + def test_dns_timeout(self): + with self.assertRaisesRegex(TimeoutError, "client socket has timed out after.*dnsnotexist"): + gen = dist.rendezvous( + "tcp://dnsnotexist:23456?world_size=2&rank=0", + timeout=timedelta(seconds=1), + ) + next(gen) + @retry_on_connect_failures def test_nominal(self): url = self.create_tcp_url() diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 1855c8434bec..c8b5551c8937 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -34,6 +34,7 @@ from collections import namedtuple from itertools 
import product from random import shuffle +from packaging import version import torch @@ -41,7 +42,7 @@ # Distributions tests use double as the default dtype torch.set_default_dtype(torch.double) -from torch._six import inf +from torch._six import inf, nan from torch.testing._internal.common_utils import \ (TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN, load_tests, gradcheck) @@ -480,27 +481,27 @@ def is_all_nan(tensor): Example(Wishart, [ { 'covariance_matrix': torch.tensor([[2.0, 0.3], [0.3, 0.25]], requires_grad=True), - 'df': torch.tensor([4.], requires_grad=True), + 'df': torch.tensor([3.], requires_grad=True), }, { 'precision_matrix': torch.tensor([[2.0, 0.1, 0.0], [0.1, 0.25, 0.0], [0.0, 0.0, 0.3]], requires_grad=True), - 'df': torch.tensor([2.5, 3], requires_grad=True), + 'df': torch.tensor([5., 4], requires_grad=True), }, { 'scale_tril': torch.tensor([[[2.0, 0.0], [-0.5, 0.25]], [[2.0, 0.0], [0.3, 0.25]], [[5.0, 0.0], [-0.5, 1.5]]], requires_grad=True), - 'df': torch.tensor([5., 3.5, 2], requires_grad=True), + 'df': torch.tensor([5., 3.5, 3], requires_grad=True), }, { 'covariance_matrix': torch.tensor([[5.0, -0.5], [-0.5, 1.5]]), - 'df': torch.tensor([2.0]), + 'df': torch.tensor([3.0]), }, { 'covariance_matrix': torch.tensor([[5.0, -0.5], [-0.5, 1.5]]), - 'df': 2.0, + 'df': 3.0, }, ]), Example(MixtureSameFamily, [ @@ -866,9 +867,15 @@ def _check_sampler_discrete(self, torch_dist, ref_dist, message, torch_samples = torch_samples.cpu().numpy() unique, counts = np.unique(torch_samples, return_counts=True) pmf = ref_dist.pmf(unique) + pmf = pmf / pmf.sum() # renormalize to 1.0 for chisq test msk = (counts > 5) & ((pmf * num_samples) > 5) self.assertGreater(pmf[msk].sum(), 0.9, "Distribution is too sparse for test; try increasing num_samples") - chisq, p = scipy.stats.chisquare(counts[msk], pmf[msk] * num_samples) + # Add a remainder bucket that combines counts for all values + # below threshold, if such values exist (i.e. mask has False entries). 
+ if not msk.all(): + counts = np.concatenate([counts[msk], np.sum(counts[~msk], keepdims=True)]) + pmf = np.concatenate([pmf[msk], np.sum(pmf[~msk], keepdims=True)]) + chisq, p = scipy.stats.chisquare(counts, pmf * num_samples) self.assertGreater(p, failure_rate, message) def _check_enumerate_support(self, dist, examples): @@ -2214,39 +2221,42 @@ def test_multivariate_normal_moments(self): # We applied same tests in Multivariate Normal distribution for Wishart distribution def test_wishart_shape(self): - df = (torch.rand(5, requires_grad=True) + 1) * 10 - df_no_batch = (torch.rand([], requires_grad=True) + 1) * 10 - df_multi_batch = (torch.rand(6, 5, requires_grad=True) + 1) * 10 + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 3 + + df = torch.rand(5, requires_grad=True) + ndim + df_no_batch = torch.rand([], requires_grad=True) + ndim + df_multi_batch = torch.rand(6, 5, requires_grad=True) + ndim # construct PSD covariance - tmp = torch.randn(3, 10) + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() # construct batch of PSD covariances - tmp = torch.randn(6, 5, 3, 10) + tmp = torch.randn(6, 5, ndim, 10) cov_batched = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() prec_batched = cov_batched.inverse() scale_tril_batched = torch.linalg.cholesky(cov_batched) # ensure that sample, batch, event shapes all handled correctly - self.assertEqual(Wishart(df, cov).sample().size(), (5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df, precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df, cov).sample().size(), (5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, ndim, ndim)) + 
self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df, precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) # check gradients # Modified and applied the same tests for multivariate_normal @@ -2272,14 +2282,21 @@ def gradcheck_func(samples, nu, sigma, prec, scale_tril): wishart_log_prob_gradcheck(df_no_batch, None, None, scale_tril_batched) def test_wishart_stable_with_precision_matrix(self): - x = torch.randn(10) + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 10 + x = torch.randn(ndim) P = torch.exp(-(x - x.unsqueeze(-1)) ** 2) # RBF kernel - Wishart(torch.tensor(10), precision_matrix=P) + Wishart(torch.tensor(ndim), precision_matrix=P) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_wishart_log_prob(self): - df = (torch.rand([], requires_grad=True) + 1) * 10 - tmp = torch.randn(3, 10) + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 3 + df = torch.rand([], requires_grad=True) + ndim - 1 + # SciPy allowed ndim - 1 < df < ndim for Wishart distribution after version 1.7.0 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() @@ -2291,7 +2308,7 @@ def test_wishart_log_prob(self): dist3 = Wishart(df, scale_tril=scale_tril) ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) - x = dist1.sample((10,)) + x = dist1.sample((1000,)) expected = ref_dist.logpdf(x.transpose(0, 2).numpy()) self.assertEqual(0.0, np.mean((dist1.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) @@ -2299,14 +2316,17 @@ def test_wishart_log_prob(self): self.assertEqual(0.0, np.mean((dist3.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) # Double-check that batched versions behave the same as unbatched - df = (torch.rand(5, requires_grad=True) + 1) * 3 - tmp = torch.randn(5, 3, 10) + df = torch.rand(5, requires_grad=True) + ndim - 1 + # SciPy allowed ndim - 1 < df < ndim for Wishart distribution after version 1.7.0 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. 
+ tmp = torch.randn(5, ndim, 10) cov = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() dist_batched = Wishart(df, cov) dist_unbatched = [Wishart(df[i], cov[i]) for i in range(df.size(0))] - x = dist_batched.sample((10,)) + x = dist_batched.sample((1000,)) batched_prob = dist_batched.log_prob(x) unbatched_prob = torch.stack([dist_unbatched[i].log_prob(x[:, i]) for i in range(5)]).t() @@ -2316,28 +2336,36 @@ def test_wishart_log_prob(self): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_wishart_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] - df = (torch.rand([], requires_grad=True) + 1) * 3 - tmp = torch.randn(3, 10) + ndim = 3 + df = torch.rand([], requires_grad=True) + ndim - 1 + # SciPy allowed ndim -1 < df < ndim for Wishar distribution after version 1.7.0 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() + ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) + self._check_sampler_sampler(Wishart(df, cov), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, covariance_matrix={})'.format(df, cov), multivariate=True) self._check_sampler_sampler(Wishart(df, precision_matrix=prec), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, precision_matrix={})'.format(df, prec), multivariate=True) self._check_sampler_sampler(Wishart(df, scale_tril=scale_tril), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, scale_tril={})'.format(df, scale_tril), multivariate=True) def test_wishart_properties(self): - df = (torch.rand([]) + 1) * 5 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(5, 5)) + set_rng_seed(0) # see Note [Randomized statistical tests] + ndim = 5 + df = torch.rand([]) + ndim - 1 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) m = Wishart(df=df, scale_tril=scale_tril) self.assertEqual(m.covariance_matrix, m.scale_tril.mm(m.scale_tril.t())) self.assertEqual(m.covariance_matrix.mm(m.precision_matrix), torch.eye(m.event_shape[0])) @@ -2345,14 +2373,15 @@ def test_wishart_properties(self): def test_wishart_moments(self): set_rng_seed(0) # see Note [Randomized statistical tests] - df = (torch.rand([]) + 1) * 3 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(3, 3)) + ndim = 3 + df = torch.rand([]) + ndim - 1 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) d = Wishart(df=df, scale_tril=scale_tril) - samples = d.rsample((100000,)) + samples = d.rsample((ndim * ndim * 100000,)) empirical_mean = samples.mean(0) - self.assertEqual(d.mean, empirical_mean, atol=5, rtol=0) + self.assertEqual(d.mean, empirical_mean, atol=0.5, rtol=0) empirical_var = samples.var(0) - self.assertEqual(d.variance, empirical_var, atol=5, rtol=0) + self.assertEqual(d.variance, empirical_var, atol=0.5, rtol=0) def test_exponential(self): rate = torch.randn(5, 5).abs().requires_grad_() @@ -2727,6 +2756,18 @@ def test_dirichlet_sample(self): 'Dirichlet(alpha={})'.format(list(alpha)), multivariate=True) + def test_dirichlet_mode(self): + # Test a few edge cases for the Dirichlet distribution mode. This also covers beta distributions. 
+ concentrations_and_modes = [ + ([2, 2, 1], [.5, .5, 0.]), + ([3, 2, 1], [2 / 3, 1 / 3, 0]), + ([.5, .2, .2], [1., 0., 0.]), + ([1, 1, 1], [nan, nan, nan]), + ] + for concentration, mode in concentrations_and_modes: + dist = Dirichlet(torch.tensor(concentration)) + self.assertEqual(dist.mode, torch.tensor(mode)) + def test_beta_shape(self): con1 = torch.randn(2, 3).exp().requires_grad_() con0 = torch.randn(2, 3).exp().requires_grad_() @@ -2922,6 +2963,14 @@ def test_cdf_icdf_inverse(self): 'icdf(cdf(x)) = {}'.format(actual), ])) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + def test_gamma_log_prob_at_boundary(self): + for concentration, log_prob in [(.5, inf), (1, 0), (2, -inf)]: + dist = Gamma(concentration, 1) + scipy_dist = scipy.stats.gamma(concentration) + self.assertAlmostEqual(dist.log_prob(0), log_prob) + self.assertAlmostEqual(dist.log_prob(0), scipy_dist.logpdf(0)) + def test_cdf_log_prob(self): # Tests if the differentiation of the CDF gives the PDF at a given value for Dist, params in EXAMPLES: @@ -3105,18 +3154,88 @@ def test_invalid_parameter_broadcasting(self): 'alpha': torch.tensor([1, 1, 1]) }), (StudentT, { - 'df': torch.tensor([1, 1]), - 'scale': torch.tensor([1, 1, 1]) + 'df': torch.tensor([1., 1.]), + 'scale': torch.tensor([1., 1., 1.]) }), (StudentT, { - 'df': torch.tensor([1, 1]), - 'loc': torch.tensor([1, 1, 1]) + 'df': torch.tensor([1., 1.]), + 'loc': torch.tensor([1., 1., 1.]) }) ] for dist, kwargs in invalid_examples: self.assertRaises(RuntimeError, dist, **kwargs) + def _test_discrete_distribution_mode(self, dist, sanitized_mode, batch_isfinite): + # We cannot easily check the mode for discrete distributions, but we can look left and right + # to ensure the log probability is smaller than at the mode. + for step in [-1, 1]: + log_prob_mode = dist.log_prob(sanitized_mode) + if isinstance(dist, OneHotCategorical): + idx = (dist._categorical.mode + 1) % dist.probs.shape[-1] + other = torch.nn.functional.one_hot(idx, num_classes=dist.probs.shape[-1]).to(dist.mode) + else: + other = dist.mode + step + mask = batch_isfinite & dist.support.check(other) + self.assertTrue(mask.any() or dist.mode.unique().numel() == 1) + # Add a dimension to the right if the event shape is not a scalar, e.g. OneHotCategorical. + other = torch.where(mask[..., None] if mask.ndim < other.ndim else mask, other, dist.sample()) + log_prob_other = dist.log_prob(other) + delta = log_prob_mode - log_prob_other + self.assertTrue((-1e-12 < delta[mask].detach()).all()) # Allow up to 1e-12 rounding error. + + def _test_continuous_distribution_mode(self, dist, sanitized_mode, batch_isfinite): + if isinstance(dist, Wishart): + return + # We perturb the mode in the unconstrained space and expect the log probability to decrease. + num_points = 10 + transform = transform_to(dist.support) + unconstrained_mode = transform.inv(sanitized_mode) + perturbation = 1e-5 * (torch.rand((num_points,) + unconstrained_mode.shape) - 0.5) + perturbed_mode = transform(perturbation + unconstrained_mode) + log_prob_mode = dist.log_prob(sanitized_mode) + log_prob_other = dist.log_prob(perturbed_mode) + delta = log_prob_mode - log_prob_other + + # We pass the test with a small tolerance to allow for rounding and manually set the + # difference to zero if both log probs are infinite with the same sign. + both_infinite_with_same_sign = (log_prob_mode == log_prob_other) & (log_prob_mode.abs() == inf) + delta[both_infinite_with_same_sign] = 0. 
+ ordering = (delta > -1e-12).all(axis=0) + self.assertTrue(ordering[batch_isfinite].all()) + + def test_mode(self): + discrete_distributions = ( + Bernoulli, Binomial, Categorical, Geometric, NegativeBinomial, OneHotCategorical, Poisson, + ) + no_mode_available = ( + ContinuousBernoulli, LKJCholesky, LogisticNormal, MixtureSameFamily, Multinomial, + RelaxedBernoulli, RelaxedOneHotCategorical, + ) + + for dist_cls, params in EXAMPLES: + for param in params: + dist = dist_cls(**param) + if isinstance(dist, no_mode_available) or type(dist) is TransformedDistribution: + with self.assertRaises(NotImplementedError): + dist.mode + continue + + # Check that either all or no elements in the event shape are nan: the mode cannot be + # defined for part of an event. + isfinite = dist.mode.isfinite().reshape(dist.batch_shape + (dist.event_shape.numel(),)) + batch_isfinite = isfinite.all(axis=-1) + self.assertTrue((batch_isfinite | ~isfinite.any(axis=-1)).all()) + + # We sanitize undefined modes by sampling from the distribution. + sanitized_mode = torch.where(~dist.mode.isnan(), dist.mode, dist.sample()) + if isinstance(dist, discrete_distributions): + self._test_discrete_distribution_mode(dist, sanitized_mode, batch_isfinite) + else: + self._test_continuous_distribution_mode(dist, sanitized_mode, batch_isfinite) + + self.assertFalse(dist.log_prob(sanitized_mode).isnan().any()) + # These tests are only needed for a few distributions that implement custom # reparameterized gradients. Most .rsample() implementations simply rely on @@ -4617,8 +4736,16 @@ def setUp(self): scipy.stats.weibull_min(c=positive_var2[0], scale=positive_var[0]) ), ( - Wishart(20 + positive_var[0], cov_tensor), # scipy var for Wishart only supports scalars - scipy.stats.wishart(20 + positive_var[0].item(), cov_tensor), + # scipy var for Wishart only supports scalars + # SciPy allowed ndim -1 < df < ndim for Wishar distribution after version 1.7.0 + Wishart( + (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0], + cov_tensor, + ), + scipy.stats.wishart( + (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0].item(), + cov_tensor, + ), ), ] @@ -4894,7 +5021,7 @@ def _examples(self): def _perturb_tensor(self, value, constraint): if isinstance(constraint, constraints._IntegerGreaterThan): return value + 1 - if isinstance(constraint, constraints._PositiveDefinite): + if isinstance(constraint, constraints._PositiveDefinite) or isinstance(constraint, constraints._PositiveSemidefinite): return value + torch.eye(value.shape[-1]) if value.dtype in [torch.float, torch.double]: transform = transform_to(constraint) diff --git a/test/distributions/test_transforms.py b/test/distributions/test_transforms.py index 40f636c53f7e..da645e0e5036 100644 --- a/test/distributions/test_transforms.py +++ b/test/distributions/test_transforms.py @@ -12,7 +12,7 @@ ExpTransform, IndependentTransform, LowerCholeskyTransform, PowerTransform, ReshapeTransform, SigmoidTransform, TanhTransform, - SoftmaxTransform, StickBreakingTransform, + SoftmaxTransform, SoftplusTransform, StickBreakingTransform, identity_transform, Transform, _InverseTransform) from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix @@ -38,6 +38,7 @@ def get_transforms(cache_size): torch.randn(4, 5), cache_size=cache_size), SoftmaxTransform(cache_size=cache_size), + SoftplusTransform(cache_size=cache_size), StickBreakingTransform(cache_size=cache_size), 
LowerCholeskyTransform(cache_size=cache_size), CorrCholeskyTransform(cache_size=cache_size), diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect index 32b8be2f5cd7..f01221172b70 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_class_member_back_compat-fx_backcompat_class_members.expect @@ -1,6 +1,6 @@ torch.fx._symbolic_trace.ProxyableClassMeta [] torch.fx._symbolic_trace.Tracer ['call_module', 'create_arg', 'create_args_for_root', 'is_leaf_module', 'path_of_module', 'trace'] -torch.fx.graph.Graph ['call_function', 'call_method', 'call_module', 'create_node', 'eliminate_dead_code', 'erase_node', 'flatten_inps', 'get_attr', 'graph_copy', 'inserting_after', 'inserting_before', 'lint', 'node_copy', 'nodes', 'on_generate_code', 'output', 'owning_module', 'placeholder', 'print_tabular', 'python_code', 'unflatten_outs'] +torch.fx.graph.Graph ['call_function', 'call_method', 'call_module', 'create_node', 'eliminate_dead_code', 'erase_node', 'get_attr', 'graph_copy', 'inserting_after', 'inserting_before', 'lint', 'node_copy', 'nodes', 'on_generate_code', 'output', 'owning_module', 'placeholder', 'print_tabular', 'process_inputs', 'process_outputs', 'python_code', 'set_codegen'] torch.fx.graph.PythonCode [] torch.fx.graph_module.GraphModule ['add_submodule', 'code', 'delete_all_unused_submodules', 'delete_submodule', 'graph', 'recompile', 'to_folder'] torch.fx.immutable_collections.immutable_dict ['clear', 'pop', 'popitem', 'update'] @@ -15,5 +15,5 @@ torch.fx.proxy.Attribute ['node'] torch.fx.proxy.GraphAppendingTracer [] torch.fx.proxy.Proxy ['keys'] torch.fx.proxy.TraceError [] -torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'record_stack_traces', 'to_bool', 'trace_asserts'] +torch.fx.proxy.TracerBase ['check_mutable_operations', 'create_arg', 'create_node', 'create_proxy', 'iter', 'keys', 'proxy', 'proxy_buffer_attributes', 'record_stack_traces', 'to_bool', 'trace_asserts', 'traced_func_name'] torch.fx.subgraph_rewriter.Match ['anchor', 'nodes_map'] \ No newline at end of file diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect index a7a2a37d98d0..bd8c0e63a52c 100644 --- a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect +++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect @@ -6,7 +6,7 @@ torch.fx._symbolic_trace.Tracer.path_of_module(self, mod: torch.nn.modules.modul torch.fx._symbolic_trace.Tracer.trace(self, root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> torch.fx.graph.Graph torch.fx._symbolic_trace.symbolic_trace(root: Union[torch.nn.modules.module.Module, Callable[..., Any]], concrete_args: Optional[Dict[str, Any]] = None) -> torch.fx.graph_module.GraphModule torch.fx._symbolic_trace.wrap(fn_or_name: Union[str, Callable]) -torch.fx.graph.Graph.__init__(self, owning_module: Optional[GraphModule] = None, tracer_cls: Optional[Type[Tracer]] = None) 
+torch.fx.graph.Graph.__init__(self, owning_module: Optional[GraphModule] = None, tracer_cls: Optional[Type[Tracer]] = None, tracer_extras: Optional[Dict[str, Any]] = None) torch.fx.graph.Graph.call_function(self, the_function: Callable[..., Any], args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node torch.fx.graph.Graph.call_method(self, method_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node torch.fx.graph.Graph.call_module(self, module_name: str, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, type_expr: Optional[Any] = None) -> torch.fx.node.Node @@ -41,7 +41,7 @@ torch.fx.interpreter.Interpreter.get_attr(self, target: 'Target', args: Tuple[to torch.fx.interpreter.Interpreter.map_nodes_to_values(self, args: torch.fx.node.Argument, n: torch.fx.node.Node) -> torch.fx.node.Argument torch.fx.interpreter.Interpreter.output(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any torch.fx.interpreter.Interpreter.placeholder(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any -torch.fx.interpreter.Interpreter.run(self, *args, initial_env: Optional[Dict[torch.fx.node.Node, Any]] = None) -> Any +torch.fx.interpreter.Interpreter.run(self, *args, initial_env: Optional[Dict[torch.fx.node.Node, Any]] = None, enable_io_processing: bool = True) -> Any torch.fx.interpreter.Interpreter.run_node(self, n: torch.fx.node.Node) -> Any torch.fx.interpreter.Transformer.__init__(self, module) torch.fx.interpreter.Transformer.call_function(self, target: 'Target', args: Tuple[torch.fx.node.Argument, ...], kwargs: Dict[str, Any]) -> Any @@ -53,13 +53,13 @@ torch.fx.node.Node.__init__(self, graph: 'Graph', name: str, op: str, target: 'T torch.fx.node.Node.append(self, x: 'Node') -> None torch.fx.node.Node.format_node(self, placeholder_names: Optional[List[str]] = None, maybe_return_typename: Optional[List[str]] = None) -> Optional[str] torch.fx.node.Node.prepend(self, x: 'Node') -> None -torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node') -> List[Node] +torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node', delete_user_cb: Callable[[Node], bool] = >) -> List[Node] torch.fx.node.Node.replace_input_with(self, old_input: 'Node', new_input: 'Node') torch.fx.node.Node.update_arg(self, idx: int, arg: torch.fx.node.Argument) -> None torch.fx.node.Node.update_kwarg(self, key: str, arg: torch.fx.node.Argument) -> None torch.fx.node.map_aggregate(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Argument], torch.fx.node.Argument]) -> torch.fx.node.Argument torch.fx.node.map_arg(a: torch.fx.node.Argument, fn: Callable[[torch.fx.node.Node], torch.fx.node.Argument]) -> torch.fx.node.Argument -torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int]) +torch.fx.passes.split_module.split_module(m: torch.fx.graph_module.GraphModule, root_m: torch.nn.modules.module.Module, split_callback: Callable[[torch.fx.node.Node], int], qualname_map: Optional[Dict[str, str]] = None) torch.fx.proxy.Attribute.__init__(self, root: torch.fx.proxy.Proxy, attr: str) torch.fx.proxy.Proxy.__init__(self, node: torch.fx.node.Node, tracer: 'Optional[TracerBase]' = None) 
torch.fx.proxy.Proxy.keys(self) diff --git a/test/expect/TestPytorchExportModes.test_aten_fallback.expect b/test/expect/TestPytorchExportModes.test_aten_fallback.expect index 41059587af0b..83c481fd7e9b 100644 --- a/test/expect/TestPytorchExportModes.test_aten_fallback.expect +++ b/test/expect/TestPytorchExportModes.test_aten_fallback.expect @@ -11,7 +11,7 @@ ModelProto { nodes: [ Node {type: "Add", inputs: [0,1], outputs: [2], attributes: []}, Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, - Node {type: "ATen", inputs: [2,3], outputs: [4,5], attributes: [{ name: 'operator', type: string, value: 'qr'}]} + Node {type: "ATen", domain: "org.pytorch.aten", inputs: [2,3], outputs: [4,5], attributes: [{ name: 'operator', type: string, value: 'qr'}, { name: 'overload_name', type: string, value: ''}]} ] } opset_import: [OperatorSetIdProto { domain: }OperatorSetIdProto { domain: org.pytorch.aten}], diff --git a/test/expect/TestPytorchExportModes.test_onnx_aten.expect b/test/expect/TestPytorchExportModes.test_onnx_aten.expect index 22f1c57f9570..3c2960f91f96 100644 --- a/test/expect/TestPytorchExportModes.test_onnx_aten.expect +++ b/test/expect/TestPytorchExportModes.test_onnx_aten.expect @@ -9,7 +9,7 @@ ModelProto { outputs: [{name: "2", type:Tensor dims: 3 4}] initializers: [] nodes: [ - Node {type: "ATen", inputs: [0,1], outputs: [2], attributes: [{ name: 'operator', type: string, value: 'fmod'}]} + Node {type: "ATen", domain: "org.pytorch.aten", inputs: [0,1], outputs: [2], attributes: [{ name: 'operator', type: string, value: 'fmod'}, { name: 'overload_name', type: string, value: ''}]} ] } opset_import: [OperatorSetIdProto { domain: }OperatorSetIdProto { domain: org.pytorch.aten}], diff --git a/test/expect/TestScript.test_listconstruct_erasure.expect b/test/expect/TestScript.test_listconstruct_erasure.expect index 0f7d470b0709..8172b3fe0c76 100644 --- a/test/expect/TestScript.test_listconstruct_erasure.expect +++ b/test/expect/TestScript.test_listconstruct_erasure.expect @@ -13,7 +13,7 @@ ModelProto { Node {type: "Less", inputs: [0,1], outputs: [2], attributes: []}, Node {type: "Cast", inputs: [2], outputs: [3], attributes: [{ name: 'to', type: int, value: 2}]}, Node {type: "Cast", inputs: [3], outputs: [4], attributes: [{ name: 'to', type: int, value: 9}]}, - Node {type: "ATen", inputs: [0,4], outputs: [5], attributes: [{ name: 'operator', type: string, value: 'index'}]} + Node {type: "ATen", domain: "org.pytorch.aten", inputs: [0,4], outputs: [5], attributes: [{ name: 'operator', type: string, value: 'index'}, { name: 'overload_name', type: string, value: ''}]} ] } opset_import: [OperatorSetIdProto { domain: }OperatorSetIdProto { domain: org.pytorch.aten}], diff --git a/test/expect/TestSparseCSRCPU.test_sparse_csr_print_cpu.expect b/test/expect/TestSparseCSRCPU.test_sparse_csr_print_cpu.expect deleted file mode 100644 index a30958d09d97..000000000000 --- a/test/expect/TestSparseCSRCPU.test_sparse_csr_print_cpu.expect +++ /dev/null @@ -1,176 +0,0 @@ -# shape: torch.Size([10, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([11]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], 
dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -# shape: torch.Size([100, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([101]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -# shape: torch.Size([1000, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([1001]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - 
values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.]) - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4]) -# _col_indices -tensor([0, 1, 0, 1]) -# _values -tensor([1., 2., 3., 4.], dtype=torch.float64) - diff --git a/test/expect/TestSparseCSRCUDA.test_sparse_csr_print_cuda.expect b/test/expect/TestSparseCSRCUDA.test_sparse_csr_print_cuda.expect deleted file mode 100644 index 551092b4a56e..000000000000 --- a/test/expect/TestSparseCSRCUDA.test_sparse_csr_print_cuda.expect +++ /dev/null @@ -1,176 +0,0 @@ -# shape: torch.Size([10, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([11]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -# shape: torch.Size([100, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([101]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 
2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -# shape: torch.Size([1000, 10]) -# nnz: 10 -# crow_indices shape: torch.Size([1001]) -# col_indices shape: torch.Size([10]) -# values_shape: torch.Size([10]) -########## torch.float32/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int32 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) -# _values -tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) - - -########## torch.float32/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0') - -########## torch.float64/torch.int64 ########## -# sparse tensor -tensor(crow_indices=tensor([0, 2, 4]), - col_indices=tensor([0, 1, 0, 1]), - values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, - dtype=torch.float64, layout=torch.sparse_csr) -# _crow_indices -tensor([0, 2, 4], device='cuda:0') -# _col_indices -tensor([0, 1, 0, 1], device='cuda:0') -# _values -tensor([1., 2., 3., 4.], device='cuda:0', 
dtype=torch.float64) - diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect new file mode 100644 index 000000000000..bcffa8293c93 --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 
22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 
2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 
44.]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect new file mode 100644 index 000000000000..9f74cd7eb53f --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# 
_values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + 
col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# 
_values +tensor([], size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), size=(2, 2), nnz=4, dtype=torch.float64, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# _values +tensor([], size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], 
+ + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect new file mode 100644 index 000000000000..a449883a3fe2 --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([]) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices 
+tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], dtype=torch.int32) +# _row_indices +tensor([], dtype=torch.int32) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + + +########## 
torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([]) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4]) +# _row_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0]) +# _row_indices +tensor([], dtype=torch.int64) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor 
+tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect new file mode 100644 index 000000000000..02476652e4b7 --- /dev/null +++ b/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([]) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + 
col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], dtype=torch.int32) +# _col_indices +tensor([], dtype=torch.int32) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.]) + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# _values +tensor([]) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]) + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=() 
########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]) + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4]) +# _col_indices +tensor([0, 1, 0, 1]) +# _values +tensor([1., 2., 3., 4.], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64, + layout=torch.sparse_csr) +# _crow_indices +tensor([0]) +# _col_indices +tensor([], dtype=torch.int64) +# _values +tensor([], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]]) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), size=(2, 3, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect new file mode 100644 index 000000000000..df75cb3a4f61 --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor 
+tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor 
+tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 
4., 44.]]]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# 
sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0', dtype=torch.float64) + diff --git 
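The BSC expect fixture above (and the BSR one that follows) pins down the printed form of the new block-compressed sparse layouts on CUDA. For orientation only, here is a minimal sketch of building such a tensor by hand; it is not part of the patch or of the test that generates these fixtures, it assumes a recent PyTorch where the torch.sparse_bsc_tensor / torch.sparse_bsr_tensor factories and the ccol_indices()/row_indices() accessors are available, and it uses a small shape that satisfies the layout invariants rather than reproducing the fixture values exactly.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# A dense 2x4 matrix stored as 1x2 blocks in BSC (compressed-column) form:
# two block columns, each holding blocks for block rows 0 and 1 -> 4 blocks.
ccol_indices = torch.tensor([0, 2, 4], dtype=torch.int32)
row_indices = torch.tensor([0, 1, 0, 1], dtype=torch.int32)
values = torch.tensor([[[1., 11.]], [[2., 22.]], [[3., 33.]], [[4., 44.]]])
bsc = torch.sparse_bsc_tensor(ccol_indices, row_indices, values,
                              size=(2, 4), device=device)
print(bsc)                 # repr in the same style as the "# sparse tensor" blocks
print(bsc.ccol_indices())  # compressed column pointers
print(bsc.row_indices())   # block row indices
print(bsc.values())        # one (1, 2) block per specified entry

# The BSR fixture that follows is the row-compressed mirror image:
# torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=...).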
a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect new file mode 100644 index 000000000000..5ab909227272 --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect @@ -0,0 +1,907 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], 
+ + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + 
+ [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0)) + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 
22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=(0, 0) ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(1, 0, 0)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', size=(1, 0, 0), dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]), device='cuda:0', size=(2, 2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=(1, 2) ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_bsr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 
22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]], + + + + [[[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]], + + + [[[ 1., 11.]], + + [[ 2., 22.]], + + [[ 3., 33.]], + + [[ 4., 44.]]]]], device='cuda:0', dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect new file mode 100644 index 000000000000..4292bfcd2199 --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([0, 1, 0, 
1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices 
+tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0, 2, 4], device='cuda:0') +# _row_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([0]), + row_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([0], device='cuda:0') +# _row_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + row_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _row_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(ccol_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + row_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csc) +# _ccol_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _row_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 
2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect new file mode 100644 index 000000000000..918f2570807f --- /dev/null +++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect @@ -0,0 +1,379 @@ +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0', dtype=torch.int32) +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=()/block_shape=() ########## +# 
sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([], device='cuda:0', dtype=torch.int32) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int32/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0', dtype=torch.int32) +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0', dtype=torch.int32) +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0') + +########## torch.float32/torch.int64/batch_shape=(2, 
3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0') + + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0, 2, 4], device='cuda:0') +# _col_indices +tensor([0, 1, 0, 1], device='cuda:0') +# _values +tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=()/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([0]), + col_indices=tensor([], size=(0,)), + values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0, + dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([0], device='cuda:0') +# _col_indices +tensor([], device='cuda:0', dtype=torch.int64) +# _values +tensor([], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2,)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[0, 2, 4], + [0, 2, 4]]), + col_indices=tensor([[0, 1, 0, 1], + [0, 1, 0, 1]]), + values=tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]]), device='cuda:0', size=(2, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[0, 2, 4], + [0, 2, 4]], device='cuda:0') +# _col_indices +tensor([[0, 1, 0, 1], + [0, 1, 0, 1]], device='cuda:0') +# _values +tensor([[1., 2., 3., 4.], + [1., 2., 3., 4.]], device='cuda:0', dtype=torch.float64) + +########## torch.float64/torch.int64/batch_shape=(2, 3)/block_shape=() ########## +# sparse tensor +tensor(crow_indices=tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]]), + col_indices=tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]]), + values=tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 2), + nnz=4, dtype=torch.float64, layout=torch.sparse_csr) +# _crow_indices +tensor([[[0, 2, 4], + [0, 2, 4], + [0, 2, 4]], + + [[0, 2, 4], + [0, 2, 4], + [0, 2, 4]]], device='cuda:0') +# _col_indices +tensor([[[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]], + + [[0, 1, 0, 1], + [0, 1, 0, 1], + [0, 1, 0, 1]]], device='cuda:0') +# _values +tensor([[[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]], + + [[1., 2., 3., 4.], + [1., 2., 3., 4.], + [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64) + diff --git 
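The CSC and CSR expect files above cover the plain (non-blocked) compressed layouts across float32/float64 values, int32/int64 indices, and batch shapes (), (2,), and (2, 3); the batched fixtures simply stack the index and value tensors along the leading batch dimensions. As a point of reference only, a minimal sketch of the unbatched case, assuming torch.sparse_csr_tensor / torch.sparse_csc_tensor are available (this snippet is illustrative and does not generate the fixtures):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Fully populated 2x2 matrix in CSR form. Default int64 indices print without
# an explicit dtype, which is why the int32 and int64 fixtures above differ.
crow_indices = torch.tensor([0, 2, 4])
col_indices = torch.tensor([0, 1, 0, 1])
values = torch.tensor([1., 2., 3., 4.])
csr = torch.sparse_csr_tensor(crow_indices, col_indices, values,
                              size=(2, 2), device=device)
print(csr)
print(csr.crow_indices(), csr.col_indices(), csr.values())

# CSC swaps the compressed dimension: columns are compressed, rows are plain.
ccol_indices = torch.tensor([0, 2, 4])
row_indices = torch.tensor([0, 1, 0, 1])
csc = torch.sparse_csc_tensor(ccol_indices, row_indices, values,
                              size=(2, 2), device=device)
print(csc)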
a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index b5e3343489b8..b8927d0cfc70 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -34,9 +34,11 @@ ("prim::MKLDNNRelu6", datetime.date(9999, 1, 1)), ("prim::MKLDNNRelu6_", datetime.date(9999, 1, 1)), ("prim::Concat", datetime.date(9999, 1, 1)), + ("prim::is_mlc", datetime.date(2022, 5, 20)), # Internal, profiler-specific ops ("profiler::_call_end_callbacks_on_jit_fut*", datetime.date(9999, 1, 1)), ("profiler::_record_function_enter", datetime.date(9999, 1, 1)), + ("aten::_sparse_addmm", datetime.date(2022, 6, 30)), ("aten::linalg_matrix_rank", datetime.date(2021, 10, 30)), ("aten::linalg_pinv", datetime.date(2021, 10, 30)), ("aten::_cholesky_helper", datetime.date(9999, 1, 1)), @@ -50,12 +52,8 @@ ("aten::adaptive_avg_pool3d_backward", datetime.date(9999, 1, 1)), ("aten::_embedding_bag_dense_backward", datetime.date(9999, 1, 1)), ("aten::randperm", datetime.date(9999, 1, 1)), - ("aten::_conv_depthwise2d_backward", datetime.date(2022, 1, 31)), - ("aten::conv_depthwise3d_backward", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution.deprecated", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution.deprecated2", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution_transpose.deprecated", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution_transpose.deprecated2", datetime.date(2022, 1, 31)), + ("aten::gelu", datetime.date(2022, 3, 1)), + ("aten::gelu_backward", datetime.date(2022, 3, 1)), ("aten::cudnn_convolution_backward", datetime.date(2022, 1, 31)), ("aten::cudnn_convolution_backward_input", datetime.date(2022, 1, 31)), ("aten::cudnn_convolution_backward_weight", datetime.date(2022, 1, 31)), @@ -78,16 +76,23 @@ ("aten::slow_conv_transpose2d_backward", datetime.date(2022, 1, 31)), ("aten::slow_conv_transpose3d", datetime.date(2022, 1, 31)), ("aten::slow_conv_transpose3d_backward", datetime.date(2022, 1, 31)), + ("aten::solve", datetime.date(9999, 1, 1)), + ("aten::solve.solution", datetime.date(9999, 1, 1)), + ("aten::_solve_helper", datetime.date(9999, 1, 1)), ("aten::_index_copy_", datetime.date(2022, 5, 31)), ("aten::_svd_helper", datetime.date(2022, 3, 31)), ("aten::linalg_svdvals", datetime.date(2022, 3, 31)), ("aten::linalg_svdvals_out", datetime.date(2022, 3, 31)), ("aten::linalg_svd", datetime.date(2022, 3, 31)), ("aten::linalg_svd_out", datetime.date(2022, 3, 31)), + ("aten::linalg_qr_out", datetime.date(2022, 5, 31)), + ("aten::linalg_qr", datetime.date(2022, 5, 31)), ("aten::_max_pool1d_cpu_forward", datetime.date(2022, 2, 8)), + ("aten::max_unpool2d_backward", datetime.date(2022, 5, 15)), + ("aten::max_unpool2d_backward.grad_input", datetime.date(2022, 5, 15)), + ("aten::max_unpool3d_backward", datetime.date(2022, 5, 15)), + ("aten::max_unpool3d_backward.grad_input", datetime.date(2022, 5, 15)), ("aten::_convolution_nogroup", datetime.date(9999, 1, 1)), - ("aten::linspace", datetime.date(2022, 3, 1)), # TODO this will be removed soon - ("aten::logspace", datetime.date(2022, 3, 1)), # TODO this will be removed soon ("aten::miopen_convolution_backward", datetime.date(9999, 1, 1)), ("aten::miopen_convolution_backward_bias", datetime.date(9999, 1, 1)), ("aten::miopen_convolution_backward_input", datetime.date(9999, 1, 1)), @@ -98,6 +103,8 @@ ("aten::miopen_depthwise_convolution_backward", 
datetime.date(9999, 1, 1)), ("aten::miopen_depthwise_convolution_backward_input", datetime.date(9999, 1, 1)), ("aten::miopen_depthwise_convolution_backward_weight", datetime.date(9999, 1, 1)), + ("aten::is_mlc", datetime.date(2022, 5, 20)), + ("aten::_nested_tensor", datetime.date(9999, 1, 1)), ("caffe2::", datetime.date(2021, 10, 23)), ("prepacked::unpack_prepacked_sizes_conv2d", datetime.date(9999, 1, 1)), ("prepacked::unpack_prepacked_sizes_linear", datetime.date(9999, 1, 1)), @@ -112,6 +119,25 @@ ("aten::_scatter_reduce", datetime.date(2022, 1, 31)), ("aten::native_multi_head_self_attention", datetime.date(9999, 1, 1)), ("aten::_native_multi_head_self_attention", datetime.date(9999, 1, 1)), + ("aten::grid_sampler_3d_backward", datetime.date(9999, 1, 1)), + ("aten::_transform_bias_rescale_qkv", datetime.date(9999, 1, 1)), + ("aten::scatter_reduce.two", datetime.date(2022, 4, 15)), + ("aten::_s_where", datetime.date(2022, 9, 30)), + ("quantized::conv2d_cudnn", datetime.date(2022, 3, 22)), + ("quantized::conv2d_relu_cudnn", datetime.date(2022, 3, 22)), + ("prim::infer_squeeze_size.dim", datetime.date(9999, 1, 1)), + ("prim::infer_squeeze_size", datetime.date(9999, 1, 1)), + ("aten::_cat", datetime.date(2022, 5, 15)), + ("aten::nansum", datetime.date(2022, 5, 15)), + ("aten::zero", datetime.date(2022, 5, 15)), + ("aten::_validate_sparse_compressed_tensor_args", datetime.date(2022, 5, 15)), + ("aten::stft", datetime.date(2022, 5, 23)), + ("aten::linalg_lu_solve", datetime.date(2022, 5, 23)), + ("aten::linalg_lu_solve.out", datetime.date(2022, 5, 23)), + ("aten::_index_reduce", datetime.date(2022, 5, 15)), + ("aten::_csr_to_block_csr", datetime.date(2022, 5, 20)), + ("aten::_weight_norm_cuda_interface", datetime.date(9999, 1, 1)), + ("aten::_weight_norm_cuda_interface_backward", datetime.date(9999, 1, 1)), ] ALLOW_LIST_COMPILED = [ @@ -140,6 +166,33 @@ def allow_listed(schema): ("dist_c10d", datetime.date(2099, 9, 17)), ] +def has_valid_upgraders(schema, version_map): + # we want to parse through the map to find if + # the schema has valid upgraders. Since the + # version map has entry for each overload + # we need to do some ugly parsing. + + # the name of the operator + schema_name = schema.name + + if schema_name not in version_map: + return False + + entries = version_map[schema_name] + + possible_overloads = [] + possible_schemas = [] + for key, upgrader_schema_entries in entries.items(): + possible_overloads.append(key) + possible_schemas.extend(upgrader_schema_entries) + + # let's make sure this existing schema is part of possible + # schemas + for old_schema in possible_schemas: + if old_schema == schema: + return True + + return False def dont_parse(schema_line): for item in dont_parse_list: @@ -158,14 +211,33 @@ def load_schemas_to_dict(): new_schema_dict[s.name].append(s) return new_schema_dict +def process_version_map(version_map): + # version map maps full schema name to + # list of upgraders. Since we only have + # the name of the schema (aka no overload) + # we want to first process the map to make + # the key lookup easier. 
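# (Illustrative usage sketch, not taken from the patch; the operator name is
# only an example.) Once the map is processed, the lookup performed by
# has_valid_upgraders() is a plain dict access on schema.name:
#
#   vmap = process_version_map(torch._C._get_operator_version_map())
#   old = parse_schema("aten::div.Tensor(Tensor self, Tensor other) -> Tensor")
#   has_valid_upgraders(old, vmap)  # True only if this exact old schema has a
#                                   # registered upgrader, which lets check_bc()
#                                   # skip it instead of flagging a BC break.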
After this it will be: + # Dict[schema_name, Dict[overload, List[schema]]] + + output = defaultdict(dict) + for (key, entries) in version_map.items(): + operator_name = key.split(".")[0] + schema_entries = [parse_schema(entry.old_schema) for entry in entries] + output[operator_name][key] = schema_entries + return output + def check_bc(existing_schemas): new_schema_dict = load_schemas_to_dict() + version_map = process_version_map(torch._C._get_operator_version_map()) is_bc = True broken_ops = [] for existing_schema in existing_schemas: if allow_listed(existing_schema): print("schema: ", str(existing_schema), " found on allowlist, skipping") continue + if has_valid_upgraders(existing_schema, version_map): + print("schema: ", str(existing_schema), " has valid upgrader, skipping") + continue print("processing existing schema: ", str(existing_schema)) matching_new_schemas = new_schema_dict.get(existing_schema.name, []) found = False diff --git a/test/fx/test_fx_const_fold.py b/test/fx/test_fx_const_fold.py index 0d178e956c47..80198c2baeaa 100644 --- a/test/fx/test_fx_const_fold.py +++ b/test/fx/test_fx_const_fold.py @@ -5,7 +5,6 @@ import torch import torch.fx from torch.fx.experimental import const_fold -from torch.fx.experimental.fx_acc import acc_tracer, acc_ops from torch.testing._internal.common_utils import TestCase @@ -610,14 +609,14 @@ def forward(self, x): mod = ConstFoldTestModule() in_x = torch.randn(2, 4) - gm = acc_tracer.trace(mod, in_x) + gm = torch.fx.symbolic_trace(mod) def skip_folding_quant_dequant(node: torch.fx.Node): - if node.target != acc_ops.quantize_per_tensor: + if node.target != torch.quantize_per_tensor: return False # If quantize_per_node -> dequantize, then skip folding. for user in node.users: - if user.target == acc_ops.dequantize: + if user.target == torch.dequantize: return True return False diff --git a/test/fx_acc/test_acc_tracer.py b/test/fx_acc/test_acc_tracer.py deleted file mode 100644 index f16eef8e5286..000000000000 --- a/test/fx_acc/test_acc_tracer.py +++ /dev/null @@ -1,2104 +0,0 @@ -# Owner(s): ["oncall: fx"] - -import unittest -from typing import Callable, List - -import numpy as np -import torch -import torch.fx.experimental.fx_acc.acc_normalizer as acc_normalizer -import torch.fx.experimental.fx_acc.acc_ops as acc_ops -import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer -import torch.fx.experimental.fx_acc.acc_utils as acc_utils -import torch.nn as nn -import torchvision -from parameterized import parameterized, param - -torch.manual_seed(0) - - -class AccTracerTest(unittest.TestCase): - def _make_model_unit_test( - self, - model, - *args, - input_shape=None, - enable_allclose=False, - **kwargs, - ): - """ - Test that the model can be traced correctly and is producing correct - result. - """ - if input_shape is None: - input_shape = [1, 3, 224, 224] - input = torch.randn(input_shape) - traced = acc_tracer.trace(model, [input]) - if enable_allclose: - torch.testing.assert_allclose(model(input), traced(input)) - else: - self.assertTrue(torch.equal(model(input), traced(input))) - traced_again = acc_tracer.trace(traced, [input]) - if enable_allclose: - torch.testing.assert_allclose(model(input), traced_again(input)) - else: - self.assertTrue(torch.equal(model(input), traced_again(input))) - - def _make_acc_op_function_test( - self, - acc_op: Callable, - torch_op, - *args, - input_shape=(2, 3), - validate_same_kwargs=True, - enable_allclose=False, - **kwargs, - ): - """ - Test that acc_op is traced somewhat. 
- """ - - class TestModule(torch.nn.Module): - def __init__(self, torch_op, args, kwargs): - super().__init__() - self._torch_op = torch_op - self._args = args - self._kwargs = kwargs - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self._torch_op(a, *self._args, **self._kwargs) - m = TestModule(torch_op, args, kwargs) - m.eval() - a = torch.randn(*input_shape) - traced = acc_tracer.trace(m, [a]) - ph_a = acc_op_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_op) - self.assertEqual(node.kwargs["input"], ph_a) - if validate_same_kwargs: - for key, value in kwargs.items(): - self.assertEqual(node.kwargs[key], value) - acc_op_node = node - elif node.op == "output": - if acc_op is None: - # If we expect no new acc_op after graph building - # and found we have only output in traced graph - continue - self.assertEqual(acc_op_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref_outputs = m(a) - outputs = traced(a) - traced_again = acc_tracer.trace(traced, [a]) - outputs_again = traced_again(a) - if isinstance(ref_outputs, torch.Tensor): - ref_outputs = [ref_outputs] - outputs = [outputs] - outputs_again = [outputs_again] - - for ref_output, output, output_again in zip( - ref_outputs, outputs, outputs_again - ): - if enable_allclose: - torch.testing.assert_allclose( - torch.nan_to_num(ref_output), torch.nan_to_num(output) - ) - torch.testing.assert_allclose( - torch.nan_to_num(ref_output), torch.nan_to_num(output_again) - ) - else: - self.assertTrue( - torch.equal(torch.nan_to_num(ref_output), torch.nan_to_num(output)) - ) - self.assertTrue( - torch.equal( - torch.nan_to_num(ref_output), torch.nan_to_num(output_again) - ) - ) - - def test_sum(self): - self._make_acc_op_function_test(acc_ops.sum, torch.sum) - self._make_acc_op_function_test(acc_ops.sum, torch.sum, dim=(1,), keepdim=True) - - def test_prod(self): - self._make_acc_op_function_test(acc_ops.prod, torch.prod) - self._make_acc_op_function_test(acc_ops.prod, torch.prod, dim=1, keepdim=True) - - def test_mean(self): - self._make_acc_op_function_test(acc_ops.mean, torch.mean) - self._make_acc_op_function_test( - acc_ops.mean, torch.mean, dim=(1,), keepdim=True - ) - - def test_pad(self): - self._make_acc_op_function_test( - acc_ops.pad, torch.nn.functional.pad, pad=(2, 0) - ) - - def test_max(self): - def torch_max(x, *args, **kwargs): - return x.max(*args, **kwargs) - - self._make_acc_op_function_test(acc_ops.max_full_reduce, torch_max) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, dim=1, keepdim=True - ) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, input_shape=(1, 4), dim=1, keepdim=True - ) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, input_shape=(3, 4, 3), dim=2 - ) - - @parameterized.expand( - [ - param("max_maximum", orig_op=torch.max, expected_op=acc_ops.maximum), - param( - "maximum_maximum", orig_op=torch.maximum, expected_op=acc_ops.maximum - ), - param("min_minimum", orig_op=torch.min, expected_op=acc_ops.minimum), - param( - "minimum_minimum", orig_op=torch.minimum, expected_op=acc_ops.minimum - ), - ] - ) - def test_maximum_minimum(self, _: str, orig_op, expected_op): - class TestModule(torch.nn.Module): - def __init__(self, orig_op): - super().__init__() - self.orig_op = orig_op - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> 
torch.Tensor: - return self.orig_op(input, other) - - m = TestModule(orig_op) - input, other = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, other]) - - ph_in = ph_oth = mxm = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "other": - ph_oth = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == expected_op: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], ph_oth) - mxm = node - elif node.op == "output": - self.assertEqual(mxm, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_conv(self): - """ - Test that a conv is traced as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(8, 7, 3, stride=2) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.conv(a) - - m = TestModule() - input = torch.randn(3, 8, 10, 10) - traced = acc_tracer.trace(m, [input]) - - ph = weight_attr = bias_attr = conv = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv.bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - self.assertEqual(node.kwargs["stride"], (2, 2)) - self.assertEqual(node.kwargs["padding"], (0, 0)) - self.assertEqual(node.kwargs["dilation"], (1, 1)) - self.assertEqual(node.kwargs["groups"], 1) - conv = node - elif node.op == "output": - self.assertEqual(conv, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_quantized_conv2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.quantized.Conv2d(3, 3, 1) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.conv(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - print(traced.graph) - ph = weight_attr = bias_attr = conv = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv_bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - conv = node - elif node.op == "output": - self.assertEqual(conv, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_quantized_convrelu2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.intrinsic.quantized.ConvReLU2d(3, 3, 1) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return 
self.conv(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = conv = relu = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv_bias": - bias_attr = node - elif node.op == "call_function" and node.target == acc_ops.quantized_conv2d: - self.assertEqual(node.target, acc_ops.quantized_conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - conv = node - elif node.op == "call_function" and node.target == acc_ops.relu: - self.assertEqual(node.target, acc_ops.relu) - self.assertEqual(node.kwargs["input"], conv) - relu = node - elif node.op == "output": - self.assertEqual(relu, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_embedding_bag(self): - """ - Test that an embedding_bag is traced as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.eb = nn.EmbeddingBag(10, 3, mode="sum", include_last_offset=True) - - def forward(self, inp: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor: - return self.eb(inp, offsets) - - m = TestModule() - inp = torch.LongTensor([1, 2, 4, 5, 4, 3, 2, 9]) - offsets = torch.LongTensor([0, 4]) - traced = acc_tracer.trace(m, [inp, offsets]) - - inp_node = offsets_node = weight_attr = eb_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "inp": - inp_node = node - elif str(node.target) == "offsets": - offsets_node = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - continue - elif node.op == "get_attr" and node.target == "eb.weight": - weight_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.embedding_bag) - # Note: Normalization called from acc_tracer means we use all kwargs. - self.assertEqual(node.kwargs["input"], inp_node) - self.assertEqual(node.kwargs["offsets"], offsets_node) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["mode"], "sum") - self.assertEqual(node.kwargs["include_last_offset"], True) - # The rest of these were unspecified, so verify they fell back - # to their respective default values thanks to normalization. - self.assertEqual(node.kwargs["max_norm"], None) - self.assertEqual(node.kwargs["norm_type"], 2.0) - self.assertEqual(node.kwargs["scale_grad_by_freq"], False) - self.assertEqual(node.kwargs["sparse"], False) - self.assertEqual(node.kwargs["per_sample_weights"], None) - eb_node = node - elif node.op == "output": - self.assertEqual(eb_node, node.args[0]) - - self.assertTrue(torch.equal(m(inp, offsets), traced(inp, offsets))) - - def test_embedding_bag_byte_and_4bit_rowwise_offsets(self): - """ - Test that 4 bit quantized embedding_bag is traced as expected. 
- """ - - class TestModule(nn.Module): - def __init__( - self, - op, - q_weights, - per_index_weights, - ): - super().__init__() - self.emb = op - self.q_weights = q_weights - self.per_index_weights = per_index_weights - - def forward( - self, - indices, - offsets, - ): - return self.emb( - self.q_weights, - indices, - offsets, - mode=0, - per_sample_weights=self.per_index_weights, - include_last_offset=True, - ) - - def run_embedding_bag_test(is_4bit, use_weights): - # generate random indices, offsets, and weights. - num_embeddings = 16 - embedding_dim = 32 - num_lengths = 10 - - weights = torch.from_numpy( - (np.random.random_sample((num_embeddings, embedding_dim)) + 1).astype( - np.float32 - ) - ) - q_weights = ( - torch.ops.quantized.embedding_bag_4bit_prepack(weights) - if is_4bit - else torch.ops.quantized.embedding_bag_byte_prepack(weights) - ) - np_lengths = np.random.randint(0, num_lengths, size=10).astype(np.int32) - - num_lengths = np.sum(np_lengths) - indices = torch.from_numpy( - np.random.randint(low=0, high=num_embeddings, size=num_lengths) - ).int() - - lengths = torch.from_numpy(np_lengths) - offsets = torch.cat([torch.zeros([1]), torch.cumsum(lengths, 0)]).int() - - weights = torch.randint(low=0, high=4, size=indices.size()) - per_sample_weights = weights.to(torch.float32) - - indices = indices.to(torch.int32) - offsets = offsets.to(torch.int32) - inputs = [ - indices, - offsets, - ] - - op = ( - torch.ops.quantized.embedding_bag_4bit_rowwise_offsets - if is_4bit - else torch.ops.quantized.embedding_bag_byte_rowwise_offsets - ) - - m = TestModule( - op, - q_weights, - per_sample_weights, - ) - - traced = acc_tracer.trace(m, inputs) - print(traced.graph) - - expected_target = ( - acc_ops.embedding_bag_4bit_rowwise_offsets - if is_4bit - else acc_ops.embedding_bag_byte_rowwise_offsets - ) - - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "indices": - inp_node = node - elif str(node.target) == "offsets": - offsets_node = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - continue - elif node.op == "get_attr" and node.target == "q_weights": - weight_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, expected_target) - # Note: Normalization called from acc_tracer means we use all kwargs. - self.assertEqual(node.kwargs["indices"], inp_node) - self.assertEqual(node.kwargs["offsets"], offsets_node) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["mode"], 0) - self.assertEqual(node.kwargs["include_last_offset"], True) - # The rest of these were unspecified, so verify they fell back - # to their respective default values thanks to normalization. 
- eb_node = node - elif node.op == "output": - self.assertEqual(eb_node, node.args[0]) - self.assertTrue(torch.equal(m(indices, offsets), traced(indices, offsets))) - - # test 8-bit - run_embedding_bag_test(is_4bit=False, use_weights=True) - # test 4-bit - run_embedding_bag_test(is_4bit=True, use_weights=True) - - def test_quantized_batch_norm2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.bn = nn.quantized.BatchNorm2d(3) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.bn(a) - - m = TestModule() - m.eval() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = bn_mean = bn_var = bn = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "bn.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "bn.bias": - bias_attr = node - elif node.op == "get_attr" and node.target == "bn.running_mean": - bn_mean = node - elif node.op == "get_attr" and node.target == "bn.running_var": - bn_var = node - elif node.op == "get_attr" and node.target == "bn.scale": - bn_scale = node - elif node.op == "get_attr" and node.target == "bn.zero_point": - bn_zero_point = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_batch_norm2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - self.assertEqual(node.kwargs["running_mean"], bn_mean) - self.assertEqual(node.kwargs["running_var"], bn_var) - self.assertEqual(node.kwargs["acc_out_ty"][6]["scale"], bn_scale) - self.assertEqual( - node.kwargs["acc_out_ty"][6]["zero_point"], bn_zero_point - ) - bn = node - elif node.op == "output": - self.assertEqual(bn, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_linear(self): - """ - Test that a linear is traced as expected, i.e. to the functional level and with - kwarg normalization. Also verify that symbolic shape inference worked as part of - the acc_tracer. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(3, 5, bias=True) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.linear(a) - - m = TestModule() - test_input = torch.randn(1, 3) - traced = acc_tracer.trace(m, test_input) - ph = weight_attr = bias_attr = linear = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "linear.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "linear.bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.linear) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - linear = node - elif node.op == "output": - self.assertEqual(linear, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - self.assertTrue(torch.equal(m(test_input), traced(test_input))) - - def test_quantized_linear(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.quantized.Linear(3, 5) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.linear(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(2, 3), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = linear = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "linear_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "linear_bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_linear) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - linear = node - elif node.op == "output": - self.assertEqual(linear, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - @parameterized.expand( - [ - param("remove_exceptions_false", remove_exceptions=False), - param("remove_exceptions_true", remove_exceptions=True), - ] - ) - def test_batch_norm(self, _, remove_exceptions): - """ - Test that a batch norm is traced as expected, i.e. to the functional level - and with kwarg normalization. Note that we also expect to see a - ConditionalExceptionWrapper in the graph that the AST rewriter converted - from `if x: raise y`. - - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.bn = torch.nn.BatchNorm2d(2) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.bn(a) - - m = TestModule() - input = torch.randn(2, 2, 1, 1) - # Note: Explicitly not removing exceptions so that we can check they - # were found and exist below. 
- traced = acc_tracer.trace( - m, - [input], - remove_exceptions=remove_exceptions, - ) - - ph = exception_wrapper = weight = bias = mean = var = bn = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "bn.weight": - weight = node - elif node.op == "get_attr" and node.target == "bn.bias": - bias = node - elif node.op == "get_attr" and node.target == "bn.running_mean": - mean = node - elif node.op == "get_attr" and node.target == "bn.running_var": - var = node - elif node.op == "call_function" and node.target == acc_ops.batch_norm: - # Note: Normalization called from acc_tracer means we use - # all kwargs. - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight) - self.assertEqual(node.kwargs["bias"], bias) - self.assertEqual(node.kwargs["running_mean"], mean) - self.assertEqual(node.kwargs["running_var"], var) - bn = node - elif ( - node.op == "call_module" - and node.target == "bn._conditional_exception_wrapper_ValueError" - ): - exception_wrapper = node - elif node.op == "output": - self.assertEqual(bn, node.args[0]) - - self.assertTrue(remove_exceptions or exception_wrapper is not None) - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_remove_asserts(self): - """ - Test that a Module with asserts has the asserts automatically removed, as - well as calls to a class method that should be dead. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def _test_method(self, a): - return a - - def forward(self, a: torch.Tensor) -> torch.Tensor: - assert torch.equal(self._test_method(a), a) - return a - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input], ast_rewriter_allow_list={TestModule}) - # Check we have no call_functions. If remove asserts didn't work - # correctly we would see a call to torch._assert, _test_method, and - # torch.equal. - for node in traced.graph.nodes: - self.assertFalse(node.op == "call_function") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_no_rewrite_leaf_module(self): - """ - Test that when we supply a leaf module, we don't rewrite it - """ - - class TestChildModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return a.relu() - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.child = TestChildModule() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.child(a) + self.child(a) - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input], leaf_module_list={TestChildModule}) - # trace it again just in case - traced = acc_tracer.trace(traced, [input], leaf_module_list={TestChildModule}) - - for _, m in traced.named_children(): - self.assertFalse("__AccRewrittenModule" in str(type(m)), str(type(m))) - - def test_sequential(self): - """ - Test that the tracer works for torch.nn.Sequential. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.model = nn.Sequential(nn.Sigmoid(), nn.ReLU()) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.model(a) - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input]) - - for node in traced.graph.nodes: - if node.op == "call_function": - is_sigmoid = node.target == acc_ops.sigmoid - is_relu = node.target == acc_ops.relu - self.assertTrue(is_sigmoid or is_relu) - else: - self.assertTrue(node.op == "placeholder" or node.op == "output") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_unsqueeze(self): - """ - Test that torch.unsqueeze is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.unsqueeze, - torch.unsqueeze, - validate_same_kwargs=False, - dim=1, - ) - - def test_stack(self): - """ - Test that torch.stack is traced correctly. - """ - - class TestModule(torch.nn.Module): - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.stack((a, b), dim=1) - - a, b = torch.randn(4, 5, 6), torch.randn(4, 5, 6) - mod = TestModule() - traced = acc_tracer.trace(mod, [a, b]) - self.assertTrue(torch.equal(mod(a, b), traced(a, b))) - - ph_a = ph_b = unsqueeze_a = unsqueeze_b = cat_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - if node.target == acc_ops.unsqueeze: - if node.kwargs["input"] is ph_a: - unsqueeze_a = node - else: - self.assertEqual(node.kwargs["input"], ph_b) - unsqueeze_b = node - else: - self.assertEqual(node.target, acc_ops.cat) - self.assertEqual(node.kwargs["tensors"], [unsqueeze_a, unsqueeze_b]) - cat_node = node - elif node.op == "output": - self.assertEqual(cat_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - def test_no_raise(self): - """ - self that we can trace `if x: raise y(msg)` when the raise isn't executed. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError("a equaled b!") - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - use_acc_normalization=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_AssertionError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_b))) - - def test_yes_raise(self): - """ - Test that we can trace `if x: raise y(msg)` when the raise is executed. - """ - err_str = "a equaled b!" 
- - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.err_str = err_str - - def forward(self, a, b): - if torch.equal(a, b): - raise RuntimeError(self.err_str) - return a - - m = TestModule() - # Note: We must use different inputs here in order for shape_prop to work, as - # otherwise the exception is thrown (as expected/checked below). - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_RuntimeError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - - def test(mod): - try: - # Note: Use the same input here to ensure the exception is thrown. - mod(in_a, in_a) - self.fail("Shouldn't get here because exception should be thrown.") - except RuntimeError as e: - self.assertEqual(err_str, str(e)) - - test(m) - test(traced) - - def test_remove_raise(self): - """ - Test that we can trace `if x: raise y(msg)` and then remove the exception_wrapper. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError("a equaled b!") - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=True, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - else: - # Should not encounter any call_modules, e.g. to the - # exception_wrapper. - self.assertFalse(node.op == "call_module") - - # Note: Using input in_a twice for the tracer version, which would - # trigger the raise if it was still there. - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_a))) - - def test_raise_no_message(self): - """ - Test that we can trace `if x: raise y` when `y` has no message. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - use_acc_normalization=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. 
- ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_AssertionError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_b))) - - def test_quantized_add(self): - """ - Test that a quantized_add and acc_ops.quantize_per_tensor are traced as expected, - verifying the acc_out_tys are set as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.q_input = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=5, dtype=torch.quint8 - ) - self.q_other = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=10, dtype=torch.quint8 - ) - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> torch.Tensor: - return torch.ops.quantized.add( - self.q_input(input), - self.q_other(other), - scale=0.05, - zero_point=1, - ) - - m = TestModule() - input, other = torch.randn(2, 3, 4), torch.randn(2, 3, 4) - traced = acc_tracer.trace(m, [input, other]) - - input_ph = other_ph = q_input = q_other = q_add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "input": - input_ph = node - else: - self.assertTrue(str(node.target) == "other") - other_ph = node - elif ( - node.op == "call_function" - and node.target == acc_ops.quantize_per_tensor - ): - qparams = { - "scale": 1.0 / 128, - "zero_point": 5, - } - expected_md = acc_utils.build_raw_tensor_meta( - dtype=torch.quint8, - qparams=qparams, - ) - if node.kwargs["input"] == input_ph: - q_input = node - else: - self.assertTrue(node.kwargs["input"] == other_ph) - q_other = node - qparams_copy = qparams.copy() - qparams_copy["zero_point"] = 10 - expected_md = expected_md._replace(qparams=qparams_copy) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - elif node.op == "call_function" and node.target == acc_ops.quantized_add: - self.assertEqual(node.kwargs["input"], q_input) - self.assertEqual(node.kwargs["other"], q_other) - qparams = { - "scale": 0.05, - "zero_point": 1, - } - expected_md = acc_utils.build_raw_tensor_meta(qparams=qparams) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - q_add = node - elif node.op == "output": - self.assertEqual(q_add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_quantized_mul(self): - """ - Test that a quantized_mul and acc_ops.quantize_per_tensor are traced as expected, - verifying the acc_out_tys are set as expected. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.q_input = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=5, dtype=torch.quint8 - ) - self.q_other = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=10, dtype=torch.quint8 - ) - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> torch.Tensor: - return torch.ops.quantized.mul( - self.q_input(input), - self.q_other(other), - scale=0.05, - zero_point=1, - ) - - m = TestModule() - input, other = torch.randn(2, 3, 4), torch.randn(2, 3, 4) - traced = acc_tracer.trace(m, [input, other]) - - input_ph = other_ph = q_input = q_other = q_add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "input": - input_ph = node - else: - self.assertTrue(str(node.target) == "other") - other_ph = node - elif ( - node.op == "call_function" - and node.target == acc_ops.quantize_per_tensor - ): - qparams = { - "scale": 1.0 / 128, - "zero_point": 5, - } - expected_md = acc_utils.build_raw_tensor_meta( - dtype=torch.quint8, - qparams=qparams, - ) - if node.kwargs["input"] == input_ph: - q_input = node - else: - self.assertTrue(node.kwargs["input"] == other_ph) - q_other = node - qparams_copy = qparams.copy() - qparams_copy["zero_point"] = 10 - expected_md = expected_md._replace(qparams=qparams_copy) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - elif node.op == "call_function" and node.target == acc_ops.quantized_mul: - self.assertEqual(node.kwargs["input"], q_input) - self.assertEqual(node.kwargs["other"], q_other) - qparams = { - "scale": 0.05, - "zero_point": 1, - } - expected_md = acc_utils.build_raw_tensor_meta(qparams=qparams) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - q_add = node - elif node.op == "output": - self.assertEqual(q_add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_cat(self): - """ - Test that torch.cat is traced correctly. - """ - - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.cat([a, a, b], 0) - - m = TestModule() - a, b = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, (a, b)) - - ph_a = ph_b = cat = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.cat) - self.assertEqual(node.kwargs["tensors"][0], ph_a) - self.assertEqual(node.kwargs["tensors"][1], ph_a) - self.assertEqual(node.kwargs["tensors"][2], ph_b) - self.assertEqual(node.kwargs["dim"], 0) - cat = node - elif node.op == "output": - self.assertEqual(cat, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(a, b), traced(a, b))) - - def test_square(self): - """ - Test that torch.square is traced correctly. - """ - self._make_acc_op_function_test(acc_ops.mul, torch.square) - - def test_reshape(self): - """ - Test that torch.reshape is traced correctly. 
- """ - self._make_acc_op_function_test(acc_ops.reshape, torch.reshape, (1, -1)) - # arg = (1, -1) - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.reshape(1, -1)) - # arg = ((1, -1)) - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.reshape((1, -1))) - - def test_transpose(self): - """ - Test that torch.transpose is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.permute, lambda x: torch.transpose(x, 1, 0) - ) - - def test_permute(self): - """ - Test that torch.permute is traced correctly. - """ - - def torch_permute(a, *dim): - return a.permute(*dim) - - self._make_acc_op_function_test(acc_ops.permute, torch_permute, 1, 0) - - def test_min_full_reduce(self): - """ - Test that test_min_full_reduce is traced correctly. - """ - self._make_acc_op_function_test(acc_ops.min_full_reduce, torch.min) - - def test_matmul(self): - """ - Test that torch.matmul is traced correctly. - """ - - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.matmul(a, b) - - m = TestModule() - a, b = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [a, b]) - - ph_a = ph_b = matmul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.matmul) - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - matmul = node - elif node.op == "output": - self.assertEqual(matmul, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(a, b), traced(a, b))) - - def test_bmm(self): - self._make_acc_op_function_test( - acc_ops.matmul, lambda x: torch.bmm(x, x), input_shape=(2, 4, 4) - ) - - def test_tile(self): - return self._make_acc_op_function_test( - acc_ops.tile, lambda x: torch.tile(x, (2, 1, 2)), input_shape=(1, 2) - ) - - def test_dropout(self): - self._make_acc_op_function_test( - None, - lambda x: nn.functional.dropout(x, training=False), - input_shape=(1, 2, 3), - ) - - def test_stochastic_depth(self): - self._make_acc_op_function_test( - None, - lambda x, p, mode, training: torchvision.ops.stochastic_depth( - x, p=p, mode=mode, training=training - ), - input_shape=(1, 2, 3), - p=0.5, - mode="row", - training=False, - ) - - def test_hardsigmoid(self): - self._make_acc_op_function_test( - acc_ops.hardsigmoid, - lambda x: nn.functional.hardsigmoid(x), - input_shape=(3, 4, 5), - ) - - def test_hardtanh(self): - self._make_acc_op_function_test( - acc_ops.hardtanh, - lambda x: nn.functional.hardtanh(x), - input_shape=(3, 4, 5), - ) - - def test_hardswish(self): - class TestModule(nn.Module): - def forward(self, x: torch.Tensor) -> torch.Tensor: - y = nn.functional.hardswish(x) - return y - - m = TestModule() - x = torch.randn(3, 4, 5) - traced = acc_tracer.trace(m, x) - ph_x = hardsigmoid_y = res_y = None - for node in traced.graph.nodes: - if node.op == "placeholder": - ph_x = node - elif node.op == "call_function" and node.target == acc_ops.hardsigmoid: - hardsigmoid_y = node - self.assertEqual(node.kwargs["input"], ph_x) - elif node.op == "call_function" and node.target == acc_ops.mul: - res_y = node - self.assertEqual(node.kwargs["input"], hardsigmoid_y) - self.assertEqual(node.kwargs["other"], ph_x) - elif node.op == "output": - 
self.assertEqual(node.args[0], res_y) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(x) - res = traced(x) - torch.testing.assert_allclose(ref, res) - - def test_add_with_alpha(self): - """ - Test that normalization works for torch add with alpha, which requires special - normalization handling. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - a1 = torch.add(a, b) - a2 = torch.add(a, b, alpha=1.0) - a3 = torch.add(a, b, alpha=0.5) - return a1, a2, a3 - - m = TestModule() - input_a = torch.randn(2, 3) - input_b = torch.randn(2, 3) - traced = acc_tracer.trace(m, [input_a, input_b]) - - ph_a = ph_b = add_1 = add_2 = add_3 = mul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - elif node.op == "call_function" and node.target == acc_ops.mul: - mul = node - self.assertEqual(node.kwargs["input"], ph_b) - self.assertEqual(node.kwargs["other"], 0.5) - elif node.op == "call_function" and node.target == acc_ops.add: - if add_1 is None: - add_1 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - elif add_2 is None: - add_2 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - elif add_3 is None: - add_3 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], mul) - else: - self.fail(f"Unexpected add: {node.format_node()}") - elif node.op == "output": - self.assertEqual(node.args[0][0], add_1) - self.assertEqual(node.args[0][1], add_2) - self.assertEqual(node.args[0][2], add_3) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(input_a, input_b) - res = traced(input_a, input_b) - self.assertTrue(torch.equal(ref[0], res[0])) - self.assertTrue(torch.equal(ref[1], res[1])) - self.assertTrue(torch.equal(ref[2], res[2])) - - def test_leaf_module_list(self): - """ - Test leaf_module_list is working properly. 
- """ - - class LeafModule(nn.Module): - def forward(self, x): - return x - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.mod = LeafModule() - - def forward(self, x): - return self.mod(x) - - x = torch.randn(1, 1) - mod = TestModule() - acc_mod = acc_tracer.trace( - mod, - [x], - leaf_module_list={LeafModule}, - ) - ph = leaf_module = None - for node in acc_mod.graph.nodes: - if node.op == "placeholder": - ph = node - elif node.op == "call_module": - leaf_module = node - self.assertEqual(leaf_module.target, "mod") - self.assertEqual(leaf_module.args[0], ph) - elif node.op == "output": - self.assertEqual(node.args[0], leaf_module) - else: - self.fail(f"Unexpected node: {node.format_node()}") - self.assertTrue(torch.equal(mod(x), acc_mod(x))) - - def test_sign(self): - self._make_acc_op_function_test(acc_ops.sign, torch.sign) - - def test_relu(self): - self._make_acc_op_function_test(acc_ops.relu, torch.relu) - - def test_leaky_relu(self): - self._make_acc_op_function_test( - acc_ops.leaky_relu, torch.nn.functional.leaky_relu - ) - - def test_elu(self): - self._make_acc_op_function_test(acc_ops.elu, torch.nn.functional.elu) - - def test_selu(self): - self._make_acc_op_function_test(acc_ops.selu, torch.nn.functional.selu) - - def test_softsign(self): - self._make_acc_op_function_test(acc_ops.softsign, torch.nn.functional.softsign) - - def test_sigmoid(self): - self._make_acc_op_function_test(acc_ops.sigmoid, torch.sigmoid) - - def test_sin(self): - self._make_acc_op_function_test(acc_ops.sin, torch.sin) - - def test_cos(self): - self._make_acc_op_function_test(acc_ops.cos, torch.cos) - - def test_tan(self): - self._make_acc_op_function_test(acc_ops.tan, torch.tan) - - def test_sinh(self): - self._make_acc_op_function_test(acc_ops.sinh, torch.sinh) - - def test_cosh(self): - self._make_acc_op_function_test(acc_ops.cosh, torch.cosh) - - def test_tanh(self): - self._make_acc_op_function_test(acc_ops.tanh, torch.tanh) - - def test_asin(self): - self._make_acc_op_function_test(acc_ops.asin, torch.asin) - - def test_acos(self): - self._make_acc_op_function_test(acc_ops.acos, torch.acos) - - def test_atan(self): - self._make_acc_op_function_test(acc_ops.atan, torch.atan) - - def test_exp(self): - self._make_acc_op_function_test(acc_ops.exp, torch.exp) - - def test_log(self): - self._make_acc_op_function_test(acc_ops.log, torch.log) - - def test_sqrt(self): - self._make_acc_op_function_test(acc_ops.sqrt, torch.sqrt) - - def test_reciprocal(self): - self._make_acc_op_function_test(acc_ops.reciprocal, torch.reciprocal) - - def test_abs(self): - self._make_acc_op_function_test(acc_ops.abs, torch.abs) - - def test_neg(self): - self._make_acc_op_function_test(acc_ops.neg, torch.neg) - - def test_floor(self): - self._make_acc_op_function_test(acc_ops.floor, torch.floor) - - def test_ceil(self): - self._make_acc_op_function_test(acc_ops.ceil, torch.ceil) - - def test_softmax(self): - self._make_acc_op_function_test(acc_ops.softmax, torch.nn.functional.softmax) - - def test_tensor_squeeze(self): - self._make_acc_op_function_test(acc_ops.squeeze, lambda x: x.squeeze()) - - def test_torch_squeeze(self): - self._make_acc_op_function_test(acc_ops.squeeze, lambda x: torch.squeeze(x)) - - def test_operator_mul(self): - self._make_acc_op_function_test(acc_ops.mul, lambda x: x * 7) - - def test_torch_mul(self): - self._make_acc_op_function_test(acc_ops.mul, lambda x: torch.mul(x, 7)) - - def test_div(self): - self._make_acc_op_function_test(acc_ops.div, lambda x: torch.div(x, 2)) - 
self._make_acc_op_function_test(acc_ops.div, lambda x: x / 2) - - def test_floor_div(self): - self._make_acc_op_function_test( - acc_ops.floor_div, lambda x: torch.div(x, 2, rounding_mode="floor") - ) - - def test_trunc_div(self): - self._make_acc_op_function_test( - acc_ops.trunc_div, lambda x: torch.div(x, 2, rounding_mode="trunc") - ) - self._make_acc_op_function_test( - acc_ops.trunc_div, lambda x: torch.floor_divide(x, 2) - ) - - def test_view(self): - """ - Test that Tensor.view is traced correctly. - """ - - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.view(1, -1)) - - def test_narrow(self): - """ - Test that torch.narrow is traced correctly. - """ - return self._make_acc_op_function_test( - acc_ops.slice_tensor, - torch.narrow, - validate_same_kwargs=False, - dim=1, - start=1, - length=2, - ) - - def test_pow(self): - self._make_acc_op_function_test(acc_ops.pow, torch.pow, exponent=2) - - def test_size(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a): - idx = a.size(1) - return a.shape[idx] - - m = TestModule() - a = torch.randn(2, 1, 4) - traced = acc_tracer.trace(m, [a]) - - ph_a = size_1 = size_2 = getitem_1 = getitem_2 = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(node.target == "a") - ph_a = node - elif node.op == "call_function" and node.target == acc_ops.size: - if size_1: - size_2 = node - self.assertTrue(size_2.kwargs["input"] is ph_a) - else: - size_1 = node - self.assertTrue(size_1.kwargs["input"] is ph_a) - elif node.op == "call_function" and node.target == acc_ops.getitem: - if getitem_1: - getitem_2 = node - self.assertTrue(getitem_2.kwargs["idx"] == getitem_1) - self.assertTrue(getitem_2.kwargs["input"] == size_2) - else: - getitem_1 = node - self.assertTrue(getitem_1.kwargs["idx"] == 1) - self.assertTrue(getitem_1.kwargs["input"] == size_1) - elif node.op == "output": - self.assertEqual(node.args[0], getitem_2) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(a) - res = traced(a) - self.assertEqual(ref, res) - - def test_flatten(self): - """ - Test that torch.flatten is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.flatten, torch.flatten, start_dim=1, end_dim=1 - ) - self._make_acc_op_function_test(acc_ops.flatten, lambda x: x.flatten()) - - def test_topk_multi_output(self): - """ - Test that torch.topk multi outputs work. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return torch.topk(a, 3)[1] - - m = TestModule() - input_a = torch.randn(10) - traced = acc_tracer.trace(m, [input_a]) - - ph_a = topk = getitem = None - for node in traced.graph.nodes: - if node.op == "placeholder" and str(node.target) == "a": - ph_a = node - elif node.op == "call_function" and node.target == acc_ops.topk: - topk = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["k"], 3) - elif node.op == "call_function" and node.target == acc_ops.getitem: - getitem = node - self.assertEqual(node.kwargs["input"], topk) - self.assertEqual(node.kwargs["idx"], 1) - elif node.op == "output": - self.assertEqual(node.args[0], getitem) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input_a), traced(input_a))) - - def test_addmm_with_alpha_beta(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward( - self, input: torch.Tensor, a: torch.Tensor, b: torch.Tensor - ) -> torch.Tensor: - return torch.addmm(input, a, b, alpha=1.2, beta=1.1) - - m = TestModule() - input, a, b = torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, a, b]) - - ph_in = ph_a = ph_b = mm = add = mm_mul = add_mul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.matmul: - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - mm = node - elif node.target == acc_ops.add: - self.assertEqual(node.kwargs["input"], mm_mul) - self.assertEqual(node.kwargs["other"], add_mul) - add = node - elif mm_mul: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], 1.1) - add_mul = node - else: - self.assertEqual(node.kwargs["input"], mm) - self.assertEqual(node.kwargs["other"], 1.2) - mm_mul = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - torch.testing.assert_allclose(m(input, a, b), traced(input, a, b)) - - def test_log1p(self): - class TestModule(torch.nn.Module): - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.log1p(input) - - m = TestModule().eval() - input = torch.tensor([[1.2, 0.3, -0.4]]) - traced = acc_tracer.trace(m, [input]) - - ph_in = add = log = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.add: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], 1) - add = node - else: - self.assertEqual(node.target, acc_ops.log) - self.assertEqual(node.kwargs["input"], add) - log = node - elif node.op == "output": - self.assertEqual(log, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - torch.testing.assert_allclose(m(input), traced(input)) - - def test_addmm(self): - class TestModule(torch.nn.Module): - def forward( - self, input: torch.Tensor, a: torch.Tensor, b: torch.Tensor - ) -> torch.Tensor: - return torch.addmm(input, a, b) - - m = TestModule() - input, a, b = torch.randn(2, 2), 
torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, a, b]) - - ph_in = ph_a = ph_b = mm = add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.matmul: - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - mm = node - else: - self.assertEqual(node.target, acc_ops.add) - self.assertEqual(node.kwargs["input"], mm) - self.assertEqual(node.kwargs["other"], ph_in) - add = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, a, b), traced(input, a, b))) - - def test_gelu(self): - return self._make_acc_op_function_test(acc_ops.gelu, torch.nn.functional.gelu) - - @parameterized.expand( - [ - (1, True), - (1, False), - (None, False), - ] - ) - def test_argmin(self, dim, keepdim): - class TestModule(torch.nn.Module): - def __init__(self, dim, keepdim): - super().__init__() - self.dim = dim - self.keepdim = keepdim - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.argmin(input, dim=self.dim, keepdim=self.keepdim) - - m = TestModule(dim, keepdim) - input = torch.randn(2, 2) - traced = acc_tracer.trace(m, [input]) - - ph_in = flatten = topk = getitem = squeeze = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.flatten: - self.assertEqual(node.kwargs["input"], ph_in) - flatten = node - elif node.target == acc_ops.topk: - self.assertEqual( - node.kwargs["input"], flatten if flatten else ph_in - ) - topk = node - elif node.target == acc_ops.getitem: - self.assertEqual(node.kwargs["input"], topk) - getitem = node - elif node.target == acc_ops.squeeze: - self.assertEqual(node.kwargs["input"], getitem) - squeeze = node - elif node.op == "output": - self.assertEqual(squeeze if squeeze else getitem, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - if dim is None: - self.assertTrue(flatten is not None) - if not keepdim: - self.assertTrue(squeeze is not None) - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_t(self): - """ - Test Tensor.t() is traced correctly. 
- """ - self._make_acc_op_function_test(acc_ops.permute, lambda x: x.t()) - self._make_acc_op_function_test( - acc_ops.permute, lambda x: x.t(), input_shape=(3,) - ) - - def test_split_size(self): - self._make_acc_op_function_test( - acc_ops.split, - torch.split, - validate_same_kwargs=False, - split_size_or_sections=2, - dim=1, - ) - - def test_split_sections(self): - class TestModule(torch.nn.Module): - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.split(input, [2, 5, 3], 1) - - m = TestModule() - input = torch.randn(1, 10) - traced = acc_tracer.trace(m, [input]) - - ph_in = slice_node_0 = slice_node_1 = slice_node_2 = None - tuple_construct_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.slice_tensor: - self.assertEqual(node.kwargs["input"], ph_in) - if slice_node_0: - if slice_node_1: - slice_node_2 = node - else: - slice_node_1 = node - else: - slice_node_0 = node - else: - self.assertEqual(node.target, acc_ops.tuple_construct) - self.assertEqual( - node.kwargs["tensors"], - (slice_node_0, slice_node_1, slice_node_2), - ) - tuple_construct_node = node - elif node.op == "output": - self.assertEqual(tuple_construct_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref_output = m(input) - output = traced(input) - for i, j in zip(ref_output, output): - self.assertTrue(torch.equal(i, j)) - - @parameterized.expand( - [ - ("neg_1", -1, 1, 3), - ("neg_2", -2, 1, 3), - ("neg_4", -4, 1, 1), - ] - ) - def test_negative_slicing(self, _, dim, start, length): - """ - Test that slicing with negative dims works. - """ - self._make_acc_op_function_test( - acc_ops.slice_tensor, - torch.narrow, - input_shape=(2, 3, 4, 5), - validate_same_kwargs=False, - dim=dim, - start=start, - length=length, - ) - - def test_list_input(self): - """ - Test that list inputs are traced correctly. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: List[torch.Tensor]) -> torch.Tensor: - return a[0] + a[1] - - m = TestModule() - input = [torch.randn(2, 3), torch.randn(2, 3)] - traced = acc_tracer.trace(m, [input]) - - ph = getitem_0 = getitem_1 = add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "call_function" and node.target == acc_ops.getitem: - self.assertTrue(node.kwargs["idx"] == 0 or node.kwargs["idx"] == 1) - if node.kwargs["idx"] == 0: - getitem_0 = node - else: - getitem_1 = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.add) - self.assertEqual(node.kwargs["input"], getitem_0) - self.assertEqual(node.kwargs["other"], getitem_1) - add = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - # Check the tensor metadatas are correct given the input is a list. - self.assertTrue(isinstance(ph.meta["tensor_meta"], list)) - self.assertEqual(len(ph.meta["tensor_meta"]), 2) - self.assertEqual(getitem_0.meta["tensor_meta"], ph.meta["tensor_meta"][0]) - self.assertEqual(getitem_1.meta["tensor_meta"], ph.meta["tensor_meta"][1]) - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_mobilenet_v3(self): - """ - Test that we can trace mobilenet v3 small and run/compare against the untraced version. 
- """ - m = torchvision.models.mobilenet_v3_small(pretrained=True) - self._make_model_unit_test(m, enable_allclose=True) - - def test_mobilenet_v2(self): - """ - Test that we can trace mobilenet v2 small and run/compare against the untraced version. - """ - m = torchvision.models.mobilenet_v2(pretrained=True) - self._make_model_unit_test(m) - - def test_vgg16(self): - """ - Test that we can trace vgg16 and run/compare against the untraced version. - """ - m = torchvision.models.vgg16(pretrained=True) - self._make_model_unit_test(m) - - def test_resnet18(self): - """ - Test that we can trace resnet18 and run/compare against the untraced version. - """ - m = torchvision.models.resnet18(pretrained=True) - self._make_model_unit_test(m) - - def test_resnext50_32x4d(self): - """ - Test that we can trace resnext and run/compare against the untraced version. - """ - m = torchvision.models.resnext50_32x4d(pretrained=True) - self._make_model_unit_test(m) - - def test_cumsum(self): - self._make_acc_op_function_test(acc_ops.cumsum, torch.cumsum, dim=1) - self._make_acc_op_function_test( - acc_ops.cumsum, torch.cumsum, dim=1, dtype=torch.float - ) - - def test_chunk(self): - self._make_acc_op_function_test(acc_ops.chunk, torch.chunk, chunks=2, dim=0) - - def test_retrace_reshape(self): - """ - Retrace reshape to verify it's retraceable. - """ - - class TestModule(torch.nn.Module): - def forward(self, a: torch.Tensor) -> torch.Tensor: - return a.reshape(a.size()[0], 1, 2) - - m = TestModule() - a = torch.randn(2, 2) - gm = acc_tracer.trace(m, [a]) - self.assertTrue(torch.equal(m(a), gm(a))) - gm_retrace = acc_tracer.trace(gm, [a]) - self.assertTrue(torch.equal(m(a), gm_retrace(a))) - - def test_all_acc_ops_registered(self): - self.assertEqual( - acc_normalizer._acc_ops, - { - acc_ops.linear, - acc_ops.max_pool2d, - acc_ops.flatten, - acc_ops.adaptive_avg_pool2d, - acc_ops.avg_pool2d, - acc_ops.add, - acc_ops.min_full_reduce, - acc_ops.min_dim_reduce, - acc_ops.minimum, - acc_ops.cat, - acc_ops.softmax, - acc_ops.sign, - acc_ops.permute, - acc_ops.matmul, - acc_ops.quantize_per_tensor, - acc_ops.quantize_per_channel, - acc_ops.quantized_add, - acc_ops.quantized_mul, - acc_ops.dequantize, - acc_ops.sub, - acc_ops.mul, - acc_ops.div, - acc_ops.floor_div, - acc_ops.trunc_div, - acc_ops.pow, - acc_ops.relu, - acc_ops.leaky_relu, - acc_ops.elu, - acc_ops.selu, - acc_ops.softsign, - acc_ops.tuple_construct, - acc_ops.unsqueeze, - acc_ops.sigmoid, - acc_ops.sum, - acc_ops.prod, - acc_ops.max_full_reduce, - acc_ops.max_dim_reduce, - acc_ops.maximum, - acc_ops.sinh, - acc_ops.cosh, - acc_ops.tanh, - acc_ops.asin, - acc_ops.acos, - acc_ops.atan, - acc_ops.exp, - acc_ops.log, - acc_ops.sqrt, - acc_ops.reciprocal, - acc_ops.abs, - acc_ops.neg, - acc_ops.floor, - acc_ops.ceil, - acc_ops.size, - acc_ops.split, - acc_ops.conv2d, - acc_ops.batch_norm, - acc_ops.embedding_bag, - acc_ops.embedding_bag_byte_rowwise_offsets, - acc_ops.embedding_bag_4bit_rowwise_offsets, - acc_ops.contiguous, - acc_ops.pad, - acc_ops.sin, - acc_ops.cos, - acc_ops.tan, - acc_ops.topk, - acc_ops.getitem, - acc_ops.squeeze, - acc_ops.tile, - acc_ops.reshape, - acc_ops.quantized_linear, - acc_ops.quantized_conv2d, - acc_ops.quantized_batch_norm2d, - acc_ops.to_dtype, - acc_ops.clamp, - acc_ops.layer_norm, - acc_ops.linalg_norm, - acc_ops.slice_tensor, - acc_ops.hardsigmoid, - acc_ops.mean, - acc_ops.hardtanh, - acc_ops.gelu, - acc_ops.cumsum, - acc_ops.chunk, - acc_ops.rescale_quantize_per_tensor, - acc_ops.rescale_quantize_per_channel, - 
acc_ops.nan_to_num, - }, - ) diff --git a/test/jit/fixtures/test_versioned_gelu_out_v9.ptl b/test/jit/fixtures/test_versioned_gelu_out_v9.ptl new file mode 100644 index 000000000000..208ae5100757 Binary files /dev/null and b/test/jit/fixtures/test_versioned_gelu_out_v9.ptl differ diff --git a/test/jit/fixtures/test_versioned_gelu_v9.ptl b/test/jit/fixtures/test_versioned_gelu_v9.ptl new file mode 100644 index 000000000000..5e4ffb20f823 Binary files /dev/null and b/test/jit/fixtures/test_versioned_gelu_v9.ptl differ diff --git a/test/jit/fixtures_srcs/fixtures_src.py b/test/jit/fixtures_srcs/fixtures_src.py index 545152b6a3a0..dff23702311a 100644 --- a/test/jit/fixtures_srcs/fixtures_src.py +++ b/test/jit/fixtures_srcs/fixtures_src.py @@ -42,3 +42,18 @@ def __init__(self): def forward(self, a: Union[int, float, complex], b: Union[int, float, complex], out: torch.Tensor): return torch.logspace(a, b, out=out) + +class TestVersionedGeluV9(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch._C._nn.gelu(x) + +class TestVersionedGeluOutV9(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + out = torch.zeros_like(x) + return torch._C._nn.gelu(x, out=out) diff --git a/test/jit/fixtures_srcs/generate_models.py b/test/jit/fixtures_srcs/generate_models.py index 36b6b5ffe684..e00153745138 100644 --- a/test/jit/fixtures_srcs/generate_models.py +++ b/test/jit/fixtures_srcs/generate_models.py @@ -52,7 +52,7 @@ def div_Tensor_0_3(self: Tensor, other: Tensor) -> Tensor: fbcode/caffe2/torch/csrc/jit/mobile/upgrader_mobile.cpp ``` -python pytorch/tools/codegen/operator_versions/gen_mobile_upgraders.py +python pytorch/torchgen/operator_versions/gen_mobile_upgraders.py ``` 4. Generate the test to cover upgrader. @@ -94,6 +94,8 @@ def div_Tensor_0_3(self: Tensor, other: Tensor) -> Tensor: TestVersionedLinspaceOutV7(): "aten::linspace.out", TestVersionedLogspaceV8(): "aten::logspace", TestVersionedLogspaceOutV8(): "aten::logspace.out", + TestVersionedGeluV9(): "aten::gelu", + TestVersionedGeluOutV9(): "aten::gelu.out", } """ diff --git a/test/jit/myexception.py b/test/jit/myexception.py new file mode 100644 index 000000000000..5937bd3c91b7 --- /dev/null +++ b/test/jit/myexception.py @@ -0,0 +1,8 @@ +r""" +Define exceptions used in test_exception.py. We define them in a +separate file on purpose to make sure the fully qualified exception class name +is captured correctly in such cases. 
+""" +class MyKeyError(KeyError): + def __init__(self, msg): + super(KeyError, self).__init__(msg) diff --git a/test/jit/test_alias_analysis.py b/test/jit/test_alias_analysis.py index 00c015ccfab2..2f8216eaaf9a 100644 --- a/test/jit/test_alias_analysis.py +++ b/test/jit/test_alias_analysis.py @@ -42,3 +42,52 @@ def foo(x): output = next(graph.outputs()) self.assertTrue(alias_db.may_contain_alias(ten_construct, output)) self.assertFalse(alias_db.may_contain_alias(next(graph.inputs()), ten_construct)) + + def test_recursive_calls(self): + @torch.jit.script + def foo(x, y): + x.add_(1) + return x + y + + @torch.jit.script + def caller(): + a = torch.rand([2, 2]) + b = torch.ones([2, 2]) + out1 = foo(a, b) + c = torch.rand([1]) + d = torch.ones([2]) + out2 = foo(d, c) + return out1, out2 + + isFrozen = False + descend_function_calls = True + alias_db = caller.graph.alias_db(isFrozen, descend_function_calls) + func_calls = caller.graph.findAllNodes("prim::CallFunction") + self.assertEqual(len(func_calls), 2) + for node in func_calls: + inps = list(node.inputs()) + self.assertTrue(alias_db.has_writers(inps[1])) + self.assertFalse(alias_db.has_writers(inps[2])) + + class Mod(torch.nn.Module): + def forward(self): + a = torch.rand([2, 2]) + b = torch.ones([2, 2]) + out1 = self.foo2(a, b) + c = torch.rand([1]) + d = torch.ones([2]) + out2 = self.foo2(d, c) + return out1, out2 + + def foo2(self, x, y): + x.add_(1) + return x + y + + mod = torch.jit.script(Mod()) + alias_db = mod.graph.alias_db(isFrozen, descend_function_calls) + func_calls = mod.graph.findAllNodes("prim::CallMethod") + self.assertEqual(len(func_calls), 2) + for node in func_calls: + inps = list(node.inputs()) + self.assertTrue(alias_db.has_writers(inps[1])) + self.assertFalse(alias_db.has_writers(inps[2])) diff --git a/test/jit/test_autodiff.py b/test/jit/test_autodiff.py new file mode 100644 index 000000000000..518826f602e1 --- /dev/null +++ b/test/jit/test_autodiff.py @@ -0,0 +1,51 @@ +# Owner(s): ["oncall: jit"] + +import torch + +from torch.testing._internal.jit_utils import JitTestCase +from typing import List + +class TestAutodiffJit(JitTestCase): + def test_undefined_tensor_lists(self): + def fn(tensor_list: List[torch.Tensor], add_tensor): + cat = torch.cat(tensor_list, dim=1) + r = torch.sin(cat + add_tensor) + return r + + fn_s = torch.jit.script(fn) + + a = torch.rand((3, 6), requires_grad=True) + b = torch.rand((3, 10), requires_grad=True) + x = [a, b] + y = torch.rand((3, 16), requires_grad=True) + + ret = fn_s(x, y) + ret.sum().backward() + ret = fn_s(x, y) + ret.sum().backward() + + ret = fn_s(x, y) + s = ret.sum() + + # backward_fn expects 2 inputs: (grad_output, current_grad_r) + # current_grad_r is provided because we need to add this contribution + # to grad_r when we return it. 
+ backward_fn = s.grad_fn.next_functions[0][0] + + # check behavior with defined tensor + grad_out = torch.rand((3, 16)) + grad_inputs = backward_fn(grad_out, None) + + # expect 3 tensors: grad_y, grad_a, grad_b + self.assertEqual(3, len(grad_inputs)) + for x in grad_inputs: + self.assertTrue(isinstance(x, torch.Tensor)) + + # now test with undefined grad_out + grad_inputs = backward_fn(None, None) + + # expect all of them to be None + self.assertEqual(3, len(grad_inputs)) + for x in grad_inputs: + if x is not None: + self.assertEqual(0, torch.max(torch.abs(x)).item()) diff --git a/test/jit/test_autodiff_subgraph_slicing.py b/test/jit/test_autodiff_subgraph_slicing.py index 8454f786edb8..4b72fc6f4561 100644 --- a/test/jit/test_autodiff_subgraph_slicing.py +++ b/test/jit/test_autodiff_subgraph_slicing.py @@ -447,7 +447,7 @@ def test_aliased_outputs(self): %0 : int[] = prim::Constant[value=[2, 2, 1]]() %1 : int = prim::Constant[value=0]() %2 : Tensor = aten::t(%b) - %3 : Tensor = aten::gelu(%2) + %3 : Tensor = aten::relu(%2) %4 : (Tensor, Tensor, Tensor[]) = prim::TupleConstruct(%b, %3, %2) return (%4) """ @@ -471,7 +471,7 @@ def test_aliased_outputs(self): %1 : int = prim::Constant[value=0]() %d : Tensor = aten::t(%c) %2 : Tensor = aten::t(%b) - %3 : Tensor = aten::gelu(%2) + %3 : Tensor = aten::relu(%2) %4 : (Tensor, Tensor, Tensor[]) = prim::TupleConstruct(%3, %2, %d, %b, %c, %b) return (%4) """ diff --git a/test/jit/test_backends.py b/test/jit/test_backends.py index 086a44eee3f1..0ed7d0c19b2d 100644 --- a/test/jit/test_backends.py +++ b/test/jit/test_backends.py @@ -81,7 +81,7 @@ def setUp(self): # Subclasses are expected to set up three variables in their setUp methods: # module - a regular, Python version of the module being tested # scripted_module - a scripted version of module - # lowered_modle - a version of module lowered to a backend + # lowered_module - a version of module lowered to a backend def check_function(self, function_name, input): """ @@ -498,7 +498,7 @@ def setUp(self): # Subclasses are expected to set up four variables in their setUp methods: # module - a regular, Python version of the module being tested # scripted_module - a scripted version of module - # lowered_modle - a version of module lowered to a backend + # lowered_module - a version of module lowered to a backend # mobile_module - a module with a format that Pytorch Mobile can execute def check_forward(self, input): diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index 56c3831341ee..09a58b3cd735 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -1430,8 +1430,8 @@ def __init__(self, val): class Mod(nn.Module): def __init__(self): super(Mod, self).__init__() - self.mod1 = ValHolder(1) - self.mod2 = ValHolder(2) + self.mod1 = ValHolder("1") + self.mod2 = ValHolder("2") def forward(self, cond: bool): if cond: diff --git a/test/jit/test_complex.py b/test/jit/test_complex.py index e4137067a8ea..3b7d34427167 100644 --- a/test/jit/test_complex.py +++ b/test/jit/test_complex.py @@ -328,3 +328,33 @@ def tensor_imag(x): t = torch.randn(2, 3, dtype=torch.cdouble) self.checkScript(tensor_real, (t, )) self.checkScript(tensor_imag, (t, )) + + def test_binary_op_complex_tensor(self): + def mul(x: complex, y: torch.Tensor): + return x * y + + def add(x: complex, y: torch.Tensor): + return x + y + + def eq(x: complex, y: torch.Tensor): + return x == y + + def ne(x: complex, y: torch.Tensor): + return x != y + + def sub(x: complex, y: torch.Tensor): + return x - y + + def 
div(x: complex, y: torch.Tensor): + return x - y + + ops = [mul, add, eq, ne, sub, div] + + for shape in [(1, ), (2, 2)]: + x = 0.71 + 0.71j + y = torch.randn(shape, dtype=torch.cfloat) + for op in ops: + eager_result = op(x, y) + scripted = torch.jit.script(op) + jit_result = scripted(x, y) + self.assertEqual(eager_result, jit_result) diff --git a/test/jit/test_custom_operators.py b/test/jit/test_custom_operators.py index cdb973590cb4..feb3b8eb8fb6 100644 --- a/test/jit/test_custom_operators.py +++ b/test/jit/test_custom_operators.py @@ -50,10 +50,6 @@ def test_default_arguments_are_used(self): output = torch.ops._test.leaky_relu(torch.tensor([-1.0, 1.0])) self.assertEqual(output, torch.tensor([-0.01, 1])) - def test_only_kwargs(self): - output = torch.ops._test.leaky_relu(self=torch.tensor(-1.0)) - self.assertEqual(output, torch.tensor(-0.01)) - def test_passing_too_many_args(self): with self.assertRaisesRegexWithHighlight( RuntimeError, @@ -78,14 +74,6 @@ def test_passing_one_positional_but_not_the_second(self): ): torch.ops.aten.type_as(torch.ones(5, 5)) - def test_passing_an_argument_both_as_positional_and_kwarg(self): - with self.assertRaisesRegexWithHighlight( - RuntimeError, - "Argument 'self' specified both as positional and keyword argument", - "" - ): - torch.ops._test.leaky_relu(torch.ones(5), self=torch.ones(5)) - def test_passing_unknown_kwargs(self): with self.assertRaisesRegexWithHighlight( RuntimeError, diff --git a/test/jit/test_device_analysis.py b/test/jit/test_device_analysis.py index efdc2fc92e6c..3ce42e171b65 100644 --- a/test/jit/test_device_analysis.py +++ b/test/jit/test_device_analysis.py @@ -6,6 +6,7 @@ import torch from torch.testing._internal.common_utils import TEST_CUDA from torch.testing._internal.jit_utils import JitTestCase +from torch.jit._passes._property_propagation import apply_input_props_using_example try: from torchvision import models @@ -19,40 +20,6 @@ "instead." ) -# TODO: Delete this when PR #67786 is merged. -def apply_input_props_using_example(graph, example_input): - """ - Applies properties for each tensor in the graph inputs - using the example supplied. - """ - graph_inputs = list(graph.inputs()) - if len(graph_inputs) == 0: - return - - # Strip self args off for methods - in_0 = graph_inputs[0] - if isinstance(in_0.type(), torch._C.ClassType) and in_0.debugName() == "self": - graph_inputs = graph_inputs[1:] - - if not len(graph_inputs) == len(example_input): - raise RuntimeError( - "Number of inputs in graph does not match number of inputs in the example" - ) - - for i, (graph_i, example_i) in enumerate(zip(graph_inputs, example_input)): - if example_i is None: - continue # Skip the type check - - if isinstance(example_i, torch.Tensor) != isinstance( - graph_i.type(), torch.TensorType - ): - raise RuntimeError( - f"Input {i} does not match type of example", graph_i, example_i - ) - - if isinstance(example_i, torch.Tensor): - graph_i.setType(torch.TensorType.create_from_tensor(example_i)) # type: ignore[arg-type] - class TestDeviceAnalysis(JitTestCase): @classmethod diff --git a/test/jit/test_exception.py b/test/jit/test_exception.py new file mode 100644 index 000000000000..dce38e3be892 --- /dev/null +++ b/test/jit/test_exception.py @@ -0,0 +1,176 @@ +# Owner(s): ["oncall: jit"] +from torch.testing._internal.common_utils import TestCase +import torch +from torch import nn + +r""" +Test TorchScript exception handling. 
+""" +class TestException(TestCase): + def test_pyop_exception_message(self): + class Foo(torch.jit.ScriptModule): + def __init__(self): + super(Foo, self).__init__() + self.conv = nn.Conv2d(1, 10, kernel_size=5) + + @torch.jit.script_method + def forward(self, x): + return self.conv(x) + foo = Foo() + # testing that the correct error message propagates + with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): + foo(torch.ones([123])) # wrong size + + def test_builtin_error_messsage(self): + with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): + @torch.jit.script + def close_match(x): + return x.masked_fill(True) + + with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " + "supported in TorchScript"): + @torch.jit.script + def unknown_op(x): + torch.set_anomaly_enabled(True) + return x + + def test_exceptions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + if bool(cond): + raise ValueError(3) + return 1 + ''') + + cu.foo(torch.tensor(0)) + with self.assertRaisesRegex(torch.jit.Error, "3"): + cu.foo(torch.tensor(1)) + + def foo(cond): + a = 3 + if bool(cond): + raise ArbitraryError(a, "hi") + if 1 == 2: + raise ArbitraryError + return a + + with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): + torch.jit.script(foo) + + def exception_as_value(): + a = Exception() + print(a) + + with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): + torch.jit.script(exception_as_value) + + @torch.jit.script + def foo_no_decl_always_throws(): + raise RuntimeError("Hi") + + # function that has no declared type but always throws set to None + output_type = next(foo_no_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "NoneType") + + @torch.jit.script + def foo_decl_always_throws(): + # type: () -> Tensor + raise Exception("Hi") + + output_type = next(foo_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "Tensor") + + def foo(): + raise 3 + 4 + + with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): + torch.jit.script(foo) + + # a escapes scope + @torch.jit.script + def foo(): + if 1 == 1: + a = 1 + else: + if 1 == 1: + raise Exception("Hi") + else: + raise Exception("Hi") + return a + self.assertEqual(foo(), 1) + + @torch.jit.script + def tuple_fn(): + raise RuntimeError("hello", "goodbye") + + with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): + tuple_fn() + + @torch.jit.script + def no_message(): + raise RuntimeError + + with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): + no_message() + + def test_assertions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + assert bool(cond), "hi" + return 0 + ''') + + cu.foo(torch.tensor(1)) + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + cu.foo(torch.tensor(0)) + + @torch.jit.script + def foo(cond): + assert bool(cond), "hi" + + foo(torch.tensor(1)) + # we don't currently validate the name of the exception + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + foo(torch.tensor(0)) + + def test_python_op_exception(self): + @torch.jit.ignore + def python_op(x): + raise Exception("bad!") + + @torch.jit.script + def fn(x): + return python_op(x) + + with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): + fn(torch.tensor(4)) + + def test_dict_expansion_raises_error(self): + def fn(self): + d = {"foo": 1, "bar": 
2, "baz": 3} + return {**d} + + with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, + "Dict expansion "): + torch.jit.script(fn) + + def test_custom_python_exception(self): + class MyValueError(ValueError): + def __init__(self, msg): + super(MyValueError, self).__init__(msg) + + @torch.jit.script + def fn(): + raise MyValueError("test custom exception") + + with self.assertRaisesRegex(torch.jit.Error, "jit.test_exception.MyValueError: test custom exception"): + fn() + + def test_custom_python_exception_defined_elsewhere(self): + from jit.myexception import MyKeyError + + @torch.jit.script + def fn(): + raise MyKeyError("This is a user defined key error") + with self.assertRaisesRegex(torch.jit.Error, "jit.myexception.MyKeyError: This is a user defined key error"): + fn() diff --git a/test/jit/test_export_modes.py b/test/jit/test_export_modes.py index 70d2193201a3..dbf10cddc059 100644 --- a/test/jit/test_export_modes.py +++ b/test/jit/test_export_modes.py @@ -15,7 +15,7 @@ pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) from torch.testing._internal.jit_utils import JitTestCase -from torch.testing._internal.common_utils import skipIfNoLapack +from torch.testing._internal.common_utils import skipIfNoLapack, skipIfCaffe2, skipIfNoCaffe2 if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -68,8 +68,9 @@ def foo(a): x = torch.ones(3) torch.onnx._export(foo, (x,), f) + @skipIfNoCaffe2 @skipIfNoLapack - def test_aten_fallback(self): + def test_caffe2_aten_fallback(self): class ModelWithAtenNotONNXOp(nn.Module): def forward(self, x, y): abcd = x + y @@ -84,6 +85,25 @@ def forward(self, x, y): do_constant_folding=False, operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK) + @skipIfCaffe2 + @skipIfNoLapack + def test_aten_fallback(self): + class ModelWithAtenNotONNXOp(nn.Module): + def forward(self, x, y): + abcd = x + y + defg = torch.linalg.qr(abcd) + return defg + + x = torch.rand(3, 4) + y = torch.rand(3, 4) + torch.onnx.export_to_pretty_string( + ModelWithAtenNotONNXOp(), (x, y), + add_node_names=False, + do_constant_folding=False, + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + # support for linalg.qr was added in later op set versions. + opset_version=9) + # torch.fmod is using to test ONNX_ATEN. # If you plan to remove fmod from aten, or found this test failed. # please contact @Rui. diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index b4cc5a10a075..599ada43f1b2 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -1678,6 +1678,36 @@ def make_prediction(self, x): scripted_mod = torch.jit.freeze(scripted_mod, preserved_attrs=["make_prediction", "amt"]) FileCheck().check("conv").check_not("aten::batch_norm").run(scripted_mod.make_prediction.graph) + @unittest.skipIf(True, "Caching allocator leak sometimes causes failures") + @unittest.skipIf(not TEST_CUDA, "Optimization currently only run for GPU") + def test_conv_bn_folding_autocast_scenario_cuda(self): + # CUDA conv takes input tensors which must all be the same dtype, + # which can cause issues if folding produces inputs of different dtypes. 
+ + class ConvBN(torch.nn.Module): + def __init__(self, in_channels, out_channels, **kwargs): + super(ConvBN, self).__init__() + self.conv = torch.nn.Conv2d(in_channels, out_channels, bias=False, dtype=torch.half, **kwargs) + self.bn = torch.nn.BatchNorm2d(out_channels, eps=0.001, dtype=torch.float) + + def forward(self, x): + return self.bn(self.conv(x)) + + mod_eager = ConvBN(3, 32, kernel_size=3, stride=2).cuda().eval() + scripted_mod = torch.jit.script(mod_eager) + scripted_mod = torch.jit.freeze(scripted_mod) + FileCheck().check("conv").check_not("aten::batch_norm").run(scripted_mod.graph) + conv_node = scripted_mod.graph.findNode("aten::conv2d", True) + self.assertTrue(conv_node is not None) + bias_input = conv_node.namedInput("bias") + self.assertTrue(bias_input is not None) + self.assertTrue(bias_input.type().dtype() == torch.half) + + x = torch.rand((3, 3, 32, 32), dtype=torch.half).cuda() + + self.assertEqual(mod_eager(x), scripted_mod(x), atol=1e-2, rtol=1e-2) + self.assertEqual(mod_eager(x), scripted_mod(x), atol=1e-2, rtol=1e-2) + def test_conv_add_folding(self): @torch.no_grad() @@ -1760,7 +1790,32 @@ def forward(self, x): # add with different dtype test_conv_fusion(use_bias, nn.Conv2d, False, pytorch_op, False, - add_tensor=torch.rand(1).to(torch.int), expect_success=False) + add_tensor=torch.tensor([2]).to(torch.int), expect_success=True) + + def test_conv_mul_add_bn(self): + class Conv_Mul_Add_Bn(nn.Module): + + def __init__(self, in_channels, out_channels, **kwargs): + super(Conv_Mul_Add_Bn, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, **kwargs) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + self.tensor1 = torch.tensor(2.2) + self.tensor2 = torch.tensor(2) + + def forward(self, x): + return self.bn(torch.add(torch.mul(self.conv(x), self.tensor1), self.tensor2)) + + input = torch.randn(8, 3, 64, 64) + model = Conv_Mul_Add_Bn(3, 32, kernel_size=3, stride=1).eval() + + with torch.no_grad(): + result = model(input) + traced_model = torch.jit.trace(model, input).eval() + traced_model = torch.jit.freeze(traced_model) + tresult = traced_model(input) + self.assertEqual(result, tresult) + FileCheck().check("conv").check_not("aten::batch_norm").run(traced_model.graph) + FileCheck().check("conv").check_not("aten::add").run(traced_model.graph) @unittest.skipIf(not TEST_CUDA, "Optimization currently only run for GPU") def test_linear_concat(self): diff --git a/test/jit/test_if_hoisting.py b/test/jit/test_if_hoisting.py deleted file mode 100644 index 939ceda3c56c..000000000000 --- a/test/jit/test_if_hoisting.py +++ /dev/null @@ -1,214 +0,0 @@ -# Owner(s): ["oncall: jit"] - -import torch -from torch.testing import FileCheck -from torch.testing._internal.jit_utils import JitTestCase - -if __name__ == "__main__": - raise RuntimeError( - "This test file is not meant to be run directly, use:\n\n" - "\tpython test/test_jit.py TESTNAME\n\n" - "instead." 
- ) - - -class TestIfHoisting(JitTestCase): - def test_if_hoist_basic(self): - def fn(x: bool, y: int): - if x: - z = y + 3 - else: - z = y + 3 - return z - - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - self.assertEqual(fn(True, 1), fn_script(True, 1)) - - def test_if_hoist_transposed_expr(self): - """ - Making sure that we can properly eliminate - an expression even if it is not at the start - of a block - """ - def fn(x: bool, y: int): - if x: - a = y + 3 - b = y * 2 - else: - b = y * 2 - a = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_if_hoist_swapped_expr(self): - """ - Making sure that the if statement - doesn't get fully eliminated here - """ - def fn(x: bool, y: int): - if x: - a = y + 3 - b = y * 2 - else: - a = y * 2 - b = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_if_hoist_reused_var(self): - """ - Making sure that cases where the python variable is reused - is handled correctly - """ - def fn(x: bool, y: int): - b = 6 - if x: - a = y + 3 - a = y * 2 - else: - a = y * 2 - b = y + 3 - return a, b - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::mul", 1, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1), fn_script(True, 1)) - self.assertEqual(fn(False, 5), fn_script(False, 5)) - - def test_no_hoist(self): - """ - Nothing should happen here, expressions are different - """ - def fn(x: bool, y: int, z: int): - if x: - a = y + 3 - else: - a = z + 3 - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - self.assertEqual(fn(True, 1, 3), fn_script(True, 1, 3)) - self.assertEqual(fn(False, 5, 10), fn_script(False, 5, 10)) - - def test_mutate_before(self): - """ - Make sure that if there is a mutation before the common - op, the hoist doesn't happen - """ - def fn(x: bool, y: torch.Tensor): - if x: - y.add_(8) - a = y + 3 - else: - a = y + 3 - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - 
FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - FileCheck().check_count("aten::add_", 1, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1), fn_script(True, t1)) - self.assertEqual(fn(False, t2), fn_script(False, t2)) - - def test_mutate_after(self): - """ - Check that the hoist can happen properly, and - that the output is still correct. - """ - def fn(x: bool, y: torch.Tensor): - if x: - b = 1 - a = y + 3 - y.add_(8) - else: - b = 2 - a = y + 3 - c = b + a - return a - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 1, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1.clone()), fn_script(True, t1.clone())) - self.assertEqual(fn(False, t2.clone()), fn_script(False, t2.clone())) - - def test_multiple_hoists(self): - """ - test that hoists that depend on other hoists are done correctly - """ - def fn(x: bool, y: torch.Tensor): - if x: - a = y + 3 - b = a + y - else: - a = y + 3 - b = a + y - c = b * 2 - return c - - fn_script = torch.jit.script(fn) - op_graph = fn_script.graph - self.run_pass("common_expression_hoisting", op_graph) - self.run_pass("dce", op_graph) - - FileCheck().check_count("prim::If", 0, exactly=True).run(op_graph) - FileCheck().check_count("aten::add", 2, exactly=True).run(op_graph) - - t1 = torch.Tensor([1]) - t2 = torch.Tensor([5, 6]) - self.assertEqual(fn(True, t1), fn_script(True, t1)) - self.assertEqual(fn(False, t2), fn_script(False, t2)) diff --git a/test/jit/test_misc.py b/test/jit/test_misc.py index bf3c3c3e71c1..4d10ad37aa65 100644 --- a/test/jit/test_misc.py +++ b/test/jit/test_misc.py @@ -12,6 +12,7 @@ import torch import torch.testing._internal.jit_utils import torch.nn as nn +from torch.testing._internal.common_utils import freeze_rng_state # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) @@ -171,6 +172,22 @@ def if_function(inp: torch.Tensor) -> Any: self.checkScript(if_function, (torch.randn(5),)) + def test_hacked_twin(self): + + def gen_data(): + with freeze_rng_state(): + return torch.randn(10), torch.randint(10, (20,)), torch.randn(20) + + input, index, value, = gen_data() + input1, index1, value1, = gen_data() + out1 = torch.ops.aten.index_put.hacked_twin(input, [index], value, accumulate=False) + out2 = torch.index_put(input1, [index1], value1, accumulate=False) + self.assertEqual(out1, out2) + + torch.ops.aten.index_put_.hacked_twin(input, [index], value, accumulate=False) + torch.index_put_(input1, [index1], value1, accumulate=False) + self.assertEqual(input, input1) + def test_export_opnames_interface(self): @torch.jit.interface @@ -228,6 +245,91 @@ def use_module_interface(mod_list: List[OneTwoModule], x: torch.Tensor): self.assertTrue(set(['aten::add.Tensor', 'aten::mul.Scalar']).issubset( set(torch.jit.export_opnames(scripted_M_mod)))) + def test_math_inf(self): + from math import inf + + def foo(): + return inf + + self.checkScript(foo, ()) + + def test_list_literal_infer(self): + def expects_intlist(x: List[int]): + x.append(3) + return x + + def foo(): + return expects_intlist([]) + + self.checkScript(foo, ()) + + def annotated_list_fail(): 
+ return expects_intlist(torch.jit.annotate([], List[Tensor])) + + with self.assertRaises(RuntimeError): + torch.jit.script(annotated_list_fail) + + def non_temporary_fail(): + a = [] + return expects_intlist(a) + + with self.assertRaises(RuntimeError): + torch.jit.script(non_temporary_fail) + + + @torch.jit.script + def test_return(): + return [] + + FileCheck().check("Tensor[] = prim::ListConstruct").run(test_return.graph) + + def test_legacy_tensor_constructor(self): + # testing PyObject overload + def test_all_dtypes(): + return ( + torch.BoolTensor([2]), + torch.LongTensor([3]), + torch.ByteTensor([4]), + torch.CharTensor([5]), + torch.DoubleTensor([6]), + torch.FloatTensor([7]), + torch.IntTensor([8]), + torch.ShortTensor([1]), + torch.HalfTensor([1]), + ) + + self.checkScript(test_all_dtypes, ()) + + # now test empty overload + def empty_overload(): + return torch.LongTensor(2, 3, 4) + + eager = empty_overload() + jit = torch.jit.script(empty_overload)() + eager[:] = 1 + jit[:] = 1 + self.assertEqual(eager, jit) + + def no_inputs(): + return torch.DoubleTensor() + + self.checkScript(no_inputs, ()) + + # bad schema + def multiple_args(): + return torch.LongTensor(1, [2]) + + with self.assertRaisesRegex(RuntimeError, "multiple positional arguments that were not all integers"): + torch.jit.script(multiple_args) + + # kwarg bad schema + def bad_kwarg(): + return torch.LongTensor(hello="1") + + with self.assertRaisesRegex(RuntimeError, "hello"): + torch.jit.script(bad_kwarg) + + def test_broadcasting_list(self): """ Test BroadcastingList and torch.nn._size_N_t alias @@ -243,3 +345,38 @@ def sum_f(x: BroadcastingList2[float]) -> float: self.assertTrue(torch.jit.script(sum_i)(4) == 8) self.assertTrue(torch.jit.script(sum_f)(4.5) == 9.) + + def test_parse_ir_annotate(self): + ir = """ + graph(): + %3 : int[] = prim::Constant[value=annotate(List[int], [])]() + return (%3) + """ + graph = torch._C.parse_ir(ir, True) + func = torch._C._create_function_from_graph("forward", graph) + ret = func() + self.assertTrue(ret == []) + + def test_parse_ir_single_element_tensor_positive(self): + ir = """ + graph(): + %7 : Long(1, strides=[1], requires_grad=0, device=cpu) = prim::Constant[value={0}]() + return (%7) + """ + graph = torch._C.parse_ir(ir, True) + func = torch._C._create_function_from_graph("forward", graph) + ret = func() + self.assertTrue(ret.numel() == 1) + self.assertTrue(len(ret.size()) == 1) + + def test_parse_ir_single_element_tensor_negative(self): + ir = """ + graph(): + %7 : Long(1, strides=[1], requires_grad=0, device=cpu) = prim::Constant[value={-17}]() + return (%7) + """ + graph = torch._C.parse_ir(ir, True) + func = torch._C._create_function_from_graph("forward", graph) + ret = func() + self.assertTrue(ret.numel() == 1) + self.assertTrue(len(ret.size()) == 1) diff --git a/test/jit/test_models.py b/test/jit/test_models.py index 8cab53168236..2f67e27cb1d7 100644 --- a/test/jit/test_models.py +++ b/test/jit/test_models.py @@ -41,7 +41,7 @@ def __init__(self): def forward(self, x): x = F.relu(F.max_pool2d(self.conv1(x), 2)) x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) + x = x.reshape(-1, 320) x = F.relu(self.fc1(x)) x = F.dropout(x, training=self.training) x = self.fc2(x) diff --git a/test/jit/test_module_containers.py b/test/jit/test_module_containers.py index a4892aa6ea3a..f253c2453b3b 100644 --- a/test/jit/test_module_containers.py +++ b/test/jit/test_module_containers.py @@ -663,3 +663,43 @@ def forward(self, x): # Check that ignored method 
is still intact. self.assertEqual(inp, n.ignored_method(inp)) + + def test_parameterlist_script_getitem(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.module_list = nn.ModuleList([nn.Linear(1, 1) for _ in range(10)]) + self.parameter_list = nn.ParameterList([nn.Parameter(torch.zeros(1)) for _ in range(10)]) + + def forward(self, x): + self.module_list[0] + self.parameter_list[0] + return x + + self.checkModule(MyModule(), (torch.zeros(1))) + + def test_parameterlist_script_iter(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.module_list = nn.ModuleList([nn.Linear(1, 1) for _ in range(10)]) + self.parameter_list = nn.ParameterList([nn.Parameter(torch.zeros(1)) for _ in range(10)]) + + def forward(self, x): + r = x + for i, p in enumerate(self.parameter_list): + r = r + p + i + return r + + self.checkModule(MyModule(), (torch.zeros(1),)) + + def test_parameterdict_script_getitem(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.parameter_dict = nn.ParameterDict({k: nn.Parameter(torch.zeros(1)) for k in ['a', 'b', 'c']}) + + def forward(self, x): + return self.parameter_dict['a'] * x + self.parameter_dict['b'] * self.parameter_dict['c'] + + self.checkModule(MyModule(), (torch.ones(1),)) diff --git a/test/jit/test_op_decompositions.py b/test/jit/test_op_decompositions.py new file mode 100644 index 000000000000..6b4569cd6e39 --- /dev/null +++ b/test/jit/test_op_decompositions.py @@ -0,0 +1,38 @@ +# Owner(s): ["oncall: jit"] + +import torch +from torch.testing import FileCheck +from torch.testing._internal.jit_utils import JitTestCase + +if __name__ == '__main__': + raise RuntimeError("This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead.") + +class TestOpDecompositions(JitTestCase): + def test_op_decomposition(self): + def foo(x): + return torch.var(x, unbiased=True) + + # TODO: more robust testing + foo_s = torch.jit.script(foo) + FileCheck().check("aten::var").run(foo_s.graph) + torch._C._jit_pass_run_decompositions(foo_s.graph) + inp = torch.rand([10, 10]) + self.assertEqual(foo(inp), foo_s(inp)) + FileCheck().check_not("aten::var").run(foo_s.graph) + + def test_registered_decomposition(self): + @torch.jit.script + def foo(x): + return torch.square(x) + + @torch.jit.script + def square_decomp(x): + return torch.pow(x, 2) + + torch.jit._register_decomposition(torch.ops.aten.square.default, square_decomp.graph) + torch._C._jit_pass_run_decompositions(foo.graph) + FileCheck().check_not("aten::square").check("aten::pow").run(foo.graph) + x = torch.rand([4]) + self.assertEqual(foo(x), torch.square(x)) diff --git a/test/jit/test_optimize_for_mobile_preserve_debug_info.py b/test/jit/test_optimize_for_mobile_preserve_debug_info.py index b0a14f56d41f..a6527a3ffdff 100644 --- a/test/jit/test_optimize_for_mobile_preserve_debug_info.py +++ b/test/jit/test_optimize_for_mobile_preserve_debug_info.py @@ -2,9 +2,9 @@ import torch import torch._C -import torch.backends.xnnpack import torch.nn.functional as F from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import skipIfNoXNNPACK class TestOptimizeForMobilePreserveDebugInfo(JitTestCase): def check_replacement( @@ -36,6 +36,7 @@ def check_replacement( original_source_ranges[replacements[node.kind()]], ) + @skipIfNoXNNPACK def test_replace_conv1d_with_conv2d(self): class TestConv1d(torch.nn.Module): def __init__(self, weight, bias): @@ -63,6 
+64,7 @@ def forward(self, x): jit_pass=torch._C._jit_pass_transform_conv1d_to_conv2d, ) + @skipIfNoXNNPACK def test_insert_pre_packed_linear_before_inline_and_conv_2d_op(self): class TestPrepackedLinearBeforeInlineAndConv2dOp(torch.nn.Module): def __init__( @@ -139,6 +141,7 @@ def forward(self, x): jit_pass=torch._C._jit_pass_insert_prepacked_ops, ) + @skipIfNoXNNPACK def test_insert_pre_packed_linear_op(self): self.check_replacement( model=torch.jit.trace(torch.nn.Linear(5, 4), torch.rand(3, 2, 5)), @@ -230,6 +233,7 @@ def forward(self, x): jit_pass=torch._C._jit_pass_fuse_clamp_w_prepacked_linear_conv, ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_1(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.hardtanh, @@ -238,6 +242,7 @@ def test_fuse_activation_with_pack_ops_linear_conv2d_1(self): conv2d_activation_kind="aten::hardtanh_" ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_2(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.hardtanh_, @@ -246,6 +251,7 @@ def test_fuse_activation_with_pack_ops_linear_conv2d_2(self): conv2d_activation_kind="aten::hardtanh" ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_3(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.relu, @@ -254,6 +260,7 @@ def test_fuse_activation_with_pack_ops_linear_conv2d_3(self): conv2d_activation_kind="aten::relu_" ) + @skipIfNoXNNPACK def test_fuse_activation_with_pack_ops_linear_conv2d_4(self): self.run_test_fuse_activation_with_pack_ops_linear_conv2d( linear_activation=F.relu_, diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index 74f85dc22deb..81df055f55b7 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -18,7 +18,7 @@ class TestProfiler(JitTestCase): def setUp(self): self.prev_exec = torch._C._jit_set_profiling_executor(True) - self.prev_profiling = torch._C._jit_set_profiling_mode(True) + self.prev_profiling = torch._C._get_graph_executor_optimize(True) self.inline_autodiff = torch._C._debug_set_autodiff_subgraph_inlining(False) self.texpr_fuser_state = torch._C._jit_texpr_fuser_enabled() self.can_fuse_on_cpu = torch._C._jit_can_fuse_on_cpu() @@ -34,7 +34,7 @@ def setUp(self): def tearDown(self): torch._C._jit_set_profiling_executor(self.prev_exec) - torch._C._jit_set_profiling_mode(self.prev_profiling) + torch._C._get_graph_executor_optimize(self.prev_profiling) torch._C._debug_set_autodiff_subgraph_inlining(self.inline_autodiff) torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state) torch._C._jit_override_can_fuse_on_cpu(self.can_fuse_on_cpu) @@ -232,6 +232,24 @@ def foo(a, b): g = torch.jit.last_executed_optimized_graph() FileCheck().check_count("aten::add", 2, exactly=True).run(g) + def test_local_fusion_strategy(self): + @torch.jit.script + def foo(x): + return x + x + x + + torch.jit.set_fusion_strategy([("STATIC", 1)]) + for _ in range(3): + foo(torch.rand([10])) + + torch.jit.set_fusion_strategy([("STATIC", 10)]) + + for i in range(10): + foo(torch.rand([i])) + foo(torch.rand([i])) + + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_count(":TensorExprGroup", 2, exactly=True).run(g) + def test_iterative_fusion(self): @torch.jit.script def foo(a, b, c, d): diff --git a/test/jit/test_python_bindings.py b/test/jit/test_python_bindings.py index 2f086feaa904..37c2ef7f85af 100644 --- a/test/jit/test_python_bindings.py +++ b/test/jit/test_python_bindings.py @@ -1,6 
+1,7 @@ # Owner(s): ["oncall: jit"] import torch +from torch.testing import FileCheck from torch.testing._internal.jit_utils import JitTestCase if __name__ == "__main__": @@ -82,3 +83,28 @@ def test_graph_create(self): gr = torch._C.Graph() with self.assertRaises(ValueError): gr.create("prim::Constant", [None]) + + def test_canonicalize(self): + ir = """ +graph(%p207 : Tensor, + %1 : Tensor, + %p407 : int): + %11 : Tensor = aten::view_expand_placeholder(%1) + %12 : Tensor = aten::pointwise_placeholder(%11, %p207, %p407) + %13 : Tensor = aten::view_expand_placeholder(%12) + %14 : Tensor = aten::pointwise_placeholder(%13) + return (%14) + """ + + graph1 = torch._C.parse_ir(ir) + graph1 = torch._C._jit_pass_canonicalize(graph1, True) + + graph2 = torch._C.parse_ir(ir) + graph2 = torch._C._jit_pass_canonicalize(graph2) + + self.assertEqual(str(graph1), str(graph2)) + FileCheck().check("%p207").check_not("%14").run(graph1) + + graph3 = torch._C.parse_ir(ir) + graph3 = torch._C._jit_pass_canonicalize(graph3, False) + FileCheck().check_not("%p207").run(graph3) diff --git a/test/jit/test_remove_mutation.py b/test/jit/test_remove_mutation.py index 0544a039286e..4c393a7f1a0f 100644 --- a/test/jit/test_remove_mutation.py +++ b/test/jit/test_remove_mutation.py @@ -146,16 +146,16 @@ def test_successful(): # full_like is not implemented for a tensor fill value - def test_unsuccessful(): + def test_successful(): x = torch.tensor([2, 2]) y = torch.tensor([2, 4]) x.fill_(y) return x + x - fn = torch.jit.script(test_unsuccessful) + fn = torch.jit.script(test_successful) graph = fn.graph self.run_pass('remove_mutation', graph) - FileCheck().check('aten::fill_').run(graph) + FileCheck().check_not('aten::fill_').run(graph) def normal(): return torch.rand(2, 1, 3, 4).normal_() diff --git a/test/jit/test_save_load.py b/test/jit/test_save_load.py index fbc1443024cb..47cbc0fd9b3a 100644 --- a/test/jit/test_save_load.py +++ b/test/jit/test_save_load.py @@ -1,20 +1,22 @@ # Owner(s): ["oncall: jit"] -from typing import NamedTuple, Optional import io import os import pathlib import sys +import unittest +from typing import NamedTuple, Optional +import torch from torch import Tensor from torch.testing._internal.common_utils import TemporaryFileName -import torch # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) -from torch.testing._internal.jit_utils import (JitTestCase, - clear_class_registry) +from torch.testing._internal.jit_utils import JitTestCase, clear_class_registry + +ENABLE_FLATBUFFER = os.environ.get("ENABLE_FLATBUFFER", "0") == "1" if __name__ == "__main__": raise RuntimeError( @@ -23,12 +25,14 @@ "instead." ) + class TestSaveLoad(JitTestCase): def test_different_modules(self): """ Exercise the situation where we have the same qualified name in two different CompilationUnits on save/load. """ + class Foo(torch.nn.Module): def __init__(self): super(Foo, self).__init__() @@ -64,7 +68,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -89,6 +94,7 @@ def test_different_functions(self): Exercise the situation where we have the same qualified name in two different CompilationUnits on save/load. 
""" + def lol(x): return x @@ -118,7 +124,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -143,6 +150,7 @@ def test_different_interfaces(self): Exercise the situation where we have the same qualified name in two different CompilationUnits on save/load. """ + @torch.jit.interface class MyInterface(object): def bar(self, x: Tensor) -> Tensor: @@ -204,7 +212,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -261,7 +270,6 @@ def forward(self, x): return x, MyCoolNamedTuple(a=5) - first_script_module = torch.jit.script(Foo()) first_saved_module = io.BytesIO() torch.jit.save(first_script_module, first_saved_module) @@ -310,7 +318,8 @@ def forward(self, x): clear_class_registry() self.assertEqual( - first_script_module._c.qualified_name, second_script_module._c.qualified_name + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, ) class ContainsBoth(torch.nn.Module): @@ -340,44 +349,44 @@ def forward(self, a): value = b"bar\x00\xffbaz" expected_extra_files = {} - expected_extra_files['foo'] = value + expected_extra_files["foo"] = value # verify that str to bytes conversion also works - expected_extra_files['foo2'] = "bar" + expected_extra_files["foo2"] = "bar" m = MyMod() # Save to file. with TemporaryFileName() as fname: m.save(fname, _extra_files=expected_extra_files) # values don't matter - extra_files = {'foo': '', 'foo2': None} + extra_files = {"foo": "", "foo2": None} torch.jit.load(fname, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # results come back always as bytes - self.assertEqual(b"bar", extra_files['foo2']) + self.assertEqual(b"bar", extra_files["foo2"]) # Use torch.jit API torch.jit.save(m, fname, _extra_files=expected_extra_files) - extra_files['foo'] = '' + extra_files["foo"] = "" torch.jit.load(fname, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # Save to buffer. 
buffer = io.BytesIO(m.save_to_buffer(_extra_files=expected_extra_files)) - extra_files = {'foo': ''} + extra_files = {"foo": ""} torch.jit.load(buffer, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # Use torch.jit API buffer = io.BytesIO() torch.jit.save(m, buffer, _extra_files=expected_extra_files) buffer.seek(0) - extra_files = {'foo': ''} + extra_files = {"foo": ""} torch.jit.load(buffer, _extra_files=extra_files) - self.assertEqual(value, extra_files['foo']) + self.assertEqual(value, extra_files["foo"]) # Non-existent file 'bar' with self.assertRaises(RuntimeError): - extra_files['bar'] = '' + extra_files["bar"] = "" torch.jit.load(buffer, _extra_files=extra_files) def test_save_load_using_pathlib(self): @@ -394,7 +403,7 @@ def forward(self, a): m.save(path) m2 = torch.jit.load(path) - x = torch.tensor([1., 2., 3., 4.]) + x = torch.tensor([1.0, 2.0, 3.0, 4.0]) self.assertTrue(torch.equal(m(x), m2(x))) def test_save_nonexit_file(self): @@ -455,7 +464,9 @@ class TestModule(torch.nn.Module): def __init__(self): super().__init__() self.add_module("submodule_a", Submodule()) - self.register_parameter("parameter_a", torch.nn.Parameter(torch.randn(4))) + self.register_parameter( + "parameter_a", torch.nn.Parameter(torch.randn(4)) + ) self.register_buffer("buffer", torch.randn(4)) self.t = torch.rand(4) # not buffer @@ -466,7 +477,9 @@ def __init__(self): m_loaded = self.getExportImportCopy(torch.jit.script(m)) # Check submodules. - self.assertEqual(len(list(m.named_modules())), len(list(m_loaded.named_modules()))) + self.assertEqual( + len(list(m.named_modules())), len(list(m_loaded.named_modules())) + ) for m_s, loaded_s in zip(m.named_modules(), m_loaded.named_modules()): m_name, _ = m_s loaded_name, _ = loaded_s @@ -478,7 +491,504 @@ def __init__(self): self.assertEqual(m_p, loaded_p) # Check buffers. - self.assertEqual(len(list(m.named_buffers())), len(list(m_loaded.named_buffers()))) + self.assertEqual( + len(list(m.named_buffers())), len(list(m_loaded.named_buffers())) + ) + for m_b, loaded_b in zip(m.named_buffers(), m_loaded.named_buffers()): + m_name, m_buffer = m_b + loaded_name, loaded_buffer = loaded_b + self.assertEqual(m_name, loaded_name) + self.assertEqual(m_buffer, loaded_buffer) + + def test_save_load_meta_tensors(self): + """ + Check that parameters, buffers, and submodules are the same after loading + for a module with parameters and buffers that are meta tensors + """ + + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 3, device="meta") + self.bar = torch.nn.Linear(3, 4) + self.register_buffer("buffer", torch.randn(4, device="meta")) + + def forward(self, x): + x = self.foo(x) + x = self.bar(x) + return x + + m = Foo() + m_loaded = self.getExportImportCopy(torch.jit.script(m)) + # Check submodules. + self.assertEqual( + len(list(m.named_modules())), len(list(m_loaded.named_modules())) + ) + self.assertEqual( + set(name for name, _ in m.named_modules()), + set(name for name, _ in m_loaded.named_modules()), + ) + # Check parameters. + m_params = dict(m.named_parameters()) + m_loaded_params = dict(m_loaded.named_parameters()) + self.assertEqual(len(m_params), len(m_loaded_params)) + self.assertEqual(m_params, m_loaded_params) + # Check buffers. 
+ m_buffers = dict(m.named_buffers()) + m_loaded_buffers = dict(m_loaded.named_buffers()) + self.assertEqual(len(m_buffers), len(m_loaded_buffers)) + self.assertEqual(m_buffers, m_loaded_buffers) + # Check params and buffers that are/are not meta tensors + self.assertTrue(m_params["foo.weight"].is_meta) + self.assertTrue(m_loaded_params["foo.weight"].is_meta) + self.assertTrue(m_params["foo.bias"].is_meta) + self.assertTrue(m_loaded_params["foo.bias"].is_meta) + self.assertFalse(m_params["bar.weight"].is_meta) + self.assertFalse(m_loaded_params["bar.weight"].is_meta) + self.assertFalse(m_params["bar.bias"].is_meta) + self.assertFalse(m_loaded_params["bar.bias"].is_meta) + self.assertTrue(m_buffers["buffer"].is_meta) + self.assertTrue(m_loaded_buffers["buffer"].is_meta) + + +def script_module_to_buffer(script_module): + module_buffer = io.BytesIO( + script_module._save_to_buffer_for_lite_interpreter(_use_flatbuffer=True) + ) + module_buffer.seek(0) + return module_buffer + + +@unittest.skipIf( + not ENABLE_FLATBUFFER, "Need to enable flatbuffer to run the below tests" +) +class TestSaveLoadFlatbuffer(JitTestCase): + def test_different_modules(self): + """ + Exercise the situation where we have the same qualified name + in two different CompilationUnits on save/load. + """ + + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 2) + self.bar = torch.nn.Linear(2, 2) + + def forward(self, x): + x = self.foo(x) + x = self.bar(x) + return x + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + + clear_class_registry() + + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 2) + + def forward(self, x): + x = self.foo(x) + return x + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x = self.first(x) + x = self.second(x) + return x + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_different_functions(self): + """ + Exercise the situation where we have the same qualified name + in two different CompilationUnits on save/load. 
+ """ + + def lol(x): + return x + + class Foo(torch.nn.Module): + def forward(self, x): + return lol(x) + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + clear_class_registry() + + def lol(x): # noqa: F811 + return "hello" + + class Foo(torch.nn.Module): + def forward(self, x): + return lol(x) + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x = self.first(x) + x = self.second(x) + return x + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_different_interfaces(self): + """ + Exercise the situation where we have the same qualified name + in two different CompilationUnits on save/load. + """ + + @torch.jit.interface + class MyInterface(object): + def bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script + class ImplementInterface(object): + def __init__(self): + pass + + def bar(self, x): + return x + + class Foo(torch.nn.Module): + __annotations__ = {"interface": MyInterface} + + def __init__(self): + super().__init__() + self.interface = ImplementInterface() + + def forward(self, x): + return self.interface.bar(x) + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + clear_class_registry() + + @torch.jit.interface + class MyInterface(object): + def not_bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script # noqa: F811 + class ImplementInterface(object): # noqa: F811 + def __init__(self): + pass + + def not_bar(self, x): + return x + + class Foo(torch.nn.Module): + __annotations__ = {"interface": MyInterface} + + def __init__(self): + super().__init__() + self.interface = ImplementInterface() + + def forward(self, x): + return self.interface.not_bar(x) + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x = self.first(x) + x = self.second(x) + return x + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_many_collisions(self): + class MyCoolNamedTuple(NamedTuple): + a: int + + @torch.jit.interface + class MyInterface(object): + def bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script + class ImplementInterface(object): + def __init__(self): + pass + + def bar(self, x): + return x + + def lol(x): + return x + + class Foo(torch.nn.Module): + interface: MyInterface + + def __init__(self): + super().__init__() + self.foo = torch.nn.Linear(2, 2) + self.bar = torch.nn.Linear(2, 2) + self.interface = ImplementInterface() + + def forward(self, x): + x = self.foo(x) + x = 
self.bar(x) + x = lol(x) + x = self.interface.bar(x) + + return x, MyCoolNamedTuple(a=5) + + first_script_module = torch.jit.script(Foo()) + first_saved_module = script_module_to_buffer(first_script_module) + + clear_class_registry() + + @torch.jit.interface + class MyInterface(object): + def not_bar(self, x: Tensor) -> Tensor: + pass + + @torch.jit.script # noqa: F811 + class ImplementInterface(object): # noqa: F811 + def __init__(self): + pass + + def not_bar(self, x): + return x + + def lol(x): # noqa: F811 + return "asdofij" + + class MyCoolNamedTuple(NamedTuple): # noqa: F811 + a: str + + class Foo(torch.nn.Module): + interface: MyInterface + + def __init__(self): + super().__init__() + self.foo = torch.nn.Linear(2, 2) + self.interface = ImplementInterface() + + def forward(self, x): + x = self.foo(x) + self.interface.not_bar(x) + x = lol(x) + return x, MyCoolNamedTuple(a="hello") + + second_script_module = torch.jit.script(Foo()) + second_saved_module = script_module_to_buffer(second_script_module) + + clear_class_registry() + + self.assertEqual( + first_script_module._c.qualified_name, + second_script_module._c.qualified_name, + ) + + class ContainsBoth(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module( + "second", torch.jit.load(second_saved_module) + ) + self.add_module( + "first", torch.jit.load(first_saved_module) + ) + + def forward(self, x): + x, named_tuple_1 = self.first(x) + x, named_tuple_2 = self.second(x) + return len(x + named_tuple_2.a) + named_tuple_1.a + + sm = torch.jit.script(ContainsBoth()) + contains_both = script_module_to_buffer(sm) + sm = torch.jit.load(contains_both) + + def test_save_load_using_pathlib(self): + class MyMod(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, a): + return 2 * a + + m = MyMod() + + # Save then load. + with TemporaryFileName() as fname: + path = pathlib.Path(fname) + torch.jit.save_jit_module_to_flatbuffer(m, path) + m2 = torch.jit.load(path) + + x = torch.tensor([1.0, 2.0, 3.0, 4.0]) + self.assertTrue(torch.equal(m(x), m2(x))) + + def test_save_namedtuple_input_only(self): + """ + Even if a NamedTuple is only used as an input argument, saving and + loading should work correctly. + """ + global FooTuple # see [local resolution in python] + + class FooTuple(NamedTuple): + a: int + + class MyModule(torch.nn.Module): + def forward(self, x: FooTuple) -> torch.Tensor: + return torch.tensor(3) + + m_loaded = self.getExportImportCopy(torch.jit.script(MyModule())) + output = m_loaded(FooTuple(a=5)) + self.assertEqual(output, torch.tensor(3)) + + def test_save_namedtuple_output_only(self): + """ + Even if a NamedTuple is only used as an output argument, saving and + loading should work correctly. 
+ """ + global FooTuple # see [local resolution in python] + + class FooTuple(NamedTuple): + a: int + + class MyModule(torch.nn.Module): + def forward(self) -> Optional[FooTuple]: + return None + + m_loaded = self.getExportImportCopy(torch.jit.script(MyModule())) + output = m_loaded() + self.assertEqual(output, None) + + def test_module_info_flatbuffer(self): + class Foo(torch.nn.Module): + def __init__(self): + super(Foo, self).__init__() + self.foo = torch.nn.Linear(2, 2) + self.bar = torch.nn.Linear(2, 2) + + def forward(self, x): + x = self.foo(x) + x = self.bar(x) + return x + + first_script_module = torch.jit.script(Foo()) + first_saved_module = io.BytesIO() + torch.jit.save_jit_module_to_flatbuffer( + first_script_module, first_saved_module) + first_saved_module.seek(0) + expected = { + 'bytecode_version': 4, + 'operator_version': 4, + 'function_names': {'__torch__.___torch_mangle_0.Foo.forward'}, + 'type_names': set(), + 'opname_to_num_args': {'aten::linear': 3}} + self.assertEqual( + torch.jit._serialization.get_flatbuffer_module_info(first_saved_module), + expected) + + + def test_save_load_params_buffers_submodules(self): + """ + Check that parameters, buffers, and submodules are the same after loading. + """ + + class Submodule(torch.nn.Module): + def __init__(self): + super().__init__() + + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.add_module("submodule_a", Submodule()) + self.register_parameter( + "parameter_a", torch.nn.Parameter(torch.randn(4)) + ) + self.register_buffer("buffer", torch.randn(4)) + self.t = torch.rand(4) # not buffer + + self.parameter_b = torch.nn.Parameter(torch.randn(4)) + self.submodule_b = Submodule() + + m = TestModule() + m_loaded = self.getExportImportCopy(torch.jit.script(m)) + + # Check submodules. + self.assertEqual( + len(list(m.named_modules())), len(list(m_loaded.named_modules())) + ) + for m_s, loaded_s in zip(m.named_modules(), m_loaded.named_modules()): + m_name, _ = m_s + loaded_name, _ = loaded_s + self.assertEqual(m_name, loaded_name) + + # Check parameters. + self.assertEqual(len(list(m.parameters())), len(list(m_loaded.parameters()))) + for m_p, loaded_p in zip(m.parameters(), m_loaded.parameters()): + self.assertEqual(m_p, loaded_p) + + # Check buffers. 
+ self.assertEqual( + len(list(m.named_buffers())), len(list(m_loaded.named_buffers())) + ) for m_b, loaded_b in zip(m.named_buffers(), m_loaded.named_buffers()): m_name, m_buffer = m_b loaded_name, loaded_buffer = loaded_b diff --git a/test/jit/test_symbolic_shape_analysis.py b/test/jit/test_symbolic_shape_analysis.py index cd25caa92b2b..e756cdb67889 100644 --- a/test/jit/test_symbolic_shape_analysis.py +++ b/test/jit/test_symbolic_shape_analysis.py @@ -12,6 +12,7 @@ ) from torch.testing._internal.common_utils import make_tensor from torch.testing._internal.jit_utils import JitTestCase, execWrapper +from typing import List, Any if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -498,3 +499,37 @@ def test_shape_function_includes(self): m2_shape = [20, 10] res = torch.jit._shapes.matmul(m1_shape, m2_shape) self.assertEqual(res, [10, 10]) + + def test_register_function_error_checking(self): + # this will error before registering on global map, so + # no issue in overwriting schema mappings + @torch.jit.script + def foo(x, y): + return x + y + + node = foo.graph.findNode("aten::add") + + @torch.jit.script + def wrong_input_types(x, y): + x: List[int] = [] + return x + with self.assertRaisesRegex(RuntimeError, "Expected supertype of int"): + torch._C._jit_register_shape_compute_graph_for_node(node, wrong_input_types.graph) + + @torch.jit.script + def wrong_output_types(x: List[int], y: List[int]): + x: List[Tensor] = [] + return x + + with self.assertRaisesRegex(RuntimeError, "but got graph_type"): + torch._C._jit_register_shape_compute_graph_for_node(node, wrong_output_types.graph) + + @torch.jit.script + def too_many_inputs(x: List[int], y: List[int], z: Any, z2: Any): + x: List[int] = [] + return x + + with self.assertRaises(RuntimeError) as error: + torch._C._jit_register_shape_compute_graph_for_node(node, too_many_inputs.graph) + + self.assertTrue("fewer arguments than schema" in str(error.exception)) diff --git a/test/jit/test_tensor_methods.py b/test/jit/test_tensor_methods.py new file mode 100644 index 000000000000..c761a3884c92 --- /dev/null +++ b/test/jit/test_tensor_methods.py @@ -0,0 +1,39 @@ +# Owner(s): ["oncall: jit"] + +import os +import sys + +import torch + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing import FileCheck + +if __name__ == "__main__": + raise RuntimeError( + "This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead." 
+ ) + +class TestTensorMethods(JitTestCase): + def test_getitem(self): + def tensor_getitem(inp: torch.Tensor): + indices = torch.tensor([0, 2], dtype=torch.long) + return inp.__getitem__(indices) + + inp = torch.rand(3, 4) + self.checkScript(tensor_getitem, (inp, )) + + scripted = torch.jit.script(tensor_getitem) + FileCheck().check("aten::index").run(scripted.graph) + + def test_getitem_invalid(self): + def tensor_getitem_invalid(inp: torch.Tensor): + return inp.__getitem__() + + with self.assertRaisesRegexWithHighlight( + RuntimeError, "expected exactly 1 argument", "inp.__getitem__"): + torch.jit.script(tensor_getitem_invalid) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 9d6849829240..99d078dd4ad1 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -17,7 +17,7 @@ sys.path.append(pytorch_test_dir) from torch.testing._internal.common_utils import suppress_warnings, \ skipIfCompiledWithoutNumpy, enable_profiling_mode_for_profiling_tests, \ - IS_SANDCASTLE, TemporaryFileName + IS_SANDCASTLE, TemporaryFileName, skipIfCrossRef from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \ _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, \ RUN_CUDA_MULTI_GPU, make_global @@ -377,6 +377,17 @@ def test_trace_size(self): def test_trace_size_with_grad(self): self.do_trace_size(True) + def test_trace_numel(self): + def fn(x): + return x.numel() + + x = torch.randn(2, 3, 4) + y = torch.randn(4, 5, 6) + + traced_fn = torch.jit.trace(fn, x) + self.assertEqual(traced_fn(y), fn(y)) + self.assertEqual(traced_fn(x), fn(x)) + def do_trace_arange(self, requires_grad): def arange(x): return torch.arange(x.shape[0]) @@ -500,6 +511,7 @@ def to_tensor(x, y): self.assertEqual(to_tensor_trace(x, y), to_tensor(x, y)) @skipIfCompiledWithoutNumpy + @skipIfCrossRef def test_trace_warn(self): def fn(x): int(x) # Warning 1. 
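The shape-function registration checks in test_symbolic_shape_analysis.py above only exercise the failure paths of `torch._C._jit_register_shape_compute_graph_for_node`. For orientation, a well-formed registration might look like the sketch below. This is an illustration inferred from the error messages exercised above (Tensor arguments arrive in the shape graph as `List[int]` shapes, and the graph may declare fewer arguments than the operator schema); it is not code from this PR, and `add_shape` is a simplified stand-in that ignores broadcasting.

```python
import torch
from typing import List

@torch.jit.script
def foo(x, y):
    return x + y

# Grab the aten::add node whose shape function we want to override.
node = foo.graph.findNode("aten::add")

@torch.jit.script
def add_shape(self: List[int], other: List[int]) -> List[int]:
    # Simplified: assumes both inputs already have the same shape.
    return self

# Registering overwrites the existing mapping for this schema, so a real test
# would save and restore the original shape function around this call.
torch._C._jit_register_shape_compute_graph_for_node(node, add_shape.graph)
```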
@@ -836,7 +848,7 @@ def forward(self, x): def test_trace_c10_ops(self): try: _ = torch.ops._caffe2.GenerateProposals - except RuntimeError: + except AttributeError: self.skipTest("Skip the test since c2 ops are not registered.") class MyModel(torch.nn.Module): @@ -1768,6 +1780,7 @@ def forward(self, x): torch.jit.trace(Mod(), (torch.rand(3, 4),)) + @skipIfCrossRef def test_trace_records_names(self): def foo(bar, baz): baz = bar + 3 diff --git a/test/jit/test_types.py b/test/jit/test_types.py index 9fadbedb272b..ca3da3c17c8c 100644 --- a/test/jit/test_types.py +++ b/test/jit/test_types.py @@ -39,7 +39,7 @@ def fn(x: torch.Tensor) -> Tuple[Tuple[torch.Tensor], Dict[str, int]]: expected = fn(x) scripted = torch.jit.script(fn)(x) - self.assertEquals(expected, scripted) + self.assertEqual(expected, scripted) def test_types_as_values(self): def fn(m: torch.Tensor) -> torch.device: diff --git a/test/jit/test_typing.py b/test/jit/test_typing.py index e2800e0119e9..e0932d40ebde 100644 --- a/test/jit/test_typing.py +++ b/test/jit/test_typing.py @@ -591,4 +591,4 @@ def foo(x): with self.assertRaisesRegex(RuntimeError, r'aka NamedTuple\(logits, aux_logits2, aux_logits1\)'): - out = foo(_GoogLeNetOutputs(logits=3, aux_logits2=4, aux_logits1=5)) + out = foo(_GoogLeNetOutputs(logits="3", aux_logits2="4", aux_logits1="5")) diff --git a/test/jit/test_upgraders.py b/test/jit/test_upgraders.py index 8b180c43b989..ab1ee534531f 100644 --- a/test/jit/test_upgraders.py +++ b/test/jit/test_upgraders.py @@ -133,11 +133,53 @@ def test_func(): traced_func = torch.jit.trace(test_func, ()) buffer = io.BytesIO() torch.jit.save(traced_func, buffer) + + current_flag_value = torch._C._get_version_calculator_flag() + # calculate based on old version + torch._C._calculate_package_version_based_on_upgraders(False) + buffer.seek(0) + loaded_func = torch.jit.load(buffer) + version = self._load_model_version(loaded_func) + self.assertTrue(version == 4) + + # calculate based on new version + torch._C._calculate_package_version_based_on_upgraders(True) buffer.seek(0) loaded_func = torch.jit.load(buffer) version = self._load_model_version(loaded_func) self.assertTrue(version == 4) + # make sure we preserve old behaviour + torch._C._calculate_package_version_based_on_upgraders(current_flag_value) + + @unittest.skipIf(not _is_upgraders_enabled(), "Skipping because upgraders are not enabled") + def test_aten_full_other_variants(self): + def test_func(): + a = torch.full([4, 5, 6], 4, names=["a", "b", "c"], dtype=torch.int64) + return a + + scripted_func = torch.jit.script(test_func) + buffer = io.BytesIO() + torch.jit.save(scripted_func, buffer) + + current_flag_value = torch._C._get_version_calculator_flag() + # calculate based on old version + torch._C._calculate_package_version_based_on_upgraders(False) + buffer.seek(0) + loaded_func = torch.jit.load(buffer) + version = self._load_model_version(loaded_func) + self.assertTrue(version == 5) + + # calculate based on new version + torch._C._calculate_package_version_based_on_upgraders(True) + buffer.seek(0) + loaded_func = torch.jit.load(buffer) + version = self._load_model_version(loaded_func) + self.assertTrue(version == 5) + + # make sure we preserve old behaviour + torch._C._calculate_package_version_based_on_upgraders(current_flag_value) + @unittest.skipIf(not _is_upgraders_enabled(), "Skipping because upgraders are not enabled") + def test_aten_linspace(self): + model_path = pytorch_test_dir + "/jit/fixtures/test_versioned_linspace_v7.ptl" @@ -248,7 +290,7 @@ def
test_aten_div_scalar_at_3(self): torch.jit.save(loaded_model, buffer) buffer.seek(0) version = self._load_model_version(loaded_model) - self.assertTrue(version == 4) + self.assertEqual(version, 4) loaded_model_twice = torch.jit.load(buffer) self.assertEqual(loaded_model(torch.Tensor([5.0, 3.0]), 2.0), diff --git a/test/jit/test_with.py b/test/jit/test_with.py index b56324093ce1..bd09a36c6860 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -621,7 +621,7 @@ def with_rf(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: function_events = p.function_events # Event with name "foo" should be recorded. rf_events = [evt for evt in function_events if evt.name == "foo"] - self.assertTrue(len(rf_events), 1) + self.assertEqual(len(rf_events), 1) rf_event = rf_events[0] child_events = rf_event.cpu_children # Ensure we find nested record_function event diff --git a/test/jit_hooks/CMakeLists.txt b/test/jit_hooks/CMakeLists.txt index be29bb463390..546a3040f49b 100644 --- a/test/jit_hooks/CMakeLists.txt +++ b/test/jit_hooks/CMakeLists.txt @@ -2,6 +2,10 @@ cmake_minimum_required(VERSION 3.1 FATAL_ERROR) project(jit_hooks) +if(USE_ROCM) +include(utils) +include(LoadHIP) +endif() find_package(Torch REQUIRED) add_executable(test_jit_hooks test_jit_hooks.cpp) diff --git a/tools/codegen/__init__.py b/test/lazy/__init__.py similarity index 100% rename from tools/codegen/__init__.py rename to test/lazy/__init__.py diff --git a/test/lazy/test_bindings.py b/test/lazy/test_bindings.py new file mode 100644 index 000000000000..57151d408560 --- /dev/null +++ b/test/lazy/test_bindings.py @@ -0,0 +1,7 @@ +# Owner(s): ["oncall: jit"] + +import torch._lazy.metrics + +def test_metrics(): + names = torch._lazy.metrics.counter_names() + assert len(names) == 0, f"Expected no counter names, but got {names}" diff --git a/test/lazy/test_extract_compiled_graph.py b/test/lazy/test_extract_compiled_graph.py new file mode 100644 index 000000000000..f4152d0af68b --- /dev/null +++ b/test/lazy/test_extract_compiled_graph.py @@ -0,0 +1,195 @@ +# Owner(s): ["oncall: jit"] + +import unittest + +from torch._lazy.ts_backend import init as init_ts_backend +init_ts_backend() +from torch._lazy import config +from torch._lazy.extract_compiled_graph import extract_compiled_graph +import torch +from torch import nn +import dis +import inspect +from torch import fx +import re +from contextlib import contextmanager +import copy + +class ModuleConstScale(nn.Module): + def __init__(self): + super(ModuleConstScale, self).__init__() + + def forward(self, a): + return a * 2 + +class ModuleSub(nn.Module): + def __init__(self): + super(ModuleSub, self).__init__() + + def forward(self, a, b): + return a - b + +class ModuleAddcmul(nn.Module): + """ + addcmul function takes a at::Scalar which results in a special TSData containing a Scalar rather than a Tensor. + """ + def __init__(self): + super(ModuleAddcmul, self).__init__() + + def forward(self, a, b, c): + return torch.addcmul(a, b, c, value=5) + +class ModuleReturnMulti(nn.Module): + def __init__(self): + super(ModuleReturnMulti, self).__init__() + + def forward(self, a, b): + return (b + 1, a - 1) + +# The default fx tracer will convert torch.randn to a constant.. We may need +# a custom tracer. 
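As the comment above notes, torch.fx's default tracer executes calls like `torch.randn` eagerly when none of their inputs are `Proxy` objects, so the resulting tensor is baked into the traced graph as a constant; that is why the `ModuleEagerTensor` and `ModuleReturnEagerTensorOnDefaultDevice` cases below stay commented out. A minimal sketch of that behavior, using only the public `torch.fx` API (the `EagerRandn` module is illustrative and not part of this test suite):

```python
import torch
from torch import fx, nn

class EagerRandn(nn.Module):
    def forward(self, a):
        b = torch.randn(2, 3)  # runs eagerly at trace time: no Proxy inputs
        return a + b

gm = fx.symbolic_trace(EagerRandn())
# The graph contains a get_attr node holding the baked-in random tensor
# instead of a call to torch.randn.
print(gm.graph)
```

A custom `fx.Tracer` would be needed to keep such calls symbolic, which is what the comment above alludes to.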
+# class ModuleEagerTensor(nn.Module): +# def __init__(self): +# super(ModuleEagerTensor, self).__init__() +# +# def forward(self, a): +# b = torch.randn(2, 3, device="cpu") # eager device +# return a + b + +# The module was planned to cover the case that an Fx graph returns an eager +# tensor on the default device. It's harder than ModuleEagerTensor because +# we cannot just override the device argument to Lazy since there is no +# explicit device argument. +# +# Unfortunately, the default fx tracer converts the return value of the forward +# method to a constant. Commented out for now. +# class ModuleReturnEagerTensorOnDefaultDevice(nn.Module): +# def __init__(self): +# super(ModuleReturnEagerTensorOnDefaultDevice, self).__init__() +# +# def forward(self): +# return torch.tensor((2, 3), dtype=torch.float32) + +class ModuleReturnDupTensor(nn.Module): + """ + Handle the corner case that the same tensor appears multiple times in the + returned tuple. torchbench models like drq hit this corner case when running + through torchdynamo. + """ + def __init__(self): + super(ModuleReturnDupTensor, self).__init__() + + def forward(self, a, b): + c = a + b + return a - b, c, a + 1, c + +class ModuleInplaceUpdate(nn.Module): + def __init__(self): + super(ModuleInplaceUpdate, self).__init__() + + def forward(self, a, b): + a.sub_(b) + return b - 1, b + 1 + +@contextmanager +def force_fallback_ctx_mgr(fallback_op): + oldconfig = config.get_force_fallback() + config.set_force_fallback(fallback_op) + try: + yield None + finally: + config.set_force_fallback(oldconfig) + +@contextmanager +def nop_ctx_mgr(): + try: + yield None + finally: + pass + +def gen_rand_args(mod): + args = [] + for _ in range(len(inspect.signature(mod.forward).parameters)): + args.append(torch.randn(2, 3)) + return args + +def allclose(expected, actual): + def unwrap(cont): + if isinstance(cont, (list, tuple)) and len(cont) == 1: + return cont[0] + return cont + expected = unwrap(expected) + actual = unwrap(actual) + + if isinstance(expected, torch.Tensor) and isinstance(actual, torch.Tensor): + return torch.allclose(expected, actual) + elif isinstance(expected, (tuple, list)) and isinstance(actual, (tuple, list)): + return len(expected) == len(actual) and all(torch.allclose(a, b) for a, b in zip(expected, actual)) + else: + raise RuntimeError("Unexpected types") + +def verify_reusing_compiled_graph(mod, exception_msg_pattern, ncase=10): + args = gen_rand_args(mod) + out = mod(*args) + + dis.dis(mod.forward) + + try: + optimized_mod = extract_compiled_graph(fx.symbolic_trace(mod), args) + except RuntimeError as e: + if exception_msg_pattern is None: + raise e # reraise the exception + exception_message = str(e) + if not re.search(exception_msg_pattern, exception_message): + raise RuntimeError(f"Exception message does not match the required pattern: {exception_message}") + else: + # We are done for the test case that expects an exception + return + + if exception_msg_pattern is not None: + raise RuntimeError(f"Expected an exception matching pattern {exception_msg_pattern}") + print("return value of optimized_mod", optimized_mod(*args)) + + # check correctness + failed_index = [] + for i in range(ncase): + rand_args = gen_rand_args(mod) + rand_args_copy = copy.deepcopy(rand_args) + expected = mod(*rand_args) + actual = optimized_mod(*rand_args_copy) + + if not allclose(expected, actual): + print(f"Incorrect results. 
expected {expected}, actual {actual}") + failed_index.append(i) + continue + + # make sure arguments match after calling the model forward method to handle inplace + # updates. + if not allclose(rand_args, rand_args_copy): + print(f"Incorrect updated arguments. expected {rand_args}, actual {rand_args_copy}") + failed_index.append(i) + continue + + if len(failed_index) > 0: + raise RuntimeError(f"Failed {len(failed_index)}/{ncase} cases") + +def maketest(module_cls, exception_msg_pattern=None, ctxmgr=None): + def wrapper(self): + nonlocal ctxmgr + if not ctxmgr: + ctxmgr = nop_ctx_mgr() + with ctxmgr: + verify_reusing_compiled_graph(module_cls(), exception_msg_pattern) + + return wrapper + +class OptimizeTest(unittest.TestCase): + test_sub = maketest(ModuleSub) + # Same as test_sub but force aten::sub to fallback + # We expect an exception caught because of the LTC fallback. + test_ltc_fallback = maketest(ModuleSub, exception_msg_pattern="fallback.*aten::sub", ctxmgr=force_fallback_ctx_mgr("aten::sub")) + test_const_scale = maketest(ModuleConstScale) + test_addcmul = maketest(ModuleAddcmul) + test_return_multi = maketest(ModuleReturnMulti) + test_return_dup_tensor = maketest(ModuleReturnDupTensor) + test_inplace_update = maketest(ModuleInplaceUpdate) diff --git a/test/lazy/test_reuse_ir.py b/test/lazy/test_reuse_ir.py new file mode 100644 index 000000000000..9a8c1400a4a1 --- /dev/null +++ b/test/lazy/test_reuse_ir.py @@ -0,0 +1,106 @@ +# Owner(s): ["oncall: jit"] + +import torch +import torch._lazy +import torch._lazy.config +import torch._lazy.ir_cache +import torch._lazy.ts_backend +import torch._lazy.metrics as metrics +from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase +import os +import unittest + +torch._lazy.ts_backend.init() +torch._lazy.config.set_reuse_ir(True) + +def get_test_device(): + return 'cuda' if 'LTC_TS_CUDA' in os.environ else 'cpu' + +@unittest.skipIf(IS_WINDOWS, "To be fixed") +class TestLazyReuseIr(TestCase): + def testAdd(self): + device = get_test_device() + x = torch.randn(2, 3, 4, device=device) + y = torch.randn(2, 3, 4, device=device) + z = torch.zeros(2, 3, 4, device=device) + + device = 'lazy' + x_lazy = x.detach().clone().to(device=device) + y_lazy = y.detach().clone().to(device=device) + z_lazy = z.detach().clone().to(device=device) + + for i in range(10): + z += (x + y) + + for i in range(10): + z_lazy += (x_lazy + y_lazy) + torch._lazy.mark_step() + + torch.testing.assert_close(z.cpu(), z_lazy.cpu()) + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 16 + metrics.reset() + torch._lazy.ir_cache.reset() + + def testAddSub(self): + device = get_test_device() + x = torch.randn(2, 3, 4, device=device) + y = torch.randn(2, 3, 4, device=device) + z = torch.zeros(2, 3, 4, device=device) + + device = 'lazy' + x_lazy = x.detach().clone().to(device=device) + y_lazy = y.detach().clone().to(device=device) + z_lazy = z.detach().clone().to(device=device) + + for i in range(10): + if i < 5: + z += (x + y) + else: + z += (x - y) + + for i in range(10): + if i < 5: + z_lazy += (x_lazy + y_lazy) + else: + z_lazy += (x_lazy - y_lazy) + torch._lazy.mark_step() + + torch.testing.assert_close(z.cpu(), z_lazy.cpu()) + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 10 + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 4 + metrics.reset() + torch._lazy.ir_cache.reset() + + def testAddSubFallback(self): + torch._lazy.config.set_force_fallback("aten::sub") + device = get_test_device() + x = 
torch.randn(2, 3, 4, device=device) + y = torch.randn(2, 3, 4, device=device) + z = torch.zeros(2, 3, 4, device=device) + + device = 'lazy' + x_lazy = x.detach().clone().to(device=device) + y_lazy = y.detach().clone().to(device=device) + z_lazy = z.detach().clone().to(device=device) + + for i in range(10): + if i < 5: + z += (x + y) + else: + z += (x - y) + + for i in range(10): + if i < 5: + z_lazy += (x_lazy + y_lazy) + else: + z_lazy += (x_lazy - y_lazy) + torch._lazy.mark_step() + + torch.testing.assert_close(z.cpu(), z_lazy.cpu()) + assert metrics.counter_value("IrNodeReused_torch::lazy::AddTensor") >= 11 + metrics.reset() + torch._lazy.ir_cache.reset() + torch._lazy.config.set_force_fallback("") + +if __name__ == '__main__': + run_tests() diff --git a/test/lazy/test_ts_opinfo.py b/test/lazy/test_ts_opinfo.py new file mode 100644 index 000000000000..c14483cf6308 --- /dev/null +++ b/test/lazy/test_ts_opinfo.py @@ -0,0 +1,232 @@ +# Owner(s): ["oncall: jit"] + +from typing import Sequence +import torch +import functools + +from torch.testing._internal.common_utils import run_tests, TestCase +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_device_type import ops, instantiate_device_type_tests +import torch._lazy +import torch._lazy.config +import torch._lazy.metrics +import torch._lazy.ir_cache +import torch._lazy.ts_backend +import itertools +import yaml +import os +import pathlib + +torch._lazy.ts_backend.init() + +def get_test_device(): + return 'cuda' if 'LTC_TS_CUDA' in os.environ else 'cpu' + +def remove_suffixes(l): + return [x.split(".")[0] for x in l] + +def init_lists(): + path_to_script = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) + TS_NATIVE_FUNCTIONS_PATH = path_to_script.parent.parent / "aten/src/ATen/native/ts_native_functions.yaml" + with open(TS_NATIVE_FUNCTIONS_PATH) as f: + yaml_ts = yaml.load(f, yaml.Loader) + LAZY_OPS_LIST = set(remove_suffixes(itertools.chain(yaml_ts["full_codegen"], yaml_ts["supported"], yaml_ts["autograd"]))) + FALLBACK_LIST = set(["clamp"]) + SKIP_RUNTIME_ERROR_LIST = set([ + 'index_select', # Empty output_sizes is not supported + 'clone', # is clone decomposed? + + # General ASAN Failure due to related to generating bool values. + # https://github.com/pytorch/pytorch/issues/74519 + # https://github.com/pytorch/pytorch/issues/63034 + 'nonzero', # ASAN failure (paste: P501906539) + 'all', # ASAN failure + 'any', # ASAN failure + 'logdet', # ASAN failure + ]) + SKIP_INCORRECT_RESULTS_LIST = set([ + 'squeeze', # Value out of range + 't', # Value out of range + 'transpose', # Value out of range + 'bernoulli', # incorrect results + 'pow', # incorrect results + 'addcdiv', # incorrect results (on CI not locally?) 
+ ]) + + return (LAZY_OPS_LIST, FALLBACK_LIST, SKIP_RUNTIME_ERROR_LIST, SKIP_INCORRECT_RESULTS_LIST) + +(LAZY_OPS_LIST, FALLBACK_LIST, SKIP_RUNTIME_ERROR_LIST, SKIP_INCORRECT_RESULTS_LIST) = init_lists() + +torch.manual_seed(42) + +def clone_move(t): + dev = 'lazy' + copy_t = t.detach().clone().requires_grad_(True).to(device=dev) + return copy_t + +class TestLazyTensor(JitTestCase): + def testConvolutionBackward(self): + test_device = get_test_device() + inp = torch.rand(1, 3, 128, 128, device=test_device, requires_grad=True) + inp_copy = clone_move(inp) + grad = torch.rand(1, 32, 121, 121, device=test_device) # no requires_grad + grad_copy = clone_move(grad) + weight = torch.rand(32, 3, 8, 8, device=test_device, requires_grad=True) + weight_copy = clone_move(weight) + bias = torch.rand(32, device=test_device, requires_grad=True) + bias_copy = clone_move(bias) + + # run eager + conv_out = torch.nn.functional.conv2d(inp, weight, bias) + (inp_grad, weight_grad, bias_grad) = torch.autograd.grad([conv_out], [inp, weight, bias], [grad]) + + # run lazy + conv_copy_out = torch.nn.functional.conv2d(inp_copy, weight_copy, bias_copy) + (inp_copy_grad, weight_copy_grad, bias_copy_grad) = torch.autograd.grad( + [conv_copy_out], [inp_copy, weight_copy, bias_copy], [grad_copy]) + + # check numerics + torch.testing.assert_close(bias_copy_grad.cpu(), bias_grad.cpu()) + + torch.testing.assert_close(weight_copy_grad.cpu(), weight_grad.cpu()) + torch.testing.assert_close(inp_copy_grad.cpu(), inp_grad.cpu()) + +class TestLazyOpInfo(TestCase): + + @ops([op for op in op_db if op.name in LAZY_OPS_LIST and op.name not in SKIP_RUNTIME_ERROR_LIST], allowed_dtypes=(torch.float,)) + def test_dispatched_to_lazy(self, device, dtype, op): + def get_name(op): + l = [op.name] + if op.variant_test_name != '': + l.append(op.variant_test_name) + return '.'.join(l) + + global FALLBACK_LIST + samples = op.sample_inputs("lazy", dtype, requires_grad=False) + sample = list(samples)[0] + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + torch._lazy.mark_step() + torch._lazy.wait_device_ops() + torch._lazy.metrics.reset() + + r = op(*args, **kwargs) + torch._lazy.mark_step() + torch._lazy.wait_device_ops() + prefix = "aten" if op.name in FALLBACK_LIST else "lazy" + found = f"{prefix}::{op.name}" in remove_suffixes(torch._lazy.metrics.counter_names()) + # check aliases + if not found: + for alias in op.aliases: + alias_found = f"{prefix}::{alias.name}" in remove_suffixes(torch._lazy.metrics.counter_names()) + found = found or alias_found + if found: + break + self.assertTrue(found) + + + @ops([op for op in op_db if op.name in LAZY_OPS_LIST and op.name not in SKIP_RUNTIME_ERROR_LIST | SKIP_INCORRECT_RESULTS_LIST], allowed_dtypes=(torch.float,)) # noqa: B950 + def test_correctness(self, device, dtype, op): + + test_device = get_test_device() + + def clone_to_device(input, dev): + if isinstance(input, torch.Tensor): + return input.detach().clone().to(device=dev) + if isinstance(input, Sequence) and not isinstance(input, str): + return tuple(map(functools.partial(clone_to_device, dev=dev), input)) + return input + + def assert_allclose_rec(t): + a, b = t + self.assertEqual(type(a), type(b)) + if isinstance(a, torch.Tensor): + self.assertTrue(torch.allclose(clone_to_device(a, test_device), b, atol=1e-4)) + + if isinstance(a, Sequence): + map(assert_allclose_rec, zip(a, b)) + + samples = op.sample_inputs("lazy", dtype, requires_grad=False) + for sample in samples: + args = [sample.input] + list(sample.args) + kwargs = 
sample.kwargs + copy_args = clone_to_device(args, test_device) + + r_exp = op(*copy_args, **kwargs) + r_actual = op(*args, **kwargs) + + assert_allclose_rec((r_actual, r_exp)) + + @ops([op for op in op_db if op.name in LAZY_OPS_LIST and op.name not in SKIP_RUNTIME_ERROR_LIST | SKIP_INCORRECT_RESULTS_LIST], allowed_dtypes=(torch.float,)) # noqa: B950 + def test_correctness_with_reusing_ir(self, device, dtype, op): + torch._lazy.config.set_reuse_ir(True) + test_device = get_test_device() + + def clone_to_device(input, dev): + if isinstance(input, torch.Tensor): + return input.detach().clone().to(device=dev) + if isinstance(input, Sequence) and not isinstance(input, str): + return tuple(map(functools.partial(clone_to_device, dev=dev), input)) + return input + + def assert_allclose_rec(t): + a, b = t + self.assertEqual(type(a), type(b)) + if isinstance(a, torch.Tensor): + self.assertTrue(torch.allclose(clone_to_device(a, test_device), b, atol=1e-4)) + + if isinstance(a, Sequence): + map(assert_allclose_rec, zip(a, b)) + + samples = op.sample_inputs("lazy", dtype, requires_grad=False) + for sample in samples: + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + copy_args = clone_to_device(args, test_device) + + r_exp = op(*copy_args, **kwargs) + r_actual = op(*args, **kwargs) + + torch._lazy.mark_step() + assert_allclose_rec((r_actual, r_exp)) + + torch._lazy.ir_cache.reset() + torch._lazy.config.set_reuse_ir(False) + + + +# TODO: after we move to master, add Lazy as a new Device here: +# https://github.com/pytorch/pytorch/blob/master/torch/testing/_internal/common_device_type.py#L532 +instantiate_device_type_tests(TestLazyOpInfo, globals(), only_for="cpu") + + +class TestLazyDynamicOps(TestCase): + @classmethod + def setUpClass(cls) -> None: + # Setup the dynamic shape mode + cls.old_ssa_mode = torch._C._lazy._get_symbolic_shape_mode() + torch._C._lazy._set_symbolic_shape_mode(True) + return super().setUpClass() + + @classmethod + def tearDownClass(cls) -> None: + torch._C._lazy._set_symbolic_shape_mode(cls.old_ssa_mode) + return super().tearDownClass() + + def test_nonzero_dynamic(self): + # Test that nonzero gives upper bounds sizes when symbolic shape mode is enabled + test_device = get_test_device() + x1 = torch.tensor([[0, 1.0, 2.0], [3.0, 0, 0]], device=test_device, requires_grad=True) + x1_lazy = clone_move(x1) + x2_lazy = torch.nonzero(x1_lazy) + print(x2_lazy.size()) + self.assertEqual(tuple(x2_lazy.size()), (6, 2)) + + # We should still be able to instantiate it and get the actual result + x2_eager = x2_lazy.cpu() + self.assertEqual(tuple(x2_eager.size()), (3, 2)) + + +if __name__ == '__main__': + run_tests() diff --git a/test/load_torchscript_model.py b/test/load_torchscript_model.py new file mode 100644 index 000000000000..dc8d4159d7ff --- /dev/null +++ b/test/load_torchscript_model.py @@ -0,0 +1,6 @@ +import sys +import torch + +if __name__ == '__main__': + print(torch.jit.load(sys.argv[1])) + sys.exit(0) diff --git a/test/mobile/lightweight_dispatch/CMakeLists.txt b/test/mobile/lightweight_dispatch/CMakeLists.txt new file mode 100644 index 000000000000..5ab3232f6a44 --- /dev/null +++ b/test/mobile/lightweight_dispatch/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required(VERSION 3.1) + +set(TORCH_ROOT ${CMAKE_CURRENT_LIST_DIR}/../../..) 
+set(TEST_ROOT ${TORCH_ROOT}/test/mobile/lightweight_dispatch) + +add_executable(test_codegen_unboxing + ${TEST_ROOT}/test_lightweight_dispatch.cpp + ${TEST_ROOT}/test_codegen_unboxing.cpp +) + +target_include_directories(test_codegen_unboxing PRIVATE ${ATen_CPU_INCLUDE}) + +target_compile_definitions(test_codegen_unboxing PRIVATE USE_GTEST) + +set(TEST_UNBOXING_DEPENDENCIES torch gtest) + +target_link_libraries(test_codegen_unboxing PRIVATE + ${TEST_UNBOXING_DEPENDENCIES} +) + +if(INSTALL_TEST) + install(TARGETS test_codegen_unboxing DESTINATION bin) +endif() diff --git a/test/mobile/lightweight_dispatch/build.sh b/test/mobile/lightweight_dispatch/build.sh new file mode 100755 index 000000000000..13de97d55829 --- /dev/null +++ b/test/mobile/lightweight_dispatch/build.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# This script should be called from .jenkins/pytorch/build.sh. Assuming we are at pytorch source root directory. + +# Required environment variable: $BUILD_ENVIRONMENT +# (This is set by default in the Docker images we build, so you don't +# need to set it yourself. + +set -ex -o pipefail + +# shellcheck disable=SC2034 +echo "Build lite interpreter with lightweight dispatch." + +CUSTOM_TEST_ARTIFACT_BUILD_DIR=${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-${PWD}/../} +mkdir -pv "${CUSTOM_TEST_ARTIFACT_BUILD_DIR}" + +BUILD_LIBTORCH_PY="$PWD/tools/build_libtorch.py" +TEST_SRC_ROOT="$PWD/test/mobile/lightweight_dispatch" + +pushd "$CUSTOM_TEST_ARTIFACT_BUILD_DIR" + +# prepare test +python "$TEST_SRC_ROOT/tests_setup.py" setup + +export USE_DISTRIBUTED=0 +export USE_LIGHTWEIGHT_DISPATCH=1 +export STATIC_DISPATCH_BACKEND="CPU" +export BUILD_LITE_INTERPRETER=1 + +python "${BUILD_LIBTORCH_PY}" +ret=$? + +if [ "$ret" -ne 0 ]; then + echo "Lite interpreter build failed!" + exit "$ret" +fi + + +# run test +if ! build/bin/test_codegen_unboxing; then + echo "test_codegen_unboxing has failure!" + exit 1 +fi + +# shutdown test +python "$TEST_SRC_ROOT/tests_setup.py" shutdown + +# run lite interpreter tests +if ! build/bin/test_lite_interpreter_runtime; then + echo "test_lite_interpreter_runtime has failure!" + exit 1 +fi + +popd + +exit 0 diff --git a/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp b/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp new file mode 100644 index 000000000000..07a845d6008b --- /dev/null +++ b/test/mobile/lightweight_dispatch/test_codegen_unboxing.cpp @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include +#include +#include +// Cover codegen'd unboxing logic for these types: +//'Device', +//'Device?', +//'Dimname', +//'Dimname[1]', +//'Dimname[]', +//'Dimname[]?', +//'Generator?', +//'Layout?', +//'MemoryFormat', +//'MemoryFormat?', +//'Scalar', +//'Scalar?', +//'ScalarType', +//'ScalarType?', +//'Scalar[]', +//'Storage', +//'Stream', +//'Tensor', +//'Tensor(a!)', +//'Tensor(a!)[]', +//'Tensor(a)', +//'Tensor(b!)', +//'Tensor(c!)', +//'Tensor(d!)', +//'Tensor?', +//'Tensor?[]', +//'Tensor[]', +//'bool', +//'bool?', +//'bool[2]', +//'bool[3]', +//'bool[4]', +//'float', +//'float?', +//'float[]?', +//'int', +//'int?', +//'int[1]', +//'int[1]?', +//'int[2]', +//'int[2]?', +//'int[3]', +//'int[4]', +//'int[5]', +//'int[6]', +//'int[]', +//'int[]?', +//'str', +//'str?' +namespace torch { +namespace jit { +namespace mobile { +// covers int[], ScalarType?, Layout?, Device?, bool? 
+TEST(LiteInterpreterTest, Ones) { + // Load check in model: ones.ptl + auto testModelFile = "ones.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, x: int): + // a = torch.ones([3, x], dtype=torch.int64, layout=torch.strided, device="cpu") + // return a + Module bc = _load_for_mobile(testModelFile); + std::vector input{c10::IValue(4)}; + const auto result = bc.forward(input); + ASSERT_EQ(result.toTensor().size(0), 3); + ASSERT_EQ(result.toTensor().size(1), 4); +} + +TEST(LiteInterpreterTest, Index) { + // Load check in model: index.ptl + auto testModelFile = "index.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, index): + // a = torch.zeros(2, 2) + // a[0][1] = 1 + // a[1][0] = 2 + // a[1][1] = 3 + // return a[index] + Module bc = _load_for_mobile(testModelFile); + int64_t ind_1 = 0; + + const auto result_1 = bc.forward({at::tensor(ind_1)}); + + at::Tensor expected = at::empty({1, 2}, c10::TensorOptions(c10::ScalarType::Float)); + expected[0][0] = 0; + expected[0][1] = 1; + + AT_ASSERT(result_1.toTensor().equal(expected)); +} + +TEST(LiteInterpreterTest, Gradient) { + // Load check in model: gradient.ptl + auto testModelFile = "gradient.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, a: int): + // values = torch.tensor([4., 1., 1., 16.], ) + // if a == 0: + // return torch.gradient(values, spacing=torch.scalar_tensor(2., dtype=torch.float64)) + // elif a == 1: + // return torch.gradient(values, spacing=[torch.tensor(1.).item()]) + Module bc = _load_for_mobile(testModelFile); + + const auto result_1 = bc.forward({0}); + at::Tensor expected_1 = at::tensor({-1.5, -0.75, 3.75, 7.5}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toList().get(0).toTensor().equal(expected_1)); + + const auto result_2 = bc.forward({1}); + at::Tensor expected_2 = at::tensor({-3.0, -1.5, 7.5, 15.0}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_2.toList().get(0).toTensor().equal(expected_2)); +} + +TEST(LiteInterpreterTest, Upsample) { + // Load check in model: upsample.ptl + auto testModelFile = "upsample.ptl"; + + // model = torch.nn.Upsample(scale_factor=(2.0,), mode="linear") + Module bc = _load_for_mobile(testModelFile); + + const auto result_1 = bc.forward({at::ones({1, 2, 3})}); + at::Tensor expected_1 = at::ones({1, 2, 6}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toTensor().equal(expected_1)); +} + +TEST(LiteInterpreterTest, IndexTensor) { + // Load check in model: Index_Tensor.ptl + auto testModelFile = "index_Tensor.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, index): + // values = torch.tensor([4., 1., 1., 16.], ) + // return values[[index, torch.tensor(0)]] + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::tensor({1}, c10::TensorOptions(c10::ScalarType::Long))}); + + at::Tensor expected_1 = at::tensor({1.}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toTensor().equal(expected_1)); +} + +TEST(LiteInterpreterTest, Conv2d) { + // Load check in model: conv2d.ptl + auto testModelFile = "conv2d.ptl"; + + // model = torch.nn.Conv2d(1, 2, (2, 2), stride=(1, 1), padding=(1, 1)) + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::ones({1, 1, 1, 1})}); + + ASSERT_EQ(result_1.toTensor().sizes(), c10::IntArrayRef ({1,2,2,2})); +} + +TEST(LiteInterpreterTest, AddTensor) { + // Load check in model: add_Tensor.ptl + auto testModelFile = "add_Tensor.ptl"; + + // class Model(torch.nn.Module): + 
// def forward(self, a): + // values = torch.ones(size=[2, 3], names=['N', 'C']) + // values[0][0] = a[0] + // return values + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::tensor({1, 2, 3}, c10::TensorOptions(c10::ScalarType::Long))}); + + at::Tensor expected_1 = at::tensor({2, 3, 4}, c10::TensorOptions(c10::ScalarType::Long)); + AT_ASSERT(result_1.toTensor().equal(expected_1)); +} + +TEST(LiteInterpreterTest, DivideTensor) { + // Load check in model: add_Tensor.ptl + auto testModelFile = "divide_Tensor.ptl"; + + // class Model(torch.nn.Module): + // def forward(self, b): + // a = torch.tensor(3, dtype=torch.int64) + // out = torch.empty(size=[1], dtype=torch.float) + // torch.div(b, a, out=out) + // return [torch.div(b, a, rounding_mode='trunc'), out] + Module bc = _load_for_mobile(testModelFile); + const auto result_1 = bc.forward({at::tensor({-12}, c10::TensorOptions(c10::ScalarType::Long))}); + + at::Tensor expected_1 = at::tensor({-4}, c10::TensorOptions(c10::ScalarType::Long)); + at::Tensor expected_2 = at::tensor({-4.}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result_1.toList().get(0).toTensor().equal(expected_1)); + AT_ASSERT(result_1.toList().get(1).toTensor().equal(expected_2)); +} + +TEST(LiteInterpreterTest, MultipleOps) { + // Load check in model: multiple_ops.ptl + auto testModelFile = "multiple_ops.ptl"; + + // class Model(torch.nn.Module): + // def __init__(self): + // super(Model, self).__init__() + // self.ops = torch.nn.Sequential( + // torch.nn.ReLU(), + // torch.nn.Flatten(), + // ) + // def forward(self, x): + // x[1] = -2 + // return self.ops(x) + + Module bc = _load_for_mobile(testModelFile); + auto b = at::ones({2, 2, 2, 2}); + const auto result = bc.forward({b}); + + at::Tensor expected = torch::tensor({{1, 1, 1, 1, 1, 1, 1, 1}, {0, 0, 0, 0, 0, 0, 0, 0}}, c10::TensorOptions(c10::ScalarType::Float)); + AT_ASSERT(result.toTensor().equal(expected)); +} +} // namespace mobile +} // namespace jit +} // namespace torch diff --git a/test/mobile/lightweight_dispatch/test_lightweight_dispatch.cpp b/test/mobile/lightweight_dispatch/test_lightweight_dispatch.cpp new file mode 100644 index 000000000000..5c5cabccaaaa --- /dev/null +++ b/test/mobile/lightweight_dispatch/test_lightweight_dispatch.cpp @@ -0,0 +1,18 @@ +#include + +std::string add_negative_flag(const std::string& flag) { + std::string filter = ::testing::GTEST_FLAG(filter); + if (filter.find('-') == std::string::npos) { + filter.push_back('-'); + } else { + filter.push_back(':'); + } + filter += flag; + return filter; +} +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + ::testing::GTEST_FLAG(filter) = add_negative_flag("*_CUDA:*_MultiCUDA"); + + return RUN_ALL_TESTS(); +} diff --git a/test/mobile/lightweight_dispatch/tests_setup.py b/test/mobile/lightweight_dispatch/tests_setup.py new file mode 100644 index 000000000000..91af29796b9d --- /dev/null +++ b/test/mobile/lightweight_dispatch/tests_setup.py @@ -0,0 +1,203 @@ +import os +import sys + +import torch + + +class Setup(object): + def setup(self): + raise NotImplementedError() + + def shutdown(self): + raise NotImplementedError() + + +class FileSetup(object): + path = None + + def shutdown(self): + if os.path.exists(self.path): + os.remove(self.path) + pass + + +class ModelWithDTypeDeviceLayoutPinMemory(FileSetup): + path = 'ones.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, x: int): + a = torch.ones(size=[3, x], dtype=torch.int64, 
layout=torch.strided, device="cpu", pin_memory=False) + return a + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +class ModelWithTensorOptional(FileSetup): + path = 'index.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, index): + a = torch.zeros(2, 2) + a[0][1] = 1 + a[1][0] = 2 + a[1][1] = 3 + return a[index] + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[] +class ModelWithScalarList(FileSetup): + path = 'gradient.ptl' + + def setup(self): + + class Model(torch.nn.Module): + def forward(self, a: int): + values = torch.tensor([4., 1., 1., 16.], ) + if a == 0: + return torch.gradient(values, spacing=torch.scalar_tensor(2., dtype=torch.float64)) + elif a == 1: + return torch.gradient(values, spacing=[torch.tensor(1.).item()]) + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor +class ModelWithFloatList(FileSetup): + path = 'upsample.ptl' + + def setup(self): + model = torch.nn.Upsample(scale_factor=(2.0,), mode="linear", align_corners=False, recompute_scale_factor=True) + + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# index.Tensor(Tensor self, Tensor?[] indices) -> Tensor +class ModelWithListOfOptionalTensors(FileSetup): + path = 'index_Tensor.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, index): + values = torch.tensor([[4., 1., 1., 16.]]) + return values[torch.tensor(0), index] + + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, +# int groups=1) -> Tensor +class ModelWithArrayOfInt(FileSetup): + path = 'conv2d.ptl' + + def setup(self): + model = torch.nn.Conv2d(1, 2, (2, 2), stride=(1, 1), padding=(1, 1)) + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +# add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor +# ones_like(Tensor self, *, ScalarType?, dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, +# MemoryFormat? 
memory_format=None) -> Tensor +class ModelWithTensors(FileSetup): + path = 'add_Tensor.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, a): + b = torch.ones_like(a) + return a + b + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +class ModelWithStringOptional(FileSetup): + path = 'divide_Tensor.ptl' + + def setup(self): + class Model(torch.nn.Module): + def forward(self, b): + a = torch.tensor(3, dtype=torch.int64) + out = torch.empty(size=[1], dtype=torch.float) + torch.div(b, a, out=out) + return [torch.div(b, a, rounding_mode='trunc'), out] + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +class ModelWithMultipleOps(FileSetup): + path = 'multiple_ops.ptl' + + def setup(self): + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.ops = torch.nn.Sequential( + torch.nn.ReLU(), + torch.nn.Flatten(), + ) + + def forward(self, x): + x[1] = -2 + return self.ops(x) + + model = Model() + # Script the model and save + script_model = torch.jit.script(model) + script_model._save_for_lite_interpreter(self.path) + + +tests = [ + ModelWithDTypeDeviceLayoutPinMemory(), + ModelWithTensorOptional(), + ModelWithScalarList(), + ModelWithFloatList(), + ModelWithListOfOptionalTensors(), + ModelWithArrayOfInt(), + ModelWithTensors(), + ModelWithStringOptional(), + ModelWithMultipleOps(), +] + + +def setup(): + for test in tests: + test.setup() + + +def shutdown(): + for test in tests: + test.shutdown() + + +if __name__ == "__main__": + command = sys.argv[1] + if command == "setup": + setup() + elif command == "shutdown": + shutdown() diff --git a/test/mobile/model_test/README.md b/test/mobile/model_test/README.md new file mode 100644 index 000000000000..49b21051c655 --- /dev/null +++ b/test/mobile/model_test/README.md @@ -0,0 +1,87 @@ +## What is this? +Python scripts in this folder are used to generate lite interpreter models for Android and iOS simulator tests. The goal of these tests is to detect changes that would break existing mobile models used in production (usually they are generated by earlier PyTorch versions). 
These scripts are based on PyTorch public API (https://pytorch.org/docs/stable/), and are grouped in a similar way: +- math_ops (https://pytorch.org/docs/stable/torch.html#math-operations) + - pointwise_ops + - reduction_ops + - comparison_ops + - spectral_ops + - other_math_ops + - blas_lapack_ops +- sampling_ops (https://pytorch.org/docs/stable/torch.html#random-sampling) +- tensor ops (https://pytorch.org/docs/stable/torch.html#tensors) + - tensor_general_ops + - tensor_creation_ops + - tensor_indexing_ops + - tensor_typing_ops + - tensor_view_ops +- nn ops (https://pytorch.org/docs/stable/nn.html) + - convolution_ops + - pooling_ops + - padding_ops + - activation_ops + - normalization_ops + - recurrent_ops + - transformer_ops + - linear_ops + - dropout_ops + - sparse_ops + - distance_function_ops + - loss_function_ops + - vision_function_ops + - shuffle_ops + - nn_utils_ops +- quantization ops (https://pytorch.org/docs/stable/quantization.html) + - general_quant_ops + - dynamic_quant_ops + - static_quant_ops + - fused_quant_ops +- TorchScript builtin ops (https://pytorch.org/docs/stable/jit_builtin_functions.html) + - torchscript_builtin_ops + - torchscript_collection_ops +- torchvision_models (https://pytorch.org/vision/stable/models.html) + - mobilenet_v2 + +The generated models are located at +https://github.com/pytorch/pytorch/tree/master/android/pytorch_android/src/androidTest/assets (Android) +https://github.com/pytorch/pytorch/tree/master/ios/TestApp/models/ (iOS) + +These test models will be executed in Android and iOS simulator tests. Note that we only check whether the model executes without error; we don't check the correctness of the model output. + +## Checked-in models and on-the-fly models +Each test model has a checked-in version and an on-the-fly version. The checked-in versions are stored in this repo (see the model paths above) and will only be updated when necessary. The on-the-fly version will be generated during the simulator test, with a "_temp" suffix, e.g., "reduction_ops_temp.ptl". Do not commit them. + +NOTE: currently the Android simulator test does not generate on-the-fly models; only the iOS test does. + +## Diagnose failed test +If the simulator test is failing, the current change may break a production model, so be careful. The detailed error message can be found in the test log. If the change has to be made, make sure it doesn't break existing production models, and update the failed test model as appropriate (see the next section). + +You can also run these tests locally; see the instructions in the android and ios folders. Remember to generate the on-the-fly test models if you want to test locally (but don't commit these models with the _temp suffix). +``` +python test/mobile/model_test/gen_test_model.py ios-test +``` + +## Update test model +If for any reason a test model needs to be updated, run this script: +``` +python test/mobile/model_test/gen_test_model.py +``` +For example, +``` +python test/mobile/model_test/gen_test_model.py reduction_ops +python test/mobile/model_test/gen_test_model.py mobilenet_v2 +``` + +You can also update all test models for Android and iOS: +``` +python test/mobile/model_test/gen_test_model.py android +python test/mobile/model_test/gen_test_model.py ios +``` + +## Test Coverage +The test coverage is based on the number of root ops tested in these test models. 
The full list of generated ops can be found in: +https://github.com/pytorch/pytorch/blob/master/test/mobile/model_test/coverage.yaml + +In addition, the simulator tests will report the percentage of Meta's production ops that are covered. The list of production ops changes over time, so a Meta employee needs to regularly update it using +``` +python test/mobile/model_test/update_production_ops.py ~/fbsource/xplat/pytorch_models/build/all_mobile_model_configs.yaml +``` diff --git a/test/mobile/model_test/android_api_module.py b/test/mobile/model_test/android_api_module.py new file mode 100644 index 000000000000..109e3aa963e8 --- /dev/null +++ b/test/mobile/model_test/android_api_module.py @@ -0,0 +1,128 @@ +from typing import Dict, List, Tuple, Optional + +import torch +from torch import Tensor + + +class AndroidAPIModule(torch.jit.ScriptModule): + def __init__(self): + super(AndroidAPIModule, self).__init__() + + @torch.jit.script_method + def forward(self, input): + return None + + @torch.jit.script_method + def eqBool(self, input: bool) -> bool: + return input + + @torch.jit.script_method + def eqInt(self, input: int) -> int: + return input + + @torch.jit.script_method + def eqFloat(self, input: float) -> float: + return input + + @torch.jit.script_method + def eqStr(self, input: str) -> str: + return input + + @torch.jit.script_method + def eqTensor(self, input: Tensor) -> Tensor: + return input + + @torch.jit.script_method + def eqDictStrKeyIntValue(self, input: Dict[str, int]) -> Dict[str, int]: + return input + + @torch.jit.script_method + def eqDictIntKeyIntValue(self, input: Dict[int, int]) -> Dict[int, int]: + return input + + @torch.jit.script_method + def eqDictFloatKeyIntValue(self, input: Dict[float, int]) -> Dict[float, int]: + return input + + @torch.jit.script_method + def listIntSumReturnTuple(self, input: List[int]) -> Tuple[List[int], int]: + sum = 0 + for x in input: + sum += x + return (input, sum) + + @torch.jit.script_method + def listBoolConjunction(self, input: List[bool]) -> bool: + res = True + for x in input: + res = res and x + return res + + @torch.jit.script_method + def listBoolDisjunction(self, input: List[bool]) -> bool: + res = False + for x in input: + res = res or x + return res + + @torch.jit.script_method + def tupleIntSumReturnTuple( + self, input: Tuple[int, int, int] + ) -> Tuple[Tuple[int, int, int], int]: + sum = 0 + for x in input: + sum += x + return (input, sum) + + @torch.jit.script_method + def optionalIntIsNone(self, input: Optional[int]) -> bool: + return input is None + + @torch.jit.script_method + def intEq0None(self, input: int) -> Optional[int]: + if input == 0: + return None + return input + + @torch.jit.script_method + def str3Concat(self, input: str) -> str: + return input + input + input + + @torch.jit.script_method + def newEmptyShapeWithItem(self, input): + return torch.tensor([int(input.item())])[0] + + @torch.jit.script_method + def testAliasWithOffset(self) -> List[Tensor]: + x = torch.tensor([100, 200]) + a = [x[0], x[1]] + return a + + @torch.jit.script_method + def testNonContiguous(self): + x = torch.tensor([100, 200, 300])[::2] + assert not x.is_contiguous() + assert x[0] == 100 + assert x[1] == 300 + return x + + @torch.jit.script_method + def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: + r = torch.nn.functional.conv2d(x, w) + if toChannelsLast: + r = r.contiguous(memory_format=torch.channels_last) + else: + r = r.contiguous() + return r + + @torch.jit.script_method + def 
contiguous(self, x: Tensor) -> Tensor: + return x.contiguous() + + @torch.jit.script_method + def contiguousChannelsLast(self, x: Tensor) -> Tensor: + return x.contiguous(memory_format=torch.channels_last) + + @torch.jit.script_method + def contiguousChannelsLast3d(self, x: Tensor) -> Tensor: + return x.contiguous(memory_format=torch.channels_last_3d) diff --git a/test/mobile/model_test/builtin_ops.py b/test/mobile/model_test/builtin_ops.py new file mode 100644 index 000000000000..75b57f7b0613 --- /dev/null +++ b/test/mobile/model_test/builtin_ops.py @@ -0,0 +1,125 @@ +import torch + + +# https://pytorch.org/docs/stable/jit_builtin_functions.html#builtin-functions + + +class TSBuiltinOpsModule(torch.nn.Module): + def __init__(self): + super(TSBuiltinOpsModule, self).__init__() + + def forward(self): + x = torch.tensor(1) + y = torch.tensor(0.5) + b = float(1) + s = "abcde" + l = ["1", "2", "test", "a{}b"] + d = {"key": 1} + d2 = {0: 100} + return len( + # type + bool(x), + bool(x.item()), + int(y), + int(y.item()), + float(x), + float(x.item()), + # math + x & x, + bool(x) & bool(x), + int(x) & int(x), + x | x, + bool(x) | bool(x), + int(x) | int(x), + x << x, + int(x) << int(x), + x >> x, + int(x) >> int(x), + x ^ x, + bool(x) ^ bool(x), + int(x) ^ int(x), + b * float(x), + b * int(x), + b + float(x), + b - float(x), + x.item() + y.item(), + x.item() - y.item(), + x.item() * y.item(), + x.item() / y.item(), + float(x) < float(y), + float(x) <= float(y), + float(x) > float(y), + float(x) > int(y), + float(x) >= float(y), + float(x) >= int(y), + float(x) == float(y), + float(x) == int(y), + float(x) != float(y), + int(x) != float(y), + float(x) / float(y), + int(x) / int(y), + max(x), + max(x.item(), y.item()), + max(int(x), int(y)), + max(float(x), float(y)), + min(x), + min(x.item(), y.item()), + min(int(x), int(y)), + min(float(x), float(y)), + int(l[0]), + float(l[0]), + # string + str(torch.tensor(1)), + l[2].find("t"), + l[2].replace("t", "x"), + l[2].lower(), + l[2].startswith("t"), + l[2].split("t"), + l[2].strip(), + l[2].rstrip(), + l[2].lstrip(), + l[2][slice(2)], + l[3].format("x"), + ord(l[2][0]), + len(torch.randn(3)), + len(l), + len(l[2]), + len(d), + len(d2), + ) + + +class TSCollectionOpsModule(torch.nn.Module): + def __init__(self): + super(TSCollectionOpsModule, self).__init__() + + def forward(self): + s = "abcde" + # list + l = ["1", "2", "test"] + l.reverse() + l.reverse() + l[1] = "3" + l.extend(["4"]) + # str dict + d = {"key": 1} + d.clear() + d.update({"key": 0}) + if "key" in d: + d["key"] = 2 + # int dict + d2 = {0: 100} + if 0 in d2: + d2.clear() + d2[0] = 100 + + return len( + s[torch.tensor(1)], + d["key"], + d2[0], + d.keys(), + d.items(), + d.values(), + d2.values(), + l.pop(), + ) diff --git a/test/mobile/model_test/coverage.yaml b/test/mobile/model_test/coverage.yaml new file mode 100644 index 000000000000..5433fea4df10 --- /dev/null +++ b/test/mobile/model_test/coverage.yaml @@ -0,0 +1,1094 @@ +_coverage: 87.53 +_covered_ops: 344 +_generated_ops: 693 +_production_ops: 393 +_uncovered_ops: 49 +all_generated_ops: +- aten::Bool.Tensor +- aten::Bool.int +- aten::Float.Scalar +- aten::Float.Tensor +- aten::Float.str +- aten::FloatImplicit +- aten::Int.Scalar +- aten::Int.Tensor +- aten::Int.float +- aten::Int.str +- aten::IntImplicit +- aten::ScalarImplicit +- aten::__and__.Tensor +- aten::__and__.bool +- aten::__and__.int +- aten::__contains__.int +- aten::__contains__.int_list +- aten::__contains__.str +- aten::__contains__.str_list +- aten::__derive_index 
+- aten::__getitem__.str +- aten::__getitem__.t +- aten::__lshift__.Tensor +- aten::__lshift__.int +- aten::__or__.Tensor +- aten::__or__.bool +- aten::__or__.int +- aten::__range_length +- aten::__rshift__.Tensor +- aten::__rshift__.int +- aten::__xor__.Tensor +- aten::__xor__.bool +- aten::__xor__.int +- aten::_infer_size +- aten::_set_item.int +- aten::_set_item.str +- aten::_set_item.t +- aten::_shape_as_tensor +- aten::_unique2 +- aten::abs +- aten::acos +- aten::acosh +- aten::adaptive_avg_pool1d +- aten::adaptive_avg_pool2d +- aten::adaptive_avg_pool3d +- aten::adaptive_max_pool1d +- aten::adaptive_max_pool2d +- aten::adaptive_max_pool3d +- aten::add +- aten::add.Scalar +- aten::add.Tensor +- aten::add.float +- aten::add.int +- aten::add.out +- aten::add.str +- aten::add.t +- aten::add_.Scalar +- aten::add_.Tensor +- aten::add_.t +- aten::addbmm +- aten::addcdiv +- aten::addcmul +- aten::addmm +- aten::addmv +- aten::addr +- aten::all +- aten::allclose +- aten::alpha_dropout +- aten::alpha_dropout_ +- aten::amax +- aten::amin +- aten::aminmax +- aten::angle +- aten::any +- aten::append.t +- aten::arange +- aten::arange.start +- aten::arange.start_step +- aten::argmax +- aten::argmin +- aten::argsort +- aten::as_strided +- aten::as_tensor.list +- aten::asin +- aten::asinh +- aten::atan +- aten::atan2 +- aten::atanh +- aten::atleast_1d +- aten::atleast_2d +- aten::atleast_3d +- aten::avg_pool1d +- aten::avg_pool2d +- aten::avg_pool3d +- aten::baddbmm +- aten::bartlett_window +- aten::batch_norm +- aten::bernoulli +- aten::bernoulli_.float +- aten::bilinear +- aten::binary_cross_entropy +- aten::binary_cross_entropy_with_logits +- aten::bincount +- aten::bitwise_and.Tensor +- aten::bitwise_not +- aten::bitwise_or.Tensor +- aten::bitwise_xor.Tensor +- aten::blackman_window +- aten::block_diag +- aten::bmm +- aten::broadcast_tensors +- aten::broadcast_to +- aten::bucketize.Tensor +- aten::cartesian_prod +- aten::cat +- aten::cauchy_ +- aten::cdist +- aten::ceil +- aten::ceil.Scalar +- aten::ceil.float +- aten::celu +- aten::chain_matmul +- aten::channel_shuffle +- aten::chunk +- aten::clamp +- aten::clamp_ +- aten::clamp_min +- aten::clear.int +- aten::clear.str +- aten::clone +- aten::coalesce +- aten::col2im +- aten::column_stack +- aten::combinations +- aten::complex +- aten::conj +- aten::constant_pad_nd +- aten::contiguous +- aten::conv1d +- aten::conv2d +- aten::conv3d +- aten::conv_transpose1d +- aten::conv_transpose2d.input +- aten::conv_transpose3d.input +- aten::copy_ +- aten::copy_.float +- aten::copy_.int +- aten::copysign.Scalar +- aten::copysign.Tensor +- aten::corrcoef +- aten::cos +- aten::cosh +- aten::cosine_embedding_loss +- aten::cosine_similarity +- aten::count_nonzero +- aten::cpu +- aten::cross +- aten::cross_entropy_loss +- aten::ctc_loss.Tensor +- aten::cummax +- aten::cummin +- aten::cumprod +- aten::cumsum +- aten::cumulative_trapezoid.x +- aten::deg2rad +- aten::dense_dim +- aten::dequantize.self +- aten::detach +- aten::detach_ +- aten::diag +- aten::diag_embed +- aten::diagflat +- aten::diagonal +- aten::diagonal_scatter +- aten::diff +- aten::digamma +- aten::dist +- aten::div +- aten::div.Scalar +- aten::div.Tensor +- aten::div.Tensor_mode +- aten::div.float +- aten::div.int +- aten::div_.Tensor +- aten::dot +- aten::dropout +- aten::dropout_ +- aten::dsplit.array +- aten::dstack +- aten::einsum +- aten::element_size +- aten::elu +- aten::embedding +- aten::embedding_bag.padding_idx +- aten::empty.memory_format +- aten::empty_like +- aten::empty_strided +- 
aten::eq.Scalar +- aten::eq.Tensor +- aten::eq.float +- aten::eq.float_int +- aten::eq.int +- aten::eq.int_list +- aten::eq.str +- aten::equal +- aten::erf +- aten::erfc +- aten::erfinv +- aten::exp +- aten::exp.float +- aten::exp2 +- aten::expand +- aten::expand_as +- aten::expm1 +- aten::exponential_ +- aten::extend.t +- aten::eye +- aten::fake_quantize_per_channel_affine +- aten::fake_quantize_per_tensor_affine +- aten::feature_alpha_dropout +- aten::feature_alpha_dropout_ +- aten::feature_dropout +- aten::feature_dropout_ +- aten::fill_.Scalar +- aten::fill_diagonal_ +- aten::find +- aten::flatten.using_ints +- aten::flip +- aten::fliplr +- aten::flipud +- aten::float_power.Tensor_Scalar +- aten::float_power.Tensor_Tensor +- aten::floor +- aten::floor.float +- aten::floor_divide +- aten::floor_divide.Scalar +- aten::floordiv.int +- aten::fmax +- aten::fmin +- aten::fmod.Scalar +- aten::frac +- aten::fractional_max_pool2d +- aten::fractional_max_pool3d +- aten::frobenius_norm.dim +- aten::frobenius_norm.out +- aten::full +- aten::full_like +- aten::gather +- aten::gcd +- aten::ge.Scalar +- aten::ge.Tensor +- aten::ge.float +- aten::ge.float_int +- aten::ge.int +- aten::gelu +- aten::geometric_ +- aten::glu +- aten::grid_sampler +- aten::group_norm +- aten::gru.input +- aten::gru_cell +- aten::gt.Scalar +- aten::gt.Tensor +- aten::gt.float +- aten::gt.float_int +- aten::gt.int +- aten::hamming_window +- aten::hann_window +- aten::hardshrink +- aten::hardsigmoid +- aten::hardsigmoid_ +- aten::hardswish +- aten::hardswish_ +- aten::hardtanh +- aten::hardtanh_ +- aten::heaviside +- aten::hinge_embedding_loss +- aten::histc +- aten::histogram.bin_ct +- aten::hsplit.array +- aten::hstack +- aten::huber_loss +- aten::hypot +- aten::i0 +- aten::igamma +- aten::igammac +- aten::im2col +- aten::imag +- aten::index.Tensor +- aten::index_fill.int_Scalar +- aten::index_put.hacked_twin +- aten::index_put_.hacked_twin +- aten::index_select +- aten::inner +- aten::instance_norm +- aten::is_coalesced +- aten::is_complex +- aten::is_conj +- aten::is_contiguous +- aten::is_floating_point +- aten::is_leaf +- aten::is_nonzero +- aten::is_pinned +- aten::is_set_to +- aten::is_signed +- aten::isclose +- aten::isfinite +- aten::isin.Tensor_Tensor +- aten::isinf +- aten::isnan +- aten::isneginf +- aten::isposinf +- aten::isreal +- aten::istft +- aten::item +- aten::items.str +- aten::kaiser_window +- aten::keys.str +- aten::kl_div +- aten::kron +- aten::kthvalue +- aten::l1_loss +- aten::layer_norm +- aten::lcm +- aten::ldexp.Tensor +- aten::le.Scalar +- aten::le.Tensor +- aten::le.float +- aten::le.int +- aten::leaky_relu +- aten::leaky_relu_ +- aten::len.Dict_int +- aten::len.Dict_str +- aten::len.Tensor +- aten::len.str +- aten::len.t +- aten::lerp.Scalar +- aten::lerp.Tensor +- aten::lgamma +- aten::linalg_matrix_exp +- aten::linalg_matrix_power +- aten::linear +- aten::linspace +- aten::list.t +- aten::log +- aten::log10 +- aten::log1p +- aten::log2 +- aten::log_normal_ +- aten::log_sigmoid +- aten::log_softmax.int +- aten::logaddexp +- aten::logaddexp2 +- aten::logcumsumexp +- aten::logical_and +- aten::logical_and.out +- aten::logical_not +- aten::logical_not.out +- aten::logical_or +- aten::logical_or.out +- aten::logical_xor +- aten::logical_xor.out +- aten::logit +- aten::logspace +- aten::logsumexp +- aten::lower +- aten::lstm.input +- aten::lstm_cell +- aten::lstrip +- aten::lt.Scalar +- aten::lt.Tensor +- aten::lt.float +- aten::lt.int +- aten::margin_ranking_loss +- aten::masked_fill.Scalar +- 
aten::masked_fill_.Scalar +- aten::masked_select +- aten::matmul +- aten::max +- aten::max.dim +- aten::max.other +- aten::max_pool1d +- aten::max_pool2d +- aten::max_pool3d +- aten::maximum +- aten::mean +- aten::mean.dim +- aten::median +- aten::meshgrid +- aten::meshgrid.indexing +- aten::min +- aten::min.dim +- aten::min.other +- aten::minimum +- aten::mish +- aten::mm +- aten::mode +- aten::movedim.int +- aten::mse_loss +- aten::msort +- aten::mul +- aten::mul.Scalar +- aten::mul.Tensor +- aten::mul.float +- aten::mul.float_int +- aten::mul.int +- aten::mul.int_float +- aten::mul.left_t +- aten::mul.out +- aten::mul_.Scalar +- aten::mul_.Tensor +- aten::multi_margin_loss +- aten::multilabel_margin_loss +- aten::multinomial +- aten::mv +- aten::mvlgamma +- aten::nan_to_num +- aten::nan_to_num_ +- aten::nanmean +- aten::nanmedian +- aten::nanquantile +- aten::nansum +- aten::narrow +- aten::ne.Scalar +- aten::ne.Tensor +- aten::ne.float +- aten::ne.int +- aten::ne.int_float +- aten::ne.int_list +- aten::ne.str +- aten::neg +- aten::neg.int +- aten::new_empty +- aten::new_full +- aten::new_ones +- aten::new_zeros +- aten::nll_loss_nd +- aten::nonzero +- aten::norm.Scalar +- aten::norm.ScalarOpt_dim +- aten::norm.ScalarOpt_dim_dtype +- aten::norm.dtype_out +- aten::norm.out +- aten::normal.float_float +- aten::normal_ +- aten::nuclear_norm +- aten::nuclear_norm.dim +- aten::nuclear_norm.dim_out +- aten::nuclear_norm.out +- aten::numel +- aten::one_hot +- aten::ones +- aten::ones_like +- aten::ord +- aten::outer +- aten::pad_sequence +- aten::pairwise_distance +- aten::pdist +- aten::permute +- aten::pixel_shuffle +- aten::pixel_unshuffle +- aten::poisson +- aten::poisson_nll_loss +- aten::polar +- aten::polygamma +- aten::pop.t +- aten::pow.Tensor_Scalar +- aten::pow.Tensor_Tensor +- aten::pow.int_float +- aten::prelu +- aten::prod +- aten::quantile +- aten::quantile.scalar +- aten::quantize_per_channel +- aten::quantize_per_tensor +- aten::quantize_per_tensor.tensor_qparams +- aten::quantized_gru.input +- aten::quantized_lstm.input +- aten::rad2deg +- aten::rand +- aten::rand_like +- aten::randint +- aten::randint.low +- aten::randint_like +- aten::randn +- aten::randn_like +- aten::random_ +- aten::randperm +- aten::range.step +- aten::ravel +- aten::real +- aten::reciprocal +- aten::reflection_pad1d +- aten::reflection_pad2d +- aten::reflection_pad3d +- aten::relu +- aten::relu_ +- aten::remainder.Scalar +- aten::remainder.int +- aten::renorm +- aten::repeat +- aten::repeat_interleave.Tensor +- aten::replace +- aten::replication_pad1d +- aten::replication_pad2d +- aten::replication_pad3d +- aten::requires_grad_ +- aten::reshape +- aten::resize_as_ +- aten::resolve_conj +- aten::resolve_neg +- aten::reverse.t +- aten::rnn_tanh.input +- aten::rnn_tanh_cell +- aten::roll +- aten::rot90 +- aten::round +- aten::round.Scalar +- aten::rrelu +- aten::rsqrt +- aten::rstrip +- aten::scatter.src +- aten::scatter_.src +- aten::scatter_add +- aten::scatter_add_ +- aten::searchsorted.Tensor +- aten::select.int +- aten::select_scatter +- aten::selu +- aten::sgn +- aten::sigmoid +- aten::sign +- aten::signbit +- aten::silu +- aten::sin +- aten::sinc +- aten::sinh +- aten::size +- aten::size.int +- aten::slice.Tensor +- aten::slice.str +- aten::slice.t +- aten::slice_scatter +- aten::smooth_l1_loss +- aten::soft_margin_loss +- aten::softmax.int +- aten::softplus +- aten::softshrink +- aten::sort +- aten::split +- aten::split.Tensor +- aten::split.str +- aten::sqrt +- aten::sqrt.int +- aten::square +- 
aten::squeeze.dim +- aten::squeeze_.dim +- aten::stack +- aten::startswith +- aten::std +- aten::std_mean +- aten::stft +- aten::str +- aten::strip +- aten::sub +- aten::sub.Scalar +- aten::sub.Tensor +- aten::sub.float +- aten::sub.int +- aten::sub_.Tensor +- aten::sum +- aten::sum.dim_IntList +- aten::sum.int +- aten::t +- aten::take +- aten::take_along_dim +- aten::tan +- aten::tanh +- aten::tensor +- aten::tensor.float +- aten::tensor.int +- aten::tensor_split.indices +- aten::tensor_split.sections +- aten::tensordot +- aten::tensordot.out +- aten::tile +- aten::to.device +- aten::to.dtype +- aten::to.dtype_layout +- aten::to.prim_Device +- aten::topk +- aten::trace +- aten::transpose.int +- aten::trapezoid.x +- aten::trapz.x +- aten::tril +- aten::tril_indices +- aten::triplet_margin_loss +- aten::triu +- aten::triu_indices +- aten::trunc +- aten::trunc_ +- aten::type_as +- aten::unbind.int +- aten::unflatten.int +- aten::unfold +- aten::uniform_ +- aten::unique_consecutive +- aten::unique_dim +- aten::unsqueeze +- aten::unsqueeze_ +- aten::update.str +- aten::upsample_bicubic2d.vec +- aten::upsample_bilinear2d.vec +- aten::upsample_linear1d.vec +- aten::upsample_nearest1d.vec +- aten::upsample_nearest2d.vec +- aten::upsample_nearest3d.vec +- aten::upsample_trilinear3d.vec +- aten::values.int +- aten::values.str +- aten::vander +- aten::var +- aten::var_mean +- aten::vdot +- aten::view +- aten::view_as +- aten::view_as_complex +- aten::view_as_real +- aten::vsplit.array +- aten::vstack +- aten::where +- aten::where.ScalarOther +- aten::where.self +- aten::xlogy.Scalar_Other +- aten::xlogy.Scalar_Self +- aten::xlogy.Tensor +- aten::zeros +- aten::zeros.out +- aten::zeros_like +- prepacked::conv2d_clamp_run +- prepacked::linear_clamp_run +- prim::TupleUnpack +- prim::is_meta +- prim::is_quantized +- prim::is_sparse +- prim::max +- prim::max.float +- prim::max.int +- prim::max.self_int +- prim::min +- prim::min.float +- prim::min.int +- prim::min.self_int +- prim::unchecked_cast +- quantized::add +- quantized::add_relu +- quantized::add_scalar +- quantized::batch_norm2d +- quantized::batch_norm3d +- quantized::cat +- quantized::conv1d +- quantized::conv1d_prepack +- quantized::conv1d_relu +- quantized::conv1d_unpack +- quantized::conv2d.new +- quantized::conv2d_prepack +- quantized::conv2d_relu.new +- quantized::conv2d_unpack +- quantized::conv3d.new +- quantized::conv3d_prepack +- quantized::conv3d_relu.new +- quantized::conv3d_unpack +- quantized::conv_transpose1d +- quantized::conv_transpose1d_prepack +- quantized::conv_transpose1d_unpack +- quantized::conv_transpose2d +- quantized::conv_transpose2d_prepack +- quantized::conv_transpose3d_prepack +- quantized::embedding_4bit +- quantized::embedding_byte +- quantized::hardswish +- quantized::instance_norm +- quantized::leaky_relu +- quantized::linear +- quantized::linear_dynamic +- quantized::linear_dynamic_fp16 +- quantized::linear_relu +- quantized::mul +- quantized::mul_scalar +- quantized::quantized_gru_cell_dynamic +- quantized::quantized_lstm_cell_dynamic +- quantized::quantized_rnn_tanh_cell_dynamic +covered_ops: + aten::Bool.Tensor: 19 + aten::Bool.int: 7 + aten::Float.Scalar: 18 + aten::Float.Tensor: 11 + aten::Float.str: 6 + aten::FloatImplicit: 2 + aten::Int.Scalar: 19 + aten::Int.Tensor: 35 + aten::Int.float: 6 + aten::Int.str: 12 + aten::IntImplicit: 11 + aten::ScalarImplicit: 3 + aten::__and__.Tensor: 13 + aten::__and__.bool: 11 + aten::__and__.int: 2 + aten::__contains__.int: 5 + aten::__contains__.int_list: 17 + 
aten::__contains__.str: 22 + aten::__contains__.str_list: 5 + aten::__derive_index: 24 + aten::__getitem__.str: 20 + aten::__getitem__.t: 178 + aten::__lshift__.int: 2 + aten::__range_length: 23 + aten::__rshift__.int: 2 + aten::__xor__.bool: 10 + aten::_infer_size: 7 + aten::_set_item.int: 7 + aten::_set_item.str: 163 + aten::_set_item.t: 8 + aten::_shape_as_tensor: 10 + aten::adaptive_avg_pool1d: 1 + aten::adaptive_avg_pool2d: 33 + aten::adaptive_avg_pool3d: 1 + aten::add.Scalar: 33 + aten::add.Tensor: 63 + aten::add.float: 5 + aten::add.int: 49 + aten::add.out: 2 + aten::add.str: 29 + aten::add.t: 11 + aten::add_.Scalar: 15 + aten::add_.Tensor: 29 + aten::addcmul: 2 + aten::addmm: 7 + aten::all: 6 + aten::allclose: 1 + aten::any: 14 + aten::append.t: 59 + aten::arange: 16 + aten::arange.start: 6 + aten::arange.start_step: 16 + aten::argmax: 2 + aten::as_strided: 10 + aten::as_tensor.list: 4 + aten::atan: 4 + aten::avg_pool1d: 6 + aten::avg_pool2d: 7 + aten::batch_norm: 15 + aten::binary_cross_entropy: 15 + aten::binary_cross_entropy_with_logits: 3 + aten::bitwise_not: 13 + aten::bmm: 16 + aten::broadcast_tensors: 1 + aten::cat: 90 + aten::ceil: 3 + aten::ceil.float: 7 + aten::chunk: 19 + aten::clamp: 36 + aten::clamp_: 12 + aten::clamp_min: 3 + aten::clear.str: 2 + aten::clone: 26 + aten::coalesce: 2 + aten::conj: 1 + aten::constant_pad_nd: 17 + aten::contiguous: 113 + aten::conv1d: 12 + aten::conv2d: 10 + aten::conv_transpose2d.input: 5 + aten::copy_: 15 + aten::copy_.int: 1 + aten::cos: 4 + aten::count_nonzero: 4 + aten::ctc_loss.Tensor: 1 + aten::cumsum: 13 + aten::dequantize.self: 30 + aten::detach: 34 + aten::div: 9 + aten::div.Scalar: 8 + aten::div.Tensor: 71 + aten::div.Tensor_mode: 7 + aten::div.float: 3 + aten::div.int: 7 + aten::div_.Tensor: 7 + aten::dropout: 41 + aten::embedding: 16 + aten::embedding_bag.padding_idx: 2 + aten::empty.memory_format: 11 + aten::empty_like: 11 + aten::empty_strided: 3 + aten::eq.Scalar: 24 + aten::eq.Tensor: 6 + aten::eq.int: 57 + aten::eq.int_list: 20 + aten::eq.str: 43 + aten::exp: 18 + aten::exp.float: 4 + aten::expand: 26 + aten::expand_as: 3 + aten::extend.t: 38 + aten::feature_dropout: 1 + aten::fill_.Scalar: 17 + aten::find: 3 + aten::flatten.using_ints: 45 + aten::flip: 1 + aten::floor: 5 + aten::floor.float: 2 + aten::floor_divide: 4 + aten::floor_divide.Scalar: 7 + aten::floordiv.int: 21 + aten::full: 10 + aten::full_like: 10 + aten::gather: 10 + aten::ge.Scalar: 4 + aten::ge.Tensor: 6 + aten::ge.int: 29 + aten::gelu: 12 + aten::glu: 18 + aten::grid_sampler: 3 + aten::gt.Scalar: 16 + aten::gt.float: 16 + aten::gt.float_int: 3 + aten::gt.int: 52 + aten::hardsigmoid: 3 + aten::hardsigmoid_: 2 + aten::hardswish_: 4 + aten::hardtanh: 3 + aten::hardtanh_: 3 + aten::hstack: 2 + aten::index.Tensor: 23 + aten::index_fill.int_Scalar: 15 + aten::index_select: 31 + aten::is_coalesced: 2 + aten::is_floating_point: 9 + aten::isnan: 1 + aten::item: 40 + aten::items.str: 3 + aten::keys.str: 15 + aten::layer_norm: 26 + aten::le.Scalar: 1 + aten::le.Tensor: 10 + aten::le.float: 2 + aten::le.int: 17 + aten::leaky_relu: 1 + aten::leaky_relu_: 5 + aten::len.Dict_int: 5 + aten::len.Tensor: 19 + aten::len.str: 23 + aten::len.t: 177 + aten::linear: 46 + aten::linspace: 3 + aten::list.t: 24 + aten::log: 18 + aten::log10: 4 + aten::log1p: 5 + aten::log_softmax.int: 31 + aten::logical_and: 1 + aten::logical_not: 10 + aten::logit: 7 + aten::lower: 10 + aten::lstm.input: 4 + aten::lt.Scalar: 8 + aten::lt.Tensor: 1 + aten::lt.float: 16 + aten::lt.int: 46 + 
aten::masked_fill.Scalar: 16 + aten::matmul: 12 + aten::max: 18 + aten::max.dim: 30 + aten::max.other: 7 + aten::max_pool2d: 10 + aten::maximum: 4 + aten::mean: 10 + aten::mean.dim: 16 + aten::meshgrid.indexing: 2 + aten::min: 2 + aten::min.dim: 4 + aten::min.other: 17 + aten::minimum: 4 + aten::mse_loss: 1 + aten::mul.Scalar: 26 + aten::mul.Tensor: 90 + aten::mul.float: 5 + aten::mul.float_int: 3 + aten::mul.int: 26 + aten::mul.int_float: 4 + aten::mul.left_t: 15 + aten::mul.out: 1 + aten::mul_.Scalar: 11 + aten::mul_.Tensor: 5 + aten::nan_to_num: 3 + aten::nan_to_num_: 10 + aten::narrow: 10 + aten::ne.Scalar: 14 + aten::ne.Tensor: 5 + aten::ne.int: 44 + aten::ne.int_float: 2 + aten::ne.int_list: 20 + aten::ne.str: 3 + aten::neg: 29 + aten::neg.int: 19 + aten::new_zeros: 6 + aten::nll_loss_nd: 3 + aten::nonzero: 4 + aten::norm.Scalar: 1 + aten::norm.ScalarOpt_dim: 4 + aten::numel: 8 + aten::one_hot: 2 + aten::ones: 38 + aten::ones_like: 16 + aten::ord: 20 + aten::permute: 43 + aten::pop.t: 7 + aten::pow.Tensor_Scalar: 3 + aten::pow.int_float: 2 + aten::quantile.scalar: 1 + aten::quantize_per_tensor: 66 + aten::quantize_per_tensor.tensor_qparams: 1 + aten::rand: 25 + aten::randint.low: 2 + aten::randn_like: 17 + aten::reciprocal: 1 + aten::reflection_pad2d: 1 + aten::relu: 82 + aten::relu_: 9 + aten::remainder.Scalar: 2 + aten::remainder.int: 22 + aten::repeat: 16 + aten::replace: 1 + aten::replication_pad1d: 1 + aten::replication_pad2d: 2 + aten::replication_pad3d: 1 + aten::requires_grad_: 4 + aten::reshape: 36 + aten::resize_as_: 1 + aten::resolve_conj: 1 + aten::resolve_neg: 1 + aten::reverse.t: 2 + aten::round.Scalar: 4 + aten::rstrip: 1 + aten::scatter_.src: 6 + aten::scatter_add_: 10 + aten::select.int: 57 + aten::selu: 2 + aten::sigmoid: 93 + aten::sin: 4 + aten::size: 66 + aten::size.int: 66 + aten::slice.Tensor: 75 + aten::slice.str: 12 + aten::slice.t: 43 + aten::softmax.int: 63 + aten::softplus: 2 + aten::sort: 18 + aten::split.str: 10 + aten::sqrt: 1 + aten::squeeze.dim: 26 + aten::stack: 30 + aten::startswith: 10 + aten::str: 16 + aten::strip: 3 + aten::sub: 8 + aten::sub.Scalar: 26 + aten::sub.Tensor: 94 + aten::sub.int: 52 + aten::sub_.Tensor: 4 + aten::sum: 17 + aten::sum.dim_IntList: 19 + aten::sum.int: 1 + aten::t: 3 + aten::tanh: 26 + aten::tensor: 51 + aten::tensor.float: 28 + aten::tensor.int: 34 + aten::tensor_split.indices: 4 + aten::to.device: 11 + aten::to.dtype: 23 + aten::to.dtype_layout: 27 + aten::to.prim_Device: 23 + aten::topk: 10 + aten::transpose.int: 33 + aten::triu: 10 + aten::trunc_: 3 + aten::type_as: 6 + aten::unbind.int: 24 + aten::unique_consecutive: 2 + aten::unsqueeze: 34 + aten::unsqueeze_: 6 + aten::update.str: 4 + aten::upsample_bicubic2d.vec: 1 + aten::upsample_bilinear2d.vec: 8 + aten::upsample_linear1d.vec: 1 + aten::upsample_nearest1d.vec: 2 + aten::upsample_nearest2d.vec: 30 + aten::upsample_nearest3d.vec: 2 + aten::upsample_trilinear3d.vec: 1 + aten::values.int: 3 + aten::view: 61 + aten::vstack: 1 + aten::where.ScalarOther: 4 + aten::where.self: 10 + aten::zeros: 75 + aten::zeros.out: 1 + aten::zeros_like: 7 + prepacked::conv2d_clamp_run: 32 + prepacked::linear_clamp_run: 26 + prim::TupleUnpack: 120 + prim::max.float: 7 + prim::max.int: 14 + prim::max.self_int: 17 + prim::min: 4 + prim::min.int: 35 + prim::min.self_int: 25 + prim::unchecked_cast: 100 + quantized::add: 58 + quantized::add_relu: 1 + quantized::batch_norm2d: 1 + quantized::cat: 4 + quantized::conv1d: 1 + quantized::conv2d.new: 55 + quantized::conv2d_prepack: 14 + 
quantized::conv2d_relu.new: 50 + quantized::conv_transpose2d: 2 + quantized::embedding_4bit: 1 + quantized::embedding_byte: 14 + quantized::hardswish: 1 + quantized::instance_norm: 1 + quantized::leaky_relu: 2 + quantized::linear: 27 + quantized::linear_dynamic: 21 + quantized::linear_dynamic_fp16: 18 + quantized::linear_relu: 2 + quantized::mul: 4 +uncovered_ops: + aten::__getitem__.Dict_int: 4 + aten::__getitem__.Dict_str: 39 + aten::__is__: 83 + aten::__isnot__: 81 + aten::__not__: 32 + aten::_aminmax: 4 + aten::_convolution: 12 + aten::_convolution.deprecated: 3 + aten::_make_per_tensor_quantized_tensor: 2 + aten::_pack_padded_sequence: 10 + aten::_pad_packed_sequence: 10 + aten::_reshape_from_tensor: 10 + aten::backward: 23 + aten::copy_.Tensor: 27 + aten::dequantize.list: 1 + aten::dequantize.tensor: 36 + aten::dim: 36 + aten::format: 58 + aten::get.default_str: 14 + aten::index_put_: 16 + aten::lstm.data: 8 + aten::nll_loss: 1 + aten::nll_loss2d: 1 + aten::quantized_lstm.data: 2 + aten::rsub.Scalar: 5 + aten::sparse_coo_tensor.indices: 1 + aten::sparse_resize_and_clear_: 1 + aten::to.prim_dtype: 38 + aten::true_divide.Tensor: 2 + aten::upsample_nearest2d: 7 + prepacked::conv2d_clamp_prepack: 2 + prepacked::conv2d_transpose_clamp_prepack: 1 + prepacked::conv2d_transpose_clamp_run: 1 + prim::ModuleContainerIndex.list: 2 + prim::NumToTensor.Scalar: 15 + prim::Print: 1 + prim::RaiseException: 103 + prim::TupleIndex: 157 + prim::Uninitialized: 80 + prim::device: 46 + prim::dtype: 45 + prim::is_cuda: 1 + quantized::conv2d: 4 + quantized::conv_prepack: 5 + quantized::linear_prepack: 29 + quantized::linear_prepack_fp16: 25 + quantized::linear_unpack: 4 + quantized::linear_unpack_fp16: 4 + quantized::mul.Scalar: 1 diff --git a/test/mobile/model_test/gen_test_model.py b/test/mobile/model_test/gen_test_model.py new file mode 100644 index 000000000000..e9e3908630be --- /dev/null +++ b/test/mobile/model_test/gen_test_model.py @@ -0,0 +1,243 @@ +import io +import sys +import torch +import yaml +from android_api_module import AndroidAPIModule +from builtin_ops import ( + TSBuiltinOpsModule, + TSCollectionOpsModule, +) +from math_ops import ( + PointwiseOpsModule, + ReductionOpsModule, + ComparisonOpsModule, + OtherMathOpsModule, + SpectralOpsModule, + BlasLapackOpsModule, +) +from nn_ops import ( + NNConvolutionModule, + NNPoolingModule, + NNPaddingModule, + NNNormalizationModule, + NNActivationModule, + NNRecurrentModule, + NNTransformerModule, + NNLinearModule, + NNDropoutModule, + NNSparseModule, + NNDistanceModule, + NNLossFunctionModule, + NNVisionModule, + NNShuffleModule, + NNUtilsModule, +) +from quantization_ops import ( + GeneralQuantModule, + DynamicQuantModule, + StaticQuantModule, + FusedQuantModule, +) +from sampling_ops import SamplingOpsModule +from tensor_ops import ( + TensorOpsModule, + TensorCreationOpsModule, + TensorIndexingOpsModule, + TensorTypingOpsModule, + TensorViewOpsModule, +) +from torch.jit.mobile import _load_for_lite_interpreter +from torchvision_models import MobileNetV2Module + +test_path_ios = "ios/TestApp/models/" +test_path_android = "android/pytorch_android/src/androidTest/assets/" + +production_ops_path = "test/mobile/model_test/model_ops.yaml" +coverage_out_path = "test/mobile/model_test/coverage.yaml" + +all_modules = { + # math ops + "pointwise_ops": PointwiseOpsModule(), + "reduction_ops": ReductionOpsModule(), + "comparison_ops": ComparisonOpsModule(), + "spectral_ops": SpectralOpsModule(), + "other_math_ops": OtherMathOpsModule(), + "blas_lapack_ops": 
BlasLapackOpsModule(), + # sampling + "sampling_ops": SamplingOpsModule(), + # tensor ops + "tensor_general_ops": TensorOpsModule(), + "tensor_creation_ops": TensorCreationOpsModule(), + "tensor_indexing_ops": TensorIndexingOpsModule(), + "tensor_typing_ops": TensorTypingOpsModule(), + "tensor_view_ops": TensorViewOpsModule(), + # nn ops + "convolution_ops": NNConvolutionModule(), + "pooling_ops": NNPoolingModule(), + "padding_ops": NNPaddingModule(), + "activation_ops": NNActivationModule(), + "normalization_ops": NNNormalizationModule(), + "recurrent_ops": NNRecurrentModule(), + "transformer_ops": NNTransformerModule(), + "linear_ops": NNLinearModule(), + "dropout_ops": NNDropoutModule(), + "sparse_ops": NNSparseModule(), + "distance_function_ops": NNDistanceModule(), + "loss_function_ops": NNLossFunctionModule(), + "vision_function_ops": NNVisionModule(), + "shuffle_ops": NNShuffleModule(), + "nn_utils_ops": NNUtilsModule(), + # quantization ops + "general_quant_ops": GeneralQuantModule(), + "dynamic_quant_ops": DynamicQuantModule(), + "static_quant_ops": StaticQuantModule(), + "fused_quant_ops": FusedQuantModule(), + # TorchScript builtin ops + "torchscript_builtin_ops": TSBuiltinOpsModule(), + "torchscript_collection_ops": TSCollectionOpsModule(), + # vision + "mobilenet_v2": MobileNetV2Module(), + # android api module + "android_api_module": AndroidAPIModule(), +} + +models_need_trace = [ + "static_quant_ops", +] + + +def calcOpsCoverage(ops): + with open(production_ops_path) as input_yaml_file: + production_ops_dict = yaml.safe_load(input_yaml_file) + + production_ops = set(production_ops_dict["root_operators"].keys()) + all_generated_ops = set(ops) + covered_ops = production_ops.intersection(all_generated_ops) + uncovered_ops = production_ops - covered_ops + coverage = round(100 * len(covered_ops) / len(production_ops), 2) + + # weighted coverage (take op occurrences into account) + total_occurrences = sum(production_ops_dict["root_operators"].values()) + covered_ops_dict = {op: production_ops_dict["root_operators"][op] for op in covered_ops} + uncovered_ops_dict = {op: production_ops_dict["root_operators"][op] for op in uncovered_ops} + covered_occurrences = sum(covered_ops_dict.values()) + occurrences_coverage = round(100 * covered_occurrences / total_occurrences, 2) + + print(f"\n{len(uncovered_ops)} uncovered ops: {uncovered_ops}\n") + print(f"Generated {len(all_generated_ops)} ops") + print(f"Covered {len(covered_ops)}/{len(production_ops)} ({coverage}%) production ops") + print(f"Covered {covered_occurrences}/{total_occurrences} ({occurrences_coverage}%) occurrences") + print(f"pytorch ver {torch.__version__}\n") + + with open(coverage_out_path, "w") as f: + yaml.safe_dump( + { + "_covered_ops": len(covered_ops), + "_production_ops": len(production_ops), + "_generated_ops": len(all_generated_ops), + "_uncovered_ops": len(uncovered_ops), + "_coverage": round(coverage, 2), + "uncovered_ops": uncovered_ops_dict, + "covered_ops": covered_ops_dict, + "all_generated_ops": sorted(list(all_generated_ops)), + }, + f, + ) + + +def getModuleFromName(model_name): + if model_name not in all_modules: + print("Cannot find test model for " + model_name) + return None, [] + + module = all_modules[model_name] + if not isinstance(module, torch.nn.Module): + module = module.getModule() + + has_bundled_inputs = False # module.find_method("get_all_bundled_inputs") + + if model_name in models_need_trace: + module = torch.jit.trace(module, []) + else: + module = torch.jit.script(module) + + ops = 
torch.jit.export_opnames(module) + print(ops) + + # try to run the model + runModule(module) + + return module, ops + + +def runModule(module): + buffer = io.BytesIO(module._save_to_buffer_for_lite_interpreter()) + buffer.seek(0) + lite_module = _load_for_lite_interpreter(buffer) + if lite_module.find_method("get_all_bundled_inputs"): + # run with the first bundled input + input = lite_module.run_method("get_all_bundled_inputs")[0] + lite_module.forward(*input) + else: + # assuming model has no input + lite_module() + + +# generate all models in the given folder. +# If it's "on the fly" mode, add "_temp" suffix to the model file. +def generateAllModels(folder, on_the_fly=False): + all_ops = [] + for name in all_modules: + module, ops = getModuleFromName(name) + all_ops = all_ops + ops + path = folder + name + ("_temp.ptl" if on_the_fly else ".ptl") + module._save_for_lite_interpreter(path) + print("model saved to " + path) + calcOpsCoverage(all_ops) + + +# generate/update a given model for storage +def generateModel(name): + module, ops = getModuleFromName(name) + if module is None: + return + path_ios = test_path_ios + name + ".ptl" + path_android = test_path_android + name + ".ptl" + module._save_for_lite_interpreter(path_ios) + module._save_for_lite_interpreter(path_android) + print("model saved to " + path_ios + " and " + path_android) + + +def main(argv): + if argv is None or len(argv) != 1: + print( + """ +This script generates models for mobile tests. For each model we have a "storage" version +and an "on-the-fly" version. The "on-the-fly" version will be generated during test, and +should not be committed to the repo. +The "storage" version is for backward compatibility testing (a model generated today should +run on master branch in the next 6 months). We can use this script to update a model that +is no longer supported. 
+- use 'python gen_test_model.py android-test' to generate on-the-fly models for android +- use 'python gen_test_model.py ios-test' to generate on-the-fly models for ios +- use 'python gen_test_model.py android' to generate checked-in models for android +- use 'python gen_test_model.py ios' to generate checked-in models for ios +- use 'python gen_test_model.py <model_name>' to update the given storage model +""" + ) + return + + if argv[0] == "android": + generateAllModels(test_path_android, on_the_fly=False) + elif argv[0] == "ios": + generateAllModels(test_path_ios, on_the_fly=False) + elif argv[0] == "android-test": + generateAllModels(test_path_android, on_the_fly=True) + elif argv[0] == "ios-test": + generateAllModels(test_path_ios, on_the_fly=True) + else: + generateModel(argv[0]) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/test/mobile/model_test/math_ops.py b/test/mobile/model_test/math_ops.py new file mode 100644 index 000000000000..f89e3bca70d6 --- /dev/null +++ b/test/mobile/model_test/math_ops.py @@ -0,0 +1,469 @@ +# https://pytorch.org/docs/stable/torch.html#math-operations + +import math + +import torch + + +class PointwiseOpsModule(torch.nn.Module): + def __init__(self): + super(PointwiseOpsModule, self).__init__() + + def forward(self): + return self.pointwise_ops() + + def pointwise_ops(self): + a = torch.randn(4) + b = torch.randn(4) + t = torch.tensor([-1, -2, 3], dtype=torch.int8) + r = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + t = torch.tensor([-1, -2, 3], dtype=torch.int8) + s = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + f = torch.zeros(3) + g = torch.tensor([-1, 0, 1]) + w = torch.tensor([0.3810, 1.2774, -0.2972, -0.3719, 0.4637]) + return len( + torch.abs(torch.tensor([-1, -2, 3])), + torch.absolute(torch.tensor([-1, -2, 3])), + torch.acos(a), + torch.arccos(a), + torch.acosh(a.uniform_(1.0, 2.0)), + torch.add(a, 20), + torch.add(a, b, out=a), + b.add(a), + b.add(a, out=b), + b.add_(a), + b.add(1), + torch.add(a, torch.randn(4, 1), alpha=10), + torch.addcdiv( + torch.randn(1, 3), torch.randn(3, 1), torch.randn(1, 3), value=0.1 + ), + torch.addcmul( + torch.randn(1, 3), torch.randn(3, 1), torch.randn(1, 3), value=0.1 + ), + torch.angle(a), + torch.asin(a), + torch.arcsin(a), + torch.asinh(a), + torch.arcsinh(a), + torch.atan(a), + torch.arctan(a), + torch.atanh(a.uniform_(-1.0, 1.0)), + torch.arctanh(a.uniform_(-1.0, 1.0)), + torch.atan2(a, a), + torch.bitwise_not(t), + torch.bitwise_and(t, torch.tensor([1, 0, 3], dtype=torch.int8)), + torch.bitwise_or(t, torch.tensor([1, 0, 3], dtype=torch.int8)), + torch.bitwise_xor(t, torch.tensor([1, 0, 3], dtype=torch.int8)), + torch.ceil(a), + torch.ceil(float(torch.tensor(0.5))), + torch.ceil(torch.tensor(0.5).item()), + torch.clamp(a, min=-0.5, max=0.5), + torch.clamp(a, min=0.5), + torch.clamp(a, max=0.5), + torch.clip(a, min=-0.5, max=0.5), + torch.conj(a), + torch.copysign(a, 1), + torch.copysign(a, b), + torch.cos(a), + torch.cosh(a), + torch.deg2rad( + torch.tensor([[180.0, -180.0], [360.0, -360.0], [90.0, -90.0]]) + ), + torch.div(a, b), + a.div(b), + a.div(1), + a.div_(b), + torch.divide(a, b, rounding_mode="trunc"), + torch.divide(a, b, rounding_mode="floor"), + torch.digamma(torch.tensor([1.0, 0.5])), + torch.erf(torch.tensor([0.0, -1.0, 10.0])), + torch.erfc(torch.tensor([0.0, -1.0, 10.0])), + torch.erfinv(torch.tensor([0.0, 0.5, -1.0])), + torch.exp(torch.tensor([0.0, math.log(2.0)])), + torch.exp(float(torch.tensor(1))), + torch.exp2(torch.tensor([0.0, math.log(2.0), 3.0, 4.0])), + 
torch.expm1(torch.tensor([0.0, math.log(2.0)])), + torch.fake_quantize_per_channel_affine( + torch.randn(2, 2, 2), + (torch.randn(2) + 1) * 0.05, + torch.zeros(2), + 1, + 0, + 255, + ), + torch.fake_quantize_per_tensor_affine(a, 0.1, 0, 0, 255), + torch.float_power(torch.randint(10, (4,)), 2), + torch.float_power(torch.arange(1, 5), torch.tensor([2, -3, 4, -5])), + torch.floor(a), + torch.floor(float(torch.tensor(1))), + torch.floor_divide(torch.tensor([4.0, 3.0]), torch.tensor([2.0, 2.0])), + torch.floor_divide(torch.tensor([4.0, 3.0]), 1.4), + torch.fmod(torch.tensor([-3, -2, -1, 1, 2, 3]), 2), + torch.fmod(torch.tensor([1, 2, 3, 4, 5]), 1.5), + torch.frac(torch.tensor([1.0, 2.5, -3.2])), + torch.randn(4, dtype=torch.cfloat).imag, + torch.ldexp(torch.tensor([1.0]), torch.tensor([1])), + torch.ldexp(torch.tensor([1.0]), torch.tensor([1, 2, 3, 4])), + torch.lerp(torch.arange(1.0, 5.0), torch.empty(4).fill_(10), 0.5), + torch.lerp( + torch.arange(1.0, 5.0), + torch.empty(4).fill_(10), + torch.full_like(torch.arange(1.0, 5.0), 0.5), + ), + torch.lgamma(torch.arange(0.5, 2, 0.5)), + torch.log(torch.arange(5) + 10), + torch.log10(torch.rand(5)), + torch.log1p(torch.randn(5)), + torch.log2(torch.rand(5)), + torch.logaddexp(torch.tensor([-1.0]), torch.tensor([-1, -2, -3])), + torch.logaddexp( + torch.tensor([-100.0, -200.0, -300.0]), torch.tensor([-1, -2, -3]) + ), + torch.logaddexp( + torch.tensor([1.0, 2000.0, 30000.0]), torch.tensor([-1, -2, -3]) + ), + torch.logaddexp2(torch.tensor([-1.0]), torch.tensor([-1, -2, -3])), + torch.logaddexp2( + torch.tensor([-100.0, -200.0, -300.0]), torch.tensor([-1, -2, -3]) + ), + torch.logaddexp2( + torch.tensor([1.0, 2000.0, 30000.0]), torch.tensor([-1, -2, -3]) + ), + torch.logical_and(r, s), + torch.logical_and(r.double(), s.double()), + torch.logical_and(r.double(), s), + torch.logical_and(r, s, out=torch.empty(4, dtype=torch.bool)), + torch.logical_not(torch.tensor([0, 1, -10], dtype=torch.int8)), + torch.logical_not(torch.tensor([0.0, 1.5, -10.0], dtype=torch.double)), + torch.logical_not( + torch.tensor([0.0, 1.0, -10.0], dtype=torch.double), + out=torch.empty(3, dtype=torch.int16), + ), + torch.logical_or(r, s), + torch.logical_or(r.double(), s.double()), + torch.logical_or(r.double(), s), + torch.logical_or(r, s, out=torch.empty(4, dtype=torch.bool)), + torch.logical_xor(r, s), + torch.logical_xor(r.double(), s.double()), + torch.logical_xor(r.double(), s), + torch.logical_xor(r, s, out=torch.empty(4, dtype=torch.bool)), + torch.logit(torch.rand(5), eps=1e-6), + torch.hypot(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0])), + torch.i0(torch.arange(5, dtype=torch.float32)), + torch.igamma(a, b), + torch.igammac(a, b), + torch.mul(torch.randn(3), 100), + b.mul(a), + b.mul(5), + b.mul(a, out=b), + b.mul_(a), + b.mul_(5), + torch.multiply(torch.randn(4, 1), torch.randn(1, 4)), + torch.mvlgamma(torch.empty(2, 3).uniform_(1.0, 2.0), 2), + torch.tensor([float("nan"), float("inf"), -float("inf"), 3.14]), + torch.nan_to_num(w), + torch.nan_to_num_(w), + torch.nan_to_num(w, nan=2.0), + torch.nan_to_num(w, nan=2.0, posinf=1.0), + torch.neg(torch.randn(5)), + # torch.nextafter(torch.tensor([1, 2]), torch.tensor([2, 1])) == torch.tensor([eps + 1, 2 - eps]), + torch.polygamma(1, torch.tensor([1.0, 0.5])), + torch.polygamma(2, torch.tensor([1.0, 0.5])), + torch.polygamma(3, torch.tensor([1.0, 0.5])), + torch.polygamma(4, torch.tensor([1.0, 0.5])), + torch.pow(a, 2), + torch.pow(2, float(torch.tensor(0.5))), + torch.pow(torch.arange(1.0, 5.0), torch.arange(1.0, 
5.0)), + torch.rad2deg( + torch.tensor([[3.142, -3.142], [6.283, -6.283], [1.570, -1.570]]) + ), + torch.randn(4, dtype=torch.cfloat).real, + torch.reciprocal(a), + torch.remainder(torch.tensor([-3.0, -2.0]), 2), + torch.remainder(torch.tensor([1, 2, 3, 4, 5]), 1.5), + torch.round(a), + torch.round(torch.tensor(0.5).item()), + torch.rsqrt(a), + torch.sigmoid(a), + torch.sign(torch.tensor([0.7, -1.2, 0.0, 2.3])), + torch.sgn(a), + torch.signbit(torch.tensor([0.7, -1.2, 0.0, 2.3])), + torch.sin(a), + torch.sinc(a), + torch.sinh(a), + torch.sqrt(a), + torch.square(a), + torch.sub(torch.tensor((1, 2)), torch.tensor((0, 1)), alpha=2), + b.sub(a), + b.sub_(a), + b.sub(5), + torch.sum(5), + torch.tan(a), + torch.tanh(a), + torch.true_divide(a, a), + torch.trunc(a), + torch.trunc_(a), + torch.xlogy(f, g), + torch.xlogy(f, g), + torch.xlogy(f, 4), + torch.xlogy(2, g), + ) + + +class ReductionOpsModule(torch.nn.Module): + def __init__(self): + super(ReductionOpsModule, self).__init__() + + def forward(self): + return self.reduction_ops() + + def reduction_ops(self): + a = torch.randn(4) + b = torch.randn(4) + c = torch.tensor(0.5) + return len( + torch.argmax(a), + torch.argmin(a), + torch.amax(a), + torch.amin(a), + torch.aminmax(a), + torch.all(a), + torch.any(a), + torch.max(a), + a.max(a), + torch.max(a, 0), + torch.min(a), + a.min(a), + torch.min(a, 0), + torch.dist(a, b), + torch.logsumexp(a, 0), + torch.mean(a), + torch.mean(a, 0), + torch.nanmean(a), + torch.median(a), + torch.nanmedian(a), + torch.mode(a), + torch.norm(a), + a.norm(2), + torch.norm(a, dim=0), + torch.norm(c, torch.tensor(2)), + torch.nansum(a), + torch.prod(a), + torch.quantile(a, torch.tensor([0.25, 0.5, 0.75])), + torch.quantile(a, 0.5), + torch.nanquantile(a, torch.tensor([0.25, 0.5, 0.75])), + torch.std(a), + torch.std_mean(a), + torch.sum(a), + torch.unique(a), + torch.unique_consecutive(a), + torch.var(a), + torch.var_mean(a), + torch.count_nonzero(a), + ) + + +class ComparisonOpsModule(torch.nn.Module): + def __init__(self): + super(ComparisonOpsModule, self).__init__() + + def forward(self): + a = torch.tensor(0) + b = torch.tensor(1) + return len( + torch.allclose(a, b), + torch.argsort(a), + torch.eq(a, b), + torch.eq(a, 1), + torch.equal(a, b), + torch.ge(a, b), + torch.ge(a, 1), + torch.greater_equal(a, b), + torch.greater_equal(a, 1), + torch.gt(a, b), + torch.gt(a, 1), + torch.greater(a, b), + torch.isclose(a, b), + torch.isfinite(a), + torch.isin(a, b), + torch.isinf(a), + torch.isposinf(a), + torch.isneginf(a), + torch.isnan(a), + torch.isreal(a), + torch.kthvalue(a, 1), + torch.le(a, b), + torch.le(a, 1), + torch.less_equal(a, b), + torch.lt(a, b), + torch.lt(a, 1), + torch.less(a, b), + torch.maximum(a, b), + torch.minimum(a, b), + torch.fmax(a, b), + torch.fmin(a, b), + torch.ne(a, b), + torch.ne(a, 1), + torch.not_equal(a, b), + torch.sort(a), + torch.topk(a, 1), + torch.msort(a), + ) + + +class OtherMathOpsModule(torch.nn.Module): + def __init__(self): + super(OtherMathOpsModule, self).__init__() + + def forward(self): + return self.other_ops() + + def other_ops(self): + a = torch.randn(4) + b = torch.randn(4) + c = torch.randint(0, 8, (5,), dtype=torch.int64) + e = torch.randn(4, 3) + f = torch.randn(4, 4, 4) + size = [0, 1] + dims = [0, 1] + return len( + torch.atleast_1d(a), + torch.atleast_2d(a), + torch.atleast_3d(a), + torch.bincount(c), + torch.block_diag(a), + torch.broadcast_tensors(a), + torch.broadcast_to(a, (4)), + # torch.broadcast_shapes(a), + torch.bucketize(a, b), + 
torch.cartesian_prod(a), + torch.cdist(e, e), + torch.clone(a), + torch.combinations(a), + torch.corrcoef(a), + # torch.cov(a), + torch.cross(e, e), + torch.cummax(a, 0), + torch.cummin(a, 0), + torch.cumprod(a, 0), + torch.cumsum(a, 0), + torch.diag(a), + torch.diag_embed(a), + torch.diagflat(a), + torch.diagonal(e), + torch.diff(a), + torch.einsum("iii", f), + torch.flatten(a), + torch.flip(e, dims), + torch.fliplr(e), + torch.flipud(e), + torch.kron(a, b), + torch.rot90(e), + torch.gcd(c, c), + torch.histc(a), + torch.histogram(a), + torch.meshgrid(a), + torch.meshgrid(a, indexing="xy"), + torch.lcm(c, c), + torch.logcumsumexp(a, 0), + torch.ravel(a), + torch.renorm(e, 1, 0, 5), + torch.repeat_interleave(c), + torch.roll(a, 1, 0), + torch.searchsorted(a, b), + torch.tensordot(e, e), + torch.trace(e), + torch.tril(e), + torch.tril_indices(3, 3), + torch.triu(e), + torch.triu_indices(3, 3), + torch.vander(a), + torch.view_as_real(torch.randn(4, dtype=torch.cfloat)), + torch.view_as_complex(torch.randn(4, 2)).real, + torch.resolve_conj(a), + torch.resolve_neg(a), + ) + + +class SpectralOpsModule(torch.nn.Module): + def __init__(self): + super(SpectralOpsModule, self).__init__() + + def forward(self): + return self.spectral_ops() + + def spectral_ops(self): + a = torch.randn(10) + b = torch.randn(10, 8, 4, 2) + return len( + torch.stft(a, 8), + torch.stft(a, torch.tensor(8)), + torch.istft(b, 8), + torch.bartlett_window(2, dtype=torch.float), + torch.blackman_window(2, dtype=torch.float), + torch.hamming_window(4, dtype=torch.float), + torch.hann_window(4, dtype=torch.float), + torch.kaiser_window(4, dtype=torch.float), + ) + + +class BlasLapackOpsModule(torch.nn.Module): + def __init__(self): + super(BlasLapackOpsModule, self).__init__() + + def forward(self): + return self.blas_lapack_ops() + + def blas_lapack_ops(self): + m = torch.randn(3, 3) + a = torch.randn(10, 3, 4) + b = torch.randn(10, 4, 3) + v = torch.randn(3) + return len( + torch.addbmm(m, a, b), + torch.addmm(torch.randn(2, 3), torch.randn(2, 3), torch.randn(3, 3)), + torch.addmv(torch.randn(2), torch.randn(2, 3), torch.randn(3)), + torch.addr(torch.zeros(3, 3), v, v), + torch.baddbmm(m, a, b), + torch.bmm(a, b), + torch.chain_matmul(torch.randn(3, 3), torch.randn(3, 3), torch.randn(3, 3)), + # torch.cholesky(a), # deprecated + # torch.cholesky_inverse(torch.randn(3, 3)), # had some error + # torch.cholesky_solve(torch.randn(3, 3), torch.randn(3, 3)), + torch.dot(v, v), + # torch.linalg.eig(m), # not build with lapack + # torch.geqrf(a), + torch.ger(v, v), + torch.inner(m, m), + # torch.inverse(m), + # torch.det(m), + # torch.logdet(m), + # torch.slogdet(m), + # torch.lstsq(m, m), + # torch.lu(m), + # torch.lu_solve(m, *torch.lu(m)), + # torch.lu_unpack(*torch.lu(m)), + torch.matmul(m, m), + torch.matrix_power(m, 2), + # torch.matrix_rank(m), + torch.matrix_exp(m), + torch.mm(m, m), + torch.mv(m, v), + # torch.orgqr(a, m), + # torch.ormqr(a, m, v), + torch.outer(v, v), + # torch.pinverse(m), + # torch.qr(a), + # torch.solve(m, m), + # torch.svd(a), + # torch.svd_lowrank(a), + # torch.pca_lowrank(a), + # torch.symeig(a), # deprecated + # torch.lobpcg(a, b), # not supported + torch.trapz(m, m), + torch.trapezoid(m, m), + torch.cumulative_trapezoid(m, m), + # torch.triangular_solve(m, m), + torch.vdot(v, v), + ) diff --git a/test/mobile/model_test/model_ops.yaml b/test/mobile/model_test/model_ops.yaml new file mode 100644 index 000000000000..06a3640e4cbe --- /dev/null +++ b/test/mobile/model_test/model_ops.yaml @@ -0,0 +1,752 @@ 
+root_operators: + aten::Bool.Tensor: 19 + aten::Bool.int: 7 + aten::Float.Scalar: 18 + aten::Float.Tensor: 11 + aten::Float.str: 6 + aten::FloatImplicit: 2 + aten::Int.Scalar: 19 + aten::Int.Tensor: 35 + aten::Int.float: 6 + aten::Int.str: 12 + aten::IntImplicit: 11 + aten::ScalarImplicit: 3 + aten::__and__.Tensor: 13 + aten::__and__.bool: 11 + aten::__and__.int: 2 + aten::__contains__.int: 5 + aten::__contains__.int_list: 17 + aten::__contains__.str: 22 + aten::__contains__.str_list: 5 + aten::__derive_index: 24 + aten::__getitem__.Dict_int: 4 + aten::__getitem__.Dict_str: 39 + aten::__getitem__.str: 20 + aten::__getitem__.t: 178 + aten::__is__: 83 + aten::__isnot__: 81 + aten::__lshift__.int: 2 + aten::__not__: 32 + aten::__range_length: 23 + aten::__rshift__.int: 2 + aten::__xor__.bool: 10 + aten::_aminmax: 4 + aten::_convolution: 12 + aten::_convolution.deprecated: 3 + aten::_infer_size: 7 + aten::_make_per_tensor_quantized_tensor: 2 + aten::_pack_padded_sequence: 10 + aten::_pad_packed_sequence: 10 + aten::_reshape_from_tensor: 10 + aten::_set_item.int: 7 + aten::_set_item.str: 163 + aten::_set_item.t: 8 + aten::_shape_as_tensor: 10 + aten::adaptive_avg_pool1d: 1 + aten::adaptive_avg_pool2d: 33 + aten::adaptive_avg_pool3d: 1 + aten::add.Scalar: 33 + aten::add.Tensor: 63 + aten::add.float: 5 + aten::add.int: 49 + aten::add.out: 2 + aten::add.str: 29 + aten::add.t: 11 + aten::add_.Scalar: 15 + aten::add_.Tensor: 29 + aten::addcmul: 2 + aten::addmm: 7 + aten::all: 6 + aten::allclose: 1 + aten::any: 14 + aten::append.t: 59 + aten::arange: 16 + aten::arange.start: 6 + aten::arange.start_step: 16 + aten::argmax: 2 + aten::as_strided: 10 + aten::as_tensor.list: 4 + aten::atan: 4 + aten::avg_pool1d: 6 + aten::avg_pool2d: 7 + aten::backward: 23 + aten::batch_norm: 15 + aten::binary_cross_entropy: 15 + aten::binary_cross_entropy_with_logits: 3 + aten::bitwise_not: 13 + aten::bmm: 16 + aten::broadcast_tensors: 1 + aten::cat: 90 + aten::ceil: 3 + aten::ceil.float: 7 + aten::chunk: 19 + aten::clamp: 36 + aten::clamp_: 12 + aten::clamp_min: 3 + aten::clear.str: 2 + aten::clone: 26 + aten::coalesce: 2 + aten::conj: 1 + aten::constant_pad_nd: 17 + aten::contiguous: 113 + aten::conv1d: 12 + aten::conv2d: 10 + aten::conv_transpose2d.input: 5 + aten::copy_: 15 + aten::copy_.Tensor: 27 + aten::copy_.int: 1 + aten::cos: 4 + aten::count_nonzero: 4 + aten::ctc_loss.Tensor: 1 + aten::cumsum: 13 + aten::dequantize.list: 1 + aten::dequantize.self: 30 + aten::dequantize.tensor: 36 + aten::detach: 34 + aten::dim: 36 + aten::div: 9 + aten::div.Scalar: 8 + aten::div.Tensor: 71 + aten::div.Tensor_mode: 7 + aten::div.float: 3 + aten::div.int: 7 + aten::div_.Tensor: 7 + aten::dropout: 41 + aten::embedding: 16 + aten::embedding_bag.padding_idx: 2 + aten::empty.memory_format: 11 + aten::empty_like: 11 + aten::empty_strided: 3 + aten::eq.Scalar: 24 + aten::eq.Tensor: 6 + aten::eq.int: 57 + aten::eq.int_list: 20 + aten::eq.str: 43 + aten::exp: 18 + aten::exp.float: 4 + aten::expand: 26 + aten::expand_as: 3 + aten::extend.t: 38 + aten::feature_dropout: 1 + aten::fill_.Scalar: 17 + aten::find: 3 + aten::flatten.using_ints: 45 + aten::flip: 1 + aten::floor: 5 + aten::floor.float: 2 + aten::floor_divide: 4 + aten::floor_divide.Scalar: 7 + aten::floordiv.int: 21 + aten::format: 58 + aten::full: 10 + aten::full_like: 10 + aten::gather: 10 + aten::ge.Scalar: 4 + aten::ge.Tensor: 6 + aten::ge.int: 29 + aten::gelu: 12 + aten::get.default_str: 14 + aten::glu: 18 + aten::grid_sampler: 3 + aten::gt.Scalar: 16 + aten::gt.float: 16 + 
aten::gt.float_int: 3 + aten::gt.int: 52 + aten::hardsigmoid: 3 + aten::hardsigmoid_: 2 + aten::hardswish_: 4 + aten::hardtanh: 3 + aten::hardtanh_: 3 + aten::hstack: 2 + aten::index.Tensor: 23 + aten::index_fill.int_Scalar: 15 + aten::index_put_: 16 + aten::index_select: 31 + aten::is_coalesced: 2 + aten::is_floating_point: 9 + aten::isnan: 1 + aten::item: 40 + aten::items.str: 3 + aten::keys.str: 15 + aten::layer_norm: 26 + aten::le.Scalar: 1 + aten::le.Tensor: 10 + aten::le.float: 2 + aten::le.int: 17 + aten::leaky_relu: 1 + aten::leaky_relu_: 5 + aten::len.Dict_int: 5 + aten::len.Tensor: 19 + aten::len.str: 23 + aten::len.t: 177 + aten::linear: 46 + aten::linspace: 3 + aten::list.t: 24 + aten::log: 18 + aten::log10: 4 + aten::log1p: 5 + aten::log_softmax.int: 31 + aten::logical_and: 1 + aten::logical_not: 10 + aten::logit: 7 + aten::lower: 10 + aten::lstm.data: 8 + aten::lstm.input: 4 + aten::lt.Scalar: 8 + aten::lt.Tensor: 1 + aten::lt.float: 16 + aten::lt.int: 46 + aten::masked_fill.Scalar: 16 + aten::matmul: 12 + aten::max: 18 + aten::max.dim: 30 + aten::max.other: 7 + aten::max_pool2d: 10 + aten::maximum: 4 + aten::mean: 10 + aten::mean.dim: 16 + aten::meshgrid.indexing: 2 + aten::min: 2 + aten::min.dim: 4 + aten::min.other: 17 + aten::minimum: 4 + aten::mse_loss: 1 + aten::mul.Scalar: 26 + aten::mul.Tensor: 90 + aten::mul.float: 5 + aten::mul.float_int: 3 + aten::mul.int: 26 + aten::mul.int_float: 4 + aten::mul.left_t: 15 + aten::mul.out: 1 + aten::mul_.Scalar: 11 + aten::mul_.Tensor: 5 + aten::nan_to_num: 3 + aten::nan_to_num_: 10 + aten::narrow: 10 + aten::ne.Scalar: 14 + aten::ne.Tensor: 5 + aten::ne.int: 44 + aten::ne.int_float: 2 + aten::ne.int_list: 20 + aten::ne.str: 3 + aten::neg: 29 + aten::neg.int: 19 + aten::new_zeros: 6 + aten::nll_loss: 1 + aten::nll_loss2d: 1 + aten::nll_loss_nd: 3 + aten::nonzero: 4 + aten::norm.Scalar: 1 + aten::norm.ScalarOpt_dim: 4 + aten::numel: 8 + aten::one_hot: 2 + aten::ones: 38 + aten::ones_like: 16 + aten::ord: 20 + aten::permute: 43 + aten::pop.t: 7 + aten::pow.Tensor_Scalar: 3 + aten::pow.int_float: 2 + aten::quantile.scalar: 1 + aten::quantize_per_tensor: 66 + aten::quantize_per_tensor.tensor_qparams: 1 + aten::quantized_lstm.data: 2 + aten::rand: 25 + aten::randint.low: 2 + aten::randn_like: 17 + aten::reciprocal: 1 + aten::reflection_pad2d: 1 + aten::relu: 82 + aten::relu_: 9 + aten::remainder.Scalar: 2 + aten::remainder.int: 22 + aten::repeat: 16 + aten::replace: 1 + aten::replication_pad1d: 1 + aten::replication_pad2d: 2 + aten::replication_pad3d: 1 + aten::requires_grad_: 4 + aten::reshape: 36 + aten::resize_as_: 1 + aten::resolve_conj: 1 + aten::resolve_neg: 1 + aten::reverse.t: 2 + aten::round.Scalar: 4 + aten::rstrip: 1 + aten::rsub.Scalar: 5 + aten::scatter_.src: 6 + aten::scatter_add_: 10 + aten::select.int: 57 + aten::selu: 2 + aten::sigmoid: 93 + aten::sin: 4 + aten::size: 66 + aten::size.int: 66 + aten::slice.Tensor: 75 + aten::slice.str: 12 + aten::slice.t: 43 + aten::softmax.int: 63 + aten::softplus: 2 + aten::sort: 18 + aten::sparse_coo_tensor.indices: 1 + aten::sparse_resize_and_clear_: 1 + aten::split.str: 10 + aten::sqrt: 1 + aten::squeeze.dim: 26 + aten::stack: 30 + aten::startswith: 10 + aten::str: 16 + aten::strip: 3 + aten::sub: 8 + aten::sub.Scalar: 26 + aten::sub.Tensor: 94 + aten::sub.int: 52 + aten::sub_.Tensor: 4 + aten::sum: 17 + aten::sum.dim_IntList: 19 + aten::sum.int: 1 + aten::t: 3 + aten::tanh: 26 + aten::tensor: 51 + aten::tensor.float: 28 + aten::tensor.int: 34 + aten::tensor_split.indices: 4 + 
aten::to.device: 11 + aten::to.dtype: 23 + aten::to.dtype_layout: 27 + aten::to.prim_Device: 23 + aten::to.prim_dtype: 38 + aten::topk: 10 + aten::transpose.int: 33 + aten::triu: 10 + aten::true_divide.Tensor: 2 + aten::trunc_: 3 + aten::type_as: 6 + aten::unbind.int: 24 + aten::unique_consecutive: 2 + aten::unsqueeze: 34 + aten::unsqueeze_: 6 + aten::update.str: 4 + aten::upsample_bicubic2d.vec: 1 + aten::upsample_bilinear2d.vec: 8 + aten::upsample_linear1d.vec: 1 + aten::upsample_nearest1d.vec: 2 + aten::upsample_nearest2d: 7 + aten::upsample_nearest2d.vec: 30 + aten::upsample_nearest3d.vec: 2 + aten::upsample_trilinear3d.vec: 1 + aten::values.int: 3 + aten::view: 61 + aten::vstack: 1 + aten::where.ScalarOther: 4 + aten::where.self: 10 + aten::zeros: 75 + aten::zeros.out: 1 + aten::zeros_like: 7 + prepacked::conv2d_clamp_prepack: 2 + prepacked::conv2d_clamp_run: 32 + prepacked::conv2d_transpose_clamp_prepack: 1 + prepacked::conv2d_transpose_clamp_run: 1 + prepacked::linear_clamp_run: 26 + prim::ModuleContainerIndex.list: 2 + prim::NumToTensor.Scalar: 15 + prim::Print: 1 + prim::RaiseException: 103 + prim::TupleIndex: 157 + prim::TupleUnpack: 120 + prim::Uninitialized: 80 + prim::device: 46 + prim::dtype: 45 + prim::is_cuda: 1 + prim::max.float: 7 + prim::max.int: 14 + prim::max.self_int: 17 + prim::min: 4 + prim::min.int: 35 + prim::min.self_int: 25 + prim::unchecked_cast: 100 + quantized::add: 58 + quantized::add_relu: 1 + quantized::batch_norm2d: 1 + quantized::cat: 4 + quantized::conv1d: 1 + quantized::conv2d: 4 + quantized::conv2d.new: 55 + quantized::conv2d_prepack: 14 + quantized::conv2d_relu.new: 50 + quantized::conv_prepack: 5 + quantized::conv_transpose2d: 2 + quantized::embedding_4bit: 1 + quantized::embedding_byte: 14 + quantized::hardswish: 1 + quantized::instance_norm: 1 + quantized::leaky_relu: 2 + quantized::linear: 27 + quantized::linear_dynamic: 21 + quantized::linear_dynamic_fp16: 18 + quantized::linear_prepack: 29 + quantized::linear_prepack_fp16: 25 + quantized::linear_relu: 2 + quantized::linear_unpack: 4 + quantized::linear_unpack_fp16: 4 + quantized::mul: 4 + quantized::mul.Scalar: 1 +traced_operators: + aten::__and__.Tensor: 13 + aten::__iand__.Tensor: 1 + aten::__ior__.Tensor: 1 + aten::_adaptive_avg_pool2d: 23 + aten::_aminmax: 4 + aten::_batch_norm_impl_index: 15 + aten::_cat: 95 + aten::_coalesce: 2 + aten::_coalesced_: 3 + aten::_convolution: 34 + aten::_convolution.deprecated: 3 + aten::_ctc_loss: 1 + aten::_embedding_bag: 2 + aten::_embedding_bag_backward: 1 + aten::_embedding_bag_sparse_backward: 1 + aten::_empty_affine_quantized: 87 + aten::_empty_per_channel_affine_quantized: 28 + aten::_index_put_impl_: 16 + aten::_indices: 4 + aten::_local_scalar_dense: 188 + aten::_log_softmax: 28 + aten::_log_softmax_backward_data: 4 + aten::_make_per_tensor_quantized_tensor: 2 + aten::_nnz: 3 + aten::_pack_padded_sequence: 10 + aten::_pack_padded_sequence_backward: 3 + aten::_pad_packed_sequence: 10 + aten::_reshape_alias: 93 + aten::_reshape_from_tensor: 10 + aten::_s_where: 15 + aten::_shape_as_tensor: 10 + aten::_slow_conv2d_backward.output_mask: 3 + aten::_slow_conv2d_forward: 33 + aten::_softmax: 63 + aten::_sparse_coo_tensor_unsafe: 4 + aten::_sparse_coo_tensor_with_dims_and_tensors: 5 + aten::_to_copy: 188 + aten::_unsafe_view: 28 + aten::_values: 4 + aten::abs: 1 + aten::abs.out: 1 + aten::adaptive_avg_pool2d: 29 + aten::add.Scalar: 30 + aten::add.Tensor: 72 + aten::add.out: 2 + aten::add_.Scalar: 11 + aten::add_.Tensor: 48 + aten::addmm: 41 + aten::alias: 
14 + aten::all: 8 + aten::allclose: 1 + aten::aminmax: 4 + aten::any: 14 + aten::any.dim: 1 + aten::arange: 10 + aten::arange.start: 26 + aten::arange.start_out: 28 + aten::arange.start_step: 8 + aten::argmax: 2 + aten::as_strided: 188 + aten::as_strided_: 39 + aten::atan: 4 + aten::atleast_1d.Sequence: 2 + aten::atleast_2d.Sequence: 1 + aten::avg_pool2d: 7 + aten::batch_norm: 15 + aten::bernoulli_.float: 2 + aten::binary_cross_entropy: 13 + aten::binary_cross_entropy_backward: 12 + aten::binary_cross_entropy_with_logits: 3 + aten::binary_cross_entropy_with_logits_backward: 2 + aten::bitwise_and.Tensor: 13 + aten::bitwise_and_.Tensor: 1 + aten::bitwise_not: 13 + aten::bitwise_or_.Tensor: 1 + aten::bmm: 18 + aten::broadcast_tensors: 1 + aten::cat: 95 + aten::ceil: 4 + aten::ceil_: 1 + aten::chunk: 20 + aten::clamp: 38 + aten::clamp_: 12 + aten::clamp_min: 73 + aten::clamp_min.out: 74 + aten::clamp_min_: 4 + aten::clone: 134 + aten::coalesce: 2 + aten::conj: 1 + aten::constant_pad_nd: 14 + aten::contiguous: 139 + aten::conv1d: 12 + aten::conv2d: 7 + aten::conv_transpose2d.input: 5 + aten::convolution: 19 + aten::convolution_backward: 3 + aten::copy_: 188 + aten::copy_sparse_to_sparse_: 3 + aten::cos: 4 + aten::count_nonzero: 4 + aten::count_nonzero.dim_IntList: 4 + aten::ctc_loss.Tensor: 1 + aten::cudnn_is_acceptable: 12 + aten::cumsum: 14 + aten::dense_dim: 3 + aten::dequantize.self: 63 + aten::dequantize.tensors: 1 + aten::detach: 49 + aten::div.Scalar: 188 + aten::div.Tensor: 188 + aten::div.Tensor_mode: 8 + aten::div_.Scalar: 27 + aten::div_.Tensor: 34 + aten::dropout: 41 + aten::elu: 2 + aten::embedding: 16 + aten::embedding_backward: 4 + aten::embedding_bag.padding_idx: 2 + aten::embedding_dense_backward: 4 + aten::embedding_sparse_backward: 1 + aten::empty.memory_format: 188 + aten::empty_like: 162 + aten::empty_strided: 188 + aten::eq.Scalar: 25 + aten::eq.Tensor: 188 + aten::exp: 15 + aten::exp_: 3 + aten::expand: 63 + aten::expand_as: 17 + aten::feature_dropout: 1 + aten::fill_.Scalar: 188 + aten::flatten.using_ints: 42 + aten::flip: 1 + aten::floor: 6 + aten::floor_divide: 7 + aten::floor_divide.Scalar: 7 + aten::full: 21 + aten::full_like: 10 + aten::gather: 11 + aten::ge.Scalar: 2 + aten::gelu: 12 + aten::glu: 18 + aten::grid_sampler: 3 + aten::grid_sampler_2d: 3 + aten::gt.Scalar: 16 + aten::hardsigmoid: 3 + aten::hardsigmoid_: 2 + aten::hardswish_: 4 + aten::hardtanh: 3 + aten::hstack: 2 + aten::index.Tensor: 20 + aten::index_add_: 4 + aten::index_fill.int_Scalar: 1 + aten::index_fill_.int_Scalar: 1 + aten::index_put_: 16 + aten::index_select: 28 + aten::index_select_backward: 3 + aten::is_coalesced: 3 + aten::is_floating_point: 8 + aten::isclose: 1 + aten::isfinite: 1 + aten::isnan: 1 + aten::item: 188 + aten::layer_norm: 26 + aten::le.Scalar: 2 + aten::le.Tensor: 1 + aten::leaky_relu: 1 + aten::leaky_relu_: 5 + aten::lerp_.Tensor: 1 + aten::linear: 51 + aten::linspace: 3 + aten::linspace.out: 3 + aten::log: 15 + aten::log10: 4 + aten::log1p: 5 + aten::log_: 3 + aten::log_softmax.int: 28 + aten::logical_and: 1 + aten::logical_and.out: 2 + aten::logical_and_: 1 + aten::logit: 7 + aten::lstm.data: 8 + aten::lstm.input: 4 + aten::lt.Scalar: 8 + aten::lt.Tensor: 1 + aten::masked_fill.Scalar: 3 + aten::masked_fill_.Scalar: 18 + aten::matmul: 31 + aten::max: 27 + aten::max.dim: 31 + aten::max.other: 4 + aten::max_pool2d: 7 + aten::maximum: 4 + aten::mean: 16 + aten::mean.dim: 26 + aten::meshgrid.indexing: 2 + aten::min: 25 + aten::min.dim: 5 + aten::min.other: 4 + aten::minimum: 5 
+ aten::mm: 40 + aten::mul.Scalar: 31 + aten::mul.Tensor: 103 + aten::mul.out: 12 + aten::mul_.Scalar: 11 + aten::mul_.Tensor: 7 + aten::nan_to_num: 3 + aten::nan_to_num.out: 13 + aten::nan_to_num_: 10 + aten::narrow: 188 + aten::native_batch_norm: 15 + aten::native_layer_norm: 26 + aten::native_layer_norm_backward: 1 + aten::ne.Scalar: 15 + aten::ne.Tensor: 6 + aten::neg: 29 + aten::new_empty_strided: 188 + aten::nll_loss: 4 + aten::nll_loss_backward: 4 + aten::nll_loss_forward: 4 + aten::nll_loss_nd: 3 + aten::nonzero: 16 + aten::norm.Scalar: 1 + aten::norm.ScalarOpt_dim: 5 + aten::normal_: 17 + aten::one_hot: 2 + aten::ones: 188 + aten::ones_like: 25 + aten::permute: 44 + aten::pow.Tensor_Scalar: 3 + aten::q_per_channel_scales: 28 + aten::q_per_channel_zero_points: 28 + aten::q_scale: 65 + aten::q_zero_point: 85 + aten::qscheme: 85 + aten::quantile.scalar: 1 + aten::quantize_per_tensor: 84 + aten::quantize_per_tensor.tensor_qparams: 1 + aten::quantized_lstm.data: 2 + aten::quantized_max_pool2d: 3 + aten::rand: 25 + aten::randint.low: 2 + aten::randn_like: 17 + aten::random_.from: 2 + aten::reciprocal: 1 + aten::reflection_pad2d: 1 + aten::relu: 79 + aten::relu_: 4 + aten::remainder.Scalar: 2 + aten::remainder.Tensor: 2 + aten::repeat: 14 + aten::replication_pad2d: 2 + aten::requires_grad_: 2 + aten::reshape: 69 + aten::resize_: 188 + aten::resize_as_: 18 + aten::resolve_conj: 70 + aten::resolve_neg: 1 + aten::result_type.Scalar: 3 + aten::rsub.Scalar: 5 + aten::scalar_tensor: 1 + aten::scatter_.src: 6 + aten::scatter_.value: 2 + aten::scatter_add_: 10 + aten::select.int: 77 + aten::select_backward: 1 + aten::selu: 2 + aten::set_.source_Storage: 186 + aten::set_.source_Storage_storage_offset: 186 + aten::sigmoid: 90 + aten::sigmoid_: 14 + aten::sigmoid_backward: 17 + aten::sin: 4 + aten::slice.Tensor: 188 + aten::slice_backward: 4 + aten::slow_conv_transpose2d: 6 + aten::softmax.int: 63 + aten::softplus: 2 + aten::sort: 20 + aten::sparse_coo_tensor.indices: 1 + aten::sparse_dim: 3 + aten::sparse_resize_and_clear_: 1 + aten::split.Tensor: 20 + aten::sqrt: 1 + aten::squeeze: 13 + aten::squeeze.dim: 38 + aten::squeeze_.dim: 36 + aten::stack: 39 + aten::sub.Scalar: 23 + aten::sub.Tensor: 105 + aten::sub_.Scalar: 1 + aten::sub_.Tensor: 7 + aten::sum: 18 + aten::sum.IntList_out: 29 + aten::sum.dim_IntList: 41 + aten::t: 49 + aten::tanh: 40 + aten::tanh_: 14 + aten::tanh_backward: 5 + aten::tensor_split.indices: 4 + aten::thnn_conv2d: 33 + aten::threshold_backward: 17 + aten::to.device: 35 + aten::to.dtype: 188 + aten::to.dtype_layout: 184 + aten::topk: 10 + aten::transpose.int: 73 + aten::triu: 10 + aten::true_divide.Tensor: 2 + aten::trunc_: 4 + aten::type_as: 6 + aten::unbind.int: 38 + aten::unfold: 14 + aten::uniform_: 25 + aten::unique_consecutive: 2 + aten::unsafe_chunk: 14 + aten::unsafe_split.Tensor: 14 + aten::unsqueeze: 56 + aten::unsqueeze_: 31 + aten::upsample_bilinear2d: 7 + aten::upsample_bilinear2d.vec: 7 + aten::upsample_nearest2d: 31 + aten::upsample_nearest2d.vec: 27 + aten::value_selecting_reduction_backward: 3 + aten::view: 95 + aten::vstack: 1 + aten::where.ScalarOther: 4 + aten::where.self: 15 + aten::zero_: 188 + aten::zeros: 188 + aten::zeros.out: 1 + aten::zeros_like: 6 + prepacked::conv2d_clamp_prepack: 1 + prepacked::conv2d_clamp_run: 32 + prepacked::conv2d_transpose_clamp_run: 1 + prepacked::linear_clamp_run: 26 + quantized::add: 58 + quantized::add_relu: 1 + quantized::batch_norm2d: 1 + quantized::cat: 4 + quantized::conv1d: 1 + quantized::conv2d: 4 + 
quantized::conv2d.new: 55 + quantized::conv2d_prepack: 14 + quantized::conv2d_relu.new: 50 + quantized::conv_prepack: 5 + quantized::conv_transpose2d: 2 + quantized::embedding_byte: 14 + quantized::hardswish: 1 + quantized::instance_norm: 1 + quantized::leaky_relu: 2 + quantized::linear: 27 + quantized::linear_dynamic: 21 + quantized::linear_prepack: 29 + quantized::linear_relu: 2 + quantized::mul: 4 + quantized::mul.Scalar: 1 diff --git a/test/mobile/model_test/nn_ops.py b/test/mobile/model_test/nn_ops.py new file mode 100644 index 000000000000..338359c96408 --- /dev/null +++ b/test/mobile/model_test/nn_ops.py @@ -0,0 +1,427 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +# https://pytorch.org/docs/stable/nn.html +class NNConvolutionModule(torch.nn.Module): + def __init__(self): + super(NNConvolutionModule, self).__init__() + self.input1d = torch.randn(1, 4, 36) + self.input2d = torch.randn(1, 4, 30, 10) + self.input3d = torch.randn(1, 4, 10, 4, 4) + self.module1d = nn.ModuleList( + [ + nn.Conv1d(4, 33, 3), + nn.ConvTranspose1d(4, 33, 3), + nn.Fold(output_size=(5, 10), kernel_size=(2, 2)), + ] + ) + self.module2d = nn.ModuleList( + [ + nn.Conv2d(4, 33, 3), + nn.ConvTranspose2d(4, 33, 3), + nn.Unfold(kernel_size=3), + ] + ) + self.module3d = nn.ModuleList( + [ + nn.Conv3d(4, 33, 2), + nn.ConvTranspose3d(4, 33, 3), + ] + ) + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + [module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNPoolingModule(torch.nn.Module): + def __init__(self): + super(NNPoolingModule, self).__init__() + self.input1d = torch.randn(1, 16, 50) + self.module1d = nn.ModuleList( + [ + nn.MaxPool1d(3, stride=2), + nn.AvgPool1d(3, stride=2), + nn.LPPool1d(2, 3, stride=2), + nn.AdaptiveMaxPool1d(3), + nn.AdaptiveAvgPool1d(3), + ] + ) + + self.input2d = torch.randn(1, 16, 30, 10) + self.module2d = nn.ModuleList( + [ + nn.MaxPool2d((3, 2), stride=(2, 1)), + nn.AvgPool2d((3, 2), stride=(2, 1)), + nn.FractionalMaxPool2d(3, output_ratio=(0.5, 0.5)), + nn.LPPool2d(2, 3, stride=(2, 1)), + nn.AdaptiveMaxPool2d((5, 7)), + nn.AdaptiveAvgPool2d((7)), + ] + ) + + self.input3d = torch.randn(1, 16, 20, 4, 4) + self.module3d = nn.ModuleList( + [ + nn.MaxPool3d(2), + nn.AvgPool3d(2), + nn.FractionalMaxPool3d(2, output_ratio=(0.5, 0.5, 0.5)), + nn.AdaptiveMaxPool3d((5, 7, 9)), + nn.AdaptiveAvgPool3d((5, 7, 9)), + ] + ) + # TODO max_unpool + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + [module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNPaddingModule(torch.nn.Module): + def __init__(self): + super(NNPaddingModule, self).__init__() + self.input1d = torch.randn(1, 4, 50) + self.module1d = nn.ModuleList( + [ + nn.ReflectionPad1d(2), + nn.ReplicationPad1d(2), + nn.ConstantPad1d(2, 3.5), + ] + ) + + self.input2d = torch.randn(1, 4, 30, 10) + self.module2d = nn.ModuleList( + [ + nn.ReflectionPad2d(2), + nn.ReplicationPad2d(2), + nn.ZeroPad2d(2), + nn.ConstantPad2d(2, 3.5), + ] + ) + + self.input3d = torch.randn(1, 4, 10, 4, 4) + self.module3d = nn.ModuleList( + [ + nn.ReflectionPad3d(1), + nn.ReplicationPad3d(3), + nn.ConstantPad3d(3, 3.5), + ] + ) + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + 
[module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNNormalizationModule(torch.nn.Module): + def __init__(self): + super(NNNormalizationModule, self).__init__() + self.input1d = torch.randn(1, 4, 50) + self.module1d = nn.ModuleList( + [ + nn.BatchNorm1d(4), + nn.InstanceNorm1d(4), + ] + ) + + self.input2d = torch.randn(1, 4, 30, 10) + self.module2d = nn.ModuleList( + [ + nn.BatchNorm2d(4), + nn.GroupNorm(4, 4), + nn.InstanceNorm2d(4), + nn.LayerNorm([4, 30, 10]), + nn.LocalResponseNorm(2), + ] + ) + + self.input3d = torch.randn(1, 4, 10, 4, 4) + self.module3d = nn.ModuleList( + [ + nn.BatchNorm3d(4), + nn.InstanceNorm3d(4), + nn.ChannelShuffle(2), + ] + ) + + def forward(self): + return len(( + [module(self.input1d) for i, module in enumerate(self.module1d)], + [module(self.input2d) for i, module in enumerate(self.module2d)], + [module(self.input3d) for i, module in enumerate(self.module3d)], + )) + + +class NNActivationModule(torch.nn.Module): + def __init__(self): + super(NNActivationModule, self).__init__() + self.activations = nn.ModuleList( + [ + nn.ELU(), + nn.Hardshrink(), + nn.Hardsigmoid(), + nn.Hardtanh(), + nn.Hardswish(), + nn.LeakyReLU(), + nn.LogSigmoid(), + # nn.MultiheadAttention(), + nn.PReLU(), + nn.ReLU(), + nn.ReLU6(), + nn.RReLU(), + nn.SELU(), + nn.CELU(), + nn.GELU(), + nn.Sigmoid(), + nn.SiLU(), + nn.Mish(), + nn.Softplus(), + nn.Softshrink(), + nn.Softsign(), + nn.Tanh(), + nn.Tanhshrink(), + # nn.Threshold(0.1, 20), + nn.GLU(), + nn.Softmin(), + nn.Softmax(), + nn.Softmax2d(), + nn.LogSoftmax(), + # nn.AdaptiveLogSoftmaxWithLoss(), + ] + ) + + def forward(self): + input = torch.randn(2, 3, 4) + return len(( + [module(input) for i, module in enumerate(self.activations)], + )) + + +class NNRecurrentModule(torch.nn.Module): + def __init__(self): + super(NNRecurrentModule, self).__init__() + self.rnn = nn.ModuleList( + [ + nn.RNN(4, 8, 2), + nn.RNNCell(4, 8), + ] + ) + self.gru = nn.ModuleList([nn.GRU(4, 8, 2), nn.GRUCell(4, 8)]) + self.lstm = nn.ModuleList( + [ + nn.LSTM(4, 8, 2), + nn.LSTMCell(4, 8), + ] + ) + + def forward(self): + input = torch.randn(5, 3, 4) + h = torch.randn(2, 3, 8) + c = torch.randn(2, 3, 8) + r = self.rnn[0](input, h) + r = self.rnn[1](input[0], h[0]) + r = self.gru[0](input, h) + r = self.gru[1](input[0], h[0]) + r = self.lstm[0](input, (h, c)) + r = self.lstm[1](input[0], (h[0], c[0])) + return len(r) + + +class NNTransformerModule(torch.nn.Module): + def __init__(self): + super(NNTransformerModule, self).__init__() + self.transformers = nn.ModuleList( + [ + nn.Transformer( + d_model=2, nhead=2, num_encoder_layers=1, num_decoder_layers=1 + ), + nn.TransformerEncoder( + nn.TransformerEncoderLayer(d_model=2, nhead=2), num_layers=1 + ), + nn.TransformerDecoder( + nn.TransformerDecoderLayer(d_model=2, nhead=2), num_layers=1 + ), + ] + ) + + def forward(self): + input = torch.rand(1, 16, 2) + tgt = torch.rand((1, 16, 2)) + r = self.transformers[0](input, tgt) + r = self.transformers[1](input) + r = self.transformers[2](input, tgt) + return len(r) + + +class NNLinearModule(torch.nn.Module): + def __init__(self): + super(NNLinearModule, self).__init__() + self.linears = nn.ModuleList( + [ + nn.Identity(54), + nn.Linear(20, 20), + nn.Bilinear(20, 20, 40), + # nn.LazyLinear(20, 30), + ] + ) + + def forward(self): + input = torch.randn(32, 20) + r = self.linears[0](input) + r = self.linears[1](input) + r = self.linears[2](input, input) + return len(r) + 
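Each wrapper module above follows the same pattern: the test inputs are built in __init__ (or inside forward), forward takes no arguments, and the result is collapsed with len() so that scripting the module exercises every covered operator without having to compare numerical outputs. As a rough illustration of how one of these coverage modules might be consumed, the sketch below scripts a module and round-trips it through the lite interpreter, reusing the _save_to_buffer_for_lite_interpreter and _load_for_lite_interpreter calls that appear later in this patch; the driver that generates the checked-in mobile test models is not part of this hunk, so treat this as an assumed usage pattern rather than the actual harness.

import io

import torch
from torch.jit.mobile import _load_for_lite_interpreter

# Script one coverage module and serialize it for the lite interpreter.
scripted = torch.jit.script(NNConvolutionModule())
buffer = io.BytesIO(scripted._save_to_buffer_for_lite_interpreter())
buffer.seek(0)

# Reload on the lite-interpreter side and run both copies; each returns
# the number of sub-module groups (1d/2d/3d) that were executed.
mobile_module = _load_for_lite_interpreter(buffer)
assert scripted() == mobile_module()
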
+ +class NNDropoutModule(torch.nn.Module): + def __init__(self): + super(NNDropoutModule, self).__init__() + + def forward(self): + a = torch.randn(8, 4) + b = torch.randn(8, 4, 4, 4) + c = torch.randn(8, 4, 4, 4, 4) + return len( + F.dropout(a), + F.dropout2d(b), + F.dropout3d(c), + F.alpha_dropout(a), + F.feature_alpha_dropout(c), + ) + + +class NNSparseModule(torch.nn.Module): + def __init__(self): + super(NNSparseModule, self).__init__() + + def forward(self): + input = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + input2 = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9]) + embedding_matrix = torch.rand(10, 3) + offsets = torch.tensor([0, 4]) + return len( + F.embedding(input, embedding_matrix), + F.embedding_bag(input2, embedding_matrix, offsets), + F.one_hot(torch.arange(0, 5) % 3, num_classes=5), + ) + + +class NNDistanceModule(torch.nn.Module): + def __init__(self): + super(NNDistanceModule, self).__init__() + + def forward(self): + a = torch.randn(8, 4) + b = torch.randn(8, 4) + return len( + F.pairwise_distance(a, b), + F.cosine_similarity(a, b), + F.pdist(a), + ) + + +class NNLossFunctionModule(torch.nn.Module): + def __init__(self): + super(NNLossFunctionModule, self).__init__() + self.x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]]) + self.y = torch.LongTensor([[3, 0, -1, 1]]) + + def forward(self): + a = torch.randn(3, 2) + b = torch.rand(3, 2) + c = torch.rand(3) + log_probs = torch.randn(50, 16, 20).log_softmax(2).detach() + targets = torch.randint(1, 20, (16, 30), dtype=torch.long) + input_lengths = torch.full((16,), 50, dtype=torch.long) + target_lengths = torch.randint(10, 30, (16,), dtype=torch.long) + return len( + F.binary_cross_entropy(torch.sigmoid(a), b), + F.binary_cross_entropy_with_logits(torch.sigmoid(a), b), + F.poisson_nll_loss(a, b), + F.cosine_embedding_loss(a, b, c), + F.cross_entropy(a, b), + F.ctc_loss(log_probs, targets, input_lengths, target_lengths), + # F.gaussian_nll_loss(a, b, torch.ones(5, 1)), # ENTER is not supported in mobile module + F.hinge_embedding_loss(a, b), + F.kl_div(a, b), + F.l1_loss(a, b), + F.mse_loss(a, b), + F.margin_ranking_loss(c, c, c), + F.multilabel_margin_loss(self.x, self.y), + F.multilabel_soft_margin_loss(self.x, self.y), + F.multi_margin_loss(self.x, torch.tensor([3])), + F.nll_loss(a, torch.tensor([1, 0, 1])), + F.huber_loss(a, b), + F.smooth_l1_loss(a, b), + F.soft_margin_loss(a, b), + F.triplet_margin_loss(a, b, -b), + # F.triplet_margin_with_distance_loss(a, b, -b), # can't take variable number of arguments + ) + + +class NNVisionModule(torch.nn.Module): + def __init__(self): + super(NNVisionModule, self).__init__() + self.input = torch.randn(1, 4, 9, 9) + self.vision_modules = nn.ModuleList( + [ + nn.PixelShuffle(2), + nn.PixelUnshuffle(3), + nn.Upsample(scale_factor=2, mode="nearest"), + nn.Upsample(scale_factor=2, mode="bilinear"), + nn.Upsample(scale_factor=2, mode="bicubic"), + nn.UpsamplingNearest2d(scale_factor=2), + nn.UpsamplingBilinear2d(scale_factor=2), + ] + ) + self.linear_sample = nn.Upsample(scale_factor=2, mode="linear") + self.trilinear_sample = nn.Upsample(scale_factor=2, mode="trilinear") + + def forward(self): + input = torch.randn(1, 3, 16, 16) + for i, module in enumerate(self.vision_modules): + r = module(self.input) + return len( + r, + self.linear_sample(torch.randn(4, 9, 9)), + self.trilinear_sample(torch.randn(1, 3, 4, 9, 9)), + F.grid_sample(input, torch.ones(1, 4, 4, 2)), + ) + + +class NNShuffleModule(torch.nn.Module): + def __init__(self): + super(NNShuffleModule, self).__init__() + self.shuffle = 
nn.ChannelShuffle(2) + + def forward(self): + return len(self.shuffle(torch.randn(1, 4, 2, 2)),) + + +class NNUtilsModule(torch.nn.Module): + def __init__(self): + super(NNUtilsModule, self).__init__() + self.flatten = nn.Sequential( + nn.Linear(50, 50), + nn.Unflatten(1, (2, 5, 5)) + ) + + def forward(self): + a = [torch.tensor([1, 2, 3]), torch.tensor([3, 4])] + b = nn.utils.rnn.pad_sequence(a, batch_first=True) + # c = nn.utils.rnn.pack_padded_sequence(b, batch_first=True, lengths=torch.tensor([3, 2])) + input = torch.randn(2, 50) + return len( + self.flatten(input), + b, + ) diff --git a/test/mobile/model_test/quantization_ops.py b/test/mobile/model_test/quantization_ops.py new file mode 100644 index 000000000000..d0fdb346545e --- /dev/null +++ b/test/mobile/model_test/quantization_ops.py @@ -0,0 +1,227 @@ +import torch +import torch.nn as nn + + +class GeneralQuantModule(torch.nn.Module): + def __init__(self): + super(GeneralQuantModule, self).__init__() + self.embedding = torch.nn.quantized.Embedding( + num_embeddings=10, embedding_dim=12 + ) + self.embedding_input = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8]) + self.func = torch.nn.quantized.QFunctional() + self.conv1 = torch.nn.quantized.ConvTranspose1d(16, 33, 3, stride=2) + self.conv2 = torch.nn.quantized.ConvTranspose2d(16, 33, 3, stride=2) + self.conv3 = torch.nn.quantized.ConvTranspose3d(16, 33, 3, stride=2) + + def forward(self): + a = torch.quantize_per_tensor(torch.tensor([3.0]), 1.0, 0, torch.qint32) + b = torch.quantize_per_tensor(torch.tensor(4.0), 1.0, 0, torch.qint32) + c = torch.quantize_per_tensor( + torch.tensor([3.0]), torch.tensor(1.0), torch.tensor(0), torch.qint32 + ) + input1 = torch.randn(1, 16, 4) + input2 = torch.randn(1, 16, 4, 4) + input3 = torch.randn(1, 16, 4, 4, 4) + return len( + self.func.add(a, b), + self.func.cat((a, a), 0), + self.func.mul(a, b), + self.func.add_relu(a, b), + self.func.add_scalar(a, b), + self.func.mul_scalar(a, b), + self.embedding(self.embedding_input), + self.conv1( + torch.quantize_per_tensor( + input1, scale=1.0, zero_point=0, dtype=torch.quint8 + ) + ), + self.conv2( + torch.quantize_per_tensor( + input2, scale=1.0, zero_point=0, dtype=torch.quint8 + ) + ), + c, + # self.conv3(torch.quantize_per_tensor(input3, scale=1.0, zero_point=0, dtype=torch.quint8)), # failed on iOS + ) + + +class DynamicQuantModule: + def __init__(self): + super(DynamicQuantModule, self).__init__() + self.module = self.M() + + def getModule(self): + return torch.quantization.quantize_dynamic(self.module, dtype=torch.qint8) + + class M(torch.nn.Module): + def __init__(self): + super(DynamicQuantModule.M, self).__init__() + self.rnn = nn.RNN(4, 8, 2) + self.rnncell = nn.RNNCell(4, 8) + self.gru = nn.GRU(4, 8, 2) + self.grucell = nn.GRUCell(4, 8) + self.lstm = nn.LSTM(4, 8, 2) + self.lstmcell = nn.LSTMCell(4, 8) + self.linears = nn.ModuleList( + [ + nn.Identity(54), + nn.Linear(20, 20), + nn.Bilinear(20, 20, 40), + ] + ) + self.transformers = nn.ModuleList( + [ + nn.Transformer( + d_model=2, nhead=2, num_encoder_layers=1, num_decoder_layers=1 + ), + nn.TransformerEncoder( + nn.TransformerEncoderLayer(d_model=2, nhead=2), num_layers=1 + ), + nn.TransformerDecoder( + nn.TransformerDecoderLayer(d_model=2, nhead=2), num_layers=1 + ), + ] + ) + # self.a = torch.nn.utils.rnn.pad_sequence([torch.tensor([1,2,3]), torch.tensor([3,4])], batch_first=True) + + def forward(self): + input = torch.randn(5, 3, 4) + h = torch.randn(2, 3, 8) + c = torch.randn(2, 3, 8) + linear_input = torch.randn(32, 20) + trans_input = 
torch.randn(1, 16, 2) + tgt = torch.rand(1, 16, 2) + + return len(( + self.rnn(input, h), + self.rnncell(input[0], h[0]), + self.gru(input, h), + self.grucell(input[0], h[0]), + self.lstm(input, (h, c)), + # self.lstm(torch.nn.utils.rnn.pack_padded_sequence(self.a, lengths=torch.tensor([3,2,1])), (h, c)), + self.lstmcell(input[0], (h[0], c[0])), + self.transformers[0](trans_input, tgt), + self.transformers[1](trans_input), + self.transformers[2](trans_input, tgt), + self.linears[0](linear_input), + self.linears[1](linear_input), + self.linears[2](linear_input, linear_input), + )) + + +class StaticQuantModule: + def __init__(self): + super(StaticQuantModule, self).__init__() + + def getModule(self): + model_fp32 = self.M() + model_fp32.eval() + model_fp32.qconfig = torch.quantization.get_default_qconfig("qnnpack") + model_fp32_prepared = torch.quantization.prepare(model_fp32) + model_int8 = torch.quantization.convert(model_fp32_prepared) + return model_int8 + + class M(torch.nn.Module): + def __init__(self): + super(StaticQuantModule.M, self).__init__() + self.quant = torch.quantization.QuantStub() + self.input1d = torch.randn(4, 2, 2) + self.input2d = torch.randn((4, 2, 4, 4)) + self.input3d = torch.randn(4, 2, 2, 4, 4) + self.linear_input = torch.randn(32, 20) + + self.layer1 = nn.Sequential( + nn.Conv1d(2, 2, 1), nn.InstanceNorm1d(1), nn.Hardswish() + ) + self.layer2 = nn.Sequential( + nn.Conv2d(2, 2, 1), + nn.BatchNorm2d(2), + nn.InstanceNorm2d(1), + nn.LeakyReLU(), + ) + self.layer3 = nn.Sequential( + nn.Conv3d(2, 2, 1), nn.BatchNorm3d(2), nn.InstanceNorm3d(1), nn.ReLU() + ) + self.layer4 = nn.Sequential(nn.Linear(4, 3)) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self): + x = self.quant(self.input1d) + x = self.layer1(x) + x = self.dequant(x) + + y = self.input2d + y = self.quant(y) + y = self.layer2(y) + y = self.layer4(y) + y = self.dequant(y) + + z = self.quant(self.input3d) + z = self.layer3(z) + z = self.dequant(z) + + return (x, y, z) + + +class FusedQuantModule: + def __init__(self): + super(FusedQuantModule, self).__init__() + + def getModule(self): + model_fp32 = self.M() + model_fp32.eval() + model_fp32.qconfig = torch.quantization.get_default_qconfig("qnnpack") + model_fp32_fused = torch.quantization.fuse_modules( + model_fp32, + [ + ["conv1d", "relu1"], + ["conv2d", "relu2"], + ["conv3d", "relu3"], + ["linear", "relu4"], + ], + ) + model_fp32_prepared = torch.quantization.prepare(model_fp32_fused) + model_int8 = torch.quantization.convert(model_fp32_prepared) + return model_int8 + + class M(torch.nn.Module): + def __init__(self): + super(FusedQuantModule.M, self).__init__() + self.quant = torch.quantization.QuantStub() + self.input1d = torch.randn(4, 2, 2) + self.input2d = torch.randn((4, 2, 4, 4)) + self.input3d = torch.randn(4, 2, 2, 4, 4) + self.conv1d = nn.Conv1d(2, 2, 1) + self.conv2d = nn.Conv2d(2, 2, 1) + self.conv3d = nn.Conv3d(2, 2, 1) + self.linear = nn.Linear(4, 2) + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + self.relu4 = nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self): + x = self.input1d + y = self.input2d + z = self.input3d + + x = self.quant(x) + x = self.conv1d(x) + x = self.relu1(x) + x = self.dequant(x) + + y = self.quant(y) + y = self.conv2d(y) + y = self.relu2(y) + y = self.dequant(y) + + z = self.quant(z) + z = self.conv3d(z) + z = self.relu3(z) + z = self.linear(z) + z = self.relu4(z) + z = self.dequant(z) + + return (x, y, z) diff --git 
a/test/mobile/model_test/sampling_ops.py b/test/mobile/model_test/sampling_ops.py new file mode 100644 index 000000000000..a1ac71a3a319 --- /dev/null +++ b/test/mobile/model_test/sampling_ops.py @@ -0,0 +1,37 @@ +import torch + + +# https://pytorch.org/docs/stable/torch.html#random-sampling + +class SamplingOpsModule(torch.nn.Module): + def __init__(self): + super(SamplingOpsModule, self).__init__() + + def forward(self): + a = torch.empty(3, 3).uniform_(0.0, 1.0) + size = (1, 4) + weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) + return len( + # torch.seed(), + # torch.manual_seed(0), + torch.bernoulli(a), + # torch.initial_seed(), + torch.multinomial(weights, 2), + torch.normal(2.0, 3.0, size), + torch.poisson(a), + torch.rand(2, 3), + torch.rand_like(a), + torch.randint(10, size), + torch.randint_like(a, 4), + torch.rand(4), + torch.randn_like(a), + torch.randperm(4), + a.bernoulli_(), + a.cauchy_(), + a.exponential_(), + a.geometric_(0.5), + a.log_normal_(), + a.normal_(), + a.random_(), + a.uniform_(), + ) diff --git a/test/mobile/model_test/tensor_ops.py b/test/mobile/model_test/tensor_ops.py new file mode 100644 index 000000000000..9e04c6703d27 --- /dev/null +++ b/test/mobile/model_test/tensor_ops.py @@ -0,0 +1,279 @@ +import torch + + +class TensorOpsModule(torch.nn.Module): + def __init__(self): + super(TensorOpsModule, self).__init__() + + def forward(self): + return self.tensor_general_ops() + + def tensor_general_ops(self): + a = torch.randn(4) + b = torch.tensor([1.5]) + x = torch.ones((2,)) + c = torch.randn(4, dtype=torch.cfloat) + w = torch.rand(4, 4, 4, 4) + v = torch.rand(4, 4, 4, 4) + return len( + # torch.is_tensor(a), + # torch.is_storage(a), + torch.is_complex(a), + torch.is_conj(a), + torch.is_floating_point(a), + torch.is_nonzero(b), + # torch.set_default_dtype(torch.float32), + # torch.get_default_dtype(), + # torch.set_default_tensor_type(torch.DoubleTensor), + torch.numel(a), + # torch.set_printoptions(), + # torch.set_flush_denormal(False), + # https://pytorch.org/docs/stable/tensors.html#tensor-class-reference + # x.new_tensor([[0, 1], [2, 3]]), + x.new_full((3, 4), 3.141592), + x.new_empty((2, 3)), + x.new_ones((2, 3)), + x.new_zeros((2, 3)), + x.is_cuda, + x.is_quantized, + x.is_meta, + x.device, + x.dim(), + c.real, + c.imag, + # x.backward(), + x.clone(), + w.contiguous(), + w.contiguous(memory_format=torch.channels_last), + w.copy_(v), + w.copy_(1), + w.copy_(0.5), + x.cpu(), + # x.cuda(), + # x.data_ptr(), + x.dense_dim(), + w.fill_diagonal_(0), + w.element_size(), + w.exponential_(), + w.fill_(0), + w.geometric_(0.5), + a.index_fill(0, torch.tensor([0, 2]), 1), + a.index_put_([torch.argmax(a)], torch.tensor(1.0)), + a.index_put([torch.argmax(a)], torch.tensor(1.0)), + w.is_contiguous(), + c.is_complex(), + w.is_conj(), + w.is_floating_point(), + w.is_leaf, + w.is_pinned(), + w.is_set_to(w), + # w.is_shared, + w.is_coalesced(), + w.coalesce(), + w.is_signed(), + w.is_sparse, + torch.tensor([1]).item(), + x.log_normal_(), + # x.masked_scatter_(), + # x.masked_scatter(), + # w.normal(), + w.numel(), + # w.pin_memory(), + # w.put_(0, torch.tensor([0, 1], w)), + x.repeat(4, 2), + a.clamp_(0), + a.clamp(0), + a.clamp_min(0), + a.hardsigmoid_(), + a.hardsigmoid(), + a.hardswish_(), + a.hardswish(), + a.hardtanh_(), + a.hardtanh(), + a.leaky_relu_(), + a.leaky_relu(), + a.relu_(), + a.relu(), + a.resize_as_(a), + a.type_as(a), + a._shape_as_tensor(), + a.requires_grad_(False), + ) + + +class TensorCreationOpsModule(torch.nn.Module): + def 
__init__(self): + super(TensorCreationOpsModule, self).__init__() + + def forward(self): + return self.tensor_creation_ops() + + def tensor_creation_ops(self): + i = torch.tensor([[0, 1, 1], [2, 0, 2]]) + v = torch.tensor([3, 4, 5], dtype=torch.float32) + real = torch.tensor([1, 2], dtype=torch.float32) + imag = torch.tensor([3, 4], dtype=torch.float32) + inp = torch.tensor([-1.5, 0.0, 2.0]) + values = torch.tensor([0.5]) + quantized = torch.quantize_per_channel( + torch.tensor([[-1.0, 0.0], [1.0, 2.0]]), + torch.tensor([0.1, 0.01]), + torch.tensor([10, 0]), + 0, + torch.quint8, + ) + return len( + torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]), + # torch.sparse_coo_tensor(i, v, [2, 3]), # not work for iOS + torch.as_tensor([1, 2, 3]), + torch.as_strided(torch.randn(3, 3), (2, 2), (1, 2)), + torch.zeros(2, 3), + torch.zeros((2, 3)), + torch.zeros([2, 3], out=i), + torch.zeros(5), + torch.zeros_like(torch.empty(2, 3)), + torch.ones(2, 3), + torch.ones((2, 3)), + torch.ones([2, 3]), + torch.ones(5), + torch.ones_like(torch.empty(2, 3)), + torch.arange(5), + torch.arange(1, 4), + torch.arange(1, 2.5, 0.5), + torch.range(1, 4), + torch.range(1, 4, 0.5), + torch.linspace(3.0, 3.0, steps=1), + torch.logspace(start=2, end=2, steps=1, base=2.0), + torch.eye(3), + torch.empty(2, 3), + torch.empty_like(torch.empty(2, 3), dtype=torch.int64), + torch.empty_strided((2, 3), (1, 2)), + torch.full((2, 3), 3.141592), + torch.full_like(torch.full((2, 3), 3.141592), 2.71828), + torch.quantize_per_tensor( + torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8 + ), + torch.dequantize(quantized), + torch.complex(real, imag), + torch.polar(real, imag), + torch.heaviside(inp, values), + ) + + +class TensorIndexingOpsModule(torch.nn.Module): + def __init__(self): + super(TensorIndexingOpsModule, self).__init__() + + def forward(self): + return self.tensor_indexing_ops() + + def tensor_indexing_ops(self): + x = torch.randn(2, 4) + y = torch.randn(4, 4) + t = torch.tensor([[0, 0], [1, 0]]) + mask = x.ge(0.5) + i = [0, 1] + return len( + torch.cat((x, x, x), 0), + torch.concat((x, x, x), 0), + torch.conj(x), + torch.chunk(x, 2), + torch.dsplit(torch.randn(2, 2, 4), i), + torch.column_stack((x, x)), + torch.dstack((x, x)), + torch.gather(x, 0, t), + torch.hsplit(x, i), + torch.hstack((x, x)), + torch.index_select(x, 0, torch.tensor([0, 1])), + x.index(t), + torch.masked_select(x, mask), + torch.movedim(x, 1, 0), + torch.moveaxis(x, 1, 0), + torch.narrow(x, 0, 0, 2), + torch.nonzero(x), + torch.permute(x, (0, 1)), + torch.reshape(x, (-1,)), + torch.row_stack((x, x)), + torch.select(x, 0, 0), + torch.scatter(x, 0, t, x), + x.scatter(0, t, x.clone()), + torch.diagonal_scatter(y, torch.ones(4)), + torch.select_scatter(y, torch.ones(4), 0, 0), + torch.slice_scatter(x, x), + torch.scatter_add(x, 0, t, x), + x.scatter_(0, t, y), + x.scatter_add_(0, t, y), + # torch.scatter_reduce(x, 0, t, reduce="sum"), + torch.split(x, 1), + torch.squeeze(x, 0), + torch.stack([x, x]), + torch.swapaxes(x, 0, 1), + torch.swapdims(x, 0, 1), + torch.t(x), + torch.take(x, t), + torch.take_along_dim(x, torch.argmax(x)), + torch.tensor_split(x, 1), + torch.tensor_split(x, [0, 1]), + torch.tile(x, (2, 2)), + torch.transpose(x, 0, 1), + torch.unbind(x), + torch.unsqueeze(x, -1), + torch.vsplit(x, i), + torch.vstack((x, x)), + torch.where(x), + torch.where(t > 0, t, 0), + torch.where(t > 0, t, t), + ) + + +class TensorTypingOpsModule(torch.nn.Module): + def __init__(self): + super(TensorTypingOpsModule, self).__init__() + + def 
forward(self): + return self.tensor_typing_ops() + + def tensor_typing_ops(self): + x = torch.randn(1, 3, 4, 4) + return len( + x.to(torch.float), + x.to(torch.double), + x.to(torch.cfloat), + x.to(torch.cdouble), + x.to(torch.half), + x.to(torch.bfloat16), + x.to(torch.uint8), + x.to(torch.int8), + x.to(torch.short), + x.to(torch.int), + x.to(torch.long), + x.to(torch.bool), + x.to(torch.device("cpu")), + x.to(device="cpu", dtype=torch.float), + x.to(memory_format=torch.channels_last), + ) + + +class TensorViewOpsModule(torch.nn.Module): + def __init__(self): + super(TensorViewOpsModule, self).__init__() + + def forward(self): + return self.tensor_view_ops() + + def tensor_view_ops(self): + x = torch.randn(4, 4, 1) + y = torch.randn(4, 4, 2) + return len( + x[0, 2:], + x.detach(), + x.detach_(), + x.diagonal(), + x.expand(-1, -1, 3), + x.expand_as(y), + x.select(0, 1), + x.unflatten(1, (2, 2)), + x.unfold(1, 2, 2), + x.view(16), + x.view_as(torch.randn(16)), + ) diff --git a/test/mobile/model_test/torchvision_models.py b/test/mobile/model_test/torchvision_models.py new file mode 100644 index 000000000000..232afbc54b1e --- /dev/null +++ b/test/mobile/model_test/torchvision_models.py @@ -0,0 +1,24 @@ +import torch +import torchvision +from torch.utils.bundled_inputs import augment_model_with_bundled_inputs +from torch.utils.mobile_optimizer import optimize_for_mobile + + +class MobileNetV2Module: + def __init__(self): + super(MobileNetV2Module, self).__init__() + + def getModule(self): + model = torchvision.models.mobilenet_v2(pretrained=True) + model.eval() + example = torch.zeros(1, 3, 224, 224) + traced_script_module = torch.jit.trace(model, example) + optimized_module = optimize_for_mobile(traced_script_module) + augment_model_with_bundled_inputs( + optimized_module, + [ + (example, ), + ], + ) + optimized_module(example) + return optimized_module diff --git a/test/mobile/model_test/update_production_ops.py b/test/mobile/model_test/update_production_ops.py new file mode 100644 index 000000000000..6bb685e6296d --- /dev/null +++ b/test/mobile/model_test/update_production_ops.py @@ -0,0 +1,35 @@ +""" +This is a script to aggregate production ops from xplat/pytorch_models/build/all_mobile_model_configs.yaml. +Specify the file path in the first argument. The results will be dump to model_ops.yaml +""" + +import sys +import yaml + +root_operators = {} +traced_operators = {} +kernel_metadata = {} + +with open(sys.argv[1]) as input_yaml_file: + model_infos = yaml.safe_load(input_yaml_file) + for info in model_infos: + for op in info["root_operators"]: + # aggregate occurance per op + root_operators[op] = 1 + (root_operators[op] if op in root_operators else 0) + for op in info["traced_operators"]: + # aggregate occurance per op + traced_operators[op] = 1 + (traced_operators[op] if op in traced_operators else 0) + # merge dtypes for each kernel + for kernal, dtypes in info["kernel_metadata"].items(): + new_dtypes = dtypes + (kernel_metadata[kernal] if kernal in kernel_metadata else []) + kernel_metadata[kernal] = list(set(new_dtypes)) + + +# Only test these built-in ops. No custom ops or non-CPU ops. 
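The all_mobile_model_configs.yaml referenced in the docstring lives outside this repository, so its contents are not part of the patch; the hedged example below only illustrates the entry shape the aggregation loop above expects and the per-model occurrence counts it produces (model entries and op names are invented for illustration). The namespace filter and YAML dump that follow then reduce the result to the built-in ops recorded in model_ops.yaml.

# Hypothetical input entries in the shape consumed by the loop above:
example_model_infos = [
    {
        "root_operators": ["aten::add.Tensor", "aten::mm"],
        "traced_operators": ["aten::add.Tensor", "prim::TupleConstruct"],
        "kernel_metadata": {"add_kernel": ["Float"]},
    },
    {
        "root_operators": ["aten::add.Tensor", "custom::my_op"],
        "traced_operators": ["aten::add.Tensor"],
        "kernel_metadata": {"add_kernel": ["Int"]},
    },
]
# Aggregating these two entries would yield
#   root_operators == {"aten::add.Tensor": 2, "aten::mm": 1, "custom::my_op": 1}
#   kernel_metadata == {"add_kernel": ["Float", "Int"]}
# and the namespace filter below drops "custom::my_op" before the dump.
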
+namespaces = ["aten", "prepacked", "prim", "quantized"] +root_operators = {x: root_operators[x] for x in root_operators if x.split("::")[0] in namespaces} +traced_operators = {x: traced_operators[x] for x in traced_operators if x.split("::")[0] in namespaces} + +out_path = "test/mobile/model_test/model_ops.yaml" +with open(out_path, "w") as f: + yaml.safe_dump({"root_operators": root_operators}, f) diff --git a/test/mobile/nnc/test_nnc_backend.cpp b/test/mobile/nnc/test_nnc_backend.cpp index f7adcb62459f..35bf60f2cca7 100644 --- a/test/mobile/nnc/test_nnc_backend.cpp +++ b/test/mobile/nnc/test_nnc_backend.cpp @@ -23,7 +23,9 @@ c10::Dict create_compile_spec( const std::string& method_name, const std::string& model_name, const std::string& input_shapes, - const std::string& input_types) { + const std::string& input_types, + const std::string& memory_formats, + const std::string& dynamic_sizes) { c10::Dict method_spec( c10::StringType::get(), c10::AnyType::get()); @@ -33,6 +35,8 @@ c10::Dict create_compile_spec( method_spec.insert("model_version", "v1"); method_spec.insert("asmfile", "fake_nnc_model.s"); method_spec.insert("arch", "x86-64"); + method_spec.insert("memory_formats", memory_formats); + method_spec.insert("dynamic_sizes", dynamic_sizes); c10::Dict compile_spec( c10::StringType::get(), c10::AnyType::get()); @@ -63,7 +67,7 @@ REGISTER_NNC_KERNEL( TEST(NNCBackendTest, AOTCompileThenExecute) { torch::jit::Module m("m"); - auto param = torch::ones({}); + auto param = torch::ones({1}); m.register_parameter("param", param, false); m.define(R"( def forward(self, input): @@ -77,7 +81,7 @@ TEST(NNCBackendTest, AOTCompileThenExecute) { // Compile the model with NNC. auto compile_spec = create_compile_spec( - "forward", "_add_kernel_nnc_fake_model", "4,4", "float"); + "forward", "_add_kernel_nnc_fake_model", "4,4", "float", "", ""); auto any_dict_ty = c10::DictType::create(c10::StringType::get(), c10::AnyType::get()); auto frozen_m = torch::jit::freeze_module(m.clone()); diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py index 90abdab4ceea..638ac37eb88b 100644 --- a/test/mobile/test_lite_script_module.py +++ b/test/mobile/test_lite_script_module.py @@ -522,6 +522,49 @@ def forward(self, x): input = torch.randn(4, 1, 4, 4) self._compare_script_and_mobile(model=model_int8, input=input) + def test_bundled_input_with_dynamic_type(self): + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward( + self, + x: Dict[int, torch.Tensor], + y: Dict[int, torch.Tensor], + z: Dict[int, torch.Tensor], + ): + return x + + model = Model() + script_module = torch.jit.script(model) + + sample_input = { + script_module.forward: [ + ( + {0: torch.ones(1)}, + {1: torch.ones(1)}, + {2: torch.ones(1)}, + ) + ] + } + + bundled_model = torch.utils.bundled_inputs.bundle_inputs( + script_module, sample_input + ) + + buf = bundled_model._save_to_buffer_for_lite_interpreter() + mobile_module = _load_for_lite_interpreter(io.BytesIO(buf)) + + i = mobile_module.run_method("get_all_bundled_inputs") + + self.assertEqual( + i[0], + ( + {0: torch.ones(1)}, + {1: torch.ones(1)}, + {2: torch.ones(1)}, + ), + ) if __name__ == '__main__': run_tests() diff --git a/test/mobile/test_lite_script_type.py b/test/mobile/test_lite_script_type.py index 53db16ce9031..67d6ac859683 100644 --- a/test/mobile/test_lite_script_type.py +++ b/test/mobile/test_lite_script_type.py @@ -3,7 +3,7 @@ import torch import torch.utils.bundled_inputs import io -from typing 
import List, NamedTuple +from typing import Dict, List, NamedTuple from torch.jit.mobile import _load_for_lite_interpreter from torch.testing._internal.common_utils import TestCase, run_tests @@ -33,6 +33,69 @@ def forward(self, a: torch.Tensor): mobile_module_result ) + + def test_typing_dict_with_namedtuple(self): + class Foo(NamedTuple): + id: torch.Tensor + + class Bar(torch.nn.Module): + def __init__(self): + super(Bar, self).__init__() + self.foo = Foo(torch.tensor(1)) + + def forward(self, a: torch.Tensor): + self.foo = Foo(a) + re: Dict[str, Foo] = dict() + re["test"] = Foo(a) + return self.foo, re["test"] + + # The corresponding bytecode is + # (8, + # ('__torch__.___torch_mangle_2.Bar.forward', + # (('instructions', + # (('STOREN', 1, 2), + # ('DROPR', 1, 0), + # ('DICT_CONSTRUCT', 0, 0), + # ('STORE', 3, 0), + # ('LOAD', 3, 0), + # ('LOADC', 1, 0), + # ('MOVE', 2, 0), + # ('NAMED_TUPLE_CONSTRUCT', 1, 1), + # ('OP', 0, 0), + # ('MOVE', 3, 0), + # ('LOADC', 1, 0), + # ('DICT_INDEX', 0, 0), + # ('LOADC', 0, 0), + # ('TUPLE_INDEX', 0, 0), + # ('RET', 0, 0))), + # ('operators', (('aten::_set_item', 'str', 3),)), + # ('constants', (0, 'test')), + # ('types', + # ('Dict[str,__torch__.Foo[NamedTuple, [[id, Tensor]]]]', + # '__torch__.Foo[NamedTuple, [[id, Tensor]]]')), + # ('register_size', 3)), + # (('arguments', + # ((('name', 'self'), + # ('type', '__torch__.___torch_mangle_2.Bar'), + # ('default_value', None)), + # (('name', 'a'), ('type', 'Tensor'), ('default_value', None)))), + # ('returns', + # ((('name', ''), ('type', 'Tensor'), ('default_value', None)),))))) + + sample_input = torch.tensor(5) + script_module = torch.jit.script(Bar()) + + script_module_result = script_module(sample_input) + + buffer_mobile = io.BytesIO(script_module._save_to_buffer_for_lite_interpreter()) + buffer_mobile.seek(0) + mobile_module = _load_for_lite_interpreter(buffer_mobile) + mobile_module_result = mobile_module(sample_input) + torch.testing.assert_allclose( + script_module_result, + mobile_module_result + ) + def test_typing_namedtuple_custom_classtype(self): class Foo(NamedTuple): id: torch.Tensor diff --git a/test/mobile/test_upgrader_codegen.py b/test/mobile/test_upgrader_codegen.py index 5a09ad8a877d..5ccf9a020a5b 100644 --- a/test/mobile/test_upgrader_codegen.py +++ b/test/mobile/test_upgrader_codegen.py @@ -2,7 +2,7 @@ from torch.testing._internal.common_utils import TestCase, run_tests -from tools.codegen.operator_versions.gen_mobile_upgraders import ( +from torchgen.operator_versions.gen_mobile_upgraders import ( sort_upgrader, write_cpp, ) diff --git a/test/onnx/autograd_helper.py b/test/onnx/autograd_helper.py new file mode 100644 index 000000000000..4a3a3eca3844 --- /dev/null +++ b/test/onnx/autograd_helper.py @@ -0,0 +1,19 @@ +# Owner(s): ["module: onnx"] + +import torch + + +# Autograd funtion that is a replica of the autograd funtion in +# test_utility_funs.py (test_autograd_module_name) +class CustomFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + return input.clamp(min=0) + + @staticmethod + def backward(ctx, grad_output): + (input,) = ctx.saved_tensors + grad_input = grad_output.clone() + grad_input[input < 0] = 0 + return grad_input diff --git a/test/onnx/debug_embed_params.py b/test/onnx/debug_embed_params.py index 8499a1d8d216..7fe40a5906dc 100644 --- a/test/onnx/debug_embed_params.py +++ b/test/onnx/debug_embed_params.py @@ -1,13 +1,12 @@ import sys -import torch -import torch.jit -from torch.autograd import 
Variable - import onnx -import caffe2.python.onnx.backend as c2 from test_pytorch_common import flatten +import caffe2.python.onnx.backend as c2 +import torch +import torch.jit +from torch.autograd import Variable torch.set_default_tensor_type("torch.FloatTensor") try: diff --git a/test/onnx/expect/TestOperators.test_acos.expect b/test/onnx/expect/TestOperators.test_acos.expect index 0d978b1e3687..40fc61e29b7f 100644 --- a/test/onnx/expect/TestOperators.test_acos.expect +++ b/test/onnx/expect/TestOperators.test_acos.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Acos_0" op_type: "Acos" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Acos_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_broadcast.expect b/test/onnx/expect/TestOperators.test_add_broadcast.expect index 455df1b03a27..569b2400df88 100644 --- a/test/onnx/expect/TestOperators.test_add_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect index c1dd2341ba8f..ffa632ca475b 100644 --- a/test/onnx/expect/TestOperators.test_add_left_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_left_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect index 9219269bc5e2..9917880a8a22 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect index 455df1b03a27..569b2400df88 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect 
b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect index 45cd1d21faf4..96d2dca59325 100644 --- a/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect +++ b/test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_addconstant.expect b/test/onnx/expect/TestOperators.test_addconstant.expect index 684f08eaf9be..0e1570eb62da 100644 --- a/test/onnx/expect/TestOperators.test_addconstant.expect +++ b/test/onnx/expect/TestOperators.test_addconstant.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_addmm.expect b/test/onnx/expect/TestOperators.test_addmm.expect index f700dcd41ed3..1ef0a81e2a90 100644 --- a/test/onnx/expect/TestOperators.test_addmm.expect +++ b/test/onnx/expect/TestOperators.test_addmm.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -38,7 +38,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Gemm_0" type { @@ -102,5 +102,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_arange_dynamic.expect b/test/onnx/expect/TestOperators.test_arange_dynamic.expect index 6de30ee46c49..09d75955ca26 100644 --- a/test/onnx/expect/TestOperators.test_arange_dynamic.expect +++ b/test/onnx/expect/TestOperators.test_arange_dynamic.expect @@ -16,7 +16,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { diff --git a/test/onnx/expect/TestOperators.test_argmax.expect b/test/onnx/expect/TestOperators.test_argmax.expect index 36822ef07ff7..38add716ff36 100644 --- a/test/onnx/expect/TestOperators.test_argmax.expect +++ b/test/onnx/expect/TestOperators.test_argmax.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,8 +17,13 @@ graph { i: 0 type: INT } + attribute { + name: "select_last_index" + i: 0 + type: INT + } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ArgMax_0" type { @@ -50,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_asin.expect b/test/onnx/expect/TestOperators.test_asin.expect index aa584d51c854..f5a44b850eb1 100644 --- a/test/onnx/expect/TestOperators.test_asin.expect +++ b/test/onnx/expect/TestOperators.test_asin.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Asin_0" op_type: "Asin" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Asin_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_at_op.expect 
b/test/onnx/expect/TestOperators.test_at_op.expect index a0f13b754247..8890f6535756 100644 --- a/test/onnx/expect/TestOperators.test_at_op.expect +++ b/test/onnx/expect/TestOperators.test_at_op.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,8 +13,14 @@ graph { s: "add" type: STRING } + attribute { + name: "overload_name" + s: "" + type: STRING + } + domain: "org.pytorch.aten" } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -49,5 +55,9 @@ graph { } } opset_import { - version: 9 + version: 13 +} +opset_import { + domain: "org.pytorch.aten" + version: 1 } diff --git a/test/onnx/expect/TestOperators.test_atan.expect b/test/onnx/expect/TestOperators.test_atan.expect index 72ff4ba536bc..c8d189e1415e 100644 --- a/test/onnx/expect/TestOperators.test_atan.expect +++ b/test/onnx/expect/TestOperators.test_atan.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Atan_0" op_type: "Atan" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Atan_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_aten_embedding_1.expect b/test/onnx/expect/TestOperators.test_aten_embedding_1.expect index cb4454337751..25a4fb256e2e 100644 --- a/test/onnx/expect/TestOperators.test_aten_embedding_1.expect +++ b/test/onnx/expect/TestOperators.test_aten_embedding_1.expect @@ -16,7 +16,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "3" type { diff --git a/test/onnx/expect/TestOperators.test_aten_embedding_2.expect b/test/onnx/expect/TestOperators.test_aten_embedding_2.expect index 2b175cf621b2..98779b99d98d 100644 --- a/test/onnx/expect/TestOperators.test_aten_embedding_2.expect +++ b/test/onnx/expect/TestOperators.test_aten_embedding_2.expect @@ -6,19 +6,24 @@ graph { input: "emb.weight" input: "input_1" output: "onnx::Add_3" - name: "ATenOp_0" - op_type: "ATenOp" + name: "ATen_0" + op_type: "ATen" attribute { name: "custom_attributes_json" s: "{\"padding_idx\":-1,\"scale_grad_by_freq\":false,\"sparse\":false}" type: STRING } attribute { - name: "name" - s: "aten::embedding" + name: "operator" + s: "embedding" type: STRING } - domain: "com.microsoft" + attribute { + name: "overload_name" + s: "" + type: STRING + } + domain: "org.pytorch.aten" } node { input: "onnx::Add_3" @@ -95,7 +100,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 dims: 8 @@ -145,27 +150,11 @@ graph { } } } - value_info { - name: "onnx::Add_3" - type { - tensor_type { - elem_type: 1 - shape { - dim { - dim_param: "ATenOponnx::Add_3_dim_0" - } - dim { - dim_param: "ATenOponnx::Add_3_dim_1" - } - } - } - } - } } opset_import { version: 12 } opset_import { - domain: "com.microsoft" + domain: "org.pytorch.aten" version: 1 } diff --git a/test/onnx/expect/TestOperators.test_avg_pool2d.expect b/test/onnx/expect/TestOperators.test_avg_pool2d.expect index 5647d2b36ff4..344022ec2688 100644 --- a/test/onnx/expect/TestOperators.test_avg_pool2d.expect +++ b/test/onnx/expect/TestOperators.test_avg_pool2d.expect @@ -1,40 +1,43 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::Pad_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 8 + data_type: 7 + raw_data: 
"\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "onnx::Pad_0" - output: "onnx::AveragePool_1" - name: "Pad_0" + input: "onnx::Pad_1" + output: "onnx::AveragePool_2" + name: "Pad_1" op_type: "Pad" attribute { name: "mode" s: "constant" type: STRING } - attribute { - name: "pads" - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - ints: 0 - type: INTS - } - attribute { - name: "value" - f: 0 - type: FLOAT - } } node { - input: "onnx::AveragePool_1" - output: "2" - name: "AveragePool_1" + input: "onnx::AveragePool_2" + output: "3" + name: "AveragePool_2" op_type: "AveragePool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "kernel_shape" ints: 3 @@ -56,7 +59,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Pad_0" type { @@ -80,7 +83,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 1 @@ -103,5 +106,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_baddbmm.expect b/test/onnx/expect/TestOperators.test_baddbmm.expect index c021baac505d..fc7eb0f8295e 100644 --- a/test/onnx/expect/TestOperators.test_baddbmm.expect +++ b/test/onnx/expect/TestOperators.test_baddbmm.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -30,7 +30,7 @@ graph { name: "Add_3" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" initializer { data_type: 1 name: "onnx::Mul_11" @@ -119,5 +119,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_basic.expect b/test/onnx/expect/TestOperators.test_basic.expect index 280b8114034c..3d151aefabdb 100644 --- a/test/onnx/expect/TestOperators.test_basic.expect +++ b/test/onnx/expect/TestOperators.test_basic.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,7 +34,7 @@ graph { name: "Neg_4" op_type: "Neg" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -76,5 +76,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm.expect b/test/onnx/expect/TestOperators.test_batchnorm.expect index 5071995d8b34..d9c9ec338c8c 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 1 @@ -145,5 +145,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect index 136fc681ecfc..a4d2e1f10249 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_1d.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_1d.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { 
dims: 2 data_type: 1 @@ -133,5 +133,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect index 7ca2c910c7ab..a421443cdcda 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -50,7 +50,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 128 data_type: 1 @@ -135,5 +135,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect b/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect index 68ae47d8d243..a556e38c7198 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 1 @@ -93,5 +93,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_batchnorm_training.expect b/test/onnx/expect/TestOperators.test_batchnorm_training.expect index a0bc171ed9e5..5e8f2049e143 100644 --- a/test/onnx/expect/TestOperators.test_batchnorm_training.expect +++ b/test/onnx/expect/TestOperators.test_batchnorm_training.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -26,7 +26,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 1 @@ -149,5 +149,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_bitshift.expect b/test/onnx/expect/TestOperators.test_bitshift.expect index 3b2affd5c36f..10199d03efcd 100644 --- a/test/onnx/expect/TestOperators.test_bitshift.expect +++ b/test/onnx/expect/TestOperators.test_bitshift.expect @@ -3,48 +3,22 @@ producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::Pow_4" - name: "Constant_0" - op_type: "Constant" - attribute { - name: "value" - t { - data_type: 1 - raw_data: "\000\000\000@" - } - type: TENSOR - } - } - node { - input: "onnx::Pow_4" - input: "onnx::Pow_11" - output: "onnx::Cast_5" - name: "Pow_1" - op_type: "Pow" - } - node { - input: "onnx::Cast_5" - output: "onnx::Div_6" - name: "Cast_2" - op_type: "Cast" + input: "onnx::BitShift_0" + input: "onnx::BitShift_7" + output: "3" + name: "BitShift_0" + op_type: "BitShift" attribute { - name: "to" - i: 1 - type: INT + name: "direction" + s: "RIGHT" + type: STRING } } node { - input: "onnx::Div_0" - input: "onnx::Div_6" - output: "7" - name: "Div_3" - op_type: "Div" - } - node { - input: "onnx::BitShift_1" - input: "onnx::BitShift_12" - output: "10" - name: "BitShift_4" + input: "onnx::BitShift_0" + input: "onnx::BitShift_8" + output: "6" + name: "BitShift_1" op_type: "BitShift" attribute { name: "direction" @@ -52,38 +26,19 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" initializer { - data_type: 1 - name: "onnx::Pow_11" - raw_data: "\000\000\200?" 
+ data_type: 2 + name: "onnx::BitShift_7" + raw_data: "\001" } initializer { data_type: 2 - name: "onnx::BitShift_12" + name: "onnx::BitShift_8" raw_data: "\002" } input { - name: "onnx::Div_0" - type { - tensor_type { - elem_type: 1 - shape { - dim { - dim_value: 3 - } - dim { - dim_value: 4 - } - dim { - dim_value: 2 - } - } - } - } - } - input { - name: "onnx::BitShift_1" + name: "onnx::BitShift_0" type { tensor_type { elem_type: 2 @@ -102,10 +57,10 @@ graph { } } output { - name: "7" + name: "3" type { tensor_type { - elem_type: 1 + elem_type: 2 shape { dim { dim_value: 3 @@ -121,7 +76,7 @@ graph { } } output { - name: "10" + name: "6" type { tensor_type { elem_type: 2 diff --git a/test/onnx/expect/TestOperators.test_c2_op.expect b/test/onnx/expect/TestOperators.test_c2_op.expect index 941cde493661..bd525b881aee 100644 --- a/test/onnx/expect/TestOperators.test_c2_op.expect +++ b/test/onnx/expect/TestOperators.test_c2_op.expect @@ -63,7 +63,7 @@ graph { } domain: "org.pytorch._caffe2" } - name: "torch-jit-export" + name: "torch_jit" input { name: "_caffe2::GenerateProposals_0" type { diff --git a/test/onnx/expect/TestOperators.test_chunk.expect b/test/onnx/expect/TestOperators.test_chunk.expect index d623c913aeec..575245c807eb 100644 --- a/test/onnx/expect/TestOperators.test_chunk.expect +++ b/test/onnx/expect/TestOperators.test_chunk.expect @@ -1,28 +1,158 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Split_0" - output: "1" - output: "2" - name: "Split_0" - op_type: "Split" + input: "onnx::Shape_0" + output: "onnx::Gather_1" + name: "Shape_0" + op_type: "Shape" + } + node { + output: "onnx::Gather_2" + name: "Constant_1" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Gather_1" + input: "onnx::Gather_2" + output: "onnx::Add_3" + name: "Gather_2" + op_type: "Gather" attribute { name: "axis" i: 0 type: INT } + } + node { + output: "onnx::Slice_4" + name: "Constant_3" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Add_5" + name: "Constant_4" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Add_3" + input: "onnx::Add_5" + output: "onnx::Div_6" + name: "Add_5" + op_type: "Add" + } + node { + output: "onnx::Div_7" + name: "Constant_6" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Div_6" + input: "onnx::Div_7" + output: "onnx::Mul_8" + name: "Div_7" + op_type: "Div" + } + node { + output: "onnx::Mul_9" + name: "Constant_8" + op_type: "Constant" attribute { - name: "split" - ints: 2 - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Mul_8" + input: "onnx::Mul_9" + output: "onnx::Slice_10" + name: "Mul_9" + op_type: "Mul" + } + node { + input: "onnx::Shape_0" + input: "onnx::Slice_4" + input: "onnx::Slice_10" + input: "onnx::Gather_2" + output: "11" + name: "Slice_10" + op_type: "Slice" + } + node { + output: "onnx::Mul_12" + name: "Constant_11" + 
op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Mul_8" + input: "onnx::Mul_12" + output: "onnx::Slice_13" + name: "Mul_12" + op_type: "Mul" + } + node { + input: "onnx::Shape_0" + input: "onnx::Slice_10" + input: "onnx::Slice_13" + input: "onnx::Gather_2" + output: "14" + name: "Slice_13" + op_type: "Slice" + } + name: "torch_jit" input { - name: "onnx::Split_0" + name: "onnx::Shape_0" type { tensor_type { elem_type: 1 @@ -35,7 +165,7 @@ graph { } } output { - name: "1" + name: "11" type { tensor_type { elem_type: 1 @@ -48,7 +178,7 @@ graph { } } output { - name: "2" + name: "14" type { tensor_type { elem_type: 1 @@ -62,5 +192,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_clip.expect b/test/onnx/expect/TestOperators.test_clip.expect index d2b0febe45c2..81606851e785 100644 --- a/test/onnx/expect/TestOperators.test_clip.expect +++ b/test/onnx/expect/TestOperators.test_clip.expect @@ -1,24 +1,26 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Clip_0" - output: "1" + input: "onnx::Clip_6" + input: "onnx::Clip_7" + output: "5" name: "Clip_0" op_type: "Clip" - attribute { - name: "max" - f: 0.5 - type: FLOAT - } - attribute { - name: "min" - f: -0.5 - type: FLOAT - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + data_type: 1 + name: "onnx::Clip_6" + raw_data: "\000\000\000\277" + } + initializer { + data_type: 1 + name: "onnx::Clip_7" + raw_data: "\000\000\000?" + } input { name: "onnx::Clip_0" type { @@ -36,7 +38,7 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 @@ -53,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_clip_max.expect b/test/onnx/expect/TestOperators.test_clip_max.expect index 0a254a516e5c..ceb89b3048c6 100644 --- a/test/onnx/expect/TestOperators.test_clip_max.expect +++ b/test/onnx/expect/TestOperators.test_clip_max.expect @@ -1,19 +1,21 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Clip_0" - output: "1" + input: "" + input: "onnx::Clip_7" + output: "5" name: "Clip_0" op_type: "Clip" - attribute { - name: "max" - f: 0.1 - type: FLOAT - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + data_type: 1 + name: "onnx::Clip_7" + raw_data: "\315\314\314=" + } input { name: "onnx::Clip_0" type { @@ -37,22 +39,22 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Clip5_dim_0" } dim { - dim_value: 2 + dim_param: "Clip5_dim_1" } dim { - dim_value: 3 + dim_param: "Clip5_dim_2" } dim { - dim_value: 4 + dim_param: "Clip5_dim_3" } } } @@ -60,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_clip_min.expect b/test/onnx/expect/TestOperators.test_clip_min.expect index d54354d6f212..22826be3fd54 100644 --- a/test/onnx/expect/TestOperators.test_clip_min.expect +++ b/test/onnx/expect/TestOperators.test_clip_min.expect @@ -1,19 +1,21 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Clip_0" - output: "1" + input: "onnx::Clip_7" + input: "" + output: "5" name: "Clip_0" op_type: "Clip" - attribute { - name: "min" - 
f: -0.1 - type: FLOAT - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + data_type: 1 + name: "onnx::Clip_7" + raw_data: "\315\314\314\275" + } input { name: "onnx::Clip_0" type { @@ -37,22 +39,22 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Clip5_dim_0" } dim { - dim_value: 2 + dim_param: "Clip5_dim_1" } dim { - dim_value: 3 + dim_param: "Clip5_dim_2" } dim { - dim_value: 4 + dim_param: "Clip5_dim_3" } } } @@ -60,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_concat2.expect b/test/onnx/expect/TestOperators.test_concat2.expect index a5f3481305ae..f5b6aec0c229 100644 --- a/test/onnx/expect/TestOperators.test_concat2.expect +++ b/test/onnx/expect/TestOperators.test_concat2.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -14,7 +14,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Concat_0" type { @@ -65,5 +65,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_conv.expect b/test/onnx/expect/TestOperators.test_conv.expect index 9f63f64dae0a..f1078cef39c1 100644 --- a/test/onnx/expect/TestOperators.test_conv.expect +++ b/test/onnx/expect/TestOperators.test_conv.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -40,7 +40,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 13 dims: 16 @@ -118,5 +118,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect b/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect index 2347dc7da914..18e3c683e9bc 100644 --- a/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect +++ b/test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -40,7 +40,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 13 dims: 16 @@ -96,5 +96,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect b/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect index 3737ed4a571d..94ad47523905 100644 --- a/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect +++ b/test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect @@ -40,7 +40,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 dims: 2 diff --git a/test/onnx/expect/TestOperators.test_convtranspose.expect b/test/onnx/expect/TestOperators.test_convtranspose.expect index b1f9bda52040..0beedca2f292 100644 --- a/test/onnx/expect/TestOperators.test_convtranspose.expect +++ b/test/onnx/expect/TestOperators.test_convtranspose.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -46,7 +46,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 3 dims: 3 @@ -124,5 +124,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_cos.expect b/test/onnx/expect/TestOperators.test_cos.expect index c0821c91e8aa..1185bca62c59 100644 --- 
a/test/onnx/expect/TestOperators.test_cos.expect +++ b/test/onnx/expect/TestOperators.test_cos.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Cos_0" op_type: "Cos" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cos_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_cumsum.expect b/test/onnx/expect/TestOperators.test_cumsum.expect index 0f8c5981a222..19d4d97d0817 100644 --- a/test/onnx/expect/TestOperators.test_cumsum.expect +++ b/test/onnx/expect/TestOperators.test_cumsum.expect @@ -22,7 +22,7 @@ graph { name: "CumSum_1" op_type: "CumSum" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::CumSum_0" type { diff --git a/test/onnx/expect/TestOperators.test_det.expect b/test/onnx/expect/TestOperators.test_det.expect index 8495b85fc50d..a15b0e2d32de 100644 --- a/test/onnx/expect/TestOperators.test_det.expect +++ b/test/onnx/expect/TestOperators.test_det.expect @@ -8,7 +8,7 @@ graph { name: "Det_0" op_type: "Det" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Det_0" type { diff --git a/test/onnx/expect/TestOperators.test_dict.expect b/test/onnx/expect/TestOperators.test_dict.expect index 26b7031d7760..e041d535d768 100644 --- a/test/onnx/expect/TestOperators.test_dict.expect +++ b/test/onnx/expect/TestOperators.test_dict.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dict_str.expect b/test/onnx/expect/TestOperators.test_dict_str.expect index 0bdfa7638616..eaab2752fb7d 100644 --- a/test/onnx/expect/TestOperators.test_dict_str.expect +++ b/test/onnx/expect/TestOperators.test_dict_str.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -63,5 +63,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dim.expect b/test/onnx/expect/TestOperators.test_dim.expect index 4b1f6a3881d1..59e910a646ca 100644 --- a/test/onnx/expect/TestOperators.test_dim.expect +++ b/test/onnx/expect/TestOperators.test_dim.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -15,7 +15,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -28,5 +28,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dropout.expect b/test/onnx/expect/TestOperators.test_dropout.expect index 6cea69c5c17f..27aab5c71821 100644 --- a/test/onnx/expect/TestOperators.test_dropout.expect +++ b/test/onnx/expect/TestOperators.test_dropout.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -42,5 +42,5 @@ graph { } } opset_import { - version: 9 + version: 13 } 
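The Clip hunks above illustrate what these regenerated fixtures capture: at opset 13 the Clip node takes its min/max bounds as optional inputs (materialized here as initializers such as onnx::Clip_6 and onnx::Clip_7) instead of float attributes, the graph name becomes torch_jit, and ir_version moves from 4 to 7. A minimal Python sketch of how a comparable graph can be produced and inspected; the module, input shape, and variable names are illustrative and not taken from the test suite:

import io

import onnx
import torch


class ClipModel(torch.nn.Module):
    # clamp(min, max) is exported as a single ONNX Clip node
    def forward(self, x):
        return x.clamp(min=-0.5, max=0.5)


buf = io.BytesIO()
# Exporting at opset 13 emits Clip with its bounds wired in as inputs
# (constant initializers), matching the regenerated .expect files above.
torch.onnx.export(ClipModel(), torch.randn(1, 2, 3, 4), buf, opset_version=13)
graph = onnx.load_from_string(buf.getvalue()).graph
print(graph)  # protobuf text form, comparable to the fixture contents
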
diff --git a/test/onnx/expect/TestOperators.test_dropout_default.expect b/test/onnx/expect/TestOperators.test_dropout_default.expect index a8fce1dd2745..89c0e988aacb 100644 --- a/test/onnx/expect/TestOperators.test_dropout_default.expect +++ b/test/onnx/expect/TestOperators.test_dropout_default.expect @@ -1,23 +1,46 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "x" - output: "onnx::ReduceMax_1" - output: "2" - name: "Dropout_0" - op_type: "Dropout" + output: "onnx::Dropout_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000?" + } + type: TENSOR + } + } + node { + output: "onnx::Dropout_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "ratio" - f: 0.5 - type: FLOAT + name: "value" + t { + data_type: 9 + raw_data: "\001" + } + type: TENSOR } } node { - input: "onnx::ReduceMax_1" - output: "3" - name: "ReduceMax_1" + input: "x" + input: "onnx::Dropout_1" + input: "onnx::Dropout_2" + output: "onnx::ReduceMax_3" + output: "4" + name: "Dropout_2" + op_type: "Dropout" + } + node { + input: "onnx::ReduceMax_3" + output: "5" + name: "ReduceMax_3" op_type: "ReduceMax" attribute { name: "keepdims" @@ -25,7 +48,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -43,7 +66,7 @@ graph { } } output { - name: "3" + name: "5" type { tensor_type { elem_type: 1 @@ -54,5 +77,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dropout_opset12.expect b/test/onnx/expect/TestOperators.test_dropout_opset12.expect index b2f908d4b1c2..af5738700bc3 100644 --- a/test/onnx/expect/TestOperators.test_dropout_opset12.expect +++ b/test/onnx/expect/TestOperators.test_dropout_opset12.expect @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { diff --git a/test/onnx/expect/TestOperators.test_dropout_training.expect b/test/onnx/expect/TestOperators.test_dropout_training.expect index a8fce1dd2745..89c0e988aacb 100644 --- a/test/onnx/expect/TestOperators.test_dropout_training.expect +++ b/test/onnx/expect/TestOperators.test_dropout_training.expect @@ -1,23 +1,46 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "x" - output: "onnx::ReduceMax_1" - output: "2" - name: "Dropout_0" - op_type: "Dropout" + output: "onnx::Dropout_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000?" 
+ } + type: TENSOR + } + } + node { + output: "onnx::Dropout_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "ratio" - f: 0.5 - type: FLOAT + name: "value" + t { + data_type: 9 + raw_data: "\001" + } + type: TENSOR } } node { - input: "onnx::ReduceMax_1" - output: "3" - name: "ReduceMax_1" + input: "x" + input: "onnx::Dropout_1" + input: "onnx::Dropout_2" + output: "onnx::ReduceMax_3" + output: "4" + name: "Dropout_2" + op_type: "Dropout" + } + node { + input: "onnx::ReduceMax_3" + output: "5" + name: "ReduceMax_3" op_type: "ReduceMax" attribute { name: "keepdims" @@ -25,7 +48,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -43,7 +66,7 @@ graph { } } output { - name: "3" + name: "5" type { tensor_type { elem_type: 1 @@ -54,5 +77,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect b/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect index 657f7bd38817..7effb1e17421 100644 --- a/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect +++ b/test/onnx/expect/TestOperators.test_dropout_training_opset12.expect @@ -48,7 +48,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect index 7ad54ca1031c..cf6c6b358037 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_add.expect @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input_1" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect index 9698a3285217..8fd558eda82f 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input_1" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect index dd0943e8ece4..ee4c3f82c7b4 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect @@ -9,7 +9,7 @@ graph { name: "MatMul_0" op_type: "MatMul" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input_1" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect index c41a46d54519..673e34cc4dc4 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect b/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect index a3931b6ebd1d..2dbf0d186ccc 100644 --- a/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect +++ b/test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect @@ 
-37,7 +37,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect index 5dc0dc6db48b..9fc2d5aab1fe 100644 --- a/test/onnx/expect/TestOperators.test_elu.expect +++ b/test/onnx/expect/TestOperators.test_elu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_embedding_bags.expect b/test/onnx/expect/TestOperators.test_embedding_bags.expect index 630fee8ba8e0..eb4a94b75590 100644 --- a/test/onnx/expect/TestOperators.test_embedding_bags.expect +++ b/test/onnx/expect/TestOperators.test_embedding_bags.expect @@ -1,43 +1,354 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "weight" - input: "input" - input: "offsets" - output: "3" - output: "4" output: "5" - output: "6" - op_type: "ATen" + name: "Constant_0" + op_type: "Constant" attribute { - name: "include_last_offset" - i: 0 - type: INT + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "input" + output: "onnx::Gather_6" + name: "Shape_1" + op_type: "Shape" + } + node { + output: "onnx::Gather_7" + name: "Constant_2" + op_type: "Constant" attribute { - name: "mode" - i: 1 + name: "value" + t { + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Gather_6" + input: "onnx::Gather_7" + output: "onnx::Unsqueeze_8" + name: "Gather_3" + op_type: "Gather" + attribute { + name: "axis" + i: 0 type: INT } + } + node { + output: "onnx::Unsqueeze_9" + name: "Constant_4" + op_type: "Constant" attribute { - name: "operator" - s: "embedding_bag" - type: STRING + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::Unsqueeze_8" + input: "onnx::Unsqueeze_9" + output: "onnx::Concat_10" + name: "Unsqueeze_5" + op_type: "Unsqueeze" + } + node { + input: "offsets" + input: "onnx::Concat_10" + output: "onnx::Slice_11" + name: "Concat_6" + op_type: "Concat" attribute { - name: "scale_grad_by_freq" + name: "axis" i: 0 type: INT } + } + node { + output: "onnx::Slice_12" + name: "Constant_7" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_13" + name: "Constant_8" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_14" + name: "Constant_9" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\177" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_15" + name: "Constant_10" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Slice_11" + input: "onnx::Slice_13" + input: "onnx::Slice_14" + input: "onnx::Slice_12" + input: "onnx::Slice_15" 
+ output: "onnx::Shape_16" + name: "Slice_11" + op_type: "Slice" + } + node { + input: "onnx::Shape_16" + output: "onnx::Gather_17" + name: "Shape_12" + op_type: "Shape" + } + node { + output: "onnx::Gather_18" + name: "Constant_13" + op_type: "Constant" + attribute { + name: "value" + t { + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Gather_17" + input: "onnx::Gather_18" + output: "onnx::Loop_19" + name: "Gather_14" + op_type: "Gather" attribute { - name: "sparse" + name: "axis" i: 0 type: INT } } - name: "torch-jit-export" + node { + input: "onnx::Loop_19" + input: "onnx::Loop_33" + output: "20" + name: "Loop_15" + op_type: "Loop" + attribute { + name: "body" + g { + node { + input: "onnx::Slice_11" + input: "21" + output: "23" + name: "Gather_16" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "onnx::Shape_16" + input: "21" + output: "24" + name: "Gather_17" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + output: "25" + name: "Constant_18" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "23" + input: "25" + output: "26" + name: "Unsqueeze_19" + op_type: "Unsqueeze" + } + node { + output: "27" + name: "Constant_20" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "24" + input: "27" + output: "28" + name: "Unsqueeze_21" + op_type: "Unsqueeze" + } + node { + input: "input" + input: "26" + input: "28" + input: "5" + output: "29" + name: "Slice_22" + op_type: "Slice" + } + node { + input: "weight" + input: "29" + output: "30" + name: "Gather_23" + op_type: "Gather" + attribute { + name: "axis" + i: 0 + type: INT + } + } + node { + input: "30" + output: "31" + name: "ReduceMean_24" + op_type: "ReduceMean" + attribute { + name: "axes" + ints: 0 + type: INTS + } + attribute { + name: "keepdims" + i: 0 + type: INT + } + } + node { + input: "onnx::Loop_33" + output: "32" + name: "Cast_25" + op_type: "Cast" + attribute { + name: "to" + i: 9 + type: INT + } + } + name: "torch_jit1" + input { + name: "21" + type { + tensor_type { + elem_type: 7 + shape { + } + } + } + } + input { + name: "22" + type { + tensor_type { + elem_type: 9 + shape { + } + } + } + } + output { + name: "32" + type { + tensor_type { + elem_type: 9 + shape { + } + } + } + } + output { + name: "31" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_param: "Loop20_dim_1" + } + } + } + } + } + } + type: GRAPH + } + } + name: "torch_jit" initializer { dims: 10 dims: 8 @@ -45,6 +356,11 @@ graph { name: "weight" raw_data: 
"\264\314\344\275\017A\376\276\313\374&>J\266a\277s\306\\=\212\032+?\211[t\275\344[\357\276Dk\\\276OKb?\234\'B\277A\334\274\2767N\257\276\320s\263\277\371+\244>:\314\202\277K\200L??\001\275\275\236u4\2774\032\315\277\214\004\224>Z\320\372>\267B\305\276\346G6\277N\265.\276\343\316\272\277t\364a>\201)|>p\223\251\277Qm2?\346\275)\277\354\235\233?\027X\277\277\253\206a?\354\335\226\277L\032o\277\251J\021\277\311\360\215\276\312\274\013\300\252\320\273>\220\"p?\267\020\000\222\233\314?\334\360?\275|t\303\277\214\351\000\300\3065\302\2775\206\306>X\251\227\277x\2160?U^\251?d\221\350?\237F.?\rp9?9X\004=/c\324\277SL\360\277\'\274\332\356\226\275\211\035\241>*\271\204\277>\025W>\036K\035?\036\233\200=\035\313\250\276\017\003\346\277\374p_?\313WD?!\006\351\275\232\\q\277\230\007A?" } + initializer { + data_type: 9 + name: "onnx::Loop_33" + raw_data: "\001" + } input { name: "input" type { @@ -87,17 +403,27 @@ graph { } } } + input { + name: "onnx::Loop_33" + type { + tensor_type { + elem_type: 9 + shape { + } + } + } + } output { - name: "3" + name: "20" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Loop20_dim_0" } dim { - dim_value: 8 + dim_param: "Loop20_dim_1" } } } @@ -105,5 +431,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_empty_like.expect b/test/onnx/expect/TestOperators.test_empty_like.expect index 27ac22983187..e4f6c6ede2ca 100644 --- a/test/onnx/expect/TestOperators.test_empty_like.expect +++ b/test/onnx/expect/TestOperators.test_empty_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_empty_like_opset7.expect b/test/onnx/expect/TestOperators.test_empty_like_opset7.expect index 095d3dc89bdb..504162493a00 100644 --- a/test/onnx/expect/TestOperators.test_empty_like_opset7.expect +++ b/test/onnx/expect/TestOperators.test_empty_like_opset7.expect @@ -29,7 +29,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Shape_0" type { diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index 9da730440ef5..5a9877d484f8 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Equal_0" op_type: "Equal" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Equal_0" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_erf.expect b/test/onnx/expect/TestOperators.test_erf.expect index 023f051c05e6..f8f70c37598d 100644 --- a/test/onnx/expect/TestOperators.test_erf.expect +++ b/test/onnx/expect/TestOperators.test_erf.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Erf_0" op_type: "Erf" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Erf_0" type { @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_exp.expect 
b/test/onnx/expect/TestOperators.test_exp.expect index f51786608e44..49d9f74cb20d 100644 --- a/test/onnx/expect/TestOperators.test_exp.expect +++ b/test/onnx/expect/TestOperators.test_exp.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Exp_0" op_type: "Exp" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Exp_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_expand.expect b/test/onnx/expect/TestOperators.test_expand.expect index 2e0fd4fdf725..6634173a0a63 100644 --- a/test/onnx/expect/TestOperators.test_expand.expect +++ b/test/onnx/expect/TestOperators.test_expand.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -87,7 +87,7 @@ graph { name: "Expand_7" op_type: "Expand" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 1 data_type: 7 @@ -131,5 +131,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_flatten.expect b/test/onnx/expect/TestOperators.test_flatten.expect index e44e542c4138..12160e8b9e66 100644 --- a/test/onnx/expect/TestOperators.test_flatten.expect +++ b/test/onnx/expect/TestOperators.test_flatten.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,29 +9,59 @@ graph { op_type: "Shape" } node { - input: "onnx::Slice_1" - output: "onnx::Concat_2" - name: "Slice_1" - op_type: "Slice" + output: "onnx::Slice_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "axes" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_3" + name: "Constant_2" + op_type: "Constant" attribute { - name: "ends" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_4" + name: "Constant_3" + op_type: "Constant" attribute { - name: "starts" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - output: "onnx::Concat_3" - name: "Constant_2" + input: "onnx::Slice_1" + input: "onnx::Slice_3" + input: "onnx::Slice_4" + input: "onnx::Slice_2" + output: "onnx::Concat_5" + name: "Slice_4" + op_type: "Slice" + } + node { + output: "onnx::Concat_6" + name: "Constant_5" op_type: "Constant" attribute { name: "value" @@ -44,10 +74,10 @@ graph { } } node { - input: "onnx::Concat_2" - input: "onnx::Concat_3" - output: "onnx::Reshape_4" - name: "Concat_3" + input: "onnx::Concat_5" + input: "onnx::Concat_6" + output: "onnx::Reshape_7" + name: "Concat_6" op_type: "Concat" attribute { name: "axis" @@ -57,12 +87,12 @@ graph { } node { input: "onnx::Shape_0" - input: "onnx::Reshape_4" - output: "5" - name: "Reshape_4" + input: "onnx::Reshape_7" + output: "8" + name: "Reshape_7" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Shape_0" type { @@ -86,7 +116,7 @@ graph { } } output { - name: "5" + name: "8" type { tensor_type { elem_type: 1 @@ -100,5 +130,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_flatten2D.expect 
b/test/onnx/expect/TestOperators.test_flatten2D.expect index c8f2e1c16045..f60b1ba7066f 100644 --- a/test/onnx/expect/TestOperators.test_flatten2D.expect +++ b/test/onnx/expect/TestOperators.test_flatten2D.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Flatten_0" type { @@ -54,5 +54,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_fmod.expect b/test/onnx/expect/TestOperators.test_fmod.expect index 91c8f93ee004..a93ed8980ce2 100644 --- a/test/onnx/expect/TestOperators.test_fmod.expect +++ b/test/onnx/expect/TestOperators.test_fmod.expect @@ -14,7 +14,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Mod_0" type { diff --git a/test/onnx/expect/TestOperators.test_frobenius_norm.expect b/test/onnx/expect/TestOperators.test_frobenius_norm.expect index 5b759b291c82..fba4585b18b8 100644 --- a/test/onnx/expect/TestOperators.test_frobenius_norm.expect +++ b/test/onnx/expect/TestOperators.test_frobenius_norm.expect @@ -1,38 +1,52 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "x" input: "x" - output: "onnx::ReduceSum_1" - name: "Mul_0" + output: "onnx::ReduceSum_2" + name: "Mul_1" op_type: "Mul" } node { + input: "onnx::ReduceSum_2" input: "onnx::ReduceSum_1" - output: "onnx::Sqrt_2" - name: "ReduceSum_1" + output: "onnx::Sqrt_3" + name: "ReduceSum_2" op_type: "ReduceSum" - attribute { - name: "axes" - ints: 0 - ints: 1 - type: INTS - } attribute { name: "keepdims" i: 1 type: INT } + attribute { + name: "noop_with_empty_axes" + i: 0 + type: INT + } } node { - input: "onnx::Sqrt_2" - output: "3" - name: "Sqrt_2" + input: "onnx::Sqrt_3" + output: "4" + name: "Sqrt_3" op_type: "Sqrt" } - name: "torch-jit-export" + name: "torch_jit" input { name: "x" type { @@ -53,7 +67,7 @@ graph { } } output { - name: "3" + name: "4" type { tensor_type { elem_type: 1 @@ -73,5 +87,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_full.expect b/test/onnx/expect/TestOperators.test_full.expect index a832bd8e2c58..fc8acf5ee80d 100644 --- a/test/onnx/expect/TestOperators.test_full.expect +++ b/test/onnx/expect/TestOperators.test_full.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_full_like.expect b/test/onnx/expect/TestOperators.test_full_like.expect index a832bd8e2c58..fc8acf5ee80d 100644 --- a/test/onnx/expect/TestOperators.test_full_like.expect +++ b/test/onnx/expect/TestOperators.test_full_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } 
opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gather.expect b/test/onnx/expect/TestOperators.test_gather.expect index 97658076b969..609f89853ac6 100644 --- a/test/onnx/expect/TestOperators.test_gather.expect +++ b/test/onnx/expect/TestOperators.test_gather.expect @@ -1,114 +1,22 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::OneHot_2" - name: "Constant_0" - op_type: "Constant" - attribute { - name: "value" - t { - dims: 2 - data_type: 7 - raw_data: "\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" - } - type: TENSOR - } - } - node { - output: "onnx::Gather_3" - name: "Constant_1" - op_type: "Constant" - attribute { - name: "value" - t { - dims: 1 - data_type: 7 - raw_data: "\001\000\000\000\000\000\000\000" - } - type: TENSOR - } - } - node { - input: "onnx::Shape_0" - output: "onnx::Gather_4" - name: "Shape_2" - op_type: "Shape" - } - node { - input: "onnx::Gather_4" - input: "onnx::Gather_3" - output: "onnx::OneHot_5" - name: "Gather_3" - op_type: "Gather" - attribute { - name: "axis" - i: 0 - type: INT - } - } - node { - input: "onnx::OneHot_1" - input: "onnx::OneHot_5" - input: "onnx::OneHot_2" - output: "onnx::Cast_6" - name: "OneHot_4" - op_type: "OneHot" + input: "onnx::GatherElements_0" + input: "onnx::GatherElements_1" + output: "2" + name: "GatherElements_0" + op_type: "GatherElements" attribute { name: "axis" i: 1 type: INT } } - node { - input: "onnx::Cast_6" - output: "onnx::Mul_7" - name: "Cast_5" - op_type: "Cast" - attribute { - name: "to" - i: 1 - type: INT - } - } - node { - input: "onnx::Shape_0" - output: "onnx::Mul_8" - name: "Unsqueeze_6" - op_type: "Unsqueeze" - attribute { - name: "axes" - ints: 2 - type: INTS - } - } - node { - input: "onnx::Mul_8" - input: "onnx::Mul_7" - output: "onnx::ReduceSum_9" - name: "Mul_7" - op_type: "Mul" - } - node { - input: "onnx::ReduceSum_9" - output: "10" - name: "ReduceSum_8" - op_type: "ReduceSum" - attribute { - name: "axes" - ints: 1 - type: INTS - } - attribute { - name: "keepdims" - i: 0 - type: INT - } - } - name: "torch-jit-export" + name: "torch_jit" input { - name: "onnx::Shape_0" + name: "onnx::GatherElements_0" type { tensor_type { elem_type: 1 @@ -127,7 +35,7 @@ graph { } } input { - name: "onnx::OneHot_1" + name: "onnx::GatherElements_1" type { tensor_type { elem_type: 7 @@ -146,7 +54,7 @@ graph { } } output { - name: "10" + name: "2" type { tensor_type { elem_type: 1 @@ -166,5 +74,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gather_opset11.expect b/test/onnx/expect/TestOperators.test_gather_opset11.expect index 42d94992e7ea..88c8a4542f45 100644 --- a/test/onnx/expect/TestOperators.test_gather_opset11.expect +++ b/test/onnx/expect/TestOperators.test_gather_opset11.expect @@ -14,7 +14,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::GatherElements_0" type { diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 9d338b1e2ae1..8d578a4d25bd 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -1,23 +1,17 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Less_0" - input: "onnx::Less_1" - output: "onnx::Not_2" - name: "Less_0" - op_type: "Less" + input: "onnx::GreaterOrEqual_0" + 
input: "onnx::GreaterOrEqual_1" + output: "2" + name: "GreaterOrEqual_0" + op_type: "GreaterOrEqual" } - node { - input: "onnx::Not_2" - output: "3" - name: "Not_1" - op_type: "Not" - } - name: "torch-jit-export" + name: "torch_jit" input { - name: "onnx::Less_0" + name: "onnx::GreaterOrEqual_0" type { tensor_type { elem_type: 6 @@ -33,7 +27,7 @@ graph { } } input { - name: "onnx::Less_1" + name: "onnx::GreaterOrEqual_1" type { tensor_type { elem_type: 6 @@ -49,7 +43,7 @@ graph { } } output { - name: "3" + name: "2" type { tensor_type { elem_type: 9 @@ -66,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gelu.expect b/test/onnx/expect/TestOperators.test_gelu.expect index 65265bc4f860..dfc7d1d88468 100644 --- a/test/onnx/expect/TestOperators.test_gelu.expect +++ b/test/onnx/expect/TestOperators.test_gelu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -75,7 +75,7 @@ graph { name: "Mul_7" op_type: "Mul" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Div_0" type { @@ -122,5 +122,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index 08dec7abe29d..5aab77798bf6 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Greater_0" op_type: "Greater" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Greater_0" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_hardtanh.expect b/test/onnx/expect/TestOperators.test_hardtanh.expect index 3648a367d050..1268a4c14cfd 100644 --- a/test/onnx/expect/TestOperators.test_hardtanh.expect +++ b/test/onnx/expect/TestOperators.test_hardtanh.expect @@ -1,24 +1,42 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "input" - output: "1" - name: "Clip_0" - op_type: "Clip" + output: "onnx::Clip_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "max" - f: 0.5 - type: FLOAT + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000\277" + } + type: TENSOR } + } + node { + output: "onnx::Clip_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "min" - f: -0.5 - type: FLOAT + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000?" 
+ } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "input" + input: "onnx::Clip_1" + input: "onnx::Clip_2" + output: "3" + name: "Clip_2" + op_type: "Clip" + } + name: "torch_jit" input { name: "input" type { @@ -36,7 +54,7 @@ graph { } } output { - name: "1" + name: "3" type { tensor_type { elem_type: 1 @@ -53,5 +71,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_implicit_expand.expect b/test/onnx/expect/TestOperators.test_implicit_expand.expect index 9d64c5d16568..3c94edc85b4b 100644 --- a/test/onnx/expect/TestOperators.test_implicit_expand.expect +++ b/test/onnx/expect/TestOperators.test_implicit_expand.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_index.expect b/test/onnx/expect/TestOperators.test_index.expect index c65cffea5c2b..330d2de0d7fc 100644 --- a/test/onnx/expect/TestOperators.test_index.expect +++ b/test/onnx/expect/TestOperators.test_index.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -27,7 +27,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Gather_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_isnan.expect b/test/onnx/expect/TestOperators.test_isnan.expect index b0f390e4d6f6..198d3bdb2387 100644 --- a/test/onnx/expect/TestOperators.test_isnan.expect +++ b/test/onnx/expect/TestOperators.test_isnan.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "IsNaN_0" op_type: "IsNaN" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::IsNaN_0" type { @@ -37,5 +37,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_layer_norm_aten.expect b/test/onnx/expect/TestOperators.test_layer_norm_aten.expect index 41a1ae695019..071437686117 100644 --- a/test/onnx/expect/TestOperators.test_layer_norm_aten.expect +++ b/test/onnx/expect/TestOperators.test_layer_norm_aten.expect @@ -1,36 +1,106 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "input" - input: "weight" - input: "bias" - output: "3" - op_type: "ATen" + output: "onnx::Sub_3" + name: "ReduceMean_0" + op_type: "ReduceMean" attribute { - name: "cudnn_enable" - i: 1 - type: INT + name: "axes" + ints: -2 + ints: -1 + type: INTS } + } + node { + input: "input" + input: "onnx::Sub_3" + output: "onnx::Pow_4" + name: "Sub_1" + op_type: "Sub" + } + node { + output: "onnx::Pow_5" + name: "Constant_2" + op_type: "Constant" attribute { - name: "eps" - f: 1e-05 - type: FLOAT + name: "value" + t { + data_type: 1 + raw_data: "\000\000\000@" + } + type: TENSOR } + } + node { + input: "onnx::Pow_4" + input: "onnx::Pow_5" + output: "onnx::ReduceMean_6" + name: "Pow_3" + op_type: "Pow" + } + node { + input: "onnx::ReduceMean_6" + output: "onnx::Add_7" + name: "ReduceMean_4" + op_type: "ReduceMean" attribute { - name: "normalized_shape" - ints: 10 - ints: 10 + name: "axes" + ints: -2 + ints: -1 type: 
INTS } + } + node { + output: "onnx::Add_8" + name: "Constant_5" + op_type: "Constant" attribute { - name: "operator" - s: "layer_norm" - type: STRING + name: "value" + t { + data_type: 1 + raw_data: "\254\305\'7" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Add_7" + input: "onnx::Add_8" + output: "onnx::Sqrt_9" + name: "Add_6" + op_type: "Add" + } + node { + input: "onnx::Sqrt_9" + output: "onnx::Div_10" + name: "Sqrt_7" + op_type: "Sqrt" + } + node { + input: "onnx::Pow_4" + input: "onnx::Div_10" + output: "onnx::Mul_11" + name: "Div_8" + op_type: "Div" + } + node { + input: "onnx::Mul_11" + input: "weight" + output: "onnx::Add_12" + name: "Mul_9" + op_type: "Mul" + } + node { + input: "onnx::Add_12" + input: "bias" + output: "13" + name: "Add_10" + op_type: "Add" + } + name: "torch_jit" initializer { dims: 10 dims: 10 @@ -100,7 +170,7 @@ graph { } } output { - name: "3" + name: "13" type { tensor_type { elem_type: 1 @@ -123,5 +193,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index a29a46f89ebb..374a0d0e0d52 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -1,23 +1,17 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Greater_0" - input: "onnx::Greater_1" - output: "onnx::Not_2" - name: "Greater_0" - op_type: "Greater" + input: "onnx::LessOrEqual_0" + input: "onnx::LessOrEqual_1" + output: "2" + name: "LessOrEqual_0" + op_type: "LessOrEqual" } - node { - input: "onnx::Not_2" - output: "3" - name: "Not_1" - op_type: "Not" - } - name: "torch-jit-export" + name: "torch_jit" input { - name: "onnx::Greater_0" + name: "onnx::LessOrEqual_0" type { tensor_type { elem_type: 6 @@ -33,7 +27,7 @@ graph { } } input { - name: "onnx::Greater_1" + name: "onnx::LessOrEqual_1" type { tensor_type { elem_type: 6 @@ -49,7 +43,7 @@ graph { } } output { - name: "3" + name: "2" type { tensor_type { elem_type: 9 @@ -66,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_linear.expect b/test/onnx/expect/TestOperators.test_linear.expect index 806f1cf83eed..71c64dfe5a50 100644 --- a/test/onnx/expect/TestOperators.test_linear.expect +++ b/test/onnx/expect/TestOperators.test_linear.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -25,7 +25,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 5 dims: 4 @@ -102,5 +102,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_log_sigmoid.expect b/test/onnx/expect/TestOperators.test_log_sigmoid.expect index 528952692684..2681f1193102 100644 --- a/test/onnx/expect/TestOperators.test_log_sigmoid.expect +++ b/test/onnx/expect/TestOperators.test_log_sigmoid.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -14,7 +14,7 @@ graph { name: "Log_1" op_type: "Log" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sigmoid_0" type { @@ -61,5 +61,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_logsoftmax.expect b/test/onnx/expect/TestOperators.test_logsoftmax.expect index 4ae37e23b95e..1c4de89b6402 100644 --- 
a/test/onnx/expect/TestOperators.test_logsoftmax.expect +++ b/test/onnx/expect/TestOperators.test_logsoftmax.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect b/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect index db2fcb4dcda3..94c9c72db8e6 100644 --- a/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect +++ b/test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect @@ -18,7 +18,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "7" type { diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index 29be0d629e82..2dbcc07cd9e1 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Less_0" op_type: "Less" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Less_0" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_master_opset.expect b/test/onnx/expect/TestOperators.test_master_opset.expect index b9a30f4b545c..f468400e7c6a 100644 --- a/test/onnx/expect/TestOperators.test_master_opset.expect +++ b/test/onnx/expect/TestOperators.test_master_opset.expect @@ -9,7 +9,7 @@ graph { name: "Add_0" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { diff --git a/test/onnx/expect/TestOperators.test_max.expect b/test/onnx/expect/TestOperators.test_max.expect index 29476215fd34..d9fcc0fb5f7a 100644 --- a/test/onnx/expect/TestOperators.test_max.expect +++ b/test/onnx/expect/TestOperators.test_max.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Max_0" op_type: "Max" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Max_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_maxpool.expect b/test/onnx/expect/TestOperators.test_maxpool.expect index 4def8b60c6dd..f43712bbfc58 100644 --- a/test/onnx/expect/TestOperators.test_maxpool.expect +++ b/test/onnx/expect/TestOperators.test_maxpool.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -7,6 +7,11 @@ graph { output: "1" name: "MaxPool_0" op_type: "MaxPool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "kernel_shape" ints: 3 @@ -24,7 +29,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::MaxPool_0" type { @@ -65,5 +70,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect index b0cfe51b0545..2d07fc6fadc7 100644 --- a/test/onnx/expect/TestOperators.test_maxpool_dilations.expect +++ b/test/onnx/expect/TestOperators.test_maxpool_dilations.expect @@ -34,7 +34,7 @@ graph 
{ type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::MaxPool_0" type { diff --git a/test/onnx/expect/TestOperators.test_maxpool_indices.expect b/test/onnx/expect/TestOperators.test_maxpool_indices.expect index 9b999cb6bf8d..46c23e3a4cae 100644 --- a/test/onnx/expect/TestOperators.test_maxpool_indices.expect +++ b/test/onnx/expect/TestOperators.test_maxpool_indices.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,6 +8,11 @@ graph { output: "onnx::Sub_2" name: "MaxPool_0" op_type: "MaxPool" + attribute { + name: "ceil_mode" + i: 0 + type: INT + } attribute { name: "kernel_shape" ints: 3 @@ -43,34 +48,64 @@ graph { } } node { - input: "onnx::Slice_4" - output: "onnx::Sub_5" - name: "Slice_2" - op_type: "Slice" + output: "onnx::Slice_5" + name: "Constant_2" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_6" + name: "Constant_3" + op_type: "Constant" attribute { - name: "ends" - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_7" + name: "Constant_4" + op_type: "Constant" attribute { - name: "starts" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } } + node { + input: "onnx::Slice_4" + input: "onnx::Slice_6" + input: "onnx::Slice_7" + input: "onnx::Slice_5" + output: "onnx::Sub_8" + name: "Slice_5" + op_type: "Slice" + } node { input: "onnx::Sub_2" - input: "onnx::Sub_5" - output: "6" - name: "Sub_3" + input: "onnx::Sub_8" + output: "9" + name: "Sub_6" op_type: "Sub" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::MaxPool_0" type { @@ -110,7 +145,7 @@ graph { } } output { - name: "6" + name: "9" type { tensor_type { elem_type: 7 @@ -130,5 +165,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_mean.expect b/test/onnx/expect/TestOperators.test_mean.expect index 6a3e26dc6d2d..b53b8c2f1248 100644 --- a/test/onnx/expect/TestOperators.test_mean.expect +++ b/test/onnx/expect/TestOperators.test_mean.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_mean_dtype.expect b/test/onnx/expect/TestOperators.test_mean_dtype.expect index acac724e1c16..92ce0ae3aa99 100644 --- a/test/onnx/expect/TestOperators.test_mean_dtype.expect +++ b/test/onnx/expect/TestOperators.test_mean_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -24,7 +24,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_meshgrid.expect b/test/onnx/expect/TestOperators.test_meshgrid.expect index 7f9321046a8c..05b9de875d94 100644 --- a/test/onnx/expect/TestOperators.test_meshgrid.expect +++ 
b/test/onnx/expect/TestOperators.test_meshgrid.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -219,7 +219,7 @@ graph { name: "Expand_21" op_type: "Expand" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Reshape_0" type { @@ -318,5 +318,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_min.expect b/test/onnx/expect/TestOperators.test_min.expect index 13c08b82a548..28ca14779f71 100644 --- a/test/onnx/expect/TestOperators.test_min.expect +++ b/test/onnx/expect/TestOperators.test_min.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Min_0" op_type: "Min" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Min_0" type { @@ -60,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_mm.expect b/test/onnx/expect/TestOperators.test_mm.expect index 97f7761b985f..9492d651fd9e 100644 --- a/test/onnx/expect/TestOperators.test_mm.expect +++ b/test/onnx/expect/TestOperators.test_mm.expect @@ -1,27 +1,12 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { - node { - output: "onnx::Gemm_2" - name: "Constant_0" - op_type: "Constant" - attribute { - name: "value" - t { - dims: 1 - data_type: 1 - raw_data: "\000\000\200?" - } - type: TENSOR - } - } node { input: "onnx::Gemm_0" input: "onnx::Gemm_1" - input: "onnx::Gemm_2" - output: "3" - name: "Gemm_1" + output: "2" + name: "Gemm_0" op_type: "Gemm" attribute { name: "alpha" @@ -34,7 +19,7 @@ graph { type: FLOAT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Gemm_0" type { @@ -68,7 +53,7 @@ graph { } } output { - name: "3" + name: "2" type { tensor_type { elem_type: 1 @@ -85,5 +70,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_narrow.expect b/test/onnx/expect/TestOperators.test_narrow.expect index 70b4ef8b35c8..a7b13c89a646 100644 --- a/test/onnx/expect/TestOperators.test_narrow.expect +++ b/test/onnx/expect/TestOperators.test_narrow.expect @@ -1,29 +1,35 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Slice_0" - output: "1" + input: "onnx::Slice_14" + input: "onnx::Slice_15" + input: "onnx::Slice_16" + output: "12" name: "Slice_0" op_type: "Slice" - attribute { - name: "axes" - ints: 0 - type: INTS - } - attribute { - name: "ends" - ints: 2 - type: INTS - } - attribute { - name: "starts" - ints: 0 - type: INTS - } } - name: "torch-jit-export" + name: "torch_jit" + initializer { + dims: 1 + data_type: 7 + name: "onnx::Slice_14" + raw_data: "\000\000\000\000\000\000\000\000" + } + initializer { + dims: 1 + data_type: 7 + name: "onnx::Slice_15" + raw_data: "\002\000\000\000\000\000\000\000" + } + initializer { + dims: 1 + data_type: 7 + name: "onnx::Slice_16" + raw_data: "\000\000\000\000\000\000\000\000" + } input { name: "onnx::Slice_0" type { @@ -41,7 +47,7 @@ graph { } } output { - name: "1" + name: "12" type { tensor_type { elem_type: 1 @@ -58,5 +64,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_ne.expect b/test/onnx/expect/TestOperators.test_ne.expect index 6849f2711765..ab053fbcf67e 100644 --- a/test/onnx/expect/TestOperators.test_ne.expect +++ 
b/test/onnx/expect/TestOperators.test_ne.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -15,7 +15,7 @@ graph { name: "Not_1" op_type: "Not" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Equal_0" type { @@ -78,5 +78,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_nonzero.expect b/test/onnx/expect/TestOperators.test_nonzero.expect index 48a57dc61587..cfcb1f505f87 100644 --- a/test/onnx/expect/TestOperators.test_nonzero.expect +++ b/test/onnx/expect/TestOperators.test_nonzero.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -20,7 +20,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::NonZero_0" type { @@ -58,5 +58,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_norm_p1.expect b/test/onnx/expect/TestOperators.test_norm_p1.expect index 519819695b20..ec5e12b90a16 100644 --- a/test/onnx/expect/TestOperators.test_norm_p1.expect +++ b/test/onnx/expect/TestOperators.test_norm_p1.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceL1_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_norm_p2.expect b/test/onnx/expect/TestOperators.test_norm_p2.expect index 68d1762f4174..0388ec620821 100644 --- a/test/onnx/expect/TestOperators.test_norm_p2.expect +++ b/test/onnx/expect/TestOperators.test_norm_p2.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceL2_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_ones_like.expect b/test/onnx/expect/TestOperators.test_ones_like.expect index ec25d269f79d..fafec789b174 100644 --- a/test/onnx/expect/TestOperators.test_ones_like.expect +++ b/test/onnx/expect/TestOperators.test_ones_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_pad.expect b/test/onnx/expect/TestOperators.test_pad.expect index db2cdefe75cb..293877ab834a 100644 --- a/test/onnx/expect/TestOperators.test_pad.expect +++ b/test/onnx/expect/TestOperators.test_pad.expect @@ -1,33 +1,192 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "input" - output: "1" - name: "Pad_0" - op_type: "Pad" + input: "onnx::ConstantOfShape_27" + output: "onnx::Concat_10" + name: "ConstantOfShape_0" + op_type: "ConstantOfShape" attribute { - name: "mode" - s: "reflect" - type: STRING + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::Concat_28" + input: "onnx::Concat_10" + output: 
"onnx::Reshape_11" + name: "Concat_1" + op_type: "Concat" attribute { - name: "pads" - ints: 0 - ints: 0 - ints: 0 - ints: 2 - ints: 0 - ints: 0 + name: "axis" + i: 0 + type: INT + } + } + node { + output: "onnx::Reshape_12" + name: "Constant_2" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Reshape_11" + input: "onnx::Reshape_12" + output: "onnx::Slice_13" + name: "Reshape_3" + op_type: "Reshape" + } + node { + output: "onnx::Slice_14" + name: "Constant_4" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_15" + name: "Constant_5" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_16" + name: "Constant_6" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\200" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_17" + name: "Constant_7" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR + } + } + node { + input: "onnx::Slice_13" + input: "onnx::Slice_15" + input: "onnx::Slice_16" + input: "onnx::Slice_14" + input: "onnx::Slice_17" + output: "onnx::Transpose_18" + name: "Slice_8" + op_type: "Slice" + } + node { + input: "onnx::Transpose_18" + output: "onnx::Reshape_19" + name: "Transpose_9" + op_type: "Transpose" + attribute { + name: "perm" ints: 1 - ints: 3 + ints: 0 type: INTS } } - name: "torch-jit-export" + node { + output: "onnx::Reshape_20" + name: "Constant_10" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR + } + } + node { + input: "onnx::Reshape_19" + input: "onnx::Reshape_20" + output: "onnx::Cast_21" + name: "Reshape_11" + op_type: "Reshape" + } + node { + input: "onnx::Cast_21" + output: "onnx::Pad_22" + name: "Cast_12" + op_type: "Cast" + attribute { + name: "to" + i: 7 + type: INT + } + } + node { + input: "onnx::Pad_0" + input: "onnx::Pad_22" + output: "23" + name: "Pad_13" + op_type: "Pad" + attribute { + name: "mode" + s: "reflect" + type: STRING + } + } + name: "torch_jit" + initializer { + dims: 1 + data_type: 7 + name: "onnx::ConstantOfShape_27" + raw_data: "\004\000\000\000\000\000\000\000" + } + initializer { + dims: 4 + data_type: 7 + name: "onnx::Concat_28" + raw_data: "\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" + } input { - name: "input" + name: "onnx::Pad_0" type { tensor_type { elem_type: 1 @@ -49,22 +208,22 @@ graph { } } output { - name: "1" + name: "23" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Pad23_dim_0" } dim { - dim_value: 1 + dim_param: "Pad23_dim_1" } dim { - dim_value: 3 + dim_param: "Pad23_dim_2" } dim { - dim_value: 9 + dim_param: "Pad23_dim_3" } } } @@ -72,5 +231,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_params.expect b/test/onnx/expect/TestOperators.test_params.expect index e12dc3843b25..67064d8087ae 100644 --- 
a/test/onnx/expect/TestOperators.test_params.expect +++ b/test/onnx/expect/TestOperators.test_params.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,7 +34,7 @@ graph { name: "Neg_4" op_type: "Neg" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 dims: 2 @@ -92,5 +92,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect b/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect index 3ee76556d819..8dbc34a20640 100644 --- a/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect +++ b/test/onnx/expect/TestOperators.test_params_onnx_irv4.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,7 +34,7 @@ graph { name: "Neg_4" op_type: "Neg" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 dims: 2 @@ -76,5 +76,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_permute2.expect b/test/onnx/expect/TestOperators.test_permute2.expect index f98608d97962..7f7b6afd9d2d 100644 --- a/test/onnx/expect/TestOperators.test_permute2.expect +++ b/test/onnx/expect/TestOperators.test_permute2.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INTS } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Transpose_0" type { @@ -77,5 +77,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_pixel_shuffle.expect b/test/onnx/expect/TestOperators.test_pixel_shuffle.expect index 1fe12cb04a1a..c5b5a8008d51 100644 --- a/test/onnx/expect/TestOperators.test_pixel_shuffle.expect +++ b/test/onnx/expect/TestOperators.test_pixel_shuffle.expect @@ -18,7 +18,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::DepthToSpace_0" type { diff --git a/test/onnx/expect/TestOperators.test_pow.expect b/test/onnx/expect/TestOperators.test_pow.expect index 5bdefab8bbab..f20fd9555090 100644 --- a/test/onnx/expect/TestOperators.test_pow.expect +++ b/test/onnx/expect/TestOperators.test_pow.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -9,7 +9,7 @@ graph { name: "Pow_0" op_type: "Pow" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Pow_0" type { @@ -78,5 +78,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_prelu.expect b/test/onnx/expect/TestOperators.test_prelu.expect index 36e318f858f0..f2bcb50ef777 100644 --- a/test/onnx/expect/TestOperators.test_prelu.expect +++ b/test/onnx/expect/TestOperators.test_prelu.expect @@ -1,21 +1,21 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::PRelu_0" - input: "onnx::PRelu_4" - output: "3" + input: "onnx::PRelu_5" + output: "4" name: "PRelu_0" op_type: "PRelu" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 dims: 1 dims: 1 data_type: 1 - name: "onnx::PRelu_4" + name: "onnx::PRelu_5" raw_data: "\000\000\200>\000\000\200>" } input { @@ -41,7 +41,7 @@ graph { } } input { - name: "onnx::PRelu_4" + name: "onnx::PRelu_5" type { tensor_type { elem_type: 1 @@ -60,7 +60,7 @@ graph { } } output { - name: "3" + 
name: "4" type { tensor_type { elem_type: 1 @@ -83,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_prod.expect b/test/onnx/expect/TestOperators.test_prod.expect index 5c4960f49285..0cfeafa4da32 100644 --- a/test/onnx/expect/TestOperators.test_prod.expect +++ b/test/onnx/expect/TestOperators.test_prod.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceProd_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_prod_dtype.expect b/test/onnx/expect/TestOperators.test_prod_dtype.expect index ec46842ed037..26a63ac840ad 100644 --- a/test/onnx/expect/TestOperators.test_prod_dtype.expect +++ b/test/onnx/expect/TestOperators.test_prod_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -24,7 +24,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_rand.expect b/test/onnx/expect/TestOperators.test_rand.expect index 76f59f55f556..b4d2dbd6cb19 100644 --- a/test/onnx/expect/TestOperators.test_rand.expect +++ b/test/onnx/expect/TestOperators.test_rand.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -69,5 +69,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_randn.expect b/test/onnx/expect/TestOperators.test_randn.expect index 919e8252474c..bc2d0b23dd7b 100644 --- a/test/onnx/expect/TestOperators.test_randn.expect +++ b/test/onnx/expect/TestOperators.test_randn.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Add_1" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -69,5 +69,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect b/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect index c3db24de6651..7e5fefad2eb7 100644 --- a/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect +++ b/test/onnx/expect/TestOperators.test_reduce_sum_negative_indices.expect @@ -1,24 +1,34 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::ReduceSum_0" - output: "1" - name: "ReduceSum_0" - op_type: "ReduceSum" + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: -1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\377\377\377\377\377\377\377\377" + } + type: TENSOR } + } + node { + input: "onnx::ReduceSum_0" + input: "onnx::ReduceSum_1" + output: "2" + name: "ReduceSum_1" + op_type: "ReduceSum" attribute { name: "keepdims" i: 0 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -36,7 +46,7 @@ 
graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -50,5 +60,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean.expect b/test/onnx/expect/TestOperators.test_reduced_mean.expect index e06b21babdd5..ce69ab65a6a6 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect b/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect index bef20c43d0ba..71d9d296aecd 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -29,7 +29,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -73,5 +73,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect index d421af653a72..98bb26aaea36 100644 --- a/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_mean_keepdim.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -19,7 +19,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -66,5 +66,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod.expect b/test/onnx/expect/TestOperators.test_reduced_prod.expect index d43679dce55b..cdfbc0f5fbb6 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceProd_0" type { @@ -62,5 +62,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect b/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect index 6f10f754eaf2..641d21cb9c79 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect +++ b/test/onnx/expect/TestOperators.test_reduced_prod_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -29,7 +29,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -73,5 +73,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect index 01405e24d99a..62befc2cf1cf 100644 --- a/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect +++ 
b/test/onnx/expect/TestOperators.test_reduced_prod_keepdim.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -18,7 +18,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceProd_0" type { @@ -65,5 +65,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum.expect b/test/onnx/expect/TestOperators.test_reduced_sum.expect index 1d43496bf517..e03a204a3f99 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum.expect @@ -1,25 +1,34 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::ReduceSum_0" - output: "1" - name: "ReduceSum_0" - op_type: "ReduceSum" + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 1 - ints: 2 - type: INTS + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::ReduceSum_0" + input: "onnx::ReduceSum_1" + output: "2" + name: "ReduceSum_1" + op_type: "ReduceSum" attribute { name: "keepdims" i: 0 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -43,7 +52,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -60,5 +69,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect b/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect index d8a839d4c747..e8ffa49295a5 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum_dtype.expect @@ -1,11 +1,25 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Cast_0" output: "onnx::ReduceSum_1" - name: "Cast_0" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + input: "onnx::Cast_0" + output: "onnx::ReduceSum_2" + name: "Cast_1" op_type: "Cast" attribute { name: "to" @@ -14,22 +28,18 @@ graph { } } node { + input: "onnx::ReduceSum_2" input: "onnx::ReduceSum_1" - output: "2" - name: "ReduceSum_1" + output: "3" + name: "ReduceSum_2" op_type: "ReduceSum" - attribute { - name: "axes" - ints: 0 - type: INTS - } attribute { name: "keepdims" i: 0 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -53,7 +63,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 11 @@ -73,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect index e6711d19a7e8..7d05fdc26041 100644 --- a/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect +++ b/test/onnx/expect/TestOperators.test_reduced_sum_keepdim.expect @@ -1,24 +1,34 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::ReduceSum_0" - output: "1" - name: "ReduceSum_0" - op_type: "ReduceSum" + output: "onnx::ReduceSum_1" + name: "Constant_0" + op_type: "Constant" 
attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + input: "onnx::ReduceSum_0" + input: "onnx::ReduceSum_1" + output: "2" + name: "ReduceSum_1" + op_type: "ReduceSum" attribute { name: "keepdims" i: 1 type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -42,7 +52,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -65,5 +75,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reducemax.expect b/test/onnx/expect/TestOperators.test_reducemax.expect index c5df8d7ddc34..bbd770761f3a 100644 --- a/test/onnx/expect/TestOperators.test_reducemax.expect +++ b/test/onnx/expect/TestOperators.test_reducemax.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMax_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_reducemin.expect b/test/onnx/expect/TestOperators.test_reducemin.expect index 5beb9eef96a7..a555fac90f0a 100644 --- a/test/onnx/expect/TestOperators.test_reducemin.expect +++ b/test/onnx/expect/TestOperators.test_reducemin.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMin_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_remainder.expect b/test/onnx/expect/TestOperators.test_remainder.expect index aa75e08f4ba2..ecf44141260e 100644 --- a/test/onnx/expect/TestOperators.test_remainder.expect +++ b/test/onnx/expect/TestOperators.test_remainder.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -29,7 +29,7 @@ graph { name: "Sub_3" op_type: "Sub" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Div_0" type { @@ -89,5 +89,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_repeat.expect b/test/onnx/expect/TestOperators.test_repeat.expect index 01d061c19c63..5206bce0d88f 100644 --- a/test/onnx/expect/TestOperators.test_repeat.expect +++ b/test/onnx/expect/TestOperators.test_repeat.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -45,7 +45,7 @@ graph { name: "Tile_3" op_type: "Tile" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 1 data_type: 7 @@ -98,5 +98,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index e29b825e2a18..2dbb3a436d42 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -45,7 +45,7 @@ graph { name: "Tile_3" op_type: "Tile" } - name: "torch-jit-export" + name: "torch_jit" initializer { 
dims: 1 data_type: 7 @@ -92,5 +92,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_round.expect b/test/onnx/expect/TestOperators.test_round.expect index 069fb7efc7f7..07809e1abdb8 100644 --- a/test/onnx/expect/TestOperators.test_round.expect +++ b/test/onnx/expect/TestOperators.test_round.expect @@ -8,7 +8,7 @@ graph { name: "Round_0" op_type: "Round" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Round_0" type { diff --git a/test/onnx/expect/TestOperators.test_rrelu.expect b/test/onnx/expect/TestOperators.test_rrelu.expect index ed5f0c5b865c..3fb75ab0bb4a 100644 --- a/test/onnx/expect/TestOperators.test_rrelu.expect +++ b/test/onnx/expect/TestOperators.test_rrelu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -25,7 +25,7 @@ graph { name: "PRelu_1" op_type: "PRelu" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -72,5 +72,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_rsqrt.expect b/test/onnx/expect/TestOperators.test_rsqrt.expect index 45c1468d5331..32e4df543ae9 100644 --- a/test/onnx/expect/TestOperators.test_rsqrt.expect +++ b/test/onnx/expect/TestOperators.test_rsqrt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -28,7 +28,7 @@ graph { name: "Div_2" op_type: "Div" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sqrt_0" type { @@ -63,5 +63,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_rsub.expect b/test/onnx/expect/TestOperators.test_rsub.expect index 21a031c72ded..75344bfc68de 100644 --- a/test/onnx/expect/TestOperators.test_rsub.expect +++ b/test/onnx/expect/TestOperators.test_rsub.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -22,7 +22,7 @@ graph { name: "Sub_1" op_type: "Sub" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sub_0" type { @@ -57,5 +57,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_scatter_add.expect b/test/onnx/expect/TestOperators.test_scatter_add.expect index 19302ffcb396..fd7514e30630 100644 --- a/test/onnx/expect/TestOperators.test_scatter_add.expect +++ b/test/onnx/expect/TestOperators.test_scatter_add.expect @@ -1,9 +1,9 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::Scatter_3" + output: "onnx::ScatterElements_3" name: "Constant_0" op_type: "Constant" attribute { @@ -18,12 +18,12 @@ graph { } } node { - input: "onnx::Scatter_3" - input: "onnx::Scatter_1" - input: "onnx::Scatter_2" + input: "onnx::ScatterElements_3" + input: "onnx::ScatterElements_1" + input: "onnx::ScatterElements_2" output: "onnx::Add_4" - name: "Scatter_1" - op_type: "Scatter" + name: "ScatterElements_1" + op_type: "ScatterElements" attribute { name: "axis" i: 1 @@ -37,7 +37,7 @@ graph { name: "Add_2" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { @@ -55,7 +55,7 @@ graph { } } input { - name: "onnx::Scatter_1" + name: "onnx::ScatterElements_1" type { tensor_type { elem_type: 7 @@ -71,7 +71,7 @@ graph { } } input { - name: "onnx::Scatter_2" + name: "onnx::ScatterElements_2" type { 
tensor_type { elem_type: 1 @@ -104,5 +104,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect b/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect index 1695e9db8120..bc4fabc15ddb 100644 --- a/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect +++ b/test/onnx/expect/TestOperators.test_scatter_add_opset11.expect @@ -37,7 +37,7 @@ graph { name: "Add_2" op_type: "Add" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Add_0" type { diff --git a/test/onnx/expect/TestOperators.test_selu.expect b/test/onnx/expect/TestOperators.test_selu.expect index bbfedf15051c..7cdc4dc8bac4 100644 --- a/test/onnx/expect/TestOperators.test_selu.expect +++ b/test/onnx/expect/TestOperators.test_selu.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Selu_0" op_type: "Selu" } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_shape_value_map.expect b/test/onnx/expect/TestOperators.test_shape_value_map.expect index 0962d8b8cc83..174551f9a7c5 100644 --- a/test/onnx/expect/TestOperators.test_shape_value_map.expect +++ b/test/onnx/expect/TestOperators.test_shape_value_map.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -34,23 +34,33 @@ graph { } } node { - input: "onnx::Unsqueeze_3" - output: "onnx::Concat_7" - name: "Unsqueeze_3" - op_type: "Unsqueeze" + output: "onnx::Unsqueeze_7" + name: "Constant_3" + op_type: "Constant" attribute { - name: "axes" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Concat_7" - input: "onnx::Concat_21" - input: "onnx::Concat_22" - input: "onnx::Concat_23" - output: "onnx::Reshape_11" - name: "Concat_4" + input: "onnx::Unsqueeze_3" + input: "onnx::Unsqueeze_7" + output: "onnx::Concat_8" + name: "Unsqueeze_4" + op_type: "Unsqueeze" + } + node { + input: "onnx::Concat_8" + input: "onnx::Concat_26" + input: "onnx::Concat_27" + input: "onnx::Concat_28" + output: "onnx::Reshape_15" + name: "Concat_5" op_type: "Concat" attribute { name: "axis" @@ -60,66 +70,62 @@ graph { } node { input: "x" - input: "onnx::Reshape_11" - output: "onnx::Transpose_12" - name: "Reshape_5" + input: "onnx::Reshape_15" + output: "onnx::Transpose_16" + name: "Reshape_6" op_type: "Reshape" } node { - input: "onnx::Transpose_12" - output: "onnx::Softmax_13" - name: "Transpose_6" + input: "onnx::Transpose_16" + output: "x.1" + name: "Transpose_7" op_type: "Transpose" attribute { name: "perm" ints: 0 - ints: 3 - ints: 1 ints: 2 + ints: 1 + ints: 3 type: INTS } } node { - input: "onnx::Softmax_13" - output: "onnx::Transpose_14" - name: "Softmax_7" + input: "x.1" + output: "onnx::Reshape_18" + name: "Softmax_8" op_type: "Softmax" attribute { name: "axis" - i: 3 + i: 1 type: INT } } node { - input: "onnx::Transpose_14" - output: "onnx::Reshape_15" - name: "Transpose_8" - op_type: "Transpose" + output: "onnx::Unsqueeze_20" + name: "Constant_9" + op_type: "Constant" attribute { - name: "perm" - ints: 0 - ints: 3 - ints: 2 - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } } node { 
input: "onnx::Unsqueeze_3" - output: "onnx::Concat_17" - name: "Unsqueeze_9" + input: "onnx::Unsqueeze_20" + output: "onnx::Concat_21" + name: "Unsqueeze_10" op_type: "Unsqueeze" - attribute { - name: "axes" - ints: 0 - type: INTS - } } node { - input: "onnx::Concat_17" - input: "onnx::Concat_24" - output: "onnx::Reshape_19" - name: "Concat_10" + input: "onnx::Concat_21" + input: "onnx::Concat_29" + output: "onnx::Reshape_24" + name: "Concat_11" op_type: "Concat" attribute { name: "axis" @@ -128,35 +134,35 @@ graph { } } node { - input: "onnx::Reshape_15" - input: "onnx::Reshape_19" - output: "20" - name: "Reshape_11" + input: "onnx::Reshape_18" + input: "onnx::Reshape_24" + output: "25" + name: "Reshape_12" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 1 data_type: 7 - name: "onnx::Concat_21" + name: "onnx::Concat_26" raw_data: "\001\000\000\000\000\000\000\000" } initializer { dims: 1 data_type: 7 - name: "onnx::Concat_22" + name: "onnx::Concat_27" raw_data: "\002\000\000\000\000\000\000\000" } initializer { dims: 1 data_type: 7 - name: "onnx::Concat_23" + name: "onnx::Concat_28" raw_data: "\377\377\377\377\377\377\377\377" } initializer { dims: 1 data_type: 7 - name: "onnx::Concat_24" + name: "onnx::Concat_29" raw_data: "\377\377\377\377\377\377\377\377" } input { @@ -182,7 +188,7 @@ graph { } } output { - name: "20" + name: "25" type { tensor_type { elem_type: 1 @@ -199,5 +205,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sign.expect b/test/onnx/expect/TestOperators.test_sign.expect index baeb2b1505eb..6cb9200dc073 100644 --- a/test/onnx/expect/TestOperators.test_sign.expect +++ b/test/onnx/expect/TestOperators.test_sign.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Sign_0" op_type: "Sign" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sign_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sin.expect b/test/onnx/expect/TestOperators.test_sin.expect index 4852ac6060f7..4ca6284c48d9 100644 --- a/test/onnx/expect/TestOperators.test_sin.expect +++ b/test/onnx/expect/TestOperators.test_sin.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Sin_0" op_type: "Sin" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sin_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_slice.expect b/test/onnx/expect/TestOperators.test_slice.expect index 1524b308a951..15aa37bc2f7e 100644 --- a/test/onnx/expect/TestOperators.test_slice.expect +++ b/test/onnx/expect/TestOperators.test_slice.expect @@ -1,29 +1,74 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Slice_0" - output: "1" - name: "Slice_0" - op_type: "Slice" + output: "onnx::Slice_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_2" + name: "Constant_1" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: 
"\001\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_3" + name: "Constant_2" + op_type: "Constant" attribute { - name: "ends" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_4" + name: "Constant_3" + op_type: "Constant" attribute { - name: "starts" - ints: 1 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Slice_0" + input: "onnx::Slice_2" + input: "onnx::Slice_3" + input: "onnx::Slice_1" + input: "onnx::Slice_4" + output: "5" + name: "Slice_4" + op_type: "Slice" + } + name: "torch_jit" input { name: "onnx::Slice_0" type { @@ -41,7 +86,7 @@ graph { } } output { - name: "1" + name: "5" type { tensor_type { elem_type: 1 @@ -58,5 +103,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_slice_dynamic.expect b/test/onnx/expect/TestOperators.test_slice_dynamic.expect index f954b3a11128..5a47c596d3f5 100644 --- a/test/onnx/expect/TestOperators.test_slice_dynamic.expect +++ b/test/onnx/expect/TestOperators.test_slice_dynamic.expect @@ -93,7 +93,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Slice_0" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect index 6282b2b4016d..a7d7237e2212 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect index 6d35c3f3bc30..99870e60c6b7 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect index 3eba0f943ac1..bad2ffc222be 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect index 223f8d3d3219..198f2b568912 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect index 21c0a3540cda..4b861c407122 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect +++ 
b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_ignore_index.expect @@ -19,7 +19,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect index ede001c6b8ce..830de3396953 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect @@ -20,7 +20,7 @@ graph { type: STRING } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 5 data_type: 1 diff --git a/test/onnx/expect/TestOperators.test_split.expect b/test/onnx/expect/TestOperators.test_split.expect index 5566c1bfa3bb..e1616e4a52cd 100644 --- a/test/onnx/expect/TestOperators.test_split.expect +++ b/test/onnx/expect/TestOperators.test_split.expect @@ -1,28 +1,36 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::Split_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 3 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "tensor" - output: "1" + input: "onnx::Split_1" output: "2" output: "3" - name: "Split_0" + output: "4" + name: "Split_1" op_type: "Split" attribute { name: "axis" i: 1 type: INT } - attribute { - name: "split" - ints: 2 - ints: 2 - ints: 2 - type: INTS - } } - name: "torch-jit-export" + name: "torch_jit" input { name: "tensor" type { @@ -40,7 +48,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -56,7 +64,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 1 @@ -72,7 +80,7 @@ graph { } } output { - name: "3" + name: "4" type { tensor_type { elem_type: 1 @@ -89,5 +97,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_split_with_sizes.expect b/test/onnx/expect/TestOperators.test_split_with_sizes.expect index addd7dba3425..964ba363a56e 100644 --- a/test/onnx/expect/TestOperators.test_split_with_sizes.expect +++ b/test/onnx/expect/TestOperators.test_split_with_sizes.expect @@ -1,28 +1,36 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { + node { + output: "onnx::Split_1" + name: "Constant_0" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 3 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000" + } + type: TENSOR + } + } node { input: "tensor" - output: "1" + input: "onnx::Split_1" output: "2" output: "3" - name: "Split_0" + output: "4" + name: "Split_1" op_type: "Split" attribute { name: "axis" i: 1 type: INT } - attribute { - name: "split" - ints: 2 - ints: 1 - ints: 3 - type: INTS - } } - name: "torch-jit-export" + name: "torch_jit" input { name: "tensor" type { @@ -40,7 +48,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -56,7 +64,7 @@ graph { } } output { - name: "2" + name: "3" type { tensor_type { elem_type: 1 @@ -72,7 +80,7 @@ graph { } } output { - name: "3" + name: "4" type { tensor_type { elem_type: 1 @@ -89,5 +97,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sqrt.expect 
b/test/onnx/expect/TestOperators.test_sqrt.expect index d46c5b7272c6..91fc7bac0b77 100644 --- a/test/onnx/expect/TestOperators.test_sqrt.expect +++ b/test/onnx/expect/TestOperators.test_sqrt.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Sqrt_0" op_type: "Sqrt" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Sqrt_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_std.expect b/test/onnx/expect/TestOperators.test_std.expect index adf8398352a2..69df37b90452 100644 --- a/test/onnx/expect/TestOperators.test_std.expect +++ b/test/onnx/expect/TestOperators.test_std.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -144,7 +144,7 @@ graph { name: "Sqrt_13" op_type: "Sqrt" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceMean_0" type { @@ -185,5 +185,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sum.expect b/test/onnx/expect/TestOperators.test_sum.expect index 75195a79a7a5..6722064ace20 100644 --- a/test/onnx/expect/TestOperators.test_sum.expect +++ b/test/onnx/expect/TestOperators.test_sum.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -13,7 +13,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::ReduceSum_0" type { @@ -48,5 +48,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_sum_dtype.expect b/test/onnx/expect/TestOperators.test_sum_dtype.expect index 3e149b422bf9..2b5f417b0eee 100644 --- a/test/onnx/expect/TestOperators.test_sum_dtype.expect +++ b/test/onnx/expect/TestOperators.test_sum_dtype.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -24,7 +24,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Cast_0" type { @@ -59,5 +59,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_tan.expect b/test/onnx/expect/TestOperators.test_tan.expect index e6f6e855f8e3..84bc3e9420df 100644 --- a/test/onnx/expect/TestOperators.test_tan.expect +++ b/test/onnx/expect/TestOperators.test_tan.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Tan_0" op_type: "Tan" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Tan_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_topk.expect b/test/onnx/expect/TestOperators.test_topk.expect index 0310fe86367a..25c206668f87 100644 --- a/test/onnx/expect/TestOperators.test_topk.expect +++ b/test/onnx/expect/TestOperators.test_topk.expect @@ -36,7 +36,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::TopK_0" type { diff --git a/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect b/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect index b76aaf172a6a..f94c62abcbed 100644 --- a/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect +++ 
b/test/onnx/expect/TestOperators.test_topk_smallest_unsorted.expect @@ -46,7 +46,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::TopK_0" type { diff --git a/test/onnx/expect/TestOperators.test_transpose.expect b/test/onnx/expect/TestOperators.test_transpose.expect index 1a30352aeb65..f1350a1b2623 100644 --- a/test/onnx/expect/TestOperators.test_transpose.expect +++ b/test/onnx/expect/TestOperators.test_transpose.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Identity_0" op_type: "Identity" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Identity_0" type { @@ -43,5 +43,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_type_as.expect b/test/onnx/expect/TestOperators.test_type_as.expect index c9480c168ddf..31803483edbd 100644 --- a/test/onnx/expect/TestOperators.test_type_as.expect +++ b/test/onnx/expect/TestOperators.test_type_as.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -8,7 +8,7 @@ graph { name: "Identity_0" op_type: "Identity" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Identity_0" type { @@ -37,5 +37,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_unfold.expect b/test/onnx/expect/TestOperators.test_unfold.expect index 164c5caeecc4..9b5e20281d20 100644 --- a/test/onnx/expect/TestOperators.test_unfold.expect +++ b/test/onnx/expect/TestOperators.test_unfold.expect @@ -1,76 +1,156 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Slice_0" - output: "onnx::Unsqueeze_1" - name: "Slice_0" - op_type: "Slice" + output: "onnx::Slice_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_2" + name: "Constant_1" + op_type: "Constant" attribute { - name: "ends" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_3" + name: "Constant_2" + op_type: "Constant" attribute { - name: "starts" - ints: 0 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } node { input: "onnx::Slice_0" - output: "onnx::Unsqueeze_2" - name: "Slice_1" + input: "onnx::Slice_2" + input: "onnx::Slice_3" + input: "onnx::Slice_1" + output: "onnx::Unsqueeze_4" + name: "Slice_3" op_type: "Slice" + } + node { + output: "onnx::Slice_5" + name: "Constant_4" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_6" + name: "Constant_5" + op_type: "Constant" attribute { - name: "ends" - ints: 4 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } + } + node { + output: "onnx::Slice_7" + name: "Constant_6" + op_type: "Constant" attribute { - name: "starts" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: 
"\004\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Unsqueeze_1" - output: "onnx::Concat_3" - name: "Unsqueeze_2" - op_type: "Unsqueeze" + input: "onnx::Slice_0" + input: "onnx::Slice_6" + input: "onnx::Slice_7" + input: "onnx::Slice_5" + output: "onnx::Unsqueeze_8" + name: "Slice_7" + op_type: "Slice" + } + node { + output: "onnx::Unsqueeze_9" + name: "Constant_8" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Unsqueeze_2" - output: "onnx::Concat_4" - name: "Unsqueeze_3" + input: "onnx::Unsqueeze_4" + input: "onnx::Unsqueeze_9" + output: "onnx::Concat_10" + name: "Unsqueeze_9" op_type: "Unsqueeze" + } + node { + output: "onnx::Unsqueeze_11" + name: "Constant_10" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } node { - input: "onnx::Concat_3" - input: "onnx::Concat_4" - output: "5" - name: "Concat_4" + input: "onnx::Unsqueeze_8" + input: "onnx::Unsqueeze_11" + output: "onnx::Concat_12" + name: "Unsqueeze_11" + op_type: "Unsqueeze" + } + node { + input: "onnx::Concat_10" + input: "onnx::Concat_12" + output: "13" + name: "Concat_12" op_type: "Concat" attribute { name: "axis" @@ -78,7 +158,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Slice_0" type { @@ -99,7 +179,7 @@ graph { } } output { - name: "5" + name: "13" type { tensor_type { elem_type: 1 @@ -122,5 +202,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_unique.expect b/test/onnx/expect/TestOperators.test_unique.expect index 1ceb90528b28..55e6e2909a3f 100644 --- a/test/onnx/expect/TestOperators.test_unique.expect +++ b/test/onnx/expect/TestOperators.test_unique.expect @@ -21,7 +21,7 @@ graph { type: INT } } - name: "torch-jit-export" + name: "torch_jit" input { name: "input" type { diff --git a/test/onnx/expect/TestOperators.test_unsqueeze.expect b/test/onnx/expect/TestOperators.test_unsqueeze.expect index 32cfe697e3dd..49a61c2b8451 100644 --- a/test/onnx/expect/TestOperators.test_unsqueeze.expect +++ b/test/onnx/expect/TestOperators.test_unsqueeze.expect @@ -1,19 +1,29 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "onnx::Unsqueeze_0" - output: "1" - name: "Unsqueeze_0" - op_type: "Unsqueeze" + output: "onnx::Unsqueeze_1" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axes" - ints: 2 - type: INTS + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" + } + type: TENSOR } } - name: "torch-jit-export" + node { + input: "onnx::Unsqueeze_0" + input: "onnx::Unsqueeze_1" + output: "2" + name: "Unsqueeze_1" + op_type: "Unsqueeze" + } + name: "torch_jit" input { name: "onnx::Unsqueeze_0" type { @@ -31,7 +41,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 @@ -51,5 +61,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect index 198d2367ad08..e1f31dc406a0 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect +++ 
b/test/onnx/expect/TestOperators.test_upsample_nearest_scale.expect @@ -1,24 +1,40 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "x" - input: "onnx::Upsample_5" - output: "4" - name: "Upsample_0" - op_type: "Upsample" + input: "" + input: "onnx::Resize_6" + output: "5" + name: "Resize_0" + op_type: "Resize" + attribute { + name: "coordinate_transformation_mode" + s: "asymmetric" + type: STRING + } + attribute { + name: "cubic_coeff_a" + f: -0.75 + type: FLOAT + } attribute { name: "mode" s: "nearest" type: STRING } + attribute { + name: "nearest_mode" + s: "floor" + type: STRING + } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 data_type: 1 - name: "onnx::Upsample_5" + name: "onnx::Resize_6" raw_data: "\000\000\200?\000\000\200?\000\000\000@\000\000\000@" } input { @@ -44,22 +60,22 @@ graph { } } output { - name: "4" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Resize5_dim_0" } dim { - dim_value: 2 + dim_param: "Resize5_dim_1" } dim { - dim_value: 6 + dim_param: "Resize5_dim_2" } dim { - dim_value: 8 + dim_param: "Resize5_dim_3" } } } @@ -67,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect index 198d2367ad08..e1f31dc406a0 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_scale_default_scale_factor.expect @@ -1,24 +1,40 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "x" - input: "onnx::Upsample_5" - output: "4" - name: "Upsample_0" - op_type: "Upsample" + input: "" + input: "onnx::Resize_6" + output: "5" + name: "Resize_0" + op_type: "Resize" + attribute { + name: "coordinate_transformation_mode" + s: "asymmetric" + type: STRING + } + attribute { + name: "cubic_coeff_a" + f: -0.75 + type: FLOAT + } attribute { name: "mode" s: "nearest" type: STRING } + attribute { + name: "nearest_mode" + s: "floor" + type: STRING + } } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 4 data_type: 1 - name: "onnx::Upsample_5" + name: "onnx::Resize_6" raw_data: "\000\000\200?\000\000\200?\000\000\000@\000\000\000@" } input { @@ -44,22 +60,22 @@ graph { } } output { - name: "4" + name: "5" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Resize5_dim_0" } dim { - dim_value: 2 + dim_param: "Resize5_dim_1" } dim { - dim_value: 6 + dim_param: "Resize5_dim_2" } dim { - dim_value: 8 + dim_param: "Resize5_dim_3" } } } @@ -67,5 +83,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect b/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect index dc30ada92252..cbd32608d2ae 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest_size.expect @@ -1,34 +1,112 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - output: "onnx::Upsample_1" - name: "Constant_0" + input: "x" + output: "onnx::Slice_2" + name: "Shape_0" + op_type: "Shape" + } + node { + output: "onnx::Slice_3" + name: "Constant_1" + op_type: "Constant" + attribute { + 
name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_4" + name: "Constant_2" + op_type: "Constant" + attribute { + name: "value" + t { + dims: 1 + data_type: 7 + raw_data: "\000\000\000\000\000\000\000\000" + } + type: TENSOR + } + } + node { + output: "onnx::Slice_5" + name: "Constant_3" op_type: "Constant" attribute { name: "value" t { - dims: 4 - data_type: 1 - raw_data: "\000\000\200?\000\000\200?\253\252\252@\000\000\200@" + dims: 1 + data_type: 7 + raw_data: "\002\000\000\000\000\000\000\000" } type: TENSOR } } + node { + input: "onnx::Slice_2" + input: "onnx::Slice_4" + input: "onnx::Slice_5" + input: "onnx::Slice_3" + output: "onnx::Concat_6" + name: "Slice_4" + op_type: "Slice" + } + node { + input: "onnx::Concat_6" + input: "onnx::Concat_12" + output: "onnx::Resize_8" + name: "Concat_5" + op_type: "Concat" + attribute { + name: "axis" + i: 0 + type: INT + } + } node { input: "x" - input: "onnx::Upsample_1" - output: "2" - name: "Upsample_1" - op_type: "Upsample" + input: "" + input: "" + input: "onnx::Resize_8" + output: "11" + name: "Resize_6" + op_type: "Resize" + attribute { + name: "coordinate_transformation_mode" + s: "asymmetric" + type: STRING + } + attribute { + name: "cubic_coeff_a" + f: -0.75 + type: FLOAT + } attribute { name: "mode" s: "nearest" type: STRING } + attribute { + name: "nearest_mode" + s: "floor" + type: STRING + } + } + name: "torch_jit" + initializer { + dims: 2 + data_type: 7 + name: "onnx::Concat_12" + raw_data: "\020\000\000\000\000\000\000\000\020\000\000\000\000\000\000\000" } - name: "torch-jit-export" input { name: "x" type { @@ -52,22 +130,22 @@ graph { } } output { - name: "2" + name: "11" type { tensor_type { elem_type: 1 shape { dim { - dim_value: 1 + dim_param: "Resize11_dim_0" } dim { - dim_value: 2 + dim_param: "Resize11_dim_1" } dim { - dim_value: 16 + dim_param: "Resize11_dim_2" } dim { - dim_value: 16 + dim_param: "Resize11_dim_3" } } } @@ -75,5 +153,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_view.expect b/test/onnx/expect/TestOperators.test_view.expect index c7a1eb6adbd6..097625822969 100644 --- a/test/onnx/expect/TestOperators.test_view.expect +++ b/test/onnx/expect/TestOperators.test_view.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -23,7 +23,7 @@ graph { name: "Reshape_1" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" input { name: "onnx::Reshape_0" type { @@ -55,5 +55,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_view_flatten.expect b/test/onnx/expect/TestOperators.test_view_flatten.expect index 2465f1d0032d..ac814160d5bd 100644 --- a/test/onnx/expect/TestOperators.test_view_flatten.expect +++ b/test/onnx/expect/TestOperators.test_view_flatten.expect @@ -1,19 +1,19 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { input: "onnx::Reshape_0" - input: "onnx::Reshape_9" - output: "6" + input: "onnx::Reshape_11" + output: "8" name: "Reshape_0" op_type: "Reshape" } - name: "torch-jit-export" + name: "torch_jit" initializer { dims: 2 data_type: 7 - name: "onnx::Reshape_9" + name: "onnx::Reshape_11" raw_data: "\001\000\000\000\000\000\000\000\030\000\000\000\000\000\000\000" } input { @@ -39,7 +39,7 @@ graph { } } output { - name: "6" + name: "8" type { 
tensor_type { elem_type: 1 @@ -56,5 +56,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/expect/TestOperators.test_zeros_like.expect b/test/onnx/expect/TestOperators.test_zeros_like.expect index 27ac22983187..e4f6c6ede2ca 100644 --- a/test/onnx/expect/TestOperators.test_zeros_like.expect +++ b/test/onnx/expect/TestOperators.test_zeros_like.expect @@ -1,4 +1,4 @@ -ir_version: 4 +ir_version: 7 producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { @@ -17,7 +17,7 @@ graph { type: TENSOR } } - name: "torch-jit-export" + name: "torch_jit" output { name: "1" type { @@ -36,5 +36,5 @@ graph { } } opset_import { - version: 9 + version: 13 } diff --git a/test/onnx/export_onnx_tests_filter.py b/test/onnx/export_onnx_tests_filter.py index 0cb42cd439d6..9b781fa53c8f 100644 --- a/test/onnx/export_onnx_tests_filter.py +++ b/test/onnx/export_onnx_tests_filter.py @@ -1,25 +1,30 @@ import argparse import glob -import onnx.backend.test import os import shutil -from test_caffe2_common import run_generated_test +import traceback + import google.protobuf.text_format +import onnx.backend.test import test_onnx_common -import traceback +from test_caffe2_common import run_generated_test from torch.testing._internal.common_device_type import get_all_device_types -_fail_test_dir = os.path.join(os.path.dirname( - os.path.realpath(__file__)), "fail", "generated") +_fail_test_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "fail", "generated" +) -_expect_dir = os.path.join(os.path.dirname( - os.path.realpath(__file__)), "expect") +_expect_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "expect") -def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, - verbose=False, fail_dir=None, expect=True): +def collect_generated_testcases( + root_dir=test_onnx_common.pytorch_converted_dir, + verbose=False, + fail_dir=None, + expect=True, +): total_pass = 0 total_fail = 0 for d in os.listdir(root_dir): @@ -33,13 +38,16 @@ def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, for device in get_all_device_types(): run_generated_test(model_file, data_dir, device) if expect: - expect_file = os.path.join(_expect_dir, - "PyTorch-generated-{}.expect".format(d)) + expect_file = os.path.join( + _expect_dir, "PyTorch-generated-{}.expect".format(d) + ) with open(expect_file, "w") as text_file: model = onnx.load(model_file) onnx.checker.check_model(model) onnx.helper.strip_doc_string(model) - text_file.write(google.protobuf.text_format.MessageToString(model)) + text_file.write( + google.protobuf.text_format.MessageToString(model) + ) total_pass += 1 except Exception as e: if verbose: @@ -53,17 +61,28 @@ def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, shutil.rmtree(target_dir) shutil.move(dir_name, target_dir) total_fail += 1 - print("Successfully generated/updated {} test cases from PyTorch.".format(total_pass)) + print( + "Successfully generated/updated {} test cases from PyTorch.".format(total_pass) + ) if expect: print("Expected pbtxt files are generated in {}.".format(_expect_dir)) print("Failed {} testcases are moved to {}.".format(total_fail, _fail_test_dir)) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check and filter the failed test cases.") + parser = argparse.ArgumentParser( + description="Check and filter the failed test cases." 
+ ) parser.add_argument("-v", action="store_true", default=False, help="verbose") - parser.add_argument("--delete", action="store_true", default=False, help="delete failed test cases") - parser.add_argument("--no-expect", action="store_true", default=False, help="generate expect txt files") + parser.add_argument( + "--delete", action="store_true", default=False, help="delete failed test cases" + ) + parser.add_argument( + "--no-expect", + action="store_true", + default=False, + help="generate expect txt files", + ) args = parser.parse_args() verbose = args.v delete = args.delete @@ -77,5 +96,9 @@ def collect_generated_testcases(root_dir=test_onnx_common.pytorch_converted_dir, collect_generated_testcases(verbose=verbose, fail_dir=fail_dir, expect=expect) # We already generate the expect files for test_operators.py. - collect_generated_testcases(root_dir=test_onnx_common.pytorch_operator_dir, - verbose=verbose, fail_dir=fail_dir, expect=False) + collect_generated_testcases( + root_dir=test_onnx_common.pytorch_operator_dir, + verbose=verbose, + fail_dir=fail_dir, + expect=False, + ) diff --git a/test/onnx/export_onnx_tests_generator.py b/test/onnx/export_onnx_tests_generator.py index b658a19f6530..2a949af80000 100644 --- a/test/onnx/export_onnx_tests_generator.py +++ b/test/onnx/export_onnx_tests_generator.py @@ -1,17 +1,17 @@ -from torch.autograd import Variable -from onnx import numpy_helper - import io -import onnx import os import shutil -import torch import traceback +import onnx import test_onnx_common -from torch.testing._internal.common_nn import module_tests +from onnx import numpy_helper from test_nn import new_module_tests +import torch +from torch.autograd import Variable +from torch.testing._internal.common_nn import module_tests + # Take a test case (a dict) as input, return the test name. def get_test_name(testcase): @@ -27,7 +27,11 @@ def get_test_name(testcase): # Take a test case (a dict) as input, return the input for the module. 
def gen_input(testcase): if "input_size" in testcase: - if testcase["input_size"] == () and "desc" in testcase and testcase["desc"][-6:] == "scalar": + if ( + testcase["input_size"] == () + and "desc" in testcase + and testcase["desc"][-6:] == "scalar" + ): testcase["input_size"] = (1,) return Variable(torch.randn(*testcase["input_size"])) elif "input_fn" in testcase: @@ -54,11 +58,11 @@ def print_stats(FunctionalModule_nums, nn_module): unsupported = [] not_fully_supported = [] for key, value in nn_module.items(): - if (value == 1): + if value == 1: supported.append(key) - elif (value == 2): + elif value == 2: unsupported.append(key) - elif (value == 3): + elif value == 3: not_fully_supported.append(key) def fun(info, l): @@ -69,12 +73,14 @@ def fun(info, l): # Fully Supported Ops: All related test cases of these ops have been exported # Semi-Supported Ops: Part of related test cases of these ops have been exported # Unsupported Ops: None of related test cases of these ops have been exported - for info, l in [["{} Fully Supported Operators:".format(len(supported)), - supported], - ["{} Semi-Supported Operators:".format(len(not_fully_supported)), - not_fully_supported], - ["{} Unsupported Operators:".format(len(unsupported)), - unsupported]]: + for info, l in [ + ["{} Fully Supported Operators:".format(len(supported)), supported], + [ + "{} Semi-Supported Operators:".format(len(not_fully_supported)), + not_fully_supported, + ], + ["{} Unsupported Operators:".format(len(unsupported)), unsupported], + ]: fun(info, l) @@ -87,16 +93,20 @@ def convert_tests(testcases, sets=1): test_name = get_test_name(t) module = gen_module(t) module_name = str(module).split("(")[0] - if (module_name == "FunctionalModule"): + if module_name == "FunctionalModule": FunctionalModule_nums += 1 else: - if (module_name not in nn_module): + if module_name not in nn_module: nn_module[module_name] = 0 try: input = gen_input(t) f = io.BytesIO() - torch.onnx._export(module, input, f, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + torch.onnx._export( + module, + input, + f, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) onnx_model = onnx.load_from_string(f.getvalue()) onnx.checker.check_model(onnx_model) onnx.helper.strip_doc_string(onnx_model) @@ -115,26 +125,38 @@ def convert_tests(testcases, sets=1): for index, var in enumerate([input]): tensor = numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "input_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "input_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) for index, var in enumerate([output]): tensor = numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "output_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "output_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) input = gen_input(t) - if (module_name != "FunctionalModule"): + if module_name != "FunctionalModule": nn_module[module_name] |= 1 except: # noqa: E722,B001 traceback.print_exc() - if (module_name != "FunctionalModule"): + if module_name != "FunctionalModule": nn_module[module_name] |= 2 failed += 1 - print("Collect {} test cases from PyTorch repo, failed to export {} cases.".format( - len(testcases), failed)) - print("PyTorch converted cases are stored in {}.".format(test_onnx_common.pytorch_converted_dir)) + print( + "Collect {} test cases from PyTorch repo, failed to export {} 
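For context, the `input_*.pb` / `output_*.pb` files written above are plain serialized `TensorProto`s. A hedged round-trip sketch (the file name mirrors the naming used above; the working directory is illustrative) showing how the generator's output is later read back on the verification side:

```python
import numpy as np
import onnx
import torch
from onnx import numpy_helper

var = torch.randn(2, 3)
tensor = numpy_helper.from_array(var.data.numpy())

# Write the protobuf exactly as the generator does for input_0.pb.
with open("input_0.pb", "wb") as f:
    f.write(tensor.SerializeToString())

# Read it back the way the Caffe2-side helpers do.
loaded = onnx.TensorProto()
with open("input_0.pb", "rb") as f:
    loaded.ParseFromString(f.read())

np.testing.assert_allclose(var.numpy(), numpy_helper.to_array(loaded))
```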
cases.".format( + len(testcases), failed + ) + ) + print( + "PyTorch converted cases are stored in {}.".format( + test_onnx_common.pytorch_converted_dir + ) + ) print_stats(FunctionalModule_nums, nn_module) + if __name__ == "__main__": testcases = module_tests + new_module_tests convert_tests(testcases) diff --git a/test/onnx/model_defs/__init__.py b/test/onnx/model_defs/__init__.py index 07967b0df70d..7bfa2c833cf3 100644 --- a/test/onnx/model_defs/__init__.py +++ b/test/onnx/model_defs/__init__.py @@ -1,4 +1,4 @@ -from .squeezenet import * # noqa: F403 -from .super_resolution import * # noqa: F403 from .op_test import * # noqa: F403 +from .squeezenet import * # noqa: F403 from .srresnet import * # noqa: F403 +from .super_resolution import * # noqa: F403 diff --git a/test/onnx/model_defs/dcgan.py b/test/onnx/model_defs/dcgan.py index b65cd10106cc..5054835ca13f 100644 --- a/test/onnx/model_defs/dcgan.py +++ b/test/onnx/model_defs/dcgan.py @@ -1,7 +1,6 @@ import torch import torch.nn as nn - # configurable bsz = 64 imgsz = 64 @@ -14,9 +13,9 @@ # custom weights initialization called on netG and netD def weights_init(m): classname = m.__class__.__name__ - if classname.find('Conv') != -1: + if classname.find("Conv") != -1: m.weight.data.normal_(0.0, 0.02) - elif classname.find('BatchNorm') != -1: + elif classname.find("BatchNorm") != -1: m.weight.data.normal_(1.0, 0.02) m.bias.data.fill_(0) @@ -78,7 +77,7 @@ def __init__(self, ngpu): nn.LeakyReLU(0.2, inplace=True), # state size. (ndf*8) x 4 x 4 nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False), - nn.Sigmoid() + nn.Sigmoid(), ) def forward(self, input): diff --git a/test/onnx/model_defs/emb_seq.py b/test/onnx/model_defs/emb_seq.py index 09a289aaf821..5200aa4f5888 100644 --- a/test/onnx/model_defs/emb_seq.py +++ b/test/onnx/model_defs/emb_seq.py @@ -1,4 +1,3 @@ - import torch.nn as nn @@ -17,15 +16,10 @@ def forward(self, input): class EmbeddingNetwork2(nn.Module): - def __init__(self, in_space=10, dim=3): super(EmbeddingNetwork2, self).__init__() self.embedding = nn.Embedding(in_space, dim) - self.seq = nn.Sequential( - self.embedding, - nn.Linear(dim, 1), - nn.Sigmoid() - ) + self.seq = nn.Sequential(self.embedding, nn.Linear(dim, 1), nn.Sigmoid()) def forward(self, indices): return self.seq(indices) diff --git a/test/onnx/model_defs/lstm_flattening_result.py b/test/onnx/model_defs/lstm_flattening_result.py index dbbc07ed21f5..62e8450eff92 100644 --- a/test/onnx/model_defs/lstm_flattening_result.py +++ b/test/onnx/model_defs/lstm_flattening_result.py @@ -7,27 +7,39 @@ def forward(self, input, *fargs, **fkwargs): output, (hidden, cell) = nn.LSTM.forward(self, input, *fargs, **fkwargs) return output, hidden, cell + class LstmFlatteningResultWithSeqLength(nn.Module): def __init__(self, input_size, hidden_size, layers, bidirect, dropout, batch_first): super(LstmFlatteningResultWithSeqLength, self).__init__() self.batch_first = batch_first - self.inner_model = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=layers, - bidirectional=bidirect, dropout=dropout, - batch_first=batch_first) + self.inner_model = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=layers, + bidirectional=bidirect, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence, hx=None): output, (hidden, cell) = self.inner_model.forward(input, hx) return output, hidden, cell + class LstmFlatteningResultWithoutSeqLength(nn.Module): def __init__(self, input_size, hidden_size, layers, bidirect, dropout, batch_first): 
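The `weights_init` hunk above is the classic DCGAN initializer and is intended to be applied recursively via `Module.apply`. A self-contained sketch with a toy network (the network itself is invented for the example):

```python
import torch.nn as nn


def weights_init(m):
    # Classic DCGAN init: normal(0, 0.02) for conv weights,
    # normal(1, 0.02) plus zero bias for batch-norm layers.
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find("BatchNorm") != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)


net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
net.apply(weights_init)  # visits every submodule, then the Sequential itself
```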
super(LstmFlatteningResultWithoutSeqLength, self).__init__() self.batch_first = batch_first - self.inner_model = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=layers, - bidirectional=bidirect, dropout=dropout, - batch_first=batch_first) + self.inner_model = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=layers, + bidirectional=bidirect, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input, hx=None): output, (hidden, cell) = self.inner_model.forward(input, hx) diff --git a/test/onnx/model_defs/mnist.py b/test/onnx/model_defs/mnist.py index a8a0b3fe4231..176822852c94 100644 --- a/test/onnx/model_defs/mnist.py +++ b/test/onnx/model_defs/mnist.py @@ -3,7 +3,6 @@ class MNIST(nn.Module): - def __init__(self): super(MNIST, self).__init__() self.conv1 = nn.Conv2d(1, 10, kernel_size=5) diff --git a/test/onnx/model_defs/op_test.py b/test/onnx/model_defs/op_test.py index d223c071bec7..774f3070824c 100644 --- a/test/onnx/model_defs/op_test.py +++ b/test/onnx/model_defs/op_test.py @@ -5,13 +5,12 @@ class DummyNet(nn.Module): - def __init__(self, num_classes=1000): super(DummyNet, self).__init__() self.features = nn.Sequential( nn.LeakyReLU(0.02), nn.BatchNorm2d(3), - nn.AvgPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False) + nn.AvgPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False), ) def forward(self, x): @@ -20,7 +19,6 @@ def forward(self, x): class ConcatNet(nn.Module): - def __init__(self): super(ConcatNet, self).__init__() @@ -29,7 +27,6 @@ def forward(self, inputs): class PermuteNet(nn.Module): - def __init__(self): super(PermuteNet, self).__init__() @@ -38,7 +35,6 @@ def forward(self, input): class PReluNet(nn.Module): - def __init__(self): super(PReluNet, self).__init__() self.features = nn.Sequential( @@ -49,6 +45,7 @@ def forward(self, x): output = self.features(x) return output + class FakeQuantNet(nn.Module): def __init__(self): super(FakeQuantNet, self).__init__() diff --git a/test/onnx/model_defs/rnn_model_with_packed_sequence.py b/test/onnx/model_defs/rnn_model_with_packed_sequence.py index b0288baeb33e..153d9b7da5e7 100644 --- a/test/onnx/model_defs/rnn_model_with_packed_sequence.py +++ b/test/onnx/model_defs/rnn_model_with_packed_sequence.py @@ -16,6 +16,7 @@ def forward(self, input, *args): ret, _ = rnn_utils.pad_packed_sequence(ret, self.batch_first) return tuple([ret] + list(rets)) + class RnnModelWithPackedSequenceWithoutState(nn.Module): def __init__(self, model, batch_first): super(RnnModelWithPackedSequenceWithoutState, self).__init__() @@ -29,6 +30,7 @@ def forward(self, input, seq_lengths): ret, _ = rnn_utils.pad_packed_sequence(ret, self.batch_first) return list([ret] + list(rets)) + class RnnModelWithPackedSequenceWithState(nn.Module): def __init__(self, model, batch_first): super(RnnModelWithPackedSequenceWithState, self).__init__() diff --git a/test/onnx/model_defs/squeezenet.py b/test/onnx/model_defs/squeezenet.py index 984f724c1562..acf4dc5e2375 100644 --- a/test/onnx/model_defs/squeezenet.py +++ b/test/onnx/model_defs/squeezenet.py @@ -4,35 +4,37 @@ class Fire(nn.Module): - - def __init__(self, inplanes, squeeze_planes, - expand1x1_planes, expand3x3_planes): + def __init__(self, inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes): super(Fire, self).__init__() self.inplanes = inplanes self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) self.squeeze_activation = nn.ReLU(inplace=True) - self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, - kernel_size=1) + 
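The packed-sequence wrappers above all follow the same flow; a standalone sketch with illustrative sizes showing how a padded batch is packed, run through an `nn.LSTM`, and padded back:

```python
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils

lstm = nn.LSTM(input_size=4, hidden_size=8, num_layers=1, batch_first=True)

batch = torch.randn(3, 5, 4)          # (batch, seq, feature), already padded
lengths = torch.tensor([5, 3, 2])     # valid lengths, sorted descending
packed = rnn_utils.pack_padded_sequence(batch, lengths, batch_first=True)

output, (hidden, cell) = lstm(packed)  # output is itself a PackedSequence
padded, _ = rnn_utils.pad_packed_sequence(output, batch_first=True)
print(padded.shape)                    # torch.Size([3, 5, 8])
```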
self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, kernel_size=1) self.expand1x1_activation = nn.ReLU(inplace=True) - self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, - kernel_size=3, padding=1) + self.expand3x3 = nn.Conv2d( + squeeze_planes, expand3x3_planes, kernel_size=3, padding=1 + ) self.expand3x3_activation = nn.ReLU(inplace=True) def forward(self, x): x = self.squeeze_activation(self.squeeze(x)) - return torch.cat([ - self.expand1x1_activation(self.expand1x1(x)), - self.expand3x3_activation(self.expand3x3(x)) - ], 1) + return torch.cat( + [ + self.expand1x1_activation(self.expand1x1(x)), + self.expand3x3_activation(self.expand3x3(x)), + ], + 1, + ) class SqueezeNet(nn.Module): - def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): super(SqueezeNet, self).__init__() if version not in [1.0, 1.1]: - raise ValueError("Unsupported SqueezeNet version {version}:" - "1.0 or 1.1 expected".format(version=version)) + raise ValueError( + "Unsupported SqueezeNet version {version}:" + "1.0 or 1.1 expected".format(version=version) + ) self.num_classes = num_classes if version == 1.0: self.features = nn.Sequential( @@ -69,10 +71,7 @@ def __init__(self, version=1.0, num_classes=1000, ceil_mode=False): # Final convolution is initialized differently from the rest final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) self.classifier = nn.Sequential( - nn.Dropout(p=0.5), - final_conv, - nn.ReLU(inplace=True), - nn.AvgPool2d(13) + nn.Dropout(p=0.5), final_conv, nn.ReLU(inplace=True), nn.AvgPool2d(13) ) for m in self.modules(): diff --git a/test/onnx/model_defs/srresnet.py b/test/onnx/model_defs/srresnet.py index 0328d39f7a34..65795471293f 100644 --- a/test/onnx/model_defs/srresnet.py +++ b/test/onnx/model_defs/srresnet.py @@ -14,10 +14,14 @@ def _initialize_orthogonal(conv): class ResidualBlock(nn.Module): def __init__(self, n_filters): super(ResidualBlock, self).__init__() - self.conv1 = nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1, bias=False) + self.conv1 = nn.Conv2d( + n_filters, n_filters, kernel_size=3, padding=1, bias=False + ) self.bn1 = nn.BatchNorm2d(n_filters) self.prelu = nn.PReLU(n_filters) - self.conv2 = nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1, bias=False) + self.conv2 = nn.Conv2d( + n_filters, n_filters, kernel_size=3, padding=1, bias=False + ) self.bn2 = nn.BatchNorm2d(n_filters) # Orthogonal initialisation @@ -33,7 +37,9 @@ def forward(self, x): class UpscaleBlock(nn.Module): def __init__(self, n_filters): super(UpscaleBlock, self).__init__() - self.upscaling_conv = nn.Conv2d(n_filters, 4 * n_filters, kernel_size=3, padding=1) + self.upscaling_conv = nn.Conv2d( + n_filters, 4 * n_filters, kernel_size=3, padding=1 + ) self.upscaling_shuffler = nn.PixelShuffle(2) self.upscaling = nn.PReLU(n_filters) _initialize_orthogonal(self.upscaling_conv) @@ -54,14 +60,21 @@ def __init__(self, rescale_factor, n_filters, n_blocks): for residual_block_num in range(1, n_blocks + 1): residual_block = ResidualBlock(self.n_filters) - self.add_module('residual_block' + str(residual_block_num), nn.Sequential(residual_block)) - - self.skip_conv = nn.Conv2d(n_filters, n_filters, kernel_size=3, padding=1, bias=False) + self.add_module( + "residual_block" + str(residual_block_num), + nn.Sequential(residual_block), + ) + + self.skip_conv = nn.Conv2d( + n_filters, n_filters, kernel_size=3, padding=1, bias=False + ) self.skip_bn = nn.BatchNorm2d(n_filters) for upscale_block_num in range(1, self.rescale_levels + 1): upscale_block = 
UpscaleBlock(self.n_filters) - self.add_module('upscale_block' + str(upscale_block_num), nn.Sequential(upscale_block)) + self.add_module( + "upscale_block" + str(upscale_block_num), nn.Sequential(upscale_block) + ) self.output_conv = nn.Conv2d(n_filters, 3, kernel_size=9, padding=4) @@ -74,8 +87,8 @@ def forward(self, x): x_init = self.prelu1(self.conv1(x)) x = self.residual_block1(x_init) for residual_block_num in range(2, self.n_blocks + 1): - x = getattr(self, 'residual_block' + str(residual_block_num))(x) + x = getattr(self, "residual_block" + str(residual_block_num))(x) x = self.skip_bn(self.skip_conv(x)) + x_init for upscale_block_num in range(1, self.rescale_levels + 1): - x = getattr(self, 'upscale_block' + str(upscale_block_num))(x) + x = getattr(self, "upscale_block" + str(upscale_block_num))(x) return self.output_conv(x) diff --git a/test/onnx/model_defs/super_resolution.py b/test/onnx/model_defs/super_resolution.py index 958d2f95b62e..dc84ec4192ee 100644 --- a/test/onnx/model_defs/super_resolution.py +++ b/test/onnx/model_defs/super_resolution.py @@ -10,7 +10,7 @@ def __init__(self, upscale_factor): self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) - self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) + self.conv4 = nn.Conv2d(32, upscale_factor**2, (3, 3), (1, 1), (1, 1)) self.pixel_shuffle = nn.PixelShuffle(upscale_factor) self._initialize_weights() @@ -23,7 +23,7 @@ def forward(self, x): return x def _initialize_weights(self): - init.orthogonal_(self.conv1.weight, init.calculate_gain('relu')) - init.orthogonal_(self.conv2.weight, init.calculate_gain('relu')) - init.orthogonal_(self.conv3.weight, init.calculate_gain('relu')) + init.orthogonal_(self.conv1.weight, init.calculate_gain("relu")) + init.orthogonal_(self.conv2.weight, init.calculate_gain("relu")) + init.orthogonal_(self.conv3.weight, init.calculate_gain("relu")) init.orthogonal_(self.conv4.weight) diff --git a/test/onnx/model_defs/word_language_model.py b/test/onnx/model_defs/word_language_model.py index 2b500d93eff4..e4ad3bf51976 100644 --- a/test/onnx/model_defs/word_language_model.py +++ b/test/onnx/model_defs/word_language_model.py @@ -1,28 +1,43 @@ # The model is from here: # https://github.com/pytorch/examples/blob/master/word_language_model/model.py +from typing import Optional, Tuple + import torch import torch.nn as nn from torch import Tensor -from typing import Tuple, Optional + class RNNModel(nn.Module): """Container module with an encoder, a recurrent module, and a decoder.""" - def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, - dropout=0.5, tie_weights=False, batchsize=2): + def __init__( + self, + rnn_type, + ntoken, + ninp, + nhid, + nlayers, + dropout=0.5, + tie_weights=False, + batchsize=2, + ): super(RNNModel, self).__init__() self.drop = nn.Dropout(dropout) self.encoder = nn.Embedding(ntoken, ninp) - if rnn_type in ['LSTM', 'GRU']: + if rnn_type in ["LSTM", "GRU"]: self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout) else: try: - nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] + nonlinearity = {"RNN_TANH": "tanh", "RNN_RELU": "relu"}[rnn_type] except KeyError: - raise ValueError("""An invalid option for `--model` was supplied, - options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") from None - self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) + raise ValueError( + """An invalid option 
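Both `UpscaleBlock` and `SuperResolutionNet` above rely on sub-pixel convolution: a convolution expands the channel count by `r**2`, and `nn.PixelShuffle(r)` then trades those channels for spatial resolution. A shape-level sketch with arbitrary sizes:

```python
import torch
import torch.nn as nn

r = 2  # illustrative upscale factor
conv = nn.Conv2d(32, 32 * r**2, kernel_size=3, padding=1)
shuffle = nn.PixelShuffle(r)

x = torch.randn(1, 32, 16, 16)
y = shuffle(conv(x))
print(y.shape)  # torch.Size([1, 32, 32, 32]) -- channels traded for resolution
```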
for `--model` was supplied, + options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""" + ) from None + self.rnn = nn.RNN( + ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout + ) self.decoder = nn.Linear(nhid, ntoken) # Optionally tie weights as in: @@ -33,7 +48,9 @@ def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, # https://arxiv.org/abs/1611.01462 if tie_weights: if nhid != ninp: - raise ValueError('When using the tied flag, nhid must be equal to emsize') + raise ValueError( + "When using the tied flag, nhid must be equal to emsize" + ) self.decoder.weight = self.encoder.weight self.init_weights() @@ -61,20 +78,26 @@ def forward(self, input, hidden): emb = self.drop(self.encoder(input)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) - decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) + decoded = self.decoder( + output.view(output.size(0) * output.size(1), output.size(2)) + ) self.hidden = RNNModel.repackage_hidden(hidden) return decoded.view(output.size(0), output.size(1), decoded.size(1)) def init_hidden(self, bsz): weight = next(self.parameters()).data - if self.rnn_type == 'LSTM': - return (weight.new(self.nlayers, bsz, self.nhid).zero_(), - weight.new(self.nlayers, bsz, self.nhid).zero_()) + if self.rnn_type == "LSTM": + return ( + weight.new(self.nlayers, bsz, self.nhid).zero_(), + weight.new(self.nlayers, bsz, self.nhid).zero_(), + ) else: return weight.new(self.nlayers, bsz, self.nhid).zero_() + class RNNModelWithTensorHidden(RNNModel): """Supports GRU scripting.""" + @staticmethod def repackage_hidden(h): """Detach hidden states from their history.""" @@ -84,12 +107,16 @@ def forward(self, input: Tensor, hidden: Tensor): emb = self.drop(self.encoder(input)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) - decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) + decoded = self.decoder( + output.view(output.size(0) * output.size(1), output.size(2)) + ) self.hidden = RNNModelWithTensorHidden.repackage_hidden(hidden) return decoded.view(output.size(0), output.size(1), decoded.size(1)) + class RNNModelWithTupleHidden(RNNModel): """Supports LSTM scripting.""" + @staticmethod def repackage_hidden(h: Tuple[Tensor, Tensor]): """Detach hidden states from their history.""" @@ -99,6 +126,8 @@ def forward(self, input: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) emb = self.drop(self.encoder(input)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) - decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) + decoded = self.decoder( + output.view(output.size(0) * output.size(1), output.size(2)) + ) self.hidden = self.repackage_hidden(tuple(hidden)) return decoded.view(output.size(0), output.size(1), decoded.size(1)) diff --git a/test/onnx/pytorch_helper.py b/test/onnx/pytorch_helper.py index a07652b4d06f..38e7f7b672a1 100644 --- a/test/onnx/pytorch_helper.py +++ b/test/onnx/pytorch_helper.py @@ -1,9 +1,10 @@ import io -import torch.onnx + import onnx -from caffe2.python.onnx.backend import Caffe2Backend -from caffe2.python.core import BlobReference, Net +import torch.onnx +from caffe2.python.core import BlobReference, Net +from caffe2.python.onnx.backend import Caffe2Backend _next_idx = 0 # Clone net takes a dict instead of a lambda @@ -54,19 +55,23 @@ def PyTorchModule(helper, model, sample_arguments, caffe2_inputs, prefix_name=No # TODO: handle the case where model cannot be exported # and embed as a Python op in 
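The `nhid != ninp` check above exists because weight tying simply aliases the two parameter tensors, which only works when their shapes coincide; a minimal illustration with made-up sizes:

```python
import torch.nn as nn

ntoken, ninp, nhid = 100, 16, 16      # illustrative sizes with ninp == nhid
encoder = nn.Embedding(ntoken, ninp)  # weight shape (100, 16)
decoder = nn.Linear(nhid, ntoken)     # weight shape (100, 16)
decoder.weight = encoder.weight       # tying is only legal because shapes match
```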
Caffe2 f = io.BytesIO() - torch.onnx.export( - model, sample_arguments, f, export_params=True) + torch.onnx.export(model, sample_arguments, f, export_params=True) onnx_model = onnx.load(io.BytesIO(f.getvalue())) - init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net( - onnx_model) + init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model) initialized = set([x.name for x in onnx_model.graph.initializer]) - uninitialized_inputs = {x.name: i for i, x in enumerate( - onnx_model.graph.input) if x.name not in initialized} + uninitialized_inputs = { + x.name: i + for i, x in enumerate(onnx_model.graph.input) + if x.name not in initialized + } - if(len(uninitialized_inputs) != len(caffe2_inputs)): - raise ValueError("Expected {} inputs but found {}".format( - len(uninitialized_inputs), len(caffe2_inputs))) + if len(uninitialized_inputs) != len(caffe2_inputs): + raise ValueError( + "Expected {} inputs but found {}".format( + len(uninitialized_inputs), len(caffe2_inputs) + ) + ) def remap_blob_name(name): if name in uninitialized_inputs: @@ -80,6 +85,10 @@ def remap_blob_name(name): init_net = Net(init_net).Clone("anon", _FakeDict(remap_blob_name)) helper.param_init_net.AppendNet(init_net) - results = tuple([BlobReference(remap_blob_name(x.name), helper.net) - for x in onnx_model.graph.output]) + results = tuple( + [ + BlobReference(remap_blob_name(x.name), helper.net) + for x in onnx_model.graph.output + ] + ) return results diff --git a/test/onnx/test_caffe2_common.py b/test/onnx/test_caffe2_common.py index 52dc6363760e..9f7f288e0e7c 100644 --- a/test/onnx/test_caffe2_common.py +++ b/test/onnx/test_caffe2_common.py @@ -1,12 +1,14 @@ # Owner(s): ["module: onnx"] import glob +import os + import numpy as np import onnx.backend.test -import caffe2.python.onnx.backend as c2 -import os from onnx import numpy_helper +import caffe2.python.onnx.backend as c2 + def load_tensor_as_numpy_array(f): tensor = onnx.TensorProto() @@ -26,13 +28,23 @@ def run_generated_test(model_file, data_dir, device="CPU"): input_num = len(glob.glob(os.path.join(data_dir, "input_*.pb"))) inputs = [] for i in range(input_num): - inputs.append(numpy_helper.to_array(load_tensor_as_numpy_array( - os.path.join(data_dir, "input_{}.pb".format(i))))) + inputs.append( + numpy_helper.to_array( + load_tensor_as_numpy_array( + os.path.join(data_dir, "input_{}.pb".format(i)) + ) + ) + ) output_num = len(glob.glob(os.path.join(data_dir, "output_*.pb"))) outputs = [] for i in range(output_num): - outputs.append(numpy_helper.to_array(load_tensor_as_numpy_array( - os.path.join(data_dir, "output_{}.pb".format(i))))) + outputs.append( + numpy_helper.to_array( + load_tensor_as_numpy_array( + os.path.join(data_dir, "output_{}.pb".format(i)) + ) + ) + ) prepared = c2.prepare(model, device=device) c2_outputs = prepared.run(inputs) assert_similar(outputs, c2_outputs) diff --git a/test/onnx/test_custom_ops.py b/test/onnx/test_custom_ops.py index f356317a5b81..b04d5df34a17 100644 --- a/test/onnx/test_custom_ops.py +++ b/test/onnx/test_custom_ops.py @@ -1,20 +1,19 @@ # Owner(s): ["module: onnx"] import unittest -import torch -import torch.utils.cpp_extension - -import onnx -import caffe2.python.onnx.backend as c2 import numpy as np - +import onnx from test_pytorch_onnx_caffe2 import do_export from test_pytorch_onnx_onnxruntime import run_model_test + +import caffe2.python.onnx.backend as c2 +import torch +import torch.utils.cpp_extension from torch.onnx.symbolic_helper import _unimplemented -class 
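The initializer bookkeeping in `PyTorchModule` above separates weights baked into the ONNX graph from the inputs a caller must still feed at run time. A hedged, standalone sketch of that computation on a throwaway model (the model and shapes are invented):

```python
import io

import onnx
import torch


class Affine(torch.nn.Module):  # hypothetical example model
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.linear(x)


f = io.BytesIO()
torch.onnx.export(Affine(), torch.randn(1, 4), f, export_params=True)
onnx_model = onnx.load(io.BytesIO(f.getvalue()))

initialized = {x.name for x in onnx_model.graph.initializer}
uninitialized_inputs = {
    x.name: i
    for i, x in enumerate(onnx_model.graph.input)
    if x.name not in initialized
}
print(uninitialized_inputs)  # only the real data input should remain
```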
TestCustomOps(unittest.TestCase): +class TestCustomOps(unittest.TestCase): def test_custom_add(self): op_source = """ #include @@ -42,7 +41,10 @@ def symbolic_custom_add(g, self, other): return g.op("Add", self, other) from torch.onnx import register_custom_op_symbolic - register_custom_op_symbolic("custom_namespace::custom_add", symbolic_custom_add, 9) + + register_custom_op_symbolic( + "custom_namespace::custom_add", symbolic_custom_add, 9 + ) x = torch.randn(2, 3, 4, requires_grad=False) y = torch.randn(2, 3, 4, requires_grad=False) @@ -62,7 +64,6 @@ class TestCustomAutogradFunction(unittest.TestCase): def test_symbolic(self): class MyClip(torch.autograd.Function): - @staticmethod def forward(ctx, input, scalar): ctx.save_for_backward(input) @@ -83,18 +84,16 @@ def forward(self, x): x = torch.randn(2, 3, 4, requires_grad=True) model = MyModule() - run_model_test(self, model, input=(x, )) + run_model_test(self, model, input=(x,)) def test_register_custom_op(self): class MyClip(torch.autograd.Function): - @staticmethod def forward(ctx, input, scalar): ctx.save_for_backward(input) return input.clamp(min=scalar) class MyRelu(torch.autograd.Function): - @staticmethod def forward(ctx, input): ctx.save_for_backward(input) @@ -111,21 +110,24 @@ def forward(self, x): h = self.relu(h) return h - def symbolic_pythonop(g, n, *args, **kwargs): + def symbolic_pythonop(ctx: torch.onnx.SymbolicContext, g, *args, **kwargs): + n = ctx.cur_node name = kwargs["name"] if name == "MyClip": - return g.op("Clip", args[0], min_f=args[1]) + return g.op("Clip", args[0], min_f=args[1], outputs=n.outputsSize()) elif name == "MyRelu": - return g.op("Relu", args[0]) + return g.op("Relu", args[0], outputs=n.outputsSize()) else: return _unimplemented("prim::PythonOp", "unknown node kind: " + name) from torch.onnx import register_custom_op_symbolic + register_custom_op_symbolic("prim::PythonOp", symbolic_pythonop, 1) x = torch.randn(2, 3, 4, requires_grad=True) model = MyModule() - run_model_test(self, model, input=(x, )) + run_model_test(self, model, input=(x,)) + class TestExportAsContribOps(unittest.TestCase): opset_version = 14 @@ -136,7 +138,7 @@ def test_contrib_op_with_loop(self): class M(torch.nn.Module): def __init__(self): super().__init__() - self.gelu = torch.nn.GELU() + self.gelu = torch.nn.GELU(approximate="none") def forward(self, x): res = [] @@ -149,15 +151,17 @@ def forward(self, x): res.append(x[0]) return torch.stack(res), torch.stack(res2) - def symbolic_custom_gelu(g, input): + def symbolic_custom_gelu(g, input, approximate): return g.op("com.microsoft::Gelu", input).setType(input.type()) from torch.onnx import register_custom_op_symbolic + register_custom_op_symbolic("::gelu", symbolic_custom_gelu, 1) x = torch.randn(3, 3, 4, requires_grad=True) model = torch.jit.script(M()) - run_model_test(self, model, input=(x, )) + run_model_test(self, model, input=(x,)) + if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py index 5d22c255f832..dc849528842a 100644 --- a/test/onnx/test_models.py +++ b/test/onnx/test_models.py @@ -1,90 +1,104 @@ # Owner(s): ["module: onnx"] +import unittest + +from model_defs.dcgan import _netD, _netG, bsz, imgsz, nz, weights_init +from model_defs.emb_seq import EmbeddingNetwork1, EmbeddingNetwork2 +from model_defs.mnist import MNIST +from model_defs.op_test import ( + ConcatNet, + DummyNet, + FakeQuantNet, + PermuteNet, + PReluNet, +) +from model_defs.squeezenet import SqueezeNet +from model_defs.srresnet import SRResNet 
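The custom-symbolic registrations above all follow one pattern: `register_custom_op_symbolic` maps an ATen (or custom-namespace) op to a graph-building function that emits the desired ONNX node. A condensed sketch of the contrib-Gelu case, mirroring the test rather than adding anything new; note that the registration is process-global until it is unregistered:

```python
import io

import torch
from torch.onnx import register_custom_op_symbolic


def symbolic_custom_gelu(g, input, approximate):
    # Emit a com.microsoft contrib Gelu node and preserve the input's type.
    return g.op("com.microsoft::Gelu", input).setType(input.type())


register_custom_op_symbolic("::gelu", symbolic_custom_gelu, 1)

f = io.BytesIO()
torch.onnx.export(
    torch.nn.GELU(approximate="none"), torch.randn(2, 3), f, opset_version=14
)
```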
+from model_defs.super_resolution import SuperResolutionNet +from test_pytorch_common import ( + TestCase, + run_tests, + skipIfNoLapack, + skipIfUnsupportedMinOpsetVersion, + skipScriptTest, +) +from torchvision.models import shufflenet_v2_x1_0 from torchvision.models.alexnet import alexnet -from torchvision.models.inception import inception_v3 from torchvision.models.densenet import densenet121 -from torchvision.models.resnet import resnet50 -from torchvision.models.vgg import vgg16, vgg16_bn, vgg19, vgg19_bn from torchvision.models.googlenet import googlenet +from torchvision.models.inception import inception_v3 from torchvision.models.mnasnet import mnasnet1_0 from torchvision.models.mobilenet import mobilenet_v2 -from torchvision.models import shufflenet_v2_x1_0 -from torchvision.models.segmentation import fcn_resnet101, deeplabv3_resnet101 -from torchvision.models.video import r3d_18, mc3_18, r2plus1d_18 - -from model_defs.mnist import MNIST -from model_defs.squeezenet import SqueezeNet -from model_defs.super_resolution import SuperResolutionNet -from model_defs.srresnet import SRResNet -from model_defs.dcgan import _netD, _netG, weights_init, bsz, imgsz, nz -from model_defs.op_test import DummyNet, ConcatNet, PermuteNet, PReluNet, FakeQuantNet -from model_defs.emb_seq import EmbeddingNetwork1, EmbeddingNetwork2 - -from test_pytorch_common import TestCase, run_tests, skipIfNoLapack, skipIfUnsupportedMinOpsetVersion, disableScriptTest +from torchvision.models.resnet import resnet50 +from torchvision.models.segmentation import deeplabv3_resnet101, fcn_resnet101 +from torchvision.models.vgg import vgg16, vgg16_bn, vgg19, vgg19_bn +from torchvision.models.video import mc3_18, r2plus1d_18, r3d_18 +from verify import verify +import caffe2.python.onnx.backend as backend import torch import torch.onnx import torch.onnx.utils +from torch import quantization from torch.autograd import Variable from torch.onnx import OperatorExportTypes -from torch import quantization - -import unittest - -import caffe2.python.onnx.backend as backend - -from verify import verify if torch.cuda.is_available(): + def toC(x): return x.cuda() + else: + def toC(x): return x + BATCH_SIZE = 2 class TestModels(TestCase): + opset_version = 9 # Caffe2 doesn't support the default. 
keep_initializers_as_inputs = False - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7): with torch.onnx.select_model_mode_for_export(model, None): graph = torch.onnx.utils._trace(model, inputs, OperatorExportTypes.ONNX) torch._C._jit_pass_lint(graph) - verify(model, inputs, backend, rtol=rtol, atol=atol) + verify( + model, + inputs, + backend, + rtol=rtol, + atol=atol, + opset_version=self.opset_version, + ) def test_ops(self): - x = Variable( - torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(DummyNet()), toC(x)) def test_prelu(self): - x = Variable( - torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(PReluNet(), x) - @disableScriptTest() + @skipScriptTest() def test_concat(self): input_a = Variable(torch.randn(BATCH_SIZE, 3)) input_b = Variable(torch.randn(BATCH_SIZE, 3)) - inputs = ((toC(input_a), toC(input_b)), ) + inputs = ((toC(input_a), toC(input_b)),) self.exportTest(toC(ConcatNet()), inputs) def test_permute(self): x = Variable(torch.randn(BATCH_SIZE, 3, 10, 12)) self.exportTest(PermuteNet(), x) - @disableScriptTest() + @skipScriptTest() def test_embedding_sequential_1(self): x = Variable(torch.randint(0, 10, (BATCH_SIZE, 3))) self.exportTest(EmbeddingNetwork1(), x) - @disableScriptTest() + @skipScriptTest() def test_embedding_sequential_2(self): x = Variable(torch.randint(0, 10, (BATCH_SIZE, 3))) self.exportTest(EmbeddingNetwork2(), x) @@ -92,19 +106,17 @@ def test_embedding_sequential_2(self): @unittest.skip("This model takes too much memory") def test_srresnet(self): x = Variable(torch.randn(1, 3, 224, 224).fill_(1.0)) - self.exportTest(toC(SRResNet(rescale_factor=4, n_filters=64, n_blocks=8)), toC(x)) + self.exportTest( + toC(SRResNet(rescale_factor=4, n_filters=64, n_blocks=8)), toC(x) + ) @skipIfNoLapack def test_super_resolution(self): - x = Variable( - torch.randn(BATCH_SIZE, 1, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 1, 224, 224).fill_(1.0)) self.exportTest(toC(SuperResolutionNet(upscale_factor=3)), toC(x), atol=1e-6) def test_alexnet(self): - x = Variable( - torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) - ) + x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(alexnet()), toC(x)) def test_mnist(self): @@ -140,7 +152,7 @@ def test_resnet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(resnet50()), toC(x), atol=1e-6) - @disableScriptTest() # None type in outputs + @skipScriptTest(min_opset_version=15) # None type in outputs def test_inception(self): x = Variable(torch.randn(BATCH_SIZE, 3, 299, 299)) self.exportTest(toC(inception_v3()), toC(x)) @@ -163,14 +175,14 @@ def test_densenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(densenet121()), toC(x), rtol=1e-2, atol=1e-5) - @disableScriptTest() + @skipScriptTest() def test_dcgan_netD(self): netD = _netD(1) netD.apply(weights_init) input = Variable(torch.empty(bsz, 3, imgsz, imgsz).normal_(0, 1)) self.exportTest(toC(netD), toC(input)) - @disableScriptTest() + @skipScriptTest() def test_dcgan_netG(self): netG = _netG(1) netG.apply(weights_init) @@ -190,7 +202,9 @@ def test_qat_resnet_pertensor(self): # Use per tensor for weight. 
Per channel support will come with opset 13 qat_resnet50.qconfig = quantization.QConfig( - activation=quantization.default_fake_quant, weight=quantization.default_fake_quant) + activation=quantization.default_fake_quant, + weight=quantization.default_fake_quant, + ) quantization.prepare_qat(qat_resnet50, inplace=True) qat_resnet50.apply(torch.ao.quantization.enable_observer) qat_resnet50.apply(torch.ao.quantization.enable_fake_quant) @@ -211,7 +225,8 @@ def test_qat_resnet_per_channel(self): qat_resnet50.qconfig = quantization.QConfig( activation=quantization.default_fake_quant, - weight=quantization.default_per_channel_weight_fake_quant) + weight=quantization.default_per_channel_weight_fake_quant, + ) quantization.prepare_qat(qat_resnet50, inplace=True) qat_resnet50.apply(torch.ao.quantization.enable_observer) qat_resnet50.apply(torch.ao.quantization.enable_fake_quant) @@ -224,7 +239,7 @@ def test_qat_resnet_per_channel(self): self.exportTest(toC(qat_resnet50), toC(x)) - @disableScriptTest() # None type in outputs + @skipScriptTest(min_opset_version=15) # None type in outputs def test_googlenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(googlenet()), toC(x), rtol=1e-3, atol=1e-5) @@ -237,7 +252,7 @@ def test_mobilenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(mobilenet_v2()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() # prim_data + @skipScriptTest() # prim_data def test_shufflenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(shufflenet_v2_x1_0()), toC(x), rtol=1e-3, atol=1e-5) @@ -245,12 +260,22 @@ def test_shufflenet(self): @skipIfUnsupportedMinOpsetVersion(11) def test_fcn(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) - self.exportTest(toC(fcn_resnet101()), toC(x), rtol=1e-3, atol=1e-5) + self.exportTest( + toC(fcn_resnet101(pretrained=False, pretrained_backbone=False)), + toC(x), + rtol=1e-3, + atol=1e-5, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_deeplab(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) - self.exportTest(toC(deeplabv3_resnet101()), toC(x), rtol=1e-3, atol=1e-5) + self.exportTest( + toC(deeplabv3_resnet101(pretrained=False, pretrained_backbone=False)), + toC(x), + rtol=1e-3, + atol=1e-5, + ) def test_r3d_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index 62006fd8a068..a4f4295fad65 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -1,10 +1,11 @@ # Owner(s): ["module: onnx"] import unittest -import onnxruntime # noqa: F401 +import onnxruntime # noqa: F401 from test_models import TestModels from test_pytorch_onnx_onnxruntime import run_model_test + import torch @@ -14,29 +15,33 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None): for opset_version in opset_versions: self.opset_version = opset_version self.onnx_shape_inference = True - run_model_test(self, model, False, - input=inputs, rtol=rtol, atol=atol) + run_model_test(self, model, False, input=inputs, rtol=rtol, atol=atol) if self.is_script_test_enabled and opset_version > 11: script_model = torch.jit.script(model) - run_model_test(self, script_model, False, - input=inputs, rtol=rtol, atol=atol) + run_model_test( + self, script_model, False, input=inputs, rtol=rtol, atol=atol + ) -TestModels = type(str("TestModels"), - 
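The QAT tests above share a preparation recipe; a sketch of the same steps on a small stand-in model follows. The model and input sizes are invented, and the final `disable_observer` call is an assumption about how one would freeze the observed ranges before evaluation or export.

```python
import torch
from torch import quantization


class SmallNet(torch.nn.Module):  # small stand-in for torchvision's resnet50
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, 3, padding=1)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.conv(x))


model = SmallNet().train()  # prepare_qat expects a model in training mode
model.qconfig = quantization.QConfig(
    activation=quantization.default_fake_quant,
    weight=quantization.default_fake_quant,
)
quantization.prepare_qat(model, inplace=True)
model.apply(torch.ao.quantization.enable_observer)
model.apply(torch.ao.quantization.enable_fake_quant)

model(torch.randn(1, 3, 16, 16))  # one pass so the observers record ranges
model.apply(torch.ao.quantization.disable_observer)  # assumption: freeze ranges
model.eval()
```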
(unittest.TestCase,), - dict(TestModels.__dict__, - is_script_test_enabled=False, - exportTest=exportTest)) +TestModels = type( + str("TestModels"), + (unittest.TestCase,), + dict(TestModels.__dict__, is_script_test_enabled=False, exportTest=exportTest), +) # model tests for scripting with new JIT APIs and shape inference -TestModels_new_jit_API = type(str("TestModels_new_jit_API"), - (unittest.TestCase,), - dict(TestModels.__dict__, - exportTest=exportTest, - is_script_test_enabled=True, - onnx_shape_inference=True)) +TestModels_new_jit_API = type( + str("TestModels_new_jit_API"), + (unittest.TestCase,), + dict( + TestModels.__dict__, + exportTest=exportTest, + is_script_test_enabled=True, + onnx_shape_inference=True, + ), +) if __name__ == "__main__": diff --git a/test/onnx/test_onnx_common.py b/test/onnx/test_onnx_common.py index dabc10fb649f..2e5a907a361a 100644 --- a/test/onnx/test_onnx_common.py +++ b/test/onnx/test_onnx_common.py @@ -2,10 +2,16 @@ import os - -onnx_model_dir = os.path.join(os.path.dirname( - os.path.realpath(__file__)), os.pardir, "repos", "onnx", "onnx", - "backend", "test", "data") +onnx_model_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + os.pardir, + "repos", + "onnx", + "onnx", + "backend", + "test", + "data", +) pytorch_converted_dir = os.path.join(onnx_model_dir, "pytorch-converted") diff --git a/test/onnx/test_onnx_export.py b/test/onnx/test_onnx_export.py new file mode 100644 index 000000000000..6e955d1d0b98 --- /dev/null +++ b/test/onnx/test_onnx_export.py @@ -0,0 +1,143 @@ +# Owner(s): ["module: onnx"] + +import contextlib +import io +import itertools +import os +import sys +import unittest.mock +from typing import Callable, Iterable, Optional, Tuple, Union + +import onnx +from test_pytorch_common import TestCase + +import torch +from torch.onnx import OperatorExportTypes, symbolic_registry +from torch.onnx._globals import GLOBALS +from torch.onnx.symbolic_helper import _onnx_unsupported +from torch.testing._internal.common_utils import custom_op, skipIfCaffe2 + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) + + +def export_to_onnx( + model: Union[torch.nn.Module, torch.jit.ScriptFunction], + input: Tuple[torch.Tensor], + custom_ops: Optional[ + Iterable[ + Union[contextlib.AbstractContextManager, contextlib.ContextDecorator], + ] + ] = None, + mocks: Optional[Iterable] = None, + operator_export_type: OperatorExportTypes = OperatorExportTypes.ONNX, + opset_version: int = GLOBALS.export_onnx_opset_version, +) -> onnx.ModelProto: + """Exports `model(input)` to ONNX and returns it. + + Custom operators and/or unittest patches can be used help reproducing specific behaviors. 
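The `type(...)`-based construction above clones a test class's body into a fresh `unittest.TestCase` subclass with a few attributes overridden, so the same test methods run under a different configuration. A toy version of the pattern, with invented class names:

```python
import unittest


class BaseChecks(unittest.TestCase):
    flag = False

    def test_flag_type(self):
        self.assertIsInstance(self.flag, bool)


# Same test bodies, different configuration -- mirroring how TestModels is
# re-instantiated above with exportTest / is_script_test_enabled overridden.
FlagOnChecks = type(
    "FlagOnChecks",
    (unittest.TestCase,),
    dict(BaseChecks.__dict__, flag=True),
)
```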
+ + Args: + model: model to export + input: model input with same format as `torch.onnx.export(..,args,...)` + custom_ops: list of custom operators to use during export + mocks: list of mocks to use during export + operator_export_type: export type as described by `torch.onnx.export(...operator_export_type,...)` + opset_version: ONNX opset version as described by `torch.onnx.export(...opset_version,...)` + Returns: + A valid ONNX model (`onnx.ModelProto`) + """ + custom_ops = custom_ops or [] + mocks = mocks or [] + with contextlib.ExitStack() as stack: + for ctx in itertools.chain(custom_ops, mocks): + stack.enter_context(ctx) + + f = io.BytesIO() + torch.onnx.export( + model, + input, + f, + operator_export_type=operator_export_type, + opset_version=opset_version, + ) + + # Validate ONNX graph before returning it + onnx_model = onnx.load_from_string(f.getvalue()) + onnx.checker.check_model(onnx_model) + return onnx_model + + +class TestONNXExport(TestCase): + @skipIfCaffe2 + def test_clip_aten_fallback_due_exception(self): + def bad_clamp(g, self, min, max): + return _onnx_unsupported("Bad boy!") + + class MyClip(torch.nn.Module): + def forward(self, x): + return torch.clamp(x, min=-0.5, max=0.5) + + onnx_model = export_to_onnx( + MyClip(), + torch.randn(3, 4, requires_grad=True), + custom_ops=[custom_op("aten::clamp", bad_clamp, 9)], + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + self.assertAtenOp(onnx_model, "clamp", "Tensor") + + @skipIfCaffe2 + def test_clip_aten_fallback_explicit_request(self): + class MyClip(torch.nn.Module): + def forward(self, x): + return torch.clamp(x, min=-0.5, max=0.5) + + def break_is_registered_op_api(opname, domain, version): + fake_missing_symbolics = ("clamp",) + if opname in fake_missing_symbolics: + return False + return ( + (domain, version) in symbolic_registry._registry + and opname in symbolic_registry._registry[(domain, version)] + ) + + # Force missing symbolic for well-known op using a mock + onnx_model = export_to_onnx( + MyClip(), + torch.randn(3, 4, requires_grad=True), + mocks=[ + unittest.mock.patch( + "torch.onnx.symbolic_registry.is_registered_op", + side_effect=break_is_registered_op_api, + ) + ], + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + self.assertAtenOp(onnx_model, "clamp", "Tensor") + + def _helper_test_to_(self, cast_fn: Callable[[torch.Tensor], torch.Tensor]): + """Helper to test aten::to(device) variants. + + `cast_fn` is converted into a `torch.jit.script`. It wraps `aten::to` + during export to preventing the devices to be hard-coded. 
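The mocked registry lookup above leans on `unittest.mock.patch(...)` used as a context manager with a `side_effect` replacement. A tiny illustration against a hypothetical target (`math.sqrt`, nothing to do with the ONNX symbolic registry):

```python
import math
import unittest.mock


def always_42(x):
    return 42.0


with unittest.mock.patch("math.sqrt", side_effect=always_42):
    print(math.sqrt(9))  # 42.0 while the patch is active

print(math.sqrt(9))      # 3.0 again once the context exits
```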
+ + Needed by detectron2 after https://github.com/facebookresearch/detectron2/pull/4132/ + """ + cast_fn = torch.jit.script(cast_fn) + onnx_model = export_to_onnx(cast_fn, torch.zeros([1, 3, 32, 32])) + for n in onnx_model.graph.node: + self.assertNotEqual(n.op_type, "To") + self.assertNotEqual(n.op_type, "Cast") + + def test_to__cpu_string(self): + def cast_cpu_string(src: torch.Tensor) -> torch.Tensor: + return src.to("cpu") + + self._helper_test_to_(cast_cpu_string) + + def test_to__device_cpu_string(self): + def cast_device_cpu_string(src: torch.Tensor) -> torch.Tensor: + return src.to(device="cpu") + + self._helper_test_to_(cast_device_cpu_string) diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index c6b13f8693bd..cd672ac0dc3a 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -1,25 +1,27 @@ # Owner(s): ["module: onnx"] +import io +import itertools + +import onnx from test_pytorch_common import TestCase, run_tests import torch import torch.onnx from torch.nn import Module - -import onnx - -import io - -from torch.onnx.symbolic_helper import _export_onnx_opset_version from torch.onnx import producer_name, producer_version +from torch.onnx._globals import GLOBALS -def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_version): +def check_onnx_opset_operator( + model, ops, opset_version=GLOBALS.export_onnx_opset_version +): # check_onnx_components assert ( - model.producer_name == producer_name and - model.producer_version == producer_version and - model.opset_import[0].version == opset_version) + model.producer_name == producer_name + and model.producer_version == producer_version + and model.opset_import[0].version == opset_version + ) # check the schema with the onnx checker onnx.checker.check_model(model) @@ -34,36 +36,48 @@ def check_onnx_opset_operator(model, ops, opset_version=_export_onnx_opset_versi assert len(ops) == len(graph.node) for i in range(0, len(ops)): assert graph.node[i].op_type == ops[i]["op_name"] - if "attributes" in ops[i] : + if "attributes" in ops[i]: attributes = ops[i]["attributes"] assert len(attributes) == len(graph.node[i].attribute) for j in range(0, len(attributes)): for attribute_field in attributes[j].keys(): - assert attributes[j][attribute_field] == getattr(graph.node[i].attribute[j], attribute_field) - - -def check_onnx_opsets_operator(module, x, ops, opset_versions, training=torch.onnx.TrainingMode.EVAL, - input_names=None, dynamic_axes=None): + assert attributes[j][attribute_field] == getattr( + graph.node[i].attribute[j], attribute_field + ) + + +def check_onnx_opsets_operator( + module, + x, + ops, + opset_versions, + training=torch.onnx.TrainingMode.EVAL, + input_names=None, + dynamic_axes=None, +): for opset_version in opset_versions: f = io.BytesIO() - torch.onnx.export(module, x, f, - opset_version=opset_version, - training=training, - input_names=input_names, - dynamic_axes=dynamic_axes) + torch.onnx.export( + module, + x, + f, + opset_version=opset_version, + training=training, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) model = onnx.load(io.BytesIO(f.getvalue())) check_onnx_opset_operator(model, ops[opset_version], opset_version) class TestONNXOpset(TestCase): - def test_opset_fallback(self): class MyModule(Module): def forward(self, x): return torch.isnan(x) - ops = [{"op_name" : "IsNaN"}] - ops = {9 : ops, 10 : ops} + ops = [{"op_name": "IsNaN"}] + ops = {9: ops, 10: ops} x = torch.tensor([1.0, float("nan"), 2.0]) 
check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) @@ -72,11 +86,20 @@ class MyModule(Module): def forward(self, x): return torch.topk(x, 3) - ops_9 = [{"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}, - {"name": "k", "i": 3, "type": 2}]}] - ops_10 = [{"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]}] + ops_9 = [ + { + "op_name": "TopK", + "attributes": [ + {"name": "axis", "i": -1, "type": 2}, + {"name": "k", "i": 3, "type": 2}, + ], + } + ] + ops_10 = [ + {"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]} + ] ops = {9: ops_9, 10: ops_10} - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) # test with dynamic k @@ -85,45 +108,61 @@ class MyModuleDynamic(torch.jit.ScriptModule): def forward(self, input, k): return torch.topk(input, k) - ops_10 = [{"op_name": "Constant", "attributes": [{"name": "value", "type": 4}]}, - {"op_name": "Reshape"}, - {"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]}] + ops_10 = [ + {"op_name": "Constant", "attributes": [{"name": "value", "type": 4}]}, + {"op_name": "Reshape"}, + {"op_name": "TopK", "attributes": [{"name": "axis", "i": -1, "type": 2}]}, + ] ops = {10: ops_10} - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) module = MyModuleDynamic() - check_onnx_opsets_operator(module, [x, k], ops, - opset_versions=[10]) + check_onnx_opsets_operator(module, [x, k], ops, opset_versions=[10]) def test_maxpool(self): module = torch.nn.MaxPool1d(2, stride=1) - ops_9 = [{"op_name" : "MaxPool", - "attributes" : - [{"name": "kernel_shape", "ints": [2], "type": 7}, - {"name": "pads", "ints": [0, 0], "type": 7}, - {"name": "strides", "ints": [1], "type": 7}]}] - ops_10 = [{"op_name" : "MaxPool", - "attributes" : - [{"name": "ceil_mode", "i": 0, "type": 2}, + ops_9 = [ + { + "op_name": "MaxPool", + "attributes": [ + {"name": "kernel_shape", "ints": [2], "type": 7}, + {"name": "pads", "ints": [0, 0], "type": 7}, + {"name": "strides", "ints": [1], "type": 7}, + ], + } + ] + ops_10 = [ + { + "op_name": "MaxPool", + "attributes": [ + {"name": "ceil_mode", "i": 0, "type": 2}, {"name": "kernel_shape", "ints": [2], "type": 7}, {"name": "pads", "ints": [0, 0], "type": 7}, - {"name": "strides", "ints": [1], "type": 7}]}] - ops = {9 : ops_9, 10 : ops_10} + {"name": "strides", "ints": [1], "type": 7}, + ], + } + ] + ops = {9: ops_9, 10: ops_10} x = torch.randn(20, 16, 50) check_onnx_opsets_operator(module, x, ops, opset_versions=[9, 10]) # add test with dilations module = torch.nn.MaxPool1d(2, stride=1, dilation=2) - ops_10 = [{"op_name" : "MaxPool", - "attributes" : - [{"name": "ceil_mode", "i": 0, "type": 2}, + ops_10 = [ + { + "op_name": "MaxPool", + "attributes": [ + {"name": "ceil_mode", "i": 0, "type": 2}, {"name": "dilations", "ints": [2], "type": 7}, {"name": "kernel_shape", "ints": [2], "type": 7}, {"name": "pads", "ints": [0, 0], "type": 7}, - {"name": "strides", "ints": [1], "type": 7}]}] - ops = {10 : ops_10} + {"name": "strides", "ints": [1], "type": 7}, + ], + } + ] + ops = {10: ops_10} x = torch.randn(20, 16, 50) check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) @@ -138,11 +177,23 @@ def forward(self, x): return torch.nn.functional.interpolate(x, size=size, mode="nearest") module = MyModule() - ops8 = [{"op_name" : "Upsample", "attributes" : [{"name": "mode", "s": 
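The TopK expectations above encode an opset difference: at opset 9 `k` is a node attribute, while from opset 10 onward it becomes a tensor input (an initializer when `k` is constant, or the extra Constant/Reshape nodes when it is dynamic, as the second test shows). A hedged sketch that simply prints the resulting node lists:

```python
import io

import onnx
import torch


class TopK3(torch.nn.Module):  # illustrative, mirrors the static-k test
    def forward(self, x):
        return torch.topk(x, 3)


x = torch.arange(1.0, 6.0)
for opset in (9, 10):
    f = io.BytesIO()
    torch.onnx.export(TopK3(), x, f, opset_version=opset)
    graph = onnx.load_from_string(f.getvalue()).graph
    print(opset, [n.op_type for n in graph.node])
```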
("nearest").encode(), "type": 3}, - {"name": "scales", "floats": [1.0, 1.0, 2.0, 2.0], "type": 6}]}] - ops9 = [{"op_name" : "Constant"}, - {"op_name" : "Upsample", "attributes" : [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops = {8 : ops8, 9 : ops9} + ops8 = [ + { + "op_name": "Upsample", + "attributes": [ + {"name": "mode", "s": ("nearest").encode(), "type": 3}, + {"name": "scales", "floats": [1.0, 1.0, 2.0, 2.0], "type": 6}, + ], + } + ] + ops9 = [ + {"op_name": "Constant"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops = {8: ops8, 9: ops9} x = torch.randn(2, 2, 2, 2) check_onnx_opsets_operator(module, x, ops, opset_versions=[8, 9]) @@ -155,11 +206,13 @@ def forward(self, x): return x - 1 module = MyModule() - ops_8 = [{"op_name" : "Constant"}, - {"op_name" : "Cast", "attributes": [{"name": "to", "i": 7, "type": 2}]}, - {"op_name" : "Sub"}] - ops_9 = [{"op_name" : "Constant"}, {"op_name" : "Sub"}] - ops = {8 : ops_8, 9 : ops_9} + ops_8 = [ + {"op_name": "Constant"}, + {"op_name": "Cast", "attributes": [{"name": "to", "i": 7, "type": 2}]}, + {"op_name": "Sub"}, + ] + ops_9 = [{"op_name": "Constant"}, {"op_name": "Sub"}] + ops = {8: ops_8, 9: ops_9} x = torch.ones(5, 6, dtype=torch.long) check_onnx_opsets_operator(module, x, ops, opset_versions=[8, 9]) @@ -168,48 +221,63 @@ class MyModule(Module): def forward(self, x): return x[0:1] - ops_9 = [{"op_name" : "Slice", - "attributes" : - [{"name": "axes", "ints": [0], "type": 7}, - {"name": "ends", "ints": [1], "type": 7}, - {"name": "starts", "ints": [0], "type": 7}]}] - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {9 : ops_9, 10 : ops_10} + ops_9 = [ + { + "op_name": "Slice", + "attributes": [ + {"name": "axes", "ints": [0], "type": 7}, + {"name": "ends", "ints": [1], "type": 7}, + {"name": "starts", "ints": [0], "type": 7}, + ], + } + ] + ops_10 = [ + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {9: ops_9, 10: ops_10} x = torch.randn(3) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return x[1:x.size(0)] + return x[1 : x.size(0)] module = DynamicSliceModel() x = torch.rand(1, 2) - ops_10 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather", - "attributes" : [{"name" : "axis", "i" : 0, "type" : 2}]}, - {"op_name" : "Unsqueeze", - "attributes" : [{"name" : "axes", "i" : 0, "type" : 7}]}, - {"op_name": "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {10 : ops_10} - check_onnx_opsets_operator(module, x, ops, opset_versions=[10], - input_names=['x'], dynamic_axes={"x": [0, 1]}) - - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {10 : ops_10} + ops_10 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather", "attributes": [{"name": "axis", "i": 0, "type": 2}]}, + { + "op_name": "Unsqueeze", + "attributes": [{"name": "axes", "i": 0, "type": 7}], + }, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {10: ops_10} + check_onnx_opsets_operator( + module, + x, + ops, + 
opset_versions=[10], + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) + + ops_10 = [ + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {10: ops_10} check_onnx_opsets_operator(module, x, ops, opset_versions=[10]) def test_flip(self): @@ -217,14 +285,16 @@ class MyModule(Module): def forward(self, x): return torch.flip(x, dims=[0]) - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice", - "attributes" : []}] - ops = {10 : ops_10} + ops_10 = [ + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice", "attributes": []}, + ] + ops = {10: ops_10} import numpy + x = torch.tensor(numpy.arange(6.0).reshape(2, 3)) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[10]) @@ -242,110 +312,145 @@ def forward(self, x): # we should only export the onnx Dropout op in training mode; test both modes # test training mode - ops = [{"op_name" : "Dropout", "attributes" : [{"name" : "ratio", "f" : 0.5, "type" : 1}]}] - ops = {9 : ops, 10 : ops} - check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10], training=torch.onnx.TrainingMode.TRAINING) + ops = [ + { + "op_name": "Dropout", + "attributes": [{"name": "ratio", "f": 0.5, "type": 1}], + } + ] + ops = {9: ops, 10: ops} + check_onnx_opsets_operator( + MyModule(), + x, + ops, + opset_versions=[9, 10], + training=torch.onnx.TrainingMode.TRAINING, + ) # test eval mode - ops = [{"op_name" : "Identity"}] - ops = {9 : ops, 10 : ops} - check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10], training=torch.onnx.TrainingMode.EVAL) + ops = [{"op_name": "Identity"}] + ops = {9: ops, 10: ops} + check_onnx_opsets_operator( + MyModule(), + x, + ops, + opset_versions=[9, 10], + training=torch.onnx.TrainingMode.EVAL, + ) def test_full(self): class MyModule(Module): def forward(self, x): return torch.full((3, 4), x) - ops = [{"op_name" : "Constant"}, - {"op_name" : "ConstantOfShape"}, - {"op_name" : "Add"}] - ops = {9 : ops, 10 : ops} - x = torch.tensor(12.) 
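The dropout test above checks that the exported graph depends on the training mode; a hedged sketch using a bare `nn.Dropout` as a stand-in for the test's module (the expected Dropout-vs-Identity outcome is what the test asserts, printed here rather than asserted):

```python
import io

import onnx
import torch

module = torch.nn.Dropout(0.5)  # stand-in for the test's module
x = torch.randn(1, 2, 3)

for mode in (torch.onnx.TrainingMode.TRAINING, torch.onnx.TrainingMode.EVAL):
    f = io.BytesIO()
    torch.onnx.export(module, x, f, opset_version=10, training=mode)
    graph = onnx.load_from_string(f.getvalue()).graph
    print(mode, [n.op_type for n in graph.node])  # expected: Dropout vs. Identity
```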
+ ops = [ + {"op_name": "Constant"}, + {"op_name": "ConstantOfShape"}, + {"op_name": "Add"}, + ] + ops = {9: ops, 10: ops} + x = torch.tensor(12.0) check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) def test_interpolate(self): class MyModel(torch.nn.Module): def forward(self, x): size = [v * 2 for v in x.size()[2:]] - return torch.nn.functional.interpolate(x, - size=size, - mode="nearest") - ops_9 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Concat"}, - {"op_name" : "Constant"}, - {"op_name" : "Cast"}, - {"op_name" : "Shape"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : "Concat"}, - {"op_name" : "Upsample", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Constant"}, - {"op_name" : "Mul"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Unsqueeze"}, - {"op_name" : "Concat"}, - {"op_name" : "Constant"}, - {"op_name" : "Cast"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : "Concat"}, - {"op_name" : "Resize", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - - ops = {9 : ops_9, 10 : ops_10} + return torch.nn.functional.interpolate(x, size=size, mode="nearest") + + ops_9 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Concat"}, + {"op_name": "Cast"}, + {"op_name": "Shape"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops_10 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Gather"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Constant"}, + {"op_name": "Mul"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Unsqueeze"}, + {"op_name": "Concat"}, + {"op_name": "Cast"}, + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + { + "op_name": "Resize", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + + ops = {9: ops_9, 10: ops_10} x = torch.randn(1, 2, 3, 4, requires_grad=True) - check_onnx_opsets_operator(MyModel(), x, ops, opset_versions=[9, 10], - input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) - - ops_9 = [{"op_name" : "Constant"}, - {"op_name" : "Shape"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : 
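The long node lists above reduce to one behavioral change: nearest-neighbor resizing exports as ONNX `Upsample` at opset 9 and as `Resize` from opset 10 on. A compact sketch (module and shapes are illustrative) that prints the op types for both opsets:

```python
import io

import onnx
import torch


class Upscale(torch.nn.Module):  # illustrative, fixed scale factor
    def forward(self, x):
        return torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")


x = torch.randn(1, 2, 3, 4)
for opset in (9, 10):
    f = io.BytesIO()
    torch.onnx.export(Upscale(), x, f, opset_version=opset)
    graph = onnx.load_from_string(f.getvalue()).graph
    print(opset, [n.op_type for n in graph.node])  # Upsample at 9, Resize at 10+
```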
"Concat"}, - {"op_name" : "Upsample", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Constant"}, - {"op_name" : "Slice"}, - {"op_name" : "Cast"}, - {"op_name" : "Div"}, - {"op_name" : "Concat"}, - {"op_name" : "Resize"}] - - ops = {9 : ops_9, 10 : ops_10} + check_onnx_opsets_operator( + MyModel(), + x, + ops, + opset_versions=[9, 10], + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) + + ops_9 = [ + {"op_name": "Shape"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops_10 = [ + {"op_name": "Shape"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Constant"}, + {"op_name": "Slice"}, + {"op_name": "Cast"}, + {"op_name": "Div"}, + {"op_name": "Constant"}, + {"op_name": "Concat"}, + {"op_name": "Resize"}, + ] + + ops = {9: ops_9, 10: ops_10} x = torch.randn(1, 2, 3, 4, requires_grad=True) check_onnx_opsets_operator(MyModel(), x, ops, opset_versions=[9, 10]) @@ -354,21 +459,64 @@ def forward(self, x): size = [v * 2 for v in x.size()[2:]] # work around for now: turn the dynamic sizes into constant size = [int(i) for i in size] - return torch.nn.functional.interpolate(x, - size=size, - mode="nearest") - ops_9 = [{"op_name" : "Constant"}, - {"op_name" : "Upsample", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Resize", - "attributes" : - [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops = {9 : ops_9, 10 : ops_10} + return torch.nn.functional.interpolate(x, size=size, mode="nearest") + + ops_9 = [ + {"op_name": "Constant"}, + { + "op_name": "Upsample", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops_10 = [ + {"op_name": "Constant"}, + { + "op_name": "Resize", + "attributes": [{"name": "mode", "s": ("nearest").encode(), "type": 3}], + }, + ] + ops = {9: ops_9, 10: ops_10} x = torch.randn(20, 16, 50) check_onnx_opsets_operator(MyDynamicModel(), x, ops, opset_versions=[9, 10]) + def test_grid_sample(self): + n, c, h_in, w_in, h_out, w_out = 1, 1, 3, 2, 2, 4 + ops = {16: [{"op_name": "GridSample"}]} + + class MyModule(Module): + def forward(self, x, grid, mode, padding_mode, align_corers): + return torch.nn.functional.grid_sample( + x, grid, mode, padding_mode, align_corners + ) + + for mode, padding_mode, align_corners in itertools.product( + ("bilinear", "nearest", "bicubic"), + ("zeros", "border", "reflection"), + (True, False), + ): + + args = ( + torch.randn(n, c, h_in, w_in), # x + torch.randn(n, h_out, w_out, 2), # grid, + mode, + padding_mode, + align_corners, + ) + check_onnx_opsets_operator( + MyModule(), + args, + ops, + opset_versions=[16], + training=torch.onnx.TrainingMode.TRAINING, + ) + check_onnx_opsets_operator( + MyModule(), + args, + ops, + opset_versions=[16], + training=torch.onnx.TrainingMode.EVAL, + ) + if __name__ == "__main__": run_tests() diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index ca69f0fb0306..1cded5a9b9f2 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -1,31 +1,52 @@ # Owner(s): ["module: onnx"] -from test_pytorch_common import TestCase, run_tests, flatten, skipIfNoLapack, \ - BATCH_SIZE, 
RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE - -import torch -import torch.onnx -from torch.onnx.symbolic_helper import parse_args, _get_tensor_dim_size, _get_tensor_sizes -from torch.onnx import register_custom_op_symbolic, unregister_custom_op_symbolic -from torch.autograd import Variable, Function -from torch.nn import Module, functional -import torch.nn as nn -import torch.nn.functional as F - -import itertools -import io -import inspect import glob +import inspect +import io +import itertools import os import shutil import tempfile -import torch.testing._internal.common_utils as common -'''Usage: python test/onnx/test_operators.py [--no-onnx] [--produce-onnx-test-data] +from test_pytorch_common import ( + BATCH_SIZE, + RNN_HIDDEN_SIZE, + RNN_INPUT_SIZE, + RNN_SEQUENCE_LENGTH, + TestCase, + flatten, + run_tests, + skipIfNoLapack, +) + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.onnx +import torch.testing._internal.common_utils as common +from torch.autograd import Function, Variable +from torch.nn import Module, functional +from torch.onnx import ( + register_custom_op_symbolic, + unregister_custom_op_symbolic, +) +from torch.onnx.symbolic_helper import ( + _get_tensor_dim_size, + _get_tensor_sizes, + parse_args, +) +from torch.testing._internal.common_utils import skipIfCaffe2 + +"""Usage: python test/onnx/test_operators.py [--no-onnx] [--produce-onnx-test-data] --no-onnx: no onnx python dependence --produce-onnx-test-data: generate onnx test data --accept: accept onnx updates and overwrite models -''' +""" + +# Full diff for expect files +import unittest + +unittest.TestCase.maxDiff = None _onnx_test = False # flag to produce onnx test cases. _onnx_dep = True # flag to import onnx package. @@ -33,7 +54,8 @@ def export_to_pbtxt(model, inputs, *args, **kwargs): return torch.onnx.export_to_pretty_string( - model, inputs, google_printer=True, *args, **kwargs) + model, inputs, google_printer=True, *args, **kwargs + ) def export_to_pb(model, inputs, *args, **kwargs): @@ -56,7 +78,6 @@ def forward(self, *args): class TestOperators(TestCase): - def assertONNX(self, f, args, params=None, **kwargs): if params is None: params = () @@ -74,16 +95,21 @@ def assertONNX(self, f, args, params=None, **kwargs): import onnx.checker import onnx.numpy_helper import test_onnx_common + model_def = onnx.ModelProto.FromString(onnx_model_pb) onnx.checker.check_model(model_def) if _onnx_test: test_function = inspect.stack()[1][0].f_code.co_name test_name = test_function[0:4] + "_operator" + test_function[4:] - output_dir = os.path.join(test_onnx_common.pytorch_operator_dir, test_name) + output_dir = os.path.join( + test_onnx_common.pytorch_operator_dir, test_name + ) # Assume: # 1) the old test should be delete before the test. # 2) only one assertONNX in each test, otherwise will override the data. 
- assert not os.path.exists(output_dir), "{} should not exist!".format(output_dir) + assert not os.path.exists(output_dir), "{} should not exist!".format( + output_dir + ) os.makedirs(output_dir) with open(os.path.join(output_dir, "model.onnx"), "wb") as file: file.write(model_def.SerializeToString()) @@ -93,14 +119,18 @@ def assertONNX(self, f, args, params=None, **kwargs): args = (args,) for index, var in enumerate(flatten(args)): tensor = onnx.numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "input_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "input_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) outputs = m(*args) if isinstance(outputs, Variable): outputs = (outputs,) for index, var in enumerate(flatten(outputs)): tensor = onnx.numpy_helper.from_array(var.data.numpy()) - with open(os.path.join(data_dir, "output_{}.pb".format(index)), "wb") as file: + with open( + os.path.join(data_dir, "output_{}.pb".format(index)), "wb" + ) as file: file.write(tensor.SerializeToString()) def assertONNXRaises(self, err, f, args, params=None, **kwargs): @@ -181,11 +211,15 @@ def test_chunk(self): self.assertONNX(lambda x: x.chunk(2), x) def test_split(self): - x = torch.tensor([[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]]) + x = torch.tensor( + [[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]] + ) self.assertONNX(lambda x: torch.split(x, 2, 1), x) def test_split_with_sizes(self): - x = torch.tensor([[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]]) + x = torch.tensor( + [[0.0, 1.0, 1.0, 0.0, 2.0, 2.0], [2.0, 3.0, 3.0, 2.0, 1.0, 1.0]] + ) self.assertONNX(lambda x: torch.split(x, [2, 1, 3], 1), x) def test_concat2(self): @@ -202,27 +236,39 @@ def test_addmm(self): m1 = torch.randn(2, 3, requires_grad=True) m2 = torch.randn(3, 4, requires_grad=True) m3 = torch.randn(4, requires_grad=True) - self.assertONNX(lambda x, y, z: torch.addmm(torch.addmm(z, x, y), x, y), (m1, m2, m3)) + self.assertONNX( + lambda x, y, z: torch.addmm(torch.addmm(z, x, y), x, y), (m1, m2, m3) + ) def test_permute2(self): x = torch.tensor([[[[[[0.0]]]]]], requires_grad=True) self.assertONNX(lambda x: x.permute(0, 1, 4, 2, 5, 3), x) def test_pad(self): - x = torch.tensor([[[[0.0, 1.0, 1.0, 1.0], [2.0, 3.0, 7.0, 7.0]]]], requires_grad=True) + x = torch.tensor( + [[[[0.0, 1.0, 1.0, 1.0], [2.0, 3.0, 7.0, 7.0]]]], requires_grad=True + ) self.assertONNX(nn.ReflectionPad2d((2, 3, 0, 1)), x) def test_params(self): x = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True) y = nn.Parameter(torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)) - self.assertONNX(lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), x, params=(y, ), - keep_initializers_as_inputs=True) + self.assertONNX( + lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), + x, + params=(y,), + keep_initializers_as_inputs=True, + ) def test_params_onnx_irv4(self): x = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True) y = nn.Parameter(torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)) - self.assertONNX(lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), x, params=(y, ), - keep_initializers_as_inputs=False) + self.assertONNX( + lambda x, y: -torch.sigmoid(torch.tanh(x * (x + y))), + x, + params=(y,), + keep_initializers_as_inputs=False, + ) def test_symbolic_mismatch(self): class MyFun(Function): @@ -258,11 +304,18 @@ def test_batchnorm_1d(self): def test_batchnorm_training(self): x = torch.ones(2, 2, 2, 2, 
requires_grad=True) - self.assertONNX(nn.BatchNorm2d(2), x, training=torch.onnx.TrainingMode.TRAINING, keep_initializers_as_inputs=True) + self.assertONNX( + nn.BatchNorm2d(2), + x, + training=torch.onnx.TrainingMode.TRAINING, + keep_initializers_as_inputs=True, + ) def test_conv(self): x = torch.ones(20, 16, 50, 40, requires_grad=True) - self.assertONNX(nn.Conv2d(16, 13, 3, bias=False), x, keep_initializers_as_inputs=True) + self.assertONNX( + nn.Conv2d(16, 13, 3, bias=False), x, keep_initializers_as_inputs=True + ) def test_conv_onnx_irv4(self): x = torch.ones(20, 16, 50, 40, requires_grad=True) @@ -276,35 +329,67 @@ def test_conv_onnx_irv4_opset8(self): x = torch.ones(1, 2, 5, 7, requires_grad=True) conv_node = nn.Conv2d(2, 4, 3, bias=False) conv_node.weight.data.fill_(1.0) - self.assertONNX(conv_node, x, opset_version=8, keep_initializers_as_inputs=False) + self.assertONNX( + conv_node, x, opset_version=8, keep_initializers_as_inputs=False + ) def test_conv_variable_length(self): x = torch.ones(5, 3, 6, 6, requires_grad=True) model = torch.nn.Conv2d(3, 2, 3) - dynamic_axes = {"input_1": [0, 2, 3], "output_1": {0: "output_1_variable_dim_0", 1: "output_1_variable_dim_1"}} + dynamic_axes = { + "input_1": [0, 2, 3], + "output_1": {0: "output_1_variable_dim_0", 1: "output_1_variable_dim_1"}, + } model_proto_file = tempfile.NamedTemporaryFile() - torch.onnx.export(model, x, model_proto_file.name, verbose=True, input_names=["input_1"], output_names=["output_1"], - dynamic_axes=dynamic_axes) + torch.onnx.export( + model, + x, + model_proto_file.name, + verbose=True, + input_names=["input_1"], + output_names=["output_1"], + dynamic_axes=dynamic_axes, + ) import onnx + onnx_model = onnx.load(model_proto_file.name) onnx.checker.check_model(onnx_model) # Asserting the default dynamic axes names are generated when custom names are not provided - assert(onnx_model.graph.input[0].type.tensor_type.shape.dim[0].dim_param == "input_1_dynamic_axes_1") - assert(onnx_model.graph.input[0].type.tensor_type.shape.dim[2].dim_param == "input_1_dynamic_axes_2") - assert(onnx_model.graph.input[0].type.tensor_type.shape.dim[3].dim_param == "input_1_dynamic_axes_3") + assert ( + onnx_model.graph.input[0].type.tensor_type.shape.dim[0].dim_param + == "input_1_dynamic_axes_1" + ) + assert ( + onnx_model.graph.input[0].type.tensor_type.shape.dim[2].dim_param + == "input_1_dynamic_axes_2" + ) + assert ( + onnx_model.graph.input[0].type.tensor_type.shape.dim[3].dim_param + == "input_1_dynamic_axes_3" + ) # Asserting the custom names are applied when provided - assert(onnx_model.graph.output[0].type.tensor_type.shape.dim[0].dim_param == "output_1_variable_dim_0") - assert(onnx_model.graph.output[0].type.tensor_type.shape.dim[1].dim_param == "output_1_variable_dim_1") + assert ( + onnx_model.graph.output[0].type.tensor_type.shape.dim[0].dim_param + == "output_1_variable_dim_0" + ) + assert ( + onnx_model.graph.output[0].type.tensor_type.shape.dim[1].dim_param + == "output_1_variable_dim_1" + ) def test_convtranspose(self): x = torch.ones(2, 3, 4, 5, requires_grad=True) - self.assertONNX(nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, - padding=1, output_padding=2), x, - keep_initializers_as_inputs=True) + self.assertONNX( + nn.ConvTranspose2d( + 3, 3, 3, stride=3, bias=False, padding=1, output_padding=2 + ), + x, + keep_initializers_as_inputs=True, + ) def test_maxpool(self): x = torch.randn(20, 16, 50) @@ -322,11 +407,11 @@ def test_maxpool_indices(self): x = torch.randn(20, 16, 50) self.assertONNX(nn.MaxPool1d(3, 
stride=2, return_indices=True), x) + @skipIfCaffe2 def test_at_op(self): x = torch.randn(3, 4) class MyFun(Function): - @staticmethod def symbolic(g, x): return g.at("add", x, x) @@ -339,7 +424,11 @@ class MyModule(Module): def forward(self, x): return MyFun.apply(x) - self.assertONNX(MyModule(), x) + self.assertONNX( + MyModule(), + x, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_clip(self): x = torch.randn(3, 4, requires_grad=True) @@ -359,7 +448,7 @@ def test_hardtanh(self): def test_full(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.full(x.shape, 2.), x) + self.assertONNX(lambda x: torch.full(x.shape, 2.0), x) def test_full_like(self): x = torch.randn(3, 4, requires_grad=True) @@ -498,7 +587,7 @@ def test_slice(self): def test_slice_dynamic(self): x = torch.rand(3, 4, requires_grad=True) - self.assertONNX(lambda x: x[x.size(0):, x.size(1) - 3], x, opset_version=10) + self.assertONNX(lambda x: x[x.size(0) :, x.size(1) - 3], x, opset_version=10) def test_sign(self): x = torch.rand(3, 4, requires_grad=True) @@ -567,17 +656,24 @@ def test_norm_p2(self): def test_upsample_nearest_scale(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., - mode="nearest", recompute_scale_factor=False), x) + self.assertONNX( + lambda x: nn.functional.interpolate( + x, scale_factor=2.0, mode="nearest", recompute_scale_factor=False + ), + x, + ) def test_upsample_nearest_scale_default_scale_factor(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: nn.functional.interpolate(x, scale_factor=2., - mode="nearest"), x) + self.assertONNX( + lambda x: nn.functional.interpolate(x, scale_factor=2.0, mode="nearest"), x + ) def test_upsample_nearest_size(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: nn.functional.interpolate(x, size=16, mode="nearest"), x) + self.assertONNX( + lambda x: nn.functional.interpolate(x, size=16, mode="nearest"), x + ) def test_unsqueeze(self): x = torch.randn(3, 4, requires_grad=True) @@ -585,15 +681,23 @@ def test_unsqueeze(self): def test_batchnorm_noaffine(self): x = torch.randn(128, 128, 1, 1, requires_grad=True) - self.assertONNX(nn.BatchNorm2d(128, affine=False, momentum=0.3), x, - keep_initializers_as_inputs=True) + self.assertONNX( + nn.BatchNorm2d(128, affine=False, momentum=0.3), + x, + keep_initializers_as_inputs=True, + ) + @skipIfCaffe2 def test_embedding_bags(self): emb_bag = nn.EmbeddingBag(10, 8) input = torch.tensor([1, 2, 3, 4]).long() offset = torch.tensor([0]).long() - self.assertONNX(emb_bag, (input, offset), keep_initializers_as_inputs=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.assertONNX( + emb_bag, + (input, offset), + keep_initializers_as_inputs=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_implicit_expand(self): x = torch.randn(3, 4, requires_grad=True) @@ -625,8 +729,9 @@ def test_log_sigmoid(self): def test_linear(self): x = torch.randn(3, 4) - self.assertONNX(torch.nn.Linear(4, 5, bias=True), x, - keep_initializers_as_inputs=True) + self.assertONNX( + torch.nn.Linear(4, 5, bias=True), x, keep_initializers_as_inputs=True + ) def test_empty_like(self): x = torch.randn(5, 8, requires_grad=True) @@ -671,22 +776,44 @@ def test_dropout(self): def test_dropout_default(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: 
torch.max(functional.dropout(x,)), x) + self.assertONNX( + lambda x: torch.max( + functional.dropout( + x, + ) + ), + x, + ) def test_dropout_training(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.max(functional.dropout(x)), x, training=torch.onnx.TrainingMode.TRAINING) + self.assertONNX( + lambda x: torch.max(functional.dropout(x)), + x, + training=torch.onnx.TrainingMode.TRAINING, + ) def test_dropout_opset12(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.max(functional.dropout(x, training=False)), x, opset_version=12) + self.assertONNX( + lambda x: torch.max(functional.dropout(x, training=False)), + x, + opset_version=12, + ) def test_dropout_training_opset12(self): x = torch.randn(3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.max(functional.dropout(x)), x, opset_version=12, training=torch.onnx.TrainingMode.TRAINING) + self.assertONNX( + lambda x: torch.max(functional.dropout(x)), + x, + opset_version=12, + training=torch.onnx.TrainingMode.TRAINING, + ) def test_nonzero(self): - x = torch.tensor([[[2., 2.], [1., 0.]], [[0., 0.], [1., 1.]]], requires_grad=True) + x = torch.tensor( + [[[2.0, 2.0], [1.0, 0.0]], [[0.0, 0.0], [1.0, 1.0]]], requires_grad=True + ) self.assertONNX(lambda x: torch.nonzero(x), x) def test_gather(self): @@ -697,19 +824,28 @@ def test_gather(self): def test_gather_opset11(self): data = torch.randn(3, 4, 3, requires_grad=True) index = torch.tensor([2, 0]).view(1, 2, 1).expand(3, 2, 3) - self.assertONNX(lambda data, index: data.gather(1, index), (data, index), opset_version=11) + self.assertONNX( + lambda data, index: data.gather(1, index), (data, index), opset_version=11 + ) def test_scatter_add(self): - data = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]) + data = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) - self.assertONNX(lambda data, index: data.scatter_add(1, indices, values), (data, (indices, values))) + self.assertONNX( + lambda data, index: data.scatter_add(1, indices, values), + (data, (indices, values)), + ) def test_scatter_add_opset11(self): - data = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]) + data = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) - self.assertONNX(lambda data, index: data.scatter_add(1, indices, values), (data, (indices, values)), opset_version=11) + self.assertONNX( + lambda data, index: data.scatter_add(1, indices, values), + (data, (indices, values)), + opset_version=11, + ) def test_master_opset(self): x = torch.randn(2, 3).float() @@ -718,54 +854,58 @@ def test_master_opset(self): def test_std(self): x = torch.randn(2, 3, 4).float() - self.assertONNX(lambda x: torch.std(x, dim=(0, 1), unbiased=True, keepdim=True), x) + self.assertONNX( + lambda x: torch.std(x, dim=(0, 1), unbiased=True, keepdim=True), x + ) def test_cumsum(self): x = torch.randn(2, 3, 4, requires_grad=True) self.assertONNX(lambda x: torch.cumsum(x, dim=1), x, opset_version=11) -# Github Issue: https://github.com/pytorch/pytorch/issues/71095 -# def test_c2_op(self): -# class MyModel(torch.nn.Module): -# def __init__(self): -# super(MyModel, self).__init__() -# -# def forward(self, scores, bbox_deltas, im_info, anchors): -# a, b = 
torch.ops._caffe2.GenerateProposals( -# (scores), (bbox_deltas), (im_info), (anchors), -# 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, -# ) -# return a, b -# -# model = MyModel() -# A = 4 -# H = 10 -# W = 8 -# img_count = 3 -# scores = torch.ones(img_count, A, H, W, dtype=torch.float32) -# bbox_deltas = torch.linspace(0, 10, steps=img_count * 4 * A * H * W, -# dtype=torch.float32) -# bbox_deltas = bbox_deltas.view(img_count, 4 * A, H, W) -# im_info = torch.ones(img_count, 3, dtype=torch.float32) -# anchors = torch.ones(A, 4, dtype=torch.float32) -# inputs = (scores, bbox_deltas, im_info, anchors) -# self.assertONNX(model, inputs, custom_opsets={"org.pytorch._caffe2": 0}) + # Github Issue: https://github.com/pytorch/pytorch/issues/71095 + # def test_c2_op(self): + # class MyModel(torch.nn.Module): + # def __init__(self): + # super(MyModel, self).__init__() + # + # def forward(self, scores, bbox_deltas, im_info, anchors): + # a, b = torch.ops._caffe2.GenerateProposals( + # (scores), (bbox_deltas), (im_info), (anchors), + # 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, + # ) + # return a, b + # + # model = MyModel() + # A = 4 + # H = 10 + # W = 8 + # img_count = 3 + # scores = torch.ones(img_count, A, H, W, dtype=torch.float32) + # bbox_deltas = torch.linspace(0, 10, steps=img_count * 4 * A * H * W, + # dtype=torch.float32) + # bbox_deltas = bbox_deltas.view(img_count, 4 * A, H, W) + # im_info = torch.ones(img_count, 3, dtype=torch.float32) + # anchors = torch.ones(A, 4, dtype=torch.float32) + # inputs = (scores, bbox_deltas, im_info, anchors) + # self.assertONNX(model, inputs, custom_opsets={"org.pytorch._caffe2": 0}) def test_dict(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in[list(x_in.keys())[0]], list(x_in.keys())[0]) + x_out["test_key_out"] = torch.add( + x_in[list(x_in.keys())[0]], list(x_in.keys())[0] + ) return x_out - x = {torch.tensor(1.): torch.randn(1, 2, 3)} + x = {torch.tensor(1.0): torch.randn(1, 2, 3)} self.assertONNX(MyModel(), (x, {})) def test_dict_str(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.) 
+ x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.0) return x_out x = {"test_key_in": torch.randn(1, 2, 3)} @@ -781,21 +921,27 @@ def forward(self, input): def test_bitshift(self): class BitshiftModel(torch.nn.Module): - def forward(self, input, input2): - return input >> 1, input2 >> 2 - input = torch.arange(24, dtype=torch.float32).reshape(3, 4, 2) - input2 = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) - self.assertONNX(BitshiftModel(), (input, input2), opset_version=11) + def forward(self, input): + return input >> 1, input >> 2 + + input = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) + self.assertONNX(BitshiftModel(), input, opset_version=11) + @skipIfCaffe2 def test_layer_norm_aten(self): model = torch.nn.LayerNorm([10, 10]) x = torch.randn(20, 5, 10, 10) - self.assertONNX(model, x, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.assertONNX( + model, + x, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_pixel_shuffle(self): x = torch.randn(2, 8, 3, 4).float() - self.assertONNX(lambda x: torch.pixel_shuffle(x, upscale_factor=2), x, opset_version=11) + self.assertONNX( + lambda x: torch.pixel_shuffle(x, upscale_factor=2), x, opset_version=11 + ) def test_frobenius_norm(self): x = torch.randn(2, 3, 4).float() @@ -821,8 +967,13 @@ def test_gelu(self): def test_unique(self): x = torch.randint(3, (2, 3, 4, 5)).float() - self.assertONNX(lambda x: torch.unique(x, dim=0, sorted=True, return_inverse=False, return_counts=True), x, - opset_version=11) + self.assertONNX( + lambda x: torch.unique( + x, dim=0, sorted=True, return_inverse=False, return_counts=True + ), + x, + opset_version=11, + ) def test_meshgrid(self): x = torch.ones(3, requires_grad=True) @@ -831,14 +982,18 @@ def test_meshgrid(self): self.assertONNX(lambda x, y, z: torch.meshgrid(x, y, z), (x, y, z)) def test_topk(self): - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) self.assertONNX(lambda x, k: torch.topk(x, k), (x, k), opset_version=10) def test_topk_smallest_unsorted(self): - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) - self.assertONNX(lambda x, k: torch.topk(x, k, largest=False, sorted=False), (x, k), opset_version=11) + self.assertONNX( + lambda x, k: torch.topk(x, k, largest=False, sorted=False), + (x, k), + opset_version=11, + ) def test_baddbmm(self): x = torch.randn(10, 3, 5) @@ -868,12 +1023,16 @@ def test_softmaxcrossentropy(self): def test_softmaxcrossentropy_ignore_index(self): x = torch.randn(3, 5) y = torch.empty(3, dtype=torch.long).random_(5) - self.assertONNX(torch.nn.CrossEntropyLoss(ignore_index=1), (x, y), opset_version=12) + self.assertONNX( + torch.nn.CrossEntropyLoss(ignore_index=1), (x, y), opset_version=12 + ) def test_softmaxcrossentropy_weights(self): x = torch.randn(3, 5) y = torch.empty(3, dtype=torch.long).random_(5) - self.assertONNX(torch.nn.CrossEntropyLoss(weight=torch.randn(5)), (x, y), opset_version=12) + self.assertONNX( + torch.nn.CrossEntropyLoss(weight=torch.randn(5)), (x, y), opset_version=12 + ) def test_softmaxcrossentropy_3d(self): x = torch.randn(3, 5, 2) @@ -883,7 +1042,9 @@ def test_softmaxcrossentropy_3d(self): def test_softmaxcrossentropy_3d_none(self): x = torch.randn(3, 5, 2) y = torch.empty(3, 2, dtype=torch.long).random_(5) - self.assertONNX(torch.nn.CrossEntropyLoss(reduction="none"), (x, y), opset_version=12) + self.assertONNX( + 
torch.nn.CrossEntropyLoss(reduction="none"), (x, y), opset_version=12 + ) def test_softmaxcrossentropy_4d(self): x = torch.randn(3, 5, 2, 1) @@ -899,66 +1060,96 @@ def test_lstm_none_sequence_lens(self): class LSTMModel(torch.nn.Module): def __init__(self): super().__init__() - self.rnn = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.rnn = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) def forward(self, x, h0, c0): a, b = self.rnn(x, (h0, c0)) return torch.ones(b[0].shape) - self.assertONNX(LSTMModel(), - (input, h0, c0), input_names=["x", "y"], - dynamic_axes={"x" : {0: 'batch'}}, opset_version=12) + self.assertONNX( + LSTMModel(), + (input, h0, c0), + input_names=["x", "y"], + dynamic_axes={"x": {0: "batch"}}, + opset_version=12, + ) def test_dynamic_axes_add(self): m1 = torch.randn(2, 3, requires_grad=True) m2 = torch.randn(2, 1, requires_grad=True) - self.assertONNX(lambda x, y: torch.add(x, y), (m1, m2), input_names=["input_1", "input_2"], - dynamic_axes={"input_1": {1: "dim_1"}, "input_2": {1: "dim_2"}}, - opset_version=12) + self.assertONNX( + lambda x, y: torch.add(x, y), + (m1, m2), + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": {1: "dim_1"}, "input_2": {1: "dim_2"}}, + opset_version=12, + ) def test_dynamic_axes_add_inputs_same_symbolic_shape(self): m1 = torch.randn(2, 3, requires_grad=True) - self.assertONNX(lambda x: torch.add(x, x), (m1,), input_names=["input_1"], - dynamic_axes={"input_1": {1: "dim_1"}}, - opset_version=12) + self.assertONNX( + lambda x: torch.add(x, x), + (m1,), + input_names=["input_1"], + dynamic_axes={"input_1": {1: "dim_1"}}, + opset_version=12, + ) def test_dynamic_axes_matmul(self): m1 = torch.randn(2, 2, 4, requires_grad=True) m2 = torch.randn(2, 4, 3, requires_grad=True) - self.assertONNX(lambda x, y: torch.matmul(x, y), (m1, m2), input_names=["input_1", "input_2"], - dynamic_axes={"input_1": {1: "dim_0"}, "input_2": {2: "dim_1"}}, - opset_version=12) + self.assertONNX( + lambda x, y: torch.matmul(x, y), + (m1, m2), + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": {1: "dim_0"}, "input_2": {2: "dim_1"}}, + opset_version=12, + ) def test_dynamic_axes_reduce_mean(self): m1 = torch.randn(2, 3, 4, requires_grad=True) - self.assertONNX(lambda x: torch.mean(x, dim=1), (m1), input_names=["input"], - dynamic_axes={"input": {1: "dim_1", 2: "dim_2"}}, - opset_version=12) + self.assertONNX( + lambda x: torch.mean(x, dim=1), + (m1), + input_names=["input"], + dynamic_axes={"input": {1: "dim_1", 2: "dim_2"}}, + opset_version=12, + ) def test_dynamic_axes_unchange(self): """Test ProcessUnchangeNode in symbolic shape inference.""" m1 = torch.randn(2, 3, requires_grad=True) - self.assertONNX(lambda x: torch.softmax(x, dim=0), (m1,), input_names=["input"], - dynamic_axes={"input": {1: "dim_1"}}, - opset_version=12) + self.assertONNX( + lambda x: torch.softmax(x, dim=0), + (m1,), + input_names=["input"], + dynamic_axes={"input": {1: "dim_1"}}, + opset_version=12, + ) def test_aten_embedding_1(self): _onnx_opset_version = 12 - @parse_args('v', 'v', 'i', 'b', 'b') + @parse_args("v", "v", "i", "b", "b") def embedding(g, weight, indices, padding_idx, scale_grad_by_freq, sparse): custom_attributes_json = ( - '{' + "{" f'"padding_idx":{str(padding_idx)},' f'"scale_grad_by_freq":{str(scale_grad_by_freq).lower()},' f'"sparse":{str(sparse).lower()}' - '}' + "}" + ) + output = g.at( + "embedding", + weight, + indices, + custom_attributes_json_s=custom_attributes_json, ) - output = 
g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding', - custom_attributes_json_s=custom_attributes_json) return output - register_custom_op_symbolic('::embedding', embedding, _onnx_opset_version) + register_custom_op_symbolic("::embedding", embedding, _onnx_opset_version) class Model(torch.nn.Module): def __init__(self): @@ -975,32 +1166,39 @@ def forward(self, x, y): y = torch.randn(1, 8) self.assertONNX(model, (x, y), opset_version=_onnx_opset_version) - unregister_custom_op_symbolic('::embedding', _onnx_opset_version) + unregister_custom_op_symbolic("::embedding", _onnx_opset_version) # This is test_aten_embedding_1 with shape inference on custom symbolic aten::embedding. + @skipIfCaffe2 def test_aten_embedding_2(self): _onnx_opset_version = 12 - @parse_args('v', 'v', 'i', 'b', 'b') + @parse_args("v", "v", "i", "b", "b") def embedding(g, weight, indices, padding_idx, scale_grad_by_freq, sparse): custom_attributes_json = ( - '{' + "{" f'"padding_idx":{str(padding_idx)},' f'"scale_grad_by_freq":{str(scale_grad_by_freq).lower()},' f'"sparse":{str(sparse).lower()}' - '}' + "}" + ) + output = g.at( + "embedding", + weight, + indices, + custom_attributes_json_s=custom_attributes_json, ) - output = g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding', - custom_attributes_json_s=custom_attributes_json) # do shape inference and set it via setType indices_shape = _get_tensor_sizes(indices) - if indices_shape is not None and hasattr(weight.type(), 'with_sizes'): - output_type = weight.type().with_sizes(indices_shape + [_get_tensor_dim_size(weight, 1)]) + if indices_shape is not None and hasattr(weight.type(), "with_sizes"): + output_type = weight.type().with_sizes( + indices_shape + [_get_tensor_dim_size(weight, 1)] + ) output.setType(output_type) return output - register_custom_op_symbolic('::embedding', embedding, _onnx_opset_version) + register_custom_op_symbolic("::embedding", embedding, _onnx_opset_version) class Model(torch.nn.Module): def __init__(self): @@ -1015,10 +1213,17 @@ def forward(self, x, y): model = Model() x = torch.ones(32, dtype=torch.long) y = torch.randn(1, 8) - self.assertONNX(model, (x, y), opset_version=_onnx_opset_version, input_names=['input_1', 'input_2'], - dynamic_axes={"input_1": {0: "dim_0"}, 'input_2': {0: "dim_1", 1: "dim_2"}}) - - unregister_custom_op_symbolic('::embedding', _onnx_opset_version) + self.assertONNX( + model, + (x, y), + opset_version=_onnx_opset_version, + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": {0: "dim_0"}, "input_2": {0: "dim_1", 1: "dim_2"}}, + keep_initializers_as_inputs=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + + unregister_custom_op_symbolic("::embedding", _onnx_opset_version) # Without shapeValueMap, the onnx graph looks like: # graph(%0 : Float(*, 1, 128, 1, strides=[128, 128, 1, 1], requires_grad=0, device=cpu)): @@ -1048,12 +1253,17 @@ def forward(self, x): x = F.softmax(x, dim=1) x = x.reshape(batch, -1) return x + radix = 2 cardinality = 1 x = torch.randn(10, 1, 128, 1) - self.assertONNX(RSoftMax(radix, cardinality), (x,), - input_names=["x"], - dynamic_axes={"x": {0: "dim_0"}}) + self.assertONNX( + RSoftMax(radix, cardinality), + (x,), + input_names=["x"], + dynamic_axes={"x": {0: "dim_0"}}, + ) + if __name__ == "__main__": no_onnx_dep_flag = "--no-onnx" @@ -1067,6 +1277,9 @@ def forward(self, x): if _onnx_test: _onnx_dep = True import test_onnx_common - for d in glob.glob(os.path.join(test_onnx_common.pytorch_operator_dir, 
"test_operator_*")): + + for d in glob.glob( + os.path.join(test_onnx_common.pytorch_operator_dir, "test_operator_*") + ): shutil.rmtree(d) run_tests() diff --git a/test/onnx/test_pytorch_common.py b/test/onnx/test_pytorch_common.py index 13b4585a5def..44ccc303cff7 100644 --- a/test/onnx/test_pytorch_common.py +++ b/test/onnx/test_pytorch_common.py @@ -2,8 +2,9 @@ import functools import os -import unittest import sys +import unittest + import torch import torch.autograd.function as function @@ -29,18 +30,19 @@ def wrapper(*args, **kwargs): if condition(): raise unittest.SkipTest(reason) return f(*args, **kwargs) + return wrapper + return decorator -skipIfNoCuda = _skipper(lambda: not torch.cuda.is_available(), - "CUDA is not available") +skipIfNoCuda = _skipper(lambda: not torch.cuda.is_available(), "CUDA is not available") -skipIfTravis = _skipper(lambda: os.getenv("TRAVIS"), - "Skip In Travis") +skipIfTravis = _skipper(lambda: os.getenv("TRAVIS"), "Skip In Travis") -skipIfNoBFloat16Cuda = _skipper(lambda: not torch.cuda.is_bf16_supported(), - "BFloat16 CUDA is not available") +skipIfNoBFloat16Cuda = _skipper( + lambda: not torch.cuda.is_bf16_supported(), "BFloat16 CUDA is not available" +) # skips tests for all versions below min_opset_version. # if exporting the op is only supported after a specific version, @@ -50,48 +52,55 @@ def skipIfUnsupportedMinOpsetVersion(min_opset_version): def skip_dec(func): def wrapper(self): if self.opset_version < min_opset_version: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + raise unittest.SkipTest( + f"Unsupported opset_version: {self.opset_version} < {min_opset_version}" + ) return func(self) + return wrapper + return skip_dec -# skips tests for all versions above min_opset_version. -def skipIfUnsupportedMaxOpsetVersion(min_opset_version): + +# skips tests for all versions above max_opset_version. +def skipIfUnsupportedMaxOpsetVersion(max_opset_version): def skip_dec(func): def wrapper(self): - if self.opset_version > min_opset_version: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + if self.opset_version > max_opset_version: + raise unittest.SkipTest( + f"Unsupported opset_version: {self.opset_version} > {max_opset_version}" + ) return func(self) + return wrapper + return skip_dec + # skips tests for all opset versions. def skipForAllOpsetVersions(): def skip_dec(func): def wrapper(self): if self.opset_version: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + raise unittest.SkipTest( + "Skip verify test for unsupported opset_version" + ) return func(self) - return wrapper - return skip_dec -# Enables tests for scripting, instead of only tracing the model. -def enableScriptTest(): - def script_dec(func): - def wrapper(self): - self.is_script_test_enabled = True - return func(self) return wrapper - return script_dec + return skip_dec -# Disable tests for scripting. -def disableScriptTest(): + +# skips tests for scripting. 
+def skipScriptTest(min_opset_version=float("inf")): def script_dec(func): def wrapper(self): - self.is_script_test_enabled = False + self.is_script_test_enabled = self.opset_version >= min_opset_version return func(self) + return wrapper + return script_dec @@ -102,19 +111,15 @@ def skipIfUnsupportedOpsetVersion(unsupported_opset_versions): def skip_dec(func): def wrapper(self): if self.opset_version in unsupported_opset_versions: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") + raise unittest.SkipTest( + "Skip verify test for unsupported opset_version" + ) return func(self) - return wrapper - return skip_dec -def skipIfONNXShapeInference(onnx_shape_inference): - def skip_dec(func): - def wrapper(self): - if self.onnx_shape_inference is onnx_shape_inference: - raise unittest.SkipTest("Skip verify test for unsupported opset_version") - return func(self) return wrapper + return skip_dec + def flatten(x): return tuple(function._iter_filter(lambda o: isinstance(o, torch.Tensor))(x)) diff --git a/test/onnx/test_pytorch_helper.py b/test/onnx/test_pytorch_helper.py index 3ffd88746ff3..ca6ad876b13b 100644 --- a/test/onnx/test_pytorch_helper.py +++ b/test/onnx/test_pytorch_helper.py @@ -1,24 +1,23 @@ # Owner(s): ["module: onnx"] # Some standard imports -import numpy as np -from torch import nn -import torch.onnx -import torch.nn.init as init -from caffe2.python.model_helper import ModelHelper -from pytorch_helper import PyTorchModule import unittest -from caffe2.python.core import workspace +import numpy as np +from pytorch_helper import PyTorchModule from test_pytorch_common import skipIfNoLapack +import torch.nn.init as init +import torch.onnx +from caffe2.python.core import workspace +from caffe2.python.model_helper import ModelHelper +from torch import nn -class TestCaffe2Backend(unittest.TestCase): +class TestCaffe2Backend(unittest.TestCase): @skipIfNoLapack @unittest.skip("test broken because Lapack was always missing.") def test_helper(self): - class SuperResolutionNet(nn.Module): def __init__(self, upscale_factor, inplace=False): super(SuperResolutionNet, self).__init__() @@ -27,7 +26,7 @@ def __init__(self, upscale_factor, inplace=False): self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) - self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) + self.conv4 = nn.Conv2d(32, upscale_factor**2, (3, 3), (1, 1), (1, 1)) self.pixel_shuffle = nn.PixelShuffle(upscale_factor) self._initialize_weights() @@ -53,7 +52,7 @@ def _initialize_weights(self): helper = ModelHelper(name="test_model") start = helper.Sigmoid(["the_input"]) # Embed the ONNX-converted pytorch net inside it - toutput, = PyTorchModule(helper, torch_model, (fake_input,), [start]) + (toutput,) = PyTorchModule(helper, torch_model, (fake_input,), [start]) output = helper.Sigmoid(toutput) workspace.RunNetOnce(helper.InitProto()) diff --git a/test/onnx/test_pytorch_jit_onnx.py b/test/onnx/test_pytorch_jit_onnx.py new file mode 100644 index 000000000000..aaa842d171a3 --- /dev/null +++ b/test/onnx/test_pytorch_jit_onnx.py @@ -0,0 +1,97 @@ +# Owner(s): ["module: onnx"] +import unittest + +import onnxruntime +from test_pytorch_onnx_onnxruntime import ort_compare_with_pytorch, run_ort + +import torch +from torch._C import parse_ir + + +def _jit_graph_to_onnx_model(graph, operator_export_type, opset_version): + r""" + This function exports torch::jit::Graph object + to serialized ONNX 
ModelProto. + This function is for testing purpose. + It only keeps the essential parts for IR graph conversions. + It also does not interact with actual PyTorch modules nor + PyTorch tensor inputs. + """ + from torch.onnx.symbolic_helper import ( + _set_onnx_shape_inference, + _set_opset_version, + ) + from torch.onnx.utils import _optimize_graph + + # Shape inference is required because some ops' symbolic functions + # generate sub-graphs based on inputs' types. + _set_onnx_shape_inference(True) + _set_opset_version(opset_version) + graph = _optimize_graph(graph, operator_export_type, params_dict={}) + proto, _, _, _ = graph._export_onnx( + {}, + opset_version, + {}, + False, + operator_export_type, + False, + False, + {}, + True, + "", + {}, + ) + return proto + + +class _TestJITIRToONNX: + """Abstract base class for test cases. + + Intentionally not a sub-class of unittest.TestCase so that unittest / pytest + don't run it directly. unitest.TestCase is mixed in as another base class when + creating concrete sub-types. See MakeTestCase(). + """ + + opset_version = -1 # Sub-classes must override + ort_providers = ["CPUExecutionProvider"] + + def run_test(self, graph_ir, example_inputs): + graph = parse_ir(graph_ir) + jit_outs = torch._C._jit_interpret_graph(graph, example_inputs) + + onnx_proto = _jit_graph_to_onnx_model( + graph, torch.onnx.OperatorExportTypes.ONNX, self.opset_version + ) + ort_sess = onnxruntime.InferenceSession( + onnx_proto, providers=self.ort_providers + ) + ort_outs = run_ort(ort_sess, example_inputs) + + ort_compare_with_pytorch(ort_outs, jit_outs, rtol=1e-3, atol=1e-7) + + def test_example_ir(self): + graph_ir = """ + graph(%1 : Float(2, 3), + %2 : Float(2, 3)): + %3 : int = prim::Constant[value=1]() + %4 : Float(2, 3) = aten::add(%1, %2, %3) + return (%4) + """ + a = torch.randn(2, 3) + b = torch.randn(2, 3) + self.run_test(graph_ir, (a, b)) + + +def MakeTestCase(opset_version: int) -> type: + name = f"TestJITIRToONNX_opset{opset_version}" + return type( + str(name), + (unittest.TestCase,), + dict(_TestJITIRToONNX.__dict__, opset_version=opset_version), + ) + + +TestJITIRToONNX_opset14 = MakeTestCase(14) + +if __name__ == "__main__": + unittest.main() diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 77c2b85f27f0..79ae0a36f37b 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -1,21 +1,37 @@ # Owner(s): ["module: onnx"] -from typing import Tuple import io import itertools import sys import unittest +from typing import Tuple +import model_defs.dcgan as dcgan +import model_defs.word_language_model as word_language_model import numpy as np - +import onnx +import verify from debug_embed_params import run_embed_params -from torch import nn -from torch.autograd import Variable, function -from torch.nn.utils import rnn as rnn_utils -from torch.onnx import ExportTypes -import torch.onnx -import torch.onnx.operators -import torch.utils.model_zoo as model_zoo +from model_defs.lstm_flattening_result import LstmFlatteningResult +from model_defs.mnist import MNIST +from model_defs.rnn_model_with_packed_sequence import ( + RnnModelWithPackedSequence, +) +from model_defs.squeezenet import SqueezeNet +from model_defs.srresnet import SRResNet +from model_defs.super_resolution import SuperResolutionNet +from test_pytorch_common import ( + BATCH_SIZE, + RNN_BATCH_SIZE, + RNN_HIDDEN_SIZE, + RNN_INPUT_SIZE, + RNN_SEQUENCE_LENGTH, + skipIfNoCuda, + skipIfNoLapack, + skipIfTravis, + 
skipIfUnsupportedMinOpsetVersion, + skipIfUnsupportedOpsetVersion, +) # Import various models for testing from torchvision.models.alexnet import alexnet @@ -24,24 +40,18 @@ from torchvision.models.resnet import resnet50 from torchvision.models.vgg import vgg16, vgg16_bn, vgg19, vgg19_bn -from model_defs.squeezenet import SqueezeNet -from model_defs.super_resolution import SuperResolutionNet -from model_defs.srresnet import SRResNet -import model_defs.dcgan as dcgan -import model_defs.word_language_model as word_language_model -from model_defs.mnist import MNIST -from model_defs.lstm_flattening_result import LstmFlatteningResult -from model_defs.rnn_model_with_packed_sequence import RnnModelWithPackedSequence -from caffe2.python.operator_test.torch_integration_test import (generate_rois_rotated, - create_bbox_transform_inputs) - -import onnx import caffe2.python.onnx.backend as c2 - -from test_pytorch_common import skipIfTravis, skipIfNoLapack, skipIfNoCuda -from test_pytorch_common import BATCH_SIZE, RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE -from test_pytorch_common import skipIfUnsupportedOpsetVersion, skipIfUnsupportedMinOpsetVersion -import verify +import torch.onnx +import torch.onnx.operators +import torch.utils.model_zoo as model_zoo +from caffe2.python.operator_test.torch_integration_test import ( + create_bbox_transform_inputs, + generate_rois_rotated, +) +from torch import nn +from torch.autograd import Variable, function +from torch.nn.utils import rnn as rnn_utils +from torch.onnx import ExportTypes skip = unittest.skip @@ -51,15 +61,19 @@ def wrapper(self): if self.embed_params: raise unittest.SkipTest("Skip embed_params verify test") return func(self) + return wrapper + def skipIfNoEmbed(func): def wrapper(self): if not self.embed_params: raise unittest.SkipTest("Skip debug embed_params test") return func(self) + return wrapper + # def import_model(proto, input, workspace=None, use_gpu=True): # model_def = onnx.ModelProto.FromString(proto) # onnx.checker.check_model(model_def) @@ -117,8 +131,7 @@ def do_export(model, inputs, *args, **kwargs): class TestCaffe2Backend_opset9(unittest.TestCase): - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version + opset_version = 9 embed_params = False def setUp(self): @@ -132,12 +145,20 @@ def convert_cuda(self, model, input): # input might be nested - we want to move everything to GPU cuda_input = function._nested_map( lambda o: isinstance(o, Variable) or isinstance(o, torch.Tensor), - lambda o: o.cuda())(input) + lambda o: o.cuda(), + )(input) return cuda_model, cuda_input - def run_debug_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX): + def run_debug_test( + self, + model, + train, + batch_size, + state_dict=None, + input=None, + use_gpu=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + ): """ # TODO: remove this from the final release version This test is for our debugging only for the case where @@ -154,12 +175,17 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, if use_gpu: model, input = self.convert_cuda(model, input) - onnxir, torch_out = do_export(model, input, export_params=self.embed_params, verbose=False, - do_constant_folding=False, - opset_version=self.opset_version, - keep_initializers_as_inputs=True, - add_node_names=False, - operator_export_type=operator_export_type) + onnxir, torch_out = do_export( + model, 
+ input, + export_params=self.embed_params, + verbose=False, + do_constant_folding=False, + opset_version=self.opset_version, + keep_initializers_as_inputs=True, + add_node_names=False, + operator_export_type=operator_export_type, + ) if isinstance(torch_out, torch.autograd.Variable): torch_out = (torch_out,) @@ -167,12 +193,22 @@ def run_debug_test(self, model, train, batch_size, state_dict=None, for _, (x, y) in enumerate(zip(torch_out, caffe2_out)): np.testing.assert_almost_equal(x.data.cpu().numpy(), y, decimal=3) - def run_actual_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7, - do_constant_folding=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX, - input_names=None, dynamic_axes=None, - remained_onnx_input_idx=None): + def run_actual_test( + self, + model, + train, + batch_size, + state_dict=None, + input=None, + use_gpu=True, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + remained_onnx_input_idx=None, + ): """ This is what the user facing version will look like """ @@ -191,37 +227,68 @@ def run_actual_test(self, model, train, batch_size, state_dict=None, model, input = self.convert_cuda(model, input) # Verify the model runs the same in Caffe2 - verify.verify(model, input, c2, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding, - opset_version=self.opset_version, - keep_initializers_as_inputs=True, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes, - remained_onnx_input_idx=remained_onnx_input_idx) - - def run_model_test(self, model, train, batch_size, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7, - do_constant_folding=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX, - input_names=None, dynamic_axes=None, - remained_onnx_input_idx=None): + verify.verify( + model, + input, + c2, + rtol=rtol, + atol=atol, + do_constant_folding=do_constant_folding, + opset_version=self.opset_version, + keep_initializers_as_inputs=True, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + remained_onnx_input_idx=remained_onnx_input_idx, + ) + + def run_model_test( + self, + model, + train, + batch_size, + state_dict=None, + input=None, + use_gpu=True, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + remained_onnx_input_idx=None, + ): use_gpu_ = torch.cuda.is_available() and use_gpu # NOTE: do_constant_folding is turned on only when model has # parameters embedded (which are needed for constant folding), # i.e. for self.embed_params=True case. self.embed_params=True # for the TestCaffe2BackendEmbed class defined at the bottom. 
if self.embed_params: - self.run_actual_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes, - remained_onnx_input_idx=remained_onnx_input_idx) + self.run_actual_test( + model, + train, + batch_size, + state_dict, + input, + use_gpu=use_gpu_, + rtol=rtol, + atol=atol, + do_constant_folding=do_constant_folding, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + remained_onnx_input_idx=remained_onnx_input_idx, + ) else: - self.run_debug_test(model, train, batch_size, state_dict, input, - use_gpu=use_gpu_, operator_export_type=operator_export_type) + self.run_debug_test( + model, + train, + batch_size, + state_dict, + input, + use_gpu=use_gpu_, + operator_export_type=operator_export_type, + ) def test_linear(self): class MyModel(torch.nn.Module): @@ -259,9 +326,15 @@ def forward(self, input): # Note that the export call explicitly sets the names of not just the input, # but also the parameters. This test checks that the model can be loaded and # executed in Caffe2 backend correctly. - torch.onnx._export(model, input, f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - input_names=["input1", "parameter1", "parameter2"], - keep_initializers_as_inputs=True) + torch.onnx._export( + model, + input, + f, + verbose=True, + export_type=ExportTypes.ZIP_ARCHIVE, + input_names=["input1", "parameter1", "parameter2"], + keep_initializers_as_inputs=True, + ) f.seek(0) model_c2 = c2.prepare_zip_archive(f) @@ -286,9 +359,15 @@ def forward(self, input): # But note that the target first parameter name is the same as the second parameter name. # This test checks that given this edge condition, the model can be loaded and executed # in Caffe2 backend correctly. 
- torch.onnx._export(model, input, f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - input_names=["input1", "fc1.bias"], - keep_initializers_as_inputs=True) + torch.onnx._export( + model, + input, + f, + verbose=True, + export_type=ExportTypes.ZIP_ARCHIVE, + input_names=["input1", "fc1.bias"], + keep_initializers_as_inputs=True, + ) f.seek(0) model_c2 = c2.prepare_zip_archive(f) @@ -300,13 +379,21 @@ def test_lstm_cell(self): input = torch.randn(BATCH_SIZE, RNN_INPUT_SIZE) h0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) c0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) - self.run_model_test(model, train=False, batch_size=BATCH_SIZE, input=(input, (h0, c0)), use_gpu=False) + self.run_model_test( + model, + train=False, + batch_size=BATCH_SIZE, + input=(input, (h0, c0)), + use_gpu=False, + ) def test_gru_cell(self): model = nn.GRUCell(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE) input = torch.randn(BATCH_SIZE, RNN_INPUT_SIZE) h0 = torch.randn(BATCH_SIZE, RNN_HIDDEN_SIZE) - self.run_model_test(model, train=False, batch_size=BATCH_SIZE, input=(input, h0), use_gpu=False) + self.run_model_test( + model, train=False, batch_size=BATCH_SIZE, input=(input, h0), use_gpu=False + ) def _dispatch_rnn_test(self, name, *args, **kwargs): if name == "elman": @@ -316,15 +403,25 @@ def _dispatch_rnn_test(self, name, *args, **kwargs): if name == "gru": self._gru_test(*args, **kwargs) - def _elman_rnn_test(self, layers, nonlinearity, bidirectional, - initial_state, packed_sequence, dropout): + def _elman_rnn_test( + self, + layers, + nonlinearity, + bidirectional, + initial_state, + packed_sequence, + dropout, + ): batch_first = True if packed_sequence == 2 else False - model = nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, - layers, - nonlinearity=nonlinearity, - bidirectional=bidirectional, - dropout=dropout, - batch_first=batch_first) + model = nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + nonlinearity=nonlinearity, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) @@ -352,24 +449,41 @@ def make_input(batch_size): return input input = make_input(RNN_BATCH_SIZE) - self.run_model_test(model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False, atol=1e-7) + self.run_model_test( + model, + train=False, + batch_size=RNN_BATCH_SIZE, + input=input, + use_gpu=False, + atol=1e-7, + ) # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. 
For this case, we need to disable it - onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, - do_constant_folding=False) + onnxir, _ = do_export( + model, + variable_batch_size_init_input, + keep_initializers_as_inputs=True, + do_constant_folding=False, + ) other_input = make_input(RNN_BATCH_SIZE + 1) _ = run_embed_params(onnxir, model, other_input, use_gpu=False) - def _lstm_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): + def _lstm_test( + self, layers, bidirectional, initial_state, packed_sequence, dropout + ): batch_first = True if packed_sequence == 2 else False model = LstmFlatteningResult( - RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) if packed_sequence == 2: @@ -397,23 +511,34 @@ def make_input(batch_size): return input input = make_input(RNN_BATCH_SIZE) - self.run_model_test(model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False) + self.run_model_test( + model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False + ) # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. For this case, we need to disable it - onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, - do_constant_folding=False) + onnxir, _ = do_export( + model, + variable_batch_size_init_input, + keep_initializers_as_inputs=True, + do_constant_folding=False, + ) other_input = make_input(RNN_BATCH_SIZE + 1) _ = run_embed_params(onnxir, model, other_input, use_gpu=False) - def _gru_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): + def _gru_test(self, layers, bidirectional, initial_state, packed_sequence, dropout): batch_first = True if packed_sequence == 2 else False - model = nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + model = nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) if packed_sequence == 2: @@ -440,15 +565,21 @@ def make_input(batch_size): return input input = make_input(RNN_BATCH_SIZE) - self.run_model_test(model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False) + self.run_model_test( + model, train=False, batch_size=RNN_BATCH_SIZE, input=input, use_gpu=False + ) # test that the model still runs with a different batch size # (save the model with a batch_size of 1 with rnn with a variable batch size, # otherwise expand will fail) variable_batch_size_init_input = make_input(1) # Constant folding works when model has parameters embedded. 
For this case, we need to disable it - onnxir, _ = do_export(model, variable_batch_size_init_input, keep_initializers_as_inputs=True, - do_constant_folding=False) + onnxir, _ = do_export( + model, + variable_batch_size_init_input, + keep_initializers_as_inputs=True, + do_constant_folding=False, + ) other_input = make_input(RNN_BATCH_SIZE + 1) _ = run_embed_params(onnxir, model, other_input, use_gpu=False) @@ -463,9 +594,15 @@ def test_rnn_init_predict_split(self): # Test that we are correctly splitting between init and # predict net. When we embed parameters, there should be more # ops in the init net. - mp = onnx.ModelProto.FromString(do_export(model, input, export_params=self.embed_params, - keep_initializers_as_inputs=True, - do_constant_folding=False)[0]) + mp = onnx.ModelProto.FromString( + do_export( + model, + input, + export_params=self.embed_params, + keep_initializers_as_inputs=True, + do_constant_folding=False, + )[0] + ) prepared = c2.prepare(mp, device="CPU") if self.embed_params: assert len(prepared.init_net.op) == 950 @@ -476,8 +613,13 @@ def test_rnn_init_predict_split(self): def test_alexnet(self): state_dict = model_zoo.load_url(model_urls["alexnet"], progress=False) - self.run_model_test(alexnet(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict, atol=1e-3) + self.run_model_test( + alexnet(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + atol=1e-3, + ) @skipIfNoCuda def test_dcgan(self): @@ -490,23 +632,35 @@ def test_dcgan(self): netD = dcgan._netD(1) netD.apply(dcgan.weights_init) input = torch.randn(BATCH_SIZE, 3, dcgan.imgsz, dcgan.imgsz) - self.run_model_test(netD, train=False, batch_size=BATCH_SIZE, - input=input) + self.run_model_test(netD, train=False, batch_size=BATCH_SIZE, input=input) netG = dcgan._netG(1) netG.apply(dcgan.weights_init) state_dict = model_zoo.load_url(model_urls["dcgan_b"], progress=False) # state_dict = model_zoo.load_url(model_urls["dcgan_f"], progress=False) noise = torch.randn(BATCH_SIZE, dcgan.nz, 1, 1).normal_(0, 1) - self.run_model_test(netG, train=False, batch_size=BATCH_SIZE, - input=noise, state_dict=state_dict, rtol=1e-2, atol=1e-6) + self.run_model_test( + netG, + train=False, + batch_size=BATCH_SIZE, + input=noise, + state_dict=state_dict, + rtol=1e-2, + atol=1e-6, + ) - @unittest.skipIf(not torch.cuda.is_available(), - "model on net has cuda in it, awaiting fix") + @unittest.skipIf( + not torch.cuda.is_available(), "model on net has cuda in it, awaiting fix" + ) def test_densenet(self): state_dict = model_zoo.load_url(model_urls["densenet121"], progress=False) - self.run_model_test(densenet121(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict, atol=1e-7) + self.run_model_test( + densenet121(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + atol=1e-7, + ) @skip("doesn't match exactly...") # TODO: figure out the numerical instabilities @@ -514,33 +668,48 @@ def test_inception(self): x = torch.randn(BATCH_SIZE, 3, 299, 299, requires_grad=True) # state_dict = model_zoo.load_url(model_urls["inception_v3_google"], progress=False) state_dict = None - self.run_model_test(inception_v3(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict, input=x) + self.run_model_test( + inception_v3(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + input=x, + ) @skipIfNoEmbed def test_resnet(self): state_dict = model_zoo.load_url(model_urls["resnet50"], progress=False) - self.run_model_test(resnet50(), train=False, batch_size=BATCH_SIZE, - 
state_dict=state_dict, atol=1e-5) + self.run_model_test( + resnet50(), + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + atol=1e-5, + ) def test_squeezenet(self): sqnet_v1_1 = SqueezeNet(version=1.1) state_dict = model_zoo.load_url(model_urls["squeezenet1_1"], progress=False) # state_dict = model_zoo.load_url(model_urls["squeezenet1_0"], progress=False) - self.run_model_test(sqnet_v1_1, train=False, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + sqnet_v1_1, train=False, batch_size=BATCH_SIZE, state_dict=state_dict + ) # @skip("takes long to run, LAPACK needed for gpu") @skipIfNoLapack @unittest.skip("This model takes too much memory") def test_srresnet(self): - super_resolution_net = SRResNet( - rescale_factor=4, n_filters=64, n_blocks=8) + super_resolution_net = SRResNet(rescale_factor=4, n_filters=64, n_blocks=8) state_dict = model_zoo.load_url(model_urls["srresNet"], progress=False) x = torch.randn(1, 3, 224, 224, requires_grad=True) - self.run_model_test(super_resolution_net, train=False, - batch_size=1, state_dict=state_dict, - input=x, use_gpu=False) + self.run_model_test( + super_resolution_net, + train=False, + batch_size=1, + state_dict=state_dict, + input=x, + use_gpu=False, + ) @skipIfTravis @skipIfNoLapack @@ -549,31 +718,37 @@ def test_super_resolution(self): super_resolution_net = SuperResolutionNet(upscale_factor=3) state_dict = model_zoo.load_url(model_urls["super_resolution"], progress=False) x = torch.randn(1, 1, 224, 224, requires_grad=True) - self.run_model_test(super_resolution_net, train=False, - batch_size=BATCH_SIZE, state_dict=state_dict, - input=x, use_gpu=False, atol=1e-6) + self.run_model_test( + super_resolution_net, + train=False, + batch_size=BATCH_SIZE, + state_dict=state_dict, + input=x, + use_gpu=False, + atol=1e-6, + ) @unittest.skip("This model takes too much memory") def test_vgg16(self): state_dict = model_zoo.load_url(model_urls["vgg16"], progress=False) - self.run_model_test(vgg16(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + vgg16(), train=False, batch_size=BATCH_SIZE, state_dict=state_dict + ) @skip("disable to run tests faster...") def test_vgg16_bn(self): - self.run_model_test(vgg16_bn(), train=False, - batch_size=BATCH_SIZE) + self.run_model_test(vgg16_bn(), train=False, batch_size=BATCH_SIZE) @skip("disable to run tests faster...") def test_vgg19(self): state_dict = model_zoo.load_url(model_urls["vgg19"], progress=False) - self.run_model_test(vgg19(), train=False, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + vgg19(), train=False, batch_size=BATCH_SIZE, state_dict=state_dict + ) @skip("disable to run tests faster...") def test_vgg19_bn(self): - self.run_model_test(vgg19_bn(), train=False, - batch_size=BATCH_SIZE) + self.run_model_test(vgg19_bn(), train=False, batch_size=BATCH_SIZE) def run_word_language_model(self, model_name): ntokens = 50 @@ -583,13 +758,18 @@ def run_word_language_model(self, model_name): dropout = 0.2 tied = False batchsize = 5 - model = word_language_model.RNNModel(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) + model = word_language_model.RNNModel( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) x = torch.arange(0, ntokens).long().view(-1, batchsize) # Only support CPU version, since tracer is not working in GPU RNN. 
- self.run_model_test(model, train=False, input=(x, model.hidden), - batch_size=batchsize, use_gpu=False) + self.run_model_test( + model, + train=False, + input=(x, model.hidden), + batch_size=batchsize, + use_gpu=False, + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -705,20 +885,49 @@ def test_tensor_index_newaxis(self): def test_tensor_index_advanced_indexing(self): self._test_index_generic( - lambda input: input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])]) + lambda input: input[ + :, + torch.tensor([[0, 2], [1, 1]]), + :, + torch.tensor([2, 1]), + torch.tensor([0, 3]), + ] + ) @skipIfUnsupportedOpsetVersion([10]) def test_tensor_index_advanced_indexing_with_slice(self): - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])]) - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])]) + self._test_index_generic( + lambda input: input[ + :, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]]) + ] + ) + self._test_index_generic( + lambda input: input[ + :, + torch.tensor([0, 2]), + torch.tensor([1]), + 2:4, + torch.tensor([[1], [4]]), + ] + ) def test_tensor_index_advanced_indexing_consecutive(self): - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None]) + self._test_index_generic( + lambda input: input[ + :, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None + ] + ) @skipIfUnsupportedMinOpsetVersion(9) def test_tensor_index_advanced_indexing_masked(self): self._test_index_generic( - lambda input: input[:, torch.tensor([1, 0, 1, 0], dtype=torch.uint8), torch.tensor([[1, 3], [4, 0]]), None]) + lambda input: input[ + :, + torch.tensor([1, 0, 1, 0], dtype=torch.uint8), + torch.tensor([[1, 3], [4, 0]]), + None, + ] + ) def test_chunk(self): class MyModel(torch.nn.Module): @@ -729,6 +938,7 @@ def forward(self, input): # TODO: Why index? This returns a tuple and test runner doesn't # support tuple comparison. return input.chunk(8, dim=2)[-1] + self.run_model_test(MyModel(), train=False, batch_size=BATCH_SIZE) def test_sqrt(self): @@ -738,6 +948,7 @@ def __init__(self): def forward(self, input): return input.sqrt() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) @@ -756,6 +967,7 @@ def __init__(self): def forward(self, input): return input.log() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) @@ -767,6 +979,7 @@ def __init__(self): def forward(self, input): return input.erf() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_(4, 9) self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) @@ -778,8 +991,11 @@ def __init__(self): def forward(self, input): return getattr(input, name)() + input = torch.empty(BATCH_SIZE, 10, 10).uniform_() - self.run_model_test(MyModel(), train=False, input=input, batch_size=BATCH_SIZE) + self.run_model_test( + MyModel(), train=False, input=input, batch_size=BATCH_SIZE + ) test_func("cos") test_func("sin") @@ -797,6 +1013,7 @@ def forward(self, input): # TODO: Why index? This returns a tuple and test runner doesn't # support tuple comparison. 
return input + 1 + self.run_model_test(MyModel(), train=False, batch_size=BATCH_SIZE) def test_subconstant(self): @@ -808,6 +1025,7 @@ def forward(self, input): # TODO: Why index? This returns a tuple and test runner doesn't # support tuple comparison. return input - 1 + self.run_model_test(MyModel(), train=False, batch_size=BATCH_SIZE) def test_arithmetic(self): @@ -820,7 +1038,9 @@ def forward(self, x): return x x = torch.randn(2, 3, 4) - self.run_model_test(ArithmeticModule(), input=x, train=False, batch_size=BATCH_SIZE) + self.run_model_test( + ArithmeticModule(), input=x, train=False, batch_size=BATCH_SIZE + ) def test_embedding(self): model = nn.Embedding(10, 3, padding_idx=-1) @@ -938,17 +1158,20 @@ def test_adaptive_max_pool3D(self): def test_weight_norm(self): model = nn.utils.weight_norm(nn.Conv1d(1, 1, 3)) input = torch.randn(1, 1, 5, requires_grad=True) - self.run_model_test( - model, train=True, batch_size=0, input=input, use_gpu=False - ) + self.run_model_test(model, train=True, batch_size=0, input=input, use_gpu=False) def test_mnist(self): model = MNIST() input = torch.randn(BATCH_SIZE, 1, 28, 28) state_dict = None # TODO: test with state_dict - self.run_model_test(model, train=False, input=input, batch_size=BATCH_SIZE, - state_dict=state_dict) + self.run_model_test( + model, + train=False, + input=input, + batch_size=BATCH_SIZE, + state_dict=state_dict, + ) def test_mm(self): class MyModel(torch.nn.Module): @@ -957,9 +1180,12 @@ def __init__(self): def forward(self, m1, m2): return torch.mm(m1, m2) + m1 = torch.randn(3, 4) m2 = torch.randn(4, 5) - self.run_model_test(MyModel(), train=False, input=(m1, m2), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(m1, m2), batch_size=BATCH_SIZE, use_gpu=False + ) def test_addmm(self): class MyModel(torch.nn.Module): @@ -968,10 +1194,17 @@ def __init__(self): def forward(self, ma, m1, m2): return torch.addmm(ma, m1, m2) + ma = torch.randn(5) m1 = torch.randn(3, 4) m2 = torch.randn(4, 5) - self.run_model_test(MyModel(), train=False, input=(ma, m1, m2), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), + train=False, + input=(ma, m1, m2), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_fuse_addmm(self): class AddmmModel(torch.nn.Module): @@ -979,7 +1212,9 @@ def forward(self, x): return torch.mm(x, x) + x x = torch.randn(3, 3) - self.run_model_test(AddmmModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + AddmmModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) def test_scalar_type(self): class ArithmeticModel(torch.nn.Module): @@ -987,14 +1222,18 @@ def forward(self, x): return x.size(0) * 2 * x x = torch.ones(2, 3, dtype=torch.float32) - self.run_model_test(ArithmeticModel(), input=x, train=False, batch_size=BATCH_SIZE) + self.run_model_test( + ArithmeticModel(), input=x, train=False, batch_size=BATCH_SIZE + ) class ReciprocalModel(torch.nn.Module): def forward(self, x): return torch.reciprocal(x) x = torch.tensor([2.0, 4.0], dtype=torch.double) - self.run_model_test(ReciprocalModel(), input=x, train=False, batch_size=BATCH_SIZE) + self.run_model_test( + ReciprocalModel(), input=x, train=False, batch_size=BATCH_SIZE + ) class ComparisonModel(torch.nn.Module): def forward(self, x, y): @@ -1002,7 +1241,9 @@ def forward(self, x, y): x = torch.ones(2, 3, dtype=torch.int32) y = torch.ones(2, 3, dtype=torch.float32) - self.run_model_test(ComparisonModel(), input=(x, y), train=False, batch_size=BATCH_SIZE) 
+ self.run_model_test( + ComparisonModel(), input=(x, y), train=False, batch_size=BATCH_SIZE + ) class MatMulModel(torch.nn.Module): def forward(self, x, y): @@ -1010,7 +1251,9 @@ def forward(self, x, y): x = torch.ones(3, 4) y = torch.ones(4, 5) - self.run_model_test(MatMulModel(), input=(x, y), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MatMulModel(), input=(x, y), train=False, batch_size=BATCH_SIZE + ) class AddMMModel(torch.nn.Module): def forward(self, x): @@ -1027,41 +1270,61 @@ def __init__(self): def forward(self, x): return x.transpose(1, 2).transpose(2, 3) + x = torch.randn(5, 6, 7, 8) - self.run_model_test(MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) def test_sum(self): shape = (3, 4, 5) for params in [{}] + [{"dim": i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): def __init__(self): super(MyModel, self).__init__() def forward(self, x): return torch.sum(x, **params) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) def test_cumsum(self): shape = (3, 4, 5) for params in [{"dim": i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): def __init__(self): super(MyModel, self).__init__() def forward(self, x): return torch.cumsum(x, **params) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.run_model_test( + MyModel(), + train=False, + input=(x), + batch_size=BATCH_SIZE, + use_gpu=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) def test_cosine_similarity(self): shape = (100, 128) x = torch.randn(*shape) y = torch.randn(*shape) - self.run_model_test(torch.nn.CosineSimilarity(dim=1, eps=1e-6), train=False, - input=(x, y), batch_size=BATCH_SIZE, use_gpu=False, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.run_model_test( + torch.nn.CosineSimilarity(dim=1, eps=1e-6), + train=False, + input=(x, y), + batch_size=BATCH_SIZE, + use_gpu=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -1069,13 +1332,16 @@ def test_lstm_constant_folding(self): class LstmNet(nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(LstmNet, self).__init__() - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) def forward(self, input, initial_state): return self.lstm(input, initial_state) - def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_LstmNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = LstmNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -1085,11 +1351,25 @@ def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size batch_size1 = 3 model1, input1 = get_LstmNet_model_and_inputs(7, 3, 2, batch_size1, 5, 
True) - self.run_actual_test(model1, train=False, batch_size=batch_size1, input=input1, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model1, + train=False, + batch_size=batch_size1, + input=input1, + use_gpu=False, + do_constant_folding=True, + ) batch_size2 = 4 model2, input2 = get_LstmNet_model_and_inputs(5, 4, 3, batch_size2, 7, False) - self.run_actual_test(model2, train=False, batch_size=batch_size2, input=input2, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model2, + train=False, + batch_size=batch_size2, + input=input2, + use_gpu=False, + do_constant_folding=True, + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -1097,14 +1377,17 @@ def test_gru_constant_folding(self): class GruNet(nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(GruNet, self).__init__() - self.mygru = nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.mygru = nn.GRU( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) def forward(self, input, initial_state): out = self.mygru(input, initial_state) return out - def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_GruNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = GruNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -1113,11 +1396,25 @@ def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, batch_size1 = 3 model1, input1 = get_GruNet_model_and_inputs(7, 3, 2, batch_size1, 5, True) - self.run_actual_test(model1, train=False, batch_size=batch_size1, input=input1, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model1, + train=False, + batch_size=batch_size1, + input=input1, + use_gpu=False, + do_constant_folding=True, + ) batch_size2 = 4 model2, input2 = get_GruNet_model_and_inputs(5, 4, 3, batch_size2, 7, False) - self.run_actual_test(model2, train=False, batch_size=batch_size2, input=input2, use_gpu=False, do_constant_folding=True) + self.run_actual_test( + model2, + train=False, + batch_size=batch_size2, + input=input2, + use_gpu=False, + do_constant_folding=True, + ) def test_repeat(self): class MyModel(torch.nn.Module): @@ -1128,14 +1425,17 @@ def forward(self, x): return x.repeat(1, 2, 3, 4) x = torch.randn(4, 3, 2, 1, requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([10]) def test_upsample(self): x = torch.randn(1, 2, 3, 4, requires_grad=True) model = nn.Upsample(size=[v * 2 for v in x.size()[2:]], mode="nearest") - self.run_model_test(model, train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + model, train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([10]) def test_interpolate_upsample(self): @@ -1147,14 +1447,13 @@ def forward(self, x): size = [v * 2 for v in x.size()[2:]] # work around for now: turn the dynamic sizes into constant size = [int(i) for i in size] - return nn.functional.interpolate(x, - size=size, - mode="nearest") + return nn.functional.interpolate(x, size=size, mode="nearest") x = torch.randn(1, 2, 3, 4, 
requires_grad=True) model = MyModel() - self.run_model_test(model, train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + model, train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([7, 8, 10]) def test_interpolate_upsample_dynamic_sizes(self): @@ -1164,14 +1463,13 @@ def __init__(self): def forward(self, x): size = [v * 2 for v in x.size()[2:]] - return nn.functional.interpolate(x, - size=size, - mode="nearest") + return nn.functional.interpolate(x, size=size, mode="nearest") x = torch.randn(1, 2, 3, 4, requires_grad=True) model = MyModel() - self.run_model_test(model, train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + model, train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) def test_repeat_dim_overflow(self): class MyModel(torch.nn.Module): @@ -1182,7 +1480,9 @@ def forward(self, x): return x.repeat(1, 2, 3, 4) x = torch.randn(1, 2, requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) def test_repeat_dynamic(self): class MyModel(torch.nn.Module): @@ -1194,21 +1494,39 @@ def forward(self, x, y): x = torch.randn(1, 2, requires_grad=True) y = torch.randn(2, 4, requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False, - input_names=["x", "y"], dynamic_axes={"x": [0, 1], "y": [0, 1]}) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[0]) + self.run_model_test( + MyModel(), + train=False, + input=(x, y), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x", "y"], + dynamic_axes={"x": [0, 1], "y": [0, 1]}, + ) + self.run_model_test( + MyModel(), + train=False, + input=(x, y), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[0], + ) def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{"dim": i} for i in range(len(shape))]: + class MyModel(torch.nn.Module): def __init__(self): super(MyModel, self).__init__() def forward(self, x): return torch.mean(x, **params) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False + ) # TODO: Add test cases for prod once Caffe2 has support for ReduceProd def test_softmax(self): @@ -1217,7 +1535,9 @@ def test_softmax(self): model = nn.Softmax(dim=d) dims = [2] * (i - 2) + [3, 4] input = torch.ones(*dims, requires_grad=True) - self.run_model_test(model, train=False, batch_size=BATCH_SIZE, input=input) + self.run_model_test( + model, train=False, batch_size=BATCH_SIZE, input=input + ) def test_softmax_dtype(self): class SoftmaxModel(torch.nn.Module): @@ -1246,8 +1566,15 @@ def test_randn(self): class MyModule(torch.nn.Module): def forward(self, x): return (torch.randn(1, 2, 3, 4) + x).shape - self.run_model_test(MyModule(), train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[]) + + self.run_model_test( + MyModule(), + train=False, + input=(x), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_rand(self): x = torch.randn(1, 2, 3, 4) @@ -1255,11 +1582,20 @@ def test_rand(self): class MyModule(torch.nn.Module): def forward(self, x): return (torch.rand(1, 2, 3, 4) + x).shape - 
self.run_model_test(MyModule(), train=False, input=(x), - batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[]) + + self.run_model_test( + MyModule(), + train=False, + input=(x), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_convtranspose(self): - model = nn.ConvTranspose2d(3, 3, 3, stride=3, bias=False, padding=1, output_padding=2) + model = nn.ConvTranspose2d( + 3, 3, 3, stride=3, bias=False, padding=1, output_padding=2 + ) self.run_model_test(model, train=False, batch_size=BATCH_SIZE, atol=1e-7) def test_unsqueeze(self): @@ -1273,8 +1609,11 @@ def __init__(self): def forward(self, x): return x.unsqueeze(dim) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7 + ) def test_squeeze(self): shape = (1, 1, 1) @@ -1287,8 +1626,11 @@ def __init__(self): def forward(self, x): return x.squeeze(dim) + x = torch.randn(*shape) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7) + self.run_model_test( + MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, atol=1e-7 + ) # NB: InstanceNorm model includes unused weights, so skip this in TestCaffe2BackendEmbed # TODO: We should have another pass to eliminate the unused initializers in ONNX models. @@ -1301,10 +1643,10 @@ def test_instance_norm(self): def test_pixel_shuffle(self): underlying = nn.PixelShuffle(4) shape = (1, 32, 5, 5) - input = Variable(torch.randn(*shape), - requires_grad=True) - self.run_model_test(underlying, train=False, input=(input), - batch_size=BATCH_SIZE) + input = Variable(torch.randn(*shape), requires_grad=True) + self.run_model_test( + underlying, train=False, input=(input), batch_size=BATCH_SIZE + ) def test_dynamic_sizes(self): class MyModel(torch.nn.Module): @@ -1315,8 +1657,11 @@ def forward(self, x): shape = torch.onnx.operators.shape_as_tensor(x) new_shape = torch.cat((torch.LongTensor([-1]), shape[0].view(1))) return torch.onnx.operators.reshape_from_tensor_shape(x, new_shape) + x = torch.randn(3, 5, 7) - self.run_model_test(MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) def test_advanced_broadcast(self): class MyModel(torch.nn.Module): @@ -1325,9 +1670,12 @@ def __init__(self): def forward(self, x, y): return torch.mul(x, y) + x = torch.randn(1, 5, 10) y = torch.randn(1, 5, 1) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False + ) def test_int8_export(self): class MyModel(torch.nn.Module): @@ -1339,15 +1687,24 @@ def forward(self, x): return x * self.param.float() import io + f = io.BytesIO() from torch.onnx import ExportTypes - torch.onnx._export(MyModel(), (torch.rand(3, 4),), f, verbose=True, export_type=ExportTypes.ZIP_ARCHIVE, - keep_initializers_as_inputs=True) + + torch.onnx._export( + MyModel(), + (torch.rand(3, 4),), + f, + verbose=True, + export_type=ExportTypes.ZIP_ARCHIVE, + keep_initializers_as_inputs=True, + ) X = np.random.rand(3, 4).astype(np.float32) f.seek(0) import caffe2.python.onnx.backend as c2 + model = c2.prepare_zip_archive(f) model.run(X) @@ -1358,7 +1715,9 @@ def forward(self, x): return x[-1, :, :] x = torch.randn(3, 4, 5) - self.run_model_test(NegSlice(), train=False, 
input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedOpsetVersion([10]) def test_neg_slice_large(self): @@ -1367,7 +1726,9 @@ def forward(self, x): return x[:, :, :, :, -3] x = torch.randn(3, 4, 5, 6, 7) - self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @unittest.skip("https://github.com/pytorch/pytorch/issues/10984") @skipIfUnsupportedOpsetVersion([10]) @@ -1377,7 +1738,9 @@ def forward(self, x): return x[:, :, :, :, -1] x = torch.randn(3, 4, 5, 6, 7) - self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice(self): @@ -1385,22 +1748,34 @@ class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): results = [] for i in range(4): - results.append(x[:x.size(0) - i, i:x.size(2), i:3]) + results.append(x[: x.size(0) - i, i : x.size(2), i:3]) return tuple(results) x = torch.rand(5, 5, 5) - self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + DynamicSliceExportMod(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice_script(self): class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return x[1:x.size(0)] + return x[1 : x.size(0)] + module = DynamicSliceModel() x = torch.rand(1, 2) - self.run_model_test(DynamicSliceModel(), train=False, input=(x,), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + DynamicSliceModel(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_slice_to_the_end(self): @@ -1412,7 +1787,13 @@ def forward(self, x): return tuple(results) x = torch.rand(5, 5, 5) - self.run_model_test(DynamicSliceExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + DynamicSliceExportMod(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_unbind(self): class UnbindModel(torch.nn.Module): @@ -1420,7 +1801,9 @@ def forward(self, input): return input.unbind() x = torch.randn(3, 4, 5) - self.run_model_test(UnbindModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + UnbindModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) class UnbindModel2(torch.nn.Module): def forward(self, input): @@ -1428,7 +1811,13 @@ def forward(self, input): return out x = torch.randn(3, 4, 5) - self.run_model_test(UnbindModel2(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + UnbindModel2(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_zero(self): @@ -1437,9 +1826,23 @@ def forward(self, x): return x.zero_() x = torch.randn(2, 3, 4) - self.run_model_test(Zero_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(Zero_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, 
remained_onnx_input_idx=[]) + self.run_model_test( + Zero_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + Zero_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_fill(self): @@ -1448,9 +1851,23 @@ def forward(self, x): return x.fill_(3) x = torch.randn(2, 3, 4) - self.run_model_test(Fill_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(Fill_(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False, remained_onnx_input_idx=[]) + self.run_model_test( + Fill_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + Fill_(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) # ConstantFill is a deprecated experimental op (used in opsets < 9). # Shape inference does not cover this op. @@ -1467,9 +1884,9 @@ def forward(self): x = torch.ones(2, 3, 4) y = torch.ones(2, 3, 4) * 2 - self.run_model_test(Arithmetic(), - train=False, input=(), batch_size=BATCH_SIZE, - use_gpu=False) + self.run_model_test( + Arithmetic(), train=False, input=(), batch_size=BATCH_SIZE, use_gpu=False + ) def test_tensor_factories(self): class TensorFactory(torch.nn.Module): @@ -1477,37 +1894,88 @@ def forward(self, x): return torch.zeros(x.size()) + torch.ones(x.size()) x = torch.randn(2, 3, 4) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, remained_onnx_input_idx=[]) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_tensor_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.zeros(x.shape, dtype=torch.float) + torch.ones(x.shape, dtype=torch.float) + return torch.zeros(x.shape, dtype=torch.float) + torch.ones( + x.shape, dtype=torch.float + ) x = torch.randn(2, 3, 4) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, remained_onnx_input_idx=[]) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=[], + ) def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - zeros = torch.zeros_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) - ones = torch.ones_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + zeros = 
torch.zeros_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) + ones = torch.ones_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return zeros + ones x = torch.randn(2, 3, 4) - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) remained_onnx_input_idx = None if self.opset_version < 9 else [] - self.run_model_test(TensorFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False, remained_onnx_input_idx=remained_onnx_input_idx) + self.run_model_test( + TensorFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + remained_onnx_input_idx=remained_onnx_input_idx, + ) def test_full(self): class FullModel(torch.nn.Module): @@ -1515,8 +1983,9 @@ def forward(self, x): return torch.full((3, 4), x, dtype=torch.long) x = torch.tensor(12) - self.run_model_test(FullModel(), train=False, input=(x,), batch_size=BATCH_SIZE, - use_gpu=False) + self.run_model_test( + FullModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) def test_full_script(self): class FullClass(torch.jit.ScriptModule): @@ -1525,7 +1994,9 @@ def forward(self, x): return torch.full((4, 5), x, dtype=torch.long) x = torch.tensor(12) - self.run_model_test(FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + FullClass(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) def test_clamp(self): class ClampModel(torch.nn.Module): @@ -1533,21 +2004,27 @@ def forward(self, x): return x.clamp(-0.5, 0.5) x = torch.randn(3, 4) - self.run_model_test(ClampModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ClampModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ClampMinModel(torch.nn.Module): def forward(self, x): return x.clamp(min=-0.5) x = torch.randn(3, 4) - self.run_model_test(ClampMinModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ClampMinModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ClampMaxModel(torch.nn.Module): def forward(self, x): return x.clamp(max=0.5) x = torch.randn(3, 4) - self.run_model_test(ClampMaxModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ClampMaxModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_where_functional(self): @@ -1556,7 +2033,13 @@ def forward(self, x): return torch.where(x > 2.0, x, torch.neg(x)) x = torch.randn(3, 4) - self.run_model_test(WhereFunctional(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + WhereFunctional(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_where_method(self): @@ -1565,15 +2048,25 @@ def forward(self, x): return x.where(x > 2.0, torch.neg(x)) x = torch.randn(3, 4) - self.run_model_test(WhereMethod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + WhereMethod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) def test_data_dependent_zeros_factory(self): class ZerosFactory(torch.nn.Module): def forward(self, input): - return torch.cat([input, 
torch.zeros(input.size(0), 1).type_as(input)], dim=1) + return torch.cat( + [input, torch.zeros(input.size(0), 1).type_as(input)], dim=1 + ) x = torch.zeros(3, 4) - self.run_model_test(ZerosFactory(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ZerosFactory(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_implicit_expand(self): class ImplicitExpandExportMod(torch.nn.Module): @@ -1581,7 +2074,13 @@ def forward(self, x): return x + 1 x = torch.randn(3, 4) - self.run_model_test(ImplicitExpandExportMod(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ImplicitExpandExportMod(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_reduce_sum(self): class ReduceSumNegativeIndices(torch.nn.Module): @@ -1589,7 +2088,13 @@ def forward(self, x): return x.sum(-1) x = torch.randn(2, 3, 4) - self.run_model_test(ReduceSumNegativeIndices(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ReduceSumNegativeIndices(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) def test_reduce_sum_multi_dim(self): class ReduceSumMultipleAxes(torch.nn.Module): @@ -1597,7 +2102,13 @@ def forward(self, x): return x.sum(dim=(2, 3), keepdim=True) x = torch.randn(16, 3, 256, 256) - self.run_model_test(ReduceSumMultipleAxes(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ReduceSumMultipleAxes(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + use_gpu=False, + ) # InstanceNorm model (used in the subgraph) includes unused weights, # so skip this in TestCaffe2BackendEmbed @@ -1621,8 +2132,9 @@ def forward(self, x): return 1 - x x = torch.randn(1, 2) - self.run_model_test(RsubModel(), train=False, input=(x,), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + RsubModel(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedMinOpsetVersion(9) def test_isnan(self): @@ -1631,7 +2143,9 @@ def forward(self, input): return torch.isnan(input) x = torch.tensor([1.0, float("nan"), 2.0]) - self.run_model_test(IsNaNModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + IsNaNModel(), train=False, input=x, batch_size=BATCH_SIZE, use_gpu=False + ) @skipIfUnsupportedMinOpsetVersion(9) def test_scatter(self): @@ -1642,21 +2156,36 @@ def forward(self, input, indices, values): input = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 2], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) - self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ScatterModel(), + train=False, + input=(input, indices, values), + batch_size=BATCH_SIZE, + use_gpu=False, + ) input = torch.zeros(3, 4, 5, 6) indices = torch.tensor([[1, 0], [0, 2], [0, 1]], dtype=torch.int64) indices = indices.view(3, 2, 1, 1).expand(3, 2, 5, 6) values = torch.arange(3 * 2 * 5 * 6, dtype=torch.float32).view(3, 2, 5, 6) - self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ScatterModel(), + train=False, + input=(input, indices, values), + batch_size=BATCH_SIZE, + use_gpu=False, + ) input = torch.zeros(3, 4, 2) indices = torch.tensor([[[1, 0], [0, 2]], [[1, 1], [0, 1]], 
[[2, 1], [2, 2]]]) values = torch.arange(3 * 2 * 2, dtype=torch.float32).view(3, 2, 2) - self.run_model_test(ScatterModel(), train=False, input=(input, indices, values), - batch_size=BATCH_SIZE, use_gpu=False) + self.run_model_test( + ScatterModel(), + train=False, + input=(input, indices, values), + batch_size=BATCH_SIZE, + use_gpu=False, + ) @skipIfUnsupportedOpsetVersion([10]) def test_flatten(self): @@ -1698,7 +2227,9 @@ def forward(self, input, other): x = torch.randn(4, 4, requires_grad=True) y = torch.randn(4, 4, requires_grad=True) - self.run_model_test(MaxModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + self.run_model_test( + MaxModel(), train=False, input=(x, y), batch_size=BATCH_SIZE + ) def test_min(self): class MinModel(torch.nn.Module): @@ -1755,7 +2286,9 @@ def forward(self, input): return input.reshape_as(y) x = torch.randn(2, 3, requires_grad=True) - self.run_model_test(ReshapeAsModel(), train=False, input=x, batch_size=BATCH_SIZE) + self.run_model_test( + ReshapeAsModel(), train=False, input=x, batch_size=BATCH_SIZE + ) @skipIfUnsupportedOpsetVersion([10]) def test_narrow(self): @@ -1795,11 +2328,24 @@ def __init__(self): def forward(self, feature, im_info, anchors): bbox_deltas = self.conv(feature) a, b = torch.ops._caffe2.GenerateProposals( - feature, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, + feature, + bbox_deltas, + im_info, + anchors, + 2.0, + 6000, + 300, + 0.7, + 16, + True, + -90, + 90, + 1.0, + True, ) output = torch.ops._caffe2.RoIAlign( - feature, a, + feature, + a, order="NCHW", spatial_scale=1.0, pooled_h=3, @@ -1816,7 +2362,9 @@ def forward(self, feature, im_info, anchors): model = MyModel() with torch.no_grad(): - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=BATCH_SIZE) + self.run_model_test( + MyModel(), train=False, input=inputs, batch_size=BATCH_SIZE + ) def test_c2_roi_align(self): class MyModel(torch.nn.Module): @@ -1825,8 +2373,14 @@ def __init__(self): def forward(self, feature, rois): roi_feature = torch.ops._caffe2.RoIAlign( - feature, rois, order="NCHW", spatial_scale=1.0, - pooled_h=3, pooled_w=3, sampling_ratio=3, aligned=False, + feature, + rois, + order="NCHW", + spatial_scale=1.0, + pooled_h=3, + pooled_w=3, + sampling_ratio=3, + aligned=False, ) return roi_feature @@ -1852,8 +2406,20 @@ def __init__(self): def forward(self, scores, bbox_deltas, im_info, anchors): a, b = torch.ops._caffe2.GenerateProposals( - scores, bbox_deltas, im_info, anchors, - 2.0, 6000, 300, 0.7, 16, True, -90, 90, 1.0, True, + scores, + bbox_deltas, + im_info, + anchors, + 2.0, + 6000, + 300, + 0.7, + 16, + True, + -90, + 90, + 1.0, + True, ) return a, b @@ -1862,8 +2428,9 @@ def forward(self, scores, bbox_deltas, im_info, anchors): W = 8 img_count = 3 scores = torch.ones(img_count, A, H, W, dtype=torch.float32) - bbox_deltas = torch.linspace(0, 10, steps=img_count * 4 * A * H * W, - dtype=torch.float32) + bbox_deltas = torch.linspace( + 0, 10, steps=img_count * 4 * A * H * W, dtype=torch.float32 + ) bbox_deltas = bbox_deltas.view(img_count, 4 * A, H, W) im_info = torch.ones(img_count, 3, dtype=torch.float32) anchors = torch.ones(A, 4, dtype=torch.float32) @@ -1880,7 +2447,7 @@ def forward(self, rois, deltas, im_info): rois, deltas, im_info, - weights=[1., 1., 1., 1.], + weights=[1.0, 1.0, 1.0, 1.0], apply_scale=False, rotated=True, angle_bound_on=True, @@ -1905,7 +2472,9 @@ def forward(self, rois, deltas, im_info): im_info[:, 2] = 1.0 im_info = torch.zeros((batch_size, 3)) inputs = 
(torch.tensor(rois), torch.tensor(deltas), torch.tensor(im_info)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False + ) # BoxWithNMSLimits has requirements for the inputs, so randomly generated inputs # in Caffe2BackendTestEmbed doesn't work with this op. @@ -1964,8 +2533,14 @@ def forward(self, class_prob, pred_bbox, batch_splits): ) return a, b, c, d, e, f - inputs = (torch.tensor(class_prob), torch.tensor(pred_bbox), torch.tensor(batch_splits)) - self.run_model_test(MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False) + inputs = ( + torch.tensor(class_prob), + torch.tensor(pred_bbox), + torch.tensor(batch_splits), + ) + self.run_model_test( + MyModel(), train=False, input=inputs, batch_size=3, use_gpu=False + ) def test_c2_inference_lstm(self): num_layers = 4 @@ -1998,39 +2573,54 @@ def forward(self, lstm_in): bias=has_bias, num_layers=num_layers, ) - lstm_in = [ - torch.from_numpy(inputs), - torch.from_numpy(hx), - torch.from_numpy(hx), - ] + [param.detach() for param in torch_lstm._flat_weights] + lstm_in = ( + [ + torch.from_numpy(inputs), + torch.from_numpy(hx), + torch.from_numpy(hx), + ] + + [param.detach() for param in torch_lstm._flat_weights], + ) - self.run_model_test(MyModel(), train=False, input=lstm_in, batch_size=3, use_gpu=False) + self.run_model_test( + MyModel(), train=False, input=lstm_in, batch_size=3, use_gpu=False + ) def test_tuple_input_output(self): class TupleModel(torch.jit.ScriptModule): @torch.jit.script_method - def forward(self, a: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + def forward( + self, a: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, torch.Tensor]: return a x = (torch.randn(3, 4), torch.randn(4, 3)) - self.run_model_test(TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + TupleModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) def test_nested_tuple_input_output(self): class NestedTupleModel(torch.jit.ScriptModule): @torch.jit.script_method - def forward(self, a: torch.Tensor, b: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]) -> torch.Tensor: + def forward( + self, + a: torch.Tensor, + b: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + ) -> torch.Tensor: return a + b[0] + b[1][0] + b[1][1] x = torch.randn(4, 5) y = (torch.randn(4, 5), (torch.randn(4, 5), torch.randn(4, 5))) - self.run_model_test(NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + self.run_model_test( + NestedTupleModel(), train=False, input=(x, y), batch_size=BATCH_SIZE + ) def test_topk(self): class TopKModel(torch.nn.Module): def forward(self, input): return torch.topk(input, 3) - x = torch.arange(1., 6.) 
+ x = torch.arange(1.0, 6.0) self.run_model_test(TopKModel(), train=False, input=x, batch_size=BATCH_SIZE) def test_topk_script(self): @@ -2065,8 +2655,13 @@ def forward(self, input): return torch._dim_arange(input, 1) x = torch.ones(5, 6) - self.run_model_test(DimArange(), train=False, input=x, batch_size=BATCH_SIZE, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + self.run_model_test( + DimArange(), + train=False, + input=x, + batch_size=BATCH_SIZE, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_arange_end(self): @@ -2076,13 +2671,17 @@ def forward(self, a): return torch.arange(a.size(0), dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ArangeModel(torch.nn.Module): def forward(self, a): return torch.arange(a.size(0), dtype=torch.float).view(-1, 1) + a - self.run_model_test(ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_arange_start_end(self): @@ -2092,29 +2691,47 @@ def forward(self, a): return torch.arange(2, a.size(0) + 2, dtype=torch.float).view(-1, 1) + a x = torch.randn(3, 4, requires_grad=True) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ArangeModel(torch.nn.Module): def forward(self, a): return torch.arange(2, a.size(0) + 2, dtype=torch.float).view(-1, 1) + a - self.run_model_test(ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_arange_start_end_step(self): class ArangeScript(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, a): - return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) x = torch.randn(3, 4, requires_grad=True) - self.run_model_test(ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeScript(), train=False, input=(x,), batch_size=BATCH_SIZE + ) class ArangeModel(torch.nn.Module): def forward(self, a): - return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) - self.run_model_test(ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE) + self.run_model_test( + ArangeModel(), train=False, input=(x,), batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_size(self): @@ -2123,9 +2740,21 @@ def forward(self, input): return torch.arange(input.size(0)), torch.arange(input.size(-1)) x = torch.randn(5, 3, 2) - self.run_model_test(SizeModel(), train=False, input=(x,), batch_size=BATCH_SIZE, - input_names=['x'], dynamic_axes={'x': [0, 1, 2]}) - self.run_model_test(SizeModel(), train=False, input=(x,), batch_size=BATCH_SIZE, remained_onnx_input_idx=[]) + self.run_model_test( + SizeModel(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + 
input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) + self.run_model_test( + SizeModel(), + train=False, + input=(x,), + batch_size=BATCH_SIZE, + remained_onnx_input_idx=[], + ) def test_log2(self): class Log2Model(torch.nn.Module): @@ -2142,8 +2771,12 @@ def forward(self, input): x = torch.randn(2, 3, 4, requires_grad=False) model = DirichletModel() - onnxir, _ = do_export(model, x, keep_initializers_as_inputs=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + onnxir, _ = do_export( + model, + x, + keep_initializers_as_inputs=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) onnx_model = onnx.ModelProto.FromString(onnxir) prepared = c2.prepare(onnx_model) caffe2_out = prepared.run(inputs=[x.cpu().numpy()]) @@ -2156,8 +2789,12 @@ def forward(self, input): x = torch.randn(2, 3, 4, requires_grad=False) model = GammaModel() - onnxir, _ = do_export(model, x, keep_initializers_as_inputs=True, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + onnxir, _ = do_export( + model, + x, + keep_initializers_as_inputs=True, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) onnx_model = onnx.ModelProto.FromString(onnxir) prepared = c2.prepare(onnx_model) caffe2_out = prepared.run(inputs=[x.cpu().numpy()]) @@ -2176,8 +2813,12 @@ def forward(self, weight): return torch.multinomial(weight, 1) weight = torch.tensor([[0, 10, 0, 0], [0, 0, 100, 0]], dtype=torch.float) - self.run_model_test(Multinomial(), train=False, input=weight, batch_size=BATCH_SIZE) - self.run_model_test(MultinomialNoReplacement(), train=False, input=weight, batch_size=BATCH_SIZE) + self.run_model_test( + Multinomial(), train=False, input=weight, batch_size=BATCH_SIZE + ) + self.run_model_test( + MultinomialNoReplacement(), train=False, input=weight, batch_size=BATCH_SIZE + ) def test_prim_shape(self): x = torch.randn(4, 5, requires_grad=True) @@ -2189,7 +2830,10 @@ def view_by_prim_shape(x): class PrimShapeModel(torch.nn.Module): def forward(self, input): return view_by_prim_shape(input) - self.run_model_test(PrimShapeModel(), train=False, input=x, batch_size=BATCH_SIZE) + + self.run_model_test( + PrimShapeModel(), train=False, input=x, batch_size=BATCH_SIZE + ) def test_and(self): class AndModel(torch.nn.Module): @@ -2198,7 +2842,9 @@ def forward(self, x, y): x = torch.randint(0, 1, (3, 5), dtype=torch.bool) y = torch.randint(0, 1, (3, 5), dtype=torch.bool) - self.run_model_test(AndModel(), train=False, input=(x, y), batch_size=BATCH_SIZE) + self.run_model_test( + AndModel(), train=False, input=(x, y), batch_size=BATCH_SIZE + ) def test_or(self): class OrModel(torch.nn.Module): @@ -2233,16 +2879,21 @@ def forward(self, x): model = WhileModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) + self.run_model_test( + model, + train=False, + input=(inputs,), + batch_size=BATCH_SIZE, + ) def test_while_cond(self): class WhileModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x, a): - b = (a < 4) + b = a < 4 while b: a += b.to(torch.long) - b = (a < 4) + b = a < 4 return x + a model = WhileModel() @@ -2293,7 +2944,12 @@ def forward(self, x): model = NestedLoopsModel() inputs = torch.zeros(1, 2, 3, dtype=torch.long) - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) + self.run_model_test( + model, + train=False, + input=(inputs,), + batch_size=BATCH_SIZE, + ) def test_select(self): class 
SelectModel(torch.nn.Module): @@ -2302,7 +2958,7 @@ def forward(self, x): model = SelectModel() inputs = torch.randn(3, 2, 1) - self.run_model_test(model, train=False, input=(inputs, ), batch_size=BATCH_SIZE) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) def test_std(self): class StandardDeviation(torch.nn.Module): @@ -2330,14 +2986,18 @@ def forward(self, x): return x.masked_fill(mask, 2) x = torch.zeros(4, 2, 3, requires_grad=True) - self.run_model_test(MaskedFillModel(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MaskedFillModel(), input=(x,), train=False, batch_size=BATCH_SIZE + ) class MaskedFillModel2(torch.nn.Module): def forward(self, x): return x.masked_fill(x > 3, -1) x = torch.arange(16).view(2, 2, 4).to(torch.float32) - self.run_model_test(MaskedFillModel2(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MaskedFillModel2(), input=(x,), train=False, batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(8) def test_meshgrid(self): @@ -2368,22 +3028,42 @@ def forward(self, input): inputs = torch.randint(10, (2, 3)) model = RemainderModel() - self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE,) + self.run_model_test( + model, + train=False, + input=(inputs,), + batch_size=BATCH_SIZE, + ) def test_baddbmm(self): class MyModule(torch.nn.Module): def forward(self, input, batch1, batch2): - return torch.baddbmm(input, batch1, batch2, alpha=torch.tensor(5), beta=3.5) + return torch.baddbmm( + input, batch1, batch2, alpha=torch.tensor(5), beta=3.5 + ) + x = torch.randn(10, 3, 5) batch1 = torch.randn(10, 3, 4) batch2 = torch.randn(10, 4, 5) - self.run_model_test(MyModule(), input=(x, batch1, batch2), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + MyModule(), input=(x, batch1, batch2), train=False, batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_gelu(self): class GeluModel(torch.nn.Module): def forward(self, x): - return torch.nn.functional.gelu(x) + return torch.nn.functional.gelu(x, approximate="none") + + model = GeluModel() + inputs = torch.randn(2, 4, 5, 6, requires_grad=True) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_tanh_gelu(self): + class GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x, approximate="tanh") model = GeluModel() inputs = torch.randn(2, 4, 5, 6, requires_grad=True) @@ -2397,7 +3077,9 @@ def forward(self, input): return input.index_fill(2, index, -1) x = torch.randn(3, 4, 5, requires_grad=True) - self.run_model_test(IndexFillModel(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + IndexFillModel(), input=(x,), train=False, batch_size=BATCH_SIZE + ) @skipIfUnsupportedMinOpsetVersion(9) def test_index_copy(self): @@ -2408,19 +3090,37 @@ def forward(self, input): return input.index_copy(1, index, source) x = torch.randn(3, 4, 5, requires_grad=True) - self.run_model_test(IndexCopyModel(), input=(x, ), train=False, batch_size=BATCH_SIZE) + self.run_model_test( + IndexCopyModel(), input=(x,), train=False, batch_size=BATCH_SIZE + ) + # a bit of metaprogramming to set up all the rnn tests -def make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, - **extra_kwargs): - test_name = str("_".join([ - "test", name, layer[1], - bidirectional[1], initial_state[1], - variable_length[1], dropout[1] - ])) +def make_test( + name, + base, 
+ layer, + bidirectional, + initial_state, + variable_length, + dropout, + **extra_kwargs +): + test_name = str( + "_".join( + [ + "test", + name, + layer[1], + bidirectional[1], + initial_state[1], + variable_length[1], + dropout[1], + ] + ) + ) @unittest.skip("Disabled due to onnx optimizer deprecation") @skipIfUnsupportedOpsetVersion([10]) @@ -2433,53 +3133,54 @@ def f(self): initial_state=initial_state[0], packed_sequence=variable_length[0], dropout=dropout[0], - **extra_kwargs) + **extra_kwargs + ) f.__name__ = test_name setattr(TestCaffe2Backend_opset9, f.__name__, f) def setup_rnn_tests(): - layers_opts = [ - (1, "unilayer"), - (3, "trilayer") - ] - bidirectional_opts = [ - (False, "forward"), - (True, "bidirectional") - ] - initial_state_opts = [ - (True, "with_initial_state"), - (False, "no_initial_state") - ] + layers_opts = [(1, "unilayer"), (3, "trilayer")] + bidirectional_opts = [(False, "forward"), (True, "bidirectional")] + initial_state_opts = [(True, "with_initial_state"), (False, "no_initial_state")] variable_length_opts = [ (0, "without_sequence_lengths"), (1, "with_variable_length_sequences"), - (2, "with_batch_first_sequence_lengths") - ] - dropout_opts = [ - (0.2, "with_dropout"), - (0.0, "without_dropout") + (2, "with_batch_first_sequence_lengths"), ] + dropout_opts = [(0.2, "with_dropout"), (0.0, "without_dropout")] test_count = 0 - for (layer, bidirectional, initial_state, variable_length, dropout) in \ - itertools.product( - layers_opts, - bidirectional_opts, - initial_state_opts, - variable_length_opts, - dropout_opts, + for ( + layer, + bidirectional, + initial_state, + variable_length, + dropout, + ) in itertools.product( + layers_opts, + bidirectional_opts, + initial_state_opts, + variable_length_opts, + dropout_opts, ): for base, name, extra_kwargs in ( - ("elman", "elman_relu", {"nonlinearity": u"relu"}), - ("elman", "elman_tanh", {"nonlinearity": u"tanh"}), - ("lstm", "lstm", {}), - ("gru", "gru", {}) + ("elman", "elman_relu", {"nonlinearity": "relu"}), + ("elman", "elman_tanh", {"nonlinearity": "tanh"}), + ("lstm", "lstm", {}), + ("gru", "gru", {}), ): - make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, - **extra_kwargs) + make_test( + name, + base, + layer, + bidirectional, + initial_state, + variable_length, + dropout, + **extra_kwargs + ) test_count += 1 # sanity check that a representative example does exist @@ -2488,47 +3189,62 @@ def setup_rnn_tests(): # make sure no one accidentally disables all the tests without # noticing assert test_count == 192, test_count + + setup_rnn_tests() # add the same test suite as above, but switch embed_params=False # to embed_params=True -TestCaffe2BackendEmbed_opset9 = type(str("TestCaffe2BackendEmbed_opset9"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, embed_params=True)) +TestCaffe2BackendEmbed_opset9 = type( + str("TestCaffe2BackendEmbed_opset9"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True), +) # opset 7 tests -TestCaffe2Backend_opset7 = type(str("TestCaffe2Backend_opset7"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, opset_version=7)) -TestCaffe2BackendEmbed_opset7 = type(str("TestCaffe2BackendEmbed_opset7"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, - embed_params=True, opset_version=7)) +TestCaffe2Backend_opset7 = type( + str("TestCaffe2Backend_opset7"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, opset_version=7), +) 
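# A minimal, self-contained sketch of the type()-based idiom preserved by the
# reformatted hunks above: one TestCase subclass is cloned via its class dict,
# overriding attributes such as opset_version / embed_params so the same tests
# run under several export configurations. Class names below are hypothetical,
# not the ones from the PyTorch suite.
import unittest


class BackendTests_opset9(unittest.TestCase):
    opset_version = 9
    embed_params = False

    def test_configuration_is_set(self):
        # Each generated class sees its own overridden class attributes.
        self.assertIsInstance(self.opset_version, int)


# Clone the suite: copy the class dict and override selected attributes.
BackendTests_opset7 = type(
    "BackendTests_opset7",
    (unittest.TestCase,),
    dict(BackendTests_opset9.__dict__, opset_version=7),
)
BackendTestsEmbed_opset7 = type(
    "BackendTestsEmbed_opset7",
    (unittest.TestCase,),
    dict(BackendTests_opset9.__dict__, opset_version=7, embed_params=True),
)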
+TestCaffe2BackendEmbed_opset7 = type( + str("TestCaffe2BackendEmbed_opset7"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=7), +) # opset 8 tests -TestCaffe2Backend_opset8 = type(str("TestCaffe2Backend_opset8"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, opset_version=8)) -TestCaffe2BackendEmbed_opset8 = type(str("TestCaffe2BackendEmbed_opset8"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, - embed_params=True, opset_version=8)) +TestCaffe2Backend_opset8 = type( + str("TestCaffe2Backend_opset8"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, opset_version=8), +) +TestCaffe2BackendEmbed_opset8 = type( + str("TestCaffe2BackendEmbed_opset8"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=8), +) # opset 10 tests -TestCaffe2Backend_opset10 = type(str("TestCaffe2Backend_opset10"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, opset_version=10)) - -TestCaffe2BackendEmbed_opset10 = type(str("TestCaffe2BackendEmbed_opset10"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, - embed_params=True, opset_version=10)) +TestCaffe2Backend_opset10 = type( + str("TestCaffe2Backend_opset10"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, opset_version=10), +) + +TestCaffe2BackendEmbed_opset10 = type( + str("TestCaffe2BackendEmbed_opset10"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=10), +) # add the same test suite as above, but switch embed_params=False # to embed_params=True -TestCaffe2BackendEmbed_opset9_new_jit_API = type(str("TestCaffe2BackendEmbed_opset9_new_jit_API"), - (unittest.TestCase,), - dict(TestCaffe2Backend_opset9.__dict__, embed_params=True)) +TestCaffe2BackendEmbed_opset9_new_jit_API = type( + str("TestCaffe2BackendEmbed_opset9_new_jit_API"), + (unittest.TestCase,), + dict(TestCaffe2Backend_opset9.__dict__, embed_params=True), +) if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_pytorch_onnx_caffe2_quantized.py b/test/onnx/test_pytorch_onnx_caffe2_quantized.py index b427b85a2b56..2bd8ac54941f 100644 --- a/test/onnx/test_pytorch_onnx_caffe2_quantized.py +++ b/test/onnx/test_pytorch_onnx_caffe2_quantized.py @@ -1,19 +1,21 @@ # Owner(s): ["module: unknown"] -import numpy as np -import unittest -import torch.onnx -import torch.nn as nn -import torch.nn.quantized as nnq import io +import unittest +import numpy as np import onnx -import caffe2.python.onnx.backend as c2 -class TestQuantizedOps(unittest.TestCase): +import caffe2.python.onnx.backend as c2 +import torch.nn as nn +import torch.nn.quantized as nnq +import torch.onnx - def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxed_check=False): +class TestQuantizedOps(unittest.TestCase): + def generic_test( + self, model, sample_inputs, input_names=None, decimal=3, relaxed_check=False + ): torch.backends.quantized.engine = "qnnpack" pt_inputs = tuple(torch.from_numpy(x) for x in sample_inputs) model.qconfig = torch.ao.quantization.get_default_qconfig("qnnpack") @@ -30,8 +32,15 @@ def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxe output = q_model(*pt_inputs) f = io.BytesIO() - torch.onnx.export(q_model, pt_inputs, f, input_names=input_names, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + torch.onnx.export( + q_model, + pt_inputs, + f, + 
input_names=input_names, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + # Caffe2 doesn't support newer opset versions + opset_version=9, + ) f.seek(0) onnx_model = onnx.load(f) caffe_res = c2.run_model(onnx_model, dict(zip(input_names, sample_inputs)))[0] @@ -45,10 +54,13 @@ def generic_test(self, model, sample_inputs, input_names=None, decimal=3, relaxe # This check had to be changed to account for changes in # qnnpack's requant logic. - np.testing.assert_(max_diff <= 1, "Maximum absolute difference must be less than 1") + np.testing.assert_( + max_diff <= 1, "Maximum absolute difference must be less than 1" + ) else: - np.testing.assert_almost_equal(output.detach().numpy(), caffe_res, decimal=decimal) - + np.testing.assert_almost_equal( + output.detach().numpy(), caffe_res, decimal=decimal + ) def generic_unary_test(self, op): class QModule(torch.nn.Module): @@ -65,7 +77,6 @@ def forward(self, x): x = np.random.random((1, 2)).astype("float32") self.generic_test(QModule(op), (x,), input_names=["x"]) - def test_quantized_add(self): class QAddModule(torch.nn.Module): def __init__(self): @@ -93,8 +104,15 @@ def export_to_onnx(self, model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() - torch.onnx.export(model, input, f, input_names=input_names, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + torch.onnx.export( + model, + input, + f, + input_names=input_names, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + # Caffe2 doesn't support newer opset versions + opset_version=9, + ) f.seek(0) onnx_model = onnx.load(f) @@ -105,7 +123,9 @@ class LinearModel(torch.nn.Module): def __init__(self): super(LinearModel, self).__init__() self.qconfig = torch.ao.quantization.default_qconfig - self.fc1 = torch.ao.quantization.QuantWrapper(torch.nn.Linear(5, 10).to(dtype=torch.float)) + self.fc1 = torch.ao.quantization.QuantWrapper( + torch.nn.Linear(5, 10).to(dtype=torch.float) + ) def forward(self, x): x = self.fc1(x) @@ -131,18 +151,23 @@ def forward(self, x): # Permute pytorch output to NHWC # This check had to be changed to account for changes in # qnnpack's requant logic. - np.testing.assert_(max_diff <= 1, "Maximum absolute difference must be less than 1") + np.testing.assert_( + max_diff <= 1, "Maximum absolute difference must be less than 1" + ) def test_qconv_model(self): class ConvModel(torch.nn.Module): def __init__(self): super(ConvModel, self).__init__() self.qconfig = torch.ao.quantization.default_qconfig - self.fc1 = torch.ao.quantization.QuantWrapper(torch.nn.Conv2d(3, 5, 2, bias=True).to(dtype=torch.float)) + self.fc1 = torch.ao.quantization.QuantWrapper( + torch.nn.Conv2d(3, 5, 2, bias=True).to(dtype=torch.float) + ) def forward(self, x): x = self.fc1(x) return x + torch.backends.quantized.engine = "qnnpack" qconfig = torch.ao.quantization.default_qconfig model = ConvModel() @@ -164,7 +189,9 @@ def forward(self, x): # Permute pytorch output to NHWC # This check had to be changed to account for changes in # qnnpack's requant logic. 
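# Relaxed check: only require the maximum absolute difference between the
# PyTorch and Caffe2 outputs to stay within 1.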
- np.testing.assert_(max_diff <= 1, "Maximum absolute difference must be less than 1") + np.testing.assert_( + max_diff <= 1, "Maximum absolute difference must be less than 1" + ) def test_upsample(self): class QUpsampleModule(torch.nn.Module): @@ -174,7 +201,9 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x): - res = torch.nn.quantized.functional.interpolate(self.quant1(x), size=[6, 8], mode="nearest") + res = torch.nn.quantized.functional.interpolate( + self.quant1(x), size=[6, 8], mode="nearest" + ) return self.dequant(res) x = np.random.rand(1, 2, 3, 4).astype("float32") @@ -188,11 +217,15 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x): - res = torch.nn.functional.avg_pool2d(self.quant1(x), kernel_size=2, stride=1, padding=0) + res = torch.nn.functional.avg_pool2d( + self.quant1(x), kernel_size=2, stride=1, padding=0 + ) return self.dequant(res) x = np.random.rand(1, 2, 8, 8).astype("float32") - self.generic_test(QAvgPool2dModule(), (x,), input_names=["x"], relaxed_check=True) + self.generic_test( + QAvgPool2dModule(), (x,), input_names=["x"], relaxed_check=True + ) def test_reshape(self): class QReshapeModule(torch.nn.Module): @@ -231,12 +264,21 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x, y): - res = torch.ops.quantized.cat([self.quant1(x), self.quant1(y)], dim=1, scale=1.0, zero_point=0) + res = torch.ops.quantized.cat( + [self.quant1(x), self.quant1(y)], dim=1, scale=1.0, zero_point=0 + ) return self.dequant(res) x = np.random.rand(1, 2, 3, 4).astype("float32") y = np.random.rand(1, 4, 3, 4).astype("float32") - self.generic_test(QConcatModule(), (x, y,), input_names=["x", "y"]) + self.generic_test( + QConcatModule(), + ( + x, + y, + ), + input_names=["x", "y"], + ) def test_max_pool2d(self): class QMaxPool2dModule(torch.nn.Module): @@ -246,7 +288,9 @@ def __init__(self): self.dequant = torch.ao.quantization.DeQuantStub() def forward(self, x): - res = torch.nn.functional.max_pool2d(self.quant1(x), kernel_size=2, stride=1, padding=0) + res = torch.nn.functional.max_pool2d( + self.quant1(x), kernel_size=2, stride=1, padding=0 + ) return self.dequant(res) x = np.random.rand(1, 2, 8, 8).astype("float32") @@ -288,7 +332,7 @@ def __init__(self): super().__init__( nn.Conv2d(3, 3, 1, 1, bias=False), nn.BatchNorm2d(3), - nn.ReLU(inplace=False) + nn.ReLU(inplace=False), ) class ModelWithClassifierHead(nn.Module): @@ -318,14 +362,20 @@ def forward(self, x): return x model = ModelWithClassifierHead().eval() - torch.ao.quantization.fuse_modules(model, [["conv1", "relu1"] , - ["features.0.0", "features.0.1", "features.0.2"], - ["features.1.0", "features.1.1", "features.1.2"], - ["features.2.0", "features.2.1", "features.2.2"]], inplace=True) - + torch.ao.quantization.fuse_modules( + model, + [ + ["conv1", "relu1"], + ["features.0.0", "features.0.1", "features.0.2"], + ["features.1.0", "features.1.1", "features.1.2"], + ["features.2.0", "features.2.1", "features.2.2"], + ], + inplace=True, + ) x = np.random.rand(1, 3, 10, 10).astype("float32") self.generic_test(model, (x,), input_names=["x"], relaxed_check=True) + if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py new file mode 100644 index 000000000000..429e3ba82ed6 --- /dev/null +++ b/test/onnx/test_pytorch_onnx_no_runtime.py @@ -0,0 +1,114 @@ +# Owner(s): ["module: onnx"] + +"""Tests for onnx export that don't run 
the exported model.""" + +import io +import unittest +from typing import Optional, Type + +import onnx + +import torch +from torch import Tensor +from torch.onnx import symbolic_helper +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, +) + + +class TestOptionalOutput(unittest.TestCase): + # TODO: Move these tests to test_pytorch_onnx_onnxruntime once + # ONNX Runtime 1.11 is released and supports opset 16. + + class IfNoneInput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = None + if x.size(0) > 1: + y = x + return y + + class IfNoneOutput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = x + if x.size(0) > 1: + y = None + return y + + class LoopNoneInput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = None + for _ in range(x.size(0)): + y = x + return y + + class LoopNoneOutput(torch.nn.Module): + def forward(self, x) -> Optional[Tensor]: + y: Optional[Tensor] = x + for _ in range(x.size(0)): + y = None + return y + + @parametrize( + "module_class", + (IfNoneInput, IfNoneOutput, LoopNoneInput, LoopNoneOutput), + name_fn=lambda module_class: module_class.__name__, + ) + @parametrize("x_size", (0, 1), name_fn=lambda x_size: str(x_size)) + def test_optional_output(self, module_class: Type[torch.nn.Module], x_size: int): + # Need scripting to preserve control flow for this test to be meaningful. + model = torch.jit.script(module_class()) + f = io.BytesIO() + x = torch.ones(x_size) + dynamic_axis_name = "condition" + torch.onnx.export( + model, + (x,), + f, + opset_version=15, + # Ensure condition is not constant + dynamic_axes={"x": {0: dynamic_axis_name}}, + input_names=["x"], + ) + exported = onnx.load_from_string(f.getvalue()) + expected_elem_type = symbolic_helper.scalar_type_to_onnx[ + symbolic_helper.scalar_type_to_pytorch_type.index(x.dtype) + ].value + expected_output_type = onnx.helper.make_optional_type_proto( + onnx.helper.make_tensor_type_proto(expected_elem_type, (dynamic_axis_name,)) + ) + self.assertEqual(expected_output_type, exported.graph.output[0].type) + for node in exported.graph.node: + # Both branches output types should match. 
+ if node.op_type == "If": + for attr in node.attribute: + if attr.name in ("then_branch", "else_branch"): + self.assertEqual(expected_output_type, attr.g.output[0].type) + + def test_uninitialized_optional(self): + class Module(torch.nn.Module): + def forward(self, y: Optional[Tensor]) -> Optional[Tensor]: + if y is not None: + if y.shape[1] < 5: + if y.size(0) == 1: + y = y + 4 + else: + return y + return y + + y = torch.ones((3, 4), dtype=torch.int) + torch.onnx.export( + torch.jit.script(Module()), + y, + io.BytesIO(), + opset_version=15, + dynamic_axes={"y": {0: "y0", 1: "y1"}}, + input_names=["y"], + ) + + +instantiate_parametrized_tests(TestOptionalOutput) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index af4c6b9ec5bb..a3bb26571c59 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1,65 +1,84 @@ # Owner(s): ["module: onnx"] -import unittest -import onnxruntime -import torch -import torchvision - -import numpy as np +import copy import io import itertools -import copy import os import random +import unittest +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union import model_defs.word_language_model as word_language_model +import numpy as np import onnx - -import torch.nn.functional as F -from torch.nn.utils import rnn as rnn_utils -from model_defs.lstm_flattening_result import (LstmFlatteningResultWithSeqLength, - LstmFlatteningResultWithoutSeqLength) -from model_defs.rnn_model_with_packed_sequence import (RnnModelWithPackedSequence, - RnnModelWithPackedSequenceWithState, - RnnModelWithPackedSequenceWithoutState) -from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, skipIfUnsupportedOpsetVersion, - skipIfNoLapack, disableScriptTest, skipIfONNXShapeInference, - skipIfUnsupportedMaxOpsetVersion, skipForAllOpsetVersions) -from test_pytorch_common import BATCH_SIZE -from test_pytorch_common import RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE -from typing import List, Tuple, Optional, Dict -from torch import Tensor - +import onnxruntime +import torchvision +from model_defs.lstm_flattening_result import ( + LstmFlatteningResultWithoutSeqLength, + LstmFlatteningResultWithSeqLength, +) +from model_defs.rnn_model_with_packed_sequence import ( + RnnModelWithPackedSequence, + RnnModelWithPackedSequenceWithoutState, + RnnModelWithPackedSequenceWithState, +) +from test_pytorch_common import ( + BATCH_SIZE, + RNN_BATCH_SIZE, + RNN_HIDDEN_SIZE, + RNN_INPUT_SIZE, + RNN_SEQUENCE_LENGTH, + skipIfNoLapack, + skipIfUnsupportedMaxOpsetVersion, + skipIfUnsupportedMinOpsetVersion, + skipIfUnsupportedOpsetVersion, + skipScriptTest, +) from torchvision import ops +from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, TwoMLPHead from torchvision.models.detection.image_list import ImageList -from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.models.detection.rpn import AnchorGenerator, RPNHead, RegionProposalNetwork from torchvision.models.detection.roi_heads import RoIHeads -from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, TwoMLPHead -from collections import OrderedDict +from torchvision.models.detection.rpn import ( + AnchorGenerator, + RegionProposalNetwork, + RPNHead, +) +from torchvision.models.detection.transform import GeneralizedRCNNTransform +import torch +import torch.nn.functional 
as F +from torch import Tensor +from torch.nn.utils import rnn as rnn_utils from torch.nn.utils.rnn import PackedSequence -from torch.onnx import CheckerError, register_custom_op_symbolic, unregister_custom_op_symbolic +from torch.onnx import ( + CheckerError, + register_custom_op_symbolic, + unregister_custom_op_symbolic, +) from torch.onnx.symbolic_helper import _unimplemented +from torch.onnx.utils import unpack_quantized_tensor + +_ORT_PROVIDERS = ["CPUExecutionProvider"] def flatten_tuples(elem): - tup = [] + flattened = [] for t in elem: - if isinstance(t, (tuple)): - tup += flatten_tuples(t) + if isinstance(t, tuple): + flattened.extend(flatten_tuples(t)) else: - tup += [t] - return tup + flattened.append(t) + return flattened def to_numpy(elem): - if isinstance(elem, torch.Tensor): + if isinstance(elem, Tensor): if elem.requires_grad: return elem.detach().cpu().numpy() else: return elem.cpu().numpy() - elif isinstance(elem, list) or isinstance(elem, tuple): + elif isinstance(elem, (list, tuple)): return [to_numpy(inp) for inp in elem] elif isinstance(elem, bool): return np.array(elem, dtype=bool) @@ -68,72 +87,123 @@ def to_numpy(elem): elif isinstance(elem, float): return np.array(elem, dtype=float) elif isinstance(elem, dict): - dict_ = [] + flattened = [] for k in elem: - dict_ += [to_numpy(k)] + [to_numpy(elem[k])] - return dict_ - else: - return RuntimeError("Input has unknown type.") - - -def convert_to_onnx(model, input=None, opset_version=9, do_constant_folding=True, - keep_initializers_as_inputs=True, dynamic_axes=None, - input_names=None, output_names=None, - fixed_batch_size=False, training=None, - onnx_shape_inference=True): - # export the model to ONNX + flattened += [to_numpy(k)] + [to_numpy(elem[k])] + return flattened + return elem + + +def convert_to_onnx( + model, + input=None, + opset_version=9, + do_constant_folding=True, + keep_initializers_as_inputs=True, + dynamic_axes=None, + input_names=None, + output_names=None, + fixed_batch_size=False, + training=None, + verbose=False, +): f = io.BytesIO() input_copy = copy.deepcopy(input) - torch.onnx._export(model, input_copy, f, - opset_version=opset_version, - do_constant_folding=do_constant_folding, - keep_initializers_as_inputs=keep_initializers_as_inputs, - dynamic_axes=dynamic_axes, - input_names=input_names, output_names=output_names, - fixed_batch_size=fixed_batch_size, training=training, - onnx_shape_inference=onnx_shape_inference) + torch.onnx._export( + model, + input_copy, + f, + opset_version=opset_version, + do_constant_folding=do_constant_folding, + keep_initializers_as_inputs=keep_initializers_as_inputs, + dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, + fixed_batch_size=fixed_batch_size, + training=training, + verbose=verbose, + ) # compute onnxruntime output prediction so = onnxruntime.SessionOptions() # suppress ort warnings. # 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2. 
so.log_severity_level = 3 - ort_sess = onnxruntime.InferenceSession(f.getvalue(), so) + ort_sess = onnxruntime.InferenceSession(f.getvalue(), so, providers=_ORT_PROVIDERS) return ort_sess def inline_flatten_list(inputs, res_list): for i in inputs: - res_list.append(i) if not isinstance(i, (list, tuple)) else inline_flatten_list(i, res_list) + res_list.append(i) if not isinstance(i, (list, tuple)) else inline_flatten_list( + i, res_list + ) return res_list -def run_ort(ort_sess, input): - input = flatten_tuples(input) - input = to_numpy(input) - ort_inputs = dict((ort_sess.get_inputs()[i].name, input) for i, input in enumerate(input)) +def unpack_to_numpy(values): + value_unpacked = [] + for value in values: + value_unpacked.extend(unpack_quantized_tensor(value)) + return [to_numpy(v) for v in value_unpacked] + + +def run_ort(ort_sess, inputs): + kw_inputs = {} + if inputs and isinstance(inputs[-1], dict): + kw_inputs = inputs[-1] + inputs = inputs[:-1] + inputs = unpack_to_numpy(flatten_tuples(inputs)) + ort_inputs = {} + for input_name, input in kw_inputs.items(): + ort_inputs[input_name] = to_numpy(input) + inputs = to_numpy(inputs) + ort_sess_inputs = ort_sess.get_inputs() + for i, input in enumerate(inputs): + if i == len(ort_sess_inputs) or ort_sess_inputs[i].name in ort_inputs: + raise ValueError( + f"got too many positional inputs. inputs: {inputs}. kw_inputs: {kw_inputs}" + ) + ort_inputs[ort_sess_inputs[i].name] = input ort_outs = ort_sess.run(None, ort_inputs) return inline_flatten_list(ort_outs, []) def ort_compare_with_pytorch(ort_outs, output, rtol, atol): output, _ = torch.jit._flatten(output) - outputs = [to_numpy(outp) for outp in output] + outputs = unpack_to_numpy(output) # compare onnxruntime and PyTorch results assert len(outputs) == len(ort_outs), "number of outputs differ" # compare onnxruntime and PyTorch results - [np.testing.assert_allclose(out, ort_out, rtol=rtol, atol=atol) for out, ort_out in zip(outputs, ort_outs)] + [ + np.testing.assert_allclose(out, ort_out, rtol=rtol, atol=atol) + for out, ort_out in zip(outputs, ort_outs) + ] -def run_model_test(self, model, batch_size=2, state_dict=None, - input=None, use_gpu=True, rtol=0.001, atol=1e-7, - do_constant_folding=True, dynamic_axes=None, - test_with_inputs=None, input_names=None, - output_names=None, fixed_batch_size=False, - dict_check=True, training=None, - remained_onnx_input_idx=None, flatten=True): +def run_model_test( + self, + model, + batch_size=2, + state_dict=None, + input=None, + use_gpu=True, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + dynamic_axes=None, + test_with_inputs=None, + input_names=None, + output_names=None, + fixed_batch_size=False, + dict_check=True, + training=None, + remained_onnx_input_idx=None, + flatten=True, + verbose=False, +): if training is not None and training == torch.onnx.TrainingMode.TRAINING: model.train() elif training is None or training == torch.onnx.TrainingMode.EVAL: @@ -141,12 +211,10 @@ def run_model_test(self, model, batch_size=2, state_dict=None, if input is None: input = torch.randn(batch_size, 3, 224, 224, requires_grad=True) with torch.no_grad(): - if isinstance(input, torch.Tensor): + if isinstance(input, (Tensor, dict)): input = (input,) # In-place operators will update input tensor data as well. # Thus inputs are replicated before every forward call. 
- if isinstance(input, dict): - input = (input,) input_args = copy.deepcopy(input) input_kwargs = {} if dict_check and isinstance(input_args[-1], dict): @@ -157,18 +225,25 @@ def run_model_test(self, model, batch_size=2, state_dict=None, output = model_copy(*input_args, **input_kwargs) except Exception: output = model(*input_args, **input_kwargs) - if isinstance(output, torch.Tensor): + if isinstance(output, Tensor): output = (output,) if not dict_check and isinstance(input[-1], dict): input = input + ({},) - ort_sess = convert_to_onnx(model, input=input, opset_version=self.opset_version, - do_constant_folding=do_constant_folding, - keep_initializers_as_inputs=self.keep_initializers_as_inputs, - dynamic_axes=dynamic_axes, input_names=input_names, - output_names=output_names, fixed_batch_size=fixed_batch_size, training=training, - onnx_shape_inference=self.onnx_shape_inference) + ort_sess = convert_to_onnx( + model, + input=input, + opset_version=self.opset_version, + do_constant_folding=do_constant_folding, + keep_initializers_as_inputs=self.keep_initializers_as_inputs, + dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, + fixed_batch_size=fixed_batch_size, + training=training, + verbose=verbose, + ) # compute onnxruntime output prediction if remained_onnx_input_idx is not None: input_onnx = [] @@ -179,20 +254,21 @@ def run_model_test(self, model, batch_size=2, state_dict=None, input_copy = copy.deepcopy(input) if flatten: input_copy, _ = torch.jit._flatten(input_copy) - + elif input_copy and input_copy[-1] == {}: + # Handle empty kwargs (normally removed by flatten). + input_copy = input_copy[:-1] ort_outs = run_ort(ort_sess, input_copy) ort_compare_with_pytorch(ort_outs, output, rtol, atol) - # if additional test inputs are provided run the onnx # model with these inputs and check the outputs if test_with_inputs is not None: for test_input in test_with_inputs: - if isinstance(test_input, torch.Tensor): + if isinstance(test_input, Tensor): test_input = (test_input,) test_input_copy = copy.deepcopy(test_input) output = model(*test_input_copy) - if isinstance(output, torch.Tensor): + if isinstance(output, Tensor): output = (output,) if remained_onnx_input_idx is not None: test_input_onnx = [] @@ -230,13 +306,20 @@ def _init_test_rpn(): rpn_score_thresh = 0.0 rpn = RegionProposalNetwork( - rpn_anchor_generator, rpn_head, - rpn_fg_iou_thresh, rpn_bg_iou_thresh, - rpn_batch_size_per_image, rpn_positive_fraction, - rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh, - score_thresh=rpn_score_thresh) + rpn_anchor_generator, + rpn_head, + rpn_fg_iou_thresh, + rpn_bg_iou_thresh, + rpn_batch_size_per_image, + rpn_positive_fraction, + rpn_pre_nms_top_n, + rpn_post_nms_top_n, + rpn_nms_thresh, + score_thresh=rpn_score_thresh, + ) return rpn + def _init_test_roi_heads_faster_rcnn(): out_channels = 256 num_classes = 91 @@ -251,39 +334,74 @@ def _init_test_roi_heads_faster_rcnn(): box_detections_per_img = 100 box_roi_pool = ops.MultiScaleRoIAlign( - featmap_names=["0", "1", "2", "3"], - output_size=7, - sampling_ratio=2) + featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2 + ) resolution = box_roi_pool.output_size[0] representation_size = 1024 - box_head = TwoMLPHead( - out_channels * resolution ** 2, - representation_size) + box_head = TwoMLPHead(out_channels * resolution**2, representation_size) representation_size = 1024 - box_predictor = FastRCNNPredictor( - representation_size, - num_classes) + box_predictor = 
FastRCNNPredictor(representation_size, num_classes) roi_heads = RoIHeads( - box_roi_pool, box_head, box_predictor, - box_fg_iou_thresh, box_bg_iou_thresh, - box_batch_size_per_image, box_positive_fraction, + box_roi_pool, + box_head, + box_predictor, + box_fg_iou_thresh, + box_bg_iou_thresh, + box_batch_size_per_image, + box_positive_fraction, bbox_reg_weights, - box_score_thresh, box_nms_thresh, box_detections_per_img) + box_score_thresh, + box_nms_thresh, + box_detections_per_img, + ) return roi_heads + +def _construct_tensor_for_quantization_test( + shape: Tuple[int, ...], + offset: Optional[Union[int, float]] = None, + max_val: Optional[Union[int, float]] = None, +) -> torch.Tensor: + """Helper function to generate weights and test inputs in a deterministic way. + + Due to difference in implementation details between PyTorch and ONNXRuntime, randomly generated + test data for quantization tests can be flaky. To help stablize the test, this helper function is + used to generate weights and test inputs in a deterministic way. + + Args: + shape (Tuple[int]): Shape for tensor to construct. + offset (Optional[Union[int, float]]): Offset to be added to the generated tensor. + max_val (Optional[Union[int, float]]): If any element within tensor has a larger absolute value than + max_val, the tensor will be scaled by max_val / tensor.abs().max(). This step is done after + applying offset. + """ + tensor = torch.arange(np.prod(shape), dtype=torch.float).view(shape) + if offset is not None: + tensor = tensor + offset + if max_val is not None and tensor.abs().max() > max_val: + tensor = tensor * max_val / tensor.abs().max() + return tensor + + def set_rng_seed(seed): torch.manual_seed(seed) random.seed(seed) np.random.seed(seed) -class TestONNXRuntime(unittest.TestCase): - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version + +class _TestONNXRuntime: + """Abstract base class for test cases. + + Intentionally not a sub-class of unittest.TestCase so that unittest / pytest + don't run it directly. unitest.TestCase is mixed in as another base class when + creating concrete sub-types. See MakeTestCase(). + """ + + opset_version = -1 # Sub-classes must override keep_initializers_as_inputs = True # For IR version 3 type export. - onnx_shape_inference = True def setUp(self): torch.manual_seed(0) @@ -298,37 +416,78 @@ def setUp(self): # This mostly happens in unit test, where we widely use torch.size or torch.shape. # So the output is only dependent on the input shape, not value. # remained_onnx_input_idx is used to indicate which pytorch model input idx is remained in ONNX model. 
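# run_test exports the model twice where possible: once after torch.jit.script
# (when script tests are enabled) and once via tracing, comparing ONNX Runtime
# results against eager PyTorch in both cases.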
- def run_test(self, model, input, rtol=1e-3, atol=1e-7, do_constant_folding=True, - batch_size=2, use_gpu=True, dynamic_axes=None, test_with_inputs=None, - input_names=None, output_names=None, fixed_batch_size=False, dict_check=True, - training=None, remained_onnx_input_idx=None): + def run_test( + self, + model, + input, + rtol=1e-3, + atol=1e-7, + do_constant_folding=True, + batch_size=2, + use_gpu=True, + dynamic_axes=None, + test_with_inputs=None, + input_names=None, + output_names=None, + fixed_batch_size=False, + dict_check=True, + training=None, + remained_onnx_input_idx=None, + verbose=False, + ): def _run_test(m, remained_onnx_input_idx, flatten=True): - return run_model_test(self, m, batch_size=batch_size, - input=input, use_gpu=use_gpu, rtol=rtol, atol=atol, - do_constant_folding=do_constant_folding, - dynamic_axes=dynamic_axes, test_with_inputs=test_with_inputs, - input_names=input_names, output_names=output_names, - fixed_batch_size=fixed_batch_size, dict_check=dict_check, - training=training, remained_onnx_input_idx=remained_onnx_input_idx, - flatten=flatten) + return run_model_test( + self, + m, + batch_size=batch_size, + input=input, + use_gpu=use_gpu, + rtol=rtol, + atol=atol, + do_constant_folding=do_constant_folding, + dynamic_axes=dynamic_axes, + test_with_inputs=test_with_inputs, + input_names=input_names, + output_names=output_names, + fixed_batch_size=fixed_batch_size, + dict_check=dict_check, + training=training, + remained_onnx_input_idx=remained_onnx_input_idx, + flatten=flatten, + verbose=verbose, + ) if isinstance(remained_onnx_input_idx, dict): - scripting_remained_onnx_input_idx = remained_onnx_input_idx['scripting'] - tracing_remained_onnx_input_idx = remained_onnx_input_idx['tracing'] + scripting_remained_onnx_input_idx = remained_onnx_input_idx["scripting"] + tracing_remained_onnx_input_idx = remained_onnx_input_idx["tracing"] else: scripting_remained_onnx_input_idx = remained_onnx_input_idx tracing_remained_onnx_input_idx = remained_onnx_input_idx - if self.is_script_test_enabled and not isinstance(model, torch.jit.ScriptModule): - script_model = torch.jit.script(model) - _run_test(script_model, scripting_remained_onnx_input_idx, flatten=False) + is_script = isinstance( + model, (torch.jit.ScriptModule, torch.jit.ScriptFunction) + ) - _run_test(model, tracing_remained_onnx_input_idx) + if self.is_script_test_enabled: + script_model = model if is_script else torch.jit.script(model) + _run_test(script_model, scripting_remained_onnx_input_idx, flatten=False) - def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, - do_constant_folding=True, dynamic_axes=None, - input_names=None, output_names=None, - ort_optim_on=True, training=None): + if not is_script: + _run_test(model, tracing_remained_onnx_input_idx) + + def run_model_test_with_external_data( + self, + model, + input, + rtol=0.001, + atol=1e-7, + do_constant_folding=True, + dynamic_axes=None, + input_names=None, + output_names=None, + ort_optim_on=True, + training=None, + ): import os import tempfile @@ -337,41 +496,51 @@ def run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, elif training is None or training == torch.onnx.TrainingMode.EVAL: model.eval() with torch.no_grad(): - if isinstance(input, torch.Tensor): + if isinstance(input, Tensor): input = (input,) # In-place operators will update input tensor data as well. # Thus inputs are replicated before every forward call. 
input_copy = copy.deepcopy(input) output = model(*input_copy) - if isinstance(output, torch.Tensor): + if isinstance(output, Tensor): output = (output,) # export the model to ONNX with tempfile.TemporaryDirectory() as tmpdirname: model_file_name = os.path.join(tmpdirname, "model.onnx") input_copy = copy.deepcopy(input) - torch.onnx.export(model, input_copy, model_file_name, - opset_version=self.opset_version, - verbose=False, - do_constant_folding=do_constant_folding, - keep_initializers_as_inputs=self.keep_initializers_as_inputs, - dynamic_axes=dynamic_axes, - input_names=input_names, output_names=output_names) + torch.onnx.export( + model, + input_copy, + model_file_name, + opset_version=self.opset_version, + verbose=False, + do_constant_folding=do_constant_folding, + keep_initializers_as_inputs=self.keep_initializers_as_inputs, + dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, + ) # compute onnxruntime output prediction ort_sess_opt = onnxruntime.SessionOptions() - ort_sess_opt.graph_optimization_level = \ - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED if ort_optim_on else \ - onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL + ort_sess_opt.graph_optimization_level = ( + onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED + if ort_optim_on + else onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL + ) # suppress ort warnings. # 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2. ort_sess_opt.log_severity_level = 3 - ort_sess = onnxruntime.InferenceSession(model_file_name, sess_options=ort_sess_opt) + ort_sess = onnxruntime.InferenceSession( + model_file_name, sess_options=ort_sess_opt, providers=_ORT_PROVIDERS + ) input_copy = copy.deepcopy(input) ort_outs = run_ort(ort_sess, input_copy) ort_compare_with_pytorch(ort_outs, output, rtol, atol) - - @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because external data format was released with Opset 9. def test_embedding_model_with_external_data(self): class LargeModel(torch.nn.Module): def __init__(self): @@ -392,13 +561,15 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) self.run_model_test_with_external_data(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because external data format was released with Opset 9. def test_large_model_with_external_data(self): class LargeModel(torch.nn.Module): def __init__(self): super(LargeModel, self).__init__() dim = 5 - n = 40 * 4 * 10 ** 6 + n = 40 * 4 * 10**6 self.emb = torch.nn.Embedding(n, dim) self.lin1 = torch.nn.Linear(dim, 1) self.seq = torch.nn.Sequential( @@ -412,13 +583,15 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) self.run_model_test_with_external_data(LargeModel(), x) - @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because external data format was released with Opset 9. 
def test_large_model_with_non_str_file(self): class LargeModel(torch.nn.Module): def __init__(self): super(LargeModel, self).__init__() dim = 5 - n = 40 * 4 * 10 ** 6 + n = 40 * 4 * 10**6 self.emb = torch.nn.Embedding(n, dim) self.lin1 = torch.nn.Linear(dim, 1) self.seq = torch.nn.Sequential( @@ -431,9 +604,11 @@ def forward(self, input): x = torch.tensor([2], dtype=torch.long) f = io.BytesIO() - err_msg = ("The serialized model is larger than the 2GiB limit imposed by the protobuf library. " - "Therefore the output file must be a file path, so that the ONNX external data can be written to " - "the same directory. Please specify the output file name.") + err_msg = ( + "The serialized model is larger than the 2GiB limit imposed by the protobuf library. " + "Therefore the output file must be a file path, so that the ONNX external data can be written to " + "the same directory. Please specify the output file name." + ) with self.assertRaisesRegex(RuntimeError, err_msg): torch.onnx.export(LargeModel(), x, f) @@ -456,7 +631,9 @@ def test_fuse_conv_bn2d(self): class Fuse(torch.nn.Module): def __init__(self): super(Fuse, self).__init__() - self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False) + self.conv = torch.nn.Conv2d( + 3, 2, kernel_size=1, stride=2, padding=3, bias=False + ) self.bn = torch.nn.BatchNorm2d(2) def forward(self, x): @@ -471,7 +648,9 @@ def test_fuse_conv_bn3d(self): class Fuse(torch.nn.Module): def __init__(self): super(Fuse, self).__init__() - self.conv = torch.nn.Conv3d(3, 2, (3, 5, 2), stride=(2, 1, 1), padding=(3, 2, 0), bias=False) + self.conv = torch.nn.Conv3d( + 3, 2, (3, 5, 2), stride=(2, 1, 1), padding=(3, 2, 0), bias=False + ) self.bn = torch.nn.BatchNorm3d(2) def forward(self, x): @@ -492,7 +671,7 @@ def __init__(self): kernel_size=3, stride=1, padding=2, - dilation=1 + dilation=1, ) self.bn = torch.nn.BatchNorm1d(5) @@ -510,9 +689,14 @@ def forward(self, x): model = Fuse() x = torch.randn(2, 5, 9, requires_grad=True) - self.run_test(torch.jit.script(model), (x,), - input_names=['x'], dynamic_axes={'x': [0, 2]}, - rtol=1e-3, atol=1e-6) + self.run_test( + torch.jit.script(model), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 2]}, + rtol=1e-3, + atol=1e-6, + ) def test_conv_tbc(self): from torch.nn.modules.utils import _single @@ -526,9 +710,9 @@ def __init__(self, in_channels, out_channels, kernel_size, padding=0): self.padding = _single(padding) self.weight = torch.nn.Parameter( - torch.Tensor(self.kernel_size[0], in_channels, out_channels) + Tensor(self.kernel_size[0], in_channels, out_channels) ) - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) + self.bias = torch.nn.Parameter(Tensor(out_channels)) self.reset_parameters() def reset_parameters(self): @@ -552,7 +736,9 @@ def forward(self, input): def test_reshape_constant_fold(self): class Reshape(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Reshape, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -572,23 +758,24 @@ def run_word_language_model(self, model_name): tied = False batchsize = 5 if model_name == "GRU": - model = word_language_model.RNNModelWithTensorHidden(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) + model = word_language_model.RNNModelWithTensorHidden( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) elif model_name == "LSTM": - model = word_language_model.RNNModelWithTupleHidden(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) 
+ model = word_language_model.RNNModelWithTupleHidden( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) else: - model = word_language_model.RNNModel(model_name, ntokens, emsize, - nhid, nlayers, dropout, tied, - batchsize) + model = word_language_model.RNNModel( + model_name, ntokens, emsize, nhid, nlayers, dropout, tied, batchsize + ) x = torch.arange(0, ntokens).long().view(-1, batchsize) # Only support CPU version, since tracer is not working in GPU RNN. self.run_test(model, (x, model.hidden)) - def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: + def get_image(self, rel_path: str, size: Tuple[int, int]) -> Tensor: import os + from PIL import Image from torchvision import transforms @@ -598,29 +785,53 @@ def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: return transforms.ToTensor()(image) - def get_test_images(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - return ([self.get_image("grace_hopper_517x606.jpg", (100, 320))], - [self.get_image("rgb_pytorch.png", (250, 380))]) + def get_test_images(self) -> Tuple[List[Tensor], List[Tensor]]: + return ( + [self.get_image("grace_hopper_517x606.jpg", (100, 320))], + [self.get_image("rgb_pytorch.png", (250, 380))], + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Faster RCNN model is not scriptable + @skipScriptTest() # Faster RCNN model is not scriptable def test_faster_rcnn(self): - model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=False, min_size=200, - max_size=300) + model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn( + pretrained=False, pretrained_backbone=True, min_size=200, max_size=300 + ) model.eval() x1 = torch.randn(3, 200, 300, requires_grad=True) x2 = torch.randn(3, 200, 300, requires_grad=True) self.run_test(model, ([x1, x2],), rtol=1e-3, atol=1e-5) - self.run_test(model, ([x1, x2],), input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + ([x1, x2],), + input_names=["images_tensors"], + output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) dummy_image = [torch.ones(3, 100, 100) * 0.3] images, test_images = self.get_test_images() - self.run_test(model, (images,), test_with_inputs=[(images, ), (test_images, ), (dummy_image, )], - input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) - self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image, ), (images, )], - input_names=["images_tensors"], output_names=["outputs"], - dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], + output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) + self.run_test( + model, + (dummy_image,), + test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], + output_names=["outputs"], + dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) def test_paste_mask_in_image(self): masks = torch.rand(10, 1, 26, 26) @@ -629,12 +840,15 @@ def test_paste_mask_in_image(self): boxes *= 50 o_im_s = (100, 100) from 
torchvision.models.detection.roi_heads import paste_masks_in_image + out = paste_masks_in_image(masks, boxes, o_im_s) - jit_trace = torch.jit.trace(paste_masks_in_image, - (masks, boxes, - [torch.tensor(o_im_s[0]), - torch.tensor(o_im_s[1])])) - out_trace = jit_trace(masks, boxes, [torch.tensor(o_im_s[0]), torch.tensor(o_im_s[1])]) + jit_trace = torch.jit.trace( + paste_masks_in_image, + (masks, boxes, [torch.tensor(o_im_s[0]), torch.tensor(o_im_s[1])]), + ) + out_trace = jit_trace( + masks, boxes, [torch.tensor(o_im_s[0]), torch.tensor(o_im_s[1])] + ) assert torch.all(out.eq(out_trace)) @@ -644,35 +858,76 @@ def test_paste_mask_in_image(self): boxes2 *= 100 o_im_s2 = (200, 200) from torchvision.models.detection.roi_heads import paste_masks_in_image + out2 = paste_masks_in_image(masks2, boxes2, o_im_s2) - out_trace2 = jit_trace(masks2, boxes2, [torch.tensor(o_im_s2[0]), torch.tensor(o_im_s2[1])]) + out_trace2 = jit_trace( + masks2, boxes2, [torch.tensor(o_im_s2[0]), torch.tensor(o_im_s2[1])] + ) assert torch.all(out2.eq(out_trace2)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_mask_rcnn(self): - model = torchvision.models.detection.mask_rcnn.maskrcnn_resnet50_fpn(pretrained=False, min_size=200, - max_size=300) + model = torchvision.models.detection.mask_rcnn.maskrcnn_resnet50_fpn( + pretrained=False, pretrained_backbone=True, min_size=200, max_size=300 + ) images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) - self.run_test(model, (images,), input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], - dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], - "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + input_names=["images_tensors"], + output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={ + "images_tensors": [0, 1, 2], + "boxes": [0, 1], + "labels": [0], + "scores": [0], + "masks": [0, 1, 2], + }, + rtol=1e-3, + atol=1e-5, + ) dummy_image = [torch.ones(3, 100, 100) * 0.3] - self.run_test(model, (images,), test_with_inputs=[(images,), (test_images,), (dummy_image,)], - input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], - dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], - "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) - self.run_test(model, (dummy_image,), test_with_inputs=[(dummy_image,), (images,)], - input_names=["images_tensors"], output_names=["boxes", "labels", "scores", "masks"], - dynamic_axes={"images_tensors": [0, 1, 2], "boxes": [0, 1], "labels": [0], - "scores": [0], "masks": [0, 1, 2]}, rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + test_with_inputs=[(images,), (test_images,), (dummy_image,)], + input_names=["images_tensors"], + output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={ + "images_tensors": [0, 1, 2], + "boxes": [0, 1], + "labels": [0], + "scores": [0], + "masks": [0, 1, 2], + }, + rtol=1e-3, + atol=1e-5, + ) + self.run_test( + model, + (dummy_image,), + test_with_inputs=[(dummy_image,), (images,)], + input_names=["images_tensors"], + output_names=["boxes", "labels", "scores", "masks"], + dynamic_axes={ + "images_tensors": [0, 1, 2], + "boxes": [0, 1], + "labels": [0], + "scores": [0], + "masks": [0, 1, 2], + }, + rtol=1e-3, + atol=1e-5, + ) def test_heatmaps_to_keypoints(self): maps = torch.rand(10, 1, 26, 26) rois = torch.rand(10, 4) from 
torchvision.models.detection.roi_heads import heatmaps_to_keypoints + out = heatmaps_to_keypoints(maps, rois) jit_trace = torch.jit.trace(heatmaps_to_keypoints, (maps, rois)) out_trace = jit_trace(maps, rois) @@ -683,6 +938,7 @@ def test_heatmaps_to_keypoints(self): maps2 = torch.rand(20, 2, 21, 21) rois2 = torch.rand(20, 4) from torchvision.models.detection.roi_heads import heatmaps_to_keypoints + out2 = heatmaps_to_keypoints(maps2, rois2) out_trace2 = jit_trace(maps2, rois2) @@ -691,46 +947,125 @@ def test_heatmaps_to_keypoints(self): @unittest.skip("Failing, see https://github.com/pytorch/pytorch/issues/66528") @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_keypoint_rcnn(self): - model = torchvision.models.detection.keypoint_rcnn.keypointrcnn_resnet50_fpn(pretrained=False, min_size=200, - max_size=300) + model = torchvision.models.detection.keypoint_rcnn.keypointrcnn_resnet50_fpn( + pretrained=False, pretrained_backbone=False, min_size=200, max_size=300 + ) images, test_images = self.get_test_images() self.run_test(model, (images,), rtol=1e-3, atol=1e-5) - self.run_test(model, (images,), input_names=["images_tensors"], - output_names=["outputs1", "outputs2", "outputs3", "outputs4"], - dynamic_axes={"images_tensors": [0, 1, 2]}, - rtol=1e-3, atol=1e-5) + self.run_test( + model, + (images,), + input_names=["images_tensors"], + output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=1e-3, + atol=1e-5, + ) dummy_images = [torch.ones(3, 100, 100) * 0.3] - self.run_test(model, (images,), test_with_inputs=[(images, ), (test_images, ), (dummy_images, )], - input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], - dynamic_axes={"images_tensors": [0, 1, 2]}, - rtol=5e-3, atol=1e-5) - self.run_test(model, (dummy_images,), test_with_inputs=[(dummy_images, ), (test_images, )], - input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], - dynamic_axes={"images_tensors": [0, 1, 2]}, - rtol=5e-3, atol=1e-5) + self.run_test( + model, + (images,), + test_with_inputs=[(images,), (test_images,), (dummy_images,)], + input_names=["images_tensors"], + output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, + atol=1e-5, + ) + self.run_test( + model, + (dummy_images,), + test_with_inputs=[(dummy_images,), (test_images,)], + input_names=["images_tensors"], + output_names=["outputs1", "outputs2", "outputs3", "outputs4"], + dynamic_axes={"images_tensors": [0, 1, 2]}, + rtol=5e-3, + atol=1e-5, + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_shufflenet_v2_dynamic_axes(self): model = torchvision.models.shufflenet_v2_x0_5(pretrained=False) dummy_input = torch.randn(1, 3, 224, 224, requires_grad=True) test_inputs = torch.randn(3, 3, 224, 224, requires_grad=True) - self.run_test(model, (dummy_input,), test_with_inputs=[(dummy_input,), (test_inputs,)], - input_names=["input_images"], output_names=["outputs"], - dynamic_axes={"input_images": {0: "batch_size"}, "output": {0: "batch_size"}}, - rtol=1e-3, atol=1e-5) + self.run_test( + model, + (dummy_input,), + test_with_inputs=[(dummy_input,), (test_inputs,)], + input_names=["input_images"], + output_names=["outputs"], + dynamic_axes={ + "input_images": {0: "batch_size"}, + "output": {0: "batch_size"}, + }, + rtol=1e-3, + atol=1e-5, + ) + + @skipScriptTest() + def test_mobilenet_v3(self): 
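# Exercises the float (unquantized) variant of the torchvision quantizable
# model; the quantized path is covered by test_mobilenet_v3_quant below.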
+ model = torchvision.models.quantization.mobilenet_v3_large(pretrained=False) + dummy_input = torch.randn(1, 3, 224, 224) + self.run_test(model, (dummy_input,)) + + @unittest.skip( + "Unstable loading pretrained quantized mobilenet v3: https://github.com/pytorch/vision/issues/5303" + ) + @skipIfUnsupportedMinOpsetVersion(10) + @skipScriptTest() + def test_mobilenet_v3_quant(self): + model = torchvision.models.quantization.mobilenet_v3_large( + pretrained=True, quantize=True + ) + from PIL import Image + from torchvision import transforms + + data_dir = os.path.join(os.path.dirname(__file__), "assets") + path = os.path.join(data_dir, "grace_hopper_517x606.jpg") + input_image = Image.open(path) + # Based on example from https://pytorch.org/hub/pytorch_vision_resnet/ + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + input_tensor = preprocess(input_image).unsqueeze(0) + + # Due to precision error from quantization, check only that the top prediction matches. + class TopPredictor(torch.nn.Module): + def __init__(self, mobilenet): + super().__init__() + self.mobilenet = mobilenet + + def forward(self, x): + x = self.mobilenet(x) + _, topk_catid = torch.topk(x[0], 1) + return topk_catid - @disableScriptTest() + # Currently, we need convert the model to ScriptModule before export. + # The reason is that PackedParams contains int (not tensor). + # Then it fails when the exporter calls _trace_and_get_graph_from_model(). + # TODO: https://msdata.visualstudio.com/Vienna/_workitems/edit/1547858 + model = torch.jit.trace(TopPredictor(model), input_tensor) + self.run_test(model, (input_tensor,)) + + @skipScriptTest() def test_word_language_model_RNN_TANH(self): self.run_word_language_model("RNN_TANH") - @disableScriptTest() + @skipScriptTest() def test_word_language_model_RNN_RELU(self): self.run_word_language_model("RNN_RELU") - @disableScriptTest() # scripting prim::unchecked_cast prim::setattr + @skipScriptTest() # scripting prim::unchecked_cast prim::setattr def test_word_language_model_LSTM(self): self.run_word_language_model("LSTM") @@ -805,34 +1140,36 @@ def forward(self, input): m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), m1) - @disableScriptTest() + @skipScriptTest() def test_dict(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in[list(x_in.keys())[0]], list(x_in.keys())[0]) + x_out["test_key_out"] = torch.add( + x_in[list(x_in.keys())[0]], list(x_in.keys())[0] + ) return x_out - x = {torch.tensor(1.): torch.randn(1, 2, 3)} + x = {torch.tensor(1.0): torch.randn(1, 2, 3)} self.run_test(MyModel(), (x, {})) - @disableScriptTest() + @skipScriptTest() def test_dict_str(self): class MyModel(torch.nn.Module): def forward(self, x_in): x_out = {} - x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.) 
+ x_out["test_key_out"] = torch.add(x_in["test_key_in"], 2.0) return x_out x = {"test_key_in": torch.randn(1, 2, 3)} self.run_test(MyModel(), (x, {})) - @disableScriptTest() # User-defined class not supported + @skipScriptTest() # User-defined class not supported def test_dict_output(self): class DictModelOutput(OrderedDict): - tensor_out: torch.Tensor - tuple_out: Optional[Tuple[torch.Tensor]] = None - list_out: Optional[List[torch.Tensor]] = None + tensor_out: Tensor + tuple_out: Optional[Tuple[Tensor]] = None + list_out: Optional[List[Tensor]] = None class MyModel(torch.nn.Module): def forward(self, a, b, c, d): @@ -872,7 +1209,7 @@ def forward(self, a, b, c, d): def test_tuple_input(self): class TupleModel(torch.nn.Module): - def forward(self, a: Tuple[torch.Tensor, torch.Tensor]): + def forward(self, a: Tuple[Tensor, Tensor]): return a x = (torch.randn(3, 4), torch.randn(4, 3)) @@ -880,7 +1217,7 @@ def forward(self, a: Tuple[torch.Tensor, torch.Tensor]): def test_tuple_primitive_input(self): class TupleModel(torch.nn.Module): - def forward(self, a: Tuple[int, torch.Tensor], b): + def forward(self, a: Tuple[int, Tensor], b): return a[0], a[1] + b x = (3, torch.randn(4, 3)) @@ -889,30 +1226,27 @@ def forward(self, a: Tuple[int, torch.Tensor], b): def test_nested_tuple_input(self): class NestedTupleModel(torch.nn.Module): - def forward(self, a, b: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]): + def forward(self, a, b: Tuple[Tensor, Tuple[Tensor, Tensor]]): return a + b[0] + b[1][0] + b[1][1] x = torch.randn(4, 5) y = (torch.randn(4, 5), (torch.randn(1, 5), torch.randn(4, 1))) self.run_test(NestedTupleModel(), input=(x, y)) - @disableScriptTest() - def test_optional_inputs_with_no_optionals(self): - class NoOptionalModel(torch.nn.Module): + def test_empty_kwargs(self): + class IdentityModel(torch.nn.Module): def forward(self, input): return input - # Without empty optional arguments dictionary - x = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (x,)) - # With empty optional arguments dictionary - y = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (y, {})) + self.run_test(IdentityModel(), (torch.randn(2, 3), {})) - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_optional_inputs_with_mixed_optionals(self): - class MixedModel(torch.nn.Module): - def forward(self, x, y=None, z=None): + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional_default_none(self): + class Model(torch.nn.Module): + def forward( + self, x, y: Optional[Tensor] = None, z: Optional[Tensor] = None + ): if y is not None: return x + y if z is not None: @@ -922,45 +1256,49 @@ def forward(self, x, y=None, z=None): x = torch.randn(2, 3) y = torch.randn(2, 3) z = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(MixedModel(), (x, y, None)) - self.run_test(MixedModel(), (x, None, z)) - # With optional arguments dictionary - self.run_test(MixedModel(), (x, {"y": y, "z": None})) - self.run_test(MixedModel(), (x, {"y": None, "z": z})) - self.run_test(MixedModel(), (x, {"z": z})) - self.run_test(MixedModel(), (x, {"y": y})) - - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_optional_inputs_with_all_optionals(self): - class AllOptionalModel(torch.nn.Module): - def forward(self, y=None, z=None): + model = Model() + # Without kwargs dict. 
+ self.run_test(model, (x, y, None)) + self.run_test(model, (x, None, z)) + # With kwargs dict. + self.run_test(model, (x, {"y": y, "z": None})) + self.run_test(model, (x, {"y": None, "z": z})) + self.run_test(model, (x, {"z": z})) + self.run_test(model, (x, {"y": y})) + + @skipScriptTest() # tracing eliminates None inputs so it works differently. See _script version below. + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional_default_tensor(self): + class Model(torch.nn.Module): + def forward( + self, + x, + y: Optional[Tensor] = torch.ones(2, 3), + z: Optional[Tensor] = torch.zeros(2, 3), + ): if y is not None: - return y + return x + y if z is not None: - return z - - y = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(AllOptionalModel(), (y, None)) - # With optional arguments dictionary - self.run_test(AllOptionalModel(), {"y": y, "z": None}) - - @disableScriptTest() - def test_input_names_with_optional_args(self): - class NoOptionalModel(torch.nn.Module): - def forward(self, input): - return input + return x + z + return x - # Without empty optional arguments dictionary x = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (x,), input_names=["input_x"]) - # With empty optional arguments dictionary y = torch.randn(2, 3) - self.run_test(NoOptionalModel(), (y, {})) + z = torch.randn(2, 3) + model = Model() + + self.run_test(model, (x, y, None)) + self.run_test(model, (x, None, z)) - class MixedModel(torch.nn.Module): - def forward(self, x, y=None, z=None): + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional_default_tensor_script(self): + class Model(torch.nn.Module): + def forward( + self, + x, + y: Optional[Tensor] = torch.ones(2, 3), + z: Optional[Tensor] = torch.zeros(2, 3), + ): if y is not None: return x + y if z is not None: @@ -970,54 +1308,128 @@ def forward(self, x, y=None, z=None): x = torch.randn(2, 3) y = torch.randn(2, 3) z = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(MixedModel(), (x, y, None), input_names=["input_x", "input_y"]) - self.run_test(MixedModel(), (x, None, z), input_names=["input_x", "input_z"]) + model = torch.jit.script(Model()) + + self.run_test(model, (x, y, z), input_names=("x", "y", "z")) + self.run_test(model, (x, {"y": y, "z": z}), input_names=("x", "y", "z")) - # With optional arguments dictionary - self.run_test(MixedModel(), (x, {"y": y, "z": None}), input_names=["input_x", "input_y"]) - self.run_test(MixedModel(), (x, {"y": None, "z": z}), input_names=["input_x", "input_z"]) + # Requires input_names to be set so that we can feed the inputs properly into ORT. + # TODO: Export default values as ONNX initializers, then this should not raise. + # https://msdata.visualstudio.com/Vienna/_workitems/edit/969268 + # Default values are accessible via FunctionSchema. + with self.assertRaisesRegex( + ValueError, "Model requires 3 inputs. Input Feed contains 2" + ): + self.run_test(model, (x, {"y": y}), input_names=("x", "y")) - class AllOptionalModel(torch.nn.Module): - def forward(self, y=None, z=None): + for example_inputs in ( + (x, y, None), + (x, None, z), + (x, {"y": y, "z": None}), + (x, {"y": None, "z": z}), + ): + with self.assertRaisesRegex( + ValueError, "args contained 1 None's after flattening." 
+ ): + self.run_test(model, example_inputs, input_names=("x", "y", "z")) + + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_all_optional_default_none(self): + class Model(torch.nn.Module): + def forward(self, x: Optional[Tensor] = None, y: Optional[Tensor] = None): + if x is not None: + return x if y is not None: return y - if z is not None: - return z + else: + return torch.tensor(-1.0) + + x = torch.randn(2, 3) + model = Model() + self.run_test(model, (x, None)) + self.run_test( + model, + ({"x": x, "y": None},), + # y disappears in tracing. + input_names=("x",), + ) + + @skipScriptTest() # tracing eliminates None inputs so it works differently. See _script version below. + @skipIfUnsupportedMinOpsetVersion(15) + def test_all_optional_default_tensor(self): + class Model(torch.nn.Module): + def forward( + self, + x: Optional[Tensor] = torch.ones(2, 3), + y: Optional[Tensor] = torch.zeros(2, 3), + ): + if x is not None: + return x + elif y is not None: + return y + else: + return torch.tensor(-1.0) + x = torch.randn(2, 3) y = torch.randn(2, 3) - z = torch.randn(2, 3) - # Without optional arguments dictionary - self.run_test(AllOptionalModel(), (y, None), input_names=["input_y"]) - self.run_test(AllOptionalModel(), (None, z), input_names=["input_z"]) - # With optional arguments dictionary - self.run_test(AllOptionalModel(), {"y": y, "z": None}, input_names=["input_y"]) - self.run_test(AllOptionalModel(), {"y": None, "z": z}, input_names=["input_z"]) - - def test_input_as_output(self): + model = Model() + self.run_test(model, (x, None)) + self.run_test(model, (None, y)) + # tracing means y is never used so it's removed from the exported model inputs, + # and we fail when trying to run ORT. + with self.assertRaisesRegex(ValueError, "got too many positional inputs"): + self.run_test(model, (x, y)) + + @skipIfUnsupportedMinOpsetVersion(15) + def test_all_optional_default_tensor_script(self): class Model(torch.nn.Module): - def forward(self, x, y): - return x, y + def forward( + self, + x: Optional[Tensor] = torch.ones(2, 3), + y: Optional[Tensor] = torch.zeros(2, 3), + ): + if x is not None: + return x + elif y is not None: + return y + else: + return torch.tensor(-1.0) x = torch.randn(2, 3) - y = torch.randn(3, 4) - self.run_test(Model(), (x, y), input_names=["x", "y"], output_names=["x_out", "y_out"]) + y = torch.randn(2, 3) + model = torch.jit.script(Model()) + + # TODO: Export default values as ONNX initializers, then this should not raise. + # https://msdata.visualstudio.com/Vienna/_workitems/edit/969268 + # Default values are accessible via FunctionSchema. + with self.assertRaisesRegex( + ValueError, "Model requires 2 inputs. 
Input Feed contains 1" + ): + self.run_test(model, (x,)) + self.run_test(model, ({"y": y},)) + self.run_test(model, (x, y)) + self.run_test(model, ({"x": x, "y": y},), input_names=("x", "y")) - @disableScriptTest() - def test_none_as_input(self): + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_mixed_optional(self): class Model(torch.nn.Module): - def forward(self, x, y): + def forward(self, x, y: Optional[Tensor]): if y is not None: return x + y return x x = torch.randn(2, 3) - self.run_test(Model(), (x, None)) + model = Model() + self.run_test(model, (x, None)) + self.run_test(model, (x, x)) - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_none_as_tuple_input(self): + @skipScriptTest() # Needs https://github.com/pytorch/rfcs/pull/21 + @skipIfUnsupportedMinOpsetVersion(15) + def test_tuple_of_optional(self): class Model(torch.nn.Module): - def forward(self, x, y): + def forward(self, x, y: Tuple[Optional[Tensor], Optional[Tensor]]): if y[0] is not None: return x + y[0] if y[1] is not None: @@ -1025,28 +1437,67 @@ def forward(self, x, y): return x x = torch.randn(2, 3) - y = torch.randn(2, 3) - self.run_test(Model(), (x, (None, y))) + y1 = torch.randn(2, 3) + self.run_test(Model(), (x, (None, y1))) - @disableScriptTest() # ScriptModule could not be exported without the Input Descriptor for optional inputs - def test_none_as_named_input(self): + @skipScriptTest() # tracing eliminates None inputs so it works differently. See _script version below. + @skipIfUnsupportedMinOpsetVersion(15) + def test_tuple_of_optional_default_tensor(self): class Model(torch.nn.Module): - def forward(self, x, y=None, z=None): - if y is not None: - return x + y - if z is not None: - return x + z + def forward( + self, + x, + y: Tuple[Optional[Tensor], Optional[Tensor]] = ( + torch.zeros(2, 3), + torch.zeros(2, 3), + ), + ): + y0, y1 = y + if y0 is not None: + return x + y0 + if y1 is not None: + return x + y1 return x x = torch.randn(2, 3) - z = torch.randn(2, 3) - self.run_test(Model(), (x, None, z)) + y1 = torch.randn(2, 3) + self.run_test(Model(), (x, (None, y1))) - def test_primitive_input_integer(self): + @skipIfUnsupportedMinOpsetVersion(15) + def test_tuple_of_optional_default_tensor_script(self): class Model(torch.nn.Module): - def __init__(self): - super().__init__() + def forward( + self, + x, + y: Tuple[Optional[Tensor], Optional[Tensor]] = ( + torch.zeros(2, 3), + torch.zeros(2, 3), + ), + ): + y0, y1 = y + if y0 is not None: + return x + y0 + if y1 is not None: + return x + y1 + return x + + x = torch.randn(2, 3) + y0 = torch.randn(2, 3) + y1 = torch.randn(2, 3) + model = torch.jit.script(Model()) + with self.assertRaisesRegex( + ValueError, "args contained 1 None's after flattening." + ): + self.run_test(model, (x, (None, y1))) + self.run_test(model, (x, (y0, y1))) + # export succeeds, but running ORT through run_test would fail because the exported model + # has the inputs flattened into 3 inputs. 
+ torch.onnx.export( + model, (x, {"y": (y0, y1)}), io.BytesIO(), opset_version=self.opset_version + ) + def test_primitive_input_integer(self): + class Model(torch.nn.Module): def forward(self, x: int, y): return x + y @@ -1087,7 +1538,9 @@ def test_cste_script(self): class MyModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.zeros(x.size(0)), torch.ones((x.size(1), x.size(0)), dtype=torch.int64) + return torch.zeros(x.size(0)), torch.ones( + (x.size(1), x.size(0)), dtype=torch.int64 + ) x = torch.randn(3, 4) self.run_test(MyModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) @@ -1096,15 +1549,20 @@ def forward(self, x): def test_scalar_tensor(self): class test(torch.nn.Module): def forward(self, input): - return torch.scalar_tensor(input.size(0)), \ - torch.scalar_tensor(input.size(1), dtype=torch.int64) + return torch.scalar_tensor(input.size(0)), torch.scalar_tensor( + input.size(1), dtype=torch.int64 + ) x = torch.randn(2, 3, 4) y = torch.randn(7, 8, 9) model = test() - self.run_test(model, x, test_with_inputs=[y], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2]}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2]}, + ) def test_tensor(self): class ScalarInputModel(torch.jit.ScriptModule): @@ -1113,7 +1571,9 @@ def forward(self, input): return torch.tensor(input.shape[1]) x = torch.randn(3, 4) - self.run_test(ScalarInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test( + ScalarInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]} + ) self.run_test(ScalarInputModel(), x, remained_onnx_input_idx=[]) class TensorInputModel(torch.jit.ScriptModule): @@ -1122,7 +1582,9 @@ def forward(self, input): return torch.tensor([input.shape[0], input.shape[1]]) x = torch.randn(3, 4) - self.run_test(TensorInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test( + TensorInputModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]} + ) self.run_test(TensorInputModel(), x, remained_onnx_input_idx=[]) class FloatInputModel(torch.jit.ScriptModule): @@ -1139,7 +1601,9 @@ def forward(self, input): return torch.tensor(input.shape[1], dtype=torch.long) x = torch.randn(3, 4) - self.run_test(InputWithDtypeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test( + InputWithDtypeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1]} + ) self.run_test(InputWithDtypeModel(), x, remained_onnx_input_idx=[]) class MixedInputModel(torch.jit.ScriptModule): @@ -1261,7 +1725,6 @@ def forward(self, x): x = torch.arange(16).view(4, 4).float() self.run_test(ClampMaxModel(), x) - class ClampMinModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): @@ -1336,8 +1799,12 @@ class TraceModel(torch.nn.Module): def __init__(self): super(TraceModel, self).__init__() self.conv1 = torch.nn.Conv1d(16, 33, 3, stride=2) - self.conv2 = torch.nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) - self.conv3 = torch.nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) + self.conv2 = torch.nn.Conv2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1) + ) + self.conv3 = torch.nn.Conv3d( + 16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0) + ) def forward(self, input1, input2, input3): return self.conv1(input1), self.conv2(input2), self.conv3(input3) @@ -1352,23 +1819,29 @@ def test_conv_shape_inference(self): class Model(torch.nn.Module): def __init__(self): 
super(Model, self).__init__() - self.conv2 = torch.nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) + self.conv2 = torch.nn.Conv2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1) + ) def forward(self, input): return self.conv2(input) + 2 x = torch.randn(20, 16, 50, 100) - self.run_test(Model(), x, atol=10e-5, - input_names=["x"], - dynamic_axes={"x": [0]}) + self.run_test( + Model(), x, atol=10e-5, input_names=["x"], dynamic_axes={"x": [0]} + ) def test_conv_transpose(self): class TraceModel(torch.nn.Module): def __init__(self): super(TraceModel, self).__init__() self.conv1 = torch.nn.ConvTranspose1d(16, 33, 3, stride=2) - self.conv2 = torch.nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) - self.conv3 = torch.nn.ConvTranspose3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0)) + self.conv2 = torch.nn.ConvTranspose2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1) + ) + self.conv3 = torch.nn.ConvTranspose3d( + 16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0) + ) def forward(self, input1, input2, input3): return self.conv1(input1), self.conv2(input2), self.conv3(input3) @@ -1381,7 +1854,6 @@ def forward(self, input1, input2, input3): # Conversion of Transpose depends on input shape to be known. # The following test only works when onnx shape inference is enabled. - @skipIfONNXShapeInference(False) def test_transpose_infer_shape(self): class TransposeModule(torch.jit.ScriptModule): def __init__(self): @@ -1395,9 +1867,13 @@ def forward(self, x): x = torch.randn(32, 3, 64, 64) y = torch.randn(16, 3, 8, 64) - self.run_test(TransposeModule(), x, input_names=["x"], - dynamic_axes={"x": [0, 2]}, - test_with_inputs=[y]) + self.run_test( + TransposeModule(), + x, + input_names=["x"], + dynamic_axes={"x": [0, 2]}, + test_with_inputs=[y], + ) def squeeze_model_tests(self, d, x1, x2): class Squeeze(torch.nn.Module): @@ -1413,9 +1889,13 @@ def forward(self, x): x2 = [] if x2 is None else [x2] if len(x2) > 0: - self.run_test(Squeeze(d), x1, - input_names=["input"], dynamic_axes={"input": {0: "0", 1: "1", 2: "2"}}, - test_with_inputs=x2) + self.run_test( + Squeeze(d), + x1, + input_names=["input"], + dynamic_axes={"input": {0: "0", 1: "1", 2: "2"}}, + test_with_inputs=x2, + ) else: self.run_test(Squeeze(d), x1) @@ -1471,6 +1951,16 @@ def forward(self, x): x = torch.randn(2, 1, 4) self.run_test(Squeeze(), x) + @skipIfUnsupportedMinOpsetVersion(13) + def test_squeeze_dynamic_dim(self): + class Squeeze(torch.nn.Module): + def forward(self, x, dim: int): + return torch.squeeze(x, dim) + + x = torch.randn(2, 1, 4) + dim = 1 + self.run_test(Squeeze(), (x, dim)) + def test_unsqueeze(self): class Unsqueeze(torch.nn.Module): def forward(self, x): @@ -1479,6 +1969,16 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Unsqueeze(), x) + @skipIfUnsupportedMinOpsetVersion(13) + def test_unsqueeze_dynamic_dim(self): + class Unsqueeze(torch.nn.Module): + def forward(self, x, dim: int): + return torch.unsqueeze(x, dim) + + x = torch.randn(2, 1, 4) + dim = -1 + self.run_test(Unsqueeze(), (x, dim)) + def test_maxpool_default_stride(self): class MaxPoolModel(torch.nn.Module): def forward(self, x): @@ -1493,9 +1993,9 @@ def test_maxpool_adaptive(self): model = torch.nn.AdaptiveMaxPool1d((5), return_indices=False) x = torch.randn(20, 16, 50, requires_grad=True) y = torch.randn(32, 16, 50, requires_grad=True) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0]}, - test_with_inputs=[y]) + 
self.run_test( + model, x, input_names=["x"], dynamic_axes={"x": [0]}, test_with_inputs=[y] + ) def test_maxpool_2d(self): model = torch.nn.MaxPool2d(5, padding=(1, 2)) @@ -1557,9 +2057,13 @@ def test_avgpool_3d_ceil(self): model = torch.nn.AvgPool3d(3, 2, ceil_mode=True) x = torch.randn(20, 16, 50, 44, 31) y = torch.randn(32, 8, 50, 44, 31) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0, 1]}, - test_with_inputs=[y]) + self.run_test( + model, + x, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + test_with_inputs=[y], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_floating_point(self): @@ -1571,7 +2075,9 @@ def forward(self, x): return x.new_zeros(x.shape) x = torch.randn(2, 3, 4) - self.run_test(FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(FloatingPoint(), x, remained_onnx_input_idx=[]) class FloatingPoint(torch.jit.ScriptModule): @@ -1589,7 +2095,6 @@ def forward(self, x): # Operator rank mismatch between outputs of two branches for opsets below 11. @skipIfUnsupportedMinOpsetVersion(11) - @skipIfONNXShapeInference(False) def test_floating_point_infer_dtype(self): class FloatingPoint(torch.jit.ScriptModule): @torch.jit.script_method @@ -1602,7 +2107,9 @@ def forward(self, x): return x x = torch.randn(2, 3, 4) - self.run_test(FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + FloatingPoint(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(FloatingPoint(), x, remained_onnx_input_idx=[]) class FloatingPoint(torch.jit.ScriptModule): @@ -1621,9 +2128,11 @@ def forward(self, x): @skipIfUnsupportedMinOpsetVersion(12) def test_prim_min(self): @torch.jit.script - def list_append(boxes: List[torch.Tensor]): + def list_append(boxes: List[Tensor]): temp = [] - for i, b in enumerate(boxes): # enumerate is creating a prim::min op in torch graph + for i, b in enumerate( + boxes + ): # enumerate is creating a prim::min op in torch graph temp.append(torch.full_like(b[:, 1], i)) return temp[0] @@ -1700,7 +2209,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(ArithmeticModule(), x, remained_onnx_input_idx=[]) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_arithmetic_prim_bool(self): class ArithmeticModule(torch.nn.Module): def forward(self, x, y: int, z: bool, t: float): @@ -1725,19 +2233,21 @@ def forward(self, x: int, y: int): y = 2 self.run_test(ArithmeticModule(), (x, y)) - @disableScriptTest() + # In tracing, None outputs are removed. In scripting they're kept but + # we don't know Optional.elem_type, so we can't construct a valid Optional. + # Tests for Optional outputs (control flow with None in one branch, + # not-None in another) are in test_pytorch_onnx_no_runtime.py. + @skipScriptTest() def test_tuple_with_none_outputs(self): class TupleModel(torch.nn.Module): def forward(self, x): - l = (x, None, (x, None)) - return (x, l) + return (x, (x, None, (x, None))) x = torch.randn(3, 4) self.run_test(TupleModel(), (x,)) # In scripting the first transpose node do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. 
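# The two comments above describe a general property of exporting scripted
# modules: intermediate values (here, the output of the first transpose) carry
# no static shape or dtype in the TorchScript graph, and ONNX shape inference
# reconstructs that information after export. A standalone sketch of the idea,
# using an illustrative module that is not part of this test suite:
import io

import onnx
import torch


class TransposeAdd(torch.nn.Module):
    def forward(self, x):
        # In the scripted graph the transpose output has no recorded shape/dtype.
        return x.transpose(0, 1) + 1.0


buffer = io.BytesIO()
torch.onnx.export(torch.jit.script(TransposeAdd()), torch.randn(2, 3), buffer)
model_proto = onnx.load_model_from_string(buffer.getvalue())
# ONNX shape inference fills in value_info for the intermediate Transpose output.
inferred = onnx.shape_inference.infer_shapes(model_proto)
print([value_info.name for value_info in inferred.graph.value_info])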
- @skipIfONNXShapeInference(False) def test_arithmetic_infer_dtype(self): class ArithmeticModule(torch.jit.ScriptModule): @torch.jit.script_method @@ -1755,12 +2265,20 @@ def forward(self, x): def test_floor_div(self): class FloorDivModule(torch.nn.Module): def forward(self, x, y): - return x // 3, x // 2., \ - x.to(dtype=torch.float64) // 3, x.to(dtype=torch.float64) // 2., \ - x.to(dtype=torch.int64) // 3, x.to(dtype=torch.int64) // 2., \ - x // (y + 1.).to(dtype=torch.int64), x // y, \ - x.to(dtype=torch.float64) // y.to(dtype=torch.int64), x.to(dtype=torch.float64) // y.to(dtype=torch.float64), \ - x.to(dtype=torch.int64) // y.to(dtype=torch.int64), x.to(dtype=torch.int64) // y + return ( + x // 3, + x // 2.0, + x.to(dtype=torch.float64) // 3, + x.to(dtype=torch.float64) // 2.0, + x.to(dtype=torch.int64) // 3, + x.to(dtype=torch.int64) // 2.0, + x // (y + 1.0).to(dtype=torch.int64), + x // y, + x.to(dtype=torch.float64) // y.to(dtype=torch.int64), + x.to(dtype=torch.float64) // y.to(dtype=torch.float64), + x.to(dtype=torch.int64) // y.to(dtype=torch.int64), + x.to(dtype=torch.int64) // y, + ) x = torch.arange(-2, 4).reshape(2, 3, 1) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4) @@ -1770,7 +2288,7 @@ def test_floor_div_script(self): class FloorDivModule(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x, y): - return x // 3, x // 2., x // y + return x // 3, x // 2.0, x // y x = torch.arange(-2, 4).reshape(2, 3, 1) y = torch.randn(2, 3, 4) @@ -1783,7 +2301,9 @@ def forward(self, x): return x.new_zeros(x.size(2) // x.size(1)) x = torch.randn(2, 3, 4) - self.run_test(FloordivModule(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + FloordivModule(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(FloordivModule(), (x,), remained_onnx_input_idx=[]) def test_div(self): @@ -1820,7 +2340,6 @@ def forward(self, x, y): # In scripting x, y do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. 
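# As background for the division tests in this area: torch.div with the default
# rounding_mode performs true division, so integer inputs are promoted to the
# default floating dtype, while rounding_mode="floor" keeps integer inputs
# integral; the exporter has to reproduce that promotion once shape inference
# has recovered the missing dtype information. A small eager-mode illustration
# (the tensors below are arbitrary examples, not taken from the test suite):
import torch

a = torch.tensor([3, 7], dtype=torch.int64)
b = torch.tensor([2, 2], dtype=torch.int64)

true_quotient = torch.div(a, b)  # promoted to torch.float32: tensor([1.5000, 3.5000])
floor_quotient = torch.div(a, b, rounding_mode="floor")  # stays torch.int64: tensor([1, 3])

print(true_quotient.dtype, floor_quotient.dtype)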
- @skipIfONNXShapeInference(False) def test_div_promotion_script(self): class DivModule(torch.nn.Module): def forward(self, x, y): @@ -1857,18 +2376,24 @@ def forward(self, x, y): def test_div_rounding_mode(self): class TrueDivModule(torch.nn.Module): def forward(self, x, y): - return (x.div(y, rounding_mode=None), - torch.div(x, y, rounding_mode=None)) + return ( + x.div(y, rounding_mode=None), + torch.div(x, y, rounding_mode=None), + ) class TruncDivModule(torch.nn.Module): def forward(self, x, y): - return (x.div(y, rounding_mode="trunc"), - torch.div(x, y, rounding_mode="trunc")) + return ( + x.div(y, rounding_mode="trunc"), + torch.div(x, y, rounding_mode="trunc"), + ) class FloorDivModule(torch.nn.Module): def forward(self, x, y): - return (x.div(y, rounding_mode="floor"), - torch.div(x, y, rounding_mode="floor")) + return ( + x.div(y, rounding_mode="floor"), + torch.div(x, y, rounding_mode="floor"), + ) modules = [TrueDivModule(), TruncDivModule(), FloorDivModule()] @@ -1924,7 +2449,7 @@ def forward(self, x): def test_slice_with_input_index(self): class InputIndexSlice(torch.nn.Module): def forward(self, x, y): - x[:y.size(0), 0, :] = y + x[: y.size(0), 0, :] = y return x x = torch.zeros((56, 6, 256)) @@ -1932,29 +2457,32 @@ def forward(self, x, y): self.run_test(InputIndexSlice(), (x, y)) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() # scripting tuple/list append + @skipScriptTest() # scripting tuple/list append def test_slice_dynamic(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): results = [] for i in range(4): - results.append(x[:x.size(0) - i, i:x.size(2), i:3]) + results.append(x[: x.size(0) - i, i : x.size(2), i:3]) return tuple(results) x = torch.rand(5, 5, 5) y = torch.randn(6, 7, 8) - self.run_test(DynamicSliceExportMod(), x, test_with_inputs=[y], - input_names=["input_1"], - output_names=["output_1"], - dynamic_axes={"input_1": [0, 1, 2], - "output_1": [0, 1, 2]}) + self.run_test( + DynamicSliceExportMod(), + x, + test_with_inputs=[y], + input_names=["input_1"], + output_names=["output_1"], + dynamic_axes={"input_1": [0, 1, 2], "output_1": [0, 1, 2]}, + ) @skipIfUnsupportedMinOpsetVersion(10) def test_slice_dynamic_script(self): class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return x[1:x.size(1)] + return x[1 : x.size(1)] x = torch.rand(1, 2) self.run_test(DynamicSliceModel(), x) @@ -1963,14 +2491,16 @@ def forward(self, x): def test_slice_dynamic_shape_script(self): class DynamicSliceModel(torch.nn.Module): def forward(self, x): - return x.new_zeros(x.shape[1:x.size(2)]) + return x.new_zeros(x.shape[1 : x.size(2)]) x = torch.rand(1, 2, 3, 4) - self.run_test(DynamicSliceModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + self.run_test( + DynamicSliceModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]} + ) self.run_test(DynamicSliceModel(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() # scripting tuple/list append + @skipScriptTest() # scripting tuple/list append def test_slice_dynamic_to_end(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): @@ -1980,9 +2510,11 @@ def forward(self, x): return tuple(results) x = torch.rand(5, 5, 5) - self.run_test(DynamicSliceExportMod(), x, - dynamic_axes={"input_1": [0, 1, 2], - "output_1": [0, 1, 2]}) + self.run_test( + DynamicSliceExportMod(), + x, + dynamic_axes={"input_1": [0, 1, 2], "output_1": [0, 1, 2]}, + ) def test_square(self): class 
Square(torch.nn.Module): @@ -1996,22 +2528,30 @@ def forward(self, x): def test_arange_dynamic(self): class ArangeModel(torch.nn.Module): def forward(self, input): - return torch.arange(input.shape[0]), \ - torch.arange(12), \ - torch.arange(start=input.shape[0], end=input.shape[0] + 5) + return ( + torch.arange(input.shape[0]), + torch.arange(12), + torch.arange(start=input.shape[0], end=input.shape[0] + 5), + ) x = torch.randn(5, 3, 2) y = torch.randn(8, 3, 2) - self.run_test(ArangeModel(), x, test_with_inputs=[y], - input_names=["input_1"], - output_names=["output_1", "output_2", "output_3"], - dynamic_axes={"input_1": [0], - "output_1": [0]}) - self.run_test(torch.jit.script(ArangeModel()), x, - test_with_inputs=[y], input_names=["input_1"], - output_names=["output_1", "output_2", "output_3"], - dynamic_axes={"input_1": [0], - "output_1": [0]}) + self.run_test( + ArangeModel(), + x, + test_with_inputs=[y], + input_names=["input_1"], + output_names=["output_1", "output_2", "output_3"], + dynamic_axes={"input_1": [0], "output_1": [0]}, + ) + self.run_test( + torch.jit.script(ArangeModel()), + x, + test_with_inputs=[y], + input_names=["input_1"], + output_names=["output_1", "output_2", "output_3"], + dynamic_axes={"input_1": [0], "output_1": [0]}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_dynamic_arange_out(self): @@ -2032,8 +2572,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8) - self.run_test(ArangeStartOutModel(), (x, y), - input_names=["x", "y"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeStartOutModel(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeStartOutModel(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(9) @@ -2047,6 +2591,17 @@ def forward(self, start, end, steps): z = torch.tensor(5, dtype=torch.int) self.run_test(LinspaceModel(), (x, y, z)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_linspace_negative_start(self): + class LinspaceModel(torch.nn.Module): + def forward(self, start, end, steps): + return torch.linspace(start, end, steps) + + x = torch.tensor(-1, dtype=torch.float) + y = torch.tensor(1, dtype=torch.float) + z = torch.tensor(6, dtype=torch.int) + self.run_test(LinspaceModel(), (x, y, z)) + @skipIfUnsupportedMinOpsetVersion(9) def test_arange_with_floats_out(self): class ArangeModelEnd(torch.nn.Module): @@ -2064,8 +2619,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStep(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStep(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStep(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(9) @@ -2083,8 +2642,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStep(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStep(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStep(), (x, y), remained_onnx_input_idx=[1]) class ArangeModelStepNeg(torch.nn.Module): @@ -2093,8 +2656,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStepNeg(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStepNeg(), + (x, 
y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStepNeg(), (x, y), remained_onnx_input_idx=[1]) class ArangeModelStart(torch.nn.Module): @@ -2103,8 +2670,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStart(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStart(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStart(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(9) @@ -2122,8 +2693,12 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeModelStep(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModelStep(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeModelStep(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(11) @@ -2145,16 +2720,19 @@ def forward(self, start, end): x = torch.randn(2, 3, 4) y = torch.tensor(8.5, dtype=torch.float) - self.run_test(ArangeStartOutModel(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeStartOutModel(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2]}, + ) self.run_test(ArangeStartOutModel(), (x, y), remained_onnx_input_idx=[1]) @skipIfUnsupportedMinOpsetVersion(11) def test_arange_no_type(self): class ArangeModel(torch.nn.Module): def forward(self, end): - return torch.arange(end), \ - torch.arange(0, end) + return torch.arange(end), torch.arange(0, end) x = torch.tensor(6.2, dtype=torch.float) self.run_test(ArangeModel(), x) @@ -2163,14 +2741,18 @@ def forward(self, end): def test_size(self): class SizeModel(torch.nn.Module): def forward(self, input): - return torch.arange(input.size(0)), torch.arange(input.size(-1)), torch.ones(input.shape) + return ( + torch.arange(input.size(0)), + torch.arange(input.size(-1)), + torch.ones(input.shape), + ) x = torch.randn(5, 3, 2) self.run_test(SizeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) self.run_test(SizeModel(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # x.stride() not scriptable + @skipScriptTest() # x.stride() not scriptable def test_as_strided(self): class Model(torch.nn.Module): def forward(self, x): @@ -2178,12 +2760,14 @@ def forward(self, x): chunk_size[1] = chunk_size[1] * 2 - 1 chunk_stride = list(x.stride()) chunk_stride[1] = chunk_stride[1] // 2 - return x.as_strided((3, 3, 3), (1, 4, 2), storage_offset=2), x.as_strided(chunk_size, chunk_stride) + return x.as_strided( + (3, 3, 3), (1, 4, 2), storage_offset=2 + ), x.as_strided(chunk_size, chunk_stride) x = torch.randn(5, 8, 7) self.run_test(Model(), x) - @disableScriptTest() # Ellipses followed by tensor indexing not scriptable + @skipScriptTest() # Ellipses followed by tensor indexing not scriptable def test_tensor_index_advanced_indexing_ellipsis(self): class MyModel(torch.nn.Module): def forward(self, input): @@ -2195,27 +2779,43 @@ def forward(self, input): def test_tensor_index_advanced_indexing(self): class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])] + return input[ + :, + torch.tensor([[0, 2], [1, 1]]), + :, + torch.tensor([2, 1]), + torch.tensor([0, 3]), + ] m1 = torch.randn(3, 4, 5, 
6, 7) self.run_test(MyModel(), (m1,)) class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])] + return input[ + :, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]]) + ] self.run_test(MyModel(), (m1,)) class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])] + return input[ + :, + torch.tensor([0, 2]), + torch.tensor([1]), + 2:4, + torch.tensor([[1], [4]]), + ] self.run_test(MyModel(), (m1,)) def test_tensor_index_advanced_indexing_consecutive(self): class MyModel(torch.nn.Module): def forward(self, input): - return input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None] + return input[ + :, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None + ] m1 = torch.randn(3, 4, 5, 6, 7) self.run_test(MyModel(), (m1,)) @@ -2256,7 +2856,7 @@ def forward(self, mask, indices): def test_index_put_accumulate(self): class IndexPutModel(torch.nn.Module): def forward(self, x, ind, update): - return x.index_put((ind, ), update, accumulate=True) + return x.index_put((ind,), update, accumulate=True) x = torch.randn(3, 4) ind = torch.tensor([2], dtype=torch.long) @@ -2359,7 +2959,7 @@ def forward(self, x, ind, update): self.run_test(IndexPutModel10(), (x, ind, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Ellipses followed by tensor indexing not scriptable + @skipScriptTest() # Ellipses followed by tensor indexing not scriptable def test_index_put_ellipsis(self): class IndexPutModel(torch.nn.Module): def forward(self, x, update): @@ -2382,8 +2982,12 @@ def forward(self, x, update): @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_loop(self): @torch.jit.script - def ngram_attention_bias(sequence_length: int, ngram: int, device: torch.device, dtype: torch.dtype): - bias = torch.ones((ngram, sequence_length), device=device, dtype=dtype) * float("-inf") + def ngram_attention_bias( + sequence_length: int, ngram: int, device: torch.device, dtype: torch.dtype + ): + bias = torch.ones( + (ngram, sequence_length), device=device, dtype=dtype + ) * float("-inf") for stream_idx in range(ngram): for i in range(sequence_length): bias = bias * 2 @@ -2406,15 +3010,23 @@ def __init__(self): def forward(self, hidden_states): seq_length, batch_size = hidden_states.shape[:2] predict_causal_mask = ngram_attention_bias( - self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype + self.max_target_positions, + self.ngram, + hidden_states.device, + hidden_states.dtype, ) predict_causal_mask = predict_causal_mask[:, :seq_length] return predict_causal_mask x = torch.randn(6, 2) y = torch.randn(4, 1) - self.run_test(ScriptModel(), x, input_names=["x"], - dynamic_axes={"x": {0: "seq_length", 1: "batch_size"}}, test_with_inputs=[y]) + self.run_test( + ScriptModel(), + x, + input_names=["x"], + dynamic_axes={"x": {0: "seq_length", 1: "batch_size"}}, + test_with_inputs=[y], + ) @skipIfUnsupportedMinOpsetVersion(11) def test_copy_(self): @@ -2479,7 +3091,7 @@ def forward(self, x, mask): self.run_test(CopyModel5(), (x, mask)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) + @skipScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) def test_copy_tracing(self): class CopyModel(torch.nn.Module): def forward(self, x, data): @@ -2580,14 +3192,18 @@ def 
forward(self, x): def test_random_like_dtype(self): class RandNLike(torch.nn.Module): def forward(self, x): - return torch.mul(x.to(torch.double), torch.randn_like(x, dtype=torch.double).size(0)) + return torch.mul( + x.to(torch.double), torch.randn_like(x, dtype=torch.double).size(0) + ) x = torch.randn(2, 3, 4) self.run_test(RandNLike(), x) class RandLike(torch.nn.Module): def forward(self, x): - return torch.mul(x.to(torch.double), torch.rand_like(x, dtype=torch.double).size(0)) + return torch.mul( + x.to(torch.double), torch.rand_like(x, dtype=torch.double).size(0) + ) x = torch.randn(2, 3, 4) self.run_test(RandLike(), x) @@ -2603,8 +3219,7 @@ def forward(self, x): x = torch.empty(2, 3, 3, dtype=torch.double).uniform_(0, 1) self.run_test(Bernoulli(), x) - # Enable test when fix for allowzero is in ORT - @skipForAllOpsetVersions() + @unittest.skip("Bug in ORT, skip test until rel-1.11.") @skipIfUnsupportedMinOpsetVersion(14) def test_reshape_allowzero(self): class ReshapeModel(torch.nn.Module): @@ -2626,7 +3241,16 @@ def forward(self, x): def _interpolate(self, x, mode, use_size, is_upsample, align_corners=False): class MyModel(torch.nn.Module): - __constants__ = ["mode", "use_size", "is_upsample", "size", "scale", "size_array", "scale_array", "align_corners"] + __constants__ = [ + "mode", + "use_size", + "is_upsample", + "size", + "scale", + "size_array", + "scale_array", + "align_corners", + ] def __init__(self, mode, use_size, is_upsample, align_corners): super(MyModel, self).__init__() @@ -2649,19 +3273,39 @@ def __init__(self, mode, use_size, is_upsample, align_corners): def forward(self, x): if self.use_size: if self.align_corners: - return torch.nn.functional.interpolate(x, mode=self.mode, size=self.size, align_corners=True), \ - torch.nn.functional.interpolate(x, mode=self.mode, size=self.size_array, align_corners=True) - return torch.nn.functional.interpolate(x, mode=self.mode, size=self.size), \ - torch.nn.functional.interpolate(x, mode=self.mode, size=self.size_array) + return torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size, align_corners=True + ), torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size_array, align_corners=True + ) + return torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size + ), torch.nn.functional.interpolate( + x, mode=self.mode, size=self.size_array + ) if self.align_corners: - return torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale, recompute_scale_factor=False), \ - torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale_array, recompute_scale_factor=False) - return torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale, recompute_scale_factor=False), \ - torch.nn.functional.interpolate(x, mode=self.mode, - scale_factor=self.scale_array, recompute_scale_factor=False) + return torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale, + recompute_scale_factor=False, + ), torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale_array, + recompute_scale_factor=False, + ) + return torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale, + recompute_scale_factor=False, + ), torch.nn.functional.interpolate( + x, + mode=self.mode, + scale_factor=self.scale_array, + recompute_scale_factor=False, + ) model = MyModel(mode, use_size, is_upsample, align_corners) self.run_test(model, x, atol=1e-6) @@ -2672,9 +3316,11 @@ def _interpolate_tests(self, is_upsample): modes = 
["nearest", "linear", "bicubic"] if self.opset_version < 11: modes = ["nearest"] - x = [torch.randn(1, 2, 6, requires_grad=True), - torch.randn(1, 2, 4, 6, requires_grad=True), - torch.randn(1, 2, 4, 4, 6, requires_grad=True)] + x = [ + torch.randn(1, 2, 6, requires_grad=True), + torch.randn(1, 2, 4, 6, requires_grad=True), + torch.randn(1, 2, 4, 4, 6, requires_grad=True), + ] for mode in modes: for xi in x: @@ -2711,7 +3357,7 @@ def test_interpolate_upsample(self): self._interpolate_tests(True) @skipIfUnsupportedMaxOpsetVersion(8) - @disableScriptTest() # Scripting supported for opsets > 8. See test_interpolate_upsample + @skipScriptTest() # Scripting supported for opsets > 8. See test_interpolate_upsample def test_interpolate_upsample_trace(self): self._interpolate_tests(True) @@ -2720,7 +3366,9 @@ def test_interpolate_function_substitution(self): class ScriptModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.nn.functional.interpolate(x, mode="nearest", scale_factor=2.) + return torch.nn.functional.interpolate( + x, mode="nearest", scale_factor=2.0 + ) class ScriptModule(torch.jit.ScriptModule): def __init__(self): @@ -2736,7 +3384,7 @@ def forward(self, input): @torch.jit.script def script_method(x): - return torch.nn.functional.interpolate(x, mode="nearest", scale_factor=2.) + return torch.nn.functional.interpolate(x, mode="nearest", scale_factor=2.0) class TracingModule(torch.nn.Module): def forward(self, x): @@ -2754,16 +3402,25 @@ class MyModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x, y): x = torch.add(x, x) - out1 = torch.nn.functional.interpolate(x, mode="bilinear", size=(16, 16), align_corners=False) - out2 = torch.nn.functional.interpolate(x, mode="nearest", size=(int(y.size(0)), int(y.size(1)))) + out1 = torch.nn.functional.interpolate( + x, mode="bilinear", size=(16, 16), align_corners=False + ) + out2 = torch.nn.functional.interpolate( + x, mode="nearest", size=(int(y.size(0)), int(y.size(1))) + ) return out1, out2 x = torch.randn(1, 2, 4, 4, requires_grad=True) y = torch.randn(16, 16, requires_grad=True) - self.run_test(MyModel(), (x, y), input_names=["x", "y"], dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1]}) + self.run_test( + MyModel(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1]}, + ) self.run_test(MyModel(), (x, y), remained_onnx_input_idx=[0]) - @disableScriptTest() # scripting throws the ONNXRuntimeError + @skipScriptTest() # scripting raises OnnxRuntimeError def test_interpolate_adaptive_pooling_error(self): x = torch.randn(1, 2, 6, requires_grad=True) with self.assertRaises(RuntimeError) as cm: @@ -2772,7 +3429,6 @@ def test_interpolate_adaptive_pooling_error(self): with self.assertRaises(RuntimeError) as cm: self._interpolate(x, "area", False, True) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_groupnorm(self): model = torch.nn.GroupNorm(3, 6, 0.002) x = torch.randn(4, 6, 180, 180, 180) @@ -2786,7 +3442,6 @@ def test_groupnorm(self): x = torch.randn(4, 6, 180, 180) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_groupnorm_noaffine(self): model = torch.nn.GroupNorm(4, 8, 0.002, affine=False) x = torch.randn(3, 8, 224, 224) @@ -2819,7 +3474,9 @@ def forward(self, x): return x.new_zeros((a, b)) x = torch.randn(2, 3, 4, 5) - self.run_test(ListUnpackSlice(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + self.run_test( 
+ ListUnpackSlice(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]} + ) self.run_test(ListUnpackSlice(), x, remained_onnx_input_idx=[]) def test_pow(self): @@ -2969,7 +3626,8 @@ def forward(self, x): y = torch.zeros(()) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Zeros(), x) class Ones(torch.nn.Module): @@ -2977,15 +3635,17 @@ def forward(self, x): y = torch.ones(()) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Ones(), x) class Full(torch.nn.Module): def forward(self, x): - y = torch.full((), 1.) + y = torch.full((), 1.0) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Full(), x) class Empty(torch.nn.Module): @@ -2993,7 +3653,8 @@ def forward(self, x): y = torch.empty(()).fill_(0) y += x return y - x = torch.tensor(42.) + + x = torch.tensor(42.0) self.run_test(Empty(), x) def test_std(self): @@ -3267,18 +3928,15 @@ def forward(self, input): self.run_test(model, x) def test_bitshift(self): - class BitshiftModel(torch.nn.Module): - def forward(self, input, input2): - return input >> 1, input << 3.1, \ - input2 >> torch.tensor([1, 2]), input2 << 4.2 - input = torch.arange(24, dtype=torch.float32).reshape(3, 4, 2) - input2 = torch.arange(24, dtype=torch.int64).reshape(3, 4, 2) - self.run_test(BitshiftModel(), (input, input2)) - - def test_bitshift_other_fp(self): class BitshiftModel(torch.nn.Module): def forward(self, input): - return input << 2.4 + return ( + input >> 1, + input << 3, + input >> torch.tensor([1, 2]), + input << 4, + ) + input = torch.arange(24, dtype=torch.int64).reshape(3, 4, 2) self.run_test(BitshiftModel(), input) @@ -3288,8 +3946,13 @@ def forward(self, input): def test_bitshift_uint8(self): class BitshiftModel(torch.nn.Module): def forward(self, input, input2): - return input >> 1, input << 3., \ - input2 >> torch.tensor([1, 2], dtype=torch.uint8), input2 << 4. 
+ return ( + input >> 1, + input << 3, + input2 >> torch.tensor([1, 2], dtype=torch.uint8), + input2 << 4, + ) + input = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) input2 = torch.arange(24, dtype=torch.uint8).reshape(3, 4, 2) self.run_test(BitshiftModel(), (input, input2)) @@ -3353,6 +4016,7 @@ class IndexSelectScalerIndexModel(torch.nn.Module): def forward(self, x): index = 2 return torch.index_select(x, 1, torch.tensor(index)) + x = torch.randn(3, 4) self.run_test(IndexSelectScalerIndexModel(), x) @@ -3365,6 +4029,7 @@ def __init__(self, index_base): def forward(self, x, index_offset): index = self.index_base + index_offset return torch.index_select(x, 1, index) + x = torch.randn(3, 4) offset = 2 index_offset = torch.tensor(offset) @@ -3385,9 +4050,19 @@ class MyModule(torch.nn.Module): def forward(self, x): return torch.topk(x, 3) - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) self.run_test(MyModule(), x) + @skipIfUnsupportedMinOpsetVersion(10) + def test_topk_int32_k(self): + class Model(torch.nn.Module): + def forward(self, x, k): + return torch.topk(x, k) + + x = torch.arange(1.0, 6.0) + k = torch.tensor(3, dtype=torch.int32) + self.run_test(Model(), (x, k)) + @skipIfUnsupportedMinOpsetVersion(11) def test_topk_smallest_unsorted(self): class MyModule(torch.nn.Module): @@ -3398,7 +4073,7 @@ def forward(self, x, k): topk_sorted = torch.topk(x, k, largest=False, sorted=True) return topk_sorted, torch.sort(topk_unsorted.values).values - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) self.run_test(MyModule(), (x, k)) @@ -3409,11 +4084,11 @@ class MyModuleDynamic(torch.jit.ScriptModule): def forward(self, x, k): return torch.topk(x, k) - x = torch.arange(1., 6., requires_grad=True) + x = torch.arange(1.0, 6.0, requires_grad=True) k = torch.tensor(3) self.run_test(MyModuleDynamic(), [x, k]) - @disableScriptTest() # Python builtin apply of FunctionMeta object is currently not supported in Torchscript. + @skipScriptTest() # Python builtin apply of FunctionMeta object is currently not supported in Torchscript. @skipIfUnsupportedMinOpsetVersion(11) # Clip op min is an input since opset 11. 
def test_auto_grad(self): class MyClip(torch.autograd.Function): @@ -3428,12 +4103,15 @@ def forward(ctx, input): ctx.save_for_backward(input) return input.clamp(min=0) - def symbolic_python_op(g: torch._C.Graph, n: torch._C.Node, *args, **kwargs): + def symbolic_python_op( + ctx: torch.onnx.SymbolicContext, g: torch._C.Graph, *args, **kwargs + ): + n = ctx.cur_node name = kwargs["name"] if name == "MyClip": - return g.op("Clip", args[0], args[1]) + return g.op("Clip", args[0], args[1], outputs=n.outputsSize()) elif name == "MyRelu": - return g.op("Relu", args[0]) + return g.op("Relu", args[0], outputs=n.outputsSize()) else: return _unimplemented("prim::PythonOp", "unknown node kind: " + name) @@ -3443,6 +4121,7 @@ def symbolic_python_op(g: torch._C.Graph, n: torch._C.Node, *args, **kwargs): class MyClipModule(torch.nn.Module): def forward(self, x, min): return MyClip.apply(x, min) + x = torch.randn(3, 3) min = torch.tensor([0.0]) self.run_test(MyClipModule(), (x, min)) @@ -3450,9 +4129,41 @@ def forward(self, x, min): class MyReluModule(torch.nn.Module): def forward(self, x): return MyRelu.apply(x) + x = torch.randn(3, 3) self.run_test(MyReluModule(), x) + def test_clip_int(self): + class MyClipInt(torch.nn.Module): + def forward(self, x): + return torch.clamp(x, 0, 1) + + self.run_test(MyClipInt(), torch.randn(3, 3).to(torch.int64)) + + def test_relu_int(self): + self.run_test(torch.nn.ReLU(), torch.randn(3, 3).to(torch.int32)) + + def test_pad_int(self): + class MyPadInt(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.pad(x, (1, 1)) + + self.run_test(MyPadInt(), torch.randn(3, 3).to(torch.int32)) + + def test_min_int(self): + class MyMinInt(torch.nn.Module): + def forward(self, x): + return torch.min(x, x + 1) + + self.run_test(MyMinInt(), torch.randn(3, 3).to(torch.int32)) + + def test_max_int(self): + class MyMaxnInt(torch.nn.Module): + def forward(self, x): + return torch.max(x, x + 1) + + self.run_test(MyMaxnInt(), torch.randn(3, 3).to(torch.int32)) + @skipIfUnsupportedOpsetVersion([7]) def test_normalize(self): class Model(torch.nn.Module): @@ -3483,7 +4194,6 @@ def test_batchnorm1d_noaffine(self): x = torch.randn(10, 10, 128) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_batchnorm1d_norunningstats(self): x = torch.randn(10, 10) model = torch.nn.BatchNorm1d(10, track_running_stats=False) @@ -3502,7 +4212,6 @@ def test_batchnorm2d_noaffine(self): model = torch.nn.BatchNorm2d(3, affine=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_batchnorm2d_norunningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.BatchNorm2d(3, track_running_stats=False) @@ -3518,7 +4227,9 @@ def test_batchnorm3d_noaffine(self): model = torch.nn.BatchNorm3d(3, affine=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because ConstantOfShape op is not supported for opset < 9 + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because ConstantOfShape op is not supported for opset < 9 def test_instancenorm1d_runningstats(self): x = torch.randn(10, 5, 128) model = torch.nn.InstanceNorm1d(5, affine=True, track_running_stats=True) @@ -3527,7 +4238,6 @@ def test_instancenorm1d_runningstats(self): model = torch.nn.InstanceNorm1d(5, affine=False, track_running_stats=True) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def 
test_instancenorm1d_norunningstats(self): x = torch.randn(10, 5, 128) model = torch.nn.InstanceNorm1d(5, affine=True, track_running_stats=False) @@ -3536,7 +4246,9 @@ def test_instancenorm1d_norunningstats(self): model = torch.nn.InstanceNorm1d(5, affine=False, track_running_stats=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because ConstantOfShape op is not supported for opset < 9 + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because ConstantOfShape op is not supported for opset < 9 def test_instancenorm2d_runningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.InstanceNorm2d(3, affine=True, track_running_stats=True) @@ -3545,7 +4257,6 @@ def test_instancenorm2d_runningstats(self): model = torch.nn.InstanceNorm2d(3, affine=False, track_running_stats=True) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm2d_norunningstats(self): x = torch.randn(10, 3, 128, 128) model = torch.nn.InstanceNorm2d(3, affine=True, track_running_stats=False) @@ -3554,7 +4265,9 @@ def test_instancenorm2d_norunningstats(self): model = torch.nn.InstanceNorm2d(3, affine=False, track_running_stats=False) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # Because ConstantOfShape op is not supported for opset < 9 + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because ConstantOfShape op is not supported for opset < 9 def test_instancenorm3d_runningstats(self): x = torch.randn(10, 3, 128, 128, 128) model = torch.nn.InstanceNorm3d(3, affine=True, track_running_stats=True) @@ -3563,7 +4276,6 @@ def test_instancenorm3d_runningstats(self): model = torch.nn.InstanceNorm3d(3, affine=False, track_running_stats=True) self.run_test(model, x) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm3d_norunningstats(self): x = torch.randn(10, 3, 128, 128, 128) model = torch.nn.InstanceNorm3d(3, affine=True, track_running_stats=False) @@ -3579,7 +4291,9 @@ def forward(self, input, indices): values = 1.0 return input.scatter(1, indices, values) - input = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], dtype=torch.float64) + input = torch.tensor( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], dtype=torch.float64 + ) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(ScatterModel(), input=(input, indices)) @@ -3593,7 +4307,9 @@ def forward(self, input, indices): values = 1.0 return input.scatter(1, indices, values) - input = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], dtype=torch.float32) + input = torch.tensor( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], dtype=torch.float32 + ) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(ScatterModel(), input=(input, indices)) @@ -3603,7 +4319,7 @@ class ScatterModel(torch.nn.Module): def forward(self, input, indices, values): return input.scatter(1, indices, values) - input = torch.tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]) + input = torch.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) values = torch.tensor([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]) self.run_test(ScatterModel(), input=(input, indices, values)) @@ -3636,7 +4352,7 @@ def forward(self, input, indices, values): self.run_test(ScatterModel(), input=(input, indices, values)) @torch.jit.script - def scatter_sum(src: torch.Tensor, index: torch.Tensor): 
+ def scatter_sum(src: Tensor, index: Tensor): size = src.size() out = torch.zeros(size, dtype=src.dtype) return out.scatter_add_(1, index, src) @@ -3649,6 +4365,18 @@ def forward(self, src, index): index = torch.tensor([[0, 1], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(ScatterModel(), (src, index)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_bucketize(self): + class BucketModel(torch.nn.Module): + def forward(self, input, boundaries): + return torch.bucketize(input, boundaries), torch.bucketize( + input, boundaries, right=True + ) + + input = torch.tensor([[2, 5, 10], [6, 8, 3]]) + boundaries = torch.tensor([1, 5, 7, 8, 10]) + self.run_test(BucketModel(), (input, boundaries)) + @skipIfUnsupportedMinOpsetVersion(9) def test_one_hot(self): class OneHot(torch.nn.Module): @@ -3677,11 +4405,11 @@ class GatherModel(torch.nn.Module): def forward(self, input, indices): return input.gather(1, indices) - input = torch.tensor([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]) + input = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(GatherModel(), input=(input, indices)) - @disableScriptTest() # Scripting error: Cannot instantiate nn module + @skipScriptTest() # Scripting error: Cannot instantiate nn module def test_gather_constant_fold(self): class GatherModule(torch.nn.Module): def __init__(self): @@ -3727,10 +4455,16 @@ def forward(self, x): return x x = torch.randn(1, 3, 224, 224) - self.run_test(GatherModule(), (x,), - dynamic_axes={"input": {0: "batch", 2: "height", 3: "width"}, - "output": {0: "batch", 1: "class", 2: "height", 3: "width"}}, - input_names=['input'], output_names=['output']) + self.run_test( + GatherModule(), + (x,), + dynamic_axes={ + "input": {0: "batch", 2: "height", 3: "width"}, + "output": {0: "batch", 1: "class", 2: "height", 3: "width"}, + }, + input_names=["input"], + output_names=["output"], + ) @skipIfUnsupportedOpsetVersion([13]) @skipIfUnsupportedMinOpsetVersion(9) @@ -3753,7 +4487,9 @@ class ExpandTensorSizeModel(torch.nn.Module): def forward(self, input, size): return input.expand(size) - input = torch.randn(3,) + input = torch.randn( + 3, + ) size = torch.tensor(-1) self.run_test(ExpandTensorSizeModel(), input=(input, size)) @@ -3761,27 +4497,33 @@ def forward(self, input, size): def test_dynamic_expand_as(self): class Model(torch.nn.Module): def forward(self, x): - x[:, x.size(0):] = 0 + x[:, x.size(0) :] = 0 return x x = torch.ones(2, 5) x2 = torch.randn(3, 4) - self.run_test(Model(), (x, ), - input_names=["x"], - dynamic_axes={"x": [0, 1]}, - test_with_inputs=[x2]) + self.run_test( + Model(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + test_with_inputs=[x2], + ) class Model(torch.nn.Module): def forward(self, x): - x[:, x.size(0):] = torch.tensor([1, 2, 3]) + x[:, x.size(0) :] = torch.tensor([1, 2, 3]) return x x = torch.ones(2, 5, 3) x2 = torch.randn(3, 4, 3) - self.run_test(Model(), (x, ), - input_names=["x"], - dynamic_axes={"x": [0, 1, 2]}, - test_with_inputs=[x2]) + self.run_test( + Model(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + test_with_inputs=[x2], + ) def test_multinomial(self): class Multinomial(torch.nn.Module): @@ -3856,6 +4598,7 @@ def test_reduced_min_max(self): class ReducedMinMaxModule(torch.nn.Module): def forward(self, input): return torch.min(input, dim=-1)[0], torch.max(input, dim=0)[0] + x = torch.randint(10, (4, 4), dtype=torch.int32) self.run_test(ReducedMinMaxModule(), x) @@ -3893,7 
+4636,9 @@ def forward(self, x): self.run_test(model, input) def test_softmax_large_values(self): - input = torch.tensor([[-1e12, -1e12, -1e12], [1e12, 0.0, -5.0], [3.0, 4.0, 5.0]]) + input = torch.tensor( + [[-1e12, -1e12, -1e12], [1e12, 0.0, -5.0], [3.0, 4.0, 5.0]] + ) for i in range(-2, 1): model = torch.nn.Softmax(dim=i) self.run_test(model, input) @@ -3984,7 +4729,9 @@ def test_lstm(self): class LSTMModel(torch.nn.Module): def __init__(self): super().__init__() - self.rnn = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.rnn = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) def forward(self, x, h0, c0): return self.rnn(x, (h0, c0)) @@ -3999,7 +4746,9 @@ def test_lstm_cell(self): class LSTMCellModel(torch.nn.Module): def __init__(self, bias): super().__init__() - self.lstm_cell = torch.nn.LSTMCell(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, bias=bias) + self.lstm_cell = torch.nn.LSTMCell( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, bias=bias + ) def forward(self, x, h0, c0): return self.lstm_cell(x, (h0, c0)) @@ -4015,7 +4764,9 @@ def test_lstm_default_init_state(self): class LSTMModel(torch.nn.Module): def __init__(self): super().__init__() - self.rnn = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.rnn = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) def forward(self, x): return self.rnn(x) @@ -4028,7 +4779,9 @@ def test_lstm_fixed_batch_size(self): class LSTMModel(torch.nn.Module): def __init__(self): super(LSTMModel, self).__init__() - self.lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) self.RNN_HIDDEN_SIZE = RNN_HIDDEN_SIZE def forward(self, input): @@ -4040,14 +4793,18 @@ def forward(self, input): input = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) # verify with different input of same batch size input2 = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) - self.run_test(LSTMModel(), input, fixed_batch_size=True, test_with_inputs=[input2]) + self.run_test( + LSTMModel(), input, fixed_batch_size=True, test_with_inputs=[input2] + ) @skipIfUnsupportedMinOpsetVersion(9) def test_lstm_post_fix_init_state(self): class LSTMModel(torch.nn.Module): def __init__(self): super(LSTMModel, self).__init__() - self.lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) + self.lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) self.RNN_HIDDEN_SIZE = RNN_HIDDEN_SIZE def forward(self, input): @@ -4060,20 +4817,28 @@ def forward(self, input): input = torch.randn(RNN_SEQUENCE_LENGTH, 1, RNN_INPUT_SIZE) # verify with different input of different batch size input2 = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) - self.run_test(model, input, input_names=["input.1"], dynamic_axes={"input.1" : {0 : "seq", 1 : "batch"}}, - test_with_inputs=[input2]) + self.run_test( + model, + input, + input_names=["input.1"], + dynamic_axes={"input.1": {0: "seq", 1: "batch"}}, + test_with_inputs=[input2], + ) def test_lstm_constant_folding(self): class LstmNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(LstmNet, self).__init__() - self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.lstm = torch.nn.LSTM( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) - def forward(self, input, 
initial_state: Tuple[torch.Tensor, torch.Tensor]): + def forward(self, input, initial_state: Tuple[Tensor, Tensor]): return self.lstm(input, initial_state) - def get_LstmNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_LstmNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = LstmNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -4094,9 +4859,15 @@ def test_lstm_no_bias(self): class LstmNet(torch.nn.Module): def __init__(self, num_layers, bidirectional): super(LstmNet, self).__init__() - self.lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers, bias=False, bidirectional=bidirectional) + self.lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers, + bias=False, + bidirectional=bidirectional, + ) - def forward(self, input, initial_state: Tuple[torch.Tensor, torch.Tensor]): + def forward(self, input, initial_state: Tuple[Tensor, Tensor]): return self.lstm(input, initial_state) def get_LstmNet_model_and_inputs(num_layers, bidirectional): @@ -4109,16 +4880,54 @@ def get_LstmNet_model_and_inputs(num_layers, bidirectional): num_layers = [1, 1, 2, 3] bidirectional = [True, False, True, False] - models_and_inputs = [get_LstmNet_model_and_inputs(n, b) for n, b in zip(num_layers, bidirectional)] + models_and_inputs = [ + get_LstmNet_model_and_inputs(n, b) + for n, b in zip(num_layers, bidirectional) + ] for model, input in models_and_inputs: self.run_test(model, input) - @disableScriptTest() + @skipIfUnsupportedMinOpsetVersion(9) + def test_lstm_sequence(self): + class LstmNet(torch.nn.Module): + def __init__(self): + super().__init__() + self.rnn1 = torch.nn.LSTM(8, 8, bidirectional=True, batch_first=True) + self.linear1 = torch.nn.Linear(8 * 2, 8) + self.rnn2 = torch.nn.LSTM(8, 8, bidirectional=True, batch_first=True) + self.linear2 = torch.nn.Linear(8 * 2, 8) + + def forward(self, input): + rnn_output1, _ = self.rnn1(input) + linear_output1 = self.linear1(rnn_output1) + rnn_output2, _ = self.rnn2(linear_output1) + linear_output2 = self.linear2(rnn_output2) + return linear_output2 + + input = torch.zeros((1, 100, 8), dtype=torch.float32) + self.run_test( + LstmNet(), + input, + input_names=["input"], + output_names=["output"], + dynamic_axes={ + "input": {0: "batch_size", 1: "w", 2: "h"}, + "output": {0: "batch_size", 1: "w", 2: "h"}, + }, + ) + + @skipScriptTest() def test_rnn_no_bias(self): def make_model(layers, packed_sequence): batch_first = True if packed_sequence == 2 else False - model = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, bidirectional=False, - batch_first=batch_first, bias=False) + model = torch.nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional=False, + batch_first=batch_first, + bias=False, + ) if packed_sequence == 1: model = RnnModelWithPackedSequence(model, False) @@ -4147,7 +4956,9 @@ def make_input(batch_size, layers, packed_sequence): layers = [1, 3, 1, 3, 1, 3] packed_sequence = [0, 0, 1, 1, 2, 2] models = [make_model(l, p) for l, p in zip(layers, packed_sequence)] - inputs = [make_input(RNN_BATCH_SIZE, l, p) for l, p in zip(layers, packed_sequence)] + inputs = [ + make_input(RNN_BATCH_SIZE, l, p) for l, p in zip(layers, packed_sequence) + ] for model, input in zip(models, inputs): self.run_test(model, input, batch_size=RNN_BATCH_SIZE) @@ -4156,14 +4967,21 @@ def test_gru_no_bias(self): class 
GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(GruNet, self).__init__() - self.mygru = torch.nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional, bias=False) + self.mygru = torch.nn.GRU( + input_size, + hidden_size, + num_layers, + bidirectional=bidirectional, + bias=False, + ) def forward(self, input, initial_state): out = self.mygru(input, initial_state) return out - def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_GruNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = GruNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -4176,8 +4994,12 @@ def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, batch_size = [3, 4] seq_len = [5, 7] bidirectional = [True, False] - models_and_inputs = [get_GruNet_model_and_inputs(i, h, n, b, s, bi) - for i, h, n, b, s, bi in zip(input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional)] + models_and_inputs = [ + get_GruNet_model_and_inputs(i, h, n, b, s, bi) + for i, h, n, b, s, bi in zip( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ) + ] for model, input in models_and_inputs: self.run_test(model, input, do_constant_folding=True) @@ -4185,14 +5007,17 @@ def test_gru_constant_folding(self): class GruNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_layers, bidirectional): super(GruNet, self).__init__() - self.mygru = torch.nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional) + self.mygru = torch.nn.GRU( + input_size, hidden_size, num_layers, bidirectional=bidirectional + ) def forward(self, input, initial_state): out = self.mygru(input, initial_state) return out - def get_GruNet_model_and_inputs(input_size, hidden_size, num_layers, batch_size, - seq_len, bidirectional): + def get_GruNet_model_and_inputs( + input_size, hidden_size, num_layers, batch_size, seq_len, bidirectional + ): num_directions = 2 if bidirectional else 1 model = GruNet(input_size, hidden_size, num_layers, bidirectional) input = torch.randn(seq_len, batch_size, input_size) @@ -4218,6 +5043,28 @@ def forward(self, input, other): y = torch.randn(4, 1, requires_grad=True) self.run_test(model, (x, y)) + def test_amax_amin(self): + class Model(torch.nn.Module): + def forward(self, x): + return torch.amax(x, dim=0, keepdim=True), torch.amin( + x, dim=[0, 1], keepdim=False + ) + + model = Model() + x = torch.randn(4, 4) + self.run_test(model, x) + + def test_aminmax(self): + class Model(torch.nn.Module): + def forward(self, x): + return torch.aminmax(x, dim=1, keepdim=True), torch.aminmax( + x, keepdim=False + ) + + model = Model() + x = torch.randn(3, 4) + self.run_test(model, x) + @skipIfUnsupportedMinOpsetVersion(9) def test_arange_end(self): class ArangeScript(torch.jit.ScriptModule): @@ -4291,14 +5138,24 @@ def test_arange_start_end_step(self): class ArangeScript(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, a): - return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) x = torch.randn(3, 4, requires_grad=True) self.run_test(ArangeScript(), x) class ArangeModel(torch.nn.Module): def forward(self, a): - 
return torch.arange(2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float).view(-1, 1) + a + return ( + torch.arange( + 2, a.size(0) * a.size(1) + 2, a.size(1), dtype=torch.float + ).view(-1, 1) + + a + ) self.run_test(ArangeModel(), x) @@ -4307,14 +5164,20 @@ def test_arange_start_end_step_notype(self): class ArangeScript(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, a): - return torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + a + return ( + torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + + a + ) x = torch.randn(3, 4, requires_grad=True) self.run_test(ArangeScript(), x) class ArangeModel(torch.nn.Module): def forward(self, a): - return torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + a + return ( + torch.arange(2.7, a.size(0) * a.size(1) + 2, a.size(1)).view(-1, 1) + + a + ) self.run_test(ArangeModel(), x) @@ -4424,12 +5287,14 @@ def test_eq(self): class EqualModel(torch.nn.Module): def forward(self, input, other): return input == other + self._test_compare_ops(EqualModel(), 2) def test_gt(self): class GreaterModel(torch.nn.Module): def forward(self, input, other): return input > other + self._test_compare_ops(GreaterModel(), 2) @skipIfUnsupportedMinOpsetVersion(9) @@ -4437,37 +5302,41 @@ def test_ge(self): class GreaterOrEqualModel(torch.nn.Module): def forward(self, input, other): return input >= other + self._test_compare_ops(GreaterOrEqualModel(), 2) def test_gt_scalar(self): class GreaterModel(torch.nn.Module): def forward(self, input): return input > 1 + self._test_compare_ops(GreaterModel(), 1) def test_gt_primitive(self): class GreaterModel(torch.nn.Module): def __init__(self): super().__init__() - self.y : int = 2 + self.y: int = 2 def forward(self, x: int): return self.y > x x = 3 - self.run_test(GreaterModel(), (x, )) + self.run_test(GreaterModel(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) def test_ge_scalar(self): class GreaterOrEqualModel(torch.nn.Module): def forward(self, input): return input >= 1 + self._test_compare_ops(GreaterOrEqualModel(), 1) def test_lt(self): class LessModel(torch.nn.Module): def forward(self, input, other): return input > other + self._test_compare_ops(LessModel(), 2) @skipIfUnsupportedMinOpsetVersion(9) @@ -4475,12 +5344,14 @@ def test_le(self): class LessOrEqualModel(torch.nn.Module): def forward(self, input, other): return input <= other + self._test_compare_ops(LessOrEqualModel(), 2) def test_lt_scalar(self): class LessModel(torch.nn.Module): def forward(self, input): return input < 1 + self._test_compare_ops(LessModel(), 1) @skipIfUnsupportedMinOpsetVersion(9) @@ -4488,6 +5359,7 @@ def test_le_scalar(self): class LessOrEqualModel(torch.nn.Module): def forward(self, input): return input <= 1 + self._test_compare_ops(LessOrEqualModel(), 1) def test_matmul(self): @@ -4519,10 +5391,12 @@ def forward(self, input, other): def _argmin_argmax_model(self, input): class ArgminArgmaxModel(torch.nn.Module): def forward(self, input): - return torch.argmin(input), \ - torch.argmax(input), \ - torch.argmin(input, keepdim=True), \ - torch.argmax(input, keepdim=True) + return ( + torch.argmin(input), + torch.argmax(input), + torch.argmin(input, keepdim=True), + torch.argmax(input, keepdim=True), + ) self.run_test(ArgminArgmaxModel(), input) @@ -4535,8 +5409,7 @@ def test_argmin_argmax(self): # same value appears multiple times in the tensor @skipIfUnsupportedMinOpsetVersion(12) def test_argmin_argmax_select_last_index(self): - input = torch.tensor([[1., 2., 3.], - [1., 
1., 2.]]) + input = torch.tensor([[1.0, 2.0, 3.0], [1.0, 1.0, 2.0]]) self._argmin_argmax_model(input) input = torch.ones(7, 3, 5) @@ -4593,6 +5466,15 @@ def forward(self, x): x = torch.tensor([[1, 2], [3, 4]]) self.run_test(RepeatsDimsModel2(), (x,)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_repeat_interleave_noop(self): + class Model(torch.nn.Module): + def forward(self, x): + return x.repeat_interleave(1, dim=1) + + x = torch.randn(4, 1, 8) + self.run_test(Model(), (x,)) + @skipIfUnsupportedMinOpsetVersion(13) def test_dynamic_repeat_interleave(self): class SingleDynamicModel(torch.nn.Module): @@ -4602,8 +5484,13 @@ def forward(self, x): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) another_x = torch.tensor([[7, 8], [5, 6]]) - self.run_test(SingleDynamicModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1" : {1 : "w"}}) + self.run_test( + SingleDynamicModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": {1: "w"}}, + ) class NegDynamicModel(torch.nn.Module): def forward(self, x): @@ -4612,8 +5499,13 @@ def forward(self, x): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) another_x = torch.tensor([[7, 8], [5, 6]]) - self.run_test(NegDynamicModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1" : {1 : "w"}}) + self.run_test( + NegDynamicModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": {1: "w"}}, + ) class SingleDynamicModelFloat(torch.nn.Module): def forward(self, x): @@ -4622,8 +5514,13 @@ def forward(self, x): x = torch.tensor([[1.1, 2.1], [3.1, 4.1]]) another_x = torch.tensor([[7.1, 8.1], [5.1, 6.1]]) - self.run_test(SingleDynamicModelFloat(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1" : {0 : "h"}}) + self.run_test( + SingleDynamicModelFloat(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": {0: "h"}}, + ) class DynamicRepeatsModel(torch.nn.Module): def forward(self, x, repeats): @@ -4633,9 +5530,13 @@ def forward(self, x, repeats): another_x = torch.tensor([[7, 8], [5, 6]]) repeats = torch.tensor([2]) another_repeats = torch.tensor([4]) - self.run_test(DynamicRepeatsModel(), (x, repeats), test_with_inputs=[(another_x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"input_1" : {1 : "w"}, "repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel(), + (x, repeats), + test_with_inputs=[(another_x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"input_1": {1: "w"}, "repeats_1": {0: "r"}}, + ) class DynamicRepeatsModel2(torch.nn.Module): def forward(self, x, repeats): @@ -4644,9 +5545,13 @@ def forward(self, x, repeats): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) repeats = torch.tensor([2]) another_repeats = torch.tensor([4]) - self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel2(), + (x, repeats), + test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1": {0: "r"}}, + ) @skipIfUnsupportedMinOpsetVersion(13) def test_multiple_dynamic_repeat_interleave(self): @@ -4657,9 +5562,13 @@ def forward(self, x, repeats): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) repeats = torch.tensor([2, 3, 4]) another_repeats = torch.tensor([4, 3, 2]) - self.run_test(DynamicRepeatsModel(), (x, 
repeats), test_with_inputs=[(x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel(), + (x, repeats), + test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1": {0: "r"}}, + ) class DynamicRepeatsModel2(torch.nn.Module): def forward(self, x, repeats): @@ -4668,9 +5577,13 @@ def forward(self, x, repeats): x = torch.tensor([[1, 2, 4], [3, 4, 7]]) repeats = torch.tensor([2, 3]) another_repeats = torch.tensor([4, 3]) - self.run_test(DynamicRepeatsModel2(), (x, repeats), test_with_inputs=[(x, another_repeats)], - input_names=["input_1", "repeats_1"], - dynamic_axes={"repeats_1" : {0 : "r"}}) + self.run_test( + DynamicRepeatsModel2(), + (x, repeats), + test_with_inputs=[(x, another_repeats)], + input_names=["input_1", "repeats_1"], + dynamic_axes={"repeats_1": {0: "r"}}, + ) def test_view(self): class ViewModel(torch.nn.Module): @@ -4687,8 +5600,12 @@ def forward(self, input, other): x = torch.randn(2, 3, 4) shape = torch.randn(6, 4) - self.run_test(ViewModel(), (x, shape), - input_names=["x", "shape"], dynamic_axes={"x": [0, 1, 2], "shape": [0, 1]}) + self.run_test( + ViewModel(), + (x, shape), + input_names=["x", "shape"], + dynamic_axes={"x": [0, 1, 2], "shape": [0, 1]}, + ) self.run_test(ViewModel(), (x, shape), remained_onnx_input_idx=[0]) def test_view_dynamic_zero_dim(self): @@ -4699,8 +5616,17 @@ def forward(self, input): x = torch.ones(2) another_x = torch.empty((0,)) - self.run_test(ViewModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1": [0, ]}) + self.run_test( + ViewModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={ + "input_1": [ + 0, + ] + }, + ) def test_view_as(self): class ViewModel(torch.nn.Module): @@ -4741,7 +5667,7 @@ def forward(self, input, weight, bias): z = torch.randn(1) self.run_test(LinearModel(), (x, y, z)) - @disableScriptTest() + @skipScriptTest() def test_weight_norm(self): # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) @@ -4765,7 +5691,7 @@ def test_weight_norm(self): x = torch.randn(3, 3, 5, requires_grad=True) self.run_test(model, x) - @disableScriptTest() + @skipScriptTest() def test_weight_norm_nodim(self): # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) @@ -4785,6 +5711,9 @@ def forward(self, input): x = torch.randint(10, (1, 2, 3, 4)) self.run_test(FlattenModel(), x) + x = torch.randn(4) + self.run_test(FlattenModel(), x) + def test_flatten2d(self): class FlattenModel(torch.nn.Module): def forward(self, input): @@ -4796,7 +5725,11 @@ def forward(self, input): def test_flatten2d_neg(self): class FlattenModel(torch.nn.Module): def forward(self, x): - return torch.flatten(x, 1, -1), torch.flatten(x, 0, -2), torch.flatten(x, 1, -2) + return ( + torch.flatten(x, 1, -1), + torch.flatten(x, 0, -2), + torch.flatten(x, 1, -2), + ) x = torch.randint(10, (1, 2, 3, 4)) self.run_test(FlattenModel(), x) @@ -4811,11 +5744,14 @@ def forward(self, x): x = torch.randn(batch_size, 5, 4, 5) y = torch.randn(5, 5, 4, 5) model = MyModule() - self.run_test(model, x, test_with_inputs=[y], - input_names=["input"], - output_names=["output"], - dynamic_axes={"input" : {0 : "batch_size"}, - "output" : {0 : "batch_size"}}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["input"], + output_names=["output"], + 
dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_getitem(self): @@ -4845,7 +5781,7 @@ def forward(self, x, y, i: int): i = 3 self.run_test(torch.jit.script(M()), (x, y, i)) - @disableScriptTest() # torch.nonzero(x, as_tuple=True) is not scriptable. + @skipScriptTest() # torch.nonzero(x, as_tuple=True) is not scriptable. @skipIfUnsupportedMinOpsetVersion(9) def test_nonzero(self): class NonzeroModel(torch.nn.Module): @@ -4888,8 +5824,13 @@ def forward(self, input): return len(input.unbind()) + input x = torch.randn(4, 5) - self.run_test(LenModel(), x, input_names=["input"], dynamic_axes={"input": {0: "seq"}}, - test_with_inputs=(torch.randn(5, 5),)) + self.run_test( + LenModel(), + x, + input_names=["input"], + dynamic_axes={"input": {0: "seq"}}, + test_with_inputs=(torch.randn(5, 5),), + ) @skipIfUnsupportedMinOpsetVersion(9) def test_len_list(self): @@ -4919,7 +5860,7 @@ def forward(self, input): x = torch.randn(3, 4, 5) self.run_test(UnbindModel2(), x) - @disableScriptTest() # scripting tests run for opsets > 11. See: test_split_script + @skipScriptTest() # scripting tests run for opsets > 11. See: test_split_script def test_split(self): class SplitModel(torch.nn.Module): def forward(self, input): @@ -4966,12 +5907,12 @@ def forward(self, input): self.run_test(SplitModel3(), x) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_split_size_as_list(self): class SplitModel(torch.nn.Module): def forward(self, input, split_sizes: List[int]): out = [] - split_list: List[torch.Tensor] = input.split(split_sizes) + split_list: List[Tensor] = input.split(split_sizes) for ob in split_list: out.append(ob) @@ -4992,8 +5933,12 @@ def forward(self, x, y, t): x = torch.randn(2, 3) y = torch.randn(2, 4) t = torch.randn(2, 7) - self.run_test(SplitModule(), (x, y, t), input_names=["x", "y", "t"], - dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}) + self.run_test( + SplitModule(), + (x, y, t), + input_names=["x", "y", "t"], + dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}, + ) self.run_test(SplitModule(), (x, y, t), remained_onnx_input_idx=[2]) @skipIfUnsupportedMinOpsetVersion(11) @@ -5022,8 +5967,12 @@ def forward(self, x): x = torch.randn(4, 384, 2) input_names = ["logits"] - self.run_test(Split(), x, input_names=input_names, - dynamic_axes={input_names[0]: {0: 'batch'}}) + self.run_test( + Split(), + x, + input_names=input_names, + dynamic_axes={input_names[0]: {0: "batch"}}, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_chunk(self): @@ -5043,13 +5992,21 @@ def forward(self, x): for dim_size_ in range(13, 16): y = torch.randn(1, dim_size_) - self.run_test(model, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) - - self.run_test(model_neg_dim, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) + + self.run_test( + model_neg_dim, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) @skipIfUnsupportedMinOpsetVersion(11) def test_dynamic_chunk(self): @@ -5069,13 +6026,21 @@ def forward(self, x): for dim_size_ in range(13, 16): y = torch.randn(3, dim_size_) - self.run_test(model, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) - - 
self.run_test(model_neg_dim, x, test_with_inputs=[y], - input_names=["x"], - dynamic_axes={"x": {0: "batch_size", 1: "dims"}}) + self.run_test( + model, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) + + self.run_test( + model_neg_dim, + x, + test_with_inputs=[y], + input_names=["x"], + dynamic_axes={"x": {0: "batch_size", 1: "dims"}}, + ) def test_concat(self): class ConcatModel(torch.nn.Module): @@ -5168,7 +6133,6 @@ def forward(self, x): inputs = torch.randn(16) self.run_test(model, inputs) - @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(11) def test_loop_transpose(self): class LoopModel(torch.nn.Module): @@ -5262,7 +6226,7 @@ def forward(self, x): model = torch.jit.script(ListModel()) x = torch.randn(4, 4, 3, 4) - self.run_test(model, (x, )) + self.run_test(model, (x,)) @skipIfUnsupportedMinOpsetVersion(13) def test_list_append_nested_mixed_dtype(self): @@ -5383,7 +6347,9 @@ def forward(self, x): return torch.zeros(x.size()) + torch.ones(x.size()) x = torch.randn(2, 3, 4) - self.run_test(TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(TensorFactory(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) @@ -5391,10 +6357,14 @@ def test_tensor_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - return torch.zeros(x.shape, dtype=torch.float) + torch.ones(x.shape, dtype=torch.float) + return torch.zeros(x.shape, dtype=torch.float) + torch.ones( + x.shape, dtype=torch.float + ) x = torch.randn(2, 3, 4) - self.run_test(TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(TensorFactory(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) @@ -5402,26 +6372,47 @@ def test_tensor_like_factories_script(self): class TensorFactory(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): - zeros = torch.zeros_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) - ones = torch.ones_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + zeros = torch.zeros_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) + ones = torch.ones_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return zeros + ones x = torch.randn(2, 3, 4) - self.run_test(TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + TensorFactory(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(TensorFactory(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_eye(self): class TensorFactory(torch.nn.Module): def forward(self, x): - return torch.eye(x.size()[1], 3), torch.eye(4, 4, dtype=torch.long), \ - torch.eye(x.size()[1], 2, dtype=torch.long), torch.eye(x.shape[0]), \ - torch.eye(x.shape[0], dtype=torch.float64) + return ( + torch.eye(x.size()[1], 3), + torch.eye(4, 4, dtype=torch.long), + torch.eye(x.size()[1], 2, dtype=torch.long), + torch.eye(x.shape[0]), + torch.eye(x.shape[0], dtype=torch.float64), + ) x = torch.randn(2, 3, 4) another_x = torch.randn(5, 6, 7) - self.run_test(TensorFactory(), x, test_with_inputs=[another_x], - input_names=["input_1"], dynamic_axes={"input_1": [0, 1, 2]}) + 
self.run_test( + TensorFactory(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2]}, + ) @skipIfUnsupportedMinOpsetVersion(13) def test_diagonal(self): @@ -5432,9 +6423,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModel(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModel(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelNegOffset(torch.nn.Module): def forward(self, x): @@ -5443,9 +6438,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelNegOffset(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelNegOffset(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelPosOffset(torch.nn.Module): def forward(self, x): @@ -5454,9 +6453,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelPosOffset(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelPosOffset(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelWithDims(torch.nn.Module): def forward(self, x): @@ -5465,9 +6468,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelWithDims(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelWithDims(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) class DiagonalModelOffsetOverrun(torch.nn.Module): def forward(self, x): @@ -5476,9 +6483,13 @@ def forward(self, x): x = torch.randn(2, 4, 5, 2) # Other test inputs to test dynamic behavior another_x = torch.randn(5, 6, 7, 8) - self.run_test(DiagonalModelOffsetOverrun(), x, test_with_inputs=[another_x], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1, 2, 3]}) + self.run_test( + DiagonalModelOffsetOverrun(), + x, + test_with_inputs=[another_x], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1, 2, 3]}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_zero(self): @@ -5494,7 +6505,9 @@ def forward(self, x): def test_new_zeros(self): class Zero_(torch.nn.Module): def forward(self, x): - return x.new_zeros(x.shape[1:2]), x.new_zeros(x.shape[2:], dtype=torch.long) + return x.new_zeros(x.shape[1:2]), x.new_zeros( + x.shape[2:], dtype=torch.long + ) x = torch.randn(2, 3, 4) self.run_test(Zero_(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) @@ -5504,25 +6517,33 @@ def forward(self, x): def test_new_ones(self): class OnesModel(torch.nn.Module): def forward(self, x): - return x.new_ones(x.shape[1:2]), x.new_ones(x.shape[2:], dtype=torch.long) + return x.new_ones(x.shape[1:2]), x.new_ones( + x.shape[2:], dtype=torch.long + ) x = torch.randn(2, 3, 4) self.run_test(OnesModel(), x, input_names=["x"], 
dynamic_axes={"x": [0, 1, 2]}) self.run_test(OnesModel(), x, remained_onnx_input_idx=[]) - @skipIfONNXShapeInference(True) + @skipIfUnsupportedMinOpsetVersion(9) + @skipScriptTest() # torch.zeros/torch.ones with size tensor of dim != 0 not scriptable. + def test_zeros_ones_with_tensor_input(self): + class ZeroAndOnes(torch.nn.Module): + def forward(self, x): + return torch.zeros(x, 1), torch.ones(x, 1) + + x = torch.tensor([2]) + self.run_test(ZeroAndOnes(), (x,)) + @skipIfUnsupportedMinOpsetVersion(9) def test_tolist(self): class List(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, input): - cur_shape = torch._shape_as_tensor(input) - final_shape: List[int] = cur_shape.tolist() - pad_tensor = torch.zeros([1, 2] + final_shape) - return pad_tensor + res: List[int] = input.tolist() + return res - x = torch.randn(2, 3) - self.run_test(List(), (x,)) + self.run_test(List(), (torch.randint(100, (1,)),)) @skipIfUnsupportedMinOpsetVersion(9) def test_list_pass(self): @@ -5532,8 +6553,12 @@ def forward(self, x, y): x = torch.randn(2, 3, 4, 5) y = torch.randn(1, 2, 3, 4) - self.run_test(Slice(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}) + self.run_test( + Slice(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}, + ) self.run_test(Slice(), (x, y), remained_onnx_input_idx=[]) class Size(torch.nn.Module): @@ -5542,8 +6567,12 @@ def forward(self, x, y): x = torch.randn(2, 3, 4) y = torch.randn(1, 2, 3) - self.run_test(Size(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + self.run_test( + Size(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) self.run_test(Size(), (x, y), remained_onnx_input_idx=[]) class Array(torch.nn.Module): @@ -5554,8 +6583,12 @@ def forward(self, x, y): x = torch.randn(2, 3, 4) y = torch.randn(1, 2, 3) - self.run_test(Array(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + self.run_test( + Array(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) self.run_test(Array(), (x, y), remained_onnx_input_idx=[]) class List(torch.nn.Module): @@ -5566,15 +6599,22 @@ def forward(self, x, y): x = torch.randn(2, 3, 4) y = torch.randn(1, 2, 3) - self.run_test(List(), (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + self.run_test( + List(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) self.run_test(List(), (x, y), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_new_empty(self): class Emtpy(torch.nn.Module): def forward(self, x): - return x.new_empty(x.shape[0]).fill_(0), x.new_empty(x.shape[0], dtype=torch.long) * 0 + return ( + x.new_empty(x.shape[0]).fill_(0), + x.new_empty(x.shape[0], dtype=torch.long) * 0, + ) x = torch.randn(2, 3, 4) self.run_test(Emtpy(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) @@ -5584,7 +6624,9 @@ def forward(self, x): def test_new_full(self): class Full(torch.nn.Module): def forward(self, x): - return x.new_full(x.shape[1:2], 5), x.new_full(x.shape[0:1], 1.3, dtype=torch.long) + return x.new_full(x.shape[1:2], 5), x.new_full( + x.shape[0:1], 1.3, dtype=torch.long + ) x = torch.randn(2, 3, 4) self.run_test(Full(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) @@ -5599,8 +6641,12 @@ def forward(self, x, y): x = torch.randn(2, 3) y = torch.randn(2, 3) - self.run_test(Arithmetic(), (x, y), 
input_names=["x", "y"], - dynamic_axes={"x": [0, 1], "y": [0, 1]}) + self.run_test( + Arithmetic(), + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1], "y": [0, 1]}, + ) self.run_test(Arithmetic(), (x, y), remained_onnx_input_idx=[0]) @skipIfUnsupportedMinOpsetVersion(9) @@ -5643,21 +6689,33 @@ def forward(self, x, y): def test_inplace_with_loop(self): class M(torch.nn.Module): def forward(self, x): - a = torch.ones(12,) + a = torch.ones( + 12, + ) for i in range(10): - a.add_(torch.ones(12,)) + a.add_( + torch.ones( + 12, + ) + ) return a + x m = M() - x = torch.randn(12,) + x = torch.randn( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_with_loop_2(self): class M(torch.nn.Module): def forward(self, x): - _bias = torch.ones(12,) - a = torch.ones(12,) # used in loop, altered. + _bias = torch.ones( + 12, + ) + a = torch.ones( + 12, + ) # used in loop, altered. a_ref = a # not used in loop, should be altered. b = x.clone() # used in loop, not be altered. b_ref = b # not used in loop, should not be altered. @@ -5665,18 +6723,32 @@ def forward(self, x): if i == 3: for j in range(5): a += _bias - _bias.add_(torch.ones(12,)) - b = b + torch.ones(12,) - - _bias.add_(torch.ones(12,)) + _bias.add_( + torch.ones( + 12, + ) + ) + b = b + torch.ones( + 12, + ) + + _bias.add_( + torch.ones( + 12, + ) + ) a += _bias # TODO: value for a_ref is incorrect. # a_ref += torch.ones(12,) - b_ref += torch.ones(12,) + b_ref += torch.ones( + 12, + ) return _bias + x, a, b, b_ref m = M() - x = torch.zeros(12,) + x = torch.zeros( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(11) @@ -5684,18 +6756,26 @@ def test_inplace_attr_with_loop(self): class M(torch.nn.Module): def __init__(self): super().__init__() - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) def forward(self, x): - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) for i in range(10): if i == 3: for j in range(5): - self._bias += torch.arange(12,) + self._bias += torch.arange( + 12, + ) return self._bias + x m = M() - x = torch.zeros(12,) + x = torch.zeros( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(11) @@ -5703,27 +6783,47 @@ def test_inplace_attr_copy_with_loop(self): class M(torch.nn.Module): def __init__(self): super().__init__() - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) def forward(self, x): - self._bias = torch.arange(12,) + self._bias = torch.arange( + 12, + ) for i in range(10): if i == 3: for j in range(5): - self._bias.copy_(torch.arange(12,)) - self._bias.copy_(self._bias + torch.arange(12,)) - - self._bias.copy_(self._bias + torch.arange(12,)) + self._bias.copy_( + torch.arange( + 12, + ) + ) + self._bias.copy_( + self._bias + + torch.arange( + 12, + ) + ) + + self._bias.copy_( + self._bias + + torch.arange( + 12, + ) + ) return self._bias + x m = M() - x = torch.zeros(12,) + x = torch.zeros( + 12, + ) self.run_test(torch.jit.script(M()), (x)) @skipIfUnsupportedMinOpsetVersion(14) # Need onnx::Identity of sequence in opset 14 def test_inplace_sequence_with_loop(self): class M(torch.nn.Module): - def process(self, beam_hyps: List[torch.Tensor], done: torch.Tensor, x): + def process(self, beam_hyps: List[Tensor], done: Tensor, x): batch_size = x.shape[0] for i in range(batch_size): if done[i]: @@ -5742,7 +6842,7 @@ def process(self, beam_hyps: List[torch.Tensor], done: torch.Tensor, x): return beam_hyps, done def 
forward(self, x): - beam_hyps: List[torch.Tensor] = [] + beam_hyps: List[Tensor] = [] batch_size = x.shape[0] cur_len = 0 max_len = x.shape[1] @@ -5757,8 +6857,7 @@ def forward(self, x): x = torch.randn(8, 4, 3) self.run_test(torch.jit.script(M()), (x)) - - @disableScriptTest() # Sort with dynamic dim not supported in ONNX + @skipScriptTest() # Sort with dynamic dim not supported in ONNX def test_sort(self): class SortModel(torch.nn.Module): def forward(self, x): @@ -5771,7 +6870,7 @@ def forward(self, x): self.run_test(SortModel(), x) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() # Sort with dynamic dim not supported in ONNX + @skipScriptTest() # Sort with dynamic dim not supported in ONNX def test_sort_ascending(self): class SortModel(torch.nn.Module): def forward(self, x): @@ -5802,7 +6901,6 @@ def forward(self, x): @skipIfUnsupportedMinOpsetVersion(9) def test_masked_fill_inplace(self): - class MaskedFillModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, x): @@ -5872,7 +6970,32 @@ def forward(self, x): return torch.pixel_shuffle(x, upscale_factor=2) x = torch.randn(2, 16, 4, 3, requires_grad=True) + y = torch.randn(4, 32, 8, 4, requires_grad=True) self.run_test(PixelShuffle(), x) + self.run_test( + PixelShuffle(), + x, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[y], + ) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_pixel_unshuffle(self): + class PixelUnshuffle(torch.nn.Module): + def forward(self, x): + return torch.pixel_unshuffle(x, downscale_factor=2) + + x = torch.randn(2, 16, 4, 6, requires_grad=True) + y = torch.randn(4, 32, 8, 4, requires_grad=True) + self.run_test(PixelUnshuffle(), x) + self.run_test( + PixelUnshuffle(), + x, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[y], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_reciprocal(self): @@ -5895,7 +7018,6 @@ def forward(self, x): x = torch.ones(2, 3, dtype=torch.float32) self.run_test(ArithmeticModel(), x) - class ComparisonModel(torch.nn.Module): def forward(self, x, y): a = torch.tensor([12.0]) @@ -5907,7 +7029,7 @@ def forward(self, x, y): class MatMulModel(torch.nn.Module): def forward(self, x): - return (torch.mm(x, x) + x + torch.mm(x, x) + x) + return torch.mm(x, x) + x + torch.mm(x, x) + x x = torch.ones(3, 3) self.run_test(MatMulModel(), x) @@ -5923,9 +7045,19 @@ class FullModel(torch.nn.Module): # add is used for exporting full def forward(self, x): return torch.full((3, 4), x) - x = torch.tensor(12.) 
+ + x = torch.tensor(12.0) self.run_test(FullModel(), x) + class CatModel(torch.nn.Module): + def forward(self, fp16, fp32): + return torch.cat([fp16, fp32]) + + fp16 = Tensor([0.5]) + fp16 = fp16.half() + fp32 = Tensor([1.5]) + self.run_test(CatModel(), (fp16, fp32)) + @skipIfUnsupportedMinOpsetVersion(9) def test_full_like(self): class FullLikeModel(torch.nn.Module): @@ -5985,12 +7117,14 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) y = torch.randn(2, 1, 3, requires_grad=True) - self.run_test(UnfoldModel(), x, - dynamic_axes={"x": [0, 1]}, - input_names=["x"], - test_with_inputs=[y]) + self.run_test( + UnfoldModel(), + x, + dynamic_axes={"x": [0, 1]}, + input_names=["x"], + test_with_inputs=[y], + ) - @skipIfONNXShapeInference(False) def test_unfold_infer_shape(self): class UnfoldModule(torch.jit.ScriptModule): def __init__(self): @@ -6032,7 +7166,7 @@ def forward(self, input, other): self.run_test(MatmulModel(), (x, y)) x = torch.randint(10, (4, 5)) - y = torch.randint(10, (5, )) + y = torch.randint(10, (5,)) self.run_test(MatmulModel(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) # MatMul long inputs is added in ONNX opset 9. @@ -6045,16 +7179,16 @@ def forward(self, input, other): y = torch.randn(5, requires_grad=True) self.run_test(MatmulModel(), (x, y)) - x = torch.randint(10, (5, )) - y = torch.randint(10, (5, )) + x = torch.randint(10, (5,)) + y = torch.randint(10, (5,)) self.run_test(MatmulModel(), (x, y)) - @disableScriptTest() # SpectralNorm not TorchScript compatible. + @skipScriptTest() # SpectralNorm not TorchScript compatible. def test_spectral_norm(self): m = torch.nn.utils.spectral_norm(torch.nn.Linear(2, 4)) x = torch.randn(6, 2) - self.run_test(m, (x, )) + self.run_test(m, (x,)) def test_prelu(self): class PReluModel(torch.nn.Module): @@ -6067,9 +7201,17 @@ def forward(self, x): x = torch.randn(2, 3, 4) y = torch.randn(2, 4, 5) - self.run_test(PReluModel(), x, input_names=["x"], - dynamic_axes={"x": [1, 2]}, - test_with_inputs=[y]) + self.run_test( + PReluModel(), + x, + input_names=["x"], + dynamic_axes={"x": [1, 2]}, + test_with_inputs=[y], + ) + + def test_prelu_scalar(self): + x = torch.scalar_tensor(1.0) + self.run_test(torch.nn.PReLU(), x, input_names=["x"]) def test_relu6(self): class Relu6Model(torch.nn.Module): @@ -6082,9 +7224,13 @@ def forward(self, x): x = torch.randn(2, 3, 4) * 100.0 y = torch.randn(2, 4, 5) * 100.0 - self.run_test(Relu6Model(), x, input_names=['x'], - dynamic_axes={'x': [1, 2]}, - test_with_inputs=[y]) + self.run_test( + Relu6Model(), + x, + input_names=["x"], + dynamic_axes={"x": [1, 2]}, + test_with_inputs=[y], + ) def test_silu(self): class SiLUModel(torch.nn.Module): @@ -6225,7 +7371,16 @@ def forward(self, x): def test_gelu(self): class GeluModel(torch.nn.Module): def forward(self, x): - return torch.nn.functional.gelu(x) + return torch.nn.functional.gelu(x, approximate="none") + + x = torch.randn(2, 4, 5, 6, requires_grad=True) + self.run_test(GeluModel(), x) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_tanh_gelu(self): + class GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x, approximate="tanh") x = torch.randn(2, 4, 5, 6, requires_grad=True) self.run_test(GeluModel(), x) @@ -6239,6 +7394,16 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) self.run_test(InplaceAddModel(), x) + def test_addcmul(self): + class AddcmulModel(torch.nn.Module): + def forward(self, x, t1, t2): + return torch.addcmul(x, t1, t2), torch.addcmul(x, t1, t2, value=2.2) + 
+ x = torch.randn(1, 3) + t1 = torch.randn(3, 1) + t2 = torch.randn(1, 3) + self.run_test(AddcmulModel(), (x, t1, t2)) + def test_rsqrt(self): class RsqrtModel(torch.nn.Module): def forward(self, x): @@ -6251,6 +7416,7 @@ def test_rsqrt_zeros(self): class RsqrtModel(torch.nn.Module): def forward(self, x): return x.rsqrt() + x = torch.zeros(4, 2, 3, requires_grad=True, dtype=torch.float64) self.run_test(RsqrtModel(), x) @@ -6258,7 +7424,9 @@ def forward(self, x): def test_unique(self): class UniqueModel(torch.nn.Module): def forward(self, x): - return torch.unique(x, sorted=True, return_inverse=False, return_counts=True) + return torch.unique( + x, sorted=True, return_inverse=False, return_counts=True + ) x = torch.tensor([1, 3, 2, 3], dtype=torch.long) self.run_test(UniqueModel(), x) @@ -6267,7 +7435,9 @@ def forward(self, x): def test_unique_along_dim(self): class UniqueModel(torch.nn.Module): def forward(self, x): - return torch.unique(x, dim=0, sorted=True, return_inverse=True, return_counts=False) + return torch.unique( + x, dim=0, sorted=True, return_inverse=True, return_counts=False + ) x = torch.tensor([1, 3, 2, 3], dtype=torch.long) self.run_test(UniqueModel(), x) @@ -6277,6 +7447,7 @@ def test_cumsum(self): class CumSum(torch.nn.Module): def forward(self, input): return torch.cumsum(input, dim=0) + x = torch.randn(2, 3, 4) model = CumSum() self.run_test(model, x) @@ -6293,7 +7464,7 @@ def forward(self, input): x = torch.tensor([False, True, True]) self.run_test(model, x) - @disableScriptTest() # error in propagate as assign input shape + @skipScriptTest() # error in propagate as assign input shape @skipIfUnsupportedMinOpsetVersion(10) def test_embedding_bag(self): model = torch.nn.EmbeddingBag(10, 5, mode="sum", scale_grad_by_freq=True) @@ -6314,12 +7485,19 @@ def test_embedding_bag(self): def test_embedding_bag_1d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, offset, weights): - return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offset, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, + embedding_matrix, + offsets=offset, + mode="sum", + per_sample_weights=weights, + ) model = EmbeddingModel() x = torch.randint(7, (6,)) - w = torch.randn(6, ) + w = torch.randn( + 6, + ) offset = torch.tensor([0, 2, 5]) embedding_matrix = torch.rand(10, 15) self.run_test(model, (embedding_matrix, x, offset, w)) @@ -6328,43 +7506,78 @@ def forward(self, embedding_matrix, input, offset, weights): def test_embedding_bag_2d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, weights): - return torch.nn.functional.embedding_bag(input, embedding_matrix, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, embedding_matrix, mode="sum", per_sample_weights=weights + ) embedding_matrix = torch.rand(10, 15) model = EmbeddingModel() x = torch.randint(7, (2, 3)) w = torch.randn(2, 3) - self.run_test(model, (embedding_matrix, x, w)) - @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast + x2 = torch.randint(7, (4, 3)) + w2 = torch.randn(4, 3) + self.run_test( + model, + (embedding_matrix, x, w), + input_names=["embed", "x", "w"], + dynamic_axes={"x": [0], "w": [0]}, + test_with_inputs=[(embedding_matrix, x2, w2)], + ) + + @skipScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast @skipIfUnsupportedMinOpsetVersion(11) - 
@unittest.skip("Due to ONNX Loop shape inference issue. " - "https://msdata.visualstudio.com/Vienna/_workitems/edit/1352001") + @unittest.skip( + "Due to ONNX Loop shape inference issue. " + "https://msdata.visualstudio.com/Vienna/_workitems/edit/1352001" + ) def test_embedding_bag_dynamic_input(self): class EmbeddingModel1D(torch.nn.Module): def forward(self, embedding_matrix, input, weights, offsets): - return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offsets, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, + embedding_matrix, + offsets=offsets, + mode="sum", + per_sample_weights=weights, + ) model = EmbeddingModel1D() x = torch.randint(7, (6,)) - w = torch.randn(6, ) + w = torch.randn( + 6, + ) offsets = torch.tensor([0, 2, 5], dtype=torch.long) embedding_matrix = torch.rand(10, 15) x2 = torch.randint(7, (2,)) - w2 = torch.randn(2, ) + w2 = torch.randn( + 2, + ) embedding_matrix2 = torch.rand(12, 25) - offsets2 = torch.tensor([0, ], dtype=torch.long) - self.run_test(model, (embedding_matrix, x, w, offsets), - test_with_inputs=[(embedding_matrix2, x2, w2, offsets2)], - input_names=["embedding_matrix", "x", "offsets", "w"], - dynamic_axes={"embedding_matrix": [0, 1], "x": [0], "offsets": [0], "w": [0]}) + offsets2 = torch.tensor( + [ + 0, + ], + dtype=torch.long, + ) + self.run_test( + model, + (embedding_matrix, x, w, offsets), + test_with_inputs=[(embedding_matrix2, x2, w2, offsets2)], + input_names=["embedding_matrix", "x", "offsets", "w"], + dynamic_axes={ + "embedding_matrix": [0, 1], + "x": [0], + "offsets": [0], + "w": [0], + }, + ) class EmbeddingModel2D(torch.nn.Module): def forward(self, embedding_matrix, input, weights): - return torch.nn.functional.embedding_bag(input, embedding_matrix, - mode="sum", per_sample_weights=weights) + return torch.nn.functional.embedding_bag( + input, embedding_matrix, mode="sum", per_sample_weights=weights + ) model = EmbeddingModel2D() x = torch.randint(7, (2, 3)) @@ -6373,10 +7586,13 @@ def forward(self, embedding_matrix, input, weights): x2 = torch.randint(7, (3, 5)) w2 = torch.randn(3, 5) embedding_matrix2 = torch.rand(12, 25) - self.run_test(model, (embedding_matrix, x, w), - test_with_inputs=[(embedding_matrix2, x2, w2)], - input_names=["embedding_matrix", "x", "w"], - dynamic_axes={"embedding_matrix": [0, 1], "x": [0, 1], "w": [0, 1]}) + self.run_test( + model, + (embedding_matrix, x, w), + test_with_inputs=[(embedding_matrix2, x2, w2)], + input_names=["embedding_matrix", "x", "w"], + dynamic_axes={"embedding_matrix": [0, 1], "x": [0, 1], "w": [0, 1]}, + ) @skipIfUnsupportedMinOpsetVersion(8) def test_meshgrid(self): @@ -6405,7 +7621,10 @@ def forward(self, x, y, z): def test_baddbmm(self): class MyModule(torch.nn.Module): def forward(self, input, batch1, batch2): - return torch.baddbmm(input, batch1, batch2, alpha=torch.tensor(5), beta=3.5) + return torch.baddbmm( + input, batch1, batch2, alpha=torch.tensor(5), beta=3.5 + ) + x = torch.randn(10, 3, 5) batch1 = torch.randn(10, 3, 4) batch2 = torch.randn(10, 4, 5) @@ -6416,6 +7635,7 @@ def test_baddbmm_dynamic(self): class MyModule(torch.nn.Module): def forward(self, input, batch1, batch2, alpha, beta): return torch.baddbmm(input, batch1, batch2, alpha=alpha, beta=beta) + x = torch.randn(10, 3, 5) batch1 = torch.randn(10, 3, 4) batch2 = torch.randn(10, 4, 5) @@ -6425,24 +7645,36 @@ def forward(self, input, batch1, batch2, alpha, beta): self.run_test(model, (x, batch1, batch2, alpha, beta)) def test_numel(self): - class 
MyModule(torch.jit.ScriptModule): - @torch.jit.script_method + class MyModule(torch.nn.Module): def forward(self, input): return input.numel() * input x = torch.randn(2, 3, 5) + x2 = torch.randn(4, 5, 6) model = MyModule() - self.run_test(model, (x,)) + self.run_test( + model, + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + test_with_inputs=[(x2,)], + ) def test_numel_empty(self): - class MyModule(torch.jit.ScriptModule): - @torch.jit.script_method + class MyModule(torch.nn.Module): def forward(self, input): return input.numel() * input x = torch.randn(0) + x2 = torch.randn(4) model = MyModule() - self.run_test(model, (x,)) + self.run_test( + model, + (x,), + input_names=["x"], + dynamic_axes={"x": [0]}, + test_with_inputs=[(x2,)], + ) def test_dtype(self): class MyModel(torch.jit.ScriptModule): @@ -6530,6 +7762,7 @@ def test_log(self): class Log(torch.nn.Module): def forward(self, input): return torch.log(input) + x = torch.rand(2, 3, 4) model = Log() self.run_test(model, x) @@ -6538,6 +7771,7 @@ def test_log1p(self): class Log1p(torch.nn.Module): def forward(self, input): return torch.log1p(input) + x = torch.rand(2, 3, 4) model = Log1p() self.run_test(model, x) @@ -6546,6 +7780,7 @@ def test_log10(self): class Log10(torch.nn.Module): def forward(self, input): return torch.log10(input) + x = torch.rand(2, 3, 4) model = Log10() self.run_test(model, x) @@ -6580,29 +7815,30 @@ def forward(self, x, pad: List[int]): y = pad = [2, 4] self.run_test(Pad(), (x, y)) - y = pad = [torch.tensor(2, dtype=torch.int64), torch.tensor(4, dtype=torch.int64)] + y = pad = [ + torch.tensor(2, dtype=torch.int64), + torch.tensor(4, dtype=torch.int64), + ] self.run_test(Pad(), (x, y)) - @skipIfUnsupportedMaxOpsetVersion(10) + @skipScriptTest() # TODO: the logic in symbolic_opset9 doesn't handle script def test_unsupported_pad(self): class Pad(torch.nn.Module): - def forward(self, x, pad): + def forward(self, x, pad: List[int]): return torch.nn.functional.pad(x, pad) - def run(): - x = torch.randn(2, 2, 4, 4) - y = pad = (torch.tensor(2, dtype=torch.int32), torch.tensor(4, dtype=torch.int32)) - p = Pad() - f = io.BytesIO() - torch.onnx._export(p, (x, y), f) - - with self.assertRaises(RuntimeError) as cm: - run() - - the_exception = cm.exception - self.assertEqual("Unsupported: ONNX export of Pad in opset 9. The sizes of the padding must be constant. 
" + - "Please try opset version 11.", the_exception.args[0]) + x = torch.randn(2, 2, 4, 4) + y = [2, 4] + + with self.assertRaisesRegex( + RuntimeError, + ( + "Unsupported: ONNX export of Pad.*" + + "The sizes of the padding must be constant" + ), + ): + self.run_test(Pad(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) def test_if_fold(self): @@ -6614,6 +7850,7 @@ def forward(self, y): else: y = y - 1 return y + x = torch.ones((3, 4), dtype=torch.int) self.run_test(IfFoldModel(), x) @@ -6723,7 +7960,6 @@ def forward(self, x, y): self.run_test(IfFoldModel(), (x, y)) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfONNXShapeInference(False) def test_uninitialized(self): class UninitializedModel(torch.nn.Module): def forward(self, y): @@ -6738,7 +7974,6 @@ def forward(self, y): self.run_test(UninitializedModel(), x) @skipIfUnsupportedMinOpsetVersion(11) - @skipIfONNXShapeInference(False) def test_uninitialized_dynamic(self): class UninitializedModel(torch.nn.Module): def forward(self, y): @@ -6751,13 +7986,16 @@ def forward(self, y): x = torch.ones((3, 4), dtype=torch.int) y = torch.ones((6, 7), dtype=torch.int) - self.run_test(UninitializedModel(), x, test_with_inputs=[y], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + UninitializedModel(), + x, + test_with_inputs=[y], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_tensorList(self): class UninitializedTensorListModel(torch.nn.Module): def forward(self, x): @@ -6773,7 +8011,6 @@ def forward(self, x): # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_tensorList_dynamic(self): class UninitializedTensorListModel(torch.nn.Module): def forward(self, x): @@ -6785,12 +8022,15 @@ def forward(self, x): return list(x) x = torch.ones((3, 4), dtype=torch.double) - self.run_test(torch.jit.script(UninitializedTensorListModel()), x, input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + torch.jit.script(UninitializedTensorListModel()), + x, + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_intList(self): class UninitializedListModel(torch.nn.Module): def forward(self, x): @@ -6804,12 +8044,15 @@ def forward(self, x): return y x = torch.ones((3, 4), dtype=torch.int) - self.run_test(torch.jit.script(UninitializedListModel()), x, input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + torch.jit.script(UninitializedListModel()), + x, + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) # onnx::Identity of sequence supported for ONNX opset >= 14 @skipIfUnsupportedMinOpsetVersion(14) - @skipIfONNXShapeInference(False) def test_uninitialized_tensorList_shape(self): class UninitializedModel(torch.nn.Module): def forward(self, x): @@ -6824,9 +8067,13 @@ def forward(self, x): x = torch.ones((3, 4), dtype=torch.int) y = torch.ones((4, 6), dtype=torch.int) - self.run_test(torch.jit.script(UninitializedModel()), x, test_with_inputs=[y], - input_names=["input_1"], - dynamic_axes={"input_1": [0, 1]}) + self.run_test( + torch.jit.script(UninitializedModel()), + x, + test_with_inputs=[y], + input_names=["input_1"], + 
dynamic_axes={"input_1": [0, 1]}, + ) # Sequence type as loop-carried dependencies only supported for ONNX opset >= 13 @skipIfUnsupportedMinOpsetVersion(13) @@ -6863,9 +8110,17 @@ def test_replication_pad(self): def test_im2col(self): class Unfold(torch.nn.Module): def forward(self, input): - return torch.nn.functional.unfold(input, kernel_size=(10, 15), dilation=2, padding=5, stride=3), \ - torch.nn.functional.unfold(input, kernel_size=(2, 2), dilation=1, padding=0, stride=3), \ - torch.nn.functional.unfold(input, kernel_size=(1, 1), dilation=5, padding=2, stride=3) + return ( + torch.nn.functional.unfold( + input, kernel_size=(10, 15), dilation=2, padding=5, stride=3 + ), + torch.nn.functional.unfold( + input, kernel_size=(2, 2), dilation=1, padding=0, stride=3 + ), + torch.nn.functional.unfold( + input, kernel_size=(1, 1), dilation=5, padding=2, stride=3 + ), + ) x = torch.rand(1, 1, 200, 100) self.run_test(Unfold(), x) @@ -6880,6 +8135,142 @@ def forward(self, x): x = torch.randn(2, 3, 5, 5) self.run_test(Det(), x) + def test_linalg_norm(self): + class LinalgSingleDimModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgSingleDimModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord, dim=1) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgSingleDimModel(None), x) + self.run_test(LinalgSingleDimModel(2), x) + self.run_test(LinalgSingleDimModel(float("inf")), x) + self.run_test(LinalgSingleDimModel(-float("inf")), x) + self.run_test(LinalgSingleDimModel(-4), x) + self.run_test(LinalgSingleDimModel(1.5), x) + + class LinalgMultiDimModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgMultiDimModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord, dim=(0, 2)) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgMultiDimModel("fro"), x) + self.run_test(LinalgMultiDimModel(float("inf")), x) + self.run_test(LinalgMultiDimModel(-float("inf")), x) + self.run_test(LinalgMultiDimModel(1), x) + self.run_test(LinalgMultiDimModel(-1), x) + + class LinalgNoDimNoOrdModel(torch.nn.Module): + def forward(self, x): + return torch.linalg.norm(x) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgNoDimNoOrdModel(), x) + y = torch.randn(2, 3) + self.run_test(LinalgNoDimNoOrdModel(), y) + z = torch.randn(2) + self.run_test(LinalgNoDimNoOrdModel(), z) + + class LinalgNoDim1DModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgNoDim1DModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord) + + x = torch.randn(2) + self.run_test(LinalgNoDim1DModel(None), x) + self.run_test(LinalgNoDim1DModel(2), x) + self.run_test(LinalgNoDim1DModel(float("inf")), x) + self.run_test(LinalgNoDim1DModel(-float("inf")), x) + self.run_test(LinalgNoDim1DModel(-4), x) + self.run_test(LinalgNoDim1DModel(1.5), x) + + class LinalgNoDim2DModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgNoDim2DModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord) + + x = torch.randn(2, 3) + self.run_test(LinalgNoDim2DModel("fro"), x) + self.run_test(LinalgNoDim2DModel(float("inf")), x) + self.run_test(LinalgNoDim2DModel(-float("inf")), x) + self.run_test(LinalgNoDim2DModel(1), x) + self.run_test(LinalgNoDim2DModel(-1), x) + + @skipIfUnsupportedMinOpsetVersion(11) + def test_linalg_vector_norm_zero(self): + class 
LinalgVectorNormModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgVectorNormModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.vector_norm(x, ord=self.ord) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgVectorNormModel(0), x) + + def test_linalg_vector_norm(self): + class LinalgVectorNormModel(torch.nn.Module): + def __init__(self, ord_val, dim_info): + super(LinalgVectorNormModel, self).__init__() + self.ord = ord_val + self.dim, self.keepdim = dim_info + + def forward(self, x): + return torch.linalg.vector_norm( + x, ord=self.ord, dim=self.dim, keepdim=self.keepdim + ) + + x = torch.randn(2, 3, 5, 5) + ord_options = [2, float("inf"), -float("inf"), -4, 1.5] + dim_options = [(None, False), (1, False), ((1, 2), False), ((1, 2), True)] + for ord_val in ord_options: + for dim_info in dim_options: + self.run_test(LinalgVectorNormModel(ord_val, dim_info), x) + + def test_linalg_matrix_norm(self): + class LinalgMatrixNormModel(torch.nn.Module): + def __init__(self, ord_val, dim_val=(-2, -1), keepdim_val=False): + super(LinalgMatrixNormModel, self).__init__() + self.ord = ord_val + self.dim = dim_val + self.keepdim = keepdim_val + + def forward(self, x): + return torch.linalg.matrix_norm( + x, ord=self.ord, dim=self.dim, keepdim=self.keepdim + ) + + x = torch.randn(2, 3, 5, 5) + ord_options = ["fro", float("inf"), -float("inf"), 1, -1] + for ord_val in ord_options: + self.run_test(LinalgMatrixNormModel(ord_val), x) + self.run_test(LinalgMatrixNormModel(ord_val, (0, 2)), x) + self.run_test(LinalgMatrixNormModel(ord_val, (0, 2), True), x) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_linalg_cross(self): + class Cross(torch.nn.Module): + def forward(self, x, y): + return torch.linalg.cross(x, y, dim=1), torch.linalg.cross(x, y) + + x = torch.randn(5, 3, 2, 3) + y = torch.randn(1, 3, 1, 3) + self.run_test(Cross(), input=(x, y)) + # This test checks output scalar type in the ONNX graph should not be null # https://github.com/pytorch/pytorch/issues/28607 @skipIfUnsupportedMinOpsetVersion(10) @@ -6931,7 +8322,7 @@ def forward(self, poses): return batch_boxes dummy_inputs = torch.rand(2, 2, 3) - self.run_test(M(), (dummy_inputs, ), input_names=['x'], dynamic_axes={"x": [0]}) + self.run_test(M(), (dummy_inputs,), input_names=["x"], dynamic_axes={"x": [0]}) @skipIfUnsupportedMinOpsetVersion(12) def test_outer(self): @@ -6991,6 +8382,38 @@ def forward(self, x): for x in [torch.randn(3, 4), torch.randn(3, 4).to(dtype=torch.bool)]: self.run_test(EinsumModelTranspose(), input=(x,)) + @skipIfUnsupportedMinOpsetVersion(9) + def test_cosine_similarity(self): + x = torch.randn(5, 3, 2) + y = torch.randn(5, 3, 2) + self.run_test(torch.nn.CosineSimilarity(dim=2), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_pairwise_distance(self): + x = torch.randn(5, 3, 2) + y = torch.randn(5, 3, 2) + self.run_test(torch.nn.PairwiseDistance(p=2.0), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_cross(self): + class Cross(torch.nn.Module): + def forward(self, x, y): + return torch.cross(x, y, dim=3), torch.cross(x, y) + + x = torch.randn(5, 3, 2, 3) + y = torch.randn(5, 3, 2, 3) + self.run_test(Cross(), input=(x, y)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_cdist(self): + class Cdist(torch.nn.Module): + def forward(self, x, y): + return torch.cdist(x, y) + + x = torch.randn(5, 3, 3) + y = torch.randn(5, 2, 3) + self.run_test(Cdist(), input=(x, y)) + @skipIfUnsupportedMinOpsetVersion(12) def 
test_crossentropyloss(self): for ignore_index in [-100, 1]: @@ -7017,7 +8440,9 @@ def __init__(self, ignore_index): if ignore_index == -100: self.loss = torch.nn.CrossEntropyLoss(reduction="none") else: - self.loss = torch.nn.CrossEntropyLoss(reduction="none", ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="none", ignore_index=ignore_index + ) def forward(self, input, target): return self.loss(input, target) @@ -7028,9 +8453,15 @@ class CrossEntropyLossNoneWeight(torch.nn.Module): def __init__(self, ignore_index): super(CrossEntropyLossNoneWeight, self).__init__() if ignore_index == -100: - self.loss = torch.nn.CrossEntropyLoss(reduction="none", weight=torch.randn(5)) + self.loss = torch.nn.CrossEntropyLoss( + reduction="none", weight=torch.randn(5) + ) else: - self.loss = torch.nn.CrossEntropyLoss(reduction="none", weight=torch.randn(5), ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="none", + weight=torch.randn(5), + ignore_index=ignore_index, + ) def forward(self, input, target): return self.loss(input, target) @@ -7043,7 +8474,9 @@ def __init__(self, ignore_index): if ignore_index == -100: self.loss = torch.nn.CrossEntropyLoss(reduction="sum") else: - self.loss = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="sum", ignore_index=ignore_index + ) def forward(self, input, target): return self.loss(input, target) @@ -7054,9 +8487,15 @@ class CrossEntropyLossSumWeight(torch.nn.Module): def __init__(self, ignore_index): super(CrossEntropyLossSumWeight, self).__init__() if ignore_index == -100: - self.loss = torch.nn.CrossEntropyLoss(reduction="sum", weight=torch.randn(5)) + self.loss = torch.nn.CrossEntropyLoss( + reduction="sum", weight=torch.randn(5) + ) else: - self.loss = torch.nn.CrossEntropyLoss(reduction="sum", weight=torch.randn(5), ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + reduction="sum", + weight=torch.randn(5), + ignore_index=ignore_index, + ) def forward(self, input, target): return self.loss(input, target) @@ -7082,7 +8521,9 @@ def __init__(self, ignore_index): if ignore_index == -100: self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5)) else: - self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5), ignore_index=ignore_index) + self.loss = torch.nn.CrossEntropyLoss( + weight=torch.randn(5), ignore_index=ignore_index + ) def forward(self, input, target): return self.loss(input, target) @@ -7148,7 +8589,9 @@ def forward(self, input, target): class KLDivLossMiniBatchMean(torch.nn.Module): def __init__(self): super(KLDivLossMiniBatchMean, self).__init__() - self.loss = torch.nn.KLDivLoss(reduction="batchmean", size_average=False, log_target=True) + self.loss = torch.nn.KLDivLoss( + reduction="batchmean", size_average=False, log_target=True + ) def forward(self, input, target): return self.loss(input, target) @@ -7284,11 +8727,17 @@ def test_nllloss_dynamic_ignore_index(self): def linear_combination(x, y, epsilon): return epsilon * x + (1 - epsilon) * y - def reduce_loss(loss, reduction='mean'): - return loss.mean() if reduction == 'mean' else loss.sum() if reduction == 'sum' else loss + def reduce_loss(loss, reduction="mean"): + return ( + loss.mean() + if reduction == "mean" + else loss.sum() + if reduction == "sum" + else loss + ) class LabelSmoothingCrossEntropy(torch.nn.Module): - def __init__(self, epsilon: float = 0.1, reduction='mean'): + def __init__(self, epsilon: float = 0.1, 
reduction="mean"): super().__init__() self.epsilon = epsilon self.reduction = reduction @@ -7297,7 +8746,12 @@ def forward(self, preds, target, start_position): n = preds.size()[-1] log_preds = F.log_softmax(preds, dim=-1) ignore_index = start_position.size(1) - nll = F.nll_loss(log_preds, target, reduction=self.reduction, ignore_index=ignore_index) + nll = F.nll_loss( + log_preds, + target, + reduction=self.reduction, + ignore_index=ignore_index, + ) return nll + start_position.float() N = 5 @@ -7311,7 +8765,9 @@ def test_nllloss_2d_mean_ignore_index_weights(self): class NLLModel(torch.nn.Module): def __init__(self): super(NLLModel, self).__init__() - self.loss = torch.nn.NLLLoss(reduction="mean", weight=torch.randn(C), ignore_index=1) + self.loss = torch.nn.NLLLoss( + reduction="mean", weight=torch.randn(C), ignore_index=1 + ) self.conv = torch.nn.Conv2d(16, C, (3, 3)) self.m = torch.nn.LogSoftmax(dim=1) @@ -7349,80 +8805,116 @@ def test_binary_cross_entropy_with_logits(self): def _bce_logits(self, x, y): class BCEWithLogitsLossNone(torch.nn.Module): def forward(self, input, target): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, reduction="none" + ) self.run_test(BCEWithLogitsLossNone(), input=(x, y)) class BCEWithLogitsLossMean(torch.nn.Module): def forward(self, input, target): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, reduction="mean" + ) self.run_test(BCEWithLogitsLossMean(), input=(x, y)) class BCEWithLogitsLossSum(torch.nn.Module): def forward(self, input, target): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, reduction="sum") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, reduction="sum" + ) self.run_test(BCEWithLogitsLossSum(), input=(x, y)) def _bce_logits_wegiht(self, x, y, weight): class BCEWithLogitsLossWegihtNone(torch.nn.Module): def forward(self, input, target, weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, reduction="none" + ) + self.run_test(BCEWithLogitsLossWegihtNone(), input=(x, y, weight)) class BCEWithLogitsLossWegihtMean(torch.nn.Module): def forward(self, input, target, weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, reduction="mean" + ) self.run_test(BCEWithLogitsLossWegihtMean(), input=(x, y, weight)) class BCEWithLogitsLossWegihtSum(torch.nn.Module): def forward(self, input, target, weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, reduction="sum") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, reduction="sum" + ) self.run_test(BCEWithLogitsLossWegihtSum(), input=(x, y, weight)) def _bce_logits_posweight(self, x, y, pos_weight): class BCEWithLogitsLossPosWegihtNone(torch.nn.Module): def forward(self, input, target, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, pos_weight=pos_weight, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, 
target, pos_weight=pos_weight, reduction="none" + ) + self.run_test(BCEWithLogitsLossPosWegihtNone(), input=(x, y, pos_weight)) class BCEWithLogitsLossPosWegihtMean(torch.nn.Module): def forward(self, input, target, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, pos_weight=pos_weight, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, pos_weight=pos_weight, reduction="mean" + ) self.run_test(BCEWithLogitsLossPosWegihtMean(), input=(x, y, pos_weight)) class BCEWithLogitsLossPosWegihtSum(torch.nn.Module): def forward(self, input, target, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, pos_weight=pos_weight, reduction="sum") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, pos_weight=pos_weight, reduction="sum" + ) self.run_test(BCEWithLogitsLossPosWegihtSum(), input=(x, y, pos_weight)) def _bce_logits_loss_weight_posweight(self, x, y, weight, pos_weight): class BCEWithLogitsLossWeightPosweightNone(torch.nn.Module): def forward(self, input, target, weight, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, - pos_weight=pos_weight, reduction="none") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, + target, + weight=weight, + pos_weight=pos_weight, + reduction="none", + ) - self.run_test(BCEWithLogitsLossWeightPosweightNone(), input=(x, y, weight, pos_weight)) + self.run_test( + BCEWithLogitsLossWeightPosweightNone(), input=(x, y, weight, pos_weight) + ) class BCEWithLogitsLossWeightPosweightMean(torch.nn.Module): def forward(self, input, target, weight, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, - pos_weight=pos_weight, reduction="mean") + return torch.nn.functional.binary_cross_entropy_with_logits( + input, + target, + weight=weight, + pos_weight=pos_weight, + reduction="mean", + ) - self.run_test(BCEWithLogitsLossWeightPosweightMean(), input=(x, y, weight, pos_weight)) + self.run_test( + BCEWithLogitsLossWeightPosweightMean(), input=(x, y, weight, pos_weight) + ) class BCEWithLogitsLossWeightPosweightSum(torch.nn.Module): def forward(self, input, target, weight, pos_weight): - return torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=weight, - pos_weight=pos_weight, reduction="sum") - - self.run_test(BCEWithLogitsLossWeightPosweightSum(), input=(x, y, weight, pos_weight)) + return torch.nn.functional.binary_cross_entropy_with_logits( + input, target, weight=weight, pos_weight=pos_weight, reduction="sum" + ) + self.run_test( + BCEWithLogitsLossWeightPosweightSum(), input=(x, y, weight, pos_weight) + ) def test_torch_mm(self): class M(torch.nn.Module): @@ -7434,7 +8926,9 @@ def forward(self, mat1, mat2): mat2 = torch.randn(3, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because where op is not supported for opset < 9. def test_where_with_bool_tensor(self): class M(torch.nn.Module): def forward(self, mat1, mat2): @@ -7445,7 +8939,9 @@ def forward(self, mat1, mat2): mat2 = torch.ones(2, 3) self.run_test(M(), input=(mat1, mat2)) - @skipIfUnsupportedMinOpsetVersion(9) # Because where op is not supported for opset < 9. + @skipIfUnsupportedMinOpsetVersion( + 9 + ) # Because where op is not supported for opset < 9. 
def test_where_with_byte_tensor(self): class M(torch.nn.Module): def forward(self, cond, mat1, mat2): @@ -7465,7 +8961,7 @@ def forward(self, x): return x.isinf() x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), float("inf")]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) @skipIfUnsupportedMinOpsetVersion(10) def test_isfinite(self): @@ -7474,7 +8970,7 @@ def forward(self, x): return x.isfinite() x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), -float("inf")]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) # ONNX IsNaN op is added in opset 9. def test_isnan(self): @@ -7483,7 +8979,61 @@ def forward(self, x): return x.isnan() x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), float("inf")]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) + + @skipIfUnsupportedMinOpsetVersion( + 10 + ) # ONNX IsNaN, IsInf op is added in opset 9, 10 respectively. + def test_nan_to_num(self): + class NoParams(torch.nn.Module): + def forward(self, x): + return x.nan_to_num() + + x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), -float("inf")]]) + xint = torch.ones((2, 4), dtype=torch.int) + xhalf = torch.ones((2, 4), dtype=torch.half) + self.run_test(NoParams(), (x,)) + self.run_test(NoParams(), (xint,)) + self.run_test(NoParams(), (xhalf,)) + + class WithParams(torch.nn.Module): + def forward(self, x): + return x.nan_to_num(nan=2.3, posinf=4.5, neginf=6.7) + + x = torch.tensor([[1, 2, float("inf")], [2, float("nan"), -float("inf")]]) + self.run_test(WithParams(), (x,)) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_maximum_minimum(self): + class ModelWithNan(torch.nn.Module): + def forward(self, x, y): + return torch.maximum(x, y), torch.minimum(x, y) + + x = torch.tensor([-2, -2, float("nan")]) + y = torch.rand(1, 3) + self.run_test(ModelWithNan(), (x, y)) + + @skipIfUnsupportedMinOpsetVersion(12) + def test_minimum_dtypes(self): + class MinimumModel(torch.nn.Module): + def forward(self, x, y): + return torch.minimum(x, y) + + x = torch.randn((5, 5), dtype=torch.float16) + y = torch.randn((5, 5), dtype=torch.float) + self.run_test(MinimumModel(), (x, y)) + + x = torch.randn((5, 5), dtype=torch.float16) + y = torch.randint(10, (5, 5), dtype=torch.int16) + self.run_test(MinimumModel(), (x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int16) + y = torch.randint(10, (5, 5), dtype=torch.int32) + self.run_test(MinimumModel(), (x, y)) + + x = torch.randint(10, (5, 5), dtype=torch.int) + y = torch.full_like(x, True) + self.run_test(MinimumModel(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) def test_any(self): @@ -7492,21 +9042,21 @@ def forward(self, x): return x.any() x = torch.tensor([[True, False], [False, False]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) class MDim(torch.nn.Module): def forward(self, x): return x.any(dim=1) x = torch.rand(3, 4).bool() - self.run_test(MDim(), (x, )) + self.run_test(MDim(), (x,)) class MKeepdim(torch.nn.Module): def forward(self, x): return x.any(dim=1, keepdim=True) x = torch.rand(3, 4).bool() - self.run_test(MKeepdim(), (x, )) + self.run_test(MKeepdim(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) def test_all(self): @@ -7515,21 +9065,21 @@ def forward(self, x): return x.all() x = torch.tensor([[True, False], [False, False]]) - self.run_test(M(), (x, )) + self.run_test(M(), (x,)) class MDim(torch.nn.Module): def forward(self, x): return x.all(dim=1) x = torch.rand(3, 4).bool() - self.run_test(MDim(), (x, )) + self.run_test(MDim(), (x,)) class 
MKeepdim(torch.nn.Module): def forward(self, x): return x.all(dim=1, keepdim=True) x = torch.rand(3, 4).bool() - self.run_test(MKeepdim(), (x, )) + self.run_test(MKeepdim(), (x,)) def test_dropout(self): class M(torch.nn.Module): @@ -7588,7 +9138,7 @@ def test_celu_alpha(self): class Celu(torch.nn.Module): def __init__(self): super(Celu, self).__init__() - self.celu = torch.nn.CELU(alpha=2.) + self.celu = torch.nn.CELU(alpha=2.0) def forward(self, input): return self.celu(input) @@ -7611,8 +9161,7 @@ def forward(self, input): def test_lower_tuple(self): class TupleModule(torch.nn.Module): - def forward(self, input1, input2, input3): - # type: (Tensor, Tensor, Tensor) -> Tensor + def forward(self, input1: Tensor, input2: Tensor, input3: Tensor) -> Tensor: a = (input1, input2) b = a c = (input1, input2, input3) @@ -7640,8 +9189,7 @@ def forward(self, input1, input2, input3): def test_lower_tuple_2(self): class TupleModule(torch.nn.Module): - def forward(self, input1, input2): - # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor] + def forward(self, input1: Tensor, input2: Tensor) -> Tuple[Tensor, Tensor]: a = (input1, input2) for x in range(5): c, d = a @@ -7654,8 +9202,11 @@ def forward(self, input1, input2): def test_lower_tuple_3(self): class TupleModule(torch.nn.Module): - def forward(self, input1, input2): - # type: (Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]) + def forward( + self, + input1: Tuple[Tensor, Tensor], + input2: Tuple[Tensor, Tensor], + ) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]: a = input1 b = input2 for x in range(5): @@ -7667,7 +9218,7 @@ def forward(self, input1, input2): f = f + d a = (e, f) b = (c, d) - return a , b + return a, b input1 = (torch.randn(2), torch.randn(2)) input2 = (torch.randn(2), torch.randn(2)) @@ -7685,7 +9236,7 @@ def forward(self, cond, input, other): self.run_test(Model(), (x, y, z)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # scripting tests run for opsets > 11. See: test_where_condition_script + @skipScriptTest() # scripting tests run for opsets > 11. See: test_where_condition_script def test_where_condition(self): class Model1(torch.nn.Module): def forward(self, input): @@ -7740,7 +9291,7 @@ def forward(self, input): @skipIfUnsupportedMinOpsetVersion(11) def test_derive_index_scripting(self): class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -2): y = x[idx] @@ -7751,7 +9302,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 2): y = x[idx] @@ -7762,7 +9313,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -3): y = x[idx] @@ -7772,7 +9323,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 3): y = x[idx] @@ -7781,10 +9332,10 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) - @disableScriptTest() # Scripting fails for add lists for opsets < 11. Chek test_derive_index_scripting + @skipScriptTest() # Scripting fails for add lists for opsets < 11. 
Chek test_derive_index_scripting def test_derive_index(self): class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -2): y = x[idx] @@ -7795,7 +9346,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 2): y = x[idx] @@ -7806,7 +9357,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(len(x) - 1, -len(x), -3): y = x[idx] @@ -7816,7 +9367,7 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) class MyModule(torch.nn.Module): - def forward(self, x: torch.Tensor): + def forward(self, x: Tensor): j = [] for idx in range(-len(x), len(x) - 1, 3): y = x[idx] @@ -7825,7 +9376,6 @@ def forward(self, x: torch.Tensor): self.run_test(MyModule(), x) - @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(11) def test_if_transpose(self): class IfModel(torch.nn.Module): @@ -7837,11 +9387,13 @@ def forward(self, x): return x x = torch.randn(2, 3) - self.run_test(torch.jit.script(IfModel()), x, - output_names=["output_1"], - dynamic_axes={"output_1": [0, 1]}) + self.run_test( + torch.jit.script(IfModel()), + x, + output_names=["output_1"], + dynamic_axes={"output_1": [0, 1]}, + ) - @skipIfONNXShapeInference(False) @skipIfUnsupportedMinOpsetVersion(13) def test_if_list(self): class IfModel(torch.nn.Module): @@ -7872,9 +9424,12 @@ def forward(self, x, y, cond): x = torch.randn(2, 16, 2, 2) y = torch.randn(2, 16, 8) cond = torch.tensor(1, dtype=torch.bool) - self.run_test(torch.jit.script(IfModel()), (x, y, cond), - output_names=["output_1"], - dynamic_axes={"output_1": [1]}) + self.run_test( + torch.jit.script(IfModel()), + (x, y, cond), + output_names=["output_1"], + dynamic_axes={"output_1": [1]}, + ) def test_onnx_proto_checker(self): class Model(torch.nn.Module): @@ -7895,16 +9450,7 @@ def check_proto(): self.assertRaises(RuntimeError, check_proto) - @skipIfUnsupportedMinOpsetVersion(11) - def test_split_tensor_scalar_scripting(self): - class SplitModel(torch.nn.Module): - def forward(self, x): - return torch.split(x, x.size(1)) - - x = torch.randn(1, 2, 3, requires_grad=True) - self.run_test(SplitModel(), x) - - @disableScriptTest() # Scripting fails to export dynamic split for opsets < 11 + @skipScriptTest(min_opset_version=11) # dynamic split support addded in 11 def test_split_tensor_scalar(self): class SplitModel(torch.nn.Module): def forward(self, x): @@ -7941,7 +9487,9 @@ def forward(self, input, emb): x[2] = 1 x[0][1] = 1 self.run_test(model, (x, embedding_matrix)) - self.run_test(model, (x, embedding_matrix), training=torch.onnx.TrainingMode.TRAINING) + self.run_test( + model, (x, embedding_matrix), training=torch.onnx.TrainingMode.TRAINING + ) class EmbedModelWithoutPaddingIdx(torch.nn.Module): def forward(self, input, emb): @@ -7986,6 +9534,17 @@ def forward(self, input): x = torch.randint(4, (4, 3, 2)) self.run_test(model, (x,)) + @skipIfUnsupportedMinOpsetVersion(11) + def test_embedding_renorm(self): + n, d = 7, 5 + embedding = torch.nn.Embedding(n, d, max_norm=0.2) + idx = torch.tensor([2, 1]) + self.run_test(embedding, idx) + + embedding = torch.nn.Embedding(n, d, max_norm=0.5, norm_type=1.0) + idx = torch.tensor([4, 3, 4, 2]) + self.run_test(embedding, idx) + def 
_dispatch_rnn_test(self, name, *args, **kwargs): if name == "elman": self._elman_rnn_test(*args, **kwargs) @@ -7994,16 +9553,29 @@ def _dispatch_rnn_test(self, name, *args, **kwargs): if name == "gru": self._gru_test(*args, **kwargs) - def _elman_rnn_test(self, layers, nonlinearity, bidirectional, - initial_state, packed_sequence, dropout): - + def _elman_rnn_test( + self, + layers, + nonlinearity, + bidirectional, + initial_state, + packed_sequence, + dropout, + ): class ElmanWithStateModel(torch.nn.Module): def __init__(self, layers, nonlinearity, bidirect, dropout, batch_first): super(ElmanWithStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, nonlinearity=nonlinearity, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + self.inner_model = torch.nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + nonlinearity=nonlinearity, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence, hx=None): return self.inner_model(input, hx) @@ -8012,8 +9584,15 @@ class ElmanWithoutStateModel(torch.nn.Module): def __init__(self, layers, nonlinearity, bidirect, dropout, batch_first): super(ElmanWithoutStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, nonlinearity=nonlinearity, - bidirectional=bidirectional, dropout=dropout, batch_first=batch_first) + self.inner_model = torch.nn.RNN( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + nonlinearity=nonlinearity, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence): return self.inner_model(input) @@ -8021,14 +9600,23 @@ def forward(self, input: PackedSequence): batch_first = packed_sequence == 2 if initial_state: - model = ElmanWithStateModel(layers=layers, bidirect=bidirectional, nonlinearity=nonlinearity, - dropout=dropout, batch_first=batch_first) + model = ElmanWithStateModel( + layers=layers, + bidirect=bidirectional, + nonlinearity=nonlinearity, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence: model = RnnModelWithPackedSequenceWithState(model, batch_first) else: - model = ElmanWithStateModel(layers=layers, bidirect=bidirectional, - nonlinearity=nonlinearity, dropout=dropout, - batch_first=batch_first) + model = ElmanWithStateModel( + layers=layers, + bidirect=bidirectional, + nonlinearity=nonlinearity, + dropout=dropout, + batch_first=batch_first, + ) if packed_sequence: model = RnnModelWithPackedSequenceWithoutState(model, batch_first) @@ -8059,20 +9647,33 @@ def make_input(batch_size): other_input = make_input(RNN_BATCH_SIZE + 1) self.run_test(model, other_input, batch_size=RNN_BATCH_SIZE + 1) - def _lstm_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): + def _lstm_test( + self, layers, bidirectional, initial_state, packed_sequence, dropout + ): batch_first = packed_sequence == 2 if packed_sequence: - model = LstmFlatteningResultWithSeqLength(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers, - bidirectional, dropout, batch_first) + model = LstmFlatteningResultWithSeqLength( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional, + dropout, + batch_first, + ) if initial_state: model = RnnModelWithPackedSequenceWithState(model, batch_first) else: model = RnnModelWithPackedSequenceWithoutState(model, batch_first) else: - model = LstmFlatteningResultWithoutSeqLength(RNN_INPUT_SIZE, 
RNN_HIDDEN_SIZE, layers, - bidirectional, dropout, batch_first) + model = LstmFlatteningResultWithoutSeqLength( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + layers, + bidirectional, + dropout, + batch_first, + ) def make_input(batch_size): seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size) @@ -8102,17 +9703,20 @@ def make_input(batch_size): other_input = make_input(RNN_BATCH_SIZE + 1) self.run_test(model, other_input, batch_size=RNN_BATCH_SIZE + 1) - def _gru_test(self, layers, bidirectional, initial_state, - packed_sequence, dropout): - + def _gru_test(self, layers, bidirectional, initial_state, packed_sequence, dropout): class GRUWithStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUWithStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence, hx): return self.inner_model(input, hx) @@ -8121,9 +9725,14 @@ class GRUWithoutStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUWithoutStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input: PackedSequence): return self.inner_model(input) @@ -8132,9 +9741,14 @@ class GRUNoSeqLengthWithoutStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUNoSeqLengthWithoutStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input): return self.inner_model(input) @@ -8143,9 +9757,14 @@ class GRUNoSeqLengthWithStateModel(torch.nn.Module): def __init__(self, layers, bidirect, dropout, batch_first): super(GRUNoSeqLengthWithStateModel, self).__init__() self.batch_first = batch_first - self.inner_model = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, num_layers=layers, - bidirectional=bidirectional, dropout=dropout, - batch_first=batch_first) + self.inner_model = torch.nn.GRU( + RNN_INPUT_SIZE, + RNN_HIDDEN_SIZE, + num_layers=layers, + bidirectional=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def forward(self, input, hx): return self.inner_model(input, hx) @@ -8154,20 +9773,36 @@ def forward(self, input, hx): if packed_sequence: if initial_state: - model = GRUWithStateModel(layers=layers, bidirect=bidirectional, dropout=dropout, - batch_first=batch_first) + model = GRUWithStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) model = RnnModelWithPackedSequenceWithState(model, batch_first) else: - model = GRUWithoutStateModel(layers=layers, 
bidirect=bidirectional, dropout=dropout, - batch_first=batch_first) + model = GRUWithoutStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) model = RnnModelWithPackedSequenceWithoutState(model, batch_first) else: if initial_state: - model = GRUNoSeqLengthWithStateModel(layers=layers, bidirect=bidirectional, - dropout=dropout, batch_first=batch_first) + model = GRUNoSeqLengthWithStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) else: - model = GRUNoSeqLengthWithoutStateModel(layers=layers, bidirect=bidirectional, - dropout=dropout, batch_first=batch_first) + model = GRUNoSeqLengthWithoutStateModel( + layers=layers, + bidirect=bidirectional, + dropout=dropout, + batch_first=batch_first, + ) def make_input(batch_size): seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size) @@ -8196,9 +9831,9 @@ def make_input(batch_size): other_input = make_input(RNN_BATCH_SIZE + 1) self.run_test(model, other_input, batch_size=RNN_BATCH_SIZE + 1) - @disableScriptTest() # TODO: RuntimeError: Exporting the operator __is_ to ONNX is not supported + @skipScriptTest() # TODO: https://msdata.visualstudio.com/Vienna/_workitems/edit/1253950 def test_transformer_encoder(self): - from torch.nn import TransformerEncoderLayer, TransformerEncoder + from torch.nn import TransformerEncoder, TransformerEncoderLayer class MyModule(torch.nn.Module): def __init__(self, ninp, nhead, nhid, dropout, nlayers): @@ -8210,36 +9845,84 @@ def forward(self, input): return self.transformer_encoder(input) x = torch.rand(10, 32, 512) - self.run_test(MyModule(512, 8, 2048 , 0., 3), (x,), atol=1e-6) + self.run_test(MyModule(512, 8, 2048, 0.0, 3), (x,), atol=1e-6) @skipIfUnsupportedMinOpsetVersion(10) def test_fake_quantize_per_tensor(self): class FakeQuantizePerTensorModel(torch.nn.Module): def forward(self, input): - scale = 1. / 127 + scale = 1.0 / 127 zero_point = 0 quant_min = -128 quant_max = 127 - return torch.fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) + return torch.fake_quantize_per_tensor_affine( + input, scale, zero_point, quant_min, quant_max + ) x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerTensorModel(), (x)) + @skipIfUnsupportedMinOpsetVersion(13) + def test_fake_quantize_per_tensor_dynamic_scale_zeropoint(self): + class FakeQuantizePerTensorModel(torch.nn.Module): + def forward(self, input, scale, zero_point): + quant_min = -128 + quant_max = 127 + return torch.fake_quantize_per_tensor_affine( + input, scale, zero_point, quant_min, quant_max + ) + + x = torch.randn(6, 4, 3, 3) + scale = torch.tensor(1.0 / 127) + zero_point = torch.tensor(0) + self.run_test(FakeQuantizePerTensorModel(), (x, scale, zero_point)) + @skipIfUnsupportedMinOpsetVersion(13) def test_fake_quantize_per_channel(self): class FakeQuantizePerChannelModel(torch.nn.Module): def forward(self, input): amax = torch.ones(4) - scale = amax / 127. 
+ scale = amax / 127.0 zero_point = torch.zeros_like(amax, dtype=torch.int) # Quantize twice to test differnet branches - y = torch.fake_quantize_per_channel_affine(input, scale, zero_point, 1, 0, 255) - return torch.fake_quantize_per_channel_affine(y, scale, zero_point, 1, -128, 127) + y = torch.fake_quantize_per_channel_affine( + input, scale, zero_point, 1, 0, 255 + ) + return torch.fake_quantize_per_channel_affine( + y, scale, zero_point, 1, -128, 127 + ) x = torch.randn(6, 4, 3, 3) self.run_test(FakeQuantizePerChannelModel(), (x)) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 + @skipIfUnsupportedMinOpsetVersion(13) + @skipScriptTest() # RuntimeError: Can't redefine method: forward on class: __torch__.torch.nn.modules.linear.Linear + def test_fake_quantize_activation(self): + from torch import quantization + + m = torch.nn.Linear(1, 1) + m.qconfig = quantization.QConfig( + activation=quantization.default_fake_quant, + weight=quantization.default_per_channel_weight_fake_quant, + ) + quantization.prepare_qat(m.train(), inplace=True) + m.apply(quantization.enable_observer) + m.apply(quantization.enable_fake_quant) + for module in m.modules(): + if isinstance(module, quantization.FakeQuantize): + module.calculate_qparams() + + m.apply(quantization.disable_observer) + m.eval() + + # Fake quantize activation is a special case, as it restricts quantized range to be (0, 127), + # while standard 8bit quantization range is (-128, 127) or (0, 255). + # Set fixed weight, bias and inputs to test if ONNX handles the overflow correctly. + m.weight = torch.nn.Parameter(torch.tensor([[1.0], [1.0], [1.0]])) + m.bias = torch.nn.Parameter(torch.tensor([0.0])) + x = torch.tensor([[150.0], [127.0], [-5.0]]) + self.run_test(m, x) + def test_batchnorm_training(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8260,11 +9943,22 @@ def forward(self, x): x = torch.randn(10, 3, 20, 20) * 2 model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x, ), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_batchnorm_training_mode_fix_layer(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8286,9 +9980,21 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) def test_batchnorm_eval_mode_train_layer(self): class MyModule(torch.nn.Module): @@ -8311,11 +10017,22 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + 
training=torch.onnx.TrainingMode.EVAL, + rtol=1e-3, + atol=1e-5, + ) model_export.eval() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm_training(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8336,11 +10053,22 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x, ), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm_training_mode_fix_layer(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8362,11 +10090,22 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) model_export.train() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_instancenorm_eval_mode_train_layer(self): class MyModule(torch.nn.Module): def __init__(self): @@ -8388,9 +10127,21 @@ def forward(self, x): x = torch.randn(10, 8, 128, 128) model_export = MyModule() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.EVAL, + rtol=1e-3, + atol=1e-5, + ) model_export.eval() - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.PRESERVE, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.PRESERVE, + rtol=1e-3, + atol=1e-5, + ) @skipIfUnsupportedMinOpsetVersion(12) def test_dropout_training(self): @@ -8407,16 +10158,24 @@ def forward(self, x): x = torch.randn(10) model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) script_model = torch.jit.script(model) output = model(x) - ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + script_model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) assert not 
torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) @skipIfUnsupportedMinOpsetVersion(12) @@ -8440,9 +10199,13 @@ def forward(self, x): nb_elements = torch.numel(input) model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) y = model(input) output = y.cpu().numpy() @@ -8457,9 +10220,13 @@ def forward(self, x): script_model = torch.jit.script(model) y = model(input) output = y.cpu().numpy() - ort_sess = convert_to_onnx(script_model, input=(x,), opset_version=self.opset_version, - training=torch.onnx.TrainingMode.TRAINING) - ort_outs = run_ort(ort_sess, input=(x,)) + ort_sess = convert_to_onnx( + script_model, + input=(x,), + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) + ort_outs = run_ort(ort_sess, (x,)) ort_mask = np.where(ort_outs[0] != 0, 1, 0) pyt_mask = np.where(output != 0, 1, 0) @@ -8468,12 +10235,13 @@ def forward(self, x): np.testing.assert_allclose(ratio_pytorch, ratio_ort, rtol=0.01, atol=0.01) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): super(MyModule, self).__init__() - self.conv = torch.nn.Conv2d(3, 16, kernel_size=1, stride=2, padding=3, bias=True) + self.conv = torch.nn.Conv2d( + 3, 16, kernel_size=1, stride=2, padding=3, bias=True + ) self.bn = torch.nn.BatchNorm2d(16, affine=True) def forward(self, x): @@ -8484,16 +10252,27 @@ def forward(self, x): model_export = MyModule() x = torch.randn(10, 3, 128, 128) self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL) - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_multiple_conv_bn(self): class MyModule(torch.nn.Module): def __init__(self): super(MyModule, self).__init__() - self.conv1 = torch.nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) - self.conv2 = torch.nn.Conv2d(64, 2, kernel_size=1, stride=1, padding=0, bias=False) - self.conv3 = torch.nn.Conv2d(2, 2, kernel_size=3, stride=1, padding=1, bias=False) + self.conv1 = torch.nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False + ) + self.conv2 = torch.nn.Conv2d( + 64, 2, kernel_size=1, stride=1, padding=0, bias=False + ) + self.conv3 = torch.nn.Conv2d( + 2, 2, kernel_size=3, stride=1, padding=1, bias=False + ) self.bn = torch.nn.BatchNorm2d(64) self.bn2 = torch.nn.BatchNorm2d(2) self.relu = torch.nn.ReLU(inplace=True) @@ -8514,19 +10293,26 @@ def forward(self, x): model_export = MyModule() x = torch.randn(2, 3, 224, 224) - self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.TRAINING, rtol=1e-3, atol=1e-5) + self.run_test( + model_export, + (x,), + training=torch.onnx.TrainingMode.TRAINING, + rtol=1e-3, + atol=1e-5, + ) self.run_test(model_export, (x,), training=torch.onnx.TrainingMode.EVAL) def test_script_custom_class_error(self): class BoxCoder(object): - def __init__(self, bbox_xform_clip: float): - # type: (float) -> None + def __init__(self, bbox_xform_clip: float) -> None: 
self.bbox_xform_clip = bbox_xform_clip - def decode(self, rel_codes, boxes): - # type: (Tensor, List[Tensor]) -> Tensor + def decode(self, rel_codes: Tensor, boxes: List[Tensor]) -> Tensor: boxes = torch.cat(boxes, dim=0) - pred_ctr_x = torch.clamp(rel_codes[:, 0::4], max=self.bbox_xform_clip) * boxes[:, 2] + pred_ctr_x = ( + torch.clamp(rel_codes[:, 0::4], max=self.bbox_xform_clip) + * boxes[:, 2] + ) return pred_ctr_x class MyModule(torch.nn.Module): @@ -8538,7 +10324,7 @@ def __init__(self): super(MyModule, self).__init__() self.box_coder = BoxCoder(1.4) - def forward(self, box_regression: torch.Tensor, proposals: List[torch.Tensor]): + def forward(self, box_regression: Tensor, proposals: List[Tensor]): return self.box_coder.decode(box_regression, proposals) model = torch.jit.script(MyModule()) @@ -8572,20 +10358,28 @@ def forward(self, x): loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] - assert actual_list == state_dict_list, \ - "Initializers' sequence is not as same as state_dict(). Expected: (" \ - + ", ".join(state_dict_list) + "). Actual:(" + ", ".join(actual_list) + ")." - assert actual_list == named_params_list, \ - "Initializers' sequence is not as same as named_parameters(). Expected: (" \ - + ", ".join(named_params_list) + "). Actual:(" + ", ".join(actual_list) + ")." + assert actual_list == state_dict_list, ( + "Initializers' sequence is not as same as state_dict(). Expected: (" + + ", ".join(state_dict_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." + ) + assert actual_list == named_params_list, ( + "Initializers' sequence is not as same as named_parameters(). Expected: (" + + ", ".join(named_params_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." + ) def test_initializer_sequence_script_model(self): def list_is_expected(short_list, long_list) -> bool: - if (len(short_list) > len(long_list)): + if len(short_list) > len(long_list): return False for i in range(len(short_list)): - if (short_list[i] not in long_list[i]): + if short_list[i] not in long_list[i]: return False return True @@ -8621,12 +10415,20 @@ def forward(self, x, y): loaded_model = onnx.load_from_string(f.getvalue()) actual_list = [p.name for p in loaded_model.graph.initializer] - assert list_is_expected(state_dict_list, actual_list), \ - "ScriptModel - Initializers' sequence is not as same as state_dict(). Expected: (" \ - + ", ".join(state_dict_list) + "). Actual:(" + ", ".join(actual_list) + ")." - assert list_is_expected(named_params_list, actual_list), \ - "ScriptModel - Initializers' sequence is not as same as named_parameters(). Expected: (" \ - + ", ".join(named_params_list) + "). Actual:(" + ", ".join(actual_list) + ")." + assert list_is_expected(state_dict_list, actual_list), ( + "ScriptModel - Initializers' sequence is not as same as state_dict(). Expected: (" + + ", ".join(state_dict_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." + ) + assert list_is_expected(named_params_list, actual_list), ( + "ScriptModel - Initializers' sequence is not as same as named_parameters(). Expected: (" + + ", ".join(named_params_list) + + "). Actual:(" + + ", ".join(actual_list) + + ")." 
+ ) @skipIfUnsupportedMinOpsetVersion(11) def test_nms(self): @@ -8635,7 +10437,6 @@ def test_nms(self): boxes[:, 2:] += boxes[:, :2] scores = torch.randn(num_boxes) - class Module(torch.nn.Module): def forward(self, boxes, scores): return ops.nms(boxes, scores, 0.5) @@ -8657,6 +10458,7 @@ def forward(self, boxes, scores, idxs): self.run_test(Module(), (boxes, scores, idxs)) @skipIfUnsupportedMinOpsetVersion(11) + @skipScriptTest() def test_clip_boxes_to_image(self): boxes = torch.randn(5, 4) * 500 boxes[:, 2:] += boxes[:, :2] @@ -8669,23 +10471,28 @@ def forward(self, boxes, size): shape = (size.shape[0], size.shape[1]) return ops.boxes.clip_boxes_to_image(boxes, shape) - self.run_test(Module(), (boxes, size), - input_names=["boxes", "size"], - dynamic_axes={"size": [0, 1]}, - test_with_inputs=[(boxes, size), (boxes, size_2)]) + self.run_test( + Module(), + (boxes, size), + input_names=["boxes", "size"], + dynamic_axes={"size": [0, 1]}, + test_with_inputs=[(boxes, size), (boxes, size_2)], + ) + @skipIfUnsupportedMaxOpsetVersion(15) # TODO: Opset 16 RoiAlign result mismatch @skipIfUnsupportedMinOpsetVersion(11) def test_roi_align(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0, 0, 4, 4]], dtype=torch.float32) - model = ops.RoIAlign((5, 5), 1., 2) + model = ops.RoIAlign((5, 5), 1.0, 2) self.run_test(model, (x, single_roi)) + @skipIfUnsupportedMaxOpsetVersion(15) # TODO: Opset 16 RoiAlign result mismatch @skipIfUnsupportedMinOpsetVersion(11) def test_roi_align_aligned(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) - model1 = ops.RoIAlign((5, 5), 1., 2, aligned=True) + model1 = ops.RoIAlign((5, 5), 1.0, 2, aligned=True) self.run_test(model1, (x, single_roi)) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) @@ -8709,7 +10516,7 @@ def test_roi_pool(self): rois = torch.tensor([[0, 0, 0, 4, 4]], dtype=torch.float32) pool_h = 5 pool_w = 5 - model = ops.RoIPool((pool_h, pool_w), 2.) 
+ model = ops.RoIPool((pool_h, pool_w), 2.0) self.run_test(model, (x, rois)) @skipIfUnsupportedMinOpsetVersion(11) @@ -8724,25 +10531,30 @@ def forward(self, images): input = torch.rand(3, 10, 20) input_test = torch.rand(3, 100, 150) - self.run_test(TransformModule(), (input,), - input_names=["input1"], dynamic_axes={"input1": [0, 1, 2]}, - test_with_inputs=[(input,), (input_test,)]) + self.run_test( + TransformModule(), + (input,), + input_names=["input1"], + dynamic_axes={"input1": [0, 1, 2]}, + test_with_inputs=[(input,), (input_test,)], + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_transform_images(self): - class TransformModule(torch.nn.Module): def __init__(self): super(TransformModule, self).__init__() self.transform = _init_test_generalized_rcnn_transform() - def forward(self, images: List[torch.Tensor]): + def forward(self, images: List[Tensor]): return self.transform(images)[0].tensors input = torch.rand(3, 100, 200), torch.rand(3, 200, 200) input_test = torch.rand(3, 100, 200), torch.rand(3, 200, 200) - self.run_test(TransformModule(), (input,), test_with_inputs=[(input,), (input_test,)]) + self.run_test( + TransformModule(), (input,), test_with_inputs=[(input,), (input_test,)] + ) def get_features(self, images): s0, s1 = images.shape[-2:] @@ -8757,7 +10569,7 @@ def get_features(self, images): return features @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_rpn(self): set_rng_seed(0) @@ -8766,8 +10578,10 @@ def __init__(self): super(RPNModule, self).__init__() self.rpn = _init_test_rpn() - def forward(self, images, features: Dict[str, torch.Tensor]): - images_m = ImageList(images, [(i.shape[-1], i.shape[-2]) for i in images]) + def forward(self, images, features: Dict[str, Tensor]): + images_m = ImageList( + images, [(i.shape[-1], i.shape[-2]) for i in images] + ) return self.rpn(images_m, features) images = torch.rand(2, 3, 150, 150) @@ -8778,16 +10592,25 @@ def forward(self, images, features: Dict[str, torch.Tensor]): model = RPNModule() model.eval() model(images, features) - self.run_test(model, (images, features), - input_names=["input1", "input2", "input3", "input4", "input5", "input6"], - dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], - "input3": [0, 1, 2, 3], "input4": [0, 1, 2, 3], - "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images, features), (images2, test_features)], - dict_check=False) - + self.run_test( + model, + (images, features), + input_names=["input1", "input2", "input3", "input4", "input5", "input6"], + dynamic_axes={ + "input1": [0, 1, 2, 3], + "input2": [0, 1, 2, 3], + "input3": [0, 1, 2, 3], + "input4": [0, 1, 2, 3], + "input5": [0, 1, 2, 3], + "input6": [0, 1, 2, 3], + }, + test_with_inputs=[(images, features), (images2, test_features)], + dict_check=False, + ) + + @skipIfUnsupportedMaxOpsetVersion(15) # TODO: Opset 16 RoiAlign result mismatch @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_multi_scale_roi_align(self): class TransformModule(torch.nn.Module): def __init__(self): @@ -8795,8 +10618,7 @@ def __init__(self): self.model = ops.MultiScaleRoIAlign(["feat1", "feat2"], 3, 2) self.image_sizes = [(512, 512)] - def forward(self, input, boxes): - # type: (Dict[str, torch.Tensor], List[torch.Tensor]) -> torch.Tensor + def forward(self, input: Dict[str, Tensor], boxes: List[Tensor]) -> Tensor: return self.model(input, boxes, self.image_sizes) i = OrderedDict() @@ -8811,10 +10633,26 @@ 
def forward(self, input, boxes): boxes1 = torch.rand(6, 4) * 256 boxes1[:, 2:] += boxes1[:, :2] - self.run_test(TransformModule(), (i, [boxes],), test_with_inputs=[(i, [boxes],), (i1, [boxes1],)]) + self.run_test( + TransformModule(), + ( + i, + [boxes], + ), + test_with_inputs=[ + ( + i, + [boxes], + ), + ( + i1, + [boxes1], + ), + ], + ) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @skipScriptTest() def test_roi_heads(self): class RoiHeadsModule(torch.nn.Module): def __init__(self): @@ -8823,15 +10661,21 @@ def __init__(self): self.rpn = _init_test_rpn() self.roi_heads = _init_test_roi_heads_faster_rcnn() - def forward(self, images, features: Dict[str, torch.Tensor]): - original_image_sizes = [(img.shape[-1], img.shape[-2]) for img in images] + def forward(self, images, features: Dict[str, Tensor]): + original_image_sizes = [ + (img.shape[-1], img.shape[-2]) for img in images + ] - images_m = ImageList(images, [(i.shape[-1], i.shape[-2]) for i in images]) + images_m = ImageList( + images, [(i.shape[-1], i.shape[-2]) for i in images] + ) proposals, _ = self.rpn(images_m, features) - detections, _ = self.roi_heads(features, proposals, images_m.image_sizes) - detections = self.transform.postprocess(detections, - images_m.image_sizes, - original_image_sizes) + detections, _ = self.roi_heads( + features, proposals, images_m.image_sizes + ) + detections = self.transform.postprocess( + detections, images_m.image_sizes, original_image_sizes + ) return detections images = torch.rand(2, 3, 100, 100) @@ -8843,12 +10687,21 @@ def forward(self, images, features: Dict[str, torch.Tensor]): model.eval() model(images, features) - self.run_test(model, (images, features), - input_names=["input1", "input2", "input3", "input4", "input5", "input6"], - dynamic_axes={"input1": [0, 1, 2, 3], "input2": [0, 1, 2, 3], "input3": [0, 1, 2, 3], - "input4": [0, 1, 2, 3], "input5": [0, 1, 2, 3], "input6": [0, 1, 2, 3]}, - test_with_inputs=[(images, features), (images2, test_features)], - dict_check=False) + self.run_test( + model, + (images, features), + input_names=["input1", "input2", "input3", "input4", "input5", "input6"], + dynamic_axes={ + "input1": [0, 1, 2, 3], + "input2": [0, 1, 2, 3], + "input3": [0, 1, 2, 3], + "input4": [0, 1, 2, 3], + "input5": [0, 1, 2, 3], + "input6": [0, 1, 2, 3], + }, + test_with_inputs=[(images, features), (images2, test_features)], + dict_check=False, + ) def test_set_(self): class M(torch.nn.Module): @@ -8861,9 +10714,14 @@ def forward(self, x, y): self.run_test(M(), (x, y), remained_onnx_input_idx=[1]) y2 = torch.randn(5, 2) - self.run_test(M(), (x, y), remained_onnx_input_idx=[1], input_names=['x', 'y'], - dynamic_axes={'x': [0, 1], 'y': [0, 1]}, - test_with_inputs=[(y, y2)]) + self.run_test( + M(), + (x, y), + remained_onnx_input_idx=[1], + input_names=["x", "y"], + dynamic_axes={"x": [0, 1], "y": [0, 1]}, + test_with_inputs=[(y, y2)], + ) @skipIfUnsupportedMinOpsetVersion(9) def test_set_attr_modules(self): @@ -8877,10 +10735,12 @@ def __init__(self, embedding_dim): @staticmethod def get_embedding(embedding_dim: int): emb = 4 / ((embedding_dim // 2) - 1) - emb = torch.exp(torch.arange((embedding_dim // 2), dtype=torch.float) * -emb) + emb = torch.exp( + torch.arange((embedding_dim // 2), dtype=torch.float) * -emb + ) return emb - def forward(self, input, incremental_state: Optional[torch.Tensor] = None): + def forward(self, input, incremental_state: Optional[Tensor] = None): bsz, seq_len = input.shape[0], input.shape[1] self.const = 3 if self.weights is None: @@ 
-8890,9 +10750,9 @@ def forward(self, input, incremental_state: Optional[torch.Tensor] = None): if incremental_state is not None: pos = seq_len return self.weights[1 + pos, :].expand(bsz, 1, -1) - return ( - self.weights.index_select(0, torch.ones((bsz * seq_len), dtype=torch.int64)).view(bsz, seq_len, -1) - ) + return self.weights.index_select( + 0, torch.ones((bsz * seq_len), dtype=torch.int64) + ).view(bsz, seq_len, -1) class InnerModule(torch.nn.Module): def __init__(self, embedding_dim): @@ -8903,7 +10763,9 @@ def __init__(self, embedding_dim): @staticmethod def get_embedding(embedding_dim: int): emb = 4 / ((embedding_dim // 2) - 1) - emb = torch.exp(torch.arange((embedding_dim // 2), dtype=torch.float) * -emb) + emb = torch.exp( + torch.arange((embedding_dim // 2), dtype=torch.float) * -emb + ) return emb def forward(self, x): @@ -8918,8 +10780,8 @@ def forward(self, x): return self.module(x) x = torch.randn(3, 256) - self.run_test(Module(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) - self.run_test(Module(), (x, ), remained_onnx_input_idx=[]) + self.run_test(Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test(Module(), (x,), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_set_attr_modules_2(self): @@ -8934,15 +10796,19 @@ def __init__(self, embedding_dim): @staticmethod def get_embedding(embedding_dim: int): emb = 4 / ((embedding_dim // 2) - 1) - emb = torch.exp(torch.arange((embedding_dim // 2), dtype=torch.float) * -emb) + emb = torch.exp( + torch.arange((embedding_dim // 2), dtype=torch.float) * -emb + ) return emb - def forward(self, input, incremental_state: Optional[torch.Tensor] = None): + def forward(self, input, incremental_state: Optional[Tensor] = None): bsz, seq_len = input.shape[0], input.shape[1] self.const = 1.5 self.weights = InnerModule.get_embedding(self.embedding_dim) return ( - self.weights.index_select(0, torch.ones((bsz * seq_len), dtype=torch.int64)).view(bsz, seq_len, -1) + self.weights.index_select( + 0, torch.ones((bsz * seq_len), dtype=torch.int64) + ).view(bsz, seq_len, -1) ) * self.const class Module(torch.nn.Module): @@ -8954,8 +10820,8 @@ def forward(self, x): return self.module(x) x = torch.randn(3, 256) - self.run_test(Module(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) - self.run_test(Module(), (x, ), remained_onnx_input_idx=[]) + self.run_test(Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]}) + self.run_test(Module(), (x,), remained_onnx_input_idx=[]) def test_set_attr(self): class MyModule(torch.nn.Module): @@ -8996,7 +10862,7 @@ def set_cell_anchors(self, anchors): self.conv.weight = torch.randn(3, 10) self.conv.bias = self.conv.weight[:] - def forward(self, anchors) -> Optional[torch.Tensor]: + def forward(self, anchors) -> Optional[Tensor]: self.set_cell_anchors(anchors) return self.conv.bias @@ -9020,7 +10886,7 @@ def set_cell_anchors(self, anchors, boxes): self.conv.weight = anchors + self.conv.weight boxes[:] = torch.zeros(2, 3) - def forward(self, anchors) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, anchors) -> Tuple[Tensor, Tensor]: boxes = torch.ones(2, 2, 3) self.set_cell_anchors(anchors, boxes) if self.conv.bias is not None: @@ -9048,7 +10914,7 @@ def set_cell_anchors(self, anchors): else: self.conv.bias = torch.ones(3, 10, 3) - def forward(self, feature_maps, anchors) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, feature_maps, anchors) -> Tuple[Tensor, Tensor]: self.set_cell_anchors(anchors) result = [] if self.conv.bias 
is not None: @@ -9111,7 +10977,7 @@ def set_cell_anchors(self, anchors, boxes): self.conv.weight = anchors * i boxes[j] += torch.ones(3, 3) - def forward(self, anchors) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, anchors) -> Tuple[Tensor, Tensor]: boxes = torch.ones(10, 3, 3) self.set_cell_anchors(anchors, boxes) if self.conv.bias is not None: @@ -9130,7 +10996,9 @@ def __init__(self): self.conv = torch.nn.Conv1d(10, 3, 3) self.conv.weight = torch.nn.Parameter(torch.zeros(3, 10)) self.conv.bias = torch.nn.Parameter(torch.zeros(3, 10, 3)) - self.boxes : List[torch.Tensor] = [torch.ones(1)] # Workaround placeholder for TorchScript + self.boxes: List[Tensor] = [ + torch.ones(1) + ] # Workaround placeholder for TorchScript def set_cell_anchors(self, anchors): self.conv.weight = torch.randn(3, 10) @@ -9140,7 +11008,7 @@ def set_cell_anchors(self, anchors): self.conv.weight = anchors * i self.boxes.append(torch.ones(3, 3)) - def forward(self, anchors) -> Tuple[torch.Tensor, List[torch.Tensor]]: + def forward(self, anchors) -> Tuple[Tensor, List[Tensor]]: self.boxes = [] self.set_cell_anchors(anchors) if self.conv.bias is not None: @@ -9154,8 +11022,9 @@ def forward(self, anchors) -> Tuple[torch.Tensor, List[torch.Tensor]]: @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tuple[Tensor, Tensor]: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9164,11 +11033,23 @@ def check_init(input_data, hidden_size, prev_state): state = torch.zeros(state_size, device=input_data.device) state_copy = torch.zeros(state_size, device=input_data.device) if prev_state.size(0) == 0: - state[:] = torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) + state[:] - state_copy[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 2 - state_copy[:] = torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 2 + state[:] = ( + torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) + + state[:] + ) + state_copy[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 2 + ) + state_copy[:] = ( + torch.zeros(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 2 + ) else: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) return state, state_copy class Example(torch.nn.Module): @@ -9183,16 +11064,20 @@ def forward(self, input_data, prev_state): model = Example(10) random_data = torch.rand((1, 5, 30, 30)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["random_data", "empty_tensor"], + dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_2(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, 
torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tuple[Tensor, Tensor]: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9202,13 +11087,26 @@ def check_init(input_data, hidden_size, prev_state): state_copy = torch.zeros(state_size, device=input_data.device) if prev_state.size(0) == 0: for i in range(2): - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * i - state_copy[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * i + state[:] = ( + torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + * i + ) + state_copy[:] = ( + torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + * i + ) elif prev_state.size(0) == 1: s = state[:] state[:] = prev_state + s elif prev_state.size(0) == 2: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) return state, state_copy class Example(torch.nn.Module): @@ -9224,22 +11122,29 @@ def forward(self, input_data, prev_state): random_data = torch.rand((1, 5, 30, 30)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) random_state = torch.rand((1, 1, 10, 30, 30)) - self.run_test(model, (random_data, empty_tensor), - input_names=["data", "state"], - dynamic_axes={"data": [0, 1, 2], "state": [0, 1, 2, 3, 4]}, - test_with_inputs=[(random_data, random_state)]) - self.run_test(model, (random_data, empty_tensor), - input_names=["data", "state"], - dynamic_axes={"state": [0, 1, 2, 3, 4]}, - test_with_inputs=[(random_data, random_state)], - remained_onnx_input_idx=[1]) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["data", "state"], + dynamic_axes={"data": [0, 1, 2], "state": [0, 1, 2, 3, 4]}, + test_with_inputs=[(random_data, random_state)], + ) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["data", "state"], + dynamic_axes={"state": [0, 1, 2, 3, 4]}, + test_with_inputs=[(random_data, random_state)], + remained_onnx_input_idx=[1], + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_3(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> torch.Tensor + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tensor: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9249,7 +11154,12 @@ def check_init(input_data, hidden_size, prev_state): if prev_state.size(0) < 2: state = state * 3 if prev_state.size(0) == 0: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[:] = ( + torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + * 3 + ) else: state = state + 2 @@ -9267,16 +11177,20 @@ def forward(self, input_data, prev_state): model = Example(4) random_data = torch.rand((1, 5, 4, 4)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["random_data", "empty_tensor"], + 
dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_4(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> torch.Tensor + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tensor: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9285,9 +11199,15 @@ def check_init(input_data, hidden_size, prev_state): state = torch.zeros(state_size, device=input_data.device) if prev_state.size(0) == 0: state = state + 3 - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 3 + ) state = state + 3 - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) else: state = state + 2 return state @@ -9304,17 +11224,20 @@ def forward(self, input_data, prev_state): model = Example(4) random_data = torch.rand((1, 5, 4, 4)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + input_names=["random_data", "empty_tensor"], + dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) - @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_if_5(self): @torch.jit.script - def check_init(input_data, hidden_size, prev_state): - # type: (torch.Tensor, int, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + def check_init( + input_data: Tensor, hidden_size: int, prev_state: Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9323,9 +11246,15 @@ def check_init(input_data, hidden_size, prev_state): state = torch.zeros(state_size, device=input_data.device) state_ref = state if prev_state.size(0) == 0: - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 3 + ) state = state + 3 - state[:] = torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 4 + state[:] = ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 4 + ) else: state = state + 2 return state, state_ref @@ -9336,15 +11265,20 @@ def __init__(self, hidden_size): self.hidden_size = hidden_size def forward(self, input_data, prev_state): - prev_state, state_ref = check_init(input_data, self.hidden_size, prev_state) + prev_state, state_ref = check_init( + input_data, self.hidden_size, prev_state + ) return prev_state, state_ref model = Example(4) random_data = torch.rand((1, 5, 4, 4)) empty_tensor = torch.tensor([], dtype=torch.float).view(0, 0, 0, 0, 0) - self.run_test(model, (random_data, empty_tensor), - input_names=["random_data", "empty_tensor"], - dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}) + self.run_test( + model, + (random_data, empty_tensor), + 
input_names=["random_data", "empty_tensor"], + dynamic_axes={"random_data": [0, 1, 2, 3], "empty_tensor": [0, 1, 2, 3, 4]}, + ) self.run_test(model, (random_data, empty_tensor), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) @@ -9396,7 +11330,6 @@ def forward(self, x, y): y = torch.randn(4, 5) self.run_test(model, (x, y)) - @skipIfUnsupportedMinOpsetVersion(13) def test_list_del_in_block(self): class ListModel(torch.nn.Module): @@ -9436,8 +11369,7 @@ def forward(self, x, y): @skipIfUnsupportedMinOpsetVersion(11) def test_index_put_inplace_ops(self): @torch.jit.script - def check_init(input_data, hidden_size): - # type: (torch.Tensor, int) -> torch.Tensor + def check_init(input_data: Tensor, hidden_size: int) -> Tensor: batch_size = input_data.size(0) spatial_size_0 = input_data.size(2) spatial_size_1 = input_data.size(3) @@ -9445,11 +11377,22 @@ def check_init(input_data, hidden_size): state_size = (2, batch_size, hidden_size, spatial_size_0, spatial_size_1) state = torch.zeros(state_size, device=input_data.device) if input_data.size(0) == 1: - state[1] += torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 2 - state[1] /= torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * 3 + state[1] += ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 2 + ) + state[1] /= ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * 3 + ) for i in range(input_data.size(0)): - state[1] += torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) - state[1] /= torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) * i + state[1] += torch.ones( + batch_size, hidden_size, spatial_size_0, spatial_size_1 + ) + state[1] /= ( + torch.ones(batch_size, hidden_size, spatial_size_0, spatial_size_1) + * i + ) return state class Example(torch.nn.Module): @@ -9463,8 +11406,12 @@ def forward(self, input_data): model = Example(10) random_data = torch.rand((1, 5, 30, 30)) - self.run_test(model, (random_data), input_names=["random_data"], - dynamic_axes={"random_data": [0, 1, 2, 3]}) + self.run_test( + model, + (random_data), + input_names=["random_data"], + dynamic_axes={"random_data": [0, 1, 2, 3]}, + ) self.run_test(model, (random_data), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(11) @@ -9472,9 +11419,9 @@ def test_input_mask_model(self): class InputMaskModel(torch.nn.Module): def __init__(self, output_size): super(InputMaskModel, self).__init__() - self.bias = torch.nn.Parameter(torch.empty( - output_size, - dtype=torch.float)) + self.bias = torch.nn.Parameter( + torch.empty(output_size, dtype=torch.float) + ) with torch.no_grad(): self.bias.zero_() @@ -9487,8 +11434,15 @@ def forward(self, model_input, y): output_size = 4 m = InputMaskModel(output_size) x = torch.tensor([0, 4, 24, 25], dtype=torch.int64) - y = torch.tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]], dtype=torch.float) + y = torch.tensor( + [ + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + ], + dtype=torch.float, + ) self.run_test(m, (x, y)) class InputMaskModel(torch.nn.Module): @@ -9505,11 +11459,18 @@ def forward(self, model_input_1, model_input_2, y): m = InputMaskModel(output_size) x1 = torch.tensor([0, 4, 24, 25], dtype=torch.int64) x2 = torch.tensor([0, 3, 12, 15], dtype=torch.int64) - y = torch.tensor([[0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 0.3, 0.4], [0.1, 0.2, 0.3, 0.4]], dtype=torch.float) + 
y = torch.tensor( + [ + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4], + ], + dtype=torch.float, + ) self.run_test(m, (x1, x2, y)) - @disableScriptTest() + @skipScriptTest() def test_unsafe_chunk(self): class ChunkModel(torch.nn.Module): def forward(self, x): @@ -9534,8 +11495,12 @@ def forward(self, x, y): model.eval() x = torch.ones(2, 3, 4, 5) y = torch.ones(3, 4, 5, 2) - self.run_test(model, (x, y), input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}) + self.run_test( + model, + (x, y), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2, 3], "y": [0, 1, 2, 3]}, + ) self.run_test(model, (x, y), remained_onnx_input_idx=[1]) class ViewModel(torch.nn.Module): @@ -9544,7 +11509,7 @@ def forward(self, x): model = ViewModel() model.eval() - x = torch.tensor(2.) + x = torch.tensor(2.0) self.run_test(model, (x,)) # test prim::ListConstruct for Reshape input 1 @@ -9585,8 +11550,13 @@ def forward(self, signal): y = torch.randint(5, (M, C + 1, K + 1, N + 1)) self.run_test(model, x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) self.run_test(model, x, remained_onnx_input_idx=[]) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0, 1, 2, 3]}, test_with_inputs=[(x,), (y,)]) + self.run_test( + model, + x, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[(x,), (y,)], + ) @skipIfUnsupportedMinOpsetVersion(11) def test_symbolic_shape_inference_box(self): @@ -9604,8 +11574,13 @@ def forward(self, boxes): x = torch.ones(2, 4) y = torch.ones(3, 5) self.run_test(model, x) - self.run_test(model, x, input_names=["x"], - dynamic_axes={"x" : [0, 1]}, test_with_inputs=[(x,), (y,)]) + self.run_test( + model, + x, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + test_with_inputs=[(x,), (y,)], + ) @skipIfUnsupportedMinOpsetVersion(11) def test_symbolic_shape_inference_box_if(self): @@ -9632,41 +11607,61 @@ def test_symbolic_shape_inference_arange_2(self): class ArangeModel(torch.nn.Module): def forward(self, start): return torch.arange(start.size(0), 8.5, 1.5, dtype=torch.int64) + x = torch.randn(2, 3, 4) - self.run_test(ArangeModel(), (x,), input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModel(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(ArangeModel(), (x,), remained_onnx_input_idx=[]) class ArangeModel2(torch.nn.Module): def forward(self, start): return torch.arange(start.size(0), 8.5, 1.5, dtype=torch.double) + x = torch.randn(2, 3, 4) - self.run_test(ArangeModel2(), (x,), input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ArangeModel2(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(ArangeModel2(), (x,), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) def test_symbolic_shape_inference_nonzero(self): class OneLikeModel(torch.nn.Module): def forward(self, x): - ones = torch.ones_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + ones = torch.ones_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return torch.nonzero(ones) x = torch.randn(2) - self.run_test(OneLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0]}) + self.run_test(OneLikeModel(), x, input_names=["x"], dynamic_axes={"x": [0]}) self.run_test(OneLikeModel(), x, remained_onnx_input_idx=[]) x = torch.randn(2, 3, 4) - self.run_test(OneLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + OneLikeModel(), 
x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(OneLikeModel(), x, remained_onnx_input_idx=[]) class ZeroLikeModel(torch.nn.Module): def forward(self, x): - zeros = torch.zeros_like(x, dtype=torch.float, layout=torch.strided, device=torch.device("cpu")) + zeros = torch.zeros_like( + x, + dtype=torch.float, + layout=torch.strided, + device=torch.device("cpu"), + ) return torch.nonzero(zeros) x = torch.randn(2) - self.run_test(ZeroLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0]}) + self.run_test(ZeroLikeModel(), x, input_names=["x"], dynamic_axes={"x": [0]}) self.run_test(ZeroLikeModel(), x, remained_onnx_input_idx=[]) x = torch.randn(2, 3, 4) - self.run_test(ZeroLikeModel(), x, input_names=['x'], dynamic_axes={"x": [0, 1, 2]}) + self.run_test( + ZeroLikeModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) self.run_test(ZeroLikeModel(), x, remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(9) @@ -9674,25 +11669,30 @@ def test_symbolic_shape_inference_expand_1(self): class ExpandModel(torch.nn.Module): def forward(self, x): return x.expand(4, 6, 2) + x = torch.randn(6, 1, requires_grad=True) self.run_test(ExpandModel(), (x,)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # Test code not scriptable + @skipScriptTest() # Test code not scriptable def test_symbolic_shape_inference_expand_2(self): class M(torch.nn.Module): def forward(self, x): input_shape = x.size() batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length) - causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) return causal_mask.transpose(0, 1) + x = torch.randn(3, 16) self.run_test(M(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]}) self.run_test(M(), (x,), remained_onnx_input_idx=[]) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() # Test code not scriptable + @skipScriptTest() # Test code not scriptable def test_symbolic_shape_inference_slice(self): class M(torch.nn.Module): def forward(self, x, position_bias): @@ -9700,10 +11700,15 @@ def forward(self, x, position_bias): batch_size, seq_length = input_shape position_bias = position_bias[:, :, -seq_length:, :] return position_bias.transpose(0, 1) + x = torch.randn(3, 16) position_bias = torch.randn(1, 3, 20, 8) - self.run_test(M(), (x, position_bias), input_names=["x", "position_bias"], - dynamic_axes={"x": [0, 1], "position_bias": [0, 1, 2, 3]}) + self.run_test( + M(), + (x, position_bias), + input_names=["x", "position_bias"], + dynamic_axes={"x": [0, 1], "position_bias": [0, 1, 2, 3]}, + ) self.run_test(M(), (x, position_bias), remained_onnx_input_idx=[1]) def test_symbolic_shape_inference_slice_2(self): @@ -9711,24 +11716,37 @@ class M(torch.nn.Module): def forward(self, position_bias): position_bias = position_bias[:, :, -2:, :] return position_bias.transpose(0, 1) + position_bias = torch.randn(1, 3, 20, 8) self.run_test(M(), (position_bias,)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @skipScriptTest() def test_symbolic_shape_inference_time(self): input = torch.randn(RNN_SEQUENCE_LENGTH, BATCH_SIZE, RNN_INPUT_SIZE) h0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE) c0 = torch.randn(1, BATCH_SIZE, RNN_HIDDEN_SIZE) - model_lstm = torch.nn.LSTM(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False) - self.run_test(model_lstm, (input, (h0, c0)), input_names=["x", "y"], - dynamic_axes={"x" : [0, 1]}) - 
model_gru = torch.nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False) - self.run_test(model_gru, (input, h0), input_names=["x", "y"], - dynamic_axes={"x" : [0, 1]}) - model_rnn = torch.nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False) - self.run_test(model_rnn, (input, h0), input_names=["x", "y"], - dynamic_axes={"x" : [0, 1]}) + model_lstm = torch.nn.LSTM( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False + ) + self.run_test( + model_lstm, + (input, (h0, c0)), + input_names=["x", "y"], + dynamic_axes={"x": [0, 1]}, + ) + model_gru = torch.nn.GRU( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False + ) + self.run_test( + model_gru, (input, h0), input_names=["x", "y"], dynamic_axes={"x": [0, 1]} + ) + model_rnn = torch.nn.RNN( + RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, 1, bidirectional=False, bias=False + ) + self.run_test( + model_rnn, (input, h0), input_names=["x", "y"], dynamic_axes={"x": [0, 1]} + ) def test_symbolic_shape_inference_dynamic_axes(self): class M(torch.nn.Module): @@ -9736,9 +11754,14 @@ def forward(self, input_ids): input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) return input_ids.transpose(0, 1) + x = torch.randn(3, 16) - self.run_test(M(), (x,), input_names=["input_ids"], - dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}}) + self.run_test( + M(), + (x,), + input_names=["input_ids"], + dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_hann_window_periodic(self): @@ -9749,7 +11772,12 @@ def __init__(self): def forward(self, x, window_length: int): self.window_length = window_length - return torch.add(x, torch.hann_window(self.window_length, periodic=True, dtype=torch.float)) + return torch.add( + x, + torch.hann_window( + self.window_length, periodic=True, dtype=torch.float + ), + ) win_length = 100 x = torch.randn(win_length) @@ -9766,7 +11794,12 @@ def __init__(self): def forward(self, x, window_length: int): self.window_length = window_length - return torch.add(x, torch.hann_window(self.window_length, periodic=False, dtype=torch.float)) + return torch.add( + x, + torch.hann_window( + self.window_length, periodic=False, dtype=torch.float + ), + ) win_length = 100 x = torch.randn(win_length) @@ -9775,7 +11808,7 @@ def forward(self, x, window_length: int): self.run_test(module, (x, win_length)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @skipScriptTest() def test_hann_window_default_values(self): class HannWindowModule(torch.nn.Module): def __init__(self): @@ -9784,6 +11817,7 @@ def __init__(self): def forward(self, x, window_length: int): import torch.nn.functional as F + self.window_length = window_length return torch.add(x, F.relu(torch.hann_window(self.window_length))) @@ -9795,7 +11829,7 @@ def forward(self, x, window_length: int): self.run_test(module, (x, win_length)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @skipScriptTest() def test_tensordot_dim_count(self): class M(torch.nn.Module): def forward(self, x, y): @@ -9820,7 +11854,7 @@ def forward(self, x, y): self.run_test(M(), (x, y)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @skipScriptTest() def test_tensordot_dynamic_dim(self): class M(torch.nn.Module): def forward(self, x, y): @@ -9833,9 +11867,13 @@ def forward(self, x, y): new_x = torch.randint(6, (8, 6, 2, 5)) new_y = torch.randint(6, (2, 5, 3, 4)) - self.run_test(M(), (x, y), test_with_inputs=[(new_x, new_y)], - input_names=["input_x", 
"input_y"], - dynamic_axes={"input_x": [0, 1, 2, 3], "input_y": [0, 1, 2, 3]}) + self.run_test( + M(), + (x, y), + test_with_inputs=[(new_x, new_y)], + input_names=["input_x", "input_y"], + dynamic_axes={"input_x": [0, 1, 2, 3], "input_y": [0, 1, 2, 3]}, + ) @skipIfUnsupportedMinOpsetVersion(9) def test_to_device(self): @@ -9854,7 +11892,7 @@ def forward(self, x, y): self.run_test(M_ToDeviceDtype(), (x, y)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @skipScriptTest() def test_fill(self): class FillModule(torch.nn.Module): def forward(self, x, filled_value: int): @@ -9891,11 +11929,15 @@ def forward(self, x): index = torch.tensor([0, 2, 3, 1, 4]) self.run_test(M(0, index, updates), (x,)) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 3, 1]) self.run_test(M(1, index, updates), (x,)) - updates = torch.tensor([[[1, 2, 3], [4, 5, 6], [7, 8, 9], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 2, 3], [4, 5, 6], [7, 8, 9], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 1]) self.run_test(M(2, index, updates), (x,)) @@ -9933,9 +11975,11 @@ def forward(self, x): return x x = torch.ones(5, 4, 3) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 3, 1]) - loop_count = torch.randint(20, (1, ))[0].item() + loop_count = torch.randint(20, (1,))[0].item() self.run_test(M(1, index, updates, loop_count), (x,)) @skipIfUnsupportedMinOpsetVersion(9) @@ -9956,11 +12000,15 @@ def forward(self, x, cond): return x x = torch.ones(5, 4, 3) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index_true = torch.tensor([0, 2, 3, 1]) index_false = torch.tensor([1, 0, 2, 3]) cond = torch.tensor(1, dtype=torch.bool) - self.run_test(torch.jit.script(M(1, updates, index_true, index_false)), (x, cond)) + self.run_test( + torch.jit.script(M(1, updates, index_true, index_false)), (x, cond) + ) @skipIfUnsupportedMinOpsetVersion(9) def test_index_add_dynamic_axes(self): @@ -9977,12 +12025,18 @@ def forward(self, x): x = torch.ones(5, 4, 3) y = torch.ones(7, 8, 3) - updates = torch.tensor([[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float) + updates = torch.tensor( + [[[1, 5, 7], [2, 4, 5], [5, 5, 6], [2, 3, 4]]], dtype=torch.float + ) index = torch.tensor([0, 2, 3, 1]) - self.run_test(M(1, index, updates), (x,), test_with_inputs=[y], - input_names=['input_1'], - dynamic_axes={'input_1': [0, 1]}) + self.run_test( + M(1, index, updates), + (x,), + test_with_inputs=[y], + input_names=["input_1"], + dynamic_axes={"input_1": [0, 1]}, + ) def test_roll(self): class M(torch.nn.Module): @@ -10006,7 +12060,7 @@ def forward(self, x): return torch.sum(x) x = torch.ones(12, 3) - self.run_test(M(), (x,), input_names=['x'], dynamic_axes={'x': [0]}) + self.run_test(M(), (x,), input_names=["x"], dynamic_axes={"x": [0]}) def test_sum_empty_tensor(self): class M(torch.nn.Module): @@ -10044,7 +12098,7 @@ def forward(self, x, y): self.run_test(M(), (x, y)) - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_normal(self): class M(torch.nn.Module): @@ -10054,9 +12108,15 @@ def 
forward(self, x, y): self.run_test(M(), (torch.tensor([0.0]), torch.tensor([[1.0], [2.0]]))) self.run_test(M(), (torch.tensor([0.0]), torch.tensor([1.0]))) - self.run_test(M(), (torch.tensor([[[0.0], [10.0]], [[2.0], [8.0]], [[2.0], [8.0]]]), torch.tensor([[1.0], [3.0]]))) + self.run_test( + M(), + ( + torch.tensor([[[0.0], [10.0]], [[2.0], [8.0]], [[2.0], [8.0]]]), + torch.tensor([[1.0], [3.0]]), + ), + ) - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_normal_correctness(self): class M(torch.nn.Module): @@ -10068,31 +12128,39 @@ def forward(self, x, y): model_export = M() dummy_input = (torch.tensor([expected_mean]), torch.tensor([expected_std])) - ort_sess = convert_to_onnx(model_export, input=dummy_input, opset_version=self.opset_version, - training=torch.onnx.TrainingMode.EVAL) + ort_sess = convert_to_onnx( + model_export, + input=dummy_input, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL, + ) - ort_out = run_ort(ort_sess, input=dummy_input) + ort_out = run_ort(ort_sess, inputs=dummy_input) actual_std = np.std(ort_out) actual_mean = np.mean(ort_out) - assert abs(abs(actual_mean) - expected_mean) <= expected_mean * 0.1, \ - "the gap of mean between ort outputs and expected one is unacceptable." - assert abs(abs(actual_std) - expected_std) <= expected_std * 0.1, \ - "the gap of variance between ort outputs and expected one is unacceptable." + assert ( + abs(abs(actual_mean) - expected_mean) <= expected_mean * 0.1 + ), "the gap of mean between ort outputs and expected one is unacceptable." + assert ( + abs(abs(actual_std) - expected_std) <= expected_std * 0.1 + ), "the gap of variance between ort outputs and expected one is unacceptable." - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_uniform(self): class M(torch.nn.Module): def forward(self, x, y): - return torch.distributions.Uniform(x, y).sample().size(0), x , y + return torch.distributions.Uniform(x, y).sample().size(0), x, y self.run_test(M(), (torch.tensor([0.0]), torch.tensor([10.0]))) self.run_test(M(), (torch.tensor([[0.0], [6.0]]), torch.tensor([[1.0], [7.0]]))) - self.run_test(M(), (torch.tensor([1.0]), torch.tensor([[10.0], [7.0], [9.0], [20.0]]))) + self.run_test( + M(), (torch.tensor([1.0]), torch.tensor([[10.0], [7.0], [9.0], [20.0]])) + ) - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(11) def test_dist_uniform_correctness(self): class M(torch.nn.Module): @@ -10105,18 +12173,27 @@ def forward(self, x, y): model_export = M() dummy_input = (torch.tensor([expected_min]), torch.tensor([expected_max])) - ort_sess = convert_to_onnx(model_export, input=dummy_input, opset_version=self.opset_version, - training=torch.onnx.TrainingMode.EVAL) - - ort_out = run_ort(ort_sess, input=dummy_input) + ort_sess = convert_to_onnx( + model_export, + input=dummy_input, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL, + ) + + ort_out = run_ort(ort_sess, inputs=dummy_input) actual_min = np.min(ort_out) actual_max = np.max(ort_out) actual_mean = np.mean(ort_out) - assert actual_min >= expected_min, "the minimum value of ort outputs is out of scope." - assert actual_max <= expected_max, "the maximum value of ort outputs is out of scope." - assert abs(actual_mean - expected_mean) <= expected_mean * 0.05, \ - "the mean value of ort outputs is out of scope." + assert ( + actual_min >= expected_min + ), "the minimum value of ort outputs is out of scope." 
+ assert ( + actual_max <= expected_max + ), "the maximum value of ort outputs is out of scope." + assert ( + abs(actual_mean - expected_mean) <= expected_mean * 0.05 + ), "the mean value of ort outputs is out of scope." @skipIfUnsupportedMinOpsetVersion(13) def test_sequence_to_int(self): @@ -10132,7 +12209,9 @@ def forward(self, x): def test_sequence_to_float(self): class M(torch.nn.Module): def forward(self, x): - result = torch.tensor([1.1 for i in range(x.size()[0])], dtype=torch.float) + result = torch.tensor( + [1.1 for i in range(x.size()[0])], dtype=torch.float + ) return x, result x = torch.randn(10, 5) @@ -10142,7 +12221,9 @@ def forward(self, x): def test_sequence_to_bool(self): class M(torch.nn.Module): def forward(self, x): - result = torch.tensor([False for i in range(x.size()[0])], dtype=torch.bool) + result = torch.tensor( + [False for i in range(x.size()[0])], dtype=torch.bool + ) return x, result x = torch.randn(10, 5) @@ -10173,8 +12254,6 @@ def symbolic_custom_invalid_add(g, input, other, alpha=None): self.assertTrue(f.getvalue(), "ONNX graph was not exported.") loaded_model = onnx.load_from_string(f.getvalue()) - - @skipIfUnsupportedMinOpsetVersion(9) # https://github.com/microsoft/onnxruntime/issues/9663 def test_tuple_output_from_if_with_raised_exception(self): class M(torch.nn.Module): def __init__(self): @@ -10185,6 +12264,7 @@ def forward(self, t: Tensor) -> Tuple[Tensor, Tensor]: raise Exception("Negative input") else: return torch.zeros(5), torch.zeros(5) + x = torch.zeros(1) self.run_test(torch.jit.script(M()), (x,)) @@ -10201,22 +12281,455 @@ def forward(self, x): x = F.softmax(x, dim=1) x = x.reshape(batch, -1) return x + radix = 2 cardinality = 1 x = torch.randn(10, 1, 128, 1) f = io.BytesIO() - torch.onnx.export(RSoftMax(radix, cardinality), (x, ), f, input_names=["x"], dynamic_axes={"x": [0]}) + torch.onnx.export( + RSoftMax(radix, cardinality), + (x,), + f, + input_names=["x"], + dynamic_axes={"x": [0]}, + ) loaded_model = onnx.load_from_string(f.getvalue()) - self.assertEqual(loaded_model.graph.output[0].type.tensor_type.shape.dim[1].dim_value, 128) + self.assertEqual( + loaded_model.graph.output[0].type.tensor_type.shape.dim[1].dim_value, 128 + ) + + # NOTE: For quantization tests, choose scale and zero point carefully + # such that inputs and outputs do not always overflow/underflow. + # Otherwise test results could be inaccurate. + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_linear(self): + model = torch.nn.quantized.Linear(4, 8) + # Set fixed weight to avoid flaky test. + weight = torch.quantize_per_tensor( + torch.arange(32, dtype=torch.float).view(8, 4), 0.5, 0, torch.qint8 + ) + # Set non-zero bias. + bias = torch.arange(8, dtype=torch.float) + model.set_weight_bias(weight, bias) + # Set fixed input to avoid flaky test. + input = torch.randn(4, 4) + input = torch.arange(16, dtype=torch.float).view(4, 4) - 8 + input_tensor = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) + self.run_test(model, input_tensor) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_conv2d(self): + model = torch.nn.quantized.Conv2d(16, 33, 3, stride=2) + # Manually initialize model weight and bias to random numbers. + # By default all zeros. 
+ q_weight = torch.quantize_per_tensor( + torch.randn(33, 16, 3, 3), 0.5, 0, torch.qint8 + ) + bias = torch.arange(33).to(torch.float) - 16 + model.set_weight_bias(q_weight, bias) + input = torch.randn(3, 16, 32, 32) + q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_adaptive_avg_pool2d(self): + model = torch.nn.AdaptiveAvgPool2d((5, 7)) + input = torch.randn(4, 3, 10, 14) + q_input = torch.quantize_per_tensor(input, 0.2, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_conv2d_relu(self): + model = torch.nn.intrinsic.quantized.ConvReLU2d(16, 33, 3, stride=2) + # Manually initialize model weight and bias to random numbers. + # By default all zeros. + q_weight = torch.quantize_per_tensor( + torch.randn(33, 16, 3, 3), 0.5, 0, torch.qint8 + ) + bias = torch.arange(33).to(torch.float) - 16 + model.set_weight_bias(q_weight, bias) + input = torch.randn(3, 16, 32, 32) + q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_hardswish(self): + model = torch.nn.quantized.Hardswish(1.0, 0) + input = torch.randn(2, 6) + q_input = torch.quantize_per_tensor(input, 0.26, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_hardsigmoid(self): + model = torch.nn.Hardsigmoid() + input = torch.randn(2, 6) + q_input = torch.quantize_per_tensor(input, 0.26, 128, torch.quint8) + self.run_test(model, q_input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_flatten(self): + class FlattenModel(torch.nn.Module): + def forward(self, input): + return torch.flatten(input) + + x = torch.quantize_per_tensor(torch.randn(1, 2, 3, 4), 1, 0, torch.quint8) + self.run_test(FlattenModel(), x) + + @skipIfUnsupportedMinOpsetVersion(10) + @skipScriptTest() # torch.jit.frontend.FrontendError: Cannot instantiate class 'QFunctional' in a script function: + def test_quantized_arithmetic_qfunctional(self): + x = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) + y = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) + + class ArithmeticModel(torch.nn.Module): + def forward(self, x, y): + o = torch.nn.quantized.QFunctional().add(x, y) + o = torch.nn.quantized.QFunctional().mul(o, x) + return o + + self.run_test(ArithmeticModel(), (x, y)) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_arithmetic(self): + x = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) + y = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 128, torch.quint8) -def make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, script_test_min_opset_version, - **extra_kwargs): - test_name = str("_".join([ - "test", name, layer[1], - bidirectional[1], initial_state[1], - variable_length[1], dropout[1] - ])) + class ArithmeticModel2(torch.nn.Module): + def forward(self, x, y): + o = torch.ops.quantized.add(x, y, 0.4, 100) + o = torch.ops.quantized.mul(o, x, 0.4, 100) + return o + + self.run_test(ArithmeticModel2(), (x, y)) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantize_per_tensor(self): + class Module(torch.nn.Module): + def forward(self, x): + return ( + torch.quantize_per_tensor(x, 0.2, 0, torch.qint8), + torch.quantize_per_tensor(x, 0.2, 128, torch.quint8), + ) + + x = torch.randn(4, 6) + 
self.run_test(Module(), x) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_dequantize(self): + class Module(torch.nn.Module): + def forward(self, x): + return torch.dequantize(x) + + x = torch.quantize_per_tensor(torch.randn(3, 4), 0.2, 0, torch.qint8) + self.run_test(Module(), x) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_linear_per_channel(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.linear = torch.nn.Linear(4, 3) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.linear(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + # Set fixed weight and bias to avoid flaky test. + model.linear.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((3, 4)) + ) + model.linear.bias = torch.nn.Parameter(torch.arange(3, dtype=torch.float)) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test((4, 4), offset=-8) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_relu(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.relu = torch.nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.relu(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + model = torch.quantization.convert(model) + input = torch.randn(8, 4) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_conv2d(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(2, 4, 3, stride=2) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + # Set fixed weight and bias to avoid flaky test. + model.conv.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2) + ) + model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0])) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test( + (3, 4, 8, 8), offset=-384, max_val=12 + ) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_conv2d_relu(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(2, 4, 3, stride=2) + self.relu = torch.nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model) + # Set fixed weight and bias to avoid flaky test. 
+ model.conv.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2) + ) + model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0])) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test( + (3, 4, 8, 8), offset=-384, max_val=12 + ) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(13) + def test_qat_conv2d_relu_fused(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(2, 4, 3, stride=2) + self.relu = torch.nn.ReLU() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.fuse_modules(model.eval(), [["conv", "relu"]]) + model = torch.quantization.prepare_qat(model.train()) + # Set fixed weight and bias to avoid flaky test. + model.conv.weight = torch.nn.Parameter( + _construct_tensor_for_quantization_test((2, 4, 3, 3), max_val=2) + ) + model.conv.bias = torch.nn.Parameter(torch.tensor([0.0, 1.0])) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. + input = _construct_tensor_for_quantization_test( + (3, 4, 8, 8), offset=-384, max_val=12 + ) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_qat_maxpool2d(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.pool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.pool(x) + x = self.dequant(x) + return x + + model = M() + model.qconfig = torch.quantization.get_default_qconfig("fbgemm") + model = torch.quantization.prepare_qat(model.train()) + model = torch.quantization.convert(model) + + # Set fixed input to avoid flaky test. 
+ input = _construct_tensor_for_quantization_test((4, 4, 3, 2)) + self.run_test(model, input) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_convolution_allow_tf32(self): + class Module(torch.nn.Module): + def __init__(self, allow_tf32): + super().__init__() + + self.allow_tf32 = allow_tf32 + weight = torch.rand(32, 3, 3, 3) + self.weight = torch.nn.Parameter(weight) + + def forward(self, x): + if self.allow_tf32: + return torch._convolution( + x, + self.weight, + None, + [2, 2], + [0, 0], + [1, 1], + False, + [0, 0], + 1, + False, + False, + True, + True, + ) + else: + return torch._convolution( + x, + self.weight, + None, + [2, 2], + [0, 0], + [1, 1], + False, + [0, 0], + 1, + False, + False, + True, + ) + + x = torch.randn(1, 3, 224, 224) + self.run_test(Module(False), x, rtol=1e-3, atol=1e-6) + self.run_test(Module(True), x, rtol=1e-3, atol=1e-6) + + @skipIfUnsupportedMinOpsetVersion(16) + def test_grid_sample(self): + n, c, h_in, w_in, h_out, w_out = 1, 1, 3, 2, 2, 4 + + class GridSampleModule(torch.nn.Module): + def __init__(self, mode, padding_mode, align_corners) -> None: + super().__init__() + self.mode, self.padding_mode, self.align_corners = ( + mode, + padding_mode, + align_corners, + ) + + def forward(self, input, grid): + return torch.nn.functional.grid_sample( + input, grid, self.mode, self.padding_mode, self.align_corners + ) + + for mode, padding_mode, align_corners in itertools.product( + ("bilinear", "nearest", "bicubic"), + ("zeros", "border", "reflection"), + (True, False), + ): + atol_rtol = {} + if (mode, padding_mode) == ("bicubic", "border"): + if align_corners: + atol_rtol.update({"atol": 0.3, "rtol": 0.4}) + else: + atol_rtol.update({"atol": 0.02, "rtol": 0.02}) + input, grid = torch.randn(n, c, h_in, w_in), torch.randn(n, h_out, w_out, 2) + self.run_test( + GridSampleModule(mode, padding_mode, align_corners), + (input, grid), + **atol_rtol, + ) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_device_eq(self): + class M(torch.nn.Module): + def forward(self, a): + # exercise both Tensor.device (prim::device) + # and torch.device (prim::Constant). + if a.device != torch.device("cpu"): + return a + return torch.zeros_like(a) + + mod = torch.jit.script(M()) # preserve control flow + + self.run_test( + mod, + # In order for the ONNX model behavior to match the torch model, we + # need to construct input that has the same device that is checked for + # in forward(). In ONNX there is no such thing as a device, so the if + # condition is always false. + torch.randn(3, 3, device="cpu"), + # Force dynamic axes so that the output shape depends on the input. + # Otherwise the entire model will just return a constant and not have + # any inputs. + input_names=["a"], + dynamic_axes={"a": {0: "a0"}}, + ) + + +def make_test( + name, + base, + layer, + bidirectional, + initial_state, + variable_length, + dropout, + script_test_min_opset_version, + **extra_kwargs, +): + test_name = str( + "_".join( + [ + "test", + name, + layer[1], + bidirectional[1], + initial_state[1], + variable_length[1], + dropout[1], + ] + ) + ) # Cannot export with older opsets because of "ConstantFill" op # ConstantFill was a temp op removed at opset 8. This is no longer supported by onnxruntime @@ -10226,10 +12739,12 @@ def make_test(name, base, layer, bidirectional, initial_state, # - https://msdata.visualstudio.com/Vienna/_workitems/edit/1055382 # Operator aten::_pack_padded_sequence is not supported by exporter yet. 
# - https://msdata.visualstudio.com/Vienna/_workitems/edit/1055384 - @disableScriptTest() + @skipScriptTest() @skipIfUnsupportedMinOpsetVersion(9) def f(self): - self.is_script_test_enabled = self.opset_version >= script_test_min_opset_version + self.is_script_test_enabled = ( + self.opset_version >= script_test_min_opset_version + ) self._dispatch_rnn_test( base, layers=layer[0], @@ -10237,153 +12752,126 @@ def f(self): initial_state=initial_state[0], packed_sequence=variable_length[0], dropout=dropout[0], - **extra_kwargs) + **extra_kwargs, + ) f.__name__ = test_name - setattr(TestONNXRuntime, f.__name__, f) + setattr(_TestONNXRuntime, f.__name__, f) + def setup_rnn_tests(): - layers_opts = [ - (1, "unilayer"), - (3, "trilayer") - ] - bidirectional_opts = [ - (False, "forward"), - (True, "bidirectional") - ] - initial_state_opts = [ - (True, "with_initial_state"), - (False, "no_initial_state") - ] + layers_opts = [(1, "unilayer"), (3, "trilayer")] + bidirectional_opts = [(False, "forward"), (True, "bidirectional")] + initial_state_opts = [(True, "with_initial_state"), (False, "no_initial_state")] variable_length_opts = [ (0, "without_sequence_lengths"), (1, "with_variable_length_sequences"), - (2, "with_batch_first_sequence_lengths") - ] - dropout_opts = [ - (0.2, "with_dropout"), - (0.0, "without_dropout") + (2, "with_batch_first_sequence_lengths"), ] + dropout_opts = [(0.2, "with_dropout"), (0.0, "without_dropout")] test_count = 0 - for (layer, bidirectional, initial_state, variable_length, dropout) in \ - itertools.product( - layers_opts, - bidirectional_opts, - initial_state_opts, - variable_length_opts, - dropout_opts,): + for ( + layer, + bidirectional, + initial_state, + variable_length, + dropout, + ) in itertools.product( + layers_opts, + bidirectional_opts, + initial_state_opts, + variable_length_opts, + dropout_opts, + ): for base, name, extra_kwargs in ( - ("elman", "elman_relu", {"nonlinearity": u"relu"}), - ("elman", "elman_tanh", {"nonlinearity": u"tanh"}), - ("lstm", "lstm", {}), - ("gru", "gru", {}) + ("elman", "elman_relu", {"nonlinearity": "relu"}), + ("elman", "elman_tanh", {"nonlinearity": "tanh"}), + ("lstm", "lstm", {}), + ("gru", "gru", {}), ): # Need Add between list of tensors script_test_min_opset_version = 11 - if ( # compiling in script mode fails with errors like: - # torch.jit.frontend.UnsupportedNodeError: annotated assignments - # without assigned value aren't supported - # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 - base == 'elman' or - # compiling in script mode fails with errors like: - # RuntimeError: Arguments for call are not valid. - # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 - base == 'lstm'): + if ( # compiling in script mode fails with errors like: + # torch.jit.frontend.UnsupportedNodeError: annotated assignments + # without assigned value aren't supported + # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 + base == "elman" + or + # compiling in script mode fails with errors like: + # RuntimeError: Arguments for call are not valid. 
+ # https://msdata.visualstudio.com/Vienna/_workitems/edit/1160723 + base == "lstm" + ): script_test_min_opset_version = float("inf") - make_test(name, base, layer, bidirectional, initial_state, - variable_length, dropout, script_test_min_opset_version, - **extra_kwargs) + make_test( + name, + base, + layer, + bidirectional, + initial_state, + variable_length, + dropout, + script_test_min_opset_version, + **extra_kwargs, + ) test_count += 1 # sanity check that a representative example does exist - TestONNXRuntime.test_gru_trilayer_forward_with_initial_state_without_sequence_lengths_with_dropout + _TestONNXRuntime.test_gru_trilayer_forward_with_initial_state_without_sequence_lengths_with_dropout # make sure no one accidentally disables all the tests without # noticing if test_count != 192: raise ValueError("Expected 192 tests but found {}".format(test_count)) + setup_rnn_tests() -# opset 7 tests -TestONNXRuntime_opset7 = type(str("TestONNXRuntime_opset7"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=7)) - -# opset 8 tests -TestONNXRuntime_opset8 = type(str("TestONNXRuntime_opset8"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=8)) - - -# opset 10 tests -TestONNXRuntime_opset10 = type(str("TestONNXRuntime_opset10"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=10)) - -# opset 11 tests -TestONNXRuntime_opset11 = type(str("TestONNXRuntime_opset11"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=11)) - -# opset 12 tests -TestONNXRuntime_opset12 = type(str("TestONNXRuntime_opset12"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=12)) - -# opset 9 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. -TestONNXRuntime_opset9_IRv4 = type(str("TestONNXRuntime_opset9_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, - keep_initializers_as_inputs=False)) - - -# opset 10 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. -TestONNXRuntime_opset10_IRv4 = type(str("TestONNXRuntime_opset10_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=10, - keep_initializers_as_inputs=False)) - - -# opset 11 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. -TestONNXRuntime_opset11_IRv4 = type(str("TestONNXRuntime_opset11_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=11, - keep_initializers_as_inputs=False)) - -# opset 12 tests, with keep_initializers_as_inputs=False for -# IR version 4 style export. 
-TestONNXRuntime_opset12_IRv4 = type(str("TestONNXRuntime_opset12_IRv4"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=12, - keep_initializers_as_inputs=False)) - -# opset 13 tests -TestONNXRuntime_opset13 = type(str("TestONNXRuntime_opset13"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=13, - keep_initializers_as_inputs=False, - onnx_shape_inference=True)) - -# opset 14 tests -TestONNXRuntime_opset14 = type(str("TestONNXRuntime_opset14"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=14, - keep_initializers_as_inputs=False, - onnx_shape_inference=True)) - -# opset 15 tests -TestONNXRuntime_opset15 = type(str("TestONNXRuntime_opset15"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=15, - keep_initializers_as_inputs=False, - onnx_shape_inference=True)) +def MakeTestCase(opset_version: int, keep_initializers_as_inputs: bool = True) -> type: + name = f"TestONNXRuntime_opset{opset_version}" + if not keep_initializers_as_inputs: + name += "_IRv4" + return type( + str(name), + (unittest.TestCase,), + dict( + _TestONNXRuntime.__dict__, + opset_version=opset_version, + keep_initializers_as_inputs=keep_initializers_as_inputs, + ), + ) + + +TestONNXRuntime_opset7 = MakeTestCase(7) + +TestONNXRuntime_opset8 = MakeTestCase(8) + +TestONNXRuntime_opset9 = MakeTestCase(9) + +TestONNXRuntime_opset9_IRv4 = MakeTestCase(9, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset10 = MakeTestCase(10) + +TestONNXRuntime_opset10_IRv4 = MakeTestCase(10, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset11 = MakeTestCase(11) + +TestONNXRuntime_opset11_IRv4 = MakeTestCase(11, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset12 = MakeTestCase(12) + +TestONNXRuntime_opset12_IRv4 = MakeTestCase(12, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset13 = MakeTestCase(13, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset14 = MakeTestCase(14, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset15 = MakeTestCase(15, keep_initializers_as_inputs=False) + +TestONNXRuntime_opset16 = MakeTestCase(16, keep_initializers_as_inputs=False) if __name__ == "__main__": diff --git a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py index 575d4caa16ce..38ac87d46d13 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py @@ -1,19 +1,26 @@ # Owner(s): ["module: onnx"] import unittest + import onnxruntime # noqa: F401 -import torch +from test_pytorch_common import ( + skipIfNoBFloat16Cuda, + skipIfNoCuda, + skipIfUnsupportedMinOpsetVersion, + skipScriptTest, +) + +# TODO(justinchuby): Remove reference to other unit tests. 
+from test_pytorch_onnx_onnxruntime import TestONNXRuntime +import torch from torch.cuda.amp import autocast +from torch.onnx._globals import GLOBALS -from test_pytorch_common import disableScriptTest, skipIfUnsupportedMinOpsetVersion -from test_pytorch_common import skipIfNoCuda, skipIfNoBFloat16Cuda - -from test_pytorch_onnx_onnxruntime import TestONNXRuntime class TestONNXRuntime_cuda(unittest.TestCase): - from torch.onnx.symbolic_helper import _export_onnx_opset_version - opset_version = _export_onnx_opset_version + + opset_version = GLOBALS.export_onnx_opset_version keep_initializers_as_inputs = True onnx_shape_inference = True @@ -24,12 +31,20 @@ class GeluModel(torch.nn.Module): def forward(self, x): return torch.nn.functional.gelu(x) - x = torch.randn(2, 4, 5, 6, requires_grad=True, dtype=torch.float16, device=torch.device("cuda")) + x = torch.randn( + 2, + 4, + 5, + 6, + requires_grad=True, + dtype=torch.float16, + device=torch.device("cuda"), + ) self.run_test(GeluModel(), x, rtol=1e-3, atol=1e-5) @skipIfUnsupportedMinOpsetVersion(9) @skipIfNoCuda - @disableScriptTest() + @skipScriptTest() def test_layer_norm_fp16(self): class LayerNormModel(torch.nn.Module): def __init__(self): @@ -40,13 +55,20 @@ def __init__(self): def forward(self, x): return self.layer_norm(x) - x = torch.randn(20, 5, 10, 10, requires_grad=True, dtype=torch.float16, device=torch.device("cuda")) + x = torch.randn( + 20, + 5, + 10, + 10, + requires_grad=True, + dtype=torch.float16, + device=torch.device("cuda"), + ) self.run_test(LayerNormModel().cuda(), x, rtol=1e-3, atol=1e-5) - @skipIfUnsupportedMinOpsetVersion(12) @skipIfNoCuda - @disableScriptTest() + @skipScriptTest() def test_softmaxCrossEntropy_fusion_fp16(self): class FusionModel(torch.nn.Module): def __init__(self): @@ -61,14 +83,16 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, dtype=torch.float16, device=torch.device("cuda")) - target = torch.empty(N, dtype=torch.long, device=torch.device("cuda")).random_(0, C) + target = torch.empty(N, dtype=torch.long, device=torch.device("cuda")).random_( + 0, C + ) # using test data containing default ignore_index=-100 target[target == 1] = -100 self.run_test(FusionModel(), (input, target)) @skipIfNoCuda - @disableScriptTest() + @skipScriptTest() def test_apex_o2(self): class LinearModel(torch.nn.Module): def __init__(self): @@ -77,6 +101,7 @@ def __init__(self): def forward(self, x): return self.linear(x) + try: from apex import amp except Exception: @@ -94,11 +119,33 @@ class MyModule(torch.nn.Module): def forward(self, x): y = torch.ones(3, 4, dtype=torch.bfloat16, device=torch.device("cuda")) x = x.type_as(y) - return torch.mul(torch.add(x, y), torch.sub(x, y)).to(dtype=torch.float16) + return torch.mul(torch.add(x, y), torch.sub(x, y)).to( + dtype=torch.float16 + ) - x = torch.ones(3, 4, requires_grad=True, dtype=torch.float16, device=torch.device("cuda")) + x = torch.ones( + 3, 4, requires_grad=True, dtype=torch.float16, device=torch.device("cuda") + ) self.run_test(MyModule(), x, rtol=1e-3, atol=1e-5) + @skipIfNoCuda + def test_deduplicate_initializers_diff_devices(self): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.w = torch.nn.Parameter( + torch.ones(2, 3, device=torch.device("cpu")) + ) + self.b = torch.nn.Parameter(torch.ones(3, device=torch.device("cuda"))) + + def forward(self, x, y): + return torch.matmul(self.w, x), y + self.b + + x = torch.randn(3, 3, device=torch.device("cpu")) + y = torch.randn(3, 3, device=torch.device("cuda")) 
+ self.run_test(Model(), (x, y)) + + TestONNXRuntime_cuda.setUp = TestONNXRuntime.setUp TestONNXRuntime_cuda.run_test = TestONNXRuntime.run_test diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py index a1eb4b7018d0..50e40ae95618 100644 --- a/test/onnx/test_pytorch_onnx_shape_inference.py +++ b/test/onnx/test_pytorch_onnx_shape_inference.py @@ -1,11 +1,14 @@ # Owner(s): ["module: onnx"] import unittest -import torch + import numpy as np -from torch.onnx.symbolic_helper import (_set_onnx_shape_inference, - _onnx_main_opset, - _set_opset_version) +from test_pytorch_common import skipIfUnsupportedMinOpsetVersion + +import torch +from torch.onnx import _constants +from torch.onnx.symbolic_helper import _set_onnx_shape_inference, _set_opset_version + def expect_tensor(scalar_type, shape=None): def verify(actual_type): @@ -14,12 +17,14 @@ def verify(actual_type): # np.testing.assert_equal(actual_type.sizes(), shape) if shape is not None: np.testing.assert_equal(actual_type.varyingSizes(), shape) + return verify + class TestONNXShapeInference(unittest.TestCase): def __init__(self, *args, **kwargs): unittest.TestCase.__init__(self, *args, **kwargs) - self.opset_version = _onnx_main_opset + self.opset_version = _constants.onnx_main_opset _set_onnx_shape_inference(True) _set_opset_version(self.opset_version) @@ -53,17 +58,23 @@ def test_constant_of_shape(self): constant = self.insert_tensor_constant(g, torch.ones(1, 2, 3, 4)) shape = g.op("Shape", constant) constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0])) - self.run_test(g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4))) + self.run_test( + g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4)) + ) def test_constant_of_shape_static(self): # Test ConstantOfShape with input of prim::ListConstruct of static tensor rank = 4 g = self.create_empty_graph() - constants = [self.insert_tensor_constant(g, torch.tensor(i + 1)) for i in range(rank)] + constants = [ + self.insert_tensor_constant(g, torch.tensor(i + 1)) for i in range(rank) + ] shape = g.op("prim::ListConstruct", *constants) shape.setType(torch._C.ListType.ofInts()) constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0])) - self.run_test(g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4))) + self.run_test( + g, constant_of_shape.node(), expect_tensor("Float", shape=(1, 2, 3, 4)) + ) def test_constant_of_shape_dynamic(self): # Test ConstantOfShape with input of prim::ListConstruct of dynamic tensor @@ -73,7 +84,34 @@ def test_constant_of_shape_dynamic(self): shape = g.op("prim::ListConstruct", *inputs) shape.setType(torch._C.ListType.ofInts()) constant_of_shape = g.op("ConstantOfShape", shape, value_t=torch.tensor([2.0])) - self.run_test(g, constant_of_shape.node(), expect_tensor("Float", shape=(None, None, None, None))) + self.run_test( + g, + constant_of_shape.node(), + expect_tensor("Float", shape=(None, None, None, None)), + ) + + def test_gather_dynamic_index(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType( + input.type().with_dtype(torch.float).with_sizes([None, 3, 16, 16]) + ) + indices = g.addInput() + indices.setType(indices.type().with_dtype(torch.int64).with_sizes([None])) + output = g.op("Gather", input, indices, axis_i=1) + self.run_test( + g, output.node(), expect_tensor("Float", shape=([None, None, 16, 16])) + ) + + def test_gather_scalar_index(self): + g = self.create_empty_graph() + 
input = g.addInput() + input.setType( + input.type().with_dtype(torch.float).with_sizes([None, 3, 16, 16]) + ) + indices = self.insert_tensor_constant(g, torch.tensor(1)) + output = g.op("Gather", input, indices, axis_i=1) + self.run_test(g, output.node(), expect_tensor("Float", shape=([None, 16, 16]))) def test_reshape(self): g = self.create_empty_graph() @@ -94,6 +132,23 @@ def test_reshape(self): shape = g.op("Reshape", constant, constant_2) self.run_test(g, shape.node(), expect_tensor("Float", shape=(8, 16, 5))) + def test_reshape_symbolic(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType(input.type().with_sizes([None, None, 2, 8])) + constant = self.insert_tensor_constant(g, torch.tensor([0, 0, -1])) + output = g.op("Reshape", input, constant) + self.run_test(g, output.node(), expect_tensor(None, shape=(None, None, 16))) + + @skipIfUnsupportedMinOpsetVersion(14) + def test_reshape_allowzero(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType(input.type().with_sizes([3, 4, 0])) + constant = self.insert_tensor_constant(g, torch.tensor([0, 4, 3])) + output = g.op("Reshape", input, constant, allowzero_i=1) + self.run_test(g, output.node(), expect_tensor(None, shape=(0, 4, 3))) + def test_slice(self): g = self.create_empty_graph() input = g.addInput() @@ -106,5 +161,52 @@ def test_slice(self): slice = g.op("Slice", input, start_input, end, axis, step) self.run_test(g, slice.node(), expect_tensor(None, shape=(None, None))) -if __name__ == '__main__': + def test_broadcast_matmul(self): + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 5, 1, 1))) + + # test when first input is of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 1, 1))) + + # test when second input is of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(2)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(5, 1))) + + # test when both inputs are of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(2)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=())) + + def test_expand(self): + g = self.create_empty_graph() + input = g.addInput() + constant = self.insert_tensor_constant(g, torch.ones(2, 4)) + input.setType(constant.type().with_sizes([None, None])) + shape = g.op("Shape", input) + expand = g.op("Expand", constant, shape) + self.run_test(g, expand.node(), expect_tensor("Float", shape=(None, None))) + + def test_pad(self): + g = self.create_empty_graph() + input = g.addInput() + input.setType(input.type().with_dtype(torch.float).with_sizes([3, 320, 100])) + constant = self.insert_tensor_constant(g, torch.ones(6, dtype=torch.long)) + none = g.op("prim::Constant").setType(torch.NoneType.get()) + pad = g.op("Pad", input, constant, none, mode_s="constant") + self.run_test(g, pad.node(), 
expect_tensor("Float", shape=(None, None, None))) + + +if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index dca45fc5c311..9638e53d6c06 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -1,47 +1,55 @@ # Owner(s): ["module: onnx"] -from test_pytorch_common import TestCase, run_tests +import copy +import io + +import onnx +import torchvision +from autograd_helper import CustomFunction as CustomFunction2 +from test_pytorch_common import ( + TestCase, + run_tests, + skipIfNoCuda, + skipIfUnsupportedMaxOpsetVersion, + skipIfUnsupportedMinOpsetVersion, +) +from verify import verify import torch import torch.onnx -from torch.onnx import (utils, - OperatorExportTypes, - TrainingMode, - register_custom_op_symbolic, - unregister_custom_op_symbolic) -from torch.onnx.symbolic_helper import (_set_opset_version, - _set_operator_export_type, - _set_onnx_shape_inference) import torch.utils.cpp_extension -from test_pytorch_common import (skipIfUnsupportedMinOpsetVersion, - skipIfUnsupportedMaxOpsetVersion) -import caffe2.python.onnx.backend as backend -from verify import verify - -import torchvision - -import onnx - -import io -import copy -import unittest - -skip = unittest.skip +from torch.onnx import ( + OperatorExportTypes, + TrainingMode, + register_custom_op_symbolic, + unregister_custom_op_symbolic, + utils, +) +from torch.onnx.symbolic_helper import ( + _set_onnx_shape_inference, + _set_operator_export_type, + _set_opset_version, + _unpack_list, + parse_args, +) class _BaseTestCase(TestCase): - def setUp(self): torch.manual_seed(0) if torch.cuda.is_available(): torch.cuda.manual_seed_all(0) - def _model_to_graph(self, model, input, - do_constant_folding=True, - training=TrainingMode.EVAL, - operator_export_type=OperatorExportTypes.ONNX, - input_names=None, - dynamic_axes=None): + def _model_to_graph( + self, + model, + input, + do_constant_folding=True, + training=TrainingMode.EVAL, + operator_export_type=OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + ): if training == torch.onnx.TrainingMode.TRAINING: model.train() elif training == torch.onnx.TrainingMode.EVAL: @@ -49,19 +57,21 @@ def _model_to_graph(self, model, input, # Need disable onnx_shape_inference for this test because it puts const node to initializers. 
_set_onnx_shape_inference(False) utils._validate_dynamic_axes(dynamic_axes, model, None, None) - graph, params_dict, torch_out = utils._model_to_graph(model, input, - do_constant_folding=do_constant_folding, - _disable_torch_constant_prop=True, - operator_export_type=operator_export_type, - training=training, - input_names=input_names, - dynamic_axes=dynamic_axes) + graph, params_dict, torch_out = utils._model_to_graph( + model, + input, + do_constant_folding=do_constant_folding, + _disable_torch_constant_prop=True, + operator_export_type=operator_export_type, + training=training, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) _set_onnx_shape_inference(True) return graph, params_dict, torch_out class TestUtilityFuns_opset_independent(_BaseTestCase): - def test_unconvertible_ops(self): class MyModule(torch.nn.Module): def forward(self, x): @@ -100,18 +110,24 @@ def forward(self, x): def test_validate_dynamic_axes_invalid_input_output_name(self): import warnings + with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - utils._validate_dynamic_axes({"input1": {}, "output": {}, - "invalid_name1": {}, "invalid_name2": {}}, - None, ["input1", "input2"], ["output"]) + utils._validate_dynamic_axes( + {"input1": {}, "output": {}, "invalid_name1": {}, "invalid_name2": {}}, + None, + ["input1", "input2"], + ["output"], + ) messages = [str(warning.message) for warning in w] self.assertIn( "Provided key invalid_name1 for dynamic axes is not a valid input/output name", - messages) + messages, + ) self.assertIn( "Provided key invalid_name2 for dynamic axes is not a valid input/output name", - messages) + messages, + ) self.assertEqual(len(messages), 2) @skipIfUnsupportedMinOpsetVersion(11) @@ -127,23 +143,28 @@ def forward(self, x, y, t): x = torch.randn(2, 3) y = torch.randn(2, 4) t = torch.randn(2, 7) - graph, _, _ = self._model_to_graph(SplitModule(), (x, y, t), input_names=["x", "y", "t"], - dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}) + graph, _, _ = self._model_to_graph( + SplitModule(), + (x, y, t), + input_names=["x", "y", "t"], + dynamic_axes={"x": [0, 1], "y": [0, 1], "t": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::SplitToSequence") def test_constant_fold_transpose(self): class TransposeModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.transpose(a, 1, 0) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(3, 2) - graph, _, __ = self._model_to_graph(TransposeModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + TransposeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Transpose") @@ -154,15 +175,16 @@ def forward(self, x): def test_constant_fold_reduceL2(self): class ReduceModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.norm(a, p=2, dim=-2, keepdim=False) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(ReduceModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ReduceModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} 
+ ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::ReduceL2") @@ -171,15 +193,16 @@ def forward(self, x): def test_constant_fold_reduceL1(self): class NormModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.norm(a, p=1, dim=-2) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(NormModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + NormModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::ReduceL1") @@ -188,15 +211,16 @@ def forward(self, x): def test_constant_fold_slice(self): class NarrowModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.narrow(a, 0, 0, 1) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 3) - graph, _, __ = self._model_to_graph(NarrowModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + NarrowModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -207,15 +231,19 @@ def forward(self, x): def test_constant_fold_slice_index_exceeds_dim(self): class SliceIndexExceedsDimModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) - b = a[1:10] # index exceeds dimension + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + b = a[1:10] # index exceeds dimension return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 3) - graph, _, __ = self._model_to_graph(SliceIndexExceedsDimModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SliceIndexExceedsDimModule(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -226,8 +254,8 @@ def forward(self, x): def test_constant_fold_slice_negative_index(self): class SliceNegativeIndexModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) - b = a[0:-1] # index relative to the end + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + b = a[0:-1] # index relative to the end c = torch.select(a, dim=-1, index=-2) d = torch.select(a, dim=1, index=0) return b + x, c + d @@ -235,8 +263,12 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 3) - graph, _, __ = self._model_to_graph(SliceNegativeIndexModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SliceNegativeIndexModule(), + (x,), + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -246,7 +278,7 @@ def forward(self, x): def test_constant_fold_gather(self): class GatherModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.select(a, dim=1, index=-2) c = torch.index_select(a, dim=-2, index=torch.tensor([0, 
1])) return b + 1, c + x @@ -256,8 +288,9 @@ def forward(self, x): x = torch.ones(1, 3) model = GatherModule() model(x) - graph, _, __ = self._model_to_graph(GatherModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + GatherModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Gather") @@ -265,15 +298,16 @@ def forward(self, x): def test_constant_fold_unsqueeze(self): class UnsqueezeModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) + a = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) b = torch.unsqueeze(a, -2) return b + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(1, 2, 3) - graph, _, __ = self._model_to_graph(UnsqueezeModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + UnsqueezeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1, 2]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Unsqueeze") @@ -294,8 +328,9 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.randn(2, 3, 4, 5, 8, 7) - graph, _, __ = self._model_to_graph(PReluModel(), x, input_names=["x"], - dynamic_axes={"x": [0, 1, 2, 3, 4, 5]}) + graph, _, __ = self._model_to_graph( + PReluModel(), x, input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3, 4, 5]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Unsqueeze") @@ -306,14 +341,15 @@ def forward(self, x): def test_constant_fold_squeeze_without_axes(self): class SqueezeModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[[1., 2., 3.], [4., 5., 6.]]]) + a = torch.tensor([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]) return torch.squeeze(a) + x + torch.squeeze(a) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(SqueezeModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SqueezeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Squeeze") self.assertNotEqual(node.kind(), "onnx::Cast") @@ -323,14 +359,15 @@ def forward(self, x): def test_constant_fold_squeeze_with_axes(self): class SqueezeAxesModule(torch.nn.Module): def forward(self, x): - a = torch.tensor([[[1., 2., 3.], [4., 5., 6.]]]) + a = torch.tensor([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]]) return torch.squeeze(a, dim=-3) + x _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(SqueezeAxesModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + SqueezeAxesModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Squeeze") @@ -356,8 +393,8 @@ def forward(self, x): # # More commentary at # https://github.com/pytorch/pytorch/pull/18698/files#r340107552 - a = torch.tensor([[1., 2., 3.]]).to(torch.float) - b = torch.tensor([[4., 5., 6.]]).to(torch.float) + a = torch.tensor([[1.0, 2.0, 3.0]]).to(torch.float) + b = torch.tensor([[4.0, 5.0, 6.0]]).to(torch.float) c = torch.cat((a, b), 0) d = b + c return x + d @@ -365,8 +402,9 @@ def forward(self, x): 
_set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.ones(2, 3) - graph, _, __ = self._model_to_graph(ConcatModule(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ConcatModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Concat") @@ -387,8 +425,12 @@ def forward(self, input, initial_state): _set_operator_export_type(OperatorExportTypes.ONNX) input = torch.randn(5, 3, 7) h0 = torch.randn(1, 3, 3) - graph, _, __ = self._model_to_graph(GruNet(), (input, h0), input_names=["input", "h0"], - dynamic_axes={"input": [0, 1, 2], "h0": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + GruNet(), + (input, h0), + input_names=["input", "h0"], + dynamic_axes={"input": [0, 1, 2], "h0": [0, 1, 2]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Slice") @@ -413,8 +455,9 @@ def forward(self, A): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) A = torch.randn(2, 3) - graph, _, __ = self._model_to_graph(MatMulNet(), (A, ), - input_names=["A"], dynamic_axes={"A": [0, 1]}) + graph, _, __ = self._model_to_graph( + MatMulNet(), (A,), input_names=["A"], dynamic_axes={"A": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Transpose") @@ -422,7 +465,9 @@ def forward(self, A): def test_constant_fold_reshape(self): class ReshapeModule(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(ReshapeModule, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -433,8 +478,9 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) x = torch.randn(4, 5) - graph, _, __ = self._model_to_graph(ReshapeModule(), (x, ), - input_names=["x"], dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ReshapeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Reshape") @@ -442,7 +488,9 @@ def forward(self, x): def test_constant_fold_div(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -453,8 +501,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(Module(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Div") @@ -462,7 +511,9 @@ def forward(self, x): def test_constant_fold_mul(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -473,8 +524,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(Module(), (x, ), input_names=["x"], - dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Mul") @@ -482,7 +534,9 @@ def forward(self, x): def test_constant_fold_add(self): class 
Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -494,9 +548,13 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) graph, params_dict, __ = self._model_to_graph( - Module(), (x, ), do_constant_folding=True, + Module(), + (x,), + do_constant_folding=True, operator_export_type=OperatorExportTypes.ONNX, - input_names=["x"], dynamic_axes={"x": [0, 1]}) + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertTrue(node.kind() != "onnx::Add") self.assertEqual(len(list(graph.nodes())), 1) @@ -508,7 +566,9 @@ def forward(self, x): def test_constant_fold_sub(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -520,8 +580,13 @@ def forward(self, x): _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) graph, params_dict, __ = self._model_to_graph( - Module(), (x, ), do_constant_folding=True, - operator_export_type=OperatorExportTypes.ONNX, input_names=["x"], dynamic_axes={"x": [0, 1]}) + Module(), + (x,), + do_constant_folding=True, + operator_export_type=OperatorExportTypes.ONNX, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Sub") self.assertEqual(len(list(graph.nodes())), 1) @@ -533,7 +598,9 @@ def forward(self, x): def test_constant_fold_sqrt(self): class Module(torch.nn.Module): - def __init__(self, ): + def __init__( + self, + ): super(Module, self).__init__() self.register_buffer("weight", torch.ones(5)) @@ -544,7 +611,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(Module(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + Module(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Sqrt") self.assertEqual(len(list(graph.nodes())), 1) @@ -562,7 +631,9 @@ def forward(self, x): x = torch.randn(2, 5) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(ShapeModule(), (x, ), input_names=["x"], dynamic_axes={"x": [0, 1]}) + graph, _, __ = self._model_to_graph( + ShapeModule(), (x,), input_names=["x"], dynamic_axes={"x": [0, 1]} + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::Shape") @@ -572,14 +643,16 @@ def test_verbose(self): class MyModule(torch.nn.Module): def forward(self, input): return torch.exp(input) + x = torch.randn(3, 4) def is_model_stripped(f, verbose=None): if verbose is None: torch.onnx.export(MyModule(), x, f, opset_version=self.opset_version) else: - torch.onnx.export(MyModule(), x, f, verbose=verbose, - opset_version=self.opset_version) + torch.onnx.export( + MyModule(), x, f, verbose=verbose, opset_version=self.opset_version + ) model = onnx.load(io.BytesIO(f.getvalue())) model_strip = copy.copy(model) onnx.helper.strip_doc_string(model_strip) @@ -595,12 +668,55 @@ def test_error_on_data_parallel(self): model = torch.nn.DataParallel(torch.nn.ReflectionPad2d((1, 2, 3, 4))) x = torch.randn(1, 2, 3, 4) f = io.BytesIO() - with self.assertRaisesRegex(ValueError, - "torch.nn.DataParallel is not supported by 
ONNX " - "exporter, please use 'attribute' module to " - "unwrap model from torch.nn.DataParallel. Try "): + with self.assertRaisesRegex( + ValueError, + "torch.nn.DataParallel is not supported by ONNX " + "exporter, please use 'attribute' module to " + "unwrap model from torch.nn.DataParallel. Try ", + ): torch.onnx.export(model, x, f, opset_version=self.opset_version) + @skipIfUnsupportedMinOpsetVersion(11) + def test_sequence_dim(self): + class Module(torch.nn.Module): + def forward(self, x, y): + return [x, y] + + model = Module() + # Export with scripting to keep output as Sequence type. + # Tracing unpacks the list. + script_model = torch.jit.script(model) + x = torch.randn(2, 3) + + # Case 1: dynamic axis + f = io.BytesIO() + y = torch.randn(2, 3) + torch.onnx.export( + script_model, + (x, y), + f, + opset_version=self.opset_version, + input_names=["x", "y"], + dynamic_axes={"y": [1]}, + ) + onnx_model = onnx.load(io.BytesIO(f.getvalue())) + loop_output_value_info_proto = onnx_model.graph.output[0] + ref_value_info_proto = onnx.helper.make_tensor_sequence_value_info( + loop_output_value_info_proto.name, 1, [2, None] + ) + self.assertEqual(loop_output_value_info_proto, ref_value_info_proto) + + # Case 2: no dynamic axes. + f = io.BytesIO() + y = torch.randn(2, 3) + torch.onnx.export(script_model, (x, y), f, opset_version=self.opset_version) + onnx_model = onnx.load(io.BytesIO(f.getvalue())) + loop_output_value_info_proto = onnx_model.graph.output[0] + ref_value_info_proto = onnx.helper.make_tensor_sequence_value_info( + loop_output_value_info_proto.name, 1, [2, 3] + ) + self.assertEqual(loop_output_value_info_proto, ref_value_info_proto) + def test_export_mode(self): class MyModule(torch.nn.Module): def forward(self, x): @@ -614,16 +730,26 @@ def forward(self, x): # set mode to in inference mode and export in training mode model.eval() old_state = model.training - torch.onnx.export(model, (x,), f, - opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING, + ) # verify that the model state is preserved self.assertEqual(model.training, old_state) # set mode to training mode and export in inference mode model.train() old_state = model.training - torch.onnx.export(model, (x,), f, - opset_version=self.opset_version, training=torch.onnx.TrainingMode.EVAL) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL, + ) # verify that the model state is preserved self.assertEqual(model.training, old_state) @@ -641,7 +767,9 @@ class M(torch.nn.Module): def __init__(self, num_layers): super().__init__() self.num_layers = num_layers - self.lns = torch.nn.ModuleList([torch.nn.LayerNorm(3, eps=i) for i in range(num_layers)]) + self.lns = torch.nn.ModuleList( + [torch.nn.LayerNorm(3, eps=i) for i in range(num_layers)] + ) self.celu1 = torch.nn.CELU(1.0) self.celu2 = torch.nn.CELU(2.0) self.dropout = N(0.5) @@ -662,8 +790,17 @@ def forward(self, x, y, z): # Model export in inference mode will remove dropout node, # thus the dropout module no longer exist in graph. 
f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions={torch.nn.CELU, torch.nn.Dropout, torch.nn.LayerNorm}) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions={ + torch.nn.CELU, + torch.nn.Dropout, + torch.nn.LayerNorm, + }, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) @@ -672,11 +809,11 @@ def forward(self, x, y, z): celu_funcs = [f for f in funcs if f.name == "CELU"] self.assertEqual(len(celu_funcs), 1) self.assertEqual(celu_funcs[0].domain, "torch.nn.modules.activation") - self.assertEqual(len(celu_funcs[0].attribute), 1) + self.assertEqual(len(celu_funcs[0].attribute), 3) ln_funcs = [f for f in funcs if f.name == "LayerNorm"] self.assertEqual(len(ln_funcs), 1) self.assertEqual(ln_funcs[0].domain, "torch.nn.modules.normalization") - self.assertEqual(len(ln_funcs[0].attribute), 1) + self.assertEqual(len(ln_funcs[0].attribute), 3) # Check local function nodes nodes = onnx_model.graph.node @@ -684,15 +821,20 @@ def forward(self, x, y, z): ln_ns = [n for n in nodes if n.op_type == "LayerNorm"] self.assertEqual(len(celu_ns), 2) self.assertEqual(celu_ns[0].domain, "torch.nn.modules.activation") - self.assertEqual(len(celu_ns[0].attribute), 1) + self.assertEqual(len(celu_ns[0].attribute), 3) self.assertEqual(len(ln_ns), 3) self.assertEqual(ln_ns[0].domain, "torch.nn.modules.normalization") - self.assertEqual(len(ln_ns[0].attribute), 1) + self.assertEqual(len(ln_ns[0].attribute), 3) # Export specified modules. f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions={torch.nn.CELU}) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions={torch.nn.CELU}, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -701,8 +843,13 @@ def forward(self, x, y, z): # Export with empty specified modules. Normal export. f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions=set()) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions=set(), + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -710,8 +857,13 @@ def forward(self, x, y, z): # Export all modules. Should contain {M, CELU, LayerNorm}. 
f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions=True) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions=True, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -741,8 +893,13 @@ def forward(self, x, y, z): z = torch.randn(2, 3) f = io.BytesIO() - torch.onnx.export(M(3), (x, y, z), f, opset_version=self.opset_version, - export_modules_as_functions={NWithOverloads}) + torch.onnx.export( + M(3), + (x, y, z), + f, + opset_version=self.opset_version, + export_modules_as_functions={NWithOverloads}, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions @@ -763,13 +920,73 @@ def forward(self, x): x = torch.randn(4, 5) f = io.BytesIO() - torch.onnx.export(M(), (x,), f, export_modules_as_functions=True, - opset_version=self.opset_version, do_constant_folding=False) + torch.onnx.export( + M(), + (x,), + f, + export_modules_as_functions=True, + opset_version=self.opset_version, + do_constant_folding=False, + ) onnx_model = onnx.load(io.BytesIO(f.getvalue())) funcs = onnx_model.functions self.assertIn("M", [f.name for f in funcs]) + @skipIfUnsupportedMinOpsetVersion(15) + def test_local_function_predefined_attributes(self): + class M(torch.nn.Module): + num_layers: int + + def __init__(self, num_layers): + super().__init__() + self.num_layers = num_layers + self.lns = torch.nn.ModuleList( + [torch.nn.LayerNorm(3, eps=1e-4) for _ in range(num_layers)] + ) + + def forward(self, x): + for ln in self.lns: + x = ln(x) + return x + + x = torch.randn(2, 3) + f = io.BytesIO() + model = M(3) + torch.onnx.export( + model, + (x,), + f, + export_modules_as_functions=True, + opset_version=self.opset_version, + ) + + onnx_model = onnx.load(io.BytesIO(f.getvalue())) + funcs = onnx_model.functions + m_funcs = [fn for fn in funcs if fn.name == "M"] + self.assertEqual(m_funcs[0].attribute, ["num_layers"]) + ln_funcs = [fn for fn in funcs if fn.name == "LayerNorm"] + self.assertEqual(ln_funcs[0].attribute, ["eps", "elementwise_affine"]) + + from onnx import helper + + m_node = [n for n in onnx_model.graph.node if n.op_type == "M"] + self.assertEqual( + m_node[0].attribute[0], + helper.make_attribute("num_layers", model.num_layers), + ) + + ln_nodes = [n for n in m_funcs[0].node if n.op_type == "LayerNorm"] + expected_ln_attrs = [ + helper.make_attribute( + "elementwise_affine", model.lns[0].elementwise_affine + ), + helper.make_attribute("eps", model.lns[0].eps), + ] + for ln_node in ln_nodes: + self.assertIn(ln_node.attribute[0], expected_ln_attrs) + self.assertIn(ln_node.attribute[1], expected_ln_attrs) + def test_aten_fallthrough(self): # Test aten export of op with no symbolic class Module(torch.nn.Module): @@ -778,9 +995,13 @@ def forward(self, x): x = torch.randn(2, 3, 4) _set_opset_version(self.opset_version) - graph, _, __ = self._model_to_graph(Module(), (x, ), - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x"], dynamic_axes={"x": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + Module(), + (x,), + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "aten::erfc") @@ -812,25 +1033,33 @@ def forward(self, input, other): x = torch.randn(2, 3, 4, requires_grad=False) y = torch.randn(2, 3, 4, requires_grad=False) model = FooModel() - graph, _, __ = 
self._model_to_graph(model, (x, y), - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x", "y"], - dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}) + graph, _, __ = self._model_to_graph( + model, + (x, y), + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x", "y"], + dynamic_axes={"x": [0, 1, 2], "y": [0, 1, 2]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "custom_namespace::custom_op") def test_custom_opsets_gelu(self): self.addCleanup(unregister_custom_op_symbolic, "::gelu", 1) - def gelu(g, self): + def gelu(g, self, approximate): return g.op("com.microsoft::Gelu", self).setType(self.type()) register_custom_op_symbolic("::gelu", gelu, 1) - model = torch.nn.GELU() + model = torch.nn.GELU(approximate="none") x = torch.randn(3, 3) f = io.BytesIO() - torch.onnx.export(model, (x, ), f, - opset_version=self.opset_version, custom_opsets={"com.microsoft": 1}) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + custom_opsets={"com.microsoft": 1}, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.node[0].op_type, "Gelu") @@ -838,18 +1067,17 @@ def gelu(g, self): self.assertEqual(graph.opset_import[1].domain, "com.microsoft") self.assertEqual(graph.opset_import[1].version, 1) - def test_register_aten_custom_op_symbolic(self): self.addCleanup(unregister_custom_op_symbolic, "aten::gelu", 1) - def gelu(g, self): + def gelu(g, self, approximate): return g.op("com.microsoft::Gelu", self).setType(self.type()) register_custom_op_symbolic("aten::gelu", gelu, 1) - model = torch.nn.GELU() + model = torch.nn.GELU(approximate="none") x = torch.randn(3, 3) f = io.BytesIO() - torch.onnx.export(model, (x, ), f, opset_version=self.opset_version) + torch.onnx.export(model, (x,), f, opset_version=self.opset_version) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.node[0].op_type, "Gelu") @@ -867,8 +1095,13 @@ def inverse(g, self): model = CustomInverse() x = torch.randn(2, 3, 3) f = io.BytesIO() - torch.onnx.export(model, (x, ), f, - opset_version=self.opset_version, custom_opsets={"com.microsoft": 1}) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + custom_opsets={"com.microsoft": 1}, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.node[0].op_type, "Inverse") @@ -878,51 +1111,20 @@ def inverse(g, self): def test_onnx_fallthrough(self): # Test aten export of op with symbolic for aten - x = torch.randn(100, 128) - y = torch.randn(100, 128) - model = torch.nn.CosineSimilarity(dim=1, eps=1e-6) - - graph, _, __ = self._model_to_graph(model, (x, y), - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x", "y"], - dynamic_axes={"x": [0, 1], "y": [0, 1]}) - iter = graph.nodes() - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "aten::cosine_similarity") - - def test_quantized_fallthrough(self): - # Test Quantized op - class QModule(torch.nn.Module): - def __init__(self): - super(QModule, self).__init__() - self.quant1 = torch.ao.quantization.QuantStub() - self.dequant = torch.ao.quantization.DeQuantStub() - + class Module(torch.nn.Module): def forward(self, x): - res = self.quant1(x) - return self.dequant(res) - - model = QModule() - torch.backends.quantized.engine = "qnnpack" - pt_inputs = (torch.randn(1, 2, 3, 4)) - model.qconfig = torch.ao.quantization.default_qconfig - 
q_model = torch.ao.quantization.prepare(model, inplace=False) - q_model = torch.ao.quantization.convert(q_model, inplace=False) - - q_model.eval() - - graph, _, __ = self._model_to_graph(q_model, pt_inputs, - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["pt_inputs"], - dynamic_axes={"pt_inputs": [0, 1, 2, 3]}) + return torch.digamma(x) + x = torch.randn(100, 128) + graph, _, __ = self._model_to_graph( + Module(), + (x,), + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x"], + dynamic_axes={"x": [0, 1]}, + ) iter = graph.nodes() - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "onnx::Constant") - self.assertEqual(next(iter).kind(), "aten::quantize_per_tensor") - self.assertEqual(next(iter).kind(), "aten::dequantize") + self.assertEqual(next(iter).kind(), "aten::digamma") # prim::ListConstruct is exported as onnx::SequenceConstruct for opset >= 11 @skipIfUnsupportedMaxOpsetVersion(10) @@ -940,9 +1142,13 @@ def forward(self, x): x = torch.tensor([2]) model = PrimModule() model.eval() - graph, _, __ = self._model_to_graph(model, (x,), - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["x"], dynamic_axes={"x": [0]}) + graph, _, __ = self._model_to_graph( + model, + (x,), + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["x"], + dynamic_axes={"x": [0]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "prim::ListConstruct") @@ -963,8 +1169,9 @@ def forward(self, input): model = Custom() batch = torch.FloatTensor(1, 3) - graph, _, _ = self._model_to_graph(model, batch, - input_names=["batch"], dynamic_axes={"batch": [0, 1]}) + graph, _, _ = self._model_to_graph( + model, batch, input_names=["batch"], dynamic_axes={"batch": [0, 1]} + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "CustomNamespace::Custom") @@ -977,7 +1184,7 @@ def forward(ctx, input): @staticmethod def backward(ctx, grad_output): - input, = ctx.saved_tensors + (input,) = ctx.saved_tensors grad_input = grad_output.clone() grad_input[input < 0] = 0 return grad_input @@ -989,17 +1196,54 @@ def forward(self, input): model = Custom() batch = torch.FloatTensor(1, 3) - graph, _, _ = self._model_to_graph(model, batch, - operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, - input_names=["batch"], dynamic_axes={"batch": [0, 1]}) + graph, _, _ = self._model_to_graph( + model, + batch, + operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH, + input_names=["batch"], + dynamic_axes={"batch": [0, 1]}, + ) iter = graph.nodes() self.assertEqual(next(iter).kind(), "prim::PythonOp") + def test_autograd_module_name(self): + class CustomFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + return input.clamp(min=0) + + @staticmethod + def backward(ctx, grad_output): + (input,) = ctx.saved_tensors + grad_input = grad_output.clone() + grad_input[input < 0] = 0 + return grad_input + + class Custom(torch.nn.Module): + def forward(self, input): + return CustomFunction.apply(input) + CustomFunction2.apply(input) + + model = Custom() + batch = torch.FloatTensor(1, 3) + + graph, _, _ = self._model_to_graph( + model, batch, input_names=["batch"], dynamic_axes={"batch": [0, 1]} + ) + iter = graph.nodes() + autograd1 = next(iter) + autograd2 = next(iter) + self.assertEqual(autograd1.kind(), "prim::PythonOp") + self.assertEqual(autograd2.kind(), "prim::PythonOp") + 
self.assertNotEqual(autograd1.s("module"), autograd2.s("module")) + def test_unused_initializers(self): class Model(torch.nn.Module): def __init__(self): super(Model, self).__init__() - self.conv2 = torch.nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(1, 1)) + self.conv2 = torch.nn.ConvTranspose2d( + 16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(1, 1) + ) self.k_proj = torch.nn.Linear(5, 5, bias=True) def forward(self, x): @@ -1009,10 +1253,14 @@ def forward(self, x): x = torch.randn(20, 16, 50, 100) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - _, params_dict, __ = self._model_to_graph(Model(), (x, ), do_constant_folding=False, - operator_export_type=OperatorExportTypes.ONNX, - input_names=["x"], - dynamic_axes={"x": [0, 1, 2, 3]}) + _, params_dict, __ = self._model_to_graph( + Model(), + (x,), + do_constant_folding=False, + operator_export_type=OperatorExportTypes.ONNX, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) self.assertEqual(len(params_dict), 2) @@ -1020,7 +1268,9 @@ def test_scripting_param(self): class MyModule(torch.nn.Module): def __init__(self): super(MyModule, self).__init__() - self.conv = torch.nn.Conv2d(3, 16, kernel_size=1, stride=2, padding=3, bias=True) + self.conv = torch.nn.Conv2d( + 3, 16, kernel_size=1, stride=2, padding=3, bias=True + ) self.bn = torch.nn.BatchNorm2d(16, affine=True) def forward(self, x): @@ -1032,16 +1282,23 @@ def forward(self, x): x = torch.randn(10, 3, 128, 128) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(model, (x,), do_constant_folding=True, - operator_export_type=OperatorExportTypes.ONNX, - training=torch.onnx.TrainingMode.TRAINING, - input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + graph, _, __ = self._model_to_graph( + model, + (x,), + do_constant_folding=True, + operator_export_type=OperatorExportTypes.ONNX, + training=torch.onnx.TrainingMode.TRAINING, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) graph_input_params = [param.debugName() for param in graph.inputs()] for item in dict(model.named_parameters()): self.assertIn( - item, graph_input_params, - "Graph parameter names does not match model parameters.") + item, + graph_input_params, + "Graph parameter names does not match model parameters.", + ) def test_modifying_params(self): class MyModel(torch.nn.Module): @@ -1055,13 +1312,19 @@ def forward(self, x): return y x = torch.tensor([1, 2]) + # Move import to local as caffe2 backend requires additional build flag, + # and is only used in this test case. 
+ import caffe2.python.onnx.backend as backend + verify(MyModel(), x, backend, do_constant_folding=False) def test_fuse_conv_bn(self): class Fuse(torch.nn.Module): def __init__(self): super(Fuse, self).__init__() - self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=True) + self.conv = torch.nn.Conv2d( + 3, 2, kernel_size=1, stride=2, padding=3, bias=True + ) self.bn = torch.nn.BatchNorm2d(2) def forward(self, x): @@ -1069,9 +1332,13 @@ def forward(self, x): return self.bn(out) x = torch.randn(2, 3, 2, 2, requires_grad=True) - graph, _, __ = self._model_to_graph(Fuse(), (x, ), - training=TrainingMode.EVAL, input_names=["x"], - dynamic_axes={"x": [0, 1, 2, 3]}) + graph, _, __ = self._model_to_graph( + Fuse(), + (x,), + training=TrainingMode.EVAL, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::BatchNormalization") self.assertEqual(node.kind(), "onnx::Conv") @@ -1081,17 +1348,20 @@ def forward(self, x): def test_fuse_resnet18(self): model = torchvision.models.resnet18(pretrained=False) x = torch.randn(2, 3, 224, 224, requires_grad=True) - graph, _, __ = self._model_to_graph(model, (x, ), - training=TrainingMode.EVAL, - input_names=["x"], dynamic_axes={"x": [0, 1, 2, 3]}) + graph, _, __ = self._model_to_graph( + model, + (x,), + training=TrainingMode.EVAL, + input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + ) for node in graph.nodes(): self.assertNotEqual(node.kind(), "onnx::BatchNormalization") def test_onnx_function_substitution_pass(self): - @torch.jit.script - def f(x : torch.Tensor, y : torch.Tensor): + def f(x: torch.Tensor, y: torch.Tensor): z = x - y return x + z @@ -1106,16 +1376,22 @@ def forward(self, x, y): input_2 = torch.tensor(12) _set_opset_version(self.opset_version) _set_operator_export_type(OperatorExportTypes.ONNX) - graph, _, __ = self._model_to_graph(MyModule(), (input_1, input_2), do_constant_folding=True, - operator_export_type=OperatorExportTypes.ONNX, - input_names=["input_1", "input_2"], - dynamic_axes={"input_1": [0], "input_2": [0]}) + graph, _, __ = self._model_to_graph( + MyModule(), + (input_1, input_2), + do_constant_folding=True, + operator_export_type=OperatorExportTypes.ONNX, + input_names=["input_1", "input_2"], + dynamic_axes={"input_1": [0], "input_2": [0]}, + ) # Check that the prim::Constant node in the graph for representing the # scripted function `f` is removed and the following prim::CallFunction # is replced by inline graph, with onnx::Sub and onnx::Add nodes. for node in graph.nodes(): self.assertNotEqual(node.kind(), "prim::Constant") - self.assertEqual(len(list(graph.nodes())), 2) # onnx::Sub and onnx::Add nodes only. + self.assertEqual( + len(list(graph.nodes())), 2 + ) # onnx::Sub and onnx::Add nodes only. 
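# ---------------------------------------------------------------------------
# Hedged aside (not part of this diff): a minimal, self-contained sketch of
# the conv/batch-norm folding behavior that test_fuse_conv_bn and
# test_fuse_resnet18 above exercise. Assuming an environment with `torch` and
# `onnx` installed, exporting a Conv2d+BatchNorm2d pair in eval mode is
# expected to fold the BatchNorm into the Conv weights, so the serialized
# graph should contain no BatchNormalization nodes. The module and function
# names here (ConvBN, count_batch_norm_nodes) are illustrative only.
import io

import onnx
import torch


class ConvBN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3, bias=True)
        self.bn = torch.nn.BatchNorm2d(8)

    def forward(self, x):
        return self.bn(self.conv(x))


def count_batch_norm_nodes() -> int:
    # Eval mode (the exporter default) enables the conv/bn fusion pass.
    model = ConvBN().eval()
    f = io.BytesIO()
    torch.onnx.export(model, (torch.randn(1, 3, 16, 16),), f)
    graph = onnx.load(io.BytesIO(f.getvalue())).graph
    # Expected to be 0 when fusion succeeds; the tests above assert the same
    # property by scanning graph.nodes() for "onnx::BatchNormalization".
    return sum(1 for node in graph.node if node.op_type == "BatchNormalization")
# ---------------------------------------------------------------------------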
def test_onnx_value_name(self): class MyModule(torch.nn.Module): @@ -1139,9 +1415,13 @@ def forward(self, x): f = io.BytesIO() model.eval() - torch.onnx.export(model, (x,), f, - opset_version=self.opset_version, - keep_initializers_as_inputs=True) + torch.onnx.export( + model, + (x,), + f, + opset_version=self.opset_version, + keep_initializers_as_inputs=True, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.input[1].name, "in_weight") self.assertEqual(graph.graph.input[2].name, "in_bias") @@ -1164,7 +1444,7 @@ def forward(self, x): module = RenamedIntermediateModule() - g, p, o = utils._model_to_graph(module, torch.ones(1, 10), output_names=['y']) + g, p, o = utils._model_to_graph(module, torch.ones(1, 10), output_names=["y"]) renamed_intermediate = 0 for n in g.nodes(): for v in n.inputs(): @@ -1172,6 +1452,101 @@ def forward(self, x): renamed_intermediate += 1 self.assertEqual(renamed_intermediate, 2) + def _test_deduplicate_initializers(self, torchscript=False): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.layer1 = torch.nn.Linear(3, 3) + self.layer2 = torch.nn.Linear(3, 3) + + # Reusing layers. + self.layer3 = self.layer1 + + # Reusing parameters. + self.layer2.weight = self.layer1.weight + self.layer1.bias = self.layer2.bias + + # Parameter with different tensors equal in value. + self.param1 = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0])) + self.param2 = torch.nn.Parameter(torch.tensor([1.0, 2.0, 3.0])) + + def forward(self, x): + return ( + self.layer3(self.layer2(self.layer1(x))) + self.param1 + self.param2 + ) + + model = torch.jit.script(MyModule()) if torchscript else MyModule() + + x = torch.randn(3, 3) + param_name_set = set([k for k, _ in model.named_parameters()]) + + # Test training mode. + model.train() + f = io.BytesIO() + torch.onnx.export( + model, + (x,), + f, + training=TrainingMode.TRAINING, + opset_version=self.opset_version, + ) + graph = onnx.load(io.BytesIO(f.getvalue())) + self.assertSetEqual( + set([i.name for i in graph.graph.initializer]), param_name_set + ) + + model.train() + f = io.BytesIO() + torch.onnx.export( + model, + (x,), + f, + training=TrainingMode.PRESERVE, + opset_version=self.opset_version, + ) + graph = onnx.load(io.BytesIO(f.getvalue())) + self.assertSetEqual( + set([i.name for i in graph.graph.initializer]), param_name_set + ) + + # Test eval mode. 
+ model.eval() + f = io.BytesIO() + torch.onnx.export(model, (x,), f, opset_version=self.opset_version) + graph = onnx.load(io.BytesIO(f.getvalue())) + param_name_set.remove("param2") + self.assertSetEqual( + set([i.name for i in graph.graph.initializer]), param_name_set + ) + + def test_deduplicate_initializers(self): + self._test_deduplicate_initializers(torchscript=False) + + def test_deduplicate_initializers_torchscript(self): + self._test_deduplicate_initializers(torchscript=True) + + @skipIfNoCuda + def test_deduplicate_initializers_diff_devices(self): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.w_cpu = torch.nn.Parameter( + torch.ones(3, device=torch.device("cpu")) + ) + self.w_cuda = torch.nn.Parameter( + torch.ones(3, device=torch.device("cuda")) + ) + + def forward(self, x, y): + return x + self.w_cpu, y + self.w_cuda + + x = torch.randn(3, 3, device=torch.device("cpu")) + y = torch.randn(3, 3, device=torch.device("cuda")) + f = io.BytesIO() + torch.onnx.export(Model(), (x, y), f, opset_version=self.opset_version) + graph = onnx.load(io.BytesIO(f.getvalue())) + self.assertSetEqual(set([i.name for i in graph.graph.initializer]), {"w_cpu"}) + def test_duplicated_output_node(self): class DuplicatedOutputNet(torch.nn.Module): def __init__(self, input_size, num_classes): @@ -1195,18 +1570,21 @@ def forward(self, input0, input1): "output-1": {0: "output-1_dim0", 1: "output-1_dim1"}, "output-2": {0: "output-2_dim0", 1: "output-2_dim1"}, "output-3": {0: "output-3_dim0", 1: "output-3_dim1"}, - "output-4": {0: "output-4_dim0", 1: "output-4_dim1"}} - - torch.onnx.export(pt_model, - (x, x), - f, - input_names=["input0", "input1"], - output_names=["output-0", "output-1", "output-2", "output-3", "output-4"], - do_constant_folding=False, - training=torch.onnx.TrainingMode.TRAINING, - dynamic_axes=dynamic_axes, - verbose=True, - keep_initializers_as_inputs=True) + "output-4": {0: "output-4_dim0", 1: "output-4_dim1"}, + } + + torch.onnx.export( + pt_model, + (x, x), + f, + input_names=["input0", "input1"], + output_names=["output-0", "output-1", "output-2", "output-3", "output-4"], + do_constant_folding=False, + training=torch.onnx.TrainingMode.TRAINING, + dynamic_axes=dynamic_axes, + verbose=True, + keep_initializers_as_inputs=True, + ) graph = onnx.load(io.BytesIO(f.getvalue())) self.assertEqual(graph.graph.input[0].name, "input0") @@ -1219,6 +1597,37 @@ def forward(self, input0, input1): self.assertEqual(graph.graph.node[3].op_type, "Gemm") self.assertEqual(graph.graph.node[4].op_type, "Identity") + def test_bad_symbolic_registration(self): + _onnx_opset_version = 9 + + @parse_args("v") + def cat(g, tensor_list, dim): + tensors = _unpack_list(tensor_list) + return g.op("Concat", *tensors, axis_i=dim) + + register_custom_op_symbolic("::cat", cat, _onnx_opset_version) + + class CatModel(torch.nn.Module): + def forward(self, x): + return torch.cat((x, x, x), 0) + + model = CatModel() + x = torch.randn(2, 3) + f = io.BytesIO() + self.assertExpectedRaisesInline( + AssertionError, + lambda: torch.onnx.export( + model, (x,), f, opset_version=_onnx_opset_version + ), + ( + "A mismatch between the number of arguments (2) and their descriptors (1) was found at symbolic function " + "'cat'. If you believe this is not due to custom symbolic implementation within your code or an external " + "library, please file an issue at https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml to " + "report this bug." 
+ ), + ) + unregister_custom_op_symbolic("::cat", _onnx_opset_version) + class TestUtilityFuns_opset10(TestUtilityFuns_opset9): opset_version = 10 diff --git a/test/onnx/test_verify.py b/test/onnx/test_verify.py index 2884fa86472c..083b76f3bfc8 100644 --- a/test/onnx/test_verify.py +++ b/test/onnx/test_verify.py @@ -1,12 +1,12 @@ # Owner(s): ["module: onnx"] +from test_pytorch_common import TestCase, run_tests +from verify import verify + +import caffe2.python.onnx.backend as backend import torch from torch.autograd import Function from torch.nn import Module, Parameter -import caffe2.python.onnx.backend as backend -from verify import verify - -from test_pytorch_common import TestCase, run_tests class TestVerify(TestCase): diff --git a/test/onnx/verify.py b/test/onnx/verify.py index 4897a8f43f4a..f8f7b73f4b11 100644 --- a/test/onnx/verify.py +++ b/test/onnx/verify.py @@ -1,14 +1,13 @@ -import torch -import torch.jit -import torch.onnx +import difflib +import io +import numpy as np import onnx import onnx.helper -import numpy as np - -import difflib -import io +import torch +import torch.jit +import torch.onnx def colonize(msg, sep=": "): @@ -39,6 +38,7 @@ def __init__(self, msg, rtol=1e-3, atol=1e-5): # can be used class ShortCircuit(Exception): pass + self.exc_class = ShortCircuit def requireAlmostEqual(self, x, y, msg=None): @@ -67,8 +67,9 @@ def almostEqualAndThen(self, x, y, msg, k): """ if isinstance(x, np.ndarray) and isinstance(y, np.ndarray): try: - np.testing.assert_allclose(x, y, rtol=self.rtol, atol=self.atol, - equal_nan=True, verbose=True) + np.testing.assert_allclose( + x, y, rtol=self.rtol, atol=self.atol, equal_nan=True, verbose=True + ) except AssertionError as e: raise k("{}{}".format(colonize(msg), str(e).lstrip())) @@ -117,8 +118,11 @@ def equalAndThen(self, x, y, msg, k): if len(sx) > 40 or len(sy) > 40 or "\n" in sx or "\n" in sy: # long form l = "=" * 50 - k("\n{}The value\n{}\n{}\n{}\n\ndoes not equal\n\n{}\n{}\n{}" - .format(colonize(msg, ":\n"), l, sx, l, l, sy, l)) + k( + "\n{}The value\n{}\n{}\n{}\n\ndoes not equal\n\n{}\n{}\n{}".format( + colonize(msg, ":\n"), l, sx, l, l, sy, l + ) + ) else: k("{}{} != {}".format(colonize(msg), sx, sy)) @@ -193,6 +197,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): if exc_type == parent_self.exc_class: return True + return Recover() def addErrCtxt(self, msg): @@ -212,6 +217,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): parent_self.context.pop() + return AddContext() def __enter__(self): @@ -225,12 +231,25 @@ def __exit__(self, exc_type, exc_value, traceback): if exc_type == self.exc_class: raise RuntimeError("ShortCircuit was raised, but no errors were recorded") -def verify(model, args, backend, verbose=False, training=torch.onnx.TrainingMode.EVAL, rtol=1e-3, atol=1e-7, - test_args=2, do_constant_folding=True, opset_version=None, - keep_initializers_as_inputs=True, add_node_names=False, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX, - input_names=None, dynamic_axes=None, - remained_onnx_input_idx=None): + +def verify( + model, + args, + backend, + verbose=False, + training=torch.onnx.TrainingMode.EVAL, + rtol=1e-3, + atol=1e-7, + test_args=2, + do_constant_folding=True, + opset_version=None, + keep_initializers_as_inputs=True, + add_node_names=False, + operator_export_type=torch.onnx.OperatorExportTypes.ONNX, + input_names=None, + dynamic_axes=None, + remained_onnx_input_idx=None, +): """ Export a model into ONNX, import it into a specified ONNX 
backend, and then on a few random inputs verify that PyTorch and the backend produced the same @@ -279,6 +298,7 @@ def verify(model, args, backend, verbose=False, training=torch.onnx.TrainingMode dynamic_axes (dict of (string, list)): dynamic_axes. remained_onnx_input_idx (list of int, default None): The remained ONNX input index. """ + def _nested_map(condition, fn, condition_msg=None): def _map(obj): if condition(obj): @@ -288,11 +308,18 @@ def _map(obj): elif isinstance(obj, (list, tuple)): return type(obj)(_map(x) for x in obj) else: - raise ValueError("Auto nesting doesn't know how to process " - "an input object of type " + torch.typename(obj) + - (". Accepted types: " + condition_msg + - ", or lists/tuples of them" - if condition_msg else "")) + raise ValueError( + "Auto nesting doesn't know how to process " + "an input object of type " + + torch.typename(obj) + + ( + ". Accepted types: " + + condition_msg + + ", or lists/tuples of them" + if condition_msg + else "" + ) + ) return _map @@ -309,11 +336,18 @@ def _iter(obj): elif allow_unknown: yield obj else: - raise ValueError("Auto nesting doesn't know how to process " - "an input object of type " + torch.typename(obj) + - (". Accepted types: " + condition_msg + - ", or lists/tuples of them" - if condition_msg else "")) + raise ValueError( + "Auto nesting doesn't know how to process " + "an input object of type " + + torch.typename(obj) + + ( + ". Accepted types: " + + condition_msg + + ", or lists/tuples of them" + if condition_msg + else "" + ) + ) return _iter @@ -352,14 +386,19 @@ def load_bytes(b): with torch.onnx.select_model_mode_for_export(model, training): proto_bytes = io.BytesIO() - torch_out = torch.onnx._export(model, args, proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding, - opset_version=opset_version, - keep_initializers_as_inputs=keep_initializers_as_inputs, - add_node_names=add_node_names, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes) + torch_out = torch.onnx._export( + model, + args, + proto_bytes, + verbose=verbose, + do_constant_folding=do_constant_folding, + opset_version=opset_version, + keep_initializers_as_inputs=keep_initializers_as_inputs, + add_node_names=add_node_names, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) if isinstance(model, torch.jit.ScriptModule): torch_out = model(*args) proto = load_bytes(proto_bytes) @@ -367,14 +406,19 @@ def load_bytes(b): def run(args, remained_onnx_input_idx): alt_proto_bytes = io.BytesIO() - torch_out = torch.onnx._export(model, args, alt_proto_bytes, verbose=verbose, - do_constant_folding=do_constant_folding, - opset_version=opset_version, - keep_initializers_as_inputs=keep_initializers_as_inputs, - add_node_names=add_node_names, - operator_export_type=operator_export_type, - input_names=input_names, - dynamic_axes=dynamic_axes) + torch_out = torch.onnx._export( + model, + args, + alt_proto_bytes, + verbose=verbose, + do_constant_folding=do_constant_folding, + opset_version=opset_version, + keep_initializers_as_inputs=keep_initializers_as_inputs, + add_node_names=add_node_names, + operator_export_type=operator_export_type, + input_names=input_names, + dynamic_axes=dynamic_axes, + ) if isinstance(model, torch.jit.ScriptModule): torch_out = model(*args) alt_proto = load_bytes(alt_proto_bytes) @@ -386,26 +430,36 @@ def run(args, remained_onnx_input_idx): with Errors(msg, rtol=rtol, atol=atol) as errs: # First, check if we have the same 
number of parameters, and # that they"re the same order. If they don"t, something has *really* gone wrong. - initializer_order_hint = ("This is really strange! The second time I exported your model,\n" - "it had a different set of parameters. Are you assigning Parameters\n" - "in the forward() of your model definition?") + initializer_order_hint = ( + "This is really strange! The second time I exported your model,\n" + "it had a different set of parameters. Are you assigning Parameters\n" + "in the forward() of your model definition?" + ) with errs.addErrCtxt(initializer_order_hint): - errs.requireEqual([x.name for x in proto.graph.initializer], - [x.name for x in alt_proto.graph.initializer], - msg="Parameters list differs") + errs.requireEqual( + [x.name for x in proto.graph.initializer], + [x.name for x in alt_proto.graph.initializer], + msg="Parameters list differs", + ) # Now check if the embedded parameters are actually the same - initializer_hint = ("A difference in embedded parameters usually means that\n" - "your model is updating parameters/buffers even in inference\n" - "mode. Look for a buggy nn.Module which isn't respecting train().\n") + initializer_hint = ( + "A difference in embedded parameters usually means that\n" + "your model is updating parameters/buffers even in inference\n" + "mode. Look for a buggy nn.Module which isn't respecting train().\n" + ) with errs.recover(), errs.addErrCtxt(initializer_hint): - for x, y in zip(proto.graph.initializer, alt_proto.graph.initializer): + for x, y in zip( + proto.graph.initializer, alt_proto.graph.initializer + ): errs.checkEqual(x, y) # Next, check if the model structure lines up. - structure_hint = ("A difference in model structure usually means that\n" - "your model has dynamic control flow. These models are not\n" - "currently supported by the exporter.") + structure_hint = ( + "A difference in model structure usually means that\n" + "your model has dynamic control flow. These models are not\n" + "currently supported by the exporter." + ) with errs.recover(), errs.addErrCtxt(structure_hint): # Delete initializers since we already tested them stripped_proto = onnx.ModelProto() @@ -417,12 +471,16 @@ def run(args, remained_onnx_input_idx): del stripped_alt_proto.graph.initializer[:] # Compare the printable graph representations first - errs.requireMultiLineEqual(onnx.helper.printable_graph(stripped_proto.graph), - onnx.helper.printable_graph(stripped_alt_proto.graph)) + errs.requireMultiLineEqual( + onnx.helper.printable_graph(stripped_proto.graph), + onnx.helper.printable_graph(stripped_alt_proto.graph), + ) # Compare the actual protobuf text formats now (not # very user-friendly!) - errs.requireMultiLineEqual(str(stripped_proto), str(stripped_alt_proto)) + errs.requireMultiLineEqual( + str(stripped_proto), str(stripped_alt_proto) + ) # One last ditch effort, using built-in equality on # protobufs @@ -437,7 +495,9 @@ def run(args, remained_onnx_input_idx): # case. We EXPECT these requires to fail. If they don't, # that is a bug in verify errs.requireEqual(proto, alt_proto) - errs.requireEqual(proto_bytes.getvalue(), alt_proto_bytes.getvalue()) + errs.requireEqual( + proto_bytes.getvalue(), alt_proto_bytes.getvalue() + ) raise AssertionError() # TODO: test that the traced model also returns the same thing... 
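# --- editorial sketch, not part of the patch --------------------------------
# The core of verify() above is "export the same model twice and require the
# two protos to agree" before any backend is run.  A stripped-down standalone
# version of that idea; TwoLayer is a hypothetical module, and the real helper
# additionally diffs initializers and falls back to raw protobuf equality.
import io
import onnx
import onnx.helper
import torch

class TwoLayer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(4, 4)
        self.fc2 = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

def export_proto(model, x):
    buf = io.BytesIO()
    torch.onnx.export(model, (x,), buf, opset_version=13)
    return onnx.load(io.BytesIO(buf.getvalue()))

model, x = TwoLayer().eval(), torch.randn(1, 4)
g1 = onnx.helper.printable_graph(export_proto(model, x).graph)
g2 = onnx.helper.printable_graph(export_proto(model, x).graph)
# A mismatch here usually means parameters are created inside forward() or the
# model has dynamic control flow -- the same hints verify() attaches above.
assert g1 == g2
# -----------------------------------------------------------------------------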
@@ -457,12 +517,18 @@ def run_helper(torch_out, args, remained_onnx_input_idx): torch_out, _ = torch._C._jit_flatten(torch_out) # NB: onnx backend NEVER returns bare numpy array msg = "ONNX backend returned different results from PyTorch" - result_hint = ("If you are not using trained parameters, a difference in results\n" - "could mean that your network is numerically unstable. Otherwise\n" - "it indicates a bug in PyTorch/ONNX; please file a bug report.") - with Errors(msg, rtol=rtol, atol=atol) as errs, errs.addErrCtxt(result_hint): + result_hint = ( + "If you are not using trained parameters, a difference in results\n" + "could mean that your network is numerically unstable. Otherwise\n" + "it indicates a bug in PyTorch/ONNX; please file a bug report." + ) + with Errors(msg, rtol=rtol, atol=atol) as errs, errs.addErrCtxt( + result_hint + ): for i, (x, y) in enumerate(zip(torch_out, backend_out)): - errs.checkAlmostEqual(x.data.cpu().numpy(), y, "In output {}".format(i)) + errs.checkAlmostEqual( + x.data.cpu().numpy(), y, "In output {}".format(i) + ) run_helper(torch_out, args, remained_onnx_input_idx) diff --git a/test/package/package_a/use_dunder_package.py b/test/package/package_a/use_dunder_package.py index 119cb4ee7b5c..4e0b2b3ebeac 100644 --- a/test/package/package_a/use_dunder_package.py +++ b/test/package/package_a/use_dunder_package.py @@ -3,7 +3,6 @@ def is_from_package(): return True - else: def is_from_package(): diff --git a/test/package/package_c/test_module.py b/test/package/package_c/test_module.py index 98fd7310eedc..c0d6f41839ea 100644 --- a/test/package/package_c/test_module.py +++ b/test/package/package_c/test_module.py @@ -14,7 +14,6 @@ def forward(self, x): x = a_non_torch_leaf(x, x) return torch.relu(x + 3.0) - except ImportError: pass diff --git a/test/package/package_e/test_nn_module.pt b/test/package/package_e/test_nn_module.pt new file mode 100644 index 000000000000..1c1a8964a8a4 Binary files /dev/null and b/test/package/package_e/test_nn_module.pt differ diff --git a/test/package/test_dependency_api.py b/test/package/test_dependency_api.py index be867528282d..9f1a9c9899e8 100644 --- a/test/package/test_dependency_api.py +++ b/test/package/test_dependency_api.py @@ -182,7 +182,7 @@ def test_pickle_mocked(self): obj2 = package_a.PackageAObject(obj) buffer = BytesIO() - with self.assertRaises(NotImplementedError): + with self.assertRaises(PackagingError): with PackageExporter(buffer) as he: he.mock(include="package_a.subpackage") he.intern("**") diff --git a/test/package/test_misc.py b/test/package/test_misc.py index 659355b62e59..480217b8feb3 100644 --- a/test/package/test_misc.py +++ b/test/package/test_misc.py @@ -2,12 +2,15 @@ # Owner(s): ["oncall: package/deploy"] import inspect +import platform from io import BytesIO +from pathlib import Path from textwrap import dedent +from unittest import skipIf from torch.package import PackageExporter, PackageImporter, is_from_package from torch.package.package_exporter import PackagingError -from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.common_utils import IS_FBCODE, IS_SANDCASTLE, run_tests try: from .common import PackageTestCase @@ -31,6 +34,7 @@ def test_file_structure(self): """\ ├── .data │ ├── extern_modules + │ ├── python_version │ └── version ├── main │ └── main @@ -54,6 +58,7 @@ def test_file_structure(self): """\ ├── .data │ ├── extern_modules + │ ├── python_version │ └── version ├── main │ └── main @@ -99,6 +104,36 @@ def test_file_structure(self): 
import_exclude, ) + def test_python_version(self): + """ + Tests that the current python version is stored in the package and is available + via PackageImporter's python_version() method. + """ + buffer = BytesIO() + + with PackageExporter(buffer) as he: + from package_a.test_module import SimpleTest + + he.intern("**") + obj = SimpleTest() + he.save_pickle("obj", "obj.pkl", obj) + + buffer.seek(0) + hi = PackageImporter(buffer) + + self.assertEqual(hi.python_version(), platform.python_version()) + + @skipIf( + IS_FBCODE or IS_SANDCASTLE, + "Tests that use temporary files are disabled in fbcode", + ) + def test_load_python_version_from_package(self): + """Tests loading a package with a python version embdded""" + importer1 = PackageImporter( + f"{Path(__file__).parent}/package_e/test_nn_module.pt" + ) + self.assertEqual(importer1.python_version(), "3.9.7") + def test_file_structure_has_file(self): """ Test Directory's has_file() method. diff --git a/test/quantization/ao_migration/test_ao_migration.py b/test/quantization/ao_migration/test_ao_migration.py index 756507c26552..05931cc5f21b 100644 --- a/test/quantization/ao_migration/test_ao_migration.py +++ b/test/quantization/ao_migration/test_ao_migration.py @@ -111,8 +111,8 @@ def test_function_import_fake_quantize(self): 'FusedMovingAvgObsFakeQuantize', 'default_fake_quant', 'default_weight_fake_quant', - 'default_symmetric_fixed_qparams_fake_quant', - 'default_affine_fixed_qparams_fake_quant', + 'default_fixed_qparams_range_neg1to1_fake_quant', + 'default_fixed_qparams_range_0to1_fake_quant', 'default_per_channel_weight_fake_quant', 'default_histogram_fake_quant', 'default_fused_act_fake_quant', diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py index 35ff8aedaaa7..89b69d1ef182 100644 --- a/test/quantization/ao_migration/test_quantization.py +++ b/test/quantization/ao_migration/test_quantization.py @@ -81,8 +81,8 @@ def test_function_import_fake_quantize(self): 'FusedMovingAvgObsFakeQuantize', 'default_fake_quant', 'default_weight_fake_quant', - 'default_symmetric_fixed_qparams_fake_quant', - 'default_affine_fixed_qparams_fake_quant', + 'default_fixed_qparams_range_neg1to1_fake_quant', + 'default_fixed_qparams_range_0to1_fake_quant', 'default_per_channel_weight_fake_quant', 'default_histogram_fake_quant', 'default_fused_act_fake_quant', diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py index 23ee9c580005..223134724f7b 100644 --- a/test/quantization/ao_migration/test_quantization_fx.py +++ b/test/quantization/ao_migration/test_quantization_fx.py @@ -32,7 +32,7 @@ def test_function_import_fx(self): function_list = [ 'prepare', 'convert', - 'Fuser', + 'fuse', ] self._test_function_import('fx', function_list) @@ -155,9 +155,7 @@ def test_package_import_fx_fuse(self): self._test_package_import('fx.fuse') def test_function_import_fx_fuse(self): - function_list = [ - 'Fuser' - ] + function_list = ['fuse'] self._test_function_import('fx.fuse', function_list) def test_package_import_fx_fusion_patterns(self): @@ -170,15 +168,10 @@ def test_function_import_fx_fusion_patterns(self): ] self._test_function_import('fx.fusion_patterns', function_list) - def test_package_import_fx_quantization_types(self): - self._test_package_import('fx.quantization_types') - - def test_function_import_fx_quantization_types(self): - function_list = [ - 'Pattern', - 'QuantizerCls' - ] - 
self._test_function_import('fx.quantization_types', function_list) + # we removed matching test for torch.quantization.fx.quantization_types + # old: torch.quantization.fx.quantization_types + # new: torch.ao.quantization.quantization_types + # both are valid, but we'll deprecate the old path in the future def test_package_import_fx_utils(self): self._test_package_import('fx.utils') @@ -199,7 +192,7 @@ def test_function_import_fx_utils(self): 'create_qparam_nodes', 'all_node_args_have_no_tensors', 'node_return_type_is_int', - 'node_bool_tensor_arg_indexes', + 'get_non_observable_arg_indexes_and_types', 'is_get_tensor_info_node', 'maybe_get_next_module' ] diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py index 613c237bdada..7cbab3be475e 100644 --- a/test/quantization/core/test_quantized_module.py +++ b/test/quantization/core/test_quantized_module.py @@ -6,6 +6,7 @@ import torch.nn.intrinsic.quantized as nniq import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd +import torch.nn.quantized._reference as nnqr import torch.ao.quantization from torch.ao.quantization import ( @@ -26,6 +27,7 @@ override_quantized_engine, override_qengines, qengine_is_qnnpack, + qengine_is_onednn, ) from hypothesis import assume, given from hypothesis import strategies as st @@ -98,7 +100,9 @@ def _test_linear_api_impl(self, batch_size, in_features, out_features, use_bias, zero_points=zero_point_tensor, axis=0, dtype=torch.qint8) else: - W_q = torch.quantize_per_tensor(W, 0.1, 4, torch.qint8) + # ONEDNN only supports symmetric quantization of weight + W_zp = 0 if qengine_is_onednn() else 4 + W_q = torch.quantize_per_tensor(W, 0.1, W_zp, torch.qint8) X = torch.rand(batch_size, in_features).float() X_q = torch.quantize_per_tensor(X, 0.2, 10, torch.quint8) @@ -433,7 +437,7 @@ def test_conv1d_api(self): X_scale = 1.3 X_zero_point = 2 W_scale = [0.5] - W_zero_point = [3] + W_zero_point = [0] if qengine_is_onednn() else [3] Y_scale = 5.0 Y_zero_point = 4 if torch.backends.quantized.engine == 'qnnpack': @@ -500,7 +504,7 @@ def test_conv2d_api(self): X_scale = 1.3 X_zero_point = 2 W_scale = [0.5] - W_zero_point = [3] + W_zero_point = [0] if qengine_is_onednn() else [3] Y_scale = 5.0 Y_zero_point = 4 # use_fused -> quantized class @@ -569,7 +573,7 @@ def test_conv3d_api(self): X_scale = 1.3 X_zero_point = 2 W_scale = [0.5] - W_zero_point = [3] + W_zero_point = [0] if qengine_is_onednn() else [3] Y_scale = 5.0 Y_zero_point = 4 # use_fused -> quantized class @@ -1199,7 +1203,8 @@ def test_dynamic_convtranspose3d(self): def test_linear_api(self, batch_size, in_features, out_features, use_bias, use_default_observer): """test API functionality for nn.quantized.dynamic.Linear""" W = torch.rand(out_features, in_features).float() - W_scale, W_zp = _calculate_dynamic_qparams(W, torch.qint8) + qscheme = torch.per_tensor_symmetric if qengine_is_onednn() else torch.per_tensor_affine + W_scale, W_zp = _calculate_dynamic_qparams(W, torch.qint8, qscheme=qscheme) W_q = torch.quantize_per_tensor(W, W_scale, W_zp, torch.qint8) X = torch.rand(batch_size, in_features).float() B = torch.rand(out_features).float() if use_bias else None @@ -1310,8 +1315,8 @@ def test_lstm_api(self, dtype, bidirectional): bias_keys.append(key_name1) bias_keys.append(key_name2) - if not (dtype == torch.float16 and torch.backends.quantized.engine == "qnnpack"): - # fp16 dynamic quant is not supported for qnnpack + if not (dtype == torch.float16 and torch.backends.quantized.engine in 
("qnnpack", "onednn")): + # fp16 dynamic quant is not supported for qnnpack or onednn x = torch.randn(seq_len, batch, input_size) h = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) c = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) @@ -1361,8 +1366,8 @@ def test_gru_api(self): # instantiated for all engines and dtypes for dtype in [torch.qint8, torch.float16]: - if dtype == torch.float16 and torch.backends.quantized.engine == "qnnpack": - # fp16 dynamic quant is not supported for qnnpack + if dtype == torch.float16 and torch.backends.quantized.engine in ("qnnpack", "onednn"): + # fp16 dynamic quant is not supported for qnnpack or onednn continue # Test default instantiation seq_len = 4 @@ -1434,8 +1439,8 @@ def test_cell_api(self, dtype): 'RNNReLU': torch.ops.quantized.quantized_rnn_relu_cell_dynamic} for rnn_type in cell_dict.keys(): - if not (dtype == torch.float16 and torch.backends.quantized.engine == "qnnpack"): - # fp16 dynamic quant is not supported for qnnpack + if not (dtype == torch.float16 and torch.backends.quantized.engine in ("qnnpack", "onednn")): + # fp16 dynamic quant is not supported for qnnpack or onednn kwargs = {'input_size': input_size, 'hidden_size': hidden_size, 'bias': bias, 'dtype': dtype} if rnn_type == 'RNNReLU': kwargs['nonlinearity'] = "relu" @@ -1453,3 +1458,204 @@ def test_cell_api(self, dtype): bias_keys = ['bias_ih', 'bias_hh'] self.check_eager_serialization(cell_dq, cell_dict[rnn_type](**kwargs), [x]) self.check_weight_bias_api(cell_dq, weight_keys, bias_keys) + +class TestReferenceQuantizedModule(QuantizationTestCase): + def _quant_dequant_weight(self, weight, weight_qparams): + qscheme = weight_qparams["qscheme"] + scale = weight_qparams["scale"] + zero_point = weight_qparams["zero_point"] + dtype = weight_qparams["dtype"] + if qscheme == torch.per_tensor_affine: + weight = torch.quantize_per_tensor(weight, scale, zero_point, dtype) + else: + # per channel affine + axis = weight_qparams["axis"] + weight = torch.quantize_per_channel(weight, scale, zero_point, axis, dtype) + weight = weight.dequantize() + return weight + + # TODO: add tests for conv and linear + def test_rnn_cell(self): + """ Checks the rnn cell reference quantized modules has correct numerics + This includes LSTMCell, GRUCell, RNNCell + """ + batch = 7 + input_size = 3 + hidden_size = 7 + bias = True + + x = torch.rand(batch, input_size) + h = torch.rand(batch, hidden_size) + cell_dict = {'LSTMCell': torch.nn.LSTMCell, + 'GRUCell': torch.nn.GRUCell, + 'RNNTanh': torch.nn.RNNCell, + 'RNNReLU': torch.nn.RNNCell + } + state = {'LSTMCell': (h, h), + 'GRUCell': h, + 'RNNTanh': h, + 'RNNReLU': h} + + qfn_dict = {'LSTMCell': nnqr.LSTMCell, + 'GRUCell': nnqr.GRUCell, + 'RNNTanh': nnqr.RNNCell, + 'RNNReLU': nnqr.RNNCell} + + for rnn_type in cell_dict.keys(): + kwargs = {'input_size': input_size, 'hidden_size': hidden_size, 'bias': bias} + if rnn_type == 'RNNReLU': + kwargs['nonlinearity'] = "relu" + elif rnn_type == 'RNNTanh': + kwargs['nonlinearity'] = "tanh" + + fp_cell = cell_dict[rnn_type](**kwargs) + # initialize ref rnn cell module + weight_qparams = { + 'qscheme': torch.per_tensor_affine, + 'dtype': torch.quint8, + 'scale': 2.0, + 'zero_point': 5 + } + weight_qparams_dict = { + "weight_ih": weight_qparams, + "weight_hh": weight_qparams, + } + ref_kwargs = kwargs.copy() + ref_kwargs["weight_qparams_dict"] = weight_qparams_dict + ref_cell = qfn_dict[rnn_type](**ref_kwargs) + # reassign the weights from fp32 rnn cell modulea + ref_cell.weight_ih = 
fp_cell.weight_ih + ref_cell.weight_hh = fp_cell.weight_hh + ref_cell.bias_ih = fp_cell.bias_ih + ref_cell.bias_hh = fp_cell.bias_hh + + ref_res = ref_cell(x, state[rnn_type]) + + # change the weight of fp_res, we first want to run a quantie and + # dequantize on the weight + fp_cell.weight_ih = torch.nn.Parameter(self._quant_dequant_weight(fp_cell.weight_ih, weight_qparams_dict["weight_ih"])) + fp_cell.weight_hh = torch.nn.Parameter(self._quant_dequant_weight(fp_cell.weight_hh, weight_qparams_dict["weight_hh"])) + fp_res = fp_cell(x, state[rnn_type]) + self.assertEqual(ref_res[0], fp_res[0], msg="RNNCell module API failed") + self.assertEqual(ref_res[1], fp_res[1], msg="RNNCell module API failed") + + def test_rnn(self): + """ Checks the rnn reference quantized modules has correct numerics + This includes LSTM + """ + seq_len = 4 + batch = 2 + input_size = 3 + hidden_size = 7 + num_layers = 2 + bias = True + for bidirectional in [True, False]: + x = torch.randn(seq_len, batch, input_size) + h = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) + c = torch.randn(num_layers * (bidirectional + 1), batch, hidden_size) + fp32_rnn = torch.nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=False, + dropout=0.0, + bidirectional=bidirectional) + # initialize ref rnn module + weight_qparams = { + 'qscheme': torch.per_tensor_affine, + 'dtype': torch.qint8, + 'scale': 2.0, + 'zero_point': 5 + } + weight_qparams_dict = {key: weight_qparams for key in fp32_rnn._flat_weights_names if key.startswith("weight")} + ref_rnn = nnqr.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=False, + dropout=0.0, + bidirectional=bidirectional, + weight_qparams_dict=weight_qparams_dict) + for wn in fp32_rnn._flat_weights_names: + setattr(ref_rnn, wn, copy.deepcopy(getattr(fp32_rnn, wn))) + + ref_rnn._flat_weights = copy.deepcopy(fp32_rnn._flat_weights) + + # quantize and dequantize the weights for fp32_rnn module + flat_weights = [] + for wn in fp32_rnn._flat_weights_names: + if wn.startswith("weight"): + weight = self._quant_dequant_weight(getattr(fp32_rnn, wn), weight_qparams) + else: + weight = getattr(fp32_rnn, wn) + flat_weights.append(weight) + fp32_rnn._flat_weights = flat_weights + + fp32_res = fp32_rnn(x, (h, c)) + ref_res = ref_rnn(x, (h, c)) + self.assertEqual(fp32_res, ref_res) + + def test_sparse(self): + """ Embedding and EmbeddingBag + """ + num_embeddings = 10 + embedding_dim = 3 + # embedding input + ex = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]) + + # embedding bag input + ebx = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) + offsets = torch.tensor([0, 4], dtype=torch.long) + + fp_to_ref = { + nn.Embedding: (nnqr.Embedding, (ex,)), + nn.EmbeddingBag: (nnqr.EmbeddingBag, (ebx, offsets)), + } + + per_tensor_weight_qparams = { + 'qscheme': torch.per_tensor_affine, + 'dtype': torch.quint8, + 'scale': 2.0, + 'zero_point': 5, + } + + per_channel_weight_qparams = { + 'qscheme': torch.per_channel_affine, + 'dtype': torch.quint8, + 'scale': torch.randn(10), + 'zero_point': torch.randint(0, 255, (10,)), + 'axis': 0, + } + + per_channel_weight_qparams_quint4x2 = { + 'qscheme': torch.per_channel_affine_float_qparams, + 'dtype': torch.quint4x2, + 'scale': torch.randn(10), + 'zero_point': torch.randint(0, 255, (10,)), + 'axis': 0, + } + + weight_qparams_options = [ + per_tensor_weight_qparams, + per_channel_weight_qparams, + per_channel_weight_qparams_quint4x2, + ] 
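# --- editorial sketch, not part of the patch --------------------------------
# Every check in TestReferenceQuantizedModule leans on the trick implemented
# by _quant_dequant_weight above: fake-quantize the fp32 weight via a
# quantize -> dequantize round trip, then compare the fp32 module against the
# reference module run with the same qparams.  A minimal per-tensor example of
# that round trip (the scale/zero_point values here are arbitrary):
import torch

w = torch.randn(4, 4)
w_q = torch.quantize_per_tensor(w, scale=2.0, zero_point=5, dtype=torch.quint8)
w_dq = w_q.dequantize()            # fp32 again, but snapped to the quant grid
print((w - w_dq).abs().max())      # rounding error, roughly bounded by scale/2
# (the Embedding/EmbeddingBag sweep over the qparams options continues below)
# -----------------------------------------------------------------------------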
+ for fp_cls, weight_qparams in itertools.product([nn.Embedding, nn.EmbeddingBag], weight_qparams_options): + # TODO: torch.quint4x2 not supported in quantize_per_channel, need to add support + if weight_qparams['dtype'] == torch.quint4x2: + continue + ref_cls, args = fp_to_ref[fp_cls] + + fp32_embedding = fp_cls(num_embeddings, embedding_dim) + + ref_embedding = ref_cls(num_embeddings, embedding_dim, weight_qparams=weight_qparams) + ref_embedding.weight = fp32_embedding.weight + + # quantize and dequantize the weight for fp32 module + fp32_embedding.weight = torch.nn.Parameter(self._quant_dequant_weight(fp32_embedding.weight, weight_qparams)) + + fp32_res = fp32_embedding(*args) + ref_res = ref_embedding(*args) + self.assertEqual(fp32_res, ref_res) diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index be84e7bd4e81..935204a3de49 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -26,9 +26,13 @@ from torch.testing._internal.common_quantization import skipIfNoFBGEMM, skipIfNoQNNPACK from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \ override_quantized_engine, supported_qengines, override_qengines, _snr -from torch.testing._internal.common_quantized import qengine_is_qnnpack +from torch.testing._internal.common_quantized import ( + qengine_is_qnnpack, + qengine_is_onednn, +) from torch.ao.quantization import PerChannelMinMaxObserver from torch.testing._internal.common_cuda import TEST_CUDNN +import torch.backends.xnnpack from typing import Optional @@ -71,7 +75,7 @@ def avoid_vpmaddubsw_overflow_linear( # Reference quantized Linear operator -def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): +def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp, dtype=np.uint8): X_q = np.reshape(X_q, (-1, X_q.shape[X_q.ndim - 1])) row_offsets_ref = X_q.sum(axis=1).astype(np.int32).reshape((-1, 1)) col_offsets_ref = W_q.sum(axis=1).astype(np.int32).reshape((1, -1)) @@ -85,7 +89,7 @@ def qlinear_ref(X_q, X_scale, X_zp, W_q, W_scale, W_zp, b_q, Y_scale, Y_zp): ) if b_q is not None: Prod_XqWq_ref += b_q - Y_q_ref = _quantize(Prod_XqWq_ref, Y_scale / (X_scale * W_scale), Y_zp) + Y_q_ref = _quantize(Prod_XqWq_ref, Y_scale / (X_scale * W_scale), Y_zp, dtype=dtype) return Y_q_ref """Computes the output shape given pooling parameters.""" @@ -441,8 +445,9 @@ def test_qgelu(self): shapes = ((4,), (4, 4), (4, 4, 4), (4, 4, 4, 4)) dtypes = (torch.quint8, torch.qint8) memory_formats = (torch.channels_last, torch.contiguous_format) - test_cases = itertools.product(shapes, dtypes, memory_formats) - for shape, dtype, memory_format in test_cases: + approximation = ['none', 'tanh'] + test_cases = itertools.product(shapes, dtypes, memory_formats, approximation) + for shape, dtype, memory_format, approximate in test_cases: if memory_format == torch.channels_last and len(shape) != 4: continue X, scale, zero_point, torch_type = \ @@ -454,7 +459,7 @@ def test_qgelu(self): dqX = qX.dequantize() op = torch.nn.functional.gelu - dqY = op(dqX) + dqY = op(dqX, approximate=approximate) qY = torch.quantize_per_tensor(dqY, scale=scale, zero_point=zero_point, dtype=torch_type) qY_hat = op(qX) @@ -824,6 +829,76 @@ def test_qadd_relu_same_qparams(self): self.assertEqual(qCrelu_hat, qCrelu_out_hat, msg="AddReLU.out failed") + """Tests the correctness of the cudnn add and add_relu op + (Similar to test_qadd_relu_different_qparams, 
will probably merge in the future)""" + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the test_qadd_relu_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_qadd_relu_cudnn(self): + dtype = torch.qint8 + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add + + A = torch.arange(-128, 130, dtype=torch.float).to(torch.device("cuda")) + B = torch.arange(-128, 130, dtype=torch.float).to(torch.device("cuda")) + scale_A = 2.5 + scale_B = 6.3 + scale_C = 12.9 + zero_point = 0 + qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, + dtype=dtype) + qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, + dtype=dtype) + # Add ground truth + C = (qA.dequantize() + qB.dequantize()).to(device="cpu").numpy() + qC = _quantize(C, scale_C, zero_point, dtype=np_dtype[dtype]) + qC_hat = add(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") + + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale_C, zero_point, dtype=np_dtype[dtype]) + qCrelu_hat = add_relu(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") + + """Tests the correctness of the cudnn add and add_relu op for nhwc format""" + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the test_qadd_relu_cudnn_nhwc op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_qadd_relu_cudnn_nhwc(self): + dtype = torch.qint8 + add_relu = torch.ops.quantized.add_relu + add = torch.ops.quantized.add + + A = torch.rand(16, 8, 4, 12).to(device="cuda") + B = torch.rand(16, 8, 4, 12).to(device="cuda") + scale_A = 2.5 + scale_B = 6.3 + scale_C = 12.9 + zero_point = 0 + qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, + dtype=dtype) + qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, + dtype=dtype) + # Add ground truth + C = (qA.dequantize() + qB.dequantize()).to(device="cpu").numpy() + qC = _quantize(C, scale_C, zero_point, dtype=np_dtype[dtype]) + qC_hat = add(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qC, qC_hat.int_repr(), + "Quantized addition failed.") + + # Add + ReLU ground truth + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = _quantize(Crelu, scale_C, zero_point, dtype=np_dtype[dtype]) + qCrelu_hat = add_relu(qA, qB, scale=scale_C, zero_point=zero_point).to(device="cpu") + np.testing.assert_equal(qCrelu, qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") """Tests the correctness of the add and add_relu op.""" def test_qadd_relu_different_qparams(self): @@ -991,9 +1066,20 @@ def test_qmul_relu_different_qparams(self): msg="mulReLU.out failed") """Tests the correctness of the matmul op.""" - def test_qmatmul(self): - A = torch.randn(size=(3, 4), dtype=torch.float32) * 3 - B = torch.randn(size=(4, 5), dtype=torch.float32) * 3 + @given(num_dims=st.integers(2, 5), + outer_dims=st.lists(st.integers(2, 6), min_size=3, max_size=3), + m=st.integers(2, 6), + k=st.integers(2, 6), + n=st.integers(2, 6), + dtypes=st.sampled_from(((torch.qint8, np.int8), + (torch.quint8, np.uint8)))) + def test_qmatmul(self, num_dims, outer_dims, m, 
k, n, dtypes): + (torch_dtype, np_dtype) = dtypes + + size_a = outer_dims[:num_dims - 2] + [m, k] + size_b = outer_dims[:num_dims - 2] + [k, n] + A = torch.randn(size=size_a, dtype=torch.float32) * 3 + B = torch.randn(size=size_b, dtype=torch.float32) * 3 scale_A = 3.1 zero_point_A = 7 @@ -1003,15 +1089,22 @@ def test_qmatmul(self): scale_C = 1.3 zero_point_C = 5 - qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point_A, - dtype=torch.qint8) - qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point_B, - dtype=torch.qint8) + qA = torch.quantize_per_tensor(A, + scale=scale_A, + zero_point=zero_point_A, + dtype=torch_dtype) + qB = torch.quantize_per_tensor(B, + scale=scale_B, + zero_point=zero_point_B, + dtype=torch_dtype) # matmul ground truth C = torch.matmul(qA.dequantize(), qB.dequantize()).numpy() - qC = _quantize(C, scale_C, zero_point_C, dtype=np.int8) - qC_hat = torch.ops.quantized.matmul(qA, qB, scale=scale_C, zero_point=zero_point_C) + qC = _quantize(C, scale_C, zero_point_C, dtype=(np_dtype)) + qC_hat = torch.ops.quantized.matmul(qA, + qB, + scale=scale_C, + zero_point=zero_point_C) np.testing.assert_equal(qC, qC_hat.int_repr(), "Quantized multiplication failed.") @@ -1022,10 +1115,16 @@ def test_qmatmul(self): scales_B = torch.rand(size=(B.shape[axis],)) zero_points_B = torch.randint(low=0, high=5, size=(B.shape[axis],)) - qA = torch.quantize_per_channel(A, scales=scales_A, zero_points=zero_points_A, - axis=axis, dtype=torch.qint8) - qB = torch.quantize_per_channel(B, scales=scales_B, zero_points=zero_points_B, - axis=axis, dtype=torch.qint8) + qA = torch.quantize_per_channel(A, + scales=scales_A, + zero_points=zero_points_A, + axis=axis, + dtype=torch.qint8) + qB = torch.quantize_per_channel(B, + scales=scales_B, + zero_points=zero_points_B, + axis=axis, + dtype=torch.qint8) np.testing.assert_raises_regex(RuntimeError, ".*per-tensor.*", torch.ops.quantized.matmul, @@ -1034,6 +1133,53 @@ def test_qmatmul(self): scale_C, zero_point_C) + + """Tests the correctness of the quantized softmax op.""" + @given(dims=st.lists(st.integers(2, 5), min_size=5, max_size=5)) + def test_qsoftmax(self, dims): + for (num_dims, dim, memory_format) in [ + (2, 1, torch.contiguous_format), # 2d softmax over last dim + (4, 3, torch.contiguous_format), # >2 dims, softmax along last dim + (5, 2, torch.contiguous_format), # >2 dims, softmax along not last dim (requires permute) + (4, 3, torch.channels_last), # >2 dims, softmax along last dim, but not contiguous + (4, 1, torch.channels_last), # Channels Last, doesn't require permute + (5, 1, torch.channels_last_3d), # Channels Last 3D, doesn't require permute + ]: + size = dims[:num_dims] + torch_dtype = torch.quint8 + np_dtype = np.uint8 + + scale_X = 1.3 + zero_point_X = 5 + X = torch.rand(size=size, dtype=torch.float32) * 8 + zero_point_X + X = X.to(memory_format=memory_format) + + scale_Y = 1 / 256 + zero_point_Y = 0 + + qX = torch.quantize_per_tensor(X, + scale=scale_X, + zero_point=zero_point_X, + dtype=torch_dtype) + + + # softmax ground truth + Y = torch.softmax(qX.dequantize(), dim=dim).numpy() + qY = _quantize(Y, scale_Y, zero_point_Y, dtype=np_dtype) + qY_hat = torch.ops.quantized.softmax(qX, + dim=dim, + output_scale=scale_Y, + output_zero_point=zero_point_Y) + + np.testing.assert_equal(qY, qY_hat.int_repr(), + "Quantized softmax failed.") + + """Tests the correctness of the quantized softmax op using qnnpack.""" + @skipIfNoQNNPACK + def test_qsoftmax_qnnpack(self): + with override_quantized_engine('qnnpack'): + 
self.test_qsoftmax() + """Tests the correctness of the mul and mul_relu op.""" def test_qmul_broadcast(self): mul_relu = torch.ops.quantized.mul_relu @@ -1160,6 +1306,52 @@ def test_max_pool1d(self, X, kernel, stride, dilation, padding, ceil_mode): self.assertEqual(a_ref, a_hat.dequantize(), msg="ops.quantized.max_pool1d results are off") + # TODO: merge this test with test_max_pool2d when USE_EXPERIMENTAL_CUDNN_V8_API flag is enabled in CI + """Tests 2D cudnn max pool operation on quantized tensors.""" + @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4, + min_side=1, max_side=10), + # cudnn's support for quantized pooling is limited to + # int8 currently + qparams=hu.qparams(dtypes=[torch.qint8])), + kernel=st.sampled_from((3, 5, 7)), + stride=st.sampled_from((None, 1, 2)), + # currently there is no support for dilation for cudnn + # pooling + dilation=st.integers(1, 1), + padding=st.integers(0, 2), + ceil_mode=st.booleans()) + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the qconv2d_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_max_pool2d_cudnn(self, X, kernel, stride, dilation, padding, ceil_mode): + X, (scale, zero_point, torch_type) = X + assume(kernel // 2 >= padding) # Kernel cannot be overhanging! + iH, iW = X.shape[-2:] + oH = pool_output_shape(iH, kernel, padding, stride, dilation, ceil_mode) + assume(oH > 0) + oW = pool_output_shape(iW, kernel, padding, stride, dilation, ceil_mode) + assume(oW > 0) + + a = torch.from_numpy(X).to(device="cuda") + a_pool = torch.nn.functional.max_pool2d(a, kernel_size=kernel, + stride=stride, + padding=padding, dilation=dilation, + ceil_mode=ceil_mode) + a_ref = torch.quantize_per_tensor(a_pool, scale=scale, + zero_point=zero_point, dtype=torch_type) + a_ref = a_ref.dequantize() + qa = torch.quantize_per_tensor(a, scale=scale, zero_point=zero_point, + dtype=torch_type) + + # Test the ops.quantized separately, because None is not treated. + a_hat = torch.ops.quantized.max_pool2d( + qa, kernel_size=_pair(kernel), + stride=_pair(kernel if stride is None else stride), + padding=_pair(padding), dilation=_pair(dilation), ceil_mode=ceil_mode) + self.assertEqual(a_ref, a_hat.dequantize(), + msg="ops.quantized.max_pool2d results are off") + """Tests 2D max pool operation on quantized tensors.""" @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4, min_side=1, max_side=10), @@ -1621,19 +1813,23 @@ def test_adaptive_avg_pool(self): error_message = r"Results are off for {}:\n\tExpected:\n{}\n\tGot:\n{}" for name, op in ops_under_test.items(): - qX_hat = op(qX, output_size=output_size) - # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType( - X_ref, qX_hat.int_repr(), atol=1.0, - rtol=0, msg=error_message.format(name, X_ref, qX_hat)) - self.assertEqual( - scale, qX_hat.q_scale(), - msg=error_message.format(name + '.scale', scale, - qX_hat.q_scale())) - self.assertEqual( - zero_point, qX_hat.q_zero_point(), - msg=error_message.format(name + '.zero_point', scale, - qX_hat.q_zero_point())) + # TODO: torch.cuda.is_available() should be swapped for a flag that checks if cudnn + # is enabled in the build when cudnn supports adaptive average pooling + devices = ["cpu", "cuda"] if (dim == 2 and torch.cuda.is_available()) else ["cpu"] + for device in devices: + qX_hat = op(qX.to(device=device), output_size=output_size) + # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 + self.assertEqualIgnoreType( + X_ref, qX_hat.int_repr(), atol=1.0, + rtol=0, msg=error_message.format(name, X_ref, qX_hat)) + self.assertEqual( + scale, qX_hat.q_scale(), + msg=error_message.format(name + '.scale', scale, + qX_hat.q_scale())) + self.assertEqual( + zero_point, qX_hat.q_zero_point(), + msg=error_message.format(name + '.zero_point', scale, + qX_hat.q_zero_point())) """Tests adaptive average pool operation on NHWC quantized tensors.""" def test_adaptive_avg_pool3d_ndhwc(self): @@ -2066,7 +2262,7 @@ def test_group_norm(self): torch_type, Y_scale, Y_zero_point, channels_last, \ affine = test_case num_channels = num_groups * channels_per_group - # minimum rank for for channels_last + # minimum rank for channels_last shapes = (batches, num_channels, elements_per_channel, 1) # In the FP kernel, sums and sums of squares are calculated in floating point. @@ -2632,7 +2828,7 @@ def forward( ] q_data = [] - reduce_range = (qengine == 'fbgemm') + reduce_range = (qengine in ('fbgemm', 'onednn')) for idx, x in enumerate(fp_data): scale, zero_point = _calculate_dynamic_qparams( x, dtype=dtype, reduce_range=reduce_range) @@ -2653,7 +2849,13 @@ def forward( mha.eval() # Prepare - mha.qconfig = torch.ao.quantization.get_default_qconfig(qengine) + if qengine_is_onednn(): + # `reduce_range` is False by default for ONEDNN backend + # but the test fails on earlier CPUs without VNNI. + # So we use a default qconfig with `reduce_range=True` here + mha.qconfig = torch.ao.quantization.get_default_qconfig() + else: + mha.qconfig = torch.ao.quantization.get_default_qconfig(qengine) mha_prepared = torch.ao.quantization.prepare( mha, prepare_custom_config_dict=custom_module_config) @@ -2746,7 +2948,7 @@ def test_qlinear(self, batch_size, input_channels, output_channels, (b_value_max - b_value_min) + b_value_min ).astype(np.int32) if use_bias else None - if torch.backends.quantized.engine == 'fbgemm': + if torch.backends.quantized.engine in ('fbgemm', 'onednn'): avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, @@ -2879,6 +3081,19 @@ def test_qlinear_legacy(self, batch_size, input_channels, output_channels): self.assertEqual(Y_fp32, Y_fp32_ref, msg="torch.ops.quantized.fbgemm_linear_dynamic results are off") + @skipIfNoFBGEMM + @given( + input_channels=st.integers(16, 32), + output_channels=st.integers(4, 8), + exponent=st.integers(0, 8)) + def test_linear_prepack_fp16_numerics(self, input_channels, output_channels, exponent): + w = torch.randn(output_channels, input_channels) * 10**exponent + bias = None + w_packed_fp16 = torch.ops.quantized.linear_prepack_fp16(w, bias) + w_unpacked_fp16 = torch.ops.quantized.linear_unpack_fp16(w_packed_fp16) + w_fp16 = w.to(torch.float16).to(torch.float32) + self.assertTrue(torch.equal(w_fp16, w_unpacked_fp16[0])) + @skipIfNoFBGEMM def test_qlinear_dynamic_fp16(self): @@ -2970,8 +3185,8 @@ def test_qlstmGRU(self, num_batches, input_size, hidden_size, for rnn_type in ['LSTM', 'GRU']: for dtype in [torch.qint8, torch.float16]: - # Fp16 quantization is not supported for qnnpack - if torch.backends.quantized.engine == 'qnnpack' and dtype == torch.float16: + # Fp16 quantization is not supported for qnnpack or onednn + if torch.backends.quantized.engine in ('qnnpack', 'onednn') and dtype == torch.float16: continue if torch.backends.quantized.engine == 'qnnpack': @@ -3104,8 +3319,8 @@ def test_qrnncell(self, num_batches, input_size, hidden_size, per_channel_quant) for rnn_type in ['LSTMCell', 'GRUCell', 'RNNTanh', 'RNNReLU']: for dtype in 
[torch.qint8, torch.float16]: - # Fp16 quantization is not supported for qnnpack - if torch.backends.quantized.engine == 'qnnpack' and dtype == torch.float16: + # Fp16 quantization is not supported for qnnpack or onednn + if torch.backends.quantized.engine in ('qnnpack', 'onednn') and dtype == torch.float16: continue if torch.backends.quantized.engine == 'qnnpack': @@ -3246,6 +3461,7 @@ class TestQuantizedLinear(TestCase): def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, use_relu, use_multi_dim_input, use_channelwise): decimal_val = 4 + dtypes = [torch.quint8] if torch.backends.quantized.engine == 'qnnpack': # QNNPACK supports uint8 in the kernels. In the op we shift the int8 # weight values to uint8 to be on par with fbgemm. However, this causes @@ -3253,24 +3469,164 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, # off by one results. decimal_val = 0 + # only qnnpack qengine supports qint8 when xnnpack is available + if torch.backends.xnnpack.enabled: + dtypes.append(torch.qint8) + + for dtype in dtypes: + # No support for channelwise in xnnpack (int8) + # ONEDNN does not support qint8 + if dtype == torch.qint8 and (use_channelwise or qengine_is_onednn()): + return + + nptype = np_dtype[dtype] + qlinear_prepack = torch.ops.quantized.linear_prepack + if use_relu: + qlinear = torch.ops.quantized.linear_relu + else: + qlinear = torch.ops.quantized.linear + if use_multi_dim_input: + batch_size *= 3 # Test the multi-dim input tensor + X_scale = 1.5 + X_zp = 5 + X_value_min = -128 if dtype == torch.qint8 else 0 + X_value_max = 127 if dtype == torch.qint8 else 255 + X_q0 = np.round( + np.random.rand(batch_size, input_channels) * + (X_value_max - X_value_min) + + X_value_min + ).astype(nptype) + + W_scales = np.random.rand(output_channels) + # xnnpack forces W_zp to 0 when using symmetric quantization + # ONEDNN only supports symmetric quantization of weight + if dtype == torch.qint8 or qengine_is_onednn(): + W_zps = np.zeros(output_channels).astype(np.int) + else: + W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int) + # when using symmetric quantization + # special restriction for xnnpack fully connected op weight + # [-127, 127] instead of [-128, 127] + W_value_min = -127 if dtype == torch.qint8 else -128 + W_value_max = 127 + W_q0 = np.round( + np.random.rand(output_channels, input_channels) + * (W_value_max - W_value_min) + + W_value_min + ).astype(np.int8) # weight is always int8_t + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * + (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) if use_bias else None + if torch.backends.quantized.engine in ('fbgemm', 'onednn'): + avoid_vpmaddubsw_overflow_linear( + batch_size, + input_channels, + output_channels, + X_q0, + X_value_min, + X_value_max, + W_q0, + W_value_min, + W_value_max, + ) + X = torch.from_numpy(_dequantize( + X_q0, X_scale, X_zp)).to(dtype=torch.float) + X_q = torch.quantize_per_tensor( + X, scale=X_scale, zero_point=X_zp, dtype=dtype) + if use_channelwise: + W = torch.from_numpy(_dequantize(W_q0, W_scales.reshape( + (-1, 1)), W_zps.reshape((-1, 1)))).to(dtype=torch.float) + W_q = torch.quantize_per_channel(W, scales=torch.from_numpy(W_scales), + zero_points=torch.from_numpy(W_zps), axis=0, dtype=torch.qint8) + b = torch.from_numpy(_dequantize( + b_q0, X_scale * W_scales, 0)).to(dtype=torch.float) if use_bias else None + b_q = torch.quantize_per_channel(b, scales=torch.from_numpy(X_scale * 
W_scales), + zero_points=torch.zeros(output_channels, dtype=torch.long), + axis=0, dtype=torch.qint32) if use_bias else None + else: + W = torch.from_numpy(_dequantize( + W_q0, W_scales[0], W_zps[0])).to(dtype=torch.float) + W_q = torch.quantize_per_tensor(W, scale=W_scales[0], zero_point=( + W_zps[0].astype(int).item()), dtype=torch.qint8) + b = torch.from_numpy(_dequantize( + b_q0, X_scale * (W_scales[0].item()), 0)).to(dtype=torch.float) if use_bias else None + b_q = torch.quantize_per_tensor( + b, scale=X_scale * (W_scales[0].item()), zero_point=0, dtype=torch.qint32) if use_bias else None + # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with + # Y_scale * 255 (max for uint8). + Y_scale = 125.1234 + Y_zp = 5 + # Weight prepacking operator for quantized Linear + float_bias = b if use_bias else None + W_prepack = qlinear_prepack(W_q, float_bias) + if use_multi_dim_input: + X_q = X_q.view(3, int(batch_size / 3), input_channels) + # Quantized Linear operator with prepacked weight + Y_q = qlinear(X_q, W_prepack, Y_scale, Y_zp) + if not use_channelwise: + # Test the per-tensor quantization only + # Reference quantized Linear operator + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, + W_scales[0], W_zps[0], b_q0, Y_scale, Y_zp, dtype=nptype) + if use_relu: + Y_q_ref[Y_q_ref < Y_zp] = Y_zp + if use_multi_dim_input: + Y_q_ref = np.reshape( + Y_q_ref, (3, int(batch_size / 3), output_channels)) + # Assert equal + np.testing.assert_array_almost_equal(Y_q_ref, Y_q.int_repr().numpy(), decimal=decimal_val) + # Test both per-tensor and per-channel quantization + # Reference quantized result from PyTorch Linear operator + W_fp32 = W_q.dequantize().to(dtype=torch.float) + X_fp32 = X_q.dequantize().to(dtype=torch.float) + b_fp32 = b_q.dequantize().to(dtype=torch.float) if use_bias else None + Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) + if use_relu: + Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0 + Y_q_ref2 = torch.quantize_per_tensor( + Y_fp32_ref, Y_scale, Y_zp, dtype) + # Assert equal + np.testing.assert_array_almost_equal( + Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy(), decimal=decimal_val) + + @given(batch_size=st.integers(1, 4), + # in cudnn v. 8.4.0, there is a limitation that input channels + # should be a multiple of 4 for int8 tensors. in cudnn v.8.3.3 + # this should be a multiple of 16 + input_channels=st.sampled_from([4, 8, 12, 16, 32]), + # constraints on output channels appear to be relax, as it seems we can use any positive integer here + # except 1. It is not clear why 1 will not work. 
TODO: check with Yang + output_channels=st.integers(2, 36), + use_bias=st.booleans(), + use_relu=st.booleans(), + use_multi_dim_input=st.booleans(), + use_channelwise=st.sampled_from([False])) # channelwise currently not supported for qlinear cudnn + @skipIfNoFBGEMM + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the qlinear_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + # TODO: check with yang regarding CUDNN flags + def test_qlinear_cudnn(self, batch_size, input_channels, output_channels, use_bias, + use_relu, use_multi_dim_input, use_channelwise): qlinear_prepack = torch.ops.quantized.linear_prepack if use_relu: - qlinear = torch.ops.quantized.linear_relu + qlinear_op = torch.ops.quantized.linear_relu else: - qlinear = torch.ops.quantized.linear - if use_multi_dim_input: - batch_size *= 3 # Test the multi-dim input tensor + qlinear_op = torch.ops.quantized.linear X_scale = 1.5 - X_zp = 5 - X_value_min = 0 - X_value_max = 225 + X_zp = 0 + X_value_min = -128 + X_value_max = 127 X_q0 = np.round( np.random.rand(batch_size, input_channels) * (X_value_max - X_value_min) - + X_value_min - ).astype(np.uint8) - W_scales = np.random.rand(output_channels) - W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int) + + X_value_min).astype(np.int8) + W_scale = 2.5 + W_zp = 0 W_value_min = -128 W_value_max = 127 W_q0 = np.round( @@ -3284,6 +3640,15 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, np.random.rand(output_channels) * (b_value_max - b_value_min) + b_value_min ).astype(np.int32) if use_bias else None + if use_bias: + b_value_min = -10 + b_value_max = 10 + b_q0 = np.round( + np.random.rand(output_channels) * + (b_value_max - b_value_min) + b_value_min + ).astype(np.int32) + else: + bias = None avoid_vpmaddubsw_overflow_linear( batch_size, input_channels, @@ -3295,65 +3660,31 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias, W_value_min, W_value_max, ) + quant_dtype = torch.qint8 X = torch.from_numpy(_dequantize( - X_q0, X_scale, X_zp)).to(dtype=torch.float) + X_q0, X_scale, X_zp)).to(dtype=torch.float).to(device="cuda") X_q = torch.quantize_per_tensor( - X, scale=X_scale, zero_point=X_zp, dtype=torch.quint8) - if use_channelwise: - W = torch.from_numpy(_dequantize(W_q0, W_scales.reshape( - (-1, 1)), W_zps.reshape((-1, 1)))).to(dtype=torch.float) - W_q = torch.quantize_per_channel(W, scales=torch.from_numpy(W_scales), - zero_points=torch.from_numpy(W_zps), axis=0, dtype=torch.qint8) - b = torch.from_numpy(_dequantize( - b_q0, X_scale * W_scales, 0)).to(dtype=torch.float) if use_bias else None - b_q = torch.quantize_per_channel(b, scales=torch.from_numpy(X_scale * W_scales), - zero_points=torch.zeros(output_channels, dtype=torch.long), - axis=0, dtype=torch.qint32) if use_bias else None - else: - W = torch.from_numpy(_dequantize( - W_q0, W_scales[0], W_zps[0])).to(dtype=torch.float) - W_q = torch.quantize_per_tensor(W, scale=W_scales[0], zero_point=( - W_zps[0].astype(int).item()), dtype=torch.qint8) - b = torch.from_numpy(_dequantize( - b_q0, X_scale * (W_scales[0].item()), 0)).to(dtype=torch.float) if use_bias else None - b_q = torch.quantize_per_tensor( - b, scale=X_scale * (W_scales[0].item()), zero_point=0, dtype=torch.qint32) if use_bias else None - # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with - # Y_scale * 255 (max for uint8). 
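(Editor's aside, not part of the patch above or below.) For readers skimming these hunks, the flow that test_qlinear and test_qlinear_cudnn exercise is: quantize the activation, prepack the qint8 weight together with an optional float bias, run the quantized linear op at a chosen output scale and zero point, and compare against a float F.linear reference requantized with the same output qparams. A minimal, self-contained sketch of that flow, with arbitrary shapes and qparams, assuming a CPU build where the fbgemm or qnnpack engine is available:

import torch
import torch.nn.functional as F

X = torch.randn(4, 16)   # activation
W = torch.randn(8, 16)   # weight
b = torch.randn(8)       # float bias

# Quantize activation (quint8) and weight (qint8); values here are arbitrary.
X_q = torch.quantize_per_tensor(X, scale=0.05, zero_point=0, dtype=torch.quint8)
W_q = torch.quantize_per_tensor(W, scale=0.02, zero_point=0, dtype=torch.qint8)

# Prepack the quantized weight with the float bias, then run the quantized linear op.
W_prepack = torch.ops.quantized.linear_prepack(W_q, b)
Y_q = torch.ops.quantized.linear(X_q, W_prepack, 0.1, 0)  # output scale, output zero_point

# Reference: float linear on the dequantized operands, requantized with the same qparams.
Y_ref = torch.quantize_per_tensor(F.linear(X_q.dequantize(), W_q.dequantize(), b),
                                  0.1, 0, torch.quint8)
# Small differences between Y_q and Y_ref come from rounding and backend-specific behaviour.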
- Y_scale = 125.1234 - Y_zp = 5 + X, scale=X_scale, zero_point=X_zp, dtype=quant_dtype) + W = torch.from_numpy(_dequantize( + W_q0, W_scale, W_zp)).to(dtype=torch.float).to(device="cuda") + W_q = torch.quantize_per_tensor(W, scale=W_scale, zero_point=W_zp, dtype=quant_dtype) + b = torch.from_numpy(_dequantize( + b_q0, X_scale * (W_zp), 0)).to(dtype=torch.float).to(device="cuda") if use_bias else None + b_q = torch.quantize_per_tensor( + b, scale=X_scale * W_scale, zero_point=0, dtype=quant_dtype) if use_bias else None + Y_scale = 0.5 + Y_zp = 0 # Weight prepacking operator for quantized Linear float_bias = b if use_bias else None - W_prepack = qlinear_prepack(W_q, float_bias) - if use_multi_dim_input: - X_q = X_q.view(3, int(batch_size / 3), input_channels) + W_prepack = qlinear_prepack(W_q, float_bias if use_bias else None) # Quantized Linear operator with prepacked weight - Y_q = qlinear(X_q, W_prepack, Y_scale, Y_zp) - if not use_channelwise: - # Test the per-tensor quantization only - # Reference quantized Linear operator - Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, - W_scales[0], W_zps[0], b_q0, Y_scale, Y_zp) - if use_relu: - Y_q_ref[Y_q_ref < Y_zp] = Y_zp - if use_multi_dim_input: - Y_q_ref = np.reshape( - Y_q_ref, (3, int(batch_size / 3), output_channels)) - # Assert equal - np.testing.assert_array_almost_equal(Y_q_ref, Y_q.int_repr().numpy(), decimal=decimal_val) - # Test both per-tensor and per-channel quantization - # Reference quantized result from PyTorch Linear operator - W_fp32 = W_q.dequantize().to(dtype=torch.float) - X_fp32 = X_q.dequantize().to(dtype=torch.float) - b_fp32 = b_q.dequantize().to(dtype=torch.float) if use_bias else None - Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32) + Y_q = qlinear_op(X_q, W_prepack, Y_scale, Y_zp).to(device="cpu") + Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0, + W_scale, W_zp, b_q0, Y_scale, Y_zp, dtype=np.int8) if use_relu: - Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0 - Y_q_ref2 = torch.quantize_per_tensor( - Y_fp32_ref, Y_scale, Y_zp, torch.quint8) - # Assert equal - np.testing.assert_array_almost_equal( - Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy(), decimal=decimal_val) + Y_q_ref[Y_q_ref < Y_zp] = Y_zp + decimal_val = 0 + np.testing.assert_array_almost_equal(Y_q_ref, Y_q.int_repr().numpy(), decimal=decimal_val) """Tests the correctness of the quantized::linear_unpack op.""" @given(W=hu.tensor(shapes=hu.array_shapes(2, 2,), @@ -3370,6 +3701,13 @@ def test_qlinear_unpack(self, W, use_channelwise): qlinear_prepack = torch.ops.quantized.linear_prepack qlinear_unpack = torch.ops.quantized.linear_unpack + # ONEDNN only supports symmetric quantization of weight + if qengine_is_onednn(): + if use_channelwise: + W_zps = torch.zeros(output_channels).to(torch.int64) + else: + W_zp = 0 + W = torch.from_numpy(W) if use_channelwise: W_q = torch.quantize_per_channel( @@ -3833,6 +4171,10 @@ def _test_qconv_unpack_impl(self, qconv_prepack_fn, qconv_unpack_fn, inputs, if channelwise and transposed: # currently transposed conv and per-channel per quantization does not work return + # ONEDNN only supports symmetric quantization of weight and zero output padding + if qengine_is_onednn(): + W_zero_point = 0 + o_pads = len(o_pads) * [0] if o_pads is not None else None if channelwise: if transposed: output_channels = W.shape[1] # IC OC/G @@ -3971,6 +4313,9 @@ def _test_qconv_impl( weight_dtype=torch.qint8, output_dtype=torch.quint8, ): + # ONEDNN only supports symmetric quantization of weight + if qengine_is_onednn() and W_zero_point is not None: + 
W_zero_point = len(W_zero_point) * [0] (X, W), (X_q, W_q), bias_float = self._make_qconv_tensors( batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, @@ -4055,7 +4400,7 @@ def _test_qconv_impl( Y_scale=st.floats(4.2, 5.6), Y_zero_point=st.integers(0, 4), use_bias=st.booleans(), - use_relu=st.sampled_from([False]), + use_relu=st.booleans(), use_channelwise=st.booleans()) @override_qengines def test_qconv2d( @@ -4103,26 +4448,33 @@ def test_qconv2d( dilations, groups, ) - self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, None, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) + + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] + + self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, None, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, input_dtype=X_qdtype, output_dtype=X_qdtype) + + # TODO: merge this test with test_qconv2d when CUDNN runtime flags becomes available + """Tests the correctness of quantized 2D convolution cudnn op.""" @given(batch_size=st.integers(1, 3), - # only multiples of 16 are supported right now, might be fixed in - # next release of cudnn - # input_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]), - input_channels_per_group=st.sampled_from([16, 32]), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + input_channels_per_group=st.integers(1, 32), height=st.integers(10, 16), width=st.integers(7, 14), - # only multiples of 16 are supported right now, might be fixed in - # next release of cudnn - # output_channels_per_group=st.sampled_from([2, 4, 5, 8, 16, 32]), - output_channels_per_group=st.sampled_from([16, 32]), - # groups=st.integers(1, 3), - groups=st.integers(1, 1), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + output_channels_per_group=st.integers(1, 32), + groups=st.integers(1, 1), # currently padding only supports groups=1 kernel_h=st.integers(1, 7), kernel_w=st.integers(1, 7), stride_h=st.integers(1, 2), @@ -4131,6 +4483,8 @@ def test_qconv2d( pad_w=st.integers(0, 2), # result for dilation == 2 is not correct # dilation=st.integers(1, 2), + # currently cudnn has only been verified to work for dilation = 1 + # TODO: check backend works for dilation > 1 dilation=st.integers(1, 1), X_scale=st.floats(1.2, 1.6), X_zero_point=st.sampled_from([0]), @@ -4138,10 +4492,8 @@ def test_qconv2d( W_zero_point=st.lists(st.integers(0, 0), min_size=1, max_size=2), Y_scale=st.floats(4.2, 5.6), Y_zero_point=st.sampled_from([0]), - # TODO: enable bias - use_bias=st.sampled_from([False]), - # TODO: enable relu - use_relu=st.sampled_from([False]), + use_bias=st.booleans(), + use_relu=st.booleans(), # TODO: enable channelwise use_channelwise=st.sampled_from([False])) @skipIfNoFBGEMM @@ -4181,8 +4533,10 @@ def test_qconv2d_cudnn( pads = (pad_h, pad_w) dilations = (dilation, dilation) - qconv = 
torch.ops.quantized.conv2d_cudnn - assert not use_relu, "conv2d_relu_cudnn is not supported yet" + if use_relu: + qconv = torch.ops.quantized.conv2d_relu + else: + qconv = torch.ops.quantized.conv2d conv_op = torch.nn.Conv2d( input_channels, output_channels, @@ -4193,7 +4547,7 @@ def test_qconv2d_cudnn( groups, ).to(torch.device("cuda")) self._test_qconv_impl( - qconv, None, conv_op, batch_size, + qconv, torch.ops.quantized.conv2d_prepack, conv_op, batch_size, input_channels_per_group, (height, width), output_channels_per_group, groups, kernels, strides, pads, None, dilations, X_scale, X_zero_point, W_scale, W_zero_point, @@ -4269,13 +4623,14 @@ def trace_handler(p): weight_int8 = torch.quantize_per_tensor(weight, 1, 0, torch.qint8).contiguous(memory_format=torch.channels_last) scale = 1.0 zero_point = 0 - conv_op = torch.ops.quantized.conv2d_cudnn + conv_op = torch.ops.quantized.conv2d + weight_prepacked = torch.ops.quantized.conv2d_prepack(weight_int8, None, stride, padding, dilation, groups) with profile( activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], schedule=my_schedule, on_trace_ready=trace_handler) as prof: for i in range(30): - conv_op(input_int8, weight_int8, None, stride, padding, dilation, groups, scale, zero_point) + conv_op(input_int8, weight_prepacked, scale, zero_point) prof.step() print("int8 benchmark result:") @@ -4323,7 +4678,7 @@ def test_qconv_transpose1d( return # Currently only the QNNPACK is supported if qengine_is_qnnpack() and (IS_PPC or TEST_WITH_UBSAN): return # QNNPACK doesn't support these - assume(o_pad < stride or o_pad < dilation) + assume(o_pad < stride and o_pad < dilation) input_channels = input_channels_per_group * groups output_channels = output_channels_per_group * groups @@ -4346,40 +4701,51 @@ def test_qconv_transpose1d( dilation=dilations, bias=use_bias ) - X_q, W_q, bias_float = self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (width, ), - output_channels_per_group, groups, kernels, strides, pads, o_pads, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu=False, - use_channelwise=False, use_transpose=True) - # check that this doesn't error - test_conv = torch.nn.quantized.ConvTranspose1d(input_channels, output_channels, 1) - test_conv(X_q) + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) - # Test the module implementation - qconv_op = torch.nn.quantized.ConvTranspose1d( - in_channels=input_channels, - out_channels=output_channels, - kernel_size=kernels, - stride=strides, - padding=pads, - output_padding=o_pads, - groups=groups, - dilation=dilations, - bias=use_bias - ) - qconv_op.scale = Y_scale - qconv_op.zero_point = Y_zero_point - qconv_op.set_weight_bias(W_q, bias_float) + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] - Y_dq_ref = conv_op(X_q.dequantize()) - Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, - zero_point=Y_zero_point, - dtype=torch.quint8) - Y_q = qconv_op(X_q) - self.assertEqual(Y_q_ref, Y_q) + X_q, W_q, bias_float = self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (width, ), + output_channels_per_group, groups, kernels, strides, pads, o_pads, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu=False, + use_channelwise=False, 
use_transpose=True, input_dtype=X_qdtype, output_dtype=X_qdtype) + + # check that this doesn't error + test_conv = torch.nn.quantized.ConvTranspose1d(input_channels, output_channels, 1) + test_conv.scale = Y_scale + test_conv(X_q) + + # Test the module implementation + qconv_op = torch.nn.quantized.ConvTranspose1d( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernels, + stride=strides, + padding=pads, + output_padding=o_pads, + groups=groups, + dilation=dilations, + bias=use_bias + ) + qconv_op.scale = Y_scale + qconv_op.zero_point = Y_zero_point + qconv_op.set_weight_bias(W_q, bias_float) + + Y_dq_ref = conv_op(X_q.dequantize()) + Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, + zero_point=Y_zero_point, + dtype=X_qdtype) + Y_q = qconv_op(X_q) + self.assertEqual(Y_q_ref, Y_q) """Tests the correctness of quantized convolution op.""" @@ -4432,8 +4798,11 @@ def test_qconv_transpose2d( use_bias): if qengine_is_qnnpack() and (IS_PPC or TEST_WITH_UBSAN): return # QNNPACK doesn't support these - assume(o_pad_h < stride_h or o_pad_h < dilation) - assume(o_pad_w < stride_w or o_pad_w < dilation) + # ONEDNN does not support output paddings + if qengine_is_onednn() and (o_pad_h, o_pad_w) != (0, 0): + return + assume(o_pad_h < stride_h and o_pad_h < dilation) + assume(o_pad_w < stride_w and o_pad_w < dilation) input_channels = input_channels_per_group * groups output_channels = output_channels_per_group * groups @@ -4456,40 +4825,50 @@ def test_qconv_transpose2d( dilation=dilations, bias=use_bias ) - X_q, W_q, bias_float = self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, o_pads, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu=False, - use_channelwise=False, use_transpose=True) + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) - # check that this doesn't error - test_conv = torch.nn.quantized.ConvTranspose2d(input_channels, output_channels, 1) - test_conv(X_q) + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] - # Test the module implementation - qconv_op = torch.nn.quantized.ConvTranspose2d( - in_channels=input_channels, - out_channels=output_channels, - kernel_size=kernels, - stride=strides, - padding=pads, - output_padding=o_pads, - groups=groups, - dilation=dilations, - bias=use_bias - ) - qconv_op.scale = Y_scale - qconv_op.zero_point = Y_zero_point - qconv_op.set_weight_bias(W_q, bias_float) + X_q, W_q, bias_float = self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, o_pads, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu=False, + use_channelwise=False, use_transpose=True, input_dtype=X_qdtype, output_dtype=X_qdtype) + + # check that this doesn't error + test_conv = torch.nn.quantized.ConvTranspose2d(input_channels, output_channels, 1) + test_conv.scale = Y_scale + test_conv(X_q) + + # Test the module implementation + qconv_op = torch.nn.quantized.ConvTranspose2d( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernels, + stride=strides, + padding=pads, + output_padding=o_pads, + groups=groups, 
+ dilation=dilations, + bias=use_bias + ) + qconv_op.scale = Y_scale + qconv_op.zero_point = Y_zero_point + qconv_op.set_weight_bias(W_q, bias_float) - Y_dq_ref = conv_op(X_q.dequantize()) - Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, - zero_point=Y_zero_point, - dtype=torch.quint8) - Y_q = qconv_op(X_q) - self.assertEqual(Y_q_ref, Y_q) + Y_dq_ref = conv_op(X_q.dequantize()) + Y_q_ref = torch.quantize_per_tensor(Y_dq_ref, scale=Y_scale, + zero_point=Y_zero_point, + dtype=X_qdtype) + Y_q = qconv_op(X_q) + self.assertEqual(Y_q_ref, Y_q) """Tests the correctness of quantized convolution op.""" @given(batch_size=st.integers(1, 3), @@ -4551,6 +4930,9 @@ def test_qconv_transpose3d( use_bias): if qengine_is_qnnpack(): return # QNNPACK doesn't support this + # ONEDNN doesn't support output paddings + if qengine_is_onednn() and (o_pad_t, o_pad_h, o_pad_w) != (0, 0, 0): + return assume(o_pad_t < stride_t or o_pad_t < dilation) assume(o_pad_h < stride_h or o_pad_h < dilation) assume(o_pad_w < stride_w or o_pad_w < dilation) @@ -4586,6 +4968,7 @@ def test_qconv_transpose3d( # check that this doesn't error test_conv = torch.nn.quantized.ConvTranspose3d(input_channels, output_channels, 1) + test_conv.scale = Y_scale test_conv(X_q) # Test the module implementation @@ -4730,12 +5113,11 @@ def test_qconv1d( use_relu, use_channelwise, ): - input_channels = input_channels_per_group * groups output_channels = output_channels_per_group * groups if torch.backends.quantized.engine == 'qnnpack': use_channelwise = False - true_conv1d = torch.nn.Conv1d( + conv1d = torch.nn.Conv1d( input_channels, output_channels, kernel, @@ -4748,12 +5130,104 @@ def test_qconv1d( qconv = torch.ops.quantized.conv1d if use_relu: qconv = torch.ops.quantized.conv1d_relu + + act_qdtypes = [torch.quint8] + # Only qnnpack qengine supportes qint8 + if qengine_is_qnnpack() and torch.backends.xnnpack.enabled: + act_qdtypes.append(torch.qint8) + + for X_qdtype in act_qdtypes: + if X_qdtype == torch.qint8: + W_zero_point = [0 for i in range(len(W_zero_point))] + + self._test_qconv_impl( + qconv, qconv_prepack, conv1d, batch_size, + input_channels_per_group, (length, ), + output_channels_per_group, groups, kernel, [stride], [pad], None, + [dilation], X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, + input_dtype=X_qdtype, output_dtype=X_qdtype) + + # TODO: merge this test with test_qconv1d when CUDNN runtime flags becomes available + """Tests the correctness of quantized 1D convolution cudnn op.""" + @given(batch_size=st.integers(1, 6), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + input_channels_per_group=st.integers(1, 32), + # cudnn only supports multiples of 4, but we have explicitly added padding on the backend + output_channels_per_group=st.integers(1, 32), + groups=st.integers(1, 1), # currently padding only supports groups=1 + length=st.integers(4, 16), + kernel=st.integers(1, 7), + stride=st.integers(1, 2), + pad=st.integers(0, 2), + # currently cudnn has only been verified to work for dilation = 1 + # TODO: check backend works for dilation > 1 + dilation=st.integers(1, 1), + X_scale=st.floats(1.2, 1.6), + # currently conv cudnn backend is only implemented for int8 symmetric + X_zero_point=st.sampled_from([0]), + W_scale=st.lists(st.floats(0.2, 1.6), min_size=1, max_size=2), + # currently conv cudnn backend is only implemented for int8 symmetric + W_zero_point=st.lists(st.integers(0, 0), min_size=1, 
max_size=2), + Y_scale=st.floats(4.2, 5.6), + # currently conv cudnn backend is only implemented for int8 symmetric + Y_zero_point=st.sampled_from([0]), + use_bias=st.booleans(), + use_relu=st.booleans(), + # TODO: enable channelwise + use_channelwise=st.sampled_from([False])) + @skipIfNoFBGEMM + @unittest.skipIf(not TEST_CUDNN, "cudnn is not enabled.") + @unittest.skip("Local only - currently the qconv1d_cudnn op is bulid " + "with USE_EXPERIMENTAL_CUDNN_V8_API, we can enable the test " + "after it is built by default") + def test_qconv1d_cudnn( + self, + batch_size, + input_channels_per_group, + output_channels_per_group, + groups, + length, + kernel, + stride, + pad, + dilation, + X_scale, + X_zero_point, + W_scale, + W_zero_point, + Y_scale, + Y_zero_point, + use_bias, + use_relu, + use_channelwise, + ): + input_channels = input_channels_per_group * groups + output_channels = output_channels_per_group * groups + + conv1d = torch.nn.Conv1d( + input_channels, + output_channels, + kernel, + stride, + pad, + dilation, + groups, + ).to(torch.device("cuda")) + qconv_prepack = torch.ops.quantized.conv1d_prepack + if use_relu: + qconv = torch.ops.quantized.conv1d_relu + else: + qconv = torch.ops.quantized.conv1d + self._test_qconv_impl( - qconv, qconv_prepack, true_conv1d, batch_size, + qconv, qconv_prepack, conv1d, batch_size, input_channels_per_group, (length, ), output_channels_per_group, groups, kernel, [stride], [pad], None, [dilation], X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, + device=torch.device("cuda"), + input_dtype=torch.qint8, weight_dtype=torch.qint8, output_dtype=torch.qint8) @given(batch_size=st.integers(1, 4), input_channels_per_group=st.sampled_from([2, 4, 5, 8, 16]), @@ -5088,7 +5562,7 @@ def test_qnnpack_sigmoid_sweep(self): """Tests the correctness of the quantized::add (qnnpack) op.""" @settings(suppress_health_check=(HealthCheck.filter_too_much,)) @given(A=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5), - qparams=hu.qparams(dtypes=torch.quint8)), + qparams=hu.qparams(dtypes=[torch.quint8, torch.qint8])), zero_point=st.sampled_from([0, 2, 5, 15, 127]), scale_A=st.sampled_from([0.001, 0.057, 0.889, 12.3]), scale_B=st.sampled_from([0.008, 0.0821, 0.67, 7]), @@ -5096,39 +5570,96 @@ def test_qnnpack_sigmoid_sweep(self): def test_qnnpack_add(self, A, zero_point, scale_A, scale_B, scale_C): with override_quantized_engine('qnnpack'): A_temp = A - A, (scale_a, zero_point_A, torch_type) = A_temp - B, (scale_b, zero_point_B, torch_type) = A_temp - A = torch.from_numpy(A) - B = torch.from_numpy(B) - - assume(scale_A // scale_C >= 2**-14) - assume(scale_A // scale_C < 2**8) - assume(scale_B // scale_C >= 2**-14) - assume(scale_B // scale_C < 2**8) - - zero_point_C = 127 - qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, - dtype=torch.quint8) - qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, - dtype=torch.quint8) + for channels_last in [True, False]: + if channels_last and len(A_temp[0].shape) != 4: + continue + A, (scale_a, zero_point_A, torch_type) = A_temp + B, (scale_b, zero_point_B, torch_type) = A_temp + A = torch.from_numpy(A) + B = torch.from_numpy(B) - # Add ground truth - C = (qA.dequantize() + qB.dequantize()).numpy() + if torch_type == torch.qint8 and not torch.backends.xnnpack.enabled: + continue - qC = _quantize(C, scale_C, zero_point_C) + if channels_last: + A = 
A.to(memory_format=torch.channels_last) + B = B.to(memory_format=torch.channels_last) + assume(scale_A // scale_C >= 2**-14) + assume(scale_A // scale_C < 2**8) + assume(scale_B // scale_C >= 2**-14) + assume(scale_B // scale_C < 2**8) - qC_qnnp = torch.ops.quantized.add(qA, qB, scale_C, zero_point_C) + zero_point_C = 127 + np_dtype = np.uint8 - np.testing.assert_equal(qC, qC_qnnp.int_repr(), - "Quantized addition failed.") + if torch_type == torch.qint8: + zero_point_C = 0 + np_dtype = np.int8 - Crelu = C.copy() - Crelu[C < 0] = 0 - qCrelu = torch.quantize_per_tensor(torch.from_numpy(Crelu), scale_C, - zero_point_C, dtype=torch.quint8) - qCrelu_hat = torch.ops.quantized.add_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) - np.testing.assert_equal(qCrelu.int_repr().numpy(), qCrelu_hat.int_repr(), - "Quantized addition with ReLU failed.") + qA = torch.quantize_per_tensor(A, scale=scale_A, zero_point=zero_point, + dtype=torch_type) + qB = torch.quantize_per_tensor(B, scale=scale_B, zero_point=zero_point, + dtype=torch_type) + + # Add ground truth + C = (qA.dequantize() + qB.dequantize()).numpy() + + qC = _quantize(C, scale_C, zero_point_C, dtype=np_dtype) + + qC_qnnp = torch.ops.quantized.add(qA, qB, scale_C, zero_point_C) + + np.testing.assert_equal(qC, qC_qnnp.int_repr(), + "Quantized addition failed.") + + Crelu = C.copy() + Crelu[C < 0] = 0 + qCrelu = torch.quantize_per_tensor(torch.from_numpy(Crelu), scale_C, + zero_point_C, dtype=torch_type) + qCrelu_hat = torch.ops.quantized.add_relu(qA, qB, scale=scale_C, zero_point=zero_point_C) + np.testing.assert_equal(qCrelu.int_repr().numpy(), qCrelu_hat.int_repr(), + "Quantized addition with ReLU failed.") + + """Tests that quantized add works with broadcasting """ + def test_qnnpack_add_broadcast(self): + def _run_test(A, B): + qA = torch.quantize_per_tensor(A, 0.02, 0, dtype) + qB = torch.quantize_per_tensor(B, 0.04, 2, dtype) + + output_scale = 0.01 + output_zp = 1 + + # ground truth + C = qA.dequantize() + qB.dequantize() + qC = torch.quantize_per_tensor(C, output_scale, output_zp, dtype) + + # quantized + qC_hat_1 = torch.ops.quantized.add(qA, qB, output_scale, output_zp) + qC_hat_2 = torch.ops.quantized.add(qB, qA, output_scale, output_zp) + + self.assertTrue(torch.allclose(qC.dequantize(), qC_hat_1.dequantize())) + self.assertTrue(torch.allclose(qC.dequantize(), qC_hat_2.dequantize())) + + with override_quantized_engine("qnnpack"): + for dtype in (torch.qint8, torch.quint8): + if dtype == torch.qint8 and not torch.backends.xnnpack.enabled: + continue + + for channels_last in [True, False]: + # 4d + A = torch.randn(1, 3, 4, 4) + B = torch.randn(1, 1, 1, 1) + if channels_last: + A = A.to(memory_format=torch.channels_last) + B = B.to(memory_format=torch.channels_last) + _run_test(A, B) + + # 5d + C = torch.randn(1, 3, 4, 4, 4) + D = torch.randn(1, 1, 1, 1, 1) + if channels_last: + C = C.to(memory_format=torch.channels_last_3d) + D = D.to(memory_format=torch.channels_last_3d) + _run_test(C, D) """Tests the correctness of quantized::qnnpack_maxpool2d op.""" @given(A=hu.tensor(shapes=hu.array_shapes(4, 4, 3, 5), diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py index 45931637eb68..61dda57268bd 100644 --- a/test/quantization/core/test_quantized_tensor.py +++ b/test/quantization/core/test_quantized_tensor.py @@ -140,6 +140,56 @@ def _compress_uniform_simplified(X, bit_rate, xmin, xmax, fp16_scale_bias=True): return Xq, loss class TestQuantizedTensor(TestCase): + def 
test_per_tensor_qtensor_to_memory_format(self): + n = np.random.randint(1, 10) + c = np.random.randint(2, 10) + h = np.random.randint(2, 10) + w = np.random.randint(2, 10) + x = torch.rand(n, c, h, w) + scale = np.random.uniform(0.1, 1.0) + zero_point = np.random.randint(0.0, 10) + qints = [torch.qint8, torch.quint8, torch.qint32] + dtype = qints[np.random.randint(0, len(qints))] + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=dtype) + x_nhwc = x.to(memory_format=torch.channels_last) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + qx_nhwc_using_contiguous = qx.contiguous(memory_format=torch.channels_last) + self.assertEqual(qx_nhwc_using_to.stride(), qx_nhwc_using_contiguous.stride()) + self.assertEqual(qx_nhwc_using_to.stride(), x_nhwc.stride()) + + # When the last two dimensions of a 4D tensor are both size 1 or if c == 1, we have a degenerate case + # see https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html + # In this case, the output of torch.Tensor.to and torch.Tensor.contiguous should not be the same + x = torch.rand(10, 2, 1, 1) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=dtype) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + qx_nhwc_using_contiguous = qx.contiguous(memory_format=torch.channels_last) + self.assertNotEqual(qx_nhwc_using_to.stride(), qx_nhwc_using_contiguous.stride()) + + x = torch.rand(10, 1, 2, 2) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=dtype) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + qx_nhwc_using_contiguous = qx.contiguous(memory_format=torch.channels_last) + self.assertNotEqual(qx_nhwc_using_to.stride(), qx_nhwc_using_contiguous.stride()) + + def test_per_channel_qtensor_to_memory_format(self): + n = np.random.randint(1, 10) + c = np.random.randint(2, 10) + h = np.random.randint(2, 10) + w = np.random.randint(2, 10) + x = torch.rand(n, c, h, w) + x_nhwc = x.to(memory_format=torch.channels_last) + scale = np.random.uniform(0.1, 1.0) + zero_point = np.random.randint(0.0, 10) + qints = [torch.qint8, torch.quint8, torch.qint32] + dtype = qints[np.random.randint(0, len(qints))] + for axis in range(x.ndim): + scales = torch.rand(x.size(axis)) + 0.00001 + zero_points = torch.randint(low=0, high=10, size=(x.size(axis), )) + qx = torch.quantize_per_channel(x, scales=scales, zero_points=zero_points, dtype=dtype, axis=axis) + qx_nhwc_using_to = qx.to(memory_format=torch.channels_last) + self.assertEqual(qx_nhwc_using_to.stride(), x_nhwc.stride()) + @unittest.skipIf(not TEST_CUDA, "No gpu is available.") def test_qtensor_cuda(self): self._test_qtensor(torch.device('cuda')) @@ -304,25 +354,33 @@ def test_qtensor_float_assignment(self): # item scale = 1.0 zero_point = 2 - r = torch.ones(1, dtype=torch.float) - for dtype in [torch.qint8, torch.quint8, torch.qint32]: - qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype) - self.assertEqual(qr.item(), 1) - self.assertEqual(qr[0].item(), 1) - # assignment - self.assertTrue(qr[0].is_quantized) - qr[0] = 11.3 # float assignment - self.assertEqual(qr.item(), 11) - x = torch.ones(1, dtype=torch.float) * 15.3 - # Copying from a float Tensor - qr[:] = x - self.assertEqual(qr.item(), 15) - - dtype_msg = str(dtype) + ", " - self.assertEqual(' '.join(str(qr).split()), - "tensor([15.], size=(1,), dtype=" + dtype_msg + - "quantization_scheme=torch.per_tensor_affine, " + - "scale=1.0, zero_point=2)") + devices = ["cpu", "cuda"] if torch.cuda.is_available() 
else ["cpu"] + for device in devices: + r = torch.ones(1, dtype=torch.float).to(device=device) + for dtype in [torch.qint8, torch.quint8, torch.qint32]: + qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype) + self.assertEqual(qr.item(), 1) + self.assertEqual(qr[0].item(), 1) + # assignment + self.assertTrue(qr[0].is_quantized) + qr[0] = torch.Tensor([11.3]).to(device=device) # float assignment + self.assertEqual(qr.item(), 11) + x = torch.ones(1, dtype=torch.float).to(device=device) * 15.3 + # Copying from a float Tensor + qr[:] = x + self.assertEqual(qr.item(), 15) + + dtype_msg = str(dtype) + ", " + if device == "cuda": + self.assertEqual(' '.join(str(qr).split()), + "tensor([15.], device='" + str(qr.device) + "', size=(1,), dtype=" + dtype_msg + + "quantization_scheme=torch.per_tensor_affine, " + + "scale=1.0, zero_point=2)") + else: + self.assertEqual(' '.join(str(qr).split()), + "tensor([15.], size=(1,), dtype=" + dtype_msg + + "quantization_scheme=torch.per_tensor_affine, " + + "scale=1.0, zero_point=2)") def test_qtensor_quant_dequant(self): scale = 0.02 @@ -490,7 +548,7 @@ def test_per_channel_to_device(self): self.assertEqual('cpu', dqr_cuda.q_per_channel_scales().device.type) self.assertEqual('cpu', dqr_cuda.q_per_channel_zero_points().device.type) - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_compare_per_tensor_device_numerics(self): dtypes = [ torch.quint8, @@ -511,7 +569,7 @@ def test_compare_per_tensor_device_numerics(self): self.assertEqual(qtr.int_repr(), qtr_cuda.int_repr()) self.assertTrue(np.allclose(dqtr, dqtr_cuda.cpu())) - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_compare_per_channel_device_numerics(self): dtype_and_zero_types = [ (torch.quint8, torch.float), @@ -665,7 +723,7 @@ def _quantize_per_channel_sub_byte_ref(data, scales, zero_points, axis, bit_widt data = data.view(-1, dims[axis], np.prod(dims[axis + 1:])) qtensor_size = math.ceil(data.numel() / 2) res = torch.empty(qtensor_size, dtype=torch.uint8) - elem_per_byte = 8 / bit_width + elem_per_byte = 8 // bit_width quant_min, quant_max = _get_qranges(bit_width) for i in range(data.size()[0]): for j in range(data.size()[1]): @@ -1101,7 +1159,7 @@ def test_choose_qparams(self, X, reduce_range): np.testing.assert_array_almost_equal(X_scale, qparams[0], decimal=3) self.assertEqual(X_zp, qparams[1]) - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_cuda_quantization_does_not_pin_memory(self): # Context - https://github.com/pytorch/pytorch/issues/41115 x = torch.randn(3) @@ -1114,7 +1172,7 @@ def test_cuda_quantization_does_not_pin_memory(self): self.assertEqual(x.is_pinned(), False) # There's no way to actually pin the memory of a quantized tensor - @unittest.skipIf(not torch.cuda.is_available() or TEST_WITH_ROCM, 'CUDA is not available') + @unittest.skipIf(not torch.cuda.is_available(), 'CUDA is not available') def test_quant_pin_memory(self): x = torch.randn(3).pin_memory() self.assertEqual(x.is_pinned(), True) diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py index 5415e2b03dcb..f299026b3192 100644 --- a/test/quantization/core/test_workflow_module.py +++ 
b/test/quantization/core/test_workflow_module.py @@ -68,50 +68,70 @@ tolerance = 1e-6 class TestObserver(QuantizationTestCase): - @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), + @given(qdtype=st.sampled_from((torch.qint8, torch.quint8, torch.qint32)), qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)), reduce_range=st.booleans()) def test_per_tensor_observers(self, qdtype, qscheme, reduce_range): # reduce_range cannot be true for symmetric quantization with uint8 - if qdtype == torch.quint8 and qscheme == torch.per_tensor_symmetric: + if (qdtype == torch.quint8 and qscheme == torch.per_tensor_symmetric) or qdtype == torch.qint32: reduce_range = False ObserverList = [MinMaxObserver(dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range), MovingAverageMinMaxObserver(averaging_constant=0.5, dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range)] + + def _get_ref_params(reduce_range, qscheme, dtype, input_scale, min_val, max_val): + eps = torch.tensor([tolerance]) + if dtype == torch.qint8: + if reduce_range: + quant_min, quant_max = -64, 63 + else: + quant_min, quant_max = -128, 127 + elif dtype == torch.quint8: + if reduce_range: + quant_min, quant_max = 0, 127 + else: + quant_min, quant_max = 0, 255 + elif dtype == torch.qint32: + quant_min, quant_max = -1 * (2 ** 31), (2 ** 31) - 1 + + min_val_neg = torch.tensor([0.]) + max_val_pos = torch.tensor([input_scale * max_val]) if qdtype is torch.qint32 else torch.tensor([max_val]) + + scale, zero_point = 1.0, 0 + if qscheme == torch.per_tensor_symmetric or qscheme == torch.per_channel_symmetric: + scale = torch.max(-min_val_neg, max_val_pos) / (float(quant_max - quant_min) / 2) + scale = torch.max(scale, eps) + if dtype == torch.quint8: + zero_point = 128 + else: + scale = torch.max((max_val_pos - min_val_neg) / float(quant_max - quant_min), eps) + zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int) + zero_point = torch.clamp(zero_point, quant_min, quant_max) + return scale, zero_point + for myobs in ObserverList: # Calculate Qparams should return with a warning for observers with no data qparams = myobs.calculate_qparams() + input_scale = 2**16 if qdtype is torch.qint32 else 1 if type(myobs) == MinMaxObserver: - x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) - y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) + x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) * input_scale + y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) * input_scale else: # Moving average of min/max for x and y matches that of # extreme values for x/y used for minmax observer - x = torch.tensor([0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) - y = torch.tensor([2.0, 5.0, 5.0, 6.0, 7.0, 10.0]) + x = torch.tensor([0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) * input_scale + y = torch.tensor([2.0, 5.0, 5.0, 6.0, 7.0, 10.0]) * input_scale result = myobs(x) result = myobs(y) self.assertEqual(result, y) - self.assertEqual(myobs.min_val, 1.0) - self.assertEqual(myobs.max_val, 8.0) + self.assertEqual(myobs.min_val, 1.0 * input_scale) + self.assertEqual(myobs.max_val, 8.0 * input_scale) qparams = myobs.calculate_qparams() - if reduce_range: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 * 255 / 127 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 - else: - ref_scale = 0.0313725 * 255 / 127 - ref_zero_point = -64 if qdtype is torch.qint8 else 0 - else: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 - else: - ref_scale = 
0.0313725 - ref_zero_point = -128 if qdtype is torch.qint8 else 0 + ref_scale, ref_zero_point = _get_ref_params(reduce_range, qscheme, qdtype, input_scale, 1.0, 8.0) + self.assertEqual(qparams[1].item(), ref_zero_point) self.assertEqual(qparams[0].item(), ref_scale, atol=1e-5, rtol=0) state_dict = myobs.state_dict() @@ -380,7 +400,7 @@ def test_zero_numel(self): x = obs(x) def _test_memoryless(self, obs_class): - obs = obs_class(memoryless=True) + obs = obs_class(averaging_constant=1) x = torch.randn((3, 3)) obs(x) params = obs.calculate_qparams() @@ -391,10 +411,10 @@ def _test_memoryless(self, obs_class): self.assertEqual(params, obs.calculate_qparams()) def test_memoryless_minmaxobserver(self): - self._test_memoryless(MinMaxObserver) + self._test_memoryless(MovingAverageMinMaxObserver) def test_memoryless_perchannelminmaxobserver(self): - self._test_memoryless(PerChannelMinMaxObserver) + self._test_memoryless(MovingAveragePerChannelMinMaxObserver) # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): @@ -555,10 +575,9 @@ def test_record_observer(self): self.assertEqual(observer_dict['fc1.module.activation_post_process'].get_tensor_value()[0], model(self.calib_data[0][0])) - @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), - qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric))) - def test_observer_scriptable(self, qdtype, qscheme): - obs = RecordingObserver(dtype=qdtype, qscheme=qscheme) + @given(qdtype=st.sampled_from((torch.qint8, torch.quint8))) + def test_observer_scriptable(self, qdtype): + obs = RecordingObserver(dtype=qdtype) scripted = torch.jit.script(obs) x = torch.rand(3, 4) @@ -738,6 +757,17 @@ def test_fq_serializable_per_channel(self): for key in state_dict: self.assertEqual(state_dict[key], loaded_dict[key]) + def test_quant_min_max_override(self): + observer = default_per_channel_weight_observer + # test no override + fq_module = FakeQuantize(observer) + self.assertEqual(fq_module.activation_post_process.quant_min, -128) + self.assertEqual(fq_module.activation_post_process.quant_max, 127) + # test quant_min/quant_max override + fq_module = FakeQuantize(observer, quant_min=0, quant_max=127) + self.assertEqual(fq_module.activation_post_process.quant_min, 0) + self.assertEqual(fq_module.activation_post_process.quant_max, 127) + def _get_buffer_ids(module): """ Object addresses stay constant if and only if all modifications are in-place @@ -1124,9 +1154,8 @@ def test_fused_mod_per_channel(self): def test_fused_mod_reduce_range(self): obs = FusedMovingAvgObsFakeQuantize(quant_min=0, quant_max=255, dtype=torch.quint8, reduce_range=True) - - self.assertEqual(obs.quant_min, 0) - self.assertEqual(obs.quant_max, 127) + self.assertEqual(obs.activation_post_process.quant_min, 0) + self.assertEqual(obs.activation_post_process.quant_max, 127) def test_embedding_bag_qat_config(self): class Model(nn.Module): @@ -1241,16 +1270,19 @@ def forward(self, x): self.assertEqual(count_fake_quant, 3) if qengine == "fbgemm": - self.assertEqual(ref_model.quant.activation_post_process.quant_min, 0) - self.assertEqual(ref_model.quant.activation_post_process.quant_max, 127) - self.assertEqual(type(ref_model.module.linear.weight_fake_quant.activation_post_process), - MovingAveragePerChannelMinMaxObserver) - else: - self.assertEqual(ref_model.quant.activation_post_process.quant_min, 0) - self.assertEqual(ref_model.quant.activation_post_process.quant_max, 255) - 
self.assertEqual(type(ref_model.module.linear.weight_fake_quant.activation_post_process), - MovingAverageMinMaxObserver) + lower_bnd = 0 + upper_bnd = 127 + obs2match = MovingAveragePerChannelMinMaxObserver + else: + lower_bnd = 0 + upper_bnd = 255 + obs2match = MovingAverageMinMaxObserver + + self.assertEqual(ref_model.quant.activation_post_process.activation_post_process.quant_min, lower_bnd) + self.assertEqual(ref_model.quant.activation_post_process.activation_post_process.quant_max, upper_bnd) + self.assertEqual(type(ref_model.module.linear.weight_fake_quant.activation_post_process), + obs2match) if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py index 8b4baf10d45d..4972a0324e2c 100644 --- a/test/quantization/core/test_workflow_ops.py +++ b/test/quantization/core/test_workflow_ops.py @@ -7,7 +7,7 @@ FakeQuantize, MovingAverageMinMaxObserver, default_observer, - default_affine_fixed_qparams_fake_quant, + default_fixed_qparams_range_0to1_fake_quant, ) from torch.ao.quantization._learnable_fake_quantize import _LearnableFakeQuantize @@ -544,7 +544,7 @@ def test_fq_module_per_tensor(self, device, X): def test_fixed_qparams_fq_module(self, device, X): X, (scale, zero_point, torch_type) = X X = to_tensor(X, device) - fq_module = default_affine_fixed_qparams_fake_quant() + fq_module = default_fixed_qparams_range_0to1_fake_quant() fq_module.to(device) fixed_scale = fq_module.scale.clone() fixed_zero_point = fq_module.zero_point.clone() diff --git a/test/quantization/dbr/test_quantize_dbr.py b/test/quantization/dbr/test_quantize_dbr.py index 30d3b64bdeb0..cd6dd6968ad0 100644 --- a/test/quantization/dbr/test_quantize_dbr.py +++ b/test/quantization/dbr/test_quantize_dbr.py @@ -22,6 +22,8 @@ from torch.quantization import ( ObserverBase, FakeQuantizeBase, + QConfig, + MinMaxObserver, ) from torch.quantization.quantize_fx import ( prepare_fx, @@ -33,6 +35,9 @@ import torch.ao.ns._numeric_suite_dbr as ns # TODO(future PR): move these utils out of the FX folder import torch.ao.ns._numeric_suite_fx as ns_fx +from torch.ao.quantization._dbr.torchscript_utils import ( + remove_redundant_aliases, +) def _allclose(a, b): if isinstance(a, tuple): @@ -248,9 +253,9 @@ def forward(self, x): x = torch.cat([x, x], dim=1) return x - m = M().eval() qconfig = torch.quantization.default_qconfig for dtype in (torch.int32, torch.int64): + m = M().eval() self._test_auto_tracing( m, qconfig, (torch.zeros(1, 1, 1, 1, dtype=dtype),), # FX graph mode quant does not support this yet @@ -422,6 +427,10 @@ def test_fusion_called_multiple_times(self): """ Tests that fusion works if the modules to fuse get called multiple times in the same forward. + + Currently, observers are not shared between successive calls of + the same module. 
+ TODO(future PR): make them shared (this is easy to detect) """ class M(torch.nn.Module): def __init__(self): @@ -437,7 +446,10 @@ def forward(self, x): m = M().eval() qconfig = torch.quantization.default_qconfig - self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 2, 2),)) + # fx graph mode quant doesn't support using a single module multiple times + # right now, so this would crash, we can handle this case later + # if it is needed + self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 2, 2),), do_fx_comparison=False) def test_fusion_functions(self): class M(torch.nn.Module): @@ -553,6 +565,7 @@ def forward(self, x): # test backprop does not crash inputs = torch.randn(1, 1, 1, 1) inputs.requires_grad = True + m = M(torch.randn(1, 1, 1, 1), torch.randn(1)).eval() mp = _quantize_dbr.prepare(m, {'': qconfig}, (inputs,)) output = mp(inputs) labels = torch.randn(1, 1, 1, 1) @@ -853,9 +866,10 @@ def forward(self, x): qconfig = torch.quantization.default_qconfig self._test_auto_tracing(model_fp32, qconfig, (torch.randn(1, 1, 2, 2),)) - @unittest.skip('this depends on unsupported syntax detection, currently disabled') def test_vovnet_sequential(self): - + # We cannot quantize SequentialAppendList directly because + # AutoQuantizationStateModuleDict would appear in self.items. + # However, we can wrap it and quantize the wrapper. class SequentialAppendList(nn.Sequential): def __init__(self, *args): super(SequentialAppendList, self).__init__(*args) @@ -870,7 +884,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = torch.cat(concat_list, dim=1) return x - m = SequentialAppendList(torch.nn.Conv2d(1, 1, 1)).eval() + class Wrapper(nn.Module): + def __init__(self, *args): + super().__init__() + self.append_list = SequentialAppendList(*args) + + def forward(self, x): + x = self.append_list(x) + return x + + m = Wrapper(torch.nn.Conv2d(1, 1, 1)).eval() qconfig = torch.quantization.default_qconfig self._test_auto_tracing(m, qconfig, (torch.randn(1, 1, 1, 1),)) @@ -922,10 +945,11 @@ def forward(self, x): model_fp32, qconfig, (torch.randn(1, 1, 2, 2),), fuse_modules=False) - # this is broken because AutoQuantizationState appears in self.items - @unittest.skip('TODO fix this') def test_module_calls_items(self): - class M(torch.nn.ModuleDict): + # We cannot quantize M1 directly because + # AutoQuantizationStateModuleDict would appear in self.items. + # However, we can wrap it and quantize the wrapper. 
+ class M1(torch.nn.ModuleDict): def __init__(self): super().__init__() for i in range(2): @@ -938,10 +962,22 @@ def forward(self, x): layers.append(layer(x)) return torch.cat(layers, dim=1) - model_fp32 = M().eval() + class M2(torch.nn.Module): + def __init__(self): + super().__init__() + self.m1 = M1() + + def forward(self, x): + x = self.m1(x) + return x + + model_fp32 = M2().eval() qconfig = torch.quantization.default_qconfig self._test_auto_tracing( - model_fp32, qconfig, (torch.randn(1, 1, 2, 2),)) + model_fp32, qconfig, (torch.randn(1, 1, 2, 2),), + # TODO(future PR): implement observer sharing for torch.cat + # in DBR quant, to ensure that numerical behavior matches + do_fx_comparison=False) def test_subclass_of_quantizeable_module(self): """ @@ -1280,6 +1316,52 @@ def forward(self, x): input_shape = (1, 1, 1, 1) self._test_serialization(M, input_shape) + def test_jit_tracing_removes_aliases(self): + m = nn.Sequential( + nn.Conv2d(1, 1, 1), + nn.Sequential( + nn.Conv2d(1, 1, 1), + ), + ) + qconfig_dict = {'': torch.quantization.default_qconfig} + example_args = (torch.randn(1, 1, 1, 1),) + mp = _quantize_dbr.prepare(m, qconfig_dict, example_args) + mq = _quantize_dbr.convert(mp) + mqs = torch.jit.trace(mq, example_args) + FileCheck().check_count("aten::alias", 5, exactly=True).run( + mqs.inlined_graph) + res1 = mqs(*example_args) + mqs = remove_redundant_aliases(mqs) + res2 = mqs(*example_args) + self.assertTrue(torch.allclose(res1, res2)) + # TODO(future PR): figure out why aliasing still appears in the inlined + # graph, and if that is fixed then just check the inlined graph. + for graph in ( + mqs.graph, + getattr(mqs, '1').graph, + getattr(getattr(mqs, '1'), '0').graph, + ): + FileCheck().check_count("aten::alias", 0, exactly=True).run(graph) + + def test_conv_int32_reference_model(self): + m = nn.Sequential(nn.Conv2d(1, 1, 1)).eval() + int32_obs_ctr = MinMaxObserver.with_args(dtype=torch.qint32) + int32_qconfig = QConfig(weight=int32_obs_ctr, activation=int32_obs_ctr) + qconfig_dict = {'': int32_qconfig} + mp = _quantize_dbr.prepare(m, qconfig_dict, (torch.randn(1, 1, 1, 1),)) + mp(torch.randn(1, 1, 1, 1)) + mq = _quantize_dbr.convert(mp) + res = mq(torch.randn(1, 1, 1, 1)) + mqt = torch.jit.trace(mq, (torch.randn(1, 1, 1, 1),)) + # verify the right ops are present: + # x0 -> quant -> (dequant -> conv_ref -> quant) -> dequant -> x1 + FileCheck()\ + .check_count("aten::quantize_per_tensor", 2, exactly=True)\ + .run(mqt.graph) + FileCheck()\ + .check_count("aten::dequantize", 2, exactly=True)\ + .run(mqt.graph) + @skipIfNoFBGEMM class TestQuantizeDBRMultipleOps(QuantizeDBRTestCase): """ @@ -1520,3 +1602,18 @@ def test_mobilenet_v2(self): m, qconfig, (torch.randn(1, 3, 224, 224),), # TODO fix this (reason TBD) do_torchscript_checks=False) + + @skip_if_no_torchvision + def test_mobilenet_v2_removes_aliases(self): + import torchvision + m = torchvision.models.__dict__['mobilenet_v2'](pretrained=False)\ + .eval().float() + qconfig_dict = {'': torch.quantization.default_qconfig} + example_args = (torch.randn(1, 3, 224, 224),) + mp = _quantize_dbr.prepare(m, qconfig_dict, example_args) + mq = _quantize_dbr.convert(mp) + mqs = torch.jit.trace(mq, example_args) + res1 = mqs(*example_args) + mqs = remove_redundant_aliases(mqs) + res2 = mqs(*example_args) + self.assertTrue(torch.allclose(res1, res2)) diff --git a/test/quantization/eager/test_numeric_suite_eager.py b/test/quantization/eager/test_numeric_suite_eager.py index 3bf969395c51..3714a1f28c67 100644 --- 
a/test/quantization/eager/test_numeric_suite_eager.py +++ b/test/quantization/eager/test_numeric_suite_eager.py @@ -19,6 +19,8 @@ compare_model_outputs, compare_model_stub, compare_weights, + prepare_model_outputs, + get_matching_activations, ) from torch.testing._internal.common_quantization import ( AnnotatedConvBnReLUModel, @@ -30,6 +32,7 @@ QuantizationTestCase, SingleLayerLinearDynamicModel, test_only_eval_fn, + skip_if_no_torchvision, ) from torch.testing._internal.common_quantized import override_qengines @@ -421,14 +424,12 @@ def test_compare_model_outputs_functional_static(self): q_model(self.img_data_2d[0][0]) q_model = convert(q_model) act_compare_dict = compare_model_outputs(model, q_model, self.img_data_2d[0][0]) - self.assertEqual(len(act_compare_dict), 7) + self.assertEqual(len(act_compare_dict), 5) expected_act_compare_dict_keys = { "mycat.stats", "myadd.stats", "mymul.stats", "myadd_relu.stats", - "my_scalar_add.stats", - "my_scalar_mul.stats", "quant.stats", } self.assertTrue(act_compare_dict.keys() == expected_act_compare_dict_keys) @@ -534,3 +535,50 @@ def test_shadow_logger(self): self.assertEqual(len(logger.stats["float"]), 2) self.assertEqual(len(logger.stats["quantized"]), 2) + + @skip_if_no_torchvision + def _test_vision_model(self, float_model): + float_model.to('cpu') + float_model.eval() + float_model.fuse_model() + float_model.qconfig = torch.quantization.default_qconfig + img_data = [(torch.rand(2, 3, 224, 224, dtype=torch.float), torch.randint(0, 1, (2,), dtype=torch.long)) for _ in range(2)] + qmodel = quantize(float_model, torch.quantization.default_eval_fn, [img_data], inplace=False) + + wt_compare_dict = compare_weights(float_model.state_dict(), qmodel.state_dict()) + + def compute_error(x, y): + Ps = torch.norm(x) + Pn = torch.norm(x - y) + return 20 * torch.log10(Ps / Pn) + + data = img_data[0][0] + # Take in floating point and quantized model as well as input data, and returns a dict, with keys + # corresponding to the quantized module names and each entry being a dictionary with two keys 'float' and + # 'quantized', containing the activations of floating point and quantized model at matching locations. 
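(Editor's aside, not part of the patch.) The compute_error helper above is the signal-to-quantization-noise ratio (SQNR) expressed in decibels, 20 * log10(||x|| / ||x - y||); larger values mean the quantized activations track the float ones more closely. A tiny standalone illustration, with an arbitrary scale and zero point:

import torch

x = torch.randn(1000)
xq = torch.quantize_per_tensor(x, scale=0.05, zero_point=0, dtype=torch.qint8)

# SQNR in dB: 20 * log10(||signal|| / ||signal - reconstruction||)
sqnr = 20 * torch.log10(torch.norm(x) / torch.norm(x - xq.dequantize()))
print(f"SQNR: {sqnr.item():.1f} dB")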
+ act_compare_dict = compare_model_outputs(float_model, qmodel, data) + + + for key in act_compare_dict: + compute_error(act_compare_dict[key]['float'][0], act_compare_dict[key]['quantized'][0].dequantize()) + + prepare_model_outputs(float_model, qmodel) + + for data in img_data: + float_model(data[0]) + qmodel(data[0]) + + # Find the matching activation between floating point and quantized modules, and return a dict with key + # corresponding to quantized module names and each entry being a dictionary with two keys 'float' + # and 'quantized', containing the matching floating point and quantized activations logged by the logger + act_compare_dict = get_matching_activations(float_model, qmodel) + + @skip_if_no_torchvision + def test_mobilenet_v2(self): + from torchvision.models.quantization import mobilenet_v2 + self._test_vision_model(mobilenet_v2(pretrained=True, quantize=False)) + + @skip_if_no_torchvision + def test_mobilenet_v3(self): + from torchvision.models.quantization import mobilenet_v3_large + self._test_vision_model(mobilenet_v3_large(pretrained=True, quantize=False)) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 6587740bdf9e..d06575c51bf2 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -62,6 +62,8 @@ override_qengines, ) from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import skipIfNoCaffe2 + from hypothesis import given from hypothesis import strategies as st import torch.testing._internal.hypothesis_utils as hu @@ -74,6 +76,202 @@ import numpy as np class TestQuantizeEagerOps(QuantizationTestCase): + @override_qengines + def _test_reference_module_impl(self, + float_module_class, + quantized_module_class, + extra_module_kwargs, + input_size): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant1 = QuantStub() + self.dequant1 = DeQuantStub() + self.quant2 = QuantStub() + self.dequant2 = DeQuantStub() + + def forward(self, x): + x = self.quant1(x) + x = self.dequant1(x) + x = self.conv(x) + x = self.quant2(x) + x = self.dequant2(x) + return x + + qengine = torch.backends.quantized.engine + if qengine not in supported_qengines or qengine == 'qnnpack': + return # qnnpack does not support nnq.ConvTranspose3d + + data = torch.randn(*input_size, dtype=torch.float) + original_m = M() + original_ref_m = RefM() + + original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + + original_m.qconfig = torch.quantization.default_qconfig + + m = prepare(original_m) + # calibration + m(data) + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), quantized_module_class) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = torch.quantization.default_qconfig + + ref_m = prepare(original_ref_m) + ref_m(data) + ref_m = convert(ref_m, 
is_reference=True) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + + def test_conv_1d(self): + self._test_reference_module_impl( + nn.Conv1d, + nnq.Conv1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_2d(self): + self._test_reference_module_impl( + nn.Conv2d, + nnq.Conv2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_3d(self): + self._test_reference_module_impl( + nn.Conv3d, + nnq.Conv3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10, 10) + ) + + def test_conv_transpose_1d(self): + self._test_reference_module_impl( + nn.ConvTranspose1d, + nnq.ConvTranspose1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_transpose_2d(self): + self._test_reference_module_impl( + nn.ConvTranspose2d, + nnq.ConvTranspose2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_transpose_3d(self): + self._test_reference_module_impl( + nn.ConvTranspose3d, + nnq.ConvTranspose3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10, 10) + ) + + def test_linear(self): + self._test_reference_module_impl( + nn.Linear, + nnq.Linear, + {'in_features': 5, 'out_features': 10}, + (16, 5) + ) + + @override_qengines + def test_int16_reference_module(self): + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.ConvTranspose2d(1, 1, 1) + self.quant1 = QuantStub() + self.dequant1 = DeQuantStub() + self.quant2 = QuantStub() + self.dequant2 = DeQuantStub() + + def forward(self, x): + x = self.quant1(x) + x = self.dequant1(x) + x = self.conv(x) + x = self.quant2(x) + x = self.dequant2(x) + return x + + + input_size = (16, 1, 10, 10) + data = torch.randn(*input_size, dtype=torch.float) + + original_ref_m = RefM() + rand_w = torch.randn_like(original_ref_m.conv.weight) + rand_b = torch.randn_like(original_ref_m.conv.bias) + original_ref_m.conv.weight = torch.nn.Parameter(rand_w, requires_grad=False) + original_ref_m.conv.bias = torch.nn.Parameter(rand_b, requires_grad=False) + + qengine = torch.backends.quantized.engine + if qengine not in supported_qengines: + return + from torch.ao.quantization.observer import MovingAverageMinMaxObserver + + weight_obs = MovingAverageMinMaxObserver.with_args( + dtype=torch.qint32, + # set qmin and qmax to represent qint16 + quant_min=-1 * (2 ** 15), + quant_max=(2 ** 15) - 1, + qscheme=torch.per_tensor_symmetric, + ) + act_obs = MovingAverageMinMaxObserver.with_args( + dtype=torch.qint32, + quant_min=-1 * (2 ** 15), + quant_max=(2 ** 15) - 1, + ) + custom_qconfig = QConfig(activation=act_obs, weight=weight_obs) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = custom_qconfig + + ref_m = prepare(original_ref_m) + # calibration + ref_m(torch.randn(*input_size, dtype=torch.float)) + + ref_m = convert(ref_m, is_reference=True) + + myobs = MovingAverageMinMaxObserver(averaging_constant=0.5, + dtype=torch.qint32, + # set qmin and qmax to represent qint16 + quant_min=-1 * (2 ** 15), + quant_max=(2 ** 15) - 1, + qscheme=torch.per_tensor_symmetric, + ) + result = myobs(rand_w) + qparams = myobs.calculate_qparams() + self.assertEqual(ref_m.conv.weight_scale, qparams[0]) + + def _test_activation_op_impl( self, float_module_class, quantized_module_class, extra_module_kwargs): """ Implementation for testing common activation ops like leaky relu @@ -815,6 +1013,19 @@ def 
test_convtranspose_per_channel_qconfig_none(self): m[0].qconfig = None mp = torch.ao.quantization.prepare(m) + @skipIfNoFBGEMM + def test_quantwrapper_attaches_qconfig_to_dequant(self): + qconfig = torch.ao.quantization.default_qconfig + + m = nn.Sequential(nn.Conv2d(1, 1, 1)).eval() + for i in range(len(m)): + m[i].qconfig = qconfig + m[i] = torch.ao.quantization.QuantWrapper(m[i]) + + mp = torch.ao.quantization.prepare(m) + mq = torch.ao.quantization.convert(mp) + self.assertTrue(isinstance(mq[0].dequant, nnq.DeQuantize)) + @skipIfNoFBGEMM class TestQuantizeEagerPTQDynamic(QuantizationTestCase): @@ -1250,10 +1461,12 @@ def export_to_onnx(model, input, input_names): model = torch.jit.load(buf) f = io.BytesIO() torch.onnx.export(model, input, f, input_names=input_names, - operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK) + operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, + opset_version=9) onnx_model = export_to_onnx(model, data, input_names) @skipIfNoFBGEMM + @skipIfNoCaffe2 def test_lower_graph_linear(self): model = torch.ao.quantization.QuantWrapper(torch.nn.Linear(5, 10, bias=True)).to(dtype=torch.float) data_numpy = np.random.rand(1, 2, 5).astype(np.float32) @@ -1261,6 +1474,7 @@ def test_lower_graph_linear(self): self._test_lower_graph_impl(model, data) @skipIfNoFBGEMM + @skipIfNoCaffe2 def test_lower_graph_conv2d(self): model = torch.ao.quantization.QuantWrapper(torch.nn.Conv2d(3, 5, 2, bias=True)).to(dtype=torch.float) data_numpy = np.random.rand(1, 3, 6, 6).astype(np.float32) diff --git a/test/quantization/eager/test_quantize_eager_qat.py b/test/quantization/eager/test_quantize_eager_qat.py index efb7882c2dc3..984e87dacbbc 100644 --- a/test/quantization/eager/test_quantize_eager_qat.py +++ b/test/quantization/eager/test_quantize_eager_qat.py @@ -1,5 +1,6 @@ # Owner(s): ["oncall: quantization"] +import copy import math import torch import torch.nn as nn @@ -10,6 +11,7 @@ import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd import torch.nn.qat as nnqat +import torch.nn.intrinsic.qat as nniqat import torch.nn.qat.dynamic as nnqatd from torch.ao.quantization import ( prepare, @@ -21,6 +23,7 @@ default_qconfig, default_qat_qconfig, default_embedding_qat_qconfig, + default_symmetric_qnnpack_qat_qconfig, get_default_qat_qconfig, FixedQParamsFakeQuantize, FusedMovingAvgObsFakeQuantize, @@ -37,6 +40,7 @@ ManualDropoutQATModel, ManualLinearDynamicQATModel, ManualConvLinearQATModel, + ManualConvLinearSymmQATModel, ManualEmbeddingBagLinear, TwoLayerLinearModel, test_only_eval_fn, @@ -49,6 +53,8 @@ override_qengines, ) +from torch.testing._internal.common_utils import skipIfNoXNNPACK + from hypothesis import given from hypothesis import strategies as st import torch.testing._internal.hypothesis_utils as hu @@ -338,11 +344,45 @@ def checkQuantized(model): model = quantize_qat(model, test_only_train_fn, [self.img_data_2d_train]) checkQuantized(model) + @skipIfNoXNNPACK + def test_conv_linear_symm(self): + r"""Same as test_conv_linear but with Symmetric quantization. 
+ Supported only with qengine=qnnpack, which uses symmetric + kernels from xnnpack library.""" + for qengine in supported_qengines: + if qengine != 'qnnpack': + continue + with override_quantized_engine(qengine): + model = ManualConvLinearSymmQATModel() + + model = prepare_qat(model) + self.checkObservers(model) + + test_only_train_fn(model, self.img_data_2d_train) + model = convert(model) + + def checkQuantized(model): + self.assertEqual(type(model.conv), nnq.Conv2d) + self.assertEqual(type(model.fc1), nnq.Linear) + self.assertEqual(type(model.fc2), nnq.Linear) + test_only_eval_fn(model, self.img_data_2d) + self.checkScriptable(model, self.img_data_2d) + self.checkNoQconfig(model) + + checkQuantized(model) + + model = ManualConvLinearSymmQATModel() + model = quantize_qat(model, test_only_train_fn, [self.img_data_2d_train]) + checkQuantized(model) + def test_dynamic_qat_linear(self): for qengine in supported_qengines: with override_quantized_engine(qengine): # Dynamic QAT without memoryless observers should fail - with self.assertRaisesRegex(ValueError, "Dynamic QAT requires a memoryless observer"): + with self.assertRaisesRegex(ValueError, + "Dynamic QAT requires a memoryless observer." + + "This means a MovingAverage observer with averaging constant equal to 1" + ): model = ManualLinearDynamicQATModel(default_qat_qconfig) model = prepare_qat(model, mapping={torch.nn.Linear: nnqatd.Linear}) @@ -984,6 +1024,66 @@ def test_conv_bn_folded_vs_unfolded( qat_op_optim.step() qat_ref_op_optim.step() + @override_qengines + def test_linear_bn_numerics(self): + qengine = torch.backends.quantized.engine + m_ref = nn.Sequential( + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + m_ref_copy = copy.deepcopy(m_ref) + m_ref_copy = torch.ao.quantization.fuse_modules_qat(m_ref_copy, [['0', '1']]) + qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine) + m_ref_copy[0].qconfig = qconfig + m = nniqat.LinearBn1d.from_float(m_ref_copy[0]) + + # without fake_quants, fused QAT module should match fp32 module + m.apply(torch.quantization.disable_fake_quant) + data = torch.randn(4, 4) + r1 = m_ref(data) + r2 = m(data) + self.assertTrue(torch.allclose(r1, r2)) + + @skipIfNoXNNPACK + @override_qengines + def test_linear_bn_symm_numerics(self): + qengine = torch.backends.quantized.engine + if qengine != "qnnpack": + return # Only qnnpack support symmetric quantization + m_ref = nn.Sequential( + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + m_ref_copy = copy.deepcopy(m_ref) + m_ref_copy = torch.ao.quantization.fuse_modules_qat(m_ref_copy, [['0', '1']]) + qconfig = default_symmetric_qnnpack_qat_qconfig + m_ref_copy[0].qconfig = qconfig + m = nniqat.LinearBn1d.from_float(m_ref_copy[0]) + + # without fake_quants, fused QAT module should match fp32 module + m.apply(torch.quantization.disable_fake_quant) + data = torch.randn(4, 4) + r1 = m_ref(data) + r2 = m(data) + self.assertTrue(torch.allclose(r1, r2)) + + @override_qengines + def test_linear_bn_workflow(self): + qengine = torch.backends.quantized.engine + m = nn.Sequential( + QuantStub(), + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + data = torch.randn(4, 4) + m.qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine) + m = torch.ao.quantization.fuse_modules_qat(m, [['1', '2']]) + mp = prepare_qat(m) + mp(data) + mq = convert(mp) + self.assertTrue(type(mq[1]) == nnq.Linear) + self.assertTrue(type(mq[2]) == nn.Identity) + if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" "\tpython test/test_quantization.py 
TESTNAME\n\n" diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py index 145b2af81b37..4559c6389be6 100644 --- a/test/quantization/fx/test_numeric_suite_fx.py +++ b/test/quantization/fx/test_numeric_suite_fx.py @@ -71,6 +71,8 @@ extract_shadow_logger_info, extend_logger_results_with_comparison, ) +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers # Note: these models are not for use outside of this file. While it's good @@ -274,7 +276,19 @@ def _wrapped_sigmoid(x): def _wrapped_linear(x, w, b): return F.linear(x, w, b) - +def get_all_quant_patterns(): + """ we are in the process to migrate the frontend of fx graph mode quant + to use backend_config_dict, so some of the patterns are moved to backend_config_dict + this function will include these patterns so that we can still have all the patterns + """ + # TODO: we can remove this call, and get all patterns from backend_config_dict in + # the future when the frontend refactor is done in fx graph mode quantization + all_quant_patterns = get_default_quant_patterns() + # some of the patterns are moved to (native) backend_config_dict so we need to + # add them back here + for pattern, quantize_handler in get_pattern_to_quantize_handlers(get_native_backend_config_dict()).items(): + all_quant_patterns[pattern] = quantize_handler + return all_quant_patterns class TestFXGraphMatcher(QuantizationTestCase): @@ -463,7 +477,6 @@ def forward(self, x0): self.assert_types_for_matched_subgraph_pairs(results, expected_types, mp, mq) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, need dtype inference support") def test_nodes_with_equal_types_get_matched(self): class M(nn.Module): def __init__(self): @@ -510,13 +523,12 @@ def forward(self, x): conv_name_0: ((nn.Conv2d, torch.ao.quantization.MinMaxObserver), (nn.Conv2d, nn.Conv2d)), mul_name_0: ((torch.mul, torch.ao.quantization.MinMaxObserver), (toq.mul, toq.mul)), - relu_name_0: ((F.relu, torch.ao.quantization.MinMaxObserver), (F.relu, F.relu)), + relu_name_0: ((F.relu, torch.ao.quantization.FixedQParamsObserver), (F.relu, F.relu)), sigmoid_name_0: - ((torch.sigmoid, torch.sigmoid), (torch.sigmoid, torch.sigmoid)), + ((torch.sigmoid, torch.ao.quantization.FixedQParamsObserver), (torch.sigmoid, torch.sigmoid)), } self.assert_types_for_matched_subgraph_pairs(results, expected_types, mp, mq) - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, need dtype inference support") def test_methods(self): """ Verify that graph matching works on methods @@ -537,12 +549,11 @@ def forward(self, x): base_name_to_sets_of_related_ops, torch.sigmoid) + '_0' expected_types = { sigmoid_name_0: - (('sigmoid', 'sigmoid'), ('sigmoid', 'sigmoid')), + (('sigmoid', torch.ao.quantization.FixedQParamsObserver), ('sigmoid', torch.ao.quantization.FixedQParamsObserver)), } self.assert_types_for_matched_subgraph_pairs( results, expected_types, m1p, m2p) - def test_op_relationship_mapping(self): """ Tests that the mapping of op relationships is complete. 
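To make the merge performed by `get_all_quant_patterns` above concrete, here is a small sketch (using the two backend-config helpers imported above plus the `get_default_quant_patterns` already imported in this test file; variable names are illustrative only) of how one could inspect which patterns now come only from the native backend config:

from torch.ao.quantization.backend_config import get_native_backend_config_dict
from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers

legacy_patterns = get_default_quant_patterns()  # imported at the top of this test file
native_patterns = get_pattern_to_quantize_handlers(get_native_backend_config_dict())

# Patterns served only by backend_config_dict; these are the entries that
# get_all_quant_patterns adds back so the tests still see the full set.
migrated_only = set(native_patterns) - set(legacy_patterns)
print(f"{len(migrated_only)} patterns are provided only by the native backend config")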
@@ -559,6 +570,15 @@ def test_op_relationship_mapping(self): torch.ao.quantization.QuantStub, torch.ao.quantization.DeQuantStub, nnq.FloatFunctional, + # the ConvTranspose3d swap is not implemented in FX Graph + # mode quantization yet + nn.ConvTranspose3d, + # the GroupNorm swap is not implemented in FX Graph + # mode quantization yet + nn.GroupNorm, + # nnq.ReLU6 is no longer swapped, because nn.ReLU6 can + # take quantized inputs + nn.ReLU6, ) if fp32_type in types_to_skip: continue @@ -620,7 +640,7 @@ def _op_is_unmatchable(op): op in METHS_UNMATCHABLE ) - default_quant_patterns = get_default_quant_patterns() + default_quant_patterns = get_all_quant_patterns() for pattern, qhandler_cls in default_quant_patterns.items(): base_op = None if isinstance(pattern, tuple): @@ -664,9 +684,6 @@ def _op_is_unmatchable(op): # RNNDynamicQuantizeHandler pass elif qhandler_cls == qp.DefaultNodeQuantizeHandler: - # torch.sum does not have quantized equivalents - if base_op == torch.sum: - continue self.assertTrue( _op_in_base_sets_of_related_ops(base_op), f"{base_op} not in sets of related ops") @@ -682,8 +699,23 @@ def _op_is_unmatchable(op): _op_in_base_sets_of_related_ops(base_op), f"{base_op} not in sets of related ops") else: - raise AssertionError( - f"handing for {qhandler_cls} not implemented") + # torch.sum does not have quantized equivalents + if base_op in [ + torch.sum, + nn.GRUCell, + nn.GRU, + nn.LSTMCell, + nn.RNNCell, + ]: + continue + if isinstance(base_op, tuple): + # skip fusion patterns + continue + # didn't match explicit quantize handler class, we can check if the + # operator is in the related op set directly + if not (_op_in_base_sets_of_related_ops(base_op) or _op_is_unmatchable(base_op)): + raise AssertionError( + f"handling for {qhandler_cls} for op {base_op} not implemented") @skipIfNoFBGEMM def test_user_defined_function(self): @@ -1106,8 +1138,6 @@ def _test_add_shadow_loggers_mod_impl(self, prepare_fn=prepare_fx): prepare_fn=prepare_fn, qconfig_dict=qconfig_dict) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, enable after" - "dtype inference is supported") def test_add_shadow_loggers_mod_ptq(self): self._test_add_shadow_loggers_mod_impl(prepare_fn=prepare_fx) @@ -1133,8 +1163,6 @@ def test_add_shadow_loggers_fun_qat(self): self._test_add_shadow_loggers_fun_impl(prepare_fn=prepare_qat_fx) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, enable after" - "dtype inference is supported") def test_add_shadow_loggers_meth_ptq(self): """ Verify that add_loggers works on methods @@ -1147,34 +1175,10 @@ def forward(self, x): m = M().eval() res = self._test_match_shadow_activations( m, (torch.randn(4, 4),), - results_len=1) - - @skipIfNoFBGEMM - def test_add_shadow_loggers_multiple_dtype_casts(self): - """ - Verifies that for nodes where the first input arg is a list, - such as `cat`, we insert an individual dtype cast for each - arg of the list. 
- """ - class M(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.cat([x, x, x], dim=0) - return x - - m = M().eval() - expected_occurrence = { - # 3 dequantize function calls from the 3 dtype casts for [x, x, x] - ns.call_module(torch.nn.Identity): 3, - # 1 dequantize method call for module output - ns.call_method("dequantize"): 1, - } - self._test_match_shadow_activations( - m, (torch.randn(4, 4),), - prepared_expected_node_occurrence=expected_occurrence, - results_len=1, compare_fp32_vs_fp32_prepared=False) + # For now, sigmoid is not supported for shadowing because the dtype + # inference for it is not implemented yet. So, this is just testing + # that shadowing models with method calls does not crash. + results_len=0) @skipIfNoFBGEMM def test_shadow_activations_fqn(self): @@ -1215,7 +1219,7 @@ def forward(self, x): m = M().eval() self._test_match_shadow_activations( m, (torch.randn(1, 1, 4, 4),), - results_len=2, + results_len=1, should_log_inputs=True) @skipIfNoFBGEMM @@ -1301,7 +1305,6 @@ def test_linear_fp16_vs_linear_fp16_shadow_activations(self): @skipIfNoFBGEMM - @unittest.skip("TODO: broken by https://github.com/pytorch/pytorch/pull/61687, will enable later") def test_op_with_either_fp32_or_int8_input(self): """ Verify that shadowing works with ops which accept either fp32 or @@ -1320,7 +1323,9 @@ def forward(self, x): m = M() res = self._test_match_shadow_activations( m, (torch.randn(4, 4),), - results_len=2) + # Note: shadowing relu by itself is currently not supported, + # this test is just testing that it does not crash + results_len=0) def _test_int8_shadows_int8_impl(self, m): """ @@ -1488,6 +1493,15 @@ def test_op_io_dtype_coverage(self): # makes sense nn.Embedding, nn.EmbeddingBag, + # the ConvTranspose3d swap is not implemented in FX Graph + # mode quantization yet + nn.ConvTranspose3d, + # the GroupNorm swap is not implemented in FX Graph + # mode quantization yet + nn.GroupNorm, + # nnq.ReLU6 is no longer swapped, because nn.ReLU6 can + # take quantized inputs + nn.ReLU6, ) if fp32_type in types_to_skip: continue @@ -1534,7 +1548,7 @@ def test_op_io_dtype_coverage(self): # 4. go through the ops mapped to each QuantizeHandler type, and verify # correctness. - default_quant_patterns = get_default_quant_patterns() + default_quant_patterns = get_all_quant_patterns() for pattern, qhandler_cls in default_quant_patterns.items(): base_op = None if isinstance(pattern, tuple): @@ -1585,14 +1599,35 @@ def test_op_io_dtype_coverage(self): self.assertTrue( (base_op in FUNS_IO_TYPE_FP32_OR_INT8) or (base_op in MODS_IO_TYPE_FP32_OR_INT8) or - (base_op in METHS_IO_TYPE_FP32_OR_INT8), + (base_op in METHS_IO_TYPE_FP32_OR_INT8) or + # Softmax has a different signature for the quantized + # version, so it does not fit into the cases above. 
+ (base_op is torch.nn.Softmax), f"missing IO type handling for {base_op}") elif qhandler_cls == qp.EmbeddingQuantizeHandler: # embedding shadowing is not implemented, for now continue else: - raise AssertionError( - f"handing for {qhandler_cls} not implemented") + if ( + base_op in FUNS_UNMATCHABLE or + base_op in MODS_UNMATCHABLE or + base_op in METHS_UNMATCHABLE + ): + continue + if qhandler_cls(None, {}).is_general_tensor_value_op(): + self.assertTrue( + (base_op in FUNS_IO_TYPE_FP32_OR_INT8) or + (base_op in MODS_IO_TYPE_FP32_OR_INT8) or + (base_op in METHS_IO_TYPE_FP32_OR_INT8), + f"missing IO type handling for {base_op} using {qhandler_cls}") + else: + self.assertTrue( + (base_op in FUNS_IO_TYPE_FP32_OR_INT8) or + (base_op in MODS_IO_TYPE_FP32_OR_INT8) or + (base_op in METHS_IO_TYPE_FP32_OR_INT8) or + (base_op in FUNS_IO_TYPE_FP32) or + (base_op in MODS_IO_TYPE_FP32), + f"missing IO type handling for {base_op} using {qhandler_cls}") @skipIfNoFBGEMM def test_user_defined_function(self): @@ -1689,8 +1724,6 @@ def forward(self, x): self.assert_ns_compare_dict_valid(act_compare_dict) @skipIfNoFBGEMM - @unittest.skip("Broken by https://github.com/pytorch/pytorch/pull/62608, enable after" - "dtype inference is supported") def test_layer_names(self): m = nn.Sequential( nn.Conv2d(1, 1, 1), @@ -1822,7 +1855,7 @@ def forward(self, x): mp_shadows_mq(torch.randn(1, 1, 1, 1)) act_compare_dict = extract_shadow_logger_info( mp_shadows_mq, OutputLogger, 'fp32') - self.assertTrue(len(act_compare_dict) == 4) + self.assertTrue(len(act_compare_dict) == 3) self.assert_ns_compare_dict_valid(act_compare_dict) @skipIfNoFBGEMM @@ -1905,6 +1938,58 @@ def test_add_shadow_loggers_cuda(self): extend_logger_results_with_comparison( act_compare_dict, 'a', 'b', compute_sqnr, 'sqnr') + def test_fp16_shadows_fp32(self): + m = LinearReluFunctional().eval() + qconfig_dict = {"": torch.ao.quantization.float16_static_qconfig} + mp = prepare_fx(copy.deepcopy(m), qconfig_dict) + mq = convert_fx(mp, is_reference=True) + mq_shadows_m = add_shadow_loggers('a', mq, 'b', m, OutputLogger) + + def test_mul_add_cat_stack_skips_shadowing(self): + class M(nn.Module): + def forward(self, x): + x = x * x + x = torch.mul(x, x) + x = x + x + x = torch.add(x, x) + x = torch.cat([x]) + x = torch.stack([x]) + return x + + m = M().eval() + self._test_match_shadow_activations( + m, (torch.randn(1, 1, 4, 4),), + results_len=0) + + def test_op_with_only_kwargs_skips_shadowing(self): + class M(nn.Module): + def forward(self, x): + x = torch.cat(tensors=[x]) + x = torch.stack(tensors=[x]) + return x + + m = M().eval() + self._test_match_shadow_activations( + m, (torch.randn(1, 1, 4, 4),), + results_len=0) + + def test_unsupported_op_copy_skips_shadowing(self): + """ + Copying a `call_function` node is not implemented; test that this + does not crash shadowing but instead skips the node.
+ """ + class M(nn.Module): + def forward(self, x): + # the second argument leads to attempting to copy a + # call_function node + x = F.layer_norm(x, x.shape[1:]) + return x + + m = M().eval() + self._test_match_shadow_activations( + m, (torch.randn(1, 1, 4, 4),), + results_len=0) + class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase): """ @@ -2038,12 +2123,11 @@ def test_sparsenn_shadow(self): x = torch.randn(2, 4) self._test_match_shadow_activations( sparse_nn, (idx, offsets, x), - results_len=4, + results_len=3, should_log_inputs=should_log_inputs) @skip_if_no_torchvision @skipIfNoFBGEMM - @unittest.skip("TODO: broken by https://github.com/pytorch/pytorch/pull/61687, will enable later") def test_resnet18(self): import torchvision m = torchvision.models.quantization.resnet18(pretrained=False, quantize=False).eval() @@ -2055,7 +2139,6 @@ def test_resnet18(self): @skip_if_no_torchvision @skipIfNoFBGEMM - @unittest.skip("TODO: broken by https://github.com/pytorch/pytorch/pull/61687, will enable later") def test_mobilenet_v2(self): import torchvision m = torchvision.models.quantization.mobilenet_v2(pretrained=False, quantize=False).eval() diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 20bf20ea4027..27a83c5e7874 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -11,6 +11,7 @@ import torch.nn.intrinsic.quantized as nniq import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.multiprocessing as mp +from torch.ao.quantization import is_activation_post_process # graph mode quantization based on fx from torch.ao.quantization.quantize_fx import ( @@ -40,6 +41,7 @@ default_qconfig, default_dynamic_qconfig, default_qat_qconfig, + default_reuse_input_qconfig, per_channel_dynamic_qconfig, float16_dynamic_qconfig, float16_static_qconfig, @@ -48,6 +50,7 @@ get_default_qconfig, get_default_qat_qconfig, get_default_qconfig_dict, + get_default_qat_qconfig_dict, fuse_modules, fuse_modules_qat, prepare, @@ -77,14 +80,16 @@ get_default_output_activation_post_process_map ) +from torch.ao.quantization.fx.utils import NodeInfo + from torch.ao.quantization.fake_quantize import ( - default_affine_fixed_qparams_fake_quant, - default_symmetric_fixed_qparams_fake_quant, + default_fixed_qparams_range_0to1_fake_quant, + default_fixed_qparams_range_neg1to1_fake_quant, ) from torch.ao.quantization.observer import ( - default_affine_fixed_qparams_observer, - default_symmetric_fixed_qparams_observer, + default_fixed_qparams_range_0to1_observer, + default_fixed_qparams_range_neg1to1_observer, ) # test utils @@ -130,7 +135,9 @@ import operator import unittest import io -from typing import Callable, Optional +from typing import Callable, Optional, List + + TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1' @@ -475,6 +482,195 @@ def forward(self, x): }) self.checkGraphModuleNodes(m, expected_node=ns.call_module(MyConvReLU)) + def test_fuse_custom_pattern(self): + class M(torch.nn.Module): + def __init__(self, use_torch_add=True): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3) + self.bn = torch.nn.BatchNorm2d(3) + self.relu = torch.nn.ReLU() + self.maxpool = torch.nn.MaxPool2d(3) + if use_torch_add: + self.add = torch.add + else: + self.add = operator.add + + def forward(self, x): + y = x + y = self.maxpool(x) + x = self.conv(x) + x = self.bn(x) + x = self.add(y, x) + x = self.relu(x) + return x + + for use_torch_add in [True, False]: + m = 
M(use_torch_add).eval() + + def fuse_conv_bn_relu(is_qat, relu, add_pattern): + _, _, bn_pattern = add_pattern + bn, conv = bn_pattern + return conv + + conv_bn_res_relu_config1 = { + "pattern": (nn.ReLU, (torch.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))), + "fuser_method": fuse_conv_bn_relu, + } + + conv_bn_res_relu_config2 = { + "pattern": (nn.ReLU, (operator.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))), + "fuser_method": fuse_conv_bn_relu, + } + + backend_config_dict = { + "configs": [conv_bn_res_relu_config1, conv_bn_res_relu_config2] + } + m = fuse_fx(m, backend_config_dict=backend_config_dict) + self.assertEqual(type(m.conv), torch.nn.Conv2d) + # check bn and relu are gone since we replaced the whole pattern with conv + self.assertFalse(hasattr(m, "bn")) + self.assertFalse(hasattr(m, "relu")) + + def test_fusion_pattern_with_multiple_inputs(self): + """ This test tests two keys in backend_config_dict: root_node_getter and + extra_inputs_getter. + root_node_getter is used to identify a "root" module in the node pattern, + the node that we'll keep after fusion. + extra_inputs_getter will return a list of nodes that need to be added to the + fused node as extra inputs. + """ + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3) + self.bn = torch.nn.BatchNorm2d(3) + self.relu = torch.nn.ReLU() + self.maxpool = torch.nn.MaxPool2d(3) + + def forward(self, x): + y = x + y = self.maxpool(x) + x = self.conv(x) + x = self.bn(x) + x = torch.add(x, y) + x = self.relu(x) + return x + + m = M().eval() + + def fuse_conv_bn_relu(is_qat, relu, add_pattern): + _, bn_pattern, _ = add_pattern + bn, conv = bn_pattern + return conv + + def conv_bn_res_relu_root_node_getter(pattern): + relu, add_pattern = pattern + _, bn_pattern, _ = add_pattern + bn, conv = bn_pattern + return conv + + def conv_bn_res_relu_extra_inputs_getter(pattern): + """ get inputs pattern for extra inputs, inputs for root node + are assumed to be copied over from root node to the fused node + """ + relu, add_pattern = pattern + _, bn_pattern, extra_input = add_pattern + bn, conv = bn_pattern + return [extra_input] + + conv_bn_res_relu_config = { + "pattern": (nn.ReLU, (torch.add, (nn.BatchNorm2d, nn.Conv2d), MatchAllNode)), + "fuser_method": fuse_conv_bn_relu, + "root_node_getter": conv_bn_res_relu_root_node_getter, + "extra_inputs_getter": conv_bn_res_relu_extra_inputs_getter + } + + backend_config_dict = { + "configs": [conv_bn_res_relu_config], + } + m = fuse_fx(m, backend_config_dict=backend_config_dict) + self.assertEqual(type(m.conv), torch.nn.Conv2d) + # check bn and relu are gone since we replaced the whole pattern with conv + self.assertFalse(hasattr(m, "bn")) + self.assertFalse(hasattr(m, "relu")) + + # check conv module has two inputs + named_modules = dict(m.named_modules()) + for node in m.graph.nodes: + if node.op == "call_module" and type(named_modules[node.target]) == torch.nn.Conv2d: + self.assertTrue(len(node.args) == 2, "Expecting the fused op to have two arguments") + + def test_fusion_pattern_with_matchallnode(self): + """This test tests that the node matched by MatchAllNode will be regarded as an input + instead of a module to be fused. For instance, we have two patterns: + (nn.ReLU, (torch.add, MatchAllNode, nn.Conv2d)) + (nn.ReLU, nn.Conv2d) + And we want to fuse the following model + Conv2d -> ReLU + + Conv2d ------ Add -> ReLU + ReLU in the first row is matched as MatchAllNode in the residual pattern. But it won't be + fused as part of that pattern.
It needs to be properly fused with the upstream Conv2d. + """ + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 3, 3) + self.relu1 = torch.nn.ReLU() + self.conv2 = torch.nn.Conv2d(3, 3, 3) + self.relu2 = torch.nn.ReLU() + + def forward(self, x): + y = self.conv1(x) + y = self.relu1(y) + + x = self.conv2(x) + x = torch.add(x, y) + x = self.relu2(x) + return x + + m = M().eval() + + def fuse_conv_relu(is_qat, relu, conv): + return conv + + def fuse_conv_res_relu(is_qat, relu, add_pattern): + _, conv, _ = add_pattern + return conv + + def conv_res_relu_root_node_getter(pattern): + relu, (_, conv, _) = pattern + return conv + + def conv_res_relu_extra_inputs_getter(pattern): + relu, (_, _, extra_input) = pattern + return [extra_input] + + conv_relu_config = { + "pattern": (nn.ReLU, nn.Conv2d), + "fuser_method": fuse_conv_relu, + } + conv_res_relu_config = { + "pattern": (nn.ReLU, (torch.add, nn.Conv2d, MatchAllNode)), + "fuser_method": fuse_conv_res_relu, + "root_node_getter": conv_res_relu_root_node_getter, + "extra_inputs_getter": conv_res_relu_extra_inputs_getter, + } + + backend_config_dict = { + "configs": [ + conv_relu_config, + conv_res_relu_config, + ], + } + m = fuse_fx(m, backend_config_dict=backend_config_dict) + self.assertEqual(type(m.conv1), torch.nn.Conv2d) + self.assertEqual(type(m.conv2), torch.nn.Conv2d) + # check relu are gone since we replaced the both patterns to conv + self.assertFalse(hasattr(m, "relu1")) + self.assertFalse(hasattr(m, "relu2")) + + @skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): def test_pattern_match(self): @@ -826,7 +1022,7 @@ def forward(self, x): qconfig_dict = {'': qconfig} prepared = prepare_fx(m, qconfig_dict) quantized = convert_fx(prepared, is_reference=True) - qparams = (quantized._input_scale_0, quantized._input_zero_point_0) + qparams = (quantized._scale_0, quantized._zero_point_0) weight_obs = qconfig.weight() weight_obs(quantized.weight) # Get the actual value to avoid tensor size mismatch error, torch.Size([]) vs torch.Size([1]) @@ -834,6 +1030,8 @@ def forward(self, x): self.assertEqual(qparams, ref_qparams) def test_conv_bn_relu(self): + """ Tests fusion and quantization for "Conv - Bn" and "Conv - Bn - ReLU" + """ convs = { 1: nn.Conv1d, 2: nn.Conv2d, @@ -874,8 +1072,7 @@ def forward(self, x): x = self.dequant(x) return x - # TODO: add 1d support - options = itertools.product([2, 3], [True, False], self.static_quant_types) + options = itertools.product([1, 2, 3], [True, False], self.static_quant_types) for dim, has_relu, quant_type in options: expected_node = ns.call_module( quantized_conv_relus[dim] if has_relu @@ -912,11 +1109,56 @@ def forward(self, x): fuse_modules(m_eager, fuse_list, inplace=True) m_eager.qconfig = qconfig m_eager = prepare_fn(m_eager) + prepared_fx = result_dict["prepared"] + m_eager(*self.img_data_dict[dim][0]) m_eager = convert(m_eager) result_eager = m_eager(*self.img_data_dict[dim][0]) self.assertEqual(result, result_eager) + def test_linear_bn(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(4, 4) + self.bn = nn.BatchNorm1d(4) + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.linear(x) + x = self.bn(x) + x = self.dequant(x) + return x + + data = (torch.randn(4, 4),) + for quant_type in self.static_quant_types: + expected_node = ns.call_module(nnq.Linear) + m = M() + m_eager = copy.deepcopy(m) + result_dict = 
self.checkGraphModeFxOp(m, data, quant_type, expected_node=expected_node) + result = result_dict["quantized_output"] + + # check numerics vs eager mode + fuse_list = ["linear", "bn"] + qengine = torch.backends.quantized.engine + if quant_type == QuantType.STATIC: + m_eager.eval() + qconfig = get_default_qconfig(qengine) + prepare_fn = prepare + fuse_modules(m_eager, fuse_list, inplace=True) + else: + m_eager.train() + qconfig = get_default_qat_qconfig(qengine) + prepare_fn = prepare_qat + fuse_modules_qat(m_eager, fuse_list, inplace=True) + m_eager.qconfig = qconfig + m_eager = prepare_fn(m_eager) + m_eager(*data) + m_eager = convert(m_eager) + result_eager = m_eager(*data) + self.assertEqual(result, result_eager) @skipIfNoFBGEMM def test_dynamic_quant_fp16(self): @@ -1536,6 +1778,49 @@ def forward(self, x): self.checkGraphModuleNodes(m, expected_node_list=node_list) + def test_qconfig_dict_with_fused_modules(self): + class LinearReLUModel(torch.nn.Module): + def __init__(self, relu): + super(LinearReLUModel, self).__init__() + self.linear = torch.nn.Linear(3, 3) + self.relu = relu + + def forward(self, x): + x = self.linear(x) + x = self.relu(x) + return x + + class ConvReLUModel(torch.nn.Module): + def __init__(self, relu): + super(ConvReLUModel, self).__init__() + self.conv = torch.nn.Conv1d(3, 3, 3) + self.relu = relu + + def forward(self, x): + x = self.conv(x) + x = self.relu(x) + return x + + class ConvBnReLUModel(torch.nn.Module): + def __init__(self, relu): + super(ConvBnReLUModel, self).__init__() + self.conv = torch.nn.Conv1d(3, 3, 3) + self.bn = torch.nn.BatchNorm1d(3) + self.relu = relu + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + for model in [LinearReLUModel, ConvReLUModel, ConvBnReLUModel]: + for relu in [torch.nn.ReLU(), torch.nn.functional.relu, torch.relu]: + m = model(relu).eval() + qconfig_dict = torch.ao.quantization.get_default_qconfig_dict("fbgemm") + # should not crash as in https://github.com/pytorch/pytorch/issues/75825 + prepare_fx(m, qconfig_dict) + def test_qconfig_dict_validity(self): r""" Verifies that if a user passes an invalid key or makes a typo when @@ -1770,7 +2055,7 @@ def forward(self, x): def assertAttrPreserved(m): self.assertTrue(hasattr(m, "preserved_attr")) - self.assertTrue(m.preserved_attr, 3) + self.assertEqual(m.preserved_attr, 3) assertAttrPreserved(m) convert_custom_config_dict = { @@ -2004,6 +2289,88 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + @skipIfNoFBGEMM + def test_custom_module_class_input_has_multiple_users(self): + """ Tests that the flow still works when the input of custom module + has multiple users + """ + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 3) + + def forward(self, x): + return self.linear(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.linear) + observed.qconfig = float_module.qconfig + return observed + + class StaticQuantCustomModule(torch.nn.Module): + def __init__(self, linear): + super().__init__() + self.linear = linear + + def forward(self, x): + return self.linear(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 
'activation_post_process') + observed_module.linear.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Linear.from_float(observed_module.linear)) + return quantized + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 3) + self.custom = CustomModule() + + def forward(self, x0): + x1 = self.custom(x0) + x2 = self.linear(x0) + return x1 + x2 + + prepare_custom_config_dict = { + "float_to_observed_custom_module_class": { + "static": { + CustomModule: ObservedCustomModule + } + } + } + convert_custom_config_dict = { + "observed_to_quantized_custom_module_class": { + "static": { + ObservedCustomModule: StaticQuantCustomModule + } + } + } + m = M().eval() + m = prepare_fx( + m, + {"": default_qconfig}, + prepare_custom_config_dict=prepare_custom_config_dict) + # make sure it works + m = convert_fx( + m, + convert_custom_config_dict=convert_custom_config_dict) + # make sure it runs + m(torch.randn(3, 3)) + @skipIfNoFBGEMM def test_non_traceable_module(self): class NonTraceable(torch.nn.Module): @@ -2305,12 +2672,13 @@ def forward(self, x): self.assertTrue( set(scripted_keys) == set(non_packed_weight_keys), "Expected the scripted model to preserve the state_dict for non-packed weight attributes") + # TODO: probably don't want to hardcode the attribute names, since they are generated for attr_name in [ "mods1_0_input_scale_0", "mods1_0_input_zero_point_0", - "mods1_0_scale_0", "mods1_0_zero_point_0", - "mods1_1_scale_0", "mods1_1_zero_point_0", - "mods2_scale_0", "mods2_zero_point_0"]: - self.assertTrue(hasattr(m, attr_name)) + "mods1_0_scale_1", "mods1_0_zero_point_1", + "mods1_1_scale_1", "mods1_1_zero_point_1", + "mods2_scale_1", "mods2_zero_point_1"]: + self.assertTrue(hasattr(m, attr_name), attr_name + " not found.") @skipIfNoFBGEMM def test_packed_weight_fused_op(self): @@ -2423,6 +2791,234 @@ def forward(self, x): mp(torch.rand(4, 4, 4, 4)) mc = convert_fx(mp) + class _NonReferenceTestModel(nn.Module): + def __init__(self, func, lin_in, lin_out): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.lin = nn.Linear(lin_in, lin_out) + self.func = func + + def forward(self, x, y, z): + x = self.pool(F.relu(self.conv1(x))) + x = torch.flatten(x, 1) + x = self.func(x, y, z) + x = self.lin(x) + return x + + # This function looks at the node specified by the NodeInfo in the key of + # node_info_to_non_tensor_args and checks that the args at specified indices + # are not observed (since they are non tensors). If the args at those indices + # are a tuple/list (which do not show up as nodes) the function checks the + # individual elements of the tuple/list recursively. 
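For context, the mapping these helpers consume uses `NodeInfo(op, target)` records as keys and lists of argument indices as values; the indexed arguments are the non-float-tensor inputs that must not be wrapped in observers. A small illustrative mapping, mirroring the tests below, might look like this:

from torch.ao.quantization.fx.utils import NodeInfo  # same import as used at the top of this file

# For `x.masked_fill(mask, value)`, the boolean mask (arg 1) and the fill value (arg 2)
# are not float tensors, so neither should receive an observer during prepare_fx.
node_info_to_non_tensor_args = {
    NodeInfo("call_method", "masked_fill"): [1, 2],
    NodeInfo("call_method", "reshape"): [2],
}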
+ def _check_not_observed(self, model, node_info_to_non_tensor_args): + + # this is a helper function (for easier recursion) that checks whether + # arg_node is observed + def _check_node_not_observed(model, arg_node, node): + if isinstance(arg_node, tuple) or isinstance(arg_node, list): + for new_node in arg_node: + _check_node_not_observed(model, new_node, node) + elif arg_node.op == "call_module": + self.assertTrue( + not is_activation_post_process(getattr(model, arg_node.target)), + "Arg: {0} of node: {1} is observed but is not a float tensor".format( + arg_node, node + ), + ) + + for node in model.graph.nodes: + indices = node_info_to_non_tensor_args.get( + NodeInfo(node.op, node.target), [] + ) + for index in indices: + if index < len(node.args): + arg_node = node.args[index] + _check_node_not_observed(model, arg_node, node) + + # This test checks that the model gets prepared correct, doesn't have observers + # on specific ops (see _check_not_observed) and that the prepared model runs + def _test_dtype_propagation(self, model, node_info_to_non_tensor_args, *args): + model.eval() + qconfig_dict = {"": torch.ao.quantization.get_default_qconfig("fbgemm")} + prepared_model = prepare_fx(model, qconfig_dict) + self._check_not_observed(prepared_model, node_info_to_non_tensor_args) + prepared_model(*args) + + def test_masked_fill_nontensor_args_not_observed(self): + def func(x, y, z): + return x.masked_fill(y, z) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), torch.randn(1176) > 0, 0.1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "masked_fill"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_permute_nontensor_args_not_observed(self): + def func(x, y, z): + return x.permute(y, z) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 0, 1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "permute"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_repeat_nontensor_args_not_observed(self): + def func(x, y, z): + return x.repeat(y, z) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 2, 1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "repeat"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_reshape_nontensor_args_not_observed(self): + def func(x, y, z): + return x.reshape(-1, y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 5, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_size_nontensor_args_not_observed(self): + def func(x, y, z): + return x.reshape((-1, x.size(y))) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 0, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "size"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_transpose_nontensor_args_not_observed(self): + def func(x, y, z): + return x.transpose(y, z) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 0, 1] + node_info_to_non_tensor_args = {NodeInfo("call_method", "transpose"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_torch_transpose_nontensor_args_not_observed(self): + # TODO: 
make torch.transpose traceable by fx when using + # variable nontensor arguments + # func = lambda x, y, z: torch.transpose(x, y, z) # error + def func(x, y, z): + return torch.transpose(x, 0, 1) + + model = self._NonReferenceTestModel(func, 5, 1) + node_info_to_non_tensor_args = { + NodeInfo("call_method", torch.transpose): [1, 2] + } + args = [torch.randn(5, 3, 32, 32), 0, 1] + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_unsqueeze_nontensor_args_not_observed(self): + def func(x, y, z): + return x.unsqueeze(y) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 1, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "unsqueeze"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_unsqueeze__nontensor_args_not_observed(self): + def func(x, y, z): + return x.unsqueeze_(y) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 1, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "unsqueeze_"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_torch_unsqueeze_nontensor_args_not_observed(self): + # TODO: make torch.unsqueeze scriptable by fx when using + # variable nontensor arguments + # func = lambda x, y, z: torch.unsqueeze(x, y) # error + def func(x, y, z): + return torch.unsqueeze(x, 1) + + model = self._NonReferenceTestModel(func, 1176, 1) + args = [torch.randn(5, 3, 32, 32), 1, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", torch.unsqueeze): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_view_nontensor_args_not_observed(self): + def func(x, y, z): + return x.view(-1, y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), 5, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "view"): [2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_list_args(self): + def func(x, y, z): + return x.reshape(y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), [-1, 5], None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_split_list_args(self): + def func(x, y, z): + return x.reshape([y, z]) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), -1, 5] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_tuple_args(self): + def func(x, y, z): + return x.reshape(y) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), (-1, 5), None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_split_tuple_args(self): + def func(x, y, z): + return x.reshape((y, z)) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), -1, 5] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_dict_args(self): + def 
func(x, y, z): + return x.transpose(y["first"], y["second"]) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), {"first": 0, "second": 1}, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "transpose"): [1, 2]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_dict_tuple_args(self): + class reshape_module(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y, z): + return x.reshape(y["shape"]) + + model = self._NonReferenceTestModel(reshape_module(), 5, 1) + args = [torch.randn(5, 3, 32, 32), {"shape": (-1, 5)}, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "reshape"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + + def test_propagate_dtypes_for_known_nodes_dict_split_tuple_args(self): + def func(x, y, z): + return x.reshape((y["first"], y["second"])) + + model = self._NonReferenceTestModel(func, 5, 1) + args = [torch.randn(5, 3, 32, 32), {"first": -1, "second": 5}, None] + node_info_to_non_tensor_args = {NodeInfo("call_method", "transpose"): [1]} + self._test_dtype_propagation(model, node_info_to_non_tensor_args, *args) + def test_assert_on_size_after_quant_layer(self): """ Verifies that calculating a size of a quantized tensor works @@ -2697,11 +3293,12 @@ def forward(self, x): m = convert_fx(m) keys = m.state_dict().keys() m(torch.randn(5, 5)) + # TODO: probably don't want to hardcode the attribute names, since they are generated for attr_name in [ "mods1_0_input_scale_0", "mods1_0_input_zero_point_0", "mods1_0_scale_0", "mods1_0_zero_point_0", "mods1_1_scale_0", "mods1_1_zero_point_0"]: - self.assertTrue(hasattr(m, attr_name)) + self.assertTrue(hasattr(m, attr_name), attr_name + " not found.") def test_no_obs_between_unmatched_node_and_copy_node(self): """ @@ -3033,7 +3630,6 @@ def forward(self, x): def test_preserve_tuple(self): """ Test tuple input type is preserved """ - from typing import List class LSTM(nn.Module): def __init__(self): @@ -3111,23 +3707,101 @@ def forward(self, x): x = self.relu(x) return x - model = M().eval() - dynamic_quantized_ops = { float16_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic_fp16, default_dynamic_qconfig: torch.ops.quantized.linear_relu_dynamic } - for config in [float16_dynamic_qconfig, default_dynamic_qconfig]: - qconfig = { - "": config + for qconfig in [float16_dynamic_qconfig, default_dynamic_qconfig]: + model = M().eval() + qconfig_dict = { + "": qconfig + } + m = prepare_fx(model, qconfig_dict) + m = convert_fx(m) + m(torch.rand(5, 5)) + node_list = [ + ns.call_module(nniqd.LinearReLU), + ns.call_module(nniqd.LinearReLU), + ns.call_function(dynamic_quantized_ops[qconfig]), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + @skipIfNoFBGEMM + def test_dynamic_with_fusion_multiple_uses(self): + """ + Tests that dynamic quantization APIs work with Linear + Relu fusion + """ + class LinearRelu(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 5) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.linear(x) + return self.relu(x) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear_relu = LinearRelu() + + def forward(self, x): + x = self.linear_relu(x) + x = self.linear_relu(x) + return x + + for qconfig in [float16_dynamic_qconfig, default_dynamic_qconfig]: + model = M().eval() + qconfig_dict = { + "": qconfig } - m = 
prepare_fx(model, qconfig) + m = prepare_fx(model, qconfig_dict) m = convert_fx(m) m(torch.rand(5, 5)) node_list = [ ns.call_module(nniqd.LinearReLU), ns.call_module(nniqd.LinearReLU), - ns.call_function(dynamic_quantized_ops[config]), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + @skipIfNoFBGEMM + def test_dynamic_linear_input_multiple_use(self): + """ + Tests input for dynamic linear being used by multiple ops + """ + class LinearRelu(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 5) + self.relu = torch.nn.ReLU() + + def forward(self, x): + x = self.linear(x) + return self.relu(x) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.mod1 = LinearRelu() + self.mod2 = LinearRelu() + + def forward(self, x): + y1 = self.mod1(x) + y2 = self.mod2(x) + return y1 + y2 + + for qconfig in [float16_dynamic_qconfig, default_dynamic_qconfig]: + model = M().eval() + qconfig_dict = { + "": qconfig + } + m = prepare_fx(model, qconfig_dict) + m = convert_fx(m) + m(torch.rand(5, 5, 5)) + node_list = [ + ns.call_module(nniqd.LinearReLU), + ns.call_module(nniqd.LinearReLU), ] self.checkGraphModuleNodes(m, expected_node_list=node_list) @@ -3379,6 +4053,7 @@ def forward(self, x): ns.call_function(torch.quantize_per_tensor): 1, ns.call_function(torch.ops.quantized.linear): 2, ns.call_function(torch.ops.quantized.add): 1, + ns.call_function(torch.mul): 1, ns.call_method("dequantize"): 1 } order_check = [ @@ -3387,6 +4062,7 @@ def forward(self, x): ns.call_function(torch.ops.quantized.linear), ns.call_function(torch.ops.quantized.add), ns.call_method("dequantize"), + ns.call_function(torch.mul), ns.call_module(nn.Linear), ] @@ -3400,19 +4076,6 @@ def forward(self, x): def _assertFixedQParamsFakeQuantizeEqual(self, fq1, fq2): self.assertEqual(fq1()._observer_ctr, fq2()._observer_ctr) - def test_fixed_qparams_patterns(self): - hard_sigmoid_keys = [torch.nn.Hardsigmoid, torch.nn.functional.hardsigmoid, "hardsigmoid", "hardsigmoid_"] - sigmoid_keys = [torch.nn.Sigmoid, torch.sigmoid, "sigmoid", "sigmoid_"] - tanh_keys = [torch.nn.Tanh, torch.tanh, "tanh", "tanh_"] - for k in hard_sigmoid_keys + sigmoid_keys: - self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP[k], default_affine_fixed_qparams_observer) - self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP[k], - default_affine_fixed_qparams_fake_quant) - for k in tanh_keys: - self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP[k], default_symmetric_fixed_qparams_observer) - self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP[k], - default_symmetric_fixed_qparams_fake_quant) - def test_register_patterns(self): @register_fusion_pattern("dummy_fusion") class DummyFusion(): @@ -3422,11 +4085,11 @@ class DummyFusion(): class DummyQuant(): pass - @register_quant_pattern("dummy_quant2", default_affine_fixed_qparams_observer) + @register_quant_pattern("dummy_quant2", default_fixed_qparams_range_0to1_observer) class DummyQuant2(): pass - @register_quant_pattern("dummy_quant3", default_symmetric_fixed_qparams_observer) + @register_quant_pattern("dummy_quant3", default_fixed_qparams_range_neg1to1_observer) class DummyQuant3(): pass @@ -3434,16 +4097,19 @@ class DummyQuant3(): self.assertEqual(DEFAULT_QUANTIZATION_PATTERNS["dummy_quant"], DummyQuant) self.assertEqual(DEFAULT_QUANTIZATION_PATTERNS["dummy_quant2"], DummyQuant2) self.assertEqual(DEFAULT_QUANTIZATION_PATTERNS["dummy_quant3"], DummyQuant3) - 
self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant2"], default_affine_fixed_qparams_observer) - self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant3"], default_symmetric_fixed_qparams_observer) + self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant2"], default_fixed_qparams_range_0to1_observer) + self.assertEqual(DEFAULT_OUTPUT_OBSERVER_MAP["dummy_quant3"], default_fixed_qparams_range_neg1to1_observer) self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP["dummy_quant2"], - default_affine_fixed_qparams_fake_quant) + default_fixed_qparams_range_0to1_fake_quant) self._assertFixedQParamsFakeQuantizeEqual(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP["dummy_quant3"], - default_symmetric_fixed_qparams_fake_quant) - self.assertTrue(get_default_output_activation_post_process_map(is_training=True) is - DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP) - self.assertTrue(get_default_output_activation_post_process_map(is_training=False) is - DEFAULT_OUTPUT_OBSERVER_MAP) + default_fixed_qparams_range_neg1to1_fake_quant) + output_fake_quantize_map = get_default_output_activation_post_process_map(is_training=True) + output_observer_map = get_default_output_activation_post_process_map(is_training=False) + self.assertEqual(output_observer_map.get("dummy_quant3"), default_fixed_qparams_range_neg1to1_observer) + self._assertFixedQParamsFakeQuantizeEqual(output_fake_quantize_map.get("dummy_quant3"), + default_fixed_qparams_range_neg1to1_fake_quant) + + def test_reuse_input_qconfig(self): class M1(torch.nn.Module): @@ -3532,23 +4198,132 @@ def forward(self, x): break self.assertTrue(found_stack_trace, f"stack trace not found, node: {n.format_node()}, is_reference: False") - def test_stack_trace_preserved_subgraph_rewriter(self): - # a functional relu is taking the subgraph rewriter code path + def test_qat_skip_untraced(self): + class UnTraceableModuleClass(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + + def forward(self, x): + return self.linear(x) + + class UnTraceableModuleName(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + + def forward(self, x): + return self.linear(x) + class M(nn.Module): + def __init__(self): + super().__init__() + self.untraceable_module_class = UnTraceableModuleClass() + self.untraceable_module_name = UnTraceableModuleClass() + def forward(self, x): - x = F.relu(x) + x = self.untraceable_module_class(x) + x = self.untraceable_module_name(x) return x - m = M().eval() - mp = prepare_fx(m, get_default_qconfig_dict()) - mq = convert_fx(copy.deepcopy(mp), is_reference=False) - found_stack_trace = False - for n in mq.graph.nodes: - if n.op == 'call_function' and n.target == F.relu: - found_stack_trace = n.stack_trace is not None - break - self.assertTrue(found_stack_trace, f"stack trace not found, node: {n.format_node()}, is_reference: True") + mod = M() + + qconfig_dict = {"": torch.quantization.get_default_qat_qconfig()} + prepare_custom_config_dict = { + "non_traceable_module_class": [UnTraceableModuleClass], + "non_traceable_module_name": ["untraceable_module_name"], + } + mod_prep = torch.ao.quantization.quantize_fx.prepare_qat_fx( + mod.train(), qconfig_dict, prepare_custom_config_dict + ) + mod_prep = torch.ao.quantization.quantize_fx.prepare_qat_fx( + mod.train(), qconfig_dict, prepare_custom_config_dict + ) + self.assertTrue( + isinstance(mod_prep.untraceable_module_class.linear, torch.nn.Linear) + ) + self.assertTrue( + isinstance(mod_prep.untraceable_module_name.linear, 
torch.nn.Linear) + ) + self.assertTrue( + type(mod_prep.untraceable_module_class.linear) + is not torch.nn.qat.modules.linear.Linear, + "prepare_qat_fx should not convert anything inside untraced module classes", + ) + self.assertTrue( + type(mod_prep.untraceable_module_name.linear) + is not torch.nn.qat.modules.linear.Linear, + "prepare_qat_fx should not convert anything inside modules named in untraced_module_names", + ) + + def test_qconfig_dict_setup(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.Conv1d = torch.nn.Conv1d(1, 1, 1) + self.Conv2d = torch.nn.Conv2d(1, 1, 1) + self.Conv3d = torch.nn.Conv3d(1, 1, 1) + self.ConvTranspose1d = torch.nn.ConvTranspose1d(1, 1, 1) + self.ConvTranspose2d = torch.nn.ConvTranspose2d(1, 1, 1) + self.ConvTranspose3d = torch.nn.ConvTranspose3d(1, 1, 1) + self.Linear = torch.nn.Linear(1, 1, 1) + + def forward(self, x): + x = self.Conv1d(x) + x = self.Conv2d(x) + x = self.Conv3d(x) + x = self.ConvTranspose1d(x) + x = self.ConvTranspose2d(x) + x = self.ConvTranspose3d(x) + x = self.Linear(x) + x = torch.nn.functional.conv1d(x, torch.rand(2, 2)) + x = torch.nn.functional.conv2d(x, torch.rand(2, 2)) + x = torch.nn.functional.conv3d(x, torch.rand(2, 2)) + x = torch.nn.functional.linear(x, torch.rand(2, 2)) + return x + + backends = ["qnnpack", "fbgemm"] + for func in [get_default_qconfig_dict, get_default_qat_qconfig_dict]: + for backend in backends: + m = M().eval() + qconfig_dict = func(backend) + m = prepare_fx(m, qconfig_dict) + for name, mod in m.named_modules(): + if is_activation_post_process(mod) and mod.dtype == torch.quint8: + if backend == "fbgemm": + lower_bnd = 0 + upper_bnd = 127 + else: + lower_bnd = 0 + upper_bnd = 255 + if issubclass(type(mod), FakeQuantize): + self.assertEqual(mod.activation_post_process.quant_min, lower_bnd) + self.assertEqual(mod.activation_post_process.quant_max, upper_bnd) + else: + self.assertEqual(mod.quant_min, lower_bnd) + self.assertEqual(mod.quant_max, upper_bnd) + + def test_prepare_mode(self): + class LinearModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 10) + + def forward(self, x): + return self.linear(x) + + def _test(prepare_fn, qconfig_dict): + m = LinearModel() + m1 = copy.deepcopy(m) + m1.train() + prepare_fn(m1, qconfig_dict) + m2 = copy.deepcopy(m) + m2.eval() + prepare_fn(m2, qconfig_dict) + # Ensure prepare_fx and prepare_qat_fx work in both training and eval modes + _test(prepare_fx, get_default_qconfig_dict()) + _test(prepare_qat_fx, get_default_qat_qconfig_dict()) @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): @@ -3590,41 +4365,64 @@ def setUp(self): """ @skipIfNoFBGEMM def test_linear_module(self): - class ModuleLinear(torch.nn.Module): - def __init__(self, has_relu=False, f_relu=False): - super(ModuleLinear, self).__init__() + class LinearModel(torch.nn.Module): + def __init__(self): + super(LinearModel, self).__init__() self.linear = torch.nn.Linear(30, 4).float() - if has_relu: - if f_relu: - self.relu = F.relu - else: - self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.linear(x) + + class LinearReLUModel(torch.nn.Module): + def __init__(self, f_relu=False): + super(LinearReLUModel, self).__init__() + self.linear = torch.nn.Linear(30, 4).float() + if f_relu: + self.relu = F.relu else: - self.relu = torch.nn.Identity() + self.relu = torch.nn.ReLU() def forward(self, x): - return self.relu(self.linear(x)) + x = self.linear(x) + x = self.relu(x) + return x + class
LinearBnModel(torch.nn.Module): + def __init__(self): + super(LinearBnModel, self).__init__() + self.linear = torch.nn.Linear(4, 4).float() + self.bn = torch.nn.BatchNorm1d(4) + + def forward(self, x): + x = self.linear(x) + x = self.bn(x) + return x + + # Test linear data = (torch.rand((1, 30), dtype=torch.float),) - options = itertools.product( - [ModuleLinear(has_relu=False)], - self.all_quant_types) - quantized_nodes = { - # quant_type: - QuantType.DYNAMIC: ns.call_module(nnqd.Linear), - QuantType.STATIC: ns.call_module(nnq.Linear), - # note that we are checking the final result - QuantType.QAT: ns.call_module(nnq.Linear), - } - for model, quant_type in options: - self.checkGraphModeFxOp( - model, data, quant_type, quantized_nodes[quant_type]) + for quant_type in self.all_quant_types: + model = LinearModel() + quantized_module = nnqd.Linear if quant_type == QuantType.DYNAMIC else nnq.Linear + quantized_node = ns.call_module(quantized_module) + result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) + if quant_type in self.static_quant_types: + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + # TODO: enable test for dynamic quant + # Test linear-relu for f_relu, quant_type in itertools.product([True, False], [QuantType.STATIC, QuantType.QAT]): - for model, quantized_node in [ - (ModuleLinear(has_relu=True, f_relu=f_relu), ns.call_module(nniq.LinearReLU))]: - result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) - self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + model = LinearReLUModel(f_relu) + quantized_node = ns.call_module(nniq.LinearReLU) + result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + + # Test linear-bn + data = (torch.rand((4, 4), dtype=torch.float),) + for quant_type in self.static_quant_types: + model = LinearBnModel() + quantized_node = ns.call_module(nnq.Linear) + result_dict = self.checkGraphModeFxOp(model, data, quant_type, quantized_node) + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) @skipIfNoFBGEMM def test_functional_linear(self): @@ -3636,18 +4434,18 @@ def __init__(self, use_bias, has_relu, f_relu): self.use_bias = use_bias if has_relu: if f_relu: - self.relu = F.relu + self.relu_or_id = F.relu else: - self.relu = torch.nn.ReLU() + self.relu_or_id = torch.nn.ReLU() else: - self.relu = torch.nn.Identity() + self.relu_or_id = torch.nn.Identity() def forward(self, x): if self.use_bias: x = F.linear(x, self.w, self.b) else: x = F.linear(x, self.w) - x = self.relu(x) + x = self.relu_or_id(x) return x data = (torch.rand((1, 30), dtype=torch.float),) @@ -3675,7 +4473,10 @@ def forward(self, x): # it is a copy node, that's why we have extra observer/fake_quant # when has_relu is False quant_type_to_prepare_expected_node_occurrence = { - QuantType.DYNAMIC: {}, + QuantType.DYNAMIC: { + ns.call_module(torch.ao.quantization.PlaceholderObserver): 1, + ns.call_module(torch.ao.quantization.MinMaxObserver): 1, + }, # There should be 3 observers: after input, weight and activation. 
# one more observer for torch.nn.Identity when there is no relu QuantType.STATIC: { @@ -3693,17 +4494,29 @@ def forward(self, x): else: qlinear_fun = quant_type_to_qlinear_fun[quant_type] + if quant_type != QuantType.DYNAMIC: + num_dequantize = 1 + else: + # we will have an extra quantize_per_tensor_dynamic + dequantize for + # nn.Identity right now, but it will be fixed after we use + # backend_config_dict to configure the default pt backend + num_dequantize = int(not has_relu) + convert_node_occurrence = { ns.call_function(torch.quantize_per_tensor): 1 if quant_type != QuantType.DYNAMIC else 0, qlinear_fun: 1, - ns.call_method("dequantize"): 1 if quant_type != QuantType.DYNAMIC else 0 + ns.call_method("dequantize"): num_dequantize if quant_type != QuantType.DYNAMIC else 0, } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] - self.checkGraphModeFxOp( + result_dict = self.checkGraphModeFxOp( model, data, quant_type, qlinear_fun, prepare_expected_node_occurrence=prepare_expected_node_occurrence, expected_node_occurrence=convert_node_occurrence) + if quant_type != QuantType.DYNAMIC: + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + # Ensure packed weights in lowered models are folded + self.assertIn("_packed_weight_0", result_dict["quantized"].state_dict().keys()) def test_linear_dynamic_fp16(self): class FuncLinear(torch.nn.Module): @@ -3745,8 +4558,8 @@ def forward(self, x): else: qlinear_fun = ns.call_function(torch.ops.quantized.linear_dynamic_fp16) prepare_node_occurrence = { - # weight - ns.call_module(torch.ao.quantization.PlaceholderObserver): 1 + # activation and weight + ns.call_module(torch.ao.quantization.PlaceholderObserver): 2 } convert_node_occurrence = { qlinear_fun: 1, @@ -3760,6 +4573,7 @@ def forward(self, x): prepare_expected_node_occurrence=prepare_node_occurrence, expected_node_occurrence=convert_node_occurrence) + # TODO: maybe remove this support def test_linear_static_fp16(self): class FuncLinear(torch.nn.Module): def __init__(self, use_bias, has_relu, f_relu): @@ -3943,10 +4757,14 @@ def forward(self, x): } prepare_expected_node_occurrence = \ quant_type_to_prepare_expected_node_occurrence[quant_type] - self.checkGraphModeFxOp( + result_dict = self.checkGraphModeFxOp( model, data, quant_type, qconv_fun, prepare_expected_node_occurrence=prepare_expected_node_occurrence, expected_node_occurrence=convert_node_occurrence) + if quant_type != QuantType.DYNAMIC: + self.assertEqual(result_dict["quantized_output"], result_dict["quantized_reference_output"]) + # Ensure packed weights in lowered models are folded + self.assertIn("_packed_weight_0", result_dict["quantized"].state_dict().keys()) @skipIfNoFBGEMM def test_quantized_conv_relu(self): @@ -4096,10 +4914,12 @@ def test_add(self): self._test_binary_op_float16_impl( operator.add, operator.iadd) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_sub(self): self._test_binary_op_float16_impl(operator.sub, operator.isub) self._test_binary_op_float16_impl(torch.sub, None) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_div(self): self._test_binary_op_float16_impl(operator.truediv, operator.itruediv) self._test_binary_op_float16_impl(torch.div, None) @@ -4110,6 +4930,7 @@ def test_mul(self): operator.mul, operator.imul, torch.ops.quantized.mul) self._test_binary_op_float16_impl(operator.mul, operator.imul) + @unittest.skip("This is no longer 
needed right now, can enable later with new api") def test_sum(self): class Sum(torch.nn.Module): def forward(self, x): @@ -4133,6 +4954,7 @@ def forward(self, x): expected_node_occurrence=node_occurrence, custom_qconfig_dict=custom_qconfig_dict) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_bmm(self): class BMMMethod(torch.nn.Module): def __init__(self): @@ -4174,6 +4996,39 @@ def test_add_relu(self): self._test_binary_op_relu_float16_impl( operator.add, operator.iadd) + @skipIfNoFBGEMM + def test_add_relu_multiple_uses_of_relu(self): + class Sub(torch.nn.Module): + def __init__(self): + super().__init__() + self.relu = torch.nn.ReLU(inplace=True) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.sub = Sub() + + def forward(self, x, y): + x = x + y + x = self.sub.relu(x) + x = x + y + x = self.sub.relu(x) + return x + + m = M().eval() + m = prepare_fx(m, {"": default_qconfig}) + m = convert_fx(m) + node_occurrence = { + ns.call_function(torch.quantize_per_tensor): 2, + ns.call_function(torch.ops.quantized.add_relu): 2, + ns.call_method("dequantize"): 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence) + # check the model is scriptable + m = torch.jit.script(m) + # check the model is runnable + m(torch.randn(3), torch.randn(3)) + @skipIfNoFBGEMM def test_mul_relu(self): self._test_binary_op_relu_int8_impl( @@ -4206,7 +5061,7 @@ def forward(self, x): m = M() expected_node_occurrence = { - ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 6, + ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 5, } self._test_quantized_add_mul_qat(m, expected_node_occurrence) @@ -4222,14 +5077,13 @@ def forward(self, x): x = torch.mul(x, 1.0) x = self.conv1(x) x = torch.mul(x, 1.0) - # TODO: add support for add + torch.relu? 
x = torch.relu(x) x = self.conv2(x) return x m = M() expected_node_occurrence = { - ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 6, + ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 5, } self._test_quantized_add_mul_qat(m, expected_node_occurrence) @@ -4253,7 +5107,7 @@ def forward(self, x): m, {'': torch.ao.quantization.get_default_qat_qconfig('fbgemm')}, prepare_custom_config_dict={"input_quantized_idxs": [0]}) expected_node_occurrence = { - ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 0, + ns.call_module(torch.ao.quantization.FusedMovingAvgObsFakeQuantize): 1, } self.checkGraphModuleNodes( mp, expected_node_occurrence=expected_node_occurrence) @@ -4531,9 +5385,6 @@ def test_layer_norm(self): self._test_norm_impl( nn.LayerNorm, F.layer_norm, [[2, 5, 5]], data, nnq.LayerNorm, torch.ops.quantized.layer_norm) - self._test_norm_float16_impl( - nn.LayerNorm, F.layer_norm, [[2, 5, 5]], data) - def test_instance_norm(self): data_1d = (torch.rand((1, 4, 5), dtype=torch.float),) data_2d = (torch.rand((1, 4, 5, 1), dtype=torch.float),) @@ -4625,6 +5476,7 @@ def forward(self, x): self.checkGraphModuleNodes(m_quant, expected_node_list=node_list) + @unittest.skip("TODO: reenable with backend_config_dict api") def test_gelu_normal(self): module = torch.nn.GELU functional = torch.nn.functional.gelu @@ -4637,18 +5489,20 @@ def test_gelu_normal(self): self._test_default_node_quant_handler_ops( module, functional, qconfig, is_reference, node_list) + @unittest.skip("TODO: reenable with backend_config_dict api") def test_softmax_normal(self): module = torch.nn.Softmax functional = torch.nn.functional.softmax qconfig = torch.ao.quantization.get_default_qconfig("fbgemm") is_reference = False node_list = [ - ns.call_module(module), + ns.call_module(torch.nn.quantized.Softmax), ns.call_function(functional), ] self._test_default_node_quant_handler_ops( module, functional, qconfig, is_reference, node_list) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_gelu_reference(self): module = torch.nn.GELU functional = torch.nn.functional.gelu @@ -4664,6 +5518,7 @@ def test_gelu_reference(self): ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize') ] + # TODO: change these to use backend_config_dict additional_patterns = {torch.nn.GELU: DefaultNodeQuantizeHandler, torch.nn.functional.gelu: DefaultNodeQuantizeHandler} self._test_default_node_quant_handler_ops( @@ -4672,6 +5527,7 @@ def test_gelu_reference(self): self._test_default_node_quant_handler_ops(module, functional, self.custom_qconfig, is_reference, node_list, additional_quant_pattern_dict=self.common_quant_patterns) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_softmax_reference(self): module = torch.nn.Softmax functional = torch.nn.functional.softmax @@ -4695,6 +5551,7 @@ def test_softmax_reference(self): self._test_default_node_quant_handler_ops(module, functional, self.custom_qconfig, is_reference, node_list, additional_quant_pattern_dict=self.common_quant_patterns) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_silu_reference(self): module = torch.nn.SiLU functional = torch.nn.functional.silu @@ -4726,6 +5583,7 @@ def test_silu_reference(self): self._test_default_node_quant_handler_ops(module, functional, self.custom_qconfig, is_reference, node_list, additional_quant_pattern_dict=self.common_quant_patterns) + @unittest.skip("This 
is no longer needed right now, can enable later with new api") def test_mish_reference(self): module = torch.nn.Mish functional = torch.nn.functional.mish @@ -4868,7 +5726,7 @@ def forward(self, x): data = (torch.randn((2, 2, 2, 2), dtype=torch.float),) quant_type = QuantType.STATIC qconfig = torch.ao.quantization.QConfig( - activation=HistogramObserver.with_args(qscheme=torch.per_tensor_symmetric, dtype=torch.qint8), + activation=HistogramObserver.with_args(qscheme=torch.per_tensor_symmetric, dtype=torch.quint8), weight=default_weight_observer) qconfig_dict = {"": qconfig} node_occurrence = { @@ -4979,7 +5837,7 @@ def forward(self, x): # observers and also successfully fused two quantized::conv2d # patterns # one quantize_per_tensor for input - # check exact counts of quantize and dequantiz + # check exact counts of quantize and dequantize count_check = { # input of conv and two outputs of getitem ns.call_function(torch.quantize_per_tensor) : 2, @@ -5006,6 +5864,47 @@ def forward(self, x): quantized = convert_fx(prepared, is_reference=True) + + @skipIfNoFBGEMM + def test_ave_pool_with_custom_cfg(self): + """ A test that checks correct patterns are produced for + avg_pool2d with customized config + """ + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.avg_pool2d = torch.nn.AvgPool2d(3) + + + def forward(self, x): + x = self.avg_pool2d(x) + return x + + # The model contains a single AvgPool2d and its input is declared + # as already quantized via input_quantized_idxs + m = M().eval() + # nothing to fuse so skipping the fuse step + qconfig_dict = {'': default_qconfig} + prepared = prepare_fx(m, qconfig_dict, prepare_custom_config_dict={"input_quantized_idxs": [0]}) + + # not runnable + quantized = convert_fx(prepared) + + # This checks that the quantized input is propagated through avg_pool2d + # to the end, so that we don't insert extra + # observers + # check exact counts of quantize and dequantize + count_check = { + ns.call_method('dequantize') : 1 + } + order_check = [ + ns.call_module(nn.AvgPool2d), + ns.call_method('dequantize'), + ] + self.checkGraphModuleNodes( + quantized, + expected_node_occurrence=count_check, + expected_node_list=order_check) + @skipIfNoFBGEMM def test_general_value_ops(self): """ A test that checks correct patterns are produced for @@ -5074,6 +5973,21 @@ def forward(self, x): expected_node_occurrence=count_check, expected_node_list=order_check) + def test_copy_node_fp32_input(self): + """ CopyNode works for both fp32 and int8 inputs; this is a test to make + sure that a CopyNode can be successfully quantized in both cases + """ + class M(torch.nn.Module): + def forward(self, x): + x = x.relu() + return x + + m = M().eval() + m = prepare_fx(m, {"": default_reuse_input_qconfig}) + m = convert_fx(m) + # make sure it runs + m(torch.rand(1)) + def test_getitem(self): """ Make sure we only insert observer for getitem if the following node is matched or needs to be quantized @@ -5139,6 +6053,7 @@ def __init__(self): self.sigmoid = torch.nn.Sigmoid() self.hardsigmoid = torch.nn.Hardsigmoid() self.tanh = torch.nn.Tanh() + self.softmax = torch.nn.Softmax(dim=0) def forward(self, x): x = self.conv(x) @@ -5146,7 +6061,6 @@ def forward(self, x): x = self.sigmoid(x) x = torch.sigmoid(x) x = x.sigmoid() - x.sigmoid_() x = self.hardsigmoid(x) x = F.hardsigmoid(x) x = F.hardsigmoid(x, inplace=True) @@ -5154,7 +6068,8 @@ def forward(self, x): # F.tanh is deprecated x = torch.tanh(x) x = x.tanh() - x.tanh_() + # TODO(future PR): handle F.softmax + x = self.softmax(x) return x 
for eval_mode in [True, False]: @@ -5165,12 +6080,12 @@ def forward(self, x): m.eval() qconfig = default_qconfig prepare = prepare_fx - fq_count = 11 + fq_count = 10 else: m.train() qconfig = default_qat_qconfig prepare = prepare_qat_fx - fq_count = 11 + fq_count = 10 # nothing to fuse so skipping the fuse step m_copy = copy.deepcopy(m) @@ -5205,6 +6120,7 @@ def forward(self, x): ns.call_function(torch.quantize_per_tensor), ns.call_module(nnq.Conv2d), ns.call_module(nn.Sigmoid), + ns.call_module(nnq.Softmax), ns.call_method('dequantize'), ] self.checkGraphModuleNodes( @@ -5213,8 +6129,8 @@ def forward(self, x): expected_node_list=order_check) reference_count_check = { - ns.call_function(torch.quantize_per_tensor) : 13, - ns.call_method('dequantize') : 11 + ns.call_function(torch.quantize_per_tensor) : 12, + ns.call_method('dequantize') : 12 } reference_order_check = [ ns.call_function(torch.quantize_per_tensor), @@ -5225,12 +6141,18 @@ def forward(self, x): ns.call_module(nn.Sigmoid), ns.call_function(torch.quantize_per_tensor), ns.call_method('dequantize'), + ns.call_module(nn.Softmax), + ns.call_function(torch.quantize_per_tensor), + ns.call_method('dequantize'), ] self.checkGraphModuleNodes( quantized_reference, expected_node_occurrence=reference_count_check, expected_node_list=reference_order_check) + # Verify that softmax scale and zero_point are correct + self.assertTrue(quantized.softmax.scale - (1.0 / 256) <= 1e-8) + self.assertTrue(quantized.softmax.zero_point == 0) def test_float_functional(self): class TorchAdd(nn.Module): @@ -5627,6 +6549,7 @@ def forward(self, x): m, expected_node_occurrence=expected_occurrence) + @unittest.skip("This is no longer needed right now, can enable later with new api") def test_qmatmul(self): class M(torch.nn.Module): def forward(self, x, y): @@ -5634,7 +6557,7 @@ def forward(self, x, y): return z m = M().eval() - qconfig_dict = {"": torch.quantization.default_qconfig} + qconfig_dict = {"": torch.ao.quantization.default_qconfig} mp = prepare_fx(m, qconfig_dict) mp(torch.randn(2, 2), torch.randn(2, 2)) mq = convert_fx(mp) @@ -6025,15 +6948,7 @@ def forward(self, input: torch.Tensor, offsets: Optional[torch.Tensor] = None, model = EmbeddingBagLinear().train() prepared_fx_model = prepare_qat_fx(model, qconfig_dict) test_only_train_fn(prepared_fx_model, train_indices) - convert_custom_config_dict = { - "additional_object_mapping": { - "static": { - torch.nn.qat.EmbeddingBag: nn.quantized.EmbeddingBag, - } - } - } quant_model = convert_fx(prepared_fx_model, - convert_custom_config_dict=convert_custom_config_dict, qconfig_dict=qconfig_dict) def checkQuantized(model): @@ -6073,15 +6988,7 @@ def forward(self, input: torch.Tensor): model = EmbeddingLinear().train() prepared_fx_model = prepare_qat_fx(model, qconfig_dict) test_only_train_fn(prepared_fx_model, train_indices) - convert_custom_config_dict = { - "additional_object_mapping": { - "static": { - torch.nn.qat.Embedding: nn.quantized.Embedding, - } - } - } quant_model = convert_fx(prepared_fx_model, - convert_custom_config_dict=convert_custom_config_dict, qconfig_dict=qconfig_dict) def checkQuantized(model): diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py index 025b031bb8d8..6648bcaa9afc 100644 --- a/test/quantization/jit/test_quantize_jit.py +++ b/test/quantization/jit/test_quantize_jit.py @@ -1416,6 +1416,38 @@ def forward(self, x, y, z): str(get_forward_graph(m.conv3d._c)) ) + def test_convtranspose_trace(self): + class M(torch.nn.Module): + def 
__init__(self): + super(M, self).__init__() + self.convtranspose1d = torch.nn.ConvTranspose1d(3, 3, 3).float() + self.convtranspose2d = torch.nn.ConvTranspose2d(3, 3, 3).float() + self.convtranspose3d = torch.nn.ConvTranspose3d(3, 3, 3).float() + + def forward(self, x, y, z): + a = self.convtranspose1d(x) + b = self.convtranspose2d(y) + c = self.convtranspose3d(z) + return (a, b, c) + + qconfig_dict = {"": default_qconfig} + inputs = ( + torch.rand((1, 3, 10), dtype=torch.float), + torch.rand((1, 3, 10, 10), dtype=torch.float), + torch.rand((1, 3, 10, 10, 10), dtype=torch.float), + ) + model = torch.jit.trace(M(), inputs).eval() + m = prepare_jit(model, qconfig_dict) + FileCheck().check("aten::conv_transpose1d").check_not("aten::_convolution").run( + str(get_forward_graph(m.convtranspose1d._c)) + ) + FileCheck().check("aten::conv_transpose2d").check_not("aten::_convolution").run( + str(get_forward_graph(m.convtranspose2d._c)) + ) + FileCheck().check("aten::conv_transpose3d").check_not("aten::_convolution").run( + str(get_forward_graph(m.convtranspose3d._c)) + ) + @unittest.skipUnless( "fbgemm" in torch.backends.quantized.supported_engines, " Quantized operations require FBGEMM. FBGEMM is only optimized for CPUs" diff --git a/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt b/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt index bb34a57f962a..6887e8c614a5 100644 Binary files a/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt and b/test/quantization/serialized/TestSerialization.test_linear_relu_package_quantization_transforms.get_attr_targets.pt differ diff --git a/test/run_test.py b/test/run_test.py index 2b772bc6f368..c0ad0a55a02b 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -22,6 +22,7 @@ TEST_WITH_ROCM, shell, set_cwd, + parser as common_parser, ) import torch.distributed as dist from typing import Dict, Optional, List @@ -92,6 +93,7 @@ def skip_test_p(name: str) -> bool: 'onnx', 'package', # executed by test_package.py 'quantization', # executed by test_quantization.py + 'autograd', # executed by test_autograd.py ], blocklisted_tests=[ 'test_bundled_images', @@ -103,7 +105,6 @@ def skip_test_p(name: str) -> bool: 'test_kernel_launch_checks', 'test_metal', 'test_nnapi', - 'test_functionalization', 'test_segment_reductions', 'test_static_runtime', 'test_throughput_benchmark', @@ -132,6 +133,7 @@ def skip_test_p(name: str) -> bool: "distributed/elastic/utils/util_test", "distributed/elastic/utils/distributed_test", "distributed/elastic/multiprocessing/api_test", + "test_deploy", ] ) @@ -167,6 +169,7 @@ def skip_test_p(name: str) -> bool: "test_typing", "distributed/elastic/events/lib_test", "distributed/elastic/agent/server/test/api_test", + "test_deploy", ] WINDOWS_BLOCKLIST = [ @@ -199,17 +202,25 @@ def skip_test_p(name: str) -> bool: "distributed/elastic/agent/server/test/api_test", "distributed/elastic/multiprocessing/api_test", "distributed/_shard/sharding_spec/test_sharding_spec", + "distributed/_shard/sharding_plan/test_sharding_plan", "distributed/_shard/sharded_tensor/test_megatron_prototype", "distributed/_shard/sharded_tensor/test_sharded_tensor", "distributed/_shard/sharded_tensor/test_sharded_tensor_reshard", - "distributed/_shard/sharded_tensor/test_partial_tensor", + "distributed/_shard/sharded_tensor/ops/test_chunk", 
"distributed/_shard/sharded_tensor/ops/test_elementwise_ops", "distributed/_shard/sharded_tensor/ops/test_embedding", "distributed/_shard/sharded_tensor/ops/test_embedding_bag", "distributed/_shard/sharded_tensor/ops/test_binary_cmp", "distributed/_shard/sharded_tensor/ops/test_init", "distributed/_shard/sharded_tensor/ops/test_linear", + "distributed/_shard/sharded_tensor/ops/test_math_ops", + "distributed/_shard/sharded_tensor/ops/test_matrix_ops", + "distributed/_shard/sharded_tensor/ops/test_softmax", + "distributed/_shard/sharded_tensor/ops/test_tensor_ops", + "distributed/_shard/sharding_spec/test_sharding_spec", "distributed/_shard/sharded_optim/test_sharded_optim", + "distributed/_shard/test_partial_tensor", + "distributed/_shard/test_replicated_tensor", ] + FSDP_TEST ROCM_BLOCKLIST = [ @@ -217,23 +228,31 @@ def skip_test_p(name: str) -> bool: "distributed/rpc/test_faulty_agent", "distributed/rpc/test_tensorpipe_agent", "distributed/rpc/cuda/test_tensorpipe_agent", + "distributed/_shard/sharding_spec/test_sharding_spec", + "distributed/_shard/sharding_plan/test_sharding_plan", "distributed/_shard/sharded_tensor/test_megatron_prototype", "distributed/_shard/sharded_tensor/test_sharded_tensor", "distributed/_shard/sharded_tensor/test_sharded_tensor_reshard", - "distributed/_shard/sharded_tensor/test_partial_tensor", + "distributed/_shard/sharded_tensor/ops/test_chunk", "distributed/_shard/sharded_tensor/ops/test_elementwise_ops", "distributed/_shard/sharded_tensor/ops/test_embedding", "distributed/_shard/sharded_tensor/ops/test_embedding_bag", "distributed/_shard/sharded_tensor/ops/test_binary_cmp", "distributed/_shard/sharded_tensor/ops/test_init", "distributed/_shard/sharded_tensor/ops/test_linear", + "distributed/_shard/sharded_tensor/ops/test_math_ops", + "distributed/_shard/sharded_tensor/ops/test_matrix_ops", + "distributed/_shard/sharded_tensor/ops/test_softmax", + "distributed/_shard/sharded_tensor/ops/test_tensor_ops", + "distributed/_shard/sharding_spec/test_sharding_spec", "distributed/_shard/sharded_optim/test_sharded_optim", + "distributed/_shard/test_partial_tensor", + "distributed/_shard/test_replicated_tensor", "test_determination", - "test_multiprocessing", "test_jit_legacy", "test_type_hints", "test_openmp", -] + FSDP_TEST +] RUN_PARALLEL_BLOCKLIST = [ "test_cpp_extensions_jit", @@ -256,6 +275,8 @@ def skip_test_p(name: str) -> bool: "test_modules", "test_nn", "test_ops", + "test_ops_gradients", + "test_ops_jit", "test_torch" ] @@ -305,73 +326,17 @@ def skip_test_p(name: str) -> bool: ) JIT_EXECUTOR_TESTS = [ - "test_jit_cuda_fuser", "test_jit_profiling", "test_jit_legacy", "test_jit_fuser_legacy", ] -DISTRIBUTED_TESTS = [ - "distributed/test_data_parallel", - "distributed/test_launcher", - "distributed/nn/jit/test_instantiator", - "distributed/rpc/test_faulty_agent", - "distributed/rpc/test_tensorpipe_agent", - "distributed/rpc/cuda/test_tensorpipe_agent", - "distributed/test_c10d_common", - "distributed/test_c10d_gloo", - "distributed/test_c10d_nccl", - "distributed/test_c10d_spawn_gloo", - "distributed/test_c10d_spawn_nccl", - "distributed/test_store", - "distributed/test_pg_wrapper", - "distributed/algorithms/test_join", - "distributed/test_distributed_spawn", - "distributed/pipeline/sync/skip/test_api", - "distributed/pipeline/sync/skip/test_gpipe", - "distributed/pipeline/sync/skip/test_inspect_skip_layout", - "distributed/pipeline/sync/skip/test_leak", - "distributed/pipeline/sync/skip/test_portal", - "distributed/pipeline/sync/skip/test_stash_pop", - 
"distributed/pipeline/sync/skip/test_tracker", - "distributed/pipeline/sync/skip/test_verify_skippables", - "distributed/pipeline/sync/test_balance", - "distributed/pipeline/sync/test_bugs", - "distributed/pipeline/sync/test_checkpoint", - "distributed/pipeline/sync/test_copy", - "distributed/pipeline/sync/test_deferred_batch_norm", - "distributed/pipeline/sync/test_dependency", - "distributed/pipeline/sync/test_inplace", - "distributed/pipeline/sync/test_microbatch", - "distributed/pipeline/sync/test_phony", - "distributed/pipeline/sync/test_pipe", - "distributed/pipeline/sync/test_pipeline", - "distributed/pipeline/sync/test_stream", - "distributed/pipeline/sync/test_transparency", - "distributed/pipeline/sync/test_worker", - "distributed/optim/test_zero_redundancy_optimizer", - "distributed/elastic/timer/api_test", - "distributed/elastic/timer/local_timer_example", - "distributed/elastic/timer/local_timer_test", - "distributed/elastic/events/lib_test", - "distributed/elastic/metrics/api_test", - "distributed/elastic/utils/logging_test", - "distributed/elastic/utils/util_test", - "distributed/elastic/utils/distributed_test", - "distributed/elastic/multiprocessing/api_test", - "distributed/_shard/sharding_spec/test_sharding_spec", - "distributed/_shard/sharded_tensor/test_megatron_prototype", - "distributed/_shard/sharded_tensor/test_sharded_tensor", - "distributed/_shard/sharded_tensor/test_sharded_tensor_reshard", - "distributed/_shard/sharded_tensor/test_partial_tensor", - "distributed/_shard/sharded_tensor/ops/test_elementwise_ops", - "distributed/_shard/sharded_tensor/ops/test_embedding", - "distributed/_shard/sharded_tensor/ops/test_embedding_bag", - "distributed/_shard/sharded_tensor/ops/test_binary_cmp", - "distributed/_shard/sharded_tensor/ops/test_init", - "distributed/_shard/sharded_tensor/ops/test_linear", - "distributed/_shard/sharded_optim/test_sharded_optim", -] + [test for test in TESTS if test.startswith("distributed/fsdp")] +DISTRIBUTED_TESTS = [test for test in TESTS if test.startswith("distributed")] + +TESTS_REQUIRING_LAPACK = [ + "distributions/test_constraints", + "distributions/test_distributions", +] # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when # options.run_specified_test_cases is enabled. @@ -577,6 +542,7 @@ def test_distributed(test_module, test_directory, options): backend, with_init ) ) + old_environ = dict(os.environ) os.environ["TEMP_DIR"] = tmp_dir os.environ["BACKEND"] = backend os.environ["INIT_METHOD"] = "env://" @@ -627,6 +593,8 @@ def test_distributed(test_module, test_directory, options): return return_code finally: shutil.rmtree(tmp_dir) + os.environ.clear() + os.environ.update(old_environ) return 0 @@ -664,6 +632,7 @@ def parse_args(): description="Run the PyTorch unit test suite", epilog="where TESTS is any of: {}".format(", ".join(TESTS)), formatter_class=argparse.RawTextHelpFormatter, + parents=[common_parser] ) parser.add_argument( "-v", @@ -816,6 +785,11 @@ def parse_args(): " within a specified test module. For unspecified test modules with the bring-to-front " "option, all test cases will be run, as one may expect.", ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Only list the test that will run.", + ) return parser.parse_args() @@ -916,6 +890,10 @@ def get_selected_tests(options): if options.exclude_distributed_tests: options.exclude.extend(DISTRIBUTED_TESTS) + # these tests failing in CUDA 11.6 temporary disabling. 
issue https://github.com/pytorch/pytorch/issues/75375 + if torch.version.cuda is not None and LooseVersion(torch.version.cuda) == "11.6": + options.exclude.extend(["distributions/test_constraints"]) + selected_tests = exclude_tests(options.exclude, selected_tests) if sys.platform == "win32" and not options.ignore_win_blocklist: @@ -961,6 +939,11 @@ def get_selected_tests(options): selected_tests = exclude_tests(DISTRIBUTED_TESTS, selected_tests, "PyTorch is built without distributed support.") + # skip tests that require LAPACK when it's not available + if not torch._C.has_lapack: + selected_tests = exclude_tests(TESTS_REQUIRING_LAPACK, selected_tests, + "PyTorch is built without LAPACK support.") + return selected_tests @@ -1012,7 +995,10 @@ def main(): selected_tests = get_selected_tests(options) if options.verbose: - print_to_stderr("Selected tests: {}".format(", ".join(selected_tests))) + print_to_stderr("Selected tests:\n {}".format("\n ".join(selected_tests))) + + if options.dry_run: + return if options.coverage and not PYTORCH_COLLECT_COVERAGE: shell(["coverage", "erase"]) diff --git a/test/test_ao_sparsity.py b/test/test_ao_sparsity.py index 32b95973928e..6b5c8574c2e6 100644 --- a/test/test_ao_sparsity.py +++ b/test/test_ao_sparsity.py @@ -20,5 +20,8 @@ # Scheduler from ao.sparsity.test_scheduler import TestScheduler # noqa: F401 +# Composability +from ao.sparsity.test_composability import TestComposability # noqa: F401 + if __name__ == '__main__': run_tests() diff --git a/test/test_autocast.py b/test/test_autocast.py index aed0c3496223..bfbe46d08b89 100644 --- a/test/test_autocast.py +++ b/test/test_autocast.py @@ -104,8 +104,9 @@ def test_autocast_torch_bf16(self): self._run_autocast_outofplace(op, args, torch.bfloat16, add_kwargs=maybe_kwargs) def test_autocast_nn_bf16(self): - for op, args in self.autocast_lists.nn_bf16: - self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn) + for op_with_args in self.autocast_lists.nn_bf16: + op, args, maybe_kwargs = self.args_maybe_kwargs(op_with_args) + self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn, add_kwargs=maybe_kwargs) def test_autocast_torch_fp32(self): for op_with_args in self.autocast_lists.torch_fp32: diff --git a/test/test_autograd.py b/test/test_autograd.py index 2fe584ab0b68..7abf53148190 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -14,11 +14,12 @@ import uuid import warnings import operator +import subprocess from copy import deepcopy from collections import OrderedDict from itertools import product from operator import mul -from functools import reduce +from functools import reduce, partial import torch from torch import nn @@ -26,21 +27,21 @@ from torch.autograd.function import once_differentiable from torch.autograd.profiler import (profile, record_function, emit_nvtx) from torch.autograd.profiler_util import (_format_time, EventList, FunctionEvent, FunctionEventAvg) -import torch.autograd.functional as autogradF from torch.utils.checkpoint import checkpoint from torch.testing import make_tensor from torch.testing._internal.common_cuda import TEST_CUDA from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoLapack, slowTest, IS_WINDOWS, IS_MACOS, - disable_gc, gradcheck, gradgradcheck, parametrize, instantiate_parametrized_tests) + disable_gc, gradcheck, gradgradcheck, parametrize, + instantiate_parametrized_tests, skipIfMps) from torch.autograd import Variable, Function, detect_anomaly, kineto_available from 
torch.autograd.function import InplaceFunction import torch.autograd.forward_ad as fwAD from torch.testing._internal.common_methods_invocations import mask_not_all_zeros from torch.testing._internal.common_device_type import (instantiate_device_type_tests, skipCUDAIfRocm, onlyCPU, onlyCUDA, dtypes, dtypesIfCUDA, - deviceCountAtLeast, skipMeta) -from torch.testing._internal.common_dtype import get_all_dtypes + deviceCountAtLeast, skipMeta, dtypesIfMPS) +from torch.testing._internal.common_dtype import floating_types_and from torch.testing._internal.logging_tensor import no_dispatch import pickle @@ -389,8 +390,8 @@ def test_not_implemented_fwad(self): hint_msg = "Running forward AD for an OP that does not implement it should raise a NotImplementedError" with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): - # if forward AD ends up being implemented for torch.atan2, choose a different op - torch.atan2(dual_x, dual_x) + # if forward AD ends up being implemented for torch.igamma, choose a different op + torch.igamma(dual_x, dual_x) def test_accumulate_grad(self): grad_output = torch.ones(5, 5) @@ -2820,7 +2821,7 @@ def test_profiler(self): for evt in p.function_events: if evt.name in names: found_indices.add(names.index(evt.name)) - self.assertEquals(len(found_indices), len(names)) + self.assertEqual(len(found_indices), len(names)) def test_profiler_seq_nr(self): with profile(use_kineto=kineto_available()) as p: @@ -2931,6 +2932,21 @@ def test_record_function_callbacks(self): foo_event = [event for event in function_events if "foo" in event.name][0] self.assertEqual(foo_event.count, 1) + def test_record_function_new_signatures(self): + # Test the new _record_function ops work + # Note: Remove once record_function uses these directly + x = torch.randn(10, 10) + with profile(use_kineto=kineto_available()) as p: + record = torch.ops.profiler._record_function_enter_new("bar", None) + try: + y = x * 2 + 4 + finally: + torch.ops.profiler._record_function_exit(record) + + function_events = p.function_events + foo_event = [event for event in function_events if "bar" in event.name][0] + self.assertEqual(foo_event.count, 1) + def test_profiler_aggregation_fake(self): events = EventList() id = [0] @@ -3658,6 +3674,22 @@ def fn(sparse): check(fast_mode=True) check(fast_mode=False) + @unittest.expectedFailure + def test_gradcheck_sparse_csr_input(self): + def check(fast_mode): + def fn(sparse_csr): + return torch.clone(sparse_csr).to_dense() + + # Fails because gradcheck can't work with sparse csr inputs yet + gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csr().requires_grad_(True), check_sparse_nnz=True, + check_batched_grad=False, fast_mode=fast_mode) + + with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'): + gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csr().requires_grad_(True), check_sparse_nnz=False, + check_batched_grad=False, fast_mode=fast_mode) + # check(fast_mode=True) # Segmentation fault + check(fast_mode=False) + def test_gradcheck_nondeterministic(self): class NonDetFunc(Function): @staticmethod @@ -4293,7 +4325,13 @@ def backward(ctx, grad): MyFunction.apply(v).backward() """ s = TestCase.runWithPytorchAPIUsageStderr(code) - self.assertRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") + # The autograd engine creates worker threads only when GPU devices are present. 
+ # So make sure that we do shut down threads when we're testing cuda and make sure + # that there is no thread to shut down when we're not using cuda. + if TEST_CUDA or torch.backends.mps.is_available(): + self.assertRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") + else: + self.assertNotRegex(s, "PYTORCH_API_USAGE torch.autograd.thread_shutdown") @unittest.skipIf(IS_MACOS, "Fails with SIGBUS on macOS; https://github.com/pytorch/pytorch/issues/25941") def test_deep_reentrant(self): @@ -4793,7 +4831,10 @@ def test_grad_fn_attr_bindings(self): self.assertIsInstance(out.grad_fn._saved_output_size[0], int) self.assertEqual(out.grad_fn._saved_align_corners, False) # bool -> bool self.assertIsInstance(out.grad_fn._saved_align_corners, bool) - self.assertIsNone(out.grad_fn._saved_scale_factors) # c10::optional<ArrayRef<double>> -> float[]? + if hasattr(out.grad_fn, '_saved_scale_factors'): + self.assertIsNone(out.grad_fn._saved_scale_factors) # c10::optional<ArrayRef<double>> -> float[]? + else: + self.assertIsNone(out.grad_fn._saved_scales) # c10::optional<ArrayRef<double>> -> float[]? out = torch.nn.functional.interpolate(a, scale_factor=0.5, mode="linear") self.assertIsNone(out.grad_fn._saved_output_size) @@ -6267,7 +6308,14 @@ def test(get_input, cuda, pin_memory): if y.is_sparse: y = y.to_dense() y.sum().backward() - self.assertEqual(2 * a, a.grad) + + actual = 2 * a + expected = a.grad + if a.is_sparse: + actual = actual.coalesce() + expected = expected.coalesce() + + self.assertEqual(actual, expected) for cuda in [False] + ([True] if torch.cuda.is_available() else []): for pin_memory in [True, False]: @@ -6311,1361 +6359,76 @@ def f(x): memory_with_hooks = torch.cuda.memory_allocated() self.assertEqual(memory_with_hooks, memory_without_grad) + def test_pynode_destruction_deadlock(self): + script = """ +import torch -def index_perm_variable(shape, max_indices): - if not isinstance(shape, tuple): - shape = (shape,) - - index = torch.randperm(max_indices).narrow(0, 0, reduce(mul, shape)).view(shape) - return index - -def bernoulli_scalar(): - return torch.tensor(0, dtype=torch.uint8).bernoulli_() - - -class TestAutogradFunctional(TestCase): - def _assert_same_struct(self, res, base): - # base and res should be Tensors or tuple of Tensors with the same size - if isinstance(base, torch.Tensor): - self.assertTrue(isinstance(res, torch.Tensor)) - self.assertEqual(base.size(), res.size()) - elif isinstance(base, tuple): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(base), len(res)) - for el_base, el_res in zip(base, res): - self.assertTrue(isinstance(el_base, torch.Tensor)) - self.assertTrue(isinstance(el_res, torch.Tensor)) - self.assertEqual(el_base.size(), el_res.size()) - else: - # Wrong base - raise RuntimeError("The base given to `_assert_same_struct` doesn't have" - " the right structure.") - - def _assert_interleaved_struct(self, res, base1, base2): - # base1 and base2 can be Tensors or tuples of Tensors. - # If they are tuples, res should be a tuple as well.
- # The indexing works as follows for base1, base2 being - # - tuple, tuple: res[i][j][k][l] = (base1[i][k], base2[j][l]) - # - tuple, Tensor: res[i][k][l] = (base1[i][k], base2[l]) - # - Tensor, tuple: res[i][j][l] = (base1[i], base2[j][l]) - # - Tensor, Tensor: res[k][l] = (base1[k], base2[l]) - if isinstance(base1, torch.Tensor) and isinstance(base2, torch.Tensor): - self.assertTrue(isinstance(res, torch.Tensor)) - self.assertEqual(res.size(), base1.size() + base2.size()) - elif isinstance(base1, tuple) and isinstance(base2, torch.Tensor): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(res), len(base1)) - for el_res, el_base1 in zip(res, base1): - self.assertTrue(isinstance(el_res, torch.Tensor)) - self.assertTrue(isinstance(el_base1, torch.Tensor)) - self.assertEqual(el_res.size(), el_base1.size() + base2.size()) - elif isinstance(base1, torch.Tensor) and isinstance(base2, tuple): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(res), len(base2)) - for el_res, el_base2 in zip(res, base2): - self.assertTrue(isinstance(el_res, torch.Tensor)) - self.assertTrue(isinstance(el_base2, torch.Tensor)) - self.assertEqual(el_res.size(), base1.size() + el_base2.size()) - elif isinstance(base1, tuple) and isinstance(base2, tuple): - self.assertTrue(isinstance(res, tuple)) - self.assertEqual(len(res), len(base1)) - for el_res, el_base1 in zip(res, base1): - self.assertTrue(isinstance(el_res, tuple)) - self.assertEqual(len(res), len(base2)) - for el_el_res, el_base2 in zip(el_res, base2): - self.assertTrue(isinstance(el_el_res, torch.Tensor)) - self.assertTrue(isinstance(el_base2, torch.Tensor)) - self.assertEqual(el_el_res.size(), el_base1.size() + el_base2.size()) - else: - # Wrong bases - raise RuntimeError("The bases given to `_assert_interleaved_struct` don't have" - " the right structure.") - - def test_vjp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3) - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - inp = torch.rand(4) - v = torch.ones(3) - with self.assertRaisesRegex(TypeError, "The inputs given to vjp must be either a Tensor"): - res = autogradF.vjp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vjp must"): - res = autogradF.vjp(bar, inp, v) - - with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the user-provided function returns"): - res = autogradF.vjp(foo, inp) - - with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): - res = autogradF.vjp(foo, inp, (torch.ones_like(inp), torch.ones_like(inp))) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): - res = autogradF.vjp(foo, inp, v[:2]) - - res = autogradF.vjp(foo, inp, v)[1] - self._assert_same_struct(res, inp) - - def test_vjp_err_check_strict(self): - def foo(a): - return a.detach() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone() - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.vjp(foo, inp, v, strict=True) - res = autogradF.vjp(foo, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) 
- - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.vjp(bar, inp, v, strict=True) - res = autogradF.vjp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) - - # The Jacobian does not depend on the input - def foo(a): - return a.clone() - - inp.requires_grad_() - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): - res = autogradF.vjp(foo, inp, v, create_graph=True, strict=True) - res = autogradF.vjp(foo, inp, v, create_graph=True, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1], v) - - def test_vjp_no_grad(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4) - with torch.no_grad(): - res = autogradF.vjp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - inputs.requires_grad_() - v.requires_grad_() - with torch.no_grad(): - res = autogradF.vjp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_vjp_output(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4) - res = autogradF.vjp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y - - inputs = (torch.rand(2), torch.rand(2)) - v = torch.ones(2) - out, vjp_val = autogradF.vjp(adder, inputs, v) - self._assert_same_struct(vjp_val, inputs) - self.assertIsNone(out.grad_fn) - self.assertIsNone(vjp_val[0].grad_fn) - self.assertIsNone(vjp_val[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y, x + y - - inputs = (torch.rand(2), torch.rand(2)) - v = (torch.tensor([1., 0.]), torch.tensor([1., 0.])) - out, vjp_val = autogradF.vjp(adder, inputs, v) - self._assert_same_struct(vjp_val, inputs) - self.assertIsNone(out[0].grad_fn) - self.assertIsNone(out[1].grad_fn) - self.assertIsNone(vjp_val[0].grad_fn) - self.assertIsNone(vjp_val[1].grad_fn) - - def test_vjp_scalar(self): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - v = torch.ones([]) - res = autogradF.vjp(reducer, inputs, v) - self._assert_same_struct(res[0], v) - self._assert_same_struct(res[1], inputs) - - res = autogradF.vjp(reducer, inputs) - self._assert_same_struct(res[0], v) - self._assert_same_struct(res[1], inputs) - - def expander(x): - return x.unsqueeze(0).repeat(4) - inputs = torch.rand([]) - v = torch.ones(4) - res = autogradF.vjp(expander, inputs, v) - self._assert_same_struct(res[0], v) - self._assert_same_struct(res[1], inputs) - - def test_vjp_create_graph(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(2, 2, dtype=torch.double) - v = torch.ones(2, dtype=torch.double) - - inputs.requires_grad_() - v.requires_grad_() - res = autogradF.vjp(reducer, inputs, v, create_graph=True) - self._assert_same_struct(res[1], inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.vjp(reducer, inputs, v, create_graph=True), (inputs, v)) - - def adder(x, y): - return 2 * x + 3 * y, x * y - - inputs = (torch.rand(2, dtype=torch.double, 
requires_grad=True), - torch.rand(2, dtype=torch.double, requires_grad=True)) - v = (torch.tensor([1., 0.], dtype=torch.double, requires_grad=True), - torch.tensor([1., 0.], dtype=torch.double, requires_grad=True)) - - gradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.vjp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.vjp(adder, (x, y), v, create_graph=True) - - return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def test_jvp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3) - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to jvp must be either a Tensor"): - res = autogradF.jvp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jvp must"): - res = autogradF.jvp(bar, inp, v) - - with self.assertRaisesRegex(RuntimeError, "The vector v can only be None if the input to the user-provided function"): - res = autogradF.jvp(foo, inp) - - with self.assertRaisesRegex(RuntimeError, "The given v should contain a single Tensor."): - res = autogradF.jvp(foo, inp, (v, v)) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size: should be torch.Size"): - res = autogradF.jvp(foo, inp, v[:2]) - - res = autogradF.jvp(foo, inp, v)[1] - self._assert_same_struct(res, foo(inp)) - - def test_jvp_err_check_strict(self): - def foo(a): - return a.detach() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone() - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.jvp(foo, inp, v, strict=True) - res = autogradF.jvp(foo, inp, v, strict=False) - self._assert_same_struct(res[1], res[0]) - self.assertEqual(res[1].abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.jvp(bar, inp, v, strict=True) - res = autogradF.jvp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], res[0]) - self.assertEqual(res[1].abs().sum(), 0.) 
- - # The Jacobian does not depend on the input - def foo(a): - return a.clone() - - inp.requires_grad_() - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): - res = autogradF.jvp(foo, inp, v, create_graph=True, strict=True) - res = autogradF.jvp(foo, inp, v, create_graph=True, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1], v) - - def test_jvp_no_grad(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - with torch.no_grad(): - res = autogradF.jvp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - inputs.requires_grad_() - v.requires_grad_() - with torch.no_grad(): - res = autogradF.jvp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_jvp_output(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.jvp(reducer, inputs, v) - self._assert_same_struct(res[1], res[0]) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y - - inputs = (torch.rand(2), torch.rand(2)) - v = (torch.ones(2), torch.ones(2)) - out, jvp_val = autogradF.jvp(adder, inputs, v) - self._assert_same_struct(jvp_val, out) - self.assertIsNone(out.grad_fn) - self.assertIsNone(jvp_val[0].grad_fn) - self.assertIsNone(jvp_val[1].grad_fn) - - def adder(x, y): - return 2 * x + 3 * y, x + y - - inputs = (torch.rand(2), torch.rand(2)) - v = (torch.tensor([1., 0.]), torch.tensor([1., 0.])) - out, jvp_val = autogradF.jvp(adder, inputs, v) - self._assert_same_struct(jvp_val, out) - self.assertIsNone(out[0].grad_fn) - self.assertIsNone(out[1].grad_fn) - self.assertIsNone(jvp_val[0].grad_fn) - self.assertIsNone(jvp_val[1].grad_fn) - - def test_jvp_scalar(self): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.jvp(reducer, inputs, v) - self._assert_same_struct(res[0], torch.zeros([])) - self._assert_same_struct(res[1], res[0]) - - def expander(x): - return x.unsqueeze(0).repeat(4) - inputs = torch.rand([]) - v = torch.ones([]) - res = autogradF.jvp(expander, inputs, v) - self._assert_same_struct(res[0], torch.zeros(4)) - self._assert_same_struct(res[1], res[0]) - - res = autogradF.jvp(expander, inputs) - self._assert_same_struct(res[0], torch.zeros(4)) - self._assert_same_struct(res[1], res[0]) - - def test_jvp_create_graph(self): - def reducer(x): - return x.sum(dim=1) - inputs = torch.rand(2, 2, dtype=torch.double) - v = torch.ones(2, 2, dtype=torch.double) - - inputs.requires_grad_() - v.requires_grad_() - res = autogradF.jvp(reducer, inputs, v, create_graph=True) - self._assert_same_struct(res[1], res[0]) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.jvp(reducer, inp, v, create_graph=True), (inputs, v)) - - def adder(x, y): - return 2 * x + 3 * y, x * y - - inputs = (torch.rand(2, dtype=torch.double, requires_grad=True), - torch.rand(2, dtype=torch.double, requires_grad=True)) - v = (torch.tensor([1., 0.], dtype=torch.double, requires_grad=True), - torch.tensor([1., 0.], dtype=torch.double, requires_grad=True)) - - gradcheck(lambda *args: 
autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.jvp(adder, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.jvp(adder, (x, y), v, create_graph=True) - - return val[0].exp() + val[1].exp() + grad[0].exp() + grad[1].exp() + x.exp() + y.exp() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def _test_construct_standard_basis_for(self, inputs): - numels = tuple(tensor.numel() for tensor in inputs) - results = autogradF._construct_standard_basis_for(inputs, numels) - for result, inp in zip(results, inputs): - self.assertEqual(result.dtype, inp.dtype) - self.assertEqual(result.device, inp.device) - results = torch.cat([result.to(device='cpu', dtype=torch.float) - for result in results], dim=1) - expected = torch.eye(results[0].shape[0], dtype=torch.float) - self.assertEqual(results, expected) - - def test_construct_standard_basis_for(self): - test_cases = [ - (torch.randn(2, 3),), - (torch.randn(1),), - (torch.randn([]),), - (torch.randn(1), torch.randn([]), torch.randn([])), - (torch.randn(2), torch.randn(3), torch.randn([])), - (torch.randn(2), torch.randn([]), torch.randn(3)), - (torch.randn(2, 3), torch.randn(3), torch.randn(3, 4, 2)), - (torch.randn(2, dtype=torch.float64), torch.randn(3, dtype=torch.float32)), - ] - - for inputs in test_cases: - self._test_construct_standard_basis_for(inputs) - - @unittest.skipIf(not TEST_CUDA, "test requires CUDA") - def test_construct_standard_basis_for_cuda(self): - test_cases = [ - (torch.randn(2), torch.randn(3, device='cuda')), - (torch.randn(3, device='cuda'), torch.randn(2)), - ] - - for inputs in test_cases: - self._test_construct_standard_basis_for(inputs) - - def _test_vectorize_raises_no_warnings(self, api): - # vmap is an experimental prototype. When someone calls torch.vmap, - # it raises a python warning. This test checks that - # autogradF.{jacobian, hessian} don't raise that experimental prototype - # warning; it is not nice for a public-facing API to raise a warning - # no matter how it is called. 
- def foo(a): - return (a ** 2).sum() - - x = torch.randn(3) - with warnings.catch_warnings(record=True) as wa: - result = api(foo, x, vectorize=True) - self.assertEqual(len(wa), 0) - - def test_jacobian_vectorize_raises_no_warnings(self): - return self._test_vectorize_raises_no_warnings(autogradF.jacobian) - - def test_hessian_vectorize_raises_no_warnings(self): - return self._test_vectorize_raises_no_warnings(autogradF.hessian) - - def _test_jacobian_err_check(self, vectorize): - def foo(a): - return 3 * a.narrow(0, 0, 3) - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - inp = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to jacobian must be either a Tensor"): - res = autogradF.jacobian(foo, (inp, 2), vectorize=vectorize) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to jacobian must"): - res = autogradF.jacobian(bar, inp, vectorize=vectorize) - - res = autogradF.jacobian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, foo(inp), inp) - - def foo(a, b): - return b, 3 * a.narrow(0, 0, 3) - - inp = (torch.rand(4), torch.rand(5)) - - res = autogradF.jacobian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, foo(*inp), inp) - - def test_jacobian_err_check(self): - return self._test_jacobian_err_check(vectorize=False) - - def test_jacobian_err_check_vectorize(self): - return self._test_jacobian_err_check(vectorize=True) - - def test_jacobian_err_check_strict(self): - def foo(a): - return a.detach() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone() - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.jacobian(foo, inp, strict=True) - res = autogradF.jacobian(foo, inp, strict=False) - self._assert_interleaved_struct(res, foo(inp), inp) - self.assertEqual(res.abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function is independent of input 0."): - res = autogradF.jacobian(bar, inp, strict=True) - res = autogradF.jacobian(bar, inp, strict=False) - self._assert_interleaved_struct(res, foo(inp), inp) - self.assertEqual(res.abs().sum(), 0.) 
- - # The Jacobian does not depend on the input - def foo(a): - return a.clone() - - inp.requires_grad_() - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function is independent of input 0."): - res = autogradF.jacobian(foo, inp, create_graph=True, strict=True) - res = autogradF.jacobian(foo, inp, create_graph=True, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res, torch.eye(4)) - - def test_jacobian_err_check_strict_vectorize(self): - def foo(x): - return x - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "not supported together"): - res = autogradF.jacobian(foo, inp, strict=True, vectorize=True) - - def test_jacobian_no_grad(self): - def exp_reducer(x): - return x.exp().sum(dim=1) - - inputs = torch.rand(4, 4) - with torch.no_grad(): - res = autogradF.jacobian(exp_reducer, inputs) - self.assertIsNone(res.grad_fn) - self.assertNotEqual(res, torch.zeros(4, 4)) - - with torch.no_grad(): - res = autogradF.jacobian(exp_reducer, inputs, create_graph=True) - self.assertIsNotNone(res.grad_fn) - self.assertNotEqual(res, torch.zeros(4, 4)) - - def _test_jacobian_output(self, vectorize): - def exp_reducer(x): - return x.exp().sum(dim=1) - - inputs = torch.rand(4, 4) - res = autogradF.jacobian(exp_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) - self.assertIsNone(res.grad_fn) - - def identity(x): - return x.clone() - - inputs = torch.rand(4) - res = autogradF.jacobian(identity, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, identity(inputs), inputs) - self.assertIsNone(res.grad_fn) - self.assertEqual(res, torch.eye(4)) - - def add_exp_reducer(x, y): - return (x + y.exp()).sum(dim=1) - - inputs = (torch.rand(4, 4), torch.rand(4, 4)) - res = autogradF.jacobian(add_exp_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def test_jacobian_output(self): - self._test_jacobian_output(vectorize=False) - - def test_jacobian_output_vectorize(self): - self._test_jacobian_output(vectorize=True) - - def _test_jacobian_scalar(self, vectorize): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - res = autogradF.jacobian(reducer, inputs, vectorize=vectorize) - self._assert_same_struct(res, inputs) - - def expander(x): - return x.unsqueeze(0).repeat(4) - inputs = torch.rand([]) - res = autogradF.jacobian(expander, inputs, vectorize=vectorize) - self._assert_same_struct(res, torch.zeros(4)) - - def test_jacobian_scalar(self): - self._test_jacobian_scalar(vectorize=False) - - def test_jacobian_scalar_vectorize(self): - self._test_jacobian_scalar(vectorize=True) - - def _test_jacobian_create_graph(self, vectorize): - def exp_reducer(x): - return x.exp().sum(dim=1) - - inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True) - res = autogradF.jacobian(exp_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, exp_reducer(inputs), inputs) - self.assertIsNotNone(res.grad_fn) - - gradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - gradgradcheck(lambda inp: autogradF.jacobian(exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - - def add_exp_reducer(x, y): - return (x + y).exp().sum(dim=1) - - inputs = (torch.rand(4, 4, dtype=torch.double, requires_grad=True), - torch.rand(4, 4, 
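The strict-mode tests above check the documented contract that strict=True raises when an output is independent of an input, while strict=False silently returns a zero Jacobian. A standalone sketch of that behaviour (the detached function is illustrative):

import torch
import torch.autograd.functional as autogradF

def detached(x):
    # detach() cuts the graph, so the output is independent of x for autograd
    return x.detach()

inp = torch.rand(4)
print(autogradF.jacobian(detached, inp).abs().sum())  # tensor(0.)
try:
    autogradF.jacobian(detached, inp, strict=True)
except RuntimeError as err:
    print("strict=True raised:", err)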
dtype=torch.double, requires_grad=True)) - res = autogradF.jacobian(add_exp_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, add_exp_reducer(*inputs), inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - gradgradcheck(lambda *inp: autogradF.jacobian(add_exp_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - - def foo(x, y): - x = x.cos() - val, jac = autogradF.jacobian(add_exp_reducer, (x, y), create_graph=True, vectorize=vectorize) - - res = val[0].exp().sum() + val[1].exp().sum() + jac[0].exp().sum() - res = res + jac[1].exp().sum() + x.exp().sum() + y.exp().sum() - return res - - gradcheck(foo, inputs) - gradgradcheck(foo, inputs) - - def test_jacobian_create_graph(self): - self._test_jacobian_create_graph(vectorize=False) - - def test_jacobian_create_graph_vectorize(self): - self._test_jacobian_create_graph(vectorize=True) - - def _check_jacobian_vectorize_correctness(self, f, inputs, test_forward_ad=True): - expected = autogradF.jacobian(f, inputs, vectorize=False) - result_backward_mode = autogradF.jacobian(f, inputs, vectorize=True) - self.assertEqual(result_backward_mode, expected) - - if test_forward_ad: - result_forward_mode = autogradF.jacobian(f, inputs, strategy="forward-mode", vectorize=True) - self.assertEqual(result_forward_mode, expected) - - def test_jacobian_vectorize_correctness_simple(self): - def f(x): - return 3 * x ** 2 - - x = torch.randn(2, 3, 5) - self._check_jacobian_vectorize_correctness(f, x) - - def test_jacobian_vectorize_correctness_multi_input(self): - def f(x, y): - return (x.cos() * x) @ y.sin() - - x = torch.randn(2, 3) - y = torch.randn(3, 5) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_multi_input_multi_output(self): - def f(x, y): - return (x * x) @ y, x @ (x.sum(1) * y), y.sum() - - x = torch.randn(5, 3) - y = torch.randn(3, 5) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_unrelated_outputs(self): - def f(x, y): - return x, y, x, y - - x = torch.randn(2) - y = torch.randn(3) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_zero_dim(self): - # zero-dim output - def f(x, y): - return x.sum(), y.sum(), x * y - - x = torch.randn(3) - y = torch.randn(3) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - # zero-dim input - def g(x): - return torch.stack([x, x, x]) - - x = torch.randn([]) - self._check_jacobian_vectorize_correctness(g, x) - - # Mixed zero-dim input / zero-dim output - def h(x, y): - return y.sum(), x * y - - x = torch.randn([]) - y = torch.randn(1) - self._check_jacobian_vectorize_correctness(h, (x, y)) - - @unittest.skipIf(not TEST_CUDA, "test requires CUDA") - def test_jacobian_vectorize_correctness_different_devices(self): - def f(x, y): - return x * y, (x * y).cuda() - - x = torch.randn(3) - y = torch.randn(3) - self._check_jacobian_vectorize_correctness(f, (x, y)) - - def test_jacobian_vectorize_correctness_different_dtype(self): - def f(x, y): - return (x * y).float(), (x * y).double() - - x = torch.randn(3) - y = torch.randn(3) - # The Jacobian computed using forward AD has the dtype of the output - # but the Jacobian computed with reverse AD has dtype of input - self._check_jacobian_vectorize_correctness(f, (x, y), test_forward_ad=False) - - def 
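_check_jacobian_vectorize_correctness compares the default reverse-mode result against strategy="forward-mode", which is only supported together with vectorize=True. A small sketch of that comparison, assuming a PyTorch build with forward-mode AD available:

import torch
import torch.autograd.functional as autogradF

def f(x, y):
    return (x.cos() * x) @ y.sin()

x, y = torch.randn(2, 3), torch.randn(3, 5)
jac_rev = autogradF.jacobian(f, (x, y), vectorize=True)  # reverse-mode (default)
jac_fwd = autogradF.jacobian(f, (x, y), vectorize=True, strategy="forward-mode")
for r, fw in zip(jac_rev, jac_fwd):
    torch.testing.assert_close(r, fw)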
_check_hessian_vectorize_correctness(self, f, inputs): - expected = autogradF.hessian(f, inputs, vectorize=False) - result = autogradF.hessian(f, inputs, vectorize=True) - self.assertEqual(result, expected) - - result_forward_mode = autogradF.hessian(f, inputs, outer_jacobian_strategy="forward-mode", vectorize=True) - self.assertEqual(result_forward_mode, expected) - - def test_hessian_vectorize_correctness_simple(self): - def f(x): - return (3 * x ** 2).sum() - - x = torch.randn(2, 3, 5) - self._check_hessian_vectorize_correctness(f, x) - - def test_hessian_vectorize_correctness_multi_input(self): - def f(x, y, z): - return ((x.relu() * x) @ y.sin() @ z).sum() - - x = torch.randn(2, 3) - y = torch.randn(3, 5) - z = torch.randn(5, 5) - self._check_hessian_vectorize_correctness(f, (x, y, z)) - - def test_hessian_vectorize_correctness_unrelated_outputs(self): - # output unrelated to one input - def f(x, y): - return (x ** 2).sum() - - x = torch.randn(2) - y = torch.randn(3) - self._check_hessian_vectorize_correctness(f, (x, y)) - - # output unrelated to all inputs - def f(x, y): - return torch.ones([]) - - x = torch.randn(2) - y = torch.randn(3) - self._check_hessian_vectorize_correctness(f, (x, y)) - - def _test_hessian_err_check(self, vectorize): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - def bar2(a): - return 3 * a.narrow(0, 0, 3) - - def bar3(a): - return 3 * a.narrow(0, 0, 3), 3 * a.narrow(0, 0, 3) - - inp = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to hessian must be either a Tensor"): - res = autogradF.hessian(foo, (inp, 2), vectorize=vectorize) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hessian must"): - res = autogradF.hessian(bar, inp, vectorize=vectorize) - - err_msg_out = "The Tensor returned by the function given to hessian should contain a single element" - with self.assertRaisesRegex(RuntimeError, err_msg_out): - res = autogradF.hessian(bar2, inp, vectorize=vectorize) - - with self.assertRaisesRegex(RuntimeError, "The function given to hessian should return a single Tensor"): - res = autogradF.hessian(bar3, inp, vectorize=vectorize) - - res = autogradF.hessian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, inp, inp) - - def foo(a, b): - return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() - - inp = (torch.rand(4), torch.rand(5)) - - res = autogradF.hessian(foo, inp, vectorize=vectorize) - self._assert_interleaved_struct(res, inp, inp) - - def test_hessian_err_check(self): - self._test_hessian_err_check(vectorize=False) - - def test_hessian_err_check_vectorize(self): - self._test_hessian_err_check(vectorize=True) - - def test_hessian_err_check_strict(self): - def foo(a): - return a.detach().sum() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone().sum() - - def bar2(a): - # A Linear function for which the jacobian is independent of the input - return (3 * a).sum() - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.hessian(foo, inp, strict=True) - res = autogradF.hessian(foo, inp, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res.abs().sum(), 0.) 
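Likewise, _check_hessian_vectorize_correctness exercises torch.autograd.functional.hessian with and without vectorize. A minimal sketch of the output layout for a scalar-valued function:

import torch
import torch.autograd.functional as autogradF

def pow_reducer(x):
    return x.pow(3).sum()

x = torch.rand(2, 2)
hess = autogradF.hessian(pow_reducer, x, vectorize=True)
# for an input of shape (2, 2) the Hessian has shape (2, 2, 2, 2);
# here only the "diagonal" entries hess[i, j, i, j] == 6 * x[i, j] are nonzero
print(hess.shape)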
- - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0"): - res = autogradF.hessian(bar, inp, strict=True) - res = autogradF.hessian(bar, inp, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res.abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): - res = autogradF.hessian(bar2, inp, strict=True) - res = autogradF.hessian(bar2, inp, strict=False) - self._assert_interleaved_struct(res, inp, inp) - self.assertEqual(res.abs().sum(), 0.) - - def test_hessian_err_check_strict_vectorize(self): - def foo(x): - return (x ** 3).sum() - - inp = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "not supported together"): - res = autogradF.hessian(foo, inp, strict=True, vectorize=True) - - def test_hessian_no_grad(self): - def pow_reducer(x): - return x.pow(3).sum() - - inputs = torch.rand(2, 2) - with torch.no_grad(): - res = autogradF.hessian(pow_reducer, inputs) - self.assertIsNone(res[0][0].grad_fn) - self.assertIsNone(res[0][1].grad_fn) - self.assertIsNone(res[1][0].grad_fn) - self.assertIsNone(res[1][1].grad_fn) - self.assertNotEqual(res, torch.zeros(2, 2, 2)) - - with torch.no_grad(): - res = autogradF.hessian(pow_reducer, inputs, create_graph=True) - self.assertIsNotNone(res[0][0].grad_fn) - self.assertIsNotNone(res[0][1].grad_fn) - self.assertIsNotNone(res[1][0].grad_fn) - self.assertIsNotNone(res[1][1].grad_fn) - self.assertNotEqual(res, torch.zeros(2, 2, 2)) - - - def _test_hessian_output(self, vectorize): - def pow_reducer(x): - return x.pow(3).sum() - - inputs = torch.rand(2, 2) - res = autogradF.hessian(pow_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNone(res.grad_fn) - - def add_pow_reducer(x, y): - return (x + y).pow(3).sum() - - inputs = (torch.rand(2, 2), torch.rand(2, 2)) - res = autogradF.hessian(add_pow_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNone(res[0][0].grad_fn) - self.assertIsNone(res[0][1].grad_fn) - self.assertIsNone(res[1][0].grad_fn) - self.assertIsNone(res[1][1].grad_fn) - - def test_hessian_output(self): - self._test_hessian_output(vectorize=False) - - def test_hessian_output_vectorize(self): - self._test_hessian_output(vectorize=True) - - def _test_hessian_scalar(self, vectorize): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - res = autogradF.hessian(reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - - inputs = torch.rand([]) - res = autogradF.hessian(reducer, inputs, vectorize=vectorize) - self._assert_same_struct(res, inputs) - - def bad_reducer(x): - return x.sum().view(1, 1, 1) - inputs = torch.rand(4, 4) - res = autogradF.hessian(bad_reducer, inputs, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - - def test_hessian_scalar(self): - return self._test_hessian_scalar(vectorize=False) - - def test_hessian_scalar_vectorize(self): - return self._test_hessian_scalar(vectorize=True) - - def _test_hessian_create_graph(self, vectorize): - def pow_reducer(x): - return x.pow(3).sum() - - inputs = torch.rand(2, 2, dtype=torch.double, requires_grad=True) - res = autogradF.hessian(pow_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNotNone(res.grad_fn) - - gradcheck(lambda inp: autogradF.hessian(pow_reducer, 
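Several of the tests being moved wrap gradcheck/gradgradcheck around these APIs; both compare analytical gradients against finite differences and therefore expect double-precision inputs with requires_grad=True. A minimal standalone usage sketch:

import torch
from torch.autograd import gradcheck, gradgradcheck

def pow_reducer(x):
    return x.pow(3).sum()

inp = torch.rand(2, 2, dtype=torch.double, requires_grad=True)
assert gradcheck(pow_reducer, (inp,))      # checks first derivatives
assert gradgradcheck(pow_reducer, (inp,))  # checks second derivatives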
inp, create_graph=True, vectorize=vectorize), inputs) - gradgradcheck(lambda inp: autogradF.hessian(pow_reducer, inp, create_graph=True, vectorize=vectorize), inputs) - - def add_pow_reducer(x, y): - return (x + y).pow(3).sum() - - inputs = (torch.rand(2, 2, dtype=torch.double, requires_grad=True), - torch.rand(2, 2, dtype=torch.double, requires_grad=True)) - res = autogradF.hessian(add_pow_reducer, inputs, create_graph=True, vectorize=vectorize) - self._assert_interleaved_struct(res, inputs, inputs) - self.assertIsNotNone(res[0][0].grad_fn) - self.assertIsNotNone(res[0][1].grad_fn) - self.assertIsNotNone(res[1][0].grad_fn) - self.assertIsNotNone(res[1][1].grad_fn) - - def flatten(inp): - return tuple(el_lvl2 for el_lvl1 in inp for el_lvl2 in el_lvl1) - - gradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) - gradgradcheck(lambda *inp: flatten(autogradF.hessian(add_pow_reducer, inp, create_graph=True, vectorize=vectorize)), inputs) - - def foo(x, y): - x = x.cos() - val, hess = autogradF.hessian(add_pow_reducer, (x, y), create_graph=True, vectorize=vectorize) - - res = val[0].cos().sum() + val[1].cos().sum() + hess[0].cos().sum() - res = res + hess[1].cos().sum() + x.cos().sum() + y.cos().sum() - return res - - gradcheck(foo, inputs) - gradgradcheck(foo, inputs) - - def test_hessian_create_graph(self): - self._test_hessian_create_graph(vectorize=False) - - def test_hessian_create_graph_vectorize(self): - self._test_hessian_create_graph(vectorize=True) - - def test_vhp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - def bar2(a): - return 3 * a.narrow(0, 0, 3) - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(TypeError, "The inputs given to vhp must be either a Tensor"): - res = autogradF.vhp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to vhp must"): - res = autogradF.vhp(bar, inp, v) - - err_msg_out = "The Tensor returned by the function given to vhp should contain a single element" - with self.assertRaisesRegex(RuntimeError, err_msg_out): - res = autogradF.vhp(bar2, inp, v) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): - res = autogradF.vhp(foo, inp, torch.rand(5)) - - with self.assertRaisesRegex(TypeError, "The v given to vhp must be either a Tensor or a tuple of Tensors"): - res = autogradF.vhp(foo, inp, (v, 2)) - - res = autogradF.vhp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def foo(a, b): - return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() - - inp = (torch.rand(4), torch.rand(5)) - v = (torch.rand(4), torch.rand(5)) - - res = autogradF.vhp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def test_vhp_err_check_strict(self): - def foo(a): - return a.detach().sum() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone().sum() - - def bar2(a): - # A Linear function for which the jacobian is independent of the input - return (3 * a).sum() +class Foo(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + return x.clone() - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.vhp(foo, inp, v, strict=True) - res = autogradF.vhp(foo, inp, v, strict=False) - 
self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) + @staticmethod + def backward(ctx, gO): + return gO.clone() - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.vhp(bar, inp, v, strict=True) - res = autogradF.vhp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) +def get_out(): + inp = torch.rand(2, requires_grad=True) - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): - res = autogradF.vhp(bar2, inp, v, strict=True) - res = autogradF.vhp(bar2, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) + # The python function is first so that it runs + # last in the backward pass + right = Foo.apply(inp) - def test_vhp_no_grad(self): - def reducer(x): - return x.exp().sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - with torch.no_grad(): - res = autogradF.vhp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) + # An op that creates new memory + left1 = inp.clone() + # An op that saves its input + left2 = left1 ** 2 - with torch.no_grad(): - res = autogradF.vhp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_vhp_output(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.vhp(foo, inputs, v) - self._assert_same_struct(res[1], inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3), torch.rand(4)) - v = (torch.ones(3), torch.ones(4)) - out, vhp_val = autogradF.vhp(bar, inputs, v) - self._assert_same_struct(vhp_val, inputs) - self.assertIsNone(out.grad_fn) - self.assertIsNone(vhp_val[0].grad_fn) - self.assertIsNone(vhp_val[1].grad_fn) - - def test_vhp_scalar(self): - def reducer(x): - return x.sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.vhp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - inputs = torch.rand([]) - v = torch.rand([]) - res = autogradF.vhp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - res = autogradF.vhp(reducer, inputs) - self._assert_same_struct(res[1], inputs) - - def bad_reducer(x): - return x.sum().view(1, 1, 1) - inputs = torch.rand(4, 4) - v = torch.rand(4, 4) - res = autogradF.vhp(bad_reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - def test_vhp_create_graph(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True) - v = torch.ones(4, 4, dtype=torch.double, requires_grad=True) - res = autogradF.vhp(foo, inputs, v, create_graph=True) - self._assert_same_struct(res[1], inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.vhp(foo, inp, v, create_graph=True), (inputs, v)) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3, dtype=torch.double, requires_grad=True), - torch.rand(4, dtype=torch.double,
requires_grad=True)) - v = (torch.ones(3, dtype=torch.double, requires_grad=True), - torch.ones(4, dtype=torch.double, requires_grad=True)) - out, vhp_val = autogradF.vhp(bar, inputs, v, create_graph=True) - self._assert_same_struct(vhp_val, inputs) - self.assertIsNotNone(out.grad_fn) - self.assertIsNotNone(vhp_val[0].grad_fn) - self.assertIsNotNone(vhp_val[1].grad_fn) - - gradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.vhp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.vhp(bar, (x, y), v, create_graph=True) - - return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def test_hvp_err_check(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - def bar(a): - return 3 * a.narrow(0, 0, 3), "bar" - - def bar2(a): - return 3 * a.narrow(0, 0, 3) - - inp = torch.rand(4) - v = torch.rand(4) - res = autogradF.hvp(foo, inp, v) - with self.assertRaisesRegex(TypeError, "The inputs given to hvp must be either a Tensor"): - res = autogradF.hvp(foo, (inp, 2), v) - - with self.assertRaisesRegex(TypeError, "The outputs of the user-provided function given to hvp must"): - res = autogradF.hvp(bar, inp, v) - - err_msg_out = "The Tensor returned by the function given to hvp should contain a single element" - with self.assertRaisesRegex(RuntimeError, err_msg_out): - res = autogradF.hvp(bar2, inp, v) - - with self.assertRaisesRegex(RuntimeError, "v has invalid size:"): - res = autogradF.hvp(foo, inp, torch.rand(5)) - - with self.assertRaisesRegex(TypeError, "The v given to hvp must be either a Tensor or a tuple of Tensors"): - res = autogradF.hvp(foo, inp, (v, 2)) - - res = autogradF.hvp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def foo(a, b): - return (3 * b.narrow(0, 0, 3) * a.narrow(0, 0, 3)).sum() - - inp = (torch.rand(4), torch.rand(5)) - v = (torch.rand(4), torch.rand(5)) - - res = autogradF.hvp(foo, inp, v) - self._assert_same_struct(res[1], inp) - - def test_hvp_err_check_strict(self): - def foo(a): - return a.detach().sum() - - def bar(a): - # Make a non-leaf Tensor that requires_grad but that is not connected to the input - return a.long().float().requires_grad_().clone().sum() - - def bar2(a): - # A Linear function for which the jacobian is independent of the input - return (3 * a).sum() - - inp = torch.rand(4) - v = torch.rand(4) - with self.assertRaisesRegex(RuntimeError, "Output 0 of the user-provided function does not require gradients."): - res = autogradF.hvp(foo, inp, v, strict=True) - res = autogradF.hvp(foo, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "The output of the user-provided function is independent of input 0"): - res = autogradF.hvp(bar, inp, v, strict=True) - res = autogradF.hvp(bar, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) - - with self.assertRaisesRegex(RuntimeError, "jacobian of the user-provided function with respect to input 0 is"): - res = autogradF.hvp(bar2, inp, v, strict=True) - res = autogradF.hvp(bar2, inp, v, strict=False) - self._assert_same_struct(res[1], inp) - self.assertEqual(res[1].abs().sum(), 0.) 
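The vhp/hvp tests being moved both contract the Hessian of a scalar function with a vector, from the left and from the right respectively; since the Hessian is symmetric the two agree. A minimal sketch, assuming a recent PyTorch:

import torch
import torch.autograd.functional as autogradF

def foo(x):
    return (3 * x).exp().sum()

inp = torch.rand(4)
v = torch.ones(4)
_, vhp_val = autogradF.vhp(foo, inp, v)  # v^T @ H
_, hvp_val = autogradF.hvp(foo, inp, v)  # H @ v
torch.testing.assert_close(vhp_val, hvp_val)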
- - def test_hvp_no_grad(self): - def reducer(x): - return x.exp().sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - with torch.no_grad(): - res = autogradF.hvp(reducer, inputs, v) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) + # Inplace modify so that the backward for + # left2 always raises an error + left1 += 1 - with torch.no_grad(): - res = autogradF.hvp(reducer, inputs, v, create_graph=True) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - self.assertNotEqual(res[1], torch.zeros(4, 4)) - - def test_hvp_output(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.hvp(foo, inputs, v) - self._assert_same_struct(res[1], inputs) - self.assertIsNone(res[0].grad_fn) - self.assertIsNone(res[1].grad_fn) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3), torch.rand(4)) - v = (torch.ones(3), torch.ones(4)) - out, hvp_val = autogradF.hvp(bar, inputs, v) - self._assert_same_struct(hvp_val, inputs) - self.assertIsNone(out.grad_fn) - self.assertIsNone(hvp_val[0].grad_fn) - self.assertIsNone(hvp_val[1].grad_fn) - - def test_hvp_scalar(self): - def reducer(x): - return x.exp().sum() - inputs = torch.rand(4, 4) - v = torch.ones(4, 4) - res = autogradF.hvp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - inputs = torch.rand([]) - v = torch.rand([]) - res = autogradF.hvp(reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - res = autogradF.hvp(reducer, inputs) - self._assert_same_struct(res[1], inputs) - - def bad_reducer(x): - return x.exp().sum().view(1, 1, 1) - inputs = torch.rand(4, 4) - v = torch.rand(4, 4) - res = autogradF.hvp(bad_reducer, inputs, v) - self._assert_same_struct(res[1], inputs) - - def test_hvp_create_graph(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() - - inputs = torch.rand(4, 4, dtype=torch.double, requires_grad=True) - v = torch.ones(4, 4, dtype=torch.double, requires_grad=True) - res = autogradF.hvp(foo, inputs, v, create_graph=True) - self._assert_same_struct(res[1], inputs) - self.assertIsNotNone(res[0].grad_fn) - self.assertIsNotNone(res[1].grad_fn) - - gradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) - gradgradcheck(lambda inp, v: autogradF.hvp(foo, inp, v, create_graph=True), (inputs, v)) - - def bar(a, b): - return (a + 3 * b.narrow(0, 0, 3)).exp().sum() - - inputs = (torch.rand(3, dtype=torch.double, requires_grad=True), - torch.rand(4, dtype=torch.double, requires_grad=True)) - v = (torch.ones(3, dtype=torch.double, requires_grad=True), - torch.ones(4, dtype=torch.double, requires_grad=True)) - out, hvp_val = autogradF.hvp(bar, inputs, v, create_graph=True) - self._assert_same_struct(hvp_val, inputs) - self.assertIsNotNone(out.grad_fn) - self.assertIsNotNone(hvp_val[0].grad_fn) - self.assertIsNotNone(hvp_val[1].grad_fn) - - gradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - gradgradcheck(lambda *args: autogradF.hvp(bar, args[:2], args[2:], create_graph=True)[1], inputs + v) - - def foo(*args): - x, y = args[:2] - v = args[2:] - - x = x.cos() - val, grad = autogradF.hvp(bar, (x, y), v, create_graph=True) - - return val.cos() + grad[0].cos().sum() + grad[1].cos() + x.cos().sum() + y.cos() - - gradcheck(foo, inputs + v) - gradgradcheck(foo, inputs + v) - - def test_jacobian_match_vjp_jvp(self): - def 
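The new subprocess test being added in this hunk relies on the standard autograd error for in-place modification of a saved tensor; the deadlock it guards against only appeared while that error was being raised. A minimal standalone sketch of the failure mode it provokes, outside the subprocess machinery:

import torch

a = torch.rand(2, requires_grad=True)
b = a.clone()
c = b ** 2   # pow saves its input for the backward pass
b += 1       # the in-place update bumps b's version counter
try:
    c.sum().backward()
except RuntimeError as err:
    # "one of the variables needed for gradient computation has been
    # modified by an inplace operation"
    print(err)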
foo(x): - return x ** 3 + x.sum() + # An op that takes both side as input. + # After running, both side's last op will be in + # the ready queue + # And the op for left will run first as it was + # executed last during the forward + out = left2 + right - inputs = torch.rand(4) - v = torch.rand(4) + return out - jac = autogradF.jacobian(foo, inputs) - jvp = autogradF.jvp(foo, inputs, v)[1] - vjp = autogradF.vjp(foo, inputs, v)[1] +# Nothing should be global variables here as, from what +# I can see, python leaks all the global objects +get_out().sum().backward() - self.assertEqual(jvp, torch.mm(jac, v.unsqueeze(1)).squeeze(1)) - self.assertEqual(vjp, torch.mm(v.unsqueeze(0), jac).squeeze(0)) +# This used to deadlock when the PyNode is being destroyed after +# the error is raised. +""" + try: + subprocess.check_output( + [sys.executable, '-c', script], + stderr=subprocess.STDOUT, + # On Windows, opening the subprocess with the default CWD makes `import torch` + # fail, so just set CWD to this script's directory + cwd=os.path.dirname(os.path.realpath(__file__)), + # It is ok to have an extra long timeout here as a timeout means the test failed + timeout=20) + except subprocess.TimeoutExpired as e: + self.fail(msg="Example code timed out! See the code sample in the test for details.") + except subprocess.CalledProcessError as e: + err_msg = "RuntimeError: one of the variables needed for gradient computation" + self.assertTrue(err_msg in e.output.decode("utf-8")) - def test_hessian_match_vhp_hvp(self): - def foo(a): - return 3 * a.narrow(0, 0, 3).exp().sum() +def index_perm_variable(shape, max_indices): + if not isinstance(shape, tuple): + shape = (shape,) - inputs = torch.rand(4) - v = torch.rand(4) + index = torch.randperm(max_indices).narrow(0, 0, reduce(mul, shape)).view(shape) + return index - hes = autogradF.hessian(foo, inputs) - hvp = autogradF.hvp(foo, inputs, v)[1] - vhp = autogradF.vhp(foo, inputs, v)[1] +def bernoulli_scalar(): + return torch.tensor(0, dtype=torch.uint8).bernoulli_() - self.assertEqual(hvp, torch.mm(hes, v.unsqueeze(1)).squeeze(1)) - self.assertEqual(vhp, torch.mm(v.unsqueeze(0), hes).squeeze(0)) class TestAutogradForwardModeBatchedGrad(TestCase): def test_out_of_place_basic(self): @@ -7814,6 +6577,18 @@ def test_metadata_check_checks_storage_numel(self): # as_strided runs without error dual.as_strided((5,), (1,), 0) + def test_metadata_check_checks_ignores_size_zero(self): + a = torch.ones(0).as_strided((0, 1,), (1, 1,), 0) + b = torch.ones(0).as_strided((0, 1,), (1, 0,), 0) + + with fwAD.dual_level(): + dual = fwAD.make_dual(a, b) + torch.diagonal(dual, offset=0) + + input = torch.rand([0, 1], dtype=torch.complex128, requires_grad=True) + func = partial(torch.diagonal, offset=0) + torch.autograd.gradcheck(func, (input,), check_forward_ad=True) + def test_metadata_check_when_primal_has_conj_bit(self): # Make sure the _has_same_storage_numel is a fallthrough, so that # conj bit does not materialize. 
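The surrounding forward-AD tests use torch.autograd.forward_ad duals: a dual packs a primal with a tangent, and each op propagates the tangent alongside the value. A minimal sketch of the API these tests assume:

import torch
import torch.autograd.forward_ad as fwAD

primal = torch.rand(3)
tangent = torch.ones(3)
with fwAD.dual_level():
    dual = fwAD.make_dual(primal, tangent)
    out = dual.sin()
    val, jvp = fwAD.unpack_dual(out)
# forward-mode derivative of sin applied to the tangent: cos(primal) * tangent
torch.testing.assert_close(jvp, primal.cos() * tangent)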
If it materializes it would @@ -7910,13 +6685,17 @@ class MySubclass(torch.Tensor): def __new__(cls, data=None): return torch.Tensor._make_subclass(cls, data) + __torch_function__ = torch._C._disabled_torch_function_impl + @classmethod def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - if func == torch.ops.aten.alias: + if func.overloadpacket == torch.ops.aten.alias: counter[0] += 1 - with no_dispatch(): - return MySubclass(torch.ops.aten.alias(*args)) + # Make sure we can re-enable autograd here + with torch.overrides.enable_reentrant_dispatch(): + foo = torch.rand(1, requires_grad=True) + self.assertIsNotNone(foo.exp().grad_fn) with no_dispatch(): return func(*args, **kwargs) @@ -7925,10 +6704,11 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): s = MySubclass(a) with fwAD.dual_level(): + # Only the primal has "alias" called on it fwAD.make_dual(s, torch.rand_like(s)) self.assertEqual(counter[0], 1) fwAD.make_dual(torch.rand_like(s), s) - self.assertEqual(counter[0], 2) + self.assertEqual(counter[0], 1) def test_print(self): with fwAD.dual_level() as level: @@ -8299,6 +7079,35 @@ def test_min_max_median_backprops_to_all_values(self, device): self.assertEqual(x.grad.sum(), 1.) self.assertEqual((x.grad == 1 / 3).sum(), 3) + def test_scatter_index_reduce_amin_amax_backprops_to_all_values(self, device): + # tests that gradients are evenly distributed when there are multiple max/min values + # tested here instead of adding a SampleInput as the backward for this case is non-differentiable for gradgrad + # as is the case for test_min_max_median_backprops_to_all_values above + fns = (torch.scatter_reduce, torch.index_reduce) + reduces = ('amin', 'amax') + for fn, reduction in product(fns, reduces): + input = torch.randn((2, 3), device=device, dtype=torch.float64, requires_grad=True) + src = input.clone().detach_().requires_grad_(True) + idx = torch.arange(2).to(dtype=torch.long, device=device) + if fn == torch.scatter_reduce: + idx = idx.unsqueeze(-1).expand((2, 3)) + + gradcheck(fn, (input, 0, idx, src, reduction), check_batched_grad=False) + + def test_scatter_index_reduce_prod_gradgrad_error(self, device): + # test that double backward raises an error for the case where 2 zeros in src + # are scattered to the same position in self + input = torch.tensor([1.], device=device, dtype=torch.float64, requires_grad=True) + src = torch.tensor([0., 0.], device=device, dtype=torch.float64, requires_grad=True) + idx = torch.tensor([0, 0], device=device, dtype=torch.long) + + for fn in (torch.scatter_reduce, torch.index_reduce): + # check that this case passes on gradcheck + gradcheck(fn, (input, 0, idx, src, 'prod'), check_batched_grad=False) + with self.assertRaisesRegex(RuntimeError, "Double backward is unsupported for"): + gradgradcheck(fn, (input, 0, idx, src, 'prod')) + + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_parameter_resize(self, device): asd = torch.nn.Parameter(torch.ones(16, dtype=torch.double, device=device)) @@ -8310,6 +7119,7 @@ def test_parameter_resize(self, device): m = torch.cat((asd, asd)) m.sum().backward() + @skipIfMps # the test doesn't work on MPS as double types are not supported @dtypes(torch.double, torch.cdouble) def test_sparse_ctor_getter_backward(self, device, dtype): # See NOTE [ Sparse: autograd and API ] on the expected behavior of this test @@ -8346,6 +7156,7 @@ def fn(v): _test(sparse_size + dense_size, len(sparse_size), nnz, device) @skipMeta + @skipIfMps @dtypes(torch.double, torch.cdouble) 
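The new scatter/index reduce tests assert that the amin/amax backward spreads the incoming gradient evenly over tied extremal values. A small sketch of the forward call they exercise, with illustrative values (assuming a PyTorch version that exposes torch.scatter_reduce):

import torch

inp = torch.zeros(1, dtype=torch.float64, requires_grad=True)
src = torch.tensor([1.0, 3.0, 3.0, 2.0], dtype=torch.float64, requires_grad=True)
idx = torch.zeros(4, dtype=torch.long)
out = torch.scatter_reduce(inp, 0, idx, src, "amax")
out.sum().backward()
print(out)       # tensor([3.], ...)
print(src.grad)  # the two tied 3.0 entries share the gradient equally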
def test_sparse_backward(self, device, dtype): class FixedGradientFunction(Function): @@ -8391,6 +7202,7 @@ def backward(ctx, grad_x): # autograd tests via common_method_invocations don't allow input tensors to # be sparse (RuntimeError: gradcheck expects all tensor inputs are dense when # check_sparse_nnz is set to False.) + @skipIfMps def test_sparse_mask_autograd(self, device): tensor = torch.randn(3, requires_grad=True, device=device) mask = torch.ones(3, device=device) @@ -8400,6 +7212,7 @@ def test_sparse_mask_autograd(self, device): converted.sum().backward() self.assertEqual(tensor.grad, mask.to_dense()) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_pyscalar_conversions(self, device): def _test_pyscalar_conversions(t, integral_conv): # integral -> integral @@ -8454,6 +7267,7 @@ def test_nonzero(tensor, value, expected): _test_pyscalar_conversions(lambda x: x.to(device), lambda x: int(x)) + @dtypesIfMPS(torch.float32) @dtypesIfCUDA(torch.half, torch.float, torch.double, torch.int8, torch.int16, torch.int32, torch.int64) @dtypes(torch.float, torch.double, torch.int8, torch.int16, torch.int32, torch.int64) def test_set_requires_grad_only_for_floats(self, device, dtype): @@ -8559,6 +7373,7 @@ def _get_cuda_memory_usage(): self.assertEqual(before, after) + @skipIfMps # the test doesn't work on MPS # TODO: see if these tests can be ported to OpInfos or moved to where's test suite def test_where_functional(self, device): x = torch.randn(5, 5, dtype=torch.double, device=device, requires_grad=True) @@ -8576,6 +7391,7 @@ def where(cond, x, y): gradcheck(where, [cond, x, y], raise_exception=True) gradgradcheck(where, [cond, x, y], [torch.randn(5, 5, 5, device=device)]) + @skipIfMps # the test doesn't work on MPS def test_where_scalar(self, device): x = torch.randn(5, 5, dtype=torch.double, device=device, requires_grad=True) scalar = 4. @@ -8641,6 +7457,7 @@ def test_rnn_backward_to_input_but_not_parameters(self, device): out.sum().backward() self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0) + @skipIfMps # the test doesn't work as randn is not supported with type long @deviceCountAtLeast(1) def test_grad_assignment(self, devices): x = torch.randn(5, 5, device=devices[0]) @@ -8678,6 +7495,7 @@ def test_grad_assignment(self, devices): with self.assertRaises(RuntimeError): x.grad = torch.randn(5, 5, device=devices[1]) + @dtypesIfMPS(torch.float32) @deviceCountAtLeast(1) @dtypes(torch.float, torch.double) def test_requires_grad_factory(self, devices, dtype): @@ -8731,7 +7549,7 @@ def test_copy_(self, device): # At the time of writing this test, copy_ is not generated from native_functions.yaml # there was a bug that bfloat16 was not recognized as floating. x = torch.randn(10, device=device, requires_grad=True) - floating_dt = [dt for dt in get_all_dtypes() if dt.is_floating_point] + floating_dt = floating_types_and(torch.half, torch.bfloat16) for dt in floating_dt: y = torch.empty(10, device=device, dtype=dt) y.copy_(x) @@ -8842,12 +7660,14 @@ def test_inplace_on_view_of_view(self, device): # modify view-of-view and backprop through base root = torch.randn(2, 2, device=device, requires_grad=True) x = root.clone() + v1 = x.narrow(0, 0, 1) v2 = v1.narrow(1, 1, 1) v2.mul_(2) x.sum().backward() self.assertEqual(root.grad.tolist(), [[1, 2], [1, 1]]) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_then_no_grad(self, device): # Perform an in-place operation on a view of a non-leaf variable. 
a = torch.ones(3, 1, dtype=torch.double, device=device, requires_grad=True) @@ -8861,6 +7681,7 @@ def test_inplace_on_view_then_no_grad(self, device): c.sum().backward() + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_gradcheck(self, device): # gradcheck modifications to views a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=True) @@ -8883,6 +7704,7 @@ def test_inplace_on_view_multiple_outputs(self, device): with self.assertRaises(RuntimeError): v1[0].mul_(2) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_of_multiple_output_view(self, device): a = torch.rand(10, dtype=torch.double, device=device, requires_grad=True).clone() b = a.unbind(0) @@ -8890,6 +7712,7 @@ def test_inplace_on_view_of_multiple_output_view(self, device): with self.assertRaises(RuntimeError): c.mul_(2) + @skipIfMps # MPS backend doesn't support double types def test_inplace_multiple_output_view_of_view(self, device): a = torch.rand(10, dtype=torch.double, device=device, requires_grad=True).clone() b = a.view_as(a) @@ -8897,6 +7720,7 @@ def test_inplace_multiple_output_view_of_view(self, device): with self.assertRaises(RuntimeError): c[0].mul_(2) + @skipIfMps # MPS backend doesn't support double types def test_inplace_on_view_makes_base_require_grad(self, device): # in-place modification to view makes base require grad a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=False) @@ -8922,6 +7746,7 @@ def test_inplace_on_view_backprop_view(self, device): self.assertEqual(b.grad.tolist(), [5]) self.assertIsNone(a.grad) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_modify_base(self, device): # Test that an in-place operation on a base that forced it to require # grad also forces any previous views to require grad and backprop @@ -8940,6 +7765,7 @@ def fn(r): gradcheck(fn, [r]) gradgradcheck(fn, [r]) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_inplace_on_view_python(self, device): # in-place modifications of Python-autograd created view a = torch.randn(4, 4, dtype=torch.double, device=device, requires_grad=True) @@ -8996,6 +7822,7 @@ def test_inplace_on_view_multi_output_safe(self, device): with self.assertRaisesRegex(RuntimeError, error_msg): s1.mul_(s2) + @skipIfMps # the test doesn't work on MPS as double types are not supported def test_mv_grad_stride_0(self, device): # Reference: https://github.com/pytorch/pytorch/issues/38315 mat = torch.randn(2, 2, dtype=torch.double, device=device) @@ -9050,6 +7877,7 @@ def test_strided_leaf_grad_layout(self, device): (c * d).sum().backward() self.assertEqual(c.grad.stride(), (2, 1)) + @skipIfMps def test_copy_r_to_c(self, device): out_c = torch.empty(3, 2, dtype=torch.cdouble, device=device) inp_r = torch.randn(3, 2, dtype=torch.double, device=device, @@ -9062,6 +7890,16 @@ def do_test(): self.assertNotWarn(do_test) + def test_to_r_to_c(self, device): + def do_test(): + inp_r = torch.randn(3, 2, dtype=torch.double, device=device, + requires_grad=True) + out = inp_r.to(torch.complex128) + out.sum().backward() + self.assertEqual(inp_r.grad, torch.ones_like(inp_r)) + + self.assertNotWarn(do_test) + def test_non_differentiable_ops(self, device): # Just make sure the op doesn't raise an error # and resulting tensor has requires_grad=False. @@ -9693,6 +8531,7 @@ def fn(x1, x2): # the suppressions. 
from autograd.test_complex import TestAutogradComplex # noqa: F401 +from autograd.test_functional import TestAutogradFunctional # noqa: F401 # e.g., TestAutogradDeviceTypeCPU and TestAutogradDeviceTypeCUDA instantiate_device_type_tests( diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 0bd2a9e4d527..a4d3db0ff82a 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -13,229 +13,74 @@ import operator from functools import partial +import torch.autograd.forward_ad as fwAD from torch._six import inf, nan from torch.testing._internal.common_utils import ( - TestCase, slowTest, iter_indices, TEST_WITH_ASAN, run_tests, gradcheck, - torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, TEST_SCIPY, set_default_dtype) + TestCase, + slowTest, + iter_indices, + TEST_WITH_ASAN, + run_tests, + gradcheck, + torch_to_numpy_dtype_dict, + numpy_to_torch_dtype_dict, + TEST_SCIPY, + set_default_dtype, +) from torch.testing._internal.common_device_type import ( - expectedFailureMeta, instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, - dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyNativeDeviceTypes, - skipCUDAIfRocm, skipIf, ops, OpDTypes) + expectedFailureMeta, + instantiate_device_type_tests, + onlyCUDA, + onlyCPU, + dtypes, + dtypesIfCUDA, + dtypesIfCPU, + deviceCountAtLeast, + precisionOverride, + onlyNativeDeviceTypes, + skipIf, + ops, + OpDTypes, + skipMeta, +) from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - all_types_and_complex_and, integral_types_and, get_all_dtypes, get_all_int_dtypes, get_all_math_dtypes, - get_all_complex_dtypes, get_all_fp_dtypes, + all_types_and_complex_and, + all_types_and, + integral_types, + complex_types, + integral_types_and, + floating_types_and, + floating_and_complex_types, + get_all_math_dtypes, ) from torch.testing._internal.common_methods_invocations import ( - binary_ufuncs, _NOTHING) + binary_ufuncs, + binary_ufuncs_and_refs, + _NOTHING, + generate_elementwise_binary_tensors, + generate_elementwise_binary_small_value_tensors, + generate_elementwise_binary_large_value_tensors, + generate_elementwise_binary_extremal_value_tensors, + generate_elementwise_binary_broadcasting_tensors, + generate_elementwise_binary_with_scalar_samples, +) if TEST_SCIPY: import scipy.special import scipy.integrate -# TODO: remove this -def _generate_input(shape, dtype, device, with_extremal): - if shape == (): - x = torch.tensor((), dtype=dtype, device=device) - else: - if dtype.is_floating_point or dtype.is_complex: - # work around torch.randn not being implemented for bfloat16 - if dtype == torch.bfloat16: - x = torch.randn(*shape, device=device) * random.randint(30, 100) - x = x.to(torch.bfloat16) - else: - x = torch.randn(*shape, dtype=dtype, device=device) * random.randint(30, 100) - x[torch.randn(*shape) > 0.5] = 0 - if with_extremal and dtype.is_floating_point: - # Use extremal values - x[torch.randn(*shape) > 0.5] = float('nan') - x[torch.randn(*shape) > 0.5] = float('inf') - x[torch.randn(*shape) > 0.5] = float('-inf') - elif with_extremal and dtype.is_complex: - x[torch.randn(*shape) > 0.5] = complex('nan') - x[torch.randn(*shape) > 0.5] = complex('inf') - x[torch.randn(*shape) > 0.5] = complex('-inf') - elif dtype == torch.bool: - x = torch.zeros(shape, dtype=dtype, device=device) - x[torch.randn(*shape) > 0.5] = True - else: - x = torch.randint(15, 100, shape, dtype=dtype, device=device) - - return x - -# TODO: refactor this out -# Converts half/bfloat16 dtype 
to float when device is cpu -def _convert_t(dtype, device): - if device == 'cpu' and dtype in {torch.half, torch.bfloat16}: - return torch.float - return dtype - -# TODO: revise the tests to use make_tensor in common_utils.py instead -# Returns a tensor of the requested shape, dtype, and device -# Requesting a half CPU tensor returns a float CPU tensor with -# values representable by a half. -# Initialization uses randint for non-float types and randn for float types. -def _make_tensor(shape, dtype, device, fill_ones=False) -> torch.Tensor: - # Returns a tensor filled with ones - if fill_ones: - return torch.ones(*shape, dtype=_convert_t(dtype, device), device=device) - - # Returns a tensor with random integer values - if not (dtype.is_floating_point or dtype.is_complex): - t = torch.randint(0, 10, shape, device=device) - if dtype != torch.uint8: - t = t - 5 # generate negative values also - return t.to(_convert_t(dtype, device)) - - # Populates the CPU tensor with floats representable as half/bfloat16 - if dtype == torch.half and device == 'cpu': - return torch.randn(*shape, dtype=torch.float, device=device).half().float() - if dtype == torch.bfloat16 and device == 'cpu': - return torch.randn(*shape, dtype=torch.float, device=device).bfloat16().float() - - # Default: returns a tensor with random float values - return torch.randn(shape, dtype=dtype, device=device).to(dtype=dtype) - # TODO: update to use opinfos consistently class TestBinaryUfuncs(TestCase): # Generic tests for elementwise binary (AKA binary universal (u) functions (funcs)) # TODO: below contiguous tensor results are compared with a variety of noncontiguous results. # It would be interesting to have the lhs and rhs have different discontiguities. - # Returns a pair of iterables of contiguous tensors on the requested device - # and with the requested dtype. - # - # This function is intended to test the non-vectorized and vectorized code - # paths of unary functions, as well as their handling of odd tensor - # sizes (like zero-dim tensors and tensors with zero elements). - # - # Each iterable will include an a tensor with no elements, - # zero dim (scalar) tensors, small 1D tensors, a medium 1D tensor, and - # a large 2D tensor. - def _generate_numeric_tensors(self, op, *, device, dtype, lhs_kwargs, rhs_kwargs): - lhs_tensors = [] - rhs_tensors = [] - - shapes = ((0,), # tensors with no elements - (1, 0, 3), - # zero dim (scalar) tensor - (), - # small 1D tensor - (20,), - # medium 1D tensor - (812,), - # large 2D tensor - (1029, 917)) - - for kwargs, tensors in ((lhs_kwargs, lhs_tensors), (rhs_kwargs, rhs_tensors)): - for shape in shapes: - tensors.append(make_tensor(shape, device, dtype, **kwargs)) - - return lhs_tensors, rhs_tensors - - # Returns a pair of iterables of contiguous tensors on the requested device and with - # the requested dtype. - # - # Unlike the previous function, the values in these tensors are specified manually. 
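The hand-rolled _generate_input/_make_tensor helpers deleted above are superseded by torch.testing.make_tensor, which already handles dtype/device placement, value ranges, and noncontiguous layouts. A short usage sketch:

import torch
from torch.testing import make_tensor

t = make_tensor((5, 7), dtype=torch.float32, device="cpu", low=-9, high=9)
u = make_tensor((5, 7), dtype=torch.int64, device="cpu", noncontiguous=True)
print(t.dtype, t.min() >= -9, u.is_contiguous())  # torch.float32 tensor(True) False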
- def _generate_interesting_small_valued_tensors(self, device, dtype): - # defines interesting values - _unsigned_int_vals = (0, 1, 55, 127, 128, 190, 210, 220, 254, 255, 256) - _int_vals = (0, -1, 1, -55, 55, -127, 127, -128, 128) - _float_vals = (0., - -.001, .001, - -.25, .25, - -1., 1., - -math.pi / 2, math.pi / 2, - -math.pi + .00001, math.pi - .00001, - -math.pi, math.pi, - -math.pi - .00001, math.pi + .00001) - - l_vals = [] - r_vals = [] - - if dtype.is_floating_point: - prod = product(_float_vals, _float_vals) - elif dtype.is_complex: - complex_vals = product(_float_vals, _float_vals) - # Note the use of list is required here or the map generator will be - # emptied by the following product and it won't produce the desired cross-product - complex_vals = list(map(lambda x: complex(*x), complex_vals)) - prod = product(complex_vals, complex_vals) - elif dtype in (torch.int8, torch.int16, torch.int32, torch.int64): - prod = product(_int_vals, _int_vals) - elif dtype is torch.uint8: - prod = product(_unsigned_int_vals, _unsigned_int_vals) - else: - raise ValueError("Unsupported dtype!") - - for l, r in prod: - l_vals.append(l) - r_vals.append(r) - - lhs = torch.tensor(l_vals, device=device, dtype=dtype) - rhs = torch.tensor(r_vals, device=device, dtype=dtype) - - return lhs, rhs - - def _generate_interesting_large_valued_tensors(self, device, dtype): - _large_int_vals = (-1113, 1113, -10701, 10701) - _large_float16_vals = (-501, 501, -1001.2, 1001.2, -13437.7, 13437.7) - _large_float_vals = _large_float16_vals + (-4988429.2, 4988429.2, -1e20, 1e20) - - l_vals = [] - r_vals = [] - - if dtype == torch.float16: - prod = product(_large_float16_vals, _large_float16_vals) - elif dtype.is_floating_point: - prod = product(_large_float_vals, _large_float_vals) - elif dtype.is_complex: - complex_vals = product(_large_float_vals, _large_float_vals) - # Note the use of list is required here or the map generator will be - # emptied by the following product and it won't produce the desired cross-product - complex_vals = list(map(lambda x: complex(*x), complex_vals)) - prod = product(complex_vals, complex_vals) - elif dtype in (torch.int16, torch.int32, torch.int64): - prod = product(_large_int_vals, _large_int_vals) - else: - raise ValueError("Unsupported dtype!") - - for l, r in prod: - l_vals.append(l) - r_vals.append(r) - lhs = torch.tensor(l_vals, device=device, dtype=dtype) - rhs = torch.tensor(r_vals, device=device, dtype=dtype) - - return lhs, rhs - - def _generate_interesting_extremal_valued_tensors(self, device, dtype): - _float_extremals = (float('inf'), float('-inf'), float('nan')) - - l_vals = [] - r_vals = [] - - if dtype.is_floating_point: - prod = product(_float_extremals, _float_extremals) - elif dtype.is_complex: - complex_vals = product(_float_extremals, _float_extremals) - # Note the use of list is required here or the map generator will be - # emptied by the following product and it won't produce the desired cross-product - complex_vals = list(map(lambda x: complex(*x), complex_vals)) - prod = product(complex_vals, complex_vals) - else: - raise ValueError("Unsupported dtype!") - - for l, r in prod: - l_vals.append(l) - r_vals.append(r) - lhs = torch.tensor(l_vals, device=device, dtype=dtype) - rhs = torch.tensor(r_vals, device=device, dtype=dtype) - - return lhs, rhs - # Helper for comparing torch tensors and NumPy arrays # TODO: should this or assertEqual also validate that strides are equal? 
- def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs): + def assertEqualHelper( + self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs + ): assert isinstance(actual, torch.Tensor) # Some NumPy functions return scalars, not arrays @@ -249,71 +94,104 @@ def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, * # Also ops like scipy.special.erf, scipy.special.erfc, etc, promote float16 # to float32 if expected.dtype == np.float32: - assert actual.dtype in (torch.float16, torch.bfloat16, torch.float32) + assert actual.dtype in ( + torch.float16, + torch.bfloat16, + torch.float32, + ) else: assert expected.dtype == torch_to_numpy_dtype_dict[actual.dtype] - self.assertEqual(actual, - torch.from_numpy(expected).to(actual.dtype), - msg, - exact_device=False, - **kwargs) + self.assertEqual( + actual, + torch.from_numpy(expected).to(actual.dtype), + msg, + exact_device=False, + **kwargs, + ) else: self.assertEqual(actual, expected, msg, exact_device=False, **kwargs) # Tests that the function and its (array-accepting) reference produce the same # values on given tensors - def _test_reference_numerics(self, dtype, op, tensor_pairs, equal_nan=True): - def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=True): - if not torch.can_cast(numpy_to_torch_dtype_dict[expected.dtype.type], dtype): + def _test_reference_numerics(self, dtype, op, gen, equal_nan=True): + def _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan=True + ): + if not torch.can_cast( + numpy_to_torch_dtype_dict[expected.dtype.type], dtype + ): exact_dtype = False if dtype is torch.bfloat16 and expected.dtype == np.float32: # Ref: https://github.com/pytorch/pytorch/blob/master/torch/testing/_internal/common_utils.py#L1149 - self.assertEqualHelper(actual, expected, msg, dtype=dtype, - exact_dtype=exact_dtype, rtol=16e-3, atol=1e-5) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + exact_dtype=exact_dtype, + rtol=16e-3, + atol=1e-5, + ) else: - self.assertEqualHelper(actual, expected, msg, dtype=dtype, equal_nan=equal_nan, exact_dtype=exact_dtype) - - for l, r in tensor_pairs: - if dtype is torch.bfloat16: - l_numpy = l.cpu().to(torch.float32).numpy() - r_numpy = r.cpu().to(torch.float32).numpy() - else: - l_numpy = l.cpu().numpy() - r_numpy = r.cpu().numpy() + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + equal_nan=equal_nan, + exact_dtype=exact_dtype, + ) + + for sample in gen: + # Each sample input acquired from the generator is just one lhs tensor + # and one rhs tensor + l = sample.input + r = sample.args[0] + + numpy_sample = sample.numpy() + l_numpy = numpy_sample.input + r_numpy = numpy_sample.args[0] actual = op(l, r) expected = op.ref(l_numpy, r_numpy) # Crafts a custom error message for smaller, printable tensors - if l.numel() < 10 and r.numel() < 10: - msg = ("Failed to produce expected results! Input lhs tensor was" - " {0}, rhs tensor was {1}, torch result is {2}, and reference result is" - " {3}.").format(l, r, actual, expected) + def _numel(x): + if isinstance(x, torch.Tensor): + return x.numel() + # Assumes x is a scalar + return 1 + + if _numel(l) < 10 and _numel(r) < 10: + msg = ( + "Failed to produce expected results! Input lhs tensor was" + " {0}, rhs tensor was {1}, torch result is {2}, and reference result is" + " {3}." 
+ ).format(l, r, actual, expected) else: msg = None exact_dtype = True if isinstance(actual, torch.Tensor): - _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan) + _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan + ) else: for x, y in zip(expected, actual): # testing multi-outputs results _helper_reference_numerics(x, y, msg, exact_dtype, equal_nan) # The following tests only apply to elementwise binary operators with references - binary_ufuncs_with_references = list(filter(lambda op: op.ref is not None and op.ref is not _NOTHING, binary_ufuncs)) + binary_ufuncs_with_references = list( + filter(lambda op: op.ref is not None and op.ref is not _NOTHING, binary_ufuncs) + ) @ops(binary_ufuncs_with_references) def test_reference_numerics(self, device, dtype, op): - lhs_tensors, rhs_tensors = self._generate_numeric_tensors(op, - device=device, - dtype=dtype, - lhs_kwargs=op.lhs_make_tensor_kwargs, - rhs_kwargs=op.rhs_make_tensor_kwargs) - - self._test_reference_numerics(dtype, op, zip(lhs_tensors, rhs_tensors), equal_nan=True) + gen = generate_elementwise_binary_tensors(op, device=device, dtype=dtype) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # runtime error: 128 is outside the range of representable values of type 'signed char' @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @@ -322,81 +200,84 @@ def test_reference_numerics_small_values(self, device, dtype, op): if dtype is torch.bool: self.skipTest("Doesn't support bool!") - lhs, rhs = self._generate_interesting_small_valued_tensors(device, dtype) - self._test_reference_numerics(dtype, op, ((lhs, rhs),), equal_nan=True) + gen = generate_elementwise_binary_small_value_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # TODO: review if this skip is necessary @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") - @ops(binary_ufuncs_with_references, - allowed_dtypes=(torch.int16, torch.int32, torch.int64, torch.float16, - torch.bfloat16, torch.float32, torch.float64, torch.complex64, torch.complex128)) + @ops( + binary_ufuncs_with_references, + allowed_dtypes=( + torch.int16, + torch.int32, + torch.int64, + torch.float16, + torch.bfloat16, + torch.float32, + torch.float64, + torch.complex64, + torch.complex128, + ), + ) def test_reference_numerics_large_values(self, device, dtype, op): - lhs, rhs = self._generate_interesting_large_valued_tensors(device, dtype) - self._test_reference_numerics(dtype, op, ((lhs, rhs),), equal_nan=True) + gen = generate_elementwise_binary_large_value_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # TODO: review if this skip is necessary @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") - @ops(binary_ufuncs_with_references, - allowed_dtypes=(torch.float16, torch.bfloat16, torch.float32, - torch.float64, torch.complex64, torch.complex128)) + @ops( + binary_ufuncs_with_references, + allowed_dtypes=( + torch.float16, + torch.bfloat16, + torch.float32, + torch.float64, + torch.complex64, + torch.complex128, + ), + ) def test_reference_numerics_extremal_values(self, device, dtype, op): - lhs, rhs = self._generate_interesting_extremal_valued_tensors(device, dtype) - self._test_reference_numerics(dtype, op, ((lhs, rhs),), equal_nan=True) + gen = generate_elementwise_binary_extremal_value_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) # tests broadcasting and 
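The reformatted helper above keeps the same reference-numerics idea: run the torch op and its NumPy reference on identical values and compare, with dtype slack where NumPy promotes. A minimal sketch of the pattern, outside the OpInfo machinery:

import numpy as np
import torch

lhs = torch.rand(16)
rhs = torch.rand(16)
actual = torch.add(lhs, rhs)
expected = np.add(lhs.numpy(), rhs.numpy())
torch.testing.assert_close(actual, torch.from_numpy(expected))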
noncontiguous broadcasting behavior - @ops(binary_ufuncs_with_references, allowed_dtypes=(torch.long, torch.float32,)) + @ops( + binary_ufuncs_with_references, + allowed_dtypes=( + torch.long, + torch.float32, + ), + ) def test_broadcasting(self, device, dtype, op): - shapes = ( - ((1,), ()), - ((2,), ()), - ((1,), (2,)), - ((2,), (2,)), - ((2, 1), (2,)), - ((1, 2), (2,)), - ((3, 2), (2,)), - ((3, 2), (3, 2)), - ((1, 3, 2), (2,)), - ((1, 3, 2), (3, 2)), - ((3, 1, 2), (3, 2)), - ((1, 3, 2), (1, 3, 2)), - ((2, 3, 2), ()), - ((2, 3, 2), (2, 3, 2)), - ((3, 1, 2), (1, 3, 2)), - ) - - for shape, noncontiguous in product(shapes, [True, False]): - shape_lhs, shape_rhs = shape - lhs = make_tensor(shape_lhs, device=device, dtype=dtype, - noncontiguous=noncontiguous, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape_rhs, device=device, dtype=dtype, - noncontiguous=noncontiguous, **op.rhs_make_tensor_kwargs) - - actual = op(lhs, rhs) - expected = op.ref(lhs.cpu().numpy(), rhs.cpu().numpy()) - - self.assertEqual(actual, expected, exact_dtype=False) - - @ops(binary_ufuncs, allowed_dtypes=(torch.long, torch.float32,)) - def test_broadcast_python_scalar(self, device, dtype, op): - for shape_lhs in ((), (1,), (2,), (1, 2, 3),): - lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - - rhs_tensor = make_tensor((), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) - rhs_expanded = rhs_tensor.expand_as(lhs) - rhs_scalar = rhs_tensor.item() - - expected = op(lhs, rhs_expanded) - - actual_tensor = op(lhs, rhs_tensor) - actual_scalar = op(lhs, rhs_scalar) - - self.assertEqual(actual_tensor, expected) - self.assertEqual(actual_scalar, expected) + gen = generate_elementwise_binary_broadcasting_tensors( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) + + @ops( + binary_ufuncs_with_references, + allowed_dtypes=(torch.long, torch.float32, torch.complex64), + ) + def test_scalar_support(self, device, dtype, op): + gen = generate_elementwise_binary_with_scalar_samples( + op, device=device, dtype=dtype + ) + self._test_reference_numerics(dtype, op, gen, equal_nan=True) @ops(binary_ufuncs) def test_contig_vs_every_other(self, device, dtype, op): - lhs = make_tensor((1026,), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor((1026,), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + (1026,), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + (1026,), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs[::2] rhs_non_contig = rhs[::2] @@ -413,8 +294,12 @@ def test_contig_vs_every_other(self, device, dtype, op): @ops(binary_ufuncs) def test_contig_vs_transposed(self, device, dtype, op): - lhs = make_tensor((789, 357), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor((789, 357), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + (789, 357), device=device, dtype=dtype, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + (789, 357), device=device, dtype=dtype, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs.T rhs_non_contig = rhs.T @@ -433,13 +318,21 @@ def test_contig_vs_transposed(self, device, dtype, op): def test_non_contig(self, device, dtype, op): shapes = ((5, 7), (1024,)) for shape in shapes: - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = 
make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) - lhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[..., 0] + lhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[ + ..., 0 + ] lhs_non_contig.copy_(lhs) - rhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[..., 0] + rhs_non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[ + ..., 0 + ] rhs_non_contig.copy_(rhs) self.assertTrue(lhs.is_contiguous()) @@ -455,8 +348,12 @@ def test_non_contig(self, device, dtype, op): @ops(binary_ufuncs) def test_non_contig_index(self, device, dtype, op): shape = (2, 2, 1, 2) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs[:, 1, ...] lhs = lhs_non_contig.contiguous() @@ -478,8 +375,12 @@ def test_non_contig_index(self, device, dtype, op): def test_non_contig_expand(self, device, dtype, op): shapes = [(1, 3), (1, 7), (5, 7)] for shape in shapes: - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs_non_contig = lhs.clone().expand(3, -1, -1) rhs_non_contig = rhs.clone().expand(3, -1, -1) @@ -498,8 +399,12 @@ def test_non_contig_expand(self, device, dtype, op): @ops(binary_ufuncs) def test_contig_size1(self, device, dtype, op): shape = (5, 100) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs = lhs[:1, :50] lhs_alt = torch.empty(lhs.size(), device=device, dtype=dtype) @@ -522,8 +427,12 @@ def test_contig_size1(self, device, dtype, op): @ops(binary_ufuncs) def test_contig_size1_large_dim(self, device, dtype, op): shape = (5, 2, 3, 1, 4, 5, 3, 2, 1, 2, 3, 4) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) lhs = lhs[:1, :, :, :, :, :, :, :, :, :, :, :] lhs_alt = torch.empty(lhs.size(), device=device, dtype=dtype) @@ -546,8 +455,12 @@ def test_contig_size1_large_dim(self, device, dtype, op): @ops(binary_ufuncs) def test_batch_vs_slicing(self, device, dtype, op): shape = (32, 512) - lhs = make_tensor(shape, device, dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape, device, dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape, dtype=dtype, device=device, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape, dtype=dtype, device=device, **op.rhs_make_tensor_kwargs + ) expected = op(lhs, rhs) @@ -562,40 +475,63 @@ def test_batch_vs_slicing(self, device, dtype, op): # NOTE: because the cross-product of 
all possible type promotion tests is huge, this # just spot checks some handwritten cases. # NOTE: It may be possible to refactor this test into something simpler - @ops(binary_ufuncs, dtypes=OpDTypes.none) + @ops(binary_ufuncs_and_refs, dtypes=OpDTypes.none) def test_type_promotion(self, device, op): supported_dtypes = op.supported_dtypes(torch.device(device).type) + make_lhs = partial( + make_tensor, (5,), device=device, **op.lhs_make_tensor_kwargs + ) + make_rhs = partial( + make_tensor, (5,), device=device, **op.rhs_make_tensor_kwargs + ) + + make_lhs_scalar_tensor = partial( + make_tensor, (), device='cpu', **op.lhs_make_tensor_kwargs + ) + make_rhs_scalar_tensor = partial( + make_tensor, (), device='cpu', **op.rhs_make_tensor_kwargs + ) + def _supported(dtypes): return all(map(lambda x: x in supported_dtypes, dtypes)) # int x int type promotion if _supported((torch.int16, torch.int32, torch.int64)): - lhs_i16 = make_tensor((5,), device=device, dtype=torch.int16, **op.lhs_make_tensor_kwargs) - lhs_i32 = make_tensor((5,), device=device, dtype=torch.int32, **op.lhs_make_tensor_kwargs) - lhs_i64 = make_tensor((5,), device=device, dtype=torch.int64, **op.lhs_make_tensor_kwargs) - - rhs_i16 = make_tensor((5,), device=device, dtype=torch.int16, **op.rhs_make_tensor_kwargs) - rhs_i32 = make_tensor((5,), device=device, dtype=torch.int32, **op.rhs_make_tensor_kwargs) - rhs_i64 = make_tensor((5,), device=device, dtype=torch.int64, **op.rhs_make_tensor_kwargs) + lhs_i16 = make_lhs(dtype=torch.int16) + lhs_i32 = make_lhs(dtype=torch.int32) + lhs_i64 = make_lhs(dtype=torch.int64) + rhs_i16 = make_rhs(dtype=torch.int16) + rhs_i32 = make_rhs(dtype=torch.int32) + rhs_i64 = make_rhs(dtype=torch.int64) if op.promotes_int_to_float: default_dtype = torch.get_default_dtype() self.assertEqual(op(lhs_i16, rhs_i32).dtype, default_dtype) - self.assertEqual(op(lhs_i16, rhs_i32), op(lhs_i16.to(default_dtype), rhs_i32.to(default_dtype))) + self.assertEqual( + op(lhs_i16, rhs_i32), + op(lhs_i16.to(default_dtype), rhs_i32.to(default_dtype)), + ) self.assertEqual(op(lhs_i32, rhs_i64).dtype, default_dtype) - self.assertEqual(op(lhs_i32, rhs_i64), op(lhs_i32.to(default_dtype), rhs_i64.to(default_dtype))) + self.assertEqual( + op(lhs_i32, rhs_i64), + op(lhs_i32.to(default_dtype), rhs_i64.to(default_dtype)), + ) elif op.always_returns_bool: self.assertEqual(op(lhs_i16, rhs_i32).dtype, torch.bool) self.assertEqual(op(lhs_i32, rhs_i64).dtype, torch.bool) else: # standard type promotion self.assertEqual(op(lhs_i16, rhs_i32).dtype, torch.int32) - self.assertEqual(op(lhs_i16, rhs_i32), op(lhs_i16.to(torch.int32), rhs_i32)) + self.assertEqual( + op(lhs_i16, rhs_i32), op(lhs_i16.to(torch.int32), rhs_i32) + ) self.assertEqual(op(lhs_i32, rhs_i64).dtype, torch.int64) - self.assertEqual(op(lhs_i32, rhs_i64), op(lhs_i32.to(torch.int64), rhs_i64)) + self.assertEqual( + op(lhs_i32, rhs_i64), op(lhs_i32.to(torch.int64), rhs_i64) + ) if op.supports_out: if not op.promotes_int_to_float: @@ -606,7 +542,6 @@ def _supported(dtypes): out = torch.empty_like(lhs_i16) self.assertEqual(op(lhs_i32, rhs_i64, out=out).dtype, torch.int16) - self.assertEqual(op(lhs_i32, rhs_i64), out, exact_dtype=False) else: # Float outs cannot be safely cast to integer types with self.assertRaisesRegex(RuntimeError, "can't be cast"): @@ -615,16 +550,18 @@ def _supported(dtypes): if not op.always_returns_bool: # Neither integer nor float outs can be cast to bool with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_i16, rhs_i32, 
out=torch.empty_like(lhs_i64, dtype=torch.bool)) + op( + lhs_i16, + rhs_i32, + out=torch.empty_like(lhs_i64, dtype=torch.bool), + ) # All these output types can be cast to any float or complex type out = torch.empty_like(lhs_i64, dtype=torch.float16) self.assertEqual(op(lhs_i16, rhs_i32, out=out).dtype, torch.float16) - self.assertEqual(op(lhs_i16, rhs_i32), out, exact_dtype=False) out = torch.empty_like(lhs_i64, dtype=torch.bfloat16) self.assertEqual(op(lhs_i16, rhs_i32, out=out).dtype, torch.bfloat16) - self.assertEqual(op(lhs_i16, rhs_i32), out, exact_dtype=False) out = torch.empty_like(lhs_i64, dtype=torch.float32) self.assertEqual(op(lhs_i16, rhs_i32, out=out).dtype, torch.float32) @@ -636,23 +573,24 @@ def _supported(dtypes): # float x float type promotion if _supported((torch.float32, torch.float64)): - lhs_f32 = make_tensor((5,), device=device, dtype=torch.float32, **op.lhs_make_tensor_kwargs) - lhs_f64 = make_tensor((5,), device=device, dtype=torch.float64, **op.lhs_make_tensor_kwargs) + lhs_f32 = make_lhs(dtype=torch.float32) + lhs_f64 = make_lhs(dtype=torch.float64) - rhs_f32 = make_tensor((5,), device=device, dtype=torch.float32, **op.rhs_make_tensor_kwargs) - rhs_f64 = make_tensor((5,), device=device, dtype=torch.float64, **op.rhs_make_tensor_kwargs) + rhs_f32 = make_rhs(dtype=torch.float32) + rhs_f64 = make_rhs(dtype=torch.float64) if op.always_returns_bool: self.assertEqual(op(lhs_f32, rhs_f64).dtype, torch.bool) else: # normal float type promotion self.assertEqual(op(lhs_f32, rhs_f64).dtype, torch.float64) - self.assertEqual(op(lhs_f32, rhs_f64), op(lhs_f32.to(torch.float64), rhs_f64)) + self.assertEqual( + op(lhs_f32, rhs_f64), op(lhs_f32.to(torch.float64), rhs_f64) + ) if op.supports_out: # All these output types can be cast to any float or complex type out = torch.empty_like(lhs_f64, dtype=torch.float16) self.assertEqual(op(lhs_f32, rhs_f64, out=out).dtype, torch.float16) - self.assertEqual(op(lhs_f32, rhs_f64), out, exact_dtype=False) out = torch.empty_like(lhs_f64, dtype=torch.bfloat16) self.assertEqual(op(lhs_f32, rhs_f64, out=out).dtype, torch.bfloat16) @@ -669,7 +607,11 @@ def _supported(dtypes): if not op.always_returns_bool: # float outs can't be cast to an integer dtype with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_f32, rhs_f64, out=torch.empty_like(lhs_f64, dtype=torch.int64)) + op( + lhs_f32, + rhs_f64, + out=torch.empty_like(lhs_f64, dtype=torch.int64), + ) else: # bool outs can be cast to an integer dtype out = torch.empty_like(lhs_f64, dtype=torch.int64) @@ -678,35 +620,49 @@ def _supported(dtypes): # complex x complex type promotion if _supported((torch.complex64, torch.complex128)): - lhs_c64 = make_tensor((5,), device=device, dtype=torch.complex64, **op.lhs_make_tensor_kwargs) - lhs_c128 = make_tensor((5,), device=device, dtype=torch.complex128, **op.lhs_make_tensor_kwargs) + lhs_c64 = make_lhs(dtype=torch.complex64) + lhs_c128 = make_lhs(dtype=torch.complex128) - rhs_c64 = make_tensor((5,), device=device, dtype=torch.complex64, **op.rhs_make_tensor_kwargs) - rhs_c128 = make_tensor((5,), device=device, dtype=torch.complex128, **op.rhs_make_tensor_kwargs) + rhs_c64 = make_rhs(dtype=torch.complex64) + rhs_c128 = make_rhs(dtype=torch.complex128) if op.always_returns_bool: self.assertEqual(op(lhs_c64, lhs_c128).dtype, torch.bool) else: # normal complex type promotion self.assertEqual(op(lhs_c64, rhs_c128).dtype, torch.complex128) - self.assertEqual(op(lhs_c64, rhs_c128), op(lhs_c64.to(torch.complex128), rhs_c128)) + self.assertEqual( + 
op(lhs_c64, rhs_c128), op(lhs_c64.to(torch.complex128), rhs_c128) + ) if op.supports_out: # All these output types can be cast to any or complex type out = torch.empty_like(lhs_c64, dtype=torch.complex64) + self.assertEqual(op(lhs_c64, rhs_c128, out=out).dtype, torch.complex64) - self.assertEqual(op(lhs_c64, rhs_c128), out, exact_dtype=False) + result = op(lhs_c64, rhs_c128) + self.assertEqual(result, out.to(result.dtype)) if not op.always_returns_bool: # complex outs can't be cast to float types with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_c64, rhs_c128, out=torch.empty_like(lhs_c64, dtype=torch.float64)) + op( + lhs_c64, + rhs_c128, + out=torch.empty_like(lhs_c64, dtype=torch.float64), + ) # complex outs can't be cast to an integer dtype with self.assertRaisesRegex(RuntimeError, "can't be cast"): - op(lhs_c64, rhs_c128, out=torch.empty_like(lhs_c64, dtype=torch.int64)) + op( + lhs_c64, + rhs_c128, + out=torch.empty_like(lhs_c64, dtype=torch.int64), + ) else: # bool outs can be cast to a float type out = torch.empty_like(lhs_c64, dtype=torch.float64) - self.assertEqual(op(lhs_c64, rhs_c128, out=out).dtype, torch.float64) + self.assertEqual( + op(lhs_c64, rhs_c128, out=out).dtype, torch.float64 + ) self.assertEqual(op(lhs_c64, rhs_c128), out, exact_dtype=False) # bool outs can be cast to an integer dtype @@ -714,17 +670,139 @@ def _supported(dtypes): self.assertEqual(op(lhs_f32, rhs_f64, out=out).dtype, torch.int64) self.assertEqual(op(lhs_f32, rhs_f64), out, exact_dtype=False) + # int x float type promotion + # Note: float type is the result dtype + if _supported((torch.long, torch.float32)): + lhs_i64 = make_lhs(dtype=torch.int64) + rhs_f32 = make_rhs(dtype=torch.float32) + + result = op(lhs_i64, rhs_f32) + expected_dtype = torch.float32 if not op.always_returns_bool else torch.bool + self.assertEqual(result.dtype, expected_dtype) + + # float x complex type promotion + # Note: complex type with highest "value type" is the result dtype + if _supported((torch.float64, torch.complex64)): + lhs_f64 = make_lhs(dtype=torch.float64) + rhs_c64 = make_rhs(dtype=torch.complex64) + + result = op(lhs_f64, rhs_c64) + expected_dtype = ( + torch.complex128 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # int x float scalar type promotion + # Note: default float dtype is the result dtype + if _supported((torch.int64, torch.float32)) and op.supports_rhs_python_scalar: + lhs_i64 = make_lhs(dtype=torch.int64) + rhs_f_scalar = 1.0 + + result = op(lhs_i64, rhs_f_scalar) + expected_dtype = ( + torch.get_default_dtype() if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # repeats with a scalar float tensor, which should set the dtype + rhs_f32_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.float32) + result = op(lhs_i64, rhs_f32_scalar_tensor) + expected_dtype = torch.float32 if not op.always_returns_bool else torch.bool + self.assertEqual(result.dtype, expected_dtype) + + # Additional test with double + if _supported((torch.float64,)): + rhs_f64_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.float64) + result = op(lhs_i64, rhs_f64_scalar_tensor) + expected_dtype = ( + torch.float64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # float x complex scalar type promotion + # Note: result dtype is complex with highest "value type" among all tensors + if ( + _supported((torch.float32, torch.complex64)) + and 
op.supports_rhs_python_scalar + ): + lhs_f32 = make_lhs(dtype=torch.float32) + rhs_c_scalar = complex(1, 1) + + result = op(lhs_f32, rhs_c_scalar) + expected_dtype = ( + torch.complex64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # repeats with a scalar complex tensor + rhs_c64_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.complex64) + result = op(lhs_f32, rhs_c64_scalar_tensor) + expected_dtype = ( + torch.complex64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # Additional test with complexdouble + if _supported((torch.complex128,)): + rhs_c128_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.complex128) + result = op(lhs_f32, rhs_c128_scalar_tensor) + expected_dtype = ( + torch.complex128 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # float x float scalar tensor + # Note: result dtype is the type of the float tensor + if _supported((torch.float32, torch.float64)) and op.supports_rhs_python_scalar: + lhs_f32 = make_lhs(dtype=torch.float32) + rhs_f64_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.float64) + + result = op(lhs_f32, rhs_f64_scalar_tensor) + expected_dtype = torch.float32 if not op.always_returns_bool else torch.bool + self.assertEqual(result.dtype, expected_dtype) + + # complex x complex scalar tensor + # Note: result dtype is the type of the complex tensor + if ( + _supported((torch.complex64, torch.complex128)) + and op.supports_rhs_python_scalar + ): + lhs_c64 = make_lhs(dtype=torch.complex64) + rhs_c128_scalar_tensor = make_rhs_scalar_tensor(dtype=torch.complex128) + + result = op(lhs_c64, rhs_c128_scalar_tensor) + expected_dtype = ( + torch.complex64 if not op.always_returns_bool else torch.bool + ) + self.assertEqual(result.dtype, expected_dtype) + + # scalar int x scalar float + # Note: result dtype is default float type + # TODO: FIXME: re-enable this, scalar x scalar type promotion is currently broken + # https://github.com/pytorch/pytorch/issues/76801 + # if op.supports_two_python_scalars and _supported((torch.long, torch.float32)): + # lhs_i_scalar = 1 + # rhs_f_scalar = 2. 
+ + # result = op(lhs_i_scalar, rhs_f_scalar) + # expected_dtype = torch.get_default_dtype() if not op.always_returns_bool else torch.bool + # self.assertEqual(result.dtype, expected_dtype) + # TODO: move to error input test @ops(binary_ufuncs, allowed_dtypes=(torch.float32,)) def test_not_broadcastable(self, device, dtype, op): for shape_lhs, shape_rhs in ( - ((2,), (3,)), - ((3, 1), (2, 1)), - ((1, 3, 2), (3,)), - ((3, 1, 2), (2, 1, 2)), + ((2,), (3,)), + ((3, 1), (2, 1)), + ((1, 3, 2), (3,)), + ((3, 1, 2), (2, 1, 2)), ): - lhs = make_tensor(shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs) - rhs = make_tensor(shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs) + lhs = make_tensor( + shape_lhs, device=device, dtype=dtype, **op.lhs_make_tensor_kwargs + ) + rhs = make_tensor( + shape_rhs, device=device, dtype=dtype, **op.rhs_make_tensor_kwargs + ) try: broadcasted_shape = op(lhs, rhs).shape @@ -739,27 +817,48 @@ def test_not_broadcastable(self, device, dtype, op): def test_add_broadcast_empty(self, device): # empty + empty - self.assertRaises(RuntimeError, lambda: torch.randn(5, 0, device=device) + torch.randn(0, 5, device=device)) - self.assertEqual(torch.randn(5, 0, device=device), torch.randn(0, device=device) + torch.randn(5, 0, device=device)) - self.assertEqual(torch.randn(5, 0, 0, device=device), torch.randn(0, device=device) + torch.randn(5, 0, 1, device=device)) + self.assertRaises( + RuntimeError, + lambda: torch.randn(5, 0, device=device) + torch.randn(0, 5, device=device), + ) + self.assertEqual( + torch.randn(5, 0, device=device), + torch.randn(0, device=device) + torch.randn(5, 0, device=device), + ) + self.assertEqual( + torch.randn(5, 0, 0, device=device), + torch.randn(0, device=device) + torch.randn(5, 0, 1, device=device), + ) # scalar + empty - self.assertEqual(torch.randn(5, 0, 6, device=device), torch.randn((), device=device) + torch.randn(5, 0, 6, device=device)) + self.assertEqual( + torch.randn(5, 0, 6, device=device), + torch.randn((), device=device) + torch.randn(5, 0, 6, device=device), + ) # non-empty, empty - self.assertEqual(torch.randn(0, device=device), torch.randn(0, device=device) + torch.randn(1, device=device)) - self.assertEqual(torch.randn(0, 7, 0, 6, 5, 0, 7, device=device), - torch.randn(0, 7, 0, 6, 5, 0, 1, device=device) + torch.randn(1, 1, 5, 1, 7, device=device)) - self.assertRaises(RuntimeError, lambda: torch.randn(7, 0, device=device) + torch.randn(2, 1, device=device)) + self.assertEqual( + torch.randn(0, device=device), + torch.randn(0, device=device) + torch.randn(1, device=device), + ) + self.assertEqual( + torch.randn(0, 7, 0, 6, 5, 0, 7, device=device), + torch.randn(0, 7, 0, 6, 5, 0, 1, device=device) + + torch.randn(1, 1, 5, 1, 7, device=device), + ) + self.assertRaises( + RuntimeError, + lambda: torch.randn(7, 0, device=device) + torch.randn(2, 1, device=device), + ) def test_addcmul_scalars_as_floats(self, device): # zero-dim variables that don't require grad should bind to scalar arguments - x = torch.tensor(2.) 
- y = torch.tensor(3., device=device) + x = torch.tensor(2.0) + y = torch.tensor(3.0, device=device) # 3 + (3 * 3) * 2 self.assertEqual(y.addcmul(y, y, value=x), 21) - x = torch.tensor(2., requires_grad=True) + x = torch.tensor(2.0, requires_grad=True) self.assertRaises(Exception, lambda: y.addcmul(y, y, value=x)) # TODO: update to work on CUDA, too @@ -796,8 +895,8 @@ def test_comparison_ops(self, device): def test_comparison_ops_device_computation(self, device): operands = ( torch.tensor(0), - torch.tensor(2, device='cuda'), - torch.tensor([0, 2], device='cuda') + torch.tensor(2, device="cuda"), + torch.tensor([0, 2], device="cuda"), ) # Checks that comparison operators compute the correct # output device, given a combination of devices @@ -811,38 +910,49 @@ def test_comparison_ops_device_computation(self, device): # TODO: update to work on CUDA, too @onlyCPU def test_comparison_ops_must_take_bool_output(self, device): - for op in [torch.lt, torch.le, torch.gt, torch.ge, torch.eq, torch.ne, - torch.logical_and, torch.logical_or, torch.logical_xor]: - self.assertEqual(op(torch.tensor([True]), torch.tensor([False])).dtype, torch.bool) + for op in [ + torch.lt, + torch.le, + torch.gt, + torch.ge, + torch.eq, + torch.ne, + torch.logical_and, + torch.logical_or, + torch.logical_xor, + ]: + self.assertEqual( + op(torch.tensor([True]), torch.tensor([False])).dtype, torch.bool + ) # TODO: update to work on CUDA, too @onlyCPU def test_comparison_ops_check_for_scalar_overflow(self, device): s = 1 << 20 t = torch.tensor([1 << 5], dtype=torch.uint8) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t < s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s < t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t <= s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s <= t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t > s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s > t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t >= s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s >= t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t == s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s == t) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with 
self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t != s) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(s != t) # TODO: update to work on CUDA, too @@ -852,29 +962,29 @@ def test_comparison_ops_check_for_zerodim_tensor_overflow(self, device): t2 = torch.tensor([1 << 30], dtype=torch.int32) ts1 = torch.tensor(1 << 20, dtype=torch.int32) ts2 = torch.tensor(1 << 40, dtype=torch.int64) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 < ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 < t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 <= ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 <= t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 > ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 > t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 >= ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 >= t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 == ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 == t2) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(t1 != ts1) - with self.assertRaisesRegex(RuntimeError, 'value cannot be converted to type'): + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): self.assertTrue(ts2 != t2) # Tests that the binary operators and, or, and xor (as well as their reflected and inplace versions) @@ -882,7 +992,14 @@ def test_comparison_ops_check_for_zerodim_tensor_overflow(self, device): @dtypes(*integral_types_and(torch.bool)) def test_bitwise_ops(self, device, dtype): # Tensor x Tensor and Tensor x Scalar ops - ops = (operator.and_, operator.iand, operator.or_, operator.ior, operator.xor, operator.ixor) + ops = ( + operator.and_, + operator.iand, + operator.or_, + operator.ior, + operator.xor, + operator.ixor, + ) inplace_ops = (operator.iand, operator.ior, operator.ixor) shapes = ((5,), (15, 15), (500, 500)) @@ -896,12 +1013,12 @@ def test_bitwise_ops(self, device, dtype): # Tests 
tensor x scalar case a = make_tensor(shape, device=device, dtype=dtype) - b_scalar = make_tensor((), device='cpu', dtype=dtype).item() + b_scalar = make_tensor((), device="cpu", dtype=dtype).item() a_np = a.cpu().clone().numpy() self.assertEqual(op(a, b_scalar), op(a_np, b_scalar)) # Tests scalar x tensor case - a_scalar = make_tensor((), device='cpu', dtype=dtype).item() + a_scalar = make_tensor((), device="cpu", dtype=dtype).item() b = make_tensor(shape, device=device, dtype=dtype) b_np = b.cpu().clone().numpy() self.assertEqual(op(a_scalar, b), op(a_scalar, b_np)) @@ -919,7 +1036,7 @@ def test_bitwise_ops(self, device, dtype): # Tests tensor x scalar case a = make_tensor(shape, device=device, dtype=dtype) - b_scalar = make_tensor((), device='cpu', dtype=dtype).item() + b_scalar = make_tensor((), device="cpu", dtype=dtype).item() a_np = a.cpu().clone().numpy() op(a, b_scalar) op(a_np, b_scalar) @@ -932,7 +1049,7 @@ def test_inplace_division(self, device): id_after = id(t) self.assertEqual(id_before, id_after) - @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_div_rounding_modes(self, device, dtype): if dtype.is_floating_point: low, high = -10.0, 10.0 @@ -940,8 +1057,8 @@ def test_div_rounding_modes(self, device, dtype): info = torch.iinfo(dtype) low, high = info.min, info.max - a = make_tensor((100,), device, dtype, low=low, high=high) - b = make_tensor((100,), device, dtype, low=low, high=high) + a = make_tensor((100,), dtype=dtype, device=device, low=low, high=high) + b = make_tensor((100,), dtype=dtype, device=device, low=low, high=high) # Avoid division by zero so we can test (a / b) * b == a if dtype.is_floating_point: @@ -958,17 +1075,23 @@ def test_div_rounding_modes(self, device, dtype): self.assertTrue(d_true.is_floating_point()) self.assertEqual(d_true * b, a.to(d_true.dtype)) - d_floor = torch.divide(a, b, rounding_mode='floor') + d_floor = torch.divide(a, b, rounding_mode="floor") if dtype not in (torch.bfloat16, torch.half): self.assertEqual(d_floor * b + torch.remainder(a, b), a) else: - self.assertEqual(d_floor * b + torch.remainder(a.float(), b.float()), a, - exact_dtype=False) + self.assertEqual( + d_floor * b + torch.remainder(a.float(), b.float()), + a, + exact_dtype=False, + ) - d_trunc = torch.divide(a, b, rounding_mode='trunc') + d_trunc = torch.divide(a, b, rounding_mode="trunc") rounding_unsupported = ( - dtype == torch.half and device != 'cuda' or - dtype == torch.bfloat16 and device != 'cpu') + dtype == torch.half + and device != "cuda" + or dtype == torch.bfloat16 + and device != "cpu" + ) d_ref = d_true.float() if rounding_unsupported else d_true self.assertEqual(d_trunc, d_ref.trunc().to(dtype)) @@ -976,8 +1099,10 @@ def test_div_rounding_modes(self, device, dtype): def test_div_rounding_nonfinite(self, device, dtype): # Compare division of special floating point values against NumPy - num = torch.tensor([1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], - dtype=dtype) + num = torch.tensor( + [1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], + dtype=dtype, + ) # Divide by zero is tested seperately denom = num[num != 0] @@ -991,18 +1116,26 @@ def test_div_rounding_nonfinite(self, device, dtype): an, bn = a.float().cpu().numpy(), b.float().cpu().numpy() for mode, np_ref in ((None, np.true_divide), ("floor", np.floor_divide)): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = np_ref(an, bn) kwargs = 
dict(rounding_mode=mode) if mode is not None else {} with set_default_dtype(torch.double): actual = torch.divide(a, b, **kwargs) - self.assertEqual(actual, torch.from_numpy(expect), - exact_device=False, exact_dtype=exact_dtype) + self.assertEqual( + actual, + torch.from_numpy(expect), + exact_device=False, + exact_dtype=exact_dtype, + ) # Compare contiguous (likely vectorized) against non-contiguous (not vectorized) - a_noncontig = torch.empty([2 * i for i in a.shape], dtype=dtype, device=device)[::2, ::2] + a_noncontig = torch.empty([2 * i for i in a.shape], dtype=dtype, device=device)[ + ::2, ::2 + ] a_noncontig[:] = a - b_noncontig = torch.empty([2 * i for i in b.shape], dtype=dtype, device=device)[::2, ::2] + b_noncontig = torch.empty([2 * i for i in b.shape], dtype=dtype, device=device)[ + ::2, ::2 + ] b_noncontig[:] = b for rounding_mode in (None, "trunc", "floor"): @@ -1012,9 +1145,11 @@ def test_div_rounding_nonfinite(self, device, dtype): @dtypes(torch.bfloat16, torch.half, torch.float32, torch.float64) def test_divide_by_zero_rounding(self, device, dtype): - a = torch.tensor([1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], - dtype=dtype) - exact_dtype = (dtype != torch.bfloat16) + a = torch.tensor( + [1.0, -1.0, 0, 0.1, -0.1, np.pi, -np.pi, np.inf, -np.inf, np.nan], + dtype=dtype, + ) + exact_dtype = dtype != torch.bfloat16 if exact_dtype: an = a.cpu().numpy() else: @@ -1024,7 +1159,7 @@ def test_divide_by_zero_rounding(self, device, dtype): # NOTE: NumPy's floor_divide rounding changed in 1.20.0 to be consistent with divide expect = np.divide(an, 0) - for rounding_mode in (None, 'floor'): + for rounding_mode in (None, "floor"): # CPU scalar actual = torch.divide(a, 0, rounding_mode=rounding_mode) self.assertEqual(actual, expect, exact_dtype=exact_dtype) @@ -1032,16 +1167,14 @@ def test_divide_by_zero_rounding(self, device, dtype): actual = torch.divide(a, zero, rounding_mode=rounding_mode) self.assertEqual(actual, expect, exact_dtype=exact_dtype) - @dtypes(*get_all_dtypes( - include_bool=False, include_complex=False, include_bfloat16=False)) + @dtypes(*all_types_and(torch.half)) def test_div_rounding_numpy(self, device, dtype): - info = (torch.finfo(dtype) if dtype.is_floating_point - else torch.iinfo(dtype)) + info = torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype) low, high = info.min, info.max # Compare division of random values against NumPy - a = make_tensor((4096,), device, dtype, low=low, high=high) - b = make_tensor((4096,), device, dtype, low=low, high=high) + a = make_tensor((4096,), dtype=dtype, device=device, low=low, high=high) + b = make_tensor((4096,), dtype=dtype, device=device, low=low, high=high) # Avoid division by zero which raises for integers and, for floats, # NumPy 1.20 changed floor_divide to follow IEEE rules for inf/nan @@ -1057,34 +1190,39 @@ def test_div_rounding_numpy(self, device, dtype): an, bn = a.float().cpu().numpy(), b.float().cpu().numpy() for mode, np_ref in ( - (None, np.true_divide), - ("floor", np.floor_divide), - ("trunc", lambda a, b: np.trunc(np.true_divide(a, b)).astype(a.dtype)) + (None, np.true_divide), + ("floor", np.floor_divide), + ("trunc", lambda a, b: np.trunc(np.true_divide(a, b)).astype(a.dtype)), ): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = torch.from_numpy(np_ref(an, bn)) kwargs = dict(rounding_mode=mode) if mode is not None else {} # Contiguous (likely vectorized) with set_default_dtype(torch.double): actual = torch.divide(a, b, **kwargs) - 
self.assertEqual(actual, expect, exact_device=False, exact_dtype=exact_dtype) + self.assertEqual( + actual, expect, exact_device=False, exact_dtype=exact_dtype + ) # Non-contiguous (not vectorized) expect = expect[::2] with set_default_dtype(torch.double): actual = torch.divide(a[::2], b[::2], **kwargs) - self.assertEqual(actual, expect, exact_device=False, exact_dtype=exact_dtype) + self.assertEqual( + actual, expect, exact_device=False, exact_dtype=exact_dtype + ) # Tests that trying to add, inplace, a CUDA tensor to a CPU tensor # throws the correct error message @onlyCUDA def test_cross_device_inplace_error_msg(self, device): - a = torch.tensor(2.) - b = torch.tensor(2., device=device) - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): + a = torch.tensor(2.0) + b = torch.tensor(2.0, device=device) + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): a += b # TODO: refactor this test into a more generic one, it's parked here currently @@ -1097,7 +1235,7 @@ def test_out_resize_warning(self, device): binary_inputs = (a, b) unary_ops = (torch.ceil, torch.exp) binary_ops = (torch.add, torch.sub) - for op in (unary_ops + binary_ops): + for op in unary_ops + binary_ops: with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") inputs = unary_inputs if op in unary_ops else binary_inputs @@ -1121,30 +1259,30 @@ def test_inplace_dunders(self, device): t -= 1 t *= 1 t /= 1 - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + t **= 1 + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): t //= 1 t %= 1 self.assertEqual(expected, t.data_ptr()) - def check_internal_mem_overlap(self, inplace_op, num_inputs, - dtype, device, - expected_failure=False): + def check_internal_mem_overlap( + self, inplace_op, num_inputs, dtype, device, expected_failure=False + ): if isinstance(inplace_op, str): inplace_op = getattr(torch.Tensor, inplace_op) input = torch.randn(1, dtype=dtype, device=device).expand(3, 3) - inputs = [input] + [torch.randn_like(input) - for i in range(num_inputs - 1)] + inputs = [input] + [torch.randn_like(input) for i in range(num_inputs - 1)] if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) - def unary_check_input_output_mem_overlap(self, data, sz, op, - expected_failure=False): - + def unary_check_input_output_mem_overlap( + self, data, sz, op, expected_failure=False + ): def _test(op, output, input): output_exp = torch.empty_like(output) op(input, out=output_exp) @@ -1153,93 +1291,114 @@ def _test(op, output, input): # output is identical to input: _test(op, output=data[0:sz], input=data[0:sz]) # output and input are independent: - _test(op, output=data[0:sz], input=data[sz:2 * sz]) + _test(op, output=data[0:sz], input=data[sz : 2 * sz]) # output partially overlaps with input: if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 
'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) - def binary_check_input_output_mem_overlap(self, op, device, - expected_failure=False): + def binary_check_input_output_mem_overlap(self, op, device, expected_failure=False): sz = 3 data = torch.randn(2 * sz, device=device) other = torch.randn(sz, device=device) self.unary_check_input_output_mem_overlap( - data, sz, lambda input, out: op(other, input, out=out), - expected_failure=expected_failure) + data, + sz, + lambda input, out: op(other, input, out=out), + expected_failure=expected_failure, + ) self.unary_check_input_output_mem_overlap( - data, sz, lambda input, out: op(input, other, out=out), - expected_failure=expected_failure) + data, + sz, + lambda input, out: op(input, other, out=out), + expected_failure=expected_failure, + ) @dtypes(torch.double) def test_binary_op_mem_overlap(self, device, dtype): ops = [ - ("add", True, True, 'cpu'), - ("add", True, True, 'cuda'), - ("mul", True, True, 'cpu'), - ("mul", True, True, 'cuda'), - ("sub", True, True, 'cpu'), - ("sub", True, True, 'cuda'), - ("div", True, True, 'cpu'), - ("div", True, True, 'cuda'), - ("pow", True, True, 'cpu'), - ("pow", True, True, 'cuda'), - ("fmod", True, True, 'cpu'), - ("fmod", True, True, 'cuda'), - ("atan2", True, True, 'cpu'), - ("atan2", True, True, 'cuda'), - ("hypot", True, True, 'cpu'), - ("hypot", True, True, 'cuda'), - ("igamma", True, True, 'cpu'), - ("igamma", True, True, 'cuda'), - ("igammac", True, True, 'cpu'), - ("igammac", True, True, 'cuda'), - ("nextafter", True, True, 'cpu'), - ("nextafter", True, True, 'cuda'), - ("le", True, True, 'cpu'), - ("le", True, True, 'cuda'), - ("lt", True, True, 'cpu'), - ("lt", True, True, 'cuda'), - ("ge", True, True, 'cpu'), - ("ge", True, True, 'cuda'), - ("gt", True, True, 'cpu'), - ("gt", True, True, 'cuda'), - ("eq", True, True, 'cpu'), - ("eq", True, True, 'cuda'), - ("ne", True, True, 'cpu'), - ("ne", True, True, 'cuda'), - ("logical_and", True, True, 'cpu'), - ("logical_and", True, True, 'cuda'), - ("logical_or", True, True, 'cpu'), - ("logical_or", True, True, 'cuda'), - ("logical_xor", True, True, 'cpu'), - ("logical_xor", True, True, 'cuda'), + ("add", True, True, "cpu"), + ("add", True, True, "cuda"), + ("mul", True, True, "cpu"), + ("mul", True, True, "cuda"), + ("sub", True, True, "cpu"), + ("sub", True, True, "cuda"), + ("div", True, True, "cpu"), + ("div", True, True, "cuda"), + ("pow", True, True, "cpu"), + ("pow", True, True, "cuda"), + ("fmod", True, True, "cpu"), + ("fmod", True, True, "cuda"), + ("atan2", True, True, "cpu"), + ("atan2", True, True, "cuda"), + ("hypot", True, True, "cpu"), + ("hypot", True, True, "cuda"), + ("igamma", True, True, "cpu"), + ("igamma", True, True, "cuda"), + ("igammac", True, True, "cpu"), + ("igammac", True, True, "cuda"), + ("nextafter", True, True, "cpu"), + ("nextafter", True, True, "cuda"), + ("le", True, True, "cpu"), + ("le", True, True, "cuda"), + ("lt", True, True, "cpu"), + ("lt", True, True, "cuda"), + ("ge", True, True, "cpu"), + ("ge", True, True, "cuda"), + ("gt", True, True, "cpu"), + ("gt", True, True, "cuda"), + ("eq", True, True, "cpu"), + ("eq", True, True, "cuda"), + ("ne", True, True, "cpu"), + ("ne", True, True, "cuda"), + ("logical_and", True, True, "cpu"), + ("logical_and", True, True, "cuda"), + ("logical_or", True, True, "cpu"), + ("logical_or", True, True, "cuda"), + ("logical_xor", True, True, "cpu"), + 
("logical_xor", True, True, "cuda"), ] - for (fn, has_input_output_mem_overlap_check, - has_internal_mem_overlap_check, dev) in ops: + for ( + fn, + has_input_output_mem_overlap_check, + has_internal_mem_overlap_check, + dev, + ) in ops: if dev != device: continue out_op = getattr(torch, fn) - inplace_op = getattr(torch.Tensor, fn + '_') + inplace_op = getattr(torch.Tensor, fn + "_") self.check_internal_mem_overlap( - inplace_op, 2, dtype, device, - expected_failure=not has_internal_mem_overlap_check) + inplace_op, + 2, + dtype, + device, + expected_failure=not has_internal_mem_overlap_check, + ) - self.binary_check_input_output_mem_overlap(out_op, device, - expected_failure=not has_input_output_mem_overlap_check) + self.binary_check_input_output_mem_overlap( + out_op, device, expected_failure=not has_input_output_mem_overlap_check + ) def _do_pow_for_exponents(self, m1, exponents, pow_fn, atol): for num in exponents: - if isinstance(num, int) and num < 0 and not m1.is_floating_point() and not m1.is_complex(): - with self.assertRaisesRegex(RuntimeError, - r'Integers to negative integer powers are not allowed\.'): + if ( + isinstance(num, int) + and num < 0 + and not m1.is_floating_point() + and not m1.is_complex() + ): + with self.assertRaisesRegex( + RuntimeError, + r"Integers to negative integer powers are not allowed\.", + ): torch.pow(m1[4], num) else: # base - tensor, exponent - number @@ -1262,7 +1421,9 @@ def _do_pow_for_exponents(self, m1, exponents, pow_fn, atol): # scalar ** tensor to enforce correct handling of dtypes for __rpow__(). expected_dtype = torch.result_type(num, m1) res1 = num ** m1[4] - res2 = torch.tensor(num, dtype=expected_dtype, device=m1.device) ** m1[4] + res2 = ( + torch.tensor(num, dtype=expected_dtype, device=m1.device) ** m1[4] + ) self.assertEqual(res1, res2) self.assertEqual(res1.dtype, expected_dtype) @@ -1270,14 +1431,27 @@ def _do_pow_for_exponents(self, m1, exponents, pow_fn, atol): def test_pow(self, device, dtype): m1 = torch.empty(0, dtype=dtype, device=device) if m1.is_floating_point() or m1.is_complex(): - m1 = make_tensor((100, 100), low=0, high=1, dtype=dtype, device=device) + 0.5 + m1 = ( + make_tensor((100, 100), low=0, high=1, dtype=dtype, device=device) + 0.5 + ) else: # math.pow will overflow and throw exceptions for large integers range_high = 4 if dtype in (torch.int8, torch.uint8) else 10 - m1 = make_tensor((100, 100), low=1, high=range_high, dtype=dtype, device=device) + m1 = make_tensor( + (100, 100), low=1, high=range_high, dtype=dtype, device=device + ) exponents = [-2.8, -2, -1, -0.5, 0, 0.5, 1, 2, 3, 4, 3.3] - complex_exponents = [-2.5j, -1.0j, 0j, 1.0j, 2.5j, 1.0 + 1.0j, -1.0 - 1.5j, 3.3j] + complex_exponents = [ + -2.5j, + -1.0j, + 0j, + 1.0j, + 2.5j, + 1.0 + 1.0j, + -1.0 - 1.5j, + 3.3j, + ] if m1.is_complex(): self._do_pow_for_exponents(m1, exponents + complex_exponents, pow, 10e-4) else: @@ -1311,7 +1485,11 @@ def to_np(value): try: np_res = np.power(to_np(base), to_np(np_exponent)) - expected = torch.from_numpy(np_res) if isinstance(np_res, np.ndarray) else torch.tensor(np_res, dtype=base.dtype) + expected = ( + torch.from_numpy(np_res) + if isinstance(np_res, np.ndarray) + else torch.tensor(np_res, dtype=base.dtype) + ) except ValueError as e: err_msg = "Integers to negative integer powers are not allowed." 
self.assertEqual(str(e), err_msg) @@ -1320,7 +1498,7 @@ def to_np(value): lambda: base.pow(exponent), lambda: base.pow_(exponent), lambda: torch.pow(base, exponent), - lambda: torch.pow(base, exponent, out=out) + lambda: torch.pow(base, exponent, out=out), ] for test_case in test_cases: self.assertRaisesRegex(RuntimeError, err_msg, test_case) @@ -1331,16 +1509,24 @@ def to_np(value): actual = base.clone() # When base is a 0-dim cpu tensor and exp is a cuda tensor, we exp `pow` to work but `pow_` to fail, since # `pow` will try to create the output tensor on a cuda device, but `pow_` needs to use the cpu tensor as the output - if (isinstance(exponent, torch.Tensor) and base.dim() == 0 and base.device.type == 'cpu' and - exponent.device.type == 'cuda'): - regex = 'Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!' + if ( + isinstance(exponent, torch.Tensor) + and base.dim() == 0 + and base.device.type == "cpu" + and exponent.device.type == "cuda" + ): + regex = "Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!" self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent) elif torch.can_cast(torch.result_type(base, exponent), base.dtype): actual2 = actual.pow_(exponent) self.assertEqual(actual, expected) self.assertEqual(actual2, expected) else: - self.assertRaisesRegex(RuntimeError, "Found dtype \\w+ but expected \\w+", lambda: actual.pow_(exponent)) + self.assertRaisesRegex( + RuntimeError, + "Found dtype \\w+ but expected \\w+", + lambda: actual.pow_(exponent), + ) actual = torch.pow(base, exponent) self.assertEqual(actual, expected.to(actual)) @@ -1354,13 +1540,16 @@ def to_np(value): # a lambada that switches the inputs, because we also want to test samples inputs # where the second input is a scalar. The wrapper would need some more logic. def test_pow_scalar_base(self, device): - a = torch.arange(1, 13, dtype=torch.double, device=device).view(3, 4).requires_grad_() + a = ( + torch.arange(1, 13, dtype=torch.double, device=device) + .view(3, 4) + .requires_grad_() + ) gradcheck(lambda a: torch.pow(2, a), (a,)) # Tests pow() for integral, floating-type tensors, with integral, floating-type # exponents (tensor or scalar), respectively. noncontiguous tensors are also tested. 
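The pow tests above lean on two behaviors worth keeping in mind while reading the diff: a Python scalar raised to a tensor is dispatched through __rpow__ with the result dtype given by torch.result_type, and integer tensors reject negative integer exponents. A minimal sketch of both (assumes the default float dtype is float32; illustrative only, not part of the patch):

import torch

t = torch.tensor([1, 2, 3])                 # int64 tensor

print(2 ** t)                               # __rpow__: tensor([2, 4, 8]), still int64
print(torch.result_type(2.5, t))            # torch.float32, the dtype 2.5 ** t will have
print((2.5 ** t).dtype)                     # torch.float32

try:
    torch.pow(t, -2)                        # negative integer exponent on an integer tensor
except RuntimeError as e:
    print(e)                                # "Integers to negative integer powers are not allowed."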
def test_int_and_float_pow(self, device): - def _test_int_and_float_pow(dt, low, high, dev): test_cases = ( ((4, 4), 0, (4, 1)), @@ -1372,23 +1561,59 @@ def _test_int_and_float_pow(dt, low, high, dev): ((), 2, ()), ) for base_shape, exp_scalar, exp_shape in test_cases: - base_tensor = make_tensor(base_shape, dtype=dt, device=dev, low=low, high=high) + base_tensor = make_tensor( + base_shape, dtype=dt, device=dev, low=low, high=high + ) # int tensors don't take negative exponents - if dt in [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=0, high=high) + if dt in [ + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + ]: + exp_tensor = make_tensor( + exp_shape, dtype=dt, device=dev, low=0, high=high + ) else: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=low, high=high) + exp_tensor = make_tensor( + exp_shape, dtype=dt, device=dev, low=low, high=high + ) self._test_pow(base_tensor, exp_scalar) self._test_pow(base_tensor, exp_tensor) # test non-contiguous tensors as well - base_tensor = make_tensor(base_shape, dtype=dt, device=dev, low=low, high=high, - noncontiguous=True) - if dt in [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=0, high=high, - noncontiguous=True) + base_tensor = make_tensor( + base_shape, + dtype=dt, + device=dev, + low=low, + high=high, + noncontiguous=True, + ) + if dt in [ + torch.uint8, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + ]: + exp_tensor = make_tensor( + exp_shape, + dtype=dt, + device=dev, + low=0, + high=high, + noncontiguous=True, + ) else: - exp_tensor = make_tensor(exp_shape, dtype=dt, device=dev, low=low, high=high, - noncontiguous=True) + exp_tensor = make_tensor( + exp_shape, + dtype=dt, + device=dev, + low=low, + high=high, + noncontiguous=True, + ) self._test_pow(base_tensor, exp_scalar) self._test_pow(base_tensor, exp_tensor) @@ -1397,12 +1622,12 @@ def _test_int_and_float_pow(dt, low, high, dev): _test_int_and_float_pow(torch.int16, -5, 5, device) _test_int_and_float_pow(torch.int64, -10, 10, device) _test_int_and_float_pow(torch.int32, -10, 10, device) - _test_int_and_float_pow(torch.float16, 0., 5., device) - _test_int_and_float_pow(torch.float32, 0., 10., device) - _test_int_and_float_pow(torch.float64, 0., 10., device) + _test_int_and_float_pow(torch.float16, 0.0, 5.0, device) + _test_int_and_float_pow(torch.float32, 0.0, 10.0, device) + _test_int_and_float_pow(torch.float64, 0.0, 10.0, device) # pow's output would have some NaNs as well - _test_int_and_float_pow(torch.float32, -10., 10., device) - _test_int_and_float_pow(torch.float64, -10., 10., device) + _test_int_and_float_pow(torch.float32, -10.0, 10.0, device) + _test_int_and_float_pow(torch.float64, -10.0, 10.0, device) # Tests that a Runtime error occurs when a base tensor cannot be resized # by pow's inplace variant due to PyTorch's broadcasting semantics. 
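The next hunk reworks test_pow_inplace_resizing_exception, which checks that pow_ refuses to broadcast its base to a larger shape, since in-place ops never resize their destination. A small sketch of the behavior under test, using one of the shape pairs from the test (illustrative only, not part of the patch):

import torch

base = torch.rand(2, 1)
exponent = torch.rand(2, 2)

print(torch.pow(base, exponent).shape)      # torch.Size([2, 2]); out-of-place broadcasts freely

try:
    base.pow_(exponent)                     # would require resizing base from (2, 1) to (2, 2)
except RuntimeError as e:
    print(e)                                # "... doesn't match the broadcast shape ..."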
@@ -1413,19 +1638,33 @@ def test_pow_inplace_resizing_exception(self, device): ((2, 1), (2, 2)), ((2, 2), (2, 1, 1)), ) - test_inputs = list((make_tensor(base_size, dtype=torch.float64, device=device, - high=10., low=0.), - make_tensor(exp_size, dtype=torch.float64, device=device, - high=10., low=0.)) - for base_size, exp_size in test_cases) + test_inputs = list( + ( + make_tensor( + base_size, dtype=torch.float64, device=device, high=10.0, low=0.0 + ), + make_tensor( + exp_size, dtype=torch.float64, device=device, high=10.0, low=0.0 + ), + ) + for base_size, exp_size in test_cases + ) for base, exponent in test_inputs: regex = "doesn't match the broadcast shape" self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent) def test_int_tensor_pow_neg_ints(self, device): - ints = [torch.iinfo(torch.int32).min, - -3, -2, -1, 0, 1, 2, 3, - torch.iinfo(torch.int32).max] + ints = [ + torch.iinfo(torch.int32).min, + -3, + -2, + -1, + 0, + 1, + 2, + 3, + torch.iinfo(torch.int32).max, + ] neg_ints = [torch.iinfo(torch.int32).min, -3, -2, -1] tensor = torch.tensor(ints, dtype=torch.int32, device=device) for pow in neg_ints: @@ -1440,16 +1679,17 @@ def test_long_tensor_pow_floats(self, device): @dtypes(*[torch.float32, torch.float64]) def test_float_scalar_pow_float_tensor(self, device, dtype): - floats = [2.0, -3 / 2, -1.0, -1 / 2, -1 / 3, 0.0, - 1 / 3, 1 / 2, 1.0, 3 / 2, 2.0] + floats = [2.0, -3 / 2, -1.0, -1 / 2, -1 / 3, 0.0, 1 / 3, 1 / 2, 1.0, 3 / 2, 2.0] exponent_shapes = ( (1,), (2, 2), (2, 1), (2, 2, 2), ) - tensors = list(make_tensor(shape, dtype=dtype, device=device, low=0) - for shape in exponent_shapes) + tensors = list( + make_tensor(shape, dtype=dtype, device=device, low=0) + for shape in exponent_shapes + ) floats_tensor = torch.tensor(floats, dtype=dtype, device=device) for base in floats: self._test_pow(base, floats_tensor) @@ -1458,38 +1698,50 @@ def test_float_scalar_pow_float_tensor(self, device, dtype): @onlyCUDA def test_cuda_tensor_pow_scalar_tensor(self, device): - cuda_tensors = [torch.randn((3, 3), device=device), torch.tensor(3.0, device=device)] - scalar_tensors = [torch.tensor(5.0, device='cpu'), torch.tensor(-3), torch.tensor(1)] + cuda_tensors = [ + torch.randn((3, 3), device=device), + torch.tensor(3.0, device=device), + ] + scalar_tensors = [ + torch.tensor(5.0, device="cpu"), + torch.tensor(-3), + torch.tensor(1), + ] for base, exp in product(cuda_tensors, scalar_tensors): self._test_pow(base, exp) @onlyCUDA def test_cpu_tensor_pow_cuda_scalar_tensor(self, device): - cuda_tensors = [torch.tensor(5.0, device='cuda'), torch.tensor(-3, device='cuda')] + cuda_tensors = [ + torch.tensor(5.0, device="cuda"), + torch.tensor(-3, device="cuda"), + ] for exp in cuda_tensors: - base = torch.randn((3, 3), device='cpu') - regex = 'Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!' + base = torch.randn((3, 3), device="cpu") + regex = "Expected all tensors to be on the same device, but found at least two devices, cuda.* and cpu!" 
self.assertRaisesRegex(RuntimeError, regex, torch.pow, base, exp) for exp in cuda_tensors: # Binary ops with a cpu + cuda tensor are allowed if the cpu tensor has 0 dimension - base = torch.tensor(3.0, device='cpu') + base = torch.tensor(3.0, device="cpu") self._test_pow(base, exp) @onlyCUDA @dtypes(torch.complex64, torch.complex128) def test_pow_cuda_complex_extremal_failing(self, device, dtype): - t = torch.tensor(complex(-1., float('inf')), dtype=dtype, device=device) + t = torch.tensor(complex(-1.0, float("inf")), dtype=dtype, device=device) with self.assertRaises(AssertionError): cuda_out = t.pow(2) cpu_out = t.cpu().pow(2) self.assertEqual(cpu_out, cuda_out) @onlyNativeDeviceTypes - @dtypes(*(get_all_dtypes(include_bool=False, include_bfloat16=False))) + @dtypes(*all_types_and_complex_and(torch.half)) def test_complex_scalar_pow_tensor(self, device, dtype): - complexes = [0.5j, 1. + 1.j, -1.5j, 2.2 - 1.6j, 1 + 0j] - first_exp = make_tensor((100,), device, dtype, low=-2, high=2) - second_exp = make_tensor((100,), device, dtype, low=-2, high=2, noncontiguous=True) + complexes = [0.5j, 1.0 + 1.0j, -1.5j, 2.2 - 1.6j, 1 + 0j] + first_exp = make_tensor((100,), dtype=dtype, device=device, low=-2, high=2) + second_exp = make_tensor( + (100,), dtype=dtype, device=device, low=-2, high=2, noncontiguous=True + ) first_exp[0] = first_exp[10] = first_exp[20] = 0 second_exp[0] = second_exp[10] = second_exp[20] = 0 for base in complexes: @@ -1497,20 +1749,32 @@ def test_complex_scalar_pow_tensor(self, device, dtype): self._test_pow(base, second_exp) @onlyNativeDeviceTypes + @skipMeta def test_pow_scalar_type_promotion(self, device): # Test against a scalar and non-scalar input inputs = [17, [17]] for input in inputs: # We expect the computation to be performed in uint8 (overflowing to 0), and then cast to int64 input_tensor_uint8 = torch.tensor(input, dtype=torch.uint8, device=device) - out_uint8_computation = torch.pow(2, input_tensor_uint8, out=torch.tensor(0, dtype=torch.int64, device=device)) + out_uint8_computation = torch.pow( + 2, + input_tensor_uint8, + out=torch.tensor(0, dtype=torch.int64, device=device), + ) # Computation should run in int64, and not overflow input_tensor_int64 = torch.tensor(input, dtype=torch.int64, device=device) - out_int64_computation = torch.pow(2, input_tensor_int64, out=torch.tensor(0, dtype=torch.int64, device=device)) + out_int64_computation = torch.pow( + 2, + input_tensor_int64, + out=torch.tensor(0, dtype=torch.int64, device=device), + ) self.assertNotEqual(out_uint8_computation, out_int64_computation) - self.assertEqual(out_uint8_computation.to(dtype=torch.uint8), out_int64_computation.to(dtype=torch.uint8)) + self.assertEqual( + out_uint8_computation.to(dtype=torch.uint8), + out_int64_computation.to(dtype=torch.uint8), + ) def test_tensor_pow_tensor(self, device): def rotate(l, n): @@ -1530,26 +1794,24 @@ def test_tensor_pow_tensor(values, torch_type, numpy_type): test_tensor_pow_tensor(ints, torch.int32, np.int32) test_tensor_pow_tensor(ints, torch.int64, np.int64) - floats = [-3.0, -2.0, -1.0, -1 / 2, -1 / 3, - 0.0, 1 / 3, 1 / 2, 1.0, 2.0, 3.0] + floats = [-3.0, -2.0, -1.0, -1 / 2, -1 / 3, 0.0, 1 / 3, 1 / 2, 1.0, 2.0, 3.0] test_tensor_pow_tensor(floats, torch.float16, np.float16) test_tensor_pow_tensor(floats, torch.float32, np.float32) test_tensor_pow_tensor(floats, torch.float64, np.float64) - def test_logical_xor_with_nontrivial_alignment(self, device): # test tensor that is not aligned to multiple of 16 bytes size = 128 - a = (torch.randn(size, 
device=device) > 0) - b = (torch.randn(size, device=device) > 0) - c = (torch.randn(size, device=device) > 0) + a = torch.randn(size, device=device) > 0 + b = torch.randn(size, device=device) > 0 + c = torch.randn(size, device=device) > 0 non_trivial_alignment = [1, 2, 4, 8, 15] for i in non_trivial_alignment: for j in non_trivial_alignment: for k in non_trivial_alignment: - a_ = a[i: 100 + i] - b_ = b[j: 100 + j] - c_ = c[k: 100 + k] + a_ = a[i : 100 + i] + b_ = b[j : 100 + j] + c_ = c[k : 100 + k] torch.logical_xor(a_, b_, out=c_) for x, y, z in zip(a_.tolist(), b_.tolist(), c_.tolist()): self.assertEqual(x ^ y, z) @@ -1572,7 +1834,7 @@ def test_add_with_tail(self, device, dtype): @deviceCountAtLeast(2) @onlyCUDA def test_cross_device_binary_ops(self, devices): - vals = (1., (2.,)) + vals = (1.0, (2.0,)) cpu_tensor = torch.randn(2, 2) def do_test(op, a, b): @@ -1585,11 +1847,18 @@ def do_test(op, a, b): with self.assertRaisesRegex(RuntimeError, "Expected all tensors.+"): op(cpu_tensor, a) - for op in (operator.add, torch.add, - operator.sub, torch.sub, - operator.mul, torch.mul, - operator.truediv, torch.true_divide, - operator.floordiv, torch.floor_divide): + for op in ( + operator.add, + torch.add, + operator.sub, + torch.sub, + operator.mul, + torch.mul, + operator.truediv, + torch.true_divide, + operator.floordiv, + torch.floor_divide, + ): for a, b in product(vals, vals): a = torch.tensor(a, device=devices[0]) b = torch.tensor(b, device=devices[1]) @@ -1602,7 +1871,7 @@ def do_test(op, a, b): @deviceCountAtLeast(2) @onlyCUDA def test_binary_op_scalar_device_unspecified(self, devices): - scalar_val = torch.tensor(1.) + scalar_val = torch.tensor(1.0) for default_device in devices: with torch.cuda.device(default_device): for device in devices: @@ -1621,7 +1890,7 @@ def test_div_and_floordiv_vs_python(self, device): # the quotient. See https://github.com/pytorch/pytorch/issues/43874. 
def _scalar_helper(python_op, torch_op): for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1648,7 +1917,7 @@ def _scalar_helper(python_op, torch_op): _scalar_helper(operator.truediv, operator.truediv) _scalar_helper(operator.truediv, torch.true_divide) - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): _scalar_helper(lambda a, b: math.trunc(a / b), operator.floordiv) _scalar_helper(lambda a, b: math.trunc(a / b), torch.floor_divide) @@ -1666,7 +1935,7 @@ def _wrapped_floordiv(a, b): scripted_div = torch.jit.script(_wrapped_div) scripted_floordiv = torch.jit.script(_wrapped_floordiv) for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1680,7 +1949,7 @@ def _wrapped_floordiv(a, b): b_t = torch.tensor(b, device=device) self.assertEqual(scripted_div(a_t, b_t), expected_div) - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): self.assertEqual(scripted_floordiv(a_t, b_t), expected_truncdiv) # Creates jitted functions of one tensor @@ -1705,13 +1974,13 @@ def _wrapped_rfloordiv_scalar(a): scripted_rfloordiv_scalar = torch.jit.script(_wrapped_rfloordiv_scalar) for a in range(-10, 10): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) a_t = torch.tensor(a, device=device) self.assertEqual(a / 5, scripted_div_scalar(a_t)) - with self.assertWarnsOnceRegex(UserWarning, 'floor_divide'): + with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): self.assertEqual(math.trunc(a / 5), scripted_floordiv_scalar(a_t)) # Skips zero divisors @@ -1780,7 +2049,7 @@ def _wrapped_ifloordiv_scalar(a): scripted_floor_divide__scalar = torch.jit.script(_wrapped_floor_divide__scalar) for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1804,8 +2073,13 @@ def _wrapped_ifloordiv_scalar(a): self.assertEqual(tmp0.item(), expected_idiv) self.assertEqual(tmp1.item(), expected_idiv) - self.assertEqual(scripted_true_divide__tensor(a_t.clone(), b_t).item(), expected_idiv) - self.assertEqual(scripted_true_divide__scalar(a_t.clone()).item(), a / 5) + self.assertEqual( + scripted_true_divide__tensor(a_t.clone(), b_t).item(), + expected_idiv, + ) + self.assertEqual( + scripted_true_divide__scalar(a_t.clone()).item(), a / 5 + ) else: tmp = a_t.clone() with self.assertRaises(RuntimeError): @@ -1817,42 +2091,56 @@ def _wrapped_ifloordiv_scalar(a): with self.assertRaises(RuntimeError): scripted_true_divide__scalar(tmp) - if not a_t.is_floating_point() and b_t.is_floating_point(): # Inplace modification fails because a float tensor is required # if the divisor is a float tensor - with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex( + UserWarning, "floor_divide" + ): a_t.clone().floor_divide_(b_t) - with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex( + UserWarning, "floor_divide" + ): 
scripted_floor_divide_tensor(a_t.clone(), b_t) tmp = a_t.clone() - with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex(UserWarning, "floor_divide"): + with self.assertRaises(RuntimeError), self.assertWarnsOnceRegex( + UserWarning, "floor_divide" + ): tmp //= b_t else: # Inplace modification is OK when both or neither tensor is # a float tensor with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): - self.assertEqual(a_t.clone().floor_divide_(b_t).item(), expected_itruncdiv) - self.assertEqual(scripted_floor_divide__tensor(a_t.clone(), b_t).item(), expected_itruncdiv) + self.assertEqual( + a_t.clone().floor_divide_(b_t).item(), expected_itruncdiv + ) + self.assertEqual( + scripted_floor_divide__tensor(a_t.clone(), b_t).item(), + expected_itruncdiv, + ) tmp = a_t.clone() with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): tmp //= b_t self.assertEqual(tmp.item(), expected_itruncdiv) with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): - self.assertEqual(scripted_floor_divide__scalar(a_t), math.trunc(a / 5)) + self.assertEqual( + scripted_floor_divide__scalar(a_t), math.trunc(a / 5) + ) # Tests binary op equivalence with Python builtin ops # Also tests that reverse operations are equivalent to forward ops # NOTE: division ops are tested separately above def test_binary_ops_with_scalars(self, device): - for python_op, torch_op in ((operator.add, torch.add), - (operator.sub, torch.sub), - (operator.mul, torch.mul), - (operator.truediv, torch.div)): + for python_op, torch_op in ( + (operator.add, torch.add), + (operator.sub, torch.sub), + (operator.mul, torch.mul), + (operator.truediv, torch.div), + ): for a, b in product(range(-10, 10), range(-10, 10)): - for op in (lambda x: x * .5, lambda x: math.floor(x)): + for op in (lambda x: x * 0.5, lambda x: math.floor(x)): a = op(a) b = op(b) @@ -1869,28 +2157,56 @@ def test_binary_ops_with_scalars(self, device): for args in product(vals, vals): first, second = args - first_scalar = first if not isinstance(first, torch.Tensor) else first.item() - second_scalar = second if not isinstance(second, torch.Tensor) else second.item() + first_scalar = ( + first + if not isinstance(first, torch.Tensor) + else first.item() + ) + second_scalar = ( + second + if not isinstance(second, torch.Tensor) + else second.item() + ) expected = python_op(first_scalar, second_scalar) self.assertEqual(expected, python_op(first, second)) self.assertEqual(expected, torch_op(first, second)) - @dtypes(*product(get_all_dtypes(include_complex=False), get_all_dtypes(include_complex=False))) + @dtypes( + *product( + all_types_and(torch.half, torch.bfloat16, torch.bool), + all_types_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_maximum_minimum_type_promotion(self, device, dtypes): a = torch.tensor((0, 1), device=device, dtype=dtypes[0]) b = torch.tensor((1, 0), device=device, dtype=dtypes[1]) - for op in (torch.maximum, torch.max, torch.fmax, torch.minimum, torch.min, torch.fmin): + for op in ( + torch.maximum, + torch.max, + torch.fmax, + torch.minimum, + torch.min, + torch.fmin, + ): result = op(a, b) self.assertEqual(result.dtype, torch.result_type(a, b)) - @dtypes(*(get_all_int_dtypes() + [torch.bool])) + @dtypes(*integral_types_and(torch.bool)) def test_maximum_minimum_int_and_bool(self, device, dtype): - ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), - (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) + ops = ( + (torch.maximum, torch.max, np.maximum), + (torch.minimum, torch.min, 
np.minimum), + (torch.fmax, None, np.fmax), + (torch.fmin, None, np.fmin), + ) rng = np.random.default_rng() - a_np = np.array(rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype]) - b_np = np.array(rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype]) + a_np = np.array( + rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype] + ) + b_np = np.array( + rng.integers(-100, 100, size=10), dtype=torch_to_numpy_dtype_dict[dtype] + ) for torch_op, alias, numpy_op in ops: a_tensor = torch.from_numpy(a_np).to(device=device, dtype=dtype) @@ -1910,10 +2226,14 @@ def test_maximum_minimum_int_and_bool(self, device, dtype): self.assertEqual(out, numpy_result) @precisionOverride({torch.bfloat16: 1e-2}) - @dtypes(*(get_all_fp_dtypes())) + @dtypes(*(floating_types_and(torch.half, torch.bfloat16))) def test_maximum_minimum_float(self, device, dtype): - ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), - (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) + ops = ( + (torch.maximum, torch.max, np.maximum), + (torch.minimum, torch.min, np.minimum), + (torch.fmax, None, np.fmax), + (torch.fmin, None, np.fmin), + ) if dtype == torch.bfloat16: a_np = np.random.randn(10).astype(np.float64) @@ -1938,14 +2258,36 @@ def test_maximum_minimum_float(self, device, dtype): self.assertEqual(tensor_result, numpy_result, exact_dtype=False) self.assertEqual(out, numpy_result, exact_dtype=False) - @dtypes(*(get_all_fp_dtypes())) + @dtypes(*(floating_types_and(torch.half, torch.bfloat16))) def test_maximum_minimum_float_nan_and_inf(self, device, dtype): # np.maximum and np.minimum functions compare input arrays element-wisely. # if one of the elements being compared is a NaN, then that element is returned. 
- ops = ((torch.maximum, torch.max, np.maximum), (torch.minimum, torch.min, np.minimum), - (torch.fmax, None, np.fmax), (torch.fmin, None, np.fmin)) - a_vals = (float('inf'), -float('inf'), float('nan'), float('inf'), float('nan'), float('nan'), 1, float('nan')) - b_vals = (-float('inf'), float('inf'), float('inf'), float('nan'), float('nan'), 0, float('nan'), -5) + ops = ( + (torch.maximum, torch.max, np.maximum), + (torch.minimum, torch.min, np.minimum), + (torch.fmax, None, np.fmax), + (torch.fmin, None, np.fmin), + ) + a_vals = ( + float("inf"), + -float("inf"), + float("nan"), + float("inf"), + float("nan"), + float("nan"), + 1, + float("nan"), + ) + b_vals = ( + -float("inf"), + float("inf"), + float("inf"), + float("nan"), + float("nan"), + 0, + float("nan"), + -5, + ) if dtype == torch.bfloat16: a_np = np.array(a_vals, dtype=np.float64) b_np = np.array(b_vals, dtype=np.float64) @@ -1974,16 +2316,32 @@ def test_maximum_minimum_float_nan_and_inf(self, device, dtype): self.assertEqual(tensor_result, numpy_result) self.assertEqual(out, numpy_result) - @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) + @dtypes( + *product( + complex_types(), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_maximum_minimum_complex(self, device, dtypes): - for torch_op in (torch.maximum, torch.minimum, torch.max, torch.min, torch.fmax, torch.fmin): - with self.assertRaisesRegex(RuntimeError, '.+not implemented for.+'): - torch_op(torch.ones(1, device=device, dtype=dtypes[0]), - torch.ones(1, device=device, dtype=dtypes[1])) - - with self.assertRaisesRegex(RuntimeError, '.+not implemented for.+'): - torch_op(torch.ones(1, device=device, dtype=dtypes[1]), - torch.ones(1, device=device, dtype=dtypes[0])) + for torch_op in ( + torch.maximum, + torch.minimum, + torch.max, + torch.min, + torch.fmax, + torch.fmin, + ): + with self.assertRaisesRegex(RuntimeError, ".+not implemented for.+"): + torch_op( + torch.ones(1, device=device, dtype=dtypes[0]), + torch.ones(1, device=device, dtype=dtypes[1]), + ) + + with self.assertRaisesRegex(RuntimeError, ".+not implemented for.+"): + torch_op( + torch.ones(1, device=device, dtype=dtypes[1]), + torch.ones(1, device=device, dtype=dtypes[0]), + ) @onlyCUDA def test_maximum_minimum_cross_device(self, device): @@ -1992,12 +2350,14 @@ def test_maximum_minimum_cross_device(self, device): ops = (torch.maximum, torch.minimum) for torch_op in ops: - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch_op(a, b) - with self.assertRaisesRegex(RuntimeError, - "Expected all tensors to be on the same device"): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch_op(b, a) # test cuda tensor and cpu scalar @@ -2016,7 +2376,12 @@ def test_maximum_minimum_cross_device(self, device): self.assertEqual(tensor_result_1, numpy_result_1) self.assertEqual(tensor_result_2, numpy_result_2) - @dtypes(*product(get_all_fp_dtypes(), get_all_fp_dtypes())) + @dtypes( + *product( + floating_types_and(torch.half, torch.bfloat16), + floating_types_and(torch.half, torch.bfloat16), + ) + ) def test_maximum_and_minimum_subgradient(self, device, dtypes): def run_test(f, a, b, expected_a_grad, expected_b_grad): a = torch.tensor(a, requires_grad=True, device=device, dtype=dtypes[0]) @@ -2026,8 +2391,47 @@ def run_test(f, a, b, expected_a_grad, expected_b_grad): 
self.assertEqual(a.grad, expected_a_grad) self.assertEqual(b.grad, expected_b_grad) - run_test(torch.maximum, [0., 1., 2.], [1., 1., 1.], [0., 0.5, 1.], [1., 0.5, 0.]) - run_test(torch.minimum, [0., 1., 2.], [1., 1., 1.], [1., 0.5, 0.], [0., 0.5, 1.]) + run_test( + torch.maximum, + [0.0, 1.0, 2.0], + [1.0, 1.0, 1.0], + [0.0, 0.5, 1.0], + [1.0, 0.5, 0.0], + ) + run_test( + torch.minimum, + [0.0, 1.0, 2.0], + [1.0, 1.0, 1.0], + [1.0, 0.5, 0.0], + [0.0, 0.5, 1.0], + ) + + def test_maximum_minimum_forward_ad_float32(self, device): + # TODO: This should really be covered by OpInfo but it isn't. The problem + # is that our gradient tests test using float64 but it should also test + # float32 + x = torch.randn(3, device=device, dtype=torch.float32) + y = torch.randn(3, device=device, dtype=torch.float32) + tx = torch.randn(3, device=device, dtype=torch.float32) + ty = torch.randn(3, device=device, dtype=torch.float32) + + with fwAD.dual_level(): + x_dual = fwAD.make_dual(x, tx) + y_dual = fwAD.make_dual(y, ty) + result = torch.maximum(x_dual, y_dual) + _, result_tangent = fwAD.unpack_dual(result) + + expected = torch.where(x > y, tx, ty) + self.assertEqual(result_tangent, expected) + + with fwAD.dual_level(): + x_dual = fwAD.make_dual(x, tx) + y_dual = fwAD.make_dual(y, ty) + result = torch.minimum(x_dual, y_dual) + _, result_tangent = fwAD.unpack_dual(result) + + expected = torch.where(x < y, tx, ty) + self.assertEqual(result_tangent, expected) # TODO: tests like this should be generic @dtypesIfCUDA(torch.half, torch.float, torch.double) @@ -2039,24 +2443,37 @@ def test_mul_intertype_scalar(self, device, dtype): self.assertEqual(x * y, 4.5) self.assertEqual(y * x, 4.5) - with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + with self.assertRaisesRegex( + RuntimeError, "can't be cast to the desired output type" + ): y *= x x *= y self.assertEqual(x, 4.5) @onlyCPU - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_sub(self, device, dtype): - m1 = torch.tensor([2.34, 4.44], dtype=dtype, device=device) - m2 = torch.tensor([1.23, 2.33], dtype=dtype, device=device) + if dtype in integral_types(): + # Before Python 3.10, floats were implicitly converted to ints, but with + # DeprecationWarning: an integer is required (got type float). + # Implicit conversion to integers using __int__ is deprecated, + # and may be removed in a future version of Python. + # Since Python 3.10, that attempt gives an error. + m1 = torch.tensor([2, 4], dtype=dtype, device=device) + m2 = torch.tensor([1, 2], dtype=dtype, device=device) + diff = torch.tensor([1, 2], dtype=dtype) + else: + m1 = torch.tensor([2.34, 4.44], dtype=dtype, device=device) + m2 = torch.tensor([1.23, 2.33], dtype=dtype, device=device) + diff = torch.tensor([1.11, 2.11], dtype=dtype) if dtype == torch.bool: self.assertRaises(RuntimeError, lambda: m1 - m2) - elif (dtype == torch.bfloat16 or dtype == torch.half): + elif dtype == torch.bfloat16 or dtype == torch.half: # bfloat16 has a lower precision so we have to have a separate check for it - self.assertEqual(m1 - m2, torch.tensor([1.11, 2.11], dtype=dtype), atol=0.01, rtol=0) + self.assertEqual(m1 - m2, diff, atol=0.01, rtol=0) else: - self.assertEqual(m1 - m2, torch.tensor([1.11, 2.11], dtype=dtype)) + self.assertEqual(m1 - m2, diff) # TODO: what is this test testing? 
@onlyCPU @@ -2088,27 +2505,43 @@ def test_min_max_binary_op_nan(self, device, dtype): b = torch.rand(1000, dtype=dtype, device=device) # 0:250: a -- nan, b -- not nan - a[:250] = float('nan') + a[:250] = float("nan") # 250:500: a -- not nan, b -- nan - b[250:500] = float('nan') + b[250:500] = float("nan") # 500:750: a and b both nan - a[500:750] = float('nan') - b[500:750] = float('nan') + a[500:750] = float("nan") + b[500:750] = float("nan") # 750:1000: neither nan ma = torch.max(a, b) mi = torch.min(a, b) for i in range(750): - self.assertTrue(torch.isnan(ma[i]), "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i])) - self.assertTrue(torch.isnan(mi[i]), "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i])) + self.assertTrue( + torch.isnan(ma[i]), + "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i]), + ) + self.assertTrue( + torch.isnan(mi[i]), + "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i]), + ) for i in range(750, 1000): - self.assertFalse(torch.isnan(ma[i]), "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i])) - self.assertFalse(torch.isnan(mi[i]), "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i])) + self.assertFalse( + torch.isnan(ma[i]), + "max(a, b): {}, a: {}, b: {}".format(ma[i], a[i], b[i]), + ) + self.assertFalse( + torch.isnan(mi[i]), + "min(a, b): {}, a: {}, b: {}".format(mi[i], a[i], b[i]), + ) - @dtypes(*product(get_all_dtypes(include_complex=False), - get_all_dtypes(include_complex=False))) + @dtypes( + *product( + all_types_and(torch.half, torch.bfloat16, torch.bool), + all_types_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_copysign(self, device, dtypes): def _test_copysign_numpy(a, b): torch_result = torch.copysign(a, b) @@ -2125,7 +2558,7 @@ def _test_copysign_numpy(a, b): expected = torch.from_numpy(np.copysign(np_a, np_b)) # To handle inconsistencies of type promotion between PyTorch and Numpy # Applied for both arguments having integral precision and bfloat16 - types = [torch.bool, torch.bfloat16] + get_all_int_dtypes() + types = integral_types_and(torch.bool, torch.bfloat16) if a.dtype in types or b.dtype in types: promoted_type = torch.promote_types(torch_result.dtype, expected.dtype) torch_result = torch_result.to(promoted_type) @@ -2140,8 +2573,10 @@ def _test_copysign_numpy(a, b): # Special case: NaN conversions between FP32 and FP16 is not bitwise # equivalent to pass this assertion. 
if a.dtype != torch.float16 and b.dtype != torch.float16: - self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result), - torch.copysign(torch.tensor(1.0), expected)) + self.assertEqual( + torch.copysign(torch.tensor(1.0), torch_result), + torch.copysign(torch.tensor(1.0), expected), + ) # Compare Result with NumPy # Type promotion @@ -2159,52 +2594,76 @@ def _test_copysign_numpy(a, b): _test_copysign_numpy(a, b) # 0.0/-0.0/inf/-inf/nan - cases = [0.0, -0.0, float('inf'), float('-inf'), float('nan')] + cases = [0.0, -0.0, float("inf"), float("-inf"), float("nan")] # torch.bfloat16 can not hold '-nan' # torch.half can not hold '-nan' on CUDA types = [torch.float32, torch.float64] - if device == 'cpu': + if device == "cpu": types.append(torch.float16) if dtypes[0] in types: b = make_tensor((10, 10), device=device, dtype=dtypes[1], low=-9, high=9) for case in cases: - _test_copysign_numpy(torch.tensor([case], device=device, dtype=dtypes[0]), b) + _test_copysign_numpy( + torch.tensor([case], device=device, dtype=dtypes[0]), b + ) - if dtypes[1] in get_all_fp_dtypes(): + if dtypes[1] in floating_types_and(torch.half, torch.bfloat16): a = make_tensor((10, 10), device=device, dtype=dtypes[0], low=-9, high=9) for case in cases: - _test_copysign_numpy(a, torch.tensor([case], device=device, dtype=dtypes[1])) - - @dtypes(*product(get_all_fp_dtypes(), - get_all_fp_dtypes())) + _test_copysign_numpy( + a, torch.tensor([case], device=device, dtype=dtypes[1]) + ) + + @dtypes( + *product( + floating_types_and(torch.half, torch.bfloat16), + floating_types_and(torch.half, torch.bfloat16), + ) + ) def test_copysign_subgradient(self, device, dtypes): # Input is 0.0 - x = torch.tensor([0.0, 0.0, 0.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [0.0, 0.0, 0.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [0.0, 0.0, 0.0]) self.assertEqual(y.grad.tolist(), [0.0] * 3) # Input is -0.0 - x = torch.tensor([-0.0, -0.0, -0.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [-0.0, -0.0, -0.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [0.0, 0.0, 0.0]) self.assertEqual(y.grad.tolist(), [0.0] * 3) # Other is 0.0 - x = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([0.0, 0.0, 0.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = torch.tensor( + [0.0, 0.0, 0.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [-1.0, 0.0, 1.0]) self.assertEqual(y.grad.tolist(), [0.0] * 3) # Other is -0.0 - x = torch.tensor([-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True) - y = torch.tensor([-0.0, -0.0, -0.0], dtype=dtypes[1], device=device, requires_grad=True) + x = torch.tensor( + [-1.0, 0.0, 1.0], dtype=dtypes[0], device=device, requires_grad=True + ) + y = 
torch.tensor( + [-0.0, -0.0, -0.0], dtype=dtypes[1], device=device, requires_grad=True + ) out = torch.copysign(x, y) out.sum().backward() self.assertEqual(x.grad.tolist(), [1.0, 0.0, -1.0]) @@ -2212,9 +2671,10 @@ def test_copysign_subgradient(self, device, dtypes): @dtypes(torch.bfloat16, torch.float) def test_div(self, device, dtype): - for op, method, inplace in ((torch.div, torch.Tensor.div, torch.Tensor.div_), - (torch.true_divide, torch.Tensor.true_divide, - torch.Tensor.true_divide_)): + for op, method, inplace in ( + (torch.div, torch.Tensor.div, torch.Tensor.div_), + (torch.true_divide, torch.Tensor.true_divide, torch.Tensor.true_divide_), + ): m1 = torch.randn(10, 10, dtype=torch.float, device=device).to(dtype=dtype) res1 = m1.clone() inplace(res1[:, 3], 2) @@ -2225,40 +2685,48 @@ def test_div(self, device, dtype): if dtype == torch.bfloat16: a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) - a2 = torch.tensor([2., 2.], dtype=dtype, device=device) - self.assertEqual(op(a1, a2), - torch.tensor([2.1, 3.1], dtype=dtype, device=device), - atol=0.01, rtol=0) + a2 = torch.tensor([2.0, 2.0], dtype=dtype, device=device) + self.assertEqual( + op(a1, a2), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, + rtol=0, + ) self.assertEqual(method(a1, a2), op(a1, a2)) @dtypes(torch.bfloat16, torch.float) def test_true_divide_out(self, device, dtype): a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) - a2 = torch.tensor([2., 2.], dtype=dtype, device=device) + a2 = torch.tensor([2.0, 2.0], dtype=dtype, device=device) res = torch.empty_like(a1) - self.assertEqual(torch.true_divide(a1, a2, out=res), - torch.tensor([2.1, 3.1], dtype=dtype, device=device), - atol=0.01, rtol=0) + self.assertEqual( + torch.true_divide(a1, a2, out=res), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, + rtol=0, + ) @onlyCUDA @dtypes(torch.half) def test_divmul_scalar(self, device, dtype): - x = torch.tensor(100., device=device, dtype=dtype) + x = torch.tensor(100.0, device=device, dtype=dtype) x_ref = x.float() scale = 1e5 res = x.div(scale) expected = x_ref.div(scale) - self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.) + self.assertEqual(res, expected.to(dtype), atol=0.0, rtol=0.0) x = torch.tensor(1e-5, device=device, dtype=dtype) x_ref = x.float() res = x.mul(scale) expected = x_ref.mul(scale) - self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.) + self.assertEqual(res, expected.to(dtype), atol=0.0, rtol=0.0) res = scale * x - self.assertEqual(res, expected.to(dtype), atol=0., rtol=0.) 
+ self.assertEqual(res, expected.to(dtype), atol=0.0, rtol=0.0) - @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) + @dtypesIfCUDA( + *set(get_all_math_dtypes("cuda")) - {torch.complex64, torch.complex128} + ) + @dtypes(*set(get_all_math_dtypes("cpu")) - {torch.complex64, torch.complex128}) def test_floor_divide_tensor(self, device, dtype): x = torch.randn(10, device=device).mul(30).to(dtype) y = torch.arange(1, 11, dtype=dtype, device=device) @@ -2270,14 +2738,18 @@ def test_floor_divide_tensor(self, device, dtype): self.assertEqual(z.dtype, x.dtype) self.assertEqual(z, z_alt) - @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) + @dtypesIfCUDA( + *set(get_all_math_dtypes("cuda")) - {torch.complex64, torch.complex128} + ) + @dtypes(*set(get_all_math_dtypes("cpu")) - {torch.complex64, torch.complex128}) def test_floor_divide_scalar(self, device, dtype): x = torch.randn(100, device=device).mul(10).to(dtype) with self.assertWarnsOnceRegex(UserWarning, "__floordiv__"): z = x // 3 - z_alt = torch.tensor([math.trunc(v.item() / 3.) for v in x], dtype=x.dtype, device=device) + z_alt = torch.tensor( + [math.trunc(v.item() / 3.0) for v in x], dtype=x.dtype, device=device + ) self.assertEqual(z.dtype, x.dtype) self.assertEqual(z, z_alt) @@ -2304,7 +2776,7 @@ def test_floor_divide_out(self, device, dtype): self.assertEqual(o, torch.floor_divide(x.float(), y.float())) @onlyCPU - @dtypes(*get_all_math_dtypes('cpu')) + @dtypes(*get_all_math_dtypes("cpu")) def test_rdiv(self, device, dtype): if dtype is torch.float16: return @@ -2316,7 +2788,7 @@ def test_rdiv(self, device, dtype): z = torch.tensor([30 / v.item() for v in x], device=device) self.assertEqual(y, z, exact_dtype=False) - @dtypes(*get_all_fp_dtypes(include_bfloat16=False)) + @dtypes(*floating_types_and(torch.half)) def test_fmod_remainder_by_zero_float(self, device, dtype): fn_list = (torch.fmod, torch.remainder) for fn in fn_list: @@ -2327,8 +2799,7 @@ def test_fmod_remainder_by_zero_float(self, device, dtype): self.assertTrue(torch.all(fn(x, zero).isnan())) @onlyNativeDeviceTypes # Check Issue https://github.com/pytorch/pytorch/issues/48130 - @skipCUDAIfRocm # Error happens on both ROCM and XLA - @dtypes(*get_all_int_dtypes()) + @dtypes(*integral_types()) def test_fmod_remainder_by_zero_integral(self, device, dtype): fn_list = (torch.fmod, torch.remainder) for fn in fn_list: @@ -2336,16 +2807,19 @@ def test_fmod_remainder_by_zero_integral(self, device, dtype): x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) zero = torch.zeros_like(x) # RuntimeError on CPU - if self.device_type == 'cpu': + if self.device_type == "cpu": with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): fn(x, zero) - # Different value for different dtype on CUDA: - # Due to it's an undefined behavior, CUDA returns a pattern of all 1s - # for integral dividend (other than int64) divided by zero. For int64, - # CUDA returns all 1s for negative dividend, half 1s for positive dividend. 
- # uint8: 0xff -> 255 - # int32: 0xffffffff -> -1 + elif torch.version.hip is not None: + # ROCm behavior: x % 0 is a no-op; x is returned + self.assertEqual(fn(x, zero), x) else: + # CUDA behavior: Different value for different dtype + # Due to it's an undefined behavior, CUDA returns a pattern of all 1s + # for integral dividend (other than int64) divided by zero. For int64, + # CUDA returns all 1s for negative dividend, half 1s for positive dividend. + # uint8: 0xff -> 255 + # int32: 0xffffffff -> -1 if dtype == torch.int64: self.assertEqual(fn(x, zero) == 4294967295, x >= 0) self.assertEqual(fn(x, zero) == -1, x < 0) @@ -2353,7 +2827,7 @@ def test_fmod_remainder_by_zero_integral(self, device, dtype): value = 255 if dtype == torch.uint8 else -1 self.assertTrue(torch.all(fn(x, zero) == value)) - @dtypes(*get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half)) def test_fmod_remainder(self, device, dtype): # Use numpy as reference def _helper(x, mod, fns_list): @@ -2377,9 +2851,12 @@ def _helper(x, mod, fns_list): inplace_fn(x, mod) self.assertEqual(x, exp, exact_dtype=False) except RuntimeError as e: - self.assertRegex(str(e), "result type (Half|Float|Double) " - "can't be cast to the desired output " - "type (Byte|Char|Short|Int|Long)") + self.assertRegex( + str(e), + "result type (Half|Float|Double) " + "can't be cast to the desired output " + "type (Byte|Char|Short|Int|Long)", + ) x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) # mod with same dtype as x @@ -2390,21 +2867,31 @@ def _helper(x, mod, fns_list): # Mods: Integer, Float, Tensor, Non-contiguous Tensor mods = [3, 2.3, mod, mod.t()] # mod with floating-point dtype - if dtype in get_all_int_dtypes(): - mod_float = make_tensor((10, 10), device=device, dtype=torch.float, low=-9, high=9) + if dtype in integral_types(): + mod_float = make_tensor( + (10, 10), device=device, dtype=torch.float, low=-9, high=9 + ) mod[mod == 0] = 1 mods.append(mod_float) for dividend, mod in product([x, x.t()], mods): - _helper(dividend, mod, - ((torch.fmod, torch.Tensor.fmod_, np.fmod), - (torch.remainder, torch.Tensor.remainder_, np.remainder),)) + _helper( + dividend, + mod, + ( + (torch.fmod, torch.Tensor.fmod_, np.fmod), + (torch.remainder, torch.Tensor.remainder_, np.remainder), + ), + ) # Tests for torch.remainder(scalar, tensor) for dividend, mod in product([5, 3.14], mods): if torch.is_tensor(mod): - _helper(dividend, mod, - ((torch.remainder, torch.Tensor.remainder_, np.remainder),)) + _helper( + dividend, + mod, + ((torch.remainder, torch.Tensor.remainder_, np.remainder),), + ) @dtypes(torch.float, torch.double) def test_remainder_fmod_large_dividend(self, device, dtype): @@ -2416,23 +2903,45 @@ def test_remainder_fmod_large_dividend(self, device, dtype): b = torch.tensor([bvalue], dtype=dtype, device=device) c = torch.remainder(a, b) d = torch.fmod(a, b) - self.assertTrue((b[0] > 0) == (c[0] > 0)) # remainder has same sign as divisor - self.assertTrue((a[0] > 0) == (d[0] > 0)) # fmod has same sign as dividend - self.assertTrue(abs(c[0]) < abs(b[0])) # remainder is within range of divisor - self.assertTrue(abs(d[0]) < abs(b[0])) # fmod is within range of divisor - if ((a[0] > 0) == (b[0] > 0)): - self.assertTrue(c[0] == d[0]) # remainder is same as fmod + self.assertTrue( + (b[0] > 0) == (c[0] > 0) + ) # remainder has same sign as divisor + self.assertTrue( + (a[0] > 0) == (d[0] > 0) + ) # fmod has same sign as dividend + self.assertTrue( + abs(c[0]) < 
abs(b[0]) + ) # remainder is within range of divisor + self.assertTrue( + abs(d[0]) < abs(b[0]) + ) # fmod is within range of divisor + if (a[0] > 0) == (b[0] > 0): + self.assertTrue(c[0] == d[0]) # remainder is same as fmod else: - self.assertTrue(abs(c[0] - d[0]) == abs(b[0])) # differ by one divisor + self.assertTrue( + abs(c[0] - d[0]) == abs(b[0]) + ) # differ by one divisor @dtypesIfCPU(torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) def test_hypot(self, device, dtype): inputs = [ - (torch.randn(10, device=device).to(dtype), torch.randn(10, device=device).to(dtype)), - (torch.randn((3, 3, 3), device=device).to(dtype), torch.randn((3, 3, 3), device=device).to(dtype)), - (torch.randn((10, 1), device=device).to(dtype), torch.randn((10, 1), device=device).to(dtype).transpose(0, 1)), - (torch.randint(100, (10, ), device=device, dtype=torch.long), torch.randn(10, device=device).to(dtype)) + ( + torch.randn(10, device=device).to(dtype), + torch.randn(10, device=device).to(dtype), + ), + ( + torch.randn((3, 3, 3), device=device).to(dtype), + torch.randn((3, 3, 3), device=device).to(dtype), + ), + ( + torch.randn((10, 1), device=device).to(dtype), + torch.randn((10, 1), device=device).to(dtype).transpose(0, 1), + ), + ( + torch.randint(100, (10,), device=device, dtype=torch.long), + torch.randn(10, device=device).to(dtype), + ), ] for input in inputs: actual = torch.hypot(input[0], input[1]) @@ -2511,8 +3020,8 @@ def test_nextafter(self, device, dtype): @onlyNativeDeviceTypes @dtypes(torch.bfloat16) def test_nextafter_bfloat16(self, device, dtype): - nan = float('nan') - inf = float('inf') + nan = float("nan") + inf = float("inf") cases = ( # (from, to, expected) (0, 1, 9.183549615799121e-41), @@ -2528,7 +3037,7 @@ def test_nextafter_bfloat16(self, device, dtype): (20, -3000, 19.875), (3000, -20, 2992.0), (-3000, 20, -2992.0), - (65536, 0, 65280.0) , + (65536, 0, 65280.0), (65536, inf, 66048.0), (-65536, 0, -65280.0), (-65536, -inf, -66048.0), @@ -2537,11 +3046,11 @@ def test_nextafter_bfloat16(self, device, dtype): (nan, nan, nan), (nan, inf, nan), (inf, nan, nan), - (inf, -inf, 3.3895313892515355e+38), - (-inf, inf, -3.3895313892515355e+38), - (inf, 0, 3.3895313892515355e+38), + (inf, -inf, 3.3895313892515355e38), + (-inf, inf, -3.3895313892515355e38), + (inf, 0, 3.3895313892515355e38), (0, inf, 9.183549615799121e-41), - (-inf, 0, -3.3895313892515355e+38), + (-inf, 0, -3.3895313892515355e38), (0, -inf, -9.183549615799121e-41), ) @@ -2574,10 +3083,17 @@ def reference_implementation(res2): sm1 = m1[:, 4] sm2 = m2[:, 4] # view as sm1.size() - sm2.set_(sm2.storage(), sm2.storage_offset(), sm1.size(), (sm2.stride()[0] * 10, sm2.stride()[0])) + sm2.set_( + sm2.storage(), + sm2.storage_offset(), + sm1.size(), + (sm2.stride()[0] * 10, sm2.stride()[0]), + ) res1 = torchfn(sm1, sm2) # reference_implementation assumes 1-d sm2 - sm2.set_(sm2.storage(), sm2.storage_offset(), m2[:, 4].size(), m2[:, 4].stride()) + sm2.set_( + sm2.storage(), sm2.storage_offset(), m2[:, 4].size(), m2[:, 4].stride() + ) res2 = reference_implementation(res1.clone()) self.assertEqual(res1, res2) @@ -2599,29 +3115,69 @@ def test_cmul(self, device, dtype): @onlyCPU @dtypes(torch.float) def test_cpow(self, device, dtype): - self._test_cop(torch.pow, lambda x, y: nan if x < 0 else math.pow(x, y), dtype, device) + self._test_cop( + torch.pow, lambda x, y: nan if x < 0 else math.pow(x, y), dtype, device + ) @onlyCPU @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) def 
test_floor_divide_zero(self, device, dtype): a = torch.tensor([0, 1], dtype=dtype, device=device) b = torch.tensor([0, 1], dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, 'ZeroDivisionError'): + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): with self.assertWarnsOnceRegex(UserWarning, "floor_divide"): a // b @unittest.skipIf(TEST_WITH_ASAN, "Integer overflows are not allowed under ASAN") - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_muldiv_scalar(self, device, dtype): - x = make_tensor((10, 3), device, dtype, low=None, high=None) - s = make_tensor((1,), 'cpu', dtype, low=None, high=None).item() + x = make_tensor((10, 3), dtype=dtype, device=device, low=None, high=None) + s = make_tensor((1,), dtype=dtype, device="cpu", low=None, high=None).item() y = torch.full_like(x, s) self.assertEqual(x * s, x * y) self.assertEqual(s * x, y * x) self.assertEqual(x / s, x / y) self.assertEqual(s / x, y / x) - @dtypes(*tuple(itertools.combinations_with_replacement(get_all_dtypes(), 2))) + # TODO: update make_tensor to support extremal additions and remove this in favor of make_tensor + def _generate_input(self, shape, dtype, device, with_extremal): + if shape == (): + x = torch.tensor((), dtype=dtype, device=device) + else: + if dtype.is_floating_point or dtype.is_complex: + # work around torch.randn not being implemented for bfloat16 + if dtype == torch.bfloat16: + x = torch.randn(*shape, device=device) * random.randint(30, 100) + x = x.to(torch.bfloat16) + else: + x = torch.randn( + *shape, dtype=dtype, device=device + ) * random.randint(30, 100) + x[torch.randn(*shape) > 0.5] = 0 + if with_extremal and dtype.is_floating_point: + # Use extremal values + x[torch.randn(*shape) > 0.5] = float("nan") + x[torch.randn(*shape) > 0.5] = float("inf") + x[torch.randn(*shape) > 0.5] = float("-inf") + elif with_extremal and dtype.is_complex: + x[torch.randn(*shape) > 0.5] = complex("nan") + x[torch.randn(*shape) > 0.5] = complex("inf") + x[torch.randn(*shape) > 0.5] = complex("-inf") + elif dtype == torch.bool: + x = torch.zeros(shape, dtype=dtype, device=device) + x[torch.randn(*shape) > 0.5] = True + else: + x = torch.randint(15, 100, shape, dtype=dtype, device=device) + + return x + + @dtypes( + *tuple( + itertools.combinations_with_replacement( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), 2 + ) + ) + ) def test_comparison_ops_type_promotion_and_broadcasting(self, device, dtypes): # issue #42660 # testing all combinations of broadcasting and type promotion @@ -2630,37 +3186,45 @@ def compare_with_numpy_bin_op(torch_fn, np_fn, x, y, out=None): # working around the fact that numpy doesn't support bfloat16 # by letting numpy treat them as float32's x_np = x if x.dtype != torch.bfloat16 else x.to(torch.float32) - y_np = y.cpu().numpy() if y.dtype != torch.bfloat16 else y.to(torch.float32).cpu().numpy() - self.compare_with_numpy(lambda inp: torch_fn(inp, y, out=out) if out else torch_fn(inp, y), - lambda inp: np_fn(inp, y_np, out=out) if out else np_fn(inp, y_np), - x_np) + y_np = ( + y.cpu().numpy() + if y.dtype != torch.bfloat16 + else y.to(torch.float32).cpu().numpy() + ) + self.compare_with_numpy( + lambda inp: torch_fn(inp, y, out=out) if out else torch_fn(inp, y), + lambda inp: np_fn(inp, y_np, out=out) if out else np_fn(inp, y_np), + x_np, + ) - complex_op_denylist = [torch.lt, torch.le, torch.gt, torch.ge] # complex not supported - input_sizes = [ - (1,), - (10,), - (10, 1), 
- (1, 10), - (4, 10), - (64, 10), - (12, 3)] - op_pairs = [(torch.lt, np.less), - (torch.le, np.less_equal), - (torch.gt, np.greater), - (torch.ge, np.greater_equal), - (torch.eq, np.equal), - (torch.ne, np.not_equal), - (torch.logical_and, np.logical_and), - (torch.logical_or, np.logical_or), - (torch.logical_xor, np.logical_xor)] + complex_op_denylist = [ + torch.lt, + torch.le, + torch.gt, + torch.ge, + ] # complex not supported + input_sizes = [(1,), (10,), (10, 1), (1, 10), (4, 10), (64, 10), (12, 3)] + op_pairs = [ + (torch.lt, np.less), + (torch.le, np.less_equal), + (torch.gt, np.greater), + (torch.ge, np.greater_equal), + (torch.eq, np.equal), + (torch.ne, np.not_equal), + (torch.logical_and, np.logical_and), + (torch.logical_or, np.logical_or), + (torch.logical_xor, np.logical_xor), + ] for size1 in input_sizes: size2 = (2,) + size1 # perform broadcasting for with_extremal in [False, True]: - a = _generate_input(size1, dtypes[0], device, with_extremal) - b = _generate_input(size2, dtypes[1], device, with_extremal) + a = self._generate_input(size1, dtypes[0], device, with_extremal) + b = self._generate_input(size2, dtypes[1], device, with_extremal) for torch_op, numpy_op in op_pairs: - if (dtypes[0].is_complex or dtypes[1].is_complex) and torch_op in complex_op_denylist: + if ( + dtypes[0].is_complex or dtypes[1].is_complex + ) and torch_op in complex_op_denylist: continue # functional version of op compare_with_numpy_bin_op(torch_op, numpy_op, a, b) @@ -2669,7 +3233,9 @@ def compare_with_numpy_bin_op(torch_fn, np_fn, x, y, out=None): self.assertEqual(torch_op(a, b).dtype, torch.bool) # out version of op - out = torch.zeros(1, dtype=torch.complex128) # all casts to complex128 are safe + out = torch.zeros( + 1, dtype=torch.complex128 + ) # all casts to complex128 are safe compare_with_numpy_bin_op(torch_op, numpy_op, a, b, out=out) @onlyNativeDeviceTypes @@ -2677,145 +3243,47 @@ def compare_with_numpy_bin_op(torch_fn, np_fn, x, y, out=None): def test_signed_shift(self, device, dtype): "Ensure that signed integer bit shifting works as expected." 
a = torch.tensor([-10, 10], device=device, dtype=dtype) # [11...1110110, 1010] - expected_l = torch.tensor([-40, 40], device=device, dtype=dtype) # [11...11011000, 101000] + expected_l = torch.tensor( + [-40, 40], device=device, dtype=dtype + ) # [11...11011000, 101000] self.assertEqual(a << 2, expected_l) self.compare_with_numpy(lambda x: x << 2, lambda x: np.left_shift(x, 2), a) - expected_r = torch.tensor([-5, 5], device=device, dtype=dtype) # [1111...111011, 101] + expected_r = torch.tensor( + [-5, 5], device=device, dtype=dtype + ) # [1111...111011, 101] self.assertEqual(a >> 1, expected_r) self.compare_with_numpy(lambda x: x >> 1, lambda x: np.right_shift(x, 1), a) - @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) - def test_bitwise_and(self, device, dtype): - a = torch.tensor([1, -2, 3], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3], dtype=dtype, device=device) - - a_np = a.cpu().numpy() - b_np = b.cpu().numpy() - - # Tensor x Tensor - self.assertEqual(torch.bitwise_and(a, b), torch.tensor(np.bitwise_and(a_np, b_np), device=device)) - # Tensor x int scaler - self.assertEqual(torch.bitwise_and(a, 2), torch.tensor(np.bitwise_and(a_np, 2), device=device)) - - self.assertEqual(torch.tensor([False, True, False], device=device), - torch.bitwise_and(torch.tensor([True, True, False], device=device), - torch.tensor([False, True, False], device=device))) - - # type promotion - c = torch.zeros(2) >= 1 - self.assertEqual(torch.bitwise_and(c, c.byte()), torch.bitwise_and(c.byte(), c)) - - def test_bitwise_or(self, device): - for dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): - a = torch.tensor([1, -2, 3], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3], dtype=dtype, device=device) - expected_res = torch.tensor([3, -1, 3], dtype=dtype, device=device) - b_scalar = 2 - expected_res_scalar = torch.tensor([3, -2, 3], dtype=dtype, device=device) - - # standard version - self.assertEqual(torch.bitwise_or(a, b), expected_res) - self.assertEqual(torch.bitwise_or(a, b_scalar), expected_res_scalar) - - # out - c = torch.empty(0, dtype=dtype, device=device) - torch.bitwise_or(a, b, out=c) - self.assertEqual(c, expected_res) - torch.bitwise_or(a, b_scalar, out=c) - self.assertEqual(c, expected_res_scalar) - - # in-place - a1 = a.clone() - a1.bitwise_or_(b) - self.assertEqual(a1, expected_res) - a.bitwise_or_(b_scalar) - self.assertEqual(a, expected_res_scalar) - - self.assertEqual(torch.tensor([True, True, False], device=device), - torch.bitwise_or(torch.tensor([True, True, False], device=device), - torch.tensor([False, True, False], device=device))) - - def test_bitwise_xor(self, device): - for dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): - a = torch.tensor([1, -2, 3], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3], dtype=dtype, device=device) - expected_res = torch.tensor([3, -1, 0], dtype=dtype, device=device) - b_scalar = 2 - expected_res_scalar = torch.tensor([3, -4, 1], dtype=dtype, device=device) - - # standard version - self.assertEqual(torch.bitwise_xor(a, b), expected_res) - self.assertEqual(torch.bitwise_xor(a, b_scalar), expected_res_scalar) - - # out - c = torch.empty(0, dtype=dtype, device=device) - torch.bitwise_xor(a, b, out=c) - self.assertEqual(c, expected_res) - torch.bitwise_xor(a, b_scalar, out=c) - self.assertEqual(c, expected_res_scalar) - - # in-place - a1 = a.clone() - a1.bitwise_xor_(b) - self.assertEqual(a1, expected_res) - a.bitwise_xor_(b_scalar) - self.assertEqual(a, 
expected_res_scalar) - - self.assertEqual(torch.tensor([True, False, False], device=device), - torch.bitwise_xor(torch.tensor([True, True, False], device=device), - torch.tensor([False, True, False], device=device))) - - @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) - def test_bitwise_shift(self, device, dtype): - ops = [ - (torch.bitwise_left_shift, np.left_shift), - (operator.lshift, operator.lshift), - (torch.bitwise_right_shift, np.right_shift), - (operator.rshift, operator.rshift), - ] - for torch_op, numpy_op in ops: - a = torch.tensor([19, -20, -21, 22], dtype=dtype, device=device) - b = torch.tensor([2, 1, 3, 1], dtype=dtype, device=device) - a_np = a.cpu().numpy() - b_np = b.cpu().numpy() - - # Tensor x Tensor - self.assertEqual(torch_op(a, b), torch.tensor(numpy_op(a_np, b_np), device=device)) - # Tensor x int scalar - self.assertEqual(torch_op(a, 2), torch.tensor(numpy_op(a_np, 2), device=device)) - - def test_bitwise_shift_float(self, device): - ops = [ - (torch.bitwise_left_shift, lambda x, y: x * 2. ** y), - (operator.lshift, lambda x, y: x * 2. ** y), - (torch.bitwise_right_shift, lambda x, y: x / 2. ** y), - (operator.rshift, lambda x, y: x / 2. ** y), - ] - for torch_op, expected_op in ops: - # int tensor x float - a = torch.tensor([19, -20, -21, 22], dtype=torch.int64, device=device) - self.assertEqual(torch_op(a, 1.8), torch.floor(expected_op(a, 1)).to(a.dtype)) - # float tensor x int scalar - a = torch.tensor([19.1, -20.2, -21.3, 22.4], dtype=torch.float32, device=device) - self.assertEqual(torch_op(a, 2), expected_op(a, 2)) - # float tensor x float scalar - a = torch.tensor([19.1, -20.2, -21.3, 22.4], dtype=torch.float32, device=device) - self.assertEqual(torch_op(a, 2.2), expected_op(a, 2.2)) - @onlyNativeDeviceTypes - @dtypes(*list(product(get_all_dtypes(include_complex=False), - get_all_dtypes(include_complex=False)))) + @dtypes( + *list( + product( + all_types_and(torch.half, torch.bfloat16, torch.bool), + all_types_and(torch.half, torch.bfloat16, torch.bool), + ) + ) + ) def test_heaviside(self, device, dtypes): input_dtype = dtypes[0] values_dtype = dtypes[1] rng = np.random.default_rng() - input = np.array(rng.integers(-10, 10, size=10), - dtype=torch_to_numpy_dtype_dict[input_dtype if (input_dtype != torch.bfloat16) else torch.float64]) + input = np.array( + rng.integers(-10, 10, size=10), + dtype=torch_to_numpy_dtype_dict[ + input_dtype if (input_dtype != torch.bfloat16) else torch.float64 + ], + ) input[0] = input[3] = input[7] = 0 - values = np.array(rng.integers(-10, 10, size=10), - dtype=torch_to_numpy_dtype_dict[values_dtype if (values_dtype != torch.bfloat16) else torch.float64]) - np_result = torch.from_numpy(np.heaviside(input, values)).to(device=device, dtype=input_dtype) + values = np.array( + rng.integers(-10, 10, size=10), + dtype=torch_to_numpy_dtype_dict[ + values_dtype if (values_dtype != torch.bfloat16) else torch.float64 + ], + ) + np_result = torch.from_numpy(np.heaviside(input, values)).to( + device=device, dtype=input_dtype + ) input = torch.from_numpy(input).to(device=device, dtype=input_dtype) values = torch.from_numpy(values).to(device=device, dtype=values_dtype) @@ -2834,13 +3302,25 @@ def test_heaviside(self, device, dtypes): input.heaviside_(values) self.assertEqual(np_result, input) else: - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with 
different dtypes.", + ): torch.heaviside(input, values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with different dtypes.", + ): input.heaviside(values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with different dtypes.", + ): torch.heaviside(input, values, out=out) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for tensors with different dtypes.'): + with self.assertRaisesRegex( + RuntimeError, + "heaviside is not yet implemented for tensors with different dtypes.", + ): input.heaviside_(values) @onlyCUDA @@ -2857,14 +3337,17 @@ def test_heaviside_cross_device(self, device): x = torch.tensor([-9, 5, 0, 6, -2, 2]) y = torch.tensor(0, device=device) - with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch.heaviside(x, y) - with self.assertRaisesRegex(RuntimeError, 'Expected all tensors to be on the same device'): + with self.assertRaisesRegex( + RuntimeError, "Expected all tensors to be on the same device" + ): torch.heaviside(y, x) - @dtypes(*list(product(get_all_complex_dtypes(), - get_all_complex_dtypes()))) + @dtypes(*list(product(complex_types(), complex_types()))) def test_heaviside_complex(self, device, dtypes): input_dtype = dtypes[0] values_dtype = dtypes[1] @@ -2875,13 +3358,21 @@ def test_heaviside_complex(self, device, dtypes): out = torch.empty_like(input) real = input.real - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." + ): torch.heaviside(input, real) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." + ): real.heaviside(values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." + ): input.heaviside_(values) - with self.assertRaisesRegex(RuntimeError, 'heaviside is not yet implemented for complex tensors.'): + with self.assertRaisesRegex( + RuntimeError, "heaviside is not yet implemented for complex tensors." 
+ ): torch.heaviside(real, real, out=out) def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): @@ -2896,20 +3387,41 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(torch, op)(a, b, out=c) self.assertEqual(expected_res.bool(), c) - getattr(a, op + '_')(b) + getattr(a, op + "_")(b) self.assertEqual(expected_res, a) - @dtypes(*product(get_all_dtypes(), get_all_dtypes())) + @dtypes( + *product( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_logical_xor(self, device, dtypes): - self._test_logical(device, dtypes, 'logical_xor', [10, 0, 1, 0], [1, 0, 0, 10], [0, 0, 1, 1]) + self._test_logical( + device, dtypes, "logical_xor", [10, 0, 1, 0], [1, 0, 0, 10], [0, 0, 1, 1] + ) - @dtypes(*product(get_all_dtypes(), get_all_dtypes())) + @dtypes( + *product( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_logical_and(self, device, dtypes): - self._test_logical(device, dtypes, 'logical_and', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 0, 0]) + self._test_logical( + device, dtypes, "logical_and", [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 0, 0] + ) - @dtypes(*product(get_all_dtypes(), get_all_dtypes())) + @dtypes( + *product( + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + ) + ) def test_logical_or(self, device, dtypes): - self._test_logical(device, dtypes, 'logical_or', [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 1, 1]) + self._test_logical( + device, dtypes, "logical_or", [10, 0, 1, 0], [1, 0, 0, 10], [1, 0, 1, 1] + ) def test_remainder_overflow(self, device): # Check Integer Overflows @@ -2945,7 +3457,9 @@ def test_ldexp(self, device): self.assertEqual(np_outcome, mantissas) # test bounds - mantissas = torch.tensor([float('inf'), float('-inf'), float('inf'), float('nan')], device=device) + mantissas = torch.tensor( + [float("inf"), float("-inf"), float("inf"), float("nan")], device=device + ) exponents = torch.randint(0, 31, (4,), device=device, dtype=torch.int32) np_outcome = np.ldexp(mantissas.numpy(), exponents.numpy()) pt_outcome = torch.ldexp(mantissas, exponents) @@ -2954,12 +3468,17 @@ def test_ldexp(self, device): @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_lerp(self, device, dtype): start_end_weight_shapes = [(), (5,), (5, 5)] - for shapes in product(start_end_weight_shapes, start_end_weight_shapes, start_end_weight_shapes): + for shapes in product( + start_end_weight_shapes, start_end_weight_shapes, start_end_weight_shapes + ): start = torch.randn(shapes[0], device=device, dtype=dtype) end = torch.randn(shapes[1], device=device, dtype=dtype) # Tensor weights - weights = [torch.randn(shapes[2], device=device, dtype=dtype), random.random()] + weights = [ + torch.randn(shapes[2], device=device, dtype=dtype), + random.random(), + ] if dtype.is_complex: weights += [complex(0, 1), complex(0.4, 1.2)] @@ -2967,12 +3486,29 @@ def test_lerp(self, device, dtype): actual = torch.lerp(start, end, weight) actual_method = start.lerp(end, weight) self.assertEqual(actual, actual_method) - actual_out = torch.tensor(1., dtype=dtype, device=device) + actual_out = torch.tensor(1.0, dtype=dtype, device=device) torch.lerp(start, end, weight, out=actual_out) self.assertEqual(actual, actual_out) expected = start + weight * (end - start) self.assertEqual(expected, 
actual) + @onlyCUDA + @dtypes(torch.half, torch.bfloat16) + def test_lerp_lowp(self, device, dtype): + ref_dtype = torch.float + xvals = (0.0, -30000.0) + yvals = (0.1, -20000.0) + xs = [torch.full((4,), xval, device=device, dtype=dtype) for xval in xvals] + ys = [torch.full((4,), yval, device=device, dtype=dtype) for yval in yvals] + weights = [70000, torch.full((4,), 8, device=device, dtype=dtype)] + for x, y, w in zip(xs, ys, weights): + xref = x.float() + yref = y.float() + wref = w.float() if isinstance(w, torch.Tensor) else w + actual = torch.lerp(x, y, w) + expected = torch.lerp(xref, yref, wref).to(dtype) + self.assertEqual(actual, expected, atol=0.0, rtol=0.0) + def _test_logaddexp(self, device, dtype, base2): if base2: ref_func = np.logaddexp2 @@ -3003,8 +3539,16 @@ def _test_helper(a, b): _test_helper(a, b) _test_helper(a[:3], b[:3]) - a = torch.tensor([float('inf'), float('-inf'), float('inf'), float("nan")], dtype=dtype, device=device) - b = torch.tensor([float('inf'), float('-inf'), float('-inf'), float("nan")], dtype=dtype, device=device) + a = torch.tensor( + [float("inf"), float("-inf"), float("inf"), float("nan")], + dtype=dtype, + device=device, + ) + b = torch.tensor( + [float("inf"), float("-inf"), float("-inf"), float("nan")], + dtype=dtype, + device=device, + ) _test_helper(a, b) @dtypes(torch.float32, torch.float64, torch.bfloat16) @@ -3016,7 +3560,7 @@ def test_logaddexp2(self, device, dtype): self._test_logaddexp(device, dtype, base2=True) def test_add(self, device): - dtypes = [torch.float, torch.double] + get_all_complex_dtypes() + dtypes = floating_and_complex_types() for dtype in dtypes: # [res] torch.add([res,] tensor1, tensor2) m1 = torch.randn(100, 100, dtype=dtype, device=device) @@ -3082,9 +3626,15 @@ def test_add(self, device): self.assertEqual(torch.add(one, 1).dtype, torch.uint8) # bool - m1 = torch.tensor([True, False, False, True, False, False], dtype=torch.bool, device=device) - m2 = torch.tensor([True, True, False, False, False, True], dtype=torch.bool, device=device) - expected = torch.tensor([True, True, False, True, False, True], dtype=torch.bool, device=device) + m1 = torch.tensor( + [True, False, False, True, False, False], dtype=torch.bool, device=device + ) + m2 = torch.tensor( + [True, True, False, False, False, True], dtype=torch.bool, device=device + ) + expected = torch.tensor( + [True, True, False, True, False, True], dtype=torch.bool, device=device + ) self.assertEqual(m1 + m2, expected) # fused multiply add @@ -3094,56 +3644,70 @@ def test_add(self, device): self.assertEqual(res, expected) # bfloat16 - m1 = torch.tensor([1., 2.], dtype=torch.bfloat16) - m2 = torch.tensor([3., 4.], dtype=torch.bfloat16) - self.assertEqual(m1 + m2, torch.tensor([4., 6.], dtype=torch.bfloat16)) + m1 = torch.tensor([1.0, 2.0], dtype=torch.bfloat16) + m2 = torch.tensor([3.0, 4.0], dtype=torch.bfloat16) + self.assertEqual(m1 + m2, torch.tensor([4.0, 6.0], dtype=torch.bfloat16)) # different alpha types m1 = torch.tensor([2 + 3j, 4 + 5j], dtype=torch.complex64, device=device) m2 = torch.tensor([4 + 5j, 2 + 3j], dtype=torch.complex64, device=device) # add complex numbers with float alpha res = torch.add(m1, m2, alpha=0.1) - expected = torch.tensor([2.4000 + 3.5000j, 4.2000 + 5.3000j], dtype=torch.complex64, device=device) + expected = torch.tensor( + [2.4000 + 3.5000j, 4.2000 + 5.3000j], dtype=torch.complex64, device=device + ) self.assertEqual(res, expected) # add complex numbers with complex alpha res = torch.add(m1, m2, alpha=complex(0.1, 0.2)) - expected = 
torch.tensor([1.4000 + 4.3000j, 3.6000 + 5.7000j], dtype=torch.complex64, device=device) + expected = torch.tensor( + [1.4000 + 4.3000j, 3.6000 + 5.7000j], dtype=torch.complex64, device=device + ) self.assertEqual(res, expected) # add complex numbers with integer alpha res = torch.add(m1, m2, alpha=2) - expected = torch.tensor([10. + 13.j, 8. + 11.j], dtype=torch.complex64, device=device) + expected = torch.tensor( + [10.0 + 13.0j, 8.0 + 11.0j], dtype=torch.complex64, device=device + ) self.assertEqual(res, expected) # mismatched alpha m1 = torch.tensor([1], dtype=torch.int8, device=device) m2 = torch.tensor([2], dtype=torch.int8, device=device) - self.assertRaisesRegex(RuntimeError, - r"Boolean alpha only supported for Boolean results\.", - lambda: torch.add(m1, m2, alpha=True)) - self.assertRaisesRegex(RuntimeError, - r"For integral input tensors, argument alpha must not be a floating point number\.", - lambda: torch.add(m1, m2, alpha=1.0)) + self.assertRaisesRegex( + RuntimeError, + r"Boolean alpha only supported for Boolean results\.", + lambda: torch.add(m1, m2, alpha=True), + ) + self.assertRaisesRegex( + RuntimeError, + r"For integral input tensors, argument alpha must not be a floating point number\.", + lambda: torch.add(m1, m2, alpha=1.0), + ) # mismatched alpha, float / double tensor and complex alpha msg = r"For non-complex input tensors, argument alpha must not be a complex number\." - m1 = torch.tensor([3., 4.], device=device) - m2 = torch.tensor([4., 3.], device=device) - self.assertRaisesRegex(RuntimeError, msg, - lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + m1 = torch.tensor([3.0, 4.0], device=device) + m2 = torch.tensor([4.0, 3.0], device=device) + self.assertRaisesRegex( + RuntimeError, msg, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2)) + ) - m1 = torch.tensor([3., 4.], dtype=torch.double, device=device) - m2 = torch.tensor([4., 3.], dtype=torch.double, device=device) - self.assertRaisesRegex(RuntimeError, msg, - lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + m1 = torch.tensor([3.0, 4.0], dtype=torch.double, device=device) + m2 = torch.tensor([4.0, 3.0], dtype=torch.double, device=device) + self.assertRaisesRegex( + RuntimeError, msg, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2)) + ) # complex m1 = torch.tensor((4.0000 + 4.0000j), dtype=torch.complex64) - m2 = torch.tensor(4., dtype=torch.float64) - self.assertRaisesRegex(RuntimeError, r"result type ComplexFloat can't be cast to the desired output type Double", - lambda: torch.add(m1, m1, out=m2)) - + m2 = torch.tensor(4.0, dtype=torch.float64) + self.assertRaisesRegex( + RuntimeError, + r"result type ComplexFloat can't be cast to the desired output type Double", + lambda: torch.add(m1, m1, out=m2), + ) @onlyCUDA def test_addsub_half_tensor(self, device): @@ -3158,30 +3722,44 @@ def test_addsub_half_tensor(self, device): self.assertTrue(not (actual.isnan() or actual.isinf())) def test_sub_typing(self, device): - m1 = torch.tensor([True, False, False, True, False, False], dtype=torch.bool, device=device) - m2 = torch.tensor([True, True, False, False, False, True], dtype=torch.bool, device=device) - self.assertRaisesRegex(RuntimeError, - r"Subtraction, the `\-` operator, with two bool tensors is not supported. " - r"Use the `\^` or `logical_xor\(\)` operator instead.", - lambda: m1 - m2) - self.assertRaisesRegex(RuntimeError, - r"Subtraction, the `\-` operator, with a bool tensor is not supported. 
" - r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", - lambda: 1 - m1) - self.assertRaisesRegex(RuntimeError, - r"Subtraction, the `\-` operator, with a bool tensor is not supported. " - r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", - lambda: m2 - 1) + m1 = torch.tensor( + [True, False, False, True, False, False], dtype=torch.bool, device=device + ) + m2 = torch.tensor( + [True, True, False, False, False, True], dtype=torch.bool, device=device + ) + self.assertRaisesRegex( + RuntimeError, + r"Subtraction, the `\-` operator, with two bool tensors is not supported. " + r"Use the `\^` or `logical_xor\(\)` operator instead.", + lambda: m1 - m2, + ) + self.assertRaisesRegex( + RuntimeError, + r"Subtraction, the `\-` operator, with a bool tensor is not supported. " + r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", + lambda: 1 - m1, + ) + self.assertRaisesRegex( + RuntimeError, + r"Subtraction, the `\-` operator, with a bool tensor is not supported. " + r"If you are trying to invert a mask, use the `\~` or `logical_not\(\)` operator instead.", + lambda: m2 - 1, + ) # mismatched alpha m1 = torch.tensor([1], dtype=torch.int8, device=device) m2 = torch.tensor([2], dtype=torch.int8, device=device) - self.assertRaisesRegex(RuntimeError, - r"Boolean alpha only supported for Boolean results\.", - lambda: torch.sub(m1, m2, alpha=True)) - self.assertRaisesRegex(RuntimeError, - r"For integral input tensors, argument alpha must not be a floating point number\.", - lambda: torch.sub(m1, m2, alpha=1.0)) + self.assertRaisesRegex( + RuntimeError, + r"Boolean alpha only supported for Boolean results\.", + lambda: torch.sub(m1, m2, alpha=True), + ) + self.assertRaisesRegex( + RuntimeError, + r"For integral input tensors, argument alpha must not be a floating point number\.", + lambda: torch.sub(m1, m2, alpha=1.0), + ) def test_mul(self, device): m1 = torch.randn(10, 10, device=device) @@ -3194,31 +3772,61 @@ def test_mul(self, device): a1 = torch.tensor([True, False, False, True], dtype=torch.bool, device=device) a2 = torch.tensor([True, False, True, False], dtype=torch.bool, device=device) - self.assertEqual(a1 * a2, torch.tensor([True, False, False, False], dtype=torch.bool, device=device)) + self.assertEqual( + a1 * a2, + torch.tensor([True, False, False, False], dtype=torch.bool, device=device), + ) - if device == 'cpu': + if device == "cpu": a1 = torch.tensor([0.1, 0.1], dtype=torch.bfloat16, device=device) a2 = torch.tensor([1.1, 0.1], dtype=torch.bfloat16, device=device) - self.assertEqual(a1 * a2, torch.tensor([0.11, 0.01], dtype=torch.bfloat16, device=device), atol=0.01, rtol=0) + self.assertEqual( + a1 * a2, + torch.tensor([0.11, 0.01], dtype=torch.bfloat16, device=device), + atol=0.01, + rtol=0, + ) self.assertEqual(a1.mul(a2), a1 * a2) def test_bool_tensor_comparison_ops(self, device): - a = torch.tensor([True, False, True, False, True, False], dtype=torch.bool, device=device) - b = torch.tensor([True, False, True, True, True, True], dtype=torch.bool, device=device) - self.assertEqual(a == b, torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a != b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device)) - self.assertEqual(a < b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device)) - self.assertEqual(a > b, torch.tensor([0, 0, 0, 0, 0, 0], dtype=torch.bool, device=device)) - self.assertEqual(a >= b, 
torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a <= b, torch.tensor([1, 1, 1, 1, 1, 1], dtype=torch.bool, device=device)) - self.assertEqual(a > False, torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a == torch.tensor(True, dtype=torch.bool, device=device), - torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device)) - self.assertEqual(a == torch.tensor(0, dtype=torch.bool, device=device), - torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool, device=device)) + a = torch.tensor( + [True, False, True, False, True, False], dtype=torch.bool, device=device + ) + b = torch.tensor( + [True, False, True, True, True, True], dtype=torch.bool, device=device + ) + self.assertEqual( + a == b, torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a != b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device) + ) + self.assertEqual( + a < b, torch.tensor([0, 0, 0, 1, 0, 1], dtype=torch.bool, device=device) + ) + self.assertEqual( + a > b, torch.tensor([0, 0, 0, 0, 0, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a >= b, torch.tensor([1, 1, 1, 0, 1, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a <= b, torch.tensor([1, 1, 1, 1, 1, 1], dtype=torch.bool, device=device) + ) + self.assertEqual( + a > False, torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device) + ) + self.assertEqual( + a == torch.tensor(True, dtype=torch.bool, device=device), + torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.bool, device=device), + ) + self.assertEqual( + a == torch.tensor(0, dtype=torch.bool, device=device), + torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool, device=device), + ) self.assertFalse(a.equal(b)) - @dtypes(*get_all_dtypes(include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16, torch.bool)) def test_logical(self, device, dtype): if dtype != torch.bool: x = torch.tensor([1, 2, 3, 4], device=device, dtype=dtype) @@ -3252,10 +3860,20 @@ def _test_atan2_with_size(size, device): actual = a.atan2(b) x = a.view(-1) y = b.view(-1) - expected = torch.tensor([math.atan2(x[i].item(), y[i].item()) for i in range(x.numel())], - device=device, dtype=torch.double) + expected = torch.tensor( + [math.atan2(x[i].item(), y[i].item()) for i in range(x.numel())], + device=device, + dtype=torch.double, + ) self.assertEqual(expected, actual.view(-1), rtol=0, atol=0.02) + # bfloat16 + a_bf16 = a.bfloat16() + b_bf16 = b.bfloat16() + actual_bf16 = a_bf16.atan2(b_bf16) + self.assertEqual(actual_bf16, actual.bfloat16()) + self.assertEqual(expected, actual_bf16.view(-1), exact_dtype=False, rtol=0, atol=0.02) + _test_atan2_with_size((2, 2), device) _test_atan2_with_size((3, 3), device) _test_atan2_with_size((5, 5), device) @@ -3274,10 +3892,10 @@ def _test_atan2(x, y, expected, device, dtype): _test_atan2(0, -1, math.pi / -2, device, dtype) _test_atan2(-1, 0, math.pi, device, dtype) _test_atan2(1, 0, 0, device, dtype) - _test_atan2(-1, -1, math.pi * -3 / 4 , device, dtype) - _test_atan2(1, 1, math.pi / 4 , device, dtype) - _test_atan2(1, -1, math.pi / -4 , device, dtype) - _test_atan2(-1, 1, math.pi * 3 / 4 , device, dtype) + _test_atan2(-1, -1, math.pi * -3 / 4, device, dtype) + _test_atan2(1, 1, math.pi / 4, device, dtype) + _test_atan2(1, -1, math.pi / -4, device, dtype) + _test_atan2(-1, 1, math.pi * 3 / 4, device, dtype) def test_trapezoid(self, device): def test_dx(sizes, dim, dx, device): @@ -3300,7 +3918,9 @@ def test_x(sizes, dim, x, 
device): test_dx((0, 2), 0, 1.0, device) test_dx((0, 2), 1, 1.0, device) test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) - test_x((10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device) + test_x( + (10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device + ) test_x((1, 10), 0, [1.0], device) test_x((0, 2), 0, [], device) test_x((0, 2), 1, [1.0, 2.0], device) @@ -3309,14 +3929,12 @@ def test_x(sizes, dim, x, device): test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device) test_x((2, 2, 4), -1, [[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]], device) - with self.assertRaisesRegex( - IndexError, - 'Dimension out of range'): + with self.assertRaisesRegex(IndexError, "Dimension out of range"): test_x((2, 3), 2, [], device) test_dx((2, 3), 2, 1.0, device) with self.assertRaisesRegex( - RuntimeError, - 'There must be one `x` value for each sample point'): + RuntimeError, "There must be one `x` value for each sample point" + ): test_x((2, 3), 1, [1.0, 2.0], device) test_x((2, 3), 1, [1.0, 2.0, 3.0, 4.0], device) @@ -3325,7 +3943,7 @@ def test_cumulative_trapezoid(self, device): import scipy.integrate - if hasattr(scipy.integrate, 'cumulative_trapezoid'): + if hasattr(scipy.integrate, "cumulative_trapezoid"): scipy_cumulative_trapezoid = scipy.integrate.cumulative_trapezoid else: # Older version of SciPy uses a different name scipy_cumulative_trapezoid = scipy.integrate.cumtrapz @@ -3340,14 +3958,20 @@ def test_dx(sizes, dim, dx, device): def test_x(sizes, dim, x, device): t = torch.randn(sizes, device=device) - actual = torch.cumulative_trapezoid(t, x=torch.tensor(x, device=device), dim=dim) + actual = torch.cumulative_trapezoid( + t, x=torch.tensor(x, device=device), dim=dim + ) expected = scipy_cumulative_trapezoid(t.cpu().numpy(), x=x, axis=dim) self.assertEqual(expected.shape, actual.shape) - self.assertEqual(expected, actual.cpu(), exact_dtype=False, atol=1e-4, rtol=1e-4) + self.assertEqual( + expected, actual.cpu(), exact_dtype=False, atol=1e-4, rtol=1e-4 + ) def test_empty_x(sizes, dim, x, device): t = torch.randn(sizes, device=device) - actual = torch.cumulative_trapezoid(t, x=torch.tensor(x, device=device), dim=dim) + actual = torch.cumulative_trapezoid( + t, x=torch.tensor(x, device=device), dim=dim + ) self.assertEqual(torch.empty(actual.shape), actual) test_dx((2,), -1, 1, device) @@ -3364,7 +3988,9 @@ def test_empty_x(sizes, dim, x, device): test_x((2,), -1, [100, 50], device) test_x((4, 2), 0, [2, 3, 4, 5], device) test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) - test_x((10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device) + test_x( + (10, 2), 0, [2.0, 3.0, 4.0, 7.0, 11.0, 14.0, 22.0, 26.0, 26.1, 30.3], device + ) test_x((1, 10), 0, [1.0], device) test_x((0, 2), 1, [1, 2], device) test_x((2, 3, 4), -1, [1.0, 2.0, 3.0, 4.0], device) @@ -3372,40 +3998,51 @@ def test_empty_x(sizes, dim, x, device): test_x((2, 3, 4), 1, [1.0, 2.0, 3.0], device) test_x((2, 3, 4), 2, [1.0, 2.0, 3.0, 4.0], device) - test_empty_x((0, 2), 0, [], device) # SciPy failing when x == [], but our version returns empty + test_empty_x( + (0, 2), 0, [], device + ) # SciPy failing when x == [], but our version returns empty - with self.assertRaisesRegex( - IndexError, - 'Dimension out of range'): + with self.assertRaisesRegex(IndexError, "Dimension out of range"): test_x((2, 3), 2, [], device) test_dx((2, 3), 2, 1.0, device) with self.assertRaisesRegex( - RuntimeError, - 'There must be one `x` value for each sample 
point'): + RuntimeError, "There must be one `x` value for each sample point" + ): test_x((2, 3), 1, [1.0, 2.0], device) test_x((0, 2), 0, [1.0, 2.0], device) test_x((2, 3), 1, [1.0, 2.0, 3.0, 4.0], device) with self.assertRaisesRegex( - RuntimeError, - 'Currently, we only support dx as a real number'): - test_dx((2, 2), -1, complex(1, 1) , device) + RuntimeError, "Currently, we only support dx as a real number" + ): + test_dx((2, 2), -1, complex(1, 1), device) with self.assertRaisesRegex( - TypeError, 'received an invalid combination of arguments'): - actual = torch.cumulative_trapezoid(torch.randn((3, 3)), x=torch.randn((3, 3)), dx=3) + TypeError, "received an invalid combination of arguments" + ): + actual = torch.cumulative_trapezoid( + torch.randn((3, 3)), x=torch.randn((3, 3)), dx=3 + ) + @skipMeta @dtypes(torch.double) def test_pow_scalar_overloads_mem_overlap(self, device, dtype): sz = 3 doubles = torch.randn(2 * sz, dtype=dtype, device=device) - self.check_internal_mem_overlap( - lambda t: t.pow_(42), 1, dtype, device) + self.check_internal_mem_overlap(lambda t: t.pow_(42), 1, dtype, device) self.unary_check_input_output_mem_overlap( - doubles, sz, lambda input, out: torch.pow(input, 42, out=out)) + doubles, sz, lambda input, out: torch.pow(input, 42, out=out) + ) self.unary_check_input_output_mem_overlap( - doubles, sz, lambda input, out: torch.pow(42, input, out=out)) + doubles, sz, lambda input, out: torch.pow(42, input, out=out) + ) - @dtypes(*list(product(get_all_dtypes(include_bool=False), - get_all_dtypes(include_bool=False)))) + @dtypes( + *list( + product( + all_types_and_complex_and(torch.half, torch.bfloat16), + all_types_and_complex_and(torch.half, torch.bfloat16), + ) + ) + ) def test_float_power(self, device, dtypes): def to_np(value): if isinstance(value, torch.Tensor) and value.dtype == torch.bfloat16: @@ -3414,25 +4051,43 @@ def to_np(value): base_dtype = dtypes[0] exp_dtype = dtypes[1] - out_dtype = torch.complex128 if base_dtype.is_complex or exp_dtype.is_complex else torch.float64 + out_dtype = ( + torch.complex128 + if base_dtype.is_complex or exp_dtype.is_complex + else torch.float64 + ) - base = make_tensor((30,), device, base_dtype, low=1, high=100) + base = make_tensor((30,), dtype=base_dtype, device=device, low=1, high=100) # Complex and real results do not agree between PyTorch and NumPy when computing negative and zero power of 0 # Related: https://github.com/pytorch/pytorch/issues/48000 # base[0] = base[3] = base[7] = 0 - exp = make_tensor((30,), device, exp_dtype, low=-2, high=2) + exp = make_tensor((30,), dtype=exp_dtype, device=device, low=-2, high=2) exp[0] = exp[4] = exp[6] = 0 expected = torch.from_numpy(np.float_power(to_np(base), to_np(exp))) exponents = [-2.8, -2, -1, -0.5, 0.5, 1, 2] - complex_exponents = exponents + [-2.5j, -1.0j, 1.0j, 2.5j, 1.0 + 1.0j, -1.0 - 1.5j, 3.3j] + complex_exponents = exponents + [ + -2.5j, + -1.0j, + 1.0j, + 2.5j, + 1.0 + 1.0j, + -1.0 - 1.5j, + 3.3j, + ] - for op in (torch.float_power, torch.Tensor.float_power, torch.Tensor.float_power_): + for op in ( + torch.float_power, + torch.Tensor.float_power, + torch.Tensor.float_power_, + ): # Case of Tensor x Tensor if op is torch.Tensor.float_power_ and base_dtype != out_dtype: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): op(base.clone(), exp) else: result = op(base.clone(), exp) @@ -3445,24 +4100,39 @@ def to_np(value): # Case of Tensor x Scalar 
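+                # Note: torch.float_power computes in double precision, so the expected
+                # result dtype here is torch.complex128 when either the base tensor or the
+                # scalar exponent is complex, and torch.float64 otherwise; that is what
+                # out_dtype_scalar_exp captures below.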
for i in complex_exponents if exp_dtype.is_complex else exponents: - out_dtype_scalar_exp = torch.complex128 if base_dtype.is_complex or type(i) == complex else torch.float64 + out_dtype_scalar_exp = ( + torch.complex128 + if base_dtype.is_complex or type(i) == complex + else torch.float64 + ) expected_scalar_exp = torch.from_numpy(np.float_power(to_np(base), i)) - if op is torch.Tensor.float_power_ and base_dtype != out_dtype_scalar_exp: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + if ( + op is torch.Tensor.float_power_ + and base_dtype != out_dtype_scalar_exp + ): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): op(base.clone(), i) else: result = op(base.clone(), i) self.assertEqual(expected_scalar_exp, result) if op is torch.float_power: - out = torch.empty_like(base).to(device=device, dtype=out_dtype_scalar_exp) + out = torch.empty_like(base).to( + device=device, dtype=out_dtype_scalar_exp + ) op(base, i, out=out) self.assertEqual(expected_scalar_exp, out) # Case of Scalar x Tensor for i in complex_exponents if base_dtype.is_complex else exponents: - out_dtype_scalar_base = torch.complex128 if exp_dtype.is_complex or type(i) == complex else torch.float64 + out_dtype_scalar_base = ( + torch.complex128 + if exp_dtype.is_complex or type(i) == complex + else torch.float64 + ) expected_scalar_base = torch.from_numpy(np.float_power(i, to_np(exp))) result = torch.float_power(i, exp) @@ -3481,8 +4151,13 @@ def _promo_helper(x, y): return torch.complex128 return torch.double - test_cases = ((torch.tensor([-2, -1, 0, 1, 2], device=device), -.25), - (torch.tensor([-1.0j, 0j, 1.0j, 1.0 + 1.0j, -1.0 - 1.5j], device=device), 2.)) + test_cases = ( + (torch.tensor([-2, -1, 0, 1, 2], device=device), -0.25), + ( + torch.tensor([-1.0j, 0j, 1.0j, 1.0 + 1.0j, -1.0 - 1.5j], device=device), + 2.0, + ), + ) for base, exp in test_cases: for out_dtype in (torch.long, torch.float, torch.double, torch.cdouble): out = torch.empty(1, device=device, dtype=out_dtype) @@ -3491,18 +4166,25 @@ def _promo_helper(x, y): if out.dtype == required_dtype: torch.float_power(base, exp, out=out) else: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): torch.float_power(base, exp, out=out) if base.dtype == required_dtype: torch.Tensor.float_power_(base.clone(), exp) else: - with self.assertRaisesRegex(RuntimeError, "operation's result requires dtype"): + with self.assertRaisesRegex( + RuntimeError, "operation's result requires dtype" + ): torch.Tensor.float_power_(base.clone(), exp) @skipIf(not TEST_SCIPY, "Scipy required for the test.") - @dtypes(*product(get_all_dtypes(include_complex=False, include_bfloat16=False), - get_all_dtypes(include_complex=False, include_bfloat16=False))) + @dtypes( + *product( + all_types_and(torch.half, torch.bool), all_types_and(torch.half, torch.bool) + ) + ) def test_xlogy_xlog1py(self, device, dtypes): x_dtype, y_dtype = dtypes @@ -3513,9 +4195,10 @@ def out_variant_helper(torch_fn, x, y): self.assertEqual(expected, out) def xlogy_inplace_variant_helper(x, y): - if x.dtype in get_all_int_dtypes() + [torch.bool]: - with self.assertRaisesRegex(RuntimeError, - "can't be cast to the desired output type"): + if x.dtype in integral_types_and(torch.bool): + with self.assertRaisesRegex( + RuntimeError, "can't be cast to the desired output type" + ): x.clone().xlogy_(y) else: expected = torch.empty_like(x) @@ 
-3527,9 +4210,15 @@ def test_helper(torch_fn, reference_fn, inputs, scalar=None): x, y, z = inputs torch_fn_partial = partial(torch_fn, x) reference_fn_partial = partial(reference_fn, x.cpu().numpy()) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, x, exact_dtype=False) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, y, exact_dtype=False) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, z, exact_dtype=False) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, x, exact_dtype=False + ) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, y, exact_dtype=False + ) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, z, exact_dtype=False + ) val = scalar if scalar is not None else x out_variant_helper(torch_fn, val, x) @@ -3537,13 +4226,17 @@ def test_helper(torch_fn, reference_fn, inputs, scalar=None): out_variant_helper(torch_fn, val, z) # Tensor-Tensor Test (tensor of same and different shape) - x = make_tensor((3, 2, 4, 5), device, x_dtype, low=0.5, high=1000) - y = make_tensor((3, 2, 4, 5), device, y_dtype, low=0.5, high=1000) - z = make_tensor((4, 5), device, y_dtype, low=0.5, high=1000) + x = make_tensor((3, 2, 4, 5), dtype=x_dtype, device=device, low=0.5, high=1000) + y = make_tensor((3, 2, 4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) + z = make_tensor((4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) - x_1p = make_tensor((3, 2, 4, 5), device, x_dtype, low=-0.5, high=1000) - y_1p = make_tensor((3, 2, 4, 5), device, y_dtype, low=-0.5, high=1000) - z_1p = make_tensor((4, 5), device, y_dtype, low=-0.5, high=1000) + x_1p = make_tensor( + (3, 2, 4, 5), dtype=x_dtype, device=device, low=-0.5, high=1000 + ) + y_1p = make_tensor( + (3, 2, 4, 5), dtype=y_dtype, device=device, low=-0.5, high=1000 + ) + z_1p = make_tensor((4, 5), dtype=y_dtype, device=device, low=-0.5, high=1000) xlogy_fns = torch.xlogy, scipy.special.xlogy xlog1py_fns = torch.special.xlog1py, scipy.special.xlog1py @@ -3559,7 +4252,10 @@ def test_helper(torch_fn, reference_fn, inputs, scalar=None): test_helper(*xlog1py_fns, (x_1p, y_1p, z_1p), 3.14) # Special Values Tensor-Tensor - t = torch.tensor([-1., 0., 1., 2., float('inf'), -float('inf'), float('nan')], device=device) + t = torch.tensor( + [-1.0, 0.0, 1.0, 2.0, float("inf"), -float("inf"), float("nan")], + device=device, + ) zeros = torch.zeros(7, dtype=y_dtype, device=device) def test_zeros_special_helper(torch_fn, reference_fn, scalar=False): @@ -3567,7 +4263,9 @@ def test_zeros_special_helper(torch_fn, reference_fn, scalar=False): zeros_np = 0 if scalar else zeros.cpu().numpy() torch_fn_partial = partial(torch_fn, zeros_t) reference_fn_partial = partial(reference_fn, zeros_np) - self.compare_with_numpy(torch_fn_partial, reference_fn_partial, t, exact_dtype=False) + self.compare_with_numpy( + torch_fn_partial, reference_fn_partial, t, exact_dtype=False + ) out_variant_helper(torch_fn, zeros_t, t) test_zeros_special_helper(*xlogy_fns) @@ -3584,14 +4282,14 @@ def test_xlogy_xlog1py_scalar_type_promotion(self, device): t = torch.randn((), dtype=torch.float32, device=device) self.assertEqual(t.dtype, torch.xlogy(t, 5).dtype) - self.assertEqual(t.dtype, torch.xlogy(t, 5.).dtype) + self.assertEqual(t.dtype, torch.xlogy(t, 5.0).dtype) self.assertEqual(t.dtype, torch.special.xlog1py(t, 5).dtype) - self.assertEqual(t.dtype, torch.special.xlog1py(t, 5.).dtype) + self.assertEqual(t.dtype, torch.special.xlog1py(t, 5.0).dtype) self.assertEqual(t.dtype, 
torch.xlogy(5, t).dtype) - self.assertEqual(t.dtype, torch.xlogy(5., t).dtype) + self.assertEqual(t.dtype, torch.xlogy(5.0, t).dtype) self.assertEqual(t.dtype, torch.special.xlog1py(5, t).dtype) - self.assertEqual(t.dtype, torch.special.xlog1py(5., t).dtype) + self.assertEqual(t.dtype, torch.special.xlog1py(5.0, t).dtype) @skipIf(not TEST_SCIPY, "Scipy required for the test.") def test_xlogy_xlog1py_bfloat16(self, device): @@ -3605,13 +4303,17 @@ def _compare_helper(x, y, torch_fn, reference_fn): x_dtype, y_dtype = torch.bfloat16, torch.bfloat16 # Tensor-Tensor Test (tensor of same and different shape) - x = make_tensor((3, 2, 4, 5), device, x_dtype, low=0.5, high=1000) - y = make_tensor((3, 2, 4, 5), device, y_dtype, low=0.5, high=1000) - z = make_tensor((4, 5), device, y_dtype, low=0.5, high=1000) + x = make_tensor((3, 2, 4, 5), dtype=x_dtype, device=device, low=0.5, high=1000) + y = make_tensor((3, 2, 4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) + z = make_tensor((4, 5), dtype=y_dtype, device=device, low=0.5, high=1000) - x_1p = make_tensor((3, 2, 4, 5), device, x_dtype, low=-0.8, high=1000) - y_1p = make_tensor((3, 2, 4, 5), device, y_dtype, low=-0.8, high=1000) - z_1p = make_tensor((4, 5), device, y_dtype, low=-0.8, high=1000) + x_1p = make_tensor( + (3, 2, 4, 5), dtype=x_dtype, device=device, low=-0.8, high=1000 + ) + y_1p = make_tensor( + (3, 2, 4, 5), dtype=y_dtype, device=device, low=-0.8, high=1000 + ) + z_1p = make_tensor((4, 5), dtype=y_dtype, device=device, low=-0.8, high=1000) xlogy_fns = torch.xlogy, scipy.special.xlogy xlog1py_fns = torch.special.xlog1py, scipy.special.xlog1py @@ -3631,19 +4333,19 @@ def _compare_helper(x, y, torch_fn, reference_fn): _compare_helper(z_1p, 3.14, *xlog1py_fns) # Special Values Tensor-Tensor - t = torch.tensor([-1., 0., 1., 2., float('inf'), -float('inf'), float('nan')], device=device) + t = torch.tensor( + [-1.0, 0.0, 1.0, 2.0, float("inf"), -float("inf"), float("nan")], + device=device, + ) zeros = torch.tensor(7, dtype=y_dtype, device=device) _compare_helper(t, zeros, *xlogy_fns) - _compare_helper(t, 0., *xlogy_fns) + _compare_helper(t, 0.0, *xlogy_fns) _compare_helper(t, zeros, *xlog1py_fns) - _compare_helper(t, 0., *xlog1py_fns) + _compare_helper(t, 0.0, *xlog1py_fns) - @dtypes(*product(get_all_dtypes(include_complex=False, - include_half=False, include_bfloat16=False), - get_all_dtypes(include_complex=False, - include_half=False, include_bfloat16=False))) + @dtypes(*product(all_types_and(torch.bool), all_types_and(torch.bool))) @skipIf(not TEST_SCIPY, "Scipy required for the test.") @slowTest def test_zeta(self, device, dtypes): @@ -3656,64 +4358,106 @@ def test_helper(x, q): actual = torch.special.zeta(x, q) rtol, atol = None, None - if self.device_type == 'cpu': + if self.device_type == "cpu": rtol, atol = 1e-6, 1e-6 self.assertEqual(expected, actual, rtol=rtol, atol=atol, exact_dtype=False) # x tensor - q tensor same size - x = make_tensor((2, 3, 4), device, x_dtype) - q = make_tensor((2, 3, 4), device, q_dtype) + x = make_tensor((2, 3, 4), dtype=x_dtype, device=device) + q = make_tensor((2, 3, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q tensor broadcast lhs - x = make_tensor((2, 1, 4), device, x_dtype) - q = make_tensor((2, 3, 4), device, q_dtype) + x = make_tensor((2, 1, 4), dtype=x_dtype, device=device) + q = make_tensor((2, 3, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q tensor broadcast rhs - x = make_tensor((2, 3, 4), device, x_dtype) - q = make_tensor((2, 1, 4), device, 
q_dtype) + x = make_tensor((2, 3, 4), dtype=x_dtype, device=device) + q = make_tensor((2, 1, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q tensor broadcast all - x = make_tensor((2, 3, 1), device, x_dtype) - q = make_tensor((2, 1, 4), device, q_dtype) + x = make_tensor((2, 3, 1), dtype=x_dtype, device=device) + q = make_tensor((2, 1, 4), dtype=q_dtype, device=device) test_helper(x, q) # x scalar - q tensor for x in np.linspace(-5, 5, num=10).tolist(): if not q_dtype.is_floating_point: q_dtype = torch.get_default_dtype() - q = make_tensor((2, 3, 4), device, q_dtype) + q = make_tensor((2, 3, 4), dtype=q_dtype, device=device) test_helper(x, q) # x tensor - q scalar for q in np.linspace(-5, 5, num=10).tolist(): if not x_dtype.is_floating_point: x_dtype = torch.get_default_dtype() - x = make_tensor((2, 3, 4), device, x_dtype) + x = make_tensor((2, 3, 4), dtype=x_dtype, device=device) test_helper(x, q) + @onlyCUDA + @dtypes( + torch.chalf, + ) + def test_mul_chalf_tensor_and_cpu_scalar(self, device, dtype): + # Tests that Tensor and CPU Scalar work for `mul` for chalf. + # Ideally, this should be covered by `test_complex_half_reference_testing` + # from test_ops.py by checking reference_samples from the OpInfo. + # But currently that doesn't work as sample generation requires support of + # `index_select` which is not implemented for `complex32` at the + # time of writing this test. + # TODO: Remove this test once above issue is fixed. + # Ref: https://github.com/pytorch/pytorch/pull/76364 + x = make_tensor((2, 2), device=device, dtype=dtype) + self.assertEqual(x * 2.5, x * torch.tensor(2.5, device=device, dtype=dtype)) -tensor_binary_ops = [ - '__lt__', '__le__', - '__gt__', '__ge__', - '__eq__', '__ne__', - - '__add__', '__radd__', '__iadd__', - '__sub__', '__rsub__', '__isub__', - '__mul__', '__rmul__', '__imul__', - '__matmul__', '__rmatmul__', - '__truediv__', '__rtruediv__', '__itruediv__', - '__floordiv__', '__rfloordiv__', '__ifloordiv__', - '__mod__', '__rmod__', '__imod__', - '__pow__', '__rpow__', '__ipow__', - '__lshift__', '__rlshift__', '__ilshift__', - '__rshift__', '__rrshift__', '__irshift__', - '__and__', '__rand__', '__iand__', - '__xor__', '__rxor__', '__ixor__', - '__or__', '__ror__', '__ior__', +tensor_binary_ops = [ + "__lt__", + "__le__", + "__gt__", + "__ge__", + "__eq__", + "__ne__", + "__add__", + "__radd__", + "__iadd__", + "__sub__", + "__rsub__", + "__isub__", + "__mul__", + "__rmul__", + "__imul__", + "__matmul__", + "__rmatmul__", + "__truediv__", + "__rtruediv__", + "__itruediv__", + "__floordiv__", + "__rfloordiv__", + "__ifloordiv__", + "__mod__", + "__rmod__", + "__imod__", + "__pow__", + "__rpow__", + "__ipow__", + "__lshift__", + "__rlshift__", + "__ilshift__", + "__rshift__", + "__rrshift__", + "__irshift__", + "__and__", + "__rand__", + "__iand__", + "__xor__", + "__rxor__", + "__ixor__", + "__or__", + "__ror__", + "__ior__", # Unsupported operators # '__imatmul__', # '__divmod__', '__rdivmod__', '__idivmod__', @@ -3726,35 +4470,33 @@ class UnknownType: # TODO: refactor to inline these _types = [ - torch.half, torch.float, torch.double, - torch.int8, torch.short, torch.int, torch.long, - torch.uint8 + torch.half, + torch.float, + torch.double, + torch.int8, + torch.short, + torch.int, + torch.long, + torch.uint8, ] - # TODO: refactor to use make_tensor - def _small_2d(dtype, device, has_zeros=True, fill_ones=False, oneish=False): - t = _make_tensor((5, 5), dtype, device, fill_ones=fill_ones) - if oneish: - return 
t.clamp(min=_number(.99, 1, dtype), max=1.01) - if not has_zeros: - return t.clamp(min=(_number(_div_min, 1, dtype))) - return t - def create_test_func(op): @dtypes(*_types) def test(self, device, dtype): # Generate the inputs - tensor = _small_2d(dtype, device) + tensor = torch.empty((), device=device, dtype=dtype) # Runs the tensor op on the device result = getattr(tensor, op)(UnknownType()) self.assertEqual(result, NotImplemented) + return test for op in tensor_binary_ops: test_name = "test_{}_not_implemented".format(op) assert not hasattr(cls, test_name), "{0} already in {1}".format( - test_name, cls.__name__) + test_name, cls.__name__ + ) setattr(cls, test_name, create_test_func(op)) @@ -3762,5 +4504,5 @@ def test(self, device, dtype): generate_not_implemented_tests(TestBinaryUfuncs) instantiate_device_type_tests(TestBinaryUfuncs, globals()) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_complex.py b/test/test_complex.py index 9f2e0ad32401..88404902631f 100644 --- a/test/test_complex.py +++ b/test/test_complex.py @@ -3,12 +3,12 @@ import torch from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes from torch.testing._internal.common_utils import TestCase, run_tests -from torch.testing._internal.common_dtype import get_all_complex_dtypes +from torch.testing._internal.common_dtype import complex_types devices = (torch.device('cpu'), torch.device('cuda:0')) class TestComplexTensor(TestCase): - @dtypes(*get_all_complex_dtypes()) + @dtypes(*complex_types()) def test_to_list(self, device, dtype): # test that the complex float tensor has expected values and # there's no garbage value in the resultant list diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py index 06b1133d887f..9875f4ee3567 100644 --- a/test/test_cpp_extensions_jit.py +++ b/test/test_cpp_extensions_jit.py @@ -10,15 +10,12 @@ import subprocess import glob -import textwrap -from multiprocessing import Process - import torch.testing._internal.common_utils as common import torch import torch.backends.cudnn import torch.utils.cpp_extension from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME -from torch.testing._internal.common_utils import gradcheck, TEST_WITH_ASAN, has_breakpad +from torch.testing._internal.common_utils import gradcheck TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None @@ -869,80 +866,6 @@ def test_custom_compound_op_autograd(self): gradcheck(torch.ops.my.add, [a, b], eps=1e-2) - @staticmethod - def _crash_handler_test_process(stderr_file, destination): - # Code to enable dumps and trigger a segfault - if sys.platform == "win32": - destination = destination.replace("\\", "\\\\") - csrc = textwrap.dedent(f""" - #include - #include - #include - #include - #include - - int fail() {{ - std::wstring_convert> converter; - std::string narrow("{destination}"); - std::wstring wide = converter.from_bytes(narrow); - torch::crash_handler::enable_minidumps(wide.c_str()); - - volatile int* bad = nullptr; - return *bad; - }} - """) - else: - csrc = textwrap.dedent(f""" - #include - - int fail() {{ - torch::crash_handler::enable_minidumps("{destination}"); - - volatile int* bad = nullptr; - return *bad; - }} - """) - - # Some special stuff to overwrite stderr for a C++ extension - # Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls - sys.stdout.flush() - newstdout = os.dup(2) - devnull = os.open(stderr_file, os.O_WRONLY) - os.dup2(devnull, 2) - 
os.close(devnull) - sys.stdout = os.fdopen(newstdout, 'w') - - module = torch.utils.cpp_extension.load_inline( - name="segfault", - cpp_sources=csrc, - functions=["fail"], - ) - module.fail() - - @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler") - @unittest.skipIf(not has_breakpad(), "Built without breakpad") - @unittest.skipIf(os.environ.get("TEST_CONFIG") == "force_on_cpu", "fails on force_on_cpu config, tracked w/ #65253") - def test_crash_handler(self): - with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile(delete=not sys.platform == "win32") as stderr: - # Use multiprocessing to spin up a separate process to make catching - # the segfault easier - p = Process(target=self._crash_handler_test_process, args=(stderr.name, temp_dir)) - p.start() - p.join() - - with open(stderr.name) as f: - result = f.read().strip() - - # Check that the signal handler was called - self.assertTrue(result.startswith(f"Wrote minidump to {temp_dir}")) - - with open(result.replace("Wrote minidump to ", ""), "rb") as dump_file: - dump_bytes = dump_file.read() - - # Check that the file has the correct magic number - self.assertEqual(b"MDMP", dump_bytes[0:4]) - - if __name__ == "__main__": common.run_tests() diff --git a/test/test_cuda.py b/test/test_cuda.py index 7df9f637274c..c3d33224c361 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -7,6 +7,7 @@ import ctypes import gc import io +import os import pickle import queue import sys @@ -64,7 +65,7 @@ def make_sparse_tensor(t, n, *sizes): torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0)) v = tensor._values() v = v.new(n).copy_(torch.randn(n)) - return t(i, v, torch.Size(sizes)) + return t(i, v, torch.Size(sizes)).coalesce() _cycles_per_ms = None @@ -568,18 +569,40 @@ def test_serialization_array_with_storage(self): self.assertTrue(isinstance(q_copy[0], torch.cuda.FloatTensor)) self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor)) self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor)) - self.assertTrue(isinstance(q_copy[3], torch.storage.TypedStorage)) - self.assertTrue(isinstance(q_copy[3]._storage, torch.cuda.UntypedStorage)) + self.assertTrue(isinstance(q_copy[3], torch.storage._TypedStorage)) + self.assertTrue(isinstance(q_copy[3]._storage, torch.cuda._UntypedStorage)) q_copy[1].fill_(10) self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10)) def test_cublas_allow_tf32_get_set(self): + skip_tf32_cublas = 'TORCH_ALLOW_TF32_CUBLAS_OVERRIDE' in os.environ and\ + int(os.environ['TORCH_ALLOW_TF32_CUBLAS_OVERRIDE']) + if skip_tf32_cublas: + self.assertTrue(torch.backends.cuda.matmul.allow_tf32) + return + orig = torch.backends.cuda.matmul.allow_tf32 self.assertEqual(torch._C._get_cublas_allow_tf32(), orig) torch.backends.cuda.matmul.allow_tf32 = not orig self.assertEqual(torch._C._get_cublas_allow_tf32(), not orig) torch.backends.cuda.matmul.allow_tf32 = orig + def test_float32_matmul_precision_get_set(self): + self.assertEqual(torch.get_float32_matmul_precision(), 'highest') + skip_tf32_cublas = 'TORCH_ALLOW_TF32_CUBLAS_OVERRIDE' in os.environ and\ + int(os.environ['TORCH_ALLOW_TF32_CUBLAS_OVERRIDE']) + if not skip_tf32_cublas: + self.assertFalse(torch.backends.cuda.matmul.allow_tf32) + for p in ('medium', 'high'): + torch.set_float32_matmul_precision(p) + self.assertEqual(torch.get_float32_matmul_precision(), p) + if not skip_tf32_cublas: + self.assertTrue(torch.backends.cuda.matmul.allow_tf32) + torch.set_float32_matmul_precision('highest') + 
self.assertEqual(torch.get_float32_matmul_precision(), 'highest') + if not skip_tf32_cublas: + self.assertFalse(torch.backends.cuda.matmul.allow_tf32) + def test_cublas_allow_fp16_reduced_precision_reduction_get_set(self): orig = torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction self.assertEqual(torch._C._get_cublas_allow_fp16_reduced_precision_reduction(), orig) @@ -1519,6 +1542,7 @@ def _spawn_test_multinomial_invalid_probs_cuda(self, probs): self.assertTrue(any([msg in out or msg in err for msg in expected_messages])) @slowTest + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") def test_multinomial_invalid_probs_cuda(self): @@ -1956,6 +1980,7 @@ def worker(rank): t2.start() """]) + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") def test_fixed_cuda_assert_async(self): with self.assertRaisesRegex(RuntimeError, "Boolean value of Tensor with no values is ambiguous"): torch._assert_async(torch.tensor([], device="cuda")) @@ -3007,7 +3032,6 @@ def test_autocast_rnn(self): x = torch.randn((B, T, F), device="cuda", dtype=input_dtype) elif input_layout == "packed": batch_first = False - x = torch.randn((T, B, F), device="cuda", dtype=input_dtype) x = torch.nn.utils.rnn.pack_padded_sequence(torch.randn((T, B, F), device="cuda", dtype=input_dtype), lengths=(3, 2, 1, 3), @@ -3103,6 +3127,18 @@ def test_max_large_axis(self): def test_to_numpy(self): self.assertRaises(TypeError, lambda: torch.empty(1, device="cuda").numpy()) + def test_graph_is_current_stream_capturing(self): + self.assertFalse(torch.cuda.is_current_stream_capturing()) + + if (TEST_CUDA and (not TEST_WITH_ROCM) and int(torch.version.cuda.split(".")[0]) >= 11): + s = torch.cuda.Stream() + with torch.cuda.stream(s): + g = torch.cuda.CUDAGraph() + self.assertFalse(torch.cuda.is_current_stream_capturing()) + g.capture_begin() + self.assertTrue(torch.cuda.is_current_stream_capturing()) + g.capture_end() + @unittest.skipIf((not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") @@ -3124,6 +3160,14 @@ def test_graph_capture_simple(self): self.assertTrue(b.sum().item() == 11000.) + @unittest.skipIf((not TEST_CUDA) or + TEST_WITH_ROCM or + int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") + def test_graph_capture_oom(self): + with self.assertRaisesRegex(RuntimeError, "out of memory"): + with torch.cuda.graph(torch.cuda.CUDAGraph()): + torch.zeros(2 ** 40, device="cuda") + @unittest.skipIf((not TEST_CUDA) or TEST_WITH_ROCM or int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") @@ -3813,6 +3857,41 @@ def get_max_used(): self.assertEqual(matmul_expand_mem, matmul_mem) self.assertEqual(bmm_mem, matmul_mem) + @unittest.skipIf(not TEST_WITH_ROCM, "ROCm-only test") + def test_rocm_backward_pass_guard(self): + # The test exercises a ROCm-specific feature. 
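+        # torch._C._rocm_is_backward_pass() should report False while the forward pass of
+        # MyFunction below runs and True during its backward pass; the asserts inside
+        # forward() and backward() verify both states across a full optimizer step.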
+ + class MyFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, tensor, constant): + self.assertFalse(torch._C._rocm_is_backward_pass()) + ctx.constant = constant + return tensor * constant + + @staticmethod + def backward(ctx, grad_output): + self.assertTrue(torch._C._rocm_is_backward_pass()) + return grad_output * ctx.constant, None + + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.a = torch.nn.Parameter(torch.randn(())) + + def forward(self, x): + return MyFunction.apply(x, self.a) + + model = MyModule() + criterion = torch.nn.MSELoss(reduction='sum') + optimizer = torch.optim.SGD(model.parameters(), lr=1e-6) + + x = torch.randn(5, 5) + result = model(x) + loss = criterion(result, x) + optimizer.zero_grad() + loss.backward() + optimizer.step() + class TestCudaComm(TestCase): def _test_broadcast(self, input): @@ -3940,7 +4019,7 @@ def _test_reduce_add_coalesced(self, tensors, buffer_size): r_tensors = [comm.reduce_add(t) for t in zip(*dup_tensors)] for r, t in zip(r_tensors, tensors): self.assertEqualTypeString(r, t) - self.assertEqual(r, t * 2) + self.assertEqual(r.coalesce() if r.is_sparse else r, t * 2) rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size) self.assertEqual(r_tensors, rc_tensors) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index bb45ad244741..6cefc78c2ed9 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -23,6 +23,7 @@ DataLoader2, Dataset, IterableDataset, + IterDataPipe, Subset, TensorDataset, communication, @@ -35,7 +36,8 @@ from torch._utils import ExceptionWrapper from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_IN_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest, - load_tests, TEST_WITH_ASAN, TEST_WITH_TSAN, IS_SANDCASTLE) + load_tests, TEST_WITH_ASAN, TEST_WITH_TSAN, IS_SANDCASTLE, + IS_MACOS) try: @@ -62,6 +64,14 @@ HAS_DILL = False skipIfNoDill = unittest.skipIf(not HAS_DILL, "no dill") + +try: + import numpy as np + HAS_NUMPY = True +except ImportError: + HAS_NUMPY = False +skipIfNoNumpy = unittest.skipIf(not HAS_NUMPY, "no NumPy") + # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests @@ -833,6 +843,21 @@ def __len__(self): return int(math.ceil(len(self.dataset) / float(self.batch_size))) +class TestMultiEpochDataset(IterableDataset): + def __init__(self, length): + self.length = length + + def __iter__(self): + worker_info = torch.utils.data.get_worker_info() + assert worker_info is not None + worker_id = worker_info.id + for idx in range(self.length // worker_info.num_workers): + yield worker_id + + def __len__(self): + return self.length + + class CustomList(list): pass @@ -841,6 +866,14 @@ class CustomDict(dict): pass +def row_processor(row): + return np.add(row, 1) + + +def filter_len(row): + return len(row) == 4 + + @unittest.skipIf( TEST_WITH_TSAN, "Fails with TSAN with the following error: starting new threads after multi-threaded " @@ -1343,6 +1376,7 @@ def test_chain_iterable_style_dataset(self): with self.assertRaisesRegex(AssertionError, "ChainDataset only supports IterableDataset"): list(iter(ChainDataset([dataset1, self.dataset]))) + @unittest.skipIf(IS_MACOS, "Not working on macos") def test_multiprocessing_contexts(self): reference = [ torch.arange(3), @@ -1366,6 +1400,30 @@ def test_multiprocessing_contexts(self): self.assertEqual( reference, list(self._get_data_loader(ds_cls(counting_ds_n), multiprocessing_context=ctx, **dl_common_args))) + @skipIfNoNumpy + def test_multiprocessing_iterdatapipe(self): + # Testing to make sure that function from global scope (e.g. imported from library) can be serialized + # and used with multiprocess DataLoader + + reference = [torch.as_tensor([[2, 3, 4, 5]], dtype=torch.int64), + torch.as_tensor([[2, 3, 4, 5]], dtype=torch.int64)] + datapipe: IterDataPipe = IterableWrapper([[1, 2, 3, 4], [1, 2, 3, 4, 5, 6]]) + datapipe = datapipe.map(row_processor) + datapipe = datapipe.filter(lambda row: len(row) == 4) if HAS_DILL else datapipe.filter(filter_len) + + dl_common_args = dict(num_workers=2, batch_size=2, shuffle=True, pin_memory=(not TEST_CUDA)) + for ctx in supported_multiprocessing_contexts: + self.assertEqual(reference, + [t.type(torch.int64) + for t in self._get_data_loader(datapipe, multiprocessing_context=ctx, **dl_common_args)]) + if ctx is not None: + # test ctx object + ctx = mp.get_context(ctx) + self.assertEqual(reference, + [t.type(torch.int64) + for t in + self._get_data_loader(datapipe, multiprocessing_context=ctx, **dl_common_args)]) + def test_worker_seed(self): num_workers = 6 batch_size = 1 @@ -1385,6 +1443,19 @@ def get_dataloader(): dataset = SynchronizedSeedDataset(num_workers, batch_size, num_workers) self.assertEqual(set(int(batch) for batch in get_dataloader()), set(int(batch) for batch in get_dataloader())) + def test_multi_epochs_reproducibility(self): + num_workers = 2 + batch_size = 10 + num_epochs = 3 + + dataset = TestMultiEpochDataset(batch_size * num_workers) + dataloader = self._get_data_loader(dataset, batch_size=batch_size, + shuffle=False, num_workers=num_workers) + + for ind in range(num_epochs): + for batch_idx, sample in enumerate(dataloader): + self.assertEqual(sample.tolist(), [batch_idx % num_workers] * batch_size) + def test_worker_init_fn(self): dataset = SeedDataset(4) dataloader = self._get_data_loader(dataset, batch_size=2, num_workers=2, @@ -2104,6 +2175,13 @@ def test_basics(self): self.assertEqual(list(dl), list(dl2)) self.assertEqual(list(dl), list(dl2_threading)) + class Sorter(IterDataPipe): + def __init__(self, datapipe): + self.datapipe = datapipe + + def __iter__(self): + return iter(sorted(self.datapipe)) + 
def test_shuffle(self): items = list(range(1000)) dp = IterableWrapper(items).sharding_filter().shuffle() @@ -2111,19 +2189,27 @@ def test_shuffle(self): dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=False) self.assertEqual(items, list(dl)) - dl = DataLoader(dp, batch_size=None, num_workers=2, shuffle=False, - worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) + dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=False, + worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) self.assertEqual(items, list(dl)) dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=True) self.assertNotEqual(items, list(dl)) self.assertEqual(items, sorted(list(dl))) - dl = DataLoader(dp, batch_size=None, num_workers=2, shuffle=True, - worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) + dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=True, + worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) self.assertNotEqual(items, list(dl)) self.assertEqual(items, sorted(list(dl))) + dl = DataLoader2(self.Sorter(dp), batch_size=None, num_workers=2, shuffle=True) + self.assertEqual(list(dl), items) + + dl = DataLoader2(self.Sorter(dp), batch_size=None, num_workers=2, shuffle=True, + worker_init_fn=torch.utils.data.backward_compatibility.worker_init_fn) + self.assertEqual(list(dl), items) + + @unittest.skipIf( TEST_WITH_TSAN, "Fails with TSAN with the following error: starting new threads after multi-threaded " @@ -2265,6 +2351,19 @@ def test_pin_memory(self): self.assertTrue(sample['a_tensor'].is_pinned()) self.assertTrue(sample['another_dict']['a_number'].is_pinned()) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_pin_memory_device(self): + loader = DataLoader(self.dataset, batch_size=2, pin_memory=True, pin_memory_device='cuda') + for sample in loader: + self.assertTrue(sample['a_tensor'].is_pinned(device='cuda')) + self.assertTrue(sample['another_dict']['a_number'].is_pinned(device='cuda')) + + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_pin_memory_with_only_device(self): + loader = DataLoader(self.dataset, batch_size=2, pin_memory_device='cuda') + for sample in loader: + self.assertFalse(sample['a_tensor'].is_pinned(device='cuda')) + self.assertFalse(sample['another_dict']['a_number'].is_pinned(device='cuda')) class DummyDataset(torch.utils.data.Dataset): def __init__(self): diff --git a/test/test_datapipe.py b/test/test_datapipe.py index ab56f0b41eb9..7e76618f338f 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -53,9 +53,9 @@ from torch.utils.data.datapipes.dataframe import CaptureDataFrame from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper - try: import dill + # XXX: By default, dill writes the Pickler dispatch table to inject its # own logic there. This globally affects the behavior of the standard library # pickler for any user who transitively depends on this module! 
@@ -68,6 +68,7 @@ try: import pandas # type: ignore[import] # noqa: F401 F403 + HAS_PANDAS = True except ImportError: HAS_PANDAS = False @@ -234,16 +235,16 @@ def test_api(self): self.assertTrue(fd.closed) def test_pickle(self): - f = tempfile.TemporaryFile() - with self.assertRaises(TypeError) as ctx1: - pickle.dumps(f) + with tempfile.TemporaryFile() as f: + with self.assertRaises(TypeError) as ctx1: + pickle.dumps(f) - wrap_f = StreamWrapper(f) - with self.assertRaises(TypeError) as ctx2: - pickle.dumps(wrap_f) + wrap_f = StreamWrapper(f) + with self.assertRaises(TypeError) as ctx2: + pickle.dumps(wrap_f) - # Same exception when pickle - self.assertEqual(str(ctx1.exception), str(ctx2.exception)) + # Same exception when pickle + self.assertEqual(str(ctx1.exception), str(ctx2.exception)) fd = TestStreamWrapper._FakeFD("") wrap_fd = StreamWrapper(fd) @@ -254,9 +255,9 @@ def test_repr(self): wrap_fd = StreamWrapper(fd) self.assertEqual(str(wrap_fd), "StreamWrapper") - f = tempfile.TemporaryFile() - wrap_f = StreamWrapper(f) - self.assertEqual(str(wrap_f), "StreamWrapper<" + str(f) + ">") + with tempfile.TemporaryFile() as f: + wrap_f = StreamWrapper(f) + self.assertEqual(str(wrap_f), "StreamWrapper<" + str(f) + ">") class TestIterableDataPipeBasic(TestCase): @@ -310,7 +311,7 @@ def test_listdirfilesdeterministic_iterable_datapipe(self): # The output order should be always the same. self.assertEqual(list(datapipe), list(datapipe)) - def test_readfilesfromdisk_iterable_datapipe(self): + def test_openfilesfromdisk_iterable_datapipe(self): # test import datapipe class directly from torch.utils.data.datapipes.iter import ( FileLister, @@ -330,6 +331,22 @@ def test_readfilesfromdisk_iterable_datapipe(self): rec[1].close() self.assertEqual(count, len(self.temp_files)) + # functional API + datapipe3 = datapipe1.open_files(mode='b') + + count = 0 + for rec in datapipe3: + count = count + 1 + self.assertTrue(rec[0] in self.temp_files) + with open(rec[0], 'rb') as f: + self.assertEqual(rec[1].read(), f.read()) + rec[1].close() + self.assertEqual(count, len(self.temp_files)) + + # __len__ Test + with self.assertRaises(TypeError): + len(datapipe3) + def test_routeddecoder_iterable_datapipe(self): temp_dir = self.temp_dir.name temp_pngfile_pathname = os.path.join(temp_dir, "test_png.png") @@ -361,12 +378,14 @@ def _helper(prior_dp, dp, channel_first=False): self.assertTrue(inp[1].closed) cached = list(datapipe2) - datapipe3 = dp.iter.RoutedDecoder(cached, _png_decoder) + with warnings.catch_warnings(record=True) as wa: + datapipe3 = dp.iter.RoutedDecoder(cached, _png_decoder) datapipe3.add_handler(decoder_basichandlers) _helper(cached, datapipe3) cached = list(datapipe2) - datapipe4 = dp.iter.RoutedDecoder(cached, decoder_basichandlers) + with warnings.catch_warnings(record=True) as wa: + datapipe4 = dp.iter.RoutedDecoder(cached, decoder_basichandlers) datapipe4.add_handler(_png_decoder) _helper(cached, datapipe4, channel_first=True) @@ -415,7 +434,7 @@ def test_demux_mux_datapipe(self): numbers = NumbersDataset(10) n1, n2, n3 = numbers.demux(3, lambda x: x % 3) n = n1.mux(n2, n3) - self.assertEqual(list(range(10)), list(n)) + self.assertEqual(list(range(9)), list(n)) # Functional Test: Uneven DataPipes source_numbers = list(range(0, 10)) + [10, 12] @@ -424,7 +443,7 @@ def test_demux_mux_datapipe(self): self.assertEqual([0, 2, 4, 6, 8, 10, 12], list(n1)) self.assertEqual([1, 3, 5, 7, 9], list(n2)) n = n1.mux(n2) - self.assertEqual(source_numbers, list(n)) + self.assertEqual(list(range(10)), list(n)) 
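+        # The updated expectations above reflect that mux now stops yielding as soon as its
+        # shortest input DataPipe is exhausted, rather than draining every input.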
@suppress_warnings # Suppress warning for lambda fn def test_map_with_col_file_handle_datapipe(self): @@ -465,9 +484,11 @@ def operations(df): df['c'] = df.b + df['a'] * 7 # somehow swallows pandas UserWarning when `df.c = df.b + df['a'] * 7` return df + self.compare_capture_and_eager(operations) +@skipIf(True, "Fix DataFramePipes Tests") class TestDataFramesPipes(TestCase): """ Most of test will fail if pandas instaled, but no dill available. @@ -482,8 +503,8 @@ def _get_dataframes_pipe(self, range=10, dataframe_size=7): return NumbersDataset(range) \ .map(lambda i: (i, i % 3)) \ ._to_dataframes_pipe( - columns=['i', 'j'], - dataframe_size=dataframe_size) + columns=['i', 'j'], + dataframe_size=dataframe_size) @skipIfNoDataFrames @skipIfNoDill # TODO(VitalyFedyunin): Decouple tests from dill by avoiding lambdas in map @@ -549,59 +570,146 @@ def _fake_add(constant, data): def _fake_filter_fn(data): - return data >= 5 + return True +def _simple_filter_fn(data): + return data >= 5 + def _fake_filter_fn_constant(constant, data): return data >= constant +def _mul_10(x): + return x * 10 + + +def _mod_3_test(x): + return x % 3 == 1 + + def _worker_init_fn(worker_id): - random.seed(123) + info = torch.utils.data.get_worker_info() + num_workers = info.num_workers + datapipe = info.dataset + torch.utils.data.graph_settings.apply_sharding(datapipe, num_workers, worker_id) class TestFunctionalIterDataPipe(TestCase): + def _serialization_test_helper(self, datapipe, use_dill): + if use_dill: + serialized_dp = dill.dumps(datapipe) + deserialized_dp = dill.loads(serialized_dp) + else: + serialized_dp = pickle.dumps(datapipe) + deserialized_dp = pickle.loads(serialized_dp) + try: + self.assertEqual(list(datapipe), list(deserialized_dp)) + except AssertionError as e: + print(f"{datapipe} is failing.") + raise e + + def _serialization_test_for_single_dp(self, dp, use_dill=False): + # 1. Testing for serialization before any iteration starts + self._serialization_test_helper(dp, use_dill) + # 2. Testing for serialization after DataPipe is partially read + it = iter(dp) + _ = next(it) + self._serialization_test_helper(dp, use_dill) + # 3. Testing for serialization after DataPipe is fully read + it = iter(dp) + _ = list(it) + self._serialization_test_helper(dp, use_dill) + + def _serialization_test_for_dp_with_children(self, dp1, dp2, use_dill=False): + # 1. Testing for serialization before any iteration starts + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + + # 2. Testing for serialization after DataPipe is partially read + it1, it2 = iter(dp1), iter(dp2) + _, _ = next(it1), next(it2) + # Catch `fork`, `demux` "some child DataPipes are not exhausted" warning + with warnings.catch_warnings(record=True) as wa: + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + + # 2.5. Testing for serialization after one child DataPipe is fully read + # (Only for DataPipes with children DataPipes) + it1 = iter(dp1) + _ = list(it1) # fully read one child + # Catch `fork`, `demux` "some child DataPipes are not exhausted" warning + with warnings.catch_warnings(record=True) as wa: + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + + # 3. 
Testing for serialization after DataPipe is fully read + it2 = iter(dp2) + _ = list(it2) # fully read the other child + self._serialization_test_helper(dp1, use_dill) + self._serialization_test_helper(dp2, use_dill) + def test_serializable(self): - input_dp = dp.iter.IterableWrapper(range(10)) - picklable_datapipes: List[Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]]] = [ - (dp.iter.Batcher, (3, True,), {}), - (dp.iter.Collator, (_fake_fn,), {}), - (dp.iter.Concater, (dp.iter.IterableWrapper(range(5)),), {}), - (dp.iter.Demultiplexer, (2, _fake_filter_fn), {}), - (dp.iter.FileLister, (), {}), - (dp.iter.FileOpener, (), {}), - (dp.iter.Filter, (_fake_filter_fn,), {}), - (dp.iter.Filter, (partial(_fake_filter_fn_constant, 5),), {}), - (dp.iter.Forker, (2,), {}), - (dp.iter.Grouper, (_fake_filter_fn,), {"group_size": 2}), - (dp.iter.IterableWrapper, (), {}), - (dp.iter.Mapper, (_fake_fn, ), {}), - (dp.iter.Mapper, (partial(_fake_add, 1), ), {}), - (dp.iter.Multiplexer, (input_dp,), {}), - (dp.iter.Sampler, (), {}), - (dp.iter.Shuffler, (), {}), - (dp.iter.StreamReader, (), {}), - (dp.iter.UnBatcher, (), {}), - (dp.iter.Zipper, (input_dp,), {}), + picklable_datapipes: List = [ + (dp.iter.Batcher, None, (3, True,), {}), + (dp.iter.Collator, None, (_fake_fn,), {}), + (dp.iter.Concater, None, (dp.iter.IterableWrapper(range(5)),), {}), + (dp.iter.Demultiplexer, None, (2, _simple_filter_fn), {}), + (dp.iter.FileLister, ".", (), {}), + (dp.iter.FileOpener, None, (), {}), + (dp.iter.Filter, None, (_fake_filter_fn,), {}), + (dp.iter.Filter, None, (partial(_fake_filter_fn_constant, 5),), {}), + (dp.iter.Forker, None, (2,), {}), + (dp.iter.Grouper, None, (_fake_filter_fn,), {"group_size": 2}), + (dp.iter.IterableWrapper, range(10), (), {}), + (dp.iter.Mapper, None, (_fake_fn,), {}), + (dp.iter.Mapper, None, (partial(_fake_add, 1),), {}), + (dp.iter.Multiplexer, None, (dp.iter.IterableWrapper(range(10)),), {}), + (dp.iter.Sampler, None, (), {}), + (dp.iter.Shuffler, dp.iter.IterableWrapper([0] * 10), (), {}), + (dp.iter.StreamReader, None, (), {}), + (dp.iter.UnBatcher, None, (0,), {}), + (dp.iter.Zipper, None, (dp.iter.IterableWrapper(range(10)),), {}), ] - for dpipe, dp_args, dp_kwargs in picklable_datapipes: - print(dpipe) - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + # Skipping comparison for these DataPipes + dp_skip_comparison = {dp.iter.FileOpener, dp.iter.StreamReader} + # These DataPipes produce multiple DataPipes as outputs and those should be compared + dp_compare_children = {dp.iter.Demultiplexer, dp.iter.Forker} + + for dpipe, custom_input, dp_args, dp_kwargs in picklable_datapipes: + if custom_input is None: + custom_input = dp.iter.IterableWrapper(range(10)) + if dpipe in dp_skip_comparison: # Merely make sure they are picklable and loadable (no value comparison) + datapipe = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + serialized_dp = pickle.dumps(datapipe) + _ = pickle.loads(serialized_dp) + elif dpipe in dp_compare_children: # DataPipes that have children + dp1, dp2 = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_dp_with_children(dp1, dp2) + else: # Single DataPipe that requires comparison + datapipe = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_single_dp(datapipe) def test_serializable_with_dill(self): - """Only for DataPipes that take in a function or buffer as argument""" + """Only for DataPipes that take in a 
function as argument""" input_dp = dp.iter.IterableWrapper(range(10)) unpicklable_datapipes: List[Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]]] = [ (dp.iter.Collator, (lambda x: x,), {}), (dp.iter.Demultiplexer, (2, lambda x: x % 2,), {}), (dp.iter.Filter, (lambda x: x >= 5,), {}), (dp.iter.Grouper, (lambda x: x >= 5,), {}), - (dp.iter.Mapper, (lambda x: x, ), {}), + (dp.iter.Mapper, (lambda x: x,), {}), ] + dp_compare_children = {dp.iter.Demultiplexer} if HAS_DILL: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + if dpipe in dp_compare_children: + dp1, dp2 = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_dp_with_children(dp1, dp2, use_dill=True) + else: + datapipe = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_single_dp(datapipe, use_dill=True) else: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: with warnings.catch_warnings(record=True) as wa: @@ -738,28 +846,38 @@ def test_fork_iterdatapipe(self): self.assertEqual(list(range(5)), output2) self.assertEqual(list(range(10)), output3) - # Reset Test: DataPipe doesn't reset if this pipe hasn't been read + # Reset Test: DataPipe resets when a new iterator is created, even if this datapipe hasn't been read dp1, dp2 = input_dp.fork(num_instances=2) - i1, i2 = iter(dp1), iter(dp2) + _ = iter(dp1) output2 = [] - for i, n2 in enumerate(i2): - output2.append(n2) - if i == 4: - i1 = iter(dp1) # Doesn't reset because i1 hasn't been read - self.assertEqual(list(range(10)), output2) + with self.assertRaisesRegex(RuntimeError, r"iterator has been invalidated"): + for i, n2 in enumerate(dp2): + output2.append(n2) + if i == 4: + with warnings.catch_warnings(record=True) as wa: + _ = iter(dp1) # This will reset all child DataPipes + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + self.assertEqual(list(range(5)), output2) - # Reset Test: DataPipe reset when some of it have been read + # Reset Test: DataPipe resets when some of it has been read dp1, dp2 = input_dp.fork(num_instances=2) - i1, i2 = iter(dp1), iter(dp2) output1, output2 = [], [] - for i, (n1, n2) in enumerate(zip(i1, i2)): + for i, (n1, n2) in enumerate(zip(dp1, dp2)): output1.append(n1) output2.append(n2) if i == 4: with warnings.catch_warnings(record=True) as wa: - i1 = iter(dp1) # Reset both all child DataPipe + _ = iter(dp1) # Reset both all child DataPipe self.assertEqual(len(wa), 1) self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + break + with warnings.catch_warnings(record=True) as wa: + for i, (n1, n2) in enumerate(zip(dp1, dp2)): + output1.append(n1) + output2.append(n2) + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") self.assertEqual(list(range(5)) + list(range(10)), output1) self.assertEqual(list(range(5)) + list(range(10)), output2) @@ -814,7 +932,7 @@ def test_mux_iterdatapipe(self): input_dp2 = dp.iter.IterableWrapper([10]) input_dp3 = dp.iter.IterableWrapper([100, 200, 300]) output_dp = input_dp1.mux(input_dp2, input_dp3) - expected_output = [1, 10, 100, 2, 200, 3, 300, 4] + expected_output = [1, 10, 100] self.assertEqual(len(expected_output), len(output_dp)) self.assertEqual(expected_output, list(output_dp)) @@ -822,8 +940,8 @@ def test_mux_iterdatapipe(self): input_dp1 = dp.iter.IterableWrapper([0, 1, 2, 3]) input_dp2 = 
dp.iter.IterableWrapper([]) output_dp = input_dp1.mux(input_dp2) - self.assertEqual(len(input_dp1), len(output_dp)) - self.assertEqual(list(input_dp1), list(output_dp)) + self.assertEqual(len(input_dp2), len(output_dp)) + self.assertEqual(list(input_dp2), list(output_dp)) # __len__ Test: raises TypeError when __len__ is called and an input doesn't have __len__ input_dp1 = dp.iter.IterableWrapper(range(10)) @@ -886,18 +1004,21 @@ def test_demux_iterdatapipe(self): next(it) next(it) - # Reset Test: DataPipe doesn't reset when it has not been read + # Reset Test: DataPipe resets when a new iterator is created, even if this datapipe hasn't been read dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) - i1 = iter(dp1) + _ = iter(dp1) output2 = [] - i = 0 - for i, n2 in enumerate(dp2): - output2.append(n2) - if i == 4: - i1 = iter(dp1) + with self.assertRaisesRegex(RuntimeError, r"iterator has been invalidated"): + for i, n2 in enumerate(dp2): + output2.append(n2) + if i == 4: + with warnings.catch_warnings(record=True) as wa: + _ = iter(dp1) # This will reset all child DataPipes + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") self.assertEqual(list(range(1, 10, 2)), output2) - # Reset Test: DataPipe reset when some of it has been read + # Reset Test: DataPipe resets when some of it has been read dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) output1, output2 = [], [] for n1, n2 in zip(dp1, dp2): @@ -909,11 +1030,13 @@ def test_demux_iterdatapipe(self): i1 = iter(dp1) # Reset all child DataPipes self.assertEqual(len(wa), 1) self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") - for n1, n2 in zip(dp1, dp2): - output1.append(n1) - output2.append(n2) - self.assertEqual([0, 2, 4] + list(range(0, 10, 2)), output1) - self.assertEqual([1, 3, 5] + list(range(1, 10, 2)), output2) + for n1, n2 in zip(dp1, dp2): + output1.append(n1) + output2.append(n2) + self.assertEqual([0, 2, 4] + list(range(0, 10, 2)), output1) + self.assertEqual([1, 3, 5] + list(range(1, 10, 2)), output2) + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") # Reset Test: DataPipe reset, even when not all child DataPipes are exhausted dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) @@ -964,7 +1087,8 @@ def test_demux_iterdatapipe(self): traverse(dp2) # This should not raise any error either def test_map_iterdatapipe(self): - input_dp = dp.iter.IterableWrapper(range(10)) + target_length = 10 + input_dp = dp.iter.IterableWrapper(range(target_length)) def fn(item, dtype=torch.float, *, sum=False): data = torch.tensor(item, dtype=dtype) @@ -972,21 +1096,21 @@ def fn(item, dtype=torch.float, *, sum=False): # Functional Test: apply to each element correctly map_dp = input_dp.map(fn) - self.assertEqual(len(input_dp), len(map_dp)) - for x, y in zip(map_dp, input_dp): + self.assertEqual(target_length, len(map_dp)) + for x, y in zip(map_dp, range(target_length)): self.assertEqual(x, torch.tensor(y, dtype=torch.float)) # Functional Test: works with partial function map_dp = input_dp.map(partial(fn, dtype=torch.int, sum=True)) - for x, y in zip(map_dp, input_dp): + for x, y in zip(map_dp, range(target_length)): self.assertEqual(x, torch.tensor(y, dtype=torch.int).sum()) # __len__ Test: inherits length from source DataPipe - self.assertEqual(len(input_dp), len(map_dp)) + self.assertEqual(target_length, len(map_dp)) - input_dp_nl = 
IDP_NoLen(range(10)) + input_dp_nl = IDP_NoLen(range(target_length)) map_dp_nl = input_dp_nl.map(lambda x: x) - for x, y in zip(map_dp_nl, input_dp_nl): + for x, y in zip(map_dp_nl, range(target_length)): self.assertEqual(x, torch.tensor(y, dtype=torch.float)) # __len__ Test: inherits length from source DataPipe - raises error when invalid @@ -1138,24 +1262,24 @@ def _collate_fn(batch, default_type=torch.float): # Functional Test: defaults to the default collate function when a custom one is not specified collate_dp = input_dp.collate() - for x, y in zip(input_dp, collate_dp): + for x, y in zip(arrs, collate_dp): self.assertEqual(torch.tensor(x), y) # Functional Test: custom collate function collate_dp = input_dp.collate(collate_fn=_collate_fn) - for x, y in zip(input_dp, collate_dp): + for x, y in zip(arrs, collate_dp): self.assertEqual(torch.tensor(sum(x), dtype=torch.float), y) # Functional Test: custom, partial collate function collate_dp = input_dp.collate(partial(_collate_fn, default_type=torch.int)) - for x, y in zip(input_dp, collate_dp): + for x, y in zip(arrs, collate_dp): self.assertEqual(torch.tensor(sum(x), dtype=torch.int), y) # Reset Test: reset the DataPipe and results are still correct n_elements_before_reset = 1 res_before_reset, res_after_reset = reset_after_n_next_calls(collate_dp, n_elements_before_reset) self.assertEqual([torch.tensor(6, dtype=torch.int)], res_before_reset) - for x, y in zip(input_dp, res_after_reset): + for x, y in zip(arrs, res_after_reset): self.assertEqual(torch.tensor(sum(x), dtype=torch.int), y) # __len__ Test: __len__ is inherited @@ -1166,7 +1290,7 @@ def _collate_fn(batch, default_type=torch.float): collate_dp_nl = input_dp_nl.collate() with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(collate_dp_nl) - for x, y in zip(input_dp_nl, collate_dp_nl): + for x, y in zip(arrs, collate_dp_nl): self.assertEqual(torch.tensor(x), y) def test_batch_iterdatapipe(self): @@ -1216,14 +1340,14 @@ def test_unbatch_iterdatapipe(self): input_dp = prebatch_dp.batch(3) unbatch_dp = input_dp.unbatch() self.assertEqual(len(list(unbatch_dp)), target_length) # __len__ is as expected - for i, res in zip(prebatch_dp, unbatch_dp): + for i, res in zip(range(target_length), unbatch_dp): self.assertEqual(i, res) # Functional Test: unbatch works for an input with nested levels input_dp = dp.iter.IterableWrapper([[0, 1, 2], [3, 4, 5]]) unbatch_dp = input_dp.unbatch() self.assertEqual(len(list(unbatch_dp)), target_length) - for i, res in zip(prebatch_dp, unbatch_dp): + for i, res in zip(range(target_length), unbatch_dp): self.assertEqual(i, res) input_dp = dp.iter.IterableWrapper([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]) @@ -1232,8 +1356,8 @@ def test_unbatch_iterdatapipe(self): unbatch_dp = input_dp.unbatch() expected_dp = [[0, 1], [2, 3], [4, 5], [6, 7]] self.assertEqual(len(list(unbatch_dp)), 4) - for i, res in zip(expected_dp, unbatch_dp): - self.assertEqual(i, res) + for j, res in zip(expected_dp, unbatch_dp): + self.assertEqual(j, res) # Functional Test: unbatching multiple levels at the same time unbatch_dp = input_dp.unbatch(unbatch_level=2) @@ -1272,20 +1396,12 @@ def test_unbatch_iterdatapipe(self): def test_filter_datapipe(self): input_ds = dp.iter.IterableWrapper(range(10)) - def _filter_fn(data, val, clip=False): - if clip: - return data >= val - return True + def _filter_fn(data, val): + return data >= val # Functional Test: filter works with partial function filter_dp = input_ds.filter(partial(_filter_fn, val=5)) - for data, exp in 
zip(filter_dp, range(10)): - self.assertEqual(data, exp) - - # Functional Test: filter works with partial function with keyword args - filter_dp = input_ds.filter(partial(_filter_fn, val=5, clip=True)) - for data, exp in zip(filter_dp, range(5, 10)): - self.assertEqual(data, exp) + self.assertEqual(list(filter_dp), list(range(5, 10))) def _non_bool_fn(data): return 1 @@ -1295,12 +1411,26 @@ def _non_bool_fn(data): with self.assertRaises(ValueError): temp = list(filter_dp) + # Funtional Test: Specify input_col + tuple_input_ds = dp.iter.IterableWrapper([(d - 1, d, d + 1) for d in range(10)]) + + # Single input_col + input_col_1_dp = tuple_input_ds.filter(partial(_filter_fn, val=5), input_col=1) + self.assertEqual(list(input_col_1_dp), [(d - 1, d, d + 1) for d in range(5, 10)]) + + # Multiple input_col + def _mul_filter_fn(a, b): + return a + b < 10 + + input_col_2_dp = tuple_input_ds.filter(_mul_filter_fn, input_col=[0, 2]) + self.assertEqual(list(input_col_2_dp), [(d - 1, d, d + 1) for d in range(5)]) + # __len__ Test: DataPipe has no valid len with self.assertRaisesRegex(TypeError, r"has no len"): len(filter_dp) # Reset Test: DataPipe resets correctly - filter_dp = input_ds.filter(partial(_filter_fn, val=5, clip=True)) + filter_dp = input_ds.filter(partial(_filter_fn, val=5)) n_elements_before_reset = 3 res_before_reset, res_after_reset = reset_after_n_next_calls(filter_dp, n_elements_before_reset) self.assertEqual(list(range(5, 10))[:n_elements_before_reset], res_before_reset) @@ -1315,39 +1445,124 @@ def test_sampler_iterdatapipe(self): self.assertEqual(x, i) # RandomSampler - random_sampled_dp = dp.iter.Sampler(input_dp, sampler=RandomSampler, sampler_kwargs={'replacement': True}) # type: ignore[var-annotated] # noqa: B950 + random_sampled_dp = dp.iter.Sampler(input_dp, sampler=RandomSampler, sampler_kwargs={ + 'replacement': True}) # type: ignore[var-annotated] # noqa: B950 # Requires `__len__` to build SamplerDataPipe input_dp_nolen = IDP_NoLen(range(10)) with self.assertRaises(AssertionError): sampled_dp = dp.iter.Sampler(input_dp_nolen) + def test_stream_reader_iterdatapipe(self): + from io import StringIO + + input_dp = dp.iter.IterableWrapper([("f1", StringIO("abcde")), ("f2", StringIO("bcdef"))]) + expected_res = ["abcde", "bcdef"] + + # Functional Test: Read full chunk + dp1 = input_dp.read_from_stream() + self.assertEqual([d[1] for d in dp1], expected_res) + + # Functional Test: Read full chunk + dp2 = input_dp.read_from_stream(chunk=1) + self.assertEqual([d[1] for d in dp2], [c for s in expected_res for c in s]) + + # `__len__` Test + with self.assertRaises(TypeError): + len(dp1) + def test_shuffle_iterdatapipe(self): - exp = list(range(20)) + exp = list(range(100)) input_ds = dp.iter.IterableWrapper(exp) with self.assertRaises(AssertionError): shuffle_dp = input_ds.shuffle(buffer_size=0) - for bs in (5, 20, 25): - shuffle_dp = input_ds.shuffle(buffer_size=bs) - self.assertEqual(len(shuffle_dp), len(input_ds)) + def _create_dp(buffer_size): + input_ds = dp.iter.IterableWrapper(list(range(100))) + return input_ds.shuffle(buffer_size=bs).sharding_filter() - random.seed(123) + for bs in (5, 20, 33): + shuffle_dp = _create_dp(bs) + self.assertEqual(len(shuffle_dp), len(exp)) + + torch.manual_seed(123) res = list(shuffle_dp) self.assertEqual(sorted(res), exp) # Test Deterministic - for num_workers in (0, 1): - random.seed(123) - dl = DataLoader(shuffle_dp, num_workers=num_workers, worker_init_fn=_worker_init_fn, shuffle=True) - dl_res = list(dl) - self.assertEqual(res, dl_res) + 
for num_workers in (0, 1, 2): + dl_res = [] + mp_ctx = "spawn" if num_workers > 0 else None + dl = DataLoader( + shuffle_dp, + num_workers=num_workers, + shuffle=True, + multiprocessing_context=mp_ctx, + worker_init_fn=_worker_init_fn + ) + for epoch in range(2): + torch.manual_seed(123) + dl_res.append(list(dl)) + self.assertEqual(dl_res[0], dl_res[1]) + + # Different seeds + torch.manual_seed(321) + dl_res.append(list(dl)) + + self.assertEqual(len(dl_res[0]), len(dl_res[2])) + self.assertNotEqual(dl_res[0], dl_res[2]) + self.assertEqual(sorted(dl_res[0]), sorted(dl_res[2])) + + if num_workers == 0: + continue + + # Persistent workers + ps_dl_res = [] + for _ in range(2): + dl = DataLoader( + shuffle_dp, + num_workers=num_workers, + shuffle=True, + multiprocessing_context="spawn", + worker_init_fn=_worker_init_fn, + persistent_workers=True + ) + ps_res = [] + torch.manual_seed(123) + for epoch in range(2): + ps_res.extend(list(dl)) + ps_dl_res.append(ps_res) + self.assertEqual(ps_dl_res[0], ps_dl_res[1]) + + # Different Seeds + dl = DataLoader( + shuffle_dp, + num_workers=num_workers, + shuffle=True, + multiprocessing_context="spawn", + worker_init_fn=_worker_init_fn, + persistent_workers=True + ) + ps_res = [] + torch.manual_seed(321) + for epoch in range(2): + ps_res.extend(list(dl)) + ps_dl_res.append(ps_res) + + self.assertEqual(len(ps_dl_res[0]), len(ps_dl_res[2])) + self.assertNotEqual(ps_dl_res[0], ps_dl_res[2]) + self.assertEqual(sorted(ps_dl_res[0]), sorted(ps_dl_res[2])) + shuffle_dp_nl = IDP_NoLen(range(20)).shuffle(buffer_size=5) with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(shuffle_dp_nl) + # Test: deactivate shuffling via set_shuffle + unshuffled_dp = input_ds.shuffle().set_shuffle(False) + self.assertEqual(list(unshuffled_dp), list(input_ds)) + def test_zip_iterdatapipe(self): # Functional Test: raises TypeError when an input is not of type `IterDataPipe` @@ -1378,19 +1593,50 @@ def test_zip_iterdatapipe(self): class TestFunctionalMapDataPipe(TestCase): + + def _serialization_test_helper(self, datapipe, use_dill): + if use_dill: + serialized_dp = dill.dumps(datapipe) + deserialized_dp = dill.loads(serialized_dp) + else: + serialized_dp = pickle.dumps(datapipe) + deserialized_dp = pickle.loads(serialized_dp) + try: + self.assertEqual(list(datapipe), list(deserialized_dp)) + except AssertionError as e: + print(f"{datapipe} is failing.") + raise e + + def _serialization_test_for_single_dp(self, dp, use_dill=False): + # 1. Testing for serialization before any iteration starts + self._serialization_test_helper(dp, use_dill) + # 2. Testing for serialization after DataPipe is partially read + it = iter(dp) + _ = next(it) + self._serialization_test_helper(dp, use_dill) + # 3. 
Testing for serialization after DataPipe is fully read + _ = list(it) + self._serialization_test_helper(dp, use_dill) + def test_serializable(self): - input_dp = dp.map.SequenceWrapper(range(10)) - picklable_datapipes: List[ - Tuple[Type[MapDataPipe], Tuple, Dict[str, Any]] - ] = [ - (dp.map.Mapper, (), {}), - (dp.map.Mapper, (_fake_fn, ), {}), - (dp.map.Mapper, (partial(_fake_add, 1), ), {}), + picklable_datapipes: List = [ + (dp.map.Batcher, None, (2,), {}), + (dp.map.Concater, None, (dp.map.SequenceWrapper(range(10)),), {}), + (dp.map.Mapper, None, (), {}), + (dp.map.Mapper, None, (_fake_fn,), {}), + (dp.map.Mapper, None, (partial(_fake_add, 1),), {}), + (dp.map.SequenceWrapper, range(10), (), {}), + (dp.map.Shuffler, dp.map.SequenceWrapper([0] * 5), (), {}), + (dp.map.Zipper, None, (dp.map.SequenceWrapper(range(10)),), {}), ] - for dpipe, dp_args, dp_kwargs in picklable_datapipes: - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + for dpipe, custom_input, dp_args, dp_kwargs in picklable_datapipes: + if custom_input is None: + custom_input = dp.map.SequenceWrapper(range(10)) + datapipe = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] + self._serialization_test_for_single_dp(datapipe) def test_serializable_with_dill(self): + """Only for DataPipes that take in a function as argument""" input_dp = dp.map.SequenceWrapper(range(10)) unpicklable_datapipes: List[ Tuple[Type[MapDataPipe], Tuple, Dict[str, Any]] @@ -1399,7 +1645,7 @@ def test_serializable_with_dill(self): ] if HAS_DILL: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: - _ = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] + _ = dill.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] else: for dpipe, dp_args, dp_kwargs in unpicklable_datapipes: with warnings.catch_warnings(record=True) as wa: @@ -1589,7 +1835,7 @@ class A(IterDataPipe[P]): @skipTyping def test_subtype(self): - from torch.utils.data._typing import issubtype + from torch.utils.data.datapipes._typing import issubtype basic_type = (int, str, bool, float, complex, list, tuple, dict, set, T_co) @@ -1637,7 +1883,7 @@ def test_subtype(self): @skipTyping def test_issubinstance(self): - from torch.utils.data._typing import issubinstance + from torch.utils.data.datapipes._typing import issubinstance basic_data = (1, '1', True, 1., complex(1., 0.)) basic_type = (int, str, bool, float, complex) @@ -1690,7 +1936,7 @@ def __iter__(self) -> Iterator[int]: # type: ignore[override] with self.assertRaisesRegex(TypeError, r"Expected return type of '__iter__'"): class InvalidDP3(IterDataPipe[Tuple[int, str]]): def __iter__(self) -> Iterator[tuple]: # type: ignore[override] - yield (0, ) + yield (0,) if _generic_namedtuple_allowed: with self.assertRaisesRegex(TypeError, r"is not supported by Python typing"): @@ -1707,14 +1953,14 @@ def __iter__(self) -> Iterator[Tuple[int, str]]: self.assertTrue(issubclass(DP1, IterDataPipe)) dp1 = DP1(10) - self.assertTrue(DP1.type.issubtype(dp1.type) and dp1.type.issubtype(DP1.type)) + self.assertTrue(DP1.type.issubtype(dp1.type) and dp1.type.issubtype(DP1.type)) # type: ignore[attr-defined] dp1_ = DP1(5) self.assertEqual(dp1.type, dp1_.type) with self.assertRaisesRegex(TypeError, r"is not a generic class"): class InvalidDP5(DP1[tuple]): # type: ignore[type-arg] def __iter__(self) -> Iterator[tuple]: # type: ignore[override] - yield (0, ) + yield (0,) class DP2(IterDataPipe[T_co]): def __iter__(self) -> Iterator[T_co]: @@ -1723,7 +1969,7 
@@ def __iter__(self) -> Iterator[T_co]: self.assertTrue(issubclass(DP2, IterDataPipe)) dp2 = DP2() # type: ignore[var-annotated] - self.assertTrue(DP2.type.issubtype(dp2.type) and dp2.type.issubtype(DP2.type)) + self.assertTrue(DP2.type.issubtype(dp2.type) and dp2.type.issubtype(DP2.type)) # type: ignore[attr-defined] dp2_ = DP2() # type: ignore[var-annotated] self.assertEqual(dp2.type, dp2_.type) @@ -1739,7 +1985,7 @@ def __iter__(self) -> Iterator[Tuple[T_co, str]]: self.assertTrue(issubclass(DP3, IterDataPipe)) dp3 = DP3(range(10)) # type: ignore[var-annotated] - self.assertTrue(DP3.type.issubtype(dp3.type) and dp3.type.issubtype(DP3.type)) + self.assertTrue(DP3.type.issubtype(dp3.type) and dp3.type.issubtype(DP3.type)) # type: ignore[attr-defined] dp3_ = DP3(5) # type: ignore[var-annotated] self.assertEqual(dp3.type, dp3_.type) @@ -1761,7 +2007,7 @@ def __iter__(self) -> Iterator[str]: self.assertTrue(issubclass(DP5, IterDataPipe)) dp5 = DP5() - from torch.utils.data._typing import issubtype + from torch.utils.data.datapipes._typing import issubtype self.assertTrue(issubtype(dp5.type.param, Any) and issubtype(Any, dp5.type.param)) class DP6(IterDataPipe[int]): @@ -1778,13 +2024,13 @@ class DP7(IterDataPipe[Awaitable[T_co]]): r""" DataPipe with abstract base class""" self.assertTrue(issubclass(DP7, IterDataPipe)) - self.assertTrue(DP7.type.param == Awaitable[T_co]) + self.assertTrue(DP7.type.param == Awaitable[T_co]) # type: ignore[attr-defined] class DP8(DP7[str]): r""" DataPipe subclass from a DataPipe with abc type""" self.assertTrue(issubclass(DP8, IterDataPipe)) - self.assertTrue(DP8.type.param == Awaitable[str]) + self.assertTrue(DP8.type.param == Awaitable[str]) # type: ignore[attr-defined] @skipTyping def test_construct_time(self): @@ -1918,10 +2164,171 @@ def test_traverse_forked(self): dp2: {dp2.main_datapipe: {dp2.main_datapipe.main_datapipe: {}}}}} self.assertEqual(expected, graph) + def test_traverse_mapdatapipe(self): + source_dp = dp.map.SequenceWrapper(range(10)) + map_dp = source_dp.map(partial(_fake_add, 1)) + graph = torch.utils.data.graph.traverse(map_dp) + expected: Dict[Any, Any] = {map_dp: {source_dp: {}}} + self.assertEqual(expected, graph) + + def test_traverse_mixdatapipe(self): + source_map_dp = dp.map.SequenceWrapper(range(10)) + iter_dp = dp.iter.IterableWrapper(source_map_dp) + graph = torch.utils.data.graph.traverse(iter_dp) + expected: Dict[Any, Any] = {iter_dp: {source_map_dp: {}}} + self.assertEqual(expected, graph) + + +def unbatch(x): + return x[0] + + +class TestSerialization(TestCase): + @skipIfNoDill + def test_spawn_lambdas_iter(self): + idp = dp.iter.IterableWrapper(range(3)).map(lambda x: x + 1) + dl = DataLoader(idp, num_workers=2, shuffle=True, + multiprocessing_context='spawn', collate_fn=unbatch, batch_size=1) + result = list(dl) + self.assertEquals([1, 1, 2, 2, 3, 3], sorted(result)) + + @skipIfNoDill + def test_spawn_lambdas_map(self): + mdp = dp.map.SequenceWrapper(range(6)).map(lambda x: x + 1) + dl = DataLoader(mdp, num_workers=2, shuffle=True, + multiprocessing_context='spawn', collate_fn=unbatch, batch_size=1) + result = list(dl) + self.assertEquals([1, 2, 3, 4, 5, 6], sorted(result)) + + +class TestCircularSerialization(TestCase): + class CustomIterDataPipe(IterDataPipe): + + @staticmethod + def add_one(x): + return x + 1 + + @classmethod + def classify(cls, x): + return 0 + + def add_v(self, x): + return x + self.v + + def __init__(self, fn, source_dp=None): + self.fn = fn + self.source_dp = source_dp if source_dp else 
dp.iter.IterableWrapper([1, 2, 4]) + self._dp = self.source_dp.map(self.add_one).map(self.add_v).demux(2, self.classify)[0] + self.v = 1 + + def __iter__(self): + yield from self._dp + + def test_circular_serialization_with_pickle(self): + from torch.utils.data.datapipes.iter.combining import _ChildDataPipe, _DemultiplexerIterDataPipe + + def _get_name(datapipe): + return datapipe.__name__ + + # Test for circular reference issue with pickle + source_dp = TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn) + self.assertTrue(list(source_dp) == + list(pickle.loads(pickle.dumps(TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn))))) + res1 = traverse(source_dp, only_datapipe=True) + res2 = traverse(source_dp, only_datapipe=False) + expected_str1 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}}}}}} + ).replace("'", "") + expected_str2 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}, + _get_name(dp.iter.IterableWrapper): {}}}}}} + ).replace("'", "") + # For simplicity, compare the resulting string instead of trying to recreate the object + self.assertEqual(expected_str1, str(res1)) + self.assertEqual(expected_str2, str(res2)) + + dp1 = TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn) + dp2 = TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn, source_dp=dp1) + self.assertTrue(list(dp2) == list(pickle.loads(pickle.dumps(dp2)))) + res3 = traverse(dp2, only_datapipe=True) + res4 = traverse(dp2, only_datapipe=False) + self.assertTrue(str(dp2) in str(res3)) # Quick check to ensure the result isn't blank + self.assertTrue(str(dp2) in str(res4)) + + class LambdaIterDataPipe(CustomIterDataPipe): + + def __init__(self, fn, source_dp=None): + super().__init__(fn, source_dp) + self.container = [lambda x: x + 1, ] + self.lambda_fn = lambda x: x + 1 + self._dp = self.source_dp.map(self.add_one).map(self.lambda_fn).map(self.add_v).demux(2, self.classify)[0] + + @skipIfNoDill + def test_circular_serialization_with_dill(self): + from torch.utils.data.datapipes.iter.combining import _ChildDataPipe, _DemultiplexerIterDataPipe + + def _get_name(datapipe): + return datapipe.__name__ + + # Test for circular reference issue with dill + self.assertTrue(list(TestCircularSerialization.LambdaIterDataPipe(lambda x: x + 1)) == + list(dill.loads(dill.dumps(TestCircularSerialization.LambdaIterDataPipe(lambda x: x + 1))))) + source_dp = TestCircularSerialization.LambdaIterDataPipe(fn=_fake_fn) + res1 = traverse(source_dp, only_datapipe=True) + res2 = traverse(source_dp, only_datapipe=False) + expected_str1 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}}}}}}} + ).replace("'", "") + expected_str2 = str({source_dp: + {_get_name(dp.iter.IterableWrapper): {}, + _get_name(_ChildDataPipe): + {_get_name(_DemultiplexerIterDataPipe): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.Mapper): + {_get_name(dp.iter.IterableWrapper): {}}}, + _get_name(dp.iter.IterableWrapper): {}}}}}} + ).replace("'", 
"") + # For simplicity, compare the resulting string instead of trying to recreate the object + self.assertEqual(expected_str1, str(res1)) + self.assertEqual(expected_str2, str(res2)) + + dp1 = TestCircularSerialization.LambdaIterDataPipe(fn=_fake_fn) + dp2 = TestCircularSerialization.LambdaIterDataPipe(fn=_fake_fn, source_dp=dp1) + self.assertTrue(list(dp2) == list(dill.loads(dill.dumps(dp2)))) + res3 = traverse(dp2, only_datapipe=True) + res4 = traverse(dp2, only_datapipe=False) + self.assertTrue(str(dp2) in str(res3)) # Quick check to ensure the result isn't blank + self.assertTrue(str(dp2) in str(res4)) + class TestSharding(TestCase): def _get_pipeline(self): + numbers_dp = NumbersDataset(size=10) + dp0, dp1 = numbers_dp.fork(num_instances=2) + dp0_upd = dp0.map(_mul_10) + dp1_upd = dp1.filter(_mod_3_test) + combined_dp = dp0_upd.mux(dp1_upd) + return combined_dp + + def _get_dill_pipeline(self): numbers_dp = NumbersDataset(size=10) dp0, dp1 = numbers_dp.fork(num_instances=2) dp0_upd = dp0.map(lambda x: x * 10) @@ -1929,20 +2336,18 @@ def _get_pipeline(self): combined_dp = dp0_upd.mux(dp1_upd) return combined_dp - @skipIfNoDill def test_simple_sharding(self): sharded_dp = self._get_pipeline().sharding_filter() torch.utils.data.graph_settings.apply_sharding(sharded_dp, 3, 1) items = list(sharded_dp) - self.assertEqual([1, 20, 40, 70], items) + self.assertEqual([1, 20], items) - all_items = list(self._get_pipeline()) + all_items = [0, 1, 10, 4, 20, 7] items = [] for i in range(3): sharded_dp = self._get_pipeline().sharding_filter() torch.utils.data.graph_settings.apply_sharding(sharded_dp, 3, i) items += list(sharded_dp) - self.assertEqual(sorted(all_items), sorted(items)) def test_sharding_length(self): @@ -1966,7 +2371,6 @@ def test_sharding_length(self): self.assertEqual(1, len(sharded_dp0)) self.assertEqual(0, len(sharded_dp1)) - @skipIfNoDill def test_old_dataloader(self): dp0 = self._get_pipeline() expected = list(dp0) @@ -1981,5 +2385,229 @@ def test_old_dataloader(self): self.assertEqual(sorted(expected), sorted(items)) +class TestIterDataPipeSingletonConstraint(TestCase): + + r""" + Each `IterDataPipe` can only have one active iterator. Whenever a new iterator is created, older + iterators are invalidated. These tests aim to ensure `IterDataPipe` follows this behavior. + """ + + def _check_single_iterator_invalidation_logic(self, source_dp: IterDataPipe): + r""" + Given a IterDataPipe, verifies that the iterator can be read, reset, and the creation of + a second iterator invalidates the first one. + """ + it1 = iter(source_dp) + self.assertEqual(list(range(10)), list(it1)) + it1 = iter(source_dp) + self.assertEqual(list(range(10)), list(it1)) # A fresh iterator can be read in full again + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + it2 = iter(source_dp) # This should invalidate `it1` + self.assertEqual(0, next(it2)) # Should read from the beginning again + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + + + def test_iterdatapipe_singleton_generator(self): + r""" + Testing for the case where IterDataPipe's `__iter__` is a generator function. 
+ """ + + # Functional Test: Check if invalidation logic is correct + source_dp: IterDataPipe = dp.iter.IterableWrapper(range(10)) + self._check_single_iterator_invalidation_logic(source_dp) + + # Functional Test: extend the test to a pipeline + dps = source_dp.map(_fake_fn).filter(_fake_filter_fn) + self._check_single_iterator_invalidation_logic(dps) + + # Functional Test: multiple simultaneous references to the same DataPipe fails + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + for _ in zip(source_dp, source_dp): + pass + + # Function Test: sequential references work + for _ in zip(list(source_dp), list(source_dp)): + pass + + def test_iterdatapipe_singleton_self_next(self): + r""" + Testing for the case where IterDataPipe's `__iter__` returns `self` and there is a `__next__` method + Note that the following DataPipe by is singleton by default (because `__iter__` returns `self`). + """ + class _CustomIterDP_Self(IterDataPipe): + def __init__(self, iterable): + self.source = iterable + self.iterable = iter(iterable) + + def __iter__(self): + self.reset() + return self + + def __next__(self): + return next(self.iterable) + + def reset(self): + self.iterable = iter(self.source) + + # Functional Test: Check that every `__iter__` call returns the same object + source_dp = _CustomIterDP_Self(range(10)) + res = list(source_dp) + it = iter(source_dp) + self.assertEqual(res, list(it)) + + # Functional Test: Check if invalidation logic is correct + source_dp = _CustomIterDP_Self(range(10)) + self._check_single_iterator_invalidation_logic(source_dp) + self.assertEqual(1, next(source_dp)) # `source_dp` is still valid and can be read + + # Functional Test: extend the test to a pipeline + source_dp = _CustomIterDP_Self(dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn)) + self._check_single_iterator_invalidation_logic(source_dp) + self.assertEqual(1, next(source_dp)) # `source_dp` is still valid and can be read + + # Functional Test: multiple simultaneous references to the same DataPipe fails + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + for _ in zip(source_dp, source_dp): + pass + + def test_iterdatapipe_singleton_new_object(self): + r""" + Testing for the case where IterDataPipe's `__iter__` isn't a generator nor returns `self`, + and there isn't a `__next__` method. 
+ """ + class _CustomIterDP(IterDataPipe): + def __init__(self, iterable): + self.iterable = iter(iterable) + + def __iter__(self): # Note that this doesn't reset + return self.iterable # Intentionally not returning `self` + + # Functional Test: Check if invalidation logic is correct + source_dp = _CustomIterDP(range(10)) + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + it2 = iter(source_dp) + self.assertEqual(1, next(it2)) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + + # Functional Test: extend the test to a pipeline + source_dp = _CustomIterDP(dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn)) + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + it2 = iter(source_dp) + self.assertEqual(1, next(it2)) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + + # Functional Test: multiple simultaneous references to the same DataPipe fails + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + for _ in zip(source_dp, source_dp): + pass + + def test_iterdatapipe_singleton_buggy(self): + r""" + Buggy test case case where IterDataPipe's `__iter__` returns a new object, but also has + a `__next__` method. + """ + class _CustomIterDP(IterDataPipe): + def __init__(self, iterable): + self.source = iterable + self.iterable = iter(iterable) + + def __iter__(self): + return iter(self.source) # Intentionally not returning `self` + + def __next__(self): + return next(self.iterable) + + # Functional Test: Check if invalidation logic is correct + source_dp = _CustomIterDP(range(10)) + self._check_single_iterator_invalidation_logic(source_dp) + self.assertEqual(0, next(source_dp)) # `__next__` is unrelated with `__iter__` + + # Functional Test: Special case to show `__next__` is unrelated with `__iter__` + source_dp = _CustomIterDP(range(10)) + self.assertEqual(0, next(source_dp)) + it1 = iter(source_dp) + self.assertEqual(0, next(it1)) + self.assertEqual(1, next(source_dp)) + it2 = iter(source_dp) # invalidates both `it1` + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + self.assertEqual(2, next(source_dp)) # not impacted by the creation of `it2` + self.assertEqual(list(range(10)), list(it2)) # `it2` still works because it is a new object + + def test_iterdatapipe_singleton_constraint_multiple_outputs(self): + r""" + Testing for the case where IterDataPipe has multiple child DataPipes as outputs. 
+ """ + # Functional Test: all previous related iterators should be invalidated when a new iterator + # is created from a ChildDataPipe + source_dp: IterDataPipe = dp.iter.IterableWrapper(range(10)) + cdp1, cdp2 = source_dp.fork(num_instances=2) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(list(range(10)), list(it1)) + self.assertEqual(list(range(10)), list(it2)) + it1, it2 = iter(cdp1), iter(cdp2) + with warnings.catch_warnings(record=True) as wa: + it3 = iter(cdp1) # This should invalidate `it1` and `it2` + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it2) + self.assertEqual(0, next(it3)) + # The next line should not invalidate anything, as there was no new iterator created + # for `cdp2` after `it2` was invalidated + it4 = iter(cdp2) + self.assertEqual(1, next(it3)) # An error shouldn't be raised here + self.assertEqual(list(range(10)), list(it4)) + + # Functional Test: invalidation when a new iterator is created from `source_dp` + source_dp = dp.iter.IterableWrapper(range(10)) + cdp1, cdp2 = source_dp.fork(num_instances=2) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(list(range(10)), list(it1)) + self.assertEqual(list(range(10)), list(it2)) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(0, next(it1)) + self.assertEqual(0, next(it2)) + it3 = iter(source_dp) # note that a new iterator is created from `source_dp` + self.assertEqual(0, next(it3)) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + self.assertEqual(1, next(it3)) + + # Function Test: Extending test to pipeline + source_dp = dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn) + cdp1, cdp2 = source_dp.fork(num_instances=2) + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(list(range(10)), list(it1)) + self.assertEqual(list(range(10)), list(it2)) + it1, it2 = iter(cdp1), iter(cdp2) + with warnings.catch_warnings(record=True) as wa: + it3 = iter(cdp1) # This should invalidate `it1` and `it2` + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it2) + with warnings.catch_warnings(record=True) as wa: + it1, it2 = iter(cdp1), iter(cdp2) + self.assertEqual(len(wa), 1) + self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + self.assertEqual(0, next(it1)) + self.assertEqual(0, next(it2)) + it3 = iter(source_dp) # note that a new iterator is created from `source_dp` + self.assertEqual(0, next(it3)) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` + with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): + next(it1) + self.assertEqual(1, next(it3)) + if __name__ == '__main__': run_tests() diff --git a/test/test_decomp.py b/test/test_decomp.py new file mode 100644 index 000000000000..100859713a26 --- /dev/null +++ b/test/test_decomp.py @@ -0,0 +1,503 @@ +# Owner(s): ["module: primTorch"] + +from collections import defaultdict +from torch import Tensor +import torch.autograd +from torch.utils._python_dispatch import 
enable_torch_dispatch_mode +from torch._decomp import decomposition_table + +from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten +from torch.testing._internal.logging_tensor import no_dispatch +from torch.testing._internal.common_utils import ( + is_iterable_of_tensors, + TestCase, + skipIfCrossRef, + suppress_warnings, + TEST_WITH_ASAN, + run_tests, +) +from torch.testing._internal.common_device_type import ( + onlyNativeDeviceTypes, + ops, + instantiate_device_type_tests, +) +from torch.testing._internal.common_methods_invocations import op_db + +import itertools +import functools +from functools import partial +import unittest + +aten = torch.ops.aten + + +# TODO: this isn't going to work with non-aten namespaces +def overload_to_aten_name(overload): + return overload._schema.name.split("::")[1] + + +# All operators that can have decomp tests +decomposition_names = {overload_to_aten_name(k) for k in decomposition_table} +_decomp_test_ops = [ + op + for op in op_db + if op.aten_name in decomposition_names + or op.aten_backward_name in decomposition_names +] + + +def diff_arg(arg, requires_grad=True): + def is_differentiable_arg(arg): + if requires_grad: + return arg.requires_grad + else: + return arg.is_floating_point() or arg.is_complex() + + if is_iterable_of_tensors(arg): + if all([is_differentiable_arg(a) for a in arg]): + return True + if all([not is_differentiable_arg(a) for a in arg]): + return False + raise RuntimeError("NYI: The test runner can't handle this") + return isinstance(arg, Tensor) and is_differentiable_arg(arg) + + +# Version of autograd.grad with some differences: +# - pytree inputs is allowed (but leaves of the pytree have to all +# be tensors) +# - if an input is not used as part of derivatives, we will return a +# zero-filled tensor for the result +def _autograd_grad( + outputs, inputs, grad_outputs=None, retain_graph=False, create_graph=True +): + inputs, inputs_spec = tree_flatten(inputs) + diff_inputs = tuple(inp for inp in inputs if inp.requires_grad) + if grad_outputs is None: + diff_outputs = tuple(out for out in outputs if out.requires_grad) + else: + diff_grad_outputs = [ + (out, go) for out, go in zip(outputs, grad_outputs) if out.requires_grad + ] + if len(diff_grad_outputs) == 0: + diff_outputs, grad_outputs = (), () + else: + diff_outputs, grad_outputs = zip(*diff_grad_outputs) + grad_inputs = torch.autograd.grad( + diff_outputs, + diff_inputs, + grad_outputs, + retain_graph=retain_graph, + create_graph=create_graph, + allow_unused=True, + ) + result = [] + grad_inputs_iter = iter(grad_inputs) + for inp in inputs: + if inp.requires_grad: + grad_input = next(grad_inputs_iter) + if grad_input is None: + result.append(torch.zeros_like(inp)) + else: + result.append(grad_input) + else: + result.append(torch.zeros_like(inp)) + return tree_unflatten(result, inputs_spec) + + +def _as_tuple(val): + if isinstance(val, tuple): + return val + return (val,) + + +def ref_vjp_no_create(f, *primals): + result = f(*primals) + + def wrapped(cotangents): + return _autograd_grad( + _as_tuple(result), primals, _as_tuple(cotangents), create_graph=False + ) + + return result, wrapped + + +dtype_precisions = { + torch.float16: (0.001, 1e-5), + torch.bfloat16: (0.016, 1e-4), + torch.float32: (1.3e-6, 1e-5), + torch.float64: (1e-7, 1e-7), + torch.complex32: (0.001, 1e-5), + torch.complex64: (1.3e-6, 1e-5), + torch.complex128: (1e-7, 1e-7), +} +# Returns the "default" rtol and atol for comparing scalars or +# tensors of the given dtypes. 
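# For instance, with the table above, comparing a float16 decomposition output
# against a float32 reference uses the looser of the two dtypes' tolerances:
#     _getDefaultRtolAndAtol(torch.float16, torch.float32)  # -> (0.001, 1e-05), i.e. rtol=1e-3, atol=1e-5
# (an illustrative reading of the helper defined below, not an extra guarantee).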
+ + +def _getDefaultRtolAndAtol(dtype0, dtype1): + rtol = max( + dtype_precisions.get(dtype0, (0, 0))[0], dtype_precisions.get(dtype1, (0, 0))[0] + ) + atol = max( + dtype_precisions.get(dtype0, (0, 0))[1], dtype_precisions.get(dtype1, (0, 0))[1] + ) + return rtol, atol + + +def op_assert_ref(test_case, op, orig, decomp, ref, args, kwargs): + assert orig.dtype == decomp.dtype, f"Operation: {op}" + if orig.numel() == 0 or decomp.numel() == 0: + assert orig.numel() == decomp.numel() + return + if ref.is_floating_point(): + orig_diff = (orig - ref).abs().max() + decomp_diff = (decomp - ref).abs().max() + atol = 1e-10 + if decomp_diff > orig_diff + atol: + raise RuntimeError( + f"Difference from float64 is larger with decomposition {op.__name__}" + f" than original. Original max diff: {orig_diff}, Decomp max diff: {decomp_diff}\n" + f"args = {args}\n" + f"kwargs = {kwargs}" + ) + else: + test_case.assertEqual( + orig, decomp, msg=f"{op.__name__}\nargs = {args}\nkwargs = {kwargs}" + ) + + +def op_assert_equal(test_case, op, orig, decomp, args, kwargs): + test_case.assertEqual( + orig.dtype, decomp.dtype, f"Operation: {op}, orig.dtype: {orig.dtype}, decomp.dtype: {decomp.dtype}, {args}, {kwargs}") + # Before adding an entry to this table, make sure your decomposition is right :) + tol_table = { + # Due to strange epsilon behaviors, see https://github.com/pytorch/pytorch/issues/73161 + (torch.float32, torch.ops.aten.native_layer_norm.default): (1e-3, 1e-3), + (torch.float32, torch.ops.aten.native_layer_norm_backward.default): ( + 1e-3, + 1e-3, + ), + } + if (decomp.dtype, op) in tol_table: + rtol, atol = tol_table[(decomp.dtype, op)] + else: + rtol, atol = _getDefaultRtolAndAtol(orig.dtype, decomp.dtype) + + test_case.assertEqual(orig, decomp, rtol=rtol, atol=atol, msg=f"{op.__name__}\nargs = {args}\nkwargs = {kwargs}") + + +# Given f, returns an f' such that: +# - f' takes only positional arguments +# - All arguments to f' are floating-point Tensors +# - All outputs of f' are floating-point Tensors +def normalize_op_input_output2( + f, args, kwargs, output_process_fn_grad=None, requires_grad=True +): + flat_args, args_spec = tree_flatten(args) + diff_argnums = tuple( + i + for i, arg in enumerate(flat_args) + if diff_arg(arg, requires_grad=requires_grad) + ) + assert len(diff_argnums) > 0 + primals = tuple(flat_args[i] for i in diff_argnums) + + @functools.wraps(f) + def wrapped(*primals): + _args = list(flat_args) + for num, arg in zip(diff_argnums, primals): + _args[num] = arg + _args = tree_unflatten(_args, args_spec) + result = f(*_args, **kwargs) + if output_process_fn_grad is not None: + result = output_process_fn_grad(result) + if isinstance(result, tuple): + # TODO: Remove the following hack for namedtuples + result = tuple(result) + result = tuple( + r + for r in result + if isinstance(r, Tensor) and (r.is_floating_point() or r.is_complex()) + ) + assert len(result) > 0 + return result + + return wrapped, primals + + +# NB: This also upcasts dtype arguments + + +def upcast_tensor(func, x, dtype=torch.float32): + # Some functions take a dtype as argument, so we need to + # manually change that dtype in order to run it with a + # higher precision + dtype_arg_table = { + torch.ops.aten._softmax_backward_data.default, + torch.ops.aten._log_softmax_backward_data.default, + } + + if isinstance(x, Tensor) and x.dtype.is_floating_point: + return x.to(dtype=dtype) + elif ( + isinstance(x, torch.dtype) + and func in dtype_arg_table + and x in [torch.float16, torch.bfloat16] + ): + return 
torch.float64 + else: + return x + + +def normalize_op_input_output(f, sample, requires_grad=True): + args = tuple([sample.input] + list(sample.args)) + return normalize_op_input_output2( + f, + args, + sample.kwargs, + sample.output_process_fn_grad, + requires_grad=requires_grad, + ) + + +CROSS_REF_EXCLUDE_SET = { + # CUBLAS_STATUS_NOT_SUPPORTED when calling + # `cublasGemmStridedBatchedExFix(handle, opa, opb, (int)m, (int)n, (int)k, + # (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, b, CUDA_R_16BF, + # (int)ldb, strideb, (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, + # (int)num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)` + ("cuda", torch.bfloat16, "nn.functional.bilinear"), + # randomness + ("cuda", torch.float16, "nn.functional.dropout"), + ("cuda", torch.bfloat16, "nn.functional.dropout"), + ("cuda", torch.float64, "nn.functional.dropout"), + ("cuda", torch.float32, "nn.functional.dropout"), + # decomp has problem even with opmath + ("cuda", torch.bfloat16, "nn.functional.layer_norm"), + ("cuda", torch.float16, "nn.functional.layer_norm"), + ("cuda", torch.bfloat16, "nn.functional.batch_norm"), + ("cuda", torch.float16, "nn.functional.batch_norm"), + ("cuda", torch.bfloat16, "nn.functional.instance_norm"), + ("cuda", torch.float16, "nn.functional.instance_norm"), + # doesn't work + ("cuda", torch.bfloat16, "nn.functional.embedding"), + +} + +all_decomposed = set() +all_called = defaultdict(int) + +# Helpful snippet for testing coverage +""" +import atexit +def check_coverage(): + print("missing coverage:") + print("\n".join(map(str, decomposition_table.keys() - all_decomposed))) +atexit.register(check_coverage) +""" + +# Helpful snippet for Horace to create his google sheet :) +""" +import atexit +def dump_ops(): + with open('run_ops.txt', 'w') as f, open('count_ops.txt', 'w') as g: + for op, count in sorted(all_called.items(), key=lambda x: x[0].__name__): + f.write(f'{op.__name__}\n') + g.write(f'{count}\n') + with open('run_decompositions.txt', 'w') as f: + for op in sorted([i.__name__ for i in all_decomposed]): + f.write(f'{op}\n') + +atexit.register(dump_ops) +""" + + +def any_unsupported(args, kwargs): + def test_unsupported(t): + if type(t) is torch.Tensor or type(t) is torch.nn.Parameter: + # These are all things that we haven't coded decompositions + # to handle correctly. Maybe they should. 
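                # For example, a tensor produced by torch.quantize_per_tensor(...)
                # would be flagged below via `t.is_quantized`, so the decomposition
                # cross-ref check is bypassed for that call (illustrative note only).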
+ return any([ + t.is_sparse_csr, t.is_sparse, t.is_mkldnn, t.is_quantized, + t.is_nested, torch._is_functional_tensor(t), + ]) + elif torch.overrides.is_tensor_like(t): + # Decompositions will generally change the behavior of Tensor-like + # subclasses, so bypass tests in this case too + return True + else: + return False + + flat_args, _ = tree_flatten(args) + flat_kwargs, _ = tree_flatten(kwargs) + return any(test_unsupported(x) for x in itertools.chain(flat_args, flat_kwargs)) + + +class TestDecomp(TestCase): + longMessage = True + + # NB: This actually overlaps with test_comprehensive, but it only + # runs on things that are definitely decomposed so it's a lot faster + # to run + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @skipIfCrossRef + @suppress_warnings + @ops(_decomp_test_ops) + def test_quick(self, device, dtype, op): + self.do_cross_ref(device, dtype, op, run_all=False) + + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @skipIfCrossRef + @suppress_warnings + @ops(op_db) + def test_comprehensive(self, device, dtype, op): + self.do_cross_ref(device, dtype, op, run_all=True) + + def do_cross_ref(self, device, dtype, op, *, run_all): + if (torch.device(device).type, dtype, op.name) in CROSS_REF_EXCLUDE_SET or ( + None, + dtype, + op.name, + ) in CROSS_REF_EXCLUDE_SET: + self.skipTest(f"{op.name} in {dtype} not supported") + + test_dtype = dtype + + # We check the correctness of each decomposition right after running it. + # So, when we encounter a decomposition, we run the function normally, and + # then run the decomposition, and ensure they're identical. + called = set() + decomposed = set() + + saved_precision = self.precision + saved_rel_tol = self.rel_tol + + class DecompCrossRefMode(torch.Tensor): + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + with no_dispatch(): + return cls._torch_dispatch(func, types, args, kwargs) + + @classmethod + def _torch_dispatch(cls, func, types, args=(), kwargs=None): + self.precision = saved_precision + self.rel_tol = saved_rel_tol + + called.add(func) + all_called[func] += 1 + + # Stuff we shouldn't bother testing + # (TODO: remove detach from the decomp table?) + if func not in decomposition_table or func in [ + torch.ops.aten.detach.default + ] or any_unsupported(args, kwargs): + return func(*args, **kwargs) + + decomposed.add(func) + all_decomposed.add(func) + + # We take 2 main strategies for verifying correctness/numerical stability of decompositions + # The first one is simply tolerance checking between decomp_out and pytorch_out + # However, for fp16/bf16 and reductions, this becomes very + # finicky, as there are not many guarantees we can make. + # So, for fp16/bf16, we instead compare the difference of + # {decomp_out, pytorch_out_64} and {pytorch_out, + # pytorch_out_64}. In other words, we compare how far the + # decomposition and pytorch are from the "ground truth" (i.e. + # fp64). 
If the decomposition results in more error, we error + + decomposition = decomposition_table[func] + + do_relative_check = test_dtype in [torch.float16, torch.bfloat16] + real_out_unflat = func(*args, **kwargs) + real_out, _ = tree_flatten(real_out_unflat) + decomp_out, _ = tree_flatten(decomposition(*args, **kwargs)) + assert len(real_out) == len(decomp_out) + + if do_relative_check: + upcast = partial(upcast_tensor, func, dtype=torch.float64) + real_out_double, _ = tree_flatten( + func(*tree_map(upcast, args), **tree_map(upcast, kwargs)) + ) + for orig, decomp, ref in zip(real_out, decomp_out, real_out_double): + if orig is None: + assert decomp is None + continue + op_assert_ref(self, func, orig, decomp, ref, args, kwargs) + else: + for orig, decomp in zip(real_out, decomp_out): + if orig is None: + assert decomp is None + continue + op_assert_equal(self, func, orig, decomp, args, kwargs) + + return real_out_unflat + + requires_grad = ( + op.supports_autograd + and dtype in op.supported_backward_dtypes(torch.device(device).type) + # TODO: OpInfo really ought to error out for this case, but it's + # not exercised in test_ops_gradients atm. The problem is not + # complex32 per-se (which is supported by data movement only ops) + # but that when we do backwards we expect other ops like add to work + and not dtype == torch.complex32 + ) + samples = op.sample_inputs(device, test_dtype, requires_grad=requires_grad) + + def check_decomposed(aten_name): + self.assertTrue( + any(overload_to_aten_name(c) == aten_name for c in decomposed), + msg=f"aten.{aten_name} was not decomposed, saw calls for: " + + ", ".join(map(str, list(called))), + ) + + aten_name = op.decomp_aten_name or op.aten_name + + func = op.get_op() + for sample_input in samples: + if requires_grad: + fn, primals = normalize_op_input_output(func, sample_input) + primals = tree_map( + lambda x: x if isinstance(x, torch.Tensor) else x, primals + ) + + # Once https://github.com/pytorch/pytorch/pull/75965/ I can + # store the called list on the mode object instance and no + # explicit clearing is necessary as I will create a fresh mode + # for each region + decomposed.clear() + with enable_torch_dispatch_mode(DecompCrossRefMode): + decomp_out, decomp_vjp_fn = ref_vjp_no_create(fn, *primals) + if aten_name in decomposition_names: + check_decomposed(aten_name) + + if op.aten_backward_name in decomposition_names or run_all: + cotangents = tree_map(lambda x: torch.randn_like(x), decomp_out) + + decomposed.clear() + with enable_torch_dispatch_mode(DecompCrossRefMode): + decomp_vjp_fn(cotangents) + if not run_all: + check_decomposed(op.aten_backward_name) + + elif aten_name in decomposition_names or run_all: + args = [sample_input.input] + list(sample_input.args) + kwargs = sample_input.kwargs + decomposed.clear() + with enable_torch_dispatch_mode(DecompCrossRefMode): + func(*args, **kwargs) + if not run_all: + check_decomposed(aten_name) + else: + assert op.supports_autograd + self.skipTest( + "only backwards is decomposed, but dtype doesn't support AD" + ) + + +instantiate_device_type_tests(TestDecomp, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_dispatch.py b/test/test_dispatch.py index 37a6054f9151..bf609cf50b3e 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -532,8 +532,8 @@ def test_computed_table_with_ambiguous_autogradother(self): lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCompositeImplicitAutograd, [](const Tensor & x) { return x }) lambda m: 
m.impl_t_t("foo", "CompositeImplicitAutograd", debug="fn_math"), - # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), + # m.impl("foo", torch::kFPGA, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "FPGA", debug="fn_fpga"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -541,12 +541,12 @@ def test_computed_table_with_ambiguous_autogradother(self): schema: test::foo(Tensor x) -> (Tensor) debug: registered at /dev/null:0 alias analysis kind: FROM_SCHEMA -QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +FPGA: fn_fpga :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] CompositeImplicitAutograd[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. - extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_math [math kernel] @@ -557,7 +557,7 @@ def test_computed_table_with_ambiguous_autogradother(self): AutogradCPU: fn_math [math kernel] AutogradCUDA: fn_math [math kernel] AutogradXLA: fn_math [math kernel] -QuantizedCPU: fn_quantizedcpu [kernel] +FPGA: fn_fpga [kernel] ''') def test_computed_table_with_cpu_defaultbackend(self): @@ -616,7 +616,7 @@ def test_computed_table_with_cpu_autograd_defaultbackend(self): ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. - extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_defaultbackend [default backend kernel] @@ -627,7 +627,7 @@ def test_computed_table_with_cpu_autograd_defaultbackend(self): AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] -QuantizedCPU: fn_defaultbackend [default backend kernel] +FPGA: fn_defaultbackend [default backend kernel] ''') def test_computed_table_with_cpu_autograd_math_defaultbackend(self): @@ -808,7 +808,7 @@ def test_basic(self): CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeImplicitAutograd [math kernel] +FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fallthrough [backend fallback] AutogradXLA fallthrough [backend fallback] @@ -829,7 +829,7 @@ def test_math_autogradcpu(self): CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeImplicitAutograd [math kernel] +FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -864,7 +864,7 @@ def test_defaultbackend_autogradcpu(self): CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeExplicitAutograd [default backend kernel] +FPGA fn_CompositeExplicitAutograd [default backend kernel] AutogradOther fallthrough [backend fallback] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -889,7 +889,7 @@ def 
test_defaultbackend_autogradcpu(self): def test_autogradother(self): dispatcher = PythonDispatcher() - dispatcher.register(["CPU", "QuantizedCPU", "CompositeImplicitAutograd"]) + dispatcher.register(["CPU", "FPGA", "CompositeImplicitAutograd"]) self.assertExpectedInline( dispatcher.dispatchTable(), '''\ @@ -900,7 +900,7 @@ def test_autogradother(self): CPU fn_CPU [kernel] XLA fn_CompositeImplicitAutograd [math kernel] Lazy fn_CompositeImplicitAutograd [math kernel] -QuantizedCPU fn_QuantizedCPU [kernel] +FPGA fn_FPGA [kernel] AutogradOther ambiguous_autogradother [ambiguous autogradother] AutogradCPU fallthrough [backend fallback] AutogradXLA fn_CompositeImplicitAutograd [math kernel] @@ -915,8 +915,8 @@ def test_autogradother(self): Registered Kernels key kernel --------------------------- +FPGA fn_FPGA CPU fn_CPU -QuantizedCPU fn_QuantizedCPU CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd ''' ) @@ -935,5 +935,20 @@ def test_defaultbackend_math(self): r"Registration to both CompositeImplicitAutograd and CompositeExplicitAutograd is not allowed"): dispatcher.register(["CompositeExplicitAutograd", "CompositeImplicitAutograd"]) + def test_quantized_structured_not_implemented(self): + x = torch.zeros([1, 1, 1]) + y = torch.zeros([1, 1, 1]) + scale, zero_point = 1.0, 0 + dtype = torch.qint8 + qx = torch.quantize_per_tensor(x, scale, zero_point, dtype) + qy = torch.quantize_per_tensor(y, scale, zero_point, dtype) + # If bmm gets quantized support you need to update this to something + # else that is not implemented + self.assertRaisesRegex( + NotImplementedError, + "Could not run 'aten::bmm.out' with arguments from the 'QuantizedCPU' backend.", + lambda: torch.bmm(qx, qy) + ) + if __name__ == '__main__': run_tests() diff --git a/test/test_expanded_weights.py b/test/test_expanded_weights.py new file mode 100644 index 000000000000..a1eb96019cfd --- /dev/null +++ b/test/test_expanded_weights.py @@ -0,0 +1,481 @@ +# Owner(s): ["module: nn"] + +from functools import partial +from itertools import product, chain +import unittest + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from torch.nn.utils._per_sample_grad import call_for_per_sample_grads +from torch.testing._internal.common_cuda import TEST_CUDA +from torch.testing._internal.common_device_type import OpDTypes, instantiate_device_type_tests, ops +from torch.testing._internal.common_nn import TestBase, module_tests, new_module_tests +from torch.testing._internal.common_utils import TestCase, freeze_rng_state, make_tensor, run_tests +from torch.testing._internal.common_methods_invocations import SampleInput, op_db +from torch.nn.utils._expanded_weights import ExpandedWeight +from torch.nn.utils._expanded_weights.expanded_weights_utils import forward_helper, set_grad_sample_if_exists, \ + unpack_expanded_weight_or_tensor, sum_over_all_but_batch_and_last_n, standard_kwargs + +class TestContext: + pass + +class TestExpandedWeightHelperFunction(TestCase): + def test_forward_helper(self, device): + input = torch.randn(3, 4, device=device) + weight = torch.randn(5, 4, device=device) + bias = torch.randn(5, device=device) + for (weight_batched, bias_batched) in product([True, False], [True, False]): + maybe_batched_weight = ExpandedWeight(weight.clone().requires_grad_(), 3) if weight_batched else weight + maybe_batched_bias = ExpandedWeight(bias.clone().requires_grad_(), 3) if bias_batched else bias + args = (input, maybe_batched_weight, maybe_batched_bias) + expanded_args, 
expanded_kwargs = standard_kwargs(('bias',), args) + res = forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + expected = nn.functional.linear(input, weight, bias) + self.assertEqual(res, expected) + + self.assertEqual(len(expanded_args), 2) + assert expanded_args[0] is args[0] # avoids property checks in assertEquals + assert expanded_args[1] is args[1] # avoids property checks in assertEquals + self.assertEqual(len(expanded_kwargs), 1) + assert expanded_kwargs['bias'] is args[2] # avoids property checks in assertEquals + + def test_forward_helper_failure_args(self, device): + weight = torch.randn(5, 4, device=device) + bias = torch.randn(5, device=device) + with self.assertRaisesRegex(RuntimeError, r"do not support inputs that are also ExpandedWeights."): + input = ExpandedWeight(torch.randn(3, 4, requires_grad=True), 3) + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (input, weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + with self.assertRaisesRegex(RuntimeError, r"requires a Tensor as the first input"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (3, weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + with self.assertRaisesRegex(RuntimeError, r"requires a batch dimension but got an input of size 0"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (torch.tensor(3), weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + with self.assertRaisesRegex(RuntimeError, r"0 is not a valid batch size for Expanded Weights"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (torch.randn(0, 1, 2), weight, bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + input = torch.randn(3, 4) + for (weight_batched, bias_batched) in product([True, False], [True, False]): + if not weight_batched and not bias_batched: + continue + maybe_batched_weight = ExpandedWeight(weight.clone().requires_grad_(), 4) if weight_batched else weight + maybe_batched_bias = ExpandedWeight(bias.clone().requires_grad_(), 4) if bias_batched else bias + with self.assertRaisesRegex(RuntimeError, r"Expected ExpandedWeights to have batch size matching input"): + expanded_args, expanded_kwargs = standard_kwargs(('bias',), (input, maybe_batched_weight, maybe_batched_bias)) + forward_helper(nn.functional.linear, expanded_args, expanded_kwargs) + + def test_set_grad_sample_if_exists(self, device): + def test_fn(_): + return True + + orig_weight = torch.randn(4, device=device, requires_grad=True) + expanded_weight = ExpandedWeight(orig_weight, 3) + set_grad_sample_if_exists(expanded_weight, test_fn) + self.assertTrue(hasattr(orig_weight, 'grad_sample')) + self.assertTrue(orig_weight.grad_sample) + + basic_tensor = torch.randn(4, device=device) + set_grad_sample_if_exists(basic_tensor, test_fn) + self.assertFalse(hasattr(basic_tensor, 'grad_sample')) + + non_tensor = 3 + set_grad_sample_if_exists(non_tensor, test_fn) + self.assertFalse(hasattr(non_tensor, 'grad_sample')) + + def test_set_grad_sample_if_exists_failure(self, device): + def test_fn(_): + return True + + grad_tensor = torch.randn(4, requires_grad=True, device=device) + with self.assertRaisesRegex(RuntimeError, r"does not support a mixture of ExpandedWeight parameters and normal Parameters"): + set_grad_sample_if_exists(grad_tensor, test_fn) + + def test_unpack_expanded_weight_or_tensor(self, device): + input = torch.randn(3, requires_grad=True, device=device) + 
self.assertEqual(input, unpack_expanded_weight_or_tensor(ExpandedWeight(input, 3))) + + input.requires_grad_(False) + self.assertEqual(input, unpack_expanded_weight_or_tensor(input)) + self.assertTrue(unpack_expanded_weight_or_tensor(4) is None) + + def test_unpack_expanded_weight_or_tensor_with_custom_function(self, device): + input = torch.randn(3, requires_grad=True, device=device) + self.assertTrue(unpack_expanded_weight_or_tensor(ExpandedWeight(input, 3), lambda x: x is input)) + + input.requires_grad_(False) + self.assertTrue(unpack_expanded_weight_or_tensor(input, lambda x: x is input)) + self.assertTrue(unpack_expanded_weight_or_tensor(4, lambda x: x is input) is None) + + def test_unpack_expanded_weight_or_tensor_failure(self, device): + input = torch.randn(3, requires_grad=True, device=device) + with self.assertRaisesRegex(RuntimeError, r"does not support a mixture of ExpandedWeight parameters and normal Parameters"): + unpack_expanded_weight_or_tensor(input) + + with self.assertRaisesRegex(RuntimeError, r"does not support a mixture of ExpandedWeight parameters and normal Parameters"): + unpack_expanded_weight_or_tensor(input, lambda x: x is input) + + def test_sum_over_all_but_batch_and_last_n(self, device): + input = torch.randn(1, 2, 3, 4, 5, device=device) + res = sum_over_all_but_batch_and_last_n(input, 2) + expected = input.sum((1, 2)) + self.assertEqual(res, expected) + + res = sum_over_all_but_batch_and_last_n(input, 0) + expected = input.sum((1, 2, 3, 4)) + self.assertEqual(res, expected) + + res = sum_over_all_but_batch_and_last_n(input, 4) + self.assertEqual(res, input) + +class TestExpandedWeightFunctional(TestCase): + @ops(filter(lambda op: op.supports_expanded_weight, op_db), dtypes=OpDTypes.supported, allowed_dtypes=(torch.double,)) + def test_expanded_weight_per_sample_grad(self, device, dtype, op): + sample_inputs = op.sample_inputs(device, dtype, requires_grad=True) + for sample_input in supported_inputs(op, sample_inputs): + if op.name == "nn.functional.embedding": # embedding flips its argument order for autograd tests + sample_input = SampleInput(sample_input.args[0], args=(sample_input.input,), kwargs=sample_input.kwargs) + input = sample_input.input + args = sample_input.args + kwargs = sample_input.kwargs + batch_size = input.shape[0] if len(input.shape) > 1 else 1 + + # get per sample grads with ExpandedWeights objects + (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size) + diff_input_list = (ew_input,) + tuple(ew_args) + tuple(ew_kwargs.values()) + diff_input_list = [i for i in diff_input_list if is_diff_tensor(i)] + diff_input_list = [i.orig_weight if isinstance(i, ExpandedWeight) else i for i in diff_input_list] + if not diff_input_list: + continue + result = run_op(op, ew_input, *ew_args, **ew_kwargs) + result.sum().backward() # grad doesn't work with ExpandedWeight because it calls __torch_function__ + expanded_weight_grad = tuple(i.grad_sample if hasattr(i, "grad_sample") else i.grad for i in diff_input_list) + + # get per sample grads with for loop + func = partial(run_op, op) + per_sample_grad = for_loop_per_sample_grad(batch_size, input, func, *args, **kwargs) + + # check equality + self.assertEqual(len(per_sample_grad), len(expanded_weight_grad)) + for (result_grad, expected_grad) in zip(expanded_weight_grad, per_sample_grad): + if result_grad is None: + result_grad = torch.zeros_like(expected_grad) + self.assertEqual(result_grad, expected_grad) + + @ops(filter(lambda op: op.supports_expanded_weight, op_db), 
dtypes=OpDTypes.supported, allowed_dtypes=(torch.double,)) + def test_unsupported_expand_weights(self, device, dtype, op): + sample_inputs = op.sample_inputs(device, dtype, requires_grad=True) + unsupported_inputs = supported_inputs(op, sample_inputs, supported_inputs=False) + for sample_input in unsupported_inputs: + with self.assertRaisesRegex(RuntimeError, r"Expanded Weights"): + if op.name == "nn.functional.embedding": # embedding flips its argument order for autograd tests + sample_input = SampleInput(sample_input.args[0], args=(sample_input.input,), kwargs=sample_input.kwargs) + input = sample_input.input + + batch_size = input.shape[0] if len(input.shape) > 1 else 1 + + # get per sample grads with ExpandedWeights objects + (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size) + result = run_op(op, ew_input, *ew_args, **ew_kwargs) + diff_input_list = (ew_input,) + tuple(ew_args) + tuple(ew_kwargs.values()) + diff_input_list = [i for i in diff_input_list if is_diff_tensor(i)] + diff_input_list = [i.orig_weight if isinstance(i, ExpandedWeight) else i for i in diff_input_list] + result.sum().backward() # grad doesn't work with ExpandedWeight because it calls __torch_function__ + + @ops(filter(lambda op: op.supports_expanded_weight, op_db), dtypes=OpDTypes.supported) + def test_expanded_weight_forward(self, device, dtype, op): + sample_inputs = op.sample_inputs(device, dtype) + for sample_input in supported_inputs(op, sample_inputs): + if op.name == "nn.functional.embedding": # embedding flips its argument order for autograd tests + sample_input = SampleInput(sample_input.args[0].clone(), + args=(sample_input.input.clone(),), + kwargs=sample_input.kwargs) + if "cuda" in device and "max_norm" in sample_input.kwargs and "padding_idx" in sample_input.kwargs: + self.skipTest("embedding is non-determinstic in this case, see issue #74679") + batch_size = sample_input.input.shape[0] if len(sample_input.input.shape) > 1 else 1 + (ew_input, ew_args, ew_kwargs) = make_expanded_weight(sample_input, batch_size) + expanded_weight_result = run_op(op, ew_input, *ew_args, **ew_kwargs) + normal_result = run_op(op, sample_input.input, *sample_input.args, **sample_input.kwargs) + self.assertEqual(expanded_weight_result, normal_result) + + def test_expanded_weight_error(self, device): + batch_size = 3 + sample_input = make_tensor((batch_size, 4), dtype=torch.float32, device=device, requires_grad=True) + sample_weight = make_tensor((4), dtype=torch.float32, device=device, requires_grad=True) + with self.assertRaisesRegex(RuntimeError, r"Expanded Weights encountered but cannot handle function"): + torch.add(sample_input, ExpandedWeight(sample_weight, batch_size)) + + def test_small_model(self, device): + def convnet(num_classes): + return nn.Sequential( + nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2, stride=2), + nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AvgPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.AdaptiveAvgPool2d((1, 1)), + nn.Flatten(start_dim=1, end_dim=-1), + nn.Linear(128, num_classes, bias=True), + ) + + batch_size = 32 + model = convnet(10).to(device) + input = torch.randn([batch_size, 3, 28, 28], device=device) + targets = torch.randint(0, 10, (batch_size,), device=device) + criterion = 
CrossEntropyLoss(reduction='sum') # use a loss that doesn't average across the batch to test in a for loop + result = call_for_per_sample_grads(model, batch_size, input) + loss = criterion(result, targets) + loss.backward() + result = [] + for weight in model.parameters(): + result.append(weight.grad_sample) + del weight.grad_sample + + expected = [] + for i in range(batch_size): + loss = criterion(model(input[i].unsqueeze(0)), targets[i].unsqueeze(0)) + expected.append(torch.autograd.grad(loss, model.parameters(), torch.ones_like(loss))) + + expected = [torch.stack(grad) for grad in zip(*expected)] + for (res, exp) in zip(result, expected): + self.assertEqual(res, exp, atol=1e-4, rtol=5e-5) + + def test_group_norm_error(self, device): + # group norm has to call native_group_norm. This checks that it hits the same errors + # that normal group norm would + + N = 3 + C = 5 + inp = torch.randn(N, C) + with self.assertRaisesRegex(RuntimeError, r"Expected number of channels in input to be divisible"): + F.group_norm(inp, 2) # 5 is not divisible by 2 + +class TestExpandedWeightModule(TestCase): + def _do_test(self, module, input): + batch_size = input.shape[0] + diff_input = input.dtype == torch.float or input.dtype == torch.double + if diff_input: + input.requires_grad_() + with freeze_rng_state(): + # get per sample grads with ExpandedWeights context manager + actual_res = call_for_per_sample_grads(module, batch_size, input).sum() + actual_res.backward() + actual_grads = [] + for param in module.parameters(): + actual_grads.append(param.grad_sample) + del param.grad_sample + if diff_input: + actual_grads.append(input.grad.clone()) + input.grad = torch.zeros_like(input.grad) + + # get per sample grads with a for loop + expected_res = torch.tensor(0., device=input.device, dtype=torch.double) + expected_grads = [] + for i in range(batch_size): + input_slice = input[i] + diff_params = module.parameters() + if diff_input: + diff_params = chain(diff_params, (input_slice,)) + res = module(input_slice.unsqueeze(0)).sum() + out_grads = torch.autograd.grad(res, diff_params, torch.ones_like(res), allow_unused=True) + expected_grads.append(out_grads) + expected_res += res + expected_grads = tuple(torch.stack(grad) for grad in zip(*expected_grads)) + self.assertEqual(actual_res, expected_res) + [self.assertEqual(actual, expected) for (actual, expected) in zip(actual_grads, expected_grads)] + + def _do_test_multi_input(self, module, input): + class TestModule(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, input): + return self.module(input) + self.module(input) + + batch_size = input.shape[0] + diff_input = input.dtype == torch.float or input.dtype == torch.double + if diff_input: + input.requires_grad_() + with freeze_rng_state(): + # get per sample grads with ExpandedWeights context manager, calling .backward() twice + test_module = TestModule(module) + actual_res = call_for_per_sample_grads(test_module, batch_size, input).sum() + actual_res.backward() + actual_grads = [] + for param in module.parameters(): + actual_grads.append(param.grad_sample) + del param.grad_sample + if diff_input: + actual_grads.append(input.grad.clone()) + input.grad = torch.zeros_like(input.grad) + + + # get per sample grads with a for loop, running over the input twice + expected_grads = [] + for i in range(batch_size): + input_slice = input[i] + diff_params = module.parameters() + if diff_input: + diff_params = chain(diff_params, (input_slice,)) + res = 
module(input_slice.unsqueeze(0)).sum() + out_grads = torch.autograd.grad(res, diff_params, torch.ones_like(res), allow_unused=True) + expected_grads.append(out_grads) + expected_grads = tuple(torch.stack(grad) for grad in zip(*expected_grads)) + expected_grads = tuple(expected_grad for expected_grad in expected_grads if expected_grad is not None) + assert [self.assertEqual(actual, 2 * expected) for (actual, expected) in zip(actual_grads, expected_grads)] + + def test_per_sample_api_failing(self): + module = nn.Linear(10, 10) + input = torch.randn(64, 10) + with self.assertRaisesRegex(RuntimeError, r"Module passed must be nn.Module"): + call_for_per_sample_grads("fail", 64, input) + with self.assertRaisesRegex(RuntimeError, r"Batch size passed must be an integer"): + call_for_per_sample_grads(module, 6.4, input) + with self.assertRaisesRegex(RuntimeError, r"Batch size must be positive"): + call_for_per_sample_grads(module, -64, input) + with self.assertRaisesRegex(RuntimeError, r"incorrect for multiple calls"): + loss = call_for_per_sample_grads(module, 64, input).sum() + loss.backward() # populate grad_sample fields + call_for_per_sample_grads(module, 64, input) + +class ContextManagerTests(TestBase): + def __init__(self, *args, **kwargs): + self.test_cpu = kwargs.get('test_cpu', True) + self.test_cuda = kwargs.get('test_cuda', True) + super().__init__(*args, **kwargs) + + @property + def constructor_args(self): + return self._get_arg('constructor_args', False) + + def test_context_manager(self, test_case, device): + kwargs = {'device': device, 'dtype': torch.double} + module = self.constructor(*self.constructor_args).to(**kwargs) + if 'Embedding' in self.get_name(): + kwargs['dtype'] = torch.long + input = self._get_input().to(**kwargs) + if len(input.shape) == 0 or input.shape[0] == 0: + raise unittest.SkipTest("Can't get per sample gradients when no batch dim or batch dim is 0") + if self.constructor == torch.nn.Linear and len(input.shape) == 1: + raise unittest.SkipTest("Can't get per sample gradients for input of rank 1") + test_case._do_test(module, input) + + def test_context_manager_multiple_inputs(self, test_case, device): + module = self.constructor(*self.constructor_args).to(device) + input = self._get_input() + if len(input.shape) == 0 or input.shape[0] == 0: + raise unittest.SkipTest("Can't get per sample gradients when no batch dim or batch dim is 0") + if self.constructor == torch.nn.Linear and len(input.shape) == 1: + raise unittest.SkipTest("Can't get per sample gradients for input of rank 1") + test_case._do_test_multi_input(module, input) + +# TODO: Once all of these use ModuleInfo, replace with ModuleInfo tests +# These currently use the legacy nn tests +supported_modules = ['Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'Embedding', 'LayerNorm', 'GroupNorm', 'InstanceNorm'] +supported_tests = [t for t in module_tests + new_module_tests if 'module_name' in t and t['module_name'] in supported_modules] +for test_param in supported_tests: + if 'constructor' not in test_param: + name = test_param.pop('module_name') + test_param['constructor'] = getattr(nn, name) + decorator = test_param.pop('decorator', None) + test = ContextManagerTests(**test_param) + test_name = test.get_name() + if hasattr(TestExpandedWeightModule, test_name): + raise RuntimeError('Found two tests with the same name: ' + test_name) + test_name_multi_input = test.get_name() + "_multiple_inputs" + if hasattr(TestExpandedWeightModule, test_name_multi_input): + raise RuntimeError('Found two tests with the same 
name: ' + test_name) + if decorator is not None: + fn = decorator(fn) + if test.test_cpu: + setattr(TestExpandedWeightModule, test_name, lambda self, test=test: test.test_context_manager(self, 'cpu')) + setattr(TestExpandedWeightModule, test_name_multi_input, + lambda self, test=test: test.test_context_manager_multiple_inputs(self, 'cpu')) + if TEST_CUDA and test.test_cuda: + # since this checks derivatives, only use double for precision + setattr(TestExpandedWeightModule, test_name + '_cuda_double', + lambda self, test=test: test.test_context_manager(self, 'cuda')) + +# ------------- HELPER FUNCTIONS ----------------- + +def run_op(op, input, *args, **kwargs): + r""" + OpInfo for Embedding switches the input and weight so autograd tests will only check the derivative + of the weight, not the input, which can't be differentiable since its dtype is int. Calls op, + using the special ordering that Embedding's OpInfo expects for that case. + """ + if op.name == "nn.functional.embedding": + return op(args[0], input, **kwargs) + else: + return op(input, *args, **kwargs) + +def make_expanded_weight(sample_input, batch_size): + def expanded_weight_or_clone(arg): + return ExpandedWeight(torch.clone(arg), batch_size) if is_diff_tensor(arg) else clone_if_tensor(arg) + + ew_input = clone_if_tensor(sample_input.input) + ew_args = tuple(expanded_weight_or_clone(arg) for arg in sample_input.args) + ew_kwargs = {name: expanded_weight_or_clone(arg) for (name, arg) in sample_input.kwargs.items()} + return ew_input, ew_args, ew_kwargs + +def supported_inputs(op, sample_inputs, supported_inputs=True): + r""" + ExpandedWeights currently does not support some use cases when there's no batch dimension or + operations that would cause inter-batch operations. Removes all of the cases it cannot deal with + """ + def filter_fn(input): + convolutions = ["nn.functional.conv1d", "nn.functional.conv2d", "nn.functional.conv3d"] + if op.name == "nn.functional.linear": + is_supported_input = len(input.input.shape) > 1 # input of rank 1 means no batch dim + elif op.name == "nn.functional.layer_norm": + normalized_shape = input.args[0] + is_supported_input = input.input.shape != normalized_shape # would cause inter-batch operations + elif op.name in convolutions: + # currently can't deal with padding computation on Python level + is_supported_input = 'padding' not in input.kwargs or not isinstance(input.kwargs['padding'], str) + elif op.name == "nn.functional.embedding": + idx = input.args[0] + is_supported_input = len(idx.shape) > 1 # there's no batch size + else: + is_supported_input = True + is_supported_input = is_supported_input and input.input.shape[0] > 0 # 0 is not a valid batch size + return is_supported_input if supported_inputs else not is_supported_input + return [input for input in sample_inputs if filter_fn(input)] + +def for_loop_per_sample_grad(batch_size, input, func, *args, **kwargs): + # get per sample grads by getting derivative for each input in a for loop + per_sample_grad = [] + for i in range(batch_size): + per_sample_input = input[i] + result = func(per_sample_input.unsqueeze(0), *args, **kwargs) + diff_input_list = (per_sample_input,) + tuple(args) + tuple(kwargs.values()) + diff_input_list = [i for i in diff_input_list if isinstance(i, torch.Tensor) and i.requires_grad] + per_sample_grad.append(torch.autograd.grad(result, diff_input_list, torch.ones_like(result), allow_unused=True)) + if len(per_sample_grad) == batch_size: + per_sample_grad = tuple(torch.stack(grad) for grad in 
zip(*per_sample_grad)) + return per_sample_grad + +def is_diff_tensor(t): + return isinstance(t, ExpandedWeight) or (isinstance(t, torch.Tensor) and t.requires_grad) + +def clone_if_tensor(t): + if isinstance(t, torch.Tensor): + res = torch.clone(t).detach() + res.requires_grad_(t.requires_grad) + return res + else: + return t + +instantiate_device_type_tests(TestExpandedWeightHelperFunction, globals()) +instantiate_device_type_tests(TestExpandedWeightFunctional, globals()) +if __name__ == '__main__': + run_tests() diff --git a/test/test_foreach.py b/test/test_foreach.py index a04ddcebbaae..4da23dc66fc3 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -11,12 +11,13 @@ from torch.testing._comparison import default_tolerances from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM, TEST_WITH_SLOW from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, dtypes, onlyCUDA, skipCUDAIfRocm, skipMeta, ops) + (instantiate_device_type_tests, dtypes, onlyCUDA, skipMeta, ops) from torch.testing._internal.common_methods_invocations import ( foreach_unary_op_db, foreach_binary_op_db, foreach_pointwise_op_db, foreach_minmax_op_db, foreach_reduce_op_db) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, + all_types_and_complex_and, all_types_and, integral_types, complex_types, + floating_types_and, floating_types, integral_types_and, ) # Includes some values such that N * N won't be a multiple of 4, @@ -140,7 +141,7 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis self._binary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath, is_inplace=True) if opinfo.supports_alpha_param: alpha = None - if dtype in get_all_int_dtypes(): + if dtype in integral_types(): alpha = 3 elif dtype.is_complex: alpha = complex(3, 3) @@ -165,19 +166,11 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis self._binary_test( dtype, inplace_op, inplace_ref, inputs, is_fastpath and disable_fastpath, is_inplace=True) - # note(mkozuki): Why ROCm? - # ROCm is supposed to compile slow path as in - # https://github.com/pytorch/pytorch/blob/7e032f18cf1405804c4f787b05ea2de5e08a091e/aten/src/ATen/native/ForeachUtils.h#L148-L164, # noqa: E501 - # Therefore `[torch.add(*args, alpha=alpha) for args in zip(tensors1, tensors2)]` and - # `torch._foreach_add(tensors1, tensors2, alpha=alpha)` - # are expected to return the same outputs, however, the outputs look unstable for torch.bfloat16 and torch.half. 
- # log: https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm4.2-py3.6-test1/2741/console - @skipCUDAIfRocm @skipMeta @ops(foreach_binary_op_db) def test_binary_op_tensorlists_fastpath(self, device, dtype, op): for N in N_values: - disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath = op.ref == torch.div and dtype in integral_types_and(torch.bool) if op.ref == torch.add and dtype == torch.bool: disable_fastpath = True self._test_binary_op_tensorlists(device, dtype, op, N, True, disable_fastpath) @@ -194,22 +187,21 @@ def _test_binary_op_scalar(self, device, dtype, opinfo, N, scalar, is_fastpath, self._binary_test(dtype, op, ref, inputs, is_fastpath, is_inplace=False) self._binary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath, is_inplace=True) - @skipCUDAIfRocm @skipMeta @ops(foreach_binary_op_db) def test_binary_op_scalar_fastpath(self, device, dtype, op): for N, scalar in itertools.product(N_values, Scalars): - disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath = op.ref == torch.div and dtype in integral_types_and(torch.bool) if isinstance(scalar, int): disable_fastpath |= dtype == torch.bool if isinstance(scalar, float): - disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath |= dtype in integral_types_and(torch.bool) if isinstance(scalar, bool): disable_fastpath |= dtype == torch.bool if op.ref in (torch.add, torch.mul): disable_fastpath = False if isinstance(scalar, complex): - disable_fastpath |= dtype not in get_all_complex_dtypes() + disable_fastpath |= dtype not in complex_types() self._test_binary_op_scalar(device, dtype, op, N, scalar, True, disable_fastpath) @ops(foreach_binary_op_db) @@ -233,22 +225,21 @@ def _test_binary_op_scalarlist(self, device, dtype, opinfo, N, scalarlist, is_fa # errors depending on the order of scalarlist. To keep actual unit test impl simple, # separating mixed scalarlist tests. By setting the first element of scalarlist to bool, # they are expected to throw bool sub error even in inplace test. 
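# Illustrative sketch (assumed eager behavior, not asserted by this patch):
# the mixed-scalarlist note above leans on eager PyTorch rejecting
# subtraction with a bool operand, so placing the bool first in the
# scalarlist makes the expected error fire the same way for both the
# out-of-place and inplace variants. All names below are illustrative only.
import torch

t = torch.ones(2)
try:
    torch.sub(t, True)  # bool scalar operand to subtraction
except RuntimeError as e:
    print("sub with a bool operand raised:", e)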
- @skipCUDAIfRocm @skipMeta @ops(foreach_binary_op_db) def test_binary_op_scalarlist_fastpath(self, device, dtype, op): for N in N_values: for type_str, scalarlist in getScalarLists(N): - bool_int_div = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] + bool_int_div = op.ref == torch.div and dtype in integral_types_and(torch.bool) disable_fastpath = bool_int_div if type_str == "int": disable_fastpath |= dtype == torch.bool if type_str == "float": - disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath |= dtype in integral_types_and(torch.bool) if type_str == "complex": - disable_fastpath |= dtype not in get_all_complex_dtypes() + disable_fastpath |= dtype not in complex_types() if type_str == "mixed": - disable_fastpath |= True and dtype not in get_all_complex_dtypes() + disable_fastpath |= True and dtype not in complex_types() self._test_binary_op_scalarlist(device, dtype, op, N, scalarlist, True, disable_fastpath) @ops(foreach_binary_op_db) @@ -305,7 +296,7 @@ def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fast @skipMeta @ops(foreach_pointwise_op_db) def test_pointwise_op_fastpath(self, device, dtype, op): - disable_fastpath = dtype in get_all_int_dtypes() + [torch.bool] + disable_fastpath = dtype in integral_types_and(torch.bool) # for N, scalar in itertools.product(N_values, Scalars): for N in N_values: self._test_pointwise_op(device, dtype, op, N, True, disable_fastpath) @@ -363,7 +354,7 @@ def _test_unary(self, device, dtype, opinfo, N, is_fastpath): op, ref, inplace_op, inplace_ref = self._get_funcs(opinfo, 1) inputs = opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath), # note(mkozuki): Complex inputs for `_foreach_abs` go through slowpath. - if opinfo.name == "_foreach_abs" and dtype in get_all_complex_dtypes(): + if opinfo.name == "_foreach_abs" and dtype in complex_types(): is_fastpath = False self._regular_unary_test(dtype, op, ref, inputs, is_fastpath) self._inplace_unary_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath) @@ -374,7 +365,7 @@ def test_unary_fastpath(self, device, dtype, op): for N in N_values: self._test_unary(device, dtype, op, N, is_fastpath=True) - @ops(foreach_unary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_unary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_unary_slowpath(self, device, dtype, op): for N in N_values: self._test_unary(device, dtype, op, N, is_fastpath=False) @@ -391,7 +382,7 @@ def test_minmax_fastpath(self, device, dtype, op): self._minmax_test(op, inputs, True, N if dtype == torch.bool else 1) @ops(foreach_minmax_op_db, - dtypes=get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) + dtypes=all_types_and(torch.half, torch.bfloat16, torch.bool)) def test_minmax_slowpath(self, device, dtype, op): for N in N_values: inputs = tuple(op.sample_inputs(device, dtype, N, noncontiguous=True) for _ in range(2)) @@ -399,7 +390,7 @@ def test_minmax_slowpath(self, device, dtype, op): # note(mkozuki): ForeachFuncInfo's of both `_foreach_maximum` and `_foreach_minimum` include integer types. # so, manually limit dtypes to fp types for inf&nan tests. 
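# Illustrative sketch of the dtype-helper swap applied throughout this file:
# the composable helpers from torch.testing._internal.common_dtype build a
# fixed base set plus explicit extras, replacing the old get_all_* flag
# arguments. The commented contents reflect the helpers' base sets at the
# time of writing and are listed only for orientation.
import torch
from torch.testing._internal.common_dtype import (
    floating_types_and, integral_types_and, complex_types,
)

fp = floating_types_and(torch.half, torch.bfloat16)   # float32, float64 + half, bfloat16
ints = integral_types_and(torch.bool)                 # uint8, int8, int16, int32, int64 + bool
print(torch.float16 in fp, torch.bool in ints, torch.complex64 in complex_types())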
- @ops(foreach_minmax_op_db, dtypes=get_all_fp_dtypes(include_bfloat16=True, include_half=True)) + @ops(foreach_minmax_op_db, dtypes=floating_types_and(torch.half, torch.bfloat16)) def test_minmax_float_inf_nan(self, device, dtype, op): inputs = ( [ @@ -424,7 +415,7 @@ def _reduce_test(self, opinfo, inputs, ord, is_fastpath, n_expected_cudaLaunchKe @ops(foreach_reduce_op_db) def test_reduce_fastpath(self, device, dtype, op): for N, ord in itertools.product(N_values, (0, 1, 2, -1, -2)): - if ord in (1, 2) and dtype in torch.testing.get_all_fp_dtypes(): + if ord in (1, 2) and dtype in floating_types_and(torch.half, torch.bfloat16): n_expected_cudaLaunchKernels = 3 else: n_expected_cudaLaunchKernels = N @@ -437,7 +428,7 @@ def test_reduce_slowpath(self, device, dtype, op): inputs = op.sample_inputs(device, dtype, N, noncontiguous=True), self._reduce_test(op, inputs, ord, False, 1) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype): # TODO: enable empty list case for tensors in [[torch.randn([0])]]: @@ -447,7 +438,7 @@ def test_add_scalar_with_empty_list_and_empty_tensor(self, device, dtype): torch._foreach_add_(tensors, 1) self.assertEqual(res, tensors) - @ops(foreach_binary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_binary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_binary_op_scalar_with_overlapping_tensors(self, device, dtype, op): foreach_op, ref = op.method_variant, op.ref tensors = [torch.ones(1, 1, device=device, dtype=dtype).expand(2, 1, 3)] @@ -479,7 +470,7 @@ def test_binary_op_scalar_with_different_tensor_dtypes(self, device, dtype, op): runtime_error = e self.assertIsNone(runtime_error) - @ops(foreach_binary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_binary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_binary_op_list_error_cases(self, device, dtype, op): foreach_op, foreach_op_, ref, ref_ = op.method_variant, op.inplace_variant, op.ref, op.ref_inplace tensors1 = [] @@ -534,7 +525,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op): return with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): foreach_op([tensor1], [tensor2]) - if dtype in get_all_int_dtypes() + [torch.bool] and foreach_op == torch._foreach_div: + if dtype in integral_types_and(torch.bool) and foreach_op == torch._foreach_div: with self.assertRaisesRegex(RuntimeError, "result type"): foreach_op_([tensor1], [tensor2]) else: @@ -543,7 +534,7 @@ def test_binary_op_list_error_cases(self, device, dtype, op): @skipMeta @unittest.skipIf(not torch.cuda.is_available(), "CUDA not found") - @ops(foreach_binary_op_db, dtypes=get_all_dtypes()) + @ops(foreach_binary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_binary_op_list_slow_path(self, device, dtype, op): # note(mkozuki): why `n_expected_cudaLaunchKernels=0`? 
# In this test, foreach functions don't go through fast path, @@ -635,7 +626,7 @@ def test_binary_op_tensors_on_different_devices(self, device, dtype, op): self.assertEqual(actual, tensors1) @onlyCUDA - @ops(foreach_pointwise_op_db, allowed_dtypes=get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @ops(foreach_pointwise_op_db, allowed_dtypes=floating_types()) def test_pointwise_op_tensors_on_different_devices(self, device, dtype, op): # tensors1: ['cuda', 'cpu] # tensors2: ['cuda', 'cpu] @@ -653,6 +644,27 @@ def test_pointwise_op_tensors_on_different_devices(self, device, dtype, op): foreach_op_(tensors1, tensors2, tensors3) self.assertEqual(expected, tensors1) + # note: BFloat16 has the same number of exponent bits as FP32 + # so if squared L2 norm overflows in BF16, then it also overflows in FP32. + @onlyCUDA + @ops(foreach_reduce_op_db, allowed_dtypes=(torch.half, torch.bfloat16)) + def test_foreach_l2_large_value_input(self, device, dtype, op): + ord, N = 2, 10 + max_value = torch.finfo(dtype).max + scaler = torch.tensor([max_value]).sqrt().to(device=device, dtype=dtype) + inputs = [t * scaler for t in op.sample_inputs(device, dtype, N, noncontiguous=False, low=1)], + # make sure that the min. of squared L2 norm value per tensor is greater than the max value of `dtype`. + self.assertTrue(scaler * scaler * N > max_value) + fn, ref_fn, *_ = self._get_funcs(op, 3) + actual = fn(inputs, is_cuda=True, is_fastpath=True, ord=ord) + expect = ref_fn(inputs, ord=ord) + if dtype == torch.float16: + # making sure the reference L2 norm values are in the range of FP16. + self.assertFalse(any(torch.isinf(e) for e in expect)) + else: + self.assertTrue(all(torch.isinf(e) for e in expect)) + self.assertEqual(expect, actual, equal_nan=False) + instantiate_device_type_tests(TestForeach, globals()) diff --git a/test/test_functionalization.py b/test/test_functionalization.py index 28476ff25957..31220b9f2d5a 100644 --- a/test/test_functionalization.py +++ b/test/test_functionalization.py @@ -2,7 +2,10 @@ import torch from torch.testing._internal.common_utils import TestCase, run_tests -from torch.testing._internal.logging_tensor import LoggingTensor, capture_logs, log_input +from torch.testing._internal.logging_tensor import LoggingTensor, LoggingTensorReentrant, capture_logs, log_input +from torch.utils._pytree import tree_map + +import logging def are_aliased(x, y): if x._base is None and y._base is None: @@ -13,23 +16,63 @@ def are_aliased(x, y): return y._base is x return x._base is y._base +# Just for testing: a logging tensor that also transforms out-of-place ops into inplace ops. +# That way even if the outer wrapper is functionalized, the inner wrapper will also need functionalization. 
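# Illustrative sketch, using only hooks these tests already exercise: the
# expected-log updates in this file reflect that functionalization records
# the *_copy view overloads (e.g. aten.view_copy.default instead of
# aten.view) unless the pass is entered with reapply_views=True, as
# test_reapply_views_simple below demonstrates. run_functionalized is just
# an illustrative name, not an API.
import torch

def run_functionalized(fn, inpt, reapply_views=False):
    # Wrap the input, run fn under functionalization, and return its result.
    inpt_functional = torch._to_functional_tensor(inpt.clone())
    torch._enable_functionalization(reapply_views=reapply_views)
    try:
        return fn(inpt_functional)
    finally:
        torch._disable_functionalization()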
+class InplaceLoggingTensor(LoggingTensorReentrant): + @staticmethod + def __new__(cls, e): + r = torch.Tensor._make_wrapper_subclass(cls, e.shape, dtype=e.dtype, requires_grad=False) + r.elem = e + return r + + __torch_function__ = torch._C._disabled_torch_function_impl + + def __str__(self): + return f'InplaceLoggingTensor({self.elem})' + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + def unwrap(e): + if isinstance(e, InplaceLoggingTensor): + return e.elem + else: + return e + + def wrap(e): + if isinstance(e, torch.Tensor): + return InplaceLoggingTensor(e) + else: + return e + f = func + # this subclass converts all `add()` ops into `add_()` ops + if f is torch.ops.aten.add.Tensor: + f = torch.ops.aten.add_.Tensor + + with cls.context(): + rs = tree_map(wrap, f(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + # after running the (potentially transformed) op, + # log the original op that we saw. + logging.getLogger("LoggingTensor").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs) + return rs + + class TestFunctionalization(TestCase): - def get_logs(self, func, inpt): + def get_logs(self, func, inpt, *, reapply_views=False): input_clone_logging = LoggingTensor(inpt.clone()) input_functional_logging = torch._to_functional_tensor(input_clone_logging) with capture_logs() as logs: log_input("input", input_clone_logging) - torch._enable_functionalization() + torch._enable_functionalization(reapply_views=reapply_views) try: func(input_functional_logging) finally: torch._disable_functionalization() return logs - def assert_functionalization(self, func, inpt): + def assert_functionalization(self, func, inpt, *, reapply_views=False): input_clone = inpt.clone() input_clone2 = inpt.clone() input_functional = torch._to_functional_tensor(input_clone2) @@ -37,7 +80,7 @@ def assert_functionalization(self, func, inpt): # Compare outputs (and mutated inputs), with and without functionalization. out_ref = func(inpt) - torch._enable_functionalization() + torch._enable_functionalization(reapply_views=reapply_views) try: out_functional = func(input_functional) finally: @@ -61,13 +104,57 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [4, 2]) -$2 = torch._ops.aten.add($1, tensor([[1., 1.], +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, tensor([[1., 1.], + [1., 1.], + [1., 1.], + [1., 1.]])) +$3 = torch._ops.aten.view_copy.default($2, [4, 2]) +$4 = torch._ops.aten.mul.Tensor($3, $3)""") + + def test_simple_out(self): + def f(x): + tmp = torch.ones(4, 2) + y = x.view(4, 2) + # the out= tensor will get resized, since it has size=0 to start. + z = torch.empty(()) + torch.add(y, tmp, out=z) + w = z * z + return w + self.assert_functionalization(f, torch.ones(4, 2)) + logs = self.get_logs(f, torch.ones(4, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, tensor([[1., 1.], [1., 1.], [1., 1.], [1., 1.]])) -$3 = torch._ops.aten.view($2, [4, 2]) -$4 = torch._ops.aten.mul($3, $3)""") +$3 = torch._ops.aten.mul.Tensor($2, $2)""") + + def test_multi_out(self): + def f(x): + # aminmax.out returns a tuple of tensors. + # functionalization should properly handle the tuple. 
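# For orientation (plain eager, no functionalization): aminmax returns a
# (min, max) pair, and its out= variant writes into a caller-provided tuple
# of two tensors, which is the multi-output case this test covers.
import torch
t = torch.arange(8, dtype=torch.float32)
mn, mx = torch.aminmax(t, dim=0)  # two outputs from a single op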
+ out_min = torch.empty(4) + out_max = torch.empty(4) + torch.aminmax(x, dim=0, out=(out_max, out_min)) + return out_max + self.assert_functionalization(f, torch.arange(8, dtype=torch.float32)) + logs = self.get_logs(f, torch.arange(8, dtype=torch.float32)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1, $2 = torch._ops.aten.aminmax.default($0, dim=0)""") + + def test_tensor_ctr(self): + def f(x): + y = torch.tensor((1, 2, 3)) + z = y.view(-1) + z.add_(1) + return y + self.assert_functionalization(f, torch.arange(3, dtype=torch.float32)) + logs = self.get_logs(f, torch.arange(3, dtype=torch.float32)) + self.assertExpectedInline('\n'.join(logs), """$0 = input('input')""") def test_inplace_on_non_view(self): def f(x): @@ -81,8 +168,8 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [4, 2]) -$2 = torch._ops.aten.add($0, tensor([[1., 1.], +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($0, tensor([[1., 1.], [1., 1.], [1., 1.], [1., 1.]]))""") @@ -94,17 +181,30 @@ def f(x): return y self.assert_functionalization(f, torch.ones(2, 2)) logs = self.get_logs(f, torch.ones(2, 2)) - # Only seeing copy_() calls in the logs are actually expected: - # - block_diag is a CompositeImplicitAutograd op, implemented in terms of copy_() and a few other ops. - # - copy_() doesn't have an out-of-place variant, so the pass leaves it alone - # - the other ops are all not called on the input tensor, which means that the LoggingTensor doesn't see them - # We can update the output of this test if/when these tests eventually use LoggingTensor with PythonMode self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.copy_(tensor([[1., 1.], - [1., 1.]]), $0) -$2 = torch._ops.aten.copy_(tensor([[1., 1.], - [1., 1.]]), $0)""") +$1 = torch._ops.aten.expand_copy.default($0, [2, 2]) +$2 = torch._ops.aten.slice_scatter.default(tensor([[0., 0., 0., 0.], + [0., 0., 0., 0.]]), $1, 1, 0, 2) +$3 = torch._ops.aten.slice_scatter.default(tensor([[0., 0., 0., 0.], + [0., 0., 0., 0.], + [0., 0., 0., 0.], + [0., 0., 0., 0.]]), $2, 0, 0, 2) +$4 = torch._ops.aten.slice_copy.Tensor($3, 0, 2, 4) +$5 = torch._ops.aten.slice_copy.Tensor($4, 1, 2, 4) +$6 = torch._ops.aten.expand_copy.default($0, [2, 2])""") + + def test_cat(self): + def f(x): + out = torch.empty(0) + torch.cat((x,), out=out) + return out + self.assert_functionalization(f, torch.ones(2, 2)) + logs = self.get_logs(f, torch.ones(2, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.cat.default([LoggingTensor(tensor([[1., 1.], + [1., 1.]]))])""") def test_diagonal(self): def f(x): @@ -118,10 +218,10 @@ def f(x): logs = self.get_logs(f, torch.ones(2, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.diagonal($0) -$2 = torch._ops.aten.add($1, tensor([1., 1.])) -$3 = torch._ops.aten.diagonal_scatter($0, $2) -$4 = torch._ops.aten.mul($3, $3)""") +$1 = torch._ops.aten.diagonal_copy.default($0) +$2 = torch._ops.aten.add.Tensor($1, tensor([1., 1.])) +$3 = torch._ops.aten.diagonal_scatter.default($0, $2) +$4 = torch._ops.aten.mul.Tensor($3, $3)""") def test_diagonal_mutated_input(self): def f(x): @@ -146,13 +246,13 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1, $2 = torch._ops.aten.split($0, 2) -$3 = 
torch._ops.aten.diagonal($2) -$4 = torch._ops.aten.add($3, tensor([1., 1.])) -$5, $6 = torch._ops.aten.split($0, 2) -$7 = torch._ops.aten.diagonal_scatter($6, $4) -$8 = torch._ops.aten.slice_scatter($0, $7, 0, 2, 4) -$9 = torch._ops.aten.mul($8, $8)""") +$1, $2 = torch._ops.aten.split_copy.Tensor($0, 2) +$3 = torch._ops.aten.diagonal_copy.default($2) +$4 = torch._ops.aten.add.Tensor($3, tensor([1., 1.])) +$5, $6 = torch._ops.aten.split_copy.Tensor($0, 2) +$7 = torch._ops.aten.diagonal_scatter.default($6, $4) +$8 = torch._ops.aten.slice_scatter.default($0, $7, 0, 2, 4) +$9 = torch._ops.aten.mul.Tensor($8, $8)""") def test_view_inplace(self): def f(x): @@ -166,9 +266,25 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.transpose($0, 1, 0) -$2 = torch._ops.aten.select($1, 0, 0) -$3 = torch._ops.aten.add($2, tensor([1., 1., 1., 1.]))""") +$1 = torch._ops.aten.transpose_copy.int($0, 1, 0) +$2 = torch._ops.aten.select_copy.int($1, 0, 0) +$3 = torch._ops.aten.add.Tensor($2, tensor([1., 1., 1., 1.]))""") + + def test_optional_tensor_list(self): + def f(x): + # test: an operator that takes in a List[Optional[Tensor]] argument + # (index_put) + y = x.view(8) + indices = torch.arange(4) + values = torch.arange(4, dtype=y.dtype) + y.index_put_((indices,), values, accumulate=False) + return y + self.assert_functionalization(f, torch.ones(4, 2)) + logs = self.get_logs(f, torch.ones(4, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view_copy.default($0, [8]) +$2 = torch._ops.aten.index_put.default($1, [tensor([0, 1, 2, 3])], tensor([0., 1., 2., 3.]))""") def test_scalars(self): def f(x): @@ -183,16 +299,28 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [4, 2]) -$2 = torch._ops.aten.add($1, tensor(1)) -$3 = torch._ops.aten.mul($2, tensor(2)) -$4 = torch._ops.aten.div($3, tensor(1))""") +$1 = torch._ops.aten.view_copy.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, 1) +$3 = torch._ops.aten.mul.Tensor($2, 2) +$4 = torch._ops.aten.div.Tensor($3, 1)""") + + def test_only_one_view(self): + def f(x): + # This tests that we don't have any unnecessary views in the trace. + # If the input wasn't mutated, we don't need to regenerate it, + # so there should be a total of 1 op in the output trace. 
+ return x.view(4, 2) + logs = self.get_logs(f, torch.ones(4, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view_copy.default($0, [4, 2])""") def test_everything(self): def f(x): # test: everything tmp = torch.ones(2, 2) - y = x.view(8) + x2 = x + x + y = x2.view(8) z0 = y.reshape(2, 4) z1 = z0.transpose(1, 0) z1.unsqueeze_(0) @@ -205,41 +333,61 @@ def f(x): logs = self.get_logs(f, torch.ones(4, 2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.view($0, [8]) -$2 = torch._ops.aten._reshape_alias($1, [2, 4], [4, 1]) -$3 = torch._ops.aten.transpose($2, 1, 0) -$4 = torch._ops.aten.view($0, [8]) -$5 = torch._ops.aten._reshape_alias($4, [2, 4], [4, 1]) -$6 = torch._ops.aten.transpose($5, 1, 0) -$7 = torch._ops.aten.unsqueeze($6, 0) -$8 = torch._ops.aten.view($0, [8]) -$9 = torch._ops.aten._reshape_alias($8, [2, 4], [4, 1]) -$10 = torch._ops.aten.transpose($9, 1, 0) -$11 = torch._ops.aten.unsqueeze($10, 0) -$12 = torch._ops.aten.squeeze($11) -$13, $14 = torch._ops.aten.split($12, 2) -$15 = torch._ops.aten.add($13, tensor([[1., 1.], +$1 = torch._ops.aten.add.Tensor($0, $0) +$2 = torch._ops.aten.view_copy.default($1, [8]) +$3 = torch._ops.aten._reshape_alias_copy.default($2, [2, 4], [4, 1]) +$4 = torch._ops.aten.transpose_copy.int($3, 1, 0) +$5 = torch._ops.aten.view_copy.default($1, [8]) +$6 = torch._ops.aten._reshape_alias_copy.default($5, [2, 4], [4, 1]) +$7 = torch._ops.aten.transpose_copy.int($6, 1, 0) +$8 = torch._ops.aten.unsqueeze_copy.default($7, 0) +$9 = torch._ops.aten.view_copy.default($1, [8]) +$10 = torch._ops.aten._reshape_alias_copy.default($9, [2, 4], [4, 1]) +$11 = torch._ops.aten.transpose_copy.int($10, 1, 0) +$12 = torch._ops.aten.unsqueeze_copy.default($11, 0) +$13 = torch._ops.aten.squeeze_copy.default($12) +$14, $15 = torch._ops.aten.split_copy.Tensor($13, 2) +$16 = torch._ops.aten.add.Tensor($14, tensor([[1., 1.], + [1., 1.]])) +$17 = torch._ops.aten.select_copy.int($3, 0, 0) +$18 = torch._ops.aten.clone.default($16, memory_format=torch.contiguous_format) +$19 = torch._ops.aten._unsafe_view.default($18, [4]) +$20 = torch._ops.aten.view_copy.default($1, [8]) +$21 = torch._ops.aten._reshape_alias_copy.default($20, [2, 4], [4, 1]) +$22 = torch._ops.aten.transpose_copy.int($21, 1, 0) +$23 = torch._ops.aten.unsqueeze_copy.default($22, 0) +$24 = torch._ops.aten.squeeze_copy.default($23) +$25 = torch._ops.aten.slice_scatter.default($24, $16, 0, 0, 2) +$26 = torch._ops.aten.unsqueeze_copy.default($25, 0) +$27 = torch._ops.aten.squeeze_copy.dim($26, 0) +$28 = torch._ops.aten.transpose_copy.int($27, 1, 0) +$29 = torch._ops.aten._reshape_alias_copy.default($28, [8], [1]) +$30 = torch._ops.aten.view_copy.default($29, [4, 2]) +$31 = torch._ops.aten.view_copy.default($30, [8]) +$32 = torch._ops.aten._reshape_alias_copy.default($31, [2, 4], [4, 1]) +$33 = torch._ops.aten.select_copy.int($32, 0, 0) +$34 = torch._ops.aten.add.Tensor($33, $19)""") + + def test_reapply_views_simple(self): + def f(x): + tmp = torch.ones(4, 2) + y = x.view(4, 2) + y.add_(tmp) + z = x * x + return y + self.assert_functionalization(f, torch.ones(4, 2), reapply_views=True) + logs = self.get_logs(f, torch.ones(4, 2), reapply_views=True) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.view.default($0, [4, 2]) +$2 = torch._ops.aten.add.Tensor($1, tensor([[1., 1.], + [1., 1.], + [1., 1.], [1., 1.]])) -$16 = torch._ops.aten.select($2, 0, 0) -$17 = torch._ops.aten.clone($15, 
memory_format=0) -$18 = torch._ops.aten._unsafe_view($17, [4]) -$19 = torch._ops.aten.view($0, [8]) -$20 = torch._ops.aten._reshape_alias($19, [2, 4], [4, 1]) -$21 = torch._ops.aten.transpose($20, 1, 0) -$22 = torch._ops.aten.unsqueeze($21, 0) -$23 = torch._ops.aten.squeeze($22) -$24 = torch._ops.aten.slice_scatter($23, $15, 0, 0, 2) -$25 = torch._ops.aten.unsqueeze($24, 0) -$26 = torch._ops.aten.squeeze($25, 0) -$27 = torch._ops.aten.transpose($26, 1, 0) -$28 = torch._ops.aten._reshape_alias($27, [8], [1]) -$29 = torch._ops.aten.view($28, [4, 2]) -$30 = torch._ops.aten.view($29, [8]) -$31 = torch._ops.aten._reshape_alias($30, [2, 4], [4, 1]) -$32 = torch._ops.aten.select($31, 0, 0) -$33 = torch._ops.aten.add($32, $18)""") - - def test_aliases_maintained_after_pass(self): +$3 = torch._ops.aten.view.default($2, [4, 2]) +$4 = torch._ops.aten.mul.Tensor($3, $3)""") + + def test_aliases_maintained_after_pass_when_reapplying_views(self): def f(x): tmp = torch.ones(4, 2) y = x.view(4, 2) @@ -248,7 +396,7 @@ def f(x): return y, z input_functional = torch._to_functional_tensor(torch.ones(4, 2)) - torch._enable_functionalization() + torch._enable_functionalization(reapply_views=True) try: y, z = f(input_functional) torch._sync(y) @@ -279,34 +427,49 @@ def f(x): logs = self.get_logs(f, torch.ones(2)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.expand($0, [2]) -$2 = torch._ops.aten.add($1, $0)""") +$1 = torch._ops.aten.expand_copy.default($0, [2]) +$2 = torch._ops.aten.add.Tensor($1, $0)""") # Test 2: copy_() with same dtype, different shape self.assert_functionalization(f, torch.ones(1)) logs = self.get_logs(f, torch.ones(1)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten.expand($0, [2]) -$2 = torch._ops.aten.add($1, $0)""") +$1 = torch._ops.aten.expand_copy.default($0, [2]) +$2 = torch._ops.aten.add.Tensor($1, $0)""") # Test 3: copy_() with different dtype, same shape self.assert_functionalization(f, torch.ones(2, dtype=torch.long)) logs = self.get_logs(f, torch.ones(2, dtype=torch.long)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten._to_copy($0, dtype=6, layout=0, device=device(type='cpu'), pin_memory=False) -$2 = torch._ops.aten.expand($1, [2]) -$3 = torch._ops.aten.add($2, $0)""") +$1 = torch._ops.aten._to_copy.default($0, dtype=torch.float32, layout=torch.strided, device=device(type='cpu'), pin_memory=False) +$2 = torch._ops.aten.expand_copy.default($1, [2]) +$3 = torch._ops.aten.add.Tensor($2, $0)""") # Test 4: copy_() with different dtype, different shape self.assert_functionalization(f, torch.ones(1, dtype=torch.long)) logs = self.get_logs(f, torch.ones(1, dtype=torch.long)) self.assertExpectedInline('\n'.join(logs), """\ $0 = input('input') -$1 = torch._ops.aten._to_copy($0, dtype=6, layout=0, device=device(type='cpu'), pin_memory=False) -$2 = torch._ops.aten.expand($1, [2]) -$3 = torch._ops.aten.add($2, $0)""") +$1 = torch._ops.aten._to_copy.default($0, dtype=torch.float32, layout=torch.strided, device=device(type='cpu'), pin_memory=False) +$2 = torch._ops.aten.expand_copy.default($1, [2]) +$3 = torch._ops.aten.add.Tensor($2, $0)""") + + def test_fill_(self): + def f(x): + y = x + x + z = y.diagonal() + z.fill_(0) + return y + + self.assert_functionalization(f, torch.ones(2, 2)) + logs = self.get_logs(f, torch.ones(2, 2)) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.aten.add.Tensor($0, $0) +$2 = 
torch._ops.aten.diagonal_copy.default($1) +$3 = torch._ops.aten.fill.Scalar($2, 0)""") def test_nested_functions_propagate_updates(self): def g(x): @@ -324,5 +487,74 @@ def f(x): self.assert_functionalization(f, torch.ones(2, 2)) + def test_mixed_wrappers_valid(self): + def f(x, y): + z = x + y + z.add_(1) + return z + + x1_not_functional = LoggingTensor(torch.ones(4)) + x2_functional = torch._to_functional_tensor(LoggingTensor(torch.ones(4))) + + with capture_logs() as logs: + y = f(x1_not_functional, x2_functional) + + # Make sure that functionalization ran the "+" kernel + # with a functional + non-functional tensor, and wrapped the output appropriately. + self.assertExpectedInline('\n'.join(logs), """\ +$2 = torch._ops.aten.add.Tensor($0, $1) +$3 = torch._ops.aten.add.Tensor($2, 1)""") + + def test_mixed_wrappers_invalid(self): + x1_not_functional = torch.ones(4) + x2_functional = torch._to_functional_tensor(torch.ones(4)) + + # When dealing with mixed functional + nonfunctional tensors, + # normal_tensor.add_(functional_tensor) is not valid + # because normal_tensor would need to be "promoted" to a functional tensor. + with self.assertRaises(RuntimeError): + x1_not_functional.add_(x2_functional) + + # This tests the behavior of functionalization with multiple layers of wrapped tensor subclasses. + def test_multiple_levels_of_wrapping(self): + def f(x): + # call an inplace op and have it get logged twice (by the outer + inner wrapper) + x.add_(1) + + # Test 1: both the inner and outer wrapper are "functionalized" + x_inner_and_outer_functional = torch._to_functional_tensor( + InplaceLoggingTensor(torch._to_functional_tensor(LoggingTensor(torch.ones(4))))) + + with capture_logs() as logs: + f(x_inner_and_outer_functional) + + # Since both wrappers were functionalized, they both log "add" + self.assertExpectedInline('\n'.join(logs), """\ +$1 = torch._ops.aten.add.Tensor($0, 1) +$3 = torch._ops.aten.add.Tensor($2, 1)""") + + # Test 2: only the inner wrapper is "functionalized" + x_only_inner_functional = InplaceLoggingTensor(torch._to_functional_tensor(LoggingTensor(torch.ones(4)))) + + with capture_logs() as logs: + f(x_only_inner_functional) + + # Since only the inner wrapper is functionalized, then the inner (first) log is functionalized + self.assertExpectedInline('\n'.join(logs), """\ +$1 = torch._ops.aten.add.Tensor($0, 1) +$3 = torch._ops.aten.add_.Tensor($2, 1)""") + + # Test 3: only the outer wrapper is "functionalized" + x_only_outer_functional = torch._to_functional_tensor(InplaceLoggingTensor(LoggingTensor(torch.ones(4)))) + + with capture_logs() as logs: + f(x_only_outer_functional) + + # Only the outer add_ is functionalized + # Since only the outer wrapper is functionalized, then the outer (second) log is functionalized + self.assertExpectedInline('\n'.join(logs), """\ +$1 = torch._ops.aten.add_.Tensor($0, 1) +$3 = torch._ops.aten.add.Tensor($2, 1)""") + if __name__ == '__main__': run_tests() diff --git a/test/test_fx.py b/test/test_fx.py index a9ea626c8053..56b28371456e 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -7,6 +7,7 @@ import inspect import math import numbers +import io import operator import os import pickle @@ -17,6 +18,7 @@ import types import warnings import unittest +import torch.nn.utils._stateless as _stateless from math import sqrt from torch.multiprocessing import Process from torch.testing import FileCheck @@ -24,8 +26,8 @@ from torch.testing._internal.common_device_type import ops, onlyCPU, instantiate_device_type_tests import
torch.utils._pytree as pytree import torch.fx._pytree as fx_pytree -from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Interpreter, Tracer, Transformer, Graph, wrap, PH -from torch.fx.node import Target, Argument +from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Interpreter, Tracer, Transformer, Graph, wrap, PH, CodeGen +from torch.fx.node import Target, Argument, _format_arg from torch.fx.passes import shape_prop from torch.fx.immutable_collections import immutable_dict, immutable_list from torch.fx.experimental.rewriter import RewritingTracer @@ -101,6 +103,11 @@ def a_lifted_leaf2(a, b): wrap('getattr') +def wrapped_named_tup(p1, *, p2): + return p1.x + p2.y + +wrap(wrapped_named_tup) + @wrap def wrapped_via_decorator(a): return a + 1 @@ -125,6 +132,9 @@ class Pair(NamedTuple): x : torch.Tensor y : torch.Tensor + def _custom_fx_repr_fn(self) -> str: + return f"Pair(x={_format_arg(self.x)}, y={_format_arg(self.y)})" + # for testing pytrees class Foo(object): # noqa: B209 def __init__(self, a, b): @@ -133,6 +143,7 @@ def __init__(self, a, b): class TestFX(JitTestCase): def setUp(self): + super().setUp() # Checking for mutable operations whil tracing is feature flagged # Enable it in testing but not by default self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations @@ -143,6 +154,7 @@ def setUp(self): torch.ops.load_library(str(lib_file_path)) def tearDown(self): + super().tearDown() torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): @@ -449,6 +461,55 @@ def forward(self, a, b): gm.graph.lint() self.assertEqual(gm(3, 4), 14) + def test_concrete_arg_none_assert(self): + class Foo(torch.nn.Module): + def forward(self, x, val=None): + return x if val is None else x + val + + f = Foo() + traced = torch.fx.symbolic_trace(f, concrete_args={'val' : None}) + with self.assertRaisesRegex(AssertionError, 'val has been specialized to have value None'): + traced(torch.randn(5), torch.randn(5)) + + x = torch.randn(5) + torch.testing.assert_close(traced(x), f(x)) + + def test_trace_multiple_funcs(self): + class Foo(torch.nn.Module): + def forward(self, x, y): + return x + y + + def minus_forward(self, x, y): + return x - y + + def multiply_forward(self, x, y): + return x * y + + f = Foo() + x, y = torch.randn(5), torch.randn(5) + + print(torch.__version__) + + tracer = Tracer() + torch.testing.assert_close(GraphModule(f, tracer.trace(f))(x, y), f(x, y)) + + tracer.traced_func_name = "minus_forward" + torch.testing.assert_close( + GraphModule(f, tracer.trace(f))(x, y), + f.minus_forward(x, y), + ) + + tracer.traced_func_name = "multiply_forward" + torch.testing.assert_close( + GraphModule(f, tracer.trace(f))(x, y), + f.multiply_forward(x, y), + ) + + tracer.traced_func_name = "add_forward" + with self.assertRaisesRegex(AssertionError, "doesn't exist in"): + tracer.trace(f) + + def test_graph_unique_names(self): class M(torch.nn.Module): def forward(self, a, b): @@ -678,6 +739,39 @@ def forward(self, a): for node in m_g.graph.nodes: self.assertTrue(node.name != "getattr") + @unittest.skip("Hotfix for SEV remediation") + def test_trace_buffer_slice(self): + bs, d_hid = 10, 23 + + class ExampleCode(torch.nn.Module): + def __init__(self): + super().__init__() + self.mm_param = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param2 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.lin = torch.nn.Linear(d_hid, d_hid) + self.register_buffer('buffer', 
torch.randn(bs + 100, d_hid)) + + def forward(self, x): + x = torch.mm(x, self.mm_param) + skip_connection = x + x = torch.relu(x) + x = torch.mm(x, self.mm_param) + self.buffer[:x.shape[0]] + x = self.lin(x) + x = torch.relu(x) + x = x + skip_connection + x = torch.mm(x, self.mm_param2) + x = self.lin(x) + return x + + + ec = ExampleCode() + + traced = torch.fx.symbolic_trace(ec) + + x = torch.randn(bs, d_hid) + torch.testing.assert_allclose(ec(x), traced(x)) + + def test_node_tagging(self): class TaggingTracer(Tracer): def create_node(self, kind : str, target : Union[str, Callable], @@ -986,6 +1080,24 @@ def forward(self, x): traced_scripted = torch.jit.script(traced) self.assertEqual(traced_scripted(torch.rand(4)), 2) + def test_tuple_no_subscript(self): + def foo(x : Tuple): + return x[0] + + traced = torch.fx.symbolic_trace(foo) + x = (torch.randn(5, 3),) + torch.testing.assert_allclose(traced(x), x[0]) + + bio = io.BytesIO() + + torch.save(traced, bio) + + bio.seek(0) + + loaded = torch.load(bio) + + torch.testing.assert_allclose(loaded(x), x[0]) + def test_torch_fx_len(self): class FXLenTest(torch.nn.Module): def forward(self, x): @@ -1056,6 +1168,24 @@ def forward(self, a): out = gm(input) self.assertEqual(out, ref_out) + def test_torch_op_overloads(self): + class M(torch.nn.Module): + def forward(self, a): + b = torch.ops.aten.add.Tensor(a, a) + return b + m = M() + input = torch.randn(3) + ref_out = m(input) + gm = symbolic_trace(m) + gm.graph.lint() + out = gm(input) + self.assertEqual(out, ref_out) + + for node in gm.graph.nodes: + if node.op == 'call_function': + assert isinstance(node.target, torch._ops.OpOverload) + assert node.target.__name__ == 'add.Tensor' + def test_pickle_torch_custom_ops(self): class M(torch.nn.Module): def forward(self, a): @@ -1238,6 +1368,18 @@ def test_remove_uses(self): self.assertTrue(neg not in relu.users) + def test_remove_uses_with_custom_filter(self): + g : torch.fx.Graph = Graph() + x : torch.fx.Node = g.placeholder('x') + relu : torch.fx.Node = g.call_function(torch.relu, (x,)) + neg : torch.fx.Node = g.call_function(torch.neg, (relu,)) + g.output(neg) + + neg.replace_all_uses_with(relu, lambda x: x != neg) + + self.assertTrue(neg in relu.users) + + def test_nonetype_annotation(self): eb = torch.nn.EmbeddingBag(3, 4) symbolic_trace(eb) @@ -1925,6 +2067,28 @@ def test_update_kwargs_api(self): new_gm = torch.fx.GraphModule(torch.nn.Module(), graph) self.assertEqual(new_gm(inp_x, inp_y), torch.relu(inp_y)) + def test_immutable_list_pytree_ops(self): + rand_tensor = torch.randn(5, 3) + l = immutable_list([3, [rand_tensor, 42]]) + + flattened, spec = pytree.tree_flatten(l) + assert flattened == [3, rand_tensor, 42] + + unflattened = pytree.tree_unflatten(flattened, spec) + assert unflattened == l + assert isinstance(unflattened, immutable_list) + + def test_immutable_dict_pytree_ops(self): + rand_tensor = torch.randn(5, 3) + d = immutable_dict({'a': 3, 'b': [rand_tensor, 42]}) + + flattened, spec = pytree.tree_flatten(d) + assert flattened == [3, rand_tensor, 42] + + unflattened = pytree.tree_unflatten(flattened, spec) + assert unflattened == d + assert isinstance(unflattened, immutable_dict) + def test_move_before(self): graph : torch.fx.Graph = torch.fx.Graph() x : torch.fx.Node = graph.create_node('placeholder', 'x') @@ -2261,6 +2425,40 @@ def forward(self, x): input = torch.rand(3, 4) self.assertEqual(traced(input), Pair(input, input)) + def test_named_tuple_inlined(self): + class NamedTupMod(torch.nn.Module): + def forward(self, inp): + 
return wrapped_named_tup(Pair(inp, 1.2), p2=Pair(3.4, inp)) + + m = NamedTupMod() + input = torch.rand(3, 4) + ref = m(input) + traced = symbolic_trace(m) + + res = traced(input) + self.assertEqual(ref, res) + + # Check Pair NamedTuple works when inlined into the function call. + ph = call_func = None + for node in traced.graph.nodes: + if node.op == "placeholder": + ph = node + elif node.op == "call_function" and node.target == wrapped_named_tup: + node.update_arg(0, Pair(ph, 1.2)) + node.update_kwarg("p2", Pair(3.4, ph)) + call_func = node + break + self.assertTrue(call_func is not None) + self.assertTrue(isinstance(call_func.args[0], Pair)) + self.assertTrue(isinstance(call_func.kwargs["p2"], Pair)) + self.assertEqual(_format_arg(call_func.args[0]), "Pair(x=%inp, y=1.2)") + self.assertEqual(_format_arg(call_func.kwargs["p2"]), "Pair(x=3.4, y=%inp)") + + traced.graph.eliminate_dead_code() + traced.recompile() + res = traced(input) + self.assertEqual(ref, res) + def test_return_type_exists(self): class ReturnTypeModule(torch.nn.Module): def other(self, x: List[str]) -> List[str]: @@ -2809,6 +3007,35 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo gm2.delete_all_unused_submodules() torch.testing.assert_allclose(gm2(inputs), model(inputs)) + def test_fx_stateless(self): + class MockModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.l1 = torch.nn.Linear(1, 1) + self.register_buffer('buffer', torch.ones(1)) + + def forward(self, x): + return self.l1(x) + self.buffer + + module = MockModule() + x = torch.rand((1, 1)) + weight = torch.tensor([[1.0]], requires_grad=True) + bias = torch.tensor([0.0], requires_grad=True) + buffer = torch.tensor([0.0]) + parameters = {'l1.weight': weight, + 'l1.bias': bias, + 'buffer': buffer} + fx_module = torch.fx.symbolic_trace(module) + res = _stateless.functional_call(fx_module, parameters, x) + res.backward() + self.assertIsNotNone(weight.grad) + self.assertIsNotNone(bias.grad) + self.assertIsNone(buffer.grad) + # Gradient was not calculated for the module stated and buffers + self.assertIsNone(module.l1.weight.grad) + self.assertIsNone(module.l1.bias.grad) + self.assertIsNone(module.buffer.grad) + def test_tracing_graphmodules_as_leaf_submodules(self): class A(torch.nn.Module): def forward(self, t): @@ -3126,6 +3353,12 @@ def verify_pytree(f, inp): orig_out = f(val) nf = symbolic_trace(f, concrete_args={'x': inp}) self.assertEqual(nf(val), orig_out) + + bare_fx = GraphModule({}, copy.deepcopy(nf.graph)) + bare_fx.graph.set_codegen(CodeGen()) + bare_fx.recompile() + self.assertEqual(nf.graph.process_outputs(bare_fx(*nf.graph.process_inputs(val))), orig_out) + assert num_flat_args == 0 or "tree_flatten_spec" in nf.code assert(sum([i.op == 'placeholder' for i in nf.graph.nodes]) == num_flat_args) @@ -3161,6 +3394,102 @@ def f(b, a): nf = symbolic_trace(nf) self.assertEqual(nf(**val), f(**val)) + def test_custom_codegen(self): + class ListCodeGen(CodeGen): + def gen_fn_def(self, free_vars, maybe_return_annotation): + lst_unpack = f""" +def forward(self, args_list: List[torch.Tensor]){maybe_return_annotation}: + {', '.join(free_vars)} = args_list""" + return lst_unpack + + def additional_globals(self): + return [('List', typing.List)] + + def process_inputs(self, *inputs): + assert(len(inputs) == 1) + return inputs[0] + + def f(a, b): + return a + b + + nf = symbolic_trace(f) + vals = [torch.randn(3), torch.randn(3)] + self.assertEqual(nf(*vals), f(*vals)) + + nf.graph.set_codegen(ListCodeGen()) + 
nf.recompile() + + bare_fx = GraphModule({}, copy.deepcopy(nf.graph)) + bare_fx.graph.set_codegen(CodeGen()) + bare_fx.recompile() + + self.assertEqual(nf(vals), f(*vals)) + self.assertEqual(nf.graph.process_outputs(bare_fx(*nf.graph.process_inputs(vals))), f(*vals)) + + ts_f = torch.jit.script(nf) + self.assertEqual(nf(vals), ts_f(vals)) + + def test_custom_codegen_with_transformer(self): + class ListCodeGen(CodeGen): + def gen_fn_def(self, free_vars, maybe_return_annotation): + lst_unpack = f""" +def forward(self, args_list: List[torch.Tensor]){maybe_return_annotation}: + {', '.join(free_vars)} = args_list""" + return lst_unpack + + def additional_globals(self): + return [('List', typing.List)] + + def process_inputs(self, *inputs): + assert(len(inputs) == 1) + return inputs[0] + + def f(a, b): + return a + b + + nf = symbolic_trace(f) + vals = [torch.randn(3), torch.randn(3)] + self.assertEqual(nf(*vals), f(*vals)) + + nf.graph.set_codegen(ListCodeGen()) + nf.recompile() + self.assertEqual(nf(vals), f(*vals)) + + transformed_gm = Transformer(nf).transform() + self.assertEqual(nf(vals), transformed_gm(vals)) + + def test_interpreter_with_codegen(self): + class ListCodeGen(CodeGen): + def gen_fn_def(self, free_vars, maybe_return_annotation): + lst_unpack = f""" +def forward(self, args_list: List[torch.Tensor]){maybe_return_annotation}: + {', '.join(free_vars)} = args_list""" + return lst_unpack + + def additional_globals(self): + return [('List', typing.List)] + + def process_inputs(self, *inputs): + assert(len(inputs) == 1) + return inputs[0] + + def generate_output(self, output_args): + return f'return list({repr(output_args)})' + + def process_outputs(self, outputs): + return list(outputs) + + def f(a, b): + a = a + b + b = a + b + return a, b + + nf = symbolic_trace(f) + vals = [torch.randn(3), torch.randn(3)] + nf.graph.set_codegen(ListCodeGen()) + nf.recompile() + self.assertEqual(Interpreter(nf).run(vals), nf(vals)) + def test_imul_code_print(self): graph = torch.fx.Graph() a = graph.placeholder("a") @@ -3218,6 +3547,7 @@ def test_get_torch_func_signature_exhaustive(self, device, dtype, op): class TestFXAPIBackwardCompatibility(JitTestCase): def setUp(self): + super().setUp() self.maxDiff = None # Checking for mutable operations whil tracing is feature flagged @@ -3226,6 +3556,7 @@ def setUp(self): torch.fx.proxy.TracerBase.check_mutable_operations = True def tearDown(self): + super().tearDown() torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag @@ -3464,12 +3795,14 @@ def check_symbols_have_bc_designation(m, prefix): class TestFunctionalTracing(JitTestCase): def setUp(self): + super().setUp() # Checking for mutable operations whil tracing is feature flagged # Enable it in testing but not by default self.orig_tracer_mutable_flag = torch.fx.proxy.TracerBase.check_mutable_operations torch.fx.proxy.TracerBase.check_mutable_operations = True def tearDown(self): + super().tearDown() torch.fx.proxy.TracerBase.check_mutable_operations = self.orig_tracer_mutable_flag IGNORE_FUNCS = ("has_torch_function", "has_torch_function_unary", @@ -3496,6 +3829,7 @@ def tearDown(self): "bilinear": BUILT_IN_FUNC, "celu_": BUILT_IN_FUNC, "channel_shuffle": BUILT_IN_FUNC, + "native_channel_shuffle": BUILT_IN_FUNC, "conv1d": BUILT_IN_FUNC, "conv2d": BUILT_IN_FUNC, "conv3d": BUILT_IN_FUNC, @@ -3512,6 +3846,7 @@ def tearDown(self): "linear": BUILT_IN_FUNC, "logsigmoid": BUILT_IN_FUNC, "one_hot": BUILT_IN_FUNC, + "pad": BUILT_IN_FUNC, "pairwise_distance": BUILT_IN_FUNC, 
"pdist": BUILT_IN_FUNC, "pixel_shuffle": BUILT_IN_FUNC, @@ -3529,7 +3864,6 @@ def tearDown(self): "adaptive_max_pool2d_with_indices": LEN_ERROR, "adaptive_max_pool3d_with_indices": LEN_ERROR, "instance_norm": CONTROL_FLOW, - "pad": LEN_ERROR, "adaptive_max_pool1d": PROXY_ITERABLE, "adaptive_max_pool2d": PROXY_ITERABLE, @@ -3584,9 +3918,9 @@ def tearDown(self): "leaky_relu": CONTROL_FLOW, "local_response_norm": CONTROL_FLOW, "margin_ranking_loss": CONTROL_FLOW, - "max_pool1d_with_indices": CONTROL_FLOW, - "max_pool2d_with_indices": CONTROL_FLOW, - "max_pool3d_with_indices": CONTROL_FLOW, + "max_pool1d_with_indices": ARG_TYPE_MISMATCH, + "max_pool2d_with_indices": ARG_TYPE_MISMATCH, + "max_pool3d_with_indices": ARG_TYPE_MISMATCH, "mse_loss": CONTROL_FLOW, "multi_head_attention_forward": CONTROL_FLOW, "multi_margin_loss": CONTROL_FLOW, diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index b7533ef34245..062eaed38f50 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -3,7 +3,9 @@ import math import numbers import operator +import pickle import sys +import tempfile import unittest from typing import Callable, Dict, Union, List, Optional from types import BuiltinFunctionType @@ -26,6 +28,8 @@ ) from torch.fx.experimental.rewriter import RewritingTracer from torch.fx.experimental.schema_type_annotation import AnnotateTypesWithSchema +import torch.fx.experimental.meta_tracer +from torch.fx.experimental.proxy_tensor import make_fx from torch.fx.graph_module import GraphModule from torch.fx.node import Node from torch.fx.operator_schemas import ( @@ -119,7 +123,7 @@ def forward(self, a, b, c): assert len(serialized_graph1["weights"]) == 4 assert len(serialized_graph1["modules"]) == 0 assert len(serialized_graph2["nodes"]) == 6 - assert len(serialized_graph2["weights"]) == 4 + assert len(serialized_graph2["weights"]) == 1 assert len(serialized_graph2["modules"]) == 1 assert serialized_graph1["weights"]["linear.weight"]["shape"] == "[4, 4]" assert serialized_graph1["weights"]["linear.weight"]["dtype"] == "torch.float32" @@ -667,6 +671,45 @@ def forward(self, a, b): # Confirm that the output is correct self.assertEqual(traced(3, 3), m(3, 3)) + def test_meta_tracer(self): + class MetaTracerTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb = torch.nn.Embedding(num_embeddings=42, embedding_dim=16) + self.layernorm = torch.nn.LayerNorm(16) + + def forward(self, x): + emb = self.emb(x) + emb = emb + torch.arange(emb.shape[-1], dtype=torch.float, device=emb.device) + lol = self.layernorm(emb) + return torch.relu(lol) if lol.shape[0] < 30 else torch.sigmoid(lol) + + mttm = MetaTracerTestModule() + for BS in [15, 35]: + x = torch.zeros(BS, dtype=torch.long).random_(42) + meta_args = {'x' : x.to(device='meta')} + gm = torch.fx.experimental.meta_tracer.symbolic_trace(mttm, meta_args=meta_args) + torch.testing.assert_close(gm(x), mttm(x)) + + # Test serialization/deserialization + with tempfile.TemporaryDirectory() as tmp_dir: + with open(f'{tmp_dir}/meta_module.pkl', 'wb') as f: + pickle.dump(gm, f) + + with open(f'{tmp_dir}/meta_module.pkl', 'rb') as f: + loaded = pickle.load(f) + + torch.testing.assert_close(loaded(x), mttm(x)) + + def test_proxy_tensor(self): + def f(x): + val = x.cos().cos().sum() + return torch.autograd.grad(val, x) + + traced_graph = make_fx(f)(torch.randn(3, requires_grad=True)) + inp = torch.randn(3, requires_grad=True) + torch.testing.assert_close(traced_graph(inp), f(inp)) + def 
test_call_to_assert_with_msg(self): class M(torch.nn.Module): def forward(self, a, b): @@ -814,6 +857,29 @@ def mod_partition(node: Node): self.assertEqual(orig_out, submodules_out) + def test_split_module_kwargs_expansion(self): + class ModuleWithKwargsExpansion(torch.nn.Module): + def forward(self, x, **kwargs): + return x + kwargs['foo'] + + mod = ModuleWithKwargsExpansion() + traced = torch.fx.symbolic_trace(mod) + + seen_getitem = False + + def split_callback(n): + nonlocal seen_getitem + split_idx = int(seen_getitem) + if n.target == operator.getitem: + seen_getitem = True + return split_idx + + split = split_module(traced, mod, split_callback) + + x = torch.randn(5, 3) + foo = torch.randn(5, 3) + torch.testing.assert_allclose(split(x, foo=foo), traced(x, foo=foo)) + @skipIfNoTorchVision def test_subgraph_trivial_resnet(self): # Smoke test trivially splitting resnet into 1 partition works @@ -1125,6 +1191,47 @@ def split_cb(node: torch.fx.Node): module_with_submodule = split_module(traced, mm, split_cb) self.assertEqual(module_with_submodule(a, b, c, d), traced(a, b, c, d)) + def test_split_qualname_mapping(self): + d_hid = 4 + + class ExampleCode(torch.nn.Module): + def __init__(self): + super().__init__() + self.mm_param = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.mm_param2 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) + self.lin = torch.nn.Linear(d_hid, d_hid) + + def forward(self, x): + x = torch.mm(x, self.mm_param) + x = torch.relu(x) + x = torch.mm(x, self.mm_param) + x = self.lin(x) + x = torch.relu(x) + x = torch.mm(x, self.mm_param2) + x = self.lin(x) + return x + + my_module = ExampleCode() + my_module_traced = symbolic_trace(my_module) + + part_idx = 0 + + def split_callback(n : torch.fx.Node): + nonlocal part_idx + if (n.op, n.target) == ('call_module', 'lin'): + part_idx += 1 + return part_idx + + # split module in module with submodules + qualname_map : Dict[str, str] = {} + module_with_submodules = split_module( + my_module_traced, my_module, split_callback, qualname_map + ) + expected_qualname_map = { + 'submod_1.lin': 'lin', 'submod_2.lin': 'lin' + } + self.assertEqual(qualname_map, expected_qualname_map) + def test_traceable_function_with_nonstandard_name(self): def foo(x): return torch.relu(x) @@ -1454,101 +1561,6 @@ class TestNormalizeOperators(JitTestCase): @onlyCPU @ops(op_db, allowed_dtypes=(torch.float,)) def test_normalize_operator_exhaustive(self, device, dtype, op): - # Sorted and one entry on each line to minimize merge conflicts. 
- op_skip = { - # See: https://github.com/pytorch/pytorch/issues/64997 - "as_strided", - "block_diag", - "broadcast_tensors", - "cartesian_prod", - "contiguous", - "einsum", - "expand", - "expand_as", - "fill_", - "T", # Implemented with a lambda - "H", # Implemented with a lambda - "mT", # Implemented with a lambda - "mH", # Implemented with a lambda - "gradient", - "histogramdd", - "igamma", - "igammac", - "index_put", - "nn.functional.conv2d", - "nn.functional.dropout", - "nn.functional.dropout2d", - "nn.functional.embedding", # Implemented with a lambda - "nn.functional.embedding_bag", # Implemented with a lambda - "nn.functional.rrelu", # Implemented with a lambda - "nn.functional.feature_alpha_dropout", # Implemented with a lambda - "nonzero", - "polygamma", - "special.polygamma", - "repeat", - "reshape_as", - "resize_", - "resize_as_", - "special.zeta", - "sum_to_size", - "to_sparse", - "unique", - "unique_consecutive", - "view", - "view_as", - "unfold", - "where", - "zero_", - 'bfloat16', - 'bool', - 'byte', - 'char', - 'double', - 'float', - 'half', - 'int', - 'long', - 'short', - 'empty_like', - 'ones_like', - 'randn_like', - 'zeros_like', - 'full_like', - 'rand_like', - 'randint_like', - 'new_ones', - 'new_empty', - 'new_zeros', - 'new_full', - 'normal', - 'multinomial', - 'bernoulli', - "__getitem__", - "__radd__", - "__rsub__", - "__rmul__", - "__rdiv__", - "__rmod__", - "__rpow__", - '__rand__', - '__ror__', - '__rxor__', - "__rmatmul__", - "atleast_1d", - "atleast_2d", - "atleast_3d", - "svd_lowrank", # implemented with a lambda - "pca_lowrank", # implemented with a lambda - "column_stack", - } - - # Unsupported input types - if op.name in op_skip: - return - - if op.name.startswith('_masked.'): - return - # These ops currently don't trace in FX for various reasons (i.e. 
they take a list of tensors) fx_fail = {"cat", "stack", "hstack", "vstack", "dstack", "linalg.multi_dot"} sample_inputs_itr = op.sample_inputs(device, dtype, requires_grad=False) @@ -1650,6 +1662,40 @@ def forward(self, {', '.join(param_names)}): test_out = traced(*param_values) self.assertEqual(test_out, ref_out) + def test_normalize_quantized_eb(self): + target = torch.ops.quantized.embedding_bag_byte_rowwise_offsets + args = ( + torch.empty((2, 3), dtype=torch.uint8), + torch.empty((2,), dtype=torch.int64), + torch.empty((2,), dtype=torch.int64), + ) + norm_args_and_kwargs = normalize_function( + target, args, normalize_to_only_use_kwargs=True + ) + self.assertTrue(norm_args_and_kwargs is not None) + self.assertEqual( + set(norm_args_and_kwargs.kwargs.keys()), + { + "weight", + "indices", + "offsets", + "scale_grad_by_freq", + "mode", + "pruned_weights", + "per_sample_weights", + "compressed_indices_mapping", + "include_last_offset", + }, + ) + self.assertEqual(norm_args_and_kwargs.args, tuple()) + + def test_normalize_args_op_overload(self): + for target in [torch.ops.aten.resize_as_.default, torch.ops.aten.resize_as_]: + inp1 = torch.rand([1]) + inp2 = torch.rand([4]) + args, kwargs = normalize_function(target, (inp1,), {"the_template": inp2}, normalize_to_only_use_kwargs=True) + self.assertIs(kwargs["input"], inp1) + self.assertIs(kwargs["the_template"], inp2) instantiate_device_type_tests(TestNormalizeOperators, globals()) diff --git a/test/test_hub.py b/test/test_hub.py new file mode 100644 index 000000000000..662a2cf9771e --- /dev/null +++ b/test/test_hub.py @@ -0,0 +1,256 @@ +# Owner(s): ["module: hub"] + +import unittest +from unittest.mock import patch +import os +import tempfile +import warnings + +import torch +import torch.hub as hub +from torch.testing._internal.common_utils import retry, IS_SANDCASTLE, TestCase + + +def sum_of_state_dict(state_dict): + s = 0 + for _, v in state_dict.items(): + s += v.sum() + return s + + +SUM_OF_HUB_EXAMPLE = 431080 +TORCHHUB_EXAMPLE_RELEASE_URL = 'https://github.com/ailzhang/torchhub_example/releases/download/0.1/mnist_init_ones' + + +@unittest.skipIf(IS_SANDCASTLE, 'Sandcastle cannot ping external') +class TestHub(TestCase): + + def setUp(self): + super().setUp() + self.previous_hub_dir = torch.hub.get_dir() + self.tmpdir = tempfile.TemporaryDirectory('hub_dir') + torch.hub.set_dir(self.tmpdir.name) + self.trusted_list_path = os.path.join(torch.hub.get_dir(), "trusted_list") + + def tearDown(self): + super().tearDown() + torch.hub.set_dir(self.previous_hub_dir) # probably not needed, but can't hurt + self.tmpdir.cleanup() + + def _assert_trusted_list_is_empty(self): + with open(self.trusted_list_path) as f: + assert not f.readlines() + + def _assert_in_trusted_list(self, line): + with open(self.trusted_list_path) as f: + assert line in (l.strip() for l in f.readlines()) + + @retry(Exception, tries=3) + def test_load_from_github(self): + hub_model = hub.load('ailzhang/torchhub_example', 'mnist', source='github', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_load_from_local_dir(self): + local_dir = hub._get_cache_or_reload( + 'ailzhang/torchhub_example', + force_reload=False, + trust_repo=True, + calling_fn=None + ) + hub_model = hub.load(local_dir, 'mnist', source='local', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def 
test_load_from_branch(self): + hub_model = hub.load('ailzhang/torchhub_example:ci/test_slash', 'mnist', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_get_set_dir(self): + previous_hub_dir = torch.hub.get_dir() + with tempfile.TemporaryDirectory('hub_dir') as tmpdir: + torch.hub.set_dir(tmpdir) + self.assertEqual(torch.hub.get_dir(), tmpdir) + self.assertNotEqual(previous_hub_dir, tmpdir) + + hub_model = hub.load('ailzhang/torchhub_example', 'mnist', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + assert os.path.exists(os.path.join(tmpdir, 'ailzhang_torchhub_example_master')) + + # Test that set_dir properly calls expanduser() + # non-regression test for https://github.com/pytorch/pytorch/issues/69761 + new_dir = os.path.join("~", "hub") + torch.hub.set_dir(new_dir) + self.assertEqual(torch.hub.get_dir(), os.path.expanduser(new_dir)) + + @retry(Exception, tries=3) + def test_list_entrypoints(self): + entry_lists = hub.list('ailzhang/torchhub_example', trust_repo=True) + self.assertObjectIn('mnist', entry_lists) + + @retry(Exception, tries=3) + def test_download_url_to_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + f = os.path.join(tmpdir, 'temp') + hub.download_url_to_file(TORCHHUB_EXAMPLE_RELEASE_URL, f, progress=False) + loaded_state = torch.load(f) + self.assertEqual(sum_of_state_dict(loaded_state), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_load_state_dict_from_url(self): + loaded_state = hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL) + self.assertEqual(sum_of_state_dict(loaded_state), SUM_OF_HUB_EXAMPLE) + + # with name + file_name = "the_file_name" + loaded_state = hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL, file_name=file_name) + expected_file_path = os.path.join(torch.hub.get_dir(), 'checkpoints', file_name) + self.assertTrue(os.path.exists(expected_file_path)) + self.assertEqual(sum_of_state_dict(loaded_state), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_load_legacy_zip_checkpoint(self): + with warnings.catch_warnings(record=True) as ws: + warnings.simplefilter("always") + hub_model = hub.load('ailzhang/torchhub_example', 'mnist_zip', pretrained=True, verbose=False) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + assert any("will be deprecated in favor of default zipfile" in str(w) for w in ws) + + # Test the default zipfile serialization format produced by >=1.6 release. 
+ @retry(Exception, tries=3) + def test_load_zip_1_6_checkpoint(self): + hub_model = hub.load( + 'ailzhang/torchhub_example', + 'mnist_zip_1_6', + pretrained=True, + verbose=False, + trust_repo=True + ) + self.assertEqual(sum_of_state_dict(hub_model.state_dict()), SUM_OF_HUB_EXAMPLE) + + @retry(Exception, tries=3) + def test_hub_parse_repo_info(self): + # If the branch is specified we just parse the input and return + self.assertEqual( + torch.hub._parse_repo_info('a/b:c'), + ('a', 'b', 'c') + ) + # For torchvision, the default branch is main + self.assertEqual( + torch.hub._parse_repo_info('pytorch/vision'), + ('pytorch', 'vision', 'main') + ) + # For the torchhub_example repo, the default branch is still master + self.assertEqual( + torch.hub._parse_repo_info('ailzhang/torchhub_example'), + ('ailzhang', 'torchhub_example', 'master') + ) + + @retry(Exception, tries=3) + def test_load_commit_from_forked_repo(self): + with self.assertRaisesRegex(ValueError, 'If it\'s a commit from a forked repo'): + torch.hub.load('pytorch/vision:4e2c216', 'resnet18') + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='') + def test_trust_repo_false_emptystring(self, patched_input): + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + patched_input.reset_mock() + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='no') + def test_trust_repo_false_no(self, patched_input): + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + patched_input.reset_mock() + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='y') + def test_trusted_repo_false_yes(self, patched_input): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + self._assert_in_trusted_list("ailzhang_torchhub_example") + patched_input.assert_called_once() + + # Loading a second time with "check", we don't ask for user input + patched_input.reset_mock() + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + patched_input.assert_not_called() + + # Loading again with False, we still ask for user input + patched_input.reset_mock() + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=False) + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='no') + def test_trust_repo_check_no(self, patched_input): + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + self._assert_trusted_list_is_empty() + patched_input.assert_called_once() + + patched_input.reset_mock() + with self.assertRaisesRegex(Exception, 'Untrusted repository.'): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', 
trust_repo="check") + patched_input.assert_called_once() + + @retry(Exception, tries=3) + @patch('builtins.input', return_value='y') + def test_trust_repo_check_yes(self, patched_input): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + self._assert_in_trusted_list("ailzhang_torchhub_example") + patched_input.assert_called_once() + + # Loading a second time with "check", we don't ask for user input + patched_input.reset_mock() + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + patched_input.assert_not_called() + + @retry(Exception, tries=3) + def test_trust_repo_true(self): + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=True) + self._assert_in_trusted_list("ailzhang_torchhub_example") + + @retry(Exception, tries=3) + def test_trust_repo_builtin_trusted_owners(self): + torch.hub.load('pytorch/vision', 'resnet18', trust_repo="check") + self._assert_trusted_list_is_empty() + + @retry(Exception, tries=3) + def test_trust_repo_none(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=None) + assert len(w) == 1 + assert issubclass(w[-1].category, UserWarning) + assert "You are about to download and run code from an untrusted repository" in str(w[-1].message) + + self._assert_trusted_list_is_empty() + + @retry(Exception, tries=3) + def test_trust_repo_legacy(self): + # We first download a repo and then delete the allowlist file + # Then we check that the repo is indeed trusted without a prompt, + # because it was already downloaded in the past. + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo=True) + os.remove(self.trusted_list_path) + + torch.hub.load('ailzhang/torchhub_example', 'mnist_zip_1_6', trust_repo="check") + + self._assert_trusted_list_is_empty() diff --git a/test/test_indexing.py b/test/test_indexing.py index 42ffa8ab24e8..9d2d82e9f12a 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -692,7 +692,7 @@ def test_bool_indices(self, device): self.assertEqual(v[boolIndices].shape, v[uint8Indices].shape) self.assertEqual(v[boolIndices], v[uint8Indices]) self.assertEqual(v[boolIndices], tensor([True], dtype=torch.bool, device=device)) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) def test_bool_indices_accumulate(self, device): mask = torch.zeros(size=(10, ), dtype=torch.bool, device=device) @@ -713,7 +713,7 @@ def test_byte_mask(self, device): with warnings.catch_warnings(record=True) as w: self.assertEqual(v[mask].shape, (3, 7, 3)) self.assertEqual(v[mask], torch.stack([v[0], v[2], v[3]])) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) v = torch.tensor([1.], device=device) self.assertEqual(v[v == 0], torch.tensor([], device=device)) @@ -725,7 +725,7 @@ def test_byte_mask_accumulate(self, device): warnings.simplefilter("always") y.index_put_((mask, ), y[mask], accumulate=True) self.assertEqual(y, torch.ones(size=(10, 10), device=device)) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) def test_index_put_accumulate_large_tensor(self, device): # This test is for tensors with number of elements >= INT_MAX (2^31 - 1). 
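For reference, a brief, hedged usage sketch of the trust_repo argument that the torch.hub tests above exercise; this is illustrative only, and the repo/entrypoint names are the same ones those tests use.

import torch

# Trust the repository explicitly: no interactive prompt is shown.
model = torch.hub.load('pytorch/vision', 'resnet18', trust_repo=True)

# Prompt only when the repository is not already trusted; repos from built-in
# trusted owners (such as pytorch) are accepted without prompting.
model = torch.hub.load('pytorch/vision', 'resnet18', trust_repo="check")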
@@ -818,6 +818,9 @@ def test_index_put_accumulate_non_contiguous(self, device): value = torch.randn(2, 2) out_cuda = t1.index_put_(indices_dev, value.to(device), accumulate=True) out_cpu = t2.index_put_(indices, value, accumulate=True) + self.assertTrue(not t1.is_contiguous()) + self.assertTrue(not t2.is_contiguous()) + self.assertEqual(out_cuda.cpu(), out_cpu) @onlyCUDA @@ -876,7 +879,7 @@ def test_multiple_byte_mask(self, device): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") self.assertEqual(v[mask1, :, mask2].shape, (3, 7)) - self.assertEquals(len(w), 2) + self.assertEqual(len(w), 2) def test_byte_mask2d(self, device): v = torch.randn(5, 7, 3, device=device) @@ -1130,7 +1133,7 @@ def test_byte_tensor_assignment(self, device): with warnings.catch_warnings(record=True) as w: x[b] = value - self.assertEquals(len(w), 1) + self.assertEqual(len(w), 1) self.assertEqual(x[0], value) self.assertEqual(x[1], torch.arange(4., 8, device=device)) diff --git a/test/test_jit.py b/test/test_jit.py index 37cd9b5d53c0..e585471a413d 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -17,6 +17,7 @@ from jit.test_data_parallel import TestDataParallel # noqa: F401 from jit.test_models import TestModels # noqa: F401 from jit.test_modules import TestModules # noqa: F401 +from jit.test_autodiff import TestAutodiffJit # noqa: F401 from jit.test_autodiff_subgraph_slicing import TestAutodiffSubgraphSlicing # noqa: F401 from jit.test_custom_operators import TestCustomOperators # noqa: F401 from jit.test_export_modes import TestExportModes # noqa: F401 @@ -25,12 +26,12 @@ from jit.test_builtins import TestBuiltins, TestTensorBuiltins # noqa: F401 from jit.test_ignore_context_manager import TestIgnoreContextManager # noqa: F401 from jit.test_symbolic_shape_analysis import TestSymbolicShapeAnalysis # noqa: F401 -from jit.test_if_hoisting import TestIfHoisting # noqa: F401 +from jit.test_op_decompositions import TestOpDecompositions # noqa: F401 from jit.test_unsupported_ops import TestUnsupportedOps # noqa: F401 from jit.test_freezing import TestFreezing, TestFrozenOptimizations, TestMKLDNNReinplacing # noqa: F401 from jit.test_peephole import TestPeephole # noqa: F401 from jit.test_alias_analysis import TestAliasAnalysis # noqa: F401 -from jit.test_save_load import TestSaveLoad # noqa: F401 +from jit.test_save_load import TestSaveLoad, TestSaveLoadFlatbuffer # noqa: F401 from jit.test_save_load_for_op_version import TestSaveLoadForOpVersion # noqa: F401 from jit.test_module_containers import TestModuleContainers # noqa: F401 from jit.test_python_bindings import TestPythonBindings # noqa: F401 @@ -76,6 +77,7 @@ from jit.test_device_analysis import TestDeviceAnalysis # noqa: F401 from jit.test_dce import TestDCE # noqa: F401 from jit.test_sparse import TestSparse # noqa: F401 +from jit.test_tensor_methods import TestTensorMethods # noqa: F401 # Torch from torch import Tensor @@ -98,18 +100,19 @@ from torch.testing._internal.common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \ suppress_warnings, BUILD_WITH_CAFFE2, IS_SANDCASTLE, GRAPH_EXECUTOR, ProfilingMode, TestCase, \ freeze_rng_state, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \ - enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs + enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs, \ + skipIfCrossRef, IS_MACOS from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \ 
_trace, do_input_map, get_execution_plan, make_global, \ execWrapper, _inline_everything, _tmp_donotuse_dont_inline_everything, \ RUN_CUDA -from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, nn_functional_tests, get_script_args, \ - EXCLUDE_SCRIPT, additional_module_tests, EXCLUDE_SCRIPT_MODULES, \ - get_nn_module_name_from_kwargs, get_nn_mod_test_name, script_method_template +from torch.testing._internal.jit_metaprogramming_utils import ( + get_script_args, + create_input, unpack_variables, + additional_module_tests, EXCLUDE_SCRIPT_MODULES, + get_nn_module_name_from_kwargs, get_nn_mod_test_name, script_method_template) from torch.testing._internal.common_nn import module_tests, new_module_tests, criterion_tests -from torch.testing._internal.common_methods_invocations import ( - create_input, unpack_variables) # For testing truediv in python 2 from torch.testing._internal.test_module.future_div import div_int_future, div_float_future @@ -203,11 +206,6 @@ def doAutodiffCheck(testname): # TODO: enable TE in PE when all tests are fixed torch._C._jit_set_texpr_fuser_enabled(GRAPH_EXECUTOR == ProfilingMode.PROFILING) torch._C._jit_set_profiling_executor(GRAPH_EXECUTOR != ProfilingMode.LEGACY) -# even though FULL_PROFILER should be our default -# we haven't tested every single test in this file -# but we enable FULL_PROFILER for a large subset -# of the tests with "with enable_profiling_mode_for_profiling_tests" -torch._C._jit_set_profiling_mode(False) def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): hx, cx = hidden @@ -969,6 +967,56 @@ def forward(self, input): m_dropout.eval() self.assertEqual(dropout(input) + 1, m_dropout(input)) + def test_nn_lp_pool2d(self): + class Mod(torch.nn.Module): + def __init__(self): + super().__init__() + self.l = torch.nn.LPPool2d(2, 3) + self.n = torch.nn.LPPool2d(2, (7, 1)) + + def forward(self, x): + return (self.l(x), + self.n(x), + torch.nn.functional.lp_pool2d(x, float(2), 3), + torch.nn.functional.lp_pool2d(x, 2, 3), + torch.nn.functional.lp_pool2d(x, float(2), (7, 1))) + + self.checkModule(Mod(), (torch.rand(1, 3, 7, 7),)) + + def test_nn_lp_pool1d(self): + class Mod(torch.nn.Module): + def __init__(self): + super().__init__() + self.l = torch.nn.LPPool1d(2, 3) + self.n = torch.nn.LPPool1d(2, 7) + + def forward(self, x): + return (self.l(x), + self.n(x), + torch.nn.functional.lp_pool1d(x, float(2), 3), + torch.nn.functional.lp_pool1d(x, 2, 3), + torch.nn.functional.lp_pool1d(x, float(2), 7)) + + self.checkModule(Mod(), (torch.rand(1, 3, 7),)) + + def test_nn_padding_functional(self): + class Mod(nn.Module): + def __init__(self, *pad): + super().__init__() + self.pad = pad + + def forward(self, x): + return F.pad(x, self.pad, mode='constant', value=3.5) + + inputs = [ + (Mod(1, 2), torch.randn(1, 3, 4)), # 1D + (Mod(1, 2, 3, 4), torch.randn(1, 3, 4)), # 2D + (Mod(1, 2, 3, 4, 5, 6), torch.randn(1, 3, 4)), # 3D + ] + + for m, inp in inputs: + self.checkModule(m, (inp,)) + def test_nn_padding(self): class Mod(nn.Module): def __init__(self, padding): @@ -1640,6 +1688,73 @@ def doit(x, y): for node in g.nodes(): self.assertTrue(g2.findNode(node.kind()) is not None) + def test_permute_inputs_binding(self): + @torch.jit.script + def foo(i, j, k): + pass + + g = foo.graph + + idxs = [] + for i, inp in enumerate(g.inputs()): + inp.setDebugName(f"inp{i}") + idxs.append(i) + + permuted_idxs = list(np.random.permutation(idxs)) + g.permuteInputs(permuted_idxs) + for i, inp in enumerate(g.inputs()): + 
self.assertEqual(f"inp{permuted_idxs[i]}", inp.debugName()) + + @unittest.skipIf(IS_MACOS, "Failing on MacOS only") + def test_python_ir_utils(self): + @torch.jit.script + def foo(inp): + x = inp + 1 + y = x / 2 + z = y * y + return z + + add_node = foo.graph.findNode("aten::add") + div_node = foo.graph.findNode("aten::div") + + with foo.graph.insert_point_guard(add_node): + with foo.graph.insert_point_guard(div_node): + foo.graph.insertConstant("goodbye") + foo.graph.insertConstant("hello") + with foo.graph.insert_point_guard(foo.graph.findNode("aten::mul")): + foo.graph.insertConstant("hello") + FileCheck().check("hello").check("goodbye").check("hello").run(foo.graph) + + self.assertTrue(add_node.matches(add_node.schema())) + self.assertFalse(add_node.matches(div_node.schema())) + + def test_python_ir_utils_graph(self): + @torch.jit.script + def unrolled_mul(x: torch.Tensor, y: int): + out = x + for _ in range(y - 1): + out = out + x + return out + + @torch.jit.script + def foo(x): + return x * 4 + + g = foo.graph + muls = g.findAllNodes("aten::mul") + scalar_muls = filter(lambda x: x.matches("aten::mul(Tensor self, Scalar other) -> Tensor"), muls) + mul_constant_int = filter(lambda x: isinstance(list(x.inputs())[1].toIValue(), int), scalar_muls) + for mul in mul_constant_int: + with g.insert_point_guard(mul): + outputs = g.insertGraph(unrolled_mul.graph, list(mul.inputs())) + assert len(outputs) == len(list(mul.outputs())) + for new_out, old_out in zip(outputs, g.outputs()): + old_out.replaceAllUsesWith(new_out) + mul.destroy() + + FileCheck().check_not("aten::mul").check("aten::add").run(foo.graph) + self.assertEqual(foo(torch.ones([2, 2])), torch.ones([2, 2]) * 4) + @unittest.skipIf(IS_SANDCASTLE, "gtest runs these in sandcastle") @unittest.skipIf(RUN_CUDA, "covered by test_cpp_cuda") @unittest.skipIf(not torch._C._jit_has_cpp_tests(), "Tests were not built, use BUILD_TEST=1") @@ -1868,8 +1983,8 @@ def equation_format_varargs(x, y): def sublist_format(x, y): return torch.einsum(x, [0], y, [1], [0, 1]) - x = make_tensor((5,), 'cpu', torch.float32) - y = make_tensor((10,), 'cpu', torch.float32) + x = make_tensor((5,), dtype=torch.float32, device="cpu") + y = make_tensor((10,), dtype=torch.float32, device="cpu") for fn in [equation_format, equation_format_varargs, sublist_format]: check(fn, torch.jit.script(fn), x, y) @@ -4310,6 +4425,7 @@ def foo(xyz): fc.run(scripted.foo.graph) fc.run(str(scripted.foo.graph)) + @skipIfCrossRef def test_file_line_trace(self): def foobar(xyz): return torch.neg(xyz) @@ -4450,6 +4566,7 @@ def debug_records_from_mod(mod): debug_files = filter(lambda f: f.endswith('.debug_pkl'), files) debug_files = (archive.open(f) for f in debug_files) debug_files = (pickle.load(f) for f in debug_files) + debug_files = (f[2] for f in debug_files) return list(debug_files) debug_files = debug_records_from_mod(ft3) @@ -5666,12 +5783,7 @@ def test_fuser_double_float_codegen(self): 'frac'] def lookup_c_equivalent_fn(aten_fn): - if aten_fn == 'min': - return 'fmin' - elif aten_fn == 'max': - return 'fmax' - else: - return aten_fn + return aten_fn def test_dispatch(op, expects, dtype, binary=False): if dtype == torch.double: @@ -5705,7 +5817,9 @@ def test_dispatch(op, expects, dtype, binary=False): test_dispatch(fn, lookup_c_equivalent_fn(fn) + '(', torch.double) test_dispatch(fn, lookup_c_equivalent_fn(fn) + 'f(', torch.float) - binary_fns = ['min', 'max', 'pow'] + # 'min', 'max' were previously tested but are now replaced with ternary expressions + # instead of fmin() and 
fmax() + binary_fns = ['pow'] for fn in binary_fns: test_dispatch(fn, lookup_c_equivalent_fn(fn) + '(', torch.double, binary=True) test_dispatch(fn, lookup_c_equivalent_fn(fn) + 'f(', torch.float, binary=True) @@ -6617,6 +6731,13 @@ def forward(self) -> torch.Tensor: self.assertEqual(model1.forward(), script_model_1.forward()) self.assertEqual(model2.forward(), script_model_2.forward()) + def test_ternary_right_associative(self): + def plus_123(x: int): + return x + 1 if x == 1 else x + 2 if x == 2 else x + 3 + self.checkScript(plus_123, (1,)) + self.checkScript(plus_123, (2,)) + self.checkScript(plus_123, (3,)) + def test_print(self): def func(x, y): q = (x + y).sigmoid() @@ -7256,7 +7377,7 @@ def test_as_tensor_tensor_input(input): g = test_as_tensor_tensor_input.graph_for(torch.ones(3, 4)) FileCheck().check("Tensor = aten::as_tensor").check("Float(*, *, requires_grad=0, device=cpu) = aten::as_tensor").run(g) - + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "testing legacy behavior") def test_tensor_requires_grad(self): @torch.jit.script def test(b): @@ -8162,6 +8283,44 @@ def test_irparser(self): """ FileCheck().run(graph_str, parse_ir(graph_str)) + def test_parse_tensor_constants(self): + def foo(): + return torch.zeros([4, 4]) + + foo_s = torch.jit.script(foo) + torch._C._jit_pass_constant_propagation(foo_s.graph) + + g = str(foo_s.graph) + g_parsed = parse_ir(g, parse_tensor_constants=True) + self.assertEqual(str(canonical(g_parsed)), str(canonical(foo_s.graph))) + func = torch._C._create_function_from_graph("forward", g_parsed) + + out_parsed = func() + out_func = foo() + # not checking data, just dtype, size etc + out_parsed[:] = 0 + out_func[:] = 0 + self.assertEqual(out_func, out_parsed) + + with self.assertRaises(RuntimeError): + parse_ir(g, parse_tensor_constants=False) + + def test_parse_nested_names(self): + g_str = """ + graph(%x.1 : Tensor): + %3 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=2]() + %hi.submod.value.5 : Tensor = aten::add(%x.1, %2, %3) + return (%hi.submod.value.5) + """ + g = parse_ir(g_str) + round_trip_g = parse_ir(str(g)) + self.assertEqual(canonical(g), canonical(round_trip_g)) + + func1 = torch._C._create_function_from_graph("forward", g) + func2 = torch._C._create_function_from_graph("forward", round_trip_g) + self.assertEqual(func1(torch.ones([2])), func2(torch.ones([2]))) + def test_is_after_use(self): def sorted_input_use(g): uses = list(next(g.inputs()).uses()) @@ -10286,7 +10445,7 @@ def fn(x): self.assertTrue(n.type() == torch._C.TensorType.getInferred()) with self.assertRaisesRegex(RuntimeError, "Inferred \'x\' to be of type \'Tensor"): - fn(1) + fn("1") def test_script_define_order(self): class M(torch.jit.ScriptModule): @@ -10991,6 +11150,26 @@ def randint(): FileCheck().check("Double(*, *, requires_grad=0, device=cpu)") \ .check_not("Float(*, *, requires_grad=0, device=cpu)").run(randint.graph_for()) + @unittest.skipIf(not RUN_CUDA, "no CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "skip if profiling isn't enabled") + def test_autodiff_complex(self): + def foo(x: torch.Tensor, y: torch.Tensor, W: torch.Tensor): + return torch.exp(torch.mm(torch.complex(x, y), W.cfloat())) + + @torch.jit.script + def jitted_foo(x: torch.Tensor, y: torch.Tensor, W: torch.Tensor): + return torch.exp(torch.mm(torch.complex(x, y), W.cfloat())) + + x = torch.randn(128, 16, dtype=torch.float32, device='cuda:0') + y = torch.randn(128, 16, dtype=torch.float32, device='cuda:0') + W = torch.randn(16, 1, 
dtype=torch.float32, device='cuda:0', requires_grad=True) + W.data /= 4 + + with enable_profiling_mode_for_profiling_tests(): + for i in range(4): + self.assertTrue((foo(x, y, W).grad_fn is None) == (jitted_foo(x, y, W).grad_fn is None)) + + def test_linear_grad(self): with enable_profiling_mode_for_profiling_tests(): def t(x: torch.Tensor, w: torch.Tensor, b: Optional[torch.Tensor]): @@ -11090,6 +11269,21 @@ def func(a): self.run_pass("erase_number_types", graph) FileCheck().check_not("int = prim::Constant").run(str(graph)) + def test_refine_tuple_types(self): + # TupleConstruct output type is not correct here. + graph_str = """ + graph(%a : Float(123), %b : Float(4, 5, 6)): + %c : (Tensor, Tensor) = prim::TupleConstruct(%a, %b) + return (%c) + """ + graph = parse_ir(graph_str) + torch._C._jit_pass_refine_tuple_types(graph) + + # After the pass, the output type should've been updated. + self.assertTrue('(Float(123), Float(4, 5, 6))' in str(graph.findNode('prim::TupleConstruct').output())) + + # TODO(henrytu): Add test for RefineTypes for NamedTuple when it's supported by IR parser. + def test_remove_dropout(self): weight_0_shape = (20, 5) weight_1_shape = (20, 20) @@ -13013,153 +13207,6 @@ def func(niter): self.checkScript(dedent(code), (101,)) - def test_pyop_exception_message(self): - class Foo(torch.jit.ScriptModule): - def __init__(self): - super(Foo, self).__init__() - self.conv = nn.Conv2d(1, 10, kernel_size=5) - - @torch.jit.script_method - def forward(self, x): - return self.conv(x) - foo = Foo() - # testing that the correct error message propagates - with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): - foo(torch.ones([123])) # wrong size - - def test_builtin_error_messsage(self): - with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): - @torch.jit.script - def close_match(x): - return x.masked_fill(True) - - with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " - "supported in TorchScript"): - @torch.jit.script - def unknown_op(x): - torch.set_anomaly_enabled(True) - return x - - def test_exceptions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - if bool(cond): - raise ValueError(3) - return 1 - ''') - - cu.foo(torch.tensor(0)) - with self.assertRaisesRegex(torch.jit.Error, "3"): - cu.foo(torch.tensor(1)) - - def foo(cond): - a = 3 - if bool(cond): - raise ArbitraryError(a, "hi") - if 1 == 2: - raise ArbitraryError - return a - - with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): - torch.jit.script(foo) - - def exception_as_value(): - a = Exception() - print(a) - - with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): - torch.jit.script(exception_as_value) - - @torch.jit.script - def foo_no_decl_always_throws(): - raise RuntimeError("Hi") - - # function that has no declared type but always throws set to None - output_type = next(foo_no_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "NoneType") - - @torch.jit.script - def foo_decl_always_throws(): - # type: () -> Tensor - raise Exception("Hi") - - output_type = next(foo_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "Tensor") - - def foo(): - raise 3 + 4 - - with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): - torch.jit.script(foo) - - # a escapes scope - @torch.jit.script - def foo(): - if 1 == 1: - a = 1 - else: - if 1 == 1: - raise Exception("Hi") - else: - 
raise Exception("Hi") - return a - self.assertEqual(foo(), 1) - - @torch.jit.script - def tuple_fn(): - raise RuntimeError("hello", "goodbye") - - with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): - tuple_fn() - - @torch.jit.script - def no_message(): - raise RuntimeError - - with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): - no_message() - - def test_assertions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - assert bool(cond), "hi" - return 0 - ''') - - cu.foo(torch.tensor(1)) - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - cu.foo(torch.tensor(0)) - - @torch.jit.script - def foo(cond): - assert bool(cond), "hi" - - foo(torch.tensor(1)) - # we don't currently validate the name of the exception - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - foo(torch.tensor(0)) - - def test_python_op_exception(self): - @torch.jit.ignore - def python_op(x): - raise Exception("bad!") - - @torch.jit.script - def fn(x): - return python_op(x) - - with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): - fn(torch.tensor(4)) - - def test_dict_expansion_raises_error(self): - def fn(self): - d = {"foo": 1, "bar": 2, "baz": 3} - return {**d} - - with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, - "Dict expansion "): - torch.jit.script(fn) - def test_module_parameters_and_buffers(self): weights = torch.randn(10, 10) bias = torch.randn(10) @@ -14911,6 +14958,12 @@ def forward(self, x): with self.assertRaisesRegex(Exception, "Overloads are not useable when a module"): a = torch.jit.script(W2()) + def test_narrow_copy(self): + def foo(a): + return a.narrow_copy(0, 0, 5) + + self.checkScript(foo, [torch.rand(10)]) + def test_select_after_chunk(self): def foo(x): chunked = torch.chunk(x, 1) @@ -15065,6 +15118,22 @@ def jit_multihead_attn_forward(query, # type: Tensor # print(jit_out / py_out - 1) self.assertEqual(jit_out, py_out, atol=5e-4, rtol=1e-4) + def test_torchscript_multi_head_attn_fast_path(self): + src_l = 3 + bsz = 5 + embed_size = 8 + nhead = 2 + multi_head_attn = torch.nn.MultiheadAttention(embed_size, nhead, batch_first=True) + multi_head_attn = multi_head_attn.eval() + + query = key = value = torch.rand((bsz, src_l, embed_size)) + + with torch.no_grad(): + py_out = multi_head_attn(query, key, value) + mha = torch.jit.script(multi_head_attn) + jit_out = mha(query, key, value) + torch.testing.assert_close(jit_out, py_out) + @unittest.skipIf(not RUN_CUDA, "no CUDA") def test_scriptmodule_multi_head_attn_cuda(self): @@ -15494,7 +15563,7 @@ def forward(self, x): self.assertEqual(m.int64_min, imported.int64_min) def test_script_scope(self): - scripted = torch.jit.script(torch.nn.functional.pad) + scripted = torch.jit.script(torch.nn.functional.triplet_margin_loss) @unittest.skipIf(IS_WINDOWS, "NYI: TemporaryFileName on Windows") def test_serialization_sharing(self): @@ -15895,7 +15964,7 @@ def foo(a): with self.assertRaisesRegex(RuntimeError, (r"Expected a value of type \'Tensor \(inferred\)\'" r"[\S\s]*Inferred \'a\' to be of type \'Tensor\'")): - foo(1) + foo("1") def test_type_comments_in_body(self): @torch.jit.script @@ -15918,6 +15987,13 @@ def __init__(self, torch.jit.script(M(2, 3)) + def test_input_keyword_in_schema(self): + def f(x): + return torch.ceil(input=x) + + inp = torch.randn(10) + self.checkScript(f, (inp, )) + def test_module_method_reassignment(self): class Foo(torch.nn.Module): def __init__(self): @@ -16207,59 +16283,6 @@ def 
test_nhwc_autocast_jit_trace_model(model, x): M = 10 S = 5 - -def add_nn_functional_test(name, self_size, args, variant_name='', check_ad=(), skipTestIf=(), - output_process_fn=lambda x: x, kwargs=None): - test_name = 'test_nn_' + name - - if variant_name != '': - test_name = test_name + '_' + variant_name - - no_grad = variant_name == 'inplace' - - @suppress_warnings - def do_test(self, name=name, args=args, test_name=test_name, check_ad=check_ad): - torch.manual_seed(2) - - self_variable = create_input((self_size,))[0][0] - - # need to record this because methods can change the size (e.g. unsqueeze) - args_variable, kwargs_variable = create_input(args, call_kwargs=kwargs) - - self_tensor = deepcopy(self_variable.data) - args_tensor = deepcopy(unpack_variables(args_variable)) - - if not no_grad: - output_variable = getattr(F, name)(self_variable, *args_variable, **kwargs_variable) - - def fn(*inputs, **kwargs): - return getattr(F, name)(*inputs, **kwargs) - - f_args_variable = (self_variable,) + args_variable - f_args_tensor = (self_tensor,) + args_tensor - should_autodiff_node, autodiff_nodes, fusible_nodes = normalize_check_ad(check_ad, name) - - if test_name not in EXCLUDE_SCRIPT: - def run_test(): - # XXX: this test should always run with disable_autodiff_subgraph_inlining(True), - # so that we don't regress on autodiff support. - with disable_autodiff_subgraph_inlining(): - script_fn = create_script_fn(self, name, 'nn_functional') - check_against_reference(self, script_fn, fn, output_process_fn, - f_args_variable, kwargs_variable, no_grad=no_grad) - # For tests we disabled AD subgraph inlining, make sure it's not falling back to autograd - if (doAutodiffCheck(test_name)): - self.assertAutodiffNode(script_fn.last_graph, should_autodiff_node, autodiff_nodes, fusible_nodes) - - if test_name in EXCLUDE_PYTHON_PRINT: - with torch._jit_internal._disable_emit_hooks(): - run_test() - else: - run_test() - - post_add_test(test_name, skipTestIf, do_test, TestJitGeneratedFunctional) - - def add_nn_module_test(*args, **kwargs): no_grad = False if 'no_grad' not in kwargs else kwargs['no_grad'] @@ -16410,10 +16433,6 @@ def test_version(self): # issue gh-32561 self.assertTrue(torch.__version__.startswith(torch.onnx.producer_version)) - -for test in nn_functional_tests: - add_nn_functional_test(*test) - for test in module_tests + new_module_tests + additional_module_tests: add_nn_module_test(**test) diff --git a/test/test_jit_autocast.py b/test/test_jit_autocast.py index cec8acfe7e85..6cb3c5645382 100644 --- a/test/test_jit_autocast.py +++ b/test/test_jit_autocast.py @@ -15,14 +15,15 @@ class TestAutocast(JitTestCase): def setUp(self): # common input tensors - self.a_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.b_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.c_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.d_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') - self.a_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') - self.b_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') - self.c_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') - self.d_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + if TEST_CUDA: + self.a_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') + self.b_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') + self.c_fp16 = torch.rand((2, 2), dtype=torch.float16, device='cuda') + self.d_fp16 = torch.rand((2, 2), dtype=torch.float16, 
device='cuda') + self.a_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + self.b_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + self.c_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') + self.d_fp32 = torch.rand((2, 2), dtype=torch.float32, device='cuda') self.old_value = torch._C._jit_set_autocast_mode(True) super().setUp() @@ -659,6 +660,95 @@ def forward(self, x, y): # isn't enabled self.assertRaises(RuntimeError, lambda: scripted_thing1.forward(x, y)) + @unittest.skipIf(not TEST_CUDA, "No cuda") + def test_jit_freeze_autocast_basic(self): + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + + def forward(self, x, y): + with torch.cuda.amp.autocast(): + return torch.mm(x, y) + + x = torch.rand((3, 4), dtype=torch.float).cuda() + y = torch.rand((4, 5), dtype=torch.float).cuda() + + mod = TestModule().eval() + + # sanity check + self._test_autocast(mod, "aten::_autocast_to_reduced_precision", x, y) + + frozen_mod = torch.jit.freeze(torch.jit.script(mod).eval()) + FileCheck().check_count("aten::_autocast_to_reduced_precision", 2, True).run(frozen_mod.graph) + + # make sure that the runtime pass doesn't duplicate autocast nodes + frozen_mod(x, y) + optimized_graph = frozen_mod.graph_for(x, y) + FileCheck().check_count("aten::_autocast_to_reduced_precision", 2, True).run(optimized_graph) + + @unittest.skipIf(not TEST_CUDA, "No cuda") + def test_jit_freeze_autocast_constants(self): + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.x = torch.rand((3, 4), dtype=torch.float).cuda() + + def forward(self, y): + with torch.cuda.amp.autocast(): + return torch.mm(self.x, y) + + y = torch.rand((4, 5), dtype=torch.float).cuda() + mod = TestModule().eval() + + frozen_mod = torch.jit.freeze(torch.jit.script(mod).eval()) + # freezing should pre-cast the constant self.x to remove one autocast call + FileCheck().check_count("aten::_autocast_to_reduced_precision", 1, True).run(frozen_mod.graph) + + # the runtime autocasting pass will re-insert the second autocast call, + # but constant propagation will merge it with the constant that it's casting. 
+ frozen_mod(y) + optimized_graph = frozen_mod.graph_for(y) + FileCheck().check_count("aten::_autocast_to_reduced_precision", 1, True).run(optimized_graph) + + @unittest.skipIf(TEST_CUDA, "CPU-only test") + def test_jit_autocast_softmax_cpu(self): + def fn(x): + with torch.cpu.amp.autocast(): + return torch.nn.functional.softmax(x, dim=0) + + fn_s = torch.jit.script(fn) + x = torch.rand((2, 2), dtype=torch.bfloat16) + fn_s(x) + y = fn_s(x) + + self.assertTrue(y.dtype == torch.bfloat16) + + @unittest.skipIf(not TEST_CUDA, "No cuda") + def test_jit_autocast_softmax_gpu(self): + def fn(x): + with torch.cuda.amp.autocast(): + return torch.nn.functional.softmax(x, dim=0) + + fn_s = torch.jit.script(fn) + x = torch.rand((2, 2), dtype=torch.half).cuda() + fn_s(x) + y = fn_s(x) + + self.assertTrue(y.dtype == torch.float) + + def test_ignore_amp(self): + @torch.jit.script + def foo(x): + return torch.mm(x, x) + + inp = torch.rand([10, 10], dtype=torch.float) + foo._set_ignore_amp(True) + with torch.cpu.amp.autocast(): + foo(inp) + foo(inp) + + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_not("_autocast_to_reduced").run(g) if __name__ == "__main__": run_tests() diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index c03ff0b3119a..3926b081d7c9 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -1,34 +1,48 @@ # Owner(s): ["oncall: jit"] +import contextlib import unittest import os import random +import enum +import copy +from functools import reduce +import operator +import warnings import torch from torch.nn import functional +from torch.profiler import profile, ProfilerActivity -from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR # TEST_WITH_ROCM -from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.codegen.random_topo_test import runDefaultTestWithSeed +from torch.testing._internal.common_cuda import TEST_MULTIGPU +from torch.testing._internal.common_device_type import instantiate_device_type_tests, ops, OpDTypes +from torch.testing._internal.common_jit import JitCommonTestCase +from torch.testing._internal.common_methods_invocations import op_db, SampleInput +from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, TEST_WITH_ROCM, slowTest, \ + is_iterable_of_tensors, freeze_rng_state +from torch.testing._internal.jit_utils import clone_inputs, get_traced_sample_variant_pairs, JitTestCase, RUN_CUDA +from torch.testing._internal.jit_metaprogramming_utils import create_traced_fn from torch.testing import FileCheck -from test_jit import JitTestCase, RUN_CUDA - from jit.test_fuser_common import TestFuserCommon # noqa: F401 import itertools import numpy as np import math +from torch.autograd.gradcheck import gradcheck + from typing import List -CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')) +RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM +CUDA_MAJOR, CUDA_MINOR = 0, 0 + +if RUN_NVFUSER and torch.version.cuda is not None: + CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2]) -os.environ['PYTORCH_NVFUSER_DISABLE_FALLBACK'] = '1' -os.environ['PYTORCH_NVFUSER_DISABLE_FMA'] = '1' -os.environ['PYTORCH_NVFUSER_DISABLE_FASTMATH'] = '1' +os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,unroll_with_rng' os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0' -os.environ['PYTORCH_NVFUSER_DISABLE_RNG_UNROLL'] = '1' if GRAPH_EXECUTOR == ProfilingMode.PROFILING: 
torch._C._jit_set_texpr_fuser_enabled(False) @@ -37,8 +51,9 @@ FUSION_GROUP = 'prim::CudaFusionGroup' FUSION_GUARD = 'prim::CudaFusionGuard' +# TODO: revert disabled alias ops +ALIAS_TEST_DISABLED = True -import contextlib @contextlib.contextmanager def nvfuser_singleton_fusion(flag): @@ -57,37 +72,39 @@ def nvfuser_horizontal_fusion(flag): torch._C._jit_set_nvfuser_horizontal_mode(old_value) def is_pre_volta(): + if not RUN_NVFUSER: + return False prop = torch.cuda.get_device_properties(torch.cuda.current_device()) return prop.major < 7 -TEST_BF16 = torch.cuda.is_bf16_supported() +TEST_BF16 = RUN_NVFUSER and torch.cuda.is_bf16_supported() -class TestCudaFuser(JitTestCase): +class CudaFuserTestOptions(): + def __init__(self): + self.old_cpu_fuse = torch._C._jit_can_fuse_on_cpu() + self.old_gpu_fuse = torch._C._jit_can_fuse_on_gpu() + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + self.old_guard = torch._C._jit_set_nvfuser_guard_mode(False) + torch._C._debug_set_autodiff_subgraph_inlining(False) + self.old_value = torch._C._jit_set_autocast_mode(True) + + if(RUN_CUDA): + self.old_nvfuser = torch._C._jit_set_nvfuser_enabled(True) + + def restore(self): + if(RUN_CUDA): + torch._C._jit_set_nvfuser_enabled(self.old_nvfuser) + torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuse) + torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuse) + torch._C._jit_set_nvfuser_guard_mode(self.old_guard) + torch._C._debug_set_autodiff_subgraph_inlining(True) + torch._C._jit_set_autocast_mode(self.old_value) - special_values = torch.tensor( - [float("-inf"), -10, -math.pi, - -1, -0.5, 0, 1, 0.5, - math.pi, 10, float("inf"), - float("nan")], dtype=torch.float, device='cuda') - - int_types = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64 - ] - - support_tensor_dtypes = [ - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool - ] - if TEST_BF16: - support_tensor_dtypes.append(torch.bfloat16) +class TestCudaFuser(JitTestCase): + def assertEqual(self, *args, **kwargs): + kwargs["exact_layout"] = True + super(JitTestCase, self).assertEqual(*args, **kwargs) def _getSubgraphInFusion(self, graph): num_node = 0 @@ -108,28 +125,60 @@ def count(block, ret): def setUp(self): super(TestCudaFuser, self).setUp() - self.old_cpu_fuse = torch._C._jit_can_fuse_on_cpu() - self.old_gpu_fuse = torch._C._jit_can_fuse_on_gpu() - torch._C._jit_override_can_fuse_on_cpu(False) - torch._C._jit_override_can_fuse_on_gpu(False) - self.old_guard = torch._C._jit_set_nvfuser_guard_mode(False) - torch._C._debug_set_autodiff_subgraph_inlining(False) - self.old_value = torch._C._jit_set_autocast_mode(True) - if(RUN_CUDA): - self.old_nvfuser = torch._C._jit_set_nvfuser_enabled(True) + self.skip_node_list = [] + disabled_ops = ("aten::batch_norm", + "aten::_batch_norm_impl_index", + "aten::_batch_norm_impl_index_backward", + "aten::native_batch_norm_backward") + for op in disabled_ops: + disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False) + if disabled_flag: + torch._C._jit_set_nvfuser_skip_node_kind(op, True) + self.skip_node_list.append(op) + + # cpu backup to avoid errors in case this is run on a CPU-only machine + dev = 'cuda' if RUN_NVFUSER else 'cpu' + self.special_values = torch.tensor( + [float("-inf"), -10, -math.pi, + -1, -0.5, 0, 1, 0.5, + math.pi, 10, float("inf"), + float("nan")], dtype=torch.float, device=dev) + + self.int_types = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + 
torch.int64 + ] + + self.support_tensor_dtypes = [ + torch.int32, + torch.int64, + torch.float16, + torch.float32, + torch.float64, + torch.bool + ] + if TEST_BF16: + self.support_tensor_dtypes.append(torch.bfloat16) + + if(RUN_NVFUSER): + self.cuda_fuser_options = CudaFuserTestOptions() def tearDown(self): - if(RUN_CUDA): - torch._C._jit_set_nvfuser_enabled(self.old_nvfuser) - torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuse) - torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuse) - torch._C._jit_set_nvfuser_guard_mode(self.old_guard) - torch._C._debug_set_autodiff_subgraph_inlining(True) - torch._C._jit_set_autocast_mode(self.old_value) + # restoring skip node to the configuration before tests + for op in self.skip_node_list: + disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False) + if not disabled_flag: + torch._C._jit_set_nvfuser_skip_node_kind(op, True) + + if(RUN_NVFUSER): + self.cuda_fuser_options.restore() super(TestCudaFuser, self).tearDown() - def _run_helper(self, jit_op, op, *args): + def _run_helper(self, jit_op, op, *args, check_stride=False, num_fusion=1): torch.cuda.manual_seed_all(123) jit_o = jit_op(*args) torch.cuda.manual_seed_all(123) @@ -138,7 +187,9 @@ def _run_helper(self, jit_op, op, *args): o = op(*args) self.assertEqual(o.dtype, jit_o.dtype) self.assertEqual(o, jit_o) - self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, 1, consider_subgraphs=True) + if check_stride: + self.assertEqual(o.stride(), jit_o.stride()) + self.assertGraphContainsExactly(jit_op.graph_for(*args), FUSION_GUARD, num_fusion, consider_subgraphs=True) def _run_training_helper(self, jit_op, op, grads, *args): torch.cuda.manual_seed_all(123) @@ -162,7 +213,7 @@ def _run_training_helper(self, jit_op, op, grads, *args): )[0].graph self.assertGraphContainsExactly(bwd_graph, FUSION_GUARD, 1, consider_subgraphs=True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_half(self): @@ -187,8 +238,9 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float): self.assertEqual(oo, jit_oo) self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD) + @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_bfloat(self): @@ -213,7 +265,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: float): self.assertEqual(oo, jit_oo) self.assertGraphContains(t_jit.graph_for(x, y, z, alpha), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_const(self): @@ -230,7 +282,7 @@ def t(x, y): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_chunk(self): @@ -254,14 +306,14 @@ def t(x, y, z, q): self.assertGraphContains(t_jit.graph_for(x, y, z, q), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), 
"reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_dtypes_axis(self): - for op in [torch.sum, torch.mean, torch.amax]: + for op in [torch.sum, torch.mean, torch.amax, torch.var, torch.std]: for dtype in [torch.float16, torch.float32, torch.double]: - for axis in [-1, 2]: + for axis in [-1, 2, 0]: def make_func(op): def func(x: torch.Tensor): o = torch.mul(x, 2.0) @@ -279,7 +331,34 @@ def func(x: torch.Tensor): self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_variance(self): + + for op in [torch.var, torch.std]: + for dtype in [torch.float16, torch.float32, torch.double]: + for axis in [-2, -1, 2, 1]: + for unbiased in [False, True]: + def make_func(op): + def func(x: torch.Tensor): + o = torch.mul(x, 2.0) + o = op(o, dim=[axis]) + return o + return func + + x = torch.randn(8, 4, 16, dtype=dtype, device="cuda") + t = make_func(op) + t_jit = torch.jit.trace(t, x) + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_scalar_input(self): @@ -297,7 +376,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_0(self): @@ -316,7 +395,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0)) self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_1(self): @@ -335,7 +414,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0)) self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_2(self): @@ -354,7 +433,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, 2.0)) self.assertGraphContainsExactly(subgraph, 'aten::add', 2, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + 
@unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_3(self): @@ -376,7 +455,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): # test_broadcasting_partition_logic_X # Testing partition logic that is capable to avoid creating unsupported # broadcasting semantics in CudaFusionGroup - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_partition_logic_0(self): @@ -398,7 +477,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): subgraph = self._getSubgraphInFusion(t_jit.graph_for(x, y, z)) self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_partition_logic_1(self): @@ -421,7 +500,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContainsExactly(subgraph, 'aten::add', 4, consider_subgraphs=False) @unittest.skipIf(True, "Broadcast with different output not supported yet") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_multiple_output_shape(self): @@ -443,7 +522,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) @unittest.skipIf(True, "broadcast on branches can't be resolved yet") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_broadcasting_multiple_output(self): @@ -465,21 +544,25 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) def _unary_test_helper(self, operation, dtype, random_data): - shape = (4, 8, 32, 32) + gradient_check = (dtype == torch.float64) and random_data + shape = self.special_values.shape + torch.cuda.manual_seed_all(211) # need additional def of t for boolean ops def t(x: torch.Tensor, y: torch.Tensor): o = x * y + o = o + 5e-3 o = operation(o) return o - y = torch.tensor([1], device="cuda").to(dtype) + y = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check) + y = y.to(dtype=dtype) if random_data: - x = torch.randn(shape, dtype=torch.float32, device="cuda") + x = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check) if dtype in self.int_types: # prefer a larger variance for integer types - x *= 5 + x = x * 5 x = x.to(dtype=dtype) else: x = self.special_values.to(dtype=dtype) @@ -491,16 +574,28 @@ def t(x: torch.Tensor, y: torch.Tensor): t_jit = torch.jit.script(t) jit_o = t_jit(x, y) jit_o = t_jit(x, y) - if dtype in self.support_tensor_dtypes: + jit_o = t_jit(x, y) + if gradient_check: + if jit_o.dtype != torch.bool: + # bool dtype has no `-` + gradcheck(t_jit, [x, y], nondet_tol=1e-5) + elif dtype in self.support_tensor_dtypes: self.assertGraphContains(t_jit.graph_for(x, y), 
FUSION_GUARD) o = t(x, y) self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o, msg=f""" - failing case: - {dtype} {operation} {x} - """) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + if dtype == torch.bfloat16: + # compare with the actual ground truth for + # bfloat16 kernels instead of eager mode + # implementation, since mismatch in cast + # adds excessive noise. + o = t(x.to(torch.float64), y.to(torch.float64)).to(torch.bfloat16) + else: + o = t(x, y) + + self.assertTrue(self._compare("failing case {}\n{}\n{}\n{}".format(dtype, operation, x, y), o, jit_o, 1e-2)) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_unary_ops(self): @@ -539,6 +634,12 @@ def test_unary_ops(self): torch.trunc, torch.frac, torch.reciprocal, + torch.isfinite, + torch.isinf, + torch.isnan, + torch.isneginf, + torch.isposinf, + torch.isreal, torch.nn.functional.softplus, torch.nn.functional.gelu, torch.relu, @@ -551,7 +652,7 @@ def test_unary_ops(self): self._unary_test_helper(op, dtype, False) # test special numbers self._unary_test_helper(op, dtype, True) # test random data - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_category_rule(self): @@ -611,7 +712,7 @@ def t(x: torch.Tensor, z: float): z = torch.tensor(3., dtype=torch.double) run_scalar(x, z) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_unary_bitwise(self): @@ -620,8 +721,8 @@ def bit_not(x: torch.Tensor): jitted = torch.jit.script(bit_not) x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long) - jit_o = bit_not(x) - jit_o = bit_not(x) + jit_o = jitted(x) + jit_o = jitted(x) o = bit_not(x) self.assertEqual(o, jit_o) jitted.graph_for(x) # Shows up in second instance, not first @@ -633,48 +734,180 @@ def bool_not(x: torch.Tensor, y: torch.Tensor): jitted = torch.jit.script(bool_not) x = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool) y = torch.rand(4, 8, 32, 32, dtype=torch.float, device="cuda").round().to(torch.bool) - jit_o = bool_not(x, y) - jit_o = bool_not(x, y) + jit_o = jitted(x, y) + jit_o = jitted(x, y) o = bool_not(x, y) self.assertEqual(o, jit_o) jitted.graph_for(x, y) # Shows up in second instance, not first self.assertGraphContains(jitted.graph_for(x, y), FUSION_GUARD) - def _binary_test_helper(self, operation, dtypes, random_data): + def _get_scalar_binary_test_fn(self, category_and_type1, category_and_type2, operation): + category1, dtype_arg1 = category_and_type1 + category2, dtype_arg2 = category_and_type2 + + def t_intx_tensory(x: int, y: torch.Tensor): + o = operation(x, y) + o = 2 + o + return o + + def t_doublex_tensory(x: float, y: torch.Tensor): + o = operation(x, y) + o = 2 + o + return o + # Omit both scalar cases and swap cases + assert category1 == "scalar" and category2 != "scalar" + if dtype_arg1.is_floating_point: + return t_doublex_tensory + if dtype_arg1 == torch.int64 or dtype_arg1 == torch.int32: + return t_intx_tensory + raise NotImplementedError + + def _binary_test_helper(self, operation, dtypes, random_data, categories="ndim"): if isinstance(dtypes, tuple): 
dtype_arg1, dtype_arg2 = dtypes else: dtype_arg1 = dtype_arg2 = dtypes + if isinstance(categories, tuple) and random_data: + category1, category2 = categories + elif not random_data: + category1 = category2 = "ndim" + else: + category1 = category2 = categories + + def is_cpu_category(x): + return x == "0dimcpu" or x == "scalar" + + # skip unsupported cases + if is_cpu_category(category1) and is_cpu_category(category2): + return + + # only test cases with first operand as scalar + if category2 == "scalar": + return + + # skip ops that doesn't support scalar inputs in eager + if operation in [ + torch.atan2, + torch.max, + torch.min, + torch.remainder, # unsupported in nvfuser + ]: + if category1 == "scalar" or category2 == "scalar": + return + + if operation in [ + torch.fmod, + torch.eq, + torch.ne, + torch.ge, + torch.gt, + torch.le, + torch.lt + ]: + if category1 == "scalar": + return + + # operators that does not support bfloat16 + if operation in [torch.fmod]: + if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16: + return + def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): o = operation(x, y) o = o + z return o shape = (4, 32, 32) + + shapex = shape if category1 == "ndim" else () + shapey = shape if category2 == "ndim" else () + if random_data: - x = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg1) - y = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg2) + x = (torch.randn(shapex, dtype=torch.float, device="cuda") * 5).to(dtype_arg1) + y = (torch.randn(shapey, dtype=torch.float, device="cuda") * 5).to(dtype_arg2) else: x = self.special_values.to(dtype=dtype_arg1) y = (torch.rand_like(self.special_values) * 5).to(dtype_arg2) + + r""" + Category conversion + """ + has_scalar = False + if category1 == "scalar": + has_scalar = True + x = x.item() + + if category1 == "0dimcpu": + x = x.to(device="cpu") + + if category2 == "scalar": + has_scalar = True + y = y.item() + + if category2 == "0dimcpu": + y = y.to(device="cpu") + z = torch.tensor([2], device="cuda").to(dtype_arg1) + is_dtype_arg1_int = dtype_arg1 == torch.int32 or dtype_arg1 == torch.int64 + is_dtype_arg2_int = dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64 + + if operation in [torch.pow]: + if is_dtype_arg1_int and is_dtype_arg2_int: + if category2 == "scalar": + # RuntimeError: Integers to negative integer powers are not allowed + y = abs(y) + if category2 == "0dimcpu" and y == -1: + # https://github.com/pytorch/pytorch/issues/73196 + y = y - 1 + if category2 == "0dimcpu" and y == -2: + # avoid pow(0, -2), which gives inconsistent results on integer tensor + y = y - 1 # Avoid division by zero for integer tensors div_like = [torch.div, torch.fmod, torch.remainder] if operation in div_like and (dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64): y[y == 0] = 1 - o = t(x, y, z) - t_jit = torch.jit.script(t) - jit_o = t_jit(x, y, z) - jit_o = t_jit(x, y, z) + test_value = True + if dtype_arg1 == torch.half or dtype_arg2 == torch.half: + test_value = False + if dtype_arg1 == torch.bfloat16 or dtype_arg2 == torch.bfloat16: + test_value = False - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + try: + if not has_scalar: + o = t(x, y, z) + t_jit = torch.jit.script(t) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) + + self.assertEqual(o.dtype, jit_o.dtype) + if test_value: + self.assertEqual(o, jit_o) + 
self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + + elif category2 != "scalar": # only test the case where first is scalar + test_fn = self._get_scalar_binary_test_fn((category1, dtype_arg1), (category2, dtype_arg2), operation) + o = test_fn(x, y) + t_jit = torch.jit.script(test_fn) + jit_o = t_jit(x, y) + jit_o = t_jit(x, y) + jit_o = t_jit(x, y) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + self.assertEqual(o.dtype, jit_o.dtype) + if test_value: + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + except Exception as e: + print("failing test for op: ", operation.__name__) + print("with input\n\tx: ", x) + print("\ty: ", y) + print("\tz: ", z) + raise e + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_ops(self): @@ -682,14 +915,12 @@ def test_binary_ops(self): data_types = [ torch.int32, torch.int64, - # torch.float16, + torch.float16, torch.float32, torch.float64 ] - ''' if TEST_BF16: data_types.append(torch.bfloat16) - ''' operations = [torch.mul, torch.div, torch.atan2, @@ -704,59 +935,60 @@ def test_binary_ops(self): torch.gt, torch.le, torch.lt] - binary_dtype_combinations = itertools.combinations(data_types, 2) + + category_types = [ + "scalar", + "0dim", + "0dimcpu", + "ndim" + ] + + binary_dtype_combinations = list(itertools.combinations(data_types, 2)) + category_combinations = list(itertools.combinations(category_types, 2)) + + for op, dtypes, categories in itertools.product(operations, binary_dtype_combinations, category_combinations): + self._binary_test_helper(op, dtypes, True, categories) # random data + for op, dtypes in itertools.product(operations, binary_dtype_combinations): - self._binary_test_helper(op, dtypes, True) # random data self._binary_test_helper(op, dtypes, False) # special numbers - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_bitwise(self): - def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) | z - - def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) ^ z + dtypes = [torch.bool, torch.int32, torch.int64] - def jit_lshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) << z + for dtype1, dtype2, dtype3 in itertools.product(dtypes, repeat=3): + def jit_and(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_and(x, y) & z - def jit_rshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) >> z + def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_or(x, y) | z - for jit_func in [jit_or, jit_xor, jit_lshift, jit_rshift]: - x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long) - y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(torch.long) - z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(2).to(torch.long) - - jitted = torch.jit.script(jit_func) - jit_o = jitted(x, y, z) - jit_o = jitted(x, y, z) - o = jit_func(x, y, z) - self.assertEqual(o, jit_o) - self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD) + def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_xor(x, y) ^ z - # We shouldn't need this redefinition of 
the function, but otherwise it won't recompile for a new type - def jit_or(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) | z + def jit_lshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_left_shift(x, y) << z - def jit_xor(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): - return (x & y) ^ z + def jit_rshift(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): + return torch.bitwise_right_shift(x, y) >> z - for jit_func in [jit_or, jit_xor]: - x = torch.rand(4, 2, dtype=torch.float, device="cuda").round().to(torch.bool) - y = torch.rand(4, 2, dtype=torch.float, device="cuda").round().to(torch.bool) - z = torch.rand(4, 2, dtype=torch.float, device="cuda").round().to(torch.bool) + for jit_func in [jit_and, jit_or, jit_xor, jit_lshift, jit_rshift]: + if torch.bool in {dtype1, dtype2, dtype3} and jit_func in {jit_lshift, jit_rshift}: + continue + x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype1) + y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(5).to(dtype2) + z = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda").mul(2).to(dtype3) - jitted = torch.jit.script(jit_func) - jit_o = jitted(x, y, z) - jit_o = jitted(x, y, z) - o = jit_func(x, y, z) - self.assertEqual(o, jit_o) - self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD) + jitted = torch.jit.script(jit_func) + jit_o = jitted(x, y, z) + jit_o = jitted(x, y, z) + o = jit_func(x, y, z) + self.assertEqual(o, jit_o) + self.assertGraphContains(jitted.graph_for(x, y, z), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_type_as_op(self): @@ -813,7 +1045,7 @@ def threshold(x: torch.Tensor, th: int, val: int): threshold_jit = torch.jit.script(threshold) self._run_helper(threshold_jit, threshold, x, arg2, arg3) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_ternary_ops_integer_compatibility(self): @@ -866,7 +1098,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, alpha: torch.Tensor): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_ternary_ops_type_promotion(self): @@ -887,7 +1119,22 @@ def test_ternary_ops_type_promotion(self): self._ternary_test_helper(op, dtypes, True) # random data self._ternary_test_helper(op, dtypes, False) # special numbers - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + # We can't test the scalar version of rsub from python + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") + def test_rsub(self): + x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") + y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") + + def rsub(x: torch.Tensor, y: torch.Tensor): + o = torch.rsub(x, y) + o = o * 2. 
+ return o + + rsub_jit = torch.jit.script(rsub) + self._run_helper(rsub_jit, rsub, x, y) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") # legacy fuser does not work for rand_like, see issue #34361 @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_ternary_ops(self): @@ -939,7 +1186,7 @@ def lerp_scale(x: torch.Tensor, y: torch.Tensor, z: float): lerp_scale_jit = torch.jit.script(lerp_scale) self._run_helper(lerp_scale_jit, lerp_scale, x, y, 0.5) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser") def test_addcmul_ops(self): x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") @@ -967,7 +1214,7 @@ def addcmul_const_alpha(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): addcmul_const_alpha_jit = torch.jit.script(addcmul_const_alpha) self._run_helper(addcmul_const_alpha_jit, addcmul_const_alpha, x, y, z) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dynamic_size(self): @@ -1007,7 +1254,9 @@ def t(x: torch.Tensor, y: torch.Tensor, z: float): self.assertGraphContains(t_jit.graph_for(x, y, 2.0), FUSION_GUARD) torch._C._jit_set_nvfuser_guard_mode(old_guard) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") def test_random_topo(self): os.environ["PYTORCH_NVFUSER_DISABLE_FALLBACK"] = "1" self.assertTrue(runDefaultTestWithSeed(28449)) @@ -1015,7 +1264,7 @@ def test_random_topo(self): def _compare(self, desc, inp1, inp2, error): a = inp1.clone() b = inp2.clone() - close = torch.allclose(a, b, rtol=error, atol=error) + close = torch.allclose(a, b, rtol=error, atol=error, equal_nan=True) if not close: print(desc, close) z = a - b @@ -1056,7 +1305,7 @@ def t(x: torch.Tensor, y: torch.Tensor): # we are testing inputs with all combination of permutation order, just to # ensure that integration would be able to generate functionally correct # kernels - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_ops_permutation(self): @@ -1070,7 +1319,7 @@ def test_binary_ops_permutation(self): x = [7, 8, 12] self._permutation_helper(x, b_axis, torch.float32, "cuda", perm0, perm1) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_binary_ops_channels_last_with_bcast(self): @@ -1121,7 +1370,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction(self): @@ -1171,7 +1420,7 @@ def _layer_norm_autodiff_helper(self, 
model, grad, shapes, args): FileCheck().check(FUSION_GUARD).run(v2.graph) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_layer_norm_autodiff(self): @@ -1213,7 +1462,7 @@ def t(shapes: List[int], x, eps: float, cudnn: bool): self._layer_norm_autodiff_helper(m, grad, shapes, args) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_layer_norm_parser(self): @@ -1272,9 +1521,8 @@ def forward(self, x: torch.Tensor): self.assertTrue(self._compare("comparing rstd failed", rstd, jit_rstd, error)) self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - @unittest.skipIf(True, "codegen failure awaiting fix") @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_native_layer_norm(self): @@ -1287,9 +1535,8 @@ def test_native_layer_norm(self): norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)] self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine) - @unittest.skipIf(True, "codegen failure awaiting fix") @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_native_layer_norm_half(self): @@ -1302,7 +1549,7 @@ def test_native_layer_norm_half(self): self._native_layer_norm_helper(input_shape, norm_shape, torch.float16, "cuda", 5e-3) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -1315,7 +1562,15 @@ def test_native_layer_norm_bfloat(self): norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)] self._native_layer_norm_helper(input_shape, norm_shape, torch.bfloat16, "cuda", 1e-1) - def _norm_helper(self, shape, dtype, device, error, is_batch_norm_else_instance_norm, memory_format=torch.contiguous_format): + def _norm_helper(self, + shape, + dtype, + device, + error, + is_batch_norm_else_instance_norm, + memory_format=torch.contiguous_format, + *, + layer_dtype=torch.float32): class MyBatchNorm(torch.nn.Module): def __init__(self): super(MyBatchNorm, self).__init__() @@ -1337,8 +1592,8 @@ def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): t = MyBatchNorm() if is_batch_norm_else_instance_norm else MyInstanceNorm() x = torch.randn(shape, dtype=dtype, device=device).to(memory_format=memory_format) - running_mean = torch.zeros(shape[1], dtype=torch.float32, device=device) - running_var = 
torch.ones(shape[1], dtype=torch.float32, device=device) + running_mean = torch.zeros(shape[1], dtype=layer_dtype, device=device) + running_var = torch.ones(shape[1], dtype=layer_dtype, device=device) t_jit = torch.jit.script(t) eager_running_mean = running_mean.clone() @@ -1363,7 +1618,38 @@ def forward(self, x: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, running_mean, running_var), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_layer_norm_trivial_reduce_dim(self): + def t_wb(shapes: List[int], x, w, b, eps: float, cudnn: bool): + o = torch.layer_norm(x, shapes, w, b, eps, cudnn) + o = torch.relu(o) + return o + + batch = [1] + shapes = [2, 7, 3] + + grad = torch.randn(batch + shapes, dtype=torch.float32, device="cuda") + args = [torch.randn(batch + shapes, dtype=torch.float32, device="cuda").requires_grad_()] + args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_()) + args.append(torch.randn(shapes, dtype=torch.float32, device="cuda").requires_grad_()) + self._layer_norm_autodiff_helper(t_wb, grad, shapes, args) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_norm_half_layer(self): + size = [2, 4, 2, 2] + + for is_batch_norm_else_instance_norm in [False, True]: + for mf in [torch.channels_last, torch.contiguous_format]: + self._norm_helper(size, torch.float16, "cuda", 1e-3, is_batch_norm_else_instance_norm, + memory_format=mf, layer_dtype=torch.float16) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_norm_channels_last(self): @@ -1375,7 +1661,7 @@ def test_norm_channels_last(self): self._norm_helper(size, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm, memory_format=mf) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_norm(self): @@ -1392,7 +1678,7 @@ def test_norm(self): self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_norm_large(self): @@ -1408,7 +1694,7 @@ def test_norm_large(self): self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, 
"Requires fusion optimization pass to be effective") def test_norm_half(self): @@ -1425,7 +1711,7 @@ def test_norm_half(self): self._norm_helper(x, torch.float16, "cuda", 5e-3, is_batch_norm_else_instance_norm) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -1442,7 +1728,7 @@ def test_norm_bfloat(self): x[1] = C self._norm_helper(x, torch.bfloat16, "cuda", 1e-1, is_batch_norm_else_instance_norm) - def _softmax_helper(self, shape, reduction_axis, dtype, device, error): + def _softmax_helper(self, shape, reduction_axis, is_log_softmax, dtype, device, error): class MySoftmax(torch.nn.Module): __constants__ = ['reduction_axis'] @@ -1455,22 +1741,40 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): o = torch.nn.functional.softmax(o, dim=self.reduction_axis) return o - t = MySoftmax() + class MyLogSoftmax(torch.nn.Module): + __constants__ = ['reduction_axis'] - x = torch.randn(shape, dtype=dtype, device=device) - y = torch.randn(shape, dtype=dtype, device=device) + def __init__(self): + super(MyLogSoftmax, self).__init__() + self.reduction_axis = reduction_axis + + def forward(self, x: torch.Tensor, y: torch.Tensor): + o = torch.add(x, y) + o = torch.nn.functional.log_softmax(o, dim=self.reduction_axis) + return o + + gradient_check = (dtype == torch.float64) + t = MyLogSoftmax() if is_log_softmax else MySoftmax() + + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check) + y = torch.randn(shape, dtype=dtype, device=device, requires_grad=gradient_check) t_jit = torch.jit.script(t) jit_o = t_jit(x, y) jit_o = t_jit(x, y) - o = t(x, y) - self.assertEqual(o.dtype, jit_o.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(o, jit_o)` - self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) - self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + jit_o = t_jit(x, y) + + if gradient_check: + gradcheck(t_jit.forward, [x, y], nondet_tol=1e-5) + else: + o = t(x, y) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. 
+ # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softmax_dtype(self): @@ -1512,7 +1816,7 @@ def t(x: torch.Tensor, y: torch.Tensor): FileCheck().check(FUSION_GUARD).run(bwd_graph) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test__softmax_function(self): @@ -1536,7 +1840,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test__softmax_function_half_to_float(self): @@ -1560,7 +1864,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 1, consider_subgraphs=True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softmax(self): @@ -1569,14 +1873,21 @@ def test_softmax(self): output_size = int(pow(output_size, 1. 
/ dims)) reduction_sizes = [67, 256, 1024, 4096] + # gradient check + for reduction_dim in range(dims): + for is_log_softmax in [False, True]: + shape = [output_size for idx in range(dims)] + self._softmax_helper(shape, reduction_dim, is_log_softmax, torch.float64, "cuda", 1e-4) + for reduction_dim in range(dims): for reduction_size in reduction_sizes: x = [output_size for idx in range(dims)] x[reduction_dim] = reduction_size - self._softmax_helper(x, reduction_dim, torch.float32, "cuda", 1e-4) + for is_log_softmax in [False, True]: + self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float32, "cuda", 1e-4) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softmax_half(self): @@ -1589,10 +1900,11 @@ def test_softmax_half(self): for reduction_size in reduction_sizes: x = [output_size for idx in range(dims)] x[reduction_dim] = reduction_size - self._softmax_helper(x, reduction_dim, torch.float16, "cuda", 5e-3) + for is_log_softmax in [False, True]: + self._softmax_helper(x, reduction_dim, is_log_softmax, torch.float16, "cuda", 5e-3) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -1606,10 +1918,11 @@ def test_softmax_bfloat(self): for reduction_size in reduction_sizes: x = [output_size for idx in range(dims)] x[reduction_dim] = reduction_size - self._softmax_helper(x, reduction_dim, torch.bfloat16, "cuda", 1e-1) + for is_log_softmax in [False, True]: + self._softmax_helper(x, reduction_dim, is_log_softmax, torch.bfloat16, "cuda", 1e-1) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_permutation(self): @@ -1623,7 +1936,7 @@ def test_reduction_permutation(self): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_multiple_output(self): @@ -1662,7 +1975,7 @@ def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GUARD) torch._C._jit_set_nvfuser_guard_mode(old_guard) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_channels_last_with_broadcast(self): @@ -1768,7 +2081,7 @@ def t(x: torch.Tensor, y: torch.Tensor): ''' @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, 
"requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_pw_single_reduction_partition(self): @@ -1793,62 +2106,118 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_permutation_preservation(self): sizes = [2, 3, 4, 5] dtype = torch.float device = "cuda" - x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) - def t(x: torch.Tensor): - o = torch.relu(x) - o = torch.sum(o, dim=[0]) - return o - t_jit = torch.jit.script(t) - jit_o = t_jit(x) - jit_o = t_jit(x) - o = t(x) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - # TODO: we could preserve permutation to inputs - self.assertEqual(o.stride(), jit_o.stride()) + with nvfuser_singleton_fusion(True): - def t(x: torch.Tensor): - o = torch.relu(x) - o = torch.add(o, 1.0) - return o + def t(x: torch.Tensor): + return torch.relu(x) - t_jit = torch.jit.script(t) - jit_o = t_jit(x) - jit_o = t_jit(x) - o = t(x) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - self.assertTrue(jit_o.is_contiguous(memory_format=torch.channels_last)) + t_jit = torch.jit.script(t) + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + self._run_helper(t_jit, t, x, check_stride=True) - @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + def t(x: torch.Tensor, y: torch.Tensor): + return torch.add(x, y) + + t_jit = torch.jit.script(t) + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + y = torch.randn(sizes[1:], dtype=dtype, device=device) + self._run_helper(t_jit, t, x, y, check_stride=True) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") - def test_normalization_partition(self): - sizes = [3, 8, 5] + def test_permutation_preservation_edge_case_0(self): + sizes = [2, 3, 4, 5] dtype = torch.float device = "cuda" - x = torch.randn(sizes, dtype=dtype, device=device) - y = torch.randn(sizes, dtype=dtype, device=device) - z = torch.randn(sizes, dtype=dtype, device=device) - r_m = torch.randn(8, dtype=dtype, device=device) - r_v = torch.randn(8, dtype=dtype, device=device) + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + # mismatch rank with *note* different permutation recognized by PE + bias = torch.randn(3, dtype=dtype, device=device).unsqueeze(-1).unsqueeze(-1) - def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): - o = torch.add(x, y) - o = torch.nn.functional.softmax(o, dim=0) - o = torch.add(o, z) - o = torch.nn.functional.batch_norm(o, r_mean, r_var, training=True) + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + with nvfuser_singleton_fusion(True): + self._run_helper(t_jit, t, x, bias, check_stride=True) + + @unittest.skipIf(not 
RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_permutation_preservation_edge_case_1_broken(self): + sizes = [2, 3, 4, 5] + dtype = torch.float + device = "cuda" + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + # in-compatible permutation, this will cause format propagation to break + bias = torch.randn(4, 5, dtype=dtype, device=device) + + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + with nvfuser_singleton_fusion(True): + for _ in range(5): + jit_o = t_jit(x, bias) + + o = t(x, bias) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) + try: + # nvfuser does not support in-compatible permutation, this will throw + self.assertEqual(o.stride(), jit_o.stride()) + except Exception as e: + warnings.warn( + "permutation propagatoin is broken, proper support should come after nvfuser permutation scheduler update") + self.assertGraphContains(t_jit.graph_for(x, bias), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_permutation_preservation_edge_case_2(self): + sizes = [2, 3, 4, 5] + dtype = torch.float + device = "cuda" + x = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + y = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + z = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=torch.channels_last) + + def t(x, y, w): + tmp = torch.lerp(x, y, w) + tmp = torch.clamp(tmp, -1.0, 0.5) + tmp = torch.nn.functional.softplus(tmp) + return torch.threshold(tmp, -2.0, 0.5) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y, z, check_stride=True) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_normalization_partition(self): + sizes = [3, 8, 5] + dtype = torch.float + device = "cuda" + x = torch.randn(sizes, dtype=dtype, device=device) + y = torch.randn(sizes, dtype=dtype, device=device) + z = torch.randn(sizes, dtype=dtype, device=device) + r_m = torch.randn(8, dtype=dtype, device=device) + r_v = torch.randn(8, dtype=dtype, device=device) + + def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r_var: torch.Tensor): + o = torch.add(x, y) + o = torch.nn.functional.softmax(o, dim=0) + o = torch.add(o, z) + o = torch.nn.functional.batch_norm(o, r_mean, r_var, training=True) return o t_jit = torch.jit.script(t) jit_o = t_jit(x, y, z, r_m, r_v) @@ -1859,7 +2228,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor, r_mean: torch.Tensor, r self.assertGraphContains(t_jit.graph_for(x, y, z, r_m, r_v), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_sum_to_one(self): @@ -1880,7 +2249,7 @@ def t(x: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not 
RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_single_reduction_broadcast(self): @@ -1904,7 +2273,7 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_trivial_reduction(self): @@ -1925,7 +2294,7 @@ def t(x: torch.Tensor): self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_profiling_node(self): @@ -1940,8 +2309,28 @@ def repro(x: torch.Tensor, alpha: float): repro_jit = torch.jit.script(repro) self._run_helper(repro_jit, repro, x, 0.6) + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_rand_like(self): + dtype = torch.float + device = "cuda" + + def t(x: torch.Tensor, alpha: float): + o = torch.rand_like(x) + o = torch.add(o, alpha) + return o + + # disabling cache so new inputs would generate new graph + t.__disable_jit_function_caching__ = True + + for m_format in [torch.contiguous_format, torch.channels_last]: + x = torch.randn(4, 5, 6, 7, dtype=dtype, device=device).to(memory_format=m_format) + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, 0.6, check_stride=True) + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_reduction_sizes_op(self): @@ -1965,7 +2354,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x, y), FUSION_GUARD, 0) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_profile_ivalue(self): @@ -1988,7 +2377,28 @@ def t(x: torch.Tensor, y: torch.Tensor, dim: List[int], keepdim: bool): self.assertGraphContains(t_jit.graph_for(x, y, (0, 1), False), FUSION_GUARD) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_profile_ivalue_multiple_profiles(self): + dtype = torch.float + device = "cuda" + x = torch.randn([7, 4, 7], dtype=dtype, device=device) + + def t(x, num: int): + for i in range(num): + # varying reduction axes should break profile_ivalue + tmp = x.sum(i, keepdim=True) + # inplace add on input/output, can't be functionalized/fused + x += tmp + return x + 
+ with nvfuser_singleton_fusion(True): + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, 3, num_fusion=0) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_sum_to_size(self): @@ -2003,12 +2413,7 @@ def t(x: torch.Tensor, y: torch.Tensor, new_size: List[int]): return o t_jit = torch.jit.script(t) - jit_o = t_jit(x, y, (4, 1)) - jit_o = t_jit(x, y, (4, 1)) - o = t(x, y, (4, 1)) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x, y, (4, 1)), FUSION_GUARD) + self._run_helper(t_jit, t, x, y, (4, 1)) # update shape: old kernel should handle dynamic shape well without # recompilation @@ -2016,13 +2421,20 @@ def t(x: torch.Tensor, y: torch.Tensor, new_size: List[int]): y = torch.randn([2, 5, 8], dtype=dtype, device=device) # (TODO) check executed kernel, should extend autograd.profiler to fused # kernels - jit_o = t_jit(x, y, (5, 1)) - o = t(x, y, (5, 1)) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) + self._run_helper(t_jit, t, x, y, (5, 1)) + + with nvfuser_singleton_fusion(True): + x = torch.randn([2, 5, 8], dtype=dtype, device=device) + + def t(x: torch.Tensor): + # no-op reduction + return x.sum_to_size((2, 5, 8)) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_grad_sum_to_size(self): @@ -2081,7 +2493,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertEqual(x.grad, ref_x.grad) self.assertEqual(y.grad, ref_y.grad) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_inference_fusion(self): @@ -2098,7 +2510,7 @@ def t(x: torch.Tensor, p: float, train: bool): self._run_helper(t_jit, t, x, 0.15, False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_train_nograd_fusion(self): @@ -2115,7 +2527,7 @@ def t(x: torch.Tensor, p: float, train: bool): self._run_helper(t_jit, t, x, 0.0, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_train_nograd_prob_check(self): @@ -2146,55 +2558,60 @@ def t(x: torch.Tensor, p: float, train: bool): self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_training_fusion(self): dtype = torch.float device = "cuda" - x = torch.randn([10, 4, 8], 
dtype=dtype, device=device, requires_grad=True) - grads = torch.randn([10, 4, 8], dtype=dtype, device=device) + sizes = [2, 3, 4, 5] def t(x: torch.Tensor, p: float, train: bool): o = torch.nn.functional.dropout(x, p, training=train) o = o * 2.0 return o - t_jit = torch.jit.script(t) - - # The drop probability needs to be set to zero given that the order of picking random - # numbers between eager mode and the jit is different - self._run_training_helper(t_jit, t, grads, x, 0.0, True) - def t2(x: torch.Tensor, p: float, train: bool): o = torch.nn.functional.softmax(x, dim=-1) o = torch.nn.functional.dropout(o, p, training=train) return o - t2_jit = torch.jit.script(t2) + # disabling cache so new inputs would generate new graph + t.__disable_jit_function_caching__ = True + t2.__disable_jit_function_caching__ = True - # The drop probability needs to be set to zero given that the order of picking random - # numbers between eager mode and the jit is different - self._run_training_helper(t2_jit, t2, grads, x, 0.0, True) + for fn in [t, t2]: + for m_format in [torch.contiguous_format, torch.channels_last]: + fn_jit = torch.jit.script(fn) + x = torch.randn(sizes, dtype=dtype, device=device, requires_grad=True).to(memory_format=m_format) + grads = torch.randn(sizes, dtype=dtype, device=device).to(memory_format=m_format) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + # The drop probability needs to be set to zero given that the order of picking random + # numbers between eager mode and the jit is different + self._run_training_helper(fn_jit, fn, grads, x, 0.0, True) + + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_gelu(self): + old_guard = torch._C._jit_set_nvfuser_guard_mode(True) dtype = torch.float device = "cuda" x = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=True) grads = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=False) - def t(x: torch.Tensor): - o = torch.nn.functional.gelu(x) + def t(x: torch.Tensor, mode: str): + o = torch.nn.functional.gelu(x, approximate=mode) o = o * 2.0 return o t_jit = torch.jit.script(t) - self._run_training_helper(t_jit, t, grads, x) + self._run_training_helper(t_jit, t, grads, x, 'none') + self._run_training_helper(t_jit, t, grads, x, 'tanh') + torch._C._jit_set_nvfuser_guard_mode(old_guard) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_dropout_training_prob_check(self): @@ -2227,13 +2644,14 @@ def t(x: torch.Tensor, p: float, train: bool): self.assertTrue((percent_zeros >= (prob - 0.01)) and (percent_zeros <= (prob + 0.01))) self.assertGraphContainsExactly(t_jit.graph_for(x, prob, True), FUSION_GUARD, 1, consider_subgraphs=True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_linear(self): in_feature = 2 out_feature = 8 - x = torch.randn(4, in_feature, dtype=torch.float32, device='cuda') + # Changing the input dims to be 3-D to avoid eager mode bias fusion + # The bias fusion causes some precision issues with TF-32 weight = torch.randn(out_feature, in_feature, dtype=torch.float32, device='cuda') bias = torch.randn(out_feature, 
dtype=torch.float32, device='cuda') @@ -2242,17 +2660,55 @@ def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor): o = torch.relu(o) return o - # bias set to true. - t_jit = torch.jit.script(t) - jit_o = t_jit(x, weight, bias) - jit_o = t_jit(x, weight, bias) - o = t(x, weight, bias) - self.assertEqual(o, jit_o) - # since the output value is not used at all, the fusion operator should - # have been optimized away - self.assertGraphContainsExactly(t_jit.graph_for(x, weight, bias), FUSION_GUARD, 1) + # disabling cache so new inputs would generate new graph + t.__disable_jit_function_caching__ = True - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + sizes = [in_feature, ] + for i in range(4): + # increase input rank in each iteration + sizes.insert(0, i + 2) + x = torch.randn(*sizes, dtype=torch.float32, device='cuda') + t_jit = torch.jit.script(t) + # fusion only happens for input rank >= 4 + has_fusion = 0 if len(sizes) < 4 else 1 + self._run_helper(t_jit, t, x, weight, bias, check_stride=True, num_fusion=has_fusion) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_linear_symbolic_shapes(self): + def fn(x: int): + y = torch.zeros((3, 4, x, x + 2)).cuda() + for i in range(2): + inp = torch.rand((3, 4, x, x + i)).cuda() + weight = torch.rand((x + 2, x + i)).cuda() + bias = torch.rand((x, x + 2)).cuda() + y += torch.sin(torch.nn.functional.linear(inp, weight, bias)) + return y + + fn_s = torch.jit.script(fn) + fn_s(5) + fn_s(5) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_conv2d_symbolic_shapes(self): + def fn(x: int): + responses = [] + for i in range(2): + inp = torch.rand((3, 3, 32, 32)).cuda() + weight = torch.rand((x + i, 3, 7, 7)).cuda() + bias = torch.rand((x + i)).cuda() + res = torch.nn.functional.conv2d(inp, weight, bias, padding=3) + responses.append(res) + return responses + + fn_s = torch.jit.script(fn) + fn_s(5) + fn_s(5) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_backward_type(self): @@ -2295,7 +2751,7 @@ def test1(x: torch.Tensor, y: torch.Tensor): self.assertEqual(y.grad.dtype, y.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_autocast_1(self): @@ -2303,7 +2759,7 @@ def t(x: torch.Tensor, y: torch.Tensor): o = x * 2.0 o = torch.softmax(o, dim=-1) o = o * 3.0 - o = torch.matmul(o, y) + o = torch._C._nn.linear(o, y) return o x = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=True) @@ -2314,7 +2770,7 @@ def t(x: torch.Tensor, y: torch.Tensor): for i in range(3): with torch.cuda.amp.autocast(): jit_o = t_jit(x, y) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x, y) jit_o.backward(grad) @@ -2332,7 +2788,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertEqual(y.grad.dtype, y.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") 
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_autocast_2(self): @@ -2349,9 +2805,9 @@ def t(x: torch.Tensor): t_jit = torch.jit.script(t) for i in range(3): - with torch.cuda.amp.autocast() : + with torch.cuda.amp.autocast(): jit_o = t_jit(x) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x) jit_o.backward(grad) @@ -2368,7 +2824,7 @@ def t(x: torch.Tensor): self.assertEqual(x.grad.dtype, x.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2377,7 +2833,7 @@ def t(x: torch.Tensor, y: torch.Tensor): o = x * 2.0 o = torch.softmax(o, dim=-1) o = o * 3.0 - o = torch.matmul(o, y) + o = torch._C._nn.linear(o, y) return o x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=True) @@ -2388,7 +2844,7 @@ def t(x: torch.Tensor, y: torch.Tensor): for i in range(3): with torch.cuda.amp.autocast(dtype=torch.bfloat16): jit_o = t_jit(x, y) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x, y) jit_o.backward(grad) @@ -2406,7 +2862,7 @@ def t(x: torch.Tensor, y: torch.Tensor): self.assertEqual(y.grad.dtype, y.dtype) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2424,9 +2880,9 @@ def t(x: torch.Tensor): t_jit = torch.jit.script(t) for i in range(3): - with torch.cuda.amp.autocast(dtype=torch.bfloat16) : + with torch.cuda.amp.autocast(dtype=torch.bfloat16): jit_o = t_jit(x) - if i == 2 : + if i == 2: fwd_graph = t_jit.graph_for(x) jit_o.backward(grad) @@ -2442,7 +2898,7 @@ def t(x: torch.Tensor): self.assertEqual(jit_o.dtype, torch.float) self.assertEqual(x.grad.dtype, x.dtype) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_to_dtype_fp32_to_fp16(self): @@ -2461,7 +2917,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.half) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_to_dtype_fp16_to_fp32(self): @@ -2480,7 +2936,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.float) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_to_dtype_fp16_to_fp16(self): @@ -2499,7 +2955,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.half) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not 
RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2519,7 +2975,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.bfloat16) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2539,7 +2995,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.float) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @unittest.skipIf(not TEST_BF16, "device does not support BFloat16") @@ -2559,7 +3015,7 @@ def t(x: torch.Tensor): self.assertGraphContainsExactly(t_jit.graph_for(x), FUSION_GUARD, 1) self.assertEqual(jit_o.dtype, torch.bfloat16) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(not TEST_MULTIGPU, "requires multiple CUDA device") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -2581,7 +3037,7 @@ def t(x): x = x.to("cuda:1") jit_o = t_jit(x) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_graph_for_with_missing_optimized_engine(self): @@ -2608,7 +3064,7 @@ def t(x: torch.Tensor, flag: bool): # have been optimized away self.assertGraphContainsExactly(t_jit.graph_for(x, True), FUSION_GUARD, 1, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_branches(self): @@ -2638,7 +3094,7 @@ def t(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, flag: bool): # have been optimized away self.assertGraphContainsExactly(t_jit.graph_for(x, weight, bias, True), FUSION_GUARD, 1) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_scalar_tensor(self): @@ -2661,7 +3117,7 @@ def t(x: torch.Tensor): @unittest.skipIf(os.environ.get('PYTORCH_NO_CUDA_MEMORY_CACHING') is not None, "skipping graph_rng when caching allocator is disabled") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(CUDA_MAJOR < 11, "requires CUDA11 or above") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -2728,8 +3184,8 @@ def __init__(self, num_features=10, affine=True, track_running_stats=True): track_running_stats=track_running_stats).to(dtype=dtype) def forward(self, x): - o = x * 2.0 - o = self.bn(o) + o = self.bn(x) + o = o * 2.0 return o x = torch.randn(batch, c, hw, hw, dtype=torch.float, 
device="cuda").to(dtype=dtype).requires_grad_() @@ -2818,7 +3274,7 @@ def forward(self, x): e0)) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_batch_norm_half(self): @@ -2833,7 +3289,25 @@ def test_batch_norm_half(self): self._test_batch_norm_impl_index_helper(4, 8, 5, affine, track_running_stats, training, torch.half) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_batch_norm_impl_index_inner_bcast(self): + # the repro + self._test_batch_norm_impl_index_helper(2, 1, 1, False, True, True) + + # running the full set + setups = [ + [True, True], + [False, False], + [True, False], + [False, True]] + for training_and_track, affine in itertools.product(setups, [True, False]): + training, track_running_stats = training_and_track + self._test_batch_norm_impl_index_helper(2, 1, 1, affine, track_running_stats, training) + + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_batch_norm_impl_index_correctness(self): @@ -2857,7 +3331,7 @@ def test_batch_norm_impl_index_correctness(self): training, track_running_stats = training_and_track self._test_batch_norm_impl_index_helper(b, c, hw, affine, track_running_stats, training) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_softplus_fuser(self): @@ -2883,7 +3357,7 @@ def shifted_softplus(x: torch.Tensor, shift: float): assert torch.allclose(jit_grad, aten_grad) self.assertGraphContains(jitted.graph_for(inp, 0.693147), FUSION_GROUP, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_inplace_removal(self): @@ -2903,7 +3377,7 @@ def t(x: torch.Tensor): self.assertGraphContains(graph, 'aten::add', True) self.assertGraphContains(graph, 'aten::relu', True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_conv2d_bias(self): @@ -2936,7 +3410,8 @@ def t_not_fused(x: torch.Tensor, w: torch.Tensor): self.assertGraphContains(graph, 'aten::relu', True) def t_bias(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor): - return torch.nn.functional.conv2d(x, w, bias) + o = torch.nn.functional.conv2d(x, w, bias) + return o.relu() jitted_bias = torch.jit.script(t_bias) @@ -2944,11 +3419,11 @@ def t_bias(x: torch.Tensor, w: torch.Tensor, bias: torch.Tensor): jit_o = jitted_bias(inp, weight, bias) graph = jitted_bias.graph_for(inp) - self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) + self.assertGraphContains(graph, FUSION_GROUP, 
True) self.assertGraphContains(graph, 'prim::add_optional', True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_remove_output_used_only_in_dtype(self): @@ -2981,7 +3456,7 @@ def forward(self, x, y): self.assertGraphContains(graph, FUSION_GROUP, True) @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_fix_shape_expression_bn(self): @@ -3013,31 +3488,6 @@ def forward(self, x, y): graph = jitted.graph_for(x, y) self.assertGraphContains(graph, FUSION_GROUP, True) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") - @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, - "Requires fusion optimization pass to be effective") - def test_linear_1d_weight_mismatch_bias_dtype(self): - def t(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor): - o = torch.nn.functional.linear(x, w, b) - return o.relu() - - device = "cuda" - jitted = torch.jit.script(t) - x = torch.randn(2, 5, 5, dtype=torch.half, device=device) - w = torch.randn(5, dtype=torch.half, device=device) - b = torch.randn(5, dtype=torch.float32, device=device) - - for i in range(3): - jit_o = jitted(x, w, b) - jit_o = jitted(x, w, b) - o = t(x, w, b) - self.assertEqual(o, jit_o) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o.size(), jit_o.size()) - graph = jitted.graph_for(x, w, b) - self.assertGraphContains(graph, FUSION_GROUP, True) - self.assertGraphContains(graph, 'aten::matmul', True) - def _run_fwd_helper(self, func, ops, *args): jitted = torch.jit.script(func) for i in range(3): @@ -3052,7 +3502,8 @@ def _run_fwd_helper(self, func, ops, *args): for op in ops: self.assertGraphContainsExactly(graph, op, 0) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_sibling_fusion(self): @@ -3065,7 +3516,7 @@ def t(x: torch.Tensor): o1 = x + 1.0 o2 = x * 0.5 return o1, o2 - self._run_fwd_helper(t, ['aten::add'], x) + self._run_fwd_helper(t, ['aten::add', 'aten::mul'], x) def t2(x: torch.Tensor, y: torch.Tensor): o1 = x.sum(0) @@ -3073,8 +3524,7 @@ def t2(x: torch.Tensor, y: torch.Tensor): return o1, o2 self._run_fwd_helper(t2, ['aten::sum', 'aten::mul'], x, y) - @unittest.skipIf(True, "Fixed in PR #68804") - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_clean_profile_ivalue(self): @@ -3090,13 +3540,13 @@ def t(x: torch.Tensor, flag: bool): return torch.dropout(x, 0.5, flag) jit_t = torch.jit.script(t) - for idx in range(5) : + for idx in range(5): out = jit_t(x, True) graph = jit_t.graph_for(x, True) out = jit_t(x, False) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion 
optimization pass to be effective") def test_sibling_fusion_no_scalar_inputs(self): @@ -3117,79 +3567,1248 @@ def t(x: torch.Tensor, y: torch.Tensor): graph = jitted.graph_for(x, y) self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") - @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, - "Requires fusion optimization pass to be effective") - def test_singleton_fusion(self): - x = torch.randn(4, 2, device="cuda") + def _bias_view_relu_helper(self, shape, output_shape, dtype, device, error): + class BiasViewRelu(torch.nn.Module): + def __init__(self): + super(BiasViewRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs: torch.Tensor, view_shape: List[int]): + o = inputs + self.bias + o = o.view(view_shape) + return torch.relu(o) + + t = BiasViewRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) - with nvfuser_singleton_fusion(True): - def t(x): - return x.relu() + # profiling + jit_o = t_jit(x, output_shape) + # optimization + jit_o = t_jit(x, output_shape) + # final + jit_o = t_jit(x, output_shape) + # eager - baseline + o = t(x, output_shape) - t_jit = torch.jit.script(t) - self._run_helper(t_jit, t, x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, output_shape) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + has_inferred_dimension = any([dim == -1 for dim in output_shape]) + if has_inferred_dimension: + # prohibit fusing when view_shape contains an inferred dimension + self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) + self.assertGraphContainsExactly(graph, 'prim::view_copy', 0) + else: + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::view_copy', True) + + def _alias_bias_view_relu_helper(self, shape, output_shape, dtype, device, error): + class BiasViewRelu(torch.nn.Module): + def __init__(self): + super(BiasViewRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor, view_shape : List[int]): + o = inputs.view(view_shape) + inputs.add_(bias) + return torch.relu(o) + + t = BiasViewRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + # profiling + jit_o = t_jit(x.clone(), bias, output_shape) + # optimization + jit_o = t_jit(x.clone(), bias, output_shape) + # final + jit_o = t_jit(x.clone(), bias, output_shape) + # eager - baseline + o = t(x.clone(), bias, output_shape) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias, output_shape) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::view_copy', 0) + + # generate random view given original view + def _random_view(self, original_view, max_len=8, max_views=10000): + class Moves(enum.Enum): + Merge = 0 + Split = 1 + Broadcast = 2 + ImplicitBroadcast = 3 + Keep = 4 + + def valid(old_view, new_view): + old_view_size = reduce(operator.mul, old_view) + 
new_view_size = reduce(operator.mul, new_view) + return old_view_size == new_view_size + + # given a random starting number, find the nearest divisor + def find_nearest_divisor(N): + if 2 >= (N - 1): + return -1 + result = random.randint(2, N - 1) + while (N % result) != 0: + result += 1 + return result + + complete_views = set([tuple(original_view)]) + + to_visit = [] + # empty new view, current original view, start pos=0, move count = 0, last_move + to_visit.append(([], original_view, 0, [], Moves.Keep)) + + # depth-first search of view shapes, starting from the original view + while len(to_visit) > 0 and len(complete_views) < max_views: + new_view, old_view, odx, move_list, last_move = to_visit[-1] + to_visit.pop() + + # iterate over each move type + for idx in range(len(Moves)): + state = Moves(idx) + new_view_clone = copy.deepcopy(new_view) + old_view_clone = copy.deepcopy(old_view) + new_move_list = move_list + [state] + new_odx = odx + + # Update state using Move state + if state == Moves.Keep: + new_size = old_view_clone[odx] + new_view_clone.append(new_size) + new_odx += 1 + + elif state == Moves.Merge: + if odx + 1 < len(old_view_clone): + new_size = old_view_clone[odx] * old_view_clone[odx + 1] + new_view_clone.append(new_size) + new_odx += 2 + else: + continue + + elif state == Moves.Broadcast and last_move != Moves.Broadcast: + new_view_clone.append(1) + + elif state == Moves.Split: + new_size = find_nearest_divisor(old_view_clone[odx]) + if new_size == -1: + continue + new_view_clone.append(new_size) + old_view_clone[odx] = int(old_view[odx] / new_size) + + if old_view_clone[odx] == 1: + new_odx += 1 + + elif state == Moves.ImplicitBroadcast: + old_view_clone.insert(odx + 1, 1) + new_size = old_view[odx] * 1 + new_view_clone.append(new_size) + new_odx += 2 + + if new_odx < len(old_view_clone) and len(new_move_list) < max_len: + to_visit.append((new_view_clone, old_view_clone, new_odx, new_move_list, state)) + elif (valid(original_view, new_view_clone)): + final_new_view = tuple(new_view_clone) + complete_views.add(final_new_view) + return list(complete_views) + + # ndims - number of dimensions + # test_fn - view test function + def _view_test_generator(self, ndims, test_fn): + # create random tensor + # max value for each dimension + max_size = 10e7 + max_value = max(int(pow(max_size, 1. 
/ ndims)), 1) + sizes = [random.randint(1, max_value) for idx in range(ndims)] + x = torch.randn(sizes) + + original_sizes = list(x.size()) + all_views = self._random_view(original_sizes) + random.shuffle(all_views) + + max_samples = 20 + max_views = min(len(all_views), max_samples) + total = 0 + correct = 0 + # test random combinations of compatible views + for idx in range(max_views): + for jdx in range(idx + 1, max_views): + total += 1 + test_fn(all_views[idx], all_views[jdx], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") - def test_disable_sibling_fuse(self): - x = torch.randn(4, 2, device="cuda") - y = torch.randn(8, device="cuda") - s = torch.tensor(1.5, device="cuda") + def test_view(self): + torch._C._jit_set_nvfuser_guard_mode(True) + self._bias_view_relu_helper([2, 3, 4, 5], [-1, 4, 5], torch.float, 'cuda', 1e-6) + for ndims in range(1, 5): + self._view_test_generator(ndims, self._bias_view_relu_helper) + self._alias_bias_view_relu_helper([2, 3, 4, 5], [1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) - with nvfuser_horizontal_fusion(False): - def t(x, y, s): - o1 = x + s - o2 = y + s - return o1, o2 + def _bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error): + class BiasFlattenRelu(torch.nn.Module): + def __init__(self): + super(BiasFlattenRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, start_dim : int, end_dim : int): + o = inputs + self.bias + o = o.flatten(start_dim, end_dim) + return torch.relu(o) + + t = BiasFlattenRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) - t_jit = torch.jit.script(t) - for i in range(5): - t_jit(x, y, s) + self._run_helper(t_jit, t, x, start_dim, end_dim) + self.assertGraphContains(t_jit.graph_for(x, start_dim, end_dim), 'prim::flatten_copy', True) - # sibling fusion should be disabled with the flag - self.assertGraphContainsExactly(t_jit.graph_for(x, y, s), FUSION_GUARD, 0) + def _alias_bias_flatten_relu_helper(self, shape, start_dim, end_dim, dtype, device, error): + class BiasFlattenRelu(torch.nn.Module): + def __init__(self): + super(BiasFlattenRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor, start_dim : int, end_dim : int): + o = inputs.flatten(start_dim, end_dim) + inputs.add_(bias) + return torch.relu(o) + + t = BiasFlattenRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) -class TestPassManagerCudaFuser(JitTestCase): + # profiling + jit_o = t_jit(x.clone(), bias, start_dim, end_dim) + # optimization + jit_o = t_jit(x.clone(), bias, start_dim, end_dim) + # final + jit_o = t_jit(x.clone(), bias, start_dim, end_dim) + # eager - baseline + o = t(x.clone(), bias, start_dim, end_dim) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 
error)) + graph = t_jit.graph_for(x, bias, start_dim, end_dim) + + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::flatten_copy', 0) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since flatten is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") - def test_context_manager_test(self): - x = torch.randn(4, 8, dtype=torch.float, device="cuda") - y = torch.randn(4, 8, dtype=torch.float, device="cuda") - with torch.jit.fuser('fuser2'): - with torch.jit.fuser('fuser2'): + def test_flatten(self): + torch._C._jit_set_nvfuser_guard_mode(True) + self._bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6) + self._bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, -1, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, -1, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, -1, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 0, 3, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 1, 2, torch.float, 'cuda', 1e-6) + self._alias_bias_flatten_relu_helper([2, 3, 4, 5], 2, 2, torch.float, 'cuda', 1e-6) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_strict_fusion(self): + def success(x): + with torch.jit.strict_fusion(): + return x + x + x + + scripted = self.checkScript(success, (torch.rand([4], device='cuda'),)) + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_not("aten::add").check("prim::CudaFusionGroup").run(g) + + def failure(x): + with torch.jit.strict_fusion(): + return x + torch.mm(x, x) + x + + with self.assertRaises(Exception) as error_out: + foo_s = torch.jit.script(failure) + foo_s(torch.rand([4, 4])) + foo_s(torch.rand([4, 4])) + + fc = FileCheck().check("Found unfused operators") + fc.check("aten::mm").run(str(error_out.exception)) + + def _ltc_helper(self, shape, dtype, device, error, approximate=True): + # modeled after LTC linear layer + class LTC(torch.nn.Module): + def __init__(self): + super(LTC, self).__init__() + self.weight = torch.nn.Parameter(torch.randn([1024, 1024], dtype=dtype, device=device), requires_grad=False) + self.bias = torch.nn.Parameter(torch.randn([1, 1024], dtype=dtype, device=device), requires_grad=False) + + def forward(self, inputs : torch.Tensor): + o = inputs.view([32768, 1024]) + o = torch.mm(o, self.weight) + o = o.view([256, 128, 1024]) + o = o + self.bias + o = o.view([32768, 1024]) + o = o.view([256, 128, 1024]) + return torch.nn.functional.gelu(o) + + t = LTC() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) - def t1(x, y): - o = x + y - o = o + 2.0 - return o - t_jit = torch.jit.script(t1) - t_jit(x, y) - t_jit(x, y) - self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + # profile/optimization runs + for i 
in range(3): + jit_o = t_jit(x) + o = t(x) - def t2(x, y): - o = x + y - o = o + 3.0 - return o - t_jit_2 = torch.jit.script(t2) - t_jit_2(x, y) - t_jit_2(x, y) - self.assertGraphContains(t_jit_2.graph_for(x, y), FUSION_GUARD) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::view_copy', True) - def t3(x, y): - o = x + y - o = o + 4.0 - return o - t_jit_3 = torch.jit.script(t3) - t_jit_3(x, y) - t_jit_3(x, y) + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_nested_view(self): + self._ltc_helper([256, 128, 1024], torch.float, 'cuda', 1e-6) + + def _bias_squeeze_relu_helper(self, shape, dtype, device, error): + class BiasSqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasSqueezeRelu, self).__init__() + + def forward(self, inputs: torch.Tensor, bias: torch.Tensor): + o = inputs + bias + o = torch.squeeze(o) + return torch.relu(o) + + t = BiasSqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::squeeze_copy', True) + + def _alias_bias_squeeze_relu_helper(self, shape, dtype, device, error): + class BiasSqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasSqueezeRelu, self).__init__() + + def forward(self, inputs: torch.Tensor, bias: torch.Tensor): + o = torch.squeeze(inputs) + inputs.add_(bias) + return torch.relu(o) + + t = BiasSqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + o = t(x.clone(), bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::squeeze_copy', 0) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_squeeze(self): + self._bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) + self._alias_bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + # remove this after opinfo tests are enabled + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def 
test_squeeze_zero(self): + x = torch.tensor(1.0, dtype=torch.float, device="cuda") + + def squeeze_0(x: torch.Tensor): + o = x + 1. + o = torch.squeeze(o, 0) + o = o * 2. + return o + + def squeeze_1(x: torch.Tensor): + o = x + 1. + o = torch.squeeze(o, -1) + o = o + .5 + return o + + squeeze_0_jit = torch.jit.script(squeeze_0) + self._run_helper(squeeze_0_jit, squeeze_0, x) + squeeze_1_jit = torch.jit.script(squeeze_1) + self._run_helper(squeeze_1_jit, squeeze_1, x) + + def _bias_unsqueeze_relu_helper(self, shape, dtype, device, error): + class BiasUnsqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasUnsqueezeRelu, self).__init__() + + def forward(self, inputs: torch.Tensor, bias: torch.Tensor): + o = inputs + bias + o = torch.unsqueeze(o, 0) + return torch.relu(o) + + t = BiasUnsqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::unsqueeze_copy', True) + + def _alias_bias_unsqueeze_relu_helper(self, shape, dtype, device, error): + class BiasUnsqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasUnsqueezeRelu, self).__init__() + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor): + o = torch.unsqueeze(inputs, 0) + inputs.add_(bias) + return torch.relu(o) + + t = BiasUnsqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + jit_o = t_jit(x.clone(), bias) + o = t(x.clone(), bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::unsqueeze_copy', 0) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_unsqueeze(self): + self._bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6) + self._alias_bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_alias_pass_fix(self): + x = torch.randn(4, 24, 2, 2, dtype=torch.float, device="cuda") + w = torch.randn(24, 24, 1, 1, dtype=torch.float, device="cuda") + b = torch.randn(24, dtype=torch.float, device="cuda") + + def t(x, w, b): + b2 = b + 1.0 + o = torch.conv2d(x, w, b2) + return o + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, w, b) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") 
+ @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_squeeze_negative_dim(self): + x = torch.randn(4, 24, 1, 2, dtype=torch.float, device="cuda") + + def t(x): + o = x + 1.0 + o = o.squeeze(-2) + o = o * 2.0 + return o + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_singleton_fusion(self): + x = torch.randn(4, 2, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x.relu() + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_issue1445_fusion(self): + def f(t0, t1, t2, t3): + masked_input = torch.where(t1, t2, t3) + total = masked_input.sum([0, 1, 2, 3]) + sizes : List[int] = [] + t10 = torch.reshape(t0, sizes) + t7 = total / t10 + t4 = t7.to(dtype=torch.float) + return t4 + + x = torch.randn(1, 1, 1, 1, device='cuda').to(dtype=torch.long) + y = torch.randn(3, 2, 1, 1, device='cuda').to(dtype=torch.bool).expand([3, 2, 1, 2]) + z = torch.randn(3, 2, 1, 2, device='cuda') + w = torch.tensor(1.5, device='cuda') + + f_jit = torch.jit.script(f) + for i in range(5): + out_jit = f_jit(x, y, z, w) + out = f(x, y, z, w) + self.assertEqual(out, out_jit) + self.assertGraphContainsExactly(f_jit.graph_for(x, y, z, w), FUSION_GROUP, 1) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_disable_sibling_fuse(self): + x = torch.randn(4, 2, device="cuda") + y = torch.randn(8, device="cuda") + s = torch.tensor(1.5, device="cuda") + + with nvfuser_horizontal_fusion(False): + def t(x, y, s): + o1 = x + s + o2 = y + s + return o1, o2 + + t_jit = torch.jit.script(t) + for i in range(5): + t_jit(x, y, s) + + # sibling fusion should be disabled with the flag + self.assertGraphContainsExactly(t_jit.graph_for(x, y, s), FUSION_GUARD, 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_build_shape_expression_native_dropout(self): + x = torch.randn(4, 2, device="cuda") + + def t(x): + o, mask = torch.native_dropout(x, 0.0, True) + o1 = o.sigmoid() + o2 = mask.float().sigmoid() + return (o1, o2) + + t_jit = torch.jit.script(t) + + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_scalar_tensor_permuted(self): + x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0]) + y = torch.tensor(1.0, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def 
test_cpu_scalar(self): + x = torch.randn(4, 2, 3, device="cuda") + y = torch.tensor(1.0, device="cpu") + z = torch.tensor(2.0, device="cpu") + + with nvfuser_singleton_fusion(True): + # testing cpu scalar tensor promotion + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + # scalar cpu tensor add should NOT be fused + @torch.jit.script + def t1(y, z): + return y * z + for _ in range(5): + t1(y, z) + self.assertGraphContainsExactly(t1.graph_for(y, z), FUSION_GUARD, 0) + + # everything, including scalar cpu tensor add should be fused + @torch.jit.script + def t2(x, y, z): + tmp = y + z + return tmp + x + for _ in range(5): + t2(x, y, z) + self.assertGraphContainsExactly(t2.graph_for(x, y, z), 'aten::add', 0) + self.assertGraphContainsExactly(t2.graph_for(x, y, z), FUSION_GUARD, 1) + + # 'cpu_tmp = y + z' shouldn't be fused. + @torch.jit.script + def t3(x, y, z): + cpu_tmp = y + z + out = x + y + return cpu_tmp, out + for _ in range(5): + t3(x, y, z) + self.assertGraphContainsExactly(t3.graph_for(x, y, z), FUSION_GUARD, 1) + self.assertGraphContainsExactly(t3.graph_for(x, y, z), 'aten::add', 1) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since squeeze/unsqueeze is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_shape_expression(self): + x = torch.randn(4, 2, 1, 3, device="cuda") + + def t_unsqueeze(x): + t0 = x.relu() + t1 = t0.unsqueeze(1) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def t_squeeze(x): + t0 = x.relu() + t1 = t0.squeeze() + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def t_squeeze_dim(x): + t0 = x.relu() + t1 = t0.squeeze(-2) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + # squeezing a non-size 1 dimension should be a no op + def t_squeeze_dim_no_op(x): + t0 = x.relu() + t1 = t0.squeeze(1) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def run(fn): + jit_fn = torch.jit.script(fn) + jit_o = jit_fn(x) + jit_o = jit_fn(x) + jit_o = jit_fn(x) + o = fn(x) + # output 0 is a tensor, so we check dtype and value + self.assertEqual(o[0].dtype, jit_o[0].dtype) + self.assertEqual(o[0], jit_o[0]) + # output 1 is shape + self.assertEqual(o[1], jit_o[1]) + self.assertGraphContainsExactly(jit_fn.graph_for(x), FUSION_GUARD, 1) + + for t in [t_unsqueeze, t_squeeze, t_squeeze_dim, t_squeeze_dim_no_op]: + run(t) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_scalar_cuda_tensor(self): + x = torch.tensor(2.0, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x + 1.0 + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @torch.jit.script + def t_jitted(x): + return x.sum(0) + + for i in range(5): + t_jitted(x) + self.assertGraphContainsExactly(t_jitted.graph_for(x), FUSION_GUARD, 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_overlapped_input(self): + x = torch.randn(8, device="cuda").as_strided((2, 4), (1, 1)) + + with nvfuser_singleton_fusion(True): + def t(x): + return x + 1.0 + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != 
ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_reduction_empty_axes(self): + x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0]) + + with nvfuser_singleton_fusion(True): + def t(x): + sizes : List[int] = [] + return x.sum(sizes) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_int_tensor_input(self): + x = torch.randn(4, 2, device="cuda").to(dtype=torch.int) + + with nvfuser_singleton_fusion(True): + def t(x): + return x.amax(dim=0) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_to_boolean(self): + x = torch.randn(4, 2, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x.to(dtype=torch.bool) + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since reshape is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_view_copy_graph_guard(self): + x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0]) + y = [4, 6] + + with nvfuser_singleton_fusion(True): + def t(x, y : List[int]): + t1 = x + 1.0 + t2 = t1 * 1.0 + out = t2.reshape(y) + return out.relu() + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since view is disabled now") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_view_copy_graph_guard_double_fusion(self): + x = torch.randn(2, 2, 5, device="cuda") + w = torch.randn(5, 5, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x, w): + o = x.view([4, x.size()[-1]]) + o = torch.matmul(o, w) + o = o.view([2, 2, o.size()[1]]) + return o + + t_jit = torch.jit.script(t) + for i in range(3): + jit_o = t_jit(x, w) + o = t(x, w) + self.assertEqual(jit_o, o) + self.assertGraphContainsExactly(t_jit.graph_for(x, w), FUSION_GUARD, 2, consider_subgraphs=True) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_input_output_passthrough(self): + def t(t0, t1, t2): + mask = t1.to(dtype=torch.bool) + masked_input = torch.where(t0, mask, t2) + return masked_input, mask + + t_jit = torch.jit.script(t) + # stick to integers, this avoid the numerical difference due to our + # promotion + x = torch.randn(4, 4, device='cuda').to(dtype=torch.bool) + y = torch.randn(4, 4, device='cuda').to(dtype=torch.bool) + z = torch.tensor(1.0, device='cuda').to(dtype=torch.bool) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) + o = t(x, y, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + + 
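    # A minimal sketch of the warm-up-then-check pattern the tests in this class share,
    # assuming a CUDA build with NVFuser enabled and FUSION_GUARD being the guard node
    # name used throughout this file; the helper name below is illustrative only. The
    # profiling executor needs a few invocations to record shapes and run the fusion
    # pass before graph_for() returns the optimized graph that the assertions inspect.
    def _example_warmup_then_check(self):
        def fn(x):
            return (x + 1.0).relu()

        fn_jit = torch.jit.script(fn)
        x = torch.randn(4, 2, device="cuda")
        for _ in range(5):
            fn_jit(x)  # warm up so the profiling executor specializes and fuses
        # after warm-up the optimized graph should contain the fusion guard node
        self.assertGraphContains(fn_jit.graph_for(x), FUSION_GUARD)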
@unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_pointwise_reference_tensor(self): + def t(input1, input2, scalar): + _unsafe_view = torch.ops.aten._unsafe_view(input1, [2, 4, 16]) + add_ = torch.ops.aten.add_(_unsafe_view, input2) + gelu_ = torch.ops.aten.gelu(add_) + view_ = torch.ops.aten.view(gelu_, [8, 16]) + mul_ = torch.ops.aten.mul(add_, scalar) + return [view_, mul_] + + x = torch.randn(8, 16, device="cuda") + bias = torch.randn(16, device="cuda") + scalar = torch.ones(torch.Size([]), device="cuda") + + t_jit = torch.jit.script(t) + for i in range(3): + jit_o = t_jit(x, bias, scalar) + o = t(x, bias, scalar) + self.assertEqual(jit_o, o) + self.assertGraphContains(t_jit.graph_for(x, bias, scalar), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_native_batch_norm_backward(self): + grad_output = torch.randn(4, 2, 3, device="cuda") + input = torch.randn(4, 2, 3, device="cuda") + weight = torch.randn(2, device="cuda") + + r_m = torch.randn(2, device="cuda") + r_v = torch.randn(2, device="cuda").abs() + + save_mean = torch.randn(2, device="cuda") + save_invstd = torch.randn(2, device="cuda").abs() + + with nvfuser_singleton_fusion(True): + def t(grad_out, input, weight, r_m, r_v, save_mean, save_invstd, train: bool, eps: float, mask: List[bool]): + return torch.ops.aten.native_batch_norm_backward(grad_out, input, weight, r_m, r_v, save_mean, + save_invstd, train, eps, mask) + + t_jit = torch.jit.script(t) + for i in range(4): + jit_o = t_jit(grad_output, input, weight, r_m.clone(), r_v.clone(), + save_mean, save_invstd, True, 1e-5, [True, True, True]) + + ref_m = r_m.clone() + ref_v = r_v.clone() + jit_o = t_jit(grad_output, input, weight, r_m, r_v, save_mean, save_invstd, True, 1e-5, [True, True, True]) + o = t(grad_output, input, weight, ref_m, ref_v, save_mean, save_invstd, True, 1e-5, [True, True, True]) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertEqual(ref_m.dtype, r_m.dtype) + self.assertEqual(ref_m, r_m) + self.assertEqual(ref_v.dtype, r_v.dtype) + self.assertEqual(ref_v, r_v) + self.assertGraphContains(t_jit.graph_for(grad_output, input, weight, r_m.clone(), r_v.clone, save_mean, + save_invstd, True, 1e-5, [True, True, True]), FUSION_GUARD) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_contiguous_on_broadcasted(self): + x = torch.randn(4, 1, device="cuda") + y = torch.randn(4, 128, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x, y): + t1 = x.expand([4, 128]) + t2 = t1 * y + return t2 + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_skip_parser(self): + x = torch.randn(4, 12, device="cuda") + + with nvfuser_singleton_fusion(True): + def fn(x): + t1 = x + 1.0 + return t1.relu() + + fn_jit = torch.jit.script(fn) + self._run_helper(fn_jit, fn, x) + + # add node should have been merged into fusion + 
self.assertGraphContains(fn_jit.graph_for(x), FUSION_GUARD) + self.assertGraphContainsExactly(fn_jit.graph_for(x), 'aten::add', 0) + + # flips skip parse for `aten::add`, following fusion should skip the + # add node + self.assertFalse(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True)) + + def fn_1(x): + t1 = x + 2.0 # change const value so we'll not reuse plan + return t1.relu() + + fn_1_jit = torch.jit.script(fn_1) + self._run_helper(fn_1_jit, fn_1, x) + + # add node should have been merged into fusion + self.assertGraphContains(fn_1_jit.graph_for(x), FUSION_GUARD) + self.assertGraphContainsExactly(fn_1_jit.graph_for(x), 'aten::add', 1) + + # flips skip parse for `aten::add`, next fusion should fuse add node + self.assertTrue(torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True)) + + def fn_2(x): + t1 = x + 2.0 # change const value so we'll not reuse plan + return t1.relu() + + fn_2_jit = torch.jit.script(fn_2) + self._run_helper(fn_2_jit, fn_2, x) + + # add node should have been merged into fusion + self.assertGraphContains(fn_2_jit.graph_for(x), FUSION_GUARD) + self.assertGraphContainsExactly(fn_2_jit.graph_for(x), 'aten::add', 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_cuda_fusion_guard(self): + old_guard = torch._C._jit_set_nvfuser_guard_mode(True) + + class ConvModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.sin().sigmoid() + + mod = ConvModule().to(device="cuda") + + inputs = [torch.randn(20, 16, 50, 100, device="cuda", requires_grad=True)] + + def reduce_scalar(temp): + return temp.sum() + + scripted = torch.jit.script(mod) + with torch.no_grad(): + scripted(*inputs) + res = scripted(*inputs) + reduce_scalar(res).backward() + torch._C._jit_set_nvfuser_guard_mode(old_guard) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_nvfuser_comparison_callbacks_with_fallback(self): + try: + fused_result = None + unfused_result = None + graph_ir = None + + def callback(fused_outputs, unfused_outputs, graph_str): + nonlocal unfused_result + nonlocal fused_result + nonlocal graph_ir + unfused_result = unfused_outputs[-1] + fused_result = fused_outputs[-1] + graph_ir = graph_str + torch._C._jit_nvfuser_set_comparison_callback(True, callback) + + def fn(x, y): + z = torch.add(x, y) + return torch.relu(z) + + x = torch.rand((4, 4)).cuda() - 0.5 + y = torch.rand((4, 4)).cuda() - 0.5 + + fn_s = torch.jit.script(fn) + fn_s(x, y) + fn_s(x, y) + fn_s(x, y) + + expected = fn(x, y) + + self.assertEqual(expected, fused_result) + self.assertEqual(expected, unfused_result) + FileCheck().check("aten::add").run(graph_ir) + finally: + torch._C._jit_nvfuser_clear_comparison_callback() + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_nvfuser_comparison_callbacks_without_fallback(self): + try: + fused_result = None + unfused_result = None + graph_ir = None + + def callback(fused_outputs, unfused_outputs, graph_str): + nonlocal unfused_result + nonlocal fused_result + nonlocal graph_ir + if len(unfused_outputs) > 0: + unfused_result = unfused_outputs[-1] + fused_result = fused_outputs[-1] + graph_ir = graph_str + 
torch._C._jit_nvfuser_set_comparison_callback(False, callback) + + def fn(x, y): + z = torch.add(x, y) + return torch.relu(z) + + x = torch.rand((4, 4)).cuda() - 0.5 + y = torch.rand((4, 4)).cuda() - 0.5 + + fn_s = torch.jit.script(fn) + fn_s(x, y) + fn_s(x, y) + fn_s(x, y) + + expected = fn(x, y) + + self.assertEqual(expected, fused_result) + self.assertEqual(None, unfused_result) + FileCheck().check("aten::add").run(graph_ir) + finally: + torch._C._jit_nvfuser_clear_comparison_callback() + + @unittest.skipIf(not RUN_NVFUSER, "requires NVFuser") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_cuda_fusion_guard_backward(self): + old_guard = torch._C._jit_set_nvfuser_guard_mode(True) + + inp = torch.randn(10, device="cuda", requires_grad=True) + grad = torch.randn(10, device="cuda") + + def f(x): + a = x.cos().cos() + return a + scripted = torch.jit.script(f) + + with profile(activities=[ProfilerActivity.CPU]) as prof: + for _ in range(5): + inp.grad = None + out = scripted(inp) + out.backward(grad) + + # check that we do not have fallback triggered + self.assertEqual(prof.events().table().find("fallback"), -1) + torch._C._jit_set_nvfuser_guard_mode(old_guard) + + # TODO: generalize this + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") + def test_inf_quick_patch(self): + inputs = [torch.tensor([-float('inf'), float('inf'), 4.0], device="cuda"), + torch.tensor([1.0, float('inf'), 4.0], device="cuda"), + torch.tensor([-float('inf'), -1.5, 4.0], device="cuda"), + torch.tensor([1.0, -3.0, float('nan')], device="cuda"), + torch.tensor([-float('inf'), -float('inf'), -float('inf')], device="cuda"), + torch.tensor([float('inf'), float('inf'), float('inf')], device="cuda"), + torch.tensor([float('nan'), float('nan'), float('nan')], device="cuda")] + + def fn_amax(x): + return x.amax(dim=0) + + def fn_amin(x): + return x.amin(dim=0) + + def fn_add_nan(x): + return x.relu() + float('nan') + + def fn_add(x): + return x + 1.0 + + with nvfuser_singleton_fusion(True): + for t in [fn_amax, fn_amin, fn_add, fn_add_nan]: + for x in inputs: + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_clamp_reversed_bound(self): + x = torch.tensor([1., -float('inf'), 2., float('inf'), float('nan')], device="cuda") + + def t(x): + return x.clamp(min=1., max=0.5) + + with nvfuser_singleton_fusion(True): + jit_t = torch.jit.script(t) + self._run_helper(jit_t, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_high_rank_fusion(self): + # currently we want to limit fusion to node with input where rank <= 8 + rank_limit = 8 + shapes = [4 for i in range(rank_limit + 1)] + x = torch.randn(shapes, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x): + return x.relu() + + jit_t = torch.jit.script(t) + for i in range(5): + jit_t(x) + self.assertGraphContainsExactly(jit_t.graph_for(x), FUSION_GUARD, 0) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != 
ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_clamp(self): + x = torch.tensor([1., float('inf'), 2., float('nan'), float('-inf')], device="cuda") + + def clamp_max(x): + return x.clamp(max=1.5) + + def clamp_min_max(x): + return x.clamp(min=1.5) + + def clamp_min(x): + return x.clamp(min=1., max=3.) + + with nvfuser_singleton_fusion(True): + for t in [clamp_max, clamp_min, clamp_min_max]: + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_device_constant(self): + x = torch.randn(4, 2, device="cuda") + + def t(x): + return torch.rand_like(x, device=torch.device(type='cuda')) + + # cpu tensor shouldn't be fused + def t_cpu(x): + return torch.rand_like(x, device=torch.device(type='cpu')) + + with nvfuser_singleton_fusion(True): + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + + t_cpu_jit = torch.jit.script(t_cpu) + for i in range(5): + t_cpu_jit(x) + + self.assertGraphContainsExactly(t_cpu_jit.graph_for(x), FUSION_GUARD, 0) + + +class TestPassManagerCudaFuser(JitTestCase): + def setUp(self): + super().setUp() + if RUN_NVFUSER: + self.is_enabled = torch._C._jit_set_nvfuser_enabled(False) + + def tearDown(self): + if RUN_NVFUSER: + torch._C._jit_set_nvfuser_enabled(self.is_enabled) + super().tearDown() + + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_context_manager_test(self): + x = torch.randn(4, 8, dtype=torch.float, device="cuda") + y = torch.randn(4, 8, dtype=torch.float, device="cuda") + with torch.jit.fuser('fuser2'): + with torch.jit.fuser('fuser2'): + + def t1(x, y): + o = x + y + o = o + 2.0 + return o + t_jit = torch.jit.script(t1) + t_jit(x, y) + t_jit(x, y) + self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) + + def t2(x, y): + o = x + y + o = o + 3.0 + return o + t_jit_2 = torch.jit.script(t2) + t_jit_2(x, y) + t_jit_2(x, y) + self.assertGraphContains(t_jit_2.graph_for(x, y), FUSION_GUARD) + + def t3(x, y): + o = x + y + o = o + 4.0 + return o + t_jit_3 = torch.jit.script(t3) + t_jit_3(x, y) + t_jit_3(x, y) self.assertGraphContainsExactly(t_jit_3.graph_for(x, y), FUSION_GUARD, 0) - @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") def test_register_fuser(self): self.assertFalse(torch._C._jit_set_nvfuser_enabled(True)) self.assertTrue(torch._C._jit_nvfuser_enabled()) @@ -3198,6 +4817,128 @@ def test_register_fuser(self): self.assertTrue(torch._C._jit_set_nvfuser_enabled(False)) self.assertFalse(torch._C._jit_nvfuser_enabled()) + @unittest.skipIf(RUN_CUDA, "Testing on CPU only") + def test_register_fuser_cpu(self): + with self.assertRaises(RuntimeError): + torch._C._jit_set_nvfuser_enabled(True) + torch._C._jit_set_nvfuser_enabled(False) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(not TEST_WITH_ROCM, "ROCM test only") + def test_register_fuser_rocm(self): + with self.assertRaises(RuntimeError): + torch._C._jit_set_nvfuser_enabled(True) + torch._C._jit_set_nvfuser_enabled(False) + +# See TestNNCOpInfoParent +class TestCudaFuserOpInfoParent(JitCommonTestCase): + pass + +class TestCudaFuserOpInfo(TestCudaFuserOpInfoParent): + def setUp(self): + super(TestCudaFuserOpInfoParent, self).setUp() + if RUN_NVFUSER: + 
self.cuda_fuser_options = CudaFuserTestOptions() + # enables guard mode since tracing could change graph to violate guard. + torch._C._jit_set_nvfuser_guard_mode(True) + self.nvfuser_single_node_mode = torch._C._jit_set_nvfuser_single_node_mode(True) + + def tearDown(self): + if RUN_NVFUSER: + self.cuda_fuser_options.restore() + + torch._C._jit_set_nvfuser_single_node_mode(self.nvfuser_single_node_mode) + + super(TestCudaFuserOpInfoParent, self).tearDown() + + @slowTest + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @ops(op_db, dtypes=OpDTypes.supported) + def test_nvfuser_correctness(self, device, dtype, op): + variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) + + for variant, sample in variant_sample_pairs: + trace = create_traced_fn(self, variant, cache_traced_fn=True) + ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + val = trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + self.assertEqual(ref, val, exact_layout=True) + + # Note: Clearing CU after NVFuser tests + # https://github.com/pytorch/pytorch/issues/35600 + # each torch.jit.trace adds state to the _python_cu compilation unit + # since this test traces a lot of functions, out-of-memory can occur + # if the CU is not cleared. + torch.jit._state._python_cu.drop_all_functions() + + @slowTest + @unittest.skipIf(not RUN_NVFUSER, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + @ops(op_db, allowed_dtypes=(torch.float16, torch.bfloat16, torch.float32, + torch.float64, torch.complex64, torch.complex128)) + def test_nvfuser_extremal_values(self, device, dtype, op): + variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) + + def _get_extremal_tensor(x, val, dtype): + if x.dtype != dtype: + return x + return torch.full_like(x, val) + + def _get_extremal_input(x, val, dtype): + if isinstance(x, torch.Tensor): + return _get_extremal_tensor(x, val, dtype) + elif is_iterable_of_tensors(x): + return [_get_extremal_tensor(y, val, dtype) for y in x] + return x + + def _get_extremal_sample(sample: SampleInput, val, dtype): + extremal_sample = SampleInput( + input=_get_extremal_input(sample.input, val, dtype), + args=[_get_extremal_input(x, val, dtype) for x in sample.args], + kwargs={k: _get_extremal_input(v, val, dtype) for k, v in sample.kwargs.items()}, + ) + return extremal_sample + + def _get_extremal_samples(sample: SampleInput, dtype): + vals = [float('inf'), float('-inf'), float('nan')] + if dtype.is_complex: + complex_vals = itertools.product(vals, vals) + vals = list(map(lambda x: complex(*x), complex_vals)) + for val in vals: + yield _get_extremal_sample(sample, val, dtype) + + variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) + + for variant, sample in variant_sample_pairs: + + trace = create_traced_fn(self, variant, cache_traced_fn=True) + trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) + + for extremal_sample in _get_extremal_samples(sample, dtype): + try: + with freeze_rng_state(): + ref = variant(*clone_inputs((extremal_sample.input, *extremal_sample.args)), + **extremal_sample.kwargs) + except (torch._C._LinAlgError, RuntimeError, ValueError): + # if eager errors out, then don't expect NVFuser to pass + continue + + with freeze_rng_state(): + val = 
trace(*clone_inputs((extremal_sample.input, *extremal_sample.args)), + **extremal_sample.kwargs) + + self.assertEqual(val, ref, equal_nan=True, exact_device=True) + + # See [Note: Clearing CU after NVFuser tests] + torch.jit._state._python_cu.drop_all_functions() + +instantiate_device_type_tests(TestCudaFuserOpInfo, globals(), only_for=("cuda")) + if __name__ == '__main__': run_tests() diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index a548a8df4c8c..cb14fe573358 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -18,7 +18,7 @@ # inferred erroneously runs or skips # some tests torch._C._jit_set_profiling_executor(True) -torch._C._jit_set_profiling_mode(True) +torch._C._get_graph_executor_optimize(True) from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, \ enable_profiling_mode_for_profiling_tests, slowTest @@ -82,6 +82,7 @@ def inline_fusion_groups(): class TestTEFuser(JitTestCase): def setUp(self): + super().setUp() self.tensorexpr_options = TensorExprTestOptions() # note: `self.dynamic_shapes` instatiated in specialization of class @@ -109,6 +110,7 @@ def setUp(self): def tearDown(self): self.tensorexpr_options.restore() torch._C._jit_set_fusion_strategy(self.old_fusion_strategy) + super().tearDown() def assertAllFused(self, graph, except_for=None): except_for = except_for if except_for is not None else set() @@ -1321,82 +1323,114 @@ def test_isnan(self): " ".join(["Failed:", str(dtype), 'isnan', device]) ) - def test_unary_ops(self): + def test_gelu(self): def apply(fn): - return lambda x: fn(x) + return lambda x, approximate: fn(x, approximate) unary_ops = [ - torch.lgamma, - torch.sigmoid, - torch.reciprocal, - torch.neg, - torch.relu, - F.relu6, - torch.log, - torch.log10, - torch.log1p, - torch.log2, - torch.exp, - torch.expm1, - torch.erf, - torch.erfc, - torch.cos, - torch.sin, - torch.tan, - torch.acos, - torch.asin, - torch.cosh, - torch.sinh, - torch.atan, - torch.tanh, - F.hardtanh, - F.hardsigmoid, - F.hardswish, - F.softplus, - torch.sqrt, - torch.rsqrt, F.gelu, - torch.abs, - torch.ceil, - torch.floor, - torch.round, - torch.trunc, - torch.frac, - # TODO: broken on ROCm? - # F.hardshrink, - F.leaky_relu, - lambda x: torch.threshold(x, 0, -10), - lambda x: torch.clamp(x, -10, 10), ] - gpu_only = {torch.erf, torch.erfc} sizes = [(1,), (2,), (4, 4)] for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed if dtype in [torch.float16, torch.bfloat16] and device == "cpu": continue - # todo - re-enable. fails with .500 - if dtype == torch.bfloat16 and op == torch.round: - continue - if op in gpu_only and device == "cpu": - continue try: x = self.data_for(dtype, device, size=size) + cond = self.data_for(torch.bool, device) fn = apply(op) - ref = fn(x) + ref = fn(x, cond) except Exception: # If eager mode doesn't support a dtype/op/device combo, # neither does the fuser. Catch everything to avoid needing to # guess what errors might be thrown by eager. 
continue try: - t = torch.jit.trace(fn, (x,)) - torch.testing.assert_close(ref, t(x)) - self.assertAllFused(t.graph_for(x)) + t = torch.jit.trace(fn, (x, cond)) + torch.testing.assert_close(ref, t(x, cond)) + self.assertAllFused(t.graph_for(x, cond)) except Exception as e: raise RuntimeError( " ".join(["Failed:", str(dtype), op.__name__, device, str(size)]) ) + def test_unary_ops(self): + with torch._jit_internal._disable_emit_hooks(): + def apply(fn): + return lambda x: fn(x) + + unary_ops = [ + torch.lgamma, + torch.sigmoid, + torch.reciprocal, + torch.neg, + torch.relu, + F.relu6, + torch.log, + torch.log10, + torch.log1p, + torch.log2, + torch.exp, + torch.expm1, + torch.erf, + torch.erfc, + torch.cos, + torch.sin, + torch.tan, + torch.acos, + torch.asin, + torch.cosh, + torch.sinh, + torch.atan, + torch.tanh, + F.hardtanh, + F.hardsigmoid, + F.hardswish, + F.softplus, + torch.sqrt, + torch.rsqrt, + torch.abs, + torch.ceil, + torch.floor, + torch.round, + torch.trunc, + torch.frac, + # TODO: broken on ROCm? + # F.hardshrink, + F.leaky_relu, + lambda x: torch.threshold(x, 0, -10), + # TODO: broken since type promotion was added + # lambda x: torch.clamp(x, -10, 10), + ] + gpu_only = {torch.erf, torch.erfc} + sizes = [(1,), (2,), (4, 4)] + for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": + continue + # todo - re-enable. fails with .500 + if dtype == torch.bfloat16 and op == torch.round: + continue + if op in gpu_only and device == "cpu": + continue + try: + x = self.data_for(dtype, device, size=size) + fn = apply(op) + ref = fn(x) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x,)) + torch.testing.assert_close(ref, t(x)) + self.assertAllFused(t.graph_for(x)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device, str(size)]) + ) + def test_binary_ops(self): def apply(fn): return lambda x, y: fn(x, y) @@ -1562,47 +1596,48 @@ def fn(x, y): ) def test_binary_tensor_scalar_ops(self): - def apply_with_scalar(fn, scalar): - return lambda x: fn(x, scalar) - - # FIXME: Fails in IR Eval: torch.int64 and_ cpu - binary_ops = [ - operator.__and__, - operator.__or__, - operator.__xor__, - torch.add, - torch.sub, - torch.mul, - torch.eq, - torch.ne, - torch.ge, - torch.lt, - torch.gt, - ] - devices = self.devices - # Maybe we should split this into separate tests to speed it up by - # only using scalar values relevant to particular ops - scalars = [1.5, 3, 0, -2.0, -1] - for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): - if dtype in [torch.float16, torch.bfloat16] and device == "cpu": - continue - try: - x = self.data_for(dtype, device) - fn = apply_with_scalar(op, scalar) - ref = fn(x) - except Exception: - # If eager mode doesn't support a dtype/op/device combo, - # neither does the fuser. Catch everything to avoid needing to - # guess what errors might be thrown by eager. 
- continue - try: - t = torch.jit.trace(fn, (x)) - self.assertEqual(ref, t(x)) - self.assertAllFused(t.graph_for(x)) - except Exception as e: - raise RuntimeError( - " ".join(["Failed:", str(dtype), op.__name__, device]) - ) + with torch._jit_internal._disable_emit_hooks(): + def apply_with_scalar(fn, scalar): + return lambda x: fn(x, scalar) + + # FIXME: Fails in IR Eval: torch.int64 and_ cpu + binary_ops = [ + operator.__and__, + operator.__or__, + operator.__xor__, + torch.add, + torch.sub, + torch.mul, + torch.eq, + torch.ne, + torch.ge, + torch.lt, + torch.gt, + ] + devices = self.devices + # Maybe we should split this into separate tests to speed it up by + # only using scalar values relevant to particular ops + scalars = [1.5, 3, 0, -2.0, -1] + for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": + continue + try: + x = self.data_for(dtype, device) + fn = apply_with_scalar(op, scalar) + ref = fn(x) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x)) + self.assertEqual(ref, t(x)) + self.assertAllFused(t.graph_for(x)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) def test_binary_div_ops(self): def apply_with_scalar(fn, scalar): @@ -2307,6 +2342,59 @@ def f(x): scr(x) self.assertLastGraphAllFused() + def test_with_strict_fusion(self): + + def success(x): + with torch.jit.strict_fusion(): + return x + x + x + + scripted = self.checkScript(success, (torch.rand([4]),)) + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_not("aten::add").check("prim::TensorExprGroup").run(g) + + def foo(x): + with torch.jit.strict_fusion(): + return x + x + torch.rand([4]) + 3 + + with self.assertRaises(Exception) as error_out: + foo_s = torch.jit.script(foo) + foo_s(torch.rand([4])) + foo_s(torch.rand([4])) + print(torch.jit.last_executed_optimized_graph()) + fc = FileCheck().check("Found unfused operators") + fc.check("aten::rand(int[] size") + fc.check("torch.rand([4]").run(str(error_out.exception)) + + with warnings.catch_warnings(record=True) as warns: + foo(torch.rand([4])) + + FileCheck().check("Only works in script mode").run(str(warns[0])) + + def test_autodiff(x): + with torch.jit.strict_fusion(): + return torch.rand([4]) + x + x + x + + foo_s = torch.jit.script(test_autodiff) + inp = torch.rand([4], requires_grad=True) + with self.assertRaises(Exception) as error_out: + for _ in range(3): + foo_s(inp) + f = FileCheck().check("unfused operators").check("aten::rand") + f.run(str(error_out.exception)) + + def test_separate_fusions(x, y): + with torch.jit.strict_fusion(): + return x + x + x, y + y + y + + inp = torch.rand([4], requires_grad=True) + with self.assertRaises(Exception) as error_out: + for _ in range(3): + foo_s = torch.jit.script(test_separate_fusions) + foo_s(inp, inp) + + f = FileCheck().check("Found multiple fusions") + f.run(str(error_out.exception)) + class TestTEFuserStatic(TestTEFuser): dynamic_shapes = False @@ -2367,7 +2455,6 @@ class TestTEFuserDynamic(TestTEFuser): 'mul', 'ne', 'neg', - 'nn.functional.gelu', 'nn.functional.hardshrink', 'nn.functional.hardsigmoid', 'nn.functional.hardswish', @@ -2444,12 +2531,21 @@ def get_name(op): l.append(op.variant_test_name) return '.'.join(l) -class 
TestNNCOpInfo(JitCommonTestCase): +# Purpose of this class is to allow super() calls. +# super() [with no arguments] fails, presumably because of how instantiate_device_type_tests works. +# super(TestNNCOpInfo, self) fails because TestNNCOpInfo gets deleted from global scope. +# super(JitCommonTestCase, self).fn() would skip JitCommonTestCase.fn() implementation +class TestNNCOpInfoParent(JitCommonTestCase): + pass + +class TestNNCOpInfo(TestNNCOpInfoParent): def setUp(self): + super(TestNNCOpInfoParent, self).setUp() self.tensorexpr_options = TensorExprTestOptions() def tearDown(self): self.tensorexpr_options.restore() + super(TestNNCOpInfoParent, self).tearDown() def te_compile(self, device, dtype, op): if op.name in skip_ops: @@ -2531,7 +2627,7 @@ def test_nnc_correctness(self, device, dtype, op): variant_sample_pairs = get_traced_sample_variant_pairs(device, dtype, op) for variant, sample in variant_sample_pairs: - trace = create_traced_fn(self, variant) + trace = create_traced_fn(self, variant, cache_traced_fn=True) ref = variant(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) trace(*clone_inputs((sample.input, *sample.args)), **sample.kwargs) @@ -2549,9 +2645,13 @@ def test_nnc_correctness(self, device, dtype, op): only_for = ("cpu", "cuda") instantiate_device_type_tests(TestNNCOpInfo, globals(), only_for=only_for) +# Purpose of this class is to allow super() calls. (See TestNNCOpInfoParent) +class TestLoopnestRandomizationParent(JitTestCase): + pass -class TestLoopnestRandomization(JitTestCase): +class TestLoopnestRandomization(TestLoopnestRandomizationParent): def setUp(self): + super(TestLoopnestRandomizationParent, self).setUp() self.old_cpu_fuser_state = torch._C._jit_can_fuse_on_cpu() self.old_must_use_cpu_state = torch._C._jit_get_te_must_use_llvm_cpu() self.old_gpu_fuser_state = torch._C._jit_can_fuse_on_gpu() @@ -2562,7 +2662,7 @@ def setUp(self): torch._C._jit_override_can_fuse_on_gpu(True) self.old_profiling_executor = torch._C._jit_set_profiling_executor(True) - self.old_profiling_mode = torch._C._jit_set_profiling_mode(True) + self.old_profiling_mode = torch._C._get_graph_executor_optimize(True) self.old_fusion_inlining = torch._C._debug_get_fusion_group_inlining() torch._C._debug_set_fusion_group_inlining(False) @@ -2579,7 +2679,7 @@ def setUp(self): def tearDown(self): torch._C._jit_set_profiling_executor(self.old_profiling_executor) - torch._C._jit_set_profiling_mode(self.old_profiling_mode) + torch._C._get_graph_executor_optimize(self.old_profiling_mode) torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuser_state) torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state) @@ -2591,6 +2691,7 @@ def tearDown(self): # Set it back to 0. 
os.environ["PYTORCH_TENSOREXPR_RANDOM_TRANSFORM_SEED"] = "0" + super(TestLoopnestRandomizationParent, self).tearDown() @onlyCPU @unittest.skipIf(not LLVM_ENABLED, "Compiles with TensorExprKernel") diff --git a/test/test_jit_llga_fuser.py b/test/test_jit_llga_fuser.py new file mode 100644 index 000000000000..1e79b745d2c1 --- /dev/null +++ b/test/test_jit_llga_fuser.py @@ -0,0 +1,519 @@ +# Owner(s): ["module: mkldnn"] +import torch +import unittest +import itertools + +import torch.nn as nn +import torch.nn.functional as F +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import run_tests, TEST_SCIPY, IS_WINDOWS, IS_MACOS + +LLGA_FUSION_GROUP = 'prim::oneDNNFusionGroup' +LLGA_NOT_ENABLED = not torch._C.has_mkldnn or IS_WINDOWS or IS_MACOS + + +def warmup_forward(f, *args, profiling_count=2): + for i in range(profiling_count): + results = f(*args) + + return results + + +class JitLlgaTestCase(JitTestCase): + def setUp(self): + torch.jit.enable_onednn_fusion(True) + + def tearDown(self): + torch.jit.enable_onednn_fusion(False) + + def checkTrace(self, m, x, *args, **kwargs): + if isinstance(m, torch.nn.Module): + m.eval() + with torch.no_grad(), \ + torch._jit_internal._disable_emit_hooks(): + traced = torch.jit.trace(m, x) + if isinstance(m, torch.nn.Module): + traced = torch.jit.freeze(traced) + warmup_forward(traced, *x) + fwd_graph = traced.graph_for(*x) + + ref_o = m(*x) + jit_o = traced(*x) + self.assertEqual(jit_o, ref_o) + return traced, fwd_graph + + def assertFused(self, graph, fused_patterns): + for pat in fused_patterns: + self.assertGraphContainsExactly(graph, pat, 0) + + +try: + import torchvision + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False +except RuntimeError: + HAS_TORCHVISION = False +skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, 'no torchvision') + +def get_eltwise_fn(name): + if hasattr(torch, name): + return getattr(torch, name) + elif hasattr(F, name): + return getattr(F, name) + else: + raise NameError('Eltwise function %s not found' % name) + + +@unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled") +class TestOp(JitLlgaTestCase): + def test_conv2d(self): + for [spatial, in_channels, out_channels, kernel, padding, stride, dilation, g, bias] in itertools.product( + [7, 8], + [8, 15], + [7, 16], + [3, 4], + [0, 2], + [1, 2], + [1, 2], + [1, 2], + [True, False]): + + m = nn.Conv2d(in_channels=in_channels * g, + out_channels=out_channels * g, + kernel_size=kernel, + padding=padding, + stride=stride, + dilation=dilation, + groups=g, + bias=bias) + + x = torch.rand(1, in_channels * g, spatial, spatial) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_bn2d(self): + m = nn.BatchNorm2d(32).eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + # single-op partition shouldn't be created for softmax + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn): + super(M, self).__init__() + self.eltwise = eltwise_fn + + def forward(self, x): + return self.eltwise(x) + + for eltwise in ['relu', 'gelu']: + eltwise_fn = get_eltwise_fn(eltwise) + m = M(eltwise_fn) + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + # single-op partition shouldn't be created. 
+ self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_max_pool2d(self): + for [spatial, kernel, padding, stride, dilation, ceil_mode] in itertools.product( + [15, 16, 17, 18, 19], + [4, 5], + [0, 1, 2], + [1, 2], # [1, 2, 4], TODO: fix issue in pad calculation + [1], # [1, 2], TODO: backend support for dilation + [True, False]): + + m = nn.MaxPool2d(kernel_size=kernel, + stride=stride, + padding=padding, + dilation=dilation, + ceil_mode=ceil_mode) + + x = torch.rand(1, 4, spatial, spatial) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_avg_pool2d(self): + for [spatial, kernel, padding, stride, ceil_mode, count_include_pad] in itertools.product( + [15, 16, 17, 18, 19], + [4, 5], + [0, 1, 2], + [1, 2, 4], + [False], # TODO: oneDNN Graph does not fully support ceil_mode=True + [True, False]): + + m = nn.AvgPool2d(kernel_size=kernel, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad) + + x = torch.rand(1, 4, spatial, spatial) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_variable_kernel_avg_pool2d(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + + def forward(self, x): + x = F.avg_pool2d(x, kernel_size=(x.size(2), x.size(3)), padding=0, count_include_pad=False) + return x + + x = torch.randn(1, 1000, 1, 1) + m = M() + _, graph = self.checkTrace(m, [x]) + # kernel_size is not Constant, shouldn't have any LLGA_FUSION_GROUP + # TODO: with shape specialization, should have 1 LLGA_FUSION_GROUP + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_softmax(self): + for dim in [-4, -3, -2, -1, 0, 1, 2, 3]: + m = nn.Softmax(dim=dim) + x = torch.rand(8, 12, 12, 12) + _, graph = self.checkTrace(m, [x]) + # single-op partition shouldn't be created for softmax + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + + def test_linear(self): + for bias in [True, False]: + x = torch.rand(32, 28) + m = torch.nn.Linear(in_features=28, out_features=64, bias=bias) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::linear']) + + def _gen_binary_inputs(self, gen_permute=True): + for xshape, yshape in [ + [[1, 32, 28, 28], [1, 32, 28, 28]], + [[1, 32, 28, 28], [1, 1, 28, 28]], + [[1, 32, 28, 28], [28]], + [[1, 32, 28, 28], [1]], + + ]: + yield torch.rand(xshape), torch.rand(yshape) + if gen_permute and xshape != yshape: + yield torch.rand(yshape), torch.rand(xshape) + + def test_add(self): + def forward_add(x, y): + return torch.add(x, y, alpha=2) + + for x, y in self._gen_binary_inputs(): + _, graph = self.checkTrace(forward_add, [x, y]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_add_scalar(self): + def add_scalar(x): + return 42 + x + 3.14 + + x = torch.rand(32, 32) + _, graph = self.checkTrace(add_scalar, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_addmm(self): + def addmm(x, y, z): + # alpha and beta are 1, by default + return torch.addmm(z, x, y) + + x = torch.rand(64, 32) + y = torch.rand(32, 32) + z = torch.rand(64, 32) + _, graph = self.checkTrace(addmm, [x, y, z]) + # single-op partition should be created for matmul with bias. 
+ self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_mul(self): + def forward_mul(x, y): + return torch.mul(x, y) * 3 + + for x, y in self._gen_binary_inputs(): + _, graph = self.checkTrace(forward_mul, [x, y]) + # single-op partitions shouldn't be created + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_identity_binary(self): + def forward(x): + return x * 1 + 0.0 + + x = torch.rand(32) + _, graph = self.checkTrace(forward, [x]) + self.assertFused(graph, ['aten::add', 'aten::mul']) + + def test_layer_norm(self): + # TODO: support more normalized_shape + m = torch.nn.LayerNorm(10) + x = torch.randn(2, 5, 10, 10) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_cat(self): + def cat_along_dim(d): + def forward_cat(*inputs): + return torch.cat(inputs, d) + return forward_cat + + for xshape in [ + [8, 8, 8, 8], + [64, 8, 32], + [2048, 64], + ]: + for d in range(len(xshape)): + x = torch.rand(xshape) + _, graph = self.checkTrace(cat_along_dim(d), [x, x, x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + + def test_typecheck(self): + x = torch.rand(32, 28) + m = torch.nn.Linear(in_features=28, out_features=64, bias=True) + traced, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::linear']) + # change the shape of the input, we should enter fallback graph + x = torch.rand(5, 28) + self.assertEqual(m(x), traced(x)) + + +@unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled") +class TestFusionPattern(JitLlgaTestCase): + def test_conv2d_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=False) + self.eltwise = eltwise_fn + + def forward(self, x): + x = self.conv1(x) + x = self.eltwise(x) + x = self.conv2(x) + x = self.eltwise(x) + return x + + # for eltwise in ['relu', 'sigmoid', 'sqrt', 'abs', 'square', 'hardtanh']: + for eltwise in ['relu']: + for inplace in [True, False]: + eltwise_fn_name = eltwise + '_' if inplace else eltwise + eltwise_fn = get_eltwise_fn(eltwise_fn_name) + + m = M(eltwise_fn) + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2) + # test if relu_ is replace with relu by mutation removal pass + self.assertFused(graph, ['aten::' + eltwise_fn_name]) + # test if relu is fused into the fusion group + self.assertFused(graph, ['aten::' + eltwise]) + + def test_conv2d_bn(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.bn1 = nn.BatchNorm2d(32) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + return x + + m = M().eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::_convolution', 'aten::batch_norm']) + + + def test_conv2d_bn_relu(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.bn1 = nn.BatchNorm2d(32) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + return x + + m = M().eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, 
LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::_convolution', 'aten::batch_norm', + 'aten::relu']) + + def test_bn2d_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn): + super(M, self).__init__() + self.eltwise = eltwise_fn + self.bn = nn.BatchNorm2d(32) + + def forward(self, x): + x = self.bn(x) + x = self.eltwise(x) + return x + + for eltwise in ['relu']: + eltwise_fn = get_eltwise_fn(eltwise) + m = M(eltwise_fn).eval() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::' + eltwise]) + + def test_linear_eltwise(self): + class M(nn.Module): + def __init__(self, eltwise_fn, bias): + super(M, self).__init__() + self.linear = nn.Linear(28, 64, bias) + self.eltwise = eltwise_fn + + def forward(self, x): + x = self.linear(x) + x = self.eltwise(x) + return x + + for [has_bias, eltwise] in itertools.product( + [True, False], + ['relu', 'gelu', 'sigmoid', 'hardtanh', 'relu6', 'elu']): + + eltwise_fn = get_eltwise_fn(eltwise) + m = M(eltwise_fn, has_bias) + x = torch.rand(32, 28, requires_grad=False) + _, graph = self.checkTrace(m, [x]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::' + eltwise]) + + def test_conv2d_sum(self): + class M(nn.Module): + def __init__(self, bias=False): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=bias) + self.bn1 = nn.BatchNorm2d(32) + self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=bias) + self.bn2 = nn.BatchNorm2d(32) + self.relu = nn.ReLU() + self.conv3 = nn.Conv2d(32, 32, 3, padding=1, bias=bias) + self.bn3 = nn.BatchNorm2d(32) + + def forward(self, x, y): + x = self.conv1(x) + x = self.bn1(x) + y = self.conv2(y) + y = self.bn2(y) + z = self.relu(x + y) + z = self.conv3(z) + z = self.bn3(z) + return z + + for bias in [True, False]: + m = M(bias).eval() + x = torch.rand(1, 32, 16, 16, requires_grad=False) + y = torch.rand(1, 32, 16, 16, requires_grad=False) + _, graph = self.checkTrace(m, [x, y]) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3) + + def test_wildcard(self): + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True) + self.eltwise = nn.ReLU() + + def forward(self, x): + x = self.conv1(x) + y = self.eltwise(x) + return [x, y] + + # The pattern is as the following: + # conv + # | \ + # eltwise \ + # | \ + # ListConstruct + # + # The output of conv is used by a wildcard op: ListConstruct. + # Thus conv-eltwise cannot be selected into the same Partition. 
+ m = M() + x = torch.rand(1, 32, 28, 28) + _, graph = self.checkTrace(m, [x]) + # conv can exist in a single-op oneDNN Graph partition but not relu + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertFused(graph, ['aten::_convolution']) + + def test_rewrap_tensor_input_to_pytorch(self): + class M(nn.Module): + def __init__(self, eltwise_fn, data_type): + super(M, self).__init__() + self.conv1 = nn.Conv2d(32, 32, 3, padding=1, bias=True, dtype=data_type) + self.conv2 = nn.Conv2d(32, 32, 3, padding=1, bias=True, dtype=data_type) + self.eltwise = eltwise_fn + self.adaptive_avg_pool_2d = nn.AdaptiveAvgPool2d((5, 7)) + + def forward(self, x, y): + x = self.conv1(x) + x = self.eltwise(x) + x = self.conv2(x) + x = self.eltwise(x) + x = torch.add(x, y) + x = self.adaptive_avg_pool_2d(x) + return x + + eltwise_fn_name = 'relu' + eltwise_fn = get_eltwise_fn(eltwise_fn_name) + # Add bfloat16 later + for data_type in [torch.float]: + m = M(eltwise_fn, data_type) + m = m.to(memory_format=torch.channels_last) + x = torch.rand(1, 32, 28, 28, dtype=data_type).to(memory_format=torch.channels_last) + y = torch.rand(1, 32, 28, 28, dtype=data_type).to(memory_format=torch.channels_last) + # Simply test if the output is accurate + # The output of the second partition is input to adaptive_avg_pool2d, which is + # unsupported by LLGA, so it must be handled by PyTorch, which should receive + # correct strides info of the channels-last tensor. + graph, _ = self.checkTrace(m, [x, y]) + + +@unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled") +class TestModel(JitLlgaTestCase): + @skipIfNoTorchVision + def _test_vision(self, model_name): + m = getattr(torchvision.models, model_name)().eval() + x = torch.rand(1, 3, 224, 224) / 10 + _, graph = self.checkTrace(m, [x]) + self.assertFused(graph, ['aten::_convolution', 'aten::batch_norm', + 'aten::relu', 'aten::linear', + 'aten::avg_pool2d', 'aten::max_pool2d']) + + +for model_name, enabled in [ + ['resnet50', True], + ['resnext50_32x4d', True], + ['resnext101_32x8d', True], + ['densenet121', True], + ['googlenet', TEST_SCIPY], + ['mobilenet_v2', True], + ['mnasnet1_0', True], + ['squeezenet1_0', True], + ['vgg16', True], + ['alexnet', True], + ['shufflenet_v2_x1_0', True], + ['wide_resnet50_2', True], +]: + def wrapper(mname): + @unittest.skipIf(not enabled, 'Disabled') + def test(self): + return self._test_vision(mname) + return test + + setattr(TestModel, 'test_vision_%s' % model_name, wrapper(model_name)) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_jiterator.py b/test/test_jiterator.py new file mode 100644 index 000000000000..a92998672ffb --- /dev/null +++ b/test/test_jiterator.py @@ -0,0 +1,132 @@ +# Owner(s): ["module: cuda"] + +import torch +from torch.cuda.jiterator import _create_jit_fn as create_jit_fn +import sys +from itertools import product +from torch.testing._internal.common_utils import TestCase, parametrize, run_tests, TEST_CUDA +from torch.testing._internal.common_dtype import all_types_and_complex_and +from torch.testing._internal.common_device_type import ( + skipCUDAIfRocm, skipCUDAIf, instantiate_device_type_tests, dtypes, toleranceOverride, tol) +from torch.testing._internal.common_cuda import _get_torch_cuda_version + +if not TEST_CUDA: + print('CUDA not available, skipping tests', file=sys.stderr) + TestCase = object # noqa: F811 + + +code_string = "template T my_fused_kernel(T x, T y, T alpha, T beta) { return alpha * x + beta * y; }" +jitted_fn = create_jit_fn(code_string, alpha=1, beta=1) 
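# A minimal usage sketch mirroring the tests below (the helper name is hypothetical and
# assumes a CUDA device is available): the code string handed to create_jit_fn is a C++
# template (e.g. "template <typename T> T my_fused_kernel(T x, T y, T alpha, T beta) { ... }"),
# and the keyword arguments supplied at creation time (alpha=1, beta=1) become defaults
# that individual calls may override.
def _example_jiterator_usage():
    a = torch.rand(3, device='cuda')
    b = torch.rand(3, device='cuda')
    default_out = jitted_fn(a, b)                       # uses the defaults alpha=1, beta=1
    custom_out = jitted_fn(a, b, alpha=2.0, beta=-1.0)  # per-call override of the extra args
    return default_out, custom_out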
+ +def ref_fn(x, y, alpha=1, beta=1): + return alpha * x + beta * y + +class TestPythonJiterator(TestCase): + @skipCUDAIfRocm + @parametrize("shape_strides", [ + (([3, 3], [3, 1]), ([3, 3], [3, 1])), # contiguous + ]) + @dtypes(*product(all_types_and_complex_and(torch.half, torch.bfloat16), + all_types_and_complex_and(torch.half, torch.bfloat16))) + def test_all_dtype_contiguous(self, device, dtypes, shape_strides): + a_buffer = torch.rand(9, device=device).mul(10).type(dtypes[0]) + b_buffer = torch.rand(9, device=device).mul(10).type(dtypes[1]) + + a = a_buffer.as_strided(*shape_strides[0]) + b = b_buffer.as_strided(*shape_strides[1]) + + expected = ref_fn(a, b) + result = jitted_fn(a, b) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + # See https://github.com/pytorch/pytorch/pull/76394#issuecomment-1118018287 for details + @skipCUDAIf(_get_torch_cuda_version() < (11, 6), "On cuda 11.3, nvrtcCompileProgram is taking too long to " + "compile jiterator generated kernels for non-contiguous input that requires dynamic-casting.") + @parametrize("shape_strides", [ + (([3, 3], [1, 3]), ([3, 1], [1, 3])), # non-contiguous + ]) + @dtypes(*product(all_types_and_complex_and(torch.half, torch.bfloat16), + all_types_and_complex_and(torch.half, torch.bfloat16))) + def test_all_dtype_noncontiguous(self, device, dtypes, shape_strides): + a_buffer = torch.rand(9, device=device).mul(10).type(dtypes[0]) + b_buffer = torch.rand(9, device=device).mul(10).type(dtypes[1]) + + a = a_buffer.as_strided(*shape_strides[0]) + b = b_buffer.as_strided(*shape_strides[1]) + + expected = ref_fn(a, b) + result = jitted_fn(a, b) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + @dtypes(torch.float, torch.double, torch.float16, torch.bfloat16) + @parametrize("alpha", [-1, 2.0, None]) + @parametrize("beta", [3, -4.2, None]) + @toleranceOverride({torch.float16 : tol(atol=1e-2, rtol=1e-3)}) + def test_extra_args(self, device, dtype, alpha, beta): + a = torch.rand(3, device=device).mul(10).type(dtype) + b = torch.rand(3, device=device).mul(10).type(dtype) + + extra_args = {} + if alpha is not None: + extra_args["alpha"] = alpha + if beta is not None: + extra_args["beta"] = beta + + expected = ref_fn(a, b, **extra_args) + result = jitted_fn(a, b, **extra_args) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + def test_bool_extra_args(self, device): + code_string = "template T conditional(T x, T mask, bool is_train) { return is_train ? 
x * mask : x; }" + jitted_fn = create_jit_fn(code_string, is_train=False) + + def ref_fn(x, mask, is_train): + return x * mask if is_train else x + + a = torch.rand(3, device=device) + b = torch.rand(3, device=device) + + expected = ref_fn(a, b, is_train=True) + result = jitted_fn(a, b, is_train=True) + self.assertEqual(expected, result) + + @skipCUDAIfRocm + @parametrize("num_inputs", list(range(1, 9))) + def test_various_num_inputs(self, num_inputs): + inputs = [] + for i in range(num_inputs): + inputs.append(torch.rand(3, device='cuda').mul(10)) + + input_string = ",".join([f"T i{i}" for i in range(num_inputs)]) + function_body = "+".join([f"i{i}" for i in range(num_inputs)]) + code_string = f"template T my_kernel({input_string}) {{ return {function_body}; }}" + jitted_fn = create_jit_fn(code_string) + + def ref_fn(*inputs): + return torch.sum(torch.stack(inputs), dim=0) + + expected = ref_fn(*inputs) + result = jitted_fn(*inputs) + + self.assertEqual(expected, result) + + @skipCUDAIfRocm + @parametrize("code_string", [ + "template T my _kernel(T x) { return x; }", + "template Tmy_kernel(T x) { return x; }", + ]) + def test_invalid_function_name(self, code_string): + with self.assertRaises(Exception): + jitted_fn = create_jit_fn(code_string) + + +instantiate_device_type_tests(TestPythonJiterator, globals(), only_for="cuda") + +if __name__ == '__main__': + run_tests() diff --git a/test/test_linalg.py b/test/test_linalg.py index 6ca35557bbf5..1b0b4d95478a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -22,13 +22,14 @@ (instantiate_device_type_tests, dtypes, has_cusolver, onlyCPU, skipCUDAIf, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfRocm, onlyNativeDeviceTypes, dtypesIfCUDA, - onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver) + onlyCUDA, skipCUDAVersionIn, skipMeta, skipCUDAIfNoCusolver, dtypesIfMPS) from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - all_types, floating_and_complex_types, get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, - get_all_fp_dtypes, + all_types, all_types_and_complex_and, floating_and_complex_types, integral_types, + floating_and_complex_types_and, floating_types_and, complex_types, ) -from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9 +from torch.testing._internal.common_cuda import SM53OrLater, tf32_on_and_off, CUDA11OrLater, CUDA9, _get_magma_version, \ + _get_torch_cuda_version from torch.distributions.binomial import Binomial # Protects against includes accidentally setting the default dtype @@ -101,7 +102,7 @@ def check(a_sizes_, b_sizes_): # Tests torch.outer, and its alias, torch.ger, vs. 
NumPy @precisionOverride({torch.bfloat16: 1e-1}) - @dtypes(*(get_all_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_outer(self, device, dtype): def run_test_case(a, b): if dtype == torch.bfloat16: @@ -138,6 +139,14 @@ def run_test_case(a, b): run_test_case(zero_strided, b) run_test_case(a, zero_strided) + def test_solve_removed_error(self, device): + a = make_tensor(5, 5, device=device, dtype=torch.float32) + b = make_tensor(5, 1, device=device, dtype=torch.float32) + with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"): + torch.solve(b, a) + with self.assertRaisesRegex(RuntimeError, "This function was deprecated since version 1.9 and is now removed"): + b.solve(a) + @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) @@ -264,7 +273,8 @@ def numpy_ref(a, b): else: # driver == 'gelsy' # QR based algorithm; setting the value too high might lead to non-unique solutions and flaky tests - rcond = 1e-4 + # so we skip this case + continue # specifying rcond value has no effect for gels driver so no need to run the tests again if driver == 'gels' and rcond is not None: @@ -744,7 +754,7 @@ def check(m, a, b, beta, alpha): check(m_scalar, a, b, beta, alpha) # test nans and infs are not propagated to the output when beta == 0 - float_and_complex_dtypes = get_all_fp_dtypes() + get_all_complex_dtypes() + float_and_complex_dtypes = floating_and_complex_types_and(torch.half, torch.bfloat16) if beta == 0 and dtype in float_and_complex_dtypes: m[0][10] = m[10][10] = m[20][20] = float('inf') m[1][10] = m[11][10] = m[21][20] = float('nan') @@ -757,7 +767,7 @@ def test_addr_bool(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=False, alpha=False) self._test_addr_vs_numpy(device, dtype, beta=True, alpha=True) - @dtypes(*(get_all_int_dtypes())) + @dtypes(*integral_types()) def test_addr_integral(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'argument beta must not be a floating point number.'): @@ -778,7 +788,7 @@ def test_addr_integral(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=2, alpha=2) @precisionOverride({torch.bfloat16: 1e-1}) - @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_addr_float_and_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'Boolean beta only supported for Boolean results.'): @@ -791,11 +801,11 @@ def test_addr_float_and_complex(self, device, dtype): self._test_addr_vs_numpy(device, dtype, beta=0., alpha=2) # when beta is not zero self._test_addr_vs_numpy(device, dtype, beta=0.5, alpha=2) - if dtype in get_all_complex_dtypes(): + if dtype in complex_types(): self._test_addr_vs_numpy(device, dtype, beta=(0 + 0.1j), alpha=(0.2 - 0.2j)) - @dtypes(*itertools.product(get_all_dtypes(), - get_all_dtypes())) + @dtypes(*itertools.product(all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))) def test_outer_type_promotion(self, device, dtypes): a = torch.randn(5).to(device=device, dtype=dtypes[0]) b = torch.randn(5).to(device=device, dtype=dtypes[1]) @@ -805,7 +815,7 @@ def test_outer_type_promotion(self, device, dtypes): # don't use @dtypes decorator to avoid generating ~1700 tests per device def test_addr_type_promotion(self, device): - for dtypes0, dtypes1, dtypes2 in 
product(get_all_dtypes(), repeat=3): + for dtypes0, dtypes1, dtypes2 in product(all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), repeat=3): a = make_tensor((5,), device=device, dtype=dtypes0, low=-2, high=2) b = make_tensor((5,), device=device, dtype=dtypes1, low=-2, high=2) m = make_tensor((5, 5), device=device, dtype=dtypes2, low=-2, high=2) @@ -1100,96 +1110,65 @@ def test_kron_errors_and_warnings(self, device, dtype): # This test confirms that torch.linalg.norm's dtype argument works # as expected, according to the function's documentation - @skipCUDAIfNoMagma - def test_norm_dtype(self, device): - def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype): - # Determine the best dtype to use for comparisons between tensors - # of two different types - def get_compare_dtype(type0, type1): - types_32bit_based = [torch.float, torch.cfloat] - is_complex = type0.is_complex or type1.is_complex - - if type0 in types_32bit_based or type1 in types_32bit_based: - return torch.cfloat if is_complex else torch.float - else: - return torch.cdouble if is_complex else torch.double - - compare_dtype = get_compare_dtype(from_dtype, to_dtype) - - def get_value_type(dtype): - if dtype == torch.cfloat: - return torch.float - elif dtype == torch.cdouble: - return torch.double - elif dtype == torch.complex32: - return torch.float16 - else: - return dtype + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble, torch.bfloat16, torch.float16) + def test_norm_dtype(self, device, dtype): + make_arg = partial(make_tensor, dtype=dtype, device=device) + def run_test_case(input_size, ord, keepdim, to_dtype): msg = ( f'input_size={input_size}, ord={ord}, keepdim={keepdim}, ' - f'from_dtype={from_dtype}, to_dtype={to_dtype}') - input = torch.randn(*input_size, dtype=from_dtype, device=device) + f'dtype={dtype}, to_dtype={to_dtype}') + input = make_arg(input_size) result = torch.linalg.norm(input, ord, keepdim=keepdim) - if from_dtype.is_complex: - # By default, norm downgrades a complex input to the corresponding real number type - self.assertEqual(result.dtype, get_value_type(from_dtype), msg=msg) - else: - self.assertEqual(result.dtype, from_dtype, msg=msg) + self.assertEqual(result.dtype, input.real.dtype, msg=msg) - result_out = torch.empty((0), dtype=to_dtype, device=device) + result_out = torch.empty((0), dtype=result.dtype, device=device) torch.linalg.norm(input, ord, keepdim=keepdim, out=result_out) - self.assertEqual(result_out.dtype, to_dtype, msg=msg) - self.assertEqual(result.to(compare_dtype), result_out.to(compare_dtype), msg=msg) + self.assertEqual(result, result_out, msg=msg) + result = torch.linalg.norm(input.to(to_dtype), ord, keepdim=keepdim) result_with_dtype = torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype) - self.assertEqual(result_with_dtype.dtype, to_dtype, msg=msg) - - if from_dtype.is_complex: - result_convert_first = torch.linalg.norm(input.to(to_dtype), ord, keepdim=keepdim) - self.assertEqual(result_with_dtype.to(compare_dtype), result_convert_first.to(compare_dtype), msg=msg) - else: - self.assertEqual(result.to(compare_dtype), result_with_dtype.to(compare_dtype), msg=msg) + self.assertEqual(result, result_with_dtype, msg=msg) result_out_with_dtype = torch.empty_like(result_with_dtype) torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype, out=result_out_with_dtype) - self.assertEqual(result_out_with_dtype.dtype, to_dtype, msg=msg) self.assertEqual(result_with_dtype, result_out_with_dtype, msg=msg) - ord_vector = [0, 0.1, -0.1, 1, 
-1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None] + ord_vector = [0, 1, -1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None] + + # In these orders we are computing the 10-th power and 10-th root of numbers. + # We avoid them for half-precision types as it makes the tests above too badly conditioned + if dtype != torch.float16 and dtype != torch.bfloat16: + ord_vector.extend([0.1, -0.1]) ord_matrix = ['fro', 'nuc', 1, -1, 2, -2, inf, -inf, None] S = 10 - test_cases = [ - ((S, ), ord_vector), - ((S, S), ord_matrix), - ] - for keepdim in [True, False]: - for input_size, ord_settings in test_cases: - for ord in ord_settings: - if self.device_type == 'cpu' and not torch._C.has_lapack and ord in [2, -2, 'nuc']: - continue - dtypes = [torch.float, torch.double, torch.cfloat, torch.cdouble] - for from_dtype, to_dtype in itertools.product(dtypes, dtypes): - if from_dtype.is_complex and not to_dtype.is_complex: - continue - run_test_case(input_size, ord, keepdim, from_dtype, to_dtype) - - # Make sure that setting dtype != out.dtype raises an error - dtype_pairs = [ - (torch.float, torch.double), - (torch.double, torch.float), - (torch.cfloat, torch.cdouble), - (torch.cdouble, torch.cfloat), - ] - for keepdim in [True, False]: - for input_size, ord_settings in test_cases: - for ord in ord_settings: - for dtype, out_dtype in dtype_pairs: - input = torch.rand(*input_size) - result = torch.tensor([]).to(out_dtype) - with self.assertRaisesRegex(RuntimeError, r'provided dtype must match dtype of result'): - torch.linalg.norm(input, ord=ord, keepdim=keepdim, dtype=dtype, out=result) + if dtype == torch.cfloat: + norm_dtypes = (torch.cfloat, torch.cdouble) + elif dtype == torch.cdouble: + norm_dtypes = (torch.cdouble,) + elif dtype in (torch.float16, torch.bfloat16, torch.float): + norm_dtypes = (torch.float, torch.double) + elif dtype == torch.double: + norm_dtypes = (torch.double,) + else: + raise RuntimeError("Unsupported dtype") + + for ord, keepdim, norm_dtype in product(ord_vector, (True, False), norm_dtypes): + run_test_case((S,) , ord, keepdim, norm_dtype) + + for ord, keepdim, norm_dtype in product(ord_matrix, (True, False), norm_dtypes): + if ord in [2, -2, 'nuc']: + # We need torch.svdvals + if dtype == torch.float16 or dtype == torch.bfloat16: + continue + + # We need LAPACK or equivalent + if ((torch.device(device).type == 'cuda' and not torch.cuda.has_magma and not has_cusolver()) or + (torch.device(device).type == 'cpu' and not torch._C.has_lapack)): + continue + run_test_case((S, S) , ord, keepdim, norm_dtype) + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble, torch.bfloat16, torch.float16) def test_vector_norm(self, device, dtype): @@ -1218,45 +1197,40 @@ def vector_norm_reference(input, ord, dim=None, keepdim=False, dtype=None): return result def run_test_case(input, ord, dim, keepdim, norm_dtype): - msg = f'input.size()={input.size()}, ord={ord}, dim={dim}, keepdim={keepdim}, dtype={dtype}, norm_dtype={norm_dtype}' - error_msg = None - if input.numel() == 0: - if ord < 0: - error_msg = r'linalg.vector_norm of negative order cannot be performed on an empty tensor' - elif ord == inf and (dim is None or input.size(dim) == 0): - error_msg = ( - r'linalg.vector_norm cannot compute the infinity norm on an empty ' - r'dimension because the operation does not have an identity') - if error_msg is None: + if (input.numel() == 0 and + (ord < 0. or ord == inf) and + (dim is None or input.shape[dim] == 0)): + # The operation does not have an identity. 
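# --- Hedged sketch (not part of the patch): the dtype contract test_norm_dtype now checks.
# A complex input produces a real-valued result (input.real.dtype), and dtype= runs the
# reduction in a wider type, matching a cast-then-norm reference.
import torch

x = torch.randn(10, dtype=torch.cfloat)
assert torch.linalg.norm(x).dtype == torch.float            # == x.real.dtype

y = torch.randn(10)                                          # float32
res_wide = torch.linalg.norm(y, dtype=torch.double)          # accumulate in float64
assert res_wide.dtype == torch.double
assert torch.allclose(res_wide, torch.linalg.norm(y.double()))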
+ error_msg = "linalg.vector_norm cannot compute" + with self.assertRaisesRegex(RuntimeError, error_msg): + torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim) + else: + msg = (f'input.size()={input.size()}, ord={ord}, dim={dim}, ' + f'keepdim={keepdim}, dtype={dtype}, norm_dtype={norm_dtype}') result_dtype_reference = vector_norm_reference(input, ord, dim=dim, keepdim=keepdim, dtype=norm_dtype) result_dtype = torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim, dtype=norm_dtype) + if dtype.is_complex: + result_dtype_reference = result_dtype_reference.real self.assertEqual(result_dtype, result_dtype_reference, msg=msg) if norm_dtype is not None: - result_convert_before = torch.linalg.vector_norm(input.to(norm_dtype), ord, dim=dim, keepdim=keepdim) - if norm_dtype.is_complex: - result_convert_before = result_convert_before.to(norm_dtype) - - result_out = torch.empty((0), dtype=norm_dtype, device=device) - torch.linalg.vector_norm(input, ord, dtype=norm_dtype, dim=dim, keepdim=keepdim, out=result_out) - self.assertEqual(result_convert_before, result_out, msg=msg) - else: - result_out = torch.empty((0), dtype=result_dtype.dtype, device=device) - torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim, out=result_out) - self.assertEqual(result_dtype, result_out, msg=msg) - else: - with self.assertRaises(RuntimeError): - vector_norm_reference(input, ord, dim=dim, keepdim=keepdim) - with self.assertRaisesRegex(RuntimeError, error_msg): - torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim) - - if dtype.is_complex: - norm_dtypes = [None, torch.cfloat, torch.cdouble] + ref = torch.linalg.vector_norm(input.to(norm_dtype), ord, dim=dim, keepdim=keepdim) + actual = torch.linalg.vector_norm(input, ord, dim=dim, keepdim=keepdim, dtype=norm_dtype) + self.assertEqual(ref, actual, msg=msg) + + if dtype == torch.cfloat: + norm_dtypes = (None, torch.cfloat, torch.cdouble) + elif dtype == torch.cdouble: + norm_dtypes = (None, torch.cdouble) + elif dtype in (torch.float16, torch.bfloat16, torch.float): + norm_dtypes = (None, torch.float, torch.double) + elif dtype == torch.double: + norm_dtypes = (None, torch.double) else: - norm_dtypes = [None, torch.float, torch.double, torch.cfloat, torch.cdouble, torch.float16, torch.bfloat16] + raise RuntimeError("Unsupported dtype") for input_size, ord, keepdim, norm_dtype in product(input_sizes, ord_vector, [True, False], norm_dtypes): - input = make_tensor(input_size, device, dtype, low=-9, high=9) + input = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) for dim in [None, random.randint(0, len(input_size) - 1)]: run_test_case( input, @@ -1287,40 +1261,6 @@ def test_vector_norm_dim_tuple_arg(self, device): with self.assertRaises(error): torch.linalg.vector_norm(input, dim=dim) - # Test that linalg.vector_norm throws an error if the out tensor's dtype - # does not match the expected output dtype - @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble, torch.bfloat16, torch.float16) - def test_vector_norm_out_dtype_error(self, device, dtype): - input = torch.randn(10, device=device, dtype=dtype) - dtypes = [None, torch.float, torch.double, torch.cfloat, torch.cdouble, torch.float16, torch.bfloat16] - - for norm_dtype, out_dtype in product(dtypes, dtypes): - if out_dtype is None: - continue - - if norm_dtype is None: - if dtype == torch.cfloat: - expected_dtype = torch.float - elif dtype == torch.cdouble: - expected_dtype = torch.double - else: - expected_dtype = dtype - else: - expected_dtype = norm_dtype 
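# --- Hedged sketch (not part of the patch): the degenerate-input rule the rewritten
# run_test_case encodes. On an empty tensor, orders without an identity (ord < 0 or
# ord == inf) must raise; well-defined orders still return 0.
import torch

empty = torch.empty(0)
assert torch.linalg.vector_norm(empty, ord=2) == 0.0         # identity exists: fine
try:
    torch.linalg.vector_norm(empty, ord=float('inf'))        # no identity: must raise
except RuntimeError as e:
    assert "linalg.vector_norm cannot compute" in str(e)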
- - result = torch.empty((0), device=device, dtype=out_dtype) - msg = f'norm_dtype: {norm_dtype}, out_dtype: {out_dtype}, expected_dtype: {expected_dtype}' - - if dtype.is_complex and norm_dtype is not None and not norm_dtype.is_complex: - with self.assertRaisesRegex(RuntimeError, r"linalg.vector_norm expected complex 'dtype'", msg=msg): - torch.linalg.vector_norm(input, dtype=norm_dtype, out=result) - - elif out_dtype != expected_dtype: - with self.assertRaisesRegex(RuntimeError, r'linalg.vector_norm expected out tensor dtype', msg=msg): - torch.linalg.vector_norm(input, dtype=norm_dtype, out=result) - else: - torch.linalg.vector_norm(input, dtype=norm_dtype, out=result) - # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their vector norm results match @dtypes(torch.float, torch.double) @@ -1363,49 +1303,45 @@ def run_test_case(input, p, dim, keepdim): @skipMeta # https://github.com/pytorch/pytorch/issues/54082 @skipCUDAIfNoMagma @dtypes(torch.float, torch.double) - @precisionOverride({torch.float32: 2e-5}) + @precisionOverride({torch.float32: 2e-4}) def test_norm_matrix(self, device, dtype): + make_arg = partial(make_tensor, dtype=dtype, device=device) + def run_test_case(input, ord, dim, keepdim): msg = f'input.size()={input.size()}, ord={ord}, dim={dim}, keepdim={keepdim}, dtype={dtype}' result = torch.linalg.norm(input, ord, dim, keepdim) input_numpy = input.cpu().numpy() result_numpy = np.linalg.norm(input_numpy, ord, dim, keepdim) - def check(op): - result = op(input, ord, dim, keepdim) - self.assertEqual(result, result_numpy, msg=msg) - result_out = torch.empty_like(result) - op(input, ord, dim, keepdim, out=result_out) - self.assertEqual(result, result_out, msg=msg) - - check(torch.linalg.norm) + result = torch.linalg.norm(input, ord, dim, keepdim) + self.assertEqual(result, result_numpy, msg=msg) if ord is not None and dim is not None: - check(torch.linalg.matrix_norm) + result = torch.linalg.matrix_norm(input, ord, dim, keepdim) + self.assertEqual(result, result_numpy, msg=msg) ord_matrix = [1, -1, 2, -2, inf, -inf, 'nuc', 'fro'] S = 10 test_cases = [ - # input size, p settings, dim - ((S, S), ord_matrix, None), - ((S, S), ord_matrix, (0, 1)), - ((S, S), ord_matrix, (1, 0)), - ((S, S, S, S), ord_matrix, (2, 0)), - ((S, S, S, S), ord_matrix, (-1, -2)), - ((S, S, S, S), ord_matrix, (-1, -3)), - ((S, S, S, S), ord_matrix, (-3, 2)), + # input size, dim + ((S, S), None), + ((S, S), (0, 1)), + ((S, S), (1, 0)), + ((S, S, S, S), (2, 0)), + ((S, S, S, S), (-1, -2)), + ((S, S, S, S), (-1, -3)), + ((S, S, S, S), (-3, 2)), ] - L = 1_000 - if dtype == torch.double: - test_cases.append(((L, L), ord_matrix, None)) - - for keepdim in [True, False]: - for input_size, ord_settings, dim in test_cases: - input = torch.randn(*input_size, dtype=dtype, device=device) - for ord in ord_settings: - if self.device_type == 'cpu' and not torch._C.has_lapack and ord in [2, -2, 'nuc']: - continue - run_test_case(input, ord, dim, keepdim) + for (shape, dim), keepdim, ord in product(test_cases, [True, False], ord_matrix): + if ord in [2, -2, 'nuc']: + # We need torch.svdvals + if dtype == torch.float16 or dtype == torch.bfloat16: + continue + # We need LAPACK or equivalent + if ((torch.device(device).type == 'cuda' and not torch.cuda.has_magma and not has_cusolver()) or + (torch.device(device).type == 'cpu' and not torch._C.has_lapack)): + continue + run_test_case(make_arg(shape), ord, dim, keepdim) @onlyCUDA @@ -1573,20 +1509,17 @@ def run_error_test_case(input, ord, dim, keepdim, 
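# --- Hedged sketch (not part of the patch): the NumPy comparison pattern test_norm_matrix
# uses, restricted to orders that do not need an SVD (so no LAPACK/MAGMA requirement).
import torch
import numpy as np

x = torch.randn(10, 10, dtype=torch.float64)
for p in (1, -1, float('inf'), 'fro', None):
    res = torch.linalg.norm(x, p, dim=(0, 1))
    ref = np.linalg.norm(x.numpy(), p, (0, 1))
    np.testing.assert_allclose(res.numpy(), ref, rtol=1e-10)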
error_type, error_regex): S = 10 error_test_cases = [ # input size, p settings, dim, error type, error regex - ((S, ), ['fro'], None, RuntimeError, r'order "fro" can only be used if either len\(dim\) == 2'), - ((S, ), ['nuc'], None, RuntimeError, r'order "nuc" can only be used if either len\(dim\) == 2'), - ((S, S), [3.5], None, RuntimeError, r'Order 3.5 not supported for matrix norm'), - ((S, S), [0], None, RuntimeError, r'Order 0 not supported for matrix norm'), - ((S, S), ['nuc'], 0, RuntimeError, r'order "nuc" can only be used if either len\(dim\) == 2'), - ((S, S), ['fro'], 0, RuntimeError, r'order "fro" can only be used if either len\(dim\) == 2'), - ((S, S), ['nuc'], (0, 0), RuntimeError, r'duplicate or invalid dimensions'), - ((S, S), ['fro', 0], (0, 0), RuntimeError, r'Expected dims to be different'), - ((S, S), ['fro', 'nuc', 0], (0, 4), IndexError, r'Dimension out of range'), + ((S, ), ['fro', 'nuc'], None, RuntimeError, r'input tensor must be a matrix or a batch of matrices'), + ((S, S), [3.5], None, RuntimeError, r'matrix_norm: Order 3.5 not supported'), + ((S, S), [0], None, RuntimeError, r'matrix_norm: Order 0 not supported'), + ((S, S), ['fail'], None, RuntimeError, r'matrix_norm: Order fail not supported'), + ((S, S), ['fro', 'nuc'], 0, RuntimeError, r'matrix_norm: dim must be a 2-tuple of ints'), + ((S, S), ['fro', 'nuc', 2], (0, 0), RuntimeError, r'Expected dims to be different'), + ((S, S), ['fro', 'nuc', 2], (0, 4), IndexError, r'Dimension out of range'), ((S, ), [0], (4, ), IndexError, r'Dimension out of range'), ((S, ), [None], (0, 0), RuntimeError, r'dim 0 appears multiple times'), - ((S, S, S), [1], (0, 1, 2), RuntimeError, r"'dim' must specify 1 or 2 dimensions"), - ((S, S, S), [1], None, RuntimeError, r"'dim' must specify 1 or 2 dimensions"), - ((S, S), ['garbage'], (0, 1), RuntimeError, r'Invalid norm order: garbage'), + ((S, S, S), [1], (0, 1, 2), RuntimeError, r"If dim is specified, it must be of length 1 or 2."), + ((S, S, S), [1], None, RuntimeError, r"If dim is not specified but ord is, the input must be 1D or 2D"), ] for keepdim in [True, False]: for input_size, ord_settings, dim, error_type, error_regex in error_test_cases: @@ -1619,10 +1552,10 @@ def gen_error_message(input_size, ord, keepdim, dim=None): self.assertEqual(res.shape, expected.shape, msg=msg) self.assertEqual(res, expected, msg=msg, exact_dtype=False) - res_out = torch.tensor([]).to(device) + res_out = torch.tensor([], device=device, dtype=res.dtype) torch.linalg.norm(x, ord, keepdim=keepdim, out=res_out) self.assertEqual(res_out.shape, expected.shape, msg=msg) - self.assertEqual(res_out.cpu(), expected, msg=msg, exact_dtype=False) + self.assertEqual(res_out, expected, msg=msg) # matrix norm x = torch.randn(25, 25, device=device, dtype=dtype) @@ -1634,10 +1567,10 @@ def gen_error_message(input_size, ord, keepdim, dim=None): self.assertEqual(res.shape, expected.shape, msg=msg) self.assertEqual(res, expected, msg=msg, exact_dtype=False) - res_out = torch.tensor([]).to(device) + res_out = torch.tensor([], device=device, dtype=res.dtype) torch.linalg.norm(x, ord, keepdim=keepdim, out=res_out) self.assertEqual(res_out.shape, expected.shape, msg=msg) - self.assertEqual(res_out.cpu(), expected, msg=msg, exact_dtype=False) + self.assertEqual(res_out, expected, msg=msg) # Test that linal.vector_norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) @@ -1655,18 +1588,17 @@ def test_vector_norm_extreme_values(self, device): result_n = np.linalg.norm(x_n, ord=ord) 
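# --- Hedged sketch (not part of the patch): the tightened matrix-norm contract behind the
# new error strings above. 'fro'/'nuc' need a matrix (or a 2-tuple dim), and matrix_norm
# only accepts +-1, +-2, +-inf, 'fro' and 'nuc'.
import torch

v = torch.randn(10)
m = torch.randn(10, 10)

try:
    torch.linalg.norm(v, ord='fro')        # 1-D input: "must be a matrix or a batch of matrices"
except RuntimeError:
    pass
try:
    torch.linalg.matrix_norm(m, ord=3.5)   # "Order 3.5 not supported"
except RuntimeError:
    pass

torch.linalg.matrix_norm(m, ord='fro', dim=(0, 1))   # valid: dim is always a 2-tuple
torch.linalg.norm(m, ord=float('inf'))               # valid: 2-D input with a matrix order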
self.assertEqual(result, result_n, msg=msg) - @skipMeta # https://github.com/pytorch/pytorch/issues/54082 - @skipCUDAIfNoMagma + @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(torch.float, torch.double) @precisionOverride({torch.float32: 2e-5}) def test_matrix_norm(self, device, dtype): # Test only inputs for which torch.linalg.matrix_norm diverges from torch.linalg.norm - A = make_tensor((2, 2, 2), device, dtype) + A = make_tensor((2, 2, 2), dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm\(\):.*must be a matrix.*'): - torch.linalg.matrix_norm(make_tensor((2,), device, dtype)) - with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm\(\):.*must be a 2-tuple.*'): + with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm:.*must be a matrix.*'): + torch.linalg.matrix_norm(make_tensor((2,), dtype=dtype, device=device)) + with self.assertRaisesRegex(RuntimeError, r'linalg.matrix_norm:.*must be a 2-tuple.*'): torch.linalg.matrix_norm(A, dim=(0,)) with self.assertRaisesRegex(RuntimeError, r'.*not supported.*'): torch.linalg.matrix_norm(A, ord=0) @@ -1738,14 +1670,9 @@ def is_broken_matrix_norm_case(ord, x): def test_norm_vector_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim): msg = f'input.size()={input.size()}, ord={ord}, dim={dim}, keepdim={keepdim}, dtype={dtype}' - should_error = False - if ord is not None and ord < 0: - should_error = True - elif ord == inf: - if dim is None or input.size(dim) == 0: - should_error = True - - if should_error: + if (input.numel() == 0 and + (ord < 0. or ord == inf) and + (dim is None or input.shape[dim] == 0)): with self.assertRaises(RuntimeError): torch.linalg.norm(input, ord, dim, keepdim) else: @@ -1754,7 +1681,7 @@ def run_test_case(input, ord, dim, keepdim): result = torch.linalg.norm(input, ord, dim, keepdim) self.assertEqual(result, result_numpy, msg=msg) - ord_vector = [0, 0.5, 1, 2, 3, inf, -0.5, -1, -2, -3, -inf, None] + ord_vector = [0, 0.5, 1, 2, 3, inf, -0.5, -1, -2, -3, -inf] S = 10 test_cases = [ # input size, dim @@ -2381,7 +2308,7 @@ def test_norm_fro_2_equivalence_old(self, device, dtype): (5, 3, 8, 1, 3, 5)] for input_size in input_sizes: - a = make_tensor(input_size, device, dtype, low=-9, high=9) + a = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) # Try full reduction dim_settings = [None] @@ -2866,7 +2793,6 @@ def test_inv_ex_info_device(self, device, dtype): @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) - @skipCUDAIfRocm def test_inv_ex_singular(self, device, dtype): # if the input matrix is not invertible, info with positive integer is returned A = torch.eye(3, 3, dtype=dtype, device=device) @@ -2894,6 +2820,7 @@ def test_inv_ex_singular(self, device, dtype): @slowTest @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack + @skipCUDAIfRocm @dtypes(*floating_and_complex_types()) @precisionOverride({torch.float32: 2e-3, torch.complex64: 2e-3, torch.float64: 1e-5, torch.complex128: 1e-5}) @@ -2936,7 +2863,7 @@ def run_test_singular_input(batch_dim, n): @skipCPUIfNoLapack @onlyNativeDeviceTypes # TODO: XLA doesn't raise exception @skipCUDAIfRocm - @skipCUDAVersionIn([(11, 3), (11, 5)]) # https://github.com/pytorch/pytorch/issues/57482 + @skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)]) # https://github.com/pytorch/pytorch/issues/57482 @dtypes(*floating_and_complex_types()) def test_inverse_errors_large(self, device, dtype): # Test batched inverse of singular matrices reports 
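# --- Hedged sketch (not part of the patch): the inv_ex contract test_inv_ex_singular relies
# on. A singular input does not raise; failure is reported through the info tensor, while
# torch.linalg.inv on the same input would raise a RuntimeError instead.
import torch

A = torch.eye(3, dtype=torch.float64)
A[-1, -1] = 0                                   # make A singular
res = torch.linalg.inv_ex(A)
assert res.info.item() > 0                      # positive info: factorisation hit a zero pivot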
errors without crashing (gh-51930) @@ -3243,89 +3170,23 @@ def run_test_singular_input(batch_dim, n): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) - def test_old_solve(self, device, dtype): - for (k, n) in zip([2, 3, 5], [3, 5, 7]): - b, A = self.solve_test_helper((n, n), (n, k), device, dtype) - x = torch.solve(b, A)[0] - self.assertEqual(b, np.matmul(A.cpu(), x.cpu())) - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_batched(self, device, dtype): - def solve_batch_helper(A_dims, b_dims): - b, A = self.solve_test_helper(A_dims, b_dims, device, dtype) - x_exp_list = [] - for i in range(b_dims[0]): - x_exp_list.append(torch.solve(b[i], A[i])[0]) - x_exp = torch.stack(x_exp_list) # Stacked output - x_act = torch.solve(b, A)[0] # Actual output - self.assertEqual(x_exp, x_act) # Equality check - Ax = np.matmul(A.cpu(), x_act.cpu()) - self.assertEqual(b, Ax) - - for batchsize in [1, 3, 4]: - solve_batch_helper((batchsize, 5, 5), (batchsize, 5, 10)) - - @slowTest - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_batched_many_batches(self, device, dtype): - for A_dims, b_dims in zip([(256, 256, 5, 5), (3, 3)], [(5, 1), (512, 512, 3, 1)]): - b, A = self.solve_test_helper(A_dims, b_dims, device, dtype) - x, _ = torch.solve(b, A) - Ax = torch.matmul(A, x) - self.assertEqual(Ax, b.expand_as(x)) - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_batched_broadcasting(self, device, dtype): + def test_solve_batched_broadcasting(self, device, dtype): from numpy.linalg import solve - def run_test(A_dims, b_dims): + def run_test(A_dims, B_dims): A_matrix_size = A_dims[-1] A_batch_dims = A_dims[:-2] - b, A = self.solve_test_helper(A_batch_dims + (A_matrix_size, A_matrix_size), b_dims, device, dtype) - x, _ = torch.solve(b, A) - x_exp = solve(A.cpu().numpy(), b.cpu().numpy()) - self.assertEqual(x, x_exp) + B, A = self.solve_test_helper(A_batch_dims + (A_matrix_size, A_matrix_size), B_dims, device, dtype) + actual = torch.linalg.solve(A, B) + expected = solve(A.cpu().numpy(), B.cpu().numpy()) + self.assertEqual(actual, expected) # test against numpy.linalg.solve - run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6)) # no broadcasting - run_test((2, 1, 3, 4, 4), (4, 6)) # broadcasting b + run_test((5, 5), (2, 0, 5, 3)) # broadcasting with 0 batch dim + run_test((2, 0, 5, 5), (5, 3)) # broadcasting with 0 batch dim + run_test((2, 1, 3, 4, 4), (4, 6)) # broadcasting B run_test((4, 4), (2, 1, 3, 4, 2)) # broadcasting A - run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5)) # broadcasting A & b - - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(*floating_and_complex_types()) - def test_old_solve_errors_and_warnings(self, device, dtype): - # dtypes should be safely castable - a = torch.eye(2, dtype=dtype, device=device) - b = torch.randn(2, 1, dtype=dtype, device=device) - out = torch.empty(0, dtype=torch.int, device=device) - lu = torch.empty(0, dtype=dtype, device=device) - with self.assertRaisesRegex(RuntimeError, "but got solution with dtype Int"): - torch.solve(b, a, out=(out, lu)) - - out = torch.empty(0, dtype=dtype, device=device) - lu = torch.empty(0, dtype=torch.int, device=device) - with self.assertRaisesRegex(RuntimeError, "but got lu with dtype Int"): - torch.solve(b, a, out=(out, lu)) - - # device should match - if torch.cuda.is_available(): - wrong_device = 'cpu' if self.device_type != 'cpu' else 'cuda' - out = 
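# --- Hedged sketch (not part of the patch): the broadcasting that test_solve_batched_broadcasting
# now checks through torch.linalg.solve. Note the argument order: linalg.solve(A, B) solves
# A X = B, whereas the removed torch.solve took (B, A).
import torch
import numpy as np

A = torch.randn(4, 4, dtype=torch.float64)              # single matrix
B = torch.randn(2, 1, 3, 4, 2, dtype=torch.float64)     # batch of right-hand-side matrices
X = torch.linalg.solve(A, B)                             # A broadcasts over B's batch dims
assert X.shape == (2, 1, 3, 4, 2)
np.testing.assert_allclose(X.numpy(), np.linalg.solve(A.numpy(), B.numpy()),
                           rtol=1e-7, atol=1e-10)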
torch.empty(0, dtype=dtype, device=wrong_device) - lu = torch.empty_like(a) - with self.assertRaisesRegex(RuntimeError, "tensors to be on the same device"): - torch.solve(b, a, out=(out, lu)) - out = torch.empty(0, dtype=dtype, device=device) - lu = torch.empty_like(a).to(wrong_device) - with self.assertRaisesRegex(RuntimeError, "tensors to be on the same device"): - torch.solve(b, a, out=(out, lu)) + run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5)) # broadcasting A & B @skipCUDAIfNoMagma @skipCPUIfNoLapack @@ -3678,6 +3539,9 @@ def test_matrix_rank_atol_rtol(self, device, dtype): result = torch.linalg.matrix_rank(a, atol=tol_value, rtol=tol_value) self.assertEqual(result, 2) # there are 2 singular values above max(0.81, 1.5*0.81) + # CUDA 11.6 issue failure https://github.com/pytorch/pytorch/issues/75391 + @skipCUDAIf(torch.version.cuda is not None + and torch.version.cuda.split(".") == ["11", "6"], "There's a bug in CUDA 11.6") @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) @@ -3791,7 +3655,7 @@ def test_old_matrix_rank(self, device, dtype): # This tests only the cases where torch.chain_matmul differs from torch.linalg.multi_dot which this is an "alias" for. def test_chain_matmul(self, device, dtype): # chain_matmul accepts a single input tensor while multi_dot does not - t = make_tensor((2, 2), device, dtype) + t = make_tensor((2, 2), dtype=dtype, device=device) self.assertEqual(t, torch.chain_matmul(t)) with self.assertRaisesRegex(RuntimeError, r"chain_matmul\(\): Expected one or more matrices"): torch.chain_matmul() @@ -3799,13 +3663,13 @@ def test_chain_matmul(self, device, dtype): # chain_matmul expects all tensors to be 2D whereas multi_dot allows the first and last tensors to # be either 1D or 2D with self.assertRaisesRegex(RuntimeError, r"Tensor dimension is 1, expected 2 instead"): - torch.chain_matmul(make_tensor(1, device, dtype), make_tensor(1, device, dtype)) + torch.chain_matmul(make_tensor(1, dtype=dtype, device=device), make_tensor(1, dtype=dtype, device=device)) @onlyNativeDeviceTypes @dtypes(torch.double, torch.cdouble) def test_multi_dot(self, device, dtype): def check(*shapes): - tensors = [make_tensor(shape, device, dtype) for shape in shapes] + tensors = [make_tensor(shape, dtype=dtype, device=device) for shape in shapes] np_arrays = [tensor.cpu().numpy() for tensor in tensors] res = torch.linalg.multi_dot(tensors).cpu() ref = torch.from_numpy(np.array(np.linalg.multi_dot(np_arrays))) @@ -3843,7 +3707,7 @@ def check(tensors, out, msg): with self.assertRaisesRegex(RuntimeError, msg): torch.linalg.multi_dot(tensors, out=out) - a = make_tensor(2, device, dtype) + a = make_tensor(2, dtype=dtype, device=device) check([], None, "expected at least 2 tensors") check([a], None, "expected at least 2 tensors") @@ -3852,17 +3716,17 @@ def check(tensors, out, msg): check([a, torch.tensor(1, device=device, dtype=dtype)], None, "the last tensor must be 1D or 2D") check([a, a, a], None, "tensor 1 must be 2D") - check([a, make_tensor((2, 2, 2), device, dtype), a], None, "tensor 1 must be 2D") + check([a, make_tensor((2, 2, 2), dtype=dtype, device=device), a], None, "tensor 1 must be 2D") - check([a, make_tensor(2, device, torch.double)], None, "all tensors must have be the same dtype") + check([a, make_tensor(2, dtype=torch.double, device=device)], None, "all tensors must have be the same dtype") check([a, a], torch.empty(0, device=device, dtype=torch.double), "expected out tensor to have dtype") if self.device_type == 'cuda': - check([a, make_tensor(2, 
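# --- Hedged sketch (not part of the patch): the multi_dot behaviour the checks above target.
# multi_dot chooses an efficient multiplication order and, unlike chain_matmul, accepts 1-D
# first/last operands; chain_matmul requires every operand to be 2-D.
import torch

a = torch.randn(3, dtype=torch.float64)        # treated as a 1 x 3 row vector
B = torch.randn(3, 20, dtype=torch.float64)
C = torch.randn(20, 4, dtype=torch.float64)
d = torch.randn(4, dtype=torch.float64)        # treated as a 4 x 1 column vector

out = torch.linalg.multi_dot([a, B, C, d])
assert out.shape == ()                         # 1-D on both ends -> 0-D result
assert torch.allclose(out, a @ B @ C @ d)

M = torch.randn(2, 2)
assert torch.equal(torch.chain_matmul(M), M)   # a single matrix is allowed and returned as-is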
'cpu', dtype)], None, "all tensors must be on the same device") + check([a, make_tensor(2, dtype=dtype, device="cpu")], None, "all tensors must be on the same device") check([a, a], torch.empty(0, dtype=dtype), "expected out tensor to be on device") - check([a, make_tensor(3, device, dtype)], None, "cannot be multiplied") - check([a, make_tensor((3, 2), device, dtype), a], None, "cannot be multiplied") + check([a, make_tensor(3, dtype=dtype, device=device)], None, "cannot be multiplied") + check([a, make_tensor((3, 2), dtype=dtype, device=device), a], None, "cannot be multiplied") @precisionOverride({torch.float32: 5e-6, torch.complex64: 5e-6}) @skipCUDAIfNoMagma @@ -3955,14 +3819,14 @@ def test_linalg_qr_autograd_errors(self, device, dtype): self.assertEqual(q.shape, (0,)) # empty tensor b = torch.sum(r) with self.assertRaisesRegex(RuntimeError, - "The derivative of qr is not implemented when mode='r'"): + "The derivative of linalg.qr depends on Q"): b.backward() # inp = torch.randn((7, 5), device=device, dtype=dtype, requires_grad=True) q, r = torch.linalg.qr(inp, mode='complete') b = torch.sum(r) with self.assertRaisesRegex(RuntimeError, - "The derivative of qr is not implemented when mode='complete' and nrows > ncols"): + "The QR decomposition is not differentiable when mode='complete' and nrows > ncols"): b.backward() @skipCUDAIfNoMagma @@ -4054,17 +3918,17 @@ def _check_einsum(self, *args, np_args=None): @dtypes(torch.double, torch.cdouble) def test_einsum(self, device, dtype): # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = make_tensor((5,), device, dtype) - y = make_tensor((7,), device, dtype) - A = make_tensor((3, 5), device, dtype) - B = make_tensor((2, 5), device, dtype) - C = make_tensor((2, 3, 5), device, dtype) - D = make_tensor((2, 5, 7), device, dtype) - E = make_tensor((7, 9), device, dtype) - F = make_tensor((2, 3, 3, 5), device, dtype) - G = make_tensor((5, 4, 6), device, dtype) - H = make_tensor((4, 4), device, dtype) - I = make_tensor((2, 3, 2), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) + y = make_tensor((7,), dtype=dtype, device=device) + A = make_tensor((3, 5), dtype=dtype, device=device) + B = make_tensor((2, 5), dtype=dtype, device=device) + C = make_tensor((2, 3, 5), dtype=dtype, device=device) + D = make_tensor((2, 5, 7), dtype=dtype, device=device) + E = make_tensor((7, 9), dtype=dtype, device=device) + F = make_tensor((2, 3, 3, 5), dtype=dtype, device=device) + G = make_tensor((5, 4, 6), dtype=dtype, device=device) + H = make_tensor((4, 4), dtype=dtype, device=device) + I = make_tensor((2, 3, 2), dtype=dtype, device=device) # Vector operations self._check_einsum('i->', x) # sum @@ -4095,20 +3959,20 @@ def test_einsum(self, device, dtype): self._check_einsum("ii", H) # trace self._check_einsum("ii->i", H) # diagonal self._check_einsum('iji->j', I) # non-contiguous trace - self._check_einsum('ngrg...->nrg...', make_tensor((2, 1, 3, 1, 4), device, dtype)) + self._check_einsum('ngrg...->nrg...', make_tensor((2, 1, 3, 1, 4), dtype=dtype, device=device)) # Test ellipsis self._check_einsum("i...->...", H) self._check_einsum("ki,...k->i...", A.t(), B) self._check_einsum("k...,jk->...", A.t(), B) self._check_einsum('...ik, ...j -> ...ij', C, x) - self._check_einsum('Bik,k...j->i...j', C, make_tensor((5, 3), device, dtype)) - self._check_einsum('i...j, ij... 
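# --- Hedged sketch (not part of the patch): the autograd restriction behind the reworded
# message above. With mode='r' only R is returned, the derivative needs Q, so backward raises.
import torch

inp = torch.randn(5, 7, dtype=torch.float64, requires_grad=True)
q, r = torch.linalg.qr(inp, mode='r')           # q is an empty tensor in this mode
try:
    r.sum().backward()
except RuntimeError as e:
    assert "The derivative of linalg.qr depends on Q" in str(e)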
-> ...ij', C, make_tensor((2, 5, 2, 3), device, dtype)) + self._check_einsum('Bik,k...j->i...j', C, make_tensor((5, 3), dtype=dtype, device=device)) + self._check_einsum('i...j, ij... -> ...ij', C, make_tensor((2, 5, 2, 3), dtype=dtype, device=device)) # torch.bilinear with noncontiguous tensors - l = make_tensor((5, 10), device, dtype, noncontiguous=True) - r = make_tensor((5, 20), device, dtype, noncontiguous=True) - w = make_tensor((15, 10, 20), device, dtype) + l = make_tensor((5, 10), dtype=dtype, device=device, noncontiguous=True) + r = make_tensor((5, 20), dtype=dtype, device=device, noncontiguous=True) + w = make_tensor((15, 10, 20), dtype=dtype, device=device) self._check_einsum("bn,anm,bm->ba", l, w, r) # with strided tensors @@ -4116,11 +3980,11 @@ def test_einsum(self, device, dtype): @dtypes(torch.double, torch.cdouble) def test_einsum_sublist_format(self, device, dtype): - x = make_tensor((5,), device, dtype) - y = make_tensor((7,), device, dtype) - A = make_tensor((3, 5), device, dtype) - B = make_tensor((2, 5), device, dtype) - C = make_tensor((2, 1, 3, 1, 4), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) + y = make_tensor((7,), dtype=dtype, device=device) + A = make_tensor((3, 5), dtype=dtype, device=device) + B = make_tensor((2, 5), dtype=dtype, device=device) + C = make_tensor((2, 1, 3, 1, 4), dtype=dtype, device=device) self._check_einsum(x, [0]) self._check_einsum(x, [0], []) @@ -4135,9 +3999,9 @@ def test_einsum_sublist_format(self, device, dtype): self._check_einsum(A.t(), [0, Ellipsis], B, [1, 0], [Ellipsis]) # torch.bilinear with noncontiguous tensors - l = make_tensor((5, 10), device, dtype, noncontiguous=True) - r = make_tensor((5, 20), device, dtype, noncontiguous=True) - w = make_tensor((15, 10, 20), device, dtype) + l = make_tensor((5, 10), dtype=dtype, device=device, noncontiguous=True) + r = make_tensor((5, 20), dtype=dtype, device=device, noncontiguous=True) + w = make_tensor((15, 10, 20), dtype=dtype, device=device) self._check_einsum(l, [40, 41], w, [2, 41, 50], r, [40, 50], [40, 2]) @dtypes(torch.double, torch.cdouble) @@ -4214,7 +4078,7 @@ def test(n=10, # how many tests to generate shape[ell_index:ell_index] = ell_shape labels.insert(ell_index, ...) - operands.append(make_tensor(shape, device, dtype)) + operands.append(make_tensor(shape, dtype=dtype, device=device)) sublists.append(labels) # NumPy has a bug with the sublist format so for now we compare PyTorch sublist @@ -4251,7 +4115,7 @@ def test(n=10, # how many tests to generate def test_einsum_corner_cases(self, device): def check(equation, *operands, expected_output): tensors = [torch.tensor(operand, device=device, dtype=torch.float32) if not isinstance(operand, tuple) - else make_tensor(operand, device, torch.float32) for operand in operands] + else make_tensor(operand, dtype=torch.float32, device=device) for operand in operands] output = torch.einsum(equation, tensors) self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) @@ -4293,8 +4157,8 @@ def check(*args, regex, exception=RuntimeError): with self.assertRaisesRegex(exception, r'einsum\(\):.*' + regex): torch.einsum(*args) - x = make_tensor((2,), device, torch.float32) - y = make_tensor((2, 3), device, torch.float32) + x = make_tensor((2,), dtype=torch.float32, device=device) + y = make_tensor((2, 3), dtype=torch.float32, device=device) check('', [], regex=r'at least one operand', exception=ValueError) check('. 
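# --- Hedged sketch (not part of the patch): the einsum sublist format exercised above.
# Operands are interleaved with lists of integer indices, an optional final list names the
# output subscripts, and Ellipsis plays the role of '...'.
import torch

A = torch.randn(3, 5)
B = torch.randn(2, 5)
eq = torch.einsum('ik,jk->ij', A, B)                   # equation form
sub = torch.einsum(A, [0, 2], B, [1, 2], [0, 1])       # same contraction in sublist form
assert torch.allclose(eq, sub)

C = torch.randn(2, 3, 5)
assert torch.allclose(torch.einsum('k...->...', C),
                      torch.einsum(C, [0, Ellipsis], [Ellipsis]))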
..', [x], regex=r'found \'.\' for operand 0 that is not part of any ellipsis') @@ -4405,7 +4269,7 @@ def test_linalg_solve_triangular(self, device, dtype): @onlyCUDA @skipCUDAIfNoMagma # Magma needed for the PLU decomposition @skipCUDAIfRocm # There is a memory access bug in rocBLAS in the (non-batched) solve_triangular - @skipCUDAVersionIn([(11, 3), (11, 5)]) # Tracked in https://github.com/pytorch/pytorch/issues/70111 + @skipCUDAVersionIn([(11, 3), (11, 5), (11, 6)]) # Tracked in https://github.com/pytorch/pytorch/issues/70111 @dtypes(*floating_and_complex_types()) @precisionOverride({torch.float32: 1e-2, torch.complex64: 1e-2, torch.float64: 1e-8, torch.complex128: 1e-8}) @@ -4643,98 +4507,86 @@ def test_triangular_solve_out_errors_and_warnings(self, device, dtype): self.assertTrue("An output with one or more elements was resized" in str(w[0].message)) self.assertTrue("An output with one or more elements was resized" in str(w[1].message)) - def check_single_matmul(self, x, y, shape): - a = np.array(x, copy=False) - b = np.array(y, copy=False) - expected = np.matmul(a, b) + def check_single_matmul(self, x, y): + + def assertEqual(answer, expected): + if x.dtype.is_floating_point or x.dtype.is_complex: + k = max(x.shape[-1], 1) # Scale the atol with the size of the matrix + self.assertEqual(answer, expected, + msg=f"{x.shape} x {y.shape} = {answer.shape}", + atol=k * 5e-5, + rtol=1e-4) + else: + self.assertEqual(answer, expected, msg=f"{x.shape} x {y.shape} = {answer.shape}") + + # test x @ y + expected = np.matmul(x.cpu(), y.cpu()) ans = torch.matmul(x, y) self.assertTrue(ans.is_contiguous()) - self.assertTrue(np.array_equal(ans, expected)) + assertEqual(ans, expected) - out = torch.zeros(*shape, dtype=torch.int64).to(x.device) + # test out + out = torch.empty_like(ans) ans = torch.matmul(x, y, out=out) self.assertIs(ans, out) self.assertTrue(ans.is_contiguous()) - self.assertTrue(np.array_equal(ans, expected)) + assertEqual(ans, expected) - # TODO: update to run on CUDA, too - @onlyCPU - def test_matmul_small_brute_force_1d_Nd(self, device): - # Issue #20452: range(0, 10) does not work. - n = 1 - for m in range(1, 8): - for p in range(1, 8): - for o in range(1, 5): - # 1d, 3d, inner dimensions C - x = torch.arange(m, device=device) - y = torch.arange(o * m * p, device=device).reshape(o, m, p) - self.check_single_matmul(x, y, (o, n, p)) - - # 1d, 3d, inner dimensions Fortran - x = torch.arange(m, device=device) - y = torch.arange(o * p * m, device=device).reshape(o, p, m).mT - self.check_single_matmul(x, y, (o, n, p)) - - # 1d, 3d, inner dimensions non-contiguous - x = torch.arange(2 * m, device=device)[::2] - y = torch.arange(o * m * 2 * p, device=device).reshape(o, m, 2 * p)[:, :, ::2] - self.check_single_matmul(x, y, (o, n, p)) - - for r in range(1, 5): - # 1d, 4d, inner dimensions C - x = torch.arange(m) - y = torch.arange(r * o * m * p, device=device).reshape(r, o, m, p) - self.check_single_matmul(x, y, (r, o, n, p)) - - # 1d, 4d, inner dimensions Fortran - x = torch.arange(m) - y = torch.arange(r * o * p * m, device=device).reshape(r, o, p, m).mT - self.check_single_matmul(x, y, (r, o, n, p)) - - # 1d, 4d, inner dimensions non-contiguous - x = torch.arange(2 * m, device=device)[::2] - y = torch.arange(r * o * m * 2 * p, device=device).reshape(r, o, m, 2 * p)[:, :, :, ::2] - self.check_single_matmul(x, y, (r, o, n, p)) - - # TODO: update to run on CUDA, too - @onlyCPU - def test_matmul_small_brute_force_2d_Nd(self, device): - # Issue #20452: range(0, 10) does not work. 
- for n in range(1, 5): - for m in range(1, 5): - for p in range(1, 5): - for o in range(1, 3): - # 2d, 3d, inner dimensions C - x = torch.arange(n * m, device=device).reshape(n, m) - y = torch.arange(o * m * p, device=device).reshape(o, m, p) - self.check_single_matmul(x, y, (o, n, p)) - - # 2d, 3d, inner dimensions Fortran - x = torch.arange(m * n, device=device).reshape(m, n).mT - y = torch.arange(o * p * m, device=device).reshape(o, p, m).mT - self.check_single_matmul(x, y, (o, n, p)) - - # 2d, 3d, inner dimensions non-contiguous - x = torch.arange(n * 2 * m, device=device).reshape(n, 2 * m)[:, ::2] - y = torch.arange(o * m * 2 * p, device=device).reshape(o, m, 2 * p)[:, :, ::2] - self.check_single_matmul(x, y, (o, n, p)) - - for r in range(1, 2): - # 2d, 4d, inner dimensions C - x = torch.arange(n * m, device=device).reshape(n, m) - y = torch.arange(r * o * m * p, device=device).reshape(r, o, m, p) - self.check_single_matmul(x, y, (r, o, n, p)) - - # 2d, 4d, inner dimensions Fortran - x = torch.arange(m * n, device=device).reshape(m, n).mT - y = torch.arange(r * o * p * m, device=device).reshape(r, o, p, m).mT - self.check_single_matmul(x, y, (r, o, n, p)) - - # 2d, 4d, inner dimensions non-contiguous - x = torch.arange(n * 2 * m, device=device).reshape(n, 2 * m)[:, ::2] - y = torch.arange(r * o * m * 2 * p, device=device).reshape(r, o, m, 2 * p)[:, :, :, ::2] - self.check_single_matmul(x, y, (r, o, n, p)) + def gen_sizes_matmul(self, x_dim, y_dim=4, matrix_size=4, batch_size=3): + """ + Generates sequences of tuples (x, y) of with size(x) = x_dim and + size(y) <= y_dim that are compatible wrt. matmul + """ + assert x_dim >= 1 + assert y_dim >= 2 + x = x_dim + for y in range(1, y_dim + 1): + for batch, mn in product(product(range(batch_size), repeat=max(x - 2, y - 2, 0)), + product(range(matrix_size), repeat=min(y, 2))): + if x == 1: + size_x = mn[:1] + size_y = batch + mn + yield size_x, size_y + else: + for k in range(matrix_size): + size_x = (k,) + mn[:1] + if x > 2: + size_x = batch[-(x - 2):] + size_x + size_y = mn + if y > 2: + size_y = batch[-(y - 2):] + size_y + yield size_x, size_y + + @dtypesIfCUDA(torch.float, torch.complex64) # Integer matmul just supported on CPU + @dtypes(torch.int64, torch.float, torch.complex64) + def test_matmul_small_brute_force_1d_Nd(self, device, dtype): + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for (size_x, size_y), nctg_x, nctg_y in product(self.gen_sizes_matmul(1), (True, False), (True, False)): + x = make_arg(size_x, noncontiguous=nctg_x) + y = make_arg(size_y, noncontiguous=nctg_y) + self.check_single_matmul(x, y) + + @dtypesIfCUDA(torch.float, torch.complex64) # Integer matmul just supported on CPU + @dtypes(torch.int64, torch.float, torch.complex64) + def test_matmul_small_brute_force_2d_Nd(self, device, dtype): + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for (size_x, size_y), nctg_x, nctg_y in product(self.gen_sizes_matmul(2), (True, False), (True, False)): + x = make_arg(size_x, noncontiguous=nctg_x) + y = make_arg(size_y, noncontiguous=nctg_y) + self.check_single_matmul(x, y) + + @dtypesIfCUDA(torch.float, torch.complex64) # Integer matmul just supported on CPU + @dtypes(torch.int64, torch.float, torch.complex64) + def test_matmul_small_brute_force_3d_Nd(self, device, dtype): + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for (size_x, size_y), nctg_x, nctg_y in product(self.gen_sizes_matmul(3), (True, False), (True, False)): + x = make_arg(size_x, noncontiguous=nctg_x) + y = 
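# --- Hedged sketch (not part of the patch): the matmul shape rules the brute-force
# generators above sweep. 1-D operands get a unit dimension added for the product and
# squeezed away afterwards, and batch dimensions broadcast.
import torch
import numpy as np

x = torch.arange(4, dtype=torch.float64)        # 1-D, shape (4,)
y = torch.randn(3, 4, 5, dtype=torch.float64)   # batched matrix
out = torch.matmul(x, y)                        # x acts as (1, 4); the unit dim is squeezed out
assert out.shape == (3, 5)
np.testing.assert_allclose(out.numpy(), np.matmul(x.numpy(), y.numpy()))

A = torch.randn(2, 1, 3, 4)
B = torch.randn(5, 4, 6)
assert torch.matmul(A, B).shape == (2, 5, 3, 6)   # batch dims (2, 1) and (5,) broadcast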
make_arg(size_y, noncontiguous=nctg_y) + self.check_single_matmul(x, y) def test_linear_algebra_scalar_raises(self, device) -> None: m = torch.randn(5, 5, device=device) @@ -5050,9 +4902,40 @@ def call_torch_fn(*args, **kwargs): A_LU, pivots = fn(torch.lu, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) - @dtypesIfCUDA(torch.cfloat, torch.cdouble, - *get_all_fp_dtypes(include_half=not CUDA9, include_bfloat16=(CUDA11OrLater and SM53OrLater))) - @dtypes(*(set(get_all_dtypes()) - {torch.half, torch.bool})) + @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, + torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if not CUDA9 else [], + *[torch.bfloat16] if CUDA11OrLater and SM53OrLater else [] + )) + @dtypes(*all_types_and_complex_and(torch.bfloat16)) + def test_corner_cases_of_cublasltmatmul(self, device, dtype): + # common case + M = torch.randn(128, device=device).to(dtype) + m1 = torch.randn(2048, 2400, device=device).to(dtype) + m2 = torch.randn(128, 2400, device=device).to(dtype) + torch.nn.functional.linear(m1, m2, M) + # Ntrans_B has ld >> rows + m1 = torch.rand([128, 2400]).to(dtype).to(device).t() + m2 = torch.rand([2048, 25272]).to(dtype).to(device).t()[21940:24340] + M = torch.rand([128]).to(dtype).to(device) + torch.addmm(M, m2.t(), m1) + # trans_A has ld >> rows + m1 = torch.rand([128, 25272]).to(dtype).to(device)[:, 21940:24340].t() + m2 = torch.randn(2048, 2400, device=device).to(dtype) + M = torch.rand([128]).to(dtype).to(device) + torch.addmm(M, m2, m1) + # large tensor dim > 65535 + M = torch.randn(16, device=device).to(dtype) + m1 = torch.randn(32, 131071 , device=device).to(dtype) + m2 = torch.randn(16, 131071, device=device).to(dtype) + torch.nn.functional.linear(m1, m2, M) + + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if not CUDA9 else [], + *[torch.bfloat16] if CUDA11OrLater and SM53OrLater else [] + )) + @dtypes(*all_types_and_complex_and(torch.bfloat16)) def test_blas_alpha_beta_empty(self, device, dtype): # This test is disabled on CUDA 9 due to: # See: https://github.com/pytorch/pytorch/issues/31006 @@ -5088,7 +4971,7 @@ def test_blas_alpha_beta_empty(self, device, dtype): self.assertEqual(torch.full((2, 3), beta * value, dtype=dtype, device=device), torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) - @dtypes(*(get_all_complex_dtypes() + get_all_fp_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_blas_nan_out(self, device, dtype): # These functions should work correctly with NaN filled outputs, # but need special handling, see [NOTE: cpu_zero] @@ -5249,8 +5132,11 @@ def test_householder_product_errors_and_warnings(self, device): @skipCUDAIfNoMagmaAndNoCusolver @skipCPUIfNoLapack @dtypes(*floating_and_complex_types()) - def test_linalg_lu_factor_and_lu(self, device, dtype): - # Tests lu, linalg.lu_factor and linalg.lu_factor_ex + def test_linalg_lu_factor_and_lu_and_lu_unpack(self, device, dtype): + # Tests torch.lu + # torch.linalg.lu_factor + # torch.linalg.lu_factor_ex + # torch.lu_unpack from torch.testing._internal.common_utils import random_matrix def run_test(A, pivot, singular, fn): @@ -5273,9 +5159,14 @@ def run_test(A, pivot, singular, fn): if not pivot: self.assertEqual(pivots, torch.arange(1, 1 + k, device=device, dtype=torch.int32).expand(batch + (k, ))) - P, L, U = torch.lu_unpack(LU, pivots) + P, L, U = torch.lu_unpack(LU, pivots, 
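# --- Hedged sketch (not part of the patch): the linear/addmm correspondence the cuBLASLt
# corner cases above go through: linear(x, W, b) = x @ W.T + b = addmm(b, x, W.t()).
import torch

x = torch.randn(8, 16)     # (batch, in_features)
W = torch.randn(4, 16)     # (out_features, in_features)
b = torch.randn(4)

out_linear = torch.nn.functional.linear(x, W, b)
out_addmm = torch.addmm(b, x, W.t())       # the 1-D bias broadcasts against the (8, 4) product
assert out_linear.shape == (8, 4)
assert torch.allclose(out_linear, out_addmm, atol=1e-5)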
unpack_pivots=pivot) - self.assertEqual(P @ L @ U, A) + self.assertEqual(P @ L @ U if pivot else L @ U, A) + + PLU = torch.linalg.lu(A, pivot=pivot) + self.assertEqual(P, PLU.P) + self.assertEqual(L, PLU.L) + self.assertEqual(U, PLU.U) sizes = ((3, 3), (5, 5), (4, 2), (3, 4), (0, 0), (0, 1), (1, 0)) batches = ((0,), (2,), (3,), (1, 0), (3, 5)) @@ -5306,39 +5197,10 @@ def run_test(A, pivot, singular, fn): if self.device_type == 'cpu': # Error checking, no pivoting variant on CPU - with self.assertRaisesRegex(RuntimeError, 'LU without pivoting is not implemented on the CPU'): - torch.lu(torch.empty(1, 2, 2), pivot=False) - - with self.assertRaisesRegex(RuntimeError, 'LU without pivoting is not implemented on the CPU'): - torch.linalg.lu_factor(torch.empty(1, 2, 2), pivot=False) - - @skipCPUIfNoLapack - @skipCUDAIfNoMagma - @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) - @skipCUDAIfRocm - @precisionOverride({torch.float: 1e-3}) - def test_lu_unpack(self, device, dtype): - def run_test(pivot): - for shape in ((3, 3), (5, 3, 3), (7, 3, 5, 5), (7, 5, 3, 3, 3)): - a = torch.randn(*shape, dtype=dtype, device=device) - a_lu, p = torch.lu(a, pivot=pivot) - p_ref, l_ref, u_ref = torch.lu_unpack(a_lu, p) - self.assertEqual(p_ref.matmul(l_ref.matmul(u_ref)), a) - for shape in ((3, 3), (5, 3, 3), (7, 3, 5, 5), (7, 5, 3, 3, 3), - (3, 5), (5, 3), (3, 3, 5), (3, 5, 3), - (7, 5, 3, 5, 3), (7, 5, 3, 3, 5), - # empty tensors - (0, 0), (0, 0, 0), (0, 3, 3) - ): - a = make_tensor(shape, dtype=dtype, device=device, low=-0.1, high=+0.1) - a_lu, p = torch.lu(a, pivot=pivot) - p_ref, l_ref, u_ref = torch.lu_unpack(a_lu, p) - self.assertEqual(p_ref.matmul(l_ref.matmul(u_ref)), a) - - run_test(True) - - if self.device_type == 'cuda': - run_test(False) + fns = [torch.lu, torch.linalg.lu_factor, torch.linalg.lu_factor_ex, torch.linalg.lu] + for f in fns: + with self.assertRaisesRegex(RuntimeError, 'LU without pivoting is not implemented on the CPU'): + f(torch.empty(1, 2, 2), pivot=False) @skipCPUIfNoLapack @skipCUDAIfNoMagma @@ -5349,21 +5211,18 @@ def test_lu_unpack_check_input(self, device, dtype): with self.assertRaisesRegex(RuntimeError, "torch.int32 dtype"): torch.lu_unpack(lu_data, lu_pivots.long()) - with self.assertRaisesRegex(RuntimeError, "contiguous tensor"): - torch.lu_unpack(lu_data, lu_pivots.mT) # check that onces flags are unset, Nones are returned p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_data=False) - self.assertTrue((l == u) and l is None) + self.assertTrue(l.numel() == 0 and u.numel() == 0) p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_pivots=False) - self.assertTrue(p is None) + self.assertTrue(p.numel() == 0) p, l, u = torch.lu_unpack(lu_data, lu_pivots, unpack_data=False, unpack_pivots=False) - self.assertTrue((p == l == u) and p is None) + self.assertTrue(p.numel() == 0 and l.numel() == 0 and u.numel() == 0) @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) - @skipCUDAIfRocm def test_lobpcg_basic(self, device, dtype): self._test_lobpcg_method(device, dtype, 'basic') @@ -5674,7 +5533,7 @@ def tracker(worker): ---(input size: {:4}, eigenpairs:{:2}, units: relative error, maxiter={:4})--- '''.format(tol, eq_err, eq_err_general, iters1, eq_err_scipy, eq_err_general_scipy, iters2, m, k, niter)) - def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False): + def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False, activation=None): dtype = t.dtype numpy_dtype = dtype if dtype in {torch.bfloat16}: 
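# --- Hedged sketch (not part of the patch): the factorisation round trip the combined test
# above checks. Assumption: torch.linalg.lu is available (it is new at the time of this
# patch); lu_factor and lu_unpack are the stable pieces.
import torch

A = torch.randn(5, 5, dtype=torch.float64)

LU, pivots = torch.linalg.lu_factor(A)      # packed factors + pivots
P, L, U = torch.lu_unpack(LU, pivots)
assert torch.allclose(P @ L @ U, A)

PLU = torch.linalg.lu(A)                    # named tuple with fields P, L, U
assert torch.allclose(PLU.P, P)
assert torch.allclose(PLU.L, L) and torch.allclose(PLU.U, U)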
@@ -5693,15 +5552,19 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out= res3 = alpha * (m.to(numpy_dtype).cpu().numpy() @ v.to(numpy_dtype).cpu().numpy()) if beta != 0: res3 += (beta * t).to(numpy_dtype).cpu().numpy() + if activation == "relu": + res3 = res3 * (res3 > 0) + else: + assert activation is None, f"unsupported activation {activation}" res3 = torch.from_numpy(res3).to(dtype) self.assertEqual(res1, res2) self.assertEqual(res1, res3) @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)), - include_half=(not TEST_WITH_ROCM))) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else [], + *[torch.half])) @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble) def test_addmv(self, device, dtype): # have to use torch.randn(...).to(bfloat16) instead of @@ -5736,7 +5599,8 @@ def test_addmv(self, device, dtype): for m, v in itertools.product(ms, vs): self._test_addmm_addmv(torch.addmv, t, m, v, beta=0) - @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) + @dtypesIfCUDA(*floating_types_and(*[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and + SM53OrLater) else [])) @dtypes(torch.float, torch.double) def test_addmv_rowmajor_colmajor_incx_incy_lda(self, device, dtype): # tests (o, s)*(s). o is output size, s is summed size. @@ -5765,29 +5629,23 @@ def _test(row_major, incx, incy, lda_tail): for row_major, incx, incy, lda_tail in itertools.product((False, True), (1, 2), (1, 2), (0, 1)): _test(row_major, incx, incy, lda_tail) - @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, - torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) - @dtypes(*get_all_complex_dtypes(), *get_all_fp_dtypes()) - @tf32_on_and_off(0.05) - def test_addmm(self, device, dtype): + def _test_addmm_impl(self, func, activation, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - self._test_addmm_addmv(torch.addmm, M, m1, m2) + self._test_addmm_addmv(func, M, m1, m2, activation=activation) # Test 0-strided M = torch.randn(10, 1, device=device).to(dtype).expand(10, 25) m1 = torch.randn(10, 1, device=device).to(dtype).expand(10, 50) m2 = torch.randn(50, 25, device=device).to(dtype) - self._test_addmm_addmv(torch.addmm, M, m1, m2) + self._test_addmm_addmv(func, M, m1, m2, activation=activation) # Test beta=0, M=nan M = torch.full((10, 25), math.nan, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - self._test_addmm_addmv(torch.addmm, M, m1, m2, beta=0) + self._test_addmm_addmv(func, M, m1, m2, beta=0, activation=activation) # Test transpose for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): @@ -5799,10 +5657,29 @@ def maybe_transpose(cond, m): M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) - 
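# --- Hedged sketch (not part of the patch): what the new activation branch verifies. The
# fused op torch._addmm_activation(M, m1, m2) is compared against relu(M + m1 @ m2).
# Assumption: _addmm_activation is a private operator whose default activation is ReLU here;
# its signature and availability may change between releases.
import torch

M = torch.randn(10, 25)
m1 = torch.randn(10, 50)
m2 = torch.randn(50, 25)

ref = torch.relu(torch.addmm(M, m1, m2))        # beta = alpha = 1
fused = torch._addmm_activation(M, m1, m2)
assert torch.allclose(ref, fused, atol=1e-4)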
self._test_addmm_addmv(torch.addmm, M, m1, m2, transpose_out=t4) + self._test_addmm_addmv(func, M, m1, m2, transpose_out=t4, activation=activation) + + @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, + torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) + @dtypesIfMPS(torch.float32) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else [])) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) + @tf32_on_and_off(0.05) + def test_addmm(self, device, dtype): + self._test_addmm_impl(torch.addmm, None, device, dtype) + + @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, + torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) + @dtypesIfCUDA(*floating_types_and( + *[torch.bfloat16] if TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) else [])) + @dtypes(*floating_types_and(torch.bfloat16)) + @tf32_on_and_off(0.05) + def test_addmm_activation(self, device, dtype): + self._test_addmm_impl(torch._addmm_activation, "relu", device, dtype) @dtypes(torch.float, torch.double) - @dtypesIfCUDA(*([torch.float, torch.double] + get_all_complex_dtypes())) + @dtypesIfCUDA(*floating_and_complex_types()) @tf32_on_and_off(0.005) def test_addmm_sizes(self, device, dtype): for m in [0, 1, 25]: @@ -5855,7 +5732,8 @@ def test_matmul_45724(self, device): @slowTest @onlyNativeDeviceTypes - @dtypes(torch.float32, torch.float64, torch.bfloat16, torch.int32, torch.int64, torch.cfloat, torch.cdouble) + # bfloat16 doesn't have sufficient precision to pass this test + @dtypes(torch.float32, torch.float64, torch.int32, torch.int64, torch.cfloat, torch.cdouble) @dtypesIfCUDA(torch.float32, torch.float64, torch.cfloat, torch.cdouble) @tf32_on_and_off(0.01) def test_mm(self, device, dtype): @@ -5998,9 +5876,8 @@ def test_strided_mm_bmm(self, device, dtype): self.compare_with_numpy(torch_fn, np_fn, sx[0]) @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) - @skipCUDAIf(torch.version.cuda == "10.1", "flaky on CUDA 10.1") @onlyNativeDeviceTypes - @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) @tf32_on_and_off(0.05) def test_bmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6032,8 +5909,8 @@ def invert_perm(p): def generate_inputs(num_batches): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-0.1, high=0.1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-0.1, high=0.1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-0.1, high=0.1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-0.1, high=0.1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 @@ -6041,8 +5918,8 @@ def generate_inputs(num_batches): for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-0.1, high=0.1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-0.1, high=0.1).expand(num_batches, N, O) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-0.1, 
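# --- Hedged sketch (not part of the patch): the kind of inputs generate_inputs builds for
# test_bmm. bmm itself does not broadcast, so "broadcast" cases are created by expanding
# size-1 dims first, giving stride-0, non-contiguous operands.
import torch

num_batches, M, N, O = 3, 2, 4, 5
b1 = torch.randn(1, M, 1).expand(num_batches, M, N)    # expanded, stride-0 operand
b2 = torch.randn(num_batches, N, O)
out = torch.bmm(b1, b2)
assert out.shape == (num_batches, M, O)
ref = torch.stack([b1[i] @ b2[i] for i in range(num_batches)])
assert torch.allclose(out, ref, atol=1e-5)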
high=0.1).expand(num_batches, M, N) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-0.1, high=0.1).expand(num_batches, N, O) yield b1, b2 # zero-sized tensors for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): @@ -6112,7 +5989,7 @@ def _test_addbmm_baddbmm(self, func, b1, b2, ref, out_tensor): @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05}) @onlyNativeDeviceTypes - @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) @tf32_on_and_off(0.05) def test_addbmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6132,9 +6009,9 @@ def test_addbmm(self, device, dtype): is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) if not is_supported: - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) - t = make_tensor((M, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) + t = make_tensor((M, O), dtype=dtype, device=device, low=-1, high=1) self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", lambda: torch.addbmm(t, b1, b2)) return @@ -6148,8 +6025,8 @@ def generate_tensor(): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): for perm3 in itertools.permutations((0, 1)): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) * 0.1 - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) * 0.1 + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) * 0.1 + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) * 0.1 b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) ref = torch.from_numpy( @@ -6161,8 +6038,8 @@ def generate_tensor(): for s1, s2, s3, s4, s5, s6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if s1 else 1, M if s2 else 1, N if s3 else 1) shape2 = (num_batches if s4 else 1, N if s5 else 1, O if s6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) * 0.1 - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) * 0.1 + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, M, N) * 0.1 + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, N, O) * 0.1 ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() ).to(device=device, dtype=dtype).sum(0) @@ -6172,8 +6049,8 @@ def generate_tensor(): for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1) * 0.1 - b2 = make_tensor(shape2, device, dtype, low=-1, high=1) * 0.1 + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1) * 0.1 + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1) * 0.1 ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy() ).to(device=device, dtype=dtype).sum(0) @@ -6185,7 +6062,7 @@ def 
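# --- Hedged sketch (not part of the patch): the reference that test_addbmm compares against.
# addbmm reduces the whole batch of products into a single matrix:
#   addbmm(t, b1, b2) = beta * t + alpha * sum_i b1[i] @ b2[i]
import torch

num_batches, M, N, O = 3, 4, 5, 6
t = torch.randn(M, O)
b1 = torch.randn(num_batches, M, N)
b2 = torch.randn(num_batches, N, O)

ref = t + (b1 @ b2).sum(dim=0)                 # beta = alpha = 1
assert torch.allclose(torch.addbmm(t, b1, b2), ref, atol=1e-5)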
generate_tensor(): @precisionOverride({torch.half: 0.1, torch.bfloat16: 0.5}) @onlyNativeDeviceTypes - @dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) @tf32_on_and_off(0.05) def test_baddbmm(self, device, dtype): if self.device_type == 'cuda' and dtype is torch.bfloat16 and CUDA11OrLater and not SM53OrLater: @@ -6202,9 +6079,9 @@ def test_baddbmm(self, device, dtype): is_supported = TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater) if not is_supported: - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) - t = make_tensor((num_batches, M, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) + t = make_tensor((num_batches, M, O), dtype=dtype, device=device, low=-1, high=1) self.assertRaisesRegex(RuntimeError, "type|Type|not implemented|CUBLAS_STATUS_NOT_SUPPORTED", lambda: torch.baddbmm(t, b1, b2)) return @@ -6217,8 +6094,8 @@ def generate_tensor(): numpy_dtype = dtype if dtype != torch.bfloat16 else torch.float32 # transposed tensors for perm1, perm2, perm3 in itertools.product(itertools.permutations((0, 1, 2)), repeat=3): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) ref = torch.from_numpy( @@ -6230,8 +6107,8 @@ def generate_tensor(): for s1, s2, s3, s4, s5, s6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if s1 else 1, M if s2 else 1, N if s3 else 1) shape2 = (num_batches if s4 else 1, N if s5 else 1, O if s6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, M, N) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, N, O) ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) out_tensor = torch.zeros_like(ref) @@ -6240,8 +6117,8 @@ def generate_tensor(): for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): shape1 = (num_batches if z1 else 0, M if z2 else 0, N if z3 else 0) shape2 = (num_batches if z1 else 0, N if z3 else 0, O if z4 else 0) - b1 = make_tensor(shape1, device, dtype, low=-2, high=2) - b2 = make_tensor(shape2, device, dtype, low=-2, high=2) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-2, high=2) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-2, high=2) ref = torch.from_numpy( b1.to(numpy_dtype).cpu().numpy() @ b2.to(numpy_dtype).cpu().numpy()).to(device=device, dtype=dtype) out_tensor = torch.zeros_like(ref) @@ -6260,12 +6137,10 @@ def test_solve_methods_arg_device(self, device): b = torch.randn(3, 1, device=b_device) A = torch.randn(3, 3, device=A_device) - # solve and cholesky_solve goes through generic backend dispatch and hit kernel specific device check first + # cholesky_solve goes through generic 
backend dispatch and hit kernel specific device check first # triangular_solve goes through specific backend dispatch (CPU/CUDA) and hit auto-generated device check first generic_backend_dispatch_err_str = "Expected b and A to be on the same device" specific_backend_dispatch_err_str = "Expected all tensors to be on the same device" - with self.assertRaisesRegex(RuntimeError, generic_backend_dispatch_err_str): - torch.solve(b, A) with self.assertRaisesRegex(RuntimeError, generic_backend_dispatch_err_str): torch.cholesky_solve(b, A) @@ -6324,7 +6199,7 @@ def run_test(M): @dtypes(torch.double, torch.cdouble) def test_matrix_power_non_negative(self, device, dtype): def check(*size): - t = make_tensor(size, device, dtype) + t = make_tensor(size, dtype=dtype, device=device) for n in range(8): res = torch.linalg.matrix_power(t, n) ref = np.linalg.matrix_power(t.cpu().numpy(), n) @@ -7271,6 +7146,7 @@ def lu_solve_batch_test_helper(A_dims, b_dims, pivot): if self.device_type == 'cuda': sub_test(False) + @skipCUDAIfRocm # ROCm: test was exceptionally slow, even for slow tests. Skip until triage. @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack @@ -7764,6 +7640,104 @@ def test_tensordot(self, device): an = torch.from_numpy(np.tensordot(np.zeros((), dtype=np.float32), np.zeros((), dtype=np.float32), 0)) self.assertEqual(a, an) + @skipCUDAIfNoCusolver + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @skipCUDAIfRocm + @dtypes(*floating_and_complex_types()) + def test_ldl_factor(self, device, dtype): + from torch.testing._internal.common_utils import random_hermitian_pd_matrix + + def run_test(shape, batch, hermitian): + A = random_hermitian_pd_matrix(shape, *batch, dtype=dtype, device=device) + actual_factors, actual_pivots, info = torch.linalg.ldl_factor_ex(A, hermitian=hermitian) + actual_L = torch.tril(actual_factors, diagonal=-1) + actual_L.diagonal(0, -2, -1).fill_(1.0) + + # This test is designed only for inputs with 1x1 block diagonal matrix D. + # That is for positive definite input matrices, the pivots tensor is always > 0. + # If negative pivots are encountered, it means that the input matrix is not positive definite. + # And matrix D is a 2x2 block diagonal matrix. + self.assertTrue((actual_pivots > 0).all()) + + # Construct a 1x1 block diagonal matrix D from factors. 
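                # Editorial sketch (not executed by this test) of the property described
                # above, using the public torch.linalg.ldl_factor_ex API on a small
                # positive definite matrix: every pivot is a 1x1 block, so D is plain
                # diagonal and L @ D @ L^T recovers A.
                #
                #     A2 = torch.tensor([[4., 2.], [2., 3.]])
                #     LD, piv, _ = torch.linalg.ldl_factor_ex(A2)
                #     L2 = torch.tril(LD, -1)
                #     L2.diagonal().fill_(1.0)               # unit lower-triangular factor
                #     D2 = torch.diag_embed(LD.diagonal())   # 1x1 pivots -> diagonal D
                #     assert (piv > 0).all()
                #     torch.testing.assert_close(L2 @ D2 @ L2.T, A2)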
+ actual_D = torch.diag_embed(actual_factors.diagonal(0, -2, -1)) + + def T(x): + return x.mH if hermitian else x.mT + A_reconstructed = actual_L @ actual_D @ T(actual_L) + + def symmetric(A): + return A.tril() + A.tril(-1).mT + + self.assertEqual(symmetric(A) if not hermitian else A, A_reconstructed) + + # Now test against SciPy implementation + if TEST_SCIPY: + from scipy.linalg import ldl as scipy_ldl + A_np = A.cpu().numpy() + np_dtype = A_np.dtype + scipy_ldl_batched = np.vectorize( + lambda x: scipy_ldl(x, hermitian=hermitian, lower=True), + otypes=[np_dtype, np_dtype, np.dtype('int64')], + signature='(m,m)->(m,m),(m,m),(m)') + + expected = scipy_ldl_batched(A_np) + expected_L, expected_D, expected_pivots = expected + + if expected_pivots.ndim > 1: + permuted_expected_L = np.stack( + [expected_L[i][expected_pivots[i], :] for i in range(expected_pivots.shape[0])] + ) + else: + permuted_expected_L = expected_L[expected_pivots, :] + self.assertEqual(actual_L, permuted_expected_L) + self.assertEqual(actual_D, expected_D) + else: + self.assertEqual(actual_factors.shape, A.shape) + self.assertEqual(actual_pivots.shape, A.shape[:-1]) + self.assertEqual(info.shape, A.shape[:-2]) + + # hermitian=True for complex inputs on CUDA is supported only with MAGMA 2.5.4+ + magma_254_available = self.device_type == 'cuda' and _get_magma_version() >= (2, 5, 4) + hermitians = (True, False) if dtype.is_complex and (self.device_type == 'cpu' or magma_254_available) else (False,) + + shapes = (5,) + batches = ((), (4,),) + for shape, batch, hermitian in itertools.product(shapes, batches, hermitians): + run_test(shape, batch, hermitian) + + @skipCUDAIfNoCusolver + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @skipCUDAIfRocm + @skipCUDAIf(_get_torch_cuda_version() < (11, 4), "not available before CUDA 11.3.1") + @dtypes(*floating_and_complex_types()) + def test_ldl_solve(self, device, dtype): + from torch.testing._internal.common_utils import random_hermitian_pd_matrix + + def run_test(shape, batch, nrhs, hermitian): + A = random_hermitian_pd_matrix(shape, *batch, dtype=dtype, device=device) + B = make_tensor((*A.shape[:-1], nrhs), dtype=dtype, device=device) + factors, pivots, info = torch.linalg.ldl_factor_ex(A, hermitian=hermitian) + X = torch.linalg.ldl_solve(factors, pivots, B, hermitian=hermitian) + + def symmetric(A): + return A.tril() + A.tril(-1).mT + + # verify A @ X == B + expected_B = symmetric(A) @ X if not hermitian else A @ X + self.assertEqual(B, expected_B) + + # hermitian=True is not supported on CUDA yet + hermitians = (True, False) if dtype.is_complex and self.device_type == 'cpu' else (False,) + + shapes = (5,) + batches = ((), (4,), (2, 2)) + nrhss = (1, 7) + for shape, batch, nrhs, hermitian in itertools.product(shapes, batches, nrhss, hermitians): + run_test(shape, batch, nrhs, hermitian) + @onlyCUDA @skipCUDAIfNoMagma @skipCUDAIfNoCusolver diff --git a/test/test_logging.py b/test/test_logging.py index 4bb057fd157a..01fdd3f8edd8 100644 --- a/test/test_logging.py +++ b/test/test_logging.py @@ -12,10 +12,10 @@ def testApiUsage(self): subprocess """ s = TestCase.runWithPytorchAPIUsageStderr("import torch") - self.assertRegexpMatches(s, "PYTORCH_API_USAGE.*import") + self.assertRegex(s, "PYTORCH_API_USAGE.*import") # import the shared library directly - it triggers static init but doesn't call anything s = TestCase.runWithPytorchAPIUsageStderr("from ctypes import CDLL; CDLL('{}')".format(torch._C.__file__)) - self.assertNotRegexpMatches(s, "PYTORCH_API_USAGE") + self.assertNotRegex(s, 
"PYTORCH_API_USAGE") if __name__ == '__main__': diff --git a/test/test_masked.py b/test/test_masked.py index 24593d156fd2..4b8fab87318f 100644 --- a/test/test_masked.py +++ b/test/test_masked.py @@ -6,13 +6,15 @@ import itertools import torch from typing import List, Any +from functools import wraps +import unittest from torch.testing._internal.common_utils import \ - (TestCase, suppress_warnings) + (TestCase, parametrize, suppress_warnings, _TestParametrizer, run_tests) from torch.testing._internal.common_methods_invocations import \ - (op_db,) + (op_db, SampleInput) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, ops, onlyNativeDeviceTypes) + (instantiate_device_type_tests, ops, onlyNativeDeviceTypes, precisionOverride) def apply_masked_reduction_along_dim(op, input, *args, **kwargs): @@ -111,7 +113,10 @@ def apply_masked_reduction_along_dim(op, input, *args, **kwargs): output = input.new_full(shape, float('nan') if dtype.is_floating_point else 0, dtype=dtype) # apply op to all elementary slices: - inpmask = torch._masked._input_mask(input, mask=mask) + if mask is None: + inpmask = input.new_ones([], dtype=torch.bool).expand(input.shape) + else: + inpmask = torch._masked._input_mask(input, mask=mask) for s in itertools.product(*ranges): # data of an elementary slice is 1D sequence and has only # masked-in elements: @@ -140,7 +145,10 @@ def apply_masked_normalization_along_dim(op, input, *args, **kwargs): dim = args[dim_pos] args0 = args[:dim_pos] + (0,) + args[dim_pos + 1:] output = torch.zeros_like(input, dtype=dtype) - inpmask = torch._masked._input_mask(input, mask=mask) + if mask is None: + inpmask = input.new_ones([], dtype=torch.bool).expand(input.shape) + else: + inpmask = torch._masked._input_mask(input, mask=mask) dim_ = dim % input.ndim left_ranges = tuple(map(range, input.shape[:dim_])) right_ranges = tuple(map(range, input.shape[dim_ + 1:])) @@ -153,6 +161,7 @@ def apply_masked_normalization_along_dim(op, input, *args, **kwargs): reference_functions = dict( norm=lambda *args, **kwargs: apply_masked_reduction_along_dim(torch.linalg.vector_norm, *args, **dict(kwargs, dim_position=1)), var=lambda *args, **kwargs: apply_masked_reduction_along_dim(torch.var, *args, **dict(kwargs, dim_position=0)), + std=lambda *args, **kwargs: apply_masked_reduction_along_dim(torch.std, *args, **dict(kwargs, dim_position=0)), softmax=lambda *args, **kwargs: apply_masked_normalization_along_dim(torch.softmax, *args, **kwargs), log_softmax=lambda *args, **kwargs: apply_masked_normalization_along_dim(torch.log_softmax, *args, **kwargs), softmin=lambda *args, **kwargs: apply_masked_normalization_along_dim(torch.nn.functional.softmin, *args, **kwargs), @@ -162,28 +171,262 @@ def apply_masked_normalization_along_dim(op, input, *args, **kwargs): masked_ops = [op for op in op_db if op.name.startswith('_masked.')] masked_ops_with_references = [op for op in masked_ops if op.name.rsplit('.', 1)[-1] in reference_functions] +masked_ops_with_non_strided_support = [op for op in masked_ops if op.supports_sparse or op.supports_sparse_csr] + + +def _tensor_to_strided(obj): + # after gh-59958 is resolved, replace the usage of this function + # with torch.Tensor.to_dense + if torch.is_tensor(obj): + if obj.layout == torch.strided: + return obj + return obj.to_dense() + return obj + + +def to_strided(obj): + """Convert the tensor content of object to strided tensor content. 
+ """ + return torch.utils._pytree.tree_map(_tensor_to_strided, obj) + + +def to_sparse_coo(obj): + """Convert the tensor content of object to sparse coo tensor content. + """ + return torch.utils._pytree.tree_map(torch.Tensor.to_sparse, obj) + + +def to_sparse_csr(obj): + """Convert the tensor content of object to sparse csr tensor content. + """ + return torch.utils._pytree.tree_map(torch.Tensor.to_sparse_csr, obj) + + +class mask_layouts(_TestParametrizer): + """Decorator class for parametrization of test function with an input + layout argument and an extra argument of sample inputs generator. + The sample_inputs generator provides samples with all supported + layouts for the mask argument. + """ + def _parametrize_test(self, test, generic_cls, device_cls): + + @wraps(test) + def wrap(self, layout, device, dtype, op): + layout_name = str(layout).lstrip('torch.') + if layout == torch.strided: + # strided layouts are always supported + sample_inputs_func = op.sample_inputs + elif layout == torch.sparse_coo: + if not op.supports_sparse: + raise unittest.SkipTest(f"{op.name} does not support inputs with {layout_name} layout") + sample_inputs_func = op.sample_inputs_sparse_coo + elif layout == torch.sparse_csr: + if not op.supports_sparse_csr: + raise unittest.SkipTest(f"{op.name} does not support inputs with {layout_name} layout") + sample_inputs_func = op.sample_inputs_sparse_csr + else: + raise NotImplementedError(f'{layout}') + + def sample_inputs_generator(): + for sample_input in sample_inputs_func(device, dtype): + mask = sample_input.kwargs.get('mask') + if mask is None: + yield sample_input + else: + if layout == sample_input.input.layout: + yield sample_input + if layout != torch.strided: + sample_input_kwargs = sample_input.kwargs.copy() + sample_input_kwargs.update(mask=mask.to_dense()) + yield SampleInput(sample_input.input.clone(), + args=sample_input.args, + kwargs=sample_input_kwargs) + if layout != torch.sparse_coo and op.supports_sparse: + sample_input_kwargs = sample_input.kwargs.copy() + sample_input_kwargs.update(mask=mask.to_sparse()) + yield SampleInput(sample_input.input.clone(), + args=sample_input.args, + kwargs=sample_input_kwargs) + if layout != torch.sparse_csr and op.supports_sparse_csr and sample_input.input.ndim == 2: + sample_input_kwargs = sample_input.kwargs.copy() + sample_input_kwargs.update(mask=mask.to_sparse_csr()) + yield SampleInput(sample_input.input.clone(), + args=sample_input.args, + kwargs=sample_input_kwargs) + + test(self, layout, device, dtype, op, sample_inputs_generator()) + + for layout in (torch.strided, torch.sparse_coo, torch.sparse_csr): + yield (wrap, str(layout).lstrip('torch.'), {'layout': layout}) class TestMasked(TestCase): + def assertEqualMasked(self, actual, expected, mask): + strided = to_strided(actual) + if mask is not None: + strided = torch.where(mask, strided, strided.new_zeros([])) + expected = torch.where(mask, expected, expected.new_zeros([])) + self.assertEqual(strided, expected, exact_device=False) + @onlyNativeDeviceTypes @suppress_warnings @ops(masked_ops_with_references) + @precisionOverride({torch.bfloat16: 5e-4, torch.float16: 5e-4}) def test_reference_masked(self, device, dtype, op): op_name = op.name.rsplit('.', 1)[-1] ref_op = reference_functions[op_name] sample_inputs = op.sample_inputs(device, dtype) for sample_input in sample_inputs: t_inp, t_args, t_kwargs = sample_input.input, sample_input.args, sample_input.kwargs - if op_name == 'var' and not (t_inp.dtype.is_floating_point or t_inp.dtype.is_complex): 
- # torch.var does not support integer inputs + if op_name in {'var', 'std'} and not (t_inp.dtype.is_floating_point or t_inp.dtype.is_complex): + # torch.var/torch.std does not support integer inputs continue actual = op.op(t_inp, *t_args, **t_kwargs) expected = ref_op(t_inp, *t_args, **t_kwargs) - outmask = torch._masked._output_mask(op.op, t_inp, *t_args, **t_kwargs) - actual = torch.where(outmask, actual, actual.new_zeros([])) - expected = torch.where(outmask, expected, expected.new_zeros([])) - self.assertEqual(actual, expected, exact_device=False) + if t_kwargs.get('mask') is None: + outmask = None + else: + outmask = torch._masked._output_mask(op.op, t_inp, *t_args, **t_kwargs) + self.assertEqualMasked(actual, expected, outmask) + + @mask_layouts() + @onlyNativeDeviceTypes + @suppress_warnings + @ops(masked_ops_with_non_strided_support) + @precisionOverride({torch.bfloat16: 5e-3, torch.float16: 5e-3}) + def test_mask_layout(self, layout, device, dtype, op, sample_inputs): + for sample in sample_inputs: + t_inp, t_args, t_kwargs = sample.input, sample.args, sample.kwargs + actual = op.op(t_inp, *t_args, **t_kwargs) + + assert actual.layout == layout + + # check masked invariance: + # op(inp, mask).to_dense() == op(inp.to_dense(), mask.to_dense()) at outmask + # + r_inp, r_args, r_kwargs = to_strided((t_inp, t_args, t_kwargs)) + if r_kwargs.get('mask') is None: + outmask = None + else: + outmask = torch._masked._output_mask(op.op, r_inp, *r_args, **r_kwargs) + expected = op.op(r_inp, *r_args, **r_kwargs) + self.assertEqualMasked(actual, expected, outmask) + + @parametrize("sparse_kind,fill_value", [('coo', 0), ('hybrid_coo', 0), + ('coo', 123), ('hybrid_coo', 123), + ('csr', 0), ('csr', 123)], + name_fn=lambda sparse_kind, fill_value: f'{sparse_kind}_fill_value_{fill_value}') + def test_where(self, sparse_kind, fill_value): + + is_hybrid = False + if sparse_kind == 'coo': + + def to_sparse(dense): + return dense.to_sparse(2) + + def set_values(sparse, index, value): + sparse._values()[index] = value + + elif sparse_kind == 'hybrid_coo': + is_hybrid = True + + def to_sparse(dense): + return dense.to_sparse(1) + + def set_values(sparse, index, value): + sparse._values()[index] = value + + elif sparse_kind == 'csr': + + def to_sparse(dense): + return dense.to_sparse_csr() + + def set_values(sparse, index, value): + sparse.values()[index] = value + + else: + assert 0, sparse_kind + + mask = torch.tensor([[1, 0, 1, 0, 0], + [1, 1, 1, 1, 0], + [0, 1, 0, 1, 0], + [0, 0, 0, 0, 0], + [0, 0, 1, 1, 0], + [1, 1, 0, 0, 0]]).to(dtype=bool) + mask = to_sparse(mask) + # make some specified mask elements as explicit masked-out masks: + if is_hybrid: + set_values(mask, (1, 1), False) + set_values(mask, (-2, -2), False) + else: + set_values(mask, 3, False) + set_values(mask, -3, False) + + input = torch.tensor([[1, 0, 0, 0, -1], + [2, 3, 0, 0, -2], + [0, 4, 5, 0, -3], + [0, 0, 6, 7, 0], + [0, 8, 9, 0, -3], + [10, 11, 0, 0, -5]]) + input = to_sparse(input) + # make specified input elements have zero values: + if is_hybrid: + set_values(input, (1, 1), 0) + set_values(input, (-1, 0), 0) + F = fill_value + else: + set_values(input, 3, 0) + set_values(input, -3, 0) + F = 0 + + # expected where result: + Z = 99 + # Z value corresponds to masked-in elements that are not + # specified in the input and it will be replaced with a zero + tmp = torch.tensor([[1, F, Z, F, F], + [2, F, Z, Z, F], + [F, 4, F, Z, F], + [0, 0, 0, 0, 0], + [F, F, 9, F, F], + [Z, 11, F, F, F]]) + tmp = to_sparse(tmp) + + + sparse = 
torch._masked._where(mask, input, + torch.tensor(fill_value, dtype=input.dtype, device=input.device)) + + if tmp.layout == torch.sparse_coo: + expected_sparse = torch.sparse_coo_tensor( + tmp.indices(), + torch.where(tmp.values() != Z, tmp.values(), tmp.values().new_full([], 0)), + input.shape) + outmask = torch.sparse_coo_tensor(sparse.indices(), + sparse.values().new_full(sparse.values().shape, 1).to(dtype=bool), + sparse.shape)._coalesced_(True) + elif tmp.layout == torch.sparse_csr: + expected_sparse = torch.sparse_csr_tensor( + tmp.crow_indices(), + tmp.col_indices(), + torch.where(tmp.values() != Z, tmp.values(), tmp.values().new_full([], 0)), + input.shape) + outmask = torch.sparse_csr_tensor(sparse.crow_indices(), sparse.col_indices(), + sparse.values().new_full(sparse.values().shape, 1).to(dtype=bool), + sparse.shape) + else: + assert 0 + + self.assertEqual(sparse, expected_sparse) + + # check invariance: + # torch.where(mask.to_dense(), input.to_dense(), fill_value) + # == where(mask, input, fill_value).to_dense(fill_value) + expected = torch.where(mask.to_dense(), input.to_dense(), torch.full(input.shape, F)) + dense = torch.where(outmask.to_dense(), sparse.to_dense(), torch.full(sparse.shape, F)) + self.assertEqual(dense, expected) + +instantiate_device_type_tests(TestMasked, globals(), except_for='meta') -instantiate_device_type_tests(TestMasked, globals()) +if __name__ == "__main__": + run_tests() diff --git a/test/test_meta.py b/test/test_meta.py new file mode 100644 index 000000000000..c8b62dbf57e9 --- /dev/null +++ b/test/test_meta.py @@ -0,0 +1,1152 @@ +# Owner(s): ["module: primTorch"] + +import torch +import os +from enum import Enum +from torch.overrides import resolve_name +from torch.utils._pytree import tree_map, tree_flatten +import torch.utils._python_dispatch +from torch._prims.utils import is_complex_dtype, corresponding_real_dtype +from torch.testing._internal.common_utils import ( + TestCase, + skipIfCrossRef, + suppress_warnings, + TEST_WITH_ASAN, + run_tests, +) +from torch.testing._internal.common_device_type import ( + ops, + instantiate_device_type_tests, + onlyCUDA, +) +from torch.testing._internal.logging_tensor import no_dispatch +from torch.testing._internal.common_methods_invocations import op_db +import torch._prims as prims + +import atexit +import re +from collections import defaultdict +import unittest +import warnings + +bf16 = torch.bfloat16 +f64 = torch.float64 +f32 = torch.float32 +f16 = torch.float16 +c32 = torch.complex32 +c64 = torch.complex64 +c128 = torch.complex128 +i8 = torch.int8 +i16 = torch.int16 +i32 = torch.int32 +i64 = torch.int64 +b8 = torch.bool +u8 = torch.uint8 + +dtype_abbrs = { + torch.bfloat16: 'bf16', + torch.float64: 'f64', + torch.float32: 'f32', + torch.float16: 'f16', + torch.complex32: 'c32', + torch.complex64: 'c64', + torch.complex128: 'c128', + torch.int8: 'i8', + torch.int16: 'i16', + torch.int32: 'i32', + torch.int64: 'i64', + torch.bool: 'b8', + torch.uint8: 'u8', +} + +def safe_is_leaf(t): + try: + return t.is_leaf + except RuntimeError: + # inference mode can trigger this + return False + + +# This is a class for converting multiple tensors into meta tensors which +# share the same view/storage structure. The operation model is you allocate +# one of these, and then call it repeatedly on all the tensors you want to +# convert. 
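# A rough usage sketch of that model (editorial; t1 and t2 here are hypothetical
# tensors viewing the same storage, and MetaConverter is defined just below):
#
#     to_meta = MetaConverter()
#     m1 = to_meta(t1)
#     m2 = to_meta(t2)   # same converter instance, so the shared storage maps to one meta storage
#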
It's important to use the same object for tensors you want to +# share storage because this is how we correlate shared storages to the same +# meta storages; similarly, it's important NOT to use the same object for +# unrelated groups of tensors because this class will remember all the +# tensors/storages its seen and therefore leak memory. +class MetaConverter: + def __init__(self): + self.storage_memo = {} + self.tensor_memo = {} + self.hit = 0 + self.miss = 0 + + def successful(self): + return self.hit > 0 and self.miss == 0 + + # NB: doesn't actually return a storage, because meta storage is + # not supported + def meta_storage(self, s): + # NB: TypedStorage is freshly allocated and cannot be used as hash + # key index. + if s._cdata not in self.storage_memo: + self.storage_memo[s._cdata] = torch.empty(s.size(), dtype=s.dtype, device='meta') + return self.storage_memo[s._cdata] + + # This function assumes that it's possible to do the conversion + def meta_tensor(self, t): + if t not in self.tensor_memo: + with torch.inference_mode(t.is_inference()): + if t._is_view(): + # Construct views in two steps: recursively meta-fy their + # base, and then create the view off that. NB: doing it + # directly from storage is WRONG because this won't cause + # version counters to get shared. + assert t._is_view() + base = self.meta_tensor(t._base) + + def is_c_of_r(complex_dtype, real_dtype): + return is_complex_dtype(complex_dtype) and \ + corresponding_real_dtype(complex_dtype) == real_dtype + + if base.dtype == t.dtype: + pass + elif is_c_of_r(base.dtype, t.dtype): + base = torch.view_as_real(base) + elif is_c_of_r(t.dtype, base.dtype): + base = torch.view_as_complex(base) + else: + # This is not guaranteed to succeed. If it fails, it + # means there is another dtype-converting view function + # that hasn't been handled here + base = base.view(t.dtype) + + with torch.enable_grad(): + r = base.as_strided(t.size(), t.stride(), t.storage_offset()) + else: + is_leaf = safe_is_leaf(t) + # Fake up some autograd history. + if t.requires_grad: + r = torch.empty((0,), dtype=t.dtype, device='meta', requires_grad=True) + if not is_leaf: + with torch.enable_grad(): + # The backward function here will be wrong, but + # that's OK; our goal is just to get the metadata + # looking as close as possible; we're not going to + # actually try to backward() on these produced + # metas. TODO: would be safer to install some + # sort of unsupported grad_fn here + r = r.clone() + else: + r = torch.empty((0,), dtype=t.dtype, device='meta') + # As long as meta storage is not supported, need to prevent + # redispatching on set_(Storage, ...) which will choke with + # meta storage + s = self.meta_storage(t.storage()) + with no_dispatch(): + with torch.no_grad(): + r.set_(s, t.storage_offset(), t.size(), t.stride()) + + torch._C._set_conj(r, t.is_conj()) + torch._C._set_neg(r, t.is_neg()) + self.tensor_memo[t] = r + + return self.tensor_memo[t] + + def __call__(self, t): + # TODO: zero tensors? 
We appear to have eliminated them by + # excluding complex for now + if type(t) is torch.Tensor or type(t) is torch.nn.Parameter: + if any([ + t.is_sparse_csr, t.is_sparse, t.is_mkldnn, t.is_quantized, + t.is_nested, torch._is_functional_tensor(t), + # these are supported in meta conversion but the fallbacks + # don't work + t.is_neg(), t.is_conj(), + # conjugate fallback does not support meta tensors + t.dtype in (torch.complex128, torch.complex64), + ]): + # TODO: sparse should support meta + # NB technically to('meta') does work but our logging + # instrumentation will see the meta conversions and the + # tests all break so we just exclude this. In any case + # the to conversion isn't really right anyhow. + self.miss += 1 + return t + elif any([ + t.device.type in ("lazy", "meta"), t.is_complex(), + # We need a way to test if a tensor is batched but there + # is no official APi to do it + # torch._C._is_batched(t), + ]): + # TODO: this stuff should support storage + # (well, maybe not batched) + self.hit += 1 + return t.to("meta") + else: + self.hit += 1 + r = self.meta_tensor(t) + if type(t) is torch.nn.Parameter: + r = torch.nn.Parameter(r, requires_grad=r.requires_grad) + return r + elif torch.overrides.is_tensor_like(t): + # Blindly converting tensor subclasses to meta can cause + # unpredictable problems; e.g., FX tests will trace meta + # tensors into their trace / some subclasses don't correctly + # support meta. Trying to YOLO this is more trouble than it's + # worth. + self.miss += 1 + return t + else: + # non-Tensor types don't count as hit or miss + return t + + +class TestMetaConverter(TestCase): + def assertSameVersionCounter(self, m1, m2): + # Cannot easily test m1 and m2 have same storage due to + # lack of Storage bindings. Use version counter. 
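        # Editorial sketch, not part of the test: for ordinary strided tensors the
        # same proxy works, because views over one storage share a version counter
        # that is bumped together by an in-place write to the base:
        #
        #     x = torch.randn(4, requires_grad=True)
        #     v1, v2 = x[:], x[:]
        #     assert v1._version == v2._version
        #     with torch.no_grad():
        #         x.add_(1)                 # bumps the shared counter
        #     assert v1._version == v2._version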
+ vc = m1._version + self.assertEqual(m2._version, vc) + # Doing it this way ensures that we get VC bump even with leaves + with torch.no_grad(): + m1._base.add_(3) + self.assertNotEqual(m1._version, vc) + self.assertEqual(m2._version, m1._version) + + def test_view_of_non_leaf(self): + x = torch.randn(4, requires_grad=True) + y = x.neg() + z1 = y[:] + z2 = y[:] + to_meta = MetaConverter() + m1 = to_meta(z1) + m2 = to_meta(z2) + self.assertEqual(m1.shape, z1.shape) + self.assertTrue(m1._is_view()) + self.assertFalse(m1._base.is_leaf) + self.assertSameVersionCounter(m1, m2) + + def test_view_of_leaf(self): + x = torch.randn(4, requires_grad=True) + z1 = x[:] + z2 = x[:] + to_meta = MetaConverter() + m1 = to_meta(z1) + m2 = to_meta(z2) + self.assertEqual(m1.shape, z1.shape) + self.assertTrue(m1._is_view()) + self.assertTrue(m1._base.is_leaf) + self.assertSameVersionCounter(m1, m2) + + def test_leaf(self): + x = torch.randn(4, requires_grad=True) + to_meta = MetaConverter() + m = to_meta(x) + self.assertEqual(m.shape, x.shape) + self.assertTrue(m.is_leaf) + self.assertTrue(m.requires_grad) + + def test_non_leaf(self): + x = torch.randn(4, requires_grad=True) + y = x.neg() + to_meta = MetaConverter() + m = to_meta(y) + self.assertEqual(m.shape, y.shape) + self.assertFalse(m.is_leaf) + self.assertTrue(m.requires_grad) + + def test_requires_grad_false(self): + x = torch.randn(4, requires_grad=False) + to_meta = MetaConverter() + m = to_meta(x) + self.assertEqual(m.shape, x.shape) + self.assertFalse(m.requires_grad) + + def test_view_as_real(self): + x = torch.randn(4, dtype=torch.complex64) + y = torch.view_as_real(x) + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + + def test_view_as_complex(self): + x = torch.randn((4, 2), dtype=torch.float32) + y = torch.view_as_complex(x) + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + + def test_view_dtype(self): + x = torch.randn(4, dtype=torch.float32) + y = x.view(dtype=torch.int32) + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + + def test_imag(self): + x = torch.randn(4, dtype=torch.complex64) + y = x.imag + m = MetaConverter()(y) + self.assertEqual(m.shape, y.shape) + self.assertEqual(m.dtype, y.dtype) + self.assertEqual(m.stride(), y.stride()) + self.assertEqual(m.storage_offset(), y.storage_offset()) + + +def assert_ref_meta_equal(test_case, meta_rs, rs, msg_callable): + flat_meta_rs, _ = tree_flatten(meta_rs) + flat_rs, _ = tree_flatten(rs) + test_case.assertEqual(len(flat_meta_rs), len(flat_rs)) + for i, meta_r, r in zip(range(len(flat_rs)), flat_meta_rs, flat_rs): + def test_assert(cond, msg): + if not cond: + raise RuntimeError(f"output {i}: {msg_callable(msg)}") + if not isinstance(r, torch.Tensor): + continue + test_assert(isinstance(meta_r, torch.Tensor), f"but real {i}th result is Tensor") + test_assert(meta_r.dtype == r.dtype, f"but real dtype was {r.dtype}") + test_assert(meta_r.shape == r.shape, f"but real shape was {r.shape}") + # NOTE: this helper is used instead of a direct stride comparison + # because strides of tensors with no elements and dimensions of + # length 1 are not computed consistently + same_strides, _ = prims.utils.check_significant_strides(meta_r, r) + test_assert(same_strides, f"but real stride was {r.stride()}") + test_assert( + meta_r.storage_offset() == r.storage_offset(), + f"but real storage_offset was {r.storage_offset()}") + test_assert(meta_r.requires_grad 
== r.requires_grad, f"but real requires_grad was {r.requires_grad}") + test_assert(meta_r.is_conj() == r.is_conj(), f"but real is_conj was {r.is_conj()}") + test_assert(meta_r.is_neg() == r.is_neg(), f"but real is_neg was {r.is_neg()}") + + +# This environment variable controls whether or not we print expected failure +# lists at the end of a test suite run. The intended usage looks like this: +# +# 1. Run `PYTORCH_COLLECT_EXPECT=1 python test/test_meta.py` on a CUDA build +# of PyTorch that has LAPACK/MAGMA installed. You can filter `-k test_meta` +# or `-k test_dispatch_meta` to only focus on one or another list +# 2. Given the printed skip/xfail list, add them to the corresponding lists; +# torch.* entries go in meta_function and aten.* entries go in meta_dispatch. +# If there are preexisting entries, you need to merge in the entries. +# +# This is somewhat manual but typically you shouldn't need to do this, unless +# you've made a major change (e.g., added a new dtype to PyTorch) and need to +# refresh the lists. If you want to do it from scratch, just clear out the +# preexisting lists before running. +# +# WARNING: Python dict literals will silently ignore duplicate keys +COLLECT_EXPECT = os.getenv('PYTORCH_COLLECT_EXPECT', '0') == '1' + +seen_succeeded = {} +seen_failed = {} +failed_reasons = defaultdict(set) +def print_seen(): + expected_failures = [] + skips = [] + + def fmt_dtypes(dtypes): + r = ', '.join(sorted(dtype_abbrs[d] for d in dtypes)) + return '{' + r + '}' + + for op, failed_dtypes in seen_failed.items(): + ops = resolve_name(op) + succeeded_dtypes = seen_succeeded.get(op, set()) + expected_failures_dtypes = failed_dtypes - succeeded_dtypes + skips_dtypes = failed_dtypes & succeeded_dtypes + reasons = "" + if failed_reasons[op]: + reasons = " # " + ", ".join(sorted(failed_reasons[op])) + if expected_failures_dtypes: + expected_failures.append(f" {ops}: {fmt_dtypes(expected_failures_dtypes)},{reasons}") + if skips_dtypes: + skips.append(f" {ops}: {fmt_dtypes(skips_dtypes)},") + expected_failures.sort() + skips.sort() + nl = '\n' + print(f"""\ +expected_failures = {{ +{nl.join(expected_failures)} +}} + +skips = {{ +{nl.join(skips)} +}} +""") +if COLLECT_EXPECT: + atexit.register(print_seen) + +# Success forces pass; failure forces fail; skip unconditionally skips testing +TestExpect = Enum("TestExpect", ("SUCCESS", "XFAILURE", "SKIP")) + +# unlike print produce strides +def verbose_print(e): + class Lit: + def __init__(self, s): + self.s = s + + def __repr__(self): + return self.s + + def go(t): + if isinstance(t, torch.Tensor): + return Lit(f"{t} stride={t.stride()}") + else: + return t + + return repr(tree_map(go, e)) + +def run_meta_crossref( + test_case, + test_expect, + func, + args, + kwargs, + *, + dtype, + device_type, +): + to_meta = MetaConverter() + do_meta = test_expect is not TestExpect.SKIP + + if do_meta: + try: + meta_args = tree_map(to_meta, args) + meta_kwargs = tree_map(to_meta, kwargs) + except Exception as e: + raise RuntimeError( + f"failed to convert args to meta; " + f"originally (*{args}, **{kwargs})") from e + + rs = func(*args, **kwargs) + + # TODO: also handle cases where func raise an exception + + # For now, only attempt if we managed to convert all tensor types + # (if any of them failed, we're in a mixed device situation and + # this isn't well supported) + if do_meta and to_meta.successful(): + try: + # Suppress warnings, this doesn't matter for test_meta.py + # but it does matter if you want to use this decorator + # for cross-ref 
testing, as some tests may be looking at + # errors + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + meta_rs = func(*meta_args, **meta_kwargs) + except Exception as e: + if test_expect is TestExpect.XFAILURE: + return rs + seen_failed.setdefault(func, set()).add(dtype) + if isinstance(e, NotImplementedError): + m = RE_NOT_IMPLEMENTED_MSG.search(e.args[0]) + if m: + failed_reasons[func].add(m.group(1)) + if COLLECT_EXPECT: + return rs + raise RuntimeError(f"""\ +failed to run: {resolve_name(func)}( +*{verbose_print(meta_args)}, +**{verbose_print(meta_kwargs)} +)""") from e + else: + try: + delim = ',\n ' + assert_ref_meta_equal(test_case, meta_rs, rs, lambda msg: f"""\ +meta disagrees with real impl: +{resolve_name(func)}( + {delim.join(map(verbose_print, meta_args))}, + {delim.join(k + ": " + verbose_print(v) for k, v in meta_kwargs.items())} +) = ( + {verbose_print(meta_rs)} +) +{msg} +""") + except Exception: + if test_expect is TestExpect.XFAILURE: + return rs + seen_failed.setdefault(func, set()).add(dtype) + if COLLECT_EXPECT: + return rs + raise + else: + seen_succeeded.setdefault(func, set()).add(dtype) + if test_expect is TestExpect.XFAILURE and not COLLECT_EXPECT: + raise RuntimeError(f"unexpected success {resolve_name(func)}") + + return rs + + + +RE_NOT_IMPLEMENTED_MSG = re.compile(r"Could not run '([^']+)' with arguments ") + + +meta_function_expected_failures = { + torch.Tensor.item: {b8, bf16, c128, c64, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense + torch.Tensor.to_sparse: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::to_sparse, aten::to_sparse.sparse_dim + torch.addbmm: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::addbmm, aten::addbmm.out + torch.allclose: {bf16, f16, f32, f64}, # aten::_local_scalar_dense + torch.angle: {c32, b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::angle, aten::angle.out + torch.argwhere: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nonzero + torch.bincount: {i16, i32, i64, i8, u8}, # aten::bincount + torch.bucketize: {bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::bucketize.Tensor, aten::bucketize.Tensor_out + torch.combinations: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::masked_select + torch.complex: {f16, f32, f64}, # aten::complex.out + torch.conj_physical: {c32}, # aten::conj_physical.out + torch.corrcoef: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense + torch.count_nonzero: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::count_nonzero.dim_IntList + torch.cov: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense + torch.diag: {bf16, b8, f32, f64, i16, i32, i64, i8, u8}, # aten::diag.out + torch.diagflat: {bf16, b8, f32, f64, i16, i32, i64, i8, u8}, # aten::diag.out + torch.dot: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::dot + torch.fft.fft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.fft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.fftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.fftshift: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::roll + torch.fft.hfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.hfft: {b8, f32, f64, i16, i32, i64, i8, u8}, + torch.fft.hfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.ifft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.ifft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + 
torch.fft.ifftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2c + torch.fft.ifftshift: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::roll + torch.fft.ihfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.ihfft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.ihfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.irfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.rfft2: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.rfft: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.fft.rfftn: {b8, f32, f64, i16, i32, i64, i8, u8}, # aten::_fft_r2c + torch.floor_divide: {bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::floor_divide, aten::floor_divide.out + torch.frexp: {bf16, f16, f32, f64}, # aten::frexp.Tensor_out + torch.functional.istft: {f32, f64}, # aten::view_as_complex + torch.functional.stft: {f32, f64}, # aten::_fft_r2c + torch.functional.unique: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_unique2, aten::unique_dim + torch.functional.unique_consecutive: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::unique_consecutive + torch.histc: {bf16, f32, f64}, # aten::histc, aten::histc.out + torch.histogram: {f32, f64}, # aten::histogram.bin_ct, aten::histogram.bins_tensor + torch.histogramdd: {f32, f64}, # aten::_histogramdd_bin_edges, aten::_histogramdd_from_bin_tensors + torch.kthvalue: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::kthvalue.values + torch.linalg.qr: {f32, f64}, # aten::_linalg_qr_helper + torch.logcumsumexp: {bf16, f32, f64}, # aten::_logcumsumexp, aten::_logcumsumexp.out + torch.masked_select: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::masked_select, aten::masked_select.out + torch.matrix_exp: {bf16, f32, f64}, # aten::linalg_matrix_exp + torch.median: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::median, aten::median.dim_values + torch.mode: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::mode + torch.multinomial: {bf16, f32, f64}, # aten::multinomial, aten::multinomial.out + torch.mvlgamma: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::_local_scalar_dense, aten::mvlgamma.out + torch.nan_to_num: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nan_to_num.out + torch.nanmean: {bf16, f16, f32, f64}, + torch.nanmedian: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::nanmedian, aten::nanmedian.dim_values + torch.nanquantile: {f32, f64}, + torch.nansum: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nansum, aten::nansum.out + torch.nn.functional.adaptive_avg_pool2d: {bf16, f32, f64}, # aten::_adaptive_avg_pool2d + torch.nn.functional.conv1d: {bf16, f32, f64, i64}, + torch.nn.functional.conv2d: {bf16, f32, f64, i64}, + torch.nn.functional.conv_transpose1d: {f32, f64, i64}, + torch.nn.functional.conv_transpose2d: {f32, f64, i64}, + torch.nn.functional.conv_transpose3d: {f32, f64, i64}, + torch.nn.functional.ctc_loss: {f32, f64}, + torch.nn.functional.embedding_bag: {f16, f32, f64}, # aten::_embedding_bag_forward_only + torch.nn.functional.gaussian_nll_loss: {bf16, f32, f64}, # aten::_local_scalar_dense + torch.nn.functional.grid_sample: {f32, f64}, # aten::grid_sampler_2d, aten::grid_sampler_3d + torch.nn.functional.group_norm: {bf16, f32, f64}, # 
aten::var_mean.correction + torch.nn.functional.instance_norm: {f32, f64}, # aten::var_mean.correction + torch.nn.functional.layer_norm: {bf16, f32, f64}, + torch.nn.functional.max_pool3d: {f32, f64}, # aten::max_pool3d_with_indices + torch.nn.functional.max_pool3d_with_indices: {f32, f64}, # aten::max_pool3d_with_indices + torch.nn.functional.max_unpool1d: {f32, f64}, # aten::max_unpool2d + torch.nn.functional.max_unpool2d: {f32, f64}, # aten::max_unpool2d + torch.nn.functional.max_unpool3d: {f32, f64}, # aten::max_unpool3d + torch.nn.functional.multi_margin_loss: {f32, f64}, # aten::multi_margin_loss + torch.nn.functional.multilabel_margin_loss: {f32, f64}, # aten::multilabel_margin_loss_forward + torch.nn.functional.one_hot: {i64}, # aten::_local_scalar_dense + torch.nn.functional.pdist: {f32, f64}, # aten::_pdist_forward + torch.nn.functional.prelu: {bf16, f32, f64}, # aten::prelu + torch.nn.functional.relu: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::relu + torch.nn.functional.rrelu: {bf16, f32, f64}, # aten::rrelu_with_noise + torch.nn.functional.unfold: {bf16, f16, f32, f64}, # aten::im2col + torch.nonzero: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::nonzero, aten::nonzero.out + torch.polar: {f32, f64}, # aten::polar.out + torch.repeat_interleave: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::repeat_interleave.Tensor + torch.roll: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::roll + torch.searchsorted: {bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::searchsorted.Tensor, aten::searchsorted.Tensor_out + torch.symeig: {f32, f64}, + torch.std_mean: {bf16, f16, f32, f64}, # aten::std_mean.correction + torch.take: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, # aten::take, aten::take.out + torch.trace: {f32, f64, i16, i32, i64, i8, u8}, # aten::trace + torch.var_mean: {bf16, f16, f32, f64}, # aten::var_mean.correction + torch.vdot: {bf16, f32, f64, i16, i32, i64, i8, u8}, # aten::vdot + torch.qr: {f32, f64}, + torch.ormqr: {f32, f64}, + torch.lu_solve: {f32, f64}, + torch.cholesky: {f32, f64}, # aten::cholesky, aten::cholesky.out + torch.cholesky_inverse: {f32, f64}, # aten::cholesky_inverse, aten::cholesky_inverse.out + torch.cholesky_solve: {f32, f64}, # aten::_cholesky_solve_helper + torch.eig: {f32, f64}, # aten::_local_scalar_dense + torch.geqrf: {f32, f64}, # aten::geqrf + torch.linalg.cholesky: {f32, f64}, # aten::linalg_cholesky_ex, aten::linalg_cholesky_ex.L + torch.linalg.cholesky_ex: {f32, f64}, # aten::linalg_cholesky_ex + torch.linalg.det: {f32, f64}, # aten::_det_lu_based_helper + torch.linalg.eig: {f32, f64}, # aten::linalg_eig + torch.linalg.eigh: {f32, f64}, + torch.linalg.eigvals: {f32, f64}, + torch.linalg.eigvalsh: {f32, f64}, # aten::linalg_eigvalsh.out + torch.linalg.householder_product: {f32, f64}, # aten::linalg_householder_product + torch.linalg.inv: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.ldl_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.lstsq: {f32, f64}, # aten::linalg_lstsq.out + torch.linalg.lu_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.slogdet: {f32, f64}, # aten::linalg_slogdet + torch.linalg.solve: {f32, f64}, # aten::linalg_solve, aten::linalg_solve.out + torch.linalg.solve_triangular: {f32, f64}, # aten::linalg_solve_triangular + torch.linalg.tensorinv: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.tensorsolve: {f32, f64}, # aten::linalg_solve + torch.logdet: {f32, f64}, # aten::_local_scalar_dense, aten::nonzero +} + +""" +# This is some 
sample code for how we could dump these dicts into YAML +# file for easier reading/writing +import yaml +print(yaml.dump( + {resolve_name(k): [dtype_abbrs[d] for d in v] + for k, v in meta_function_expected_failures.items()}, default_flow_style=None)) +import sys +sys.exit() +""" + +meta_function_skips = { + torch.Tensor.__getitem__: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8, c32}, + torch.Tensor.__rmatmul__: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.index_reduce: {bf16, f16, f32, f64}, + torch.addr: {b8}, + torch.aminmax: {b8, f32, f64, i16, i32, i64, i8, u8}, + torch.bernoulli: {bf16, f32, f64}, + torch.conj_physical: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.cummax: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.cummin: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.diff: {b8}, + torch.functional.cdist: {f32, f64}, + torch.functional.tensordot: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.index_add: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.inner: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.logical_not: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.logical_xor: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.logit: {b8, bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.matmul: {bf16, f32, f64, i16, i32, i64, i8, u8}, + torch.nn.functional.adaptive_avg_pool1d: {bf16, f32, f64}, + torch.nn.functional.adaptive_avg_pool3d: {f16, f32, f64}, + torch.nn.functional.batch_norm: {f32, f64}, + torch.nn.functional.cross_entropy: {bf16, f32, f64}, + torch.nn.functional.interpolate: {bf16, f32, f64, u8}, + torch.nn.functional.nll_loss: {bf16, f32, f64}, + torch.nn.functional.pad: {f32, f64}, + torch.normal: {bf16, f16, f32, f64}, + torch.prod: {b8, f32, f64, i16, i32, i64, i8, u8}, + torch.tensor_split: {b8, bf16, f16, f32, f64, i16, i32, i64, i8, u8}, + torch.nn.functional.logsigmoid: {bf16, f16, f32, f64}, # logsigmoid.output + torch.inverse: {f32, f64}, + torch.linalg.matrix_power: {f32, f64}, + torch.linalg.matrix_rank: {f32, f64}, + torch.linalg.pinv: {f32, f64}, + torch.empty: {b8, bf16, c128, c64, c32, f16, f32, f64, i16, i32, i64, i8, u8}, +} + +meta_function_device_expected_failures = defaultdict(dict) +meta_function_device_skips = defaultdict(dict) + +meta_function_device_expected_failures['cpu'] = { +} + +meta_function_device_expected_failures['cuda'] = { + torch.addbmm: {f16}, # aten::addbmm, aten::addbmm.out + torch.corrcoef: {bf16, f16}, # aten::_local_scalar_dense + torch.cov: {f16}, # aten::_local_scalar_dense + torch.diag: {bf16, f16}, # aten::diag.out + torch.diagflat: {bf16, f16}, # aten::diag.out + torch.dot: {f16}, # aten::dot + torch.fft.fft2: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.fft: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.fftn: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.hfft2: {c32, f16}, # aten::_fft_c2c + torch.fft.hfft: {c32, f16}, + torch.fft.hfftn: {c32, f16}, # aten::_fft_c2c + torch.fft.ifft2: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.ifft: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.ifftn: {c32, f16}, # aten::_fft_c2c, aten::_fft_c2c.out + torch.fft.ihfft2: {f16}, + torch.fft.ihfft: {f16}, + torch.fft.ihfftn: {f16}, + torch.fft.irfft2: {c32, f16}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfft: {c32, f16}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.irfftn: {c32, f16}, # aten::_fft_c2r, aten::_fft_c2r.out + torch.fft.rfft2: {f16}, + torch.fft.rfft: {f16}, + torch.fft.rfftn: 
{f16}, + torch.functional.unique: {f16}, # aten::_unique2, aten::unique_dim + torch.functional.unique_consecutive: {f16}, # aten::unique_consecutive + torch.geqrf: {f32, f64}, # aten::geqrf + torch.histc: {i16, i32, i64, i8}, # aten::histc, aten::histc.out + torch.kthvalue: {f16}, # aten::kthvalue.values + torch.linalg.cholesky: {f32, f64}, # aten::linalg_cholesky_ex, aten::linalg_cholesky_ex.L + torch.linalg.cholesky_ex: {f32, f64}, # aten::linalg_cholesky_ex + torch.linalg.householder_product: {f32, f64}, # aten::linalg_householder_product, aten::linalg_householder_product.out + torch.linalg.inv: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.ldl_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.lu_factor: {f32, f64}, # aten::_local_scalar_dense + torch.linalg.solve_triangular: {f32, f64}, # aten::linalg_solve_triangular, aten::linalg_solve_triangular.out + torch.linalg.tensorinv: {f32, f64}, # aten::_local_scalar_dense + torch.logcumsumexp: {bf16, f16}, # aten::_logcumsumexp, aten::_logcumsumexp.out + torch.matrix_exp: {f16}, # aten::linalg_matrix_exp + torch.median: {f16}, # aten::median, aten::median.dim_values + torch.multinomial: {f16}, # aten::multinomial, aten::multinomial.out + torch.mvlgamma: {f16}, # aten::_local_scalar_dense, aten::mvlgamma.out + torch.nanmedian: {f16}, # aten::nanmedian, aten::nanmedian.dim_values + torch.nn.functional.adaptive_avg_pool2d: {f16}, # aten::_adaptive_avg_pool2d + torch.nn.functional.conv1d: {f16}, + torch.nn.functional.conv2d: {f16}, + torch.nn.functional.conv_transpose1d: {bf16, f16}, + torch.nn.functional.conv_transpose2d: {bf16, f16}, + torch.nn.functional.conv_transpose3d: {bf16, f16}, + torch.nn.functional.embedding_bag: {bf16}, # aten::_embedding_bag_forward_only + torch.nn.functional.gaussian_nll_loss: {f16}, # aten::_local_scalar_dense + torch.nn.functional.grid_sample: {f16}, # aten::grid_sampler_2d, aten::grid_sampler_3d + torch.nn.functional.group_norm: {bf16, f16}, # aten::var_mean.correction + torch.nn.functional.instance_norm: {bf16, f16}, # aten::var_mean.correction + torch.nn.functional.layer_norm: {f16}, + torch.nn.functional.max_pool3d: {bf16, f16}, # aten::max_pool3d_with_indices + torch.nn.functional.max_pool3d_with_indices: {bf16, f16}, # aten::max_pool3d_with_indices + torch.nn.functional.max_unpool1d: {f16}, # aten::max_unpool2d + torch.nn.functional.max_unpool2d: {f16}, # aten::max_unpool2d + torch.nn.functional.max_unpool3d: {f16}, # aten::max_unpool3d + torch.nn.functional.multi_margin_loss: {bf16, f16}, # aten::multi_margin_loss + torch.nn.functional.multilabel_margin_loss: {bf16, f16}, # aten::multilabel_margin_loss_forward + torch.nn.functional.prelu: {f16}, # aten::prelu + torch.nn.functional.relu: {f16}, # aten::relu + torch.nn.functional.rrelu: {f16}, # aten::rrelu_with_noise + torch.ormqr: {f32, f64}, # aten::ormqr, aten::ormqr.out + torch.qr: {f32, f64}, # aten::_linalg_qr_helper + torch.trace: {b8, bf16, f16}, # aten::diag.out + torch.vdot: {f16}, # aten::vdot +} + +meta_function_device_skips['cuda'] = { + torch.Tensor.__getitem__: {c32}, + torch.Tensor.__rmatmul__: {f16}, + torch.bernoulli: {f16}, + torch.cummax: {f16}, + torch.cummin: {f16}, + torch.functional.tensordot: {f16}, + torch.inner: {f16}, + torch.inverse: {f32, f64}, + torch.linalg.matrix_power: {f32, f64}, + torch.linalg.matrix_rank: {f32, f64}, + torch.linalg.svd: {f32, f64}, + torch.logit: {f16}, + torch.matmul: {f16}, + torch.nn.functional.adaptive_avg_pool1d: {f16}, + torch.nn.functional.adaptive_avg_pool3d: {bf16}, + 
torch.nn.functional.batch_norm: {bf16, f16}, + torch.nn.functional.cross_entropy: {f16}, + torch.nn.functional.interpolate: {f16}, + torch.nn.functional.nll_loss: {f16}, + torch.nn.functional.pad: {f16}, + torch.prod: {bf16, c32, f16}, + torch.svd: {f32, f64}, +} + +# This is a __torch_function__ mode that, when enabled, interposes every +# Torch API call and runs the operator as normal, and then reruns it +# with meta inputs, and then checks that everything about the output agrees. +# Most of the logic deals with faithfully replicating the original tensor +# as a meta tensor, which is nontrivial because there are a lot of subsystems +# that may potentially be exercised. +# +# That being said, this class is a little overkill for what it is doing in +# this test file (since I could have just inlined __torch_function__ on the +# OpInfo call, and OpInfos generally have very regular inputs), but it will be +# useful for more comprehensive testing e.g., as seen in +# https://github.com/pytorch/pytorch/pull/75994 The big benefit is it is +# A LOT more efficient that torch dispatch mode (at the cost of less coverage) +class MetaCrossRefFunctionMode(torch.overrides.TorchFunctionMode): + test_case: TestCase + device_type: str + dtype: torch.dtype + + def __init__(self, test_case, *, device, dtype): + self.test_case = test_case + self.device_type = torch.device(device).type + self.dtype = dtype + + def __torch_function__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + if torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod): + return func(*args, **kwargs) + + if self.dtype in meta_function_skips.get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in meta_function_device_skips[self.device_type].get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in meta_function_expected_failures.get(func, set()): + test_expect = TestExpect.XFAILURE + elif self.dtype in meta_function_device_expected_failures[self.device_type].get(func, set()): + test_expect = TestExpect.XFAILURE + else: + test_expect = TestExpect.SUCCESS + + return run_meta_crossref( + self.test_case, test_expect, func, args, + kwargs, dtype=self.dtype, device_type=self.device_type + ) + +aten = torch.ops.aten + +# these always fail +meta_dispatch_expected_failures = { + aten._adaptive_avg_pool2d.default: {bf16, f64, f32}, + aten._adaptive_avg_pool3d.default: {f16, f64, f32}, + aten._cdist_forward.default: {f64, f32}, + aten._conj_physical.default: {c32}, + aten._convolution.default: {c64, i64, f64, c128, bf16, f32}, + aten._ctc_loss.default: {f64, f32}, + aten._embedding_bag_forward_only.default: {f16, f64, f32}, + aten._fft_r2c.default: {i64, u8, b8, f32, i8, f64, i16, i32}, + aten._histogramdd_bin_edges.default: {f64, f32}, + aten._histogramdd_from_bin_cts.default: {f64, f32}, + aten._histogramdd_from_bin_tensors.default: {f64, f32}, + aten._local_scalar_dense.default: {c64, i64, c128, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten._pdist_forward.default: {f64, f32}, + aten._unique2.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.addbmm.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.addbmm.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.angle.default: {c32, i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.angle.out: {c32, i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.bernoulli.out: {bf16, f64, f32}, + aten.bincount.default: {i8, i64, i16, u8, i32}, + aten.bucketize.Tensor: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + 
aten.bucketize.Tensor_out: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.col2im.default: {c64, f32, f64, c128}, + aten.complex.default: {c64, f64, c128, f16, f32}, + aten.complex.out: {f16}, + aten.conj_physical.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, c32, i32}, + aten.convolution.default: {c64, i64, f64, c128, bf16, f32}, + aten.count_nonzero.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.count_nonzero.dim_IntList: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.diag.default: {i64, u8, b8, f32, i8, f64, i16, i32, bf16}, + aten.diag.out: {bf16, i64, u8, b8, f32, i8, f64, i16, i32}, + aten.dot.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.dot.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.floor_divide.default: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.floor_divide.out: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.frexp.Tensor: {bf16, f16, f64, f32}, + aten.grid_sampler_2d.default: {f64, f32}, + aten.grid_sampler_3d.default: {f64, f32}, + aten.histc.default: {bf16, f64, f32}, + aten.histc.out: {bf16, f64, f32}, + aten.histogram.bin_ct: {f64, f32}, + aten.histogram.bins_tensor: {f64, f32}, + aten.im2col.default: {bf16, f16, f64, f32}, + aten.index.Tensor: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32, c32}, + aten.kthvalue.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.linalg_matrix_exp.default: {bf16, f64, f32}, + aten.log_sigmoid_forward.output: {bf16, f64, f32}, + aten.logcumsumexp.default: {bf16, f64, f32}, + aten.logcumsumexp.out: {bf16, f64, f32}, + aten.logical_not.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.logical_not_.default: {bf16, f16, f64, f32}, + aten.logical_xor.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.logit.out: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.masked_select.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.masked_select.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.max_pool3d_with_indices.default: {f64, f32}, + aten.max_unpool2d.default: {f64, f32}, + aten.max_unpool3d.default: {f64, f32}, + aten.median.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.median.dim: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.mode.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.multi_margin_loss.default: {f64, f32}, + aten.multilabel_margin_loss_forward.default: {f64, f32}, + aten.multinomial.default: {bf16, f64, f32}, + aten.multinomial.out: {bf16, f64, f32}, + aten.mvlgamma.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.mvlgamma.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.nan_to_num.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nan_to_num.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nanmedian.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.nanmedian.dim: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.nansum.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nansum.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.native_group_norm.default: {bf16, f64, f32}, + aten.nll_loss2d_forward.default: {bf16, f64, f32}, + aten.nonzero.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.nonzero.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.normal.Tensor_Tensor: {bf16, f16, f64, f32}, + aten.normal.Tensor_Tensor_out: {bf16, f16, f64, f32}, + aten.normal.float_Tensor: {bf16, f16, f64, f32}, + aten.normal.float_Tensor_out: {bf16, f16, f64, f32}, + aten.polar.default: {f64, f32}, 
+ aten.prelu.default: {bf16, f64, f32}, + aten.prod.default: {i64, u8, b8, f32, i8, f64, i16, i32}, + aten.reflection_pad2d.default: {f64, f32}, + aten.relu.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.repeat_interleave.Tensor: {c64, i64, c128, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.roll.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.rrelu_with_noise.default: {bf16, f64, f32}, + aten.searchsorted.Tensor: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.searchsorted.Tensor_out: {i64, bf16, f16, u8, f32, i8, f64, i16, i32}, + aten.std_mean.correction: {bf16, f16, f64, f32}, + aten.take.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.take.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.tensordot.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.to_sparse.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.to_sparse.sparse_dim: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.trace.default: {i8, i64, f64, i16, u8, i32, f32}, + aten.unique_consecutive.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.unique_dim.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.upsample_nearest3d.vec: {bf16, u8, f64, f32}, + aten.var_mean.correction: {bf16, f16, f64, f32}, + aten.vdot.default: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten.vdot.out: {i64, bf16, u8, f32, i8, f64, i16, i32}, + aten._det_lu_based_helper.default: {f32, f64}, # aten::_det_lu_based_helper + aten._linalg_check_errors.default: {c128, c64, f32, f64}, # aten::_local_scalar_dense + aten.cholesky.default: {f32, f64}, # aten::cholesky + aten.cholesky.out: {f32, f64}, # aten::cholesky.out + aten.cholesky_inverse.default: {f32, f64}, # aten::cholesky_inverse + aten.cholesky_inverse.out: {f32, f64}, # aten::cholesky_inverse.out + aten.cholesky_solve.default: {f32, f64}, # aten::_cholesky_solve_helper + aten.cholesky_solve.out: {f32, f64}, # aten::_cholesky_solve_helper + aten.eig.default: {f32, f64}, # aten::_local_scalar_dense + aten.geqrf.default: {f32, f64}, # aten::geqrf + aten.inverse.out: {f32, f64}, # aten::_local_scalar_dense + aten.linalg_cholesky_ex.L: {f32, f64}, # aten::linalg_cholesky_ex.L + aten.linalg_cholesky_ex.default: {f32, f64}, # aten::linalg_cholesky_ex + aten.linalg_eig.default: {f32, f64}, # aten::linalg_eig + aten.linalg_eigh.default: {f32, f64}, + aten.linalg_eigvalsh.out: {f32, f64}, # aten::linalg_eigvalsh.out + aten.linalg_householder_product.default: {f32, f64}, # aten::linalg_householder_product + aten.linalg_householder_product.out: {f32, f64}, # aten::linalg_householder_product.out + aten.linalg_lstsq.default: {f32, f64}, # aten::linalg_lstsq.out + aten.linalg_qr.default: {f32, f64}, # aten::_linalg_qr_helper + aten.linalg_slogdet.default: {f32, f64}, # aten::linalg_slogdet + aten.linalg_solve.default: {f32, f64}, # aten::linalg_solve + aten.linalg_solve.out: {f32, f64}, # aten::linalg_solve.out + aten.linalg_solve_triangular.default: {f32, f64}, # aten::linalg_solve_triangular + aten.linalg_solve_triangular.out: {f32, f64}, # aten::linalg_solve_triangular.out + aten.logdet.default: {f32, f64}, # aten::_local_scalar_dense, aten::nonzero + aten.lu_solve.default: {f32, f64}, # aten::lu_solve + aten.lu_solve.out: {f32, f64}, # aten::lu_solve.out + aten.ormqr.default: {f32, f64}, # aten::ormqr + aten.ormqr.out: {f32, f64}, # aten::ormqr.out + aten.symeig.default: {f32, f64}, # aten::_symeig_helper +} + +# these sometimes pass and sometimes fail +meta_dispatch_skips = { + 
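# Ops listed here are skipped outright because the meta cross-ref result is flaky (it sometimes matches and sometimes does not); ops that mismatch consistently are tracked in meta_dispatch_expected_failures instead. +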
aten.index_reduce.default: {bf16, f16, f64, f32}, + aten.index_reduce.out: {bf16, f16, f64, f32}, + aten._to_copy.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.addr.default: {b8}, + aten.addr.out: {b8}, + aten.aminmax.default: {i64, u8, b8, f32, i8, f64, i16, i32}, + aten.copy_.default: {c32}, + aten.cummax.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.cummin.default: {i64, bf16, u8, b8, f32, i8, f64, i16, i32}, + aten.index_add.default: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.index_add.out: {i64, bf16, f16, u8, b8, f32, i8, f64, i16, i32}, + aten.isnan.default: {f64, f32}, + aten.mul.Scalar: {i64, bf16, f16, f32, i8, f64, i16, i32}, + aten.native_batch_norm.default: {f64, f32}, + aten.native_layer_norm.default: {bf16, f64, f32}, + aten.slice.Tensor: {c32}, + aten.inverse.default: {f32, f64}, + aten.linalg_pinv.atol_rtol_tensor: {f32, f64}, + aten.linalg_pinv.atol_rtol_tensor_out: {f32, f64}, + aten.empty.memory_format: {b8, bf16, c128, c64, c32, f16, f32, f64, i16, i32, i64, i8, u8}, +} + +meta_dispatch_device_expected_failures = defaultdict(dict) +meta_dispatch_device_skips = defaultdict(dict) + +meta_dispatch_device_expected_failures['cuda'] = { + aten._adaptive_avg_pool2d.default: {f16}, # aten::_adaptive_avg_pool2d + aten._adaptive_avg_pool3d.default: {bf16}, # aten::_adaptive_avg_pool3d + aten._conj_physical.default: {f16}, # aten::conj_physical.out + aten._convolution.default: {f16}, + aten._embedding_bag_forward_only.default: {bf16}, # aten::_embedding_bag_forward_only + aten._fft_c2c.default: {c32, f16}, # aten::_fft_c2c + aten._fft_c2c.out: {c32, f16}, # aten::_fft_c2c.out + aten._fft_c2r.default: {c32, f16}, # aten::_fft_c2r + aten._fft_c2r.out: {c32, f16}, # aten::_fft_c2r.out + aten._fft_r2c.default: {f16}, # aten::_fft_r2c + aten._fft_r2c.out: {f16}, # aten::_fft_r2c.out + aten._linalg_check_errors.default: {c128, c64, f32, f64}, # aten::_local_scalar_dense + aten._unique2.default: {f16}, # aten::_unique2 + aten._use_cudnn_ctc_loss.default: {f32, f64}, # aten::_use_cudnn_ctc_loss + aten.addbmm.default: {f16}, # aten::addbmm + aten.addbmm.out: {f16}, # aten::addbmm.out + aten.bernoulli.out: {f16}, # aten::bernoulli.out + aten.convolution.default: {f16}, + aten.cudnn_grid_sampler.default: {f16, f32, f64}, # aten::cudnn_grid_sampler + aten.diag.default: {f16}, # aten::diag.out + aten.diag.out: {bf16, f16}, # aten::diag.out + aten.dot.default: {f16}, # aten::dot + aten.dot.out: {f16}, # aten::dot + aten.geqrf.default: {f32, f64}, # aten::geqrf + aten.grid_sampler_2d.default: {f16}, # aten::grid_sampler_2d + aten.grid_sampler_3d.default: {f16}, # aten::grid_sampler_3d + aten.histc.default: {i16, i32, i64, i8}, # aten::histc + aten.histc.out: {i16, i32, i64, i8}, # aten::histc.out + aten.index.Tensor: {c32}, # aten::index.Tensor + aten.inverse.out: {f32, f64}, # aten::_local_scalar_dense + aten.kthvalue.default: {f16}, # aten::kthvalue.values + aten.linalg_cholesky_ex.L: {f32, f64}, # aten::linalg_cholesky_ex.L + aten.linalg_cholesky_ex.default: {f32, f64}, # aten::linalg_cholesky_ex + aten.linalg_eigvalsh.out: {f32, f64}, # aten::linalg_eigvalsh.out + aten.linalg_householder_product.default: {f32, f64}, # aten::linalg_householder_product + aten.linalg_householder_product.out: {f32, f64}, # aten::linalg_householder_product.out + aten.linalg_matrix_exp.default: {f16}, # aten::linalg_matrix_exp + aten.linalg_qr.default: {f32, f64}, # aten::_linalg_qr_helper + aten.linalg_solve_triangular.default: {f32, f64}, # 
aten::linalg_solve_triangular + aten.linalg_solve_triangular.out: {f32, f64}, # aten::linalg_solve_triangular.out + aten.log_sigmoid_forward.default: {bf16, f16, f64, f32}, + aten.log_sigmoid_forward.output: {f16}, # aten::log_sigmoid_forward.output + aten.logcumsumexp.default: {bf16, f16}, # aten::_logcumsumexp + aten.logcumsumexp.out: {bf16, f16}, # aten::_logcumsumexp.out + aten.logit.out: {f16}, + aten.max_pool3d_with_indices.default: {bf16, f16}, # aten::max_pool3d_with_indices + aten.max_unpool2d.default: {f16}, # aten::max_unpool2d + aten.max_unpool3d.default: {f16}, # aten::max_unpool3d + aten.median.default: {f16}, # aten::median + aten.median.dim: {f16}, # aten::median.dim_values + aten.multi_margin_loss.default: {bf16, f16}, # aten::multi_margin_loss + aten.multilabel_margin_loss_forward.default: {bf16, f16}, # aten::multilabel_margin_loss_forward + aten.multinomial.default: {f16}, # aten::multinomial + aten.multinomial.out: {f16}, # aten::multinomial.out + aten.mvlgamma.default: {f16}, # aten::_local_scalar_dense + aten.mvlgamma.out: {f16}, # aten::mvlgamma.out + aten.nanmedian.default: {f16}, # aten::nanmedian + aten.nanmedian.dim: {f16}, # aten::nanmedian.dim_values + aten.native_batch_norm.default: {bf16, f16}, # aten::var_mean.correction + aten.native_dropout.default: {bf16, f16, f32, f64}, + aten.native_group_norm.default: {bf16, f16}, # aten::var_mean.correction + aten.native_layer_norm.default: {f16}, # aten::var_mean.correction + aten.nll_loss2d_forward.default: {f16}, # aten::nll_loss2d_forward + aten.ormqr.default: {f32, f64}, # aten::ormqr + aten.ormqr.out: {f32, f64}, # aten::ormqr.out + aten.prelu.default: {f16}, # aten::prelu + aten.prod.default: {bf16, c32, f16}, # aten::prod + aten.reflection_pad2d.default: {f16}, # aten::reflection_pad2d + aten.relu.default: {f16}, # aten::relu + aten.rrelu_with_noise.default: {f16}, # aten::rrelu_with_noise + aten.tensordot.out: {f16}, # aten::tensordot.out + aten.trace.default: {b8, bf16, f16}, # aten::diag.out + aten.unique_consecutive.default: {f16}, # aten::unique_consecutive + aten.unique_dim.default: {f16}, # aten::unique_dim + aten.upsample_nearest3d.vec: {f16}, # aten::upsample_nearest3d.vec + aten.vdot.default: {f16}, # aten::vdot + aten.vdot.out: {f16}, # aten::vdot +} + +meta_dispatch_device_skips['cuda'] = { + aten._conj.default: {c32, f16}, + aten._linalg_svd.default: {f32, f64}, + aten.cudnn_batch_norm.default: {f32, f64}, + aten.cummax.default: {f16}, + aten.cummin.default: {f16}, + aten.inverse.default: {f32, f64}, + aten.slice.Tensor: {f16}, + # ROCm stuff; technically this should be expected failure but it's + # not worth it; these should get unified anyway + aten.miopen_batch_norm.default: {f32}, +} + +class MetaCrossRefDispatchMode(torch.utils._python_dispatch.TorchDispatchMode): + test_case: TestCase + device: torch.device + dtype: torch.dtype + + def __init__(self, test_case, *, device, dtype): + self.test_case = test_case + # save TLS + self.precision = test_case.precision + self.rel_tol = test_case.rel_tol + self.device_type = torch.device(device).type + self.dtype = dtype + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + self.test_case.precision = self.precision + self.test_case.rel_tol = self.rel_tol + + if self.dtype in meta_dispatch_skips.get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in meta_dispatch_device_skips[self.device_type].get(func, set()): + test_expect = TestExpect.SKIP + elif self.dtype in 
meta_dispatch_expected_failures.get(func, set()): + test_expect = TestExpect.XFAILURE + elif self.dtype in meta_dispatch_device_expected_failures[self.device_type].get(func, set()): + test_expect = TestExpect.XFAILURE + else: + test_expect = TestExpect.SUCCESS + + return run_meta_crossref( + self.test_case, + test_expect, + func, + args, + kwargs, + dtype=self.dtype, + device_type=self.device_type, + ) + + +# NB: we're running these tests only on CUDA because there are some +# inconsistencies between CUDA and CPU, and running on CUDA makes it easier +# to ignore the CPU case when inconsistencies arise. Ideally we deal +# with the inconsistencies but this takes time. +class TestMeta(TestCase): + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyCUDA + @skipIfCrossRef + @suppress_warnings + @ops(op_db) + def test_meta(self, device, dtype, op): + # run the OpInfo sample inputs, cross-referencing them with the + # meta implementation and check the results are the same. All + # the heavy lifting happens in MetaCrossRefFunctionMode + func = op.get_op() + samples = op.sample_inputs(device, dtype, requires_grad=False) + for sample_input in samples: + args = [sample_input.input] + list(sample_input.args) + kwargs = sample_input.kwargs + with MetaCrossRefFunctionMode.push(self, dtype=dtype, device=device): + expected = func(*args, **kwargs) + if isinstance(expected, torch.Tensor) and op.supports_out: + func(*args, **kwargs, out=expected) + + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyCUDA + @skipIfCrossRef + @suppress_warnings + @ops(op_db) + def test_dispatch_meta(self, device, dtype, op): + func = op.get_op() + samples = op.sample_inputs(device, dtype, requires_grad=False) + for sample_input in samples: + args = [sample_input.input] + list(sample_input.args) + kwargs = sample_input.kwargs + with MetaCrossRefDispatchMode.push(self, dtype=dtype, device=device): + expected = func(*args, **kwargs) + if isinstance(expected, torch.Tensor) and op.supports_out: + func(*args, **kwargs, out=expected) + +instantiate_device_type_tests(TestMeta, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py index bfaca50e2090..cb9eb4828cac 100644 --- a/test/test_mkldnn.py +++ b/test/test_mkldnn.py @@ -283,6 +283,56 @@ def test_conv2d_bf16(self): def test_conv3d_bf16(self): self._test_conv_bf16_base(dim=3) + def _test_conv2d_nhwc_base(self, dtype): + conv_module = torch.nn.Conv2d + input_shapes = (224, 224) + options = itertools.product([True, False], [True, False], [1, 2], [1, 4]) + for train, bias, dilation, groups in options: + N = torch.randint(3, 10, (1,)).item() + M = torch.randint(1, 3, (1,)).item() * groups + C = torch.randint(1, 3, (1,)).item() * groups + x_shape = (N, C) + input_shapes + x = torch.randn(x_shape, dtype=dtype) + # conv1: mkldnn conv2d in contiguous memory format (nchw) + # conv2: mkldnn conv2d in channels last memory format (nhwc) + conv1 = conv_module(in_channels=C, + out_channels=M, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + groups=groups).to(dtype=dtype) + conv2 = copy.deepcopy(conv1).to(memory_format=torch.channels_last) + x1 = x.clone() + x2 = x.clone().to(memory_format=torch.channels_last) + if train: + x1.requires_grad_() + x2.requires_grad_() + y1 = conv1(x1) + y2 = conv2(x2) + self.assertEqual(y1, y2) + if train: + y1.sum().backward() + y2.sum().backward() + self.assertTrue(x2.grad.is_contiguous(memory_format=torch.channels_last)) + 
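# The contiguous (nchw) and channels-last (nhwc) backward paths are not necessarily bitwise identical, so the weight-gradient check below uses a relaxed tolerance rather than exact equality. +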
self.assertEqual(conv1.weight.grad, + conv2.weight.grad, + atol=1e-3, + rtol=1e-3) + if bias: + self.assertEqual(conv1.bias.grad, conv2.bias.grad) + self.assertEqual(x1.grad, x2.grad) + + def test_conv2d_nhwc(self): + self._test_conv2d_nhwc_base(dtype=torch.float32) + + @unittest.skipIf(IS_WINDOWS, "Limit support for bf16 path") + def test_conv2d_nhwc_bf16(self): + # when has_bf16_support() returns false, bf16 CPU conv will fall back to thnn impl + if has_bf16_support(): + self._test_conv2d_nhwc_base(dtype=torch.bfloat16) + def test_conv2d_legacy_jit_model(self): """ MKLDNN integration used to serialize models with 5d weight for grouped @@ -400,6 +450,74 @@ def test_gelu_bf16(self): msg, lambda: m(x2)) + def _test_prelu_base(self, size, num_channels): + x = torch.randn(size, dtype=torch.float32) + x1 = x.clone().requires_grad_() + x2 = x.clone().to_mkldnn().requires_grad_() + x3 = x.clone().to_mkldnn().requires_grad_() + m1 = torch.nn.PReLU(num_channels) + m2 = mkldnn_utils.to_mkldnn(copy.deepcopy(m1)) + m3 = copy.deepcopy(m1) + y1 = m1(x1) + y2 = m2(x2).to_dense() + y3 = m3(x3).to_dense() # Only convert data to mkldnn, weight is Aten tensor + loss1 = y1.sum() + loss1.backward() + loss2 = y2.sum() + loss2.backward() + loss3 = y3.sum() + loss3.backward() + self.assertEqual(y1, y2) + self.assertEqual(y1, y3) + self.assertEqual(x1.grad, x2.grad.to_dense()) + self.assertEqual(x1.grad, x3.grad.to_dense()) + + def test_prelu(self): + self._test_prelu_base(torch.Size([16]), 1) + self._test_prelu_base(torch.Size([16, 64]), 1) + self._test_prelu_base(torch.Size([16, 64]), 64) + self._test_prelu_base(torch.Size([16, 64, 112]), 1) + self._test_prelu_base(torch.Size([16, 64, 112]), 64) + self._test_prelu_base(torch.Size([16, 64, 112, 112]), 1) + self._test_prelu_base(torch.Size([16, 64, 112, 112]), 64) + self._test_prelu_base(torch.Size([16, 64, 112, 112, 1]), 1) + self._test_prelu_base(torch.Size([16, 64, 112, 112, 1]), 64) + + @unittest.skipIf(IS_WINDOWS, "Limit support for bf16 path") + def _test_prelu_bf16_base(self, size, num_channels): + if has_bf16_support(): + x = torch.randn(size, dtype=torch.float32) + x_fp32 = x.clone().to_mkldnn().requires_grad_() + x_bf16 = x.clone().to_mkldnn(torch.bfloat16).requires_grad_() + m = mkldnn_utils.to_mkldnn(torch.nn.PReLU()) + m_bf16 = mkldnn_utils.to_mkldnn(torch.nn.PReLU(), torch.bfloat16) + + y = m(x_fp32).to_dense() + y_bf16 = m_bf16(x_bf16).to_dense() + self.assertEqual(y, y_bf16.to(torch.float32), atol=1e-1, rtol=1e-3) + + loss = y.sum() + loss.backward() + loss_bf16 = y_bf16.sum() + loss_bf16.backward() + self.assertEqual(x_fp32.grad.to_dense(), x_bf16.grad.to_dense(torch.float32)) + else: + x_bf16 = torch.randn(size, dtype=torch.bfloat16).requires_grad_() + m_bf16 = mkldnn_utils.to_mkldnn(torch.nn.PReLU(), torch.bfloat16) + msg = r"bf16 path needs the cpu support avx512bw, avx512vl and avx512dq" + self.assertRaisesRegex(RuntimeError, + msg, + lambda: m_bf16(x_bf16)) + + def test_prelu_bf16(self): + self._test_prelu_bf16_base(torch.Size([16]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64]), 64) + self._test_prelu_bf16_base(torch.Size([16, 64, 112]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64, 112]), 64) + self._test_prelu_bf16_base(torch.Size([16, 64, 112, 112, 1]), 1) + self._test_prelu_bf16_base(torch.Size([16, 64, 112, 112, 1]), 64) + def _test_max_pool_base(self, dim, input): pool_module = {2: torch.nn.MaxPool2d, 3: torch.nn.MaxPool3d} for stride in [1, 2, 3]: diff --git 
a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index bb42702f536b..c7fc823a9364 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -3,9 +3,8 @@ import unittest import torch import torch.nn as nn -import torch.backends.xnnpack import torch.utils.bundled_inputs -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import TestCase, run_tests, skipIfNoXNNPACK from torch.testing._internal.jit_utils import get_forward, get_forward_graph from torch.utils.mobile_optimizer import (LintCode, generate_mobile_module_lints, @@ -24,9 +23,7 @@ class TestOptimizer(TestCase): - @unittest.skipUnless(torch.backends.xnnpack.enabled, - " XNNPACK must be enabled for these tests." - " Please build with USE_XNNPACK=1.") + @skipIfNoXNNPACK def test_optimize_for_mobile(self): batch_size = 2 input_channels_per_group = 6 @@ -151,7 +148,7 @@ def forward(self, x): bn_scripted_module = torch.jit.script(bn_test_module) bn_scripted_module.eval() - self.assertEqual(len(torch.jit.export_opnames(bn_scripted_module)), 14) + self.assertEqual(len(torch.jit.export_opnames(bn_scripted_module)), 11) FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(str(get_forward(bn_scripted_module._c).graph)) @@ -252,7 +249,7 @@ def foo(self, x): bn_no_forward_scripted_module = torch.jit.script(bn_test_no_forward_module) bn_no_forward_scripted_module.eval() - self.assertEqual(len(torch.jit.export_opnames(bn_no_forward_scripted_module)), 14) + self.assertEqual(len(torch.jit.export_opnames(bn_no_forward_scripted_module)), 11) FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(bn_no_forward_scripted_module.foo.graph) @@ -265,9 +262,7 @@ def foo(self, x): rtol=1e-2, atol=1e-3) - @unittest.skipUnless(torch.backends.xnnpack.enabled, - " XNNPACK must be enabled for these tests." - " Please build with USE_XNNPACK=1.") + @skipIfNoXNNPACK def test_quantized_conv_no_asan_failures(self): # There were ASAN failures when fold_conv_bn was run on # already quantized conv modules. Verifying that this does @@ -361,6 +356,7 @@ def get_lint_count_by_type(lint_type, module_lint_List): bi_module_lint_list = generate_mobile_module_lints(bi_module) self.assertEqual(len(bi_module_lint_list), 0) + @skipIfNoXNNPACK def test_preserve_bundled_inputs_methods(self): class MyBundledInputModule(torch.nn.Module): def __init__(self): @@ -415,9 +411,7 @@ def get_all_bundled_inputs(self): incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module, preserved_methods=['get_all_bundled_inputs']) self.assertTrue(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs')) - @unittest.skipUnless(torch.backends.xnnpack.enabled, - " XNNPACK must be enabled for these tests." 
- " Please build with USE_XNNPACK=1.") + @skipIfNoXNNPACK def test_hoist_conv_packed_params(self): if 'qnnpack' not in torch.backends.quantized.supported_engines: @@ -511,6 +505,7 @@ def _quant_script_and_optimize(model): m_optim_res = m_optim(data) torch.testing.assert_close(m_res, m_optim_res, rtol=1e-2, atol=1e-3) + @skipIfNoXNNPACK @unittest.skipUnless(HAS_TORCHVISION, "Needs torchvision") def test_mobilenet_optimize_for_mobile(self): m = torchvision.models.mobilenet_v3_small() diff --git a/test/test_model_dump.py b/test/test_model_dump.py index 10f3fe39b373..a8add0e2cd92 100644 --- a/test/test_model_dump.py +++ b/test/test_model_dump.py @@ -10,9 +10,10 @@ import unittest import torch +import torch.backends.xnnpack import torch.utils.model_dump import torch.utils.mobile_optimizer -from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS +from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, skipIfNoXNNPACK from torch.testing._internal.common_quantized import supported_qengines @@ -170,6 +171,7 @@ def test_quantized_model(self): qmodel = self.get_quant_model() self.do_dump_model(torch.jit.script(qmodel)) + @skipIfNoXNNPACK @unittest.skipUnless("qnnpack" in supported_qengines, "QNNPACK not available") def test_optimized_quantized_model(self): qmodel = self.get_quant_model() diff --git a/test/test_module_init.py b/test/test_module_init.py index 589db4b71622..b568f210e550 100644 --- a/test/test_module_init.py +++ b/test/test_module_init.py @@ -166,6 +166,9 @@ def build_constructor_arg_db(): torch.nn.UpsamplingBilinear2d: ((), {}), torch.nn.UpsamplingNearest2d: ((), {}), torch.nn.ZeroPad2d: ((0,), {}), + torch.nn.qat.Conv1d: ((3, 3, 3), { + 'qconfig': torch.ao.quantization.default_qconfig, + }), torch.nn.qat.Conv2d: ((3, 3, 3), { 'qconfig': torch.ao.quantization.default_qconfig, }), @@ -206,7 +209,7 @@ def build_constructor_arg_db(): torch.nn.quantized.EmbeddingBag: ((10, 3), { 'factory_kwargs': {}, }), - torch.nn.quantized.GroupNorm: ((2, 3, torch.nn.Parameter(torch.tensor(2.)), + torch.nn.quantized.GroupNorm: ((2, 4, torch.nn.Parameter(torch.tensor(2.)), torch.nn.Parameter(torch.tensor(2.)), 0.1, 0), {}), torch.nn.quantized.Hardswish: ((0.1, 0,), {}), torch.nn.quantized.InstanceNorm1d: ((2, torch.nn.Parameter(torch.tensor(2.)), @@ -228,6 +231,7 @@ def build_constructor_arg_db(): }), torch.nn.quantized.ReLU6: ((), {}), torch.nn.quantized.Sigmoid: ((0.1, 0), {}), + torch.nn.quantized.Softmax: ((), {}), torch.nn.quantized.FloatFunctional: ((), {}), torch.nn.quantized.FXFloatFunctional: ((), {}), torch.nn.quantized.QFunctional: ((), {}), diff --git a/test/test_modules.py b/test/test_modules.py index 448f8f5fa751..292382e83e9c 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -8,10 +8,10 @@ import torch from torch.testing._internal.common_device_type import ( - instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol) + instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta) from torch.testing._internal.common_modules import module_db, modules from torch.testing._internal.common_utils import ( - TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, gradgradcheck) + TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, gradgradcheck, skipIfMps) from unittest.mock import patch, call @@ -40,6 +40,7 @@ def _check_module(items, name, device=device, dtype=dtype): _check_module(module.named_parameters(), "Parameter") _check_module(module.named_buffers(), 
"Buffer") + @skipIfMps # the test doesn't work on MPS as double types are not supported @modules(module_db) def test_forward(self, device, dtype, module_info): module_cls = module_info.module_cls @@ -201,6 +202,7 @@ def test_repr(self, device, dtype, module_info): m.__repr__() str(m) + @skipIfMps @modules(module_db) def test_pickle(self, device, dtype, module_info): # Test that module can be pickled and unpickled. @@ -233,6 +235,7 @@ def test_pickle(self, device, dtype, module_info): @modules([module_info for module_info in module_db if 'inplace' in signature(module_info.module_cls).parameters]) + @skipMeta def test_check_inplace(self, device, dtype, module_info): # Check if the inplace variant of the module gives the same result as the out of place # variant. @@ -310,6 +313,7 @@ def inner_zero_grad(obj): obj.grad = None self._traverse_obj(obj, inner_zero_grad) + @skipIfMps @modules(module_db) def test_non_contiguous_tensors(self, device, dtype, module_info): # Check modules work with non-contiguous tensors @@ -543,7 +547,7 @@ def check_backward(cpu_output, gpu_output): for cpu_output, gpu_output in zip(flatten_cpu_outputs, flatten_gpu_outputs): check_backward(cpu_output, gpu_output) - + @skipIfMps @modules(module_db) def test_memory_format(self, device, dtype, module_info): module_cls = module_info.module_cls diff --git a/test/test_mps.py b/test/test_mps.py new file mode 100644 index 000000000000..04804261505f --- /dev/null +++ b/test/test_mps.py @@ -0,0 +1,4006 @@ +# -*- coding: utf-8 -*- +# Owner(s): ["module: mps"] + +import sys +import math +import random +import unittest +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +import itertools +from torch.nn import Parameter +from torch.testing._internal.common_utils import run_tests, TestCase, download_file, TEST_WITH_UBSAN +import torch.backends.mps +from torch.distributions import (Uniform) + +from torch.testing._internal.common_nn import NNTestCase +import numpy as np +import torch + +# Same logic as test_cuda.py +if not torch.backends.mps.is_available(): + print('MPS not available, skipping tests', file=sys.stderr) + TestCase = object # noqa: F811 + NNTestCase = object # noqa: F811 + + +class MPSReluTest(TestCase): + def _npRelu(self, np_features): + return np.maximum(np_features, np.zeros(np_features.shape)).astype(np_features.dtype) + + def testNpRelu(self): + torch.testing.assert_allclose( + np.array([[0., 0.7, 0.0, 0.3, 0.0], [0.1, 0.0, 0.5, 0.0, 0.9]]), + self._npRelu( + np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, + 0.9]]))) + + def _testRelu(self, np_features, device): + np_relu = self._npRelu(np_features) + # Convert the numpy array to a PyTorch Tensor, + # and move the Tensor to the CPU/GPU based on the "device" parameter + py_tensor = torch.from_numpy(np_features).to(device) + py_relu = torch.nn.ReLU(inplace=False)(py_tensor) + py_relu_cpu = py_relu.to("cpu") + + torch.testing.assert_allclose(np_relu, py_relu_cpu) + + def _testReluInPlace(self, np_features, device): + np_relu = self._npRelu(np_features) + # Convert the numpy array to a PyTorch Tensor, + # and move the Tensor to the CPU/GPU based on the "device" parameter + py_tensor = torch.from_numpy(np_features).to(device) + py_relu = torch.nn.ReLU(inplace=True)(py_tensor) + py_relu_cpu = py_relu.to("cpu") + + torch.testing.assert_allclose(np_relu, py_relu_cpu) + # Inplace Relu modifies the initial input and it should match the output of Relu + torch.testing.assert_allclose(np_relu, py_tensor.to("cpu")) + + def 
testNumbersCPU(self): + for t in [np.int32]: + # Force execution on CPU even if a GPU kernel is available for the type. + self._testRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="cpu") + self._testReluInPlace( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="cpu") + + def testNumbersGPU(self): + for t in [np.float16, np.float32]: + self._testRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="mps") + self._testReluInPlace( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + device="mps") + +class MatmulTest(TestCase): + def _helper(self, shape_tensor_1, shape_tensor_2, expand_tensor_1_shape=None, expand_tensor_2_shape=None): + if expand_tensor_1_shape: + tensor1_mps = torch.randn(shape_tensor_1, device="mps").expand(expand_tensor_1_shape) + else: + tensor1_mps = torch.randn(shape_tensor_1, device="mps") + + if expand_tensor_2_shape: + tensor2_mps = torch.randn(shape_tensor_2, device="mps").expand(expand_tensor_2_shape) + else: + tensor2_mps = torch.randn(shape_tensor_2, device="mps") + + tensor1_cpu = tensor1_mps.to("cpu") + tensor2_cpu = tensor2_mps.to("cpu") + + matmul_cpu = torch.matmul(tensor1_cpu, tensor2_cpu) + matmul_mps = torch.matmul(tensor1_mps, tensor2_mps) + + self.assertEqual(matmul_cpu, matmul_mps.to("cpu")) + + def test_vector_x_vector(self): + # uses `dot` + self._helper(3, 3) + + def test_matrix_x_vector(self): + # uses `addmv` + self._helper((3, 4), 4) + + def test_batched_matrix_x_broadcasted_vector(self): + self._helper((10, 3, 4), 4) + + def test_batched_matrix_x_batched_matrix(self): + # uses `bmm.out` + self._helper((10, 3, 4), (10, 4, 5)) + + def test_batched_matrix_x_broadcasted_matrix(self): + self._helper((10, 3, 4), (4, 5)) + + +class MPSLeakyReluTest(TestCase): + def _npLeakyRelu(self, np_features, negative_slope=0.1): + return np.maximum(np_features, negative_slope * np_features).astype(np_features.dtype) + + def testNpLeakyRelu(self): + torch.testing.assert_allclose( + np.array([[-0.09, 0.7, -0.05, 0.3, -0.01], + [0.1, -0.03, 0.5, -0.07, 0.9]]), + self._npLeakyRelu( + np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, + 0.9]]), + negative_slope=0.1)) + + def _testLeakyRelu(self, np_features, negative_slope, device): + cpu_x = torch.from_numpy(np_features).requires_grad_() + mps_x = torch.from_numpy(np_features).to('mps').requires_grad_() + relu_op = torch.nn.LeakyReLU(negative_slope) + + cpu_leaky_relu = relu_op(cpu_x) + mps_leaky_relu = relu_op(mps_x) + torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu')) + + # test backward pass + cpu_grad = torch.ones_like(cpu_leaky_relu) + mps_grad = cpu_grad.to('mps') + cpu_leaky_relu.backward(gradient=cpu_grad) + mps_leaky_relu.backward(gradient=mps_grad) + torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu')) + + def testNumbersCPU(self): + for t in [np.float32]: + self._testLeakyRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + negative_slope=0.2, + device="cpu") + + +class TestAvgPool(TestCase): + def _sum_pool2d(self, x, kernel_size): + windows = torch.nn.functional.unfold(x, kernel_size=kernel_size, stride=kernel_size) + return torch.sum(windows, dim=1) + + def _sum_pool3d(self, x, kernel_size): + # Because unfold does not support 3D sliding window we will split tensor to multiple tensors and calculate sum + h = kernel_size[0] + splited_x = [t.sum(0) for t in x.split(h) if t.size(0) == h] + # sum_pool2d assumes tensor in (1, 1, n, m) view, so unsqueeze two 
times + splited_x = [self._sum_pool2d(t.unsqueeze(0).unsqueeze(0), kernel_size[1:]) for t in splited_x] + joined_x = torch.cat(splited_x) + return joined_x.view(1, joined_x.numel()) + + def _avg_pool2d(self, x, kernel_size): + size = reduce((lambda x, y: x * y), kernel_size) + return self._sum_pool2d(x, kernel_size) / size + + def _avg_pool3d(self, x, kernel_size): + size = reduce((lambda x, y: x * y), kernel_size) + return self._sum_pool3d(x, kernel_size) / size + + def test_avg_pool2d_with_zero_divisor(self): + self.assertRaisesRegex(RuntimeError, "divisor must be not zero", + lambda: F.avg_pool2d(torch.zeros(3, 3, 3), (2, 2), divisor_override=0)) + + def test_doubletensor_avg_pool2d_with_divisor(self): + n, m = 3, 3 + input = torch.rand(1, 1, n, m) + for i in range(1, n + 1): + for j in range(1, m + 1): + for divisor in [1, 7, i * j]: + actual = F.avg_pool2d(input[0], (i, j), divisor_override=divisor) + actual = actual.view(1, actual.numel()) + expected = self._sum_pool2d(input, (i, j)) / divisor + self.assertEqual(actual, expected, rtol=0, atol=1e-5) + + def test_avg_pool2d_ceil_mode(self): + # Regression test for gh-36977 + x = 10 * torch.randn((1, 16, 4, 4)) + y = torch.nn.functional.avg_pool2d( + x, ceil_mode=True, count_include_pad=True, kernel_size=(1, 2), + padding=(0, 1), stride=2) + self.assertTrue(not torch.isnan(y).any()) + y = torch.nn.functional.avg_pool2d( + x.to('mps'), ceil_mode=True, count_include_pad=True, kernel_size=(1, 2), + padding=(0, 1), stride=2) + self.assertTrue(not torch.isnan(y).any()) + + +class TestMPS(TestCase): + # @dtypes(*product([torch.float32, torch.int32], (torch.uint8, torch.bool))) + def test_masked_fill(self): + device = "mps" + dtype = torch.float32 + mask_dtype = torch.bool + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + num_dest = 10 + dst = torch.zeros(num_dest, dtype=dtype, device=device) + mask = torch.randint(2, (num_dest,), dtype=mask_dtype, device=device) + val = random.random() + dst2 = torch.zeros(num_dest, dtype=dtype) + mask_cpu = mask.to("cpu") + + dst.masked_fill_(mask, val) + for i in range(num_dest): + if mask_cpu[i]: + dst2[i] = val + self.assertEqual(dst.to("cpu"), dst2, atol=0, rtol=0) + + # test non-contiguous case + dst = ((torch.randn(num_dest, num_dest, num_dest) * 10).to(dtype)).permute((2, 0, 1)) + dst2 = dst.contiguous() + if dtype.is_complex: + mask = dst.abs() > 0 + else: + mask = dst > 0 + self.assertTrue(not dst.is_contiguous()) + self.assertTrue(dst2.is_contiguous()) + dst.masked_fill_(mask.to(mask_dtype), val) + dst2.masked_fill_(mask.to(mask_dtype), val) + self.assertEqual(dst, dst2, atol=0, rtol=0) + + if mask_dtype == torch.uint8: + self.assertEqual(len(w), 3) + + warn = 'masked_fill_ received a mask with dtype torch.uint8,' + for wi in w: + self.assertEqual(str(wi.message)[0:52], str(warn)) + else: + self.assertEqual(len(w), 0) + + def test_exp(self, device="mps", dtype=torch.float): + for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()): + b = torch.arange(18, device="cpu") / 3 * math.pi + a = torch.tensor(v, dtype=dtype, device="cpu") * b + a = a.to(dtype).to("mps") + self.compare_with_numpy(torch.exp, np.exp, a) + + def test_exp1(self, device="mps", dtype=torch.float): + input = torch.tensor([-0.1, 3.0, -0.9]).to('mps') + output = torch.exp(input).to('cpu') + print(output) + + def _testLeakyRelu(self, np_features, negative_slope, device): + cpu_x = torch.from_numpy(np_features).requires_grad_() + mps_x = 
torch.from_numpy(np_features).to('mps').requires_grad_() + relu_op = torch.nn.LeakyReLU(negative_slope) + + cpu_leaky_relu = relu_op(cpu_x) + mps_leaky_relu = relu_op(mps_x) + torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu')) + + # test backward pass + cpu_grad = torch.ones_like(cpu_leaky_relu) + mps_grad = cpu_grad.to('mps') + cpu_leaky_relu.backward(gradient=cpu_grad) + mps_leaky_relu.backward(gradient=mps_grad) + torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu')) + + def testNumbersGPU(self): + for t in [np.float32]: + self._testLeakyRelu( + np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t), + negative_slope=0.1, + device="mps") + + def test_fill(self): + + def helper(val, shape): + tensor = torch.zeros(shape, device='mps') + tensor_mps = tensor.fill_(val) + tensor_mps = torch.tanh(tensor_mps) + + tensor_0 = torch.zeros(shape, device='cpu') + tensor_cpu = tensor_0.fill_(val) + tensor_cpu = torch.tanh(tensor_cpu) + + self.assertEqual(tensor_mps, tensor_cpu) + + helper(0, [1024]) + helper(0.2, [2, 3]) + + def test_mm(self): + B = torch.ones(5, 6).to("mps") + C = torch.ones(6, 5).to("mps") + D = torch.mm(B, C).cpu() + torch.testing.assert_allclose(D, torch.full((5, 5), 6.0)) + + def test_addmm(self): + A = torch.ones(5, 5).to("mps") + B = torch.ones(5, 6).to("mps") + C = torch.ones(6, 5).to("mps") + D = torch.addmm(A, B, C).to("cpu") + torch.testing.assert_allclose(D, torch.full((5, 5), 7.0)) + + def test_bmm(self): + batch1_cpu = torch.randn(10, 3, 4) + batch2_cpu = torch.randn(10, 4, 5) + + batch1_mps = batch1_cpu.detach().clone().to("mps") + batch2_mps = batch2_cpu.detach().clone().to("mps") + + output_cpu = torch.bmm(batch1_cpu, batch2_cpu) + output_mps = torch.bmm(batch1_mps, batch2_mps) + + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) + + def test_addbmm(self): + M_cpu = torch.randn(3, 5) + batch1_cpu = torch.randn(10, 3, 4) + batch2_cpu = torch.randn(10, 4, 5) + + M_mps = M_cpu.detach().clone().to("mps") + batch1_mps = batch1_cpu.detach().clone().to("mps") + batch2_mps = batch2_cpu.detach().clone().to("mps") + + output_cpu = torch.addbmm(M_cpu, batch1_cpu, batch2_cpu) + output_mps = torch.addbmm(M_mps, batch1_mps, batch2_mps) + + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) + + def test_baddbmm(self): + M_cpu = torch.randn(3, 5) + batch1_cpu = torch.randn(10, 3, 4) + batch2_cpu = torch.randn(10, 4, 5) + alpha = 1.2 + beta = 0.8 + + M_mps = M_cpu.detach().clone().to("mps") + batch1_mps = batch1_cpu.detach().clone().to("mps") + batch2_mps = batch2_cpu.detach().clone().to("mps") + + output_cpu = torch.baddbmm(M_cpu, batch1_cpu, batch2_cpu, beta=beta, alpha=alpha) + output_mps = torch.baddbmm(M_mps, batch1_mps, batch2_mps, beta=beta, alpha=alpha) + + self.assertEqual(output_cpu, output_mps) + self.assertEqual(output_cpu.size(), output_mps.size()) + + def test_local_scalar_dense_mps(self): + x_cpu = torch.randn(1) + y_mps = x_cpu.to("mps") + torch.testing.assert_allclose(x_cpu.item(), y_mps.item()) + + def _linear_helper(self, in_features, out_features, shape, bias=True, backward_pass=False): + cpu_linear = torch.nn.Linear(in_features=in_features, out_features=out_features, device="cpu", bias=bias) + mps_linear = torch.nn.Linear(in_features=in_features, out_features=out_features, device="mps", bias=bias) + + # Use the same weights and bias as the ones from the cpu + mps_linear.weight.data = cpu_linear.weight.data.detach().clone().to("mps") 
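+ # Assigning through .data swaps in the detached copy without re-registering the Parameter or recording the copy in autograd.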
+ + if bias: + mps_linear.bias.data = cpu_linear.bias.data.detach().clone().to("mps") + + linear_mps_input = torch.randn(shape).to('mps') + linear_cpu_input = linear_mps_input.detach().clone().to('cpu') + + if backward_pass: + linear_mps_input = linear_mps_input.requires_grad_() + linear_cpu_input = linear_cpu_input.requires_grad_() + + linear_cpu_output = cpu_linear(linear_cpu_input) + linear_mps_output = mps_linear(linear_mps_input) + + self.assertEqual(linear_cpu_output, linear_mps_output.to('cpu')) + self.assertEqual(linear_cpu_output.size(), linear_mps_output.size()) + + if backward_pass: + cpu_grad = torch.ones_like(linear_cpu_output) + grad = cpu_grad.to('mps') + + linear_cpu_output.backward(gradient=cpu_grad) + linear_mps_output.backward(gradient=grad) + + self.assertEqual(linear_cpu_input.grad.size(), linear_mps_input.grad.size()) + self.assertEqual(linear_cpu_input.grad, linear_mps_input.grad.to("cpu"), atol=8e-04, rtol=10.4e-05) + + self.assertEqual(cpu_linear.weight.grad.size(), mps_linear.weight.grad.size()) + self.assertEqual(cpu_linear.weight.grad, mps_linear.weight.grad.to("cpu"), atol=8e-04, rtol=10.4e-05) + if bias: + self.assertEqual(cpu_linear.bias.grad.size(), mps_linear.bias.grad.size()) + self.assertEqual(cpu_linear.bias.grad, mps_linear.bias.grad.to("cpu"), atol=8e-04, rtol=10.4e-05) + + def test_linear2D(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=True, backward_pass=False) + + def test_linear2D_backward(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=True, backward_pass=True) + + def test_linear2D_no_bias(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=False, backward_pass=False) + + def test_linear2D_no_bias_backward(self): + self._linear_helper(in_features=2, out_features=3, shape=((4, 2)), bias=False, backward_pass=True) + + def test_linear3D(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=True, backward_pass=False) + + def test_linear3D_backward(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=True, backward_pass=True) + + def test_linear3D_no_bias(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=False, backward_pass=False) + + def test_linear3D_no_bias_backward(self): + self._linear_helper(in_features=200, out_features=33278, shape=((35, 20, 200)), bias=False, backward_pass=True) + + def test_uniform(self): + low = torch.zeros(5, 5, requires_grad=True) + high = (torch.ones(5, 5) * 3).requires_grad_() + low_1d = torch.zeros(1, requires_grad=True) + high_1d = (torch.ones(1) * 3).requires_grad_() + self.assertEqual(Uniform(low, high).sample().size(), (5, 5)) + self.assertEqual(Uniform(low, high).sample((7,)).size(), (7, 5, 5)) + # self.assertEqual(Uniform(low_1d, high_1d).sample().size(), (1,)) + # self.assertEqual(Uniform(low_1d, high_1d).sample((1,)).size(), (1, 1)) + # self.assertEqual(Uniform(0.0, 1.0).sample((1,)).size(), (1,)) + + # # Check log_prob computation when value outside range + # uniform = Uniform(low_1d, high_1d, validate_args=False) + # above_high = torch.tensor([4.0]) + # below_low = torch.tensor([-1.0]) + # self.assertEqual(uniform.log_prob(above_high).item(), -inf) + # self.assertEqual(uniform.log_prob(below_low).item(), -inf) + + # # check cdf computation when value outside range + # self.assertEqual(uniform.cdf(below_low).item(), 0) + # self.assertEqual(uniform.cdf(above_high).item(), 1) + + #
set_rng_seed(1) + # self._gradcheck_log_prob(Uniform, (low, high)) + # self._gradcheck_log_prob(Uniform, (low, 1.0)) + # self._gradcheck_log_prob(Uniform, (0.0, high)) + + # state = torch.get_rng_state() + # rand = low.new(low.size()).uniform_() + # torch.set_rng_state(state) + # u = Uniform(low, high).rsample() + # u.backward(torch.ones_like(u)) + # self.assertEqual(low.grad, 1 - rand) + # self.assertEqual(high.grad, rand) + # low.grad.zero_() + # high.grad.zero_() + + # Test forward maxpool2d + def test_max_pool2d(self): + def helper(shape, ks, padding=0, dilation=1, ceil_mode=False, return_indices=False, test_ties=False): + + cpu_x = None + if(test_ties): + cpu_x = torch.ones(shape, device='cpu', dtype=torch.float, requires_grad=True) + else: + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + pool = torch.nn.MaxPool2d(kernel_size=ks, padding=padding, dilation=dilation, + ceil_mode=ceil_mode, return_indices=return_indices) + + if(return_indices is False): + y = pool(x) + ref_y = pool(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + else: + y, idx = pool(x) + ref_y, ref_idx = pool(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(idx, ref_idx) + self.assertEqual(x.grad, cpu_x.grad) + + # Test with no batch dimension + helper((8, 4, 4), ks=2) + helper((2, 8, 4, 4), ks=2) + helper((1, 100000, 32, 32), ks=4) + helper((1, 100000, 1, 4), ks=(1, 4)) # test for max_pool1d + # Test padding + helper((1, 100000, 32, 32), ks=4, padding=1) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 1)) # test for max_pool1d + # Test dilation + helper((1, 100000, 32, 32), ks=4, dilation=2) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 2)) # test for max_pool1d + # Test ceil mode + helper((1, 100000, 32, 32), ks=4, ceil_mode=True) + helper((1, 100000, 1, 4), ks=(1, 4), ceil_mode=True) # test for max_pool1d + + # Test return indices + for test_ties in [False, True]: + # Test with no batch dimension + helper((8, 4, 4), ks=2, return_indices=True, test_ties=test_ties) + helper((2, 8, 4, 4), ks=2, return_indices=True, test_ties=test_ties) + helper((1, 100000, 32, 32), ks=4, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), return_indices=True, test_ties=test_ties) # test for max_pool1d + # Test padding + helper((1, 100000, 32, 32), ks=4, padding=1, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 1), + return_indices=True, test_ties=test_ties) # test for max_pool1d + # Test dilation + helper((1, 100000, 32, 32), ks=4, dilation=2, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), padding=(0, 2), + return_indices=True, test_ties=test_ties) # test for max_pool1d + # Test ceil mode + helper((1, 100000, 32, 32), ks=4, ceil_mode=True, return_indices=True, test_ties=test_ties) + helper((1, 100000, 1, 4), ks=(1, 4), ceil_mode=True, + return_indices=True, test_ties=test_ties) # test for max_pool1d + + def test_adaptive_avg_pool2d_output_size_one(self): + def helper(size, memory_format): + x = torch.randint(1, 10, size, dtype=torch.float, device='mps', requires_grad=True) + x = x.to(memory_format=memory_format) + + net = 
torch.nn.AdaptiveAvgPool2d((1, 1)) + out = net(x) + ref_out = x.contiguous().mean((-1, -2)).view((x.size(0), x.size(1), 1, 1)) + + out.sum().backward() # make sure it doesn't crash + + self.assertEqual(out, ref_out) + if memory_format == torch.channels_last: + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + c = out.size(1) + self.assertEqual(out.stride(), [c, 1, c, c]) + else: + self.assertTrue(out.is_contiguous()) + c = out.size(1) + self.assertEqual(out.stride(), [c, 1, 1, 1]) + + helper((2, 3, 6, 6), torch.contiguous_format) + + # Test forward batch norm + def test_batch_norm(self): + def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=False, + track_running_stats=True, test_module=False): + + import numpy as np + np.random.seed(332) + arr = (256 - 128) * np.random.random_sample(size=shape) + 128 + cpu_x = torch.tensor(arr, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = cpu_x.detach().clone().to('mps').requires_grad_() + + mean_shape = [shape[1]] + cpu_running_mean = None + cpu_running_var = None + running_mean = None + running_var = None + if(track_running_stats): + mean_arr = (240 - 140) * np.random.random_sample(size=mean_shape) + 140 + cpu_running_mean = torch.tensor(mean_arr, device='cpu', dtype=torch.float) + var_arr = 32 * np.random.random_sample(size=mean_shape) + cpu_running_var = torch.tensor(var_arr, device='cpu', dtype=torch.float) + running_mean = cpu_running_mean.detach().clone().to('mps') + running_var = cpu_running_var.detach().clone().to('mps') + + weight = None + cpu_weight = None + bias = None + cpu_bias = None + if(wts): + cpu_weight = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + weight = cpu_weight.detach().clone().to('mps').requires_grad_() + cpu_bias = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = None + ref_y = None + + if(not test_module): + y = torch.nn.functional.batch_norm(x, running_mean, running_var, + weight=weight, + bias=bias, + training=training, + momentum=momentum, eps=eps) + ref_y = torch.nn.functional.batch_norm(cpu_x, cpu_running_mean, cpu_running_var, + weight=cpu_weight, + bias=cpu_bias, + training=training, + momentum=momentum, eps=eps) + + else: + + batchnorm_op = None + mps_batchnorm_op = None + + if(len(shape) == 3): + batchnorm_op = torch.nn.BatchNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_batchnorm_op = torch.nn.BatchNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 4): + batchnorm_op = torch.nn.BatchNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_batchnorm_op = torch.nn.BatchNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 5): + batchnorm_op = torch.nn.BatchNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_batchnorm_op = torch.nn.BatchNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + + if(track_running_stats): + batchnorm_op.running_mean = cpu_running_mean 
+ batchnorm_op.running_var = cpu_running_var + mps_batchnorm_op.running_mean = running_mean + mps_batchnorm_op.running_var = running_var + if(wts): + batchnorm_op.weight = torch.nn.Parameter(cpu_weight) + batchnorm_op.bias = torch.nn.Parameter(cpu_bias) + mps_batchnorm_op.weight = torch.nn.Parameter(weight) + mps_batchnorm_op.bias = torch.nn.Parameter(bias) + + ref_y = batchnorm_op(cpu_x) + y = mps_batchnorm_op(x) + + self.assertEqual(y, ref_y) + if(not test_module): + self.assertEqual(running_mean, cpu_running_mean) + self.assertEqual(running_var, cpu_running_var) + else: + self.assertEqual(mps_batchnorm_op.running_mean, batchnorm_op.running_mean) + self.assertEqual(mps_batchnorm_op.running_var, batchnorm_op.running_var) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + ref_y.backward(gradient=cpu_grad) + y.backward(gradient=grad) + + self.assertEqual(x.grad, cpu_x.grad) + if(wts): + if(not test_module): + self.assertEqual(weight.grad, cpu_weight.grad) + self.assertEqual(bias.grad, cpu_bias.grad) + else: + self.assertEqual(mps_batchnorm_op.weight.grad, batchnorm_op.weight.grad) + self.assertEqual(mps_batchnorm_op.bias.grad, batchnorm_op.bias.grad) + + for shape in [(2, 3, 2, 2), (2, 3, 2, 2, 2), (2, 3, 2)]: + for test_module in [False, True]: + for track_running_stats in [True, False]: + for channels_last in [False, True]: + if(channels_last and len(shape) != 4): + continue + # Running stats must be tracked in eval mode + if(track_running_stats): + helper(shape, eps=0, momentum=1, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, training=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, training=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + + # Test forward instance norm + def test_instance_norm(self): + def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_running_stats=True, test_module=False): + + import numpy as np + np.random.seed(332) + arr = (256 - 128) * np.random.random_sample(size=shape) + 128 + cpu_x = torch.tensor(arr, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = cpu_x.detach().clone().to('mps').requires_grad_() + + 
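# Normalization statistics and affine parameters are per-channel, i.e. sized by shape[1]. +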
mean_shape = [shape[1]] + cpu_running_mean = None + cpu_running_var = None + running_mean = None + running_var = None + if(track_running_stats): + mean_arr = (240 - 140) * np.random.random_sample(size=mean_shape) + 140 + cpu_running_mean = torch.tensor(mean_arr, device='cpu', dtype=torch.float) + var_arr = 32 * np.random.random_sample(size=mean_shape) + cpu_running_var = torch.tensor(var_arr, device='cpu', dtype=torch.float) + running_mean = cpu_running_mean.detach().clone().to('mps') + running_var = cpu_running_var.detach().clone().to('mps') + + weight = None + cpu_weight = None + bias = None + cpu_bias = None + if(wts): + cpu_weight = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + weight = cpu_weight.detach().clone().to('mps').requires_grad_() + cpu_bias = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = None + ref_y = None + + if(not test_module): + ref_y = torch.nn.functional.instance_norm(cpu_x, cpu_running_mean, cpu_running_var, + weight=cpu_weight, + bias=cpu_bias, + momentum=momentum, eps=eps) + y = torch.nn.functional.instance_norm(x, running_mean, running_var, + weight=weight, + bias=bias, + momentum=momentum, eps=eps) + + else: + + instancenorm_op = None + mps_instancenorm_op = None + + if(len(shape) == 3): + instancenorm_op = torch.nn.InstanceNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_instancenorm_op = torch.nn.InstanceNorm1d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 4): + instancenorm_op = torch.nn.InstanceNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_instancenorm_op = torch.nn.InstanceNorm2d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + elif(len(shape) == 5): + instancenorm_op = torch.nn.InstanceNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='cpu') + mps_instancenorm_op = torch.nn.InstanceNorm3d(shape[1], + eps=eps, + momentum=momentum, + affine=wts, + track_running_stats=track_running_stats, + device='mps') + + if(track_running_stats): + instancenorm_op.running_mean = cpu_running_mean + instancenorm_op.running_var = cpu_running_var + mps_instancenorm_op.running_mean = running_mean + mps_instancenorm_op.running_var = running_var + if(wts): + instancenorm_op.weight = torch.nn.Parameter(cpu_weight) + instancenorm_op.bias = torch.nn.Parameter(cpu_bias) + mps_instancenorm_op.weight = torch.nn.Parameter(weight) + mps_instancenorm_op.bias = torch.nn.Parameter(bias) + + ref_y = instancenorm_op(cpu_x) + y = mps_instancenorm_op(x) + + self.assertEqual(y, ref_y) + if(not test_module): + self.assertEqual(running_mean, cpu_running_mean) + self.assertEqual(running_var, cpu_running_var) + else: + self.assertEqual(mps_instancenorm_op.running_mean, instancenorm_op.running_mean) + self.assertEqual(mps_instancenorm_op.running_var, instancenorm_op.running_var) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + ref_y.backward(gradient=cpu_grad) + y.backward(gradient=grad) + + self.assertEqual(x.grad, cpu_x.grad) + if(wts): + if(not test_module): + self.assertEqual(weight.grad, cpu_weight.grad) + self.assertEqual(bias.grad, cpu_bias.grad) + else: + 
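# module path: the gradients accumulate on the nn.InstanceNorm*d parameters themselves, so compare those directly. +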
self.assertEqual(mps_instancenorm_op.weight.grad, instancenorm_op.weight.grad) + self.assertEqual(mps_instancenorm_op.bias.grad, instancenorm_op.bias.grad) + + for shape in [(2, 3, 2, 2), (2, 3, 2, 2, 2), (2, 3, 2)]: + for test_module in [False, True]: + for track_running_stats in [True, False]: + for channels_last in [False]: + if(channels_last and len(shape) != 4): + continue + # Running stats must be tracked in eval mode + if(track_running_stats): + helper(shape, eps=0, momentum=1, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1e-05, momentum=0.1, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=0, momentum=1.0, wts=False, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=1, momentum=1, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + helper(shape, eps=3, momentum=0.67, wts=True, channels_last=channels_last, + track_running_stats=track_running_stats, test_module=test_module) + + # Test conv2d + def test_conv2d_unit(self): + def helper(input_shape, wt_shape, + stride=1, padding=0, + dilation=1, groups=1, + bias_shape=None): + + cpu_x = torch.randn(input_shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_wt = torch.randn(wt_shape, device='cpu', dtype=torch.float, requires_grad=True) + wt = cpu_wt.detach().clone().to('mps').requires_grad_() + + cpu_bias = None + bias = None + + if(bias_shape is not None): + cpu_bias = torch.randn(bias_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = torch.nn.functional.conv2d(x, wt, bias=bias, stride=stride, + padding=padding, dilation=dilation, groups=groups) + ref_y = torch.nn.functional.conv2d(cpu_x, cpu_wt, bias=cpu_bias, stride=stride, + padding=padding, dilation=dilation, groups=groups) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y, rtol=2.6e-05, atol=2e-04) + self.assertEqual(x.grad, cpu_x.grad, rtol=2.6e-06, atol=2e-05) + self.assertEqual(wt.grad, cpu_wt.grad, atol=8e-04, rtol=10.4e-05) + # if(bias_shape is not None): + # print(cpu_bias.grad) + # print(bias.grad.to('cpu')) + # self.assertEqual(bias.grad, cpu_bias.grad, atol=8e-04, rtol=10.4e-05) + + N = 1 + C_in = 3 + C_out = 64 + H = 64 + W = 64 + kH = 4 + kW = 4 + stride = 2 + padding = 1 + + helper((N, C_in, H, W), (C_out, C_in, kH, kW), stride=stride, padding=padding) + + N = 4 + C_in = 16 + H = 32 + W = 32 + + C_out = 8 + kH = 3 + kW = 3 + + for groups in [1, 2, 4]: + 
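# Grouped convolution: the weight shape is (C_out, C_in // groups, kH, kW); C_in=16 and C_out=8 are divisible by every group count tested here. +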
helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), groups=groups) + helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), groups=groups) + + helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), bias_shape=(C_out), groups=groups) + helper((N, C_in, H, W), (C_out, C_in // groups, kH, kW), bias_shape=(C_out), groups=groups) + + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, kH + 2, kW + 2), groups=groups) + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, kH + 2, kW + 2), groups=groups) + + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, + kH + 2, kW + 2), bias_shape=(C_out * 2), groups=groups) + helper((N, C_in * 2, H * 2, W * 2), (C_out * 2, (C_in * 2) // groups, + kH + 2, kW + 2), bias_shape=(C_out * 2), groups=groups) + + # Test conv transpose 2d + def test_conv_transpose2d(self): + def helper(input_shape, wt_shape, + stride=1, padding=0, + output_padding=0, + dilation=1, groups=1, + bias_shape=None): + + cpu_x = torch.randn(input_shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_wt = torch.randn(wt_shape, device='cpu', dtype=torch.float, requires_grad=True) + wt = cpu_wt.detach().clone().to('mps').requires_grad_() + + cpu_bias = None + bias = None + + if(bias_shape is not None): + cpu_bias = torch.randn(bias_shape, device='cpu', dtype=torch.float, requires_grad=True) + bias = cpu_bias.detach().clone().to('mps').requires_grad_() + + y = torch.nn.functional.conv_transpose2d( + x, wt, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation) + ref_y = torch.nn.functional.conv_transpose2d( + cpu_x, cpu_wt, bias=cpu_bias, stride=stride, padding=padding, + output_padding=output_padding, groups=groups, dilation=dilation) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y, rtol=2.6e-05, atol=2e-04) + self.assertEqual(x.grad, cpu_x.grad, rtol=2.6e-06, atol=2e-05) + self.assertEqual(wt.grad, cpu_wt.grad, atol=8e-04, rtol=10.4e-05) + + # if(bias_shape is not None): + # print(cpu_bias.grad) + # print(bias.grad.to('cpu')) + # self.assertEqual(bias.grad, cpu_bias.grad) + + N = 4 + C_in = 16 + H = 32 + W = 32 + + C_out = 8 + groups = 1 + kH = 3 + kW = 3 + + for stride in [1, 2, 3]: + for padding in [0, 1, 2]: + for output_padding in [0, 1, 2]: + for dilation in [1, 2]: + if(output_padding >= stride or output_padding >= dilation): + continue + helper((N, C_out, H, W), (C_out, C_in, kH, kW), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + helper((N, C_out, H, W), (C_out, C_in, kH, kW), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + + helper((N, C_out, H, W), (C_out, C_in, kH, kW), bias_shape=(C_in), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + helper((N, C_out, H, W), (C_out, C_in, kH, kW), bias_shape=(C_in), stride=stride, + padding=padding, output_padding=output_padding, dilation=dilation) + + # Test sigmoid + def test_sigmoid(self): + def helper(shape): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + sigmoid_op = torch.nn.Sigmoid() + + y = sigmoid_op(x) + ref_y = sigmoid_op(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + 
ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 3, 4, 5)) + helper((2, 3, 4)) + helper((2, 8, 4, 5)) + + # Test tanh + def test_tanh(self): + def helper(shape): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + tanh_op = torch.nn.Tanh() + + y = tanh_op(x) + ref_y = tanh_op(cpu_x) + + cpu_grad = torch.ones_like(ref_y) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 3, 4, 5)) + helper((2, 3, 4)) + helper((2, 8, 4, 5)) + + def test_threshold(self): + def helper(threshold, value, num_elems, inplace=False, requires_grad=True): + m = nn.Threshold(threshold=threshold, value=value, inplace=inplace) + + input_cpu = torch.randn(num_elems, requires_grad=requires_grad, dtype=torch.float) + input_mps = input_cpu.detach().clone().to('mps').requires_grad_(requires_grad) + + output_cpu = m(input_cpu) + output_mps = m(input_mps) + + cpu_grad = torch.ones_like(output_cpu) + mps_grad = cpu_grad.to('mps') + + self.assertEqual(output_cpu, output_mps) + + if requires_grad: + output_cpu.backward(gradient=cpu_grad) + output_mps.backward(gradient=mps_grad) + + self.assertEqual(input_cpu.grad, input_mps.grad) + + helper(threshold=0.1, value=20, num_elems=2) + helper(threshold=-0.1, value=10, num_elems=10) + helper(threshold=0.5, value=-15, num_elems=100) + helper(threshold=1, value=10, num_elems=100, inplace=True, requires_grad=False) + + # Test pow + def test_pow(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + z = torch.pow(x, y) + ref_z = torch.pow(cpu_x, cpu_y) + + self.assertEqual(z, ref_z) + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + exp = random.random() + z = torch.pow(x, exp) + ref_z = torch.pow(cpu_x, exp) + + self.assertEqual(z, ref_z) + + helper((2, 8, 4, 5)) + + # Test addcmul + def test_addcmul(self): + def helper(shape, value): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + y = torch.addcmul(x, y, z, value=value) + ref_y = torch.addcmul(cpu_x, cpu_y, cpu_z, value=value) + + self.assertEqual(y, ref_y) + + helper((2, 3, 4, 5), 0.1) + helper((2, 8, 4, 5), 0.1) + helper((2, 3, 4, 5), 0.2) + helper((2, 8, 4, 5), 0.2) + + # Test addcdiv + def test_addcdiv(self): + def helper(shape, value): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + # clamp to avoid division by 0 + cpu_z = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False).clamp_min_(0.1) + cpu_out = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + mps_z = cpu_z.detach().clone().to('mps') 
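+            # torch.addcdiv computes input + value * (tensor1 / tensor2); cpu_z was
+            # clamped away from zero above, so the elementwise division is well defined
+            # on both devices.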
+ mps_out = cpu_out.detach().clone().to('mps') + + result_div_mps = torch.addcdiv(mps_x, mps_y, mps_z, value=value) + result_div_cpu = torch.addcdiv(cpu_x, cpu_y, cpu_z, value=value) + self.assertEqual(result_div_mps, result_div_cpu) + # test .out variant + self.assertEqual(torch.addcdiv(mps_x, mps_y, mps_z, out=mps_out, value=value), result_div_cpu) + + helper((2, 3, 4, 5), 0.1) + helper((2, 8, 4, 5), 0.2) + helper((2, 3, 4, 5), 1.0) # value of 1 should be ignored internally + + def test_transpose_inplace(self): + values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + cpu_x = torch.tensor(values, device='cpu') + mps_x = torch.tensor(values, device='mps') + + cpu_x.transpose_(0, 1) + mps_x.transpose_(0, 1) + self.assertEqual(cpu_x, mps_x.to('cpu')) + + def test_slice(self): + values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + cpu_x = torch.tensor(values, device='cpu') + mps_x = (torch.tensor(values, device='mps', dtype=torch.float)) + + cpu_slice1 = cpu_x[:2, :] + mps_slice1 = mps_x[:2, :] + print(mps_slice1) + self.assertEqual(cpu_slice1, mps_slice1) + + cpu_slice2 = cpu_x[:, :1] + mps_slice2 = mps_x[:, :1] + print(cpu_slice2) + print(mps_slice2.to('cpu')) + self.assertEqual(cpu_slice2, mps_slice2) + + cpu_slice3 = cpu_x[1:2, :] + mps_slice3 = mps_x[1:2, :] + self.assertEqual(cpu_slice3, mps_slice3.to('cpu')) + + cpu_slice4 = cpu_x[1, :] + mps_slice4 = mps_x[1, :].to('cpu') + self.assertEqual(cpu_slice4, mps_slice4) + + def test_flatten(self): + values = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]] + cpu_x = torch.tensor(values, device='cpu') + mps_x = torch.tensor(values, device='mps') + + cpu_flatten1 = cpu_x.flatten() + mps_flatten1 = mps_x.flatten().to('cpu') + self.assertEqual(cpu_flatten1, mps_flatten1) + + cpu_flatten2 = cpu_x.flatten(start_dim=1) + mps_flatten2 = mps_x.flatten(start_dim=1).to('cpu') + self.assertEqual(cpu_flatten2, mps_flatten2) + + cpu_flatten3 = cpu_x.flatten(end_dim=1) + mps_flatten3 = mps_x.flatten(end_dim=1).to('cpu') + self.assertEqual(cpu_flatten3, mps_flatten3) + + # Test repeat + def test_repeat(self): + def helper(shape, repeats): + + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + y = x.repeat(repeats) + ref_y = cpu_x.repeat(repeats) + + cpu_grad = torch.randn(ref_y.shape) + grad = cpu_grad.to('mps') + + y.backward(gradient=grad) + ref_y.backward(gradient=cpu_grad) + + self.assertEqual(y, ref_y) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 3, 4, 5), (2, 3, 4, 5)) + helper((2, 3, 4), (4, 3, 2, 5, 7, 2)) + helper((3, 4, 5), (2, 3, 4, 5)) + helper((3, 4, 5), (2, 2, 2)) + + def _test_module_empty_input(self, module, inp, check_size=True): + inp.requires_grad_(True) + out = module(inp) + gO = torch.rand_like(out) + out.backward(gO) + if check_size: + self.assertEqual(out.size(), inp.size()) + for p in module.parameters(): + if p.requires_grad: + self.assertEqual(p.grad, torch.zeros_like(p.grad)) + self.assertEqual(inp.grad, torch.zeros_like(inp)) + + +class TestSmoothL1Loss(TestCase): + + def _smooth_l1_loss_helper(self, reduction="mean", requires_grad=False): + # CPU + input_cpu = torch.randn(4, 7, requires_grad=requires_grad) + target_cpu = torch.randn(4, 7) + + # MPS + input_mps = input_cpu.detach().clone().to('mps').requires_grad_() + target_mps = target_cpu.detach().clone().to('mps') + + smooth_l1_loss_cpu = F.smooth_l1_loss(input_cpu, target_cpu, beta=1.0, reduction=reduction) + smooth_l1_loss_mps = 
F.smooth_l1_loss(input_mps, target_mps, beta=1.0, reduction=reduction) + + self.assertEqual(smooth_l1_loss_cpu, smooth_l1_loss_mps) + + if requires_grad: + smooth_l1_loss_cpu.backward() + smooth_l1_loss_mps.backward() + self.assertEqual(input_cpu.grad, input_mps.grad.to("cpu")) + + return smooth_l1_loss_cpu, smooth_l1_loss_mps + + def test_smooth_l1_loss_reduction_none(self): + self._smooth_l1_loss_helper(reduction="none") + + def test_smooth_l1_loss_reduction_mean(self): + self._smooth_l1_loss_helper(reduction="mean") + + def test_smooth_l1_loss_reduction_sum(self): + self._smooth_l1_loss_helper(reduction="sum") + + def test_smooth_l1_loss_reduction_mean_backward(self): + self._smooth_l1_loss_helper(reduction="mean", requires_grad=True) + + def test_smooth_l1_loss_reduction_mean_sum_backward(self): + self._smooth_l1_loss_helper(reduction="sum", requires_grad=True) + + +class TestNLLLoss(TestCase): + + def test_nll_loss_mismatched_batch(self, device='mps'): + x = torch.randn((10, 3), requires_grad=True, device=device) + # t should have size (10,) + t = torch.zeros((3,), dtype=torch.int64, device=device) + with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): + F.nll_loss(x, t) + + def test_nll_loss_out_of_bounds_ignore_index(self): + + def _test_nll_loss_out_of_bounds_ignore_index(device): + output = [] + x = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1], [ + 0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1]], device=device) + t = torch.tensor([0, 1, 255, 0, 1, 2], dtype=torch.int64, device=device) + for reduction in ['mean', 'none']: + output.append(F.nll_loss(x, t, ignore_index=255, reduction=reduction)) + return output + + output_cpu = _test_nll_loss_out_of_bounds_ignore_index(device='cpu') + output_mps = _test_nll_loss_out_of_bounds_ignore_index(device='mps') + + for cpu, mps in zip(output_cpu, output_mps): + self.assertEqual(cpu, mps.to('cpu')) + + def test_nll_loss_invalid_target_dim(self): + + def _test_nll_loss_invalid_target_dim(device): + output = [] + x = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1], [ + 0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1]], device=device) + t = torch.zeros((6, 2), dtype=torch.int64, device=device) + with self.assertRaisesRegex(RuntimeError, "1D target tensor expected"): + F.nll_loss(x, t) + + _test_nll_loss_invalid_target_dim(device='cpu') + _test_nll_loss_invalid_target_dim(device='mps') + + def test_nll_loss_invalid_weights(self): + + def _test_nll_loss_invalid_weights(device): + x = torch.tensor([[0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1], [ + 0.3, 0.5, 0.2], [0.1, 0.7, 0.2], [0.4, 0.5, 0.1]], device=device) + t = torch.tensor([0, 1, 2, 1, 1, 2], dtype=torch.int64, device=device) + invalid_weights = [ + torch.zeros(4, device=device), + torch.zeros((1, 3), device=device), + ] + msg = "weight tensor should be defined either for all 3 classes or no classes" + for weight in invalid_weights: + with self.assertRaisesRegex(RuntimeError, msg): + F.nll_loss(x, t, weight=weight) + + _test_nll_loss_invalid_weights(device='cpu') + _test_nll_loss_invalid_weights(device='mps') + + def _nll_loss_helper(self, input_size, reduction, expected): + + # CPU + input = torch.rand(input_size, requires_grad=True, device='cpu') + num_channels = input_size[1] + target_size = (input_size[0], ) + tuple(input_size[2:]) + target = torch.randint(num_channels, target_size, device='cpu') + + # MPS + input_mps = input.detach().clone().to('mps').requires_grad_() + target_mps = target.detach().clone().to('mps') + + output_cpu = 
F.nll_loss(input, target, reduction=reduction) + output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction) + # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 + self.assertEqualIgnoreType(output_cpu, output_mps.to('cpu')) + + output_cpu.sum().backward() + output_mps.sum().backward() + self.assertEqual(input.grad, input_mps.grad.to('cpu')) + + def test_as_strided(self): + def helper(n, c): + values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + values_1 = [[1.0, 1.0], [1.0, 1.0]] + cpu_x = torch.tensor(values, device='cpu') + ones1 = torch.tensor(values_1, device='mps') + x = cpu_x.detach().clone().to('mps').requires_grad_() + strided_cpu = torch.as_strided(cpu_x, (2, 2), (2, 2)) + strided_mps = torch.as_strided(x, (2, 2), (2, 2)) + + print("Strided MPS {}".format(strided_mps.to('cpu'))) + print("Strided cpu {}".format(strided_cpu)) + + self.assertEqual(strided_mps, strided_cpu) + + helper(3, 3) + + def test_nll_loss_empty_tensor_reduction_none(self, device='cpu'): + self._nll_loss_helper([1, 3], "none", torch.empty([0], device=device)) + self._nll_loss_helper([3, 5, 7], "none", torch.empty([5, 7], device=device)) + self._nll_loss_helper([2, 3, 1, 7], "none", torch.empty([2, 1, 7], device=device)) + self._nll_loss_helper([2, 3, 5, 1], "none", torch.empty([2, 5, 1], device=device)) + self._nll_loss_helper([2, 3, 5, 7, 1], "none", torch.empty([2, 5, 7, 1], device=device)) + + @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN") + def test_nll_loss_empty_tensor_reduction_mean(self, device='cpu'): + nan = torch.tensor(float('nan'), device=device) + self._nll_loss_helper([1, 3], "mean", nan) + self._nll_loss_helper([1, 3, 5, 7], "mean", nan) + self._nll_loss_helper([2, 3, 1, 7], "mean", nan) + self._nll_loss_helper([2, 3, 5, 1], "mean", nan) + self._nll_loss_helper([2, 3, 5, 7, 1], "mean", nan) + + def test_nll_loss_empty_tensor_reduction_sum(self, device='cpu'): + zero = torch.tensor(0, device=device) + self._nll_loss_helper([1, 3], "sum", zero) + self._nll_loss_helper([1, 3, 5, 7], "sum", zero) + self._nll_loss_helper([2, 3, 1, 7], "sum", zero) + self._nll_loss_helper([2, 3, 5, 1], "sum", zero) + self._nll_loss_helper([2, 3, 5, 7, 1], "sum", zero) + + def test_nll_loss_byte_target_matches_long(self, device='cpu'): + N, C = 10, 4 + input = torch.randn(N, C, device=device, requires_grad=True) + target = torch.empty(N, dtype=torch.long, device=device).random_(0, C) + + def compute_result_and_gradient(reduction, target_dtype): + result, grad = {}, {} + for dev in ['cpu', 'mps']: + input_dev = input.to(dev) + input_ = input_dev.detach() + input_.requires_grad_() + + target_dev = target.to(dev) + + prob = F.log_softmax(input_, dim=-1) + loss = nn.NLLLoss(reduction=reduction) + result[dev] = loss(prob, target_dev.to(target_dtype)) + result[dev].sum().backward() + grad[dev] = input_.grad + + return result, grad + + for reduction in ["none", "mean", "sum"]: + result_long, grad_long = compute_result_and_gradient(reduction, torch.long) + result_byte, grad_byte = compute_result_and_gradient(reduction, torch.uint8) + + self.assertEqual(result_long['mps'].to('cpu'), result_long['cpu']) + self.assertEqual(grad_long['mps'].to('cpu'), grad_long['cpu']) + + # Mean Squared Error + def test_mse_loss(self): + def helper(shape, reduction): + # create the criterion + loss = torch.nn.MSELoss(reduction=reduction) + + inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + targetCPU = torch.randn(shape, device='cpu', dtype=torch.float, 
requires_grad=False) + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + targetMPS = targetCPU.detach().clone().to('mps') + + # forward pass + outputCPU = loss(inputCPU, targetCPU) + outputMPS = loss(inputMPS, targetMPS) + self.assertEqual(outputCPU, outputMPS) + + # backward pass + if reduction != 'none': + # chose 2 just to make the grad_output > 1 in backward pass + outputCPU.backward(gradient=torch.full_like(outputCPU, 2)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 2)) + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper([8, 5, 4], 'none') + helper([7, 5, 2, 4], 'sum') + # verify if changes in shape would cause cached graph lookup problems + helper([7, 5, 2, 4, 6], 'sum') + helper([8, 4, 5, 7, 6], 'mean') + + # Binary Cross Enropy + def test_bce_loss(self): + def helper(shape, reduction): + # create the criterion + loss = torch.nn.BCELoss(reduction=reduction) + + # input and target must be within [0..1] + input_t = np.random.random_sample(size=shape).astype(np.float32) + target_t = np.random.random_sample(size=shape).astype(np.float32) + inputCPU = torch.tensor(input_t, device='cpu', dtype=torch.float, requires_grad=True) + targetCPU = torch.tensor(target_t, device='cpu', dtype=torch.float, requires_grad=False) + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + targetMPS = targetCPU.detach().clone().to('mps') + + # forward pass + outputCPU = loss(inputCPU, targetCPU) + outputMPS = loss(inputMPS, targetMPS) + self.assertEqual(outputCPU, outputMPS) + + # backward pass + if reduction != 'none': + # chose 0.6 just to have the grad_output != 1 + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.6)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.6)) + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper([8, 5, 4], 'none') + helper([7, 5, 2, 4], 'sum') + # verify if changes in shape would cause cached graph lookup problems + helper([7, 5, 2, 4, 6], 'sum') + helper([8, 4, 5, 7, 6], 'mean') + + def test_log_softmax(self): + values = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]] + cpu_x = torch.tensor(values, device='cpu', requires_grad=True) + mps_x = torch.tensor(values, device='mps', requires_grad=True) + + cpu_log_softmax = F.log_softmax(cpu_x, dim=0) + mps_log_softmax = F.log_softmax(mps_x, dim=0) + self.assertEqual(cpu_log_softmax, mps_log_softmax.to('cpu')) + + cpu_grad = torch.ones_like(cpu_log_softmax) + mps_grad = torch.ones_like(cpu_log_softmax).to('mps') + + cpu_log_softmax.backward(gradient=cpu_grad) + mps_log_softmax.backward(gradient=mps_grad) + + self.assertEqual(cpu_x.grad, mps_x.grad.to('cpu')) + + def test_eq(self): + values1 = [[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]] + values2 = [[[1.0, 2.0, 15.0], [4.0, 5.0, 6.0]], [[7.0, 8.0, 9.0], [0.0, 11.0, 12.0]]] + mps_x = torch.tensor(values1, device='mps') + mps_y = torch.tensor(values2, device='mps') + cpu_x = torch.tensor(values1, device='cpu') + cpu_y = torch.tensor(values2, device='cpu') + result_mps = torch.eq(mps_x, mps_y) + result_cpu = torch.eq(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + def test_eq_int64(self): + values1 = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]] + values2 = [[[1, 2, 15], [4, 5, 6]], [[7, 8, 9], [0, 11, 12]]] + mps_x = torch.tensor(values1, device='mps') + mps_y = torch.tensor(values2, device='mps') + cpu_x = torch.tensor(values1, device='cpu') + cpu_y = torch.tensor(values2, device='cpu') + result_mps = torch.eq(mps_x, mps_y) + 
result_cpu = torch.eq(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + def test_ne(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.ne(mps_x, mps_y) + result_cpu = torch.ne(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_ne_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.ne(mps_x, 0.0) + result_cpu = torch.ne(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_lt(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.lt(mps_x, mps_y) + result_cpu = torch.lt(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_lt_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.lt(mps_x, 0.0) + result_cpu = torch.lt(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_le(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.le(mps_x, mps_y) + result_cpu = torch.le(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_le_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.le(mps_x, 0.0) + result_cpu = torch.le(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_ge(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.ge(mps_x, mps_y) + result_cpu = torch.ge(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_ge_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = torch.ge(mps_x, 0.0) + result_cpu = torch.ge(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_gt(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + result_mps = torch.gt(mps_x, mps_y) + result_cpu = torch.gt(cpu_x, cpu_y) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + def test_gt_scalar(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float) + mps_x = cpu_x.detach().clone().to('mps') + result_mps = 
torch.gt(mps_x, 0.0) + result_cpu = torch.gt(cpu_x, 0.0) + + self.assertEqual(result_cpu, result_mps.to('cpu')) + + helper((2, 3, 4, 5)) + + # Test forward argmax + def test_argmax(self): + def helper(n, c, h, w, dtype=torch.float32): + cpu_x = None + x = None + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(50, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + y = torch.argmax(x) + ref_y = torch.argmax(cpu_x) + self.assertEqual(y, ref_y) + + y_0 = torch.argmax(x, dim=0) + refy_0 = torch.argmax(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + + y_0dim = torch.argmax(x, dim=0, keepdim=True) + refy_0dim = torch.argmax(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_1 = torch.argmax(x, dim=1) + refy_1 = torch.argmax(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + + y_1dim = torch.argmax(x, dim=1, keepdim=True) + refy_1dim = torch.argmax(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + + y_2 = torch.argmax(x, dim=2) + refy_2 = torch.argmax(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + + y_2dim = torch.argmax(x, dim=2, keepdim=True) + refy_2dim = torch.argmax(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + + y_3 = torch.argmax(x, dim=3) + refy_3 = torch.argmax(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + + y_3dim = torch.argmax(x, dim=3, keepdim=True) + refy_3dim = torch.argmax(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + + helper(2, 8, 4, 4, torch.float32) + helper(2, 8, 4, 4, torch.int32) + helper(2, 8, 4, 4, torch.float16) + helper(2, 8, 4, 4, torch.int64) + + # Test forward max + # Note - don't test grad now + def test_max_el(self): + def helper(n, c, h, w, dtype=torch.float32): + + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(50, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps') + + ref_y = torch.max(cpu_x) + y = torch.max(x) + self.assertEqual(y, ref_y) + + for dim in [0, 1, 2, 3]: + for keepdim in [True, False]: + y, idx = torch.max(x, dim=dim, keepdim=keepdim) + refy, refidx = torch.max(cpu_x, dim=dim, keepdim=keepdim) + self.assertEqual(y, refy) + self.assertEqual(idx, refidx) + + y_0 = torch.ones(c, h, w, device='mps', dtype=dtype) + idx_0 = torch.ones(c, h, w, device='mps', dtype=torch.int64) + torch.max(x, dim=0, out=(y_0, idx_0)) + refy_0, refidx_0 = torch.max(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + self.assertEqual(idx_0, refidx_0) + + y_0dim = torch.ones(1, c, h, w, device='mps', dtype=dtype) + idx_0dim = torch.ones(1, c, h, w, device='mps', dtype=torch.int64) + torch.max(x, dim=0, keepdim=True, out=(y_0dim, idx_0dim)) + refy_0dim, refidx_0dim = torch.max(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + self.assertEqual(idx_0dim, refidx_0dim) + + y_1 = torch.ones(n, h, w, device='mps', dtype=dtype) + idx_1 = torch.ones(n, h, w, 
device='mps', dtype=torch.int64) + torch.max(x, dim=1, out=(y_1, idx_1)) + refy_1, refidx_1 = torch.max(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + self.assertEqual(idx_1, refidx_1) + + y_1dim = torch.ones(n, 1, h, w, device='mps', dtype=dtype) + idx_1dim = torch.ones(n, 1, h, w, device='mps', dtype=torch.int64) + torch.max(x, dim=1, keepdim=True, out=(y_1dim, idx_1dim)) + refy_1dim, refidx_1dim = torch.max(cpu_x, keepdim=True, dim=1) + self.assertEqual(y_1dim, refy_1dim) + self.assertEqual(idx_1dim, refidx_1dim) + + y_2 = torch.ones(n, c, w, device='mps', dtype=dtype) + idx_2 = torch.ones(n, c, w, device='mps', dtype=torch.int64) + torch.max(x, dim=2, out=(y_2, idx_2)) + refy_2, refidx_2 = torch.max(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + self.assertEqual(idx_2, refidx_2) + + y_2dim = torch.ones(n, c, 1, w, device='mps', dtype=dtype) + idx_2dim = torch.ones(n, c, 1, w, device='mps', dtype=torch.int64) + torch.max(x, dim=2, keepdim=True, out=(y_2dim, idx_2dim)) + refy_2dim, refidx_2dim = torch.max(cpu_x, dim=2, keepdim=True,) + self.assertEqual(y_2dim, refy_2dim) + self.assertEqual(idx_2dim, refidx_2dim) + + y_3 = torch.ones(n, c, h, device='mps', dtype=dtype) + idx_3 = torch.ones(n, c, h, device='mps', dtype=torch.int64) + torch.max(x, dim=3, out=(y_3, idx_3)) + refy_3, refidx_3 = torch.max(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + self.assertEqual(idx_3, refidx_3) + + y_3dim = torch.ones(n, c, h, 1, device='mps', dtype=dtype) + idx_3dim = torch.ones(n, c, h, 1, device='mps', dtype=torch.int64) + torch.max(x, dim=3, keepdim=True, out=(y_3dim, idx_3dim)) + refy_3dim, refidx_3dim = torch.max(cpu_x, dim=3, keepdim=True,) + self.assertEqual(y_3dim, refy_3dim) + self.assertEqual(idx_3dim, refidx_3dim) + + helper(2, 8, 4, 5, torch.float32) + helper(2, 8, 4, 5, torch.int32) + # helper(2, 8, 4, 5, torch.int64) + + def test_any(self): + def helper(shape): + input_xs = [] + prod = 1 + + for i in range(len(shape)): + prod *= shape[i] + input_xs.append(torch.randn(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape).bool()) + + for i, cpu_x in enumerate(input_xs): + x = cpu_x.detach().clone().to('mps') + y = torch.any(x) + ref_y = torch.any(cpu_x) + self.assertEqual(y, ref_y) + + y_0 = torch.any(x, dim=0) + refy_0 = torch.any(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + + y_0dim = torch.any(x, dim=0, keepdim=True) + refy_0dim = torch.any(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_0dim = torch.any(x, dim=0, keepdim=True) + refy_0dim = torch.any(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_1 = torch.any(x, dim=1) + refy_1 = torch.any(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + + y_1dim = torch.any(x, dim=1, keepdim=True) + refy_1dim = torch.any(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + + if (len(shape) > 2): + y_2 = torch.any(x, dim=2) + refy_2 = torch.any(cpu_x, dim=2) + 
self.assertEqual(y_2, refy_2) + + y_2dim = torch.any(x, dim=2, keepdim=True) + refy_2dim = torch.any(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + + y_3 = torch.any(x, dim=3) + refy_3 = torch.any(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + + y_3dim = torch.any(x, dim=3, keepdim=True) + refy_3dim = torch.any(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + helper((1, 1, 1, 1)) + helper((1, 1, 3, 3)) + helper((7, 13)) + helper((2, 8, 4, 5)) + + def test_all(self): + def helper(shape): + input_xs = [] + prod = 1 + + for i in range(len(shape)): + prod *= shape[i] + input_xs.append(torch.randn(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.float).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape)) + input_xs.append(torch.arange(0, prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.ones(prod, dtype=torch.int).reshape(shape).bool()) + input_xs.append(torch.zeros(prod, dtype=torch.int).reshape(shape).bool()) + + for i, cpu_x in enumerate(input_xs): + x = cpu_x.detach().clone().to('mps') + y = torch.all(x) + ref_y = torch.all(cpu_x) + self.assertEqual(y, ref_y) + + y_0 = torch.all(x, dim=0) + refy_0 = torch.all(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + + y_0dim = torch.all(x, dim=0, keepdim=True) + refy_0dim = torch.all(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_0dim = torch.all(x, dim=0, keepdim=True) + refy_0dim = torch.all(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + + y_1 = torch.all(x, dim=1) + refy_1 = torch.all(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + + y_1dim = torch.all(x, dim=1, keepdim=True) + refy_1dim = torch.all(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + if (len(shape) > 2): + y_2 = torch.all(x, dim=2) + refy_2 = torch.all(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + + y_2dim = torch.all(x, dim=2, keepdim=True) + refy_2dim = torch.all(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + + y_3 = torch.all(x, dim=3) + refy_3 = torch.all(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + + y_3dim = torch.all(x, dim=3, keepdim=True) + refy_3dim = torch.all(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + + helper((1, 1, 1, 1)) + helper((1, 1, 3, 3)) + helper((7, 13)) + helper((2, 8, 4, 5)) + + # Test forward min + def test_min_el(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + y = torch.min(x) + ref_y = torch.min(cpu_x) + self.assertEqual(y, ref_y) + + y_0, idx_0 = torch.min(x, dim=0) + refy_0, refidx_0 = torch.min(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + self.assertEqual(idx_0, refidx_0) + + y_0 = torch.ones(c, h, w, device='mps', dtype=torch.float) + idx_0 = torch.ones(c, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=0, out=(y_0, idx_0)) + refy_0, refidx_0 = torch.min(cpu_x, dim=0) + self.assertEqual(y_0, refy_0) + self.assertEqual(idx_0, refidx_0) + + y_0dim, idx_0dim = torch.min(x, dim=0, keepdim=True) + refy_0dim, refidx_0dim = torch.min(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + 
self.assertEqual(idx_0dim, refidx_0dim) + + y_0dim = torch.ones(1, c, h, w, device='mps', dtype=torch.float) + idx_0dim = torch.ones(1, c, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=0, keepdim=True, out=(y_0dim, idx_0dim)) + refy_0dim, refidx_0dim = torch.min(cpu_x, dim=0, keepdim=True) + self.assertEqual(y_0dim, refy_0dim) + self.assertEqual(idx_0dim, refidx_0dim) + + y_1, idx_1 = torch.min(x, dim=1) + refy_1, refidx_1 = torch.min(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + self.assertEqual(idx_1, refidx_1) + + y_1 = torch.ones(n, h, w, device='mps', dtype=torch.float) + idx_1 = torch.ones(n, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=1, out=(y_1, idx_1)) + refy_1, refidx_1 = torch.min(cpu_x, dim=1) + self.assertEqual(y_1, refy_1) + self.assertEqual(idx_1, refidx_1) + + y_1dim, idx_1dim = torch.min(x, dim=1, keepdim=True) + refy_1dim, refidx_1dim = torch.min(cpu_x, dim=1, keepdim=True) + self.assertEqual(y_1dim, refy_1dim) + self.assertEqual(idx_1dim, refidx_1dim) + + y_1dim = torch.ones(n, 1, h, w, device='mps', dtype=torch.float) + idx_1dim = torch.ones(n, 1, h, w, device='mps', dtype=torch.int64) + torch.min(x, dim=1, keepdim=True, out=(y_1dim, idx_1dim)) + refy_1dim, refidx_1dim = torch.min(cpu_x, keepdim=True, dim=1) + self.assertEqual(y_1dim, refy_1dim) + self.assertEqual(idx_1dim, refidx_1dim) + + y_2, idx_2 = torch.min(x, dim=2) + refy_2, refidx_2 = torch.min(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + self.assertEqual(idx_2, refidx_2) + + y_2 = torch.ones(n, c, w, device='mps', dtype=torch.float) + idx_2 = torch.ones(n, c, w, device='mps', dtype=torch.int64) + torch.min(x, dim=2, out=(y_2, idx_2)) + refy_2, refidx_2 = torch.min(cpu_x, dim=2) + self.assertEqual(y_2, refy_2) + self.assertEqual(idx_2, refidx_2) + + y_2dim, idx_2dim = torch.min(x, dim=2, keepdim=True) + refy_2dim, refidx_2dim = torch.min(cpu_x, dim=2, keepdim=True) + self.assertEqual(y_2dim, refy_2dim) + self.assertEqual(idx_2dim, refidx_2dim) + + y_2dim = torch.ones(n, c, 1, w, device='mps', dtype=torch.float) + idx_2dim = torch.ones(n, c, 1, w, device='mps', dtype=torch.int64) + torch.min(x, dim=2, keepdim=True, out=(y_2dim, idx_2dim)) + refy_2dim, refidx_2dim = torch.min(cpu_x, dim=2, keepdim=True,) + self.assertEqual(y_2dim, refy_2dim) + self.assertEqual(idx_2dim, refidx_2dim) + + y_3, idx_3 = torch.min(x, dim=3) + refy_3, refidx_3 = torch.min(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + self.assertEqual(idx_3, refidx_3) + + y_3 = torch.ones(n, c, h, device='mps', dtype=torch.float) + idx_3 = torch.ones(n, c, h, device='mps', dtype=torch.int64) + torch.min(x, dim=3, out=(y_3, idx_3)) + refy_3, refidx_3 = torch.min(cpu_x, dim=3) + self.assertEqual(y_3, refy_3) + self.assertEqual(idx_3, refidx_3) + + y_3dim, idx_3dim = torch.min(x, dim=3, keepdim=True) + refy_3dim, refidx_3dim = torch.min(cpu_x, dim=3, keepdim=True) + self.assertEqual(y_3dim, refy_3dim) + self.assertEqual(idx_3dim, refidx_3dim) + + y_3dim = torch.ones(n, c, h, 1, device='mps', dtype=torch.float) + idx_3dim = torch.ones(n, c, h, 1, device='mps', dtype=torch.int64) + torch.min(x, dim=3, keepdim=True, out=(y_3dim, idx_3dim)) + refy_3dim, refidx_3dim = torch.min(cpu_x, dim=3, keepdim=True,) + self.assertEqual(y_3dim, refy_3dim) + self.assertEqual(idx_3dim, refidx_3dim) + + helper(2, 8, 4, 5) + + # Test forward sum + def test_sum(self): + def helper(n, c, h, w, dtype=torch.float32): + cpu_x = None + x = None + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(50, (n, c, h, w), device='cpu', 
dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + all_sum = torch.sum(x) + all_sum_cpu = torch.sum(cpu_x) + + self.assertEqual(all_sum, all_sum_cpu) + + nil_dim_sum = torch.sum(x, dim=[]) + nil_dim_sum_cpu = torch.sum(cpu_x, dim=[]) + + self.assertEqual(nil_dim_sum, nil_dim_sum_cpu) + + nil_dim_sum_keepdim = torch.sum(x, dim=[], keepdim=True) + nil_dim_sum_cpu_keepdim = torch.sum(cpu_x, dim=[], keepdim=True) + + self.assertEqual(nil_dim_sum_keepdim, nil_dim_sum_cpu_keepdim) + + zero_dim_sum = torch.sum(x, dim=[0]) + zero_dim_sum_cpu = torch.sum(cpu_x, dim=[0]) + + self.assertEqual(zero_dim_sum, zero_dim_sum_cpu) + + zero_dim_sum_keepdim = torch.sum(x, dim=[0], keepdim=True) + zero_dim_sum_cpu_keepdim = torch.sum(cpu_x, dim=[0], keepdim=True) + + self.assertEqual(zero_dim_sum_keepdim, zero_dim_sum_cpu_keepdim) + + zero_one_dim_sum = torch.sum(x, dim=[0, 1]) + zero_one_dim_sum_cpu = torch.sum(cpu_x, dim=[0, 1]) + + self.assertEqual(zero_one_dim_sum, zero_one_dim_sum_cpu) + + zero_one_dim_sum_keepdim = torch.sum(x, dim=[0, 1], keepdim=True) + zero_one_dim_sum_cpu_keepdim = torch.sum(cpu_x, dim=[0, 1], keepdim=True) + + self.assertEqual(zero_one_dim_sum_keepdim, zero_one_dim_sum_cpu_keepdim) + + two_three_dim_sum = torch.sum(x, dim=[2, 3]) + two_three_dim_sum_cpu = torch.sum(cpu_x, dim=[2, 3]) + + self.assertEqual(two_three_dim_sum, two_three_dim_sum_cpu) + + two_three_keepdim_sum = torch.sum(x, dim=[2, 3], keepdim=True) + two_three_dim_keepsum_cpu = torch.sum(cpu_x, dim=[2, 3], keepdim=True) + + self.assertEqual(two_three_keepdim_sum, two_three_dim_keepsum_cpu) + + helper(2, 8, 4, 5) + helper(2, 8, 4, 5, dtype=torch.int32) + helper(2, 8, 4, 5, dtype=torch.int64) + helper(2, 8, 4, 5, dtype=torch.bool) + + # Test forward prod + def test_prod(self): + def helper(shape, dtype=torch.float32): + cpu_x = None + x = None + if(dtype not in [torch.float32, torch.bool]): + cpu_x = torch.randint(1, 6, shape, device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + elif (dtype == torch.bool): + cpu_x = torch.randint(2, shape, device='cpu', dtype=dtype, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + else: + cpu_x = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + all_prod = torch.prod(x) + all_prod_cpu = torch.prod(cpu_x) + + self.assertEqual(all_prod, all_prod_cpu) + + for dim in range(len(shape)): + dim_prod = torch.prod(x, dim=dim) + dim_prod_cpu = torch.prod(cpu_x, dim=dim) + + self.assertEqual(dim_prod, dim_prod_cpu) + + dim_prod_keepdim = torch.prod(x, dim=dim, keepdim=True) + dim_prod_cpu_keepdim = torch.prod(cpu_x, dim=dim, keepdim=True) + + self.assertEqual(dim_prod_keepdim, dim_prod_cpu_keepdim) + + for dtype in [torch.float32, torch.int32, torch.int64, torch.bool]: + helper((2, 3), dtype) + + # Test forward mean + def test_mean(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + all_mean = torch.mean(x) + all_mean_cpu = torch.mean(cpu_x) + + self.assertEqual(all_mean, all_mean_cpu) + + nil_dim_mean = torch.mean(x, dim=[]) + 
nil_dim_mean_cpu = torch.mean(cpu_x, dim=[]) + + self.assertEqual(nil_dim_mean, nil_dim_mean_cpu) + + nil_dim_mean_keepdim = torch.mean(x, dim=[], keepdim=True) + nil_dim_mean_cpu_keepdim = torch.mean(cpu_x, dim=[], keepdim=True) + + self.assertEqual(nil_dim_mean_keepdim, nil_dim_mean_cpu_keepdim) + + zero_dim_mean = torch.mean(x, dim=[0]) + zero_dim_mean_cpu = torch.mean(cpu_x, dim=[0]) + + self.assertEqual(zero_dim_mean, zero_dim_mean_cpu) + + zero_dim_mean_keepdim = torch.mean(x, dim=[0], keepdim=True) + zero_dim_mean_cpu_keepdim = torch.mean(cpu_x, dim=[0], keepdim=True) + + self.assertEqual(zero_dim_mean_keepdim, zero_dim_mean_cpu_keepdim) + + zero_one_dim_mean = torch.mean(x, dim=[0, 1]) + zero_one_dim_mean_cpu = torch.mean(cpu_x, dim=[0, 1]) + + self.assertEqual(zero_one_dim_mean, zero_one_dim_mean_cpu) + + zero_one_dim_mean_keepdim = torch.mean(x, dim=[0, 1], keepdim=True) + zero_one_dim_mean_cpu_keepdim = torch.mean(cpu_x, dim=[0, 1], keepdim=True) + + self.assertEqual(zero_one_dim_mean_keepdim, zero_one_dim_mean_cpu_keepdim) + + two_three_dim_mean = torch.mean(x, dim=[2, 3]) + two_three_dim_mean_cpu = torch.mean(cpu_x, dim=[2, 3]) + + self.assertEqual(two_three_dim_mean, two_three_dim_mean_cpu) + + two_three_keepdim_mean = torch.mean(x, dim=[2, 3], keepdim=True) + two_three_dim_keepmean_cpu = torch.mean(cpu_x, dim=[2, 3], keepdim=True) + + self.assertEqual(two_three_keepdim_mean, two_three_dim_keepmean_cpu) + + helper(2, 8, 4, 5) + + # Test std + def test_std(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + all_std = torch.std(x, unbiased=False) + all_std_cpu = torch.std(cpu_x, unbiased=False) + + self.assertEqual(all_std, all_std_cpu) + + nil_dim_std = torch.std(x, dim=[], unbiased=False) + nil_dim_std_cpu = torch.std(cpu_x, dim=[], unbiased=False) + + self.assertEqual(nil_dim_std, nil_dim_std_cpu) + + nil_dim_std_keepdim = torch.std(x, dim=[], keepdim=True, unbiased=False) + nil_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[], keepdim=True, unbiased=False) + + self.assertEqual(nil_dim_std_keepdim, nil_dim_std_cpu_keepdim) + + zero_dim_std = torch.std(x, dim=[0], unbiased=False) + zero_dim_std_cpu = torch.std(cpu_x, dim=[0], unbiased=False) + + self.assertEqual(zero_dim_std, zero_dim_std_cpu) + + zero_dim_std_keepdim = torch.std(x, dim=[0], keepdim=True, unbiased=False) + zero_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0], keepdim=True, unbiased=False) + + self.assertEqual(zero_dim_std_keepdim, zero_dim_std_cpu_keepdim) + + zero_one_dim_std = torch.std(x, dim=[0, 1], unbiased=False) + zero_one_dim_std_cpu = torch.std(cpu_x, dim=[0, 1], unbiased=False) + + self.assertEqual(zero_one_dim_std, zero_one_dim_std_cpu) + + zero_one_dim_std_keepdim = torch.std(x, dim=[0, 1], keepdim=True, unbiased=False) + zero_one_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0, 1], keepdim=True, unbiased=False) + + self.assertEqual(zero_one_dim_std_keepdim, zero_one_dim_std_cpu_keepdim) + + two_three_dim_std = torch.std(x, dim=[2, 3], unbiased=False) + two_three_dim_std_cpu = torch.std(cpu_x, dim=[2, 3], unbiased=False) + + self.assertEqual(two_three_dim_std, two_three_dim_std_cpu) + + two_three_keepdim_std = torch.std(x, dim=[2, 3], keepdim=True, unbiased=False) + two_three_dim_keepstd_cpu = torch.std(cpu_x, dim=[2, 3], keepdim=True, unbiased=False) + + self.assertEqual(two_three_keepdim_std, two_three_dim_keepstd_cpu) + + all_std = torch.std(x, unbiased=True) + all_std_cpu = torch.std(cpu_x, 
unbiased=True) + + self.assertEqual(all_std, all_std_cpu) + + nil_dim_std = torch.std(x, dim=[], unbiased=True) + nil_dim_std_cpu = torch.std(cpu_x, dim=[], unbiased=True) + + self.assertEqual(nil_dim_std, nil_dim_std_cpu) + + nil_dim_std_keepdim = torch.std(x, dim=[], keepdim=True, unbiased=True) + nil_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[], keepdim=True, unbiased=True) + + self.assertEqual(nil_dim_std_keepdim, nil_dim_std_cpu_keepdim) + + zero_dim_std = torch.std(x, dim=[0], unbiased=True) + zero_dim_std_cpu = torch.std(cpu_x, dim=[0], unbiased=True) + + self.assertEqual(zero_dim_std, zero_dim_std_cpu) + + zero_dim_std_keepdim = torch.std(x, dim=[0], keepdim=True, unbiased=True) + zero_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0], keepdim=True, unbiased=True) + + self.assertEqual(zero_dim_std_keepdim, zero_dim_std_cpu_keepdim) + + zero_one_dim_std = torch.std(x, dim=[0, 1], unbiased=True) + zero_one_dim_std_cpu = torch.std(cpu_x, dim=[0, 1], unbiased=True) + + self.assertEqual(zero_one_dim_std, zero_one_dim_std_cpu) + + zero_one_dim_std_keepdim = torch.std(x, dim=[0, 1], keepdim=True, unbiased=True) + zero_one_dim_std_cpu_keepdim = torch.std(cpu_x, dim=[0, 1], keepdim=True, unbiased=True) + + self.assertEqual(zero_one_dim_std_keepdim, zero_one_dim_std_cpu_keepdim) + + two_three_dim_std = torch.std(x, dim=[2, 3], unbiased=True) + two_three_dim_std_cpu = torch.std(cpu_x, dim=[2, 3], unbiased=True) + + self.assertEqual(two_three_dim_std, two_three_dim_std_cpu) + + two_three_keepdim_std = torch.std(x, dim=[2, 3], keepdim=True, unbiased=True) + two_three_dim_keepstd_cpu = torch.std(cpu_x, dim=[2, 3], keepdim=True, unbiased=True) + + self.assertEqual(two_three_keepdim_std, two_three_dim_keepstd_cpu) + + helper((4, 5, 6, 7)) + + # Test var + def test_var(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + all_var = torch.var(x, unbiased=False) + all_var_cpu = torch.var(cpu_x, unbiased=False) + + self.assertEqual(all_var, all_var_cpu) + + nil_dim_var = torch.var(x, dim=[], unbiased=False) + nil_dim_var_cpu = torch.var(cpu_x, dim=[], unbiased=False) + + self.assertEqual(nil_dim_var, nil_dim_var_cpu) + + nil_dim_var_keepdim = torch.var(x, dim=[], keepdim=True, unbiased=False) + nil_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[], keepdim=True, unbiased=False) + + self.assertEqual(nil_dim_var_keepdim, nil_dim_var_cpu_keepdim) + + zero_dim_var = torch.var(x, dim=[0], unbiased=False) + zero_dim_var_cpu = torch.var(cpu_x, dim=[0], unbiased=False) + + self.assertEqual(zero_dim_var, zero_dim_var_cpu) + + zero_dim_var_keepdim = torch.var(x, dim=[0], keepdim=True, unbiased=False) + zero_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0], keepdim=True, unbiased=False) + + self.assertEqual(zero_dim_var_keepdim, zero_dim_var_cpu_keepdim) + + zero_one_dim_var = torch.var(x, dim=[0, 1], unbiased=False) + zero_one_dim_var_cpu = torch.var(cpu_x, dim=[0, 1], unbiased=False) + + self.assertEqual(zero_one_dim_var, zero_one_dim_var_cpu) + + zero_one_dim_var_keepdim = torch.var(x, dim=[0, 1], keepdim=True, unbiased=False) + zero_one_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0, 1], keepdim=True, unbiased=False) + + self.assertEqual(zero_one_dim_var_keepdim, zero_one_dim_var_cpu_keepdim) + + two_three_dim_var = torch.var(x, dim=[2, 3], unbiased=False) + two_three_dim_var_cpu = torch.var(cpu_x, dim=[2, 3], unbiased=False) + + self.assertEqual(two_three_dim_var, two_three_dim_var_cpu) + + two_three_keepdim_var = 
torch.var(x, dim=[2, 3], keepdim=True, unbiased=False) + two_three_dim_keepvar_cpu = torch.var(cpu_x, dim=[2, 3], keepdim=True, unbiased=False) + + self.assertEqual(two_three_keepdim_var, two_three_dim_keepvar_cpu) + + all_var = torch.var(x, unbiased=True) + all_var_cpu = torch.var(cpu_x, unbiased=True) + + self.assertEqual(all_var, all_var_cpu) + + nil_dim_var = torch.var(x, dim=[], unbiased=True) + nil_dim_var_cpu = torch.var(cpu_x, dim=[], unbiased=True) + + self.assertEqual(nil_dim_var, nil_dim_var_cpu) + + nil_dim_var_keepdim = torch.var(x, dim=[], keepdim=True, unbiased=True) + nil_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[], keepdim=True, unbiased=True) + + self.assertEqual(nil_dim_var_keepdim, nil_dim_var_cpu_keepdim) + + zero_dim_var = torch.var(x, dim=[0], unbiased=True) + zero_dim_var_cpu = torch.var(cpu_x, dim=[0], unbiased=True) + + self.assertEqual(zero_dim_var, zero_dim_var_cpu) + + zero_dim_var_keepdim = torch.var(x, dim=[0], keepdim=True, unbiased=True) + zero_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0], keepdim=True, unbiased=True) + + self.assertEqual(zero_dim_var_keepdim, zero_dim_var_cpu_keepdim) + + zero_one_dim_var = torch.var(x, dim=[0, 1], unbiased=True) + zero_one_dim_var_cpu = torch.var(cpu_x, dim=[0, 1], unbiased=True) + + self.assertEqual(zero_one_dim_var, zero_one_dim_var_cpu) + + zero_one_dim_var_keepdim = torch.var(x, dim=[0, 1], keepdim=True, unbiased=True) + zero_one_dim_var_cpu_keepdim = torch.var(cpu_x, dim=[0, 1], keepdim=True, unbiased=True) + + self.assertEqual(zero_one_dim_var_keepdim, zero_one_dim_var_cpu_keepdim) + + two_three_dim_var = torch.var(x, dim=[2, 3], unbiased=True) + two_three_dim_var_cpu = torch.var(cpu_x, dim=[2, 3], unbiased=True) + + self.assertEqual(two_three_dim_var, two_three_dim_var_cpu) + + two_three_keepdim_var = torch.var(x, dim=[2, 3], keepdim=True, unbiased=True) + two_three_dim_keepvar_cpu = torch.var(cpu_x, dim=[2, 3], keepdim=True, unbiased=True) + + self.assertEqual(two_three_keepdim_var, two_three_dim_keepvar_cpu) + + helper((4, 5, 6, 7)) + + # test norm_out + # CRASH in Fallback for svd_linalg op. 
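+    # Assuming the crash is limited to the SVD-backed 'nuc' (nuclear norm) path, a
+    # reduced parity check that sticks to vector norms could still run -- an
+    # illustrative sketch only, not part of the original test and untested on MPS:
+    #
+    #   cpu_x = torch.randn(4, 5, 6, 7)
+    #   x = cpu_x.detach().clone().to('mps')
+    #   for p_val in [1, 2, float('inf')]:
+    #       self.assertEqual(torch.norm(x, p=p_val), torch.norm(cpu_x, p=p_val))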
+ # def test_norm(self): + # def helper(shape): + # cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + # x = cpu_x.detach().clone().to('mps') + # p_vals = [ ] + # for i in range(-5, 6): + # p_vals.append(i) + # p_vals.append(random.uniform(5.1, 10.1)) + + # p_vals.append(float('inf')) + # p_vals.append(float('-inf')) + # p_vals.append('fro') + # p_vals.append('nuc') + # # ints + # for p_val in p_vals: + # if (p_val != 'nuc'): + # all_norm = torch.norm(x, p=p_val) + # all_norm_cpu = torch.norm(cpu_x, p=p_val) + + # self.assertEqual(all_norm, all_norm_cpu) + + # nil_dim_norm = torch.norm(x, dim=[], p=p_val) + # nil_dim_norm_cpu = torch.norm(cpu_x, dim=[], p=p_val) + + # self.assertEqual(nil_dim_norm, nil_dim_norm_cpu) + + # nil_dim_norm_keepdim = torch.norm(x, dim=[], keepdim=True, p=p_val) + # nil_dim_norm_cpu_keepdim = torch.norm(cpu_x, dim=[], keepdim=True, p=p_val) + + # self.assertEqual(nil_dim_norm_keepdim, nil_dim_norm_cpu_keepdim) + + # zero_dim_norm = torch.norm(x, dim=[0], p=p_val) + # zero_dim_norm_cpu = torch.norm(cpu_x, dim=[0], p=p_val) + + # self.assertEqual(zero_dim_norm, zero_dim_norm_cpu) + + # zero_dim_norm_keepdim = torch.norm(x, dim=[0], keepdim=True, p=p_val) + # zero_dim_norm_cpu_keepdim = torch.norm(cpu_x, dim=[0], keepdim=True, p=p_val) + + # self.assertEqual(zero_dim_norm_keepdim, zero_dim_norm_cpu_keepdim) + + # if (len(shape) > 1): + # zero_one_dim_norm = torch.norm(x, dim=[0, 1],p=p_val) + # zero_one_dim_norm_cpu = torch.norm(cpu_x, dim=[0, 1],p=p_val) + + # self.assertEqual(zero_one_dim_norm, zero_one_dim_norm_cpu) + + # zero_one_dim_norm_keepdim = torch.norm(x, dim=[0, 1], keepdim=True, p=p_val) + # zero_one_dim_norm_cpu_keepdim = torch.norm(cpu_x, dim=[0, 1], keepdim=True, p=p_val) + + # self.assertEqual(zero_one_dim_norm_keepdim, zero_one_dim_norm_cpu_keepdim) + + # zero_one_dim_norm = torch.norm(x, dim=[0, 1],p='fro') + + # if (len(shape) > 3): + # two_three_dim_norm = torch.norm(x, dim=[2,3], p=p_val) + # two_three_dim_norm_cpu = torch.norm(cpu_x, dim=[2,3], p=p_val) + + # self.assertEqual(two_three_dim_norm, two_three_dim_norm_cpu) + + # two_three_keepdim_norm = torch.norm(x, dim=[2,3], keepdim=True, p=p_val) + # two_three_dim_keepnorm_cpu = torch.norm(cpu_x, dim=[2, 3], keepdim=True,p=p_val) + + # self.assertEqual(two_three_keepdim_norm, two_three_dim_keepnorm_cpu) + + # helper((5, 1)) + # helper((5, 7)) + # helper((4, 5, 6, 7)) + + # Test minimum and maximum + def test_minimum_maximum(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + cpu_y = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + mps_x = cpu_x.detach().clone().to('mps') + mps_y = cpu_y.detach().clone().to('mps') + + minimum_result_cpu = torch.minimum(cpu_x, cpu_y) + minimum_result_mps = torch.minimum(mps_x, mps_y) + self.assertEqual(minimum_result_cpu, minimum_result_mps) + + maximum_result_cpu = torch.maximum(cpu_x, cpu_y) + maximum_result_mps = torch.maximum(mps_x, mps_y) + self.assertEqual(maximum_result_cpu, maximum_result_mps) + + helper(1, 1, 4, 5) + + # Test clamp_min + def test_clamp_min(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_min_t = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + min_t = cpu_min_t.detach().clone().to('mps') + + clamp_min_result = torch.clamp_min(x, min=5.0) + 
clamp_min_result_cpu = torch.clamp_min(cpu_x, min=5.0) + + self.assertEqual(clamp_min_result, clamp_min_result_cpu) + + clamp_min_t_result = torch.clamp_min(x, min=min_t) + clamp_min_t_result_cpu = torch.clamp_min(cpu_x, min=cpu_min_t) + + self.assertEqual(clamp_min_t_result, clamp_min_t_result_cpu) + + helper(2, 8, 4, 5) + + # Test clamp_max + + def test_clamp_max(self): + def helper(n, c, h, w): + cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_max_t = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) + max_t = cpu_max_t.detach().clone().to('mps') + + clamp_max_result = torch.clamp_max(x, max=100.0) + clamp_max_result_cpu = torch.clamp_max(cpu_x, max=100.0) + + self.assertEqual(clamp_max_result, clamp_max_result_cpu) + + clamp_max_t_result = torch.clamp_max(x, max=max_t) + clamp_max_t_result_cpu = torch.clamp_max(cpu_x, max=cpu_max_t) + + self.assertEqual(clamp_max_t_result, clamp_max_t_result_cpu) + + helper(2, 8, 4, 5) + + # Test clamp + def test_clamp(self): + def helper(n, c, h, w): + import numpy as np + upper_bound = 1000 + half_upper_bound = upper_bound / 2 + + # x=[0..1000) + x_arr = upper_bound * np.random.random_sample(size=(n, c, h, w)).astype(np.float32) + cpu_x = torch.tensor(x_arr, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + # x=[0..500) + min_arr = half_upper_bound * np.random.random_sample(size=(n, c, h, w)).astype(np.float32) + cpu_min_t = torch.tensor(min_arr, device='cpu', dtype=torch.float, requires_grad=False) + min_t = cpu_min_t.detach().clone().to('mps') + + # x=[500..1000), to ensure max's are greater than mins + max_arr = (half_upper_bound * np.random.random_sample(size=(n, c, h, w)).astype(np.float32)) + half_upper_bound + cpu_max_t = torch.tensor(max_arr, device='cpu', dtype=torch.float, requires_grad=False) + max_t = cpu_max_t.detach().clone().to('mps') + + # [200..600]: just an arbitrary range between [0..1000] + clamp_result = torch.clamp(x, min=200.0, max=600.0) + clamp_result_cpu = torch.clamp(cpu_x, min=200.0, max=600.0) + self.assertEqual(clamp_result, clamp_result_cpu) + + # test optional scalar refs and cached graph keys by passing only max + clamp_opt_result = torch.clamp(x, max=600.0) + clamp_opt_result_cpu = torch.clamp(cpu_x, max=600.0) + self.assertEqual(clamp_opt_result, clamp_opt_result_cpu) + + clamp_t_result = torch.clamp(x, min=min_t, max=max_t) + clamp_t_result_cpu = torch.clamp(cpu_x, min=cpu_min_t, max=cpu_max_t) + self.assertEqual(clamp_t_result, clamp_t_result_cpu) + + # test optional tensor refs and cached graph keys by passing only max + clamp_topt_result = torch.clamp(x, max=max_t) + clamp_topt_result_cpu = torch.clamp(cpu_x, max=cpu_max_t) + self.assertEqual(clamp_topt_result, clamp_topt_result_cpu) + + # test inplace clamping + x.clamp_(min=200.0, max=600.0) + cpu_x.clamp_(min=200.0, max=600.0) + self.assertEqual(cpu_x, x) + + helper(2, 8, 4, 5) + + def test_divmode(self): + def helper(shape, rounding_mode): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + mps_x = cpu_x.detach().clone().to('mps') + # clamp to avoid division by 0 + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False).clamp_min_(0.1) + mps_y = cpu_y.detach().clone().to('mps') + + result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode) + result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode) + 
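+            # rounding_mode='floor' rounds the quotient toward negative infinity
+            # (Python-style floor division), while 'trunc' rounds toward zero; the
+            # MPS result should match the CPU reference elementwise in both cases.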
self.assertEqual(result_div_mps, result_div_cpu) + + helper((2, 8, 4, 5), "floor") + helper((2, 8, 4, 5), "trunc") + + def test_rounding(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + mps_x = cpu_x.detach().clone().to('mps') + + result_floor_cpu = torch.floor(cpu_x) + result_floor_mps = torch.floor(mps_x) + self.assertEqual(result_floor_mps, result_floor_cpu) + + result_ceil_cpu = torch.ceil(cpu_x) + result_ceil_mps = torch.ceil(mps_x) + self.assertEqual(result_ceil_mps, result_ceil_cpu) + + result_trunc_cpu = torch.trunc(cpu_x) + result_trunc_mps = torch.trunc(mps_x) + self.assertEqual(result_trunc_mps, result_trunc_cpu) + + result_round_cpu = torch.round(cpu_x) + result_round_mps = torch.round(mps_x) + self.assertEqual(result_round_mps, result_round_cpu) + + helper((2, 6, 3, 5)) + helper((2, 8, 4, 5)) + + def test_expand(self): + def helper(n, c): + values = [[1.0], [4.0], [7.0]] + cpu_x = torch.tensor(values, device='cpu') + x = cpu_x.detach().clone().to('mps') + + strided_cpu = torch.as_strided(cpu_x, (3, 4), (1, 0)) + strided_mps = torch.as_strided(x, (3, 4), (1, 0)) + + print(cpu_x) + print(strided_cpu) + + print(x.to('cpu')) + print(strided_mps.to('cpu')) + + print(strided_mps.size()) + print(strided_mps.stride()) + + self.assertEqual(strided_mps, strided_cpu) + + helper(3, 1) + + def test_select(self): + def helper(n, c): + cpu_x = torch.randn(n, c, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + strided_cpu = torch.as_strided(cpu_x, (3, 1), (3, 1)) + strided_mps = torch.as_strided(x, (3, 1), (3, 1)) + self.assertEqual(strided_mps, strided_cpu) + + strided_cpu = torch.as_strided(cpu_x, (1, 3), (3, 1)) + strided_mps = torch.as_strided(x, (1, 3), (3, 1)) + self.assertEqual(strided_mps, strided_cpu) + + strided_cpu = torch.as_strided(cpu_x, (3, 1), (3, 1), storage_offset=1) + strided_mps = torch.as_strided(x, (3, 1), (3, 1), storage_offset=1) + print(cpu_x) + print(strided_cpu) + + print(x.to('cpu')) + print(strided_mps.to('cpu')) + + self.assertEqual(strided_mps, strided_cpu) + + helper(3, 3) + + def test_topk(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + for largest_val in [True, False]: + if (type(shape) == tuple): + for curr_dim in range(0, len(shape)): + dim_size = shape[curr_dim] + for k in range(1, dim_size + 1): + topk_values, topk_indices = torch.topk(x, k, dim=curr_dim, largest=largest_val) + topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=curr_dim, largest=largest_val) + self.assertEqual(topk_values, topk_values_cpu) + self.assertEqual(topk_indices, topk_indices_cpu) + else: + for k in range(1, shape): + topk_values, topk_indices = torch.topk(x, k, dim=0, largest=largest_val) + topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=0, largest=largest_val) + self.assertEqual(topk_values, topk_values_cpu) + self.assertEqual(topk_indices, topk_indices_cpu) + + helper(2) + helper((5, 1)) + helper((1, 5)) + helper((5, 9, 7, 4)) + + def test_upsample_nearest_exact2d(self): + def helper(N, C, H, W): + inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, + requires_grad=True).reshape(N, C, H, W) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + outputCPU = torch.nn.functional.interpolate(inputCPU, size=(5, 5), mode='nearest-exact') + outputMPS = 
torch.nn.functional.interpolate(inputMPS, size=(5, 5), mode='nearest-exact') + + self.assertEqual(outputCPU, outputMPS) + + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.3)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.3)) + + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper(1, 1, 4, 4) + helper(7, 5, 3, 2) + + def test_upsample_nearest2d(self): + def helper(N, C, H, W): + inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, + requires_grad=True).reshape(N, C, H, W) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + x_max = 40 + y_max = 40 + + for i in range(1, x_max): + for j in range(1, y_max): + upsample_nearest2d = nn.UpsamplingNearest2d(scale_factor=(i, j)) + + outputCPU = upsample_nearest2d(inputCPU) + outputMPS = upsample_nearest2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + upsample_nearest2d = nn.UpsamplingNearest2d((i * H, j * W)) + + outputCPU = upsample_nearest2d(inputCPU) + outputMPS = upsample_nearest2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.3)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.3)) + + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper(1, 1, 4, 4) + helper(7, 5, 3, 2) + + def test_upsample_bilinear2d(self): + def helper(N, C, H, W): + inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float, + requires_grad=True).reshape(N, C, H, W) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + x_max = 40 + y_max = 40 + + for i in range(1, x_max): + for j in range(1, y_max): + upsample_bilinear2d = nn.UpsamplingBilinear2d(scale_factor=(i, j)) + + outputCPU = upsample_bilinear2d(inputCPU) + outputMPS = upsample_bilinear2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + + upsample_bilinear2d = nn.UpsamplingBilinear2d((i * H, j * W)) + + outputCPU = upsample_bilinear2d(inputCPU) + outputMPS = upsample_bilinear2d(inputMPS) + + self.assertEqual(outputCPU, outputMPS) + + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.3)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.3)) + + self.assertEqual(inputCPU.grad, inputMPS.grad) + + helper(1, 1, 4, 4) + helper(7, 5, 3, 2) + + # Test concat forward + def test_cat1(self): + def helper(shape_x, shape_y, shape_z): + cpu_x = torch.randn(shape_x, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape_y, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape_z, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + cat = torch.cat([x, y, z], dim=1) + cat_cpu = torch.cat([cpu_x, cpu_y, cpu_z], dim=1) + + self.assertEqual(cat, cat_cpu) + + helper([2, 2, 4, 5], [2, 3, 4, 5], [2, 5, 4, 5]) + # Empty test - Currently failing! Empty tensor not handled! 
+ # helper([0, 2, 4, 5], [2, 0, 4, 5], [2, 5, 0, 5]) + + def test_pad(self): + def helper(shape, padding, op): + inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + inputCPU.retain_grad() + inputMPS = inputCPU.detach().clone().to('mps').requires_grad_() + + padCriteria = op(padding) + outputCPU = padCriteria(inputCPU) + outputMPS = padCriteria(inputMPS) + self.assertEqual(outputCPU, outputMPS) + + # backward pass (chose 0.6 just to have the grad_output != 1) + outputCPU.backward(gradient=torch.full_like(outputCPU, 0.6)) + outputMPS.backward(gradient=torch.full_like(outputMPS, 0.6)) + self.assertEqual(inputCPU.grad, inputMPS.grad) + + # 1D Padding + helper((2, 4, 3), 2, nn.ReflectionPad1d) + # verify if a change in shape of input would cause problems with graph caching + helper((2, 4, 4), (1, 3), nn.ReflectionPad1d) + # Replication 1D + helper((2, 1, 6), 3, nn.ReplicationPad1d) + + # 2D Padding + helper((1, 2, 3, 4), (1, 1, 2, 0), nn.ReflectionPad2d) + # verify if a change in shape of input would cause problems with graph caching + helper((2, 4, 3, 4), (1, 1, 2, 0), nn.ReflectionPad2d) + # this should make the padding (2, 2, 2, 2) + helper((2, 1, 6, 8), 2, nn.ReplicationPad2d) + # verify if a change in shape of padding would cause problems with graph caching + helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d) + + # 3D Padding + helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReflectionPad3d) + # verify if a change in shape of padding would cause problems with graph caching + helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReplicationPad3d) + + # Test stack forward + def test_stack(self): + # All shapes must be same + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + stack = torch.stack([x, y, z], dim=1) + stack_cpu = torch.stack([cpu_x, cpu_y, cpu_z], dim=1) + + self.assertEqual(stack, stack_cpu) + + helper([2, 8, 4, 5]) + # Empty test - Currently failing! Empty tensor not handled! 
+ # helper([0, 2, 4, 5]) + + # Test abs + def test_abs(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + abs_result = torch.abs(x) + abs_result_cpu = torch.abs(cpu_x) + + self.assertEqual(abs_result, abs_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_result = torch.log(x) + log_result_cpu = torch.log(cpu_x) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log_ten(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_ten_result = torch.log10(x) + log_ten_result_cpu = torch.log10(cpu_x) + + self.assertEqual(log_ten_result, log_ten_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log_two(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_two_result = torch.log2(x) + log_two_result_cpu = torch.log2(cpu_x) + + self.assertEqual(log_two_result, log_two_result_cpu) + + helper((2, 8, 4, 5)) + + def test_log1p(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + log_result = torch.log1p(x) + log_result_cpu = torch.log1p(cpu_x) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + def test_logaddexp(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + log_result = torch.logaddexp(x, y) + log_result_cpu = torch.logaddexp(cpu_x, cpu_y) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + def test_logaddexp2(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + log_result = torch.logaddexp2(x, y) + log_result_cpu = torch.logaddexp2(cpu_x, cpu_y) + + self.assertEqual(log_result, log_result_cpu) + + helper((2, 8, 4, 5)) + + # Test concat forward + def test_cat2(self): + + def helper1(shape_x, shape_y, shape_z, shape_w): + cpu_x = torch.randn(shape_x, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape_y, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape_z, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + cpu_w = torch.randn(shape_w, device='cpu', dtype=torch.float, requires_grad=False) + w = cpu_w.detach().clone().to('mps') + + cat = torch.cat([x, y, z, w], dim=1) + cat_cpu = torch.cat([cpu_x, cpu_y, cpu_z, cpu_w], dim=1) + + self.assertEqual(cat, cat_cpu) + + def helper(shape_x, shape_y, shape_z): + cpu_x = torch.randn(shape_x, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape_y, device='cpu', dtype=torch.float, requires_grad=False) + y = 
cpu_y.detach().clone().to('mps') + + cpu_z = torch.randn(shape_z, device='cpu', dtype=torch.float, requires_grad=False) + z = cpu_z.detach().clone().to('mps') + + cat = torch.cat([x, y, z], dim=1) + cat_cpu = torch.cat([cpu_x, cpu_y, cpu_z], dim=1) + + self.assertEqual(cat, cat_cpu) + + helper([2, 8, 4, 5], [2, 10, 4, 5], [2, 6, 4, 5]) + helper([2, 2, 4, 5], [2, 3, 4, 5], [2, 5, 4, 5]) + # Empty test - Currently failing! Empty tensor not handled! + # helper([0, 2, 4, 5], [2, 0, 4, 5], [2, 5, 0, 5]) + + # Test isnan + def test_isnan(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + nan_index = [random.randrange(0, shape[0])] + # make a selected row inf + cpu_x.index_put_(indices=[torch.tensor(nan_index)], values=torch.tensor(float('nan'))) + x = cpu_x.detach().clone().to('mps') + + isnan_result = torch.isnan(x) + isnan_result_cpu = torch.isnan(cpu_x) + + self.assertEqual(isnan_result, isnan_result_cpu) + + helper((8, 2, 4, 5)) + + # Test reciprocal + def test_reciprocal(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + reciprocal_result = torch.reciprocal(x) + reciprocal_result_cpu = torch.reciprocal(cpu_x) + + cpu_grad = torch.ones_like(reciprocal_result_cpu) + grad = cpu_grad.to('mps') + + reciprocal_result.backward(gradient=grad) + reciprocal_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(reciprocal_result, reciprocal_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + + # Test sqrt + def test_sqrt(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + sqrt_result = torch.sqrt(x) + sqrt_result_cpu = torch.sqrt(cpu_x) + + cpu_grad = torch.ones_like(sqrt_result_cpu) + grad = cpu_grad.to('mps') + + sqrt_result.backward(gradient=grad) + sqrt_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(sqrt_result, sqrt_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + + # Test selu, elu, celu + def test_elu(self): + def helper(shape, alpha=1.0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + for activation_func in [torch.nn.ELU(alpha=alpha), torch.nn.CELU(alpha=alpha), torch.nn.SELU()]: + elu_result = activation_func(x) + elu_result_cpu = activation_func(cpu_x) + + cpu_grad = torch.randn(elu_result_cpu.shape) + grad = cpu_grad.to('mps') + + elu_result.backward(gradient=grad) + elu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(elu_result, elu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [[], (2, 3), (2, 8, 4, 5)]: + for alpha in [0.000001, 1.0, 2.3, 0.34, 23]: + helper(shape, alpha) + # Test silu + + def test_silu(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + silu_result = torch.nn.SiLU()(x) + silu_result_cpu = torch.nn.SiLU()(cpu_x) + + cpu_grad = torch.randn(silu_result_cpu.shape) + grad = cpu_grad.to('mps') + + silu_result.backward(gradient=grad) + silu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(silu_result, silu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [[], (2, 3), (2, 8, 4, 5)]: + 
helper(shape) + + # Test adaptive avg pool2d - when the input size is a multiple of output size + # Not testing for channels last right now + def test_adaptive_avg_pool2d_simple(self): + def helper(input_shape, out_shape, channels_last): + cpu_x = torch.randn(input_shape, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = cpu_x.detach().clone().to('mps').requires_grad_() + + avg_result = torch.nn.AdaptiveAvgPool2d(out_shape)(x) + avg_result_cpu = torch.nn.AdaptiveAvgPool2d(out_shape)(cpu_x) + + cpu_grad = torch.randn(avg_result_cpu.shape) + grad = cpu_grad.to('mps') + + avg_result.backward(gradient=grad) + avg_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(avg_result, avg_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 2, 4, 4), (2, 2), False) + helper((2, 2, 9, 9), (3, 3), False) + helper((2, 2, 9, 9), (9, 9), False) + helper((2, 2, 16, 16), (2, 2), False) + helper((2, 2, 16, 16), (2, 16), False) + + helper((2, 16, 16), (4, 4), False) + + def test_gelu_simple(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + gelu_result = torch.nn.GELU()(x) + gelu_result_cpu = torch.nn.GELU()(cpu_x) + + cpu_grad = torch.ones_like(gelu_result_cpu) + grad = cpu_grad.to('mps') + + gelu_result.backward(gradient=grad) + gelu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(gelu_result, gelu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [(0, 3), [], (2, 3), (2, 8, 4, 5)]: + helper(shape) + + # Test hardtanh + def test_hardtanh(self): + def helper(shape, min_val, max_val, inplace=False): + cpu_x = None + x = None + + if(not inplace): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + else: + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + hardtanh_result = torch.nn.Hardtanh(min_val=min_val, max_val=max_val, inplace=inplace)(x) + hardtanh_result_cpu = torch.nn.Hardtanh(min_val=min_val, max_val=max_val, inplace=inplace)(cpu_x) + + self.assertEqual(hardtanh_result, hardtanh_result_cpu) + + if(not inplace): + cpu_grad = torch.randn(hardtanh_result_cpu.shape) + grad = cpu_grad.to('mps') + hardtanh_result.backward(gradient=grad) + hardtanh_result_cpu.backward(gradient=cpu_grad) + self.assertEqual(x.grad, cpu_x.grad) + + # Test empty shape too + for shape in [(0, 3), [], (2, 3), (2, 8, 4, 5)]: + for min_val, max_val in zip([-1, -2, 3], [1, -1, 4]): + helper(shape, min_val, max_val) + helper(shape, min_val, max_val, inplace=True) + + # Test sign + def test_sign(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + sign_result = torch.sign(x) + sign_result_cpu = torch.sign(cpu_x) + + cpu_grad = torch.ones_like(sign_result_cpu) + grad = cpu_grad.to('mps') + + sign_result.backward(gradient=grad) + sign_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(sign_result, sign_result_cpu) + + helper((2, 8, 4, 5)) + + # Test neg + def test_neg(self): + def helper(shape): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + neg_result = torch.neg(x) + 
neg_result_cpu = torch.neg(cpu_x) + + cpu_grad = torch.ones_like(neg_result_cpu) + grad = cpu_grad.to('mps') + + neg_result.backward(gradient=grad) + neg_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(neg_result, neg_result_cpu) + + helper((2, 8, 4, 5)) + + # Test index select + def test_index_select(self): + def helper(shape, dim, index, idx_dtype=torch.int32): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + print(cpu_idx.shape) + + idx_result = torch.index_select(x, dim=dim, index=idx) + idx_result_cpu = torch.index_select(cpu_x, dim=dim, index=cpu_idx) + + self.assertEqual(idx_result, idx_result_cpu) + + helper((2, 8, 4, 5), 0, [1]) + helper((8, 8, 4, 5), 0, [0, 3, 2, 7, 6]) + helper((2, 8, 4, 5), 1, [0, 3, 2, 7, 6]) + helper((2, 8, 4, 5), 2, [3, 0, 1]) + helper((2, 8, 4, 5), 3, [2, 3, 0]) + helper((2, 3, 3), -1, [1, 2]) + + def test_embedding_dense_backward(self): + def helper(n, d, m): + embeddingMPS = nn.Embedding(n, d, max_norm=True, device='mps') + W_MPS = torch.randn((m, d), requires_grad=True, device='mps') + idx_MPS = torch.tensor([0, 1, 2]).to('mps') + a_MPS = embeddingMPS.weight.clone() @ W_MPS.t() # weight must be cloned for this to be differentiable + a_MPS.retain_grad() + b_MPS = embeddingMPS(idx_MPS) @ W_MPS.t() # modifies weight in-place + b_MPS.retain_grad() + out_MPS = (a_MPS.unsqueeze(0) + b_MPS.unsqueeze(1)) + loss_MPS = out_MPS.sigmoid().prod() + loss_MPS.backward() + + embeddingCPU = nn.Embedding(n, d, max_norm=True, scale_grad_by_freq=True) + W_CPU = W_MPS.to('cpu') + idx_CPU = torch.tensor([0, 1, 2]) + a_CPU = embeddingCPU.weight.clone() @ W_CPU.t() # weight must be cloned for this to be differentiable + a_CPU.retain_grad() + b_CPU = embeddingCPU(idx_CPU) @ W_CPU.t() # modifies weight in-place + b_CPU.retain_grad() + out_CPU = (a_CPU.unsqueeze(0) + b_CPU.unsqueeze(1)) + loss_CPU = out_CPU.sigmoid().prod() + loss_CPU.backward() + + self.assertEqual(b_CPU.grad, b_MPS.grad) + self.assertEqual(a_CPU.grad, a_MPS.grad) + + helper(3, 5, 7) + + # Test pytorch gather + def test_gather(self): + def helper(shape, dim, idx_shape, idx_dtype=torch.int64): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + # Indices should be taken from range of axis along which gathering is done + idx_np = np.random.randint(0, shape[dim], idx_shape) + + cpu_idx = torch.tensor(idx_np, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + gather_result = torch.gather(x, dim=dim, index=idx) + gather_result_cpu = torch.gather(cpu_x, dim=dim, index=cpu_idx) + + cpu_grad = torch.randn(idx_shape, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + gather_result.backward(gradient=grad) + gather_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(gather_result, gather_result_cpu) + self.assertEqual(cpu_x.grad, x.grad) + + helper((6, 3, 3), 0, (3, 3, 3)) + helper((2, 3, 3, 3), 0, (10, 3, 3, 3)) + helper((2, 8, 4, 5), 0, (10, 8, 4, 5)) + helper((2, 8, 4, 5), 0, (10, 6, 3, 2)) + helper((8, 8, 4, 5), 0, (6, 8, 4, 5)) + helper((8, 8, 4, 5), 0, (6, 7, 2, 3)) + helper((2, 8, 4, 5), 1, (2, 5, 3, 4)) + helper((2, 8, 4, 5), 2, (1, 8, 10, 3)) + helper((2, 8, 4, 5), 3, (2, 5, 3, 12)) + + # Test pytorch scatter_add and scatter + def test_scatter_add(self): + def helper(shape, dim, 
idx_shape, src_shape, idx_dtype=torch.int64, do_add=True): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_src = torch.randn(src_shape, device='cpu', dtype=torch.float, requires_grad=True) + src = cpu_src.detach().clone().to('mps').requires_grad_() + + # Indices should be taken from range of axis along which gathering is done + idx_np = None + if(do_add): + idx_np = np.random.randint(0, shape[dim], idx_shape) + else: + idx_np = np.array([[0, 1, 2], + [1, 2, 3], + [2, 3, 4], + [3, 4, 5], + [4, 5, 6]]) + + cpu_idx = torch.tensor(idx_np, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + scatter_result = None + scatter_result_cpu = None + + if(do_add): + scatter_result = torch.scatter_add(x, dim=dim, index=idx, src=src) + scatter_result_cpu = torch.scatter_add(cpu_x, dim=dim, index=cpu_idx, src=cpu_src) + else: + scatter_result = torch.scatter(x, dim=dim, index=idx, src=src) + scatter_result_cpu = torch.scatter(cpu_x, dim=dim, index=cpu_idx, src=cpu_src) + + cpu_grad = None + grad = None + + if(idx_shape == src_shape): + cpu_grad = torch.randn(shape, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + scatter_result.backward(gradient=grad) + scatter_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(scatter_result, scatter_result_cpu) + if(idx_shape == src_shape): + self.assertEqual(cpu_x.grad, x.grad) + self.assertEqual(cpu_src.grad, src.grad) + + helper((2, 3), 0, (5, 3), (5, 3)) + helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5)) + helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5)) + helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2)) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2)) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5)) + + helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5)) + helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2)) + helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3)) + helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3)) + + helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8)) + helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6)) + helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6)) + + # Test scatter src + helper((8, 3), 0, (5, 3), (5, 3), do_add=False) + helper((10, 3), 0, (5, 3), (5, 8), do_add=False) + + # Test pytorch scatter_reduce + def test_scatter_reduce(self): + def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str="sum"): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_src = torch.randn(src_shape, device='cpu', dtype=torch.float, requires_grad=True) + src = cpu_src.detach().clone().to('mps').requires_grad_() + + # Indices should be taken from range of axis along which gathering is done + idx_np = np.random.randint(0, shape[dim], idx_shape) + + cpu_idx = torch.tensor(idx_np, device='cpu', dtype=idx_dtype) + idx = cpu_idx.detach().clone().to('mps') + + scatter_result = torch.scatter(x, dim=dim, index=idx, src=src, reduce=reduce_str) + scatter_result_cpu = torch.scatter(cpu_x, dim=dim, index=cpu_idx, src=cpu_src, reduce=reduce_str) + + self.assertEqual(scatter_result, scatter_result_cpu) + + # for reduce in ["sum", "prod", "amax", "amin"]: + for reduce in ["add", "multiply"]: + helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce) + helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce) + helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), 
reduce_str=reduce) + helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2), reduce_str=reduce) + helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5), reduce_str=reduce) + + helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5), reduce_str=reduce) + helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2), reduce_str=reduce) + helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3), reduce_str=reduce) + helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3), reduce_str=reduce) + + helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8), reduce_str=reduce) + helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6), reduce_str=reduce) + helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6), reduce_str=reduce) + + def test_is_nonzero(self): + self.assertFalse(torch.is_nonzero(torch.tensor([0.]).to('mps'))) + self.assertTrue(torch.is_nonzero(torch.tensor([1.5]).to('mps'))) + self.assertFalse(torch.is_nonzero(torch.tensor([False]).to('mps'))) + self.assertTrue(torch.is_nonzero(torch.tensor([3]).to('mps'))) + + # Test triu + def test_triu(self): + def helper(shape, diag=0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + triu_result = torch.triu(x, diag) + triu_result_cpu = torch.triu(cpu_x, diag) + + cpu_grad = torch.randn(triu_result_cpu.shape) + grad = cpu_grad.to('mps') + + triu_result.backward(gradient=grad) + triu_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(triu_result, triu_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + helper((2, 8, 4, 5), diag=1) + helper((2, 8, 4, 5), diag=2) + helper((2, 8, 4, 5), diag=3) + helper((2, 8, 4, 5), diag=-1) + helper((2, 8, 4, 5), diag=-2) + helper((2, 8, 4, 5), diag=-3) + + # Test tril + def test_tril(self): + def helper(shape, diag=0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + tril_result = torch.tril(x, diag) + tril_result_cpu = torch.tril(cpu_x, diag) + + cpu_grad = torch.randn(tril_result_cpu.shape) + grad = cpu_grad.to('mps') + + tril_result.backward(gradient=grad) + tril_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(tril_result, tril_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper((2, 8, 4, 5)) + helper((2, 8, 4, 5), diag=1) + helper((2, 8, 4, 5), diag=2) + helper((2, 8, 4, 5), diag=3) + helper((2, 8, 4, 5), diag=-1) + helper((2, 8, 4, 5), diag=-2) + helper((2, 8, 4, 5), diag=-3) + + # Test diag + def test_diag(self): + def helper(shape, diag=0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + diag_result = torch.diag(x, diag) + diag_result_cpu = torch.diag(cpu_x, diag) + + # cpu_grad = torch.randn(diag_result_cpu.shape) + # grad = cpu_grad.to('mps') + + # diag_result.backward(gradient=grad) + # diag_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(diag_result, diag_result_cpu) + # self.assertEqual(x.grad, cpu_x.grad) + + for shape in [(5, 5), (5, 6), (6, 5), (5,), (6,)]: + for diag in [0, 1, 2, 3, 4, -1, -2, -3, -4]: + helper(shape, diag=diag) + + # Test softmax + def test_softmax(self): + def helper(shape, dim, channels_last=False): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True) + if(channels_last): + cpu_x = cpu_x.to(memory_format=torch.channels_last) + cpu_x.retain_grad() + x = 
cpu_x.detach().clone().to('mps').requires_grad_() + + softmax_result = torch.nn.functional.softmax(x, dim=dim) + softmax_result_cpu = torch.nn.functional.softmax(cpu_x, dim=dim) + + # Currently NOT testing backward for channels last backward + cpu_grad = None + grad = None + + if(not channels_last): + cpu_grad = torch.randn(shape, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + + softmax_result.backward(gradient=grad) + softmax_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(softmax_result, softmax_result_cpu) + if(not channels_last): + self.assertEqual(x.grad, cpu_x.grad) + + def helper2(dim): + cpu_x = torch.tensor(1.23, device='cpu', dtype=torch.float, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + softmax_result = torch.nn.functional.softmax(x, dim=dim) + softmax_result_cpu = torch.nn.functional.softmax(cpu_x, dim=dim) + + cpu_grad = torch.tensor(2.34, device='cpu', dtype=torch.float) + grad = cpu_grad.to('mps') + + softmax_result.backward(gradient=grad) + softmax_result_cpu.backward(gradient=cpu_grad) + + self.assertEqual(softmax_result, softmax_result_cpu) + self.assertEqual(x.grad, cpu_x.grad) + + helper2(0) + + for channels_last in [False, True]: + for shape in [(2, 4, 8, 5), (3, 4, 6, 7, 2)]: + if(len(shape) != 4 and channels_last): + continue + for dim in [0, 1, 2, 3, -1, -2, -3]: + helper(shape, dim, channels_last) + + # Test sub + def test_sub(self): + def helper(shape, alpha): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_out = torch.sub(cpu_x, cpu_y, alpha=alpha) + out = torch.sub(x, y, alpha=alpha) + + self.assertEqual(out, cpu_out) + + helper((2, 8, 4, 5), 0.1) + helper((2, 8, 3, 5), 0.1) + helper((2, 8, 3, 5), 0.2) + + # Test where + def test_where(self): + def helper(shape, x_shape, y_shape, cond_dtype=torch.bool, x_dtype=torch.float): + + cpu_cond = torch.randint(2, shape, device='cpu', dtype=cond_dtype, requires_grad=False) + cond = cpu_cond.detach().clone().to('mps') + + cpu_x = torch.randn(x_shape, device='cpu', dtype=x_dtype, requires_grad=True) + x = cpu_x.detach().clone().to('mps').requires_grad_() + + cpu_y = torch.randn(y_shape, device='cpu', dtype=x_dtype, requires_grad=True) + y = cpu_y.detach().clone().to('mps').requires_grad_() + + cpu_out = torch.where(cpu_cond, cpu_x, cpu_y) + out = torch.where(cond, x, y) + + cpu_grad = torch.randn(cpu_out.shape) + grad = cpu_grad.to('mps') + + cpu_out.backward(gradient=cpu_grad) + out.backward(gradient=grad) + + self.assertEqual(out, cpu_out) + self.assertEqual(x.grad, cpu_x.grad) + self.assertEqual(y.grad, cpu_y.grad) + + for shape in ([(0, 3), [], (2, 3), (9,)]): + helper(shape, shape, shape) + + helper((2, 3, 1), (2, 3, 4), (2, 1, 4)) + helper((2, 1, 1), (2, 3, 4), (1, 3, 4)) + helper((1, 1, 1), (1, 1, 4), (2, 3, 1)) + helper([], (1, 1, 4), (2, 3, 1)) + helper([], (2, 3, 4), []) + + # Test normal + def test_normal(self): + def helper(shape, mean=0.0, std=1.0): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + mps_out = torch.normal(mean, std, shape, device='mps') + + # print(mps_out.to('cpu')) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + mean_array = np.ones(shape) + mean_array *= mean + cpu_mean_tensor = torch.tensor(mean_array, device='cpu', 
dtype=torch.float, requires_grad=False) + mean_tensor = cpu_mean_tensor.detach().clone().to('mps') + + std_array = np.ones(shape) + std_array *= std + cpu_std_tensor = torch.tensor(std_array, device='cpu', dtype=torch.float, requires_grad=False) + std_tensor = cpu_std_tensor.detach().clone().to('mps') + + mps_out = torch.zeros(shape, device='mps') + torch.normal(mean_tensor, std, out=mps_out) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + mps_out = torch.zeros(shape, device='mps') + torch.normal(mean, std_tensor, out=mps_out) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + mps_out = torch.zeros(shape, device='mps') + torch.normal(mean_tensor, std_tensor, out=mps_out) + print(mps_out.to('cpu').mean()) + print(mps_out.to('cpu').std()) + + helper((2, 3, 4, 5, 6)) + helper((100, 100), 2.5, 1.2) + + def test_bernoulli(self): + def helper(shape, prob=0.5): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + prob_array = np.ones(shape) + prob_array *= prob + cpu_prob_tensor = torch.tensor(prob_array, device='cpu', dtype=torch.float, requires_grad=False) + prob_tensor = cpu_prob_tensor.detach().clone().to('mps') + + mps_out = torch.bernoulli(prob_tensor) + # Compare "real" with theoretical values + print(mps_out.to('cpu').mean(), prob) + print(mps_out.to('cpu').std() ** 2, prob * (1 - prob)) + + mps_out = torch.zeros(shape, device='mps') + mps_out = torch.bernoulli(mps_out, prob) + + print(mps_out.to('cpu').mean(), prob) + print(mps_out.to('cpu').std() ** 2, prob * (1 - prob)) + + helper((100, 100), 0.50) + helper((100, 100), 0.76) + helper((100, 100), 0.23) + + # Test random_.to and random_.from + def test_random(self): + def helper(shape, low, high, dtype=torch.int32): + + print(low, high) + mps_out = torch.randint(low, high, shape, dtype=dtype, device='mps') + + print(mps_out.to('cpu').float().mean(), (low + (high - 1)) / 2.) + print(mps_out.to('cpu').float().std() ** 2, ((high - low)**2 - 1) / 12.) 
+ + helper([100, 100], 0, 10) + helper([100, 100], 23, 89) + helper([100, 100], 23, 89, dtype=torch.float32) + helper([100, 100], 23, 89, dtype=torch.int64) + helper([100, 100], 0, 2, dtype=torch.bool) + + # Test add + def test_add_binary_op(self): + def helper(shape, alpha): + cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_out = torch.add(cpu_x, cpu_y, alpha=alpha) + out = torch.add(x, y, alpha=alpha) + + self.assertEqual(out, cpu_out) + + helper((2, 8, 4, 5), 0.1) + helper((2, 8, 3, 5), 0.1) + helper((2, 8, 3, 5), 0.2) + + # Test add + def test_add_scalars(self): + def helper(alpha=1.0): + cpu_x = torch.tensor(2.3, device='cpu', dtype=torch.float, requires_grad=False) + x = cpu_x.detach().clone().to('mps') + + cpu_y = torch.tensor(3.4, device='cpu', dtype=torch.float, requires_grad=False) + y = cpu_y.detach().clone().to('mps') + + cpu_out = torch.add(cpu_x, cpu_y, alpha=alpha) + out = torch.add(x, y, alpha=alpha) + + print(out.to('cpu')) + + self.assertEqual(out, cpu_out) + + helper() + helper(0.1) + helper(0.2) + + def test_atan2(self): + def helper(shape): + input_cpu = torch.randn(shape) + input_mps = input_cpu.detach().clone().to("mps") + + other_cpu = torch.randn(shape) + other_mps = other_cpu.detach().clone().to("mps") + + atan2_cpu = torch.atan2(input_cpu, other_cpu) + atan2_mps = torch.atan2(input_mps, other_mps) + + self.assertEqual(atan2_cpu, atan2_mps.to("cpu")) + + helper(4) + helper(10000) + helper((10000, 40)) + + +class TestNNMPS(NNTestCase): + + def _create_basic_net(self): + class Layer(nn.Module): + def __init__(self): + super(Layer, self).__init__() + self.layer_dummy_param = Parameter(torch.empty(3, 5)) + self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7)) + + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.l1 = Layer() + self.dummy_param = Parameter(torch.empty(3, 5)) + self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1)) + + l = Layer() + n = Net() + s = nn.Sequential(n, n) + + return l, n, s + + def test_requires_grad_(self): + m = self._create_basic_net()[-1] + assert len(list(m.buffers())) > 0, 'invalid test' + assert all(not b.requires_grad for b in m.buffers()) > 0, 'invalid test' + assert len(list(m.parameters())) > 0, 'invalid test' + assert all(p.requires_grad for p in m.parameters()) > 0, 'invalid test' + for requires_grad in (False, True): + self.assertIs(m.requires_grad_(requires_grad), m) + for p in m.parameters(): + self.assertEqual(p.requires_grad, requires_grad) + for b in m.buffers(): + self.assertFalse(b.requires_grad) + + def test_module_backcompat(self): + from torch.serialization import SourceChangeWarning + path = download_file('https://download.pytorch.org/test_data/linear.pt') + with warnings.catch_warnings(): + warnings.simplefilter('ignore', SourceChangeWarning) + m = torch.load(path) + input = torch.randn(2, 3, dtype=torch.float) + self.assertEqual(m(input).size(), (2, 5)) + + def test_conv_backcompat(self): + from torch.serialization import SourceChangeWarning + # This file was generated by running on PyTorch 1.0.1 on Python 2: + # + # import torch + # from torch import nn + # m = nn.Conv2d(1, 1, 1) + # torch.save(m, 'legacy_conv2d.pt') + # + # NB: This Pickle also contains some Unicode data! 
+ path = download_file('https://download.pytorch.org/test_data/legacy_conv2d.pt') + with warnings.catch_warnings(): + warnings.simplefilter('ignore', SourceChangeWarning) + m = torch.load(path, encoding='utf-8') + input = torch.randn((1, 1, 1, 1), dtype=torch.float) + self.assertEqual(m(input).size(), (1, 1, 1, 1)) + + def test_zero_grad(self): + i = torch.randn(2, 5, requires_grad=True) + module = nn.Linear(5, 5) + for p in module.parameters(): + p.requires_grad = False + module.zero_grad() + + module.weight.requires_grad = True + module.zero_grad() + self.assertIsNone(module.weight.grad) # uninitialized grad + + module(i).sum().backward() + self.assertIsNotNone(module.weight.grad) + self.assertGreater(module.weight.grad.data.abs().sum(), 0) + module.zero_grad() + self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) + + module.bias.requires_grad = True + module.zero_grad() + self.assertIsNotNone(module.weight.grad) + self.assertIsNone(module.bias.grad) + module(i).sum().backward() + self.assertIsNotNone(module.weight.grad) + self.assertIsNotNone(module.bias.grad) + self.assertGreater(module.weight.grad.data.abs().sum(), 0) + self.assertGreater(module.bias.grad.data.abs().sum(), 0) + module.zero_grad() + self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_()) + self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_()) + + # Force set to None. + module.zero_grad(set_to_none=True) + self.assertIsNone(module.weight.grad) + + def test_no_grad(self): + for dtype in [torch.bfloat16, torch.float, torch.double]: + module = nn.Conv2d(2, 5, kernel_size=3, padding=1).to(dtype) + input = torch.randn(1, 2, 10, 10).to(dtype) + x = input + y = input.clone() + + output = module(x) + self.assertTrue(output.requires_grad) + output.backward(torch.ones(1, 5, 10, 10)) + + with torch.no_grad(): + output2 = module(y) + self.assertFalse(output2.requires_grad) + self.assertRaises(RuntimeError, lambda: output2.backward(torch.ones(1, 5, 10, 10))) + + def test_invalid_conv1d(self): + for dtype in [torch.bfloat16, torch.float, torch.double]: + module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True).to(dtype) + input = torch.randn(1, 3, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, + r'Calculated padded input size per channel: \(4\). ' + + r'Kernel size: \(10\). Kernel size can\'t be greater than actual input size'): + module(input) + + # Negative stride check + module = nn.Conv1d(in_channels=3, out_channels=6, kernel_size=3, stride=-1, bias=True).to(dtype) + input = torch.randn(1, 3, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'): + module(input) + + def test_conv2d_discontiguous_weight(self): + # Test for https://github.com/pytorch/pytorch/issues/55781 + x = torch.ones(64, 16, 16, 16) + weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2)[:, :, :, ::2] + self.assertFalse(weight.is_contiguous()) + y = torch.nn.functional.conv2d(x, weight, None) + if torch.backends.mkldnn.is_available(): + # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used + with torch.backends.mkldnn.flags(enabled=False): + y_ = torch.nn.functional.conv2d(x, weight, None) + self.assertEqual(y, y_) + self.assertEqual(y.sum(), 4186112.) 
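For reference, the contiguity behaviour exercised by test_conv2d_discontiguous_weight above can be reproduced with a few lines of plain PyTorch. The sketch below is illustrative only (it is not part of the diff and needs no MPS device): step slicing yields a non-contiguous view, and .contiguous() materializes a dense copy.

import torch

w = torch.arange(16.0).reshape(2, 2, 1, 4)
view = w[:, :, :, ::2]           # step slicing reuses the original storage, so the view is not contiguous
print(view.is_contiguous())      # False
dense = view.contiguous()        # copies the data into a fresh, densely laid out buffer
print(dense.is_contiguous())     # True
print(torch.equal(view, dense))  # True: same values, different memory layout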
+ + def test_invalid_conv2d(self): + for dtype in [torch.bfloat16, torch.float, torch.double]: + module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype) + input = torch.empty(1, 1, 4, 4).to(dtype) + self.assertRaises(RuntimeError, lambda: module(input)) + + module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True) + input = torch.randn(1, 3, 1, 1) + with self.assertRaisesRegex(RuntimeError, + r'Calculated padded input size per channel: \(1 x 1\). ' + + r'Kernel size: \(10 x 10\). Kernel size can\'t be greater than actual input size'): + module(input) + + # Negative stride check + module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=-1, bias=True).to(dtype) + input = torch.randn(1, 3, 4, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'): + module(input) + + # Zero stride check + module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=0, bias=True).to(dtype) + input = torch.randn(1, 3, 4, 4).to(dtype) + with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'): + module(input) + + def test_conv2d_valid_padding(self, device='mps'): + # Test F.conv2d padding='valid' is the same as no padding + x = torch.rand(1, 1, 1, 10, device=device).to(torch.float) + y = torch.rand(1, 1, 1, 4, device=device).to(torch.float) + + expect = F.conv2d(x, y) + actual = F.conv2d(x, y, padding='valid') + self.assertEqual(expect.to('cpu'), actual.to('cpu')) + + # def test_conv2d_same_padding(self, device='mps'): + # x = torch.rand(1, 1, 10, 11, device=device) + # y = torch.rand(1, 1, 4, 5, device=device) + # expect = F.conv2d(x, y, padding=(2, 2))[..., 1:, :] + # actual = F.conv2d(x, y, padding='same') + # self.assertEqual(expect.to('cpu'), actual.to('cpu')) + + # # With dilation + # y = torch.rand(1, 1, 3, 4, device=device) + # expect = F.conv2d(x, y, padding=(2, 3), dilation=2) + # actual = F.conv2d(x, y, padding='same', dilation=2) + # self.assertEqual(expect, actual) + + # # Dilation with asymmetric padding + # y = torch.rand(1, 1, 4, 4, device=device) + # expect = F.conv2d(x, y, padding=5, dilation=3)[..., 1:, 1:] + # actual = F.conv2d(x, y, padding='same', dilation=3) + # self.assertEqual(expect, actual) + + +class TestConstantPadNd(TestCase): + def test_preserves_memory_format(self): + nchw_tensor = torch.rand((1, 2, 5, 3)) + nchw_padded = torch.constant_pad_nd(nchw_tensor, [1, 2], 0.5) + self.assertTrue(nchw_padded.is_contiguous(memory_format=torch.contiguous_format)) + + nhwc_tensor = nchw_tensor.contiguous(memory_format=torch.channels_last) + nhwc_padded = torch.constant_pad_nd(nhwc_tensor, [1, 2], 0.5) + self.assertTrue(nhwc_padded.is_contiguous(memory_format=torch.channels_last)) + + +class TestLinalgMPS(TestCase): + def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False): + dtype = t.dtype + numpy_dtype = dtype + alpha = 1.2 if alpha is None else alpha + beta = 0.8 if beta is None else beta + res1 = f(t, m, v, alpha=alpha, beta=beta) + res2 = torch.full_like(res1, math.nan) + if transpose_out: + res2 = res2.t().clone(memory_format=torch.contiguous_format).t() + f(t, m, v, alpha=alpha, beta=beta, out=res2) + res3 = alpha * (m.to(numpy_dtype).cpu().numpy() @ v.to(numpy_dtype).cpu().numpy()) + if beta != 0: + res3 += (torch.mul(t, beta)).to(numpy_dtype).cpu().numpy() + res3 = torch.from_numpy(res3).to(dtype) + self.assertEqual(res1, res2) + self.assertEqual(res1, res3) + + def test_addmm(self, device="mps", 
dtype=torch.float32): + M = torch.randn(10, 25, device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + self._test_addmm_addmv(torch.addmm, M, m1, m2) + + # Test beta=0, M=nan + M = torch.full((10, 25), math.nan, device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + self._test_addmm_addmv(torch.addmm, M, m1, m2, beta=0) + + # Test transpose + for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): + def maybe_transpose(cond, m): + if not cond: + return m + return m.t().clone(memory_format=torch.contiguous_format).t() + + M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) + m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) + m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) + self._test_addmm_addmv(torch.addmm, M, m1, m2, transpose_out=t4) + + +class TestRNNMPS(TestCase): + def test_lstm_1(self, device="mps", dtype=torch.float32): + + rnn = nn.LSTM(1, 4, 2, device="cpu") + input = torch.randn(2, 3, 1, device="cpu") + hx = torch.zeros(2, 3, 4, device="cpu") + cx = torch.zeros(2, 3, 4, device="cpu") + outputs = [] + for device in [torch.device("cpu"), torch.device("mps")]: + rnn = rnn.to(device) + input = input.to(device) + hx = hx.to(device) + cx = cx.to(device) + weight_list = [] + output, _ = rnn(input, (hx, cx)) + print(output.to('cpu')) + + def test_lstm_2(self, device="mps", dtype=torch.float32): + rnn = nn.LSTM(1, 4, 1, device="cpu") + input = torch.randn(2, 3, 1, device="cpu", requires_grad=True) + hx = torch.zeros(1, 3, 4, device="cpu") + cx = torch.zeros(1, 3, 4, device="cpu") + outputs = [] + for device in [torch.device("cpu"), torch.device("mps")]: + rnn = rnn.to(device) + input = input.to(device) + input.retain_grad() + hx = hx.to(device) + cx = cx.to(device) + + output, _ = rnn(input, (hx, cx)) + # Test by passing ones as the gradient from the loss. + output.backward(torch.ones_like(output)) + + print(rnn.weight_ih_l0.grad) + # Gradient on GPU is 2x the CPU gradient??? + + +if __name__ == "__main__": + run_tests() diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index fc5fadcc20b8..5b939afd998f 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -15,7 +15,7 @@ import torch.utils.hooks from torch.nn import Parameter from torch.testing._internal.common_utils import (TestCase, run_tests, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN, - load_tests, slowTest, TEST_WITH_TSAN) + load_tests, slowTest, TEST_WITH_TSAN, TEST_WITH_ROCM) # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings @@ -258,7 +258,7 @@ def test_fill(): self.assertTrue(e.is_set()) self.assertTrue(data[0].eq(4).all()) self.assertTrue(data[1].eq(4).all()) - p.join(1) + p.join(100) self.assertFalse(p.is_alive()) def test_receive(): @@ -280,7 +280,7 @@ def test_receive(): # collect them properly del t1, t2 e.set() - p.join(1) + p.join(100) self.assertFalse(p.is_alive()) with leak_checker(self) as lc: @@ -383,7 +383,12 @@ def test_inherit_tensor(self): def test_autograd_errors(self): ctx = mp.get_context('fork') simple_autograd_function() - with self.assertRaisesRegex(RuntimeError, r'Unable to handle autograd'): + # Autograd only uses thread when GPUs are involved + if torch.cuda.is_available() or torch.backends.mps.is_available(): + with self.assertRaisesRegex(RuntimeError, r'Unable to handle autograd'): + with ctx.Pool(3) as pool: + pool.map(simple_autograd_function, [1, 2, 3]) + else: with ctx.Pool(3) as pool: pool.map(simple_autograd_function, [1, 2, 3]) @@ -585,6 +590,7 @@ def _test_event_multiprocess_child(event, p2c, c2p): @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') + @unittest.skipIf(TEST_WITH_ROCM, 'Skip the test for ROCm') def test_event_multiprocess(self): event = torch.cuda.Event(enable_timing=False, interprocess=True) self.assertTrue(event.query()) @@ -643,6 +649,7 @@ def _test_event_handle_importer_consumer(handle, p2c, c2p): @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') + @unittest.skipIf(TEST_WITH_ROCM, 'Skip the test for ROCm') def test_event_handle_importer(self): e0 = torch.cuda.Event(enable_timing=False, interprocess=True) self.assertTrue(e0.query()) @@ -682,6 +689,7 @@ def _test_event_handle_exporter_consumer(handle, p2c, c2p): @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') + @unittest.skipIf(TEST_WITH_ROCM, 'Skip the test for ROCm') def test_event_handle_exporter(self): e0 = torch.cuda.Event(enable_timing=False, interprocess=True) @@ -748,7 +756,7 @@ def hook(*unused): self.assertEqual(var.data, torch.ones(5, 5, device=device)) self.assertEqual(var.grad.data, torch.ones(5, 5, device=device) * 4) - p.join(1) + p.join(100) self.assertFalse(p.is_alive()) # Check sharing a cudaMalloc allocation with different types of storage. 
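The conditional added to test_autograd_errors above hinges on whether an accelerator backend is present: per the comment in the hunk, autograd only uses worker threads when a GPU (CUDA or MPS) is involved, so the expected error is raised only in that case. As a small illustrative sketch (not part of the diff), the availability check it relies on can be written as:

import torch

# Hedged sketch: mirrors the availability condition used in the modified
# test_autograd_errors. The hasattr guard keeps it working on builds that
# predate the MPS backend.
has_accelerator = torch.cuda.is_available() or (
    hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
)
print("accelerator backend present:", has_accelerator)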
diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py index ddc23e45f276..7bb529e8bbcc 100644 --- a/test/test_namedtuple_return_api.py +++ b/test/test_namedtuple_return_api.py @@ -14,14 +14,14 @@ aten_native_yaml = os.path.join(path, '../aten/src/ATen/native/native_functions.yaml') all_operators_with_namedtuple_return = { 'max', 'min', 'aminmax', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd', 'symeig', 'eig', - 'qr', 'geqrf', 'solve', 'slogdet', 'sort', 'topk', 'lstsq', 'linalg_inv_ex', + 'qr', 'geqrf', 'slogdet', 'sort', 'topk', 'lstsq', 'linalg_inv_ex', 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "_unpack_dual", 'linalg_qr', 'linalg_svd', '_linalg_svd', 'linalg_slogdet', 'fake_quantize_per_tensor_affine_cachemask', 'fake_quantize_per_channel_affine_cachemask', 'linalg_lstsq', 'linalg_eig', 'linalg_cholesky_ex', - 'frexp', 'lu_unpack', 'histogram', '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', - '_fused_moving_avg_obs_fq_helper', 'linalg_lu_factor', 'linalg_lu_factor_ex', - '_det_lu_based_helper', - '_lu_with_info', + 'frexp', 'lu_unpack', 'histogram', 'histogramdd', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', + '_fused_moving_avg_obs_fq_helper', 'linalg_lu_factor', 'linalg_lu_factor_ex', 'linalg_lu', + '_det_lu_based_helper', '_lu_with_info', 'linalg_ldl_factor_ex', 'linalg_ldl_factor', } @@ -76,7 +76,6 @@ def test_namedtuple_return(self): op(operators=['_linalg_svd'], input=(), names=('U', 'S', 'Vh'), hasout=True), op(operators=['slogdet'], input=(), names=('sign', 'logabsdet'), hasout=False), op(operators=['qr', 'linalg_qr'], input=(), names=('Q', 'R'), hasout=True), - op(operators=['solve'], input=(a,), names=('solution', 'LU'), hasout=True), op(operators=['geqrf'], input=(), names=('a', 'tau'), hasout=True), op(operators=['symeig', 'eig'], input=(True,), names=('eigenvalues', 'eigenvectors'), hasout=True), op(operators=['triangular_solve'], input=(a,), names=('solution', 'cloned_coefficient'), hasout=True), @@ -88,6 +87,9 @@ def test_namedtuple_return(self): op(operators=['linalg_inv_ex'], input=(), names=('inverse', 'info'), hasout=True), op(operators=['linalg_lu_factor'], input=(), names=('LU', 'pivots'), hasout=True), op(operators=['linalg_lu_factor_ex'], input=(), names=('LU', 'pivots', 'info'), hasout=True), + op(operators=['linalg_ldl_factor'], input=(), names=('LD', 'pivots'), hasout=True), + op(operators=['linalg_ldl_factor_ex'], input=(), names=('LD', 'pivots', 'info'), hasout=True), + op(operators=['linalg_lu'], input=(), names=('P', 'L', 'U'), hasout=True), op(operators=['fake_quantize_per_tensor_affine_cachemask'], input=(0.1, 0, 0, 255), names=('output', 'mask',), hasout=False), op(operators=['fake_quantize_per_channel_affine_cachemask'], @@ -100,6 +102,7 @@ def test_namedtuple_return(self): input=(torch.tensor([3, 2, 1, 4, 5], dtype=torch.int32), True, True), names=('P', 'L', 'U'), hasout=True), op(operators=['histogram'], input=(1,), names=('hist', 'bin_edges'), hasout=True), + op(operators=['histogramdd'], input=(1,), names=('hist', 'bin_edges'), hasout=False), op(operators=['_fake_quantize_per_tensor_affine_cachemask_tensor_qparams'], input=(torch.tensor([1.0]), torch.tensor([0], dtype=torch.int), torch.tensor([1]), 0, 255), names=('output', 'mask',), hasout=False), diff --git a/test/test_native_mha.py b/test/test_native_mha.py new file mode 100644 index 000000000000..1689789f9cea --- /dev/null +++ b/test/test_native_mha.py @@ -0,0 +1,306 @@ +# Owner(s): ["module: nn"] +import math + 
+import torch +from torch.testing._internal.common_device_type import ( + dtypes, + dtypesIfCUDA, + instantiate_device_type_tests, + onlyCUDA, + skipMeta, +) +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestMHADeviceType(TestCase): + @torch.no_grad() + def _test_transform_bias_rescale_qkv_impl( + self, device, dtype, use_nt, use_padding=False + ): + tests = [ + (64, 4, 16, 8), + # dim_per_head = 12 does not divide evenly by CPU vectorization length of 8 + (24, 2, 4, 2), + # Make sure CUDA can handle small input sizes + (2, 2, 2, 2), + # dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, + # causes alignment issues + (24, 4, 4, 2), + (48, 4, 16, 8), + ] + for (embed_dim, num_heads, bs, sl) in tests: + with self.subTest(embed_dim=embed_dim, num_heads=num_heads, bs=bs, sl=sl): + torch.manual_seed(9343) + dense_x = x = ( + torch.randn(bs, sl, 3 * embed_dim, device=device, dtype=dtype) * 10 + ) + if use_padding: + x[0][-1] = torch.full(x[0][-1].shape, float("-Inf")) + if use_nt: + xs = list(torch.unbind(x)) + if use_padding: + xs[0] = xs[0][:-1] + x = torch.nested_tensor(xs, device=device, dtype=dtype) + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) + + # We have to use inference_mode here because q/k/v are + # all views of the same Tensor, which autograd doesn't + # like. This is fine because this function is only + # exposed to Python for purposes of writing this test. + with torch.inference_mode(): + (q, k, v) = torch._transform_bias_rescale_qkv( + x, qkv.bias, num_heads=num_heads + ) + + def simple_transform_bias_rescale_qkv(qkv, bias): + (q, k, v) = torch.split(qkv, embed_dim, dim=-1) + (q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1) + return tuple( + x.reshape( + (bs, sl, num_heads, embed_dim // num_heads) + ).transpose(2, 1) + for x in ( + (q + q_bias) / math.sqrt(embed_dim // num_heads), + (k + k_bias), + (v + v_bias), + ) + ) + + correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv( + dense_x, qkv.bias + ) + if use_nt and use_padding: + for t in (correct_q, correct_k, correct_v): + t[t == float("-Inf")] = 0 + + self.assertEqual(q.size(), correct_q.size()) + torch.testing.assert_close(q, correct_q) + torch.testing.assert_close(k, correct_k) + torch.testing.assert_close(v, correct_v) + + @dtypesIfCUDA(torch.float) + @dtypes(torch.float) + @skipMeta + def test_transform_bias_rescale_qkv(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=False, use_padding=use_padding + ) + + @dtypesIfCUDA(torch.float) + @dtypes(torch.float) + @skipMeta + @onlyCUDA + def test_transform_bias_rescale_qkv_nested(self, device, dtype): + for use_padding in (False, True): + with self.subTest(use_padding=use_padding): + self._test_transform_bias_rescale_qkv_impl( + device, dtype, use_nt=True, use_padding=use_padding + ) + + def _test_multihead_attention_impl( + self, device, dtype, mode, use_nt, need_weights, average_attn_weights, use_padding=False, pad_all=False + ): + embed_dim = 64 + num_heads = 4 + bs = 16 + sl = 8 + + q = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + if use_padding: + if pad_all: + for q_i in q: + q_i[-1] = torch.zeros_like(q[0][-1], device=device, dtype=dtype) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + for mask_i in mask: + mask_i[-1] = True + else: + q[0][-1] = torch.zeros_like(q[0][-1], device=device, 
dtype=dtype) + mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool) + mask[0][-1] = True + if mode == "self": + k = q + v = q + elif mode == "encdec": + k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + v = k + elif mode == "generic": + k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + v = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10 + else: + self.fail(f"invalid mode `{mode}`!") + + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) + proj = torch.nn.Linear(embed_dim, embed_dim, device=device, dtype=dtype) + + pt = torch.nn.MultiheadAttention( + embed_dim, num_heads, batch_first=True, device=device, dtype=dtype + ) + pt.in_proj_weight = qkv.weight + pt.in_proj_bias = qkv.bias + pt.out_proj.weight = proj.weight + pt.out_proj.bias = proj.bias + + class NativeMHA(torch.nn.Module): + def __init__(self, embed_dim, num_heads, qkv, proj): + super().__init__() + self.qkv = qkv + self.proj = proj + self.embed_dim = embed_dim + self.num_heads = num_heads + + def forward(self, q, k, v, key_padding_mask): + return torch._native_multi_head_attention( + q, + k, + v, + self.embed_dim, + self.num_heads, + self.qkv.weight, + self.qkv.bias, + self.proj.weight, + self.proj.bias, + key_padding_mask, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + + npt = NativeMHA( + embed_dim=embed_dim, num_heads=num_heads, qkv=qkv, proj=proj + ).to(dtype) + + if device == "cuda": + pt = pt.cuda() + npt = npt.cuda() + + ypt, weight_pt = pt( + q, + k, + v, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + key_padding_mask=mask if use_padding else None, + ) + if use_nt: + qs = list(torch.unbind(q)) + if use_padding: + if pad_all: + qs = [x[:-1] for x in qs] + else: + qs[0] = qs[0][:-1] + q = torch.nested_tensor(qs, device=device, dtype=dtype) + if mode == "self": + k = v = q + elif mode == "encdec": + k = torch.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = k + else: + k = torch.nested_tensor(torch.unbind(k), device=device, dtype=dtype) + v = torch.nested_tensor(torch.unbind(v), device=device, dtype=dtype) + + ynpt, weight_npt = npt( + q, k, v, key_padding_mask=mask if use_padding and not use_nt else None + ) + if use_nt: + ynpt = ynpt.to_padded_tensor(0) + if pad_all: + ynpt_final = torch.zeros_like(ypt) + ynpt_final[:, :ynpt.shape[1], :] = ynpt + ynpt = ynpt_final + + def do_pad_all(tensors): + for t in tensors: + for t_i in t: + t_i[-1] = torch.zeros_like(t_i[-1], device=device, dtype=dtype) + + # PyTorch implementation returns non-zero junk in the padding + # locations; overwrite it so that the comparison works out. 
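[Editor's aside] On the conversion pattern used throughout this test: a padded dense batch is turned into a nested tensor by unbinding it, trimming the padded rows, and re-wrapping the pieces, and to_padded_tensor goes back the other way. The sketch below is a minimal, hedged illustration of that round trip using the same torch.nested_tensor constructor the test relies on (this API was still prototype-level at the time of this diff).

import torch

bs, sl, d = 3, 5, 4
dense = torch.randn(bs, sl, d)

# Pretend the last row of sequence 0 is padding: drop it before nesting.
pieces = list(torch.unbind(dense))
pieces[0] = pieces[0][:-1]
nt = torch.nested_tensor(pieces)

# Padding back out with 0 restores the dense layout, with zeros where data was dropped.
repadded = nt.to_padded_tensor(0.0)
print(repadded.shape)   # (3, 5, 4)
print(repadded[0, -1])  # all zeros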
+ if use_padding: + ypt[0][-1] = torch.zeros_like(ypt[0][-1], device=device, dtype=dtype) + ynpt[0][-1] = torch.zeros_like(ynpt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((ypt, ynpt)) + # Zero the last row of each TxT weight matrix + if need_weights: + if average_attn_weights: + weight_pt[0][-1] = torch.zeros_like(weight_pt[0][-1], device=device, dtype=dtype) + weight_npt[0][-1] = torch.zeros_like(weight_npt[0][-1], device=device, dtype=dtype) + if pad_all: + do_pad_all((weight_pt, weight_npt)) + else: + for nh in range(num_heads): + weight_pt[0][nh][-1] = torch.zeros_like(weight_pt[0][nh][-1], device=device, dtype=dtype) + weight_npt[0][nh][-1] = torch.zeros_like(weight_npt[0][nh][-1], device=device, dtype=dtype) + + if dtype == torch.half: + torch.testing.assert_close(ypt, ynpt, atol=1e-3, rtol=1e-3) + else: + # High rtol seems necessary for + # test_native_multihead_attention_cpu_float32 on Windows, + # otherwise 2e-4 would likely be fine. + torch.testing.assert_close(ypt, ynpt, atol=2e-5, rtol=2e-3) + + if need_weights: + torch.testing.assert_close(weight_pt, weight_npt) + else: + self.assertEqual(weight_pt, weight_npt) + + @dtypesIfCUDA(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_self_attention(self, device, dtype): + for (use_padding, pad_all) in ((False, False), (True, False), (True, True)): + for use_nt in (False, True): + # Figuring out exactly which elements of the weights are garbage in this + # case eludes me, and it's not particularly enlightening to test anyway + # because padding doesn't especially affect the intermediate weights. + for need_weights in (False, not pad_all): + for average_attn_weights in (False, True): + with self.subTest(use_padding=use_padding, pad_all=pad_all, + use_nt=use_nt, need_weights=need_weights, + average_attn_weights=average_attn_weights): + self._test_multihead_attention_impl( + device, + dtype, + "self", + use_nt=use_nt, + use_padding=use_padding, + pad_all=pad_all, + need_weights=need_weights, + average_attn_weights=average_attn_weights, + ) + + @dtypesIfCUDA(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_encoder_decoder_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "encdec", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + @dtypesIfCUDA(torch.float, torch.half) + @dtypes(torch.float) + @skipMeta + @torch.no_grad() + def test_native_multihead_attention(self, device, dtype): + self._test_multihead_attention_impl( + device, + dtype, + "generic", + use_nt=False, + need_weights=False, + average_attn_weights=False, + ) + + +instantiate_device_type_tests(TestMHADeviceType, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py new file mode 100644 index 000000000000..b01ecbc4675a --- /dev/null +++ b/test/test_nestedtensor.py @@ -0,0 +1,420 @@ +# Owner(s): ["module: nestedtensor"] + +import torch +import torch.nn +import unittest +from torch.testing._internal.common_device_type import ( + dtypes, + dtypesIfCUDA, + instantiate_device_type_tests, + skipMeta, +) +from torch.testing._internal.common_utils import TestCase, IS_FBCODE, run_tests +from torch import nested_tensor + +# Tests are ported from pytorch/nestedtensor. +# This makes porting as_nested_tensor easier in the future. 
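[Editor's aside] The self-attention sweep above iterates over need_weights and average_attn_weights; on the public nn.MultiheadAttention side the observable difference is the shape (or absence) of the returned attention weights. A small, hedged sketch of that behaviour, per the documented API of the version this diff targets:

import torch

mha = torch.nn.MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True).eval()
x = torch.randn(4, 5, 8)  # (batch, seq, embed)

with torch.no_grad():
    _, w_avg = mha(x, x, x, need_weights=True, average_attn_weights=True)
    _, w_per_head = mha(x, x, x, need_weights=True, average_attn_weights=False)
    out, w_none = mha(x, x, x, need_weights=False)

print(w_avg.shape)       # (4, 5, 5): averaged over heads
print(w_per_head.shape)  # (4, 2, 5, 5): one map per head
print(w_none)            # None when weights are not requested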
+def _iter_constructors(): + # yield as_nested_tensor + yield nested_tensor + + +class TestNestedTensor(TestCase): + @torch.inference_mode() + def _test_unbind_case(self, a, b): + nt = nested_tensor([a, b]) + a1, b1 = nt.unbind() + self.assertTrue(a is not a1) + self.assertTrue(b is not b1) + + nt = nested_tensor([a, b], dtype=a.dtype) + a1, b1 = nt.unbind(0) + self.assertEqual(a, a1) + self.assertEqual(b, b1) + + a = torch.randn((2, 3)).add_(1) + nt = nested_tensor([a]) + self.assertEqual(a, nt.unbind(0)[0]) + + @torch.inference_mode() + def test_unbind_0(self): + self._test_unbind_case( + torch.tensor([1, 2]), torch.tensor([7, 8]), + ) + + @torch.inference_mode() + def test_unbind_1(self): + self._test_unbind_case( + torch.tensor([1]), torch.tensor([7]), + ) + + # @torch.inference_mode() + # def test_unbind_2(self): + # self._test_unbind_case( + # torch.tensor(1), torch.tensor(7), + # ) + + @torch.inference_mode() + def test_unbind_3(self): + self._test_unbind_case( + torch.tensor([1.0]), torch.tensor([]), + ) + + @torch.inference_mode() + def test_unbind_4(self): + self._test_unbind_case( + torch.tensor([]), torch.tensor([]), + ) + + @torch.inference_mode() + def test_unbind_dim(self): + def _test_fn(unbind_fn): + a = torch.rand(3, 2) + b = torch.rand(2, 3) + nt = nested_tensor([a, b]) + self.assertRaises(RuntimeError, lambda: unbind_fn(nt, 1)) + + # Both of these tests are necessary, because we're using + # torch_function. + _test_fn(lambda x, dim: x.unbind(dim)) + # TODO: Re-enable this once using torch_dispatch + # _test_fn(lambda x, dim: torch.unbind(x, dim)) + + @torch.inference_mode() + def test_nested_tensor(self): + self.assertRaises(TypeError, lambda: nested_tensor([3.0])) + self.assertRaises(TypeError, lambda: nested_tensor(torch.tensor([3.0]))) + self.assertRaises(TypeError, lambda: nested_tensor(4.0)) + + @torch.inference_mode() + def test_nested_tensor_matching_dim(self): + self.assertRaisesRegex( + RuntimeError, + "Found dimension 1 for Tensor at index 1 and dimension 0 for Tensor at index 0.", + lambda: nested_tensor([torch.tensor(1.0), torch.tensor([])]), + ) + self.assertRaisesRegex( + RuntimeError, + "Found dimension 1 for Tensor at index 2 and dimension 0 for Tensor at index 1.", + lambda: nested_tensor( + [torch.tensor(1.0), torch.tensor(2.0), torch.tensor([])] + ), + ) + + @torch.inference_mode() + def test_default_nested_tensor(self): + self.assertRaises(TypeError, lambda: nested_tensor()) + default_nested_tensor = nested_tensor([]) + default_tensor = torch.tensor([]) + # self.assertEqual(default_nested_tensor.nested_dim(), 1) + # self.assertEqual(default_nested_tensor.nested_size(), ()) + self.assertEqual(default_nested_tensor.dim(), default_tensor.dim()) + self.assertEqual(default_nested_tensor.layout, default_tensor.layout) + self.assertEqual(default_nested_tensor.device, default_tensor.device) + self.assertEqual(default_nested_tensor.dtype, default_tensor.dtype) + self.assertEqual( + default_nested_tensor.requires_grad, default_tensor.requires_grad + ) + self.assertIsNone(default_tensor.grad) + # TODO: Re-enable once we have a performance driven + # use case and implementation. 
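[Editor's aside] To make the constructor contract above concrete, here is a hedged sketch of the behaviours these cases pin down: the constructor takes a list of tensors whose dimensionalities must match, rejects bare scalars and plain tensors, and unbind hands back equal-valued but distinct constituents. This mirrors the assertions above rather than adding new guarantees.

import torch

a = torch.randn(2, 3)
b = torch.randn(4, 3)
nt = torch.nested_tensor([a, b])

a1, b1 = nt.unbind()
assert torch.equal(a, a1) and a is not a1   # equal values, different objects

# Dimensionality of every constituent must match.
try:
    torch.nested_tensor([torch.tensor(1.0), torch.tensor([2.0])])
except RuntimeError as e:
    print(e)

# Plain Python numbers and bare tensors are rejected outright.
try:
    torch.nested_tensor(4.0)
except TypeError as e:
    print(e)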
+ # self.assertEqual(default_nested_tensor.is_pinned(), + # default_tensor.is_pinned()) + + @torch.inference_mode() + def test_dim(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertEqual(a1.dim(), 1) + a1 = constructor([torch.tensor(3.0)]) + self.assertEqual(a1.dim(), 1) + a1 = constructor([torch.tensor([1, 2, 3, 4])]) + self.assertEqual(a1.dim(), 2) + + @unittest.skipIf(IS_FBCODE, "numel is not virtual in fbcode.") + @torch.inference_mode() + def test_numel(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, "numel is disabled", lambda: a1.numel(), + ) + + @torch.inference_mode() + def test_size(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, + "Tensors of type NestedTensorImpl do not have sizes" + if IS_FBCODE + else "NestedTensorImpl doesn't support sizes", + lambda: a1.size(), + ) + + @unittest.skipIf(IS_FBCODE, "stride is not virtual in fbcode.") + @torch.inference_mode() + def test_stride(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, + "NestedTensorImpl doesn't support strides", + lambda: a1.stride(), + ) + + @unittest.skipIf(IS_FBCODE, "is_contiguous is not virtual in fbcode.") + @torch.inference_mode() + def test_is_contiguous(self): + for constructor in _iter_constructors(): + a1 = constructor([]) + self.assertRaisesRegex( + RuntimeError, "is_contiguous is disabled", lambda: a1.is_contiguous() + ) + + @torch.inference_mode() + def test_repr_string(self): + a = nested_tensor([]) + expected = "nested_tensor([" "\n\n])" + self.assertEqual(str(a), expected) + self.assertEqual(repr(a), expected) + + a = nested_tensor([torch.tensor(1.0)]) + expected = "nested_tensor([" "\n tensor(1.)" "\n])" + self.assertEqual(str(a), expected) + self.assertEqual(repr(a), expected) + + a = nested_tensor([torch.tensor([[1, 2]]), torch.tensor([[4, 5]])]) + expected = ( + "nested_tensor([" "\n tensor([[1, 2]])" "," "\n tensor([[4, 5]])" "\n])" + ) + self.assertEqual(str(a), expected) + self.assertEqual(repr(a), expected) + + @torch.inference_mode() + def test_activations(self): + for func in (torch.nn.functional.relu, torch.nn.functional.relu_, torch.nn.functional.gelu, torch._C._nn.gelu_): + t = torch.tensor([-1, 0, 1], dtype=torch.float) + nt = nested_tensor([t]) + nested_result = func(nt) + self.assertTrue(nested_result.is_nested) + self.assertEqual(func(t), nested_result.unbind()[0]) + + def test_to_padded_tensor_on_empty_tensor(self): + nt = torch.nested_tensor([]) + empty = nt.to_padded_tensor(4) + self.assertEqual(empty, torch.tensor([])) + +class TestNestedTensorDeviceType(TestCase): + @dtypes(torch.float) + @skipMeta + def test_to_then_from_padded_tensor_no_transform0213(self, device, dtype): + t = torch.randn(4, 4, 4, device=device, dtype=dtype) + ts = list(torch.unbind(t)) + ts[0] = ts[0][:-1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + padded = nt.to_padded_tensor(0) + + nt_to = torch._nested_from_padded_and_nested_example(padded, nt) + + for (t1, t2) in zip(nt.unbind(), nt_to.unbind()): + self.assertEqual(t1, t2) + self.assertEqual(nt.device, nt_to.device) + + @dtypes(torch.float) + @dtypesIfCUDA(torch.float, torch.half) + @skipMeta + @torch.inference_mode() + def test_layer_norm(self, device, dtype): + def _test(size): + t0 = torch.randn(2, size, device=device, dtype=dtype, requires_grad=False) + t1 = torch.randn(2, size, device=device, 
dtype=dtype, requires_grad=False) + ts = [t0, t1, t0, t1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + layer_norm = torch.nn.LayerNorm(size, device=device, dtype=dtype) + nt_result = nt._nested_tensor_layer_norm( + layer_norm.weight, layer_norm.bias, 1e-5 + ) + for (nt_subresult, t) in zip(nt_result.unbind(), ts): + t_result = layer_norm(t.reshape(1, -1, size).squeeze(0)) + self.assertEqual(nt_subresult, t_result) + + for size in (1024, 1023, 513, 512, 256, 128, 2, 4, 32): + _test(size) + + @skipMeta + @torch.inference_mode() + def test_embedding(self, device): + inputs = [ + torch.randint(100, (L,), device=device, dtype=torch.int64) + for L in torch.randint(5, 50, (8,)) + ] + x = torch.nested_tensor(inputs, device=device, dtype=torch.int64) + emb = torch.nn.Embedding(100, 8, device=device) + y = emb(x) + ys = y.unbind() + for i, inp in enumerate(inputs): + self.assertEqual(emb(inp), ys[i]) + + @dtypes(torch.float, torch.float16) + def test_to_padded_tensor_simple(self, device, dtype): + t = torch.randn(4, 4, 4, device=device, dtype=dtype) + ts = list(torch.unbind(t)) + ts[0] = ts[0][:-1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + for padding_value in (0, 1): + padded = nt.to_padded_tensor(padding_value) + + correct_output = t.clone() + if padding_value == 0: + correct_output[0][-1] = torch.zeros_like(correct_output[0][-1]) + else: + correct_output[0][-1] = torch.ones_like(correct_output[0][-1]) + + self.assertEqual(padded, correct_output) + self.assertEqual(padded.device, torch.device(device)) + self.assertEqual(padded.dtype, dtype) + + @dtypes(torch.float, torch.float16) + def test_to_padded_tensor_output_size(self, device, dtype): + t = torch.randn(4, 4, 4, device=device, dtype=dtype) + output_size = (4, 6, 5) + ts = list(torch.unbind(t)) + ts[0] = ts[0][:-1] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + for padding_value in (0, 1): + padded = nt.to_padded_tensor(padding_value, output_size=output_size) + correct_output = torch.ones(output_size, device=device, dtype=dtype) * padding_value + correct_output[:4:, :4, :4] = t.clone() + if padding_value == 0: + correct_output[0][3] = torch.zeros_like(correct_output[0][3]) + else: + correct_output[0][3] = torch.ones_like(correct_output[0][3]) + + self.assertEqual(padded, correct_output) + self.assertEqual(padded.device, torch.device(device)) + self.assertEqual(padded.dtype, dtype) + + @dtypes(torch.float, torch.float16, torch.double) + def test_to_padded_tensor_dim2(self, device, dtype): + ts = [ + torch.randn(160, device=device, dtype=dtype), + torch.randn(1240, device=device, dtype=dtype), + torch.randn(2400, device=device, dtype=dtype), + ] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + pad = 42 + correct_output = [] + for t in ts: + next_output = torch.ones_like(ts[2]) * pad + correct_output.append(next_output) + next_output[:t.size(0)].copy_(t) + correct_output = torch.stack(correct_output) + padded = nt.to_padded_tensor(pad) + self.assertEqual(padded, correct_output) + + @dtypes(torch.float, torch.float16, torch.double) + def test_to_padded_tensor_dim3(self, device, dtype): + ts = [ + torch.randn(16, 21, device=device, dtype=dtype), + torch.randn(24, 32, device=device, dtype=dtype), + torch.randn(40, 53, device=device, dtype=dtype), + ] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + pad = 42 + correct_output = [] + for t in ts: + next_output = torch.ones_like(ts[2]) * pad + correct_output.append(next_output) + next_output[:t.size(0), :t.size(1)].copy_(t) + 
correct_output = torch.stack(correct_output) + padded = nt.to_padded_tensor(pad) + self.assertEqual(padded, correct_output) + + @dtypes(torch.float, torch.float16, torch.double) + def test_to_padded_tensor_dim4(self, device, dtype): + ts = [ + torch.randn(16, 21, 13, device=device, dtype=dtype), + torch.randn(24, 32, 14, device=device, dtype=dtype), + torch.randn(40, 53, 16, device=device, dtype=dtype), + ] + nt = torch.nested_tensor(ts, device=device, dtype=dtype) + pad = 42 + correct_output = [] + for t in ts: + next_output = torch.ones_like(ts[2]) * pad + correct_output.append(next_output) + next_output[:t.size(0), :t.size(1), :t.size(2)].copy_(t) + correct_output = torch.stack(correct_output) + padded = nt.to_padded_tensor(pad) + self.assertEqual(padded, correct_output) + + @skipMeta + def test_device_checks(self, device): + nt = torch.nested_tensor([], device=device) + is_cuda = 'cuda' in str(device) + self.assertEqual(nt.is_cuda, is_cuda) + + # Helper functions for testing elementwise ops + def random_nt_pair(self, device, dtype, num_tensors, max_dims): + ts1 = [] + ts2 = [] + for _ in range(num_tensors): + tensor_dims = tuple([torch.randint(low=0, high=max_dim, size=(1,)).item() for max_dim in max_dims]) + t1 = torch.randn(tensor_dims, device=device, dtype=dtype) + t2 = torch.randn(tensor_dims, device=device, dtype=dtype) + ts1.append(t1) + ts2.append(t2) + return (torch.nested_tensor(ts1, device=device, dtype=dtype), + torch.nested_tensor(ts2, device=device, dtype=dtype)) + + def nt_equal(self, nt1, nt2): + self.assertEqual(nt1.dtype, nt2.dtype) + self.assertEqual(nt1.device, nt2.device) + ub1 = nt1.unbind() + ub2 = nt2.unbind() + self.assertEqual(len(ub1), len(ub2)) + n = len(ub1) + for i in range(n): + self.assertEqual(ub1[i], ub2[i]) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_add(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 + t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + out = nt1 + nt2 + self.nt_equal(ref, out) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_mul(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 * t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + out = nt1 * nt2 + self.nt_equal(ref, out) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_add_in_place(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 + t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + nt1 += nt2 + self.nt_equal(ref, nt1) + + @dtypes(torch.float, torch.float16) + @skipMeta + @torch.inference_mode() + def test_nested_tensor_mul_in_place(self, device, dtype): + (nt1, nt2) = self.random_nt_pair(device, dtype, 4, (4, 4)) + ref = torch.nested_tensor([t1 * t2 for (t1, t2) in zip(nt1.unbind(), nt2.unbind())]) + nt1 *= nt2 + self.nt_equal(ref, nt1) + +instantiate_device_type_tests(TestNestedTensorDeviceType, globals()) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_nn.py b/test/test_nn.py index 28f44c94f405..ddb7a47cd813 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -1,5 +1,6 @@ # Owner(s): ["module: nn"] +import contextlib import math import random import string @@ -14,6 +15,7 @@ from functools import reduce, partial from operator import mul from collections import OrderedDict +from tempfile import 
NamedTemporaryFile import torch @@ -22,6 +24,7 @@ torch.set_default_dtype(torch.double) from torch._six import inf, nan +import torch.autograd.forward_ad as fwAD import torch.backends.cudnn as cudnn import torch.nn as nn import torch.nn.functional as F @@ -34,12 +37,13 @@ from torch.nn import Parameter from torch.nn.parameter import UninitializedParameter, UninitializedBuffer from torch.nn.parallel._functions import Broadcast -from torch.testing._internal.common_dtype import integral_types, get_all_fp_dtypes, get_all_math_dtypes +from torch.testing._internal.common_dtype import integral_types, floating_types_and, get_all_math_dtypes, \ + floating_and_complex_types_and from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ - skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \ - get_function_arglist, load_tests, \ + skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \ + download_file, get_function_arglist, load_tests, skipIfMps,\ suppress_warnings, TemporaryFileName, TEST_WITH_UBSAN, IS_PPC, \ - parametrize as parametrize_test, subtest, instantiate_parametrized_tests + parametrize as parametrize_test, subtest, instantiate_parametrized_tests, set_default_dtype, IS_WINDOWS from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \ module_tests, criterion_tests, loss_reference_fns, \ @@ -52,6 +56,7 @@ from torch.nn import MultiheadAttention from hypothesis import given +from torch.testing import make_tensor import torch.testing._internal.hypothesis_utils as hu from torch.testing._internal.common_utils import _assertGradAndGradgradChecks, gradcheck, gradgradcheck, \ GRADCHECK_NONDET_TOL @@ -68,6 +73,7 @@ if TEST_SCIPY: from scipy import stats + import scipy.signal import scipy.ndimage if TEST_NUMPY: @@ -138,6 +144,21 @@ def test_wrong_order(self): RuntimeError, lambda: rnn_utils.pack_padded_sequence(b_a, [22, 25], enforce_sorted=True)) + def test_pad_sequence_with_tensor_sequences(self): + seq_tuple_input = torch.nn.utils.rnn.pad_sequence( + (torch.tensor([[7, 6]]), torch.tensor([[-7, -1]])) + ) + seq_tensor_input = torch.nn.utils.rnn.pad_sequence( + torch.tensor([[[7, 6]], [[-7, -1]]]) + ) + self.assertEqual(seq_tuple_input, seq_tensor_input) + self.assertEqual(seq_tuple_input.shape, torch.Size([1, 2, 2])) + + def test_pad_sequence_with_non_iterable_sequences(self): + msg = r"Expected iterable for input sequences, but got arg of type" + with self.assertRaisesRegex(RuntimeError, msg): + torch.nn.utils.rnn.pad_sequence(5) + def test_total_length(self): padded, lengths = self._padded_sequence(torch.FloatTensor) max_length = max(lengths) @@ -395,6 +416,13 @@ def __init__(self): return l, n, s + def test_parse_to(self): + # Test for buggy use of THPMemoryFormat_New + self.assertEqual( + repr(torch._C._nn._parse_to(memory_format=torch.contiguous_format)[3]), + "torch.contiguous_format" + ) + def test_requires_grad_(self): m = self._create_basic_net()[-1] assert len(list(m.buffers())) > 0, 'invalid test' @@ -876,7 +904,7 @@ def test_no_grad(self): self.assertRaises(RuntimeError, lambda: output2.backward(torch.ones(1, 5, 10, 10))) def test_invalid_conv1d(self): - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, 
torch.cfloat, torch.cdouble]: module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True).to(dtype) input = torch.randn(1, 3, 4).to(dtype) with self.assertRaisesRegex(RuntimeError, @@ -891,30 +919,32 @@ def test_invalid_conv1d(self): module(input) def test_mismatch_shape_conv2d(self): - x = torch.randn(1, 10, 1, 28, 28) - w = torch.randn(6, 1, 5, 5) + for dtype in (torch.float, torch.cfloat): + x = torch.randn(1, 10, 1, 28, 28, dtype=dtype) + w = torch.randn(6, 1, 5, 5, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, - r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' + - r'input of size: \[1, 10, 1, 28, 28\]'): + with self.assertRaisesRegex(RuntimeError, + r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' + + r'input of size: \[1, 10, 1, 28, 28\]'): - F.conv2d(x, w) + F.conv2d(x, w) def test_conv2d_discontiguous_weight(self): - # Test for https://github.com/pytorch/pytorch/issues/55781 - x = torch.ones(64, 16, 16, 16) - weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2)[:, :, :, ::2] - self.assertFalse(weight.is_contiguous()) - y = torch.nn.functional.conv2d(x, weight, None) - if torch.backends.mkldnn.is_available(): - # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used - with torch.backends.mkldnn.flags(enabled=False): - y_ = torch.nn.functional.conv2d(x, weight, None) - self.assertEqual(y, y_) - self.assertEqual(y.sum(), 4186112.) + for dtype in (torch.float, torch.cfloat): + # Test for https://github.com/pytorch/pytorch/issues/55781 + x = torch.ones(64, 16, 16, 16, dtype=dtype) + weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2).to(dtype)[:, :, :, ::2] + self.assertFalse(weight.is_contiguous()) + y = torch.nn.functional.conv2d(x, weight, None) + if torch.backends.mkldnn.is_available(): + # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used + with torch.backends.mkldnn.flags(enabled=False): + y_ = torch.nn.functional.conv2d(x, weight, None) + self.assertEqual(y, y_) + self.assertEqual(y.sum(), 4186112.) 
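[Editor's aside] The discontiguous-weight test above exists because step-slicing a weight yields a non-contiguous view, and conv2d must produce the same result it would for a contiguous copy regardless of which backend handles it. A minimal, hedged sketch of that property (numerics assumed to agree within default tolerances):

import torch
import torch.nn.functional as F

x = torch.ones(1, 16, 8, 8)
# Step-slicing the last dim produces a non-contiguous (32, 16, 1, 1) weight view.
w = torch.arange(0., 1024.).reshape(32, 16, 1, 2)[:, :, :, ::2]
assert not w.is_contiguous()

y = F.conv2d(x, w)
y_ref = F.conv2d(x, w.contiguous())
torch.testing.assert_close(y, y_ref)
print(y.shape)  # (1, 32, 8, 8)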
def test_invalid_conv2d(self): - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]: module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype) input = torch.empty(1, 1, 4, 4).to(dtype) self.assertRaises(RuntimeError, lambda: module(input)) @@ -939,7 +969,7 @@ def test_invalid_conv2d(self): module(input) def test_invalid_conv3d(self): - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]: module = torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype) input = torch.empty(1, 1, 4, 4, 4).to(dtype) self.assertRaises(RuntimeError, lambda: module(input)) @@ -1391,6 +1421,16 @@ def test_load_state_dict_invalid(self): "expected torch.Tensor or Tensor-like object from checkpoint but received"): m.load_state_dict(state_dict) + def test_load_state_dict_type(self): + m = nn.Module() + + with self.assertRaisesRegex(TypeError, + "Expected state_dict to be dict-like, got"): + m.load_state_dict("") + with self.assertRaisesRegex(TypeError, + "Expected state_dict to be dict-like, got"): + m.load_state_dict(2) + def test_buffer_not_persistent_load(self): m = nn.Module() m.register_buffer('buf', torch.rand(5), persistent=False) @@ -3153,6 +3193,40 @@ def forward(self, X): Y = model.weight self.assertEqual(id(X), id(Y)) + # FIXME: Rewrite this test using functions not depending on LAPACK + # and remove the `@skipIfNoLapack` (see #70995) + @skipIfNoLapack + def test_caching_parametrization_with_transfer_parametrizations_and_params(self): + r"""Test that transferring parametrizations doesn't cause issues with caching""" + class Skew(nn.Module): + def forward(self, X): + X = X.tril(-1) + return X - X.T + + class Orthogonal(nn.Module): + def forward(self, X): + Id = torch.eye(X.size(0), device=X.device) + return torch.linalg.solve(Id + X, Id - X) + + model = nn.Linear(5, 5) + parametrize.register_parametrization(model, "weight", Skew()) + parametrize.register_parametrization(model, "weight", Orthogonal()) + + to_model = nn.Linear(5, 5) + parametrize.transfer_parametrizations_and_params(model, to_model) + + with parametrize.cached(): + X = model.weight + Y = model.weight + self.assertEqual(id(X), id(Y)) + + A = to_model.weight + B = to_model.weight + self.assertEqual(id(A), id(B)) + + # test that the results are distinct objects for each module + self.assertNotEqual(id(A), id(X)) + def test_parametrization_same_training_mode(self): r"""Test training mode updated on parametrization registration""" class Identity(nn.Module): @@ -3168,6 +3242,220 @@ def forward(self, X): self.assertTrue(module.parametrizations.weight[0].training) self.assertTrue(module.parametrizations.weight[1].training) + def test_type_before_parametrizations(self): + r"""Test that type_before_parametrizations always retrieves original type""" + + class Identity(nn.Module): + def forward(self, X): + return X + + model = nn.Linear(5, 5) + original_type = type(model) + self.assertTrue( + parametrize.type_before_parametrizations(model) == original_type + ) + parametrize.register_parametrization(model, "weight", Identity()) + self.assertTrue( + parametrize.type_before_parametrizations(model) == original_type + ) + + def test_transfer_parametrizations_and_params(self): + r"""Test that all parametrizations and their associated parameters are transferred.""" + + class AddOne(nn.Module): + def forward(self, x): + return x + 1.0 + + class 
Double(nn.Module): + def forward(self, x): + return 2.0 * x + + def right_inverse(self, x): + return 0.5 * x + + class MinusOne(nn.Module): + def forward(self, x): + return x - 1.0 + + model = nn.Linear(5, 5) + parametrize.register_parametrization(model, "weight", AddOne()) + parametrize.register_parametrization(model, "weight", Double()) + parametrize.register_parametrization(model, "weight", MinusOne()) + hold_weight = model.weight + + to_model = nn.qat.Linear( + 5, 5, qconfig=torch.ao.quantization.get_default_qconfig() + ) + parametrize.transfer_parametrizations_and_params(model, to_model) + + # checks that final and original value are correct and the to_model is parametrized + self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight")) + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original, + to_model.parametrizations.weight.original, + ) + + # check that the transfer didn't affect the original value + self.assertEqual(hold_weight, model.weight) + + # testing that changes to one set of parametrizations do not affect the other + parametrize.remove_parametrizations(to_model, "weight") + self.assertFalse(torch.nn.utils.parametrize.is_parametrized(to_model, "weight")) + self.assertTrue(torch.nn.utils.parametrize.is_parametrized(model, "weight")) + + # also test that parameters that don't exist in to_model get transferred + model.test_param = Parameter(torch.randn(5, 5)) + + self.assertTrue(not hasattr(to_model, "test_param")) + parametrize.register_parametrization(model, "test_param", Double()) + hold_test_param = model.test_param + parametrize.transfer_parametrizations_and_params(model, to_model, "test_param") + + # check that previously missing params got transferred correctly + self.assertEqual(model.test_param, to_model.test_param) + self.assertEqual( + model.parametrizations.test_param.original, + to_model.parametrizations.test_param.original, + ) + + # check that the new transfer didn't change the value for the from_module + self.assertEqual(hold_test_param, model.test_param) + + def test_transfer_parametrizations_and_params_right_inverse(self): + r"""Test that all parametrizations and their associated parameters are transferred.""" + + class Double(nn.Module): + def forward(self, x): + return 2.0 * x + + def right_inverse(self, x): + return 0.5 * x + + model = nn.Linear(5, 5) + parametrize.register_parametrization(model, "weight", Double()) + hold_weight = model.weight + + to_model = nn.qat.Linear( + 5, 5, qconfig=torch.ao.quantization.get_default_qconfig() + ) + parametrize.transfer_parametrizations_and_params(model, to_model) + + # check that transfer occurs successfully + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original, + to_model.parametrizations.weight.original, + ) + + # check that transfer doesn't affect the from_model weight + self.assertEqual(hold_weight, model.weight) + + def test_transfer_parametrizations_and_params_single_param(self): + r"""Test that all parametrizations and their associated parameters are transferred.""" + + class AddOne(nn.Module): + def forward(self, x): + return x + 1.0 + + class Double(nn.Module): + def forward(self, x): + return 2.0 * x + + class MinusOne(nn.Module): + def forward(self, x): + return x - 1.0 + + model = nn.Linear(5, 5, bias=True) + parametrize.register_parametrization(model, "weight", AddOne()) + parametrize.register_parametrization(model, "weight", Double()) + 
parametrize.register_parametrization(model, "weight", MinusOne()) + parametrize.register_parametrization(model, "bias", AddOne()) + parametrize.register_parametrization(model, "bias", Double()) + parametrize.register_parametrization(model, "bias", MinusOne()) + + to_model = nn.qat.Linear( + 5, 5, bias=True, qconfig=torch.ao.quantization.get_default_qconfig() + ) + parametrize.transfer_parametrizations_and_params(model, to_model, "weight") + + # check that weight and only weight was transferred + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original, + to_model.parametrizations.weight.original, + ) + self.assertTrue("bias" not in to_model.parametrizations) + + # FIXME: Rewrite this test using functions not depending on LAPACK + # and remove the `@skipIfNoLapack` (see #70995) + @skipIfNoLapack + def test_transfer_parametrizations_and_params_many_to_one(self): + # A parametrization with several outputs + class RankOne(nn.Module): + def forward(self, x, y): + # Form a rank-1 matrix from a pair of vectors + return x.unsqueeze(-1) @ y.unsqueeze(-2) + + def right_inverse(self, Y): + # We project the given matrix onto the rank 1 matrices + U, S, Vh = torch.linalg.svd(Y, full_matrices=False) + # S is ordered in a decreasing way. + s0_sqrt = S[0].sqrt().unsqueeze(-1) + return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt + + class Double(nn.Module): + def forward(self, x): + return 2.0 * x + + model = nn.Linear(3, 3) + parametrize.register_parametrization(model, "weight", RankOne()) + parametrize.register_parametrization(model, "weight", Double()) + hold_weight = model.weight + + to_model = nn.qat.Linear( + 3, 3, qconfig=torch.ao.quantization.get_default_qconfig() + ) + + parametrize.transfer_parametrizations_and_params(model, to_model) + + # checks that final and original value are correct and the to_model is parametrized + self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight")) + self.assertEqual(model.weight, to_model.weight) + self.assertEqual( + model.parametrizations.weight.original0, + to_model.parametrizations.weight.original0, + ) + self.assertEqual( + model.parametrizations.weight.original1, + to_model.parametrizations.weight.original1, + ) + + # check that the transfer didn't affect the original value + self.assertEqual(hold_weight, model.weight) + + # testing that changes to one set of parametrizations do not affect the other + model.test_param = Parameter(torch.randn(3, 3)) + + self.assertTrue(not hasattr(to_model, "test_param")) + parametrize.register_parametrization(model, "test_param", RankOne()) + hold_test_param = model.test_param + parametrize.transfer_parametrizations_and_params(model, to_model, "test_param") + + # also check that previously missing params got transferred correctly + self.assertEqual(model.test_param, to_model.test_param) + self.assertEqual( + model.parametrizations.test_param.original0, + to_model.parametrizations.test_param.original0, + ) + self.assertEqual( + model.parametrizations.test_param.original1, + to_model.parametrizations.test_param.original1, + ) + + # check that the new transfer didn't change the value for the from_module + self.assertEqual(hold_test_param, model.test_param) + # torch/nn/utils/prune.py @unittest.skipIf(not TEST_NUMPY, "numpy not found") def test_validate_pruning_amount_init(self): @@ -4131,37 +4419,38 @@ def check_weight_norm(l, name, num_params): def test_weight_norm(self): - input = torch.randn(3, 5) - m = nn.Linear(5, 7) - expected_output = m(input) 
- - # add weight normalization - m = torch.nn.utils.weight_norm(m) - self.assertEqual(m.weight_v.size(), m.weight.size()) - self.assertEqual(m.weight_g.size(), (7, 1)) - self.assertEqual(m(input), expected_output) - - # remove weight norm - m = torch.nn.utils.remove_weight_norm(m) - self.assertFalse(hasattr(m, 'weight_g')) - self.assertFalse(hasattr(m, 'weight_v')) - self.assertEqual(m(input), expected_output) - - # test with dim=1 - m = torch.nn.utils.weight_norm(m, dim=1) - self.assertEqual(m.weight_v.size(), m.weight.size()) - self.assertEqual(m.weight_g.size(), (1, 5)) - self.assertEqual(m(input), expected_output) - - # test with dim=None - m = nn.Linear(5, 7) - expected_output = m(input) - m = torch.nn.utils.weight_norm(m, dim=None) - self.assertEqual(m(input), expected_output) + for dtype in [torch.float, torch.bfloat16]: + input = torch.randn(3, 40, dtype=dtype) + m = nn.Linear(40, 50).to(dtype=dtype) + expected_output = m(input) - with self.assertRaisesRegex(RuntimeError, 'register two weight_norm hooks'): - m = torch.nn.utils.weight_norm(m) + # add weight normalization m = torch.nn.utils.weight_norm(m) + self.assertEqual(m.weight_v.size(), m.weight.size()) + self.assertEqual(m.weight_g.size(), (50, 1)) + self.assertEqual(m(input), expected_output, atol=dtype2prec_DONTUSE[dtype], rtol=0) + + # remove weight norm + m = torch.nn.utils.remove_weight_norm(m) + self.assertFalse(hasattr(m, 'weight_g')) + self.assertFalse(hasattr(m, 'weight_v')) + self.assertEqual(m(input), expected_output, atol=dtype2prec_DONTUSE[dtype], rtol=0) + + # test with dim=1 + m = torch.nn.utils.weight_norm(m, dim=1) + self.assertEqual(m.weight_v.size(), m.weight.size()) + self.assertEqual(m.weight_g.size(), (1, 40)) + self.assertEqual(m(input), expected_output, atol=dtype2prec_DONTUSE[dtype], rtol=0) + + # test with dim=None + m = nn.Linear(40, 50).to(dtype=dtype) + expected_output = m(input) + m = torch.nn.utils.weight_norm(m, dim=None) + self.assertEqual(m(input), expected_output) + + with self.assertRaisesRegex(RuntimeError, 'register two weight_norm hooks'): + m = torch.nn.utils.weight_norm(m) + m = torch.nn.utils.weight_norm(m) def test_parameterlistdict_setting_attributes(self): with warnings.catch_warnings(record=True) as w: @@ -4807,7 +5096,7 @@ def assert_weight_allclose_Q(weight, W): (torch.float32, torch.complex64), (True, False)): # Conv2d does not support complex yet - if not use_linear and dtype.is_complex: + if not use_linear: continue if use_linear: @@ -5161,8 +5450,52 @@ def test_FeatureAlphaDropout(self): def test_pad_scalar_error(self): inputs = torch.tensor(0., requires_grad=True) - self.assertRaises(AssertionError, lambda: F.pad(inputs, (1, 1))) - self.assertRaises(AssertionError, lambda: F.pad(inputs, (1,))) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (1, 1))) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (1,))) + + def test_nested_tensor_from_mask(self): + N, L, D = 10, 12, 14 + + input = torch.rand(N, L, D) + mask = torch.ones(N, L, dtype=torch.bool) + # Leave first row be all True to maintain the nt's size unchanged + for i in range(1, N): + end = torch.randint(1, L, size=()).item() + mask[i, end:] = False + + nt = torch._nested_tensor_from_mask(input, mask) + input_convert = nt.to_padded_tensor(0.) + input.masked_fill_(mask.reshape(N, L, 1).logical_not(), 0.) 
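[Editor's aside] For context on the reparameterisation the rewritten weight-norm test exercises: weight_norm replaces weight with a magnitude tensor weight_g and a direction tensor weight_v, and recomputes weight = g * v / ||v|| along the chosen dim. The sketch below checks that identity for the default dim=0 on a Linear layer; it illustrates the standard decomposition, not anything version-specific.

import torch

m = torch.nn.utils.weight_norm(torch.nn.Linear(5, 7))  # default dim=0
# weight_g: (7, 1) per-row magnitude, weight_v: (7, 5) direction
recomputed = m.weight_g * m.weight_v / m.weight_v.norm(dim=1, keepdim=True)
torch.testing.assert_close(recomputed, m.weight)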
+ + self.assertEqual(input, input_convert) + + def test_nested_tensor_from_mask_error(self): + N, L, D = 10, 12, 14 + + input = torch.rand(N, L, D) + # Mask is not bool + mask = torch.zeros(N, L, dtype=torch.float) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Mask size is not 2 + mask = torch.zeros(N, L, D, dtype=torch.bool) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Input size is not 3 + mask = torch.zeros(N, L, dtype=torch.bool) + input = torch.rand(N, L) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Mask size does not match input + mask = torch.zeros(N + 1, L + 1, dtype=torch.bool) + input = torch.rand(N, L, D) + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) + + # Mask is not padding format + mask = torch.ones(N, L, dtype=torch.bool) + mask[0, 0] = False + mask[0, 2] = False + self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) @unittest.skipIf(not TEST_NUMPY, "numpy not found") @parametrize_test("average_attn_weights", [True, False]) @@ -5487,6 +5820,32 @@ def test_multihead_attn_3d_attn_mask(self): # output_2d in shape of [T, 1, D] self.assertEqual(output_3d[i].unsqueeze(0).transpose(0, 1), output_2d) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_self_attn_TxT_attn_mask(self): + embed_dim = 16 + num_heads = 4 + batch_size = 10 + tgt_len = 16 + + query = torch.rand(batch_size, tgt_len, embed_dim, device="cuda") # [N, T, D] + attn_mask = torch.randint(0, 2, (tgt_len, tgt_len)).cuda().float() # [T, T] + attn_mask = attn_mask.masked_fill(attn_mask == 0, float('-inf')).masked_fill(attn_mask == 1, float(0.0)) + + attn_mask_4d = attn_mask.expand(batch_size, num_heads, tgt_len, tgt_len) + + mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True).cuda() + mta_model.eval() + + # Generate 3D results + with torch.inference_mode(): + output_mask_4d = mta_model(query, query, query, attn_mask=attn_mask_4d)[0] + output_mask_4d = output_mask_4d.transpose(0, 1) # [N, T, D] + + output_mask_TxT = mta_model(query, query, query, attn_mask=attn_mask)[0] + output_mask_TxT = output_mask_TxT.transpose(0, 1) # [N, T, D] + + self.assertEqual(output_mask_4d, output_mask_TxT) + def test_multihead_attn_no_bias(self): embed_dim = 8 num_heads = 4 @@ -5496,9 +5855,7 @@ def test_multihead_attn_no_bias(self): self.assertIsNone(mha.in_proj_bias) self.assertIsNone(mha.out_proj.bias) - def test_multihead_attn_invalid_shape(self): - mha = torch.nn.MultiheadAttention(3, 3) - + def _test_multihead_attn_invalid_shape_impl(self, mha): # Batched (3D) query cases query = torch.randn(3, 3, 3) key = torch.randn(3, 3, 3) @@ -5554,6 +5911,113 @@ def test_multihead_attn_invalid_shape(self): with self.assertRaisesRegex(AssertionError, msg): mha(query, key, value, attn_mask=torch.randn(4, 3, 3).bernoulli_().to(torch.bool)) + def test_multihead_attn_invalid_shape(self): + mha = torch.nn.MultiheadAttention(3, 3) + self._test_multihead_attn_invalid_shape_impl(mha) + # Give the test a chance to hit the fast path. (Right now, it + # won't, but gating may be less restricted in the future.) 
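[Editor's aside] A side note on the TxT-mask test above: it converts a 0/1 mask into the additive float form that attention consumes, with 0 where attention is allowed and -inf where it is blocked, and then relies on a [T, T] mask applying identically to every batch element and head. A tiny, hedged sketch of just that conversion (tensor names are illustrative):

import torch

T = 4
keep = torch.randint(0, 2, (T, T), dtype=torch.bool)  # True = attend, False = block
additive = torch.zeros(T, T).masked_fill(~keep, float('-inf'))
print(additive)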
+ with torch.no_grad(): + self._test_multihead_attn_invalid_shape_impl(mha.eval()) + + @torch.no_grad() + def test_multihead_attn_fast_path_invalid_shape(self): + mha = torch.nn.MultiheadAttention(3, 3, batch_first=True).eval() + + # Batched (3D) query cases + query = torch.randn(3, 3, 3) + key = torch.randn(3, 3, 3) + value = torch.randn(3, 3, 3) + + # Currently, this case will just go to the slow path and get + # the usual message because it fails the requirement to be + # batched. + msg = "expected `key` and `value` to be 3-D but found 2-D and 3-D tensors respectively" + # 3D query, 2D key and 3D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, torch.randn(3, 3), value, need_weights=False) + + # Currently, this case will just go to the slow path and get + # the usual message because it fails the requirement to be + # batched. + msg = "expected `key` and `value` to be 3-D but found 3-D and 2-D tensors respectively" + # 3D query, 3D key and 2D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, torch.randn(3, 3), need_weights=False) + + msg = "expected `key_padding_mask` to be `None` or 2-D but found 1-D tensor instead" + # 3D query, 3D key, 3D value and 1D key_padding_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, key_padding_mask=torch.tensor([False, True, True], dtype=torch.bool), need_weights=False) + + msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead" + # 3D query, 3D key, 3D value and 1D attn_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, attn_mask=torch.tensor([False, True, True], dtype=torch.bool), need_weights=False) + + # Unbatched (2D) query cases + # NOTE: error messages are the same as regular path because the fast path doesn't support 2D. + query = torch.randn(3, 3) + key = torch.randn(3, 3) + value = torch.randn(3, 3) + + msg = "expected `key` and `value` to be 2-D but found 3-D and 2-D tensors respectively" + # 2D query, 3D key and 2D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, torch.randn(3, 3, 3), value) + + msg = "expected `key` and `value` to be 2-D but found 2-D and 3-D tensors respectively" + # 2D query, 3D key and 2D value + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, torch.randn(3, 3, 3)) + + msg = "expected `key_padding_mask` to be `None` or 1-D but found 2-D tensor instead" + # 2D query, 2D key, 2D value and 1D key_padding_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, key_padding_mask=torch.tensor([[False, True, True] * 2], dtype=torch.bool)) + + msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead" + # 2D query, 2D key, 2D value and 1D attn_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, attn_mask=torch.tensor([False, True, True], dtype=torch.bool)) + + msg = r"Expected `attn_mask` shape to be \(3, 3, 3\)" + # 2D query, 2D key, 2D value and 3D incorrect attn_mask + with self.assertRaisesRegex(AssertionError, msg): + mha(query, key, value, attn_mask=torch.randn(4, 3, 3).bernoulli_().to(torch.bool)) + + def test_multihead_attn_nested_tensor_outside_fast_path(self): + mha = torch.nn.MultiheadAttention(3, 3, batch_first=True).eval() + nt = torch.nested_tensor([torch.randn(3, 3)]) + # One tested platform (linux-bionic-py3.7-clang) has a torch_function for one + # or more of these. Take advantage of that to test the torch_function bailout. 
+ has_torch_func = torch.overrides.has_torch_function( + (nt, mha.in_proj_weight, mha.in_proj_bias, mha.out_proj.weight, mha.out_proj.bias)) + if has_torch_func: + msg = "MultiheadAttention does not support NestedTensor.*argument has_torch_function" + else: + msg = ("MultiheadAttention does not support NestedTensor outside of its fast path.*grad is " + + "enabled and.*or biases requires_grad") + with self.assertRaisesRegex(AssertionError, msg): + mha(nt, nt, nt) + + if has_torch_func: + # Just give up, they're all going to fail with the same message. + return + + with torch.no_grad(): + mha(nt, nt, nt) + with torch.inference_mode(): + mha(nt, nt, nt) + nt = torch.nested_tensor([torch.randn(3, 3, requires_grad=False)]) + nt.requires_grad = False + with self.assertRaisesRegex(AssertionError, msg): + mha(nt, nt, nt) + mha.in_proj_weight.requires_grad = False + mha.in_proj_bias.requires_grad = False + mha.out_proj.weight.requires_grad = False + mha.out_proj.bias.requires_grad = False + mha(nt, nt, nt) + def test_normalize(self): inputs = torch.randn(1, 3, 4, 4, requires_grad=True) self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,))) @@ -5795,6 +6259,9 @@ def test_state_dict(self): self.assertEqual(state_dict['weight'].data_ptr(), l.weight.data_ptr()) self.assertEqual(state_dict['bias'].data_ptr(), l.bias.data_ptr()) + # Reference https://github.com/pytorch/pytorch/pull/75507#issuecomment-1110291545 + self.assertNotWarn(lambda: l.state_dict(destination=dict()), "Should not warn kwarg destination w/o _metadata") + def test_load_state_dict(self): l = nn.Linear(5, 5) block = nn.Module() @@ -6289,7 +6756,7 @@ def test(should_raise, module, input_size, dtype): # just run it to ensure no exception raised. module(input) - for dtype in [torch.bfloat16, torch.float, torch.double]: + for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]: # Conv1d test(True, nn.Conv1d(1, 1, 3).to(dtype), (1, 2), dtype) test(True, nn.Conv1d(1, 1, 3, stride=2).to(dtype), (1, 2), dtype) @@ -6365,8 +6832,6 @@ def test_ConvTranspose2d_half_cublas_gemm(self): output = deconv(inputs) output.mean().backward() - - @skipIfRocm # For https://github.com/pytorch/pytorch/pull/1273 # Almost identical to the above `test_Conv2d_naive_groups` def test_Conv2d_groups_nobias(self): @@ -6406,7 +6871,6 @@ def test_Conv2d_groups_nobias(self): # Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16 # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686 # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024 - @skipIfRocm def test_Conv2d_groups_nobias_v2(self): torch.manual_seed(123) dev_dtypes = [("cpu", torch.float)] @@ -7194,187 +7658,6 @@ def test_Transformer_cell(self): memory_key_padding_mask=memory_key_padding_mask) output.sum().backward() - def test_transformerencoderlayer(self): - # this is a deterministic test for TransformerEncoderLayer - d_model = 4 - nhead = 2 - dim_feedforward = 16 - dropout = 0.0 - bsz = 2 - - for batch_first in (False, True): - def perm_fn(x): - return x.transpose(1, 0) if batch_first else x - - model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, - batch_first=batch_first) - - # set constant weights of the model - for idx, p in enumerate(model.parameters()): - x = p.data - sz = x.view(-1).size(0) - shape = x.shape - x = torch.cos(torch.arange(0, sz).float().view(shape)) - p.data.copy_(x) - - # deterministic input - encoder_input = 
torch.tensor([[[20., 30., 40., 50.]]]) - result = model(encoder_input) - ref_output = torch.tensor([[[2.258703, 0.127985, -0.697881, 0.170862]]]) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # 0 values are NOT masked. This shouldn't mask anything. - mask = torch.tensor([[0]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # 1 values are masked. Since there is only 1 input embedding this - # will result in nan. - mask = torch.tensor([[1]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertTrue(np.isnan(result).all()) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], - [[5., 6., 7., 8.]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.272644, 0.119035, -0.691669, 0.153486]], - [[2.272644, 0.119035, -0.691669, 0.153486]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # all 0 which is no masking - mask = torch.tensor([[0, 0]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - mask = torch.tensor([[1, 0]]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - ref_output = perm_fn(torch.tensor([[[2.301516, 0.092249, -0.679101, 0.103088]], - [[2.301516, 0.092249, -0.679101, 0.103088]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], - [0.5387, 0.1655, 0.3565, 0.0471]], - [[0.8335, 0.2799, 0.5031, 0.2947], - [0.1402, 0.0318, 0.7636, 0.1346]], - [[0.6333, 0.9344, 0.1376, 0.9938], - [0.8924, 0.2872, 0.6692, 0.2944]], - [[0.9897, 0.6915, 0.3154, 0.1733], - [0.8645, 0.3513, 0.3064, 0.0767]], - [[0.8117, 0.2366, 0.4838, 0.7881], - [0.3718, 0.4945, 0.9511, 0.0864]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.428589, 0.020835, -0.602055, -0.085249], - [2.427987, 0.021213, -0.602496, -0.084103]], - [[2.424689, 0.019155, -0.604793, -0.085672], - [2.413863, 0.022211, -0.612486, -0.072490]], - [[2.433774, 0.021598, -0.598343, -0.087548], - [2.425104, 0.019748, -0.604515, -0.084839]], - [[2.436185, 0.022682, -0.596625, -0.087261], - [2.433556, 0.021891, -0.598509, -0.086832]], - [[2.416246, 0.017512, -0.610712, -0.082961], - [2.422901, 0.024187, -0.606178, -0.074929]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - # all 0 - mask = torch.zeros([2, 5]) == 1 - result = model(encoder_input, src_key_padding_mask=mask) - result = result.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, 
atol=1e-5) - mask[0, 1] = 1 - mask[1, 3] = 1 - mask[1, 4] = 1 - result = model(encoder_input, src_key_padding_mask=mask) - ref_output = perm_fn(torch.tensor([[[2.429026, 0.020793, -0.601741, -0.085642], - [2.428811, 0.021445, -0.601912, -0.084252]], - [[2.425009, 0.019155, -0.604566, -0.085899], - [2.415408, 0.02249 , -0.611415, -0.073]], - [[2.434199, 0.021682, -0.598039, -0.087699], - [2.42598, 0.019941, -0.603896, -0.085091]], - [[2.436457, 0.022736, -0.59643 , -0.08736], - [2.434021, 0.022093, -0.598179, -0.08679]], - [[2.416531, 0.017498, -0.610513, -0.083181], - [2.4242, 0.024653, -0.605266, -0.074959]]])) - result = result.detach().numpy() - ref_output = ref_output.detach().numpy() - self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) - np.testing.assert_allclose(result, ref_output, atol=1e-5) - - def test_transformerencoderlayer_gelu(self): - # this is a deterministic test for TransformerEncoderLayer with gelu activation - d_model = 4 - nhead = 2 - dim_feedforward = 16 - dropout = 0.0 - bsz = 2 - - for activation, batch_first in product(('gelu', F.gelu, nn.GELU()), (True, False)): - def perm_fn(x): - return x.transpose(1, 0) if batch_first else x - - model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, - activation, batch_first=batch_first) - - # set constant weights of the model - for idx, p in enumerate(model.parameters()): - x = p.data - sz = x.view(-1).size(0) - shape = x.shape - x = torch.cos(torch.arange(0, sz).float().view(shape)) - p.data.copy_(x) - - # deterministic input - encoder_input = torch.tensor([[[20., 30., 40., 50.]]]) - result = model(encoder_input) - ref_output = torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]]) - torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], - [[5., 6., 7., 8.]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]], - [[2.264103, 0.121417, -0.696012, 0.159724]]])) - torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) - - # deterministic input - encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], - [0.5387, 0.1655, 0.3565, 0.0471]], - [[0.8335, 0.2799, 0.5031, 0.2947], - [0.1402, 0.0318, 0.7636, 0.1346]], - [[0.6333, 0.9344, 0.1376, 0.9938], - [0.8924, 0.2872, 0.6692, 0.2944]], - [[0.9897, 0.6915, 0.3154, 0.1733], - [0.8645, 0.3513, 0.3064, 0.0767]], - [[0.8117, 0.2366, 0.4838, 0.7881], - [0.3718, 0.4945, 0.9511, 0.0864]]])) - result = model(encoder_input) - ref_output = perm_fn(torch.tensor([[[2.42163188, 0.03227153, -0.60714219, -0.05908082], - [2.42151276, 0.03302179, -0.60722523, -0.05762651]], - [[2.41926761, 0.02974034, -0.60879519, -0.0621269], - [2.41626395, 0.03539356, -0.61087842, -0.04978623]], - [[2.42382808, 0.03218872, -0.6055963, -0.06073591], - [2.41983477, 0.03085259, -0.60840145, -0.06046414]], - [[2.42500749, 0.03328855, -0.60476388, -0.0595334], - [2.4237977, 0.03290575, -0.60561789, -0.05940082]], - [[2.41383916, 0.02686345, -0.61256377, -0.06380707], - [2.42000277, 0.03800944, -0.60824798, -0.04754947]]])) - torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0) - def test_transformerdecoderlayer(self): # this is a deterministic test for TransformerDecoderLayer d_model = 4 @@ -7633,7 +7916,7 @@ def get_a_test_layer(use_cuda, activation, batch_first=False): use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") - for batch_first in 
(True, False): + def _test(batch_first, training): def perm_fn(x): return x.transpose(1, 0) if batch_first else x @@ -7641,6 +7924,8 @@ def perm_fn(x): batch_first=batch_first) model = nn.TransformerEncoder(encoder_layer, 1).to(device) + if not training: + model = model.eval() # deterministic input encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], @@ -7694,6 +7979,8 @@ def perm_fn(x): # test case 2, multiple layers no norm model = nn.TransformerEncoder(encoder_layer, 2).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[2.419051, 0.017446, -0.608738, -0.085003], [2.419102, 0.017452, -0.608703, -0.085026]], @@ -7710,6 +7997,8 @@ def perm_fn(x): torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) model = nn.TransformerEncoder(encoder_layer, 6).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[2.419101, 0.017453, -0.608703, -0.085025], [2.419101, 0.017453, -0.608704, -0.085025]], @@ -7729,6 +8018,8 @@ def perm_fn(x): # d_model = 4 norm = nn.LayerNorm(4) model = nn.TransformerEncoder(encoder_layer, 2, norm=norm).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[1.695949, -0.357635, -0.893077, -0.445238], [1.695955, -0.357639, -0.893050, -0.445266]], @@ -7745,6 +8036,8 @@ def perm_fn(x): torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) model = nn.TransformerEncoder(encoder_layer, 6, norm=norm).to(device) + if not training: + model = model.eval() result = model(encoder_input, src_key_padding_mask=mask) ref_output = perm_fn(torch.tensor([[[1.695955, -0.357639, -0.893051, -0.445265], [1.695955, -0.357639, -0.893051, -0.445265]], @@ -7759,7 +8052,15 @@ def perm_fn(x): )).to(device) self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=1e-5) - + for batch_first in (True, False): + for training in (True, False): + # Fast path requires inference mode. 
+ if training: + cm = contextlib.nullcontext() + else: + cm = torch.no_grad() + with cm: + _test(batch_first, training) def test_transformerdecoder(self): def get_a_test_layer(use_cuda, activation, batch_first=False): @@ -9142,6 +9443,28 @@ def test_pixel_shuffle_unshuffle_5D(): test_pixel_shuffle_unshuffle_4D() test_pixel_shuffle_unshuffle_5D() + def test_pixel_shuffle_nhwc_cpu(self): + input = torch.randn(3, 18, 4, 4, device='cpu') + input = input.contiguous(memory_format=torch.channels_last).requires_grad_() + grad = torch.randn(3, 18, 4, 4, device='cpu') + ps = torch.nn.PixelShuffle(3) + pus = torch.nn.PixelUnshuffle(3) + + ref_input = input.detach().clone().contiguous().requires_grad_(True) + ref_grad = grad.detach().clone().contiguous() + ref_ps = torch.nn.PixelShuffle(3) + ref_pus = torch.nn.PixelUnshuffle(3) + + out = pus(ps(input)) + out.backward(grad) + ref_out = ref_pus(ref_ps(ref_input)) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out) + self.assertEqual(input.grad, ref_input.grad) + # These tests should be OpInfo'd def test_elu_inplace_on_view(self): v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) @@ -9179,55 +9502,15 @@ def func(root): gradcheck(func, [v]) gradgradcheck(func, [v]) - @unittest.skipIf(not TEST_CUDA, 'CUDA not available') def test_PReLU_backward_requires_grad_false(self): - m = nn.PReLU().to('cuda') - x = torch.randn(2, 3, 4, 5, requires_grad=False, device='cuda') - y = m(x) - y.mean().backward() - self.assertEqual(x.grad, None) - - @unittest.skipIf( - not TEST_NUMPY or not TEST_SCIPY, "Numpy or Scipy not found") - def test_gelu(self): - def _test_gelu(n, m, dtype, contiguous, atol=None, rtol=None): - numpy_dtype = { - torch.bfloat16: torch.float, torch.float: torch.float, torch.double: torch.double - }[dtype] - devices = ['cpu'] - devices += ['cuda'] if TEST_CUDA else [] - - def _gelu_ref(X): - return X * stats.norm.cdf(X) - - for d in devices: - if contiguous: - X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d) - else: - X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d)[:, ::2] - res = F.gelu(X) - ref = _gelu_ref(X.to(numpy_dtype).cpu().detach().numpy()) - self.assertEqual(res, ref, rtol=rtol, atol=atol, exact_dtype=False) - if dtype == torch.float64: - gradcheck(F.gelu, [X], eps=1e-4) - - for n in range(1, 10): - for m in range(1, 10): - _test_gelu(n, m, torch.bfloat16, True, 1e-2, 0) - _test_gelu(n, m, torch.bfloat16, False, 1e-2, 0) - _test_gelu(n, m, torch.float32, True) - _test_gelu(n, m, torch.float32, False) - _test_gelu(n, m, torch.float64, True) - _test_gelu(n, m, torch.float64, False) - - # Test multi threaded - num_threads = torch.get_num_threads() - torch.set_num_threads(4) - try: - _test_gelu(32, 32, torch.float32, False) - finally: - torch.set_num_threads(num_threads) - + devices = ['cpu'] + devices += ['cuda'] if TEST_CUDA else [] + for d in devices: + m = nn.PReLU().to(d) + x = torch.randn(2, 3, 4, 5, device=d, requires_grad=False) + y = m(x) + y.mean().backward() + self.assertEqual(x.grad, None) def test_bce_loss_always_nonnegative(self): target = torch.ones(5) @@ -9445,22 +9728,26 @@ def test_hardtanh_backward(self): self.assertEqual(x.grad, x_grad_ref) def test_batchnorm_nhwc_cpu(self): - def helper(self, size): + def helper(self, size, dtype, mixed_dtype=False): channels = size[1] - input = torch.randn(size, dtype=torch.float32, device='cpu', requires_grad=True) - input = 
input.contiguous(memory_format=torch.channels_last) + input = torch.randn(size, dtype=dtype, device='cpu', requires_grad=True) + input = input.contiguous(memory_format=torch.channels_last).to(dtype) input.retain_grad() - grad = torch.randn(size, dtype=torch.float32, device='cpu') + grad = torch.randn(size, dtype=dtype, device='cpu') grad = grad.contiguous(memory_format=torch.channels_last) - bn = nn.BatchNorm2d(channels).cpu().float() + bn = nn.BatchNorm2d(channels).cpu().to(dtype) bn.weight.data.uniform_() bn.bias.data.uniform_() ref_input = input.detach().clone().contiguous().requires_grad_(True) ref_grad = grad.detach().clone().contiguous() - ref_bn = nn.BatchNorm2d(channels).cpu().float() + ref_bn = nn.BatchNorm2d(channels).cpu().to(dtype) ref_bn.load_state_dict(bn.state_dict()) + if mixed_dtype: + bn.float() + ref_bn.float() + out = bn(input) out.backward(grad) ref_out = ref_bn(ref_input) @@ -9473,9 +9760,11 @@ def helper(self, size): self.assertEqual(bn.bias.grad, ref_bn.bias.grad) self.assertEqual(input.grad, ref_input.grad) - helper(self, (4, 8, 10, 10)) - helper(self, (4, 1, 9, 9)) - helper(self, (4, 9, 1, 1)) + # test NC11 and N1HW; test mixed dtype + for shape in [(4, 8, 10, 10), (4, 1, 9, 9), (4, 9, 1, 1)]: + helper(self, shape, torch.float, False) + helper(self, shape, torch.bfloat16, False) + helper(self, shape, torch.bfloat16, True) def test_batchnorm_non_contig_cpu(self): input = torch.arange(6, dtype=torch.float).reshape(1, 3, 2, 1).cpu() @@ -9603,6 +9892,29 @@ def test_batchnorm_raises_error_if_bias_is_not_same_size_as_input(self): with self.assertRaises(RuntimeError): F.batch_norm(input, running_mean, running_var, bias=Parameter(torch.rand(size))) + def test_batchnorm_raises_error_if_running_var_or_running_mean_have_forward_grad(self): + args = ( + torch.randn(3, 2, 5), # input + torch.randn(2), # running_mean + torch.randn(2), # running_var + ) + kwargs = {'training': False, 'momentum': -1.2} + fn = partial(F.batch_norm, **kwargs) + + for dual_indices in ((0,), (1,), (1, 2), (0, 1), (0, 1, 2),): + tangents = tuple(torch.rand_like(x) for x in args) + + with fwAD.dual_level(): + duals = [fwAD.make_dual(primal, tangent) if i in dual_indices else primal + for i, (primal, tangent) in enumerate(zip(args, tangents))] + msg = "batch_norm is not differentiable wrt running_mean and running_var" + # 0 needs to have forward grad because otherwise we won't even run batch_norm_jvp + if (1 in dual_indices or 2 in dual_indices) and 0 in dual_indices: + with self.assertRaisesRegex(RuntimeError, msg): + fn(*duals) + else: + fn(*duals) + def test_batchnorm_buffer_update_when_stats_are_not_tracked(self): input_size = (32, 4) # Instantiate BN with buffers that are not None @@ -9689,22 +10001,6 @@ def func(x): # just run a single backward, as gradcheck/gradgradcheck is expensive here output.sum().backward() - def test_binary_cross_entropy_grads(self): - import torch.nn.functional as F - for device in device_(): - input = torch.rand(3, 3, dtype=torch.double, device=device, requires_grad=True) - target = torch.rand(3, 3, dtype=torch.double, device=device) - - gradcheck(F.binary_cross_entropy, [input, target]) - gradgradcheck(F.binary_cross_entropy, [input, target]) - - # now with diffentiable target - target.requires_grad_(True) - gradcheck(F.binary_cross_entropy, [input, target], check_batched_grad=False) - # no double backward for target yet - with self.assertRaisesRegex(RuntimeError, "not implemented"): - gradgradcheck(F.binary_cross_entropy, [input, target], check_batched_grad=False) - def 
test_cosine_embedding_loss_with_diff_type(self): for device in device_(): input1 = torch.tensor([[2, 3, 4], [6, 2, 4]], dtype=torch.double, device=device) @@ -9957,11 +10253,20 @@ def test_cosine_similarity(self): self.assertLessEqual(out, 1.0) # Check dividing by 0. + # previous behavior: <x,y> / max(eps, ||x|| * ||y||) + # current: <x,y> / (max(eps, ||x||) * max(eps, ||y||)) + # if f(x,y) is the cosine similarity, then + # df/dx = y/(||x|| * ||y||) - (x * <x,y> * ||y||/||x||)/(||x|| * ||y||)^2 + # the tests below check division by zero in the backward formula when + # x := input2 = 0, y := input1 != 0. + # For these inputs the gradient wrt x simplifies to g(x,y) := y/(||x|| * ||y||) + # Previous test checks g(x,y) == y/eps, + # Current test checks g(x,y) == (y/||y||)/eps. input1 = torch.randn(10).requires_grad_() input2 = torch.zeros_like(input1).requires_grad_() torch.cosine_similarity(input1, input2, 0).sum().backward() self.assertEqual(input1.grad, torch.zeros_like(input1)) - self.assertEqual(input2.grad, input1 * 1e8) + self.assertEqual(input2.grad, input1 / input1.norm() * 1e8) # Check type promotion, issue #61454 input = torch.tensor(12.) @@ -9981,10 +10286,10 @@ def test_grid_sample_error_checking(self): with self.assertRaisesRegex(ValueError, "but got: 'garbage'"): F.grid_sample(input, grid, padding_mode='garbage', align_corners=False) - with self.assertRaisesRegex(RuntimeError, "expected 4D or 5D input"): + with self.assertRaisesRegex(RuntimeError, "expected grid to have size 1 in last dimension"): F.grid_sample(input[0], grid, align_corners=False) - with self.assertRaisesRegex(RuntimeError, "grid with same number of dimensions"): + with self.assertRaisesRegex(RuntimeError, "expected grid to have size 2 in last dimension"): F.grid_sample(input, torch.empty(1, 1, 1, 1, 3), align_corners=False) with self.assertRaisesRegex(RuntimeError, "expected grid and input to have same batch size"): @@ -10000,7 +10305,7 @@ def test_grid_sample_error_checking(self): F.grid_sample(torch.empty(1, 1, 2, 2, 2), torch.empty(1, 1, 1, 1, 3), mode='bicubic') if TEST_CUDA: - with self.assertRaisesRegex(RuntimeError, "expected input and grid to be on same device"): + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): F.grid_sample(input.cuda(), grid, align_corners=False) def test_affine_grid_error_checking(self): @@ -10077,7 +10382,6 @@ def test_affine_grid_error_checking(self): with self.assertRaisesRegex(NotImplementedError, "affine_grid only supports 4D and 5D sizes"): F.affine_grid(theta, torch.Size([1, 1, 2, 2, 2, 2]), align_corners=False) - @skipIfRocm def test_grid_sample(self): # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, # so we test both cases.
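The comment block added in the hunk above derives df/dx for cosine similarity. As a minimal standalone sketch (not part of the patch; tensor names are illustrative, and norms are assumed to exceed eps so the clamping never triggers), the analytic formula can be checked against autograd:

import torch

x = torch.randn(10, dtype=torch.double, requires_grad=True)
y = torch.randn(10, dtype=torch.double)

# Scalar cosine similarity along dim 0; backward() populates x.grad.
torch.cosine_similarity(x, y, dim=0).backward()

# Analytic gradient from the comment: y/(||x||*||y||) - x*<x,y>/(||x||^3*||y||).
with torch.no_grad():
    analytic = y / (x.norm() * y.norm()) - x * torch.dot(x, y) / (x.norm() ** 3 * y.norm())

torch.testing.assert_close(x.grad, analytic)

At x = 0 the eps-clamped denominator takes over, which is why the updated assertion above expects input2.grad to equal input1 / input1.norm() * 1e8, i.e. (y/||y||)/eps with the default eps of 1e-8.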
@@ -10426,26 +10730,24 @@ def get_grid(device='cpu', data=None): W = random.randint(2, 8) input = torch.randn(N, C, H, W, requires_grad=True) grid = torch.randn(N, H, W, 2, requires_grad=True) - self.assertTrue(gradcheck( - lambda inp, grid: F.grid_sample(inp, grid, mode=mode, padding_mode=padding_mode, - align_corners=align_corners), - (input, grid))) - input = input.requires_grad_(False) - self.assertTrue(gradcheck( - lambda grid: F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode, - align_corners=align_corners), - (grid,))) for input_requires_grad in [False, True]: + input.requires_grad_(input_requires_grad) + self.assertTrue(gradcheck( + lambda inp, grd: F.grid_sample(inp, grd, mode=mode, padding_mode=padding_mode, + align_corners=align_corners), + (input, grid))) test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) if TEST_CUDNN: with cudnn.flags(enabled=False): test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) def test_grid_sample_3d(self): - def test(N, C, D, H, W, mode, padding_mode, align_corners): + # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, + # so we test both cases. + def test(N, C, D, H, W, mode, padding_mode, align_corners, input_requires_grad): def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners): - input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_() + input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_(input_requires_grad) grid_cpu = torch.randn(D, N, H, W, 3).transpose(0, 1).requires_grad_() out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode, align_corners=align_corners) @@ -10455,24 +10757,25 @@ def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners): out_cpu.backward(gradients) if TEST_CUDA: - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_(input_requires_grad) grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode, align_corners=align_corners) self.assertEqual(out_cpu, out_cuda) out_cuda.backward(gradients.cuda()) - self.assertEqual(input_cpu.grad, input_cuda.grad) + if input_requires_grad: + self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, atol=5e-5, rtol=0) # check that zero-dimensional input strides don't error out base_input = torch.randn(N, C, 1, IH, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() + input_cpu = base_input.expand_as(input_cuda).requires_grad_(input_requires_grad) grid_cpu = torch.randn(N, D, H, W, 3, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode, align_corners=align_corners) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() + input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_(input_requires_grad) grid_cuda = grid_cpu.detach().cuda().requires_grad_() out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode, align_corners=align_corners) @@ -10558,8 +10861,14 @@ def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners): lambda inp, grid: F.grid_sample(inp, grid, mode=mode, padding_mode=padding_mode, align_corners=align_corners), (input, grid))) + input = 
input.requires_grad_(False) + self.assertTrue(gradcheck( + lambda grid: F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode, + align_corners=align_corners), + (grid,))) - test(N, C, D, H, W, mode, padding_mode, align_corners) + for input_requires_grad in [False, True]: + test(N, C, D, H, W, mode, padding_mode, align_corners, input_requires_grad) def test_affine_grid(self): # test known input on CPU @@ -10878,6 +11187,30 @@ def test_upsampling_small_scale(self): expected_out_t = torch.tensor([[[[2.5]]]]) self.assertEqual(expected_out_t, out_t) + def test_upsampling_bfloat16(self, dtype=torch.bfloat16): + def helper(size, scale_factor, mode, device): + inputf = torch.randn(size, device=device, dtype=torch.float, requires_grad=True) + input = inputf.to(dtype).detach().requires_grad_(True) + m = nn.Upsample(scale_factor=scale_factor, mode=mode) + + outf = m(inputf) + out = m(input) + self.assertEqual(out.dtype, dtype) + self.assertEqualIgnoreType(out, outf, atol=0.1, rtol=0.0) + + out.sum().backward() + outf.sum().backward() + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(input.grad, inputf.grad.to(dtype), atol=0.1, rtol=0) + + for device in ['cpu']: + helper([3, 20, 30], 2, 'nearest', device) + helper([3, 20, 11, 7], 2, 'nearest', device) + helper([3, 20, 11, 7, 3], 2, 'nearest', device) + helper([3, 20, 30], 2, 'linear', device) + helper([3, 20, 11, 7], 2, 'bilinear', device) + helper([3, 20, 11, 7, 3], 2, 'trilinear', device) + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_interpolate_illegal_memory_access(self): in_s = 45 @@ -11388,6 +11721,12 @@ def test_cross_entropy_loss_precision(self): outd = loss_cpu(inputd, target) self.assertEqual(outf, outd, exact_dtype=False) + def test_cross_entropy_loss_zero_div(self): + # Test for issue #73165 + input_1 = torch.rand([5, 0], dtype=torch.float32) + input_2 = torch.rand([5, 0], dtype=torch.float32) + torch.nn.CrossEntropyLoss()(input_1, input_2) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") def test_convert_sync_batchnorm(self): module = torch.nn.Sequential( @@ -12771,6 +13110,17 @@ def _test_LayerNorm_cuda_half(self, device): output.sum().backward() self.assertEqualTypeString(output, input) + def _test_LayerNorm_cpu_mixed_dtype(self, device): + for elementwise_affine in [True, False]: + # layer norm input shape is normalized to m x n, cpu vectorized on n, + # so make sure n exceeds vector length + input = torch.empty(2, 3, 11, 3, device=device, dtype=torch.bfloat16).random_(1, 10) + m = nn.LayerNorm([11, 3], elementwise_affine=elementwise_affine).to(device, torch.bfloat16) + m2 = deepcopy(m).to(device, torch.float) + out = m(input) + out2 = m2(input) + self.assertEqual(out, out2) + def _test_GroupNorm_general(self, device, dtype=torch.float): good_shape_g = { (1, 2, 3, 4): 2, @@ -12824,9 +13174,8 @@ def _test_GroupNorm_general(self, device, dtype=torch.float): (2, 6, 4, 2, 2): 4, } for shape, g in bad_shape_g.items(): - gn = nn.GroupNorm(g, shape[1]) - input = torch.empty(*shape, device=device, dtype=dtype).uniform_(0, 10) - self.assertRaises(RuntimeError, lambda: gn(input)) + with self.assertRaises(ValueError): + gn = nn.GroupNorm(g, shape[1]) def _test_GroupNorm_cuda_half(self): input = torch.zeros(2, 4, 3, 2, requires_grad=True).cuda().half().random_(1, 10) @@ -12835,17 +13184,20 @@ def _test_GroupNorm_cuda_half(self): output.sum().backward() self.assertEqualTypeString(output, input) - def _test_module_empty_input(self, module, inp, check_size=True): - 
inp.requires_grad_(True) + def _test_module_empty_input(self, module, inp, check_size=True, inference=False): + if not inference: + inp.requires_grad_(True) out = module(inp) - gO = torch.rand_like(out) - out.backward(gO) + if not inference: + gO = torch.rand_like(out) + out.backward(gO) if check_size: self.assertEqual(out.size(), inp.size()) - for p in module.parameters(): - if p.requires_grad: - self.assertEqual(p.grad, torch.zeros_like(p.grad)) - self.assertEqual(inp.grad, torch.zeros_like(inp)) + if not inference: + for p in module.parameters(): + if p.requires_grad: + self.assertEqual(p.grad, torch.zeros_like(p.grad)) + self.assertEqual(inp.grad, torch.zeros_like(inp)) def _test_module_empty_inputs(self, module, inputs): for _inp in inputs: @@ -13097,7 +13449,7 @@ def test_affine_3d_rotateRandom(self, device): @onlyCUDA @skipCUDAIfNoCudnn - @dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypes(*floating_and_complex_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) def test_Conv2d_deterministic_cudnn(self, device, dtype): inputs = torch.randn(2, 3, 5, 5, device=device, dtype=dtype, requires_grad=True) with cudnn.flags(enabled=True, benchmark=True, deterministic=True): @@ -13116,7 +13468,7 @@ def test_Conv2d_deterministic_cudnn(self, device, dtype): @onlyCUDA - @dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) def test_Conv2d_large_workspace(self, device, dtype): # These sizes require huge cuDNN workspaces. Make sure we choose a # reasonable algorithm that does not run out of memory @@ -13241,7 +13593,7 @@ def test_Conv3d_depthwise_naive_groups(self, device, dtype): @onlyCUDA - @dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) def test_noncontig_conv_grad(self, device, dtype): # FIXME: remove after adding non-contiguous grad tests for all modules module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to(device, dtype) @@ -13357,8 +13709,8 @@ def test_conv_double_backward_stride(self): batch_size, inp_size, dilation, no_weight) - - def test_conv1d_same_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_same_padding(self, device, dtype): # Test padding='same' outputs the correct shape test_args = [ # in_size @@ -13371,22 +13723,22 @@ def test_conv1d_same_padding(self, device): [1], ] for in_size, k_size, dilation, stride in itertools.product(*test_args): - x = torch.rand(1, 1, in_size, device=device) - y = torch.rand(1, 1, k_size, device=device) + x = torch.rand(1, 1, in_size, device=device, dtype=dtype) + y = torch.rand(1, 1, k_size, device=device, dtype=dtype) z = F.conv1d(x, y, padding='same', dilation=dilation, stride=stride) self.assertEqual(z.size(2), int(math.ceil(in_size / stride))) # Compare F.conv1d padding='same' output against manual padding # Without strides/dilation - x = torch.rand(1, 1, 12, device=device) - y = torch.rand(1, 1, 3, device=device) + x = torch.rand(1, 1, 12, device=device, dtype=dtype) + y = torch.rand(1, 1, 3, device=device, dtype=dtype) expect = F.conv1d(x, y, padding=1) actual = F.conv1d(x, y, padding='same') self.assertEqual(expect, actual) # With dilation - x = torch.rand(1, 1, 12, device=device) - y = torch.rand(1, 1, 4, device=device) + x = torch.rand(1, 1, 12, device=device, dtype=dtype) + y = torch.rand(1, 1, 4, device=device, dtype=dtype) expect = F.conv1d(x, y, padding=3, dilation=2) actual = 
F.conv1d(x, y, padding='same', dilation=2) self.assertEqual(expect, actual) @@ -13396,76 +13748,89 @@ def test_conv1d_same_padding(self, device): actual = F.conv1d(x, y, padding='same', dilation=3) self.assertEqual(expect, actual) - - def test_conv2d_same_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv2d_same_padding(self, device, dtype): + if dtype is torch.cfloat: + rtol, atol = 2e-6, 2e-6 + else: + rtol, atol = None, None # Compare F.conv2d padding='same' output against manual padding # Without strides/dilation - x = torch.rand(1, 1, 10, 11, device=device) - y = torch.rand(1, 1, 4, 5, device=device) + x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype) + y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype) expect = F.conv2d(x, y, padding=(2, 2))[..., 1:, :] actual = F.conv2d(x, y, padding='same') - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # With dilation - y = torch.rand(1, 1, 3, 4, device=device) + y = torch.rand(1, 1, 3, 4, device=device, dtype=dtype) expect = F.conv2d(x, y, padding=(2, 3), dilation=2) actual = F.conv2d(x, y, padding='same', dilation=2) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # Dilation with asymmetric padding - y = torch.rand(1, 1, 4, 4, device=device) + y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype) expect = F.conv2d(x, y, padding=5, dilation=3)[..., 1:, 1:] actual = F.conv2d(x, y, padding='same', dilation=3) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) - def test_conv3d_same_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv3d_same_padding(self, device, dtype): + if dtype is torch.cfloat: + rtol, atol = 2e-6, 2e-6 + else: + rtol, atol = None, None # Compare F.conv3d padding='same' output against manual padding # Without strides/dilation - x = torch.rand(1, 1, 10, 11, 12, device=device) - y = torch.rand(1, 1, 1, 2, 5, device=device) + x = torch.rand(1, 1, 10, 11, 12, device=device, dtype=dtype) + y = torch.rand(1, 1, 1, 2, 5, device=device, dtype=dtype) expect = F.conv3d(x, y, padding=(0, 1, 2))[..., :, 1:, :] actual = F.conv3d(x, y, padding='same') - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # With dilation expect = F.conv3d(x, y, padding=(0, 1, 4), dilation=2) actual = F.conv3d(x, y, padding='same', dilation=2) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) # Dilation with asymmetric padding - y = torch.rand(1, 1, 4, 4, 4, device=device) + y = torch.rand(1, 1, 4, 4, 4, device=device, dtype=dtype) expect = F.conv3d(x, y, padding=5, dilation=3)[..., 1:, 1:, 1:] actual = F.conv3d(x, y, padding='same', dilation=3) - self.assertEqual(expect, actual) + self.assertEqual(expect, actual, rtol=rtol, atol=atol) - def test_conv1d_valid_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_valid_padding(self, device, dtype): # Test F.conv1d padding='valid' is the same as no padding - x = torch.rand(1, 1, 10, device=device) - y = torch.rand(1, 1, 4, device=device) + x = torch.rand(1, 1, 10, device=device, dtype=dtype) + y = torch.rand(1, 1, 4, device=device, dtype=dtype) expect = F.conv1d(x, y) actual = F.conv1d(x, y, padding='valid') self.assertEqual(expect, actual) - def test_conv2d_valid_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv2d_valid_padding(self, device, dtype): # Test F.conv2d padding='valid' is the same as no 
padding - x = torch.rand(1, 1, 1, 10, device=device) - y = torch.rand(1, 1, 1, 4, device=device) + x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype) + y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype) expect = F.conv2d(x, y) actual = F.conv2d(x, y, padding='valid') self.assertEqual(expect, actual) - def test_conv3d_valid_padding(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv3d_valid_padding(self, device, dtype): # Test F.conv3d padding='valid' is the same as no padding - x = torch.rand(1, 1, 1, 1, 10, device=device) - y = torch.rand(1, 1, 1, 1, 4, device=device) + x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device) + y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device) expect = F.conv3d(x, y) actual = F.conv3d(x, y, padding='valid') self.assertEqual(expect, actual) - def test_conv1d_same_padding_backward(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_same_padding_backward(self, device, dtype): # Test F.conv1d gradients work with padding='same' - x = torch.rand(1, 1, 12, device=device, requires_grad=True) - y = torch.rand(1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 12, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True) # Symmetric padding z = F.conv1d(x, y, padding=3, dilation=2) @@ -13490,10 +13855,11 @@ def test_conv1d_same_padding_backward(self, device): self.assertEqual(gx_expect, x.grad) self.assertEqual(gy_expect, y.grad) - def test_conv2d_same_padding_backward(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv2d_same_padding_backward(self, device, dtype): # Test F.conv2d gradients work with padding='same' - x = torch.rand(1, 1, 10, 11, device=device, requires_grad=True) - y = torch.rand(1, 1, 4, 5, device=device, requires_grad=True) + x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype, requires_grad=True) + y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype, requires_grad=True) # Symmetric padding z = F.conv2d(x, y, padding=(3, 4), dilation=2) @@ -13508,7 +13874,7 @@ def test_conv2d_same_padding_backward(self, device): x.grad, y.grad = None, None # Asymmetric padding - y = torch.rand(1, 1, 4, 4, device=device, requires_grad=True) + y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype, requires_grad=True) z = F.conv2d(x, y, padding=2)[..., 1:, 1:] z.sum().backward() gx_expect, gy_expect = x.grad, y.grad @@ -13519,12 +13885,13 @@ def test_conv2d_same_padding_backward(self, device): self.assertEqual(gx_expect, x.grad) self.assertEqual(gy_expect, y.grad) - def test_conv3d_same_padding_backward(self, device): + @dtypes(torch.double, torch.cdouble) + def test_conv3d_same_padding_backward(self, device, dtype): check_forward_ad = torch.device(device).type != 'xla' # Test F.conv3d gradients work with padding='same' - x = torch.rand(1, 1, 1, 11, 12, device=device, requires_grad=True) - y = torch.rand(1, 1, 1, 2, 5, device=device, requires_grad=True) + x = torch.rand(1, 1, 1, 11, 12, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 1, 2, 5, dtype=dtype, device=device, requires_grad=True) # Symmetric padding z = F.conv3d(x, y, padding=(0, 1, 4), dilation=2) @@ -13546,7 +13913,7 @@ def test_conv3d_same_padding_backward(self, device): check_fwd_over_rev=True) # Asymmetric padding - y = torch.rand(1, 1, 1, 4, 4, device=device, requires_grad=True) + y = torch.rand(1, 1, 1, 4, 4, dtype=dtype, device=device, requires_grad=True) z = F.conv3d(x, y, padding=2)[..., 1:, 1:] z.sum().backward() 
gx_expect, gy_expect = x.grad, y.grad @@ -13564,10 +13931,11 @@ def test_conv3d_same_padding_backward(self, device): gradgradcheck(lambda x, y: F.conv3d(x, y, padding='same'), (x, y), check_fwd_over_rev=True) - def test_conv1d_valid_padding_backward(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv1d_valid_padding_backward(self, device, dtype): # Test F.conv1d gradients work with padding='valid' - x = torch.rand(1, 1, 10, device=device, requires_grad=True) - y = torch.rand(1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 10, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True) F.conv1d(x, y, padding=0).sum().backward() gx_expect, gy_expect = x.grad, y.grad x.grad, y.grad = None, None @@ -13577,10 +13945,132 @@ def test_conv1d_valid_padding_backward(self, device): self.assertEqual(gx_expect, gx_actual) self.assertEqual(gy_expect, gy_actual) - def test_conv2d_valid_padding_backward(self, device): + @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.") + @dtypes(torch.float, torch.cfloat) + @parametrize_test("mode", ('valid', 'same')) + def test_conv1d_vs_scipy(self, device, dtype, mode): + t = make_tensor((1, 10), device=device, dtype=dtype) + feat_dim = t.shape[1] + weight_even = make_tensor((1, 1, 4), device=device, dtype=dtype) + weight_odd = make_tensor((1, 1, 5), device=device, dtype=dtype) + + def _test(t, weight, mode): + # SciPy expects two 1-D inputs. + t_a = t.view(-1).cpu().numpy() + w_a = weight.view(-1).cpu().numpy() + expected = scipy.signal.convolve(t_a, w_a, mode=mode) + + kwargs = {'padding': mode} + if mode == 'same': + # `same` padding in PyTorch conv1d is different + # from SciPy + p = weight.shape[2] // 2 + t = torch.nn.functional.pad(t, (p, p)) + # We have already taken care of padding + kwargs.pop("padding") + + # second input is flipped in SciPy's convolve + weight_flipped = torch.flip(weight, (2,)) + actual = torch.nn.functional.conv1d(t, weight_flipped, **kwargs).squeeze(0) + if mode == 'same': + actual = actual[:feat_dim] + + self.assertEqual(actual, expected) + + # Global dtype for this test suite is torch.double + # This leads to change in type-promotion + # and conv1d outputs `complex128` for `complex64` input. + with set_default_dtype(torch.float): + _test(t, weight_even, mode) + _test(t, weight_odd, mode) + + @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.") + @dtypes(torch.float, torch.cfloat) + @parametrize_test("mode", ('valid', 'same')) + def test_conv2d_vs_scipy(self, device, dtype, mode): + t = make_tensor((1, 5, 10), device=device, dtype=dtype) + weight_even = make_tensor((1, 1, 2, 4), device=device, dtype=dtype) + weight_odd = make_tensor((1, 1, 3, 5), device=device, dtype=dtype) + + def _test(t, weight, mode): + # SciPy expects two 2-D inputs. 
+ t_a = t.squeeze(0).cpu().numpy() + w_a = weight.squeeze(0).squeeze(0).cpu().numpy() + expected = scipy.signal.convolve2d(t_a, w_a, mode=mode) + + kwargs = {'padding': mode} + if mode == 'same': + # `same` padding in PyTorch conv2d is different + # from SciPy + left_right_pad = weight.shape[3] // 2 + top_bottom_pad = weight.shape[2] // 2 + p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad) + t = torch.nn.functional.pad(t, p) + # We have already taken care of padding + kwargs.pop("padding") + + # second input is flipped in SciPy's convolve2d + weight_flipped = torch.flip(weight, (2, 3)) + actual = torch.nn.functional.conv2d(t, weight_flipped, **kwargs).squeeze(0) + if mode == 'same': + actual = actual[:5, :10] + + self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6) + + # Global dtype for this test suite is torch.double + # This leads to change in type-promotion + # and conv1d outputs `complex128` for `complex64` input. + with set_default_dtype(torch.float): + _test(t, weight_even, mode) + _test(t, weight_odd, mode) + + @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.") + @dtypes(torch.float, torch.cfloat) + @parametrize_test("mode", ('valid', 'same')) + def test_conv3d_vs_scipy(self, device, dtype, mode): + t = make_tensor((1, 5, 5, 10), device=device, dtype=dtype) + weight_even = make_tensor((1, 1, 2, 2, 4), device=device, dtype=dtype) + weight_odd = make_tensor((1, 1, 2, 3, 5), device=device, dtype=dtype) + + def _test(t, weight, mode): + # SciPy expects two 3-D inputs. + t_a = t.squeeze(0).cpu().numpy() + w_a = weight.squeeze(0).squeeze(0).cpu().numpy() + expected = scipy.signal.convolve(t_a, w_a, mode=mode) + + kwargs = {'padding': mode} + if mode == 'same': + # `same` padding in PyTorch conv3d is different + # from SciPy + left_right_pad = weight.shape[4] // 2 + top_bottom_pad = weight.shape[3] // 2 + front_back_pad = weight.shape[2] // 2 + p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad, + front_back_pad, front_back_pad) + t = torch.nn.functional.pad(t, p) + # We have already taken care of padding + kwargs.pop("padding") + + # second input is flipped in SciPy's convolve + weight_flipped = torch.flip(weight, (2, 3, 4)) + actual = torch.nn.functional.conv3d(t, weight_flipped, **kwargs).squeeze(0) + if mode == 'same': + actual = actual[:5, :5, :10] + + self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6) + + # Global dtype for this test suite is torch.double + # This leads to change in type-promotion + # and conv1d outputs `complex128` for `complex64` input. 
+ with set_default_dtype(torch.float): + _test(t, weight_even, mode) + _test(t, weight_odd, mode) + + @dtypes(torch.float, torch.complex64) + def test_conv2d_valid_padding_backward(self, device, dtype): # Test F.conv2d gradients work with padding='valid' - x = torch.rand(1, 1, 1, 10, device=device, requires_grad=True) - y = torch.rand(1, 1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype, requires_grad=True) + y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype, requires_grad=True) F.conv2d(x, y, padding=0).sum().backward() gx_expect, gy_expect = x.grad, y.grad x.grad, y.grad = None, None @@ -13590,12 +14080,13 @@ def test_conv2d_valid_padding_backward(self, device): self.assertEqual(gx_expect, gx_actual) self.assertEqual(gy_expect, gy_actual) - def test_conv3d_valid_padding_backward(self, device): + @dtypes(torch.double, torch.cdouble) + def test_conv3d_valid_padding_backward(self, device, dtype): check_forward_ad = torch.device(device).type != 'xla' # Test F.conv3d gradients work with padding='valid' - x = torch.rand(1, 1, 1, 1, 10, device=device, requires_grad=True) - y = torch.rand(1, 1, 1, 1, 4, device=device, requires_grad=True) + x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device, requires_grad=True) + y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device, requires_grad=True) F.conv3d(x, y, padding=0).sum().backward() gx_expect, gy_expect = x.grad, y.grad x.grad, y.grad = None, None @@ -13608,6 +14099,17 @@ def test_conv3d_valid_padding_backward(self, device): gradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_forward_ad=check_forward_ad) gradgradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_fwd_over_rev=check_forward_ad) + @parametrize_test("N", range(2, 4), name_fn=lambda N: 'ConvTranspose{}d'.format(N)) + def test_conv_transpose_with_output_size_and_no_batch_dim(self, device, N): + # For inputs with no batch dim, verify output is the correct shape when output_size is set. + # See https://github.com/pytorch/pytorch/issues/75889 + inp = torch.randn((1, 15, 13) if N == 2 else (1, 15, 13, 13), device=device) + output_size = (1, 240, 200) if N == 2 else (1, 240, 200, 200) + ConvTransposeNd = getattr(nn, 'ConvTranspose{}d'.format(N)) + m = ConvTransposeNd(1, 1, kernel_size=16, stride=16, padding=7, bias=False, device=device) + output = m(inp, output_size=output_size) + self.assertEqual(output.shape, output_size) + @skipMeta @parametrize_test("input_shape,transposed,dilated,groups,layout,backend_expected", [ # === slow === @@ -13798,6 +14300,20 @@ def _make_noncontiguous(inp): if layout is torch._mkldnn: return + if backend_actual != torch._C._ConvBackend.Empty: # FIXME: forward AD fails + # Forward AD and forward-over-reverse AD smoke test in float32 + # TODO: remove this if we introduce per-op gradient tests for float32 + with fwAD.dual_level(): + dual_inputs = [(fwAD.make_dual(i, torch.rand_like(i)) if isinstance(i, torch.Tensor) else i) for i in inputs] + # Forward AD + output = convolution(*dual_inputs) + # Forward over reverse AD + grad_output_d = fwAD.make_dual(torch.rand_like(output), torch.rand_like(output)) + if has_bias: + torch.autograd.grad(output, [x, weight, bias], grad_output_d) + else: + torch.autograd.grad(output, [x, weight], grad_output_d) + # Convert to float64 for gradcheck. 
x = x.to(torch.float64).detach().requires_grad_(True) weight = weight.to(torch.float64).detach().requires_grad_(True) @@ -13979,6 +14495,9 @@ def test_LayerNorm_general(self, device): if self.device_type == 'cuda': self._test_LayerNorm_cuda_half(device) + if self.device_type == 'cpu': + self._test_LayerNorm_cpu_mixed_dtype(device) + @onlyNativeDeviceTypes def test_LayerNorm_numeric(self, device): def layer_norm_ref(X, gamma, beta, normalized_shape, eps): @@ -14005,9 +14524,31 @@ def layer_norm_ref(X, gamma, beta, normalized_shape, eps): Y_cpu = layer_norm(X.cpu()) self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5) - @onlyNativeDeviceTypes - def test_GroupNorm_general(self, device): - self._test_GroupNorm_general(device) + @onlyCPU + def test_glu_bfloat16(self, device): + def test_dtype(fn, input, dtype): + input = input.detach().clone().to(dtype=dtype).requires_grad_(True) + input2 = input.detach().clone().float().requires_grad_(True) + out = fn(input) + out.sum().backward() + out2 = fn(input2) + out2.sum().backward() + self.assertEqual(out.dtype, dtype) + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(out, out2, exact_dtype=False) + self.assertEqual(input.grad, input2.grad, atol=1e-2, rtol=0, exact_dtype=False) + + def func(device): + return torch.nn.GLU(dim=-1).to(device) + + shapes = [[1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 256, 256]] + for shape in shapes: + x = torch.randn(shape, device=device) + test_dtype(func(device), x, torch.bfloat16) + + @onlyNativeDeviceTypes + def test_GroupNorm_general(self, device): + self._test_GroupNorm_general(device) if self.device_type == 'cuda': self._test_GroupNorm_cuda_half() @@ -14028,13 +14569,13 @@ def test_GroupNorm_empty(self, device): @onlyCPU @dtypes(torch.float, torch.double) def test_groupnorm_nhwc(self, device, dtype): - def helper(self, size, groups): + def helper(self, size, groups, memory_format): channels = size[1] input = torch.randn(size, dtype=dtype, device=device, requires_grad=True) - input = input.contiguous(memory_format=torch.channels_last) + input = input.contiguous(memory_format=memory_format) input.retain_grad() grad = torch.randn(size, dtype=dtype, device=device) - grad = grad.contiguous(memory_format=torch.channels_last) + grad = grad.contiguous(memory_format=memory_format) gn = nn.GroupNorm(groups, channels).to(device).to(dtype) gn.weight.data.uniform_() gn.bias.data.uniform_() @@ -14049,15 +14590,16 @@ def helper(self, size, groups): ref_out = ref_gn(ref_input) ref_out.backward(ref_grad) - self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(out.is_contiguous(memory_format=memory_format)) self.assertTrue(ref_out.is_contiguous()) self.assertEqual(out, ref_out) self.assertEqual(gn.weight.grad, ref_gn.weight.grad) self.assertEqual(gn.bias.grad, ref_gn.bias.grad) self.assertEqual(input.grad, ref_input.grad) - helper(self, (4, 8, 10, 10), 4) - helper(self, (2, 30, 9, 9), 3) + helper(self, (4, 8, 10, 10), 4, torch.channels_last) + helper(self, (2, 30, 9, 9), 3, torch.channels_last) + helper(self, (2, 9, 7, 11, 15), 3, torch.channels_last_3d) @onlyNativeDeviceTypes def test_GroupNorm_numeric(self, device): @@ -14095,10 +14637,10 @@ def test_pad(self, device, dtype): # Assert assertion errors are raised for invalid circular padding values inputs = torch.randn(1, 1, 4, device=device, dtype=dtype, requires_grad=True) # Should raise error when trying to wrap around more than once - self.assertRaises(AssertionError, lambda: F.pad(inputs, (5, 4), mode='circular')) - 
self.assertRaises(AssertionError, lambda: F.pad(inputs, (3, 6), mode='circular')) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (5, 4), mode='circular')) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (3, 6), mode='circular')) # Should raise error when negative padding results in negative output shape - self.assertRaises(AssertionError, lambda: F.pad(inputs, (-3, -2), mode='circular')) + self.assertRaises(RuntimeError, lambda: F.pad(inputs, (-3, -2), mode='circular')) # assert that relfection padding errors when pad >= input size expected_err_msg = r"Padding size should be less than the corresponding input dimension" @@ -14250,11 +14792,29 @@ def test_Bilinear_empty(self, device): @expectedFailureMeta # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1] @onlyNativeDeviceTypes def test_TransformerEncoderLayer_empty(self, device): - for batch_first, input_shape in [(True, (0, 10, 512)), - (False, (10, 0, 512))]: - input = torch.rand(*input_shape, device=device) - encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) - self._test_module_empty_input(encoder_layer, input, check_size=False) + for training in (True, False): + for batch_first, input_shape in [(True, (0, 10, 512)), + (False, (10, 0, 512))]: + input = torch.rand(*input_shape, device=device) + encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=batch_first).to(device) + if not training: + encoder_layer = encoder_layer.eval() + with torch.no_grad(): + self._test_module_empty_input(encoder_layer, input, check_size=False, inference=True) + if batch_first and not TEST_WITH_CROSSREF: + with torch.no_grad(): + # A NestedTensor with no tensors inside it doesn't have dim 3 (or dim + # 2, for that matter) so it can't hit the fast path, nor can we give a + # result. 
+ with self.assertRaisesRegex( + AssertionError, 'MultiheadAttention does not support NestedTensor outside'): + nt = torch.nested_tensor([], device=device) + self._test_module_empty_input(encoder_layer, nt, check_size=False, inference=True) + + nt = torch.nested_tensor([torch.rand(0, 512, device=device)], device=device) + self._test_module_empty_input(encoder_layer, nt, check_size=False, inference=True) + else: + self._test_module_empty_input(encoder_layer, input, check_size=False) @expectedFailureMeta # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1] @onlyNativeDeviceTypes @@ -14430,6 +14990,20 @@ def test_FractionalMaxPool3d_zero_batch(self, device): inp = torch.randn(1, 0, 50, 32, 32, device=device) mod(inp) + @onlyNativeDeviceTypes + def test_FractionalMaxPool2d_zero_out_size(self, device): + mod = nn.FractionalMaxPool2d([2, 2], output_size=[0, 1]) + inp = torch.rand([16, 50, 32, 32], device=device) + out = mod(inp) + self.assertEqual(out, torch.empty((16, 50, 0, 1), device=device)) + + @onlyNativeDeviceTypes + def test_FractionalMaxPool3d_zero_out_size(self, device): + mod = nn.FractionalMaxPool3d([3, 2, 2], output_size=[0, 1, 1]) + inp = torch.rand([16, 50, 32, 32], device=device) + out = mod(inp) + self.assertEqual(out, torch.empty((16, 0, 1, 1), device=device)) + @onlyNativeDeviceTypes def test_Unfold_empty(self, device): inp = torch.randn(0, 3, 3, 4, device=device) @@ -14607,26 +15181,27 @@ def test_BatchNorm_empty(self, device): self.assertEqual(mod.weight.grad, torch.tensor([0., 0, 0], device=device)) self.assertEqual(mod.bias.grad, torch.tensor([0., 0, 0], device=device)) - def test_conv_empty_channel(self, device): + @dtypes(torch.float, torch.cfloat) + def test_conv_empty_channel(self, device, dtype): in_channels = 0 - mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2).to(device) - inp = torch.randn(2, 0, 15, device=device) + mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2, dtype=dtype).to(device) + inp = torch.randn(2, 0, 15, device=device, dtype=dtype) self._test_module_empty_input(mod, inp, check_size=False) with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"): inp = torch.randn(2, 1, 0, device=device) mod(inp) - mod = torch.nn.Conv2d(in_channels, 33, 3, stride=2).to(device) - inp = torch.randn(2, 0, 50, 100, device=device) + mod = torch.nn.Conv2d(in_channels, 33, 3, stride=2, dtype=dtype).to(device) + inp = torch.randn(2, 0, 50, 100, device=device, dtype=dtype) self._test_module_empty_input(mod, inp, check_size=False) with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"): inp = torch.randn(2, 1, 40, 0, device=device) mod(inp) - mod = torch.nn.Conv3d(in_channels, 33, 3, stride=2).to(device) - inp = torch.randn(2, 0, 50, 20, 40, device=device) + mod = torch.nn.Conv3d(in_channels, 33, 3, stride=2, dtype=dtype).to(device) + inp = torch.randn(2, 0, 50, 20, 40, device=device, dtype=dtype) self._test_module_empty_input(mod, inp, check_size=False) with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"): @@ -14734,6 +15309,21 @@ def test_one_hot(self, device): with self.assertRaises(RuntimeError): torch.nn.functional.one_hot(torch.tensor([3, 4, 1, 0], device=device), -2) + def test_nn_empty(self, device): + # One off tests to ensure scalars from nn.yaml are properly applied + def verify_scalars(input, output): + self.assertEqual(input.shape, output.shape) + self.assertEqual(0, output.numel()) + + for input_shape in [(0), (0, 2)]: + for module in [torch.nn.ELU, torch.nn.Hardtanh, torch.nn.LeakyReLU, 
torch.nn.LogSigmoid, + torch.nn.RReLU, torch.nn.Softshrink, torch.nn.Softplus, torch.nn.Sigmoid, + torch.nn.Tanh]: + input = torch.randn(input_shape, device=device, requires_grad=True) + m = module() + output = m(input) + verify_scalars(input, output) + def test_nn_scalars(self, device): # One off tests to ensure scalars from nn.yaml are properly applied def verify_scalars(input, output): @@ -14887,6 +15477,31 @@ def test_unequal_when_beta_is_greater_than_one(): test_unequal_when_beta_is_less_than_one() test_unequal_when_beta_is_greater_than_one() + @onlyCPU + def test_smooth_l1_loss_bfloat16(self, device): + def test_dtype(fn, input, target, dtype): + input = input.detach().clone().to(dtype=dtype).requires_grad_(True) + input2 = input.detach().clone().float().requires_grad_(True) + target = target.detach().clone().to(dtype=dtype) + target2 = target.detach().clone().float() + out = fn(input, target) + out.sum().backward() + out2 = fn(input2, target2) + out2.sum().backward() + self.assertEqual(out.dtype, dtype) + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(out, out2, exact_dtype=False) + self.assertEqual(input.grad, input2.grad, exact_dtype=False) + + def func(device): + return nn.SmoothL1Loss().to(device=device) + + shapes = [[1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 128, 128]] + for shape in shapes: + x = torch.randn(shape, device=device, requires_grad=True) + t = torch.randn(shape, device=device) + test_dtype(func(device), x, t, torch.bfloat16) + # We don't want to make propagating NaN a hard requirement on ops, but for # these easy ones, we should make them do so. def test_nonlinearity_propagate_nan(self, device): @@ -15662,9 +16277,7 @@ def test_upsamplingBicubic2d(self, device, antialias, align_corners): # for scale_factor in [0.5, 1, 1.5, 2]: for scale_factor in [2, ]: in_t = torch.ones(2, 3, 8, 8, device=device) - print("dtype: ", in_t.dtype) out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) - print(out_t) out_size = int(math.floor(in_t.shape[-1] * scale_factor)) expected_out = torch.ones(2, 3, out_size, out_size, device=device) self.assertEqual(expected_out, out_t, atol=1e-5, rtol=0) @@ -15745,6 +16358,48 @@ def helper(n, c, h, w, output_height, output_width, contig): helper(4, 8, 9, 14, 5, 8, contig) helper(4, 8, 11, 11, 1, 1, contig) + @dtypes(torch.float, torch.double) + def test_pooling_max_nhwc(self, device, dtype): + def helper(n, c, h, w, kernel_size, stride, padding, dilation, contig, device): + output_height = math.floor((h + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1) + output_width = math.floor((w + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1) + + input = torch.randint(1, 10, (n, c, h, w), device=device, dtype=dtype) + input = input.contiguous(memory_format=torch.channels_last) + grad = torch.randint(1, 10, (n, c, output_height, output_width), device=device, dtype=dtype) + grad = grad.contiguous(memory_format=torch.channels_last) + if not contig: + input = input[:, ::2, :, :] + grad = grad[:, ::2, :, :] + input.requires_grad_(True) + pool = torch.nn.MaxPool2d( + kernel_size, stride, padding, dilation, return_indices=True, ceil_mode=False + ) + + ref_input = input.detach().clone().contiguous().requires_grad_(True) + ref_grad = grad.detach().clone().contiguous() + ref_pool = torch.nn.MaxPool2d( + kernel_size, stride, padding, dilation, return_indices=True, ceil_mode=False + ).to(device) + + out, ind = pool(input) + out.backward(grad) + ref_out, ref_ind = ref_pool(ref_input) + 
ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertTrue(ind.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_ind.is_contiguous()) + self.assertEqual(out, ref_out) + self.assertEqual(ind, ref_ind) + self.assertEqual(input.grad, ref_input.grad) + + for contig in [True, False]: + helper(4, 8, 10, 10, (2, 2), (1, 1), (1, 1), (2, 2), contig, device) + helper(4, 8, 9, 14, (2, 2), (1, 1), (1, 1), (2, 2), contig, device) + helper(4, 8, 11, 11, (4, 4), (2, 2), (2, 2), (2, 2), contig, device) + def test_embedding_dense_grad(self, device): embd = nn.Embedding(20, 20).to(device) weight = embd.weight @@ -16112,25 +16767,93 @@ def embedding_bag_check(indices, weights, mode, sparse, padding_idx): rtol = None self.assertEqual(grad, grad_check, msg=msg, atol=atol, rtol=rtol) + def _slow_masked_softmax(self, input, mask): + exp = torch.exp(input) + exp = exp * mask + s = exp.sum(dim=3, keepdim=True).expand(exp.size()) + return exp / s + def test_masked_softmax(self, device): sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)] for (B, num_heads, L) in sizes: - input = torch.randn((B, num_heads, L, L)) - mask = torch.randint(0, 2, (B, L)) + for dim in [0, 3]: + input = torch.randn((B, num_heads, L, L)) + mask = torch.randint(0, 2, (B, L)) + mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool() + if (self.device_type == "cuda"): + input = input.cuda() + mask = mask.cuda() + native_res = torch._masked_softmax(input, mask, dim) + mask = ~mask + + def slow_masked_softmax(input, mask): + exp = torch.exp(input) + exp = exp * mask + s = exp.sum(dim=dim, keepdim=True).expand(exp.size()) + return exp / s + + pt_res = slow_masked_softmax(input, mask) + pt_res = torch.nan_to_num(pt_res) + + mask_not = mask.logical_not() + # In result, should only fill the entirely masked out rows since those are non-deterministic (*may* be 0) + # Converts rows with all True's to False + mask_out = mask_not.all(dim, keepdim=True).expand(mask_not.shape) + self.assertEqual( + pt_res.masked_fill(mask_out, 0), + native_res.masked_fill(mask_out, 0), + exact_dtype=True + ) + + def _test_masked_softmax_helper(self, input, dim, mask): + input_ref = input.detach().clone().requires_grad_() + result = torch._masked_softmax(input, mask, dim) + + expected = torch._softmax(input_ref.masked_fill(mask, float('-inf')), dim, False) + grad = torch.randn_like(expected).to(dtype=expected.dtype) + + result.backward(grad) + expected.backward(grad) + + # Make sure the optional argument works as well + if dim == input.dim() - 1: + input_ref_default = input.detach().clone().requires_grad_() + result_default = torch._masked_softmax(input_ref_default, mask) + result_default.backward(grad) + self.assertEqual(result, result_default) + self.assertEqual(input.grad, input_ref_default.grad) + + # In result, should only fill the entirely masked out rows since those are non-deterministic (*may* be 0) + # Converts rows with all True's to False + mask_out = mask.all(dim, keepdim=True).expand(mask.shape) + self.assertEqual(result.masked_fill(mask_out, 0), expected.masked_fill(mask_out, 0)) + + self.assertEqual(input.grad, torch.nan_to_num(input_ref.grad)) + self.assertEqual(input.grad, input.grad.masked_fill(mask, 0.0)) + + def test_masked_softmax_grad(self, device): + shapes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)] + for shape in shapes: + dims = [0, len(shape) - 1] if len(shape) > 0 else [0] + for dim in 
dims: + input = torch.randn(shape, requires_grad=True) + mask = torch.randint(0, 2, shape).bool() + if (self.device_type == "cuda"): + input = input.cuda().detach().requires_grad_() + mask = mask.cuda() + self._test_masked_softmax_helper(input, dim, mask) + + # In this test, the forward pass is expected to produce nan's because when dim=0, we only have unspecified values + def test_masked_softmax_forward_with_nans(self, device): + dim = 0 + shapes = [(4, 5), (50, 100), (1500, 1200)] + for (x, y) in shapes: + input = torch.randn((x, y), requires_grad=True) + mask = torch.tensor([i % 2 for i in range(y)]).expand((x, y)).bool() if (self.device_type == "cuda"): - input = input.cuda() + input = input.cuda().detach().requires_grad_() mask = mask.cuda() - mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool() - native_res = torch._masked_softmax(input, mask) - mask = mask.float() - - def slow_masked_softmax(input, mask): - exp = torch.exp(input) - exp = exp * mask - s = exp.sum(dim=3, keepdim=True).expand(exp.size()) - return exp / s - pt_res = slow_masked_softmax(input, mask) - self.assertEqual(pt_res, native_res, exact_dtype=True) + self._test_masked_softmax_helper(input, dim, mask) @onlyCUDA def test_masked_softmax_transformer_layout(self, device): @@ -16138,25 +16861,40 @@ def test_masked_softmax_transformer_layout(self, device): num_heads = 16 L = 42 input = torch.randn((B, num_heads, L, L)) + dim = input.dim() - 1 mask = torch.randint(0, 2, (B, L)) if (self.device_type == "cuda"): input = input.cuda() mask = mask.cuda() mask = mask.bool() - native_res = torch._masked_softmax(input, mask) + native_res = torch._masked_softmax(input, mask, dim) mask = mask.reshape(B, 1, 1, L).expand(B, num_heads, L, L) + mask = ~mask mask = mask.float() - def slow_masked_softmax(input, mask): - exp = torch.exp(input) - exp = exp * mask - s = exp.sum(dim=3, keepdim=True).expand(exp.size()) - return exp / s - pt_res = slow_masked_softmax(input, mask) + pt_res = self._slow_masked_softmax(input, mask) + self.assertEqual(pt_res, native_res, exact_dtype=True) + + @onlyCUDA + def test_masked_softmax_TxT_layout(self, device): + B = 211 + num_heads = 16 + L = 42 + input = torch.randn((B, num_heads, L, L)) + dim = input.dim() - 1 + mask = torch.randint(0, 2, (L, L)) + if (self.device_type == "cuda"): + input = input.cuda() + mask = mask.cuda() + mask = mask.bool() + native_res = torch._masked_softmax(input, mask, dim) + mask = mask.expand(B, num_heads, L, L) + mask = ~mask + mask = mask.float() + + pt_res = self._slow_masked_softmax(input, mask) self.assertEqual(pt_res, native_res, exact_dtype=True) - # Test fails on Vg20 - @skipCUDAIfRocm @dtypesIfCUDA(torch.half, torch.float) @dtypes(torch.float) def test_softmax_results(self, device, dtype): @@ -16454,7 +17192,6 @@ def test_conv_transposed_large(self, device): def test_conv_large(self, device): dtype = torch.half if self.device_type == 'cuda' else torch.float conv = nn.Conv2d(2, 2, 8, 8, bias=False).to(device).to(dtype) - conv.weight = torch.nn.Parameter(torch.randn(2, 2, 8, 8, device=device, dtype=dtype) / 64) input_large = torch.randn(4097, 2, 512, 512, dtype=dtype, device=device) # forward ret = conv(input_large) @@ -16538,6 +17275,7 @@ def _test_gumbel_softmax_grad(self, device, dtype): tol = 2 * torch.finfo(dtype).eps self.assertEqual(logits_soft.grad, logits_hard.grad, atol=tol, rtol=0) + @skipIfMps @dtypesIfCUDA(torch.half, torch.float, torch.double) @dtypes(torch.float, torch.double) def test_gumbel_softmax(self, device, dtype): @@ -16941,8 
+17679,6 @@ def test_embedding_max_norm_device(self, device, dtype): self.assertEqual(output[1], output[2]) self.assertTrue(output.data.norm(p=2, dim=1).le(1).all()) - # Test fails on Vg20 - @skipCUDAIfRocm @onlyCUDA @dtypes(torch.half, torch.float) def test_softmax(self, device, dtype): @@ -17029,7 +17765,7 @@ def test_embedding_bag_empty_input(self, device, dtypes): output = Embed(input=x, offsets=torch.tensor([0, 0], device=device, dtype=dtypes[1])) self.assertEqual(output, torch.zeros_like(output)) - @skipCUDAIf(True, "cuda assert is not recovarable.") + @skipCUDAIf(True, "no out-of-bounds check on CUDA for perf.") @dtypes(*itertools.product((torch.float, torch.double), (torch.int, torch.long))) @parametrize_test("padding_idx", [None, 0]) @parametrize_test("mode", ["sum", "mean", "max"]) @@ -17148,15 +17884,15 @@ def _embedding_bag_reference_impl(self, input, weight, offsets=None, mode='sum', bags.append(embeddings.narrow(0, offset, length).max(0)[0]) return torch.stack(bags) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.half, torch.float, torch.double))) def test_EmbeddingBag_empty_per_sample_weights_and_offsets(self, device, dtypes): # Test empty input and per sample weight, and backward pass. There was a CUDA # invalid configuration bug (more context in #46572) def test_per_sample_weights(mode, trainable_scale): es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device) es.weight.data.copy_( - torch.arange(1, 11, device=device, dtype=dtypes[2]).view_as(es.weight)) + torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) input = torch.tensor([], device=device, dtype=dtypes[0]) offsets = torch.tensor([0, 0, 0, 0, 0], device=device, dtype=dtypes[1]) per_sample_weights = torch.randn_like(input, dtype=dtypes[2]) \ @@ -17187,13 +17923,13 @@ def test_per_sample_weights(mode, trainable_scale): for mode, trainable in itertools.product(modes, trainable_scale): test_per_sample_weights(mode, trainable) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_EmbeddingBag_per_sample_weights_and_offsets(self, device, dtypes): def test_per_sample_weights(mode, trainable_scale): es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device) es.weight.data.copy_( - torch.arange(1, 11, device=device, dtype=dtypes[2]).view_as(es.weight)) + torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtypes[0]) offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=dtypes[1]) per_sample_weights = torch.randn_like(input, dtype=dtypes[2]) \ @@ -17221,13 +17957,13 @@ def test_per_sample_weights(mode, trainable_scale): for mode, trainable in itertools.product(modes, trainable_scale): test_per_sample_weights(mode, trainable) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, 
torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_EmbeddingBag_per_sample_weights_and_new_offsets(self, device, dtypes): def test_per_sample_weights_new_offsets(mode, trainable_scale, include_last_offset, has_weight=True): es = nn.EmbeddingBag(5, 2, mode=mode, include_last_offset=include_last_offset).to(dtype=dtypes[2], device=device) es.weight.data.copy_( - torch.arange(1, 11, device=device, dtype=dtypes[2]).view_as(es.weight)) + torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtypes[0]) offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=dtypes[1]) @@ -17383,7 +18119,7 @@ def _test_EmbeddingBag( ): # check a known test example es = nn.EmbeddingBag(5, 2, mode=mode, sparse=sparse).to(device, wdtype) - es.weight.data.copy_(torch.arange(1, 11, device=device, dtype=wdtype).view_as(es.weight)) + es.weight.data.copy_(torch.arange(1, 11, device=device).view_as(es.weight).to(wdtype)) input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtype) offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=odtype) @@ -17486,8 +18222,8 @@ def _test_EmbeddingBag( offset[-1] = 100 self.assertRaises(RuntimeError, lambda: es(input.view(-1), offset)) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_embedding_bag_device(self, device, dtypes): self._test_EmbeddingBag(device, 'sum', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1]) self._test_EmbeddingBag(device, 'mean', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1]) @@ -17500,7 +18236,7 @@ def test_embedding_bag_device(self, device, dtypes): elif self.device_type == 'cpu': # TODO: figure out why precision on sparse embeddings isn't the # same as for dense. 
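The EmbeddingBag checks in the hunks above all reduce to comparing nn.EmbeddingBag against a plain Python loop over the offsets. A minimal sketch of that reduction for mode='sum' (not taken from the diff; names and shapes are illustrative only, and the harness's _embedding_bag_reference_impl does the same for 'mean' and 'max' with rows.mean(0) / rows.max(0)[0]):

import torch
import torch.nn.functional as F

def naive_embedding_bag_sum(weight, input, offsets, per_sample_weights=None):
    # bag i gathers weight rows for input[offsets[i]:offsets[i+1]] and sums them;
    # an empty slice yields a zero bag, matching F.embedding_bag's behaviour.
    starts = offsets.tolist()
    ends = starts[1:] + [input.numel()]
    bags = []
    for start, end in zip(starts, ends):
        rows = weight[input[start:end]]
        if per_sample_weights is not None:
            rows = rows * per_sample_weights[start:end].unsqueeze(1)
        bags.append(rows.sum(0))
    return torch.stack(bags)

weight = torch.arange(1., 11.).view(5, 2)
input = torch.tensor([3, 1, 1, 1, 4, 0])
offsets = torch.tensor([0, 0, 3, 3, 6])
assert torch.allclose(
    naive_embedding_bag_sum(weight, input, offsets),
    F.embedding_bag(input, weight, offsets, mode='sum'))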
- test_backward = dtypes[2] is not torch.float + test_backward = dtypes[2] is not torch.float and dtypes[2] is not torch.float16 self._test_EmbeddingBag( device, @@ -17521,8 +18257,8 @@ def test_embedding_bag_device(self, device, dtypes): test_backward=test_backward, ) - @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) - @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double))) + @skipMeta + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), (torch.float, torch.double, torch.half))) def test_embedding_bag_non_contiguous_weight(self, device, dtypes): weight_tensor = torch.randn(3, 4, dtype=dtypes[2], device=device) @@ -17546,13 +18282,16 @@ def test_embedding_bag_non_contiguous_weight(self, device, dtypes): ) self.assertEqual(output_non_contig, output_contig) - @onlyCUDA @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) def test_embedding_bag_bfloat16(self, device, dtypes): self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) self._test_EmbeddingBag(device, 'mean', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) + @onlyNativeDeviceTypes # currently fails on XLA + @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) + def test_embedding_bag_half(self, device, dtypes): + self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.float16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) @onlyCUDA @dtypes(torch.half, torch.float, torch.double) @@ -17569,7 +18308,33 @@ def test_multihead_attention_dtype(self, device, dtype): self.assertEqual(q.size(), out[0].size()) self.assertEqual(dtype, out[0].dtype) - @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @onlyCUDA + @dtypes(torch.half, torch.float, torch.double) + def test_multihead_attention_dtype_batch_first(self, device, dtype): + embed_dim = 128 + num_heads = 8 + sl = 10 + bs = 8 + # With batch_first=True, we have the possibility of hitting + # the native fast path if we call .eval() and enable inference + # mode. Test both paths. 
+ for training in (True, False): + model = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True).cuda().to(dtype) + if not training: + model = model.eval() + cm = torch.no_grad() + else: + cm = contextlib.nullcontext() + with cm: + q = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) + k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) + v = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) + # fast path currently doesn't support weights + out = model(q, k, v, need_weights=False) + self.assertEqual(q.size(), out[0].size()) + self.assertEqual(dtype, out[0].dtype) + + @dtypesIfCUDA(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else [])) @dtypes(torch.float) def test_Conv2d_naive_groups(self, device, dtype): # Check that grouped convolutions matches two half convolutions @@ -17604,7 +18369,7 @@ def test_Conv2d_naive_groups(self, device, dtype): torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0), atol=dtype2prec_DONTUSE[dtype], rtol=0) - @dtypes(torch.double) + @dtypes(torch.double, torch.cdouble) def test_Conv2d_backward_depthwise(self, device, dtype): x = torch.randn(2, 2, 4, 20, device=device, dtype=dtype, requires_grad=True) weight = torch.randn(2, 1, 3, 5, device=device, dtype=dtype, requires_grad=True) @@ -17937,37 +18702,42 @@ def expected_output(dim): self.assertEqual(output[0, 0, 0, 0], float("-inf")) self.assertEqual(indices[0, 0, 0, 0], 0) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_MaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_MaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @skipIfMps + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_MaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @skipIfMps + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float) def test_AdaptiveMaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_AdaptiveMaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_AdaptiveMaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_maxpool_indices_no_batch_dim(self, device, dtype): """Check that indices with no batch dim is consistent with a single batch.""" @@ -18132,7 +18902,8 @@ def test_pooling_zero_stride(self, device): self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero", lambda: fn_module(x)) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def 
test_pool_large_size(self, device, dtype): for op in ('max', 'avg'): @@ -18146,7 +18917,8 @@ def test_pool_large_size(self, device, dtype): # check if the output shape was still computed correctly self.assertEqual(x.shape[2], res.shape[2]) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps @dtypes(torch.float) def test_pool_invalid_size(self, device, dtype): for op in ('max', 'avg'): @@ -18390,6 +19162,35 @@ def test_multi_margin_loss_errors(self, device): lambda: nn.functional.multi_margin_loss(torch.randn(5, device=device), torch.zeros(3, device=device))) + @onlyCPU + def test_activations_bfloat16_cpu(self, device): + def test_bfloat16(fn, device, inp_dims, prec): + # bfloat16 compute + input = torch.randn(inp_dims, dtype=torch.bfloat16, device=device, requires_grad=True) + out = fn(input) + grad_input = torch.randn_like(out, dtype=torch.bfloat16, device=device) + out.backward(grad_input) + + # fp32 compute + input2 = input.detach().clone().float().requires_grad_(True) + out2 = fn(input2) + grad_input2 = grad_input.detach().clone().float() + out2.backward(grad_input2) + + self.assertEqual(out.dtype, torch.bfloat16) + self.assertEqual(input.grad.dtype, torch.bfloat16) + self.assertEqual(out, out2, atol=prec, rtol=0, exact_dtype=False) + self.assertEqual(input.grad.data, input2.grad.data, atol=prec, rtol=0, exact_dtype=False) + + shapes = [[1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 256, 256]] + for shape in shapes: + test_bfloat16(torch.nn.LogSigmoid(), device, shape, prec=2e-2) + test_bfloat16(torch.nn.Hardsigmoid(), device, shape, prec=1e-2) + test_bfloat16(torch.nn.Hardshrink(), device, shape, prec=1e-2) + test_bfloat16(torch.nn.Softshrink(), device, shape, prec=1e-2) + test_bfloat16(torch.nn.Hardswish(), device, shape, prec=2e-2) + test_bfloat16(torch.nn.Softplus(), device, shape, prec=1e-2) + def _test_bfloat16_ops(self, op, device, inp_dims=(), prec=1e-2, scale_factor=None): # fp32 compute input1 = torch.randn(inp_dims, dtype=torch.float32, device=device, requires_grad=True) @@ -18435,11 +19236,57 @@ def test_softmax_bfloat16(self, device): # test softmax with large input value which casues exp() to overflow self._test_bfloat16_ops(torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=0.05, scale_factor=1000.0) + @onlyCPU + @dtypes(torch.float, torch.double) + def test_conv_thnn_nhwc(self, device, dtype): + def helper(n, c, h, w, out_channels, kernel_size, dilation, groups, weight_memory_format): + input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\ + .to(memory_format=torch.channels_last) + input.requires_grad_() + conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)\ + .to(device='cpu', dtype=dtype, memory_format=weight_memory_format) + for p in conv.parameters(): + p.data = torch.randint_like(p, -3, 3) + + ref_input = input.detach().clone().contiguous().requires_grad_() + ref_conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups) + # load_state_dict will restore the stride & memory_layout on ref_conv.weight. 
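The NHWC convolution tests above all follow the same recipe: run a channels_last convolution, rebuild a contiguous reference module from the same state_dict, and compare outputs and memory formats. A stripped-down sketch of that recipe (illustrative shapes only; the real tests also compare gradients and disable MKLDNN to exercise the thnn kernels):

import torch
import torch.nn as nn

conv = nn.Conv2d(8, 4, kernel_size=3).to(memory_format=torch.channels_last)
ref_conv = nn.Conv2d(8, 4, kernel_size=3)
ref_conv.load_state_dict(conv.state_dict())                  # same parameters
ref_conv = ref_conv.to(memory_format=torch.contiguous_format)

x = torch.randn(2, 8, 16, 16).to(memory_format=torch.channels_last)
out = conv(x)
ref_out = ref_conv(x.contiguous())

assert out.is_contiguous(memory_format=torch.channels_last)  # layout propagates
assert ref_out.is_contiguous()
torch.testing.assert_close(out, ref_out)                     # same values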
+ ref_conv.load_state_dict(conv.state_dict()) + ref_conv = ref_conv.to(device='cpu', dtype=dtype, memory_format=torch.contiguous_format) + + out = conv(input) + ref_out = ref_conv(ref_input) + + grad = torch.randint_like(out, -3, 3) + ref_grad = grad.detach().clone().contiguous() + + out.backward(grad) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out, exact_dtype=False) + self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False) + self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False) + self.assertEqual(input.grad, ref_input.grad, exact_dtype=False) + + with torch.backends.mkldnn.flags(enabled=False): + for mf in [torch.contiguous_format, torch.channels_last]: + # non-dilated conv: thnn_conv2d normal path (with im2col) + helper(2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1, weight_memory_format=mf) + helper(2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8, weight_memory_format=mf) + # non-dilated conv: thnn_conv2d fast path (skip im2col) + helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1, weight_memory_format=mf) + helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16, weight_memory_format=mf) + # dilated conv: slow_conv_dilated2d + helper(2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1, weight_memory_format=mf) + helper(2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16, weight_memory_format=mf) + @onlyCUDA @skipCUDAIfRocmVersionLessThan((4, 3)) @skipCUDAIfNotMiopenSuggestNHWC @skipCUDAIfCudnnVersionLessThan(7603) - @dtypes(torch.half, torch.float) + @dtypes(torch.half, torch.float, torch.cfloat) def test_conv_cudnn_nhwc(self, device, dtype): def helper(n, c, h, w, out_channels, kernel_size, groups): input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\ @@ -18886,6 +19733,20 @@ def test_cross_entropy_loss_prob_target_unit_weights(self, device): output_unit = m_unit(input, target) self.assertEqual(output, output_unit) + @parametrize_test('reduction', ['none', 'mean', 'sum']) + @parametrize_test('weighted', [False, True]) + def test_cross_entropy_loss_prob_target_no_batch_dim(self, device, reduction, weighted): + C = 5 + input = torch.randn(C, device=device).log_softmax(dim=-1) + target = torch.randn(C, device=device).softmax(dim=-1) + weight = torch.randn(C, device=device) if weighted else None + m = nn.CrossEntropyLoss(reduction=reduction, weight=weight) + loss_no_batch = m(input, target) + loss_batch = m(input.unsqueeze(0), target.unsqueeze(0)) + if reduction == 'none': + loss_batch = loss_batch.squeeze(0) + self.assertEqual(loss_no_batch, loss_batch) + def test_cross_entropy_loss_index_target_unit_weights(self, device): # Test with k-dimensional loss. 
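For the probability-target case added just above, "no batch dim" support simply means an unbatched (C,) input/target pair must agree with the same data carrying a leading batch dimension of 1. A minimal, self-contained check of that property (mean reduction, unweighted; illustrative, not part of the test suite):

import torch
import torch.nn as nn

C = 5
input = torch.randn(C).log_softmax(dim=-1)
target = torch.randn(C).softmax(dim=-1)
loss = nn.CrossEntropyLoss()
assert torch.allclose(loss(input, target),
                      loss(input.unsqueeze(0), target.unsqueeze(0)))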
for k in range(5): @@ -19235,7 +20096,7 @@ def __init__(self): self.assertEqual(p.grad.to(devices[0]), pe.grad) def test_elu_inplace_overlap(self, device): - x = torch.randn((1, 6), device=device).expand((6, 6)) + x = torch.randn((1, 6), dtype=torch.bfloat16, device=device).expand((6, 6)) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): F.elu(x, inplace=True) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): @@ -19322,6 +20183,32 @@ def test_leaky_relu_inplace_with_zero_slope(self, device): expected_bf16 = torch.tensor([0., 0., 1.], device=device, dtype=torch.bfloat16) self.assertEqual(a_bf16.grad, expected_bf16) + @onlyCPU + def test_softshrink(self, device): + x = torch.tensor([[1.21, 0.56, 0.5001, 0.4999, 1.2357, -0.4999, -0.5001, -1.154, + 0.254, -0.24, -0.225, 0.104, 0.002, -0.001, 0.0574, 1.2344, + 0.1748, -0.1797, -0.8125, 0.2051, -1.1328, 1.2344, -0.1562, 2.3554, + -0.1953, 0.0304, -0.3613, -1.3047, 1.0312, 0.1436, -0.6953, 0.5664, + -0.5820, -0.3301, 0.8203, 0.6133, 0.5938], + [-0.8203, -1.2344, -0.5234, 2.5312, -0.4551, -0.6875, -1.5547, -0.2217, + -0.3027, 2.6406, 1.3047, 0.2344, -1.6719, 0.2773, -1.3516, 3.4575, + 0.4414, 0.2656, 2.1094, -1.5156, 1.2344, -0.4336, 0.6797, -3.5486, + 0.9766, -0.4062, 1.4844, 0.7500, -1.7578, 0.7461, 1.6094, 8.5458, + 0.3730, -0.3477, -1.0625, 0.3848, 0.0557]], device=device) + expected = torch.tensor([[0.71, 0.06, 0.0001, 0., 0.7357, 0., -0.0001, -0.654, + 0., 0., 0., 0., 0., 0., 0., 0.7344, + 0., 0., -0.3125, 0., -0.6328, 0.7344, 0., 1.8554, + 0., 0., 0., -0.8047, 0.5312, 0., -0.1953, 0.0664, + -0.0820, 0.0, 0.3203, 0.1133, 0.0938], + [-0.3203, -0.7344, -0.0234, 2.0312, 0.0, -0.1875, -1.0547, 0., + 0.0, 2.1406, 0.8047, 0., -1.1719, 0., -0.8516, 2.9575, + 0., 0., 1.6094, -1.0156, 0.7344, 0., 0.1797, -3.0486, + 0.4766, 0., 0.9844, 0.2500, -1.2578, 0.2461, 1.1094, 8.0458, + 0., 0., -0.5625, 0., 0.]]) + softshrink = torch.nn.Softshrink() + out = softshrink(x) + self.assertEqual(out, expected, atol=1e-2, rtol=0) + def test_threshold_inplace_overlap(self, device): # Inplace threshold is okay, because it is idempotent x = torch.randn((1, 6), device=device).expand((6, 6)) @@ -19476,6 +20363,293 @@ def test_adaptive_pool_invalid(self, device): t, output_size = inp m(output_size)(t) + @dtypes(torch.float) + @dtypesIfCUDA(torch.double, torch.float, torch.half) + def test_transformerencoderlayer(self, device, dtype): + # this is a deterministic test for TransformerEncoderLayer + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + bsz = 2 + + atol = 1e-5 + rtol = 1e-7 + if "cuda" in device: + atol = 1e-3 + rtol = 1e-2 + + def _test(training, batch_first, atol, rtol): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + batch_first=batch_first, device=device, dtype=dtype) + + if not training: + assert dropout == 0 + model = model.eval() + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + encoder_input = torch.tensor([[[20., 30., 40., 50.]]], device=device, dtype=dtype) + result = model(encoder_input) + ref_output = torch.tensor([[[2.258703, 0.127985, -0.697881, 0.170862]]], device=device, dtype=dtype) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + 
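The TransformerEncoderLayer tests that begin here rely on a small trick to make hard-coded reference outputs possible: every parameter of the layer is overwritten with cos(0), cos(1), ... reshaped to the parameter's own shape, so the module becomes fully deterministic. A sketch of just that step, outside the test harness:

import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=4, nhead=2, dim_feedforward=16, dropout=0.0)
with torch.no_grad():
    for p in layer.parameters():
        # deterministic, shape-preserving fill: cos(0 .. numel-1)
        p.copy_(torch.cos(torch.arange(p.numel(), dtype=torch.float)).view_as(p))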
# 0 values are NOT masked. This shouldn't mask anything. + mask = torch.tensor([[0]], device=device) == 1 + # TODO: enable fast path for calls with a mask! + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + # 1 values are masked. Since there is only 1 input embedding this + # will result in nan. + mask = torch.tensor([[1]], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + result = result.cpu().detach().numpy() + self.assertTrue(np.isnan(result).all()) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.272644, 0.119035, -0.691669, 0.153486]], + [[2.272644, 0.119035, -0.691669, 0.153486]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + # all 0 which is no masking + mask = torch.tensor([[0, 0]], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + mask = torch.tensor([[1, 0]], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(torch.tensor([[[2.301516, 0.092249, -0.679101, 0.103088]], + [[2.301516, 0.092249, -0.679101, 0.103088]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.428589, 0.020835, -0.602055, -0.085249], + [2.427987, 0.021213, -0.602496, -0.084103]], + [[2.424689, 0.019155, -0.604793, -0.085672], + [2.413863, 0.022211, -0.612486, -0.072490]], + [[2.433774, 0.021598, -0.598343, -0.087548], + [2.425104, 0.019748, -0.604515, -0.084839]], + [[2.436185, 0.022682, -0.596625, -0.087261], + [2.433556, 0.021891, -0.598509, -0.086832]], + [[2.416246, 0.017512, -0.610712, -0.082961], + [2.422901, 0.024187, -0.606178, -0.074929]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + + # all 0 + mask = torch.zeros([2, 5], device=device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + mask[0, 1] = 1 + mask[1, 3] = 1 + mask[1, 4] = 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = perm_fn(torch.tensor([[[2.429026, 0.020793, -0.601741, -0.085642], + [2.428811, 0.021445, -0.601912, -0.084252]], + [[2.425009, 0.019155, -0.604566, -0.085899], + [2.415408, 0.02249 , -0.611415, -0.073]], + [[2.434199, 0.021682, -0.598039, -0.087699], + [2.42598, 0.019941, -0.603896, -0.085091]], + 
[[2.436457, 0.022736, -0.59643 , -0.08736], + [2.434021, 0.022093, -0.598179, -0.08679]], + [[2.416531, 0.017498, -0.610513, -0.083181], + [2.4242, 0.024653, -0.605266, -0.074959]]], device=device, dtype=dtype)) + self.assertEqual(result.shape, ref_output.shape) + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + + # NestedTensor is only supported for the fast path + # currently, which won't be used if training. + if (batch_first and not training and + ('cuda' in str(device) or 'cpu' in str(device)) and not TEST_WITH_CROSSREF): + encoder_input[0][-1] = torch.zeros_like(encoder_input[0][1]) + mask = torch.zeros(encoder_input.shape[:-1], device=device, dtype=torch.bool) + mask[0][-1] = True + + nt = torch.nested_tensor([encoder_input[0][:-1], encoder_input[1]], device=device) + result = model(nt) + ref_output = torch.tensor( + [ + [ + [2.4268184, 0.02042419, -0.603311, -0.08476824], + [2.423306, 0.01889652, -0.6057701, -0.08519465], + [2.431538, 0.02078694, -0.5999354, -0.08746159], + [2.4348664, 0.02212971, -0.5975677, -0.08733892], + [2.423133, 0.02097577, -0.60594773, -0.08113337], + ], + [ + [2.4279876, 0.02121329, -0.60249615, -0.08410317], + [2.4138637, 0.02221113, -0.6124869, -0.07249016], + [2.4251041, 0.01974815, -0.6045152, -0.08483928], + [2.4335563, 0.0218913, -0.59850943, -0.08683228], + [2.4229012, 0.02418739, -0.6061784, -0.07492948], + ], + ], + device=device, dtype=dtype + ) + result = result.to_padded_tensor(0) + ref_output[0][-1] = torch.zeros_like( + ref_output[0][-1], device=device, dtype=dtype + ) + result[0][-1] = torch.zeros_like( + result[0][-1], device=device, dtype=dtype + ) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + if 'cuda' in device: + if dtype == torch.float: + atol = 2e-4 + rtol = 4e-3 + else: + atol = 7e-4 + rtol = 2e-2 + torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol) + else: + torch.testing.assert_close(result, ref_output) + + + for batch_first in (True, False): + for training in (True, False): + if training: + cm = contextlib.nullcontext() + else: + # Fast path requires inference mode. 
+ cm = torch.no_grad() + with cm: + _test(batch_first=batch_first, training=training, atol=atol, rtol=rtol) + + @dtypes(torch.double) + @torch.no_grad() + def test_multihead_attn_fast_path_query_and_bias_have_different_dtypes(self, device, dtype): + mha = torch.nn.MultiheadAttention(3, 3, batch_first=True, dtype=dtype, device=device).eval() + mha.in_proj_bias = torch.nn.Parameter(mha.in_proj_bias.to(torch.half).to(device)) + query = torch.randn(3, 3, 3, dtype=dtype, device=device) + mha(query, query, query) + + @dtypes(torch.double) + @torch.no_grad() + def test_multihead_attn_in_proj_bias_none(self, device, dtype): + mha = torch.nn.MultiheadAttention(1, 1, bias=False, dtype=dtype, device=device) + query = torch.rand(3, 2, 1, dtype=dtype, device=device) + mha(query, query, query) + + @dtypes(torch.double) + @torch.no_grad() + def test_multihead_attn_in_proj_weight_none(self, device, dtype): + # Setting kdim == vdim == 2 means that vdim != embed_dim + # will cause the logic to use per-input project weights, thereby + # forcing self.in_proj_weight = None + mha = torch.nn.MultiheadAttention(4, 4, vdim=2, kdim=2, dtype=dtype, device=device) + query = torch.rand(4, 4, 4, dtype=dtype, device=device) + key = torch.rand(4, 4, 2, dtype=dtype, device=device) + mha(query, key, key) + + @dtypes(torch.float) + @dtypesIfCUDA(torch.half, torch.float) + def test_transformerencoderlayer_gelu(self, device, dtype): + # this is a deterministic test for TransformerEncoderLayer with gelu activation + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + bsz = 2 + + atol = 0 + rtol = 1e-5 + if "cuda" in device: + atol = 1e-3 + rtol = 1e-2 + + def _test(activation, batch_first, training): + def perm_fn(x): + return x.transpose(1, 0) if batch_first else x + + model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, batch_first=batch_first, device=device, dtype=dtype) + if not training: + assert dropout == 0 + model = model.eval() + + # set constant weights of the model + for idx, p in enumerate(model.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + # deterministic input + encoder_input = torch.tensor([[[20., 30., 40., 50.]]], device=device, dtype=dtype) + result = model(encoder_input) + ref_output = torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]], device=device, dtype=dtype) + torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], + [[5., 6., 7., 8.]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]], + [[2.264103, 0.121417, -0.696012, 0.159724]]], device=device, dtype=dtype)) + torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol) + + # deterministic input + encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]], device=device, dtype=dtype)) + result = model(encoder_input) + ref_output = perm_fn(torch.tensor([[[2.42163188, 0.03227153, -0.60714219, -0.05908082], + [2.42151276, 0.03302179, -0.60722523, 
-0.05762651]], + [[2.41926761, 0.02974034, -0.60879519, -0.0621269], + [2.41626395, 0.03539356, -0.61087842, -0.04978623]], + [[2.42382808, 0.03218872, -0.6055963, -0.06073591], + [2.41983477, 0.03085259, -0.60840145, -0.06046414]], + [[2.42500749, 0.03328855, -0.60476388, -0.0595334], + [2.4237977, 0.03290575, -0.60561789, -0.05940082]], + [[2.41383916, 0.02686345, -0.61256377, -0.06380707], + [2.42000277, 0.03800944, -0.60824798, -0.04754947]]], device=device, dtype=dtype)) + torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol) + for activation, batch_first, training in product(('gelu', F.gelu, nn.GELU()), (True, False), (True, False)): + # Fast path requires inference mode. + if training: + cm = contextlib.nullcontext() + else: + cm = torch.no_grad() + with cm: + _test(activation=activation, batch_first=batch_first, training=training) + + class TestModuleGlobalHooks(TestCase): def tearDown(self): @@ -20443,18 +21617,120 @@ def my_pre_load_hook_with_module( nonlocal hook_called hook_called += 1 - m = MyModule() - state_dict = m.state_dict() + # Test that hooks registered on a submodule are also called + # appropriately, i.e. with the submodule as module argument in + # my_pre_load_hook_with_module. + class MyModuleContainer(nn.Module): + def __init__(self, mod): + super().__init__() + self.mod = mod - hook_called = 0 - m._register_load_state_dict_pre_hook(m.my_pre_load_hook) - m.load_state_dict(state_dict) - self.assertEqual(1, hook_called) + for ctor in [MyModuleContainer, lambda x: x]: + m = ctor(MyModule()) + state_dict = m.state_dict() + if isinstance(m, MyModuleContainer): + mod = m.mod + else: + mod = m + + hook_called = 0 + mod._register_load_state_dict_pre_hook( + mod.my_pre_load_hook + ) + m.load_state_dict(state_dict) + self.assertEqual(1, hook_called) + hook_called = 0 + mod._register_load_state_dict_pre_hook( + mod.my_pre_load_hook_with_module, True + ) + m.load_state_dict(state_dict) + self.assertEqual(2, hook_called) + + def test_load_state_dict_post_hook(self): hook_called = 0 - m._register_load_state_dict_pre_hook(m.my_pre_load_hook_with_module, True) - m.load_state_dict(state_dict) - self.assertEqual(2, hook_called) + + class MyModule(nn.Module): + def __init__(self): + super(MyModule, self).__init__() + self.foo = torch.nn.Parameter(torch.rand(10)) + + def my_post_load_hook(self, module, incompatible_keys): + assert module is self + nonlocal hook_called + incompatible_keys.missing_keys.append("foo") + incompatible_keys.unexpected_keys.append("bar") + hook_called += 1 + + nested = MyModule() + wrapped = nn.ModuleList([nested]) + handle = nested.register_load_state_dict_post_hook( + nested.my_post_load_hook, + ) + # Hook must be called even if it is wrapped + ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False) + self.assertEqual(hook_called, 1) + # Ensure that the hook modified missing_keys and unexpected_keys + missing = ret.missing_keys + unexpected = ret.unexpected_keys + self.assertEqual(missing, ["foo"]) + self.assertEqual(unexpected, ["bar"]) + # When called with strict=True, the error raised should mention the + # missing and unexpected keys the hook added. + with self.assertRaisesRegex(RuntimeError, "foo.*\n.*bar"): + wrapped.load_state_dict(wrapped.state_dict(), strict=True) + self.assertEqual(hook_called, 2) + # Removing the hook via handle.remove() should cause it not to + # fire anymore. 
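The post-hook API exercised in this test can be summarized in a few lines: the hook receives (module, incompatible_keys) and may edit the missing/unexpected key lists that load_state_dict is about to return. A minimal sketch with illustrative names, separate from the test itself:

import torch
import torch.nn as nn

lin = nn.Linear(2, 2)

def clear_incompatible(module, incompatible_keys):
    incompatible_keys.missing_keys.clear()
    incompatible_keys.unexpected_keys.clear()

handle = lin.register_load_state_dict_post_hook(clear_incompatible)
sd = lin.state_dict()
sd["extra"] = torch.ones(1)               # would normally be reported as unexpected
ret = lin.load_state_dict(sd, strict=False)
assert ret.unexpected_keys == []          # cleared by the hook
handle.remove()                           # hook no longer fires after removal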
+ handle.remove() + # Hook did not run so it should not have added any keys + ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False) + self.assertEqual(ret.missing_keys, []) + self.assertEqual(ret.unexpected_keys, []) + # hook_called should not have been incremented + self.assertEqual(hook_called, 2) + + def load_hook_clear_incompatible(module, incompatible_keys): + incompatible_keys.missing_keys.clear() + incompatible_keys.unexpected_keys.clear() + + nested.register_load_state_dict_post_hook(load_hook_clear_incompatible) + state_dict = wrapped.state_dict() + state_dict["extra"] = torch.ones(1) + # load state_dict with strict=True should not throw. + ret = wrapped.load_state_dict(state_dict, strict=True) + # explicitly ensure that the post hook clearned out incompatible_keys + self.assertEqual([], ret.missing_keys) + self.assertEqual([], ret.unexpected_keys) + + @unittest.skipIf(IS_WINDOWS, "Tempfile permission issue on windows") + def test_load_state_dict_post_hook_backward_compatibility(self): + def my_post_load_hook(mod, _): + nonlocal called + called = True + + for m in [nn.Softmin(10), nn.Softmax(10), nn.LogSoftmax(10)]: + called = False + sd = deepcopy(m.state_dict()) + self.assertTrue(hasattr(m, '_load_state_dict_post_hooks')) + # Simulate an older model that did not have this attr + delattr(m, '_load_state_dict_post_hooks') + # Save and load, and ensure that load_state_dict works (without proper + # BC we would run into errors because this attribute would be expected). + # In particular, Softmax runs into the issue described here: + # https://github.com/pytorch/pytorch/issues/77280 + with NamedTemporaryFile() as f: + # Note that torch.save / torch.load is not recommended to save/load + # modules. + torch.save(m, f.name) + m = torch.load(f.name) + m.load_state_dict(sd) + self.assertFalse(called) + + # Ensure hooks can be registered and called. + m.register_load_state_dict_post_hook(my_post_load_hook) + m.load_state_dict(sd) + self.assertTrue(called) instantiate_device_type_tests(TestNNDeviceType, globals()) diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 656abdc57bda..96c1016c2dbb 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -8,8 +8,8 @@ from torch.testing._internal.common_utils import \ (TestCase, run_tests) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyCPU, dtypes) -from torch.testing._internal.common_dtype import get_all_dtypes + (instantiate_device_type_tests, onlyCPU, dtypes, skipMeta) +from torch.testing._internal.common_dtype import all_types_and_complex_and # For testing handling NumPy objects and sending tensors to / accepting # arrays from NumPy. 
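Before the NumPy interop changes below, a quick reminder of the semantics those tests rely on: torch.from_numpy shares memory with the source array, while torch.tensor always copies (and, as tested below, warns when handed a list of ndarrays). A small illustration:

import numpy as np
import torch

a = np.arange(4.0)
shared = torch.from_numpy(a)      # view of the same buffer
copied = torch.tensor(a)          # independent copy
a[0] = 10.0
assert shared[0].item() == 10.0
assert copied[0].item() == 0.0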
@@ -228,11 +228,34 @@ def test_from_numpy(self, device) -> None: x.strides = (3,) self.assertRaises(ValueError, lambda: torch.from_numpy(x)) + @skipMeta def test_from_list_of_ndarray_warning(self, device): warning_msg = r"Creating a tensor from a list of numpy.ndarrays is extremely slow" with self.assertWarnsOnceRegex(UserWarning, warning_msg): torch.tensor([np.array([0]), np.array([1])], device=device) + def test_ctor_with_invalid_numpy_array_sequence(self, device): + # Invalid list of numpy array + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([np.random.random(size=(3, 3)), np.random.random(size=(3, 0))], device=device) + + # Invalid list of list of numpy array + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([[np.random.random(size=(3, 3)), np.random.random(size=(3, 2))]], device=device) + + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([[np.random.random(size=(3, 3)), np.random.random(size=(3, 3))], + [np.random.random(size=(3, 3)), np.random.random(size=(3, 2))]], device=device) + + # expected shape is `[1, 2, 3]`, hence we try to iterate over 0-D array + # leading to type error : not a sequence. + with self.assertRaisesRegex(TypeError, "not a sequence"): + torch.tensor([[np.random.random(size=(3)), np.random.random()]], device=device) + + # list of list or numpy array. + with self.assertRaisesRegex(ValueError, "expected sequence of length"): + torch.tensor([[1, 2, 3], np.random.random(size=(2,)), ], device=device) + @onlyCPU def test_ctor_with_numpy_scalar_ctor(self, device) -> None: dtypes = [ @@ -396,7 +419,7 @@ def test_has_storage_numpy(self, device): self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.long).storage()) self.assertIsNotNone(torch.tensor(arr, device=device, dtype=torch.uint8).storage()) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_numpy_scalar_cmp(self, device, dtype): if dtype.is_complex: tensors = (torch.tensor(complex(1, 3), dtype=dtype, device=device), diff --git a/test/test_ops.py b/test/test_ops.py index 4d41e60b4aaf..2d737f3d6d39 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1,45 +1,80 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: unknown"] from collections.abc import Sequence -from functools import partial, wraps +from functools import partial import warnings import unittest import itertools - import torch -from torch.testing import FileCheck, make_tensor -from torch.testing._internal.common_dtype import floating_and_complex_types_and, get_all_dtypes -from torch.testing._internal.common_utils import \ - (TestCase, is_iterable_of_tensors, run_tests, IS_SANDCASTLE, clone_input_helper, - gradcheck, gradgradcheck, IS_IN_CI, suppress_warnings, noncontiguous_like, - TEST_WITH_ASAN, IS_WINDOWS, IS_FBCODE, first_sample) -from torch.testing._internal.common_methods_invocations import \ - (op_db, _NOTHING, UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo) -from torch.testing._internal.common_device_type import \ - (deviceCountAtLeast, instantiate_device_type_tests, ops, onlyCPU, - onlyCUDA, onlyNativeDeviceTypes, skipCUDAIfRocm, OpDTypes, skipMeta) -from torch.testing._internal.common_jit import JitCommonTestCase, check_against_reference -from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, create_traced_fn, \ - check_alias_annotation -from torch.testing._internal.jit_utils import 
disable_autodiff_subgraph_inlining, is_lambda +from torch.testing import make_tensor +from torch.testing._internal.common_dtype import ( + floating_and_complex_types_and, + all_types_and_complex_and, +) +from torch.testing._internal.common_utils import ( + TestCase, + is_iterable_of_tensors, + run_tests, + IS_SANDCASTLE, + clone_input_helper, + IS_IN_CI, + suppress_warnings, + noncontiguous_like, + TEST_WITH_ASAN, + IS_WINDOWS, + IS_FBCODE, + first_sample, +) +from torch.testing._internal.common_methods_invocations import ( + op_db, + _NOTHING, + UnaryUfuncInfo, + ReductionOpInfo, + SpectralFuncInfo, + ops_and_refs, + python_ref_db, + BinaryUfuncInfo, +) +from torch.testing._internal.common_device_type import ( + deviceCountAtLeast, + instantiate_device_type_tests, + ops, + onlyCUDA, + onlyNativeDeviceTypes, + OpDTypes, + skipMeta, +) +import torch._prims as prims + import torch.testing._internal.opinfo_helper as opinfo_helper -from torch.testing._internal.composite_compliance import _check_composite_compliance +from torch.testing._internal import composite_compliance # TODO: fixme https://github.com/pytorch/pytorch/issues/68972 torch.set_default_dtype(torch.float32) # variant testing is only done with torch.float and torch.cfloat to avoid # excessive test times and maximize signal to noise ratio -_variant_ops = partial(ops, dtypes=OpDTypes.supported, - allowed_dtypes=(torch.float, torch.cfloat)) +_variant_ops = partial( + ops, dtypes=OpDTypes.supported, allowed_dtypes=(torch.float, torch.cfloat) +) # Get names of all the operators which have ref in their entry in OpInfo (testing infra) -# except for Unary Ufuncs (separately implemented in test/test_unary_ufuncs.py) +# except for elementwise unary operators (separately implemented in test/test_unary_ufuncs.py), +# elementwise binary operators (separately implemented in test_binary_ufuncs.py), +# reduction operations (separately impelemented in test_reductions.py), # and Spectral Functions (separately implemented for only 1D as of now, in test/test_spectral_ops.py) -_ref_test_ops = list(filter(lambda op: not isinstance(op, (UnaryUfuncInfo, ReductionOpInfo, - SpectralFuncInfo)) and op.ref is not None and op.ref is not _NOTHING, op_db)) - +_ref_test_ops = tuple( + filter( + lambda op: not isinstance( + op, (UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo, BinaryUfuncInfo) + ) + and op.ref is not None + and op.ref is not _NOTHING, + op_db, + ) +) +_ops_and_refs = op_db + python_ref_db # Tests that apply to all operators and aren't related to any particular # system @@ -52,8 +87,10 @@ def tearDownClass(cls): super().tearDownClass() if IS_IN_CI: - err_msg = ("The operator(s) below is(are) using dynamic_dtypes in the OpInfo entries." - "This is OK for testing, but be sure to set the dtypes manually before landing your PR!") + err_msg = ( + "The operator(s) below is(are) using dynamic_dtypes in the OpInfo entries." + "This is OK for testing, but be sure to set the dtypes manually before landing your PR!" 
+ ) # Assure no opinfo entry has dynamic_dtypes filtered_ops = list(filter(opinfo_helper.is_dynamic_dtype_set, op_db)) for op in filtered_ops: @@ -64,65 +101,90 @@ def tearDownClass(cls): # Validates that each OpInfo specifies its forward and backward dtypes # correctly for CPU and CUDA devices + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @skipMeta - @skipCUDAIfRocm @onlyNativeDeviceTypes - @ops(op_db, dtypes=OpDTypes.none) + @ops(ops_and_refs, dtypes=OpDTypes.none) def test_dtypes(self, device, op): + # Check complex32 support only if the op claims. + # TODO: Once the complex32 support is better, we should add check for complex32 unconditionally. + device_type = torch.device(device).type + include_complex32 = ( + (torch.complex32,) + if op.supports_dtype(torch.complex32, device_type) + else () + ) + # dtypes to try to backward in - allowed_backward_dtypes = floating_and_complex_types_and(torch.bfloat16, torch.float16) + allowed_backward_dtypes = floating_and_complex_types_and( + *((torch.half, torch.bfloat16) + include_complex32) + ) # lists for (un)supported dtypes - supported_dtypes = [] - unsupported_dtypes = [] - supported_backward_dtypes = [] - unsupported_backward_dtypes = [] + supported_dtypes = set() + unsupported_dtypes = set() + supported_backward_dtypes = set() + unsupported_backward_dtypes = set() def unsupported(dtype): - unsupported_dtypes.append(dtype) + unsupported_dtypes.add(dtype) if dtype in allowed_backward_dtypes: - unsupported_backward_dtypes.append(dtype) + unsupported_backward_dtypes.add(dtype) - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and( + *((torch.half, torch.bfloat16, torch.bool) + include_complex32) + ): # tries to acquire samples - failure indicates lack of support - requires_grad = (dtype in allowed_backward_dtypes and op.supports_autograd) + requires_grad = dtype in allowed_backward_dtypes try: - samples = list(op.sample_inputs(device, dtype, requires_grad=requires_grad)) + samples = tuple( + op.sample_inputs(device, dtype, requires_grad=requires_grad) + ) except Exception as e: unsupported(dtype) continue - # Counts number of successful backward attempts - # NOTE: This exists as a kludge because this only understands how to - # request a gradient if the output is a tensor or a sequence with - # a tensor as its first element. - num_backward_successes = 0 for sample in samples: # tries to call operator with the sample - failure indicates # lack of support try: result = op(sample.input, *sample.args, **sample.kwargs) + supported_dtypes.add(dtype) except Exception as e: # NOTE: some ops will fail in forward if their inputs # require grad but they don't support computing the gradient # in that type! This is a bug in the op! 
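The dtype bookkeeping that this test builds up over the next hunks reduces to a few set operations: a dtype that works for some samples but fails for others is only "partially supported" (a warning), while any disagreement between the observed sets and the dtypes the OpInfo claims is an error. A toy illustration with made-up sets:

import torch

worked = {torch.float32, torch.float64}
failed = {torch.float16, torch.float64}
claimed = {torch.float32, torch.float16}

supported = worked - failed                      # worked on every sample
partially_supported = worked & failed            # warning only
unsupported = failed - worked                    # never worked

supported_but_unclaimed = supported - claimed    # should be added to the OpInfo
claimed_but_unsupported = claimed & unsupported  # should be removed from the OpInfo

assert partially_supported == {torch.float64}
assert claimed_but_unsupported == {torch.float16}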
unsupported(dtype) + continue - # Short-circuits testing this dtype -- it doesn't work - if dtype in unsupported_dtypes: - break - - # Short-circuits if the dtype isn't a backward dtype or - # it's already identified as not supported - if dtype not in allowed_backward_dtypes or dtype in unsupported_backward_dtypes: + # Checks for backward support in the same dtype, if the input has + # one or more tensors requiring grad + def _tensor_requires_grad(x): + if isinstance(x, dict): + for k, v in x.items(): + if _tensor_requires_grad(v): + return True + if isinstance(x, (list, tuple)): + for a in x: + if _tensor_requires_grad(a): + return True + if isinstance(x, torch.Tensor) and x.requires_grad: + return True + + return False + + requires_grad = _tensor_requires_grad(sample.input) \ + or _tensor_requires_grad(sample.args) or _tensor_requires_grad(sample.kwargs) + if not requires_grad: continue - # Checks for backward support in the same dtype try: result = sample.output_process_fn_grad(result) if isinstance(result, torch.Tensor): backward_tensor = result - elif isinstance(result, Sequence) and isinstance(result[0], torch.Tensor): + elif isinstance(result, Sequence) and isinstance( + result[0], torch.Tensor + ): backward_tensor = result[0] else: continue @@ -135,55 +197,105 @@ def unsupported(dtype): # supporting grad in the input dtype. grad = torch.randn_like(backward_tensor) backward_tensor.backward(grad) - num_backward_successes += 1 + supported_backward_dtypes.add(dtype) except Exception as e: - unsupported_backward_dtypes.append(dtype) - - if dtype not in unsupported_dtypes: - supported_dtypes.append(dtype) - if num_backward_successes > 0 and dtype not in unsupported_backward_dtypes: - supported_backward_dtypes.append(dtype) + unsupported_backward_dtypes.add(dtype) # Checks that dtypes are listed correctly and generates an informative # error message - device_type = torch.device(device).type - claimed_supported = set(op.supported_dtypes(device_type)) - supported_dtypes = set(supported_dtypes) - - supported_but_unclaimed = supported_dtypes - claimed_supported - claimed_but_unsupported = claimed_supported - supported_dtypes - msg = """The supported dtypes for {0} on {1} according to its OpInfo are - {2}, but the detected supported dtypes are {3}. - """.format(op.name, device_type, claimed_supported, supported_dtypes) - - if len(supported_but_unclaimed) > 0: - msg += "The following dtypes should be added to the OpInfo: {0}. 
".format(supported_but_unclaimed) - if len(claimed_but_unsupported) > 0: - msg += "The following dtypes should be removed from the OpInfo: {0}.".format(claimed_but_unsupported) - - self.assertEqual(supported_dtypes, claimed_supported, msg=msg) - # Checks that backward dtypes are listed correctly and generates an - # informative error message - # NOTE: this code is nearly identical to the check + msg generation - claimed_backward_supported = set(op.supported_backward_dtypes(device_type)) - supported_backward_dtypes = set(supported_backward_dtypes) + supported_forward = supported_dtypes - unsupported_dtypes + partially_supported_forward = supported_dtypes & unsupported_dtypes + unsupported_forward = unsupported_dtypes - supported_dtypes + supported_backward = supported_backward_dtypes - unsupported_backward_dtypes + partially_supported_backward = ( + supported_backward_dtypes & unsupported_backward_dtypes + ) + unsupported_backward = unsupported_backward_dtypes - supported_backward_dtypes - supported_but_unclaimed = supported_backward_dtypes - claimed_backward_supported - claimed_but_unsupported = claimed_backward_supported - supported_backward_dtypes - msg = """The supported backward dtypes for {0} on {1} according to its OpInfo are - {2}, but the detected supported backward dtypes are {3}. - """.format(op.name, device_type, claimed_backward_supported, supported_backward_dtypes) + device_type = torch.device(device).type - if len(supported_but_unclaimed) > 0: - msg += "The following backward dtypes should be added to the OpInfo: {0}. ".format(supported_but_unclaimed) - if len(claimed_but_unsupported) > 0: - msg += "The following backward dtypes should be removed from the OpInfo: {0}.".format(claimed_but_unsupported) + claimed_forward = set(op.supported_dtypes(device_type)) + supported_but_unclaimed_forward = supported_forward - claimed_forward + claimed_but_unsupported_forward = claimed_forward & unsupported_forward + + claimed_backward = set(op.supported_backward_dtypes(device_type)) + supported_but_unclaimed_backward = supported_backward - claimed_backward + claimed_but_unsupported_backward = claimed_backward & unsupported_backward + + # Partially supporting a dtype is not an error, but we print a warning + if (len(partially_supported_forward) + len(partially_supported_backward)) > 0: + msg = "Some dtypes for {0} on device type {1} are only partially supported!\n".format( + op.name, device_type + ) + if len(partially_supported_forward) > 0: + msg = ( + msg + + "The following dtypes only worked on some samples during forward: {0}.\n".format( + partially_supported_forward + ) + ) + if len(partially_supported_backward) > 0: + msg = ( + msg + + "The following dtypes only worked on some samples during backward: {0}.\n".format( + partially_supported_backward + ) + ) + print(msg) + + if ( + len(supported_but_unclaimed_forward) + + len(claimed_but_unsupported_forward) + + len(supported_but_unclaimed_backward) + + len(claimed_but_unsupported_backward) + ) == 0: + return - self.assertEqual(supported_backward_dtypes, claimed_backward_supported, msg=msg) + # Reference operators often support additional dtypes, and that's OK + if op in python_ref_db: + if ( + len(claimed_but_unsupported_forward) + + len(claimed_but_unsupported_backward) + ) == 0: + return + + # Generates error msg + msg = "The supported dtypes for {0} on device type {1} are incorrect!\n".format( + op.name, device_type + ) + if len(supported_but_unclaimed_forward) > 0: + msg = ( + msg + + "The following dtypes worked in forward but are 
not listed by the OpInfo: {0}.\n".format( + supported_but_unclaimed_forward + ) + ) + if len(supported_but_unclaimed_backward) > 0: + msg = ( + msg + + "The following dtypes worked in backward but are not listed by the OpInfo: {0}.\n".format( + supported_but_unclaimed_backward + ) + ) + if len(claimed_but_unsupported_forward) > 0: + msg = ( + msg + + "The following dtypes did not work in forward but are listed by the OpInfo: {0}.\n".format( + claimed_but_unsupported_forward + ) + ) + if len(claimed_but_unsupported_backward) > 0: + msg = ( + msg + + "The following dtypes did not work in backward but are listed by the OpInfo: {0}.\n".format( + claimed_but_unsupported_backward + ) + ) + + self.fail(msg) # Validates that each OpInfo works correctly on different CUDA devices - @skipCUDAIfRocm @onlyCUDA @deviceCountAtLeast(2) @ops(op_db, allowed_dtypes=(torch.float32, torch.long)) @@ -200,13 +312,16 @@ def test_multiple_devices(self, devices, dtype, op): elif is_iterable_of_tensors(result): self.assertTrue(all(map(lambda t: t.device == cuda_device, result))) else: - self.skipTest("Skipped! Only supports single tensor or iterable of tensor outputs.") + self.skipTest( + "Skipped! Only supports single tensor or iterable of tensor outputs." + ) # Tests that the function and its (ndarray-accepting) reference produce the same # values on the tensors from sample_inputs func for the corresponding op. # This test runs in double and complex double precision because # NumPy does computation internally using double precision for many functions # resulting in possible equality check failures. + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @onlyNativeDeviceTypes @suppress_warnings @ops(_ref_test_ops, allowed_dtypes=(torch.float64, torch.long, torch.complex128)) @@ -215,12 +330,73 @@ def test_reference_testing(self, device, dtype, op): # Sets the default dtype to NumPy's default dtype of double cur_default = torch.get_default_dtype() torch.set_default_dtype(torch.double) - sample_inputs = op.sample_inputs(device, dtype) - for sample_input in sample_inputs: - self.compare_with_reference(op, op.ref, sample_input, exact_dtype=(dtype is not torch.long)) + for sample_input in op.reference_inputs(device, dtype): + self.compare_with_reference( + op, op.ref, sample_input, exact_dtype=(dtype is not torch.long) + ) finally: torch.set_default_dtype(cur_default) + # Tests that experimental Python References can propagate shape, dtype, + # and device metadata properly. + # TODO: include stride propagation. + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @ops(python_ref_db) + def test_python_reference_meta_functions(self, device, dtype, op): + def _to_tensormeta(x): + if isinstance(x, torch.Tensor): + return prims.utils.TensorMeta(x) + return x + + # TODO: iterate over requires_grad true/false + inps = tuple(op.reference_inputs(device, dtype, requires_grad=False)) + for sample in op.reference_inputs(device, dtype, requires_grad=False): + + result = op(sample.input, *sample.args, **sample.kwargs) + + meta_sample = sample.transform(_to_tensormeta) + meta_result = op(meta_sample.input, *meta_sample.args, **meta_sample.kwargs) + + if isinstance(result, torch.Tensor): + prims.utils.compare_tensor_meta(result, meta_result) + elif isinstance(result, Sequence): + for a, b in zip(result, meta_result): + prims.utils.compare_tensor_meta(a, b) + + # Tests that experimental Python References perform the same computation + # as the operators they reference. 
+ @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @onlyNativeDeviceTypes + @ops(python_ref_db) + def test_python_reference_consistency(self, device, dtype, op): + for sample in op.reference_inputs(device, dtype, requires_grad=False): + + actual = op(sample.input, *sample.args, **sample.kwargs) + expected = op.torch_opinfo(sample.input, *sample.args, **sample.kwargs) + + self.assertEqual( + actual, + expected, + exact_stride=False, + exact_device=True, + exact_layout=True, + exact_is_coalesced=True, + ) + + if isinstance(actual, torch.Tensor): + assert isinstance(expected, torch.Tensor) + prims.utils.compare_tensor_meta(actual, expected) + if getattr(op, 'validate_view_consistency', True): + self.assertEqual(actual._is_view(), expected._is_view()) + if isinstance(actual, Sequence): + assert isinstance(expected, Sequence) + for a, b in zip(actual, expected): + prims.utils.compare_tensor_meta(a, b) + if getattr(op, 'validate_view_consistency', True): + self.assertEqual(a._is_view(), b._is_view()) + + @skipMeta @onlyNativeDeviceTypes @ops([op for op in op_db if op.error_inputs_func is not None], dtypes=OpDTypes.none) @@ -231,6 +407,23 @@ def test_errors(self, device, op): with self.assertRaisesRegex(ei.error_type, ei.error_regex): op(si.input, *si.args, **si.kwargs) + @skipMeta + @onlyNativeDeviceTypes + @ops([op for op in python_ref_db if op.error_inputs_func is not None], dtypes=OpDTypes.none) + def test_python_reference_errors(self, device, op): + def _to_tensormeta(x): + if isinstance(x, torch.Tensor): + return prims.utils.TensorMeta(x) + return x + + error_inputs = op.error_inputs(device) + for ei in error_inputs: + si = ei.sample_input + meta_sample = si.transform(_to_tensormeta) + # TODO: match strings + with self.assertRaisesRegex(ei.error_type, ""): + op(meta_sample.input, *meta_sample.args, **meta_sample.kwargs) + # Tests that the function produces the same result when called with # noncontiguous tensors. 
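A "noncontiguous sample" here is simply a tensor with the same values as the original but a non-contiguous memory layout; the test then checks that the op is layout-agnostic. One cheap way to build such a tensor (the harness's noncontiguous_like helper is more general):

import torch

x = torch.randn(4, 5)
x_noncontig = x.t().clone().t()          # same values, strides (1, 4): not contiguous
assert not x_noncontig.is_contiguous()
assert torch.equal(x, x_noncontig)
torch.testing.assert_close(torch.sin(x), torch.sin(x_noncontig))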
# TODO: get working with Windows by addressing failing operators @@ -244,8 +437,17 @@ def test_noncontiguous_samples(self, device, dtype, op): test_grad = dtype in op.supported_backward_dtypes(torch.device(device).type) sample_inputs = op.sample_inputs(device, dtype, requires_grad=test_grad) for sample_input in sample_inputs: - t_inp, t_args, t_kwargs = sample_input.input, sample_input.args, sample_input.kwargs - n_inp, n_args, n_kwargs = sample_input.noncontiguous() + t_inp, t_args, t_kwargs = ( + sample_input.input, + sample_input.args, + sample_input.kwargs, + ) + noncontig_sample = sample_input.noncontiguous() + n_inp, n_args, n_kwargs = ( + noncontig_sample.input, + noncontig_sample.args, + noncontig_sample.kwargs, + ) # Verifies sample input tensors should have no grad or history sample_tensor = t_inp if isinstance(t_inp, torch.Tensor) else t_inp[0] @@ -271,10 +473,14 @@ def test_noncontiguous_samples(self, device, dtype, op): grad_for_actual = noncontiguous_like(grad_for_expected) elif isinstance(expected, Sequence): # Filter output elements that do not require grad - expected = [t for t in expected - if isinstance(t, torch.Tensor) and t.requires_grad] - actual = [n for n in actual - if isinstance(n, torch.Tensor) and n.requires_grad] + expected = [ + t + for t in expected + if isinstance(t, torch.Tensor) and t.requires_grad + ] + actual = [ + n for n in actual if isinstance(n, torch.Tensor) and n.requires_grad + ] grad_for_expected = [torch.randn_like(t) for t in expected] grad_for_actual = [noncontiguous_like(n) for n in grad_for_expected] else: @@ -282,19 +488,35 @@ def test_noncontiguous_samples(self, device, dtype, op): continue # Concatenate inputs into a tuple - t_inputs = (t_inp,) + t_args if isinstance(t_inp, torch.Tensor) else tuple(t_inp) + t_args - n_inputs = (n_inp,) + n_args if isinstance(n_inp, torch.Tensor) else tuple(n_inp) + n_args + t_inputs = ( + (t_inp,) + t_args + if isinstance(t_inp, torch.Tensor) + else tuple(t_inp) + t_args + ) + n_inputs = ( + (n_inp,) + n_args + if isinstance(n_inp, torch.Tensor) + else tuple(n_inp) + n_args + ) # Filter the elemnts that are tensors that require grad - t_input_tensors = [t for t in t_inputs if isinstance(t, torch.Tensor) and t.requires_grad] - n_input_tensors = [n for n in n_inputs if isinstance(n, torch.Tensor) and n.requires_grad] + t_input_tensors = [ + t for t in t_inputs if isinstance(t, torch.Tensor) and t.requires_grad + ] + n_input_tensors = [ + n for n in n_inputs if isinstance(n, torch.Tensor) and n.requires_grad + ] self.assertEqual(len(t_input_tensors), len(n_input_tensors)) # Some functions may not use all the inputs to generate gradients. One of the # few examples of this "odd" behaviour is F.hinge_embedding_loss - t_grads = torch.autograd.grad(expected, t_input_tensors, grad_for_expected, allow_unused=True) - n_grads = torch.autograd.grad(actual, n_input_tensors, grad_for_actual, allow_unused=True) + t_grads = torch.autograd.grad( + expected, t_input_tensors, grad_for_expected, allow_unused=True + ) + n_grads = torch.autograd.grad( + actual, n_input_tensors, grad_for_actual, allow_unused=True + ) msg = "Got different gradients for contiguous / non-contiguous inputs wrt input {}." 
for i, (t, n) in enumerate(zip(t_grads, n_grads)): @@ -304,97 +526,119 @@ def test_noncontiguous_samples(self, device, dtype, op): # incorrectly sized out parameter warning properly yet # Cases test here: # - out= with the correct dtype and device, but the wrong shape - @ops(op_db, dtypes=OpDTypes.none) + @ops(_ops_and_refs, dtypes=OpDTypes.none) def test_out_warning(self, device, op): - # TODO: verify the op doesn't support the out= kwarg - if not op.supports_out: - self.skipTest("Skipped! Op doesn't support out= kwarg.") - # Prefers running in float32 but has a fallback for the first listed supported dtype supported_dtypes = op.supported_dtypes(self.device_type) if len(supported_dtypes) == 0: self.skipTest("Skipped! Op has not supported dtypes on this device.") - dtype = torch.float32 if torch.float32 in supported_dtypes else list(supported_dtypes)[0] + dtype = ( + torch.float32 + if torch.float32 in supported_dtypes + else list(supported_dtypes)[0] + ) - # NOTE: only tests on first sample samples = op.sample_inputs(device, dtype) - sample = first_sample(self, samples) - - # calls it normally to get the expected result - expected = op(sample.input, *sample.args, **sample.kwargs) - op_out = partial(op, sample.input, *sample.args, **sample.kwargs) - - # Short-circuits if output is not a single tensor or an - # iterable of tensors - - if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors(expected, include_empty=True): - self.skipTest("Skipped! Only supports single tensor or iterable of tensor outputs.") - - # A wrapper around map that works with single tensors and always - # instantiates the map. Used below to apply transforms to - # single tensor and iterable tensor outputs. - def _apply_out_transform(fn, out): - if isinstance(out, torch.Tensor): - return fn(out) - - # assumes (see above) that out is an iterable of tensors - return tuple(map(fn, out)) - - # Extracts strides from a tensor or iterable of tensors into a tuple - def _extract_strides(out): - if isinstance(out, torch.Tensor): - return (out.stride(),) - - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.stride(), out)) - - # Extracts data pointers from a tensor or iterable of tensors into a tuple - # NOTE: only extracts on the CPU and CUDA device types since some - # device types don't have storage - def _extract_data_ptrs(out): - if self.device_type != 'cpu' and self.device_type != 'cuda': - return () - - if isinstance(out, torch.Tensor): - return (out.data_ptr(),) + for sample in samples: + # calls it normally to get the expected result + expected = op(sample.input, *sample.args, **sample.kwargs) + op_out = partial(op, sample.input, *sample.args, **sample.kwargs) + + # Short-circuits if output is not a single tensor or an + # iterable of tensors + if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors( + expected, include_empty=True + ): + self.skipTest( + "Skipped! Only supports single tensor or iterable of tensor outputs." + ) + + # Validates the op doesn't support out if it claims not to + if not op.supports_out: + with self.assertRaises(Exception): + assert op_out(out=expected) != NotImplemented + return + + # A wrapper around map that works with single tensors and always + # instantiates the map. Used below to apply transforms to + # single tensor and iterable tensor outputs. 
+ def _apply_out_transform(fn, out): + if isinstance(out, torch.Tensor): + return fn(out) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(fn, out)) + + # Extracts strides from a tensor or iterable of tensors into a tuple + def _extract_strides(out): + if isinstance(out, torch.Tensor): + return (out.stride(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.stride(), out)) + + # Extracts data pointers from a tensor or iterable of tensors into a tuple + # NOTE: only extracts on the CPU and CUDA device types since some + # device types don't have storage + def _extract_data_ptrs(out): + if self.device_type != "cpu" and self.device_type != "cuda": + return () + + if isinstance(out, torch.Tensor): + return (out.data_ptr(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.data_ptr(), out)) + + @suppress_warnings + def _compare_out(transform, *, compare_strides_and_data_ptrs=True): + out = _apply_out_transform(transform, expected) + original_strides = _extract_strides(out) + original_ptrs = _extract_data_ptrs(out) - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.data_ptr(), out)) + op_out(out=out) + final_strides = _extract_strides(out) + final_ptrs = _extract_data_ptrs(out) - def _compare_out(transform, *, compare_strides_and_data_ptrs=True): - out = _apply_out_transform(transform, expected) - original_strides = _extract_strides(out) - original_ptrs = _extract_data_ptrs(out) + self.assertEqual(expected, out) - op_out(out=out) - final_strides = _extract_strides(out) - final_ptrs = _extract_data_ptrs(out) + if compare_strides_and_data_ptrs: + stride_msg = "Strides are not the same! Original strides were {0} and strides are now {1}".format( + original_strides, final_strides + ) + self.assertEqual(original_strides, final_strides, msg=stride_msg) + self.assertEqual(original_ptrs, final_ptrs) - self.assertEqual(expected, out) + # Case Zero: out= with the correct dtype and device, but the wrong shape + # Expected behavior: if nonempty, resize with a warning. + def _case_zero_transform(t): + wrong_shape = list(t.shape) - if compare_strides_and_data_ptrs: - self.assertEqual(original_strides, final_strides) - self.assertEqual(original_ptrs, final_ptrs) + if len(wrong_shape) == 0: + # Handles scalar tensor case (empty list) + wrong_shape = [2] + else: + wrong_shape[-1] = wrong_shape[-1] + 1 + return make_tensor(wrong_shape, dtype=t.dtype, device=t.device) - # Case: out= with the correct dtype and device, but the wrong shape - # Expected behavior: resize with a warning. - def _case_two_transform(t): - wrong_shape = list(t.shape) + # Verifies the out values are correct + _compare_out(_case_zero_transform, compare_strides_and_data_ptrs=False) - if len(wrong_shape) == 0: - # Handles scalar tensor case (empty list) - wrong_shape = [2] - else: - wrong_shape[-1] = wrong_shape[-1] + 1 - return make_tensor(wrong_shape, dtype=t.dtype, device=t.device) + # Additionally validates that the appropriate warning is thrown if a nonempty + # tensor is resized. + def _any_nonempty(out): + if isinstance(out, torch.Tensor): + return out.numel() > 0 - _compare_out(_case_two_transform, compare_strides_and_data_ptrs=False) + return any(x.numel() > 0 for x in out) - # Additional validates that the appropriate warning is thrown - out = _apply_out_transform(_case_two_transform, expected) - msg_fail = "Resized a non-empty tensor but did not warn about it." 
- with self.assertWarnsRegex(UserWarning, "An output with one or more elements", msg=msg_fail): - op_out(out=out) + out = _apply_out_transform(_case_zero_transform, expected) + msg_fail = "Resized a non-empty tensor but did not warn about it." + if _any_nonempty(out): + with self.assertWarnsRegex( + UserWarning, "An output with one or more elements", msg=msg_fail + ): + op_out(out=out) # Validates ops implement the correct out= behavior # See https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-does-out-work-in-pytorch @@ -406,173 +650,191 @@ def _case_two_transform(t): # - Case 3: out has the correct shape and dtype, but is on a different device type # - Case 4: out has the with correct shape and device, but a dtype that cannot # "safely" cast to - @ops(op_db, dtypes=OpDTypes.none) - def test_out(self, device, op): - # TODO: verify the op doesn't support the out= kwarg - if not op.supports_out: - self.skipTest("Skipped! Op doesn't support out= kwarg.") - + @ops(_ops_and_refs, dtypes=OpDTypes.any_one) + def test_out(self, device, dtype, op): # Prefers running in float32 but has a fallback for the first listed supported dtype - supported_dtypes = op.supported_dtypes(self.device_type) - if len(supported_dtypes) == 0: - self.skipTest("Skipped! Op has not supported dtypes on this device.") - dtype = torch.float32 if torch.float32 in supported_dtypes else list(supported_dtypes)[0] - - # NOTE: only tests on first sample samples = op.sample_inputs(device, dtype) - sample = first_sample(self, samples) - - # calls it normally to get the expected result - expected = op(sample.input, *sample.args, **sample.kwargs) - op_out = partial(op, sample.input, *sample.args, **sample.kwargs) - - # Short-circuits if output is not a single tensor or an - # iterable of tensors - - if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors(expected, include_empty=True): - self.skipTest("Skipped! Only supports single tensor or iterable of tensor outputs.") - - # A wrapper around map that works with single tensors and always - # instantiates the map. Used below to apply transforms to - # single tensor and iterable tensor outputs. - def _apply_out_transform(fn, out): - if isinstance(out, torch.Tensor): - return fn(out) - - # assumes (see above) that out is an iterable of tensors - return tuple(map(fn, out)) - - # Extracts strides from a tensor or iterable of tensors into a tuple - def _extract_strides(out): - if isinstance(out, torch.Tensor): - return (out.stride(),) + for sample in samples: + # calls it normally to get the expected result + expected = op(sample.input, *sample.args, **sample.kwargs) + op_out = partial(op, sample.input, *sample.args, **sample.kwargs) + + # Short-circuits if output is not a single tensor or an + # iterable of tensors + if not isinstance(expected, torch.Tensor) and not is_iterable_of_tensors( + expected, include_empty=True + ): + self.skipTest( + "Skipped! Only supports single tensor or iterable of tensor outputs." + ) + + # Validates the op doesn't support out if it claims not to + if not op.supports_out: + with self.assertRaises(Exception): + assert op_out(out=expected) != NotImplemented + return + + # A wrapper around map that works with single tensors and always + # instantiates the map. Used below to apply transforms to + # single tensor and iterable tensor outputs. 
+ def _apply_out_transform(fn, out): + if isinstance(out, torch.Tensor): + return fn(out) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(fn, out)) + + # Extracts strides from a tensor or iterable of tensors into a tuple + def _extract_strides(out): + if isinstance(out, torch.Tensor): + return (out.stride(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.stride(), out)) + + # Extracts data pointers from a tensor or iterable of tensors into a tuple + # NOTE: only extracts on the CPU and CUDA device types since some + # device types don't have storage + def _extract_data_ptrs(out): + if self.device_type != "cpu" and self.device_type != "cuda": + return () + + if isinstance(out, torch.Tensor): + return (out.data_ptr(),) + + # assumes (see above) that out is an iterable of tensors + return tuple(map(lambda t: t.data_ptr(), out)) + + def _compare_out(transform, *, compare_strides_and_data_ptrs=True): + out = _apply_out_transform(transform, expected) + original_strides = _extract_strides(out) + original_ptrs = _extract_data_ptrs(out) - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.stride(), out)) + op_out(out=out) + final_strides = _extract_strides(out) + final_ptrs = _extract_data_ptrs(out) + self.assertEqual(expected, out) - # Extracts data pointers from a tensor or iterable of tensors into a tuple - # NOTE: only extracts on the CPU and CUDA device types since some - # device types don't have storage - def _extract_data_ptrs(out): - if self.device_type != 'cpu' and self.device_type != 'cuda': - return () + if compare_strides_and_data_ptrs: + stride_msg = "Strides are not the same! Original strides were {0} and strides are now {1}".format( + original_strides, final_strides + ) + self.assertEqual(original_strides, final_strides, msg=stride_msg) + self.assertEqual(original_ptrs, final_ptrs) + + # Case 0: out= with the correct shape, dtype, and device + # but NaN values for floating point and complex tensors, and + # maximum values for integer tensors. + # Expected behavior: out= values have no effect on the computation. + def _case_zero_transform(t): + try: + info = torch.iinfo(t.dtype) + return torch.full_like(t, info.max) + except TypeError as te: + # for non-integer types fills with NaN + return torch.full_like(t, float("nan")) - if isinstance(out, torch.Tensor): - return (out.data_ptr(),) - # assumes (see above) that out is an iterable of tensors - return tuple(map(lambda t: t.data_ptr(), out)) + _compare_out(_case_zero_transform) - def _compare_out(transform, *, compare_strides_and_data_ptrs=True): - out = _apply_out_transform(transform, expected) - original_strides = _extract_strides(out) - original_ptrs = _extract_data_ptrs(out) + # Case 1: out= with the correct shape, dtype, and device, + # but noncontiguous. + # Expected behavior: strides are respected and `out` storage is not changed. + def _case_one_transform(t): + return make_tensor( + t.shape, dtype=t.dtype, device=t.device, noncontiguous=True + ) - op_out(out=out) - final_strides = _extract_strides(out) - final_ptrs = _extract_data_ptrs(out) + _compare_out(_case_one_transform) - self.assertEqual(expected, out) + # Case 2: out= with the correct dtype and device, but has no elements. + # Expected behavior: resize without warning. 
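# [Editor's illustration, not part of the patch] The resize semantics exercised by Case 2
# here and by test_out_warning above can be reproduced with any binary op that supports
# out=: an out tensor with no elements is resized silently, while a nonempty out tensor
# of the wrong shape is resized with a UserWarning. A minimal standalone sketch:
import warnings
import torch

a, b = torch.randn(3), torch.randn(3)
torch.add(a, b, out=torch.empty(0))  # no elements: expected to resize without a warning
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    torch.add(a, b, out=torch.empty(2))  # nonempty wrong shape: expected to warn on resize
assert any("An output with one or more elements" in str(w.message) for w in caught)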
+ def _case_two_transform(t): + return make_tensor((0,), dtype=t.dtype, device=t.device) - if compare_strides_and_data_ptrs: - self.assertEqual(original_strides, final_strides) - self.assertEqual(original_ptrs, final_ptrs) + _compare_out(_case_two_transform, compare_strides_and_data_ptrs=False) - # Case 0: out= with the correct shape, dtype, and device - # but NaN values for floating point and complex tensors, and - # maximum values for integer tensors. - # Expected behavior: out= values have no effect on the computation. - def _case_zero_transform(t): - try: - info = torch.iinfo(t.dtype) - return torch.full_like(t, info.max) - except TypeError as te: - # for non-integer types fills with NaN - return torch.full_like(t, float('nan')) - - _compare_out(_case_zero_transform) - - # Case 1: out= with the correct shape, dtype, and device, - # but noncontiguous. - # Expected behavior: strides are respected and `out` storage is not changed. - def _case_one_transform(t): - return make_tensor(t.shape, - dtype=t.dtype, - device=t.device, - noncontiguous=True) - - _compare_out(_case_one_transform) - - # Case 2: out= with the correct dtype and device, but has no elements. - # Expected behavior: resize without warning. - def _case_two_transform(t): - return make_tensor((0,), - dtype=t.dtype, - device=t.device) - - _compare_out(_case_two_transform, compare_strides_and_data_ptrs=False) - - # Also validates that no warning is thrown when this out is resized - out = _apply_out_transform(_case_two_transform, expected) - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - op_out(out=out) - - # Verifies no warning is a resize warning - for w in caught: - if "An output with one or more elements" in str(w.message): - self.fail("Resizing an out= argument with no elements threw a resize warning!") - - # Case 3: out= with correct shape and dtype, but wrong device. - wrong_device = None - if torch.device(device).type != 'cpu': - wrong_device = 'cpu' - elif torch.cuda.is_available(): - wrong_device = 'cuda' - - if wrong_device is not None: - def _case_three_transform(t): - return make_tensor(t.shape, dtype=t.dtype, device=wrong_device) - - out = _apply_out_transform(_case_three_transform, expected) - msg_fail = f"Expected RuntimeError when calling with input.device={device} and out.device={wrong_device}" - with self.assertRaises(RuntimeError, msg=msg_fail): + # Also validates that no warning is thrown when this out is resized + out = _apply_out_transform(_case_two_transform, expected) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") op_out(out=out) - # Case 4: out= with correct shape and device, but a dtype - # that output cannot be "safely" cast to (long). - # Expected behavior: error. - # NOTE: this case is filtered by dtype since some ops produce - # bool tensors, for example, which can be safely cast to any - # dtype. It is applied when single tensors are floating point or complex - # dtypes, or if an op returns multiple tensors when at least one such - # tensor is a floating point or complex dtype. 
- _dtypes = floating_and_complex_types_and(torch.float16, torch.bfloat16) - if (isinstance(expected, torch.Tensor) and expected.dtype in _dtypes or - (not isinstance(expected, torch.Tensor) and any(t.dtype in _dtypes for t in expected))): - def _case_four_transform(t): - return make_tensor(t.shape, dtype=torch.long, device=t.device) - - out = _apply_out_transform(_case_four_transform, expected) - msg_fail = "" if not isinstance(expected, torch.Tensor) else \ - ("Expected RuntimeError when doing an unsafe cast from a result of dtype " - f"{expected.dtype} into an out= with dtype torch.long") - with self.assertRaises(RuntimeError, msg=msg_fail): - op_out(out=out) + # Verifies no warning is a resize warning + for w in caught: + if "An output with one or more elements" in str(w.message): + self.fail( + "Resizing an out= argument with no elements threw a resize warning!" + ) + + # Case 3: out= with correct shape and dtype, but wrong device. + wrong_device = None + if torch.device(device).type != "cpu": + wrong_device = "cpu" + elif torch.cuda.is_available(): + wrong_device = "cuda" + + if wrong_device is not None: + + def _case_three_transform(t): + return make_tensor(t.shape, dtype=t.dtype, device=wrong_device) + + out = _apply_out_transform(_case_three_transform, expected) + msg_fail = f"Expected RuntimeError when calling with input.device={device} and out.device={wrong_device}" + with self.assertRaises(RuntimeError, msg=msg_fail): + op_out(out=out) + + # Case 4: out= with correct shape and device, but a dtype + # that output cannot be "safely" cast to (long). + # Expected behavior: error. + # NOTE: this case is filtered by dtype since some ops produce + # bool tensors, for example, which can be safely cast to any + # dtype. It is applied when single tensors are floating point or complex + # dtypes, or if an op returns multiple tensors when at least one such + # tensor is a floating point or complex dtype. + _dtypes = floating_and_complex_types_and(torch.float16, torch.bfloat16) + if ( + isinstance(expected, torch.Tensor) + and expected.dtype in _dtypes + or ( + not isinstance(expected, torch.Tensor) + and any(t.dtype in _dtypes for t in expected) + ) + ): + + def _case_four_transform(t): + return make_tensor(t.shape, dtype=torch.long, device=t.device) + + out = _apply_out_transform(_case_four_transform, expected) + msg_fail = "Expected RuntimeError when doing an unsafe cast!" 
+ msg_fail = ( + msg_fail + if not isinstance(expected, torch.Tensor) + else ( + "Expected RuntimeError when doing an unsafe cast from a result of dtype " + f"{expected.dtype} into an out= with dtype torch.long" + ) + ) + with self.assertRaises(RuntimeError, msg=msg_fail): + op_out(out=out) # Tests that the forward and backward passes of operations produce the # same values for the cross-product of op variants (method, inplace) # against eager's gold standard op function variant @_variant_ops(op_db) def test_variant_consistency_eager(self, device, dtype, op): - # Acquires variants (method variant, inplace variant, aliases) + # Acquires variants (method variant, inplace variant, operator variant, inplace_operator variant, aliases) method = op.method_variant inplace = op.inplace_variant + operator = op.operator_variant + inplace_operator = op.inplace_operator_variant + # list of all inplace ops: inplace variant + alias inplace variants if exist - inplace_ops = [inplace, ] - variants = [method, inplace] + inplace_ops = [inplace, inplace_operator] + variants = [method, inplace, operator, inplace_operator] + operators = [operator, inplace_operator] for a_op in op.aliases: variants.append(a_op.op) @@ -582,32 +844,48 @@ def test_variant_consistency_eager(self, device, dtype, op): inplace_variants = tuple(filter(None, inplace_ops)) variants = tuple(filter(None, variants)) + operators = tuple(filter(None, operators)) - _requires_grad = (op.supports_autograd and - (dtype.is_floating_point or op.supports_complex_autograd(torch.device(device).type))) + _requires_grad = dtype in op.supported_backward_dtypes( + torch.device(device).type + ) include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex - samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad, include_conjugated_inputs=include_conjugated_inputs) + samples = op.sample_inputs( + device, + dtype, + requires_grad=_requires_grad, + include_conjugated_inputs=include_conjugated_inputs, + ) samples = list(samples) def _test_consistency_helper(samples, variants): for sample in samples: # TODO: Check grad for all Tensors requiring grad if sample.input is TensorList - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) # Computes function forward and backward values tensor.grad = None expected_forward = op(sample.input, *sample.args, **sample.kwargs) expected_grad = None - output_process_fn_grad = sample.output_process_fn_grad if sample.output_process_fn_grad \ + output_process_fn_grad = ( + sample.output_process_fn_grad + if sample.output_process_fn_grad else lambda x: x + ) # Skips inplace variants if the output dtype is not the same as # the input dtype skip_inplace = False - if (isinstance(expected_forward, torch.Tensor) and - expected_forward.dtype is not tensor.dtype): + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.dtype is not tensor.dtype + ): skip_inplace = True # TODO: backward consistency only supported for single tensor outputs @@ -615,8 +893,9 @@ def _test_consistency_helper(samples, variants): # tensor inputs # TODO: update to handle checking grads of all tensor inputs as # derived from each tensor output - if (op.supports_autograd and isinstance(expected_forward, torch.Tensor) - and (dtype.is_floating_point or op.supports_complex_autograd(torch.device(device).type))): + if isinstance( + expected_forward, torch.Tensor + ) and dtype in 
op.supported_backward_dtypes(torch.device(device).type): output_process_fn_grad(expected_forward).sum().backward() expected_grad = tensor.grad @@ -629,26 +908,39 @@ def _test_consistency_helper(samples, variants): # Compares variant's forward # Note: copies the to-be-modified input when testing the inplace variant tensor.grad = None - cloned = clone_input_helper(sample.input) if variant in inplace_ops else sample.input + cloned = ( + clone_input_helper(sample.input) + if variant in inplace_ops + else sample.input + ) if variant in inplace_ops and sample.broadcasts_input: - with self.assertRaises(RuntimeError, - msg=('inplace variant either incorrectly allowed ' - 'resizing or you have marked the sample {}' - ' incorrectly with `broadcasts_self=True'.format(sample.summary()))): - variant_forward = variant(cloned, - *sample.args, - **sample.kwargs) + with self.assertRaises( + RuntimeError, + msg=( + "inplace variant either incorrectly allowed " + "resizing or you have marked the sample {}" + " incorrectly with `broadcasts_self=True".format( + sample.summary() + ) + ), + ): + variant_forward = variant( + cloned, *sample.args, **sample.kwargs + ) + continue + + if variant in operators and sample.kwargs: + # skip samples with kwargs for operator variants continue - variant_forward = variant(cloned, - *sample.args, - **sample.kwargs) + variant_forward = variant(cloned, *sample.args, **sample.kwargs) self.assertEqual(expected_forward, variant_forward) # Compares variant's backward - if expected_grad is not None and \ - (variant not in inplace_ops or op.supports_inplace_autograd): + if expected_grad is not None and ( + variant not in inplace_ops or op.supports_inplace_autograd + ): output_process_fn_grad(variant_forward).sum().backward() self.assertEqual(expected_grad, tensor.grad) @@ -659,531 +951,119 @@ def _test_inplace_preserve_storage(samples, variants): # Skips inplace variants if the output dtype is not the same as # the input dtype expected_forward = op(sample.input, *sample.args, **sample.kwargs) - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) skip_inplace = False - if (isinstance(expected_forward, torch.Tensor) and - expected_forward.dtype is not tensor.dtype): + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.dtype is not tensor.dtype + ): skip_inplace = True if skip_inplace: return for variant in variants: - cloned = clone_input_helper(sample.input) if variant in inplace_ops else sample.input - inp_tensor = cloned if isinstance(cloned, torch.Tensor) else cloned[0] + cloned = ( + clone_input_helper(sample.input) + if variant in inplace_ops + else sample.input + ) + inp_tensor = ( + cloned if isinstance(cloned, torch.Tensor) else cloned[0] + ) data_ptr = inp_tensor.data_ptr() - variant_forward = variant(cloned, - *sample.args, - **sample.kwargs) + if variant in operators and sample.kwargs: + # skip samples with kwargs for operator variants + continue + + variant_forward = variant(cloned, *sample.args, **sample.kwargs) # TODO Support non-tensor outputs if they exist for inplace ops - if (isinstance(variant_forward, torch.Tensor)): - self.assertEqual(data_ptr, variant_forward.data_ptr(), atol=0, rtol=0) + if isinstance(variant_forward, torch.Tensor): + self.assertEqual( + data_ptr, variant_forward.data_ptr(), atol=0, rtol=0 + ) else: - self.assertTrue(False, "Non-tensor outputs for inplace ops are not supported") + 
self.assertTrue( + False, + "Non-tensor outputs for inplace ops are not supported", + ) if len(inplace_ops) > 0: - inplace_samples = list(filter(lambda sample: not sample.broadcasts_input, samples)) + inplace_samples = list( + filter(lambda sample: not sample.broadcasts_input, samples) + ) _test_inplace_preserve_storage(inplace_samples, inplace_variants) + # Reference testing for operations in complex32 against complex64. + # NOTE: We test against complex64 as NumPy doesn't have a complex32 equivalent dtype. + @ops(op_db, allowed_dtypes=(torch.complex32,)) + def test_complex_half_reference_testing(self, device, dtype, op): + if not op.supports_dtype(torch.complex32, device): + unittest.skip("Does not support complex32") + + for sample in op.sample_inputs(device, dtype): + actual = op(sample.input, *sample.args, **sample.kwargs) + # sample.transform applies the lambda to torch.Tensor and torch.dtype. + # However, we only want to apply it to Tensors with dtype `torch.complex32`.. + transformed_sample = sample.transform(lambda x: x.to(torch.complex64) if isinstance( + x, torch.Tensor) and x.dtype is torch.complex32 else x) + expected = op( + transformed_sample.input, + *transformed_sample.args, + **transformed_sample.kwargs, + ) + self.assertEqual(actual, expected, exact_dtype=False) + + +class TestCompositeCompliance(TestCase): # Checks if the operator (if it is composite) is written to support most # backends and Tensor subclasses. See "CompositeImplicitAutograd Compliance" # in aten/src/ATen/native/README.md for more details - # - # NB: onlyCPU because CompositeImplicitAutograd ops go through the same - # codepath on all devices. Ideally we'd use a meta device here but coverage - # for that is not good yet. - @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, '__torch_dispatch__ does not work in fbcode') - @onlyCPU + @unittest.skipIf( + IS_FBCODE or IS_SANDCASTLE, "__torch_dispatch__ does not work in fbcode" + ) @ops(op_db, allowed_dtypes=(torch.float,)) - def test_composite_compliance(self, device, dtype, op): + def test_operator(self, device, dtype, op): samples = op.sample_inputs(device, dtype, requires_grad=False) for sample in samples: args = [sample.input] + list(sample.args) kwargs = sample.kwargs - _check_composite_compliance(op, args, kwargs) - - @onlyCPU - @ops(op_db, allowed_dtypes=(torch.float,)) - def test_floating_inputs_are_differentiable(self, device, dtype, op): - # Nothing to check if the operation it's not differentiable - if not op.supports_autograd: - return - - floating_dtypes = list(floating_and_complex_types_and(torch.bfloat16, torch.float16)) - - def check_tensor_floating_is_differentiable(t): - if isinstance(t, torch.Tensor) and t.dtype in floating_dtypes: - msg = (f"Found a sampled tensor of floating-point dtype {t.dtype} sampled with " - "requires_grad=False. If this is intended, please skip/xfail this test. 
" - "Remember that sampling operations are executed under a torch.no_grad contextmanager.") - self.assertTrue(t.requires_grad, msg) - + composite_compliance.check_with_mode(op, args, kwargs) + composite_compliance.check_all_permutations(op, args, kwargs) + + @unittest.skipIf( + IS_FBCODE or IS_SANDCASTLE, "__torch_dispatch__ does not work in fbcode" + ) + @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) + def test_backward(self, device, dtype, op): samples = op.sample_inputs(device, dtype, requires_grad=True) - for sample in samples: - check_tensor_floating_is_differentiable(sample.input) - for arg in sample.args: - check_tensor_floating_is_differentiable(arg) - for arg in sample.kwargs.values(): - check_tensor_floating_is_differentiable(arg) - - -# gradcheck requires double precision -_gradcheck_ops = partial(ops, dtypes=OpDTypes.supported, - allowed_dtypes=[torch.double, torch.cdouble]) - - -class TestGradients(TestCase): - exact_dtype = True - - # Copies inputs to inplace operations to avoid inplace modifications - # to leaves requiring gradient - def _get_safe_inplace(self, inplace_variant): - @wraps(inplace_variant) - def _fn(t, *args, **kwargs): - return inplace_variant(t.clone(), *args, **kwargs) - - return _fn - - def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True, - check_batched_grad=None, check_batched_forward_grad=False): - assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad') - # NB: check_backward_ad does not affect gradgradcheck (always True) - if variant is None: - self.skipTest("Skipped! Variant not implemented.") - if not op.supports_dtype(dtype, torch.device(device).type): - self.skipTest(f"Skipped! {op.name} does not support dtype {str(dtype)}") - - def is_inplace(variant): - if hasattr(variant, "__wrapped__"): - return variant.__wrapped__ is op.get_inplace() - return variant is op.get_inplace() - - include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex - samples = op.sample_inputs(device, dtype, requires_grad=True, include_conjugated_inputs=include_conjugated_inputs) for sample in samples: - if sample.broadcasts_input and is_inplace(variant): - continue - - # Note on TensorList inputs - # - # gradcheck does not support TensorList inputs so here we pass TensorList - # inputs of size n as n single Tensor inputs to gradcheck and wrap the op - # in a function that puts the n Tensor inputs back into a TensorList - def fn(*inputs): - # Put tensors back into TensorList since we splat them when passing to gradcheck - if is_iterable_of_tensors(sample.input): - n = len(sample.input) - inputs = (inputs[:n], *inputs[n:]) - output = op.gradcheck_wrapper(variant, *inputs, **sample.kwargs) - if sample.output_process_fn_grad is not None: - return sample.output_process_fn_grad(output) - return output - - # Splat TensorList inputs into single Tensor inputs - gradcheck_args = (sample.input,) if isinstance(sample.input, torch.Tensor) else tuple(sample.input) - gradcheck_args += sample.args - - if check == 'gradcheck': - if check_batched_grad is None: - check_batched_grad = op.check_batched_grad - self.assertTrue(gradcheck(fn, gradcheck_args, - check_batched_grad=check_batched_grad, - check_grad_dtypes=True, - nondet_tol=op.gradcheck_nondet_tol, - fast_mode=op.gradcheck_fast_mode, - check_forward_ad=check_forward_ad, - check_backward_ad=check_backward_ad, - check_undefined_grad=True, - check_batched_forward_grad=check_batched_forward_grad)) - elif check in 
('bwgrad_bwgrad', 'fwgrad_bwgrad'): # gradgrad check - self.assertFalse(check_forward_ad, msg="Cannot run forward AD check for gradgradcheck") - for gen_non_contig_grad_outputs in (False, True): - kwargs = { - "gen_non_contig_grad_outputs": gen_non_contig_grad_outputs, - "check_batched_grad": op.check_batched_gradgrad, - "check_grad_dtypes": True, - "nondet_tol": op.gradcheck_nondet_tol, - "fast_mode": op.gradcheck_fast_mode - } - if check == "fwgrad_bwgrad": - kwargs["check_fwd_over_rev"] = True - kwargs["check_rev_over_rev"] = False - kwargs["check_batched_grad"] = False - kwargs["check_undefined_grad"] = False - - self.assertTrue(gradgradcheck(fn, gradcheck_args, **kwargs)) - else: - self.assertTrue(False, msg="Unknown check requested!") - - def _grad_test_helper(self, device, dtype, op, variant, *, check_forward_ad=False, check_backward_ad=True, - check_batched_grad=None, check_batched_forward_grad=False): - return self._check_helper(device, dtype, op, variant, 'gradcheck', check_forward_ad=check_forward_ad, - check_backward_ad=check_backward_ad, check_batched_grad=check_batched_grad, - check_batched_forward_grad=check_batched_forward_grad) - - def _skip_helper(self, op, device, dtype): - if not op.supports_autograd and not op.supports_forward_ad: - self.skipTest("Skipped! autograd not supported.") - if not op.supports_complex_autograd(torch.device(device).type) and dtype.is_complex: - self.skipTest("Skipped! Complex autograd not supported.") - - # Tests that gradients are computed correctly - @_gradcheck_ops(op_db) - def test_fn_grad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - self._grad_test_helper(device, dtype, op, op.get_op()) - - # Method grad (and gradgrad, see below) tests are disabled since they're - # costly and redundant with function grad (and gradgad) tests - # @_gradcheck_ops(op_db) - # def test_method_grad(self, device, dtype, op): - # self._skip_helper(op, device, dtype) - # self._grad_test_helper(device, dtype, op, op.get_method()) - - @_gradcheck_ops(op_db) - def test_inplace_grad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if not op.inplace_variant or not op.supports_inplace_autograd: - self.skipTest("Skipped! Operation does not support inplace autograd.") - self._grad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) - - # Test that gradients of gradients are computed correctly - @_gradcheck_ops(op_db) - def test_fn_gradgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if not op.supports_gradgrad: - self.skipTest("Skipped! Operation does not support gradgrad") - self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') - - # Test that forward-over-reverse gradgrad is computed correctly - @_gradcheck_ops(op_db) - def test_fn_fwgrad_bwgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - - if op.supports_fwgrad_bwgrad: - self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") - else: - err_msg = r"Trying to use forward AD with .* that does not support it\." - hint_msg = ("Running forward-over-backward gradgrad for an OP that has does not support it did not " - "raise any error. 
If your op supports forward AD, you should set supports_fwgrad_bwgrad=True.") - with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): - self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") - - # Test that gradients of gradients are properly raising - @_gradcheck_ops(op_db) - def test_fn_fail_gradgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if op.supports_gradgrad: - self.skipTest("Skipped! Operation does support gradgrad") - - err_msg = r"derivative for .* is not implemented" - with self.assertRaisesRegex(RuntimeError, err_msg): - self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') - - # Method gradgrad (and grad, see above) tests are disabled since they're - # costly and redundant with function gradgrad (and grad) tests - # @_gradcheck_ops(op_db) - # def test_method_gradgrad(self, device, dtype, op): - # self._skip_helper(op, device, dtype) - # self._gradgrad_test_helper(device, dtype, op, op.get_method()) - - @_gradcheck_ops(op_db) - def test_inplace_gradgrad(self, device, dtype, op): - self._skip_helper(op, device, dtype) - if not op.inplace_variant or not op.supports_inplace_autograd: - self.skipTest("Skipped! Operation does not support inplace autograd.") - self._check_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), "bwgrad_bwgrad") - - def _forward_grad_helper(self, device, dtype, op, variant, is_inplace): - # TODO: clean up how attributes are passed to gradcheck from OpInfos - def call_grad_test_helper(): - check_batched_forward_grad = ((op.check_batched_forward_grad and not is_inplace) or - (op.check_inplace_batched_forward_grad and is_inplace)) - self._grad_test_helper(device, dtype, op, variant, check_forward_ad=True, check_backward_ad=False, - check_batched_grad=False, check_batched_forward_grad=check_batched_forward_grad) - if op.supports_forward_ad: - call_grad_test_helper() - else: - err_msg = r"Trying to use forward AD with .* that does not support it\." - hint_msg = ("Running forward AD for an OP that has does not support it did not " - "raise any error. If your op supports forward AD, you should set supports_forward_ad=True") - with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): - call_grad_test_helper() - - @_gradcheck_ops(op_db) - def test_forward_mode_AD(self, device, dtype, op): - self._skip_helper(op, device, dtype) - - self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False) - - @_gradcheck_ops(op_db) - def test_inplace_forward_mode_AD(self, device, dtype, op): - self._skip_helper(op, device, dtype) - - if not op.inplace_variant or not op.supports_inplace_autograd: - self.skipTest("Skipped! 
Operation does not support inplace autograd.") - - self._forward_grad_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), is_inplace=True) - - # Functions that do not support autograd should not fail in forward mode - # Inplace functions (such as "resize_") are expected to fail in forward mode and should be skipped - # Test only when supports_autograd=False and for double dtype - @ops(filter(lambda op: not op.supports_autograd, op_db), dtypes=OpDTypes.supported, allowed_dtypes=(torch.double,)) - def test_nondifferentiable(self, device, dtype, op): - # Expecting no errors - samples = op.sample_inputs(device, dtype, requires_grad=True) - sample = first_sample(self, samples) - result = op(sample.input, *sample.args, **sample.kwargs) - - -# Tests operators for consistency between JIT and eager, also checks -# correctness of JIT specific alias schemas and intended -# autodifferentiation behavior. -# Inherits from JitCommonTestCase instead of TestCase directly to share -# functionality with original test_jit.py method operator tests -class TestJit(JitCommonTestCase): - exact_dtype = True - - # Tests that the forward and backward passes of operations produce the - # same values for the cross-product of op variants (function, method, inplace) - # and runtimes (eager, traced, scripted). - # TODO WARNING: inplace x {traced, scripted} not currently tested - @_variant_ops(op_db) - def test_variant_consistency_jit(self, device, dtype, op): - _requires_grad = op.supports_autograd and (dtype.is_floating_point or - op.supports_complex_autograd(torch.device(device).type)) - - include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex - samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad, include_conjugated_inputs=include_conjugated_inputs) - - # Acquires variants to test - func = op.get_op() - method = op.get_method() - variants = { - # TODO: inplace tests currently fail, fix and add inplace variant - 'function': func, 'method': method, - } - - # TODO: find better way to standardize on op registration itself.. - has_fake_function = op.name in ["resize_", 'resize_as_'] - - if has_fake_function: - variants = {'method': getattr(torch.Tensor, op.name)} - samples = op.sample_inputs(device, dtype, requires_grad=False) - - support_script = op.supports_scripting + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + composite_compliance.check_backward_formula(op, args, kwargs) - tested = False - for sample in samples: - # Test traced and scripted consistency - for func_type, variant in variants.items(): - if variant is None: - continue + @unittest.skipIf( + IS_FBCODE or IS_SANDCASTLE, "__torch_dispatch__ does not work in fbcode" + ) + @ops(op_db, allowed_dtypes=(torch.float,)) + def test_forward_ad(self, device, dtype, op): + if torch.float not in op.supported_backward_dtypes(device): + raise unittest.SkipTest("Does not support autograd") - # scripting and check_alias_analysis do not work with lambdas - # lambdas are typically used as a way to simulate methods without - # functional variants, so rely on the other variant for testing - # for now - if is_lambda(variant): - continue + if not op.supports_forward_ad: + raise unittest.SkipTest("Does not support forward_ad") - tested = True - - # Create accessor for script function variant - name = op.name + '_' if func_type == 'inplace' else op.name - - # run with disable_autodiff_subgraph_inlining(True) to test - # autodiff support. 
Context manager forces the graph to contain - # DifferentiableGraph nodes if they are present - with disable_autodiff_subgraph_inlining(): - # Check scripted forward, grad, and grad grad - if support_script: - script_fn = create_script_fn(self, name, func_type) - - def out_fn(output): - # Processes the output for autograd - if sample.output_process_fn_grad is not None: - return sample.output_process_fn_grad(output) - return output - - def get_sample(): - return clone_input_helper(sample.input) if op.name[-1] == '_' else sample.input - - if support_script: - check_against_reference(self, - script_fn, - func, - out_fn, - (get_sample(),) + sample.args, - sample.kwargs, - no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) - - # Check traced forward, grad, and grad grad - # TODO: fix tracing here - supports_tracing = not has_fake_function - if op.assert_jit_shape_analysis: - self.assertTrue(supports_tracing) - - if supports_tracing: - traced_fn = create_traced_fn(self, variant) - check_against_reference(self, - traced_fn, - func, - out_fn, - (get_sample(),) + sample.args, - sample.kwargs, - no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) - - # Check alias annotation schema for correctness (make - # sure inputs that aren't supposed to be modified aren't) - # Note: only runs in float32 because schema isn't affected by dtype, - # so running it on all dtypes is would be excessive - if dtype == torch.float32: - # TODO: no reason why we cant run this with tracing graph - if support_script and op.name != "rsub": - check_alias_annotation(name, (get_sample(),) + sample.args, sample.kwargs, - func_type=func_type, aten_name=op.aten_name) - - # TODO: use script graph as well - checked_shape_analysis = False - if supports_tracing: - out = variant(get_sample(), *sample.args, **sample.kwargs) - - # right now, tuple of outputs and tensor output supported - # TODO: list of tensor outputs - tuple_of_tensors = isinstance(out, tuple) and all([isinstance(elem, torch.Tensor) for elem in out]) - - if isinstance(out, torch.Tensor) or tuple_of_tensors: - if tuple_of_tensors: - sizes = [elem.size() for elem in out] - else: - sizes = out.size() - self.checkShapeAnalysis(sizes, traced_fn.graph, op.assert_jit_shape_analysis) - checked_shape_analysis = True - if op.assert_jit_shape_analysis: - self.assertTrue(checked_shape_analysis) - - # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample - if dtype is torch.float32: - # Sandcastle doesn't fuse nodes - if IS_SANDCASTLE: - # fusible nodes are expected to be found in FusionGroups in the DifferentiableGraphs - nonfusible_nodes = op.autodiff_nonfusible_nodes + op.autodiff_fusible_nodes - fusible_nodes = [] - else: - nonfusible_nodes = op.autodiff_nonfusible_nodes - fusible_nodes = op.autodiff_fusible_nodes - - if supports_tracing: - self.assertAutodiffNode(traced_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) - if support_script: - self.assertAutodiffNode(script_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) - assert tested, "JIT Test does not execute any logic" - - # alias testing is only done with torch.float for the same reason - _alias_ops = partial(ops, dtypes=OpDTypes.supported, - allowed_dtypes=(torch.float,)) - - @_alias_ops((op for op in op_db if op.aliases)) - def test_jit_alias_remapping(self, device, dtype, op): - # Required to avoid undefined value: tensor error in JIT compilation of the function template - tensor = torch.tensor - - # 
NOTE: only tests on first sample samples = op.sample_inputs(device, dtype, requires_grad=True) - sample = first_sample(self, samples) - - # [Scripting Data Preparation] - # Prepare data for test scripting - # Below we prepare strings of args/kwargs with and without type annotations. - # These strings are inserted into function template strings which is then torch scripted. - # - args string is ["t0"] corresponding to the "input" tensor required by the op - # - args_kw is the value of args and strings of kwargs used to call the op (without type annotations), for example, - # ["to", "1.0", "(1,)", "True", "tensor(1.0)"] -> def fn(t0): return variant(t0, 1.0, (1,), True, tensor(1.0)) - args = ["t0"] - - def quote_strs(v): - if isinstance(v, str): - return f"'{v}'" - - return str(v) - - args_kw = args + \ - [f"{v}" for v in sample.args] + \ - [f"{k}={quote_strs(v)}" for k, v in sample.kwargs.items()] - - # Prepare data for test tracing - sample_args_kwargs = () - if len(sample.args) > 0: - sample_args_kwargs += (sample.args, ) - if len(sample.kwargs) > 0: - sample_args_kwargs += (sample.kwargs, ) - - original_name = op.aten_name - original_name_inplace = original_name + "_" - expected_dtype = op(sample.input, *sample.args, **sample.kwargs).dtype - for a_op in op.aliases: - inplace = a_op.inplace_variant - method_or_inplace = [a_op.inplace_variant, a_op.method_variant] - variants = (v for v in (a_op.op, a_op.method_variant, a_op.inplace_variant) if v is not None) - - # Test scripting: - for variant in variants: - variant_name = variant.__name__ - op_name = original_name_inplace if variant is inplace else original_name - - if variant in method_or_inplace: - fn_template = ''' - def _fn(t0{c}): - return t0.{alias_name}({args_kw}) - ''' - # remove the first input tensor - script = fn_template.format( - c=", " if len(args_kw[1:]) > 1 else "", - args_kw=", ".join(args_kw[1:]), - alias_name=variant_name, - ) - else: - fn_template = ''' - def _fn({args}): - return variant({args_kw}) - ''' - script = fn_template.format( - args=", ".join(args), - args_kw=", ".join(args_kw), - ) - scripted = torch.jit.CompilationUnit(script)._fn + for sample in samples: + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + composite_compliance.check_forward_ad_formula(op, args, kwargs) - if (variant is inplace and not torch.can_cast(expected_dtype, dtype)): - try: - inp = clone_input_helper(sample.input) - scripted(inp) - except Exception as e: - continue - self.fail("Inplace operation on integer tensor that should be promoted to float didn't fail!") - - inp = clone_input_helper(sample.input) - scripted(inp) - inp = clone_input_helper(sample.input) - graph = scripted.graph_for(inp) - FileCheck().check(op.aten_name).check_not(variant_name).run(graph) - - # Test tracing: - for variant in variants: - variant_name = variant.__name__ - op_name = original_name_inplace if variant is inplace else original_name - - def _fn(*sample_args, **sample_kwargs): - return variant(*sample_args, **sample_kwargs) - - inp = (clone_input_helper(sample.input),) + sample_args_kwargs - traced = torch.jit.trace(_fn, *inp) - inp = (clone_input_helper(sample.input),) + sample_args_kwargs - traced(*inp) - inp = (clone_input_helper(sample.input),) + sample_args_kwargs - graph = traced.graph_for(*inp) - FileCheck().check(op_name).check_not(variant_name).run(graph) class TestMathBits(TestCase): # Tests that @@ -1196,7 +1076,17 @@ class TestMathBits(TestCase): # This test only runs for C -> R and C -> C functions # TODO: add tests for 
`R->C` functions # Note: This test runs for functions that take both tensors and tensorlists as input. - def _test_math_view(self, device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, out_type): + def _test_math_view( + self, + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + out_type, + ): inplace_variant = op.inplace_variant # helper function to clone and conjugate/negate the input if its a tensor @@ -1205,7 +1095,7 @@ def _test_math_view(self, device, dtype, op, samples, math_op_physical, math_op_ # have its requires_grad set to that value. def clone_and_perform_view(input, **kwargs): if isinstance(input, torch.Tensor): - requires_grad = kwargs.get('requires_grad', input.requires_grad) + requires_grad = kwargs.get("requires_grad", input.requires_grad) with torch.no_grad(): # Ensure view represents the original sample input input = math_op_physical(input) @@ -1222,7 +1112,11 @@ def clone_and_perform_view(input, **kwargs): return tuple(out) for sample in samples: - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) cloned1 = clone_and_perform_view(sample.input) # Computes function forward value with a physically conjugated/negated tensor and @@ -1236,9 +1130,13 @@ def clone_and_perform_view(input, **kwargs): # input produces correct output, and the output tensor has the conj/neg bit set to True if inplace_variant is not None and not sample.broadcasts_input: cloned2 = clone_and_perform_view(tensor, requires_grad=False) - if (isinstance(expected_forward, torch.Tensor) and - expected_forward.dtype is tensor.dtype): - inplace_forward = inplace_variant(cloned2, *sample.args, **sample.kwargs) + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.dtype is tensor.dtype + ): + inplace_forward = inplace_variant( + cloned2, *sample.args, **sample.kwargs + ) self.assertTrue(is_bit_set(inplace_forward)) self.assertEqual(inplace_forward, expected_forward) @@ -1247,40 +1145,62 @@ def clone_and_perform_view(input, **kwargs): # tensor inputs # TODO: update to handle checking grads of all tensor inputs as # derived from each tensor output - if isinstance(expected_forward, torch.Tensor) and expected_forward.requires_grad: + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.requires_grad + ): output_process_fn_grad = sample.output_process_fn_grad or (lambda x: x) expected_forward = output_process_fn_grad(expected_forward) forward_with_mathview = output_process_fn_grad(forward_with_mathview) - tensor = sample.input if isinstance(sample.input, torch.Tensor) else sample.input[0] + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) expected_forward.sum().backward(retain_graph=True) forward_with_mathview.sum().backward(retain_graph=True) if tensor.grad is not None: - cloned1_tensor = cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] + cloned1_tensor = ( + cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] + ) self.assertEqual(tensor.grad, cloned1_tensor.grad) tensor.grad, cloned1_tensor.grad = None, None # a repeat of the above test if output is not complex valued - if (out_type(expected_forward)): + if out_type(expected_forward): grad = torch.randn_like(expected_forward) expected_forward.backward(grad) - forward_with_mathview.backward(math_op_view(math_op_physical(grad))) + forward_with_mathview.backward( + 
math_op_view(math_op_physical(grad)) + ) self.assertEqual(tensor.grad, cloned1_tensor.grad) - @ops(op_db, allowed_dtypes=(torch.cfloat,)) + @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,)) def test_conj_view(self, device, dtype, op): if not op.test_conjugated_samples: self.skipTest("Operation doesn't support conjugated inputs.") math_op_physical = torch.conj_physical math_op_view = torch.conj - _requires_grad = (op.supports_autograd and op.supports_complex_autograd(torch.device(device).type)) + _requires_grad = torch.cfloat in op.supported_backward_dtypes( + torch.device(device).type + ) is_bit_set = torch.is_conj samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) - self._test_math_view(device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, torch.is_complex) - - @ops(op_db, allowed_dtypes=(torch.double,)) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + torch.is_complex, + ) + + @ops(ops_and_refs, allowed_dtypes=(torch.double,)) def test_neg_view(self, device, dtype, op): if not op.test_neg_view: self.skipTest("Operation not tested with tensors with negative bit.") @@ -1288,10 +1208,18 @@ def test_neg_view(self, device, dtype, op): math_op_view = torch._neg_view is_bit_set = torch.is_neg samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd) - self._test_math_view(device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, - lambda x: True) - - @ops(op_db, allowed_dtypes=(torch.cdouble,)) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + lambda x: True, + ) + + @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,)) def test_neg_conj_view(self, device, dtype, op): if not op.test_neg_view: self.skipTest("Operation not tested with tensors with negative bit.") @@ -1307,18 +1235,27 @@ def math_op_view(x): def is_bit_set(x): return torch.is_neg(x) and torch.is_conj(x) - _requires_grad = (op.supports_autograd and op.supports_complex_autograd(torch.device(device).type)) + _requires_grad = dtype in op.supported_backward_dtypes( + torch.device(device).type + ) samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) # Only test one sample samples = itertools.islice(samples, 1) - self._test_math_view(device, dtype, op, samples, math_op_physical, math_op_view, is_bit_set, - torch.is_complex) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + torch.is_complex, + ) instantiate_device_type_tests(TestCommon, globals()) -instantiate_device_type_tests(TestGradients, globals()) -instantiate_device_type_tests(TestJit, globals()) +instantiate_device_type_tests(TestCompositeCompliance, globals()) instantiate_device_type_tests(TestMathBits, globals()) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_ops_gradients.py b/test/test_ops_gradients.py new file mode 100644 index 000000000000..64cd71fdee6d --- /dev/null +++ b/test/test_ops_gradients.py @@ -0,0 +1,264 @@ +# Owner(s): ["module: unknown"] + +from functools import partial, wraps +from itertools import chain +import torch + +from torch.testing._internal.common_utils import \ + (TestCase, is_iterable_of_tensors, run_tests, gradcheck, gradgradcheck) +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_device_type import \ + (instantiate_device_type_tests, ops, OpDTypes) + +# TODO: fixme 
https://github.com/pytorch/pytorch/issues/68972 +torch.set_default_dtype(torch.float32) + +# gradcheck requires double precision +_gradcheck_ops = partial(ops, dtypes=OpDTypes.supported, + allowed_dtypes=[torch.double, torch.cdouble]) + +class TestGradients(TestCase): + exact_dtype = True + + # Copies inputs to inplace operations to avoid inplace modifications + # to leaves requiring gradient + def _get_safe_inplace(self, inplace_variant): + @wraps(inplace_variant) + def _fn(t, *args, **kwargs): + return inplace_variant(t.clone(), *args, **kwargs) + + return _fn + + def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True, + check_batched_grad=None, check_batched_forward_grad=False): + assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad') + # NB: check_backward_ad does not affect gradgradcheck (always True) + if variant is None: + self.skipTest("Skipped! Variant not implemented.") + if not op.supports_dtype(dtype, torch.device(device).type): + self.skipTest(f"Skipped! {op.name} does not support dtype {str(dtype)}") + + def is_inplace(variant): + if hasattr(variant, "__wrapped__"): + return variant.__wrapped__ is op.get_inplace() + return variant is op.get_inplace() + + include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex + samples = op.sample_inputs(device, dtype, requires_grad=True, include_conjugated_inputs=include_conjugated_inputs) + + for sample in samples: + if sample.broadcasts_input and is_inplace(variant): + continue + + # Gradcheck expects tensors as its input, but autograd actually supports tensorlists + # and tensors passed as kwargs. The following creates a function that accepts just + # the tensors that require grad as varargs, and then recomposes them back into the + # original input. 
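# Illustrative sketch (hypothetical toy function, not one of the op_db samples): a minimal
# standalone gradcheck call, showing why the helpers above flatten each sample into a tuple
# of double-precision tensors before handing it to gradcheck, which compares analytical
# gradients against numerically estimated ones.
import torch
from torch.autograd import gradcheck

def _toy_fn(x, y):
    # Any differentiable composition of ops works here.
    return (x * y).sin().sum()

_x = torch.randn(3, dtype=torch.double, requires_grad=True)
_y = torch.randn(3, dtype=torch.double, requires_grad=True)
assert gradcheck(_toy_fn, (_x, _y))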
+ + # Creates gradcheck inputs by identifying tensors requiring grad + all_args = None + if is_iterable_of_tensors(sample.input): + all_args = chain(sample.input, sample.args, sample.kwargs.values()) + else: + all_args = tuple(chain((sample.input,), sample.args, sample.kwargs.values())) + gradcheck_args = tuple(x for x in all_args if (isinstance(x, torch.Tensor) and x.requires_grad)) + + def _input_recomposition_helper(inputs, inp, input_idx): + if is_iterable_of_tensors(inp): + tensor_list = [] + for x in inp: + if isinstance(x, torch.Tensor) and x.requires_grad: + tensor_list.append(inputs[input_idx]) + input_idx = input_idx + 1 + else: + tensor_list.append(x) + return tensor_list, input_idx + elif isinstance(inp, torch.Tensor) and inp.requires_grad: + return inputs[input_idx], input_idx + 1 + else: + return inp, input_idx + + def fn(*inputs): + # Puts inputs back into sample properly + positional_args = [] + input_idx = 0 + inp, input_idx = _input_recomposition_helper(inputs, sample.input, input_idx) + positional_args.append(inp) + + for x in sample.args: + inp, input_idx = _input_recomposition_helper(inputs, x, input_idx) + positional_args.append(inp) + + # Recreates kwargs + kwargs = {} + for k, v in sample.kwargs.items(): + inp, input_idx = _input_recomposition_helper(inputs, v, input_idx) + kwargs[k] = inp + + output = op.gradcheck_wrapper(variant, *positional_args, **kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + if check == 'gradcheck': + if check_batched_grad is None: + check_batched_grad = op.check_batched_grad + self.assertTrue(gradcheck(fn, gradcheck_args, + check_batched_grad=check_batched_grad, + check_grad_dtypes=True, + nondet_tol=op.gradcheck_nondet_tol, + fast_mode=op.gradcheck_fast_mode, + check_forward_ad=check_forward_ad, + check_backward_ad=check_backward_ad, + check_undefined_grad=True, + check_batched_forward_grad=check_batched_forward_grad)) + elif check in ('bwgrad_bwgrad', 'fwgrad_bwgrad'): # gradgrad check + self.assertFalse(check_forward_ad, msg="Cannot run forward AD check for gradgradcheck") + for gen_non_contig_grad_outputs in (False, True): + kwargs = { + "gen_non_contig_grad_outputs": gen_non_contig_grad_outputs, + "check_batched_grad": op.check_batched_gradgrad, + "check_grad_dtypes": True, + "nondet_tol": op.gradcheck_nondet_tol, + "fast_mode": op.gradcheck_fast_mode + } + if check == "fwgrad_bwgrad": + kwargs["check_fwd_over_rev"] = True + kwargs["check_rev_over_rev"] = False + kwargs["check_batched_grad"] = False + kwargs["check_undefined_grad"] = False + + self.assertTrue(gradgradcheck(fn, gradcheck_args, **kwargs)) + else: + self.assertTrue(False, msg="Unknown check requested!") + + def _grad_test_helper(self, device, dtype, op, variant, *, check_forward_ad=False, check_backward_ad=True, + check_batched_grad=None, check_batched_forward_grad=False): + return self._check_helper(device, dtype, op, variant, 'gradcheck', check_forward_ad=check_forward_ad, + check_backward_ad=check_backward_ad, check_batched_grad=check_batched_grad, + check_batched_forward_grad=check_batched_forward_grad) + + def _skip_helper(self, op, device, dtype): + if dtype not in op.supported_backward_dtypes(torch.device(device).type): + self.skipTest("Skipped! Op doesn't support autograd for this dtype.") + if not op.supports_autograd and not op.supports_forward_ad: + self.skipTest("Skipped! 
autograd not supported.") + + # Tests that gradients are computed correctly + @_gradcheck_ops(op_db) + def test_fn_grad(self, device, dtype, op): + # This is verified by test_dtypes in test_ops.py + if dtype not in op.supported_backward_dtypes(torch.device(device).type): + self.skipTest("Skipped! Dtype is not in supported backward dtypes!") + else: + self._grad_test_helper(device, dtype, op, op.get_op()) + + # Method grad (and gradgrad, see below) tests are disabled since they're + # costly and redundant with function grad (and gradgad) tests + # @_gradcheck_ops(op_db) + # def test_method_grad(self, device, dtype, op): + # self._skip_helper(op, device, dtype) + # self._grad_test_helper(device, dtype, op, op.get_method()) + + @_gradcheck_ops(op_db) + def test_inplace_grad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if not op.inplace_variant: + self.skipTest("Op has no inplace variant!") + + # Verifies an operation doesn't support inplace autograd if it claims not to + if not op.supports_inplace_autograd: + inplace = self._get_safe_inplace(op.get_inplace()) + for sample in op.sample_inputs(device, dtype, requires_grad=True): + if sample.broadcasts_input: + continue + with self.assertRaises(Exception): + result = inplace(sample) + result.sum().backward() + else: + self._grad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) + + # Test that gradients of gradients are computed correctly + @_gradcheck_ops(op_db) + def test_fn_gradgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if not op.supports_gradgrad: + self.skipTest("Op claims it doesn't support gradgrad. This is not verified.") + else: + self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') + + # Test that forward-over-reverse gradgrad is computed correctly + @_gradcheck_ops(op_db) + def test_fn_fwgrad_bwgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + + if op.supports_fwgrad_bwgrad: + self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") + else: + err_msg = r"Trying to use forward AD with .* that does not support it" + hint_msg = ("Running forward-over-backward gradgrad for an OP that has does not support it did not " + "raise any error. If your op supports forward AD, you should set supports_fwgrad_bwgrad=True.") + with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): + self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad") + + # Test that gradients of gradients are properly raising + @_gradcheck_ops(op_db) + def test_fn_fail_gradgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if op.supports_gradgrad: + self.skipTest("Skipped! Operation does support gradgrad") + + err_msg = r"derivative for .* is not implemented" + with self.assertRaisesRegex(RuntimeError, err_msg): + self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad') + + # Method gradgrad (and grad, see above) tests are disabled since they're + # costly and redundant with function gradgrad (and grad) tests + # @_gradcheck_ops(op_db) + # def test_method_gradgrad(self, device, dtype, op): + # self._skip_helper(op, device, dtype) + # self._gradgrad_test_helper(device, dtype, op, op.get_method()) + + @_gradcheck_ops(op_db) + def test_inplace_gradgrad(self, device, dtype, op): + self._skip_helper(op, device, dtype) + if not op.inplace_variant or not op.supports_inplace_autograd: + self.skipTest("Skipped! 
Operation does not support inplace autograd.") + self._check_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), "bwgrad_bwgrad") + + def _forward_grad_helper(self, device, dtype, op, variant, is_inplace): + # TODO: clean up how attributes are passed to gradcheck from OpInfos + def call_grad_test_helper(): + check_batched_forward_grad = ((op.check_batched_forward_grad and not is_inplace) or + (op.check_inplace_batched_forward_grad and is_inplace)) + self._grad_test_helper(device, dtype, op, variant, check_forward_ad=True, check_backward_ad=False, + check_batched_grad=False, check_batched_forward_grad=check_batched_forward_grad) + if op.supports_forward_ad: + call_grad_test_helper() + else: + err_msg = r"Trying to use forward AD with .* that does not support it" + hint_msg = ("Running forward AD for an OP that has does not support it did not " + "raise any error. If your op supports forward AD, you should set supports_forward_ad=True") + with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg): + call_grad_test_helper() + + @_gradcheck_ops(op_db) + def test_forward_mode_AD(self, device, dtype, op): + self._skip_helper(op, device, dtype) + + self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False) + + @_gradcheck_ops(op_db) + def test_inplace_forward_mode_AD(self, device, dtype, op): + self._skip_helper(op, device, dtype) + + if not op.inplace_variant or not op.supports_inplace_autograd: + self.skipTest("Skipped! Operation does not support inplace autograd.") + + self._forward_grad_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), is_inplace=True) + + +instantiate_device_type_tests(TestGradients, globals()) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_ops_jit.py b/test/test_ops_jit.py new file mode 100644 index 000000000000..e8b914f8072f --- /dev/null +++ b/test/test_ops_jit.py @@ -0,0 +1,279 @@ +# Owner(s): ["module: unknown"] + +from functools import partial + +import torch + +from torch.testing import FileCheck +from torch.testing._internal.common_utils import \ + (run_tests, IS_SANDCASTLE, clone_input_helper, first_sample) +from torch.testing._internal.common_methods_invocations import op_db +from torch.testing._internal.common_device_type import instantiate_device_type_tests, ops, OpDTypes +from torch.testing._internal.common_jit import JitCommonTestCase, check_against_reference +from torch.testing._internal.jit_metaprogramming_utils import create_script_fn, create_traced_fn, check_alias_annotation +from torch.testing._internal.jit_utils import disable_autodiff_subgraph_inlining, is_lambda + + +# TODO: fixme https://github.com/pytorch/pytorch/issues/68972 +torch.set_default_dtype(torch.float32) + +# variant testing is only done with torch.float and torch.cfloat to avoid +# excessive test times and maximize signal to noise ratio +_variant_ops = partial(ops, dtypes=OpDTypes.supported, + allowed_dtypes=(torch.float, torch.cfloat)) + + + +# Tests operators for consistency between JIT and eager, also checks +# correctness of JIT specific alias schemas and intended +# autodifferentiation behavior. +# Inherits from JitCommonTestCase instead of TestCase directly to share +# functionality with original test_jit.py method operator tests +class TestJit(JitCommonTestCase): + exact_dtype = True + + # Tests that the forward and backward passes of operations produce the + # same values for the cross-product of op variants (function, method, inplace) + # and runtimes (eager, traced, scripted). 
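# Illustrative sketch of the eager/scripted/traced consistency being checked
# (a hypothetical toy function stands in for the op_db entries iterated below):
import torch

def _toy(x):
    return torch.relu(x) + 1

_inp = torch.randn(4)
_scripted = torch.jit.script(_toy)
_traced = torch.jit.trace(_toy, (_inp,))
assert torch.equal(_toy(_inp), _scripted(_inp))
assert torch.equal(_toy(_inp), _traced(_inp))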
+ # TODO WARNING: inplace x {traced, scripted} not currently tested + @_variant_ops(op_db) + def test_variant_consistency_jit(self, device, dtype, op): + _requires_grad = (dtype in op.supported_backward_dtypes(torch.device(device).type)) + + include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex + samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad, include_conjugated_inputs=include_conjugated_inputs) + + # Acquires variants to test + func = op.get_op() + method = op.get_method() + variants = { + # TODO: inplace tests currently fail, fix and add inplace variant + 'function': func, 'method': method, + } + + # TODO: find better way to standardize on op registration itself.. + has_fake_function = op.name in ["resize_", 'resize_as_'] + + if has_fake_function: + variants = {'method': getattr(torch.Tensor, op.name)} + samples = op.sample_inputs(device, dtype, requires_grad=False) + + support_script = op.supports_scripting + + tested = False + for sample in samples: + # Test traced and scripted consistency + for func_type, variant in variants.items(): + if variant is None: + continue + + # scripting and check_alias_analysis do not work with lambdas + # lambdas are typically used as a way to simulate methods without + # functional variants, so rely on the other variant for testing + # for now + if is_lambda(variant): + continue + + tested = True + + # Create accessor for script function variant + name = op.name + '_' if func_type == 'inplace' else op.name + + # run with disable_autodiff_subgraph_inlining(True) to test + # autodiff support. Context manager forces the graph to contain + # DifferentiableGraph nodes if they are present + with disable_autodiff_subgraph_inlining(): + # Check scripted forward, grad, and grad grad + if support_script: + script_fn = create_script_fn(self, name, func_type) + + def out_fn(output): + # Processes the output for autograd + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + def get_sample(): + return clone_input_helper(sample.input) if op.name[-1] == '_' else sample.input + + if support_script: + check_against_reference(self, + script_fn, + func, + out_fn, + (get_sample(),) + sample.args, + sample.kwargs, + no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) + + # Check traced forward, grad, and grad grad + # TODO: fix tracing here + supports_tracing = not has_fake_function + if op.assert_jit_shape_analysis: + self.assertTrue(supports_tracing) + + if supports_tracing: + traced_fn = create_traced_fn(self, variant) + check_against_reference(self, + traced_fn, + func, + out_fn, + (get_sample(),) + sample.args, + sample.kwargs, + no_grad=not _requires_grad, no_gradgrad=not op.supports_gradgrad) + + # Check alias annotation schema for correctness (make + # sure inputs that aren't supposed to be modified aren't) + # Note: only runs in float32 because schema isn't affected by dtype, + # so running it on all dtypes is would be excessive + if dtype == torch.float32: + # TODO: no reason why we cant run this with tracing graph + if support_script and op.name != "rsub": + check_alias_annotation(name, (get_sample(),) + sample.args, sample.kwargs, + func_type=func_type, aten_name=op.aten_name) + + # TODO: use script graph as well + checked_shape_analysis = False + if supports_tracing: + out = variant(get_sample(), *sample.args, **sample.kwargs) + + # right now, tuple of outputs and tensor output supported + # TODO: list of tensor outputs + tuple_of_tensors = 
isinstance(out, tuple) and all([isinstance(elem, torch.Tensor) for elem in out]) + + if isinstance(out, torch.Tensor) or tuple_of_tensors: + if tuple_of_tensors: + sizes = [elem.size() for elem in out] + else: + sizes = out.size() + self.checkShapeAnalysis(sizes, traced_fn.graph, op.assert_jit_shape_analysis) + checked_shape_analysis = True + if op.assert_jit_shape_analysis: + self.assertTrue(checked_shape_analysis) + + # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample + if dtype is torch.float32: + # Sandcastle doesn't fuse nodes + if IS_SANDCASTLE: + # fusible nodes are expected to be found in FusionGroups in the DifferentiableGraphs + nonfusible_nodes = op.autodiff_nonfusible_nodes + op.autodiff_fusible_nodes + fusible_nodes = [] + else: + nonfusible_nodes = op.autodiff_nonfusible_nodes + fusible_nodes = op.autodiff_fusible_nodes + + if supports_tracing: + self.assertAutodiffNode(traced_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) + if support_script: + self.assertAutodiffNode(script_fn.last_graph, op.assert_autodiffed, nonfusible_nodes, fusible_nodes) + assert tested, "JIT Test does not execute any logic" + + # alias testing is only done with torch.float for the same reason + _alias_ops = partial(ops, dtypes=OpDTypes.supported, + allowed_dtypes=(torch.float,)) + + @_alias_ops((op for op in op_db if op.aliases)) + def test_jit_alias_remapping(self, device, dtype, op): + # Required to avoid undefined value: tensor error in JIT compilation of the function template + tensor = torch.tensor + + # NOTE: only tests on first sample + samples = op.sample_inputs(device, dtype, requires_grad=True) + sample = first_sample(self, samples) + + # [Scripting Data Preparation] + # Prepare data for test scripting + # Below we prepare strings of args/kwargs with and without type annotations. + # These strings are inserted into function template strings which is then torch scripted. 
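# Illustrative sketch of the template-string scripting pattern described above
# (Tensor.absolute, an alias of abs, is used purely for illustration):
import torch

_fn_template = '''
def _fn(t0):
    return t0.{alias_name}()
'''
_scripted = torch.jit.CompilationUnit(_fn_template.format(alias_name="absolute"))._fn
assert torch.equal(_scripted(torch.tensor([-1.0])), torch.tensor([1.0]))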
+ # - args string is ["t0"] corresponding to the "input" tensor required by the op + # - args_kw is the value of args and strings of kwargs used to call the op (without type annotations), for example, + # ["to", "1.0", "(1,)", "True", "tensor(1.0)"] -> def fn(t0): return variant(t0, 1.0, (1,), True, tensor(1.0)) + args = ["t0"] + + def quote_strs(v): + if isinstance(v, str): + return f"'{v}'" + + return str(v) + + args_kw = args + \ + [f"{v}" for v in sample.args] + \ + [f"{k}={quote_strs(v)}" for k, v in sample.kwargs.items()] + + # Prepare data for test tracing + sample_args_kwargs = () + if len(sample.args) > 0: + sample_args_kwargs += (sample.args, ) + if len(sample.kwargs) > 0: + sample_args_kwargs += (sample.kwargs, ) + + original_name = op.aten_name + original_name_inplace = original_name + "_" + expected_dtype = op(sample.input, *sample.args, **sample.kwargs).dtype + + for a_op in op.aliases: + inplace = a_op.inplace_variant + method_or_inplace = [a_op.inplace_variant, a_op.method_variant] + variants = (v for v in (a_op.op, a_op.method_variant, a_op.inplace_variant) if v is not None) + + # Test scripting: + for variant in variants: + variant_name = variant.__name__ + op_name = original_name_inplace if variant is inplace else original_name + + if variant in method_or_inplace: + fn_template = ''' + def _fn(t0{c}): + return t0.{alias_name}({args_kw}) + ''' + # remove the first input tensor + script = fn_template.format( + c=", " if len(args_kw[1:]) > 1 else "", + args_kw=", ".join(args_kw[1:]), + alias_name=variant_name, + ) + else: + fn_template = ''' + def _fn({args}): + return variant({args_kw}) + ''' + script = fn_template.format( + args=", ".join(args), + args_kw=", ".join(args_kw), + ) + scripted = torch.jit.CompilationUnit(script)._fn + + if (variant is inplace and not torch.can_cast(expected_dtype, dtype)): + try: + inp = clone_input_helper(sample.input) + scripted(inp) + except Exception as e: + continue + self.fail("Inplace operation on integer tensor that should be promoted to float didn't fail!") + + inp = clone_input_helper(sample.input) + scripted(inp) + inp = clone_input_helper(sample.input) + graph = scripted.graph_for(inp) + FileCheck().check(op.aten_name).check_not(variant_name).run(graph) + + # Test tracing: + for variant in variants: + variant_name = variant.__name__ + op_name = original_name_inplace if variant is inplace else original_name + + def _fn(*sample_args, **sample_kwargs): + return variant(*sample_args, **sample_kwargs) + + inp = (clone_input_helper(sample.input),) + sample_args_kwargs + traced = torch.jit.trace(_fn, *inp) + inp = (clone_input_helper(sample.input),) + sample_args_kwargs + traced(*inp) + inp = (clone_input_helper(sample.input),) + sample_args_kwargs + graph = traced.graph_for(*inp) + FileCheck().check(op_name).check_not(variant_name).run(graph) + + +instantiate_device_type_tests(TestJit, globals()) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_optim.py b/test/test_optim.py index 061f8a44765c..6d587b4b352d 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -20,8 +20,7 @@ _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler, \ EPOCH_DEPRECATION_WARNING from torch.optim.swa_utils import AveragedModel, SWALR, update_bn -from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \ - skipIfRocm +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests # load_tests from common_utils is used to automatically 
filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests @@ -228,6 +227,12 @@ def fn_base(optimizer, weight, bias): # Make sure state dict wasn't modified self.assertEqual(state_dict, state_dict_c) + # Make sure that device of state['step'] is still CPU + new_state_dict = optimizer_cuda.state_dict() + if 'step' in state_dict['state'][0] and torch.is_tensor(state_dict['state'][0]['step']): + for state in new_state_dict['state'].values(): + self.assertEqual(state['step'].device.type, 'cpu') + for _i in range(20): optimizer.step(fn) optimizer_cuda.step(fn_cuda) @@ -481,9 +486,8 @@ def test_multi_tensor_optimizers(self): loss.backward() # Test that step behaves as expected (a no-op) when grads are set to None - # TODO: uncomment after optim foreach cleanup is landed - # if iter == 0: - # optimizer.zero_grad(set_to_none=True) + if iter == 0: + optimizer.zero_grad(set_to_none=True) optimizer.step() @@ -615,26 +619,29 @@ def test_sparse_adam(self): optim.SparseAdam([{"params": [torch.zeros(3, layout=torch.sparse_coo)]}]) # ROCm precision is too low to pass this test - @skipIfRocm def test_adadelta(self): # Handles https://github.com/pytorch/pytorch/issues/69698 self.rel_tol = 4e-3 for optimizer in [optim.Adadelta, optim_mt.Adadelta]: self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias]) + lambda weight, bias, maximize: optimizer([weight, bias], maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( - self._build_params_dict(weight, bias, rho=0.95)) + lambda weight, bias, maximize: optimizer( + self._build_params_dict(weight, bias, rho=0.95), maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( - self._build_params_dict(weight, bias, rho=0.95)), + lambda weight, bias, maximize: optimizer( + self._build_params_dict(weight, bias, rho=0.95), maximize=maximize), [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] + lambda opt: ReduceLROnPlateau(opt)], + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], weight_decay=1) + lambda weight, bias, maximize: optimizer([weight, bias], weight_decay=1, maximize=maximize), + constructor_accepts_maximize=True ) with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"): optimizer(None, lr=1e-2, rho=1.1) @@ -678,30 +685,38 @@ def test_nadam(self): def test_adagrad(self): for optimizer in [optim.Adagrad, optim_mt.Adagrad]: self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], lr=1e-1) + lambda weight, bias, maximize: optimizer([weight, bias], lr=1e-1, maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( - [weight, bias], lr=1e-1, initial_accumulator_value=0.1 - ) + lambda weight, bias, maximize: optimizer( + [weight, bias], lr=1e-1, initial_accumulator_value=0.1, maximize=maximize, + ), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1) + lr=1e-1, + maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1), - [lambda opt: ReduceLROnPlateau(opt)] + lr=1e-1, + maximize=maximize), + [lambda opt: 
ReduceLROnPlateau(opt)], + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1), + lr=1e-1, + maximize=maximize), [lambda opt: ReduceLROnPlateau(opt), - lambda opt: ExponentialLR(opt, gamma=0.99)] + lambda opt: ExponentialLR(opt, gamma=0.99)], + constructor_accepts_maximize=True ) with self.assertRaisesRegex(ValueError, "Invalid lr_decay value: -0.5"): optimizer(None, lr=1e-2, lr_decay=-0.5) @@ -731,15 +746,20 @@ def test_adagrad_complex(self): def test_adamax(self): for optimizer in [optim.Adamax, optim_mt.Adamax]: self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], lr=1e-1) + lambda weight, bias, maximize: optimizer( + [weight, bias], lr=1e-1, maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer( + lambda weight, bias, maximize: optimizer( self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1) + lr=1e-1, maximize=maximize), + constructor_accepts_maximize=True ) self._test_basic_cases( - lambda weight, bias: optimizer([weight, bias], lr=1e-1, weight_decay=1) + lambda weight, bias, maximize: optimizer( + [weight, bias], lr=1e-1, weight_decay=1, maximize=maximize), + constructor_accepts_maximize=True ) with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"): optimizer(None, lr=1e-2, betas=(0.0, 1.0)) @@ -1323,6 +1343,18 @@ def test_closed_form_cos_anneal_lr(self): closed_form_scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min) self._test_against_closed_form(scheduler, closed_form_scheduler, epochs) + def test_cos_anneal_lr_continue(self): + eta_min = 0.1 + T_max = 5 + scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min) + self.opt.step() + scheduler.step() + original_lrs = scheduler._last_lr + new_scheduler = CosineAnnealingLR( + self.opt, T_max=T_max, eta_min=eta_min, last_epoch=0) + new_lrs = new_scheduler._last_lr + torch.testing.assert_allclose(original_lrs, new_lrs, rtol=1e-4, atol=1e-5) + def test_reduce_lr_on_plateau1(self): epochs = 10 for param_group in self.opt.param_groups: diff --git a/test/test_overrides.py b/test/test_overrides.py index da013d33a53d..d208a9201729 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -1,4 +1,4 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: __torch_function__"] import torch import numpy as np @@ -8,14 +8,16 @@ import pickle import collections -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import TestCase, run_tests, skipIfCrossRef from torch.overrides import ( handle_torch_function, has_torch_function, get_overridable_functions, get_testing_overrides, - is_tensor_method_or_property + is_tensor_method_or_property, + TorchFunctionMode ) +from functools import partial Tensor = torch.Tensor @@ -28,7 +30,7 @@ def foo(a, b, c=None): """A function multiple arguments and an optional argument""" - if any(type(t) is not Tensor for t in (a, b, c)) and has_torch_function((a, b, c)): + if has_torch_function((a, b, c)): return handle_torch_function(foo, (a, b, c), a, b, c=c) if c: return a + b + c @@ -36,19 +38,19 @@ def foo(a, b, c=None): def bar(a): """A function with one argument""" - if type(a) is not Tensor and has_torch_function((a,)): + if has_torch_function((a,)): return handle_torch_function(bar, (a,), a) return a def baz(a, b): """A function with multiple arguments""" - if 
type(a) is not Tensor or type(b) is not Tensor and has_torch_function((a, b)): + if has_torch_function((a, b)): return handle_torch_function(baz, (a, b), a, b) return a + b def quux(a): """Used to test that errors raised in user implementations get propagated""" - if type(a) is not Tensor and has_torch_function((a,)): + if has_torch_function((a,)): return handle_torch_function(quux, (a,), a) return a @@ -556,6 +558,42 @@ class DummyTensor(torch.Tensor): self.assertTrue(c._is_view()) self.assertTrue(c._base is a) + def test_grad(self): + # Previously, Tensor-like objects that did not subclass from Tensor + # did not get wrapped into unary tuples before being passed into + # handle_torch_function, in contradiction with how Tensor-likes + # were handled + # + # NB: this asserts that the arguments get normalized into a tuple + # before entering the torch function handler; it could go the + # other way but beware https://github.com/pytorch/pytorch/issues/76037 + + class Dummy: + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + inputs, outputs = args + self.assertEqual(inputs, (x,)) + self.assertEqual(outputs, (x,)) + return -1 + + x = Dummy() + self.assertEqual(torch.autograd.grad(x, x), -1) + + def test_pow_rpow(self): + class NothingImplemented(torch.Tensor): + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + return NotImplemented + + class RPowOnly(torch.Tensor): + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if func is torch.Tensor.__rpow__: + return -1 + return NotImplemented + + self.assertEqual(NothingImplemented() ** RPowOnly(), -1) + def generate_tensor_like_override_tests(cls): from torch.testing._internal.generated.annotated_fn_args import annotated_args @@ -599,7 +637,7 @@ def instance_gen(): func_args.append([instance_gen(), instance_gen()]) elif t == 'c10::List>': func_args.append([instance_gen(), instance_gen()]) - elif t == 'IntArrayRef': + elif t == 'IntArrayRef' or t == 'SymIntArrayRef': size = arg.get('size', 2) if size == 1: func_args.append(1) @@ -621,6 +659,9 @@ def instance_gen(): func_args.append(torch.float32) elif t == 'c10::string_view': func_args.append('') + elif t == 'SymInt': + # TODO: generate actual SymbolicInt + func_args.append(1) else: raise RuntimeError(f"Unsupported argument type {t} for {arg['name']} of function {func}") else: @@ -690,7 +731,7 @@ def test(self): test_method.__name__ = name setattr(cls, name, test_method) -# generate_tensor_like_override_tests(TestTorchFunctionOverride) +generate_tensor_like_override_tests(TestTorchFunctionOverride) class Wrapper: "Basic data container that knows how to unwrap itself" @@ -739,10 +780,11 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): for a in args: if isinstance(a, cls): args_of_this_cls.append(a) - elif isinstance(a, collections.Sequence): + elif isinstance(a, collections.abc.Sequence): args_of_this_cls.extend(el for el in a if isinstance(el, cls)) assert len(args_of_this_cls) > 0 - args_of_this_cls[0].used_calls.add(func) + for a in args_of_this_cls: + a.used_calls.add(func) args = unwrap(tuple(args)) kwargs = {k: unwrap(v) for k, v in kwargs.items()} @@ -847,6 +889,7 @@ def run_test(fast_mode): 'dtype', 'is_floating_point', 'is_sparse', + 'is_sparse_csr', 'layout', 'new_zeros', 'numel', @@ -1044,6 +1087,16 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): self.assertEqual(torch.nn.functional.linear(inp, t1, t2), "called") self.assertEqual(torch.nn.functional.linear(inp, 
t2, t1), "called") +class TestResolveName(TestCase): + def test_resolve_name(self): + for cs in get_overridable_functions().values(): + for c in cs: + self.assertEqual( + eval(torch.overrides.resolve_name(c)), + c, + msg=f"{c}, {torch.overrides.resolve_name(c)}" + ) + class TestTorchFunctionWarning(TestCase): def test_warn_on_invalid_torch_function(self): class Bad1(): @@ -1055,14 +1108,249 @@ def __torch_function__(self, *args, **kwargs): pass a = Bad1() - with self.assertWarnsRegex(DeprecationWarning, "as a plain method is deprecated"): - # This needs to be a function that handle torch_function on the python side - torch.split(a, (2)) - - a = Bad2() - with self.assertWarnsRegex(DeprecationWarning, "as a plain method is deprecated"): - # This needs to be a function that handle torch_function on the python side - torch.split(a, (2)) + for a in (Bad1(), Bad2()): + with self.assertWarnsRegex(DeprecationWarning, "as a plain method is deprecated"): + # Function that handles torch_function on the python side + torch.nn.functional.dropout(a) + + with self.assertWarnsRegex(UserWarning, "as a plain method is deprecated"): + # Function that handles torch_function in C++ + torch.abs(a) + +@skipIfCrossRef +class TestTorchFunctionMode(TestCase): + def test_basic(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -1 + # NB: factory functions get overridden too! + x = torch.randn(1) + with torch.overrides.push_torch_function_mode(A): + self.assertEqual(torch.randn(3), -1) + self.assertEqual(torch.add(x, x), -1) + self.assertEqual(torch.split(None, [2]), -1) # python side + self.assertEqual(bar(x), -1) + + def test_factory_override(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -1 + + with torch.overrides.push_torch_function_mode(A): + self.assertEqual(torch.tensor([1]), -1) + self.assertEqual(torch.sparse_coo_tensor(1, 1, 1), -1) + self.assertEqual(torch.sparse_csr_tensor(1, 1, 1), -1) + self.assertEqual(torch._sparse_coo_tensor_unsafe(1, 1, (1, 1)), -1) + self.assertEqual(torch._sparse_csr_tensor_unsafe(1, 1, 1, (1, 1)), -1) + self.assertEqual(torch.as_tensor([1]), -1) + + def test_enable_torch_function_mode_with_tensor_subclass(self): + x = torch.randn(1) + with torch.overrides.enable_torch_function_mode(SubTensor): + self.assertEqual(torch.mm(x, x), -1) + + def test_modes_handle_first(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -40 + + x = SubTensor() + with torch.overrides.push_torch_function_mode(A): + self.assertEqual(torch.neg(x), -40) + self.assertEqual(torch.mean(x), -40) + self.assertEqual(torch.mm(x, x), -40) + self.assertEqual(bar(x), -40) + + def test_modes_return_notimplemented(self): + class MyMode(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return NotImplemented + + x = SubTensor() + with torch.overrides.push_torch_function_mode(MyMode): + self.assertEqual(torch.mean(x), 0) + self.assertEqual(torch.mm(x, x), -1) + self.assertEqual(bar(x), 1) + self.assertRaisesRegex( + TypeError, r'SubTensor.+MyMode', + lambda: self.assertEqual(torch.max(x, x))) + + def test_mode_stack(self): + logs = [] + + class Logger(TorchFunctionMode): + def __init__(self, name): + self.name = name + + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + logs.append(self.name) + return func(*args, **kwargs) + + x = torch.randn(1) + with torch.overrides.push_torch_function_mode(partial(Logger, "A")): + 
with torch.overrides.push_torch_function_mode(partial(Logger, "B")): + torch.mean(x) + + self.assertEqual(logs, ["B", "A"]) + + def test_push_mode_instance_errors(self): + class A(TorchFunctionMode): + pass + with self.assertRaisesRegex(ValueError, 'instance of TorchFunctionMode'): + with torch.overrides.push_torch_function_mode(A(inner=None)): + pass + + def test_push_mode_returns_unrelated(self): + with self.assertRaisesRegex(ValueError, 'return a TorchFunctionMode'): + with torch.overrides.push_torch_function_mode(lambda *, inner: None): + pass + + def test_missing_inner_mode_ctor(self): + self.assertRaisesRegex(TypeError, 'push_torch_function_mode', lambda: TorchFunctionMode()) + + def test_enable_torch_function_mode_trivial(self): + class A(TorchFunctionMode): + def __torch_function__(self, *args, **kwargs): + return -40 + a = A(inner=None) + with torch.overrides.enable_torch_function_mode(a): + with torch.overrides.enable_torch_function_mode(a): + self.assertEqual(bar(None), -40) + + def test_enable_torch_function_mode_replace(self): + class A(TorchFunctionMode): + def __init__(self, val): + self.val = val + + def __torch_function__(self, *args, **kwargs): + return self.val + a1 = A(-40, inner=None) + a2 = A(-41, inner=None) + with torch.overrides.enable_torch_function_mode(a1): + with torch.overrides.enable_torch_function_mode(a2, replace=a1): + self.assertEqual(bar(None), -41) + + def test_enable_torch_function_mode_ignore_preexisting(self): + class A(TorchFunctionMode): + def __init__(self, val): + self.val = val + + def __torch_function__(self, *args, **kwargs): + return self.val + a1 = A(-40, inner=None) + a2 = A(-41, inner=None) + with torch.overrides.enable_torch_function_mode(a1): + with torch.overrides.enable_torch_function_mode(a2, ignore_preexisting=True): + self.assertEqual(bar(None), -41) + + def test_reentrant_mode_idiom(self): + log = [] + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + log.append(func) + if func is torch.sub: + with torch.overrides.enable_torch_function_mode(self, replace=self.inner): + input, other = args + assert not kwargs + return torch.add(input, other, alpha=-1) + return func(*args, **kwargs) + + x = torch.randn(1) + y = torch.randn(1) + with torch.overrides.push_torch_function_mode(A): + torch.sub(x, y) + # add hits the torch function again! 
+ self.assertEqual(log, [torch.sub, torch.add]) + + def test_nn_parse_to(self): + # This failed because the parser thinks the function is called to() + # but it's actually called _parse_to() + + called = False + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + nonlocal called + if kwargs is None: + kwargs = {} + called = True + return func(*args, **kwargs) + + with torch.overrides.push_torch_function_mode(A): + torch._C._nn._parse_to('cpu') + + self.assertTrue(called) + + def test_distributions_bernoulli(self): + # This failed because improper use of has_torch_function when + # is_tensor_like should have been used instead, inside the + # broadcasting logic called by distributions (Bernoulli doesn't + # matter per se) + + called = False + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + nonlocal called + if kwargs is None: + kwargs = {} + called = True + return func(*args, **kwargs) + + with torch.overrides.push_torch_function_mode(A): + torch.distributions.Bernoulli(0.3) + + self.assertTrue(called) + + def test_mode_notimplemented_loop(self): + # Default tensor subclass implementation disables torch function; + # when we redispatch to mode we must not treat the objects as + # eligible + + called = 0 + + class A(TorchFunctionMode): + def __torch_function__(self, func, types, args=(), kwargs=None): + nonlocal called + if kwargs is None: + kwargs = {} + called += 1 + # The first time we call, the mode sees an active type that + # it doesn't know how to deal with. The second time, we're + # instructed to treat it "as if it were a tensor", and so + # we keep going. I'm not entirely clear if the subclasses + # disappearing from types is the correct way to do it. + if any(t is not torch.Tensor for t in types): + return NotImplemented + else: + return func(*args, **kwargs) + + class B(torch.Tensor): + pass + + b = B() + + with torch.overrides.push_torch_function_mode(A): + r = torch.neg(b) + + self.assertIs(type(r), B) + self.assertEqual(called, 2) + + called = 0 + + with torch.overrides.push_torch_function_mode(A): + r = bar(b) + + self.assertIs(type(r), B) + self.assertEqual(called, 2) + + + if __name__ == '__main__': run_tests() diff --git a/test/test_per_overload_api.py b/test/test_per_overload_api.py index c97ee2d62766..cdb2b7983512 100644 --- a/test/test_per_overload_api.py +++ b/test/test_per_overload_api.py @@ -1,67 +1,64 @@ # Owner(s): ["module: unknown"] -# import torch -# import copy +import torch +import copy from torch.testing._internal.common_utils import TestCase, run_tests class TestPerOverloadAPI(TestCase): - # def test_basics_opoverloadpacket(self): - # # add is ony used as an example here. It is ok to update the test - # # if the semantics of add are modified in the future. - # add_packet = torch.ops.aten.add + def test_basics_opoverloadpacket(self): + # add is ony used as an example here. It is ok to update the test + # if the semantics of add are modified in the future. 
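# Illustrative sketch of the overload-packet / overload objects exercised below:
import torch

_packet = torch.ops.aten.add            # OpOverloadPacket: dispatches to an overload per call
_overload = torch.ops.aten.add.Tensor   # OpOverload: one fixed schema
assert _overload(torch.tensor(2), torch.tensor(3)).item() == 5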
+ add_packet = torch.ops.aten.add - # # class attributes - # self.assertEqual(add_packet.op_name, 'add') - # self.assertEqual(add_packet.qualified_op_name, 'aten.add') + # class attributes + self.assertEqual(add_packet.__name__, 'add') + self.assertEqual(str(add_packet), 'aten.add') - # # callable - # self.assertEqual(add_packet(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) + # callable + self.assertEqual(add_packet(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) - # # correct module - # self.assertEqual(add_packet.__module__, add_packet.op.__module__) + # correct module + self.assertEqual(add_packet.__module__, add_packet.op.__module__) - # # caching - # another_add_packet = torch.ops.aten.add - # self.assertEqual(id(add_packet), id(another_add_packet)) + # caching + another_add_packet = torch.ops.aten.add + self.assertEqual(id(add_packet), id(another_add_packet)) - # # deepcopy is a no-op - # self.assertEqual(id(add_packet), id(copy.deepcopy(add_packet))) + # deepcopy is a no-op + self.assertEqual(id(add_packet), id(copy.deepcopy(add_packet))) - # # pretty print - # self.assertEqual(str(add_packet), "OpOverloadPacket(op='aten.add')") + # pretty print + self.assertEqual(repr(add_packet), "") - # self.assertRaises(AttributeError, lambda: add_packet.foo) + self.assertRaises(AttributeError, lambda: add_packet.foo) - # def test_basics_opoverload(self): - # add_packet = torch.ops.aten.add - # add_tensoroverload = add_packet.Tensor + def test_basics_opoverload(self): + add_packet = torch.ops.aten.add + add_tensoroverload = add_packet.Tensor - # # class attributes - # self.assertEqual(add_tensoroverload.name, 'aten.add') - # self.assertEqual(add_tensoroverload.overload_name, 'Tensor') - # self.assertEqual(add_tensoroverload.overload_packet, add_packet) + # class attributes + self.assertEqual(str(add_tensoroverload), 'aten.add.Tensor') + self.assertEqual(add_tensoroverload.__name__, 'add.Tensor') + self.assertEqual(add_tensoroverload.overloadpacket, add_packet) - # # deepcopy is a no-op - # self.assertEqual(id(add_tensoroverload), id(copy.deepcopy(add_tensoroverload))) + # deepcopy is a no-op + self.assertEqual(id(add_tensoroverload), id(copy.deepcopy(add_tensoroverload))) - # # caching - # another_add_tensoroverload = torch.ops.aten.add.Tensor - # self.assertEqual(id(add_tensoroverload), id(another_add_tensoroverload)) + # caching + another_add_tensoroverload = torch.ops.aten.add.Tensor + self.assertEqual(id(add_tensoroverload), id(another_add_tensoroverload)) - # # pretty print - # self.assertEqual(str(add_tensoroverload), "OpOverload(op='aten.add', overload='Tensor')") + # pretty print + self.assertEqual(repr(add_tensoroverload), "") - # # callable - # self.assertEqual(add_tensoroverload(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) + # callable + self.assertEqual(add_tensoroverload(torch.tensor(2), torch.tensor(3)), torch.tensor(5)) - # a = torch.tensor(2) - # b = torch.tensor(0) - # torch.ops.aten.add.out(a, a, out=b) - # self.assertEqual(b, torch.tensor(4)) + a = torch.tensor(2) + b = torch.tensor(0) + torch.ops.aten.add.out(a, a, out=b) + self.assertEqual(b, torch.tensor(4)) - # self.assertRaises(RuntimeError, lambda: add_tensoroverload(a, a, out=b)) - - def do_nothing(self): - return + self.assertRaises(RuntimeError, lambda: add_tensoroverload(a, a, out=b)) if __name__ == '__main__': run_tests() diff --git a/test/test_prims.py b/test/test_prims.py new file mode 100644 index 000000000000..fab525cd8b73 --- /dev/null +++ b/test/test_prims.py @@ -0,0 +1,108 @@ +# Owner(s): 
["module: primTorch"] + +from functools import partial + +import torch +from torch.testing import make_tensor +from torch.testing._internal.common_utils import run_tests, TestCase +from torch.testing._internal.common_device_type import ( + instantiate_device_type_tests, + onlyCUDA, + skipCUDAIfRocm, + dtypes, +) +from torch.testing._internal.logging_tensor import LoggingTensor, capture_logs, log_input +import torch._prims as prims +from torch._prims.executor import make_traced + + +class TestPrims(TestCase): + @onlyCUDA + @skipCUDAIfRocm + @dtypes(torch.float32) + def test_broadcast_in_dim(self, device, dtype): + # nvfuser is not currently capable of realizing a broadcasted tensor + # when the broadcast is the only operation. Another op is needed. + def _wrapper(a, b, broadcast_dimensions): + a_bc = prims.broadcast_in_dim(a, b.shape, broadcast_dimensions) + return prims.add(a_bc, b) + + traced = make_traced(_wrapper) + make_arg = partial(make_tensor, device=device, dtype=dtype) + + for executor in ('aten', 'nvfuser'): + fn = partial(traced, executor=executor) + # Same shape + shape = (5, 5) + a = make_arg(shape) + b = make_arg(shape, low=0.0, high=0.0) + result = fn(a, b, (0, 1)) + + self.assertEqual(result.shape, a.shape) + self.assertTrue(result.is_contiguous) + self.assertEqual(a, result) + + # Error input: reordering dims + with self.assertRaises(Exception): + result = fn(a, b, (1, 0)) + + # Adding outermost dimensions + a = make_arg((5, 5)) + b = make_arg((3, 3, 5, 5), low=0.0, high=0.0) + result = fn(a, b, (2, 3)) + + self.assertEqual(result.shape, b.shape) + self.assertEqual(a.broadcast_to(b.shape), result) + + # Expands + a = make_arg((1, 5, 1)) + b = make_arg((3, 5, 7), low=0.0, high=0.0) + result = fn(a, b, (0, 1, 2)) + + self.assertEqual(result.shape, b.shape) + self.assertEqual(a.expand_as(result), result) + + # Unsqueezes + a = make_arg((1, 2, 3)) + b = make_arg((1, 2, 1, 3), low=0.0, high=0.0) + result = fn(a, b, (0, 1, 3)) + + self.assertEqual(result.shape, b.shape) + self.assertEqual(a.unsqueeze(2), result) + + # FIXME: This test exposes an issue in nvfuser + # Adds outermost, expands, and unsqueezes + """ + a = make_arg((1, 2, 3)) + b = make_arg((4, 1, 7, 2, 3, 3), low=0.0, high=0.0) + result = fn(a, b, (1, 3, 4)) + + self.assertEqual(result.shape, b.shape) + a.unsqueeze_(3) + a.unsqueeze_(1) + a.unsqueeze_(0) + self.assertEqual(a.expand_as(result), result) + """ + + +class TestPrimsBasic(TestCase): + def test_torch_ops(self): + r = make_tensor((2,), device='cpu', dtype=torch.float) + self.assertEqual(torch.ops.prims.sin(r), torch.sin(r)) + + r = LoggingTensor(r) + with capture_logs() as logs: + log_input("input", r) + prims.sin(r) + self.assertExpectedInline('\n'.join(logs), """\ +$0 = input('input') +$1 = torch._ops.prims.sin.default($0)""") + + def test_mul_complex(self): + prims.mul(torch.randn(2), 1 + 1j) + + +instantiate_device_type_tests(TestPrims, globals()) + +if __name__ == "__main__": + run_tests() diff --git a/test/test_profiler.py b/test/test_profiler.py index e5fa27248f89..adb3f1920d48 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -18,6 +18,7 @@ TemporaryFileName, TemporaryDirectoryName) from torch.autograd import (_record_function_with_args_enter, _record_function_with_args_exit) from torch.autograd.profiler import profile as _profile +from torch.autograd.profiler_legacy import profile as _profile_legacy from torch.profiler import ( kineto_available, profile, record_function, supported_activities, DeviceType, ProfilerAction, 
ProfilerActivity @@ -64,6 +65,31 @@ def test_mem_leak(self): self.assertTrue(not (is_increasing and max_diff > 100 * 1024), msg='memory usage is increasing, {}'.format(str(last_rss))) + def test_custom_module_input_op_ids(self): + class MyFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x + + @staticmethod + def backward(ctx, gO): + x, = ctx.saved_tensors + return x + + def custom_layer(input_ten): + return MyFunc.apply(input_ten) + + # Only testing that emit_nvtx runs when + # record_shapes option is enabled. + with torch.autograd.profiler.emit_nvtx(record_shapes=True) as prof: + x = torch.randn(10, 10, requires_grad=True) + y = torch.randn(10, 10, requires_grad=True) + z = x + y + s = custom_layer(z) + q = s.sum() + q.backward() + class TestRecordFunction(TestCase): def _record_function_with_param(self): u = torch.randn(3, 4, 5, requires_grad=True) @@ -108,6 +134,43 @@ def test_datapipe_with_record_function(self): self.assertTrue(has_iter) self.assertTrue(has_mux) + def test_datapipe_delegation_with_profiler(self): + class IDPIterator(torch.utils.data.IterDataPipe): + def __init__(self): + self.data = list(range(10)) + self._idx = 0 + + def __iter__(self): + return self + + def __next__(self): + if self._idx >= 10: + self._idx = 0 + raise StopIteration + self._idx += 1 + return self.data[self._idx - 1] + + def get_value(self, idx): + return self.data[idx] + + dp1 = IDPIterator() # The object itself is an iterator + self.assertEqual(5, dp1.get_value(5)) + it_dp1 = iter(dp1) # This creates the 1st iterator + self.assertEqual(5, it_dp1.get_value(5)) # type: ignore[attr-defined] + self.assertEqual(list(range(10)), list(it_dp1)) + + class IDPDelegator(torch.utils.data.IterDataPipe): + def __init__(self, datapipe): + self.datapipe = datapipe + + def __iter__(self): + return iter(self.datapipe) + + dp2 = IDPDelegator(dp1) + it_dp2 = iter(dp2) + self.assertEqual(5, it_dp2.get_value(5)) + self.assertEqual(list(range(10)), list(it_dp2)) + def test_datapipe_with_record_function_fork(self): with _profile(with_stack=True, use_kineto=kineto_available(), record_shapes=True) as prof: input_dp = dp.iter.IterableWrapper(range(10)) @@ -782,6 +845,7 @@ def test_profiler_tracing(self): if kineto_available(): self._test_profiler_tracing(True) + @unittest.skip("Disable forward->backward link to workaround profiler crash") def test_profiler_fwd_bwd_link(self): with _profile(use_kineto=True) as prof: t1, t2 = torch.ones(1, requires_grad=True), torch.ones(1, requires_grad=True) @@ -819,5 +883,19 @@ def test_profiler_fwd_bwd_link(self): self.assertTrue(ts_to_name[s_ts_1] == "aten::binary_cross_entropy_with_logits") self.assertTrue(ts_to_name[s_ts_2] == "aten::add") + def test_profiler_type(self): + profiler_type = torch._C._autograd._profiler_type + ActiveProfilerType = torch._C._autograd.ActiveProfilerType + self.assertEqual(profiler_type(), ActiveProfilerType.NONE) + + # Autograd profiler + with _profile_legacy(): + self.assertEqual(profiler_type(), ActiveProfilerType.LEGACY) + + # Kineto profiler + with profile(): + self.assertEqual(profiler_type(), ActiveProfilerType.KINETO) + + if __name__ == '__main__': run_tests() diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py index 769e23159747..3c407be258cd 100644 --- a/test/test_public_bindings.py +++ b/test/test_public_bindings.py @@ -1,8 +1,15 @@ +# -*- coding: utf-8 -*- # Owner(s): ["module: autograd"] -from torch.testing._internal.common_utils import TestCase, run_tests - +from 
torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS +import pkgutil import torch +import sys +from typing import Callable +import inspect +import json +import os +import unittest class TestPublicBindings(TestCase): def test_no_new_bindings(self): @@ -125,7 +132,7 @@ def test_no_new_bindings(self): "has_lapack", "has_mkl", "has_mkldnn", - "has_mlc", + "has_mps", "has_openmp", "has_spectral", "HOIST_CONV_PACKED_PARAMS", @@ -138,6 +145,7 @@ def test_no_new_bindings(self): "InterfaceType", "IntStorageBase", "IntType", + "SymIntType", "IODescriptor", "is_anomaly_enabled", "is_autocast_cache_enabled", @@ -234,7 +242,7 @@ def test_no_new_bindings(self): "has_lapack", "has_mkl", "has_mkldnn", - "has_mlc", + "has_mps", "has_openmp", "iinfo", "import_ir_module", @@ -272,6 +280,118 @@ def test_no_new_bindings(self): msg = f"torch._C had bindings that are not present in the allowlist:\n{difference}" self.assertTrue(torch_C_bindings.issubset(torch_C_allowlist_superset), msg) + # AttributeError: module 'torch.distributed' has no attribute '_shard' + @unittest.skipIf(IS_WINDOWS, "Distributed Attribute Error") + def test_correct_module_names(self): + ''' + An API is considered public, if its `__module__` starts with `torch.` + and there is no name in `__module__` or the object itself that starts with “_”. + Each public package should either: + - (preferred) Define `__all__` and all callables and classes in there must have their + `__module__` start with the current submodule's path. Things not in `__all__` should + NOT have their `__module__` start with the current submodule. + - (for simple python-only modules) Not define `__all__` and all the elements in `dir(submod)` must have their + `__module__` that start with the current submodule. + ''' + failure_list = [] + with open(os.path.join(os.path.dirname(__file__), 'allowlist_for_publicAPI.json')) as json_file: + # no new entries should be added to this allow_dict. + # New APIs must follow the public API guidelines. 
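# Illustrative sketch of the "looks public" rule stated above, applied to one attribute
# (torch.nn.Linear is chosen purely as an example of a public class):
import torch.nn

_obj = torch.nn.Linear
_looks_public = (not _obj.__name__.startswith("_")
                 and _obj.__module__.startswith("torch.nn")
                 and "._" not in _obj.__module__)
assert _looks_public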
+ allow_dict = json.load(json_file) + + def test_module(modname): + split_strs = modname.split('.') + mod = sys.modules.get(modname) + for elem in split_strs: + if elem.startswith("_"): + return + + # verifies that each public API has the correct module name and naming semantics + def check_one_element(elem, modname, mod, *, is_public, is_all): + obj = getattr(mod, elem) + if not (isinstance(obj, Callable) or inspect.isclass(obj)): + return + elem_module = getattr(obj, '__module__', None) + # Only used for nice error message below + why_not_looks_public = "" + if elem_module is None: + why_not_looks_public = "because it does not have a `__module__` attribute" + elem_modname_starts_with_mod = elem_module is not None and \ + elem_module.startswith(modname) and '._' not in elem_module + if not why_not_looks_public and not elem_modname_starts_with_mod: + why_not_looks_public = f"because its `__module__` attribute (`{elem_module}`) is not within the " \ + f"torch library or does not start with the submodule where it is defined (`{modname}`)" + # elem's name must NOT begin with an `_` and it's module name + # SHOULD start with it's current module since it's a public API + looks_public = not elem.startswith('_') and elem_modname_starts_with_mod + if not why_not_looks_public and not looks_public: + why_not_looks_public = f"because it starts with `_` (`{elem}`)" + + if is_public != looks_public: + if modname in allow_dict and elem in allow_dict[modname]: + return + + if is_public: + why_is_public = f"it is inside the module's (`{modname}`) `__all__`" if is_all else \ + "it is an attribute that does not start with `_` on a module that " \ + "does not have `__all__` defined" + fix_is_public = f"remove it from the modules's (`{modname}`) `__all__`" if is_all else \ + f"either define a `__all__` for `{modname}` or add a `_` at the beginning of the name" + else: + assert is_all + why_is_public = f"it is not inside the module's (`{modname}`) `__all__`" + fix_is_public = f"add it from the modules's (`{modname}`) `__all__`" + + if looks_public: + why_looks_public = "it does look public because it follows the rules from the doc above " \ + "(does not start with `_` and has a proper `__module__`)." 
+ fix_looks_public = "make its name start with `_`" + else: + why_looks_public = why_not_looks_public + if not elem_modname_starts_with_mod: + fix_looks_public = "make sure the `__module__` is properly set and points to a submodule "\ + f"of `{modname}`" + else: + fix_looks_public = "remove the `_` at the beginning of the name" + + failure_list.append(f"# {modname}.{elem}:") + is_public_str = "" if is_public else " NOT" + failure_list.append(f" - Is{is_public_str} public: {why_is_public}") + looks_public_str = "" if looks_public else " NOT" + failure_list.append(f" - Does{looks_public_str} look public: {why_looks_public}") + # Swap the str below to avoid having to create the NOT again + failure_list.append(" - You can do either of these two things to fix this problem:") + failure_list.append(f" - To make it{looks_public_str} public: {fix_is_public}") + failure_list.append(f" - To make it{is_public_str} look public: {fix_looks_public}") + + + if hasattr(mod, '__all__'): + public_api = mod.__all__ + all_api = dir(mod) + for elem in all_api: + check_one_element(elem, modname, mod, is_public=elem in public_api, is_all=True) + + else: + all_api = dir(mod) + for elem in all_api: + if not elem.startswith('_'): + check_one_element(elem, modname, mod, is_public=True, is_all=False) + + for _, modname, ispkg in pkgutil.walk_packages(path=torch.__path__, prefix=torch.__name__ + '.'): + test_module(modname) + + test_module('torch') + + msg = "All the APIs below do not meet our guidelines for public API from " \ + "https://github.com/pytorch/pytorch/wiki/Public-API-definition-and-documentation.\n" + msg += "Make sure that everything that is public is expected (in particular that the module " \ + "has a properly populated `__all__` attribute) and that everything that is supposed to be public " \ + "does look public (it does not start with `_` and has a `__module__` that is properly populated)." 
+ msg += "\n\nFull list:\n" + msg += "\n".join(map(str, failure_list)) + + # empty lists are considered false in python + self.assertTrue(not failure_list, msg) if __name__ == '__main__': run_tests() diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index 3cf4a18bd1ea..7aedd935c697 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -1,17 +1,256 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: __torch_dispatch__"] +import tempfile import torch -from torch.testing._internal.common_utils import TestCase, run_tests -from torch.testing._internal.logging_tensor import LoggingTensor, log_input, capture_logs, no_dispatch +from copy import deepcopy +from torch.library import Library +from torch.cuda.jiterator import _create_jit_fn +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM +from torch.testing._internal.logging_tensor import LoggingTensor, LoggingTensorReentrant, LoggingTensorMode, \ + log_input, capture_logs, no_dispatch from torch.utils._pytree import tree_map -from torch.utils._python_dispatch import enable_python_mode +from torch.utils._python_dispatch import enable_torch_dispatch_mode, push_torch_dispatch_mode, TorchDispatchMode import logging +from functools import partial + +class TestPythonRegistration(TestCase): + def test_override_aten_ops_with_multiple_libraries(self) -> None: + x = torch.tensor([1, 2]) + my_lib1 = Library("aten", "IMPL") + my_lib2 = Library("aten", "IMPL") + + # Example 1 + def my_neg(*args, **kwargs): + return args[0]._neg_view() + + # Now we are secretly making the operator a view op so autograd needs to know how + # to handle it + my_lib1.impl('neg', my_neg, "AutogradCPU") + + self.assertTrue(torch.neg(x).is_neg()) + + # RuntimeError: impl("aten::neg", ...): + # Explicitly provided namespace (aten) in operator name does not match ... 
+ with self.assertRaisesRegex(RuntimeError, "operator name does not match namespace"): + my_lib3 = Library("foo", "IMPL") + my_lib3.impl(torch.ops.aten.neg.default, my_neg, "AutogradCPU") + del my_lib3 + + # Example 2 + def my_mul(*args, **kwargs): + return torch.zeros_like(args[0]) + + # torch.ops.aten.mul.Tensor + my_lib2.impl("aten::mul.Tensor", my_mul, "ZeroTensor") + + y = torch._efficientzerotensor(2) + self.assertFalse(torch.mul(x, y)._is_zerotensor()) + + # Assert that a user can't override the behavior of a (ns, op, dispatch_key) + # combination if someone overrided the behavior for the same before them + with self.assertRaisesRegex(RuntimeError, 'already a kernel registered from python'): + my_lib2.impl(torch.ops.aten.mul.Tensor, my_mul, "ZeroTensor") + + del my_lib1 + + # Validate that lib2 is not affected by removing lib1 + self.assertFalse(torch.mul(x, y)._is_zerotensor()) + + del my_lib2 + + # Validate that the old behavior is restored for neg and mul + self.assertFalse(torch.neg(x).is_neg()) + self.assertTrue(torch.mul(x, y)._is_zerotensor()) + + def test_override_cpu_sum(self) -> None: + # Example 1 + run = [False] + + def my_sum(*args, **kwargs): + run[0] = True + return args[0] + + my_lib1 = Library("aten", "IMPL") + my_lib1.impl('aten::sum', my_sum, "CPU") + x = torch.tensor([1, 2]) + self.assertEqual(torch.sum(x), x) + self.assertTrue(run[0]) + del my_lib1 + # Validate that the old behavior is restored for sum + self.assertEqual(torch.sum(x), torch.tensor(3)) + + def test_override_cuda_with_jiterator(self) -> None: + def override_where_cuda() -> None: + # Example 1: Invert the behavior of where's condition input + not_where_code_string = ''' + template T inverted_where(bool cond, T a, T b){ + return !cond ? a : b; + } + ''' + jitted_where = _create_jit_fn(not_where_code_string) + + CALLED = [False] + + def inverted_where(*args, **kwargs): + CALLED[0] = True + return jitted_where(*args, **kwargs) + + # overriding where's cuda kernel with Jiterator generated kernel + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::where.self', inverted_where, "CUDA") + + device = 'cuda' + cond = torch.tensor([True, True, False], device=device, dtype=torch.bool) + x = torch.tensor([1, 2, 3], device=device) + y = torch.tensor([-1, -2, -3], device=device) + + self.assertEqual(torch.where(cond, x, y), torch.tensor([-1, -2, 3])) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertEqual(torch.where(cond, x, y), torch.tensor([1, 2, -3])) + + def override_gelu_cuda() -> None: + # Example 2: Use relu to approximate gelu for faster compute + fastest_gelu_code_string = ''' + template T fast_gelu(T a){ + return a > 0 ? a : 0; + } + ''' + jitted_gelu = _create_jit_fn(fastest_gelu_code_string) + + CALLED = [False] + + def fast_gelu(*args, **kwargs): + CALLED[0] = True + return jitted_gelu(*args, **kwargs) + + # overriding gelu's cuda kernel with Jiterator generated relu kernel + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::gelu', fast_gelu, "CUDA") + + x = torch.rand([3, 3], device='cuda', dtype=torch.float) + self.assertEqual(torch.nn.functional.gelu(x), torch.nn.functional.relu(x)) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertNotEqual(torch.nn.functional.gelu(x), torch.nn.functional.relu(x)) + + def override_exp_cuda() -> None: + # Example 3: Preventing exp from exploding for float16 + clipped_exp_code_string = ''' + template T clipped_exp(T a){ + return a > T(10.0) ? 
T(22026.4657948) : exp(a); + } + ''' + jitted_exp = _create_jit_fn(clipped_exp_code_string) + + CALLED = [False] + + def clipped_exp(*args, **kwargs): + CALLED[0] = True + return jitted_exp(*args, **kwargs) + + # overriding exp's cuda kernel with clipped_exp kernel + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::exp', clipped_exp, "CUDA") + + x = torch.tensor([0.0, 100.0], device='cuda', dtype=torch.float16) + self.assertEqual(torch.exp(x), torch.tensor([1.0, 22026.4657948], dtype=torch.float16)) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertEqual(torch.exp(x), torch.tensor([1.0, torch.inf], dtype=torch.float16)) + + def override_add_cuda() -> None: + # Example 4: simulate a hardware bug, where the adder is always off by 1 + buggy_add_code_string = ''' + template T buggy_add(T a, T b){ + return a + b + T(1); + } + ''' + jitted_add = _create_jit_fn(buggy_add_code_string) + + CALLED = [False] + + def buggy_add(*args, **kwargs): + CALLED[0] = True + return jitted_add(*args, **kwargs) + + my_lib = Library("aten", "IMPL") + my_lib.impl('aten::add.Tensor', buggy_add, "CUDA") + + x_cpu = torch.rand([3, 3], device='cpu') + y_cpu = torch.rand([3], device='cpu') + + x_cuda = x_cpu.cuda() + y_cuda = y_cpu.cuda() + + self.assertEqual(x_cuda + y_cuda, x_cpu + y_cpu + 1) + self.assertTrue(CALLED[0]) + del my_lib + + # behavior restored after deregistration + self.assertEqual(x_cuda + y_cuda, x_cpu + y_cpu) + + if torch.cuda.is_available() and not TEST_WITH_ROCM: + override_where_cuda() + override_gelu_cuda() + override_exp_cuda() + override_add_cuda() + + def test_extend_library_with_dispatch_key_arg(self): + def my_sum(*args, **kwargs): + return args[0] + my_lib1 = Library("aten", "IMPL", dispatch_key="CPU") + + # RuntimeError: Explicitly provided dispatch key (Conjugate) is + # inconsistent with the dispatch key of the enclosing TORCH_LIBRARY_IMPL block + with self.assertRaisesRegex(RuntimeError, "inconsistent with the dispatch key"): + my_lib1.impl('sum', my_sum, "Conjugate") + my_lib1.impl('aten::sum', my_sum) + x = torch.tensor([1, 2]) + self.assertEqual(torch.sum(x), x) + del my_lib1 + + def test_create_new_library(self) -> None: + my_lib1 = Library("foo", "DEF") + + my_lib1.define("sum(Tensor self) -> Tensor") + + # Example 1 + @torch.library.impl(my_lib1, "sum", "CPU") + def my_sum(*args, **kwargs): + return args[0] + + x = torch.tensor([1, 2]) + self.assertEqual(torch.ops.foo.sum(x), x) + + my_lib2 = Library("foo", "IMPL") + + # Example 2 + @torch.library.impl(my_lib2, torch.ops.foo.sum.default, "ZeroTensor") + def my_sum_zt(*args, **kwargs): + if args[0]._is_zerotensor(): + return torch._efficientzerotensor(args[0].shape) + else: + return args[0] + + y = torch._efficientzerotensor(3) + self.assertTrue(torch.ops.foo.sum(y)._is_zerotensor()) + self.assertEqual(torch.ops.foo.sum(x), x) + + del my_lib2 + del my_lib1 class TestPythonDispatch(TestCase): def test_basic(self) -> None: with capture_logs() as logs: - x = LoggingTensor(torch.tensor([3.0], requires_grad=True)) + x = LoggingTensor(torch.tensor([3.0]), requires_grad=True) log_input("x", x) y = x * x saved_x = y.grad_fn._saved_self @@ -29,11 +268,11 @@ def test_basic(self) -> None: # self.assertEqual(saved_x._version, x._version) self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') -$1 = torch._ops.aten.mul($0, $0) +$1 = torch._ops.aten.mul.Tensor($0, $0) $2 = input('grad_y') -$3 = torch._ops.aten.mul($2, $0) -$4 = torch._ops.aten.mul($2, $0) -$5 = 
torch._ops.aten.add($4, $3)''') +$3 = torch._ops.aten.mul.Tensor($2, $0) +$4 = torch._ops.aten.mul.Tensor($2, $0) +$5 = torch._ops.aten.add.Tensor($4, $3)''') def test_out(self) -> None: with capture_logs() as logs: @@ -49,7 +288,7 @@ def test_out(self) -> None: self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') $1 = input('y') -$2 = torch._ops.aten.abs($0, out=$1)''') +$2 = torch._ops.aten.abs.out($0, out=$1)''') def test_kwarg_only(self) -> None: @@ -72,11 +311,11 @@ def test_kwarg_only(self) -> None: $0 = input('x') $1 = input('y') $2 = input('z') -$3 = torch._ops.aten.addmv($0, $1, $2) -$4 = torch._ops.aten.addmv($0, $1, $2) -$5 = torch._ops.aten.addmv($0, $1, $2, beta=2) -$6 = torch._ops.aten.addmv($0, $1, $2, alpha=2) -$7 = torch._ops.aten.addmv($0, $1, $2, beta=2, alpha=2)''') +$3 = torch._ops.aten.addmv.default($0, $1, $2) +$4 = torch._ops.aten.addmv.default($0, $1, $2) +$5 = torch._ops.aten.addmv.default($0, $1, $2, beta=2) +$6 = torch._ops.aten.addmv.default($0, $1, $2, alpha=2) +$7 = torch._ops.aten.addmv.default($0, $1, $2, beta=2, alpha=2)''') def test_kwarg_only_and_positional_default(self) -> None: with capture_logs() as logs: @@ -94,10 +333,28 @@ def test_kwarg_only_and_positional_default(self) -> None: self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') $1 = input('y') -$2 = torch._ops.aten.kl_div($0, $1) -$3 = torch._ops.aten.kl_div($0, $1, 2) -$4 = torch._ops.aten.kl_div($0, $1, log_target=True) -$5 = torch._ops.aten.kl_div($0, $1, 2, log_target=True)''') +$2 = torch._ops.aten.kl_div.default($0, $1) +$3 = torch._ops.aten.kl_div.default($0, $1, 2) +$4 = torch._ops.aten.kl_div.default($0, $1, log_target=True) +$5 = torch._ops.aten.kl_div.default($0, $1, 2, log_target=True)''') + + def test_produce_real_type(self) -> None: + with capture_logs() as logs: + x = LoggingTensor(torch.ones(2, 2)) + log_input("x", x) + x.to(dtype=torch.double) # non-optional dtype + torch.cumprod(x, 0, dtype=torch.double) # optional dtype + x[:, 1].contiguous(memory_format=torch.contiguous_format) # optional memory format + # There doesn't appear to be any layout signatures which are + # triggerable using tensor subclasses (need to use a mode) + + self.assertExpectedInline('\n'.join(logs), '''\ +$0 = input('x') +$1 = torch._ops.aten._to_copy.default($0, dtype=torch.float64) +$2 = torch._ops.aten.cumprod.default($0, 0, dtype=torch.float64) +$3 = torch._ops.aten.slice.Tensor($0, 0, 0, 9223372036854775807) +$4 = torch._ops.aten.select.int($3, 1, 1) +$5 = torch._ops.aten.clone.default($4, memory_format=torch.contiguous_format)''') def test_list_ret(self) -> None: # test all sequence types are permissible returns @@ -109,7 +366,7 @@ def __new__(cls, elem): @classmethod def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - if func == torch.ops.aten.split: + if func.overloadpacket == torch.ops.aten.split: with no_dispatch(): return list_type(torch.split(*args)) else: @@ -132,7 +389,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): return "arf" # Wobbles depending on NDEBUG mode of pybind11 - self.assertRaisesRegexp( + self.assertRaisesRegex( RuntimeError, "Unable to cast", lambda: A(torch.zeros(1)).neg(), ) self.assertRaisesRegexp( @@ -141,7 +398,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): def test_detach_appears_twice_when_called_once(self) -> None: with capture_logs() as logs: - x = LoggingTensor(torch.tensor([3.0], requires_grad=True)) + x = LoggingTensor(torch.tensor([3.0]), requires_grad=True) log_input("x", x) 
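The updated expected strings above (`aten.mul.Tensor`, `aten.addmv.default`, ...) reflect that `__torch_dispatch__` now receives a fully qualified OpOverload rather than an overload packet, which is also why comparisons in the tests switch to `func.overloadpacket`. A minimal sketch of inspecting both, not part of the patch (`PrintingTensor` is an illustrative name):

import torch

class PrintingTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, elem):
        return torch.Tensor._make_subclass(cls, elem)

    __torch_function__ = torch._C._disabled_torch_function_impl

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        # `func` is an OpOverload (e.g. aten.mul.Tensor); its packet groups all
        # overloads of the op and exposes the plain name via __name__.
        print(func, func.overloadpacket.__name__)
        return super().__torch_dispatch__(func, types, args, kwargs)

x = PrintingTensor(torch.ones(2))
x * x  # expected to print something like: aten.mul.Tensor mul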
x.detach() # FIXME: We actually want this to emit a single detach. However, @@ -150,8 +407,8 @@ def test_detach_appears_twice_when_called_once(self) -> None: # would be bad if calling .detach() once emits 3+ detaches). self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') -$1 = torch._ops.aten.detach($0) -$2 = torch._ops.aten.detach($1)''') +$1 = torch._ops.aten.detach.default($0) +$2 = torch._ops.aten.detach.default($1)''') def test_metadata_change_not_allowed(self) -> None: x = LoggingTensor(torch.ones(1)) @@ -240,7 +497,7 @@ def backward(ctx, grad_output): return grad_output * 2 * x with capture_logs() as logs: - x = LoggingTensor(torch.ones(1, requires_grad=True)) + x = LoggingTensor(torch.ones(1), requires_grad=True) log_input("x", x) x.grad = LoggingTensor(torch.zeros(1)) log_input("x.grad", x.grad) @@ -262,11 +519,11 @@ def backward(ctx, grad_output): self.assertExpectedInline('\n'.join(logs), '''\ $0 = input('x') $1 = input('x.grad') -$2 = torch._ops.aten.pow($0, 2) +$2 = torch._ops.aten.pow.Tensor_Scalar($0, 2) $3 = input('grad_output') -$4 = torch._ops.aten.mul($3, tensor(2)) -$5 = torch._ops.aten.mul($4, $0) -$6 = torch._ops.aten.add_($1, $5)''') +$4 = torch._ops.aten.mul.Tensor($3, 2) +$5 = torch._ops.aten.mul.Tensor($4, $0) +$6 = torch._ops.aten.add_.Tensor($1, $5)''') def test_subclass_creation(self): # Make sure these statements runs without error @@ -338,6 +595,83 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): self.assertEqual(y.stride(), x.stride()) self.assertEqual(y.storage_offset(), x.storage_offset()) + def test_wrapper_subclass_serializes(self) -> None: + with tempfile.TemporaryFile() as f: + x = LoggingTensor(torch.randn(3)) + torch.save(x, f) + f.seek(0) + x_loaded = torch.load(f) + self.assertTrue(type(x_loaded) is type(x)) + self.assertEqual(x.elem, x_loaded.elem) + self.assertFalse(x is x_loaded) + + def test_deepcopy_wrapper_subclass(self) -> None: + x = LoggingTensor(torch.randn(3)) + x_copy = deepcopy(x) + self.assertTrue(type(x_copy) is type(x)) + self.assertEqual(x.elem, x_copy.elem) + self.assertFalse(x is x_copy) + + def test_deepcopy_wrapper_subclass_with_clone_returning_different_type(self) -> None: + + class MyWrapperTensor(torch.Tensor): + elem: torch.Tensor + + __slots__ = ['elem'] + + @staticmethod + def __new__(cls, elem, *args, **kwargs): + r = torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] + cls, elem.size(), + dtype=elem.dtype, layout=elem.layout, + device=elem.device, requires_grad=elem.requires_grad, + strides=elem.stride(), storage_offset=elem.storage_offset()) + r.elem = elem + return r + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + if func.overloadpacket.__name__ == "clone": + # Return a plain tensor from clone(). + return args[0].elem.clone() + raise RuntimeError("NYI") + + # NB: The default Tensor.__torch_function__ implementation called for deepcopy + # disables __torch_function__ by the time we get to clone(), so there is no need to + # explicitly disable __torch_function__ for this subclass. + + x = MyWrapperTensor(torch.randn(3)) + with self.assertRaisesRegex(RuntimeError, + "for which cloning returns another instance of the same subclass"): + x_copy = deepcopy(x) + + def test_deepcopy_non_wrapper_subclass(self) -> None: + + # Ensure correct error is thrown for common error cases. + class SubTensorError1(torch.Tensor): + # Default implementation of new_empty() returns a plain tensor. 
+ pass + + class SubTensorError2(torch.Tensor): + # new_empty() incorrectly returns a different type (i.e. a plain tensor). + def new_empty(self, shape): + return torch.Tensor(shape) + + for error_cls in [SubTensorError1, SubTensorError2]: + x = error_cls(3) + with self.assertRaisesRegex(RuntimeError, + "for which that function returns another instance of the same subclass"): + x_copy = deepcopy(x) + + # Ensure a correctly implemented new_empty() causes deepcopy() to work. + class SubTensorSuccess(torch.Tensor): + def new_empty(self, shape): + return type(self)(shape) + + x = SubTensorSuccess(3) + x_copy = deepcopy(x) + self.assertIs(type(x_copy), type(x)) + def test_index_put_where_only_index_is_subclass(self) -> None: called_funcs = [] @@ -365,30 +699,27 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): idxs = (MyTensor(torch.tensor(0)),) v = torch.randn(1) res = x.index_put_(idxs, v) - self.assertEqual(called_funcs, [torch.ops.aten.index_put_]) + self.assertEqual(called_funcs, [torch.ops.aten.index_put_.default]) - def test_enable_python_mode_error(self) -> None: - with self.assertRaisesRegex(ValueError, "__torch_dispatch__"): - with enable_python_mode(torch.Tensor): - pass + def test_enable_torch_dispatch_mode_error(self) -> None: z = LoggingTensor(torch.empty([])) - with self.assertRaisesRegex(ValueError, "must be the type"): - with enable_python_mode(z): + with self.assertRaisesRegex(ValueError, "expected to get TorchDispatchMode, Tensor-like class, or None"): + with enable_torch_dispatch_mode(z): pass - def test_enable_python_mode_basic(self) -> None: - with enable_python_mode(LoggingTensor): + def test_enable_torch_dispatch_mode_basic(self) -> None: + with enable_torch_dispatch_mode(LoggingTensorMode): z = torch.empty([]) - self.assertTrue(isinstance(z, LoggingTensor)) + self.assertTrue(isinstance(z, LoggingTensorMode)) - def test_enable_python_mode_unrelated_tensors(self) -> None: + def test_enable_torch_dispatch_mode_unrelated_tensors(self) -> None: x = torch.randn([]) y = torch.randn([]) - with enable_python_mode(LoggingTensor): + with enable_torch_dispatch_mode(LoggingTensorMode): z = x + y - self.assertTrue(isinstance(z, LoggingTensor)) + self.assertTrue(isinstance(z, LoggingTensorMode)) - def test_enable_python_mode_subclass_priority(self) -> None: + def test_enable_torch_dispatch_mode_subclass_priority(self) -> None: class ErrorA(RuntimeError): pass @@ -417,33 +748,173 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None): b = B(torch.empty(1)) with self.assertRaises(ErrorA): a + a - - # B has precedence over A due to the subclass relationship with self.assertRaises(ErrorB): - with enable_python_mode(A): + a + b + + # B has precedence over A due to the subclass relationship yet + # modes take precedence over arguments + with self.assertRaises(ErrorA): + with enable_torch_dispatch_mode(A): b + b with self.assertRaises(ErrorB): - with enable_python_mode(B): + with enable_torch_dispatch_mode(B): a + a with self.assertRaises(ErrorB): - with enable_python_mode(B): + with enable_torch_dispatch_mode(B): a + b - def test_enable_python_mode_respects_no_dispatch(self) -> None: - with enable_python_mode(LoggingTensor): + def test_enable_torch_dispatch_mode_respects_no_dispatch(self) -> None: + with enable_torch_dispatch_mode(LoggingTensorMode): z = torch.ones([2, 3]) - self.assertTrue(isinstance(z, LoggingTensor)) + self.assertTrue(isinstance(z, LoggingTensorMode)) with no_dispatch(): expected = torch.ones([2, 3]) self.assertEqual(z.elem, expected) - def 
test_nested_enable_python_mode(self) -> None: - with self.assertRaisesRegex(RuntimeError, "has already been set"): - with enable_python_mode(LoggingTensor): - with enable_python_mode(LoggingTensor): + def test_enable_torch_dispatch_mode_instance(self) -> None: + class TestMode(TorchDispatchMode): + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + return func(*args, **kwargs) + + x = TestMode(inner=None) + y = torch.tensor([2.]) + with enable_torch_dispatch_mode(x): + y + y + + def test_nested_enable_torch_dispatch_mode(self) -> None: + class A(LoggingTensorMode): + pass + + with self.assertRaisesRegex(ValueError, "there is already an active mode"): + with enable_torch_dispatch_mode(LoggingTensorMode): + with enable_torch_dispatch_mode(A): pass - def test_tolist_numpy_with_python_mode(self) -> None: + def test_nesting_with_same_enable_torch_dispatch_mode(self) -> None: + # "nested" enable_torch_dispatch_modes are allowed if they're the same mode. It's the equivalent of + # a noop, so it will only write once to the log + with capture_logs() as logs: + x = LoggingTensor(torch.tensor([3.])) + log_input("x", x) + with enable_torch_dispatch_mode(LoggingTensor): + with enable_torch_dispatch_mode(LoggingTensor): + x + x + + self.assertExpectedInline('\n'.join(logs), '''\ +$0 = input('x') +$1 = torch._ops.aten.add.Tensor($0, $0)''') + + def test_enable_torch_dispatch_mode_ignore_preexisting(self): + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + return cls(torch.zeros(())) + + class B(A): + pass + + with enable_torch_dispatch_mode(A): + with enable_torch_dispatch_mode(B, ignore_preexisting=True): + self.assertTrue(isinstance(torch.zeros(()), B)) + + def test_enable_torch_dispatch_mode_replace(self): + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + return cls(torch.zeros(())) + + class B(A): + pass + + with enable_torch_dispatch_mode(A): + with enable_torch_dispatch_mode(B, replace=A): + self.assertTrue(isinstance(torch.zeros(()), B)) + + def test_exception_handling(self): + class A(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + if func.__name__ == 'randn.default': + raise RuntimeError() + return cls(torch.zeros(())) + + with enable_torch_dispatch_mode(A): + try: + torch.randn(()) + except RuntimeError: + pass + self.assertTrue(isinstance(torch.zeros(()), A)) + + def test_push_torch_dispatch_mode(self) -> None: + class ErrorA(RuntimeError): + def __init__(self, msg=None): + return super().__init__(msg) + + class A(TorchDispatchMode): + def __init__(self, msg=None): + self.msg = msg + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + raise ErrorA(self.msg) + + x = torch.randn(3) + with self.assertRaises(ErrorA): + with push_torch_dispatch_mode(A): + torch.add(x, x) + + with self.assertRaisesRegex(ErrorA, r"partial constructor"): + with push_torch_dispatch_mode(partial(A, "partial constructor")): + x + x + + def test_torch_dispatch_mode_stack(self) -> None: + logs = [] + + class Logger(TorchDispatchMode): + def __init__(self, name): + 
self.name = name + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + logs.append(self.name) + return func(*args, **kwargs) + + x = torch.randn(1) + with Logger.push("A"): + with Logger.push("B"): + x + x + self.assertEqual(logs, ["B", "A"]) + + def test_push_mode_instance_errors(self): + class A(TorchDispatchMode): + pass + with self.assertRaisesRegex(ValueError, 'instance of TorchDispatchMode'): + with push_torch_dispatch_mode(A(inner=None)): + pass + + def test_push_mode_returns_unrelated(self): + with self.assertRaisesRegex(ValueError, 'return a TorchDispatchMode'): + with push_torch_dispatch_mode(lambda *, inner: None): + pass + + def test_missing_inner_mode_ctor(self): + self.assertRaisesRegex(TypeError, 'push_torch_dispatch_mode', lambda: TorchDispatchMode()) + + def test_tolist_numpy_with_torch_dispatch_mode(self) -> None: x = LoggingTensor(torch.tensor([2.0, 3.0])) with self.assertRaisesRegex(RuntimeError, "is not supported for tensor subclasses."): x.tolist() @@ -452,7 +923,7 @@ def test_tolist_numpy_with_python_mode(self) -> None: with self.assertRaises(AssertionError): self.assertEqual(x, None) - def test_enable_python_mode_subclass_autograd_device_check(self) -> None: + def test_enable_torch_dispatch_mode_subclass_autograd_device_check(self) -> None: class NonWrapperSubclass(torch.Tensor): elem: torch.Tensor @@ -474,10 +945,7 @@ def unwrap(e): def wrap(e): return NonWrapperSubclass(e) if isinstance(e, torch.Tensor) else e - # no_dispatch is only needed if you use enable_python_mode. - # It prevents infinite recursion. - with no_dispatch(): - rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) logging.getLogger("NonWrapperSubclass").info(f"{func.__module__}.{func.__name__}", args, kwargs, rs) return rs @@ -511,11 +979,8 @@ def unwrap(e): def wrap(e): return SubclassWithNone(e) if isinstance(e, torch.Tensor) else e - # no_dispatch is only needed if you use enable_python_mode. - # It prevents infinite recursion. - with no_dispatch(): - rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) - if func.__name__ == "add": + rs = tree_map(wrap, func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))) + if func.overloadpacket.__name__ == "add": return None else: return rs @@ -536,11 +1001,235 @@ def wrap(e): out.backward() def test_storage_can_be_converted_to_python_object(self): - with enable_python_mode(LoggingTensor): + with enable_torch_dispatch_mode(LoggingTensorMode): s = torch.Storage() - z = LoggingTensor(torch.empty([])) + z = LoggingTensorMode(torch.empty([])) z.set_(s) + def test_autograd_in_attr(self): + # We want the wrapped Tensor to require gradients! 
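Unlike the tensor-subclass tests, the mode tests above (`test_enable_torch_dispatch_mode_instance`, `test_torch_dispatch_mode_stack`, ...) intercept every op issued in the region, including factory functions that take no tensor arguments. A minimal sketch using the same API the tests exercise; illustrative only, `RecordingMode` and `calls` are made-up names:

import torch
from torch.utils._python_dispatch import TorchDispatchMode, enable_torch_dispatch_mode

calls = []

class RecordingMode(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        # Record the op name, then run the op normally.
        calls.append(func.overloadpacket.__name__)
        return func(*args, **kwargs)

with enable_torch_dispatch_mode(RecordingMode(inner=None)):
    x = torch.ones(2)   # factory functions are seen by the mode too
    y = x + x

print(calls)  # expected to contain something like ['ones', 'add']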
+ true_t = torch.rand(2, requires_grad=True) + t = LoggingTensorReentrant(true_t) + + out = t + 2 + + self.assertFalse(out.requires_grad) + self.assertIsNone(out.grad_fn) + + self.assertTrue(out.elem.requires_grad) + self.assertIsNotNone(out.elem.grad_fn) + + with self.assertRaisesRegex(RuntimeError, "does not require grad"): + out.sum().backward() + + out.elem.sum().backward() + + self.assertIsNone(t.grad) + self.assertIsNotNone(t.elem.grad) + + def test_dispatch_super_call(self): + called = [] + + class SubTensor(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem) + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + called.append(func) + return super().__torch_dispatch__(func, types, args, kwargs) + + x = torch.randn(2) + y = torch.randn(2) + self.assertEqual(SubTensor(x) + SubTensor(y), x + y) + self.assertEqual(called, [torch.ops.aten.add.Tensor]) + + def test_dispatch_super_call_list_arg(self): + called = [] + + class SubTensorWithListArg(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem) + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + called.append(func) + return super().__torch_dispatch__(func, types, list(args), kwargs) + + x = torch.randn(2) + self.assertEqual(SubTensorWithListArg(x).neg(), x.neg()) + self.assertEqual(called, [torch.ops.aten.neg.default]) + + def test_dispatch_super_dont_autograd(self): + called = [] + + class SubTensor(torch.Tensor): + @staticmethod + def __new__(cls, elem): + return torch.Tensor._make_subclass(cls, elem, elem.requires_grad) + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + called.append(func) + # This argument still requires grad because it was passed + # through directly... + self.assertTrue(args[0].requires_grad) + r = super().__torch_dispatch__(func, types, args, kwargs) + # But the output better not require grad, because that means + # you did autograd again in torch dispatch (oops) + self.assertFalse(r.requires_grad) + return r + + x = SubTensor(torch.randn(2, requires_grad=True)) + x.neg() + self.assertEqual(called, [torch.ops.aten.neg.default]) + + def test_set_data(self): + called = 0 + + class SubTensor(torch.Tensor): + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + nonlocal called + called += 1 + return super().__torch_dispatch__(func, types, args, kwargs) + + x = SubTensor(torch.empty(2)) + x.data + self.assertEqual(called, 1) + x.data = torch.empty(2) + self.assertEqual(called, 1) + x.data + self.assertEqual(called, 2) + self.assertIs(type(x), SubTensor) + x.set_(torch.empty(2)) + self.assertEqual(called, 3) + x.data + self.assertEqual(called, 4) + self.assertIs(type(x), SubTensor) + + def test_construct_int_tensor(self): + class SubTensor(torch.Tensor): + pass + # should not fail + SubTensor(torch.zeros(2, dtype=torch.int)) + + def test_multiple_ops_subclass(self): + # This is a Direct Subclass, don't do that! 
+ class MySubclass(torch.Tensor): + @staticmethod + def __new__(cls, elem): + r = torch.Tensor._make_subclass(cls, elem) + return r + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + with no_dispatch(): + return func(*args, **kwargs) + + x = MySubclass(torch.rand(2, 2, dtype=torch.complex64)) + y = x.conj() + # Details of the bug that this tests for: + # Here, y dispatch keys are: {PythonTLSSnapshot, AutogradCPU, Conjugate, Python, CPU} + # There are a few calls to the dispatcher that are going to happen here: + # - call_exp: User calling exp on y + # - PythonTLSSnapshot: records the TLS on entry and redispatch + # - AutogradCPU: no input requires grad, so does nothing and redispatch + # - Conjugate: no special implementation for exp: use the fallback that + # first clone the Tensor (to materialize the conj) then redispatch + # - call_clone: conjugate fallback calling clone on y + # - PythonTLSSnapshot: records the TLS on entry and redispatch + # - (AutogradCPU: skipped as autograd added itself to the exclude set above) + # - Conjugate: special implementation for clone: just skip this key + # - Python: Reset the TLS based on the snapshot above and call the user implementation (this + # actually calls into the dispatcher again but since we disable both our keys + # before, not detailed here) + # - exit Python: restore the TLS and exit + # - exit Conjugate: nothing was inplace so just exit + # - exit PythonTLSSnapshot: done with this call, reset the saved TLS to empty + # - Python: Reset the TLS again based on the snapshot. <- this used to fail + # - More steps.... + y.exp() + + def test_is_contiguous_slow_path(self): + data = torch.randn(3, 3) + contiguous_data = data.clone() + not_contiguous_data = torch.as_strided(data.clone(), (2, 2), (1, 2)) + + def subclass_helper(cls, data, use_wrapper_subclass): + if use_wrapper_subclass: + kwargs = {} + kwargs["device"] = data.device + kwargs["dtype"] = data.dtype + kwargs["layout"] = data.layout + kwargs["requires_grad"] = True + kwargs['dispatch_strides'] = True + return torch.Tensor._make_wrapper_subclass(cls, data.size(), **kwargs) # type: ignore[attr-defined] + else: + return torch.Tensor._make_subclass(cls, data, True, dispatch_strides=True) + + for use_wrapper_subclass in [True, False]: + class ExampleTensor1(torch.Tensor): + @staticmethod + def __new__(cls, data, wrapper): + return subclass_helper(cls, data, wrapper) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + return NotImplemented + + class ExampleTensor2(torch.Tensor): + @staticmethod + def __new__(cls, data, wrapper): + return subclass_helper(cls, data, wrapper) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + if func.overloadpacket == torch.ops.aten.is_contiguous: + return contiguous_data.is_contiguous() + return NotImplemented + + class ExampleTensor3(torch.Tensor): + @staticmethod + def __new__(cls, data, wrapper): + return subclass_helper(cls, data, wrapper) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + if func.overloadpacket == torch.ops.aten.is_contiguous: + return not_contiguous_data.is_contiguous() + return NotImplemented + + + err_msg = "no implementation found for 'torch.ops.aten.is_contiguous'" + e = ExampleTensor1(torch.randn(3, 3), use_wrapper_subclass) + with self.assertRaisesRegex(TypeError, err_msg): + e.is_contiguous() + with self.assertRaisesRegex(TypeError, err_msg): + e.contiguous() 
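The dispatch-key walkthrough in `test_multiple_ops_subclass` above describes how the Conjugate key keeps `conj()` lazy and only materializes it (via a clone) when an op such as `exp` has no conj-aware kernel. That part can be observed on plain tensors as well; a small sketch, not part of the patch:

import torch

x = torch.rand(2, 2, dtype=torch.complex64)
y = x.conj()
print(y.is_conj())  # True: the conjugation is only recorded as a bit on the tensor
z = y.exp()         # the conjugate fallback materializes y (clone) before running exp
print(torch.allclose(z, x.conj_physical().exp()))  # True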
+ + e = ExampleTensor2(torch.randn(3, 3), use_wrapper_subclass) + self.assertEqual(e.is_contiguous(), True) + e.contiguous() # this will just return the original TensorImpl since is_contiguous = True + + err_msg = "no implementation found for" + e = ExampleTensor3(torch.randn(3, 3), use_wrapper_subclass) + self.assertEqual(e.is_contiguous(), False) + with self.assertRaisesRegex(TypeError, err_msg): + e.contiguous() + if __name__ == '__main__': run_tests() diff --git a/test/test_pytree.py b/test/test_pytree.py index 81631c45c3fd..6a1c750d49b6 100644 --- a/test/test_pytree.py +++ b/test/test_pytree.py @@ -1,10 +1,11 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: pytree"] import torch from torch.testing._internal.common_utils import TestCase, run_tests from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten, TreeSpec, LeafSpec from torch.utils._pytree import _broadcast_to_and_flatten from collections import namedtuple +from torch.testing._internal.common_utils import parametrize, subtest, instantiate_parametrized_tests class TestPytree(TestCase): def test_treespec_equality(self): @@ -79,11 +80,18 @@ def run_test(tup): run_test(Point(1., 2)) run_test(Point(torch.tensor(1.), 2)) - def test_flatten_unflatten_torch_namedtuple_return_type(self): + @parametrize("op", [ + subtest(torch.max, name='max'), + subtest(torch.min, name='min'), + ]) + def test_flatten_unflatten_return_type(self, op): x = torch.randn(3, 3) - expected = torch.max(x, dim=0) + expected = op(x, dim=0) values, spec = tree_flatten(expected) + # Check that values is actually List[Tensor] and not (ReturnType(...),) + for value in values: + self.assertTrue(isinstance(value, torch.Tensor)) result = tree_unflatten(values, spec) self.assertEqual(type(result), type(expected)) @@ -204,5 +212,7 @@ def test_broadcast_to_and_flatten(self): self.assertEqual(result, expected, msg=str([pytree, to_spec, expected])) +instantiate_parametrized_tests(TestPytree) + if __name__ == '__main__': run_tests() diff --git a/test/test_quantization.py b/test/test_quantization.py index e646750a623e..83d80deece2c 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -31,6 +31,7 @@ from quantization.core.test_workflow_module import TestObserver # noqa: F401 from quantization.core.test_quantized_module import TestStaticQuantizedModule # noqa: F401 from quantization.core.test_quantized_module import TestDynamicQuantizedModule # noqa: F401 +from quantization.core.test_quantized_module import TestReferenceQuantizedModule # noqa: F401 from quantization.core.test_workflow_module import TestRecordHistogramObserver # noqa: F401 from quantization.core.test_workflow_module import TestHistogramObserver # noqa: F401 from quantization.core.test_workflow_module import TestDistributed # noqa: F401 diff --git a/test/test_reductions.py b/test/test_reductions.py index 03d70db31ae0..6e422c9b6c3e 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -13,8 +13,8 @@ from torch._six import inf, nan from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, - integral_types_and, floating_and_complex_types_and + all_types_and_complex_and, get_all_math_dtypes, integral_types, complex_types, floating_types_and, + integral_types_and, floating_and_complex_types_and, all_types_and, ) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, 
torch_to_numpy_dtype_dict, @@ -99,7 +99,7 @@ class TestReductions(TestCase): def _test_dim_keepdim(self, op: ReductionOpInfo, device, *, ndim, **dim_keepdim): """Tests output shape for input with ndim and dim and keepdim kwargs""" shape = torch.randint(2, 5, (ndim,)).tolist() - t = make_tensor(shape, device, torch.float) + t = make_tensor(shape, dtype=torch.float, device=device) args, kwargs = next(op.generate_args_kwargs(t, **dim_keepdim)) result = op(t, *args, **dim_keepdim, **kwargs) expected_shape = _reduced_shape(shape, **dim_keepdim) @@ -207,14 +207,14 @@ def test_dim_offbounds(self, device, op: ReductionOpInfo): def test_dim_ndim_limit(self, device, op: ReductionOpInfo): """Tests that an exception is raised when reducing a tensor with more than 64 dims along some specific dimensions. dim=None is ok""" - t = make_tensor([1] * 65, device, torch.float) + t = make_tensor([1] * 65, dtype=torch.float, device=device) with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"): op(t, dim=0) @ops(filter(lambda op: op.identity is not None, reduction_ops), dtypes=OpDTypes.supported) def test_identity(self, device, dtype, op: ReductionOpInfo): """Tests that the identity value is an identity for the operator""" - t = make_tensor((10,), device, dtype) + t = make_tensor((10,), dtype=dtype, device=device) t[1::2] = op.identity args, kwargs = next(op.generate_args_kwargs(t)) result = op(t[::2], *args, **kwargs) @@ -230,7 +230,7 @@ def test_identity(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16)) def test_nan_policy_propagate(self, device, dtype, op: ReductionOpInfo): """Tests that nan is propagated to the output by default""" - t = make_tensor((5,), device, dtype) + t = make_tensor((5,), dtype=dtype, device=device) t[2] = torch.nan args, kwargs = next(op.generate_args_kwargs(t)) result = op(t, *args, **kwargs) @@ -240,7 +240,7 @@ def test_nan_policy_propagate(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16)) def test_nan_policy_omit(self, device, dtype, op: ReductionOpInfo): """Tests that NaN values do not affect the result.""" - t = make_tensor((10,), device, dtype) + t = make_tensor((10,), dtype=dtype, device=device) t[1::2] = torch.nan args, kwargs = next(op.generate_args_kwargs(t)) result = op(t[::2], *args, **kwargs) @@ -250,7 +250,7 @@ def test_nan_policy_omit(self, device, dtype, op: ReductionOpInfo): @ops(reduction_ops, dtypes=OpDTypes.supported) def test_result_dtype(self, device, dtype, op: ReductionOpInfo): """Tests that the result has the correct dtype""" - t = make_tensor((5,), device, dtype) + t = make_tensor((5,), dtype=dtype, device=device) args, kwargs = next(op.generate_args_kwargs(t)) result: torch.Tensor = op(t, *args, **kwargs) is_integral = dtype in integral_types_and(torch.bool) @@ -275,7 +275,7 @@ def test_empty_tensor_empty_slice(self, device, op: ReductionOpInfo): See discussion here https://github.com/pytorch/pytorch/issues/61901 """ - t = make_tensor((0, 2, 3), device, torch.float) + t = make_tensor((0, 2, 3), dtype=torch.float, device=device) for dim in [0] + [[0, 2]] if op.supports_multiple_dims else []: args, kwargs = next(op.generate_args_kwargs(t, dim=dim)) if op.identity is not None: @@ -295,7 +295,7 @@ def test_empty_tensor_empty_slice(self, device, op: ReductionOpInfo): def test_empty_tensor_nonempty_slice(self, device, op: ReductionOpInfo): """Tests that reducing a nonempty slice of an empty 
tensor returns an empty tensor with the dimensions reduced.""" - t = make_tensor((0, 2, 3), device, torch.float) + t = make_tensor((0, 2, 3), dtype=torch.float, device=device) for dim in [1] + [[1, 2]] if op.supports_multiple_dims else []: args, kwargs = next(op.generate_args_kwargs(t, dim=dim)) result = op(t, *args, dim=dim, **kwargs) @@ -315,31 +315,31 @@ def _test_noncontiguous(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_ @ops(reduction_ops) def test_noncontiguous_innermost(self, device, dtype, op: ReductionOpInfo): """Tests reducing along noncontiguous innermost dimension.""" - t = make_tensor((10, 10), device, dtype, low=-1, high=1) + t = make_tensor((10, 10), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t[:, ::2], dim=1) @ops(reduction_ops) def test_noncontiguous_outermost(self, device, dtype, op: ReductionOpInfo): """Tests reducing along noncontiguous outermost dimension.""" - t = make_tensor((10, 10), device, dtype, low=-1, high=1) + t = make_tensor((10, 10), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t[::2, :], dim=0) @ops(reduction_ops) def test_noncontiguous_all(self, device, dtype, op: ReductionOpInfo): """Tests reducing all dimensions of a noncontiguous tensor.""" - t = make_tensor((5, 5, 5), device, dtype, low=-1, high=1) + t = make_tensor((5, 5, 5), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t[::2, ::3, 1:-1:2]) @ops(reduction_ops) def test_noncontiguous_transposed(self, device, dtype, op: ReductionOpInfo): """Tests reducing a transposed tensor.""" - t = make_tensor((5, 5), device, dtype, low=-1, high=1) + t = make_tensor((5, 5), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t.T) @ops(reduction_ops) def test_noncontiguous_expanded(self, device, dtype, op: ReductionOpInfo): """Tests reducing a tensor with expanded singleton dimensions.""" - t = make_tensor((2, 3), device, dtype, low=-1, high=1) + t = make_tensor((2, 3), dtype=dtype, device=device, low=-1, high=1) self._test_noncontiguous(op, t.unsqueeze(1).expand(-1, 5, -1)) # NumPy does not support BFloat16 so we don't test that against reference @@ -357,16 +357,16 @@ def _test_ref(self, op: ReductionOpInfo, t: torch.Tensor, **reduction_kwargs): self.assertEqual(result, expected, exact_dtype=False) @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + allowed_dtypes=all_types_and_complex_and(torch.half, torch.bool)) def test_ref_scalar_input(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for scalar input tensors""" - self._test_ref(op, make_tensor([], device, dtype)) + self._test_ref(op, make_tensor([], dtype=dtype, device=device)) @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + allowed_dtypes=all_types_and_complex_and(torch.half, torch.bool)) def test_ref_small_input(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for small input tensors""" - t = make_tensor((5, 3, 4, 2), device, dtype, low=-2, high=2, exclude_zero=True) + t = make_tensor((5, 3, 4, 2), dtype=dtype, device=device, low=-2, high=2, exclude_zero=True) self._test_ref(op, t) for dim in [0, 1, 3] + ([[0, 2], [1, 3]] if op.supports_multiple_dims else []): self._test_ref(op, t, dim=dim) @@ -375,26 +375,27 @@ def test_ref_small_input(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=[torch.float64]) def test_ref_large_input_1D(self, device, dtype, 
op: ReductionOpInfo): """Compares op against reference for a large 1D input tensor to check stability""" - self._test_ref(op, make_tensor((2 ** 20,), device, dtype, low=-1, high=1, exclude_zero=True)) + self._test_ref(op, make_tensor((2 ** 20,), dtype=dtype, device=device, low=-1, high=1, exclude_zero=True)) @ops(filter(lambda op: op.ref is not None, reduction_ops), allowed_dtypes=[torch.float64]) def test_ref_large_input_2D(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for a large 2D input tensor to test parallelism""" - t = make_tensor((32, 2 ** 16), device, dtype, low=-1, high=1, exclude_zero=True) + t = make_tensor((32, 2 ** 16), dtype=dtype, device=device, low=-1, high=1, exclude_zero=True) self._test_ref(op, t, dim=1) + @largeTensorTest("8gb") @ops(filter(lambda op: op.ref is not None, reduction_ops), allowed_dtypes=[torch.float64]) def test_ref_large_input_64bit_indexing(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for a very large input tensor that requires 64 bit indexing""" - self._test_ref(op, make_tensor((275000000,), device, dtype, low=-1, high=1, exclude_zero=True)) + self._test_ref(op, make_tensor((275000000,), dtype=dtype, device=device, low=-1, high=1, exclude_zero=True)) @ops(filter(lambda op: op.ref is not None, reduction_ops), - allowed_dtypes=get_all_dtypes(include_bfloat16=False)) + allowed_dtypes=all_types_and_complex_and(torch.half, torch.bool)) def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for input tensors with duplicate values""" - t = make_tensor((4, 4), device, dtype, low=-2, high=2, exclude_zero=True) + t = make_tensor((4, 4), dtype=dtype, device=device, low=-2, high=2, exclude_zero=True) t[::2, ::2] = t[1::2, 1::2] self._test_ref(op, t) self._test_ref(op, t, dim=0) @@ -404,7 +405,7 @@ def test_ref_duplicate_values(self, device, dtype, op: ReductionOpInfo): allowed_dtypes=[torch.float32, torch.complex64]) def test_ref_extremal_values(self, device, dtype, op: ReductionOpInfo): """Compares op against reference for input tensors with extremal values""" - t = make_tensor((5,), device, dtype, exclude_zero=True) + t = make_tensor((5,), dtype=dtype, device=device, exclude_zero=True) extremals = [0, 1, nan, inf, -inf] for extremal in extremals: t[2] = extremal @@ -452,7 +453,7 @@ def test_dim_reduction_less_than_64(self, device): sizes = [1] * 65 x = torch.randn(sizes, device=device) ops = [torch.mean, torch.sum, torch.nansum, torch.std, torch.logsumexp, torch.std, torch.var, - torch.amin, torch.amax, torch.norm] + torch.norm] for op in ops: with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"): op(x, 64) @@ -743,6 +744,15 @@ def test_logsumexp_dim(self, device): lambda n, d: logsumexp(n, d), use_integral=False) + @onlyCPU + def test_mean_int_with_optdtype(self, device): + a = make_tensor((3, 4, 5), dtype=torch.int64, device=device) + + # If the optional desired output type is given, the input + # is internally cast. 
+ a_float = a.to(torch.float32) + self.assertEqual(a_float.mean(), a.mean(dtype=torch.float32)) + # TODO: update this and tests that use it to handle device properly def _test_reduce_integer_upcast(self, fn, has_out=True, test_complex=True): shape = (3, 4, 5) @@ -1101,6 +1111,10 @@ def test_bincount(self, device): self.assertEqual( torch.tensor([1, 1, 1, 2], dtype=torch.int64, device=device), long_counts) + # test avoiding overflow for uint8 (#76979) + count_uint8 = torch.tensor([0, 1, 2, 3, 255], dtype=torch.uint8, device=device).bincount() + count_int16 = torch.tensor([0, 1, 2, 3, 255], dtype=torch.int16, device=device).bincount() + self.assertEqual(count_uint8, count_int16) # test minlength functionality int_counts = torch.bincount( torch.tensor([1, 1, 1, 1], device=device), minlength=5) @@ -1415,7 +1429,7 @@ def test_dtype_bfloat16(values_bf16=False, boundaries_bf16=False): test_dtype_bfloat16(False, True) test_dtype_bfloat16(True, True) - @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_nansum(self, device, dtype): args = product( (True, False), # noncontiguous @@ -1468,15 +1482,14 @@ def _test_reduction_function_with_numpy(self, torch_func, np_func, device, dtype self.compare_with_numpy(torch_func_partial, np_func_partial, x, device=None, dtype=None, atol=atol, rtol=rtol, exact_dtype=exact_dtype) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_count_nonzero(self, device, dtype): self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype) self._test_reduction_function_with_numpy(torch.count_nonzero, np.count_nonzero, device, dtype, True) def _test_sum_reduction_vs_numpy(self, torch_fn, np_fn, device, dtype, with_keepdim=False, with_extremal=False): def is_integral(dtype): - return dtype in get_all_int_dtypes() + return dtype in integral_types() # On Windows CI, the current version of `numpy` promotes all lower integers # dtypes to int32 while `torch` promotes them to int64. 
Hence we skip on checking @@ -1505,28 +1518,30 @@ def is_integral(dtype): with_keepdim=with_keepdim, with_extremal=with_extremal) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.half)) def test_sum_vs_numpy(self, device, dtype): self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype) self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_extremal=True) self._test_sum_reduction_vs_numpy(torch.sum, np.sum, device, dtype, with_keepdim=True) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.half)) def test_nansum_vs_numpy(self, device, dtype): self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype) self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_extremal=True) self._test_sum_reduction_vs_numpy(torch.nansum, np.nansum, device, dtype, with_keepdim=True) - @dtypes(*(get_all_complex_dtypes())) + @dtypes(*complex_types()) def test_nansum_complex(self, device, dtype): x = torch.randn((3, 3, 3), device=device, dtype=dtype) with self.assertRaisesRegex(RuntimeError, "nansum does not support complex inputs"): torch.nansum(x) - def test_nansum_out_dtype(self, device): - dtypes = list(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False)) - for inp_dtype, out_dtype in combinations(dtypes, 2): + @dtypes(*all_types_and(torch.half)) + def test_nansum_out_dtype(self, device, dtype): + out_dtype = dtype + inp_dtypes = all_types_and(torch.half) if out_dtype.is_floating_point else integral_types() + for inp_dtype in inp_dtypes: shape = _rand_shape(random.randint(2, 5), min_size=5, max_size=10) x = _generate_input(shape, inp_dtype, device, with_extremal=False) torch_fn = partial(torch.nansum, dtype=out_dtype) @@ -1534,7 +1549,7 @@ def test_nansum_out_dtype(self, device): np_fn = partial(np.nansum, dtype=np_out_dtype) self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.half)) def test_argminmax_multiple(self, device, dtype): # Case: All Ones t = torch.ones(3, 3, device=device, dtype=dtype) @@ -1542,7 +1557,7 @@ def test_argminmax_multiple(self, device, dtype): self.compare_with_numpy(torch.argmin, np.argmin, t) # Case: With single `nan` present. - if dtype in get_all_fp_dtypes(): + if dtype in floating_types_and(torch.half, torch.bfloat16): t[2, 2] = float('nan') self.compare_with_numpy(torch.argmax, np.argmax, t) self.compare_with_numpy(torch.argmin, np.argmin, t) @@ -1619,8 +1634,7 @@ def verify_against_numpy(t): [0, 0]], device=device, dtype=dtype) verify_against_numpy(t) - @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True))) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_all_any_vs_numpy(self, device, dtype): # Note [all, any uint8 compatibility]: However for compatibility reason, # for `uint8`, they return Tensor of same dtype `uint8`. 
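As the compatibility note above says, `all`/`any` keep the `uint8` dtype for `uint8` inputs instead of returning `bool`. A small sketch of the behaviour being tested; illustrative only:

import torch

b = torch.tensor([1, 0, 1], dtype=torch.bool)
u = torch.tensor([1, 0, 1], dtype=torch.uint8)

print(b.all().dtype)  # torch.bool
print(u.all().dtype)  # torch.uint8 (legacy behaviour kept for backward compatibility)
print(u.any().dtype)  # torch.uint8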
@@ -1735,7 +1749,7 @@ def _test_output_dtype(x): @onlyNativeDeviceTypes def test_repeated_dim(self, device): ops = [torch.mean, torch.sum, torch.nansum, torch.std, torch.logsumexp, torch.std, torch.var, - torch.amin, torch.amax, torch.norm] + torch.norm] x = torch.randn(3, 3, 3, 3, device=device) error_msg = r'appears multiple times in the list of dims' @@ -1835,10 +1849,6 @@ def test_minmax_illegal_dtype(self, device): torch.max(x, dim=0, out=(illegal_values, valid_indices)) with self.assertRaisesRegex(RuntimeError, rmsg): torch.min(x, dim=0, out=(illegal_values, valid_indices)) - with self.assertRaisesRegex(RuntimeError, rmsg): - torch.amax(x, dim=0, out=illegal_values) - with self.assertRaisesRegex(RuntimeError, rmsg): - torch.amin(x, dim=0, out=illegal_values) with self.assertRaisesRegex(RuntimeError, rmsg): torch.max(x, dim=0, out=(valid_values, illegal_indices)) with self.assertRaisesRegex(RuntimeError, rmsg): @@ -1848,7 +1858,7 @@ def test_minmax_illegal_dtype(self, device): with self.assertRaisesRegex(RuntimeError, rmsg): torch.min(x, dim=0, out=(illegal_values, illegal_indices)) - @dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_dim_arg_reduction_scalar(self, device, dtype): example = 4.0 @@ -1866,7 +1876,7 @@ def test_dim_arg_reduction_scalar(self, device, dtype): @precisionOverride({torch.float16: 1e-2, torch.bfloat16: 1e-2}) - @dtypes(*(set(get_all_dtypes(include_bool=False, include_complex=False)) - {torch.uint8})) + @dtypes(*set(all_types_and(torch.half, torch.bfloat16)) - {torch.uint8}) def test_dim_reduction(self, device, dtype): example = [[-1, 2, 1], [5, 3, 6]] @@ -2792,6 +2802,15 @@ def test_against_np(tensor, bins=100, min=0, max=0): expanded = torch.randn(1, 5, 1, 2, device=device).expand(3, 5, 7, 2) test_against_np(expanded) + @onlyCPU + def test_histc_bfloat16(self, device): + actual = torch.histc( + torch.tensor([1, 2, 1], dtype=torch.bfloat16, device=device), bins=4, min=0, max=3) + self.assertEqual( + torch.tensor([0, 2, 1, 0], dtype=torch.bfloat16, device=device), + actual) + self.assertEqual(actual.dtype, torch.bfloat16) + """ Runs torch.histogram and numpy.histogram on the specified input parameters and asserts that their output is equal. 
@@ -2862,8 +2881,8 @@ def test_histogram(self, device, dtype): for contig, bins_contig, bin_ct, weighted, density, shape in \ product([True, False], [True, False], range(1, 10), [True, False], [True, False], shapes): - values = make_tensor(shape, device, dtype, low=-9, high=9, noncontiguous=not contig) - weights = make_tensor(shape, device, dtype, low=0, high=9, noncontiguous=not contig) if weighted else None + values = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9, noncontiguous=not contig) + weights = make_tensor(shape, dtype=dtype, device=device, low=0, high=9, noncontiguous=not contig) if weighted else None # Tests passing just the bin_ct self._test_histogram_numpy(values, bin_ct, None, weights, density) @@ -2877,10 +2896,10 @@ def test_histogram(self, device, dtype): self._test_histogram_numpy(values, bin_ct, bin_range, weights, density) # Tests with caller-specified bin edges - bin_edges = make_tensor(bin_ct + 1, device, dtype, low=-9, high=9).msort() + bin_edges = make_tensor(bin_ct + 1, dtype=dtype, device=device, low=-9, high=9).msort() if not bins_contig: # Necessary because msort always produces contiguous output - bin_edges_noncontig = make_tensor(bin_ct + 1, device, dtype, noncontiguous=not bins_contig) + bin_edges_noncontig = make_tensor(bin_ct + 1, dtype=dtype, device=device, noncontiguous=not bins_contig) bin_edges_noncontig.copy_(bin_edges) bin_edges = bin_edges_noncontig self.assertEqual(bin_edges.is_contiguous(), bins_contig) @@ -2888,17 +2907,21 @@ def test_histogram(self, device, dtype): # Tests with input tensor in which all elements are equal elt = random.uniform(-9, 9) - values = make_tensor(shape, device, dtype, low=elt, high=elt, noncontiguous=not contig) + values = make_tensor(shape, dtype=dtype, device=device, low=elt, high=elt, noncontiguous=not contig) self._test_histogram_numpy(values, bin_ct, bin_range, weights, density) self._test_histogram_numpy(values, bin_edges, None, weights, density) # Tests with input equal to bin_edges - weights = make_tensor(bin_ct + 1, device, dtype, low=0, high=9, noncontiguous=not contig) if weighted else None + weights = ( + make_tensor(bin_ct + 1, dtype=dtype, device=device, low=0, high=9, noncontiguous=not contig) + if weighted + else None + ) self._test_histogram_numpy(bin_edges, bin_edges, None, weights, density) # Tests values of default args for bin_ct, shape in product(range(1, 10), shapes): - values = make_tensor(shape, device, dtype, low=-9, high=9) + values = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) (actual_hist, actual_bin_edges) = torch.histogram(values, bin_ct) (expected_hist, expected_bin_edges) = torch.histogram( values, bin_ct, range=None, weight=None, density=False) @@ -2982,8 +3005,12 @@ def test_histogramdd(self, device, dtype): product([True, False], [True, False], [True, False], [True, False], shapes): D = shape[-1] - values = make_tensor(shape, device, dtype, low=-9, high=9, noncontiguous=not contig) - weights = make_tensor(shape[:-1], device, dtype, low=0, high=9, noncontiguous=not contig) if weighted else None + values = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9, noncontiguous=not contig) + weights = ( + make_tensor(shape[:-1], dtype=dtype, device=device, low=0, high=9, noncontiguous=not contig) + if weighted + else None + ) # Tests passing a single bin count bin_ct = random.randint(1, 5) @@ -3004,10 +3031,13 @@ def test_histogramdd(self, device, dtype): self._test_histogramdd_numpy(values, bin_ct, bin_range, weights, density) # Tests with 
caller-specified bin edges - bin_edges = [make_tensor(ct + 1, device, dtype, low=-9, high=9).msort() for ct in bin_ct] + bin_edges = [make_tensor(ct + 1, dtype=dtype, device=device, low=-9, high=9).msort() for ct in bin_ct] if not bins_contig: # Necessary because msort always produces contiguous output - bin_edges_noncontig = [make_tensor(ct + 1, device, dtype, noncontiguous=not bins_contig) for ct in bin_ct] + bin_edges_noncontig = [ + make_tensor(ct + 1, dtype=dtype, device=device, noncontiguous=not bins_contig) + for ct in bin_ct + ] for dim in range(D): bin_edges_noncontig[dim].copy_(bin_edges[dim]) bin_edges = bin_edges_noncontig @@ -3019,58 +3049,58 @@ def test_histogramdd(self, device, dtype): @dtypes(torch.float32) def test_histogram_error_handling(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'not implemented for'): - values = make_tensor((), device, dtype=torch.int32) + values = make_tensor((), dtype=torch.int32, device=device) torch.histogram(values, 1) inconsistent_dtype = torch.float32 if dtype != torch.float32 else torch.float64 with self.assertRaisesRegex(RuntimeError, 'input tensor and bins tensors should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - bins = make_tensor((), device, dtype=inconsistent_dtype) + values = make_tensor((), dtype=dtype, device=device) + bins = make_tensor((), dtype=inconsistent_dtype, device=device) torch.histogram(values, bins) with self.assertRaisesRegex(RuntimeError, 'input tensor and weight tensor should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - weight = make_tensor((), device, dtype=inconsistent_dtype) + values = make_tensor((), dtype=dtype, device=device) + weight = make_tensor((), dtype=inconsistent_dtype, device=device) torch.histogram(values, 1, weight=weight) with self.assertRaisesRegex(RuntimeError, 'input tensor and hist tensor should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - hist = make_tensor((), device, dtype=inconsistent_dtype) - bin_edges = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) + hist = make_tensor((), dtype=inconsistent_dtype, device=device) + bin_edges = make_tensor((), dtype=dtype, device=device) torch.histogram(values, 1, out=(hist, bin_edges)) with self.assertRaisesRegex(RuntimeError, 'input tensor and bin_edges tensor should have the same dtype'): - values = make_tensor((), device, dtype=dtype) - hist = make_tensor((), device, dtype=dtype) - bin_edges = make_tensor((), device, dtype=inconsistent_dtype) + values = make_tensor((), dtype=dtype, device=device) + hist = make_tensor((), dtype=dtype, device=device) + bin_edges = make_tensor((), dtype=inconsistent_dtype, device=device) torch.histogram(values, 1, out=(hist, bin_edges)) with self.assertRaisesRegex(RuntimeError, 'bins tensor should have one dimension'): - t = make_tensor((2, 2), device, dtype=dtype) + t = make_tensor((2, 2), dtype=dtype, device=device) torch.histogram(t, t) with self.assertRaisesRegex(RuntimeError, 'bins tensor should have at least 1 element'): - t = make_tensor((0), device, dtype=dtype) + t = make_tensor((0), dtype=dtype, device=device) torch.histogram(t, t) with self.assertRaisesRegex(RuntimeError, 'bins must be > 0'): - values = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) torch.histogram(values, -1) with self.assertRaisesRegex(RuntimeError, 'if weight tensor is provided it should have the same shape \ as the input tensor excluding its innermost 
dimension'): - values = make_tensor((2, 2), device, dtype=dtype) - weight = make_tensor((1), device, dtype=dtype) + values = make_tensor((2, 2), dtype=dtype, device=device) + weight = make_tensor((1), dtype=dtype, device=device) torch.histogram(values, 1, weight=weight) with self.assertRaisesRegex(TypeError, 'received an invalid combination of arguments'): - values = make_tensor((), device, dtype=dtype) - bin_edges = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) + bin_edges = make_tensor((), dtype=dtype, device=device) torch.histogram(values, bin_edges, range=(0, 1)) with self.assertRaisesRegex(RuntimeError, 'min should not exceed max'): - values = make_tensor((), device, dtype=dtype) + values = make_tensor((), dtype=dtype, device=device) torch.histogram(values, 2, range=(1, 0)) with self.assertRaisesRegex(RuntimeError, r'range \[nan, nan\] is not finite'): @@ -3230,8 +3260,7 @@ def test_reduction_empty_any_all(self, device): shape = (2, 0, 4) x = torch.randn(shape, device=device) - for dtype in get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True): + for dtype in all_types_and_complex_and(torch.half, torch.bool): # Refer: [all, any uint8 compatibility] if dtype == torch.uint8: out_dtype = torch.uint8 diff --git a/test/test_scatter_gather_ops.py b/test/test_scatter_gather_ops.py index cd944da73667..d82cdca5534e 100644 --- a/test/test_scatter_gather_ops.py +++ b/test/test_scatter_gather_ops.py @@ -10,7 +10,9 @@ (run_tests, TestCase,) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, dtypesIfCUDA, - toleranceOverride, tol) + toleranceOverride, tol,) +from torch.testing._internal.common_dtype import \ + (get_all_dtypes, get_all_fp_dtypes,) # Protects against includes accidentally setting the default dtype assert torch.get_default_dtype() is torch.float32 @@ -22,13 +24,16 @@ class TestScatterGather(TestCase): # Fills an index tensor with valid indices - def _fill_indices(self, idx, dim, dim_size, elems_per_row, m, n, o): + def _fill_indices(self, idx, dim, dim_size, elems_per_row, m, n, o, unique_indices=True): for i in range(1 if dim == 0 else m): for j in range(1 if dim == 1 else n): for k in range(1 if dim == 2 else o): ii = [i, j, k] ii[dim] = slice(0, idx.size(dim) + 1) - idx[tuple(ii)] = torch.randperm(dim_size)[0:elems_per_row] + if unique_indices: + idx[tuple(ii)] = torch.randperm(dim_size)[0:elems_per_row] + else: + idx[tuple(ii)] = torch.randint(dim_size, (elems_per_row,)) @dtypes(torch.float32, torch.complex64) def test_gather(self, device, dtype): @@ -67,7 +72,8 @@ def test_gather_bool(self, device, dtype): expected = torch.tensor(((False, False), (True, True)), device=device, dtype=dtype) self.assertEqual(actual, expected, atol=0, rtol=0) - def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): + def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction, + unique_indices=True, include_self=True): m, n, o = random.randint(10, 20), random.randint(10, 20), random.randint(10, 20) elems_per_row = random.randint(1, 10) dim = random.randrange(3) @@ -75,7 +81,7 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): idx_size = [m, n, o] idx_size[dim] = elems_per_row idx = torch.empty(tuple(idx_size), device=device, dtype=torch.long) - self._fill_indices(idx, dim, ([m, n, o])[dim], elems_per_row, m, n, o) + self._fill_indices(idx, dim, ([m, n, o])[dim], elems_per_row, m, n, o, unique_indices) if 
is_scalar: src = random.random() @@ -85,11 +91,15 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): base = make_tensor((m, n, o), device=device, dtype=dtype) if reduction is not None: - actual = fn(base.clone(), dim, idx, src, reduce=reduction) + if fn is torch.Tensor.scatter_reduce_: + actual = fn(base.clone(), dim, idx, src, reduce=reduction, include_self=include_self) + else: + actual = fn(base.clone(), dim, idx, src, reduce=reduction) else: actual = fn(base.clone(), dim, idx, src) expected = base.clone() + counts = torch.zeros(base.shape, dtype=torch.long, device=device) + include_self for i in range(idx_size[0]): for j in range(idx_size[1]): for k in range(idx_size[2]): @@ -98,16 +108,35 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction): if fn is torch.Tensor.scatter_add_: expected[tuple(ii)] += src[i, j, k] else: - # method may be 'scatter_' or 'scatter' - # both might have a reduction argument + # method may be 'scatter_', 'scatter', 'scatter_reduce' + # or 'scatter_reduce_', the former two might have a reduction argument + # while the latter two always do value = src if is_scalar else src[i, j, k] - if reduction == "add": - expected[tuple(ii)] += value - elif reduction == "multiply": - expected[tuple(ii)] *= value - else: + if ((not include_self) and counts[tuple(ii)] == 0): expected[tuple(ii)] = value + else: + if reduction == "add" or reduction == "sum": + expected[tuple(ii)] += value + elif reduction == "multiply" or reduction == "prod": + expected[tuple(ii)] *= value + elif reduction == "amax": + expected[tuple(ii)] = max(expected[tuple(ii)], value) + elif reduction == "amin": + expected[tuple(ii)] = min(expected[tuple(ii)], value) + elif reduction == "mean": + expected[tuple(ii)] += value + else: + expected[tuple(ii)] = value + + counts[tuple(ii)] += 1 + + if (reduction == "mean"): + counts.masked_fill_(counts == 0, 1) + if (dtype.is_floating_point or dtype.is_complex): + expected /= counts + else: + expected.div_(counts, rounding_mode="floor") self.assertEqual(actual, expected, atol=0, rtol=0) @@ -158,6 +187,67 @@ def test_scatter_add_mult_index_base(self, device, dtype): self.assertEqual(res0[0, :], m * torch.ones(n, device=device, dtype=dtype), atol=0, rtol=0) self.assertEqual(res1[:, 0], n * torch.ones(m, device=device, dtype=dtype), atol=0, rtol=0) + # FIXME: discrepancy between bool ReduceAdd on CUDA and CPU (a + b on CPU and buggy a && b on CUDA) + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_bool=False)) + def test_scatter_reduce_sum(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='sum', unique_indices=False, + include_self=include_self) + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_prod(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='prod', unique_indices=False, + include_self=include_self) + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_bool=False)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_mean(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, 
dtype=dtype, + is_scalar=False, reduction='mean', unique_indices=False, + include_self=include_self) + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_amax(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='amax', unique_indices=False, + include_self=include_self) + # simple test for nan/inf propagation + if (dtype.is_floating_point): + input = torch.zeros(3, device=device, dtype=dtype) + src = torch.tensor([1, float('nan'), -float('inf'), -float('inf'), 2, float('inf')], device=device, dtype=dtype) + idx = torch.tensor([0, 0, 1, 1, 2, 2], device=device) + input.scatter_reduce_(0, idx, src, 'amax', include_self=include_self) + expected_result = torch.tensor([float('nan'), -float('inf'), float('inf')], device=device, dtype=dtype) + if (include_self): + expected_result[1] = 0 + self.assertEqual(input, expected_result) + + + @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) + @dtypesIfCUDA(*get_all_fp_dtypes(include_half=True, include_bfloat16=True)) + def test_scatter_reduce_amin(self, device, dtype): + for include_self in (True, False): + self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype, + is_scalar=False, reduction='amin', unique_indices=False, + include_self=include_self) + # simple test for nan/inf propagation + if (dtype.is_floating_point): + input = torch.zeros(3, device=device, dtype=dtype) + src = torch.tensor([1, float('nan'), -2, -float('inf'), float('inf'), float('inf')], device=device, dtype=dtype) + idx = torch.tensor([0, 0, 1, 1, 2, 2], device=device) + input.scatter_reduce_(0, idx, src, 'amin', include_self=include_self) + expected_result = torch.tensor([float('nan'), -float('inf'), float('inf')], device=device, dtype=dtype) + if (include_self): + expected_result[2] = 0 + self.assertEqual(input, expected_result) + # Generic Device Test Framework instantation, see # https://github.com/pytorch/pytorch/wiki/Running-and-writing-tests diff --git a/test/test_serialization.py b/test/test_serialization.py index a4fa6e8c9ba5..2643b4bcad5c 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -20,10 +20,10 @@ from torch._utils import _rebuild_tensor from torch.serialization import check_module_version_greater_or_equal -from torch.testing._internal.common_utils import TestCase, IS_WINDOWS, \ - TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName +from torch.testing._internal.common_utils import TestCase, IS_WINDOWS, TEST_DILL, \ + run_tests, download_file, BytesIOContext, TemporaryFileName, parametrize, instantiate_parametrized_tests from torch.testing._internal.common_device_type import instantiate_device_type_tests -from torch.testing._internal.common_dtype import get_all_dtypes +from torch.testing._internal.common_dtype import all_types_and_complex_and # These tests were all copied from `test/test_torch.py` at some point, so see # the actual blame, see this revision @@ -97,7 +97,7 @@ def _test_serialization_assert(self, b, c): self.assertTrue(isinstance(c[1], torch.FloatTensor)) self.assertTrue(isinstance(c[2], torch.FloatTensor)) self.assertTrue(isinstance(c[3], torch.FloatTensor)) - self.assertTrue(isinstance(c[4], torch.storage.TypedStorage)) + self.assertTrue(isinstance(c[4], torch.storage._TypedStorage)) 
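The scatter_reduce_ coverage added above exercises both the reduce= and include_self= keywords; a small sketch of the semantics being tested, assuming a build where Tensor.scatter_reduce_ accepts both arguments:

import torch

base = torch.zeros(3)
src = torch.tensor([1., 2., 3., 4.])
idx = torch.tensor([0, 0, 1, 2])

# With include_self=False the existing zeros in `base` are ignored,
# so each output slot reduces only over its scattered sources.
out = base.clone().scatter_reduce_(0, idx, src, reduce='amax', include_self=False)
# out == tensor([2., 3., 4.])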
self.assertEqual(c[4].dtype, torch.float) c[0].fill_(10) self.assertEqual(c[0], c[2], atol=0, rtol=0) @@ -370,7 +370,7 @@ def test_serialization_backwards_compat(self): self.assertTrue(isinstance(c[1], torch.FloatTensor)) self.assertTrue(isinstance(c[2], torch.FloatTensor)) self.assertTrue(isinstance(c[3], torch.FloatTensor)) - self.assertTrue(isinstance(c[4], torch.storage.TypedStorage)) + self.assertTrue(isinstance(c[4], torch.storage._TypedStorage)) self.assertEqual(c[4].dtype, torch.float32) c[0].fill_(10) self.assertEqual(c[0], c[2], atol=0, rtol=0) @@ -414,7 +414,7 @@ def test_serialization_save_warnings(self): with warnings.catch_warnings(record=True) as warns: with tempfile.NamedTemporaryFile() as checkpoint: x = torch.save(torch.nn.Linear(2, 3), checkpoint) - self.assertEquals(len(warns), 0) + self.assertEqual(len(warns), 0) def test_serialization_map_location(self): test_file_path = download_file('https://download.pytorch.org/test_data/gpu_tensors.pt') @@ -616,11 +616,12 @@ def save_load_check(a, b): self.assertEqual(a, a_loaded) self.assertEqual(b, b_loaded) - for device, dtype in product(devices, get_all_dtypes()): + for device, dtype in product(devices, all_types_and_complex_and(torch.half, + torch.bfloat16, torch.bool)): a = torch.tensor([], dtype=dtype, device=device) - for other_dtype in get_all_dtypes(): - s = torch.TypedStorage( + for other_dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): + s = torch._TypedStorage( wrap_storage=a.storage()._untyped(), dtype=other_dtype) save_load_check(a, s) @@ -652,7 +653,7 @@ def test_save_different_dtype_error(self): torch.save([a.storage(), a.imag.storage()], f) a = torch.randn(10, device=device) - s_bytes = torch.TypedStorage( + s_bytes = torch._TypedStorage( wrap_storage=a.storage()._untyped(), dtype=torch.uint8) @@ -726,7 +727,7 @@ def import_module(name, filename): loaded = torch.load(checkpoint) self.assertTrue(isinstance(loaded, module.Net)) if can_retrieve_source: - self.assertEquals(len(w), 0) + self.assertEqual(len(w), 0) # Replace the module with different source fname = get_file_path_2(os.path.dirname(os.path.dirname(torch.__file__)), 'torch', 'testing', @@ -737,7 +738,7 @@ def import_module(name, filename): loaded = torch.load(checkpoint) self.assertTrue(isinstance(loaded, module.Net)) if can_retrieve_source: - self.assertEquals(len(w), 1) + self.assertEqual(len(w), 1) self.assertTrue(w[0].category, 'SourceChangeWarning') def test_serialization_container(self): @@ -869,6 +870,9 @@ def __new__(cls, elem, *args, **kwargs): r.elem = elem return r + def clone(self): + return type(self)(self.elem.clone()) + class TestGetStateSubclass(torch.Tensor): elem: torch.Tensor @@ -944,8 +948,18 @@ def test_tensor_subclass_deepcopy(self): self.assertEqual(new_tensor.elem, my_tensor.elem) self.assertEqual(new_tensor.foo, foo_val) + @parametrize('requires_grad', (True, False)) + def test_cloned_deepcopy(self, requires_grad): + my_tensor = torch.rand(2, requires_grad=requires_grad, device='meta') + + new_tensor = deepcopy(my_tensor) + + self.assertEqual(new_tensor.requires_grad, my_tensor.requires_grad) + + instantiate_device_type_tests(TestBothSerialization, globals()) +instantiate_parametrized_tests(TestSubclassSerialization) if __name__ == '__main__': run_tests() diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 0267852ceb6a..b6557eed0d25 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -15,7 +15,7 @@ from torch.testing._internal.common_device_type import ( 
instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyNativeDeviceTypes, dtypesIfCUDA, largeTensorTest) -from torch.testing._internal.common_dtype import get_all_dtypes +from torch.testing._internal.common_dtype import all_types_and_complex_and, all_types, all_types_and # TODO: replace with make_tensor def _generate_input(shape, dtype, device, with_extremal): @@ -227,12 +227,11 @@ def test_diagonal_multidim(self, device, dtype): self.assertEqual(expected, result) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_complex=False, include_bool=False, include_half=False, - include_bfloat16=False)) - @dtypesIfCUDA(*get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=False)) + @dtypes(*all_types()) + @dtypesIfCUDA(*all_types_and(torch.half)) def test_trace(self, device, dtype): def test(shape): - tensor = make_tensor(shape, device, dtype, low=-9, high=9) + tensor = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) expected_dtype = tensor.sum().dtype expected_dtype = torch_to_numpy_dtype_dict[expected_dtype] @@ -341,7 +340,7 @@ def test_clamp_raises_arg_errors(self, device): with self.assertRaisesRegex(RuntimeError, error_msg): torch.clamp(X) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_flip(self, device, dtype): make_from_data = partial(torch.tensor, device=device, dtype=dtype) make_from_size = partial(make_tensor, device=device, dtype=dtype) @@ -440,7 +439,7 @@ def gen_data(): for dims in test_dims: self.assertEqual(size, list(data.flip(dims).size())) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_flip_errors(self, device, dtype): make_arg = partial(make_tensor, dtype=dtype, device=device) data = make_arg((2, 2, 2)) @@ -458,7 +457,7 @@ def test_flip_errors(self, device, dtype): def _rand_shape(self, dim, min_size, max_size): return tuple(torch.randint(min_size, max_size + 1, (dim,))) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_flip_numpy(self, device, dtype): make_arg = partial(make_tensor, dtype=dtype, device=device) @@ -567,7 +566,7 @@ def test_nonzero_no_warning(self, device): t.nonzero() self.assertEqual(len(w), 0) - @dtypes(*get_all_dtypes(include_complex=False)) + @dtypes(*all_types_and(torch.half, torch.bool, torch.bfloat16)) def test_nonzero(self, device, dtype): shapes = [ diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index b44b09ffa1dc..19394c0809c8 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -8,14 +8,12 @@ from itertools import permutations, product from torch.testing import make_tensor -from torch.testing._internal.common_dtype import ( - all_types, all_types_and, floating_types_and, get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, -) +from torch.testing._internal.common_dtype import all_types, all_types_and, floating_types_and from torch.testing._internal.common_utils import \ - (TEST_WITH_ROCM, TestCase, run_tests, slowTest) + (TestCase, run_tests, slowTest) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, onlyNativeDeviceTypes, - skipCUDAIfRocm, onlyCUDA, dtypesIfCUDA, dtypesIfCPU, onlyCPU, largeTensorTest) + onlyCUDA, dtypesIfCUDA, dtypesIfCPU, onlyCPU, largeTensorTest) # TODO: remove this SIZE = 100 @@ -132,11 +130,25 @@ def test_sort(self, device): self.assertIsOrdered('descending', x, res2val, res2ind, 
'random with NaNs') + @onlyCUDA + def test_sort_large_slice(self, device): + # tests direct cub path + x = torch.randn(4, 1024000, device=device) + res1val, res1ind = torch.sort(x, stable=True) + torch.cuda.synchronize() + # assertIsOrdered is too slow, so just compare to cpu + res1val_cpu, res1ind_cpu = torch.sort(x.cpu(), stable=True) + self.assertEqual(res1val, res1val_cpu.cuda()) + self.assertEqual(res1ind, res1ind_cpu.cuda()) + res1val, res1ind = torch.sort(x, descending=True, stable=True) + torch.cuda.synchronize() + res1val_cpu, res1ind_cpu = torch.sort(x.cpu(), descending=True, stable=True) + self.assertEqual(res1val, res1val_cpu.cuda()) + self.assertEqual(res1ind, res1ind_cpu.cuda()) + # FIXME: remove torch.bool from unsupported types once support is added for cub sort - @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_stable_sort(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return sizes = (100, 1000, 10000) for ncopies in sizes: x = torch.tensor([0, 1] * ncopies, dtype=dtype, device=device) @@ -167,6 +179,23 @@ def test_sort_large(self, device, dtype): self.assertEqual(vm, torch.arange(255, dtype=dtype, device=device)) self.assertEqual(im, t0.sort().indices) + + @dtypes(torch.float32) + def test_sort_restride(self, device, dtype): + # Input: non-contiguous (stride: 5) 3-element array + tensor = torch.randn((3, 5), dtype=dtype, device=device)[:, 0] + # Outputs: 0-dim tensors + # They will need to be resized, which means they will also be + # restrided with the input tensor's strides as base. + values = torch.tensor(0, dtype=dtype, device=device) + indices = torch.tensor(0, dtype=torch.long, device=device) + torch.sort(tensor, out=(values, indices)) + # Check: outputs were restrided to dense strides + self.assertEqual(values.stride(), (1,)) + self.assertEqual(indices.stride(), (1,)) + # Check: 'tensor' indexed by 'indices' is equal to 'values' + self.assertEqual(tensor[indices], values) + def _test_sort_discontiguous(self, device, dtype): # on CUDA 2048 vs >2048 have different code path for the dim being sorted sizes = (5, 7, 2049) @@ -228,10 +257,8 @@ def test_topk_1d_output_discontiguous(self, device, dtype): self.assertEqual(values, values_cont) # FIXME: remove torch.bool from unsupported types once support is added for cub sort - @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_stable_sort_against_numpy(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return if dtype in floating_types_and(torch.float16, torch.bfloat16): inf = float('inf') neg_inf = -float('inf') @@ -293,13 +320,10 @@ def repeated_index_fill(t, dim, idxs, vals): idx_numpy = np.argsort(sample_numpy, axis=dim, kind='stable') self.assertEqual(idx_torch, idx_numpy) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_msort(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return - def test(shape): - tensor = make_tensor(shape, device, dtype, low=-9, high=9) + tensor = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) if tensor.size() != torch.Size([]): if dtype is torch.bfloat16: expected = torch.from_numpy(np.msort(tensor.float().cpu().numpy())).bfloat16() @@ -385,7 +409,6 @@ def test_topk_arguments(self, device): # Make sure True isn't mistakenly taken as the 2nd dimension 
(interpreted as 1) self.assertRaises(TypeError, lambda: q.topk(4, True)) - @skipCUDAIfRocm def test_unique_dim(self, device): self.assertFalse(hasattr(torch, 'unique_dim')) @@ -441,7 +464,7 @@ def run_test(device, dtype): device=device) expected_inverse_dim2 = torch.tensor([0, 1]) expected_counts_dim2 = torch.tensor([1, 1]) - expected_unique_empty = torch.tensor([], dtype=dtype, device=device) + expected_unique_empty = torch.empty(5, 0, dtype=dtype, device=device) expected_inverse_empty = torch.tensor([], dtype=torch.long, device=device) expected_counts_empty = torch.tensor([], dtype=torch.long, device=device) if dtype in floating_types_and(torch.float16, torch.bfloat16): @@ -685,7 +708,6 @@ def test_topk_integral(self, device, dtype): @onlyCUDA @dtypes(torch.bfloat16) - @skipCUDAIfRocm def test_topk_bfloat16(self, device, dtype): small = 10 @@ -694,12 +716,9 @@ def test_topk_bfloat16(self, device, dtype): for curr_size in (small, large, verylarge): self._test_topk_dtype(device, dtype, False, curr_size) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.float, torch.double, torch.bfloat16) def test_topk_nonfinite(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return - x = torch.tensor([float('nan'), float('inf'), 1e4, 0, -1e4, -float('inf')], device=device, dtype=dtype) val, idx = x.topk(4) expect = torch.tensor([float('nan'), float('inf'), 1e4, 0], device=device, dtype=dtype) @@ -728,15 +747,9 @@ def test_topk_4d(self, device): self.assertEqual(ind, expected_ind, atol=0, rtol=0) @onlyNativeDeviceTypes - @dtypesIfCUDA(*(get_all_dtypes(include_complex=False, - include_bool=False, - include_half=False, - include_bfloat16=True))) - @dtypes(*(get_all_dtypes(include_complex=False, include_bool=False, include_half=False, include_bfloat16=False))) + @dtypesIfCUDA(*all_types_and(torch.bfloat16)) + @dtypes(*all_types()) def test_topk_zero(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return - # https://github.com/pytorch/pytorch/issues/49205 t = torch.rand(2, 2, device=device).to(dtype=dtype) val, idx = torch.topk(t, k=0, largest=False) @@ -789,12 +802,9 @@ def ensure_tuple(x): self.assertEqual(expected_inverse.view(additional_shape), y_inverse) self.assertEqual(expected_counts, y_counts) - @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) + @dtypesIfCPU(*all_types_and(torch.bool, torch.bfloat16)) + @dtypes(*all_types_and(torch.half, torch.bool)) def test_unique(self, device, dtype): - if dtype is torch.half and self.device_type == 'cpu': - return # CPU does not have half support - def ensure_tuple(x): if isinstance(x, torch.Tensor): return (x,) @@ -849,12 +859,9 @@ def ensure_tuple(x): count += 1 self.assertEqual(j, count) - @dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) - @dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) + @dtypesIfCPU(*all_types_and(torch.bool, torch.bfloat16)) + @dtypes(*all_types_and(torch.half, torch.bool)) def test_unique_consecutive(self, device, dtype): - if dtype is torch.half and self.device_type == 'cpu': - return # CPU does not have half support - if dtype is torch.bool: x = torch.tensor([True, False, False, False, True, True, False, False, False], dtype=torch.bool, device=device) expected_unique = torch.tensor([True, False, True, False], dtype=torch.bool, device=device) diff 
--git a/test/test_sparse.py b/test/test_sparse.py index cbc98f572bd8..07a8fd2a03de 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -7,9 +7,6 @@ import random import unittest from torch.testing import make_tensor -from torch.testing._internal.common_dtype import ( - all_types_and_complex, -) from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ do_test_empty_full, load_tests, TEST_NUMPY, IS_WINDOWS, gradcheck, coalescedonoff, \ DeterministicGuard, first_sample @@ -17,16 +14,16 @@ from numbers import Number from typing import Dict, Any from distutils.version import LooseVersion -from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes from torch.testing._internal.common_cuda import \ (SM53OrLater, SM80OrLater, CUDA11OrLater) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, deviceCountAtLeast, OpDTypes) from torch.testing._internal.common_methods_invocations import \ - (sparse_unary_ufuncs) + (sparse_unary_ufuncs, sparse_masked_reduction_ops) from torch.testing._internal.common_dtype import ( - floating_and_complex_types, floating_and_complex_types_and, get_all_dtypes, get_all_int_dtypes, + all_types, all_types_and_complex, all_types_and_complex_and, floating_and_complex_types, + floating_and_complex_types_and, integral_types, floating_types_and, ) # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for @@ -43,6 +40,8 @@ class TestSparse(TestCase): def setUp(self): + TestCase.setUp(self) + self.index_tensor = lambda *args, **kwargs: torch.tensor(*args, **kwargs, dtype=torch.int64) def sparse_empty_factory(*args, **kwargs): @@ -158,7 +157,7 @@ def test_shape(sparse_dims, nnz, with_size): self.assertEqual(i, x._indices()) self.assertEqual(v, x._values()) self.assertEqual(x.ndimension(), len(with_size)) - self.assertEqual(x.coalesce()._nnz(), nnz) + self.assertEqual(x.coalesce()._nnz(), nnz if x.is_coalesced() else nnz // 2) self.assertEqual(list(x.size()), with_size) # Test .indices() and .values() @@ -188,7 +187,8 @@ def test_shape(sparse_dims, nnz, with_size): self.assertEqual(x._values().numel(), 0) @coalescedonoff - @dtypes(torch.double, torch.cdouble) + @dtypes(torch.double, torch.cdouble, torch.bfloat16) + @precisionOverride({torch.bfloat16: 1e-2}) def test_coalesce(self, device, dtype, coalesced): def _test_coalesce(t): @@ -299,24 +299,22 @@ def test_ctor_size_checks(self, device, dtype): RuntimeError, lambda: self.sparse_tensor(indices, values, torch.Size([2, 4, 2, 1]))) - @dtypes(*floating_and_complex_types_and(torch.float16)) + @dtypes(*floating_and_complex_types_and(torch.float16, torch.bfloat16)) def test_to_dense(self, device, dtype): def test_tensor(x, res): x.to_dense() # Tests triple to_dense for memory corruption x.to_dense() x.to_dense() - # We dont have to_dense for half types, so we don't request - # exact_dtype if res.type is torch.float16. 
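The test_to_dense changes above extend coverage to float16 and bfloat16 and drop the old exact_dtype workaround; a minimal sketch of the dense/sparse round trip being exercised, assuming CPU float32:

import torch

i = torch.tensor([[0, 1, 1],
                  [2, 0, 2]])
v = torch.tensor([3., 4., 5.])
s = torch.sparse_coo_tensor(i, v, (2, 3))

d = s.to_dense()                                   # dense view of the COO tensor
assert torch.equal(d.to_sparse().to_dense(), d)    # round trip is lossless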
dense_x = x.to_dense() safe_dense_x = self.safeToDense(x) - if (res.dtype == torch.float16): - exact_dtype = False - else: - exact_dtype = True - dense_x = dense_x.to(res.dtype) - safe_dense_x = safe_dense_x.to(res.dtype) - self.assertEqual(res, dense_x, exact_dtype=exact_dtype) - self.assertEqual(res, safe_dense_x, exact_dtype=exact_dtype) + dense_x = dense_x.to(res.dtype) + safe_dense_x = safe_dense_x.to(res.dtype) + self.assertEqual(res, dense_x) + self.assertEqual(res, safe_dense_x) + + # Only run autograd test for float64 + if x.dtype != torch.float64: + return def fn(x): return x.to_dense() @@ -349,6 +347,7 @@ def fn(x): ], dtype=dtype, device=device) test_tensor(x, res) + test_tensor(res, res) i = self.index_tensor([ [0, 1, 2, 2], @@ -360,16 +359,8 @@ def fn(x): res = torch.empty((3, 4, 5, 0), dtype=dtype, device=device) test_tensor(x, res) - # half tensors on cpu don't implement to_dense, so need to convert to float - def _to_dense_half_safe(self, tensor): - if(tensor.dtype == torch.half and tensor.device.type == 'cpu'): - return tensor.to(torch.float).to_dense().to(torch.half) - else: - return tensor.to_dense() - @coalescedonoff - @skipIfRocm - @dtypes(torch.float16, torch.float64, torch.int, torch.cfloat, torch.cdouble) + @dtypes(torch.float16, torch.bfloat16, torch.float64, torch.int, torch.cfloat, torch.cdouble) def test_to_sparse(self, device, dtype, coalesced): shape = [5, 2, 10, 4] max_nnz = 1 @@ -382,9 +373,9 @@ def test_to_sparse(self, device, dtype, coalesced): coalesced=coalesced) expected = expected.to(dtype) - d = self._to_dense_half_safe(expected) + d = expected.to_dense() result = d.to_sparse(dim) - self.assertEqual(d, self._to_dense_half_safe(result)) # == not implemented for sparse tensors yet + self.assertEqual(d, result.to_dense()) self.assertEqual(expected.size(), result.size()) self.assertEqual(dim, result.sparse_dim()) @@ -416,7 +407,7 @@ def test_scalar(self, device, dtype): a_coalesced = a.coalesce() self.assertTrue(a_coalesced.is_coalesced()) self.assertEqual(torch.tensor(12.3 * 2, dtype=dtype, device=device), a.to_dense()) - self.assertEqual(a, a.to_dense().to_sparse()) + self.assertEqual(a.coalesce(), a.coalesce().to_dense().to_sparse()) # tensor without value a = self.sparse_empty((), dtype=dtype, device=device) @@ -675,7 +666,8 @@ def test_shape(sparse_dims, nnz, with_size): test_shape(3, 0, [0, 0, 100, 5, 5, 5, 0]) @coalescedonoff - @dtypes(torch.double, torch.cdouble) + @dtypes(torch.double, torch.cdouble, torch.bfloat16) + @precisionOverride({torch.bfloat16: 2e-2}) def test_Sparse_to_Sparse_copy_(self, device, dtype, coalesced): # This is for testing torch.copy_(SparseTensor, SparseTensor) sparse_dims = 3 @@ -1007,6 +999,105 @@ def test_shape(sparse_dims, nnz, sizes, select_dim, select_index, fail_message=N test_shape(len(sizes) // 2, 10, sizes, d, index) test_shape(len(sizes), 10, sizes, d, index) + def _test_index_select_exhaustive_index(self, sizes, dims, device, dtype, coalesced): + t = make_tensor(sizes, dtype=dtype, device=device) + t_sparse = t.to_sparse().coalesce() if coalesced else t.to_sparse() + t_small_sparse, _, _ = self._gen_sparse(len(sizes), 2, sizes, dtype, device, coalesced) + t_small = t_small_sparse.to_dense() + for d in dims: + # NOTE: indices are negative + idx_dim_d_range = list(range(-sizes[d], 0)) + for idx_len in range(sizes[d], sizes[d] + 1): + # creates all possible valid indices into dim d of lenght idx_len + for idx in itertools.product(*itertools.repeat(idx_dim_d_range, idx_len)): + t_idx = torch.tensor(idx, 
dtype=torch.long, device=device) + + # NOTE: index_select for dense does not support negative indices, + # hence + sizes[d]. See https://github.com/pytorch/pytorch/issues/76347 + + # tests the nnz > sizes[d] branch + dense_result = t.index_select(d, t_idx + sizes[d]) + sparse_result = t_sparse.index_select(d, t_idx) + self.assertEqual(dense_result, sparse_result) + + # tests the nnz <= sizes[d] branch + small_dense_result = t_small.index_select(d, t_idx + sizes[d]) + small_sparse_result = t_small_sparse.index_select(d, t_idx) + self.assertEqual(small_dense_result, small_sparse_result) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_exhaustive_index_small(self, device, dtype, coalesced): + # will trigger brute-force algo + self._test_index_select_exhaustive_index((3, 3, 4), range(3), device, dtype, coalesced) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_exhaustive_index_large(self, device, dtype, coalesced): + # will trigger more sophisticated algos + self._test_index_select_exhaustive_index((100, 50, 3, 3), (2, 3), device, dtype, coalesced) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_empty_and_non_contiguous_index(self, device, dtype, coalesced): + # empty index + idx_empty = torch.tensor([], dtype=torch.long, device=device) + t = make_tensor((5, 5), dtype=dtype, device=device) + res_dense = t.index_select(0, idx_empty) + res_sparse = t.to_sparse().index_select(0, idx_empty) + self.assertEqual(res_dense, res_sparse) + + # non-contigous index + idx = torch.randint(low=0, high=5, size=(10, 2), device=device)[:, 0] + + def run_test(sizes): + # case nnz > size[d] + t = make_tensor(sizes, dtype=dtype, device=device) + res_dense = t.index_select(0, idx) + res_sparse = t.to_sparse().index_select(0, idx) + self.assertEqual(res_dense, res_sparse) + + # case nnz <= size[d] + t_small_sparse, _, _ = self._gen_sparse(len(sizes), 2, sizes, dtype, device, coalesced) + res_sparse = t_small_sparse.index_select(0, idx) + res_dense = t_small_sparse.to_dense().index_select(0, idx) + self.assertEqual(res_dense, res_sparse) + + # brute-force + run_test((10, 10)) + # more sophisticated algos + run_test((10, 100, 100)) + + @coalescedonoff + @dtypes(torch.double, torch.cdouble) + def test_index_select_parallelization(self, device, dtype, coalesced): + """ + Test with sizes that will trigger parallelization (i.e. with sizes + that are >= at::internal::GRAIN_SIZE) + """ + def run_test(nnz, size): + t_sparse, _, _ = self._gen_sparse(1, nnz, (size,), dtype, device, coalesced) + t_dense = t_sparse.to_dense() + + # idx_small to (sort) and (binary) search into t_sparse + idx_small = torch.randint(size, (nnz // 2,), device=device) + # idx_large to (sort) and (binary) search into idx_large + # NOTE: when coalesced=True, the (binary) search will be + # done over t_sparse anyway, as it is already sorted. 
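The new index_select tests above validate the sparse path against the dense one (offsetting indices because dense index_select rejects negative indices); a short sketch of that comparison, assuming a small 2-D float input:

import torch

t = torch.randn(4, 5)
idx = torch.tensor([0, 3, 3, 1])      # duplicate indices are allowed

dense_result = t.index_select(0, idx)
sparse_result = t.to_sparse().index_select(0, idx)
assert torch.equal(dense_result, sparse_result.to_dense())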
+ idx_large = torch.randint(size, (nnz * 2,), device=device) + for idx in (idx_small, idx_large): + res_dense = t_dense.index_select(0, idx) + res_sparse = t_sparse.index_select(0, idx) + self.assertEqual(res_dense, res_sparse) + + # NOTE: GRAIN_SIZE = 32768 + # case nnz <= size[d] + tlen = 70000 # > 2 * GRAIN_SIZE + run_test(tlen, tlen) + + # case nnz > size[d] + run_test(tlen, tlen // 2) + @onlyCPU @coalescedonoff @dtypes(torch.double, torch.cdouble) @@ -1252,7 +1343,8 @@ def test_shape(di, dj, dk, nnz): self.assertEqual(self.safeToDense(res), self.safeToDense(true_result)) @coalescedonoff - @dtypes(torch.double, torch.cdouble) + @unittest.skip("See https://github.com/pytorch/pytorch/issues/73145") + @dtypes(torch.double, torch.cdouble, torch.bfloat16) def test_sparse_addmm(self, device, dtype, coalesced): def test_shape(m, n, p, nnz, broadcast, alpha_beta=None): if alpha_beta is None: @@ -1598,7 +1690,6 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v, dtype, device, z = x1.coalesce() self.assertEqual(x1.is_coalesced(), coalesced) self.assertTrue(y.is_coalesced()) - self.assertEqual(x1, y) y._values().add_(1) if not x1.is_coalesced(): # check that coalesce is out of place if the original tensor is not @@ -1698,7 +1789,7 @@ def _test_sparse_mask_fixed(): exp_v = torch.tensor([7, 14, 3, 20], dtype=dtype, device=device) res = dense.sparse_mask(x) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4]), dtype=dtype, device=device) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) i = self.index_tensor([ [1, 3, 0, 4], @@ -1710,7 +1801,7 @@ def _test_sparse_mask_fixed(): exp_v = torch.empty([4, 0], dtype=dtype, device=device) res = dense.sparse_mask(x) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 0]), dtype=dtype, device=device) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) _test_sparse_mask_fixed() @@ -1746,7 +1837,7 @@ def _test_sparse_mask_hybrid_fixed(): res = dense.sparse_mask(x) exp_v = torch.tensor([[7, 9], [14, 1], [3, 3], [20, 1]]) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2])) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) i = self.index_tensor([ [1, 3, 0, 4], @@ -1758,7 +1849,7 @@ def _test_sparse_mask_hybrid_fixed(): res = dense.sparse_mask(x) exp_v = torch.empty(4, 2, 0) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2, 0])) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) _test_sparse_mask_hybrid_fixed() @@ -1960,7 +2051,7 @@ def test_narrow(self, device, dtype, coalesced): def _test_log1p_tensor(self, sparse_tensor, coalesced): def is_integral(dtype): - return dtype in get_all_int_dtypes() + return dtype in integral_types() dense_tensor = sparse_tensor.to_dense() expected_output = dense_tensor.log1p() @@ -1991,8 +2082,7 @@ def is_integral(dtype): sparse_tensor.requires_grad_() @coalescedonoff - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*all_types()) def test_log1p(self, device, dtype, coalesced): if coalesced: input_coalesced = torch.sparse_coo_tensor( @@ -2100,7 +2190,7 @@ def test_neg_negative(self, device, dtype, coalesced): def _test_asin_arcsin(self, sparse_tensor, coalesced): def is_integral(dtype): - return dtype in get_all_int_dtypes() + return dtype in integral_types() is_integral_dtype = is_integral(sparse_tensor.dtype) dense_tensor = sparse_tensor.to_dense() @@ 
-2135,8 +2225,7 @@ def is_integral(dtype): op(sparse_tensor) @coalescedonoff - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*all_types()) def test_asin_arcsin(self, device, dtype, coalesced): if coalesced: input_coalesced = torch.sparse_coo_tensor( @@ -2623,14 +2712,14 @@ def test_legacy_new(self, device): @onlyCPU # not really, but we only really want to run this once def test_dtypes(self, device): - all_sparse_dtypes = get_all_dtypes(include_complex=True) + all_sparse_dtypes = all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16) do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.is_available(): do_test_dtypes(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cuda:0')) @onlyCPU # not really, but we only really want to run this once def test_empty_full(self, device): - all_sparse_dtypes = get_all_dtypes(include_complex=True) + all_sparse_dtypes = all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16) do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, torch.device('cpu')) if torch.cuda.device_count() > 0: do_test_empty_full(self, all_sparse_dtypes, torch.sparse_coo, None) @@ -2887,11 +2976,11 @@ def test_any(self, device): self.assertEqual(torch.any(t), t_any) def test_isnan(self, device): - t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, 4]), device=device) - t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, False]), device=device) + t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([1, 4]), device=device) + t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([False, False]), device=device) self.assertEqual(torch.isnan(t).int(), t_nan.int()) - t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, float("nan")]), device=device) - t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, True]), device=device) + t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([1, float("nan")]), device=device) + t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([False, True]), device=device) self.assertEqual(torch.isnan(t).int(), t_nan.int()) @coalescedonoff @@ -3227,13 +3316,11 @@ def sparse_log(x): # TODO: Check after why ROCm's cusparseXcsrgemm2Nnz function doesn't return the same nnz value as CUDA @skipIfRocm @coalescedonoff - @dtypes(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=False, include_bfloat16=False)) - @dtypesIfCUDA(*((torch.complex64,) if CUDA11OrLater else ()), - *((torch.complex128,) if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else ()), - *get_all_fp_dtypes( - include_half=(CUDA11OrLater and SM53OrLater), - include_bfloat16=(CUDA11OrLater and SM80OrLater))) + @dtypes(*floating_and_complex_types()) + @dtypesIfCUDA(*floating_types_and(*[torch.half] if CUDA11OrLater and SM53OrLater else [], + *[torch.bfloat16] if CUDA11OrLater and SM80OrLater else [], + *[torch.complex64] if CUDA11OrLater else [], + *[torch.complex128] if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else [])) @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2, torch.complex64: 1e-2, torch.float32: 1e-2}) def test_sparse_matmul(self, device, dtype, coalesced): """ @@ -3372,7 +3459,7 @@ def can_broadcast(s0, s1): (), (1,), (2,), (1, 1), (3, 1), (3, 2), (4, 1, 1), (4, 3, 2) ) for s0, s1 in itertools.combinations(sizes, r=2): - t = make_tensor(s0, 
device, dtype, low=-9, high=9) + t = make_tensor(s0, dtype=dtype, device=device, low=-9, high=9) for sparse_dims in range(1, len(s0) + 1): s = t.to_sparse(sparse_dims) if can_broadcast(s0, s1): @@ -3410,21 +3497,21 @@ class TestSparseOneOff(TestCase): def test_cuda_from_cpu(self): with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4), [3, 4, 4]) with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4, 0), [3, 4, 4, 0]) with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), torch.randn(0, 4, 4, 0), [0, 4, 4, 0]) @@ -3555,9 +3642,55 @@ def fn(x): fast_mode=op.gradcheck_fast_mode)) +class TestSparseMaskedReductions(TestCase): + exact_dtype = True + + @ops(sparse_masked_reduction_ops) + def test_future_empty_dim(self, device, dtype, op): + """Currently, `dim=()` in reductions operations means "reduce over + all dimensions" while in future, it will read "no reduce". See + https://github.com/pytorch/pytorch/issues/29137 + + For sparse masked reductions, we'll implement the current behavior. + + For testing, we'll use samples with `dim=0` and map it to + `dim=()` until + torch.testing._internal.common_methods_invocations._generate_reduction_kwargs + is made to generate samples with `dim=()` for non-scalar + inputs. With this and after gh-29137 is resolved, this test + can be deleted. See also `torch._masked._canonical_dim` + implementation about changing the `dim=()` behavior. + """ + + samples = op.sample_inputs_func(op, device, dtype, requires_grad=False) + op_name = op.name.replace('_masked.', '') + for sample_input in samples: + if sample_input.kwargs.get('dim') != 0: + continue + sample_input_kwargs = dict(sample_input.kwargs) + sample_input_kwargs['dim'] = () # reduce over all dimensions + + t = sample_input.input + mask = sample_input_kwargs.get('mask') + if mask is None and op_name in {'prod', 'amax', 'amin'}: + # FIXME: for now reductions with non-zero reduction identity and + # unspecified mask are not supported for sparse COO + # tensors, see torch._masked.prod implementation + # for details. 
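Throughout this file, sparse results are checked against their dense counterparts; for instance, a sketch of the sparse-times-sparse matmul path whose dtype list is adjusted above, assuming CPU float32:

import torch

a = torch.randn(3, 4).relu().to_sparse()   # relu keeps the operands genuinely sparse
b = torch.randn(4, 2).relu().to_sparse()

c = torch.sparse.mm(a, b)                  # sparse x sparse -> sparse
assert torch.allclose(c.to_dense(), a.to_dense() @ b.to_dense())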
+ continue + sparse_op_kwargs = dict(sample_input_kwargs) + actual = op(t.to_sparse(), *sample_input.args, **sample_input_kwargs) + self.assertEqual(actual.layout, torch.sparse_coo) + + expected = op(t, *sample_input.args, **sample_input_kwargs).to_sparse() + self.assertEqual(actual, expected) + + # e.g., TestSparseUnaryUfuncsCPU and TestSparseUnaryUfuncsCUDA instantiate_device_type_tests(TestSparseUnaryUfuncs, globals(), except_for='meta') +instantiate_device_type_tests(TestSparseMaskedReductions, globals(), except_for='meta') + # e.g., TestSparseCPU and TestSparseCUDA instantiate_device_type_tests(TestSparse, globals(), except_for='meta') diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index 6c9961a6d1fe..cccc2bbc3b47 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -4,17 +4,21 @@ import random import itertools import unittest -from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes, floating_and_complex_types, make_tensor +from torch.testing import make_tensor from torch.testing._internal.common_cuda import SM53OrLater, SM80OrLater, TEST_CUSPARSE_GENERIC from torch.testing._internal.common_utils import \ - (TEST_WITH_ROCM, TEST_SCIPY, TestCase, run_tests, load_tests, coalescedonoff) + (TEST_WITH_ROCM, TEST_SCIPY, TEST_MKL, IS_WINDOWS, TestCase, run_tests, load_tests, coalescedonoff, parametrize, + subtest) from torch.testing._internal.common_device_type import \ - (ops, instantiate_device_type_tests, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, skipCUDAIfNoCusparseGeneric, + (ops, instantiate_device_type_tests, dtypes, OpDTypes, dtypesIfCUDA, onlyCPU, onlyCUDA, skipCUDAIfNoCusparseGeneric, precisionOverride, skipMeta, skipCUDAIf, skipCUDAIfRocm, skipCPUIfNoMklSparse) from torch.testing._internal.common_methods_invocations import \ - (op_db, sparse_csr_unary_ufuncs, ) + (op_db, sparse_csr_unary_ufuncs, ReductionOpInfo) from torch.testing._internal.common_cuda import _get_torch_cuda_version, CUDA11OrLater -from torch.testing._internal.common_dtype import floating_types, get_all_dtypes +from torch.testing._internal.common_dtype import ( + floating_types, all_types_and_complex_and, floating_and_complex_types, floating_types_and, + all_types_and_complex, floating_and_complex_types_and +) from test_sparse import CUSPARSE_SPMM_COMPLEX128_SUPPORTED if TEST_SCIPY: @@ -24,6 +28,8 @@ # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests +no_mkl_sparse = IS_WINDOWS or not TEST_MKL + def _check_cusparse_triangular_solve_available(): version = _get_torch_cuda_version() # cusparseSpSM was added in 11.3.1 but we don't have access to patch version @@ -43,10 +49,30 @@ def _check_cusparse_sddmm_available(): return version >= min_supported_version _sparse_csr_ops = list(filter(lambda op: op.supports_sparse_csr, op_db)) +binary_functions_with_dense_output = ['mm', 'mv', ] +binary_ops_with_dense_output = list(filter(lambda op: op.name in binary_functions_with_dense_output, op_db)) + +UNARY_EWISE_CSR_ALLOW_AUTOGRAD = [ + 'abs', + 'conj_physical', + 'neg', +] # This should be just an import from test_linalg instead of code duplication # but https://github.com/pytorch/pytorch/pull/63511#discussion_r733989701 -def _test_addmm_addmv(test_case, f, t, m, v, *, alpha=None, beta=None, transpose_out=False, layout=torch.strided, all_sparse=False): +def _test_addmm_addmv( + test_case, + f, + t, + m, + v, + *, + alpha=None, + beta=None, + transpose_out=False, + layout=torch.strided, + mode=None +): """ Unified test for checking `f(t, m, v, alpha=alpha, beta=beta)` computation, where f is `torch.addmv` or `torch.addmm`. @@ -72,9 +98,11 @@ def convert_layout(mat): assert mat.layout == layout return mat - if all_sparse: + if mode == "all_sparse": res1 = f(*map(convert_layout, (t, m, v)), alpha=alpha, beta=beta) res1 = res1.to_dense() + elif mode == "dense_result": + res1 = f(t, convert_layout(m), convert_layout(v), alpha=alpha, beta=beta) else: res1 = f(t, convert_layout(m), v, alpha=alpha, beta=beta) res2 = torch.full_like(res1, float('nan')) @@ -110,186 +138,398 @@ def test_make_crow_indices(self): self.assertLessEqual(counts.max(), n_cols) -class TestSparseCSR(TestCase): +def all_sparse_compressed_layouts(test_name='layout'): + return parametrize(test_name, [ + subtest(torch.sparse_csr, name='SparseCSR'), + subtest(torch.sparse_csc, name='SparseCSC'), + subtest(torch.sparse_bsr, name='SparseBSR'), + subtest(torch.sparse_bsc, name='SparseBSC')]) - @onlyCPU - def test_csr_layout(self): - self.assertEqual(str(torch.sparse_csr), 'torch.sparse_csr') - self.assertEqual(type(torch.sparse_csr), torch.layout) - @dtypes(*get_all_dtypes()) - def test_sparse_csr_constructor_shape_inference(self, device, dtype): - crow_indices = [0, 2, 4] - col_indices = [0, 1, 0, 1] - values = [1, 2, 3, 4] - sparse = torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=torch.int64), - torch.tensor(col_indices, dtype=torch.int64), - torch.tensor(values), dtype=dtype, device=device) - self.assertEqual(torch.tensor(crow_indices, dtype=torch.int64), sparse.crow_indices()) - self.assertEqual((len(crow_indices) - 1, max(col_indices) + 1), sparse.shape) - self.assertEqual(dtype, sparse.dtype) - self.assertEqual(torch.device(device), sparse.device) - - @dtypes(*get_all_dtypes()) - def test_sparse_csr_constructor(self, device, dtype): - crow_indices = [0, 2, 4] - col_indices = [0, 1, 0, 1] - values = [1, 2, 3, 4] - for index_dtype in [torch.int32, torch.int64]: - sparse = torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=index_dtype), - torch.tensor(col_indices, dtype=index_dtype), - torch.tensor(values), - size=(2, 10), - dtype=dtype, - device=device) - self.assertEqual((2, 10), sparse.shape) - self.assertEqual(torch.tensor(crow_indices, dtype=index_dtype), sparse.crow_indices()) - self.assertEqual(torch.tensor(col_indices, dtype=index_dtype), sparse.col_indices()) - self.assertEqual(torch.tensor(values, 
dtype=dtype), sparse.values()) - - @dtypes(*get_all_dtypes()) - def test_sparse_csr_constructor_from_lists(self, device, dtype): - # without size - sparse = torch.sparse_csr_tensor([0, 2, 4], - [0, 1, 0, 1], - [1, 2, 3, 4], - dtype=dtype, - device=device) +def sparse_compressed_nonblock_layouts(test_name='layout'): + return parametrize(test_name, [ + subtest(torch.sparse_csr, name='SparseCSR'), + subtest(torch.sparse_csc, name='SparseCSC')]) + +sparse_compressed_indices_methods = { + torch.sparse_csr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_csc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + torch.sparse_bsr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_bsc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), +} - self.assertEqual((2, 2), sparse.shape) - self.assertEqual(4, sparse.numel()) - self.assertEqual(torch.tensor([0, 2, 4], dtype=torch.int64, device=device), sparse.crow_indices()) - self.assertEqual(torch.tensor([0, 1, 0, 1], dtype=torch.int64, device=device), sparse.col_indices()) - self.assertEqual(torch.tensor([1, 2, 3, 4], dtype=dtype, device=device), sparse.values()) - - # with size - for sparse_csr_tensor in [torch.sparse_csr_tensor, torch._sparse_csr_tensor_unsafe]: - sparse = sparse_csr_tensor([0, 2, 4], - [0, 1, 0, 1], - [1, 2, 3, 4], - size=(2, 10), - dtype=dtype, - device=device) - - self.assertEqual((2, 10), sparse.shape) - self.assertEqual(torch.tensor([0, 2, 4], dtype=torch.int64, device=device), sparse.crow_indices()) - self.assertEqual(torch.tensor([0, 1, 0, 1], dtype=torch.int64, device=device), sparse.col_indices()) - self.assertEqual(torch.tensor([1, 2, 3, 4], dtype=dtype, device=device), sparse.values()) + +class TestSparseCompressed(TestCase): + """Testing sparse compressed (CSR, CSC, BSR, BSC) tensor generic features. + """ + + def genTensor(self, size, nnz, *, layout, device=None, dtype=torch.float, index_dtype=torch.int64): + if device is None: + device = self.device_type + return self.genSparseCompressedTensor(size, nnz, device=device, dtype=dtype, index_dtype=index_dtype, layout=layout) + + def _generate_small_inputs(self, layout, device, dtype, index_dtype): + """Generator of inputs to sparse compressed tensor factory functions. 
+ + The input is defined as a 4-tuple: + compressed_indices, plain_indices, values, expected_size_from_shape_inference + """ + from operator import mul + from functools import reduce + if layout in {torch.sparse_csr, torch.sparse_csc}: + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype), + torch.tensor([1, 2, 3, 4], device=device, dtype=dtype), + (2, 2)) + yield (torch.tensor([0, ], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=dtype), + (0, 0)) + for batch_shape in [(2,), (2, 3)]: + prod = reduce(mul, batch_shape, 1) + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([1, 2, 3, 4], device=device, dtype=dtype).repeat(prod, 1).reshape(*batch_shape, -1), + (*batch_shape, 2, 2)) + else: + assert layout in {torch.sparse_bsr, torch.sparse_bsc} + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype), + torch.tensor([[[1, 11]], [[2, 22]], [[3, 33]], [[4, 44]]], device=device, dtype=dtype), + (2, 2)) + yield (torch.tensor([0, ], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=index_dtype), + torch.tensor([], device=device, dtype=dtype).reshape(1, 0, 0), + (0, 0)) + for batch_shape in [(2,), (2, 3)]: + prod = reduce(mul, batch_shape, 1) + yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([0, 1, 0, 1], device=device, dtype=index_dtype).repeat(prod, 1).reshape(*batch_shape, -1), + torch.tensor([[[1, 11]], [[2, 22]], [[3, 33]], [[4, 44]]], + device=device, dtype=dtype).repeat(prod, 1, 1).reshape(*batch_shape, 4, 1, 2), + (*batch_shape, 2, 2)) + + @all_sparse_compressed_layouts() + @onlyCPU + def test_layout(self, layout): + self.assertIn(str(layout), {'torch.sparse_csr', 'torch.sparse_csc', 'torch.sparse_bsr', 'torch.sparse_bsc'}) + self.assertEqual(type(layout), torch.layout) + + @parametrize('shape_and_device_inference', [subtest(False, name='_'), subtest(False, name='shape_and_device_inference')]) + @parametrize('use_factory_function', [subtest(False, name='_'), subtest(True, name='factory')]) + @parametrize('input_kind', [subtest('tensor', name='from_tensor'), subtest('list', name='from_list')]) + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_sparse_compressed_constructor(self, layout, device, dtype, + use_factory_function, shape_and_device_inference, input_kind): + factory_function = { + torch.sparse_csr: torch.sparse_csr_tensor, + torch.sparse_csc: torch.sparse_csc_tensor, + torch.sparse_bsr: torch.sparse_bsr_tensor, + torch.sparse_bsc: torch.sparse_bsc_tensor, + }[layout] + compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout] + for index_dtype in [torch.int32, torch.int64]: + for compressed_indices, plain_indices, values, size in self._generate_small_inputs(layout, device, dtype, index_dtype): + if input_kind == 'list': + if size == (0, 0): + # for this degenerate case, plain_indices must + # remain a tensor because + # tensor(plain_indices) results a float dtype + # when plain_indices is an empty list + if index_dtype == torch.int32: + # skip testing int32 case because + # 
tensor(compressed_indices) results a + # int64 dtype when compressed_indices is + # [0] (a list of single int zero). + continue + else: + plain_indices = plain_indices.tolist() + compressed_indices = compressed_indices.tolist() + values = values.tolist() + if size == (0, 0) and layout in {torch.sparse_bsr, torch.sparse_bsc}: + # in the block sparse case, values of type list needs to represent a 3-D tensor + values = [[[]]] + if use_factory_function: + if shape_and_device_inference: + sparse = factory_function(compressed_indices, plain_indices, values) + else: + sparse = factory_function(compressed_indices, plain_indices, values, size, + dtype=dtype, device=device) + else: + if shape_and_device_inference: + sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, layout=layout) + else: + sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size, + dtype=dtype, layout=layout, device=device) + self.assertEqual(layout, sparse.layout) + self.assertEqual(size, sparse.shape) + self.assertEqual(compressed_indices, compressed_indices_mth(sparse)) + self.assertEqual(plain_indices, plain_indices_mth(sparse)) + self.assertEqual(values, sparse.values()) @skipMeta - @dtypes(*get_all_dtypes()) - def test_empty(self, device, dtype): + @sparse_compressed_nonblock_layouts() + @dtypes(*all_types_and_complex_and(torch.bool, torch.bfloat16, torch.half)) + def test_empty(self, layout, device, dtype): ns = [5, 2, 0] - for shape in itertools.product(ns, ns): - result = torch.empty(shape, dtype=dtype, device=device, layout=torch.sparse_csr) + batch_shapes = [(), (2,), (2, 3)] + compressed_dim = { + torch.sparse_csr: -2, + torch.sparse_csc: -1, + }[layout] + compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout] + for m, n, b in itertools.product(ns, ns, batch_shapes): + shape = (*b, m, n) + result = torch.empty(shape, dtype=dtype, device=device, layout=layout) self.assertEqual(result.shape, shape) self.assertEqual(result.dtype, dtype) self.assertEqual(result.device, torch.device(device)) - self.assertEqual(result.layout, torch.sparse_csr) - self.assertEqual(result.crow_indices().shape, (shape[0] + 1,)) - self.assertEqual(result.col_indices().shape, (0,)) - self.assertEqual(result.values().shape, (0,)) + self.assertEqual(result.layout, layout) + self.assertEqual(compressed_indices_mth(result).shape, (*b, shape[compressed_dim] + 1,)) + self.assertEqual(plain_indices_mth(result).shape, (*b, 0,)) + self.assertEqual(result.values().shape, (*b, 0,)) self.assertEqual(result._nnz(), 0) - self.assertEqual(result.crow_indices().device, torch.device(device)) - self.assertEqual(result.col_indices().device, torch.device(device)) + self.assertEqual(compressed_indices_mth(result).device, torch.device(device)) + self.assertEqual(plain_indices_mth(result).device, torch.device(device)) self.assertEqual(result.values().device, torch.device(device)) - self.assertEqual(result.crow_indices().dtype, torch.int64) - self.assertEqual(result.col_indices().dtype, torch.int64) + self.assertEqual(compressed_indices_mth(result).dtype, torch.int64) + self.assertEqual(plain_indices_mth(result).dtype, torch.int64) self.assertEqual(result.values().dtype, dtype) @skipMeta - @dtypes(*get_all_dtypes()) - def test_empty_errors(self, device, dtype): - with self.assertRaisesRegex(RuntimeError, "torch.empty: Only 2D sparse CSR tensors are supported."): - torch.empty((5,), dtype=dtype, device=device, layout=torch.sparse_csr) - - with self.assertRaisesRegex(RuntimeError, 
"torch.empty: Only 2D sparse CSR tensors are supported."): - torch.empty((2, 3, 4), dtype=dtype, device=device, layout=torch.sparse_csr) + @sparse_compressed_nonblock_layouts() + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16)) + def test_empty_errors(self, layout, device, dtype): + with self.assertRaisesRegex(RuntimeError, + "torch.empty: Only batched sparse compressed \\(non-block\\) tensors are supported" + ", but got size"): + torch.empty((5,), dtype=dtype, device=device, layout=layout) @skipMeta - @dtypes(*get_all_dtypes()) - def test_clone(self, device, dtype): - x = torch.sparse_csr_tensor([0, 2, 4], - [0, 1, 0, 1], - [1, 2, 3, 4], - dtype=dtype, - device=device) - y = x.clone() - - self.assertEqual(x.shape, y.shape) - self.assertEqual(x.crow_indices(), y.crow_indices()) - self.assertEqual(x.col_indices(), y.col_indices()) - self.assertEqual(x.values(), y.values()) + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16)) + def test_clone(self, layout, device, dtype): + for compressed_indices, plain_indices, values, size in self._generate_small_inputs( + layout, device, dtype, index_dtype=torch.int32): + sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size, + dtype=dtype, layout=layout, device=device) + cloned_sparse = sparse.clone() + self.assertEqual(sparse, cloned_sparse) + + @all_sparse_compressed_layouts() + def test_print(self, layout, device): + compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout] + printed = [] + for index_dtype in [torch.int32, torch.int64]: + for dtype in [torch.float32, torch.float64]: + for compressed_indices, plain_indices, values, size in self._generate_small_inputs( + layout, device, dtype, index_dtype): + batch_shape = tuple(size[:-2]) + block_shape = tuple(values.shape[-2:]) if layout in {torch.sparse_bsr, torch.sparse_bsc} else () + if size not in [(2, 2), (0, 0), (2, 3, 2, 2), (2, 2, 2)]: + # Skip inputs that are not in the list of + # expected sizes to ensure the stability of + # test_print in the case + # _generate_small_inputs is extended with new + # inputs + continue + if block_shape not in [(), (0, 0), (1, 2)]: + # Skip inputs that are not in the list of + # expected block sizes to ensure test_print + # stability. 
+ continue + printed.append("########## {}/{}/batch_shape={}/block_shape={} ##########".format( + dtype, index_dtype, batch_shape, block_shape)) + x = torch.sparse_compressed_tensor(compressed_indices, + plain_indices, + values, dtype=dtype, layout=layout, device=device) + printed.append("# sparse tensor") + printed.append(str(x)) + printed.append(f"# _{compressed_indices_mth.__name__}") + printed.append(str(compressed_indices_mth(x))) + printed.append(f"# _{plain_indices_mth.__name__}") + printed.append(str(plain_indices_mth(x))) + printed.append("# _values") + printed.append(str(x.values())) + printed.append('') + printed.append('') + orig_maxDiff = self.maxDiff + self.maxDiff = None + try: + self.assertExpected('\n'.join(printed)) + self.maxDiff = orig_maxDiff + except Exception: + self.maxDiff = orig_maxDiff + raise @skipMeta - @dtypes(*get_all_dtypes()) - def test_copy(self, device, dtype): + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_copy(self, layout, device, dtype): def run_test(shape, nnz, index_type): - a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) - b = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) + block_size = (2, 3) if layout in {torch.sparse_bsr, torch.sparse_bsc} else () + a = self.genSparseCompressedTensor(shape, nnz, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) + b = self.genSparseCompressedTensor(shape, nnz, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) a.copy_(b) - self.assertEqual(a.crow_indices(), b.crow_indices()) - self.assertEqual(a.col_indices(), b.col_indices()) - self.assertEqual(a.values(), b.values()) + self.assertEqual(a, b) ns = [5, 2, 0] - for shape, index_dtype in zip(itertools.product(ns, ns), [torch.int32, torch.int64]): - run_test(shape, 0, index_dtype) - run_test(shape, shape[0] * shape[1], index_dtype) + batch_shapes = [(), (2,), (2, 3)] + for (m, n, b), index_dtype in zip(itertools.product(ns, ns, batch_shapes), [torch.int32, torch.int64]): + run_test((*b, m, n), 0, index_dtype) + run_test((*b, m, n), m * n, index_dtype) @skipMeta - @dtypes(*get_all_dtypes()) - def test_copy_errors(self, device, dtype): + @all_sparse_compressed_layouts() + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_copy_errors(self, layout, device, dtype): + block_size = (2, 3) if layout in {torch.sparse_bsr, torch.sparse_bsc} else () for index_dtype in [torch.int32, torch.int64]: shape1 = (2, 3) - shape2 = (3, 2) - a = self.genSparseCSRTensor(shape1, 0, dtype=dtype, device=device, index_dtype=index_dtype) - b = self.genSparseCSRTensor(shape2, 0, dtype=dtype, device=device, index_dtype=index_dtype) + a = self.genSparseCompressedTensor(shape1, 0, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) - with self.assertRaisesRegex(RuntimeError, "only same size tensors are supported."): - a.copy_(b) - - with self.assertRaisesRegex(RuntimeError, "copy between different layouts is not supported."): + with self.assertRaisesRegex(RuntimeError, + "copy of sparse compressed tensors having different layouts is not supported."): a.copy_(torch.empty(a.shape, dtype=dtype, device=device)) - b = self.genSparseCSRTensor(shape1, 1, dtype=dtype, device=device, index_dtype=index_dtype) - with self.assertRaisesRegex(RuntimeError, "only tensors with the same 
number of specified elements are supported."): + b = self.genSparseCompressedTensor(shape1, 1, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) + with self.assertRaisesRegex(RuntimeError, + "only sparse compressed tensors with the same number of specified elements are supported."): a.copy_(b) + shape2 = tuple(reversed(shape1)) + c = self.genSparseCompressedTensor(shape2, 1, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size) + with self.assertRaisesRegex( + RuntimeError, + "expected shapes of self and src to match along dimension"): + b.copy_(c) + + if block_size: + block_size1 = tuple(reversed(block_size)) + d = self.genSparseCompressedTensor(shape1, 1, dtype=dtype, layout=layout, device=device, + index_dtype=index_dtype, block_size=block_size1) + with self.assertRaisesRegex(RuntimeError, + "copy of sparse compressed tensors having different block sizes is not supported"): + b.copy_(d) + + +class TestSparseCSR(TestCase): + + def test_csr_stride(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + + with self.assertRaisesRegex(RuntimeError, "Sparse CSR tensors do not have strides"): + a.stride() + + with self.assertRaisesRegex(RuntimeError, "Sparse CSR tensors do not have strides"): + a.stride(-1) + + def test_csr_storage(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + + with self.assertRaisesRegex(RuntimeError, "Cannot access storage of SparseCsrTensorImpl"): + a.storage() + + def test_csr_is_contiguous(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + + with self.assertRaisesRegex(RuntimeError, "Tensors of type SparseCsrTensorImpl do not have is_contiguous"): + a.is_contiguous() + + def test_csr_double_to_sparse_csr(self): + a = self.genSparseCSRTensor((3, 3), 3, dtype=torch.float, device=self.device_type, index_dtype=torch.int64) + a.to_sparse_csr().to_sparse_csr() + + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) + def test_sparse_csr_select(self, device, dtype): + batch_shape = (2, 3) + crow_indices = torch.tensor([0, 2, 4], device=device).repeat(6, 1).reshape(*batch_shape, -1) + col_indices = torch.tensor([0, 1, 0, 1], device=device).repeat(6, 1).reshape(*batch_shape, -1) + values = torch.tensor([1, 2, 3, 4], device=device, dtype=dtype).repeat(6, 1).reshape(*batch_shape, -1) + sparse = torch.sparse_csr_tensor(crow_indices, + col_indices, + values, + size=(*batch_shape, 2, 10), + dtype=dtype, + device=device) + + # select from batch dimensions + sparse_selected12 = sparse.select(1, 2) + expected_sparse_selected12 = torch.sparse_csr_tensor(crow_indices.select(1, 2).contiguous(), + col_indices.select(1, 2).contiguous(), + values.select(1, 2).contiguous(), + size=(2, 2, 10), + dtype=dtype, + device=device) + self.assertEqual(expected_sparse_selected12, sparse_selected12) + + # select from rows or columns + sparse_non_batched = sparse[0, 0] + for selects_args in [(0, 0), (1, 1)]: + sparse_selected = sparse_non_batched.select(*selects_args) + dense_selected = sparse_non_batched.to_dense().select(*selects_args) + self.assertEqual(dense_selected, sparse_selected) + + # index a single element + self.assertEqual(sparse[0, 0, 0, 0], sparse.to_dense()[0, 0, 0, 0]) + + # selecting from rows or columns for batched CSR is not yet implemented + with 
self.assertRaisesRegex(RuntimeError, "selecting rows or columns is not implemented for batched"): + sparse.select(-2, 0) + + with self.assertRaisesRegex(RuntimeError, "selecting rows or columns is not implemented for batched"): + sparse.select(-1, 0) + + # assigning to sparse trhough indexing is disabled + with self.assertRaisesRegex(TypeError, "Cannot assign to a sparse tensor"): + sparse[0, 0, 0, 0] = 99.0 + @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_resize(self, device, dtype): - for index_dtype in [torch.int32, torch.int64]: - shape = (2, 3) + batch_shapes = [(), (2,), (2, 3)] + for index_dtype, b in zip([torch.int32, torch.int64], batch_shapes): + shape = (*b, 2, 3) nnz = 6 a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) - new_shape = (4, 5) + new_shape = (*b, 4, 5) a.resize_(new_shape) self.assertEqual(a.shape, new_shape) # resize to larger shape doesn't add specified elements self.assertEqual(a._nnz(), nnz) - new_shape = (1, 5) + new_shape = (*b, 1, 5) a.resize_(new_shape) self.assertEqual(a.shape, new_shape) # resize to smaller shape trims specified elements self.assertEqual(a._nnz(), 5) + # trim batched dimensions + a.resize_(new_shape[-2], new_shape[-1]) + self.assertEqual(a.shape, (new_shape[-2], new_shape[-1])) + self.assertEqual(a._nnz(), 5) + @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_resize_errors(self, device, dtype): for index_dtype in [torch.int32, torch.int64]: shape = (2, 3) nnz = 6 a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) - with self.assertRaisesRegex(RuntimeError, "torch.resize_: Only 2D sparse CSR tensors are supported."): + with self.assertRaisesRegex(RuntimeError, "torch.resize_: Only batched sparse CSR matrices are supported"): new_shape = (4,) a.resize_(new_shape) @@ -308,7 +548,7 @@ def test_factory_type_invariants_check(self, device): torch.tensor([1, 2, 3, 4]), device=device) - with self.assertRaisesRegex(RuntimeError, r"\"csr_construct_check\" not implemented for 'Short'"): + with self.assertRaisesRegex(RuntimeError, r"\"validate_sparse_compressed_tensor_args\" not implemented for 'Short'"): torch.sparse_csr_tensor(torch.tensor([0, 2, 4], dtype=torch.int16), torch.tensor([0, 1, 0, 1], dtype=torch.int16), torch.tensor([1, 2, 3, 4]), @@ -334,49 +574,63 @@ def test_factory_layout_invariants_check(self, device): torch.tensor([1, 2, 3, 4])) def test_factory_shape_invariants_check(self, device): - crow_indices = [0, 2, 4] - col_indices = [0, 1, 0, 1] - values = [1, 2, 3, 4] + crow_indices = torch.tensor([0, 2, 4], device=device) + col_indices = torch.tensor([0, 1, 0, 1], device=device) + values = torch.tensor([1, 2, 3, 4], device=device) size = (2, 10) - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor(col_indices), torch.tensor(values), size, - device=device) + torch.sparse_csr_tensor(crow_indices, col_indices, values, size, device=device) - with self.assertRaisesRegex(RuntimeError, r"size of a CSR tensor must be of length 2, but got: 3"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor(col_indices), torch.tensor(values), - size=(2, 10, 2), + with self.assertRaisesRegex(RuntimeError, r"size of a batched CSR tensor must have length >= 2, but got: 1"): + torch.sparse_csr_tensor(crow_indices, col_indices, values, + size=(2,), device=device) - with 
self.assertRaisesRegex(RuntimeError, r"crow_indices must have dim\=1 but got crow_indices\.dim\(\)\=2"): - torch.sparse_csr_tensor(torch.tensor(crow_indices).repeat(2, 1), - torch.tensor(col_indices), - torch.tensor(values), + with self.assertRaisesRegex(RuntimeError, r"crow_indices must have dim >= 1 but got crow_indices\.dim\(\)\ = 0"): + torch.sparse_csr_tensor(torch.zeros((), device=device, dtype=torch.int64), + col_indices, + values, size, device=device) - with self.assertRaisesRegex(RuntimeError, r"col_indices must have dim\=1 but got col_indices\.dim\(\)\=2"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), - torch.tensor(col_indices).repeat(2, 1), - torch.tensor(values), + with self.assertRaisesRegex(RuntimeError, r"col_indices must have dim >= 1 but got col_indices\.dim\(\)\ = 0"): + torch.sparse_csr_tensor(crow_indices, + torch.zeros((), device=device, dtype=torch.int64), + values, size, device=device) - with self.assertRaisesRegex(RuntimeError, r"values must have dim\=1 but got values\.dim\(\)\=2"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), - torch.tensor(col_indices), - torch.tensor(values).repeat(2, 1), + with self.assertRaisesRegex(RuntimeError, r"values must have dim >= 1 but got values\.dim\(\)\ = 0"): + torch.sparse_csr_tensor(crow_indices, + col_indices, + torch.zeros((), device=device, dtype=torch.int64), size, device=device) with self.assertRaisesRegex(RuntimeError, - r"crow_indices\.numel\(\) must be size\(0\) \+ 1, but got: 3"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor(col_indices), torch.tensor(values), (1, 1), + r"crow_indices\.size\(-1\) must be equal to size\[-2\] \+ 1 \(that is 2\), but got: 3"): + torch.sparse_csr_tensor(crow_indices, col_indices, values, (1, 1), + device=device) + + + with self.assertRaisesRegex(RuntimeError, + r"number of dimensions of crow_indices and col_indices must be the same"): + torch.sparse_csr_tensor(crow_indices, col_indices.repeat(2, 1), values, size, + device=device) + + with self.assertRaisesRegex(RuntimeError, + r"number of dimensions of indices and values must be the same"): + torch.sparse_csr_tensor(crow_indices, col_indices, values.repeat(2, 1), size, device=device) + with self.assertRaisesRegex(RuntimeError, + r"number of dimensions of indices must be one less"): + torch.sparse_csr_tensor(crow_indices.repeat(2, 1), col_indices.repeat(2, 1), values.repeat(2, 1), size, + device=device) with self.assertRaisesRegex(RuntimeError, - r"col_indices and values must have equal sizes, " + - r"but got col_indices\.numel\(\): 3, values\.numel\(\): 4"): - torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor([0, 1, 0]), torch.tensor(values), size, + r"all batch dimensions of the provided size \(\[2\]\), indices \(\[2\], \[3\]\)," + r" and values \(\[4\]\) must be the same"): + torch.sparse_csr_tensor(crow_indices.repeat(2, 1), col_indices.repeat(3, 1), values.repeat(4, 1), (2, 2, 10), device=device) def test_factory_indices_invariants_check(self, device): @@ -395,7 +649,7 @@ def test_factory_indices_invariants_check(self, device): with self.assertRaisesRegex(RuntimeError, r"at position i \= 2," + - r" this condition crow_indices\[i - 1\] <\= crow_indices\[i\] fails"): + r" the condition crow_indices\[i - 1\] <\= crow_indices\[i\] fails"): torch.sparse_csr_tensor(torch.tensor([0, 5, 4]), torch.tensor(col_indices), torch.tensor(values), size, device=device) @@ -403,12 +657,12 @@ def test_factory_indices_invariants_check(self, device): torch.sparse_csr_tensor(torch.tensor(crow_indices), 
torch.tensor([0, -1, 0, 1]), torch.tensor(values), size, device=device) - with self.assertRaisesRegex(RuntimeError, r"size\(1\) should be greater than col_indices\.max\(\)"): + with self.assertRaisesRegex(RuntimeError, r"size\[-1\] should be greater than col_indices\.max\(\)"): torch.sparse_csr_tensor(torch.tensor(crow_indices), torch.tensor([0, 11, 0, 1]), torch.tensor(values), size, device=device) @onlyCUDA - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_factory_device_type_inference(self, device, dtype): cpu_cuda = ('cpu', 'cuda') cpu_cuda_none = cpu_cuda + (None,) @@ -442,44 +696,7 @@ def test_factory_device_type_inference(self, device, dtype): t.crow_indices().device == t.values().device t.col_indices().device == t.values().device - def test_sparse_csr_print(self, device): - orig_maxDiff = self.maxDiff - self.maxDiff = None - shape_nnz = [ - ((10, 10), 10), - ((100, 10), 10), - ((1000, 10), 10) - ] - printed = [] - for shape, nnz in shape_nnz: - values_shape = torch.Size((nnz,)) - col_indices_shape = torch.Size((nnz,)) - crow_indices_shape = torch.Size((shape[0] + 1,)) - printed.append("# shape: {}".format(torch.Size(shape))) - printed.append("# nnz: {}".format(nnz)) - printed.append("# crow_indices shape: {}".format(crow_indices_shape)) - printed.append("# col_indices shape: {}".format(col_indices_shape)) - printed.append("# values_shape: {}".format(values_shape)) - for index_dtype in [torch.int32, torch.int64]: - for dtype in floating_types(): - printed.append("########## {}/{} ##########".format(dtype, index_dtype)) - x = torch.sparse_csr_tensor(torch.tensor([0, 2, 4], dtype=index_dtype), - torch.tensor([0, 1, 0, 1], dtype=index_dtype), - torch.tensor([1, 2, 3, 4]), dtype=dtype, device=device) - printed.append("# sparse tensor") - printed.append(str(x)) - printed.append("# _crow_indices") - printed.append(str(x.crow_indices())) - printed.append("# _col_indices") - printed.append(str(x.col_indices())) - printed.append("# _values") - printed.append(str(x.values())) - printed.append('') - printed.append('') - self.assertExpected('\n'.join(printed)) - self.maxDiff = orig_maxDiff - - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_sparse_csr_from_dense(self, device, dtype): dense = torch.tensor([[4, 5, 0], [0, 0, 0], [1, 0, 0]], dtype=dtype, device=device) sparse = dense.to_sparse_csr() @@ -499,7 +716,7 @@ def test_sparse_csr_from_dense(self, device, dtype): self.assertEqual(torch.tensor([0, 1, 2] * 3, dtype=torch.int64), sparse.col_indices()) self.assertEqual(torch.tensor([2] * 9, dtype=dtype), sparse.values()) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_sparse_csr_to_dense(self, device, dtype): mn = [5, 2, 0] for (m, n) in itertools.product(mn, mn): @@ -508,14 +725,15 @@ def test_sparse_csr_to_dense(self, device, dtype): sparse = dense.to_sparse_csr() self.assertEqual(sparse.to_dense(), dense) - crow_indices = torch.tensor([0, 3, 5]) - col_indices = torch.tensor([0, 1, 2, 0, 1]) - values = torch.tensor([1, 2, 1, 3, 4], dtype=dtype) - csr = torch.sparse_csr_tensor(crow_indices, col_indices, - values, dtype=dtype, device=device) - dense = torch.tensor([[1, 2, 1], [3, 4, 0]], dtype=dtype, device=device) + batch_shape = (2, 3) + crow_indices = torch.tensor([0, 3, 5], device=device).repeat(6, 1).reshape(*batch_shape, -1) + col_indices = torch.tensor([0, 1, 2, 0, 1], 
device=device).repeat(6, 1).reshape(*batch_shape, -1) + values = torch.tensor([1, 2, 1, 3, 4], device=device, dtype=dtype).repeat(6, 1).reshape(*batch_shape, -1) + csr = torch.sparse_csr_tensor(crow_indices, col_indices, values, dtype=dtype, device=device) + dense = torch.tensor([[1, 2, 1], [3, 4, 0]], dtype=dtype, device=device).repeat(6, 1).reshape(csr.shape) self.assertEqual(csr.to_dense(), dense) + @skipMeta @skipCPUIfNoMklSparse @coalescedonoff @dtypes(torch.double) @@ -559,7 +777,40 @@ def test_coo_to_csr_convert(self, device, dtype, coalesced): values = torch.tensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7], dtype=dtype, device=device) self.assertEqual(csr.values(), values) - @dtypes(*get_all_dtypes()) + @parametrize("blocksize", [2, 4]) + @dtypes((torch.double, torch.int32), (torch.double, torch.int64)) + @unittest.skipIf(not TEST_SCIPY, "SciPy not found") + @skipMeta + def test_csr_to_block_csr(self, device, dtypes, blocksize): + for shape in [(24, 24), (12, 24)]: + dtype, index_dtype = dtypes + m, k = shape + nnz = random.randint(0, m * k) + t = self.genSparseCSRTensor((m * blocksize, k * blocksize), nnz, dtype=dtype, + device=device, index_dtype=index_dtype) + st = sp.csr_matrix((t.values().cpu(), t.col_indices().cpu(), t.crow_indices().cpu()), shape=tuple(t.size())) + block_t = t.to_sparse_bsr((blocksize, blocksize)) + self.assertEqual(block_t.values().dim(), 3) + self.assertTrue(block_t.layout == torch.sparse_bsr) + block_st = st.tobsr(blocksize=(blocksize, blocksize)) + self.assertEqual(block_t.values().cpu(), block_st.data) + self.assertEqual(block_t.col_indices().cpu(), torch.tensor(block_st.indices).to(index_dtype)) + self.assertEqual(block_t.crow_indices().cpu(), torch.tensor(block_st.indptr).to(index_dtype)) + + @dtypes(torch.double) + @unittest.skipIf(not TEST_SCIPY, "SciPy not found") + def test_csr_to_block_csr_errors(self, device, dtype): + for index_dtype in [torch.int32, torch.int64]: + nnz = 15 + t = self.genSparseCSRTensor((16, 16), nnz, dtype=dtype, + device=device, index_dtype=index_dtype) + with self.assertRaisesRegex(RuntimeError, "must be square."): + block_t = t.to_sparse_bsr((2, 3)) + + with self.assertRaisesRegex(RuntimeError, r"size \(16, 16\) with block size \(5, 5\)"): + block_t = t.to_sparse_bsr((5, 5)) + + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_sparse_csr_from_dense_convert_error(self, device, dtype): size = (4, 2, 4) dense = make_tensor(size, dtype=dtype, device=device) @@ -585,8 +836,9 @@ def test_matmul_device_mismatch(self, device, dtype): @skipCPUIfNoMklSparse @skipCUDAIfNoCusparseGeneric @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=SM53OrLater, include_bfloat16=SM80OrLater)) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if SM53OrLater else [], + *[torch.bfloat16] if SM80OrLater else [])) def test_csr_matvec(self, device, dtype): side = 100 for index_dtype in [torch.int32, torch.int64]: @@ -624,7 +876,7 @@ def run_test(c, a, a_batched, b, op_b=False, op_out=False, *, dtype=None, device self.assertEqual(actual, expected) for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), batch_size, noncontiguous in zip(itertools.product([1, 5], repeat=3), [1, 3], [True, False]): + for (m, n, k), batch_size, noncontiguous in zip(itertools.product([2, 5], repeat=3), [1, 3], [True, False]): nnz = random.randint(0, m * k) a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) @@ -657,7 
+909,7 @@ def run_test(a, a_batched, b, op_b=False, op_out=False, *, dtype=None, device=No self.assertEqual(actual, expected) for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), batch_size, noncontiguous in zip(itertools.product([1, 5], repeat=3), [1, 3], [True, False]): + for (m, n, k), batch_size, noncontiguous in zip(itertools.product([2, 5], repeat=3), [1, 3], [True, False]): nnz = random.randint(0, m * k) a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) @@ -691,46 +943,75 @@ def run_test_block_addmm_addmv(self, addmv_addmm, c, a, b, op_b=False, op_out=Fa self.assertEqual(actual, out) self.assertEqual(actual, expected) + # TODO: block_size 1 is broken + @parametrize("block_size", [2, 3]) + @parametrize("index_dtype", [torch.int32, torch.int64]) + @parametrize("noncontiguous", [True, False]) @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_block_addmm(self, device, dtype): - for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), block_size, noncontiguous in zip(itertools.product([1, 5], repeat=3), [1, 2, 3], [True, False]): - nnz = random.randint(0, m * k) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-5, torch.complex128: 1e-5}) + def test_block_addmm(self, device, dtype, index_dtype, block_size, noncontiguous): + for (m, n, k) in itertools.product([2, 5], repeat=3): + nnz = random.randint(0, m * k) + if not noncontiguous: + a = self.genSparseCSRTensor((m * block_size, k * block_size), nnz, + dtype=dtype, device=device, index_dtype=index_dtype) + a = a.to_sparse_bsr((block_size, block_size)) + else: a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) a_data = make_tensor((nnz, block_size, block_size), dtype=dtype, device=device) a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks - a = torch._sparse_csr_tensor_unsafe(a.crow_indices(), a.col_indices(), a_data, (m * block_size, k * block_size)) - b = make_tensor((k * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) - c = make_tensor((m * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) - for op_b, op_out in itertools.product([True, False], repeat=2): - self.run_test_block_addmm_addmv(torch.addmm, c, a, b, op_b, op_out, dtype=dtype, device=device) - + a = torch._sparse_bsr_tensor_unsafe(a.crow_indices(), a.col_indices(), + a_data, (m * block_size, k * block_size)) + b = make_tensor((k * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) + c = make_tensor((m * block_size, n * block_size), dtype=dtype, device=device, noncontiguous=noncontiguous) + for op_b, op_out in itertools.product([True, False], repeat=2): + self.run_test_block_addmm_addmv(torch.addmm, c, a, b, op_b, op_out, dtype=dtype, device=device) + + @parametrize("block_size", [2, 3]) + @parametrize("index_dtype", [torch.int32, torch.int64]) + @parametrize("noncontiguous", [True, False]) @skipCPUIfNoMklSparse @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_block_addmv(self, device, dtype): - for index_dtype in [torch.int32, torch.int64]: - block_sizes = [1, 2, 3] - if TEST_WITH_ROCM or not TEST_CUSPARSE_GENERIC: - block_sizes = [2, 3] - for (m, k), block_size, noncontiguous in zip(itertools.product([1, 5], 
repeat=2), block_sizes, [True, False]): - nnz = random.randint(0, m * k) + def test_block_addmv(self, device, dtype, index_dtype, block_size, noncontiguous): + # TODO: Explicitly disable block size 1 support + # if (TEST_WITH_ROCM or not TEST_CUSPARSE_GENERIC) and block_size == 1: + # return + for (m, k) in itertools.product([2, 5], repeat=2): + nnz = random.randint(0, m * k) + if not noncontiguous: + a = self.genSparseCSRTensor((m * block_size, k * block_size), nnz, + dtype=dtype, device=device, index_dtype=index_dtype) + a = a.to_sparse_bsr((block_size, block_size)) + else: a = self.genSparseCSRTensor((m, k), nnz, dtype=dtype, device=device, index_dtype=index_dtype) a_data = make_tensor((nnz, block_size, block_size), dtype=dtype, device=device) - a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks - a = torch._sparse_csr_tensor_unsafe(a.crow_indices(), a.col_indices(), a_data, (m * block_size, k * block_size)) - b = make_tensor((k * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) - c = make_tensor((m * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) - self.run_test_block_addmm_addmv(torch.addmv, c, a, b, dtype=dtype, device=device) - + a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks + a = torch._sparse_bsr_tensor_unsafe(a.crow_indices(), a.col_indices(), + a_data, (m * block_size, k * block_size)) + b = make_tensor((k * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) + c = make_tensor((m * block_size,), dtype=dtype, device=device, noncontiguous=noncontiguous) + self.run_test_block_addmm_addmv(torch.addmv, c, a, b, dtype=dtype, device=device) + + @parametrize("block_size", [2, 3]) + @parametrize("index_dtype", [torch.int32, torch.int64]) + @parametrize("noncontiguous", [True, False]) @skipCPUIfNoMklSparse - @skipCUDAIfRocm @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_block_triangular_solve(self, device, dtype): + def test_block_triangular_solve(self, device, dtype, index_dtype, block_size, noncontiguous): def run_test(a, b, upper, transpose, unitriangular, op_out): + if unitriangular and self.device_type == 'cpu': + # TODO: When unitriangular=True results are not correct on CPU + return + + if not upper and self.device_type == 'cpu': + # TODO: When upper=False some generated inputs might crash on CPU + return + actual = torch.triangular_solve(b, a, upper=upper, unitriangular=unitriangular, transpose=transpose) actual_X = actual.solution actual_A_clone = actual.cloned_coefficient @@ -754,6 +1035,14 @@ def run_test(a, b, upper, transpose, unitriangular, op_out): transpose=transpose, upper=upper, unitriangular=unitriangular) + + if expected_X.isnan().any(): + # TODO: zeros on the diagonal are not handled for CPU path + # there's no way to query this info from MKL + if self.device_type == 'cuda' and not TEST_WITH_ROCM: + self.assertTrue(actual_X.isnan().any() or actual_X.isinf().any()) + return + self.assertEqual(actual_X, expected_X) out = torch.empty_like(b.mH if op_out and a.shape == b.shape else b) @@ -764,53 +1053,70 @@ def run_test(a, b, upper, transpose, unitriangular, op_out): self.assertEqual(out, actual_X) self.assertEqual(out, expected_X) - for index_dtype in [torch.int32, torch.int64]: - for (m, k), block_size, noncontiguous in zip(itertools.product([1, 5], repeat=2), [2, 3], [True, False]): - nnz = random.randint(0, m * m) + for (m, k) in itertools.product([2, 3], [1, 3]): 
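The block tests above construct BSR operands either by converting a CSR tensor with to_sparse_bsr or by assembling the 3-D block values directly through torch._sparse_bsr_tensor_unsafe. An illustrative sketch of the blocked layout itself, outside the patch and using a fully dense 4x4 matrix so that every block is stored:

    import torch

    dense = torch.arange(1, 17, dtype=torch.float64).reshape(4, 4)
    bsr = dense.to_sparse_csr().to_sparse_bsr((2, 2))  # 2x2 grid of (2, 2) blocks
    assert bsr.layout == torch.sparse_bsr
    assert bsr.values().shape == (4, 2, 2)             # values are 3-D: (num_blocks, 2, 2)
    assert bsr.crow_indices().tolist() == [0, 2, 4]    # compressed indices address block rows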
+ nnz = random.randint(0, m * m) + if not noncontiguous: + a = self.genSparseCSRTensor((m * block_size, m * block_size), nnz, + dtype=dtype, device=device, index_dtype=index_dtype) + a = a.to_sparse_bsr((block_size, block_size)) + else: a = self.genSparseCSRTensor((m, m), nnz, dtype=dtype, device=device, index_dtype=index_dtype) a_data = make_tensor((nnz, block_size, block_size), dtype=dtype, device=device) a_data = a_data.mT if noncontiguous else a_data # Test column-major blocks - a = torch._sparse_csr_tensor_unsafe(a.crow_indices(), a.col_indices(), a_data, (m * block_size, m * block_size)) - b = make_tensor((m * block_size, k), dtype=dtype, device=device, noncontiguous=noncontiguous) + a = torch._sparse_bsr_tensor_unsafe(a.crow_indices(), a.col_indices(), + a_data, (m * block_size, m * block_size)) + b = make_tensor((m * block_size, k), dtype=dtype, device=device, noncontiguous=noncontiguous) - for (upper, unitriangular, transpose, op_out) in itertools.product([True, False], repeat=4): - run_test(a, b, upper, unitriangular, transpose, op_out) + for (upper, unitriangular, transpose, op_out) in itertools.product([True, False], repeat=4): + run_test(a, b, upper, unitriangular, transpose, op_out) @skipCPUIfNoMklSparse @dtypes(torch.double) def test_mm(self, device, dtype): - def test_shape(di, dj, dk, nnz): + def test_shape(di, dj, dk, nnz0=None, nnz1=None): for index_dtype in [torch.int32, torch.int64]: - x = self.genSparseCSRTensor((di, dj), nnz, device=device, dtype=dtype, index_dtype=index_dtype) - t = torch.randn(di, dk, dtype=dtype, device=device) - y = torch.randn(dj, dk, dtype=dtype, device=device) alpha = random.random() beta = random.random() - # res = beta * t + alpha * (x @ y) - res = torch.addmm(t, x, y, beta=beta, alpha=alpha) - expected = torch.addmm(t, x.to_dense(), y, beta=beta, alpha=alpha) - self.assertEqual(res, expected) - - res = torch.addmm(t, x, y) - expected = torch.addmm(t, x.to_dense(), y) - self.assertEqual(res, expected) - - res = torch.mm(x, y) - expected = torch.mm(x.to_dense(), y) - self.assertEqual(res, expected) + def _test(t, x, y): + # res = beta * t + alpha * (x @ y) + res = torch.addmm(t, x, y, beta=beta, alpha=alpha) + expected = torch.addmm(t, x.to_dense(), y.to_dense(), beta=beta, alpha=alpha) + self.assertEqual(res, expected) + + res = torch.addmm(t, x, y) + expected = torch.addmm(t, x.to_dense(), y.to_dense()) + self.assertEqual(res, expected) + + res = torch.mm(x, y) + expected = torch.mm(x.to_dense(), y.to_dense()) + self.assertEqual(res, expected) + + if nnz0 is None: + nnz0 = random.randint(di * dk // 2, di * dk) + t = torch.randn(di, dj, dtype=dtype, device=device) + x = self.genSparseCSRTensor((di, dk), nnz0, device=device, dtype=dtype, index_dtype=index_dtype) + y = torch.randn(dk, dj, dtype=dtype, device=device) + _test(t, x, y) + + if nnz1 is None: + nnz1 = random.randint(dk * dj // 2, dk * dj) + t = torch.randn(di, dj, dtype=dtype, device=device) + x = torch.randn(di, dk, dtype=dtype, device=device) + y = self.genSparseCSRTensor((dk, dj), nnz1, device=device, dtype=dtype, index_dtype=index_dtype) + _test(t, x, y) for i in range(2, 5): for j in range(2, 8): for k in range(2, 8): - test_shape(i, j, k, i * j // 2) - test_shape(4, 4, 4, 0) + test_shape(i, j, k) + test_shape(4, 4, 4, 0, 0) @skipCPUIfNoMklSparse @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=SM53OrLater and TEST_CUSPARSE_GENERIC, - include_bfloat16=SM80OrLater and TEST_CUSPARSE_GENERIC)) + 
@dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [], + *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else [])) @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2}) def test_sparse_mm(self, device, dtype): def test_shape(d1, d2, d3, nnz, transposed, index_dtype): @@ -827,9 +1133,9 @@ def test_shape(d1, d2, d3, nnz, transposed, index_dtype): test_shape(7, 8, 9, 20, True, index_dtype) @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(*get_all_complex_dtypes(), - *get_all_fp_dtypes(include_half=SM53OrLater and TEST_CUSPARSE_GENERIC, - include_bfloat16=SM80OrLater and TEST_CUSPARSE_GENERIC)) + @dtypesIfCUDA(*floating_and_complex_types_and( + *[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [], + *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else [])) @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2}) def test_sparse_addmm(self, device, dtype): def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None): @@ -861,10 +1167,10 @@ def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None): @dtypes(*floating_and_complex_types()) @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(torch.complex64, - *((torch.complex128,) if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else ()), - *torch.testing.get_all_fp_dtypes(include_bfloat16=SM80OrLater, - include_half=SM53OrLater)) + @dtypesIfCUDA(*floating_types_and(torch.complex64, + *[torch.bfloat16] if SM80OrLater else [], + *[torch.half] if SM53OrLater else [], + *[torch.complex128] if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else [])) @skipCUDAIf( not _check_cusparse_spgemm_available(), "cuSparse Generic API SpGEMM is not available" @@ -873,19 +1179,52 @@ def test_addmm_all_sparse_csr(self, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="all_sparse") + + # Test 0-strided + M = torch.randn(10, 1, device=device).to(dtype).expand(10, 25) + m1 = torch.randn(10, 1, device=device).to(dtype).expand(10, 50) + m2 = torch.randn(50, 25, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="all_sparse") + + # Test beta=0, M=nan + M = torch.full((10, 25), float('nan'), device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, beta=0, layout=torch.sparse_csr, mode="all_sparse") + + # Test transpose + for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): + def maybe_transpose(cond, m): + if not cond: + return m + return m.t().clone(memory_format=torch.contiguous_format).t() + + M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) + m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) + m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, transpose_out=t4, layout=torch.sparse_csr, mode="all_sparse") + + @onlyCPU + @skipCPUIfNoMklSparse + @dtypes(*floating_and_complex_types()) + def test_addmm_dense_result(self, device, dtype): + M = torch.randn(10, 25, 
device=device).to(dtype) + m1 = torch.randn(10, 50, device=device).to(dtype) + m2 = torch.randn(50, 25, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="dense_result") # Test 0-strided M = torch.randn(10, 1, device=device).to(dtype).expand(10, 25) m1 = torch.randn(10, 1, device=device).to(dtype).expand(10, 50) m2 = torch.randn(50, 25, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="dense_result") # Test beta=0, M=nan M = torch.full((10, 25), float('nan'), device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) m2 = torch.randn(50, 25, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, beta=0, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, beta=0, layout=torch.sparse_csr, mode="dense_result") # Test transpose for t1, t2, t3, t4 in itertools.product([True, False], repeat=4): @@ -897,34 +1236,34 @@ def maybe_transpose(cond, m): M = maybe_transpose(t1, torch.randn(10, 25, device=device).to(dtype)) m1 = maybe_transpose(t2, torch.randn(10, 50, device=device).to(dtype)) m2 = maybe_transpose(t3, torch.randn(50, 25, device=device).to(dtype)) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, transpose_out=t4, layout=torch.sparse_csr, all_sparse=True) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, transpose_out=t4, layout=torch.sparse_csr, mode="dense_result") + @parametrize("k", [0, 1, 8]) + @parametrize("n", [0, 1, 10]) + @parametrize("m", [0, 1, 25]) @skipCPUIfNoMklSparse @dtypes(*floating_and_complex_types()) - @dtypesIfCUDA(torch.complex64, - *((torch.complex128,) if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else ()), - *torch.testing.get_all_fp_dtypes(include_bfloat16=SM80OrLater, - include_half=SM53OrLater)) + @dtypesIfCUDA(*floating_types_and(torch.complex64, + *[torch.bfloat16] if SM80OrLater else [], + *[torch.half] if SM53OrLater else [], + *[torch.complex128] if CUSPARSE_SPMM_COMPLEX128_SUPPORTED else [])) @skipCUDAIf( not _check_cusparse_spgemm_available(), "cuSparse Generic API SpGEMM is not available" ) @precisionOverride({torch.double: 1e-8, torch.float: 1e-4, torch.bfloat16: 0.6, torch.half: 1e-1, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - def test_addmm_sizes_all_sparse_csr(self, device, dtype): - for m in [0, 1, 25]: - for n in [0, 1, 10]: - for k in [0, 1, 8]: - M = torch.randn(n, m, device=device).to(dtype) - m1 = torch.randn(n, k, device=device).to(dtype) - m2 = torch.randn(k, m, device=device).to(dtype) - _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, all_sparse=True) - - M = torch.randn(n, m, device=device).to(dtype).to_sparse_csr() - m1 = torch.randn(n, k + 1, device=device).to(dtype).to_sparse_csr() - m2 = torch.randn(k, m, device=device).to(dtype).to_sparse_csr() - self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.addmm(M, m1, m2)) - self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.mm(m1, m2)) + def test_addmm_sizes_all_sparse_csr(self, device, dtype, m, n, k): + M = torch.randn(n, m, device=device).to(dtype) + m1 = torch.randn(n, k, device=device).to(dtype) + m2 = torch.randn(k, m, device=device).to(dtype) + _test_addmm_addmv(self, torch.addmm, M, m1, m2, layout=torch.sparse_csr, mode="all_sparse") + + M = torch.randn(n, m, device=device).to(dtype).to_sparse_csr() + m1 = torch.randn(n, k + 1, 
device=device).to(dtype).to_sparse_csr() + m2 = torch.randn(k, m, device=device).to(dtype).to_sparse_csr() + self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.addmm(M, m1, m2)) + self.assertRaisesRegex(RuntimeError, f"{n}x{k + 1}.*{k}x{m}", lambda: torch.mm(m1, m2)) @skipCPUIfNoMklSparse @dtypes(torch.float) @@ -1000,6 +1339,9 @@ def test2(*, is_sparse): @dtypes(torch.float, torch.double) def test_add(self, device, dtype): def _test_spadd_shape(nnz, shape): + # sparse.to_dense() uses torch.add internally so if torch.add is wrong, + # the dense tensor will be wrong but this test would still pass + # there's a separate test that checks for the correctness of the .to_dense() call x = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) y = torch.randn(*shape, dtype=dtype, device=device) r = random.random() @@ -1021,10 +1363,74 @@ def _test_spadd_shape(nnz, shape): self.assertEqual(res, expected) - _test_spadd_shape(10, [100, 100]) - _test_spadd_shape(0, [100, 100]) - _test_spadd_shape(10, [100, 1]) - _test_spadd_shape(10, [1, 100]) + ns = [2, 5] + batch_shapes = [(), (2,), (2, 3)] + for b, m, n in itertools.product(batch_shapes, ns, ns): + _test_spadd_shape(0, (*b, m, n)) + _test_spadd_shape(m * n // 2, (*b, m, n)) + _test_spadd_shape(m * n, (*b, m, n)) + + @dtypes(torch.float, torch.double) + def test_mul(self, device, dtype): + # TODO: This whole test should be migrated to OpInfos + def _test_spadd_shape(fn, nnz, shape): + x = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + y = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + + # Forward comparison + res_sparse_sparse = fn(y, x) + res_dense_sparse = fn(y.to_dense(), x) + res_sparse_dense = fn(y, x.to_dense()) + expected = fn(y.to_dense(), x.to_dense()).to_sparse_csr() + self.assertEqual(res_sparse_sparse, expected) + # TODO: While result of mul(dense, csr) is csr, it is not fully compressed. + # That means it may contain materialized zeros, since the dense argument + # is converted according to the sparsity pattern of csr. In the future + # we might require the result to be fully compressed. 
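To make the note about materialized zeros concrete, here is an illustrative sketch outside the patch of a CSR tensor that stores an explicit zero and is therefore valid but not fully compressed:

    import torch

    a = torch.sparse_csr_tensor(torch.tensor([0, 1, 2]),
                                torch.tensor([0, 1]),
                                torch.tensor([5., 0.]),  # the 0.0 is kept as a stored value
                                size=(2, 2))
    b = a.to_dense().to_sparse_csr()  # the dense round trip drops the stored zero
    assert a._nnz() == 2 and b._nnz() == 1
    assert torch.equal(a.to_dense(), b.to_dense())  # same matrix, different compression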
+ self.assertEqual(res_dense_sparse.to_dense(), expected.to_dense()) + self.assertEqual(res_sparse_dense.to_dense(), expected.to_dense()) + + # Grad comparison + x = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + y = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + z = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=torch.int32) + + # csr * csr -> csr with csr, csr gradients + x_a = x.clone().requires_grad_() + y_a = y.clone().requires_grad_() + + fn(y_a, x_a).backward(z) + + x_dense_a = x.to_dense().requires_grad_() + y_dense_a = y.to_dense().requires_grad_() + + fn(y_dense_a, x_dense_a).backward(z.to_dense()) + + self.assertEqual(x_a.grad.layout, torch.sparse_csr) + self.assertEqual(y_a.grad.layout, torch.sparse_csr) + + self.assertEqual(x_a.grad.to_dense(), x_dense_a.grad) + self.assertEqual(y_a.grad.to_dense(), y_dense_a.grad) + + # TODO: Currently strided Tensors cannot have csr gradients + # dense * csr -> csr with csr, dense gradients + x_a = x.clone().requires_grad_() + y_a = y.to_dense().clone().requires_grad_() + err_msg = "Function MulBackward0 returned an invalid gradient at index 0 - expected layout Strided but got SparseCsr" + with self.assertRaisesRegex(RuntimeError, err_msg): + fn(y_a, x_a).backward(z) + + # csr * dense -> csr with dense, csr gradients + x_a = x.to_dense().clone().requires_grad_() + y_a = y.clone().requires_grad_() + err_msg = "Function MulBackward0 returned an invalid gradient at index 1 - expected layout Strided but got SparseCsr" + with self.assertRaisesRegex(RuntimeError, err_msg): + fn(y_a, x_a).backward(z) + + _test_spadd_shape(torch.mul, 100, [100, 100]) + _test_spadd_shape(torch.mul, 0, [100, 100]) + _test_spadd_shape(torch.mul, 100, [100, 1]) + _test_spadd_shape(torch.mul, 100, [1, 100]) @skipCPUIfNoMklSparse @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) @@ -1135,7 +1541,6 @@ def run_test(n, k, upper, unitriangular, transpose, zero): run_test(n, k, upper, unitriangular, transpose, zero) @skipCUDAIfRocm - @onlyCUDA @skipCUDAIf( not _check_cusparse_sddmm_available(), "cuSparse Generic API SDDMM is not available" @@ -1162,7 +1567,7 @@ def run_test(c, a, b, op_a, op_b, *, alpha=None, beta=None): out = torch.sparse_csr_tensor( *map(torch.clone, (actual.crow_indices(), actual.col_indices())), torch.empty_like(actual.values()), - size=c.shape + size=actual.shape ) torch.sparse.sampled_addmm(c, a, b, alpha=alpha, beta=beta, out=out) @@ -1171,15 +1576,52 @@ def run_test(c, a, b, op_a, op_b, *, alpha=None, beta=None): self.assertEqual(actual.to_dense(), out.to_dense()) self.assertEqual(actual.to_dense(), expected) + mnk = itertools.product([2, 5], repeat=3) + batch_shapes = [(), (2,), (2, 3)] if self.device_type == 'cuda' else [(), ] + tf = [True, False] for index_dtype in [torch.int32, torch.int64]: - for (m, n, k), noncontiguous in zip(itertools.product([1, 5], repeat=3), [True, False]): + for (m, n, k), b, noncontiguous, bcast_c in itertools.product(mnk, batch_shapes, tf, tf): + if bcast_c and len(b) == 0: + continue nnz = random.randint(0, m * n) - c = self.genSparseCSRTensor((m, n), nnz, dtype=dtype, device=device, index_dtype=index_dtype) - a = make_tensor((m, k), dtype=dtype, device=device, noncontiguous=noncontiguous) - b = make_tensor((k, n), dtype=dtype, device=device, noncontiguous=noncontiguous) + c_batch = () if bcast_c else b + c = self.genSparseCSRTensor((*c_batch, m, n), nnz, dtype=dtype, device=device, 
index_dtype=index_dtype) + a = make_tensor((*b, m, k), dtype=dtype, device=device, noncontiguous=noncontiguous) + b = make_tensor((*b, k, n), dtype=dtype, device=device, noncontiguous=noncontiguous) for op_a, op_b in itertools.product([True, False], repeat=2): run_test(c, a, b, op_a, op_b) + @skipCUDAIfRocm + @skipCUDAIf( + not _check_cusparse_sddmm_available(), + "cuSparse Generic API SDDMM is not available" + ) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_sampled_addmm_autograd(self, device, dtype): + from torch.testing._internal.common_methods_invocations import sample_inputs_sparse_sampled_addmm + + samples = list(sample_inputs_sparse_sampled_addmm(None, device, dtype, requires_grad=True)) + + for sample, dense_covector in zip(samples, [True, False]): + c = sample.input + a = sample.args[0] + b = sample.args[1] + + # Compute sparse result + output = torch.sparse.sampled_addmm(c, a, b, **sample.kwargs) + covector = torch.randn_like(output).to_dense() if dense_covector else torch.randn_like(output) + output.backward(covector) + + # Compute dense result and compare with sparse result + c1, a1, b1 = map(lambda x: x.detach().to_dense().requires_grad_(True), [c, a, b]) + dense_output = sample.kwargs['alpha'] * (a1 @ b1) * torch.ones_like(c).to_dense() + sample.kwargs['beta'] * c1 + self.assertEqual(output, dense_output) + dense_covector = covector.to_dense() + dense_output.backward(dense_covector) + self.assertEqual(c.grad, c1.grad) + self.assertEqual(a.grad, a1.grad) + self.assertEqual(b.grad, b1.grad) + @skipCUDAIfRocm @onlyCUDA @skipCUDAIf(True, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177") @@ -1220,21 +1662,21 @@ def test_sampled_addmm_errors(self, device, dtype): # mat1 must be a matrix with self.assertRaisesRegex(RuntimeError, r"Expected mat1 to be a matrix"): - torch.sparse.sampled_addmm(a_sparse, a.unsqueeze(0), a) + torch.sparse.sampled_addmm(a_sparse, a[..., 0, :], a) # mat2 must be a matrix with self.assertRaisesRegex(RuntimeError, r"Expected mat2 to be a matrix"): - torch.sparse.sampled_addmm(a_sparse, a, a.unsqueeze(0)) + torch.sparse.sampled_addmm(a_sparse, a, a[..., 0, :]) a = make_tensor((2, 2), dtype=dtype, device=device) b = make_tensor((3, 3), dtype=dtype, device=device) b_sparse = b.to_sparse_csr() - with self.assertRaisesRegex(RuntimeError, r"self dim 0 must match mat1 dim 0"): + with self.assertRaisesRegex(RuntimeError, r"self.shape\[-2\] must match mat1.shape\[-2\]"): torch.sparse.sampled_addmm(b_sparse, a, a) b = make_tensor((2, 3), dtype=dtype, device=device) b_sparse = b.to_sparse_csr() - with self.assertRaisesRegex(RuntimeError, r"self dim 1 must match mat2 dim 1"): + with self.assertRaisesRegex(RuntimeError, r"self.shape\[-1\] must match mat2.shape\[-1\]"): torch.sparse.sampled_addmm(b_sparse, a, a) a = make_tensor((2, 2), dtype=dtype, device=device) @@ -1245,7 +1687,8 @@ def test_sampled_addmm_errors(self, device, dtype): with self.assertRaisesRegex(RuntimeError, r"Expected mat2 to have strided layout"): torch.sparse.sampled_addmm(a_sparse, a, a_sparse) - @dtypes(*get_all_dtypes()) + @skipMeta + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_coo_csr_conversion(self, device, dtype): for m, n in itertools.product([5, 2, 0], [5, 2, 0]): size = (m, n) @@ -1255,9 +1698,20 @@ def test_coo_csr_conversion(self, device, dtype): self.assertEqual(csr_sparse.to_dense(), dense) + @skipMeta + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, 
torch.bfloat16)) + def test_csr_coo_conversion(self, device, dtype): + for m, n in itertools.product([5, 2, 0], [5, 2, 0]): + size = (m, n) + dense = make_tensor(size, dtype=dtype, device=device) + csr_sparse = dense.to_sparse_csr() + coo_sparse = csr_sparse.to_sparse() + + self.assertEqual(coo_sparse.to_dense(), dense) + @ops(_sparse_csr_ops) def test_sparse_csr_consistency(self, device, dtype, op): - samples = op.sample_inputs(device, dtype) + samples = list(op.sample_inputs(device, dtype)) # Fail early to prevent silent success with this test ndims_equals_2d = (s.input.ndim == 2 for s in samples) @@ -1269,7 +1723,9 @@ def test_sparse_csr_consistency(self, device, dtype, op): # Sparse CSR only supports 2D tensors as inputs if sample.input.ndim != 2: continue - + # Reductions on sparse CSR require keepdim=True + if isinstance(op, ReductionOpInfo): + continue expected = op(sample.input) assert torch.is_tensor(expected) output = op(sample.input.to_sparse_csr()) @@ -1326,10 +1782,7 @@ def test_sparse_csr_unary_out(self, device, dtype, op): index_dtype=sample.input.crow_indices().dtype) op(sample.input, *sample.args, **sample.kwargs, out=out) - self.assertEqual(out.values(), expect.values()) - self.assertEqual(out.crow_indices(), expect.crow_indices()) - self.assertEqual(out.col_indices(), expect.col_indices()) - self.assertEqual(out._nnz(), expect._nnz()) + self.assertEqual(out, expect) @ops(sparse_csr_unary_ufuncs) def test_sparse_csr_unary_inplace(self, device, dtype, op): @@ -1361,12 +1814,178 @@ def test_sparse_csr_unary_inplace(self, device, dtype, op): actual = op.inplace_variant(sample.input, *sample.args, **sample.kwargs) self.assertIs(actual, sample.input) - self.assertEqual(actual.values(), expect.values()) - self.assertEqual(actual.crow_indices(), expect.crow_indices()) - self.assertEqual(actual.col_indices(), expect.col_indices()) - self.assertEqual(actual._nnz(), expect._nnz()) + self.assertEqual(actual, expect) + + @ops(sparse_csr_unary_ufuncs, dtypes=OpDTypes.supported, allowed_dtypes=[torch.double, torch.cdouble]) + def test_autograd_sparse_csr_unary(self, device, dtype, op): + if op.name not in UNARY_EWISE_CSR_ALLOW_AUTOGRAD: + self.skipTest(f"Skipped! 
Unary op {op.name} not supported with CSR input and autograd") + + samples = list(op.sample_inputs(device, dtype)) - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_bfloat16=False)) + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.input.ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples.") + + for sample in samples: + sparse_input = sample.input.to_sparse_csr().requires_grad_(True) + + def fn(input): + output = op.gradcheck_wrapper(op.get_op(), input, *sample.args, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + # Compute sparse result + output = fn(sparse_input) + covector = torch.randn_like(output) + output.backward(covector) + self.assertTrue(torch.is_tensor(sparse_input.grad)) + self.assertTrue(sparse_input.grad.is_sparse_csr) + + # Compute dense result and compare with sparse result + dense_input = sparse_input.detach().to_dense().requires_grad_(True) + dense_output = fn(dense_input) + dense_covector = covector.to_dense() + dense_output.backward(dense_covector) + self.assertEqual(sparse_input.grad, dense_input.grad) + + @skipCUDAIfRocm + @skipCUDAIf( + not _check_cusparse_sddmm_available(), + "cuSparse Generic API SDDMM is not available" + ) + @dtypes(torch.float64) + def test_autograd_dense_output_addmm(self, device, dtype): + from torch.testing._internal.common_methods_invocations import sample_inputs_addmm + + samples = list(sample_inputs_addmm(None, device, dtype, requires_grad=True)) + + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.args[0].ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples to convert to sparse.") + + for sample in samples: + a = sample.args[0].relu().to_sparse_csr() + + # This path tests the autograd path wrt dense inputs + for addmm in [torch.addmm, torch.sparse.addmm]: + + def fn(c, b): + output = addmm(c, a, b, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + self.assertTrue(torch.autograd.gradcheck(fn, [sample.input, sample.args[1]], fast_mode=True)) + + # noncontiguous + c = make_tensor(sample.input.shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + b = make_tensor(sample.args[1].shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + self.assertTrue(torch.autograd.gradcheck(fn, [c, b], fast_mode=True)) + + # Now test the autograd path wrt sparse inputs + for reverse in [True, False]: + c, b = sample.input, sample.args[1] + if reverse and a.shape != b.shape: + continue + + def fn(a): + inputs = (c, b, a) if reverse else (c, a, b) + output = addmm(*inputs, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + # gradcheck doesn't work for sparse CSR yet, compare against dense path + # Compute sparse result + a = a.detach().requires_grad_(True) + output = fn(a) + covector = torch.randn_like(output) + output.backward(covector) + self.assertTrue(torch.is_tensor(a.grad)) + if addmm == torch.sparse.addmm: + self.assertTrue(a.grad.is_sparse_csr) + else: + self.assertTrue(a.grad.layout == torch.strided) + + # Compute dense result and compare with sparse result + dense_a = a.detach().to_dense().requires_grad_(True) + dense_output = fn(dense_a) + 
self.assertEqual(output, dense_output) + dense_covector = covector.to_dense() + dense_output.backward(dense_covector) + + if addmm == torch.sparse.addmm: + self.assertEqual(a.grad, dense_a.grad.sparse_mask(a)) + else: + self.assertEqual(a.grad, dense_a.grad) + + @skipCUDAIfRocm + @skipCPUIfNoMklSparse + @dtypes(torch.float64) + def test_autograd_dense_output_addmv(self, device, dtype): + from torch.testing._internal.common_methods_invocations import sample_inputs_addmv + + samples = list(sample_inputs_addmv(None, device, dtype, requires_grad=True)) + + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.args[0].ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples to convert to sparse.") + + for sample in samples: + # TODO: Remove detach once we have autograd support for CSR input + a = sample.args[0].to_sparse_csr().detach() + + def fn(c, b): + output = torch.addmv(c, a, b, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + self.assertTrue(torch.autograd.gradcheck(fn, [sample.input, sample.args[1]], fast_mode=True)) + + # noncontiguous + c = make_tensor(sample.input.shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + b = make_tensor(sample.args[1].shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) + self.assertTrue(torch.autograd.gradcheck(fn, [c, b], fast_mode=True)) + + @ops(binary_ops_with_dense_output, dtypes=OpDTypes.supported, allowed_dtypes=[torch.double, ]) + def test_autograd_dense_output(self, device, dtype, op): + if op.name == "mv" and no_mkl_sparse and self.device_type == 'cpu': + self.skipTest("MKL Sparse is not available") + if op.name == "mv" and TEST_WITH_ROCM and self.device_type == 'cuda': + # mv currently work only on CUDA + self.skipTest("ROCm is not supported") + + samples = list(op.sample_inputs(device, dtype, requires_grad=True)) + + # Fail early to prevent silent success with this test + ndims_equals_2d = (s.input.ndim == 2 for s in samples) + if not any(ndims_equals_2d): + raise ValueError("Expected at least one 2D tensor in samples.") + + # Here we assume that the signature is op(sparse_input, dense_input) -> dense_output + for sample in samples: + # TODO: Remove detach once we have autograd support for CSR input + sparse_input = sample.input.to_sparse_csr().detach() + + def fn(*args): + output = op.gradcheck_wrapper(op.get_op(), sparse_input, *args, **sample.kwargs) + if sample.output_process_fn_grad is not None: + return sample.output_process_fn_grad(output) + return output + + self.assertTrue(torch.autograd.gradcheck(fn, sample.args, fast_mode=True)) + + # noncontiguous + args = [make_tensor(a.shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) for a in sample.args] + self.assertTrue(torch.autograd.gradcheck(fn, args, fast_mode=True)) + + @dtypes(*all_types_and_complex()) def test_direct_coo_csr_conversion(self, device, dtype): for m, n in itertools.product([5, 2, 0], [5, 2, 0]): size = (m, n) @@ -1376,7 +1995,25 @@ def test_direct_coo_csr_conversion(self, device, dtype): self.assertEqual(coo_sparse.to_sparse_csr().to_sparse_coo(), coo_sparse) @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_sum(self, device, dtype): + def run_test(shape, nnz, index_type): + a = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, 
index_dtype=index_dtype) + self.assertEqual(a.sum(), a.values().sum()) + if dtype in floating_types(): + a.requires_grad_(True) + a.sum().backward() + self.assertEqual(a.grad, torch.ones(shape, dtype=dtype, device=device)) + for shape, index_dtype in itertools.product( + [(10, 5), (10, 10)], + [torch.int32, torch.int64]): + run_test(shape, 0, index_dtype) + run_test(shape, max(shape), index_dtype) + run_test(shape, shape[0] * shape[1], index_dtype) + + + @skipMeta + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_transpose(self, device, dtype): def run_test(shape, nnz, index_type, dim0, dim1): @@ -1397,21 +2034,173 @@ def run_test(shape, nnz, index_type, dim0, dim1): # TODO: This is a stopgap for a rigorous extension of our autograd tests # to test the functionality of detach @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_exercise_detach(self, device, dtype): shape = (3, 3) nnz = 4 for index_dtype in [torch.int32, torch.int64]: inp = self.genSparseCSRTensor(shape, nnz, dtype=dtype, device=device, index_dtype=index_dtype) detached_inp = inp.detach() - self.assertEqual(inp.values(), detached_inp.values()) - self.assertEqual(inp.crow_indices(), detached_inp.crow_indices()) - self.assertEqual(inp.col_indices(), detached_inp.col_indices()) + self.assertEqual(inp, detached_inp) + + def _convert_to_layout(self, a, target_layout): + """ + Helper function to call the correct layout conversion + with reasonable defaults for the block size. Clearly there + is a need for a to.layout overload. + """ + if target_layout is torch.sparse_csr: + return a.to_sparse_csr() + if target_layout is torch.sparse_csc: + return a.to_sparse_csc() + if target_layout is torch.sparse_bsr: + return a.to_sparse_bsr((2, 2)) + if target_layout is torch.sparse_bsc: + return a.to_sparse_bsc((2, 2)) + raise NotImplementedError(repr(a)) + + def _construct_sp_matrix(self, tensor, layout): + if tensor.layout in [torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.strided]: + tensor = tensor.to_dense() + else: + raise NotImplementedError(repr(tensor)) + if layout is torch.sparse_csr: + return sp.csr_matrix(tensor.cpu().numpy()) + if layout is torch.sparse_csc: + return sp.csc_matrix(tensor.cpu().numpy()) + if layout is torch.sparse_bsr: + return sp.bsr_matrix(tensor.cpu().numpy()) + # No native scipy BSC support? + raise NotImplementedError(repr(tensor)) + @skipMeta + @all_sparse_compressed_layouts('to_layout') + @all_sparse_compressed_layouts('from_layout') + def test_compressed_layout_conversions_coverage(self, device, from_layout, to_layout): + """ + This test performs a smoke test for covered conversion and verifies + that an exception is thrown for unsupported conversions. 
+ """ + + def _to_from_layout(layout_a, layout_b): + a = make_tensor((6, 10), dtype=torch.float, device=device) + expect_error = (layout_a in [torch.sparse_csc, torch.sparse_bsc] + or layout_b in [torch.sparse_csc, torch.sparse_bsc]) + expect_error = expect_error or (layout_a, layout_b) == (torch.sparse_bsr, torch.sparse_bsr) + expect_error = expect_error or (layout_a, layout_b) == (torch.sparse_bsr, torch.sparse_csr) + # CSC to CSR conversion is supported + if layout_a is torch.sparse_csc and layout_b is torch.sparse_csr: + expect_error = False + # CSC to CSC conversion is supported + if layout_a is torch.sparse_csc and layout_b is torch.sparse_csc: + expect_error = False + if expect_error: + with self.assertRaises(RuntimeError): + b = self._convert_to_layout(a, layout_a) + self._convert_to_layout(b, layout_b) + else: + b = self._convert_to_layout(a, layout_a) + c = self._convert_to_layout(b, layout_b) + if (layout_a is not torch.sparse_bsr and layout_b is not torch.sparse_bsr): + self.assertEqual(a.to_dense(), c.to_dense()) + + _to_from_layout(from_layout, to_layout) + + @skipMeta + @all_sparse_compressed_layouts() + def test_dense_to_from_sparse_compressed(self, device, layout): + """ + This test tests conversion from dense to/from CSR and CSC + by comparing to SciPy's implementation. + + TODO: Eventually this is meant to be merged into test_compressed_layout_conversions_coverage + """ + if layout is torch.sparse_bsc: + # TODO: Remove this once support has been enabled + return + if layout is torch.sparse_bsr: + # TODO: Remove this once support has been enabled + return + + for shape in [(0, 10), (6, 0), (6, 10), (0, 0)]: + dense = make_tensor(shape, dtype=torch.float, device=device) + dense = dense.relu() # Introduce some sparsity + sp_matrix = self._construct_sp_matrix(dense, layout) + pt_matrix = self._convert_to_layout(dense, layout) + + compressed_indices_mth = { + torch.sparse_csr: torch.Tensor.crow_indices, + torch.sparse_csc: torch.Tensor.ccol_indices, + }[layout] + + plain_indices_mth = { + torch.sparse_csr: torch.Tensor.col_indices, + torch.sparse_csc: torch.Tensor.row_indices, + }[layout] + + self.assertEqual(layout, pt_matrix.layout) + self.assertEqual(sp_matrix.shape, pt_matrix.shape) + self.assertEqual(torch.tensor(sp_matrix.indptr, dtype=torch.int64), compressed_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.indices, dtype=torch.int64), plain_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values()) + + self.assertEqual(dense, pt_matrix.to_dense()) + + @skipMeta + @all_sparse_compressed_layouts() + @coalescedonoff + @dtypes(torch.double) + def test_sparse_to_sparse_compressed(self, device, dtype, coalesced, layout): + """ + This test tests conversion from COO to CSR and CSC and CSC to CSR and CSC + by comparing to SciPy's implementation. 
+ + TODO: Eventually this is meant to be merged into test_compressed_layout_conversions_coverage + """ + if layout is torch.sparse_bsc: + # TODO: Remove this once support has been enabled + return + if layout is torch.sparse_bsr: + # TODO: Remove this once support has been enabled + return + + for shape in [(0, 10), (6, 0), (6, 10), (0, 0)]: + sparse_dim = 2 + nnz = shape[0] * shape[1] // 2 + sparse, _, _ = self.genSparseTensor(shape, sparse_dim, nnz, coalesced, device, dtype) + sp_matrix = self._construct_sp_matrix(sparse, layout) + pt_matrix = self._convert_to_layout(sparse, layout) + + compressed_indices_mth = { + torch.sparse_csr: torch.Tensor.crow_indices, + torch.sparse_csc: torch.Tensor.ccol_indices, + }[layout] + + plain_indices_mth = { + torch.sparse_csr: torch.Tensor.col_indices, + torch.sparse_csc: torch.Tensor.row_indices, + }[layout] + + self.assertEqual(layout, pt_matrix.layout) + self.assertEqual(sp_matrix.shape, pt_matrix.shape) + self.assertEqual(torch.tensor(sp_matrix.indptr, dtype=torch.int64), compressed_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.indices, dtype=torch.int64), plain_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values()) + + sparse_csc = sparse.to_sparse_csc() + sp_matrix = self._construct_sp_matrix(sparse_csc, layout) + pt_matrix = self._convert_to_layout(sparse_csc, layout) + + self.assertEqual(layout, pt_matrix.layout) + self.assertEqual(sp_matrix.shape, pt_matrix.shape) + self.assertEqual(torch.tensor(sp_matrix.indptr, dtype=torch.int64), compressed_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.indices, dtype=torch.int64), plain_indices_mth(pt_matrix)) + self.assertEqual(torch.tensor(sp_matrix.data), pt_matrix.values()) # e.g., TestSparseCSRCPU and TestSparseCSRCUDA instantiate_device_type_tests(TestSparseCSR, globals()) +instantiate_device_type_tests(TestSparseCompressed, globals()) if __name__ == '__main__': run_tests() diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index fecb4735976e..b4f37cc1558e 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -10,12 +10,14 @@ import inspect from torch.testing._internal.common_utils import \ - (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA, TEST_MKL) + (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA, TEST_MKL, first_sample, TEST_WITH_ROCM, + make_tensor) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, ops, dtypes, onlyNativeDeviceTypes, - skipCPUIfNoFFT, deviceCountAtLeast, onlyCUDA, OpDTypes, skipIf) + skipCPUIfNoFFT, deviceCountAtLeast, onlyCUDA, OpDTypes, skipIf, toleranceOverride, tol) from torch.testing._internal.common_methods_invocations import ( - spectral_funcs, SpectralFuncInfo, SpectralFuncType) + spectral_funcs, SpectralFuncType) +from torch.testing._internal.common_cuda import SM53OrLater from setuptools import distutils from typing import Optional, List @@ -110,102 +112,27 @@ def _stft_reference(x, hop_length, window): X[:, m] = torch.fft.fft(slc * window) return X -# Tests of functions related to Fourier analysis in the torch.fft namespace -class TestFFT(TestCase): - exact_dtype = True - # rocFFT requires/assumes that the input to hipfftExecC2R or hipfftExecZ2D - # is of the form that is a valid output from a real to complex transform - # (i.e. 
it cannot be a set of random numbers) - # So for ROCm, call np.fft.rfftn and use its output as the input - # for testing ops that call hipfftExecC2R - def _generate_valid_rocfft_input(self, input, op, s, dim, norm): - def get_op_name(op): - if type(op) == SpectralFuncInfo: - return op.name - else: - return op.__name__ - - op_name = get_op_name(op) - - # pick ops that call hipfftExecC2R or hipfftExecZ2D - if op_name in ("fft.irfft", "fft.hfft"): - n = s - # figure out fft_size - if dim is None and n is None: - dim = tuple(range(-(input.dim()), 0)) - s = [input.size(d) for d in dim] - elif dim is None and n is not None: - dim = -1 - s = [n] - elif dim is not None and n is None: - s = [input.size(d) for d in [dim]] - else: - s = [n] - fft_size = s[-1] +def skip_helper_for_fft(device, dtype): + device_type = torch.device(device).type + if dtype not in (torch.half, torch.complex32): + return - # make fft_size even to match rocfft behavior to cuda and numpy - if (fft_size % 2) != 0: - n = fft_size + 1 + if device_type == 'cpu': + raise unittest.SkipTest("half and complex32 are not supported on CPU") + if TEST_WITH_ROCM: + raise unittest.SkipTest("half and complex32 are not supported on ROCM") + if not SM53OrLater: + raise unittest.SkipTest("half and complex32 are only supported on CUDA device with SM>53") - # generate Hermitian symmetric input - if torch.is_complex(input): - valid_input = torch.fft.rfft(input.real, n=n, dim=dim, norm=norm) - else: - valid_input = torch.fft.rfft(input, n=n, dim=dim, norm=norm) - - return (valid_input, n, dim, norm) - elif op_name in ("fft.irfftn", "fft.hfftn"): - # figure out fft_size - if dim is None and s is None: - dim = tuple(range(-(input.dim()), 0)) - s = [input.size(d) for d in dim] - elif dim is None and s is not None: - dim = tuple(range(-(len(s)), 0)) - elif dim is not None and s is None: - s = [input.size(d) for d in dim] - - fft_size = s[-1] - - # make fft_size even to match rocfft behavior to cuda and numpy - if (fft_size % 2) != 0: - if type(s) is tuple: - s = list(s) - s[-1] = fft_size + 1 - - # generate Hermitian symmetric input - if torch.is_complex(input): - valid_input = torch.fft.rfftn(input.real, s=s, dim=dim, norm=norm) - else: - valid_input = torch.fft.rfftn(input, s=s, dim=dim, norm=norm) - return (valid_input, s, dim, norm) - elif op_name in ("fft_irfft2", "fft_hfft2"): - # figure out fft_size - if dim is None and s is None: - dim = tuple(range(-(2), 0)) - s = [input.size(d) for d in dim] - elif dim is None and s is not None: - dim = tuple(range(-(len(s)), 0)) - elif dim is not None and s is None: - s = [input.size(d) for d in dim] - fft_size = s[-1] - - # make fft_size even to match rocfft behavior to cuda and numpy - if (fft_size % 2) != 0: - if type(s) is tuple: - s = list(s) - s[-1] = fft_size + 1 - # generate Hermitian symmetric input - if torch.is_complex(input): - valid_input = torch.fft.rfft2(input.real, s=s, dim=dim, norm=norm) - else: - valid_input = torch.fft.rfft2(input, s=s, dim=dim, norm=norm) - return (valid_input, s, dim, norm) - else: - return (input, s, dim, norm) + +# Tests of functions related to Fourier analysis in the torch.fft namespace +class TestFFT(TestCase): + exact_dtype = True @onlyNativeDeviceTypes - @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.OneD]) + @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.OneD], + allowed_dtypes=(torch.float, torch.cfloat)) def test_reference_1d(self, device, dtype, op): if op.ref is None: raise unittest.SkipTest("No reference 
implementation") @@ -239,10 +166,6 @@ def test_reference_1d(self, device, dtype, op): input = args[0] args = args[1:] - if torch.version.hip is not None and input.device.type == 'cuda': - input, args[0], args[1], args[2] = self._generate_valid_rocfft_input( - input, op, args[0], args[1], args[2]) - expected = op.ref(input.cpu().numpy(), *args) exact_dtype = dtype in (torch.double, torch.complex128) actual = op(input, *args) @@ -250,20 +173,39 @@ def test_reference_1d(self, device, dtype, op): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + torch.chalf : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double, torch.complex32, torch.complex64, torch.complex128) def test_fft_round_trip(self, device, dtype): + skip_helper_for_fft(device, dtype) # Test that round trip through ifft(fft(x)) is the identity - test_args = list(product( - # input - (torch.randn(67, device=device, dtype=dtype), - torch.randn(80, device=device, dtype=dtype), - torch.randn(12, 14, device=device, dtype=dtype), - torch.randn(9, 6, 3, device=device, dtype=dtype)), - # dim - (-1, 0), - # norm - (None, "forward", "backward", "ortho") - )) + if dtype not in (torch.half, torch.complex32): + test_args = list(product( + # input + (torch.randn(67, device=device, dtype=dtype), + torch.randn(80, device=device, dtype=dtype), + torch.randn(12, 14, device=device, dtype=dtype), + torch.randn(9, 6, 3, device=device, dtype=dtype)), + # dim + (-1, 0), + # norm + (None, "forward", "backward", "ortho") + )) + else: + # cuFFT supports powers of 2 for half and complex half precision + test_args = list(product( + # input + (torch.randn(64, device=device, dtype=dtype), + torch.randn(128, device=device, dtype=dtype), + torch.randn(4, 16, device=device, dtype=dtype), + torch.randn(8, 6, 2, device=device, dtype=dtype)), + # dim + (-1, 0), + # norm + (None, "forward", "backward", "ortho") + )) fft_functions = [(torch.fft.fft, torch.fft.ifft)] # Real-only functions @@ -282,13 +224,17 @@ def test_fft_round_trip(self, device, dtype): } y = backward(forward(x, **kwargs), **kwargs) + if x.dtype is torch.half and y.dtype is torch.complex32: + # Since type promotion currently doesn't work with complex32 + # manually promote `x` to complex32 + x = x.to(torch.complex32) # For real input, ifft(fft(x)) will convert to complex self.assertEqual(x, y, exact_dtype=( forward != torch.fft.fft or x.is_complex())) # Note: NumPy will throw a ValueError for an empty input @onlyNativeDeviceTypes - @ops(spectral_funcs) + @ops(spectral_funcs, allowed_dtypes=(torch.half, torch.float, torch.complex32, torch.cfloat)) def test_empty_fft(self, device, dtype, op): t = torch.empty(1, 0, device=device, dtype=dtype) match = r"Invalid number of data points \([-\d]*\) specified" @@ -296,6 +242,16 @@ def test_empty_fft(self, device, dtype, op): with self.assertRaisesRegex(RuntimeError, match): op(t) + @onlyNativeDeviceTypes + def test_empty_ifft(self, device): + t = torch.empty(2, 1, device=device, dtype=torch.complex64) + match = r"Invalid number of data points \([-\d]*\) specified" + + for f in [torch.fft.irfft, torch.fft.irfft2, torch.fft.irfftn, + torch.fft.hfft, torch.fft.hfft2, torch.fft.hfftn]: + with self.assertRaisesRegex(RuntimeError, match): + f(t) + @onlyNativeDeviceTypes def test_fft_invalid_dtypes(self, device): t = torch.randn(64, device=device, dtype=torch.complex128) @@ -311,8 +267,11 @@ def test_fft_invalid_dtypes(self, device): 
@skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.int8, torch.float, torch.double, torch.complex64, torch.complex128) + @dtypes(torch.int8, torch.half, torch.float, torch.double, + torch.complex32, torch.complex64, torch.complex128) def test_fft_type_promotion(self, device, dtype): + skip_helper_for_fft(device, dtype) + if dtype.is_complex or dtype.is_floating_point: t = torch.randn(64, device=device, dtype=dtype) else: @@ -320,8 +279,10 @@ def test_fft_type_promotion(self, device, dtype): PROMOTION_MAP = { torch.int8: torch.complex64, + torch.half: torch.complex32, torch.float: torch.complex64, torch.double: torch.complex128, + torch.complex32: torch.complex32, torch.complex64: torch.complex64, torch.complex128: torch.complex128, } @@ -330,17 +291,27 @@ def test_fft_type_promotion(self, device, dtype): PROMOTION_MAP_C2R = { torch.int8: torch.float, + torch.half: torch.half, torch.float: torch.float, torch.double: torch.double, + torch.complex32: torch.half, torch.complex64: torch.float, torch.complex128: torch.double, } - R = torch.fft.hfft(t) + if dtype in (torch.half, torch.complex32): + # cuFFT supports powers of 2 for half and complex half precision + # NOTE: With hfft and default args where output_size n=2*(input_size - 1), + # we make sure that logical fft size is a power of two. + x = torch.randn(65, device=device, dtype=dtype) + R = torch.fft.hfft(x) + else: + R = torch.fft.hfft(t) self.assertEqual(R.dtype, PROMOTION_MAP_C2R[dtype]) if not dtype.is_complex: PROMOTION_MAP_R2C = { torch.int8: torch.complex64, + torch.half: torch.complex32, torch.float: torch.complex64, torch.double: torch.complex128, } @@ -352,14 +323,38 @@ def test_fft_type_promotion(self, device, dtype): allowed_dtypes=[torch.half, torch.bfloat16]) def test_fft_half_and_bfloat16_errors(self, device, dtype, op): # TODO: Remove torch.half error when complex32 is fully implemented - x = torch.randn(8, 8, device=device).to(dtype) - with self.assertRaisesRegex(RuntimeError, "Unsupported dtype "): - op(x) + sample = first_sample(self, op.sample_inputs(device, dtype)) + device_type = torch.device(device).type + if dtype is torch.half and device_type == 'cuda' and TEST_WITH_ROCM: + err_msg = "Unsupported dtype " + elif dtype is torch.half and device_type == 'cuda' and not SM53OrLater: + err_msg = "cuFFT doesn't support signals of half type with compute capability less than SM_53" + else: + err_msg = "Unsupported dtype " + with self.assertRaisesRegex(RuntimeError, err_msg): + op(sample.input, *sample.args, **sample.kwargs) + + @onlyNativeDeviceTypes + @ops(spectral_funcs, allowed_dtypes=(torch.half, torch.chalf)) + def test_fft_half_and_chalf_not_power_of_two_error(self, device, dtype, op): + t = make_tensor(13, 13, device=device, dtype=dtype) + err_msg = "cuFFT only supports dimensions whose sizes are powers of two" + with self.assertRaisesRegex(RuntimeError, err_msg): + op(t) + + if op.ndimensional in (SpectralFuncType.ND, SpectralFuncType.TwoD): + kwargs = {'s': (12, 12)} + else: + kwargs = {'n': 12} + + with self.assertRaisesRegex(RuntimeError, err_msg): + op(t, **kwargs) # nd-fft tests @onlyNativeDeviceTypes @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') - @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.ND]) + @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.ND], + allowed_dtypes=(torch.cfloat, torch.cdouble)) def test_reference_nd(self, device, dtype, op): if op.ref is None: raise unittest.SkipTest("No reference implementation") @@ -383,9 +378,6 @@ def 
test_reference_nd(self, device, dtype, op): input = torch.randn(*shape, device=device, dtype=dtype) for norm in norm_modes: - if torch.version.hip is not None: - input, s, dim, norm = self._generate_valid_rocfft_input( - input, op, s, dim, norm) expected = op.ref(input.cpu().numpy(), s, dim, norm) exact_dtype = dtype in (torch.double, torch.complex128) actual = op(input, s, dim, norm) @@ -393,8 +385,15 @@ def test_reference_nd(self, device, dtype, op): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + torch.chalf : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double, + torch.complex32, torch.complex64, torch.complex128) def test_fftn_round_trip(self, device, dtype): + skip_helper_for_fft(device, dtype) + norm_modes = (None, "forward", "backward", "ortho") # input_ndim, dim @@ -416,7 +415,11 @@ def test_fftn_round_trip(self, device, dtype): (torch.fft.ihfftn, torch.fft.hfftn)] for input_ndim, dim in transform_desc: - shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + if dtype in (torch.half, torch.complex32): + # cuFFT supports powers of 2 for half and complex half precision + shape = itertools.islice(itertools.cycle((2, 4, 8)), input_ndim) + else: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) x = torch.randn(*shape, device=device, dtype=dtype) for (forward, backward), norm in product(fft_functions, norm_modes): @@ -428,8 +431,13 @@ def test_fftn_round_trip(self, device, dtype): kwargs = {'s': s, 'dim': dim, 'norm': norm} y = backward(forward(x, **kwargs), **kwargs) # For real input, ifftn(fftn(x)) will convert to complex - self.assertEqual(x, y, exact_dtype=( - forward != torch.fft.fftn or x.is_complex())) + if x.dtype is torch.half and y.dtype is torch.chalf: + # Since type promotion currently doesn't work with complex32 + # manually promote `x` to complex32 + self.assertEqual(x.to(torch.chalf), y) + else: + self.assertEqual(x, y, exact_dtype=( + forward != torch.fft.fftn or x.is_complex())) @onlyNativeDeviceTypes @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.ND], @@ -454,8 +462,13 @@ def test_fftn_invalid(self, device, dtype, op): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double) def test_hfftn(self, device, dtype): + skip_helper_for_fft(device, dtype) + # input_ndim, dim transform_desc = [ *product(range(2, 5), (None, (0,), (0, -1))), @@ -468,8 +481,10 @@ def test_hfftn(self, device, dtype): for input_ndim, dim in transform_desc: actual_dims = list(range(input_ndim)) if dim is None else dim - - shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) + if dtype is torch.half: + shape = tuple(itertools.islice(itertools.cycle((2, 4, 8)), input_ndim)) + else: + shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) expect = torch.randn(*shape, device=device, dtype=dtype) input = torch.fft.ifftn(expect, dim=dim, norm="ortho") @@ -486,8 +501,13 @@ def test_hfftn(self, device, dtype): @skipCPUIfNoFFT @onlyNativeDeviceTypes - @dtypes(torch.float, torch.double) + @toleranceOverride({ + torch.half : tol(1e-2, 1e-2), + }) + @dtypes(torch.half, torch.float, torch.double) def test_ihfftn(self, device, dtype): + skip_helper_for_fft(device, dtype) + # input_ndim, dim transform_desc = [ *product(range(2, 5), (None, (0,), (0, -1))), 
@@ -499,7 +519,11 @@ def test_ihfftn(self, device, dtype): ] for input_ndim, dim in transform_desc: - shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) + if dtype is torch.half: + shape = tuple(itertools.islice(itertools.cycle((2, 4, 8)), input_ndim)) + else: + shape = tuple(itertools.islice(itertools.cycle(range(4, 9)), input_ndim)) + input = torch.randn(*shape, device=device, dtype=dtype) expect = torch.fft.ifftn(input, dim=dim, norm="ortho") @@ -552,31 +576,18 @@ def fn(t: torch.Tensor, s: Optional[List[int]], dim: List[int] = (-2, -1), norm: torch_fns = (torch_fn, torch.jit.script(fn)) - if torch.version.hip is not None: - valid_input_default, s, _, norm = self._generate_valid_rocfft_input( - input, torch_fn, s, None, norm) - else: - valid_input_default = input - # Once with dim defaulted - input_np = valid_input_default.cpu().numpy() + input_np = input.cpu().numpy() expected = numpy_fn(input_np, s, norm=norm) for fn in torch_fns: - actual = fn(valid_input_default, s, norm=norm) + actual = fn(input, s, norm=norm) self.assertEqual(actual, expected) # Once with explicit dims dim = (1, 0) - if torch.version.hip is not None: - valid_input_explicit, s, dim, norm = self._generate_valid_rocfft_input( - input, torch_fn, s, dim, norm) - input_np = valid_input_explicit.cpu().numpy() - else: - valid_input_explicit = input - expected = numpy_fn(input_np, s, dim, norm) for fn in torch_fns: - actual = fn(valid_input_explicit, s, dim, norm) + actual = fn(input, s, dim, norm) self.assertEqual(actual, expected) @skipCPUIfNoFFT @@ -879,9 +890,16 @@ def librosa_stft(x, n_fft, hop_length, win_length, window, center): input_1d = x.dim() == 1 if input_1d: x = x.view(1, -1) + + # NOTE: librosa 0.9 changed default pad_mode to 'constant' (zero padding) + # however, we use the pre-0.9 default ('reflect') + pad_mode = 'reflect' + result = [] for xi in x: - ri = librosa.stft(xi.cpu().numpy(), n_fft, hop_length, win_length, window, center=center) + ri = librosa.stft(xi.cpu().numpy(), n_fft=n_fft, hop_length=hop_length, + win_length=win_length, window=window, center=center, + pad_mode=pad_mode) result.append(torch.from_numpy(np.stack([ri.real, ri.imag], -1))) result = torch.stack(result, 0) if input_1d: diff --git a/test/test_stateless.py b/test/test_stateless.py index f092f36b2e65..e3e3f03277d8 100644 --- a/test/test_stateless.py +++ b/test/test_stateless.py @@ -1,10 +1,13 @@ # Owner(s): ["module: nn"] import unittest +import sys +import os +import subprocess import torch -import torch.nn.utils._stateless as _stateless +import torch.nn.utils.stateless as stateless from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_utils import run_tests, TestCase @@ -41,7 +44,7 @@ def _run_call_with_mock_module(self, module, device='cpu', prefix=''): # the parameters represent an identity function contrary to the # existing params in module. So here we expect the result to be the # same as the input if the weight swapping went well. 
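# Reviewer aside -- illustrative sketch, not part of this patch. A minimal, standalone
# version of the check described in the comment above, using the public
# torch.nn.utils.stateless API that this file now imports: functional_call runs the
# module with substitute parameters and leaves the module's own state untouched.
import torch
from torch import nn
from torch.nn.utils import stateless

mod = nn.Linear(1, 1)
orig_weight = mod.weight.clone()
x = torch.rand(1, 1)
identity_params = {'weight': torch.ones(1, 1), 'bias': torch.zeros(1)}
out = stateless.functional_call(mod, identity_params, x)
assert torch.allclose(out, x)                # substituted weights act as an identity map
assert torch.equal(mod.weight, orig_weight)  # the module's registered weight is unchanged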
- res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) self.assertEqual(x, res) # check that the weight remain unmodified cur_weight = to_check.l1.weight @@ -70,6 +73,7 @@ def test_functional_call_with_jit(self): self._run_call_with_mock_module(traced_module) @unittest.skipIf(not TEST_MULTIGPU, 'multi-GPU not supported') + @unittest.skip("This doesn't work right now") def test_functional_call_with_data_parallel(self): module = MockModule() module.cuda() @@ -85,7 +89,7 @@ def test_functional_call_with_gradient(self): parameters = {'l1.weight': weight, 'l1.bias': bias, 'buffer': buffer} - res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) # Check that a backward step calculates the gradient of the supplied parameters res.backward() self.assertIsNotNone(weight.grad) @@ -104,13 +108,13 @@ def test_functional_batch_norm(self): rm = torch.zeros(10) parameters = {'running_mean': rm} prev_rm = module.running_mean.clone() - res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) cur_rm = module.running_mean self.assertEqual(cur_rm, prev_rm) self.assertEqual(rm, torch.full((10,), 12.8)) # Now run functional without reparametrization and check that the module has # been updated - res = _stateless.functional_call(module, {}, x) + res = stateless.functional_call(module, {}, x) self.assertEqual(module.running_mean, torch.full((10,), 12.8)) def test_circular_references(self): @@ -126,7 +130,7 @@ def test_circular_references(self): 'l1.m.buffer': buffer} prev_weight = module.l1.weight.clone() prev_buffer = module.buffer.clone() - res = _stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) self.assertEqual(x, res) # check that the weights remain unmodified and were correctly accesed cur_weight = module.l1.weight @@ -146,11 +150,66 @@ def test_reparametrized_module_change_parametrization_original(self): parameters = {'l1.parametrizations.weight.original': torch.nn.Parameter(torch.tensor([[1.0]])), 'l1.bias': torch.tensor([0.0]), 'buffer': torch.tensor([0.0])} - res = torch.nn.utils._stateless.functional_call(module, parameters, x) + res = stateless.functional_call(module, parameters, x) self.assertEqual(x, res) # verify that the spectral normalization is still applied self.assertTrue('l1.parametrizations.weight.original' in dict(module.named_parameters())) self.assertEqual(orig_sn_weight, module.l1.weight) + def test_setattr(self): + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer('foo', torch.zeros(())) + + def forward(self, x): + self.foo = self.foo + 1 + return x + self.foo + + a = {'foo': torch.zeros(())} + mod = Foo() + stateless.functional_call(mod, a, torch.ones(())) + self.assertEqual(mod.foo, torch.zeros(())) + self.assertEqual(a['foo'], torch.ones(())) + + +class TestStatelessDeprecation(TestCase): + def test_private_stateless_warns(self): + script = """ +import torch +import warnings + +with warnings.catch_warnings(record=True) as w: + from torch.nn.utils import _stateless + +exit(len(w)) +""" + try: + subprocess.check_output( + [sys.executable, '-W', 'all', '-c', script], + stderr=subprocess.STDOUT, + # On Windows, opening the subprocess with the default CWD makes `import torch` + # fail, so just set CWD to this script's directory + cwd=os.path.dirname(os.path.realpath(__file__)),) + except subprocess.CalledProcessError as 
e: + self.assertEqual(e.returncode, 1) + else: + self.assertTrue(False, "No warning was raised.") + +class TestPythonOptimizeMode(TestCase): + def test_runs_with_optimize_flag(self): + script = """ +import torch +""" + try: + subprocess.check_output( + [sys.executable, '-OO', '-c', script], + stderr=subprocess.STDOUT, + # On Windows, opening the subprocess with the default CWD makes `import torch` + # fail, so just set CWD to this script's directory + cwd=os.path.dirname(os.path.realpath(__file__)),) + except subprocess.CalledProcessError as e: + self.assertFalse(e.returncode, "Import failed while running python in optimized mode") + if __name__ == '__main__': run_tests() diff --git a/test/test_subclass.py b/test/test_subclass.py new file mode 100644 index 000000000000..2eb45c361ed9 --- /dev/null +++ b/test/test_subclass.py @@ -0,0 +1,245 @@ +# Owner(s): ["module: nn"] + +import tempfile +import torch +from copy import deepcopy +from functools import partial +from torch import nn +from torch.nn.utils.parametrize import register_parametrization, remove_parametrizations +from torch.nn.modules.lazy import LazyModuleMixin +from torch.testing._internal.common_utils import ( + TestCase, run_tests, parametrize, subtest, instantiate_parametrized_tests) +from torch.testing._internal.common_subclass import subclass_db, DiagTensorBelow +from torch.testing._internal.logging_tensor import LoggingTensor +from torch.utils._pytree import tree_map +from unittest import expectedFailure + +# The current test methodology in this file is to test a variety of real use cases +# with a set of fully-fledged tensor subclasses. In the future, this may change +# to more narrowly specify toy subclasses for each of the specific invariants under +# test, avoiding the need to maintain the set of fully-fledged tensor subclasses. + + +# Decorator for parametrizing tests across the various tensor classes. +parametrize_tensor_cls = parametrize("tensor_cls", [ + subtest(tensor_cls, name=info.name) for tensor_cls, info in subclass_db.items()]) + + +class TestSubclass(TestCase): + def _create_tensor(self, tensor_cls): + return subclass_db[tensor_cls].create_fn(3) + + @parametrize_tensor_cls + @parametrize("tensor_requires_grad", [False, True]) + def test_param_invariants(self, tensor_cls, tensor_requires_grad): + x = self._create_tensor(tensor_cls).requires_grad_(tensor_requires_grad) + param = nn.Parameter(x, requires_grad=(not tensor_requires_grad)) + + self.assertIsInstance(param, nn.Parameter) + # Ensure requires_grad passed to Parameter's constructor takes precedence. + self.assertEqual(param.requires_grad, not tensor_requires_grad) + + # Ensure original tensor is not mutated by Parameter construction. + self.assertNotIsInstance(x, nn.Parameter) + self.assertEqual(x.requires_grad, tensor_requires_grad) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_deepcopy(self, tensor_cls, as_param): + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + x_copy = deepcopy(x) + self.assertEqual(x, x_copy) + self.assertEqual(x.__class__, x_copy.__class__) + self.assertIsNot(x, x_copy) + self.assertIsInstance(x_copy, tensor_cls) + if as_param: + # Deepcopy should preserve both custom type and "parameter-ness". 
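# Reviewer aside -- illustrative sketch, not part of this patch. The same invariant the
# comment above describes, shown for a plain tensor: deepcopy of an nn.Parameter yields a
# distinct object that is still a Parameter and compares equal to the original.
import torch
from copy import deepcopy
from torch import nn

p = nn.Parameter(torch.randn(3))
p_copy = deepcopy(p)
assert p_copy is not p
assert isinstance(p_copy, nn.Parameter)
assert torch.equal(p, p_copy)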
+ self.assertIsInstance(x_copy, nn.Parameter) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_serialization(self, tensor_cls, as_param): + with tempfile.TemporaryFile() as f: + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + torch.save(x, f) + f.seek(0) + x_loaded = torch.load(f) + + self.assertEqual(x, x_loaded) + self.assertIsNot(x, x_loaded) + self.assertIsInstance(x_loaded, tensor_cls) + if as_param: + # Serialization should preserve both custom type and "parameter-ness". + self.assertIsInstance(x_loaded, nn.Parameter) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_repr(self, tensor_cls, as_param): + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + str_repr = x.__repr__() + if tensor_cls is not torch.Tensor: + self.assertEqual(str_repr.count(f"{tensor_cls.__name__}("), 1) + self.assertEqual(str_repr.count("Parameter"), 1 if as_param else 0) + + @parametrize_tensor_cls + @parametrize("as_param", [False, True]) + def test_type_propagation(self, tensor_cls, as_param): + x = self._create_tensor(tensor_cls) + if as_param: + x = nn.Parameter(x) + + # Call the add operator to produce an output tensor. + output = x + self._create_tensor(torch.Tensor) + + # Custom type should be propagated across operations if closed under the op, but + # "parameter-ness" should not be. + if subclass_db[tensor_cls].closed_under_ops: + self.assertIsInstance(output, tensor_cls) + else: + self.assertIsInstance(output, torch.Tensor) + self.assertNotIsInstance(output, nn.Parameter) + + @parametrize_tensor_cls + def test_module_optimization(self, tensor_cls): + create_fn = partial(self._create_tensor, tensor_cls) + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.p1 = nn.Parameter(create_fn()) + + self.p_list = nn.ParameterList([create_fn() for _ in range(3)]) + self.p_list.append(create_fn()) + + self.p_dict = nn.ParameterDict({ + 'foo': create_fn(), + 'bar': create_fn(), + }) + self.p_dict['baz'] = create_fn() + + with torch.no_grad(): + nn.init.normal_(self.p1) + for p in self.p_list: + nn.init.uniform_(p) + for _, p in self.p_dict.items(): + nn.init.uniform_(p) + + def forward(self, x): + out = self.p1 + x + for p in self.p_list: + out = p + out + + for _, v in self.p_dict.items(): + out = v + out + + return out + + m = MyModule() + self.assertEqual(len(m.state_dict()), 8) + + optimizer = torch.optim.SGD(m.parameters(), lr=0.1) + m(create_fn()).sum().backward(torch.tensor(1)) + optimizer.step() + + @parametrize_tensor_cls + @parametrize("leave_parametrized", [False, True]) + def test_parametrization(self, tensor_cls, leave_parametrized): + # TODO: Either implement set_() properly for these tensor subclasses or apply a + # more general fix to avoid the need for special set_() handling. For now, skip + # testing these as they're expected to fail. 
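# Reviewer aside -- illustrative sketch, not part of this patch. With a plain nn.Linear,
# the parametrization machinery exercised below behaves like this: the registered
# parametrization recomputes `weight` from the stashed original, and
# remove_parametrizations with leave_parametrized=True bakes the computed value back
# into an ordinary Parameter.
import torch
from torch import nn
from torch.nn.utils.parametrize import register_parametrization, remove_parametrizations

class Negate(nn.Module):
    def forward(self, X):
        return -X

lin = nn.Linear(2, 2)
register_parametrization(lin, 'weight', Negate())
assert torch.equal(lin.weight, -lin.parametrizations.weight.original)
remove_parametrizations(lin, 'weight', leave_parametrized=True)
assert isinstance(lin.weight, nn.Parameter)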
+ if tensor_cls in [LoggingTensor, DiagTensorBelow]: + return + + create_fn = partial(self._create_tensor, tensor_cls) + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.weight = nn.Parameter(create_fn()) + + def forward(self, x): + return self.weight + x + + class MyParametrization(nn.Module): + def forward(self, X): + return -X + + m = MyModule() + self.assertEqual(len(m.state_dict()), 1) + register_parametrization(m, 'weight', MyParametrization()) + self.assertIsInstance(m.weight, tensor_cls) + output = m(self._create_tensor(torch.Tensor)) + self.assertIsInstance(output, tensor_cls) + remove_parametrizations(m, 'weight', leave_parametrized=leave_parametrized) + + # Lazy modules with custom tensors are not supported yet. + @expectedFailure + @parametrize_tensor_cls + def test_lazy_module(self, tensor_cls): + if tensor_cls is torch.Tensor: + self.fail('dummy fail for base tensor until the test passes for subclasses') + + class MyLazyModule(LazyModuleMixin, nn.Module): + def __init__(self): + super().__init__() + self.param = nn.UninitializedParameter() + + def initialize_parameters(self, input) -> None: # type: ignore[override] + if self.has_uninitialized_params(): + with torch.no_grad(): + self.param.materialize(input.shape) + nn.init.uniform_(self.param) + + def forward(self, x): + return self.param + x + + m = MyLazyModule() + self.assertTrue(m.has_uninitialized_params()) + output = m(self._create_tensor(tensor_cls)) + self.assertFalse(m.has_uninitialized_params()) + self.assertIsInstance(m.param, tensor_cls) + + def test_non_rewrapping_torch_dispatch_subclass_as_parameter_throws_for_detach(self): + + # Define a subclass that does not rewrap for any function in its __torch_dispatch__ impl. + class NonRewrappingTensor(torch.Tensor): + @staticmethod + def __new__( + cls, t: torch.Tensor + ): + r = super(NonRewrappingTensor, cls)._make_wrapper_subclass( + cls, t.shape, dtype=t.dtype, requires_grad=t.requires_grad, device=t.device) + return r + + def __init__(self, t) -> None: + self.tensor: torch.Tensor = t + + __torch_function__ = torch._C._disabled_torch_function_impl + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + + def unwrap(e) -> torch.Tensor: + if isinstance(e, NonRewrappingTensor): + t = e.tensor + return t + else: + return e + + r = func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)) + # Return an unwrapped tensor no longer of original subclass type. 
+ return r + + with self.assertRaisesRegex(RuntimeError, r"requires that detach\(\) returns an instance of the same type"): + param = nn.Parameter(NonRewrappingTensor(torch.randn(3))) + +instantiate_parametrized_tests(TestSubclass) + +if __name__ == '__main__': + run_tests() diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 62d595373b3a..c341ef36dae1 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -13,15 +13,17 @@ from torch.testing import make_tensor from torch.testing._internal.common_utils import ( TestCase, run_tests, do_test_empty_full, TEST_WITH_ROCM, suppress_warnings, - torch_to_numpy_dtype_dict, slowTest, - TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS) + torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, slowTest, + TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS, parametrize) from torch.testing._internal.common_device_type import ( expectedFailureMeta, instantiate_device_type_tests, deviceCountAtLeast, onlyNativeDeviceTypes, onlyCPU, largeTensorTest, precisionOverride, dtypes, onlyCUDA, skipCPUIf, dtypesIfCUDA, skipMeta, get_all_device_types) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes + all_types_and_complex_and, all_types_and, floating_and_complex_types, + floating_types, floating_and_complex_types_and, integral_types_and, get_all_dtypes ) +from torch.testing._creation import float_to_corresponding_complex_type_map from torch.utils.dlpack import to_dlpack @@ -147,7 +149,7 @@ def test_vander_types(self, device, dtype): exact_dtype=False) def test_cat_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16, torch.chalf): x = torch.tensor([[1, 2], [3, 4]], dtype=dt, device=device) expected1 = torch.tensor([[1, 2], [3, 4], [1, 2], [3, 4]], dtype=dt, device=device) @@ -157,7 +159,7 @@ def test_cat_all_dtypes_and_devices(self, device): self.assertEqual(torch.cat((x, x), 1), expected2) def test_fill_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16, torch.chalf): for x in [torch.tensor((10, 10), dtype=dt, device=device), torch.empty(10000, dtype=dt, device=device)]: # large tensor numel = x.numel() @@ -311,7 +313,7 @@ def run_test(shape, device, diagonal, dtype): (3, 1), (5, 3, 1), (7, 5, 3, 1), # very fat matrices (1, 3), (5, 1, 3), (7, 5, 1, 3), # very thin matrices (1, 3, 3, 3), (3, 1, 3, 3, 3)] # unsqueezed batch dimensions - dtypes = [dtype for dtype in get_all_dtypes() if dtype != torch.bfloat16] + dtypes = all_types_and_complex_and(torch.half, torch.bool) for s, d, dtype in product(shapes, diagonals, dtypes): run_test(s, device, d, dtype) @@ -508,12 +510,12 @@ def test_block_diag_scipy(self, device): self.assertEqual(torch_result, scipy_result) @onlyNativeDeviceTypes - @dtypes(torch.float32, torch.float64) + @dtypes(torch.half, torch.float32, torch.float64) def test_torch_complex(self, device, dtype): real = torch.tensor([1, 2], device=device, dtype=dtype) imag = torch.tensor([3, 4], device=device, dtype=dtype) z = torch.complex(real, imag) - complex_dtype = torch.complex64 if dtype == torch.float32 else torch.complex128 + complex_dtype = float_to_corresponding_complex_type_map[dtype] self.assertEqual(torch.tensor([1.0 + 3.0j, 2.0 + 4.0j], dtype=complex_dtype), z) @onlyNativeDeviceTypes @@ -531,12 +533,12 @@ def 
test_torch_polar(self, device, dtype): @onlyNativeDeviceTypes @dtypes(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64, - torch.float16, torch.complex64, torch.complex128, torch.bool) + torch.complex64, torch.complex128, torch.bool) def test_torch_complex_floating_dtype_error(self, device, dtype): for op in (torch.complex, torch.polar): a = torch.tensor([1, 2], device=device, dtype=dtype) b = torch.tensor([3, 4], device=device, dtype=dtype) - error = r"Expected both inputs to be Float or Double tensors but " \ + error = r"Expected both inputs to be Half, Float or Double tensors but " \ r"got [A-Za-z]+ and [A-Za-z]+" with self.assertRaisesRegex(RuntimeError, error): op(a, b) @@ -626,16 +628,22 @@ def test_cat_out(self, device): y = torch.randn((4, 6), device=device) with self.assertRaisesRegex( - RuntimeError, r"unsupported operation:.* input tensor 0"): + RuntimeError, + r"unsupported operation: some elements of the input tensor and " + r"the written-to tensor refer to a single memory location."): torch.cat([x, y], dim=0, out=x) with self.assertRaisesRegex( - RuntimeError, r"unsupported operation:.* input tensor 1"): + RuntimeError, + r"unsupported operation: some elements of the input tensor and " + r"the written-to tensor refer to a single memory location."): torch.cat([x, y], dim=0, out=y) z = torch.zeros((4, 6), device=device) with self.assertRaisesRegex( - RuntimeError, r"unsupported operation:.* input tensor 1"): + RuntimeError, + r"unsupported operation: some elements of the input tensor and " + r"the written-to tensor refer to a single memory location."): torch.cat([y, z], out=z[:2, :]) w = y.view(-1).clone() @@ -739,8 +747,7 @@ def test_cat_out_memory_format(self, device): self.assertTrue(res1_cpu.is_contiguous(memory_format=torch.contiguous_format)) # Case 2: if out= is not the correct shape then the output it is resized internally - # - For the CPU variant the memory format is that of the first tensor - # - For the CUDA variant it only propagates memory format if all the tensors have + # - For both CPU and CUDA variants, it only propagates memory format if all the tensors have # the same memory format, otherwise it just uses contiguous_format as a default out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format) @@ -751,7 +758,7 @@ def test_cat_out_memory_format(self, device): res2_cpu = torch.cat((a_cpu, b_cpu), out=out_cpu) self.assertTrue(res2_cuda.is_contiguous(memory_format=torch.contiguous_format)) - self.assertTrue(res2_cpu.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(res2_cpu.is_contiguous(memory_format=torch.contiguous_format)) out_cuda = torch.empty((0), device=device).contiguous(memory_format=torch.contiguous_format) # a_cuda and c_cuda have same memory_format @@ -890,7 +897,7 @@ def _hvd_split_helper(self, torch_fn, np_fn, op_name, inputs, device, dtype, dim bound = dim + 2 * (dim == 0) + (dim == 2) error_expected = len(shape) < bound or (not isinstance(arg, list) and shape[direction] % arg != 0) - t = make_tensor(shape, device, dtype) + t = make_tensor(shape, dtype=dtype, device=device) t_np = t.cpu().numpy() if not error_expected: @@ -1009,8 +1016,7 @@ def _test_special_stacks(self, dim, at_least_dim, torch_fn, np_fn, device, dtype np_fn(np_input) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_hstack_column_stack(self, device, dtype): ops = 
((torch.hstack, np.hstack), (torch.column_stack, np.column_stack)) for torch_op, np_op in ops: @@ -1029,8 +1035,7 @@ def test_hstack_column_stack(self, device, dtype): torch_result) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_vstack_row_stack(self, device, dtype): ops = ((torch.vstack, np.vstack), (torch.row_stack, np.row_stack)) for torch_op, np_op in ops: @@ -1047,8 +1052,7 @@ def test_vstack_row_stack(self, device, dtype): self.assertEqual(actual, expected) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_dstack(self, device, dtype): self._test_special_stacks(2, 3, torch.dstack, np.dstack, device, dtype) for i in range(5): @@ -1600,6 +1604,10 @@ def test_cartesian_prod(self, device): def test_combinations(self, device): a = torch.tensor([1, 2, 3], device=device) + c = torch.combinations(a, r=0) + expected = torch.empty(0, dtype=a.dtype, device=device) + self.assertEqual(c, expected) + c = torch.combinations(a, r=1) expected = torch.tensor(list(combinations(a, r=1)), device=device) self.assertEqual(c, expected) @@ -1752,7 +1760,7 @@ def test_random_from_to_bool(self, device): lambda: t.random_(from_, to_) ) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_full_range(self, device, dtype): size = 2000 alpha = 0.1 @@ -1786,7 +1794,7 @@ def test_random_full_range(self, device, dtype): self.assertTrue(from_ <= t.to(torch.double).min() < (from_ + delta)) self.assertTrue((to_inc_ - delta) < t.to(torch.double).max() <= to_inc_) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_from_to(self, device, dtype): size = 2000 alpha = 0.1 @@ -1875,7 +1883,7 @@ def test_random_from_to(self, device, dtype): lambda: t.random_(from_, to_) ) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_to(self, device, dtype): size = 2000 alpha = 0.1 @@ -1933,7 +1941,7 @@ def test_random_to(self, device, dtype): lambda: t.random_(from_, to_) ) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.bfloat16, torch.half)) def test_random_default(self, device, dtype): size = 2000 alpha = 0.1 @@ -1960,11 +1968,12 @@ def test_empty_full(self, device): torch_device = torch.device(device) device_type = torch_device.type + dtypes = get_all_dtypes(include_half=False, include_bfloat16=False, include_complex32=True) if device_type == 'cpu': - do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) + do_test_empty_full(self, dtypes, torch.strided, torch_device) if device_type == 'cuda': - do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, None) - do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) + do_test_empty_full(self, dtypes, torch.strided, None) + do_test_empty_full(self, dtypes, torch.strided, torch_device) # TODO: this test should be updated @suppress_warnings @@ -2053,6 +2062,10 @@ def test_zeros(self, device): expected = torch.tensor([[0., 0.], [0., 0.]], device=device, dtype=torch.complex64) self.assertEqual(complexTensor, expected) + complexHalfTensor = torch.zeros(2, 2, device=device, 
dtype=torch.complex32) + expected = torch.tensor([[0., 0.], [0., 0.]], device=device, dtype=torch.complex32) + self.assertEqual(complexHalfTensor, expected) + # TODO: this test should be updated def test_zeros_out(self, device): shape = (3, 4) @@ -2085,6 +2098,10 @@ def test_ones(self, device): expected = torch.tensor([[True, True]], device=device, dtype=torch.bool) self.assertEqual(res1, expected) + # test chalf + self.assertEqual(torch.ones(100, 100, device=device, dtype=torch.chalf), + torch.ones(100, 100, device=device, dtype=torch.cfloat), exact_dtype=False) + # TODO: this test should be updated @onlyCPU def test_constructor_dtypes(self, device): @@ -2099,6 +2116,9 @@ def test_constructor_dtypes(self, device): self.assertIs(torch.float32, torch.get_default_dtype()) self.assertIs(torch.FloatStorage, torch.Storage) + # only floating-point types are supported as the default type + self.assertRaises(TypeError, lambda: torch.set_default_tensor_type('torch.IntTensor')) + torch.set_default_dtype(torch.float64) self.assertIs(torch.float64, torch.get_default_dtype()) self.assertIs(torch.DoubleStorage, torch.Storage) @@ -2117,13 +2137,21 @@ def test_constructor_dtypes(self, device): self.assertIs(torch.float64, torch.get_default_dtype()) self.assertIs(torch.cuda.DoubleStorage, torch.Storage) - # don't support integral or sparse default types. - self.assertRaises(TypeError, lambda: torch.set_default_tensor_type('torch.IntTensor')) - self.assertRaises(TypeError, lambda: torch.set_default_dtype(torch.int64)) - # don't allow passing dtype to set_default_tensor_type self.assertRaises(TypeError, lambda: torch.set_default_tensor_type(torch.float32)) + # don't allow passing dtype to set_default_dtype + for t in all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.qint8): + # only floating-point types are supported as the default type + if t in ( + torch.half, + torch.float, + torch.double, + torch.bfloat16): + torch.set_default_dtype(t) + else: + self.assertRaises(TypeError, lambda: torch.set_default_dtype(t)) + torch.set_default_tensor_type(default_type) # TODO: this test should be updated @@ -2651,8 +2679,17 @@ def test_empty_tensor_props(self, device): y = torch.empty(tuple(size_ones_instead_of_zeros), device=device) self.assertEqual(x.stride(), y.stride()) + @onlyNativeDeviceTypes + def test_empty_overflow(self, device): + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + torch.empty([2, 4, 2**29, 2**29], dtype=torch.float64) + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + torch.empty([8, 8, 2**29, 2**29], dtype=torch.float64) + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + torch.empty_strided([8, 8], [2**61, 1], dtype=torch.float64) + def test_eye(self, device): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if dtype == torch.bfloat16: continue # Test the RuntimeError is raised when either m or n is a negative number @@ -2685,8 +2722,7 @@ def test_eye(self, device): self.assertEqual(res1, res2) @precisionOverride({torch.float: 1e-8, torch.double: 1e-10}) - @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types()) def test_linspace_vs_numpy(self, device, dtype): start = -0.0316082797944545745849609375 + (0.8888888888j if dtype.is_complex else 0) end = .0315315723419189453125 + (0.444444444444j if dtype.is_complex else 0) 
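# Reviewer aside -- illustrative sketch, not part of this patch. The linspace test above
# compares torch.linspace against NumPy elementwise; a minimal standalone version of that
# check (real-valued float64 only) looks like this.
import numpy as np
import torch

start, end, steps = -0.0316082797944545745849609375, 0.0315315723419189453125, 50
t = torch.linspace(start, end, steps, dtype=torch.float64)
a = np.linspace(start, end, steps, dtype=np.float64)
assert np.allclose(t.numpy(), a)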
@@ -2723,7 +2759,7 @@ def test_logspace_vs_numpy_complex(self, device, dtype): device, dtype) @precisionOverride({torch.float: 1e-6, torch.double: 1e-10}) - @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) + @dtypes(*floating_types()) def test_logspace_vs_numpy(self, device, dtype): start = -0.0316082797944545745849609375 end = .0315315723419189453125 @@ -2786,43 +2822,49 @@ def test_tensor_ctor_device_inference(self, device): sparse_size, dtype=torch.float64) self.assertEqual(sparse_with_dtype.device, torch.device('cpu')) + def _test_signal_window_functions(self, name, dtype, device, **kwargs): + import scipy.signal as signal + + torch_method = getattr(torch, name + '_window') + if not dtype.is_floating_point: + with self.assertRaisesRegex(RuntimeError, r'floating point'): + torch_method(3, dtype=dtype) + return + for size in [0, 1, 2, 5, 10, 50, 100, 1024, 2048]: + for periodic in [True, False]: + res = torch_method(size, periodic=periodic, **kwargs, device=device, dtype=dtype) + # NB: scipy always returns a float64 result + ref = torch.from_numpy(signal.get_window((name, *(kwargs.values())), size, fftbins=periodic)) + self.assertEqual(res, ref, exact_dtype=False) + with self.assertRaisesRegex(RuntimeError, r'not implemented for sparse types'): + torch_method(3, layout=torch.sparse_coo) + self.assertTrue(torch_method(3, requires_grad=True).requires_grad) + self.assertFalse(torch_method(3).requires_grad) + @onlyNativeDeviceTypes @precisionOverride({torch.bfloat16: 5e-2, torch.half: 1e-3}) @unittest.skipIf(not TEST_SCIPY, "Scipy not found") @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16, torch.half, torch.long) @dtypes(torch.float, torch.double, torch.long) - def test_signal_window_functions(self, device, dtype): - import scipy.signal as signal - - def test(name, kwargs): - torch_method = getattr(torch, name + '_window') - if not dtype.is_floating_point: - with self.assertRaisesRegex(RuntimeError, r'floating point'): - torch_method(3, dtype=dtype) - return - for size in [0, 1, 2, 5, 10, 50, 100, 1024, 2048]: - for periodic in [True, False]: - res = torch_method(size, periodic=periodic, **kwargs, device=device, dtype=dtype) - # NB: scipy always returns a float64 result - ref = torch.from_numpy(signal.get_window((name, *(kwargs.values())), size, fftbins=periodic)) - self.assertEqual(res, ref, exact_dtype=False) - with self.assertRaisesRegex(RuntimeError, r'not implemented for sparse types'): - torch_method(3, layout=torch.sparse_coo) - self.assertTrue(torch_method(3, requires_grad=True).requires_grad) - self.assertFalse(torch_method(3).requires_grad) - - for window in ['hann', 'hamming', 'bartlett', 'blackman']: - test(window, kwargs={}) + @parametrize("window", ['hann', 'hamming', 'bartlett', 'blackman']) + def test_signal_window_functions(self, device, dtype, window): + self._test_signal_window_functions(window, dtype, device) + @onlyNativeDeviceTypes + @precisionOverride({torch.bfloat16: 5e-2, torch.half: 1e-3}) + @unittest.skipIf(not TEST_SCIPY, "Scipy not found") + @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16, torch.half, torch.long) + @dtypes(torch.float, torch.double, torch.long) + def test_kaiser_window(self, device, dtype): for num_test in range(50): - test('kaiser', kwargs={'beta': random.random() * 30}) + self._test_signal_window_functions('kaiser', dtype, device, beta=random.random() * 30) def test_tensor_factories_empty(self, device): # ensure we can create empty tensors from each factory function shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 
2, 0, 0)] for shape in shapes: - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16, torch.chalf): self.assertEqual(shape, torch.zeros(shape, device=device, dtype=dt).shape) self.assertEqual(shape, torch.zeros_like(torch.zeros(shape, device=device, dtype=dt)).shape) @@ -2843,7 +2885,8 @@ def test_tensor_factories_empty(self, device): self.assertEqual(shape, torch.randint(6, shape, device=device, dtype=dt).shape) self.assertEqual(shape, torch.randint_like(torch.zeros(shape, device=device, dtype=dt), 6).shape) - if dt not in {torch.double, torch.float, torch.half, torch.bfloat16, torch.complex64, torch.complex128}: + if dt not in {torch.double, torch.float, torch.half, torch.bfloat16, + torch.complex32, torch.complex64, torch.complex128}: self.assertRaises(RuntimeError, lambda: torch.rand(shape, device=device, dtype=dt).shape) if dt == torch.double or dt == torch.float or dt.is_complex: @@ -2908,8 +2951,8 @@ def test_arange_bfloat16(self, device): bfloat16_tensor = torch.arange(0, 6, step=2, dtype=torch.bfloat16, device=device) self.assertEqual(ref_tensor, bfloat16_tensor) - @dtypes(*get_all_dtypes(include_bool=False, include_half=False)) - @dtypesIfCUDA(*get_all_dtypes(include_bool=False, include_half=True)) + @dtypes(*all_types_and_complex_and(torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.bfloat16)) def test_linspace(self, device, dtype): _from = random.random() to = _from + random.random() @@ -3026,12 +3069,12 @@ def _test_linspace(self, device, dtype, steps): # See NOTE [Linspace+Logspace precision override] @skipCPUIf(True, "compares with CPU") @precisionOverride({torch.half: 0.0039 + LINSPACE_LOGSPACE_EXTRA_EPS}) - @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_linspace_device_vs_cpu(self, device, dtype): self._test_linspace(device, dtype, steps=10) @skipCPUIf(True, "compares with CPU") - @dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_linspace_special_steps(self, device, dtype): for steps in self.LINSPACE_LOGSPACE_SPECIAL_STEPS: self._test_linspace(device, dtype, steps=steps) @@ -3072,10 +3115,9 @@ def test_logspace_special_steps(self, device, dtype): self._test_logspace(device, dtype, steps=steps) self._test_logspace_base2(device, dtype, steps=steps) - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) - @dtypesIfCUDA(*((get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) - if TEST_WITH_ROCM - else get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) + @dtypes(*all_types_and(torch.bfloat16)) + @dtypesIfCUDA(*integral_types_and(torch.half, torch.bfloat16, torch.float32, torch.float64) if TEST_WITH_ROCM else + all_types_and(torch.half, torch.bfloat16)) def test_logspace(self, device, dtype): _from = random.random() to = _from + random.random() @@ -3335,7 +3377,7 @@ def test_normal_std_error(self, device): std = torch.tensor(-1, dtype=torch.float32, device=device) for input in [0, a]: - with self.assertRaisesRegex(RuntimeError, r'normal_ expects std >= 0.0'): + with self.assertRaisesRegex(RuntimeError, r'normal expects std >= 0.0, but found std'): torch.normal(input, -1, (10,)) with self.assertRaisesRegex(RuntimeError, r'normal expects all elements of std >= 0.0'): @@ -3453,7 +3495,7 @@ def seed(generator): self.assertTrue((res1 >= 0).all().item()) 
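# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above or below.
# The recurring @dtypes(...) edits in this diff swap the deprecated
# get_all_*_dtypes() helpers for the composable ones in
# torch.testing._internal.common_dtype (the module the updated tests import
# from). A rough equivalence of what the new decorators expand to:
import torch
from torch.testing._internal.common_dtype import (
    floating_types, floating_types_and, all_types_and_complex_and)

fp_only = floating_types()                                  # torch.float32, torch.float64
fp_plus = floating_types_and(torch.half, torch.bfloat16)    # plus the listed extras
full_set = all_types_and_complex_and(                       # every integral, floating and
    torch.half, torch.bool, torch.bfloat16)                 # complex dtype, plus the extras
# ---------------------------------------------------------------------------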
@dtypes(torch.half, torch.float, torch.bfloat16, torch.double, - torch.complex64, torch.complex128) + torch.complex32, torch.complex64, torch.complex128) def test_randn(self, device, dtype): SIZE = 100 for size in [0, SIZE]: @@ -3464,7 +3506,7 @@ def test_randn(self, device, dtype): torch.randn(size, size, out=res2) self.assertEqual(res1, res2) - @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + @dtypes(torch.float, torch.double, torch.complex32, torch.complex64, torch.complex128) def test_rand(self, device, dtype): SIZE = 100 for size in [0, SIZE]: @@ -3488,9 +3530,13 @@ def test_randperm(self, device): for n in (5, 100, 50000, 100000): # Ensure both integer and floating-point numbers are tested. Half follows an execution path that is # different from others on CUDA. - for dtype in (torch.long, torch.half, torch.float): + for dtype in (torch.long, torch.half, torch.float, torch.bfloat16): if n > 2049 and dtype == torch.half: # Large n for torch.half will raise an exception, do not test here. continue + if dtype == torch.bfloat16 and device != 'cpu': + continue + if n > 256 and dtype == torch.bfloat16: + continue with torch.random.fork_rng(devices=rng_device): res1 = torch.randperm(n, dtype=dtype, device=device) res2 = torch.empty(0, dtype=dtype, device=device) @@ -3640,7 +3686,7 @@ def _run_test(self, shape, dtype, count=-1, first=0, offset=None, **kwargs): if offset is None: offset = first * get_dtype_size(dtype) - numpy_original = make_tensor(shape, torch.device("cpu"), dtype).numpy() + numpy_original = make_tensor(shape, dtype=dtype, device="cpu").numpy() original = memoryview(numpy_original) # First call PyTorch's version in case of errors. # If this call exits successfully, the NumPy version must also do so. @@ -3651,13 +3697,13 @@ def _run_test(self, shape, dtype, count=-1, first=0, offset=None, **kwargs): self.assertEqual(numpy_frombuffer.__array_interface__["data"][0], torch_frombuffer.data_ptr()) return (numpy_original, torch_frombuffer) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_same_type(self, device, dtype): self._run_test((), dtype) self._run_test((4,), dtype) self._run_test((10, 10), dtype) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_requires_grad(self, device, dtype): def _run_test_and_check_grad(requires_grad, *args, **kwargs): kwargs["requires_grad"] = requires_grad @@ -3672,14 +3718,14 @@ def _run_test_and_check_grad(requires_grad, *args, **kwargs): _run_test_and_check_grad(False, (4,), dtype) _run_test_and_check_grad(False, (10, 10), dtype) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_with_offset(self, device, dtype): # Offset should be valid whenever there is, at least, # one remaining element for i in range(SIZE): self._run_test(SHAPE, dtype, first=i) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_with_count(self, device, dtype): # Count should be valid for any valid in the interval # [-1, len(input)], except for 0 @@ -3687,7 +3733,7 @@ def test_with_count(self, device, dtype): if i != 0: self._run_test(SHAPE, dtype, count=i) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_with_count_and_offset(self, device, dtype): # Explicit default count [-1, 1, 2, ..., len] for i in range(-1, SIZE + 1): @@ -3703,7 +3749,7 @@ def test_with_count_and_offset(self, 
device, dtype): for j in range(SIZE - i + 1): self._run_test(SHAPE, dtype, count=i, first=j) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_invalid_positional_args(self, device, dtype): bytes = get_dtype_size(dtype) in_bytes = SIZE * bytes @@ -3740,9 +3786,9 @@ def test_invalid_positional_args(self, device, dtype): rf"buffer length \({in_bytes} bytes\)"): self._run_test(SHAPE, dtype, count=count, first=first) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_shared_buffer(self, device, dtype): - x = make_tensor((1,), device, dtype) + x = make_tensor((1,), dtype=dtype, device=device) # Modify the whole tensor arr, tensor = self._run_test(SHAPE, dtype) tensor[:] = x @@ -3767,15 +3813,15 @@ def test_shared_buffer(self, device, dtype): arr[first] = x.item() - 1 self.assertEqual(arr[first:last], tensor) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_not_a_buffer(self, device, dtype): with self.assertRaisesRegex(ValueError, r"object does not implement Python buffer protocol."): torch.frombuffer([1, 2, 3, 4], dtype=dtype) - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_non_writable_buffer(self, device, dtype): - numpy_arr = make_tensor((1,), device, dtype).numpy() + numpy_arr = make_tensor((1,), dtype=dtype, device=device).numpy() byte_arr = numpy_arr.tobytes() with self.assertWarnsOnceRegex(UserWarning, r"The given buffer is not writable."): @@ -3852,7 +3898,7 @@ def _check(self, original, cvt=lambda t: t, is_alias=True, same_dtype=True, same self.assertEqual(result.requires_grad, kwargs.get("requires_grad", False)) def _test_alias_with_cvt(self, cvt, device, dtype, shape=(5, 5), only_with_dtype=False): - original = make_tensor(shape, device, dtype) + original = make_tensor(shape, dtype=dtype, device=device) def check(**kwargs): self._check(original, cvt=cvt, **kwargs) @@ -3873,28 +3919,28 @@ def check(**kwargs): # data pointer (which is basically the point here), since they all # return 0. @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_alias_from_tensor(self, device, dtype): self._test_alias_with_cvt(identity, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_alias_from_numpy(self, device, dtype): self._test_alias_with_cvt(to_numpy, device, dtype) # Skipping 'meta', since 'to_dlpack' does not work for them. 
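# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above or below.
# Another recurring change is that make_tensor call sites stop passing device
# and dtype positionally and pass them as keywords instead, matching the
# documented keyword-only dtype/device parameters of torch.testing.make_tensor.
# A hypothetical example of the new call form:
import torch
from torch.testing import make_tensor

t = make_tensor((5, 5), dtype=torch.float32, device="cpu",
                low=-9, high=9, noncontiguous=True)
# The pre-patch spelling was make_tensor((5, 5), device, dtype, ...); the
# keyword form above is what this diff switches to throughout.
# ---------------------------------------------------------------------------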
@skipMeta - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_alias_from_dlpack(self, device, dtype): self._test_alias_with_cvt(to_dlpack, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_alias_from_buffer(self, device, dtype): self._test_alias_with_cvt(to_memview, device, dtype, shape=(5,), only_with_dtype=True) def _test_copy_with_cvt(self, cvt, device, dtype, shape=(5, 5), only_with_dtype=False): - original = make_tensor(shape, device, dtype) + original = make_tensor(shape, dtype=dtype, device=device) def check(**kwargs): self._check(original, cvt=cvt, is_alias=False, **kwargs) @@ -3916,35 +3962,35 @@ def check(**kwargs): # Copy is forced because of different dtype if not only_with_dtype: - for other in get_all_dtypes(): + for other in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if dtype != other: check(same_dtype=False, dtype=other) check(same_dtype=False, dtype=other, copy=True) @skipMeta - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_copy_tensor(self, device, dtype): self._test_copy_with_cvt(identity, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_copy_from_numpy(self, device, dtype): self._test_copy_with_cvt(to_numpy, device, dtype) @skipMeta - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_copy_from_dlpack(self, device, dtype): self._test_copy_with_cvt(to_dlpack, device, dtype) @onlyCPU - @dtypes(*torch_to_numpy_dtype_dict.keys()) + @dtypes(*set(numpy_to_torch_dtype_dict.values())) def test_copy_from_buffer(self, device, dtype): self._test_copy_with_cvt(to_memview, device, dtype, shape=(5,), only_with_dtype=True) def _test_copy_mult_devices(self, devices, dtype, cvt): cuda1 = devices[0] cuda2 = devices[1] - original = make_tensor((5, 5), cuda1, dtype) + original = make_tensor((5, 5), dtype=dtype, device=cuda1) def check(**kwargs): self._check(original, cvt, is_alias=False, same_device=False, device=cuda2, **kwargs) @@ -3955,19 +4001,19 @@ def check(**kwargs): @onlyCUDA @deviceCountAtLeast(2) - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_copy_from_tensor_mult_devices(self, devices, dtype): self._test_copy_mult_devices(devices, dtype, identity) @onlyCUDA @deviceCountAtLeast(2) - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_copy_from_dlpack_mult_devices(self, devices, dtype): self._test_copy_mult_devices(devices, dtype, to_dlpack) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_copy_list(self, device, dtype): - original = make_tensor((5, 5), torch.device("cpu"), dtype) + original = make_tensor((5, 5), dtype=dtype, device=torch.device("cpu")) def check(**kwargs): self._check(original, torch.Tensor.tolist, is_alias=False, **kwargs) @@ -3980,7 +4026,7 @@ def check(**kwargs): @dtypes(torch.float32) def test_unsupported_alias(self, device, dtype): - original = make_tensor((5, 5), device, dtype) + original = make_tensor((5, 5), dtype=dtype, device=device) if torch.cuda.is_available(): other_device = get_another_device(device) @@ -4001,14 +4047,14 @@ def test_unsupported_alias(self, 
device, dtype): @dtypes(torch.float32) def test_unsupported_alias_mult_devices(self, devices, dtype): dev1, dev2 = devices[:2] - original = make_tensor((5, 5), dev1, dtype) + original = make_tensor((5, 5), dtype=dtype, device=dev1) with self.assertRaisesRegex(ValueError, f"from device '{dev1}' to '{dev2}'"): torch.asarray(original, device=dev2, copy=False) @dtypes(torch.float32, torch.complex64) def test_retain_autograd_history(self, device, dtype): - original = make_tensor((5, 5), device, dtype, requires_grad=True) + original = make_tensor((5, 5), dtype=dtype, device=device, requires_grad=True) # 'cloned' has 'grad_fn=' cloned = original.clone() @@ -4046,6 +4092,8 @@ def test_astensor_consistency(self, device): [0.0, True, False, 42], # With Complex [0.0, True, False, 42, 5j], + # With Range + range(5), ] for e in examples: diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index 4300e9a71006..45bcef536e4f 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -42,7 +42,7 @@ skipIfNoMatplotlib = unittest.skipIf(not TEST_MATPLOTLIB, "no matplotlib") import torch -from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ASAN +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ASAN, TEST_WITH_CROSSREF def tensor_N(shape, dtype=float): numel = np.prod(shape) @@ -54,6 +54,8 @@ class BaseTestCase(TestCase): def setUp(self): if not TEST_TENSORBOARD: return self.skipTest("Skip the test since TensorBoard is not installed") + if TEST_WITH_CROSSREF: + return self.skipTest("Don't run TensorBoard tests with crossref") self.temp_dirs = [] def createSummaryWriter(self): @@ -562,15 +564,15 @@ def forward(self, x): expected_proto = GraphDef() text_format.Parse(expected_str, expected_proto) - self.assertEquals(len(expected_proto.node), len(actual_proto.node)) + self.assertEqual(len(expected_proto.node), len(actual_proto.node)) for i in range(len(expected_proto.node)): expected_node = expected_proto.node[i] actual_node = actual_proto.node[i] - self.assertEquals(expected_node.name, actual_node.name) - self.assertEquals(expected_node.op, actual_node.op) - self.assertEquals(expected_node.input, actual_node.input) - self.assertEquals(expected_node.device, actual_node.device) - self.assertEquals( + self.assertEqual(expected_node.name, actual_node.name) + self.assertEqual(expected_node.op, actual_node.op) + self.assertEqual(expected_node.input, actual_node.input) + self.assertEqual(expected_node.device, actual_node.device) + self.assertEqual( sorted(expected_node.attr.keys()), sorted(actual_node.attr.keys())) def test_nested_nn_squential(self): diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 42ca49dc3475..8a5e918eda4b 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -13,11 +13,13 @@ class BaseTestClass(JitTestCase): def setUp(self): + super(BaseTestClass, self).setUp() self.tensorexpr_options = TensorExprTestOptions() self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] def tearDown(self): self.tensorexpr_options.restore() + super(BaseTestClass, self).tearDown() def assertLastGraphAllFused(self): self.assertAllFused(torch.jit.last_executed_optimized_graph()) diff --git a/test/test_tensorexpr_pybind.py b/test/test_tensorexpr_pybind.py index 00f97399edd7..486858d310a3 100644 --- a/test/test_tensorexpr_pybind.py +++ b/test/test_tensorexpr_pybind.py @@ -348,20 +348,14 @@ def f(a): """ graph = torch._C.parse_ir(graph_str) - def my_custom_lowering(inputs, out_shape, 
out_type, device): - def get_dim_args(dims): - dim_args = [] - for dim in dims: - dim_args.append(te.DimArg(dim, "i" + str(len(dim_args)))) - return dim_args - + def my_custom_lowering(inputs, out_shape, out_stride, out_type, device): def compute(idxs): load = inputs[0].as_buf().load(idxs) return te.ifThenElse( te.ExprHandle.isnan(load), te.ExprHandle.float(0.0), load ) - return te.Compute2("custom_nan_to_num", get_dim_args(out_shape), compute) + return te.Compute2("custom_nan_to_num", out_shape, compute) kernel = te.TensorExprKernel(graph, {"aten::nan_to_num": my_custom_lowering}) res1 = kernel.run((x,)) diff --git a/test/test_testing.py b/test/test_testing.py index 3cfef8cee395..25f53e5e91ae 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -15,21 +15,20 @@ from torch.testing import make_tensor from torch.testing._internal.common_utils import \ (IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest, - parametrize, subtest, instantiate_parametrized_tests, dtype_name) + parametrize, subtest, instantiate_parametrized_tests, dtype_name, TEST_WITH_ROCM) from torch.testing._internal.common_device_type import \ (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes, get_device_type_test_bases, instantiate_device_type_tests, onlyCUDA, onlyNativeDeviceTypes, deviceCountAtLeast, ops, expectedFailureMeta) from torch.testing._internal.common_methods_invocations import op_db import torch.testing._internal.opinfo_helper as opinfo_helper -from torch.testing._internal.common_dtype import get_all_dtypes +from torch.testing._internal.common_dtype import all_types_and_complex_and from torch.testing._internal.common_modules import modules, module_db # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): # Ensure that assertEqual handles numpy arrays properly - @dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, - include_bool=True, include_complex=True))) + @dtypes(*all_types_and_complex_and(torch.bool, torch.half)) def test_assertEqual_numpy(self, device, dtype): S = 10 test_sizes = [ @@ -40,7 +39,7 @@ def test_assertEqual_numpy(self, device, dtype): (0, S), (S, 0)] for test_size in test_sizes: - a = make_tensor(test_size, device, dtype, low=-5, high=5) + a = make_tensor(test_size, dtype=dtype, device=device, low=-5, high=5) a_n = a.cpu().numpy() msg = f'size: {test_size}' self.assertEqual(a_n, a, rtol=0, atol=0, msg=msg) @@ -255,7 +254,7 @@ def test_make_tensor(self, device, dtype): def check(size, low, high, requires_grad, noncontiguous): if dtype not in [torch.float, torch.cfloat]: requires_grad = False - t = make_tensor(size, device, dtype, low=low, high=high, + t = make_tensor(size, dtype=dtype, device=device, low=low, high=high, requires_grad=requires_grad, noncontiguous=noncontiguous) self.assertEqual(t.shape, size) @@ -279,10 +278,16 @@ def check(size, low, high, requires_grad, noncontiguous): check(size, None, None, False, False) check(size, 2, 4, True, True) + def test_make_tensor_complex32(self, device): + # verify that we can generate torch.complex32 tensor + t = make_tensor((1, 2, 3), dtype=torch.complex32, device=device) + self.assertEqual(t.dtype, torch.complex32) + # The following tests (test_cuda_assert_*) are added to ensure test suite terminates early # when CUDA assert was thrown. Because all subsequent test will fail if that happens. # These tests are slow because it spawn another process to run test suite. 
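# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above or below.
# Several suites in this diff (the signal window functions, test_broadcast)
# move from looping over cases inside a single test to the parametrize
# decorator from torch.testing._internal.common_utils, which generates one
# test per parameter value. A hypothetical, minimal standalone example:
from torch.testing._internal.common_utils import (
    TestCase, parametrize, instantiate_parametrized_tests, run_tests)

class ExampleWindowNames(TestCase):
    @parametrize("window", ["hann", "hamming", "bartlett", "blackman"])
    def test_window_name_is_lowercase(self, window):
        # parametrize generates a separate test for each window value
        self.assertEqual(window, window.lower())

instantiate_parametrized_tests(ExampleWindowNames)

if __name__ == "__main__":
    run_tests()
# ---------------------------------------------------------------------------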
# See: https://github.com/pytorch/pytorch/issues/49019 + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @onlyCUDA @slowTest def test_cuda_assert_should_stop_common_utils_test_suite(self, device): @@ -316,6 +321,7 @@ def test_trivial_passing_test_case_on_cpu_cuda(self): self.assertIn('errors=1', stderr) + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @onlyCUDA @slowTest def test_cuda_assert_should_stop_common_device_type_test_suite(self, device): @@ -356,6 +362,7 @@ def test_trivial_passing_test_case_on_cpu_cuda(self, device): self.assertIn('errors=1', stderr) + @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") @onlyCUDA @slowTest def test_cuda_assert_should_not_stop_common_distributed_test_suite(self, device): @@ -403,10 +410,10 @@ def test_get_supported_dtypes(self, device): ops_to_test = list(filter(lambda op: op.name in ['atan2', 'topk', 'xlogy'], op_db)) for op in ops_to_test: - dynamic_dtypes = opinfo_helper.get_supported_dtypes(op.op, op.sample_inputs_func, self.device_type) + dynamic_dtypes = opinfo_helper.get_supported_dtypes(op, op.sample_inputs_func, self.device_type) dynamic_dispatch = opinfo_helper.dtypes_dispatch_hint(dynamic_dtypes) if self.device_type == 'cpu': - dtypes = op.dtypesIfCPU + dtypes = op.dtypes else: # device_type ='cuda' dtypes = op.dtypesIfCUDA @@ -574,11 +581,10 @@ def test_unknown_layout(self): def test_meta(self): actual = torch.empty((2, 2), device="meta") - expected = actual.clone() + expected = torch.empty((2, 2), device="meta") for fn in assert_close_with_inputs(actual, expected): - with self.assertRaisesRegex(NotImplementedError, "meta"): - fn() + fn() def test_mismatching_layout(self): strided = torch.empty((2, 2)) @@ -1085,10 +1091,7 @@ def test_matching(self): col_indices = (1, 0) values = (1, 2) actual = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(2, 2)) - # TODO: replace this by actual.clone() after https://github.com/pytorch/pytorch/issues/59285 is fixed - expected = torch.sparse_csr_tensor( - actual.crow_indices(), actual.col_indices(), actual.values(), size=actual.size(), device=actual.device - ) + expected = actual.clone() for fn in assert_close_with_inputs(actual, expected): fn() @@ -1139,6 +1142,180 @@ def test_mismatching_values_msg(self): fn() +@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Not all sandcastle jobs support CSC testing") +class TestAssertCloseSparseCSC(TestCase): + def test_matching(self): + ccol_indices = (0, 1, 2) + row_indices = (1, 0) + values = (1, 2) + actual = torch.sparse_csc_tensor(ccol_indices, row_indices, values, size=(2, 2)) + expected = actual.clone() + + for fn in assert_close_with_inputs(actual, expected): + fn() + + def test_mismatching_ccol_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = (1, 2) + actual = torch.sparse_csc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = (0, 2, 2) + expected_row_indices = actual_row_indices + expected_values = actual_values + expected = torch.sparse_csc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse CSC ccol_indices")): + fn() + + def test_mismatching_row_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = (1, 2) + actual = 
torch.sparse_csc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = (1, 1) + expected_values = actual_values + expected = torch.sparse_csc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse CSC row_indices")): + fn() + + def test_mismatching_values_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = (1, 2) + actual = torch.sparse_csc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = actual_row_indices + expected_values = (1, 3) + expected = torch.sparse_csc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse CSC values")): + fn() + + +@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Not all sandcastle jobs support BSR testing") +class TestAssertCloseSparseBSR(TestCase): + def test_matching(self): + crow_indices = (0, 1, 2) + col_indices = (1, 0) + values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(2, 2)) + expected = actual.clone() + + for fn in assert_close_with_inputs(actual, expected): + fn() + + def test_mismatching_crow_indices_msg(self): + actual_crow_indices = (0, 1, 2) + actual_col_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(actual_crow_indices, actual_col_indices, actual_values, size=(2, 2)) + + expected_crow_indices = (0, 2, 2) + expected_col_indices = actual_col_indices + expected_values = actual_values + expected = torch.sparse_bsr_tensor(expected_crow_indices, expected_col_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSR crow_indices")): + fn() + + def test_mismatching_col_indices_msg(self): + actual_crow_indices = (0, 1, 2) + actual_col_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(actual_crow_indices, actual_col_indices, actual_values, size=(2, 2)) + + expected_crow_indices = actual_crow_indices + expected_col_indices = (1, 1) + expected_values = actual_values + expected = torch.sparse_bsr_tensor(expected_crow_indices, expected_col_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSR col_indices")): + fn() + + def test_mismatching_values_msg(self): + actual_crow_indices = (0, 1, 2) + actual_col_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsr_tensor(actual_crow_indices, actual_col_indices, actual_values, size=(2, 2)) + + expected_crow_indices = actual_crow_indices + expected_col_indices = actual_col_indices + expected_values = ([[1]], [[3]]) + expected = torch.sparse_bsr_tensor(expected_crow_indices, expected_col_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSR values")): + fn() + + +@unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Not all sandcastle jobs support BSC testing") +class TestAssertCloseSparseBSC(TestCase): + def test_matching(self): + 
ccol_indices = (0, 1, 2) + row_indices = (1, 0) + values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(ccol_indices, row_indices, values, size=(2, 2)) + expected = actual.clone() + + for fn in assert_close_with_inputs(actual, expected): + fn() + + def test_mismatching_ccol_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = (0, 2, 2) + expected_row_indices = actual_row_indices + expected_values = actual_values + expected = torch.sparse_bsc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSC ccol_indices")): + fn() + + def test_mismatching_row_indices_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = (1, 1) + expected_values = actual_values + expected = torch.sparse_bsc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSC row_indices")): + fn() + + def test_mismatching_values_msg(self): + actual_ccol_indices = (0, 1, 2) + actual_row_indices = (1, 0) + actual_values = ([[1]], [[2]]) + actual = torch.sparse_bsc_tensor(actual_ccol_indices, actual_row_indices, actual_values, size=(2, 2)) + + expected_ccol_indices = actual_ccol_indices + expected_row_indices = actual_row_indices + expected_values = ([[1]], [[3]]) + expected = torch.sparse_bsc_tensor(expected_ccol_indices, expected_row_indices, expected_values, size=(2, 2)) + + for fn in assert_close_with_inputs(actual, expected): + with self.assertRaisesRegex(AssertionError, re.escape("Sparse BSC values")): + fn() + + class TestAssertCloseQuantized(TestCase): def test_mismatching_is_quantized(self): actual = torch.tensor(1.0) @@ -1463,7 +1640,7 @@ def test_op_parametrized(self, device, dtype, op, flag): device_cls = locals()['TestParametrized{}'.format(device.upper())] expected_test_names = [] for op in op_db: - for dtype in op.default_test_dtypes(device): + for dtype in op.supported_dtypes(torch.device(device).type): for flag_part in ('flag_disabled', 'flag_enabled'): expected_name = '{}.test_op_parametrized_{}_{}_{}_{}'.format( device_cls.__name__, op.formatted_name, flag_part, device, dtype_name(dtype)) diff --git a/test/test_torch.py b/test/test_torch.py index 164e6585f164..3e2bc5b03a3b 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -34,16 +34,16 @@ TestCase, TEST_WITH_ROCM, run_tests, IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN, IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, slowTest, - skipCUDAMemoryLeakCheckIf, BytesIOContext, noarchTest, + TEST_WITH_CROSSREF, + skipCUDAMemoryLeakCheckIf, BytesIOContext, skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName, wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard, - skipIfNotRegistered, bytes_to_scalar) + skipIfNotRegistered, bytes_to_scalar, parametrize, skipIfMps) from multiprocessing.reduction import ForkingPickler from torch.testing._internal.common_device_type import ( 
expectedFailureMeta, expectedFailureXLA, instantiate_device_type_tests, - skipCUDAVersionIn, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, skipMeta, @@ -52,9 +52,11 @@ from typing import Tuple import torch.backends.quantized import torch.testing._internal.data -from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32 +from torch.testing._internal.common_cuda import ( + tf32_on_and_off, tf32_is_not_fp32, TEST_CUDNN) from torch.testing._internal.common_dtype import ( - get_all_fp_dtypes, get_all_int_dtypes, get_all_math_dtypes, get_all_dtypes, get_all_complex_dtypes + floating_types_and, get_all_math_dtypes, all_types_and_complex_and, complex_types, + all_types_and, floating_types, floating_and_complex_types, integral_types, ) # Protects against includes accidentally setting the default dtype @@ -116,19 +118,6 @@ def test_cuda_vitals_gpu_only(self, device): class TestTorchDeviceType(TestCase): exact_dtype = True - # FIXME: Port this to ErrorInputs on where - @onlyCUDA - @dtypes(torch.float32) - def test_where_invalid_device(self, device, dtype): - for devices in [('cpu', device, device), (device, 'cpu', 'cpu'), - (device, 'cpu', device), ('cpu', device, 'cpu')]: - condition = make_tensor(16, device=devices[0], dtype=torch.float32) - x = make_tensor(16, device=devices[1], dtype=torch.float32) - y = make_tensor(16, device=devices[2], dtype=torch.float32) - with self.assertRaisesRegex(RuntimeError, - "Expected condition, x and y to be on the same device"): - torch.where(condition, x, y) - # TODO: move all tensor creation to common ops def _rand_shape(self, dim, min_size, max_size): shape = [] @@ -174,7 +163,7 @@ def rand_byte(): torch.bool, torch.float32, torch.complex64, torch.float64, torch.complex128) def test_storage(self, device, dtype): - v = make_tensor((3, 5), device, dtype, low=-9, high=9) + v = make_tensor((3, 5), dtype=dtype, device=device, low=-9, high=9) self.assertEqual(v.storage()[0], v[0][0]) self.assertEqual(v.storage()[14], v[2][4]) v_s = v.storage() @@ -233,16 +222,26 @@ def test_storage_setitem(self, device, dtype): self.assertEqual(s, storage_type(l)) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + def test_tensor_storage_type(self, device, dtype): + a = make_tensor((10,), dtype=dtype, device=device, low=-9, high=9) + + module = torch.cuda if (torch.device(device).type == 'cuda') else torch + expected_storage_type = getattr(module, torch.storage._dtype_to_storage_type_map()[dtype]) + + self.assertEqual(a.storage_type(), expected_storage_type) + + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_tensor_from_storage(self, device, dtype): - a = make_tensor((4, 5, 3), device, dtype, low=-9, high=9) + a = make_tensor((4, 5, 3), dtype=dtype, device=device, low=-9, high=9) a_s = a.storage() b = torch.tensor(a_s, device=device, dtype=dtype).reshape(a.size()) self.assertEqual(a, b) c = torch.tensor(a_s._untyped(), device=device, dtype=dtype).reshape(a.size()) self.assertEqual(a, c) - for error_dtype in get_all_dtypes(): + for error_dtype in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if error_dtype == dtype: continue with self.assertRaisesRegex(RuntimeError, r'Expected a Storage of type'): @@ -250,16 +249,16 @@ def test_tensor_from_storage(self, device, dtype): torch.tensor(error_storage, device=device, dtype=dtype) @onlyNativeDeviceTypes - 
@dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_set_storage(self, device, dtype): - a = make_tensor((4, 5, 3), device, dtype, low=-9, high=9) + a = make_tensor((4, 5, 3), dtype=dtype, device=device, low=-9, high=9) a_s = a.storage() b = torch.tensor([], device=device, dtype=dtype).set_(a_s).reshape(a.size()) self.assertEqual(a, b) c = torch.tensor([], device=device, dtype=dtype).set_(a_s._untyped()).reshape(a.size()) self.assertEqual(a, c) - for error_dtype in get_all_dtypes(): + for error_dtype in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if error_dtype == dtype: continue with self.assertRaisesRegex(RuntimeError, r'Expected a Storage of type'): @@ -460,26 +459,12 @@ def test_scalar_check(self, device): self.assertEqual((), torch.cummax(zero_d, 0)[0].shape) self.assertEqual((), torch.cummin(zero_d, 0)[0].shape) - # renorm - self.assertRaises(RuntimeError, lambda: torch.renorm(zero_d, 0.5, 0, 1.0)) - # sort, topk self.assertEqual([(), ()], [x.shape for x in torch.sort(zero_d, 0, False)]) self.assertEqual([(), ()], [x.shape for x in torch.sort(zero_d, 0, True)]) self.assertEqual([(), ()], [x.shape for x in torch.topk(zero_d, 1, 0, False)]) self.assertEqual([(), ()], [x.shape for x in torch.topk(zero_d, 1, 0, True)]) - # lstsq (gels) - self.assertRaises(RuntimeError, lambda: torch.lstsq(zero_d, zero_d)) - - # eig - self.assertRaises(RuntimeError, lambda: torch.eig(zero_d, False)) - self.assertRaises(RuntimeError, lambda: torch.eig(zero_d, True)) - - # this is only implemented on cpu - if (torch.device(device).type == 'cpu'): - self.assertRaises(RuntimeError, lambda: torch.ormqr(zero_d, zero_d, zero_d)) - # max, min self.assertEqual((), torch.max(zero_d, zero_d).shape) self.assertEqual((1,), torch.max(one_d, zero_d).shape) @@ -488,9 +473,6 @@ def test_scalar_check(self, device): self.assertEqual((1,), torch.min(one_d, zero_d).shape) self.assertEqual((1,), torch.min(zero_d, one_d).shape) - # diag - self.assertRaises(RuntimeError, lambda: torch.diag(zero_d)) - zero_d_int = torch.tensor(1, device=device) one_d_int = torch.tensor([1], device=device) @@ -647,6 +629,7 @@ def test_scalar_check(self, device): self.assertEqual((), torch.nn.functional.multi_margin_loss(input, target, reduction='sum').shape) # Uses mismatched arange out size to trigger a warning + @unittest.skipIf(TEST_WITH_CROSSREF, "crossref perturbs line numbering") def test_cpp_warnings_have_python_context(self, device): # Creates long string in advance to avoid a too-long Python line s = ".+Triggered internally at.+RangeFactories.+" @@ -793,158 +776,159 @@ def test_is_set_to(self, device): self.assertFalse(t1.is_set_to(t2)) self.assertFalse(t2.is_set_to(t1)) - def test_broadcast(self, device): - - # all functions - fns = { - "dist", "atan2", "pow", "lerp", "add", - "sub", "mul", "div", "fmod", "remainder", - "eq", "ge", "gt", "le", "lt", "max", "min", "ne", - "addcdiv", "addcmul", "masked_scatter", "masked_select", "masked_fill", - "map", "map2", "copy" - } + # See https://github.com/pytorch/pytorch/issues/72650 + @skipIfMps + @skipMeta + @parametrize( + "fn", + [ + "dist", "atan2", "pow", "lerp", "add", "sub", "mul", "div", "fmod", "remainder", "eq", "ge", "gt", "le", + "lt", "max", "min", "ne", "addcdiv", "addcmul", "masked_scatter", "masked_select", "masked_fill", "map", + "map2", "copy", + ], + ) + def test_broadcast(self, fn, device): # functions with three tensor arguments fns_3_args = {"map2"} fns_value_kwarg = {"addcdiv", "addcmul"} - 
for fn in fns: - (dims_small, dims_large, dims_full) = self._select_broadcastable_dims() - full1d = torch.randn(*dims_full, device=device).flatten().float() - small = torch.randn(*dims_small, device=device).float() - large = torch.randn(*dims_large, device=device).float() - small_expanded = small.expand(*dims_full) - large_expanded = large.expand(*dims_full) - small2 = None - small2_expanded = None - if fn in fns_3_args or fn in fns_value_kwarg: - # create another smaller tensor - (dims_small2, _, _) = self._select_broadcastable_dims(dims_full) - small2 = torch.randn(*dims_small2, device=device).float() - small2_expanded = small2.expand(*dims_full) - - if small.is_cuda and fn in ['map', 'map2']: - # map and map2 are not implementd on CUDA tensors - continue - - if hasattr(large_expanded, fn): - # run through tensor versions of functions - # and verify fully expanded inputs give same results - expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - - def tensorfn(myfn, t1, t2): - if fn == "lerp": - return myfn(t1, 0.5) - elif fn == "masked_select": - return myfn(t1 < 0) - elif fn == "masked_scatter": - return myfn(t1 < 0.5, full1d) - elif fn == "masked_fill": - return myfn(t1 < 0.5, 1.0) - elif fn in fns_3_args: - return myfn(1, t1, t2) - elif fn in fns_value_kwarg: - return myfn(t1, t2, value=1) - else: - return myfn(t1) - - # test various orders - for first, second, third in [(large, small, small2), (small, large, small2), - (small2, small, large), (small2, large, small)]: - if first is None: - break # ignore last iter when small2 is None - method_expanded = getattr(expanded[first], fn) - method = getattr(first, fn) - r1 = tensorfn(method_expanded, expanded[second], expanded[third]) - r2 = tensorfn(method, second, third) - self.assertEqual(r1, r2) - - # now for torch. 
versions of functions - if hasattr(torch, fn): - fntorch = getattr(torch, fn) - expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - - def torchfn(t1, t2, t3): - if fn == "lerp": - return fntorch(t1, t2, 0.5) - elif fn == "masked_select": - return fntorch(t1, t2 < 0) - elif fn == "masked_scatter": - return fntorch(t1, t2 < 0.5, full1d) - elif fn == "masked_fill": - return fntorch(t1, t2 < 0.5, 1.0) - elif fn in fns_3_args: - return fntorch(t1, 1.0, t2, t3) - elif fn in fns_value_kwarg: - return fntorch(t1, t2, t3, value=1.0) - else: - return fntorch(t1, t2) - - # test various orders - for first, second, third in [(large, small, small2), (small, large, small2), - (small2, small, large), (small2, large, small)]: - if first is None: - break # ignore last iter when small2 is None - r1 = torchfn(expanded[first], expanded[second], expanded[third]) - r2 = torchfn(first, second, third) - self.assertEqual(r1, r2) - - # now for in place functions - # in-place tensor is not broadcastable; test only guaranteed - # to work by broadcasting other argument(s) - if not hasattr(large_expanded, fn + "_"): - continue + (dims_small, dims_large, dims_full) = self._select_broadcastable_dims() + full1d = torch.randn(*dims_full, device=device).flatten().float() + small = torch.randn(*dims_small, device=device).float() + large = torch.randn(*dims_large, device=device).float() + small_expanded = small.expand(*dims_full) + large_expanded = large.expand(*dims_full) + small2 = None + small2_expanded = None + if fn in fns_3_args or fn in fns_value_kwarg: + # create another smaller tensor + (dims_small2, _, _) = self._select_broadcastable_dims(dims_full) + small2 = torch.randn(*dims_small2, device=device).float() + small2_expanded = small2.expand(*dims_full) + + if small.is_cuda and fn in ['map', 'map2']: + # map and map2 are not implementd on CUDA tensors + return - # need to clone largeExpanded so we can reuse, since functions are in-place - large_expanded_clone = large_expanded.clone() + if hasattr(large_expanded, fn): + # run through tensor versions of functions + # and verify fully expanded inputs give same results + expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - def tensorfn_inplace(t0, t1, t2=None): - t0_fn = getattr(t0, fn + "_") + def tensorfn(myfn, t1, t2): if fn == "lerp": - return t0_fn(t1, 0.5) + return myfn(t1, 0.5) + elif fn == "masked_select": + return myfn(t1 < 0) elif fn == "masked_scatter": - return t0_fn(t1 < 0.5, full1d) + return myfn(t1 < 0.5, full1d) elif fn == "masked_fill": - return t0_fn(t1 < 0.5, 1.0) - elif fn == "map": - return t0_fn(t1, lambda x, y: x + y) - elif fn == "map2": - return t0_fn(t1, t2, lambda x, y, z: x + y + z) + return myfn(t1 < 0.5, 1.0) elif fn in fns_3_args: - return t0_fn(1.0, t1, t2) + return myfn(1, t1, t2) elif fn in fns_value_kwarg: - return t0_fn(t1, t2, value=1.0) + return myfn(t1, t2, value=1) else: - return t0_fn(t1) - # in-place pointwise operations don't actually work if the in-place - # tensor is 0-strided (numpy has the same issue) - if (0 not in large_expanded.stride() and 0 not in large_expanded_clone.stride()): - r1 = tensorfn_inplace(large_expanded, small_expanded, small2_expanded) - r2 = tensorfn_inplace(large_expanded_clone, small, small2) + return myfn(t1) + + # test various orders + for first, second, third in [(large, small, small2), (small, large, small2), + (small2, small, large), (small2, large, small)]: + if first is None: + break # ignore last iter when small2 is None + 
method_expanded = getattr(expanded[first], fn) + method = getattr(first, fn) + r1 = tensorfn(method_expanded, expanded[second], expanded[third]) + r2 = tensorfn(method, second, third) self.assertEqual(r1, r2) - def broadcastable(t0, t1, t2=None): - try: - t1.expand_as(t0) - if t2 is not None: - t2.expand_as(t0) - except RuntimeError: - return False - return True - - def _test_in_place_broadcastable(t0, t1, t2=None): - if not broadcastable(t0, t1, t2): - same_size = t0.numel() == t1.numel() and (t0.numel() == t2.numel() if t2 is not None else True) - if not same_size: - self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2)) + # now for torch. versions of functions + if hasattr(torch, fn): + fntorch = getattr(torch, fn) + expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} + + def torchfn(t1, t2, t3): + if fn == "lerp": + return fntorch(t1, t2, 0.5) + elif fn == "masked_select": + return fntorch(t1, t2 < 0) + elif fn == "masked_scatter": + return fntorch(t1, t2 < 0.5, full1d) + elif fn == "masked_fill": + return fntorch(t1, t2 < 0.5, 1.0) + elif fn in fns_3_args: + return fntorch(t1, 1.0, t2, t3) + elif fn in fns_value_kwarg: + return fntorch(t1, t2, t3, value=1.0) else: - tensorfn_inplace(t0, t1, t2) + return fntorch(t1, t2) + + # test various orders + for first, second, third in [(large, small, small2), (small, large, small2), + (small2, small, large), (small2, large, small)]: + if first is None: + break # ignore last iter when small2 is None + r1 = torchfn(expanded[first], expanded[second], expanded[third]) + r2 = torchfn(first, second, third) + self.assertEqual(r1, r2) + + # now for in place functions + # in-place tensor is not broadcastable; test only guaranteed + # to work by broadcasting other argument(s) + if not hasattr(large_expanded, fn + "_"): + return + + # need to clone largeExpanded so we can reuse, since functions are in-place + large_expanded_clone = large_expanded.clone() + + def tensorfn_inplace(t0, t1, t2=None): + t0_fn = getattr(t0, fn + "_") + if fn == "lerp": + return t0_fn(t1, 0.5) + elif fn == "masked_scatter": + return t0_fn(t1 < 0.5, full1d) + elif fn == "masked_fill": + return t0_fn(t1 < 0.5, 1.0) + elif fn == "map": + return t0_fn(t1, lambda x, y: x + y) + elif fn == "map2": + return t0_fn(t1, t2, lambda x, y, z: x + y + z) + elif fn in fns_3_args: + return t0_fn(1.0, t1, t2) + elif fn in fns_value_kwarg: + return t0_fn(t1, t2, value=1.0) + else: + return t0_fn(t1) + # in-place pointwise operations don't actually work if the in-place + # tensor is 0-strided (numpy has the same issue) + if (0 not in large_expanded.stride() and 0 not in large_expanded_clone.stride()): + r1 = tensorfn_inplace(large_expanded, small_expanded, small2_expanded) + r2 = tensorfn_inplace(large_expanded_clone, small, small2) + self.assertEqual(r1, r2) + + def broadcastable(t0, t1, t2=None): + try: + t1.expand_as(t0) + if t2 is not None: + t2.expand_as(t0) + except RuntimeError: + return False + return True - if fn not in fns_3_args and fn not in fns_value_kwarg: - _test_in_place_broadcastable(small, large_expanded) - _test_in_place_broadcastable(small, large) + def _test_in_place_broadcastable(t0, t1, t2=None): + if not broadcastable(t0, t1, t2): + same_size = t0.numel() == t1.numel() and (t0.numel() == t2.numel() if t2 is not None else True) + if not same_size: + self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2)) else: - _test_in_place_broadcastable(small2, small_expanded, large_expanded) - 
_test_in_place_broadcastable(small2, small, large) + tensorfn_inplace(t0, t1, t2) + + if fn not in fns_3_args and fn not in fns_value_kwarg: + _test_in_place_broadcastable(small, large_expanded) + _test_in_place_broadcastable(small, large) + else: + _test_in_place_broadcastable(small2, small_expanded, large_expanded) + _test_in_place_broadcastable(small2, small, large) @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "cublas runtime error") @onlyCUDA @@ -1019,6 +1003,7 @@ def test_case_info(fn_name, config): # FIXME: update OpInfos to support "nondeterministic samples" and port these tests # to that architecture + @skipIfMps def test_nondeterministic_alert_AvgPool3d(self, device): module = torch.nn.AvgPool3d(3) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1031,6 +1016,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_AdaptiveAvgPool2d(self, device): module = torch.nn.AdaptiveAvgPool2d(3) input = torch.randn(2, 3, 3, requires_grad=True, device=device) @@ -1043,6 +1029,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_AdaptiveAvgPool3d(self, device): module = torch.nn.AdaptiveAvgPool3d(3) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1055,6 +1042,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_MaxPool3d(self, device): module = torch.nn.MaxPool3d(3) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1067,6 +1055,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_AdaptiveMaxPool2d(self, device): module = torch.nn.AdaptiveMaxPool2d(3) input = torch.randn(2, 3, 3, requires_grad=True, device=device) @@ -1079,6 +1068,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_FractionalMaxPool2d(self, device): module = torch.nn.FractionalMaxPool2d(2, output_ratio=0.5) input = torch.randn(2, 3, 3, 3, requires_grad=True, device=device) @@ -1091,6 +1081,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_FractionalMaxPool3d(self, device): module = torch.nn.FractionalMaxPool3d(2, output_ratio=0.5) input = torch.randn(2, 3, 3, 3, 3, requires_grad=True, device=device) @@ -1103,6 +1094,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_interpolate_linear(self, device): input = torch.randn(1, 2, 4, device=device, requires_grad=True) res = torch.nn.functional.interpolate( @@ -1133,6 +1125,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_interpolate_bicubic(self, device): input = torch.randn(1, 2, 4, 4, device=device, requires_grad=True) res = torch.nn.functional.interpolate( @@ -1148,6 +1141,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_interpolate_trilinear(self, device): input = torch.randn(1, 2, 4, 4, 4, device=device, requires_grad=True) res = torch.nn.functional.interpolate( @@ -1163,6 +1157,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_ReflectionPad1d(self, device): module = torch.nn.ReflectionPad1d((1, 2)) input = torch.randn(2, 3, 8, device=device, requires_grad=True) @@ -1187,6 +1182,7 @@ def backward_func(slf, device): backward_func(self, 
device) + @skipIfMps def test_nondeterministic_alert_ReflectionPad3d(self, device): module = torch.nn.ReflectionPad3d((1, 2, 3, 4, 5, 6)) input = torch.randn(2, 3, 8, 8, 8, device=device, requires_grad=True) @@ -1199,6 +1195,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_ReplicationPad1d(self, device): module = torch.nn.ReplicationPad1d((1, 2)) input = torch.randn(2, 3, 4, device=device, requires_grad=True) @@ -1223,6 +1220,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_ReplicationPad3d(self, device): module = torch.nn.ReplicationPad3d((1, 2, 3, 4, 5, 6)) input = torch.randn(2, 3, 4, 4, 4, device=device, requires_grad=True) @@ -1324,6 +1322,7 @@ def forward_func(slf, device): test_func(torch.Tensor.put) test_func(torch.Tensor.put_) + @skipIfMps def test_nondeterministic_alert_histc(self, device): def test_func(op_call): a = torch.tensor([], device=device) @@ -1337,6 +1336,7 @@ def forward_func(slf, device): test_func(torch.histc) test_func(torch.Tensor.histc) + @skipIfMps def test_nondeterministic_alert_bincount(self, device): def test_func(op_call): a = torch.tensor([], device=device, dtype=torch.long) @@ -1391,6 +1391,7 @@ def backward_func(slf, device): test_func(torch.gather) test_func(torch.Tensor.gather) + @skipIfMps def test_nondeterministic_alert_grid_sample_2d(self, device): input = torch.empty(1, 1, 2, 2, device=device, requires_grad=True) grid = torch.empty(1, 1, 1, 2, device=device) @@ -1403,6 +1404,7 @@ def backward_func(slf, device): backward_func(self, device) + @skipIfMps def test_nondeterministic_alert_grid_sample_3d(self, device): input = torch.empty(1, 1, 2, 2, 2, device=device, requires_grad=True) grid = torch.empty(1, 1, 1, 2, 3, device=device) @@ -1415,15 +1417,55 @@ def backward_func(slf, device): backward_func(self, device) - def test_embedding_scalar_weight_error(self, device): - indices = torch.rand(2, 2, device=device).long() - weights = [ - torch.tensor(1.0, device=device), - torch.tensor(1.0, device=device).reshape(1, 1, 1), - ] - for weight in weights: - with self.assertRaisesRegex(RuntimeError, "'weight' must be 2-D"): - torch.embedding(weight, indices) + def test_invalid_shapes_grid_sampler(self, device): + make_arg = partial( + make_tensor, device=device, dtype=torch.float64, requires_grad=True) + + inputs = ( + # input, grid + ((5, 5, 5, 5, 5,), (1, 1, 1, 4, 4,)), # 3d + ((5, 5, 5, 5,), (1, 1, 4, 4,)), # 2d + ) + + interpolation_mode = 0 + padding_mode = 0 + align_corners = True + + err = "expected grid and input to have same batch size" + + for input, grid in inputs: + input = make_arg(input) + grid = make_arg(grid, low=-1, high=1) + + # Wrapper for the 2d, 3d, and cuDNN functions listed below. + with self.assertRaisesRegex(RuntimeError, err): + torch.grid_sampler( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 2d input. + with self.assertRaisesRegex(RuntimeError, err): + torch.grid_sampler_2d( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 3d input. + with self.assertRaisesRegex(RuntimeError, err): + torch.grid_sampler_3d( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 2d input. + with self.assertRaisesRegex(RuntimeError, err): + torch._grid_sampler_2d_cpu_fallback( + input, grid, interpolation_mode, padding_mode, + align_corners) + + # Expects 2d input, on CUDA. + # Doesn't work on CPU and ROCm. 
+ if device != 'cpu' and TEST_CUDNN and not TEST_WITH_ROCM: + with self.assertRaisesRegex(RuntimeError, err): + torch.cudnn_grid_sampler(input, grid) def test_dist(self, device): def run_test(x, y): @@ -1592,18 +1634,21 @@ def _cond_fn(x): _sync_raises_helper(f, level) - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps def test_log_normal(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).log_normal_() self.assertEqual(a.dtype, dtype) self.assertEqual(a.size(), torch.Size([1])) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) + @skipIfMps def test_geometric(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).geometric_(0.5) self.assertEqual(a.dtype, dtype) self.assertEqual(a.size(), torch.Size([1])) + @skipIfMps def test_repeat_interleave(self, device): y = torch.tensor([[1, 2], [3, 4]], device=device) # exercise single argument function signature @@ -1630,9 +1675,9 @@ def test_repeat_interleave(self, device): self.assertEqual(a_with_output.dtype, y.dtype) self.assertEqual(a_with_output.size(), torch.Size([3, 2])) - @dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) - @dtypesIfCPU(*(get_all_fp_dtypes(include_half=False, include_bfloat16=True))) - @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*floating_types()) + @dtypesIfCPU(*floating_types_and(torch.bfloat16)) + @dtypesIfCUDA(*floating_types_and(torch.half)) def test_bernoulli_p(self, device, dtype): for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]): x = torch.tensor(trivial_p, dtype=dtype, device=device) @@ -1652,9 +1697,9 @@ def isBinary(t): self.assertTrue(isBinary(p)) # RngUniform not implemented for Integral type in XLA test - @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) - @dtypesIfCPU(*(get_all_dtypes(include_half=False, include_bfloat16=False, include_complex=False))) - @dtypesIfCUDA(*(get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*floating_types()) + @dtypesIfCPU(*all_types_and(torch.bool)) + @dtypesIfCUDA(*all_types_and(torch.bool, torch.half)) def test_bernoulli_self(self, device, dtype): def isBinary(t): @@ -1666,7 +1711,7 @@ def isBinary(t): t.bernoulli_(0.5) self.assertTrue(isBinary(t)) - for p_dtype in get_all_fp_dtypes(include_half=device.startswith('cuda'), include_bfloat16=False): + for p_dtype in floating_types_and(*[torch.half] if device.startswith('cuda') else []): p = torch.rand(10, dtype=p_dtype, device=device).expand(10, 10) t.fill_(2) t.bernoulli_(p) @@ -1681,8 +1726,8 @@ def isBinary(t): self.assertTrue(isBinary(t)) @slowTest - @dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) - @dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*floating_types()) + @dtypesIfCUDA(*floating_types_and(torch.half)) def test_bernoulli_edge_cases(self, device, dtype): # Need to draw a lot of samples to cover every random floating point number. 
a = torch.zeros(10000, 10000, dtype=dtype, device=device) # probability of drawing "1" is 0 @@ -1693,7 +1738,8 @@ def test_bernoulli_edge_cases(self, device, dtype): num_zeros = (torch.bernoulli(b) == 0).sum() self.assertEqual(num_zeros, 0) - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) + @skipIfMps def test_exponential(self, device, dtype): a = torch.tensor([10], dtype=dtype, device=device).exponential_(0.5) self.assertEqual(a.dtype, dtype) @@ -1720,15 +1766,15 @@ def test_exponential_no_zero(self, device, dtype): self.assertTrue(x.min() > 0) def _generate_correlation_tensors(self, device, dtype): - yield make_tensor((0, 0), device, dtype) - yield make_tensor((1, 0), device, dtype) - yield make_tensor((0, 1), device, dtype) - yield make_tensor((2,), device, dtype) - yield make_tensor((2, 1), device, dtype) - yield make_tensor((2, 2), device, dtype) - yield make_tensor((2, 3), device, dtype) - yield make_tensor((5, 10), device, dtype) - yield make_tensor((5, 10), device, dtype, noncontiguous=True) + yield make_tensor((0, 0), dtype=dtype, device=device) + yield make_tensor((1, 0), dtype=dtype, device=device) + yield make_tensor((0, 1), dtype=dtype, device=device) + yield make_tensor((2,), dtype=dtype, device=device) + yield make_tensor((2, 1), dtype=dtype, device=device) + yield make_tensor((2, 2), dtype=dtype, device=device) + yield make_tensor((2, 3), dtype=dtype, device=device) + yield make_tensor((5, 10), dtype=dtype, device=device) + yield make_tensor((5, 10), dtype=dtype, device=device, noncontiguous=True) if dtype != torch.int: yield torch.tensor([0, -2, nan, 10.2, inf], dtype=dtype, device=device) @@ -1755,29 +1801,12 @@ def check(t, correction=1, fweights=None, aweights=None): num_observations = x.numel() if x.ndim < 2 else x.size(1) if num_observations > 0: fweights = torch.randint(1, 10, (num_observations,), device=device) - aweights = make_tensor((num_observations,), device, torch.float, low=1) + aweights = make_tensor((num_observations,), dtype=torch.float, device=device, low=1) for correction, fw, aw in product([0, 1, 2], [None, fweights], [None, aweights]): check(x, correction, fweights, aweights) - # FIXME: port to ErrorInputs - def test_cov_error(self, device): - def check(msg, *args, **kwargs): - with self.assertRaisesRegex(RuntimeError, r'cov\(\):.*' + msg + r'.*'): - torch.cov(*args, **kwargs) - - a = torch.rand(2) - check(r'expected input to have two or fewer dimensions', torch.rand(2, 2, 2)) - check(r'expected fweights to have one or fewer dimensions', a, fweights=torch.rand(2, 2)) - check(r'expected aweights to have one or fewer dimensions', a, aweights=torch.rand(2, 2)) - check(r'expected fweights to have integral dtype', a, fweights=torch.rand(2)) - check(r'expected aweights to have floating point dtype', a, aweights=torch.tensor([1, 1])) - check(r'expected fweights to have the same numel', a, fweights=torch.tensor([1])) - check(r'expected aweights to have the same numel', a, aweights=torch.rand(1)) - check(r'fweights cannot be negative', a, fweights=torch.tensor([-1, -2])) - check(r'aweights cannot be negative', a, aweights=torch.tensor([-1., -2.])) - @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_uniform_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1789,8 +1818,8 @@ def test_uniform_kstest(self, device, dtype): self.assertTrue(res.statistic < 0.1) @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes(include_bfloat16=False)) - 
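For orientation, the decorator rewrites in this patch follow a rough mapping from the old get_all_* helpers to the typed helpers in torch.testing._internal.common_dtype (exact membership may differ slightly by release):

    # get_all_fp_dtypes()                         ~ floating_types_and(torch.half, torch.bfloat16)
    # get_all_int_dtypes() + get_all_fp_dtypes()  ~ all_types_and(torch.half, torch.bfloat16)
    # get_all_dtypes()                            ~ all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)

Likewise, throughout this patch make_tensor is called with dtype= and device= as keyword arguments, for example make_tensor((5,), dtype=torch.float32, device=device, low=-1, high=1), instead of the old positional make_tensor((5,), device, dtype, ...) form.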
@dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half)) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) def test_normal_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1800,8 +1829,9 @@ def test_normal_kstest(self, device, dtype): res = stats.kstest(t.cpu().to(torch.double), 'norm', args=(mean, std)) self.assertTrue(res.statistic < 0.1) + @skipIfMps @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_lognormal_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1814,8 +1844,9 @@ def test_lognormal_kstest(self, device, dtype): else: self.assertTrue(res.statistic < 0.1) + @skipIfMps @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_exponential_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1824,8 +1855,9 @@ def test_exponential_kstest(self, device, dtype): res = stats.kstest(t.cpu().to(torch.double), 'expon', args=(0, 1 / lambd,)) self.assertTrue(res.statistic < 0.1) + @skipIfMps @skipIfNoSciPy - @dtypes(*get_all_fp_dtypes()) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) def test_cauchy_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1845,8 +1877,9 @@ def test_cauchy_no_inf(self, device, dtype): x.cauchy_() self.assertFalse(x.isinf().sum()) + @skipIfMps @skipIfNoSciPy - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_geometric_kstest(self, device, dtype): from scipy import stats size = 1000 @@ -1909,6 +1942,7 @@ def _brute_cdist(self, x, y, p=2): return torch.empty(r1, r2, device=x.device) return torch.norm(x[..., None, :] - y[..., None, :, :], p=p, dim=-1) + @skipIfMps def test_cdist_norm(self, device): for r1 in [3, 4, 5, 6]: for m in [2, 3, 4, 10]: @@ -1926,6 +1960,7 @@ def test_cdist_norm(self, device): expected = self._brute_cdist(x, y, p=p) self.assertEqual(expected, actual) + @skipIfMps def test_cdist_norm_batch(self, device): for r1 in [3, 4, 5, 6]: for m in [2, 3, 4, 10]: @@ -2060,6 +2095,7 @@ def _test_euclidean_large_cdist(sizex, sizey=None): _test_euclidean_large_cdist((2000, 5)) # Ensure that cdist backward with p<1 does not produce NaNs + @skipIfMps def test_cdist_grad_p_lt_1_no_nan(self, device): for p in [0.99, 0.7, 0.5, 0.1, 0.01]: x = torch.randn(1, 2, device=device) @@ -2087,37 +2123,7 @@ def test_cdist_same_inputs(self, device): # values such as nan or inf assert torch.isfinite(x.grad).all() - def test_multinomial_constraints(self, device): - x = torch.empty(1, 2, 3, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "prob_dist must be 1 or 2 dim", - lambda: torch.multinomial(x, 2)) - x = torch.empty(1, 2, dtype=torch.long, device=device) - self.assertRaisesRegex( - RuntimeError, "multinomial only supports floating-point dtypes for input", - lambda: torch.multinomial(x, 2)) - x = torch.empty(1, 2, dtype=torch.double, device=device) - y = torch.empty(1, 2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "multinomial expects Long tensor out", - lambda: torch.multinomial(x, 2, out=y)) - x = torch.empty(2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "cannot sample n_sample <= 0 samples", - lambda: torch.multinomial(x, 0)) - x = torch.empty(2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "cannot sample n_sample <= 0 samples", - 
lambda: torch.multinomial(x, -1)) - x = torch.empty(2, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "cannot sample n_sample > prob_dist", - lambda: torch.multinomial(x, 3, False)) - x = torch.empty(16777217, dtype=torch.double, device=device) - self.assertRaisesRegex( - RuntimeError, "number of categories cannot exceed", - lambda: torch.multinomial(x, 3)) - + @skipIfMps def test_cumsum(self, device): x = torch.rand(100, 100, device=device) res1 = torch.cumsum(x, 1) @@ -2168,6 +2174,7 @@ def test_cumsum(self, device): # Check that output maintained correct shape self.assertEqual(raw_tensor.shape, raw_tensor.grad.shape) + @skipIfMps def test_cumprod(self, device): x = torch.rand(100, 100, device=device) res1 = torch.cumprod(x, 1) @@ -2218,6 +2225,7 @@ def test_cumprod(self, device): # Check that output maintained correct shape self.assertEqual(raw_tensor.shape, raw_tensor.grad.shape) + @skipIfMps def test_cummax_cummin(self, device): def test_ops(op, string_of_function_name, expected_output1, expected_output2): x = torch.rand(100, 100, device=device) @@ -2284,6 +2292,7 @@ def test_ops(op, string_of_function_name, expected_output1, expected_output2): [0, 0, 0], [0, 0, 0]]), expected_out) + @skipIfMps def test_logcumsumexp(self, device): def logcumsumexp(a, axis): return torch.cumsum(a.exp(), axis=axis).log_() @@ -2357,7 +2366,7 @@ def to_np(t): # All tensors appear contiguous on XLA @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_diff_noncontig(self, device, dtype): shapes = ( (1,), @@ -2367,7 +2376,7 @@ def test_diff_noncontig(self, device, dtype): (2, 3, 5)) for shape in shapes: - contig = make_tensor(shape, device, dtype, low=-9, high=9) + contig = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) non_contig = torch.empty(shape + (2, 2), device=device, dtype=dtype)[..., 0] non_contig = non_contig.select(-1, -1) @@ -2377,9 +2386,9 @@ def test_diff_noncontig(self, device, dtype): self._test_diff_numpy(non_contig) # RngNormal not implemented for type f16 for XLA - @dtypes(*get_all_dtypes(include_half=False, include_bfloat16=False)) - @dtypesIfCPU(*get_all_dtypes(include_bfloat16=False)) - @dtypesIfCUDA(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.bool)) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool)) def test_diff(self, device, dtype): shapes = ( (1,), @@ -2389,7 +2398,7 @@ def test_diff(self, device, dtype): (2, 3, 5)) for shape in shapes: - contig = make_tensor(shape, device, dtype, low=-9, high=9) + contig = make_tensor(shape, dtype=dtype, device=device, low=-9, high=9) self._test_diff_numpy(contig) t = torch.ones(2, 3) @@ -2494,7 +2503,7 @@ def test_gradient_extreme_cases(self, device, dtype): # Test behaviour in very big tensors large_size = 100000 - t = make_tensor((large_size,), device, dtype) + t = make_tensor((large_size,), dtype=dtype, device=device) t_np = t.cpu().numpy() coordinates_np = list(np.random.randn(large_size)) coordinates = [torch.tensor(coordinates_np, device=device)] @@ -2551,38 +2560,6 @@ def test_gradient_type_promotion(self, device): actual, expected = self._inf_nan_preprocess(list(actual), expected) self.assertEqual(actual, expected, equal_nan=True, exact_dtype=False) - # FIXME: port this to ErrorInputs - @onlyNativeDeviceTypes - @dtypes(torch.long, torch.float32, torch.complex64) - def 
test_error_gradient(self, device, dtype): - t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], device=device, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected spacing to be unspecified, a scalar '): - dim = (1, 0) - spacing = [0.1] - torch.gradient(t, spacing=spacing, dim=dim, edge_order=1) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient only supports edge_order=1 and edge_order=2.'): - torch.gradient(t, edge_order=3) - - with self.assertRaisesRegex(RuntimeError, 'dim 1 appears multiple times in the list of dims'): - dim = (1, 1) - spacing = 0.1 - torch.gradient(t, spacing=spacing, dim=dim, edge_order=1) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected each tensor to be on the same device,'): - dim = (0, 1) - coordinates = [torch.tensor([1, 2, 4], device='cpu'), torch.tensor([1, 2, 4], device='meta')] - torch.gradient(t, spacing=coordinates, dim=dim, edge_order=1) - - with self.assertRaises(IndexError): - torch.gradient(t, dim=3) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected each dimension size to be at least'): - torch.gradient(torch.tensor([[1], [2], [3]]), edge_order=1) - - with self.assertRaisesRegex(RuntimeError, 'torch.gradient expected each dimension size to be at least'): - torch.gradient(torch.tensor([[1, 2], [3, 4]]), edge_order=2) - def _test_large_cum_fn_helper(self, x, fn): x_cpu = x.cpu().float() expected = fn(x_cpu) @@ -2610,6 +2587,7 @@ def test_large_cumprod(self, device, dtype): x[2::3] = .5 self._test_large_cum_fn_helper(x, lambda x: torch.cumprod(x, 0)) + @skipIfMps def test_discontiguous_out_cumsum(self, device): x = torch.randn(4, 8, device=device) y = torch.empty(4, 16, device=device)[:, ::2] @@ -2630,12 +2608,14 @@ def _test_cumminmax_helper(self, x, fn, expected_val, expected_ind): self.assertEqual(out_val, expected_val, atol=0, rtol=0) self.assertEqual(out_ind, expected_ind, atol=0, rtol=0) + @skipIfMps def test_cummax_discontiguous(self, device): x = torch.tensor([[0, 1, 2, 3, 2, 1], [4, 5, 6, 5, 6, 7]], device=device, dtype=torch.float).t().contiguous().t() expected_val = torch.tensor([[0, 1, 2, 3, 3, 3], [4, 5, 6, 6, 6, 7]], device=device, dtype=torch.float) expected_ind = torch.tensor([[0, 1, 2, 3, 3, 3], [0, 1, 2, 2, 4, 5]], device=device, dtype=torch.long) self._test_cumminmax_helper(x, torch.cummax, expected_val, expected_ind) + @skipIfMps def test_cummin_discontiguous(self, device): x = torch.tensor([[3, 2, 1, 0, 1, 2], [7, 6, 5, 4, 5, 2]], device=device, dtype=torch.float).t().contiguous().t() expected_val = torch.tensor([[3, 2, 1, 0, 0, 0], [7, 6, 5, 4, 4, 2]], device=device, dtype=torch.float) @@ -2650,7 +2630,7 @@ def test_bool_tensor_value_change(self, device): # FIXME: move to shape ops test suite def test_unfold_all_devices_and_dtypes(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): if dt == torch.bool: x = torch.empty((0, 1, 3, 0), dtype=dt, device=device) @@ -2672,7 +2652,7 @@ def test_unfold_scalars(self, device): # FIXME: move to data movement test suite def test_copy_all_dtypes_and_devices(self, device): from copy import copy - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): x = torch.tensor([1, 2, 3, 4], dtype=dt, device=device) x_clone = x.clone() y = copy(x) @@ -2741,7 +2721,7 @@ def test_copy_transpose_math_view(self, device, dtype): self.assertEqual(dst, src.conj_physical()) def 
test_clone_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16): x = torch.tensor((1, 1), dtype=dt, device=device) y = x.clone() self.assertEqual(x, y) @@ -2811,8 +2791,58 @@ def test_narrow_empty(self, device): sz[d] = 0 self.assertEqual(sz, y.size()) + # FIXME: move to indexing test suite + @parametrize("reduce", ['prod', 'amin', 'amax', 'mean']) + @dtypes(*floating_types_and(torch.half, torch.bfloat16)) + def test_index_reduce(self, device, dtype, reduce): + size = (3, 4, 5) + index_dtypes = [torch.int, torch.long] + include_selfs = [True, False] + reduction_init = {'prod': 1, 'mean': 0, 'amin': float('inf'), 'amax': -float('inf')} + + for dest_contig, src_contig, index_contig in product([True, False], repeat=3): + for idx_dtype, include_self in product(index_dtypes, include_selfs): + for dim in range(len(size)): + num_src = np.random.randint(10) + num_dest = size[dim] + dest = torch.randn(size, dtype=dtype, device=device) + if not dest_contig: + dest = make_tensor(size, device=device, dtype=dtype, noncontiguous=True) + src = torch.randn(*size[:dim], num_src, *size[dim + 1:], dtype=dtype, device=device) + if not src_contig: + # noncontiguous_like fails with RuntimeError: XLA tensors do not have storage + src = torch.testing.make_non_contiguous(src) + idx = torch.randint(num_dest, (num_src,), dtype=idx_dtype, device=device) + if not index_contig: + # noncontiguous_like fails with RuntimeError: XLA tensors do not have storage + idx = torch.testing.make_non_contiguous(idx) + expected = dest.clone() + dest.index_reduce_(dim, idx, src, reduce, include_self=include_self) + # fill rows in idx with reduction inits if include_self=False + if (not include_self): + expected.index_fill_(dim, idx.long(), reduction_init[reduce]) + expected = expected.transpose(0, dim) + src = src.transpose(0, dim) + for i in range(num_src): + if reduce == 'prod': + expected[idx[i]] *= src[i] + elif reduce == 'amin': + torch.minimum(expected[idx[i]], src[i], out=expected[idx[i]]) + elif reduce == 'amax': + torch.maximum(expected[idx[i]], src[i], out=expected[idx[i]]) + else: + expected[idx[i]] += src[i] + if reduce == 'mean': + counts = torch.ones_like(expected) if include_self else torch.zeros_like(expected) + counts.index_add_(0, idx, torch.ones_like(src)) + counts.masked_fill_(counts == 0, 1) + expected /= counts + expected = expected.transpose(0, dim) + + self.assertEqual(dest, expected) + # FIXME: move to test indexing - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_index_copy(self, device, dtype): # We just test for num_copy <= num_dest, as otherwise there are repeated indices # and the behavior is undefined @@ -2820,7 +2850,7 @@ def test_index_copy(self, device, dtype): def make_arg(batch_sizes, n, dim, contig): size_arg = batch_sizes[:dim] + (n,) + batch_sizes[dim:] - return make_tensor(size_arg, device, dtype, low=None, high=None, noncontiguous=not contig) + return make_tensor(size_arg, dtype=dtype, device=device, low=None, high=None, noncontiguous=not contig) def ref_index_copy(tgt, dim, idx, src): for i in range(idx.size(0)): @@ -2847,7 +2877,7 @@ def ref_index_copy(tgt, dim, idx, src): # onlyNativeDeviceTypes due to an XLA error: # https://github.com/pytorch/pytorch/issues/53256 @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_index_copy_scalars(self, device, 
dtype): # Create the 8 possible combinations of scalar sizes for target / index / source scalars = ((make_tensor(size_t, dtype=dtype, device=device, low=None, high=None), @@ -2957,13 +2987,14 @@ def test_index_put_non_accumulate_deterministic(self, device) -> None: self.assertEqual(output, input_list) # FIXME: move to test indexing - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @skipIfMps def test_index_fill(self, device, dtype): x = torch.tensor([[1, 2], [4, 5]], dtype=dtype, device=device) index = torch.tensor([0], device=device) x.index_fill_(1, index, 0) self.assertEqual(x, torch.tensor([[0, 2], [0, 5]], dtype=dtype, device=device)) - if not x.is_complex(): + if not x.is_complex() and not device == "meta": with self.assertRaisesRegex(RuntimeError, r"Scalar"): x.index_fill_(1, index, 1 + 1j) # Make sure that the result stays 0-dim while applied to @@ -2975,13 +3006,13 @@ def test_index_fill(self, device, dtype): # FIXME: move to test indexing # The test fails for zero-dimensional tensors on XLA @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_index_select(self, device, dtype): num_src, num_out = 3, 5 def make_arg(batch_sizes, n, dim, contig): size_arg = batch_sizes[:dim] + (n,) + batch_sizes[dim:] - return make_tensor(size_arg, device, dtype, low=None, high=None, noncontiguous=not contig) + return make_tensor(size_arg, dtype=dtype, device=device, low=None, high=None, noncontiguous=not contig) def ref_index_select(src, dim, idx): # bfloat16 is just used on GPU, so it's not supported on numpy @@ -2996,7 +3027,9 @@ def ref_index_select(src, dim, idx): for other_sizes in ((), (4, 5)): for dim in range(len(other_sizes)): src = make_arg(other_sizes, num_src, dim, src_contig) - idx = make_tensor((num_out,), device, dtype=torch.int64, low=0, high=num_src, noncontiguous=not idx_contig) + idx = make_tensor( + (num_out,), dtype=torch.int64, device=device, low=0, high=num_src, noncontiguous=not idx_contig + ) out = torch.index_select(src, dim, idx) out2 = ref_index_select(src, dim, idx) self.assertEqual(out, out2) @@ -3005,13 +3038,13 @@ def ref_index_select(src, dim, idx): other_sizes = (3, 2) dim = 1 src = make_arg(other_sizes, num_src, dim, True) - idx = make_tensor((num_out,), device, dtype=idx_type, low=0, high=num_src, noncontiguous=False) + idx = make_tensor((num_out,), dtype=idx_type, device=device, low=0, high=num_src, noncontiguous=False) out = torch.index_select(src, dim, idx) out2 = ref_index_select(src, dim, idx) self.assertEqual(out, out2) # Create the 4 possible combinations of scalar sizes for index / source - scalars = ((make_tensor(size_s, device, dtype), + scalars = ((make_tensor(size_s, dtype=dtype, device=device), torch.zeros(size_i, dtype=torch.int64, device=device)) for size_s, size_i in product([(), (1,)], repeat=2)) for source, idx in scalars: @@ -3019,7 +3052,7 @@ def ref_index_select(src, dim, idx): self.assertEqual(out.item(), source.item()) # FIXME: find a test suite for the take operator - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_take(self, device, dtype): idx_size = (4,) @@ -3054,7 +3087,7 @@ def ref_take(src, idx): # FIXME: find a test suite for the put operator # The bool instance does not work on GPU. 
See # https://github.com/pytorch/pytorch/issues/54317 - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_put(self, device, dtype): src_size = (4,) @@ -3125,7 +3158,7 @@ def ref_put(dst, idx, src, accumulate): # FIXME: find a test suite for the put operator # The bool instance does not work on GPU. See # https://github.com/pytorch/pytorch/issues/54317 - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_put_accumulate(self, device, dtype): # Test for parallel adds with accumulate == True low_precision = dtype == torch.half or dtype == torch.bfloat16 @@ -3147,6 +3180,7 @@ def test_put_accumulate(self, device, dtype): self.assertEqual(out, orig + source.sum(), rtol=rtol, atol=atol) # FIXME: find a test suite for the take operator + @skipIfMps def test_take_empty(self, device): for input_shape in [(0,), (0, 1, 2, 0), (1, 2, 3)]: for indices_shape in [(0,), (0, 1, 2, 0)]: @@ -3169,13 +3203,9 @@ def scatter_allow_reduce(self, device, dtype, reduceop): device_type = torch.device(device).type return device_type != 'cuda' or (reduceop == 'multiply' and dtype.is_floating_point) - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. - @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCPU(*get_all_dtypes()) - @dtypesIfCUDA(*get_all_dtypes()) + @dtypes(*floating_and_complex_types()) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_scatter_reduce_operations_to_large_input(self, device, dtype): index = torch.tensor([[1], [2]], device=device, dtype=torch.long) test_data = [ @@ -3200,13 +3230,9 @@ def test_scatter_reduce_operations_to_large_input(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result) - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. - @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCPU(*get_all_dtypes()) - @dtypesIfCUDA(*get_all_dtypes()) + @dtypes(*floating_and_complex_types()) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_scatter_reduce_scalar(self, device, dtype): index = torch.tensor([[1], [2]], device=device, dtype=torch.long) test_data = [ @@ -3243,13 +3269,9 @@ def test_scatter_add_non_unique_index(self, device): torch.tensor([[3], [1]], device=device, dtype=torch.float32).repeat(1, width)) - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. 
- @dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCPU(*get_all_dtypes()) - @dtypesIfCUDA(*get_all_dtypes()) + @dtypes(*floating_and_complex_types()) + @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) + @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_scatter_reduce_non_unique_index(self, device, dtype): height = 2 width = 2 @@ -3270,12 +3292,8 @@ def test_scatter_reduce_non_unique_index(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result, msg=f"result: {result} input: {input} method: {str(operation)}") - # FIXME: port to test_scatter_gather_ops.py - # torch.{zeros, ones} do not support ComplexHalf (torch.complex32) - # So, we are skipping it here. @onlyCUDA - @dtypes(*(get_all_complex_dtypes() + - get_all_int_dtypes())) + @dtypes(*integral_types(), *complex_types()) def test_scatter_reduce_multiply_unsupported_dtypes(self, device, dtype): height = 2 width = 2 @@ -3327,7 +3345,7 @@ def test_scatter_add_bool(self, device): # FIXME: find a test suite for the masked scatter operator @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_masked_scatter(self, device, dtype): dt = dtype with warnings.catch_warnings(record=True) as w: @@ -3390,6 +3408,7 @@ def test_masked_scatter(self, device, dtype): self.assertEqual(str(wi.message)[0:55], str(warn)) # FIXME: find a test suite for the masked scatter operator + @skipIfMps def test_masked_scatter_bool_tensor(self, device): src = torch.tensor([True, True, True], device=device) dst = torch.tensor([False, False, False], device=device) @@ -3404,8 +3423,6 @@ def test_masked_scatter_bool_tensor(self, device): # FIXME: find a test suite for the masked scatter operator # test_scatter_gather_ops or test_masked_ops? 
- # refer https://github.com/pytorch/pytorch/issues/60190 - @skipIfRocm @onlyCUDA @largeTensorTest('30GB') def test_masked_scatter_large_tensor(self, device): @@ -3416,7 +3433,7 @@ def test_masked_scatter_large_tensor(self, device): self.assertEqual(result, result_cpu) # FIXME: find a test suite for the masked select operator - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16)) def test_masked_select(self, device, dtype): if device == 'cpu': warn = 'masked_select received a mask with dtype torch.uint8,' @@ -3484,7 +3501,7 @@ def test_masked_select_discontiguous(self, device): self.assertEqual(out_dc, expected, atol=0, rtol=0) # FIXME: find a test suite for the masked fill operator - @dtypes(*product(get_all_dtypes(), (torch.uint8, torch.bool))) + @dtypes(*product(all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16), (torch.uint8, torch.bool))) def test_masked_fill(self, device, dtypes): dtype = dtypes[0] mask_dtype = dtypes[1] @@ -3789,15 +3806,18 @@ def test_pdist_norm_backward(self, device): # FIXME: find a test suite for the pdist operator @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "sandcastle OOM with current tpx gpu/re configuration") @skipIfRocm + @onlyCUDA + @largeTensorTest('10GB', device='cpu') + @largeTensorTest('5GB', device='cuda') def test_pdist_norm_large(self, device): # use dim0>=46342 for forward, see: # https://github.com/pytorch/pytorch/issues/30583 # Compare output using GPU with the CPU implementation, as brute_pdist uses too much memory - if 'cuda' in device: - x = torch.randn(50000, 1, dtype=torch.float32) - expected_cpu = torch.pdist(x, p=2) - actual_gpu = torch.pdist(x.to(device), p=2) - self.assertEqual(expected_cpu, actual_gpu.cpu()) + x = torch.randn(50000, 1, dtype=torch.float32) # 50k * 4 bytes = 200 KB + # Will require 1249975000 float32s + expected_cpu = torch.pdist(x, p=2) # ~1250M * 4 bytes = 5 GB on CPU + actual_gpu = torch.pdist(x.to(device), p=2) # 5 GB on GPU + self.assertEqual(expected_cpu, actual_gpu.cpu()) # Another 5 GB on CPU # FIXME: move to elementwise ternary test suite @onlyNativeDeviceTypes @@ -4031,19 +4051,6 @@ def test_masked_fill_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): mask[1:].masked_fill_(mask[:-1], False) - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_masked_select_mem_overlap(self, device): - x = torch.rand((1,), device=device).expand((3,)) - y = torch.rand((6,), device=device) - mask = torch.tensor([True, False, True, True, False, False], device=device) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.masked_select(y, mask, out=x) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.masked_select(y, mask, out=y) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.masked_select(mask.clone(), mask, out=mask) - # FIXME: convert to ErrorInputs @expectedFailureMeta # RuntimeError not raised @onlyNativeDeviceTypes @@ -4055,15 +4062,6 @@ def test_masked_scatter_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): x.masked_scatter_(mask, src) - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_index_select_mem_overlap(self, device): - x = torch.rand((1, 6), device=device).expand((2, 6)) - y = torch.rand((3, 6), device=device) - ind = torch.tensor([0, 1], dtype=torch.int64, device=device) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - 
torch.index_select(y, 1, ind, out=x) - # FIXME: convert to ErrorInputs @onlyNativeDeviceTypes def test_scatter_mem_overlap(self, device): @@ -4078,32 +4076,6 @@ def test_scatter_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): ind.scatter_(0, ind, ind.clone()) - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_gather_mem_overlap(self, device): - x = torch.rand((1,), device=device).expand((3,)) - src = torch.rand((6,), device=device) - ind = torch.tensor([2, 1, 0], device=device, dtype=torch.int64) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.gather(src, 0, ind, out=x) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.gather(src, 0, ind, out=src) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.gather(ind.clone(), 0, ind[1:], out=ind[:1]) - - # FIXME: convert to ErrorInputs - @onlyNativeDeviceTypes - def test_take_mem_overlap(self, device): - x = torch.rand((1,), device=device).expand((3,)) - src = torch.rand((6,), device=device) - ind = torch.tensor([2, 1, 0], device=device, dtype=torch.int64) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.take(src, ind, out=x) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.take(src, ind, out=src) - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - torch.take(ind.clone(), ind[1:], out=ind[:-1]) - # FIXME: move to test distributions @onlyCUDA def test_multinomial_device_constrain(self, device): @@ -4308,6 +4280,7 @@ def _test_propagation_rules(self, contiguous, cl, ambiguous, bias): result = ambiguous * 5 self.assertEqual(ambiguous.stride(), result.stride()) + @skipIfMps def test_memory_format_empty_like(self, device): def test_helper(x, memory_format): xc = x.contiguous(memory_format=memory_format) @@ -4562,38 +4535,38 @@ def compare_strides(s1, s2, div): # FIXME: move dlpack tests to their own test class/suite @skipMeta @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_capsule_conversion(self, device, dtype): # DLpack does not explicitly support bool (xref dmlc/dlpack#75) - x = make_tensor((5,), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) z = from_dlpack(to_dlpack(x)) self.assertEqual(z, x) @skipMeta @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_protocol_conversion(self, device, dtype): - x = make_tensor((5,), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) z = from_dlpack(x) self.assertEqual(z, x) @skipMeta @onlyNativeDeviceTypes def test_dlpack_shared_storage(self, device): - x = make_tensor((5,), device, torch.float64) + x = make_tensor((5,), dtype=torch.float64, device=device) z = from_dlpack(to_dlpack(x)) z[0] = z[0] + 20.0 self.assertEqual(z, x) @skipMeta @onlyCUDA - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_conversion_with_streams(self, device, dtype): # Create a stream where the tensor will reside stream = torch.cuda.Stream() with torch.cuda.stream(stream): # Do an operation in the actual stream - x = make_tensor((5,), device, dtype) + 1 + x = make_tensor((5,), dtype=dtype, device=device) + 1 # DLPack protocol helps establish a correct stream order # (hence data dependency) at the 
exchange boundary. # DLPack manages this synchronization for us, so we don't need to @@ -4604,11 +4577,44 @@ def test_dlpack_conversion_with_streams(self, device, dtype): stream.synchronize() self.assertEqual(z, x) + @skipMeta + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) + def test_from_dlpack(self, device, dtype): + x = make_tensor((5,), dtype=dtype, device=device) + y = torch.from_dlpack(x) + self.assertEqual(x, y) + + @skipMeta + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) + def test_from_dlpack_noncontinguous(self, device, dtype): + x = make_tensor((25,), dtype=dtype, device=device).reshape(5, 5) + + y1 = x[0] + y1_dl = torch.from_dlpack(y1) + self.assertEqual(y1, y1_dl) + + y2 = x[:, 0] + y2_dl = torch.from_dlpack(y2) + self.assertEqual(y2, y2_dl) + + y3 = x[1, :] + y3_dl = torch.from_dlpack(y3) + self.assertEqual(y3, y3_dl) + + y4 = x[1] + y4_dl = torch.from_dlpack(y4) + self.assertEqual(y4, y4_dl) + + y5 = x.t() + y5_dl = torch.from_dlpack(y5) + self.assertEqual(y5, y5_dl) + @skipMeta @onlyCUDA - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_conversion_with_diff_streams(self, device, dtype): - from torch._C import _from_dlpack stream_a = torch.cuda.Stream() stream_b = torch.cuda.Stream() # DLPack protocol helps establish a correct stream order @@ -4616,12 +4622,20 @@ def test_dlpack_conversion_with_diff_streams(self, device, dtype): # the `tensor.__dlpack__` method will insert a synchronization event # in the current stream to make sure that it was correctly populated. with torch.cuda.stream(stream_a): - x = make_tensor((5,), device, dtype) + 1 - z = _from_dlpack(x.__dlpack__(stream_b.cuda_stream)) + x = make_tensor((5,), dtype=dtype, device=device) + 1 + z = torch.from_dlpack(x.__dlpack__(stream_b.cuda_stream)) stream_a.synchronize() stream_b.synchronize() self.assertEqual(z, x) + @skipMeta + @onlyNativeDeviceTypes + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) + def test_from_dlpack_dtype(self, device, dtype): + x = make_tensor((5,), dtype=dtype, device=device) + y = torch.from_dlpack(x) + assert x.dtype == y.dtype + @skipMeta @onlyCUDA def test_dlpack_default_stream(self, device): @@ -4643,15 +4657,15 @@ def __dlpack__(self, stream=None): # CUDA-based tests runs on non-default streams with torch.cuda.stream(torch.cuda.default_stream()): - x = DLPackTensor(make_tensor((5,), device, torch.float32)) + x = DLPackTensor(make_tensor((5,), dtype=torch.float32, device=device)) from_dlpack(x) @skipMeta @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bool=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_dlpack_tensor_invalid_stream(self, device, dtype): with self.assertRaises(TypeError): - x = make_tensor((5,), device, dtype) + x = make_tensor((5,), dtype=dtype, device=device) x.__dlpack__(stream=object()) @skipMeta @@ -4723,6 +4737,7 @@ def test_storage_all_devices(self, devices): self.assertEqual(t.dtype, t.storage().dtype) # FIXME: move to test distributions + @skipIfMps @dtypesIfCUDA(torch.float, torch.double, torch.half) @dtypes(torch.float, torch.double) def test_multinomial(self, device, dtype): @@ -5070,6 +5085,7 @@ def test_pickle_gradscaler(self, device): self.assertEqual(b.scale(torch.tensor([4.0], dtype=torch.float32, device=device)), 12.0) # FIXME: convert to ErrorInputs + @skipIfMps def test_multinomial_invalid(self, device): def test(probs): 
with self.assertRaisesRegex(RuntimeError, @@ -5083,6 +5099,7 @@ def test(probs): test(torch.tensor([1., 1., nan])) # FIXME: convert to ErrorInputs + @skipIfMps def test_multinomial_invalid_distribution(self, device): def test(probs, replacement): with self.assertRaisesRegex(RuntimeError, @@ -5123,106 +5140,72 @@ def test_multinomial_empty_wo_replacement(self, device): self._test_multinomial_empty(device, False, 1) self._test_multinomial_empty(device, False, 2) - # FIXME: move to elementwise ternary test suite - def _test_where_scalar_template(self, device, dtype, exec_fn): - for ndims in range(0, 4): - shape = self._rand_shape(ndims, min_size=5, max_size=10) - for n in range(ndims + 1): - for c in combinations(list(range(ndims)), n): - for scalar_type in [int, float, complex]: - if dtype.is_complex: - condition = make_tensor(shape, dtype=dtype, device=device).abs() > 0.5 - else: - condition = make_tensor(shape, dtype=dtype, device=device) > 0.5 - - x = make_tensor(shape, dtype=dtype, device=device) - - if not dtype.is_complex and scalar_type == complex: - continue - - scalar_1 = scalar_type(random.random()) - - exec_fn(scalar_type, dtype, condition, x, scalar_1) - - # FIXME: move to elementwise ternary test suite - # For current implementation, - # below are the valid `TensorDtype` and `ScalarType` combinations. - def _where_valid_scalar_tensor_combination(self, scalar_type, dtype): - if (scalar_type == int and dtype == torch.long): - return True - elif (scalar_type == float and dtype == torch.double): - return True - elif (scalar_type == complex and dtype == torch.complex128): - return True - return False + @dtypesIfCUDA(torch.float, torch.double, torch.half) + @dtypesIfCPU(torch.float, torch.double, torch.bfloat16) + @dtypes(torch.float, torch.double) + def test_multinomial_cpu(self, device, dtype): + def make_prob_dist(shape, is_contiguous): + if is_contiguous: + if dtype == torch.half or dtype == torch.bfloat16: + return torch.zeros(shape, device=device).uniform_().to(dtype=dtype) + return torch.zeros(shape, device=device, dtype=dtype).uniform_() + elif len(shape) == 1: + if dtype == torch.half or dtype == torch.bfloat16: + return torch.zeros((shape + [5]), device=device).uniform_().to(dtype=dtype)[:, 2] + return torch.zeros((shape + [5]), device=device, dtype=dtype).uniform_()[:, 2] + else: + # num dim = 2 + new_shape = [2, shape[1], 7, 1, shape[0], 1, 10] + if dtype == torch.half or dtype == torch.bfloat16: + prob_dist = torch.zeros(new_shape, device=device).uniform_().to(dtype=dtype) + else: + prob_dist = torch.zeros(new_shape, device=device, dtype=dtype).uniform_() + prob_dist = prob_dist.transpose(1, 4) + prob_dist = prob_dist[1, :, 5, 0, :, 0, 4] + assert not prob_dist.is_contiguous() # sanity check + return prob_dist # FIXME: move to elementwise ternary test suite + # As the test fails with Runtime Error not raised on XLA @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + - get_all_complex_dtypes())) - def test_where_scalar_invalid_combination_raises(self, device, dtype): - - def checkRaises(scalar_type, dtype, condition, x, scalar_1): - if not self._where_valid_scalar_tensor_combination(scalar_type, dtype): - # Note: This should fail once `where` supports type promotion. 
- with self.assertRaisesRegex(RuntimeError, "expected scalar type"): - torch.where(condition, x, scalar_1) - - self._test_where_scalar_template(device, dtype, checkRaises) + def test_where_scalar_handcrafted_values(self, device): + # Tests ScalarxScalar, ScalarxTensor and TensorxScalar + # variant of `where` against NumPy version with + # handcrafted values. + condition_shape = (5, 5) + dtypes = ( + torch.bool, torch.uint8, torch.int8, torch.int16, torch.int64, + torch.float16, torch.float32, torch.float64, + torch.complex64, torch.complex128, + ) + shapes = ((), (5,), (1, 5),) - # FIXME: move to elementwise ternary test suite - @skipCUDAVersionIn([(11, 2)]) # test fails for 11.2, see https://github.com/pytorch/pytorch/issues/51980 - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + - get_all_complex_dtypes())) - def test_where_scalar_valid_combination(self, device, dtype): + with torch.no_grad(): + tensors = (torch.empty(shape, dtype=dtype, device=device).fill_(17) + for shape, dtype in product(shapes, dtypes)) - def checkResult(scalar_type, dtype, condition, x, scalar_1): - if self._where_valid_scalar_tensor_combination(scalar_type, dtype): - def x_like(scalar, without_dtype=False): - return torch.tensor(scalar, dtype=dtype, device=device).expand_as(x) + # Use different values for `x` and `y` + # as they are the output values which are compared. + x_vals = (True, 3, 7.0, 1 + 0.5j) + y_vals = itertools.chain((False, 4, 8.0, 2 + 0.5j), tensors) + for x in x_vals: + for y in y_vals: + condition = torch.empty(*condition_shape, dtype=torch.bool, device=device).bernoulli_() + common_dtype = torch.result_type(x, y) - # X = Tensor, Y = Scalar - scalar_out = torch.where(condition, x, scalar_1) - tensor_out = torch.where(condition, x, x_like(scalar_1)) - self.assertEqual(scalar_out, tensor_out) + def check_equal(condition, x, y): + condition_np = condition.cpu().numpy() + x_np = x.cpu().numpy() if isinstance(x, torch.Tensor) else x + y_np = y.cpu().numpy() if isinstance(y, torch.Tensor) else y - # X = Scalar, Y = Tensor - scalar_out = torch.where(condition, scalar_1, x) - tensor_out = torch.where(condition, x_like(scalar_1), x) - self.assertEqual(scalar_out, tensor_out) + # NumPy aggressively promotes to double, hence cast to output to correct dtype + expected = torch.from_numpy(np.where(condition_np, x_np, y_np)).to(common_dtype) + result = torch.where(condition, x, y) + self.assertEqual(expected, result) - self._test_where_scalar_template(device, dtype, checkResult) + check_equal(condition, x, y) + check_equal(condition, y, x) - # FIXME: move to elementwise ternary test suite - # As the test fails with Runtime Error not raised on XLA - @onlyNativeDeviceTypes - def test_where_scalar_scalar(self, device): - # Scalar-Scalar Version - height = 5 - width = 5 - default_dtype = torch.get_default_dtype() - for test_default_dtype in [torch.float, torch.double]: - torch.set_default_dtype(test_default_dtype) - for scalar_type_1 in [int, float, complex]: - for scalar_type_2 in [int, float, complex]: - x1 = scalar_type_1(random.random() * random.randint(10, 20)) - x2 = scalar_type_2(random.random() * random.randint(20, 30)) - condition = torch.randn(height, width, device=device) > 0.5 - if scalar_type_1 != scalar_type_2: - self.assertRaisesRegex(RuntimeError, "expected scalar type", lambda: torch.where(condition, x1, x2)) - else: - def get_dtype(scalar_type): - complex_dtype = torch.complex64 if torch.float == torch.get_default_dtype() else torch.complex128 - type_map = {int: torch.long, float: 
torch.get_default_dtype(), complex: complex_dtype} - return type_map[scalar_type] - expected = torch.zeros((height, width), dtype=get_dtype(scalar_type_1)) - expected[condition] = x1 - expected[~condition] = x2 - result = torch.where(condition, x1, x2) - self.assertEqual(expected, result) - - # Reset the original dtype - torch.set_default_dtype(default_dtype) def test_hook_remove(self, device): # Reference: https://github.com/pytorch/pytorch/issues/58354 @@ -5286,6 +5269,48 @@ def test_assertRaisesRegex_ignore_msg_non_native_device(self, device): with self.assertRaisesRegex(RuntimeError, msg): torch.nn.functional.nll_loss(x, t, weight=invalid_weight) + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32)) + def test_copy_(self, device, dtype): + def can_cast(src_dtype, dst_dtype): + # torch.can_cast(torch.int16, torch.uint8) returns True + # which isn't actually safe-cast. + # This function returns False in this case. + def is_unsigned_int(dtype): + return dtype is torch.uint8 + + if is_unsigned_int(dst_dtype): + return is_unsigned_int(src_dtype) + return torch.can_cast(src_dtype, dst_dtype) + + def make_tensor_wrapper(shape, dtype): + if dtype is not torch.complex32: + # Make tensor does not support generating + # complex32 tensor + return make_tensor(shape, device=device, dtype=dtype) + return torch.randn(shape, device=device, dtype=dtype) + + t = make_tensor_wrapper((50,), dtype) + src_dtypes = all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32) + for src_dtype in src_dtypes: + src = make_tensor_wrapper((50,), dtype=src_dtype) + t.copy_(src) + dst = make_tensor_wrapper((50, ), dtype=src_dtype) + if can_cast(src_dtype, dtype): + rtol = None + atol = None + if dtype in (torch.half, torch.complex32): + rtol = 1e-3 + atol = 1e-3 + if dtype in (torch.bfloat16,): + rtol = 1e-2 + atol = 1e-2 + self.assertEqual(src, dst.copy_(t), rtol=rtol, atol=atol) + + @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32)) + def test_item(self, device, dtype): + t = torch.ones((), device=device, dtype=dtype) + self.assertEqual(1, t.item()) + # Tests that compare a device's computation with the (gold-standard) CPU's. 
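The basic pattern in the class below is to run an op on the target device and compare against the CPU result as the reference; a minimal self-contained sketch (the device choice and the use of torch.sin here are illustrative, not taken from the tests):

    import torch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    x_dev = torch.randn(16, device=device)
    x_cpu = x_dev.cpu()
    # the device result is expected to match the CPU "gold-standard" result
    assert torch.allclose(torch.sin(x_dev).cpu(), torch.sin(x_cpu), rtol=1e-6, atol=1e-6)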
class TestDevicePrecision(TestCase): @@ -5714,69 +5739,6 @@ def test_unflatten(self): r"the unspecified dimension size -1 can be any value and is ambiguous"): torch.randn(2, 0).unflatten(1, (2, -1, 0)) - # FIXME: move to test_scatter_gather_ops.py - def test_scatter_reduce(self): - dtype = device = None - output_size = 10 - shape = [5, 10, 20] - reduces = ["sum", "prod", "mean", "amax", "amin"] - fills = {"sum": 0, "prod": 1, "mean": 0, "amax": -(2 ** 31), "amin": 2 ** 31 - 1} - fns = {"sum": lambda t, v: t.add_(v), - "prod": lambda t, v: t.mul_(v), - "mean": lambda t, v, n: t.mul_(n).add_(v).div_(n + 1), - "amax": lambda t, v: torch.max(t, v, out=t), - "amin": lambda t, v: torch.min(t, v, out=t)} - - index = torch.randint(0, output_size, shape, dtype=torch.long, device=device) - input = torch.randn(shape, dtype=dtype, device=device) - - for reduce in reduces: - for dim in range(len(shape)): - output = input.scatter_reduce(dim, index, reduce, output_size=output_size) - - # Check that output is of the correct size - output_shape = copy.copy(shape) - output_shape[dim] = output_size - self.assertEqual(output.shape, output_shape) - - expected = torch.zeros(output_shape, dtype=dtype, device=device) - expected.fill_(fills[reduce]) - counts = torch.zeros(output_shape, dtype=dtype, device=device) - for i, j, k in itertools.product(range(shape[0]), range(shape[1]), range(shape[2])): - v = input[i, j, k] - m = index[i, j, k] - - if dim == 0: - i = m - elif dim == 1: - j = m - else: - k = m - - op = fns[reduce] - if (reduce == "mean"): - op(expected[i, j, k], v, counts[i, j, k]) - else: - op(expected[i, j, k], v) - counts[i, j, k] += 1 - - if (reduce == "amin" or reduce == "amax"): - expected.masked_fill_(counts == 0, 0) - - self.assertTrue(torch.allclose(output, expected)) - - with self.assertRaisesRegex(RuntimeError, "Expected `dim` to be in range -3 to 2"): - torch.scatter_reduce(input, 4, index, "sum") - - with self.assertRaisesRegex(RuntimeError, "Shape mismatch"): - index2 = torch.randint(0, output_size, (10, ), dtype=torch.long, device=device) - torch.scatter_reduce(input, 0, index2, "sum") - - with self.assertRaisesRegex(RuntimeError, "Expected `index` values to be in range 0 to 2"): - input2 = torch.randn(10, dtype=dtype, device=device) - index2 = torch.tensor([0, 1, 0, 1, 2, 3, 3, 4, 4, 3]) - torch.scatter_reduce(input2, 0, index2, "sum", output_size=2) - def test_structseq_repr(self): a = torch.arange(250).reshape(5, 5, 10) expected = """ @@ -6296,6 +6258,7 @@ def test_from_buffer(self): self.assertEqual(bools.size(), 8) self.assertEqual(bools.tolist(), [False, True, True, True, True, True, True, True]) self.assertEqual(bools.type(), 'torch.BoolStorage') + self.assertTrue(isinstance(bools, torch.BoolStorage)) f = bytearray(b'\x80\x02\x8a\nl\xfc\x9cF\xf9 j\xa8P\x19.\x80\x02M\xe9') bools = torch.BoolStorage.from_buffer(f, 'big') @@ -6308,6 +6271,122 @@ def test_from_buffer(self): bytes = torch.ByteStorage.from_buffer(a) self.assertEqual(bytes.nbytes(), 4) self.assertEqual(bytes.tolist(), [1, 2, 3, 4]) + self.assertTrue(isinstance(bytes, torch.ByteStorage)) + + def test_storage_error(self): + quantized_storages = [ + torch.QInt32Storage, + torch.QInt8Storage, + torch.QUInt2x4Storage, + torch.QUInt4x2Storage, + torch.QUInt8Storage, + ] + + with self.assertRaisesRegex(RuntimeError, r"Only child classes of _LegacyStorage can be instantiated"): + torch.storage._LegacyStorage() + + for storage_class in torch._storage_classes: + if storage_class in [torch._UntypedStorage, 
torch.cuda._UntypedStorage, torch._TypedStorage]: + continue + + device = 'cuda' if storage_class.__module__ == 'torch.cuda' else 'cpu' + dtype = storage_class.dtype + + if device == 'cuda' and not torch.cuda.is_available(): + continue + + # Legacy Storage constructor errors + with self.assertRaisesRegex(RuntimeError, r"'device' cannot be specified"): + storage_class(device='cpu') + + with self.assertRaisesRegex(RuntimeError, r"'dtype' cannot be specified"): + storage_class(dtype=torch.float) + + with self.assertRaisesRegex(TypeError, r"got an unexpected keyword"): + storage_class(sdlkjf=torch.float) + + with self.assertRaisesRegex(RuntimeError, r"Too many positional arguments"): + storage_class(0, 0) + + with self.assertRaisesRegex(TypeError, r"invalid data type"): + storage_class('string') + + with self.assertRaisesRegex(TypeError, r"Argument type not recognized"): + storage_class(torch.tensor([])) + + s = storage_class() + + with self.assertRaisesRegex(RuntimeError, r"No positional arguments"): + storage_class(0, wrap_storage=s._untyped()) + + with self.assertRaisesRegex(TypeError, r"must be _UntypedStorage"): + storage_class(wrap_storage=s) + + if torch.cuda.is_available(): + if storage_class in quantized_storages: + with self.assertRaisesRegex(RuntimeError, r"Cannot create CUDA storage with quantized dtype"): + s.cuda() + + else: + + if s.is_cuda: + s_other_device = s.cpu() + else: + s_other_device = s.cuda() + + with self.assertRaisesRegex(RuntimeError, r"Device of 'wrap_storage' must be"): + storage_class(wrap_storage=s_other_device._untyped()) + + # _TypedStorage constructor errors + with self.assertRaisesRegex(RuntimeError, r"No positional arguments"): + torch._TypedStorage(0, wrap_storage=s._untyped(), dtype=dtype) + + with self.assertRaisesRegex(RuntimeError, r"Argument 'dtype' must be specified"): + torch._TypedStorage(wrap_storage=s._untyped()) + + with self.assertRaisesRegex(TypeError, r"Argument 'dtype' must be torch.dtype"): + torch._TypedStorage(wrap_storage=s._untyped(), dtype=0) + + with self.assertRaisesRegex(RuntimeError, r"Argument 'device' should not be specified"): + torch._TypedStorage(wrap_storage=s._untyped(), dtype=dtype, device=device) + + with self.assertRaisesRegex(TypeError, r"Argument 'wrap_storage' must be _UntypedStorage"): + torch._TypedStorage(wrap_storage=s, dtype=dtype) + + with self.assertRaisesRegex(RuntimeError, r"Storage device not recognized"): + torch._TypedStorage(dtype=dtype, device='xla') + + if torch.cuda.is_available(): + if storage_class in quantized_storages: + with self.assertRaisesRegex(RuntimeError, r"Cannot create CUDA storage with quantized dtype"): + torch._TypedStorage(dtype=dtype, device='cuda') + + with self.assertRaisesRegex(TypeError, r"Argument type not recognized"): + torch._TypedStorage(torch.tensor([]), dtype=dtype, device=device) + + with self.assertRaisesRegex(RuntimeError, r"Too many positional arguments"): + torch._TypedStorage(0, 0, dtype=dtype, device=device) + + def test_storage_error_no_attribute(self): + storage_classes = [ + torch.cuda.ByteStorage, + torch.cuda.FloatStorage, + torch.cuda._UntypedStorage, + ] + for storage_class in storage_classes: + with self.assertRaisesRegex(RuntimeError, r'Not available for CUDA storage'): + storage_class.from_buffer() + + if storage_class == torch.cuda._UntypedStorage: + with self.assertRaisesRegex(RuntimeError, r'Not available for CUDA storage'): + storage_class._new_with_weak_ptr() + + else: + with self.assertRaisesRegex(AttributeError, r'has no attribute'): + 
storage_class._new_with_weak_ptr() + + with self.assertRaisesRegex(RuntimeError, r'Not available for CUDA storage'): + storage_class._new_shared_filename(0, 0, 0) def test_storage_casts(self): storage = torch.IntStorage([-1, 0, 1, 2, 3, 4]) @@ -6489,6 +6568,11 @@ def test_print(self): self.assertEqual(x.__repr__(), str(x)) self.assertExpectedInline(str(x), '''tensor([2.3000+4.j, 7.0000+6.j])''') + # test complex half tensor + x = torch.tensor([1.25 + 4j, -7. + 6j], dtype=torch.chalf) + self.assertEqual(x.__repr__(), str(x)) + self.assertExpectedInline(str(x), '''tensor([ 1.2500+4.j, -7.0000+6.j], dtype=torch.complex32)''') + # test scientific notation for complex tensors x = torch.tensor([1e28 + 2j , -1e-28j]) self.assertEqual(x.__repr__(), str(x)) @@ -7066,6 +7150,14 @@ def test_fill_diagonal(self): e1.fill_diagonal_(v, wrap=True) self.assertEqual(e1, e2) + def test_setting_real_imag_to_a_number(self): + x = torch.randn(4, dtype=torch.cfloat) + x.real = 0 + x.imag = 0 + zeros = torch.zeros(4) + self.assertEqual(x.real, zeros) + self.assertEqual(x.imag, zeros) + def test_batch_norm_cpu_inference(self): # input nchw in (2,1,1,1), (2,2,2,2) inputs = [ @@ -7114,7 +7206,6 @@ def test_batch_norm_cpu_inference(self): # FIXME: move these meta tests to their own test suite/class or # distribute them among the appropriate test suites for their ops - @noarchTest def test_empty_meta(self): x = torch.empty(2 ** 20, 2 ** 20, device='meta') y = torch.empty(2 ** 20, device='meta') @@ -7122,7 +7213,10 @@ def test_empty_meta(self): self.assertEqual(z.size(), (2 ** 20, 2 ** 20)) self.assertRaises(RuntimeError, lambda: z[0][0].item()) - @noarchTest + def test_format_scalar_meta(self): + x = torch.empty((), device='meta') + self.assertEqual(format(x), repr(x)) + def test_upsample_nearest1d_meta(self): # TODO: this test should be triggered by test_nn.py but right # now meta is not enabled (and even if it was, we are probably @@ -7146,7 +7240,6 @@ def test_upsample_nearest1d_meta(self): self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) self.assertRaises(RuntimeError, lambda: z[0][0][0].item()) - @noarchTest def test_upsample_nearest2d_meta(self): # TODO: the out tests cannot be triggered by test_nn.py because # we don't actually do out= arguments for nn functions, so there @@ -7187,13 +7280,11 @@ def test_upsample_nearest2d_meta(self): """Expected out tensor to have device meta, but got cpu instead""" ) - @noarchTest def test_detach_meta(self): x = torch.empty(2, device='meta') # This used to segfault self.assertRaises(RuntimeError, lambda: x.detach().storage()) - @noarchTest def test_add_meta_scalar(self): # From https://github.com/pytorch/pytorch/issues/53815 x = torch.empty(2, device='meta') @@ -7228,28 +7319,39 @@ def test_normal_shape(self): self.assertEqual(torch.normal(tensor2145, tensor2345).size(), (2, 3, 4, 5)) # inputs are non-expandable tensors, but they have same number of elements - # TORCH_WARN_ONCE is used in torch.normal, only 1st assertEqual will show warn msg - if not warned: - self.assertWarnsRegex(UserWarning, "deprecated and the support will be removed", - lambda: self.assertEqual(torch.normal(tensor120, tensor2345).size(), (120,))) - warned = True - else: + with self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(120\) must match the size of " + r"tensor b \(5\) at non-singleton dimension 3"): self.assertEqual(torch.normal(tensor120, tensor2345).size(), (120,)) - self.assertEqual(torch.normal(tensor2345, tensor120).size(), (2, 3, 4, 5)) + with 
self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(5\) must match the size of " + r"tensor b \(120\) at non-singleton dimension 3"): + self.assertEqual(torch.normal(tensor2345, tensor120).size(), (2, 3, 4, 5)) # inputs are non-expandable tensors and they don't have same number of elements - with self.assertRaisesRegex(RuntimeError, "inconsistent tensor"): + with self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(5\) must match the size of " + r"tensor b \(4\) at non-singleton dimension 3"): torch.normal(tensor2345, tensor4) # output and inputs are size compatible self.assertEqual(torch.normal(tensor2345, tensor2345, out=output2345).size(), (2, 3, 4, 5)) # output and inputs are not size compatible - with self.assertRaisesRegex(RuntimeError, "inconsistent tensor"): - # inputs are expandable but have different broadcasted size than output - torch.normal(tensor2345, tensor2145, out=output345) - with self.assertRaisesRegex(RuntimeError, "inconsistent tensor"): - # inputs are not expandable but reshapeable, output size is not the same as mean + with self.assertWarnsRegex( + UserWarning, + "This behavior is deprecated, and in a future PyTorch " + "release outputs will not be resized unless they have " + "zero elements"): + self.assertEqual(torch.normal(tensor2345, tensor2145, out=output345).size(), (2, 3, 4, 5)) + with self.assertRaisesRegex( + RuntimeError, + r"The size of tensor a \(5\) must match the size of " + r"tensor b \(120\) at non-singleton dimension 3"): + # inputs are not expandable, output size is not the same as mean torch.normal(tensor2345, tensor120, out=output345) def test_tensoriterator_output_setup(self): @@ -7354,12 +7456,12 @@ def test_numel(self): # Verifies that (deep)copies of dtypes are the same objects def test_copy_dtypes(self): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): copied_dtype = copy.deepcopy(dtype) self.assertIs(dtype, copied_dtype) def test_dtype_is_signed(self): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.half): self.assertEqual(dtype.is_signed, torch.is_signed(torch.tensor(0, dtype=dtype))) self.assertRaisesRegex(RuntimeError, 'not supported for quantized', lambda: torch.quint8.is_signed) @@ -7474,6 +7576,12 @@ def test_copy_transpose(self): self.assertEqual(y[:, 0], range(100)) self.assertEqual(y[:, 40], range(4000, 4100)) + x = torch.arange(100 * 100).reshape(100, 100).to(dtype=torch.complex32).t() + y = torch.empty(100, 100, dtype=torch.complex32) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + # FIXME: Port to a more appropriate test suite def test_copy_broadcast(self): torch.zeros(5, 6).copy_(torch.zeros(6)) @@ -7486,7 +7594,7 @@ def test_copy_many_to_one(self): self.assertRaises(RuntimeError, lambda: torch.zeros(1, 6).expand(5, 6).copy_(torch.zeros(5, 6))) # FIXME: Port to a more appropriate test suite - def test_to(self): + def _test_to_with_layout(self, layout): def test_copy_behavior(t, non_blocking=False): self.assertIs(t, t.to(t, non_blocking=non_blocking)) self.assertIs(t, t.to(t.dtype, non_blocking=non_blocking)) @@ -7508,16 +7616,33 @@ def test_copy_behavior(t, non_blocking=False): self.assertIsNot(t, t.to(device, t.dtype, non_blocking=non_blocking, copy=True)) a = torch.tensor(5) + if layout == torch.sparse_csr: + a = torch.tensor([[0, 1, 2], [2, 0, 3]]).to_sparse_csr() test_copy_behavior(a) self.assertEqual(a.device, 
a.to('cpu').device) self.assertEqual(a.device, a.to('cpu', dtype=torch.float32).device) self.assertIs(torch.float32, a.to('cpu', dtype=torch.float32).dtype) self.assertEqual(a.device, a.to(torch.float32).device) self.assertIs(torch.float32, a.to(dtype=torch.float32).dtype) - self.assertEqual(a.data_ptr(), a.to('cpu').data_ptr()) - self.assertEqual(a.data_ptr(), a.to(dtype=a.dtype, device=a.device, copy=False).data_ptr()) - self.assertEqual(a.data_ptr(), a.to('cpu', copy=False).data_ptr()) - self.assertNotEqual(a.data_ptr(), a.to('cpu', copy=True).data_ptr()) + + def test_data_ptr(getter): + self.assertEqual(getter(a), getter(a.to('cpu'))) + self.assertEqual(getter(a), getter(a.to(dtype=a.dtype, device=a.device, copy=False))) + self.assertEqual(getter(a), getter(a.to('cpu', copy=False))) + self.assertNotEqual(getter(a), getter(a.to('cpu', copy=True))) + if layout == torch.sparse_csr: + # TODO: compressed sparse tensors currently don't support data_ptr. + # Exercising failure will allow us to widen coverage of this test once it does. + with self.assertRaisesRegex(RuntimeError, "Cannot access data pointer of Tensor that doesn't have storage"): + a.data_ptr() + # While compressed sparse tensors don't have a concept of data_ptr + # the underlying tensors do. The implementation of to appropriately forwards + # the call to the components, which is what we're test here. + test_data_ptr(lambda a: a.values().data_ptr()) + test_data_ptr(lambda a: a.crow_indices().data_ptr()) + test_data_ptr(lambda a: a.col_indices().data_ptr()) + else: + test_data_ptr(lambda a: a.data_ptr()) if torch.cuda.is_available(): for non_blocking in [True, False]: @@ -7532,6 +7657,10 @@ def test_copy_behavior(t, non_blocking=False): self.assertIs(torch.int32, b.to(dtype=torch.int32).dtype) self.assertEqual(b.device, b.to(dtype=torch.int32).device) + def test_to(self): + self._test_to_with_layout(torch.strided) + self._test_to_with_layout(torch.sparse_csr) + # FIXME: describe this test def test_as_subclass(self): class SubTensor(torch.Tensor): @@ -7802,6 +7931,22 @@ def test_type_conversion_via_dtype_name(self): self.assertEqual(cdouble.dtype, torch.complex128) self.assertEqual(cdouble.real, x.double()) self.assertEqual(cdouble.imag, torch.zeros_like(cdouble.imag)) + chalf = x.chalf() + self.assertEqual(chalf.dtype, torch.complex32) + self.assertEqual(chalf.real, x.half()) + self.assertEqual(chalf.imag, torch.zeros_like(chalf.imag)) + + def test_type_alias(self): + type_alias_map = {torch.float64: torch.double, + torch.float32: torch.float, + torch.int32: torch.int, + torch.int64: torch.long, + torch.int16: torch.short, + torch.float16: torch.half, + torch.complex32: torch.chalf, + torch.complex64: torch.cfloat} + for dtype, alias in type_alias_map.items(): + self.assertIs(alias, dtype) # FIXME: Describe this test def test_doc_template(self) -> None: @@ -8153,8 +8298,8 @@ def invert_perm(p): def generate_inputs(num_batches): # transposed tensors for perm1, perm2 in itertools.product(itertools.permutations((0, 1, 2)), repeat=2): - b1 = make_tensor((num_batches, M, N), device, dtype, low=-1, high=1) - b2 = make_tensor((num_batches, N, O), device, dtype, low=-1, high=1) + b1 = make_tensor((num_batches, M, N), dtype=dtype, device=device, low=-1, high=1) + b2 = make_tensor((num_batches, N, O), dtype=dtype, device=device, low=-1, high=1) b1 = b1.permute(perm1).contiguous().permute(invert_perm(perm1)) b2 = b2.permute(perm2).contiguous().permute(invert_perm(perm2)) yield b1, b2 @@ -8162,8 +8307,8 @@ def generate_inputs(num_batches): 
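# Editorial sketch (not part of the patch) of the dtype aliases and the chalf()
# conversion exercised a little above: aliases are the very same dtype objects,
# and chalf() produces a torch.complex32 tensor whose real part matches half().
import torch

assert torch.chalf is torch.complex32
assert torch.cfloat is torch.complex64
assert torch.half is torch.float16

x = torch.randn(4)
c = x.chalf()
assert c.dtype is torch.complex32
torch.testing.assert_close(c.real, x.half())
torch.testing.assert_close(c.imag, torch.zeros_like(c.imag))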
for b1, b2, b3, b4, b5, b6 in itertools.product((True, False), repeat=6): shape1 = (num_batches if b1 else 1, M if b2 else 1, N if b3 else 1) shape2 = (num_batches if b4 else 1, N if b5 else 1, O if b6 else 1) - b1 = make_tensor(shape1, device, dtype, low=-1, high=1).expand(num_batches, M, N) - b2 = make_tensor(shape2, device, dtype, low=-1, high=1).expand(num_batches, N, O) + b1 = make_tensor(shape1, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, M, N) + b2 = make_tensor(shape2, dtype=dtype, device=device, low=-1, high=1).expand(num_batches, N, O) yield b1, b2 # zero-sized tensors for z1, z2, z3, z4 in itertools.product((True, False), repeat=4): diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 01e96a3fe112..8c82b43ecba6 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -7,15 +7,14 @@ import torch from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, - TEST_NUMPY, torch_to_numpy_dtype_dict) + TEST_NUMPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyNativeDeviceTypes, - dtypes, dtypesIfCUDA, onlyCPU, expectedFailureMeta) + dtypes, onlyCPU, expectedFailureMeta, skipMeta) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes + all_types_and_complex_and, get_all_math_dtypes, floating_types, get_all_dtypes ) -if TEST_NUMPY: - import numpy as np +import numpy as np # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -184,12 +183,14 @@ def test_bfloat16(self, device): self.assertEqual(bf + scalar, scalar + bf) # with tensor - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): t = torch.tensor(1, dtype=dtype, device=device) self.assertEqual(bf + t, t + bf) if dtype in (torch.float16, torch.float32, torch.float64, torch.cfloat, torch.cdouble): # Handles bfloat16 x float16 -> float32 promotion expected_dtype = dtype if dtype != torch.half else torch.float32 + elif dtype is torch.chalf: + expected_dtype = torch.cfloat elif dtype in (torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64, torch.bfloat16): expected_dtype = torch.bfloat16 @@ -200,6 +201,39 @@ def test_bfloat16(self, device): self.assertEqual(torch.promote_types(torch.bfloat16, dtype), expected_dtype) self.assertEqual((bf + t).dtype, expected_dtype) + @onlyNativeDeviceTypes + def test_complex_half(self, device): + # with scalar + chalf = torch.tensor(5.5, dtype=torch.chalf, device=device) + for scalar in (2.2, 5, 100000): # chalf + 100000 is inf + self.assertEqual((chalf * scalar).dtype, torch.chalf) + self.assertEqual(scalar * chalf, chalf * scalar) + + for scalar in (complex(1, 1), complex(-2, 0), complex(0, -3)): + self.assertEqual((chalf * scalar).dtype, torch.chalf) + self.assertEqual(chalf * scalar, scalar * chalf) + + # with tensor + dtypes = all_types_and_complex_and(torch.chalf, torch.half, torch.bfloat16, torch.bool) + for dtype in dtypes: + t = torch.tensor(1, dtype=dtype, device=device) + self.assertEqual(chalf * t, t * chalf) + if dtype in (torch.float16, torch.chalf): + expected_dtype = torch.chalf + elif dtype in (torch.float, torch.double, torch.bfloat16): + expected_dtype = torch.cdouble if dtype is torch.double else torch.cfloat + elif dtype in (torch.cfloat, 
torch.cdouble): + expected_dtype = dtype + elif dtype in (torch.bool, torch.uint8, + torch.int8, torch.int16, torch.int32, torch.int64): + expected_dtype = torch.chalf + else: + raise AssertionError(f'Missing dtype {dtype} not tested.') + + self.assertEqual(torch.promote_types(dtype, torch.chalf), expected_dtype) + self.assertEqual(torch.promote_types(torch.chalf, dtype), expected_dtype) + self.assertEqual((chalf * t).dtype, expected_dtype) + @float_double_default_dtype def test_alternate_result(self, device): f = torch.tensor([1, 1, 1, 1], dtype=torch.float, device=device) @@ -340,7 +374,8 @@ def test_create_bool_tensors(self, device): # this seems like odd behavior but ints also create float tensors, numpy doesn't have this function. self.assertEqual(torch.scalar_tensor(False, device=device), torch.tensor(0., device=device)) - @dtypes(*itertools.product(get_all_dtypes(), get_all_dtypes())) + @dtypes(*itertools.product(all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool), + all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))) def test_result_type(self, device, dtypes): "Test result_type for tensor vs tensor and scalar vs scalar." @@ -520,12 +555,16 @@ def test_complex_assertraises(self, device): dict(name="ne", compare_op=lambda x, y: x != y, ), ] for op in comparison_ops: - for dt1 in get_all_math_dtypes(device): - for dt2 in get_all_math_dtypes(device): - if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"): - u = torch.tensor([1], dtype=dt1, device=device) - v = torch.tensor([2], dtype=dt2, device=device) - self.assertRaises(RuntimeError, lambda: torch.tensor([op["compare_op"](u, v)], dtype=torch.bool)) + is_cuda = torch.device(device).type == 'cuda' + dtypes = get_all_dtypes(include_half=is_cuda, + include_bfloat16=False, include_bool=False, + include_complex32=True) + + for dt1, dt2 in itertools.product(dtypes, dtypes): + if (dt1.is_complex or dt2.is_complex) and not (op["name"] == "eq" or op["name"] == "ne"): + u = torch.tensor([1], dtype=dt1, device=device) + v = torch.tensor([2], dtype=dt2, device=device) + self.assertRaises(RuntimeError, lambda: torch.tensor([op["compare_op"](u, v)], dtype=torch.bool)) @float_double_default_dtype def test_lt_with_type_promotion(self, device): @@ -562,7 +601,7 @@ def test_promote_types(self, device): @float_double_default_dtype def test_promote_self(self, device): - for dtype in get_all_dtypes(): + for dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf, torch.bool): self.assertEqual(torch.promote_types(dtype, dtype), dtype) @expectedFailureMeta @@ -811,8 +850,8 @@ def test_integer_addcdiv_deprecated(self, device, dtype): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @float_double_default_dtype @onlyCPU - @dtypes(*list(itertools.product(torch_to_numpy_dtype_dict.keys(), - torch_to_numpy_dtype_dict.keys()))) + @dtypes(*list(itertools.product(set(numpy_to_torch_dtype_dict.values()), + set(numpy_to_torch_dtype_dict.values())))) def test_numpy_array_binary_ufunc_promotion(self, device, dtypes): import operator np_type = torch_to_numpy_dtype_dict[dtypes[0]] @@ -880,7 +919,7 @@ def test_numpy_array_binary_ufunc_promotion(self, device, dtypes): @onlyNativeDeviceTypes def test_cat_different_dtypes(self, device): - dtypes = get_all_dtypes(include_bfloat16=False) + dtypes = all_types_and_complex_and(torch.half, torch.bool) for x_dtype, y_dtype in itertools.product(dtypes, dtypes): x_vals, y_vals = [1, 2, 3], [4, 5, 6] @@ -899,7 +938,7 @@ def 
test_cat_different_dtypes(self, device): @onlyNativeDeviceTypes def test_cat_out_different_dtypes(self, device): - dtypes = get_all_dtypes(include_bfloat16=False, include_bool=False) + dtypes = all_types_and_complex_and(torch.half) for x_dtype, y_dtype, out_dtype in itertools.product(dtypes, dtypes, dtypes): out = torch.zeros(6, device=device, dtype=out_dtype) x = torch.tensor([1, 2, 3], device=device, dtype=x_dtype) @@ -937,7 +976,11 @@ def test_unary_op_out_casting(self, device, dtypes): elif op in real_only_ops and dtypes[0].is_complex: with self.assertRaises(RuntimeError): op(t, out=out) - elif op in float_only_ops and (not dtypes[0].is_floating_point and not dtypes[0].is_complex): + elif ( + op in float_only_ops + and (not dtypes[0].is_floating_point and not dtypes[0].is_complex) + and device != "meta" + ): with self.assertRaises(RuntimeError): op(t, out=out) else: @@ -947,6 +990,7 @@ def test_unary_op_out_casting(self, device, dtypes): # Verifies that the out= argument doesn't affect the computation, that # is, out = op(...) and op(..., out=out) produce the same result. @onlyNativeDeviceTypes + @skipMeta def test_computation_ignores_out(self, device): t = torch.tensor(33000, dtype=torch.float16, device=device) out = torch.empty(0, dtype=torch.float64, device=device) @@ -966,37 +1010,70 @@ def test_computation_ignores_out(self, device): self.assertEqual(result, a - b, exact_dtype=False) self.assertNotEqual(result, a.double() - b, exact_dtype=False) - @dtypesIfCUDA(*itertools.product(get_all_dtypes(include_bfloat16=False, include_complex=False), - get_all_dtypes(include_bfloat16=False, include_complex=False))) - @dtypes(*itertools.product(get_all_dtypes(include_half=False, include_bfloat16=False, - include_complex=False), - get_all_dtypes(include_half=False, include_bfloat16=False, - include_complex=False))) - def test_atan2_type_promotion(self, device, dtypes): - dtype1, dtype2 = dtypes - default_float = torch.get_default_dtype() - - def is_int(dtype): - return dtype in get_all_int_dtypes() + [torch.bool] - - def is_float(dtype): - return dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False) - - def get_binary_float_result_type(x, y): - dtype1 = x.dtype - dtype2 = y.dtype - if is_float(dtype1) and is_float(dtype2): - return torch.result_type(x, y) - elif is_float(dtype1) and is_int(dtype2): - return dtype1 - elif is_int(dtype1) and is_float(dtype2): - return dtype2 - elif is_int(dtype1) and is_int(dtype2): - return default_float - - x = torch.tensor(1, dtype=dtype1, device=device) - y = torch.tensor(2, dtype=dtype2, device=device) - self.assertEqual(get_binary_float_result_type(x, y), torch.atan2(x, y).dtype) + @onlyNativeDeviceTypes + @dtypes(*itertools.product((torch.bool, torch.int, torch.float, torch.double), repeat=3)) + def test_clamp_type_promotion(self, device, dtypes): + dtype0, dtype1, dtype2 = dtypes + S = 4 + + def make_tensor(size, dtype): + if dtype == torch.bool: + return torch.randint(2, size, dtype=dtype, device=device) + elif dtype == torch.int: + return torch.randint(10, size, dtype=dtype, device=device) + else: + return torch.randn(size, dtype=dtype, device=device) + min_t = make_tensor((S,), dtype1) + max_t = make_tensor((S,), dtype2) + mins = (min_t, min_t[0], min_t[0].item()) + maxs = (max_t, max_t[0], max_t[0].item()) + inp = make_tensor((S,), dtype0) + for min_v, max_v in itertools.product(mins, maxs): + if type(max_v) != type(min_v): + continue + if isinstance(min_v, torch.Tensor) and min_v.ndim == 0 and max_v.ndim == 0: + continue # 0d 
tensors go to scalar overload, and it's tested separately + + def expected_type(inp, max, min): + arg1, arg2 = max, min + if isinstance(max, torch.Tensor) and max.ndim == 0: + # first do a maybe dimensional boundary + arg1, arg2 = min, max + exp_type = torch.result_type(inp, arg1) + inp_new = torch.empty_like(inp, dtype=exp_type) + return torch.result_type(inp_new, arg2) + exp_type = expected_type(inp, min_v, max_v) + if exp_type != torch.bool: + actual = torch.clamp(inp, min_v, max_v) + inps = list(map(lambda x: x.to(exp_type) if isinstance(x, torch.Tensor) else x, + (inp, min_v, max_v))) + expected = torch.clamp(inps[0], inps[1], inps[2]) + self.assertEqual(actual, expected) + if inp.dtype in floating_types() or exp_type == inp.dtype: + actual = torch.clamp_(inp, min_v, max_v) + self.assertEqual(actual, expected, exact_dtype=False) + for val in mins: + def expected_type(inp, val): + return torch.result_type(inp, val) + exp_type = expected_type(inp, val) + if exp_type != torch.bool: + actual = torch.clamp_min(inp, val) + inps = list(map(lambda x: x.to(exp_type) if isinstance(x, torch.Tensor) else x, + (inp, val))) + expected = torch.clamp_min(inps[0], inps[1]) + self.assertEqual(actual.dtype, exp_type) + self.assertEqual(actual, expected) + if inp.dtype == exp_type: + actual = torch.clamp_min_(inp, val) + self.assertEqual(actual, expected) + actual = torch.clamp_max(inp, val) + expected = torch.clamp_max(inps[0], inps[1]) + self.assertEqual(actual, expected) + if inp.dtype in floating_types() or exp_type == inp.dtype: + actual = torch.clamp_max_(inp, val) + self.assertEqual(actual, expected, exact_dtype=False) + + instantiate_device_type_tests(TestTypePromotion, globals()) diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 2a113799fff6..3c443dd5bc52 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -11,18 +11,47 @@ from torch._six import inf, nan from torch.testing._internal.common_utils import ( - TestCase, run_tests, torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, - suppress_warnings, TEST_SCIPY, slowTest, skipIfNoSciPy, IS_WINDOWS, gradcheck) + TestCase, + run_tests, + torch_to_numpy_dtype_dict, + numpy_to_torch_dtype_dict, + suppress_warnings, + TEST_SCIPY, + slowTest, + skipIfNoSciPy, + IS_WINDOWS, + gradcheck, + TEST_WITH_ASAN, +) from torch.testing._internal.common_methods_invocations import ( - unary_ufuncs, _NOTHING) + unary_ufuncs, + generate_elementwise_unary_tensors, + _NOTHING, + generate_elementwise_unary_small_value_tensors, + generate_elementwise_unary_large_value_tensors, + generate_elementwise_unary_extremal_value_tensors, +) from torch.testing._internal.common_device_type import ( - instantiate_device_type_tests, ops, dtypes, onlyCPU, onlyNativeDeviceTypes, - onlyCUDA, dtypesIfCUDA, precisionOverride, skipCUDAIfRocm, dtypesIfCPU, - OpDTypes) + instantiate_device_type_tests, + ops, + dtypes, + onlyCPU, + onlyNativeDeviceTypes, + onlyCUDA, + dtypesIfCUDA, + precisionOverride, + dtypesIfCPU, +) + from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( - floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, get_all_dtypes, get_all_math_dtypes, - get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes + floating_types_and, + all_types_and_complex_and, + integral_types_and, + get_all_math_dtypes, + complex_types, + all_types_and, + floating_and_complex_types_and, ) if TEST_SCIPY: @@ -45,140 +74,8 @@ # (https://numpy.org/doc/1.18/reference/ufuncs.html) for more 
details # about the concept of ufuncs. -# Functions tested here: -# - -# Interesting values and extremal values for different dtypes -_unsigned_int_vals = (0, 1, 55, 127) -_int_vals = (0, -1, 1, -55, 55, -127, 127, -128, 128) -_large_int_vals = (-1113, 1113, -10701, 10701) -_float_vals = (0., - -.001, .001, - -.25, .25, - -1., 1., - -math.pi / 2, math.pi / 2, - -math.pi + .00001, math.pi - .00001, - -math.pi, math.pi, - -math.pi - .00001, math.pi + .00001) -_large_float16_vals = (-501, 501, - -1001.2, 1001.2, - -13437.7, 13437.7) -_large_float_vals = _large_float16_vals + (-4988429.2, 4988429.2, -1e20, 1e20) -_float_extremals = (float('inf'), float('-inf'), float('nan')) -_medium_length = 812 -_large_size = (1029, 917) - - -# Replace values satisfying condition with a safe value. This is used to block -# out values the could cause singularity like tan(pi/2) -def replace_values_in_tensor(tensor, condition, safe_value): - mask = condition(tensor) - tensor.masked_fill_(mask, safe_value) - - -# Returns generator of tensors of different sizes filled with values in domain -# and with intested region filled with `vals`. This will help test different code -# paths for the given vals -# `filter_` can be either None or a tuple of (condition, safe_value). When not None -# values satisfying `condition`` will be replaced with `safe_value` in the generated -# tensor. This is useful to avoid singularities when generating inputs for tests, such -# as tan(pi/2) -def generate_tensors_from_vals(vals, device, dtype, domain, filter_): - offset = 63 - - assert _large_size[1] > (_medium_length + offset) # large tensor should be large enough - assert len(vals) < _medium_length # medium tensor should contain all vals - assert _medium_length % 4 == 0 # ensure vectorized code coverage - - if not dtype.is_complex: - # Filter values based on Operators domain. - # Note: Complex numbers don't belong to ordered field, - # so we don't filter for them. - if domain[0] is not None: - vals = list(filter(lambda x: x >= domain[0], vals)) - if domain[1] is not None: - vals = list(filter(lambda x: x < domain[1], vals)) - - if filter_ is not None: - condition, safe_value = filter_ - - # Constructs the large tensor containing vals - large_tensor = make_tensor(_large_size, device=device, dtype=dtype, low=domain[0], high=domain[1]) - - # Inserts the vals at an odd place - large_tensor[57][offset:offset + len(vals)] = torch.tensor(vals, device=device, dtype=dtype) - - if filter_ is not None: - replace_values_in_tensor(large_tensor, condition, safe_value) - - # Takes a medium sized copy of the large tensor containing vals - medium_tensor = large_tensor[57][offset:offset + _medium_length] - - if filter_ is not None: - replace_values_in_tensor(medium_tensor, condition, safe_value) - - # Constructs scalar tensors - scalar_tensors = (t.squeeze() for t in torch.split(medium_tensor, 1)) - - # Tensors with no elements - empty_sizes = ((0,), (0, 3, 3), (1, 0, 5), (6, 0, 0, 0), (3, 0, 1, 0)) - empty_tensors = (torch.empty(size, device=device, dtype=dtype) for size in empty_sizes) - - return chain(empty_tensors, scalar_tensors, (medium_tensor,), (large_tensor,)) - - -# [Note generate_numeric_tensors, generate_numeric_tensors_hard, -# and generate_numeric_tensors_extremal] -# -# Returns an iterable of contiguous tensors with the same storage on the requested -# device and with the requested dtype. 
-# -# This function is intended to test the non-vectorized and vectorized code -# paths of unary functions, as well as their handling of odd tensor -# sizes (like zero-dim tensors and tensors with zero elements). -# -# The iterable will include an empty tensor, tensors with no elements, -# zero dim (scalar) tensors, small 1D tensors, a medium 1D tensor, and -# a large 2D tensor. -# -# These tensors will include interesting values. The generate_numeric_tensors_hard -# tests larger values (>500) and generate_numeric_tensors_extremal tests extremal -# values like -inf, inf, and nan. -# -# The randomly generated values can be restricted by the domain -# argument. -def generate_numeric_tensors(device, dtype, *, - domain=(None, None), - filter_=None): - # Special-cases bool - if dtype is torch.bool: - tensors = (torch.empty(0, device=device, dtype=torch.bool), - torch.tensor(True, device=device), - torch.tensor(False, device=device), - torch.tensor((True, False), device=device), - make_tensor((_medium_length,), device=device, dtype=dtype, low=None, high=None), - make_tensor(_large_size, device=device, dtype=dtype, low=None, high=None)) - return tensors - - # Acquires dtype-specific vals - if dtype.is_floating_point or dtype.is_complex: - vals = _float_vals - - # Converts float -> complex vals if dtype is complex - if dtype.is_complex: - vals = tuple(complex(x, y) for x, y in product(vals, vals)) - elif dtype is torch.uint8: - vals = _unsigned_int_vals - else: # dtypes is a signed integer type - assert dtype in (torch.int8, torch.int16, torch.int32, torch.int64) - vals = _int_vals - - return generate_tensors_from_vals(vals, device, dtype, domain, filter_) - -def generate_numeric_tensors_hard(device, dtype, *, - domain=(None, None), - filter_=None): +def generate_numeric_tensors_hard(device, dtype, *, domain=(None, None), filter_=None): is_signed_integral = dtype in (torch.int8, torch.int16, torch.int32, torch.int64) if not (dtype.is_floating_point or dtype.is_complex or is_signed_integral): return () @@ -190,18 +87,23 @@ def generate_numeric_tensors_hard(device, dtype, *, else: vals = _large_float_vals elif dtype.is_complex: - vals = tuple(complex(x, y) for x, y in chain(product(_large_float_vals, _large_float_vals), - product(_float_vals, _large_float_vals), - product(_large_float_vals, _float_vals))) + vals = tuple( + complex(x, y) + for x, y in chain( + product(_large_float_vals, _large_float_vals), + product(_float_vals, _large_float_vals), + product(_large_float_vals, _float_vals), + ) + ) else: vals = _large_int_vals return generate_tensors_from_vals(vals, device, dtype, domain, filter_) -def generate_numeric_tensors_extremal(device, dtype, *, - domain=(None, None), - filter_=None): +def generate_numeric_tensors_extremal( + device, dtype, *, domain=(None, None), filter_=None +): if not (dtype.is_floating_point or dtype.is_complex): return () @@ -209,9 +111,14 @@ def generate_numeric_tensors_extremal(device, dtype, *, if dtype.is_floating_point: vals = _float_extremals elif dtype.is_complex: - vals = tuple(complex(x, y) for x, y in chain(product(_float_extremals, _float_extremals), - product(_float_vals, _float_extremals), - product(_float_extremals, _float_vals))) + vals = tuple( + complex(x, y) + for x, y in chain( + product(_float_extremals, _float_extremals), + product(_float_vals, _float_extremals), + product(_float_extremals, _float_vals), + ) + ) return generate_tensors_from_vals(vals, device, dtype, domain, filter_) @@ -221,8 +128,10 @@ def generate_numeric_tensors_extremal(device, 
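# Editorial sketch (with assumed stand-in tuples) of the chain/product pattern
# used just below to build complex test values: pairing small and large
# magnitudes ensures the real part, the imaginary part, or both are stressed
# with large inputs.
from itertools import chain, product

small = (0.0, -0.25, 0.25, 1.0)          # stand-in for _float_vals
large = (-1001.2, 1001.2, -1e20, 1e20)   # stand-in for _large_float_vals

complex_vals = tuple(
    complex(x, y)
    for x, y in chain(
        product(large, large),   # both components large
        product(small, large),   # only the imaginary component large
        product(large, small),   # only the real component large
    )
)
assert all(abs(c.real) > 500 or abs(c.imag) > 500 for c in complex_vals)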
dtype, *, class TestUnaryUfuncs(TestCase): exact_dtype = True - @ops([_fn for _fn in unary_ufuncs if _fn.domain != (None, None)], - allowed_dtypes=floating_types_and(torch.bfloat16, torch.half)) + @ops( + [_fn for _fn in unary_ufuncs if _fn.domain != (None, None)], + allowed_dtypes=floating_types_and(torch.bfloat16, torch.half), + ) def test_float_domains(self, device, dtype, op): eps = (1e-5, 1e-3, 1e-1, 1, 2, 10, 20, 50, 100) @@ -240,11 +149,14 @@ def test_float_domains(self, device, dtype, op): continue result = op(lower_tensor) - self.assertEqual(result.item(), float('nan'), - msg=("input of {0} outside lower domain boundary" - " {1} produced {2}, not nan!").format(lower_tensor.item(), - low, - result.item())) + self.assertEqual( + result.item(), + float("nan"), + msg=( + "input of {0} outside lower domain boundary" + " {1} produced {2}, not nan!" + ).format(lower_tensor.item(), low, result.item()), + ) if high is not None: high_tensor = torch.tensor(high, device=device, dtype=dtype) @@ -256,15 +168,20 @@ def test_float_domains(self, device, dtype, op): continue result = op(higher_tensor) - self.assertEqual(result.item(), float('nan'), - msg=("input of {0} outside upper domain boundary" - " {1} produced {2}, not nan!").format(higher_tensor.item(), - high, - result.item())) + self.assertEqual( + result.item(), + float("nan"), + msg=( + "input of {0} outside upper domain boundary" + " {1} produced {2}, not nan!" + ).format(higher_tensor.item(), high, result.item()), + ) # Helper for comparing torch tensors and numpy arrays # TODO: should this or assertEqual also validate that strides are equal? - def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs): + def assertEqualHelper( + self, actual, expected, msg, *, dtype, exact_dtype=True, **kwargs + ): assert isinstance(actual, torch.Tensor) # Some NumPy functions return scalars, not arrays @@ -273,46 +190,96 @@ def assertEqualHelper(self, actual, expected, msg, *, dtype, exact_dtype=True, * elif isinstance(expected, np.ndarray): # Handles exact dtype comparisons between arrays and tensors if exact_dtype: - # Allows array dtype to be float32 when comparing with bfloat16 tensors - # since NumPy doesn't support the bfloat16 dtype - # Also ops like scipy.special.erf, scipy.special.erfc, etc, promote float16 - # to float32 - if expected.dtype == np.float32: - assert actual.dtype in (torch.float16, torch.bfloat16, torch.float32) - else: - assert expected.dtype == torch_to_numpy_dtype_dict[actual.dtype] - - self.assertEqual(actual, - torch.from_numpy(expected).to(actual.dtype), - msg, - exact_device=False, - **kwargs) + if ( + actual.dtype is torch.bfloat16 + or expected.dtype != torch_to_numpy_dtype_dict[actual.dtype] + ): + # Allows array dtype to be float32 when comparing with bfloat16 tensors + # since NumPy doesn't support the bfloat16 dtype + # Also ops like scipy.special.erf, scipy.special.erfc, etc, promote float16 + # to float32 + if expected.dtype == np.float32: + assert actual.dtype in ( + torch.float16, + torch.bfloat16, + torch.float32, + ) + elif expected.dtype == np.float64: + assert actual.dtype in ( + torch.float16, + torch.bfloat16, + torch.float32, + torch.float64, + ) + else: + self.fail( + "Expected dtype {0} but got {1}!".format( + expected.dtype, actual.dtype + ) + ) + + self.assertEqual( + actual, + torch.from_numpy(expected).to(actual.dtype), + msg, + exact_device=False, + **kwargs + ) else: self.assertEqual(actual, expected, msg, exact_device=False, **kwargs) # Tests that the function and 
its (array-accepting) reference produce the same # values on given tensors def _test_reference_numerics(self, dtype, op, tensors, equal_nan=True): - def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=True): - if not torch.can_cast(numpy_to_torch_dtype_dict[expected.dtype.type], dtype): + def _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan=True + ): + if not torch.can_cast( + numpy_to_torch_dtype_dict[expected.dtype.type], dtype + ): exact_dtype = False if dtype in [torch.uint8, torch.int8, torch.bool]: # NOTE: For these dtypes, PyTorch computes in the default scalar type (float) # while NumPy computes in float16 - self.assertEqualHelper(actual, expected, msg, dtype=dtype, - exact_dtype=exact_dtype, rtol=1e-3, atol=1e-2) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + exact_dtype=exact_dtype, + rtol=1e-3, + atol=1e-2, + ) elif dtype is torch.bfloat16: # Ref: https://github.com/pytorch/pytorch/blob/master/torch/testing/_internal/common_utils.py#L1149 - self.assertEqualHelper(actual, expected, msg, dtype=dtype, - exact_dtype=exact_dtype, rtol=16e-3, atol=1e-5) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + exact_dtype=exact_dtype, + rtol=16e-3, + atol=1e-5, + ) + else: - self.assertEqualHelper(actual, expected, msg, dtype=dtype, equal_nan=equal_nan, exact_dtype=exact_dtype) + self.assertEqualHelper( + actual, + expected, + msg, + dtype=dtype, + equal_nan=equal_nan, + exact_dtype=exact_dtype, + ) for t in tensors: + t = t.input torch_kwargs, numpy_kwargs = op.sample_kwargs(t.device, dtype, t) if dtype is torch.bfloat16: a = t.cpu().to(torch.float32).numpy() + elif dtype is torch.complex32: + a = t.cpu().to(torch.complex64).numpy() else: a = t.cpu().numpy() @@ -321,15 +288,19 @@ def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=Tru # Crafts a custom error message for smaller, printable tensors if t.numel() < 10: - msg = ("Failed to produce expected results! Input tensor was" - " {0}, torch result is {1}, and reference result is" - " {2}.").format(t, actual, expected) + msg = ( + "Failed to produce expected results! Input tensor was" + " {0}, torch result is {1}, and reference result is" + " {2}." + ).format(t, actual, expected) else: msg = None exact_dtype = True if isinstance(actual, torch.Tensor): - _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan) + _helper_reference_numerics( + expected, actual, msg, exact_dtype, equal_nan + ) else: for x, y in zip(expected, actual): # testing multi-outputs results @@ -339,58 +310,72 @@ def _helper_reference_numerics(expected, actual, msg, exact_dtype, equal_nan=Tru # values on a range of tensors, including empty tensors, scalar tensors, # 1D tensors and a large 2D tensor with interesting and extremal values # and noncontiguities. 
+ @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @suppress_warnings @ops(reference_filtered_ops) def test_reference_numerics_normal(self, device, dtype, op): - tensors = generate_numeric_tensors(device, dtype, - domain=op.domain, - filter_=op.reference_numerics_filter) + tensors = generate_elementwise_unary_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) self._test_reference_numerics(dtype, op, tensors) + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @suppress_warnings - @ops(reference_filtered_ops, allowed_dtypes=floating_and_complex_types_and( - torch.bfloat16, torch.half, torch.int8, torch.int16, torch.int32, torch.int64 - )) - def test_reference_numerics_hard(self, device, dtype, op): - if not op.handles_large_floats: - raise self.skipTest("This op does not handle large values") - - tensors = generate_numeric_tensors_hard(device, dtype, - domain=op.domain) + @ops(reference_filtered_ops) + def test_reference_numerics_small(self, device, dtype, op): + if dtype in (torch.bool,): + raise self.skipTest("bool has no small values") + + tensors = generate_elementwise_unary_small_value_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) self._test_reference_numerics(dtype, op, tensors) + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") @suppress_warnings - @ops(reference_filtered_ops, - allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.half)) - def test_reference_numerics_extremal(self, device, dtype, op): - handles_extremals = (op.handles_complex_extremals if - dtype in (torch.cfloat, torch.cdouble) else op.handles_extremals) - if not handles_extremals: - raise self.skipTest("This op does not handle extremal values") + @ops(reference_filtered_ops) + def test_reference_numerics_large(self, device, dtype, op): + if dtype in (torch.bool, torch.uint8, torch.int8): + raise self.skipTest("bool, uint8, and int8 dtypes have no large values") - tensors = generate_numeric_tensors_extremal(device, dtype, - domain=op.domain) + tensors = generate_elementwise_unary_large_value_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) + self._test_reference_numerics(dtype, op, tensors) + @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") + @suppress_warnings + @ops( + reference_filtered_ops, + allowed_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.half), + ) + def test_reference_numerics_extremal(self, device, dtype, op): + tensors = generate_elementwise_unary_extremal_value_tensors( + op, device=device, dtype=dtype, requires_grad=False + ) self._test_reference_numerics(dtype, op, tensors) # Tests for testing (non)contiguity consistency - @ops(unary_ufuncs) def test_contig_vs_every_other(self, device, dtype, op): - contig = make_tensor((1026,), device=device, dtype=dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (1026,), device=device, dtype=dtype, low=op.domain[0], high=op.domain[1] + ) non_contig = contig[::2] self.assertTrue(contig.is_contiguous()) self.assertFalse(non_contig.is_contiguous()) torch_kwargs, _ = op.sample_kwargs(device, dtype, non_contig) - self.assertEqual(op(contig, **torch_kwargs)[::2], op(non_contig, **torch_kwargs)) + self.assertEqual( + op(contig, **torch_kwargs)[::2], op(non_contig, **torch_kwargs) + ) @ops(unary_ufuncs) def test_contig_vs_transposed(self, device, dtype, op): - contig = make_tensor((789, 357), device=device, dtype=dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (789, 357), device=device, dtype=dtype, low=op.domain[0], 
high=op.domain[1] + ) non_contig = contig.T self.assertTrue(contig.is_contiguous()) @@ -403,8 +388,9 @@ def test_contig_vs_transposed(self, device, dtype, op): def test_non_contig(self, device, dtype, op): shapes = [(5, 7), (1024,)] for shape in shapes: - contig = make_tensor(shape, device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + shape, dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) non_contig = torch.empty(shape + (2,), device=device, dtype=dtype)[..., 0] non_contig.copy_(contig) @@ -416,8 +402,13 @@ def test_non_contig(self, device, dtype, op): @ops(unary_ufuncs) def test_non_contig_index(self, device, dtype, op): - contig = make_tensor((2, 2, 1, 2), device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (2, 2, 1, 2), + dtype=dtype, + device=device, + low=op.domain[0], + high=op.domain[1], + ) non_contig = contig[:, 1, ...] contig = non_contig.contiguous() @@ -431,8 +422,9 @@ def test_non_contig_index(self, device, dtype, op): def test_non_contig_expand(self, device, dtype, op): shapes = [(1, 3), (1, 7), (5, 7)] for shape in shapes: - contig = make_tensor(shape, device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + shape, dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) non_contig = contig.clone().expand(3, -1, -1) self.assertTrue(contig.is_contiguous()) @@ -442,13 +434,15 @@ def test_non_contig_expand(self, device, dtype, op): contig = op(contig, **torch_kwargs) non_contig = op(non_contig, **torch_kwargs) for i in range(3): - self.assertEqual(contig, non_contig[i], - msg='non-contiguous expand[' + str(i) + ']') + self.assertEqual( + contig, non_contig[i], msg="non-contiguous expand[" + str(i) + "]" + ) @ops(unary_ufuncs) def test_contig_size1(self, device, dtype, op): - contig = make_tensor((5, 100), device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (5, 100), dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) contig = contig[:1, :50] contig2 = torch.empty(contig.size(), device=device, dtype=dtype) contig2.copy_(contig) @@ -461,8 +455,13 @@ def test_contig_size1(self, device, dtype, op): @ops(unary_ufuncs) def test_contig_size1_large_dim(self, device, dtype, op): - contig = make_tensor((5, 2, 3, 1, 4, 5, 3, 2, 1, 2, 3, 4), device, dtype, - low=op.domain[0], high=op.domain[1]) + contig = make_tensor( + (5, 2, 3, 1, 4, 5, 3, 2, 1, 2, 3, 4), + dtype=dtype, + device=device, + low=op.domain[0], + high=op.domain[1], + ) contig = contig[:1, :, :, :, :, :, :, :, :, :, :, :] contig2 = torch.empty(contig.size(), device=device, dtype=dtype) contig2.copy_(contig) @@ -477,8 +476,9 @@ def test_contig_size1_large_dim(self, device, dtype, op): # per-batch computation. 
@ops(unary_ufuncs) def test_batch_vs_slicing(self, device, dtype, op): - input = make_tensor((1024, 512), dtype=dtype, device=device, - low=op.domain[0], high=op.domain[1]) + input = make_tensor( + (1024, 512), dtype=dtype, device=device, low=op.domain[0], high=op.domain[1] + ) torch_kwargs, _ = op.sample_kwargs(device, dtype, input) actual = op(input, **torch_kwargs) @@ -486,43 +486,14 @@ def test_batch_vs_slicing(self, device, dtype, op): self.assertEqual(actual, expected) - def _test_out_arg(self, op, input, output, expected, **kwargs): - if op.safe_casts_outputs: - expect_fail = not torch.can_cast(expected.dtype, output.dtype) - else: - expect_fail = output.dtype != expected.dtype - - if expect_fail: - with self.assertRaises(RuntimeError): - op(input, out=output, **kwargs) - else: - res = op(input, out=output, **kwargs) - self.assertTrue(res is output) - self.assertEqual(output, expected.to(output.dtype)) - - @ops(unary_ufuncs, dtypes=OpDTypes.supported) - def test_out_arg_all_dtypes(self, device, dtype, op): - if not op.supports_out: - self.skipTest("Skipped! Op doesn't support out= kwarg.") - - input = make_tensor((64, 64), dtype=dtype, device=device, - low=op.domain[0], high=op.domain[1]) - torch_kwargs, _ = op.sample_kwargs(device, dtype, input) - expected = op(input, **torch_kwargs) - - for out_dtype in all_types_and_complex_and(torch.bool, torch.half): - out = torch.empty_like(input, dtype=out_dtype) - self._test_out_arg(op, input, out, expected, **torch_kwargs) - - @dtypes(*(get_all_int_dtypes() + [torch.bool] + - get_all_fp_dtypes(include_bfloat16=False))) + @dtypes(*all_types_and(torch.bool, torch.half)) def test_nan_to_num(self, device, dtype): for contiguous in [False, True]: - x = make_tensor((64, 64), low=0., high=100., dtype=dtype, device=device) + x = make_tensor((64, 64), low=0.0, high=100.0, dtype=dtype, device=device) if dtype.is_floating_point: # Add extremal values. 
- extremals = [float('nan'), float('inf'), -float('inf')] + extremals = [float("nan"), float("inf"), -float("inf")] for idx, extremal in zip(torch.randint(0, 63, (3,)), extremals): x[idx, :] = extremal @@ -534,12 +505,16 @@ def test_nan_to_num(self, device, dtype): posinf = random.random() * 5 neginf = random.random() * 10 - self.compare_with_numpy(lambda x: x.nan_to_num(nan=nan, posinf=posinf), - lambda x: np.nan_to_num(x, nan=nan, posinf=posinf), - x) - self.compare_with_numpy(lambda x: x.nan_to_num(posinf=posinf, neginf=neginf), - lambda x: np.nan_to_num(x, posinf=posinf, neginf=neginf), - x) + self.compare_with_numpy( + lambda x: x.nan_to_num(nan=nan, posinf=posinf), + lambda x: np.nan_to_num(x, nan=nan, posinf=posinf), + x, + ) + self.compare_with_numpy( + lambda x: x.nan_to_num(posinf=posinf, neginf=neginf), + lambda x: np.nan_to_num(x, posinf=posinf, neginf=neginf), + x, + ) # Out Variant out = torch.empty_like(x) @@ -551,10 +526,35 @@ def test_nan_to_num(self, device, dtype): torch.nan_to_num(x, out=out, nan=nan, posinf=posinf, neginf=neginf) self.assertEqual(result, out) + @onlyCPU + def test_nan_to_num_bfloat16(self, device): + def test_dtype(fn, input, dtype): + input = input.detach().clone().to(dtype=dtype).requires_grad_(True) + input2 = input.detach().clone().float().requires_grad_(True) + out = fn(input) + out.sum().backward() + out2 = fn(input2) + out2.sum().backward() + self.assertEqual(out.dtype, dtype) + self.assertEqual(input.grad.dtype, dtype) + self.assertEqual(out, out2, exact_dtype=False) + self.assertEqual(input.grad, input2.grad, exact_dtype=False) + + def func(): + return torch.nan_to_num + + shapes = [[1, 3, 6, 6], [1, 3, 6, 128], [1, 3, 256, 256]] + for shape in shapes: + x = torch.randn(shape, device=device) + extremals = [float('nan'), float('inf'), -float('inf')] + for id1, id2, extremal in zip(torch.randint(0, 2, (3,)), torch.randint(0, 5, (3,)), extremals): + x[0, id1, id2, :] = extremal + test_dtype(func(), x, torch.bfloat16) + @dtypes(torch.cdouble) def test_complex_edge_values(self, device, dtype): # sqrt Test Reference: https://github.com/pytorch/pytorch/pull/47424 - x = torch.tensor(0. 
- 1.0e+20j, dtype=dtype, device=device) + x = torch.tensor(0.0 - 1.0e20j, dtype=dtype, device=device) self.compare_with_numpy(torch.sqrt, np.sqrt, x) # acos test reference: https://github.com/pytorch/pytorch/issue/42952 # Skip on Windows, as CUDA acos returns conjugate value @@ -562,7 +562,11 @@ def test_complex_edge_values(self, device, dtype): if not (IS_WINDOWS and dtype == torch.cdouble and "cuda" in device): self.compare_with_numpy(torch.acos, np.arccos, x) - x = torch.tensor((-1.0e+60 if dtype == torch.cdouble else -1.0e+20) - 4988429.2j, dtype=dtype, device=device) + x = torch.tensor( + (-1.0e60 if dtype == torch.cdouble else -1.0e20) - 4988429.2j, + dtype=dtype, + device=device, + ) self.compare_with_numpy(torch.sqrt, np.sqrt, x) @unittest.skipIf(not TEST_SCIPY, "Requires SciPy") @@ -572,14 +576,28 @@ def test_digamma_special(self, device, dtype): # Reference: # https://github.com/scipy/scipy/blob/3a8a3a1d4657254a6611e77e9c28feafa26e6645/scipy/special/tests/test_digamma.py#L22 euler = 0.57721566490153286 - dataset = [(0., -0.), - (1, -euler), - (0.5, -2 * math.log(2) - euler), - (1 / 3, -math.pi / (2 * math.sqrt(3)) - 3 * math.log(3) / 2 - euler), - (1 / 4, -math.pi / 2 - 3 * math.log(2) - euler), - (1 / 6, -math.pi * math.sqrt(3) / 2 - 2 * math.log(2) - 3 * math.log(3) / 2 - euler), - (1 / 8, -math.pi / 2 - 4 * math.log(2) - - (math.pi + math.log(2 + math.sqrt(2)) - math.log(2 - math.sqrt(2))) / math.sqrt(2) - euler)] + dataset = [ + (0.0, -0.0), + (1, -euler), + (0.5, -2 * math.log(2) - euler), + (1 / 3, -math.pi / (2 * math.sqrt(3)) - 3 * math.log(3) / 2 - euler), + (1 / 4, -math.pi / 2 - 3 * math.log(2) - euler), + ( + 1 / 6, + -math.pi * math.sqrt(3) / 2 + - 2 * math.log(2) + - 3 * math.log(3) / 2 + - euler, + ), + ( + 1 / 8, + -math.pi / 2 + - 4 * math.log(2) + - (math.pi + math.log(2 + math.sqrt(2)) - math.log(2 - math.sqrt(2))) + / math.sqrt(2) + - euler, + ), + ] x = torch.tensor(dataset, device=device, dtype=dtype) self.compare_with_numpy(torch.digamma, scipy.special.digamma, x) @@ -587,15 +605,29 @@ def test_digamma_special(self, device, dtype): @dtypes(torch.float, torch.double) def test_digamma(self, device, dtype): # Tests pole behavior - tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111, - -100.99999994, 0.000000111, -1931.99999994, - -0.000000111, 0, -0, -1, -2, -931], dtype=dtype, device=device) + tensor = torch.tensor( + [ + -0.999999994, + -1.999999994, + -2.0000000111, + -100.99999994, + 0.000000111, + -1931.99999994, + -0.000000111, + 0, + -0, + -1, + -2, + -931, + ], + dtype=dtype, + device=device, + ) self.compare_with_numpy(torch.digamma, scipy.special.digamma, tensor) - @skipCUDAIfRocm - @dtypes(*get_all_fp_dtypes(include_half=True, include_bfloat16=False)) + @dtypes(*floating_types_and(torch.half)) def test_frexp(self, device, dtype): - input = make_tensor((50, 50), device, dtype) + input = make_tensor((50, 50), dtype=dtype, device=device) mantissa, exponent = torch.frexp(input) np_mantissa, np_exponent = np.frexp(input.cpu().numpy()) @@ -606,26 +638,29 @@ def test_frexp(self, device, dtype): self.assertTrue(exponent.dtype == torch.int32) self.assertTrue(torch_to_numpy_dtype_dict[exponent.dtype] == np_exponent.dtype) - @skipCUDAIfRocm def test_frexp_assert_raises(self, device): - invalid_input_dtypes = get_all_int_dtypes() + \ - get_all_complex_dtypes() + \ - [torch.bool] + invalid_input_dtypes = integral_types_and(torch.bool) + complex_types() for dtype in invalid_input_dtypes: - input = make_tensor((50, 50), device, dtype) - with 
self.assertRaisesRegex(RuntimeError, r"torch\.frexp\(\) only supports floating-point dtypes"): + input = make_tensor((50, 50), dtype=dtype, device=device) + with self.assertRaisesRegex( + RuntimeError, r"torch\.frexp\(\) only supports floating-point dtypes" + ): torch.frexp(input) - for dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False): - input = make_tensor((50, 50), device, dtype) + for dtype in floating_types_and(torch.half): + input = make_tensor((50, 50), dtype=dtype, device=device) - dtypes = list(all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16)) + dtypes = list( + all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16) + ) dtypes.remove(dtype) for mantissa_dtype in dtypes: mantissa = torch.empty_like(input, dtype=mantissa_dtype) exponent = torch.empty_like(input, dtype=torch.int) - with self.assertRaisesRegex(RuntimeError, - r"torch\.frexp\(\) expects mantissa to have dtype .+ but got .+"): + with self.assertRaisesRegex( + RuntimeError, + r"torch\.frexp\(\) expects mantissa to have dtype .+ but got .+", + ): torch.frexp(input, out=(mantissa, exponent)) dtypes.append(dtype) @@ -633,8 +668,10 @@ def test_frexp_assert_raises(self, device): for exponent_dtype in dtypes: mantissa = torch.empty_like(input) exponent = torch.empty_like(input, dtype=exponent_dtype) - with self.assertRaisesRegex(RuntimeError, - r"torch\.frexp\(\) expects exponent to have int dtype but got .+"): + with self.assertRaisesRegex( + RuntimeError, + r"torch\.frexp\(\) expects exponent to have int dtype but got .+", + ): torch.frexp(input, out=(mantissa, exponent)) def test_mvlgamma_argcheck(self, device): @@ -642,17 +679,21 @@ def run_test(d): input = torch.linspace((d - 2) / 2, 10, 10, device=device) torch.mvlgamma(input, d) - with self.assertRaisesRegex(RuntimeError, r"All elements must be greater than \(p-1\)/2"): + with self.assertRaisesRegex( + RuntimeError, r"All elements must be greater than \(p-1\)/2" + ): run_test(3) def test_polygamma_neg(self, device): - with self.assertRaisesRegex(RuntimeError, r'polygamma\(n, x\) does not support negative n\.'): + with self.assertRaisesRegex( + RuntimeError, r"polygamma\(n, x\) does not support negative n\." 
+ ): torch.polygamma(-1, torch.tensor([1.0, 2.0], device=device)) # TODO resolve with opinfos @onlyCPU def test_op_invert(self, device): - res = 0xffff - torch.arange(127, dtype=torch.int8) + res = 0xFFFF - torch.arange(127, dtype=torch.int8) for dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64): a = torch.arange(127, dtype=dtype) self.assertEqual(res.to(dtype), ~a) @@ -669,16 +710,19 @@ def test_op_invert(self, device): def test_abs_angle_complex_to_float(self, device, dtype): # Constructs random complex values from random import random + random_vals = [] for multiplier in (-1, 1, -10, 10, -100, 100): for _ in range(10): - random_vals.append(complex(random() * multiplier, random() * multiplier)) + random_vals.append( + complex(random() * multiplier, random() * multiplier) + ) for vals in (random_vals, []): a = np.array(vals, dtype=torch_to_numpy_dtype_dict[dtype]) t = torch.tensor(vals, device=device, dtype=dtype) - for fn_name in ('abs', 'angle'): + for fn_name in ("abs", "angle"): torch_fn = getattr(torch, fn_name) np_fn = getattr(np, fn_name) @@ -688,12 +732,16 @@ def test_abs_angle_complex_to_float(self, device, dtype): self.assertEqual(np_result, torch_result, exact_dtype=True) # Tests float out - float_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + float_dtype = ( + torch.float32 if dtype is torch.complex64 else torch.float64 + ) np_float_out = np_fn(a).astype(torch_to_numpy_dtype_dict[float_dtype]) float_out = torch.empty_like(t).float() torch_fn(t, out=float_out) # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(torch.from_numpy(np_float_out), float_out.cpu()) + self.assertEqualIgnoreType( + torch.from_numpy(np_float_out), float_out.cpu() + ) # Tests float out (resized out) float_out = torch.empty(1, device=device, dtype=float_dtype) @@ -705,13 +753,17 @@ def test_abs_angle_complex_to_float(self, device, dtype): complex_out = torch.empty_like(t) torch_fn(t, out=complex_out) # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 - self.assertEqualIgnoreType(torch.from_numpy(np_complex_out), complex_out.cpu()) + self.assertEqualIgnoreType( + torch.from_numpy(np_complex_out), complex_out.cpu() + ) # Tests complex out (resized out) complex_out = torch.empty(0, device=device, dtype=dtype) torch_fn(t, out=complex_out) # TODO(#38095): Replace assertEqualIgnoreType. 
See issue #38095 - self.assertEqualIgnoreType(torch.from_numpy(np_complex_out), complex_out.cpu()) + self.assertEqualIgnoreType( + torch.from_numpy(np_complex_out), complex_out.cpu() + ) # Tests long out behavior (expected failure) long_out = torch.empty(0, device=device, dtype=torch.long) @@ -719,40 +771,42 @@ def test_abs_angle_complex_to_float(self, device, dtype): torch_fn(t, out=long_out) # Tests inplace - if fn_name == 'abs': + if fn_name == "abs": torch_inplace_method = getattr(torch.Tensor, fn_name + "_") np_fn(a, out=a) if dtype.is_complex: - with self.assertRaisesRegex(RuntimeError, "In-place abs is not supported for complex tensors."): + with self.assertRaisesRegex( + RuntimeError, + "In-place abs is not supported for complex tensors.", + ): torch_inplace_method(t) return torch_inplace_method(t) self.assertEqual(torch.from_numpy(a), t.cpu()) # Note: angle does not have an in-place variant - if fn_name == 'angle': + if fn_name == "angle": with self.assertRaises(AttributeError): torch_inplace_method = getattr(torch.Tensor, fn_name + "_") - def check_internal_mem_overlap(self, inplace_op, num_inputs, - dtype, device, - expected_failure=False): + def check_internal_mem_overlap( + self, inplace_op, num_inputs, dtype, device, expected_failure=False + ): if isinstance(inplace_op, str): inplace_op = getattr(torch.Tensor, inplace_op) input = torch.randn(1, dtype=dtype, device=device).expand(3, 3) - inputs = [input] + [torch.randn_like(input) - for i in range(num_inputs - 1)] + inputs = [input] + [torch.randn_like(input) for i in range(num_inputs - 1)] if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 'single memory location'): + with self.assertRaisesRegex(RuntimeError, "single memory location"): inplace_op(*inputs) - def unary_check_input_output_mem_overlap(self, data, sz, op, - expected_failure=False): - + def unary_check_input_output_mem_overlap( + self, data, sz, op, expected_failure=False + ): def _test(op, output, input): output_exp = torch.empty_like(output) op(input, out=output_exp) @@ -761,15 +815,15 @@ def _test(op, output, input): # output is identical to input: _test(op, output=data[0:sz], input=data[0:sz]) # output and input are independent: - _test(op, output=data[0:sz], input=data[sz:2 * sz]) + _test(op, output=data[0:sz], input=data[sz : 2 * sz]) # output partially overlaps with input: if not expected_failure: - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) else: with self.assertRaises(AssertionError): - with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): - _test(op, data[0:sz], data[1:sz + 1]) + with self.assertRaisesRegex(RuntimeError, "unsupported operation"): + _test(op, data[0:sz], data[1 : sz + 1]) # TODO: run on non-native device types @dtypes(torch.double) @@ -779,170 +833,201 @@ def test_unary_out_op_mem_overlap(self, device, dtype): positives = torch.randint(1, 100, (2 * sz,), device=device).double() ints = torch.randint(-100, 100, (2 * sz,), device=device) unary_mem_overlap_cases = [ - ("abs", doubles, True, True, 'cpu'), - ("abs", doubles, True, True, 'cuda'), - ("acos", doubles, True, True, 'cpu'), - ("acos", doubles, True, True, 'cuda'), - ("asin", 
doubles, True, True, 'cpu'), - ("asin", doubles, True, True, 'cuda'), - ("atan", doubles, True, True, 'cpu'), - ("atan", doubles, True, True, 'cuda'), - ("acosh", doubles, True, True, 'cpu'), - ("acosh", doubles, True, True, 'cuda'), - ("asinh", doubles, True, True, 'cpu'), - ("asinh", doubles, True, True, 'cuda'), - ("atanh", doubles, True, True, 'cpu'), - ("atanh", doubles, True, True, 'cuda'), - ("bitwise_not", ints, True, True, 'cpu'), - ("bitwise_not", ints, True, True, 'cuda'), - ("ceil", doubles, True, True, 'cpu'), - ("ceil", doubles, True, True, 'cuda'), - ("cos", doubles, True, True, 'cpu'), - ("cos", doubles, True, True, 'cuda'), - ("cosh", doubles, True, True, 'cpu'), - ("cosh", doubles, True, True, 'cuda'), - ("digamma", doubles, True, True, 'cpu'), - ("erf", doubles, True, True, 'cpu'), - ("erf", doubles, True, True, 'cuda'), - ("erfc", doubles, True, True, 'cpu'), - ("erfc", doubles, True, True, 'cuda'), - ("erfinv", doubles, True, True, 'cpu'), - ("erfinv", doubles, True, True, 'cuda'), - ("exp", doubles, True, True, 'cpu'), - ("exp", doubles, True, True, 'cuda'), - ("exp2", doubles, True, True, 'cpu'), - ("exp2", doubles, True, True, 'cuda'), - ("expm1", doubles, True, True, 'cpu'), - ("expm1", doubles, True, True, 'cuda'), - ("floor", doubles, True, True, 'cpu'), - ("floor", doubles, True, True, 'cuda'), - ("frac", doubles, True, True, 'cpu'), - ("frac", doubles, True, True, 'cuda'), - ("i0", doubles, True, True, 'cpu'), - ("i0", doubles, True, True, 'cuda'), - ("log", positives, True, True, 'cpu'), - ("log", positives, True, True, 'cuda'), - ("log10", positives, True, True, 'cpu'), - ("log10", positives, True, True, 'cuda'), - ("log1p", positives, True, True, 'cpu'), - ("log1p", positives, True, True, 'cuda'), - ("log2", positives, True, True, 'cpu'), - ("log2", positives, True, True, 'cuda'), - ("neg", doubles, True, True, 'cpu'), - ("neg", doubles, True, True, 'cuda'), - ("reciprocal", doubles, True, True, 'cpu'), - ("reciprocal", doubles, True, True, 'cuda'), - ("round", doubles, True, True, 'cpu'), - ("round", doubles, True, True, 'cuda'), - ("rsqrt", positives, True, True, 'cpu'), - ("rsqrt", positives, True, True, 'cuda'), - ("sin", doubles, True, True, 'cpu'), - ("sin", doubles, True, True, 'cuda'), - ("sinh", doubles, True, True, 'cpu'), - ("sinh", doubles, False, True, 'cuda'), - ("sigmoid", doubles, True, True, 'cpu'), - ("sigmoid", doubles, True, True, 'cuda'), - ("logit", doubles, True, True, 'cpu'), - ("logit", doubles, True, True, 'cuda'), - ("sqrt", doubles, True, True, 'cpu'), - ("sqrt", doubles, False, True, 'cuda'), - ("tan", doubles, True, True, 'cpu'), - ("tan", doubles, True, True, 'cuda'), - ("tanh", doubles, True, True, 'cpu'), - ("tanh", doubles, True, True, 'cuda'), - ("trunc", doubles, True, True, 'cpu'), - ("trunc", doubles, True, True, 'cuda') + ("abs", doubles, True, True, "cpu"), + ("abs", doubles, True, True, "cuda"), + ("acos", doubles, True, True, "cpu"), + ("acos", doubles, True, True, "cuda"), + ("asin", doubles, True, True, "cpu"), + ("asin", doubles, True, True, "cuda"), + ("atan", doubles, True, True, "cpu"), + ("atan", doubles, True, True, "cuda"), + ("acosh", doubles, True, True, "cpu"), + ("acosh", doubles, True, True, "cuda"), + ("asinh", doubles, True, True, "cpu"), + ("asinh", doubles, True, True, "cuda"), + ("atanh", doubles, True, True, "cpu"), + ("atanh", doubles, True, True, "cuda"), + ("bitwise_not", ints, True, True, "cpu"), + ("bitwise_not", ints, True, True, "cuda"), + ("ceil", doubles, True, True, "cpu"), + ("ceil", 
doubles, True, True, "cuda"), + ("cos", doubles, True, True, "cpu"), + ("cos", doubles, True, True, "cuda"), + ("cosh", doubles, True, True, "cpu"), + ("cosh", doubles, True, True, "cuda"), + ("digamma", doubles, True, True, "cpu"), + ("erf", doubles, True, True, "cpu"), + ("erf", doubles, True, True, "cuda"), + ("erfc", doubles, True, True, "cpu"), + ("erfc", doubles, True, True, "cuda"), + ("erfinv", doubles, True, True, "cpu"), + ("erfinv", doubles, True, True, "cuda"), + ("exp", doubles, True, True, "cpu"), + ("exp", doubles, True, True, "cuda"), + ("exp2", doubles, True, True, "cpu"), + ("exp2", doubles, True, True, "cuda"), + ("expm1", doubles, True, True, "cpu"), + ("expm1", doubles, True, True, "cuda"), + ("floor", doubles, True, True, "cpu"), + ("floor", doubles, True, True, "cuda"), + ("frac", doubles, True, True, "cpu"), + ("frac", doubles, True, True, "cuda"), + ("i0", doubles, True, True, "cpu"), + ("i0", doubles, True, True, "cuda"), + ("log", positives, True, True, "cpu"), + ("log", positives, True, True, "cuda"), + ("log10", positives, True, True, "cpu"), + ("log10", positives, True, True, "cuda"), + ("log1p", positives, True, True, "cpu"), + ("log1p", positives, True, True, "cuda"), + ("log2", positives, True, True, "cpu"), + ("log2", positives, True, True, "cuda"), + ("neg", doubles, True, True, "cpu"), + ("neg", doubles, True, True, "cuda"), + ("reciprocal", doubles, True, True, "cpu"), + ("reciprocal", doubles, True, True, "cuda"), + ("round", doubles, True, True, "cpu"), + ("round", doubles, True, True, "cuda"), + ("rsqrt", positives, True, True, "cpu"), + ("rsqrt", positives, True, True, "cuda"), + ("sin", doubles, True, True, "cpu"), + ("sin", doubles, True, True, "cuda"), + ("sinh", doubles, True, True, "cpu"), + ("sinh", doubles, False, True, "cuda"), + ("sigmoid", doubles, True, True, "cpu"), + ("sigmoid", doubles, True, True, "cuda"), + ("logit", doubles, True, True, "cpu"), + ("logit", doubles, True, True, "cuda"), + ("sqrt", doubles, True, True, "cpu"), + ("sqrt", doubles, False, True, "cuda"), + ("tan", doubles, True, True, "cpu"), + ("tan", doubles, True, True, "cuda"), + ("tanh", doubles, True, True, "cpu"), + ("tanh", doubles, True, True, "cuda"), + ("trunc", doubles, True, True, "cpu"), + ("trunc", doubles, True, True, "cuda"), ] - for (fn, inputs, has_input_output_mem_overlap_check, - has_internal_mem_overlap_check, dev) in unary_mem_overlap_cases: + for ( + fn, + inputs, + has_input_output_mem_overlap_check, + has_internal_mem_overlap_check, + dev, + ) in unary_mem_overlap_cases: if dev != device: continue out_fn = getattr(torch, fn) - in_fn = getattr(torch.Tensor, fn + '_') - - self.unary_check_input_output_mem_overlap(inputs, sz, out_fn, - expected_failure=not has_input_output_mem_overlap_check) - - self.check_internal_mem_overlap(in_fn, 1, dtype, dev, - expected_failure=not has_internal_mem_overlap_check) + in_fn = getattr(torch.Tensor, fn + "_") + + self.unary_check_input_output_mem_overlap( + inputs, + sz, + out_fn, + expected_failure=not has_input_output_mem_overlap_check, + ) + + self.check_internal_mem_overlap( + in_fn, + 1, + dtype, + dev, + expected_failure=not has_internal_mem_overlap_check, + ) # TODO: opinfo hardshrink @onlyCPU - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardshrink(self, device, dtype): data = torch.tensor([1, 0.5, 0.3, 0.6], dtype=dtype, device=device).view(2, 2) - self.assertEqual(torch.tensor([1, 0.5, 0, 0.6], dtype=dtype, device=device).view(2, 2), - 
data.hardshrink(0.3)) - self.assertEqual(torch.tensor([1, 0, 0, 0.6], dtype=dtype, device=device).view(2, 2), - data.hardshrink(0.5)) + self.assertEqual( + torch.tensor([1, 0.5, 0, 0.6], dtype=dtype, device=device).view(2, 2), + data.hardshrink(0.3), + ) + self.assertEqual( + torch.tensor([1, 0, 0, 0.6], dtype=dtype, device=device).view(2, 2), + data.hardshrink(0.5), + ) # test default lambd=0.5 self.assertEqual(data.hardshrink(), data.hardshrink(0.5)) # test non-contiguous case - self.assertEqual(torch.tensor([1, 0, 0.5, 0.6], dtype=dtype, device=device).view(2, 2), - data.t().hardshrink(0.3)) + self.assertEqual( + torch.tensor([1, 0, 0.5, 0.6], dtype=dtype, device=device).view(2, 2), + data.t().hardshrink(0.3), + ) @onlyCPU - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardshrink_edge_cases(self, device, dtype) -> None: def h(values, l_expected): for l, expected in l_expected.items(): - values_tensor = torch.tensor([float(v) for v in values], - dtype=dtype, device=device) - expected_tensor = torch.tensor([float(v) for v in expected], - dtype=dtype, device=device) - self.assertEqual(expected_tensor == values_tensor.hardshrink(l), - torch.ones_like(values_tensor, dtype=torch.bool)) + values_tensor = torch.tensor( + [float(v) for v in values], dtype=dtype, device=device + ) + expected_tensor = torch.tensor( + [float(v) for v in expected], dtype=dtype, device=device + ) + self.assertEqual( + expected_tensor == values_tensor.hardshrink(l), + torch.ones_like(values_tensor, dtype=torch.bool), + ) def test_helper(min, max): - h([0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], - {0.0: [0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], - min: [0.0, 0.0, 0.0, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], - 0.1: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -1.0, max, -max, inf, -inf], - 1.0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, max, -max, inf, -inf], - max: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, inf, -inf], - inf: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}) + h( + [0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], + { + 0.0: [0.0, min, -min, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], + min: [0.0, 0.0, 0.0, 0.1, -0.1, 1.0, -1.0, max, -max, inf, -inf], + 0.1: [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, -1.0, max, -max, inf, -inf], + 1.0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, max, -max, inf, -inf], + max: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, inf, -inf], + inf: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + }, + ) test_helper(torch.finfo(dtype).tiny, torch.finfo(dtype).max) @onlyCPU @slowTest @dtypes(torch.float) + @unittest.skipIf(True, "Insufficient memory on linux.(2|4)xlarge") def test_exp_slow(self, device, dtype): # Test for https://github.com/pytorch/pytorch/issues/17271 # This is pretty slow on my Macbook but it only takes a few # seconds on a beefy Xeon server - a = torch.exp(torch.ones(2 ** 31, dtype=dtype, device=device)) + a = torch.exp(torch.ones(2**31, dtype=dtype, device=device)) b = torch.exp(torch.ones(1, dtype=dtype, device=device)) - self.assertEqual(a, b.expand(2 ** 31)) + self.assertEqual(a, b.expand(2**31)) - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) - @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) - @dtypes(torch.float, torch.double) + @precisionOverride( + {torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002} + ) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardswish(self, device, dtype): 
inputValues = [-1000, -4, -3, -2, 0, 2, 3, 4, 1000] expectedOutput = np.multiply( - inputValues, - np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0) + inputValues, np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0 + ) inputTensor = torch.tensor(inputValues, dtype=dtype, device=device) - expectedOutputTensor = \ - torch.tensor(expectedOutput, dtype=dtype, device=device) + expectedOutputTensor = torch.tensor(expectedOutput, dtype=dtype, device=device) # normal - self.assertEqual(torch.nn.functional.hardswish(inputTensor), - expectedOutputTensor) + self.assertEqual( + torch.nn.functional.hardswish(inputTensor), expectedOutputTensor + ) # inplace inputTensorCpy = inputTensor.clone().detach() torch.nn.functional.hardswish(inputTensorCpy, inplace=True) self.assertEqual(inputTensorCpy, expectedOutputTensor) - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) - @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) - @dtypes(torch.float, torch.double) + @precisionOverride( + {torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002} + ) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardsigmoid(self, device, dtype): inputValues = [-1000, -4, -3, -2, 0, 2, 3, 4, 1000] expectedOutput = np.minimum(np.maximum((np.add(inputValues, 3)), 0), 6) / 6.0 @@ -950,21 +1035,28 @@ def test_hardsigmoid(self, device, dtype): inputTensor = torch.tensor(inputValues, dtype=dtype, device=device) # normal - self.assertEqual(torch.nn.functional.hardsigmoid(inputTensor), - torch.tensor(expectedOutput, dtype=dtype, device=device)) + self.assertEqual( + torch.nn.functional.hardsigmoid(inputTensor), + torch.tensor(expectedOutput, dtype=dtype, device=device), + ) # inplace inputTensorCpy = inputTensor.clone().detach() - self.assertEqual(torch.nn.functional.hardsigmoid(inputTensorCpy, inplace=True), - torch.tensor(expectedOutput, dtype=dtype, device=device)) - - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) - @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) - @dtypes(torch.float, torch.double) + self.assertEqual( + torch.nn.functional.hardsigmoid(inputTensorCpy, inplace=True), + torch.tensor(expectedOutput, dtype=dtype, device=device), + ) + + @precisionOverride( + {torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002} + ) + @dtypes(torch.float, torch.double, torch.bfloat16) def test_hardsigmoid_backward(self, device, dtype): inputValues = [-3.0, 3.0, -2.0, 2.0, -6.0, 6.0] expectedValues = [0.0, 0.0, 1.0 / 6.0, 1.0 / 6.0, 0.0, 0.0] - inputTensor = torch.tensor(inputValues, dtype=dtype, device=device).requires_grad_() + inputTensor = torch.tensor( + inputValues, dtype=dtype, device=device + ).requires_grad_() expetedTensor = torch.tensor(expectedValues, dtype=dtype, device=device) out = torch.nn.functional.hardsigmoid(inputTensor) out.backward(torch.ones_like(inputTensor)) @@ -976,7 +1068,8 @@ def test_silu(self, device, dtype): input_np = np.random.randn(5, 8) special_input = [[-1000, -1, -0.1, 0, 0.5, 1, 2, 1000]] input_np = np.concatenate((input_np, special_input), axis=0).astype( - torch_to_numpy_dtype_dict[dtype]) + torch_to_numpy_dtype_dict[dtype] + ) expected_output_np = input_np * scipy.special.expit(input_np) expected_output = torch.from_numpy(expected_output_np).to(device) @@ -986,18 +1079,30 @@ def test_silu(self, device, dtype): rtol = 1e-6 input = torch.from_numpy(input_np).clone().contiguous().to(device) - self.assertEqual(torch.nn.functional.silu(input), expected_output, - 
atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.silu(input, inplace=True), - expected_output, atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.silu(input), expected_output, atol=atol, rtol=rtol + ) + self.assertEqual( + torch.nn.functional.silu(input, inplace=True), + expected_output, + atol=atol, + rtol=rtol, + ) input = torch.from_numpy(input_np).clone().to(device) input_noncontig = input.transpose(0, 1) - self.assertEqual(torch.nn.functional.silu(input_noncontig), - expected_output_noncontig, atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.silu( - input_noncontig, inplace=True), expected_output_noncontig, - atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.silu(input_noncontig), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) + self.assertEqual( + torch.nn.functional.silu(input_noncontig, inplace=True), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) # It is not obvious how to merge this into OpInfo because these inputs # succeed for gradcheck but are expected to fail for gradgradcheck @@ -1008,10 +1113,12 @@ def test_sinc(self, device, dtype): # We also need to be careful when we are very close to 0, as the # derivative's denominator is squared, and there are some floats # that are positive and whose squares are zero. - a = torch.tensor([0.0, torch.finfo(torch.double).tiny, 1.0], - dtype=dtype, - requires_grad=True, - device=device) + a = torch.tensor( + [0.0, torch.finfo(torch.double).tiny, 1.0], + dtype=dtype, + requires_grad=True, + device=device, + ) gradcheck(torch.sinc, a) @skipIfNoSciPy @@ -1020,7 +1127,8 @@ def test_mish(self, device, dtype): input_np = np.random.randn(5, 8) special_input = [[-1000, -1, -0.1, 0, 0.5, 1, 2, 1000]] input_np = np.concatenate((input_np, special_input), axis=0).astype( - torch_to_numpy_dtype_dict[dtype]) + torch_to_numpy_dtype_dict[dtype] + ) expected_output_np = input_np * np.tanh(np.log1p(np.exp(input_np))) expected_output = torch.from_numpy(expected_output_np).to(device) @@ -1030,34 +1138,50 @@ def test_mish(self, device, dtype): rtol = 1e-6 input = torch.from_numpy(input_np).clone().contiguous().to(device) - self.assertEqual(torch.nn.functional.mish(input), expected_output, - atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.mish(input, inplace=True), - expected_output, atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.mish(input), expected_output, atol=atol, rtol=rtol + ) + self.assertEqual( + torch.nn.functional.mish(input, inplace=True), + expected_output, + atol=atol, + rtol=rtol, + ) input = torch.from_numpy(input_np).clone().to(device) input_noncontig = input.transpose(0, 1) - self.assertEqual(torch.nn.functional.mish(input_noncontig), - expected_output_noncontig, atol=atol, rtol=rtol) - self.assertEqual(torch.nn.functional.mish( - input_noncontig, inplace=True), expected_output_noncontig, - atol=atol, rtol=rtol) + self.assertEqual( + torch.nn.functional.mish(input_noncontig), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) + self.assertEqual( + torch.nn.functional.mish(input_noncontig, inplace=True), + expected_output_noncontig, + atol=atol, + rtol=rtol, + ) # do ops like threshold need a test_unary(_nonufunc) test suite?
@onlyCPU - @dtypes(*get_all_math_dtypes('cpu')) + @dtypes(*get_all_math_dtypes("cpu")) def test_threshold(self, device, dtype): if dtype != torch.uint8 and dtype != torch.float16 and not dtype.is_complex: # 100 is wide enough to use AVX2 instructions for all types - x = torch.randn(100, dtype=torch.float, device=device).sign().to(dtype=dtype) + x = ( + torch.randn(100, dtype=torch.float, device=device) + .sign() + .to(dtype=dtype) + ) y = torch.threshold(x, 0, 0) self.assertTrue(y.le(0).any()) - def _helper_test_igamma(self, loglo, loghi, device, dtype, - torch_fcn, scipy_fcn): + def _helper_test_igamma(self, loglo, loghi, device, dtype, torch_fcn, scipy_fcn): exp1 = 2.71828182846 - vec1 = torch.logspace(loglo, loghi, steps=500, base=exp1, - dtype=torch.float64, device=device).unsqueeze(-1) + vec1 = torch.logspace( + loglo, loghi, steps=500, base=exp1, dtype=torch.float64, device=device + ).unsqueeze(-1) vec1 = vec1.to(dtype) inputs = [ (vec1, vec1.transpose(0, 1)), @@ -1065,8 +1189,8 @@ def _helper_test_igamma(self, loglo, loghi, device, dtype, (vec1, 0.5 * vec1), # test for considerable ratio (vec1, 2.0 * vec1), (vec1[::2, :], vec1[::2, :]), # contiguous/noncontiguous tests - (vec1[::2, :], vec1[:vec1.shape[0] // 2, :]), - (vec1[:vec1.shape[0] // 2, :], vec1[::2, :]), + (vec1[::2, :], vec1[: vec1.shape[0] // 2, :]), + (vec1[: vec1.shape[0] // 2, :], vec1[::2, :]), ] half_prec = dtype in [torch.bfloat16, torch.float16] for input0, input1 in inputs: @@ -1078,7 +1202,6 @@ def _helper_test_igamma(self, loglo, loghi, device, dtype, expected = torch.from_numpy(expected).to(dtype) self.assertEqual(actual, expected) - @skipCUDAIfRocm # see issue https://github.com/pytorch/pytorch/issues/46531 @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") @@ -1087,8 +1210,9 @@ def test_igamma_common(self, device, dtype): # test igamma for reasonable range of values loglo = -4 # approx 0.018 loghi = 4 # approx 54.6 - self._helper_test_igamma(loglo, loghi, device, dtype, - torch.igamma, scipy.special.gammainc) + self._helper_test_igamma( + loglo, loghi, device, dtype, torch.igamma, scipy.special.gammainc + ) @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) @@ -1098,8 +1222,9 @@ def test_igammac_common(self, device, dtype): # test igammac for reasonable range of values loglo = -4 # approx 0.018 loghi = 4 # approx 54.6 - self._helper_test_igamma(loglo, loghi, device, dtype, - torch.igammac, scipy.special.gammaincc) + self._helper_test_igamma( + loglo, loghi, device, dtype, torch.igammac, scipy.special.gammaincc + ) @dtypesIfCPU(torch.float16, torch.bfloat16, torch.float32, torch.float64) @dtypes(torch.float32, torch.float64) @@ -1109,8 +1234,8 @@ def test_igamma_edge_cases(self, device, dtype): infs = torch.zeros((3,), **tkwargs) + float("inf") zeros = torch.zeros((3,), **tkwargs) ones = torch.ones((3,), **tkwargs) - zero_to_large = torch.tensor([0., 1., 1e3], **tkwargs) - small_to_inf = torch.tensor([1e-3, 1., float("inf")], **tkwargs) + zero_to_large = torch.tensor([0.0, 1.0, 1e3], **tkwargs) + small_to_inf = torch.tensor([1e-3, 1.0, float("inf")], **tkwargs) nans = torch.zeros((3,), **tkwargs) + float("nan") inpouts = [ # (a , x), out @@ -1138,8 +1263,8 @@ def test_igammac_edge_cases(self, device, dtype): infs = torch.zeros((3,), **tkwargs) + float("inf") zeros = torch.zeros((3,), **tkwargs) ones = torch.ones((3,), **tkwargs) - 
zero_to_large = torch.tensor([0., 1., 1e3], **tkwargs) - small_to_inf = torch.tensor([1e-3, 1., float("inf")], **tkwargs) + zero_to_large = torch.tensor([0.0, 1.0, 1e3], **tkwargs) + small_to_inf = torch.tensor([1e-3, 1.0, float("inf")], **tkwargs) nans = torch.zeros((3,), **tkwargs) + float("nan") inpouts = [ # (a , x), out @@ -1179,7 +1304,7 @@ def _i0_range_helper(self, range, device, dtype): t = torch.rand(1000, device=device).to(dtype) * r self._i0_helper(t) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_range1(self, device, dtype): @@ -1187,7 +1312,7 @@ def test_i0_range1(self, device, dtype): # The domain is (-13.25, 13.25) self._i0_range_helper(13.25, device, dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_range2(self, device, dtype): @@ -1202,7 +1327,7 @@ def test_i0_range3(self, device, dtype): # The domain is (-709.75, 709.75) self._i0_range_helper(709.75, device, dtype) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_i0_special(self, device, dtype): @@ -1212,7 +1337,7 @@ def test_i0_special(self, device, dtype): t = torch.tensor([inf, -inf, nan], device=device, dtype=dtype) self.assertTrue(torch.i0(t).isnan().all()) - @dtypesIfCUDA(*get_all_fp_dtypes()) + @dtypesIfCUDA(*floating_types_and(torch.half, torch.bfloat16)) @dtypes(torch.bfloat16, torch.float32, torch.float64) @unittest.skipIf(not TEST_SCIPY, "SciPy not found") def test_special_i0_i1_vs_scipy(self, device, dtype): @@ -1266,11 +1391,25 @@ def check_equal(t): self.assertEqual(actual, expected) range = (-10, 10) + t = torch.linspace(*range, 1, device=device, dtype=dtype) + check_equal(t) - t = torch.linspace(*range, int(1e4), device=device, dtype=dtype) + # Skip testing NaN, inf, -inf since they are tested in reference_numerics tests. + info = torch.finfo(dtype) + min, max, eps, tiny = info.min, info.max, info.eps, info.tiny + t = torch.tensor([min, max, eps, tiny], dtype=dtype, device=device) check_equal(t) - # NaN, inf, -inf are tested in reference_numerics tests. + @dtypes(torch.float32, torch.float64) + @unittest.skipIf(not TEST_SCIPY, "SciPy not found") + def test_special_log_ndtr_vs_scipy(self, device, dtype): + def check_equal(t): + # Test by comparing with scipy + actual = torch.special.log_ndtr(t) + expected = scipy.special.log_ndtr(t.cpu().numpy()) + self.assertEqual(actual, expected) + + # Skip testing NaN, inf, -inf since they are tested in reference_numerics tests. 
info = torch.finfo(dtype) min, max, eps, tiny = info.min, info.max, info.eps, info.tiny t = torch.tensor([min, max, eps, tiny], dtype=dtype, device=device) @@ -1279,7 +1418,7 @@ def check_equal(t): # TODO: allow large opinfo values to be opted-into via metadata @dtypes(torch.long) def test_abs_big_number(self, device, dtype): - bignumber = 2 ** 31 + 1 + bignumber = 2**31 + 1 res = torch.tensor([bignumber], device=device, dtype=dtype) self.assertGreater(res.abs()[0], 0) @@ -1304,15 +1443,17 @@ def test_abs_zero(self, device, dtype): for num in abs_zeros: self.assertGreater(math.copysign(1.0, num), 0.0) - @dtypes(*(get_all_dtypes(include_bool=False))) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16)) def test_isposinf_isneginf_non_boolean_output(self, device, dtype): # test non-boolean tensors as the `out=` parameters # boolean outputs are tested in the above testcases - vals = (float('inf'), -float('inf'), 1.2) + vals = (float("inf"), -float("inf"), 1.2) t = torch.tensor(vals, device=device) for torch_op in (torch.isposinf, torch.isneginf): out = torch.empty_like(t, dtype=dtype) - with self.assertRaisesRegex(RuntimeError, 'does not support non-boolean outputs'): + with self.assertRaisesRegex( + RuntimeError, "does not support non-boolean outputs" + ): torch_op(t, out=out) def test_nonzero_empty(self, device): @@ -1346,13 +1487,16 @@ def assert_tuple_empty(tup, dim): self.assertEqual(torch.empty(0, dtype=torch.long), z[0]) # TODO: rationalize with exp OpInfo - @dtypes(*(get_all_fp_dtypes(include_half=False) + - get_all_complex_dtypes())) - @dtypesIfCUDA(*(get_all_fp_dtypes(include_half=True) + - get_all_complex_dtypes())) + @dtypes(*floating_and_complex_types_and(torch.bfloat16)) + @dtypesIfCUDA(*floating_and_complex_types_and(torch.half, torch.bfloat16)) def test_exp(self, device, dtype): for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()): - a = torch.tensor(v, dtype=dtype, device=device) * torch.arange(18, device=device) / 3 * math.pi + a = ( + torch.tensor(v, dtype=dtype, device=device) + * torch.arange(18, device=device) + / 3 + * math.pi + ) a = a.to(dtype) # bfloat16 overflows if dtype == torch.bfloat16: @@ -1360,10 +1504,12 @@ def test_exp(self, device, dtype): self.compare_with_numpy(torch.exp, np.exp, a) if dtype.is_complex: - inf_real_zero_imag_in = torch.tensor(complex(float('inf'), 0), device=device, dtype=dtype) + inf_real_zero_imag_in = torch.tensor( + complex(float("inf"), 0), device=device, dtype=dtype + ) inf_real_zero_imag_out = torch.exp(inf_real_zero_imag_in).item() self.assertTrue(math.isinf(inf_real_zero_imag_out.real)) - if self.device_type == 'cpu': + if self.device_type == "cpu": pass # These are commented out because it cannot be consistently reproduced. # This is incorrect. It should be zero. Need fix! 
@@ -1377,16 +1523,20 @@ def test_exp(self, device, dtype): self.assertEqual(inf_real_zero_imag_out.imag, 0, atol=0, rtol=0) self.compare_with_numpy(torch.exp, np.exp, inf_real_zero_imag_in) - zero_real_inf_imag_in = torch.tensor(complex(0, float('inf')), device=device, dtype=dtype) + zero_real_inf_imag_in = torch.tensor( + complex(0, float("inf")), device=device, dtype=dtype + ) zero_real_inf_imag_out = torch.exp(zero_real_inf_imag_in).item() self.assertTrue(math.isnan(zero_real_inf_imag_out.real)) self.assertTrue(math.isnan(zero_real_inf_imag_out.imag)) # Ensure we are notified when NumPy changes its behavior self.compare_with_numpy(torch.exp, np.exp, zero_real_inf_imag_in) - inf_real_imag_in = torch.tensor(complex(float('inf'), float('inf')), device=device, dtype=dtype) + inf_real_imag_in = torch.tensor( + complex(float("inf"), float("inf")), device=device, dtype=dtype + ) inf_real_imag_out = torch.exp(inf_real_imag_in).item() - if self.device_type == 'cpu': + if self.device_type == "cpu": pass # This is incorrect. Need fix! https://github.com/pytorch/pytorch/issues/40590 # This is commented out because it cannot be consistently reproduced. @@ -1397,9 +1547,11 @@ def test_exp(self, device, dtype): self.assertTrue(math.isnan(inf_real_imag_out.imag)) self.compare_with_numpy(torch.exp, np.exp, inf_real_imag_in) - inf_real_nan_imag_in = torch.tensor(complex(float('inf'), float('nan')), device=device, dtype=dtype) + inf_real_nan_imag_in = torch.tensor( + complex(float("inf"), float("nan")), device=device, dtype=dtype + ) inf_real_nan_imag_out = torch.exp(inf_real_nan_imag_in).item() - if self.device_type == 'cpu': + if self.device_type == "cpu": pass # This is incorrect. It should be inf. Need fix! https://github.com/pytorch/pytorch/issues/40590 # This is commented out because it cannot be consistently reproduced. @@ -1410,7 +1562,9 @@ def test_exp(self, device, dtype): self.assertTrue(math.isnan(inf_real_nan_imag_out.imag)) self.compare_with_numpy(torch.exp, np.exp, inf_real_nan_imag_in) - nan_real_inf_imag_in = torch.tensor(complex(float('nan'), float('inf')), device=device, dtype=dtype) + nan_real_inf_imag_in = torch.tensor( + complex(float("nan"), float("inf")), device=device, dtype=dtype + ) nan_real_inf_imag_out = torch.exp(nan_real_inf_imag_in).item() self.assertTrue(math.isnan(nan_real_inf_imag_out.real)) self.assertTrue(math.isnan(nan_real_inf_imag_out.imag)) @@ -1420,5 +1574,5 @@ def test_exp(self, device, dtype): instantiate_device_type_tests(TestUnaryUfuncs, globals()) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_utils.py b/test/test_utils.py index c8f4e3aa9453..65583bcbaf63 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,9 +1,7 @@ -# Owner(s): ["high priority"] +# Owner(s): ["module: unknown"] import sys import os -import contextlib -import io import re import shutil import random @@ -18,10 +16,9 @@ import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential import torch.utils.cpp_extension -import torch.hub as hub from torch.autograd._functions.utils import check_onnx_broadcast from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import has_breakpad, load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, TEST_WITH_ASAN +from torch.testing._internal.common_utils import load_tests, IS_SANDCASTLE, IS_WINDOWS # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings @@ -262,6 +259,19 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) + @unittest.skipIf(not HAS_CUDA, 'No CUDA') + def test_checkpoint_not_preserve_rng_state_and_without_reentrant(self): + inp = torch.randn(2, device='cuda').requires_grad_() + layer = torch.nn.Dropout() + + def run_fn(input): + return layer(input) + + out = checkpoint(run_fn, inp, use_reentrant=False, preserve_rng_state=False) + out.sum().backward() + # This should run without error + + def test_checkpoint_non_tensor(self): def run_fn(tensor1, tensor2): @@ -411,12 +421,6 @@ def test_multi_drop(self): test_dir = os.path.abspath(os.path.dirname(str(__file__))) -class TestFFI(TestCase): - def test_deprecated(self): - with self.assertRaisesRegex(ImportError, "torch.utils.ffi is deprecated. Please use cpp extensions instead."): - from torch.utils.ffi import create_extension # type: ignore[attr-defined] # noqa: F401 - - @unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') class TestBottleneck(TestCase): def _run(self, command, timeout=30): @@ -584,146 +588,6 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): try_check_onnx_broadcast(dims1, dims2, True, False) -def sum_of_state_dict(state_dict): - s = 0 - for _, v in state_dict.items(): - s += v.sum() - return s - -SUM_OF_HUB_EXAMPLE = 431080 -TORCHHUB_EXAMPLE_RELEASE_URL = 'https://github.com/ailzhang/torchhub_example/releases/download/0.1/mnist_init_ones' - -@unittest.skipIf(IS_SANDCASTLE, 'Sandcastle cannot ping external') -class TestHub(TestCase): - @retry(Exception, tries=3) - def test_load_from_github(self): - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist', - source='github', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_from_local_dir(self): - local_dir = hub._get_cache_or_reload( - 'ailzhang/torchhub_example', force_reload=False) - hub_model = hub.load( - local_dir, - 'mnist', - source='local', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_from_branch(self): - hub_model = hub.load( - 'ailzhang/torchhub_example:ci/test_slash', - 'mnist', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_set_dir(self): - temp_dir = tempfile.gettempdir() - hub.set_dir(temp_dir) - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - assert os.path.exists(temp_dir + '/ailzhang_torchhub_example_master') - shutil.rmtree(temp_dir + '/ailzhang_torchhub_example_master') - - @retry(Exception, tries=3) - def test_list_entrypoints(self): - entry_lists = hub.list('ailzhang/torchhub_example', force_reload=True) - self.assertObjectIn('mnist', entry_lists) - - @retry(Exception, tries=3) - def test_download_url_to_file(self): - temp_file = os.path.join(tempfile.gettempdir(), 'temp') - hub.download_url_to_file(TORCHHUB_EXAMPLE_RELEASE_URL, temp_file, progress=False) - loaded_state = torch.load(temp_file) - self.assertEqual(sum_of_state_dict(loaded_state), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_state_dict_from_url(self): - loaded_state = 
hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL) - self.assertEqual(sum_of_state_dict(loaded_state), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_zip_checkpoint(self): - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist_zip', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - # Test the default zipfile serialization format produced by >=1.6 release. - @retry(Exception, tries=3) - def test_load_zip_1_6_checkpoint(self): - hub_model = hub.load( - 'ailzhang/torchhub_example', - 'mnist_zip_1_6', - pretrained=True, - verbose=False) - self.assertEqual(sum_of_state_dict(hub_model.state_dict()), - SUM_OF_HUB_EXAMPLE) - - - def test_hub_dir(self): - with tempfile.TemporaryDirectory('hub_dir') as dirname: - torch.hub.set_dir(dirname) - self.assertEqual(torch.hub.get_dir(), dirname) - - @retry(Exception, tries=3) - def test_hub_parse_repo_info(self): - # If the branch is specified we just parse the input and return - self.assertEqual( - torch.hub._parse_repo_info('a/b:c'), - ('a', 'b', 'c') - ) - # For torchvision, the default branch is main - self.assertEqual( - torch.hub._parse_repo_info('pytorch/vision'), - ('pytorch', 'vision', 'main') - ) - # For the torchhub_example repo, the default branch is still master - self.assertEqual( - torch.hub._parse_repo_info('ailzhang/torchhub_example'), - ('ailzhang', 'torchhub_example', 'master') - ) - - @retry(Exception, tries=3) - def test_load_state_dict_from_url_with_name(self): - with tempfile.TemporaryDirectory('hub_dir') as dirname: - torch.hub.set_dir(dirname) - file_name = 'test_file' - loaded_state = hub.load_state_dict_from_url(TORCHHUB_EXAMPLE_RELEASE_URL, file_name=file_name) - self.assertTrue(os.path.exists(os.path.join(dirname, 'checkpoints', file_name))) - self.assertEqual(sum_of_state_dict(loaded_state), - SUM_OF_HUB_EXAMPLE) - - @retry(Exception, tries=3) - def test_load_commit_from_forked_repo(self): - with self.assertRaisesRegex( - ValueError, - 'If it\'s a commit from a forked repo'): - model = torch.hub.load('pytorch/vision:4e2c216', 'resnet18', force_reload=True) - class TestHipify(TestCase): def test_import_hipify(self): from torch.utils.hipify import hipify_python # noqa: F401 @@ -757,32 +621,6 @@ def forward(self, x): ms(torch.tensor([False], dtype=torch.bool)) -class TestCrashHandler(TestCase): - @unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler") - @unittest.skipIf(not has_breakpad(), "Built without breakpad") - def test_python_exception_writing(self): - with tempfile.TemporaryDirectory() as temp_dir: - torch.utils._crash_handler.enable_minidumps(temp_dir) - torch.utils._crash_handler.enable_minidumps_on_exceptions() - - files = os.listdir(temp_dir) - self.assertEqual(len(files), 0) - - f = io.StringIO() - with contextlib.redirect_stderr(f): - try: - @torch.jit.script - def x(i: int): - return i + "2" # type: ignore[operator] - except RuntimeError as e: - pass - - files = os.listdir(temp_dir) - self.assertEqual(len(files), 1) - self.assertTrue(files[0].endswith(".dmp")) - torch.utils._crash_handler.disable_minidumps() - - @unittest.skipIf(IS_SANDCASTLE, "cpp_extension is OSS only") class TestStandaloneCPPJIT(TestCase): def test_load_standalone(self): diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 2678db1d74d5..424a31e61d24 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -11,12 +11,12 @@ from torch.testing import make_tensor from 
torch.testing._internal.common_utils import ( TestCase, run_tests, suppress_warnings, gradcheck, gradgradcheck, - torch_to_numpy_dtype_dict, + numpy_to_torch_dtype_dict, ) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes) + (instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes, skipMeta) from torch.testing._internal.common_dtype import ( - get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes + all_types_and_complex_and, complex_types, all_types_and, floating_and_complex_types_and, ) # TODO: replace this with make_tensor() in common_utils.py @@ -121,26 +121,26 @@ def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): else: return x.transpose(dim0, dim1) - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_conj_self(self, device, dtype): t = torch.ones(5, 5, device=device) s = t.conj() self.assertTrue(s is t) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_view_dtype_new(self, device, dtype): - dtypes = torch_to_numpy_dtype_dict.copy() + dtypes = {value : key for (key, value) in numpy_to_torch_dtype_dict.items()} del dtypes[torch.bool] def generate_inputs(): - yield make_tensor((4, 4, 64), device, dtype, low=-5, high=5) - yield make_tensor((4, 4, 64), device, dtype, low=-5, high=5).permute(1, 0, 2) - yield make_tensor((4, 64, 4), device, dtype, low=-5, high=5).permute(2, 0, 1) - yield make_tensor((1, 5, 1), device, dtype, low=-5, high=5).expand(5, 5, 64) - yield make_tensor((2, 5, 256), device, dtype, low=-5, high=5)[1::2, 1:, ::2] - yield make_tensor((0, 5, 64), device, dtype, low=-5, high=5) - yield make_tensor((), device, dtype, low=-5, high=5) + yield make_tensor((4, 4, 64), dtype=dtype, device=device, low=-5, high=5) + yield make_tensor((4, 4, 64), dtype=dtype, device=device, low=-5, high=5).permute(1, 0, 2) + yield make_tensor((4, 64, 4), dtype=dtype, device=device, low=-5, high=5).permute(2, 0, 1) + yield make_tensor((1, 5, 1), dtype=dtype, device=device, low=-5, high=5).expand(5, 5, 64) + yield make_tensor((2, 5, 256), dtype=dtype, device=device, low=-5, high=5)[1::2, 1:, ::2] + yield make_tensor((0, 5, 64), dtype=dtype, device=device, low=-5, high=5) + yield make_tensor((), dtype=dtype, device=device, low=-5, high=5) def calc_expected_size_and_stride(a, view_dtype): dtype_size = torch._utils._element_size(a.dtype) @@ -210,24 +210,24 @@ def calc_expected_size_and_stride(a, view_dtype): # because view(dtype) does not support backward yet # TODO: Remove this when autograd support is added if dtype.is_floating_point or dtype.is_complex: - for view_dtype in [*get_all_fp_dtypes(), *get_all_complex_dtypes()]: - t = make_tensor((5, 5, 64), device, dtype, low=-5, high=5, requires_grad=True) + for view_dtype in floating_and_complex_types_and(torch.half, torch.bfloat16): + t = make_tensor((5, 5, 64), dtype=dtype, device=device, low=-5, high=5, requires_grad=True) self.assertFalse(t.view(view_dtype).requires_grad) # Test the extra error checks that happen when the view dtype # has a greater element size than the original dtype @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_dtype_upsize_errors(self, device, dtype): dtype_size = torch._utils._element_size(dtype) - for view_dtype in get_all_dtypes(): + for 
view_dtype in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): view_dtype_size = torch._utils._element_size(view_dtype) if view_dtype_size <= dtype_size: continue size_ratio = view_dtype_size // dtype_size - a = make_tensor((4, 4, size_ratio + 1), device, dtype, low=-5, high=5) + a = make_tensor((4, 4, size_ratio + 1), dtype=dtype, device=device, low=-5, high=5) with self.assertRaisesRegex( RuntimeError, rf"self.size\(-1\) must be divisible by {size_ratio}"): @@ -238,7 +238,7 @@ def test_view_dtype_upsize_errors(self, device, dtype): rf"self.storage_offset\(\) must be divisible by {size_ratio}"): a[:, :, 1:].view(view_dtype) - a = make_tensor((4, 4, size_ratio), device, dtype, low=-5, high=5) + a = make_tensor((4, 4, size_ratio), dtype=dtype, device=device, low=-5, high=5) a = a.as_strided((4, 4, size_ratio), (size_ratio, 1, 1)) with self.assertRaisesRegex( RuntimeError, @@ -302,7 +302,7 @@ def fn(contiguous_input=True, dim0=0, dim1=1): self.assertEqual(res.shape, torch.Size([0])) @onlyNativeDeviceTypes - @dtypes(*get_all_complex_dtypes(include_complex32=True)) + @dtypes(*complex_types(), torch.complex32) def test_view_as_real(self, device, dtype): def fn(contiguous_input=True): t = torch.randn(3, 4, dtype=dtype, device=device) @@ -310,11 +310,7 @@ def fn(contiguous_input=True): res = torch.view_as_real(input) self.assertEqual(res[:, :, 0], input.real) self.assertEqual(res[:, :, 1], input.imag) - # TODO: Add torch.ComplexHalfStorage - if dtype != torch.complex32: - self.assertTrue(self.is_view_of(t, res)) - else: - self.assertRaises(RuntimeError, lambda: self.is_view_of(t, res)) + self.assertTrue(self.is_view_of(t, res)) fn() fn(contiguous_input=False) @@ -322,27 +318,19 @@ def fn(contiguous_input=True): # tensor with zero elements x = torch.tensor([], dtype=dtype, device=device) res = torch.view_as_real(x) - # TODO: Add torch.ComplexHalfStorage - if dtype != torch.complex32: - self.assertTrue(self.is_view_of(x, res)) - else: - self.assertRaises(RuntimeError, lambda: self.is_view_of(x, res)) + self.assertTrue(self.is_view_of(x, res)) self.assertEqual(res.shape, torch.Size([0, 2])) # tensor with zero dim x = torch.tensor(2 + 3j, dtype=dtype, device=device) res = torch.view_as_real(x) - # TODO: Add torch.ComplexHalfStorage - if dtype != torch.complex32: - self.assertTrue(self.is_view_of(x, res)) - else: - self.assertRaises(RuntimeError, lambda: self.is_view_of(x, res)) + self.assertTrue(self.is_view_of(x, res)) self.assertEqual(res.shape, torch.Size([2])) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_split(self, device, dtype): - a = make_tensor((40, 30), device, dtype, low=-9, high=9) + a = make_tensor((40, 30), dtype=dtype, device=device, low=-9, high=9) a_split_dim0 = a.tensor_split(7, 0) for a_split_dim0_tensor in a_split_dim0: self.assertTrue(self.is_view_of(a, a_split_dim0_tensor)) @@ -351,9 +339,9 @@ def test_view_tensor_split(self, device, dtype): self.assertTrue(self.is_view_of(a, a_split_dim1_tensor)) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_hsplit(self, device, dtype): - t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) + t = make_tensor((4, 4, 4), dtype=dtype, device=device, low=-9, high=9) t_hsplit = torch.hsplit(t, 2) for t_hsplit_tensor in t_hsplit: self.assertTrue(self.is_view_of(t, t_hsplit_tensor)) @@ -361,9 +349,9 @@ def 
test_view_tensor_hsplit(self, device, dtype): self.assertEqual(t_hsplit[1][2, 0, 2], t[2, 2, 2]) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_vsplit(self, device, dtype): - t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) + t = make_tensor((4, 4, 4), dtype=dtype, device=device, low=-9, high=9) t_vsplit = torch.vsplit(t, 2) for t_vsplit_tensor in t_vsplit: self.assertTrue(self.is_view_of(t, t_vsplit_tensor)) @@ -371,9 +359,9 @@ def test_view_tensor_vsplit(self, device, dtype): self.assertEqual(t_vsplit[1][0, 2, 2], t[2, 2, 2]) @onlyNativeDeviceTypes - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_view_tensor_dsplit(self, device, dtype): - t = make_tensor((4, 4, 4), device, dtype, low=-9, high=9) + t = make_tensor((4, 4, 4), dtype=dtype, device=device, low=-9, high=9) t_dsplit = torch.dsplit(t, 2) for t_dsplit_tensor in t_dsplit: self.assertTrue(self.is_view_of(t, t_dsplit_tensor)) @@ -381,7 +369,7 @@ def test_view_tensor_dsplit(self, device, dtype): self.assertEqual(t_dsplit[1][2, 2, 0], t[2, 2, 2]) @onlyNativeDeviceTypes - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) + @dtypes(*all_types_and(torch.half, torch.bfloat16)) def test_imag_noncomplex(self, device, dtype): t = torch.ones((5, 5), dtype=dtype, device=device) @@ -389,7 +377,7 @@ def test_imag_noncomplex(self, device, dtype): torch.imag(t) @onlyNativeDeviceTypes - @dtypes(*get_all_complex_dtypes()) + @dtypes(*complex_types()) def test_real_imag_view(self, device, dtype): def compare_with_numpy(contiguous_input=True): t = torch.randn(3, 3, dtype=dtype, device=device) @@ -420,7 +408,7 @@ def compare_with_numpy(contiguous_input=True): self.assertEqual(a[5:].imag, a.imag[5:]) @onlyNativeDeviceTypes - @dtypes(*get_all_complex_dtypes()) + @dtypes(*complex_types()) def test_conj_imag_view(self, device, dtype) -> None: t = _make_tensor((4, 5,), dtype, device) t_numpy_conj = torch.from_numpy(t.cpu().numpy().conj()).to(device=device) @@ -445,7 +433,7 @@ def test_conj_view_with_shared_memory(self, device) -> None: self.assertEqual(torch.add(b, c), b.add_(c)) @onlyNativeDeviceTypes - @dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) + @dtypes(*product(complex_types(), all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))) @suppress_warnings def test_set_real_imag(self, device, dtypes): x = torch.randn(10, dtype=dtypes[0], device=device) @@ -729,6 +717,7 @@ def test_contiguous_self(self, device): s = t.contiguous() self.assertTrue(s is t) + @skipMeta def test_contiguous_nonview(self, device): t = torch.ones(5, 5, device=device) nv = t.t().contiguous() @@ -754,6 +743,7 @@ def test_reshape_as_view(self, device): v[6] = 0 self.assertEqual(t[1, 1], v[6]) + @skipMeta def test_reshape_nonview(self, device): t = torch.ones(5, 5, device=device) nv = torch.reshape(t.t(), (25,)) @@ -806,7 +796,8 @@ def assert_is_nonview(t, nv): idx_nv = (0,) * nv.ndim self.assertTrue(not nv._is_view()) nv[idx_nv] = 0 - self.assertNotEqual(t[idx_t], nv[idx_nv]) + if device != "meta": + self.assertNotEqual(t[idx_t], nv[idx_nv]) t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) nv = t.flatten(1, 3) assert_is_nonview(t, nv) @@ -905,6 +896,43 @@ def run_test(device, op): op = partial(fn, source=0, destination=1) run_test(device, op) + # Testing that the generated view_copy kernel and its derivative are implemented correctly + def test_view_copy(self, 
device): + a = torch.randn(4, device=device, requires_grad=True) + a_ref = a.clone().detach().requires_grad_() + a_view = a_ref.view(2, 2) + a_view_copy = torch.view_copy(a, (2, 2)) + + # view_copy ops don't preserve view relationship + self.assertTrue(self.is_view_of(a_ref, a_view)) + self.assertFalse(self.is_view_of(a, a_view_copy)) + + a_view_copy.sum().backward() + a_view.sum().backward() + + # forward and backward give the same shape + result + self.assertEqual(a_view_copy, a_view) + self.assertEqual(a.grad, a_ref.grad) + + def test_view_copy_out(self, device): + a = torch.randn(2, 2, device=device) + out = torch.empty(2, device=device) + + torch.diagonal_copy(a, out=out) + expected = torch.diagonal_copy(a) + + self.assertEqual(expected, out) + + a = torch.randn(4, device=device) + out1 = torch.empty(2, device=device) + out2 = torch.empty(2, device=device) + + torch.split_copy(a, 2, out=(out1, out2)) + expected1, expected2 = torch.split_copy(a, 2) + + self.assertEqual(expected1, out1) + self.assertEqual(expected2, out2) + class TestOldViewOps(TestCase): def test_ravel(self, device): @@ -914,29 +942,38 @@ def _test_ravel(tensors, size, nc=False): flat = src.ravel() self.assertEqual(flat.shape, torch.Size([size])) self.assertEqual(src.view(-1), flat) - self.assertEqual(flat._base, src) + self.assertIs(flat._base, src) + self.assertTrue(flat.is_contiguous()) # Non-contiguous Tensor -> Copy if nc: nc_src = src.t() nc_flat = nc_src.ravel() self.assertEqual(nc_flat.shape, torch.Size([size])) - self.assertEqual(nc_src.reshape(-1), nc_flat) - self.assertTrue(nc_flat._base != nc_src) + self.assertEqual(nc_src.contiguous().view(-1), nc_flat) + self.assertIsNot(nc_flat._base, src) + self.assertTrue(nc_flat.is_contiguous()) # Test that flatten returns 1-dim tensor when given a 0-dim tensor zero_dim_tensor = torch.tensor(123, device=device) flat0 = zero_dim_tensor.ravel() one_dim_tensor = torch.tensor([123], device=device) flat1 = zero_dim_tensor.ravel() + nc_ones_tensor = torch.ones(10, device=device)[::2] + flat2 = nc_ones_tensor.ravel() self.assertEqual(zero_dim_tensor.shape, torch.Size([])) self.assertEqual(flat0.shape, torch.Size([1])) self.assertEqual(one_dim_tensor.shape, torch.Size([1])) self.assertEqual(flat1.shape, torch.Size([1])) + self.assertEqual(nc_ones_tensor.shape, torch.Size([5])) + self.assertEqual(flat2.shape, torch.Size([5])) self.assertEqual(flat0, one_dim_tensor) self.assertEqual(flat0, flat1) self.assertEqual(flat0.shape, flat1.shape) + self.assertTrue(flat0.is_contiguous()) + self.assertTrue(flat1.is_contiguous()) + self.assertTrue(flat2.is_contiguous()) # Test both float tensor and quantized tensor tensors = [torch.randn(5, 5, 5, 5, device=device), @@ -1027,7 +1064,9 @@ def test_reshape(self, device): self.assertRaises(RuntimeError, lambda: x.reshape(-1, -1)) y = torch.randn(4, 4, 4, device=device)[:, 0, :] - self.assertNotEqual(y.data_ptr(), y.reshape(-1).data_ptr()) + # .data_ptr() on meta tensors is always 0 so they are equal regardless of the reshape + if device != "meta": + self.assertNotEqual(y.data_ptr(), y.reshape(-1).data_ptr()) self.assertEqual(y.contiguous().view(-1), y.reshape(-1)) self.assertEqual(y.reshape(2, 2, 4).data_ptr(), y.data_ptr()) @@ -1250,7 +1289,7 @@ def test_T(self, device): scalar = torch.tensor(5, device=device) self.assertEqual(scalar, scalar.T) - @dtypes(*(torch.testing.get_all_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_transposes(self, device, dtype): for op in ("T", "H", "mT", "mH",
"adjoint"): shapes = ((), (2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((), (2, 3),) @@ -1266,7 +1305,7 @@ def test_transposes(self, device, dtype): t2 = t2.conj() self.assertEqual(t2, t1) - @dtypes(*(torch.testing.get_all_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_transposes_errors(self, device, dtype): for op in ("H", "mT", "mH", "adjoint"): shapes = ((2,), (2, 3, 4)) if op == "H" else ((2,),) @@ -1392,8 +1431,7 @@ def _test_atleast_dim(self, torch_fn, np_fn, device, dtype): self.assertEqual(np_res, torch_res) # TODO: are these view ops? - @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + - get_all_complex_dtypes())) + @dtypes(*all_types_and_complex_and(torch.half)) def test_atleast(self, device, dtype): self._test_atleast_dim(torch.atleast_1d, np.atleast_1d, device, dtype) self._test_atleast_dim(torch.atleast_2d, np.atleast_2d, device, dtype) @@ -1457,8 +1495,80 @@ def test_broadcast_shapes(self, device): actual = torch.broadcast_shapes(s0, s1) self.assertEqual(expected, actual) + inputs_list = [[1, 4], [4, 1], [1, 1, 3]] + for integral_inputs in inputs_list: + res1 = torch.broadcast_shapes(*integral_inputs) + res2 = torch.broadcast_tensors(*map(torch.empty, integral_inputs))[0].shape + self.assertEqual(res1, res2) + + inputs_with_neg_vals = [[1, 1, -12], [-1, 1], [-11, ]] + for integral_inputs_with_neg_vals in inputs_with_neg_vals: + with self.assertRaisesRegex(RuntimeError, "Trying to create tensor with negative dimension"): + torch.broadcast_shapes(*integral_inputs_with_neg_vals) + + integral_inputs_error_case = [(3, 5), (2, 4, 1)] + for error_input in integral_inputs_error_case: + with self.assertRaisesRegex(RuntimeError, "Shape mismatch: objects cannot be broadcast to a single shape"): + torch.broadcast_shapes(*error_input) + + negative_inputs = [(-1,), (1, -12), (4, -11), (-4, 1), (1, 1, -2)] + for s0 in negative_inputs: + with self.assertRaisesRegex(RuntimeError, "Trying to create tensor with negative dimension"): + torch.broadcast_shapes(s0) + + for s1 in negative_inputs: + with self.assertRaisesRegex(RuntimeError, "Trying to create tensor with negative dimension"): + torch.broadcast_shapes(s0, s1) + + float_inputs_error_case = [(1.1, 2.0), (1.1, 1.0)] + for error_case in float_inputs_error_case: + for float_input in error_case: + with self.assertRaisesRegex(RuntimeError, "Input shapes " + "should be of type ints, a tuple of ints, or a list of ints"): + torch.broadcast_shapes(float_input) + + diff_input_types = [(1, (5,)), (3, (1,)), (1, (3, 4))] + for s0 in diff_input_types: + res1 = torch.broadcast_shapes(*s0) + res2 = torch.broadcast_tensors(*map(torch.empty, s0))[0].shape + self.assertEqual(res1, res2) + + @unittest.skipIf(np.__version__ < '1.20', + "NumPy does not support broadcast_shapes before the 1.20 version") + @onlyCPU + def test_broadcast_shapes_numpy_ref(self, device): + examples = [(), (1,), (2,), (1, 1), (3, 1), (3, 2), (4, 1, 1), (4, 3, 2)] + for s0 in examples: + x0 = torch.randn(s0) + actual = torch.broadcast_shapes(s0) + numpy_expected = np.broadcast_shapes(s0) + self.assertEqual(actual, numpy_expected) + + for s1 in examples: + x1 = torch.randn(s1) + actual = torch.broadcast_shapes(s0, s1) + numpy_expected = np.broadcast_shapes(s0, s1) + self.assertEqual(actual, numpy_expected) + + inputs_list = [[1, 4], [4, 1], [1, 1, 3]] + for integral_inputs in inputs_list: + res1 = torch.broadcast_shapes(*integral_inputs) + res2_numpy = np.broadcast_shapes(*integral_inputs) + 
self.assertEqual(res1, res2_numpy) + + for list_inputs in inputs_list: + res1 = torch.broadcast_shapes(list_inputs) + res2 = np.broadcast_shapes(list_inputs) + self.assertEqual(res1, res2) + + diff_input_types = [(1, (5,)), (3, (1,)), (1, (3, 4))] + for s0 in diff_input_types: + res1 = torch.broadcast_shapes(*s0) + res2_numpy = np.broadcast_shapes(*s0) + self.assertEqual(res1, res2_numpy) + # Skip BFloat16 since numpy does not support it - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_broadcast_to(self, device, dtype): def can_broadcast(s0, s1): # s0.dim() <= s1.dim(), reverse s0 and s1 to compare trailing dimension @@ -1473,7 +1583,7 @@ def can_broadcast(s0, s1): (), (1,), (2,), (1, 1), (3, 1), (3, 2), (4, 1, 1), (4, 3, 2) ) for s0, s1 in combinations(sizes, r=2): - t = make_tensor(s0, device, dtype, low=-9, high=9) + t = make_tensor(s0, dtype=dtype, device=device, low=-9, high=9) t_np = t.cpu().numpy() if can_broadcast(s0, s1): @@ -1561,9 +1671,9 @@ def test_view(self, device): self.assertEqual(tensor.view(6, 2, 1), contig_tensor.view(6, 2, 1)) self.assertEqual(tensor.view(1, 6, 2, 1), contig_tensor.view(1, 6, 2, 1)) - @dtypes(*get_all_dtypes()) + @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)) def test_reshape_view_semantics(self, device, dtype): - tensor = make_tensor((15, 4), device, dtype) + tensor = make_tensor((15, 4), dtype=dtype, device=device) target = (20, 3) # Cases where the tensor can be returned as a view. @@ -1588,7 +1698,7 @@ def test_contiguous(self, device): @onlyNativeDeviceTypes # Skip BFloat16 since numpy does not support it - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_tensor_split_sections(self, device, dtype): input_sizes = [ (0,), @@ -1599,7 +1709,7 @@ def test_tensor_split_sections(self, device, dtype): (12, 3), ] for input_size in input_sizes: - a_base = make_tensor(input_size, device, dtype, low=-9, high=9) + a_base = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) # Run tests on transposed input if it has at least 2 dims for a in [a_base, a_base.t()] if a_base.dim() > 2 else [a_base]: a_n = a.cpu().numpy() @@ -1619,7 +1729,7 @@ def test_tensor_split_sections(self, device, dtype): @onlyNativeDeviceTypes # Skip BFloat16 since numpy does not support it - @dtypes(*get_all_dtypes(include_bfloat16=False)) + @dtypes(*all_types_and_complex_and(torch.half, torch.bool)) def test_tensor_split_indices(self, device, dtype): input_sizes = [ (0,), @@ -1642,7 +1752,7 @@ def test_tensor_split_indices(self, device, dtype): (1, 5, 2, 8), ] for input_size in input_sizes: - a_base = make_tensor(input_size, device, dtype, low=-9, high=9) + a_base = make_tensor(input_size, dtype=dtype, device=device, low=-9, high=9) # Run tests on transposed input if it has at least 2 dims for a in [a_base, a_base.t()] if a_base.dim() > 2 else [a_base]: a_n = a.cpu().numpy() @@ -1698,20 +1808,28 @@ def test_tensor_split_errors(self, device): def test_resize_all_dtypes_and_devices(self, device): shape = (2, 2) - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) x.resize_(shape) self.assertEqual(shape, x.shape) def test_resize_as_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): x = 
torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) y = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=dt, device=device) x.resize_as_(y) self.assertEqual(y.shape, x.shape) + @onlyNativeDeviceTypes + def test_resize_overflow(self, device): + x = torch.empty((), dtype=torch.float64) + with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'): + x.resize_([2, 4, 2**29, 2**29]) + with self.assertRaisesRegex(RuntimeError, 'overflow'): + x.resize_([8, 8, 2**29, 2**29]) + def test_view_all_dtypes_and_devices(self, device): - for dt in get_all_dtypes(): + for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool): x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device) self.assertEqual(x.view(6).shape, [6]) diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index 38ee3bbfdfa4..9e510d1715b1 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -21,6 +21,7 @@ " Please build with USE_XNNPACK=1.") @unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKOps(TestCase): + @unittest.skip("Fails on some platforms, see https://github.com/pytorch/pytorch/issues/73488") @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), weight_output_dim=st.integers(2, 64), @@ -53,7 +54,6 @@ def test_linear_1d_input(self, input_size, weight_output_dim, use_bias): output_linearprepacked = torch.ops.prepacked.linear_clamp_run(input_data, packed_weight_bias) torch.testing.assert_close(ref_result, output_linearprepacked, rtol=1e-2, atol=1e-3) - @given(batch_size=st.integers(0, 3), input_channels_per_group=st.integers(1, 32), height=st.integers(5, 64), @@ -183,6 +183,7 @@ def test_conv2d_transpose(self, " Please build with USE_XNNPACK=1.") @unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. 
Does not seem to have a good reason for failures.") class TestXNNPACKSerDes(TestCase): + @unittest.skip("Fails on some platforms, see https://github.com/pytorch/pytorch/issues/73488") @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), weight_output_dim=st.integers(2, 64), @@ -437,6 +438,7 @@ def forward(self, x): xnnpack_result = deserialized_conv2d_clamp_prepacked(input_data) torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3) + @unittest.skip("Fails on some platforms, see https://github.com/pytorch/pytorch/issues/73488") @given(batch_size=st.integers(0, 3), input_channels_per_group=st.integers(1, 32), height=st.integers(5, 64), diff --git a/test/typing/reveal/namedtuple.py b/test/typing/reveal/namedtuple.py index 8a0508b325c5..2e130338f0b9 100644 --- a/test/typing/reveal/namedtuple.py +++ b/test/typing/reveal/namedtuple.py @@ -7,9 +7,9 @@ t_sort[0][0, 0] == 1.5 # noqa: B015 t_sort.indices[0, 0] == 1 # noqa: B015 t_sort.values[0, 0] == 1.5 # noqa: B015 -reveal_type(t_sort) # E: Tuple[{Tensor}, {Tensor}, fallback=torch._C.namedtuple_values_indices] +reveal_type(t_sort) # E: Tuple[{Tensor}, {Tensor}, fallback=torch.return_types.sort] t_qr = torch.linalg.qr(t) t_qr[0].shape == [2, 2] # noqa: B015 t_qr.Q.shape == [2, 2] # noqa: B015 -reveal_type(t_qr) # E: Tuple[{Tensor}, {Tensor}, fallback=torch._C._VariableFunctions.namedtuple_Q_R] +reveal_type(t_qr) # E: Tuple[{Tensor}, {Tensor}, fallback=torch.return_types.qr] diff --git a/third_party/BUILD.buck b/third_party/BUILD.buck new file mode 100644 index 000000000000..cb28c744b54a --- /dev/null +++ b/third_party/BUILD.buck @@ -0,0 +1,221 @@ +load("//third_party:glog.buck.bzl", "define_glog") +load("//third_party:xnnpack.buck.bzl", "define_xnnpack") + +define_glog() + +define_xnnpack() + +cxx_library( + name = "fmt", + srcs = ['fmt/src/format.cc'], + deps = [], + compiler_flags = ['-w', '-Wno-error=format-zero-length', '-Wno-error=vla', '-Wno-incompatible-pointer-types-discards-qualifiers', '-Wno-unused-label', '-Wno-deprecated-declarations', '-Wno-implicit-function-declaration', '-Wno-error', '-Wno-non-pod-varargs', '-Wno-format-security', '-Wno-c++11-narrowing', '-Wno-ignored-attributes', '-Wno-return-std-move', '-Wno-shadow', '-Wno-sign-compare', '-Wno-switch', '-Wno-undef', '-Wno-uninitialized', '-Wno-unknown-pragmas', '-Wno-unknown-warning-option', '-Wno-unused-function', '-Wno-unused-local-typedef', '-Wno-unused-value', '-Wno-unused-variable', '-Wno-register', '-Wno-format', '-Wno-unused-lambda-capture', '-Wno-missing-braces', '-Wno-unused-parameter', '-Wno-unreachable-code', '-Wno-inconsistent-missing-destructor-override', '-Wno-implicit-fallthrough', '-Wno-ignored-qualifiers', '-Wno-pedantic', '-Wno-deprecated-copy', '-Wno-non-virtual-dtor', '-Wno-null-pointer-arithmetic', '-Wno-implicit-const-int-float-conversion', '-Wno-tautological-unsigned-enum-zero-compare', '-Wno-embedded-directive', '-Wno-int-conversion', '-Wno-nonnull', '-Wno-variadic-macros', '-Wno-zero-length-array', '-Wno-missing-prototypes', '-fno-exceptions', '-fno-rtti', '-Wno-braced-scalar-init', '-fvisibility-inlines-hidden'], + preferred_linkage = "static", + exported_preprocessor_flags = ['-DFMT_EXCEPTIONS=0'], + header_namespace = "third_party/fmt", + public_system_include_directories = ['fmt/include'], + raw_headers = glob(["fmt/include/fmt/*.h"]), + soname = "libthird-party_fmt_fmt.$(ext)", + visibility = ['PUBLIC'], +) + +cxx_library( + name = "pthreadpool", + srcs = ['pthreadpool/src/legacy-api.c', 
'pthreadpool/src/memory.c', 'pthreadpool/src/portable-api.c', 'pthreadpool/src/pthreads.c'], + deps = [ + ":FXdiv", + ":pthreadpool_header", + ], + compiler_flags = [ + "-w", + "-Os", + "-fstack-protector-strong", + "-fno-delete-null-pointer-checks" + ], + headers = { + 'threadpool-atomics.h': 'pthreadpool/src/threadpool-atomics.h', + 'threadpool-common.h': 'pthreadpool/src/threadpool-common.h', + 'threadpool-object.h': 'pthreadpool/src/threadpool-object.h', + 'threadpool-utils.h': 'pthreadpool/src/threadpool-utils.h', + }, + header_namespace = "", + preferred_linkage = "static", + link_whole = False, + platform_preprocessor_flags = [['windows', ['-D_WINDOWS', '-D_WIN32', '-DWIN32', '-DNOMINMAX', '-D_CRT_SECURE_NO_WARNINGS', '-D_USE_MATH_DEFINES']], ['windows.*64$', ['-D_WIN64']]], + preprocessor_flags = ['-DPTHREADPOOL_USE_FUTEX=0', '-DPTHREADPOOL_USE_GCD=0'], + reexport_all_header_dependencies = True, + visibility = ['PUBLIC'], +) + +cxx_library( + name = "pthreadpool_header", + header_namespace = "", + exported_headers = { + "pthreadpool.h": "pthreadpool/include/pthreadpool.h", + }, + reexport_all_header_dependencies = True, + visibility = ["PUBLIC"], +) + +cxx_library( + name = "FXdiv", + header_namespace = "", + exported_headers = { + "fxdiv.h": "FXdiv/include/fxdiv.h", + }, + reexport_all_header_dependencies = True, + visibility = ["PUBLIC"], +) + +cxx_library( + name = "psimd", + header_namespace = "", + exported_headers = { + "psimd.h": "psimd/include/psimd.h", + }, + preferred_linkage = "static", + visibility = ["PUBLIC"], +) + +cxx_library( + name = "cpuinfo", + srcs = [ + "cpuinfo/wrappers/api.c", + "cpuinfo/wrappers/arm/android/properties.c", + "cpuinfo/wrappers/arm/cache.c", + "cpuinfo/wrappers/arm/linux/aarch32-isa.c", + "cpuinfo/wrappers/arm/linux/aarch64-isa.c", + "cpuinfo/wrappers/arm/linux/chipset.c", + "cpuinfo/wrappers/arm/linux/clusters.c", + "cpuinfo/wrappers/arm/linux/cpuinfo.c", + "cpuinfo/wrappers/arm/linux/hwcap.c", + "cpuinfo/wrappers/arm/linux/init.c", + "cpuinfo/wrappers/arm/linux/midr.c", + "cpuinfo/wrappers/arm/mach/init.c", + "cpuinfo/wrappers/arm/uarch.c", + "cpuinfo/wrappers/cache.c", + "cpuinfo/wrappers/init.c", + "cpuinfo/wrappers/linux/cpulist.c", + "cpuinfo/wrappers/linux/multiline.c", + "cpuinfo/wrappers/linux/processors.c", + "cpuinfo/wrappers/linux/smallfile.c", + "cpuinfo/wrappers/mach/topology.c", + "cpuinfo/wrappers/x86/cache/descriptor.c", + "cpuinfo/wrappers/x86/cache/deterministic.c", + "cpuinfo/wrappers/x86/cache/init.c", + "cpuinfo/wrappers/x86/info.c", + "cpuinfo/wrappers/x86/init.c", + "cpuinfo/wrappers/x86/isa.c", + "cpuinfo/wrappers/x86/linux/cpuinfo.c", + "cpuinfo/wrappers/x86/linux/init.c", + "cpuinfo/wrappers/x86/mach/init.c", + "cpuinfo/wrappers/x86/name.c", + "cpuinfo/wrappers/x86/topology.c", + "cpuinfo/wrappers/x86/uarch.c", + "cpuinfo/wrappers/x86/vendor.c", + "cpuinfo/wrappers/x86/windows/init.c", + ], + include_directories = ["cpuinfo/src"], + public_include_directories = ["cpuinfo/include"], + raw_headers = glob([ + "cpuinfo/src/**/*.h", + "cpuinfo/src/**/*.c", + ]), + preferred_linkage = "static", + preprocessor_flags = [ + "-DCPUINFO_LOG_LEVEL=2", + "-D_GNU_SOURCE=1", + ], + visibility = ["PUBLIC"], + deps = [ + ":clog", + ], +) + +cxx_library( + name = "clog", + srcs = [ + "cpuinfo/deps/clog/src/clog.c", + ], + raw_headers = glob([ + "cpuinfo/deps/clog/include/*.h", + ]), + public_include_directories = [ + "cpuinfo/deps/clog/include/", + ], + force_static = True, + visibility = ["PUBLIC"], +) + +cxx_library( + name = 
"FP16", + raw_headers = glob([ + "FP16/include/*.h", + ]), + public_include_directories = [ + "FP16/include/", + ], + force_static = True, + visibility = ["PUBLIC"], +) + + +cxx_library( + name = "miniz", + srcs = ["miniz-2.0.8/miniz.c"], + header_namespace = "", + exported_headers = {"miniz.h": "miniz-2.0.8/miniz.h"}, + exported_preprocessor_flags = [ + "-DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS", + ], + visibility = ["PUBLIC"], +) + +remote_file( + name = "typing-extensions-download", + url = "https://files.pythonhosted.org/packages/75/e1/932e06004039dd670c9d5e1df0cd606bf46e29a28e65d5bb28e894ea29c9/typing_extensions-4.2.0-py3-none-any.whl", + sha1 = "ff0849420e94f425818bff5d0f25e3cdfaba8601", + out = "typing_extensions-4.2.0-py3-none-any.whl", +) + +prebuilt_python_library( + name = "typing-extensions", + binary_src = ":typing-extensions-download", + visibility = ["PUBLIC"], + deps = [":typing-extensions-download"], +) + +remote_file( + name = "pyyaml-download", + url = "https://files.pythonhosted.org/packages/12/fc/a4d5a7554e0067677823f7265cb3ae22aed8a238560b5133b58cda252dad/PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", + sha1 = "11aa9c5fe2d890b6a73212beadc7c8a4265ebc39", + out = "pyyaml.whl", +) + +prebuilt_python_library( + name = "pyyaml", + binary_src = ":pyyaml-download", + visibility = ["PUBLIC"], + deps = [":pyyaml-download"], +) + +cxx_library( + name = "ruy_lib", + srcs = glob( + ["ruy/**/*.cc"], + exclude = [ + "ruy/ruy/test_*.cc", + "ruy/ruy/*_test.cc", + "ruy/example/*.cc", + "ruy/ruy/profiler/test.cc", + "ruy/ruy/benchmark.cc", + ], + ), + compiler_flags = ["-Os"], + preferred_linkage = "static", + public_include_directories = ["ruy"], + raw_headers = glob(["ruy/**/*.h"]), + visibility = [ + "PUBLIC", + ], +) diff --git a/third_party/LICENSES_BUNDLED.txt b/third_party/LICENSES_BUNDLED.txt index c1c9a1783964..9b61374c0aa7 100644 --- a/third_party/LICENSES_BUNDLED.txt +++ b/third_party/LICENSES_BUNDLED.txt @@ -6,11 +6,21 @@ License: MIT Files: third_party/FP16 For details, see third_party/FP16/LICENSE +Name: FP16-source +License: MIT +Files: third_party/XNNPACK/build/FP16-source + For details, see third_party/XNNPACK/build/FP16-source/LICENSE + Name: FXdiv License: MIT Files: third_party/FXdiv For details, see third_party/FXdiv/LICENSE +Name: FXdiv-source +License: MIT +Files: third_party/XNNPACK/build/FXdiv-source + For details, see third_party/XNNPACK/build/FXdiv-source/LICENSE + Name: NNPACK License: BSD-2-Clause Files: third_party/NNPACK @@ -29,22 +39,36 @@ Files: third_party/XNNPACK Name: benchmark License: Apache-2.0 Files: third_party/benchmark, - third_party/protobuf/third_party/benchmark, + third_party/onnx/third_party/benchmark, third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark, - third_party/onnx/third_party/benchmark + third_party/protobuf/third_party/benchmark For details, see third_party/benchmark/LICENSE, - third_party/protobuf/third_party/benchmark/LICENSE, + third_party/onnx/third_party/benchmark/LICENSE, third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE, - third_party/onnx/third_party/benchmark/LICENSE + third_party/protobuf/third_party/benchmark/LICENSE + +Name: breakpad +License: BSD-3-Clause +Files: third_party/breakpad + For details, see third_party/breakpad/LICENSE Name: clog License: BSD-2-Clause -Files: third_party/cpuinfo/deps/clog, - third_party/fbgemm/third_party/cpuinfo/deps/clog, - third_party/QNNPACK/deps/clog - For details, see 
third_party/cpuinfo/deps/clog/LICENSE, - third_party/fbgemm/third_party/cpuinfo/deps/clog/LICENSE, - third_party/QNNPACK/deps/clog/LICENSE +Files: third_party/QNNPACK/deps/clog, + third_party/XNNPACK/build/clog-source/deps/clog, + third_party/XNNPACK/build/cpuinfo-source/deps/clog, + third_party/cpuinfo/deps/clog, + third_party/fbgemm/third_party/cpuinfo/deps/clog + For details, see third_party/QNNPACK/deps/clog/LICENSE, + third_party/XNNPACK/build/clog-source/deps/clog/LICENSE, + third_party/XNNPACK/build/cpuinfo-source/deps/clog/LICENSE, + third_party/cpuinfo/deps/clog/LICENSE, + third_party/fbgemm/third_party/cpuinfo/deps/clog/LICENSE + +Name: clog-source +License: BSD-2-Clause +Files: third_party/XNNPACK/build/clog-source + For details, see third_party/XNNPACK/build/clog-source/LICENSE Name: cpuinfo License: BSD-2-Clause @@ -53,6 +77,21 @@ Files: third_party/cpuinfo, For details, see third_party/cpuinfo/LICENSE, third_party/fbgemm/third_party/cpuinfo/LICENSE +Name: cpuinfo-source +License: BSD-2-Clause +Files: third_party/XNNPACK/build/cpuinfo-source + For details, see third_party/XNNPACK/build/cpuinfo-source/LICENSE + +Name: cudnn_frontend +License: MIT +Files: third_party/cudnn_frontend + For details, see third_party/cudnn_frontend/LICENSE.txt + +Name: dart +License: Apache-2.0 +Files: third_party/flatbuffers/dart + For details, see third_party/flatbuffers/dart/LICENSE + Name: eigen License: BSD-3-Clause Files: third_party/eigen @@ -68,12 +107,17 @@ License: BSD-3-Clause Files: third_party/fbgemm For details, see third_party/fbgemm/LICENSE +Name: flatbuffers +License: Apache-2.0 +Files: third_party/flatbuffers + For details, see third_party/flatbuffers/LICENSE.txt + Name: fmt License: MIT with exception -Files: third_party/kineto/libkineto/third_party/fmt, - third_party/fmt - For details, see third_party/kineto/libkineto/third_party/fmt/LICENSE.rst, - third_party/fmt/LICENSE.rst +Files: third_party/fmt, + third_party/kineto/libkineto/third_party/fmt + For details, see third_party/fmt/LICENSE.rst, + third_party/kineto/libkineto/third_party/fmt/LICENSE.rst Name: foxi License: MIT @@ -87,14 +131,18 @@ Files: third_party/gemmlowp/gemmlowp Name: generator License: Apache-2.0 -Files: third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator, - third_party/googletest/googlemock/scripts/generator, +Files: third_party/XNNPACK/build/googletest-source/googlemock/scripts/generator, + third_party/benchmark/build/third_party/googletest/src/googlemock/scripts/generator, third_party/fbgemm/third_party/googletest/googlemock/scripts/generator, + third_party/googletest/googlemock/scripts/generator, + third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator, third_party/protobuf/third_party/googletest/googlemock/scripts/generator, third_party/tensorpipe/third_party/googletest/googlemock/scripts/generator - For details, see third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator/LICENSE, - third_party/googletest/googlemock/scripts/generator/LICENSE, + For details, see third_party/XNNPACK/build/googletest-source/googlemock/scripts/generator/LICENSE, + third_party/benchmark/build/third_party/googletest/src/googlemock/scripts/generator/LICENSE, third_party/fbgemm/third_party/googletest/googlemock/scripts/generator/LICENSE, + third_party/googletest/googlemock/scripts/generator/LICENSE, + third_party/kineto/libkineto/third_party/googletest/googlemock/scripts/generator/LICENSE, 
third_party/protobuf/third_party/googletest/googlemock/scripts/generator/LICENSE, third_party/tensorpipe/third_party/googletest/googlemock/scripts/generator/LICENSE @@ -103,46 +151,58 @@ License: BSD-3-Clause Files: third_party/gloo For details, see third_party/gloo/LICENSE +Name: googlebenchmark-source +License: Apache-2.0 +Files: third_party/XNNPACK/build/googlebenchmark-source + For details, see third_party/XNNPACK/build/googlebenchmark-source/LICENSE + Name: googlemock License: BSD-3-Clause -Files: third_party/kineto/libkineto/third_party/googletest/googlemock, - third_party/googletest/googlemock, +Files: third_party/XNNPACK/build/googletest-source/googlemock, third_party/fbgemm/third_party/googletest/googlemock, + third_party/kineto/libkineto/third_party/googletest/googlemock, third_party/protobuf/third_party/googletest/googlemock, third_party/tensorpipe/third_party/googletest/googlemock - For details, see third_party/kineto/libkineto/third_party/googletest/googlemock/LICENSE, - third_party/googletest/googlemock/LICENSE, + For details, see third_party/XNNPACK/build/googletest-source/googlemock/LICENSE, third_party/fbgemm/third_party/googletest/googlemock/LICENSE, + third_party/kineto/libkineto/third_party/googletest/googlemock/LICENSE, third_party/protobuf/third_party/googletest/googlemock/LICENSE, third_party/tensorpipe/third_party/googletest/googlemock/LICENSE Name: googletest License: BSD-3-Clause -Files: third_party/kineto/libkineto/third_party/googletest, - third_party/kineto/libkineto/third_party/googletest/googletest, - third_party/googletest, - third_party/googletest/googletest, +Files: third_party/XNNPACK/build/googletest-source/googletest, third_party/fbgemm/third_party/googletest, third_party/fbgemm/third_party/googletest/googletest, + third_party/googletest, + third_party/kineto/libkineto/third_party/googletest, + third_party/kineto/libkineto/third_party/googletest/googletest, third_party/protobuf/third_party/googletest, third_party/protobuf/third_party/googletest/googletest, third_party/tensorpipe/third_party/googletest, third_party/tensorpipe/third_party/googletest/googletest - For details, see third_party/kineto/libkineto/third_party/googletest/LICENSE, - third_party/kineto/libkineto/third_party/googletest/googletest/LICENSE, - third_party/googletest/LICENSE, - third_party/googletest/googletest/LICENSE, + For details, see third_party/XNNPACK/build/googletest-source/googletest/LICENSE, third_party/fbgemm/third_party/googletest/LICENSE, third_party/fbgemm/third_party/googletest/googletest/LICENSE, + third_party/googletest/LICENSE, + third_party/kineto/libkineto/third_party/googletest/LICENSE, + third_party/kineto/libkineto/third_party/googletest/googletest/LICENSE, third_party/protobuf/third_party/googletest/LICENSE, third_party/protobuf/third_party/googletest/googletest/LICENSE, third_party/tensorpipe/third_party/googletest/LICENSE, third_party/tensorpipe/third_party/googletest/googletest/LICENSE +Name: googletest-source +License: BSD-3-Clause +Files: third_party/XNNPACK/build/googletest-source + For details, see third_party/XNNPACK/build/googletest-source/LICENSE + Name: gtest License: BSD-3-Clause -Files: third_party/ideep/mkl-dnn/tests/gtests/gtest - For details, see third_party/ideep/mkl-dnn/tests/gtests/gtest/LICENSE +Files: third_party/ideep/mkl-dnn/tests/gtest, + third_party/ideep/mkl-dnn/third_party/oneDNN/tests/gtests/gtest + For details, see third_party/ideep/mkl-dnn/tests/gtest/LICENSE, + third_party/ideep/mkl-dnn/third_party/oneDNN/tests/gtests/gtest/LICENSE 
Name: ideep License: MIT @@ -154,11 +214,21 @@ License: BSD-3-Clause Files: third_party/ios-cmake For details, see third_party/ios-cmake/LICENSE +Name: json +License: MIT +Files: third_party/cudnn_frontend/include/contrib/nlohmann/json + For details, see third_party/cudnn_frontend/include/contrib/nlohmann/json/LICENSE.txt + Name: kineto License: BSD-3-Clause Files: third_party/kineto For details, see third_party/kineto/LICENSE +Name: libdisasm +License: Clarified Artistic License +Files: third_party/breakpad/src/third_party/libdisasm + For details, see third_party/breakpad/src/third_party/libdisasm/LICENSE + Name: libnop License: Apache-2.0 Files: third_party/tensorpipe/third_party/libnop @@ -169,6 +239,11 @@ License: MIT Files: third_party/tensorpipe/third_party/libuv For details, see third_party/tensorpipe/third_party/libuv/LICENSE +Name: lss +License: BSD-3-Clause +Files: third_party/breakpad/src/third_party/lss + For details, see third_party/breakpad/src/third_party/lss/LICENSE + Name: miniz-2.0.8 License: MIT Files: third_party/miniz-2.0.8 @@ -189,12 +264,20 @@ License: BSD-Source-Code Files: third_party/neon2sse For details, see third_party/neon2sse/LICENSE +Name: oneDNN +License: Apache-2.0 +Files: third_party/ideep/mkl-dnn/third_party/oneDNN + For details, see third_party/ideep/mkl-dnn/third_party/oneDNN/LICENSE + +Name: onnx +License: Apache-2.0 +Files: third_party/onnx + For details, see third_party/onnx/LICENSE + Name: onnx License: MIT -Files: third_party/onnx-tensorrt/third_party/onnx, - third_party/onnx - For details, see third_party/onnx-tensorrt/third_party/onnx/LICENSE, - third_party/onnx/LICENSE +Files: third_party/onnx-tensorrt/third_party/onnx + For details, see third_party/onnx-tensorrt/third_party/onnx/LICENSE Name: onnx-tensorrt License: MIT @@ -208,23 +291,30 @@ Files: third_party/protobuf Name: psimd License: MIT -Files: third_party/psimd - For details, see third_party/psimd/LICENSE +Files: third_party/XNNPACK/deps/psimd, + third_party/psimd + For details, see third_party/XNNPACK/deps/psimd/LICENSE, + third_party/psimd/LICENSE Name: pthreadpool License: BSD-2-Clause Files: third_party/pthreadpool For details, see third_party/pthreadpool/LICENSE +Name: pthreadpool-source +License: BSD-2-Clause +Files: third_party/XNNPACK/build/pthreadpool-source + For details, see third_party/XNNPACK/build/pthreadpool-source/LICENSE + Name: pybind11 License: BSD-3-Clause -Files: third_party/pybind11, +Files: third_party/onnx/third_party/pybind11, third_party/onnx-tensorrt/third_party/onnx/third_party/pybind11, - third_party/onnx/third_party/pybind11, + third_party/pybind11, third_party/tensorpipe/third_party/pybind11 - For details, see third_party/pybind11/LICENSE, + For details, see third_party/onnx/third_party/pybind11/LICENSE, third_party/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE, - third_party/onnx/third_party/pybind11/LICENSE, + third_party/pybind11/LICENSE, third_party/tensorpipe/third_party/pybind11/LICENSE Name: python-peachpy @@ -242,6 +332,21 @@ License: BSL-1.0 Files: third_party/sleef For details, see third_party/sleef/LICENSE.txt +Name: src +License: BSD-3-Clause +Files: third_party/benchmark/build/third_party/googletest/src + For details, see third_party/benchmark/build/third_party/googletest/src/LICENSE + +Name: swift +License: Apache-2.0 +Files: third_party/flatbuffers/swift + For details, see third_party/flatbuffers/swift/LICENSE + +Name: tb_plugin +License: BSD-3-Clause +Files: third_party/kineto/tb_plugin + For details, see 
third_party/kineto/tb_plugin/LICENSE + Name: tbb License: Apache-2.0 Files: third_party/tbb diff --git a/third_party/XNNPACK b/third_party/XNNPACK index 79cd5f9e18ad..ae108ef49aa5 160000 --- a/third_party/XNNPACK +++ b/third_party/XNNPACK @@ -1 +1 @@ -Subproject commit 79cd5f9e18ad0925ac9a050b00ea5a36230072db +Subproject commit ae108ef49aa5623b896fc93d4298c49d1750d9ba diff --git a/third_party/breakpad b/third_party/breakpad deleted file mode 160000 index 7d188f679d4a..000000000000 --- a/third_party/breakpad +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7d188f679d4ae0a5bd06408a3047d69ef8eef848 diff --git a/third_party/build_bundled.py b/third_party/build_bundled.py index 0e1da44565ed..c05e1c3642fe 100644 --- a/third_party/build_bundled.py +++ b/third_party/build_bundled.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import argparse import os @@ -52,7 +53,7 @@ def create_bundled(d, outstream): outstream.write(f"Files: {files}\n") outstream.write(' For details, see ') outstream.write(license_file) - outstream.write('\n\n') + outstream.write('\n\n') def identify_license(f, exception=''): @@ -89,6 +90,8 @@ def squeeze(t): elif 'BoostSoftwareLicense-Version1.0' in txt: # Hmm, do we need to check the text? return 'BSL-1.0' + elif squeeze("Clarified Artistic License") in txt: + return 'Clarified Artistic License' elif all([squeeze(m) in txt.lower() for m in bsd3_txt]): return 'BSD-3-Clause' elif all([squeeze(m) in txt.lower() for m in bsd3_v1_txt]): @@ -97,30 +100,30 @@ def squeeze(t): return 'BSD-2-Clause' elif all([squeeze(m) in txt.lower() for m in bsd3_src_txt]): return 'BSD-Source-Code' - elif all([squeeze(m) in txt.lower() for m in mit_txt]): + elif any([squeeze(m) in txt.lower() for m in mit_txt]): return 'MIT' else: raise ValueError('unknown license') -mit_txt = ['permission is hereby granted, free of charge, to any person ' - 'obtaining a copy of this software and associated documentation ' - 'files (the "software"), to deal in the software without ' - 'restriction, including without limitation the rights to use, copy, ' - 'modify, merge, publish, distribute, sublicense, and/or sell copies ' - 'of the software, and to permit persons to whom the software is ' +mit_txt = ['permission is hereby granted, free of charge, to any person ', + 'obtaining a copy of this software and associated documentation ', + 'files (the "software"), to deal in the software without ', + 'restriction, including without limitation the rights to use, copy, ', + 'modify, merge, publish, distribute, sublicense, and/or sell copies ', + 'of the software, and to permit persons to whom the software is ', 'furnished to do so, subject to the following conditions:', - 'the above copyright notice and this permission notice shall be ' + 'the above copyright notice and this permission notice shall be ', 'included in all copies or substantial portions of the software.', - 'the software is provided "as is", without warranty of any kind, ' - 'express or implied, including but not limited to the warranties of ' - 'merchantability, fitness for a particular purpose and ' - 'noninfringement. in no event shall the authors or copyright holders ' - 'be liable for any claim, damages or other liability, whether in an ' - 'action of contract, tort or otherwise, arising from, out of or in ' - 'connection with the software or the use or other dealings in the ' - 'software.' 
+ 'the software is provided "as is", without warranty of any kind, ', + 'express or implied, including but not limited to the warranties of ', + 'merchantability, fitness for a particular purpose and ', + 'noninfringement. in no event shall the authors or copyright holders ', + 'be liable for any claim, damages or other liability, whether in an ', + 'action of contract, tort or otherwise, arising from, out of or in ', + 'connection with the software or the use or other dealings in the ', + 'software.', ] bsd3_txt = ['redistribution and use in source and binary forms, with or without ' @@ -154,6 +157,21 @@ def squeeze(t): if __name__ == '__main__': third_party = os.path.join(mydir) - fname = os.path.join(third_party, 'LICENSES_BUNDLED.txt') + parser = argparse.ArgumentParser( + description="Generate bundled licenses file", + ) + parser.add_argument( + "--out-file", + type=str, + default=os.environ.get( + "PYTORCH_THIRD_PARTY_BUNDLED_LICENSE_FILE", + str(os.path.join(third_party, 'LICENSES_BUNDLED.txt')) + ), + help="location to output new bundled licenses file", + ) + + args = parser.parse_args() + fname = args.out_file + print(f"+ Writing bundled licenses to {args.out_file}") with open(fname, 'w') as fid: create_bundled(third_party, fid) diff --git a/third_party/cudnn_frontend b/third_party/cudnn_frontend index 51e60d891b68..43709ab96c47 160000 --- a/third_party/cudnn_frontend +++ b/third_party/cudnn_frontend @@ -1 +1 @@ -Subproject commit 51e60d891b689d618e7a623509a779c422a420f7 +Subproject commit 43709ab96c47e26eebcdac72f93f946d44ceffa8 diff --git a/third_party/eigen b/third_party/eigen index d41dc4dd74ac..3147391d946b 160000 --- a/third_party/eigen +++ b/third_party/eigen @@ -1 +1 @@ -Subproject commit d41dc4dd74acce21fb210e7625d5d135751fa9e5 +Subproject commit 3147391d946bb4b6c68edd901f2add6ac1f31f8c diff --git a/third_party/fbgemm b/third_party/fbgemm index e385d0267a9c..2e9be6581010 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit e385d0267a9cc6235ee19d4689930e32fe693b89 +Subproject commit 2e9be65810107a9595da717f95d21924b73be833 diff --git a/third_party/generate-cpuinfo-wrappers.py b/third_party/generate-cpuinfo-wrappers.py new file mode 100644 index 000000000000..825a6bd228a2 --- /dev/null +++ b/third_party/generate-cpuinfo-wrappers.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +from __future__ import print_function +import os + + +CPUINFO_SOURCES = { + None: [ + "init.c", + "api.c", + "cache.c", + ], + "defined(__linux__)": [ + "linux/multiline.c", + "linux/cpulist.c", + "linux/mockfile.c", + "linux/smallfile.c", + "linux/processors.c", + ], + "defined(__MACH__) && defined(__APPLE__)": [ + "mach/topology.c", + ], + "defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_WIN32)": [ + "x86/cache/init.c", + "x86/cache/deterministic.c", + "x86/cache/descriptor.c", + "x86/info.c", + "x86/mockcpuid.c", + "x86/isa.c", + "x86/topology.c", + "x86/name.c", + "x86/init.c", + "x86/uarch.c", + "x86/vendor.c", + ], + "(defined(__i386__) || defined(__i686__) || defined(__x86_64__)) && defined(__linux__)": [ + "x86/linux/init.c", + "x86/linux/cpuinfo.c", + ], + "(defined(__i386__) || defined(__i686__) || defined(__x86_64__)) && defined(__MACH__) && defined(__APPLE__)": [ + "x86/mach/init.c", + ], + "defined(_WIN32)": [ + "x86/windows/init.c", + ], + "(defined(__arm__) || defined(__aarch64__)) && defined(__linux__)": [ + "arm/linux/cpuinfo.c", + "arm/linux/hwcap.c", + "arm/linux/init.c", + "arm/linux/clusters.c", + "arm/linux/midr.c", + 
"arm/linux/chipset.c", + "arm/tlb.c", + "arm/uarch.c", + "arm/cache.c", + ], + "defined(__arm__) && defined(__linux__)": [ + "arm/linux/aarch32-isa.c", + ], + "defined(__aarch64__) && defined(__linux__)": [ + "arm/linux/aarch64-isa.c", + ], + "(defined(__arm__) || defined(__aarch64__)) && defined(__ANDROID__)": [ + "arm/android/properties.c", + ], + "(defined(__arm__) || defined(__aarch64__)) && defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE": [ + "arm/mach/init.c", + ], +} + + +if __name__ == "__main__": + for condition, filenames in CPUINFO_SOURCES.items(): + for filename in filenames: + filepath = os.path.join("cpuinfo/wrappers", filename) + if not os.path.exists(os.path.dirname(filepath)): + print(filepath) + os.makedirs(os.path.dirname(filepath)) + with open(filepath, "w") as wrapper: + print("/* Auto-generated by generate-wrappers.py script. Do not modify */", file=wrapper) + print(file=wrapper) + print("#ifdef __APPLE__", file=wrapper) + print("\t#include ", file=wrapper) + print("#endif /* __APPLE__ */", file=wrapper) + print(file=wrapper) + + if not condition: + print("#include <%s>" % filename, file=wrapper) + else: + # Include source file only if condition is satisfied + print("#if %s" % condition, file=wrapper) + print("#include <%s>" % filename, file=wrapper) + print("#endif /* %s */" % condition, file=wrapper) diff --git a/third_party/generate-xnnpack-wrappers.py b/third_party/generate-xnnpack-wrappers.py new file mode 100644 index 000000000000..23992645672a --- /dev/null +++ b/third_party/generate-xnnpack-wrappers.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +from __future__ import print_function +import collections +import os + +BANNER = "Auto-generated by generate-wrappers.py script. Do not modify" +WRAPPER_SRC_NAMES = { + "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS": None, + "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)", + "PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)", + "PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)", + "PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)", + "PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_SSE41_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_F16C_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_XOP_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_FMA3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)", + 
"AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)", +} + +SRC_NAMES = [ + "OPERATOR_SRCS", + "SUBGRAPH_SRCS", + "LOGGING_SRCS", + "HOT_SRCS", + "TABLE_SRCS", + "JIT_SRCS", + "JIT_AARCH32_SRCS", + "JIT_AARCH64_SRCS", + "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS", + "PROD_SSE_MICROKERNEL_SRCS", + "PROD_SSE2_MICROKERNEL_SRCS", + "PROD_SSSE3_MICROKERNEL_SRCS", + "PROD_SSE41_MICROKERNEL_SRCS", + "PROD_AVX_MICROKERNEL_SRCS", + "PROD_F16C_MICROKERNEL_SRCS", + "PROD_XOP_MICROKERNEL_SRCS", + "PROD_FMA3_MICROKERNEL_SRCS", + "PROD_AVX2_MICROKERNEL_SRCS", + "PROD_AVX512F_MICROKERNEL_SRCS", + "PROD_AVX512SKX_MICROKERNEL_SRCS", +] + +def update_sources(): + sources = collections.defaultdict(list) + with open("./XNNPACK/CMakeLists.txt") as cmake: + lines = cmake.readlines() + i = 0 + while i < len(lines): + line = lines[i] + if line.startswith("SET") and line.split('(')[1].strip(' \t\n\r') in set(WRAPPER_SRC_NAMES.keys()) | set(SRC_NAMES): + name = line.split('(')[1].strip(' \t\n\r') + i += 1 + while i < len(lines) and len(lines[i]) > 0 and ')' not in lines[i]: + # remove "src/" at the beginning, remove whitespaces and newline + value = lines[i].strip(' \t\n\r') + sources[name].append(value[4:]) + i += 1 + if i < len(lines) and len(lines[i]) > 4: + # remove "src/" at the beginning, possibly ')' at the end + value = lines[i].strip(' \t\n\r)') + sources[name].append(value[4:]) + else: + i += 1 + print(sources) + return sources + +if __name__ == "__main__": + xnnpack_sources = collections.defaultdict(list) + sources = update_sources() + for name in WRAPPER_SRC_NAMES: + xnnpack_sources[WRAPPER_SRC_NAMES[name]].extend(sources[name]) + for condition, filenames in xnnpack_sources.items(): + for filename in filenames: + filepath = os.path.join("XNNPACK/wrappers", filename) + if not os.path.isdir(os.path.dirname(filepath)): + os.makedirs(os.path.dirname(filepath)) + with open(filepath, "w") as wrapper: + print("/* {} */".format(BANNER), file=wrapper) + print(file=wrapper) + + # Architecture- or platform-dependent preprocessor flags can be + # defined here. Note: platform_preprocessor_flags can't be used + # because they are ignored by arc focus & buck project. 
+ + if condition is None: + print("#include <%s>" % filename, file=wrapper) + else: + # Include source file only if condition is satisfied + print("#if %s" % condition, file=wrapper) + print("#include <%s>" % filename, file=wrapper) + print("#endif /* %s */" % condition, file=wrapper) diff --git a/third_party/glog.buck.bzl b/third_party/glog.buck.bzl new file mode 100644 index 000000000000..88e32ae02cd3 --- /dev/null +++ b/third_party/glog.buck.bzl @@ -0,0 +1,97 @@ +GLOG_CONFIG_HEADERS = [ + "vlog_is_on.h", + "stl_logging.h", + "raw_logging.h", + "logging.h", +] + +GLOG_SED_COMMAND = " ".join([ + "sed", + "-e 's/@ac_cv_cxx_using_operator@/1/g'", + "-e 's/@ac_cv_have_unistd_h@/1/g'", + "-e 's/@ac_cv_have_stdint_h@/1/g'", + "-e 's/@ac_cv_have_systypes_h@/1/g'", + "-e 's/@ac_cv_have_libgflags@/0/g'", + "-e 's/@ac_cv_have_uint16_t@/1/g'", + "-e 's/@ac_cv_have___builtin_expect@/1/g'", + "-e 's/@ac_cv_have_.*@/0/g'", + "-e 's/@ac_google_start_namespace@/namespace google {/g'", + "-e 's/@ac_google_end_namespace@/}/g'", + "-e 's/@ac_google_namespace@/google/g'", + "-e 's/@ac_cv___attribute___noinline@/__attribute__((noinline))/g'", + "-e 's/@ac_cv___attribute___noreturn@/__attribute__((noreturn))/g'", + "-e 's/@ac_cv___attribute___printf_4_5@/__attribute__((__format__ (__printf__, 4, 5)))/g'", +]) + +def define_glog(): + cxx_library( + name = "glog", + srcs = [ + "glog/src/demangle.cc", + "glog/src/vlog_is_on.cc", + "glog/src/symbolize.cc", + "glog/src/raw_logging.cc", + "glog/src/logging.cc", + "glog/src/signalhandler.cc", + "glog/src/utilities.cc", + ], + exported_headers = [":glog_{}".format(header) for header in GLOG_CONFIG_HEADERS], + header_namespace = "glog", + compiler_flags = [ + "-Wno-sign-compare", + "-Wno-unused-function", + "-Wno-unused-local-typedefs", + "-Wno-unused-variable", + "-Wno-deprecated-declarations", + ], + preferred_linkage = "static", + exported_linker_flags = [], + exported_preprocessor_flags = [ + "-DGLOG_NO_ABBREVIATED_SEVERITIES", + "-DGLOG_STL_LOGGING_FOR_UNORDERED", + "-DGOOGLE_GLOG_DLL_DECL=", + "-DGOOGLE_NAMESPACE=google", + # this is required for buck build + "-DGLOG_BAZEL_BUILD", + "-DHAVE_PTHREAD", + # Allows src/logging.cc to determine the host name. + "-DHAVE_SYS_UTSNAME_H", + # For src/utilities.cc. + "-DHAVE_SYS_SYSCALL_H", + "-DHAVE_SYS_TIME_H", + "-DHAVE_STDINT_H", + "-DHAVE_STRING_H", + # Enable dumping stacktrace upon sigaction. + "-DHAVE_SIGACTION", + # For logging.cc. 
+ "-DHAVE_PREAD", + "-DHAVE___ATTRIBUTE__", + ], + deps = [":glog_config"], + soname = "libglog.$(ext)", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "glog_config", + header_namespace = "", + exported_headers = { + "config.h": ":glog_config.h", + "glog/log_severity.h": "glog/src/glog/log_severity.h", + }, + ) + + genrule( + name = "glog_config.h", + srcs = ["glog/src/config.h.cmake.in"], + out = "config.h", + cmd = "awk '{ gsub(/^#cmakedefine/, \"//cmakedefine\"); print; }' $SRCS > $OUT", + ) + + for header in GLOG_CONFIG_HEADERS: + genrule( + name = "glog_{}".format(header), + out = header, + srcs = ["glog/src/glog/{}.in".format(header)], + cmd = "{} $SRCS > $OUT".format(GLOG_SED_COMMAND), + ) diff --git a/third_party/ideep b/third_party/ideep index 4a56ab2c3f61..02b17c5748c9 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 4a56ab2c3f61c44e0f8ea241beeb732b7d70dc5b +Subproject commit 02b17c5748c9349dcc586c359af800c684d9b1ab diff --git a/third_party/kineto b/third_party/kineto index b5bb62d25be7..b2b48c00c6e5 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit b5bb62d25be75c381dbbd975276602f021982ef2 +Subproject commit b2b48c00c6e5bd8e807e2231adb229db6a1d1c22 diff --git a/third_party/mkl-dnn.BUILD b/third_party/mkl-dnn.BUILD index 4ffe7a578fbf..1d40b1c5feda 100644 --- a/third_party/mkl-dnn.BUILD +++ b/third_party/mkl-dnn.BUILD @@ -10,6 +10,7 @@ _DNNL_RUNTIME_OMP = { "#cmakedefine DNNL_WITH_LEVEL_ZERO": "/* #undef DNNL_WITH_LEVEL_ZERO */", "#cmakedefine DNNL_SYCL_CUDA": "/* #undef DNNL_SYCL_CUDA */", "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER", + "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL", "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1", "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0", "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1", @@ -37,6 +38,13 @@ _DNNL_RUNTIME_OMP = { "#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0", "#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0", "#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0", + "#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 1", + "#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0", + "#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0", + "#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0", + "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0", + "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0", + "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0", } template_rule( @@ -45,9 +53,9 @@ template_rule( out = "third_party/oneDNN/include/oneapi/dnnl/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "2", - "@DNNL_VERSION_MINOR@": "5", - "@DNNL_VERSION_PATCH@": "2", - "@DNNL_VERSION_HASH@": "a9302535553c73243c632ad3c4c80beec3d19a1e", + "@DNNL_VERSION_MINOR@": "6", + "@DNNL_VERSION_PATCH@": "0", + "@DNNL_VERSION_HASH@": "52b5f107dd9cf10910aaa19cb47f3abf9b349815", }, ) diff --git a/third_party/onnx b/third_party/onnx index 85546f8c44e6..96046b8ccfb8 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 85546f8c44e627f8ff1181725d03cc49f675e44f +Subproject commit 96046b8ccfb8e6fa82f6b2b34b3d56add2e8849c diff --git a/third_party/xnnpack.buck.bzl b/third_party/xnnpack.buck.bzl new file mode 100644 index 000000000000..549c70e03953 --- /dev/null +++ b/third_party/xnnpack.buck.bzl @@ -0,0 +1,586 @@ +load("//tools/build_defs:glob_defs.bzl", "subdir_glob") + +def define_xnnpack(): + 
cxx_library( + name = "XNNPACK", + srcs = ["XNNPACK/src/allocator.c", "XNNPACK/src/init.c", "XNNPACK/src/memory-planner.c", "XNNPACK/src/operator-delete.c", "XNNPACK/src/runtime.c", "XNNPACK/src/subgraph.c", "XNNPACK/src/tensor.c", "XNNPACK/src/datatype-strings.c", "XNNPACK/src/operator-strings.c", "XNNPACK/src/subgraph-strings.c"], + deps = [":operators", ":subgraph", ":tables", ":ukernels_scalar", "//third_party:cpuinfo", "//third_party:pthreadpool", "//third_party:pthreadpool_header", ":arm_lib", ":x86_and_x86_64_lib"], + exported_deps = [], + compiler_flags = ["-w"], + preferred_linkage = "static", + exported_headers = {"xnnpack.h": "XNNPACK/include/xnnpack.h"}, + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0", "-DXNN_NO_Q8_OPERATORS", "-DXNN_NO_F16_OPERATORS", "-DXNN_NO_NCHW_OPERATORS", "-DXNN_NO_QU8_OPERATORS", "-DXNN_NO_S8_OPERATORS", "-DXNN_NO_U8_OPERATORS", "-DXNN_NO_VCVT_OPERATORS", "-DXNN_NO_X32_OPERATORS", "-DXNN_NO_X8_OPERATORS", "-DXNN_NO_XX_OPERATORS"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_scalar", + srcs = ["XNNPACK/wrappers/params-init.c", "XNNPACK/wrappers/u8-lut32norm/scalar.c", "XNNPACK/wrappers/xx-copy/memcpy.c", "XNNPACK/wrappers/x8-lut/gen/lut-scalar-x4.c", "XNNPACK/wrappers/x32-depthtospace2d-chw2hwc/scalar.c"], + deps = [":interface", "//third_party:FP16", "//third_party:FXdiv"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "operators", + srcs = ["XNNPACK/src/operators/argmax-pooling-nhwc.c", "XNNPACK/src/operators/average-pooling-nhwc.c", "XNNPACK/src/operators/binary-elementwise-nd.c", "XNNPACK/src/operators/channel-shuffle-nc.c", "XNNPACK/src/operators/constant-pad-nd.c", "XNNPACK/src/operators/convolution-nchw.c", "XNNPACK/src/operators/convolution-nhwc.c", "XNNPACK/src/operators/deconvolution-nhwc.c", "XNNPACK/src/operators/depth-to-space-nchw2nhwc.c", "XNNPACK/src/operators/depth-to-space-nhwc.c", "XNNPACK/src/operators/fully-connected-nc.c", "XNNPACK/src/operators/global-average-pooling-ncw.c", "XNNPACK/src/operators/global-average-pooling-nwc.c", "XNNPACK/src/operators/lut-elementwise-nc.c", "XNNPACK/src/operators/max-pooling-nhwc.c", "XNNPACK/src/operators/prelu-nc.c", "XNNPACK/src/operators/resize-bilinear-nchw.c", "XNNPACK/src/operators/resize-bilinear-nhwc.c", "XNNPACK/src/operators/softmax-nc.c", "XNNPACK/src/operators/unary-elementwise-nc.c", "XNNPACK/src/operators/unpooling-nhwc.c", "XNNPACK/src/indirection.c", 
"XNNPACK/src/operator-run.c", "XNNPACK/src/packing.c"], + deps = [":interface", "//third_party:cpuinfo", "//third_party:FP16", "//third_party:FXdiv", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-Os"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "arm_lib", + srcs = [], + deps = [":jit_memory", ":ukernels_asm_aarch32", ":ukernels_asm_aarch64", ":ukernels_neon", ":ukernels_neon_aarch64", ":ukernels_neon_dot", ":ukernels_neon_fma", ":ukernels_neon_fp16", ":ukernels_neon_fp16arith_aarch64", ":ukernels_neon_v8", ":ukernels_scalar_aarch32"], + exported_deps = [], + compiler_flags = ["-w"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "third-party/XNNPACK", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = [], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "x86_and_x86_64_lib", + srcs = [], + deps = [":ukernels_avx", ":ukernels_avx2", ":ukernels_avx512", ":ukernels_avx512skx", ":ukernels_f16c", ":ukernels_fma3", ":ukernels_sse", ":ukernels_sse2", ":ukernels_sse41", ":ukernels_ssse3", ":ukernels_xop"], + exported_deps = [], + compiler_flags = ["-w"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "third-party/XNNPACK", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = [], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "tables", + srcs = ["XNNPACK/src/tables/exp2-k-over-64.c", "XNNPACK/src/tables/exp2-k-over-2048.c", "XNNPACK/src/tables/exp2minus-k-over-4.c", "XNNPACK/src/tables/exp2minus-k-over-8.c", "XNNPACK/src/tables/exp2minus-k-over-16.c", "XNNPACK/src/tables/exp2minus-k-over-64.c", "XNNPACK/src/tables/exp2minus-k-over-2048.c"], + deps = [":interface", "//third_party:FP16", "//third_party:FXdiv", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", 
["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "subgraph", + srcs = ["XNNPACK/src/subgraph/abs.c", "XNNPACK/src/subgraph/add2.c", "XNNPACK/src/subgraph/argmax-pooling-2d.c", "XNNPACK/src/subgraph/average-pooling-2d.c", "XNNPACK/src/subgraph/bankers-rounding.c", "XNNPACK/src/subgraph/ceiling.c", "XNNPACK/src/subgraph/clamp.c", "XNNPACK/src/subgraph/convert.c", "XNNPACK/src/subgraph/convolution-2d.c", "XNNPACK/src/subgraph/deconvolution-2d.c", "XNNPACK/src/subgraph/depth-to-space.c", "XNNPACK/src/subgraph/depthwise-convolution-2d.c", "XNNPACK/src/subgraph/divide.c", "XNNPACK/src/subgraph/elu.c", "XNNPACK/src/subgraph/floor.c", "XNNPACK/src/subgraph/fully-connected.c", "XNNPACK/src/subgraph/global-average-pooling-2d.c", "XNNPACK/src/subgraph/hardswish.c", "XNNPACK/src/subgraph/leaky-relu.c", "XNNPACK/src/subgraph/max-pooling-2d.c", "XNNPACK/src/subgraph/maximum2.c", "XNNPACK/src/subgraph/minimum2.c", "XNNPACK/src/subgraph/multiply2.c", "XNNPACK/src/subgraph/negate.c", "XNNPACK/src/subgraph/prelu.c", "XNNPACK/src/subgraph/sigmoid.c", "XNNPACK/src/subgraph/softmax.c", "XNNPACK/src/subgraph/square-root.c", "XNNPACK/src/subgraph/square.c", "XNNPACK/src/subgraph/squared-difference.c", "XNNPACK/src/subgraph/static-constant-pad.c", "XNNPACK/src/subgraph/static-reshape.c", "XNNPACK/src/subgraph/static-resize-bilinear-2d.c", "XNNPACK/src/subgraph/subtract.c", "XNNPACK/src/subgraph/unpooling-2d.c"], + deps = [":interface", "//third_party:FP16", "//third_party:FXdiv", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx512", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx512f"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mavx512f"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f32-dwconv/gen/up16x3-minmax-avx512f.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x4-minmax-avx512f.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x9-minmax-avx512f.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x25-minmax-avx512f.c", "XNNPACK/wrappers/f32-gemm/gen/1x16-minmax-avx512f-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/7x16-minmax-avx512f-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16-minmax-avx512f-broadcast.c", 
"XNNPACK/wrappers/f32-igemm/gen/7x16-minmax-avx512f-broadcast.c", "XNNPACK/wrappers/f32-prelu/gen/avx512f-2x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-avx512f-x16.c", "XNNPACK/wrappers/f32-velu/gen/velu-avx512f-rr1-lut16-p3-perm-x64.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-avx512f-x16.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-avx512f-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-avx512f-x16.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-avx512f-rr2-lut32-p2-perm2-scalef-div-x64.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-avx512f-x16.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-avx512f-x16.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-avx512f-x16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_fp16arith_aarch64", + srcs = ["XNNPACK/wrappers/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x3-minmax-neonfp16arith.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c", "XNNPACK/wrappers/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c", "XNNPACK/wrappers/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c", "XNNPACK/wrappers/f16-gemm/gen/1x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-ibilinear/gen/neonfp16arith-c8.c", "XNNPACK/wrappers/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-igemm/gen/6x16-minmax-neonfp16arith-ld64.c", "XNNPACK/wrappers/f16-maxpool/9p8x-minmax-neonfp16arith-c8.c", "XNNPACK/wrappers/f16-prelu/gen/neonfp16arith-2x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vadd-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vaddc-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmul-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmulc-minmax-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vclamp/gen/vclamp-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vhswish/gen/vhswish-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vlrelu/gen/vlrelu-neonfp16arith-x16.c", "XNNPACK/wrappers/f16-vmulcaddc/gen/c8-minmax-neonfp16arith-2x.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = 
subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8.2-a+fp16"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mavx"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-avx.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x3-minmax-avx.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x4-minmax-avx.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x9-minmax-avx.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-avx-x24.c", "XNNPACK/wrappers/f32-gemm/gen/1x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/5x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/5x16-minmax-avx-broadcast.c", "XNNPACK/wrappers/f32-prelu/gen/avx-2x16.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-avx-x16.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-avx-x16.c", "XNNPACK/wrappers/f32-velu/gen/velu-avx-rr2-lut4-p4-perm-x32.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-avx-x16.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-avx-x16.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-avx-x16.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-avx-rr2-p5-nr2-x40.c", "XNNPACK/wrappers/f32-vsqrt/gen/avx-sqrt-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-avx-x16.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-avx-x16.c", 
"XNNPACK/wrappers/f32-vunary/gen/vsqr-avx-x16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-avx-mul16-add16.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-avx-mul16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-avx-mul16.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-avx-x32.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-avx-mul16-ld64-x16.c", "XNNPACK/wrappers/x8-lut/gen/lut-avx-x64.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_sse41", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-msse4.1"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-sse41-x8.c", "XNNPACK/wrappers/f32-prelu/gen/sse41-2x8.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-sse41-x32.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-sse41-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-sse41-x8.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-sse41-rr2-lut64-p2-div-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", 
"XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16-add16.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16-add16.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-sse41-x16.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x9-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x25-minmax-fp32-sse41-mul16.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-sse41-x16.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-fp32-sse41-c8.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-sse41-mul16-ld64-x16.c", "XNNPACK/wrappers/s8-ibilinear/gen/sse41-c16.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-sse41-c16.c", "XNNPACK/wrappers/s8-vclamp/sse41-x64.c", "XNNPACK/wrappers/u8-ibilinear/gen/sse41-c16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon", + srcs = ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-neon-int16-x16.c", "XNNPACK/wrappers/f32-argmaxpool/4x-neon-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9p8x-neon-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9x-neon-c4.c", "XNNPACK/wrappers/f32-avgpool/9p8x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-avgpool/9x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-neon-2x2.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x3-minmax-neon.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x4-minmax-neon.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x9-minmax-neon.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-neon-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-neon-2x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neon-1x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-neon-1x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neon-1x4.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-neon-x8.c", "XNNPACK/wrappers/f32-gavgpool-cw/neon-x4.c", "XNNPACK/wrappers/f32-gavgpool/7p7x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-gavgpool/7x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-neon-lane-ld128.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/neon-p8.c", 
"XNNPACK/wrappers/f32-ibilinear/gen/neon-c8.c", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-neon-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-neon-lane-ld128.c", "XNNPACK/wrappers/f32-maxpool/9p8x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-pavgpool/9p8x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-pavgpool/9x-minmax-neon-c4.c", "XNNPACK/wrappers/f32-prelu/gen/neon-2x8.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/neon-rr2-lut64-p2-x8.c", "XNNPACK/wrappers/f32-rmax/neon.c", "XNNPACK/wrappers/f32-spmm/gen/32x1-minmax-neon.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-neon-x8.c", "XNNPACK/wrappers/f32-velu/gen/velu-neon-rr2-lut16-p3-x8.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-neon-x16.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-neon-x8.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c4-minmax-neon-2x.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-neon-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-neon-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-neon-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-neon-x8.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-neon-rr2-lut64-p2-nr2recps-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-neon-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-neon-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-neon-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mla8-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/2x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/2x8c2s4-minmax-fp32-neon-mlal.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mla8-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mla8-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mla8-ld64.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qs8-gemm/gen/2x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", 
"XNNPACK/wrappers/qs8-igemm/gen/2x8c2s4-minmax-rndnu-neon-mlal.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-avgpool/9p8x-minmax-neon-c8.c", "XNNPACK/wrappers/qu8-avgpool/9x-minmax-neon-c8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x25-minmax-rndnu-neon-mul8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-rndnu-neon-mul8.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-neon-x32.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-rndnu-neon-c8.c", "XNNPACK/wrappers/qu8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-gemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-neon-ld64-x32.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-rndnu-neon-ld64-x16.c", "XNNPACK/wrappers/s8-ibilinear/gen/neon-c8.c", "XNNPACK/wrappers/s8-ibilinear/gen/neon-c16.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-neon-c16.c", "XNNPACK/wrappers/s8-vclamp/neon-x64.c", "XNNPACK/wrappers/u8-ibilinear/gen/neon-c8.c", "XNNPACK/wrappers/u8-ibilinear/gen/neon-c16.c", "XNNPACK/wrappers/u8-maxpool/9p8x-minmax-neon-c16.c", "XNNPACK/wrappers/u8-rmax/neon.c", "XNNPACK/wrappers/u8-vclamp/neon-x64.c", "XNNPACK/wrappers/xx-fill/neon-x64.c", "XNNPACK/wrappers/xx-pad/neon.c", "XNNPACK/wrappers/x8-zip/xm-neon.c", "XNNPACK/wrappers/x8-zip/x2-neon.c", "XNNPACK/wrappers/x8-zip/x3-neon.c", "XNNPACK/wrappers/x8-zip/x4-neon.c", "XNNPACK/wrappers/x32-packx/x4-neon-st4.c", "XNNPACK/wrappers/x32-unpool/neon.c", "XNNPACK/wrappers/x32-zip/xm-neon.c", "XNNPACK/wrappers/x32-zip/x2-neon.c", "XNNPACK/wrappers/x32-zip/x3-neon.c", "XNNPACK/wrappers/x32-zip/x4-neon.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(android-armv7|iphoneos-armv7)$", ["-march=armv7-a", "-mfpu=neon", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_dot", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], 
+ preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8.2-a+dotprod"]], ["^android-armv7$", ["-march=armv8.2-a+dotprod", "-mfpu=neon-fp-armv8", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["^((?!iphoneos-armv7).)*$", ["XNNPACK/wrappers/qc8-gemm/gen/1x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/1x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_aarch64", + srcs = ["XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-neonfma-3x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neonfma-2x4-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-neonfma-4x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neonfma-1x4-acc2.c", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-neonfma-lane-ld64.c", "XNNPACK/wrappers/f32-spmm/gen/32x2-minmax-neonfma.c", "XNNPACK/wrappers/f32-spmm/gen/32x4-minmax-neonfma.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-neon-x8.c", "XNNPACK/wrappers/f32-vsqrt/gen/neon-sqrt-x4.c", "XNNPACK/wrappers/x8-lut/gen/lut-neon-tbx128x4-x64.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + 
compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-mfpu=neon-vfpv4"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_v8", + srcs = ["XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-neonv8-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-neonv8-x32.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-neonv8-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-neonv8-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-neonv8-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-neonv8-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mla8-ld64.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mla8-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-gemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8-a", "-mfpu=neon-fp-armv8"]], ["^android-armv7$", ["-march=armv8-a", "-mfpu=neon-fp-armv8", "-mfloat-abi=softfp"]], ["^iphoneos-armv7$", ["-mcpu=cyclone", "-mtune=generic"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx512skx", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx512f", "-mavx512cd", "-mavx512bw", "-mavx512dq", "-mavx512vl"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", 
+ headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(i[3-6]86|x86|x86_64|AMD64)$", ["-mavx512f", "-mavx512cd", "-mavx512bw", "-mavx512dq", "-mavx512vl"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-avx512skx-x16.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-avx512skx-x128.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-avx512skx-x128.c", "XNNPACK/wrappers/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qc8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qc8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qc8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qs8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-avx512skx-x32.c", "XNNPACK/wrappers/qs8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qu8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-avx512skx-x32.c", "XNNPACK/wrappers/qu8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-gemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-igemm/gen/1x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-igemm/gen/4x16c8-minmax-fp32-avx512skx.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c", "XNNPACK/wrappers/x8-lut/gen/lut-avx512skx-vpshufb-x64.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_fp16", + srcs = ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-neonfp16-x16.c"], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["arm", ["-mfpu=neon-fp16"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "interface", + srcs = [], + deps = [], + exported_deps = ["//third_party:pthreadpool_header"], + compiler_flags 
= ["-w"], + preferred_linkage = "static", + exported_headers = {"xnnpack.h": "XNNPACK/include/xnnpack.h"}, + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_fma3", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mfma", "-mf16c"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(i[3-6]86|x86|x86_64|AMD64)$", ["-mfma", "-mf16c"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-dwconv/gen/up8x25-minmax-fma3-acc2.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x3-minmax-fma3.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x4-minmax-fma3.c", "XNNPACK/wrappers/f16-dwconv/gen/up16x9-minmax-fma3.c", "XNNPACK/wrappers/f16-ibilinear/gen/fma3-c8.c", "XNNPACK/wrappers/f16-vmulcaddc/gen/c8-minmax-fma3-2x.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-fma3.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x3-minmax-fma3.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x4-minmax-fma3.c", "XNNPACK/wrappers/f32-dwconv/gen/up16x9-minmax-fma3.c", "XNNPACK/wrappers/f32-gemm/gen/1x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/1x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/4x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-gemm/gen/5x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/1x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/4x16s4-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-igemm/gen/5x16-minmax-fma3-broadcast.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-fma3-x16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "jit_memory", + srcs = ["XNNPACK/src/jit/aarch32-assembler.cc", "XNNPACK/src/jit/aarch64-assembler.cc", "XNNPACK/src/jit/assembler.cc", "XNNPACK/src/jit/memory.c"], + deps = [":interface", "//third_party:clog"], + exported_deps = [], + compiler_flags = ["-w", "-Os"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = 
"ukernels_sse2", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-msse2"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-sse2-int16-x32.c", "XNNPACK/wrappers/f32-argmaxpool/4x-sse2-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9p8x-sse2-c4.c", "XNNPACK/wrappers/f32-argmaxpool/9x-sse2-c4.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-sse2-x16.c", "XNNPACK/wrappers/f32-prelu/gen/sse2-2x8.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/sse2-rr2-p5-x20-acc2.c", "XNNPACK/wrappers/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-sse2-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-sse2-x8.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-sse2-rr2-lut64-p2-div-x8.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c", "XNNPACK/wrappers/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-avgpool/9p8x-minmax-sse2-c8.c", "XNNPACK/wrappers/qu8-avgpool/9x-minmax-sse2-c8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-sse2-x32.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-fp32-sse2-c8.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c", 
"XNNPACK/wrappers/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-sse2-mul16-ld64-x8.c", "XNNPACK/wrappers/s8-ibilinear/gen/sse2-c8.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-sse2-c16.c", "XNNPACK/wrappers/s8-vclamp/sse2-x64.c", "XNNPACK/wrappers/u8-ibilinear/gen/sse2-c8.c", "XNNPACK/wrappers/u8-maxpool/9p8x-minmax-sse2-c16.c", "XNNPACK/wrappers/u8-rmax/sse2.c", "XNNPACK/wrappers/u8-vclamp/sse2-x64.c", "XNNPACK/wrappers/xx-fill/sse2-x64.c", "XNNPACK/wrappers/xx-pad/sse2.c", "XNNPACK/wrappers/x8-zip/xm-sse2.c", "XNNPACK/wrappers/x8-zip/x2-sse2.c", "XNNPACK/wrappers/x8-zip/x3-sse2.c", "XNNPACK/wrappers/x8-zip/x4-sse2.c", "XNNPACK/wrappers/x32-unpool/sse2.c", "XNNPACK/wrappers/x32-zip/xm-sse2.c", "XNNPACK/wrappers/x32-zip/x2-sse2.c", "XNNPACK/wrappers/x32-zip/x3-sse2.c", "XNNPACK/wrappers/x32-zip/x4-sse2.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_sse", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-msse"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f32-avgpool/9p8x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-avgpool/9x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-sse-2x2.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x3-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x4-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x9-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-sse.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-sse-2x4-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-sse-1x4-acc3.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-sse-4x4.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-sse-2x4.c", "XNNPACK/wrappers/f32-gavgpool-cw/sse-x4.c", "XNNPACK/wrappers/f32-gavgpool/7p7x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-gavgpool/7x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-gemm/gen/4x2c4-minmax-sse.c", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/sse-p8.c", "XNNPACK/wrappers/f32-ibilinear/gen/sse-c8.c", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-igemm/gen/4x2c4-minmax-sse.c", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-sse-load1.c", "XNNPACK/wrappers/f32-maxpool/9p8x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-pavgpool/9p8x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-pavgpool/9x-minmax-sse-c4.c", "XNNPACK/wrappers/f32-rmax/sse.c", "XNNPACK/wrappers/f32-spmm/gen/32x1-minmax-sse.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-sse-x8.c", 
"XNNPACK/wrappers/f32-vbinary/gen/vmin-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-sse-x8.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-sse-x8.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-sse-x8.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-sse-x8.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c4-minmax-sse-2x.c", "XNNPACK/wrappers/f32-vsqrt/gen/sse-sqrt-x4.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-sse-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-sse-x8.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-sse-x8.c", "XNNPACK/wrappers/x32-packx/x4-sse.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_asm_aarch32", + srcs = ["XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/4x4-aarch32-vfp-ld64.S", "XNNPACK/wrappers/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S", "XNNPACK/wrappers/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S", 
"XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S"], + deps = [":interface", ":jit_memory", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^android-armv7$", ["-march=armv8.2-a+dotprod", "-mfpu=neon-fp-armv8", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_ssse3", + srcs = [], + deps = [":interface", "//third_party:FP16"], + exported_deps 
= [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mssse3"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-ssse3-2x4-acc2.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_f16c", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mf16c"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mf16c"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-f16c-x16.c", "XNNPACK/wrappers/f16-gavgpool/gen/7p7x-minmax-f16c-c8.c", "XNNPACK/wrappers/f16-gavgpool/gen/7x-minmax-f16c-c8.c", "XNNPACK/wrappers/f16-maxpool/9p8x-minmax-f16c-c8.c", "XNNPACK/wrappers/f16-prelu/gen/f16c-2x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vadd-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vaddc-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmul-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vbinary/gen/vmulc-minmax-f16c-x16.c", "XNNPACK/wrappers/f16-vclamp/gen/vclamp-f16c-x16.c", "XNNPACK/wrappers/f16-vhswish/gen/vhswish-f16c-x16.c", "XNNPACK/wrappers/f16-vlrelu/gen/vlrelu-f16c-x16.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-f16c-x16.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_xop", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mxop"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows-x86_64", ["-Drestrict="]], ["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c", "XNNPACK/wrappers/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-xop-mul16-add16.c", 
"XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-xop-mul16-add16.c", "XNNPACK/wrappers/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-xop-mul32.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-xop-mul32.c", "XNNPACK/wrappers/qu8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_scalar_aarch32", + srcs = ["XNNPACK/wrappers/f16-f32-vcvt/gen/vcvt-scalar-x4.c", "XNNPACK/wrappers/f32-argmaxpool/4x-scalar-c1.c", "XNNPACK/wrappers/f32-argmaxpool/9p8x-scalar-c1.c", "XNNPACK/wrappers/f32-argmaxpool/9x-scalar-c1.c", "XNNPACK/wrappers/f32-avgpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-avgpool/9x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-conv-hwc/3x3s2p0p1c3x4-scalar-1x1.c", "XNNPACK/wrappers/f32-conv-hwc/3x3s2p1c3x4-scalar-1x1.c", "XNNPACK/wrappers/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x3-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x3-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x4-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x4-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x9-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x9-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x25-minmax-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv/gen/up1x25-scalar-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3p1-minmax-scalar-4x1.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/3x3s2p1-minmax-scalar-2x1-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5p2-minmax-scalar-2x1-acc2.c", "XNNPACK/wrappers/f32-dwconv2d-chw/gen/5x5s2p2-minmax-scalar-2x1-acc2.c", "XNNPACK/wrappers/f32-f16-vcvt/gen/vcvt-scalar-fabsf-x2.c", "XNNPACK/wrappers/f32-gavgpool-cw/scalar-x1.c", "XNNPACK/wrappers/f32-gavgpool/7p7x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-gavgpool/7x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-gemm/gen/1x4-minmax-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/1x4-relu-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/1x4-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-minmax-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x2-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x4-minmax-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x4-relu-scalar.c", "XNNPACK/wrappers/f32-gemm/gen/4x4-scalar.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/scalar-p4.c", "XNNPACK/wrappers/f32-ibilinear/gen/scalar-c2.c", "XNNPACK/wrappers/f32-igemm/gen/1x4-minmax-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/1x4-relu-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/1x4-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-minmax-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x2-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x4-minmax-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x4-relu-scalar.c", "XNNPACK/wrappers/f32-igemm/gen/4x4-scalar.c", "XNNPACK/wrappers/f32-maxpool/9p8x-minmax-scalar-c1.c", 
"XNNPACK/wrappers/f32-pavgpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-pavgpool/9x-minmax-scalar-c1.c", "XNNPACK/wrappers/f32-prelu/gen/scalar-2x4.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-scalar-imagic-x4.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-scalar-imagic-x4.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/scalar-rr2-p5-x4-acc2.c", "XNNPACK/wrappers/f32-rmax/scalar.c", "XNNPACK/wrappers/f32-spmm/gen/8x1-minmax-scalar.c", "XNNPACK/wrappers/f32-spmm/gen/8x2-minmax-scalar.c", "XNNPACK/wrappers/f32-spmm/gen/8x4-minmax-scalar.c", "XNNPACK/wrappers/f32-vbinary/gen/vadd-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vaddc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vdiv-minmax-scalar-x2.c", "XNNPACK/wrappers/f32-vbinary/gen/vdivc-minmax-scalar-x2.c", "XNNPACK/wrappers/f32-vbinary/gen/vmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmaxc-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmin-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vminc-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmul-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vmulc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vrdivc-minmax-scalar-x2.c", "XNNPACK/wrappers/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiff-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsqrdiffc-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsub-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vbinary/gen/vsubc-minmax-scalar-x8.c", "XNNPACK/wrappers/f32-vclamp/gen/vclamp-scalar-x4.c", "XNNPACK/wrappers/f32-velu/gen/velu-scalar-rr2-lut16-p3-x4.c", "XNNPACK/wrappers/f32-vhswish/gen/vhswish-scalar-x4.c", "XNNPACK/wrappers/f32-vlrelu/gen/vlrelu-scalar-x4.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c1-minmax-scalar-2x.c", "XNNPACK/wrappers/f32-vrelu/gen/vrelu-scalar-x8.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndd-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndne-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndu-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vrnd/gen/vrndz-scalar-libm-x1.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-scalar-rr2-lut64-p2-div-x2.c", "XNNPACK/wrappers/f32-vsqrt/gen/scalar-sqrt-x1.c", "XNNPACK/wrappers/f32-vunary/gen/vabs-scalar-x4.c", "XNNPACK/wrappers/f32-vunary/gen/vneg-scalar-x4.c", "XNNPACK/wrappers/f32-vunary/gen/vsqr-scalar-x4.c", "XNNPACK/wrappers/qc8-dwconv/gen/up2x9-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-dwconv/gen/up2x25-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c", "XNNPACK/wrappers/qc8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c", "XNNPACK/wrappers/qc8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-dwconv/gen/up1x25-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-scalar-x4.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qs8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qs8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-scalar-x1.c", 
"XNNPACK/wrappers/qs8-vaddc/gen/minmax-scalar-x1.c", "XNNPACK/wrappers/qs8-vmul/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/qs8-vmulc/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/qu8-avgpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/qu8-avgpool/9x-minmax-scalar-c1.c", "XNNPACK/wrappers/qu8-dwconv/gen/up1x9-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-dwconv/gen/up1x25-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-scalar-x4.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qu8-gavgpool/gen/7x-minmax-fp32-scalar-imagic-c1.c", "XNNPACK/wrappers/qu8-gemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-gemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-igemm/gen/1x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-igemm/gen/2x2-minmax-fp32-scalar-fmagic.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-scalar-x1.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-scalar-x1.c", "XNNPACK/wrappers/qu8-vmul/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/qu8-vmulc/gen/minmax-fp32-scalar-x4.c", "XNNPACK/wrappers/s8-ibilinear/gen/scalar-c1.c", "XNNPACK/wrappers/s8-maxpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/s8-vclamp/scalar-x4.c", "XNNPACK/wrappers/u8-ibilinear/gen/scalar-c1.c", "XNNPACK/wrappers/u8-maxpool/9p8x-minmax-scalar-c1.c", "XNNPACK/wrappers/u8-rmax/scalar.c", "XNNPACK/wrappers/u8-vclamp/scalar-x4.c", "XNNPACK/wrappers/xx-fill/scalar-x16.c", "XNNPACK/wrappers/xx-pad/scalar.c", "XNNPACK/wrappers/x8-zip/xm-scalar.c", "XNNPACK/wrappers/x8-zip/x2-scalar.c", "XNNPACK/wrappers/x8-zip/x3-scalar.c", "XNNPACK/wrappers/x8-zip/x4-scalar.c", "XNNPACK/wrappers/x32-packx/x2-scalar.c", "XNNPACK/wrappers/x32-packx/x3-scalar.c", "XNNPACK/wrappers/x32-packx/x4-scalar.c", "XNNPACK/wrappers/x32-unpool/scalar.c", "XNNPACK/wrappers/x32-zip/xm-scalar.c", "XNNPACK/wrappers/x32-zip/x2-scalar.c", "XNNPACK/wrappers/x32-zip/x3-scalar.c", "XNNPACK/wrappers/x32-zip/x4-scalar.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["^(android-armv7|iphoneos-armv7)$", ["-march=armv7-a", "-mfpu=neon", "-mfloat-abi=softfp"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_neon_fma", + srcs = ["XNNPACK/wrappers/f32-dwconv/gen/up8x3-minmax-neonfma.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x4-minmax-neonfma.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x9-minmax-neonfma.c", "XNNPACK/wrappers/f32-dwconv/gen/up8x25-minmax-neonfma-acc2.c", "XNNPACK/wrappers/f32-gemm/gen/1x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-gemm/gen/6x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-ibilinear-chw/gen/neonfma-p8.c", "XNNPACK/wrappers/f32-ibilinear/gen/neonfma-c8.c", "XNNPACK/wrappers/f32-igemm/gen/1x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-igemm/gen/6x8s4-minmax-neonfma.c", "XNNPACK/wrappers/f32-raddstoreexpminusmax/gen/neonfma-rr1-lut64-p2-x16.c", "XNNPACK/wrappers/f32-spmm/gen/32x1-minmax-neonfma-pipelined.c", 
"XNNPACK/wrappers/f32-velu/gen/velu-neonfma-rr1-lut16-p3-x16.c", "XNNPACK/wrappers/f32-velu/gen/velu-neonfma-rr1-p6-x8.c", "XNNPACK/wrappers/f32-vmulcaddc/gen/c4-minmax-neonfma-2x.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-neonfma-rr1-lut64-p2-nr2recps-x16.c"], + deps = [":interface", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["arm", ["-mfpu=neon-vfpv4"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_avx2", + srcs = [], + deps = [":interface"], + exported_deps = [], + compiler_flags = ["-w", "-O2", "-mavx2", "-mfma", "-mf16c"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["x86", ["-mavx2", "-mfma", "-mf16c"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + platform_srcs = [["x86|x86_64|platform009", ["XNNPACK/wrappers/f16-gemm/gen/1x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f16-gemm/gen/4x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f16-igemm/gen/1x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f16-igemm/gen/4x16-minmax-avx2-broadcast.c", "XNNPACK/wrappers/f32-qs8-vcvt/gen/vcvt-avx2-x64.c", "XNNPACK/wrappers/f32-qu8-vcvt/gen/vcvt-avx2-x64.c", "XNNPACK/wrappers/f32-velu/gen/velu-avx2-rr1-lut4-p4-perm-x56.c", "XNNPACK/wrappers/f32-vsigmoid/gen/vsigmoid-avx2-rr1-p5-div-x40.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qc8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qc8-gemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qc8-igemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qs8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qs8-f32-vcvt/gen/vcvt-avx2-x16.c", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-igemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x9-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qu8-dwconv/gen/up16x25-minmax-fp32-avx2-mul32.c", "XNNPACK/wrappers/qu8-f32-vcvt/gen/vcvt-avx2-x16.c", "XNNPACK/wrappers/qu8-gemm/gen/1x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qu8-igemm/gen/1x8c8-minmax-fp32-avx2.c", 
"XNNPACK/wrappers/qu8-igemm/gen/3x8c8-minmax-fp32-avx2.c", "XNNPACK/wrappers/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c", "XNNPACK/wrappers/x8-lut/gen/lut-avx2-x128.c"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) + + cxx_library( + name = "ukernels_asm_aarch64", + srcs = ["XNNPACK/wrappers/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen-inc/4x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen-inc/4x16inc-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-cortex-a55.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-cortex-a75.S", "XNNPACK/wrappers/f16-gemm/gen-inc/6x16inc-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen-inc/8x8inc-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/1x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/1x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen/4x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/4x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen/6x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a55.S", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-cortex-a75.S", "XNNPACK/wrappers/f16-gemm/gen/6x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f16-gemm/gen/8x8-minmax-aarch64-neonfp16arith-ld64.S", "XNNPACK/wrappers/f16-igemm/4x16-minmax-aarch64-neonfp16arith-ld32.S", "XNNPACK/wrappers/f32-dwconv/up4x9-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-dwconv/up4x9-minmax-aarch64-neonfma.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/1x12inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/4x12inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/5x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a73.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld64.S", 
"XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen-inc/6x8inc-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/1x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/4x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a73.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-gemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld64.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld128.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S", "XNNPACK/wrappers/f32-igemm/1x8-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/1x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/4x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-igemm/4x12-minmax-aarch64-neonfma-cortex-a53.S", "XNNPACK/wrappers/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a55.S", "XNNPACK/wrappers/f32-igemm/6x8-minmax-aarch64-neonfma-cortex-a73.S", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", 
"XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qc8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S", "XNNPACK/wrappers/qc8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qc8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qc8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S", 
"XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qs8-gemm/gen/1x16c4-minmax-rndnu-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-fp32-aarch64-neon-mull.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mull.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld32.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal-prfm.S", 
"XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal-prfm.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S", "XNNPACK/wrappers/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S", 
"XNNPACK/wrappers/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S", "XNNPACK/wrappers/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S"], + deps = [":interface", ":jit_memory", "//third_party:FP16"], + exported_deps = [], + compiler_flags = ["-w", "-O2"], + preferred_linkage = "static", + exported_preprocessor_flags = [], + header_namespace = "", + headers = subdir_glob([("XNNPACK/src", "**/*.S"), ("XNNPACK/src", "**/*.c"), ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h")]), + linker_flags = [], + platform_compiler_flags = [["(aarch64|arm64)", ["-march=armv8.2-a+fp16+dotprod"]]], + platform_linker_flags = [], + platform_preprocessor_flags = [["windows", ["-D_WINDOWS", "-D_WIN32", "-DWIN32", "-DNOMINMAX", "-D_CRT_SECURE_NO_WARNINGS", "-D_USE_MATH_DEFINES"]], ["windows.*64$", ["-D_WIN64"]]], + preprocessor_flags = ["-DXNN_LOG_LEVEL=0"], + soname = "", + visibility = ["PUBLIC"], + ) diff --git a/tools/README.md b/tools/README.md index e4aba38afd85..94b61eecce82 100644 --- a/tools/README.md +++ b/tools/README.md @@ -33,41 +33,14 @@ Build system pieces: Developer tools which you might find useful: -* [linter/clang_tidy](linter/clang_tidy/__main__.py) - Script for running clang-tidy - on lines of your script which you changed. -* [extract_scripts.py](extract_scripts.py) - Extract scripts from - `.github/workflows/*.yml` into a specified dir, on which linters such as - [linter/run_shellcheck.sh](linter/run_shellcheck.sh) can be run. Assumes that every `run` - script has `shell: bash` unless a different shell is explicitly listed on that - specific step (so `defaults` doesn't currently work), but also has some rules - for other situations such as [actions/github-script][]. Exits with nonzero - status if any of the extracted scripts contain [GitHub Actions expressions][]: - `${{ }}` * [git_add_generated_dirs.sh](git_add_generated_dirs.sh) and [git_reset_generated_dirs.sh](git_reset_generated_dirs.sh) - Use this to force add generated files to your Git index, so that you can conveniently run diffs on them when working on code-generation. (See also [generated_dirs.txt](generated_dirs.txt) which specifies the list of directories with generated files.) -* [linter/mypy_wrapper.py](linter/mypy_wrapper.py) - Run `mypy` on a single file using the - appropriate subset of our `mypy*.ini` configs. -* [linter/run_shellcheck.sh](linter/run_shellcheck.sh) - Find `*.sh` files (recursively) in - the directories specified as arguments, and run [ShellCheck][] on all of them. * [stats/test_history.py](stats/test_history.py) - Query S3 to display history of a single test across multiple jobs over time. -* [linter/trailing_newlines.py](linter/trailing_newlines.py) - Take names of UTF-8 files from - stdin, print names of nonempty files whose contents don't end in exactly one - trailing newline, exit with status 1 if no output printed or 0 if some - filenames were printed. -* [linter/translate_annotations.py](linter/translate_annotations.py) - Read [Flake8][] or - [clang-tidy][] warnings (according to a `--regex`) from a `--file`, convert to - the JSON format accepted by [pytorch/add-annotations-github-action], and - translate line numbers from `HEAD` back in time to the given `--commit` by - running `git diff-index --unified=0` appropriately. 
-* [vscode_settings.py](vscode_settings.py) - Merge - `.vscode/settings_recommended.json` into your workspace-local - `.vscode/settings.json`, preferring the former in case of conflicts but - otherwise preserving the latter as much as possible. Important if you want to run on AMD GPU: diff --git a/tools/actions_local_runner.py b/tools/actions_local_runner.py deleted file mode 100755 index 050905934133..000000000000 --- a/tools/actions_local_runner.py +++ /dev/null @@ -1,440 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import subprocess -import sys -import os -import argparse -import yaml -import asyncio -import shutil -import re -import fnmatch -import shlex -import configparser - -from typing import List, Dict, Any, Optional, Union, NamedTuple, Set - -REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - -class col: - HEADER = "\033[95m" - BLUE = "\033[94m" - GREEN = "\033[92m" - YELLOW = "\033[93m" - RED = "\033[91m" - RESET = "\033[0m" - BOLD = "\033[1m" - UNDERLINE = "\033[4m" - - -def should_color() -> bool: - return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() - - -def color(the_color: str, text: str) -> str: - if should_color(): - return col.BOLD + the_color + str(text) + col.RESET - else: - return text - - -def cprint(the_color: str, text: str) -> None: - if should_color(): - print(color(the_color, text)) - else: - print(text) - - -def git(args: List[str]) -> List[str]: - p = subprocess.run( - ["git"] + args, - cwd=REPO_ROOT, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=True, - ) - lines = p.stdout.decode().strip().split("\n") - return [line.strip() for line in lines] - - -def find_changed_files() -> List[str]: - untracked = [] - - for line in git(["status", "--porcelain"]): - # Untracked files start with ??, so grab all of those - if line.startswith("?? "): - untracked.append(line.replace("?? 
", "")) - - # Modified, unstaged - modified = git(["diff", "--name-only"]) - - # Modified, staged - cached = git(["diff", "--cached", "--name-only"]) - - # Committed - merge_base = git(["merge-base", "origin/master", "HEAD"])[0] - diff_with_origin = git(["diff", "--name-only", merge_base, "HEAD"]) - - # De-duplicate - all_files = set() - for x in untracked + cached + modified + diff_with_origin: - stripped = x.strip() - if stripped != "" and os.path.exists(stripped): - all_files.add(stripped) - return list(all_files) - - -def print_results(job_name: str, passed: bool, streams: List[str]) -> None: - icon = color(col.GREEN, "✓") if passed else color(col.RED, "x") - print(f"{icon} {color(col.BLUE, job_name)}") - - for stream in streams: - stream = stream.strip() - if stream != "": - print(stream) - - -class CommandResult(NamedTuple): - passed: bool - stdout: str - stderr: str - - -async def shell_cmd( - cmd: Union[str, List[str]], - env: Optional[Dict[str, Any]] = None, - redirect: bool = True, -) -> CommandResult: - if isinstance(cmd, list): - cmd_str = " ".join(shlex.quote(arg) for arg in cmd) - else: - cmd_str = cmd - - proc = await asyncio.create_subprocess_shell( - cmd_str, - shell=True, - cwd=REPO_ROOT, - env=env, - stdout=subprocess.PIPE if redirect else None, - stderr=subprocess.PIPE if redirect else None, - executable=shutil.which("bash"), - ) - stdout, stderr = await proc.communicate() - - passed = proc.returncode == 0 - if not redirect: - return CommandResult(passed, "", "") - - return CommandResult(passed, stdout.decode().strip(), stderr.decode().strip()) - - -class Check: - name: str - - def __init__(self, files: Optional[List[str]], quiet: bool): - self.quiet = quiet - self.files = files - - async def run(self) -> bool: - result = await self.run_helper() - if result is None: - return True - - streams = [] - if not result.passed: - streams = [ - result.stderr, - result.stdout, - ] - print_results(self.name, result.passed, streams) - return result.passed - - async def run_helper(self) -> Optional[CommandResult]: - if self.files is not None: - relevant_files = self.filter_files(self.files) - if len(relevant_files) == 0: - # No files, do nothing - return CommandResult(passed=True, stdout="", stderr="") - - return await self.quick(relevant_files) - - return await self.full() - - def filter_ext(self, files: List[str], extensions: Set[str]) -> List[str]: - def passes(filename: str) -> bool: - return os.path.splitext(filename)[1] in extensions - - return [f for f in files if passes(f)] - - def filter_files(self, files: List[str]) -> List[str]: - return files - - async def quick(self, files: List[str]) -> CommandResult: - raise NotImplementedError - - async def full(self) -> Optional[CommandResult]: - raise NotImplementedError - - -class Flake8(Check): - name = "flake8" - - def filter_files(self, files: List[str]) -> List[str]: - config = configparser.ConfigParser() - config.read(os.path.join(REPO_ROOT, ".flake8")) - - excludes = re.split(r",\s*", config["flake8"]["exclude"].strip()) - excludes = [e.strip() for e in excludes if e.strip() != ""] - - def should_include(name: str) -> bool: - for exclude in excludes: - if fnmatch.fnmatch(name, pat=exclude): - return False - if name.startswith(exclude) or f"./{name}".startswith(exclude): - return False - return True - - files = self.filter_ext(files, {".py"}) - return [f for f in files if should_include(f)] - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd(["flake8"] + files) - - async def full(self) -> 
CommandResult: - return await shell_cmd(["flake8"]) - - -class Mypy(Check): - name = "mypy (skipped typestub generation)" - - def filter_files(self, files: List[str]) -> List[str]: - return self.filter_ext(files, {".py", ".pyi"}) - - def env(self) -> Dict[str, Any]: - env = os.environ.copy() - if should_color(): - # Secret env variable: https://github.com/python/mypy/issues/7771 - env["MYPY_FORCE_COLOR"] = "1" - return env - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd( - [sys.executable, "tools/linter/mypy_wrapper.py"] - + [os.path.join(REPO_ROOT, f) for f in files], - env=self.env(), - ) - - async def full(self) -> None: - env = self.env() - # hackily change the name - self.name = "mypy" - - await shell_cmd( - [ - sys.executable, - "tools/actions_local_runner.py", - "--job", - "mypy", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run autogen", - ], - redirect=False, - env=env, - ) - - await shell_cmd( - [ - sys.executable, - "tools/actions_local_runner.py", - "--job", - "mypy", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run mypy", - ], - redirect=False, - env=env, - ) - - -class ShellCheck(Check): - name = "shellcheck: Run ShellCheck" - - def filter_files(self, files: List[str]) -> List[str]: - return self.filter_ext(files, {".sh"}) - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd( - ["tools/linter/run_shellcheck.sh"] - + [os.path.join(REPO_ROOT, f) for f in files], - ) - - async def full(self) -> None: - await shell_cmd( - [ - sys.executable, - "tools/actions_local_runner.py", - "--job", - "shellcheck", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run ShellCheck", - ], - redirect=False, - ) - - -class ClangTidy(Check): - name = "clang-tidy: Run clang-tidy" - common_options = [ - "--clang-tidy-exe", - ".clang-tidy-bin/clang-tidy", - ] - - def filter_files(self, files: List[str]) -> List[str]: - return self.filter_ext(files, {".c", ".cc", ".cpp"}) - - async def quick(self, files: List[str]) -> CommandResult: - return await shell_cmd( - [sys.executable, "-m", "tools.linter.clang_tidy", "--paths"] - + [os.path.join(REPO_ROOT, f) for f in files] - + self.common_options, - ) - - async def full(self) -> None: - await shell_cmd( - [sys.executable, "-m", "tools.linter.clang_tidy"] + self.common_options, - redirect=False, - ) - - -class YamlStep(Check): - def __init__(self, step: Dict[str, Any], job_name: str, quiet: bool): - super().__init__(files=None, quiet=quiet) - self.step = step - self.name = f'{job_name}: {self.step["name"]}' - - async def full(self) -> CommandResult: - env = os.environ.copy() - env["GITHUB_WORKSPACE"] = "/tmp" - script = self.step["run"] - - if self.quiet: - # TODO: Either lint that GHA scripts only use 'set -eux' or make this more - # resilient - script = script.replace("set -eux", "set -eu") - script = re.sub(r"^time ", "", script, flags=re.MULTILINE) - - return await shell_cmd(script, env=env) - - -def changed_files() -> Optional[List[str]]: - changed_files: Optional[List[str]] = None - try: - changed_files = sorted(find_changed_files()) - except Exception: - # If the git commands failed for some reason, bail out and use the whole list - print( - "Could not query git for changed files, falling back to testing all files instead", - file=sys.stderr, - ) - return None - - return changed_files - - -def grab_specific_steps( - steps_to_grab: List[str], job: Dict[str, Any] -) -> List[Dict[str, Any]]: - relevant_steps = [] - for step in steps_to_grab: - for 
actual_step in job["steps"]: - if actual_step["name"].lower().strip() == step.lower().strip(): - relevant_steps.append(actual_step) - break - - if len(relevant_steps) != len(steps_to_grab): - raise RuntimeError(f"Missing steps:\n{relevant_steps}\n{steps_to_grab}") - - return relevant_steps - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Pull shell scripts out of GitHub actions and run them" - ) - parser.add_argument("--file", help="YAML file with actions") - parser.add_argument( - "--changed-only", - help="only run on changed files", - action="store_true", - default=False, - ) - parser.add_argument("--job", help="job name", required=True) - parser.add_argument( - "--no-quiet", help="output commands", action="store_true", default=False - ) - parser.add_argument("--step", action="append", help="steps to run (in order)") - args = parser.parse_args() - - quiet = not args.no_quiet - - if args.file is None: - # If there is no .yml file provided, fall back to the list of known - # jobs. We use this for flake8 and mypy since they run different - # locally than in CI due to 'make quicklint' - if args.job not in ad_hoc_steps: - raise RuntimeError( - f"Job {args.job} not found and no .yml file was provided" - ) - - files = None - if args.changed_only: - files = changed_files() - - checks = [ad_hoc_steps[args.job](files, quiet)] - else: - if args.step is None: - raise RuntimeError("1+ --steps must be provided") - - action = yaml.safe_load(open(args.file, "r")) - if "jobs" not in action: - raise RuntimeError(f"top level key 'jobs' not found in {args.file}") - jobs = action["jobs"] - - if args.job not in jobs: - raise RuntimeError(f"job '{args.job}' not found in {args.file}") - - job = jobs[args.job] - - # Pull the relevant sections out of the provided .yml file and run them - relevant_steps = grab_specific_steps(args.step, job) - checks = [ - YamlStep(step=step, job_name=args.job, quiet=quiet) - for step in relevant_steps - ] - - loop = asyncio.get_event_loop() - loop.run_until_complete(asyncio.gather(*[check.run() for check in checks])) - - -# These are run differently locally in order to enable quicklint, so dispatch -# out to special handlers instead of using lint.yml -ad_hoc_steps = { - "mypy": Mypy, - "flake8-py3": Flake8, - "shellcheck": ShellCheck, - "clang-tidy": ClangTidy, -} - -if __name__ == "__main__": - try: - main() - except KeyboardInterrupt: - pass diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 38698631c03c..ca41b17a43e7 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -4,43 +4,50 @@ import os import argparse import sys -sys.path.append(os.path.realpath(os.path.join( - __file__, - os.path.pardir, - os.path.pardir, - os.path.pardir, - 'torch', - 'utils'))) + +sys.path.append( + os.path.realpath( + os.path.join( + __file__, os.path.pardir, os.path.pardir, os.path.pardir, "torch", "utils" + ) + ) +) from hipify import hipify_python # type: ignore[import] -parser = argparse.ArgumentParser(description='Top-level script for HIPifying, filling in most common parameters') +parser = argparse.ArgumentParser( + description="Top-level script for HIPifying, filling in most common parameters" +) parser.add_argument( - '--out-of-place-only', - action='store_true', - help="Whether to only run hipify out-of-place on source files") + "--out-of-place-only", + action="store_true", + help="Whether to only run hipify out-of-place on source files", +) parser.add_argument( - '--project-directory', + 
"--project-directory", type=str, - default='', + default="", help="The root of the project.", - required=False) + required=False, +) parser.add_argument( - '--output-directory', + "--output-directory", type=str, - default='', + default="", help="The directory to store the hipified project", - required=False) + required=False, +) parser.add_argument( - '--extra-include-dir', + "--extra-include-dir", type=str, default=[], - nargs='+', + nargs="+", help="The list of extra directories in caffe2 to hipify", - required=False) + required=False, +) args = parser.parse_args() @@ -78,8 +85,11 @@ "aten/src/ATen/cuda/*", "aten/src/ATen/native/cuda/*", "aten/src/ATen/native/cudnn/*", + "aten/src/ATen/native/quantized/cudnn/*", + "aten/src/ATen/native/nested/cuda/*", "aten/src/ATen/native/sparse/cuda/*", "aten/src/ATen/native/quantized/cuda/*", + "aten/src/ATen/native/transformers/cuda/*", "aten/src/THC/*", "aten/src/ATen/test/*", # CMakeLists.txt isn't processed by default, but there are a few @@ -89,16 +99,18 @@ "tools/autograd/templates/python_variable_methods.cpp", ] +includes = [os.path.join(proj_dir, include) for include in includes] + for new_dir in args.extra_include_dir: abs_new_dir = os.path.join(proj_dir, new_dir) if os.path.exists(abs_new_dir): - new_dir = os.path.join(new_dir, '**/*') - includes.append(new_dir) + abs_new_dir = os.path.join(abs_new_dir, "**/*") + includes.append(abs_new_dir) ignores = [ "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", "caffe2/operators/pool_op_cudnn.cu", - '*/hip/*', + "*/hip/*", # These files are compatible with both cuda and hip "aten/src/ATen/core/*", "torch/csrc/jit/codegen/cuda/codegen.cpp", @@ -112,15 +124,18 @@ "torch/include/*", ] +ignores = [os.path.join(proj_dir, ignore) for ignore in ignores] + # Check if the compiler is hip-clang. def is_hip_clang() -> bool: try: - hip_path = os.getenv('HIP_PATH', '/opt/rocm/hip') - with open(hip_path + '/lib/.hipInfo') as f: - return 'HIP_COMPILER=clang' in f.read() + hip_path = os.getenv("HIP_PATH", "/opt/rocm/hip") + with open(hip_path + "/lib/.hipInfo") as f: + return "HIP_COMPILER=clang" in f.read() except IOError: return False + # TODO Remove once gloo submodule is recent enough to contain upstream fix. 
if is_hip_clang(): gloo_cmake_file = "third_party/gloo/cmake/Hip.cmake" @@ -128,7 +143,7 @@ def is_hip_clang() -> bool: if os.path.exists(gloo_cmake_file): with open(gloo_cmake_file, "r") as sources: lines = sources.readlines() - newlines = [line.replace(' hip_hcc ', ' amdhip64 ') for line in lines] + newlines = [line.replace(" hip_hcc ", " amdhip64 ") for line in lines] if lines == newlines: print("%s skipped" % gloo_cmake_file) else: @@ -142,7 +157,7 @@ def is_hip_clang() -> bool: do_write = False with open(gloo_cmake_file, "r") as sources: lines = sources.readlines() - newlines = [line.replace('RCCL_LIBRARY', 'RCCL_LIBRARY_PATH') for line in lines] + newlines = [line.replace("RCCL_LIBRARY", "RCCL_LIBRARY_PATH") for line in lines] if lines == newlines: print("%s skipped" % gloo_cmake_file) else: @@ -158,7 +173,7 @@ def is_hip_clang() -> bool: if os.path.exists(gloo_cmake_file): with open(gloo_cmake_file, "r") as sources: lines = sources.readlines() - newlines = [line.replace('HIP_HCC_FLAGS', 'HIP_CLANG_FLAGS') for line in lines] + newlines = [line.replace("HIP_HCC_FLAGS", "HIP_CLANG_FLAGS") for line in lines] if lines == newlines: print("%s skipped" % gloo_cmake_file) else: @@ -173,4 +188,5 @@ def is_hip_clang() -> bool: includes=includes, ignores=ignores, out_of_place_only=args.out_of_place_only, - hip_clang_launch=is_hip_clang()) + hip_clang_launch=is_hip_clang(), +) diff --git a/tools/autograd/BUILD.bazel b/tools/autograd/BUILD.bazel new file mode 100644 index 000000000000..d1a0db360d23 --- /dev/null +++ b/tools/autograd/BUILD.bazel @@ -0,0 +1,4 @@ +load("//:tools/bazel.bzl", "rules") +load(":build.bzl", "define_targets") + +define_targets(rules = rules) diff --git a/tools/autograd/BUILD.buck b/tools/autograd/BUILD.buck new file mode 100644 index 000000000000..aedc8fa342b4 --- /dev/null +++ b/tools/autograd/BUILD.buck @@ -0,0 +1,34 @@ +python_library( + name = "autograd", + srcs = glob( + ["*.py"], + ), + base_module = "tools.autograd", + resources = [ + "deprecated.yaml", + "derivatives.yaml", + "templates/ADInplaceOrViewType.cpp", + "templates/Functions.cpp", + "templates/Functions.h", + "templates/TraceType.cpp", + "templates/VariableType.cpp", + "templates/VariableType.h", + "templates/annotated_fn_args.py.in", + "templates/python_fft_functions.cpp", + "templates/python_functions.cpp", + "templates/python_functions.h", + "templates/python_linalg_functions.cpp", + "templates/python_nn_functions.cpp", + "templates/python_return_types.cpp", + "templates/python_sparse_functions.cpp", + "templates/python_special_functions.cpp", + "templates/python_torch_functions.cpp", + "templates/python_variable_methods.cpp", + "templates/variable_factories.h", + ], + visibility = ["PUBLIC"], + deps = [ + "//third_party:pyyaml", + "//torchgen:torchgen", + ], +) diff --git a/tools/autograd/build.bzl b/tools/autograd/build.bzl new file mode 100644 index 000000000000..a21ca870708c --- /dev/null +++ b/tools/autograd/build.bzl @@ -0,0 +1,14 @@ +def define_targets(rules): + rules.py_library( + name = "autograd", + srcs = rules.glob(["*.py"]), + data = rules.glob([ + "*.yaml", + "templates/*", + ]), + visibility = ["//:__subpackages__"], + deps = [ + rules.requirement("PyYAML"), + "//torchgen:torchgen", + ], + ) diff --git a/tools/autograd/context.py b/tools/autograd/context.py index 66f4f81aa0fb..af1a6025ed8d 100644 --- a/tools/autograd/context.py +++ b/tools/autograd/context.py @@ -1,15 +1,18 @@ -from tools.codegen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI -from 
tools.codegen.context import native_function_manager -from tools.codegen.utils import T +from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI +from torchgen.context import native_function_manager +from torchgen.utils import T import functools from typing import Callable # Like tools.api.context.with_native_function, but for # NativeFunctionWithDifferentiabilityInfo. -def with_native_function_with_differentiability_info(func: Callable[[NFWDI], T]) -> Callable[[NFWDI], T]: +def with_native_function_with_differentiability_info( + func: Callable[[NFWDI], T] +) -> Callable[[NFWDI], T]: @functools.wraps(func) def wrapper(f: NFWDI) -> T: with native_function_manager(f.func): return func(f) + return wrapper diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 582ba69c3623..75aec440808e 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -230,14 +230,14 @@ - name: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta.conj()) - mat1: mm_mat1_backward(grad, mat2, mat1.sizes(), mat1.strides(), alpha) - mat2: mm_mat2_backward(grad, mat1, mat2.sizes(), mat2.strides(), alpha) + mat1: mm_mat1_backward(grad, mat2, mat1.sizes(), mat1.strides(), mat1.layout(), alpha) + mat2: mm_mat2_backward(grad, mat1, mat2.sizes(), mat2.strides(), mat2.layout(), alpha) result: maybe_multiply(self_t, beta) + maybe_multiply(mat1_t.mm(mat2_p), alpha) + maybe_multiply(mat1_p.mm(mat2_t), alpha) -- name: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor +- name: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta) - sparse: _sparse_addmm_sparse_backward(grad, sparse, dense, alpha) - dense: mm_mat2_backward(grad, sparse, dense.sizes(), dense.strides(), alpha) + mat1: mm_mat1_sparse_backward(grad, mat1, mat2, alpha) + mat2: mm_mat2_backward(grad, mat1, mat2.sizes(), mat2.strides(), mat2.layout(), alpha) - name: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta.conj()) @@ -260,7 +260,7 @@ - name: angle(Tensor self) -> Tensor self: angle_backward(grad, self) - result: handle_r_to_c(result.scalar_type(), angle_backward(self_t, self_p)) + result: handle_r_to_c(result.scalar_type(), angle_backward(self_t.conj(), self_p).conj()) # The four items below are necessary because TensorIterator doesn't work on # Variables (codegen does not unwrap the input Tensor for all() and any() ). @@ -315,6 +315,7 @@ - name: atan2(Tensor self, Tensor other) -> Tensor self, other: atan2_backward(grad, self, other, grad_input_mask) + result: (-self_p * other_t + other_p * self_t) / (self_p.pow(2) + other_p.pow(2)) - name: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor self: maybe_multiply(grad, beta.conj()) @@ -365,12 +366,14 @@ - name: cholesky_inverse(Tensor self, bool upper=False) -> Tensor self: cholesky_inverse_backward(grad, self, upper, result) + result: cholesky_inverse_jvp(self_p, self_t, result, upper) # For clamp, gradient is not defined at the boundaries. But empirically it's helpful # to be able to get gradient on min and max, so we return the subgradient 1 for these cases. - name: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? 
max=None) -> Tensor self: clamp_backward(grad, self, min, max) min, max: clamp_backward_min_max(grad, self, min, max, grad_input_mask) + result: clamp_jvp(self_p, self_t, min_p, min_t, max_p, max_t) - name: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor self: clamp_backward(grad, self, min, max) @@ -383,7 +386,7 @@ - name: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor self: where(self >= min, grad, at::scalar_tensor(0., grad.options())) min: where(self < min, grad, at::scalar_tensor(0., grad.options())) - result: where(self_p >= min_p, self_t, at::scalar_tensor(0., self_p.options())) + where(self_p < min_p, min_t, at::scalar_tensor(0., self_p.options())) + result: where(self_p >= min_p, self_t, min_t) - name: clamp_max(Tensor self, Scalar max) -> Tensor self: where(self <= max, grad, at::scalar_tensor(0., grad.options())) @@ -392,14 +395,14 @@ - name: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor self: where(self <= max, grad, at::scalar_tensor(0., grad.options())) max: where(self > max, grad, at::scalar_tensor(0., grad.options())) - result: where(self_p <= max_p, self_t, at::scalar_tensor(0., self_p.options())) + where(self_p > max_p, max_t, at::scalar_tensor(0., self_p.options())) + result: where(self_p <= max_p, self_t, max_t) - name: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor self: grad result: auto_linear - name: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor - self: grad.to(self.options(), /*non_blocking*/false, /*copy*/false) + self: _to_copy_backward(grad, self.options()) result: _to_copy(self_t, dtype, layout, device, pin_memory, non_blocking, memory_format) # The condition is: if dtype is not nullopt, then isDifferentiableType(*dtype) # (If dtype IS nullopt, we rely on the regular check that any input requires grad). @@ -415,6 +418,7 @@ - name: polar(Tensor abs, Tensor angle) -> Tensor abs, angle: polar_backward(grad, result) + result: at::complex(abs_t*angle_p.cos() - angle_t*abs_p*angle_p.sin(), abs_t*angle_p.sin() + angle_t*abs_p*angle_p.cos()) - name: _conj(Tensor(a) self) -> Tensor(a) self: grad.conj() @@ -512,6 +516,7 @@ - name: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor self: norm_backward(grad, self - other, p, result) other: -norm_backward(grad, self - other, p, result) + result: norm_jvp(self_p - other_p, self_t - other_t, p, result, {}, false) # The backward formula is done in this order to improve numerical stability # of the higher order derivatives, see https://github.com/pytorch/pytorch/issues/43414 @@ -549,6 +554,7 @@ - name: native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor) input: "GradMode::is_enabled() ? infinitely_differentiable_native_dropout_backward(grad, result1, (!train.has_value() || !train.value() ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p)))) : native_dropout_backward(grad, result1, (!train.has_value() || !train.value() ? 1 : (p == 1 ? 0.0 : 1.0 / (1.0 - p))))" + result0: "(!train.has_value() || train.value()) ? (p == 1 ? 
0.0 : 1.0 / (1.0 - p)) * input_t * result1 : input_t" - name: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor grad_output: "native_dropout_double_backward(grad, grad_output, mask, scale)" @@ -598,6 +604,10 @@ self: at::sum_to(grad, self.sizes()) result: auto_linear +- name: expand.SymInt(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a) + self: at::sum_to(grad, c10::expectIntArrayRef(self.sym_sizes())) + result: auto_linear + - name: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) self: zeros_like(grad) result: self_t.zero_() @@ -620,6 +630,15 @@ - name: _fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask) self: fake_quantize_per_tensor_affine_cachemask_backward(grad, mask) +- name: fill.Scalar(Tensor self, Scalar value) -> Tensor + self: zeros_like(grad) + result: at::fill(self_t, 0) + +- name: fill.Tensor(Tensor self, Tensor value) -> Tensor + self: zeros_like(grad) + value: grad.sum() + result: at::fill(self_t, value_t) + - name: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) self: zeros_like(grad) result: self_t.fill_(0) @@ -681,7 +700,7 @@ input, grid: "grad.defined() ? grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners, grad_input_mask) : std::tuple()" - name: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor - input, grid: "grad.defined() ? grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners) : std::tuple()" + input, grid: "grad.defined() ? 
grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode, align_corners, grad_input_mask) : std::tuple()" # See NOTE [ grid_sample CPU fallback ] - name: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor @@ -749,8 +768,12 @@ index: non_differentiable result: at::index_add(self_t, dim, index, maybe_multiply(source_t, alpha)) +- name: index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor + self, source: index_reduce_backward(grad, self, dim, index, source, reduce, include_self, result) + index: non_differentiable + - name: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - self: grad.clone().index_fill_(dim, index, 0) + self: grad.index_fill(dim, index, 0) # The case source.dim() == 0 is necessary to support scalar tensors of the form # source.dim() == 0 and index.dim() == 1 and index.size() == (1,), # This is because source is not broadcastable to index, as source.dim() < index.dim() @@ -910,6 +933,7 @@ - name: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor self: logsumexp_backward(grad, self, result, dim, keepdim) + result: logsumexp_jvp(self_p, self_t, dim, keepdim) - name: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) self: not_implemented("lstsq") @@ -930,17 +954,26 @@ result: self_t.zero_() - name: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info) - A: lu_factor_ex_backward(grad, A, LU, pivots) - LU: lu_factor_ex_jvp(A_t, LU, pivots) + A: lu_factor_ex_backward(grad, LU, pivots, pivot) + LU: lu_factor_ex_jvp(A_t, LU, pivots, pivot) output_differentiability: [True, False, False] +- name: linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U) + A: linalg_lu_backward(grad_L, grad_U, P, L, U, pivot) + L: std::get<0>(linalg_lu_jvp(A_t, P, L, U, pivot)) + U: std::get<1>(linalg_lu_jvp(A_t, P, L, U, pivot)) + output_differentiability: [False, True, True] + - name: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor self, LU_data: lu_solve_backward(grad, result, LU_data, LU_pivots, grad_input_mask) result: lu_solve_jvp(result, LU_data_p, LU_data_t, self_t, LU_pivots) - name: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U) - LU_data: lu_unpack_backward(grads, LU_data, unpack_data) + LU_data: lu_unpack_backward(grad_L, grad_U, LU_data.size(-2), LU_data.size(-1)) LU_pivots: non_differentiable + L: "LU_data_t.size(-2) >= LU_data_t.size(-1) ? LU_data_t.tril(-1) : LU_data_t.narrow(-1, 0, LU_data_t.size(-2)).tril(-1)" + U: "LU_data_t.size(-1) >= LU_data_t.size(-2) ? 
LU_data_t.triu() : LU_data_t.narrow(-2, 0, LU_data_t.size(-1)).triu()" + output_differentiability: [False, True, True] - name: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor self: grad.masked_fill(mask, 0) @@ -979,7 +1012,7 @@ - name: maximum(Tensor self, Tensor other) -> Tensor self: at::where(self == other, grad / 2, grad).masked_fill_(self < other, 0) other: at::where(self == other, grad / 2, grad).masked_fill_(self > other, 0) - result: other_t + at::where(self_p == other_p, 0.5, (self_p > other_p).to(result.scalar_type())) * (self_t - other_t) + result: other_t + at::where(self_p == other_p, at::scalar_tensor(0.5, result.options()), (self_p > other_p).to(result.scalar_type())) * (self_t - other_t) - name: fmax(Tensor self, Tensor other) -> Tensor self: grad.masked_fill((self >= other).logical_or_(other.isnan()).logical_not_(), 0) @@ -1035,7 +1068,7 @@ - name: minimum(Tensor self, Tensor other) -> Tensor self: at::where(self == other, grad / 2, grad).masked_fill_(self > other, 0) other: at::where(self == other, grad / 2, grad).masked_fill_(self < other, 0) - result: other_t + at::where(self_p == other_p, 0.5, (self_p < other_p).to(result.scalar_type())) * (self_t - other_t) + result: other_t + at::where(self_p == other_p, at::scalar_tensor(0.5, result.options()), (self_p < other_p).to(result.scalar_type())) * (self_t - other_t) - name: fmin(Tensor self, Tensor other) -> Tensor self: grad.masked_fill((self <= other).logical_or_(other.isnan()).logical_not_(), 0) @@ -1049,8 +1082,8 @@ self: scale_grad_by_count(restore_reduced_dims(grad, dim, keepdim), restore_reduced_dims(result, dim, keepdim) == self, dim) - name: mm(Tensor self, Tensor mat2) -> Tensor - self: mm_mat1_backward(grad, mat2, self.sizes(), self.strides(), 1) - mat2: mm_mat2_backward(grad, self, mat2.sizes(), mat2.strides(), 1) + self: mm_mat1_backward(grad, mat2, self.sizes(), self.strides(), self.layout(), 1) + mat2: mm_mat2_backward(grad, self, mat2.sizes(), mat2.strides(), mat2.layout(), 1) result: at::mm(self_t, mat2_p) + at::mm(self_p, mat2_t) - name: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -1123,18 +1156,23 @@ - name: norm.Scalar(Tensor self, Scalar p=2) -> Tensor self: norm_backward(grad, self, p, result) + result: norm_jvp(self_p, self_t, p, result) - name: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor self: norm_backward(grad, self, p, result, dim, keepdim) + result: norm_jvp(self_p, self_t, p, result, dim, keepdim) - name: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor self: norm_backward(grad, self.to(grad.scalar_type()), p, result) + result: norm_jvp(self_p, self_t, p, result) - name: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor self: norm_backward(grad, self.to(grad.scalar_type()), p, result, dim, keepdim) + result: norm_jvp(self_p, self_t, p, result, dim, keepdim) - name: linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor self: linalg_vector_norm_backward(grad, self, ord, result, dim, keepdim) + result: linalg_vector_norm_jvp(self_p, self_t, ord, result, dim, keepdim) - name: _pdist_forward(Tensor self, float p=2) -> Tensor self: _pdist_backward(grad, self, p, result) @@ -1216,11 +1254,11 @@ self: "accumulate ? 
grad : grad.put(index, zeros_like(source), false)" index: non_differentiable source: grad.take(index).reshape_as(source) + result: auto_linear # It is affine, but sure -- name: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R) - self: linalg_qr_backward(grads, self, mode, Q, R) - Q: linalg_qr_jvp_Q(self_t, Q, R) - R: linalg_qr_jvp_R(self_t, Q, R) +- name: linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) + A: linalg_qr_backward(grad_Q, grad_R, Q, R, mode) + Q, R: linalg_qr_jvp(A_t, Q, R, mode) - name: rad2deg(Tensor self) -> Tensor self: rad2deg_backward(grad) @@ -1266,6 +1304,15 @@ self: grad * std::sqrt(2 * M_PI) * (result.square() / 2).exp() result: auto_element_wise +- name: special_log_ndtr(Tensor self) -> Tensor + self: grad / std::sqrt(2 * M_PI) * (result + self.pow(2) / 2).neg().exp() + result: auto_element_wise + +# [Note: Sometimes view derivatives] +# The following situation applies to other operations as well. +# TODO: This note is only referenced once by to_dense. Make this +# more generic if it's been referenced more than once. +# # DO NOT define a backward for reshape! # reshape is special in that it sometimes returns a view, and sometimes not. # Defining a backward will make codegen spit out the forward call as @@ -1373,10 +1420,6 @@ self: slogdet_backward(grad, self, sign, logabsdet) output_differentiability: [false, true] -- name: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - self: solve_backward_self(grad, self, A) - A: solve_backward_A(grad, self, A, solution) - - name: linalg_solve(Tensor input, Tensor other) -> Tensor input: solve_backward_A(grad, other, input, result) other: solve_backward_self(grad, other, input) @@ -1447,9 +1490,11 @@ - name: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor self: handle_r_to_c(self.scalar_type(), maybe_multiply(-grad, alpha.conj())) other: handle_r_to_c(other.scalar_type(), grad) + result: -maybe_multiply(self_t, alpha) + other_t - name: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor self: handle_r_to_c(self.scalar_type(), maybe_multiply(-grad, alpha.conj())) + result: auto_element_wise - name: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor self: grad.expand(self.sizes()) @@ -1459,10 +1504,7 @@ self: sum_backward(grad, self.sizes(), dim, keepdim) result: auto_linear -- name: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor - self: grad.expand(self.sizes()).to(self.scalar_type()) * self.isnan().logical_not() - -- name: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor +- name: nansum(Tensor self, int[1] dim=[], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor self: nansum_backward(grad.to(self.scalar_type()), self, dim, keepdim) # We never call _linalg_svd with compute_uv=False in an autograd context, so we don't even consider it here @@ -1473,22 +1515,18 @@ full_matrices ? U.narrow(-1, 0, S.size(-1)) : U, S, full_matrices ? 
Vh.narrow(-2, 0, S.size(-1)) : Vh)" - U: std::get<0>(linalg_svd_jvp(A_t, U, S, Vh, full_matrices)) - S: std::get<1>(linalg_svd_jvp(A_t, U, S, Vh, full_matrices)) - Vh: std::get<2>(linalg_svd_jvp(A_t, U, S, Vh, full_matrices)) + U, S, Vh: linalg_svd_jvp(A_t, U, S, Vh, full_matrices) - name: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors) self: linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors_return, /*is_hermitian=*/true, /*symeig_eigenvector=*/eigenvectors) - name: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) self: linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors, /*is_hermitian=*/true) - eigenvalues: std::get<0>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/true)) - eigenvectors: std::get<1>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/true)) + eigenvalues, eigenvectors: linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/true) - name: linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors) self: handle_r_to_c(self.scalar_type(), linalg_eig_backward(grads[0], grads[1], eigenvalues, eigenvectors, /*is_hermitian=*/false)) - eigenvalues: std::get<0>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/false)) - eigenvectors: std::get<1>(linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/false)) + eigenvalues, eigenvectors: linalg_eig_jvp(self_t, eigenvalues, eigenvectors, /*is_hermitian=*/false) - name: t(Tensor(a) self) -> Tensor(a) self: grad.t() @@ -1564,7 +1602,11 @@ self: zeros_like(grad) result: auto_element_wise -- name: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor +# DO NOT define a backward for to_dense +# See [Note: Sometimes view derivatives] +# - name: to_dense(Tensor self, ScalarType? dtype=None) -> Tensor +# +- name: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor self: to_dense_backward(grad, self) - name: to_sparse(Tensor self) -> Tensor @@ -1612,6 +1654,9 @@ self: grad.reshape(self.sizes()) result: auto_linear +- name: lift(Tensor self) -> Tensor + self: not_implemented("lift") + - name: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) self: grad.squeeze(dim) result: auto_linear @@ -1642,7 +1687,7 @@ self: at::view_as_real(grad.contiguous().resolve_conj()) # [gx, gy] result: at::view_as_complex(self_t) -- name: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor +- name: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor condition: non_differentiable self: where(condition, grad, zeros_like(grad)) other: where(condition, zeros_like(grad), grad) @@ -1651,8 +1696,8 @@ # weight_norm_cuda_interface_backward does not have an explicitly defined derivative, so if we do happen # to be running backward with create_graph=True, fall back to a backward function that uses # differentiable ops. -- name: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) - v, g: "grad.defined() ? (GradMode::is_enabled() ? _weight_norm_differentiable_backward(grad.contiguous(), v, g, result1, dim) : _weight_norm_cuda_interface_backward(grad.contiguous(), v, g, result1, dim)) : std::tuple()" +- name: _weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) + v, g: "grad.defined() ? (GradMode::is_enabled() ? 
_weight_norm_differentiable_backward(grad.contiguous(), v, g, result1, dim) : _weight_norm_interface_backward(grad.contiguous(), v, g, result1, dim)) : std::tuple()" - name: zero_(Tensor(a!) self) -> Tensor(a!) self: zeros_like(grad) @@ -1685,6 +1730,9 @@ # NN - name: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor i1, i2, i3: _trilinear_backward(grad, i1, i2, i3, expand1, expand2, expand3, sumdim, grad_input_mask) + result: "_trilinear(i1_t, i2_p, i3_p, expand1, expand2, expand3, sumdim, unroll_dim) + + _trilinear(i1_p, i2_t, i3_p, expand1, expand2, expand3, sumdim, unroll_dim) + + _trilinear(i1_p, i2_p, i3_t, expand1, expand2, expand3, sumdim, unroll_dim)" - name: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor self: constant_pad_nd_backward(grad, pad) @@ -1696,7 +1744,7 @@ - name: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor self: binary_cross_entropy_double_backward(grad_output, grad, self, target, weight, reduction) - target: not_implemented("binary_cross_entropy_backward wrt `target`") + target: binary_cross_entropy_double_backward_target(grad, grad_output, self, target, weight, reduction) grad_output: binary_cross_entropy_double_backward_grad_output(grad, self, target, weight, reduction) - name: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor @@ -1707,6 +1755,7 @@ - name: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor indices: non_differentiable weight: embedding_backward(grad, indices, weight.size(0), padding_idx, scale_grad_by_freq, sparse) + result: auto_linear - name: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor grad_output: embedding_dense_double_backward(grad, indices, padding_idx) @@ -1754,10 +1803,12 @@ - name: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) self: nll_loss_backward(grad, self, target, weight, reduction, ignore_index, total_weight) target: non_differentiable + output: std::get<0>(nll_loss_forward(self_t, target, weight, reduction, ignore_index)) - name: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight) self: nll_loss2d_backward(grad, self, target, weight, reduction, ignore_index, total_weight) target: non_differentiable + output: std::get<0>(nll_loss2d_forward(self_t, target, weight, reduction, ignore_index)) - name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor self: smooth_l1_loss_backward(grad, self, target, reduction, beta) @@ -1799,12 +1850,20 @@ - name: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) self: elu_backward(grad, alpha, 1, 1.0/alpha.toFloat(), /* is_result */ true, result) -- name: gelu(Tensor self) -> Tensor - self: "GradMode::is_enabled() ? 
infinitely_differentiable_gelu_backward(grad, self) : gelu_backward(grad, self)" +- name: gelu(Tensor self, *, str approximate='none') -> Tensor + self: gelu_backward(grad, self, approximate) result: auto_element_wise +- name: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor + grad_output: gelu_backward(grad, self, approximate) + self: gelu_double_backward(grad, grad_output, self, approximate) + result: gelu_backward(grad_output_t, self_p, approximate) + gelu_double_backward(self_t, grad_output_p, self_p, approximate) + - name: glu(Tensor self, int dim=-1) -> Tensor + # TODO: glu_backward can benefit from forward result, + # and forward ad/forward over reverse ad for that matter self: glu_backward(grad, self, dim) + result: glu_jvp(result, self_p, self_t, dim) - name: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor self: hardshrink_backward(grad, self, lambd) @@ -1829,27 +1888,37 @@ - name: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) self: log_sigmoid_backward(grad, self, buffer) + output: auto_element_wise - name: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _log_softmax_backward_data(grad, result, dim, self.scalar_type()) + result: self_t - logsumexp_jvp(self_p, self_t, {dim}, true) - name: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _sparse_log_softmax_backward_data(grad, result, dim, self) +- name: _masked_softmax(Tensor self, Tensor mask, int? dim=None) -> Tensor + self: _masked_softmax_backward(grad, result, mask, dim) + mask: non_differentiable + - name: prelu(Tensor self, Tensor weight) -> Tensor self, weight: "grad.defined() ? prelu_backward(grad, self, weight) : std::tuple()" + result: prelu_jvp(self_p, self_t, weight_p, weight_t) - name: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) grad_output, self, weight: prelu_double_backward(grads[0], grads[1], grad_output, self, weight) - name: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor self: rrelu_with_noise_backward(grad, self, noise, lower, upper, training, false) + result: auto_element_wise + - name: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) self: rrelu_with_noise_backward(grad, result, noise, lower, upper, training, true) - name: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _softmax_backward_data(grad, result, dim, self.scalar_type()) + result: result * (self_t - logsumexp_jvp(self_p, self_t, {dim}, true)) - name: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor self: _sparse_softmax_backward_data(grad, result, dim, self) @@ -1898,43 +1967,52 @@ self: replication_pad3d_backward(grad, self, padding) result: auto_linear - # NOTE: Not implementing forward AD formulas for non-vec upsample overloads because they are - # only kept for backward compatability - name: upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor self: upsample_linear1d_backward(grad, output_size, self.sizes(), align_corners, scales) + result: auto_linear - name: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor self: upsample_bilinear2d_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bilinear2d_aa(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_bilinear2d_aa_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) + result: auto_linear - name: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_bicubic2d_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bicubic2d_aa(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_bicubic2d_aa_backward(grad, output_size, self.sizes(), align_corners, scales_h, scales_w) - name: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_trilinear3d_backward(grad, output_size, self.sizes(), align_corners, scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor self: upsample_nearest1d_backward(grad, output_size, self.sizes(), scales) + result: auto_linear - name: _upsample_nearest_exact1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor self: _upsample_nearest_exact1d_backward(grad, output_size, self.sizes(), scales) + result: auto_linear - name: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_nearest2d_backward(grad, output_size, self.sizes(), scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_nearest_exact2d_backward(grad, output_size, self.sizes(), scales_h, scales_w) + result: auto_linear - name: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor self: upsample_nearest3d_backward(grad, output_size, self.sizes(), scales_d, scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor self: _upsample_nearest_exact3d_backward(grad, output_size, self.sizes(), scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor input: upsample_linear1d_backward(grad, output_size, input.sizes(), align_corners, scale_factors) @@ -1983,6 +2061,14 @@ input: _upsample_nearest_exact3d_backward(grad, output_size, input.sizes(), scale_factors) result: auto_linear +- name: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor + self: pixel_unshuffle(grad, upscale_factor) + result: auto_linear + +- name: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor + self: pixel_shuffle(grad, downscale_factor) + result: auto_linear + - name: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor self: _adaptive_avg_pool2d_backward(grad, self) result: auto_linear @@ -2019,6 +2105,19 @@ result0: gather(self_t.flatten(-3), -1, result1.flatten(-3)).view_as(result1) output_differentiability: [True, False] +#mps +- name: _mps_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor + self: mps_max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode) + +- name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor + self, weight, bias: "grad.defined() ? mps_convolution_backward(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple()" + +- name: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) + +- name: _mps_linear(Tensor self, Tensor weight, Tensor? 
bias=None) -> Tensor + self, weight, bias: mps_linear_backward(self, grad, weight, grad_input_mask) + - name: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) self: max_pool2d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, result1) result0: gather(self_t.flatten(-2), -1, result1.flatten(-2)).view_as(result1) @@ -2030,12 +2129,12 @@ output_differentiability: [True, False] - name: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor - self: max_unpool2d_backward(grad, self, indices, output_size) + self: max_pool_double_backward(grad, indices, 2) indices: non_differentiable result: auto_linear - name: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - self: max_unpool3d_backward(grad, self, indices, output_size, stride, padding) + self: max_pool_double_backward(grad, indices, 3) indices: non_differentiable result: auto_linear @@ -2139,6 +2238,7 @@ - name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor grad_output: elu_backward(grad, alpha, scale, input_scale, is_result, self_or_result) self_or_result: elu_double_backward(grad, grad_output, alpha, scale, input_scale, is_result, self_or_result) + result: elu_backward(grad_output_t, alpha, scale, input_scale, is_result, self_or_result_p) + elu_double_backward(self_or_result_t, grad_output_p, alpha, scale, input_scale, is_result, self_or_result_p) - name: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor grad_output: max_pool_double_backward(grad, indices, 2) @@ -2153,6 +2253,7 @@ - name: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor grad_output: glu_double_backward_grad_output(grad, self, dim) self: glu_double_backward(grad, grad_output, self, dim) + result: glu_backward_jvp(result, grad_output_p, self_p, grad_output_t, self_t, dim) - name: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor grad_output: hardtanh_backward(grad, self, min_val, max_val) @@ -2181,6 +2282,24 @@ # self_is_result is always false here since double backward call is an out-of-place call, self is input itself grad_output: leaky_relu_backward(grad, self, negative_slope, false) self: zeros_like(grad) + # leaky_relu_backward(grad_output, self, negative_slope, false) + # computes grad_output * at::where(self_p > 0, 1, negative_slope) + # so the jvp formula is the following: + # grad_output_t * at::where(self_p > 0, self_p.new_ones([]), negative_slope); + # + # leaky_relu_backward(grad_output, result, negative_slope, true) + # computes grad_output * at::where(result > 0, 1, negative_slope) + # under the assumption that `negative_slope` is positive (otherwise, + # it is not possible to compute the gradient). + # + # so the jvp formula is the following: + # grad_output_t * at::where(result_p > 0, result_p.new_ones([]), negative_slope); + # with the assumption that negative_slope is positive. 
+ # + # Combined together that results in the following optimized kernel which + # also checks the assumption that negative_slope is positive when self_is_result + # is True: + result: leaky_relu_backward(grad_output_t, self_p, negative_slope, self_is_result) - name: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor grad_output: max_pool_double_backward(grad, indices, 2) @@ -2194,11 +2313,6 @@ indices: non_differentiable result: auto_linear -- name: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - grad_output: max_unpool2d(grad, indices, output_size) - self: zeros_like(self) - indices: non_differentiable - - name: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor grad_output: mse_loss_double_backward_grad_output(grad, grad_output, self, target, reduction) self: mse_loss_double_backward(grad * grad_output, self, reduction) @@ -2249,6 +2363,11 @@ self: zeros_like(self) result: replication_pad3d_backward(grad_output_t, self_p, padding) +- name: sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + self: maybe_multiply(grad, beta.conj()) + mat1: maybe_multiply(grad.sparse_mask(self).mm(mat2.mH()), alpha.conj()) + mat2: maybe_multiply(mat1.mH().mm(grad.sparse_mask(self)), alpha.conj()) + - name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, beta) self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta) @@ -2281,43 +2400,52 @@ self: zeros_like(grad) result: zeros_like(self_t) + threshold_backward(grad_output_t, self_p, threshold) - # NOTE: Not implementing forward AD formulas for backwards of non-vec upsample overloads - # because they are only kept for backward compatability - name: upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor grad_output: upsample_linear1d(grad, output_size, align_corners, scales) + result: auto_linear - name: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_bilinear2d(grad, output_size, align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bilinear2d_aa_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_bilinear2d_aa(grad, output_size, align_corners, scales_h, scales_w) + result: auto_linear - name: upsample_bicubic2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_bicubic2d(grad, output_size, align_corners, scales_h, scales_w) + result: auto_linear - name: _upsample_bicubic2d_aa_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_bicubic2d_aa(grad, output_size, align_corners, scales_h, scales_w) - name: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? 
scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_trilinear3d(grad, output_size, align_corners, scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_nearest1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor grad_output: upsample_nearest1d(grad, output_size, scales) + result: auto_linear - name: _upsample_nearest_exact1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor grad_output: _upsample_nearest_exact1d(grad, output_size, scales) + result: auto_linear - name: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_nearest2d(grad, output_size, scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_nearest_exact2d(grad, output_size, scales_h, scales_w) + result: auto_linear - name: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: upsample_nearest3d(grad, output_size, scales_d, scales_h, scales_w) + result: auto_linear - name: _upsample_nearest_exact3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor grad_output: _upsample_nearest_exact3d(grad, output_size, scales_d, scales_h, scales_w) + result: auto_linear - name: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor grad_output: upsample_linear1d(grad, output_size, align_corners, scale_factors) @@ -2383,6 +2511,9 @@ - name: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, output_padding, stride, dilation, true, groups, {grad_input_mask[0], grad_input_mask[1]})" +- name: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor + self, weight: "grad.defined() ? mps_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, grad_input_mask) : std::tuple()" + - name: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, std::vector(padding.size(), 0), stride, dilation, false, groups, {grad_input_mask[0], grad_input_mask[1]})" @@ -2418,6 +2549,15 @@ # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here. input, weight, bias: "grad.defined() ? 
convolution_backward(grad, input, weight, bias->sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" +#LSTM MPS +- name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + output_differentiability: [True, True, True, False, False] + input, hx, params: "lstm_mps_backward(grads[0], grads[1], grads[2], result3, result4, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first)" + +- name: lstm_mps_backward(Tensor grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[]) + + + # Only frst three of _cudnn_rnn outputs can have gradients. # _cudnn_rnn outputs: (output, hy, cy, reserve, weight_buf) - name: _cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) @@ -2485,12 +2625,15 @@ # fft - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor self: fft_r2c_backward(grad, dim, normalization, onesided, self.size(dim.back())) + result: auto_linear - name: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor self: fft_c2r_backward(grad, dim, normalization) + result: auto_linear - name: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor self: _fft_c2c(grad, dim, normalization, !forward) + result: auto_linear - name: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[] self: unbind_backward(grads, dim) @@ -2590,6 +2733,6 @@ - name: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor output_differentiability: [False] -- name: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? 
output_size=None) -> Tensor - self: scatter_reduce_backward(grad, self, dim, index, reduce, result) +- name: scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor + self, src: scatter_reduce_backward(grad, self, dim, index, src, reduce, include_self, result) index: non_differentiable diff --git a/tools/autograd/gen_annotated_fn_args.py b/tools/autograd/gen_annotated_fn_args.py index 2d1dbd5c71a5..89269e8e0e0f 100644 --- a/tools/autograd/gen_annotated_fn_args.py +++ b/tools/autograd/gen_annotated_fn_args.py @@ -5,6 +5,7 @@ python -m tools.autograd.gen_annotated_fn_args \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ $OUTPUT_DIR \ tools/autograd @@ -20,24 +21,35 @@ from typing import Dict, List, Any -from tools.codegen.gen import parse_native_yaml -from tools.codegen.utils import FileManager -from tools.codegen.context import with_native_function -from tools.codegen.model import BaseOperatorName, NativeFunction -import tools.codegen.api.python as python -from .gen_python_functions import should_generate_py_binding, is_py_torch_function, \ - is_py_nn_function, is_py_linalg_function, is_py_variable_method, is_py_special_function, \ - is_py_fft_function +from torchgen.gen import parse_native_yaml +from torchgen.utils import FileManager +from torchgen.context import with_native_function +from torchgen.model import BaseOperatorName, NativeFunction +import torchgen.api.python as python +from .gen_python_functions import ( + should_generate_py_binding, + is_py_torch_function, + is_py_nn_function, + is_py_linalg_function, + is_py_variable_method, + is_py_special_function, + is_py_fft_function, +) -def gen_annotated(native_yaml_path: str, out: str, autograd_dir: str) -> None: - native_functions = parse_native_yaml(native_yaml_path).native_functions + +def gen_annotated( + native_yaml_path: str, tags_yaml_path: str, out: str, autograd_dir: str +) -> None: + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions mappings = ( - (is_py_torch_function, 'torch._C._VariableFunctions'), - (is_py_nn_function, 'torch._C._nn'), - (is_py_linalg_function, 'torch._C._linalg'), - (is_py_special_function, 'torch._C._special'), - (is_py_fft_function, 'torch._C._fft'), - (is_py_variable_method, 'torch.Tensor'), + (is_py_torch_function, "torch._C._VariableFunctions"), + (is_py_nn_function, "torch._C._nn"), + (is_py_linalg_function, "torch._C._linalg"), + (is_py_special_function, "torch._C._special"), + (is_py_fft_function, "torch._C._fft"), + (is_py_variable_method, "torch.Tensor"), ) annotated_args: List[str] = [] for pred, namespace in mappings: @@ -48,13 +60,18 @@ def gen_annotated(native_yaml_path: str, out: str, autograd_dir: str) -> None: groups[f.func.name.name].append(f) for group in groups.values(): for f in group: - annotated_args.append(f'{namespace}.{gen_annotated_args(f)}') + annotated_args.append(f"{namespace}.{gen_annotated_args(f)}") - template_path = os.path.join(autograd_dir, 'templates') + template_path = os.path.join(autograd_dir, "templates") fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - fm.write_with_template('annotated_fn_args.py', 'annotated_fn_args.py.in', lambda: { - 'annotated_args': textwrap.indent('\n'.join(annotated_args), ' '), - }) + fm.write_with_template( + "annotated_fn_args.py", + "annotated_fn_args.py.in", + lambda: { + "annotated_args": textwrap.indent("\n".join(annotated_args), " "), + }, + ) + @with_native_function 
def gen_annotated_args(f: NativeFunction) -> str: @@ -63,26 +80,29 @@ def gen_annotated_args(f: NativeFunction) -> str: if arg.default is not None: continue out_arg: Dict[str, Any] = {} - out_arg['name'] = arg.name - out_arg['simple_type'] = python.argument_type_str(arg.type, simple_type=True) + out_arg["name"] = arg.name + out_arg["simple_type"] = python.argument_type_str(arg.type, simple_type=True) size = python.argument_type_size(arg.type) if size: - out_arg['size'] = size + out_arg["size"] = size out_args.append(out_arg) - return f'{f.func.name.name}: {repr(out_args)},' + return f"{f.func.name.name}: {repr(out_args)}," + def main() -> None: - parser = argparse.ArgumentParser( - description='Generate annotated_fn_args script') - parser.add_argument('native_functions', metavar='NATIVE', - help='path to native_functions.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('autograd', metavar='AUTOGRAD', - help='path to template directory') + parser = argparse.ArgumentParser(description="Generate annotated_fn_args script") + parser.add_argument( + "native_functions", metavar="NATIVE", help="path to native_functions.yaml" + ) + parser.add_argument("tags", metavar="TAGS", help="path to tags.yaml") + parser.add_argument("out", metavar="OUT", help="path to output directory") + parser.add_argument( + "autograd", metavar="AUTOGRAD", help="path to template directory" + ) args = parser.parse_args() - gen_annotated(args.native_functions, args.out, args.autograd) + gen_annotated(args.native_functions, args.tags, args.out, args.autograd) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 26ab682c5d05..25a04fb14acc 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -5,6 +5,7 @@ python -m tools.autograd.gen_autograd \ build/aten/src/ATen/Declarations.yaml \ aten/src/ATen/native/native_functions.yaml \ + aten/src/ATen/native/tags.yaml \ $OUTPUT_DIR \ tools/autograd @@ -24,23 +25,29 @@ import argparse import os -from tools.codegen.api import cpp -from tools.codegen.api.autograd import ( - match_differentiability_info, NativeFunctionWithDifferentiabilityInfo, +from torchgen.api import cpp +from torchgen.api.autograd import ( + match_differentiability_info, + NativeFunctionWithDifferentiabilityInfo, ) -from tools.codegen.gen import parse_native_yaml -from tools.codegen.selective_build.selector import SelectiveBuilder +from torchgen.gen import parse_native_yaml +from torchgen.selective_build.selector import SelectiveBuilder from typing import List from . 
import gen_python_functions -from .gen_autograd_functions import gen_autograd_functions_lib, gen_autograd_functions_python +from .gen_autograd_functions import ( + gen_autograd_functions_lib, + gen_autograd_functions_python, +) from .gen_trace_type import gen_trace_type from .gen_variable_type import gen_variable_type from .gen_inplace_or_view_type import gen_inplace_or_view_type from .gen_variable_factories import gen_variable_factories from .load_derivatives import load_derivatives + def gen_autograd( native_functions_path: str, + tags_path: str, out: str, autograd_dir: str, operator_selector: SelectiveBuilder, @@ -48,66 +55,84 @@ def gen_autograd( ) -> None: # Parse and load derivatives.yaml differentiability_infos = load_derivatives( - os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path) - - template_path = os.path.join(autograd_dir, 'templates') - - native_funcs = parse_native_yaml(native_functions_path).native_functions - fns = list(sorted(filter( - operator_selector.is_native_function_selected_for_training, - native_funcs), key=lambda f: cpp.name(f.func))) - fns_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo] = match_differentiability_info(fns, differentiability_infos) + os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path + ) + + template_path = os.path.join(autograd_dir, "templates") + + native_funcs = parse_native_yaml(native_functions_path, tags_path).native_functions + fns = list( + sorted( + filter( + operator_selector.is_native_function_selected_for_training, native_funcs + ), + key=lambda f: cpp.name(f.func), + ) + ) + fns_with_diff_infos: List[ + NativeFunctionWithDifferentiabilityInfo + ] = match_differentiability_info(fns, differentiability_infos) # Generate VariableType.h/cpp if not disable_autograd: - gen_variable_type(out, native_functions_path, fns_with_diff_infos, template_path) + gen_variable_type( + out, native_functions_path, tags_path, fns_with_diff_infos, template_path + ) - gen_inplace_or_view_type(out, native_functions_path, fns_with_diff_infos, template_path) + gen_inplace_or_view_type( + out, native_functions_path, tags_path, fns_with_diff_infos, template_path + ) # operator filter not applied as tracing sources are excluded in selective build gen_trace_type(out, native_funcs, template_path) # Generate Functions.h/cpp - gen_autograd_functions_lib( - out, differentiability_infos, template_path) + gen_autograd_functions_lib(out, differentiability_infos, template_path) # Generate variable_factories.h - gen_variable_factories(out, native_functions_path, template_path) + gen_variable_factories(out, native_functions_path, tags_path, template_path) def gen_autograd_python( native_functions_path: str, + tags_path: str, out: str, autograd_dir: str, ) -> None: differentiability_infos = load_derivatives( - os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path) + os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path + ) - template_path = os.path.join(autograd_dir, 'templates') + template_path = os.path.join(autograd_dir, "templates") # Generate Functions.h/cpp - gen_autograd_functions_python( - out, differentiability_infos, template_path) + gen_autograd_functions_python(out, differentiability_infos, template_path) # Generate Python bindings - deprecated_path = os.path.join(autograd_dir, 'deprecated.yaml') + deprecated_path = os.path.join(autograd_dir, "deprecated.yaml") gen_python_functions.gen( - out, native_functions_path, deprecated_path, template_path) + out, 
native_functions_path, tags_path, deprecated_path, template_path + ) def main() -> None: - parser = argparse.ArgumentParser( - description='Generate autograd C++ files script') - parser.add_argument('native_functions', metavar='NATIVE', - help='path to native_functions.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('autograd', metavar='AUTOGRAD', - help='path to autograd directory') + parser = argparse.ArgumentParser(description="Generate autograd C++ files script") + parser.add_argument( + "native_functions", metavar="NATIVE", help="path to native_functions.yaml" + ) + parser.add_argument("tags", metavar="NATIVE", help="path to tags.yaml") + parser.add_argument("out", metavar="OUT", help="path to output directory") + parser.add_argument( + "autograd", metavar="AUTOGRAD", help="path to autograd directory" + ) args = parser.parse_args() - gen_autograd(args.native_functions, - args.out, args.autograd, - SelectiveBuilder.get_nop_selector()) + gen_autograd( + args.native_functions, + args.tags, + args.out, + args.autograd, + SelectiveBuilder.get_nop_selector(), + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py index be7c7212db8d..3e1e55b82b2f 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -8,17 +8,36 @@ from typing import List, Sequence, Tuple -from tools.codegen.api.autograd import (Derivative, DifferentiabilityInfo, - SavedAttribute, uses_retain_variables, - uses_single_grad) -from tools.codegen.api.types import (Binding, BaseCType, OptionalCType, tensorT, longT, - doubleT, scalarT, stringT, boolT, intArrayRefT, - tensorListT, MutRefCType, ListCType, ArrayRefCType) -from tools.codegen.code_template import CodeTemplate -from tools.codegen.utils import FileManager -from tools.codegen.model import Argument - -FUNCTION_DECLARATION = CodeTemplate("""\ +from torchgen.api.autograd import ( + Derivative, + DifferentiabilityInfo, + SavedAttribute, + uses_retain_variables, + uses_single_grad, +) +from torchgen.api.types import ( + Binding, + BaseCType, + OptionalCType, + tensorT, + longT, + doubleT, + scalarT, + stringT, + boolT, + intArrayRefT, + tensorListT, + MutRefCType, + ListCType, + ArrayRefCType, + optionalIntArrayRefT, +) +from torchgen.code_template import CodeTemplate +from torchgen.utils import FileManager +from torchgen.model import Argument + +FUNCTION_DECLARATION = CodeTemplate( + """\ struct TORCH_API ${op} : public ${superclass} { using ${superclass}::${superclass}; variable_list apply(variable_list&& grads) override; @@ -31,16 +50,20 @@ ${saved_variables} ${saved_list_sizes} }; -""") +""" +) -WILL_RELEASE_VARIABLES = CodeTemplate("""\ +WILL_RELEASE_VARIABLES = CodeTemplate( + """\ bool retain_variables = true; void will_release_variables() override { retain_variables = false; } -""") +""" +) -FUNCTION_DEFINITION = CodeTemplate("""\ +FUNCTION_DEFINITION = CodeTemplate( + """\ variable_list ${op}::apply(variable_list&& grads) { ${thread_lock} ${asserts} @@ -50,34 +73,43 @@ ${body} return grad_inputs; } -""") +""" +) -GRAD_INPUT_MASK = CodeTemplate("""\ +GRAD_INPUT_MASK = CodeTemplate( + """\ auto grad_input_mask = std::array{ ${masks} };\ -""") +""" +) -DERIVATIVE_SINGLE = CodeTemplate("""\ +DERIVATIVE_SINGLE = CodeTemplate( + """\ if (should_compute_output({ ${name}_ix })) { auto grad_result = ${derivative}; copy_range(grad_inputs, ${name}_ix, grad_result); 
} -""") +""" +) -DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate("""\ +DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate( + """\ if (should_compute_output({ ${name}_ix })) { copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result)); } -""") +""" +) -DERIVATIVE_MULTI = CodeTemplate("""\ +DERIVATIVE_MULTI = CodeTemplate( + """\ if (should_compute_output({ ${idx_ranges} })) { ${grad_input_mask} auto grad_result = ${derivative}; ${copy_ranges} } -""") +""" +) # Generates python bindings # @@ -88,12 +120,15 @@ # Each PyGetSetDef has a function ptr to a getter, also defined here (3). # (3) Getters for each of grad_fn's saved inputs and outputs. # -PY_FUNCTION_DEFINITION = CodeTemplate("""\ +PY_FUNCTION_DEFINITION = CodeTemplate( + """\ static PyTypeObject ${op}Class; addClass<${op}>(${op}Class, "${op}", ${op}_properties); -""") +""" +) -PY_FUNCTION_PROPS_AND_GETTERS = CodeTemplate("""\ +PY_FUNCTION_PROPS_AND_GETTERS = CodeTemplate( + """\ ${all_getter_definitions} static struct PyGetSetDef ${op}_properties[] = { @@ -102,43 +137,55 @@ {nullptr} /* sentinel */ }; -""") +""" +) -PY_GETSETDEF_STRUCT = CodeTemplate("""\ -{(char*)"_saved_${name}", (getter)THP${op}_${name}_getter, nullptr, nullptr, nullptr}""") +PY_GETSETDEF_STRUCT = CodeTemplate( + """\ +{(char*)"_saved_${name}", (getter)THP${op}_${name}_getter, nullptr, nullptr, nullptr}""" +) -PY_RAW_GETSETDEF_STRUCT = CodeTemplate("""\ -{(char*)"_raw_saved_${name}", (getter)THP${op}_${name}_raw_getter, nullptr, nullptr, nullptr}""") +PY_RAW_GETSETDEF_STRUCT = CodeTemplate( + """\ +{(char*)"_raw_saved_${name}", (getter)THP${op}_${name}_raw_getter, nullptr, nullptr, nullptr}""" +) # Getter templates -GETTER_DEFINITION = CodeTemplate("""\ +GETTER_DEFINITION = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS auto prop = static_cast<${op}*>(self->cdata.get())->${name}; ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_; ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_RAW_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_RAW_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_; ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_VEC_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_VEC_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto *node = static_cast<${op}*>(self->cdata.get()); @@ -150,9 +197,11 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_RAW_VEC_SAVEDVAR = CodeTemplate("""\ +GETTER_DEFINITION_RAW_VEC_SAVEDVAR = CodeTemplate( + """\ PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS const auto *node = static_cast<${op}*>(self->cdata.get()); @@ -164,9 +213,11 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_OPT = CodeTemplate("""\ +GETTER_DEFINITION_OPT = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name}; @@ -177,9 +228,11 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) -GETTER_DEFINITION_OPT_ARRAYREF 
= CodeTemplate("""\ +GETTER_DEFINITION_OPT_ARRAYREF = CodeTemplate( + """\ PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) { HANDLE_TH_ERRORS auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name}; @@ -190,7 +243,8 @@ ${body} END_HANDLE_TH_ERRORS } -""") +""" +) # Getter body GETTER_BODY_SAVEDVAR = """\ @@ -204,7 +258,7 @@ GETTER_BODY_VEC_SAVEDVAR = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i: c10::irange(prop.size())) { PyTuple_SetItem(tup, (Py_ssize_t) i, THPVariable_Wrap(prop[i].unpack(self->cdata))); } return tup; @@ -212,7 +266,7 @@ GETTER_BODY_RAW_VEC_SAVEDVAR = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i : c10::irange(prop.size())) { pybind11::object obj = pybind11::cast(prop[i], pybind11::return_value_policy::reference); PyTuple_SetItem(tup, (Py_ssize_t) i, obj.release().ptr()); } @@ -221,7 +275,7 @@ GETTER_BODY_ARRAYREF_LONG = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i : c10::irange(prop.size())) { PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromUnsignedLong((uint64_t) prop[i])); } return tup; @@ -229,7 +283,7 @@ GETTER_BODY_ARRAYREF_DOUBLE = """\ PyObject* tup = PyTuple_New((Py_ssize_t) prop.size()); -for (int i = 0; i < prop.size(); i++) { +for (auto i : c10::irange(prop.size())) { PyTuple_SetItem(tup, (Py_ssize_t) i, PyFloat_FromDouble((double) prop[i])); } return tup; @@ -292,6 +346,7 @@ # TODO: This is probably not exhaustive, but it's a start UNTRACEABLE_FUNCTIONS = VIEW_FUNCTIONS + def gen_autograd_functions_lib( out: str, differentiability_infos: Sequence[DifferentiabilityInfo], @@ -304,19 +359,26 @@ def gen_autograd_functions_lib( """ # only create an autograd function if we are actually going to calculate a derivative - infos = list(filter(lambda info: info.args_with_derivatives, differentiability_infos)) + infos = list( + filter(lambda info: info.args_with_derivatives, differentiability_infos) + ) declarations = list(map(lambda f: process_function(f, FUNCTION_DECLARATION), infos)) definitions = list(map(lambda f: process_function(f, FUNCTION_DEFINITION), infos)) - file_basename = 'Functions' + file_basename = "Functions" fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - for suffix in ['.h', '.cpp']: + for suffix in [".h", ".cpp"]: fname = file_basename + suffix - fm.write_with_template(fname, fname, lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/' + fname, - 'autograd_function_declarations': declarations, - 'autograd_function_definitions': definitions, - }) + fm.write_with_template( + fname, + fname, + lambda: { + "generated_comment": "@" + f"generated from {fm.template_dir}/" + fname, + "autograd_function_declarations": declarations, + "autograd_function_definitions": definitions, + }, + ) + def gen_autograd_functions_python( out: str, @@ -326,34 +388,43 @@ def gen_autograd_functions_python( fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) num_shards = 5 - fm.write('python_functions.h', lambda: { - 'generated_comment': f'@generated from {fm.template_dir}/python_functions.h', - 'shard_forward_declare': [ - f"void initialize_autogenerated_functions_{i}();" - for i in range(num_shards) - ], - 'shard_call': [ - f"initialize_autogenerated_functions_{i}();" - for i in range(num_shards) - ] - }) - - infos = list(filter(lambda info: 
info.args_with_derivatives, differentiability_infos)) + fm.write( + "python_functions.h", + lambda: { + "generated_comment": f"@generated from {fm.template_dir}/python_functions.h", + "shard_forward_declare": [ + f"void initialize_autogenerated_functions_{i}();" + for i in range(num_shards) + ], + "shard_call": [ + f"initialize_autogenerated_functions_{i}();" for i in range(num_shards) + ], + }, + ) + + infos = list( + filter(lambda info: info.args_with_derivatives, differentiability_infos) + ) fm.write_sharded( - 'python_functions.cpp', + "python_functions.cpp", infos, key_fn=lambda info: info.name, base_env={ - 'generated_comment': f'@generated from {fm.template_dir}/python_functions.cpp', + "generated_comment": f"@generated from {fm.template_dir}/python_functions.cpp", }, env_callable=lambda info: { - 'py_function_initializers': [process_function(info, PY_FUNCTION_DEFINITION)], - 'py_function_props_and_getters': [process_function(info, PY_FUNCTION_PROPS_AND_GETTERS)], + "py_function_initializers": [ + process_function(info, PY_FUNCTION_DEFINITION) + ], + "py_function_props_and_getters": [ + process_function(info, PY_FUNCTION_PROPS_AND_GETTERS) + ], }, num_shards=num_shards, - sharded_keys={'py_function_initializers', 'py_function_props_and_getters'} + sharded_keys={"py_function_initializers", "py_function_props_and_getters"}, ) + def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str: saved_variables: List[str] = [] release_variables: List[str] = [] @@ -365,12 +436,15 @@ def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str py_getsetdef_structs: List[str] = [] for arg in info.args_with_derivatives: - if arg.type == 'at::TensorList' or arg.type == 'const c10::List> &': - size = f'{arg.name}_size_' - saved_list_sizes.append(f'size_t {arg.name}_size_;') + if ( + arg.type == "at::TensorList" + or arg.type == "const c10::List> &" + ): + size = f"{arg.name}_size_" + saved_list_sizes.append(f"size_t {arg.name}_size_;") else: - size = '1' - compute_index_ranges.append(f'auto {arg.name}_ix = gen.range({size});') + size = "1" + compute_index_ranges.append(f"auto {arg.name}_ix = gen.range({size});") def save_var(var: SavedAttribute, is_output: bool) -> None: name = var.nctype.name @@ -378,76 +452,124 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: should_append_getsetdef = True should_append_raw_getsetdef = False - if type == BaseCType(tensorT) or type == OptionalCType(BaseCType(tensorT)) or \ - type == MutRefCType(OptionalCType(BaseCType(tensorT))) or \ - (type == BaseCType(scalarT) and is_output): - saved_variables.append(f'SavedVariable {name}_;') - release_variables.append(f'{name}_.reset_data();') - ptr = 'shared_from_this()' if is_output else '' - unpack.append(f'auto {name} = {name}_.unpack({ptr});') - getter_definitions.append(GETTER_DEFINITION_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_SAVEDVAR)) - getter_definitions.append(GETTER_DEFINITION_RAW_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_RAW_SAVEDVAR)) + if ( + type == BaseCType(tensorT) + or type == OptionalCType(BaseCType(tensorT)) + or type == MutRefCType(OptionalCType(BaseCType(tensorT))) + or (type == BaseCType(scalarT) and is_output) + ): + saved_variables.append(f"SavedVariable {name}_;") + release_variables.append(f"{name}_.reset_data();") + ptr = "shared_from_this()" if is_output else "" + unpack.append(f"auto {name} = {name}_.unpack({ptr});") + getter_definitions.append( + GETTER_DEFINITION_SAVEDVAR.substitute( + 
op=info.op, name=name, body=GETTER_BODY_SAVEDVAR + ) + ) + getter_definitions.append( + GETTER_DEFINITION_RAW_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_RAW_SAVEDVAR + ) + ) should_append_raw_getsetdef = True elif type == BaseCType(tensorListT): - saved_variables.append(f'std::vector {name}_;') - saved_variables.append(f'bool {name}_released_ = false;') + saved_variables.append(f"std::vector {name}_;") + saved_variables.append(f"bool {name}_released_ = false;") # Just clear() is sufficient, we don't need to loop and clear each variable. # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well. - release_variables.append(f'{name}_.clear();') - release_variables.append(f'{name}_released_ = true;') - unpack.append(f'auto {name} = unpack_list({name}_);') - asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') - getter_definitions.append(GETTER_DEFINITION_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR)) - getter_definitions.append(GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR)) + release_variables.append(f"{name}_.clear();") + release_variables.append(f"{name}_released_ = true;") + unpack.append(f"auto {name} = unpack_list({name}_);") + asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);") + getter_definitions.append( + GETTER_DEFINITION_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR + ) + ) + getter_definitions.append( + GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR + ) + ) should_append_raw_getsetdef = True elif type == ListCType(OptionalCType(BaseCType(tensorT))): - saved_variables.append(f'std::vector {name}_;') - saved_variables.append(f'bool {name}_released_ = false;') + saved_variables.append(f"std::vector {name}_;") + saved_variables.append(f"bool {name}_released_ = false;") # Just clear() is sufficient, we don't need to loop and clear each variable. # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well. 
- release_variables.append(f'{name}_.clear();') - release_variables.append(f'{name}_released_ = true;') - unpack.append(f'auto {name} = unpack_opt_list({name}_);') - asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') - getter_definitions.append(GETTER_DEFINITION_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR)) - getter_definitions.append(GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( - op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR)) + release_variables.append(f"{name}_.clear();") + release_variables.append(f"{name}_released_ = true;") + unpack.append(f"auto {name} = unpack_opt_list({name}_);") + asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);") + getter_definitions.append( + GETTER_DEFINITION_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR + ) + ) + getter_definitions.append( + GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute( + op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR + ) + ) should_append_raw_getsetdef = True elif type == BaseCType(intArrayRefT): - saved_variables.append(f'std::vector {name};') - getter_definitions.append(GETTER_DEFINITION.substitute( - op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG)) + saved_variables.append(f"std::vector {name};") + getter_definitions.append( + GETTER_DEFINITION.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG + ) + ) + elif type == BaseCType(optionalIntArrayRefT): + saved_variables.append(f"c10::OptionalArray {name};") + getter_definitions.append( + GETTER_DEFINITION_OPT_ARRAYREF.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG + ) + ) elif type == OptionalCType(BaseCType(intArrayRefT)): - saved_variables.append(f'c10::OptionalArray {name};') - getter_definitions.append(GETTER_DEFINITION_OPT_ARRAYREF.substitute( - op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG)) + saved_variables.append(f"c10::OptionalArray {name};") + getter_definitions.append( + GETTER_DEFINITION_OPT_ARRAYREF.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG + ) + ) elif type == OptionalCType(ArrayRefCType(BaseCType(doubleT))): - saved_variables.append(f'c10::OptionalArray {name};') - getter_definitions.append(GETTER_DEFINITION_OPT_ARRAYREF.substitute( - op=info.op, name=name, body=GETTER_BODY_ARRAYREF_DOUBLE)) + saved_variables.append(f"c10::OptionalArray {name};") + getter_definitions.append( + GETTER_DEFINITION_OPT_ARRAYREF.substitute( + op=info.op, name=name, body=GETTER_BODY_ARRAYREF_DOUBLE + ) + ) elif type == BaseCType(longT): - saved_variables.append(f'{type.cpp_type()} {name} = 0;') - getter_definitions.append(GETTER_DEFINITION.substitute( - op=info.op, name=name, body=GETTER_BODY_INT64_T)) + saved_variables.append(f"{type.cpp_type()} {name} = 0;") + getter_definitions.append( + GETTER_DEFINITION.substitute( + op=info.op, name=name, body=GETTER_BODY_INT64_T + ) + ) elif type == BaseCType(stringT): - saved_variables.append(f'std::string {name};') - getter_definitions.append(GETTER_DEFINITION.substitute( - op=info.op, name=name, body=GETTER_BODY_STRING)) + saved_variables.append(f"std::string {name};") + getter_definitions.append( + GETTER_DEFINITION.substitute( + op=info.op, name=name, body=GETTER_BODY_STRING + ) + ) elif type == OptionalCType(BaseCType(stringT)): - saved_variables.append(f'c10::optional {name};') - getter_definitions.append(GETTER_DEFINITION_OPT.substitute( - op=info.op, name=name, body=GETTER_BODY_STRING)) + saved_variables.append(f"c10::optional {name};") + 
getter_definitions.append( + GETTER_DEFINITION_OPT.substitute( + op=info.op, name=name, body=GETTER_BODY_STRING + ) + ) else: - saved_variables.append(f'{type.cpp_type()} {name};') + saved_variables.append(f"{type.cpp_type()} {name};") if type in MISC_GETTER_DEFS: getter_def, body = MISC_GETTER_DEFS[type] - getter_definitions.append(getter_def.substitute(op=info.op, name=name, body=body)) + getter_definitions.append( + getter_def.substitute(op=info.op, name=name, body=body) + ) else: # Types we don't expose python bindings to yet: # TypeAndSize, at::ScalarType, TensorOptions, TensorGeometry, @@ -455,9 +577,13 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: should_append_getsetdef = False if should_append_getsetdef: - py_getsetdef_structs.append(PY_GETSETDEF_STRUCT.substitute(op=info.op, name=name)) + py_getsetdef_structs.append( + PY_GETSETDEF_STRUCT.substitute(op=info.op, name=name) + ) if should_append_raw_getsetdef: - py_getsetdef_structs.append(PY_RAW_GETSETDEF_STRUCT.substitute(op=info.op, name=name)) + py_getsetdef_structs.append( + PY_RAW_GETSETDEF_STRUCT.substitute(op=info.op, name=name) + ) for var in info.all_saved_inputs: save_var(var, is_output=False) @@ -467,24 +593,25 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: # lock the mutex when we release variables and in Node::apply to protect thread safety # see Note [Thread Safety on Autograd Node] if len(release_variables) > 0: - thread_lock = 'std::lock_guard lock(mutex_);' + thread_lock = "std::lock_guard lock(mutex_);" else: - thread_lock = '' + thread_lock = "" if uses_retain_variables(info): will_release_variables = WILL_RELEASE_VARIABLES.substitute() else: - will_release_variables = '' + will_release_variables = "" body: List[str] = [] if uses_single_grad(info): - body.append('const auto& grad = grads[0];') + body.append("const auto& grad = grads[0];") else: # Generate aliases for gradients named for returned values. body.extend( - f'const auto& {name} = grads[{info.available_named_gradients.index(name)}];' - for name in info.used_named_gradients) + f"const auto& {name} = grads[{info.available_named_gradients.index(name)}];" + for name in info.used_named_gradients + ) def emit_derivative( derivative: Derivative, @@ -494,51 +621,65 @@ def emit_derivative( var_names = derivative.var_names if len(var_names) == 1: checks_any_grad_defined = False - if 'not_implemented' not in formula: + if "not_implemented" not in formula: matching_args = [ - arg for arg in args_with_derivatives - if arg.name == var_names[0]] + arg for arg in args_with_derivatives if arg.name == var_names[0] + ] if len(matching_args) == 1: # We can add undefined grad support if the input variable is a Tensor arg = matching_args[0] - if isinstance(arg.argument, Argument) and str(arg.argument.type) in ('Tensor', 'Tensor?'): - formula = 'any_grad_defined ? (' + formula + ') : Tensor()' + if isinstance(arg.argument, Argument) and str( + arg.argument.type + ) in ("Tensor", "Tensor?"): + formula = "any_grad_defined ? 
(" + formula + ") : Tensor()" checks_any_grad_defined = True - return (checks_any_grad_defined, - DERIVATIVE_SINGLE.substitute(name=var_names[0], derivative=formula)) + return ( + checks_any_grad_defined, + DERIVATIVE_SINGLE.substitute(name=var_names[0], derivative=formula), + ) else: - if 'grad_input_mask' in formula: - masks = [f'should_compute_output({{ {n}_ix }}),' for n in var_names] - grad_input_mask = GRAD_INPUT_MASK.substitute(masks=masks, n=len(var_names)) + if "grad_input_mask" in formula: + masks = [f"should_compute_output({{ {n}_ix }})," for n in var_names] + grad_input_mask = GRAD_INPUT_MASK.substitute( + masks=masks, n=len(var_names) + ) else: - grad_input_mask = '' - idx_ranges = ', '.join(f'{n}_ix' for n in var_names) + grad_input_mask = "" + idx_ranges = ", ".join(f"{n}_ix" for n in var_names) copy_ranges: List[str] = [] for i, n in enumerate(var_names): copy_ranges.append(DERIVATIVE_MULTI_COPY_RANGE.substitute(name=n, i=i)) return False, DERIVATIVE_MULTI.substitute( - idx_ranges=idx_ranges, copy_ranges=copy_ranges, + idx_ranges=idx_ranges, + copy_ranges=copy_ranges, derivative=formula, - grad_input_mask=grad_input_mask) + grad_input_mask=grad_input_mask, + ) body.extend(unpack) need_any_grad_defined_var = False for derivative in info.derivatives: - checks_any_grad_defined, derivative_text = emit_derivative(derivative, info.args_with_derivatives) + checks_any_grad_defined, derivative_text = emit_derivative( + derivative, info.args_with_derivatives + ) body.append(derivative_text) need_any_grad_defined_var |= checks_any_grad_defined # Since single-output derivative formulas need to check if grads are # defined, only perform the check once, before all the formulas if need_any_grad_defined_var: - body.insert(-len(info.derivatives), - 'bool any_grad_defined = any_variable_defined(grads);') + body.insert( + -len(info.derivatives), + "bool any_grad_defined = any_variable_defined(grads);", + ) if info.name in UNTRACEABLE_FUNCTIONS: - superclass = 'Node' + superclass = "Node" else: - superclass = 'TraceableFunction' + superclass = "TraceableFunction" - all_getsetdef_structs = ",\n".join(py_getsetdef_structs) + "," if len(py_getsetdef_structs) != 0 else "" + all_getsetdef_structs = ( + ",\n".join(py_getsetdef_structs) + "," if len(py_getsetdef_structs) != 0 else "" + ) all_getter_definitions = "\n".join(getter_definitions) return template.substitute( @@ -553,5 +694,5 @@ def emit_derivative( body=body, superclass=superclass, all_getter_definitions=all_getter_definitions, - all_getsetdef_structs=all_getsetdef_structs + all_getsetdef_structs=all_getsetdef_structs, ) diff --git a/tools/autograd/gen_inplace_or_view_type.py b/tools/autograd/gen_inplace_or_view_type.py index dfb1a1e9892b..541ef2b5312b 100644 --- a/tools/autograd/gen_inplace_or_view_type.py +++ b/tools/autograd/gen_inplace_or_view_type.py @@ -4,24 +4,40 @@ # if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp # The fallback is expected to mimick this codegen, so we should keep the two in sync. 
-from tools.codegen.api import cpp -from tools.codegen.api.autograd import ( - NativeFunctionWithDifferentiabilityInfo, gen_differentiable_outputs, +from torchgen.api import cpp +from torchgen.api.autograd import ( + NativeFunctionWithDifferentiabilityInfo, + gen_differentiable_outputs, dispatch_strategy, ) -from tools.codegen.api.types import (Binding, DispatcherSignature, CType, BaseCType, - OptionalCType, longT, boolT, intArrayRefT) -from tools.codegen.code_template import CodeTemplate -from tools.codegen.context import with_native_function -from tools.codegen.model import ( - Type, NativeFunction, SelfArgument, TensorOptionsArguments, SchemaKind, - is_foreach_op, +from torchgen.api.types import ( + Binding, + DispatcherSignature, + CType, + BaseCType, + OptionalCType, + longT, + boolT, + intArrayRefT, + symIntArrayRefT, +) +from torchgen.code_template import CodeTemplate +from torchgen.context import with_native_function +from torchgen.model import ( + Type, + NativeFunction, + SelfArgument, + TensorOptionsArguments, + SchemaKind, ) from typing import List, Optional, Sequence, Tuple, Dict -from tools.codegen.utils import FileManager +from torchgen.utils import FileManager from .context import with_native_function_with_differentiability_info from .gen_trace_type import ( - MANUAL_AUTOGRAD, type_wrapper_name, tie_return_values, get_return_value + MANUAL_AUTOGRAD, + type_wrapper_name, + tie_return_values, + get_return_value, ) # See NOTE [ Autograd View Variables ] in variable.h for details. @@ -33,58 +49,77 @@ # A map: function name => name of the argument that all outputs are view of VIEW_FUNCTIONS_WITH_METADATA_CHANGE = [ - 'view_as_complex', - 'view_as_real', - '_conj', - '_neg_view' + "view_as_complex", + "view_as_real", + "_conj", + "_neg_view", ] VIEW_FUNCTIONS = { - 'numpy_T': 'self', - 'alias': 'self', - 'as_strided': 'self', - 'diagonal': 'self', - 'expand': 'self', - 'permute': 'self', - 'select': 'self', - 'slice': 'self', - 'split': 'self', - 'split_with_sizes': 'self', - 'squeeze': 'self', - 't': 'self', - 'transpose': 'self', - 'unfold': 'self', - 'unsqueeze': 'self', - 'flatten': 'self', - 'view': 'self', - 'unbind': 'self', - '_indices': 'self', - '_values': 'self', - 'indices': 'self', - 'values': 'self', - 'crow_indices': 'self', - 'col_indices': 'self', + "numpy_T": "self", + "alias": "self", + "as_strided": "self", + "diagonal": "self", + "expand": "self", + "permute": "self", + "select": "self", + "slice": "self", + "split": "self", + "split_with_sizes": "self", + "squeeze": "self", + "t": "self", + "transpose": "self", + "unfold": "self", + "unsqueeze": "self", + "flatten": "self", + "view": "self", + "unbind": "self", + "_indices": "self", + "_values": "self", + "indices": "self", + "values": "self", + "crow_indices": "self", + "col_indices": "self", + "ccol_indices": "self", + "row_indices": "self", # sparse_coo ctor output should really be views of both indices and values, # but we only supports making as view of a single variable, and indices is # discrete anyways. # FIXME: clone indices on construction. 
- 'sparse_coo_tensor_with_dims_and_tensors': 'values', - '_reshape_alias': 'self', + "sparse_coo_tensor_with_dims_and_tensors": "values", + "_reshape_alias": "self", } for key in VIEW_FUNCTIONS_WITH_METADATA_CHANGE: - VIEW_FUNCTIONS[key] = 'self' + VIEW_FUNCTIONS[key] = "self" # note: some VIEW_FUNCTIONS are just compositions of the view functions above # this list contains both the root view functions and any that are purely composed # of viewing functions, and is used by the JIT to determine when an operator # may return a view of its inputs; however they may sometimes return a copy. # (e.g. `contiguous`) -RETURNS_VIEWS_OF_INPUT = set(VIEW_FUNCTIONS.keys()).union({ - 'chunk', 'detach', 'contiguous', 'reshape', 'reshape_as', - 'expand_as', 'view_as', 'real', 'imag', 'narrow', 'movedim', - 'tensor_split', 'swapdims', 'swapaxes', 'mT', 'mH', 'adjoint', 'matrix_H' -}) +RETURNS_VIEWS_OF_INPUT = set(VIEW_FUNCTIONS.keys()).union( + { + "chunk", + "detach", + "contiguous", + "reshape", + "reshape_as", + "expand_as", + "view_as", + "real", + "imag", + "narrow", + "movedim", + "tensor_split", + "swapdims", + "swapaxes", + "mT", + "mH", + "adjoint", + "matrix_H", + } +) # These are the functions we consider views for the purposes of validating # StorageImpl and TensorImpl in gen_variable_type. @@ -93,68 +128,90 @@ # See NOTE [Unsafe View] for more info. ALL_VIEW_FUNCTIONS = { **VIEW_FUNCTIONS, - '_unsafe_view': 'self', + "_unsafe_view": "self", } -ARRAYREF_TO_VEC = CodeTemplate("""\ +ARRAYREF_TO_VEC = CodeTemplate( + """\ auto ${vec} = ${arg}.vec(); -""") +""" +) -OPTIONAL_TO_VAL = CodeTemplate("""\ +OPTIONAL_TO_VAL = CodeTemplate( + """\ auto ${val} = ${arg}.value_or(${default}); -""") +""" +) -CALL_DISPATCH = CodeTemplate("""\ -at::_ops::${unambiguous_name}::call(${unpacked_args})""") +CALL_DISPATCH = CodeTemplate( + """\ +at::_ops::${unambiguous_name}::call(${unpacked_args})""" +) -SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE = CodeTemplate("""\ +SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE = CodeTemplate( + """\ std::function func=nullptr; if (${is_view_with_metadata_change} || !self.unsafeGetTensorImpl()->support_as_strided()) { ${replay_view_func} } -""") +""" +) -REPLAY_VIEW_LAMBDA_FUNC = CodeTemplate("""\ +REPLAY_VIEW_LAMBDA_FUNC = CodeTemplate( + """\ func = [=](const at::Tensor& ${input_base}) { return ${replay_view_call}; }; -""") +""" +) -METHOD_DEFINITION = CodeTemplate("""\ +METHOD_DEFINITION = CodeTemplate( + """\ ${return_type} ${type_wrapper_name}(${formals}) { ${type_definition_body} } -""") +""" +) -WRAPPER_REGISTRATION = CodeTemplate("""\ +WRAPPER_REGISTRATION = CodeTemplate( + """\ m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}) ); -""") +""" +) -AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate("""\ +AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate( + """\ m.impl("${unqual_operator_name_with_overload}", torch::autograd::autogradNotImplementedFallback()); -""") +""" +) -INPLACE_REDISPATCH = CodeTemplate("""\ +INPLACE_REDISPATCH = CodeTemplate( + """\ { at::AutoDispatchBelowADInplaceOrView guard; at::_ops::${unambiguous_name}::redispatch(${unpacked_args}); } -""") +""" +) -ASSIGN_RETURN_VALUE = CodeTemplate("""\ +ASSIGN_RETURN_VALUE = CodeTemplate( + """\ ${return_values} = ${rhs_value}; -""") +""" +) -VIEW_REDISPATCH = CodeTemplate("""\ +VIEW_REDISPATCH = CodeTemplate( + """\ ${assign_return_values} ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return 
at::_ops::${unambiguous_name}::redispatch(${unpacked_args}); })(); -""") +""" +) -TMP_VAR = '_tmp' +TMP_VAR = "_tmp" # FIXME: Ideally these functions should be methods on Type class, but we have a # comment in codegen/model.py there saying these concepts are not well defined. @@ -163,27 +220,38 @@ def is_tensor_type(t: Type) -> bool: # TODO: Should handle optional here? return t.is_tensor_like() and t.is_list_like() is None + def is_tensor_list_type(t: Type) -> bool: # TODO: Should handle optional here? return t.is_tensor_like() and t.is_list_like() is not None -UNPACK_TENSOR = CodeTemplate("""\ -auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") + +UNPACK_TENSOR = CodeTemplate( + """\ +auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""" +) + def unpacked_name(arg_name: str) -> str: - return arg_name + '_' + return arg_name + "_" + @with_native_function def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: body: List[str] = [] unpacked_bindings: List[Binding] = [] - bindings = [r for a in f.func.schema_order_arguments() - for r in cpp.argument(a, - method=False, - cpp_no_default_args=set(), - faithful=False, - has_tensor_options=False)] + bindings = [ + r + for a in f.func.schema_order_arguments() + for r in cpp.argument( + a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False, + ) + ] for i, binding in enumerate(bindings): assert not isinstance(binding.argument, SelfArgument) @@ -197,25 +265,31 @@ def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: is_tensor_list = is_tensor_list_type(binding.argument.type) ref = (not is_nullable) and not is_tensor_list - suffix = '_opt' if is_nullable and not is_tensor_list else '' - body.append(UNPACK_TENSOR.substitute( - arg_name=binding.name, - arg_pos=i, - suffix=suffix, - ref='&' if ref else '', - )) - unpacked_bindings.append(Binding( - name=unpacked_name(binding.name), - nctype=binding.nctype, - argument=binding.argument, - default=binding.default, - )) + suffix = "_opt" if is_nullable and not is_tensor_list else "" + body.append( + UNPACK_TENSOR.substitute( + arg_name=binding.name, + arg_pos=i, + suffix=suffix, + ref="&" if ref else "", + ) + ) + unpacked_bindings.append( + Binding( + name=unpacked_name(binding.name), + nctype=binding.nctype, + argument=binding.argument, + default=binding.default, + ) + ) return body, unpacked_bindings + def get_base_name(f: NativeFunction) -> str: return f.func.name.name.base # TODO: should be str(f.func.name.name)? + def get_view_info(f: NativeFunction) -> Optional[str]: base_name = get_base_name(f) view_info = VIEW_FUNCTIONS.get(base_name, None) @@ -223,114 +297,148 @@ def get_view_info(f: NativeFunction) -> Optional[str]: view_info = "self" return view_info + # For view replay calls, we generate an ordinary Dispatcher::call() instead, because: # - We want to replay the entire call into the op, including any previously-set dispatch keys (including autograd!). # - The view replay call also is not part of the hot path. -def emit_view_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str: +def emit_view_call( + f: NativeFunction, input_base: str, unpacked_args: Sequence[str] +) -> str: # View replay functions use the standard Dispatcher::call API. 
return CALL_DISPATCH.substitute( - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=unpacked_args) + unambiguous_name=f.func.name.unambiguous_name(), unpacked_args=unpacked_args + ) + def emit_view_lambda(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: - """ Generate an additional lambda function to recover views in backward when as_strided is not supported. + """Generate an additional lambda function to recover views in backward when as_strided is not supported. See Note [View + Inplace update for base tensor] and [View + Inplace update for view tensor] for more details.""" - input_base = 'input_base' - replay_view_func = '' + input_base = "input_base" + replay_view_func = "" updated_unpacked_args: List[str] = [] known_view_arg_simple_types: List[CType] = [ BaseCType(longT), OptionalCType(BaseCType(longT)), BaseCType(boolT), - BaseCType(intArrayRefT)] + BaseCType(intArrayRefT), + BaseCType(symIntArrayRefT), + ] for unpacked_binding in unpacked_bindings: arg, arg_type = unpacked_binding.name, unpacked_binding.nctype.type - if arg == 'self_': + if arg == "self_": updated_unpacked_args.append(input_base) continue if arg_type not in known_view_arg_simple_types: - known_types_str = ', '.join([str(t) for t in known_view_arg_simple_types]) - raise TypeError(f'You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: ' - f'{known_types_str}. Please update the list or materialize it so that it can be closed ' - 'over by value, also add a test in pytorch/xla/test/test_operations.py where this code ' - 'is exercised.') - - if arg_type == BaseCType(intArrayRefT): + known_types_str = ", ".join([str(t) for t in known_view_arg_simple_types]) + raise TypeError( + f"You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: " + f"{known_types_str}. Please update the list or materialize it so that it can be closed " + "over by value, also add a test in pytorch/xla/test/test_operations.py where this code " + "is exercised." + ) + + if arg_type == BaseCType(intArrayRefT) or arg_type == BaseCType( + symIntArrayRefT + ): # It's not safe to close over IntArrayRef by value, since this is a # reference type, so materialize a vector to close over by value - arg_vec = arg + '_vec' + arg_vec = arg + "_vec" replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec) updated_unpacked_args.append(arg_vec) elif arg_type == OptionalCType(BaseCType(longT)): # Materialize int64_t? 
to int64_t - arg_value = arg + '_val' - replay_view_func += OPTIONAL_TO_VAL.substitute(arg=arg, val=arg_value, default='0') + arg_value = arg + "_val" + replay_view_func += OPTIONAL_TO_VAL.substitute( + arg=arg, val=arg_value, default="0" + ) updated_unpacked_args.append(arg_value) else: updated_unpacked_args.append(arg) replay_view_call = emit_view_call(f, input_base, updated_unpacked_args) replay_view_func += REPLAY_VIEW_LAMBDA_FUNC.substitute( - input_base=input_base, - replay_view_call=replay_view_call) + input_base=input_base, replay_view_call=replay_view_call + ) - is_view_with_metadata_change = 'true' if cpp.name(f.func) in VIEW_FUNCTIONS_WITH_METADATA_CHANGE else 'false' + is_view_with_metadata_change = ( + "true" if cpp.name(f.func) in VIEW_FUNCTIONS_WITH_METADATA_CHANGE else "false" + ) return SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE.substitute( is_view_with_metadata_change=is_view_with_metadata_change, - replay_view_func=replay_view_func) + replay_view_func=replay_view_func, + ) -def emit_view_body(fn: NativeFunctionWithDifferentiabilityInfo, var: str) -> Tuple[str, str]: + +def emit_view_body( + fn: NativeFunctionWithDifferentiabilityInfo, var: str +) -> Tuple[str, str]: # See NOTE [ Autograd View Variables ] in variable.h for details. f = fn.func base_name = get_base_name(f) view_info = get_view_info(f) - call = '' + call = "" differentiable_outputs = gen_differentiable_outputs(fn) differentiable_output_vars = {r.name for r in differentiable_outputs} if not isinstance(view_info, str): - raise TypeError(f'The view info should be a string for {base_name}, but it is: {view_info}') + raise TypeError( + f"The view info should be a string for {base_name}, but it is: {view_info}" + ) if len(differentiable_output_vars) == 0: # no output is differentiable (.indices() for SparseTensors for example) - rhs_value = (f'as_view({view_info}, {var}, ' - f'/* is_bw_differentiable */ false, /* is_fw_differentiable */ false)') + rhs_value = ( + f"as_view({view_info}, {var}, " + f"/* is_bw_differentiable */ false, /* is_fw_differentiable */ false)" + ) elif len(differentiable_output_vars) == 1: # Single differentiable output (Tensor or Tensor[]) return_info = differentiable_outputs[0] # We only support simple Tensor or a TensorList for functions that return views - if not is_tensor_type(return_info.type) and not is_tensor_list_type(return_info.type): - raise RuntimeError(f'{base_name} that return differentiable views can only return Tensor or Tensor[]') + if not is_tensor_type(return_info.type) and not is_tensor_list_type( + return_info.type + ): + raise RuntimeError( + f"{base_name} that return differentiable views can only return Tensor or Tensor[]" + ) # See Note [ View + Inplace detection] def get_creation_meta_in_mode(original: str) -> str: - creation_meta_with_grad_mode = f'(at::GradMode::is_enabled() ? {original} : CreationMeta::NO_GRAD_MODE)' - return f'InferenceMode::is_enabled() ? CreationMeta::INFERENCE_MODE : {creation_meta_with_grad_mode}' + creation_meta_with_grad_mode = f"(at::GradMode::is_enabled() ? {original} : CreationMeta::NO_GRAD_MODE)" + return f"InferenceMode::is_enabled() ? 
CreationMeta::INFERENCE_MODE : {creation_meta_with_grad_mode}" # Only allow rebasing of the history if we return a single Tensor # If we are in a no grad block, raise a warning # See NOTE [ View + Inplace detection ] for more details about this logic if is_tensor_list_type(return_info.type): - creation_meta = get_creation_meta_in_mode('CreationMeta::MULTI_OUTPUT_NODE') - call += (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' - '/* is_fw_differentiable */ true, ' - f'/* creation_meta */ {creation_meta});') - rhs_value = f'std::move({var})' + creation_meta = get_creation_meta_in_mode("CreationMeta::MULTI_OUTPUT_NODE") + call += ( + f"as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, " + "/* is_fw_differentiable */ true, " + f"/* creation_meta */ {creation_meta});" + ) + rhs_value = f"std::move({var})" else: _, unpacked_bindings = unpack_args(f) call += emit_view_lambda(f, unpacked_bindings) - creation_meta = get_creation_meta_in_mode('CreationMeta::DEFAULT') - rhs_value = (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' - '/* is_fw_differentiable */ true, ' - f'/* view_func */ func, /* creation_meta */ {creation_meta})') + creation_meta = get_creation_meta_in_mode("CreationMeta::DEFAULT") + rhs_value = ( + f"as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, " + "/* is_fw_differentiable */ true, " + f"/* view_func */ func, /* creation_meta */ {creation_meta})" + ) else: # This could be supported but we don't need it at the moment, so keeping things simple. - raise RuntimeError('Function that return multiple differentiable output ' - 'when at least one of them is view is not supported.') + raise RuntimeError( + "Function that return multiple differentiable output " + "when at least one of them is view is not supported." + ) return call, rhs_value + def modifies_arguments(f: NativeFunction) -> bool: return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] + @with_native_function_with_differentiability_info def emit_inplace_or_view_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: f = fn.func @@ -341,48 +449,67 @@ def emit_inplace_or_view_body(fn: NativeFunctionWithDifferentiabilityInfo) -> Li # code-generated ADInplaceOrView kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. - dispatch_key_set = 'ks & c10::after_ADInplaceOrView_keyset' - redispatch_args = ', '.join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) + dispatch_key_set = "ks & c10::after_ADInplaceOrView_keyset" + redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) # Note that this calls the slow, dispatching variants of manual_cpp_binding ops. # We could probably work harder to ensure that the fast variants are called instead, but the perf benefit would be minimal. 
if modifies_arguments(f): # inplace op - inplace_view_body.append(INPLACE_REDISPATCH.substitute( - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=redispatch_args, - )) + inplace_view_body.append( + INPLACE_REDISPATCH.substitute( + unambiguous_name=f.func.name.unambiguous_name(), + unpacked_args=redispatch_args, + ) + ) for r in cpp.return_names(f): - inplace_view_body.append(f'increment_version({r});') + inplace_view_body.append(f"increment_version({r});") else: - assert(get_view_info(f) is not None) - inplace_view_body.append(VIEW_REDISPATCH.substitute( - assign_return_values='auto ' + TMP_VAR + ' = ', - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=redispatch_args, - )) + assert get_view_info(f) is not None + inplace_view_body.append( + VIEW_REDISPATCH.substitute( + assign_return_values="auto " + TMP_VAR + " = ", + unambiguous_name=f.func.name.unambiguous_name(), + unpacked_args=redispatch_args, + ) + ) call, rhs_value = emit_view_body(fn, TMP_VAR) inplace_view_body.append(call) assert rhs_value is not None inplace_view_body.append( - ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f), rhs_value=rhs_value)) + ASSIGN_RETURN_VALUE.substitute( + return_values=tie_return_values(f), rhs_value=rhs_value + ) + ) if f.func.returns: - inplace_view_body.append(f'return {get_return_value(f)};') + inplace_view_body.append(f"return {get_return_value(f)};") return inplace_view_body + @with_native_function def gen_formals(f: NativeFunction) -> str: - return ', '.join( + return ", ".join( # code-generated autograd kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. - ['c10::DispatchKeySet ks'] + - [f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments()] + ["c10::DispatchKeySet ks"] + + [ + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ] ) + @with_native_function_with_differentiability_info -def inplace_or_view_method_definition(fn: NativeFunctionWithDifferentiabilityInfo) -> Optional[str]: +def inplace_or_view_method_definition( + fn: NativeFunctionWithDifferentiabilityInfo, +) -> Optional[str]: f = fn.func - if get_view_info(f) is None and (not modifies_arguments(f) or is_foreach_op(str(f.func.name))): + if get_view_info(f) is None and ( + # For functions that modify their inputs but don't return them, + # we can't give them autograd support. 
+ # See https://github.com/pytorch/pytorch/issues/53796 + not modifies_arguments(f) + or len(f.func.returns) == 0 + ): return None return METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns).cpp_type(), @@ -391,38 +518,56 @@ def inplace_or_view_method_definition(fn: NativeFunctionWithDifferentiabilityInf type_definition_body=emit_inplace_or_view_body(fn), ) + @with_native_function_with_differentiability_info -def inplace_or_view_method_registration(fn: NativeFunctionWithDifferentiabilityInfo) -> Optional[str]: +def inplace_or_view_method_registration( + fn: NativeFunctionWithDifferentiabilityInfo, +) -> Optional[str]: f = fn.func - if get_view_info(f) is None and (not modifies_arguments(f) or is_foreach_op(str(f.func.name))): + if get_view_info(f) is None and ( + not modifies_arguments(f) or len(f.func.returns) == 0 + ): return None return WRAPPER_REGISTRATION.substitute( unqual_operator_name_with_overload=f.func.name, type_wrapper_name=type_wrapper_name(f), - class_type='ADInplaceOrView', + class_type="ADInplaceOrView", ) + def use_derived(fn: NativeFunctionWithDifferentiabilityInfo) -> bool: f = fn.func name = cpp.name(f.func) - return name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == 'use_derived' + return name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == "use_derived" + -def gen_inplace_or_view_type_env(fn: NativeFunctionWithDifferentiabilityInfo) -> Dict[str, List[str]]: +def gen_inplace_or_view_type_env( + fn: NativeFunctionWithDifferentiabilityInfo, +) -> Dict[str, List[str]]: definition = inplace_or_view_method_definition(fn) registration = inplace_or_view_method_registration(fn) return { - 'ops_headers': ([f'#include '] - if definition is not None else []), - 'inplace_or_view_method_definitions': [definition] if definition is not None else [], - 'inplace_or_view_wrapper_registrations': [registration] if registration is not None else [], + "ops_headers": ( + [f"#include "] + if definition is not None + else [] + ), + "inplace_or_view_method_definitions": [definition] + if definition is not None + else [], + "inplace_or_view_wrapper_registrations": [registration] + if registration is not None + else [], } + def gen_inplace_or_view_type( out: str, native_yaml_path: str, + tags_yaml_path: str, fns_with_infos: List[NativeFunctionWithDifferentiabilityInfo], - template_path: str + template_path: str, ) -> None: # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. 
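The write_sharded call in the next hunk splits the generated ADInplaceOrViewType.cpp across two shards keyed by each function's root name, so the resulting C++ can be compiled in parallel and regenerates deterministically. A minimal sketch of the bucketing idea only — this is not FileManager.write_sharded itself, which also handles templating and merging of the per-shard environments:

# Sketch of sharding work items by a stable key so the same op always lands in
# the same output shard across regenerations.
from collections import defaultdict
from typing import Callable, Dict, Iterable, List, TypeVar

T = TypeVar("T")

def bucket_by_key(
    items: Iterable[T], key_fn: Callable[[T], str], num_shards: int
) -> Dict[int, List[T]]:
    shards: Dict[int, List[T]] = defaultdict(list)
    for item in items:
        # Use a deterministic digest of the key; the built-in hash() is
        # randomized per interpreter run and would reshuffle shards.
        digest = sum(ord(c) for c in key_fn(item))
        shards[digest % num_shards].append(item)
    return shards

# e.g. bucket_by_key(["add", "mul", "view"], key_fn=str, num_shards=2)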
@@ -430,15 +575,17 @@ def gen_inplace_or_view_type( fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) fm.write_sharded( - 'ADInplaceOrViewType.cpp', + "ADInplaceOrViewType.cpp", [fn for fn in fns_with_infos if use_derived(fn)], key_fn=lambda fn: fn.func.root_name, base_env={ - 'generated_comment': - f'@generated from {template_path}/ADInplaceOrViewType.cpp', + "generated_comment": f"@generated from {template_path}/ADInplaceOrViewType.cpp", }, env_callable=gen_inplace_or_view_type_env, num_shards=2, - sharded_keys={'ops_headers', 'inplace_or_view_method_definitions', - 'inplace_or_view_wrapper_registrations'} + sharded_keys={ + "ops_headers", + "inplace_or_view_method_definitions", + "inplace_or_view_wrapper_registrations", + }, ) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 2b9b133cea7a..ab592764e5bd 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -37,26 +37,36 @@ from .gen_trace_type import should_trace -from tools.codegen.code_template import CodeTemplate -from tools.codegen.api import cpp -from tools.codegen.api.types import CppSignatureGroup -from tools.codegen.api.python import (PythonArgument, PythonSignature, - PythonSignatureDeprecated, - PythonSignatureGroup, - PythonSignatureNativeFunctionPair, - arg_parser_output_exprs, - argument_type_str, cpp_dispatch_exprs, - cpp_dispatch_target, - dispatch_lambda_args, - dispatch_lambda_exprs, - dispatch_lambda_return_str, - has_tensor_options, - namedtuple_fieldnames, signature) -from tools.codegen.gen import cpp_string, parse_native_yaml -from tools.codegen.context import with_native_function -from tools.codegen.model import (Argument, BaseOperatorName, NativeFunction, - Type, Variant) -from tools.codegen.utils import split_name_params, YamlLoader, FileManager +from torchgen.code_template import CodeTemplate +from torchgen.api import cpp +from torchgen.api.types import CppSignatureGroup +from torchgen.api.python import ( + PythonArgument, + PythonSignature, + PythonSignatureDeprecated, + PythonSignatureGroup, + PythonSignatureNativeFunctionPair, + arg_parser_output_exprs, + argument_type_str, + cpp_dispatch_exprs, + cpp_dispatch_target, + dispatch_lambda_args, + dispatch_lambda_exprs, + dispatch_lambda_return_str, + has_tensor_options, + namedtuple_fieldnames, + signature, +) +from torchgen.gen import cpp_string, parse_native_yaml +from torchgen.context import with_native_function +from torchgen.model import ( + Argument, + BaseOperatorName, + NativeFunction, + Type, + Variant, +) +from torchgen.utils import split_name_params, YamlLoader, FileManager from typing import Dict, Optional, List, Tuple, Set, Sequence, Callable @@ -70,49 +80,101 @@ # These functions require manual Python bindings or are not exposed to Python _SKIP_PYTHON_BINDINGS = [ - 'alias', 'contiguous', 'is_cuda', 'is_sparse', 'is_sparse_csr', 'size', 'stride', - '.*_backward', '.*_backward_(out|input|weight|bias)', '.*_forward', - '.*_forward_out', '_unsafe_view', 'tensor', '_?sparse_coo_tensor.*', - '_?sparse_csr_tensor.*', - '_arange.*', '_range.*', 'linspace.*', 'logspace.*', - '_sparse_add_out', '_sparse_div.*', '_sparse_mul.*', '_sparse_sub.*', '_sparse_dense_add_out', - 'index', 'unique_dim_consecutive', - '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', - '_th_.*', '_thnn_.*', - 'arange.*', 'range.*', '_solve.*', '_inverse.*', - 'full(_out)?', - '_cholesky.*', '_triangular_solve.*', '_qr.*', '_symeig.*', '_svd.*', - 'slice', 
'randint(_out)?', - 'item', '_local_scalar_dense', 'to', - '_to_copy', - 'copy_sparse_to_sparse_', 'copy_', - 'numpy_T', 'matrix_H', 'mT', 'mH', # these need to be an attributes in Python, not functions - 'nonzero(_(out|numpy))?', - 'set_data', - '.*_overrideable', # overrideable functions for backend extension - 'data', 'is_leaf', 'output_nr', '_version', 'requires_grad_', 'retains_grad', 'set_', - '_fw_primal', 'fake_quantize_per_tensor_affine_cachemask', - 'fake_quantize_per_channel_affine_cachemask', - '_new_zeros_with_same_feature_meta', '_has_same_storage_numel', # used for forward AD internals - '_reshape_alias', - 'replace_', # only used by the functionalization pass, doesn't need to be exposed to python + "alias", + "contiguous", + "is_cuda", + "is_sparse", + "is_sparse_csr", + "size", + "stride", + ".*_backward", + ".*_backward_(out|input|weight|bias)", + ".*_forward", + ".*_forward_out", + ".*_jvp", + "_unsafe_view", + "tensor", + "_?sparse_(coo|compressed|csr|csc|bsr|bsc)_tensor.*", + "_arange.*", + "_range.*", + "linspace.*", + "logspace.*", + "_sparse_add_out", + "_sparse_div.*", + "_sparse_mul.*", + "_sparse_sub.*", + "_sparse_dense_add_out", + "index", + "unique_dim_consecutive", + "_cumsum.*", + "_cumprod.*", + "_sum.*", + "_prod.*", + "_th_.*", + "_thnn_.*", + "arange.*", + "range.*", + "_solve.*", + "_inverse.*", + "full(_out)?", + "_cholesky.*", + "_triangular_solve.*", + "_qr.*", + "_symeig.*", + "_svd.*", + "slice", + "randint(_out)?", + "item", + "_local_scalar_dense", + "to", + "_to_copy", + "copy_sparse_to_sparse_", + "copy_", + "numpy_T", + "matrix_H", + "mT", + "mH", # these need to be an attributes in Python, not functions + "nonzero(_(out|numpy))?", + "set_data", + ".*_overrideable", # overrideable functions for backend extension + "data", + "is_leaf", + "output_nr", + "_version", + "requires_grad_", + "retains_grad", + "set_", + "_fw_primal", + "fake_quantize_per_tensor_affine_cachemask", + "fake_quantize_per_channel_affine_cachemask", + "_new_zeros_with_same_feature_meta", + "_has_same_storage_numel", # used for forward AD internals + "_reshape_alias", + "replace_", # only used by the functionalization pass, doesn't need to be exposed to python + "copy", # only used by the functionalization pass + "fill.Tensor", # only used by the functionalization pass + "fill.Scalar", # only used by the functionalization pass + "lift", ] -SKIP_PYTHON_BINDINGS = list(map(lambda pattern: re.compile(rf'^{pattern}$'), _SKIP_PYTHON_BINDINGS)) +SKIP_PYTHON_BINDINGS = list( + map(lambda pattern: re.compile(rf"^{pattern}$"), _SKIP_PYTHON_BINDINGS) +) # These function signatures are not exposed to Python. Note that this signature # list does not support regex. SKIP_PYTHON_BINDINGS_SIGNATURES = [ - 'add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor', - 'add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)', - 'sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor', - 'sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)', - 'mul.Scalar(Tensor self, Scalar other) -> Tensor', - 'mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)', - 'div.Scalar(Tensor self, Scalar other) -> Tensor', - 'div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)', + "add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", + "add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)", + "sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", + "sub_.Scalar(Tensor(a!) 
self, Scalar other, Scalar alpha=1) -> Tensor(a!)", + "mul.Scalar(Tensor self, Scalar other) -> Tensor", + "mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", + "div.Scalar(Tensor self, Scalar other) -> Tensor", + "div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", ] + @with_native_function def should_generate_py_binding(f: NativeFunction) -> bool: name = cpp.name(f.func) @@ -127,32 +189,42 @@ def should_generate_py_binding(f: NativeFunction) -> bool: return True + def get_pycname(name: BaseOperatorName) -> str: - return f'THPVariable_{name}' + return f"THPVariable_{name}" + def is_noarg(overloads: Sequence[PythonSignatureNativeFunctionPair]) -> bool: return len(overloads) == 1 and overloads[0].signature.arguments_count() == 0 + def is_py_variable_method(f: NativeFunction) -> bool: return f.python_module is None and Variant.method in f.variants + def is_py_torch_function(f: NativeFunction) -> bool: return f.python_module is None and Variant.function in f.variants + def is_py_nn_function(f: NativeFunction) -> bool: - return f.python_module == 'nn' + return f.python_module == "nn" + def is_py_fft_function(f: NativeFunction) -> bool: - return f.python_module == 'fft' + return f.python_module == "fft" + def is_py_linalg_function(f: NativeFunction) -> bool: - return f.python_module == 'linalg' + return f.python_module == "linalg" + def is_py_sparse_function(f: NativeFunction) -> bool: - return f.python_module == 'sparse' + return f.python_module == "sparse" + def is_py_special_function(f: NativeFunction) -> bool: - return f.python_module == 'special' + return f.python_module == "special" + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -160,54 +232,110 @@ def is_py_special_function(f: NativeFunction) -> bool: # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -def gen(out: str, native_yaml_path: str, deprecated_yaml_path: str, template_path: str) -> None: + +def gen( + out: str, + native_yaml_path: str, + tags_yaml_path: str, + deprecated_yaml_path: str, + template_path: str, +) -> None: fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - native_functions = parse_native_yaml(native_yaml_path).native_functions + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions native_functions = list(filter(should_generate_py_binding, native_functions)) methods = load_signatures(native_functions, deprecated_yaml_path, method=True) create_python_bindings( - fm, methods, is_py_variable_method, None, 'python_variable_methods.cpp', method=True) + fm, + methods, + is_py_variable_method, + None, + "python_variable_methods.cpp", + method=True, + ) # NOTE: num_shards here must be synced with gatherTorchFunctions in # torch/csrc/autograd/python_torch_functions_manual.cpp functions = load_signatures(native_functions, deprecated_yaml_path, method=False) create_python_bindings_sharded( - fm, functions, is_py_torch_function, 'torch', 'python_torch_functions.cpp', - method=False, num_shards=3) + fm, + functions, + is_py_torch_function, + "torch", + "python_torch_functions.cpp", + method=False, + num_shards=3, + ) create_python_bindings( - fm, functions, is_py_nn_function, 'torch.nn', 'python_nn_functions.cpp', method=False) + fm, + functions, + is_py_nn_function, + "torch.nn", + "python_nn_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_fft_function, 'torch.fft', 'python_fft_functions.cpp', method=False) + fm, + functions, + is_py_fft_function, + 
"torch.fft", + "python_fft_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_linalg_function, 'torch.linalg', 'python_linalg_functions.cpp', method=False) + fm, + functions, + is_py_linalg_function, + "torch.linalg", + "python_linalg_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_sparse_function, 'torch.sparse', 'python_sparse_functions.cpp', method=False) + fm, + functions, + is_py_sparse_function, + "torch.sparse", + "python_sparse_functions.cpp", + method=False, + ) create_python_bindings( - fm, functions, is_py_special_function, 'torch.special', 'python_special_functions.cpp', method=False) + fm, + functions, + is_py_special_function, + "torch.special", + "python_special_functions.cpp", + method=False, + ) # Currently, we only use `functions` to generate `return_types` bindings. # All methods which return namedtuple have function variant at this point. # If any method only operator with namedtuple is added in the future, # we will have to address that. create_python_return_type_bindings( - fm, functions, lambda fn: True, 'python_return_types.cpp') + fm, functions, lambda fn: True, "python_return_types.cpp" + ) + def group_filter_overloads( pairs: Sequence[PythonSignatureNativeFunctionPair], - pred: Callable[[NativeFunction], bool] + pred: Callable[[NativeFunction], bool], ) -> Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]: - grouped: Dict[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] = defaultdict(list) + grouped: Dict[ + BaseOperatorName, List[PythonSignatureNativeFunctionPair] + ] = defaultdict(list) for pair in pairs: if pred(pair.function): grouped[pair.function.func.name.name].append(pair) return grouped + def create_python_bindings( fm: FileManager, pairs: Sequence[PythonSignatureNativeFunctionPair], @@ -230,15 +358,20 @@ def create_python_bindings( py_methods.append(method_impl(name, module, overloads, method=method)) py_method_defs.append(method_def(name, module, overloads, method=method)) py_forwards.extend(forward_decls(name, overloads, method=method)) - ops_headers.append(f'#include ') + ops_headers.append(f"#include ") + + fm.write_with_template( + filename, + filename, + lambda: { + "generated_comment": "@" + f"generated from {fm.template_dir}/{filename}", + "ops_headers": ops_headers, + "py_forwards": py_forwards, + "py_methods": py_methods, + "py_method_defs": py_method_defs, + }, + ) - fm.write_with_template(filename, filename, lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/{filename}', - 'ops_headers': ops_headers, - 'py_forwards': py_forwards, - 'py_methods': py_methods, - 'py_method_defs': py_method_defs, - }) def create_python_return_type_bindings( fm: FileManager, @@ -257,15 +390,24 @@ def create_python_return_type_bindings( for name in sorted(grouped.keys(), key=lambda x: str(x)): overloads = grouped[name] - definitions, map_entries = generate_return_type_definition_and_map_entry(overloads) - py_return_types_definition.append("" if not definitions else "\n".join(definitions)) + definitions, map_entries = generate_return_type_definition_and_map_entry( + overloads + ) + py_return_types_definition.append( + "" if not definitions else "\n".join(definitions) + ) py_return_types_map.append("" if not map_entries else "\n".join(map_entries)) - fm.write_with_template(filename, filename, lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/{filename}', - 'py_return_types': py_return_types_definition, - 'py_return_types_map' : 
py_return_types_map, - }) + fm.write_with_template( + filename, + filename, + lambda: { + "generated_comment": "@" + f"generated from {fm.template_dir}/{filename}", + "py_return_types": py_return_types_definition, + "py_return_types_map": py_return_types_map, + }, + ) + def create_python_bindings_sharded( fm: FileManager, @@ -275,12 +417,14 @@ def create_python_bindings_sharded( filename: str, *, method: bool, - num_shards: int + num_shards: int, ) -> None: """Generates Python bindings to ATen functions""" grouped = group_filter_overloads(pairs, pred) - def key_func(kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]]) -> str: + def key_func( + kv: Tuple[BaseOperatorName, List[PythonSignatureNativeFunctionPair]] + ) -> str: return kv[0].base def env_func( @@ -288,25 +432,25 @@ def env_func( ) -> Dict[str, List[str]]: name, fn_pairs = kv return { - 'ops_headers': [f'#include '], - 'py_forwards': list(forward_decls(name, fn_pairs, method=method)), - 'py_methods': [method_impl(name, module, fn_pairs, method=method)], - 'py_method_defs': [method_def(name, module, fn_pairs, method=method)], + "ops_headers": [f"#include "], + "py_forwards": list(forward_decls(name, fn_pairs, method=method)), + "py_methods": [method_impl(name, module, fn_pairs, method=method)], + "py_method_defs": [method_def(name, module, fn_pairs, method=method)], } fm.write_sharded( filename, grouped.items(), base_env={ - 'generated_comment': - '@' + f'generated from {fm.template_dir}/{filename}', + "generated_comment": "@" + f"generated from {fm.template_dir}/{filename}", }, key_fn=key_func, env_callable=env_func, num_shards=num_shards, - sharded_keys={'ops_headers', 'py_forwards', 'py_methods', 'py_method_defs'} + sharded_keys={"ops_headers", "py_forwards", "py_methods", "py_method_defs"}, ) + def load_signatures( native_functions: List[NativeFunction], deprecated_yaml_path: str, @@ -315,7 +459,6 @@ def load_signatures( skip_deprecated: bool = False, pyi: bool = False, ) -> Sequence[PythonSignatureNativeFunctionPair]: - @with_native_function def gen_signature_pairs(f: NativeFunction) -> PythonSignatureNativeFunctionPair: return PythonSignatureNativeFunctionPair( @@ -324,9 +467,12 @@ def gen_signature_pairs(f: NativeFunction) -> PythonSignatureNativeFunctionPair: ) pairs = list(map(gen_signature_pairs, native_functions)) - deprecated = load_deprecated_signatures(pairs, deprecated_yaml_path, method=method, pyi=pyi) + deprecated = load_deprecated_signatures( + pairs, deprecated_yaml_path, method=method, pyi=pyi + ) return pairs if skip_deprecated else pairs + deprecated + def load_deprecated_signatures( pairs: Sequence[PythonSignatureNativeFunctionPair], deprecated_yaml_path: str, @@ -345,28 +491,35 @@ def signature_original(f: NativeFunction) -> str: # remove inplace suffix but keep outplace suffix opname = str(f.func.name.name.base) if f.func.is_out_fn(): - opname += '_out' + opname += "_out" if f.func.name.name.inplace and pyi: - opname += '_' - args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments() + opname += "_" + args = CppSignatureGroup.from_native_function( + f, method=False + ).signature.arguments() # Simply ignore TensorOptionsArguments as it does not exist in deprecated.yaml. 
- types = ', '.join(argument_type_str(a.argument.type) - for a in args if isinstance(a.argument, Argument)) - return f'{opname}({types})' + types = ", ".join( + argument_type_str(a.argument.type) + for a in args + if isinstance(a.argument, Argument) + ) + return f"{opname}({types})" # deprecated -> type-only native signature (according to the call order) - def signature_deprecated(opname: str, params: List[str], call_args: List[str]) -> str: + def signature_deprecated( + opname: str, params: List[str], call_args: List[str] + ) -> str: # create a mapping of parameter name to parameter type types: Dict[str, str] = {} for param in params: - if param == '*': + if param == "*": continue - type, name = param.split(' ') + type, name = param.split(" ") types[name] = type # if the name in the call is not in the parameter list, assume it's # a literal Scalar - rearranged_types = ', '.join(types.get(arg, 'Scalar') for arg in call_args) - return f'{opname}({rearranged_types})' + rearranged_types = ", ".join(types.get(arg, "Scalar") for arg in call_args) + return f"{opname}({rearranged_types})" # group the original ATen signatures by type-only signature grouped: Dict[str, List[PythonSignatureNativeFunctionPair]] = defaultdict(list) @@ -376,12 +529,12 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - # find matching original signatures for each deprecated signature results: List[PythonSignatureNativeFunctionPair] = [] - with open(deprecated_yaml_path, 'r') as f: + with open(deprecated_yaml_path, "r") as f: deprecated_defs = yaml.load(f, Loader=YamlLoader) for deprecated in deprecated_defs: - _, params = split_name_params(deprecated['name']) - aten_name, call_args = split_name_params(deprecated['aten']) + _, params = split_name_params(deprecated["name"]) + aten_name, call_args = split_name_params(deprecated["aten"]) for pair in grouped[signature_deprecated(aten_name, params, call_args)]: # It uses the types from the original ATen declaration, but the @@ -392,12 +545,15 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - # but never changes output_args nor TensorOptions (if any?), # so here we only look into these two types of args. 
python_sig = pair.signature - src_args: Dict[str, PythonArgument] = {a.name: PythonArgument( - name=a.name, - type=a.type, - default=None, - default_init=None, - ) for a in itertools.chain(python_sig.input_args, python_sig.input_kwargs)} + src_args: Dict[str, PythonArgument] = { + a.name: PythonArgument( + name=a.name, + type=a.type, + default=None, + default_init=None, + ) + for a in itertools.chain(python_sig.input_args, python_sig.input_kwargs) + } args: List[str] = [] input_args: List[PythonArgument] = [] @@ -405,10 +561,10 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - kwarg_only = False for param in params: - if param == '*': + if param == "*": kwarg_only = True continue - _, param_name = param.split(' ') + _, param_name = param.split(" ") args.append(param_name) if param_name not in src_args: @@ -416,49 +572,56 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - continue if not kwarg_only: - if not method or param_name != 'self': + if not method or param_name != "self": input_args.append(src_args[param_name]) else: input_kwargs.append(src_args[param_name]) - results.append(PythonSignatureNativeFunctionPair( - signature=PythonSignatureDeprecated( - name=python_sig.name, - input_args=tuple(input_args), - input_kwargs=tuple(input_kwargs), - output_args=python_sig.output_args, - tensor_options_args=python_sig.tensor_options_args, - method=python_sig.method, - deprecated_args_names=tuple(args), - deprecated_args_exprs=tuple(call_args), - returns=python_sig.returns, - ), - function=pair.function, - )) + results.append( + PythonSignatureNativeFunctionPair( + signature=PythonSignatureDeprecated( + name=python_sig.name, + input_args=tuple(input_args), + input_kwargs=tuple(input_kwargs), + output_args=python_sig.output_args, + tensor_options_args=python_sig.tensor_options_args, + method=python_sig.method, + deprecated_args_names=tuple(args), + deprecated_args_exprs=tuple(call_args), + returns=python_sig.returns, + ), + function=pair.function, + ) + ) return results + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Named Tuple Codegen # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + @with_native_function def gen_namedtuple_typename_key(f: NativeFunction) -> str: name = cpp.name(f.func) fieldnames = namedtuple_fieldnames(f.func.returns) - return '_'.join([name] + fieldnames) + return "_".join([name] + fieldnames) + def emit_namedtuple_call( - overloads: Sequence[PythonSignatureNativeFunctionPair] + overloads: Sequence[PythonSignatureNativeFunctionPair], ) -> Tuple[List[str], Dict[str, str]]: """ Generate block of named tuple type def inits, and add typeref snippets to declarations that use them """ - typenames: Dict[str, str] = {} # map from unique name + field name lists to typedef name - typedefs: List[str] = [] # typedef declarations and init code + typenames: Dict[ + str, str + ] = {} # map from unique name + field name lists to typedef name + typedefs: List[str] = [] # typedef declarations and init code for overload in overloads: fieldnames = namedtuple_fieldnames(overload.function.func.returns) @@ -471,8 +634,10 @@ def emit_namedtuple_call( if typename is None: typename = f'NamedTuple{"" if not typedefs else len(typedefs)}' typenames[tn_key] = typename - typedefs.append(f"""\ -static PyTypeObject* {typename} = get_namedtuple("{name}");""") + typedefs.append( + f"""\ +static PyTypeObject* {typename} = get_namedtuple("{name}");""" + ) return typedefs, typenames @@ 
-485,16 +650,20 @@ def generate_return_type_definition_and_map_entry( and return named tuple for a native function which returns named tuple and relevant entry for the map in same file. """ - typenames: Dict[str, str] = {} # map from unique name + field name lists to typedef name + typenames: Dict[ + str, str + ] = {} # map from unique name + field name lists to typedef name definitions: List[str] = [] # function defintion to register the typedef - map_entries: List[str] = [] # C++ map entry of + map_entries: List[ + str + ] = [] # C++ map entry of for overload in overloads: fieldnames = namedtuple_fieldnames(overload.function.func.returns) if not fieldnames: continue - fields = ', '.join(f'{{"{fn}", ""}}' for fn in fieldnames) + fields = ", ".join(f'{{"{fn}", ""}}' for fn in fieldnames) name = cpp.name(overload.function.func) # use @with_native_function? tn_key = gen_namedtuple_typename_key(overload.function) @@ -503,7 +672,8 @@ def generate_return_type_definition_and_map_entry( if typename is None: typename = f'{name}NamedTuple{"" if not definitions else len(definitions)}' typenames[tn_key] = typename - definitions.append(f"""\ + definitions.append( + f"""\ PyTypeObject* get_{name}_namedtuple() {{ static PyStructSequence_Field NamedTuple_fields[] = {{ {fields}, {{nullptr}} }}; static PyTypeObject {typename}; @@ -516,11 +686,13 @@ def generate_return_type_definition_and_map_entry( }} return &{typename}; }} -""") +""" + ) map_entries.append(f'{{"{name}", get_{name}_namedtuple()}}, ') return definitions, map_entries + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Method Impl Codegen @@ -528,7 +700,8 @@ def generate_return_type_definition_and_map_entry( # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # python binding for all overloads of a particular function/method -PY_VARIABLE_METHOD_VARARGS = CodeTemplate(r"""\ +PY_VARIABLE_METHOD_VARARGS = CodeTemplate( + r"""\ // ${name} static PyObject * ${pycname}(PyObject* self_, PyObject* args, PyObject* kwargs) { @@ -546,19 +719,23 @@ def generate_return_type_definition_and_map_entry( ${method_footer} } -""") +""" +) # handler for a single parsed signature - may be a single overload or # a pair of overloads that whose signatures only differ in output params # (plugged into PY_VARIABLE_METHOD_VARARGS as an item in ${dispatch}) -PY_VARIABLE_CASE = CodeTemplate("""\ +PY_VARIABLE_CASE = CodeTemplate( + """\ case ${overload_index}: { ${body} } -""") +""" +) # python binding for single-overload function/method -PY_VARIABLE_METHOD_VARARGS_SINGLETON = CodeTemplate("""\ +PY_VARIABLE_METHOD_VARARGS_SINGLETON = CodeTemplate( + """\ // ${name} static PyObject * ${pycname}(PyObject* self_, PyObject* args, PyObject* kwargs) { @@ -574,10 +751,12 @@ def generate_return_type_definition_and_map_entry( ${method_footer} } -""") +""" +) # python binding for a method with no args, shortcuts parsing -PY_VARIABLE_METHOD_NOARGS = CodeTemplate("""\ +PY_VARIABLE_METHOD_NOARGS = CodeTemplate( + """\ // ${name} static PyObject * ${pycname}(PyObject* self_, PyObject* args) { @@ -587,14 +766,16 @@ def generate_return_type_definition_and_map_entry( ${method_footer} } -""") +""" +) + def method_impl( name: BaseOperatorName, module: Optional[str], overloads: Sequence[PythonSignatureNativeFunctionPair], *, - method: bool + method: bool, ) -> str: """ Generate a python binding for all overloads of an op. 
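For context on what the named-tuple codegen in the hunks above ultimately produces: operators whose schema names their returns are exposed to Python as struct-sequence objects with named fields. A short usage sketch, assuming an installed PyTorch build (the exact printed type name depends on the build):

# The PyStructSequence types registered by the generated get_*_namedtuple()
# functions above are what back these named-field results.
import torch

x = torch.arange(6.0).reshape(2, 3)
out = torch.max(x, dim=1)       # binding generated from the max.dim schema
print(type(out))                # e.g. <class 'torch.return_types.max'>
print(out.values, out.indices)  # fields named after the schema's return names
print(out[0])                   # indexing still works like a plain tuple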
@@ -603,15 +784,15 @@ def method_impl( noarg = is_noarg(overloads) namedtuple_inits, namedtuple_typenames = emit_namedtuple_call(overloads) - method_header = ['HANDLE_TH_ERRORS'] + method_header = ["HANDLE_TH_ERRORS"] method_header += namedtuple_inits - method_header += [ - "const Tensor& self = THPVariable_Unpack(self_);" - ] if method else [] + method_header += ( + ["const Tensor& self = THPVariable_Unpack(self_);"] if method else [] + ) - method_footer = ([] if noarg else ['Py_RETURN_NONE;']) + ['END_HANDLE_TH_ERRORS'] + method_footer = ([] if noarg else ["Py_RETURN_NONE;"]) + ["END_HANDLE_TH_ERRORS"] - traceable = 'true' if all(should_trace(o.function) for o in overloads) else 'false' + traceable = "true" if all(should_trace(o.function) for o in overloads) else "false" grouped_overloads: Sequence[PythonSignatureGroup] = group_overloads(overloads) is_singleton = len(grouped_overloads) == 1 @@ -619,11 +800,15 @@ def method_impl( dispatch: List[str] = [] for overload_index, overload in enumerate(grouped_overloads): signature = overload.signature.signature_str() - signatures.append(f'{cpp_string(str(signature))},') + signatures.append(f"{cpp_string(str(signature))},") dispatch_body = emit_dispatch_case(overload, namedtuple_typenames) dispatch.append( - PY_VARIABLE_CASE.substitute(overload_index=overload_index, body=dispatch_body) - if not is_singleton else dispatch_body) + PY_VARIABLE_CASE.substitute( + overload_index=overload_index, body=dispatch_body + ) + if not is_singleton + else dispatch_body + ) if noarg: template = PY_VARIABLE_METHOD_NOARGS @@ -650,6 +835,7 @@ def method_impl( self_="self_" if method else "nullptr", ) + def gen_has_torch_function_check( name: BaseOperatorName, module: Optional[str], *, noarg: bool, method: bool ) -> str: @@ -661,17 +847,21 @@ def gen_has_torch_function_check( }} """ else: - return '' + return "" self_ = "self_" if method else "nullptr" - namespace = { - "torch": "THPVariableFunctionsModule", - "torch.nn": "THPNNVariableFunctionsModule", - "torch.fft": "THPFFTVariableFunctionsModule", - "torch.linalg": "THPLinalgVariableFunctionsModule", - "torch.sparse": "THPSparseVariableFunctionsModule", - "torch.special": "THPSpecialVariableFunctionsModule", - }[module] if module else "THPVariableClass" + namespace = ( + { + "torch": "THPVariableFunctionsModule", + "torch.nn": "THPNNVariableFunctionsModule", + "torch.fft": "THPFFTVariableFunctionsModule", + "torch.linalg": "THPLinalgVariableFunctionsModule", + "torch.sparse": "THPSparseVariableFunctionsModule", + "torch.special": "THPSpecialVariableFunctionsModule", + }[module] + if module + else "THPVariableClass" + ) return f"""\ if(_r.has_torch_function()) {{ @@ -679,14 +869,18 @@ def gen_has_torch_function_check( }} """ + # handler for output/no-output overload pair -PY_VARIABLE_OUT = CodeTemplate("""\ +PY_VARIABLE_OUT = CodeTemplate( + """\ if (_r.isNone(${out_idx})) { ${call_dispatch} } else { ${call_dispatch_out} } -""") +""" +) + def emit_dispatch_case( overload: PythonSignatureGroup, @@ -703,14 +897,18 @@ def emit_dispatch_case( return PY_VARIABLE_OUT.substitute( out_idx=overload.signature.output_idx(), call_dispatch=emit_single_dispatch( - overload.signature, overload.base, namedtuple_typenames), + overload.signature, overload.base, namedtuple_typenames + ), call_dispatch_out=emit_single_dispatch( - overload.signature, overload.outplace, namedtuple_typenames), + overload.signature, overload.outplace, namedtuple_typenames + ), ) else: # no-output version only return emit_single_dispatch( - 
overload.signature, overload.base, namedtuple_typenames) + overload.signature, overload.base, namedtuple_typenames + ) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -718,24 +916,30 @@ def emit_dispatch_case( # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def forward_decls( name: BaseOperatorName, overloads: Sequence[PythonSignatureNativeFunctionPair], *, - method: bool + method: bool, ) -> Tuple[str, ...]: if method: return () pycname = get_pycname(name) if is_noarg(overloads): - return (f"""\ + return ( + f"""\ static PyObject * {pycname}(PyObject* self_, PyObject* args); -""",) +""", + ) else: - return (f"""\ + return ( + f"""\ static PyObject * {pycname}(PyObject* self_, PyObject* args, PyObject* kwargs); -""",) +""", + ) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -743,12 +947,13 @@ def forward_decls( # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def method_def( name: BaseOperatorName, module: Optional[str], overloads: Sequence[PythonSignatureNativeFunctionPair], *, - method: bool + method: bool, ) -> str: """ Generate method def entry. @@ -756,14 +961,14 @@ def method_def( pycname = get_pycname(name) if is_noarg(overloads): - pyfunc_cast = '' - flags = 'METH_NOARGS' if method else 'METH_VARARGS | METH_KEYWORDS' + pyfunc_cast = "" + flags = "METH_NOARGS" if method else "METH_VARARGS | METH_KEYWORDS" else: - pyfunc_cast = 'castPyCFunctionWithKeywords' - flags = 'METH_VARARGS | METH_KEYWORDS' + pyfunc_cast = "castPyCFunctionWithKeywords" + flags = "METH_VARARGS | METH_KEYWORDS" if module == "torch": - flags += ' | METH_STATIC' + flags += " | METH_STATIC" if name.dunder_method: # PyMethodDef entry for binary op, throws not implemented error @@ -774,12 +979,14 @@ def method_def( return f"""\ {{"{name}", {pyfunc_cast}({pycname}), {flags}, NULL}},""" + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Overload Sorting and Grouping # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def group_overloads( overloads: Sequence[PythonSignatureNativeFunctionPair], ) -> Sequence[PythonSignatureGroup]: @@ -792,15 +999,15 @@ def group_overloads( if overload.function.func.is_out_fn(): if sig in outplaces: raise RuntimeError( - f'Found duplicated function definition:\n- {overload.function.func}.\n' - f'Existing definition:\n- {outplaces[sig].function.func}.' + f"Found duplicated function definition:\n- {overload.function.func}.\n" + f"Existing definition:\n- {outplaces[sig].function.func}." ) outplaces[sig] = overload else: if sig in bases: raise RuntimeError( - f'Found duplicated function definition:\n- {overload.function.func}.\n' - f'Existing definition:\n- {bases[sig].function.func}.' + f"Found duplicated function definition:\n- {overload.function.func}.\n" + f"Existing definition:\n- {bases[sig].function.func}." 
) bases[sig] = overload @@ -808,30 +1015,41 @@ def group_overloads( if sig not in bases: candidates: List[str] = [] for overload in overloads: - if str(overload.function.func.name.name) == str(out.function.func.name.name) \ - and not overload.function.func.is_out_fn() \ - and not overload.signature.deprecated: - candidates.append(overload.signature.signature_str(skip_outputs=True)) + if ( + str(overload.function.func.name.name) + == str(out.function.func.name.name) + and not overload.function.func.is_out_fn() + and not overload.signature.deprecated + ): + candidates.append( + overload.signature.signature_str(skip_outputs=True) + ) out_sig = out.signature.signature_str() raise RuntimeError( - f'While identifying overloads, we found an out schema {out_sig} without a corresponding non-out variant. ' - f'We expected the non-out variant to have schema: \n- {sig}\nPlease check that you spelled the schema ' - 'correctly in native_functions.yaml. We discovered the following candidate(s): \n' - + '\n'.join(f'- {candidate}' for candidate in candidates)) + f"While identifying overloads, we found an out schema {out_sig} without a corresponding non-out variant. " + f"We expected the non-out variant to have schema: \n- {sig}\nPlease check that you spelled the schema " + "correctly in native_functions.yaml. We discovered the following candidate(s): \n" + + "\n".join(f"- {candidate}" for candidate in candidates) + ) grouped: List[PythonSignatureGroup] = [] for sig, base in bases.items(): outplace = outplaces.get(sig) - grouped.append(PythonSignatureGroup( - # prefer the signature with optional out=... arguments because it's the - # superset that can be used to parse input for both base and outplace. - signature=outplace.signature if outplace is not None else base.signature, - base=base.function, - outplace=outplace.function if outplace is not None else None, - )) + grouped.append( + PythonSignatureGroup( + # prefer the signature with optional out=... arguments because it's the + # superset that can be used to parse input for both base and outplace. + signature=outplace.signature + if outplace is not None + else base.signature, + base=base.function, + outplace=outplace.function if outplace is not None else None, + ) + ) return sort_overloads(grouped) + # This function declares a partial order on declarations, and sorts them according # to its linear extension. This is necessary, because there's some ambiguity in the # choice of overload, and we want a different order. @@ -876,20 +1094,29 @@ def group_overloads( # foo(Tensor other, *, Scalar alpha=1, Scalar beta=1) # + def sort_overloads( - grouped_overloads: Sequence[PythonSignatureGroup] + grouped_overloads: Sequence[PythonSignatureGroup], ) -> Sequence[PythonSignatureGroup]: - def is_arg_smaller(t1: Type, t2: Type) -> bool: - return (str(t1) == 'Scalar' and str(t2) == 'Tensor' or - 'Dimname' in str(t1) and 'Dimname' not in str(t2) or - # In the discussion https://github.com/pytorch/pytorch/issues/54555 it has been - # discussed why it is important to prioritize int/int? over int[] - str(t1) == 'int[]' and (str(t2) == 'int' or str(t2) == 'int?') or - # TensorList currently throws an error during argument parsing, that's why it needs to be - # last in signature ordering. See discussion: https://github.com/pytorch/pytorch/issues/58087 - str(t1) == 'Tensor[]' and str(t2).find("[]") != -1) - + return ( + str(t1) == "Scalar" + and str(t2) == "Tensor" + or str(t1) == "Scalar?" + and str(t2) == "Tensor?" 
+ or "Dimname" in str(t1) + and "Dimname" not in str(t2) + or + # In the discussion https://github.com/pytorch/pytorch/issues/54555 it has been + # discussed why it is important to prioritize int/int? over int[] + str(t1) == "int[]" + and (str(t2) == "int" or str(t2) == "int?") + or + # TensorList currently throws an error during argument parsing, that's why it needs to be + # last in signature ordering. See discussion: https://github.com/pytorch/pytorch/issues/58087 + str(t1) == "Tensor[]" + and str(t2).find("[]") != -1 + ) def is_smaller(s1: PythonSignature, s2: PythonSignature) -> bool: """Returns True if s1 < s2 in the partial order.""" @@ -900,13 +1127,16 @@ def is_smaller(s1: PythonSignature, s2: PythonSignature) -> bool: # above. The old codegen used the deprecated 'dynamic_type(arg.type)', which # ignores the optional annotation, i.e. 'Scalar' and 'Scalar?'. equal = all(arg1.type == arg2.type for arg1, arg2 in zip(args1, args2)) - smaller_or_equal = all(str(arg1.type) == str(arg2.type) - or is_arg_smaller(arg1.type, arg2.type) - for arg1, arg2 in zip(args1, args2)) + smaller_or_equal = all( + str(arg1.type) == str(arg2.type) or is_arg_smaller(arg1.type, arg2.type) + for arg1, arg2 in zip(args1, args2) + ) return smaller_or_equal and not equal # First sort by signature - grouped_overloads = sorted(grouped_overloads, key=lambda x: x.signature.signature_str()) + grouped_overloads = sorted( + grouped_overloads, key=lambda x: x.signature.signature_str() + ) # Construct the relation graph larger_than: Dict[int, Set[int]] = defaultdict(set) @@ -934,39 +1164,43 @@ def is_smaller(s1: PythonSignature, s2: PythonSignature) -> bool: return list(map(lambda x: grouped_overloads[x], sorted_ids)) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Codegen API Integration # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + def emit_single_dispatch( ps: PythonSignature, f: NativeFunction, namedtuple_typenames: Dict[str, str] ) -> str: """ Emit dispatch code for a single native function. """ + @with_native_function def go(f: NativeFunction) -> str: # header comments - deprecated = '[deprecated] ' if ps.deprecated else '' - schema_comment = f'// {deprecated}aten::{f.func}' + deprecated = "[deprecated] " if ps.deprecated else "" + schema_comment = f"// {deprecated}aten::{f.func}" # dispatch lambda signature name = cpp.name(f.func) - lambda_formals = ', '.join(map(lambda a: f"{a.type_str} {a.name}", - dispatch_lambda_args(ps, f))) + lambda_formals = ", ".join( + map(lambda a: f"{a.type_str} {a.name}", dispatch_lambda_args(ps, f)) + ) lambda_return = dispatch_lambda_return_str(f) # dispatch lambda body dispatch_callee = cpp_dispatch_target(f) - dispatch_args = ', '.join(cpp_dispatch_exprs(f, python_signature=ps)) + dispatch_args = ", ".join(cpp_dispatch_exprs(f, python_signature=ps)) # from arg parser outputs to dispatch lambda arguments parser_outputs = arg_parser_output_exprs(ps, f) lambda_arg_exprs = dispatch_lambda_exprs(ps, f) - inits = '\n'.join(lambda_arg_exprs.inits) - lambda_args = ', '.join(lambda_arg_exprs.exprs) + inits = "\n".join(lambda_arg_exprs.inits) + lambda_args = ", ".join(lambda_arg_exprs.exprs) # scatter fields # TODO: Checking `ps.method and ('requires_grad' in parser_outputs)` is a hacky @@ -974,12 +1208,17 @@ def go(f: NativeFunction) -> str: # new_full, new_empty, and new_zeros. 
A much better but more difficult to # implement solution involves refactoring according to Ed's description here: # https://github.com/pytorch/pytorch/issues/36455#issuecomment-614767589 - need_set_requires_grad = ps.tensor_options_args and (not has_tensor_options(f) or ( - ps.method and ('requires_grad' in parser_outputs))) - set_requires_grad = f'.set_requires_grad({parser_outputs["requires_grad"].expr})' \ - if need_set_requires_grad else '' + need_set_requires_grad = ps.tensor_options_args and ( + not has_tensor_options(f) + or (ps.method and ("requires_grad" in parser_outputs)) + ) + set_requires_grad = ( + f'.set_requires_grad({parser_outputs["requires_grad"].expr})' + if need_set_requires_grad + else "" + ) - if lambda_return == 'void': + if lambda_return == "void": return f"""\ {schema_comment} {inits} @@ -992,7 +1231,7 @@ def go(f: NativeFunction) -> str: """ else: typename = namedtuple_typenames.get(gen_namedtuple_typename_key(f)) - namedtuple_typeref = f'{typename}, ' if typename is not None else '' + namedtuple_typeref = f"{typename}, " if typename is not None else "" return f"""\ {schema_comment} {inits} diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 1b9cc7eec294..8072c6cad2d9 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -1,13 +1,17 @@ import itertools from typing import List, Sequence, Union, Dict -from tools.codegen.api.types import DispatcherSignature -from tools.codegen.api import cpp -from tools.codegen.code_template import CodeTemplate -from tools.codegen.context import with_native_function -from tools.codegen.utils import FileManager -from tools.codegen.model import (Argument, NativeFunction, SchemaKind, - TensorOptionsArguments) +from torchgen.api.types import DispatcherSignature +from torchgen.api import cpp +from torchgen.code_template import CodeTemplate +from torchgen.context import with_native_function +from torchgen.utils import FileManager +from torchgen.model import ( + Argument, + NativeFunction, + SchemaKind, + TensorOptionsArguments, +) # Note [Manual Backend kernels] # For these ops, we want to manually register to dispatch key Backend and @@ -19,16 +23,33 @@ # - all ops below are part of MANUAL_TRACER to skip codegen Tracer kernel registration # Note: we still register to dispatch key Profiler for these ops, keeping it untouched for now. # You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp -MANUAL_BACKEND = set([ - 'options', 'data', 'set_data', 'is_leaf', 'output_nr', '_version', 'retain_grad', - '_backward', 'requires_grad_', -]) +MANUAL_BACKEND = set( + [ + "options", + "data", + "set_data", + "is_leaf", + "output_nr", + "_version", + "retain_grad", + "_backward", + "requires_grad_", + ] +) # For these ops we want to skip the codegen-ed registration to both Autograd and Tracer keys. # You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp -MANUAL_AUTOGRAD_AND_TRACER = set([ - 'resize_', 'resize_as_', 'detach', 'detach_', 'copy_', '_fw_primal', '_make_dual', -]) +MANUAL_AUTOGRAD_AND_TRACER = set( + [ + "resize_", + "resize_as_", + "detach", + "detach_", + "copy_", + "_fw_primal", + "_make_dual", + ] +) # Currently MANUAL_AUTOGRAD and MANUAL_TRACER share the same set of ops: # union(MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER) @@ -41,45 +62,65 @@ # on demand. Only concrete ATen methods can be disabled this way; it will have # NO EFFECT otherwise. 
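# Illustrative sketch (not part of this diff; simplified stand-in types instead
# of the real NativeFunction model): the three checks the tracer codegen applies
# before emitting a tracing kernel, mirroring should_trace() below.
DONT_RECORD_TRACE_EXAMPLE = {"convolution", "lstm_cell", "_coalesced"}

def should_trace_sketch(base_name, arg_types, return_types):
    if any(t in {"Storage", "Type", "ConstQuantizerPtr"} for t in arg_types):
        return False                      # Storage/Type arguments are not traceable
    if not any(t.startswith("Tensor") for t in return_types):
        return False                      # nothing tensor-like to record as an output
    return base_name not in DONT_RECORD_TRACE_EXAMPLE

assert should_trace_sketch("add", ["Tensor", "Tensor", "Scalar"], ["Tensor"])
assert not should_trace_sketch("convolution", ["Tensor", "Tensor"], ["Tensor"])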
DONT_RECORD_TRACE = { - 'convolution', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', - 'conv_transpose2d', 'conv_transpose3d', 'lstm_cell', 'gru_cell', - 'rnn_tanh_cell', 'rnn_relu_cell', + "convolution", + "conv1d", + "conv2d", + "conv3d", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + "lstm_cell", + "gru_cell", + "rnn_tanh_cell", + "rnn_relu_cell", # FIXME: figure out a better way when we support sparse tensors in jit - '_coalesced', + "_coalesced", } + def should_trace(f: NativeFunction) -> bool: # Operations involving Storage or Type are not traceable at the moment - if any(str(arg.type) in {'Storage', 'Type', 'ConstQuantizerPtr'} - for arg in f.func.schema_order_arguments()): + if any( + str(arg.type) in {"Storage", "Type", "ConstQuantizerPtr"} + for arg in f.func.schema_order_arguments() + ): return False # We can't trace functions which don't have any Tensor or TensorList returns if not any(r.type.is_tensor_like() for r in f.func.returns): return False return f.func.name.name.base not in DONT_RECORD_TRACE -SELECT = CodeTemplate("""\ + +SELECT = CodeTemplate( + """\ if (${cond}) { ${true} } else { ${false} } -""") +""" +) -OP_NAME = CodeTemplate("""\ +OP_NAME = CodeTemplate( + """\ op_name = c10::Symbol::fromQualString("aten::${trace_name}"); -""") +""" +) # These functions have their names recorded under trace renamed, RENAME_TRACE = { - 'zero': 'zeros_like', # replacing aten::zero_ with aten::zeros_like - 'fill': 'full_like', # replacing aten::fill_ with aten::full_like + "zero": "zeros_like", # replacing aten::zero_ with aten::zeros_like + "fill": "full_like", # replacing aten::fill_ with aten::full_like } + def format_trace_op_name(f: NativeFunction) -> str: # TODO: byte-for-byte compatible with old codegen behavior - should clean up - if f.func.kind() in (SchemaKind.functional, SchemaKind.out) or f.func.name.name.dunder_method: + if ( + f.func.kind() in (SchemaKind.functional, SchemaKind.out) + or f.func.name.name.dunder_method + ): # special case for *_out functions: the in-place and out-of-place ops # are overloaded with the same name in the JIT trace_name = str(f.func.name.name) @@ -94,32 +135,39 @@ def format_trace_op_name(f: NativeFunction) -> str: inplace_trace_name = RENAME_TRACE.get(inplace_trace_name, inplace_trace_name) return SELECT.substitute( - cond='tracer_state->force_outplace', + cond="tracer_state->force_outplace", true=OP_NAME.substitute(trace_name=outplace_trace_name), false=OP_NAME.substitute(trace_name=inplace_trace_name), ) + ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${name}", ${input});""") -def format_trace_inputs(f: NativeFunction) -> str: - def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequence[str]: +def format_trace_inputs(f: NativeFunction) -> str: + def dispatch_trace_input( + arg: Union[Argument, TensorOptionsArguments] + ) -> Sequence[str]: if isinstance(arg, TensorOptionsArguments): - name = 'options' + name = "options" return [ - ADD_TRACE_INPUT.substitute(name=name, input='optTypeMetaToScalarType(options.dtype_opt())'), - ADD_TRACE_INPUT.substitute(name=name, input='options.layout()'), - ADD_TRACE_INPUT.substitute(name=name, input='options.device()'), - ADD_TRACE_INPUT.substitute(name=name, input='options.pinned_memory()'), + ADD_TRACE_INPUT.substitute( + name=name, input="optTypeMetaToScalarType(options.dtype_opt())" + ), + ADD_TRACE_INPUT.substitute(name=name, input="options.layout()"), + ADD_TRACE_INPUT.substitute(name=name, input="options.device()"), + 
ADD_TRACE_INPUT.substitute(name=name, input="options.pinned_memory()"), ] else: name = arg.name - if str(arg.type) == 'Tensor?[]': + if str(arg.type) == "Tensor?[]": return [f'jit::tracer::addInputs(node, "{name}", {name});'] else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] - args: List[Union[Argument, TensorOptionsArguments]] = list(f.func.schema_order_arguments()) + args: List[Union[Argument, TensorOptionsArguments]] = list( + f.func.schema_order_arguments() + ) if f.func.is_out_fn(): # *_out functions take the result as a separate argument, but we don't want to @@ -129,7 +177,9 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # there is only one output argument. args = args[:-1] - trace_inputs = itertools.chain.from_iterable(dispatch_trace_input(arg) for arg in args) + trace_inputs = itertools.chain.from_iterable( + dispatch_trace_input(arg) for arg in args + ) if f.func.is_out_fn(): # for *_out functions, handle the result argument differently for inplace/outplace. @@ -141,32 +191,49 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # Factories are a bit special because their out-of-place overloads # take an extra TensorOptions argument, which is missing in the _out function has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) - has_tensor_input_arg = any(a.type.is_tensor_like() for a in f.func.arguments.flat_non_out) - is_factory_method = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) + has_tensor_input_arg = any( + a.type.is_tensor_like() for a in f.func.arguments.flat_non_out + ) + is_factory_method = f.category_override == "factory" or ( + has_tensor_return and not has_tensor_input_arg + ) # HACK: preserve old codegen behavior - the old codegen set the `is_factory_method` # flag for the whole family of ops with the same basename if any of them is a # factory method. For most cases the whole family of ops are indeed all factory # method - 'normal' is the only exception. So we handle it specially here to avoid # cloning the old logic. 
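# Illustrative sketch (hypothetical helper with simplified inputs; not part of
# this diff): the factory detection described above -- an op is treated as a
# factory when explicitly overridden as one, or when it returns a Tensor without
# taking any Tensor inputs, with 'normal' special-cased to preserve old codegen behavior.
def is_factory_method_sketch(base_name, category_override,
                             has_tensor_return, has_tensor_input_arg):
    if base_name == "normal":   # legacy special case kept from the old codegen
        return True
    return category_override == "factory" or (
        has_tensor_return and not has_tensor_input_arg
    )

# zeros(int[] size, ...) -> Tensor: tensor return, no tensor inputs -> factory
assert is_factory_method_sketch("zeros", None, True, False)
# add(Tensor self, Tensor other) -> Tensor: takes tensors -> not a factory
assert not is_factory_method_sketch("add", None, True, True)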
- if f.func.name.name.base == 'normal': + if f.func.name.name.base == "normal": is_factory_method = True if is_factory_method: outplace = [ - ADD_TRACE_INPUT.substitute(name='out', input='optTypeMetaToScalarType(out.options().dtype_opt())'), - ADD_TRACE_INPUT.substitute(name='out', input='out.options().layout()'), - ADD_TRACE_INPUT.substitute(name='out', input='out.options().device()'), - ADD_TRACE_INPUT.substitute(name='out', input='out.options().pinned_memory()'), + ADD_TRACE_INPUT.substitute( + name="out", + input="optTypeMetaToScalarType(out.options().dtype_opt())", + ), + ADD_TRACE_INPUT.substitute(name="out", input="out.options().layout()"), + ADD_TRACE_INPUT.substitute(name="out", input="out.options().device()"), + ADD_TRACE_INPUT.substitute( + name="out", input="out.options().pinned_memory()" + ), ] else: outplace = [] trace_inputs = itertools.chain( trace_inputs, - [SELECT.substitute(cond='tracer_state->force_outplace', true='\n'.join(outplace), false=inplace)]) + [ + SELECT.substitute( + cond="tracer_state->force_outplace", + true="\n".join(outplace), + false=inplace, + ) + ], + ) + + return "\n".join(trace_inputs) - return '\n'.join(trace_inputs) # `torch.jit.trace` have undocumented keyword argument `_force_outplace`, # which force jit to replace functions with outplace variants (for @@ -191,29 +258,32 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # - Or keep `aten::zeros_like` arguments aligned with `aten::zero_` # arguments (inside of the `native_functions.yaml`) RENAME_TRACE_ADD_ARGS = { - 'fill': '''\ + "fill": """\ jit::tracer::addInputs(node, "options", c10::optional()); jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); -''', - 'zero': '''\ +""", + "zero": """\ jit::tracer::addInputs(node, "options", c10::optional()); jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); -''', +""", } -INPLACE_GUARD = CodeTemplate("""\ +INPLACE_GUARD = CodeTemplate( + """\ jit::tracer::ensureUniqueIfOutOfPlaced("${name}", ${mutable_input}); -""") +""" +) -PRE_RECORD_TRACE = CodeTemplate("""\ +PRE_RECORD_TRACE = CodeTemplate( + """\ torch::jit::Node* node = nullptr; std::shared_ptr tracer_state; if (jit::tracer::isTracing()) { @@ -227,40 +297,59 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen ${inplace_guard} jit::tracer::setTracingState(nullptr); } -""") +""" +) + def format_prerecord_trace(f: NativeFunction) -> str: if not should_trace(f): - return '' + return "" # TODO: clean up old codegen behavior - is_inplace = f.func.kind() in (SchemaKind.inplace, SchemaKind.out) and not f.func.name.name.dunder_method - add_args = RENAME_TRACE_ADD_ARGS.get(f.func.name.name.base, '') if is_inplace else '' - additional_inputs = SELECT.substitute( - cond='tracer_state->force_outplace', - true=add_args, - false='', - ) if add_args else '' + is_inplace = ( + f.func.kind() in (SchemaKind.inplace, SchemaKind.out) + and not 
f.func.name.name.dunder_method + ) + add_args = ( + RENAME_TRACE_ADD_ARGS.get(f.func.name.name.base, "") if is_inplace else "" + ) + additional_inputs = ( + SELECT.substitute( + cond="tracer_state->force_outplace", + true=add_args, + false="", + ) + if add_args + else "" + ) return PRE_RECORD_TRACE.substitute( set_op_name=format_trace_op_name(f), add_trace_inputs=format_trace_inputs(f) + additional_inputs, inplace_guard=INPLACE_GUARD.substitute( name=cpp.name(f.func), - mutable_input=f.func.arguments.out[0].name if f.func.arguments.out else 'self', - ) if is_inplace else '', + mutable_input=f.func.arguments.out[0].name + if f.func.arguments.out + else "self", + ) + if is_inplace + else "", ) -POST_RECORD_TRACE = CodeTemplate("""\ + +POST_RECORD_TRACE = CodeTemplate( + """\ if (tracer_state) { jit::tracer::setTracingState(std::move(tracer_state)); ${add_trace_outputs} } -""") +""" +) + def format_postrecord_trace(f: NativeFunction) -> str: if not should_trace(f): - return '' + return "" # For outplacing ops, *_out overloads require special handling to move the # output *argument* to a return value @@ -271,29 +360,37 @@ def format_postrecord_trace(f: NativeFunction) -> str: # Code size optimization: the common case is that the return value is # the same for both variants if output_names_outplace == output_names_inplace: - outputs = [f'jit::tracer::addOutput(node, {n});' for n in output_names_outplace] + outputs = [ + f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace + ] return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs) selection = SELECT.substitute( - cond='force_outplace', - true='\n'.join(f'jit::tracer::addOutput(node, {n});' for n in output_names_outplace), - false='\n'.join(f'jit::tracer::addOutput(node, {n});' for n in output_names_inplace), + cond="force_outplace", + true="\n".join( + f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace + ), + false="\n".join( + f"jit::tracer::addOutput(node, {n});" for n in output_names_inplace + ), ) return POST_RECORD_TRACE.substitute(add_trace_outputs=selection) else: output_names = cpp.return_names(f) - outputs = [f'jit::tracer::addOutput(node, {n});' for n in output_names] + outputs = [f"jit::tracer::addOutput(node, {n});" for n in output_names] return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs) + def declare_returned_variables(f: NativeFunction) -> str: modifies_arguments = f.func.kind() in (SchemaKind.inplace, SchemaKind.out) if modifies_arguments: - return '' + return "" if len(f.func.returns) == 1: - return '' + return "" types = map(cpp.return_type, f.func.returns) names = cpp.return_names(f) - return '\n'.join(f'{type.cpp_type()} {name};' for type, name in zip(types, names)) + return "\n".join(f"{type.cpp_type()} {name};" for type, name in zip(types, names)) + def tie_return_values(f: NativeFunction) -> str: if len(f.func.returns) == 1: @@ -301,6 +398,7 @@ def tie_return_values(f: NativeFunction) -> str: names = cpp.return_names(f) return f'std::tie({", ".join(names)})' + def get_return_value(f: NativeFunction) -> str: names = cpp.return_names(f) if len(f.func.returns) == 1: @@ -308,11 +406,15 @@ def get_return_value(f: NativeFunction) -> str: if f.func.kind() == SchemaKind.out: return f'std::forward_as_tuple({", ".join(names)})' else: - moved = ", ".join(f'std::move({name})' for name in names) - return f'std::make_tuple({moved})' + moved = ", ".join(f"std::move({name})" for name in names) + return f"std::make_tuple({moved})" + + +TRACE_DISPATCH = CodeTemplate( + """\ 
+${assign_return_values}at::_ops::${unambiguous_name}::redispatch(${unpacked_args});""" +) -TRACE_DISPATCH = CodeTemplate("""\ -${assign_return_values}at::_ops::${unambiguous_name}::redispatch(${unpacked_args});""") def emit_trace_body(f: NativeFunction) -> List[str]: trace_body: List[str] = [] @@ -325,47 +427,59 @@ def emit_trace_body(f: NativeFunction) -> List[str]: # code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. - dispatch_key_set = 'ks & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Tracer)' - redispatch_args = ', '.join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) + dispatch_key_set = "ks & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Tracer)" + redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs]) - assign_return_values = f'{tie_return_values(f)} = ' \ - if f.func.kind() == SchemaKind.functional and f.func.returns else '' + assign_return_values = ( + f"{tie_return_values(f)} = " + if f.func.kind() == SchemaKind.functional and f.func.returns + else "" + ) # Note that this calls the slow, dispatching variants of manual_cpp_binding ops. # We could probably work harder to ensure that the fast variants are called instead, but the perf benefit would be minimal. - trace_body.append(TRACE_DISPATCH.substitute( - assign_return_values=assign_return_values, - unambiguous_name=f.func.name.unambiguous_name(), - unpacked_args=redispatch_args, - )) + trace_body.append( + TRACE_DISPATCH.substitute( + assign_return_values=assign_return_values, + unambiguous_name=f.func.name.unambiguous_name(), + unpacked_args=redispatch_args, + ) + ) trace_body.append(format_postrecord_trace(f)) if f.func.returns: - trace_body.append(f'return {get_return_value(f)};') + trace_body.append(f"return {get_return_value(f)};") return trace_body -METHOD_DEFINITION = CodeTemplate("""\ + +METHOD_DEFINITION = CodeTemplate( + """\ ${return_type} ${type_wrapper_name}(${formals}) { ${type_definition_body} } -""") +""" +) + def type_wrapper_name(f: NativeFunction) -> str: if f.func.name.overload_name: - return f'{cpp.name(f.func)}_{f.func.name.overload_name}' + return f"{cpp.name(f.func)}_{f.func.name.overload_name}" else: return cpp.name(f.func) + @with_native_function def method_definition(f: NativeFunction) -> str: assert cpp.name(f.func) not in MANUAL_TRACER - formals = ', '.join( + formals = ", ".join( # code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance. # See Note [Plumbing Keys Through The Dispatcher] for details. 
- ['c10::DispatchKeySet ks'] + - [f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments()] + ["c10::DispatchKeySet ks"] + + [ + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ] ) return METHOD_DEFINITION.substitute( @@ -375,11 +489,15 @@ def method_definition(f: NativeFunction) -> str: type_definition_body=emit_trace_body(f), ) -WRAPPER_REGISTRATION = CodeTemplate("""\ + +WRAPPER_REGISTRATION = CodeTemplate( + """\ m.impl("${name}", TORCH_FN(${class_type}::${type_wrapper_name}) ); -""") +""" +) + @with_native_function def method_registration(f: NativeFunction) -> str: @@ -388,31 +506,36 @@ def method_registration(f: NativeFunction) -> str: return WRAPPER_REGISTRATION.substitute( name=f.func.name, type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', + class_type="TraceType", ) -def gen_trace_type_func( - fn: NativeFunction -) -> Dict[str, List[str]]: + +def gen_trace_type_func(fn: NativeFunction) -> Dict[str, List[str]]: return { - 'ops_headers': [f'#include '], - 'trace_method_definitions': [method_definition(fn)], - 'trace_wrapper_registrations': [method_registration(fn)], + "ops_headers": [f"#include "], + "trace_method_definitions": [method_definition(fn)], + "trace_wrapper_registrations": [method_registration(fn)], } -def gen_trace_type(out: str, native_functions: List[NativeFunction], template_path: str) -> None: + +def gen_trace_type( + out: str, native_functions: List[NativeFunction], template_path: str +) -> None: # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) fm.write_sharded( - 'TraceType.cpp', + "TraceType.cpp", [fn for fn in native_functions if cpp.name(fn.func) not in MANUAL_TRACER], key_fn=lambda fn: fn.root_name, base_env={ - 'generated_comment': - f'@generated from {template_path}/TraceType.cpp', + "generated_comment": f"@generated from {template_path}/TraceType.cpp", }, env_callable=gen_trace_type_func, num_shards=5, - sharded_keys={'ops_headers', 'trace_method_definitions', 'trace_wrapper_registrations'} + sharded_keys={ + "ops_headers", + "trace_method_definitions", + "trace_wrapper_registrations", + }, ) diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index 1a09902e86ec..26eb2d91595d 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -5,13 +5,13 @@ import re from typing import Optional, List -from tools.codegen.api.types import CppSignatureGroup -from tools.codegen.api import cpp -import tools.codegen.api.python as python -from tools.codegen.gen import parse_native_yaml -from tools.codegen.context import with_native_function -from tools.codegen.utils import mapMaybe, FileManager -from tools.codegen.model import NativeFunction, TensorOptionsArguments, Variant +from torchgen.api.types import CppSignatureGroup +from torchgen.api import cpp +import torchgen.api.python as python +from torchgen.gen import parse_native_yaml +from torchgen.context import with_native_function +from torchgen.utils import mapMaybe, FileManager +from torchgen.model import NativeFunction, TensorOptionsArguments, Variant OPTIONAL_TYPE_PATTERN = re.compile(r"c10::optional<(.+)>") TYPE_PATTERN = re.compile(r"(?:const\s+)?([A-Z]\w+)") @@ -20,28 +20,41 @@ # TODO: maybe update the cpp argument API to take optional 
namespace argument? def fully_qualified_type(argument_type: str) -> str: def maybe_optional_type(type: str, is_opt: bool) -> str: - return f'c10::optional<{type}>' if is_opt else type + return f"c10::optional<{type}>" if is_opt else type opt_match = OPTIONAL_TYPE_PATTERN.match(argument_type) is_opt = opt_match is not None if opt_match: - argument_type = argument_type[opt_match.start(1):opt_match.end(1)] + argument_type = argument_type[opt_match.start(1) : opt_match.end(1)] match = TYPE_PATTERN.match(argument_type) if match is None: return maybe_optional_type(argument_type, is_opt) index = match.start(1) - qualified_type = f'{argument_type[:index]}at::{argument_type[index:]}' + qualified_type = f"{argument_type[:index]}at::{argument_type[index:]}" return maybe_optional_type(qualified_type, is_opt) -def gen_variable_factories(out: str, native_yaml_path: str, template_path: str) -> None: - native_functions = parse_native_yaml(native_yaml_path).native_functions + +def gen_variable_factories( + out: str, native_yaml_path: str, tags_yaml_path: str, template_path: str +) -> None: + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions factory_functions = [fn for fn in native_functions if is_factory_function(fn)] fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - fm.write_with_template('variable_factories.h', 'variable_factories.h', lambda: { - 'generated_comment': '@' + f'generated from {fm.template_dir}/variable_factories.h', - 'ops_headers': [f'#include ' for fn in factory_functions], - 'function_definitions': list(mapMaybe(process_function, factory_functions)), - }) + fm.write_with_template( + "variable_factories.h", + "variable_factories.h", + lambda: { + "generated_comment": "@" + + f"generated from {fm.template_dir}/variable_factories.h", + "ops_headers": [ + f"#include " for fn in factory_functions + ], + "function_definitions": list(mapMaybe(process_function, factory_functions)), + }, + ) + @with_native_function def is_factory_function(f: NativeFunction) -> bool: @@ -52,6 +65,7 @@ def is_factory_function(f: NativeFunction) -> bool: has_tensor_options = python.has_tensor_options(f) return has_tensor_options or name.endswith("_like") + @with_native_function def process_function(f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) @@ -64,22 +78,22 @@ def process_function(f: NativeFunction) -> Optional[str]: sig = CppSignatureGroup.from_native_function(f, method=False).signature formals: List[str] = [] exprs: List[str] = [] - requires_grad = 'false' + requires_grad = "false" for arg in sig.arguments(): qualified_type = fully_qualified_type(arg.type) if arg.default: - formals.append(f'{qualified_type} {arg.name} = {arg.default}') + formals.append(f"{qualified_type} {arg.name} = {arg.default}") else: - formals.append(f'{qualified_type} {arg.name}') + formals.append(f"{qualified_type} {arg.name}") if isinstance(arg.argument, TensorOptionsArguments): # note: we remove the requires_grad setting from the TensorOptions because # it is ignored anyways (and we actually have an assertion that it isn't set # which would fail otherwise). We handle requires_grad explicitly here # instead of passing it through to the kernel. - exprs.append(f'at::TensorOptions({arg.name}).requires_grad(c10::nullopt)') + exprs.append(f"at::TensorOptions({arg.name}).requires_grad(c10::nullopt)") # Manually set the requires_grad bit on the result tensor. 
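# Illustrative sketch (plain Python with a stand-in tensor type; not part of
# this diff): the behavior the generated factory wrapper implements -- the
# requires_grad flag never reaches the kernel through TensorOptions and is
# instead set on the returned tensor afterwards.
class _FakeTensor:
    def __init__(self):
        self.requires_grad = False

def _kernel(size):
    return _FakeTensor()          # the dispatched kernel never sees requires_grad

def variable_factory_sketch(size, requires_grad=False):
    out = _kernel(size)
    out.requires_grad = requires_grad   # the wrapper applies the flag to the result
    return out

assert variable_factory_sketch(3, requires_grad=True).requires_grad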
- requires_grad = f'{arg.name}.requires_grad()' + requires_grad = f"{arg.name}.requires_grad()" else: exprs.append(arg.name) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 4b634146dfed..78e8e4edce13 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -27,103 +27,329 @@ # from .context import with_native_function_with_differentiability_info from .gen_trace_type import ( - MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, declare_returned_variables, - tie_return_values, get_return_value, type_wrapper_name, + MANUAL_BACKEND, + MANUAL_AUTOGRAD_AND_TRACER, + declare_returned_variables, + tie_return_values, + get_return_value, + type_wrapper_name, ) from .gen_inplace_or_view_type import ( - get_view_info, is_tensor_type, is_tensor_list_type, unpack_args, get_base_name, - use_derived, modifies_arguments, WRAPPER_REGISTRATION, TMP_VAR, METHOD_DEFINITION, - ASSIGN_RETURN_VALUE, gen_formals, ALL_VIEW_FUNCTIONS, unpacked_name, - AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION -) - -from tools.codegen.api.types import (Binding, DispatcherSignature, BaseCType, intArrayRefT, - tensorT, tensorListT, MutRefCType, OptionalCType, - ListCType, SpecialArgName, scalarT, stringT, - VectorCType) -from tools.codegen.api.autograd import ( - DifferentiableInput, NativeFunctionWithDifferentiabilityInfo, - SavedAttribute, dispatch_strategy, gen_differentiable_outputs, - is_differentiable) -from tools.codegen.api import cpp -from tools.codegen.code_template import CodeTemplate -from tools.codegen.context import native_function_manager, with_native_function -from tools.codegen.utils import mapMaybe, FileManager -from tools.codegen.model import (Argument, NativeFunction, SchemaKind, - SelfArgument, TensorOptionsArguments, - BaseType, ListType) -from typing import Callable, List, Optional, Sequence, Union, Dict + get_view_info, + is_tensor_type, + is_tensor_list_type, + unpack_args, + get_base_name, + use_derived, + modifies_arguments, + WRAPPER_REGISTRATION, + TMP_VAR, + METHOD_DEFINITION, + ASSIGN_RETURN_VALUE, + gen_formals, + ALL_VIEW_FUNCTIONS, + unpacked_name, + AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION, +) + +from torchgen.api.types import ( + Binding, + DispatcherSignature, + BaseCType, + intArrayRefT, + tensorT, + tensorListT, + MutRefCType, + OptionalCType, + ListCType, + SpecialArgName, + scalarT, + stringT, + TupleCType, + VectorCType, +) +from torchgen.api.autograd import ( + DifferentiableInput, + NativeFunctionWithDifferentiabilityInfo, + SavedAttribute, + dispatch_strategy, + gen_differentiable_outputs, + is_differentiable, +) +from torchgen.api import cpp +from torchgen.code_template import CodeTemplate +from torchgen.context import native_function_manager, with_native_function +from torchgen.utils import mapMaybe, FileManager +from torchgen.model import ( + Argument, + NativeFunction, + SchemaKind, + SelfArgument, + TensorOptionsArguments, + BaseType, + ListType, +) +from typing import Callable, List, Optional, Sequence, Tuple, Union, Dict # We don't set or modify grad_fn on these methods. Generally, they return # tensors that have requires_grad=False. In-place functions listed here will # not examine or modify requires_grad or grad_fn. 
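# Illustrative sketch (simplified inputs; not part of this diff): how this skip
# list feeds into the decision made later in emit_body() -- an op only gets a
# grad_fn when it is not listed here and has both differentiable inputs and
# differentiable outputs.
DONT_REQUIRE_DERIVATIVE_EXAMPLE = {"ones_like", "argmax", "isnan"}

def requires_derivative_sketch(name, n_diff_inputs, n_diff_outputs):
    undifferentiable = name in DONT_REQUIRE_DERIVATIVE_EXAMPLE
    return (not undifferentiable) and n_diff_inputs > 0 and n_diff_outputs > 0

assert requires_derivative_sketch("mul", 2, 1)
assert not requires_derivative_sketch("argmax", 1, 1)   # integer output, skipped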
DONT_REQUIRE_DERIVATIVE = { # These only depend on the input Tensor's shape and device, not the data - 'ones_like', 'zeros_like', 'rand_like', 'randn_like', + "ones_like", + "zeros_like", + "rand_like", + "randn_like", # These are only implemented on integral types - '__and__', '__iand__', '__ilshift__', '__ior__', '__irshift__', '__ixor__', - '__lshift__', '__or__', '__rshift__', '__xor__', + "__and__", + "__iand__", + "__ilshift__", + "__ior__", + "__irshift__", + "__ixor__", + "__lshift__", + "__or__", + "__rshift__", + "__xor__", # These work on integral data types, and hence don't require derivative - '_sobol_engine_draw', '_sobol_engine_ff', '_sobol_engine_scramble_', - '_sobol_engine_initialize_state_', + "_sobol_engine_draw", + "_sobol_engine_ff", + "_sobol_engine_scramble_", + "_sobol_engine_initialize_state_", # This is an unsafe method that is meant to be out of reach of autograd. - '_coalesced_', + "_coalesced_", # Quantize functions should not record gradients - 'quantize_per_tensor', 'quantize_per_channel', + "quantize_per_tensor", + "quantize_per_channel", # Functions that return integers should not have output that require gradients - 'argmax', 'argmin', 'argsort', 'searchsorted', - 'bucketize', + "argmax", + "argmin", + "argsort", + "searchsorted", + "bucketize", # Functions that return booleans are not differentiable - 'isnan', 'isposinf', 'isneginf', 'isinf', 'signbit', 'isin', + "isnan", + "isposinf", + "isneginf", + "isinf", + "signbit", + "isin", # Functions return none are not differentiable - 'record_stream', + "record_stream", # These functions are not differentiable - 'logical_and', 'logical_xor', 'logical_not', 'logical_or', + "logical_and", + "logical_xor", + "logical_not", + "logical_or", } # The C -> R functions at the time of adding this are still being audited and tested # but will not error out. 
# C -> C, R -> C functions for which backward is correctly implemented and tested GRADIENT_IMPLEMENTED_FOR_COMPLEX = { - 't', 'view', 'reshape', 'reshape_as', 'view_as', 'roll', 'clone', 'diag_embed', - 'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose', - 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', - 'triu', 'chunk', 'zero_', 'eq_', 'ne_', 'add', '__radd__', 'sum', - '_conj', 'sin', 'cos', 'mul', 'sinc', 'sinh', 'cosh', '__rmul__', - 'sgn', 'asin', 'acos', 'sub', 'div', 'cat', 'view_as_complex', 'index_put', - 'neg', 'complex', 'select', '_s_where', 'as_strided', 'slice', 'constant_pad_nd', - 'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward', - 'dot', 'vdot', 'cholesky', 'triangular_solve', 'mm', '_unsafe_view', 'mv', 'outer', - 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', - 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'atanh', 'take', 'fill_', - 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_matrix_exp', 'linalg_eigh', 'cholesky_solve', 'linalg_qr', '_linalg_svd', '_fft_c2c', '_fft_r2c', - 'linalg_solve', 'sqrt', 'stack', 'gather', 'index_select', 'index_add_', 'linalg_inv', 'linalg_inv_ex', - 'l1_loss_backward', 'baddbmm', 'addbmm', 'addmm', 'addmv', 'addr', 'linalg_householder_product', - 'constant_pad_nd', 'reflection_pad1d', 'reflection_pad2d', 'reflection_pad3d', 'linalg_cholesky_ex', 'linalg_eig', - 'select_backward', 'diagonal_backward', 'slice_backward', - 'reflection_pad1d_backward', 'reflection_pad2d_backward', 'reflection_pad3d_backward', 'symeig', '_sparse_sparse_matmul', - 'replication_pad1d', 'replication_pad2d', 'replication_pad3d', 'take', 'put_', '_to_copy', - 'replication_pad1d_backward', 'replication_pad2d_backward', 'replication_pad3d_backward', - 'diag', 'masked_scatter', 'masked_select', 'index_add', 'index_fill', 'trace', 'polar', 'cumsum', 'rsub', - 'eig', 'lerp', 'linalg_vector_norm', 'cumprod', 'prod', 'index_copy', 'lu', 'unfold', 'unfold_backward', - 'index', 'masked_fill', 'linalg_cross', 'lu_unpack', 'renorm', '_conj_physical', 'linalg_lu_factor_ex', - 'scatter', 'scatter_add', 'sigmoid', 'sigmoid_backward', 'trapezoid', 'cumulative_trapezoid', - 'conj_physical_', '_neg_view', '_reshape_alias', '_det_lu_based_helper', 'lu_solve', - 'linalg_solve_triangular', 'linalg_pinv', 'linalg_lstsq', 'col2im', 'col2im_backward', 'im2col', 'im2col_backward', + "t", + "view", + "reshape", + "reshape_as", + "view_as", + "roll", + "clone", + "diag_embed", + "repeat", + "expand", + "flip", + "fliplr", + "flipud", + "rot90", + "transpose", + "permute", + "squeeze", + "unsqueeze", + "resize", + "resize_as", + "tril", + "triu", + "chunk", + "zero_", + "eq_", + "ne_", + "add", + "__radd__", + "sum", + "_conj", + "sin", + "cos", + "mul", + "sinc", + "sinh", + "cosh", + "__rmul__", + "sgn", + "asin", + "acos", + "sub", + "div", + "cat", + "view_as_complex", + "index_put", + "neg", + "complex", + "select", + "where", + "as_strided", + "slice", + "constant_pad_nd", + "unbind", + "split", + "split_with_sizes", + "unsafe_split", + "split_with_sizes_backward", + "dot", + "vdot", + "cholesky", + "triangular_solve", + "mm", + "_unsafe_view", + "mv", + "outer", + "bmm", + "diagonal", + "alias", + "atan", + "log", + "log10", + "log1p", + "log2", + "reciprocal", + "tan", + "pow", + "rsqrt", + "tanh", + "tanh_backward", + "asinh", + "acosh", + "atanh", + "take", + "fill_", + "exp", + "nonzero", + 
"mean", + "inverse", + "solve", + "linalg_cholesky", + "addcmul", + "addcdiv", + "matrix_exp", + "linalg_matrix_exp", + "linalg_eigh", + "cholesky_solve", + "linalg_qr", + "_linalg_svd", + "_fft_c2c", + "_fft_r2c", + "linalg_solve", + "sqrt", + "stack", + "gather", + "index_select", + "index_add_", + "linalg_inv", + "linalg_inv_ex", + "l1_loss_backward", + "baddbmm", + "addbmm", + "addmm", + "addmv", + "addr", + "linalg_householder_product", + "constant_pad_nd", + "reflection_pad1d", + "reflection_pad2d", + "reflection_pad3d", + "linalg_cholesky_ex", + "linalg_eig", + "select_backward", + "diagonal_backward", + "slice_backward", + "reflection_pad1d_backward", + "reflection_pad2d_backward", + "reflection_pad3d_backward", + "symeig", + "_sparse_sparse_matmul", + "replication_pad1d", + "replication_pad2d", + "replication_pad3d", + "take", + "put_", + "_to_copy", + "replication_pad1d_backward", + "replication_pad2d_backward", + "replication_pad3d_backward", + "diag", + "masked_scatter", + "masked_select", + "index_add", + "index_fill", + "trace", + "polar", + "cumsum", + "rsub", + "eig", + "lerp", + "linalg_vector_norm", + "cumprod", + "prod", + "index_copy", + "lu", + "unfold", + "unfold_backward", + "index", + "masked_fill", + "linalg_cross", + "lu_unpack", + "renorm", + "_conj_physical", + "linalg_lu_factor_ex", + "scatter", + "scatter_add", + "sigmoid", + "sigmoid_backward", + "trapezoid", + "cumulative_trapezoid", + "conj_physical_", + "_neg_view", + "_reshape_alias", + "_det_lu_based_helper", + "lu_solve", + "linalg_solve_triangular", + "linalg_pinv", + "linalg_lstsq", + "col2im", + "col2im_backward", + "im2col", + "im2col_backward", + "cholesky_inverse", + "to_sparse", + "sparse_sampled_addmm", + "linalg_lu", + "pixel_shuffle", + "pixel_unshuffle", } GRADIENT_IMPLEMENTED_FOR_SPARSE_COMPLEX = { - 'to_dense', '_coalesce', 'coalesce', 'values', '_sparse_coo_tensor_with_dims_and_tensors', - 'sparse_mask_helper_cuda', '_sparse_addmm', + "_to_dense", + "_coalesce", + "coalesce", + "values", + "_sparse_coo_tensor_with_dims_and_tensors", + "sparse_mask_helper_cuda", + "_sparse_addmm", } GRADIENT_IMPLEMENTED_FOR_COMPLEX.update(GRADIENT_IMPLEMENTED_FOR_SPARSE_COMPLEX) # Some operators invalidate the grad_accumulator. Let's reset it. -RESET_GRAD_ACCUMULATOR = { - 'set', 'resize' -} +RESET_GRAD_ACCUMULATOR = {"set", "resize"} # NOTE [ TensorImpl and Storage Pointer Sanity Checks ] # @@ -138,206 +364,282 @@ # the input it is aliased with. Otherwise, its StorageImpl has use_count of 1 # # The following code templates implement the checks for this invariant: -SAVE_TENSOR_STORAGE = CodeTemplate("""\ +SAVE_TENSOR_STORAGE = CodeTemplate( + """\ c10::optional ${tensor_name}_storage_saved = ${tensor_name}.has_storage() ? c10::optional(${tensor_name}.storage()) : c10::nullopt; -""") +""" +) # If tensor_name == out_tensor_name, used to enforce (1), otherwise used for (2) -ENFORCE_SAME_TENSOR_STORAGE = CodeTemplate("""\ +ENFORCE_SAME_TENSOR_STORAGE = CodeTemplate( + """\ if (${tensor_name}_storage_saved.has_value()) AT_ASSERT(${tensor_name}_storage_saved.value().is_alias_of(${out_tensor_name}.storage())); -""") +""" +) -SAVE_TENSORLIST_STORAGE = CodeTemplate("""\ +SAVE_TENSORLIST_STORAGE = CodeTemplate( + """\ std::vector> ${tensorlist_name}_storage_saved(${tensorlist_name}.size()); for (const Tensor& tensor : ${tensorlist_name}) ${tensorlist_name}_storage_saved.push_back( tensor.has_storage() ? 
c10::optional(tensor.storage()) : c10::nullopt); -""") +""" +) -ENFORCE_SAME_TENSORLIST_STORAGE = CodeTemplate("""\ +ENFORCE_SAME_TENSORLIST_STORAGE = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_storage_saved[i].has_value()) AT_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of(${tensorlist_name}[i].storage())); } -""") +""" +) -SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate( + """\ std::vector> ${tensorlist_name}_storage_saved(${tensorlist_name}.size()); for (const c10::optional& tensor : ${tensorlist_name}) ${tensorlist_name}_storage_saved.push_back( tensor.has_value() && tensor->has_storage() ? c10::optional(tensor->storage()) : c10::nullopt); -""") +""" +) -ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_storage_saved[i].has_value()) AT_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of( static_cast>(${tensorlist_name}[i])->storage())); } -""") +""" +) -SAVE_TENSOR_IMPL = CodeTemplate("""\ +SAVE_TENSOR_IMPL = CodeTemplate( + """\ c10::intrusive_ptr ${tensor_name}_impl_saved; if (${tensor_name}.defined()) ${tensor_name}_impl_saved = ${tensor_name}.getIntrusivePtr(); -""") +""" +) -ENFORCE_SAME_TENSOR_IMPL = CodeTemplate("""\ +ENFORCE_SAME_TENSOR_IMPL = CodeTemplate( + """\ if (${tensor_name}_impl_saved) AT_ASSERT(${tensor_name}_impl_saved == ${tensor_name}.getIntrusivePtr()); -""") +""" +) -ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate("""\ +ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate( + """\ AT_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}"); -""") +""" +) -ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE = CodeTemplate("""\ +ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE = CodeTemplate( + """\ if (${tensor_name}.has_storage()) AT_ASSERT(${tensor_name}.storage().use_count() == 1, "function: ${fn_name}"); -""") +""" +) -SAVE_TENSORLIST_IMPL = CodeTemplate("""\ +SAVE_TENSORLIST_IMPL = CodeTemplate( + """\ std::vector> ${tensorlist_name}_impl_saved(${tensorlist_name}.size()); for (size_t i=0; i<${tensorlist_name}.size(); i++) if (${tensorlist_name}[i].defined()) ${tensorlist_name}_impl_saved[i] = ${tensorlist_name}[i].getIntrusivePtr(); -""") +""" +) -ENFORCE_SAME_TENSORLIST_IMPL = CodeTemplate("""\ +ENFORCE_SAME_TENSORLIST_IMPL = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_impl_saved[i]) AT_ASSERT(${tensorlist_name}_impl_saved[i] == ${tensorlist_name}[i].getIntrusivePtr()); } -""") +""" +) -SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate( + """\ std::vector> ${tensorlist_name}_impl_saved(${tensorlist_name}.size()); for (size_t i=0; i<${tensorlist_name}.size(); i++) { c10::optional t = ${tensorlist_name}[i]; if (t.has_value() && t->defined()) ${tensorlist_name}_impl_saved[i] = t->getIntrusivePtr(); } -""") +""" +) -ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate( + """\ for (size_t i=0; i<${tensorlist_name}.size(); i++) { if (${tensorlist_name}_impl_saved[i]) AT_ASSERT(${tensorlist_name}_impl_saved[i] == static_cast>(${tensorlist_name}[i])->getIntrusivePtr()); } -""") +""" +) # The following list contains functions that we don't enforce the invariant on. 
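# Illustrative sketch (plain-Python analogy; not part of this diff): the
# save/enforce pattern the templates above generate in debug builds -- capture
# identifying state of each input before the redispatch, then assert the kernel
# did not silently swap it out.
def checked_call_sketch(fn, tensors):
    saved = [id(t) for t in tensors]          # ~ SAVE_TENSOR_IMPL
    result = fn(*tensors)
    for t, s in zip(tensors, saved):          # ~ ENFORCE_SAME_TENSOR_IMPL
        assert id(t) == s, "kernel must not replace an input's TensorImpl"
    return result

assert checked_call_sketch(lambda a, b: a + b, [1.0, 2.0]) == 3.0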
DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE = { # These functions are expected to change impl or storage of input tensors - 'set_', '_cudnn_rnn_flatten_weight', + "set_", + "_cudnn_rnn_flatten_weight", } DONT_ENFORCE_TENSOR_IMPL_USE_COUNT = { # These non-inplace, non-out functions return tensors with use_count > 1 # Therefore, they MAY (but not necessarily) return one of its inputs as-is # See https://github.com/pytorch/pytorch/issues/60426 for more information - '_embedding_bag', '_embedding_bag_forward_only', - 'q_per_channel_scales', 'q_per_channel_zero_points', - 'lu_unpack', '_cudnn_rnn_backward', - + "_embedding_bag", + "_embedding_bag_forward_only", + "q_per_channel_scales", + "q_per_channel_zero_points", + "lu_unpack", + "_cudnn_rnn_backward", # The below failed StorageImpl use_count check but we skip tensor_impl check # just in case - '_cudnn_rnn', 'dequantize_self', + "_cudnn_rnn", + "dequantize_self", + # lift() should never actually be called with a requires_grad=True tensor, + "lift", } DONT_ENFORCE_STORAGE_IMPL_USE_COUNT = { # These non-view functions return tensors with storage use_count != 1 - '_slow_conv2d_forward', 'slow_conv3d_forward', 'channel_shuffle', - + "_slow_conv2d_forward", + "slow_conv3d_forward", + "channel_shuffle", + # lift() should never actually be called with a requires_grad=True tensor, + "lift", # If an input is returned as-is in output, we cannot guarantee its storage_impl # use count to be 1 either. *DONT_ENFORCE_TENSOR_IMPL_USE_COUNT, } # END CHECKS FOR [ TensorImpl and Storage Pointer Sanity Checks ] -DECLARE_GRAD_FN = CodeTemplate("""\ +DECLARE_GRAD_FN = CodeTemplate( + """\ std::shared_ptr<${op}> grad_fn; -""") +""" +) -SETUP_ANY_REQUIRES_GRAD = CodeTemplate("""\ +SETUP_ANY_REQUIRES_GRAD = CodeTemplate( + """\ auto _any_requires_grad = compute_requires_grad( ${args_with_derivatives} ); ${extra_differentiability_conditions} (void)_any_requires_grad; -""") +""" +) -SETUP_DERIVATIVE = CodeTemplate("""\ +SETUP_DERIVATIVE = CodeTemplate( + """\ if (_any_requires_grad) { ${setup} } -""") +""" +) -SETUP_NONE_REQUIRES_GRAD = CodeTemplate("""\ +SETUP_NONE_REQUIRES_GRAD = CodeTemplate( + """\ if (compute_requires_grad( ${args_to_check} )) { throw_error_out_requires_grad("${base_name}"); } -""") +""" +) -ASSIGN_GRAD_FN = CodeTemplate("""\ +ASSIGN_GRAD_FN = CodeTemplate( + """\ grad_fn = std::shared_ptr<${op}>(new ${op}(${op_ctor}), deleteNode); grad_fn->set_next_edges(collect_next_edges( ${args_with_derivatives} )); -""") +""" +) -CALL_REDISPATCH = CodeTemplate("""\ -at::redispatch::${api_name}(${unpacked_args})""") +CALL_REDISPATCH = CodeTemplate( + """\ +at::redispatch::${api_name}(${unpacked_args})""" +) # If the non-variable operation has return values, we use the `tmp` variable to hold the # values temporarily and pass the values to the return variables outside of the # `at::AutoDispatchBelowAutograd` guard block. 
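# Illustrative sketch using string.Template as a stand-in for CodeTemplate (not
# part of this diff; substitution values are made up): filling the tmp-variable
# pattern described above yields a block that runs the base call under the
# guard and binds its result outside of it.
from string import Template

_tmpl = Template(
    "auto ${tmp_var} = ([&]() {\n"
    "  ${guard}\n"
    "  return ${base_type_call};\n"
    "})();\n"
)
print(_tmpl.substitute(
    tmp_var="_tmp",
    guard="at::AutoDispatchBelowAutograd guard;",
    base_type_call="at::redispatch::add(ks, self, other)",
))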
-DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES = CodeTemplate("""\ +DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES = CodeTemplate( + """\ auto ${tmp_var} = ([&]() { ${guard} return ${base_type_call}; })(); -""") +""" +) -DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES = CodeTemplate("""\ +DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES = CodeTemplate( + """\ { ${guard} ${base_type_call}; } -""") +""" +) -SET_HISTORY = CodeTemplate("""\ +SET_HISTORY = CodeTemplate( + """\ if (grad_fn) { ${fn}_history(${differentiable_outputs}, grad_fn); } -""") +""" +) -CONDITIONAL = CodeTemplate("""\ +CONDITIONAL = CodeTemplate( + """\ if (${cond}) { ${statements} } -""") +""" +) -RUN_ONLY_IN_DEBUG_MODE = CodeTemplate("""\ +RUN_ONLY_IN_DEBUG_MODE = CodeTemplate( + """\ #ifndef NDEBUG ${statements} #endif -""") +""" +) -FW_DERIVATIVE_CHECK_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_CHECK_TEMPLATE = CodeTemplate( + """\ isFwGradDefined(${req_inp})\ -""") +""" +) -FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE = CodeTemplate( + """\ auto ${inp}_t_raw = toNonOptFwGrad(${inp}); auto ${inp}_tensor = toNonOptTensor(${inp}); auto ${inp}_t = (${inp}_t_raw.defined() || !${inp}_tensor.defined()) ? ${inp}_t_raw : at::${zeros_fn}(${inp}_tensor.sizes(), ${inp}_tensor.options()); -""") +""" +) -FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE = CodeTemplate( + """\ auto ${inp}_p = toNonOptPrimal(${inp}); -""") +""" +) -FW_DERIVATIVE_SETTER_TENSOR = CodeTemplate("""\ +FW_DERIVATIVE_SETTER_TENSOR = CodeTemplate( + """\ if (${out_arg}_new_fw_grad_opt.has_value() && ${out_arg}_new_fw_grad_opt.value().defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. 
${out_arg}._set_fw_grad(${out_arg}_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ ${is_inplace}); } -""") +""" +) + +FW_DERIVATIVE_SETTER_MULTI_OUTPUT = CodeTemplate( + """\ +if (${all_res}_new_fw_grad_opt.has_value() && std::get<${idx}>(${all_res}_new_fw_grad_opt.value()).defined()) { + ${out_arg}._set_fw_grad(std::get<${idx}>(${all_res}_new_fw_grad_opt.value()), /* level */ 0, /* is_inplace_op */ false); +} +""" +) -FW_DERIVATIVE_SETTER_TENSOR_LIST = CodeTemplate("""\ +FW_DERIVATIVE_SETTER_TENSOR_LIST = CodeTemplate( + """\ if (${out_arg}_new_fw_grad_opt.has_value()) { auto ${out_arg}_new_fw_grad = ${out_arg}_new_fw_grad_opt.value(); TORCH_INTERNAL_ASSERT(${out_arg}.size() == ${out_arg}_new_fw_grad.size()); @@ -348,29 +650,38 @@ } } } -""") +""" +) -FW_DERIVATIVE_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_TEMPLATE = CodeTemplate( + """\ ${fw_grad_opt_definition} if (${requires_fw_grad}) { ${unpacked_arguments} ${out_arg}_new_fw_grad_opt = ${formula}; } -""") +""" +) -FW_DERIVATIVE_FORBID_TEMPLATE = CodeTemplate("""\ -TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${msg} that does not support it."); -""") +FW_DERIVATIVE_FORBID_TEMPLATE = CodeTemplate( + """\ +TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${name} that does not support it ${msg}"); +""" +) -FW_DERIVATIVE_FORBID_LIST_TEMPLATE = CodeTemplate("""\ +FW_DERIVATIVE_FORBID_LIST_TEMPLATE = CodeTemplate( + """\ for (const auto& _t: ${arg}) { - TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${msg} that does not support it."); + TORCH_CHECK_NOT_IMPLEMENTED(!(${cond}), "Trying to use forward AD with ${name} that does not support it ${msg}"); } -""") +""" +) + def gen_variable_type( out: str, native_yaml_path: str, + tags_yaml_path: str, fns_with_diff_infos: List[NativeFunctionWithDifferentiabilityInfo], template_path: str, ) -> None: @@ -382,47 +693,54 @@ def gen_variable_type( compute the output. The grad_fn is attached to differentiable functions. """ fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) - fm.write('VariableType.h', lambda: { - 'generated_comment': "@" f'generated from {template_path}/VariableType.h' - }) + fm.write( + "VariableType.h", + lambda: { + "generated_comment": "@" f"generated from {template_path}/VariableType.h" + }, + ) # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. 
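# Illustrative sketch (not torchgen's exact hashing; not part of this diff):
# how sharded writing splits the generated definitions across a fixed number of
# output files, keyed by a stable per-operator string so each op always lands
# in the same shard.
from collections import defaultdict
import zlib

def shard_by_key_sketch(items, key_fn, num_shards=5):
    shards = defaultdict(list)
    for item in items:
        shard_id = zlib.crc32(key_fn(item).encode("utf-8")) % num_shards
        shards[shard_id].append(item)
    return shards                     # e.g. shard 3 -> VariableType_3.cpp

ops = ["add", "mul", "conv2d", "linalg_svd", "index_select"]
for shard_id, names in sorted(shard_by_key_sketch(ops, key_fn=str).items()):
    print(f"VariableType_{shard_id}.cpp:", names)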
fm.write_sharded( - 'VariableType.cpp', + "VariableType.cpp", [fn for fn in fns_with_diff_infos if use_derived(fn)], key_fn=lambda fn: cpp.name(fn.func.func), base_env={ - 'generated_comment': - "@" f'generated from {template_path}/VariableType.cpp', + "generated_comment": "@" f"generated from {template_path}/VariableType.cpp", }, env_callable=gen_variable_type_func, num_shards=5, - sharded_keys={'type_derived_method_definitions', 'wrapper_registrations'} + sharded_keys={"type_derived_method_definitions", "wrapper_registrations"}, ) + @with_native_function def gen_wrapper_registration(f: NativeFunction) -> str: return WRAPPER_REGISTRATION.substitute( unqual_operator_name_with_overload=f.func.name, type_wrapper_name=type_wrapper_name(f), - class_type='VariableType', + class_type="VariableType", ) + def gen_variable_type_func( - fn: NativeFunctionWithDifferentiabilityInfo + fn: NativeFunctionWithDifferentiabilityInfo, ) -> Dict[str, List[str]]: f = fn.func with native_function_manager(f): name = cpp.name(f.func) formals = gen_formals(f) - if fn.info is None and not get_base_name(f) in RESET_GRAD_ACCUMULATOR \ - and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE \ - and len(gen_differentiable_outputs(fn)) > 0 \ - and not cpp.name(f.func) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE \ - and not type_wrapper_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT \ - and not type_wrapper_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: + if ( + fn.info is None + and not get_base_name(f) in RESET_GRAD_ACCUMULATOR + and not get_base_name(f) in DONT_REQUIRE_DERIVATIVE + and len(gen_differentiable_outputs(fn)) > 0 + and not cpp.name(f.func) in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE + and not type_wrapper_name(f) in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT + and not type_wrapper_name(f) in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT + ): # NOTE: [ Registering AutogradNotImplemented boxed kernel ] # # When there is no derivatives.yaml entry, we register a generic boxed @@ -441,7 +759,8 @@ def gen_variable_type_func( # to (1). type_definition = "" wrapper_registration = AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION.substitute( - unqual_operator_name_with_overload=f.func.name) + unqual_operator_name_with_overload=f.func.name + ) else: type_definition = METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns).cpp_type(), @@ -456,21 +775,24 @@ def gen_variable_type_func( # If you want to register a kernel to Autograd, you must make the op abstract. # In other words, this op must have dispatch section in native_functions.yaml. if name in MANUAL_AUTOGRAD_AND_TRACER or (fn.info and fn.info.has_derivatives): - msg = (f'There\'s a formula for {name}(or its functional variant) in derivatives.yaml. ' - f'It\'s required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA ' - f'or CompositeExplicitAutograd in native_functions.yaml. Please see ' - f'https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword ' - f'for instructions to choose the right dispatch keyword.') + msg = ( + f"There's a formula for {name}(or its functional variant) in derivatives.yaml. " + f"It's required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA " + f"or CompositeExplicitAutograd in native_functions.yaml. Please see " + f"https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword " + f"for instructions to choose the right dispatch keyword." 
+ ) assert f.is_abstract, msg return { - 'type_derived_method_definitions': [type_definition], - 'wrapper_registrations': [wrapper_registration], + "type_derived_method_definitions": [type_definition], + "wrapper_registrations": [wrapper_registration], } + @with_native_function_with_differentiability_info def emit_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: - assert dispatch_strategy(fn) == 'use_derived' + assert dispatch_strategy(fn) == "use_derived" f = fn.func info = fn.info fw_derivatives = fn.fw_derivatives @@ -506,7 +828,9 @@ def gen_differentiable_input( def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]: return list(mapMaybe(gen_differentiable_input, f.func.arguments.non_out)) - def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) -> List[DifferentiableInput]: + def find_args_with_derivatives( + differentiable_inputs: List[DifferentiableInput], + ) -> List[DifferentiableInput]: """Find arguments that have derivative definitions""" if info is None or not info.has_derivatives: return differentiable_inputs @@ -514,26 +838,38 @@ def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) differentiable = [arg for arg in differentiable_inputs if arg.name in names] if len(differentiable) != len(names): missing = names - set(arg.name for arg in differentiable) - raise RuntimeError(f'Missing arguments for derivatives: {missing} in {info.name}') + raise RuntimeError( + f"Missing arguments for derivatives: {missing} in {info.name}" + ) return differentiable differentiable_inputs = gen_differentiable_inputs(f) args_with_derivatives = find_args_with_derivatives(differentiable_inputs) differentiable_outputs = gen_differentiable_outputs(fn) - undifferentiable = (base_name in DONT_REQUIRE_DERIVATIVE) or (name in DONT_REQUIRE_DERIVATIVE) + undifferentiable = (base_name in DONT_REQUIRE_DERIVATIVE) or ( + name in DONT_REQUIRE_DERIVATIVE + ) - requires_derivative = (not undifferentiable) and (len(differentiable_inputs) > 0) and (len(differentiable_outputs) > 0) + requires_derivative = ( + (not undifferentiable) + and (len(differentiable_inputs) > 0) + and (len(differentiable_outputs) > 0) + ) if info is not None and info.has_derivatives and not requires_derivative: - raise RuntimeError(f'ERROR: derivative ignored for {name} -- specified an autograd function without derivative') + raise RuntimeError( + f"ERROR: derivative ignored for {name} -- specified an autograd function without derivative" + ) def emit_save_inputs() -> List[str]: setup: List[str] = [] if info is None or not info.has_derivatives: return setup - has_tensorlist_arg = any(is_tensor_list_type(arg.type) for arg in args_with_derivatives) + has_tensorlist_arg = any( + is_tensor_list_type(arg.type) for arg in args_with_derivatives + ) # We don't want to save tensors if we know that they will never be used # when computing the derivative, so we add guards to those statements @@ -550,7 +886,7 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # require_grad if the backward function even gets executed. I don't # have any good ideas for detecting those cases, so I simply disabled the # checks. 
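# Illustrative sketch (simplified; not part of this diff): the guarding idea
# behind emit_save_inputs()/guard_for() -- an input is only saved into the
# autograd Node when the single derivative that uses it may actually run.
def emit_guarded_save_sketch(saved_name, used_by_output_idx=None):
    save_stmt = f"grad_fn->{saved_name}_ = SavedVariable({saved_name}, false);"
    if used_by_output_idx is None:
        return save_stmt              # used by several derivatives: always save
    guard = f"grad_fn->should_compute_output({used_by_output_idx})"
    return f"if ({guard}) {{\n  {save_stmt}\n}}"

print(emit_guarded_save_sketch("other", used_by_output_idx=1))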
- if 'backward' in info.name: + if "backward" in info.name: return None # If there's a single derivative we could compute, we already have @@ -580,12 +916,12 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: else: raise AssertionError() - return f'grad_fn->should_compute_output({edge_off})' + return f"grad_fn->should_compute_output({edge_off})" setup.extend(save_variables(info.all_saved_inputs, False, guard_for)) for arg in args_with_derivatives: if is_tensor_list_type(arg.type): - setup.append(f'grad_fn->{arg.name}_size_ = {arg.name}.size();') + setup.append(f"grad_fn->{arg.name}_size_ = {arg.name}.size();") return setup @@ -593,25 +929,37 @@ def setup_derivative(differentiable_inputs: List[DifferentiableInput]) -> List[s body: List[str] = [] if is_out_fn: # For out functions, ensure that no input or output requires grad - body.append(DECLARE_GRAD_FN.substitute(op='Node')) - body.append(SETUP_NONE_REQUIRES_GRAD.substitute( - base_name=base_name, - args_to_check=[arg.name for arg in differentiable_inputs])) - body.append(SETUP_NONE_REQUIRES_GRAD.substitute( - base_name=base_name, - args_to_check=[arg.name for arg in differentiable_outputs])) + body.append(DECLARE_GRAD_FN.substitute(op="Node")) + body.append( + SETUP_NONE_REQUIRES_GRAD.substitute( + base_name=base_name, + args_to_check=[arg.name for arg in differentiable_inputs], + ) + ) + body.append( + SETUP_NONE_REQUIRES_GRAD.substitute( + base_name=base_name, + args_to_check=[arg.name for arg in differentiable_outputs], + ) + ) return body - op = info.op if info is not None and info.has_derivatives else 'NotImplemented' + op = info.op if info is not None and info.has_derivatives else "NotImplemented" setup = [] - setup.extend(ASSIGN_GRAD_FN.substitute( - op=op, - op_ctor='' if info is not None and info.has_derivatives else f'"{cpp.name(f.func)}"', - args_with_derivatives=[arg.name for arg in args_with_derivatives], - ).split('\n')) + setup.extend( + ASSIGN_GRAD_FN.substitute( + op=op, + op_ctor="" + if info is not None and info.has_derivatives + else f'"{cpp.name(f.func)}"', + args_with_derivatives=[arg.name for arg in args_with_derivatives], + ).split("\n") + ) setup.extend(emit_save_inputs()) - body.extend(emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives)) + body.extend( + emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives) + ) body.append(DECLARE_GRAD_FN.substitute(op=op)) body.append(SETUP_DERIVATIVE.substitute(setup=setup)) return body @@ -623,7 +971,11 @@ def emit_check_if_in_complex_autograd_allowlist() -> List[str]: for arg in differentiable_outputs: name = arg.name # TODO: should be `arg.type.is_tensor_like()`? - if arg.cpp_type in ['at::Tensor', 'at::TensorList', 'const c10::List> &']: + if arg.cpp_type in [ + "at::Tensor", + "at::TensorList", + "const c10::List> &", + ]: body.append(f'throw_error_for_complex_autograd({name}, "{base_name}");') return body @@ -639,7 +991,7 @@ def emit_check_no_requires_grad( arg_name = arg.name if info and arg_name in info.non_differentiable_arg_names: continue - if arg_name == 'output': + if arg_name == "output": # Double-backwards definitions sometimes take in 'input' and # 'output', but only define the derivative for input. 
continue @@ -649,17 +1001,19 @@ def emit_check_no_requires_grad( def emit_original_self_definition() -> List[str]: body: List[str] = [] if inplace: - body.append('c10::optional original_self;') + body.append("c10::optional original_self;") all_forward_grad_cond = [] for derivative in fw_derivatives: if derivative.required_original_self_value: - all_forward_grad_cond.append(get_any_has_forward_grad_name(derivative.var_name)) + all_forward_grad_cond.append( + get_any_has_forward_grad_name(derivative.var_names) + ) if all_forward_grad_cond: body.append(f'if ({" || ".join(all_forward_grad_cond)}) {{') - body.append(' original_self = self.clone();') - body.append('}') + body.append(" original_self = self.clone();") + body.append("}") return body @@ -671,80 +1025,100 @@ def save_variables( # assign the saved variables to the generated grad_fn stmts: List[str] = [] for arg in saved_variables: - name = arg.nctype.name.name if isinstance(arg.nctype.name, SpecialArgName) else arg.nctype.name + name = ( + arg.nctype.name.name + if isinstance(arg.nctype.name, SpecialArgName) + else arg.nctype.name + ) type = arg.nctype.type expr = arg.expr stmts_prepend = None - if type == BaseCType(tensorT) or type == OptionalCType(BaseCType(tensorT)) or \ - type == MutRefCType(OptionalCType(BaseCType(tensorT))) or (is_output and type == BaseCType(scalarT)): + if ( + type == BaseCType(tensorT) + or type == OptionalCType(BaseCType(tensorT)) + or type == MutRefCType(OptionalCType(BaseCType(tensorT))) + or (is_output and type == BaseCType(scalarT)) + ): var = name - name += '_' - if var == 'self' and inplace: - stmts_prepend = 'if (!original_self.has_value()) original_self = self.clone()' - var = 'original_self.value()' + name += "_" + if var == "self" and inplace: + stmts_prepend = ( + "if (!original_self.has_value()) original_self = self.clone()" + ) + var = "original_self.value()" assert not is_output if inplace and is_output: - var = 'self' - is_inplace_view = f'{var}.is_view()' - expr = f'SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})' + var = "self" + is_inplace_view = f"{var}.is_view()" + expr = f"SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})" else: - expr = f'SavedVariable({var}, {str(is_output).lower()})' - elif type == BaseCType(tensorListT) or type == ListCType(OptionalCType(BaseCType(tensorT))): - expr = f'make_saved_variable_list({name})' - name += '_' + expr = f"SavedVariable({var}, {str(is_output).lower()})" + elif type == BaseCType(tensorListT) or type == ListCType( + OptionalCType(BaseCType(tensorT)) + ): + expr = f"make_saved_variable_list({name})" + name += "_" elif type == BaseCType(intArrayRefT): expr = expr + ".vec()" elif type == BaseCType(stringT): - expr = f'std::string({expr})' + expr = f"std::string({expr})" elif type == OptionalCType(BaseCType(stringT)): - expr = f'{expr}.has_value() ? c10::optional(std::string({expr}.value())) : c10::nullopt' + expr = f"{expr}.has_value() ? 
c10::optional(std::string({expr}.value())) : c10::nullopt" guard = guard_for(arg) if guard is None: if stmts_prepend: - stmts.append(f'{stmts_prepend};') - stmts.append(f'grad_fn->{name} = {expr};') + stmts.append(f"{stmts_prepend};") + stmts.append(f"grad_fn->{name} = {expr};") else: - stmts.append(f'if ({guard}) {{') + stmts.append(f"if ({guard}) {{") if stmts_prepend: - stmts.append(f' {stmts_prepend};') - stmts.append(f' grad_fn->{name} = {expr};') - stmts.append('}') + stmts.append(f" {stmts_prepend};") + stmts.append(f" grad_fn->{name} = {expr};") + stmts.append("}") return stmts # Generates a Dispatcher::redispatch() call into the dispatcher. We do this mainly for performance reasons: # - Pre-compute the full DispatchKeySet. This saves the dispatcher from having to read from TLS. # - redispatch() avoids a redundant call to RecordFunction, which was already called right before # we entered this autograd kernel. - def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str: - """ Dispatch call via function in a namespace or method on Tensor.""" + def emit_dispatch_call( + f: NativeFunction, input_base: str, unpacked_args: Sequence[str] + ) -> str: + """Dispatch call via function in a namespace or method on Tensor.""" dispatcher_sig = DispatcherSignature.from_schema(f.func) dispatcher_exprs = dispatcher_sig.exprs() # code-generated autograd kernels plumb and recompute dispatch keys directly through the kernel for performance. # Ops also always have a function variant of the redispatch API. # See Note [Plumbing Keys Through The Dispatcher] for details. - dispatch_key_set = 'ks & c10::after_autograd_keyset' + dispatch_key_set = "ks & c10::after_autograd_keyset" call = CALL_REDISPATCH.substitute( api_name=cpp.name( f.func, faithful_name_for_out_overloads=True, ), - unpacked_args=[dispatch_key_set] + list(unpacked_args)) + unpacked_args=[dispatch_key_set] + list(unpacked_args), + ) return call - def wrap_output(f: NativeFunction, unpacked_bindings: List[Binding], var: str) -> str: - call = '' + def wrap_output( + f: NativeFunction, unpacked_bindings: List[Binding], var: str + ) -> str: + call = "" rhs_value: Optional[str] = None if not any(r.type.is_tensor_like() for r in f.func.returns): rhs_value = var else: - rhs_value = f'std::move({var})' + rhs_value = f"std::move({var})" assert rhs_value is not None - call += ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f), - rhs_value=rhs_value) + call += ASSIGN_RETURN_VALUE.substitute( + return_values=tie_return_values(f), rhs_value=rhs_value + ) return call - def check_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> str: + def check_tensorimpl_and_storage( + call: str, unpacked_bindings: List[Binding] + ) -> str: # See NOTE [ TensorImpl and Storage Pointer Sanity Checks ] stmts_before_call: List[str] = [] stmts_after_call: List[str] = [] @@ -757,22 +1131,42 @@ def check_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> arg = unpacked_binding.name noref_cpp_type = unpacked_binding.nctype.type.remove_const_ref() if noref_cpp_type == BaseCType(tensorListT): - stmts_before_call += [SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), - SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] - stmts_after_call += [ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), - ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] + stmts_before_call += [ + SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), + 
SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg), + ] + stmts_after_call += [ + ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), + ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg), + ] elif noref_cpp_type == ListCType(OptionalCType(BaseCType(tensorT))): - stmts_before_call += [SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), - SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] - stmts_after_call += [ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), - ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] + stmts_before_call += [ + SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), + SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg), + ] + stmts_after_call += [ + ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute( + tensorlist_name=arg + ), + ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute( + tensorlist_name=arg + ), + ] elif noref_cpp_type == BaseCType(tensorT): - stmts_before_call += [SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), - SAVE_TENSOR_IMPL.substitute(tensor_name=arg)] - stmts_after_call += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=arg, out_tensor_name=arg), - ENFORCE_SAME_TENSOR_IMPL.substitute(tensor_name=arg)] - - assert (stmts_before_call and stmts_after_call) or (not stmts_before_call and not stmts_after_call) + stmts_before_call += [ + SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), + SAVE_TENSOR_IMPL.substitute(tensor_name=arg), + ] + stmts_after_call += [ + ENFORCE_SAME_TENSOR_STORAGE.substitute( + tensor_name=arg, out_tensor_name=arg + ), + ENFORCE_SAME_TENSOR_IMPL.substitute(tensor_name=arg), + ] + + assert (stmts_before_call and stmts_after_call) or ( + not stmts_before_call and not stmts_after_call + ) # Check properties of outputs (enforce (2), (3)) if not f.func.kind() in (SchemaKind.inplace, SchemaKind.out): @@ -780,33 +1174,55 @@ def check_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> aliased_arg_name = ALL_VIEW_FUNCTIONS.get(base_name, None) if aliased_arg_name is not None: aliased_arg_name = unpacked_name(aliased_arg_name) - for i, (ret, ret_name) in enumerate(zip(f.func.returns, cpp.return_names(f))): + for i, (ret, ret_name) in enumerate( + zip(f.func.returns, cpp.return_names(f)) + ): noref_cpp_type = cpp.return_type(ret).remove_const_ref() if noref_cpp_type == BaseCType(tensorT): if aliased_arg_name is not None: - assert i == 0, "Expect non-CompositeImplicitAutograd view function {base} to return single output" - stmts_after_call += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=aliased_arg_name, - out_tensor_name=ret_name)] + assert ( + i == 0 + ), "Expect non-CompositeImplicitAutograd view function {base} to return single output" + stmts_after_call += [ + ENFORCE_SAME_TENSOR_STORAGE.substitute( + tensor_name=aliased_arg_name, out_tensor_name=ret_name + ) + ] else: - if type_wrapper_name(f) not in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT: - stmts_after_call += [ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE.substitute( - tensor_name=ret_name, fn_name=type_wrapper_name(f))] + if ( + type_wrapper_name(f) + not in DONT_ENFORCE_STORAGE_IMPL_USE_COUNT + ): + stmts_after_call += [ + ENFORCE_TENSOR_STORAGE_USE_COUNT_EQUALS_ONE.substitute( + tensor_name=ret_name, fn_name=type_wrapper_name(f) + ) + ] if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT: - stmts_after_call += [ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute( - tensor_name=ret_name, fn_name=type_wrapper_name(f))] + 
stmts_after_call += [ + ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute( + tensor_name=ret_name, fn_name=type_wrapper_name(f) + ) + ] # Currently we don't have any functions that return the following types, but # we should update the checks once we do elif noref_cpp_type == ListCType(OptionalCType(BaseCType(tensorT))): - raise AssertionError(f"Please add use_count checks for {noref_cpp_type}") + raise AssertionError( + f"Please add use_count checks for {noref_cpp_type}" + ) elif noref_cpp_type == BaseCType(tensorListT): - raise AssertionError(f"Please add use_count checks for {noref_cpp_type}") + raise AssertionError( + f"Please add use_count checks for {noref_cpp_type}" + ) if stmts_before_call and stmts_after_call: - call = RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_before_call) + \ - call + \ - RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_after_call) + call = ( + RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_before_call) + + call + + RUN_ONLY_IN_DEBUG_MODE.substitute(statements=stmts_after_call) + ) return call def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: @@ -816,161 +1232,259 @@ def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: # in are now Variables. # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details. unpacked_args = [b.name for b in unpacked_bindings] - base_type_call = emit_dispatch_call(f, 'self_', unpacked_args) + base_type_call = emit_dispatch_call(f, "self_", unpacked_args) if get_view_info(f) is not None or modifies_arguments(f): - guard = 'at::AutoDispatchBelowAutograd guard;' + guard = "at::AutoDispatchBelowAutograd guard;" else: - guard = 'at::AutoDispatchBelowADInplaceOrView guard;' + guard = "at::AutoDispatchBelowADInplaceOrView guard;" if not modifies_arguments(f) and not returns_void: call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( - base_type_call=base_type_call, tmp_var=TMP_VAR, guard=guard) + base_type_call=base_type_call, tmp_var=TMP_VAR, guard=guard + ) call += wrap_output(f, unpacked_bindings, TMP_VAR) else: call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( - base_type_call=base_type_call, guard=guard) + base_type_call=base_type_call, guard=guard + ) call = check_tensorimpl_and_storage(call, unpacked_bindings) return call def emit_history() -> str: - fn = 'rebase' if modifies_arguments(f) and view_info is None else 'set' + fn = "rebase" if modifies_arguments(f) and view_info is None else "set" output_names = [r.name for r in differentiable_outputs] # TODO: flatten allocates a std::vector, which could be expensive - outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=output_names) + outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute( + outs=output_names + ) return SET_HISTORY.substitute(fn=fn, differentiable_outputs=outs) def emit_save_outputs() -> str: if is_out_fn: # out functions don't currently support differentiation - return '' + return "" if info is not None and info.has_derivatives: stmts = save_variables(info.all_saved_outputs, True) if len(stmts) == 0: - return '' - return CONDITIONAL.substitute(cond='grad_fn', statements=stmts) - return '' + return "" + return CONDITIONAL.substitute(cond="grad_fn", statements=stmts) + return "" def emit_any_requires_grad() -> List[str]: - extra_condition = '' + extra_condition = "" if fn.info and fn.info.output_differentiability_conditions: assert len(fn.info.output_differentiability_conditions) == 1 - extra_condition = \ - f'_any_requires_grad &= 
({fn.info.output_differentiability_conditions[0]});' - return [SETUP_ANY_REQUIRES_GRAD.substitute( - args_with_derivatives=[arg.name for arg in args_with_derivatives], - extra_differentiability_conditions=extra_condition)] + extra_condition = f"_any_requires_grad &= ({fn.info.output_differentiability_conditions[0]});" + return [ + SETUP_ANY_REQUIRES_GRAD.substitute( + args_with_derivatives=[arg.name for arg in args_with_derivatives], + extra_differentiability_conditions=extra_condition, + ) + ] - def get_any_has_forward_grad_name(var_name: str) -> str: - return f'_any_has_forward_grad_{var_name}' + def get_any_has_forward_grad_name(var_names: Tuple[str, ...]) -> str: + if len(var_names) == 1: + return f"_any_has_forward_grad_{var_names[0]}" + else: + return f'_any_has_forward_grad_{"_".join(var_names)}' def emit_any_has_forward_grad() -> List[str]: content: List[str] = [] for derivative in fw_derivatives: assert derivative.required_inputs_fw_grad is not None - requires_fw_grad = " || ".join([FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) - for inp in differentiable_inputs if inp.name in derivative.required_inputs_fw_grad]) + requires_fw_grad = " || ".join( + [ + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + for inp in differentiable_inputs + if inp.name in derivative.required_inputs_fw_grad + ] + ) if not requires_fw_grad: # Handle functions like stack # For these, we don't unpack anything and always call the user function - if not (len(differentiable_inputs) == 1 and is_tensor_list_type(differentiable_inputs[0].type)): - raise RuntimeError(f'No differentiable input to "{name}" is a differentiable Tensor (as the provided' - 'forward AD formula does not use any input tangent) even though a forward gradient ' - 'formula has been defined for it. This case should only happen for function that ' - 'take a single TensorList as input. All other cases are not supported right now.') + if not ( + len(differentiable_inputs) == 1 + and is_tensor_list_type(differentiable_inputs[0].type) + ): + raise RuntimeError( + f'No differentiable input to "{name}" is a differentiable Tensor (as the provided ' + "forward AD formula does not use any input tangent) even though a forward gradient " + "formula has been defined for it. This case should only happen for function that " + "take a single TensorList as input. All other cases are not supported right now." 
+ ) requires_fw_grad = "true" if fn.info and fn.info.output_differentiability_conditions: assert len(fn.info.output_differentiability_conditions) == 1 - requires_fw_grad = \ - f'({fn.info.output_differentiability_conditions[0]}) && ({requires_fw_grad})' + requires_fw_grad = f"({fn.info.output_differentiability_conditions[0]}) && ({requires_fw_grad})" - content.append(f"auto {get_any_has_forward_grad_name(derivative.var_name)} = {requires_fw_grad};\n" - f"(void){get_any_has_forward_grad_name(derivative.var_name)};") + content.append( + f"auto {get_any_has_forward_grad_name(derivative.var_names)} = {requires_fw_grad};\n" + f"(void){get_any_has_forward_grad_name(derivative.var_names)};" + ) return content def emit_check_inplace() -> List[str]: if not inplace: return [] - return [f'check_inplace({arg.name}, _any_requires_grad);' for arg in differentiable_outputs] + return [ + f"check_inplace({arg.name}, _any_requires_grad);" + for arg in differentiable_outputs + ] def emit_fw_derivatives() -> List[str]: content: List[str] = [] fw_grad_setters: List[str] = [] for derivative in fw_derivatives: - res = derivative.var_name + res = derivative.var_names if f.func.name.name.inplace: + assert ( + len(res) == 1 + ), "Expected number of outputs to be 1 if function is inplace" # TODO update this when inplace namings are unified - res = "self" + res = ("self",) assert derivative.required_inputs_fw_grad is not None unpacked_arguments = "" for inp in differentiable_inputs: - zeros_fn = "zeros" if inplace and inp.name == "self" else "_efficientzerotensor" + zeros_fn = ( + "zeros" + if inplace and inp.name == "self" + else "_efficientzerotensor" + ) if inp.name in derivative.required_inputs_fw_grad: - unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp=inp.name, zeros_fn=zeros_fn) + unpacked_arguments += ( + FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute( + inp=inp.name, zeros_fn=zeros_fn + ) + ) if inp.name in (derivative.required_inputs_primal or []): - unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp=inp.name) + unpacked_arguments += ( + FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp=inp.name) + ) if derivative.required_original_self_value: - unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute(inp="original_self", zeros_fn=zeros_fn) - unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute(inp="original_self") + unpacked_arguments += FW_DERIVATIVE_DEFINED_GRAD_TEMPLATE.substitute( + inp="original_self", zeros_fn=zeros_fn + ) + unpacked_arguments += FW_DERIVATIVE_DEFINED_PRIMAL_TEMPLATE.substitute( + inp="original_self" + ) elif inplace and derivative.is_reusing_outplace_formula: # The gradient wasn't already cloned, do it if grad mode is enabled - unpacked_arguments += "self_t = GradMode::is_enabled() ? self_t.clone() : self_t;" + unpacked_arguments += ( + "self_t = GradMode::is_enabled() ? 
self_t.clone() : self_t;" + ) if inplace: is_inplace_str = "true" else: is_inplace_str = "false" - if isinstance(derivative.var_type, BaseType) and derivative.var_type.is_tensor_like(): + if all( + (isinstance(var_type, BaseType) and var_type.is_tensor_like()) + for var_type in derivative.var_types + ): # Is there a way to get from BaseType to BaseCType - opt_res_grad_type = OptionalCType(BaseCType(tensorT)).cpp_type() - fw_grad_setter = FW_DERIVATIVE_SETTER_TENSOR.substitute(out_arg=res, is_inplace=is_inplace_str) - elif isinstance(derivative.var_type, ListType) and derivative.var_type.is_tensor_like(): - opt_res_grad_type = OptionalCType(VectorCType(BaseCType(tensorT))).cpp_type() - fw_grad_setter = FW_DERIVATIVE_SETTER_TENSOR_LIST.substitute(out_arg=res, is_inplace=is_inplace_str) + if len(derivative.var_types) == 1: + opt_res_grad_type = OptionalCType(BaseCType(tensorT)).cpp_type() + fw_grad_setters.append( + FW_DERIVATIVE_SETTER_TENSOR.substitute( + out_arg=res[0], is_inplace=is_inplace_str + ) + ) + else: + tuple_type = TupleCType( + [BaseCType(tensorT)] * len(derivative.var_types) + ) + opt_res_grad_type = OptionalCType(tuple_type).cpp_type() + for idx, single_res in enumerate(res): + fw_grad_setters.append( + FW_DERIVATIVE_SETTER_MULTI_OUTPUT.substitute( + idx=idx, all_res="_".join(res), out_arg=single_res + ) + ) + elif ( + isinstance(derivative.var_types[0], ListType) + and derivative.var_types[0].is_tensor_like() + ): + assert ( + len(derivative.var_types) == 1 + ), "Expected number of outputs to be 1 if function returns ListType" + opt_res_grad_type = OptionalCType( + VectorCType(BaseCType(tensorT)) + ).cpp_type() + fw_grad_setters.append( + FW_DERIVATIVE_SETTER_TENSOR_LIST.substitute( + out_arg=res[0], is_inplace=is_inplace_str + ) + ) else: raise RuntimeError("Unsupported output type for forward derivative") - fw_grad_opt_definition = f"{opt_res_grad_type} {res}_new_fw_grad_opt = c10::nullopt;" + fw_grad_opt_definition = ( + f"{opt_res_grad_type} {'_'.join(res)}_new_fw_grad_opt = c10::nullopt;" + ) # View ops create fw_grad that already is a view of the base's fw_grad so just use that - content.append(FW_DERIVATIVE_TEMPLATE.substitute( - fw_grad_opt_definition=fw_grad_opt_definition, - requires_fw_grad=get_any_has_forward_grad_name(derivative.var_name), formula=derivative.formula, out_arg=res, - unpacked_arguments=unpacked_arguments)) - fw_grad_setters.append(fw_grad_setter) + content.append( + FW_DERIVATIVE_TEMPLATE.substitute( + fw_grad_opt_definition=fw_grad_opt_definition, + requires_fw_grad=get_any_has_forward_grad_name( + derivative.var_names + ), + formula=derivative.formula, + out_arg="_".join(res), + unpacked_arguments=unpacked_arguments, + ) + ) # Set all the grads at the end to avoid: https://github.com/pytorch/pytorch/issues/67367 - content.append('\n'.join(fw_grad_setters)) + content.append("\n".join(fw_grad_setters)) return content def emit_forbid_fw_derivatives(is_out_fn: bool = False) -> str: def get_msg() -> str: if is_out_fn: - msg = name + " (because it is an out= function)" + msg = "because it is an out= function" else: - msg = name + msg = ( + "because it has not been implemented yet.\\nPlease file an issue " + "to PyTorch at https://github.com/pytorch/pytorch/issues/new?template=feature-request.yml " + "so that we can prioritize its implementation." 
+ ) return msg + res = "" to_check: List[str] = [] - for inp in list(mapMaybe(gen_differentiable_input, - f.func.arguments.non_out + list(f.func.arguments.out))): # type: ignore[operator] + for inp in list( + mapMaybe( + gen_differentiable_input, + f.func.arguments.non_out + list(f.func.arguments.out), # type: ignore[operator] + ) + ): if is_tensor_type(inp.type): - to_check.append(FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name)) + to_check.append( + FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp=inp.name) + ) elif is_tensor_list_type(inp.type): cond = FW_DERIVATIVE_CHECK_TEMPLATE.substitute(req_inp="_t") - res += FW_DERIVATIVE_FORBID_LIST_TEMPLATE.substitute(arg=inp.name, cond=cond, msg=get_msg()) + res += FW_DERIVATIVE_FORBID_LIST_TEMPLATE.substitute( + arg=inp.name, cond=cond, name=name, msg=get_msg() + ) else: - raise RuntimeError(f'Unsupported input type for "{name}" when forbidding forward AD usage.') + raise RuntimeError( + f'Unsupported input type for "{name}" when forbidding forward AD usage.' + ) if len(to_check) > 0: cond = " || ".join(to_check) - res += FW_DERIVATIVE_FORBID_TEMPLATE.substitute(cond=cond, msg=get_msg()) + res += FW_DERIVATIVE_FORBID_TEMPLATE.substitute( + cond=cond, name=name, msg=get_msg() + ) return res body: List[str] = [] @@ -1000,12 +1514,15 @@ def get_msg() -> str: if len(fw_derivatives) == 0: body.append(emit_forbid_fw_derivatives()) else: - assert len(fw_derivatives) == len(differentiable_outputs), ( + assert sum( + len(derivative.var_names) for derivative in fw_derivatives + ) == len(differentiable_outputs), ( "Expected the number of forward derivatives implemented to match the " "number of differentiable outputs. NB: This only applies when at least " "one forward derivative is implemented. Not implementing any forward " "derivatives is also okay, and we would require inputs to the op to " - "not have associated tangents in that case.") + "not have associated tangents in that case." + ) if requires_derivative: # Save only after the forward AD has been set up @@ -1017,7 +1534,7 @@ def get_msg() -> str: # `reset_grad_accumulator` in an operator that's not `inplace`, you can # remove this assert but the code generation will get more elaborate assert inplace - body.append('reset_grad_accumulator(self);') + body.append("reset_grad_accumulator(self);") if not returns_void: - body.append(f'return {get_return_value(f)};') + body.append(f"return {get_return_value(f)};") return body diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index e62ab95c66d0..185a4cdcef49 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -1,44 +1,113 @@ # Parses derivatives.yaml into autograd functions # # Each autograd function is represented by `DifferentiabilityInfo` containing -# a list of `Derivative`. See `tools.codegen.api.autograd` for the data models. +# a list of `Derivative`. See `torchgen.api.autograd` for the data models. 
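The header comment above describes the input this file consumes: derivatives.yaml maps an operator schema to per-argument backward formulas. Below is a minimal hand-written entry, loaded with yaml for illustration only (the real file is a YAML list, the formulas here are examples, and each entry flows through create_differentiability_info further down).

import yaml

entry = yaml.safe_load("""
name: mul.Tensor(Tensor self, Tensor other) -> Tensor
self: grad * other.conj()
other: grad * self.conj()
""")
specification = entry.pop("name")  # the full schema string, popped off first
formulas = entry                   # remaining keys map input names to backward formulas
print(specification)
print(formulas)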
from collections import defaultdict import re from typing import Counter, Sequence, Any, Tuple, List, Set, Dict, Match, Optional import yaml -from tools.codegen.api.autograd import (Derivative, DifferentiabilityInfo, - SavedAttribute, ForwardDerivative) -from tools.codegen.api.types import (Binding, CppSignatureGroup, NamedCType, BaseCType, VectorCType, - intArrayRefT, tensorOptionsT, typeAndSizeT, longT, boolT, - tensorGeometryT, scalarTypeT, SpecialArgName, - OptionalCType, stringT) -from tools.codegen.api import cpp -from tools.codegen.gen import parse_native_yaml -from tools.codegen.context import with_native_function -from tools.codegen.model import FunctionSchema, NativeFunction, Variant, Type -from tools.codegen.utils import IDENT_REGEX, split_name_params, YamlLoader +from torchgen.api.autograd import ( + Derivative, + DifferentiabilityInfo, + SavedAttribute, + ForwardDerivative, +) +from torchgen.api.types import ( + Binding, + CppSignatureGroup, + NamedCType, + BaseCType, + VectorCType, + intArrayRefT, + tensorOptionsT, + typeAndSizeT, + longT, + boolT, + layoutT, + tensorGeometryT, + scalarTypeT, + SpecialArgName, + OptionalCType, + stringT, +) +from torchgen.api import cpp +from torchgen.gen import parse_native_yaml, get_grouped_by_view_native_functions +from torchgen.context import with_native_function +from torchgen.model import ( + FunctionSchema, + NativeFunction, + Variant, + Type, + NativeFunctionsViewGroup, + OperatorName, +) +from torchgen.utils import IDENT_REGEX, split_name_params, YamlLoader, concatMap _GLOBAL_LOAD_DERIVATIVE_CACHE = {} -def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Sequence[DifferentiabilityInfo]: +# This function directly adds derivative entries for {view}_copy variants of each view op. +# Since every {view} and {view}_copy op shares the same derivative formula, +# we generate them here instead of duplicating them in the yaml. +# See Note [Codegen'd {view}_copy Operators] +def add_view_copy_derivatives( + infos: List[DifferentiabilityInfo], view_groups: List[NativeFunctionsViewGroup] +) -> List[DifferentiabilityInfo]: + # Get the map from each view op's name to its corresponding view group + view_name_to_group: Dict[OperatorName, NativeFunctionsViewGroup] = { + g.view.func.name: g for g in view_groups + } + + view_copy_differentiability_infos = [] + for info in infos: + maybe_view_group = view_name_to_group.get(info.func.func.name, None) + if maybe_view_group is not None and maybe_view_group.view_copy is not None: + view_copy_info = info.create_view_copy_from_view_derivative( + maybe_view_group + ) + if view_copy_info is not None: + view_copy_differentiability_infos.append(view_copy_info) + + return view_copy_differentiability_infos + + +def load_derivatives( + derivatives_yaml_path: str, native_yaml_path: str, tags_yaml_path: str +) -> Sequence[DifferentiabilityInfo]: # Do some caching as this is a deterministic function global _GLOBAL_LOAD_DERIVATIVE_CACHE key = (derivatives_yaml_path, native_yaml_path) if key not in _GLOBAL_LOAD_DERIVATIVE_CACHE: - with open(derivatives_yaml_path, 'r') as f: + with open(derivatives_yaml_path, "r") as f: definitions = yaml.load(f, Loader=YamlLoader) - functions = parse_native_yaml(native_yaml_path).native_functions + funcs = parse_native_yaml(native_yaml_path, tags_yaml_path).native_functions + # From the parsed native functions, separate out the (generated) view_copy functions, + # so we can generate derivatives for them separately. 
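The mapping built above, from a view op's name to its view group, is what lets every generated {view}_copy operator reuse the view op's derivative formula. A rough sketch of that idea with plain dicts and strings standing in for the real torchgen types (NativeFunctionsViewGroup, DifferentiabilityInfo); the names and formula below are illustrative only.

def add_view_copy_infos(view_infos, view_ops_with_copy_variant):
    # view_infos: {view op name: its derivative info}
    # view_ops_with_copy_variant: view ops for which a {view}_copy op is generated
    out = {}
    for name, info in view_infos.items():
        if name in view_ops_with_copy_variant:
            out[name + "_copy"] = info  # same formula, registered under the _copy name
    return out

print(add_view_copy_infos(
    {"expand": "self: at::sum_to(grad, self.sizes())"},
    {"expand", "squeeze"},
))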
+ native_functions_with_view_groups = get_grouped_by_view_native_functions(funcs) + native_functions_without_view_copies = concatMap( + # We need to pull out the view_inplace ops too, since they might have their own derivative entries. + lambda g: [g] + if isinstance(g, NativeFunction) + else list(g.functions(include_copy=False)), + native_functions_with_view_groups, + ) + view_groups = [ + g + for g in native_functions_with_view_groups + if isinstance(g, NativeFunctionsViewGroup) + ] # What's the difference between function schema v.s. signature? # function schema is the complete declaration including mutability annotation / default value and etc. # signature is the canonical schema for a group of functions (in-place/out/functional variants) # that are semantically related. - functions_by_signature: Dict[FunctionSchema, List[NativeFunction]] = defaultdict(list) + functions_by_signature: Dict[ + FunctionSchema, List[NativeFunction] + ] = defaultdict(list) functions_by_schema: Dict[str, NativeFunction] = dict() - for function in functions: + for function in native_functions_without_view_copies: functions_by_signature[function.func.signature()].append(function) assert str(function.func) not in functions_by_schema functions_by_schema[str(function.func)] = function @@ -48,38 +117,56 @@ def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Seque op_counter = Counter[str]() infos = [ - create_differentiability_info(defn, functions_by_signature, functions_by_schema, op_counter) - for defn in definitions] + create_differentiability_info( + defn, functions_by_signature, functions_by_schema, op_counter + ) + for defn in definitions + ] + infos += add_view_copy_derivatives(infos, view_groups) _GLOBAL_LOAD_DERIVATIVE_CACHE[key] = infos return _GLOBAL_LOAD_DERIVATIVE_CACHE[key] + @with_native_function def cpp_arguments(f: NativeFunction) -> Sequence[Binding]: return CppSignatureGroup.from_native_function(f, method=False).signature.arguments() -def create_derivative(f: NativeFunction, formula: str, var_names: Tuple[str, ...], - available_named_gradients: Sequence[str]) -> Derivative: + +def create_derivative( + f: NativeFunction, + formula: str, + var_names: Tuple[str, ...], + available_named_gradients: Sequence[str], +) -> Derivative: original_formula = formula - arguments: List[NamedCType] = [a.nctype.remove_const_ref() for a in cpp_arguments(f)] + arguments: List[NamedCType] = [ + a.nctype.remove_const_ref() for a in cpp_arguments(f) + ] - return_names = tuple(n if n != 'self' else 'result' for n in cpp.return_names(f)) + return_names = tuple(n if n != "self" else "result" for n in cpp.return_names(f)) return_types = tuple(cpp.return_type(r).remove_const_ref() for r in f.func.returns) - named_returns = [NamedCType(name, type) for name, type in zip(return_names, return_types)] + named_returns = [ + NamedCType(name, type) for name, type in zip(return_names, return_types) + ] formula, saved_inputs = saved_variables(formula, arguments, var_names) formula, saved_outputs = saved_variables(formula, named_returns, var_names) - used_named_gradients = {name for name in available_named_gradients if re.search(IDENT_REGEX.format(name), formula)} + used_named_gradients = { + name + for name in available_named_gradients + if re.search(IDENT_REGEX.format(name), formula) + } # Check that the referenced derivatives in the formula are in bounds for i in used_gradient_indices(formula): if i >= len(f.func.returns): raise RuntimeError( - f'Out of bounds grads access: derivative formula for 
{cpp.name(f.func)} ' - f'used grads[{i}], but the forward only returns {len(f.func.returns)} outputs.' + f"Out of bounds grads access: derivative formula for {cpp.name(f.func)} " + f"used grads[{i}], but the forward only returns {len(f.func.returns)} outputs." ) return Derivative( @@ -91,34 +178,43 @@ def create_derivative(f: NativeFunction, formula: str, var_names: Tuple[str, ... named_gradients=used_named_gradients, ) -def create_forward_derivative(f: NativeFunction, formula: str, names: Tuple[str, ...]) -> ForwardDerivative: - assert len(names) == 1, "Forward derivatives can define gradients for only one output at a time" - var_name = names[0] - var_type: Optional[Type] = None + +def create_forward_derivative( + f: NativeFunction, formula: str, names: Tuple[str, ...] +) -> ForwardDerivative: + var_names = names + var_types: Optional[Tuple[Type, ...]] = None for r in f.func.returns: - if r.name == var_name: - var_type = r.type - break + if r.name in var_names: + if var_types is None: + var_types = tuple() + var_types = var_types + (r.type,) + # Handle default return names - if var_type is None: - if var_name == "result": + if var_types is None: + if var_names == ("result",): assert len(f.func.returns) == 1 - var_type = f.func.returns[0].type + var_types = (f.func.returns[0].type,) else: - res = re.findall(r"^result(\d+)$", var_name) - if len(res) == 1: - arg_idx = int(res[0]) - var_type = f.func.returns[arg_idx].type - - assert var_type is not None, "No matching output for forward derivative definition" + for var_name in var_names: + res = re.findall(r"^result(\d+)$", var_name) + if len(res) == 1: + if var_types is None: + var_types = tuple() + arg_idx = int(res[0]) + var_types = var_types + (f.func.returns[arg_idx].type,) + + assert var_types is not None, "No matching output for forward derivative definition" return ForwardDerivative( formula=formula, - var_name=var_name, - var_type=var_type, + var_names=var_names, + var_types=var_types, required_inputs_fw_grad=None, required_inputs_primal=None, required_original_self_value=False, - is_reusing_outplace_formula=False) + is_reusing_outplace_formula=False, + ) + def postprocess_forward_derivatives( f: NativeFunction, @@ -126,22 +222,23 @@ def postprocess_forward_derivatives( all_arg_names: List[str], derivatives: List[Derivative], forward_derivatives: List[ForwardDerivative], - args_with_derivatives: Sequence[Binding] + args_with_derivatives: Sequence[Binding], ) -> List[ForwardDerivative]: - def find_required_inputs(formula: str, postfix: str) -> Tuple[str, ...]: required_inputs = set() for arg in args_with_derivatives: - if arg.type == 'at::TensorList': + if arg.type == "at::TensorList": # The functions taking TensorList handle everything internally continue arg_name = arg.name found = re.search(IDENT_REGEX.format(arg_name), formula) if found: - raise RuntimeError(f"The forward formula for {defn_name} is using the base name of the {arg_name} " - f"argument which is ambiguous. You should use {arg_name}_p to access the primal " - f"value and {arg_name}_t to access the tangent.") + raise RuntimeError( + f"The forward formula for {defn_name} is using the base name of the {arg_name} " + f"argument which is ambiguous. You should use {arg_name}_p to access the primal " + f"value and {arg_name}_t to access the tangent." 
+ ) found = re.search(IDENT_REGEX.format(arg_name + postfix), formula) if found: @@ -155,15 +252,23 @@ def find_required_inputs(formula: str, postfix: str) -> Tuple[str, ...]: formula = defn.formula required_inputs_tangent = find_required_inputs(formula, "_t") if formula == "auto_element_wise": - if (not len(args_with_derivatives) == 1) or len(forward_derivatives) > 1: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml defines the " - "forward definition of gradient as element_wise but this only " - "works for functions with a single differentiable input and a " - "single differentiable output.") + if ( + (not len(args_with_derivatives) == 1) + or len(forward_derivatives) > 1 + or len(forward_derivatives[0].var_names) > 1 + ): + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml defines the " + "forward definition of gradient as element_wise but this only " + "works for functions with a single differentiable input and a " + "single differentiable output." + ) if not len(derivatives) == 1: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml defines the " - "forward definition of gradient as element_wise but it does not " - "defines the gradient formula for its argument which is required.") + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml defines the " + "forward definition of gradient as element_wise but it does not " + "defines the gradient formula for its argument which is required." + ) # This transformation is based on the observation that for element-wise functions, the Jacobian # matrix is diagonal and thus doing J * v is the same as (v^T J)^T (in practice, we ignore the transpositions) # For the complex case, we use hermitian transpose and get (v.conj() J).conj() @@ -182,6 +287,7 @@ def find_required_inputs(formula: str, postfix: str) -> Tuple[str, ...]: # Do replacement 1) of the grad def repl(m: Any) -> str: return f"{m.group(1)}{input_name}_t.conj(){m.group(2)}" + fw_formula = re.sub(IDENT_REGEX.format("grad"), repl, backward_formula) # Do replacement 2) of the input variables @@ -190,6 +296,7 @@ def repl(m: Any) -> str: def repl(m: Any) -> str: return f"{m.group(1)}{arg_name}_p{m.group(2)}" + fw_formula = re.sub(IDENT_REGEX.format(arg_name), repl, fw_formula) # Do the final conjugate 3) @@ -200,10 +307,15 @@ def repl(m: Any) -> str: required_inputs_tangent = tuple(all_arg_names) formula = fw_formula elif formula == "auto_linear": - if len(forward_derivatives) > 1: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml defines the " - "forward definition of gradient as linear but this only works " - "for functions with a single differentiable output.") + if ( + len(forward_derivatives) > 1 + or len(forward_derivatives[0].var_names) > 1 + ): + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml defines the " + "forward definition of gradient as linear but this only works " + "for functions with a single differentiable output." + ) # This transformation is based on the observation that linear functions can be written as: # y = f(x) = A * x # For some matrix A and the Jacobian of the function f is also A. 
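The auto_element_wise branch above spells out the trick: for an element-wise op the Jacobian is diagonal, so the forward-mode formula can be recovered from the backward formula by 1) substituting the (conjugated) input tangent for grad, 2) renaming the input to its primal, and 3) conjugating the whole expression. A self-contained sketch of those three rewrites; IDENT is a simplified stand-in for IDENT_REGEX and the backward formula is just an example.

import re

IDENT = r"(^|\W){}($|\W)"  # simplified stand-in for torchgen's IDENT_REGEX

def auto_element_wise(backward_formula, input_name):
    # 1) grad -> <input>_t.conj()
    fw = re.sub(IDENT.format("grad"),
                lambda m: f"{m.group(1)}{input_name}_t.conj(){m.group(2)}",
                backward_formula)
    # 2) <input> -> <input>_p
    fw = re.sub(IDENT.format(input_name),
                lambda m: f"{m.group(1)}{input_name}_p{m.group(2)}",
                fw)
    # 3) conjugate the result
    return f"({fw}).conj()"

print(auto_element_wise("grad * self.sgn()", "self"))
# -> (self_t.conj() * self_p.sgn()).conj()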
@@ -229,7 +341,9 @@ def repl(m: Any) -> str: fw_formula = "at::{}({})".format(defn_name, ", ".join(new_args)) else: assert Variant.method in f.variants - fw_formula = "{}.{}({})".format(new_args[0], defn_name, ", ".join(new_args[1:])) + fw_formula = "{}.{}({})".format( + new_args[0], defn_name, ", ".join(new_args[1:]) + ) # All of the input tangents are always used so all of them are required here. required_inputs_tangent = tuple(diff_arg_names) @@ -241,26 +355,31 @@ def repl(m: Any) -> str: # This call inspects the formula to find for which input's primal are used. required_inputs_primal = find_required_inputs(formula, "_p") - updated_derivatives.append(ForwardDerivative( - formula=formula, - var_name=defn.var_name, - var_type=defn.var_type, - required_inputs_fw_grad=required_inputs_tangent, - required_inputs_primal=required_inputs_primal, - required_original_self_value=False, - is_reusing_outplace_formula=False)) + updated_derivatives.append( + ForwardDerivative( + formula=formula, + var_names=defn.var_names, + var_types=defn.var_types, + required_inputs_fw_grad=required_inputs_tangent, + required_inputs_primal=required_inputs_primal, + required_original_self_value=False, + is_reusing_outplace_formula=False, + ) + ) return updated_derivatives -def is_forward_derivative_definition(all_arg_names: List[str], names: Tuple[str, ...]) -> bool: - if len(names) > 1: - # Forward definition are always for a single output at a time - return False - name = names[0] - if name not in all_arg_names: - return True - else: - return False + +def is_forward_derivative_definition( + all_arg_names: List[str], names: Tuple[str, ...] +) -> bool: + for name in names: + if name not in all_arg_names: + return True + else: + return False + raise RuntimeError("Expected `names` to be non-empty") + def create_differentiability_info( defn: Dict[Any, Any], @@ -270,17 +389,19 @@ def create_differentiability_info( ) -> DifferentiabilityInfo: """Processes a single entry `defn` in derivatives.yaml""" - def canonical_function(functions: Sequence[NativeFunction], name: str) -> NativeFunction: + def canonical_function( + functions: Sequence[NativeFunction], name: str + ) -> NativeFunction: for f in functions: if cpp.name(f.func) == name: return f # some functions only have in-place variants - assert name + '_' == cpp.name(functions[0].func) + assert name + "_" == cpp.name(functions[0].func) return functions[0] def split_names(raw_names: str) -> Tuple[str, ...]: """Given "foo, bar", return ["foo", "bar"].""" - return tuple(x.strip() for x in raw_names.split(',')) + return tuple(x.strip() for x in raw_names.split(",")) def check_grad_usage(defn_name: str, derivatives: Sequence[Derivative]) -> None: """ @@ -289,14 +410,16 @@ def check_grad_usage(defn_name: str, derivatives: Sequence[Derivative]) -> None: used with double backwards. 
""" - uses_grad = False # true if any derivative uses "grad" - num_grads_uses = 0 # count of uses of "grads" or "grads[INDEX]" - uses_named_grads = False # true if any derivative uses "grad_{name}" + uses_grad = False # true if any derivative uses "grad" + num_grads_uses = 0 # count of uses of "grads" or "grads[INDEX]" + uses_named_grads = False # true if any derivative uses "grad_{name}" used_grads_indices: List[int] = [] # which indices of grads are used for d in derivatives: formula = d.formula - uses_grad = uses_grad or bool(re.findall(IDENT_REGEX.format('grad'), formula)) - num_grads_uses += len(re.findall(IDENT_REGEX.format('grads'), formula)) + uses_grad = uses_grad or bool( + re.findall(IDENT_REGEX.format("grad"), formula) + ) + num_grads_uses += len(re.findall(IDENT_REGEX.format("grads"), formula)) uses_named_grads = uses_named_grads or bool(d.named_gradients) used_grads_indices.extend(used_gradient_indices(formula)) # This is a basic sanity check: the number of places we see @@ -309,26 +432,32 @@ def check_grad_usage(defn_name: str, derivatives: Sequence[Derivative]) -> None: only_used_grads_indices = num_grads_uses == len(used_grads_indices) if uses_grad and num_grads_uses > 0: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml illegally " - "mixes use of 'grad' and 'grads'. Consider replacing " - "occurrences of 'grad' with 'grads[0]'") + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml illegally " + "mixes use of 'grad' and 'grads'. Consider replacing " + "occurrences of 'grad' with 'grads[0]'" + ) if only_used_grads_indices and set(used_grads_indices) == {0}: - raise RuntimeError(f"Derivative definition of {defn_name} in derivatives.yaml solely " - "refers to 'grads[0]'. If the first output is indeed the " - "only differentiable output, replace 'grads[0]' with 'grad'; " - "otherwise, there is a likely error in your derivatives " - "declaration.") + raise RuntimeError( + f"Derivative definition of {defn_name} in derivatives.yaml solely " + "refers to 'grads[0]'. If the first output is indeed the " + "only differentiable output, replace 'grads[0]' with 'grad'; " + "otherwise, there is a likely error in your derivatives " + "declaration." + ) if uses_named_grads and (uses_grad or num_grads_uses > 0): raise RuntimeError( - f'Derivative definition of {defn_name} in derivatives.yaml illegally ' + f"Derivative definition of {defn_name} in derivatives.yaml illegally " 'mixes use of "grad_RETURN_NAME" and "grad" or "grads[x]". Use ' - 'only one method for identifying gradients.') - + "only one method for identifying gradients." + ) @with_native_function - def set_up_derivatives(f: NativeFunction) -> Tuple[ + def set_up_derivatives( + f: NativeFunction, + ) -> Tuple[ Sequence[Derivative], Sequence[ForwardDerivative], Sequence[Binding], @@ -342,7 +471,9 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ args_with_derivatives_set: Set[str] = set() all_arg_names = [a.name for a in cpp_arguments(f)] - + all_ret_names = [ + r.name for r in f.func.returns + ] # only used for the assert below # output_differentiability is captured from the enclosed # scope. Don't modify it. # @@ -355,72 +486,104 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ differentiability = output_differentiability or [True] * len(f.func.returns) # A return is available as a named gradient ... 
available_named_gradients = [ - f'grad_{ret.name}' for ret, differentiable in zip(f.func.returns, differentiability) + f"grad_{ret.name}" + for ret, differentiable in zip(f.func.returns, differentiability) # if it has not been explicitly made undifferentiable if differentiable # and if it has a name and ret.name is not None # and if its type is differentiable - and ret.type.is_tensor_like()] + and ret.type.is_tensor_like() + ] for raw_names in sorted(defn.keys()): formula = defn[raw_names] names = split_names(raw_names) + for name in names: + assert not (name in all_arg_names and name in all_ret_names), ( + f"While processing the derivative formula for '{f.func.name}' wrt '{name}', " + f"expected '{name}' to not be both an input arg and named return. " + ) + if is_forward_derivative_definition(all_arg_names, names): forward_derivatives.append(create_forward_derivative(f, formula, names)) else: - if formula.lower().strip() == 'non_differentiable': + if formula.lower().strip() == "non_differentiable": non_differentiable_arg_names += names else: - derivative = create_derivative(f, formula, names, - available_named_gradients) + derivative = create_derivative( + f, formula, names, available_named_gradients + ) derivatives.append(derivative) args_with_derivatives_set |= set(names) overlap = args_with_derivatives_set.intersection(non_differentiable_arg_names) if overlap: - raise RuntimeError(f'derivatives definition for {defn} have overlapped non_differentiable ' - f'and differentiable variables: {overlap}') + raise RuntimeError( + f"derivatives definition for {defn} have overlapped non_differentiable " + f"and differentiable variables: {overlap}" + ) # Next, let us determine the list of inputs in order. # TODO: do we need eagerly calculate and save it here? Can it be derived # from NativeFunction and `derivatives` on callsites instead? - args_with_derivatives = [a for a in cpp_arguments(f) if a.name in args_with_derivatives_set] + args_with_derivatives = [ + a for a in cpp_arguments(f) if a.name in args_with_derivatives_set + ] # Postprocess forward derivatives definitions now that we know the differentiable arguments - forward_derivatives = postprocess_forward_derivatives(f, defn_name, all_arg_names, derivatives, - forward_derivatives, args_with_derivatives) + forward_derivatives = postprocess_forward_derivatives( + f, + defn_name, + all_arg_names, + derivatives, + forward_derivatives, + args_with_derivatives, + ) # Test to see if the use of 'grads' makes sense. check_grad_usage(defn_name, derivatives) - return (derivatives, forward_derivatives, args_with_derivatives, - non_differentiable_arg_names, available_named_gradients) + return ( + derivatives, + forward_derivatives, + args_with_derivatives, + non_differentiable_arg_names, + available_named_gradients, + ) # NB: Removes 'name' from defn dictionary - specification = defn.pop('name') + specification = defn.pop("name") defn_name, _ = split_name_params(specification) # NB: Removes 'output_differentiability' from defn dictionary # `None` means all differentiable. 
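As a concrete illustration of the shapes handled above: each entry carries a schema under name, and every other key is a comma-separated list of the variables a formula covers. split_names below is copied from the helper above; split_defn_name is a hypothetical simplification of split_name_params, shown only to make the example self-contained.

def split_names(raw_names):
    # "foo, bar" -> ("foo", "bar"), as in split_names above
    return tuple(x.strip() for x in raw_names.split(","))

def split_defn_name(specification):
    # Hypothetical: keep the operator name in front of the parameter list
    return specification.split("(", 1)[0]

print(split_names("self, other"))                     # ('self', 'other')
print(split_defn_name("abs(Tensor self) -> Tensor"))  # abs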
- output_differentiability = defn.pop('output_differentiability', None) + output_differentiability = defn.pop("output_differentiability", None) output_differentiability_conditions = None - if output_differentiability and any([isinstance(diff, str) for diff in output_differentiability]): + if output_differentiability and any( + [isinstance(diff, str) for diff in output_differentiability] + ): if len(output_differentiability) != 1: - raise RuntimeError(f'Not supported: for {specification},' - f'output_differentiability must either be ' - f'List[bool] or a List[str] where each str is a ' - f'condition. In the case where it is a condition, ' - f'we only support single-output functions. ' - f'Please file us an issue. ') + raise RuntimeError( + f"Not supported: for {specification}," + f"output_differentiability must either be " + f"List[bool] or a List[str] where each str is a " + f"condition. In the case where it is a condition, " + f"we only support single-output functions. " + f"Please file us an issue. " + ) output_differentiability_conditions = output_differentiability output_differentiability = [True] schema_function = functions_by_schema.get(specification) if not schema_function: - avail = '\n'.join(k for k, v in functions_by_schema.items() if cpp.name(v.func) == defn_name) - raise RuntimeError(f'could not find ATen function for schema: {specification} ' - f'. Available signatures:\n{avail}') + avail = "\n".join( + k for k, v in functions_by_schema.items() if cpp.name(v.func) == defn_name + ) + raise RuntimeError( + f"could not find ATen function for schema: {specification} " + f". Available signatures:\n{avail}" + ) # now map this to the legacy schema; this isn't technically necessary, but we'd need some logic here # to map in-place schemas to the out-of-place variants. @@ -428,24 +591,39 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ signature = schema_function.func.signature() functions = functions_by_signature[signature] if len(functions) == 0: - avail = '\n'.join(str(k) for k, v in functions_by_signature.items() if cpp.name(k) == defn_name) - raise RuntimeError(f'could not find ATen function for legacy signature: {signature} ' - f'corresponding to schema {specification}. Please report a bug to PyTorch. ' - f'Available signatures:\n{avail}') + avail = "\n".join( + str(k) + for k, v in functions_by_signature.items() + if cpp.name(k) == defn_name + ) + raise RuntimeError( + f"could not find ATen function for legacy signature: {signature} " + f"corresponding to schema {specification}. Please report a bug to PyTorch. " + f"Available signatures:\n{avail}" + ) canonical = canonical_function(functions, defn_name) - if 'grad_input_mask' in (a.name for a in cpp_arguments(canonical)): - raise RuntimeError(f"Schema for {defn_name} has an argument named grad_input_mask, " - "but this name would be shadowed by our codegen. " - "Please use a different name in native_functions.yaml.") - - if 'result' in (a.name for a in cpp_arguments(canonical)): - raise RuntimeError(f"Schema for {defn_name} has an argument named result, " - "but this is only allowed for outputs." - "Please use a different name in native_functions.yaml.") - - (derivatives, forward_derivatives, args_with_derivatives, - non_differentiable_arg_names, available_named_gradients) = set_up_derivatives(canonical) + if "grad_input_mask" in (a.name for a in cpp_arguments(canonical)): + raise RuntimeError( + f"Schema for {defn_name} has an argument named grad_input_mask, " + "but this name would be shadowed by our codegen. 
" + "Please use a different name in native_functions.yaml." + ) + + if "result" in (a.name for a in cpp_arguments(canonical)): + raise RuntimeError( + f"Schema for {defn_name} has an argument named result, " + "but this is only allowed for outputs." + "Please use a different name in native_functions.yaml." + ) + + ( + derivatives, + forward_derivatives, + args_with_derivatives, + non_differentiable_arg_names, + available_named_gradients, + ) = set_up_derivatives(canonical) used_named_gradients: Set[str] = set() for d in derivatives: @@ -455,7 +633,7 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ op = None if args_with_derivatives: op_prefix = _create_op_prefix(defn_name) - op = f'{op_prefix}{op_counter[op_prefix]}' + op = f"{op_prefix}{op_counter[op_prefix]}" op_counter[op_prefix] += 1 return DifferentiabilityInfo( @@ -474,7 +652,9 @@ def set_up_derivatives(f: NativeFunction) -> Tuple[ output_differentiability_conditions=output_differentiability_conditions, ) -GRAD_INDEX_REGEX = r'(?:^|\W)grads\[(\d+)\]' + +GRAD_INDEX_REGEX = r"(?:^|\W)grads\[(\d+)\]" + def used_gradient_indices(formula: str) -> List[int]: """Determine a list of gradient indices (the i in grads[i]) that @@ -485,106 +665,167 @@ def used_gradient_indices(formula: str) -> List[int]: """ return [int(i) for i in re.findall(GRAD_INDEX_REGEX, formula)] + def saved_variables( formula: str, nctypes: List[NamedCType], var_names: Tuple[str, ...], ) -> Tuple[str, Tuple[SavedAttribute, ...]]: - def stride_expr(name: str) -> str: assert var_names == (name,), ( 'Replacement for ".strides()" is currently only supported for single derivatives of the same tensor ' - 'that ".strides()" is being called on.') + 'that ".strides()" is being called on.' + ) return f'strides_or_error({name}, "{name}")' REPLACEMENTS: List[Tuple[str, Dict[str, Any]]] = [ # replace self.sizes() with self_sizes - (r'{}.sizes\(\)', { - 'suffix': '_sizes', - 'nctype': lambda name: NamedCType(name, BaseCType(intArrayRefT)), - }), + ( + r"{}.sizes\(\)", + { + "suffix": "_sizes", + "nctype": lambda name: NamedCType(name, BaseCType(intArrayRefT)), + }, + ), # replace self->sizes() with self_sizes_opt - (r'{}->sizes\(\)', { - 'suffix': '_sizes_opt', - 'nctype': lambda name: NamedCType(name, OptionalCType(BaseCType(intArrayRefT))), - 'expr': lambda name: f'{name}.has_value() ? c10::optional({name}->sizes()) : c10::nullopt', - }), + ( + r"{}->sizes\(\)", + { + "suffix": "_sizes_opt", + "nctype": lambda name: NamedCType( + name, OptionalCType(BaseCType(intArrayRefT)) + ), + "expr": lambda name: f"{name}.has_value() ? 
c10::optional({name}->sizes()) : c10::nullopt", + }, + ), # replace self.options() with self_options - (r'{}.options\(\)', { - 'suffix': '_options', - 'nctype': lambda name: NamedCType(name, BaseCType(tensorOptionsT)), - }), + ( + r"{}.options\(\)", + { + "suffix": "_options", + "nctype": lambda name: NamedCType(name, BaseCType(tensorOptionsT)), + }, + ), # replace zeros_like(self) with self_info - (r'zeros_like\({}\)', { - 'suffix': '_info', - 'nctype': lambda name: NamedCType(name, BaseCType(typeAndSizeT)), - 'expr': lambda name: name, # at save-time - 'res': lambda name: name + '_info.zeros()', # at eval-time - }), + ( + r"zeros_like\({}\)", + { + "suffix": "_info", + "nctype": lambda name: NamedCType(name, BaseCType(typeAndSizeT)), + "expr": lambda name: name, # at save-time + "res": lambda name: name + "_info.zeros()", # at eval-time + }, + ), # replace self.size(2) with self_size_2 - (r'{}.size\((\w+)\)', { - 'suffix': lambda m: '_argsize_{}'.format(*m.groups()), - 'nctype': lambda name: NamedCType(name, BaseCType(longT)), - }), + ( + r"{}.size\((\w+)\)", + { + "suffix": lambda m: "_argsize_{}".format(*m.groups()), + "nctype": lambda name: NamedCType(name, BaseCType(longT)), + }, + ), # replace self.numel() with self_numel - (r'{}.numel\(\)', { - 'suffix': '_numel', - 'nctype': lambda name: NamedCType(name, BaseCType(longT)), - }), + ( + r"{}.numel\(\)", + { + "suffix": "_numel", + "nctype": lambda name: NamedCType(name, BaseCType(longT)), + }, + ), # replace to_args_sizes(self) with self_args_sizes - (r'to_args_sizes\({}\)', { - 'suffix': '_args_sizes', - 'nctype': lambda name: NamedCType(name, VectorCType(VectorCType(BaseCType(longT)))), - }), + ( + r"to_args_sizes\({}\)", + { + "suffix": "_args_sizes", + "nctype": lambda name: NamedCType( + name, VectorCType(VectorCType(BaseCType(longT))) + ), + }, + ), # replace to_args_scalartypes(self) with self_args_scalartypes - (r'to_args_scalartypes\({}\)', { - 'suffix': '_args_scalartypes', - 'nctype': lambda name: NamedCType(name, VectorCType(BaseCType(scalarTypeT))), - }), + ( + r"to_args_scalartypes\({}\)", + { + "suffix": "_args_scalartypes", + "nctype": lambda name: NamedCType( + name, VectorCType(BaseCType(scalarTypeT)) + ), + }, + ), # replace TensorGeometry(self) with self_geometry - (r'TensorGeometry\({}\)', { - 'suffix': '_geometry', - 'nctype': lambda name: NamedCType(name, BaseCType(tensorGeometryT)), - }), - (r'{}.scalar_type\(\)', { - 'suffix': '_scalar_type', - 'nctype': lambda name: NamedCType(name, BaseCType(scalarTypeT)), - }), + ( + r"TensorGeometry\({}\)", + { + "suffix": "_geometry", + "nctype": lambda name: NamedCType(name, BaseCType(tensorGeometryT)), + }, + ), + ( + r"{}.scalar_type\(\)", + { + "suffix": "_scalar_type", + "nctype": lambda name: NamedCType(name, BaseCType(scalarTypeT)), + }, + ), # replace self.dim() with self_dim - (r'{}.dim\(\)', { - 'suffix': '_dim', - 'nctype': lambda name: NamedCType(name, BaseCType(longT)), - }), + ( + r"{}.dim\(\)", + { + "suffix": "_dim", + "nctype": lambda name: NamedCType(name, BaseCType(longT)), + }, + ), # replace self.strides() with self_strides - (r'{}.strides\(\)', { - 'suffix': '_strides', - 'nctype': lambda name: NamedCType(name, BaseCType(intArrayRefT)), - 'expr': stride_expr, - }), + ( + r"{}.strides\(\)", + { + "suffix": "_strides", + "nctype": lambda name: NamedCType(name, BaseCType(intArrayRefT)), + "expr": stride_expr, + }, + ), + # replace self.layout() with self_layout + ( + r"{}.layout\(\)", + { + "suffix": "_layout", + "nctype": lambda name: 
NamedCType(name, BaseCType(layoutT)), + }, + ), # replace self.is_conj() with self_conjugate - (r'{}.is_conj\(\)', { - 'suffix': '_conjugate', - 'nctype': lambda name: NamedCType(name, BaseCType(boolT)), - }) + ( + r"{}.is_conj\(\)", + { + "suffix": "_conjugate", + "nctype": lambda name: NamedCType(name, BaseCType(boolT)), + }, + ), ] # find which arguments need to be saved saved: List[SavedAttribute] = [] for nctype in nctypes: - name = nctype.name.name if isinstance(nctype.name, SpecialArgName) else nctype.name + name = ( + nctype.name.name if isinstance(nctype.name, SpecialArgName) else nctype.name + ) # First search the formula for expressions which can be evaluated # when the autograd Function is created to avoid saving variables for regex, info in REPLACEMENTS: + def repl(m: Match[str]) -> str: - suffix: str = info['suffix'](m) if callable(info['suffix']) else info['suffix'] - expr: str = info['expr'](name) if 'expr' in info else m.group(0) - saved.append(SavedAttribute( - nctype=info['nctype'](name + suffix), - expr=expr, - )) - if 'res' in info: - replacement: str = info['res'](name) + suffix: str = ( + info["suffix"](m) if callable(info["suffix"]) else info["suffix"] + ) + expr: str = info["expr"](name) if "expr" in info else m.group(0) + saved.append( + SavedAttribute( + nctype=info["nctype"](name + suffix), + expr=expr, + ) + ) + if "res" in info: + replacement: str = info["res"](name) return replacement return name + suffix @@ -595,19 +836,23 @@ def repl(m: Match[str]) -> str: # the backward function if nctype.type == OptionalCType(BaseCType(stringT)): formula = re.sub( - rf'\b{name}\b', - f'{name}.has_value() ? c10::optional({name}.value()) : c10::nullopt', - formula) + rf"\b{name}\b", + f"{name}.has_value() ? c10::optional({name}.value()) : c10::nullopt", + formula, + ) # Find any variables which remain in the formula and save them if re.search(IDENT_REGEX.format(name), formula): - saved.append(SavedAttribute( - nctype=nctype, - expr=name, - )) + saved.append( + SavedAttribute( + nctype=nctype, + expr=name, + ) + ) return formula, tuple(saved) + def _create_op_prefix(name: str) -> str: """Takes a native function name converts to a op prefix name. 
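For intuition on the REPLACEMENTS table reformatted above: each entry pairs a regex (instantiated with an argument name) with the saved attribute that replaces the matched expression in a derivative formula. Below is a minimal, self-contained sketch of just the string-rewriting step, with made-up patterns and names; the real code additionally records SavedAttribute entries with their C++ NamedCType.

import re

# Simplified stand-in for REPLACEMENTS: regex template -> suffix of the saved name.
SIMPLE_REPLACEMENTS = [
    (r"{}\.sizes\(\)", "_sizes"),  # self.sizes() -> self_sizes
    (r"{}\.numel\(\)", "_numel"),  # self.numel() -> self_numel
    (r"{}\.dim\(\)", "_dim"),      # self.dim()   -> self_dim
]

def rewrite_formula(formula: str, arg_name: str) -> str:
    # Replace save-time expressions with the names of their saved values.
    for pattern, suffix in SIMPLE_REPLACEMENTS:
        formula = re.sub(pattern.format(arg_name), arg_name + suffix, formula)
    return formula

print(rewrite_formula("grad * self.sizes()[0] / self.numel()", "self"))
# grad * self_sizes[0] / self_numel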
@@ -621,15 +866,19 @@ def _create_op_prefix(name: str) -> str: >>> _create_op_prefix('add') 'AddBackward' """ - camel_case = ''.join([p.title() for p in name.split('_')]) - return (camel_case + 'Backward').replace('ForwardBackward', 'Backward') + camel_case = "".join([p.title() for p in name.split("_")]) + return (camel_case + "Backward").replace("ForwardBackward", "Backward") def dedup_vars(vars: Sequence[SavedAttribute]) -> Sequence[SavedAttribute]: seen: Set[str] = set() saved: List[SavedAttribute] = [] for var in vars: - name = var.nctype.name.name if isinstance(var.nctype.name, SpecialArgName) else var.nctype.name + name = ( + var.nctype.name.name + if isinstance(var.nctype.name, SpecialArgName) + else var.nctype.name + ) if name in seen: continue seen.add(name) diff --git a/tools/autograd/templates/python_nn_functions.cpp b/tools/autograd/templates/python_nn_functions.cpp index 5465e6214387..13b3d47cf448 100644 --- a/tools/autograd/templates/python_nn_functions.cpp +++ b/tools/autograd/templates/python_nn_functions.cpp @@ -12,6 +12,7 @@ #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" +#include "torch/csrc/utils/tensor_memoryformats.h" #ifndef AT_PER_OPERATOR_HEADERS #include @@ -43,7 +44,7 @@ static PyObject * THPVariable__parse_to(PyObject* module, PyObject* args, PyObje ParsedArgs<5> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if (r.has_torch_function()) { - return handle_torch_function(r, args, kwargs, THPNNVariableFunctionsModule, "torch.nn"); + return handle_torch_function(r, args, kwargs, THPNNVariableFunctionsModule, "torch.nn", "_parse_to"); } auto parsed = parse_to_conversion(r, /*allow_copy*/ false); // we don't want copy for nn.Module.to auto& device = std::get<0>(parsed); @@ -66,7 +67,7 @@ static PyObject * THPVariable__parse_to(PyObject* module, PyObject* args, PyObje } PyTuple_SET_ITEM(tuple.get(), 2, torch::autograd::utils::wrap(non_blocking)); if (opt_memory_format.has_value()) { - PyTuple_SET_ITEM(tuple.get(), 3, THPMemoryFormat_New(opt_memory_format.value(), "unused_name")); + PyTuple_SET_ITEM(tuple.get(), 3, torch::utils::getTHPMemoryFormat(opt_memory_format.value()).release().ptr()); } else { Py_INCREF(Py_None); PyTuple_SET_ITEM(tuple.get(), 3, Py_None); diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index b3d6ae705c51..ad14d2c7c20c 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -231,7 +231,11 @@ static PyObject * THPVariable_numel(PyObject* self, PyObject* args) return handle_torch_function(self, "numel", args); } auto& self_ = THPVariable_Unpack(self); - return THPUtils_packInt64(self_.numel()); + if (jit::tracer::isTracing()) { + return wrap(jit::tracer::getNumelOf(self_)); + } else { + return THPUtils_packInt64(self_.numel()); + } END_HANDLE_TH_ERRORS } @@ -541,6 +545,28 @@ static PyObject * THPVariable_xpu(PyObject* self, PyObject* args, PyObject* kwar END_HANDLE_TH_ERRORS } +static PyObject * THPVariable_ipu(PyObject* self, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "ipu(Device? device=None, bool non_blocking=False, *, MemoryFormat? memory_format=None)", + "ipu(Device? device=None, bool async=False, *, MemoryFormat? 
memory_format=None)|deprecated" + }); + auto& self_ = THPVariable_Unpack(self); + ParsedArgs<3> parsed_args; + auto r = parser.parse(self, args, kwargs, parsed_args); + + if (r.has_torch_function()) { + return handle_torch_function(r, self, args, kwargs, THPVariableClass, "torch.Tensor"); + } + + auto device = r.isNone(0) ? at::Device(at::DeviceType::IPU) : r.device(0); + auto opt_memory_format = r.memoryformatOptional(2); + TORCH_CHECK(device.is_ipu(), "Invalid device, must be ipu device"); + return THPVariable_Wrap(dispatch_to(self_, device, r.toBool(1), false, opt_memory_format)); + END_HANDLE_TH_ERRORS +} + static PyObject * THPVariable_to_type(PyObject* self, ScalarType scalarType, c10::optional optional_memory_format) { HANDLE_TH_ERRORS auto& self_ = THPVariable_Unpack(self); @@ -1091,6 +1117,7 @@ static PyObject* THPVariable_set_( "set_(Storage source)", "set_(Storage source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride=None)", "set_(Tensor source)", + "set_(Tensor source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride=None)", }, /*traceable=*/false); @@ -1114,7 +1141,7 @@ static PyObject* THPVariable_set_( at::Storage storage = _r.storage(0, storage_scalar_type, is_typed_storage); TORCH_CHECK(storage_scalar_type == self.dtype() || !is_typed_storage, "Expected a Storage of type ", self.dtype(), - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); auto dispatch_set_ = [](const Tensor& self, Storage source) -> Tensor { pybind11::gil_scoped_release no_gil; @@ -1130,7 +1157,7 @@ static PyObject* THPVariable_set_( at::Storage storage = _r.storage(0, storage_scalar_type, is_typed_storage); TORCH_CHECK(storage_scalar_type == self.dtype() || !is_typed_storage, "Expected a Storage of type ", self.dtype(), - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); auto dispatch_set_ = [](const Tensor& self, Storage source, @@ -1152,6 +1179,21 @@ static PyObject* THPVariable_set_( }; return wrap(dispatch_set_(self, _r.tensor(0))); } + case 4: { + // aten::set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor + // source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) 
+ at::Tensor storage = _r.tensor(0); + auto dispatch_set_ = [](const Tensor& self, + const Tensor& source, + int64_t storage_offset, + IntArrayRef size, + IntArrayRef stride) -> Tensor { + pybind11::gil_scoped_release no_gil; + return self.set_(source, storage_offset, size, stride); + }; + return wrap(dispatch_set_( + self, storage, _r.toInt64(1), _r.intlist(2), _r.intlist(3))); + } } Py_RETURN_NONE; END_HANDLE_TH_ERRORS @@ -1205,6 +1247,7 @@ PyMethodDef variable_methods[] = { {"cpu", castPyCFunctionWithKeywords(THPVariable_cpu), METH_VARARGS | METH_KEYWORDS, NULL}, {"cuda", castPyCFunctionWithKeywords(THPVariable_cuda), METH_VARARGS | METH_KEYWORDS, NULL}, {"xpu", castPyCFunctionWithKeywords(THPVariable_xpu), METH_VARARGS | METH_KEYWORDS, NULL}, + {"ipu", castPyCFunctionWithKeywords(THPVariable_ipu), METH_VARARGS | METH_KEYWORDS, NULL}, {"data_ptr", THPVariable_data_ptr, METH_NOARGS, NULL}, {"dim", THPVariable_dim, METH_NOARGS, NULL}, {"has_names", THPVariable_has_names, METH_NOARGS, NULL}, diff --git a/tools/bazel.bzl b/tools/bazel.bzl index b932b812c322..75216430b2e4 100644 --- a/tools/bazel.bzl +++ b/tools/bazel.bzl @@ -1,16 +1,39 @@ -load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") load("@rules_cuda//cuda:defs.bzl", "requires_cuda_enabled") load("//c10/macros:cmake_configure_file.bzl", "cmake_configure_file") +load("//tools/config:defs.bzl", "if_cuda") + +def _genrule(**kwds): + if _enabled(**kwds): + native.genrule(**kwds) + +def _py_library(name, **kwds): + deps = [dep for dep in kwds.pop("deps", []) if dep != None] + native.py_library(name = name, deps = deps, **kwds) + +def _requirement(_pypi_project): + return None # Rules implementation for the Bazel build system. Since the common # build structure aims to replicate Bazel as much as possible, most of # the rules simply forward to the Bazel definitions. rules = struct( + cc_binary = cc_binary, cc_library = cc_library, cc_test = cc_test, cmake_configure_file = cmake_configure_file, filegroup = native.filegroup, + genrule = _genrule, glob = native.glob, + if_cuda = if_cuda, + py_binary = native.py_binary, + py_library = _py_library, + requirement = _requirement, requires_cuda_enabled = requires_cuda_enabled, select = select, + test_suite = native.test_suite, ) + +def _enabled(tags = [], **_kwds): + """Determines if the target is enabled.""" + return "-bazel" not in tags diff --git a/tools/build_defs/fb_xplat_genrule.bzl b/tools/build_defs/fb_xplat_genrule.bzl new file mode 100644 index 000000000000..ddc19b2373e9 --- /dev/null +++ b/tools/build_defs/fb_xplat_genrule.bzl @@ -0,0 +1,5 @@ +def fb_xplat_genrule(default_outs = ["."], **kwargs): + genrule( + # default_outs=default_outs, # only needed for internal BUCK + **kwargs + ) diff --git a/tools/build_defs/glob_defs.bzl b/tools/build_defs/glob_defs.bzl new file mode 100644 index 000000000000..a0eea247e839 --- /dev/null +++ b/tools/build_defs/glob_defs.bzl @@ -0,0 +1,89 @@ +"""Provides utility macros for working with globs.""" + +load("@bazel_skylib//lib:paths.bzl", "paths") + +def subdir_glob(glob_specs, exclude = None, prefix = ""): + """Returns a dict of sub-directory relative paths to full paths. + + The subdir_glob() function is useful for defining header maps for C/C++ + libraries which should be relative the given sub-directory. + Given a list of tuples, the form of (relative-sub-directory, glob-pattern), + it returns a dict of sub-directory relative paths to full paths. 
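The subdir_glob docstring continues below; as a concrete illustration of the mapping it describes, here is a pure-Python sketch of the key computation in _single_subdir_glob. native.glob() is Bazel-only, so the matched file list and names here are made up.

from os import path

matched = ["include/foo/a.h", "include/foo/b.h"]  # pretend native.glob() result
dirpath, prefix = "include", "mylib"

mapping = {}
for f in matched:
    key = f[len(dirpath) + 1:] if dirpath else f  # strip the "include/" prefix
    if prefix:
        key = path.join(prefix, key)              # prepend "mylib/"
    mapping[key] = f

print(mapping)
# {'mylib/foo/a.h': 'include/foo/a.h', 'mylib/foo/b.h': 'include/foo/b.h'}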
+ + Please refer to native.glob() for explanations and examples of the pattern. + + Args: + glob_specs: The array of tuples in form of + (relative-sub-directory, glob-pattern inside relative-sub-directory). + type: List[Tuple[str, str]] + exclude: A list of patterns to identify files that should be removed + from the set specified by the first argument. Defaults to []. + type: Optional[List[str]] + prefix: If is not None, prepends it to each key in the dictionary. + Defaults to None. + type: Optional[str] + + Returns: + A dict of sub-directory relative paths to full paths. + """ + if exclude == None: + exclude = [] + + results = [] + + for dirpath, glob_pattern in glob_specs: + results.append( + _single_subdir_glob(dirpath, glob_pattern, exclude, prefix), + ) + + return _merge_maps(*results) + +def _merge_maps(*file_maps): + result = {} + for file_map in file_maps: + for key in file_map: + if key in result and result[key] != file_map[key]: + fail( + "Conflicting files in file search paths. " + + "\"%s\" maps to both \"%s\" and \"%s\"." % + (key, result[key], file_map[key]), + ) + + result[key] = file_map[key] + + return result + +def _single_subdir_glob(dirpath, glob_pattern, exclude = None, prefix = None): + if exclude == None: + exclude = [] + results = {} + files = native.glob([paths.join(dirpath, glob_pattern)], exclude = exclude) + for f in files: + if dirpath: + key = f[len(dirpath) + 1:] + else: + key = f + if prefix: + key = paths.join(prefix, key) + results[key] = f + + return results + +# Using a flat list will trigger build errors on Android. +# cxx_library will generate an apple_library on iOS, a cxx_library on Android. +# Those rules have different behaviors. Using a map will make the behavior consistent. +# +def glob_private_headers(glob_patterns, exclude = []): + result = {} + headers = native.glob(glob_patterns, exclude = exclude) + for header in headers: + result[paths.basename(header)] = header + return result + +def glob(include, exclude = (), **kwargs): + buildfile = native.read_config("buildfile", "name", "BUCK") + subpkgs = [ + target[:-len(buildfile)] + "**/*" + for target in native.glob(["*/**/" + buildfile]) + ] + return native.glob(include, exclude = list(exclude) + subpkgs, **kwargs) diff --git a/tools/build_defs/type_defs.bzl b/tools/build_defs/type_defs.bzl new file mode 100644 index 000000000000..afc02702e8ad --- /dev/null +++ b/tools/build_defs/type_defs.bzl @@ -0,0 +1,128 @@ +"""Provides macros for queries type information.""" + +_SELECT_TYPE = type(select({"DEFAULT": []})) + +def is_select(thing): + return type(thing) == _SELECT_TYPE + +def is_unicode(arg): + """Checks if provided instance has a unicode type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for unicode instances, False otherwise. rtype: bool + """ + return hasattr(arg, "encode") + +_STRING_TYPE = type("") + +def is_string(arg): + """Checks if provided instance has a string type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for string instances, False otherwise. rtype: bool + """ + return type(arg) == _STRING_TYPE + +_LIST_TYPE = type([]) + +def is_list(arg): + """Checks if provided instance has a list type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for list instances, False otherwise. rtype: bool + """ + return type(arg) == _LIST_TYPE + +_DICT_TYPE = type({}) + +def is_dict(arg): + """Checks if provided instance has a dict type. + + Args: + arg: An instance to check. 
type: Any + + Returns: + True for dict instances, False otherwise. rtype: bool + """ + return type(arg) == _DICT_TYPE + +_TUPLE_TYPE = type(()) + +def is_tuple(arg): + """Checks if provided instance has a tuple type. + + Args: + arg: An instance to check. type: Any + + Returns: + True for tuple instances, False otherwise. rtype: bool + """ + return type(arg) == _TUPLE_TYPE + +def is_collection(arg): + """Checks if provided instance is a collection subtype. + + This will either be a dict, list, or tuple. + """ + return is_dict(arg) or is_list(arg) or is_tuple(arg) + +_BOOL_TYPE = type(True) + +def is_bool(arg): + """Checks if provided instance is a boolean value. + + Args: + arg: An instance ot check. type: Any + + Returns: + True for boolean values, False otherwise. rtype: bool + """ + return type(arg) == _BOOL_TYPE + +_NUMBER_TYPE = type(1) + +def is_number(arg): + """Checks if provided instance is a number value. + + Args: + arg: An instance ot check. type: Any + + Returns: + True for number values, False otherwise. rtype: bool + """ + return type(arg) == _NUMBER_TYPE + +_STRUCT_TYPE = type(struct()) # Starlark returns the same type for all structs + +def is_struct(arg): + """Checks if provided instance is a struct value. + + Args: + arg: An instance ot check. type: Any + + Returns: + True for struct values, False otherwise. rtype: bool + """ + return type(arg) == _STRUCT_TYPE + +type_utils = struct( + is_bool = is_bool, + is_number = is_number, + is_string = is_string, + is_unicode = is_unicode, + is_list = is_list, + is_dict = is_dict, + is_tuple = is_tuple, + is_collection = is_collection, + is_select = is_select, + is_struct = is_struct, +) diff --git a/tools/build_libtorch.py b/tools/build_libtorch.py index c263e5084f78..c5508773f643 100644 --- a/tools/build_libtorch.py +++ b/tools/build_libtorch.py @@ -11,13 +11,22 @@ from tools.build_pytorch_libs import build_caffe2 from tools.setup_helpers.cmake import CMake -if __name__ == '__main__': +if __name__ == "__main__": # Placeholder for future interface. For now just gives a nice -h. - parser = argparse.ArgumentParser(description='Build libtorch') - parser.add_argument('--rerun-cmake', action="store_true", help='rerun cmake') - parser.add_argument('--cmake-only', action="store_true", - help='Stop once cmake terminates. Leave users a chance to adjust build options') + parser = argparse.ArgumentParser(description="Build libtorch") + parser.add_argument("--rerun-cmake", action="store_true", help="rerun cmake") + parser.add_argument( + "--cmake-only", + action="store_true", + help="Stop once cmake terminates. 
Leave users a chance to adjust build options", + ) options = parser.parse_args() - build_caffe2(version=None, cmake_python_library=None, build_python=False, - rerun_cmake=options.rerun_cmake, cmake_only=options.cmake_only, cmake=CMake()) + build_caffe2( + version=None, + cmake_python_library=None, + build_python=False, + rerun_cmake=options.rerun_cmake, + cmake_only=options.cmake_only, + cmake=CMake(), + ) diff --git a/tools/build_pytorch_libs.py b/tools/build_pytorch_libs.py index d795770c8844..eba8ea1dcf66 100644 --- a/tools/build_pytorch_libs.py +++ b/tools/build_pytorch_libs.py @@ -1,4 +1,5 @@ import os +import platform from glob import glob import shutil from typing import Dict, Optional @@ -8,8 +9,30 @@ from setuptools import distutils # type: ignore[import] + def _overlay_windows_vcvars(env: Dict[str, str]) -> Dict[str, str]: - vc_arch = 'x64' if IS_64BIT else 'x86' + vc_arch = "x64" if IS_64BIT else "x86" + + if platform.machine() == "ARM64": + vc_arch = "x64_arm64" + + # First Win11 Windows on Arm build version that supports x64 emulation + # is 10.0.22000. + win11_1st_version = (10, 0, 22000) + current_win_version = tuple( + int(version_part) for version_part in platform.version().split(".") + ) + if current_win_version < win11_1st_version: + vc_arch = "x86_arm64" + print( + "Warning: 32-bit toolchain will be used, but 64-bit linker " + "is recommended to avoid out-of-memory linker error!" + ) + print( + "Warning: Please consider upgrading to Win11, where x64 " + "emulation is enabled!" + ) + vc_env: Dict[str, str] = distutils._msvccompiler._get_vc_env(vc_arch) # Keys in `_get_vc_env` are always lowercase. # We turn them into uppercase before overlaying vcvars @@ -29,19 +52,21 @@ def _create_build_env() -> Dict[str, str]: # you should NEVER add something to this list. It is bad practice to # have cmake read the environment my_env = os.environ.copy() - if 'CUDA_HOME' in my_env: # Keep CUDA_HOME. This env variable is still used in other part. - my_env['CUDA_BIN_PATH'] = my_env['CUDA_HOME'] + if ( + "CUDA_HOME" in my_env + ): # Keep CUDA_HOME. This env variable is still used in other part. + my_env["CUDA_BIN_PATH"] = my_env["CUDA_HOME"] elif IS_WINDOWS: # we should eventually make this as part of FindCUDA. - cuda_win = glob('C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*') + cuda_win = glob("C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*") if len(cuda_win) > 0: - my_env['CUDA_BIN_PATH'] = cuda_win[0] + my_env["CUDA_BIN_PATH"] = cuda_win[0] if IS_WINDOWS and USE_NINJA: # When using Ninja under Windows, the gcc toolchain will be chosen as # default. But it should be set to MSVC as the user's first choice. 
my_env = _overlay_windows_vcvars(my_env) - my_env.setdefault('CC', 'cl') - my_env.setdefault('CXX', 'cl') + my_env.setdefault("CC", "cl") + my_env.setdefault("CXX", "cl") return my_env @@ -54,18 +79,15 @@ def build_caffe2( cmake: CMake, ) -> None: my_env = _create_build_env() - build_test = not check_negative_env_flag('BUILD_TEST') - cmake.generate(version, - cmake_python_library, - build_python, - build_test, - my_env, - rerun_cmake) + build_test = not check_negative_env_flag("BUILD_TEST") + cmake.generate( + version, cmake_python_library, build_python, build_test, my_env, rerun_cmake + ) if cmake_only: return cmake.build(my_env) if build_python: - caffe2_proto_dir = os.path.join(cmake.build_dir, 'caffe2', 'proto') - for proto_file in glob(os.path.join(caffe2_proto_dir, '*.py')): - if proto_file != os.path.join(caffe2_proto_dir, '__init__.py'): - shutil.copy(proto_file, os.path.join('caffe2', 'proto')) + caffe2_proto_dir = os.path.join(cmake.build_dir, "caffe2", "proto") + for proto_file in glob(os.path.join(caffe2_proto_dir, "*.py")): + if proto_file != os.path.join(caffe2_proto_dir, "__init__.py"): + shutil.copy(proto_file, os.path.join("caffe2", "proto")) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 21cef9716924..52e8dd25f5a6 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -1,51 +1,41 @@ -# In both open-source and fbcode builds, these are generated into -# torch/csrc/{autgrad,jit}/generated.i -GENERATED_CPP = [ - "autograd/generated/Functions.cpp", - "autograd/generated/VariableType_0.cpp", - "autograd/generated/VariableType_1.cpp", - "autograd/generated/VariableType_2.cpp", - "autograd/generated/VariableType_3.cpp", - "autograd/generated/VariableType_4.cpp", - "autograd/generated/TraceType_0.cpp", - "autograd/generated/TraceType_1.cpp", - "autograd/generated/TraceType_2.cpp", - "autograd/generated/TraceType_3.cpp", - "autograd/generated/TraceType_4.cpp", - "autograd/generated/ADInplaceOrViewType_0.cpp", - "autograd/generated/ADInplaceOrViewType_1.cpp", - "autograd/generated/python_functions_0.cpp", - "autograd/generated/python_functions_1.cpp", - "autograd/generated/python_functions_2.cpp", - "autograd/generated/python_functions_3.cpp", - "autograd/generated/python_functions_4.cpp", - "autograd/generated/python_nn_functions.cpp", - "autograd/generated/python_fft_functions.cpp", - "autograd/generated/python_linalg_functions.cpp", - "autograd/generated/python_return_types.cpp", - "autograd/generated/python_sparse_functions.cpp", - "autograd/generated/python_special_functions.cpp", - "autograd/generated/python_torch_functions_0.cpp", - "autograd/generated/python_torch_functions_1.cpp", - "autograd/generated/python_torch_functions_2.cpp", - "autograd/generated/python_variable_methods.cpp", +# WARNING: the contents of this file must BOTH be valid Starlark (for Buck and + +# Bazel) as well as valid Python (for our cmake build). This means that +# load() directives are not allowed (as they are not recognized by Python). +# If you want to fix this, figure out how run this file from cmake with a proper +# Starlark interpreter as part of the default OSS build process. 
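The warning comment above (continued below) is the key constraint on build_variables.bzl: it must parse both as Starlark and as plain Python. One way the Python/CMake side can then consume it is a bare exec(); this is only a sketch of why the constraint matters, not necessarily the exact mechanism PyTorch's build uses, and it assumes the working directory is the repository root.

# Load the source lists without a Starlark interpreter. This only works
# because the file avoids load() and other Bazel-only top-level constructs.
scope = {}
with open("tools/build_variables.bzl") as f:
    exec(f.read(), scope)
print(len(scope["libtorch_core_sources"]))  # number of core libtorch sources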
If you need +# some nontrivial Starlark features, make a separate bzl file (remember that + +# bzl files are not exported via ShipIt by default, so you may also need to +# update PyTorch's ShipIt config) + +# This is duplicated in caffe2/CMakeLists.txt for now and not yet used in buck +GENERATED_LAZY_TS_CPP = [ + "lazy/generated/LazyNativeFunctions.cpp", + "lazy/generated/RegisterAutogradLazy.cpp", + "lazy/generated/RegisterLazy.cpp", ] # NVFuser runtime library libtorch_nvfuser_runtime_sources = [ + "torch/csrc/jit/codegen/cuda/runtime/array.cu", "torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu", "torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu", "torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu", "torch/csrc/jit/codegen/cuda/runtime/block_sync_default.cu", "torch/csrc/jit/codegen/cuda/runtime/broadcast.cu", "torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu", + "torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu", "torch/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu", "torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu", "torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu", "torch/csrc/jit/codegen/cuda/runtime/helpers.cu", "torch/csrc/jit/codegen/cuda/runtime/index_utils.cu", + "torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu", "torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu", "torch/csrc/jit/codegen/cuda/runtime/tensor.cu", + "torch/csrc/jit/codegen/cuda/runtime/tuple.cu", + "torch/csrc/jit/codegen/cuda/runtime/type_traits.cu", "torch/csrc/jit/codegen/cuda/runtime/welford.cu", "torch/csrc/jit/codegen/cuda/runtime/warp.cu", "aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh", @@ -56,19 +46,19 @@ libtorch_nvfuser_generated_headers = ["{}.h".format(name.split("/")[-1].split(". def libtorch_generated_sources(gencode_pattern): return [gencode_pattern.format(name) for name in [ - "autograd/generated/Functions.cpp", - "autograd/generated/VariableType_0.cpp", - "autograd/generated/VariableType_1.cpp", - "autograd/generated/VariableType_2.cpp", - "autograd/generated/VariableType_3.cpp", - "autograd/generated/VariableType_4.cpp", - "autograd/generated/TraceType_0.cpp", - "autograd/generated/TraceType_1.cpp", - "autograd/generated/TraceType_2.cpp", - "autograd/generated/TraceType_3.cpp", - "autograd/generated/TraceType_4.cpp", - "autograd/generated/ADInplaceOrViewType_0.cpp", - "autograd/generated/ADInplaceOrViewType_1.cpp", + "torch/csrc/autograd/generated/Functions.cpp", + "torch/csrc/autograd/generated/VariableType_0.cpp", + "torch/csrc/autograd/generated/VariableType_1.cpp", + "torch/csrc/autograd/generated/VariableType_2.cpp", + "torch/csrc/autograd/generated/VariableType_3.cpp", + "torch/csrc/autograd/generated/VariableType_4.cpp", + "torch/csrc/autograd/generated/TraceType_0.cpp", + "torch/csrc/autograd/generated/TraceType_1.cpp", + "torch/csrc/autograd/generated/TraceType_2.cpp", + "torch/csrc/autograd/generated/TraceType_3.cpp", + "torch/csrc/autograd/generated/TraceType_4.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_0.cpp", + "torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp", ]] # copied from https://github.com/pytorch/pytorch/blob/f99a693cd9ff7a9b5fdc71357dac66b8192786d3/aten/src/ATen/core/CMakeLists.txt @@ -137,8 +127,10 @@ libtorch_profiler_sources = [ "torch/csrc/autograd/profiler_legacy.cpp", "torch/csrc/autograd/profiler_kineto.cpp", "torch/csrc/profiler/api.cpp", + "torch/csrc/profiler/collection.cpp", "torch/csrc/profiler/kineto_shim.cpp", "torch/csrc/profiler/nvtx_observer.cpp", + 
"torch/csrc/profiler/kineto_client_interface.cpp", "torch/csrc/monitor/counters.cpp", "torch/csrc/monitor/events.cpp", ] @@ -213,20 +205,22 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/operator_upgraders/utils.cpp", "torch/csrc/jit/operator_upgraders/upgraders.cpp", "torch/csrc/jit/operator_upgraders/upgraders_entry.cpp", + "torch/csrc/jit/passes/add_if_then_else.cpp", "torch/csrc/jit/passes/annotate_warns.cpp", "torch/csrc/jit/passes/bailout_graph.cpp", + "torch/csrc/jit/passes/check_strict_fusion.cpp", "torch/csrc/jit/passes/batch_mm.cpp", "torch/csrc/jit/passes/canonicalize.cpp", "torch/csrc/jit/passes/canonicalize_graph_fuser_ops.cpp", "torch/csrc/jit/passes/clear_profiling.cpp", "torch/csrc/jit/passes/clear_undefinedness.cpp", "torch/csrc/jit/passes/common_subexpression_elimination.cpp", - "torch/csrc/jit/passes/common_expression_hoisting.cpp", "torch/csrc/jit/passes/concat_opt.cpp", "torch/csrc/jit/passes/constant_pooling.cpp", "torch/csrc/jit/passes/constant_propagation.cpp", "torch/csrc/jit/passes/restore_mutation.cpp", "torch/csrc/jit/passes/create_autodiff_subgraphs.cpp", + "torch/csrc/jit/passes/cuda_graph_fuser.cpp", "torch/csrc/jit/passes/dead_code_elimination.cpp", "torch/csrc/jit/passes/eliminate_no_ops.cpp", "torch/csrc/jit/passes/remove_redundant_profiles.cpp", @@ -263,9 +257,11 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/passes/peephole.cpp", "torch/csrc/jit/passes/peephole_non_tensor.cpp", "torch/csrc/jit/passes/create_functional_graphs.cpp", + "torch/csrc/jit/passes/refine_tuple_types.cpp", "torch/csrc/jit/passes/remove_mutation.cpp", "torch/csrc/jit/passes/prepack_folding.cpp", "torch/csrc/jit/passes/fold_conv_bn.cpp", + "torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.cpp", "torch/csrc/jit/passes/frozen_concat_linear.cpp", "torch/csrc/jit/passes/frozen_conv_add_relu_fusion.cpp", "torch/csrc/jit/passes/frozen_conv_folding.cpp", @@ -279,6 +275,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/passes/integer_value_refinement.cpp", "torch/csrc/jit/passes/replacement_of_old_operators.cpp", "torch/csrc/jit/passes/symbolic_shape_analysis.cpp", + "torch/csrc/jit/passes/symbolic_shape_cache.cpp", "torch/csrc/jit/passes/symbolic_shape_runtime_fusion.cpp", "torch/csrc/jit/passes/specialize_autogradzero.cpp", "torch/csrc/jit/passes/update_differentiable_graph_requires_grad.cpp", @@ -307,11 +304,15 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/runtime/interpreter/preprocess_graph.cpp", "torch/csrc/jit/runtime/interpreter.cpp", "torch/csrc/jit/runtime/logging.cpp", + "torch/csrc/jit/runtime/simple_graph_executor_impl.cpp", "torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp", "torch/csrc/jit/runtime/profiling_record.cpp", "torch/csrc/jit/runtime/script_profile.cpp", "torch/csrc/jit/runtime/symbolic_script.cpp", "torch/csrc/jit/runtime/symbolic_shape_registry.cpp", + "torch/csrc/jit/runtime/decomposition_registry.cpp", + "torch/csrc/jit/runtime/decomposition_registry_util.cpp", + "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp", "torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp", "torch/csrc/jit/runtime/jit_trace.cpp", "torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp", @@ -328,6 +329,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/tensorexpr/cpp_codegen.cpp", "torch/csrc/jit/tensorexpr/eval.cpp", "torch/csrc/jit/tensorexpr/expr.cpp", + "torch/csrc/jit/tensorexpr/external_functions_core.cpp", 
"torch/csrc/jit/tensorexpr/external_functions_registry.cpp", "torch/csrc/jit/tensorexpr/graph_opt.cpp", "torch/csrc/jit/tensorexpr/hash_provider.cpp", @@ -361,6 +363,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/tensorexpr/unique_name_manager.cpp", "torch/csrc/jit/testing/file_check.cpp", "torch/csrc/jit/testing/hooks_for_testing.cpp", + "torch/csrc/utils/cpp_stacktraces.cpp", "torch/csrc/utils/tensor_flatten.cpp", "torch/csrc/utils/variadic.cpp", ] @@ -372,6 +375,7 @@ core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/runtime/static/fusion.cpp", + "torch/csrc/jit/runtime/static/generated_ops.cpp", "torch/csrc/jit/runtime/static/impl.cpp", "torch/csrc/jit/runtime/static/memory_planner.cpp", "torch/csrc/jit/runtime/static/native_ops.cpp", @@ -387,6 +391,7 @@ lazy_tensor_core_sources = [ "torch/csrc/lazy/backend/backend_interface.cpp", "torch/csrc/lazy/backend/lowering_context.cpp", "torch/csrc/lazy/core/config.cpp", + "torch/csrc/lazy/core/debug_util.cpp", "torch/csrc/lazy/core/hash.cpp", "torch/csrc/lazy/core/helpers.cpp", "torch/csrc/lazy/core/ir.cpp", @@ -397,33 +402,56 @@ lazy_tensor_core_sources = [ "torch/csrc/lazy/core/lazy_view.cpp", "torch/csrc/lazy/core/metrics.cpp", "torch/csrc/lazy/core/multi_wait.cpp", + "torch/csrc/lazy/core/ops/arithmetic_ir_ops.cpp", + "torch/csrc/lazy/core/ops/utils.cpp", "torch/csrc/lazy/core/permutation_util.cpp", "torch/csrc/lazy/core/shape.cpp", + "torch/csrc/lazy/core/shape_inference.cpp", "torch/csrc/lazy/core/tensor.cpp", "torch/csrc/lazy/core/tensor_impl.cpp", "torch/csrc/lazy/core/tensor_util.cpp", "torch/csrc/lazy/core/thread_pool.cpp", - "torch/csrc/lazy/core/view_ops/as_strided.cpp", - "torch/csrc/lazy/core/view_ops/as_strided_view_update.cpp", - "torch/csrc/lazy/core/view_ops/diagonal.cpp", - "torch/csrc/lazy/core/view_ops/diagonal_view_update.cpp", - "torch/csrc/lazy/core/view_ops/narrow.cpp", - "torch/csrc/lazy/core/view_ops/narrow_view_update.cpp", - "torch/csrc/lazy/core/view_ops/permute.cpp", - "torch/csrc/lazy/core/view_ops/resize.cpp", - "torch/csrc/lazy/core/view_ops/select.cpp", - "torch/csrc/lazy/core/view_ops/squeeze.cpp", - "torch/csrc/lazy/core/view_ops/unsqueeze.cpp", - "torch/csrc/lazy/core/view_ops/select_view_update.cpp", - "torch/csrc/lazy/core/view_ops/view.cpp", + "torch/csrc/lazy/core/trie.cpp", +] + +# We can't build all of the ts backend under certain build configurations, e.g. 
mobile, +# since it depends on things like autograd, meta functions, which may be disabled +lazy_tensor_ts_sources = [ "torch/csrc/lazy/ts_backend/config.cpp", - "torch/csrc/lazy/ts_backend/ops/arithmetic_ir_ops.cpp", + "torch/csrc/lazy/ts_backend/dynamic_ir.cpp", + "torch/csrc/lazy/ts_backend/ops/batch_norm_ops.cpp", + "torch/csrc/lazy/ts_backend/ops/random_ops.cpp", "torch/csrc/lazy/ts_backend/ops/cast.cpp", "torch/csrc/lazy/ts_backend/ops/device_data.cpp", "torch/csrc/lazy/ts_backend/ops/expand.cpp", "torch/csrc/lazy/ts_backend/ops/generic.cpp", "torch/csrc/lazy/ts_backend/ops/scalar.cpp", + "torch/csrc/lazy/ts_backend/view_ops/as_strided.cpp", + "torch/csrc/lazy/ts_backend/view_ops/as_strided_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/diagonal.cpp", + "torch/csrc/lazy/ts_backend/view_ops/diagonal_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/narrow.cpp", + "torch/csrc/lazy/ts_backend/view_ops/narrow_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/permute.cpp", + "torch/csrc/lazy/ts_backend/view_ops/resize.cpp", + "torch/csrc/lazy/ts_backend/view_ops/select.cpp", + "torch/csrc/lazy/ts_backend/view_ops/squeeze.cpp", + "torch/csrc/lazy/ts_backend/view_ops/unsqueeze.cpp", + "torch/csrc/lazy/ts_backend/view_ops/select_view_update.cpp", + "torch/csrc/lazy/ts_backend/view_ops/view.cpp", "torch/csrc/lazy/ts_backend/ts_node.cpp", + "torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp", + "torch/csrc/lazy/ts_backend/ts_autograd_functions.cpp", + "torch/csrc/lazy/ts_backend/ts_backend_impl.cpp", + "torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp", + "torch/csrc/lazy/ts_backend/ts_lowering_context.cpp", + "torch/csrc/lazy/ts_backend/ts_native_functions.cpp", + "torch/csrc/lazy/ts_backend/ts_node_lowering.cpp", +] + +lazy_tensor_core_python_sources = [ + "torch/csrc/lazy/python/init.cpp", + "torch/csrc/lazy/python/python_util.cpp", ] libtorch_core_sources = sorted( @@ -449,9 +477,12 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/Utils.cpp", "torch/csrc/distributed/c10d/comm.cpp", + "torch/csrc/distributed/c10d/debug.cpp", "torch/csrc/distributed/c10d/default_comm_hooks.cpp", "torch/csrc/distributed/c10d/exception.cpp", "torch/csrc/distributed/c10d/logger.cpp", + "torch/csrc/distributed/c10d/logging.cpp", + "torch/csrc/distributed/c10d/quantization/quantization.cpp", "torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/distributed/c10d/sequence_num.cpp", "torch/csrc/distributed/c10d/socket.cpp", @@ -604,7 +635,18 @@ libtorch_extra_sources = libtorch_core_jit_sources + [ ] def libtorch_sources(gencode_pattern = ":generate-code[{}]"): - return libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources + enable_flatbuffer = bool(native.read_config("fbcode", "caffe2_enable_flatbuffer", None)) + flatbuffer_serializer_sources = [ + "torch/csrc/jit/serialization/flatbuffer_serializer.cpp", + "torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp", + ] + if enable_flatbuffer: + return ( + libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources + + flatbuffer_serializer_sources + ) + else: + return libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources libtorch_cuda_core_sources = [ "torch/csrc/CudaIPCTypes.cpp", @@ -616,6 +658,7 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/compute_at.cpp", 
"torch/csrc/jit/codegen/cuda/compute_at_map.cpp", "torch/csrc/jit/codegen/cuda/codegen.cpp", + "torch/csrc/jit/codegen/cuda/contiguity.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", @@ -625,11 +668,14 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/executor_utils.cpp", "torch/csrc/jit/codegen/cuda/fusion.cpp", "torch/csrc/jit/codegen/cuda/graph_fuser.cpp", + "torch/csrc/jit/codegen/cuda/grouped_reduction.cpp", "torch/csrc/jit/codegen/cuda/index_compute.cpp", "torch/csrc/jit/codegen/cuda/index_reference_replay.cpp", "torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", + "torch/csrc/jit/codegen/cuda/ir_builder.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", + "torch/csrc/jit/codegen/cuda/ir_container.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", "torch/csrc/jit/codegen/cuda/ir_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_iostream.cpp", @@ -639,28 +685,36 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", - "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", - "torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp", "torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp", - "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp", "torch/csrc/jit/codegen/cuda/lower_allocation.cpp", + "torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp", "torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp", + "torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp", + "torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", + "torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp", "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", "torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp", "torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp", "torch/csrc/jit/codegen/cuda/lower_predicate.cpp", + "torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp", + "torch/csrc/jit/codegen/cuda/lower_replace_size.cpp", "torch/csrc/jit/codegen/cuda/lower_shift.cpp", + "torch/csrc/jit/codegen/cuda/lower_sync_information.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", + "torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp", "torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", "torch/csrc/jit/codegen/cuda/lower_validation.cpp", + "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp", "torch/csrc/jit/codegen/cuda/lower2device.cpp", "torch/csrc/jit/codegen/cuda/manager.cpp", "torch/csrc/jit/codegen/cuda/mutator.cpp", "torch/csrc/jit/codegen/cuda/non_divisible_split.cpp", + "torch/csrc/jit/codegen/cuda/ops/alias.cpp", "torch/csrc/jit/codegen/cuda/ops/composite.cpp", "torch/csrc/jit/codegen/cuda/ops/normalization.cpp", "torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp", @@ -687,6 +741,8 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/transform_view.cpp", "torch/csrc/jit/codegen/cuda/type.cpp", "torch/csrc/jit/codegen/cuda/utils.cpp", + "torch/csrc/jit/codegen/cuda/mma_type.cpp", + "torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp", "torch/csrc/jit/passes/frozen_conv_add_relu_fusion_cuda.cpp", "torch/csrc/jit/tensorexpr/cuda_codegen.cpp", 
"torch/csrc/jit/runtime/register_cuda_ops.cpp", @@ -770,7 +826,6 @@ torch_cpp_srcs = [ "torch/csrc/api/src/optim/schedulers/step_lr.cpp", "torch/csrc/api/src/serialize/input-archive.cpp", "torch/csrc/api/src/serialize/output-archive.cpp", - "torch/csrc/utils/crash_handler.cpp", ] libtorch_python_cuda_core_sources = [ @@ -814,7 +869,6 @@ libtorch_python_core_sources = [ "torch/csrc/autograd/profiler_python.cpp", "torch/csrc/autograd/python_anomaly_mode.cpp", "torch/csrc/autograd/python_saved_variable_hooks.cpp", - "torch/csrc/autograd/python_mode.cpp", "torch/csrc/autograd/python_cpp_function.cpp", "torch/csrc/autograd/python_engine.cpp", "torch/csrc/autograd/python_function.cpp", @@ -824,9 +878,11 @@ libtorch_python_core_sources = [ "torch/csrc/autograd/python_variable.cpp", "torch/csrc/autograd/python_variable_indexing.cpp", "torch/csrc/jit/backends/backend_init.cpp", + "torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp", "torch/csrc/jit/python/init.cpp", "torch/csrc/jit/passes/onnx.cpp", "torch/csrc/jit/passes/onnx/cast_all_constant_to_floating.cpp", + "torch/csrc/jit/passes/onnx/deduplicate_initializers.cpp", "torch/csrc/jit/passes/onnx/eval_peephole.cpp", "torch/csrc/jit/passes/onnx/constant_fold.cpp", "torch/csrc/jit/passes/onnx/constant_map.cpp", @@ -843,6 +899,7 @@ libtorch_python_core_sources = [ "torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp", "torch/csrc/jit/passes/onnx/shape_type_inference.cpp", "torch/csrc/jit/passes/onnx/function_extraction.cpp", + "torch/csrc/jit/passes/onnx/onnx_log.cpp", "torch/csrc/jit/python/pybind_utils.cpp", "torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp", "torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp", @@ -885,12 +942,11 @@ libtorch_python_core_sources = [ "torch/csrc/utils/tensor_numpy.cpp", "torch/csrc/utils/tensor_types.cpp", "torch/csrc/utils/disable_torch_function.cpp", -] +] + lazy_tensor_core_python_sources libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/python_comm_hook.cpp", - "torch/csrc/distributed/c10d/quantization/quantization.cpp", ] libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ @@ -908,21 +964,21 @@ libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): _libtorch_python_sources = [gencode_pattern.format(name) for name in [ - "autograd/generated/python_functions_0.cpp", - "autograd/generated/python_functions_1.cpp", - "autograd/generated/python_functions_2.cpp", - "autograd/generated/python_functions_3.cpp", - "autograd/generated/python_functions_4.cpp", - "autograd/generated/python_nn_functions.cpp", - "autograd/generated/python_fft_functions.cpp", - "autograd/generated/python_linalg_functions.cpp", - "autograd/generated/python_return_types.cpp", - "autograd/generated/python_sparse_functions.cpp", - "autograd/generated/python_special_functions.cpp", - "autograd/generated/python_torch_functions_0.cpp", - "autograd/generated/python_torch_functions_1.cpp", - "autograd/generated/python_torch_functions_2.cpp", - "autograd/generated/python_variable_methods.cpp", + "torch/csrc/autograd/generated/python_functions_0.cpp", + "torch/csrc/autograd/generated/python_functions_1.cpp", + "torch/csrc/autograd/generated/python_functions_2.cpp", + "torch/csrc/autograd/generated/python_functions_3.cpp", + "torch/csrc/autograd/generated/python_functions_4.cpp", + 
"torch/csrc/autograd/generated/python_nn_functions.cpp", + "torch/csrc/autograd/generated/python_fft_functions.cpp", + "torch/csrc/autograd/generated/python_linalg_functions.cpp", + "torch/csrc/autograd/generated/python_return_types.cpp", + "torch/csrc/autograd/generated/python_sparse_functions.cpp", + "torch/csrc/autograd/generated/python_special_functions.cpp", + "torch/csrc/autograd/generated/python_torch_functions_0.cpp", + "torch/csrc/autograd/generated/python_torch_functions_1.cpp", + "torch/csrc/autograd/generated/python_torch_functions_2.cpp", + "torch/csrc/autograd/generated/python_variable_methods.cpp", ]] _libtorch_python_sources.extend(libtorch_python_core_sources) @@ -945,11 +1001,13 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/MemoryOverlap.cpp", "aten/src/ATen/MapAllocator.cpp", "aten/src/ATen/NamedTensorUtils.cpp", + "aten/src/ATen/NestedTensorImpl.cpp", "aten/src/ATen/ParallelCommon.cpp", "aten/src/ATen/ParallelNative.cpp", "aten/src/ATen/ParallelNativeTBB.cpp", "aten/src/ATen/ParallelOpenMP.cpp", "aten/src/ATen/ParallelThreadPoolNative.cpp", + "aten/src/ATen/PythonTorchFunctionTLS.cpp", "aten/src/ATen/ScalarOps.cpp", "aten/src/ATen/SequenceNumber.cpp", "aten/src/ATen/SparseTensorImpl.cpp", @@ -991,7 +1049,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/core/op_registration/infer_schema.cpp", "aten/src/ATen/core/op_registration/op_registration.cpp", "aten/src/ATen/core/operator_name.cpp", - "aten/src/ATen/core/PythonModeTLS.cpp", + "aten/src/ATen/core/TorchDispatchModeTLS.cpp", "aten/src/ATen/core/register_symbols.cpp", "aten/src/ATen/core/class_type.cpp", "aten/src/ATen/core/type.cpp", @@ -1006,7 +1064,6 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/detail/ORTHooksInterface.cpp", "aten/src/ATen/metal/Context.cpp", "aten/src/ATen/native/AutogradComposite.cpp", - "aten/src/ATen/native/BatchLinearAlgebraKernel.cpp", "aten/src/ATen/native/DispatchStub.cpp", "aten/src/ATen/native/UpSample.cpp", "aten/src/ATen/native/mkl/LinearAlgebra.cpp", @@ -1024,6 +1081,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/native/mkldnn/MkldnnTensorMath.cpp", "aten/src/ATen/native/mkldnn/Normalization.cpp", "aten/src/ATen/native/mkldnn/Pooling.cpp", + "aten/src/ATen/native/mkldnn/Prelu.cpp", "aten/src/ATen/native/mkldnn/Relu.cpp", "aten/src/ATen/native/mkldnn/SoftMax.cpp", "aten/src/ATen/native/mkldnn/TensorFactories.cpp", @@ -1047,6 +1105,10 @@ aten_cpu_source_codegen_list = [ "aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp", ] +aten_ufunc_headers = [ + "aten/src/ATen/native/ufunc/add.h", +] + # When building lite interpreter in OSS, "aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp" will go through # codegen process. The codegen version of this file, like Activation.cpp.DEFAULT.cpp, will be included # in ${cpu_kernel_cpp} in aten/src/ATen/CMakeLists.txt. 
As a result, in aten/src/ATen/CMakeLists.txt, @@ -1079,6 +1141,7 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/cpu/MaxPoolKernel.cpp", "aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp", "aten/src/ATen/native/cpu/MultinomialKernel.cpp", + "aten/src/ATen/native/cpu/PixelShuffleKernel.cpp", "aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp", "aten/src/ATen/native/cpu/PowKernel.cpp", "aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp", @@ -1099,6 +1162,7 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/cpu/batch_norm_kernel.cpp", "aten/src/ATen/native/cpu/group_norm_kernel.cpp", "aten/src/ATen/native/cpu/layer_norm_kernel.cpp", + "aten/src/ATen/native/cpu/WeightNormKernel.cpp", "aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp", ] @@ -1124,7 +1188,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/cpu/qconcat.cpp", "aten/src/ATen/native/quantized/cpu/qconv.cpp", "aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp", - "aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp", + "aten/src/ATen/native/quantized/cpu/qconv_unpack_impl.cpp", "aten/src/ATen/native/quantized/cpu/qelu.cpp", "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp", "aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp", @@ -1136,7 +1200,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp", "aten/src/ATen/native/quantized/cpu/qconv_dynamic.cpp", "aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp", - "aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp", + "aten/src/ATen/native/quantized/cpu/qlinear_unpack_impl.cpp", "aten/src/ATen/native/quantized/cpu/qmatmul.cpp", "aten/src/ATen/native/quantized/cpu/qmul.cpp", "aten/src/ATen/native/quantized/cpu/qnormalization.cpp", @@ -1144,6 +1208,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/cpu/qreduction.cpp", "aten/src/ATen/native/quantized/cpu/qrelu.cpp", "aten/src/ATen/native/quantized/cpu/qsigmoid.cpp", + "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp", "aten/src/ATen/native/quantized/cpu/qsort.cpp", "aten/src/ATen/native/quantized/cpu/qtanh.cpp", "aten/src/ATen/native/quantized/cpu/qthreshold.cpp", @@ -1160,9 +1225,11 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp", "aten/src/ATen/native/quantized/fake_quant_per_tensor_affine.cpp", "aten/src/ATen/native/quantized/library.cpp", + "aten/src/ATen/native/quantized/cpu/ruy_utils.cpp", + "aten/src/ATen/native/quantized/cpu/xnnpack_utils.cpp", + "aten/src/ATen/native/quantized/qlinear_unpack.cpp", "aten/src/ATen/quantized/QTensorImpl.cpp", "aten/src/ATen/quantized/Quantizer.cpp", - "aten/src/ATen/native/attention.cpp", "aten/src/ATen/native/Activation.cpp", "aten/src/ATen/native/AdaptiveAveragePooling.cpp", "aten/src/ATen/native/AdaptiveAveragePooling3d.cpp", @@ -1172,6 +1239,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/AveragePool2d.cpp", "aten/src/ATen/native/AveragePool3d.cpp", "aten/src/ATen/native/BatchLinearAlgebra.cpp", + "aten/src/ATen/native/BatchLinearAlgebraKernel.cpp", "aten/src/ATen/native/Batching.cpp", "aten/src/ATen/native/BinaryOps.cpp", "aten/src/ATen/native/Blas.cpp", @@ -1180,7 +1248,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/CPUBlas.cpp", "aten/src/ATen/native/ChanelShuffle.cpp", "aten/src/ATen/native/Col2Im.cpp", - "aten/src/ATen/native/ConstantPadNd.cpp", + "aten/src/ATen/native/PadNd.cpp", "aten/src/ATen/native/Convolution.cpp", 
"aten/src/ATen/native/ConvolutionMM2d.cpp", "aten/src/ATen/native/ConvolutionMM3d.cpp", @@ -1284,6 +1352,8 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/WeightNorm.cpp", "aten/src/ATen/native/group_norm.cpp", "aten/src/ATen/native/layer_norm.cpp", + "aten/src/ATen/native/nested/NestedTensorMath.cpp", + "aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp", "aten/src/ATen/native/sparse/ParamUtils.cpp", "aten/src/ATen/native/sparse/SoftMax.cpp", "aten/src/ATen/native/sparse/SparseBlas.cpp", @@ -1294,6 +1364,8 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/sparse/SparseTensorMath.cpp", "aten/src/ATen/native/sparse/SparseUnaryOps.cpp", "aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp", + "aten/src/ATen/native/transformers/attention.cpp", + "aten/src/ATen/native/transformers/transformer.cpp", "aten/src/ATen/native/utils/Factory.cpp", "aten/src/ATen/native/xnnpack/Activation.cpp", "aten/src/ATen/native/xnnpack/ChannelShuffle.cpp", @@ -1323,8 +1395,11 @@ aten_cuda_cu_source_list = [ "aten/src/ATen/cuda/CUDASparseBlas.cpp", "aten/src/ATen/cuda/CublasHandlePool.cpp", "aten/src/ATen/native/cuda/Activation.cpp", + "aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp", "aten/src/ATen/native/cuda/Blas.cpp", + "aten/src/ATen/native/cuda/Distributions.cpp", "aten/src/ATen/native/cuda/Equal.cpp", + "aten/src/ATen/native/cuda/GridSampler.cpp", "aten/src/ATen/native/cuda/IndexKernel.cpp", "aten/src/ATen/native/cuda/ReduceOps.cpp", "aten/src/ATen/native/cuda/ScanKernels.cpp", @@ -1334,6 +1409,7 @@ aten_cuda_cu_source_list = [ "aten/src/ATen/native/cuda/TensorShapeCUDA.cpp", "aten/src/ATen/native/cuda/TensorTopK.cpp", "aten/src/ATen/native/cuda/jit_utils.cpp", + "aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp", "aten/src/ATen/native/sparse/cuda/SparseBlas.cpp", "aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp", "aten/src/ATen/native/sparse/cuda/SparseBlasLegacy.cpp", diff --git a/tools/clang_format_hash/linux64/clang-format-linux64 b/tools/clang_format_hash/linux64/clang-format-linux64 deleted file mode 100644 index 40a85640a2aa..000000000000 --- a/tools/clang_format_hash/linux64/clang-format-linux64 +++ /dev/null @@ -1 +0,0 @@ -21ca53c291a88b53dac85751b7a0203ca610ac94b7adaff3c092cf30df4168f2 \ No newline at end of file diff --git a/tools/clang_format_hash/mac/clang-format-mojave b/tools/clang_format_hash/mac/clang-format-mojave deleted file mode 100644 index fe4f8f6bdd69..000000000000 --- a/tools/clang_format_hash/mac/clang-format-mojave +++ /dev/null @@ -1 +0,0 @@ -5fde7bccf65032da297dfb1f18e4a95e96e278fa397e9dcaf364dfe23ec46353 \ No newline at end of file diff --git a/tools/code_analyzer/gen_op_registration_allowlist.py b/tools/code_analyzer/gen_op_registration_allowlist.py index 00f880d6e9c7..65e56856a789 100644 --- a/tools/code_analyzer/gen_op_registration_allowlist.py +++ b/tools/code_analyzer/gen_op_registration_allowlist.py @@ -16,24 +16,26 @@ DepGraph = Dict[str, Set[str]] + def canonical_name(opname: str) -> str: # Skip the overload name part as it's not supported by code analyzer yet. 
- return opname.split('.', 1)[0] + return opname.split(".", 1)[0] + def load_op_dep_graph(fname: str) -> DepGraph: - with open(fname, 'r') as stream: + with open(fname, "r") as stream: result = defaultdict(set) for op in yaml.safe_load(stream): - op_name = canonical_name(op['name']) - for dep in op.get('depends', []): - dep_name = canonical_name(dep['name']) + op_name = canonical_name(op["name"]) + for dep in op.get("depends", []): + dep_name = canonical_name(dep["name"]) result[op_name].add(dep_name) return dict(result) def load_root_ops(fname: str) -> List[str]: result = [] - with open(fname, 'r') as stream: + with open(fname, "r") as stream: for op in yaml.safe_load(stream): result.append(canonical_name(op)) return result @@ -49,7 +51,7 @@ def gen_transitive_closure( # The dependency graph might contain a special entry with key = `__BASE__` # and value = (set of `base` ops to always include in custom build). - queue.append('__BASE__') + queue.append("__BASE__") # The dependency graph might contain a special entry with key = `__ROOT__` # and value = (set of ops reachable from C++ functions). Insert the special @@ -58,7 +60,7 @@ def gen_transitive_closure( # '__ROOT__' is only needed for full-jit. Keep it only for training. # TODO: when FL is migrated from full-jit to lite trainer, remove '__ROOT__' if train: - queue.append('__ROOT__') + queue.append("__ROOT__") while queue: cur = queue.pop() @@ -69,21 +71,25 @@ def gen_transitive_closure( return sorted(result) + def gen_transitive_closure_str(dep_graph: DepGraph, root_ops: List[str]) -> str: - return ' '.join(gen_transitive_closure(dep_graph, root_ops)) + return " ".join(gen_transitive_closure(dep_graph, root_ops)) if __name__ == "__main__": parser = argparse.ArgumentParser( - description='Util to produce transitive dependencies for custom build') + description="Util to produce transitive dependencies for custom build" + ) parser.add_argument( - '--op-dependency', - help='input yaml file of op dependency graph ' - '- can be omitted for custom build with static dispatch') + "--op-dependency", + help="input yaml file of op dependency graph " + "- can be omitted for custom build with static dispatch", + ) parser.add_argument( - '--root-ops', + "--root-ops", required=True, - help='input yaml file of root (directly used) operators') + help="input yaml file of root (directly used) operators", + ) args = parser.parse_args() deps = load_op_dep_graph(args.op_dependency) if args.op_dependency else {} diff --git a/tools/code_analyzer/gen_operators_yaml.py b/tools/code_analyzer/gen_operators_yaml.py new file mode 100644 index 000000000000..0daa27f0480e --- /dev/null +++ b/tools/code_analyzer/gen_operators_yaml.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python3 +import argparse +import json +import sys +from typing import List, Optional, Dict, Any + +import yaml +from gen_op_registration_allowlist import ( + canonical_name, + gen_transitive_closure, + load_op_dep_graph, +) +from torchgen.selective_build.operator import ( + SelectiveBuildOperator, + merge_operator_dicts, +) +from torchgen.selective_build.selector import merge_kernel_metadata + +# Generate YAML file containing the operators used for a specific PyTorch model. +# ------------------------------------------------------------------------------ +# +# This binary is responsible for generating the model_operators.yaml file for +# each model from a pt_operator_library() BUCK macro invocation. 
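Before the gen_operators_yaml.py header comment continues below, a quick illustration of the gen_transitive_closure helper reformatted above, using a made-up dependency graph and root-op list of the same shape the script loads from its YAML inputs ('__BASE__' marks ops that are always included).

dep_graph = {
    "__BASE__": {"aten::empty"},
    "aten::add": {"aten::add_", "aten::to"},
    "aten::to": {"aten::empty"},
}
root_ops = ["aten::add"]

result = set(root_ops)
queue = list(root_ops) + ["__BASE__"]
while queue:
    cur = queue.pop()
    for dep in dep_graph.get(cur, ()):
        if dep not in result:
            result.add(dep)
            queue.append(dep)

print(sorted(result))
# ['aten::add', 'aten::add_', 'aten::empty', 'aten::to']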
+# +# Output YAML file format: +# ------------------------ +# +# +# include_all_non_op_selectives: False +# include_all_operators: False +# debug_info: +# - model1@v100 +# - model2@v50 +# operators: +# aten::add: +# is_root_operator: Yes +# is_used_for_training: Yes +# include_all_overloads: No +# debug_info: +# - model1@v100 +# - model2@v50 +# aten::add.int: +# is_root_operator: No +# is_used_for_training: No +# include_all_overloads: Yes +# kernel_metadata: +# add_kernel: +# - Int8 +# - UInt32 +# sub_kernel: +# - Int16 +# - Float +# +# +# There are a few main inputs to this application +# ----------------------------------------------- +# +# 1. Inference Root Operators (--root_ops): Root operators (called directly +# from TorchScript) used by inference use-cases. +# +# 2. Training Root Operators (--training_root_ops): Root operators used +# by training use-cases. Currently, this list is the list of all operators +# used by training, and not just the root operators. All Training ops are +# also considered for inference, so these are merged into inference ops. +# +# 3. Operator Depencency Graph (--dep_graph_yaml_path): A path to the +# operator dependency graph used to determine which operators depend on +# which other operators for correct functioning. This is used for +# generating the transitive closure of all the operators used by the +# model based on the root operators when static selective build is used. +# For tracing based selective build, we don't need to perform this +# transitive cloure. +# +# 4. Model Metadata (--model_name, --model_versions, --model_assets, +# --model_backends): Self-descriptive. These are used to tell this +# script which model operator lists to fetch from the Unified Model +# Build Metadata YAML file. +# +# 5. Unified Model YAML file (--models_yaml_path): A path to the Unified +# model YAML operator list file. This yaml file contains (for each +# model/version/asset/backend) the set of used root and traced +# operators. This is used to extract the actual set of operators +# needed to be included in the build. +# + + +def canonical_opnames(opnames: List[str]) -> List[str]: + return [canonical_name(opname) for opname in opnames] + + +def make_filter_from_options( + model_name: str, + model_versions: List[str], + model_assets: Optional[List[str]], + model_backends: Optional[List[str]], +): + def is_model_included(model_info): + model = model_info["model"] + if model["name"] != model_name: + return False + if str(model["version"]) not in model_versions: + return False + if model_assets is not None and model["asset"] not in model_assets: + return False + # TODO: Handle backend later + return True + + return is_model_included + + +# Returns if a the specified rule is a new or old style pt_operator_library +def is_new_style_rule(model_name: str, model_versions: Optional[List[str]]): + return model_name is not None and model_versions is not None + + +# Verifies that specified model_name, and all specified versions and assets +# appear in at least one model yaml. 
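The filtering above keys off `model.name`, `model.version`, and `model.asset` in the unified models YAML. To make that input concrete, here is a hedged, hypothetical single-entry example assembled only from the keys this script reads (`model`, `train`, `root_operators`, `traced_operators`, `kernel_metadata`, `custom_classes`, `build_features`); real entries may carry additional fields, and every name, version, asset, and hash below is made up.

```python
import yaml  # PyYAML, which these scripts already import

example_models_yaml = yaml.safe_load(
    """
- model:
    name: my_model
    version: 100
    asset: my_asset
    md5_hash: 0123456789abcdef
  train: false
  root_operators:
    - aten::add.Tensor
  traced_operators:
    - aten::add.Tensor
    - aten::mul.Tensor
  kernel_metadata:
    add_kernel:
      - Int8
  custom_classes: []
  build_features: []
"""
)
print(example_models_yaml[0]["model"]["name"])  # my_model
```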
Throws if verification is failed, +# returns None on success +def verify_all_specified_present( + model_assets: Optional[List[str]], + model_versions: List[str], + selected_models_yaml: List[Dict[str, Any]], + rule_name: str, + model_name: str, + new_style_rule: bool, +): + def find_missing_items(model_items, key, selected_models_yaml): + missing_items = [] + if not new_style_rule or not model_items: + return missing_items + for item in model_items: + found = False + for model in selected_models_yaml: + if str(model["model"][key]) == item: + found = True + if not found: + missing_items.append(item) + return missing_items + + missing_assets = find_missing_items(model_assets, "asset", selected_models_yaml) + missing_versions = find_missing_items( + model_versions, "version", selected_models_yaml + ) + + if len(missing_versions) > 0 or len(missing_assets) > 0: # at least one is missing + name_warning = "" + if len(selected_models_yaml) == 0: + name_warning = ( + "WARNING: 0 yaml's were found for target rule. This could be because the " + + "provided model name: {name} is incorrect. Please check that field as well as " + + "the assets and versions." + ).format(name=model_name) + raise RuntimeError( + ( + "Error: From the pt_operator_library rule for Rule: {name}, at least one entry for the " + + "following fields was expected -- Model: {model_name} Expected Assets: {expected_assets}, Expected Versions: " + + "{expected_versions}. {name_warning} In all_mobile_models.yaml either no assets were on one of the " + + "specified versions, one of the specified assets was not present on any of the specified " + + "versions, or both. Assets not found: {missing_assets}, Versions not found: {missing_versions} " + + "For questions please ask in https://fb.workplace.com/groups/2148543255442743/" + ).format( + name=rule_name, + model_name=model_name, + expected_versions=model_versions, + expected_assets=model_assets + if model_assets + else "", + name_warning=name_warning, + missing_versions=missing_versions + if len(missing_versions) > 0 + else "", + missing_assets=missing_assets + if len(missing_assets) > 0 + else "", + ) + ) + + +# Uses the selected models configs and then combines them into one dictionary, +# formats them as a string, and places the string into output as a top level debug_info +def create_debug_info_from_selected_models( + output: Dict[str, object], + selected_models: List[dict], + new_style_rule: bool, +): + + model_dict = { + "asset_info": {}, # maps asset name -> dict of asset metadata like hashes + "is_new_style_rule": new_style_rule, + } + + for model in selected_models: + model_info = model["model"] + asset = model_info["asset"] + hash = model_info["md5_hash"] + + asset_info = model_dict["asset_info"].setdefault(asset, {}) + + asset_info.setdefault("md5_hash", []).append(hash) + + # Will later be used in gen_oplist to generate the model/version/asset checking + output["debug_info"] = [json.dumps(model_dict)] + + +def fill_output(output: Dict[str, object], options: object): + """Populate the output dict with the information required to serialize + the YAML file used for selective build. 
+ """ + dept_graph = load_op_dep_graph(options.dep_graph_yaml_path) + + model_versions = ( + options.model_versions.split(",") if options.model_versions is not None else [] + ) + model_assets = ( + options.model_assets.split(",") if options.model_assets is not None else None + ) + + with open(options.models_yaml_path, "rb") as models_yaml_file: + all_models_yaml = yaml.safe_load(models_yaml_file) or [] + + model_filter_func = make_filter_from_options( + options.model_name, model_versions, model_assets, options.model_backends + ) + + selected_models_yaml = list(filter(model_filter_func, all_models_yaml)) + + verify_all_specified_present( + model_assets=model_assets, + model_versions=model_versions, + selected_models_yaml=selected_models_yaml, + rule_name=options.rule_name, + model_name=options.model_name, + new_style_rule=is_new_style_rule(options.model_name, options.model_versions), + ) + + create_debug_info_from_selected_models( + output, + selected_models_yaml, + is_new_style_rule(options.model_name, options.model_versions), + ) + + # initialize variables for static build from the pt_operator_library rule + if options.root_ops is not None: + static_root_ops = set(filter(lambda x: len(x) > 0, options.root_ops.split(","))) + else: + static_root_ops = set() + + static_training_root_ops = set( + filter( + lambda x: len(x) > 0, + (options.training_root_ops or "").split(","), + ) + ) + if len(static_training_root_ops) > 0: + static_root_ops = static_root_ops | static_training_root_ops + # end if + + root_ops_unexpand = set() + traced_ops = set() + training_root_ops_unexpand = set() + traced_training_ops = set() + all_kernel_metadata = [] + all_custom_classes = set() + all_build_features = set() + + # Go through each yaml file and retrieve operator information. + for model_info in selected_models_yaml: + if "traced_operators" not in model_info: + # If this YAML file doesn't specify any traced operators, then it is using + # the static analysis selective build approach of finding transitively + # used operators, and we should update root_ops with the set of root + # operators, all of whose overloads must be included. In addition, these + # root_ops will be further expanded using the transitive closure of + # operator dependencies. + static_root_ops = static_root_ops | set(model_info["root_operators"]) + else: + # If this YAML file specifies traced operators, then it is using + # the tracing based selective build approach of finding used + # operators, and we should update root_ops_unexpand with the set of root + # operators whose overloads don't need to be included. In addition, these + # root_ops_unexpand will NOT be further expanded. 
If the train flag is + # set then the ops will be used for training, so we put them in a separate + # set + if model_info["train"]: + training_root_ops_unexpand = training_root_ops_unexpand | set( + model_info["root_operators"] + ) + traced_training_ops = traced_training_ops | set( + model_info["traced_operators"] + ) + else: + root_ops_unexpand = root_ops_unexpand | set( + model_info["root_operators"] + ) + traced_ops = traced_ops | set(model_info["traced_operators"]) + + if "kernel_metadata" in model_info: + all_kernel_metadata.append(model_info["kernel_metadata"]) + + if "custom_classes" in model_info: + all_custom_classes = all_custom_classes | set(model_info["custom_classes"]) + + if "build_features" in model_info: + all_build_features = all_build_features | set(model_info["build_features"]) + + # This following section on transitive closure is relevant to static build only + canonical_root_ops = canonical_opnames(static_root_ops) + # If no canonical_root_ops exist, don't compute the transitive closure + # otherwise, we will include __BASE__ and __ROOT__ ops and mark them as required + # for inference. + if len(canonical_root_ops) > 0: + closure_op_list = gen_transitive_closure(dept_graph, canonical_root_ops) + else: + closure_op_list = set() + + canonical_training_root_ops = canonical_opnames(static_training_root_ops) + # If no canonical_training_root_ops exist, don't compute the transitive closure + # otherwise, we will include __BASE__ and __ROOT__ ops and mark them as required + # for training. + if len(canonical_training_root_ops) > 0: + closure_training_op_list = gen_transitive_closure( + dept_graph, canonical_training_root_ops, train=True + ) + else: + closure_training_op_list = set() + + # bucketed_ops holds sets of operators that correspond to specific semantic buckets. For + # example: + # + # 1. Root Operators not used for training w/o full overload inclusion + # 2. Root Operators not used for training w/ full overload inclusion + # 3. Root Operators used for training w/o full overload inclusion + # 4. Root Operators used for training w/ full overload inclusion + # 5. Non-root Operators not used for training w/o full overload inclusion + # etc... + # + # Basically for each of the 3 boolean conditional, there are 2 + # options (True/False). 
+ # + bucketed_ops = [] + + # START STATIC BUILD OPS + static_root_ops_bucket = {} + for op_name in static_root_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": False, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + static_root_ops_bucket[op_name] = op + bucketed_ops.append(static_root_ops_bucket) + + closure_ops_bucket = {} + for op_name in closure_op_list: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": False, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + closure_ops_bucket[op_name] = op + bucketed_ops.append(closure_ops_bucket) + + static_training_root_ops_bucket = {} + for op_name in static_training_root_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": True, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + static_training_root_ops_bucket[op_name] = op + bucketed_ops.append(static_training_root_ops_bucket) + + closure_training_ops_bucket = {} + for op_name in closure_training_op_list: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": True, + "include_all_overloads": True, + "debug_info": [options.model_name], + }, + ) + closure_training_ops_bucket[op_name] = op + bucketed_ops.append(closure_training_ops_bucket) + # END STATIC BUILD OPS + + # START TRACING BASED BUILD OPS + root_ops_unexpand_bucket = {} + for op_name in root_ops_unexpand: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": False, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + root_ops_unexpand_bucket[op_name] = op + bucketed_ops.append(root_ops_unexpand_bucket) + + traced_ops_bucket = {} + for op_name in traced_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": False, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + traced_ops_bucket[op_name] = op + bucketed_ops.append(traced_ops_bucket) + + training_root_ops_unexpand_bucket = {} + for op_name in training_root_ops_unexpand: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": True, + "is_used_for_training": True, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + training_root_ops_unexpand_bucket[op_name] = op + bucketed_ops.append(training_root_ops_unexpand_bucket) + + traced_training_ops_bucket = {} + for op_name in traced_training_ops: + op = SelectiveBuildOperator.from_yaml_dict( + op_name, + { + "is_root_operator": False, + "is_used_for_training": True, + "include_all_overloads": False, + "debug_info": [options.model_name], + }, + ) + traced_training_ops_bucket[op_name] = op + bucketed_ops.append(traced_training_ops_bucket) + # END TRACING BASED BUILD OPS + + # Merge dictionaries together to remove op duplication + operators: Dict[str, SelectiveBuildOperator] = {} + for ops_dict in bucketed_ops: + operators = merge_operator_dicts(operators, ops_dict) + + # Loop over all operators, and if any of the them specifies that + # all overloads need to be included, then set include_all_non_op_selectives + # to True, since it indicates that this operator list came from something + # other than a traced operator list. 
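The buckets built above all funnel into `merge_operator_dicts` to de-duplicate operators that appear in more than one bucket. The real merge lives in `torchgen.selective_build.operator`; the sketch below only illustrates the assumed semantics (boolean flags are OR-ed so the most permissive setting wins, debug_info lists are concatenated) using plain dicts and hypothetical operator names.

```python
from typing import Dict

def merge_flags_sketch(a: Dict[str, Dict], b: Dict[str, Dict]) -> Dict[str, Dict]:
    # Assumed merge semantics: for an operator present in both inputs, OR the
    # boolean flags and concatenate debug_info; otherwise copy the entry over.
    merged = dict(a)
    for op_name, info in b.items():
        if op_name not in merged:
            merged[op_name] = info
            continue
        cur = merged[op_name]
        merged[op_name] = {
            "is_root_operator": cur["is_root_operator"] or info["is_root_operator"],
            "is_used_for_training": cur["is_used_for_training"] or info["is_used_for_training"],
            "include_all_overloads": cur["include_all_overloads"] or info["include_all_overloads"],
            "debug_info": cur["debug_info"] + info["debug_info"],
        }
    return merged

static_bucket = {  # hypothetical buckets for a single operator
    "aten::add": {"is_root_operator": True, "is_used_for_training": False,
                  "include_all_overloads": True, "debug_info": ["model_a"]},
}
traced_bucket = {
    "aten::add": {"is_root_operator": True, "is_used_for_training": True,
                  "include_all_overloads": False, "debug_info": ["model_b"]},
}
print(merge_flags_sketch(static_bucket, traced_bucket)["aten::add"]["include_all_overloads"])  # True
```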
+ include_all_non_op_selectives = False + for (op_name, op_info) in operators.items(): + include_all_non_op_selectives = ( + include_all_non_op_selectives or op_info.include_all_overloads + ) + + operators_as_dict = {} + for (k, v) in operators.items(): + operators_as_dict[k] = v.to_dict() + + output["operators"] = operators_as_dict + + output["custom_classes"] = all_custom_classes + + output["build_features"] = all_build_features + + output["include_all_non_op_selectives"] = include_all_non_op_selectives + if len(all_kernel_metadata) > 0: + kernel_metadata = {} + for kt in all_kernel_metadata: + kernel_metadata = merge_kernel_metadata(kernel_metadata, kt) + output["kernel_metadata"] = kernel_metadata + + +def get_parser_options(parser: argparse.ArgumentParser) -> argparse.Namespace: + parser.add_argument( + "--root_ops", + help="A comma separated list of root operators used by the model", + required=False, + ) + parser.add_argument( + "--training_root_ops", + help="A comma separated list of root operators used for training", + required=False, + ) + parser.add_argument( + "--output_path", + help="The location of the output yaml file.", + required=True, + ) + parser.add_argument( + "--dep_graph_yaml_path", + type=str, + help="A path to the Operator Dependency Graph YAML file.", + required=True, + ) + parser.add_argument( + "--model_name", + type=str, + help="The name of the model that uses the specified root operators.", + required=True, + ) + parser.add_argument( + "--model_versions", + type=str, + help="A comma separated list of model versions.", + required=False, + ) + parser.add_argument( + "--model_assets", + type=str, + help="A comma separate list of model asset names (if absent, defaults to all assets for this model).", + required=False, + ) + parser.add_argument( + "--model_backends", + type=str, + default="CPU", + help="A comma separated list of model backends.", + required=False, + ) + parser.add_argument( + "--models_yaml_path", + type=str, + help="The path to where the unified Mobile Model Config YAML resides.", + required=True, + ) + parser.add_argument( + "--include_all_operators", + action="store_true", + default=False, + help="Set this flag to request inclusion of all opeators (i.e. 
build is not selective).", + required=False, + ) + parser.add_argument( + "--rule_name", + type=str, + help="The name of pt_operator_library rule resulting in this generation", + required=True, + ) + options = parser.parse_args() + return options + + +def main(argv) -> None: + parser = argparse.ArgumentParser(description="Generate used operators YAML") + options = get_parser_options(parser) + + model_dict = { + "model_name": options.model_name, + "asset_info": {}, + "is_new_style_rule": False, + } + output = { + "debug_info": [json.dumps(model_dict)], + } + + if options.include_all_operators: + output["include_all_operators"] = True + output["operators"] = {} + output["kernel_metadata"] = {} + else: + fill_output(output, options) + + with open(options.output_path, "wb") as out_file: + out_file.write( + yaml.safe_dump( + output, + default_flow_style=False, + ).encode("utf-8") + ) + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/tools/code_analyzer/gen_oplist.py b/tools/code_analyzer/gen_oplist.py index 010b420d8c9b..b5d31b922167 100644 --- a/tools/code_analyzer/gen_oplist.py +++ b/tools/code_analyzer/gen_oplist.py @@ -7,11 +7,15 @@ from typing import Set, List, Any import yaml -from tools.codegen.selective_build.selector import combine_selective_builders, SelectiveBuilder +from torchgen.selective_build.selector import ( + combine_selective_builders, + SelectiveBuilder, +) from tools.lite_interpreter.gen_selected_mobile_ops_header import ( write_selected_mobile_ops, ) + def extract_all_operators(selective_builder: SelectiveBuilder) -> Set[str]: ops = [] for (op_name, op) in selective_builder.operators.items(): @@ -125,7 +129,7 @@ def main(argv: List[Any]) -> None: ) options = parser.parse_args() - if (os.path.isfile(options.model_file_list_path)): + if os.path.isfile(options.model_file_list_path): print("Processing model file: ", options.model_file_list_path) model_dicts = [] model_dict = yaml.safe_load(open(options.model_file_list_path)) @@ -180,5 +184,6 @@ def main(argv: List[Any]) -> None: selective_builder, ) + if __name__ == "__main__": main(sys.argv) diff --git a/tools/code_coverage/README.md b/tools/code_coverage/README.md index 6e83dc593ed1..67adb445d053 100644 --- a/tools/code_coverage/README.md +++ b/tools/code_coverage/README.md @@ -3,7 +3,7 @@ ## Overview This tool is designed for calculating code coverage for Pytorch project. -It’s an integrated tool. You can use this tool to run and generate both file-level and line-level report for C++ and Python tests. It will also be the tool we use in *CircleCI* to generate report for each master commit. +It’s an integrated tool. You can use this tool to run and generate both file-level and line-level report for C++ and Python tests. It will also be the tool we use in *CircleCI* to generate report for each main commit. ### Simple * *Simple command to run:* @@ -30,11 +30,11 @@ This part will introduce about the arguments you can use when run this tool. The We have two different compilers, `gcc` and `clang`, and this tool supports both. But it is recommended to use `gcc` because it's much faster and use less disk place. The examples will also be divided to two parts, for `gcc` and `clang`. ## Preparation -The first step is to [build *Pytorch* from source](https://github.com/pytorch/pytorch#from-source) with `CODE_COVERAGE` option `ON`. You may also want to set `BUILD_TEST` option `ON` to get the test binaries. 
Besides, if you are under `gcc` compiler, to get accurate result, it is recommended to also select `CMAKE_BUILD_CONFIG=Debug`. +The first step is to [build *Pytorch* from source](https://github.com/pytorch/pytorch#from-source) with `USE_CPP_CODE_COVERAGE` option `ON`. You may also want to set `BUILD_TEST` option `ON` to get the test binaries. Besides, if you are under `gcc` compiler, to get accurate result, it is recommended to also select `CMAKE_BUILD_TYPE=Debug`. See: [how to adjust build options](https://github.com/pytorch/pytorch#adjust-build-options-optional) for reference. Following is one way to adjust build option: ``` # in build/ folder (all build artifacts must in `build/` folder) -cmake .. -DCODE_COVERAGE=ON -DBUILD_TEST=ON -DCMAKE_BUILD_CONFIG=Debug +cmake .. -DUSE_CPP_CODE_COVERAGE=ON -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug ``` @@ -53,7 +53,7 @@ python oss_coverage.py --run-only=atest ``` This command will run `atest` binary in `build/bin/` folder and generate reoports over the entire *Pytorch* folder. You can find the reports in `profile/summary`. But you may only be interested in the `aten` folder, in this case, try: ``` -python oss_coverage.py --run-only=atest --interested-only=aten +python oss_coverage.py --run-only=atest --interest-only=aten ``` In *Pytorch*, `c++` tests located in `build/bin/` and `python` tests located in `test/`. If you want to run `python` test, try: ``` @@ -62,7 +62,7 @@ python oss_coverage.py --run-only=test_complex.py You may also want to specify more than one test or interested folder, in this case, try: ``` -python oss_coverage.py --run-only=atest c10_logging_test --interested-only aten/src/Aten c10/core +python oss_coverage.py --run-only=atest c10_logging_test --interest-only aten/src/Aten c10/core ``` That it is! With these two simple options, you can customize many different functionality according to your need. By default, the tool will run all tests in `build/bin` folder (by running all executable binaries in it) and `test/` folder (by running `run_test.py`), and then collect coverage over the entire *Pytorch* folder. If this is what you want, try: @@ -84,9 +84,9 @@ By default all steps will be run, but you can specify only run one of them. Foll `—summary` is useful when you have different interested folder. For example, ```bash # after run this command -python oss_coverage.py --run-only=atest --interested-folder=aten +python oss_coverage.py --run-only=atest --interest-only=aten # you may then want to learn atest's coverage over c10, instead of running the test again, you can: -python oss_coverage.py --run-only=atest --interested-folder=c10 --summary +python oss_coverage.py --run-only=atest --interest-only=c10 --summary ``` diff --git a/tools/codegen/api/autograd.py b/tools/codegen/api/autograd.py deleted file mode 100644 index 64b7547e78f0..000000000000 --- a/tools/codegen/api/autograd.py +++ /dev/null @@ -1,388 +0,0 @@ -from dataclasses import dataclass -import re -from typing import Optional, Sequence, Set, List, Tuple, Match - -from tools.codegen.api import cpp -from tools.codegen.api.types import Binding, NamedCType -from tools.codegen.model import NativeFunction, Type, SchemaKind -from tools.codegen.utils import IDENT_REGEX - -# Represents a saved attribute involved in backward calculation. -# Note that it can be a derived property of an input argument, e.g.: -# we could save `other.scalar_type()` instead of the entire `other` tensor. 
-@dataclass(frozen=True) -class SavedAttribute: - # The NamedCType holds the updated name and cpp type of the attribute - # for the name, Suffix is appended if it's derived property, e.g.: `other_scalar_type` - nctype: NamedCType - - # The expression to read the derived property at save time, e.g.: - # `other.scalar_type()`. - expr: str - -# Represents a backward formula that calculates derivatives for one -# or more tensors. -@dataclass(frozen=True) -class Derivative: - # The formula string (legit C++ expression). - # Note that expressions against input arguments have been replaced with the - # corresponding saved attributes. - # E.g.: - # raw formula: `mul_tensor_backward(grad, self, other.scalar_type())` - # here: `mul_tensor_backward(grad, self, other_scalar_type)` - formula: str - - # The formula string before input argument replacement - original_formula: str - - # Names of the arguments for which this formula calculates derivatives. - var_names: Tuple[str, ...] - - # Saved inputs that are referenced by the formula. - saved_inputs: Tuple[SavedAttribute, ...] - - # Saved outputs that are referenced by the formula. - saved_outputs: Tuple[SavedAttribute, ...] - - # Gradients that are referenced by name in the formula. - named_gradients: Set[str] - -# Represents a forward formula that calculates forward derivatives -# for one tensor. -@dataclass(frozen=True) -class ForwardDerivative: - # The formula string (legit C++ expression). - # Note that special keywords such as "linear" or "element_wise" have been - # replaced by the automatically generated formula. - formula: str - - # Name of the output argument for which this formula calculates forward - # derivatives - var_name: str - - # Type of the output argument for which this formula calculates forward - # derivatives - var_type: Type - - # Inputs for which the forward derivatives are required for this formula - required_inputs_fw_grad: Optional[Tuple[str, ...]] - - # Inputs for which the primal is required for this formula - required_inputs_primal: Optional[Tuple[str, ...]] - - # Flag to specify if this formula requires the original value of self - # This is only used by inplace operations - required_original_self_value: bool - - # If this formula is specified in derivatives.yaml or if we are re-using the - # out of place formula for inplace - is_reusing_outplace_formula: bool - -# Represents differentiability info for a NativeFunction. -@dataclass(frozen=True) -class DifferentiabilityInfo: - # The base name read from derivatives.yaml. - name: str - - # The matching native function. - # - # There can be multiple NativeFunction having the same base name: - # - different overloads with different types of input arguments; - # - in-place/out/functional variants of the same function; - # - # We first use the schema string (under the 'name' key) in derivatives.yaml - # to find the NativeFunction having the same schema string. - # Then we find the in-place/out/functional variants of the matching function. - # Among these variants, we choose the one having the same name as the - # derivatives.yaml entry. If there is no exact match, then we choose the - # in-place variant. - # TODO: maybe the logic to search for all variants is no longer necessary? - func: NativeFunction - - # The name of the generated autograd function. - # It's set only if we will calculate a derivative, i.e. - # 'args_with_derivatives' is not empty. - op: Optional[str] - - # The derivatives formulae for this function. 
- # Note that the length of this sequence is the number of differentiable inputs - derivatives: Sequence[Derivative] - - # The forward derivatives formulae for this function. - # Note that the length of this sequence is the number of differentiable outputs - forward_derivatives: Sequence[ForwardDerivative] - - # The union of 'saved_inputs' of all 'derivatives'. - all_saved_inputs: Sequence[SavedAttribute] - - # The union of 'saved_outputs' of all 'derivatives'. - all_saved_outputs: Sequence[SavedAttribute] - - # All named gradients that are available for use, in the same - # order as in the grads vector. - available_named_gradients: Sequence[str] - - # The named gradients that are used in any of the derivatives. - # Invariant: all(name in available_named_gradients for name in used_named_gradients) - used_named_gradients: Set[str] - - # The function's input arguments for which it calculates derivatives. - # It's the union of 'var_names' of all 'derivatives', sorted by the - # argument order in the function schema. - args_with_derivatives: Sequence[Binding] - - # Names of arguments whose derivative formula is 'non_differentiable'. - non_differentiable_arg_names: Sequence[str] - - # Raw data read from derivatives.yaml. - output_differentiability: Optional[List[bool]] - - # output_differentiability in derivatives.yaml can be a list of - # conditions that express if the output is differentiable. In this case, - # the number of conditions must match the number of outputs - # (NB: we only support one condition right now). - # output_differentiability gets populated with True for each condition, - # while output_differentiability_conditions gets populated with the conditions - output_differentiability_conditions: Optional[List[str]] - - @property - def has_derivatives(self) -> bool: - return len(self.args_with_derivatives) > 0 - -def uses_ident(info: Optional[DifferentiabilityInfo], ident: str) -> bool: - if info is None: - return False - for derivative in info.derivatives: - formula = derivative.formula - if re.search(IDENT_REGEX.format(ident), formula): - return True - return False - -def uses_retain_variables(info: Optional[DifferentiabilityInfo]) -> bool: - return uses_ident(info, 'retain_variables') - -def uses_single_grad(info: Optional[DifferentiabilityInfo]) -> bool: - return uses_ident(info, 'grad') - -# Represents a differentiable `Argument`. -# How is it different from the `Argument` type? -# - It's processed Arguments which are differentiable and only used in the -# context of the autograd codegen; -# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument; -@dataclass(frozen=True) -class DifferentiableInput: - name: str - type: Type - - # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. - cpp_type: str - -# Represents a differentiable `Return`. -# How it it different from the `Return` type? -# - The name in `Return` is optional. Here it is always populated using the same -# `cpp.return_names()` method. -# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant? -# - It's processed Returns which are differentiable, in compliance with the -# `output_differentiability` field defined in derivatives.yaml (if specified), -# and are only used in the context of the autograd codegen; -@dataclass(frozen=True) -class DifferentiableOutput: - name: str - type: Type - - # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. 
- cpp_type: str - -@dataclass(frozen=True) -class NativeFunctionWithDifferentiabilityInfo: - func: NativeFunction - info: Optional[DifferentiabilityInfo] - fw_derivatives: Sequence[ForwardDerivative] - -# TODO: Update comment below since it is out of date. -def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str: - """How are we going to call the underlying implementation of a - declaration? There are two strategies: - - use_derived: we want to call the implementation on CPUDoubleType - (or a similar, derived Type instance). Because these derived - instances deal in Tensors, not Variables (it's a completely different - object, so it doesn't dispatch back to VariableType), code on - this dispatch path needs to wrap/unwrap tensors. If the - derived implementation takes and returns tensors, the - implementation is usually differentiable (although we also use - the derived dispatch path for non-differentiable functions - that we still want to dispatch on the derived Type instance; - e.g., size()) - - use_type: we want to call the implementation on Type, because - it is implemented concretely, and the functions it invokes will - get dispatched back to VariableType (which will ensure that they - are differentiable.) - """ - if fn.func.is_abstract or (fn.info is not None and fn.info.has_derivatives): - # If the function is abstract (not implemented on at::Type), we must - # call the implementation on the derived type with unpacked tensors. - - # If the function has a derivative specified and is concrete, we could - # call either implementation. We prefer the calling the derived - # type's implementation with unpacked tensors because it is more - # performant in some cases: any internal calls to other ATen functions - # won't have the history tracked. - - # If the function has a type dispatched argument (i.e. is a factory), - # we prefer calling the derived type's implementation both because it is - # more performant and to ensure factory functions return tensors with _version - # of 0 (probably not strictly necessary, but nice to have to keeps versions simple - # to understand. - - return 'use_derived' - else: - # If the function is concrete (we don't have to override it) and we - # didn't declare it in derivatives.yaml, we'll assume that it is - # actually implemented out of differentiable functions. (This - # assumption might not hold, but then you'll see gradcheck fail.) - return 'use_type' - -def match_differentiability_info( - native_functions: List[NativeFunction], - differentiability_infos: Sequence[DifferentiabilityInfo], -) -> List[NativeFunctionWithDifferentiabilityInfo]: - """Sets the "derivative" key on declarations to matching autograd function - In-place functions will use the out-of-place derivative definition if there - is no in-place specific derivative. - """ - - info_by_schema = {info.func.func: info for info in differentiability_infos} - functional_info_by_signature = { - info.func.func.signature(strip_default=True): info - for info in differentiability_infos - if info.func.func.kind() == SchemaKind.functional} - - def find_info(f: NativeFunction) -> Tuple[Optional[DifferentiabilityInfo], bool]: - if f.func in info_by_schema: - return info_by_schema[f.func], True - - # if there is no exact match look for the out-of-place signature. 
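A little further down, the in-place forward-derivative handling rewrites formulas by swapping whole identifiers (for example `result` -> `self_p`, or `self_p` -> `original_self_p`) through `IDENT_REGEX`. The sketch below uses an assumed equivalent pattern, `(^|\W)ident($|\W)`, to show why the substitution touches the standalone identifier but not identifiers that merely contain it; the formula string is made up.

```python
import re
from typing import Match

# Assumed shape of IDENT_REGEX: match an identifier only when it is not
# embedded inside a longer identifier (non-word boundaries on both sides).
IDENT_REGEX_SKETCH = r"(^|\W){}($|\W)"

def replace_ident(formula: str, old: str, new: str) -> str:
    def repl(m: Match[str]) -> str:
        return f"{m.group(1)}{new}{m.group(2)}"
    return re.sub(IDENT_REGEX_SKETCH.format(old), repl, formula)

formula = "grad * result / result_scale"  # hypothetical formula
print(replace_ident(formula, "result", "self_p"))
# grad * self_p / result_scale   (only the standalone identifier changes)
```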
- # i.e mul() for mul_() or mul_out() - return functional_info_by_signature.get(f.func.signature(strip_default=True)), False - - result: List[NativeFunctionWithDifferentiabilityInfo] = [] - for f in native_functions: - info, is_exact_match = find_info(f) - - # Currently, the '.strides()' to 'strides_or_error' replacement does not support - # 'self' derivatives of an inplace function, so we must check for this case. - if f.func.kind() == SchemaKind.inplace and (info is not None): - for derivative in info.derivatives: - if 'self' in derivative.var_names: - for saved_input in derivative.saved_inputs: - assert 'strides_or_error' not in saved_input.expr, ( - "Calling '.strides()' in the 'self' derivative formula of an " - f"in-place function is not supported: {f.func}") - - # For functions that have a single def for out-of-place and inplace (like abs()) - if info and info.forward_derivatives: - forward_derivatives = info.forward_derivatives - - if f.func.kind() == SchemaKind.inplace: - # For inplace functions there is a little bit of work to do: - # 1) Validate the formula and make sure the input that is modified in not used: - # - If there is a formula for the inplace variant of the function (is_exact_match == True) then - # we make sure that the original value of the input that is being modified inplace (self_p) is - # not used in the formula. Note that the formula can use "original_self_p" here and that would - # trigger a clone of the original input. - # - If we are re-using the out of place formula (is_exact_match == False) then we replace every - # occurrence of self_p and self_t by original_self_p and original_self_t. These will be - # populated by cloned version of the original input (either the clone done by the backward AD - # logic if self is also used in a backward formula or a special clone that we add). - # 2) At this point, there cannot be a self_p in the formula. - # 3) Change "result" into "self_p" as by design, in the inplace function codegen, the result is - # simply called self (as it is modified inplace). - # 4) Update the required primals data in case it used to contain "result" but should now contain - # "self" - # 5) If it is not an exact match, the user formula is not modifying the existing forward grad - # inplace as it should. So add some code that makes sure that we do so if the forward grad - # already exists. - - assert len(info.forward_derivatives) == 1 # Only single output inplace should exist - fw_info = info.forward_derivatives[0] - formula = fw_info.formula - - def replace_self_with_original_self(formula: str, postfix: str) -> str: - def repl(m: Match[str]) -> str: - return f'{m.group(1)}original_self{postfix}{m.group(2)}' - return re.sub(IDENT_REGEX.format(f'self{postfix}'), repl, formula) - - if re.search(IDENT_REGEX.format("self_p"), formula): - if is_exact_match: - # For manually defined formulas, don't allow the original value to be used - raise RuntimeError(f'The formula for "{f.func.name}" is using the original value of self ' - 'that is being modified inplace. This would lead to wrong forward gradients. 
' - 'Please use "result" in the formula only.') - else: - # When the original formula is out of place, we save a clone of the primal - # value to be able to access this value if needed - # replace "self_p"/"self_t" from the formula by "original_self_p"/"original_self_t" - formula = replace_self_with_original_self(formula, "_p") - formula = replace_self_with_original_self(formula, "_t") - - # replace "result" from the formula by "self_p" - def repl(m: Match[str]) -> str: - return f'{m.group(1)}self_p{m.group(2)}' - formula = re.sub(IDENT_REGEX.format("result"), repl, formula) - - required_primals = fw_info.required_inputs_primal - if re.search(IDENT_REGEX.format("self_p"), formula): - required_primals = required_primals + ("self",) if required_primals else ("self",) - - if not is_exact_match: - # Make sure that the forward grad is modified inplace when the original formula - # is out of place - formula = f"self_t_raw.defined() ? self_t_raw.copy_({formula}) : {formula}" - - required_original_self_value = bool(re.search(IDENT_REGEX.format("original_self_p"), formula)) - - forward_derivatives = [ForwardDerivative( - formula=formula, - var_name="self", - var_type=fw_info.var_type, - required_inputs_fw_grad=fw_info.required_inputs_fw_grad, - required_inputs_primal=required_primals, - required_original_self_value=required_original_self_value, - is_reusing_outplace_formula=not is_exact_match), ] - else: - forward_derivatives = [] - - result.append(NativeFunctionWithDifferentiabilityInfo( - func=f, - info=info, - fw_derivatives=forward_derivatives - )) - - return result - -def is_differentiable(name: str, type: Type, info: Optional[DifferentiabilityInfo]) -> bool: - return type.is_tensor_like() and (info is None or name not in info.non_differentiable_arg_names) - -def gen_differentiable_outputs(fn: NativeFunctionWithDifferentiabilityInfo) -> List[DifferentiableOutput]: - f = fn.func - info = fn.info - outputs: List[DifferentiableOutput] = [ - DifferentiableOutput(name=name, type=ret.type, cpp_type=cpp.return_type(ret).cpp_type()) - for name, ret in zip(cpp.return_names(f), f.func.returns)] - output_differentiability = info.output_differentiability if info else None - if output_differentiability is not None: - if len(output_differentiability) != len(outputs): - raise RuntimeError(f"The length of output_differentiability ({len(output_differentiability)}), " - f"does not match the number of outputs ({len(outputs)}).") - differentiable_outputs: List[DifferentiableOutput] = [] - if False in output_differentiability and f.func.kind() == SchemaKind.inplace: - raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") - for differentiable, output in zip(output_differentiability, outputs): - if differentiable: - differentiable_outputs.append(output) - return differentiable_outputs - candidate_differentiable_outputs = list(filter(lambda r: is_differentiable(r.name, r.type, info), outputs)) - if uses_single_grad(info): - return candidate_differentiable_outputs[:1] - else: - return candidate_differentiable_outputs diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py deleted file mode 100644 index a485fc17acf6..000000000000 --- a/tools/codegen/api/cpp.py +++ /dev/null @@ -1,317 +0,0 @@ -from tools.codegen.model import (Argument, Arguments, BaseTy, BaseType, - FunctionSchema, ListType, NativeFunction, - OptionalType, Return, SelfArgument, - TensorOptionsArguments, Type) -from tools.codegen.api.types import (ArgName, BaseCType, Binding, 
ConstRefCType, NamedCType, CType, - MutRefCType, ArrayCType, ListCType, VectorCType, ArrayRefCType, - OptionalCType, TupleCType, SpecialArgName, boolT, scalarT, - tensorListT, dimnameListT, tensorT, voidT, longT, - BaseTypeToCppMapping, intArrayRefT, tensorOptionsT) -from tools.codegen import local -from tools.codegen.utils import assert_never -from typing import Optional, Sequence, Union, List, Set - -# This file describes the translation of JIT schema to the public C++ -# API, which is what people use when they call functions like at::add. -# -# Prominent characteristics of the C++ API: -# -# - dtype, layout, device and pin_memory are collected into -# a single C++ type TensorOptions (the native functions API -# also has this, but tensor options is really most relevant -# for the C++ API; it makes calling kwarg factory functions -# pleasant) -# -# - defaulting lives here (in fact, the dispatcher is completely -# oblivious of defaults!) -# -# BTW: policy on name collisions: we try not to have types with -# collisions, but functions are fair game to collide - -def name(func: FunctionSchema, *, faithful_name_for_out_overloads: bool = False) -> str: - name = str(func.name.name) - if func.is_out_fn(): - if faithful_name_for_out_overloads: - name += '_outf' - else: - name += '_out' - - return name - -# Translation of "value types" in JIT schema to C++ API type. Value -# types look the same no matter if they are argument types or return -# types. Returns None if the type in question is not a value type. -def valuetype_type(t: Type, *, binds: ArgName, remove_non_owning_ref_types: bool = False) -> Optional[NamedCType]: - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor or t.name == BaseTy.Scalar: - return None - if remove_non_owning_ref_types: - if t.name == BaseTy.str: - raise AssertionError("string ref->value conversion: not implemented yet") - # All other BaseType currently map directly to BaseCppTypes. - return NamedCType(binds, BaseCType(BaseTypeToCppMapping[t.name])) - elif isinstance(t, OptionalType): - elem = valuetype_type(t.elem, binds=binds) - if elem is None: - return None - return NamedCType(binds, OptionalCType(elem.type)) - elif isinstance(t, ListType): - if str(t.elem) == 'bool': - assert t.size is not None - return NamedCType(binds, ArrayCType(BaseCType(boolT), t.size)) - else: - return None - else: - raise AssertionError(f"unrecognized type {repr(t)}") - -# Translation of types occuring in JIT arguments to a C++ argument type. -# If remove_non_owning_ref_types is set, we'll guarantee that the outputed CType is not a non-owning reference type. -# For example, we'll return std::vector instead of IntArrayRef. 
-# See Note [translation from C++ reference to value types] -def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName, remove_non_owning_ref_types: bool = False) -> NamedCType: - # If it's a value type, do the value type translation - r = valuetype_type(t, binds=binds, remove_non_owning_ref_types=remove_non_owning_ref_types) - if r is not None: - return r - - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor: - if mutable and not local.use_const_ref_for_mutable_tensors(): - return NamedCType(binds, MutRefCType(BaseCType(tensorT))) - else: - return NamedCType(binds, ConstRefCType(BaseCType(tensorT))) - elif t.name == BaseTy.Scalar: - return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) - else: - raise AssertionError(f"base type should have been value type {t}") - elif isinstance(t, OptionalType): - if str(t.elem) == 'Tensor': - if mutable and not local.use_const_ref_for_mutable_tensors(): - return NamedCType(binds, MutRefCType(BaseCType(tensorT))) # TODO: fix this discrepancy - else: - return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(tensorT)))) - elif str(t.elem) == 'Scalar': - return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT)))) - elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) - return NamedCType(binds, OptionalCType(elem.type)) - elif isinstance(t, ListType): - # TODO: remove these special cases, ArrayRef fallthrough works fine - if str(t.elem) == 'int': - if remove_non_owning_ref_types: - return NamedCType(binds, VectorCType(BaseCType(longT))) - else: - return NamedCType(binds, BaseCType(intArrayRefT)) - elif str(t.elem) == 'Tensor': - return NamedCType(binds, BaseCType(tensorListT)) - elif str(t.elem) == 'Scalar': - return NamedCType(binds, ArrayRefCType(BaseCType(scalarT))) - elif str(t.elem) == 'Dimname': - return NamedCType(binds, BaseCType(dimnameListT)) - elif str(t.elem) == 'Tensor?': - return NamedCType(binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))) - elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) - return NamedCType(binds, ArrayRefCType(elem.type)) - else: - raise AssertionError(f"unrecognized type {repr(t)}") - -# Translate a JIT argument into its C++ type -def argument_type(a: Argument, *, binds: ArgName) -> NamedCType: - return argumenttype_type(a.type, mutable=a.is_write, binds=binds) - -# Translation of a (non-multi) return type from JIT to C++ -# N.B: returntype_type returns a CType, not a NamedCType. -# This is mostly because of the mismatch between return types and return names. -# e.g. a function with a return type of 'void' has 0 return names, -# and a function with a return type of 'std::tuple' has >1 return name. -def returntype_type(t: Type, *, mutable: bool) -> CType: - # placeholder is ignored - r = valuetype_type(t, binds="__placeholder__") - if r is not None: - return r.type - - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor: - if mutable: - if local.use_const_ref_for_mutable_tensors(): - return ConstRefCType(BaseCType(tensorT)) - else: - return MutRefCType(BaseCType(tensorT)) - else: - # Note [Tensor Copy Returns] - # Currently, we use "Argument.is_write" to determine - # whether or not Tensor return types should be copies or references. - # If that ever changes, take a look at other locations of this note! 
- return BaseCType(tensorT) - elif t.name == BaseTy.Scalar: - return BaseCType(scalarT) - elif isinstance(t, ListType): - elem = returntype_type(t.elem, mutable=mutable) - assert t.size is None, f"fixed size list returns not supported: {t}" - return VectorCType(elem) - - raise AssertionError(f"unrecognized return type {t}") - -# Translation of a single return to its C++ type -def return_type(r: Return) -> CType: - return returntype_type(r.type, mutable=r.is_write) - -# Translation of a full (possibly multi) return from JIT to its C++ type -def returns_type(rs: Sequence[Return]) -> CType: - if len(rs) == 0: - return BaseCType(voidT) - elif len(rs) == 1: - return return_type(rs[0]) - else: - return TupleCType([return_type(r) for r in rs]) - -def return_names(f: NativeFunction, *, fallback_name: str = 'result') -> Sequence[str]: - returns: List[str] = [] - for i, r in enumerate(f.func.returns): - # If we have an inplace function, the return argument is - # implicitly named self. - # TODO: Consider incorporating this into the data model - if f.func.name.name.inplace: - assert i == 0, "illegal inplace function with multiple returns" - name = 'self' - # If we are out function, the name is the name of the - # corresponding output function (r.name will get recorded - # in field_name later.) - elif f.func.is_out_fn(): - name = f.func.arguments.out[i].name - # If the return argument is explicitly named... - elif r.name: - name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) - if name_conflict and not f.func.is_out_fn(): - name = f'{r.name}_return' - else: - name = r.name - # If there is no explicit name and no fallback name was passed in, we just name the output result, - # unless it's a multi-return, in which case it's result0, - # result1, etc (zero-indexed) - else: - name = fallback_name if len(f.func.returns) == 1 else f'{fallback_name}{i}' - returns.append(name) - return returns - -JIT_TO_CPP_DEFAULT = { - 'False': 'false', - 'True': 'true', - 'None': 'c10::nullopt', # UGH this one is type directed - 'Mean': 'at::Reduction::Mean', - '[]': '{}', - 'contiguous_format': 'MemoryFormat::Contiguous', - 'long': 'at::kLong', -} - -# Convert a JIT default into C++ expression representing the default -def default_expr(d: str, t: Type) -> str: - if d == 'None' and str(t) == 'Tensor?': - return '{}' - if isinstance(t, BaseType) and t.name is BaseTy.str: - # Schema allows single quotes but C++ needs double - if len(d) >= 2 and d[0] == "'" and d[-1] == "'": - s = '' - i = 1 - while i + 1 < len(d): - if d[i] != '\\': - if d[i] == '"': - s += '\\"' - else: - s += d[i] - i += 1 - else: - if d[i + 1] == "'": - s += "'" - else: - s += d[i:i + 2] - i += 2 - - return f'"{s}"' - - if isinstance(t, OptionalType): - if d == 'None': - return 'c10::nullopt' - - return default_expr(d, t.elem) - - if isinstance(t, ListType): - if (d.startswith('[') and d.endswith(']')): - return '{' + d[1:-1] + '}' - elif t.size is None: - # NOTE: Sized lists can have scalar defaults - raise ValueError(f"Expected a list default '[...]' but found: '{d}'") - - return JIT_TO_CPP_DEFAULT.get(d, d) - -# Convert an argument into its C++ API form - -def argument( - a: Union[Argument, TensorOptionsArguments, SelfArgument], - *, cpp_no_default_args: Set[str], method: bool, faithful: bool, - has_tensor_options: bool -) -> List[Binding]: - def sub_argument(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> List[Binding]: - return argument( - a, cpp_no_default_args=cpp_no_default_args, method=method, 
faithful=faithful, - has_tensor_options=has_tensor_options) - - if isinstance(a, Argument): - binds: ArgName - if a.name == "memory_format" and has_tensor_options: - binds = SpecialArgName.possibly_redundant_memory_format - else: - binds = a.name - default: Optional[str] = None - if a.name not in cpp_no_default_args and a.default is not None: - default = default_expr(a.default, a.type) - return [Binding( - nctype=argument_type(a, binds=binds), - name=a.name, - default=default, - argument=a, - )] - elif isinstance(a, TensorOptionsArguments): - if faithful: - return sub_argument(a.dtype) + sub_argument(a.layout) + \ - sub_argument(a.device) + sub_argument(a.pin_memory) - else: - default = None - # Enforced by NativeFunction.__post_init__ - assert 'options' not in cpp_no_default_args - if all(x.default == "None" for x in a.all()): - default = '{}' - elif a.dtype.default == "long": - default = 'at::kLong' # TODO: this is wrong - return [Binding( - nctype=NamedCType('options', BaseCType(tensorOptionsT)), - name='options', - default=default, - argument=a, - )] - elif isinstance(a, SelfArgument): - if method: - # Caller is responsible for installing implicit this in context! - return [] - else: - return sub_argument(a.argument) - else: - assert_never(a) - -def arguments( - arguments: Arguments, - *, faithful: bool, method: bool, cpp_no_default_args: Set[str] -) -> List[Binding]: - args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] - if faithful: - args.extend(arguments.non_out) - args.extend(arguments.out) - else: - args.extend(arguments.out) - args.extend(arguments.non_out) - return [ - r.no_default() if faithful else r for a in args - for r in argument( - a, faithful=faithful, method=method, - has_tensor_options=arguments.tensor_options is not None, - cpp_no_default_args=cpp_no_default_args) - ] diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py deleted file mode 100644 index 6738fbef5b49..000000000000 --- a/tools/codegen/api/dispatcher.py +++ /dev/null @@ -1,66 +0,0 @@ -from tools.codegen.model import (Argument, FunctionSchema, Return, - SelfArgument, TensorOptionsArguments, Type) - -from tools.codegen.api.types import ArgName, Binding, NamedCType, CType -from tools.codegen.api import cpp -from tools.codegen.utils import concatMap, assert_never - -import itertools -from typing import Sequence, List, Union - -# This file describes the translation of JIT schema to the dispatcher -# API, the *unboxed* calling convention by which invocations through -# the dispatcher are made. Historically, the dispatcher API matched -# the C++ API, but with the establishment of the boxed API, we've -# made changes to the dispatcher API to so that the unboxed API -# better aligns with the boxed API. The dispatcher API hooks heavily -# into our template based boxing/unboxing machinery, so changes -# to this convention will usually need template updates too. -# -# Prominent characteristics of the dispatcher API: -# -# - dtype, layout, device and pin_memory are represented as separate -# arguments. -# - -def name(func: FunctionSchema) -> str: - return cpp.name(func) - -def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName, remove_non_owning_ref_types: bool = False) -> NamedCType: - # This is a faux amis. If it makes sense in the future to add - # more special cases here, or invert things so cpp.argument_type - # calls this, or just completely inline the function, please do - # it. 
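Stepping back to the cpp.py hunk above: `default_expr` maps JIT schema defaults to C++ literal expressions via `JIT_TO_CPP_DEFAULT` plus a few type-directed cases. The reduced sketch below keeps only the table-driven part (it skips the string re-quoting and list expansion the full function performs); the sample defaults fed to it are hypothetical.

```python
# Reduced sketch of the table-driven part of cpp.default_expr; the real
# function also re-quotes string defaults and expands list defaults.
JIT_TO_CPP_DEFAULT_SKETCH = {
    "False": "false",
    "True": "true",
    "None": "c10::nullopt",
    "Mean": "at::Reduction::Mean",
    "[]": "{}",
    "contiguous_format": "MemoryFormat::Contiguous",
    "long": "at::kLong",
}

def default_expr_sketch(jit_default: str) -> str:
    # Fall through to the literal itself (e.g. "1", "1e-5") when untabled.
    return JIT_TO_CPP_DEFAULT_SKETCH.get(jit_default, jit_default)

for d in ("None", "[]", "1e-5"):  # hypothetical schema defaults
    print(d, "->", default_expr_sketch(d))
# None -> c10::nullopt
# [] -> {}
# 1e-5 -> 1e-5
```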
- return cpp.argumenttype_type(t, mutable=mutable, binds=binds, remove_non_owning_ref_types=remove_non_owning_ref_types) - -def argument_type(a: Argument, *, binds: ArgName, remove_non_owning_ref_types: bool = False) -> NamedCType: - return argumenttype_type(a.type, mutable=a.is_write, binds=binds, remove_non_owning_ref_types=remove_non_owning_ref_types) - -def returns_type(rs: Sequence[Return]) -> CType: - # At present, there is no difference. But there could be! - return cpp.returns_type(rs) - -def jit_arguments(func: FunctionSchema) -> List[Argument]: - def to_argument(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> List[Argument]: - if isinstance(a, Argument): - return [a] - elif isinstance(a, SelfArgument): - return [a.argument] - elif isinstance(a, TensorOptionsArguments): - return [a.dtype, a.layout, a.device, a.pin_memory] - else: - assert_never(a) - return list(concatMap(to_argument, itertools.chain( - func.arguments.positional, - func.arguments.kwarg_only, - func.arguments.out))) - -def argument(a: Argument, *, remove_non_owning_ref_types: bool = False) -> Binding: - return Binding( - nctype=argument_type(a, binds=a.name, remove_non_owning_ref_types=remove_non_owning_ref_types), - name=a.name, - argument=a - ) - -def arguments(func: FunctionSchema) -> List[Binding]: - return [argument(a) for a in jit_arguments(func)] diff --git a/tools/codegen/api/functionalization.py b/tools/codegen/api/functionalization.py deleted file mode 100644 index ebd30ab94c9d..000000000000 --- a/tools/codegen/api/functionalization.py +++ /dev/null @@ -1,114 +0,0 @@ -from tools.codegen.model import ( - FunctionSchema, BaseTy, BaseType, NativeFunction, Argument, Tag, -) -from tools.codegen.api.types import ( - Binding, NamedCType, ConstRefCType, BaseCType, CType, tensorT, longT -) -from tools.codegen.api import dispatcher -from typing import List, Optional - - -# This file describes the translation of JIT schema to API's used -# when creating view lambdas that are used by the functionalization pass. -# There are two types of lambdas: forward lambdas and reverse lambdas. -# These API's mostly follow the dispatcher API, with a few quirks: -# - The lambda capture has to convert reference types to value types -# - While the forward lambda just directly calls into the at::_ops API -# (following the dispatcher convention), the logic here for the reverse lambda -# is responsible for generating both the call-site, and the declarations -# (which are implemented manually in the at::functionalization::impl namespace). - -# The lambdas generated for each view op in the functionalization pass are of the form -# [capture_arguments](outer_arguments) -> returns_type { -# return name(inner_arguments); -# } - -# Define some specific lambda input arguments. -base_binding = Binding( - name='base', - nctype=NamedCType(name='base', type=ConstRefCType(BaseCType(tensorT))), - argument=Argument(name='base', type=BaseType(BaseTy.Tensor), default=None, annotation=None), - default=None) -mutated_view_binding = Binding( - name='mutated_view', - nctype=NamedCType(name='mutated_view', type=ConstRefCType(BaseCType(tensorT))), - argument=Argument(name='base', type=BaseType(BaseTy.Tensor), default=None, annotation=None), - default=None) -mutated_view_idx_binding = Binding( - name='mutated_view_idx', - nctype=NamedCType(name='mutated_view_idx', type=BaseCType(longT)), - argument=Argument(name='base', type=BaseType(BaseTy.Tensor), default=None, annotation=None), - default=None) - -# The lambda capture itself doesn't have a name. 
-# The name returned here corresponds to the name of the inner function called by the lambda. -def name(f: NativeFunction, *, functional_op: NativeFunction, is_reverse: bool, include_namespace: bool) -> str: - # For inplace_view ops, the lambda calls out to the corresponding functional view op - fn = functional_op if f.tag is Tag.inplace_view else f - name = fn.func.name.unambiguous_name() - if is_reverse: - # in the reverse case, we codegen both the call-sites (which need the full namespace) and the declarations (which don't) - if include_namespace: - return f'at::functionalization::FunctionalInverses::{name}_inverse' - else: - return f'{name}_inverse' - # in the forward case, we just diretly call into the at::_ops API (so we always need the namespace) - assert include_namespace - return f'at::_ops::{name}::call' - - -def capture_arguments(func: FunctionSchema, *, is_reverse: bool) -> List[Binding]: - # capture arguments include all arguments except `self`. - # Importantly, they don't include any C++ reference types (or else we'll get a dangling reference in the capture), - # So any reference types (IntArrayRef) need to be converted to value types (vector) - args = func.arguments.flat_all - assert args[0].type == BaseType(BaseTy.Tensor) - non_self_args = args[1:] - non_self_value_bindings = [dispatcher.argument(a, remove_non_owning_ref_types=True) for a in non_self_args] - return non_self_value_bindings - - -def returns_type(func: FunctionSchema) -> CType: - # Assertion: all view ops return tensor-like outputs - assert len(func.returns) >= 1 - for ret in func.returns: - assert ret.type.is_tensor_like() - # However, the return type of the lambda is always an individual tensor. - # For multi-tensor outputs, each tensor needs to be tracked individually. - return BaseCType(tensorT) - - -def outer_arguments(*, is_reverse: bool) -> List[Binding]: - if is_reverse: - return [base_binding, mutated_view_binding, mutated_view_idx_binding] - else: - return [base_binding, mutated_view_idx_binding] - - -def inner_call_index(func: FunctionSchema) -> Optional[Binding]: - # For view ops that return multiple tensors (like `split`), we generate a separate lambda for each output. - # When we replay a view op that returns multiple tensors, we need to index into the output appropriately - if len(func.returns) > 1 or (len(func.returns) == 1 and func.returns[0].type.is_list_like()): - return mutated_view_idx_binding - return None - - -def inner_arguments(func: FunctionSchema, is_reverse: bool) -> List[Binding]: - args = func.arguments.flat_all - assert args[0].type == BaseType(BaseTy.Tensor) - non_self_args = args[1:] - # The forward lambda calls the at::_ops API, while the reverse lambda calls the view inverse API. - # Both of these follow the dispatcher API. - non_self_bindings = [dispatcher.argument(a) for a in non_self_args] - if not is_reverse: - # the forward lambda swaps out the original tensor argument with the lambd arg "base" - return [base_binding] + non_self_bindings - else: - # the reverse lambda does the same, but with an additional "mutated_view" arg - # additionally, we have a calling convention: for view ops that return multiple tensor outputs - # their corresponding view_inverse function takes in an additional index argument. 
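A minimal, self-contained sketch of the argument-ordering convention the functionalization view lambdas described above follow. It does not use the real Binding/FunctionSchema classes; plain strings stand in for bindings, and the helper names (assemble_lambda_args, returns_multiple_tensors) are hypothetical, chosen only for illustration.

from typing import List

def assemble_lambda_args(non_self_args: List[str],
                         is_reverse: bool,
                         returns_multiple_tensors: bool) -> List[str]:
    if not is_reverse:
        # Forward lambda: the original `self` tensor is swapped for `base`.
        return ['base'] + non_self_args
    args = ['base', 'mutated_view']
    if returns_multiple_tensors:
        # Multi-output view ops (e.g. split-like ops) also take an index arg.
        args.append('mutated_view_idx')
    return args + non_self_args

# e.g. a hypothetical split-like op whose non-self args are (split_size, dim):
print(assemble_lambda_args(['split_size', 'dim'], is_reverse=False, returns_multiple_tensors=True))
# ['base', 'split_size', 'dim']
print(assemble_lambda_args(['split_size', 'dim'], is_reverse=True, returns_multiple_tensors=True))
# ['base', 'mutated_view', 'mutated_view_idx', 'split_size', 'dim']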
- index_binding = inner_call_index(func) - if index_binding is not None: - return [base_binding, mutated_view_binding, index_binding] + non_self_bindings - else: - return [base_binding, mutated_view_binding] + non_self_bindings diff --git a/tools/codegen/api/lazy.py b/tools/codegen/api/lazy.py deleted file mode 100644 index 3fe83936eef8..000000000000 --- a/tools/codegen/api/lazy.py +++ /dev/null @@ -1,172 +0,0 @@ -from typing import List, Union, Tuple -from tools.codegen.model import (Type, BaseTy, BaseType, OptionalType, - ListType, OperatorName, FunctionSchema, - Return) -from tools.codegen.api.types import (BaseCppType, BaseCType, OptionalCType, - ConstRefCType, NamedCType, - MutRefCType, - VectorCType, boolT, longT, doubleT, ListCType, stringT, - scalarT, scalarTypeT, ArrayRefCType, ArrayCType, TupleCType) - -valueT = BaseCppType('torch::lazy', 'Value') - - -def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, ListCType]: - """ - This function takes a type from NativeFunctions and converts it for use with - lazy tensor codegen. Currently its output is used in several places, and so far - it has been possible for them to all use the same conversions, but that may not be - optimal or possible in the finished system. - - Type conversion for lazy currently consists of - (1) changing Tensor-like things into Value-like things - (2) wrapping everything in a BaseCType - (3) making reference types into values (e.g. vector instead of IntArrayRef) - - (1) converts Tensors to Values since Values are how Lazy IR represents tensors. There - is special handling for Optional[Tensor] or List[Tensor], etc- hence 'tensor-like' - - This is incomplete- there are assertions in places that it's expected to need to add - more types as the codegen is used with more operators. - """ - if isinstance(typ, BaseType): - if typ.name == BaseTy.Tensor: - return BaseCType(valueT) - elif typ.name == BaseTy.Scalar: - return BaseCType(scalarT) - elif typ.name == BaseTy.ScalarType: - return BaseCType(scalarTypeT) - elif typ.name == BaseTy.int: - return BaseCType(longT) - elif typ.name == BaseTy.bool: - return BaseCType(boolT) - elif typ.name == BaseTy.float: - return BaseCType(doubleT) - elif typ.name == BaseTy.str: - return BaseCType(stringT) - else: - raise AssertionError(f"TODO add support for type {repr(typ)}") - elif isinstance(typ, OptionalType): - return OptionalCType(process_ir_type(typ.elem)) - elif isinstance(typ, ListType): - if str(typ.elem) == 'Tensor?': - # TODO(whc) is this actually correct? or should it use a Vector like above - return ListCType(OptionalCType(BaseCType(valueT))) - else: - return VectorCType(process_ir_type(typ.elem)) - else: - raise AssertionError(f"unrecognized type {repr(typ)}") - - -def isValueType(typ: Union[Type, BaseCType, OptionalCType, ConstRefCType, MutRefCType, - ListCType, ArrayRefCType, ArrayCType, VectorCType, TupleCType]) -> bool: - """ - Given a type, determine if it is a Value-like type. This is equivalent to - being Tensor-like, but assumes the type has already been transformed. - """ - if isinstance(typ, BaseCType): - return typ.type == valueT - elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): - return isValueType(typ.elem) - else: - return False - -# Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. 
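A rough stand-alone sketch of the type translation that process_ir_type performs for lazy tensor codegen: tensor-like types become torch::lazy::Value, and reference/list types become owning values. It works on schema type strings rather than the real Type/CType objects, glosses over special cases such as Tensor?[], and the helper name lazy_type_str is an invented illustration, not part of the codegen API.

BASE_MAP = {
    'Tensor': 'torch::lazy::Value',   # Values are how Lazy IR represents tensors
    'Scalar': 'at::Scalar',
    'ScalarType': 'at::ScalarType',
    'int': 'int64_t',
    'bool': 'bool',
    'float': 'double',
    'str': 'c10::string_view',
}

def lazy_type_str(schema_type: str) -> str:
    if schema_type.endswith('?'):
        # Optionals wrap the translated element type.
        return f'c10::optional<{lazy_type_str(schema_type[:-1])}>'
    if schema_type.endswith('[]'):
        # Reference list types (e.g. IntArrayRef) become owning vectors.
        return f'std::vector<{lazy_type_str(schema_type[:-2])}>'
    return BASE_MAP[schema_type]

print(lazy_type_str('Tensor'))   # torch::lazy::Value
print(lazy_type_str('int[]'))    # std::vector<int64_t>
print(lazy_type_str('Scalar?'))  # c10::optional<at::Scalar>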
-# Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), -# but carries type information from a native FunctionSchema modified for use with IR nodes, -# and preserving original argument names. - - -class LazyIrSchema: - # The name of the operator this function schema describes. - name: 'OperatorName' - - positional_arg_types: Tuple[NamedCType, ...] - keyword_arg_types: Tuple[NamedCType, ...] - - # TODO: Need to handle collisions with argument names at some point - returns: Tuple['Return', ...] - - def __init__(self, func: FunctionSchema): - - positional_arg_types = [] - for arg_field in ["pre_self_positional", - "self_arg", - "post_self_positional"]: - if arg_field == "self_arg" and func.arguments.self_arg is not None: - arg = getattr(func.arguments, "self_arg").argument - positional_arg_types.append(NamedCType(arg.name, process_ir_type(arg.type))) - elif getattr(func.arguments, arg_field) is not None: - positional_arg_types.extend([ - NamedCType( - arg.name, - process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) - self.positional_arg_types = tuple(positional_arg_types) - - keyword_arg_types = [] - for arg_field in ["pre_tensor_options_kwarg_only", - "tensor_options", - "post_tensor_options_kwarg_only", - "out"]: - if getattr(func.arguments, arg_field) is not None: - keyword_arg_types.extend([ - NamedCType( - arg.name, - process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) - self.keyword_arg_types = tuple(keyword_arg_types) - self.name = func.name - self.returns = func.returns - - @property - def node_name(self) -> str: - """ - Return camel-case version of op in node. - - Note: This function also appends any `overload_name` in the operation. - For example, if the op is `bitwise_and.Tensor`, the returned name - will be `BitwiseAndTensor`. 
- """ - op_name = f"{self.name.name}_{self.name.overload_name}".lower() - return "".join(word.capitalize() or "" for word in op_name.split("_")) - - @property - def aten_name(self) -> str: - return f"{self.name.name}" - - @property - def base_name(self) -> str: - return f"{self.name.name.base}" - - def filtered_types(self, positional: bool = True, keyword: bool = True, - values: bool = True, scalars: bool = True) -> List[NamedCType]: - types: List[NamedCType] = [] - if positional: - types.extend(self.positional_arg_types) - if keyword: - types.extend(self.keyword_arg_types) - - if values and scalars: - return types - - if values: - return [t for t in types if isValueType(t.type)] - elif scalars: - return [t for t in types if not isValueType(t.type)] - - return [] - - @property - def positional_values(self) -> List[NamedCType]: - return self.filtered_types(positional=True, keyword=False, values=True, scalars=False) - - @property - def positional_scalars(self) -> List[NamedCType]: - return self.filtered_types(positional=True, keyword=False, values=False, scalars=True) - - @property - def keyword_values(self) -> List[NamedCType]: - return self.filtered_types(positional=False, keyword=True, values=True, scalars=False) - - @property - def keyword_scalars(self) -> List[NamedCType]: - return self.filtered_types(positional=False, keyword=True, values=False, scalars=True) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py deleted file mode 100644 index d072f20d4270..000000000000 --- a/tools/codegen/api/native.py +++ /dev/null @@ -1,111 +0,0 @@ -from tools.codegen.model import (Argument, FunctionSchema, Return, - SelfArgument, TensorOptionsArguments, Type) - -from tools.codegen.api.types import (ArgName, BaseCType, Binding, - ConstRefCType, NamedCType, CType, MutRefCType, ListCType, - OptionalCType, tensorT, scalarT, layoutT, - deviceT, boolT, scalarTypeT) -from tools.codegen.api import cpp -from tools.codegen import local -from tools.codegen.utils import assert_never - -from typing import Union, Sequence, List, Optional - -# This file describes the translation of JIT schema to the native functions API. -# This looks a lot like the C++ API (which makes historical sense, because the -# idea was you wrote native functions to implement functions in the C++ API), -# but over time we have evolved the C++ API without actually changing our -# native:: kernels. The intention is to make native API and dispatcher API -# line up as closely as possible, since this results in the least overhead -# (no translation is needed from dispatcher API to native API). - -def name(func: FunctionSchema) -> str: - name = str(func.name.name) - # TODO: delete this! 
- if func.is_out_fn(): - name += '_out' - if func.name.overload_name: - name += f'_{func.name.overload_name}' - return name - -def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> NamedCType: - if str(t) == 'Tensor?': - tensor_type: OptionalCType = OptionalCType(BaseCType(tensorT)) - if mutable and not local.use_const_ref_for_mutable_tensors(): - return NamedCType(binds, MutRefCType(tensor_type)) - else: - return NamedCType(binds, ConstRefCType(tensor_type)) - elif str(t) == 'Tensor?[]': - return NamedCType(binds, ConstRefCType(ListCType(OptionalCType(BaseCType(tensorT))))) - elif str(t) == 'Scalar': - return NamedCType(binds, ConstRefCType(BaseCType(scalarT))) - elif str(t) == 'Scalar?': - return NamedCType(binds, ConstRefCType(OptionalCType(BaseCType(scalarT)))) - return cpp.argumenttype_type(t, mutable=mutable, binds=binds) - -def returns_type(rs: Sequence[Return]) -> CType: - return cpp.returns_type(rs) - -def argument_type(a: Argument, *, binds: ArgName) -> NamedCType: - return argumenttype_type(a.type, mutable=a.is_write, binds=binds) - -def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out: bool) -> List[Binding]: - # Ideally, we NEVER default native functions. However, there are a number - # of functions that call native:: directly and rely on the defaulting - # existing. So for BC, we generate defaults for non-out variants (but not - # for out variants, where it is impossible to generate an appropriate - # default) - should_default = not is_out - if isinstance(a, Argument): - default: Optional[str] = None - if should_default and a.default is not None: - default = cpp.default_expr(a.default, a.type) - return [Binding( - nctype=argument_type(a, binds=a.name), - name=a.name, - default=default, - argument=a, - )] - elif isinstance(a, SelfArgument): - # Erase SelfArgument from the distinction - return argument(a.argument, is_out=is_out) - elif isinstance(a, TensorOptionsArguments): - default = None - if should_default: - default = '{}' - # TODO: Not sure why the arguments assigned here are for - # TensorOptionsArguments and not the constituent pieces. 
It seems - # to matter - return [ - Binding( - nctype=NamedCType('dtype', OptionalCType(BaseCType(scalarTypeT))), - name='dtype', - default=default, - argument=a, - ), - Binding( - nctype=NamedCType('layout', OptionalCType(BaseCType(layoutT))), - name='layout', - default=default, - argument=a, - ), - Binding( - nctype=NamedCType('device', OptionalCType(BaseCType(deviceT))), - name='device', - default=default, - argument=a, - ), - Binding( - nctype=NamedCType('pin_memory', OptionalCType(BaseCType(boolT))), - name='pin_memory', - default=default, - argument=a, - )] - else: - assert_never(a) - -def arguments(func: FunctionSchema) -> List[Binding]: - args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] - args.extend(func.arguments.non_out) - args.extend(func.arguments.out) - return [r for arg in args for r in argument(arg, is_out=func.is_out_fn())] diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py deleted file mode 100644 index 6c362cb87387..000000000000 --- a/tools/codegen/api/python.py +++ /dev/null @@ -1,1205 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, Union, Sequence, Set, List, Dict, Tuple - -from tools.codegen.api.types import Binding, CppSignature, CppSignatureGroup -from tools.codegen.api import cpp -from tools.codegen.gen import pythonify_default -from tools.codegen.model import (Argument, BaseTy, BaseType, ListType, - NativeFunction, OptionalType, Return, Type, - Variant) - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Data Models -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# [Notes] python binding codegen -# -# The Python binding codegen produces code that takes the input list of -# PyObjects, finds the matching ATen C++ function using PythonArgParser, -# converts the PyObjects into C++ types and calls the ATen C++ function: -# -# +--------+ parsing +------------------------+ binding +-----------------------+ -# | PyObjs | ---------> | PythonArgParser Output | ---------> | Cpp Function Dispatch | -# +--------+ +------------------------+ +-----------------------+ -# -# The following examples demonstrate the data models the Python binding -# codegen needs to deal with and the tasks it needs to accomplish. It -# helps understand the purpose of the new data types we introduced below. -# -# - Function Schema (source of truth) -# -# aten::empty.names(int[] size, *, Dimname[]? names, -# ScalarType? dtype=None, Layout? layout=None, -# Device? device=None, bool? pin_memory=None, -# MemoryFormat? memory_format=None) -> Tensor -# -# - Python Signature -# -# It's used to generate input schema string for PythonArgParser. -# Note: TensorOptions fields are reordered and the additional -# 'requires_grad' field is added: -# -# empty(IntArrayRef size, *, DimnameList? names, -# MemoryFormat? memory_format=None, ScalarType dtype=None, -# Layout layout=torch.strided, Device device=None, -# bool pin_memory=False, bool requires_grad=False) -# -# - C++ Signature -# -# It's used to generate C++ lambda formals & dispatch call. -# Note: the scattered TensorOptions fields are packed into 'options'. 
-# -# auto dispatch_empty = -# [](IntArrayRef size, c10::optional names, -# const TensorOptions & options, -# c10::optional memory_format) -> Tensor { -# pybind11::gil_scoped_release no_gil; -# return torch::empty(size, names, options, memory_format); -# }; -# -# - Binding between Python Arguments and C++ Arguments -# -# Given a set of Python Arguments in scope, we need produce the -# binding expressions that translate the Python API into C++ API: -# -# Python Args Cpp Args Binding Exprs -# ----------------------------------------------------------------- -# 0: size size '_r.intlist(0)' -# 1: names names 'names' [special init] -# 2: memory_format -------+ -# 3: dtype -----+-|--> options 'options' [special packing] -# 4: layout / | -# 5: device / +--> memory_format '_r.memoryformatOptional(2)' -# 6: pin_memory / -# 7: requires_grad -+ -# -# So the full dispatch expression would look like: -# -# dispatch_empty(_r.intlist(0), names, options, -# _r.memoryformatOptional(2)) -# -# Where does 'names' come from? It involves special local init: -# -# auto __names = _r.toDimnameListOptional(1); -# c10::optional names = -# __names ? c10::make_optional(DimnameList(__names.value())) -# : c10::nullopt; -# -# Where does 'options' come from? It involves special local init -# for TensorOptions. Note that Python side has the additional -# 'requires_grad' field: -# -# const auto options = TensorOptions() -# .dtype(_r.scalartype(3)) -# .device(_r.device(5)) -# .layout(_r.layoutOptional(4)) -# .requires_grad(_r.toBool(7)) -# .pinned_memory(_r.toBool(6)); -# -# In some other cases one Python Argument can map to multiple C++ -# Arguments. For example: -# -# aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -# -> (Tensor values, Tensor indices) -# -# Python Args Cpp Args Binding Exprs -# --------------------------------------------------------------------- -# +----> max 'out[0]' -# /-----> max_values 'out[1] -# 0: input / self '_r.tensor(0)' -# 1: dim / dim '_r.dimname(1)' -# 2: keepdim / keepdim '_r.toBool(2)' -# 3: out -----+ [local init] out '_r.tensorlist_n<2>(3)' -# -# As demonstrated above, the binding can involve reordering, -# packing, unpacking and special local inits. -# -# -# Let's look at a concrete example: -# -# static PythonArgParser parser({ -# "abs(Tensor input, *, Tensor out=None)", -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- Python Schema, represented by PythonSignature and PythonArgument -# -# }, /*traceable=*/true); -# -# ParsedArgs<2> parsed_args; -# auto _r = parser.parse(nullptr, args, kwargs, parsed_args); -# -# ... 
-# -# if (_r.isNone(1)) { -# ~~~~~~~~~~~~ <--- Scattered PythonArgParser output (arg name = 'out') -# represented by PythonArgParserOutputExpr -# -# // aten::abs(Tensor self) -> Tensor -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- NativeFunction schema, base version -# -# auto dispatch_abs = [](const Tensor & self) -> Tensor { -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- dispatch_lambda_args / dispatch_lambda_return_str -# generated from NativeFunction / CppSignature -# (deprecated PythonSignature is special) -# arguments are represented by DispatchLambdaArgument -# -# pybind11::gil_scoped_release no_gil; -# return self.abs(); -# ~~~~~~~~~~~ <--- cpp_dispatch_target / cpp_dispatch_exprs -# generated from NativeFunction / CppSignature -# }; -# return wrap(dispatch_abs(_r.tensor(0))); -# ~~~~~~~~~~~~~ -# ^ -# +--- dispatch_lambda_exprs -# binding PythonArgParserOutputExpr (python args) -# and DispatchLambdaArgument (c++ args) -# -# } else { -# // aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ^ -# +--- NativeFunction schema, out-variant -# -# auto dispatch_abs_out = [](Tensor out, const Tensor & self) -> Tensor { -# pybind11::gil_scoped_release no_gil; -# return at::abs_out(out, self); -# }; -# return wrap(dispatch_abs_out(_r.tensor(1), _r.tensor(0))); -# } -# -# -# [Notes] python interface codegen -# The python dataclasses below are used used to generate both python binding code -# and pyi type hint signatures. -# In theory these two should look very similar, but there are number of differences -# in how pyi signatures vs. python_arg_parser signatures are generated. -# These differences have been encapsulated in signature_str() vs. signature_str_pyi() -# to display the full signatures, and argument_str() vs argument_str_pyi() to display arguments. -# For examples, only pyi signatures include return types. - -@dataclass(frozen=True) -class PythonReturns: - returns: Tuple[Return, ...] - - def named_tuple_pyi(self) -> Optional[Tuple[str, str]]: - python_returns = [argument_type_str_pyi(r.type) for r in self.returns] - field_names = namedtuple_fieldnames(self.returns) - if field_names: - namedtuple_name = '_'.join(['namedtuple'] + field_names) - tuple_args = [f'("{name}", {typ})' for name, typ in zip(field_names, python_returns)] - namedtuple_def = f'NamedTuple("{namedtuple_name}", [{", ".join(tuple_args)}])' - return namedtuple_name, namedtuple_def - return None - - def returns_str_pyi(self) -> str: - named_tuple = self.named_tuple_pyi() - if named_tuple is not None: - namedtuple_name, _ = named_tuple - return namedtuple_name - - python_returns = [argument_type_str_pyi(r.type) for r in self.returns] - if len(python_returns) > 1: - return 'Tuple[' + ', '.join(python_returns) + ']' - if len(python_returns) == 1: - return python_returns[0] - return 'None' - - -@dataclass(frozen=True) -class PythonArgument: - name: str - type: Type - default: Optional[str] - - # Used to generate the default init expr for some PythonArgParser outputs, e.g.: - # - # _r.layoutWithDefault(3, layout_from_backend(self.options().backend()))) - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # ^ - # +--- default_init str - default_init: Optional[str] - - # Compute argument formal for python argument parsing. - # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. 
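A hedged sketch of the pyi return-type rendering that PythonReturns performs: multi-output ops with fully named returns become a "namedtuple_<fields>" type, otherwise a Tuple[...] or a single type is emitted. It uses plain (name, type) pairs instead of the real Return objects, and pyi_return_str is an illustrative stand-in, not the actual method.

from typing import List, Optional, Tuple

def pyi_return_str(returns: List[Tuple[Optional[str], str]]) -> str:
    """returns is a list of (field_name_or_None, pyi_type_string) pairs."""
    names = [n for n, _ in returns if n is not None]
    if len(returns) > 1 and len(names) == len(returns):
        # All fields named: render as a named tuple (e.g. torch.max(dim=...)).
        return '_'.join(['namedtuple'] + names)
    types = [t for _, t in returns]
    if len(types) > 1:
        return 'Tuple[' + ', '.join(types) + ']'
    return types[0] if types else 'None'

print(pyi_return_str([('values', 'Tensor'), ('indices', 'Tensor')]))  # namedtuple_values_indices
print(pyi_return_str([(None, 'Tensor')]))                             # Tensor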
- def argument_str(self, *, method: bool = False) -> str: - type_str = argument_type_str(self.type).replace('const ', '').replace(' &', '') - - name = self.name - # s/self/input/ outside method bindings - # [old codegen] TODO: remove this? doesn't rename in codegen, it's just - # for the parse string - if name == 'self' and type_str == 'Tensor' and not method: - name = 'input' - - # add default - if self.default is not None: - default = { - 'nullptr': 'None', - 'c10::nullopt': 'None', - '{}': 'None', - }.get(self.default, self.default) - return f'{type_str} {name}={default}' - else: - return f'{type_str} {name}' - - def argument_str_pyi(self, *, method: bool = False, deprecated: bool = False) -> str: - type_str = argument_type_str_pyi(self.type) - - name = self.name - # s/self/input/ outside method bindings - # [old codegen] TODO: remove this? doesn't rename in codegen, it's just - # for the parse string - if name == 'self' and type_str == 'Tensor' and not method and not deprecated: - name = 'input' - - if name == 'from': # from is a Python keyword... - name += '_' - - # pyi merges the _out and functional variants into the same signature, with an optional out arg - if name == 'out' and type_str == 'Tensor' and not deprecated: - type_str = 'Optional[' + type_str + ']' - - # pyi deprecated signatures don't get defaults for their out arg - treat_as_no_default = deprecated and isinstance(self, PythonOutArgument) and self.default == 'None' - - # add default - if self.default is not None and not treat_as_no_default: - if isinstance(self.type, ListType) and self.type.elem == BaseType(BaseTy.int) and \ - self.default.startswith('{') and self.default.endswith('}'): - default = '(' + self.default[1:-1] + ')' - else: - default = { - 'nullptr': 'None', - 'c10::nullopt': 'None', - '{}': 'None', - 'MemoryFormat::Contiguous': 'contiguous_format', - 'QScheme::PER_TENSOR_AFFINE': 'per_tensor_affine', - }.get(self.default, self.default) - return f'{name}: {type_str}={default}' - else: - return f'{name}: {type_str}' - -@dataclass(frozen=True) -class PythonOutArgument(PythonArgument): - # In Python signature multiple output fields are packed into one 'out' argument. - # When binding to C++, it's first binded to a local 'out' variable: - # 'auto out = _r.tensorlist_n<2>(2);', - # then binded to scattered C++ output arguments as 'out[0]', 'out[1]', and etc. - # TODO: maybe don't need keep scattered out fields for python signature? - outputs: Tuple[PythonArgument, ...] - - @staticmethod - def from_outputs(outputs: Tuple[PythonArgument, ...]) -> Optional['PythonOutArgument']: - if not outputs: - return None - - size = len(outputs) - if size == 1: - return PythonOutArgument( - name=outputs[0].name, - type=outputs[0].type, - default='None', - default_init=None, - outputs=outputs, - ) - elif size > 1: - if any(map(lambda a: not a.type.is_tensor_like(), outputs)): - raise RuntimeError(f'Unsupported output type: {outputs}') - return PythonOutArgument( - name='out', - # TODO: shouldn't this be OptionalType[ListType[...]], since it defaults to None? - type=ListType(BaseType(BaseTy.Tensor), size), - default='None', - default_init=None, - outputs=outputs, - ) - raise AssertionError(r'Unexpected PythonOutArgument size') - -@dataclass(frozen=True) -class PythonSignature: - # Base operator name, without inplace/outplace suffix. - name: str - - # Positional arguments. - # TODO: create a dedicated SelfArgument type for 'self'? - input_args: Tuple[PythonArgument, ...] 
- - # Keyword arguments excluding the 'out' argument and scattered kwargs belonging - # to TensorOptions (dtype, layout, device, pin_memory, requires_grad, etc). - input_kwargs: Tuple[PythonArgument, ...] - - output_args: Optional[PythonOutArgument] - - # Return types, which are only used by pyi - returns: PythonReturns - - # These are scattered kwargs arguments belonging to TensorOptions. - # When binding to C++, they are packed into a TensorOptions object 'options'. - # It's possible that the C++ signature doesn't take TensorOptions object (e.g. - # for out variant), in which case they will be used as scattered fields without - # being packed into 'options'. - # TODO: maybe create a PythonTensorOptionsArgument? - tensor_options_args: Tuple[PythonArgument, ...] - - # method or function signature? - method: bool - - @property - def deprecated(self) -> bool: - return False - - def arguments( - self, *, skip_outputs: bool = False, skip_tensor_options: bool = False - ) -> Tuple[Union[PythonArgument, PythonOutArgument], ...]: - result: List[Union[PythonArgument, PythonOutArgument]] = [] - result.extend(self.input_args) - result.extend(self.input_kwargs) - if self.output_args is not None and not skip_outputs: - result.append(self.output_args) - if not skip_tensor_options: - result.extend(self.tensor_options_args) - return tuple(result) - - def arguments_count(self) -> int: - return len(self.arguments()) - - def output_idx(self) -> int: - return len(self.input_args) + len(self.input_kwargs) - - # [old codegen] Compute the Python function signature for argument parsing, - # as specified in torch/csrc/utils/python_arg_parser.h. WARNING: - # this is NOT the same type signature as specified by PEP 484 - # as understood by mypy; our format was independently developed - # and has some quirks to make it more suitable specifically - # for error parsing. - # - # For a translation to mypy-valid type signatures, see - # signature_str_pyi(). - def signature_str(self, *, skip_outputs: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method), args)) - positional_argc = len(self.input_args) - if len(schema_formals) > positional_argc: - schema_formals.insert(positional_argc, '*') - - return f'{self.name}({", ".join(schema_formals)})' - - def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) - positional_argc = len(self.input_args) - if len(schema_formals) > positional_argc: - schema_formals.insert(positional_argc, '*') - - # only pyi signatures include returns - returns_str = self.returns.returns_str_pyi() - # pyi also includes self (with no typing/defaults) for methods - if self.method: - schema_formals.insert(0, "self") - return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - - def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]: - # only pyi uses vararg signatures - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) - # vararg only applies to pyi signatures. 
vararg variants are not generated for all signatures - num_args = self.arguments_count() - num_positionalargs = len(self.input_args) - - have_vararg_version = False - if num_args > 0: - vararg_type = args[0].type - if isinstance(vararg_type, ListType) and str(vararg_type.elem) == 'int' and num_positionalargs == 1: - have_vararg_version = True - - if not have_vararg_version: - return None - # Below are the major changes in vararg vs. regular pyi signatures - # vararg signatures also omit the asterix - schema_formals[0] = '*' + args[0].name + ': _int' - - returns_str = self.returns.returns_str_pyi() - # pyi also includes self (with no typing/defaults) for methods - if self.method: - schema_formals.insert(0, "self") - return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - -# The deprecated python signature involves some special logic, so create a -# dedicated data model to store these extra properties. -@dataclass(frozen=True) -class PythonSignatureDeprecated(PythonSignature): - # We need keep the order of arguments in deprecated signature. - # Particularly, method signature might have 'self' not at the beginning, e.g.: - # addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) - # When generating lambda function signature we need follow the exact order (even for method=True): - # [](Scalar beta, const Tensor & self, const Tensor & mat1, const Tensor & mat2) -> Tensor - deprecated_args_names: Tuple[str, ...] - - # The deprecated signature might miss some arguments that the corresponding - # C++ signature expects. We need store the constant default values to pass in. - # For example: - # [deprecate signature]: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) - # [func schema]: aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - # [func call]: self.addmm(mat1, mat2, beta, 1) - # We store ['self', 'mat1', 'mat2', 'beta', '1'] in this case. - deprecated_args_exprs: Tuple[str, ...] - - @property - def deprecated(self) -> bool: - return True - - def signature_str(self, *, skip_outputs: bool = False) -> str: - return PythonSignature.signature_str(self, skip_outputs=skip_outputs) + '|deprecated' - - def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs) - schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method, deprecated=True), args)) - positional_argc = len(self.input_args) - if len(schema_formals) > positional_argc: - schema_formals.insert(positional_argc, '*') - - returns_str = self.returns.returns_str_pyi() - return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - - def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]: - # the codegen doesn't include vararg variants for deprecated signatures - return None - -# This struct is used to hold the PythonSignature and its corresponding -# NativeFunction BEFORE grouping base and out-variant functions. -# Why not store NativeFunction in PythonSignature or construct PythonSignature -# from NativeFunction? Because they are not 1-1 mapped. -# One native function could have both deprecated and non-deprecated python -# signatures - NativeFunction doesn't contain information to construct the -# deprecated python signature. -# One python signature is used to handle both the base and the out-variant -# function - see 'PythonSignatureGroup'. 
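A small sketch of how the python_arg_parser signature string is assembled by signature_str(): positional formals come first, and keyword-only formals are introduced by a bare '*'. The helper name render_signature is illustrative only; the formals are assumed to already be rendered strings.

from typing import List

def render_signature(name: str, positional: List[str], kwarg_only: List[str]) -> str:
    formals = positional + kwarg_only
    if kwarg_only:
        # Keyword-only arguments are separated from positionals by a bare '*'.
        formals = positional + ['*'] + kwarg_only
    return f'{name}({", ".join(formals)})'

print(render_signature('abs', ['Tensor input'], ['Tensor out=None']))
# abs(Tensor input, *, Tensor out=None)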
-@dataclass(frozen=True) -class PythonSignatureNativeFunctionPair: - signature: PythonSignature - function: NativeFunction - -# We merge pairs of functions with signatures that are equivalent mod -# output arguments, and use a single entry in the python_arg_parser sig -# list for both (output arguments become optional). -@dataclass(frozen=True) -class PythonSignatureGroup: - # The signature used for Python argument parsing. The outplace signature - # is preferred if exists, because it can be used to parse inputs for both - # the out-place variant and the base version (with output omitted). - signature: PythonSignature - - # The regular ATen declaration (e.g. conv2d) - base: NativeFunction - - # The out variant (e.g. conv2d_out) - outplace: Optional[NativeFunction] - -# C++ function dispatch is wrapped in a lambda function. The lambda function -# has almost the same signature as the C++ function, only with some small -# variants - see details below. -# This data model is used to represent arguments of the lambda function -# signature. -@dataclass(frozen=True) -class DispatchLambdaArgument: - name: str - type_str: str - is_out_arg: bool - -# To pass PyObjects arguments to C++ function (via the lambda wrapper), -# we need first convert PyObjects into simple C++ objects. This work -# is done by PythonArgParser. -# This data model is used to represent the output of PythonArgParser. -# It has 1-1 mapping with PythonArgument in PythonSignature. -@dataclass(frozen=True) -class PythonArgParserOutputExpr: - # argument name - name: str - - # RHS expression to reference PythonArgParser output. - expr: str - - # In some special cases we need create different expr, e.g.: - # '_r.isNone(1)' instead of '_r.tensor(1)'. - index: int - - # The python argument it maps to. - argument: PythonArgument - - @property - def is_none_expr(self) -> str: - return f'_r.isNone({self.index})' - -# To pass PythonArgParser output to the lambda wrapper, we need bind -# PythonArgParserOutputExpr to DispatchLambdaArgument. -# They are not always 1-1 mapped, e.g. scattered TensorOptions fields -# need be packed into a TensorOptions object, which is the argument -# that the lambda function wrapper takes. -@dataclass(frozen=True) -class DispatchLambdaArgumentExprs: - # The exprs that provide the binding for lambda arguments, e.g.: - # - # 'self' -> '_r.tensor(0)' - # 'min' -> 'out[0]' / 'min_indices' -> 'out[1]' - # 'options' -> 'options' - # - # It has 1-1 mapping with DispatchLambdaArgument. - exprs: Sequence[str] - - # Special local inits, which might introduce new variables that - # the 'exprs' above reference, e.g.: - # - # 'auto out = _r.tensorlist_n<2>(2);' - # - inits: Sequence[str] - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Helper Functions -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: - return CppSignatureGroup.from_native_function(f, method=method).signature - -def has_tensor_options(f: NativeFunction) -> bool: - return f.func.arguments.tensor_options is not None - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Python Signature -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# 'simple_type' was introduced by the old codegen, which is slightly -# different from the python schema type, e.g.: doesn't have '?' suffix -# for optional Tensor/TensorList; doesn't have '[size]' suffix for list type. 
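To make the binding data flow concrete, here is an illustrative picture of what a DispatchLambdaArgumentExprs-style result holds for the empty(IntArrayRef size, *, DimnameList? names, ...) walkthrough in the notes above. The literal expression strings come from that walkthrough; the plain dict/list layout is a simplification of the real dataclasses, and the '<DimnameList>' template argument in the init is a best-guess reconstruction.

lambda_arg_exprs = {
    'size': '_r.intlist(0)',
    'names': 'names',               # provided by a special local init
    'options': 'options',           # packed TensorOptions, also via an init
    'memory_format': '_r.memoryformatOptional(2)',
}

inits = [
    'auto __names = _r.toDimnameListOptional(1);',
    'c10::optional<DimnameList> names = __names ? '
    'c10::make_optional(DimnameList(__names.value())) : c10::nullopt;',
]

# The dispatch call is then rendered from the expression values, e.g.:
print(f"dispatch_empty({', '.join(lambda_arg_exprs.values())})")
# dispatch_empty(_r.intlist(0), names, options, _r.memoryformatOptional(2))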
-def argument_type_str(t: Type, *, simple_type: bool = False) -> str: - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor: - return 'Tensor' - elif t.name == BaseTy.int: - return 'int64_t' - elif t.name == BaseTy.float: - return 'double' - elif t.name == BaseTy.str: - return 'c10::string_view' - elif t.name in [BaseTy.bool, BaseTy.QScheme, BaseTy.Scalar, - BaseTy.ScalarType, BaseTy.Generator, BaseTy.Storage, - BaseTy.Layout, BaseTy.Device, BaseTy.MemoryFormat, - BaseTy.Dimname, BaseTy.Stream, BaseTy.ConstQuantizerPtr]: - # These python schema type names line up with their function schema names - return t.name.name - - elif isinstance(t, OptionalType): - if str(t.elem) == 'Tensor': - # Is it desired to keep '?' for simple_type with new style dispatcher? - return 'Tensor?' - elem = argument_type_str(t.elem, simple_type=simple_type) - if elem == 'Layout': - # TODO: fix this special case in PythonArgParser? - return 'Layout' - else: - return f'{elem}?' - - elif isinstance(t, ListType): - size = t.size if not simple_type else None - if str(t.elem) == 'bool': - assert t.size is not None - return f'::std::array' - elif str(t.elem) == 'int': - return f'IntArrayRef[{size}]' if size is not None else 'IntArrayRef' - elif str(t.elem) == 'Tensor': - return f'TensorList[{size}]' if size is not None else 'TensorList' - elif str(t.elem) == 'Scalar': - return f'ScalarList[{size}]' if size is not None else 'ScalarList' - elif str(t.elem) == 'Tensor?': - if simple_type: - return 'c10::List>' - else: - return 'const c10::List> &' - elif str(t.elem) == 'Dimname': - return f'DimnameList[{size}]' if size is not None else 'DimnameList' - elem = argument_type_str(t.elem, simple_type=simple_type) - return f'ArrayRef<{elem}>' - - raise RuntimeError(f'unrecognized type {repr(t)}') - -def argument_type_size(t: Type) -> Optional[int]: - l = t.is_list_like() - if l is not None and str(l.elem) != 'bool': - return l.size - else: - return None - -def argument(a: Argument) -> PythonArgument: - return PythonArgument( - name=a.name, - type=a.type, - # TODO: directly translate a.default to python default - default=str(pythonify_default(cpp.default_expr(a.default, a.type))) - if a.default is not None else None, - default_init=None, - ) - -# Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen -def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> PythonSignature: - args: List[Argument] = [] - args.extend(f.func.arguments.pre_self_positional) - # Skip SelfArgument if this is method. - if not method and f.func.arguments.self_arg is not None: - args.append(f.func.arguments.self_arg.argument) - args.extend(f.func.arguments.post_self_positional) - args.extend(f.func.arguments.pre_tensor_options_kwarg_only) - # Skip TensorOptionsArguments. Python side TensorOptions - # arguments are created based on different rules - see below. - args.extend(f.func.arguments.post_tensor_options_kwarg_only) - args.extend(f.func.arguments.out) - - input_arg_set = set(a.name for a in f.func.arguments.flat_positional) - kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) - out_arg_set = set(a.name for a in f.func.arguments.out) - - input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args))) - input_kwargs = tuple(map(argument, filter(lambda a: a.name in kwarg_only_set, args))) - outputs = tuple(map(argument, filter(lambda a: a.name in out_arg_set, args))) - - # Reintroduce the scattered fields of TensorOptions for Python. 
- # Compared to the cpp counterpart, the python arguments have new property - # (default_init) and a new argument 'requires_grad', which require some - # special handlings. - # [old codegen] TODO: because these aren't guaranteed to be 100% faithful - # to the original versions in the yaml, this recreation is a potential - # source of drift between eager and JIT. Pull this logic out to a shared place. - - has_tensor_input_arg = any(a.type.is_tensor_like() for a in f.func.arguments.flat_non_out) - if any(a.name == 'requires_grad' for a in f.func.schema_order_arguments()): - raise ValueError('argument named requires_grad is reserved, should not explicitly add it in the schema') - - # [old codegen] this probably won't work if one of the returns is not a tensor, - # but it will produce a compile-time error that is obvious. - has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) - - name: str = cpp.name(f.func) - is_factory_function = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) - is_like_or_new_function = f.category_override in ('new', 'like') or name.startswith('new_') or name.endswith('_like') - - tensor_options_args: List[PythonArgument] = [] - if is_factory_function or is_like_or_new_function: - tensor_options_args.append(PythonArgument( - name='dtype', - type=BaseType(BaseTy.ScalarType), - default='None' if pyi else _dtype_default_type_hack(name), - default_init='self.scalar_type()' if is_like_or_new_function else None, - )) - tensor_options_args.append(PythonArgument( - name='layout', - type=OptionalType(BaseType(BaseTy.Layout)), - default='strided' if pyi else 'torch.strided', - default_init='self.layout()' if is_like_or_new_function else None, - )) - tensor_options_args.append(PythonArgument( - name='device', - type=BaseType(BaseTy.Device), - default='None', - default_init='self.device()' if is_like_or_new_function else None, - )) - tensor_options_args.append(PythonArgument( - name='pin_memory', - type=BaseType(BaseTy.bool), - default='False', - default_init=None, - )) - tensor_options_args.append(PythonArgument( - name='requires_grad', - type=BaseType(BaseTy.bool), - default='False', - default_init=None, - )) - - returns = PythonReturns(returns=f.func.returns) - - return PythonSignature( - name=str(f.func.name.name), - input_args=input_args, - input_kwargs=input_kwargs, - output_args=PythonOutArgument.from_outputs(outputs), - tensor_options_args=tuple(tensor_options_args), - returns=returns, - method=method, - ) - -# TODO blowtorch -# note: removing this will be BC-breaking. 
A quick test shows that -# randperm will otherwise default its dtype to torch.float64 -def _dtype_default_type_hack(name: str) -> str: - if name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices': - return 'torch.int64' - else: - return 'None' -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Python Interface -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def namedtuple_fieldnames(returns: Tuple[Return, ...]) -> List[str]: - if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): - return [] - else: - if any(map(lambda r: r.name is None, returns)): - # When building on Windows, `PyStructSequence_UnnamedField` could not be - # resolved by the linker for some reason, which cause error in building: - # - # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol - # PyStructSequence_UnnamedField - # - # Thus, at this point in time, we do not support unnamed - # fields in namedtuple; you must either name all fields, - # or none of them. - raise ValueError("Unnamed field is not supported by codegen") - - return list(map(lambda r: str(r.name), returns)) - -def argument_type_str_pyi(t: Type) -> str: - add_optional = False - if isinstance(t, OptionalType): - t = t.elem - add_optional = True - - if isinstance(t, BaseType): - if t.name == BaseTy.int: - ret = '_int' - elif t.name == BaseTy.float: - ret = '_float' - elif t.name == BaseTy.str: - ret = 'str' - elif t.name == BaseTy.Scalar: - ret = 'Number' - elif t.name == BaseTy.ScalarType: - ret = '_dtype' - elif t.name == BaseTy.bool: - ret = '_bool' - elif t.name == BaseTy.QScheme: - ret = '_qscheme' - elif t.name == BaseTy.Layout: - ret = '_layout' - elif t.name == BaseTy.Device: - ret = 'Union[_device, str, None]' - elif t.name == BaseTy.MemoryFormat: - ret = 'memory_format' - elif t.name == BaseTy.Dimname: - ret = 'Union[str, ellipsis, None]' - elif t.name in [BaseTy.Tensor, BaseTy.Generator, - BaseTy.Storage, BaseTy.Stream]: - # These python schema type names line up with their function schema names - ret = t.name.name - - elif isinstance(t, ListType): - if str(t.elem) == 'int': - ret = 'Union[_int, _size]' if t.size is not None else '_size' - elif t.is_tensor_like(): - # TODO: this doesn't seem right... - # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]] - # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]] - if isinstance(t.elem, OptionalType): - add_optional = True - ret = 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]' if t.size is not None else \ - 'Union[Tuple[Tensor, ...], List[Tensor]]' - elif str(t.elem) == 'float': - ret = 'Sequence[_float]' - else: - elem = argument_type_str_pyi(t.elem) - ret = f'Sequence[{elem}]' - - if add_optional: - ret = 'Optional[' + ret + ']' - return ret - - raise RuntimeError(f'unrecognized type {repr(t)}') - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# C++ Function Dispatch -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# This section provides APIs to generate the code that does C++ function -# dispatch. The C++ function call is wrapped by a lambda function. -# For example: -# -# // aten::selu_(Tensor(a!) self) -> Tensor(a!) 
-# auto dispatch_selu_ = [](Tensor self) -> Tensor { -# pybind11::gil_scoped_release no_gil; -# return at::selu_(self); -# }; -# -# The lambda function's signature follows the C++ signature in common -# cases, e.g.: -# -# // aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor -# [](const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor -# -# For out variant the 'out' argument's type is changed from 'Tensor &' -# to 'Tensor'. It's because when calling the lambda it passes in the -# PythonArgParser output '_r.tensor(3)', which is stack allocated object -# and needs to pass by value. Also see comments in 'dispatch_lambda_return_str()'. -# -# // aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) -# [](Tensor out, const Tensor & self, const Tensor & other, Scalar alpha) -> Tensor -# -# For multi-output case it can keep using reference type because the -# PythonArgParser output has been unpacked to local variables, e.g.: -# -# // aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, -# // Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) -# [](Tensor & max, Tensor & max_values, const Tensor & self, Dimname dim, bool keepdim) -> std::tuple -# -# For deprecated python signature, it should follow deprecated python arg order. -# TODO: This is to keep same byte-for-byte result as the old codegen - maybe unnecessary? - -def dispatch_lambda_args(ps: PythonSignature, f: NativeFunction) -> Tuple[DispatchLambdaArgument, ...]: - # Start with cpp arguments - dispatch lambda signature always include 'self' - cpp_args: Sequence[Binding] = _cpp_signature(f, method=False).arguments() - - # Special reorder logic for deprecated python signature - if isinstance(ps, PythonSignatureDeprecated): - m: Dict[str, Binding] = dict((a.name, a) for a in cpp_args) - # reorder according to the deprecated signature - # ignore 'out' argument when binding to non-output function. - ordered_args = filter(lambda n: n != 'out' or f.func.is_out_fn(), - ps.deprecated_args_names) - cpp_args = list(map(lambda n: m[n], ordered_args)) - - out_args: Set[str] = set(a.name for a in f.func.arguments.out) - - # Convert from cpp argument to lambda argument - def dispatch_lambda_arg(cpp_arg: Binding) -> DispatchLambdaArgument: - type_str = cpp_arg.type - is_out_arg = cpp_arg.name in out_args - if ps.method and cpp_arg.name == 'self': - # For method's 'self', we can use 'const Tensor &' and simply ignore mutability! - type_str = 'const at::Tensor &' - else: - # For other cases we need prevent dangling refs to temps (unless it's - # unpacked scattered output) - # The reason is explained in the comments above and in 'dispatch_lambda_return_str()'. - # TODO: avoid this special handling? - ensure_temp_safe = len(out_args) <= 1 or not is_out_arg - if ensure_temp_safe: - type_str = { - 'at::Tensor &': 'at::Tensor', - }.get(type_str, type_str) - return DispatchLambdaArgument( - name=cpp_arg.name, - type_str=type_str, - is_out_arg=is_out_arg, - ) - - return tuple(map(dispatch_lambda_arg, cpp_args)) - -# [old codegen] XXX: if you got here because of an assertion failure, it doesn't mean -# it's enough to just extend the list here. Before you do this, make sure -# to add an appropriate wrap() overload in torch/csrc/autograd/utils/wrap_outputs.h. 
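A stand-alone sketch of the argument-type adjustment dispatch_lambda_args applies: a method's self stays const Tensor&, a single out argument (and other Tensor& formals) is taken by value to avoid dangling references to PythonArgParser temporaries, and multi-output out args keep their references because they were already unpacked to locals. The function name lambda_arg_type is hypothetical.

def lambda_arg_type(cpp_type: str, name: str, *, is_method_self: bool,
                    is_out_arg: bool, num_out_args: int) -> str:
    if is_method_self and name == 'self':
        # Methods can always take self as const Tensor& and ignore mutability.
        return 'const at::Tensor &'
    if num_out_args <= 1 or not is_out_arg:
        # Pass-by-value protects against refs to stack-allocated temporaries.
        return {'at::Tensor &': 'at::Tensor'}.get(cpp_type, cpp_type)
    # Multi-output out args were unpacked to local variables; keep the ref.
    return cpp_type

print(lambda_arg_type('at::Tensor &', 'out', is_method_self=False,
                      is_out_arg=True, num_out_args=1))   # at::Tensor
print(lambda_arg_type('at::Tensor &', 'max', is_method_self=False,
                      is_out_arg=True, num_out_args=2))   # at::Tensor &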
-SUPPORTED_RETURN_TYPES = { - 'at::Tensor', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::tuple', - '::std::vector', - 'at::Scalar', 'bool', 'int64_t', 'void*', 'void', - 'at::QScheme', 'double', - 'at::IntArrayRef', - 'at::ScalarType' -} - -def dispatch_lambda_return_str(f: NativeFunction) -> str: - # [old codegen] Remove type annotation (e.g. 'Tensor' rather than 'Tensor &') - # because the dispatch lambdas take mutable arguments *by value*, not - # by reference. If you then return a reference to such an argument, you - # will now have a pointer to a dangling stack entry. Not good. - # - # You want: - # - # auto dispatch_selu_ = [](Tensor self) -> Tensor { ...; return at::selu_(self); }; - # ^^^^^^ - # - # *not* - # - # auto dispatch_selu_ = [](Tensor self) -> Tensor& { ...; return at::selu_(self); }; - # ^^^^^^^ - # - # (NB: We can't make dispatch_selu_ take Tensor&, because the enclosing - # codegen looks like dispatch_selu_(_r.tensor(0)), and you can't take a - # mutable reference to temporary. Maybe we could assign it to a - # variable itself.) - returns_without_annotation = tuple(map(lambda r: Return(r.name, r.type, None), f.func.returns)) - return_str = cpp.returns_type(returns_without_annotation).cpp_type() - if return_str not in SUPPORTED_RETURN_TYPES: - raise RuntimeError(f'{f.func.name} returns unsupported type {return_str}') - return return_str - -def cpp_dispatch_target(f: NativeFunction) -> str: - name = cpp.name(f.func) - if Variant.method in f.variants: - return f'self.{name}' - if Variant.function in f.variants: - if has_tensor_options(f) or f.func.name.name.base.endswith('_like'): - namespace = 'torch' - else: - namespace = 'at' - return f'{namespace}::{name}' - raise RuntimeError(f'could not dispatch, neither function nor method: {f.func}') - -def cpp_dispatch_exprs(f: NativeFunction, *, - python_signature: Optional[PythonSignature] = None, - ) -> Tuple[str, ...]: - cpp_args: Sequence[Binding] = _cpp_signature(f, method=False).arguments() - - exprs: Tuple[str, ...] = tuple() - if not isinstance(python_signature, PythonSignatureDeprecated): - # By default the exprs are consistent with the C++ signature. - exprs = tuple(map(lambda a: a.name, cpp_args)) - else: - # For deprecated python signature we may need fill in some constants. - exprs = tuple(filter(lambda n: n != 'out' or f.func.is_out_fn(), - python_signature.deprecated_args_exprs)) - - if Variant.method in f.variants: - exprs = tuple(filter('self'.__ne__, exprs)) - - return exprs - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Python / C++ Args Binding -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# We explicitly enumerate the PythonArgParser unpacking methods for all -# supported types. This might be more verbose than necessary, partially -# because of the irregularity of unpacking method naming, partially -# because we want to mimic the old codegen behavior - to reject -# unexpected and/or unsupported cases which the old codegen rejects. -# For certain cases it is intentionally more restrictive than necessary, -# e.g.: it doesn't accepts doublelist with definite size. 
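A hedged sketch of the call-target selection cpp_dispatch_target makes, reduced to plain booleans and strings: method variants dispatch via self, factory-style functions (TensorOptions or *_like) via torch::, everything else via at::. The helper name dispatch_target and its parameters are invented for illustration.

def dispatch_target(name: str, *, has_method_variant: bool,
                    has_function_variant: bool, has_tensor_options: bool,
                    is_like_fn: bool) -> str:
    if has_method_variant:
        return f'self.{name}'
    if has_function_variant:
        ns = 'torch' if has_tensor_options or is_like_fn else 'at'
        return f'{ns}::{name}'
    raise RuntimeError('could not dispatch, neither function nor method')

print(dispatch_target('abs', has_method_variant=True, has_function_variant=True,
                      has_tensor_options=False, is_like_fn=False))    # self.abs
print(dispatch_target('empty', has_method_variant=False, has_function_variant=True,
                      has_tensor_options=True, is_like_fn=False))     # torch::empty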
-def arg_parser_unpack_method(t: Type, has_default: bool) -> str: - if has_default and str(t) not in ('ScalarType', 'Device', 'Layout?'): - raise RuntimeError(f'type \'{t}\' does not supported unpacking with default') - - if isinstance(t, BaseType): - if t.name in [BaseTy.Tensor, BaseTy.Stream, BaseTy.Storage, - BaseTy.Scalar, BaseTy.Dimname]: - # These unpack methods line up with their schema names - return t.name.name.lower() - elif t.name == BaseTy.ScalarType: - return 'scalartypeWithDefault' if has_default else 'scalartype' - elif t.name == BaseTy.Device: - return 'deviceWithDefault' if has_default else 'device' - elif t.name == BaseTy.int: - return 'toInt64' - elif t.name == BaseTy.bool: - return 'toBool' - elif t.name == BaseTy.float: - return 'toDouble' - elif t.name == BaseTy.str: - return 'stringView' - - elif isinstance(t, OptionalType): - if str(t.elem) == 'Tensor': - return 'optionalTensor' - - elif isinstance(t.elem, BaseType): - if t.elem.name in [BaseTy.ScalarType, BaseTy.Scalar, - BaseTy.int, BaseTy.bool, - BaseTy.float, BaseTy.str]: - # Regular cases: append 'Optional' to elem's unpacking method - return arg_parser_unpack_method(t.elem, False) + 'Optional' - elif t.elem.name == BaseTy.MemoryFormat: - return 'memoryformatOptional' - elif t.elem.name == BaseTy.Generator: - return 'generator' - elif t.elem.name == BaseTy.Layout: - return 'layoutWithDefault' if has_default else 'layoutOptional' - elif t.elem.name == BaseTy.Device: - return 'deviceWithDefault' if has_default else 'deviceOptional' - - elif isinstance(t.elem, ListType): - if str(t.elem.elem) == 'int': - # accept definite size - return 'intlistOptional' - elif str(t.elem) == 'float[]': - return 'doublelistOptional' - elif str(t.elem) == 'Dimname[]': - return 'toDimnameListOptional' - - elif isinstance(t, ListType): - if str(t.elem) == 'Tensor': - # accept and use definite size - if t.size is not None: - return f'tensorlist_n<{t.size}>' - else: - return 'tensorlist' - elif str(t.elem) == 'Tensor?': - return 'list_of_optional_tensors' - elif str(t.elem) == 'Dimname': - # accept definite size - return 'dimnamelist' - elif str(t.elem) == 'int': - # accept definite size - return 'intlist' - elif str(t) == 'float[]': - return 'doublelist' - elif str(t) == 'Scalar[]': - return 'scalarlist' - raise RuntimeError(f'type \'{t}\' is not supported by PythonArgParser') - -# Return RHS expression for python argument using PythonArgParser output. -# e.g. for arg name 'foo', arg type 'bool', arg_index = 2, returns '_r.toBool(2)' -def arg_parser_output_expr( - arg_index: int, a: PythonArgument -) -> PythonArgParserOutputExpr: - has_default = a.default_init is not None - unpack_method = arg_parser_unpack_method(a.type, has_default) - default = f', {a.default_init}' if has_default else '' - expr = f'_r.{unpack_method}({arg_index}{default})' - - return PythonArgParserOutputExpr( - name=a.name, - expr=expr, - index=arg_index, - argument=a, - ) - -# Returns a map with key = arg_name and value = PythonArgParserOutputExpr. -def arg_parser_output_exprs( - ps: PythonSignature, f: NativeFunction -) -> Dict[str, PythonArgParserOutputExpr]: - return {e.name: e for i, a in enumerate(ps.arguments()) - for e in (arg_parser_output_expr(i, a), )} - -# argument name to type for scattered tensor options fields -TENSOR_OPTIONS_FIELDS = { - 'dtype': 'ScalarType', - 'device': 'Device', - 'layout': 'Layout?', - 'pin_memory': 'bool', - 'requires_grad': 'bool', -} - -# bind arg parser outputs (python args) with dispatch lambda arguments (c++ args). 
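A minimal sketch of how a PythonArgParser access expression is formed from an argument index, its unpack method, and an optional default-init expression, as in arg_parser_output_expr above. The helper name output_expr is hypothetical; the example inputs mirror the '_r.toBool(2)' and layoutWithDefault examples from the notes.

from typing import Optional

def output_expr(index: int, unpack_method: str,
                default_init: Optional[str] = None) -> str:
    default = f', {default_init}' if default_init is not None else ''
    return f'_r.{unpack_method}({index}{default})'

print(output_expr(2, 'toBool'))  # _r.toBool(2)
print(output_expr(3, 'layoutWithDefault',
                  'layout_from_backend(self.options().backend())'))
# _r.layoutWithDefault(3, layout_from_backend(self.options().backend()))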
-def dispatch_lambda_exprs( - ps: PythonSignature, f: NativeFunction -) -> DispatchLambdaArgumentExprs: - # This method is to bind 'arg_parser_outputs' and 'lambda_args' by producing - # 'inits' and 'lambda_args_exprs' for each lambda argument using arg parser - # outputs. - arg_parser_outputs = arg_parser_output_exprs(ps, f) - lambda_args = dispatch_lambda_args(ps, f) - inits: List[str] = [] - lambda_args_exprs: Dict[str, str] = dict() - - has_toptions = has_tensor_options(f) - - # 1. special inits/unpacking to provide binding exprs for lambda arguments. - for a in ps.arguments(skip_tensor_options=True): - name = a.name - arg_parser_expr = arg_parser_outputs[a.name].expr - - if has_toptions and name == 'self': - # TODO: why this needs to be special case? - inits.extend([ - f'auto self = {arg_parser_expr};', - ]) - lambda_args_exprs[name] = name - elif isinstance(a, PythonOutArgument) and len(a.outputs) > 1 and f.func.is_out_fn(): - inits.extend([ - f'auto out = {arg_parser_expr};', - ]) - for i, out_arg in enumerate(a.outputs): - lambda_args_exprs[out_arg.name] = f'out[{i}]' - elif str(a.type) == 'Dimname[]?': - # [old codegen] - # TODO: make this part of something more general, or get rid of it. - # optional> are special. The PythonArgParser returns an - # optional>, which cannot be implicitly converted to - # optional>. One needs to unwrap the optional and rewrap. - inits.extend([ - f'auto __{name} = {arg_parser_expr};', - f'c10::optional {name} = __{name} ? c10::make_optional(DimnameList(__{name}.value())) : c10::nullopt;', - ]) - lambda_args_exprs[name] = name - else: - # default case - directly using PythonArgParser output expr - lambda_args_exprs[name] = arg_parser_expr - - # method's self is passed directly to python binding, rather than parsed - if ps.method: - lambda_args_exprs['self'] = 'self' - - # 2. special packing/checking for TensorOptions. - tensor_options_args_names = list(map(lambda a: a.name, ps.tensor_options_args)) - if has_toptions: - if f.func.is_out_fn(): - raise RuntimeError(f'{f.func}: tensor options with output arg') - for a in ps.tensor_options_args: - if a.name not in TENSOR_OPTIONS_FIELDS: - raise RuntimeError( - f'{f.func}: unrecognized tensor options field \'{a.name}\' in python binding arguments') - if str(a.type) != TENSOR_OPTIONS_FIELDS.get(a.name): - raise RuntimeError( - f'{f.func}: unrecognized type \'{str(a.type)}\' for tensor options field \'{a.name}\'') - if not all(map(lambda a: a in tensor_options_args_names, TENSOR_OPTIONS_FIELDS.keys())): - raise RuntimeError( - f'{f.func}: incomplete tensor options args: {tensor_options_args_names}') - - inits.append(f'''\ -const auto options = TensorOptions() - .dtype({arg_parser_outputs['dtype'].expr}) - .device({arg_parser_outputs['device'].expr}) - .layout({arg_parser_outputs['layout'].expr}) - .requires_grad({arg_parser_outputs['requires_grad'].expr}) - .pinned_memory({arg_parser_outputs['pin_memory'].expr}); -torch::utils::maybe_initialize_cuda(options); -''') - lambda_args_exprs['options'] = 'options' - - # 3. special case - access scattered TensorOptions fields without packing - # TODO: maybe move to the generator side as it's not related to binding. 
- if not has_toptions and tensor_options_args_names: - if 'dtype' in tensor_options_args_names: - # we're an output-arg variant, check these args against output tensor - if not f.func.is_out_fn(): - raise RuntimeError( - f'{f.func}: dtype in tensor_options_args without output arg') - if not all(map(lambda a: a in tensor_options_args_names, ('layout', 'device'))): - raise RuntimeError( - f'{f.func}: incomplete tensor options for output check') - - inits.append(f"""\ -check_out_type_matches({arg_parser_outputs['out'].expr}, {arg_parser_outputs['dtype'].expr}, - {arg_parser_outputs['dtype'].is_none_expr}, {arg_parser_outputs['layout'].expr}, - {arg_parser_outputs['device'].expr}, {arg_parser_outputs['device'].is_none_expr}); -""") - # we'll set requires_grad on outgoing tensor - if 'requires_grad' not in tensor_options_args_names: - raise RuntimeError( - f'{f.func}: expected "requires_grad" in tensor_options_args absent, but found [{tensor_options_args_names}]') - - return DispatchLambdaArgumentExprs( - exprs=tuple(map(lambda a: lambda_args_exprs[a.name], lambda_args)), - inits=inits, - ) diff --git a/tools/codegen/api/translate.py b/tools/codegen/api/translate.py deleted file mode 100644 index 591b8d75e3b1..000000000000 --- a/tools/codegen/api/translate.py +++ /dev/null @@ -1,240 +0,0 @@ -from typing import Dict, Sequence, List, NoReturn, Union -from tools.codegen.api.types import (BaseCType, Binding, ConstRefCType, - Expr, MutRefCType, OptionalCType, - NamedCType, SpecialArgName, tensorT, - memoryFormatT, tensorOptionsT, scalarTypeT, - boolT, deviceT, layoutT, optionalTensorRefT, - scalarT, optionalScalarRefT, - VectorCType, longT, intArrayRefT) - -# This file implements a small program synthesis engine that implements -# conversions between one API to another. -# -# The key data type in this file in NamedCType, short for Named C++ semantic type. A NamedCType -# represents a C++ type, plus semantic information about what it represents. -# For example, consider the argument "bool pin_memory"; its normal C++ type is -# "bool", but its C++ semantic type also keeps track that this represents a -# "pin_memory"; you can't just use a random other boolean in a context where you -# need a "pin_memory"! -# -# The translator takes a list of needed NamedCTypes, and then figures out how -# to construct expressions with these NamedCTypes from the given bindings. Many -# of these expressions are trivial (I need a Tensor other; there's a Tensor -# other scope); others are more nontrivial and may require packing/unpacking. -# Some examples of non-trivial action: -# -# - Need the "dtype" binding? Well, maybe "dtype" isn't available -# in the context, instead, "options" is, and you need to extract -# it from there. (Gather) -# -# - Need the "context" binding? Well, maybe "context" isn't available -# in the context, and you need to construct it from "dtype", "device", -# etc. (Scatter) -# -# - Need the "memory_format" binding? Well, actually, it's available -# from both "memory_format" and "options", so you had better make sure -# they are consistent. 
(Join) - -options_ctype = NamedCType("options", ConstRefCType(BaseCType(tensorOptionsT))) - -longVec_ctype = VectorCType(BaseCType(longT)) -optionalScalar_ctype = OptionalCType(BaseCType(scalarT)) -optionalTensor_ctype = OptionalCType(BaseCType(tensorT)) - -class UnsatError(RuntimeError): - pass - -# Given a set of in-scope bindings and a set of target bindings, synthesize -# a list of expressions that uses only the in-scope bindings (bindings) that -# have all of the types of goals. You may want to use this function if -# you're generating code for a function like: -# -# void f({args}) { -# g({exprs}); // g is a different API -# } -# -# and you need to generate "exprs". -# -# Typically, a list of Bindings is convenient to get (you usually call something -# like arguments() to get them); but technically you only need less information: -# for 'bindings' an (un-ordered) list of Exprs is sufficient; similarly, for -# 'goals', an (ordered) list of NamedCType goals is sufficient. If you are doing -# something more complicated, e.g., tracking the set of bindings in a context, -# you may find using these smaller types more convenient. -def translate( - bindings: Sequence[Union[Expr, Binding]], - goals: Sequence[Union[NamedCType, Binding]], - *, method: bool = False, - allow_expensive_conversions: bool = False -) -> List[Expr]: - - binding_exprs: List[Expr] = [] - for b in bindings: - if isinstance(b, Binding): - binding_exprs.append(Expr( - expr=b.name, - type=b.nctype, - )) - else: - binding_exprs.append(b) - - goal_ctypes: List[NamedCType] = [] - for g in goals: - if isinstance(g, Binding): - goal_ctypes.append(g.nctype) - else: - goal_ctypes.append(g) - - # Add all the bindings to the context - ctx: Dict[NamedCType, str] = {} - for b in binding_exprs: - ctx[b.type] = b.expr - - # While we're at it, do some simple forward inference, looking through - # constructors. - # TODO: My kingdom for a pattern matcher - # https://www.python.org/dev/peps/pep-0634/ - # TODO: This could get us in recomputation trouble if b.expr is nontrivial - t = b.type - if isinstance(t, ConstRefCType) and isinstance(t.elem, OptionalCType) and \ - isinstance(t.elem.elem, BaseCType) and str(t.elem.elem.type) == 'at::Tensor': - ctx[NamedCType(t.elem.elem.name, ConstRefCType(BaseCType(tensorT)))] = \ - f'({b.expr}.has_value() ? *{b.expr} : at::Tensor())' - - if t.type == ConstRefCType(OptionalCType(BaseCType(tensorT))): - ctx[NamedCType(t.name, BaseCType(optionalTensorRefT))] = \ - f'(({b.expr}.has_value() && (*{b.expr}).defined()) ? at::OptionalTensorRef(*{b.expr}) : at::OptionalTensorRef())' - - if t.type == ConstRefCType(OptionalCType(BaseCType(scalarT))): - ctx[NamedCType(t.name, BaseCType(optionalScalarRefT))] = \ - f'({b.expr}.has_value() ? at::OptionalScalarRef(&({b.expr}.value())) : at::OptionalScalarRef())' - - # Add implicit bindings if the generated code is inside a Tensor method - if method: - ctx[NamedCType("self", MutRefCType(BaseCType(tensorT)))] = "const_cast(*this)" - ctx[NamedCType("self", ConstRefCType(BaseCType(tensorT)))] = "const_cast(*this)" - # This is better! Byte-for-byte compat - # ctx[NamedCType("self", ConstRefCType(BaseCType(tensorT)))] = "*this" - - def unsat(goal: NamedCType) -> NoReturn: - ctx_desc = '\n'.join(f" {t.cpp_type()} {t.name}; // {e}" for t, e in ctx.items()) - raise UnsatError(f''' -Failed to synthesize the expression "{goal.cpp_type()} {goal.name}". 
-When I failed, the following bindings were available in the context: - -{ctx_desc} - -This probably means there is a missing rule in the rules of tools.codegen.api.translate. -Check this module for more information. -''') - - # A shitty backtracking search implementation. It's shitty because it - # doesn't actually do backtracing or search. In particular, if - # direct=True, we won't try to do any fancy synthesis, just trivial - # conversions (e.g., "T a" is OK for "const T& a"). So all of the - # existing rules in this function simply try to solve immediately, - # and bail if things don't work out. - def solve(goal: NamedCType, *, direct: bool) -> str: - def direct_solve(goal: NamedCType) -> str: - return solve(goal, direct=True) - - if goal in ctx: - # Trivial - return ctx[goal] - - # const & is satisfied with mutable & - if isinstance(goal.type, ConstRefCType): - try: - # WARNING: not strictly decreasing; be careful not - # to add a direct conversion that goes satisfies - # mutable& with const& - return solve(NamedCType(goal.name, MutRefCType(goal.type.elem)), direct=direct) - except UnsatError: - pass - - # mutable & is satisfied with value - if isinstance(goal.type, MutRefCType): - try: - return solve(NamedCType(goal.name, goal.type.elem), direct=direct) - except UnsatError: - pass - - if direct: - unsat(goal) - - # For now, all of these rules are mutually exclusive. - if goal == NamedCType("memory_format", OptionalCType(BaseCType(memoryFormatT))): - memory_format = direct_solve( - NamedCType(SpecialArgName.possibly_redundant_memory_format, OptionalCType(BaseCType(memoryFormatT))) - ) - # No need to join "memory_format" and "options" if the target API takes "options" directly. - # Otherwise it will cause the redundant memory_format error. - if options_ctype in goal_ctypes: - return memory_format - try: - options = direct_solve(options_ctype) - return f"c10::impl::check_tensor_options_and_extract_memory_format({options}, {memory_format})" - except UnsatError: - return memory_format - - elif goal == NamedCType("options", BaseCType(tensorOptionsT)): - dtype = direct_solve(NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT)))) - pin_memory = direct_solve(NamedCType("pin_memory", OptionalCType(BaseCType(boolT)))) - device = direct_solve(NamedCType("device", OptionalCType(BaseCType(deviceT)))) - layout = direct_solve(NamedCType("layout", OptionalCType(BaseCType(layoutT)))) - return f'TensorOptions().dtype({dtype}).layout({layout}).device({device}).pinned_memory({pin_memory})' - - elif goal == NamedCType("dtype", OptionalCType(BaseCType(scalarTypeT))): - options = direct_solve(options_ctype) - return f'optTypeMetaToScalarType({options}.dtype_opt())' - - elif goal == NamedCType("layout", OptionalCType(BaseCType(layoutT))): - options = direct_solve(options_ctype) - return f'{options}.layout_opt()' - - elif goal == NamedCType("device", OptionalCType(BaseCType(deviceT))): - options = direct_solve(options_ctype) - return f'{options}.device_opt()' - - elif goal == NamedCType("pin_memory", OptionalCType(BaseCType(boolT))): - options = direct_solve(options_ctype) - return f'{options}.pinned_memory_opt()' - - # We can always do translations from value types to reference types, like vector -> IntArrayRef - elif goal.type == BaseCType(intArrayRefT): - return direct_solve(NamedCType(goal.name, longVec_ctype)) - elif goal.type == BaseCType(optionalScalarRefT): - return direct_solve(NamedCType(goal.name, optionalScalar_ctype)) - elif goal.type == BaseCType(optionalTensorRefT): - return 
direct_solve(NamedCType(goal.name, optionalTensor_ctype)) - - - # Note [translation from C++ reference to value types] - # The below cases are all for when we have an argument with a reference type, - # and a corresponding goal with a value type. - # These are needed when we populate the inputs to a lambda capture and we need - # to guarantee the lifetime of each captured argument. - # We guard it with an explicit kwarg because converting to a value type is expensive - # (O(n)) to convert from IntArrayRef to vector), - # so the caller of translate() should be explicit that they need it. - if allow_expensive_conversions: - if goal.type == VectorCType(BaseCType(longT)): - intArrayRef_ctype = NamedCType(goal.name, BaseCType(intArrayRefT)) - argname = direct_solve(intArrayRef_ctype) - return f'{argname}.vec()' - elif goal.type == OptionalCType(BaseCType(scalarT)): - optionalScalarRef_ctype = NamedCType(goal.name, BaseCType(optionalScalarRefT)) - argname = direct_solve(optionalScalarRef_ctype) - return f'{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt' - elif goal.type == OptionalCType(BaseCType(scalarT)): - optionalTensorRef_ctype = NamedCType(goal.name, BaseCType(optionalTensorRefT)) - argname = direct_solve(optionalTensorRef_ctype) - return f'{argname}.has_value() ? c10::make_optional({argname}) : c10::nullopt' - # Technically, we also need to handle cases of C++ containers holding reference types. - # But there currently aren't any ops that require lambda capture codegen - # With arguments like std::vector. - # If that changes, we'll have to add the translation here. - - unsat(goal) - - return [Expr(solve(g, direct=False), g) for g in goal_ctypes] diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py deleted file mode 100644 index d269f2c7a3ff..000000000000 --- a/tools/codegen/api/types.py +++ /dev/null @@ -1,618 +0,0 @@ -from tools.codegen.model import (Argument, FunctionSchema, NativeFunction, - BackendIndex, - SelfArgument, TensorOptionsArguments, BaseTy) -from dataclasses import dataclass -from typing import Optional, Union, Sequence, TypeVar, List, Set, Dict -from enum import Enum - -_T = TypeVar('_T') - -# An ArgName is just the str name of the argument in schema; -# but in some special circumstances, we may add a little extra -# context. The Enum SpecialArgName covers all of these cases; -# grep for their construction sites to see when they can occr. - -SpecialArgName = Enum('SpecialArgName', ( - 'possibly_redundant_memory_format', -)) -ArgName = Union[str, SpecialArgName] - -# This class shouldn't be created directly; instead, use/create one of the singletons below. -@dataclass(frozen=True) -class BaseCppType: - ns: Optional[str] - name: str - - def __str__(self) -> str: - if self.ns is None or self.ns == '': - return self.name - return f"{self.ns}::{self.name}" - -# The set of all non-templated, valid, fully-qualified names of C++ types that are used in the codegen. -# Templated types get their own dataclass, mainly to make namespace parsing easier. 
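# [Editorial sketch, not part of the deleted file] A quick illustration of how the
# singletons defined below render once constructed: BaseCppType (defined above)
# pairs an optional namespace with a type name, and str() yields the fully
# qualified C++ spelling, dropping an empty namespace.
assert str(BaseCppType('at', 'Tensor')) == 'at::Tensor'
assert str(BaseCppType('', 'int64_t')) == 'int64_t'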
-byteT = BaseCppType('', 'uint8_t') -charT = BaseCppType('', 'int8_t') -shortT = BaseCppType('', 'int16_t') -# It would be more symmetric for this to be called intT, but it easy to mix -# this up with JIT int (which is int64_t in C++), so we intentionally don't -# define intT to make it obvious when you've stuffed it up -int32T = BaseCppType('', 'int32_t') -longT = BaseCppType('', 'int64_t') -halfT = BaseCppType('at', 'Half') -doubleT = BaseCppType('', 'double') -floatT = BaseCppType('', 'float') -complexHalfT = BaseCppType('c10', 'complex') # stuffing template param here is an abuse -complexFloatT = BaseCppType('c10', 'complex') -complexDoubleT = BaseCppType('c10', 'complex') -boolT = BaseCppType('', 'bool') -bfloat16T = BaseCppType('at', 'BFloat16') -voidT = BaseCppType('', 'void') -stringT = BaseCppType('c10', 'string_view') -generatorT = BaseCppType('at', 'Generator') -scalarTypeT = BaseCppType('at', 'ScalarType') -tensorT = BaseCppType('at', 'Tensor') -optionalTensorRefT = BaseCppType('at', 'OptionalTensorRef') -tensorListT = BaseCppType('at', 'TensorList') -dimnameT = BaseCppType('at', 'Dimname') -dimnameListT = BaseCppType('at', 'DimnameList') -layoutT = BaseCppType('at', 'Layout') -deviceT = BaseCppType('at', 'Device') -scalarT = BaseCppType('at', 'Scalar') -optionalScalarRefT = BaseCppType('at', 'OptionalScalarRef') -memoryFormatT = BaseCppType('at', 'MemoryFormat') -qschemeT = BaseCppType('at', 'QScheme') -storageT = BaseCppType('at', 'Storage') -streamT = BaseCppType('at', 'Stream') -intArrayRefT = BaseCppType('at', 'IntArrayRef') -tensorOptionsT = BaseCppType('at', 'TensorOptions') -typeAndSizeT = BaseCppType('torch::autograd::generated', 'TypeAndSize') -tensorGeometryT = BaseCppType('at', 'TensorGeometry') - -BaseTypeToCppMapping: Dict[BaseTy, BaseCppType] = { - BaseTy.int: longT, - BaseTy.float: doubleT, - BaseTy.bool: boolT, - BaseTy.str: stringT, - BaseTy.Generator: generatorT, - BaseTy.ScalarType: scalarTypeT, - BaseTy.Tensor: tensorT, - BaseTy.Dimname: dimnameT, - BaseTy.Layout: layoutT, - BaseTy.Device: deviceT, - BaseTy.Scalar: scalarT, - BaseTy.MemoryFormat: memoryFormatT, - BaseTy.QScheme: qschemeT, - BaseTy.Storage: storageT, - BaseTy.Stream: streamT, -} - -# CTypes encode C++ type structure as needed for translation. - -@dataclass(frozen=True) -class BaseCType: - type: BaseCppType - - def cpp_type(self, *, strip_ref: bool = False) -> str: - return str(self.type) - - # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml - # TODO: Kill this when we eventually remove it! 
- def cpp_type_registration_declarations(self) -> str: - return str(self.type).replace('at::', '') - - def remove_const_ref(self) -> 'CType': - return self - -@dataclass(frozen=True) -class ConstRefCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - if strip_ref: - return self.elem.cpp_type(strip_ref=strip_ref) - return f'const {self.elem.cpp_type()} &' - - def cpp_type_registration_declarations(self) -> str: - return f'const {self.elem.cpp_type_registration_declarations()} &' - - def remove_const_ref(self) -> 'CType': - return self.elem.remove_const_ref() - -@dataclass(frozen=True) -class MutRefCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - if strip_ref: - return self.elem.cpp_type(strip_ref=strip_ref) - return f'{self.elem.cpp_type()} &' - - def cpp_type_registration_declarations(self) -> str: - return f'{self.elem.cpp_type_registration_declarations()} &' - - def remove_const_ref(self) -> 'CType': - return self.elem.remove_const_ref() - -@dataclass(frozen=True) -class OptionalCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'c10::optional<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'c10::optional<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return OptionalCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class ListCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'c10::List<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'c10::List<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return ListCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class ArrayRefCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'at::ArrayRef<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'ArrayRef<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return ArrayRefCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class VectorCType: - elem: 'CType' - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'::std::vector<{self.elem.cpp_type()}>' - - def cpp_type_registration_declarations(self) -> str: - return f'::std::vector<{self.elem.cpp_type_registration_declarations()}>' - - def remove_const_ref(self) -> 'CType': - return VectorCType(self.elem.remove_const_ref()) - -@dataclass(frozen=True) -class ArrayCType: - elem: 'CType' - size: int - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. - return f'::std::array<{self.elem.cpp_type()},{self.size}>' - - def cpp_type_registration_declarations(self) -> str: - return f'::std::array<{self.elem.cpp_type_registration_declarations()},{self.size}>' - - def remove_const_ref(self) -> 'CType': - return ArrayCType(self.elem.remove_const_ref(), self.size) - -@dataclass(frozen=True) -class TupleCType: - elems: List['CType'] - - def cpp_type(self, *, strip_ref: bool = False) -> str: - # Do not pass `strip_ref` recursively. 
- return f'::std::tuple<{",".join([e.cpp_type() for e in self.elems])}>' - - def cpp_type_registration_declarations(self) -> str: - return f'::std::tuple<{",".join([e.cpp_type_registration_declarations() for e in self.elems])}>' - - def remove_const_ref(self) -> 'CType': - return TupleCType([e.remove_const_ref() for e in self.elems]) - -CType = Union[ - BaseCType, - OptionalCType, - ConstRefCType, - MutRefCType, - ListCType, - ArrayRefCType, - ArrayCType, - VectorCType, - TupleCType -] - -# A NamedCType is short for Named C++ semantic type. A NamedCType represents a C++ type, plus -# semantic information about what it represents. For example, consider the -# argument "bool pin_memory"; its normal C++ type is "bool", but its C++ -# semantic type also keeps track that this represents a "pin_memory"; you can't -# just use a random other boolean in a context where you need a "pin_memory"! -# - -@dataclass(frozen=True) -class NamedCType: - name: ArgName - type: CType - - def cpp_type(self, *, strip_ref: bool = False) -> str: - return self.type.cpp_type(strip_ref=strip_ref) - - # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml - # TODO: Kill this when we eventually remove it! - def cpp_type_registration_declarations(self) -> str: - return self.type.cpp_type_registration_declarations() - - def remove_const_ref(self) -> 'NamedCType': - return NamedCType(self.name, self.type.remove_const_ref()) - - def with_name(self, name: str) -> 'NamedCType': - return NamedCType(name, self.type) - -# A binding represents any C++ binding site for a formal parameter. -# We don't distinguish between binding sites for different APIs; -# instead, all of the important distinctions are encoded in CType, -# which you can use to figure out if a given Binding is appropriate -# for use in another context. (See tools.codegen.api.translate) - -@dataclass(frozen=True) -class Binding: - name: str - nctype: NamedCType - argument: Union[Argument, TensorOptionsArguments, SelfArgument] - # TODO: maybe don't represent default here - default: Optional[str] = None - - @property - def type(self) -> str: - return self.nctype.cpp_type() - - def no_default(self) -> 'Binding': - return Binding( - name=self.name, - nctype=self.nctype, - default=None, - argument=self.argument, - ) - - def decl(self, *, func_ptr_cast: bool = False) -> str: - mb_default = "" - if self.default is not None: - mb_default = f"={self.default}" - - # casting only needs to know the type - if func_ptr_cast: - return f"{self.type}" - else: - return f"{self.type} {self.name}{mb_default}" - - # For BC reasons, we don't want to introduce at:: namespaces to RegistrationDeclarations.yaml - # TODO: Kill this when we eventually remove it! - def decl_registration_declarations(self) -> str: - type_s = self.nctype.cpp_type_registration_declarations() - mb_default = "" - if self.default is not None: - mb_default = f"={self.default}" - return f"{type_s} {self.name}{mb_default}" - - def defn(self) -> str: - return f"{self.type} {self.name}" - - def with_name(self, name: str) -> 'Binding': - return Binding( - name=name, - nctype=self.nctype, - argument=self.argument, - default=self.default - ) - -# An Expr is a C++ expression. It has a C++ string representing its syntax, -# as well as a CType saying what it provides. - -@dataclass(frozen=True) -class Expr: - expr: str - type: NamedCType - -# A CppSignature represents a single overload in the C++ API. 
For -# any given function schema, there may be multiple CppSignatures -# corresponding to it, based on how we desugar to C++. See also -# CppSignatureGroup. -@dataclass(frozen=True) -class CppSignature: - # The schema this signature is derived from - func: FunctionSchema - - # Is this a C++ signature for a method, i.e. Tensor::my_op(...)? - method: bool - - # Is this a faithful C++ signature (i.e. following the JIT schema) or a convenience API - # (i.e. with a potential TensorOptions argument and out arguments in the front) - faithful: bool - - # The set of C++ arguments which should not have defaults applied to them - cpp_no_default_args: Set[str] - - # Is this a fallback C++ binding? Fallback bindings are enabled by - # manual_cpp_binding: True and are alternate, non-public API that - # lets manual C++ binding implementors access the binding that would - # have been automatically generated - fallback_binding: bool = False - - # Return the unpacked argument structure of this signature, - # discarding information about which arguments are semantically - # related to each other. - def arguments(self) -> Sequence[Binding]: - return cpp.arguments( - self.func.arguments, faithful=self.faithful, - method=self.method, cpp_no_default_args=self.cpp_no_default_args) - - def name(self) -> str: - n = cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) - if self.fallback_binding: - n = f"__dispatch_{n}" - return n - - # Render the C++ declaration for this signature - def decl(self, *, name: Optional[str] = None, prefix: str = "", is_redispatching_fn: bool = False) -> str: - returns_type = cpp.returns_type(self.func.returns).cpp_type() - cpp_args = [a.decl() for a in self.arguments()] - if is_redispatching_fn: - cpp_args = ['c10::DispatchKeySet dispatchKeySet'] + cpp_args - cpp_args_str = ', '.join(cpp_args) - if name is None: - name = prefix + self.name() - return f"{returns_type} {name}({cpp_args_str})" - - # Render the C++ definition for this signature, not including - # the body (with curly braces) - def defn(self, *, name: Optional[str] = None, prefix: str = "", is_redispatching_fn: bool = False) -> str: - returns_type = cpp.returns_type(self.func.returns).cpp_type() - cpp_args = [a.defn() for a in self.arguments()] - if is_redispatching_fn: - cpp_args = ['c10::DispatchKeySet dispatchKeySet'] + cpp_args - cpp_args_str = ', '.join(cpp_args) - if name is None: - name = prefix + self.name() - return f"{returns_type} {name}({cpp_args_str})" - - def ptr_type(self) -> str: - args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{cpp.returns_type(self.func.returns).cpp_type()} (*)({args_types_str})' - - # Return the C++ function type, e.g., something like int(bool) - def type(self) -> str: - args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{cpp.returns_type(self.func.returns).cpp_type()} ({args_types_str})' - - -# Represents group of all CppSignatures associated with a -# FunctionSchema. Right now, that's the regular, user-visible -# signature, as well as a "faithful" signature which doesn't -# have grouping. 
-@dataclass(frozen=True) -class CppSignatureGroup: - func: FunctionSchema - signature: CppSignature - faithful_signature: Optional[CppSignature] - - def most_faithful_signature(self) -> CppSignature: - if self.faithful_signature: - return self.faithful_signature - else: - return self.signature - - @staticmethod - def from_native_function(f: NativeFunction, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': - func = f.func - faithful_signature: Optional[CppSignature] - if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: - faithful_signature = CppSignature( - func=func, - faithful=True, - method=method, - fallback_binding=fallback_binding, - cpp_no_default_args=f.cpp_no_default_args - ) - else: - faithful_signature = None - signature = CppSignature( - func=func, - faithful=False, - method=method, - fallback_binding=fallback_binding, - cpp_no_default_args=f.cpp_no_default_args - ) - return CppSignatureGroup( - func=func, - signature=signature, - faithful_signature=faithful_signature, - ) - -@dataclass(frozen=True) -class DispatcherSignature: - # The schema this signature is derived from - func: FunctionSchema - - # Allows you to prepend an arbitrary prefix to the signature name. - # This is useful for parts of the codegen that generate wrappers around kernels, - # and need to avoid naming collisions. - prefix: str = "" - - def arguments(self) -> List[Binding]: - return dispatcher.arguments(self.func) - - def name(self) -> str: - return self.prefix + dispatcher.name(self.func) - - def decl(self, name: Optional[str] = None) -> str: - args_str = ', '.join(a.decl() for a in self.arguments()) - if name is None: - name = self.name() - return f"{self.returns_type().cpp_type()} {name}({args_str})" - - def defn(self, name: Optional[str] = None, *, is_redispatching_fn: bool = False) -> str: - args = [a.defn() for a in self.arguments()] - if is_redispatching_fn: - args = ['c10::DispatchKeySet dispatchKeySet'] + args - args_str = ', '.join(args) - if name is None: - name = self.name() - return f"{self.returns_type().cpp_type()} {name}({args_str})" - - def exprs(self) -> List[Expr]: - return [Expr(a.name, a.nctype) for a in self.arguments()] - - def returns_type(self) -> CType: - return dispatcher.returns_type(self.func.returns) - - def ptr_type(self) -> str: - dispatcher_args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{self.returns_type().cpp_type()} (*)({dispatcher_args_types_str})' - - # Return the C++ function type, e.g., something like int(bool) - def type(self) -> str: - dispatcher_args_types_str = ', '.join(a.type for a in self.arguments()) - return f'{self.returns_type().cpp_type()} ({dispatcher_args_types_str})' - - @staticmethod - def from_schema(func: FunctionSchema, *, prefix: str = '') -> 'DispatcherSignature': - return DispatcherSignature(func, prefix) - -@dataclass(frozen=True) -class NativeSignature: - # The schema this signature is derived from - func: FunctionSchema - - prefix: str = "" - - def name(self) -> str: - return self.prefix + native.name(self.func) - - def decl(self, name: Optional[str] = None) -> str: - args_str = ', '.join(a.decl() for a in self.arguments()) - if name is None: - name = self.name() - return f"{native.returns_type(self.func.returns).cpp_type()} {name}({args_str})" - - def defn(self, name: Optional[str] = None) -> str: - args_str = ', '.join(a.defn() for a in self.arguments()) - if name is None: - name = self.name() - return f"{native.returns_type(self.func.returns).cpp_type()} 
{name}({args_str})" - - def ptr_type(self) -> str: - # don't include defaults in type signature! - args_str = ', '.join(a.defn() for a in self.arguments()) - return f'{native.returns_type(self.func.returns).cpp_type()} (*)({args_str})' - - def arguments(self) -> List[Binding]: - return native.arguments(self.func) - - def returns_type(self) -> CType: - return native.returns_type(self.func.returns) - - def dispatcher_exprs(self) -> List[Expr]: - return translate.translate(self.arguments(), dispatcher.arguments(self.func), method=False) - -@dataclass(frozen=True) -class ViewInverseSignature: - # The NativeFunction this signature is derived from - f: NativeFunction - - def name(self) -> str: - return functionalization.name(self.f, functional_op=self.f, is_reverse=True, include_namespace=False) - - def decl(self) -> str: - return_type = functionalization.returns_type(self.f.func) - decls = [a.decl() for a in functionalization.inner_arguments(self.f.func, is_reverse=True)] - return f"static {return_type.cpp_type()} {self.name()}({', '.join(decls)});" - - @staticmethod - def from_func(f: NativeFunction) -> 'ViewInverseSignature': - # Some assertions: lambdas are only used for view ops - assert f.is_view_op - assert not f.func.name.name.inplace # only functional view ops need an inverse (e.g. not transpose_()) - return ViewInverseSignature(f) - -@dataclass(frozen=True) -class FunctionalizationLambda: - # The NativeFunction this signature is derived from - f: NativeFunction - - # The corresponding out-of-place variant of the above NativeFunction - # This only really matters for inplace-view ops. - # e.g. transpose_() -> transpose(). - functional_op: NativeFunction - - # are we generating the forward lambda or the reverse lambda? - is_reverse: bool - - def captures(self) -> List[Expr]: - # The lambda lives inside of a kernel following the dispatcher API, so its outer context is the dispatcher arguments - outer_ctx = dispatcher.arguments(self.f.func) - capture_bindings = functionalization.capture_arguments(self.f.func, is_reverse=self.is_reverse) - # allow_expensive_conversions is set because we want to convert - # some reference types (IntArrayRef) to value types (vector). 
- capture_exprs = translate.translate(outer_ctx, capture_bindings, method=False, allow_expensive_conversions=True) - return capture_exprs - - def decl(self) -> str: - return_type = functionalization.returns_type(self.f.func) - capture_str = ', '.join(f'{val.type.name} = {val.expr}' for val in self.captures()) - decls = [a.decl() for a in functionalization.outer_arguments(is_reverse=self.is_reverse)] - return f"[{capture_str}]({', '.join(decls)}) -> {return_type.cpp_type()}" - - def inner_call(self) -> str: - inner_call_name = functionalization.name( - self.f, functional_op=self.functional_op, is_reverse=self.is_reverse, include_namespace=True) - - arg_ctx = functionalization.outer_arguments(is_reverse=self.is_reverse) - capture_ctx = functionalization.capture_arguments(self.f.func, is_reverse=self.is_reverse) - full_ctx = arg_ctx + capture_ctx - - call_bindings = functionalization.inner_arguments(self.f.func, is_reverse=self.is_reverse) - maybe_index = functionalization.inner_call_index(self.f.func) - call_exprs = [e.expr for e in translate.translate(full_ctx, call_bindings, method=False)] - if not self.is_reverse and maybe_index is not None: - return f'{inner_call_name}({", ".join(call_exprs)})[{maybe_index.name}];' - else: - return f'{inner_call_name}({", ".join(call_exprs)});' - - @staticmethod - def from_func(f: NativeFunction, *, functional_op: NativeFunction, is_reverse: bool) -> 'FunctionalizationLambda': - # Some assertions: lambdas are only used for view ops - assert f.is_view_op - assert functional_op.is_view_op - # functional_op corresponds to the functional-variant of f, and is only actually used if f itself is an inplace_view op. - assert f.func.signature() == functional_op.func.signature() - return FunctionalizationLambda(f, functional_op, is_reverse) - - -# Helper functions - -def kernel_signature( - f: NativeFunction, backend_index: BackendIndex, *, prefix: str = '') -> Union['NativeSignature', 'DispatcherSignature']: - # Note [External Backends Follow Dispatcher API] - # Kernel signatures for in-tree backends follow the "native" API, - # while kernels for out-of-tree backends follow the dispatcher API. - # See the comments in `native.py` for details, but historically there have been - # some small differences in schema convention between them and the Dispatcher API. - # Any differences that require translating between the two will results in a runtime cost, - # so we'd like to keep the differences as small as possible. - # With external backends, we'd like to enforce that they write their kernels with schemas - # that match the Dispatcher API directly, if they can. 
- if backend_index.external: - return DispatcherSignature.from_schema(f.func, prefix=prefix) - else: - return NativeSignature(f.func, prefix) - -# Functions only, no types -from tools.codegen.api import cpp, dispatcher, native, translate, functionalization diff --git a/tools/codegen/code_template.py b/tools/codegen/code_template.py deleted file mode 100644 index 3b0b188834ef..000000000000 --- a/tools/codegen/code_template.py +++ /dev/null @@ -1,91 +0,0 @@ -import re -from typing import Match, Optional, Sequence, Mapping - -# match $identifier or ${identifier} and replace with value in env -# If this identifier is at the beginning of whitespace on a line -# and its value is a list then it is treated as -# block substitution by indenting to that depth and putting each element -# of the list on its own line -# if the identifier is on a line starting with non-whitespace and a list -# then it is comma separated ${,foo} will insert a comma before the list -# if this list is not empty and ${foo,} will insert one after. - - -class CodeTemplate: - # Python 2.7.5 has a bug where the leading (^[^\n\S]*)? does not work, - # workaround via appending another [^\n\S]? inside - - substitution_str = r'(^[^\n\S]*[^\n\S]?)?\$([^\d\W]\w*|\{,?[^\d\W]\w*\,?})' - - # older versions of Python have a bug where \w* does not work, - # so we need to replace with the non-shortened version [a-zA-Z0-9_]* - # https://bugs.python.org/issue18647 - - substitution_str = substitution_str.replace(r'\w', r'[a-zA-Z0-9_]') - - substitution = re.compile(substitution_str, re.MULTILINE) - - pattern: str - filename: str - - @staticmethod - def from_file(filename: str) -> 'CodeTemplate': - with open(filename, 'r') as f: - return CodeTemplate(f.read(), filename) - - def __init__(self, pattern: str, filename: str = "") -> None: - self.pattern = pattern - self.filename = filename - - def substitute(self, env: Optional[Mapping[str, object]] = None, **kwargs: object) -> str: - if env is None: - env = {} - - def lookup(v: str) -> object: - assert env is not None - return kwargs[v] if v in kwargs else env[v] - - def indent_lines(indent: str, v: Sequence[object]) -> str: - return "".join([indent + l + "\n" for e in v for l in str(e).splitlines()]).rstrip() - - def replace(match: Match[str]) -> str: - indent = match.group(1) - key = match.group(2) - comma_before = '' - comma_after = '' - if key[0] == "{": - key = key[1:-1] - if key[0] == ",": - comma_before = ', ' - key = key[1:] - if key[-1] == ',': - comma_after = ', ' - key = key[:-1] - v = lookup(key) - if indent is not None: - if not isinstance(v, list): - v = [v] - return indent_lines(indent, v) - elif isinstance(v, list): - middle = ', '.join([str(x) for x in v]) - if len(v) == 0: - return middle - return comma_before + middle + comma_after - else: - return str(v) - return self.substitution.sub(replace, self.pattern) - - -if __name__ == "__main__": - c = CodeTemplate("""\ - int foo($args) { - - $bar - $bar - $a+$b - } - int commatest(int a${,stuff}) - int notest(int a${,empty,}) - """) - print(c.substitute(args=["hi", 8], bar=["what", 7], - a=3, b=4, stuff=["things...", "others"], empty=[])) diff --git a/tools/codegen/context.py b/tools/codegen/context.py deleted file mode 100644 index ba21c86c7934..000000000000 --- a/tools/codegen/context.py +++ /dev/null @@ -1,67 +0,0 @@ -from tools.codegen.utils import S, T, context -from tools.codegen.model import (NativeFunction, NativeFunctionsGroup, BackendIndex, DispatchKey) -import tools.codegen.local as local - -import functools -from typing 
import TypeVar, Union, Iterator, Callable, Dict -import contextlib - -# Helper functions for defining generators on things in the model - -F = TypeVar( - 'F', - NativeFunction, - NativeFunctionsGroup, - Union[NativeFunction, NativeFunctionsGroup], -) - -@contextlib.contextmanager -def native_function_manager(g: Union[NativeFunctionsGroup, NativeFunction]) -> Iterator[None]: - if isinstance(g, NativeFunctionsGroup): - # By default, we associate all errors with structured native functions - # with the out variant. In some cases, it might be better to have - # a more specific place to hang things; if so, use - # native_function_manager again on the inside - f = g.out - else: - f = g - with context(lambda: f'in native_functions.yaml line {f.loc}:\n {f.func}'): - with local.parametrize(use_const_ref_for_mutable_tensors=f.use_const_ref_for_mutable_tensors): - yield - -# Given a function that operates on NativeFunction, wrap it into a new function -# that sets some appropriate context managers for that native function. -# YOU MUST WRAP FUNCTIONS IN THIS for calls to api modules to be sound -# (you will get an error if we try to access the local variables without having -# set them). -def with_native_function(func: Callable[[F], T]) -> Callable[[F], T]: - @functools.wraps(func) - def wrapper(f: F) -> T: - with native_function_manager(f): - return func(f) - return wrapper - -def method_with_native_function(func: Callable[[S, F], T]) -> Callable[[S, F], T]: - @functools.wraps(func) - def wrapper(slf: S, f: F) -> T: - with native_function_manager(f): - return func(slf, f) - return wrapper - -# Convenience decorator for functions that explicitly take in a BackendIndex, -# instead of indirectly taking one in as a closure -def with_native_function_and_index(func: Callable[[F, BackendIndex], T]) -> Callable[[F, BackendIndex], T]: - @functools.wraps(func) - def wrapper(f: F, backend_index: BackendIndex) -> T: - with native_function_manager(f): - return func(f, backend_index) - return wrapper - -def with_native_function_and_indices( - func: Callable[[F, Dict[DispatchKey, BackendIndex]], T] -) -> Callable[[F, Dict[DispatchKey, BackendIndex]], T]: - @functools.wraps(func) - def wrapper(f: F, backend_indices: Dict[DispatchKey, BackendIndex]) -> T: - with native_function_manager(f): - return func(f, backend_indices) - return wrapper diff --git a/tools/codegen/dest/__init__.py b/tools/codegen/dest/__init__.py deleted file mode 100644 index ce9265adf969..000000000000 --- a/tools/codegen/dest/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .lazy_ir import LazyIR as LazyIR -from .lazy_ir import GenLazyShapeInferenceDefinition as GenLazyShapeInferenceDefinition -from .lazy_ir import GenLazyNativeFuncDefinition as GenLazyNativeFuncDefinition -from .register_dispatch_key import ( - RegisterDispatchKey as RegisterDispatchKey, - gen_registration_helpers as gen_registration_helpers, - gen_registration_headers as gen_registration_headers, -) -from .native_functions import compute_native_function_declaration as compute_native_function_declaration diff --git a/tools/codegen/dest/lazy_ir.py b/tools/codegen/dest/lazy_ir.py deleted file mode 100644 index d41b4edcd8ac..000000000000 --- a/tools/codegen/dest/lazy_ir.py +++ /dev/null @@ -1,264 +0,0 @@ -from typing import List, Union -from dataclasses import dataclass -from tools.codegen.context import method_with_native_function -from tools.codegen.model import (BackendIndex, NativeFunction, - NativeFunctionsGroup) -from tools.codegen.api.types import (BaseCType, OptionalCType, 
NamedCType, - VectorCType, kernel_signature) -import tools.codegen.api.dispatcher as dispatcher -from tools.codegen.api.lazy import LazyIrSchema, isValueType -from tools.codegen.dest.lazy_ts_lowering import ts_lowering_body - - -def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: - """ - Given a NamedCType from a lazy IR schema, - generate a c++ string for materializing an rvalue of that arg for passing into - a lazy Node constructor. - """ - if isValueType(arg.type): - if isinstance(arg.type, BaseCType): - return f"lazy_{arg.name}.GetIrValue()" - elif isinstance(arg.type, OptionalCType): - return f"lazy_{arg.name} ? " \ - f"c10::make_optional(lazy_{arg.name}.GetIrValue()) : " \ - "c10::nullopt" - else: - raise AssertionError("TODO not sure if there are other valid types to handle here") - else: - if isinstance(arg.type, VectorCType) and isinstance(arg.type.elem, BaseCType): - return f"std::vector<{arg.type.elem.type}>({arg.name}.begin(), {arg.name}.end())" - elif (isinstance(arg.type, OptionalCType) and - isinstance(arg.type.elem, VectorCType) and - isinstance(arg.type.elem.elem, BaseCType)): - return f"torch::lazy::ToOptionalVector<{arg.type.elem.elem.type}>({arg.name})" - else: - return f"{arg.name}" - -def node_ctor_inputs(func: LazyIrSchema) -> str: - """ - Produce a formatted string with the arguments as passed into the constructor of a node class. - """ - node_ctor_values = [node_ctor_arg_rvalue_string(arg) for arg in func.filtered_types()] - return ",\n ".join(node_ctor_values) - - -@dataclass(frozen=True) -class LazyIR: - backend_index: BackendIndex - node_base: str - - @method_with_native_function - def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: - func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - return self.gen(f) - - def gen(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: - # for now, we just want one IR class decl and soon after also the method defs - # and we use the functional version not out/inplace. 
- func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - schema = LazyIrSchema(func) - all_types = schema.filtered_types() - value_types = schema.filtered_types(values=True, scalars=False) - scalar_types = schema.filtered_types(values=False, scalars=True) - - node_ctor_args = ", ".join([f"const {i.cpp_type()}& {i.name}" for i in all_types]) - scalar_initializers = ",\n ".join([f"{t.name}_({t.name})" for t in scalar_types]) - comma_if_scalar_initializers = ",\n" if len(scalar_initializers) else "" - scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name}_;" for t in scalar_types]) - scalar_hashes = ", ".join([f"{f.name}" for f in scalar_types]) - base_ctor_value_args_list = [] - optional_values = [] - for t in value_types: - if isinstance(t.type, BaseCType): - base_ctor_value_args_list.append(f"{t.name}") - elif isinstance(t.type, OptionalCType): - base_ctor_value_args_list.append(f"{t.name}.value_or(kNullValue)") - optional_values.append(t.name) - else: - raise AssertionError("TODO not sure if there are other valid types to handle here") - base_ctor_value_args = ", ".join(base_ctor_value_args_list) - has_optional_decls = "\n ".join([f"bool has_{value}: 1;" for value in optional_values]) - has_optional_defs = "\n ".join([f"has_{value} = !!{value};" for value in optional_values]) - members_to_string = [] - for t in scalar_types: - if isinstance(t.type, OptionalCType): - members_to_string.append(f"""if ({t.name}_.has_value()) {{ - ss << ", {t.name}=" << {t.name}_.value(); -}} else {{ - ss << ", {t.name}=null"; -}}""") - else: - members_to_string.append(f'ss << ", {t.name}=" << {t.name}_;') - members_to_string_str = "\n ".join(members_to_string) - - return [f"""\ -// TODO(alanwaketan): Public members don't need to have _ suffix. -class {schema.node_name} : public {self.node_base} {{ - public: - {schema.node_name}({node_ctor_args}, std::vector&& shapes) - : {self.node_base}(torch::lazy::OpKind(at::aten::{schema.aten_name}), - {{{base_ctor_value_args}}}, std::move(shapes), - /* num_outputs */ {len(func.returns)}, - torch::lazy::MHash({scalar_hashes})){comma_if_scalar_initializers} - {scalar_initializers} - - {{ - {has_optional_defs} - }} - - std::string ToString() const override {{ - std::stringstream ss; - ss << TsNode::ToString(); - {members_to_string_str} - return ss.str(); - }} - - torch::lazy::TSOpVector Lower(std::shared_ptr function, - torch::lazy::TSLoweringContext* loctx) const override {{ - {ts_lowering_body(f)} - }} - - {scalar_decls} - {has_optional_decls} - -}}; - -""", ] - - -def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str) -> str: - lazy_tensor_decls: List[str] = [] - for t in value_types: - if isinstance(t.type, BaseCType): - lazy_tensor_decls.append( - f"{tensor_class} lazy_{t.name} = " - f"GetLtcTensorOrCreateForWrappedNumber({t.name}, *device);") - elif isinstance(t.type, OptionalCType): - # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it - # until we encounter a real world example. 
- lazy_tensor_decls.append( - f" {tensor_class} lazy_{t.name} = TryGetLtcTensor({t.name}.value_or(at::Tensor()));") - else: - raise AssertionError("TODO not sure if there are other valid types to handle here") - return "\n ".join(lazy_tensor_decls) - -@dataclass(frozen=True) -class GenLazyNativeFuncDefinition: - class_method_name: str - backend_index: BackendIndex - tensor_class: str - - @method_with_native_function - def __call__(self, func: NativeFunction) -> List[str]: - sig = kernel_signature(func, self.backend_index) - - # Lazy IR stuff - schema = LazyIrSchema(func.func) - all_types = schema.filtered_types() - value_types = schema.filtered_types(values=True, scalars=False) - scalar_types = schema.filtered_types(values=False, scalars=True) - returns_length = len(schema.returns) - - value_types_names = ", ".join([f"{t.name}" for t in value_types]) - get_device_str = f"""auto device = bridge::GetBackendDevice({value_types_names});""" - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) - node_ctor_input_str = node_ctor_inputs(schema) - - # call the meta kernel if it exists, to compute output shape/dtype for our IR - if func.structured or func.structured_delegate is not None: - meta_out = """std::vector shapes{Shape(out_meta.scalar_type(), out_meta.sizes().vec())};""" - if returns_length > 1: - def this_shape(i: int) -> str: - return f"Shape(std::get<{i}>(out_meta).scalar_type(), std::get<{i}>(out_meta).sizes().vec())" - shapes_str = ','.join([this_shape(i) for i in range(returns_length)]) - meta_out = "std::vector shapes{" + shapes_str + "};" - - meta_str = f"""auto out_meta = at::meta::{schema.aten_name}({', '.join(str(t.name) for t in all_types)}); - {meta_out}""" - else: - shape_sig = ComputeShapeSignature(func) - meta_str = f""" - auto shapes = {shape_sig.shape_call};""" - meta_str += f""" - TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" - - node_str = f"""auto node = torch::lazy::MakeNode({node_ctor_input_str}, - std::move(shapes));""" - - assert len(value_types) > 0, f"Only supporting tensor ops so far, none found in {sig}" - first_tensor = value_types[0] - bridge_str = f"""auto result = CreateAtenFromLtcTensor(lazy_{first_tensor.name}.CreateFrom(node));""" - if returns_length > 1: - bridge_str = f"""std::vector<{self.tensor_class}> lazy_tensors; - for (int i = 0; i < {returns_length}; i++) {{ - lazy_tensors.push_back(lazy_{first_tensor.name}.CreateFrom(torch::lazy::Value(node, i))); - }} - auto result = TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" - if schema.name.name.inplace: - assert returns_length == 1, "We assumed there was no such case where an op is an in-place variant " \ - "and has tuple outputs." - bridge_str = f"""lazy_{first_tensor.name}.SetInPlaceIrValue(node); - auto& result = {first_tensor.name};""" - - - return [f"""\ - // TODO(alanwaketan): Quite a lot inefficient copy-by-value there. Let's optimize it. - {sig.decl(name=f"{self.class_method_name}::{schema.aten_name}")} {{ - TORCH_LAZY_FN_COUNTER("lazy::"); - {get_device_str} - {lazy_tensor_decls_str} - {meta_str} - {node_str} - {bridge_str} - return result; - }};\n - """] - -class ComputeShapeSignature: - """ - Here we use the base name as the suffix of the signature to avoid generating for in-place variants. 
- """ - @method_with_native_function - def __init__(self, f: NativeFunction): - self.__schema = LazyIrSchema(f.func) - self.__dispatch_args = ', '.join([a.decl() for a in dispatcher.arguments(f.func)]) - self.__call_args = ", ".join([f"{t.name}" for t in self.__schema.filtered_types()]) - - def __decl_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__dispatch_args})" - - def __call_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__call_args})" - - @property - def shape_decl(self) -> str: - return f"std::vector compute_shape_{self.__decl_suffix()}" - - @property - def shape_call(self) -> str: - return f"torch_lazy_tensors::ir::ops::compute_shape_{self.__call_suffix()}" - - -@dataclass(frozen=True) -class GenLazyShapeInferenceDefinition: - backend_index: BackendIndex - tensor_class: str - - @method_with_native_function - def __call__(self, f: NativeFunction) -> List[str]: - sig = kernel_signature(f, self.backend_index) - - # Lazy IR stuff - schema = LazyIrSchema(f.func) - value_types = schema.filtered_types(values=True, scalars=False) - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) - node_ctor_input_str = node_ctor_inputs(schema) - - # Only generate shape/dtype fn for non-structured kernels, - # since we just use the meta function for structured kernels - if not f.structured and f.structured_delegate is None: - shape_sig = ComputeShapeSignature(f) - return ["\n".join([f"{shape_sig.shape_decl};"])] - else: - return [] diff --git a/tools/codegen/dest/lazy_ts_lowering.py b/tools/codegen/dest/lazy_ts_lowering.py deleted file mode 100644 index 32d505cda7bf..000000000000 --- a/tools/codegen/dest/lazy_ts_lowering.py +++ /dev/null @@ -1,43 +0,0 @@ -from typing import Union -from tools.codegen.model import (NativeFunction, NativeFunctionsGroup) -from tools.codegen.api.lazy import LazyIrSchema, isValueType -from tools.codegen.api.types import OptionalCType - - -def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: - # for now, we just want one IR class decl and soon after also the method defs - # and we use the functional version not out/inplace. - func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - schema = LazyIrSchema(func) - - emplace_arguments = [] - for value in schema.positional_arg_types: - if isValueType(value.type): - if isinstance(value.type, OptionalCType): - emplace_arguments.append(f"has_{value.name} ? 
loctx->GetOutputOp(operand(i++)) : nullptr") - continue - emplace_arguments.append('loctx->GetOutputOp(operand(i++))') - continue - emplace_arguments.append(f'"{value.name}", {value.name}_') - - emplace_arguments_str = "\n ".join( - [f"arguments.emplace_back({a});" for a in emplace_arguments]) - emplace_kwarg_values = [f'loctx->GetOutputOp(operand({i}))' for i in range(len(schema.keyword_values))] - emplace_kwarg_scalars = [f'"{t.name}", {t.name}_' for t in schema.keyword_scalars] - assert len(schema.keyword_values) == 0, "TODO the logic for operand(i) is broken if there are kw values" - emplace_kwarguments = "\n ".join( - [f"kwarguments.emplace_back({a});" for a in emplace_kwarg_values + emplace_kwarg_scalars]) - return f"""\ - std::vector arguments; - std::vector kwarguments; - arguments.reserve({len(emplace_arguments)}); - kwarguments.reserve({len(emplace_kwarg_values + emplace_kwarg_scalars)}); - size_t i = 0; - {emplace_arguments_str} - {emplace_kwarguments} - torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); - CHECK_EQ({schema.aten_name}_out.size(), {len(func.returns)}); - - // TODO: need to call GenerateClone sometimes? Or else return LowerBuiltIn() directly - return {schema.aten_name}_out; -""" diff --git a/tools/codegen/dest/native_functions.py b/tools/codegen/dest/native_functions.py deleted file mode 100644 index 5fbb297f9c50..000000000000 --- a/tools/codegen/dest/native_functions.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import List, Union, Optional - -from tools.codegen.context import with_native_function_and_index -from tools.codegen.utils import mapMaybe -from tools.codegen.model import NativeFunction, NativeFunctionsGroup, BackendIndex -from tools.codegen.api.types import kernel_signature -import tools.codegen.api.meta as meta -import tools.codegen.api.structured as structured - -@with_native_function_and_index -def gen_unstructured(f: NativeFunction, backend_index: BackendIndex) -> Optional[str]: - sig = kernel_signature(f, backend_index) - metadata = backend_index.get_kernel(f) - if metadata is None: - return None - if "legacy::" in metadata.kernel: - return None - else: - prefix = 'static' if backend_index.external else 'TORCH_API' - return f"{prefix} {sig.decl(name=metadata.kernel)};" - -@with_native_function_and_index -def gen_structured(g: NativeFunctionsGroup, backend_index: BackendIndex) -> List[str]: - meta_name = meta.name(g) - out_args = structured.impl_arguments(g) - metadata = backend_index.get_kernel(g) - if metadata is None: - return [] - prefix = '' if backend_index.external else 'TORCH_API ' - return [f"""\ -struct {prefix}structured_{metadata.kernel} : public at::meta::structured_{meta_name} {{ -void impl({', '.join(a.decl() for a in out_args)}); -}}; -"""] - -# Generates NativeFunctions.h, a list of forward declarations of all -# actual kernel definitions we keep in aten/src/ATen/native/ -@with_native_function_and_index -def compute_native_function_declaration( - g: Union[NativeFunctionsGroup, NativeFunction], - backend_index: BackendIndex -) -> List[str]: - metadata = backend_index.get_kernel(g) - if isinstance(g, NativeFunctionsGroup): - if metadata is not None and metadata.structured: - if backend_index.external: - # Structured hasn't been tested with external backends yet. 
- raise AssertionError("Structured external backend functions are not implemented yet.") - else: - return gen_structured(g, backend_index) - else: - return list(mapMaybe(lambda f: gen_unstructured(f, backend_index), g.functions())) - else: - x = gen_unstructured(g, backend_index) - return [] if x is None else [x] diff --git a/tools/codegen/dest/register_dispatch_key.py b/tools/codegen/dest/register_dispatch_key.py deleted file mode 100644 index c555768d08ce..000000000000 --- a/tools/codegen/dest/register_dispatch_key.py +++ /dev/null @@ -1,757 +0,0 @@ -from typing import List, Optional, Union -import itertools -from typing_extensions import Literal -from dataclasses import dataclass -import textwrap - -from tools.codegen.context import method_with_native_function, native_function_manager -from tools.codegen.utils import Target, mapMaybe, assert_never -from tools.codegen.model import (DispatchKey, NativeFunction, - NativeFunctionsGroup, SchemaKind, - TensorOptionsArguments, - DeviceCheckType, Argument, - is_cuda_dispatch_key, BackendIndex, - gets_generated_out_inplace_wrapper) -from tools.codegen.api.types import (BaseCType, Binding, ConstRefCType, - CppSignature, CppSignatureGroup, - Expr, MutRefCType, kernel_signature, - NativeSignature, tensorT, NamedCType, - DispatcherSignature) -import tools.codegen.api.meta as meta -import tools.codegen.api.cpp as cpp -import tools.codegen.api.structured as structured -from tools.codegen.api.translate import translate -from tools.codegen.selective_build.selector import SelectiveBuilder - -def gen_registration_headers( - backend_index: BackendIndex, - per_operator_headers: bool, - rocm: bool, -) -> List[str]: - if per_operator_headers: - headers = ["#include "] - else: - headers = ["#include "] - - if backend_index.dispatch_key in (DispatchKey.CPU, DispatchKey.Meta): - headers.append("#include ") - elif backend_index.dispatch_key == DispatchKey.CUDA: - if rocm: - headers.append("#include ") - else: - headers.append("#include ") - elif per_operator_headers: - headers += [ - "#include ", - "#include "] - else: - headers.append("#include ") - - return headers - -def gen_create_out_helper(backend_index: BackendIndex) -> List[str]: - if backend_index.dispatch_key == DispatchKey.Meta: - empty_options = "options.device(at::kMeta)" - else: - empty_options = "options" - - if backend_index.dispatch_key in ( - DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA): - dispatch = str(backend_index.dispatch_key).lower() - empty_impl = f"at::detail::empty_{dispatch}" - empty_strided_impl = f"at::detail::empty_strided_{dispatch}" - runtime_empty_supported_check = "" - elif backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - empty_impl = "at::empty" - empty_strided_impl = "at::empty_strided" - runtime_empty_supported_check = """\ - if (!c10::detail::backend_supports_empty_operator(options)) {{ - // The main purpose of this CompositeExplicitAutograd kernel is to provide - // a "free" implementation of out-of-place operators. - // If a backend hasn't implemented an out-of-place op but has implemented - // the out= variant, then this kernel will call their out= variant. - // It does that by using at::empty() to create the tensor to pass to the out= variant though, - // so this "default" kernel doesn't actually handle backends that don't support at::empty - // (e.g. quantized backends). - // Returning an undefined tensor here allows us to reach the out= kernel and give a better error. 
- // Longer term, this could be better fixed by https://github.com/pytorch/pytorch/issues/52680 - return at::Tensor(); - }} -""" - else: - return [] - - return [f""" -Tensor create_out(IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) {{ - {runtime_empty_supported_check} - if (strides.empty()) {{ - return {empty_impl}(sizes, {empty_options}); - }} else {{ - return {empty_strided_impl}(sizes, strides, {empty_options}); - }} -}} -"""] - - -def gen_resize_out_helper(backend_index: BackendIndex) -> List[str]: - return [""" -void resize_out(const Tensor &out, IntArrayRef sizes, IntArrayRef strides, const TensorOptions &options) { - TORCH_CHECK(options.dtype() == out.dtype(), - "Expected out tensor to have dtype ", options.dtype(), ", but got ", out.dtype(), " instead"); - TORCH_CHECK(options.device() == out.device(), - "Expected out tensor to have device ", options.device(), ", but got ", out.device(), " instead"); - const bool resized = at::native::resize_output(out, sizes); - // Only restride if a resize occurred; otherwise we ignore the (advisory) - // strides from the meta function and directly use the output tensor's - // preexisting strides - if (resized) { - if (!strides.empty()) { - TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); - at::native::as_strided_(out, sizes, strides); - } else if (options.memory_format_opt().has_value()) { - out.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); - } - } -} -"""] - -def gen_check_inplace_helper(backend_index: BackendIndex) -> List[str]: - return [""" -void check_inplace(const Tensor &self, IntArrayRef sizes, const TensorOptions &options) { - // These checks are needed on those operators that: - // 1) don't use 'TensorIterator' (e.g. 'addmm' and 'baddbmm') - // 2) have particular typing rules (e.g. 'cumsum' and 'cumprod') - // For other operators (e.g. 'add'), 'TensorIterator' already checks - // these things separately. - TORCH_CHECK(options.dtype() == self.dtype(), - "Bad in-place call: ", - "input tensor dtype ", self.dtype(), " and output tensor dtype ", options.dtype(), " should match"); - TORCH_CHECK(options.device() == self.device(), - "Bad in-place call: ", - "input tensor device ", self.device(), " and output tensor device ", options.device(), " should match"); - TORCH_CHECK(sizes == self.sizes(), - "Bad in-place call: ", - "input tensor size ", self.sizes(), " and output tensor size ", sizes, " should match"); -} -"""] - - -def gen_registration_helpers(backend_index: BackendIndex) -> List[str]: - return [ - *gen_create_out_helper(backend_index), - *gen_resize_out_helper(backend_index), - *gen_check_inplace_helper(backend_index) - ] - - -# Generates Register{dispatch}.cpp (e.g., RegisterCPU.cpp). -# -# - The primary function of this file is to register all of the -# implementations for the given dispatch key to the dispatcher, -# so they are available for use in PyTorch. If dispatch is -# None, we generate schema (def) registrations and catchall -# registrations. -# - The secondary function of this file is to generate a wrapper -# around functions. In CPUType these wrappers do nothing -# (and should be removed), but in other cases they handle -# DeviceGuard. A small extra benefit of wrappers is they -# are not overloaded, so they can be used in the registration -# API without having to disambiguate which overload you want -# (as would be the case if you directly registered native:: -# functions). 
-# - The tertiary function of this file is to generate *static* -# cpp API bindings which can be used to bypass dispatcher -# directly to kernels, but with user-friendly cpp-style API -@dataclass(frozen=True) -class RegisterDispatchKey: - backend_index: BackendIndex - - target: Union[ - Literal[Target.ANONYMOUS_DEFINITION], - Literal[Target.NAMESPACED_DEFINITION], - Literal[Target.NAMESPACED_DECLARATION], - Literal[Target.REGISTRATION] - ] - - # Selector object to determine which operators to generate - # registration code for. - selector: SelectiveBuilder - - # Whether or not we are actually code-genning for ROCm - rocm: bool - - # The namespace that the kernels are written in. This is just `at::native` for in-tree kernels. - cpp_namespace: str - - # The class that all unstructured native functions live under. This is used to improve - # compiler error messages when a kernel writer adds a native function with the wrong signature. - # This is only used in unstructured kernels, since structured kernels already live in a class. - # Finally, this field is currently Optional because it is only used by external backends. - # It would be nice if we can add the same logic to in-tree kernels too, but that requires updating - # all of the existing kernel signatures scattered across aten/src/ATen/native. - class_method_name: Optional[str] - - @staticmethod - def gen_device_check(type: DeviceCheckType, args: List[Argument], method_name: str) -> str: - if type == DeviceCheckType.NoCheck: - return ' // No device check\n' - - device_check = 'c10::optional common_device = nullopt;\n' - device_check += '(void)common_device; // Suppress unused variable warning\n' - for arg in args: - # Only tensor like arguments are eligible - if arg.type.is_tensor_like(): - device_check += f""" - c10::impl::check_and_update_common_device(common_device, {arg.name}, "{method_name}", "{arg.name}");""" - return device_check - - @method_with_native_function - def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: - if isinstance(f, NativeFunctionsGroup): - g: NativeFunctionsGroup = f - # Note: We call gen_structured() if the operator is marked structured, regardless of the backend. - # gen_structured() has special logic to handle auto-generated kernels. - if g.structured: - return self.gen_structured(g) - else: - return list(mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions())) - elif isinstance(f, NativeFunction): - r = self.gen_unstructured(f) - return [] if r is None else [r] - else: - assert_never(f) - - def wrapper_kernel_sig(self, f: NativeFunction) -> Union[NativeSignature, DispatcherSignature]: - # The prefix is just to ensure uniqueness. The Dispatcher API doesn't guarantee unique kernel names. 
- return kernel_signature(f, self.backend_index, prefix=f'wrapper_{f.func.name.overload_name}_') - - def gen_out_inplace_wrapper(self, f: NativeFunction, g: Optional[NativeFunctionsGroup]) -> Optional[str]: - if g is None: - return None - k = f.func.kind() - if k is SchemaKind.inplace: - copy_op = 'at::_copy_from' - elif k is SchemaKind.out: - copy_op = 'at::_copy_from_and_resize' - else: - raise AssertionError("gen_out_inplace_wrapper called on a functional op") - - sig = self.wrapper_kernel_sig(f) - name = sig.name() - - func_res = f'{name}_tmp' - return_names = cpp.return_names(f) - if len(return_names) > 1: - updates = '\n '.join( - f'{copy_op}(std::get<{i}>({func_res}), {ret_name});' - for i, ret_name in enumerate(return_names)) - returns = f'{sig.returns_type().cpp_type()}({", ".join(return_names)})' - else: - ret_name = return_names[0] - updates = f'{copy_op}({func_res}, {ret_name});' - returns = ret_name - - functional_sig = self.wrapper_kernel_sig(g.functional) - wrapper_name = sig.name() - - return f"""\ -{sig.defn(name=wrapper_name)} {{ - auto {func_res} = {functional_sig.name()}({", ".join(e.expr for e in translate(sig.arguments(), functional_sig.arguments()))}); - {updates} - return {returns}; -}} -""" - - def gen_structured(self, g: NativeFunctionsGroup) -> List[str]: - metadata = self.backend_index.get_kernel(g) - if self.backend_index.dispatch_key == DispatchKey.Meta: - assert not self.backend_index.has_kernel(g.out), \ - "Do not explicitly specify Meta dispatch key on structured " \ - "functions, they will be automatically generated for you" - elif self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - assert not self.backend_index.has_kernel(g.out), \ - "Do not explicitly specify CompositeExplicitAutograd dispatch key on structured " \ - "functions, they will be automatically generated for you" - elif metadata is None or not metadata.structured: - return list(mapMaybe(lambda f: self.gen_unstructured(f, g), g.functions())) - - structured_gen = StructuredRegisterDispatchKey( - self.backend_index, - self.target, - self.selector, - self.rocm, - self.cpp_namespace, - self.class_method_name, - g - ) - return list(mapMaybe(structured_gen.gen_one, g.functions())) - - def gen_unstructured(self, f: NativeFunction, g: Optional[NativeFunctionsGroup] = None) -> Optional[str]: - with native_function_manager(f): - inplace_meta = False - gets_out_inplace_wrapper = False - if not self.backend_index.has_kernel(f): - if (self.backend_index.dispatch_key == DispatchKey.Meta and - f.func.kind() is SchemaKind.inplace and - # Defer to composites for meta implementation - not f.has_composite_kernel and - # Inplace list operations are not supported - len(f.func.returns) == 1): - inplace_meta = True - elif (not self.backend_index.use_out_as_primary and - g is not None - and gets_generated_out_inplace_wrapper(f, g, self.backend_index)): - # We want to generate inplace/out wrappers, that don't have a kernel for the backend. 
- gets_out_inplace_wrapper = True - else: - return None - if f.manual_kernel_registration: - return None - - if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): - return None - - sig = self.wrapper_kernel_sig(f) - - name = sig.name() - returns_type = sig.returns_type().cpp_type() - args = sig.arguments() - args_str = ', '.join(a.defn() for a in args) - - # See Note [Direct dispatch bindings] - cpp_sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) - - if self.target is Target.NAMESPACED_DECLARATION: - result = f"TORCH_API {cpp_sig_group.signature.decl()};\n" - if cpp_sig_group.faithful_signature is not None: - result += f"TORCH_API {cpp_sig_group.faithful_signature.decl()};\n" - return result - elif self.target is Target.NAMESPACED_DEFINITION: - def generate_defn(cpp_sig: CppSignature) -> str: - return f""" -{cpp_sig.defn()} {{ -return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))}); -}} -""" - result = generate_defn(cpp_sig_group.signature) - if cpp_sig_group.faithful_signature is not None: - result += generate_defn(cpp_sig_group.faithful_signature) - return result - elif self.target is Target.ANONYMOUS_DEFINITION: - # short circuit for inplace_meta - if inplace_meta: - assert f.func.arguments.self_arg is not None - self_arg_name = f.func.arguments.self_arg.argument.name - # TODO: handle in place on tensor list - return f""" -{returns_type} {name}({args_str}) {{ - TORCH_CHECK_NOT_IMPLEMENTED({self_arg_name}.is_meta(), - "Cannot inplace into non-meta tensor with meta tensor argument"); - return {self_arg_name}; -}} -""" - - # short circuit for generated inplace/out wrappers - if gets_out_inplace_wrapper: - return self.gen_out_inplace_wrapper(f, g) - - metadata = self.backend_index.get_kernel(f) - if metadata is None: - return None - if self.class_method_name is None: - impl_name = f"{self.cpp_namespace}::{metadata.kernel}" - else: - impl_name = f"{self.cpp_namespace}::{self.class_method_name}::{metadata.kernel}" - - args_exprs_str = ', '.join(a.name for a in args) - - device_check = ' // No device check\n' - # Backends that require device guards presumably also require device checks. - if self.backend_index.device_guard: - device_check_args = itertools.chain( - f.func.arguments.out, - f.func.arguments.flat_positional - ) - device_check = RegisterDispatchKey.gen_device_check(f.device_check, list(device_check_args), name) - - device_guard = "// DeviceGuard omitted" # default - if f.device_guard and self.backend_index.device_guard: - has_tensor_options = any(isinstance(a.argument, TensorOptionsArguments) for a in args) - if has_tensor_options: - # kernel is creating a tensor - device_guard = """ - const DeviceGuard device_guard(device_or_default(device));""" - - # CUDA requires special handling - if is_cuda_dispatch_key(self.backend_index.dispatch_key): - device_guard = f"globalContext().lazyInitCUDA();\n{device_guard}" - else: - # kernel is operating on existing tensors - - # There is precedence for which argument we use to do - # device guard. This describes the precedence order. 
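To make the device-guard precedence concrete before the selection code that follows, here is a small standalone sketch of the same ordering (self argument first, then out arguments, then the remaining positionals); the argument names and is_tensor_like flags below are invented purely for illustration.

    import itertools

    # (name, is_tensor_like) pairs standing in for the real Argument objects.
    self_arg = [("self", True)]
    out_args = [("out", True)]
    flat_positional = [("alpha", False), ("other", True)]

    # Same shape as the generator's logic: the first tensor-like candidate wins.
    candidate_args = itertools.chain(self_arg, out_args, flat_positional)
    device_of = next((name for name, is_tensor in candidate_args if is_tensor), None)
    print(device_of)  # 'self' -- so the OptionalDeviceGuard is built from self's device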
- self_arg = [f.func.arguments.self_arg.argument] if f.func.arguments.self_arg is not None else [] - candidate_args = itertools.chain( - self_arg, - f.func.arguments.out, - f.func.arguments.flat_positional - ) - - # Only tensor like arguments are eligible - device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) - if device_of is not None: - device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));" - - return f"""\ -namespace {{ - -{returns_type} {name}({args_str}) {{ - {device_check} - - {device_guard} - return {impl_name}({args_exprs_str}); -}} - -}} // anonymous namespace -""" - - elif self.target is Target.REGISTRATION: - if f.manual_kernel_registration: - return None - else: - payload = f"TORCH_FN({name})" - return f'm.impl("{f.func.name}",\n{payload});\n' - else: - assert_never(self.target) - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# STRUCTURED -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -@dataclass(frozen=True) -class StructuredRegisterDispatchKey(RegisterDispatchKey): - g: NativeFunctionsGroup - - def gen_class_set_output(self, k: SchemaKind, parent_class: str, generate_super: bool) -> str: - if generate_super: - set_output_super = f"{parent_class}::set_output(output_idx, sizes, strides, options, names);" - else: - set_output_super = "" - maybe_star = "*" if k is SchemaKind.functional else "" - return f""" -void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, - TensorOptions options, DimnameList names) override {{ -{textwrap.indent(self.gen_class_set_output_body(k), " ")} - if (!names.empty()) {{ - namedinference::propagate_names({maybe_star}outputs_[output_idx], names); - }} - // super must happen after, so that downstream can use maybe_get_output - // to retrieve the output -{textwrap.indent(set_output_super, " ")} -}} -""" - - def gen_class_set_output_body(self, k: SchemaKind) -> str: - if self.backend_index.dispatch_key in [DispatchKey.CUDA, DispatchKey.CompositeExplicitAutograd]: - maybe_set_guard = """ -auto current_device = guard_.current_device(); -if (C10_UNLIKELY(current_device.has_value())) { - TORCH_INTERNAL_ASSERT(*current_device == options.device(), - "structured kernels don't support multi-device outputs"); -} else { - guard_.reset_device(options.device()); -} -""" - maybe_set_guard_line = maybe_set_guard + "\n" - else: - maybe_set_guard_line = maybe_set_guard = '' - - if k is SchemaKind.functional: - assert self.backend_index.dispatch_key in ( - DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA, - DispatchKey.CompositeExplicitAutograd) - return f"""{maybe_set_guard_line} -outputs_[output_idx] = create_out(sizes, strides, options);""" - elif k is SchemaKind.inplace: - return f"""{maybe_set_guard_line} -const auto& out = outputs_[output_idx].get(); -check_inplace(out, sizes, options);""" - elif k is SchemaKind.out: - return f"""{maybe_set_guard_line} -const auto& out = outputs_[output_idx].get(); -resize_out(out, sizes, strides, options);""" - else: - assert_never(k) - - # returns the definition of a ctor, as well as how to construct - # this class to a variable named op - def gen_class_ctor(self, k: SchemaKind, class_name: str, returns: int) -> str: - if k is SchemaKind.functional: - return "" - elif k is SchemaKind.inplace: - # TODO: Make sure out argument is guaranteed to be self - return f"{class_name}(Tensor& self) : outputs_{{std::ref(self)}} {{}}" - elif k is SchemaKind.out: - out_args = ', '.join(f"Tensor& out{i}" 
for i in range(returns)) - out_refs = ', '.join(f"std::ref(out{i})" for i in range(returns)) - return f"{class_name}({out_args}) : outputs_{{ {out_refs} }} {{}}" - else: - assert_never(k) - - def gen_class( - self, f: NativeFunction, k: SchemaKind, *, class_name: str, parent_class: str, generate_super: bool - ) -> str: - maybe_star = '' - if k is SchemaKind.functional: - output_type = "c10::ExclusivelyOwned" - maybe_star = '*' - elif k is SchemaKind.inplace: - output_type = "std::reference_wrapper" - elif k is SchemaKind.out: - output_type = "std::reference_wrapper" - - if self.backend_index.dispatch_key == DispatchKey.CUDA: - if self.rocm: - guard_field = 'c10::hip::OptionalHIPGuardMasqueradingAsCUDA guard_;' - else: - guard_field = 'c10::cuda::OptionalCUDAGuard guard_;' - elif self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - guard_field = 'c10::OptionalDeviceGuard guard_;' - else: - guard_field = '' - - indent = " " * 4 - class_ctor_str = self.gen_class_ctor(k, class_name, len(f.func.returns)) - lines = ( - f"struct {class_name} final : public {parent_class} {{", - f"{textwrap.indent(class_ctor_str, indent)}", - f"{textwrap.indent(self.gen_class_set_output(k, parent_class, generate_super), indent)}", - " const Tensor& maybe_get_output(int64_t output_idx) override {", - f" return {maybe_star}outputs_[output_idx];", - " }", - f" std::array<{output_type}, {len(f.func.returns)}> outputs_;", - f"{textwrap.indent(guard_field, indent)}", - "};" - ) - return '\n'.join(line for line in lines if line) - - @method_with_native_function - def gen_one(self, f: NativeFunction) -> Optional[str]: - assert not f.manual_kernel_registration - - if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): - return None - - # TODO: Now, there is something interesting going on here. In the code below, - # we generate CompositeExplicitAutograd implementations of functional and inplace - # based on the out implementation. But in fact, out is definable by - # functional too (just not very efficiently), and this is honestly the - # MORE likely situation for a backend implementor. How do we pick? - # Well, taking a page from Haskell type classes and default methods, - # we could conceivably register a circular definition (out in terms - # of functional, and functional in terms of out) and just require - # someone to implement one or the other. We'd have to do a little bit - # of work to not register one of these "weak" definitions unless there - # is a strong definition somewhere in the DAG! So it's not implemented yet. - if self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd and f.func.kind() is SchemaKind.out: - # Never generate a default implementation for out, that's what you - # have to define as a backend implementor - return None - - # Note [Direct dispatch bindings] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Signature of the non-dispatched function we'll expose in a header - # (e.g., at::cpu::add). We don't generate methods (TODO: do this - # when CPUTensor class is a thing); nor do we generate fallback - # bindings for manual_cpp_binding functions. 
- cpp_sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) - - # Signature of the wrapper function we'll register to the dispatcher - sig = NativeSignature(f.func, prefix="wrapper_") - - if self.target is Target.NAMESPACED_DECLARATION: - result = f"TORCH_API {cpp_sig_group.signature.decl()};\n" - if cpp_sig_group.faithful_signature is not None: - result += f"TORCH_API {cpp_sig_group.faithful_signature.decl()};\n" - return result - - elif self.target is Target.NAMESPACED_DEFINITION: - def generate_defn(cpp_sig: CppSignature) -> str: - return f""" -{cpp_sig.defn()} {{ -return {sig.name()}({', '.join(e.expr for e in translate(cpp_sig.arguments(), sig.arguments()))}); -}} -""" - result = generate_defn(cpp_sig_group.signature) - if cpp_sig_group.faithful_signature is not None: - result += generate_defn(cpp_sig_group.faithful_signature) - return result - - elif self.target is Target.ANONYMOUS_DEFINITION: - - k = f.func.kind() - - # Construct the body of the wrapper function with signature sig - sig_body = [] - # We'll use context to keep track of any variables we've brought - # into scope while generating code - context: List[Union[Binding, Expr]] = list(sig.arguments()) - - # Initialize the class corresponding to this structured - # operator; feeding it the output argument(s) if it is known - if self.backend_index.dispatch_key is DispatchKey.Meta: - class_name = f"structured_{meta.name(self.g)}_meta_{k.name}" - parent_class = f"at::meta::structured_{meta.name(self.g)}" - elif self.backend_index.dispatch_key is DispatchKey.CompositeExplicitAutograd: - # TODO: dedup this branch - class_name = f"structured_{meta.name(self.g)}_default_backend_{k.name}" - parent_class = f"at::meta::structured_{meta.name(self.g)}" - else: - metadata = self.backend_index.get_kernel(self.g) - assert metadata is not None - class_name = f"structured_{metadata.kernel}_{k.name}" - parent_class = f"{self.cpp_namespace}::structured_{metadata.kernel}" - - if self.backend_index.device_guard: - device_check_args = itertools.chain( - f.func.arguments.out, - f.func.arguments.flat_positional - ) - sig_body.append(RegisterDispatchKey.gen_device_check(f.device_check, list(device_check_args), sig.name())) - - if k is SchemaKind.functional: - sig_body.append(f"{class_name} op;") - elif k is SchemaKind.inplace: - sig_body.append(f"{class_name} op(self);") - elif k is SchemaKind.out: - out_args_str = ', '.join(a.name for a in f.func.arguments.out) - sig_body.append(f"{class_name} op({out_args_str});") - - # Translate the input native arguments into structured - # arguments for the meta call - meta_exprs = ', '.join( - e.expr for e in translate( - context, - structured.meta_arguments(self.g), - method=False - ) - ) - - if self.g.out.precomputed: - # If this function group has precomputed elements, the meta function - # returns a struct containing them which must be saved so that it - # can be unpacked when generating code to call the impl. - sig_body.append(f"auto precompute = op.meta({meta_exprs});") - - # Put all of the contents of the precompute struct into the context - # so that translate will be able to return the correct args for the - # call to the impl. 
- precomputed_values = [*self.g.out.precomputed.replace.values(), self.g.out.precomputed.add] - for precomputed_elems in precomputed_values: - for arg in precomputed_elems: - context.append(Expr( - expr=f"precompute.{arg.name}", - type=structured.argument_type(arg, binds=arg.name), - )) - - # Add a use of the precompute struct so FB internal compilers don't - # complain that there is an unused variable. - sig_body.append("(void)precompute;") - else: - sig_body.append(f"op.meta({meta_exprs});") - - - # After running meta, op.outputs_ is guaranteed to be valid; - # add it to the context - out_args = structured.out_arguments(self.g) - maybe_star = '*' if k is SchemaKind.functional else '' - for i, out_arg in enumerate(out_args): - assert ConstRefCType(BaseCType(tensorT)) == out_arg.nctype.type - context.append(Expr( - expr=f"{maybe_star}op.outputs_[{i}]", - # TODO: Stop hardcoding that the output type is a Tensor. Note - # that for the codegen here this is fine because outputs_ is - # hardcoded to be tensor already - type=NamedCType(out_arg.nctype.name, MutRefCType(BaseCType(tensorT))) - )) - - # With the expanded context, do the impl call (if not a meta - # function) - if self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutograd: - # TODO: https://github.com/pytorch/pytorch/issues/53023 - out_sig_group = CppSignatureGroup.from_native_function( - self.g.out, method=False, fallback_binding=f.manual_cpp_binding) - out_sig = out_sig_group.most_faithful_signature() - api_name = out_sig.name() - out_exprs = ', '.join( - e.expr for e in translate( - context, - out_sig.arguments(), - method=False - ) - ) - # TODO: I think this means structured won't work with method - # only functions (but maybe you're saved by faithful? iunno.) - # NB: Originally I wrote this as an at::redispatch call, but - # I got in trouble because that meant I needed a DispatchKeySet - # in the wrapper function, which meant I needed a DispatchKeySet - # in the DispatchKeyFunctions declarations, but the defined API - # there does NOT permit a dispatch key set. 
I think you can - # probably unwind this by calling some function to do the TLS - # fetch and get the DispatchKeySet when you don't have it, but - # I didn't do it for this version - sig_body.append(f"at::{api_name}({out_exprs});") - elif self.backend_index.dispatch_key != DispatchKey.Meta: - impl_exprs = ', '.join( - e.expr for e in translate( - context, - structured.impl_arguments(self.g), - method=False - ) - ) - sig_body.append(f"op.impl({impl_exprs});") - - # Destructively return the final tensors - # TODO: Do this in translate instead - if k is SchemaKind.functional: - if len(f.func.returns) == 1: - ret_expr = "std::move(op.outputs_[0]).take()" # small optimization - else: - moved = ', '.join(f"std::move(op.outputs_[{i}]).take()" for i in range(len(f.func.returns))) - ret_expr = f"std::make_tuple({moved})" - elif k is SchemaKind.inplace: - ret_expr = "self" - elif k is SchemaKind.out: - if len(f.func.returns) == 1: - ret_expr = f.func.arguments.out[0].name - else: - refs = ', '.join(a.name for a in f.func.arguments.out) - ret_expr = f"std::forward_as_tuple({refs})" - sig_body.append(f"return {ret_expr};") - - sig_body_str = "\n".join(sig_body) - - # For an overview of what this template code looks like, see - # https://github.com/pytorch/rfcs/pull/9 - return f"""\ -{self.gen_class( -f, k, -class_name=class_name, -parent_class=parent_class, -generate_super=self.g.out.structured_inherits is not None -)} - -{sig.defn()} {{ -{sig_body_str} -}} -""" - - elif self.target is Target.REGISTRATION: - return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));' - else: - assert_never(self.target) - # Silence mypy's "Missing return statement" error - return None diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py deleted file mode 100644 index 1c2c83029d0a..000000000000 --- a/tools/codegen/gen.py +++ /dev/null @@ -1,1704 +0,0 @@ -import os -from typing import List, Dict, Optional, Tuple, Set, Any, Union, Sequence, TypeVar -from typing_extensions import Literal -import yaml -from collections import OrderedDict, defaultdict, namedtuple -import argparse -import pathlib -import json -from dataclasses import dataclass - -from tools.codegen.model import (Argument, DispatchKey, FunctionSchema, - Location, NativeFunction, - NativeFunctionsGroup, OperatorName, - BackendIndex, BackendMetadata, - OptionalType, SchemaKind, SelfArgument, - TensorOptionsArguments, Type, Variant, - is_cuda_dispatch_key, - is_generic_dispatch_key, - Tag, BaseOperatorName) -from tools.codegen.api.types import (Binding, CppSignature, CppSignatureGroup, - DispatcherSignature, NativeSignature) -from tools.codegen.api import cpp -import tools.codegen.api.dispatcher as dispatcher -import tools.codegen.api.native as native -import tools.codegen.api.meta as meta -import tools.codegen.api.structured as structured -from tools.codegen.api.translate import translate -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.utils import ( - Target, concatMap, context, mapMaybe, YamlDumper, YamlLoader, FileManager, assert_never -) -from tools.codegen.context import (method_with_native_function, - native_function_manager, - with_native_function_and_indices, - with_native_function) -import tools.codegen.dest as dest -from tools.codegen.gen_functionalization_type import ( - needs_functionalization, - gen_functionalization_definition, - gen_functionalization_registration, - gen_functionalization_view_inverse_declaration -) - -T = TypeVar('T') - -# Welcome to the ATen code generator v2! 
The ATen code generator is -# responsible for parsing native_functions.yaml and then generating -# various generated files (e.g., TypeDefault.cpp) based on the operators -# defined in this file. This means that the code generator knows how to -# parse function schema, and then translate this into various C++ types -# and boilerplate code. -# -# Some things to know about this file when you modify it: -# -# - This file has STRICT mypy typechecking. Typecheck it with -# `mypy --config mypy-strict.ini` in the root source directory -# -# - Most of the heavy lifting lives in external modules: -# - 'model' has the data model for native_functions.yaml. The classes -# in those file represent what you see when you look at -# a native_functions.yaml -# - 'api' has conversions for how to translate JIT schema into -# the various C++ APIs that the codegen interacts with. There -# are in fact THREE different C++ APIs: the public C++ API, -# the dispatcher API, and the legacy disaptcher API. See each -# of these respective files for more information - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# HELPER FUNCTIONS -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# A custom loader for YAML to let us also keep track of line numbers -# of each entry in the YAML file -class LineLoader(YamlLoader): - def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] - mapping = super().construct_mapping(node, deep=deep) # type: ignore[no-untyped-call] - # Add 1 so line numbering starts at 1 - mapping['__line__'] = node.start_mark.line + 1 - return mapping - -_GLOBAL_PARSE_NATIVE_YAML_CACHE = {} - -# Parse native_functions.yaml into a sequence of NativeFunctions and Backend Indices. -ParsedYaml = namedtuple('ParsedYaml', ['native_functions', 'backend_indices']) -def parse_native_yaml(path: str) -> ParsedYaml: - global _GLOBAL_PARSE_NATIVE_YAML_CACHE - if path not in _GLOBAL_PARSE_NATIVE_YAML_CACHE: - with open(path, 'r') as f: - es = yaml.load(f, Loader=LineLoader) - assert isinstance(es, list) - rs: List[NativeFunction] = [] - bs: Dict[DispatchKey, Dict[OperatorName, BackendMetadata]] = defaultdict(dict) - for e in es: - assert isinstance(e.get('__line__'), int), e - loc = Location(path, e['__line__']) - funcs = e.get('func') - with context(lambda: f'in {loc}:\n {funcs}'): - func, m = NativeFunction.from_yaml(e, loc) - rs.append(func) - BackendIndex.grow_index(bs, m) - error_check_native_functions(rs) - # Default dict is to prevent the codegen from barfing when we have a dispatch key that has no kernels yet. - indices: Dict[DispatchKey, BackendIndex] = defaultdict(lambda: BackendIndex( - dispatch_key=DispatchKey.Undefined, - use_out_as_primary=True, - external=False, - device_guard=False, - index={})) - for k, v in bs.items(): - # All structured in-tree operators are implemented in terms of their out operator. - indices[k] = BackendIndex( - dispatch_key=k, - use_out_as_primary=True, - external=False, - # Only cuda-like devices in tree require device guards - device_guard=is_cuda_dispatch_key(k), - index=v) - _GLOBAL_PARSE_NATIVE_YAML_CACHE[path] = ParsedYaml(rs, indices) - - return _GLOBAL_PARSE_NATIVE_YAML_CACHE[path] - -# Some assertions are already performed during parsing, but those are only within a single NativeFunction. -# Assertions here are meant to be performed across NativeFunctions. 
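As an aside on the LineLoader above: the same line-tracking trick works with a plain yaml.SafeLoader, which makes it easy to try outside the repo. The snippet below is a standalone sketch (the two sample entries are invented); PyYAML records a 0-based start_mark.line per node, hence the +1.

    import yaml

    class LineTrackingLoader(yaml.SafeLoader):
        def construct_mapping(self, node, deep=False):
            mapping = super().construct_mapping(node, deep=deep)
            mapping['__line__'] = node.start_mark.line + 1  # 1-based, like an editor
            return mapping

    doc = ("- func: add.Tensor(Tensor self, Tensor other) -> Tensor\n"
           "- func: mul.Tensor(Tensor self, Tensor other) -> Tensor\n")
    for entry in yaml.load(doc, Loader=LineTrackingLoader):
        print(entry['__line__'], entry['func'])
    # 1 add.Tensor(Tensor self, Tensor other) -> Tensor
    # 2 mul.Tensor(Tensor self, Tensor other) -> Tensor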
-def error_check_native_functions(funcs: Sequence[NativeFunction]) -> None: - func_map: Dict[OperatorName, NativeFunction] = {} - base_func_map: Dict[BaseOperatorName, List[NativeFunction]] = defaultdict(list) - for f in funcs: - func_map[f.func.name] = f - base_func_map[f.func.name.name].append(f) - for f in funcs: - if f.structured_delegate is not None: - delegate_func = func_map[f.structured_delegate] - assert delegate_func.structured, \ - f"{f.func.name} is marked as a structured_delegate pointing to " \ - f"{f.structured_delegate}, but {f.structured_delegate} is not marked as structured. " \ - f"Consider adding 'structured=True' to the delegated operator" - if f.tag is not None and f.tag is Tag.inplace_view: - base_name = f.func.name.name - overload_name = f.func.name.overload_name - assert base_name.inplace, \ - f"{f.func.name} is marked with tag: inplace_view, but it doesn't follow the naming " \ - "convention for inplace ops - the codegen expects the base name to have a trailing underscore. " - out_of_place_base_name = BaseOperatorName(base_name.base, False, base_name.dunder_method) - assert len(base_func_map[out_of_place_base_name]) > 0, \ - f"{f.func.name} is marked with tag: inplace_view. The codegen expects there to be a corresponding " \ - f"out-of-place view op with the name '{base_name}' and matching schema, but it didn't find one. " - - -def cpp_string(s: str) -> str: - """Convert a python string into a c++ string literal """ - s = s.replace('\\', '\\\\') - s = s.replace('"', '\\"') - s = s.replace('\a', '\\a') - s = s.replace('\b', '\\b') - s = s.replace('\f', '\\f') - s = s.replace('\n', '\\n') - s = s.replace('\v', '\\v') - s = s.replace('\t', '\\t') - return f'"{s}"' - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# C++ CODE GENERATION -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -# Most functions in this section are curried: they consist of a function -# that takes some parameters (e.g., what is to be generated) which itself -# returns a function that actually maps NativeFunction to the code -# to be generated. This pattern makes it convenient to use map, concatMap -# and similar functional combinators. - -def static_dispatch_keys(backend: Optional[BackendIndex]) -> List[DispatchKey]: - if backend is None: - return [] - else: - return [ - backend.dispatch_key, - DispatchKey.CompositeImplicitAutograd, - DispatchKey.CompositeExplicitAutograd - ] - -def get_static_dispatch_backend(f: NativeFunction, backend_index: BackendIndex) -> Optional[DispatchKey]: - if (f.structured_delegate is not None or backend_index.has_kernel(f)): - # TODO: for ops with structured_delegate it should check the dispatch table of - # the out variant instead. For now, these structured ops all have CPU/CUDA kernels - # so we always dispatch to the `backend`, but this could be wrong when we - # migrate math/default_backend ops to use structured delegate. 
- return backend_index.dispatch_key - elif f.has_composite_explicit_autograd_kernel: - return DispatchKey.CompositeExplicitAutograd - elif f.has_composite_implicit_autograd_kernel: - return DispatchKey.CompositeImplicitAutograd - return None - - -def static_dispatch_ops_header( - f: NativeFunction, - backend_index: Optional[BackendIndex]) -> Optional[str]: - if backend_index is None or f.manual_kernel_registration: - return None - - dispatch_key = get_static_dispatch_backend(f, backend_index) - return (f'#include ' - if dispatch_key is not None else None) - - -def static_dispatch_extra_headers(backend: Optional[BackendIndex], skip_tensor_include: bool = False) -> List[str]: - if skip_tensor_include: - # See Note [Avoiding Include Cycles In Static Dispatch] - maybe_inl = '_inl' - else: - maybe_inl = '' - return [f'#include ' - for dispatch_key in static_dispatch_keys(backend)] - - -def static_dispatch( - f: NativeFunction, cpp_sig: CppSignature, - *, method: bool, backend_index: Optional[BackendIndex] -) -> Optional[str]: - if backend_index is None or f.manual_kernel_registration: - return None - target_sig = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False).signature - name = target_sig.name() - exprs = translate(cpp_sig.arguments(), target_sig.arguments(), method=method) - exprs_str = ', '.join(a.expr for a in exprs) - - dispatch_key = get_static_dispatch_backend(f, backend_index) - if dispatch_key is not None: - return f'return at::{dispatch_key.lower()}::{name}({exprs_str});' - - return f'TORCH_CHECK(false, "Static dispatch does not support {name} for {backend_index.dispatch_key}.");' - -# Generates RegisterSchema.cpp. Depending on the selector, either -# all schemas are registered, or only some are (in the case of -# selective build) -@dataclass(frozen=True) -class RegisterSchema: - selector: SelectiveBuilder - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if not self.selector.is_native_function_selected(f): - return None - return f'm.def({cpp_string(str(f.func))});\n' - -# Generates Operators.h and Operators.cpp. -# These provide macros that, given an operator and overload name, allow users -# to access an "un-overloaded" function version of the operator. This -# is useful for extension writers who want to (1) want to decltype the operator -# and (2) don't want to worry about method-only operators. -@dataclass(frozen=True) -class ComputeOperators: - target: Union[ - Literal[Target.DECLARATION], - Literal[Target.DEFINITION] - ] - - @method_with_native_function - def __call__(self, f: NativeFunction) -> str: - sig = DispatcherSignature.from_schema(f.func) - name = f.func.name.unambiguous_name() - call_method_name = 'call' - redispatch_method_name = 'redispatch' - - if self.target is Target.DECLARATION: - # Note [The ATen Operators API] - # The ATen Operators API lives in the at::_ops namespace, and contains compile-time - # metadata about each operator + entry points into the Dispatcher. - # The C++ function, method, and redispatch API's are all implemented as wrappers - # into various bits of the structs defined here. - # - # Important characteristics about the Operators API: - # (1) It follows the Dispatcher API. - # This is kind of necessary to avoid overhead. - # For example: if it followed the C++ API, then all of the faithful C++ factory functions - # would need to wrap their arguments into TensorOptions only to unwrap them again. - # (2) Overload names are disambiguated. 
- # This is helpful for pytorch extenders who would like to decltype() an aten operator, - # that has overloads, e.g. decltype(at::_ops::mul_Tensor::call) - # (3) No argument defaulting is allowed. - # This is more of an implementation detail to avoid #include cycles, - # since TensorBody.h (which defines the Tensor class) needs to include this file. - # (4) manual_cpp_bindings and faithful names are not included in the API. - # This applies to stuff like __dispatch__is_complex(), and add_outf(). - # These aren't "real aten ops", they're just additional functions provided by the C++ API. - # They're implemented as wrappers in Functions.h that call into the actual operators - # defined here, i.e. at::_ops::is_complex::call() and at::_ops::add_out::call(). - # This means that ATEN_OP(is_complex) will not fastpath, and will go through the dispatcher. - return f""" -struct TORCH_API {name} {{ - using schema = {sig.type()}; - using ptr_schema = schema*; - // See Note [static constexpr char* members for windows NVCC] - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::{f.func.name.name}") - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "{f.func.name.overload_name}") - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, {cpp_string(str(f.func))}) - static {sig.defn(name=call_method_name, is_redispatching_fn=False)}; - static {sig.defn(name=redispatch_method_name, is_redispatching_fn=True)}; -}};""" - elif self.target is Target.DEFINITION: - defns = f""" -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, name, "aten::{f.func.name.name}") -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, overload_name, "{f.func.name.overload_name}") -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, schema_str, {cpp_string(str(f.func))}) - -// aten::{f.func} -static C10_NOINLINE c10::TypedOperatorHandle<{name}::schema> create_{name}_typed_handle() {{ - return c10::Dispatcher::singleton() - .findSchemaOrThrow({name}::name, {name}::overload_name) - .typed<{name}::schema>(); -}} -""" - - for is_redispatching_fn in [False, True]: - if is_redispatching_fn: - dispatcher_exprs_str = ', '.join(['dispatchKeySet'] + [a.name for a in sig.arguments()]) - dispatcher_call = 'redispatch' - method_name = f'{name}::{redispatch_method_name}' - else: - dispatcher_exprs_str = ', '.join([a.name for a in sig.arguments()]) - dispatcher_call = 'call' - method_name = f'{name}::{call_method_name}' - - defns += f""" -// aten::{f.func} -{sig.defn(name=method_name, is_redispatching_fn=is_redispatching_fn)} {{ - static auto op = create_{name}_typed_handle(); - return op.{dispatcher_call}({dispatcher_exprs_str}); -}} -""" - return defns - else: - assert_never(self.target) - - -# Generates Function.h, which provides the functional public C++ API, -# and the scaffolding to call into the dispatcher from these functions. 
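For readers skimming the Note [The ATen Operators API] above: the unambiguous struct names are built from the base name plus the overload name, which is what makes decltype(at::_ops::mul_Tensor::call) spellable. The real rule lives in tools.codegen.model; the one-liner below is only a sketch of the naming shape.

    def unambiguous_name(base: str, overload: str) -> str:
        # e.g. ('mul', 'Tensor') -> 'mul_Tensor'; no overload name -> just the base name
        return f"{base}_{overload}" if overload else base

    print(unambiguous_name("mul", "Tensor"))  # mul_Tensor -> at::_ops::mul_Tensor::call
    print(unambiguous_name("add", "out"))     # add_out    -> at::_ops::add_out::call
    print(unambiguous_name("relu", ""))       # relu       -> at::_ops::relu::call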
-@dataclass(frozen=True) -class ComputeFunction: - static_dispatch_backend_index: Optional[BackendIndex] - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if Variant.function not in f.variants: - return None - - sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=f.manual_cpp_binding) - - def generate_defn(faithful: bool) -> str: - if faithful: - sig = sig_group.faithful_signature - assert sig is not None - else: - sig = sig_group.signature - - # See Note [The ATen Operators API] - target_sig = DispatcherSignature.from_schema(f.func) - exprs = translate(sig.arguments(), target_sig.arguments()) - exprs_str = ', '.join([e.expr for e in exprs]) - - static_dispatch_block = static_dispatch(f, sig, method=False, backend_index=self.static_dispatch_backend_index) - if static_dispatch_block is None: - return f""" -// aten::{f.func} -TORCH_API inline {sig.decl()} {{ - return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); -}} -""" - else: - return f""" -// aten::{f.func} -TORCH_API inline {sig.decl()} {{ - {static_dispatch_block} -}} -""" - result = generate_defn(False) - if sig_group.faithful_signature is not None: - result += generate_defn(True) - - return result - -# Generates TensorBody.h. This file provides the object-oriented (method-based) -# public C++ API, and the scaffolding to call into the dispatcher from these functions. -@dataclass(frozen=True) -class ComputeTensorMethod: - target: Union[ - Literal[Target.DECLARATION], - Literal[Target.DEFINITION] - ] - static_dispatch_backend_index: Optional[BackendIndex] - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if Variant.method not in f.variants: - return None - - assert not f.func.is_out_fn() - assert f.func.arguments.self_arg is not None - - sig_group = CppSignatureGroup.from_native_function(f, method=True, fallback_binding=f.manual_cpp_binding) - - if self.target is Target.DECLARATION: - result = f"{sig_group.signature.decl()} const;\n" - if sig_group.faithful_signature is not None: - result += f"{sig_group.faithful_signature.decl()} const;\n" - return result - - if self.target is not Target.DEFINITION: - assert_never(self.target) - - def generate_defn(faithful: bool) -> str: - if faithful: - sig = sig_group.faithful_signature - assert sig is not None - else: - sig = sig_group.signature - - target_sig = DispatcherSignature.from_schema(f.func) - exprs = translate(sig.arguments(), target_sig.arguments(), method=True) - exprs_str = ', '.join([e.expr for e in exprs]) - - static_dispatch_block = static_dispatch(f, sig, method=True, backend_index=self.static_dispatch_backend_index) - if static_dispatch_block is None: - return f""" -// aten::{f.func} -inline {sig.defn(prefix="Tensor::")} const {{ - return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); -}} -""" - else: - return f""" -// aten::{f.func} -inline {sig.defn(prefix="Tensor::")} const {{ - {static_dispatch_block} -}} -""" - - result = generate_defn(faithful=False) - if sig_group.faithful_signature is not None: - result += generate_defn(faithful=True) - - return result - -# Generates RedispatchFunctions.h. -# This is similar to the C++ API defined in Functions.h, but provides access -# to the dispatcher's redispatch API. -@dataclass(frozen=True) -class ComputeRedispatchFunction: - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - # We unconditionally generate function variants of the redispatch API. 
- # This is mainly because we can namespace functions separately, but not methods, - sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=f.manual_cpp_binding) - - def generate_defn(faithful: bool) -> str: - if faithful: - sig = sig_group.faithful_signature - assert sig is not None - else: - sig = sig_group.signature - - target_sig = DispatcherSignature.from_schema(f.func) - exprs = translate(sig.arguments(), target_sig.arguments()) - exprs_str = ', '.join(['dispatchKeySet'] + [a.expr for a in exprs]) - - return f""" -// aten::{f.func} -TORCH_API inline {sig.decl(is_redispatching_fn=True)} {{ - return at::_ops::{f.func.name.unambiguous_name()}::redispatch({exprs_str}); -}} -""" - result = generate_defn(False) - if sig_group.faithful_signature is not None: - result += generate_defn(True) - - return result - - -# Generates ATenOpList.cpp, a runtime accessible list of all aten -# operators. -# TODO: This was historically used to help some JIT interop code -# figure out whether or not to treat aten namespace'd operators -# one way or another, we should reevaluate if this is actually needed. -@with_native_function -def compute_aten_op(f: NativeFunction) -> str: - return f'{{"aten::{f.func.name.name}", "{f.func.name.overload_name}"}},' - -# Generates MetaFunctions.h -def compute_meta_function_declaration(g: NativeFunctionsGroup) -> Optional[str]: - if not g.structured: - return None - with native_function_manager(g.out): - name = meta.name(g) - args = structured.meta_arguments(g) - args_str = ', '.join(a.decl() for a in args) - parent_class = g.out.structured_inherits - if parent_class is None: - parent_class = "at::impl::MetaBase" - meta_return = "void" - precomputed = g.out.precomputed if g.structured else None - - if precomputed: - # Generate the template declaration with one bool parameter for each - # precomputed element. Each parameter is true if the corresponding (in - # terms of position) precomputed element has been set. - precomputed_values = [*precomputed.replace.values(), precomputed.add] - precomputed_elements = [elem for replace_list in precomputed_values for elem in replace_list] - precomputed_template_parameters = [elem.name.upper() for elem in precomputed_elements] - precomputed_template_params_str = ", ".join(f"bool {param} = false" for param in precomputed_template_parameters) - precompute_template_decl = f"template <{precomputed_template_params_str}>" - - # Generate a string containing declarations of all precomputed elements. - precomputed_elements_with_cpp_types = [ - structured.argument_type(elem, binds=elem.name) - for elem in precomputed_elements - ] - - precomputed_elements_decl = ";\n".join( - f"{elem.cpp_type(strip_ref=True)} {elem.name}" for elem in precomputed_elements_with_cpp_types - ) - - # Generate "setter" methods for each precomputed element. Each method will return - # a new instance of precompute_out with the template parameter that corresponds to - # the member set by the method to true (to indicate that it has been set). - setter_methods = [] - for i, elem in enumerate(precomputed_elements): - # Generate the signature. The return type will be the same - # as the type of `this` but with the template parameter - # corresponding to the element set by this method set to true. - # The assert generated below will ensure that this template - # parameter is false on the type of `this`. 
- return_ty_templates = ", ".join( - precomputed_template_parameters[:i] + ["true"] + precomputed_template_parameters[i + 1:] - ) - return_ty = f"precompute_out<{return_ty_templates}>" - elem_cpp_ty = precomputed_elements_with_cpp_types[i].cpp_type(strip_ref=True) - signature = f"{return_ty} set_{elem.name}({elem_cpp_ty} value)" - - # Generate an assert which checks that the - # template parameter corresponding to the precomputed - # element that is set by this method is false on the - # class corresponding to the object that `this` points to. - # This ensures that each element can be set only once. - assert_msg = f"\"{precomputed_elements[i].name} already set\"" - assert_stmt = f"static_assert({precomputed_template_parameters[i]} == false, {assert_msg});" - - # Generate the new object construction block. All state - # except the element that this method sets is copied from the - # object that `this` points to. The value for the element that - # the method sets is taken from a method parameter. - construction_stmts = [] - construction_stmts.append(f"{return_ty} ret;") - - for j, elem in enumerate(precomputed_elements): - if i == j: - construction_stmts.append(f"ret.{elem.name} = value;") - else: - construction_stmts.append(f"ret.{elem.name} = this->{elem.name};") - - construction_stmts.append("return ret;") - construction_block = "\n".join(construction_stmts) - - setter_methods.append(f""" - {signature} {{ - {assert_stmt} - {construction_block} - }} - """) - setter_methods_decl = "\n".join(setter_methods) - - # Meta should return an instance of the struct containing the precomputed elements. - meta_return_template_params = ", ".join(["true"] * len(precomputed_template_parameters)) - # This typedef (actually a using statement) is needed so that TORCH_META_FUNC can reuse the return - # type (which has a variable number of template parameters). - meta_return_typedef = f"using meta_return_ty = precompute_out <{meta_return_template_params}>;" - meta_return = "meta_return_ty" - precomputed_decl = f""" - {precompute_template_decl} - struct TORCH_API precompute_out {{ - {setter_methods_decl} - {precomputed_elements_decl}; - }};""" - else: - meta_return_typedef = "" - precomputed_decl = "" - - return f"""\ -struct TORCH_API structured_{name} : public {parent_class} {{ - {precomputed_decl} - {meta_return_typedef} - {meta_return} meta({args_str}); -}}; -""" - - -def needs_backend_select(f: NativeFunction, selector: SelectiveBuilder) -> bool: - name = str(f.func.name.name) - if name.endswith('_like') or name.startswith('new_'): - return False - if f.func.arguments.tensor_options is None: - return False - return selector.is_native_function_selected(f) - - -# Generates RegisterBackendSelect.cpp, a series of kernels which provide -# specialized computation of dispatch key for operator signatures which cannot -# be easily done automatically using templating. -@dataclass(frozen=True) -class ComputeBackendSelect: - target: Union[ - Literal[Target.DEFINITION], - Literal[Target.REGISTRATION] - ] - - # Selector object to determine which operators to generate - # registration code for. 
- selector: SelectiveBuilder - - @method_with_native_function - def __call__(self, f: NativeFunction) -> Optional[str]: - if not needs_backend_select(f, self.selector): - return None - - name = native.name(f.func) - native_sig = NativeSignature(f.func) - - native_tensor_args = [ - a for a in native_sig.arguments() - if isinstance(a.argument, Argument) and a.argument.type.is_tensor_like() - ] - - dispatcher_sig = DispatcherSignature.from_schema(f.func) - - sig: Union[NativeSignature, DispatcherSignature] - sig = dispatcher_sig - dispatcher_exprs = dispatcher_sig.exprs() - dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" - - if self.target is Target.DEFINITION: - # I don't think there's actually a good reason to generate - # these two cases differently - # The first case could probably be improved though- it calls computeDispatchKeySet(), - # which looks at TLS dispatch keys- there should not be any by the time we reach backend select. - if native_tensor_args: - tensor_args = ', '.join(a.name for a in native_tensor_args) - compute_dk = f"""\ -DispatchKeySet _dk_set = c10::DispatchKeySet({dispatch_key}) | c10::detail::multi_dispatch_key_set({tensor_args}); - DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect); - DispatchKeySet _dk = c10::impl::computeDispatchKeySet(_dk_set, _dk_mask);""" - else: - compute_dk = f"DispatchKeySet _dk = c10::DispatchKeySet({dispatch_key});" - return f"""\ -// aten::{f.func} -C10_ALWAYS_INLINE -{sig.defn(name)} {{ - {compute_dk} - return at::_ops::{f.func.name.unambiguous_name()}::redispatch( - _dk, {', '.join(a.expr for a in dispatcher_exprs)}); -}} -""" - elif self.target is Target.REGISTRATION: - return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" - else: - assert_never(self.target) - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# YAML CODE GENERATION -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def format_yaml(data: object) -> str: - # Ignore alias in Dumper - YamlDumper.ignore_aliases = lambda self, data: True # type: ignore[assignment] - - # Support serializing OrderedDict - def dict_representer(dumper: Any, data: Any) -> Any: - return dumper.represent_dict(data.items()) - YamlDumper.add_representer(OrderedDict, dict_representer) # type: ignore[no-untyped-call] - # Some yaml parsers (e.g. Haskell's) don't understand line breaks. - # width=1e9 turns off optional line breaks and improves - # the portability of the outputted yaml. - return yaml.dump(data, default_flow_style=False, Dumper=YamlDumper, width=1e9) # type: ignore[no-any-return] - -# For some reason, some defaults we write to YAML are written as native -# YAML objects, rather than doing them uniformly as strings. This -# function detects those cases and converts them into native Python -# objects. -def pythonify_default(s: str) -> object: - if s == 'true': - return True - elif s == 'false': - return False - - try: - return int(s) - except ValueError: - try: - return float(s) - except ValueError: - return s - -# What is a dynamic type? Over time, the semantic meaning of -# dynamic type has degraded to meaninglessness (in the old days, -# it captured dtype-ness of types, but that has gone away with -# the removal of TH). These days, it's mostly the same thing as -# the C++ API argument type, except that Tensor and Tensor? -# arguments simply present as Tensor. 
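A quick behaviour check of pythonify_default defined above, restated standalone so it runs in isolation (the sample default strings are invented):

    def pythonify_default(s: str):
        if s == 'true':
            return True
        if s == 'false':
            return False
        try:
            return int(s)
        except ValueError:
            try:
                return float(s)
            except ValueError:
                return s

    for raw in ('true', 'false', '1', '1e-05', 'at::kLong'):
        print(repr(raw), '->', repr(pythonify_default(raw)))
    # 'true' -> True, 'false' -> False, '1' -> 1, '1e-05' -> 1e-05, 'at::kLong' -> 'at::kLong'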
-# -# TODO: Get rid of dynamic_type, after getting tools/autograd -# to use the new codegen framework -def dynamic_type(t: Type) -> str: - if isinstance(t, OptionalType): - return dynamic_type(t.elem) - # Note we don't use t.is_tensor_like() here because it would - # also include Tensor[] - if str(t) == 'Tensor': - return 'at::Tensor' - return cpp.argumenttype_type(t, mutable=False, binds='__placeholder__').cpp_type() - -def compute_method_of_yaml(variants: Set[Variant]) -> List[str]: - # This is written out explicitly to ensure that Tensor and - # namespace are put into the list in the right order - method_of = ['Type'] - if Variant.method in variants: - method_of.append('Tensor') - if Variant.function in variants: - method_of.append('namespace') - return method_of - -def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[str, str]]: - # Note [name and field_name] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~ - # To understand name_to_field_name, we must first talk about this - # schema: - # - # lstsq.X(Tensor self, Tensor A, *, Tensor(a!) X, Tensor(b!) qr) -> (Tensor(a!) solution, Tensor(b!) QR) - # - # There is something very odd about this schema: it is an out - # variant of the function (that is to say, it will convert into - # at::lstsq_out() in the C++ API), but the names of the output - # return arguments don't match the keyword argument names of - # the inputs. It TURNS OUT that in this situation, the historical - # Declarations.yaml we want to output is this (abbreviated to - # only show relevant fields): - # - # arguments: - # ... - # - field_name: solution - # name: X - # - field_name: QR - # name: qr - # ... - # - # returns: - # - field_name: solution - # name: X - # - field_name: QR - # name: qr - # - # The name of the return fields is stored in 'field_name', and the - # name of the arguments is stored in 'name'. So when we process - # arguments, we need a way to get at the corresponding return. At - # the moment, this is most conveniently done by constructing a - # mapping from name (the argument concept) to field_name (the - # return concept) while processing return arguments, since we don't - # directly maintain this correspondence in the modeling of function - # schema itself. 
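A concrete rendering of the mapping described in Note [name and field_name], using the lstsq.X schema from the comment (argument and field names are taken from the Note; the dict itself is just an illustration of what compute_returns_yaml builds):

    # out arguments of lstsq.X paired positionally with its named returns
    out_arg_names = ['X', 'qr']
    return_field_names = ['solution', 'QR']

    name_to_field_name = dict(zip(out_arg_names, return_field_names))
    print(name_to_field_name)   # {'X': 'solution', 'qr': 'QR'}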
- # - # See also https://github.com/pytorch/pytorch/issues/43114 - name_to_field_name: Dict[str, str] = {} - - # Compute the returns field of the YAML entry - names = cpp.return_names(f) - returns = [] - for i, (r, name) in enumerate(zip(f.func.returns, names)): - ret = { - 'dynamic_type': dynamic_type(r.type), - 'name': name, - 'type': cpp.return_type(r).cpp_type(), - } - - if r.name: - # See Note [name and field_name] - ret['field_name'] = r.name - if f.func.is_out_fn(): - name_to_field_name[f.func.arguments.out[i].name] = r.name - - returns.append(ret) - - return returns, name_to_field_name - -# arguments in yaml roughly corresponds to the public C++ API -def compute_cpp_argument_yaml(cpp_a: Binding, *, schema_order: bool, kwarg_only_set: Set[str], - out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: - if isinstance(cpp_a.argument, TensorOptionsArguments): - arg: Dict[str, object] = { - 'annotation': None, - 'dynamic_type': 'at::TensorOptions', - 'is_nullable': False, - 'name': cpp_a.name, - 'type': cpp_a.type, - 'kwarg_only': True, - } - if cpp_a.default is not None: - arg['default'] = cpp_a.default - return arg - elif isinstance(cpp_a.argument, SelfArgument): - raise AssertionError() - elif isinstance(cpp_a.argument, Argument): - return compute_argument_yaml( - cpp_a.argument, schema_order=schema_order, - kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) - -def compute_argument_yaml(a: Argument, *, schema_order: bool, kwarg_only_set: Set[str], - out_arg_set: Set[str], name_to_field_name: Dict[str, str]) -> object: - arg: Dict[str, object] = { - 'annotation': str(a.annotation) if a.annotation else None, - 'dynamic_type': dynamic_type(a.type), - 'is_nullable': a.type.is_nullable(), - 'name': a.name, - 'type': cpp.argument_type(a, binds="__placeholder__").cpp_type(), - } - if a.default is not None: - arg['default'] = pythonify_default(cpp.default_expr(a.default, a.type)) - if a.name in kwarg_only_set: - arg['kwarg_only'] = True - if a.name in out_arg_set: - arg['output'] = True - arg['allocate'] = True - # See Note [name and field_name] - if a.name in name_to_field_name: - arg['field_name'] = name_to_field_name[a.name] - # Historically, booleans don't get their size recorded, because it - # is already built into the cpp type (e.g., std::array) - l = a.type.is_list_like() - if l is not None and l.size is not None and str(l.elem) != 'bool': - arg['size'] = l.size - return arg - -@with_native_function -def compute_declaration_yaml(f: NativeFunction) -> object: - returns, name_to_field_name = compute_returns_yaml(f) - - # These sets are used to conveniently test if an argument is a - # kwarg-only or out argument - kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) - out_arg_set = set(a.name for a in f.func.arguments.out) - - sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) - cpp_args = sig_group.signature.arguments() - arguments = [ - compute_cpp_argument_yaml( - cpp_a, schema_order=False, - kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) - for cpp_a in cpp_args - ] - - schema_order_jit_arguments = list(f.func.schema_order_arguments()) - - schema_order_arguments = [ - compute_argument_yaml( - a, schema_order=True, - kwarg_only_set=kwarg_only_set, out_arg_set=out_arg_set, name_to_field_name=name_to_field_name) - for a in schema_order_jit_arguments - ] - - cpp_schema_order_types = [ - # NB: method here doesn't matter - 
r.type for a in schema_order_jit_arguments - for r in cpp.argument( - a, method=False, cpp_no_default_args=set(), faithful=False, has_tensor_options=False) - ] - - cpp_returns = cpp.returns_type(f.func.returns).cpp_type() - schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})" - - is_factory_method = any(isinstance(a.argument, TensorOptionsArguments) for a in cpp_args) \ - and Variant.method not in f.variants - - return OrderedDict([ - ('name', cpp.name(f.func)), - ('operator_name', str(f.func.name.name)), - ('overload_name', str(f.func.name.overload_name)), - ('manual_kernel_registration', f.manual_kernel_registration), - ('category_override', f.category_override if f.category_override is not None else ''), - ('schema_string', f'aten::{f.func}'), - ('arguments', arguments), - ('schema_order_cpp_signature', schema_order_cpp_signature), - ('schema_order_arguments', schema_order_arguments), - ('method_of', compute_method_of_yaml(f.variants)), - ('mode', 'native'), - ('python_module', '' if f.python_module is None else f.python_module), - ('returns', returns), - ('inplace', f.func.name.name.inplace), - ('is_factory_method', is_factory_method), - ('abstract', f.is_abstract), - ('device_guard', f.device_guard), - ('with_gil', False), - ('deprecated', False), - ('has_math_kernel', f.has_composite_implicit_autograd_kernel), - ]) - -# See Note [Auto generated composite kernels] -def has_autogenerated_composite_kernel(f: NativeFunction) -> bool: - return (f.structured or f.structured_delegate is not None) and \ - (f.func.kind() == SchemaKind.functional or f.func.kind() == SchemaKind.inplace) - -@with_native_function_and_indices -def compute_registration_declarations(f: NativeFunction, backend_indices: Dict[DispatchKey, BackendIndex]) -> str: - name = dispatcher.name(f.func) - returns_type = dispatcher.returns_type(f.func.returns).cpp_type_registration_declarations() - args = dispatcher.arguments(f.func) - args_str = ', '.join(a.no_default().decl_registration_declarations() for a in args) - comment_data : Dict[str, str] = { - 'schema': f'aten::{f.func}', - # TODO: What exactly is the semantics of the 'dispatch' field? 
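compute_declaration_yaml above pins down both the keys and the key order of each Declarations.yaml entry. Purely to illustrate that shape, here is a hypothetical and heavily abbreviated entry written as a Python OrderedDict; the operator and all values are made up, not real generated output:

from collections import OrderedDict

example_entry = OrderedDict([
    ('name', 'my_op_out'),                      # hypothetical operator
    ('operator_name', 'my_op'),
    ('overload_name', 'out'),
    ('manual_kernel_registration', False),
    ('category_override', ''),
    ('schema_string', 'aten::my_op.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)'),
    ('arguments', ['...']),                     # per-argument dicts, elided
    ('schema_order_cpp_signature', 'at::Tensor & (const at::Tensor &, at::Tensor &)'),
    ('schema_order_arguments', ['...']),        # elided
    ('method_of', ['Type', 'namespace']),
    ('mode', 'native'),
    ('python_module', ''),
    ('returns', ['...']),                       # elided
    ('inplace', False),
    ('is_factory_method', False),
    ('abstract', True),
    ('device_guard', True),
    ('with_gil', False),
    ('deprecated', False),
    ('has_math_kernel', False),
])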
- 'dispatch': str({k for k, v in backend_indices.items() if v.has_kernel(f)} != {DispatchKey.CompositeImplicitAutograd}), - 'default': str(f.has_composite_kernel or has_autogenerated_composite_kernel(f)) - } - return f"""{returns_type} {name}({args_str}); // {json.dumps(comment_data)} -""" - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# RUN IT ALL -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # - -def get_custom_build_selector( - provided_op_registration_allowlist: Optional[List[str]], - op_selection_yaml_path: Optional[str]) -> SelectiveBuilder: - assert not ( - provided_op_registration_allowlist is not None and - op_selection_yaml_path is not None), ( - "Both provided_op_registration_allowlist and " + - "op_selection_yaml_path can NOT be provided at the " + - "same time.") - - op_registration_allowlist: Optional[Set[str]] = None - if provided_op_registration_allowlist is not None: - op_registration_allowlist = set(provided_op_registration_allowlist) - - if op_registration_allowlist is not None: - selector = SelectiveBuilder.from_legacy_op_registration_allow_list( - op_registration_allowlist, - True, - False, - ) - elif op_selection_yaml_path is not None: - selector = SelectiveBuilder.from_yaml_path(op_selection_yaml_path) - else: - selector = SelectiveBuilder.get_nop_selector() - - return selector - -def pre_group_native_functions( - native_functions: Sequence[NativeFunction]) -> Dict[FunctionSchema, Dict[SchemaKind, NativeFunction]]: - pre_grouped_native_functions: Dict[FunctionSchema, Dict[SchemaKind, NativeFunction]] = defaultdict(dict) - for f in native_functions: - d = pre_grouped_native_functions[f.func.signature()] - assert f.func.kind() not in d - d[f.func.kind()] = f - return pre_grouped_native_functions - -def get_grouped_native_functions( - native_functions: Sequence[NativeFunction]) -> Sequence[Union[NativeFunction, NativeFunctionsGroup]]: - def flatten_pre_group(d: Dict[SchemaKind, NativeFunction]) -> Sequence[Union[NativeFunction, NativeFunctionsGroup]]: - r = NativeFunctionsGroup.from_dict(d) - if r is None: - return list(d.values()) - else: - return [r] - - # TODO: how come ValuesView isn't a Sequence lol - pre_grouped_native_functions = pre_group_native_functions(native_functions) - return list(concatMap(flatten_pre_group, list(pre_grouped_native_functions.values()))) - -def gen_aggregated_headers( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: Dict[DispatchKey, BackendIndex], - cpu_fm: FileManager, - cuda_fm: FileManager, - functions_keys: Set[DispatchKey], - dispatch_keys: Sequence[DispatchKey], - rocm: bool, -) -> None: - # Buck doesn't support dynamic output files, so we aggregate all operator - # headers into a single file - structured_native_functions = [g for g in grouped_native_functions - if isinstance(g, NativeFunctionsGroup)] - cpu_fm.write('NativeMetaFunctions.h', lambda: { - 'NativeMetaFunctions_includes': [], - 'NativeMetaFunctions_declarations': list( - mapMaybe(compute_meta_function_declaration, structured_native_functions)), - }) - method_native_functions = [fn for fn in native_functions - if Variant.method in fn.variants] - non_method_native_functions = [fn for fn in native_functions - if fn not in method_native_functions] - cpu_fm.write('MethodOperators.h', lambda: { - 'MethodOperators_includes': [], - 
'MethodOperators_declarations': list(mapMaybe(ComputeOperators( - Target.DECLARATION), method_native_functions)), - }) - cpu_fm.write('Operators.h', lambda: { - 'Operators_includes': ['#include '], - 'Operators_declarations': list(mapMaybe(ComputeOperators( - Target.DECLARATION), non_method_native_functions)), - }) - cpu_fm.write('Functions.h', lambda: { - 'static_dispatch_extra_headers': static_dispatch_extra_headers(static_dispatch_idx), - 'Functions_includes': ['#include '], - 'Functions_declarations': list(mapMaybe(ComputeFunction( - static_dispatch_backend_index=static_dispatch_idx), native_functions)), - }) - cpu_fm.write('NativeFunctions.h', lambda: { - 'NativeFunctions_includes': ['#include '], - 'NativeFunctions_declarations': list(concatMap( - # Convert to a set first to remove duplicate kernel names. - # Backends are allowed to repeat kernel names; only generate the declaration once! - lambda f: list(OrderedDict.fromkeys(concatMap( - lambda backend_idx: - dest.compute_native_function_declaration(f, backend_idx), - backend_indices.values()))), - grouped_native_functions)), - }) - - for dispatch_key in dispatch_keys: - fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm - if dispatch_key in functions_keys: - if dispatch_key in static_dispatch_keys(static_dispatch_idx): - # See Note [Avoiding Include Cycles In Static Dispatch] - inl_headers = '' - else: - inl_headers = f'#include ' - - fm.write_with_template(f'{dispatch_key}Functions.h', 'DispatchKeyFunctions.h', lambda: { - 'dispatch_key': str(dispatch_key), - 'inline_headers_for_nonstatic_build': inl_headers, - }) - fm.write_with_template(f'{dispatch_key}Functions_inl.h', 'DispatchKeyFunctions_inl.h', lambda: { - 'DispatchKeyFunctions_inl_includes': [], - 'dispatch_namespace': dispatch_key.lower(), - 'dispatch_namespaced_declarations': list(concatMap( - dest.RegisterDispatchKey( - backend_indices[dispatch_key], - Target.NAMESPACED_DECLARATION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - }) - - del fm - -def gen_per_operator_headers( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: Dict[DispatchKey, BackendIndex], - cpu_fm: FileManager, - cuda_fm: FileManager, - ops_fm: FileManager, - functions_keys: Set[DispatchKey], - dispatch_keys: Sequence[DispatchKey], - rocm: bool, -) -> None: - # For CMake builds, split operator declarations into separate headers in - # the ATen/ops folder to split up header dependencies - functions_by_root_name: Dict[str, List[NativeFunction]] = defaultdict(lambda: []) - for fn in native_functions: - functions_by_root_name[fn.root_name].append(fn) - - grouped_functions_by_root_name: Dict[str, List[Union[NativeFunction, NativeFunctionsGroup]]] = defaultdict(lambda: []) - for group in grouped_native_functions: - name = group.root_name - grouped_functions_by_root_name[name].append(group) - - for name, functions in functions_by_root_name.items(): - ops_fm.write_with_template( - f'{name}_ops.h', 'Operator.h', lambda: { - 'declarations': list(mapMaybe(ComputeOperators( - Target.DECLARATION), functions)), - }) - - ops_fm.write_with_template( - f'{name}.h', 'Function.h', lambda: { - 'static_dispatch_ops_headers': list(mapMaybe( - lambda fn: static_dispatch_ops_header(fn, backend_index=static_dispatch_idx), - functions)), - 'operator_includes': f'#include 
', - 'function_definitions': list(mapMaybe(ComputeFunction( - static_dispatch_backend_index=static_dispatch_idx), functions)), - }) - - grouped_functions = grouped_functions_by_root_name.get(name, []) - structured_functions = [fn for fn in grouped_functions - if isinstance(fn, NativeFunctionsGroup) and fn.structured] - is_structured = len(structured_functions) > 0 - - - if is_structured: - ops_fm.write_with_template( - f'{name}_meta.h', 'NativeMetaFunction.h', lambda: { - 'meta_function_declarations': list(mapMaybe( - compute_meta_function_declaration, structured_functions)), - }) - - - ops_fm.write_with_template( - f'{name}_native.h', 'NativeFunction.h', lambda: { - 'extra_includes': (f'#include ' - if is_structured else []), - 'native_function_declarations': list(concatMap( - # Convert to a set first to remove duplicate kernel names. - # Backends are allowed to repeat kernel names; only generate the declaration once! - lambda f: list(OrderedDict.fromkeys(concatMap( - lambda backend_idx: - dest.compute_native_function_declaration(f, backend_idx), - backend_indices.values()))), - grouped_functions)), - }) - - for category, suffix in [ - ('Functions', ''), - ('Operators', '_ops'), - ('NativeMetaFunctions', '_meta'), - ('NativeFunctions', '_native'), - ]: - cpu_fm.write(f'{category}.h', lambda: { - 'static_dispatch_extra_headers': [], - f'{category}_includes': [ - f'#include ' - for name in sorted(functions_by_root_name.keys()) - ], - f'{category}_declarations': [], - }) - - for dispatch_key in dispatch_keys: - if dispatch_key not in functions_keys: - continue - - dispatch_namespace = dispatch_key.lower() - dispatch_names = [] - - for name, functions in functions_by_root_name.items(): - grouped_functions = grouped_functions_by_root_name.get(name, []) - declarations = list(concatMap( - dest.RegisterDispatchKey( - backend_indices[dispatch_key], - Target.NAMESPACED_DECLARATION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_functions - )) - - if len(declarations) == 0: - continue - - dispatch_names.append(name) - ops_fm.write_with_template( - f'{name}_{dispatch_namespace}_dispatch.h', - 'DispatchKeyFunction.h', lambda: { - 'dispatch_namespace': dispatch_namespace, - 'dispatch_namespaced_declarations': declarations, - }) - - fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm - if dispatch_key in static_dispatch_keys(static_dispatch_idx): - # See Note [Avoiding Include Cycles In Static Dispatch] - inl_headers = '' - else: - inl_headers = f'#include ' - - fm.write_with_template(f'{dispatch_key}Functions.h', 'DispatchKeyFunctions.h', lambda: { - 'dispatch_key': str(dispatch_key), - 'inline_headers_for_nonstatic_build': inl_headers, - }) - fm.write_with_template(f'{dispatch_key}Functions_inl.h', 'DispatchKeyFunctions_inl.h', lambda: { - 'dispatch_namespace': dispatch_namespace, - 'DispatchKeyFunctions_inl_includes': [ - f'#include ' - for name in sorted(dispatch_names) - ], - 'dispatch_namespaced_declarations': [], - }) - del fm - - cpu_fm.write('MethodOperators.h', lambda: { - 'MethodOperators_includes': sorted( - f'#include ' - for name, functions in functions_by_root_name.items() - if any(Variant.method in fn.variants for fn in functions) - ), - 'MethodOperators_declarations': [], - }) - -def gen_headers( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: 
Dict[DispatchKey, BackendIndex], - core_fm: FileManager, - cpu_fm: FileManager, - cuda_fm: FileManager, - ops_fm: FileManager, - dispatch_keys: Sequence[DispatchKey], - functions_keys: Set[DispatchKey], - rocm: bool, - per_operator_headers: bool, -) -> None: - if per_operator_headers: - gen_per_operator_headers( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - ops_fm=ops_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=rocm, - ) - else: - gen_aggregated_headers( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=rocm, - ) - - def static_dispatch_method_headers() -> List[str]: - return list(mapMaybe( - lambda fn: static_dispatch_ops_header(fn, backend_index=static_dispatch_idx), - [fn for fn in native_functions if Variant.method in fn.variants])) - - - core_fm.write('TensorBody.h', lambda: { - 'static_dispatch_ops_headers': ( - static_dispatch_method_headers() if per_operator_headers - else static_dispatch_extra_headers(static_dispatch_idx, skip_tensor_include=True)), - 'tensor_method_declarations': list(mapMaybe(ComputeTensorMethod( - target=Target.DECLARATION, static_dispatch_backend_index=static_dispatch_idx), native_functions)), - 'tensor_method_definitions': list(mapMaybe(ComputeTensorMethod( - target=Target.DEFINITION, static_dispatch_backend_index=static_dispatch_idx), native_functions)), - }) - - cpu_fm.write('RedispatchFunctions.h', lambda: { - 'function_redispatch_definitions': list(mapMaybe(ComputeRedispatchFunction(), native_functions)), - }) - - cpu_fm.write('RegistrationDeclarations.h', lambda: { - 'registration_declarations': [compute_registration_declarations(f, backend_indices) for f in native_functions], - }) - - cpu_fm.write('FunctionalInverses.h', lambda: { - 'view_inverse_declarations': list(mapMaybe(gen_functionalization_view_inverse_declaration, native_functions)) - }) - - - def gen_aten_interned_strings() -> Dict[str, str]: - attrs = set() # All function argument names - names = set() # All ATen function names - for func in native_functions: - names.add(str(func.func.name.name)) - # Some operators don't have a functional variant but we still create a - # symbol without the underscore - names.add(func.func.name.name.base) - - for arg in func.func.schema_order_arguments(): - attrs.add(arg.name) - - # These are keywords in C++, so aren't valid symbol names - # https://en.cppreference.com/w/cpp/language/operator_alternative - names -= set(['and', 'and_eq', 'bitand', 'bitor', 'compl', 'not', - 'not_eq', 'or', 'or_eq', 'xor', 'xor_eq']) - - return { - 'aten_symbols': ' \\\n'.join([ - f"_(aten, {name})" for name in sorted(names) - ]), - 'attr_symbols': ' \\\n'.join([ - f"_(attr, {name})" for name in sorted(attrs) - ]), - } - - core_fm.write('aten_interned_strings.h', gen_aten_interned_strings) - -def gen_source_files( - *, - native_functions: Sequence[NativeFunction], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - static_dispatch_idx: Optional[BackendIndex], - selector: SelectiveBuilder, - backend_indices: Dict[DispatchKey, BackendIndex], - core_fm: FileManager, - cpu_fm: FileManager, - cuda_fm: 
FileManager, - dispatch_keys: Sequence[DispatchKey], - functions_keys: Set[DispatchKey], - rocm: bool, - force_schema_registration: bool, - per_operator_headers: bool, -) -> None: - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - if rocm: - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - - for dispatch_key in dispatch_keys: - fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm - - if per_operator_headers: - def operator_headers() -> List[str]: - headers = [] - for fn in native_functions: - is_registered = backend_index.has_kernel(fn) or ( - fn.structured and dispatch_key in - (DispatchKey.Meta, DispatchKey.CompositeExplicitAutograd)) - if not is_registered: - continue - - headers.append(f"#include ") - if dispatch_key == DispatchKey.CompositeExplicitAutograd: - headers.append(f"#include ") - if dispatch_key in functions_keys: - headers.append( - f"#include ") - - return sorted(set(headers)) - else: - def operator_headers() -> List[str]: - headers = ["#include "] - if dispatch_key == DispatchKey.CompositeExplicitAutograd: - headers.append("#include ") - if dispatch_key in functions_keys: - headers.append(f"#include ") - return headers - - backend_index = backend_indices[dispatch_key] - dispatch_namespace = str(dispatch_key).lower() - fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: { - 'extra_cuda_headers': extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else '', - 'external_backend_headers': '', - 'dispatch_headers': dest.gen_registration_headers(backend_index, per_operator_headers, rocm), - 'ops_headers': operator_headers(), - 'DispatchKey': dispatch_key, - 'dispatch_namespace': dispatch_key.lower(), - 'dispatch_helpers': dest.gen_registration_helpers(backend_index), - 'dispatch_namespaced_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.NAMESPACED_DEFINITION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - 'dispatch_anonymous_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.ANONYMOUS_DEFINITION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - 'dispatch_registrations': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.REGISTRATION, - selector, - rocm=rocm, - cpp_namespace='at::native', - class_method_name=None), - grouped_native_functions - )), - }) - - # BackendSelect is generated specially - def gen_backend_select() -> Dict[str, List[str]]: - relevant_fns = [fn for fn in native_functions if needs_backend_select(fn, selector)] - return { - 'ops_headers': [f'#include ' for fn in relevant_fns], - 'backend_select_method_definitions': - list(mapMaybe(ComputeBackendSelect(Target.DEFINITION, selector), relevant_fns)), - 'backend_select_function_registrations': - list(mapMaybe(ComputeBackendSelect(Target.REGISTRATION, selector), relevant_fns)), - } - cpu_fm.write('RegisterBackendSelect.cpp', gen_backend_select) - - schema_selector = selector - if force_schema_registration: - schema_selector = SelectiveBuilder.get_nop_selector() - cpu_fm.write('RegisterSchema.cpp', lambda: { - 'schema_registrations': list(mapMaybe(RegisterSchema(schema_selector), native_functions)), - }) - - def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str: - return fn.root_name - - cpu_fm.write_sharded( - 'Operators.cpp', - native_functions, - key_fn=key_func, - 
env_callable=lambda fn: { - 'operator_headers': [f'#include '], - 'definitions': [ComputeOperators(Target.DEFINITION)(fn)]}, - num_shards=5, - sharded_keys={'operator_headers', 'definitions'} - ) - - cpu_fm.write('Functions.cpp', lambda: {}) - - core_fm.write('TensorMethods.cpp', lambda: {}) - - core_fm.write('ATenOpList.cpp', lambda: { - 'aten_ops': list(mapMaybe(compute_aten_op, native_functions)), - }) - - # We need to easily map from [inplace_op_name] -> [functional_op] for the functionalization pass, - # so here I generate a mapping from every operator name to its corresponding functional NativeFunction (if it exist). - pre_grouped_d: Dict[FunctionSchema, Dict[SchemaKind, NativeFunction]] = pre_group_native_functions(native_functions) - to_functional_op: Dict[OperatorName, Optional[NativeFunction]] = { - k: v for d in [ - {f.func.name: pre_grouped_d[func][SchemaKind.functional] - if SchemaKind.functional in pre_grouped_d[func].keys() else None - for f in pre_grouped_d[func].values()} - for func in pre_grouped_d.keys()] - for k, v in d.items() - } - - - def functionalization_env_callable( - g: Union[NativeFunction, NativeFunctionsGroup] - ) -> Dict[str, List[str]]: - functions = [g] if isinstance(g, NativeFunction) else list(g.functions()) - functions_needing_functionalization = [ - fn for fn in functions if needs_functionalization(selector, fn)] - return { - 'ops_headers': ([ - f"#include ", - f"#include ", - ] if functions_needing_functionalization else []), - 'func_definitions': list(mapMaybe( - lambda f: gen_functionalization_definition(selector, f, to_functional_op[f.func.name]), - functions_needing_functionalization)), - 'func_registrations': list(mapMaybe( - lambda f: gen_functionalization_registration( - selector, f, backend_indices[DispatchKey.CompositeImplicitAutograd]), - functions_needing_functionalization)), - } - - - cpu_fm.write_sharded( - 'RegisterFunctionalization.cpp', - grouped_native_functions, - key_fn=key_func, - env_callable=functionalization_env_callable, - num_shards=4, - sharded_keys={'ops_headers', 'func_definitions', 'func_registrations'} - ) - - -def gen_declarations_yaml( - cpu_fm: FileManager, - native_functions: Sequence[NativeFunction]) -> None: - cpu_fm.write('Declarations.yaml', lambda: - format_yaml([compute_declaration_yaml(f) for f in native_functions])) - -def main() -> None: - parser = argparse.ArgumentParser(description='Generate ATen source files') - parser.add_argument( - '-s', - '--source-path', - help='path to source directory for ATen', - default='aten/src/ATen') - parser.add_argument( - '-o', - '--output-dependencies', - help='output a list of dependencies into the given file and exit') - parser.add_argument( - '--dry-run', action='store_true', - help='run without writing any files (still updates outputs)') - parser.add_argument( - '--per-operator-headers', action='store_true', - help='generate separate headers per operator in ATen/ops') - parser.add_argument( - '-d', '--install_dir', help='output directory', - default='build/aten/src/ATen') - parser.add_argument( - '--rocm', - action='store_true', - help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly') - # TODO: --op_registration_whitelist will be removed when all call-sites - # for gen.py are moved over to using the operator YAML file for mobile - # custom build. 
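The nested comprehension that builds to_functional_op above is dense. The same idea written as explicit loops over toy data (strings stand in for NativeFunction and OperatorName objects, and a minimal enum stands in for SchemaKind):

from enum import Enum
from typing import Dict, Optional

class SchemaKind(Enum):          # stand-in for tools.codegen.model.SchemaKind
    functional = 1
    inplace = 2
    out = 3

# Toy pre-grouping, keyed the way pre_group_native_functions keys its result
# (signature -> {SchemaKind: function}).
pre_grouped = {
    'foo(Tensor self) -> Tensor': {
        SchemaKind.functional: 'foo',
        SchemaKind.inplace: 'foo_',
        SchemaKind.out: 'foo.out',
    },
}

# Same idea as the dict comprehension above: every variant in a signature
# group maps to that group's functional variant, or None if there is none.
to_functional_op: Dict[str, Optional[str]] = {}
for kind_to_fn in pre_grouped.values():
    functional = kind_to_fn.get(SchemaKind.functional)
    for fn in kind_to_fn.values():
        to_functional_op[fn] = functional

print(to_functional_op)  # {'foo': 'foo', 'foo_': 'foo', 'foo.out': 'foo'}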
- parser.add_argument( - '--op_registration_whitelist', - nargs='*', - help='filter op registrations by the whitelist (if set); ' - 'each item is `namespace`::`operator name` without overload name; ' - 'e.g.: aten::empty aten::conv2d ...') - parser.add_argument( - '--op_selection_yaml_path', - help='Provide a path to the operator selection (for custom build) YAML ' - 'that contains the information about the set of selected operators ' - 'and their categories (training, ...). Each operator is either a ' - 'full operator name with overload or just a bare operator name. ' - 'The operator names also contain the namespace prefix (e.g. aten::)') - parser.add_argument( - '--backend_whitelist', - nargs='*', - help='filter dispatch backend by the whitelist (if set), ' - 'e.g.: CPU CUDA QuantizedCPU ...') - parser.add_argument( - '--static_dispatch_backend', - help='generate static dispatch code for the specific backend (if set)') - parser.add_argument( - '--force_schema_registration', - action='store_true', - help='force it to generate schema-only registrations for all ops, including' - 'those that are not listed on --op_registration_whitelist') - parser.add_argument( - '--generate', - type=str, - nargs='*', - choices=['headers', 'sources', 'declarations_yaml'], - default=['headers', 'sources', 'declarations_yaml'], - help='Generate only a subset of files') - options = parser.parse_args() - - selector = get_custom_build_selector( - options.op_registration_whitelist, - options.op_selection_yaml_path, - ) - - native_yaml_path = os.path.join(options.source_path, 'native/native_functions.yaml') - parsed_yaml = parse_native_yaml(native_yaml_path) - native_functions, backend_indices = parsed_yaml.native_functions, parsed_yaml.backend_indices - grouped_native_functions = get_grouped_native_functions(native_functions) - - template_dir = os.path.join(options.source_path, "templates") - - # NB: It is mandatory to NOT use os.path.join here, as the install directory - # will eventually be ingested by cmake, which does not respect Windows style - # path slashes. If you switch this to use os.path.join, you'll get an error - # like: - # - # Syntax error in cmake code when parsing string - # - # C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h - # - # Invalid character escape '\c'. 
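The warning above about os.path.join is easy to reproduce: on Windows it inserts a backslash, which CMake later misreads as a character escape. ntpath (the Windows flavour of os.path) shows the difference from any platform:

import ntpath  # Windows implementation of os.path, importable everywhere

install_dir = 'build/aten/src/ATen'

# What os.path.join would produce on Windows: a mixed-separator path whose
# backslash CMake later treats as an invalid character escape.
print(ntpath.join(install_dir, 'core'))   # build/aten/src/ATen\core

# The plain f-string used by the deleted code keeps forward slashes.
print(f'{install_dir}/core')              # build/aten/src/ATen/core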
- core_install_dir = f'{options.install_dir}/core' - pathlib.Path(core_install_dir).mkdir(parents=True, exist_ok=True) - ops_install_dir = f'{options.install_dir}/ops' - pathlib.Path(ops_install_dir).mkdir(parents=True, exist_ok=True) - - def make_file_manager(install_dir: str) -> FileManager: - return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=options.dry_run) - - core_fm = make_file_manager(core_install_dir) - cpu_fm = make_file_manager(options.install_dir) - cuda_fm = make_file_manager(options.install_dir) - ops_fm = make_file_manager(ops_install_dir) - - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - if options.rocm: - extra_cuda_headers = '''\ -#include -#include -#include -#include ''' - - dispatch_keys = [ - DispatchKey.CPU, - DispatchKey.SparseCPU, - DispatchKey.SparseCsrCPU, - DispatchKey.MkldnnCPU, - DispatchKey.CUDA, - DispatchKey.SparseCUDA, - DispatchKey.SparseCsrCUDA, - DispatchKey.QuantizedCPU, - DispatchKey.QuantizedCUDA, - DispatchKey.CompositeImplicitAutograd, - DispatchKey.CompositeExplicitAutograd, - # Meta is a magic key: it is automatically generated for structured - # kernels - DispatchKey.Meta, - DispatchKey.ZeroTensor, - ] - # Only a limited set of dispatch keys get CPUFunctions.h headers generated - # for them; this is the set - functions_keys = { - DispatchKey.CPU, - DispatchKey.CUDA, - DispatchKey.CompositeImplicitAutograd, - DispatchKey.CompositeExplicitAutograd, - DispatchKey.Meta, - } - if options.backend_whitelist: - dispatch_keys = [k for k in dispatch_keys if is_generic_dispatch_key(k) or str(k) in options.backend_whitelist] - - static_dispatch_idx: Optional[BackendIndex] = None - if options.static_dispatch_backend: - static_dispatch_idx = backend_indices[DispatchKey.parse(options.static_dispatch_backend)] - - if 'sources' in options.generate: - gen_source_files( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - core_fm=core_fm, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=options.rocm, - force_schema_registration=options.force_schema_registration, - per_operator_headers=options.per_operator_headers, - ) - - if 'headers' in options.generate: - gen_headers( - native_functions=native_functions, - grouped_native_functions=grouped_native_functions, - static_dispatch_idx=static_dispatch_idx, - selector=selector, - backend_indices=backend_indices, - core_fm=core_fm, - cpu_fm=cpu_fm, - cuda_fm=cuda_fm, - ops_fm=ops_fm, - dispatch_keys=dispatch_keys, - functions_keys=functions_keys, - rocm=options.rocm, - per_operator_headers=options.per_operator_headers, - ) - - if 'declarations_yaml' in options.generate: - gen_declarations_yaml( - native_functions=native_functions, - cpu_fm=cpu_fm) - - if options.output_dependencies: - depfile_path = pathlib.Path(options.output_dependencies).resolve() - depfile_name = depfile_path.name - depfile_stem = depfile_path.stem - - for fm, prefix in [ - (cpu_fm, ""), - (core_fm, "core_"), - (cuda_fm, "cuda_"), - (ops_fm, "ops_"), - ]: - varname = prefix + depfile_stem - path = depfile_path.parent / (prefix + depfile_name) - fm.write_outputs(varname, str(path)) - - -if __name__ == '__main__': - main() diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py deleted file mode 100644 index 7837a41cab6e..000000000000 --- a/tools/codegen/gen_backend_stubs.py +++ 
/dev/null @@ -1,325 +0,0 @@ -import pathlib -import argparse -import os -import yaml -import re -from collections import namedtuple, Counter, defaultdict -from typing import List, Dict, Union, Sequence, Optional -from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml -from tools.codegen.model import (BackendIndex, BackendMetadata, DispatchKey, - NativeFunction, NativeFunctionsGroup, OperatorName) -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.utils import Target, concatMap, context, YamlLoader, FileManager -from tools.codegen.context import native_function_manager -import tools.codegen.dest as dest -import tools.codegen.api.dispatcher as dispatcher -from tools.codegen.api.types import DispatcherSignature - - -# Parses the external backend's yaml, and adds a new BackendIndex for the backend's dispatch key. -# Returns a Tuple of (backend_key, autograd_key, cpp_namespace, updated BackendIndex mapping) -ParsedExternalYaml = namedtuple('ParsedExternalYaml', [ - 'backend_key', 'autograd_key', 'cpp_namespace', 'backend_indices']) -def parse_backend_yaml( - backend_yaml_path: str, - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - backend_indices: Dict[DispatchKey, BackendIndex] -) -> ParsedExternalYaml: - - native_functions_map: Dict[OperatorName, NativeFunction] = { - f.func.name: f - for f in concatMap(lambda f: [f] if isinstance(f, NativeFunction) else list(f.functions()), grouped_native_functions) - } - - with open(backend_yaml_path, 'r') as f: - yaml_values = yaml.load(f, Loader=YamlLoader) - assert isinstance(yaml_values, dict) - - valid_keys = ['backend', 'cpp_namespace', 'extra_headers', 'supported', 'autograd', 'full_codegen'] - - backend = yaml_values.pop('backend', None) - assert backend is not None, 'You must provide a value for "backend"' - - cpp_namespace = yaml_values.pop('cpp_namespace', None) - assert cpp_namespace is not None, 'You must provide a value for "cpp_namespace"' - - # Mostly just defaulting to false to stick with LazyTensor convention. - use_out_as_primary = yaml_values.pop('use_out_as_primary', False) - assert isinstance(use_out_as_primary, bool), \ - f'You must provide either True or False for use_out_as_primary. Provided: {use_out_as_primary}' - - use_device_guard = yaml_values.pop('device_guard', False) - assert isinstance(use_device_guard, bool), \ - f'You must provide either True or False for device_guard. Provided: {use_device_guard}' - - supported = yaml_values.pop('supported', []) - if supported is None: - supported = [] # Allow an empty list of supported ops - assert isinstance(supported, list), f'expected "supported" to be a list, but got: {supported} (of type {type(supported)})' - - supported_autograd = yaml_values.pop('autograd', []) - assert isinstance(supported_autograd, list), f'expected "autograd" to be a list, but got: {supported_autograd}' - - # full_codegen is ignored by parse_backend_yaml, and re-parsed in gen_lazy_tensor.py - full_codegen = yaml_values.pop('full_codegen', []) - supported.extend(full_codegen) - - assert len(yaml_values.keys()) == 0, \ - f'{backend_yaml_path} contains unexpected keys: {", ".join(yaml_values.keys())}. 
\ -Only the following keys are supported: {", ".join(valid_keys)}' - - def create_backend_index( - backend_ops: List[str], - dispatch_key: DispatchKey, - *, - use_out_as_primary: bool, - use_device_guard: bool - ) -> BackendIndex: - metadata: Dict[OperatorName, BackendMetadata] = {} - for op in backend_ops: - op_name = OperatorName.parse(op) - assert op_name in native_functions_map, f"Found an invalid operator name: {op_name}" - # See Note [External Backends Follow Dispatcher API] - kernel_name = dispatcher.name(native_functions_map[op_name].func) - # TODO: allow structured external backends later. - m = BackendMetadata(kernel=kernel_name, structured=False) - metadata[op_name] = m - return BackendIndex( - dispatch_key=dispatch_key, - use_out_as_primary=use_out_as_primary, - external=True, - device_guard=use_device_guard, - index=metadata) - - backend_key: Optional[DispatchKey] = None - if len(supported) > 0: - with context(lambda: f'The provided value for "backend" must be a valid DispatchKey, but got {backend}.'): - backend_key = DispatchKey.parse(backend) - - backend_idx = create_backend_index( - supported, backend_key, use_out_as_primary=use_out_as_primary, use_device_guard=use_device_guard) - assert backend_key not in backend_indices - backend_indices[backend_key] = backend_idx - - autograd_key: Optional[DispatchKey] = None - if len(supported_autograd) > 0: - with context(lambda: f'The "autograd" key was specified, which indicates that you would like to override \ -the behavior of autograd for some operators on your backend. However "Autograd{backend}" is not a valid DispatchKey.'): - autograd_key = DispatchKey.parse(f'Autograd{backend}') - - autograd_idx = create_backend_index( - supported_autograd, autograd_key, use_out_as_primary=use_out_as_primary, use_device_guard=use_device_guard) - assert autograd_key not in backend_indices - backend_indices[autograd_key] = autograd_idx - - for g in grouped_native_functions: - if isinstance(g, NativeFunction): - forward_kernels = [] if backend_key is None else \ - [m for m in [backend_indices[backend_key].get_kernel(g)] if m is not None] - backward_kernels = [] if autograd_key is None else \ - [m for m in [backend_indices[autograd_key].get_kernel(g)] if m is not None] - else: - forward_kernels = [] if backend_key is None else [m for m in [ - backend_indices[backend_key].get_kernel(f) for f in g.functions()] - if m is not None] - backward_kernels = [] if autograd_key is None else [m for m in [ - backend_indices[autograd_key].get_kernel(f) for f in g.functions()] - if m is not None] - - forward_kernels = [f for f in forward_kernels if f is not None] - backward_kernels = [f for f in backward_kernels if f is not None] - assert len(forward_kernels) == 0 or len(backward_kernels) == 0, \ - f'Currently, all variants of an op must either be registered to a backend key, or to a backend\'s \ -autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! \ -{forward_kernels[0].kernel} is listed under "supported", but {backward_kernels[0].kernel} is listed under "autograd".' 
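For orientation, parse_backend_yaml above only accepts the keys in valid_keys ('backend', 'cpp_namespace', 'extra_headers', 'supported', 'autograd', 'full_codegen'). A hypothetical minimal input, embedded here as a string and parsed with PyYAML rather than read from any real backend's file:

import yaml

# Hypothetical external-backend YAML; key names match valid_keys, the
# operator names are just examples.
example_backend_yaml = """
backend: XLA
cpp_namespace: torch_xla
supported:
  - abs
  - add.Tensor
autograd:
  - max_pool2d
"""

parsed = yaml.safe_load(example_backend_yaml)
print(parsed['backend'])    # XLA
print(parsed['supported'])  # ['abs', 'add.Tensor']
print(parsed['autograd'])   # ['max_pool2d']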
- - return ParsedExternalYaml(backend_key, autograd_key, cpp_namespace, backend_indices) - -def error_on_missing_kernels( - native_functions: Sequence[NativeFunction], - backend_indices: Dict[DispatchKey, BackendIndex], - backend_key: DispatchKey, - autograd_key: Optional[DispatchKey], - kernel_defn_file_path: str, - full_codegen: Optional[List[OperatorName]] = None, -) -> None: - try: - with open(kernel_defn_file_path, 'r') as f: - backend_defns = f.read() - except IOError: - raise AssertionError(f'Unable to read from the specified impl_path file: {kernel_defn_file_path}') - - if full_codegen is None: - full_codegen = [] - - class_name: Optional[str] = backend_indices[backend_key].native_function_class_name() - assert class_name is not None - - expected_backend_op_names: List[OperatorName] = \ - list(backend_indices[backend_key].index.keys()) + \ - [] if autograd_key is None else list(backend_indices[autograd_key].index.keys()) - expected_backend_native_funcs: List[NativeFunction] = [ - f for f in native_functions if f.func.name in expected_backend_op_names and f.func.name not in full_codegen] - expected_backend_kernel_name_counts: Dict[str, List[NativeFunction]] = defaultdict(list) - for native_f in expected_backend_native_funcs: - expected_backend_kernel_name_counts[dispatcher.name(native_f.func)].append(native_f) - - kernel_defn_regex = rf'{class_name}::([\w\d]*)\([^\)]*\)\s*{{' - actual_backend_kernel_name_counts = Counter(re.findall(kernel_defn_regex, backend_defns)) - - missing_kernels_err_msg = "" - for expected_name, funcs in expected_backend_kernel_name_counts.items(): - expected_overload_count = len(funcs) - actual_overload_count = actual_backend_kernel_name_counts[expected_name] - if expected_overload_count != actual_overload_count: - def create_decl(f: NativeFunction) -> str: - with native_function_manager(f): - return DispatcherSignature.from_schema(f.func).decl() - expected_schemas_str = '\n'.join([create_decl(f) for f in funcs]) - missing_kernels_err_msg += f""" -{class_name} is missing a kernel definition for {expected_name}. We found {actual_overload_count} kernel(s) with that name, -but expected {expected_overload_count} kernel(s). The expected function schemas for the missing operator are: -{expected_schemas_str} - -""" - assert missing_kernels_err_msg == "", missing_kernels_err_msg - -def main() -> None: - parser = argparse.ArgumentParser(description='Generate backend stub files') - parser.add_argument( - '-s', - '--source_yaml', - help='path to source yaml file containing operator external definitions') - parser.add_argument( - '-o', '--output_dir', help='output directory') - parser.add_argument( - '--dry_run', type=bool, default=False, help='output directory') - parser.add_argument( - '--impl_path', type=str, default=None, help='path to the source C++ file containing kernel definitions') - options = parser.parse_args() - - run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path) - - -def gen_dispatchkey_nativefunc_headers( - fm: FileManager, - class_name: str, - cpp_namespace: str, - backend_indices: Dict[DispatchKey, BackendIndex], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - backend_dispatch_key: DispatchKey, - autograd_dispatch_key: Optional[DispatchKey]) -> None: - assert class_name is not None - generated_comment = 'Autogenerated file by gen_backend_stubs.py. Do not edit directly!' - - # Convert to a set first to remove duplicate kernel names. 
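error_on_missing_kernels above counts kernel definitions by running a regex over the backend's C++ implementation file. A standalone run of that same pattern against a toy source string (the class and kernel names here are made up):

import re
from collections import Counter

class_name = 'XLANativeFunctions'   # hypothetical backend class
backend_defns = """
at::Tensor XLANativeFunctions::abs(const at::Tensor& self) {
  // ...
}
at::Tensor XLANativeFunctions::add(const at::Tensor& self,
                                   const at::Tensor& other) {
  // ...
}
"""

# Same pattern as the deleted code: ClassName::kernel_name(args) {
kernel_defn_regex = rf'{class_name}::([\w\d]*)\([^\)]*\)\s*{{'
print(Counter(re.findall(kernel_defn_regex, backend_defns)))
# Counter({'abs': 1, 'add': 1})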
- # Backends are allowed to repeat kernel names; only generate the declaration once! - # Sort for deterministic output. - backend_declarations = list(sorted(set(concatMap( - lambda f: dest.compute_native_function_declaration(f, backend_indices[backend_dispatch_key]), - grouped_native_functions)))) - autograd_declarations = list(sorted(set(concatMap( - lambda f: [] if autograd_dispatch_key is None else - dest.compute_native_function_declaration(f, backend_indices[autograd_dispatch_key]), - grouped_native_functions)))) - - fm.write_with_template(f'{backend_dispatch_key}NativeFunctions.h', 'DispatchKeyNativeFunctions.h', lambda: { - 'generated_comment': generated_comment, - 'cpp_namespace': cpp_namespace, - 'class_name': class_name, - 'dispatch_declarations': backend_declarations + autograd_declarations, - }) - - -def gen_dispatcher_registrations( - fm: FileManager, - output_dir: str, - cpp_namespace: str, - backend_indices: Dict[DispatchKey, BackendIndex], - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], - backend_dispatch_key: DispatchKey, - dispatch_key: DispatchKey, - selector: 'SelectiveBuilder') -> None: - backend_index = backend_indices[dispatch_key] - fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: { - 'extra_cuda_headers': '', - 'external_backend_headers': f'#include "{output_dir}/{backend_dispatch_key}NativeFunctions.h"', - 'ops_headers': '#include ', - 'DispatchKey': dispatch_key, - 'dispatch_namespace': dispatch_key.lower(), - 'dispatch_headers': dest.gen_registration_headers(backend_index, per_operator_headers=False, rocm=False), - 'dispatch_helpers': dest.gen_registration_helpers(backend_index), - 'dispatch_namespaced_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.NAMESPACED_DEFINITION, - selector, - rocm=False, - cpp_namespace=cpp_namespace, - class_method_name=f'{backend_dispatch_key}NativeFunctions'), - grouped_native_functions - )), - 'dispatch_anonymous_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.ANONYMOUS_DEFINITION, - selector, - rocm=False, - cpp_namespace=cpp_namespace, - class_method_name=f'{backend_dispatch_key}NativeFunctions'), - grouped_native_functions - )), - 'dispatch_registrations': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.REGISTRATION, - selector, - rocm=False, - cpp_namespace=cpp_namespace, - class_method_name=f'{dispatch_key}NativeFunctions'), - grouped_native_functions - )), - }) - -def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str] = None) -> None: - - # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py - pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() - template_dir = os.path.join(pytorch_root, "aten/src/ATen/templates") - - def make_file_manager(install_dir: str) -> FileManager: - return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=dry_run) - - fm = make_file_manager(output_dir) - - native_yaml_path = os.path.join(pytorch_root, 'aten/src/ATen/native/native_functions.yaml') - parsed_yaml = parse_native_yaml(native_yaml_path) - native_functions, backend_indices = parsed_yaml.native_functions, parsed_yaml.backend_indices - grouped_native_functions = get_grouped_native_functions(native_functions) - parsed_backend_yaml = parse_backend_yaml(source_yaml, grouped_native_functions, backend_indices) - backend_key = parsed_backend_yaml.backend_key - autograd_key = 
parsed_backend_yaml.autograd_key - cpp_namespace = parsed_backend_yaml.cpp_namespace - backend_indices = parsed_backend_yaml.backend_indices - - selector = SelectiveBuilder.get_nop_selector() - - - if backend_key is None: - # This could be useful if a backend wants to quickly set up a noop yaml file but doesn't have any kernels ready yet. - return - - class_name = backend_indices[backend_key].native_function_class_name() - - if impl_path is not None: - error_on_missing_kernels(native_functions, backend_indices, backend_key, autograd_key, impl_path) - - - gen_dispatchkey_nativefunc_headers(fm, class_name, cpp_namespace, backend_indices, - grouped_native_functions, backend_key, autograd_key) - - for dispatch_key in [backend_key] if autograd_key is None else [backend_key, autograd_key]: - gen_dispatcher_registrations(fm, output_dir, cpp_namespace, backend_indices, grouped_native_functions, - backend_key, dispatch_key, selector) -if __name__ == '__main__': - main() diff --git a/tools/codegen/gen_functionalization_type.py b/tools/codegen/gen_functionalization_type.py deleted file mode 100644 index 6666a493be74..000000000000 --- a/tools/codegen/gen_functionalization_type.py +++ /dev/null @@ -1,365 +0,0 @@ -from tools.codegen.api import cpp -from tools.codegen.api.types import ( - DispatcherSignature, Binding, FunctionalizationLambda, ViewInverseSignature -) -from tools.codegen.api.translate import translate -from tools.codegen.context import with_native_function -from tools.codegen.model import ( - Argument, NativeFunction, SchemaKind, BackendIndex, - Tag, FunctionSchema, SelfArgument, TensorOptionsArguments, BaseType, BaseTy -) -from tools.codegen.selective_build.selector import SelectiveBuilder -from typing import List, Optional, Union, Tuple -from tools.codegen.utils import mapMaybe - -def modifies_arguments(f: NativeFunction) -> bool: - return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] - -# This function constructs the return statement for the kernels that contain mutations -# It mostly just needs to special case multi-output returns to wrap the result in a tuple -def return_str(f: NativeFunction) -> str: - if len(f.func.arguments.out) != 0: - if len(f.func.arguments.out) > 1: - return_names = ', '.join(a.name for a in f.func.arguments.out) - return f'return {DispatcherSignature.from_schema(f.func).returns_type().cpp_type()}({return_names});' - else: - return f'return {f.func.arguments.out[0].name}' - if f.func.arguments.self_arg is not None: - return f'return {f.func.arguments.self_arg.argument.name}' - return '' - -def wrapper_name(func: FunctionSchema) -> str: - if func.name.overload_name: - return f'{cpp.name(func)}_{func.name.overload_name}' - else: - return cpp.name(func) - -def is_tensor_like(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> bool: - return isinstance(a, SelfArgument) or (isinstance(a, Argument) and a.type.is_tensor_like()) - -# unwraps all tensor-like arguments, returning: -# (1) a string containing all of the logic that does the unwrapping -# (2) a context, to be used by translate(), with all of the relevant bindings. -def unwrap_tensor_args(sig: DispatcherSignature) -> Tuple[str, List[Binding]]: - context: List[Binding] = [] - unwrapped_tensor_args: List[str] = [] - for arg in sig.arguments(): - if is_tensor_like(arg.argument): - # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. 
- unwrapped_name = f'{arg.name}_' - unwrapped_tensor_args.append( - f'auto {unwrapped_name} = at::functionalization::impl::from_functional_tensor({arg.name});') - context.append(arg.with_name(unwrapped_name)) - else: - # for non-tensor inputs, we want to pass them directly into the redispatch calls. - context.append(arg) - unwrap_tensor_args_str = '\n '.join(unwrapped_tensor_args) - return unwrap_tensor_args_str, context - -# converts all tensor-like arguments to meta tensors, which are used to compute stride info. Returns: -# (1) a string containing all of the logic that does the conversions. -# (2) a context, to be used by translate(), with all of the relevant bindings. -def convert_to_meta_tensors(sig: DispatcherSignature) -> Tuple[str, List[Binding]]: - context: List[Binding] = [] - unwrapped_tensor_args: List[str] = [] - for arg in sig.arguments(): - if is_tensor_like(arg.argument): - # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. - # for tensor inputs, we want to unwrap them before passing them into the redispatch calls. - a_ = arg.name - unwrapped_name = f'{arg.name}_meta' - unwrapped_tensor_args.append( - f"auto {unwrapped_name} = at::native::empty_strided_meta({a_}.sizes(), {a_}.strides(), \ -/*dtype=*/c10::make_optional({a_}.scalar_type()), /*layout=*/c10::make_optional({a_}.layout()), \ -/*device=*/c10::make_optional(c10::Device(kMeta)), /*pin_memory=*/c10::nullopt);" - ) - context.append(arg.with_name(unwrapped_name)) - else: - # for non-tensor inputs, we want to pass them directly into the redispatch calls. - context.append(arg) - unwrap_tensor_args_str = '\n '.join(unwrapped_tensor_args) - return unwrap_tensor_args_str, context - -# The functionalization codegen currently expects view op schemas to have this form: -# foo(Tensor(a), ...) -> Tensor(a) (e.g. transpose) -# foo(Tensor(a!), ...) -> Tensor(a!) (e.g. transpose_) -def assert_view_op_properties(func: FunctionSchema) -> None: - def is_alias(a: Argument) -> bool: - return a.annotation is not None - - args = func.arguments.flat_non_out - # The first argument is a tensor with an alias semantics (annotations) - assert len(args) > 0 and args[0].type == BaseType(BaseTy.Tensor), \ - f"""In the functionalization codegen, we expect the first argument of every view operator to be a tensor, -but found an argument of type {str(args[0].type)} for operator: {str(func.name)}.""" - # No other arguments have aliasing semantics - assert is_alias(args[0]) and not any(is_alias(a) for a in args[1:]), \ - """In the functionalization codegen, we expect the first argument of every view operator to alias the output. -View operators with multiple aliasing inputs aren't supported yet. Found an operator that doesn't satisfy this constraint""" - -# Generates the Functionalization kernel for: -# - ops that create aliases (e.g. transpose()) -# - ops that are views AND mutations (e.g. transpose_()) -def emit_view_functionalization_body( - f: NativeFunction, - functional_op: NativeFunction -) -> str: - # view op case - assert f.is_view_op - - if f.tag is Tag.inplace_view: - # This op is both an inplace op AND a view op. - # See Note [Functionalization Pass - Inplace View Ops] for details. - # I currently have the view meta call into the out-of-place variant of the view, to avoid - # having to define an extra ~20 inplace {view}_inverse_ functions. - # Most view ops don't have NativeFunctionGroup's both, because we don't define out= variants for view ops. 
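assert_view_op_properties above encodes the shape every view-op schema must have: the first argument is the aliased Tensor, and no other argument carries an alias annotation. A toy version of that check over simplified (type, has_annotation) pairs, using transpose's schema as the example:

from typing import List, Tuple

# Simplified stand-in for the real Argument objects:
# transpose(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
transpose_args: List[Tuple[str, bool]] = [
    ('Tensor', True),   # Tensor(a) self -- aliased
    ('int', False),     # dim0
    ('int', False),     # dim1
]

def check_view_op_properties(args: List[Tuple[str, bool]]) -> None:
    # First argument must be a Tensor with an alias annotation ...
    assert args and args[0][0] == 'Tensor' and args[0][1]
    # ... and no other argument may alias anything.
    assert not any(is_alias for _, is_alias in args[1:])

check_view_op_properties(transpose_args)   # passes for transpose-style schemas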
- # I'm assuming that every inplace-view op has a corresponding out-of-place view op, - # with the same name but the trailing underscore removed. - # This is currently asserted at parse time in gen.py (see error_check_native_functions). - assert f.func.kind() is SchemaKind.inplace - # Requirement: Every inplace_view op needs to have a corresponding functional view op, which we paired together beforehand. - assert functional_op is not None - api_name = functional_op.func.name.unambiguous_name() - call_sig = DispatcherSignature.from_schema(functional_op.func) - else: - api_name = f.func.name.unambiguous_name() - call_sig = DispatcherSignature.from_schema(f.func) - - dispatcher_sig = DispatcherSignature.from_schema(f.func) - assert_view_op_properties(f.func) - view_tensor_name = dispatcher_sig.arguments()[0].name - - keyset = 'dispatchKeySet & c10::after_func_keyset' - return_type = dispatcher_sig.returns_type().remove_const_ref().cpp_type() - - unwrap_tensor_args_str, unwrapped_args_ctx = unwrap_tensor_args(dispatcher_sig) - view_redispatch_args = [keyset] + [e.expr for e in translate(unwrapped_args_ctx, call_sig.arguments(), method=False)] - - forward_lambda = FunctionalizationLambda.from_func(f, functional_op=functional_op, is_reverse=False) - reverse_lambda = FunctionalizationLambda.from_func(f, functional_op=functional_op, is_reverse=True) - - # The meta API call should use the same arguments, but convert all tensors to meta tensors first. - meta_conversion_str, meta_call_ctx = convert_to_meta_tensors(dispatcher_sig) - meta_call_args = [e.expr for e in translate(meta_call_ctx, call_sig.arguments(), method=False)] - - if f.tag is Tag.inplace_view: - # See Note [Functionalization Pass - Inplace View Ops] for more details - return f""" - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - {forward_lambda.decl()} {{ - return {forward_lambda.inner_call()} - }}, - {reverse_lambda.decl()} {{ - return {reverse_lambda.inner_call()} - }} - ); - at::functionalization::impl::mutate_view_meta({view_tensor_name}, view_meta); - {unwrap_tensor_args_str} - {return_type} reference_tensor_output; - {{ - at::AutoDispatchSkipFunctionalize guard; - {meta_conversion_str} - reference_tensor_output = at::_ops::{api_name}::call({', '.join(meta_call_args)}); - }} - // See Note [Propagating strides in the functionalization pass] - at::functionalization::impl::set_sizes_strides_offset({view_tensor_name}, reference_tensor_output); - return {view_tensor_name}; -""" - - else: - return f""" - {unwrap_tensor_args_str} - {return_type} tmp_output; - {return_type} reference_tensor_output; - {{ - at::AutoDispatchSkipFunctionalize guard; - {meta_conversion_str} - reference_tensor_output = at::_ops::{api_name}::call({', '.join(meta_call_args)}); - tmp_output = at::_ops::{api_name}::redispatch({', '.join(view_redispatch_args)}); - // I'm fusing the [alias removal], [mutation removal], [add views back] passes together. - // Later, we'll want to turn them into separate passes (since e.g. vulkan only cares about alias removal). 
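The reference_tensor_output machinery above works because meta tensors carry sizes and strides but no storage, so the meta kernel can report what strides the real view would have produced. A quick eager-mode illustration (this assumes a torch build with meta-device support and is not part of the generated kernel):

import torch

# A meta tensor has shape/stride/dtype metadata but no data to read.
x = torch.empty_strided((2, 3), (3, 1), device='meta')
y = x.transpose(0, 1)        # runs the meta kernel only

print(y.shape)     # torch.Size([3, 2])
print(y.stride())  # (1, 3)
# y holds no values; the functionalization pass only copies its sizes and
# strides back onto the real output via set_sizes_strides_offset above.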
- }} - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - {forward_lambda.decl()} {{ - return {forward_lambda.inner_call()} - }}, - {reverse_lambda.decl()} {{ - return {reverse_lambda.inner_call()} - }} - ); - auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, {view_tensor_name}, view_meta); - // See Note [Propagating strides in the functionalization pass] - at::functionalization::impl::set_sizes_strides_offset(out, reference_tensor_output); - return out; -""" - -# Generates the Functionalization kernel for inplace ops -def emit_inplace_functionalization_body( - f: NativeFunction, - functional_op: Optional[NativeFunction] -) -> str: - # mutation case - assert(modifies_arguments(f)) - - dispatcher_sig = DispatcherSignature.from_schema(f.func) - - keyset = 'dispatchKeySet & c10::after_func_keyset' - return_type = dispatcher_sig.returns_type().remove_const_ref().cpp_type() - - unwrap_tensor_args_str, unwrapped_args_ctx = unwrap_tensor_args(dispatcher_sig) - - maybe_return = '' if len(f.func.returns) == 0 else 'return ' - sync_tensor_args = '\n '.join(mapMaybe( - lambda arg: f'at::functionalization::impl::sync({arg.name});' - if arg.type.is_tensor_like() else None, - f.func.arguments.flat_all)) - - # Note [functionalizating copy_() and not preserving strides] - # copy_() can't be functionalized, since there doesn't exist an out-of-place variant. - # We could add one, but that would be sub-optimal for functorch: copy() would need to allocate a fresh tensor. - # This may seem like a large hack for one optimization, but copy_() is one of the most common inplace operators. - # Instead, we can replace `self.copy_(src)` with `src.to(self).expand_as(self)`. - # This maintains the exact same semantics, EXCEPT that we don't preserve the strides from `self`. - # This seems like a reasonable tradeoff, for a few reasons: - # - mutation removal is only used by functorch, and not by Vulkan or XLA. Functorch already doesn't preserve strides. - # - There are actually a few other places where the functionalization pass currently doesn't support strides: - # calls to slice/diagonal_scatter don't currently preserve the strides of their inputs (but maybe we should fix this). - if str(f.func.name) == 'copy_': - exprs = [keyset] + [a.name for a in unwrapped_args_ctx] - functional_call_str = f"""\ - auto tmp_intermediate = at::_ops::to_other::redispatch({keyset}, src_, self_, non_blocking, false, c10::nullopt); - tmp_output = at::_ops::expand_as::redispatch({keyset}, tmp_intermediate, self_);""" - elif functional_op is None: - # We can't functionalize this inplace op, since we don't know what the corresponding functional op is. - inplace_exprs = [keyset] + [e.expr for e in translate(unwrapped_args_ctx, dispatcher_sig.arguments(), method=False)] - warn_str = "Note: the functionalization pass encountered an operator ({}) that it could not functionalize, \ -because it couldn't find an out-of-place equivalent of the operator to call. \ -Instead, it's calling the inplace/view operator directly. \ -If this causes problems in your program, consider upstreaming the out-of-place op to PyTorch.".format(str(f.func.name)) - - return f""" - if (c10::impl::tls_local_dispatch_key_set().included_.has(c10::DispatchKey::Functionalize)) {{ - TORCH_WARN("{warn_str}"); - }} - {sync_tensor_args} - {unwrap_tensor_args_str} - at::AutoDispatchSkipFunctionalize guard; - // Redispatch as normally otherwise, since XLA has its own lowerings for special inplace ops. 
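The copy_() note above claims that self.copy_(src) and src.to(self).expand_as(self) produce the same values while giving up self's strides. A small eager-mode check of that claim, written against public torch APIs as an illustration rather than the generated code path:

import torch

self_t = torch.zeros(2, 3, dtype=torch.float64)
src = torch.arange(3, dtype=torch.float32)   # different dtype, broadcastable shape

# Reference: the in-place copy broadcasts src and converts its dtype.
expected = self_t.clone()
expected.copy_(src)

# Functionalized form from the note: to() matches dtype/device, expand_as broadcasts.
functional = src.to(self_t).expand_as(self_t)

print(torch.equal(expected, functional))        # True -- values match
print(expected.stride(), functional.stride())   # (3, 1) (0, 1) -- strides differ,
# which is exactly the trade-off the note accepts.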
- {maybe_return}at::_ops::{f.func.name.unambiguous_name()}::redispatch({', '.join(inplace_exprs)}); -""" - else: - # call the out-of-place variant of the op - functional_sig = DispatcherSignature.from_schema(functional_op.func) - functional_exprs = [keyset] + [e.expr for e in translate(unwrapped_args_ctx, functional_sig.arguments(), method=False)] - functional_call_str = \ - f"tmp_output = at::_ops::{functional_op.func.name.unambiguous_name()}::redispatch({', '.join(functional_exprs)});" - - mutable_input_post_processing = '\n'.join([ - f""" - auto {a.name}_functional = at::functionalization::impl::unsafeGetFunctionalWrapper({a.name}); - {a.name}_functional->replace_(tmp_output); - {a.name}_functional->commit_update();""" - for a in f.func.arguments.flat_non_out - if a.annotation and a.annotation.is_write and a.type.is_tensor_like()]) - - return f""" - {sync_tensor_args} - {unwrap_tensor_args_str} - {return_type} tmp_output; - {{ - at::AutoDispatchSkipFunctionalize guard; - // The functionalization pass explicitly doesn't pass out= parameters to the redispatch - {functional_call_str} - }} - {mutable_input_post_processing} - {return_str(f)};""" - - -def emit_declaration_for_noncomposite_views(f: NativeFunction) -> str: - # For every view op, we need a corresponding "inverse view" function. - # This generates the declarations so we get a good compiler error when someone adds a new view. - view_inverse_sig = ViewInverseSignature(f) - return view_inverse_sig.decl() - - -# The below functions generate RegisterFunctionalization.cpp -# These files provide the kernels that run the functionalization pass, which can be opted into -# per backend (e.g. XLA or Vulkan), or as a composable transform (functionalize() in functorch). - -def needs_functionalization( - selector: SelectiveBuilder, - f: NativeFunction, -) -> bool: - return (selector.include_all_operators and - (f.is_view_op or modifies_arguments(f))) - - -def gen_functionalization_registration( - selector: SelectiveBuilder, - f: NativeFunction, - composite_implicit_autograd_index: BackendIndex -) -> Optional[str]: - @with_native_function - def emit_registration_helper(f: NativeFunction) -> Optional[str]: - # Note: for now, this logic is meant to avoid registering functionalization kernels for mobile. - # At some point, Vulkan we'll want to use functionalization and we'll need to change this. - if not needs_functionalization(selector, f): - return None - if f.is_view_op and f.has_composite_implicit_autograd_kernel: - metadata = composite_implicit_autograd_index.get_kernel(f) - assert metadata is not None - native_api_name = metadata.kernel - sig = DispatcherSignature.from_schema(f.func) - # Note [Composite view ops in the functionalization pass] - # We don't need to worry about implemententing functionalization kernels for views with - # CompositeImplicitAutograd kernels, because we can just decompose them into their base operators. - # We can't just opt the entire Functionalization dispatch key into the composite keyset though, - # because we don't want to decompose non-view ops that are composite, like `at::ones`. 
- registration_str = f'static_cast<{sig.ptr_type()}>(at::native::{native_api_name})' - else: - registration_str = f'TORCH_FN(functionalization::{wrapper_name(f.func)})' - - return f'm.impl("{f.func.name}", {registration_str});' - - return emit_registration_helper(f) - -def gen_functionalization_definition( - selector: SelectiveBuilder, - f: NativeFunction, - functional_op: Optional[NativeFunction] -) -> Optional[str]: - @with_native_function - def emit_definition_helper(f: NativeFunction) -> Optional[str]: - if not needs_functionalization(selector, f): - return None - if f.is_view_op and f.has_composite_implicit_autograd_kernel: - # See Note [Composite view ops in the functionalization pass] - return None - # order is important here, ops that are both views and mutations should hit the view path. - if f.is_view_op: - # Every view op is expected to have a functional counterpart (e.g. transpose_() -> transpose()) - assert functional_op is not None - body_str = emit_view_functionalization_body(f, functional_op) - else: - # inplace op - assert modifies_arguments(f) - body_str = emit_inplace_functionalization_body(f, functional_op) - sig = DispatcherSignature.from_schema(f.func) - return f""" - {sig.defn(name=wrapper_name(f.func), is_redispatching_fn=True)} {{ - {body_str} - }} - """ - - return emit_definition_helper(f) - -# See Note [Functionalization Pass: View Inverses]. -@with_native_function -def gen_functionalization_view_inverse_declaration(f: NativeFunction) -> Optional[str]: - # We only need to generate view_inverse declarations for view ops that: - # - aren't composite (since they'll decompose and we'll get them for free). - # - aren't inplace (since they should have a corresponding functional version, which we call instead). - if f.is_view_op and not f.has_composite_implicit_autograd_kernel and not modifies_arguments(f): - output = emit_declaration_for_noncomposite_views(f) - return output - return None diff --git a/tools/codegen/gen_lazy_tensor.py b/tools/codegen/gen_lazy_tensor.py deleted file mode 100644 index b2515d3d083c..000000000000 --- a/tools/codegen/gen_lazy_tensor.py +++ /dev/null @@ -1,227 +0,0 @@ -import pathlib -import argparse -import os -import yaml -from collections import namedtuple -from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple -from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml -from tools.codegen.model import (FunctionSchema, - NativeFunction, NativeFunctionsGroup, OperatorName) -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.utils import concatMap, YamlLoader, FileManager -import tools.codegen.dest as dest -from .gen_backend_stubs import (parse_backend_yaml, error_on_missing_kernels, - gen_dispatchkey_nativefunc_headers, - gen_dispatcher_registrations) - -# Parses the external backend's yaml, and adds a new BackendIndex for the backend's dispatch key. 
-# Returns a Tuple of (backend_key, autograd_key, cpp_namespace, updated BackendIndex mapping, full_codegen) -ParsedExternalYaml = namedtuple('ParsedExternalYaml', [ - 'backend_key', 'autograd_key', 'cpp_namespace', 'backend_indices', 'full_codegen']) - - -def parse_full_codegen_ops( - backend_yaml_path: str, - grouped_native_functions: Sequence[Union[NativeFunction, NativeFunctionsGroup]], -) -> List[OperatorName]: - - native_functions_map: Dict[OperatorName, NativeFunction] = { - f.func.name: f - for f in concatMap(lambda f: [f] if isinstance(f, NativeFunction) else list(f.functions()), grouped_native_functions) - } - - with open(backend_yaml_path, 'r') as f: - yaml_values = yaml.load(f, Loader=YamlLoader) - assert isinstance(yaml_values, dict) - - full_codegen = yaml_values.pop('full_codegen', []) - assert isinstance(full_codegen, list), f'expected "full_codegen" to be a list, but got: {full_codegen}' - full_codegen = [OperatorName.parse(name) for name in full_codegen] - - return full_codegen - - -def main() -> None: - parser = argparse.ArgumentParser(description='Generate Lazy Tensor backend files') - parser.add_argument( - '-s', - '--source_yaml', - help='path to source yaml file containing operator external definitions') - parser.add_argument( - '-o', '--output_dir', help='output directory') - parser.add_argument( - '--dry_run', type=bool, default=False, help='output directory') - parser.add_argument( - '--impl_path', type=str, default=None, help='path to the source C++ file containing kernel definitions') - parser.add_argument( - '--gen_ts_lowerings', action="store_true", help='Generate TorchScript lowerings in addition to Lazy IR and NativeFunctions') - parser.add_argument( - '--node_base', type=str, default="Node", help='Name of backend specific custom Lazy IR Node base class') - parser.add_argument( - '--node_base_hdr', type=str, default=None, help='Path to header file defining custom Lazy IR Node base class') - parser.add_argument( - '--tensor_class', type=str, default="LazyTensor", help='Name of backend specific custom Lazy Tensor class') - parser.add_argument( - '--tensor_class_hdr', type=str, default="lazy_tensor_core/csrc/tensor.h", - help='Path to header file defining custom Lazy Tensor class') - options = parser.parse_args() - - run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path, - options.gen_ts_lowerings, options.node_base, options.node_base_hdr, - options.tensor_class, options.tensor_class_hdr) - - -def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str], - gen_ts_lowerings: bool, node_base: str, node_base_hdr: Optional[str], - tensor_class: str, tensor_class_hdr: str) -> None: - - # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py - pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() - template_dir = os.path.join(pytorch_root, "aten/src/ATen/templates") - - def make_file_manager(install_dir: str) -> FileManager: - return FileManager(install_dir=install_dir, template_dir=template_dir, dry_run=dry_run) - - fm = make_file_manager(output_dir) - - native_yaml_path = os.path.join(pytorch_root, 'aten/src/ATen/native/native_functions.yaml') - parsed_yaml = parse_native_yaml(native_yaml_path) - native_functions, backend_indices = parsed_yaml.native_functions, parsed_yaml.backend_indices - grouped_native_functions = get_grouped_native_functions(native_functions) - - def sort_native_function(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: - """ - We sort the native 
function because of the note in concat_map_codegen. - TODO(alanwaketan): Remove this sorting hack once all ops are grouped properly. - """ - func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func - return str(func.name.name) - - grouped_native_functions = sorted(grouped_native_functions, key=sort_native_function) - parsed_backend_yaml = parse_backend_yaml(source_yaml, grouped_native_functions, backend_indices) - backend_key = parsed_backend_yaml.backend_key - autograd_key = parsed_backend_yaml.autograd_key - cpp_namespace = parsed_backend_yaml.cpp_namespace - backend_indices = parsed_backend_yaml.backend_indices - full_codegen = parse_full_codegen_ops(source_yaml, grouped_native_functions) - - def concat_map_codegen(func: Callable[[NativeFunction], Sequence[str]], - xs: Iterable[Union[NativeFunctionsGroup, NativeFunction]], - *, codegenInplaceVariant: bool = False) -> Iterator[str]: - """ - We code-gen for the functional variant, which is all we need for IR classes/lowerings/shape inferences, but we - only code-gen additional entries for the inplace variant for the native functions. - Note: If xs is not sorted, there may be an edge case when generating IR classes. Considering relu and relu_, if - we encounter relu_ before relu. we will then generate an IR class with op = at::aten::relu_ for both relu and - relu_ which will cause problems for relu. - TODO(alanwaketan): Once all ops are grouped properly, we should no longer need this hack. - """ - generated = set() - - def gen_key(func: FunctionSchema) -> Tuple[str, str]: - # we want to generate unique entries for overloads of functional variants, - # but not for inplace variants unless explicitly told `codegenInplaceVariant` - return (func.name.name.base, func.name.overload_name) - - for x in xs: - f = x.functional if isinstance(x, NativeFunctionsGroup) else x - # For the 'or'd terms: - # 1. codegenInplaceVariant means we can generate the in-place variant corresponding items. - # 2. not f.func.name.name.inplace means the op is not a in-place variant, so we can generate the item. - # 3. f.func.name.name.base not in generated means even for in-place ops we still need to generate the item - # as if they were the functional variants for one time. 
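As a rough sketch of the dedup rule spelled out in the comment above (the op names and the base-name heuristic here are simplifications, not the real gen_key logic): sorting puts the functional variant first, and a seen-set keyed on the base name keeps an inplace variant such as relu_ from shadowing relu.

from typing import Iterable, Iterator

def unique_functional_first(op_names: Iterable[str]) -> Iterator[str]:
    """Yield one representative per base name, preferring the functional variant."""
    seen = set()
    for name in sorted(op_names):        # 'relu' sorts before 'relu_'
        base = name.rstrip('_')          # crude stand-in for func.name.name.base
        if base not in seen:
            seen.add(base)
            yield name

print(list(unique_functional_first(['relu_', 'tanh', 'relu', 'tanh_'])))
# ['relu', 'tanh']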
- if f.func.name in full_codegen and \ - (codegenInplaceVariant or not f.func.name.name.inplace or gen_key(f.func) not in generated): - generated.add(gen_key(f.func)) - for r in func(f): - yield r - - selector = SelectiveBuilder.get_nop_selector() - - assert backend_key is not None - class_name = backend_indices[backend_key].native_function_class_name() - - if impl_path is not None: - error_on_missing_kernels(native_functions, backend_indices, backend_key, - autograd_key, impl_path, full_codegen) - - assert class_name is not None - - # Generate nativefunction declarations - gen_dispatchkey_nativefunc_headers(fm, class_name, cpp_namespace, backend_indices, - grouped_native_functions, backend_key, autograd_key) - - # Generate Dispatcher registrations which hook up the nativefunctions - for dispatch_key in [backend_key] if autograd_key is None else [backend_key, autograd_key]: - gen_dispatcher_registrations(fm, output_dir, cpp_namespace, backend_indices, grouped_native_functions, - backend_key, dispatch_key, selector) - - # Generate native function impls that build IR nodes - fm.write_with_template(f'{backend_key}NativeFunctions.cpp', 'DispatchKeyNativeFunctions.cpp', lambda: { - 'includes': [f'#include <{path}>' for path in [ - tensor_class_hdr, - "ATen/MetaFunctions.h", - "torch/csrc/lazy/core/metrics.h", - "torch/csrc/lazy/core/shape.h", - "lazy_tensor_core/csrc/aten_ltc_bridge.h", - "lazy_tensor_core/csrc/lazy_graph_executor.h", - f"{output_dir}/{backend_key}NativeFunctions.h", - f"{output_dir}/{backend_key}LazyIr.h", - f"{output_dir}/{backend_key}ShapeInference.h", - ]], - 'native_functions_include': '', - 'backend_namespace': 'torch_lazy_tensors', # this is wrong - 'native_function_definitions': - list(concat_map_codegen( - dest.GenLazyNativeFuncDefinition(f'{backend_key}NativeFunctions', - backend_indices[backend_key], - tensor_class), - grouped_native_functions, - codegenInplaceVariant=True - )), - }) - # Generate headers for shape/dtype funcs for non-meta kernels - fm.write_with_template(f'{backend_key}ShapeInference.h', 'ShapeInference.h', lambda: { - 'lazy_ir_sysinc': [f'#include <{path}>' for path in [ - "ATen/Tensor.h", - "c10/core/ScalarType.h", - "c10/util/Optional.h", - "torch/csrc/lazy/core/ir.h", - "torch/csrc/lazy/core/shape.h", - "vector", - ]], - 'lazy_ir_inc': [], - 'DispatchKey': backend_key, - 'dispatch_namespace': backend_key.lower(), - 'func_declarations': list(concat_map_codegen( - dest.GenLazyShapeInferenceDefinition(backend_indices[backend_key], - tensor_class), - grouped_native_functions - )), - }) - # Generate IR node classes - fm.write_with_template(f'{backend_key}LazyIr.h', 'LazyIr.h', lambda: { - 'lazy_ir_sysinc': [f'#include <{path}>' for path in [ - "ATen/core/Formatting.h", - "c10/core/ScalarType.h", - "c10/util/Optional.h", - "torch/csrc/lazy/core/hash.h", - "torch/csrc/lazy/core/ir.h", - "vector", - ]], - 'lazy_ir_inc': [f'#include "{path}"' for path in [ - node_base_hdr if node_base_hdr is not None else None - ] if path is not None], - 'external_backend_headers': f'#include "{output_dir}/{backend_key}NativeFunctions.h"', - 'namespaced_headers': '', - 'DispatchKey': backend_key, - 'dispatch_namespace': backend_key.lower(), - 'ir_declarations': list(concat_map_codegen( - dest.LazyIR(backend_indices[backend_key], node_base), - grouped_native_functions - )), - }) - - -if __name__ == '__main__': - main() diff --git a/tools/codegen/model.py b/tools/codegen/model.py deleted file mode 100644 index 6bc0d7df1002..000000000000 --- a/tools/codegen/model.py +++ 
/dev/null @@ -1,1639 +0,0 @@ -import re - -from tools.codegen.utils import assert_never - -from dataclasses import dataclass -from typing import List, Dict, Optional, Iterator, Tuple, Set, Sequence, Callable, Union -from enum import Enum, auto -import itertools - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# DATA MODEL -# -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# -# Some general principles for our data model. -# -# - Stop using C++ data types as the internal data representation -# format. Instead, the internal data structures are centered -# around JIT schema representation. This avoid a big problem -# with the old codegen where we read in all the types from -# native_functions.yaml and then immediately had to retranslate -# them into C++ types. -# -# - More semantic data representation. Instead of representing -# everything as dicts and strings, we define dataclasses for -# every interesting entity the code generation has to deal with. -# These dataclasses have strong semantic invariants: for example, -# we generally require them to roundtrip losslessly into the -# form they were parsed from. These structures are immutable -# and you're expected to populate information once during -# construction. - -# Represent a source location; used for better error reporting -@dataclass(frozen=True) -class Location: - file: str - line: int - - def __str__(self) -> str: - return "{}:{}".format(self.file, self.line) - -# Valid values of the 'variants' field in native_functions.yaml -Variant = Enum('Variant', ('function', 'method')) - -# NOTE: Keep the list in sync with `DispatchKey` in c10/core/DispatchKey.h -class DispatchKey(Enum): - Undefined = 0 - CatchAll = Undefined - - CPU = auto() - CUDA = auto() - HIP = auto() - FPGA = auto() - ORT = auto() - XLA = auto() - Lazy = auto() - Vulkan = auto() - Metal = auto() - XPU = auto() - MKLDNN = auto() - OpenGL = auto() - OpenCL = auto() - IDEEP = auto() - QuantizedCPU = auto() - QuantizedCUDA = auto() - QuantizedXPU = auto() - CustomRNGKeyId = auto() - MkldnnCPU = auto() - SparseCPU = auto() - SparseCUDA = auto() - SparseCsrCPU = auto() - SparseCsrCUDA = auto() - SparseHIP = auto() - SparseXPU = auto() - NestedTensor = auto() - PrivateUse1 = auto() - PrivateUse2 = auto() - PrivateUse3 = auto() - EndOfBackendKeys = PrivateUse3 - - ZeroTensor = auto() - Meta = auto() - BackendSelect = auto() - Named = auto() - AutogradOther = auto() - AutogradCPU = auto() - AutogradCUDA = auto() - AutogradXLA = auto() - AutogradLazy = auto() - AutogradNestedTensor = auto() - AutogradXPU = auto() - AutogradPrivateUse1 = auto() - AutogradPrivateUse2 = auto() - AutogradPrivateUse3 = auto() - Tracer = auto() - Autocast = auto() - Batched = auto() - VmapMode = auto() - TESTING_ONLY_GenericWrapper = auto() - TESTING_ONLY_GenericMode = auto() - NumDispatchKeys = auto() - Autograd = auto() - CompositeImplicitAutograd = auto() - CompositeExplicitAutograd = auto() - EndOfAliasKeys = CompositeExplicitAutograd - - CPUTensorId = CPU - CUDATensorId = CUDA - PrivateUse1_PreAutograd = AutogradPrivateUse1 - PrivateUse2_PreAutograd = AutogradPrivateUse2 - PrivateUse3_PreAutograd = AutogradPrivateUse3 - - def __str__(self) -> str: - return self.name - - def lower(self) -> str: - return str(self).lower() - - @staticmethod - def parse(value: str) -> 'DispatchKey': - for k, v in DispatchKey.__members__.items(): - if k == value: - return v - raise AssertionError(f'unknown dispatch key {value}') - -STRUCTURED_DISPATCH_KEYS = 
{DispatchKey.CUDA, DispatchKey.CPU} - -# Dispatch keys that "support all backends". These codegen slightly differently -# then backend specific keys. -def is_generic_dispatch_key(dk: DispatchKey) -> bool: - return dk in {DispatchKey.CompositeExplicitAutograd, DispatchKey.CompositeImplicitAutograd} - -# CUDA specific dispatch keys -def is_cuda_dispatch_key(dk: DispatchKey) -> bool: - return dk in { - DispatchKey.CUDA, - DispatchKey.QuantizedCUDA, - DispatchKey.SparseCUDA, - DispatchKey.SparseCsrCUDA, - DispatchKey.AutogradCUDA, - DispatchKey.CUDATensorId, - } - -# Structured kernel generation is only supported for certain key types; -# otherwise use old-style -def is_structured_dispatch_key(dk: DispatchKey) -> bool: - return dk in STRUCTURED_DISPATCH_KEYS - -class DeviceCheckType(Enum): - NoCheck = 0 - ExactSame = 1 - -class Tag(Enum): - inplace_view = 0 - - def __str__(self) -> str: - return self.name - - @staticmethod - def parse(value: str) -> 'Tag': - for k, v in Tag.__members__.items(): - if k == value: - return v - raise AssertionError(f'unknown tag {value}') - -# The basic input to the code generation is native_functions.yaml. -# The name "native", BTW, comes from the distinction between native -# functions and legacy TH functions. The legacy TH functions are gone, -# but the "native" descriptor has stuck. -# -# NativeFunction models a single entry in native_functions.yaml. Its -# fields roughly correspond to what you would see in the YAML itself, -# but after canonicalization and parsing has occurred. -# -# You can see some of the overall design patterns for how we setup -# dataclasses in this class, but we will defer a complete discussion -# of this at FunctionSchema. -@dataclass(frozen=True) -class NativeFunction: - # The function schema of the operator in question. This schema - # has been parsed; see FunctionSchema for more about its structure. - # (This type is quoted as we are forward referencing a type - # defined later in the file. I opted for this ordering of the - # classes for expository clarity.) - func: 'FunctionSchema' - - # Whether or not to generate mutable tensor arguments like regular - # ones - use_const_ref_for_mutable_tensors: bool - - # Whether or not to omit automatic generation of a DeviceGuard - device_guard: bool - - # How to emit automatic generation of device check - device_check: DeviceCheckType - - # What python module to put the function in - python_module: Optional[str] - - # TODO: figure out what this does - category_override: Optional[str] - - # If no variants are specified in native_functions.yaml, this is - # assumed to be {'function'}. - variants: Set[Variant] - - # Whether or not we should skip generating registrations for - # this kernel. This is a bit of a double-edged sword, as manual - # registrations don't participate in codegen-based selective build! - manual_kernel_registration: bool - - # Whether or not to skip generating TensorMethod/Functions bindings - # for this kernel. Technically, this doesn't actually skip generating - # the binding; instead, the binding gets generated to __dispatch_{funcname} - # so you can make use of the normal binding if you need it. - manual_cpp_binding: bool - - # The location in the YAML file were this native function entry was - # defined. This is for conveniently reporting error messages! - loc: 'Location' - - # Whether or not this out functions is a "structured kernel". 
Structured - # kernels are defined a little differently from normal kernels; in - # particular, their shape checking logic is defined separately from - # the kernel. Only out functions can be structured; other functions - # delegate to the out function using the structured_delegate keyword. - # Every structured kernel must have at least an out and a functional - # variant. - structured: bool - - # Whether or not this non-out function is a structured kernel, defined - # in terms of the out kernel referenced by the string here. - structured_delegate: Optional['OperatorName'] - - # Only valid for structured kernels. Specifies alternative of what - # to inherit from when defining the meta class for the structured - # operator. This will usually be TensorIteratorBase. This also - # changes the semantics of set_output to call the parent class. - structured_inherits: Optional[str] - - # Structured kernels can declare elements as "precomputed". These elements - # are returned by the meta function in one struct and passed to the impl - # function in lieu of certain kernel arguments that these precomputed - # elements supersede. Information about the names and types of these - # precomputed elements and how they correspond to kernel arguments is stored - # in this member, if applicable. - precomputed: Optional['Precompute'] - - # Argument names whose default should be excluded from the C++ interface. - # Intended for resolving overload ambiguities between signatures. - cpp_no_default_args: Set[str] - - # Note [Abstract ATen methods] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # An abstract ATen method is one whose dispatch differs between - # types. These are implemented in derived types (with a - # standard (throwing) definition in Type). A concrete ATen - # method is one which has the same dispatch for all types; - # we just implement it in the base Type. This is exposed - # in Declarations.yaml via a field named 'abstract'. - is_abstract: bool - - # Whether or not the NativeFunction contains a backend-agnostic kernel - has_composite_implicit_autograd_kernel: bool - has_composite_explicit_autograd_kernel: bool - - # Tags are used to describe semantic information about (groups of) operators, - # That aren't easily inferrable directly from the operator's schema. - # For now operators have at most one tag. - tag: Optional['Tag'] - - # NB: The benefit of defining a dataclass is that we automatically get - # a constructor defined for all the fields we specify. No need - # to explicitly write it out. - - # We parse both the NativeFunction + backend-specific information about it, which it stored in a corresponding BackendIndex. 
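For orientation, a hedged sketch of the kind of native_functions.yaml entry that from_yaml (below) consumes; the kernel names are hypothetical. The backend-agnostic fields populate the NativeFunction, while the dispatch table becomes the per-backend metadata that is later folded into a BackendIndex.

import yaml

entry = yaml.safe_load("""
func: my_op(Tensor self, Tensor other) -> Tensor
variants: function, method
dispatch:
  CPU: my_op_cpu
  CUDA: my_op_cuda
""")

# from_yaml(entry, loc) would roughly return:
#   - a NativeFunction carrying the parsed FunctionSchema, variants, flags, ...
#   - {DispatchKey.CPU:  {my_op: BackendMetadata(kernel='my_op_cpu',  ...)},
#      DispatchKey.CUDA: {my_op: BackendMetadata(kernel='my_op_cuda', ...)}}
print(entry['func'], entry['dispatch'])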
- @staticmethod - def from_yaml( - ei: Dict[str, object], - loc: 'Location' - ) -> Tuple['NativeFunction', Dict[DispatchKey, Dict['OperatorName', 'BackendMetadata']]]: - """ - Parse a NativeFunction from a dictionary as directly parsed - from native_functions.yaml - """ - e = ei.copy() - - funcs = e.pop('func') - assert isinstance(funcs, str), f'not a str: {funcs}' - func = FunctionSchema.parse(funcs) - - cpp_no_default_args_list = e.pop('cpp_no_default_args', []) - assert isinstance(cpp_no_default_args_list, list) - cpp_no_default_args = set(cpp_no_default_args_list) - - use_const_ref_for_mutable_tensors = e.pop('use_const_ref_for_mutable_tensors', False) - assert isinstance(use_const_ref_for_mutable_tensors, bool) - - variants_s = e.pop('variants', 'function') - assert isinstance(variants_s, str) - variants: Set[Variant] = set() - for v in variants_s.split(', '): - if v == 'function': - variants.add(Variant.function) - elif v == 'method': - variants.add(Variant.method) - else: - raise AssertionError(f'illegal variant {v}') - - manual_kernel_registration = e.pop('manual_kernel_registration', False) - assert isinstance(manual_kernel_registration, bool), f'not a bool: {manual_kernel_registration}' - - manual_cpp_binding = e.pop('manual_cpp_binding', False) - assert isinstance(manual_cpp_binding, bool), f'not a bool: {manual_cpp_binding}' - - device_guard = e.pop('device_guard', True) - assert isinstance(device_guard, bool), f'not a bool: {device_guard}' - - device_check_s = e.pop('device_check', None) - assert device_check_s is None or isinstance(device_check_s, str), f'not a str: {device_check_s}' - device_check: DeviceCheckType - if device_check_s is None: - device_check = DeviceCheckType.ExactSame - else: - device_check = DeviceCheckType[device_check_s] - - structured = e.pop('structured', False) - assert isinstance(structured, bool), f'not a bool: {structured}' - - structured_delegate_s = e.pop('structured_delegate', None) - assert structured_delegate_s is None or isinstance(structured_delegate_s, str), f'not a str: {structured_delegate}' - structured_delegate: Optional[OperatorName] = None - if structured_delegate_s is not None: - structured_delegate = OperatorName.parse(structured_delegate_s) - - structured_inherits = e.pop('structured_inherits', None) - assert structured_inherits is None or isinstance(structured_inherits, str), f'not a str: {structured_inherits}' - - python_module = e.pop('python_module', None) - assert python_module is None or isinstance(python_module, str), f'not a str: {python_module}' - - category_override = e.pop('category_override', None) - assert category_override is None or isinstance(category_override, str), f'not a str: {category_override}' - - precomputed_dict = e.pop('precomputed', None) - assert precomputed_dict is None or structured is True - precomputed = Precompute.parse(precomputed_dict) if precomputed_dict else None - - tag_str = e.pop('tags', None) - assert tag_str is None or isinstance(tag_str, str), f'not a str: {tag_str}' - tag = Tag.parse(tag_str) if tag_str else None - - from tools.codegen.api import cpp - - raw_dispatch = e.pop('dispatch', None) - assert raw_dispatch is None or isinstance(raw_dispatch, dict), e - dispatch: Dict[DispatchKey, str] = {} - if raw_dispatch is not None: - assert not manual_kernel_registration, \ - "cannot specify both manual_kernel_registration and dispatch; with " \ - "manual registration, dispatch has no effect!" 
- for ks, v in raw_dispatch.items(): - if ks == '__line__': - continue # not worth tracking line numbers for dispatch entries - assert isinstance(ks, str), e - assert isinstance(v, str), e - for k in ks.split(","): - dispatch_key = DispatchKey.parse(k.strip()) - dispatch[dispatch_key] = v - assert dispatch != {DispatchKey.CompositeImplicitAutograd: cpp.name(func)}, \ - "unnecessary dispatch table for this function; just delete the dispatch " \ - "key entirely" - # if a function is a structured delegate, deleting the dispatch - # table is NOT semantics preserving - assert structured_delegate or dispatch.keys() != {DispatchKey.CompositeImplicitAutograd}, \ - f"unexpected name for singleton CompositeImplicitAutograd dispatch entry: expected {cpp.name(func)} " \ - f"but got {dispatch[DispatchKey.CompositeImplicitAutograd]}. Rename your implementation to the expected " \ - "name, then delete the dispatch table" - elif not structured and structured_delegate is None: - dispatch[DispatchKey.CompositeImplicitAutograd] = cpp.name(func) - - assert not (DispatchKey.CompositeExplicitAutograd in dispatch and DispatchKey.CompositeImplicitAutograd in dispatch), \ - "cannot specify both CompositeExplicitAutograd and CompositeImplicitAutograd on a single kernel; each " \ - "strictly subsumes the other. If you wanted to provide an explicit autograd " \ - "implementation, specify CompositeExplicitAutograd; otherwise specify CompositeImplicitAutograd only" - - if structured_delegate: - # Structured functions MUST have a dispatch table - is_abstract = True - else: - is_abstract = dispatch.keys() != {DispatchKey.CompositeImplicitAutograd} - - has_composite_implicit_autograd_kernel = DispatchKey.CompositeImplicitAutograd in dispatch.keys() - has_composite_explicit_autograd_kernel = DispatchKey.CompositeExplicitAutograd in dispatch.keys() - - # BackendMetadata is used to store any information about a NativeFunction that is backend dependent. - # The most obvious information is the kernel name, which usually contains the name of the backend in it for cpu/cuda. - # Why is 'structured' included? External backends (e.g. 
XLA) opt into which ops are structured - # independently of which in-tree ops are structured - backend_metadata = {k: {func.name: BackendMetadata( - kernel=v, structured=structured and is_structured_dispatch_key(k))} for k, v in dispatch.items()} - - # don't care if it exists or not; make it easier to use this function - # with other yaml parsers that aren't setting __line__ in the dict - e.pop('__line__', None) - assert not e, f"leftover entries: {e}" - - # Asserts that we can't do in post_init, because they rely on backend-specific info - if structured_delegate is not None: - for key in STRUCTURED_DISPATCH_KEYS: - assert key not in dispatch, \ - f"if structured_delegate, then must not have {key} in dispatch dictionary " \ - "(it is delegated!)" - - return NativeFunction( - func=func, - use_const_ref_for_mutable_tensors=use_const_ref_for_mutable_tensors, - variants=variants, - structured=structured, - structured_delegate=structured_delegate, - structured_inherits=structured_inherits, - precomputed=precomputed, - manual_kernel_registration=manual_kernel_registration, - manual_cpp_binding=manual_cpp_binding, - python_module=python_module, - category_override=category_override, - device_guard=device_guard, - device_check=device_check, - loc=loc, - cpp_no_default_args=cpp_no_default_args, - is_abstract=is_abstract, - has_composite_implicit_autograd_kernel=has_composite_implicit_autograd_kernel, - has_composite_explicit_autograd_kernel=has_composite_explicit_autograd_kernel, - tag=tag, - ), backend_metadata - - - def validate_unstructured(self) -> None: - # TODO: probably better to accumulate these errors and report them all - # at once - assert not self.structured, "This function is structured, but there was " \ - "no valid functional variant of it." - assert self.structured_delegate, "This function delegates to another structured out function, " \ - "but no valid function was found (the delegate may not exist, or it has the wrong type)" - - # __post_init__ functions in dataclasses can be used to do extra - # validation after construction. - # - # Notice that we don't do any type validation here. In fact, we - # rely exclusively on mypy to check if you've done types correctly! - # Validation is for nontrivial invariants that cannot be (conveniently) - # encoded in the type system. - def __post_init__(self) -> None: - if self.func.arguments.out: - assert self.variants == {Variant.function}, "Native functions with out arguments MUST " \ - "be declared with only function variant; e.g., variants: function; " \ - "otherwise you will tickle a Python argument binding bug " \ - "(which usually manifests itself as the result variable being undefined.)" - if self.structured: - assert self.func.kind() == SchemaKind.out, "Put structured field on the out= " \ - "variant of a function; did you mean structured_delegate?" - assert self.device_guard, "device_guard: False is not respected by structured kernels" - if self.structured_delegate: - assert self.func.kind() != SchemaKind.out, "structured_delegate field not allowed " \ - "on out= functions; did you mean structured?" 
- assert self.device_guard, "device_guard: False is not respected by structured kernels" - # Technically, with the asserts above, this assert is impossible to - # happen - assert not (self.structured and self.structured_delegate), \ - "Cannot have both structured and structured_delegate on function" - defaulted_arguments = {a.name for a in self.func.schema_order_arguments() - if a.default is not None} - invalid_args = set.difference(self.cpp_no_default_args, defaulted_arguments) - assert len(invalid_args) == 0, f'Invalid cpp_no_default_args: {invalid_args}' - if self.structured_inherits is not None: - assert self.structured, "structured_inherits must also imply structured: True" - if str(self.func.name).startswith('_foreach'): - assert self.device_check == DeviceCheckType.NoCheck, \ - "foreach kernels fall back to slow path when tensor are on different devices, " \ - "device_check not allowed to be enabled" - - @property - def has_composite_kernel(self) -> bool: - return self.has_composite_implicit_autograd_kernel or self.has_composite_explicit_autograd_kernel - - @property - def is_view_op(self) -> bool: - rets = self.func.returns - is_non_mutating_view = len(rets) > 0 and any(r.annotation is not None and not r.annotation.is_write for r in rets) - is_inplace_view = self.tag is not None and self.tag is Tag.inplace_view - is_wildcard_view = any(inp.annotation is not None and - inp.annotation.alias_set_after != "" for inp in self.func.schema_order_arguments()) - return is_non_mutating_view or is_inplace_view or is_wildcard_view - - @property - def root_name(self) -> str: - return self.func.name.name.base - -SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) - -# A structured kernel is guaranteed to have a functional and out variant, and -# optionally an inplace variant. -# -# NB: we create NativeFunctionsGroup *even if* the function is not -# actually annotated structured. Test the structured boolean to see if it -# actually is structured or not. -@dataclass(frozen=True) -class NativeFunctionsGroup: - functional: NativeFunction - inplace: Optional[NativeFunction] - out: NativeFunction - - @property - def structured(self) -> bool: - # Whether or not the operator has a meta() function. This information is backend-agnostic. 
- return self.out.structured - - def __post_init__(self) -> None: - test_sig: FunctionSchema = self.functional.func.signature() - for f in self.functions(): - if test_sig != f.func.signature(): - raise AssertionError( - "NativeFunctionsGroup constructed from two NativeFunctions " - f"that don't have matching signatures: {test_sig} != {f.func.signature()}" - ) - assert self.functional.func.kind() == SchemaKind.functional - assert self.out.func.kind() == SchemaKind.out - if self.inplace is not None: - assert self.inplace.func.kind() == SchemaKind.inplace - - if self.structured: - # For now, structured composite kernels are not supported (need some - # design work to figure out how to make the composite case work) - assert not self.out.has_composite_implicit_autograd_kernel - - assert self.functional.structured_delegate == self.out.func.name, \ - f"{self.functional.func.name} delegates to {self.functional.structured_delegate} " \ - f"but its actual delegate is {self.out.func.name}" - if self.inplace is not None: - assert self.inplace.structured_delegate == self.out.func.name - - def signature(self) -> 'FunctionSchema': - return self.out.func.signature() - - def functions(self) -> Iterator[NativeFunction]: - yield self.functional - yield self.out - if self.inplace is not None: - yield self.inplace - - @property - def root_name(self) -> str: - return self.functional.root_name - - @staticmethod - def from_dict(d: Dict[SchemaKind, NativeFunction]) -> Optional['NativeFunctionsGroup']: - assert d - if len(d) == 1: - return None - d = dict(d) # non-destructive updates please - functional = d.pop(SchemaKind.functional, None) - inplace = d.pop(SchemaKind.inplace, None) - out = d.pop(SchemaKind.out, None) - assert not d - assert functional is not None - # There are a few operators which only have functional/inplace variants; - # these don't count as structured for our purposes here - if out is None: - return None - - return NativeFunctionsGroup( - functional=functional, - inplace=inplace, - out=out, - ) - -def is_foreach_op(name: str) -> bool: - return str(name) in set([ - '_amp_foreach_non_finite_check_and_unscale_', - '_foreach_add_.ScalarList', - '_foreach_sub_.ScalarList', - '_foreach_mul_.ScalarList', - '_foreach_div_.ScalarList', - '_foreach_add_.Scalar', - '_foreach_sub_.Scalar', - '_foreach_mul_.Scalar', - '_foreach_div_.Scalar', - '_foreach_add_.List', - '_foreach_sub_.List', - '_foreach_mul_.List', - '_foreach_div_.List', - '_foreach_exp_', - '_foreach_sqrt_', - '_foreach_abs_', - '_foreach_acos_', - '_foreach_asin_', - '_foreach_atan_', - '_foreach_ceil_', - '_foreach_cos_', - '_foreach_cosh_', - '_foreach_erf_', - '_foreach_erfc_', - '_foreach_expm1_', - '_foreach_floor_', - '_foreach_log_', - '_foreach_log10_', - '_foreach_log1p_', - '_foreach_log2_', - '_foreach_neg_', - '_foreach_tan_', - '_foreach_tanh_', - '_foreach_sin_', - '_foreach_sinh_', - '_foreach_round_', - '_foreach_lgamma_', - '_foreach_frac_', - '_foreach_reciprocal_', - '_foreach_sigmoid_', - '_foreach_trunc_', - '_foreach_addcmul_.Scalar', - '_foreach_addcdiv_.Scalar', - '_foreach_addcmul_.ScalarList', - '_foreach_addcdiv_.ScalarList', - '_foreach_zero_']) - -@dataclass(frozen=True) -class BackendMetadata: - # The name of the backend kernel, for a given operator - # for in-tree backends. These names come directly from the 'dispatch" field - # in native_functions.yaml. 
The dispatch entry is optional; in that - # case, that is equivalent to having written: - # - # dispatch: - # CompositeImplicitAutograd: $operator_name - kernel: str - # Whether or not the operator has a structured kernel implemented, for this particular backend. - # For in-tree backends, they all have the same value for structured- this is listed - # in native_functions.yaml. - # However, external backends like XLA can indendently toggle which ops are structured. - structured: bool - # - - -# BackendIndex represents a backend. -# The BackendIndex encodes per-operator information that is potentially different -# for each backend. The most obvious example is the name of the kernel -# (the 'dispatch' entry in native_functions.yaml). -# However, there can be other examples of different backends having different information. -# External backends can choose to opt their kernels to be structured independently from in-tree backends, -# which means that this information isn't inherentely tied to a NativeFunction- it's different per backend. -@dataclass(frozen=True) -class BackendIndex: - dispatch_key: DispatchKey - # Mainly important for structured kernels, this determines which variant in the operator group is used to implement the others. - # All in-tree ops use out kernels, while XLA uses functional kernels. - use_out_as_primary: bool - # Whether the backend requires a device guard, and device checks. - # For in-tree backends, this is currently just CUDA/HIP - # For out-of-tree backends, this is currently just Intel XPU - device_guard: bool - # Whether the backend is in-tree (CPU/CUDA) or out-of-tree (XLA) - external: bool - # Other backend-specific information that is on a per-operator basis - index: Dict['OperatorName', BackendMetadata] - - @staticmethod - def grow_index( - parent_index: Dict[DispatchKey, Dict['OperatorName', BackendMetadata]], - child_index: Dict[DispatchKey, Dict['OperatorName', BackendMetadata]] - ) -> None: - for k, v in child_index.items(): - for op_name, metadata in v.items(): - assert op_name not in parent_index[k], f'duplicate operator {op_name} for dispatch key {k}' - parent_index[k][op_name] = metadata - - def primary(self, g: NativeFunctionsGroup) -> NativeFunction: - if self.use_out_as_primary: - return g.out - else: - return g.functional - - def has_kernel(self, g: Union[NativeFunction, NativeFunctionsGroup]) -> bool: - m = self.get_kernel(g) - return m is not None - - - def get_kernel(self, g: Union[NativeFunction, NativeFunctionsGroup]) -> Optional[BackendMetadata]: - if isinstance(g, NativeFunction): - f = g - elif isinstance(g, NativeFunctionsGroup): - f = self.primary(g) - else: - assert_never(f) - if f.func.name not in self.index: - return None - return self.index[f.func.name] - - def native_function_class_name(self) -> Optional[str]: - if self.external: - return f'{str(self.dispatch_key)}NativeFunctions' - else: - # TODO: This discrepancy isn't required; we could also generated - # a class for in-tree kernels. It'll just require carefully - # updating every kernel definition + callsite of every in-tree aten kernel. - return None - - -# The function schema is undoubtedly the most important data structure -# in all of the codegen, as it defines the type signature for operators, -# and most of the code generation we do is type directed (e.g., look at -# the types, decide what to do. Think about how we code generate -# C++ function stubs!) -# -# We will also see in this class the general structure for how we model -# data in this code generation. 
A few notable properties to point out -# ahead of time: -# -# - These dataclasses are a *lossless* representation of the strings -# they are parsed from. In fact, we assert that given the -# information stored in the dataclass, we can exactly reconstruct -# the string we parsed from (and assert this inside the parse -# definition). There are a few reasons for this: -# -# - If you find that it is difficult to reconstruct the string -# given a dataclass, that is a clue that you are data -# representation is wrong. -# -# - It helps ensure that all relevant information is present -# in the dataclass, so that downstream users aren't tempted -# to reparse the original string to get some information -# that was omitted. -# -# - It forces you to represent the data in-memory in the same way -# it is recorded textually, which makes the dataclasses easier -# to understand for someone who is familiar with the -# textual format. (As a tradeoff, it means you have to model -# the syntax, even when it is inconvenient. But maybe that means -# the syntax is bad!) If you don't understand the internal -# representation, go look at the printing code to see how -# it maps onto the surface syntax! -# -# - It makes it easy to test the parsing code, as parsing code -# that is inconsistent with the string code will fail early -# and loudly. (As a tradeoff, it makes the parsing code a bit -# brittle (in particular, with trivial whitespace changes you -# are likely to trigger an assert error). -# -# In general, try to make the __str__ code as simple as possible -# (even at the cost of more complex parsing logic.) Additionally, -# try to minimize redundancy in data representation. (Precomputed -# fields are OK though: they are defined as a simple function on -# the canonical representation in question.) -# -# - These dataclasses are all frozen; once constructed their -# values never change. This makes it easy to tell where any -# given data came from: just look to the constructor. As a -# tradeoff, you can't easily "decorate" a schema with extra -# information from a post-facto analysis. We impose this -# restriction to make these structures more understandable. -# -@dataclass(frozen=True) -class FunctionSchema: - # The name of the operator this function schema describes. - name: 'OperatorName' - - arguments: 'Arguments' - - # TODO: Need to handle collisions with argument names at some point - returns: Tuple['Return', ...] 
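To make the "lossless representation" point above concrete, here is a minimal, self-contained toy (not the real FunctionSchema, which models names, arguments, and returns as structured dataclasses) that parses a schema string and asserts the same round-trip invariant that FunctionSchema.parse enforces below.

from dataclasses import dataclass
from typing import Tuple

@dataclass(frozen=True)
class MiniSchema:
    name: str
    args: Tuple[str, ...]
    ret: str

    @staticmethod
    def parse(s: str) -> 'MiniSchema':
        decl, ret = s.rsplit(' -> ', 1)
        name, args = decl.split('(', 1)
        assert args.endswith(')'), 'expecting closing )'
        r = MiniSchema(name, tuple(a for a in args[:-1].split(', ') if a), ret)
        assert str(r) == s, f'{r} != {s}'   # the round-trip invariant
        return r

    def __str__(self) -> str:
        return f"{self.name}({', '.join(self.args)}) -> {self.ret}"

MiniSchema.parse("add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor")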
- - def schema_order_arguments(self) -> Iterator['Argument']: - return itertools.chain( - self.arguments.flat_positional, - self.arguments.flat_kwarg_only, - self.arguments.out - ) - - @staticmethod - def parse(func: str) -> 'FunctionSchema': - # We should probably get a proper parser here - assert ' -> ' in func, "function schema missing return type (spaces are mandatory)" - last_index = func.rfind(" -> ") - func_decl = func[:last_index] - return_decl = func[last_index + len(" -> "):] - ops, args = func_decl.split('(', 1) - assert args[-1] == ")", "Expecting closing )" - args = args[:-1] - name = OperatorName.parse(ops) - arguments = Arguments.parse(args) - returns = parse_returns(return_decl) - r = FunctionSchema( - name=name, - arguments=arguments, - returns=returns - ) - assert str(r) == func, f'{str(r)} != {func}' - return r - - def __post_init__(self) -> None: - for arg, ret in zip(self.arguments.out, self.returns): - assert arg.annotation == ret.annotation, \ - "Out arguments must have matching return Tensor; furthermore, " \ - "the ith-argument needs to correspond to the ith return" - # Invariant: we expect out arguments to appear as keyword arguments in the schema. - # This means that all mutable returns should be aliased to a keyword argument - # (except for "self", which we explicitly don't treat as an out argument because of its use in methods) - # See Note [is_out_fn] - out_and_self = list(self.arguments.out) + [arg for arg in self.arguments.flat_positional if arg.name == "self"] - mutable_returns = [ret for ret in self.returns if ret.annotation is not None and ret.annotation.is_write] - for ret in mutable_returns: - assert any([ret.annotation == arg.annotation for arg in out_and_self]), \ - "All mutable returns must be aliased either to a keyword argument, or to \"self\". " \ - "Did you forget to mark an out argument as keyword-only?" - if self.arguments.out: - assert len(self.arguments.out) == len(self.returns), \ - "Must return as many arguments as there are out arguments" - if self.name.name.inplace: - # TODO: fixme - if not is_foreach_op(str(self.name)): - assert len(self.returns) == 1 - - def is_out_fn(self) -> bool: - # Note [is_out_fn] - # - # out functions are the variants which take an explicit out= argument - # to populate into. We need to know if a schema corresponds to an - # out function for several reasons: - # - # - They codegen differently in C++ API - # - codegen to at::add_out rather than at::add - # - out argument is moved to front of C++ argument list - # - # out functions are DEFINED to be any function with a keyword-only - # argument that is mutable. In principle, this could lead to a - # false positive if you define a function that mutates a - # kwarg only argument, but this isn't the "true" output of this - # function. A more robust definition that would work in this - # case would also look at: - # - # - The output types. Out functions take in the arguments - # they mutate and then return them again; this is sort - # of "definitionally" what makes something an out function. - # Historically, we DO check this for consistency. - # - Correspondence with pure variant. An out function - # should have a signature equivalent to its pure variant, - # but just with extra kwargs for the output elements. This - # is difficult to actually check for and historically - # we only do this check in tools/ - return bool(self.arguments.out) - - def kind(self) -> SchemaKind: - """ - What kind of schema is this? 
A functional schema is one - that returns a newly allocated output; an inplace schema - modifies the self argument inplace; an out schema writes - the result into an explicitly provided out argument. - """ - is_inplace = self.name.name.inplace - is_out = bool(self.arguments.out) - assert not (is_inplace and is_out) - if is_inplace: - return SchemaKind.inplace - elif is_out: - return SchemaKind.out - else: - return SchemaKind.functional - - def signature(self, *, strip_default: bool = False) -> 'FunctionSchema': - """ - Certain schemas are 'related', in that they are simply - inplace/out/functional versions of the same function. This method - factors these schemas into the "core" functional signature which - is equal across all versions. - - Here is what normalization happens to the schema to convert - it to a signature: - - The overload name is stripped (name is retained, since - it expresses semantic content about what the function does) - - Inplace is set False - - Out arguments are stripped - - Mutability annotations are stripped (this is sound - because you cannot overload on mutability annotation) - - Return names are stripped since they are not overloadable and - some variants have return names but some not - """ - - def strip_ret_annotation(r: Return) -> Return: - return Return( - name=None, - type=r.type, - annotation=None, - ) - - return FunctionSchema( - name=OperatorName( - name=BaseOperatorName( - base=self.name.name.base, - inplace=False, - dunder_method=self.name.name.dunder_method, - ), - overload_name="", # stripped - ), - arguments=self.arguments.signature(strip_default=strip_default), - returns=tuple(map(strip_ret_annotation, self.returns)), - ) - - def __str__(self) -> str: - all_arguments_str = str(self.arguments) - if len(self.returns) == 1: - returns = str(self.returns[0]) # omit parentheses - else: - returns = '(' + ', '.join(map(str, self.returns)) + ')' - return f'{self.name}({all_arguments_str}) -> {returns}' - -# Here is the rest of the data model, described more briefly. - -# Simplified version for what actually shows up in built-ins. -# Look at alias_info.h for expanded syntax. If you need the structure, -# you also need to make this structure recursive so it can be lined -# up with the type components too. For primitives this isn't really -# necessary -@dataclass(frozen=True) -class Annotation: - # Typically only has one element. Not actually a set so - # we can conveniently assume it is canonically ordered - alias_set: Tuple[str, ...] - is_write: bool - alias_set_after: str - - @staticmethod - def parse(ann: str) -> 'Annotation': - # Only handling afterSet == Wildcard for now - becomes_wildcard_index = ann.find(" -> *") - if becomes_wildcard_index != -1: - after_set = "*" - # TODO: im not good enough with regexes to ignore -> * - m = re.match(r'^([a-z])(!?)(!?)$', ann[:becomes_wildcard_index] + ann[becomes_wildcard_index + len(" -> *"):]) - else: - after_set = "" - m = re.match(r'^([a-z])(!?)(!?)$', ann) - assert m is not None, f'unrecognized alias annotation {ann}' - alias_set = (m.group(1),) - is_write = m.group(2) == '!' - r = Annotation(alias_set=alias_set, is_write=is_write, alias_set_after=after_set) - assert str(r) == ann, f'{r} != {ann}' - return r - - def __str__(self) -> str: - alias_set = '|'.join(self.alias_set) - if self.alias_set_after: - alias_set = f'{alias_set}{" -> "}{self.alias_set_after}' - is_write = '!' if self.is_write else '' - return f'{alias_set}{is_write}' - -# The base class for the type system. 
This is also loosely modeled -# off of jit_type.h, but we've simplified the hierarchy to focus -# in on the aspects of the type system that matter for code generation -# (for example, there's no SingleElementType subclass anymore). -# You never actually construct a Type; usually it's going to be one -# of the subclasses. If Python had ADTs this would be one! -@dataclass(frozen=True) -class Type: - @staticmethod - def parse(t: str) -> 'Type': - r = Type._parse(t) - assert str(r) == t, f'{r} != {t}' - return r - - @staticmethod - def _parse(t: str) -> 'Type': - m = re.match(r'^(.+)\?$', t) - if m is not None: - return OptionalType(Type.parse(m.group(1))) - m = re.match(r'^(.+)\[([0-9]+)?\]$', t) - if m is not None: - size = int(m.group(2)) if m.group(2) is not None else None - return ListType(elem=Type.parse(m.group(1)), size=size) - try: - return BaseType(BaseTy[t]) - except KeyError: - raise RuntimeError(f"unrecognized type {t}") - - def __str__(self) -> str: - raise NotImplementedError - - # WARNING: These concepts are not very well-defined. For example, - # is "int?" nullable? How about "int?[]". They are defined - # so we can conveniently generate legacy Declarations.yaml but - # really we should probably just remove these at some point - - def is_tensor_like(self) -> bool: - raise NotImplementedError - - def is_nullable(self) -> bool: - raise NotImplementedError - - def is_list_like(self) -> Optional['ListType']: - raise NotImplementedError - -# Base types are simple, atomic types with no further structure -BaseTy = Enum('BaseTy', ( - 'Generator', - 'ScalarType', - 'Tensor', - 'int', - 'Dimname', - 'float', - 'str', - 'bool', - 'Layout', - 'Device', - 'Scalar', - 'MemoryFormat', - 'QScheme', - 'Storage', - 'Stream', - 'ConstQuantizerPtr', # TODO: rename -)) - -@dataclass(frozen=True) -class BaseType(Type): - name: BaseTy - - def __str__(self) -> str: - return f'{self.name.name}' - - def is_tensor_like(self) -> bool: - return self.name == BaseTy.Tensor - - def is_nullable(self) -> bool: - return False - - def is_list_like(self) -> Optional['ListType']: - return None - -# Optional types may be specified, or may also be validly given None -@dataclass(frozen=True) -class OptionalType(Type): - elem: Type - - def __str__(self) -> str: - return f'{self.elem}?' - - def is_tensor_like(self) -> bool: - return self.elem.is_tensor_like() - - def is_nullable(self) -> bool: - return True - - def is_list_like(self) -> Optional['ListType']: - return self.elem.is_list_like() - -# List types specify that we may have multiples of an element. We -# also support explicit sizes on list types, but these have -# some nontrivial semantics! (However, for C++ API purposes, explicit -# sizes are mostly erased from the type system.) -# -# DANGER WILL ROBINSON: C++ elaboration depends on elem type; e.g., -# int[] elaborates differently than bool[3]! -@dataclass(frozen=True) -class ListType(Type): - elem: Type - size: Optional[int] - - def __str__(self) -> str: - size = f'{self.size}' if self.size else '' - return f'{self.elem}[{size}]' - - def is_tensor_like(self) -> bool: - return self.elem.is_tensor_like() - - def is_nullable(self) -> bool: - return self.elem.is_nullable() - - def is_list_like(self) -> Optional['ListType']: - return self - -@dataclass(frozen=True) -class Argument: - # NB: I didn't put kwarg_only as a boolean field here, unlike - # c10::Argument, so that printing works correctly - - name: str - type: Type - default: Optional[str] - - # The semantics of the annotation field are a little strange. 
- # - # Alias annotations parametrize Tensors (since Tensors are the only things - # that can alias.) This motivates why I write Tensor(a!)? (and not, for - # example, Tensor?(a!)), because the (a!) describes aliasing on the tensor, - # which may be optional (i.e., the alias annotation should bind first to - # Tensor, before the optional postfix annotation). - # - # However, despite being a property of Tensor, we (and c10::Argument) - # store the annotation at the top level of the Argument, rather than - # inside the embedded Tensor type. In the C++ version of this - # class, we then go through great lengths to mimic the type - # structure in the annotation structure so we can correlate - # annotations with types. - # - # Now, it turns out, in all applications in code generation, the - # structure of annotated types is very simple. So we just hard - # code it here. But if we ever do get anything more complex, this - # model will have to change! - annotation: Optional[Annotation] - - @staticmethod - def parse(arg: str) -> 'Argument': - name: str - default: Optional[str] - type_and_annot, name_and_default = arg.rsplit(' ', 1) - if '=' in name_and_default: - name, default = name_and_default.split('=') - else: - name = name_and_default - default = None - # TODO: deduplicate annotation matching with Return - match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) - annotation: Optional[Annotation] - if match: - # If you update this, make sure the __str__ still works too - assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' - type_s = 'Tensor' + match.group(2) - annotation = Annotation.parse(match.group(1)) - else: - type_s = type_and_annot - annotation = None - type = Type.parse(type_s) - r = Argument( - name=name, - type=type, - default=default, - annotation=annotation, - ) - assert str(r) == arg, f'{str(r)} != {arg}' - return r - - @property - def is_write(self) -> bool: - return self.annotation is not None and self.annotation.is_write - - def __str__(self) -> str: - type = f'{self.type}' - if self.annotation: - assert type in ['Tensor', 'Tensor?', 'Tensor[]'] - type = type.replace('Tensor', f'Tensor({self.annotation})') - if self.name is None: - return type - else: - mb_default = '' - if self.default: - mb_default = f'={self.default}' - return f"{type} {self.name}{mb_default}" - - -@dataclass(frozen=True) -class Return: - name: Optional[str] - type: Type - annotation: Optional[Annotation] - - @staticmethod - def parse(arg: str) -> 'Return': - name: Optional[str] - if ' ' in arg: - type_and_annot, name = arg.rsplit(' ', 1) - else: - type_and_annot = arg - name = None - match = re.match(r'Tensor\((.+)\)(.*)', type_and_annot) - annotation: Optional[Annotation] - if match: - # If you update this, make sure the __str__ still works too - assert match.group(2) in ['', '?', '[]'], 'unrecognized alias analysis form with Tensor' - type_s = 'Tensor' + match.group(2) - annotation = Annotation.parse(match.group(1)) - else: - type_s = type_and_annot - annotation = None - type = Type.parse(type_s) - r = Return( - name=name, - type=type, - annotation=annotation, - ) - assert str(r) == arg, f'{str(r)} != {arg}' - return r - - @property - def is_write(self) -> bool: - return self.annotation is not None and self.annotation.is_write - - def __str__(self) -> str: - type = f'{self.type}' - if self.annotation: - assert type in ['Tensor', 'Tensor?', 'Tensor[]'] - type = type.replace('Tensor', f'Tensor({self.annotation})') - if self.name is None: - return type - else: - return 
f"{type} {self.name}" - - -# Represents the self argument for functions that may be methods -@dataclass(frozen=True) -class SelfArgument: - argument: Argument - -# Bundle of arguments that represent a TensorOptions. This is mostly -# relevant for the public C++ API but we bake it into the core data -# model because other APIs often have to interact with it -@dataclass(frozen=True) -class TensorOptionsArguments: - dtype: Argument - layout: Argument - device: Argument - pin_memory: Argument - - def all(self) -> Sequence[Argument]: - return [self.dtype, self.layout, self.device, self.pin_memory] - -@dataclass(frozen=True) -class Arguments: - # pre_self_positional is usually empty, but is notably non-empty - # for where.self, where the condition argument comes before the - # self argument - pre_self_positional: Tuple[Argument, ...] - self_arg: Optional[SelfArgument] - post_self_positional: Tuple[Argument, ...] - - pre_tensor_options_kwarg_only: Tuple[Argument, ...] - tensor_options: Optional[TensorOptionsArguments] - # post_tensor_options is typically memory format, which should be - # part of tensor options but isn't right now, and is usually - # placed after the tensor options arguments - post_tensor_options_kwarg_only: Tuple[Argument, ...] - - # Unlike in the previous codegen, we have factored out 'out' arguments - # in the canonical representation, removing them from kwarg - # arguments. This choice is justified by numerous downstream - # transformations which treat out arguments specially; additionally, - # you can see that canonicity is not violated! - out: Tuple[Argument, ...] # these are also kwarg-only - - @property - def flat_non_out(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.flat_positional) - ret.extend(self.flat_kwarg_only) - return ret - - @property - def flat_positional(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.pre_self_positional) - if self.self_arg is not None: - ret.append(self.self_arg.argument) - ret.extend(self.post_self_positional) - return ret - - # NB: doesn't contain out arguments - @property - def flat_kwarg_only(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.pre_tensor_options_kwarg_only) - if self.tensor_options is not None: - ret.extend(self.tensor_options.all()) - ret.extend(self.post_tensor_options_kwarg_only) - return ret - - @property - def flat_all(self) -> Sequence[Argument]: - ret: List[Argument] = [] - ret.extend(self.flat_positional) - ret.extend(self.flat_kwarg_only) - ret.extend(self.out) - return ret - - @property - def non_out(self) -> Sequence[Union[Argument, SelfArgument, TensorOptionsArguments]]: - ret: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] - ret.extend(self.positional) - ret.extend(self.kwarg_only) - return ret - - @property - def positional(self) -> Sequence[Union[Argument, SelfArgument]]: - ret: List[Union[Argument, SelfArgument]] = [] - ret.extend(self.pre_self_positional) - if self.self_arg is not None: - ret.append(self.self_arg) - ret.extend(self.post_self_positional) - return ret - - @property - def kwarg_only(self) -> Sequence[Union[Argument, TensorOptionsArguments]]: - ret: List[Union[Argument, TensorOptionsArguments]] = [] - ret.extend(self.pre_tensor_options_kwarg_only) - if self.tensor_options is not None: - ret.append(self.tensor_options) - ret.extend(self.post_tensor_options_kwarg_only) - return ret - - @property - def all(self) -> Sequence[Union[Argument, SelfArgument, TensorOptionsArguments]]: - ret: 
List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] - ret.extend(self.positional) - ret.extend(self.kwarg_only) - ret.extend(self.out) - return ret - - def signature(self, *, strip_default: bool = False) -> 'Arguments': - # dataclasses.replace could be used here, but it is less - # type safe so for now I've opted to type everything out - def strip_arg_annotation(a: Argument) -> Argument: - return Argument( - name=a.name, - type=a.type, - default=a.default if not strip_default else None, - annotation=None, - ) - - return Arguments( - pre_self_positional=tuple(map(strip_arg_annotation, self.pre_self_positional)), - self_arg=SelfArgument( - strip_arg_annotation(self.self_arg.argument) - ) if self.self_arg is not None else None, - post_self_positional=tuple(map(strip_arg_annotation, self.post_self_positional)), - pre_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.pre_tensor_options_kwarg_only)), - # NB: tensor_options guaranteed to not have any alias annotations - tensor_options=self.tensor_options, - post_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.post_tensor_options_kwarg_only)), - # out arguments are dropped in signature - out=(), - ) - - - @staticmethod - def _preparse(args: str) -> Tuple[List[Argument], List[Argument], List[Argument]]: - positional: List[Argument] = [] - kwarg_only: List[Argument] = [] - out: List[Argument] = [] - arguments_acc = positional - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - for arg in args.split(', '): - if not arg: - continue - if arg == '*': - assert arguments_acc is positional, "invalid syntax: kwarg-only specifier * can only occur once" - arguments_acc = kwarg_only - continue - parg = Argument.parse(arg) - # Currently, we rely directly on the invariant that there are NO - # kwarg-only mutating arguments. If you want to relax this, - # we will need a more semantic way of matching that takes - # into account return arguments. In that case, you will have - # to manage out computation a level up, in FunctionSchema. See Note - # [is_out_fn] - if parg.annotation is not None and parg.annotation.is_write: - if arguments_acc is positional: - pass # do nothing - elif arguments_acc is kwarg_only: - arguments_acc = out - else: - assert arguments_acc is not out - arguments_acc.append(parg) - - return positional, kwarg_only, out - - @staticmethod - def parse(args: str) -> 'Arguments': - """ - Input: 'int x, int y, int z' - """ - - # We do this in two phases. First we parse into three - # main categories: positional, kwarg_only, out. - # Then, we reparse positional and kwarg_only to separate - # out the self argument and tensor options arguments. 
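As a concrete illustration of the two-phase split described in the comment above (an editor's sketch, not part of the diff; it assumes these classes remain importable as torchgen.model.Arguments, the import path the new tools/jit/gen_unboxing.py later in this diff relies on), the canonical add.out argument list parses as follows:

    from torchgen.model import Arguments

    args = Arguments.parse("Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out")
    # Phase 1 buckets: positional / kwarg-only / out; phase 2 then splits out `self`
    # (no dtype/layout/device/pin_memory run here, so no TensorOptions group is formed).
    assert [a.name for a in args.flat_positional] == ["self", "other"]
    assert [a.name for a in args.flat_kwarg_only] == ["alpha"]
    assert [a.name for a in args.out] == ["out"]
    # The (a!) alias annotation is what routes `out` into the out bucket.
    assert args.out[0].is_write
    assert args.self_arg is not None and args.self_arg.argument.name == "self"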
- - positional, kwarg_only, out = Arguments._preparse(args) - - # Split self argument - self_ix = None - for i, a in enumerate(positional): - if a.name == "self": - self_ix = i - break - pre_self_positional: List[Argument] - self_arg: Optional[SelfArgument] - post_self_positional: List[Argument] - if self_ix is not None: - pre_self_positional = positional[:self_ix] - self_arg = SelfArgument(positional[self_ix]) - post_self_positional = positional[self_ix + 1:] - else: - pre_self_positional = [] - self_arg = None - post_self_positional = positional - - # Group tensor options arguments - pre_tensor_options_kwarg_only: List[Argument] = [] - tensor_options: Optional[TensorOptionsArguments] = None - post_tensor_options_kwarg_only: List[Argument] = [] - kwarg_only_acc = pre_tensor_options_kwarg_only - - def pred(name: str, ty: Type) -> Callable[[Argument], bool]: - return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] - predicates = [ # order matters - pred('dtype', Type.parse('ScalarType')), - pred('layout', Type.parse('Layout')), - pred('device', Type.parse('Device')), - pred('pin_memory', Type.parse('bool')), - ] - - i = 0 - while i < len(kwarg_only): - # If there is enough space... - if i <= len(kwarg_only) - len(predicates): - # And the next len(predicates) arguments look like TensorOptions arguments - if all(p(a) for p, a in zip(predicates, kwarg_only[i : i + len(predicates)])): - assert kwarg_only_acc is pre_tensor_options_kwarg_only - # Group them together as one argument - tensor_options = TensorOptionsArguments( - dtype=kwarg_only[i], - layout=kwarg_only[i + 1], - device=kwarg_only[i + 2], - pin_memory=kwarg_only[i + 3], - ) - i += len(predicates) - kwarg_only_acc = post_tensor_options_kwarg_only - continue - kwarg_only_acc.append(kwarg_only[i]) - i += 1 - - return Arguments( - pre_self_positional=tuple(pre_self_positional), - self_arg=self_arg, - post_self_positional=tuple(post_self_positional), - pre_tensor_options_kwarg_only=tuple(pre_tensor_options_kwarg_only), - tensor_options=tensor_options, - post_tensor_options_kwarg_only=tuple(post_tensor_options_kwarg_only), - out=tuple(out), - ) - - - def __str__(self) -> str: - all_arguments: List[str] = [] - all_arguments.extend(map(str, self.flat_positional)) - if self.flat_kwarg_only or self.out: - all_arguments.append('*') - all_arguments.extend(map(str, self.flat_kwarg_only)) - all_arguments.extend(map(str, self.out)) - return ', '.join(all_arguments) - - def __post_init__(self) -> None: - # TODO: These invariants are weirdly asymmetric? - # TODO: Fancier types? - if self.self_arg is None: - assert not self.pre_self_positional - if self.tensor_options is None: - assert not self.post_tensor_options_kwarg_only - - -# Names that validly are __iXXX__ indicating inplace operations. -# Taken from https://www.python.org/dev/peps/pep-0203/#new-methods -# NB: PyTorch hasn't actually implemented all of these -AUGMENTED_ASSIGNMENT_NAMES = ['add', 'sub', 'mul', 'div', 'mod', 'pow', 'lshift', 'rshift', 'and', 'xor', 'or'] - -# A BaseOperatorName is what we think of the operator name, without -# the overload name. 
Unusually, we don't represent this as just a -# string; instead, we directly represent a few important semantic -# bits of information we derive from the string: namely whether -# or not it's inplace (add_) and whether or not it's a double-underscore -# method (__add__) -@dataclass(frozen=True) -class BaseOperatorName: - base: str - inplace: bool - dunder_method: bool - - @staticmethod - def parse(op: str) -> 'BaseOperatorName': - assert op != '' - assert not op.endswith('_out'), \ - "_out suffix is reserved and not permitted for operator names; " \ - "did you mean to specify an out overload name instead?" - m = re.match(r'^__([^_]+)__$', op) - if m is not None: - dunder_method = True - base = m.group(1) - if any(base == f'i{n}' for n in AUGMENTED_ASSIGNMENT_NAMES): - inplace = True - base = base[1:] - else: - inplace = False - # temporary, this is not intrinsically true but - # has been historically true for dunder methods - # we support (but, if we ever got, say, __int__, this would - # be wrong!) - assert base[0] != 'i' - else: - dunder_method = False - base = op - if base[-1] == '_': - inplace = True - base = base[:-1] - else: - inplace = False - r = BaseOperatorName(base=base, inplace=inplace, dunder_method=dunder_method) - assert str(r) == op, f'{str(r)} != {op}' - return r - - def __str__(self) -> str: - if self.dunder_method: - i = 'i' if self.inplace else '' - return f'__{i}{self.base}__' - else: - i = '_' if self.inplace else '' - return f'{self.base}{i}' - -# Operator name is the base operator name along with the (typically not -# user visible) overload string. -@dataclass(frozen=True) -class OperatorName: - name: BaseOperatorName - overload_name: str - - @staticmethod - def parse(op_name: str) -> 'OperatorName': - if '.' in op_name: - name, overload_name = op_name.split('.', 1) - else: - name = op_name - overload_name = '' - r = OperatorName( - name=BaseOperatorName.parse(name), - overload_name=overload_name - ) - assert str(r) == op_name, f'{str(r)} != {op_name}' - return r - - def __str__(self) -> str: - if self.overload_name: - return f"{self.name}.{self.overload_name}" - else: - return f"{self.name}" - - # NB: This must be synchronized with the naming scheme in - # aten/src/ATen/templates/Operators.h - # Given a function schema "aten::op.overload(...)", - # If there is no overload name, this returns f"{op}" - # If there is an overload name, this returns f"{op}_{overload}" - def unambiguous_name(self) -> str: - if self.overload_name: - return f"{self.name}_{self.overload_name}" - else: - return f"{self.name}" - - def remove_inplace(self) -> 'OperatorName': - return OperatorName( - name=BaseOperatorName(base=self.name.base, inplace=False, dunder_method=self.name.dunder_method), - overload_name=self.overload_name - ) - - -def gets_generated_out_inplace_wrapper(f: NativeFunction, g: NativeFunctionsGroup, b: BackendIndex) -> bool: - return f.func.kind() is not SchemaKind.functional and \ - not b.has_kernel(f) and \ - b.has_kernel(g.functional) - -# Helper functions for parsing argument lists (both inputs and returns) - -def parse_returns(return_decl: str) -> Tuple[Return, ...]: - """ - Input: '()' - Output: [] - """ - if return_decl == '()': - return () - if return_decl[0] == '(' and return_decl[-1] == ')': - return_decl = return_decl[1:-1] - return tuple(Return.parse(arg) for arg in return_decl.split(', ')) - - -# A Precompute instance consists of a map from kernel argument name -# to the list of Argument instances that should replace that -# kernel argument in the impl 
function. -@dataclass(frozen=True) -class Precompute: - # A map from kernel argument name -> a list of precomputed - # elements that replaces/supersedes it. - replace: Dict[str, List[Argument]] - # List of precomputed args added without replacement - add: List[Argument] - - @staticmethod - def parse(src: object) -> 'Precompute': - assert isinstance(src, list) - - # src is a list of strings of the format: - # {kernel param name} -> {replacement decl}[, {replacement decl}, ...] - # [{add decl}[, {add decl}, ...]] - # The last line is optional and contains the precomputed parameters that are - # added without replacement. - # The other lines are parsed to get the names of which precomputed elements - # should replace which kernel arguments. - add_args = [] - if ' -> ' not in src[-1]: - add_list = src[-1].split(',') - add_args = [Argument.parse(name.strip()) for name in add_list] - src = src[:-1] - - replace = {} - for raw_replace_item in src: - assert isinstance(raw_replace_item, str) - assert ' -> ' in raw_replace_item, 'precomputed parameters without replacement' \ - ' are allowed only in the last line' - - arg, with_list_raw = raw_replace_item.split(' -> ') - with_list = with_list_raw.split(',') - with_list_args = [Argument.parse(name.strip()) for name in with_list] - replace[arg] = with_list_args - - r = Precompute(replace=replace, add=add_args) - assert r.to_list() == src, 'r.to_list() != src' - return r - - def to_list(self) -> List[str]: - replace_list = [] - for kernel_param, replacement_params in self.replace.items(): - replacements = ', '.join(str(param) for param in replacement_params) - replace_list.append(f'{kernel_param} -> {replacements}') - - return replace_list diff --git a/tools/codegen/operator_versions/gen_mobile_upgraders_constant.py b/tools/codegen/operator_versions/gen_mobile_upgraders_constant.py deleted file mode 100644 index 2adf6e793eeb..000000000000 --- a/tools/codegen/operator_versions/gen_mobile_upgraders_constant.py +++ /dev/null @@ -1,7 +0,0 @@ -MOBILE_UPGRADERS_HEADER_DESCRIPTION = """/** - * @generated - * This is an auto-generated file. Please do not modify it by hand. - * To re-generate, please run: - * cd ~/pytorch && python torch/csrc/jit/mobile/upgrader_mobile.cpp - */ -""" diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py deleted file mode 100644 index b92a57958675..000000000000 --- a/tools/codegen/selective_build/selector.py +++ /dev/null @@ -1,270 +0,0 @@ -from typing import Dict, Set, Optional, Tuple, List -import yaml - -from dataclasses import dataclass - -from tools.codegen.model import NativeFunction -from tools.codegen.selective_build.operator import ( - SelectiveBuildOperator, merge_debug_info, merge_operator_dicts, - strip_operator_overload_name) - -# A SelectiveBuilder holds information extracted from the selective build -# YAML specification. -# -# It includes information about the build's selectivity, the debug_info -# associated with this selective build (opaque string), and the set of -# operators that should be included in the build. -# -@dataclass(frozen=True) -class SelectiveBuilder: - - # If true, then the build is not selective, and includes all - # operators. - include_all_operators: bool - - # Debug Information at the selective/custom build level. - _debug_info: Optional[Tuple[str, ...]] - - # A dictionary of operator -> operator metadata. - operators: Dict[str, SelectiveBuildOperator] - - # A dictionary of selected kernel tags and dtypes. 
Typically a - # PyTorch Operator Kernel (function) may have many code paths - # that are specialized for many many Tensor dtypes, so it's not - # one per kernel function, but there could be many per kernel - # function. The tag isn't a kernel function name, but some fragment - # of the kernel function implementation itself. - kernel_metadata: Dict[str, List[str]] - - # A set of all the custom torch bind classes used by the selected models - # Stored as a set internally to remove duplicates proactively, but written - # as a list to yamls - custom_classes: Set[str] - - # A set of all the build features used by the selected models - # Stored as a set internally to remove duplicates proactively, but written - # as a list to yamls - build_features: Set[str] - - # If true, then fragments for all dtypes for all kernel functions - # are included as well as all custom classes. This is typically set when any one of the - # operator lists is generated from a mechanism other than - # tracing based selective build. - include_all_non_op_selectives: bool - - @staticmethod - def get_nop_selector() -> 'SelectiveBuilder': - return SelectiveBuilder.from_yaml_dict({'include_all_operators': True}) - - @staticmethod - def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': - valid_top_level_keys = { - 'include_all_non_op_selectives', - 'include_all_operators', - 'debug_info', - 'operators', - 'kernel_metadata', - 'custom_classes', - 'build_features', - } - top_level_keys = set(data.keys()) - if len(top_level_keys - valid_top_level_keys) > 0: - raise Exception("Got unexpected top level keys: {}".format( - ",".join(top_level_keys - valid_top_level_keys), - )) - include_all_operators = data.get('include_all_operators', False) - assert isinstance(include_all_operators, bool) - - debug_info = None - if 'debug_info' in data: - di_list = data['debug_info'] - assert isinstance(di_list, list) - - debug_info = tuple(map(lambda x: str(x), di_list)) - - operators = {} - operators_dict = data.get('operators', {}) - assert isinstance(operators_dict, dict) - - for (k, v) in operators_dict.items(): - operators[k] = SelectiveBuildOperator.from_yaml_dict(k, v) - - kernel_metadata = {} - kernel_metadata_dict = data.get('kernel_metadata', {}) - assert isinstance(kernel_metadata_dict, dict) - - for (k, v) in kernel_metadata_dict.items(): - kernel_metadata[str(k)] = list(map(lambda dtype: str(dtype), v)) - - custom_classes = data.get('custom_classes', []) - custom_classes = set(custom_classes) # type: ignore[arg-type] - - build_features = data.get('build_features', []) - build_features = set(build_features) # type: ignore[arg-type] - - include_all_non_op_selectives = data.get('include_all_non_op_selectives', False) - assert isinstance(include_all_non_op_selectives, bool) - - return SelectiveBuilder( - include_all_operators, - debug_info, - operators, - kernel_metadata, - custom_classes, # type: ignore[arg-type] - build_features, # type: ignore[arg-type] - include_all_non_op_selectives, - ) - - @staticmethod - def from_yaml_str(config_contents: str) -> 'SelectiveBuilder': - contents = yaml.safe_load(config_contents) - return SelectiveBuilder.from_yaml_dict(contents) - - @staticmethod - def from_yaml_path(config_path: str) -> 'SelectiveBuilder': - with open(config_path, 'r') as f: - contents = yaml.safe_load(f) - return SelectiveBuilder.from_yaml_dict(contents) - - @staticmethod - def from_legacy_op_registration_allow_list( - allow_list: Set[str], - is_root_operator: bool, - is_used_for_training: bool) -> 
'SelectiveBuilder': - operators = {} - for op in allow_list: - operators[op] = { - 'name': op, - 'is_root_operator': is_root_operator, - 'is_used_for_training': is_used_for_training, - 'include_all_overloads': True, - } - return SelectiveBuilder.from_yaml_dict({ - 'operators': operators, - 'include_all_non_op_selectives': True, - }) - - def is_operator_selected(self, name: str) -> bool: - if self.include_all_operators: - return True - - if name in self.operators: - return True - name = strip_operator_overload_name(name) - return name in self.operators and self.operators[name].include_all_overloads - - def is_native_function_selected(self, func: NativeFunction) -> bool: - op_name = op_name_from_native_function(func) - return self.is_operator_selected(op_name) - - def is_operator_selected_for_training(self, name: str) -> bool: - if not self.is_operator_selected(name): - return False - if self.include_all_operators: - return True - - not_training_op = SelectiveBuildOperator( - name='', - is_root_operator=False, - is_used_for_training=False, - include_all_overloads=False, - _debug_info=None, - ) - op = not_training_op - if name in self.operators: - op = self.operators[name] - - name = strip_operator_overload_name(name) - base_op = not_training_op - if name in self.operators: - base_op = self.operators[name] - - return ( - op.is_used_for_training or - (base_op.include_all_overloads and base_op.is_used_for_training) - ) - - def is_native_function_selected_for_training(self, func: NativeFunction) -> bool: - op_name = op_name_from_native_function(func) - return self.is_operator_selected_for_training(op_name) - - def is_root_operator(self, name: str) -> bool: - if not self.is_operator_selected(name): - return False - if self.include_all_operators: - return True - - if name in self.operators: - op: SelectiveBuildOperator = self.operators[name] - return op.is_root_operator - name = strip_operator_overload_name(name) - if name not in self.operators: - return False - base_op: SelectiveBuildOperator = self.operators[name] - return base_op.include_all_overloads and base_op.is_root_operator - - def is_kernel_dtype_selected(self, kernel_tag: str, dtype: str) -> bool: - if self.include_all_operators or self.include_all_non_op_selectives: - return True - - return kernel_tag in self.kernel_metadata and dtype in self.kernel_metadata[kernel_tag] - - def to_dict(self) -> Dict[str, object]: - ret: Dict[str, object] = { - 'include_all_non_op_selectives': self.include_all_non_op_selectives, - 'include_all_operators': self.include_all_operators, - } - operators = {} - for (op_name, op) in self.operators.items(): - operators[op_name] = op.to_dict() - ret['operators'] = operators - - if self._debug_info is not None: - ret['debug_info'] = sorted(self._debug_info) - - ret['kernel_metadata'] = {k: sorted(list(v)) for (k, v) in self.kernel_metadata.items()} - - ret['custom_classes'] = sorted(self.custom_classes) - - ret['build_features'] = sorted(self.build_features) - - return ret - - -def merge_kernel_metadata( - lhs: Dict[str, List[str]], - rhs: Dict[str, List[str]], -) -> Dict[str, List[str]]: - kernel_metadata: Dict[str, List[str]] = {} - for (tag_name, dtypes) in list(lhs.items()) + list(rhs.items()): - dtypes_copy = set(dtypes) - if tag_name in kernel_metadata: - dtypes_copy |= set(kernel_metadata[tag_name]) - - kernel_metadata[tag_name] = list(dtypes_copy) - - return kernel_metadata - -def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> SelectiveBuilder: - include_all_operators = 
lhs.include_all_operators or rhs.include_all_operators - debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) - operators = merge_operator_dicts(lhs.operators, rhs.operators) - kernel_metadata = merge_kernel_metadata(lhs.kernel_metadata, rhs.kernel_metadata) - include_all_non_op_selectives = lhs.include_all_non_op_selectives or rhs.include_all_non_op_selectives - custom_classes = lhs.custom_classes.union(rhs.custom_classes) - build_features = lhs.build_features.union(rhs.build_features) - return SelectiveBuilder( - include_all_operators, - debug_info, - operators, - kernel_metadata, - custom_classes, - build_features, - include_all_non_op_selectives, - ) - - -def op_name_from_native_function(f: NativeFunction) -> str: - # This was originally read from the 'operator_name_with_overload' field in the - # declaration dict, which was the part before the first '(' in 'schema_string'. - return f'aten::{f.func.name}' diff --git a/tools/codegen/utils.py b/tools/codegen/utils.py deleted file mode 100644 index 48373a0db03c..000000000000 --- a/tools/codegen/utils.py +++ /dev/null @@ -1,237 +0,0 @@ -import contextlib -import functools -import hashlib -import os -import re -import textwrap -from typing import Tuple, List, Iterable, Iterator, Callable, Sequence, TypeVar, Optional, Dict, Any, Union, Set, NoReturn -from enum import Enum - -from tools.codegen.code_template import CodeTemplate - -# Safely load fast C Yaml loader/dumper if they are available -try: - from yaml import CSafeLoader as Loader -except ImportError: - from yaml import SafeLoader as Loader # type: ignore[misc] - -try: - from yaml import CSafeDumper as Dumper -except ImportError: - from yaml import SafeDumper as Dumper # type: ignore[misc] -YamlDumper = Dumper - -# A custom loader for YAML that errors on duplicate keys. -# This doesn't happen by default: see https://github.com/yaml/pyyaml/issues/165 -class YamlLoader(Loader): - def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] - mapping = [] - for key_node, value_node in node.value: - key = self.construct_object(key_node, deep=deep) # type: ignore[no-untyped-call] - assert key not in mapping, f"Found a duplicate key in the yaml. key={key}, line={node.start_mark.line}" - mapping.append(key) - mapping = super().construct_mapping(node, deep=deep) # type: ignore[no-untyped-call] - return mapping - -# Many of these functions share logic for defining both the definition -# and declaration (for example, the function signature is the same), so -# we organize them into one function that takes a Target to say which -# code we want. -# -# This is an OPEN enum (we may add more cases to it in the future), so be sure -# to explicitly specify with Union[Literal[Target.XXX]] what targets are valid -# for your use. -Target = Enum('Target', ( - # top level namespace (not including at) - 'DEFINITION', - 'DECLARATION', - # TORCH_LIBRARY(...) { ... } - 'REGISTRATION', - # namespace { ... } - 'ANONYMOUS_DEFINITION', - # namespace cpu { ... } - 'NAMESPACED_DEFINITION', - 'NAMESPACED_DECLARATION', -)) - -# Matches "foo" in "foo, bar" but not "foobar". 
Used to search for the -# occurrence of a parameter in the derivative formula -IDENT_REGEX = r'(^|\W){}($|\W)' - -# TODO: Use a real parser here; this will get bamboozled -def split_name_params(schema: str) -> Tuple[str, List[str]]: - m = re.match(r'(\w+)(\.\w+)?\((.*)\)', schema) - if m is None: - raise RuntimeError(f'Unsupported function schema: {schema}') - name, _, params = m.groups() - return name, params.split(', ') - -T = TypeVar('T') -S = TypeVar('S') - -# These two functions purposely return generators in analogy to map() -# so that you don't mix up when you need to list() them - -# Map over function that may return None; omit Nones from output sequence -def mapMaybe(func: Callable[[T], Optional[S]], xs: Iterable[T]) -> Iterator[S]: - for x in xs: - r = func(x) - if r is not None: - yield r - -# Map over function that returns sequences and cat them all together -def concatMap(func: Callable[[T], Sequence[S]], xs: Iterable[T]) -> Iterator[S]: - for x in xs: - for r in func(x): - yield r - -# Conveniently add error context to exceptions raised. Lets us -# easily say that an error occurred while processing a specific -# context. -@contextlib.contextmanager -def context(msg_fn: Callable[[], str]) -> Iterator[None]: - try: - yield - except Exception as e: - # TODO: this does the wrong thing with KeyError - msg = msg_fn() - msg = textwrap.indent(msg, ' ') - msg = f'{e.args[0]}\n{msg}' if e.args else msg - e.args = (msg,) + e.args[1:] - raise - -# A little trick from https://github.com/python/mypy/issues/6366 -# for getting mypy to do exhaustiveness checking -# TODO: put this somewhere else, maybe -def assert_never(x: NoReturn) -> NoReturn: - raise AssertionError("Unhandled type: {}".format(type(x).__name__)) - -@functools.lru_cache(maxsize=None) -def _read_template(template_fn: str) -> CodeTemplate: - return CodeTemplate.from_file(template_fn) - - -# String hash that's stable across different executions, unlike builtin hash -def string_stable_hash(s: str) -> int: - sha1 = hashlib.sha1(s.encode('latin1')).digest() - return int.from_bytes(sha1, byteorder='little') - -# A small abstraction for writing out generated files and keeping track -# of what files have been written (so you can write out a list of output -# files) -class FileManager: - install_dir: str - template_dir: str - dry_run: bool - filenames: Set[str] - - def __init__(self, install_dir: str, template_dir: str, dry_run: bool) -> None: - self.install_dir = install_dir - self.template_dir = template_dir - self.filenames = set() - self.dry_run = dry_run - - def _write_if_changed(self, filename: str, contents: str) -> None: - old_contents: Optional[str] - try: - with open(filename, 'r') as f: - old_contents = f.read() - except IOError: - old_contents = None - if contents != old_contents: - # Create output directory if it doesn't exist - os.makedirs(os.path.dirname(filename), exist_ok=True) - with open(filename, 'w') as f: - f.write(contents) - - def write_with_template(self, filename: str, template_fn: str, - env_callable: Callable[[], Union[str, Dict[str, Any]]]) -> None: - filename = '{}/{}'.format(self.install_dir, filename) - assert filename not in self.filenames, "duplicate file write {filename}" - self.filenames.add(filename) - if not self.dry_run: - env = env_callable() - if isinstance(env, dict): - # TODO: Update the comment reference to the correct location - if 'generated_comment' not in env: - comment = "@" + "generated by tools/codegen/gen.py" - comment += " from {}".format(os.path.basename(template_fn)) - 
env['generated_comment'] = comment - template = _read_template(os.path.join(self.template_dir, template_fn)) - self._write_if_changed(filename, template.substitute(env)) - elif isinstance(env, str): - self._write_if_changed(filename, env) - else: - assert_never(env) - - - def write(self, filename: str, env_callable: Callable[[], Union[str, Union[str, Dict[str, Any]]]]) -> None: - self.write_with_template(filename, filename, env_callable) - - def write_sharded( - self, - filename: str, - items: Iterable[T], - *, - key_fn: Callable[[T], str], - env_callable: Callable[[T], Dict[str, List[str]]], - num_shards: int, - base_env: Optional[Dict[str, Any]] = None, - sharded_keys: Set[str] - ) -> None: - - everything: Dict[str, Any] = {'shard_id': 'Everything'} - shards: List[Dict[str, Any]] = [{'shard_id': f'_{i}'} for i in range(num_shards)] - all_shards = [everything] + shards - - if base_env is not None: - for shard in all_shards: - shard.update(base_env) - - for key in sharded_keys: - for shard in all_shards: - if key in shard: - assert isinstance(shard[key], list), "sharded keys in base_env must be a list" - shard[key] = shard[key].copy() - else: - shard[key] = [] - - def merge_env(into: Dict[str, List[str]], from_: Dict[str, List[str]]) -> None: - for k, v in from_.items(): - assert k in sharded_keys, f"undeclared sharded key {k}" - into[k] += v - - if self.dry_run: - # Dry runs don't write any templates, so incomplete environments are fine - items = () - - for item in items: - key = key_fn(item) - sid = string_stable_hash(key) % num_shards - env = env_callable(item) - - merge_env(shards[sid], env) - merge_env(everything, env) - - dot_pos = filename.rfind('.') - if dot_pos == -1: - dot_pos = len(filename) - base_filename = filename[:dot_pos] - extension = filename[dot_pos:] - - for shard in all_shards: - shard_id = shard['shard_id'] - self.write_with_template(f"{base_filename}{shard_id}{extension}", - filename, - lambda: shard) - - # filenames is used to track compiled files, but FooEverything.cpp isn't meant to be compiled - self.filenames.discard( - f"{self.install_dir}/{base_filename}Everything{extension}") - - def write_outputs(self, variable_name: str, filename: str) -> None: - """Write a file containing the list of all outputs which are - generated by this script.""" - content = 'set({}\n {})'.format( - variable_name, '\n '.join('"' + name + '"' for name in sorted(self.filenames))) - self._write_if_changed(filename, content) diff --git a/tools/coverage_plugins_package/setup.py b/tools/coverage_plugins_package/setup.py index c93f6129258d..012506945504 100644 --- a/tools/coverage_plugins_package/setup.py +++ b/tools/coverage_plugins_package/setup.py @@ -6,8 +6,8 @@ setuptools.setup( name="coverage-plugins", version="0.0.1", - author='PyTorch Team', - author_email='packages@pytorch.org', + author="PyTorch Team", + author_email="packages@pytorch.org", description="plug-in to coverage for PyTorch JIT", long_description=long_description, long_description_content_type="text/markdown", diff --git a/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py b/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py index 8dcd31397d2a..a64670b6ada3 100644 --- a/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py +++ b/tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py @@ -1,4 +1,4 @@ -''' +""" This coverage plug-in attempts to cover JIT'd functions and methods that were previously missed in code coverage. 
Any function and method that was passed through/decorated with torch.jit.script or torch.jit.script_method should now be marked covered when coverage is run with this plug-in. @@ -6,39 +6,54 @@ DISCLAIMER: note that this will mark the entire JIT'd function/method as covered without seeking proof that the compiled code has been executed. This means that even if the code chunk is merely compiled and not run, it will get marked as covered. -''' +""" from coverage import CoveragePlugin, CoverageData # type: ignore[import] -from inspect import ismodule, isclass, ismethod, isfunction, iscode, getsourcefile, getsourcelines +from inspect import ( + ismodule, + isclass, + ismethod, + isfunction, + iscode, + getsourcefile, + getsourcelines, +) from time import time from typing import Any # All coverage stats resulting from this plug-in will be in a separate .coverage file that should be merged later with # `coverage combine`. The convention seems to be .coverage.dotted.suffix based on the following link: # https://coverage.readthedocs.io/en/coverage-5.5/cmd.html#combining-data-files-coverage-combine -cov_data = CoverageData(basename=f'.coverage.jit.{time()}') +cov_data = CoverageData(basename=f".coverage.jit.{time()}") def is_not_builtin_class(obj: Any) -> bool: - return isclass(obj) and not type(obj).__module__ == 'builtins' + return isclass(obj) and not type(obj).__module__ == "builtins" class JitPlugin(CoveragePlugin): # type: ignore[misc, no-any-unimported] - ''' + """ dynamic_context is an overridden function that gives us access to every frame run during the coverage process. We look for when the function being run is `should_drop`, as all functions that get passed into `should_drop` will be compiled and thus should be marked as covered. - ''' + """ + def dynamic_context(self, frame: Any) -> None: - if frame.f_code.co_name == 'should_drop': - obj = frame.f_locals['fn'] + if frame.f_code.co_name == "should_drop": + obj = frame.f_locals["fn"] # The many conditions in the if statement below are based on the accepted arguments to getsourcefile. Based # on its documentation (https://docs.python.org/3/library/inspect.html#inspect.getsourcefile), the argument # must be a module, class, method, function, traceback, frame, or code object AND it cannot be a built-in # module, class, or function. # Currently, we DO NOT include tracebacks or frames as they should not be JIT'd, and we have not checked for # built-in modules or functions as those do not seem to be JIT'd either. 
- if is_not_builtin_class(obj) or ismodule(obj) or ismethod(obj) or isfunction(obj) or iscode(obj): + if ( + is_not_builtin_class(obj) + or ismodule(obj) + or ismethod(obj) + or isfunction(obj) + or iscode(obj) + ): filename = getsourcefile(obj) # We don't want to report for filename = None if filename: @@ -51,9 +66,14 @@ def dynamic_context(self, frame: Any) -> None: except OSError: pass else: - line_data = {filename: range(starting_lineno, starting_lineno + len(sourcelines))} + line_data = { + filename: range( + starting_lineno, starting_lineno + len(sourcelines) + ) + } cov_data.add_lines(line_data) super().dynamic_context(frame) + def coverage_init(reg: Any, options: Any) -> None: reg.add_dynamic_context(JitPlugin()) diff --git a/tools/download_mnist.py b/tools/download_mnist.py index dfb0f95171ee..80894ad2bdbb 100644 --- a/tools/download_mnist.py +++ b/tools/download_mnist.py @@ -6,15 +6,15 @@ import sys MIRRORS = [ - 'http://yann.lecun.com/exdb/mnist/', - 'https://ossci-datasets.s3.amazonaws.com/mnist/', + "http://yann.lecun.com/exdb/mnist/", + "https://ossci-datasets.s3.amazonaws.com/mnist/", ] RESOURCES = [ - 'train-images-idx3-ubyte.gz', - 'train-labels-idx1-ubyte.gz', - 't10k-images-idx3-ubyte.gz', - 't10k-labels-idx1-ubyte.gz', + "train-images-idx3-ubyte.gz", + "train-labels-idx1-ubyte.gz", + "t10k-images-idx3-ubyte.gz", + "t10k-labels-idx1-ubyte.gz", ] @@ -25,23 +25,23 @@ def report_download_progress( ) -> None: if file_size != -1: percent = min(1, (chunk_number * chunk_size) / file_size) - bar = '#' * int(64 * percent) - sys.stdout.write('\r0% |{:<64}| {}%'.format(bar, int(percent * 100))) + bar = "#" * int(64 * percent) + sys.stdout.write("\r0% |{:<64}| {}%".format(bar, int(percent * 100))) def download(destination_path: str, resource: str, quiet: bool) -> None: if os.path.exists(destination_path): if not quiet: - print('{} already exists, skipping ...'.format(destination_path)) + print("{} already exists, skipping ...".format(destination_path)) else: for mirror in MIRRORS: url = mirror + resource - print('Downloading {} ...'.format(url)) + print("Downloading {} ...".format(url)) try: hook = None if quiet else report_download_progress urlretrieve(url, destination_path, reporthook=hook) except (URLError, ConnectionError) as e: - print('Failed to download (trying next):\n{}'.format(e)) + print("Failed to download (trying next):\n{}".format(e)) continue finally: if not quiet: @@ -49,32 +49,32 @@ def download(destination_path: str, resource: str, quiet: bool) -> None: print() break else: - raise RuntimeError('Error downloading resource!') + raise RuntimeError("Error downloading resource!") def unzip(zipped_path: str, quiet: bool) -> None: unzipped_path = os.path.splitext(zipped_path)[0] if os.path.exists(unzipped_path): if not quiet: - print('{} already exists, skipping ... '.format(unzipped_path)) + print("{} already exists, skipping ... 
".format(unzipped_path)) return - with gzip.open(zipped_path, 'rb') as zipped_file: - with open(unzipped_path, 'wb') as unzipped_file: + with gzip.open(zipped_path, "rb") as zipped_file: + with open(unzipped_path, "wb") as unzipped_file: unzipped_file.write(zipped_file.read()) if not quiet: - print('Unzipped {} ...'.format(zipped_path)) + print("Unzipped {} ...".format(zipped_path)) def main() -> None: parser = argparse.ArgumentParser( - description='Download the MNIST dataset from the internet') + description="Download the MNIST dataset from the internet" + ) parser.add_argument( - '-d', '--destination', default='.', help='Destination directory') + "-d", "--destination", default=".", help="Destination directory" + ) parser.add_argument( - '-q', - '--quiet', - action='store_true', - help="Don't report about progress") + "-q", "--quiet", action="store_true", help="Don't report about progress" + ) options = parser.parse_args() if not os.path.exists(options.destination): @@ -86,8 +86,8 @@ def main() -> None: download(path, resource, options.quiet) unzip(path, options.quiet) except KeyboardInterrupt: - print('Interrupted') + print("Interrupted") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/extract_scripts.py b/tools/extract_scripts.py index fd90b1b9f0e5..7a9a29decc5a 100755 --- a/tools/extract_scripts.py +++ b/tools/extract_scripts.py @@ -18,82 +18,85 @@ class Script(TypedDict): def extract(step: Step) -> Optional[Script]: - run = step.get('run') + run = step.get("run") # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#using-a-specific-shell - shell = step.get('shell', 'bash') + shell = step.get("shell", "bash") extension = { - 'bash': '.sh', - 'pwsh': '.ps1', - 'python': '.py', - 'sh': '.sh', - 'cmd': '.cmd', - 'powershell': '.ps1', + "bash": ".sh", + "pwsh": ".ps1", + "python": ".py", + "sh": ".sh", + "cmd": ".cmd", + "powershell": ".ps1", }.get(shell) - is_gh_script = step.get('uses', '').startswith('actions/github-script@') - gh_script = step.get('with', {}).get('script') + is_gh_script = step.get("uses", "").startswith("actions/github-script@") + gh_script = step.get("with", {}).get("script") if run is not None and extension is not None: script = { - 'bash': f'#!/usr/bin/env bash\nset -eo pipefail\n{run}', - 'sh': f'#!/usr/bin/env sh\nset -e\n{run}', + "bash": f"#!/usr/bin/env bash\nset -eo pipefail\n{run}", + "sh": f"#!/usr/bin/env sh\nset -e\n{run}", }.get(shell, run) - return {'extension': extension, 'script': script} + return {"extension": extension, "script": script} elif is_gh_script and gh_script is not None: - return {'extension': '.js', 'script': gh_script} + return {"extension": ".js", "script": gh_script} else: return None def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument('--out', required=True) + parser.add_argument("--out", required=True) args = parser.parse_args() out = Path(args.out) if out.exists(): - sys.exit(f'{out} already exists; aborting to avoid overwriting') + sys.exit(f"{out} already exists; aborting to avoid overwriting") gha_expressions_found = False - for p in Path('.github/workflows').iterdir(): - with open(p) as f: + for p in Path(".github/workflows").iterdir(): + with open(p, "rb") as f: workflow = yaml.safe_load(f) - for job_name, job in workflow['jobs'].items(): + for job_name, job in workflow["jobs"].items(): job_dir = out / p / job_name - steps = job['steps'] + if "steps" not in job: + continue + steps = job["steps"] index_chars = len(str(len(steps) - 1)) for 
i, step in enumerate(steps, start=1): extracted = extract(step) if extracted: - script = extracted['script'] - step_name = step.get('name', '') - if '${{' in script: + script = extracted["script"] + step_name = step.get("name", "") + if "${{" in script: gha_expressions_found = True print( - f'{p} job `{job_name}` step {i}: {step_name}', - file=sys.stderr + f"{p} job `{job_name}` step {i}: {step_name}", + file=sys.stderr, ) job_dir.mkdir(parents=True, exist_ok=True) sanitized = re.sub( - '[^a-zA-Z_]+', '_', - f'_{step_name}', - ).rstrip('_') - extension = extracted['extension'] - filename = f'{i:0{index_chars}}{sanitized}{extension}' + "[^a-zA-Z_]+", + "_", + f"_{step_name}", + ).rstrip("_") + extension = extracted["extension"] + filename = f"{i:0{index_chars}}{sanitized}{extension}" (job_dir / filename).write_text(script) if gha_expressions_found: sys.exit( - 'Each of the above scripts contains a GitHub Actions ' - '${{ }} which must be replaced with an `env` variable' - ' for security reasons.' + "Each of the above scripts contains a GitHub Actions " + "${{ }} which must be replaced with an `env` variable" + " for security reasons." ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/fast_nvcc/fast_nvcc.py b/tools/fast_nvcc/fast_nvcc.py index f1bb4fa6c9e6..0a1ae07c2342 100755 --- a/tools/fast_nvcc/fast_nvcc.py +++ b/tools/fast_nvcc/fast_nvcc.py @@ -14,12 +14,11 @@ import subprocess import sys import time -from typing import (Awaitable, DefaultDict, Dict, List, Match, Optional, Set, - cast) +from typing import Awaitable, DefaultDict, Dict, List, Match, Optional, Set, cast from typing_extensions import TypedDict -help_msg = '''fast_nvcc [OPTION]... -- [NVCC_ARG]... +help_msg = """fast_nvcc [OPTION]... -- [NVCC_ARG]... Run the commands given by nvcc --dryrun, in parallel. @@ -31,61 +30,61 @@ instance passing --help (after "--") doesn't work since the --help execution path doesn't compile anything, so adding --dryrun there gives nothing in stderr. 
-''' +""" parser = argparse.ArgumentParser(help_msg) parser.add_argument( - '--faithful', - action='store_true', + "--faithful", + action="store_true", help="don't modify the commands given by nvcc (slower)", ) parser.add_argument( - '--graph', - metavar='FILE.gv', - help='write Graphviz DOT file with execution graph', + "--graph", + metavar="FILE.gv", + help="write Graphviz DOT file with execution graph", ) parser.add_argument( - '--nvcc', - metavar='PATH', - default='nvcc', + "--nvcc", + metavar="PATH", + default="nvcc", help='path to nvcc (default is just "nvcc")', ) parser.add_argument( - '--save', - metavar='DIR', - help='copy intermediate files from each command into DIR', + "--save", + metavar="DIR", + help="copy intermediate files from each command into DIR", ) parser.add_argument( - '--sequential', - action='store_true', - help='sequence commands instead of using the graph (slower)', + "--sequential", + action="store_true", + help="sequence commands instead of using the graph (slower)", ) parser.add_argument( - '--table', - metavar='FILE.csv', - help='write CSV with times and intermediate file sizes', + "--table", + metavar="FILE.csv", + help="write CSV with times and intermediate file sizes", ) parser.add_argument( - '--verbose', - metavar='FILE.txt', - help='like nvcc --verbose, but expanded and into a file', + "--verbose", + metavar="FILE.txt", + help="like nvcc --verbose, but expanded and into a file", ) default_config = parser.parse_args([]) # docs about temporary directories used by NVCC -url_base = 'https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html' -url_vars = f'{url_base}#keeping-intermediate-phase-files' +url_base = "https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html" +url_vars = f"{url_base}#keeping-intermediate-phase-files" # regex for temporary file names -re_tmp = r'(? None: """ Warn the user about something regarding fast_nvcc. """ - print(f'warning (fast_nvcc): {warning}', file=sys.stderr) + print(f"warning (fast_nvcc): {warning}", file=sys.stderr) def warn_if_windows() -> None: @@ -95,7 +94,7 @@ def warn_if_windows() -> None: # use os.name instead of platform.system() because there is a # platform.py file in this directory, making it very difficult to # import the platform module from the Python standard library - if os.name == 'nt': + if os.name == "nt": fast_nvcc_warn("untested on Windows, might not work; see this URL:") fast_nvcc_warn(url_vars) @@ -104,24 +103,24 @@ def warn_if_tmpdir_flag(args: List[str]) -> None: """ Warn the user that using fast_nvcc with some flags might not work. 
""" - file_path_specs = 'file-and-path-specifications' - guiding_driver = 'options-for-guiding-compiler-driver' + file_path_specs = "file-and-path-specifications" + guiding_driver = "options-for-guiding-compiler-driver" scary_flags = { - '--objdir-as-tempdir': file_path_specs, - '-objtemp': file_path_specs, - '--keep': guiding_driver, - '-keep': guiding_driver, - '--keep-dir': guiding_driver, - '-keep-dir': guiding_driver, - '--save-temps': guiding_driver, - '-save-temps': guiding_driver, + "--objdir-as-tempdir": file_path_specs, + "-objtemp": file_path_specs, + "--keep": guiding_driver, + "-keep": guiding_driver, + "--keep-dir": guiding_driver, + "-keep-dir": guiding_driver, + "--save-temps": guiding_driver, + "-save-temps": guiding_driver, } for arg in args: for flag, frag in scary_flags.items(): - if re.match(fr'^{re.escape(flag)}(?:=.*)?$', arg): - fast_nvcc_warn(f'{flag} not supported since it interacts with') - fast_nvcc_warn('TMPDIR, so fast_nvcc may break; see this URL:') - fast_nvcc_warn(f'{url_base}#{frag}') + if re.match(rf"^{re.escape(flag)}(?:=.*)?$", arg): + fast_nvcc_warn(f"{flag} not supported since it interacts with") + fast_nvcc_warn("TMPDIR, so fast_nvcc may break; see this URL:") + fast_nvcc_warn(f"{url_base}#{frag}") class DryunData(TypedDict): @@ -135,18 +134,18 @@ def nvcc_dryrun_data(binary: str, args: List[str]) -> DryunData: Return parsed environment variables and commands from nvcc --dryrun. """ result = subprocess.run( # type: ignore[call-overload] - [binary, '--dryrun'] + args, + [binary, "--dryrun"] + args, capture_output=True, - encoding='ascii', # this is just a guess + encoding="ascii", # this is just a guess ) - print(result.stdout, end='') + print(result.stdout, end="") env = {} commands = [] for line in result.stderr.splitlines(): - match = re.match(r'^#\$ (.*)$', line) + match = re.match(r"^#\$ (.*)$", line) if match: - stripped, = match.groups() - mapping = re.match(r'^(\w+)=(.*)$', stripped) + (stripped,) = match.groups() + mapping = re.match(r"^(\w+)=(.*)$", stripped) if mapping: name, val = mapping.groups() env[name] = val @@ -154,14 +153,14 @@ def nvcc_dryrun_data(binary: str, args: List[str]) -> DryunData: commands.append(stripped) else: print(line, file=sys.stderr) - return {'env': env, 'commands': commands, 'exit_code': result.returncode} + return {"env": env, "commands": commands, "exit_code": result.returncode} def warn_if_tmpdir_set(env: Dict[str, str]) -> None: """ Warn the user that setting TMPDIR with fast_nvcc might not work. """ - if os.getenv('TMPDIR') or 'TMPDIR' in env: + if os.getenv("TMPDIR") or "TMPDIR" in env: fast_nvcc_warn("TMPDIR is set, might not work; see this URL:") fast_nvcc_warn(url_vars) @@ -183,17 +182,17 @@ def module_id_contents(command: List[str]) -> str: """ Guess the contents of the .module_id file contained within command. 
""" - if command[0] == 'cicc': + if command[0] == "cicc": path = command[-3] - elif command[0] == 'cudafe++': + elif command[0] == "cudafe++": path = command[-1] - middle = pathlib.PurePath(path).name.replace('-', '_').replace('.', '_') + middle = pathlib.PurePath(path).name.replace("-", "_").replace(".", "_") # this suffix is very wrong (the real one is far less likely to be # unique), but it seems difficult to find a rule that reproduces the # real suffixes, so here's one that, while inaccurate, is at least # hopefully as straightforward as possible suffix = hashlib.md5(str.encode(middle)).hexdigest()[:8] - return f'_{len(middle)}_{middle}_{suffix}' + return f"_{len(middle)}_{middle}_{suffix}" def unique_module_id_files(commands: List[str]) -> List[str]: @@ -206,14 +205,14 @@ def unique_module_id_files(commands: List[str]) -> List[str]: arr = [] def uniqueify(s: Match[str]) -> str: - filename = re.sub(r'\-(\d+)', r'-\1-' + str(i), s.group(0)) + filename = re.sub(r"\-(\d+)", r"-\1-" + str(i), s.group(0)) arr.append(filename) return filename - line = re.sub(re_tmp + r'.module_id', uniqueify, line) - line = re.sub(r'\s*\-\-gen\_module\_id\_file\s*', ' ', line) + line = re.sub(re_tmp + r".module_id", uniqueify, line) + line = re.sub(r"\s*\-\-gen\_module\_id\_file\s*", " ", line) if arr: - filename, = arr + (filename,) = arr if not module_id: module_id = module_id_contents(shlex.split(line)) uniqueified.append(f"echo -n '{module_id}' > '{filename}'") @@ -225,7 +224,7 @@ def make_rm_force(commands: List[str]) -> List[str]: """ Add --force to all rm commands. """ - return [f'{c} --force' if c.startswith('rm ') else c for c in commands] + return [f"{c} --force" if c.startswith("rm ") else c for c in commands] def print_verbose_output( @@ -238,12 +237,12 @@ def print_verbose_output( Human-readably write nvcc --dryrun data to stderr. """ padding = len(str(len(commands) - 1)) - with open(filename, 'w') as f: + with open(filename, "w") as f: for name, val in env.items(): print(f'#{" "*padding}$ {name}={val}', file=f) for i, command in enumerate(commands): - prefix = f'{str(i).rjust(padding)}$ ' - print(f'#{prefix}{command[0]}', file=f) + prefix = f"{str(i).rjust(padding)}$ " + print(f"#{prefix}{command[0]}", file=f) for part in command[1:]: print(f'#{" "*len(prefix)}{part}', file=f) @@ -262,7 +261,7 @@ def files_mentioned(command: str) -> List[str]: """ Return fully-qualified names of all tmp files referenced by command. """ - return [f'/tmp/{match.group(1)}' for match in re.finditer(re_tmp, command)] + return [f"/tmp/{match.group(1)}" for match in re.finditer(re_tmp, command)] def nvcc_data_dependencies(commands: List[str]) -> Graph: @@ -291,11 +290,11 @@ def nvcc_data_dependencies(commands: List[str]) -> Graph: for filename in fatbins[dep]: if filename in tmp_files: deps.add(tmp_files[filename]) - if tmp.endswith('.fatbin.c') and not line.startswith('fatbinary'): + if tmp.endswith(".fatbin.c") and not line.startswith("fatbinary"): fatbins[i].add(tmp) else: tmp_files[tmp] = i - if line.startswith('rm ') and not deps: + if line.startswith("rm ") and not deps: deps.add(i - 1) graph.append(deps) return graph @@ -329,7 +328,7 @@ def warn_if_not_weakly_connected(graph: Graph) -> None: Warn the user if the execution graph is not weakly connected. 
""" if not is_weakly_connected(graph): - fast_nvcc_warn('execution graph is not (weakly) connected') + fast_nvcc_warn("execution graph is not (weakly) connected") def print_dot_graph( @@ -341,18 +340,19 @@ def print_dot_graph( """ Print a DOT file displaying short versions of the commands in graph. """ + def name(k: int) -> str: return f'"{k} {os.path.basename(commands[k][0])}"' - with open(filename, 'w') as f: - print('digraph {', file=f) + + with open(filename, "w") as f: + print("digraph {", file=f) # print all nodes, in case it's disconnected for i in range(len(graph)): - print(f' {name(i)};', file=f) + print(f" {name(i)};", file=f) for i, deps in enumerate(graph): for j in deps: - print(f' {name(j)} -> {name(i)};', file=f) - print('}', file=f) - + print(f" {name(j)} -> {name(i)};", file=f) + print("}", file=f) class Result(TypedDict, total=False): @@ -378,7 +378,7 @@ async def run_command( for task in deps: dep_result = await task # abort if a previous step failed - if 'exit_code' not in dep_result or dep_result['exit_code'] != 0: + if "exit_code" not in dep_result or dep_result["exit_code"] != 0: return {} if gather_data: t1 = time.monotonic() @@ -390,17 +390,17 @@ async def run_command( ) stdout, stderr = await proc.communicate() code = cast(int, proc.returncode) - results: Result = {'exit_code': code, 'stdout': stdout, 'stderr': stderr} + results: Result = {"exit_code": code, "stdout": stdout, "stderr": stderr} if gather_data: t2 = time.monotonic() - results['time'] = t2 - t1 + results["time"] = t2 - t1 sizes = {} for tmp_file in files_mentioned(command): if os.path.exists(tmp_file): sizes[tmp_file] = os.path.getsize(tmp_file) else: sizes[tmp_file] = 0 - results['files'] = sizes + results["files"] = sizes if save: dest = pathlib.Path(save) / str(i) dest.mkdir() @@ -424,14 +424,18 @@ async def run_graph( tasks: List[Awaitable[Result]] = [] for i, (command, indices) in enumerate(zip(commands, graph)): deps = {tasks[j] for j in indices} - tasks.append(asyncio.create_task(run_command( # type: ignore[attr-defined] - command, - env=env, - deps=deps, - gather_data=gather_data, - i=i, - save=save, - ))) + tasks.append( + asyncio.create_task( + run_command( # type: ignore[attr-defined] + command, + env=env, + deps=deps, + gather_data=gather_data, + i=i, + save=save, + ) + ) + ) return [await task for task in tasks] @@ -440,8 +444,8 @@ def print_command_outputs(command_results: List[Result]) -> None: Print captured stdout and stderr from commands. 
""" for result in command_results: - sys.stdout.write(result.get('stdout', b'').decode('ascii')) - sys.stderr.write(result.get('stderr', b'').decode('ascii')) + sys.stdout.write(result.get("stdout", b"").decode("ascii")) + sys.stderr.write(result.get("stderr", b"").decode("ascii")) def write_log_csv( @@ -455,15 +459,15 @@ def write_log_csv( """ tmp_files: List[str] = [] for result in command_results: - tmp_files.extend(result.get('files', {}).keys()) - with open(filename, 'w', newline='') as csvfile: - fieldnames = ['command', 'seconds'] + list(dict.fromkeys(tmp_files)) + tmp_files.extend(result.get("files", {}).keys()) + with open(filename, "w", newline="") as csvfile: + fieldnames = ["command", "seconds"] + list(dict.fromkeys(tmp_files)) writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for i, result in enumerate(command_results): - command = f'{i} {os.path.basename(command_parts[i][0])}' - row = {'command': command, 'seconds': result.get('time', 0)} - writer.writerow({**row, **result.get('files', {})}) + command = f"{i} {os.path.basename(command_parts[i][0])}" + row = {"command": command, "seconds": result.get("time", 0)} + writer.writerow({**row, **result.get("files", {})}) def exit_code(results: List[Result]) -> int: @@ -471,7 +475,7 @@ def exit_code(results: List[Result]) -> int: Aggregate individual exit codes into a single code. """ for result in results: - code = result.get('exit_code', 0) + code = result.get("exit_code", 0) if code != 0: return code return 0 @@ -497,9 +501,9 @@ def fast_nvcc( warn_if_windows() warn_if_tmpdir_flag(args) dryrun_data = nvcc_dryrun_data(config.nvcc, args) - env = dryrun_data['env'] + env = dryrun_data["env"] warn_if_tmpdir_set(env) - commands = dryrun_data['commands'] + commands = dryrun_data["commands"] if not config.faithful: commands = make_rm_force(unique_module_id_files(commands)) @@ -523,13 +527,15 @@ def fast_nvcc( ) if config.sequential: graph = straight_line_dependencies(commands) - results = asyncio.run(run_graph( # type: ignore[attr-defined] - env=env, - commands=commands, - graph=graph, - gather_data=bool(config.table), - save=config.save, - )) + results = asyncio.run( + run_graph( # type: ignore[attr-defined] + env=env, + commands=commands, + graph=graph, + gather_data=bool(config.table), + save=config.save, + ) + ) print_command_outputs(results) if config.table: write_log_csv(command_parts, results, filename=config.table) @@ -537,10 +543,10 @@ def fast_nvcc( def our_arg(arg: str) -> bool: - return arg != '--' + return arg != "--" -if __name__ == '__main__': +if __name__ == "__main__": argv = sys.argv[1:] us = list(itertools.takewhile(our_arg, argv)) them = list(itertools.dropwhile(our_arg, argv)) diff --git a/tools/gdb/pytorch-gdb.py b/tools/gdb/pytorch-gdb.py index 46cdcdec2de2..0ed516078f76 100644 --- a/tools/gdb/pytorch-gdb.py +++ b/tools/gdb/pytorch-gdb.py @@ -2,6 +2,7 @@ import textwrap from typing import Any + class DisableBreakpoints: """ Context-manager to temporarily disable all gdb breakpoints, useful if @@ -20,6 +21,7 @@ def __exit__(self, etype: Any, evalue: Any, tb: Any) -> None: for b in self.disabled_breakpoints: b.enabled = True + class TensorRepr(gdb.Command): # type: ignore[misc, no-any-unimported] """ Print a human readable representation of the given at::Tensor. @@ -30,23 +32,26 @@ class TensorRepr(gdb.Command): # type: ignore[misc, no-any-unimported] internally creates a Python wrapper for the given tensor and call repr() on it. 
""" + __doc__ = textwrap.dedent(__doc__).strip() def __init__(self) -> None: - gdb.Command.__init__(self, 'torch-tensor-repr', - gdb.COMMAND_USER, gdb.COMPLETE_EXPRESSION) + gdb.Command.__init__( + self, "torch-tensor-repr", gdb.COMMAND_USER, gdb.COMPLETE_EXPRESSION + ) def invoke(self, args: str, from_tty: bool) -> None: args = gdb.string_to_argv(args) if len(args) != 1: - print('Usage: torch-tensor-repr EXP') + print("Usage: torch-tensor-repr EXP") return name = args[0] with DisableBreakpoints(): - res = gdb.parse_and_eval('torch::gdb::tensor_repr(%s)' % name) - print('Python-level repr of %s:' % name) + res = gdb.parse_and_eval("torch::gdb::tensor_repr(%s)" % name) + print("Python-level repr of %s:" % name) print(res.string()) # torch::gdb::tensor_repr returns a malloc()ed buffer, let's free it - gdb.parse_and_eval('(void)free(%s)' % int(res)) + gdb.parse_and_eval("(void)free(%s)" % int(res)) + TensorRepr() diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py index 2ee17b76e52f..e47c61f55eb3 100644 --- a/tools/generate_torch_version.py +++ b/tools/generate_torch_version.py @@ -5,46 +5,59 @@ from setuptools import distutils # type: ignore[import] from typing import Optional, Union + def get_sha(pytorch_root: Union[str, Path]) -> str: try: - return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=pytorch_root).decode('ascii').strip() + return ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=pytorch_root) + .decode("ascii") + .strip() + ) except Exception: - return 'Unknown' + return "Unknown" + def get_torch_version(sha: Optional[str] = None) -> str: pytorch_root = Path(__file__).parent.parent - version = open(pytorch_root / 'version.txt', 'r').read().strip() + version = open(pytorch_root / "version.txt", "r").read().strip() - if os.getenv('PYTORCH_BUILD_VERSION'): - assert os.getenv('PYTORCH_BUILD_NUMBER') is not None - build_number = int(os.getenv('PYTORCH_BUILD_NUMBER', "")) - version = os.getenv('PYTORCH_BUILD_VERSION', "") + if os.getenv("PYTORCH_BUILD_VERSION"): + assert os.getenv("PYTORCH_BUILD_NUMBER") is not None + build_number = int(os.getenv("PYTORCH_BUILD_NUMBER", "")) + version = os.getenv("PYTORCH_BUILD_VERSION", "") if build_number > 1: - version += '.post' + str(build_number) - elif sha != 'Unknown': + version += ".post" + str(build_number) + elif sha != "Unknown": if sha is None: sha = get_sha(pytorch_root) - version += '+git' + sha[:7] + version += "+git" + sha[:7] return version + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate torch/version.py from build and environment metadata.") - parser.add_argument("--is_debug", type=distutils.util.strtobool, help="Whether this build is debug mode or not.") + parser = argparse.ArgumentParser( + description="Generate torch/version.py from build and environment metadata." 
+ ) + parser.add_argument( + "--is_debug", + type=distutils.util.strtobool, + help="Whether this build is debug mode or not.", + ) parser.add_argument("--cuda_version", type=str) parser.add_argument("--hip_version", type=str) args = parser.parse_args() assert args.is_debug is not None - args.cuda_version = None if args.cuda_version == '' else args.cuda_version - args.hip_version = None if args.hip_version == '' else args.hip_version + args.cuda_version = None if args.cuda_version == "" else args.cuda_version + args.hip_version = None if args.hip_version == "" else args.hip_version pytorch_root = Path(__file__).parent.parent version_path = pytorch_root / "torch" / "version.py" sha = get_sha(pytorch_root) version = get_torch_version(sha) - with open(version_path, 'w') as f: + with open(version_path, "w") as f: f.write("__version__ = '{}'\n".format(version)) # NB: This is not 100% accurate, because you could have built the # library code with DEBUG, but csrc without DEBUG (in which case diff --git a/tools/git-pre-commit b/tools/git-pre-commit deleted file mode 100755 index 1c4340c6b434..000000000000 --- a/tools/git-pre-commit +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -echo "Running pre-commit flake8" -python3 tools/linter/flake8_hook.py - -echo "Running pre-commit clang-tidy" -git diff HEAD > pr.diff -python3 -m tools.linter.clang_tidy --diff-file "pr.diff" -rm pr.diff - -echo "Running pre-commit clang-format" -tools/linter/git-clang-format HEAD~ --force diff --git a/tools/iwyu/fixup.py b/tools/iwyu/fixup.py index b4d6294cbae0..4ce80bb0f52b 100644 --- a/tools/iwyu/fixup.py +++ b/tools/iwyu/fixup.py @@ -2,7 +2,7 @@ import re QUOTE_INCLUDE_RE = re.compile(r'^#include "(.*)"') -ANGLE_INCLUDE_RE = re.compile(r'^#include <(.*)>') +ANGLE_INCLUDE_RE = re.compile(r"^#include <(.*)>") # By default iwyu will pick the C include, but we prefer the C++ headers STD_C_HEADER_MAP = { @@ -34,25 +34,27 @@ "": "", } + def main() -> None: for line in sys.stdin: # Convert all quoted includes to angle brackets match = QUOTE_INCLUDE_RE.match(line) if match is not None: - print(f"#include <{match.group(1)}>{line[match.end(0):]}", end='') + print(f"#include <{match.group(1)}>{line[match.end(0):]}", end="") continue match = ANGLE_INCLUDE_RE.match(line) if match is not None: path = f"<{match.group(1)}>" new_path = STD_C_HEADER_MAP.get(path, path) - tail = line[match.end(0):] + tail = line[match.end(0) :] if len(tail) > 1: - tail = ' ' + tail - print(f"#include {new_path}{tail}", end='') + tail = " " + tail + print(f"#include {new_path}{tail}", end="") continue - print(line, end='') + print(line, end="") + if __name__ == "__main__": main() diff --git a/tools/jit/BUILD.buck b/tools/jit/BUILD.buck new file mode 100644 index 000000000000..d79aece1ed24 --- /dev/null +++ b/tools/jit/BUILD.buck @@ -0,0 +1,13 @@ +python_library( + name = "jit", + srcs = glob([ + "*.py", + "templates/*", + ]), + base_module = "tools.jit", + visibility = ["PUBLIC"], + deps = [ + "//:aten_code_template", + "//torchgen:torchgen", + ], +) diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py new file mode 100644 index 000000000000..154b4f527b7e --- /dev/null +++ b/tools/jit/gen_unboxing.py @@ -0,0 +1,247 @@ +# Generates RegisterCodegenUnboxedKernels.cpp, UnboxingFunctions.h and UnboxingFunctions.cpp. 
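# (Illustrative aside, not part of the diff.) A minimal sketch of the
# version-string logic in get_torch_version() above, assuming version.txt
# reads "1.12.0a0" and a hypothetical commit sha; the helper name is made up.
def sketch_version(base: str, sha: str, build_version: str = "", build_number: int = 0) -> str:
    # PYTORCH_BUILD_VERSION wins; ".postN" is appended only for build numbers > 1
    if build_version:
        return build_version + (".post" + str(build_number) if build_number > 1 else "")
    # otherwise local/nightly builds get a "+git" plus short-sha suffix
    if sha != "Unknown":
        return base + "+git" + sha[:7]
    return base

assert sketch_version("1.12.0a0", "abc1234def0") == "1.12.0a0+gitabc1234"
assert sketch_version("1.12.0a0", "abc1234def0", "1.12.0", 2) == "1.12.0.post2"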
+import argparse +import os +import pathlib +from dataclasses import dataclass +from torchgen.api import unboxing +from torchgen.api.translate import translate +from torchgen.api.types import CppSignatureGroup +from torchgen.api.unboxing import convert_arguments +from torchgen.context import method_with_native_function +from torchgen.gen import parse_native_yaml, cpp_string, get_custom_build_selector +from torchgen.model import NativeFunction, NativeFunctionsGroup, Variant +from torchgen.selective_build.selector import SelectiveBuilder +from torchgen.utils import Target, FileManager, mapMaybe, make_file_manager +from typing import Union, Sequence +from typing_extensions import Literal + + +# Generates UnboxingFunctions.h & UnboxingFunctions.cpp. +@dataclass(frozen=True) +class ComputeUnboxingFunctions: + target: Union[Literal[Target.DECLARATION], Literal[Target.DEFINITION]] + selector: SelectiveBuilder + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str: + if not self.selector.is_root_operator(f"aten::{f.func.name}"): + return "" + + if self.target is Target.DECLARATION: + # Note [The ATen Codegen Unboxing API] + # Similar to the ATen Operators API, ATen Codegen Unboxing API lives in the at::unboxing namespace, and + # will be used by codegen unboxing wrappers (CodegenUnboxingWrappers.cpp). + # The Wrappers will be registered into torch::jit::OperatorRegistry using RegisterOperators API. + # + # Important characteristics about the Codegen Unboxing API: + # (1) It follows the OperatorRegistry API. + # This is kind of necessary to avoid overhead. + # For example: if it followed the C++ API, then all of the faithful C++ factory functions + # would need to wrap their arguments into TensorOptions only to unwrap them again. + # (2) Under the hood it calls C++ API. + return f""" +// aten::{f.func} +TORCH_API void {f.func.name.unambiguous_name()}(Stack & stack); +""" + else: + sig_group = CppSignatureGroup.from_native_function( + f, method=(Variant.method in f.variants) + ) + sig = sig_group.most_faithful_signature() + # parse arguments into C++ code + binding_list, code_list = convert_arguments(f) + + # for each C++ argument, generate the conversion code + code_connector = "\n\t" + arg_connector = ", " + # function call and push back to stack + prefix = "self_base." if sig.method else "at::" + translated_args = translate( + binding_list, sig.arguments(), method=sig.method + ) + args_str = f"{arg_connector.join(e.expr for e in translated_args)}" + if len(f.func.returns) == 0: + ret_str = "" + push_str = "" + else: + ret_str = "auto result_ = " + push_str = """ + pack(stack, std::move(result_)); + """ + return f""" +// aten::{f.func} +TORCH_API void {f.func.name.unambiguous_name()}(Stack & stack) {{ + {code_connector.join(code_list)} + + drop(stack, {len(binding_list)}); + + {ret_str}{prefix}{sig.name()}({args_str}); + {push_str} +}} +""" + + +# Generates RegisterCodegenUnboxedKernels.cpp. 
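# (Illustrative aside, not part of the diff.) Roughly the shape of the C++
# that ComputeUnboxingFunctions above emits for a DEFINITION target, using a
# hypothetical op aten::foo(Tensor self) -> Tensor. The argument-popping line
# really comes from torchgen's convert_arguments(); it is shown schematically.
EXAMPLE_GENERATED_WRAPPER = """
// aten::foo(Tensor self) -> Tensor
TORCH_API void foo(Stack & stack) {
    auto self = std::move(peek(stack, 0, 1)).toTensor();

    drop(stack, 1);

    auto result_ = at::foo(self);
    pack(stack, std::move(result_));
}
"""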
+@dataclass(frozen=True) +class ComputeCodegenUnboxedKernels: + selector: SelectiveBuilder + + @method_with_native_function + def __call__(self, f: NativeFunction) -> str: + if not self.selector.is_root_operator(f"aten::{f.func.name}"): + return "" + # We unconditionally generate function wrappers, + sig_group = CppSignatureGroup.from_native_function(f, method=False) + + sig = sig_group.most_faithful_signature() + + # escape double quote in schema, get rid of extra double quotes + schema = cpp_string(str(sig.func))[1:-1] + + # arguments + args = sig.arguments() + connector = ",\n\t\t" + args_code = [] + for arg in args: + if not arg.default: + arg_cpp = "c10::IValue(c10::nullopt)" + elif arg.default.startswith("{"): + arg_cpp = f"c10::IntArrayRef({arg.default})" + else: + arg_cpp = f"c10::IValue({arg.default})" + args_code.append( + f"""c10::Argument("{arg.name}", nullptr, c10::nullopt, {arg_cpp})""" + ) + + returns = f.func.returns + returns_code = [] + for ret in returns: + returns_code.append(f"""c10::Argument("{ret.name if ret.name else ""}")""") + return f""" +// aten::{schema} +OperatorGenerator( + "aten::{f.func.name.name}", + "{f.func.name.overload_name}", + {{ + {connector.join(args_code)} + }}, + {{ + {connector.join(returns_code)} + }}, + [](Stack & stack) {{ + RECORD_FUNCTION("{sig.name()}", std::vector()); + at::unboxing::{unboxing.name(f)}(stack); + }}, + aliasAnalysisFromSchema() +), +""" + + +def gen_unboxing( + *, + native_functions: Sequence[NativeFunction], + cpu_fm: FileManager, + selector: SelectiveBuilder, +) -> None: + def key_func(fn: Union[NativeFunction, NativeFunctionsGroup]) -> str: + return fn.root_name + + cpu_fm.write_sharded( + "UnboxingFunctions.cpp", + native_functions, + key_fn=key_func, + env_callable=lambda fn: { + "definitions": [ComputeUnboxingFunctions(Target.DEFINITION, selector)(fn)] + }, + num_shards=5, + sharded_keys={"definitions"}, + ) + cpu_fm.write( + "UnboxingFunctions.h", + lambda: { + "declarations": list( + mapMaybe( + ComputeUnboxingFunctions(Target.DECLARATION, selector), + native_functions, + ) + ), + }, + ) + cpu_fm.write_sharded( + "RegisterCodegenUnboxedKernels.cpp", + native_functions, + key_fn=key_func, + env_callable=lambda fn: { + "unboxed_ops": [ComputeCodegenUnboxedKernels(selector)(fn)] + }, + num_shards=10, + sharded_keys={"unboxed_ops"}, + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate unboxing source files") + parser.add_argument( + "-s", + "--source-path", + help="path to source directory for ATen", + default="aten/src/ATen", + ) + parser.add_argument( + "-d", "--install_dir", help="output directory", default="build/aten/src/ATen" + ) + parser.add_argument( + "-o", + "--output-dependencies", + help="output a list of dependencies into the given file and exit", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="run without writing any files (still updates outputs)", + ) + parser.add_argument( + "--op_selection_yaml_path", + help="Provide a path to the operator selection (for custom build) YAML " + "that contains the information about the set of selected operators " + "and their categories (training, ...). Each operator is either a " + "full operator name with overload or just a bare operator name. " + "The operator names also contain the namespace prefix (e.g. 
aten::)", + ) + parser.add_argument( + "--op_registration_allowlist", + nargs="*", + help="filter op registrations by the allowlist (if set); " + "each item is `namespace`::`operator name` without overload name; " + "e.g.: aten::empty aten::conv2d ...", + ) + + options = parser.parse_args() + + selector = get_custom_build_selector( + options.op_registration_allowlist, + options.op_selection_yaml_path, + ) + + native_yaml_path = os.path.join(options.source_path, "native/native_functions.yaml") + tags_yaml_path = os.path.join(options.source_path, "native/tags.yaml") + parsed_yaml = parse_native_yaml(native_yaml_path, tags_yaml_path) + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + + cpu_fm = make_file_manager(options=options) + gen_unboxing(native_functions=native_functions, cpu_fm=cpu_fm, selector=selector) + + if options.output_dependencies: + depfile_path = pathlib.Path(options.output_dependencies).resolve() + depfile_name = depfile_path.name + depfile_stem = depfile_path.stem + + path = depfile_path.parent / depfile_name + cpu_fm.write_outputs(depfile_stem, str(path)) + + +if __name__ == "__main__": + main() diff --git a/tools/linter/adapters/actionlint_linter.py b/tools/linter/adapters/actionlint_linter.py new file mode 100644 index 000000000000..bbc93954eda4 --- /dev/null +++ b/tools/linter/adapters/actionlint_linter.py @@ -0,0 +1,138 @@ +import argparse +import os +import re +import json +import logging +import subprocess +import time +from enum import Enum +from typing import List, NamedTuple, Optional, Pattern + + +LINTER_CODE = "ACTIONLINT" + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +RESULTS_RE: Pattern[str] = re.compile( + r"""(?mx) + ^ + (?P.*?): + (?P\d+): + (?P\d+): + \s(?P.*) + \s(?P\[.*\]) + $ + """ +) + + +def run_command( + args: List[str], +) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def check_files( + binary: str, + files: List[str], +) -> List[LintMessage]: + try: + proc = run_command([binary] + files) + except OSError as err: + return [ + LintMessage( + path=None, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), + ) + ] + stdout = str(proc.stdout, "utf-8").strip() + return [ + LintMessage( + path=match["file"], + name=match["code"], + description=match["message"], + line=int(match["line"]), + char=int(match["char"]), + code=LINTER_CODE, + severity=LintSeverity.ERROR, + original=None, + replacement=None, + ) + for match in RESULTS_RE.finditer(stdout) + ] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="actionlint runner", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "--binary", + required=True, + help="actionlint binary path", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + + args = 
parser.parse_args() + + if not os.path.exists(args.binary): + err_msg = LintMessage( + path="", + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=( + f"Could not find actionlint binary at {args.binary}," + " you may need to run `lintrunner init`." + ), + ) + print(json.dumps(err_msg._asdict()), flush=True) + exit(0) + + lint_messages = check_files(args.binary, args.filenames) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) diff --git a/tools/linter/adapters/black_linter.py b/tools/linter/adapters/black_linter.py new file mode 100644 index 000000000000..9d259fe096b8 --- /dev/null +++ b/tools/linter/adapters/black_linter.py @@ -0,0 +1,228 @@ +import argparse +import concurrent.futures +import json +import logging +import os +import subprocess +import sys +import time +from enum import Enum +from typing import Any, List, NamedTuple, Optional, BinaryIO + + +IS_WINDOWS: bool = os.name == "nt" + + +def eprint(*args: Any, **kwargs: Any) -> None: + print(*args, file=sys.stderr, flush=True, **kwargs) + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +def as_posix(name: str) -> str: + return name.replace("\\", "/") if IS_WINDOWS else name + + +def _run_command( + args: List[str], + *, + stdin: BinaryIO, + timeout: int, +) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run( + args, + stdin=stdin, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=IS_WINDOWS, # So batch scripts are found. + timeout=timeout, + check=True, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def run_command( + args: List[str], + *, + stdin: BinaryIO, + retries: int, + timeout: int, +) -> "subprocess.CompletedProcess[bytes]": + remaining_retries = retries + while True: + try: + return _run_command(args, stdin=stdin, timeout=timeout) + except subprocess.TimeoutExpired as err: + if remaining_retries == 0: + raise err + remaining_retries -= 1 + logging.warning( + "(%s/%s) Retrying because command failed with: %r", + retries - remaining_retries, + retries, + err, + ) + time.sleep(1) + + +def check_file( + filename: str, + retries: int, + timeout: int, +) -> List[LintMessage]: + try: + with open(filename, "rb") as f: + original = f.read() + with open(filename, "rb") as f: + proc = run_command( + [sys.executable, "-mblack", "--stdin-filename", filename, "-"], + stdin=f, + retries=retries, + timeout=timeout, + ) + except subprocess.TimeoutExpired: + return [ + LintMessage( + path=filename, + line=None, + char=None, + code="BLACK", + severity=LintSeverity.ERROR, + name="timeout", + original=None, + replacement=None, + description=( + "black timed out while trying to process a file. 
" + "Please report an issue in pytorch/pytorch with the " + "label 'module: lint'" + ), + ) + ] + except (OSError, subprocess.CalledProcessError) as err: + return [ + LintMessage( + path=filename, + line=None, + char=None, + code="BLACK", + severity=LintSeverity.ADVICE, + name="command-failed", + original=None, + replacement=None, + description=( + f"Failed due to {err.__class__.__name__}:\n{err}" + if not isinstance(err, subprocess.CalledProcessError) + else ( + "COMMAND (exit code {returncode})\n" + "{command}\n\n" + "STDERR\n{stderr}\n\n" + "STDOUT\n{stdout}" + ).format( + returncode=err.returncode, + command=" ".join(as_posix(x) for x in err.cmd), + stderr=err.stderr.decode("utf-8").strip() or "(empty)", + stdout=err.stdout.decode("utf-8").strip() or "(empty)", + ) + ), + ) + ] + + replacement = proc.stdout + if original == replacement: + return [] + + return [ + LintMessage( + path=filename, + line=None, + char=None, + code="BLACK", + severity=LintSeverity.WARNING, + name="format", + original=original.decode("utf-8"), + replacement=replacement.decode("utf-8"), + description="Run `lintrunner -a` to apply this patch.", + ) + ] + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Format files with black.", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "--retries", + default=3, + type=int, + help="times to retry timed out black", + ) + parser.add_argument( + "--timeout", + default=90, + type=int, + help="seconds to wait for black", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose logging", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + args = parser.parse_args() + + logging.basicConfig( + format="<%(threadName)s:%(levelname)s> %(message)s", + level=logging.NOTSET + if args.verbose + else logging.DEBUG + if len(args.filenames) < 1000 + else logging.INFO, + stream=sys.stderr, + ) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=os.cpu_count(), + thread_name_prefix="Thread", + ) as executor: + futures = { + executor.submit(check_file, x, args.retries, args.timeout): x + for x in args.filenames + } + for future in concurrent.futures.as_completed(futures): + try: + for lint_message in future.result(): + print(json.dumps(lint_message._asdict()), flush=True) + except Exception: + logging.critical('Failed at "%s".', futures[future]) + raise + + +if __name__ == "__main__": + main() diff --git a/tools/linter/adapters/circleci_linter.py b/tools/linter/adapters/circleci_linter.py index 4eb13228845c..8a76ed396f9f 100644 --- a/tools/linter/adapters/circleci_linter.py +++ b/tools/linter/adapters/circleci_linter.py @@ -51,7 +51,11 @@ def run_command(args: List[str], cwd: str) -> "subprocess.CompletedProcess[bytes start_time = time.monotonic() try: return subprocess.run( - args, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, + args, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, ) finally: end_time = time.monotonic() @@ -100,8 +104,8 @@ def run_check( return [ LintMessage( path=config_file, - line=1, - char=1, + line=None, + char=None, code="CIRCLECI", severity=LintSeverity.ERROR, name="config inconsistency", @@ -117,10 +121,13 @@ def run_check( if __name__ == "__main__": parser = argparse.ArgumentParser( - description="circleci consistency linter", fromfile_prefix_chars="@", + description="circleci consistency linter", + fromfile_prefix_chars="@", ) parser.add_argument( - "--config-yml", required=True, help="location of config.yml", + 
"--config-yml", + required=True, + help="location of config.yml", ) parser.add_argument( "--regen-script-working-dir", @@ -133,7 +140,9 @@ def run_check( help="location of the config generation script, relative to --regen-script-working-dir", ) parser.add_argument( - "--verbose", action="store_true", help="verbose logging", + "--verbose", + action="store_true", + help="verbose logging", ) args = parser.parse_args() diff --git a/tools/linter/adapters/clangformat_linter.py b/tools/linter/adapters/clangformat_linter.py index b4641306daf9..3445dee4e540 100644 --- a/tools/linter/adapters/clangformat_linter.py +++ b/tools/linter/adapters/clangformat_linter.py @@ -153,8 +153,8 @@ def check_file( return [ LintMessage( path=filename, - line=1, - char=1, + line=None, + char=None, code="CLANGFORMAT", severity=LintSeverity.WARNING, name="format", diff --git a/tools/linter/adapters/clangtidy_linter.py b/tools/linter/adapters/clangtidy_linter.py index a3a3bdd0143d..d7e19452df03 100644 --- a/tools/linter/adapters/clangtidy_linter.py +++ b/tools/linter/adapters/clangtidy_linter.py @@ -10,11 +10,22 @@ import time from enum import Enum from pathlib import Path +from sysconfig import get_paths as gp from typing import Any, List, NamedTuple, Optional, Pattern - +# PyTorch directory root +result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + stdout=subprocess.PIPE, + check=True, +) +PYTORCH_ROOT = result.stdout.decode("utf-8").strip() IS_WINDOWS: bool = os.name == "nt" +# Returns '/usr/local/include/python' +def get_python_include_dir() -> str: + return gp()["include"] + def eprint(*args: Any, **kwargs: Any) -> None: print(*args, file=sys.stderr, flush=True, **kwargs) @@ -75,12 +86,14 @@ def run_command( logging.debug("took %dms", (end_time - start_time) * 1000) -# Severity is either "error" or "note": https://git.io/JiLOP +# Severity is either "error" or "note": +# https://github.com/python/mypy/blob/8b47a032e1317fb8e3f9a818005a6b63e9bf0311/mypy/errors.py#L46-L47 severities = { "error": LintSeverity.ERROR, "warning": LintSeverity.WARNING, } + def clang_search_dirs() -> List[str]: # Compilers are ordered based on fallback preference # We pick the first one that is available on the system @@ -116,8 +129,13 @@ def clang_search_dirs() -> List[str]: return search_paths + include_args = [] -include_dir = ["/usr/lib/llvm-11/include/openmp"] + clang_search_dirs() +include_dir = [ + "/usr/lib/llvm-11/include/openmp", + get_python_include_dir(), + os.path.join(PYTORCH_ROOT, "third_party/pybind11/include"), +] + clang_search_dirs() for dir in include_dir: include_args += ["--extra-arg", f"-I{dir}"] @@ -142,9 +160,7 @@ def check_file( name="command-failed", original=None, replacement=None, - description=( - f"Failed due to {err.__class__.__name__}:\n{err}" - ), + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), ) ] lint_messages = [] @@ -190,8 +206,10 @@ def main() -> None: parser.add_argument( "--build_dir", required=True, - help=("Where the compile_commands.json file is located. " - "Gets passed to clang-tidy -p"), + help=( + "Where the compile_commands.json file is located. 
" + "Gets passed to clang-tidy -p" + ), ) parser.add_argument( "--verbose", diff --git a/tools/linter/adapters/exec_linter.py b/tools/linter/adapters/exec_linter.py index f263d11d5456..f00dc60afbb2 100644 --- a/tools/linter/adapters/exec_linter.py +++ b/tools/linter/adapters/exec_linter.py @@ -51,13 +51,17 @@ def check_file(filename: str) -> Optional[LintMessage]: if __name__ == "__main__": parser = argparse.ArgumentParser( - description="native functions linter", fromfile_prefix_chars="@", + description="exec linter", + fromfile_prefix_chars="@", ) parser.add_argument( - "--verbose", action="store_true", help="location of native_functions.yaml", + "--verbose", + action="store_true", ) parser.add_argument( - "filenames", nargs="+", help="paths to lint", + "filenames", + nargs="+", + help="paths to lint", ) args = parser.parse_args() diff --git a/tools/linter/adapters/flake8_linter.py b/tools/linter/adapters/flake8_linter.py index 50b257f41ff5..20274432566c 100644 --- a/tools/linter/adapters/flake8_linter.py +++ b/tools/linter/adapters/flake8_linter.py @@ -1,5 +1,4 @@ import argparse -import concurrent.futures import json import logging import os @@ -244,16 +243,15 @@ def get_issue_documentation_url(code: str) -> str: return "" -def check_file( - filename: str, - binary: str, +def check_files( + filenames: List[str], flake8_plugins_path: Optional[str], severities: Dict[str, LintSeverity], retries: int, ) -> List[LintMessage]: try: proc = run_command( - [binary, "--exit-zero", filename], + [sys.executable, "-mflake8", "--exit-zero"] + filenames, extra_env={"FLAKE8_PLUGINS_PATH": flake8_plugins_path} if flake8_plugins_path else None, @@ -262,7 +260,7 @@ def check_file( except (OSError, subprocess.CalledProcessError) as err: return [ LintMessage( - path=filename, + path=None, line=None, char=None, code="FLAKE8", @@ -314,11 +312,6 @@ def main() -> None: description="Flake8 wrapper linter.", fromfile_prefix_chars="@", ) - parser.add_argument( - "--binary", - required=True, - help="flake8 binary path", - ) parser.add_argument( "--flake8-plugins-path", help="FLAKE8_PLUGINS_PATH env value", @@ -369,28 +362,11 @@ def main() -> None: assert len(parts) == 2, f"invalid severity `{severity}`" severities[parts[0]] = LintSeverity(parts[1]) - with concurrent.futures.ThreadPoolExecutor( - max_workers=os.cpu_count(), - thread_name_prefix="Thread", - ) as executor: - futures = { - executor.submit( - check_file, - filename, - args.binary, - flake8_plugins_path, - severities, - args.retries, - ): filename - for filename in args.filenames - } - for future in concurrent.futures.as_completed(futures): - try: - for lint_message in future.result(): - print(json.dumps(lint_message._asdict()), flush=True) - except Exception: - logging.critical('Failed at "%s".', futures[future]) - raise + lint_messages = check_files( + args.filenames, flake8_plugins_path, severities, args.retries + ) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) if __name__ == "__main__": diff --git a/tools/linter/adapters/grep_linter.py b/tools/linter/adapters/grep_linter.py index d160c4d5dc21..61a81ad12dc3 100644 --- a/tools/linter/adapters/grep_linter.py +++ b/tools/linter/adapters/grep_linter.py @@ -43,11 +43,17 @@ def as_posix(name: str) -> str: return name.replace("\\", "/") if IS_WINDOWS else name -def run_command(args: List[str],) -> "subprocess.CompletedProcess[bytes]": +def run_command( + args: List[str], +) -> "subprocess.CompletedProcess[bytes]": logging.debug("$ %s", " ".join(args)) 
start_time = time.monotonic() try: - return subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,) + return subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) finally: end_time = time.monotonic() logging.debug("took %dms", (end_time - start_time) * 1000) @@ -116,13 +122,18 @@ def lint_file( def main() -> None: parser = argparse.ArgumentParser( - description="grep wrapper linter.", fromfile_prefix_chars="@", + description="grep wrapper linter.", + fromfile_prefix_chars="@", ) parser.add_argument( - "--pattern", required=True, help="pattern to grep for", + "--pattern", + required=True, + help="pattern to grep for", ) parser.add_argument( - "--linter-name", required=True, help="name of the linter", + "--linter-name", + required=True, + help="name of the linter", ) parser.add_argument( "--error-name", @@ -142,10 +153,14 @@ def main() -> None: ), ) parser.add_argument( - "--verbose", action="store_true", help="verbose logging", + "--verbose", + action="store_true", + help="verbose logging", ) parser.add_argument( - "filenames", nargs="+", help="paths to lint", + "filenames", + nargs="+", + help="paths to lint", ) args = parser.parse_args() @@ -160,7 +175,7 @@ def main() -> None: ) try: - proc = run_command(["grep", "-nPH", args.pattern, *args.filenames]) + proc = run_command(["grep", "-nEHI", args.pattern, *args.filenames]) except Exception as err: err_msg = LintMessage( path=None, diff --git a/tools/linter/adapters/mypy_linter.py b/tools/linter/adapters/mypy_linter.py index 687f8bf68066..65ee8850e667 100644 --- a/tools/linter/adapters/mypy_linter.py +++ b/tools/linter/adapters/mypy_linter.py @@ -1,5 +1,4 @@ import argparse -import concurrent.futures import json import logging import os @@ -8,6 +7,7 @@ import sys import time from enum import Enum +from pathlib import Path from typing import Any, Dict, List, NamedTuple, Optional, Pattern @@ -56,7 +56,6 @@ def as_posix(name: str) -> str: ) - def run_command( args: List[str], *, @@ -76,21 +75,22 @@ def run_command( logging.debug("took %dms", (end_time - start_time) * 1000) -# Severity is either "error" or "note": https://git.io/JiLOP +# Severity is either "error" or "note": +# https://github.com/python/mypy/blob/8b47a032e1317fb8e3f9a818005a6b63e9bf0311/mypy/errors.py#L46-L47 severities = { "error": LintSeverity.ERROR, "note": LintSeverity.ADVICE, } -def check_file( - filename: str, + +def check_files( + filenames: List[str], config: str, - binary: str, retries: int, ) -> List[LintMessage]: try: proc = run_command( - [binary, f"--config={config}", filename], + [sys.executable, "-mmypy", f"--config={config}"] + filenames, extra_env={}, retries=retries, ) @@ -105,9 +105,7 @@ def check_file( name="command-failed", original=None, replacement=None, - description=( - f"Failed due to {err.__class__.__name__}:\n{err}" - ), + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), ) ] stdout = str(proc.stdout, "utf-8").strip() @@ -134,11 +132,6 @@ def main() -> None: description="mypy wrapper linter.", fromfile_prefix_chars="@", ) - parser.add_argument( - "--binary", - required=True, - help="mypy binary path", - ) parser.add_argument( "--retries", default=3, @@ -172,27 +165,26 @@ def main() -> None: stream=sys.stderr, ) - with concurrent.futures.ThreadPoolExecutor( - max_workers=os.cpu_count(), - thread_name_prefix="Thread", - ) as executor: - futures = { - executor.submit( - check_file, - filename, - args.config, - args.binary, - args.retries, - ): filename - for filename in args.filenames - } - for 
future in concurrent.futures.as_completed(futures): - try: - for lint_message in future.result(): - print(json.dumps(lint_message._asdict()), flush=True) - except Exception: - logging.critical('Failed at "%s".', futures[future]) - raise + # Use a dictionary here to preserve order. mypy cares about order, + # tragically, e.g. https://github.com/python/mypy/issues/2015 + filenames: Dict[str, bool] = {} + + # If a stub file exists, have mypy check it instead of the original file, in + # accordance with PEP-484 (see https://www.python.org/dev/peps/pep-0484/#stub-files) + for filename in args.filenames: + if filename.endswith(".pyi"): + filenames[filename] = True + continue + + stub_filename = filename.replace(".py", ".pyi") + if Path(stub_filename).exists(): + filenames[stub_filename] = True + else: + filenames[filename] = True + + lint_messages = check_files(list(filenames), args.config, args.retries) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) if __name__ == "__main__": diff --git a/tools/linter/adapters/nativefunctions_linter.py b/tools/linter/adapters/nativefunctions_linter.py index dd6e3b03aab1..28065f2b7af4 100644 --- a/tools/linter/adapters/nativefunctions_linter.py +++ b/tools/linter/adapters/nativefunctions_linter.py @@ -44,7 +44,8 @@ class LintMessage(NamedTuple): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="native functions linter", fromfile_prefix_chars="@", + description="native functions linter", + fromfile_prefix_chars="@", ) parser.add_argument( "--native-functions-yml", @@ -89,8 +90,8 @@ class LintMessage(NamedTuple): if contents != new_contents: msg = LintMessage( path=args.native_functions_yml, - line=1, - char=1, + line=None, + char=None, code="NATIVEFUNCTIONS", severity=LintSeverity.ERROR, name="roundtrip inconsistency", diff --git a/tools/linter/adapters/newlines_linter.py b/tools/linter/adapters/newlines_linter.py index 5ce5edca670a..f51254ad496a 100644 --- a/tools/linter/adapters/newlines_linter.py +++ b/tools/linter/adapters/newlines_linter.py @@ -67,7 +67,7 @@ def check_file(filename: str) -> Optional[LintMessage]: name="testestTrailing newline", original=None, replacement=None, - description="Trailing newline found. Run `lintunner --take NEWLINE -a` to apply changes.", + description="Trailing newline found. Run `lintrunner --take NEWLINE -a` to apply changes.", ) else: @@ -103,19 +103,24 @@ def check_file(filename: str) -> Optional[LintMessage]: name="Trailing newline", original=original, replacement=original.rstrip("\n") + "\n", - description="Trailing newline found. Run `lintunner --take NEWLINE -a` to apply changes.", + description="Trailing newline found. 
Run `lintrunner --take NEWLINE -a` to apply changes.", ) if __name__ == "__main__": parser = argparse.ArgumentParser( - description="native functions linter", fromfile_prefix_chars="@", + description="native functions linter", + fromfile_prefix_chars="@", ) parser.add_argument( - "--verbose", action="store_true", help="location of native_functions.yaml", + "--verbose", + action="store_true", + help="location of native_functions.yaml", ) parser.add_argument( - "filenames", nargs="+", help="paths to lint", + "filenames", + nargs="+", + help="paths to lint", ) args = parser.parse_args() diff --git a/tools/linter/adapters/pip_init.py b/tools/linter/adapters/pip_init.py index b4451beac644..db1f69d26b22 100644 --- a/tools/linter/adapters/pip_init.py +++ b/tools/linter/adapters/pip_init.py @@ -1,6 +1,7 @@ """ Initializer script that installs stuff to pip. """ +import os import argparse import logging import subprocess @@ -23,12 +24,18 @@ def run_command(args: List[str]) -> "subprocess.CompletedProcess[bytes]": if __name__ == "__main__": parser = argparse.ArgumentParser(description="pip initializer") parser.add_argument( - "packages", nargs="+", help="pip packages to install", + "packages", + nargs="+", + help="pip packages to install", ) parser.add_argument( - "--verbose", action="store_true", help="verbose logging", + "--verbose", + action="store_true", + help="verbose logging", + ) + parser.add_argument( + "--dry-run", help="do not install anything, just print what would be done." ) - parser.add_argument("--dry-run", help="do not install anything, just print what would be done.") args = parser.parse_args() @@ -45,7 +52,19 @@ def run_command(args: List[str]) -> "subprocess.CompletedProcess[bytes]": "Package {package_name} did not have a version specified. " "Please specify a version to product a consistent linting experience." ) - pip_args = ["pip3", "install", "--user"] + pip_args = ["pip3", "install"] + + # If we are in a global install, use `--user` to install so that you do not + # need root access in order to initialize linters. + # + # However, `pip install --user` interacts poorly with virtualenvs (see: + # https://bit.ly/3vD4kvl) and conda (see: https://bit.ly/3KG7ZfU). So in + # these cases perform a regular installation. 
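# (Illustrative aside, not part of the diff.) The same "are we in an isolated
# environment?" decision could also be made without environment variables,
# since venv/virtualenv leave sys.prefix different from sys.base_prefix; the
# CONDA_PREFIX/VIRTUAL_ENV check below is what the diff actually uses.
import os
import sys

def in_isolated_env() -> bool:
    in_conda = os.environ.get("CONDA_PREFIX") is not None
    in_venv = sys.prefix != sys.base_prefix
    return in_conda or in_venv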
+ in_conda = os.environ.get("CONDA_PREFIX") is not None + in_virtualenv = os.environ.get("VIRTUAL_ENV") is not None + if not in_conda and not in_virtualenv: + pip_args.append("--user") + pip_args.extend(args.packages) dry_run = args.dry_run == "1" diff --git a/tools/linter/adapters/s3_init.py b/tools/linter/adapters/s3_init.py index f2bc9339776d..65fcef4bc291 100644 --- a/tools/linter/adapters/s3_init.py +++ b/tools/linter/adapters/s3_init.py @@ -16,12 +16,19 @@ HOST_PLATFORM = platform.system() # PyTorch directory root -result = subprocess.run( - ["git", "rev-parse", "--show-toplevel"], - stdout=subprocess.PIPE, - check=True, -) -PYTORCH_ROOT = result.stdout.decode("utf-8").strip() +try: + result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + stdout=subprocess.PIPE, + check=True, + ) + PYTORCH_ROOT = result.stdout.decode("utf-8").strip() +except subprocess.CalledProcessError: + # If git is not installed, compute repo root as 3 folders up from this file + path_ = os.path.abspath(__file__) + for _ in range(4): + path_ = os.path.dirname(path_) + PYTORCH_ROOT = path_ DRY_RUN = False diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json index 0f3619ad0fff..736ab6addb84 100644 --- a/tools/linter/adapters/s3_init_config.json +++ b/tools/linter/adapters/s3_init_config.json @@ -2,29 +2,31 @@ "clang-format": { "Darwin": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/mac/clang-format-mojave", - "hash": "1485a242a96c737ba7cdd9f259114f2201accdb46d87ac7a8650b1a814cd4d4d", - "object_name": "mac/clang-format-mojave", - "s3_bucket": "oss-clang-format" + "hash": "1485a242a96c737ba7cdd9f259114f2201accdb46d87ac7a8650b1a814cd4d4d" }, "Linux": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64", - "hash": "e1c8b97b919541a99e0a355df5c3f9e8abebc64259dbee6f8c68e1ef90582856", - "object_name": "linux64/clang-format-linux64", - "s3_bucket": "oss-clang-format" + "hash": "e1c8b97b919541a99e0a355df5c3f9e8abebc64259dbee6f8c68e1ef90582856" } }, "clang-tidy": { "Darwin": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/macos/clang-tidy", - "hash": "541797a7b8fa795e2f3c1adcd8236cc336a40aa927028dc5bc79172e1d9eca36", - "object_name": "macos/clang-tidy", - "s3_bucket": "oss-clang-format" + "hash": "541797a7b8fa795e2f3c1adcd8236cc336a40aa927028dc5bc79172e1d9eca36" }, "Linux": { "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-tidy", - "hash": "49343a448fcb75cd1e0fb9d6b1f6c2ef4b008b6f91d6ff899d4ac6060f5e52a5", - "object_name": "linx64/clang-tidy", - "s3_bucket": "oss-clang-format" + "hash": "49343a448fcb75cd1e0fb9d6b1f6c2ef4b008b6f91d6ff899d4ac6060f5e52a5" + } + }, + "actionlint": { + "Darwin": { + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/macos/actionlint", + "hash": "3ce2c94280c540e20b270acae60bdd9e72ad17d6cb35b688951b1ec1eb8cbdd6" + }, + "Linux": { + "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/actionlint", + "hash": "693f464106474760f0edf4a1778215095eacc4bd5f79aab5dc950892f120828b" } } } diff --git a/tools/linter/adapters/shellcheck_linter.py b/tools/linter/adapters/shellcheck_linter.py new file mode 100644 index 000000000000..d94c5a1ce047 --- /dev/null +++ b/tools/linter/adapters/shellcheck_linter.py @@ -0,0 +1,118 @@ +import argparse +import json +import logging +import subprocess +import time +import shutil +from enum import Enum +from typing import List, NamedTuple, Optional 
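# (Illustrative aside, not part of the diff.) The --format=json1 payload that
# check_files() below parses is one JSON object with a "comments" list,
# roughly of this shape; the concrete values here are made up.
EXAMPLE_JSON1_PAYLOAD = {
    "comments": [
        {
            "file": "tools/example.sh",
            "line": 3,
            "column": 8,
            "level": "warning",
            "code": 2086,
            "message": "Double quote to prevent globbing and word splitting.",
        }
    ]
}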
+ + +LINTER_CODE = "SHELLCHECK" + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +def run_command( + args: List[str], +) -> "subprocess.CompletedProcess[bytes]": + logging.debug("$ %s", " ".join(args)) + start_time = time.monotonic() + try: + return subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + finally: + end_time = time.monotonic() + logging.debug("took %dms", (end_time - start_time) * 1000) + + +def check_files( + files: List[str], +) -> List[LintMessage]: + try: + proc = run_command( + ["shellcheck", "--external-sources", "--format=json1"] + files + ) + except OSError as err: + return [ + LintMessage( + path=None, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description=(f"Failed due to {err.__class__.__name__}:\n{err}"), + ) + ] + stdout = str(proc.stdout, "utf-8").strip() + results = json.loads(stdout)["comments"] + return [ + LintMessage( + path=result["file"], + name=f"SC{result['code']}", + description=result["message"], + line=result["line"], + char=result["column"], + code=LINTER_CODE, + severity=LintSeverity.ERROR, + original=None, + replacement=None, + ) + for result in results + ] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="shellcheck runner", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + + if shutil.which("shellcheck") is None: + err_msg = LintMessage( + path="", + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="command-failed", + original=None, + replacement=None, + description="shellcheck is not installed, did you forget to run `lintrunner init`?", + ) + print(json.dumps(err_msg._asdict()), flush=True) + exit(0) + + args = parser.parse_args() + + lint_messages = check_files(args.filenames) + for lint_message in lint_messages: + print(json.dumps(lint_message._asdict()), flush=True) diff --git a/tools/linter/adapters/testowners_linter.py b/tools/linter/adapters/testowners_linter.py new file mode 100755 index 000000000000..b65cfde4d79d --- /dev/null +++ b/tools/linter/adapters/testowners_linter.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +Test ownership was introduced in https://github.com/pytorch/pytorch/issues/66232. + +This lint verifies that every Python test file (file that matches test_*.py or *_test.py in the test folder) +has valid ownership information in a comment header. 
Valid means: + - The format of the header follows the pattern "# Owner(s): ["list", "of owner", "labels"] + - Each owner label actually exists in PyTorch + - Each owner label starts with "module: " or "oncall: " or is in ACCEPTABLE_OWNER_LABELS +""" +import json +import argparse +from enum import Enum +from typing import List, Any, Optional, NamedTuple +from urllib.request import urlopen + + +LINTER_CODE = "TESTOWNERS" + + +class LintSeverity(str, Enum): + ERROR = "error" + WARNING = "warning" + ADVICE = "advice" + DISABLED = "disabled" + + +class LintMessage(NamedTuple): + path: Optional[str] + line: Optional[int] + char: Optional[int] + code: str + severity: LintSeverity + name: str + original: Optional[str] + replacement: Optional[str] + description: Optional[str] + + +# Team/owner labels usually start with "module: " or "oncall: ", but the following are acceptable exceptions +ACCEPTABLE_OWNER_LABELS = ["NNC", "high priority"] +OWNERS_PREFIX = "# Owner(s): " + + +def get_pytorch_labels() -> Any: + labels = ( + urlopen("https://ossci-metrics.s3.amazonaws.com/pytorch_labels.json") + .read() + .decode("utf-8") + ) + return json.loads(labels) + + +PYTORCH_LABELS = get_pytorch_labels() +# Team/owner labels usually start with "module: " or "oncall: ", but the following are acceptable exceptions +ACCEPTABLE_OWNER_LABELS = ["NNC", "high priority"] +GLOB_EXCEPTIONS = ["**/test/run_test.py"] + + +def check_labels( + labels: List[str], filename: str, line_number: int +) -> List[LintMessage]: + lint_messages = [] + for label in labels: + if label not in PYTORCH_LABELS: + lint_messages.append( + LintMessage( + path=filename, + line=line_number, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="[invalid-label]", + original=None, + replacement=None, + description=( + f"{label} is not a PyTorch label " + "(please choose from https://github.com/pytorch/pytorch/labels)" + ), + ) + ) + + if ( + label.startswith("module:") + or label.startswith("oncall:") + or label in ACCEPTABLE_OWNER_LABELS + ): + continue + + lint_messages.append( + LintMessage( + path=filename, + line=line_number, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="[invalid-owner]", + original=None, + replacement=None, + description=( + f"{label} is not an acceptable owner " + "(please update to another label or edit ACCEPTABLE_OWNERS_LABELS " + "in tools/linters/adapters/testowners_linter.py" + ), + ) + ) + + return lint_messages + + +def check_file(filename: str) -> List[LintMessage]: + lint_messages = [] + has_ownership_info = False + + with open(filename) as f: + for idx, line in enumerate(f): + if not line.startswith(OWNERS_PREFIX): + continue + + has_ownership_info = True + labels = json.loads(line[len(OWNERS_PREFIX) :]) + lint_messages.extend(check_labels(labels, filename, idx + 1)) + + if has_ownership_info is False: + lint_messages.append( + LintMessage( + path=filename, + line=None, + char=None, + code=LINTER_CODE, + severity=LintSeverity.ERROR, + name="[no-owner-info]", + original=None, + replacement=None, + description="Missing a comment header with ownership information.", + ) + ) + + return lint_messages + + +def main() -> None: + parser = argparse.ArgumentParser( + description="test ownership linter", + fromfile_prefix_chars="@", + ) + parser.add_argument( + "filenames", + nargs="+", + help="paths to lint", + ) + + args = parser.parse_args() + lint_messages = [] + + for filename in args.filenames: + lint_messages.extend(check_file(filename)) + + for lint_message in lint_messages: 
+ print(json.dumps(lint_message._asdict()), flush=True) + + +if __name__ == "__main__": + main() diff --git a/tools/linter/clang_format_all.py b/tools/linter/clang_format_all.py deleted file mode 100755 index 7792f15a77d1..000000000000 --- a/tools/linter/clang_format_all.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -""" -A script that runs clang-format on all C/C++ files in CLANG_FORMAT_ALLOWLIST. There is -also a diff mode which simply checks if clang-format would make any changes, which is useful for -CI purposes. - -If clang-format is not available, the script also downloads a platform-appropriate binary from -and S3 bucket and verifies it against a precommited set of blessed binary hashes. -""" -import argparse -import asyncio -import re -import os -import sys -from typing import List, Set - -from .clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH - -# Allowlist of directories to check. All files that in that directory -# (recursively) will be checked. -# If you edit this, please edit the allowlist in clang_format_ci.sh as well. -CLANG_FORMAT_ALLOWLIST = [ - "c10/", - "torch/csrc/jit/", - "test/cpp/jit/", - "test/cpp/tensorexpr/" -] - -# Only files with names matching this regex will be formatted. -CPP_FILE_REGEX = re.compile(".*\\.(h|cpp|cc|c|hpp)$") - - -def get_allowlisted_files() -> Set[str]: - """ - Parse CLANG_FORMAT_ALLOWLIST and resolve all directories. - Returns the set of allowlist cpp source files. - """ - matches = [] - for dir in CLANG_FORMAT_ALLOWLIST: - for root, dirnames, filenames in os.walk(dir): - for filename in filenames: - if CPP_FILE_REGEX.match(filename): - matches.append(os.path.join(root, filename)) - return set(matches) - - -async def run_clang_format_on_file( - filename: str, - semaphore: asyncio.Semaphore, - verbose: bool = False, -) -> None: - """ - Run clang-format on the provided file. - """ - # -style=file picks up the closest .clang-format, -i formats the files inplace. - cmd = "{} -style=file -i {}".format(CLANG_FORMAT_PATH, filename) - async with semaphore: - proc = await asyncio.create_subprocess_shell(cmd) - _ = await proc.wait() - if verbose: - print("Formatted {}".format(filename)) - - -async def file_clang_formatted_correctly( - filename: str, - semaphore: asyncio.Semaphore, - verbose: bool = False, -) -> bool: - """ - Checks if a file is formatted correctly and returns True if so. - """ - ok = True - # -style=file picks up the closest .clang-format - cmd = "{} -style=file {}".format(CLANG_FORMAT_PATH, filename) - - async with semaphore: - proc = await asyncio.create_subprocess_shell(cmd, stdout=asyncio.subprocess.PIPE) - # Read back the formatted file. - stdout, _ = await proc.communicate() - - formatted_contents = stdout.decode() - # Compare the formatted file to the original file. - with open(filename) as orig: - orig_contents = orig.read() - if formatted_contents != orig_contents: - ok = False - if verbose: - print("{} is not formatted correctly".format(filename)) - - return ok - - -async def run_clang_format( - max_processes: int, - diff: bool = False, - verbose: bool = False, -) -> bool: - """ - Run clang-format to all files in CLANG_FORMAT_ALLOWLIST that match CPP_FILE_REGEX. - """ - # Check to make sure the clang-format binary exists. - if not os.path.exists(CLANG_FORMAT_PATH): - print("clang-format binary not found") - return False - - # Gather command-line options for clang-format. 
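# (Illustrative aside, not part of the diff.) The header that the testowners
# linter added above looks for is a single comment line whose suffix is JSON;
# the labels used here are hypothetical. A minimal sketch of the parsing step:
import json

OWNERS_PREFIX = "# Owner(s): "
header = '# Owner(s): ["module: nn", "oncall: jit"]'
if header.startswith(OWNERS_PREFIX):
    labels = json.loads(header[len(OWNERS_PREFIX):])
    assert labels == ["module: nn", "oncall: jit"]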
- args = [CLANG_FORMAT_PATH, "-style=file"] - - if not diff: - args.append("-i") - - ok = True - - # Semaphore to bound the number of subprocesses that can be created at once to format files. - semaphore = asyncio.Semaphore(max_processes) - - # Format files in parallel. - if diff: - for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_allowlisted_files()]): - ok &= await f - - if ok: - print("All files formatted correctly") - else: - print("Some files not formatted correctly") - else: - await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_allowlisted_files()]) - - return ok - -def parse_args(args: List[str]) -> argparse.Namespace: - """ - Parse and return command-line arguments. - """ - parser = argparse.ArgumentParser( - description="Execute clang-format on your working copy changes." - ) - parser.add_argument( - "-d", - "--diff", - action="store_true", - default=False, - help="Determine whether running clang-format would produce changes", - ) - parser.add_argument("--verbose", "-v", action="store_true", default=False) - parser.add_argument("--max-processes", type=int, default=50, - help="Maximum number of subprocesses to create to format files in parallel") - return parser.parse_args(args) - - -def main(args: List[str]) -> bool: - # Parse arguments. - options = parse_args(args) - # Get clang-format and make sure it is the right binary and it is in the right place. - ok = get_and_check_clang_format(options.verbose) - # Invoke clang-format on all files in the directories in the allowlist. - if ok: - loop = asyncio.get_event_loop() - ok = loop.run_until_complete(run_clang_format(options.max_processes, options.diff, options.verbose)) - - # We have to invert because False -> 0, which is the code to be returned if everything is okay. - return not ok - - -if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) diff --git a/tools/linter/clang_format_ci.sh b/tools/linter/clang_format_ci.sh deleted file mode 100755 index 6f5220e516d1..000000000000 --- a/tools/linter/clang_format_ci.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/sh -set -eux - -# Runs clang-format on allowlisted files. -# Requires a single argument, which is the argument to git-clang-format - -# If you edit this allowlist, please edit the one in clang_format_all.py as well -find . -type f \ - -path './c10/*' -or \ - -path './torch/csrc/jit/*' -or \ - -path './test/cpp/jit/*' -or \ - -path './test/cpp/tensorexpr/*' \ - | xargs tools/linter/git-clang-format --verbose "$1" -- diff --git a/tools/linter/clang_format_utils.py b/tools/linter/clang_format_utils.py deleted file mode 100644 index 021ba9162cca..000000000000 --- a/tools/linter/clang_format_utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -from install.download_bin import download, PYTORCH_ROOT # type: ignore[import] - -# This dictionary maps each platform to the S3 object URL for its clang-format binary. -PLATFORM_TO_CF_URL = { - "Darwin": "https://oss-clang-format.s3.us-east-2.amazonaws.com/mac/clang-format-mojave", - "Linux": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64", -} - -# This dictionary maps each platform to a relative path to a file containing its reference hash. 
-PLATFORM_TO_HASH = { - "Darwin": os.path.join("tools", "clang_format_hash", "mac", "clang-format-mojave"), - "Linux": os.path.join("tools", "clang_format_hash", "linux64", "clang-format-linux64"), -} - -CLANG_FORMAT_DIR = os.path.join(PYTORCH_ROOT, ".clang-format-bin") -CLANG_FORMAT_PATH = os.path.join(CLANG_FORMAT_DIR, "clang-format") - -def get_and_check_clang_format(verbose: bool = False) -> bool: - return bool(download("clang-format", CLANG_FORMAT_DIR, PLATFORM_TO_CF_URL, PLATFORM_TO_HASH)) diff --git a/tools/linter/clang_tidy/__main__.py b/tools/linter/clang_tidy/__main__.py deleted file mode 100644 index fa6403a64bb6..000000000000 --- a/tools/linter/clang_tidy/__main__.py +++ /dev/null @@ -1,210 +0,0 @@ -import argparse -import pathlib -import os -import shutil -import subprocess -import re -import sys -from typing import List - - -from tools.linter.clang_tidy.run import run -from tools.linter.clang_tidy.generate_build_files import generate_build_files -from tools.linter.install.clang_tidy import INSTALLATION_PATH -from tools.linter.install.download_bin import PYTORCH_ROOT - - -def clang_search_dirs() -> List[str]: - # Compilers are ordered based on fallback preference - # We pick the first one that is available on the system - compilers = ["clang", "gcc", "cpp", "cc"] - compilers = [c for c in compilers if shutil.which(c) is not None] - if len(compilers) == 0: - raise RuntimeError(f"None of {compilers} were found") - compiler = compilers[0] - - result = subprocess.run( - [compiler, "-E", "-x", "c++", "-", "-v"], - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - check=True, - ) - stderr = result.stderr.decode().strip().split("\n") - search_start = r"#include.*search starts here:" - search_end = r"End of search list." - - append_path = False - search_paths = [] - for line in stderr: - if re.match(search_start, line): - if append_path: - continue - else: - append_path = True - elif re.match(search_end, line): - break - elif append_path: - search_paths.append(line.strip()) - - # There are source files include , etc. - # under torch/csrc/api/include folder. Since torch/csrc/api/include is not - # a search path for clang-tidy, there will be clang-disagnostic errors - # complaing those header files not found. Change the source code to include - # full path like torch/csrc/api/include/torch/torch.h does not work well - # since torch/torch.h includes torch/all.h which inturn includes more. - # We would need recursively change mutliple files. - # Adding the include path to the lint script should be a better solution. - search_paths.append( - os.path.join(PYTORCH_ROOT, "torch/csrc/api/include"), - ) - return search_paths - - -DEFAULTS = { - "glob": [ - # The negative filters below are to exclude files that include onnx_pb.h or - # caffe2_pb.h, otherwise we'd have to build protos as part of this CI job. - # FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed - # in a follow up PR. - # /torch/csrc/generic/*.cpp is excluded because those files aren't actually built. 
- # deploy/interpreter files are excluded due to using macros and other techniquies - # that are not easily converted to accepted c++ - "-torch/csrc/jit/passes/onnx/helper.cpp", - "-torch/csrc/jit/passes/onnx/shape_type_inference.cpp", - "-torch/csrc/jit/serialization/onnx.cpp", - "-torch/csrc/jit/serialization/export.cpp", - "-torch/csrc/jit/serialization/import.cpp", - "-torch/csrc/jit/serialization/import_legacy.cpp", - "-torch/csrc/onnx/init.cpp", - "-torch/csrc/cuda/nccl.*", - "-torch/csrc/cuda/python_nccl.cpp", - "-torch/csrc/autograd/FunctionsManual.cpp", - "-torch/csrc/generic/*.cpp", - "-torch/csrc/jit/codegen/cuda/runtime/*", - "-torch/csrc/deploy/interactive_embedded_interpreter.cpp", - "-torch/csrc/deploy/interpreter/interpreter.cpp", - "-torch/csrc/deploy/interpreter/interpreter.h", - "-torch/csrc/deploy/interpreter/interpreter_impl.h", - "-torch/csrc/deploy/interpreter/test_main.cpp", - "-torch/csrc/deploy/test_deploy_python_ext.cpp", - ], - "paths": ["torch/csrc/"], - "include-dir": ["/usr/lib/llvm-11/include/openmp"] + clang_search_dirs(), - "clang-tidy-exe": INSTALLATION_PATH, - "compile-commands-dir": "build", - "config-file": ".clang-tidy", - "disable-progress-bar": False, -} - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="clang-tidy wrapper script") - parser.add_argument( - "-e", - "--clang-tidy-exe", - default=DEFAULTS["clang-tidy-exe"], - help="Path to clang-tidy executable", - ) - parser.add_argument( - "-g", - "--glob", - action="append", - default=DEFAULTS["glob"], - help="Only lint files that match these glob patterns " - "(see documentation for `fnmatch` for supported syntax)." - "If a pattern starts with a - the search is negated for that pattern.", - ) - parser.add_argument( - "-x", - "--regex", - action="append", - default=[], - help="Only lint files that match these regular expressions (from the start of the filename). " - "If a pattern starts with a - the search is negated for that pattern.", - ) - parser.add_argument( - "-c", - "--compile-commands-dir", - default=DEFAULTS["compile-commands-dir"], - help="Path to the folder containing compile_commands.json", - ) - parser.add_argument( - "--diff-file", - help="File containing diff to use for determining files to lint and line filters", - ) - parser.add_argument( - "-p", - "--paths", - nargs="+", - default=DEFAULTS["paths"], - help="Lint only the given paths (recursively)", - ) - parser.add_argument( - "-n", - "--dry-run", - action="store_true", - help="Only show the command to be executed, without running it", - ) - parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") - parser.add_argument("-q", "--quiet", action="store_true", help="Don't print output") - parser.add_argument( - "--config-file", - default=DEFAULTS["config-file"], - help="Path to a clang-tidy config file. 
Defaults to '.clang-tidy'.", - ) - parser.add_argument( - "--print-include-paths", - action="store_true", - help="Print the search paths used for include directives", - ) - parser.add_argument( - "-I", - "--include-dir", - action="append", - default=DEFAULTS["include-dir"], - help="Add the specified directory to the search path for include files", - ) - parser.add_argument( - "-s", - "--suppress-diagnostics", - action="store_true", - help="Add NOLINT to suppress clang-tidy violations", - ) - parser.add_argument( - "--disable-progress-bar", - action="store_true", - default=DEFAULTS["disable-progress-bar"], - help="Disable the progress bar", - ) - parser.add_argument( - "extra_args", nargs="*", help="Extra arguments to forward to clang-tidy" - ) - return parser.parse_args() - - -def main() -> None: - options = parse_args() - - if not pathlib.Path("build").exists(): - generate_build_files() - - # Check if clang-tidy executable exists - exists = os.access(options.clang_tidy_exe, os.X_OK) - - if not exists: - msg = ( - f"Could not find '{options.clang_tidy_exe}'\n" - + "We provide a custom build of clang-tidy that has additional checks.\n" - + "You can install it by running:\n" - + "$ python3 -m tools.linter.install.clang_tidy \n" - + "from the pytorch folder" - ) - raise RuntimeError(msg) - - result, _ = run(options) - sys.exit(result.returncode) - - -if __name__ == "__main__": - main() diff --git a/tools/linter/clang_tidy/generate_build_files.py b/tools/linter/clang_tidy/generate_build_files.py index 9e3db664ab0d..fff8bf492e0f 100644 --- a/tools/linter/clang_tidy/generate_build_files.py +++ b/tools/linter/clang_tidy/generate_build_files.py @@ -6,8 +6,15 @@ def run_cmd(cmd: List[str]) -> None: print(f"Running: {cmd}") - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,) - stdout, stderr = result.stdout.decode("utf-8").strip(), result.stderr.decode("utf-8").strip() + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = ( + result.stdout.decode("utf-8").strip(), + result.stderr.decode("utf-8").strip(), + ) print(stdout) print(stderr) if result.returncode != 0: @@ -36,7 +43,7 @@ def run_autogen() -> None: [ sys.executable, "-m", - "tools.codegen.gen", + "torchgen.gen", "-s", "aten/src/ATen", "-d", @@ -51,8 +58,9 @@ def run_autogen() -> None: "tools/setup_helpers/generate_code.py", "--native-functions-path", "aten/src/ATen/native/native_functions.yaml", - "--nn-path", - "aten/src", + "--tags-path", + "aten/src/ATen/native/tags.yaml", + "--gen_lazy_ts_backend", ] ) diff --git a/tools/linter/clang_tidy/max_tokens_pragma.py b/tools/linter/clang_tidy/max_tokens_pragma.py deleted file mode 100644 index 4f7b152659f7..000000000000 --- a/tools/linter/clang_tidy/max_tokens_pragma.py +++ /dev/null @@ -1,111 +0,0 @@ -import argparse -import re -from typing import List - - -# > Why is DEFAULT_MAX_TOKEN_COUNT set to 1? -# -# clang-tidy doesn't have a direct way to query for token counts in the -# codebase. The workaround is to set the max token count to 1. This will cause -# clang-tidy to output a warning with the actual token count of the file. -# -# A non-destructive way to set the max token count to 1 would be to pass it -# through the -fmax-tokens option. However, this flag will be overridden if here -# exists a #pragma max_tokens_total statement in the file. This necessitates a -# destructive way to set the max token count to 1. 
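To make the transformation concrete, here is a minimal illustrative sketch of how the helpers defined below behave, assuming the module is still importable as tools.linter.clang_tidy.max_tokens_pragma and that the C++ source string is purely hypothetical:

from tools.linter.clang_tidy.max_tokens_pragma import (
    add_max_tokens_pragma,
    strip_max_tokens_pragmas,
)

source = "#include <vector>\nint main() { return 0; }"

# Prepend "#pragma clang max_tokens_total 1" so clang-tidy's max-tokens check
# reports the actual token count of the file as a warning.
tagged = add_max_tokens_pragma(source, 1)
assert tagged.splitlines()[0] == "#pragma clang max_tokens_total 1"

# Remove the pragma again once the token counts have been collected.
assert strip_max_tokens_pragmas(tagged) == source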
-DEFAULT_MAX_TOKEN_COUNT = 1 -MAX_TOKENS_CHECK_DIAG_NAME = "misc-max-tokens" -MAX_TOKENS_PRAGMA_PATTERN = r"^#pragma\s+clang\s+max_tokens_total\s+(\d+)$" - - -def add_max_tokens_pragma(code: str, num_max_tokens: int) -> str: - lines = code.splitlines() - - found_pragma = False - pragma = f"#pragma clang max_tokens_total {num_max_tokens}" - - for idx, line in enumerate(lines): - match = re.match(MAX_TOKENS_PRAGMA_PATTERN, line.strip()) - if match: - found_pragma = True - token_count = match.group(1) - if int(token_count) != num_max_tokens: - lines[idx] = pragma - - if not found_pragma: - lines = [pragma] + lines - - return "\n".join(lines) - - -def strip_max_tokens_pragmas(code: str) -> str: - lines = code.splitlines() - lines = [ - line - for line in lines - if re.match(MAX_TOKENS_PRAGMA_PATTERN, line.strip()) is None - ] - return "\n".join(lines) - - -def add_max_tokens_pragma_to_files(files: List[str], num_max_tokens: int) -> None: - for filename in files: - with open(filename, "r+") as f: - data = f.read() - data = add_max_tokens_pragma(data, num_max_tokens) - - f.seek(0) - f.write(data) - f.truncate() - - -def strip_max_tokens_pragma_from_files(files: List[str]) -> None: - for filename in files: - with open(filename, "r+") as f: - data = f.read() - data = strip_max_tokens_pragmas(data) - - f.seek(0) - f.write(data) - f.truncate() - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Add max_tokens_total pragmas to C/C++ source files" - ) - parser.add_argument( - "-n", - "--num-max-tokens", - default=DEFAULT_MAX_TOKEN_COUNT, - help="Set the token count to this value", - type=int, - ) - parser.add_argument( - "files", nargs="+", help="Add max_tokens_total pragmas to the specified files" - ) - parser.add_argument( - "-i", "--ignore", nargs="+", default=[], help="Ignore the specified files" - ) - parser.add_argument( - "-s", - "--strip", - action="store_true", - help="Remove max_tokens_total pragmas from the input files", - ) - return parser.parse_args() - - -def main() -> None: - options = parse_args() - - ignored = set(options.ignore) - files = [filename for filename in options.files if filename not in ignored] - if options.strip: - strip_max_tokens_pragma_from_files(files) - else: - add_max_tokens_pragma_to_files(files, options.num_max_tokens) - - -if __name__ == "__main__": - main() diff --git a/tools/linter/clang_tidy/requirements.txt b/tools/linter/clang_tidy/requirements.txt deleted file mode 100644 index faea93fd550a..000000000000 --- a/tools/linter/clang_tidy/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unidiff==0.6.0 diff --git a/tools/linter/clang_tidy/run.py b/tools/linter/clang_tidy/run.py deleted file mode 100644 index 9e71333475fc..000000000000 --- a/tools/linter/clang_tidy/run.py +++ /dev/null @@ -1,516 +0,0 @@ -#!/usr/bin/env python3 -""" -A driver script to run clang-tidy on changes detected via git. - -By default, clang-tidy runs on all files you point it at. This means that even -if you changed only parts of that file, you will get warnings for the whole -file. This script has the ability to ask git for the exact lines that have -changed since a particular git revision, and makes clang-tidy only lint those. -This makes it much less overhead to integrate in CI and much more relevant to -developers. This git-enabled mode is optional, and full scans of a directory -tree are also possible. In both cases, the script allows filtering files via -glob or regular expressions. 
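A typical invocation of the wrapper around this driver (hypothetical paths shown, assuming the package entry point tools.linter.clang_tidy) might look like:

    python3 -m tools.linter.clang_tidy \
        --paths torch/csrc \
        --diff-file pr.diff \
        --compile-commands-dir build

which lints only the changed lines recorded in pr.diff that fall under torch/csrc, using the compilation database in the build directory.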
-""" - - -import collections -import fnmatch -import json -import os -import os.path -import re -import shutil -import sys -import asyncio -import shlex -import multiprocessing - -from typing import Any, Dict, Iterable, List, Set, Tuple - -Patterns = collections.namedtuple("Patterns", "positive, negative") - - -# NOTE: Clang-tidy cannot lint headers directly, because headers are not -# compiled -- translation units are, of which there is one per implementation -# (c/cc/cpp) file. -DEFAULT_FILE_PATTERN = re.compile(r"^.*\.c(c|pp)?$") -CLANG_WARNING_PATTERN = re.compile( - r"([^:]+):(\d+):\d+:\s+(warning|error):.*\[([^\]]+)\]" -) -# Set from command line arguments in main(). -VERBOSE = False -QUIET = False - - -def log(*args: Any, **kwargs: Any) -> None: - if not QUIET: - print(*args, **kwargs) - - -class CommandResult: - def __init__(self, returncode: int, stdout: str, stderr: str): - self.returncode = returncode - self.stdout = stdout.strip() - self.stderr = stderr.strip() - - def failed(self) -> bool: - return self.returncode != 0 - - def __add__(self, other: "CommandResult") -> "CommandResult": - return CommandResult( - self.returncode + other.returncode, - f"{self.stdout}\n{other.stdout}", - f"{self.stderr}\n{other.stderr}", - ) - - def __str__(self) -> str: - return f"{self.stdout}" - - def __repr__(self) -> str: - return ( - f"returncode: {self.returncode}\n" - + f"stdout: {self.stdout}\n" - + f"stderr: {self.stderr}" - ) - - -class ProgressMeter: - def __init__( - self, num_items: int, start_msg: str = "", disable_progress_bar: bool = False - ) -> None: - self.num_items = num_items - self.num_processed = 0 - self.width = 80 - self.disable_progress_bar = disable_progress_bar - - # helper escape sequences - self._clear_to_end = "\x1b[2K" - self._move_to_previous_line = "\x1b[F" - self._move_to_start_of_line = "\r" - self._move_to_next_line = "\n" - - if self.disable_progress_bar: - log(start_msg) - else: - self._write( - start_msg - + self._move_to_next_line - + "[>" - + (self.width * " ") - + "]" - + self._move_to_start_of_line - ) - self._flush() - - def _write(self, s: str) -> None: - sys.stderr.write(s) - - def _flush(self) -> None: - sys.stderr.flush() - - def update(self, msg: str) -> None: - if self.disable_progress_bar: - return - - # Once we've processed all items, clear the progress bar - if self.num_processed == self.num_items - 1: - self._write(self._clear_to_end) - return - - # NOP if we've already processed all items - if self.num_processed > self.num_items: - return - - self.num_processed += 1 - - self._write( - self._move_to_previous_line - + self._clear_to_end - + msg - + self._move_to_next_line - ) - - progress = int((self.num_processed / self.num_items) * self.width) - padding = self.width - progress - self._write( - self._move_to_start_of_line - + self._clear_to_end - + f"({self.num_processed} of {self.num_items}) " - + f"[{progress*'='}>{padding*' '}]" - + self._move_to_start_of_line - ) - self._flush() - - def print(self, msg: str) -> None: - if QUIET: - return - elif self.disable_progress_bar: - print(msg) - else: - self._write( - self._clear_to_end - + self._move_to_previous_line - + self._clear_to_end - + msg - + self._move_to_next_line - + self._move_to_next_line - ) - self._flush() - - -class ClangTidyWarning: - def __init__(self, name: str, occurrences: List[Tuple[str, int]]): - self.name = name - self.occurrences = occurrences - - def __str__(self) -> str: - base = f"[{self.name}] occurred {len(self.occurrences)} times\n" - for occ in self.occurrences: - 
base += f" {occ[0]}:{occ[1]}\n" - return base - - -async def run_shell_command( - cmd: List[str], on_completed: Any = None, *args: Any -) -> CommandResult: - """Executes a shell command and runs an optional callback when complete""" - if VERBOSE: - log("Running: ", " ".join(cmd)) - - proc = await asyncio.create_subprocess_shell( - " ".join(shlex.quote(x) for x in cmd), # type: ignore[attr-defined] - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - output = await proc.communicate() - result = CommandResult( - returncode=proc.returncode if proc.returncode is not None else -1, - stdout=output[0].decode("utf-8").strip(), - stderr=output[1].decode("utf-8").strip(), - ) - - if on_completed: - on_completed(result, *args) - - return result - - -async def _run_clang_tidy_in_parallel( - commands: List[Tuple[List[str], str]], disable_progress_bar: bool -) -> CommandResult: - progress_meter = ProgressMeter( - len(commands), - f"Processing {len(commands)} clang-tidy jobs", - disable_progress_bar=disable_progress_bar, - ) - - async def gather_with_concurrency(n: int, tasks: List[Any]) -> Any: - semaphore = asyncio.Semaphore(n) - - async def sem_task(task: Any) -> Any: - async with semaphore: - return await task - - return await asyncio.gather( - *(sem_task(task) for task in tasks), return_exceptions=True - ) - - async def helper() -> Any: - def on_completed(result: CommandResult, filename: str) -> None: - if result.failed(): - msg = str(result) if not VERBOSE else repr(result) - progress_meter.print(msg) - progress_meter.update(f"Processed {filename}") - - coros = [ - run_shell_command(cmd, on_completed, filename) - for (cmd, filename) in commands - ] - return await gather_with_concurrency(multiprocessing.cpu_count(), coros) - - results = await helper() - return sum(results, CommandResult(0, "", "")) - - -async def _run_clang_tidy( - options: Any, line_filters: List[Dict[str, Any]], files: Iterable[str] -) -> CommandResult: - """Executes the actual clang-tidy command in the shell.""" - - base = [options.clang_tidy_exe] - - # Apply common options - base += ["-p", options.compile_commands_dir] - if not options.config_file and os.path.exists(".clang-tidy"): - options.config_file = ".clang-tidy" - if options.config_file: - import yaml - - with open(options.config_file) as config: - # Here we convert the YAML config file to a JSON blob. - base += [ - "-config", - json.dumps(yaml.load(config, Loader=yaml.SafeLoader)), - ] - if options.print_include_paths: - base += ["--extra-arg", "-v"] - if options.include_dir: - for dir in options.include_dir: - base += ["--extra-arg", f"-I{dir}"] - base += options.extra_args - if line_filters: - base += ["-line-filter", json.dumps(line_filters)] - - # Apply per-file options - commands = [] - for f in files: - command = list(base) + [map_filename(options.compile_commands_dir, f)] - commands.append((command, f)) - - if options.dry_run: - return CommandResult(0, str([c for c, _ in commands]), "") - - return await _run_clang_tidy_in_parallel(commands, options.disable_progress_bar) - - -def extract_warnings( - output: str, base_dir: str = "." 
-) -> Tuple[Dict[str, Dict[int, Set[str]]], List[ClangTidyWarning]]: - warn2occ: Dict[str, List[Tuple[str, int]]] = {} - fixes: Dict[str, Dict[int, Set[str]]] = {} - for line in output.splitlines(): - p = CLANG_WARNING_PATTERN.match(line) - if p is None: - continue - if os.path.isabs(p.group(1)): - path = os.path.abspath(p.group(1)) - else: - path = os.path.abspath(os.path.join(base_dir, p.group(1))) - line_no = int(p.group(2)) - - # Filter out any options (which start with '-') - warning_names = set([w for w in p.group(4).split(",") if not w.startswith("-")]) - - for name in warning_names: - if name not in warn2occ: - warn2occ[name] = [] - warn2occ[name].append((path, line_no)) - - if path not in fixes: - fixes[path] = {} - if line_no not in fixes[path]: - fixes[path][line_no] = set() - fixes[path][line_no].update(warning_names) - - warnings = [ClangTidyWarning(name, sorted(occ)) for name, occ in warn2occ.items()] - - return fixes, warnings - - -def apply_nolint(fname: str, warnings: Dict[int, Set[str]]) -> None: - with open(fname, encoding="utf-8") as f: - lines = f.readlines() - - line_offset = -1 # As in .cpp files lines are numbered starting from 1 - for line_no in sorted(warnings.keys()): - nolint_diagnostics = ",".join(warnings[line_no]) - line_no += line_offset - indent = " " * (len(lines[line_no]) - len(lines[line_no].lstrip(" "))) - lines.insert(line_no, f"{indent}// NOLINTNEXTLINE({nolint_diagnostics})\n") - line_offset += 1 - - with open(fname, mode="w") as f: - f.write("".join(lines)) - - -# Functions for correct handling of "ATen/native/cpu" mapping -# Sources in that folder are not built in place but first copied into build folder with `.[CPUARCH].cpp` suffixes -def map_filename(build_folder: str, fname: str) -> str: - fname = os.path.relpath(fname) - native_cpu_prefix = "aten/src/ATen/native/cpu/" - build_cpu_prefix = os.path.join(build_folder, native_cpu_prefix, "") - default_arch_suffix = ".DEFAULT.cpp" - if fname.startswith(native_cpu_prefix) and fname.endswith(".cpp"): - return ( - f"{build_cpu_prefix}{fname[len(native_cpu_prefix):]}{default_arch_suffix}" - ) - if fname.startswith(build_cpu_prefix) and fname.endswith(default_arch_suffix): - return f"{native_cpu_prefix}{fname[len(build_cpu_prefix):-len(default_arch_suffix)]}" - return fname - - -def map_filenames(build_folder: str, fnames: Iterable[str]) -> List[str]: - return [map_filename(build_folder, fname) for fname in fnames] - - -def split_negative_from_positive_patterns(patterns: Iterable[str]) -> Patterns: - """Separates negative patterns (that start with a dash) from positive patterns""" - positive, negative = [], [] - for pattern in patterns: - if pattern.startswith("-"): - negative.append(pattern[1:]) - else: - positive.append(pattern) - - return Patterns(positive, negative) - - -def get_file_patterns(globs: Iterable[str], regexes: Iterable[str]) -> Patterns: - """Returns a list of compiled regex objects from globs and regex pattern strings.""" - # fnmatch.translate converts a glob into a regular expression. 
- # https://docs.python.org/2/library/fnmatch.html#fnmatch.translate - glob = split_negative_from_positive_patterns(globs) - regexes_ = split_negative_from_positive_patterns(regexes) - - positive_regexes = regexes_.positive + [fnmatch.translate(g) for g in glob.positive] - negative_regexes = regexes_.negative + [fnmatch.translate(g) for g in glob.negative] - - positive_patterns = [re.compile(regex) for regex in positive_regexes] or [ - DEFAULT_FILE_PATTERN - ] - negative_patterns = [re.compile(regex) for regex in negative_regexes] - - return Patterns(positive_patterns, negative_patterns) - - -def filter_files(files: Iterable[str], file_patterns: Patterns) -> Iterable[str]: - """Returns all files that match any of the patterns.""" - if VERBOSE: - log("Filtering with these file patterns: {}".format(file_patterns)) - for file in files: - if not any(n.match(file) for n in file_patterns.negative): - if any(p.match(file) for p in file_patterns.positive): - yield file - continue - if VERBOSE: - log(f"{file} omitted due to file filters") - - -async def get_all_files(paths: List[str]) -> List[str]: - """Returns all files that are tracked by git in the given paths.""" - output = await run_shell_command(["git", "ls-files"] + paths) - return str(output).strip().splitlines() - - -def find_changed_lines(diff: str) -> Dict[str, List[Tuple[int, int]]]: - # Delay import since this isn't required unless using the --diff-file - # argument, which for local runs people don't care about - try: - import unidiff # type: ignore[import] - except ImportError as e: - e.msg += ", run 'pip install unidiff'" # type: ignore[attr-defined] - raise e - - files: Any = collections.defaultdict(list) - - for file in unidiff.PatchSet(diff): - for hunk in file: - added_line_nos = [line.target_line_no for line in hunk if line.is_added] - - if len(added_line_nos) == 0: - continue - - # Convert list of line numbers to ranges - # Eg: [1, 2, 3, 12, 13, 14, 15] becomes [[1,3], [12, 15]] - i = 1 - ranges = [[added_line_nos[0], added_line_nos[0]]] - while i < len(added_line_nos): - if added_line_nos[i] != added_line_nos[i - 1] + 1: - ranges[-1][1] = added_line_nos[i - 1] - ranges.append([added_line_nos[i], added_line_nos[i]]) - i += 1 - ranges[-1][1] = added_line_nos[-1] - - files[file.path] += ranges - - return dict(files) - - -def filter_from_diff( - paths: List[str], diffs: List[str] -) -> Tuple[List[str], List[Dict[Any, Any]]]: - files = [] - line_filters = [] - - for diff in diffs: - changed_files = find_changed_lines(diff) - changed_files = { - filename: v - for filename, v in changed_files.items() - if any(filename.startswith(path) for path in paths) - } - line_filters += [ - {"name": name, "lines": lines} for name, lines, in changed_files.items() - ] - files += list(changed_files.keys()) - - return files, line_filters - - -def filter_from_diff_file( - paths: List[str], filename: str -) -> Tuple[List[str], List[Dict[Any, Any]]]: - with open(filename, "r") as f: - diff = f.read() - return filter_from_diff(paths, [diff]) - - -async def filter_default(paths: List[str]) -> Tuple[List[str], List[Dict[Any, Any]]]: - return await get_all_files(paths), [] - - -async def _run(options: Any) -> Tuple[CommandResult, List[ClangTidyWarning]]: - # These flags are pervasive enough to set it globally. It makes the code - # cleaner compared to threading it through every single function. 
- global VERBOSE - global QUIET - VERBOSE = options.verbose - QUIET = options.quiet - - # Normalize the paths first - paths = [path.rstrip("/") for path in options.paths] - - # Filter files - if options.diff_file: - files, line_filters = filter_from_diff_file(options.paths, options.diff_file) - else: - files, line_filters = await filter_default(options.paths) - - file_patterns = get_file_patterns(options.glob, options.regex) - files = list(filter_files(files, file_patterns)) - - # clang-tidy errors when it does not get input files. - if not files: - log("No files detected") - return CommandResult(0, "", ""), [] - - result = await _run_clang_tidy(options, line_filters, files) - fixes, warnings = extract_warnings( - result.stdout, base_dir=options.compile_commands_dir - ) - - if options.suppress_diagnostics: - for fname in fixes.keys(): - mapped_fname = map_filename(options.compile_commands_dir, fname) - log(f"Applying fixes to {mapped_fname}") - apply_nolint(fname, fixes[fname]) - if os.path.relpath(fname) != mapped_fname: - shutil.copyfile(fname, mapped_fname) - - if options.dry_run: - log(result) - elif result.failed(): - # If you change this message, update the error checking logic in - # .github/workflows/lint.yml - msg = "Warnings detected!" - log(msg) - log("Summary:") - for w in warnings: - log(str(w)) - - return result, warnings - - -def run(options: Any) -> Tuple[CommandResult, List[ClangTidyWarning]]: - loop = asyncio.get_event_loop() - return loop.run_until_complete(_run(options)) diff --git a/tools/linter/flake8_hook.py b/tools/linter/flake8_hook.py deleted file mode 100755 index b9ebd5b47931..000000000000 --- a/tools/linter/flake8_hook.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 - -import sys - -from flake8.main import git # type: ignore[import] - -if __name__ == '__main__': - sys.exit( - git.hook( - strict=True, - lazy=git.config_for('lazy'), - ) - ) diff --git a/tools/linter/git-clang-format b/tools/linter/git-clang-format deleted file mode 100755 index 13073b6ecbfa..000000000000 --- a/tools/linter/git-clang-format +++ /dev/null @@ -1,655 +0,0 @@ -#!/usr/bin/env python3 -# -# ===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===------------------------------------------------------------------------===# - -r""" -clang-format git integration -============================ - -This file provides a clang-format integration for git. Put it somewhere in your -path and ensure that it is executable. Then, "git clang-format" will invoke -clang-format on the changes in current files or a specific commit. - -For further details, run: -git clang-format -h - -Requires Python 2.7 or Python 3 -""" - -from __future__ import absolute_import, division, print_function -import argparse -import collections -import contextlib -import errno -import os -import re -import subprocess -import sys -from clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH - -usage = "git clang-format [OPTIONS] [] [] [--] [...]" - -desc = """ -If zero or one commits are given, run clang-format on all lines that differ -between the working directory and , which defaults to HEAD. Changes are -only applied to the working directory. - -If two commits are given (requires --diff), run clang-format on all lines in the -second that differ from the first . 
- -If --binary is unspecified, we will try to fetch the correct clang-format -binary for PyTorch - -The following git-config settings set the default of the corresponding option: - clangFormat.binary - clangFormat.commit - clangFormat.extension - clangFormat.style -""" - -# Name of the temporary index file in which save the output of clang-format. -# This file is created within the .git directory. -temp_index_basename = "clang-format-index" - - -Range = collections.namedtuple("Range", "start, count") - - -def main(): - config = load_git_config() - - # In order to keep '--' yet allow options after positionals, we need to - # check for '--' ourselves. (Setting nargs='*' throws away the '--', while - # nargs=argparse.REMAINDER disallows options after positionals.) - argv = sys.argv[1:] - try: - idx = argv.index("--") - except ValueError: - dash_dash = [] - else: - dash_dash = argv[idx:] - argv = argv[:idx] - - default_extensions = ",".join( - [ - # From clang/lib/Frontend/FrontendOptions.cpp, all lower case - "c", - "h", # C - "m", # ObjC - "mm", # ObjC++ - "cc", - "cp", - "cpp", - "c++", - "cxx", - "hh", - "hpp", - "hxx", # C++ - "cu", # CUDA - # Other languages that clang-format supports - "proto", - "protodevel", # Protocol Buffers - "java", # Java - "js", # JavaScript - "ts", # TypeScript - "cs", # C Sharp - ] - ) - - p = argparse.ArgumentParser( - usage=usage, - formatter_class=argparse.RawDescriptionHelpFormatter, - description=desc, - ) - p.add_argument("--binary", default=None, help="path to clang-format"), - p.add_argument( - "--commit", - default=config.get("clangformat.commit", "HEAD"), - help="default commit to use if none is specified", - ), - p.add_argument( - "--diff", - action="store_true", - help="print a diff instead of applying the changes", - ) - p.add_argument( - "--extensions", - default=config.get("clangformat.extensions", default_extensions), - help=( - "comma-separated list of file extensions to format, " - "excluding the period and case-insensitive" - ), - ), - p.add_argument( - "-f", "--force", action="store_true", help="allow changes to unstaged files" - ) - p.add_argument( - "-p", "--patch", action="store_true", help="select hunks interactively" - ) - p.add_argument( - "-q", "--quiet", action="count", default=0, help="print less information" - ) - p.add_argument( - "--style", - default=config.get("clangformat.style", None), - help="passed to clang-format", - ), - p.add_argument( - "-v", "--verbose", action="count", default=0, help="print extra information" - ) - # We gather all the remaining positional arguments into 'args' since we need - # to use some heuristics to determine whether or not was present. - # However, to print pretty messages, we make use of metavar and help. - p.add_argument( - "args", - nargs="*", - metavar="", - help="revision from which to compute the diff", - ) - p.add_argument( - "ignored", - nargs="*", - metavar="...", - help="if specified, only consider differences in these files", - ) - opts = p.parse_args(argv) - - opts.verbose -= opts.quiet - del opts.quiet - - ok = get_and_check_clang_format(opts.verbose) - if not ok: - # We have to invert because False -> 0, which is the code to be returned if everything is okay. 
- return not ok - - if opts.binary is None: - opts.binary = CLANG_FORMAT_PATH - - commits, files = interpret_args(opts.args, dash_dash, opts.commit) - if len(commits) > 1: - if not opts.diff: - die("--diff is required when two commits are given") - else: - if len(commits) > 2: - die("at most two commits allowed; %d given" % len(commits)) - changed_lines = compute_diff_and_extract_lines(commits, files) - if opts.verbose >= 1: - ignored_files = set(changed_lines) - filter_by_extension(changed_lines, opts.extensions.lower().split(",")) - if opts.verbose >= 1: - ignored_files.difference_update(changed_lines) - if ignored_files: - print("Ignoring changes in the following files (wrong extension):") - for filename in ignored_files: - print(" %s" % filename) - if changed_lines: - print("Running clang-format on the following files:") - for filename in changed_lines: - print(" %s" % filename) - if not changed_lines: - print("no modified files to format") - return - # The computed diff outputs absolute paths, so we must cd before accessing - # those files. - cd_to_toplevel() - if len(commits) > 1: - old_tree = commits[1] - new_tree = run_clang_format_and_save_to_tree( - changed_lines, revision=commits[1], binary=opts.binary, style=opts.style - ) - else: - old_tree = create_tree_from_workdir(changed_lines) - new_tree = run_clang_format_and_save_to_tree( - changed_lines, binary=opts.binary, style=opts.style - ) - if opts.verbose >= 1: - print("old tree: %s" % old_tree) - print("new tree: %s" % new_tree) - if old_tree == new_tree: - if opts.verbose >= 0: - print("clang-format did not modify any files") - elif opts.diff: - print_diff(old_tree, new_tree) - else: - changed_files = apply_changes( - old_tree, new_tree, force=opts.force, patch_mode=opts.patch - ) - if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1: - print("changed files:") - for filename in changed_files: - print(" %s" % filename) - - -def load_git_config(non_string_options=None): - """Return the git configuration as a dictionary. - - All options are assumed to be strings unless in `non_string_options`, in which - is a dictionary mapping option name (in lower case) to either "--bool" or - "--int".""" - if non_string_options is None: - non_string_options = {} - out = {} - for entry in run("git", "config", "--list", "--null").split("\0"): - if entry: - name, value = entry.split("\n", 1) - if name in non_string_options: - value = run("git", "config", non_string_options[name], name) - out[name] = value - return out - - -def interpret_args(args, dash_dash, default_commit): - """Interpret `args` as "[commits] [--] [files]" and return (commits, files). - - It is assumed that "--" and everything that follows has been removed from - args and placed in `dash_dash`. - - If "--" is present (i.e., `dash_dash` is non-empty), the arguments to its - left (if present) are taken as commits. Otherwise, the arguments are checked - from left to right if they are commits or files. 
If commits are not given, - a list with `default_commit` is used.""" - if dash_dash: - if len(args) == 0: - commits = [default_commit] - else: - commits = args - for commit in commits: - object_type = get_object_type(commit) - if object_type not in ("commit", "tag"): - if object_type is None: - die("'%s' is not a commit" % commit) - else: - die( - "'%s' is a %s, but a commit was expected" - % (commit, object_type) - ) - files = dash_dash[1:] - elif args: - commits = [] - while args: - if not disambiguate_revision(args[0]): - break - commits.append(args.pop(0)) - if not commits: - commits = [default_commit] - files = args - else: - commits = [default_commit] - files = [] - return commits, files - - -def disambiguate_revision(value): - """Returns True if `value` is a revision, False if it is a file, or dies.""" - # If `value` is ambiguous (neither a commit nor a file), the following - # command will die with an appropriate error message. - run("git", "rev-parse", value, verbose=False) - object_type = get_object_type(value) - if object_type is None: - return False - if object_type in ("commit", "tag"): - return True - die("`%s` is a %s, but a commit or filename was expected" % (value, object_type)) - - -def get_object_type(value): - """Returns a string description of an object's type, or None if it is not - a valid git object.""" - cmd = ["git", "cat-file", "-t", value] - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - if p.returncode != 0: - return None - return convert_string(stdout.strip()) - - -def compute_diff_and_extract_lines(commits, files): - """Calls compute_diff() followed by extract_lines().""" - diff_process = compute_diff(commits, files) - changed_lines = extract_lines(diff_process.stdout) - diff_process.stdout.close() - diff_process.wait() - if diff_process.returncode != 0: - # Assume error was already printed to stderr. - sys.exit(2) - return changed_lines - - -def compute_diff(commits, files): - """Return a subprocess object producing the diff from `commits`. - - The return value's `stdin` file object will produce a patch with the - differences between the working directory and the first commit if a single - one was specified, or the difference between both specified commits, filtered - on `files` (if non-empty). Zero context lines are used in the patch.""" - git_tool = "diff-index" - if len(commits) > 1: - git_tool = "diff-tree" - cmd = ["git", git_tool, "-p", "-U0"] + commits + ["--"] - cmd.extend(files) - p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) - p.stdin.close() - return p - - -def extract_lines(patch_file): - """Extract the changed lines in `patch_file`. - - The return value is a dictionary mapping filename to a list of (start_line, - line_count) pairs. - - The input must have been produced with ``-U0``, meaning unidiff format with - zero lines of context. 
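For example (hypothetical input), a patch fragment such as

    +++ b/foo.cpp
    @@ -10,0 +11,3 @@

would be recorded as {'foo.cpp': [Range(start=11, count=3)]}.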
The return value is a dict mapping filename to a - list of line `Range`s.""" - matches = {} - for line in patch_file: - line = convert_string(line) - match = re.search(r"^\+\+\+\ [^/]+/(.*)", line) - if match: - filename = match.group(1).rstrip("\r\n") - match = re.search(r"^@@ -[0-9,]+ \+(\d+)(,(\d+))?", line) - if match: - start_line = int(match.group(1)) - line_count = 1 - if match.group(3): - line_count = int(match.group(3)) - if line_count > 0: - matches.setdefault(filename, []).append(Range(start_line, line_count)) - return matches - - -def filter_by_extension(dictionary, allowed_extensions): - """Delete every key in `dictionary` that doesn't have an allowed extension. - - `allowed_extensions` must be a collection of lowercase file extensions, - excluding the period.""" - allowed_extensions = frozenset(allowed_extensions) - for filename in list(dictionary.keys()): - base_ext = filename.rsplit(".", 1) - if len(base_ext) == 1 and "" in allowed_extensions: - continue - if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions: - del dictionary[filename] - - -def cd_to_toplevel(): - """Change to the top level of the git repository.""" - toplevel = run("git", "rev-parse", "--show-toplevel") - os.chdir(toplevel) - - -def create_tree_from_workdir(filenames): - """Create a new git tree with the given files from the working directory. - - Returns the object ID (SHA-1) of the created tree.""" - return create_tree(filenames, "--stdin") - - -def run_clang_format_and_save_to_tree( - changed_lines, revision=None, binary="clang-format", style=None -): - """Run clang-format on each file and save the result to a git tree. - - Returns the object ID (SHA-1) of the created tree.""" - - def iteritems(container): - try: - return container.iteritems() # Python 2 - except AttributeError: - return container.items() # Python 3 - - def index_info_generator(): - for filename, line_ranges in iteritems(changed_lines): - if revision: - git_metadata_cmd = [ - "git", - "ls-tree", - "%s:%s" % (revision, os.path.dirname(filename)), - os.path.basename(filename), - ] - git_metadata = subprocess.Popen( - git_metadata_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE - ) - stdout = git_metadata.communicate()[0] - mode = oct(int(stdout.split()[0], 8)) - else: - mode = oct(os.stat(filename).st_mode) - # Adjust python3 octal format so that it matches what git expects - if mode.startswith("0o"): - mode = "0" + mode[2:] - blob_id = clang_format_to_blob( - filename, line_ranges, revision=revision, binary=binary, style=style - ) - yield "%s %s\t%s" % (mode, blob_id, filename) - - return create_tree(index_info_generator(), "--index-info") - - -def create_tree(input_lines, mode): - """Create a tree object from the given input. - - If mode is '--stdin', it must be a list of filenames. If mode is - '--index-info' is must be a list of values suitable for "git update-index - --index-info", such as " ". Any other mode - is invalid.""" - assert mode in ("--stdin", "--index-info") - cmd = ["git", "update-index", "--add", "-z", mode] - with temporary_index_file(): - p = subprocess.Popen(cmd, stdin=subprocess.PIPE) - for line in input_lines: - p.stdin.write(to_bytes("%s\0" % line)) - p.stdin.close() - if p.wait() != 0: - die("`%s` failed" % " ".join(cmd)) - tree_id = run("git", "write-tree") - return tree_id - - -def clang_format_to_blob( - filename, line_ranges, revision=None, binary="clang-format", style=None -): - """Run clang-format on the given file and save the result to a git blob. 
- - Runs on the file in `revision` if not None, or on the file in the working - directory if `revision` is None. - - Returns the object ID (SHA-1) of the created blob.""" - clang_format_cmd = [binary] - if style: - clang_format_cmd.extend(["-style=" + style]) - clang_format_cmd.extend( - [ - "-lines=%s:%s" % (start_line, start_line + line_count - 1) - for start_line, line_count in line_ranges - ] - ) - if revision: - clang_format_cmd.extend(["-assume-filename=" + filename]) - git_show_cmd = ["git", "cat-file", "blob", "%s:%s" % (revision, filename)] - git_show = subprocess.Popen( - git_show_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE - ) - git_show.stdin.close() - clang_format_stdin = git_show.stdout - else: - clang_format_cmd.extend([filename]) - git_show = None - clang_format_stdin = subprocess.PIPE - try: - clang_format = subprocess.Popen( - clang_format_cmd, stdin=clang_format_stdin, stdout=subprocess.PIPE - ) - if clang_format_stdin == subprocess.PIPE: - clang_format_stdin = clang_format.stdin - except OSError as e: - if e.errno == errno.ENOENT: - die('cannot find executable "%s"' % binary) - else: - raise - clang_format_stdin.close() - hash_object_cmd = ["git", "hash-object", "-w", "--path=" + filename, "--stdin"] - hash_object = subprocess.Popen( - hash_object_cmd, stdin=clang_format.stdout, stdout=subprocess.PIPE - ) - clang_format.stdout.close() - stdout = hash_object.communicate()[0] - if hash_object.returncode != 0: - die("`%s` failed" % " ".join(hash_object_cmd)) - if clang_format.wait() != 0: - die("`%s` failed" % " ".join(clang_format_cmd)) - if git_show and git_show.wait() != 0: - die("`%s` failed" % " ".join(git_show_cmd)) - return convert_string(stdout).rstrip("\r\n") - - -@contextlib.contextmanager -def temporary_index_file(tree=None): - """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting - the file afterward.""" - index_path = create_temporary_index(tree) - old_index_path = os.environ.get("GIT_INDEX_FILE") - os.environ["GIT_INDEX_FILE"] = index_path - try: - yield - finally: - if old_index_path is None: - del os.environ["GIT_INDEX_FILE"] - else: - os.environ["GIT_INDEX_FILE"] = old_index_path - os.remove(index_path) - - -def create_temporary_index(tree=None): - """Create a temporary index file and return the created file's path. - - If `tree` is not None, use that as the tree to read in. Otherwise, an - empty index is created.""" - gitdir = run("git", "rev-parse", "--git-dir") - path = os.path.join(gitdir, temp_index_basename) - if tree is None: - tree = "--empty" - run("git", "read-tree", "--index-output=" + path, tree) - return path - - -def print_diff(old_tree, new_tree): - """Print the diff between the two trees to stdout.""" - # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output - # is expected to be viewed by the user, and only the former does nice things - # like color and pagination. - # - # We also only print modified files since `new_tree` only contains the files - # that were modified, so unmodified files would show as deleted without the - # filter. - subprocess.check_call(["git", "diff", "--diff-filter=M", old_tree, new_tree, "--"]) - - -def apply_changes(old_tree, new_tree, force=False, patch_mode=False): - """Apply the changes in `new_tree` to the working directory. - - Bails if there are local changes in those files and not `force`. 
If - `patch_mode`, runs `git checkout --patch` to select hunks interactively.""" - changed_files = ( - run( - "git", - "diff-tree", - "--diff-filter=M", - "-r", - "-z", - "--name-only", - old_tree, - new_tree, - ) - .rstrip("\0") - .split("\0") - ) - if not force: - unstaged_files = run("git", "diff-files", "--name-status", *changed_files) - if unstaged_files: - print( - "The following files would be modified but " "have unstaged changes:", - file=sys.stderr, - ) - print(unstaged_files, file=sys.stderr) - print("Please commit, stage, or stash them first.", file=sys.stderr) - sys.exit(2) - if patch_mode: - # In patch mode, we could just as well create an index from the new tree - # and checkout from that, but then the user will be presented with a - # message saying "Discard ... from worktree". Instead, we use the old - # tree as the index and checkout from new_tree, which gives the slightly - # better message, "Apply ... to index and worktree". This is not quite - # right, since it won't be applied to the user's index, but oh well. - with temporary_index_file(old_tree): - subprocess.check_call(["git", "checkout", "--patch", new_tree]) - index_tree = old_tree - else: - with temporary_index_file(new_tree): - run("git", "checkout-index", "-a", "-f") - return changed_files - - -def run(*args, **kwargs): - stdin = kwargs.pop("stdin", "") - verbose = kwargs.pop("verbose", True) - strip = kwargs.pop("strip", True) - for name in kwargs: - raise TypeError("run() got an unexpected keyword argument '%s'" % name) - p = subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE - ) - stdout, stderr = p.communicate(input=stdin) - - stdout = convert_string(stdout) - stderr = convert_string(stderr) - - if p.returncode == 0: - if stderr: - if verbose: - print("`%s` printed to stderr:" % " ".join(args), file=sys.stderr) - print(stderr.rstrip(), file=sys.stderr) - if strip: - stdout = stdout.rstrip("\r\n") - return stdout - if verbose: - print("`%s` returned %s" % (" ".join(args), p.returncode), file=sys.stderr) - if stderr: - print(stderr.rstrip(), file=sys.stderr) - sys.exit(2) - - -def die(message): - print("error:", message, file=sys.stderr) - sys.exit(2) - - -def to_bytes(str_input): - # Encode to UTF-8 to get binary data. - if isinstance(str_input, bytes): - return str_input - return str_input.encode("utf-8") - - -def to_string(bytes_input): - if isinstance(bytes_input, str): - return bytes_input - return bytes_input.encode("utf-8") - - -def convert_string(bytes_input): - try: - return to_string(bytes_input.decode("utf-8")) - except AttributeError: # 'str' object has no attribute 'decode'. 
- return str(bytes_input) - except UnicodeError: - return str(bytes_input) - - -if __name__ == "__main__": - main() diff --git a/tools/linter/install/clang_tidy.py b/tools/linter/install/clang_tidy.py deleted file mode 100644 index 28b15edfd9bf..000000000000 --- a/tools/linter/install/clang_tidy.py +++ /dev/null @@ -1,21 +0,0 @@ -import os -from tools.linter.install.download_bin import download, PYTORCH_ROOT, HASH_PATH - -PLATFORM_TO_URL = { - "Linux": "https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-tidy", - "Darwin": "https://oss-clang-format.s3.us-east-2.amazonaws.com/macos/clang-tidy", -} - -PLATFORM_TO_HASH = { - "Linux": os.path.join(HASH_PATH, "clang-tidy-linux64"), - "Darwin": os.path.join(HASH_PATH, "clang-tidy-macos"), -} - -OUTPUT_DIR = os.path.join(PYTORCH_ROOT, ".clang-tidy-bin") -INSTALLATION_PATH = os.path.join(OUTPUT_DIR, "clang-tidy") - -if __name__ == "__main__": - ok = download("clang-tidy", OUTPUT_DIR, PLATFORM_TO_URL, PLATFORM_TO_HASH) - if not ok: - print("Installation failed!") - exit(1) diff --git a/tools/linter/install/download_bin.py b/tools/linter/install/download_bin.py deleted file mode 100644 index 3bb65baac118..000000000000 --- a/tools/linter/install/download_bin.py +++ /dev/null @@ -1,164 +0,0 @@ -import platform -import sys -import stat -import hashlib -import subprocess -import os -import urllib.request -import urllib.error - -from typing import Dict - -# String representing the host platform (e.g. Linux, Darwin). -HOST_PLATFORM = platform.system() - -# PyTorch directory root -result = subprocess.run( - ["git", "rev-parse", "--show-toplevel"], stdout=subprocess.PIPE, check=True, -) -PYTORCH_ROOT = result.stdout.decode("utf-8").strip() - -HASH_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hashes") - - -def compute_file_sha256(path: str) -> str: - """Compute the SHA256 hash of a file and return it as a hex string.""" - # If the file doesn't exist, return an empty string. - if not os.path.exists(path): - return "" - - hash = hashlib.sha256() - - # Open the file in binary mode and hash it. - with open(path, "rb") as f: - for b in f: - hash.update(b) - - # Return the hash as a hexadecimal string. - return hash.hexdigest() - - -def report_download_progress( - chunk_number: int, chunk_size: int, file_size: int -) -> None: - """ - Pretty printer for file download progress. - """ - if file_size != -1: - percent = min(1, (chunk_number * chunk_size) / file_size) - bar = "#" * int(64 * percent) - sys.stdout.write("\r0% |{:<64}| {}%".format(bar, int(percent * 100))) - - -def download_bin(name: str, output_dir: str, platform_to_url: Dict[str, str]) -> bool: - """ - Downloads the binary appropriate for the host platform and stores it in the given output directory. - """ - if HOST_PLATFORM not in platform_to_url: - print(f"Unsupported platform: {HOST_PLATFORM}", file=sys.stderr) - return False - - url = platform_to_url[HOST_PLATFORM] - filename = os.path.join(output_dir, name) - - # Try to download binary. 
- print(f"Downloading {name} to {output_dir}", file=sys.stderr) - try: - urllib.request.urlretrieve( - url, - filename, - reporthook=report_download_progress if sys.stdout.isatty() else None, - ) - except urllib.error.URLError as e: - print(f"Error downloading {filename}: {e}", file=sys.stderr) - return False - finally: - print(file=sys.stderr) - - return True - - -def download( - name: str, - output_dir: str, - platform_to_url: Dict[str, str], - platform_to_hash: Dict[str, str], - verbose: bool = False, -) -> bool: - """ - Download a platform-appropriate binary if one doesn't already exist at the expected location and verifies - that it is the right binary by checking its SHA256 hash against the expected hash. - """ - - output_path = os.path.join(output_dir, name) - if not os.path.exists(output_dir): - # If the directory doesn't exist, try to create it. - try: - os.mkdir(output_dir) - except OSError as e: - print(f"Unable to create directory for {name} binary: {output_dir}", file=sys.stderr) - return False - finally: - if verbose: - print(f"Created directory {output_dir} for {name} binary", file=sys.stderr) - - # If the directory didn't exist, neither did the binary, so download it. - ok = download_bin(name, output_dir, platform_to_url) - - if not ok: - return False - else: - # If the directory exists but the binary doesn't, download it. - if not os.path.exists(output_path): - ok = download_bin(name, output_dir, platform_to_url) - - if not ok: - return False - else: - if verbose: - print(f"Found pre-existing {name} binary, skipping download", file=sys.stderr) - - # Now that the binary is where it should be, hash it. - actual_bin_hash = compute_file_sha256(output_path) - - # If the host platform is not in platform_to_hash, it is unsupported. - if HOST_PLATFORM not in platform_to_hash: - print(f"Unsupported platform: {HOST_PLATFORM}", file=sys.stderr) - return False - - # This is the path to the file containing the reference hash. - hashpath = os.path.join(PYTORCH_ROOT, platform_to_hash[HOST_PLATFORM]) - - if not os.path.exists(hashpath): - print("Unable to find reference binary hash", file=sys.stderr) - return False - - # Load the reference hash and compare the actual hash to it. - with open(hashpath, "r") as f: - reference_bin_hash = f.readline().strip() - - if verbose: - print(f"Reference Hash: {reference_bin_hash}", file=sys.stderr) - print(f"Actual Hash: {repr(actual_bin_hash)}", file=sys.stderr) - - if reference_bin_hash != actual_bin_hash: - print("The downloaded binary is not what was expected!", file=sys.stderr) - print(f"Downloaded hash: {repr(actual_bin_hash)} vs expected {reference_bin_hash}", file=sys.stderr) - - # Err on the side of caution and try to delete the downloaded binary. - try: - os.unlink(output_path) - print("The binary has been deleted just to be safe", file=sys.stderr) - except OSError as e: - print(f"Failed to delete binary: {e}", file=sys.stderr) - print("Delete this binary as soon as possible and do not execute it!", file=sys.stderr) - - return False - else: - # Make sure the binary is executable. 
- mode = os.stat(output_path).st_mode - mode |= stat.S_IXUSR - os.chmod(output_path, mode) - print(f"Using {name} located at {output_path}", file=sys.stderr) - - return True diff --git a/tools/linter/install/hashes/clang-tidy-linux64 b/tools/linter/install/hashes/clang-tidy-linux64 deleted file mode 100644 index 111d45175928..000000000000 --- a/tools/linter/install/hashes/clang-tidy-linux64 +++ /dev/null @@ -1 +0,0 @@ -49343a448fcb75cd1e0fb9d6b1f6c2ef4b008b6f91d6ff899d4ac6060f5e52a5 diff --git a/tools/linter/install/hashes/clang-tidy-macos b/tools/linter/install/hashes/clang-tidy-macos deleted file mode 100644 index 8b688a106156..000000000000 --- a/tools/linter/install/hashes/clang-tidy-macos +++ /dev/null @@ -1 +0,0 @@ -541797a7b8fa795e2f3c1adcd8236cc336a40aa927028dc5bc79172e1d9eca36 diff --git a/tools/linter/mypy_wrapper.py b/tools/linter/mypy_wrapper.py deleted file mode 100755 index fb1dbcbc65dd..000000000000 --- a/tools/linter/mypy_wrapper.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 - -""" -This module is meant to be run as a script (see the docstring of main -below) and passed the filename of any Python file in this repo, to -typecheck that file using only the subset of our mypy configs that apply -to it. - -Since editors (e.g. VS Code) can be configured to use this wrapper -script in lieu of mypy itself, the idea is that this can be used to get -inline mypy results while developing, and have at least some degree of -assurance that those inline results match up with what you would get -from running the mypy lint from the .github/workflows/lint.yml file. - -See also these wiki pages: - -- https://github.com/pytorch/pytorch/wiki/Guide-for-adding-type-annotations-to-PyTorch -- https://github.com/pytorch/pytorch/wiki/Lint-as-you-type -""" - -import sys -from collections import defaultdict -from configparser import ConfigParser -from pathlib import Path, PurePath, PurePosixPath -from typing import Any, Dict, List, Optional, Set, Tuple - -import mypy.api -# not part of the public API, but this is the easiest way to ensure that -# we agree with what mypy actually does -import mypy.config_parser - - -def read_config(config_path: Path) -> Set[str]: - """ - Return the set of `files` in the `mypy` ini file at config_path. - """ - config = ConfigParser() - config.read(config_path) - # hopefully on Windows this gives posix paths - return set(mypy.config_parser.split_and_match_files( - config['mypy']['files'], - )) - - -# see tools/test/test_mypy_wrapper.py for examples of many of the -# following functions - - -def config_files() -> Dict[str, Set[str]]: - """ - Return a dict from all our `mypy` ini filenames to their `files`. - """ - return {str(ini): read_config(ini) for ini in Path().glob('mypy*.ini')} - - -def split_path(path: str) -> List[str]: - """ - Split a relative (not absolute) POSIX path into its segments. - """ - pure = PurePosixPath(path) - return [str(p.name) for p in list(reversed(pure.parents))[1:] + [pure]] - - -# mypy doesn't support recursive types yet -# https://github.com/python/mypy/issues/731 - -# but if it did, the `Any` here would be `Union[Set[str], 'Trie']`, -# although that is not completely accurate: specifically, every `None` -# key must map to a `Set[str]`, and every `str` key must map to a `Trie` -Trie = Dict[Optional[str], Any] - - -def make_trie(configs: Dict[str, Set[str]]) -> Trie: - """ - Return a trie from path prefixes to their `mypy` configs. 
- - Specifically, each layer of the trie represents a segment of a POSIX - path relative to the root of this repo. If you follow a path down - the trie and reach a `None` key, that `None` maps to the (nonempty) - set of keys in `configs` which explicitly include that path. - """ - trie: Trie = {} - for ini, files in configs.items(): - for f in files: - inner = trie - for segment in split_path(f): - inner = inner.setdefault(segment, {}) - inner.setdefault(None, set()).add(ini) - return trie - - -def lookup(trie: Trie, filename: str) -> Set[str]: - """ - Return the configs in `trie` that include a prefix of `filename`. - - A path is included by a config if any of its ancestors are included - by the wildcard-expanded version of that config's `files`. Thus, - this function follows `filename`'s path down the `trie` and - accumulates all the configs it finds along the way. - """ - configs = set() - inner = trie - for segment in split_path(filename): - inner = inner.get(segment, {}) - configs |= inner.get(None, set()) - return configs - - -def make_plan( - *, - configs: Dict[str, Set[str]], - files: List[str] -) -> Dict[str, List[str]]: - """ - Return a dict from config names to the files to run them with. - - The keys of the returned dict are a subset of the keys of `configs`. - The list of files in each value of returned dict should contain a - nonempty subset of the given `files`, in the same order as `files`. - """ - trie = make_trie(configs) - plan = defaultdict(list) - for filename in files: - for config in lookup(trie, filename): - plan[config].append(filename) - return plan - - -def run( - *, - args: List[str], - files: List[str], -) -> Tuple[int, List[str], List[str]]: - """ - Return the exit code and list of output lines from running `mypy`. - - The given `args` are passed verbatim to `mypy`. The `files` (each of - which must be an absolute path) are converted to relative paths - (that is, relative to the root of this repo) and then classified - according to which ones need to be run with each `mypy` config. - Thus, `mypy` may be run zero, one, or multiple times, but it will be - run at most once for each `mypy` config used by this repo. - """ - repo_root = Path.cwd() - plan = make_plan(configs=config_files(), files=[ - PurePath(f).relative_to(repo_root).as_posix() for f in files - ]) - mypy_results = [ - mypy.api.run( - # insert custom flags after args to avoid being overridden - # by existing flags in args - args + [ - # don't special-case the last line - '--no-error-summary', - f'--config-file={config}', - ] + filtered - ) - # by construction, filtered must be nonempty - for config, filtered in plan.items() - ] - return ( - # assume all mypy exit codes are nonnegative - # https://github.com/python/mypy/issues/6003 - max( - [exit_code for _, _, exit_code in mypy_results], - default=0, - ), - list(dict.fromkeys( # remove duplicates, retain order - item - for stdout, _, _ in mypy_results - for item in stdout.splitlines() - )), - [stderr for _, stderr, _ in mypy_results], - ) - - -def main(args: List[str]) -> None: - """ - Run mypy on one Python file using the correct config file(s). 
- - This function assumes the following preconditions hold: - - - the cwd is set to the root of this cloned repo - - args is a valid list of CLI arguments that could be passed to mypy - - some of args are absolute paths to files to typecheck - - all the other args are config flags for mypy, rather than files - - These assumptions hold, for instance, when mypy is run automatically - by VS Code's Python extension, so in your clone of this repository, - you could modify your .vscode/settings.json to look something like - this (assuming you use a conda environment named "pytorch"): - - { - "python.linting.enabled": true, - "python.linting.mypyEnabled": true, - "python.linting.mypyPath": - "${env:HOME}/miniconda3/envs/pytorch/bin/python", - "python.linting.mypyArgs": [ - "${workspaceFolder}/tools/linter/mypy_wrapper.py" - ] - } - - More generally, this should work for any editor sets the cwd to the - repo root, runs mypy on individual files via their absolute paths, - and allows you to set the path to the mypy executable. - """ - repo_root = str(Path.cwd()) - exit_code, mypy_issues, stderrs = run( - args=[arg for arg in args if not arg.startswith(repo_root)], - files=[arg for arg in args if arg.startswith(repo_root)], - ) - for issue in mypy_issues: - print(issue) - for stderr in stderrs: - print(stderr, end='', file=sys.stderr) - sys.exit(exit_code) - - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/tools/linter/run_shellcheck.sh b/tools/linter/run_shellcheck.sh deleted file mode 100755 index e9d2dd40e8fd..000000000000 --- a/tools/linter/run_shellcheck.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -find "$@" -name '*.sh' -print0 | xargs -0 -n1 shellcheck --external-sources diff --git a/tools/linter/trailing_newlines.py b/tools/linter/trailing_newlines.py deleted file mode 100755 index ee743a4785f8..000000000000 --- a/tools/linter/trailing_newlines.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 - -import fileinput -import os -import sys - -NEWLINE, = b'\n' - - -def correct_trailing_newlines(filename: str) -> bool: - with open(filename, 'rb') as f: - a = len(f.read(2)) - if a == 0: - return True - elif a == 1: - # file is wrong whether or not the only byte is a newline - return False - else: - f.seek(-2, os.SEEK_END) - b, c = f.read(2) - # no ASCII byte is part of any non-ASCII character in UTF-8 - return b != NEWLINE and c == NEWLINE - - -def main() -> int: - # mimic git grep exit code behavior - exit_code = 1 - for line in fileinput.input(): - stripped = line.rstrip() - if not correct_trailing_newlines(stripped): - exit_code = 0 - print(stripped) - return exit_code - - -if __name__ == '__main__': - sys.exit(main()) diff --git a/tools/linter/translate_annotations.py b/tools/linter/translate_annotations.py deleted file mode 100755 index ed0147e4a62a..000000000000 --- a/tools/linter/translate_annotations.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import json -import re -import subprocess -from bisect import bisect_right -from collections import defaultdict -from typing import (Callable, DefaultDict, Generic, List, Optional, Pattern, - Sequence, TypeVar, cast) - -from typing_extensions import TypedDict - - -class Hunk(TypedDict): - old_start: int - old_count: int - new_start: int - new_count: int - - -class Diff(TypedDict): - old_filename: Optional[str] - hunks: List[Hunk] - - -# @@ -start,count +start,count @@ -hunk_pattern = r'^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@' - - -def parse_diff(diff: str) -> Diff: - name = 
None - name_found = False - hunks: List[Hunk] = [] - for line in diff.splitlines(): - hunk_match = re.match(hunk_pattern, line) - if name_found: - if hunk_match: - old_start, old_count, new_start, new_count = hunk_match.groups() - hunks.append({ - 'old_start': int(old_start), - 'old_count': int(old_count or '1'), - 'new_start': int(new_start), - 'new_count': int(new_count or '1'), - }) - else: - assert not hunk_match - name_match = re.match(r'^--- (?:(?:/dev/null)|(?:a/(.*)))$', line) - if name_match: - name_found = True - name, = name_match.groups() - return { - 'old_filename': name, - 'hunks': hunks, - } - - -T = TypeVar('T') -U = TypeVar('U') - - -# we want to use bisect.bisect_right to find the closest hunk to a given -# line number, but the bisect module won't have a key function until -# Python 3.10 https://github.com/python/cpython/pull/20556 so we make an -# O(1) wrapper around the list of hunks that makes it pretend to just be -# a list of line numbers -# https://gist.github.com/ericremoreynolds/2d80300dabc70eebc790 -class KeyifyList(Generic[T, U]): - def __init__(self, inner: List[T], key: Callable[[T], U]) -> None: - self.inner = inner - self.key = key - - def __len__(self) -> int: - return len(self.inner) - - def __getitem__(self, k: int) -> U: - return self.key(self.inner[k]) - - -def translate(diff: Diff, line_number: int) -> Optional[int]: - if line_number < 1: - return None - - hunks = diff['hunks'] - if not hunks: - return line_number - - keyified = KeyifyList( - hunks, - lambda hunk: hunk['new_start'] + (0 if hunk['new_count'] > 0 else 1) - ) - i = bisect_right(cast(Sequence[int], keyified), line_number) - if i < 1: - return line_number - - hunk = hunks[i - 1] - d = line_number - (hunk['new_start'] + (hunk['new_count'] or 1)) - return None if d < 0 else hunk['old_start'] + (hunk['old_count'] or 1) + d - - -# we use camelCase here because this will be output as JSON and so the -# field names need to match the group names from here: -# https://github.com/pytorch/add-annotations-github-action/blob/3ab7d7345209f5299d53303f7aaca7d3bc09e250/action.yml#L23 -class Annotation(TypedDict): - filename: str - lineNumber: int - columnNumber: int - errorCode: str - errorDesc: str - - -def parse_annotation(regex: Pattern[str], line: str) -> Optional[Annotation]: - m = re.match(regex, line) - if m: - try: - line_number = int(m.group('lineNumber')) - column_number = int(m.group('columnNumber')) - except ValueError: - return None - return { - 'filename': m.group('filename'), - 'lineNumber': line_number, - 'columnNumber': column_number, - 'errorCode': m.group('errorCode'), - 'errorDesc': m.group('errorDesc'), - } - else: - return None - - -def translate_all( - *, - lines: List[str], - regex: Pattern[str], - commit: str -) -> List[Annotation]: - ann_dict: DefaultDict[str, List[Annotation]] = defaultdict(list) - for line in lines: - annotation = parse_annotation(regex, line) - if annotation is not None: - ann_dict[annotation['filename']].append(annotation) - ann_list = [] - for filename, annotations in ann_dict.items(): - raw_diff = subprocess.check_output( - ['git', 'diff-index', '--unified=0', commit, filename], - encoding='utf-8', - ) - diff = parse_diff(raw_diff) if raw_diff.strip() else None - # if there is a diff but it doesn't list an old filename, that - # means the file is absent in the commit we're targeting, so we - # skip it - if not (diff and not diff['old_filename']): - for annotation in annotations: - line_number: Optional[int] = annotation['lineNumber'] - if diff: - 
annotation['filename'] = cast(str, diff['old_filename']) - line_number = translate(diff, cast(int, line_number)) - if line_number: - annotation['lineNumber'] = line_number - ann_list.append(annotation) - return ann_list - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument('--file') - parser.add_argument('--regex') - parser.add_argument('--commit') - args = parser.parse_args() - with open(args.file, 'r') as f: - lines = f.readlines() - print(json.dumps(translate_all( - lines=lines, - regex=args.regex, - commit=args.commit - ))) - - -if __name__ == '__main__': - main() diff --git a/tools/lite_interpreter/BUILD.buck b/tools/lite_interpreter/BUILD.buck new file mode 100644 index 000000000000..10415c26aee7 --- /dev/null +++ b/tools/lite_interpreter/BUILD.buck @@ -0,0 +1,6 @@ +python_library( + name = "gen_selected_mobile_ops_header", + srcs = ["gen_selected_mobile_ops_header.py"], + base_module = "tools.lite_interpreter", + visibility = ["PUBLIC"], +) diff --git a/tools/lite_interpreter/gen_selected_mobile_ops_header.py b/tools/lite_interpreter/gen_selected_mobile_ops_header.py index e34b7bbfa5c7..37cd9e6903bf 100644 --- a/tools/lite_interpreter/gen_selected_mobile_ops_header.py +++ b/tools/lite_interpreter/gen_selected_mobile_ops_header.py @@ -2,8 +2,8 @@ import argparse import os from typing import Set -from tools.codegen.selective_build.selector import SelectiveBuilder -from tools.codegen.code_template import CodeTemplate +from torchgen.selective_build.selector import SelectiveBuilder +from torchgen.code_template import CodeTemplate import yaml @@ -44,6 +44,7 @@ """ + def extract_root_operators(selective_builder: SelectiveBuilder) -> Set[str]: ops = [] for (op_name, op) in selective_builder.operators.items(): @@ -51,18 +52,24 @@ def extract_root_operators(selective_builder: SelectiveBuilder) -> Set[str]: ops.append(op_name) return set(ops) + def get_selected_kernel_dtypes_code( - selective_builder: SelectiveBuilder, + selective_builder: SelectiveBuilder, ) -> str: # See https://www.internalfb.com/intern/paste/P153411698/ for an example of the # generated code in case all kernel dtypes are selected and in case some kernel # dtypes are selected (i.e. both cases). # body = "return true;" - if selective_builder.include_all_operators is False and selective_builder.include_all_non_op_selectives is False: + if ( + selective_builder.include_all_operators is False + and selective_builder.include_all_non_op_selectives is False + ): body_parts = [] for kernel_tag, dtypes in selective_builder.kernel_metadata.items(): - conditions = list(map(lambda x: 'scalar_type == at::ScalarType::' + x, dtypes)) + conditions = list( + map(lambda x: "scalar_type == at::ScalarType::" + x, dtypes) + ) body_parts.append( if_condition_template.substitute( kernel_tag_name=kernel_tag, @@ -79,8 +86,8 @@ def get_selected_kernel_dtypes_code( # 1. The selected root operators # 2. The selected kernel dtypes def write_selected_mobile_ops( - output_file_path: str, - selective_builder: SelectiveBuilder, + output_file_path: str, + selective_builder: SelectiveBuilder, ) -> None: root_ops = extract_root_operators(selective_builder) custom_classes = selective_builder.custom_classes @@ -90,16 +97,29 @@ def write_selected_mobile_ops( # This condition checks if we are in selective build. 
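A rough sketch of the header text the operator-whitelist branch below emits, using hypothetical operator names (the real set is the model's selected root operators):

    example_root_ops = {"aten::relu", "aten::add.Tensor"}  # hypothetical selection
    line = "#define TORCH_OPERATOR_WHITELIST " + ";".join(sorted(example_root_ops)) + ";\n\n"
    # line == '#define TORCH_OPERATOR_WHITELIST aten::add.Tensor;aten::relu;\n\n'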
# if these lists are not defined the corresponding selective build macros trivially return the item in question was selected if not selective_builder.include_all_operators: - body_parts.append("#define TORCH_OPERATOR_WHITELIST " + (";".join(sorted(root_ops))) + ";\n\n") + body_parts.append( + "#define TORCH_OPERATOR_WHITELIST " + + (";".join(sorted(root_ops))) + + ";\n\n" + ) # This condition checks if we are in tracing based selective build if selective_builder.include_all_non_op_selectives is False: - body_parts.append("#define TORCH_CUSTOM_CLASS_ALLOWLIST " + (";".join(sorted(custom_classes))) + ";\n\n") - body_parts.append("#define TORCH_BUILD_FEATURE_ALLOWLIST " + (";".join(sorted(build_features))) + ";\n\n") + body_parts.append( + "#define TORCH_CUSTOM_CLASS_ALLOWLIST " + + (";".join(sorted(custom_classes))) + + ";\n\n" + ) + body_parts.append( + "#define TORCH_BUILD_FEATURE_ALLOWLIST " + + (";".join(sorted(build_features))) + + ";\n\n" + ) body_parts.append(get_selected_kernel_dtypes_code(selective_builder)) header_contents = "".join(body_parts) out_file.write(header_contents.encode("utf-8")) + # root_ops: a set of selected root operators for selective build # Write the file selected_mobile_ops.h with optionally: # 1. The selected root operators from root_ops @@ -110,7 +130,9 @@ def write_selected_mobile_ops_with_all_dtypes( ) -> None: with open(output_file_path, "wb") as out_file: body_parts = [selected_mobile_ops_preamble] - body_parts.append("#define TORCH_OPERATOR_WHITELIST " + (";".join(sorted(root_ops))) + ";\n\n") + body_parts.append( + "#define TORCH_OPERATOR_WHITELIST " + (";".join(sorted(root_ops))) + ";\n\n" + ) selective_builder = SelectiveBuilder.get_nop_selector() body_parts.append(get_selected_kernel_dtypes_code(selective_builder)) @@ -118,17 +140,25 @@ def write_selected_mobile_ops_with_all_dtypes( header_contents = "".join(body_parts) out_file.write(header_contents.encode("utf-8")) + def main() -> None: parser = argparse.ArgumentParser( description="Generate selected_mobile_ops.h for selective build." ) parser.add_argument( - "-p", "--yaml_file_path", type=str, required=True, help="Path to the yaml" - " file with a list of operators used by the model." + "-p", + "--yaml_file_path", + type=str, + required=True, + help="Path to the yaml" " file with a list of operators used by the model.", ) parser.add_argument( - "-o", "--output_file_path", type=str, required=True, help="Path to destination" - "folder where selected_mobile_ops.h will be written." 
+ "-o", + "--output_file_path", + type=str, + required=True, + help="Path to destination" + "folder where selected_mobile_ops.h will be written.", ) parsed_args = parser.parse_args() model_file_name = parsed_args.yaml_file_path @@ -138,12 +168,13 @@ def main() -> None: with open(model_file_name, "rb") as model_file: loaded_model = yaml.load(model_file, Loader=Loader) - root_operators_set = set(loaded_model) print("Writing header file selected_mobile_ops.h: ", parsed_args.output_file_path) write_selected_mobile_ops_with_all_dtypes( os.path.join(parsed_args.output_file_path, "selected_mobile_ops.h"), - root_operators_set) + root_operators_set, + ) + if __name__ == "__main__": main() diff --git a/tools/lldb/deploy_debugger.py b/tools/lldb/deploy_debugger.py index deaf65d7edb9..5a1395898b5c 100644 --- a/tools/lldb/deploy_debugger.py +++ b/tools/lldb/deploy_debugger.py @@ -1,10 +1,12 @@ import lldb # type: ignore[import] + # load into lldb instance with: # command script import tools/lldb/deploy_debugger.py target = lldb.debugger.GetSelectedTarget() bp = target.BreakpointCreateByRegex("__deploy_register_code") -bp.SetScriptCallbackBody("""\ +bp.SetScriptCallbackBody( + """\ process = frame.thread.GetProcess() target = process.target symbol_addr = frame.module.FindSymbol("__deploy_module_info").GetStartAddress() @@ -31,4 +33,5 @@ lldb.debugger.HandleCommand(cmd2) return False -""") +""" +) diff --git a/tools/nightly.py b/tools/nightly.py index 7a46a011d232..32733c5d9477 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -40,8 +40,21 @@ import subprocess from ast import literal_eval from argparse import ArgumentParser -from typing import (Any, Callable, Dict, Generator, Iterable, Iterator, List, - Optional, Sequence, Set, Tuple, TypeVar, cast) +from typing import ( + Any, + Callable, + Dict, + Generator, + Iterable, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + cast, +) LOGGER: Optional[logging.Logger] = None URL_FORMAT = "{base_url}/{platform}/{dist_name}.tar.bz2" @@ -199,7 +212,13 @@ def check_branch(subcommand: str, branch: Optional[str]) -> Optional[str]: return "Branch name to checkout must be supplied with '-b' option" # next check that the local repo is clean cmd = ["git", "status", "--untracked-files=no", "--porcelain"] - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, universal_newlines=True) + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + universal_newlines=True, + ) if p.stdout.strip(): return "Need to have clean working tree to checkout!\n\n" + p.stdout # next check that the branch name doesn't already exist @@ -218,7 +237,7 @@ def timer(logger: logging.Logger, prefix: str) -> Iterator[None]: logger.info(f"{prefix} took {time.time() - start_time:.3f} [s]") -F = TypeVar('F', bound=Callable[..., Any]) +F = TypeVar("F", bound=Callable[..., Any]) def timed(prefix: str) -> Callable[[F], F]: @@ -325,7 +344,7 @@ def deps_install(deps: List[str], existing_env: bool, env_opts: List[str]) -> No @timed("Installing pytorch nightly binaries") def pytorch_install(url: str) -> "tempfile.TemporaryDirectory[str]": - """"Install pytorch into a temporary directory""" + """ "Install pytorch into a temporary directory""" pytdir = tempfile.TemporaryDirectory() cmd = ["conda", "create", "--yes", "--no-deps", "--prefix", pytdir.name, url] p = subprocess.run(cmd, check=True) @@ -369,7 +388,13 @@ def _nightly_version(spdir: str) -> str: # now cross reference with nightly version 
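The calls reformatted in this hunk all follow the same text-mode subprocess pattern; a minimal sketch, mirroring the clean-working-tree check from `check_branch` above:

    import subprocess

    # capture stdout/stderr as text and raise on a nonzero exit code
    p = subprocess.run(
        ["git", "status", "--untracked-files=no", "--porcelain"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
        universal_newlines=True,
    )
    if p.stdout.strip():
        print("Need to have clean working tree to checkout!")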
_ensure_commit(git_version) cmd = ["git", "show", "--no-patch", "--format=%s", git_version] - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True, universal_newlines=True) + p = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + universal_newlines=True, + ) m = SHA1_RE.search(p.stdout) if m is None: raise RuntimeError( @@ -516,7 +541,13 @@ def move_nightly_files(spdir: str, platform: str) -> None: def _available_envs() -> Dict[str, str]: cmd = ["conda", "env", "list"] - p = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + p = subprocess.run( + cmd, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) lines = p.stdout.splitlines() envs = {} for line in map(str.strip, lines): diff --git a/tools/onnx/update_default_opset_version.py b/tools/onnx/update_default_opset_version.py new file mode 100755 index 000000000000..dfdbf1f23c87 --- /dev/null +++ b/tools/onnx/update_default_opset_version.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +"""Updates the default value of opset_version. + +The current policy is that the default should be set to the +latest released version as of 18 months ago. + +Usage: +Run with no arguments. +""" + +import datetime +import os +import pathlib +import re +import subprocess +import sys +from subprocess import DEVNULL + +pytorch_dir = pathlib.Path(__file__).parent.parent.parent.resolve() +onnx_dir = pytorch_dir / "third_party" / "onnx" +os.chdir(onnx_dir) + +date = datetime.datetime.now() - datetime.timedelta(days=18 * 30) +onnx_commit = subprocess.check_output( + ("git", "log", f"--until={date}", "--max-count=1", "--format=%H"), encoding="utf-8" +).strip() +onnx_tags = subprocess.check_output( + ("git", "tag", "--list", f"--contains={onnx_commit}"), encoding="utf-8" +) +tag_tups = [] +semver_pat = re.compile(r"v(\d+)\.(\d+)\.(\d+)") +for tag in onnx_tags.splitlines(): + match = semver_pat.match(tag) + if match: + tag_tups.append(tuple(int(x) for x in match.groups())) + +version_str = "{}.{}.{}".format(*min(tag_tups)) + +print("Using ONNX release", version_str) + +head_commit = subprocess.check_output( + ("git", "log", "--max-count=1", "--format=%H", "HEAD"), encoding="utf-8" +).strip() + +new_default = None + +subprocess.check_call( + ("git", "checkout", f"v{version_str}"), stdout=DEVNULL, stderr=DEVNULL +) +try: + from onnx import helper # type: ignore[import] + + for version in helper.VERSION_TABLE: + if version[0] == version_str: + new_default = version[2] + print("found new default opset_version", new_default) + break + if not new_default: + sys.exit( + f"failed to find version {version_str} in onnx.helper.VERSION_TABLE at commit {onnx_commit}" + ) +finally: + subprocess.check_call( + ("git", "checkout", head_commit), stdout=DEVNULL, stderr=DEVNULL + ) + +os.chdir(pytorch_dir) + + +def read_sub_write(path: str, prefix_pat: str) -> None: + with open(path, encoding="utf-8") as f: + content_str = f.read() + content_str = re.sub(prefix_pat, r"\g<1>{}".format(new_default), content_str) + with open(path, "w", encoding="utf-8") as f: + f.write(content_str) + print("modified", path) + + +read_sub_write( + os.path.join("torch", "onnx", "_constants.py"), + r"(onnx_default_opset = )\d+", +) +read_sub_write( + os.path.join("torch", "onnx", "__init__.py"), r"(opset_version \(int, default )\d+" +) + +print("Updating operator .expect files") +subprocess.check_call(("python", "setup.py", "develop"), 
stdout=DEVNULL, stderr=DEVNULL) +subprocess.check_call( + ("python", os.path.join("test", "onnx", "test_operators.py"), "--accept"), + stdout=DEVNULL, + stderr=DEVNULL, +) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index faf1fdf06d36..94c89a906714 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -2,14 +2,21 @@ import collections from pprint import pformat -from tools.codegen.model import Variant -from tools.codegen.api.python import (PythonSignatureGroup, - PythonSignatureNativeFunctionPair) -from tools.codegen.gen import parse_native_yaml -from tools.codegen.utils import FileManager +from torchgen.model import Variant +from torchgen.api.python import ( + PythonSignatureGroup, + PythonSignatureNativeFunctionPair, + returns_named_tuple_pyi, +) +from torchgen.gen import parse_native_yaml +from torchgen.utils import FileManager from typing import Sequence, List, Dict -from tools.autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads +from tools.autograd.gen_python_functions import ( + should_generate_py_binding, + load_signatures, + group_overloads, +) """ This module implements generation of type stubs for PyTorch, @@ -35,23 +42,29 @@ read gen_pyi for the gory details. """ + def get_py_torch_functions( - python_funcs: Sequence[PythonSignatureNativeFunctionPair], - method: bool = False, + python_funcs: Sequence[PythonSignatureNativeFunctionPair], + method: bool = False, ) -> Sequence[PythonSignatureGroup]: """ Get declarations (grouped by name) which should be generated as either functions in the "torch" module or methods on Tensor. """ + def should_bind_function(python_func: PythonSignatureNativeFunctionPair) -> bool: - return (should_generate_py_binding(python_func.function) and - not python_func.function.python_module and - Variant.function in python_func.function.variants) + return ( + should_generate_py_binding(python_func.function) + and not python_func.function.python_module + and Variant.function in python_func.function.variants + ) def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: - return (should_generate_py_binding(python_func.function) and - not python_func.function.python_module and - Variant.method in python_func.function.variants) + return ( + should_generate_py_binding(python_func.function) + and not python_func.function.python_module + and Variant.method in python_func.function.variants + ) should_bind = should_bind_method if method else should_bind_function return group_overloads([f for f in python_funcs if should_bind(f)]) @@ -61,76 +74,111 @@ def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: # the stubs to read on the human eye. DEVICE_PARAM = "device: Union[_device, str, None]=None" -FACTORY_PARAMS = f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False" +FACTORY_PARAMS = ( + f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False" +) # this could be more precise w.r.t list contents etc. How to do Ellipsis? 
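For reference, a quick sketch of how these parameter strings are spliced into the hand-written stub entries further down (the 'tensor' hint is one such entry):

    DEVICE_PARAM = "device: Union[_device, str, None]=None"
    FACTORY_PARAMS = (
        f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False"
    )
    hint = "def tensor(data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)
    # hint == ('def tensor(data: Any, dtype: Optional[_dtype]=None, '
    #          'device: Union[_device, str, None]=None, requires_grad: _bool=False) -> Tensor: ...')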
INDICES = "indices: Union[None, _int, slice, Tensor, List, Tuple]" blocklist = [ - '__init_subclass__', - '__new__', - '__subclasshook__', - 'cdist', - 'device', - 'grad', - 'requires_grad', - 'range', + "__init_subclass__", + "__new__", + "__subclasshook__", + "cdist", + "device", + "grad", + "requires_grad", + "range", # defined in functional - 'einsum', + "einsum", # reduction argument; these bindings don't make sense - 'binary_cross_entropy_with_logits', - 'ctc_loss', - 'cosine_embedding_loss', - 'hinge_embedding_loss', - 'kl_div', - 'margin_ranking_loss', - 'triplet_margin_loss', + "binary_cross_entropy_with_logits", + "ctc_loss", + "cosine_embedding_loss", + "hinge_embedding_loss", + "kl_div", + "margin_ranking_loss", + "triplet_margin_loss", # Somehow, these are defined in both _C and in functional. Ick! - 'broadcast_tensors', + "broadcast_tensors", # Manually define named tensor type stubs in __init__.pyi.in - 'align_tensors', - 'meshgrid', - 'cartesian_prod', - 'block_diag', - 'norm', - 'chain_matmul', - 'stft', - 'tensordot', - 'split', - 'unique_consecutive', - 'atleast_1d', - 'atleast_2d', - 'atleast_3d', + "align_tensors", + "meshgrid", + "cartesian_prod", + "block_diag", + "norm", + "chain_matmul", + "stft", + "tensordot", + "split", + "unique_consecutive", + "atleast_1d", + "atleast_2d", + "atleast_3d", # These are handled specially by python_arg_parser.cpp - 'add', - 'add_', - 'add_out', - 'sub', - 'sub_', - 'sub_out', - 'mul', - 'mul_', - 'mul_out', - 'div', - 'div_', - 'div_out', - 'true_divide', 'true_divide_', 'true_divide_out', - 'floor_divide', 'floor_divide_', 'floor_divide_out', + "add", + "add_", + "add_out", + "sub", + "sub_", + "sub_out", + "mul", + "mul_", + "mul_out", + "div", + "div_", + "div_out", + "true_divide", + "true_divide_", + "true_divide_out", + "floor_divide", + "floor_divide_", + "floor_divide_out", ] -binary_ops = ('add', 'sub', 'mul', 'div', 'pow', 'lshift', 'rshift', 'mod', 'truediv', - 'matmul', 'floordiv', - 'radd', 'rsub', 'rmul', 'rtruediv', 'rfloordiv', 'rpow', # reverse arithmetic - 'and', 'or', 'xor', 'rand', 'ror', 'rxor', # logic - 'iadd', 'iand', 'idiv', 'ilshift', 'imul', - 'ior', 'irshift', 'isub', 'ixor', 'ifloordiv', 'imod', # inplace ops - ) -symmetric_comparison_ops = ('eq', 'ne') -asymmetric_comparison_ops = ('ge', 'gt', 'lt', 'le') +binary_ops = ( + "add", + "sub", + "mul", + "div", + "pow", + "lshift", + "rshift", + "mod", + "truediv", + "matmul", + "floordiv", + "radd", + "rsub", + "rmul", + "rtruediv", + "rfloordiv", + "rpow", # reverse arithmetic + "and", + "or", + "xor", + "rand", + "ror", + "rxor", # logic + "iadd", + "iand", + "idiv", + "ilshift", + "imul", + "ior", + "irshift", + "isub", + "ixor", + "ifloordiv", + "imod", # inplace ops +) +symmetric_comparison_ops = ("eq", "ne") +asymmetric_comparison_ops = ("ge", "gt", "lt", "le") comparison_ops = symmetric_comparison_ops + asymmetric_comparison_ops -unary_ops = ('neg', 'abs', 'invert') -to_py_type_ops = ('bool', 'float', 'complex', 'long', 'index', 'int', 'nonzero') +unary_ops = ("neg", "abs", "invert") +to_py_type_ops = ("bool", "float", "complex", "long", "index", "int", "nonzero") all_ops = binary_ops + comparison_ops + unary_ops + to_py_type_ops @@ -141,32 +189,35 @@ def sig_for_ops(opname: str) -> List[str]: # we have to do this by hand, because they are hand-bound in Python - assert opname.endswith('__') and opname.startswith('__'), "Unexpected op {}".format(opname) + assert opname.endswith("__") and opname.startswith("__"), "Unexpected op {}".format( + opname + 
) name = opname[2:-2] if name in binary_ops: - return ['def {}(self, other: Any) -> Tensor: ...'.format(opname)] + return ["def {}(self, other: Any) -> Tensor: ...".format(opname)] elif name in comparison_ops: - sig = 'def {}(self, other: Any) -> Tensor: ...'.format(opname) + sig = "def {}(self, other: Any) -> Tensor: ...".format(opname) if name in symmetric_comparison_ops: # unsafe override https://github.com/python/mypy/issues/5704 - sig += ' # type: ignore[override]' + sig += " # type: ignore[override]" return [sig] elif name in unary_ops: - return ['def {}(self) -> Tensor: ...'.format(opname)] + return ["def {}(self) -> Tensor: ...".format(opname)] elif name in to_py_type_ops: - if name in {'bool', 'float', 'complex'}: + if name in {"bool", "float", "complex"}: tname = name - elif name == 'nonzero': - tname = 'bool' + elif name == "nonzero": + tname = "bool" else: - tname = 'int' - if tname in {'float', 'int', 'bool', 'complex'}: - tname = 'builtins.' + tname - return ['def {}(self) -> {}: ...'.format(opname, tname)] + tname = "int" + if tname in {"float", "int", "bool", "complex"}: + tname = "builtins." + tname + return ["def {}(self) -> {}: ...".format(opname, tname)] else: raise Exception("unknown op", opname) + def generate_type_hints(sig_group: PythonSignatureGroup) -> List[str]: type_hints: List[str] = [] @@ -184,81 +235,98 @@ def generate_type_hints(sig_group: PythonSignatureGroup) -> List[str]: # PythonSignatureGroups that have both a functional + out variant get a single signature, with an optional out argument # Generates the out variant if one exists. Otherwise, generate the functional variant type_hint = sig_group.signature.signature_str_pyi( - skip_outputs=sig_group.outplace is None) + skip_outputs=sig_group.outplace is None + ) type_hints.append(type_hint) # Some operators also additionally have a vararg variant of their signature type_hint_vararg = sig_group.signature.signature_str_pyi_vararg( - skip_outputs=sig_group.outplace is None) + skip_outputs=sig_group.outplace is None + ) if type_hint_vararg: type_hints.append(type_hint_vararg) return type_hints + def gen_nn_functional(fm: FileManager) -> None: # Functions imported into `torch.nn.functional` from `torch`, perhaps being filtered # through an `_add_docstr` call imports = [ - 'conv1d', - 'conv2d', - 'conv3d', - 'conv_transpose1d', - 'conv_transpose2d', - 'conv_transpose3d', - 'conv_tbc', - 'avg_pool1d', - 'relu_', - 'selu_', - 'celu_', - 'rrelu_', - 'pixel_shuffle', - 'pixel_unshuffle', - 'channel_shuffle', - 'pdist', - 'cosine_similarity', + "conv1d", + "conv2d", + "conv3d", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + "conv_tbc", + "avg_pool1d", + "relu_", + "selu_", + "celu_", + "rrelu_", + "pixel_shuffle", + "pixel_unshuffle", + "channel_shuffle", + "native_channel_shuffle", + "pdist", + "cosine_similarity", ] # Functions generated by `torch._jit_internal.boolean_dispatch` dispatches = [ - 'fractional_max_pool2d', - 'fractional_max_pool3d', - 'max_pool1d', - 'max_pool2d', - 'max_pool3d', - 'adaptive_max_pool1d', - 'adaptive_max_pool2d', - 'adaptive_max_pool3d', + "fractional_max_pool2d", + "fractional_max_pool3d", + "max_pool1d", + "max_pool2d", + "max_pool3d", + "adaptive_max_pool1d", + "adaptive_max_pool2d", + "adaptive_max_pool3d", ] # Functions directly imported from `torch._C` from_c = [ - 'avg_pool2d', - 'avg_pool3d', - 'hardtanh_', - 'elu_', - 'leaky_relu_', - 'logsigmoid', - 'softplus', - 'softshrink', - 'one_hot', + "avg_pool2d", + "avg_pool3d", + "hardtanh_", + "elu_", + 
"leaky_relu_", + "logsigmoid", + "softplus", + "softshrink", + "one_hot", ] import_code = ["from .. import {0} as {0}".format(_) for _ in imports] # TODO make these types more precise dispatch_code = ["{}: Callable".format(_) for _ in (dispatches + from_c)] - fm.write_with_template('torch/nn/functional.pyi', 'torch/nn/functional.pyi.in', lambda: { - 'imported_hints': import_code, - 'dispatched_hints': dispatch_code, - }) + fm.write_with_template( + "torch/nn/functional.pyi", + "torch/nn/functional.pyi.in", + lambda: { + "imported_hints": import_code, + "dispatched_hints": dispatch_code, + }, + ) # functional.pyi already contains the definitions for those functions # so, we don't export then to it - from_c.extend(['hardtanh', 'leaky_relu', 'hardsigmoid']) + from_c.extend(["hardtanh", "leaky_relu", "hardsigmoid"]) dispatch_code = ["{}: Callable".format(_) for _ in (dispatches + from_c)] - fm.write_with_template('torch/_C/_nn.pyi', 'torch/_C/_nn.pyi.in', lambda: { - 'imported_hints': import_code, - 'dispatched_hints': dispatch_code, - }) - - -def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) -> None: + fm.write_with_template( + "torch/_C/_nn.pyi", + "torch/_C/_nn.pyi.in", + lambda: { + "imported_hints": import_code, + "dispatched_hints": dispatch_code, + }, + ) + + +def gen_pyi( + native_yaml_path: str, + tags_yaml_path: str, + deprecated_yaml_path: str, + fm: FileManager, +) -> None: """gen_pyi() This function generates a pyi file for torch. @@ -278,125 +346,218 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ unsorted_function_hints: Dict[str, List[str]] = collections.defaultdict(list) - unsorted_function_hints.update({ - 'set_flush_denormal': ['def set_flush_denormal(mode: _bool) -> _bool: ...'], - 'get_default_dtype': ['def get_default_dtype() -> _dtype: ...'], - 'asarray': ['def asarray(obj: Any, *, dtype: Optional[_dtype]=None, ' - 'device: Union[_device, str, None]=None, copy: Optional[_bool]=None, ' - 'requires_grad: _bool=False) -> Tensor: ...'], - 'from_numpy': ['def from_numpy(ndarray) -> Tensor: ...'], - 'frombuffer': ['def frombuffer(buffer: Any, *, dtype: _dtype, count: int=-1, ' - 'offset: int=0, device: Union[_device, str, None]=None, ' - 'requires_grad: _bool=False) -> Tensor: ...'], - 'numel': ['def numel(self: Tensor) -> _int: ...'], - 'as_tensor': ["def as_tensor(data: Any, dtype: _dtype=None, device: Optional[_device]=None) -> Tensor: ..."], - 'get_num_threads': ['def get_num_threads() -> _int: ...'], - 'set_num_threads': ['def set_num_threads(num: _int) -> None: ...'], - 'init_num_threads': ['def init_num_threads() -> None: ...'], - 'get_num_interop_threads': ['def get_num_interop_threads() -> _int: ...'], - 'set_num_interop_threads': ['def set_num_interop_threads(num: _int) -> None: ...'], - # These functions are explicitly disabled by - # SKIP_PYTHON_BINDINGS because they are hand bound. - # Correspondingly, we must hand-write their signatures. 
- 'tensor': ["def tensor(data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)], - 'sparse_coo_tensor': ['def sparse_coo_tensor(indices: Tensor, values: Union[Tensor,List],' - ' size: Optional[_size]=None, *, dtype: Optional[_dtype]=None,' - ' device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ...'], - 'sparse_csr_tensor' : ['def sparse_csr_tensor(crow_indices: Union[Tensor, List],' - 'col_indices: Union[Tensor, List],' - ' values: Union[Tensor, List], size: Optional[_size]=None,' - ' *, dtype: Optional[_dtype]=None,' - ' device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ...'], - '_sparse_coo_tensor_unsafe': ['def _sparse_coo_tensor_unsafe(indices: Tensor, values: Tensor, size: List[int],' - ' dtype: Optional[_dtype] = None, device: Optional[_device] = None,' - ' requires_grad: bool = False) -> Tensor: ...'], - '_sparse_csr_tensor_unsafe': ['def _sparse_csr_tensor_unsafe(crow_indices: Union[Tensor, List],' - 'col_indices: Union[Tensor, List],' - ' values: Union[Tensor, List], size: List[int],' - ' dtype: Optional[_dtype] = None, device: Optional[_device] = None,' - ' requires_grad: bool = False) -> Tensor: ...'], - 'range': ['def range(start: Number, end: Number,' - ' step: Number=1, *, out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS)], - 'arange': ['def arange(start: Number, end: Number, step: Number, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def arange(start: Number, end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def arange(end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS)], - 'linspace': ['def linspace(start: Number, end: Number, steps: Optional[_int]=None, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...'.format(FACTORY_PARAMS)], - 'logspace': ['def logspace(start: Number, end: Number, steps: Optional[_int]=None, base: _float=10.0, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...'.format(FACTORY_PARAMS)], - 'randint': ['def randint(low: _int, high: _int, size: _size, *,' - ' generator: Optional[Generator]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def randint(high: _int, size: _size, *,' - ' generator: Optional[Generator]=None, {}) -> Tensor: ...' - .format(FACTORY_PARAMS)], - 'full': ['def full(size: _size, fill_value: Number, *,' - ' out: Optional[Tensor]=None,' - ' layout: _layout=strided, {}) -> Tensor: ...' - .format(FACTORY_PARAMS), - 'def full(size: _size, fill_value: Number, *,' - ' names: List[Union[str, None]],' - ' layout: _layout=strided, {}) -> Tensor: ...' 
- .format(FACTORY_PARAMS)], - 'is_grad_enabled': ['def is_grad_enabled() -> _bool: ...'], - 'is_inference_mode_enabled': ['def is_inference_mode_enabled() -> _bool: ...'], - 'nonzero': ['def nonzero(input: Tensor, *, as_tuple: Literal[False]=False, out: Optional[Tensor]=None) -> Tensor: ...', - 'def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...'], - 'binary_cross_entropy_with_logits': ['def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, ' - 'weight: Optional[Tensor] = None, size_average: Optional[bool] = None, ' - 'reduce: Optional[bool] = None, reduction: str = ..., ' - 'pos_weight: Optional[Tensor] = None) -> Tensor: ...'], - 'cosine_embedding_loss': ['def cosine_embedding_loss(input1: Tensor, input2: Tensor, ' - 'target: Tensor, margin: float = ..., size_average: Optional[bool] = ..., ' - 'reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ...'], - 'ctc_loss': ['def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor,' - ' blank: int = ..., reduction: str = ..., zero_infinity: bool = ...) -> Tensor: ...'], - 'hinge_embedding_loss': ['def hinge_embedding_loss(input: Tensor, target: Tensor, margin: float = ...,' - ' size_average: Optional[bool] = ..., reduce: Optional[bool] = ..., ' - 'reduction: str = ...) -> Tensor: ...'], - 'kl_div': ['def kl_div(input: Tensor, target: Tensor, size_average: Optional[bool] = ..., ' - 'reduce: Optional[bool] = ..., reduction: str = ..., log_target: bool = ...) -> Tensor: ...'], - 'margin_ranking_loss': ['def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor,' - ' margin: float = ..., size_average: Optional[bool] = ..., ' - ' reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ...'], - 'triplet_margin_loss': ['def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, ' - 'margin: float = ..., p: float = ..., eps: float = ..., swap: bool = ..., ' - 'size_average: Optional[bool] = ..., ' - 'reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ...'], - 'dsmm': ['def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ...'], - 'hsmm': ['def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ...'], - 'saddmm': ['def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number=1, ' - 'alpha: Number=1, out: Optional[Tensor]=None) -> Tensor: ...'], - 'spmm': ['def spmm(input: Tensor, mat2: Tensor) -> Tensor: ...'], - 'div': ['def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, ' - 'rounding_mode: Optional[str] = None, out: Optional[Tensor]=None) -> Tensor: ...'], - }) - for binop in ['mul', 'true_divide', 'floor_divide']: + + for n, n1, n2 in [ + ("csr", "crow", "col"), + ("csc", "ccol", "row"), + ("bsr", "crow", "col"), + ("bsc", "ccol", "row"), + ]: + unsorted_function_hints.update( + { + f"sparse_{n}_tensor": [ + f"def sparse_{n}_tensor({n1}_indices: Union[Tensor, List]," + f"{n2}_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: Optional[_size]=None," + " *, dtype: Optional[_dtype]=None," + " device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ..." + ], + f"_sparse_{n}_tensor_unsafe": [ + f"def _sparse_{n}_tensor_unsafe({n1}_indices: Union[Tensor, List]," + f"{n2}_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: List[int]," + " dtype: Optional[_dtype] = None, device: Optional[_device] = None," + " requires_grad: bool = False) -> Tensor: ..." 
+ ], + } + ) + + unsorted_function_hints.update( + { + "set_flush_denormal": ["def set_flush_denormal(mode: _bool) -> _bool: ..."], + "get_default_dtype": ["def get_default_dtype() -> _dtype: ..."], + "asarray": [ + "def asarray(obj: Any, *, dtype: Optional[_dtype]=None, " + "device: Union[_device, str, None]=None, copy: Optional[_bool]=None, " + "requires_grad: _bool=False) -> Tensor: ..." + ], + "from_numpy": ["def from_numpy(ndarray) -> Tensor: ..."], + "frombuffer": [ + "def frombuffer(buffer: Any, *, dtype: _dtype, count: int=-1, " + "offset: int=0, device: Union[_device, str, None]=None, " + "requires_grad: _bool=False) -> Tensor: ..." + ], + "numel": ["def numel(self: Tensor) -> _int: ..."], + "as_tensor": [ + "def as_tensor(data: Any, dtype: _dtype=None, device: Optional[_device]=None) -> Tensor: ..." + ], + "get_num_threads": ["def get_num_threads() -> _int: ..."], + "set_num_threads": ["def set_num_threads(num: _int) -> None: ..."], + "init_num_threads": ["def init_num_threads() -> None: ..."], + "get_num_interop_threads": ["def get_num_interop_threads() -> _int: ..."], + "set_num_interop_threads": [ + "def set_num_interop_threads(num: _int) -> None: ..." + ], + # These functions are explicitly disabled by + # SKIP_PYTHON_BINDINGS because they are hand bound. + # Correspondingly, we must hand-write their signatures. + "tensor": [ + "def tensor(data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS) + ], + "sparse_coo_tensor": [ + "def sparse_coo_tensor(indices: Tensor, values: Union[Tensor,List]," + " size: Optional[_size]=None, *, dtype: Optional[_dtype]=None," + " device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ..." + ], + "_sparse_coo_tensor_unsafe": [ + "def _sparse_coo_tensor_unsafe(indices: Tensor, values: Tensor, size: List[int]," + " dtype: Optional[_dtype] = None, device: Optional[_device] = None," + " requires_grad: bool = False) -> Tensor: ..." + ], + "sparse_compressed_tensor": [ + "def sparse_compressed_tensor(compressed_indices: Union[Tensor, List]," + "plain_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: Optional[_size]=None," + " *, dtype: Optional[_dtype]=None, layout: Optional[_layout] = None," + " device: Union[_device, str, None]=None, requires_grad:_bool=False) -> Tensor: ..." + ], + "_sparse_compressed_tensor_unsafe": [ + "def _sparse_compressed_tensor_unsafe(comp_indices: Union[Tensor, List]," + "plain_indices: Union[Tensor, List]," + " values: Union[Tensor, List], size: List[int]," + " dtype: Optional[_dtype] = None, layout: Optional[_layout] = None," + " device: Optional[_device] = None," + " requires_grad: bool = False) -> Tensor: ..." 
+ ], + "range": [ + "def range(start: Number, end: Number," + " step: Number=1, *, out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ) + ], + "arange": [ + "def arange(start: Number, end: Number, step: Number, *," + " out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + "def arange(start: Number, end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + "def arange(end: Number, *, out: Optional[Tensor]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + ], + "linspace": [ + "def linspace(start: Number, end: Number, steps: Optional[_int]=None, *," + " out: Optional[Tensor]=None, {}) -> Tensor: ...".format(FACTORY_PARAMS) + ], + "logspace": [ + "def logspace(start: Number, end: Number, steps: Optional[_int]=None, base: _float=10.0, *," + " out: Optional[Tensor]=None, {}) -> Tensor: ...".format(FACTORY_PARAMS) + ], + "randint": [ + "def randint(low: _int, high: _int, size: _size, *," + " generator: Optional[Generator]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + "def randint(high: _int, size: _size, *," + " generator: Optional[Generator]=None, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ), + ], + "full": [ + "def full(size: _size, fill_value: Number, *," + " out: Optional[Tensor]=None," + " layout: _layout=strided, {}) -> Tensor: ...".format(FACTORY_PARAMS), + "def full(size: _size, fill_value: Number, *," + " names: List[Union[str, None]]," + " layout: _layout=strided, {}) -> Tensor: ...".format(FACTORY_PARAMS), + ], + "is_grad_enabled": ["def is_grad_enabled() -> _bool: ..."], + "is_inference_mode_enabled": [ + "def is_inference_mode_enabled() -> _bool: ..." + ], + "nonzero": [ + "def nonzero(input: Tensor, *, as_tuple: Literal[False]=False, out: Optional[Tensor]=None) -> Tensor: ...", + "def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...", + ], + "binary_cross_entropy_with_logits": [ + "def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, " + "weight: Optional[Tensor] = None, size_average: Optional[bool] = None, " + "reduce: Optional[bool] = None, reduction: str = ..., " + "pos_weight: Optional[Tensor] = None) -> Tensor: ..." + ], + "cosine_embedding_loss": [ + "def cosine_embedding_loss(input1: Tensor, input2: Tensor, " + "target: Tensor, margin: float = ..., size_average: Optional[bool] = ..., " + "reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ..." + ], + "ctc_loss": [ + "def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor," + " blank: int = ..., reduction: str = ..., zero_infinity: bool = ...) -> Tensor: ..." + ], + "hinge_embedding_loss": [ + "def hinge_embedding_loss(input: Tensor, target: Tensor, margin: float = ...," + " size_average: Optional[bool] = ..., reduce: Optional[bool] = ..., " + "reduction: str = ...) -> Tensor: ..." + ], + "kl_div": [ + "def kl_div(input: Tensor, target: Tensor, size_average: Optional[bool] = ..., " + "reduce: Optional[bool] = ..., reduction: str = ..., log_target: bool = ...) -> Tensor: ..." + ], + "margin_ranking_loss": [ + "def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor," + " margin: float = ..., size_average: Optional[bool] = ..., " + " reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ..." 
+ ], + "triplet_margin_loss": [ + "def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, " + "margin: float = ..., p: float = ..., eps: float = ..., swap: bool = ..., " + "size_average: Optional[bool] = ..., " + "reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ..." + ], + "dsmm": ["def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ..."], + "hsmm": ["def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ..."], + "saddmm": [ + "def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number=1, " + "alpha: Number=1, out: Optional[Tensor]=None) -> Tensor: ..." + ], + "spmm": ["def spmm(input: Tensor, mat2: Tensor) -> Tensor: ..."], + "div": [ + "def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, " + "rounding_mode: Optional[str] = None, out: Optional[Tensor]=None) -> Tensor: ..." + ], + } + ) + for binop in ["mul", "true_divide", "floor_divide"]: unsorted_function_hints[binop].append( - 'def {}(input: Union[Tensor, Number],' - ' other: Union[Tensor, Number],' - ' *, out: Optional[Tensor]=None) -> Tensor: ...'.format(binop)) - for binop in ['add', 'sub']: + "def {}(input: Union[Tensor, Number]," + " other: Union[Tensor, Number]," + " *, out: Optional[Tensor]=None) -> Tensor: ...".format(binop) + ) + for binop in ["add", "sub"]: unsorted_function_hints[binop].append( - 'def {}(input: Union[Tensor, Number],' - ' other: Union[Tensor, Number],' - ' *, alpha: Optional[Number]=1, out: Optional[Tensor]=None) -> Tensor: ...'.format(binop)) - - native_functions = parse_native_yaml(native_yaml_path).native_functions + "def {}(input: Union[Tensor, Number]," + " other: Union[Tensor, Number]," + " *, alpha: Optional[Number]=1, out: Optional[Tensor]=None) -> Tensor: ...".format( + binop + ) + ) + + native_functions = parse_native_yaml( + native_yaml_path, tags_yaml_path + ).native_functions native_functions = list(filter(should_generate_py_binding, native_functions)) - function_signatures = load_signatures(native_functions, deprecated_yaml_path, method=False, pyi=True) + function_signatures = load_signatures( + native_functions, deprecated_yaml_path, method=False, pyi=True + ) sig_groups = get_py_torch_functions(function_signatures) for group in sorted(sig_groups, key=lambda g: g.signature.name): name = group.signature.name unsorted_function_hints[name] += generate_type_hints(group) - named_tuple = group.signature.returns.named_tuple_pyi() + named_tuple = returns_named_tuple_pyi(group.signature) if named_tuple is not None and not group.signature.deprecated: # deprecated namedtuples are currently not included for torch functions tuple_name, tuple_def = named_tuple @@ -408,122 +569,193 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - function_hints = [] for name, hints in sorted(unsorted_function_hints.items()): if len(hints) > 1: - hints = ['@overload\n' + h for h in hints] + hints = ["@overload\n" + h for h in hints] function_hints += hints # Generate type signatures for Tensor methods # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ unsorted_tensor_method_hints: Dict[str, List[str]] = collections.defaultdict(list) - unsorted_tensor_method_hints.update({ - 'size': ['def size(self) -> Size: ...', - 'def size(self, dim: _int) -> _int: ...'], - 'stride': ['def stride(self) -> Tuple[_int]: ...', - 'def stride(self, _int) -> _int: ...'], - 'new_ones': ['def new_ones(self, size: _size, {}) -> Tensor: ...'. 
- format(FACTORY_PARAMS)], - 'new_tensor': ["def new_tensor(self, data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)], - # new and __init__ have the same signatures differ only in return type - # Adapted from legacy_tensor_ctor and legacy_tensor_new - 'new': ['def new(self, *args: Any, {}) ->Tensor: ...'.format(DEVICE_PARAM), - 'def new(self, storage: Storage) -> Tensor: ...', - 'def new(self, other: Tensor) -> Tensor: ...', - 'def new(self, size: _size, *, {}) -> Tensor: ...'.format(DEVICE_PARAM), - ], - '__init__': ['def __init__(self, *args: Any, {}) -> None: ...'.format(DEVICE_PARAM), - 'def __init__(self, storage: Storage) -> None: ...', - 'def __init__(self, other: Tensor) -> None: ...', - 'def __init__(self, size: _size, *, {}) -> None: ...'.format(DEVICE_PARAM), - ], - 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], - '_make_subclass': ["def _make_subclass(cls, data: Tensor, require_grad: _bool = False) -> Tensor: ..."], - '__getitem__': ["def __getitem__(self, {}) -> Tensor: ...".format(INDICES)], - '__setitem__': ["def __setitem__(self, {}, val: Union[Tensor, Number])" - " -> None: ...".format(INDICES)], - 'tolist': ['def tolist(self) -> List: ...'], - 'requires_grad_': ['def requires_grad_(self, mode: _bool=True) -> Tensor: ...'], - 'element_size': ['def element_size(self) -> _int: ...'], - 'data_ptr': ['def data_ptr(self) -> _int: ...'], - 'dim': ['def dim(self) -> _int: ...'], - 'nonzero': ['def nonzero(self, *, as_tuple: Literal[False]=False) -> Tensor: ...', - 'def nonzero(self, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...'], - 'numel': ['def numel(self) -> _int: ...'], - 'ndimension': ['def ndimension(self) -> _int: ...'], - 'nelement': ['def nelement(self) -> _int: ...'], - 'cuda': ['def cuda(self, device: Optional[Union[_device, _int, str]]=None, non_blocking: _bool=False) -> Tensor: ...'], - 'numpy': ['def numpy(self) -> Any: ...'], - 'apply_': ['def apply_(self, callable: Callable) -> Tensor: ...'], - 'map_': ['def map_(self, tensor: Tensor, callable: Callable) -> Tensor: ...'], - 'map2_': ['def map2_(self, x: Tensor, y: Tensor, callable: Callable) -> Tensor: ...'], - 'storage': ['def _storage(self) -> Storage: ...'], - 'storage_type': ['def storage_type(self) -> Storage: ...'], - 'type': ['def type(self, dtype: None=None, non_blocking: _bool=False) -> str: ...', - 'def type(self, dtype: Union[str, _dtype], non_blocking: _bool=False) -> Tensor: ...', - ], - 'get_device': ['def get_device(self) -> _int: ...'], - 'contiguous': ['def contiguous(self, memory_format=torch.contiguous_format) -> Tensor: ...'], - 'has_names': ['def has_names(self) -> _bool: ...'], - 'is_contiguous': ['def is_contiguous(self, memory_format=torch.contiguous_format) -> _bool: ...'], - '_is_view': ['def _is_view(self) -> _bool: ...'], - 'is_cuda': ['is_cuda: _bool'], - 'is_leaf': ['is_leaf: _bool'], - 'is_sparse': ['is_sparse: _bool'], - 'is_sparse_csr' : ['is_sparse_csr: _bool'], - 'is_quantized': ['is_quantized: _bool'], - 'is_meta': ['is_meta: _bool'], - 'is_ort': ['is_ort: _bool'], - 'is_mkldnn': ['is_mkldnn: _bool'], - 'is_vulkan': ['is_vulkan: _bool'], - 'storage_offset': ['def storage_offset(self) -> _int: ...'], - 'to': ['def to(self, dtype: _dtype, non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...', - 'def to(self, device: Optional[Union[_device, str]]=None, dtype: Optional[_dtype]=None, ' - 'non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...', - 'def to(self, other: Tensor, non_blocking: _bool=False, copy: _bool=False) -> Tensor: 
...', - ], - 'item': ["def item(self) -> Number: ..."], - 'copy_': ["def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..."], - 'set_': ['def set_(self, storage: Union[Storage, TypedStorage], offset: _int, size: _size, stride: _size) -> Tensor: ...', - 'def set_(self, storage: Union[Storage, TypedStorage]) -> Tensor: ...'], - 'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', - 'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], - 'div': ['def div(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ...'], - 'div_': ['def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ...'], - }) - for binop in ['mul', 'true_divide', 'floor_divide']: + unsorted_tensor_method_hints.update( + { + "size": [ + "def size(self) -> Size: ...", + "def size(self, dim: _int) -> _int: ...", + ], + "stride": [ + "def stride(self) -> Tuple[_int]: ...", + "def stride(self, _int) -> _int: ...", + ], + "new_ones": [ + "def new_ones(self, size: _size, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ) + ], + "new_tensor": [ + "def new_tensor(self, data: Any, {}) -> Tensor: ...".format( + FACTORY_PARAMS + ) + ], + # new and __init__ have the same signatures differ only in return type + # Adapted from legacy_tensor_ctor and legacy_tensor_new + "new": [ + "def new(self, *args: Any, {}) ->Tensor: ...".format(DEVICE_PARAM), + "def new(self, storage: Storage) -> Tensor: ...", + "def new(self, other: Tensor) -> Tensor: ...", + "def new(self, size: _size, *, {}) -> Tensor: ...".format(DEVICE_PARAM), + ], + "__init__": [ + "def __init__(self, *args: Any, {}) -> None: ...".format(DEVICE_PARAM), + "def __init__(self, storage: Storage) -> None: ...", + "def __init__(self, other: Tensor) -> None: ...", + "def __init__(self, size: _size, *, {}) -> None: ...".format( + DEVICE_PARAM + ), + ], + "as_subclass": ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], + "_make_subclass": [ + "def _make_subclass(cls, data: Tensor, require_grad: _bool = False) -> Tensor: ..." + ], + "__getitem__": ["def __getitem__(self, {}) -> Tensor: ...".format(INDICES)], + "__setitem__": [ + "def __setitem__(self, {}, val: Union[Tensor, Number])" + " -> None: ...".format(INDICES) + ], + "tolist": ["def tolist(self) -> List: ..."], + "requires_grad_": [ + "def requires_grad_(self, mode: _bool=True) -> Tensor: ..." + ], + "element_size": ["def element_size(self) -> _int: ..."], + "data_ptr": ["def data_ptr(self) -> _int: ..."], + "dim": ["def dim(self) -> _int: ..."], + "nonzero": [ + "def nonzero(self, *, as_tuple: Literal[False]=False) -> Tensor: ...", + "def nonzero(self, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: ...", + ], + "numel": ["def numel(self) -> _int: ..."], + "ndimension": ["def ndimension(self) -> _int: ..."], + "nelement": ["def nelement(self) -> _int: ..."], + "cuda": [ + "def cuda(self, device: Optional[Union[_device, _int, str]]=None, non_blocking: _bool=False) -> Tensor: ..." + ], + "numpy": ["def numpy(self) -> Any: ..."], + "apply_": ["def apply_(self, callable: Callable) -> Tensor: ..."], + "map_": [ + "def map_(self, tensor: Tensor, callable: Callable) -> Tensor: ..." + ], + "map2_": [ + "def map2_(self, x: Tensor, y: Tensor, callable: Callable) -> Tensor: ..." 
+ ], + "storage": ["def _storage(self) -> Storage: ..."], + "storage_type": ["def storage_type(self) -> Storage: ..."], + "type": [ + "def type(self, dtype: None=None, non_blocking: _bool=False) -> str: ...", + "def type(self, dtype: Union[str, _dtype], non_blocking: _bool=False) -> Tensor: ...", + ], + "get_device": ["def get_device(self) -> _int: ..."], + "contiguous": [ + "def contiguous(self, memory_format=torch.contiguous_format) -> Tensor: ..." + ], + "has_names": ["def has_names(self) -> _bool: ..."], + "is_contiguous": [ + "def is_contiguous(self, memory_format=torch.contiguous_format) -> _bool: ..." + ], + "_is_view": ["def _is_view(self) -> _bool: ..."], + "is_cuda": ["is_cuda: _bool"], + "is_leaf": ["is_leaf: _bool"], + "is_nested": ["is_nested: _bool"], + "is_sparse": ["is_sparse: _bool"], + "is_sparse_csr": ["is_sparse_csr: _bool"], + "is_quantized": ["is_quantized: _bool"], + "is_meta": ["is_meta: _bool"], + "is_ort": ["is_ort: _bool"], + "is_mkldnn": ["is_mkldnn: _bool"], + "is_vulkan": ["is_vulkan: _bool"], + "is_ipu": ["is_ipu: _bool"], + "storage_offset": ["def storage_offset(self) -> _int: ..."], + "to": [ + "def to(self, dtype: _dtype, non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...", + "def to(self, device: Optional[Union[_device, str]]=None, dtype: Optional[_dtype]=None, " + "non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...", + "def to(self, other: Tensor, non_blocking: _bool=False, copy: _bool=False) -> Tensor: ...", + ], + "item": ["def item(self) -> Number: ..."], + "copy_": [ + "def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..." + ], + "set_": [ + "def set_(self, storage: Union[Storage, _TypedStorage], offset: _int, size: _size, stride: _size) -> Tensor: ...", + "def set_(self, storage: Union[Storage, _TypedStorage]) -> Tensor: ...", + ], + "split": [ + "def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...", + "def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...", + ], + "div": [ + "def div(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ..." + ], + "div_": [ + "def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ..." 
+ ], + } + ) + for binop in ["mul", "true_divide", "floor_divide"]: for inplace in [False, True]: - out_suffix = ', *, out: Optional[Tensor]=None' + out_suffix = ", *, out: Optional[Tensor]=None" if inplace: - binop += '_' - out_suffix = '' + binop += "_" + out_suffix = "" unsorted_tensor_method_hints[binop].append( - 'def {}(self, other: Union[Tensor, Number]{})' - ' -> Tensor: ...'.format(binop, out_suffix)) - for binop in ['add', 'sub']: + "def {}(self, other: Union[Tensor, Number]{})" + " -> Tensor: ...".format(binop, out_suffix) + ) + for binop in ["add", "sub"]: for inplace in [False, True]: - out_suffix = ', out: Optional[Tensor]=None' + out_suffix = ", out: Optional[Tensor]=None" if inplace: - binop += '_' - out_suffix = '' + binop += "_" + out_suffix = "" unsorted_tensor_method_hints[binop].append( - 'def {}(self, other: Union[Tensor, Number], ' - '*, alpha: Optional[Number]=1{})' - ' -> Tensor: ...'.format(binop, out_suffix)) - simple_conversions = ['byte', 'char', 'cpu', 'double', 'float', - 'half', 'int', 'long', 'short', 'bool', - 'bfloat16'] + "def {}(self, other: Union[Tensor, Number], " + "*, alpha: Optional[Number]=1{})" + " -> Tensor: ...".format(binop, out_suffix) + ) + simple_conversions = [ + "byte", + "char", + "cpu", + "double", + "float", + "half", + "int", + "long", + "short", + "bool", + "bfloat16", + ] for name in simple_conversions: - unsorted_tensor_method_hints[name].append('def {}(self) -> Tensor: ...'.format(name)) + unsorted_tensor_method_hints[name].append( + "def {}(self) -> Tensor: ...".format(name) + ) # pyi tensor methods don't currently include deprecated signatures for some reason # TODO: we should probably add them in - tensor_method_signatures = load_signatures(native_functions, deprecated_yaml_path, method=True, skip_deprecated=True, pyi=True) - tensor_method_sig_groups = get_py_torch_functions(tensor_method_signatures, method=True) + tensor_method_signatures = load_signatures( + native_functions, + deprecated_yaml_path, + method=True, + skip_deprecated=True, + pyi=True, + ) + tensor_method_sig_groups = get_py_torch_functions( + tensor_method_signatures, method=True + ) for group in sorted(tensor_method_sig_groups, key=lambda g: g.signature.name): name = group.signature.name unsorted_tensor_method_hints[name] += generate_type_hints(group) - named_tuple = group.signature.returns.named_tuple_pyi() + named_tuple = returns_named_tuple_pyi(group.signature) if named_tuple is not None and not group.signature.deprecated: # deprecated namedtuples are currently not included for torch functions tuple_name, tuple_def = named_tuple @@ -533,13 +765,13 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - namedtuples[tuple_name] = tuple_def for op in all_ops: - name = '__{}__'.format(op) + name = "__{}__".format(op) unsorted_tensor_method_hints[name] += sig_for_ops(name) tensor_method_hints = [] for name, hints in sorted(unsorted_tensor_method_hints.items()): if len(hints) > 1: - hints = ['@overload\n' + h for h in hints] + hints = ["@overload\n" + h for h in hints] tensor_method_hints += hints # TODO: Missing type hints for nn @@ -547,92 +779,182 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - # Generate namedtuple definitions # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - namedtuple_defs = ['{} = {}'.format(name, defn) for name, defn in namedtuples.items()] + namedtuple_defs = [ + "{} = {}".format(name, defn) for name, defn in namedtuples.items() + ] # Generate type signatures for legacy classes 
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # TODO: These are deprecated, maybe we shouldn't type hint them legacy_storage_base_hints = [] - dt = ('Double', 'Float', 'Long', 'Int', - 'Short', 'Char', 'Byte', 'Bool', - 'Half', 'BFloat16', 'ComplexDouble', - 'ComplexFloat', 'QUInt8', 'QInt8', 'QInt32', 'QUInt4x2', 'QUInt2x4') + dt = ( + "Double", + "Float", + "Long", + "Int", + "Short", + "Char", + "Byte", + "Bool", + "Half", + "BFloat16", + "ComplexDouble", + "ComplexFloat", + "QUInt8", + "QInt8", + "QInt32", + "QUInt4x2", + "QUInt2x4", + ) for c in dt: - legacy_storage_base_hints.append('class {}StorageBase(object): ...'.format(c)) + legacy_storage_base_hints.append("class {}StorageBase(object): ...".format(c)) for c in dt: - legacy_storage_base_hints.append('class Cuda{}StorageBase(object): ...'.format(c)) + legacy_storage_base_hints.append( + "class Cuda{}StorageBase(object): ...".format(c) + ) legacy_class_hints = [] - for c in ('DoubleTensor', 'FloatTensor', 'LongTensor', 'IntTensor', - 'ShortTensor', 'HalfTensor', 'CharTensor', 'ByteTensor', 'BoolTensor'): - legacy_class_hints.append('class {}(Tensor): ...'.format(c)) + for c in ( + "DoubleTensor", + "FloatTensor", + "LongTensor", + "IntTensor", + "ShortTensor", + "HalfTensor", + "CharTensor", + "ByteTensor", + "BoolTensor", + ): + legacy_class_hints.append("class {}(Tensor): ...".format(c)) # Generate type signatures for dtype classes # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # TODO: don't explicitly list dtypes here; get it from canonical # source - dtype_class_hints = ['{}: dtype = ...'.format(n) - for n in - ['float32', 'float', 'float64', 'double', 'float16', 'bfloat16', 'half', - 'uint8', 'int8', 'int16', 'short', 'int32', 'int', 'int64', 'long', - 'complex32', 'complex64', 'cfloat', 'complex128', 'cdouble', - 'quint8', 'qint8', 'qint32', 'bool', 'quint4x2', 'quint2x4']] + dtype_class_hints = [ + "{}: dtype = ...".format(n) + for n in [ + "float32", + "float", + "float64", + "double", + "float16", + "bfloat16", + "half", + "uint8", + "int8", + "int16", + "short", + "int32", + "int", + "int64", + "long", + "complex32", + "complex64", + "cfloat", + "complex128", + "cdouble", + "quint8", + "qint8", + "qint32", + "bool", + "quint4x2", + "quint2x4", + ] + ] # Generate __all__ directive # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Include only the functions that contain hints, to prevent undefined # symbols to be included in the `__all__` directive. 
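# Illustration only (not part of this diff): a minimal sketch of the pformat-based
# __all__ emission handled in the surrounding hunk, using a placeholder symbol list
# in place of the real hinted function and namedtuple names.
from pprint import pformat

all_symbols = sorted(["abs", "add", "arange", "zeros", "zeros_like"])  # placeholders
all_directive = pformat(all_symbols, width=100, compact=True).split("\n")
all_directive[0] = "__all__ = {}".format(all_directive[0])
# all_directive[0] is now: __all__ = ['abs', 'add', 'arange', 'zeros', 'zeros_like']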
- hinted_function_names = [name for name, hint in unsorted_function_hints.items() if hint] + hinted_function_names = [ + name for name, hint in unsorted_function_hints.items() if hint + ] all_symbols = sorted(list(namedtuples.keys()) + hinted_function_names) - all_directive = pformat(all_symbols, width=100, compact=True).split('\n') - all_directive[0] = '__all__ = {}'.format(all_directive[0]) + all_directive = pformat(all_symbols, width=100, compact=True).split("\n") + all_directive[0] = "__all__ = {}".format(all_directive[0]) # Write out the stub # ~~~~~~~~~~~~~~~~~~ env = { - 'namedtuple_defs': namedtuple_defs, - 'function_hints': function_hints, - 'tensor_method_hints': tensor_method_hints, - 'legacy_class_hints': legacy_class_hints, - 'legacy_storage_base_hints': legacy_storage_base_hints, - 'dtype_class_hints': dtype_class_hints, - 'all_directive': all_directive + "namedtuple_defs": namedtuple_defs, + "function_hints": function_hints, + "tensor_method_hints": tensor_method_hints, + "legacy_class_hints": legacy_class_hints, + "legacy_storage_base_hints": legacy_storage_base_hints, + "dtype_class_hints": dtype_class_hints, + "all_directive": all_directive, } - fm.write_with_template('torch/_C/__init__.pyi', 'torch/_C/__init__.pyi.in', lambda: { - 'generated_comment': '@' + 'generated from torch/_C/__init__.pyi.in', - **env, - }) - fm.write_with_template('torch/_C/_VariableFunctions.pyi', 'torch/_C/_VariableFunctions.pyi.in', lambda: { - 'generated_comment': '@' + 'generated from torch/_C/_VariableFunctions.pyi.in', - **env, - }) - fm.write_with_template('torch/_VF.pyi', 'torch/_C/_VariableFunctions.pyi.in', lambda: { - 'generated_comment': '@' + 'generated from torch/_C/_VariableFunctions.pyi.in', - **env, - }) + fm.write_with_template( + "torch/_C/__init__.pyi", + "torch/_C/__init__.pyi.in", + lambda: { + "generated_comment": "@" + "generated from torch/_C/__init__.pyi.in", + **env, + }, + ) + fm.write_with_template( + "torch/_C/_VariableFunctions.pyi", + "torch/_C/_VariableFunctions.pyi.in", + lambda: { + "generated_comment": "@" + + "generated from torch/_C/_VariableFunctions.pyi.in", + **env, + }, + ) + fm.write_with_template( + "torch/_VF.pyi", + "torch/_C/_VariableFunctions.pyi.in", + lambda: { + "generated_comment": "@" + + "generated from torch/_C/_VariableFunctions.pyi.in", + **env, + }, + ) + fm.write_with_template( + "torch/return_types.pyi", + "torch/_C/return_types.pyi.in", + lambda: { + "generated_comment": "@" + "generated from torch/_C/return_types.pyi", + **env, + }, + ) gen_nn_functional(fm) def main() -> None: - parser = argparse.ArgumentParser( - description='Generate type stubs for PyTorch') - parser.add_argument('--native-functions-path', metavar='NATIVE', - default='aten/src/ATen/native/native_functions.yaml', - help='path to native_functions.yaml') - parser.add_argument('--deprecated-functions-path', metavar='DEPRECATED', - default='tools/autograd/deprecated.yaml', - help='path to deprecated.yaml') - parser.add_argument('--out', metavar='OUT', - default='.', - help='path to output directory') + parser = argparse.ArgumentParser(description="Generate type stubs for PyTorch") + parser.add_argument( + "--native-functions-path", + metavar="NATIVE", + default="aten/src/ATen/native/native_functions.yaml", + help="path to native_functions.yaml", + ) + parser.add_argument( + "--tags-path", + metavar="TAGS", + default="aten/src/ATen/native/tags.yaml", + help="path to tags.yaml", + ) + parser.add_argument( + "--deprecated-functions-path", + metavar="DEPRECATED", + 
default="tools/autograd/deprecated.yaml", + help="path to deprecated.yaml", + ) + parser.add_argument( + "--out", metavar="OUT", default=".", help="path to output directory" + ) args = parser.parse_args() - fm = FileManager(install_dir=args.out, template_dir='.', dry_run=False) - gen_pyi(args.native_functions_path, args.deprecated_functions_path, fm) + fm = FileManager(install_dir=args.out, template_dir=".", dry_run=False) + gen_pyi( + args.native_functions_path, args.tags_path, args.deprecated_functions_path, fm + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/render_junit.py b/tools/render_junit.py index 28e617af0e8f..68adadde0449 100644 --- a/tools/render_junit.py +++ b/tools/render_junit.py @@ -16,12 +16,15 @@ except ImportError: print("rich not found, for color output use 'pip install rich'") + def parse_junit_reports(path_to_reports: str) -> List[TestCase]: # type: ignore[no-any-unimported] def parse_file(path: str) -> List[TestCase]: # type: ignore[no-any-unimported] try: return convert_junit_to_testcases(JUnitXml.fromfile(path)) except Exception as err: - rich.print(f":Warning: [yellow]Warning[/yellow]: Failed to read {path}: {err}") + rich.print( + f":Warning: [yellow]Warning[/yellow]: Failed to read {path}: {err}" + ) return [] if not os.path.exists(path_to_reports): @@ -46,6 +49,7 @@ def convert_junit_to_testcases(xml: Union[JUnitXml, TestSuite]) -> List[TestCase testcases.append(item) return testcases + def render_tests(testcases: List[TestCase]) -> None: # type: ignore[no-any-unimported] num_passed = 0 num_skipped = 0 @@ -64,14 +68,15 @@ def render_tests(testcases: List[TestCase]) -> None: # type: ignore[no-any-unim else: num_skipped += 1 continue - rich.print(f"{icon} [bold red]{testcase.classname}.{testcase.name}[/bold red]") + rich.print( + f"{icon} [bold red]{testcase.classname}.{testcase.name}[/bold red]" + ) print(f"{result.text}") rich.print(f":white_check_mark: {num_passed} [green]Passed[green]") rich.print(f":dash: {num_skipped} [grey]Skipped[grey]") rich.print(f":rotating_light: {num_failed} [grey]Failed[grey]") - def parse_args() -> Any: parser = argparse.ArgumentParser( description="Render xunit output for failed tests", diff --git a/tools/setup_helpers/BUILD.bazel b/tools/setup_helpers/BUILD.bazel new file mode 100644 index 000000000000..28dcd1b5b47c --- /dev/null +++ b/tools/setup_helpers/BUILD.bazel @@ -0,0 +1,15 @@ +py_binary( + name = "generate_code", + srcs = ["generate_code.py"], + deps = [ + "//tools/autograd", + "//torchgen", + ], + visibility = ["//:__pkg__"], +) + +py_binary( + name = "gen_version_header", + srcs = ["gen_version_header.py"], + visibility = ["//:__pkg__"], +) diff --git a/tools/setup_helpers/BUILD.buck b/tools/setup_helpers/BUILD.buck new file mode 100644 index 000000000000..afcd31fb3a03 --- /dev/null +++ b/tools/setup_helpers/BUILD.buck @@ -0,0 +1,41 @@ +python_library( + name = "generate_code", + srcs = [ + "generate_code.py", + ], + base_module = "tools.setup_helpers", + deps = [ + "//tools/autograd:autograd", + "//tools/jit:jit", + "//torchgen:torchgen", + ], +) + +python_binary( + name = "generate_code_bin", + main_module = "tools.setup_helpers.generate_code", + visibility = ["PUBLIC"], + # package_style = "inplace", + zip_safe = False, + deps = [ + ":generate_code", + ], +) + +python_library( + name = "gen-version-header-lib", + srcs = [ + "gen_version_header.py", + ], + base_module = "tools.setup_helpers", + deps = [], +) + +python_binary( + name = "gen-version-header", + main_module = 
"tools.setup_helpers.gen_version_header", + visibility = ["PUBLIC"], + deps = [ + ":gen-version-header-lib", + ], +) diff --git a/tools/setup_helpers/__init__.py b/tools/setup_helpers/__init__.py index fa892dfb6e6f..4bf1747e80c6 100644 --- a/tools/setup_helpers/__init__.py +++ b/tools/setup_helpers/__init__.py @@ -8,8 +8,8 @@ def which(thefile: str) -> Optional[str]: for d in path: fname = os.path.join(d, thefile) fnames = [fname] - if sys.platform == 'win32': - exts = os.environ.get('PATHEXT', '').split(os.pathsep) + if sys.platform == "win32": + exts = os.environ.get("PATHEXT", "").split(os.pathsep) fnames += [fname + ext for ext in exts] for name in fnames: if os.access(name, os.F_OK | os.X_OK) and not os.path.isdir(name): diff --git a/tools/setup_helpers/build.bzl b/tools/setup_helpers/build.bzl new file mode 100644 index 000000000000..c5be13e4603b --- /dev/null +++ b/tools/setup_helpers/build.bzl @@ -0,0 +1,17 @@ +def define_targets(rules): + rules.py_binary( + name = "generate_code", + srcs = ["generate_code.py"], + visibility = ["//:__pkg__"], + deps = [ + rules.requirement("PyYAML"), + "//tools/autograd", + "//torchgen", + ], + ) + + rules.py_binary( + name = "gen_version_header", + srcs = ["gen_version_header.py"], + visibility = ["//:__pkg__"], + ) diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index ff175771fd18..2c48e2807cdf 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -1,9 +1,9 @@ "Manages CMake." - import multiprocessing import os +import platform import re from subprocess import check_call, check_output, CalledProcessError import sys @@ -12,7 +12,7 @@ from typing import IO, Any, Dict, List, Optional, Union, cast from . import which -from .env import (BUILD_DIR, IS_64BIT, IS_DARWIN, IS_WINDOWS, check_negative_env_flag) +from .env import BUILD_DIR, IS_64BIT, IS_DARWIN, IS_WINDOWS, check_negative_env_flag from .numpy_ import USE_NUMPY, NUMPY_INCLUDE_DIR @@ -20,20 +20,23 @@ def _mkdir_p(d: str) -> None: try: os.makedirs(d, exist_ok=True) except OSError as e: - raise RuntimeError(f"Failed to create folder {os.path.abspath(d)}: {e.strerror}") from e + raise RuntimeError( + f"Failed to create folder {os.path.abspath(d)}: {e.strerror}" + ) from e # Ninja # Use ninja if it is on the PATH. Previous version of PyTorch required the # ninja python package, but we no longer use it, so we do not have to import it -USE_NINJA = (not check_negative_env_flag('USE_NINJA') and - which('ninja') is not None) +USE_NINJA = not check_negative_env_flag("USE_NINJA") and which("ninja") is not None CMakeValue = Optional[Union[bool, str]] -def convert_cmake_value_to_python_value(cmake_value: str, cmake_type: str) -> CMakeValue: +def convert_cmake_value_to_python_value( + cmake_value: str, cmake_type: str +) -> CMakeValue: r"""Convert a CMake value in a string form to a Python value. 
Args: @@ -46,18 +49,24 @@ def convert_cmake_value_to_python_value(cmake_value: str, cmake_type: str) -> CM cmake_type = cmake_type.upper() up_val = cmake_value.upper() - if cmake_type == 'BOOL': + if cmake_type == "BOOL": # https://gitlab.kitware.com/cmake/community/wikis/doc/cmake/VariablesListsStrings#boolean-values-in-cmake - return not (up_val in ('FALSE', 'OFF', 'N', 'NO', '0', '', 'NOTFOUND') or up_val.endswith('-NOTFOUND')) - elif cmake_type == 'FILEPATH': - if up_val.endswith('-NOTFOUND'): + return not ( + up_val in ("FALSE", "OFF", "N", "NO", "0", "", "NOTFOUND") + or up_val.endswith("-NOTFOUND") + ) + elif cmake_type == "FILEPATH": + if up_val.endswith("-NOTFOUND"): return None else: return cmake_value else: # Directly return the cmake_value. return cmake_value -def get_cmake_cache_variables_from_file(cmake_cache_file: IO[str]) -> Dict[str, CMakeValue]: + +def get_cmake_cache_variables_from_file( + cmake_cache_file: IO[str], +) -> Dict[str, CMakeValue]: r"""Gets values in CMakeCache.txt into a dictionary. Args: @@ -69,7 +78,7 @@ def get_cmake_cache_variables_from_file(cmake_cache_file: IO[str]) -> Dict[str, results = dict() for i, line in enumerate(cmake_cache_file, 1): line = line.strip() - if not line or line.startswith(('#', '//')): + if not line or line.startswith(("#", "//")): # Blank or comment line, skip continue @@ -82,19 +91,24 @@ def get_cmake_cache_variables_from_file(cmake_cache_file: IO[str]) -> Dict[str, # USE_CUDA:=ON # Intel(R) MKL-DNN_SOURCE_DIR:STATIC=/path/to/pytorch/third_party/ideep/mkl-dnn # "OpenMP_COMPILE_RESULT_CXX_openmp:experimental":INTERNAL=FALSE - matched = re.match(r'("?)(.+?)\1(?::\s*([a-zA-Z_-][a-zA-Z0-9_-]*)?)?\s*=\s*(.*)', line) + matched = re.match( + r'("?)(.+?)\1(?::\s*([a-zA-Z_-][a-zA-Z0-9_-]*)?)?\s*=\s*(.*)', line + ) if matched is None: # Illegal line - raise ValueError('Unexpected line {} in {}: {}'.format(i, repr(cmake_cache_file), line)) + raise ValueError( + "Unexpected line {} in {}: {}".format(i, repr(cmake_cache_file), line) + ) _, variable, type_, value = matched.groups() if type_ is None: - type_ = '' - if type_.upper() in ('INTERNAL', 'STATIC'): + type_ = "" + if type_.upper() in ("INTERNAL", "STATIC"): # CMake internal variable, do not touch continue results[variable] = convert_cmake_value_to_python_value(value, type_) return results + class CMake: "Manages cmake." @@ -109,31 +123,36 @@ def _cmake_cache_file(self) -> str: Returns: string: The path to CMakeCache.txt. """ - return os.path.join(self.build_dir, 'CMakeCache.txt') + return os.path.join(self.build_dir, "CMakeCache.txt") @staticmethod def _get_cmake_command() -> str: "Returns cmake command." 
- cmake_command = 'cmake' + cmake_command = "cmake" if IS_WINDOWS: return cmake_command - cmake3_version = CMake._get_version(which('cmake3')) - cmake_version = CMake._get_version(which('cmake')) + cmake3_version = CMake._get_version(which("cmake3")) + cmake_version = CMake._get_version(which("cmake")) _cmake_min_version = LooseVersion("3.10.0") - if all((ver is None or ver < _cmake_min_version for ver in [cmake_version, cmake3_version])): - raise RuntimeError('no cmake or cmake3 with version >= 3.10.0 found') + if all( + ( + ver is None or ver < _cmake_min_version + for ver in [cmake_version, cmake3_version] + ) + ): + raise RuntimeError("no cmake or cmake3 with version >= 3.10.0 found") if cmake3_version is None: - cmake_command = 'cmake' + cmake_command = "cmake" elif cmake_version is None: - cmake_command = 'cmake3' + cmake_command = "cmake3" else: if cmake3_version >= cmake_version: - cmake_command = 'cmake3' + cmake_command = "cmake3" else: - cmake_command = 'cmake' + cmake_command = "cmake" return cmake_command @staticmethod @@ -142,16 +161,16 @@ def _get_version(cmd: Optional[str]) -> Any: if cmd is None: return None - for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): - if 'version' in line: - return LooseVersion(line.strip().split(' ')[2]) - raise RuntimeError('no version found') + for line in check_output([cmd, "--version"]).decode("utf-8").split("\n"): + if "version" in line: + return LooseVersion(line.strip().split(" ")[2]) + raise RuntimeError("no version found") def run(self, args: List[str], env: Dict[str, str]) -> None: "Executes cmake with arguments and an environment." command = [self._cmake_command] + args - print(' '.join(command)) + print(" ".join(command)) try: check_call(command, cwd=self.build_dir, env=env) except (CalledProcessError, KeyboardInterrupt) as e: @@ -165,7 +184,7 @@ def defines(args: List[str], **kwargs: CMakeValue) -> None: "Adds definitions to a cmake argument list." for key, value in sorted(kwargs.items()): if value is not None: - args.append('-D{}={}'.format(key, value)) + args.append("-D{}={}".format(key, value)) def get_cmake_cache_variables(self) -> Dict[str, CMakeValue]: r"""Gets values in CMakeCache.txt into a dictionary. @@ -189,45 +208,54 @@ def generate( if rerun and os.path.isfile(self._cmake_cache_file): os.remove(self._cmake_cache_file) - ninja_build_file = os.path.join(self.build_dir, 'build.ninja') + ninja_build_file = os.path.join(self.build_dir, "build.ninja") if os.path.exists(self._cmake_cache_file) and not ( - USE_NINJA and not os.path.exists(ninja_build_file)): + USE_NINJA and not os.path.exists(ninja_build_file) + ): # Everything's in place. Do not rerun. 
return args = [] if USE_NINJA: # Avoid conflicts in '-G' and the `CMAKE_GENERATOR` - os.environ['CMAKE_GENERATOR'] = 'Ninja' - args.append('-GNinja') + os.environ["CMAKE_GENERATOR"] = "Ninja" + args.append("-GNinja") elif IS_WINDOWS: - generator = os.getenv('CMAKE_GENERATOR', 'Visual Studio 15 2017') - supported = ['Visual Studio 15 2017', 'Visual Studio 16 2019'] + generator = os.getenv("CMAKE_GENERATOR", "Visual Studio 15 2017") + supported = ["Visual Studio 15 2017", "Visual Studio 16 2019"] if generator not in supported: - print('Unsupported `CMAKE_GENERATOR`: ' + generator) - print('Please set it to one of the following values: ') - print('\n'.join(supported)) + print("Unsupported `CMAKE_GENERATOR`: " + generator) + print("Please set it to one of the following values: ") + print("\n".join(supported)) sys.exit(1) - args.append('-G' + generator) + args.append("-G" + generator) toolset_dict = {} - toolset_version = os.getenv('CMAKE_GENERATOR_TOOLSET_VERSION') + toolset_version = os.getenv("CMAKE_GENERATOR_TOOLSET_VERSION") if toolset_version is not None: - toolset_dict['version'] = toolset_version - curr_toolset = os.getenv('VCToolsVersion') + toolset_dict["version"] = toolset_version + curr_toolset = os.getenv("VCToolsVersion") if curr_toolset is None: - print('When you specify `CMAKE_GENERATOR_TOOLSET_VERSION`, you must also ' - 'activate the vs environment of this version. Please read the notes ' - 'in the build steps carefully.') + print( + "When you specify `CMAKE_GENERATOR_TOOLSET_VERSION`, you must also " + "activate the vs environment of this version. Please read the notes " + "in the build steps carefully." + ) sys.exit(1) if IS_64BIT: - args.append('-Ax64') - toolset_dict['host'] = 'x64' + if platform.machine() == "ARM64": + args.append("-A ARM64") + else: + args.append("-Ax64") + toolset_dict["host"] = "x64" if toolset_dict: - toolset_expr = ','.join(["{}={}".format(k, v) for k, v in toolset_dict.items()]) - args.append('-T' + toolset_expr) - - base_dir = os.path.dirname(os.path.dirname(os.path.dirname( - os.path.abspath(__file__)))) + toolset_expr = ",".join( + ["{}={}".format(k, v) for k, v in toolset_dict.items()] + ) + args.append("-T" + toolset_expr) + + base_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) install_dir = os.path.join(base_dir, "torch") _mkdir_p(install_dir) @@ -242,50 +270,54 @@ def generate( # Key: environment variable name. Value: Corresponding variable name to be passed to CMake. If you are # adding a new build option to this block: Consider making these two names identical and adding this option # in the block below. - '_GLIBCXX_USE_CXX11_ABI': 'GLIBCXX_USE_CXX11_ABI', - 'CUDNN_LIB_DIR': 'CUDNN_LIBRARY', - 'USE_CUDA_STATIC_LINK': 'CAFFE2_STATIC_LINK_CUDA', + "_GLIBCXX_USE_CXX11_ABI": "GLIBCXX_USE_CXX11_ABI", + "CUDNN_LIB_DIR": "CUDNN_LIBRARY", + "USE_CUDA_STATIC_LINK": "CAFFE2_STATIC_LINK_CUDA", } - additional_options.update({ - # Build options that have the same environment variable name and CMake variable name and that do not start - # with "BUILD_", "USE_", or "CMAKE_". If you are adding a new build option, also make sure you add it to - # CMakeLists.txt. 
- var: var for var in - ('BLAS', - 'BUILDING_WITH_TORCH_LIBS', - 'CUDA_HOST_COMILER', - 'CUDA_NVCC_EXECUTABLE', - 'CUDA_SEPARABLE_COMPILATION', - 'CUDNN_LIBRARY', - 'CUDNN_INCLUDE_DIR', - 'CUDNN_ROOT', - 'EXPERIMENTAL_SINGLE_THREAD_POOL', - 'INSTALL_TEST', - 'JAVA_HOME', - 'INTEL_MKL_DIR', - 'INTEL_OMP_DIR', - 'MKL_THREADING', - 'MKLDNN_CPU_RUNTIME', - 'MSVC_Z7_OVERRIDE', - 'CAFFE2_USE_MSVC_STATIC_RUNTIME', - 'Numa_INCLUDE_DIR', - 'Numa_LIBRARIES', - 'ONNX_ML', - 'ONNX_NAMESPACE', - 'ATEN_THREADING', - 'WERROR', - 'OPENSSL_ROOT_DIR') - }) + additional_options.update( + { + # Build options that have the same environment variable name and CMake variable name and that do not start + # with "BUILD_", "USE_", or "CMAKE_". If you are adding a new build option, also make sure you add it to + # CMakeLists.txt. + var: var + for var in ( + "BLAS", + "BUILDING_WITH_TORCH_LIBS", + "CUDA_HOST_COMILER", + "CUDA_NVCC_EXECUTABLE", + "CUDA_SEPARABLE_COMPILATION", + "CUDNN_LIBRARY", + "CUDNN_INCLUDE_DIR", + "CUDNN_ROOT", + "EXPERIMENTAL_SINGLE_THREAD_POOL", + "INSTALL_TEST", + "JAVA_HOME", + "INTEL_MKL_DIR", + "INTEL_OMP_DIR", + "MKL_THREADING", + "MKLDNN_CPU_RUNTIME", + "MSVC_Z7_OVERRIDE", + "CAFFE2_USE_MSVC_STATIC_RUNTIME", + "Numa_INCLUDE_DIR", + "Numa_LIBRARIES", + "ONNX_ML", + "ONNX_NAMESPACE", + "ATEN_THREADING", + "WERROR", + "OPENSSL_ROOT_DIR", + "STATIC_DISPATCH_BACKEND", + ) + } + ) # Aliases which are lower priority than their canonical option low_priority_aliases = { - 'CUDA_HOST_COMPILER': 'CMAKE_CUDA_HOST_COMPILER', - 'CUDAHOSTCXX': 'CUDA_HOST_COMPILER', - 'CMAKE_CUDA_HOST_COMPILER': 'CUDA_HOST_COMPILER', - 'CMAKE_CUDA_COMPILER': 'CUDA_NVCC_EXECUTABLE', - 'CUDACXX': 'CUDA_NVCC_EXECUTABLE' + "CUDA_HOST_COMPILER": "CMAKE_CUDA_HOST_COMPILER", + "CUDAHOSTCXX": "CUDA_HOST_COMPILER", + "CMAKE_CUDA_HOST_COMPILER": "CUDA_HOST_COMPILER", + "CMAKE_CUDA_COMPILER": "CUDA_NVCC_EXECUTABLE", + "CUDACXX": "CUDA_NVCC_EXECUTABLE", } - for var, val in my_env.items(): # We currently pass over all environment variables that start with "BUILD_", "USE_", and "CMAKE_". This is # because we currently have no reliable way to get the list of all build options we have specified in @@ -295,7 +327,9 @@ def generate( true_var = additional_options.get(var) if true_var is not None: build_options[true_var] = val - elif var.startswith(('BUILD_', 'USE_', 'CMAKE_')) or var.endswith(('EXITCODE', 'EXITCODE__TRYRUN_OUTPUT')): + elif var.startswith(("BUILD_", "USE_", "CMAKE_")) or var.endswith( + ("EXITCODE", "EXITCODE__TRYRUN_OUTPUT") + ): build_options[var] = val if var in low_priority_aliases: @@ -304,68 +338,81 @@ def generate( build_options[key] = val # The default value cannot be easily obtained in CMakeLists.txt. We set it here. - py_lib_path = sysconfig.get_path('purelib') - cmake_prefix_path = build_options.get('CMAKE_PREFIX_PATH', None) + py_lib_path = sysconfig.get_path("purelib") + cmake_prefix_path = build_options.get("CMAKE_PREFIX_PATH", None) if cmake_prefix_path: build_options["CMAKE_PREFIX_PATH"] = ( - cast(str, py_lib_path) + ";" + cast(str, cmake_prefix_path) + py_lib_path + ";" + cast(str, cmake_prefix_path) ) else: - build_options['CMAKE_PREFIX_PATH'] = py_lib_path + build_options["CMAKE_PREFIX_PATH"] = py_lib_path # Some options must be post-processed. Ideally, this list will be shrunk to only one or two options in the # future, as CMake can detect many of these libraries pretty comfortably. We have them here for now before CMake # integration is completed. 
They appear here not in the CMake.defines call below because they start with either # "BUILD_" or "USE_" and must be overwritten here. - build_options.update({ - # Note: Do not add new build options to this dict if it is directly read from environment variable -- you - # only need to add one in `CMakeLists.txt`. All build options that start with "BUILD_", "USE_", or "CMAKE_" - # are automatically passed to CMake; For other options you can add to additional_options above. - 'BUILD_PYTHON': build_python, - 'BUILD_TEST': build_test, - # Most library detection should go to CMake script, except this one, which Python can do a much better job - # due to NumPy's inherent Pythonic nature. - 'USE_NUMPY': USE_NUMPY, - }) + build_options.update( + { + # Note: Do not add new build options to this dict if it is directly read from environment variable -- you + # only need to add one in `CMakeLists.txt`. All build options that start with "BUILD_", "USE_", or "CMAKE_" + # are automatically passed to CMake; For other options you can add to additional_options above. + "BUILD_PYTHON": build_python, + "BUILD_TEST": build_test, + # Most library detection should go to CMake script, except this one, which Python can do a much better job + # due to NumPy's inherent Pythonic nature. + "USE_NUMPY": USE_NUMPY, + } + ) # Options starting with CMAKE_ cmake__options = { - 'CMAKE_INSTALL_PREFIX': install_dir, + "CMAKE_INSTALL_PREFIX": install_dir, } # We set some CMAKE_* options in our Python build code instead of relying on the user's direct settings. Emit an # error if the user also attempts to set these CMAKE options directly. specified_cmake__options = set(build_options).intersection(cmake__options) if len(specified_cmake__options) > 0: - print(', '.join(specified_cmake__options) + - ' should not be specified in the environment variable. They are directly set by PyTorch build script.') + print( + ", ".join(specified_cmake__options) + + " should not be specified in the environment variable. They are directly set by PyTorch build script." 
+ ) sys.exit(1) build_options.update(cmake__options) - CMake.defines(args, - PYTHON_EXECUTABLE=sys.executable, - PYTHON_LIBRARY=cmake_python_library, - PYTHON_INCLUDE_DIR=sysconfig.get_path('include'), - TORCH_BUILD_VERSION=version, - NUMPY_INCLUDE_DIR=NUMPY_INCLUDE_DIR, - **build_options) - - expected_wrapper = '/usr/local/opt/ccache/libexec' + CMake.defines( + args, + PYTHON_EXECUTABLE=sys.executable, + PYTHON_LIBRARY=cmake_python_library, + PYTHON_INCLUDE_DIR=sysconfig.get_path("include"), + TORCH_BUILD_VERSION=version, + NUMPY_INCLUDE_DIR=NUMPY_INCLUDE_DIR, + **build_options, + ) + + expected_wrapper = "/usr/local/opt/ccache/libexec" if IS_DARWIN and os.path.exists(expected_wrapper): - if 'CMAKE_C_COMPILER' not in build_options and 'CC' not in os.environ: + if "CMAKE_C_COMPILER" not in build_options and "CC" not in os.environ: CMake.defines(args, CMAKE_C_COMPILER="{}/gcc".format(expected_wrapper)) - if 'CMAKE_CXX_COMPILER' not in build_options and 'CXX' not in os.environ: - CMake.defines(args, CMAKE_CXX_COMPILER="{}/g++".format(expected_wrapper)) + if "CMAKE_CXX_COMPILER" not in build_options and "CXX" not in os.environ: + CMake.defines( + args, CMAKE_CXX_COMPILER="{}/g++".format(expected_wrapper) + ) for env_var_name in my_env: - if env_var_name.startswith('gh'): + if env_var_name.startswith("gh"): # github env vars use utf-8, on windows, non-ascii code may # cause problem, so encode first try: my_env[env_var_name] = str(my_env[env_var_name].encode("utf-8")) except UnicodeDecodeError as e: - shex = ':'.join('{:02x}'.format(ord(c)) for c in my_env[env_var_name]) - print('Invalid ENV[{}] = {}'.format(env_var_name, shex), file=sys.stderr) + shex = ":".join( + "{:02x}".format(ord(c)) for c in my_env[env_var_name] + ) + print( + "Invalid ENV[{}] = {}".format(env_var_name, shex), + file=sys.stderr, + ) print(e, file=sys.stderr) # According to the CMake manual, we should pass the arguments first, # and put the directory as the last element. Otherwise, these flags @@ -381,7 +428,14 @@ def build(self, my_env: Dict[str, str]) -> None: from .env import build_type - build_args = ['--build', '.', '--target', 'install', '--config', build_type.build_type_string] + build_args = [ + "--build", + ".", + "--target", + "install", + "--config", + build_type.build_type_string, + ] # Determine the parallelism according to the following # priorities: @@ -391,7 +445,7 @@ def build(self, my_env: Dict[str, str]) -> None: # Allow the user to set parallelism explicitly. If unset, # we'll try to figure it out. - max_jobs = os.getenv('MAX_JOBS') + max_jobs = os.getenv("MAX_JOBS") if max_jobs is not None or not USE_NINJA: # Ninja is capable of figuring out the parallelism on its @@ -410,10 +464,10 @@ def build(self, my_env: Dict[str, str]) -> None: # build_args += ['-j', max_jobs] would be sufficient by # then. Until then, we use "--" to pass parameters to the # underlying build system. 
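# Illustration only (not part of this diff): a minimal sketch, assuming MAX_JOBS=8 and a
# Release configuration, of the invocation that build() assembles here before handing the
# parallelism flag past "--" to the underlying build tool (the msbuild variant appears in
# the lines that follow).
cmake_command = "cmake"
build_args = ["--build", ".", "--target", "install", "--config", "Release"]
build_args += ["--", "-j", "8"]  # msbuild without ninja would instead get ["--", "/p:CL_MPCount=8"]
print(" ".join([cmake_command] + build_args))
# -> cmake --build . --target install --config Release -- -j 8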
- build_args += ['--'] + build_args += ["--"] if IS_WINDOWS and not USE_NINJA: # We are likely using msbuild here - build_args += ['/p:CL_MPCount={}'.format(max_jobs)] + build_args += ["/p:CL_MPCount={}".format(max_jobs)] else: - build_args += ['-j', max_jobs] + build_args += ["-j", max_jobs] self.run(build_args, my_env) diff --git a/tools/setup_helpers/env.py b/tools/setup_helpers/env.py index d658acdb8d52..bf693cacc381 100644 --- a/tools/setup_helpers/env.py +++ b/tools/setup_helpers/env.py @@ -6,37 +6,41 @@ from typing import Iterable, List, Optional, cast -IS_WINDOWS = (platform.system() == 'Windows') -IS_DARWIN = (platform.system() == 'Darwin') -IS_LINUX = (platform.system() == 'Linux') +IS_WINDOWS = platform.system() == "Windows" +IS_DARWIN = platform.system() == "Darwin" +IS_LINUX = platform.system() == "Linux" -IS_CONDA = 'conda' in sys.version or 'Continuum' in sys.version or any([x.startswith('CONDA') for x in os.environ]) -CONDA_DIR = os.path.join(os.path.dirname(sys.executable), '..') +IS_CONDA = ( + "conda" in sys.version + or "Continuum" in sys.version + or any([x.startswith("CONDA") for x in os.environ]) +) +CONDA_DIR = os.path.join(os.path.dirname(sys.executable), "..") -IS_64BIT = (struct.calcsize("P") == 8) +IS_64BIT = struct.calcsize("P") == 8 -BUILD_DIR = 'build' +BUILD_DIR = "build" -def check_env_flag(name: str, default: str = '') -> bool: - return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] +def check_env_flag(name: str, default: str = "") -> bool: + return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"] -def check_negative_env_flag(name: str, default: str = '') -> bool: - return os.getenv(name, default).upper() in ['OFF', '0', 'NO', 'FALSE', 'N'] +def check_negative_env_flag(name: str, default: str = "") -> bool: + return os.getenv(name, default).upper() in ["OFF", "0", "NO", "FALSE", "N"] def gather_paths(env_vars: Iterable[str]) -> List[str]: - return list(chain(*(os.getenv(v, '').split(os.pathsep) for v in env_vars))) + return list(chain(*(os.getenv(v, "").split(os.pathsep) for v in env_vars))) def lib_paths_from_base(base_path: str) -> List[str]: - return [os.path.join(base_path, s) for s in ['lib/x64', 'lib', 'lib64']] + return [os.path.join(base_path, s) for s in ["lib/x64", "lib", "lib64"]] # We promised that CXXFLAGS should also be affected by CFLAGS -if 'CFLAGS' in os.environ and 'CXXFLAGS' not in os.environ: - os.environ['CXXFLAGS'] = os.environ['CFLAGS'] +if "CFLAGS" in os.environ and "CXXFLAGS" not in os.environ: + os.environ["CXXFLAGS"] = os.environ["CFLAGS"] class BuildType(object): @@ -55,39 +59,40 @@ def __init__(self, cmake_build_type_env: Optional[str] = None) -> None: self.build_type_string = cmake_build_type_env return - cmake_cache_txt = os.path.join(BUILD_DIR, 'CMakeCache.txt') + cmake_cache_txt = os.path.join(BUILD_DIR, "CMakeCache.txt") if os.path.isfile(cmake_cache_txt): # Found CMakeCache.txt. Use the build type specified in it. from .cmake import get_cmake_cache_variables_from_file + with open(cmake_cache_txt) as f: cmake_cache_vars = get_cmake_cache_variables_from_file(f) # Normally it is anti-pattern to determine build type from CMAKE_BUILD_TYPE because it is not used for # multi-configuration build tools, such as Visual Studio and XCode. But since we always communicate with # CMake using CMAKE_BUILD_TYPE from our Python scripts, this is OK here. 
- self.build_type_string = cast(str, cmake_cache_vars['CMAKE_BUILD_TYPE']) + self.build_type_string = cast(str, cmake_cache_vars["CMAKE_BUILD_TYPE"]) else: - self.build_type_string = os.environ.get('CMAKE_BUILD_TYPE', 'Release') + self.build_type_string = os.environ.get("CMAKE_BUILD_TYPE", "Release") def is_debug(self) -> bool: "Checks Debug build." - return self.build_type_string == 'Debug' + return self.build_type_string == "Debug" def is_rel_with_deb_info(self) -> bool: "Checks RelWithDebInfo build." - return self.build_type_string == 'RelWithDebInfo' + return self.build_type_string == "RelWithDebInfo" def is_release(self) -> bool: "Checks Release build." - return self.build_type_string == 'Release' + return self.build_type_string == "Release" # hotpatch environment variable 'CMAKE_BUILD_TYPE'. 'CMAKE_BUILD_TYPE' always prevails over DEBUG or REL_WITH_DEB_INFO. -if 'CMAKE_BUILD_TYPE' not in os.environ: - if check_env_flag('DEBUG'): - os.environ['CMAKE_BUILD_TYPE'] = 'Debug' - elif check_env_flag('REL_WITH_DEB_INFO'): - os.environ['CMAKE_BUILD_TYPE'] = 'RelWithDebInfo' +if "CMAKE_BUILD_TYPE" not in os.environ: + if check_env_flag("DEBUG"): + os.environ["CMAKE_BUILD_TYPE"] = "Debug" + elif check_env_flag("REL_WITH_DEB_INFO"): + os.environ["CMAKE_BUILD_TYPE"] = "RelWithDebInfo" else: - os.environ['CMAKE_BUILD_TYPE'] = 'Release' + os.environ["CMAKE_BUILD_TYPE"] = "Release" build_type = BuildType() diff --git a/tools/setup_helpers/gen.py b/tools/setup_helpers/gen.py index bdb52ee44efb..3ca9a8787906 100644 --- a/tools/setup_helpers/gen.py +++ b/tools/setup_helpers/gen.py @@ -6,6 +6,6 @@ root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, root) -import tools.codegen.gen +import torchgen.gen -tools.codegen.gen.main() +torchgen.gen.main() diff --git a/tools/setup_helpers/gen_unboxing.py b/tools/setup_helpers/gen_unboxing.py new file mode 100644 index 000000000000..d2883f6d1e48 --- /dev/null +++ b/tools/setup_helpers/gen_unboxing.py @@ -0,0 +1,11 @@ +# Little stub file to get BUILD.bazel to play along + +import os.path +import sys + +root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.insert(0, root) + +import tools.jit.gen_unboxing + +tools.jit.gen_unboxing.main() diff --git a/tools/setup_helpers/gen_version_header.py b/tools/setup_helpers/gen_version_header.py index 963db1dad1f1..bd576af6f111 100644 --- a/tools/setup_helpers/gen_version_header.py +++ b/tools/setup_helpers/gen_version_header.py @@ -76,7 +76,9 @@ def main(args: argparse.Namespace) -> None: help="Path to the template (i.e. 
version.h.in)", ) parser.add_argument( - "--version-path", required=True, help="Path to the file specifying the version", + "--version-path", + required=True, + help="Path to the file specifying the version", ) parser.add_argument( "--output-path", diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py index ef90acc3935a..4440e6c2e0a2 100644 --- a/tools/setup_helpers/generate_code.py +++ b/tools/setup_helpers/generate_code.py @@ -1,8 +1,9 @@ import argparse import os +import pathlib import sys import yaml -from typing import Any, List, Optional, cast +from typing import Any, Optional, cast try: # use faster C loader if available @@ -10,56 +11,42 @@ except ImportError: from yaml import SafeLoader as YamlLoader # type: ignore[misc] -source_files = {'.py', '.cpp', '.h'} - -NATIVE_FUNCTIONS_PATH = 'aten/src/ATen/native/native_functions.yaml' - -# TODO: This is a little inaccurate, because it will also pick -# up setup_helper scripts which don't affect code generation -def all_generator_source() -> List[str]: - r = [] - for directory, _, filenames in os.walk('tools'): - for f in filenames: - if os.path.splitext(f)[1] in source_files: - full = os.path.join(directory, f) - r.append(full) - return sorted(r) - - -def generate_code(ninja_global: Optional[str] = None, - nn_path: Optional[str] = None, - native_functions_path: Optional[str] = None, - install_dir: Optional[str] = None, - subset: Optional[str] = None, - disable_autograd: bool = False, - force_schema_registration: bool = False, - operator_selector: Any = None) -> None: +NATIVE_FUNCTIONS_PATH = "aten/src/ATen/native/native_functions.yaml" +TAGS_PATH = "aten/src/ATen/native/tags.yaml" + + +def generate_code( + gen_dir: pathlib.Path, + native_functions_path: Optional[str] = None, + tags_path: Optional[str] = None, + install_dir: Optional[str] = None, + subset: Optional[str] = None, + disable_autograd: bool = False, + force_schema_registration: bool = False, + operator_selector: Any = None, +) -> None: from tools.autograd.gen_autograd import gen_autograd, gen_autograd_python from tools.autograd.gen_annotated_fn_args import gen_annotated - from tools.codegen.selective_build.selector import SelectiveBuilder - + from torchgen.selective_build.selector import SelectiveBuilder # Build ATen based Variable classes if install_dir is None: - install_dir = 'torch/csrc' - python_install_dir = 'torch/testing/_internal/generated' + install_dir = os.fspath(gen_dir / "torch/csrc") + python_install_dir = os.fspath(gen_dir / "torch/testing/_internal/generated") else: python_install_dir = install_dir - autograd_gen_dir = os.path.join(install_dir, 'autograd', 'generated') - jit_gen_dir = os.path.join(install_dir, 'jit', 'generated') - for d in (autograd_gen_dir, jit_gen_dir, python_install_dir): - if not os.path.exists(d): - os.makedirs(d) - runfiles_dir = os.environ.get("RUNFILES_DIR", None) - data_dir = os.path.join(runfiles_dir, 'pytorch') if runfiles_dir else '' - autograd_dir = os.path.join(data_dir, 'tools', 'autograd') - tools_jit_templates = os.path.join(data_dir, 'tools', 'jit', 'templates') + autograd_gen_dir = os.path.join(install_dir, "autograd", "generated") + for d in (autograd_gen_dir, python_install_dir): + os.makedirs(d, exist_ok=True) + autograd_dir = os.fspath(pathlib.Path(__file__).parent.parent / "autograd") if subset == "pybindings" or not subset: gen_autograd_python( native_functions_path or NATIVE_FUNCTIONS_PATH, + tags_path or TAGS_PATH, autograd_gen_dir, - autograd_dir) + autograd_dir, + ) if 
operator_selector is None: operator_selector = SelectiveBuilder.get_nop_selector() @@ -68,6 +55,7 @@ def generate_code(ninja_global: Optional[str] = None, gen_autograd( native_functions_path or NATIVE_FUNCTIONS_PATH, + tags_path or TAGS_PATH, autograd_gen_dir, autograd_dir, disable_autograd=disable_autograd, @@ -77,18 +65,20 @@ def generate_code(ninja_global: Optional[str] = None, if subset == "python" or not subset: gen_annotated( native_functions_path or NATIVE_FUNCTIONS_PATH, + tags_path or TAGS_PATH, python_install_dir, - autograd_dir) + autograd_dir, + ) def get_selector_from_legacy_operator_selection_list( - selected_op_list_path: str, + selected_op_list_path: str, ) -> Any: - with open(selected_op_list_path, 'r') as f: + with open(selected_op_list_path, "r") as f: # strip out the overload part # It's only for legacy config - do NOT copy this code! selected_op_list = { - opname.split('.', 1)[0] for opname in yaml.load(f, Loader=YamlLoader) + opname.split(".", 1)[0] for opname in yaml.load(f, Loader=YamlLoader) } # Internal build doesn't use this flag any more. Only used by OSS @@ -100,7 +90,8 @@ def get_selector_from_legacy_operator_selection_list( is_root_operator = True is_used_for_training = True - from tools.codegen.selective_build.selector import SelectiveBuilder + from torchgen.selective_build.selector import SelectiveBuilder + selector = SelectiveBuilder.from_legacy_op_registration_allow_list( selected_op_list, is_root_operator, @@ -117,12 +108,14 @@ def get_selector( # cwrap depends on pyyaml, so we can't import it earlier root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) sys.path.insert(0, root) - from tools.codegen.selective_build.selector import SelectiveBuilder + from torchgen.selective_build.selector import SelectiveBuilder - assert not (selected_op_list_path is not None and - operators_yaml_path is not None), \ - ("Expected at most one of selected_op_list_path and " + - "operators_yaml_path to be set.") + assert not ( + selected_op_list_path is not None and operators_yaml_path is not None + ), ( + "Expected at most one of selected_op_list_path and " + + "operators_yaml_path to be set." + ) if selected_op_list_path is None and operators_yaml_path is None: return SelectiveBuilder.get_nop_selector() @@ -133,49 +126,105 @@ def get_selector( def main() -> None: - parser = argparse.ArgumentParser(description='Autogenerate code') - parser.add_argument('--native-functions-path') - parser.add_argument('--nn-path') - parser.add_argument('--ninja-global') - parser.add_argument('--install_dir') + parser = argparse.ArgumentParser(description="Autogenerate code") + parser.add_argument("--native-functions-path") + parser.add_argument("--tags-path") parser.add_argument( - '--subset', - help='Subset of source files to generate. Can be "libtorch" or "pybindings". Generates both when omitted.' + "--gen-dir", + type=pathlib.Path, + default=pathlib.Path("."), + help="Root directory where to install files. Defaults to the current working directory.", ) parser.add_argument( - '--disable-autograd', + "--install_dir", + help=( + "Deprecated. Use --gen-dir instead. The semantics are different, do not change " + "blindly." + ), + ) + parser.add_argument( + "--subset", + help='Subset of source files to generate. Can be "libtorch" or "pybindings". 
Generates both when omitted.', + ) + parser.add_argument( + "--disable-autograd", default=False, - action='store_true', - help='It can skip generating autograd related code when the flag is set', + action="store_true", + help="It can skip generating autograd related code when the flag is set", + ) + parser.add_argument( + "--selected-op-list-path", + help="Path to the YAML file that contains the list of operators to include for custom build.", ) parser.add_argument( - '--selected-op-list-path', - help='Path to the YAML file that contains the list of operators to include for custom build.', + "--operators_yaml_path", + help="Path to the model YAML file that contains the list of operators to include for custom build.", ) parser.add_argument( - '--operators_yaml_path', - help='Path to the model YAML file that contains the list of operators to include for custom build.', + "--force_schema_registration", + action="store_true", + help="force it to generate schema-only registrations for ops that are not" + "listed on --selected-op-list", ) parser.add_argument( - '--force_schema_registration', - action='store_true', - help='force it to generate schema-only registrations for ops that are not' - 'listed on --selected-op-list' + "--gen_lazy_ts_backend", + action="store_true", + help="Enable generation of the torch::lazy TorchScript backend", + ) + parser.add_argument( + "--per_operator_headers", + action="store_true", + help="Build lazy tensor ts backend with per-operator ATen headers, must match how ATen was built", ) options = parser.parse_args() generate_code( - options.ninja_global, - options.nn_path, + options.gen_dir, options.native_functions_path, + options.tags_path, options.install_dir, options.subset, options.disable_autograd, options.force_schema_registration, # options.selected_op_list - operator_selector=get_selector(options.selected_op_list_path, options.operators_yaml_path), + operator_selector=get_selector( + options.selected_op_list_path, options.operators_yaml_path + ), ) + if options.gen_lazy_ts_backend: + aten_path = os.path.dirname(os.path.dirname(options.native_functions_path)) + ts_backend_yaml = os.path.join(aten_path, "native/ts_native_functions.yaml") + ts_native_functions = "torch/csrc/lazy/ts_backend/ts_native_functions.cpp" + ts_node_base = "torch/csrc/lazy/ts_backend/ts_node.h" + install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc") + lazy_install_dir = os.path.join(install_dir, "lazy/generated") + os.makedirs(lazy_install_dir, exist_ok=True) + + assert os.path.isfile( + ts_backend_yaml + ), f"Unable to access ts_backend_yaml: {ts_backend_yaml}" + assert os.path.isfile( + ts_native_functions + ), f"Unable to access {ts_native_functions}" + from torchgen.gen_lazy_tensor import run_gen_lazy_tensor + from torchgen.dest.lazy_ir import GenTSLazyIR + + run_gen_lazy_tensor( + aten_path=aten_path, + source_yaml=ts_backend_yaml, + backend_name="TorchScript", + output_dir=lazy_install_dir, + dry_run=False, + impl_path=ts_native_functions, + node_base="TsNode", + node_base_hdr=ts_node_base, + build_in_tree=True, + lazy_ir_generator=GenTSLazyIR, + per_operator_headers=options.per_operator_headers, + gen_forced_fallback_code=True, + ) + if __name__ == "__main__": main() diff --git a/tools/setup_helpers/numpy_.py b/tools/setup_helpers/numpy_.py index 882de4be6e93..e93fcfd24707 100644 --- a/tools/setup_helpers/numpy_.py +++ b/tools/setup_helpers/numpy_.py @@ -10,7 +10,7 @@ # Set USE_NUMPY to what the user wants, because even if we fail here, cmake # will check 
for the presence of NumPy again (`cmake/Dependencies.cmake`). -USE_NUMPY = not check_negative_env_flag('USE_NUMPY') +USE_NUMPY = not check_negative_env_flag("USE_NUMPY") NUMPY_INCLUDE_DIR = None if USE_NUMPY: diff --git a/tools/shared/cwrap_common.py b/tools/shared/cwrap_common.py index 01ff97aabd9b..42548b9afa11 100644 --- a/tools/shared/cwrap_common.py +++ b/tools/shared/cwrap_common.py @@ -6,17 +6,18 @@ Arg = Dict[str, Any] + def parse_arguments(args: List[Union[str, Arg]]) -> List[Arg]: new_args = [] for arg in args: # Simple arg declaration of form " " if isinstance(arg, str): - t, _, name = arg.partition(' ') - new_args.append({'type': t, 'name': name}) + t, _, name = arg.partition(" ") + new_args.append({"type": t, "name": name}) elif isinstance(arg, dict): - if 'arg' in arg: - arg['type'], _, arg['name'] = arg['arg'].partition(' ') - del arg['arg'] + if "arg" in arg: + arg["type"], _, arg["name"] = arg["arg"].partition(" ") + del arg["arg"] new_args.append(arg) else: raise AssertionError() @@ -27,52 +28,66 @@ def parse_arguments(args: List[Union[str, Arg]]) -> List[Arg]: def set_declaration_defaults(declaration: Declaration) -> None: - if 'schema_string' not in declaration: + if "schema_string" not in declaration: # This happens for legacy TH bindings like # _thnn_conv_depthwise2d_backward - declaration['schema_string'] = '' - declaration.setdefault('arguments', []) - declaration.setdefault('return', 'void') - if 'cname' not in declaration: - declaration['cname'] = declaration['name'] - if 'backends' not in declaration: - declaration['backends'] = ['CPU', 'CUDA'] - assert 'api_name' not in declaration - declaration['api_name'] = declaration['name'] + declaration["schema_string"] = "" + declaration.setdefault("arguments", []) + declaration.setdefault("return", "void") + if "cname" not in declaration: + declaration["cname"] = declaration["name"] + if "backends" not in declaration: + declaration["backends"] = ["CPU", "CUDA"] + assert "api_name" not in declaration + declaration["api_name"] = declaration["name"] # NB: keep this in sync with gen_autograd.py - if declaration.get('overload_name'): - declaration['type_wrapper_name'] = "{}_{}".format( - declaration['name'], declaration['overload_name']) + if declaration.get("overload_name"): + declaration["type_wrapper_name"] = "{}_{}".format( + declaration["name"], declaration["overload_name"] + ) else: - declaration['type_wrapper_name'] = declaration['name'] + declaration["type_wrapper_name"] = declaration["name"] # TODO: Uggggh, parsing the schema string here, really??? 
- declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - if declaration['schema_string']: - declaration['unqual_schema_string'] = declaration['schema_string'].split('::')[1] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] + declaration["operator_name_with_overload"] = declaration["schema_string"].split( + "(" + )[0] + if declaration["schema_string"]: + declaration["unqual_schema_string"] = declaration["schema_string"].split("::")[ + 1 + ] + declaration["unqual_operator_name_with_overload"] = declaration[ + "operator_name_with_overload" + ].split("::")[1] else: - declaration['unqual_schema_string'] = '' - declaration['unqual_operator_name_with_overload'] = '' + declaration["unqual_schema_string"] = "" + declaration["unqual_operator_name_with_overload"] = "" # Simulate multiple dispatch, even if it's not necessary - if 'options' not in declaration: - declaration['options'] = [{ - 'arguments': copy.deepcopy(declaration['arguments']), - 'schema_order_arguments': copy.deepcopy(declaration['schema_order_arguments']), - }] - del declaration['arguments'] - del declaration['schema_order_arguments'] + if "options" not in declaration: + declaration["options"] = [ + { + "arguments": copy.deepcopy(declaration["arguments"]), + "schema_order_arguments": copy.deepcopy( + declaration["schema_order_arguments"] + ), + } + ] + del declaration["arguments"] + del declaration["schema_order_arguments"] # Parse arguments (some of them can be strings) - for option in declaration['options']: - option['arguments'] = parse_arguments(option['arguments']) - option['schema_order_arguments'] = parse_arguments(option['schema_order_arguments']) + for option in declaration["options"]: + option["arguments"] = parse_arguments(option["arguments"]) + option["schema_order_arguments"] = parse_arguments( + option["schema_order_arguments"] + ) # Propagate defaults from declaration to options - for option in declaration['options']: + for option in declaration["options"]: for k, v in declaration.items(): # TODO(zach): why does cwrap not propagate 'name'? I need it # propagaged for ATen - if k != 'options': + if k != "options": option.setdefault(k, v) + # TODO(zach): added option to remove keyword handling for C++ which cannot # support it. 
@@ -86,38 +101,41 @@ def filter_unique_options( remove_self: bool, ) -> List[Option]: def exclude_arg(arg: Arg) -> bool: - return arg['type'] == 'CONSTANT' # type: ignore[no-any-return] + return arg["type"] == "CONSTANT" # type: ignore[no-any-return] def exclude_arg_with_self_check(arg: Arg) -> bool: - return exclude_arg(arg) or (remove_self and arg['name'] == 'self') + return exclude_arg(arg) or (remove_self and arg["name"] == "self") def signature(option: Option, num_kwarg_only: int) -> str: if num_kwarg_only == 0: kwarg_only_count = None else: kwarg_only_count = -num_kwarg_only - arg_signature = '#'.join( - type_to_signature.get(arg['type'], arg['type']) - for arg in option['arguments'][:kwarg_only_count] - if not exclude_arg_with_self_check(arg)) + arg_signature = "#".join( + type_to_signature.get(arg["type"], arg["type"]) + for arg in option["arguments"][:kwarg_only_count] + if not exclude_arg_with_self_check(arg) + ) if kwarg_only_count is None: return arg_signature - kwarg_only_signature = '#'.join( - arg['name'] + '#' + arg['type'] - for arg in option['arguments'][kwarg_only_count:] - if not exclude_arg(arg)) + kwarg_only_signature = "#".join( + arg["name"] + "#" + arg["type"] + for arg in option["arguments"][kwarg_only_count:] + if not exclude_arg(arg) + ) return arg_signature + "#-#" + kwarg_only_signature + seen_signatures = set() unique = [] for option in options: # if only check num_kwarg_only == 0 if allow_kwarg == False - limit = len(option['arguments']) if allow_kwarg else 0 + limit = len(option["arguments"]) if allow_kwarg else 0 for num_kwarg_only in range(0, limit + 1): sig = signature(option, num_kwarg_only) if sig not in seen_signatures: if num_kwarg_only > 0: - for arg in option['arguments'][-num_kwarg_only:]: - arg['kwarg_only'] = True + for arg in option["arguments"][-num_kwarg_only:]: + arg["kwarg_only"] = True unique.append(option) seen_signatures.add(sig) break @@ -126,49 +144,48 @@ def signature(option: Option, num_kwarg_only: int) -> str: def sort_by_number_of_args(declaration: Declaration, reverse: bool = True) -> None: def num_args(option: Option) -> int: - return len(option['arguments']) - declaration['options'].sort(key=num_args, reverse=reverse) + return len(option["arguments"]) + declaration["options"].sort(key=num_args, reverse=reverse) -class Function(object): +class Function(object): def __init__(self, name: str) -> None: self.name = name - self.arguments: List['Argument'] = [] + self.arguments: List["Argument"] = [] - def add_argument(self, arg: 'Argument') -> None: + def add_argument(self, arg: "Argument") -> None: assert isinstance(arg, Argument) self.arguments.append(arg) def __repr__(self) -> str: - return self.name + '(' + ', '.join(a.__repr__() for a in self.arguments) + ')' + return self.name + "(" + ", ".join(a.__repr__() for a in self.arguments) + ")" class Argument(object): - def __init__(self, _type: str, name: str, is_optional: bool): self.type = _type self.name = name self.is_optional = is_optional def __repr__(self) -> str: - return self.type + ' ' + self.name + return self.type + " " + self.name def parse_header(path: str) -> List[Function]: - with open(path, 'r') as f: - lines: Iterable[Any] = f.read().split('\n') + with open(path, "r") as f: + lines: Iterable[Any] = f.read().split("\n") # Remove empty lines and prebackend directives - lines = filter(lambda l: l and not l.startswith('#'), lines) + lines = filter(lambda l: l and not l.startswith("#"), lines) # Remove line comments - lines = (l.partition('//') for l in lines) + lines = 
(l.partition("//") for l in lines) # Select line and comment part lines = ((l[0].strip(), l[2].strip()) for l in lines) # Remove trailing special signs - lines = ((l[0].rstrip(');').rstrip(','), l[1]) for l in lines) + lines = ((l[0].rstrip(");").rstrip(","), l[1]) for l in lines) # Split arguments - lines = ((l[0].split(','), l[1]) for l in lines) + lines = ((l[0].split(","), l[1]) for l in lines) # Flatten lines new_lines = [] for l, c in lines: @@ -182,32 +199,31 @@ def parse_header(path: str) -> List[Function]: lines = filter(lambda l: l[0], lines) generic_functions = [] for l, c in lines: - if l.startswith('TH_API void THNN_'): - fn_name = l[len('TH_API void THNN_'):] - if fn_name[0] == '(' and fn_name[-2] == ')': + if l.startswith("TH_API void THNN_"): + fn_name = l[len("TH_API void THNN_") :] + if fn_name[0] == "(" and fn_name[-2] == ")": fn_name = fn_name[1:-2] else: fn_name = fn_name[:-1] generic_functions.append(Function(fn_name)) - elif l.startswith('TORCH_CUDA_CPP_API void THNN_'): - fn_name = l[len('TORCH_CUDA_CPP_API void THNN_'):] - if fn_name[0] == '(' and fn_name[-2] == ')': + elif l.startswith("TORCH_CUDA_CPP_API void THNN_"): + fn_name = l[len("TORCH_CUDA_CPP_API void THNN_") :] + if fn_name[0] == "(" and fn_name[-2] == ")": fn_name = fn_name[1:-2] else: fn_name = fn_name[:-1] generic_functions.append(Function(fn_name)) - elif l.startswith('TORCH_CUDA_CU_API void THNN_'): - fn_name = l[len('TORCH_CUDA_CU_API void THNN_'):] - if fn_name[0] == '(' and fn_name[-2] == ')': + elif l.startswith("TORCH_CUDA_CU_API void THNN_"): + fn_name = l[len("TORCH_CUDA_CU_API void THNN_") :] + if fn_name[0] == "(" and fn_name[-2] == ")": fn_name = fn_name[1:-2] else: fn_name = fn_name[:-1] generic_functions.append(Function(fn_name)) elif l: t, name = l.split() - if '*' in name: - t = t + '*' + if "*" in name: + t = t + "*" name = name[1:] - generic_functions[-1].add_argument( - Argument(t, name, '[OPTIONAL]' in c)) + generic_functions[-1].add_argument(Argument(t, name, "[OPTIONAL]" in c)) return generic_functions diff --git a/tools/shared/module_loader.py b/tools/shared/module_loader.py index 7482047d4e8d..5e22fb4be4e0 100644 --- a/tools/shared/module_loader.py +++ b/tools/shared/module_loader.py @@ -5,7 +5,9 @@ def import_module(name: str, path: str) -> ModuleType: import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + assert spec is not None module = importlib.util.module_from_spec(spec) cast(Loader, spec.loader).exec_module(module) return module diff --git a/tools/stats/export_slow_tests.py b/tools/stats/export_slow_tests.py index b9d71cfb6cb7..13afbf984a23 100644 --- a/tools/stats/export_slow_tests.py +++ b/tools/stats/export_slow_tests.py @@ -5,53 +5,74 @@ import os import statistics from collections import defaultdict -from tools.stats.s3_stat_parser import get_previous_reports_for_branch, Report, Version2Report +from tools.stats.s3_stat_parser import ( + get_previous_reports_for_branch, + Report, + Version2Report, +) from typing import cast, DefaultDict, Dict, List, Any from urllib.request import urlopen -SLOW_TESTS_FILE = '.pytorch-slow-tests.json' +SLOW_TESTS_FILE = ".pytorch-slow-tests.json" SLOW_TEST_CASE_THRESHOLD_SEC = 60.0 RELATIVE_DIFFERENCE_THRESHOLD = 0.1 +IGNORED_JOBS = ["asan", "periodic"] + def get_test_case_times() -> Dict[str, float]: - reports: List[Report] = get_previous_reports_for_branch('origin/viable/strict', "") + reports: List[Report] = get_previous_reports_for_branch("origin/viable/strict", "") # an entry will be like 
("test_doc_examples (__main__.TestTypeHints)" -> [values])) test_names_to_times: DefaultDict[str, List[float]] = defaultdict(list) for report in reports: - if report.get('format_version', 1) != 2: # type: ignore[misc] + if report.get("format_version", 1) != 2: # type: ignore[misc] raise RuntimeError("S3 format currently handled is version 2 only") v2report = cast(Version2Report, report) - for test_file in v2report['files'].values(): - for suitename, test_suite in test_file['suites'].items(): - for casename, test_case in test_suite['cases'].items(): + + if any(job_name in str(report["build_job"]) for job_name in IGNORED_JOBS): + continue + + for test_file in v2report["files"].values(): + for suitename, test_suite in test_file["suites"].items(): + for casename, test_case in test_suite["cases"].items(): # The below attaches a __main__ as that matches the format of test.__class__ in # common_utils.py (where this data will be used), and also matches what the output # of a running test would look like. - name = f'{casename} (__main__.{suitename})' - succeeded: bool = test_case['status'] is None + name = f"{casename} (__main__.{suitename})" + succeeded: bool = test_case["status"] is None if succeeded: - test_names_to_times[name].append(test_case['seconds']) - return {test_case: statistics.mean(times) for test_case, times in test_names_to_times.items()} + test_names_to_times[name].append(test_case["seconds"]) + return { + test_case: statistics.mean(times) + for test_case, times in test_names_to_times.items() + } def filter_slow_tests(test_cases_dict: Dict[str, float]) -> Dict[str, float]: - return {test_case: time for test_case, time in test_cases_dict.items() if time >= SLOW_TEST_CASE_THRESHOLD_SEC} + return { + test_case: time + for test_case, time in test_cases_dict.items() + if time >= SLOW_TEST_CASE_THRESHOLD_SEC + } def get_test_infra_slow_tests() -> Dict[str, float]: url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/slow-tests.json" - contents = urlopen(url, timeout=1).read().decode('utf-8') + contents = urlopen(url, timeout=1).read().decode("utf-8") return cast(Dict[str, float], json.loads(contents)) -def too_similar(calculated_times: Dict[str, float], other_times: Dict[str, float], threshold: float) -> bool: +def too_similar( + calculated_times: Dict[str, float], other_times: Dict[str, float], threshold: float +) -> bool: # check that their keys are the same if calculated_times.keys() != other_times.keys(): return False for test_case, test_time in calculated_times.items(): other_test_time = other_times[test_case] - relative_difference = abs((other_test_time - test_time) / max(other_test_time, test_time)) + relative_difference = abs( + (other_test_time - test_time) / max(other_test_time, test_time) + ) if relative_difference > threshold: return False return True @@ -60,38 +81,43 @@ def too_similar(calculated_times: Dict[str, float], other_times: Dict[str, float def export_slow_tests(options: Any) -> None: filename = options.filename if os.path.exists(filename): - print(f'Overwriting existent file: {filename}') - with open(filename, 'w+') as file: + print(f"Overwriting existent file: {filename}") + with open(filename, "w+") as file: slow_test_times: Dict[str, float] = filter_slow_tests(get_test_case_times()) if options.ignore_small_diffs: test_infra_slow_tests_dict = get_test_infra_slow_tests() - if too_similar(slow_test_times, test_infra_slow_tests_dict, options.ignore_small_diffs): + if too_similar( + slow_test_times, test_infra_slow_tests_dict, 
options.ignore_small_diffs + ): slow_test_times = test_infra_slow_tests_dict - json.dump(slow_test_times, file, indent=' ', separators=(',', ': '), sort_keys=True) - file.write('\n') + json.dump( + slow_test_times, file, indent=" ", separators=(",", ": "), sort_keys=True + ) + file.write("\n") def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description='Export a JSON of slow test cases in PyTorch unit test suite') + description="Export a JSON of slow test cases in PyTorch unit test suite" + ) parser.add_argument( - '-f', - '--filename', - nargs='?', + "-f", + "--filename", + nargs="?", type=str, default=SLOW_TESTS_FILE, const=SLOW_TESTS_FILE, - help='Specify a file path to dump slow test times from previous S3 stats. Default file path: .pytorch-slow-tests.json', + help="Specify a file path to dump slow test times from previous S3 stats. Default file path: .pytorch-slow-tests.json", ) parser.add_argument( - '--ignore-small-diffs', - nargs='?', + "--ignore-small-diffs", + nargs="?", type=float, const=RELATIVE_DIFFERENCE_THRESHOLD, - help='Compares generated results with stats/slow-tests.json in pytorch/test-infra. If the relative differences ' - 'between test times for each test are smaller than the threshold and the set of test cases have not ' - 'changed, we will export the stats already in stats/slow-tests.json. Else, we will export the calculated ' - 'results. The default threshold is 10%.', + help="Compares generated results with stats/slow-tests.json in pytorch/test-infra. If the relative differences " + "between test times for each test are smaller than the threshold and the set of test cases have not " + "changed, we will export the stats already in stats/slow-tests.json. Else, we will export the calculated " + "results. The default threshold is 10%.", ) return parser.parse_args() @@ -101,5 +127,5 @@ def main() -> None: export_slow_tests(options) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index f6250a182bef..7249c5fccb65 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -8,30 +8,34 @@ from typing import Any, Callable, Dict, List, Optional, cast from urllib.request import urlopen -# PYTORCH_IGNORE_DISABLED_ISSUES should only be set during CI (along with IN_CI) as a -# comma-separated list of issue numbers. The intent is to re-enable any disabled tests -# associated with the issues in this list. -# -# There is normally no reason to use this locally as the disabled tests list should not -# affect your local development and every test should be enabled. If for whatever reason -# you would like to use this during local development, please note the following caveat: -# -# Whenever you set OR reset PYTORCH_IGNORE_DISABLED_ISSUES, you should delete the existing -# .pytorch-disabled-tests.json and redownload/parse the file for your change to apply, as -# PYTORCH_IGNORE_DISABLED_ISSUES is used during the parsing stage. To download the files, -# run test/run_test.py with IN_CI=1. 
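
For reference, a minimal sketch of the relative-difference check that too_similar() above performs when --ignore-small-diffs is used; the test names and timing values below are invented for illustration, and the 0.1 threshold mirrors RELATIVE_DIFFERENCE_THRESHOLD.

def relative_difference(a: float, b: float) -> float:
    # mirrors the abs((other - this) / max(other, this)) expression in too_similar()
    return abs((a - b) / max(a, b))

# Hypothetical freshly calculated times vs. hypothetical published slow-tests.json contents.
calculated = {"test_x (__main__.TestFoo)": 61.0, "test_y (__main__.TestBar)": 75.0}
published = {"test_x (__main__.TestFoo)": 60.0, "test_y (__main__.TestBar)": 90.0}

same_keys = calculated.keys() == published.keys()
all_close = same_keys and all(
    relative_difference(calculated[k], published[k]) <= 0.1 for k in calculated
)
# test_y differs by roughly 16.7% (above the 10% threshold), so the freshly
# calculated times would be exported instead of the already-published ones.
print(all_close)  # False
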
-IGNORE_DISABLED_ISSUES: List[str] = os.getenv('PYTORCH_IGNORE_DISABLED_ISSUES', '').split(',') - -SLOW_TESTS_FILE = '.pytorch-slow-tests.json' -DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json' + +def get_disabled_issues() -> List[str]: + pr_body = os.getenv("PR_BODY", "") + commit_messages = os.getenv("COMMIT_MESSAGES", "") + # The below regex is meant to match all *case-insensitive* keywords that + # GitHub has delineated would link PRs to issues, more details here: + # https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue. + # E.g., "Close #62851", "fixES #62851" and "RESOLVED #62851" would all match, but not + # "closes #62851" --> extra space, "fixing #62851" --> not a keyword, nor "fix 62851" --> no # + regex = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)" + issue_numbers = [x[5] for x in re.findall(regex, pr_body + commit_messages)] + print("Ignoring disabled issues: ", issue_numbers) + return issue_numbers + + +IGNORE_DISABLED_ISSUES: List[str] = get_disabled_issues() + +SLOW_TESTS_FILE = ".pytorch-slow-tests.json" +DISABLED_TESTS_FILE = ".pytorch-disabled-tests.json" FILE_CACHE_LIFESPAN_SECONDS = datetime.timedelta(hours=3).seconds + def fetch_and_cache( dirpath: str, name: str, url: str, - process_fn: Callable[[Dict[str, Any]], Dict[str, Any]] + process_fn: Callable[[Dict[str, Any]], Dict[str, Any]], ) -> Dict[str, Any]: """ This fetch and cache utils allows sharing between different process. @@ -54,18 +58,20 @@ def is_cached_file_valid() -> bool: for _ in range(3): try: - contents = urlopen(url, timeout=5).read().decode('utf-8') + contents = urlopen(url, timeout=5).read().decode("utf-8") processed_contents = process_fn(json.loads(contents)) with open(path, "w") as f: f.write(json.dumps(processed_contents)) return processed_contents except Exception as e: - print(f'Could not download {url} because: {e}.') - print(f'All retries exhausted, downloading {url} failed.') + print(f"Could not download {url} because: {e}.") + print(f"All retries exhausted, downloading {url} failed.") return {} -def get_slow_tests(dirpath: str, filename: str = SLOW_TESTS_FILE) -> Optional[Dict[str, float]]: +def get_slow_tests( + dirpath: str, filename: str = SLOW_TESTS_FILE +) -> Optional[Dict[str, float]]: url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/slow-tests.json" try: return fetch_and_cache(dirpath, filename, url, lambda x: x) @@ -74,28 +80,38 @@ def get_slow_tests(dirpath: str, filename: str = SLOW_TESTS_FILE) -> Optional[Di return {} -def get_disabled_tests(dirpath: str, filename: str = DISABLED_TESTS_FILE) -> Optional[Dict[str, Any]]: +def get_disabled_tests( + dirpath: str, filename: str = DISABLED_TESTS_FILE +) -> Optional[Dict[str, Any]]: def process_disabled_test(the_response: Dict[str, Any]) -> Dict[str, Any]: disabled_test_from_issues = dict() - for item in the_response['items']: - title = item['title'] - key = 'DISABLED ' - issue_url = item['html_url'] - issue_number = issue_url.split('/')[-1] + for item in the_response["items"]: + title = item["title"] + key = "DISABLED " + issue_url = item["html_url"] + issue_number = issue_url.split("/")[-1] if title.startswith(key) and issue_number not in IGNORE_DISABLED_ISSUES: - test_name = title[len(key):].strip() - body = item['body'] + test_name = title[len(key) :].strip() + body = item["body"] platforms_to_skip = [] - key = 'platforms:' - for line in body.splitlines(): - line = line.lower() - if 
line.startswith(key): - pattern = re.compile(r"^\s+|\s*,\s*|\s+$") - platforms_to_skip.extend([x for x in pattern.split(line[len(key):]) if x]) - disabled_test_from_issues[test_name] = (item['html_url'], platforms_to_skip) + key = "platforms:" + # When the issue has no body, it is assumed that all platforms should skip the test + if body is not None: + for line in body.splitlines(): + line = line.lower() + if line.startswith(key): + pattern = re.compile(r"^\s+|\s*,\s*|\s+$") + platforms_to_skip.extend( + [x for x in pattern.split(line[len(key) :]) if x] + ) + disabled_test_from_issues[test_name] = ( + item["html_url"], + platforms_to_skip, + ) return disabled_test_from_issues + try: - url = 'https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/disabled-tests.json' + url = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/disabled-tests.json" return fetch_and_cache(dirpath, filename, url, process_disabled_test) except Exception: print("Couldn't download test skip set, leaving all tests enabled...") diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py index 0555945e4786..44cd4e43dbb4 100755 --- a/tools/stats/print_test_stats.py +++ b/tools/stats/print_test_stats.py @@ -12,15 +12,36 @@ import time from collections import defaultdict from pathlib import Path -from typing import (Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, - Set, Tuple, cast) +from typing import ( + Any, + DefaultDict, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, + Tuple, + cast, +) from xml.dom import minidom from typing_extensions import TypedDict -from tools.stats.s3_stat_parser import (newify_case, get_S3_object_from_bucket, get_test_stats_summaries_for_job, - Report, Status, Commit, HAVE_BOTO3, Version2Case, VersionedReport, - Version1Report, Version2Report, ReportMetaMeta) -from tools.stats.scribe import send_to_scribe, rds_write, register_rds_schema, schema_from_sample +from tools.stats.s3_stat_parser import ( + newify_case, + get_S3_object_from_bucket, + get_test_stats_summaries_for_job, + Report, + Status, + Commit, + HAVE_BOTO3, + Version2Case, + VersionedReport, + Version1Report, + Version2Report, + ReportMetaMeta, +) +from tools.stats.scribe import send_to_scribe SimplerSuite = Dict[str, Version2Case] @@ -61,12 +82,12 @@ class SuiteDiff(TypedDict): # share a name (for version 2 reports) or using a list of cases rather # than a dict. def simplify(report: Report) -> SimplerReport: - if 'format_version' not in report: # version 1 implicitly + if "format_version" not in report: # version 1 implicitly v1report = cast(Version1Report, report) return { # we just don't have test filename information sadly, so we # just make one fake filename that is the empty string - '': { + "": { suite_name: { # This clobbers some cases that have duplicate names # because in version 1, we would merge together all @@ -80,35 +101,41 @@ def simplify(report: Report) -> SimplerReport: # we're only uploading in the new format (where # everything is also keyed by filename) going # forward, it shouldn't matter too much. 
- case['name']: newify_case(case) - for case in suite['cases'] + case["name"]: newify_case(case) + for case in suite["cases"] } - for suite_name, suite in v1report['suites'].items() + for suite_name, suite in v1report["suites"].items() } } else: v_report = cast(VersionedReport, report) - version = v_report['format_version'] + version = v_report["format_version"] if version == 2: v2report = cast(Version2Report, v_report) return { filename: { - suite_name: suite['cases'] - for suite_name, suite in file_data['suites'].items() + suite_name: suite["cases"] + for suite_name, suite in file_data["suites"].items() } - for filename, file_data in v2report['files'].items() + for filename, file_data in v2report["files"].items() } else: - raise RuntimeError(f'Unknown format version: {version}') + raise RuntimeError(f"Unknown format version: {version}") def plural(n: int) -> str: - return '' if n == 1 else 's' + return "" if n == 1 else "s" def get_base_commit(sha1: str) -> str: + default_branch = os.environ.get("GIT_DEFAULT_BRANCH") + # capture None and "" cases + if not default_branch: + default_branch = "master" + + default_remote = f"origin/{default_branch}" return subprocess.check_output( - ["git", "merge-base", sha1, "origin/master"], + ["git", "merge-base", sha1, default_remote], encoding="ascii", ).strip() @@ -118,28 +145,28 @@ def display_stat( format: Tuple[Tuple[int, int], Tuple[int, int]], ) -> str: spread_len = format[1][0] + 1 + format[1][1] - spread = x['spread'] + spread = x["spread"] if spread is not None: - spread_str = f' ± {spread:{spread_len}.{format[1][1]}f}s' + spread_str = f" ± {spread:{spread_len}.{format[1][1]}f}s" else: - spread_str = ' ' * (3 + spread_len + 1) + spread_str = " " * (3 + spread_len + 1) mean_len = format[0][0] + 1 + format[0][1] return f'{x["center"]:{mean_len}.{format[0][1]}f}s{spread_str}' def list_stat(l: List[float]) -> Stat: return { - 'center': statistics.mean(l), - 'spread': statistics.stdev(l) if len(l) > 1 else None + "center": statistics.mean(l), + "spread": statistics.stdev(l) if len(l) > 1 else None, } def zero_stat() -> Stat: - return {'center': 0, 'spread': None} + return {"center": 0, "spread": None} def recenter(was: Stat, now: float) -> Stat: - return {'center': now - was['center'], 'spread': was['spread']} + return {"center": now - was["center"], "spread": was["spread"]} def sum_normals(stats: Iterable[Stat]) -> Stat: @@ -151,29 +178,29 @@ def sum_normals(stats: Iterable[Stat]) -> Stat: """ l = list(stats) spread: Optional[float] - if any(stat['spread'] is not None for stat in l): - spread = math.sqrt(sum((stat['spread'] or 0)**2 for stat in l)) + if any(stat["spread"] is not None for stat in l): + spread = math.sqrt(sum((stat["spread"] or 0) ** 2 for stat in l)) else: spread = None return { - 'center': sum(stat['center'] for stat in l), - 'spread': spread, + "center": sum(stat["center"] for stat in l), + "spread": spread, } def format_seconds(seconds: List[float]) -> str: if len(seconds) > 0: x = list_stat(seconds) - return f'total time {display_stat(x, ((5, 2), (4, 2)))}'.strip() - return '' + return f"total time {display_stat(x, ((5, 2), (4, 2)))}".strip() + return "" def show_ancestors(num_commits: int) -> str: - return f' | : ({num_commits} commit{plural(num_commits)})' + return f" | : ({num_commits} commit{plural(num_commits)})" def unlines(lines: List[str]) -> str: - return ''.join(f'{line}\n' for line in lines) + return "".join(f"{line}\n" for line in lines) def matching_test_times( @@ -193,8 +220,8 @@ def matching_test_times( if suite: 
case = suite.get(case_name) if case: - t = case['seconds'] - s = case['status'] + t = case["seconds"] + s = case["status"] if s == status: times.append(t) return times @@ -206,7 +233,7 @@ def analyze( base_reports: Dict[Commit, List[SimplerReport]], ) -> List[SuiteDiff]: nonempty_shas = [sha for sha, reports in base_reports.items() if reports] - # most recent master ancestor with at least one S3 report, + # most recent main ancestor with at least one S3 report, # or empty list if there are none (will show all tests as added) base_report = base_reports[nonempty_shas[0]] if nonempty_shas else [] @@ -226,37 +253,49 @@ def analyze( for filename, suite_name in sorted(all_suites): case_diffs: List[CaseDiff] = [] head_suite = head_report.get(filename, {}).get(suite_name) - base_cases: Dict[str, Status] = dict(sorted(set.intersection(*[ - { - (n, case['status']) - for n, case - in report.get(filename, {}).get(suite_name, {}).items() - } - for report in base_report - ] or [set()]))) + base_cases: Dict[str, Status] = dict( + sorted( + set.intersection( + *[ + { + (n, case["status"]) + for n, case in report.get(filename, {}) + .get(suite_name, {}) + .items() + } + for report in base_report + ] + or [set()] + ) + ) + ) case_stats: Dict[str, Stat] = {} if head_suite: - now = sum(case['seconds'] for case in head_suite.values()) + now = sum(case["seconds"] for case in head_suite.values()) if any( filename in report and suite_name in report[filename] for report in base_report ): removed_cases: List[CaseDiff] = [] for case_name, case_status in base_cases.items(): - case_stats[case_name] = list_stat(matching_test_times( - base_reports=base_reports, - filename=filename, - suite_name=suite_name, - case_name=case_name, - status=case_status, - )) + case_stats[case_name] = list_stat( + matching_test_times( + base_reports=base_reports, + filename=filename, + suite_name=suite_name, + case_name=case_name, + status=case_status, + ) + ) if case_name not in head_suite: - removed_cases.append({ - 'margin': '-', - 'name': case_name, - 'was': (case_stats[case_name], case_status), - 'now': None, - }) + removed_cases.append( + { + "margin": "-", + "name": case_name, + "was": (case_stats[case_name], case_status), + "now": None, + } + ) modified_cases: List[CaseDiff] = [] added_cases: List[CaseDiff] = [] for head_case_name in sorted(head_suite): @@ -264,70 +303,86 @@ def analyze( if head_case_name in base_cases: stat = case_stats[head_case_name] base_status = base_cases[head_case_name] - if head_case['status'] != base_status: - modified_cases.append({ - 'margin': '!', - 'name': head_case_name, - 'was': (stat, base_status), - 'now': head_case, - }) + if head_case["status"] != base_status: + modified_cases.append( + { + "margin": "!", + "name": head_case_name, + "was": (stat, base_status), + "now": head_case, + } + ) else: - added_cases.append({ - 'margin': '+', - 'name': head_case_name, - 'was': None, - 'now': head_case, - }) + added_cases.append( + { + "margin": "+", + "name": head_case_name, + "was": None, + "now": head_case, + } + ) # there might be a bug calculating this stdev, not sure was = sum_normals(case_stats.values()) case_diffs = removed_cases + modified_cases + added_cases if case_diffs: - modified_suites.append({ - 'margin': ' ', - 'name': suite_name, - 'was': was, - 'now': now, - 'cases': case_diffs, - }) + modified_suites.append( + { + "margin": " ", + "name": suite_name, + "was": was, + "now": now, + "cases": case_diffs, + } + ) else: for head_case_name in sorted(head_suite): head_case = 
head_suite[head_case_name] - case_diffs.append({ - 'margin': ' ', - 'name': head_case_name, - 'was': None, - 'now': head_case, - }) - added_suites.append({ - 'margin': '+', - 'name': suite_name, - 'was': None, - 'now': now, - 'cases': case_diffs, - }) + case_diffs.append( + { + "margin": " ", + "name": head_case_name, + "was": None, + "now": head_case, + } + ) + added_suites.append( + { + "margin": "+", + "name": suite_name, + "was": None, + "now": now, + "cases": case_diffs, + } + ) else: for case_name, case_status in base_cases.items(): - case_stats[case_name] = list_stat(matching_test_times( - base_reports=base_reports, - filename=filename, - suite_name=suite_name, - case_name=case_name, - status=case_status, - )) - case_diffs.append({ - 'margin': ' ', - 'name': case_name, - 'was': (case_stats[case_name], case_status), - 'now': None, - }) - removed_suites.append({ - 'margin': '-', - 'name': suite_name, - # there might be a bug calculating this stdev, not sure - 'was': sum_normals(case_stats.values()), - 'now': None, - 'cases': case_diffs, - }) + case_stats[case_name] = list_stat( + matching_test_times( + base_reports=base_reports, + filename=filename, + suite_name=suite_name, + case_name=case_name, + status=case_status, + ) + ) + case_diffs.append( + { + "margin": " ", + "name": case_name, + "was": (case_stats[case_name], case_status), + "now": None, + } + ) + removed_suites.append( + { + "margin": "-", + "name": suite_name, + # there might be a bug calculating this stdev, not sure + "was": sum_normals(case_stats.values()), + "now": None, + "cases": case_diffs, + } + ) return removed_suites + modified_suites + added_suites @@ -337,24 +392,24 @@ def case_diff_lines(diff: CaseDiff) -> List[str]: case_fmt = ((3, 3), (2, 3)) - was = diff['was'] + was = diff["was"] if was: - was_line = f' # was {display_stat(was[0], case_fmt)}' + was_line = f" # was {display_stat(was[0], case_fmt)}" was_status = was[1] if was_status: - was_line += f' ({was_status})' + was_line += f" ({was_status})" lines.append(was_line) - now = diff['now'] + now = diff["now"] if now: - now_stat: Stat = {'center': now['seconds'], 'spread': None} - now_line = f' # now {display_stat(now_stat, case_fmt)}' - now_status = now['status'] + now_stat: Stat = {"center": now["seconds"], "spread": None} + now_line = f" # now {display_stat(now_stat, case_fmt)}" + now_status = now["status"] if now_status: - now_line += f' ({now_status})' + now_line += f" ({now_status})" lines.append(now_line) - return [''] + [f'{diff["margin"]} {l}' for l in lines] + return [""] + [f'{diff["margin"]} {l}' for l in lines] def display_suite_diff(diff: SuiteDiff) -> str: @@ -362,23 +417,23 @@ def display_suite_diff(diff: SuiteDiff) -> str: suite_fmt = ((4, 2), (3, 2)) - was = diff['was'] + was = diff["was"] if was: - lines.append(f' # was {display_stat(was, suite_fmt)}') + lines.append(f" # was {display_stat(was, suite_fmt)}") - now = diff['now'] + now = diff["now"] if now is not None: - now_stat: Stat = {'center': now, 'spread': None} - lines.append(f' # now {display_stat(now_stat, suite_fmt)}') + now_stat: Stat = {"center": now, "spread": None} + lines.append(f" # now {display_stat(now_stat, suite_fmt)}") - for case_diff in diff['cases']: - lines.extend([f' {l}' for l in case_diff_lines(case_diff)]) + for case_diff in diff["cases"]: + lines.extend([f" {l}" for l in case_diff_lines(case_diff)]) - return unlines([''] + [f'{diff["margin"]} {l}'.rstrip() for l in lines] + ['']) + return unlines([""] + [f'{diff["margin"]} {l}'.rstrip() for l in lines] + [""]) 
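
As a small illustration of the Stat helpers above, using invented per-run timings: list_stat() reduces a list of durations to a center/spread pair, and sum_normals() adds the centers while combining the spreads in quadrature, which is how the suite-level "was" totals in analyze() are produced.

import math
import statistics
from typing import List

# Invented per-run durations (seconds) for two hypothetical test cases.
runs_a: List[float] = [1.0, 1.2, 1.4]
runs_b: List[float] = [2.0, 2.1]

def to_stat(times: List[float]) -> dict:
    # same shape as list_stat(): mean as center, sample stdev as spread
    return {
        "center": statistics.mean(times),
        "spread": statistics.stdev(times) if len(times) > 1 else None,
    }

stat_a, stat_b = to_stat(runs_a), to_stat(runs_b)

# Combining as sum_normals() does: centers add, spreads add in quadrature.
combined = {
    "center": stat_a["center"] + stat_b["center"],
    "spread": math.sqrt((stat_a["spread"] or 0) ** 2 + (stat_b["spread"] or 0) ** 2),
}
print(combined)  # roughly {'center': 3.25, 'spread': 0.21}
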
def anomalies(diffs: List[SuiteDiff]) -> str: - return ''.join(map(display_suite_diff, diffs)) + return "".join(map(display_suite_diff, diffs)) def graph( @@ -391,89 +446,91 @@ def graph( other_ancestors: int = 0, ) -> str: lines = [ - 'Commit graph (base is most recent master ancestor with at least one S3 report):', - '', - ' : (master)', - ' |', + "Commit graph (base is most recent master ancestor with at least one S3 report):", + "", + " : (master)", + " |", ] - head_time_str = f' {format_seconds([head_seconds])}' + head_time_str = f" {format_seconds([head_seconds])}" if on_master: - lines.append(f' * {head_sha[:10]} (HEAD) {head_time_str}') + lines.append(f" * {head_sha[:10]} (HEAD) {head_time_str}") else: - lines.append(f' | * {head_sha[:10]} (HEAD) {head_time_str}') + lines.append(f" | * {head_sha[:10]} (HEAD) {head_time_str}") if ancestry_path > 0: lines += [ - ' | |', + " | |", show_ancestors(ancestry_path), ] if other_ancestors > 0: lines += [ - ' |/|', + " |/|", show_ancestors(other_ancestors), - ' |', + " |", ] else: - lines.append(' |/') + lines.append(" |/") is_first = True for sha, seconds in base_seconds.items(): num_runs = len(seconds) prefix = str(num_runs).rjust(3) - base = '(base)' if is_first and num_runs > 0 else ' ' + base = "(base)" if is_first and num_runs > 0 else " " if num_runs > 0: is_first = False t = format_seconds(seconds) p = plural(num_runs) if t: - p = f'{p}, '.ljust(3) - lines.append(f' * {sha[:10]} {base} {prefix} report{p}{t}') + p = f"{p}, ".ljust(3) + lines.append(f" * {sha[:10]} {base} {prefix} report{p}{t}") - lines.extend([' |', ' :']) + lines.extend([" |", " :"]) return unlines(lines) def case_delta(case: CaseDiff) -> Stat: - was = case['was'] - now = case['now'] + was = case["was"] + now = case["now"] return recenter( was[0] if was else zero_stat(), - now['seconds'] if now else 0, + now["seconds"] if now else 0, ) def display_final_stat(stat: Stat) -> str: - center = stat['center'] - spread = stat['spread'] + center = stat["center"] + spread = stat["spread"] displayed = display_stat( - {'center': abs(center), 'spread': spread}, + {"center": abs(center), "spread": spread}, ((4, 2), (3, 2)), ) if center < 0: - sign = '-' + sign = "-" elif center > 0: - sign = '+' + sign = "+" else: - sign = ' ' - return f'{sign}{displayed}'.rstrip() + sign = " " + return f"{sign}{displayed}".rstrip() def summary_line(message: str, d: DefaultDict[str, List[CaseDiff]]) -> str: all_cases = [c for cs in d.values() for c in cs] tests = len(all_cases) suites = len(d) - sp = f'{plural(suites)})'.ljust(2) - tp = f'{plural(tests)},'.ljust(2) + sp = f"{plural(suites)})".ljust(2) + tp = f"{plural(tests)},".ljust(2) # there might be a bug calculating this stdev, not sure stat = sum_normals(case_delta(c) for c in all_cases) - return ''.join([ - f'{message} (across {suites:>4} suite{sp}', - f'{tests:>6} test{tp}', - f' totaling {display_final_stat(stat)}', - ]) + return "".join( + [ + f"{message} (across {suites:>4} suite{sp}", + f"{tests:>6} test{tp}", + f" totaling {display_final_stat(stat)}", + ] + ) def summary(analysis: List[SuiteDiff]) -> str: @@ -483,17 +540,17 @@ def summary(analysis: List[SuiteDiff]) -> str: for diff in analysis: # the use of 'margin' here is not the most elegant - name = diff['name'] - margin = diff['margin'] - cases = diff['cases'] - if margin == '-': + name = diff["name"] + margin = diff["margin"] + cases = diff["cases"] + if margin == "-": removed_tests[name] += cases - elif margin == '+': + elif margin == "+": added_tests[name] += cases else: - 
removed = list(filter(lambda c: c['margin'] == '-', cases)) - added = list(filter(lambda c: c['margin'] == '+', cases)) - modified = list(filter(lambda c: c['margin'] == '!', cases)) + removed = list(filter(lambda c: c["margin"] == "-", cases)) + added = list(filter(lambda c: c["margin"] == "+", cases)) + modified = list(filter(lambda c: c["margin"] == "!", cases)) if removed: removed_tests[name] += removed if added: @@ -501,11 +558,13 @@ def summary(analysis: List[SuiteDiff]) -> str: if modified: modified_tests[name] += modified - return unlines([ - summary_line('Removed ', removed_tests), - summary_line('Modified', modified_tests), - summary_line('Added ', added_tests), - ]) + return unlines( + [ + summary_line("Removed ", removed_tests), + summary_line("Modified", modified_tests), + summary_line("Added ", added_tests), + ] + ) def regression_info( @@ -525,7 +584,7 @@ def regression_info( and its test times. Since Python dicts maintain insertion order (guaranteed as part of the language spec since 3.7), the base_reports argument must list the head's several most recent - master commits, from newest to oldest (so the merge-base is + main commits, from newest to oldest (so the merge-base is list(base_reports)[0]). """ simpler_head = simplify(head_report) @@ -537,40 +596,49 @@ def regression_info( base_reports=simpler_base, ) - return '\n'.join([ - unlines([ - '----- Historic stats comparison result ------', - '', - f' job: {job_name}', - f' commit: {head_sha}', - ]), - - # don't print anomalies, because sometimes due to sharding, the - # output from this would be very long and obscure better signal - - # anomalies(analysis), - - graph( - head_sha=head_sha, - head_seconds=head_report['total_seconds'], - base_seconds={ - c: [r['total_seconds'] for r in rs] - for c, rs in base_reports.items() - }, - on_master=on_master, - ancestry_path=ancestry_path, - other_ancestors=other_ancestors, - ), - summary(analysis), - ]) + return "\n".join( + [ + unlines( + [ + "----- Historic stats comparison result ------", + "", + f" job: {job_name}", + f" commit: {head_sha}", + ] + ), + # don't print anomalies, because sometimes due to sharding, the + # output from this would be very long and obscure better signal + # anomalies(analysis), + graph( + head_sha=head_sha, + head_seconds=head_report["total_seconds"], + base_seconds={ + c: [r["total_seconds"] for r in rs] + for c, rs in base_reports.items() + }, + on_master=on_master, + ancestry_path=ancestry_path, + other_ancestors=other_ancestors, + ), + summary(analysis), + ] + ) class TestCase: def __init__(self, dom: Any) -> None: - self.class_name = str(dom.attributes['classname'].value) - self.name = str(dom.attributes['name'].value) - self.time = float(dom.attributes['time'].value) - error_elements = dom.getElementsByTagName('error') + self.class_name = str(dom.attributes["classname"].value) + self.name = str(dom.attributes["name"].value) + self.time = float(dom.attributes["time"].value) + # The following attribute is currently ONLY used in process_intentional_test_runs for validation + # reasons. The test filename that populates TestFile is calculated and passed down through the test report path. 
+ # The reason we don't just use this attribute is because it doesn't exist for cpp tests, e.g., in test_libtorch + self.file = ( + str(dom.attributes["file"].value) + if dom.hasAttribute("file") + else "N/A - probably a cpp test" + ) + error_elements = dom.getElementsByTagName("error") # DISCLAIMER: unexpected successes and expected failures are currently not reported in assemble_s3_object self.expected_failure = False self.skipped = False @@ -579,25 +647,32 @@ def __init__(self, dom: Any) -> None: if len(error_elements) > 0: # We are only expecting 1 element here error_element = error_elements[0] - self.unexpected_success = (error_element.hasAttribute('type') and - error_element.attributes['type'].value == 'UnexpectedSuccess') + self.unexpected_success = ( + error_element.hasAttribute("type") + and error_element.attributes["type"].value == "UnexpectedSuccess" + ) self.errored = not self.unexpected_success - skipped_elements = dom.getElementsByTagName('skipped') + skipped_elements = dom.getElementsByTagName("skipped") if len(skipped_elements) > 0: # We are only expecting 1 element here skipped_element = skipped_elements[0] - self.expected_failure = (skipped_element.hasAttribute('type') and - skipped_element.attributes['type'].value == 'XFAIL') + self.expected_failure = ( + skipped_element.hasAttribute("type") + and skipped_element.attributes["type"].value == "XFAIL" + ) self.skipped = not self.expected_failure - self.failed = len(dom.getElementsByTagName('failure')) > 0 + self.failed = len(dom.getElementsByTagName("failure")) > 0 def __repr__(self) -> str: return self.__str__() def __str__(self) -> str: - return f'[TestCase name: {self.name} | class_name: {self.class_name} | time: {self.time} | ' \ - f'expected_failure: {self.expected_failure} | skipped: {self.skipped} | errored: {self.errored} | ' \ - f'unexpected_success: {self.unexpected_success} | failed: {self.failed}]' + return ( + f"[TestCase name: {self.name} | class_name: {self.class_name} | file: {self.file} | time: {self.time} | " + f"expected_failure: {self.expected_failure} | skipped: {self.skipped} | errored: {self.errored} | " + f"unexpected_success: {self.unexpected_success} | failed: {self.failed}]\n" + ) + class TestSuite: def __init__(self, name: str) -> None: @@ -612,10 +687,12 @@ def __init__(self, name: str) -> None: self.expected_failure_count = 0 def __repr__(self) -> str: - rc = f'{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}' + rc = ( + f"{self.name} run_time: {self.total_time:.2f} tests: {len(self.test_cases)}" + ) if self.skipped_count > 0: - rc += f' skipped: {self.skipped_count}' - return f'TestSuite({rc})' + rc += f" skipped: {self.skipped_count}" + return f"TestSuite({rc})" def append(self, test_case: TestCase) -> None: self.test_cases[test_case.name] = test_case @@ -628,7 +705,9 @@ def append(self, test_case: TestCase) -> None: def update(self, test_case: TestCase) -> None: name = test_case.name - assert name in self.test_cases, f'Error: attempting to replace nonexistent test case {name}' + assert ( + name in self.test_cases + ), f"Error: attempting to replace nonexistent test case {name}" # Note that time for unexpected successes and expected failures are reported as 0s self.test_cases[name].time += test_case.time self.test_cases[name].failed |= test_case.failed @@ -637,54 +716,46 @@ def update(self, test_case: TestCase) -> None: self.test_cases[name].unexpected_success |= test_case.unexpected_success self.test_cases[name].expected_failure |= test_case.expected_failure - def 
print_report(self, num_longest: int = 3) -> None: - sorted_tests = sorted(self.test_cases.values(), key=lambda x: x.time) - test_count = len(sorted_tests) - print(f"class {self.name}:") - print( - f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} " - f"errored: {self.errored_count} unexpected_success: {self.unexpected_success_count} " - f"expected_failure: {self.expected_failure_count}") - print(f" run_time: {self.total_time:.2f} seconds") - print(f" avg_time: {self.total_time/test_count:.2f} seconds") - if test_count >= 2: - print(f" median_time: {statistics.median(x.time for x in sorted_tests):.2f} seconds") - sorted_tests = sorted_tests[-num_longest:] - print(f" {len(sorted_tests)} longest tests:") - for test in reversed(sorted_tests): - print(f" {test.name} time: {test.time:.2f} seconds") - print("") + +# Tests that spawn duplicates (usually only twice) intentionally +MULTITESTS = [ + "test_cpp_extensions_aot", + "distributed/test_distributed_spawn", + "distributed\\test_distributed_spawn", # for windows + "distributed/test_c10d_gloo", + "distributed\\test_c10d_gloo", # for windows + "cpp", # The caffe2 cpp tests spawn duplicate test cases as well. +] + DuplicatedDict = Dict[str, Dict[str, List[TestCase]]] + class TestFile: def __init__(self, name: str) -> None: self.name = name self.total_time = 0.0 self.test_suites: Dict[str, TestSuite] = dict() - def append(self, test_case: TestCase, test_type: str, duplicated_tests_dict: DuplicatedDict) -> None: - is_multi_test = self.name == 'test_cpp_extensions_aot' or \ - self.name == 'distributed/test_distributed_spawn' or \ - self.name == 'distributed/test_c10d_gloo' or \ - self.name == 'cpp' # The caffe2 cpp tests spawn duplicate test cases as well. - if is_multi_test: - suite_name = test_case.class_name + '__' + test_type - else: - suite_name = test_case.class_name + def append( + self, test_case: TestCase, test_type: str, duplicated_tests_dict: DuplicatedDict + ) -> None: + suite_name = test_case.class_name if suite_name not in self.test_suites: self.test_suites[suite_name] = TestSuite(suite_name) if test_case.name in self.test_suites[suite_name].test_cases: - if is_multi_test: + if self.name in MULTITESTS: self.test_suites[suite_name].update(test_case) self.total_time += test_case.time - else: - # Gather up duplicated test cases - if suite_name not in duplicated_tests_dict: - duplicated_tests_dict[suite_name] = dict() - if test_case.name not in duplicated_tests_dict[suite_name]: - duplicated_tests_dict[suite_name][test_case.name] = [self.test_suites[suite_name].test_cases[test_case.name]] - duplicated_tests_dict[suite_name][test_case.name].append(test_case) + + # Gather up duplicated test cases to parse for flaky reruns + if suite_name not in duplicated_tests_dict: + duplicated_tests_dict[suite_name] = dict() + if test_case.name not in duplicated_tests_dict[suite_name]: + duplicated_tests_dict[suite_name][test_case.name] = [ + self.test_suites[suite_name].test_cases[test_case.name] + ] + duplicated_tests_dict[suite_name][test_case.name].append(test_case) else: self.test_suites[suite_name].append(test_case) self.total_time += test_case.time @@ -696,7 +767,7 @@ def parse_report(path: str) -> Iterator[TestCase]: except Exception as e: print(f"Error occurred when parsing {path}: {e}") return - for test_case in dom.getElementsByTagName('testcase'): + for test_case in dom.getElementsByTagName("testcase"): yield TestCase(test_case) @@ -716,11 +787,11 @@ def get_recursive_files(folder: str, extension: str) -> 
Iterable[str]: def parse_reports(folder: str) -> Tuple[Dict[str, TestFile], Dict[str, DuplicatedDict]]: tests_by_file = dict() - duplicated_tests_by_file : Dict[str, DuplicatedDict] = dict() + duplicated_tests_by_file: Dict[str, DuplicatedDict] = dict() for report in get_recursive_files(folder, ".xml"): report_path = Path(report) # basename of the directory of test-report is the test filename - test_filename = re.sub(r'\.', '/', report_path.parent.name) + test_filename = re.sub(r"\.", "/", report_path.parent.name) # test type is the parent directory (only applies to dist-*) # See: CUSTOM_HANDLERS in test/run_test.py test_type = report_path.parent.parent.name @@ -729,7 +800,9 @@ def parse_reports(folder: str) -> Tuple[Dict[str, TestFile], Dict[str, Duplicate if test_filename not in tests_by_file: tests_by_file[test_filename] = TestFile(test_filename) for test_case in parse_report(report): - tests_by_file[test_filename].append(test_case, test_type, duplicated_tests_by_file[test_filename]) + tests_by_file[test_filename].append( + test_case, test_type, duplicated_tests_by_file[test_filename] + ) return tests_by_file, duplicated_tests_by_file @@ -754,59 +827,74 @@ def process_intentional_test_runs(runs: List[TestCase]) -> Tuple[int, int]: else: num_pass += 1 - REPEAT_TEST_FOR_TYPES_TESTS = [ - "test_data_parallel_module", - "test_data_parallel_module_kwargs_only", - "test_data_parallel_module_kwargs_only_empty_list", - "test_data_parallel_module_kwargs_only_empty_dict", - "test_data_parallel_module_kwargs_only_empty_tuple" - ] - - # Do not run checks for tests that use repeat_test_for_types decorator as they do not go well with our retry - # functionality. Once issue https://github.com/pytorch/pytorch/issues/69865 is fixed, we should remove the exception - if not any([x in test_run.name for x in REPEAT_TEST_FOR_TYPES_TESTS]): - err_msg = f'Warning: unintentional test case duplicates found for {test_run.name} in suite {test_run.class_name}.' - report_only = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') != '1' - if report_only and num_fail + num_errored + num_unexpected_success < 1 or not report_only and num_expected_fail < 1: - raise RuntimeWarning(f'{err_msg} Intentional reruns are only triggered when the first run fails or errors, but' - ' we found no failures nor errors.') + # Do not run duplication checks for test files that spawn duplicate tests intentionally + # and are not necessarily flaky test reruns. + if not any(x in test_run.file for x in MULTITESTS): + err_msg = f"Warning: unintentional test case duplicates found for {test_run.name} in suite {test_run.class_name}." + report_only = os.getenv("PYTORCH_OVERRIDE_FLAKY_SIGNAL") != "1" + if ( + report_only + and num_fail + num_errored + num_unexpected_success < 1 + or not report_only + and num_expected_fail < 1 + ): + raise RuntimeWarning( + f"{err_msg} Intentional reruns are only triggered when the first run fails or errors, but" + " we found no failures nor errors." + ) if num_unexpected_success + num_expected_fail < 1: - raise RuntimeWarning(f'{err_msg} Intentional reruns should raise at least one unexpected success or expected ' - 'failure, but none have been found.') + raise RuntimeWarning( + f"{err_msg} Intentional reruns should raise at least one unexpected success or expected " + "failure, but none have been found." + ) if report_only and num_pass != num_unexpected_success: - raise RuntimeWarning(f'{err_msg} Every success in an intentional rerun is shadowed by one unexpected success.' 
- f'However, successes = {num_pass} and unexpected successes = {num_unexpected_success}') + raise RuntimeWarning( + f"{err_msg} Every success in an intentional rerun is shadowed by one unexpected success." + f"However, successes = {num_pass} and unexpected successes = {num_unexpected_success}" + ) if not report_only and num_pass > 1: - raise RuntimeWarning(f'{err_msg} There should be at most 1 successful run in an intentional rerun that stops' - f' at first success. The number of successful runs = {num_pass}') + raise RuntimeWarning( + f"{err_msg} There should be at most 1 successful run in an intentional rerun that stops" + f" at first success. The number of successful runs = {num_pass}" + ) if num_skipped > 0: - raise RuntimeWarning(f'{err_msg} No skips should occur in intentional reruns, but skips = {num_skipped}') - return max(num_unexpected_success, num_pass), num_fail + num_expected_fail + num_errored + raise RuntimeWarning( + f"{err_msg} No skips should occur in intentional reruns, but skips = {num_skipped}" + ) + return ( + max(num_unexpected_success, num_pass), + num_fail + num_expected_fail + num_errored, + ) -def assemble_flaky_test_stats(duplicated_tests_by_file: Dict[str, DuplicatedDict]) -> Any: +def write_flaky_test_stats_to_rockset( + duplicated_tests_by_file: Dict[str, DuplicatedDict] +) -> Any: flaky_tests = [] - workflow_id = os.environ.get("GITHUB_RUN_ID", os.environ.get("CIRCLE_WORKFLOW_ID", None)) + workflow_id = os.environ.get( + "GITHUB_RUN_ID", os.environ.get("CIRCLE_WORKFLOW_ID", None) + ) for file_name, suite_to_dict in duplicated_tests_by_file.items(): for suite_name, testcase_to_runs in suite_to_dict.items(): for testcase_name, list_of_runs in testcase_to_runs.items(): num_green, num_red = process_intentional_test_runs(list_of_runs) - if num_green > 0: # Otherwise, it's likely just a failing test - flaky_tests.append({ - "name": testcase_name, - "suite": suite_name, - "file": file_name, - "num_green": num_green, - "num_red": num_red, - }) + if ( + num_green > 0 and num_red > 0 + ): # Flaky tests show different results in consecutive reruns + flaky_tests.append( + { + "name": testcase_name, + "suite": suite_name, + "file": file_name, + "num_green": num_green, + "num_red": num_red, + } + ) if len(flaky_tests) > 0: - # write to RDS - register_rds_schema("flaky_tests", schema_from_sample(flaky_tests[0])) - rds_write("flaky_tests", flaky_tests, only_on_master=False) - - # write to S3 to go to Rockset as well import uuid + for flaky_test in flaky_tests: + flaky_test["job_id"] = os.environ["GHA_WORKFLOW_JOB_ID"] flaky_test["workflow_id"] = workflow_id key = f"flaky_tests/{workflow_id}/{uuid.uuid4()}.json" obj = get_S3_object_from_bucket("ossci-raw-job-status", key) @@ -818,11 +906,17 @@ def build_info() -> ReportMetaMeta: "build_pr": os.environ.get("PR_NUMBER", os.environ.get("CIRCLE_PR_NUMBER", "")), "build_tag": os.environ.get("TAG", os.environ.get("CIRCLE_TAG", "")), "build_sha1": os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "")), - "build_base_commit": get_base_commit(os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD"))), + "build_base_commit": get_base_commit( + os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD")) + ), "build_branch": os.environ.get("BRANCH", os.environ.get("CIRCLE_BRANCH", "")), "build_job": os.environ.get("JOB_BASE_NAME", os.environ.get("CIRCLE_JOB", "")), - "build_workflow_id": os.environ.get("WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID", "")), - "build_start_time_epoch": 
str(int(os.path.getmtime(os.path.realpath(__file__)))), + "build_workflow_id": os.environ.get( + "WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID", "") + ), + "build_start_time_epoch": str( + int(os.path.getmtime(os.path.realpath(__file__))) + ), } @@ -830,7 +924,7 @@ def build_message( test_file: TestFile, test_suite: TestSuite, test_case: TestCase, - meta_info: ReportMetaMeta + meta_info: ReportMetaMeta, ) -> Dict[str, Dict[str, Any]]: return { "normal": { @@ -856,7 +950,9 @@ def send_report_to_scribe(reports: Dict[str, TestFile]) -> None: [ { "category": "perfpipe_pytorch_test_times", - "message": json.dumps(build_message(test_file, test_suite, test_case, meta_info)), + "message": json.dumps( + build_message(test_file, test_suite, test_case, meta_info) + ), "line_escape": False, } for test_file in reports.values() @@ -875,50 +971,50 @@ def assemble_s3_object( ) -> Version2Report: return { **build_info(), # type: ignore[misc] - 'total_seconds': total_seconds, - 'format_version': 2, - 'files': { + "total_seconds": total_seconds, + "format_version": 2, + "files": { name: { - 'total_seconds': test_file.total_time, - 'suites': { + "total_seconds": test_file.total_time, + "suites": { name: { - 'total_seconds': suite.total_time, - 'cases': { + "total_seconds": suite.total_time, + "cases": { name: { - 'seconds': case.time, - 'status': 'errored' if case.errored else - 'failed' if case.failed else - 'skipped' if case.skipped else None + "seconds": case.time, + "status": "errored" + if case.errored + else "failed" + if case.failed + else "skipped" + if case.skipped + else None, } for name, case in suite.test_cases.items() }, } for name, suite in test_file.test_suites.items() - } + }, } for name, test_file in reports.items() - } + }, } def send_report_to_s3(head_report: Version2Report) -> None: - job = os.getenv('JOB_BASE_NAME', os.environ.get('CIRCLE_JOB')) - sha1 = os.environ.get('SHA1', os.environ.get('CIRCLE_SHA1', '')) - branch = os.environ.get('BRANCH', os.environ.get('CIRCLE_BRANCH', '')) + job = os.getenv("JOB_BASE_NAME", os.environ.get("CIRCLE_JOB")) + sha1 = os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "")) now = datetime.datetime.utcnow().isoformat() # SHARD_NUMBER and TEST_CONFIG are specific to GHA, as these details would be included in CIRCLE_JOB already - shard = os.environ.get('SHARD_NUMBER', '') - test_config = os.environ.get('TEST_CONFIG') + shard = os.environ.get("SHARD_NUMBER", "") + test_config = os.environ.get("TEST_CONFIG") - job_report_dirname = f'{job}{f"-{test_config}" if test_config is not None else ""}{shard}' - - if branch not in ['master', 'nightly'] and not branch.startswith("release/"): - pr = os.environ.get('PR_NUMBER', os.environ.get('CIRCLE_PR_NUMBER', 'unknown')) - key = f'pr_test_time/{pr}/{sha1}/{job_report_dirname}/{now}Z.json.bz2' # Z meaning UTC - else: - key = f'test_time/{sha1}/{job_report_dirname}/{now}Z.json.bz2' # Z meaning UTC - obj = get_S3_object_from_bucket('ossci-metrics', key) + job_report_dirname = ( + f'{job}{f"-{test_config}" if test_config is not None else ""}{shard}' + ) + key = f"test_time/{sha1}/{job_report_dirname}/{now}Z.json.bz2" # Z meaning UTC + obj = get_S3_object_from_bucket("ossci-metrics", key) # use bz2 because the results are smaller than gzip, and the # compression time penalty we pay is only about half a second for # input files of a few megabytes in size like these JSON files, and @@ -927,46 +1023,25 @@ def send_report_to_s3(head_report: Version2Report) -> None: 
obj.put(Body=bz2.compress(json.dumps(head_report).encode())) -def upload_failures_to_rds(reports: Dict[str, TestFile]) -> None: - """ - We have 40k+ tests, so saving every test for every commit is not very - feasible for PyTorch. Most of these are things we don't care about anyways, - so this code filters out failures and saves only those to the DB. - """ - # Gather all failures across the entire report - failures = [] - for file in reports.values(): - for suite in file.test_suites.values(): - for case in suite.test_cases.values(): - if case.errored or case.failed: - failures.append({ - "name": case.name, - "suite": suite.name, - "file": file.name, - "status": "failure" if case.failed else "error" - }) - - if len(failures) > 0: - register_rds_schema("test_failures", schema_from_sample(failures[0])) - rds_write("test_failures", failures, only_on_master=False) - - def print_regressions(head_report: Report, *, num_prev_commits: int) -> None: sha1 = os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1", "HEAD")) base = get_base_commit(sha1) count_spec = f"{base}..{sha1}" - intermediate_commits = int(subprocess.check_output( - ["git", "rev-list", "--count", count_spec], - encoding="ascii" - )) - ancestry_path = int(subprocess.check_output( - ["git", "rev-list", "--ancestry-path", "--count", count_spec], - encoding="ascii", - )) + intermediate_commits = int( + subprocess.check_output( + ["git", "rev-list", "--count", count_spec], encoding="ascii" + ) + ) + ancestry_path = int( + subprocess.check_output( + ["git", "rev-list", "--ancestry-path", "--count", count_spec], + encoding="ascii", + ) + ) - # if current commit is already on master, we need to exclude it from + # if current commit is already on main, we need to exclude it from # this history; otherwise we include the merge-base commits = subprocess.check_output( ["git", "rev-list", f"--max-count={num_prev_commits+1}", base], @@ -989,15 +1064,18 @@ def print_regressions(head_report: Report, *, num_prev_commits: int) -> None: objects[commit].extend(summary) print() - print(regression_info( - head_sha=sha1, - head_report=head_report, - base_reports=objects, - job_name=job, - on_master=on_master, - ancestry_path=ancestry_path - 1, - other_ancestors=intermediate_commits - ancestry_path, - ), end="") + print( + regression_info( + head_sha=sha1, + head_report=head_report, + base_reports=objects, + job_name=job, + on_master=on_master, + ancestry_path=ancestry_path - 1, + other_ancestors=intermediate_commits - ancestry_path, + ), + end="", + ) def positive_integer(value: str) -> float: @@ -1022,9 +1100,10 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: return True -if __name__ == '__main__': +if __name__ == "__main__": import argparse import sys + parser = argparse.ArgumentParser( "Print statistics from test XML output.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, @@ -1080,9 +1159,8 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: args = parser.parse_args() reports_by_file, duplicated_tests_by_file = parse_reports(args.folder) - assemble_flaky_test_stats(duplicated_tests_by_file) + write_flaky_test_stats_to_rockset(duplicated_tests_by_file) - upload_failures_to_rds(reports_by_file) if reports_has_no_tests(reports_by_file): print(f"No tests in reports found in {args.folder}") sys.exit(0) @@ -1092,16 +1170,10 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: except Exception as e: print(f"ERROR ENCOUNTERED WHEN UPLOADING TO SCRIBE: {e}") - # longest_tests can contain duplicates as the 
same tests can be spawned from different files - longest_tests: List[TestCase] = [] total_time = 0.0 for filename, test_filename in reports_by_file.items(): for suite_name, test_suite in test_filename.test_suites.items(): total_time += test_suite.total_time - if test_suite.total_time >= args.class_print_threshold: - test_suite.print_report(args.longest_of_class) - longest_tests.extend(test_suite.test_cases.values()) - longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:] obj = assemble_s3_object(reports_by_file, total_seconds=total_time) @@ -1111,14 +1183,6 @@ def reports_has_no_tests(reports: Dict[str, TestFile]) -> bool: except Exception as e: print(f"ERROR ENCOUNTERED WHEN UPLOADING TO S3: {e}") - print(f"Total runtime is {datetime.timedelta(seconds=total_time)}") - print( - f"{len(longest_tests)} longest tests of entire run" - f" (ignoring suites totaling less than {args.class_print_threshold} seconds):" - ) - for test_case in reversed(longest_tests): - print(f" {test_case.class_name}.{test_case.name} time: {test_case.time:.2f} seconds") - if args.compare_with_s3: head_json = obj if args.use_json: diff --git a/tools/stats/s3_stat_parser.py b/tools/stats/s3_stat_parser.py index 71474bf487cd..666b9f6b4547 100644 --- a/tools/stats/s3_stat_parser.py +++ b/tools/stats/s3_stat_parser.py @@ -10,6 +10,7 @@ try: import boto3 # type: ignore[import] import botocore # type: ignore[import] + HAVE_BOTO3 = True except ImportError: HAVE_BOTO3 = False @@ -18,10 +19,10 @@ logger = logging.getLogger(__name__) -OSSCI_METRICS_BUCKET = 'ossci-metrics' +OSSCI_METRICS_BUCKET = "ossci-metrics" Commit = str # 40-digit SHA-1 hex string -Status = Optional[Literal['errored', 'failed', 'skipped']] +Status = Optional[Literal["errored", "failed", "skipped"]] class CaseMeta(TypedDict): @@ -85,8 +86,10 @@ class Version2Report(VersionedReport): Report = Union[Version1Report, VersionedReport] if HAVE_BOTO3: - S3_RESOURCE_READ_ONLY = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)) - S3_RESOURCE = boto3.resource('s3') + S3_RESOURCE_READ_ONLY = boto3.resource( + "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) + ) + S3_RESOURCE = boto3.resource("s3") def get_S3_bucket_readonly(bucket_name: str) -> Any: @@ -98,16 +101,16 @@ def get_S3_object_from_bucket(bucket_name: str, object: str) -> Any: def case_status(case: Version1Case) -> Status: - for k in {'errored', 'failed', 'skipped'}: - if case[k]: # type: ignore[misc] + for k in {"errored", "failed", "skipped"}: + if case[k]: # type: ignore[literal-required] return cast(Status, k) return None def newify_case(case: Version1Case) -> Version2Case: return { - 'seconds': case['seconds'], - 'status': case_status(case), + "seconds": case["seconds"], + "status": case_status(case), } @@ -119,28 +122,28 @@ def get_cases( test_name: Optional[str], ) -> List[Version2Case]: cases: List[Version2Case] = [] - if 'format_version' not in data: # version 1 implicitly + if "format_version" not in data: # version 1 implicitly v1report = cast(Version1Report, data) - suites = v1report['suites'] + suites = v1report["suites"] for sname, v1suite in suites.items(): if not suite_name or sname == suite_name: - for v1case in v1suite['cases']: - if not test_name or v1case['name'] == test_name: + for v1case in v1suite["cases"]: + if not test_name or v1case["name"] == test_name: cases.append(newify_case(v1case)) else: v_report = cast(VersionedReport, data) - version = v_report['format_version'] + version = 
v_report["format_version"] if version == 2: v2report = cast(Version2Report, v_report) - for fname, v2file in v2report['files'].items(): + for fname, v2file in v2report["files"].items(): if fname == filename or not filename: - for sname, v2suite in v2file['suites'].items(): + for sname, v2suite in v2file["suites"].items(): if sname == suite_name or not suite_name: - for cname, v2case in v2suite['cases'].items(): + for cname, v2case in v2suite["cases"].items(): if not test_name or cname == test_name: cases.append(v2case) else: - raise RuntimeError(f'Unknown format version: {version}') + raise RuntimeError(f"Unknown format version: {version}") return cases @@ -148,19 +151,22 @@ def _parse_master_summaries(summaries: Any, jobs: List[str]) -> Dict[str, List[R summary_dict = defaultdict(list) for summary in summaries: # master summary format: "test_time/{sha}/{job}/file" - summary_job = summary.key.split('/')[2] + summary_job = summary.key.split("/")[2] if summary_job in jobs or len(jobs) == 0: binary = summary.get()["Body"].read() string = bz2.decompress(binary).decode("utf-8") summary_dict[summary_job].append(json.loads(string)) return summary_dict -def _parse_pr_summaries(summaries: Any, job_prefix: str) -> Dict[str, List[Tuple[Report, str]]]: + +def _parse_pr_summaries( + summaries: Any, job_prefix: str +) -> Dict[str, List[Tuple[Report, str]]]: summary_dict = defaultdict(list) for summary in summaries: # PR summary format: "pr_test_time/{pr}/{sha}/{job}/file" - summary_job = summary.key.split('/')[3] - summary_timestamp = summary.key.split('/')[4][:len("YYYY-MM-ddTHH:mm:ss")] + summary_job = summary.key.split("/")[3] + summary_timestamp = summary.key.split("/")[4][: len("YYYY-MM-ddTHH:mm:ss")] if not job_prefix or len(job_prefix) == 0 or summary_job.startswith(job_prefix): binary = summary.get()["Body"].read() string = bz2.decompress(binary).decode("utf-8") @@ -171,18 +177,25 @@ def _parse_pr_summaries(summaries: Any, job_prefix: str) -> Dict[str, List[Tuple # Collect and decompress S3 test stats summaries into JSON. # data stored on S3 buckets are pathed by {sha}/{job} so we also allow # optional jobs filter -def get_test_stats_summaries(*, sha: str, jobs: Optional[List[str]] = None) -> Dict[str, List[Report]]: +def get_test_stats_summaries( + *, sha: str, jobs: Optional[List[str]] = None +) -> Dict[str, List[Report]]: bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET) summaries = bucket.objects.filter(Prefix=f"test_time/{sha}") return _parse_master_summaries(summaries, jobs=list(jobs or [])) -def get_test_stats_summaries_for_job(*, sha: str, job_prefix: str) -> Dict[str, List[Report]]: +def get_test_stats_summaries_for_job( + *, sha: str, job_prefix: str +) -> Dict[str, List[Report]]: bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET) summaries = bucket.objects.filter(Prefix=f"test_time/{sha}/{job_prefix}") return _parse_master_summaries(summaries, jobs=list()) -def get_test_stats_summaries_for_pr(*, pr: str, job_prefix: str) -> Dict[str, List[Tuple[Report, str]]]: + +def get_test_stats_summaries_for_pr( + *, pr: str, job_prefix: str +) -> Dict[str, List[Tuple[Report, str]]]: bucket = get_S3_bucket_readonly(OSSCI_METRICS_BUCKET) summaries = bucket.objects.filter(Prefix=f"pr_test_time/{pr}/") return _parse_pr_summaries(summaries, job_prefix=job_prefix) @@ -191,35 +204,50 @@ def get_test_stats_summaries_for_pr(*, pr: str, job_prefix: str) -> Dict[str, Li # This function returns a list of S3 test time reports. 
This function can run into errors if HAVE_BOTO3 = False # or the S3 bucket is somehow unavailable. Even though this function goes through ten commits' reports to find a # non-empty report, it is still conceivable (though highly unlikely) for this function to return no reports. -def get_previous_reports_for_branch(branch: str, ci_job_prefix: str = "") -> List[Report]: +def get_previous_reports_for_branch( + branch: str, ci_job_prefix: str = "" +) -> List[Report]: commit_date_ts = subprocess.check_output( - ['git', 'show', '-s', '--format=%ct', 'HEAD'], - encoding="ascii").strip() + ["git", "show", "-s", "--format=%ct", "HEAD"], encoding="ascii" + ).strip() commit_date = datetime.fromtimestamp(int(commit_date_ts)) # We go a day before this current commit to avoiding pulling incomplete reports - day_before_commit = str(commit_date - timedelta(days=1)).split(' ')[0] + day_before_commit = str(commit_date - timedelta(days=1)).split(" ")[0] # something like git rev-list --before="2021-03-04" --max-count=10 --remotes="*origin/nightly" commits = subprocess.check_output( - ["git", "rev-list", f"--before={day_before_commit}", "--max-count=10", f"--remotes=*{branch}"], - encoding="ascii").splitlines() + [ + "git", + "rev-list", + f"--before={day_before_commit}", + "--max-count=10", + f"--remotes=*{branch}", + ], + encoding="ascii", + ).splitlines() reports: List[Report] = [] commit_index = 0 while len(reports) == 0 and commit_index < len(commits): commit = commits[commit_index] - logger.info(f'Grabbing reports from commit: {commit}') - summaries = get_test_stats_summaries_for_job(sha=commit, job_prefix=ci_job_prefix) + logger.info(f"Grabbing reports from commit: {commit}") + summaries = get_test_stats_summaries_for_job( + sha=commit, job_prefix=ci_job_prefix + ) for job_name, summary in summaries.items(): reports.append(summary[0]) if len(summary) > 1: - logger.warning(f'WARNING: Multiple summary objects found for {commit}/{job_name}') + logger.warning( + f"WARNING: Multiple summary objects found for {commit}/{job_name}" + ) commit_index += 1 return reports -def get_previous_reports_for_pr(pr: str, ci_job_prefix: str = "") -> List[Tuple[Report, str]]: +def get_previous_reports_for_pr( + pr: str, ci_job_prefix: str = "" +) -> List[Tuple[Report, str]]: reports: List[Tuple[Report, str]] = [] - logger.info(f'Grabbing reports from PR: {[pr]}') + logger.info(f"Grabbing reports from PR: {[pr]}") summaries = get_test_stats_summaries_for_pr(pr=pr, job_prefix=ci_job_prefix) for _, summary in summaries.items(): reports.extend(summary) diff --git a/tools/stats/test_history.py b/tools/stats/test_history.py index 24678aabba93..83751441bb7d 100755 --- a/tools/stats/test_history.py +++ b/tools/stats/test_history.py @@ -7,17 +7,12 @@ from signal import SIG_DFL, SIGPIPE, signal from typing import Dict, Iterator, List, Optional, Set, Tuple -from tools.stats.s3_stat_parser import (Report, get_cases, - get_test_stats_summaries) +from tools.stats.s3_stat_parser import Report, get_cases, get_test_stats_summaries -def get_git_commit_history( - *, - path: str, - ref: str -) -> List[Tuple[str, datetime]]: +def get_git_commit_history(*, path: str, ref: str) -> List[Tuple[str, datetime]]: rc = subprocess.check_output( - ['git', '-C', path, 'log', '--pretty=format:%H %ct', ref], + ["git", "-C", path, "log", "--pretty=format:%H %ct", ref], ).decode("latin-1") return [ (x[0], datetime.fromtimestamp(int(x[1]), tz=timezone.utc)) @@ -37,23 +32,20 @@ def make_column( num_length = digits + 1 + decimals if data: cases = get_cases( - 
data=data, - filename=filename, - suite_name=suite_name, - test_name=test_name + data=data, filename=filename, suite_name=suite_name, test_name=test_name ) if cases: case = cases[0] - status = case['status'] + status = case["status"] omitted = len(cases) - 1 if status: - return f'{status.rjust(num_length)} ', omitted + return f"{status.rjust(num_length)} ", omitted else: return f'{case["seconds"]:{num_length}.{decimals}f}s', omitted else: return f'{"absent".rjust(num_length)} ', 0 else: - return ' ' * (num_length + 1), 0 + return " " * (num_length + 1), 0 def make_columns( @@ -83,10 +75,10 @@ def make_columns( if job in omitted: total_omitted += omitted[job] if total_omitted > 0: - columns.append(f'({total_omitted} job re-runs omitted)') + columns.append(f"({total_omitted} job re-runs omitted)") if total_suites > 0: - columns.append(f'({total_suites} matching suites omitted)') - return ' '.join(columns) + columns.append(f"({total_suites} matching suites omitted)") + return " ".join(columns) def make_lines( @@ -108,17 +100,17 @@ def make_lines( ) if cases: case = cases[0] - status = case['status'] + status = case["status"] line = f'{job} {case["seconds"]}s{f" {status}" if status else ""}' if len(cases) > 1: - line += f' ({len(cases) - 1} matching suites omitted)' + line += f" ({len(cases) - 1} matching suites omitted)" lines.append(line) elif job in jobs: - lines.append(f'{job} (test not found)') + lines.append(f"{job} (test not found)") if lines: return lines else: - return ['(no reports in S3)'] + return ["(no reports in S3)"] def history_lines( @@ -142,26 +134,24 @@ def history_lines( summaries = get_test_stats_summaries(sha=sha) else: summaries = get_test_stats_summaries(sha=sha, jobs=jobs) - if mode == 'columns': + if mode == "columns": assert jobs is not None # we assume that get_test_stats_summaries here doesn't # return empty lists - omitted = { - job: len(l) - 1 - for job, l in summaries.items() - if len(l) > 1 - } - lines = [make_columns( - jobs=jobs, - jsons={job: l[0] for job, l in summaries.items()}, - omitted=omitted, - filename=filename, - suite_name=suite_name, - test_name=test_name, - digits=digits, - )] + omitted = {job: len(l) - 1 for job, l in summaries.items() if len(l) > 1} + lines = [ + make_columns( + jobs=jobs, + jsons={job: l[0] for job, l in summaries.items()}, + omitted=omitted, + filename=filename, + suite_name=suite_name, + test_name=test_name, + digits=digits, + ) + ] else: - assert mode == 'multiline' + assert mode == "multiline" lines = make_lines( jobs=set(jobs or []), jsons=summaries, @@ -181,7 +171,7 @@ class HelpFormatter( def description() -> str: - return r''' + return r""" Display the history of a test. Each line of (non-error) output starts with the timestamp and SHA1 hash @@ -193,55 +183,50 @@ def description() -> str: followed by the time of the specified test in that job at that commit. 
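As an aside on how the cells in these examples are rendered: make_column above packs each value into a fixed-width slot of digits + 1 + decimals characters. A rough sketch, assuming the default --digits of 4 and three decimal places (the decimals value is an assumption, chosen to match the sample output below):

    digits, decimals = 4, 3                   # decimals assumed to be 3 here
    num_length = digits + 1 + decimals        # 8 characters for the numeric part
    f"{0.573:{num_length}.{decimals}f}s"      # -> '   0.573s'
    f'{"errored".rjust(num_length)} '         # -> ' errored '
    f'{"absent".rjust(num_length)} '          # -> '  absent '
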
Example: - $ tools/stats/test_history.py --mode=multiline --ref=594a66 --sha-length=8 --test=test_set_dir \ - --job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test - 2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc5_4_test 0.36s - 2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc7_test 0.573s errored - 2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc5_4_test 0.819s - 2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc7_test 0.449s - 2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc5_4_test 0.361s - 2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc7_test 0.454s - 2021-02-10 10:09:10Z 2e35fe95 (no reports in S3) - 2021-02-10 10:09:07Z ff73be7e (no reports in S3) - 2021-02-10 10:05:39Z 74082f0d (no reports in S3) - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.414s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.476s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.377s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.326s + $ tools/stats/test_history.py --mode=multiline --ref=86a961af879 --sha-length=8 \ + --test=test_composite_compliance_dot_cpu_float32 \ + --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1 + 2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 13:14:56Z e73eaffd (no reports in S3) + 2022-02-18 06:29:12Z 710f12f5 linux-xenial-py3.7-gcc5.4-test-default1 0.001s Another multiline example, this time with the --all flag: - $ tools/stats/test_history.py --mode=multiline --all --ref=321b9 --delta=12 --sha-length=8 \ - --test=test_qr_square_many_batched_complex_cuda - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 424.284s - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 402.572s - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.164s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 436.732s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 407.616s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.044s + $ tools/stats/test_history.py --mode=multiline --all --ref=86a961af879 --delta=12 --sha-length=8 \ + --test=test_composite_compliance_dot_cuda_float32 + 2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-slow1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 linux-xenial-cuda11.3-py3.7-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-bionic-cuda11.5-py3.7-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test-default1 0.001s 
skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test-default1 0.001s skipped In columns mode, the name of the job isn't printed, but the order of the columns is guaranteed to match the order of the jobs passed on the command line. Example: - $ tools/stats/test_history.py --mode=columns --ref=3cf783 --sha-length=8 --test=test_set_dir \ - --job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test - 2021-02-10 12:18:50Z 3cf78395 0.644s 0.312s - 2021-02-10 11:13:34Z 594a66d7 0.360s errored - 2021-02-10 10:13:25Z 9c0caf03 0.819s 0.449s - 2021-02-10 10:09:14Z 602434bc 0.361s 0.454s - 2021-02-10 10:09:10Z 2e35fe95 - 2021-02-10 10:09:07Z ff73be7e - 2021-02-10 10:05:39Z 74082f0d - 2021-02-10 07:42:29Z 0620c96f 0.414s 0.377s (2 job re-runs omitted) - 2021-02-10 07:27:53Z 33afb5f1 0.381s 0.294s + $ tools/stats/test_history.py --mode=columns --ref=86a961af879 --sha-length=8 \ + --test=test_composite_compliance_dot_cpu_float32 \ + --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1 + 2022-02-18 15:47:37Z 86a961af 0.001s 0.001s + 2022-02-18 15:12:34Z f5e201e4 0.001s 0.001s + 2022-02-18 13:14:56Z 1c0df265 0.001s 0.001s + 2022-02-18 13:14:56Z e73eaffd + 2022-02-18 06:29:12Z 710f12f5 0.001s 0.001s + 2022-02-18 05:20:30Z 51b04f27 0.001s 0.001s + 2022-02-18 03:49:46Z 69389fb5 0.001s 0.001s + 2022-02-18 00:19:12Z 056b6260 0.001s 0.001s + 2022-02-17 23:58:32Z 39fb7714 0.001s 0.001s Minor note: in columns mode, a blank cell means that no report was found in S3, while the word "absent" means that a report was found but the indicated test was not found in that report. -''' +""" def parse_args(raw: List[str]) -> argparse.Namespace: @@ -251,61 +236,57 @@ def parse_args(raw: List[str]) -> argparse.Namespace: formatter_class=HelpFormatter, ) parser.add_argument( - '--mode', - choices=['columns', 'multiline'], - help='output format', - default='columns', + "--mode", + choices=["columns", "multiline"], + help="output format", + default="columns", ) parser.add_argument( - '--pytorch', - help='path to local PyTorch clone', - default='.', + "--pytorch", + help="path to local PyTorch clone", + default=".", ) parser.add_argument( - '--ref', - help='starting point (most recent Git ref) to display history for', - default='master', + "--ref", + help="starting point (most recent Git ref) to display history for", + default="master", ) parser.add_argument( - '--delta', + "--delta", type=int, - help='minimum number of hours between commits', + help="minimum number of hours between commits", default=0, ) parser.add_argument( - '--sha-length', + "--sha-length", type=int, - help='length of the prefix of the SHA1 hash to show', + help="length of the prefix of the SHA1 hash to show", default=40, ) parser.add_argument( - '--digits', + "--digits", type=int, - help='(columns) number of digits to display before the decimal point', + help="(columns) number of digits to display before the decimal point", default=4, ) parser.add_argument( - '--all', - action='store_true', - help='(multiline) ignore listed jobs, show all jobs for each commit', - ) - parser.add_argument( - '--file', - help='name of the file containing the test', + "--all", + action="store_true", + help="(multiline) ignore listed jobs, show all jobs for each commit", ) parser.add_argument( - '--suite', - help='name of the suite containing the test', + "--file", + help="name of the file containing the test", ) parser.add_argument( - '--test', - help='name of the test', - 
required=True + "--suite", + help="name of the suite containing the test", ) + parser.add_argument("--test", help="name of the test", required=True) parser.add_argument( - '--job', - help='names of jobs to display columns for, in order', - action='append', + "--job", + help="names of jobs to display columns for, in order", + action="append", default=[], ) args = parser.parse_args(raw) @@ -313,7 +294,7 @@ def parse_args(raw: List[str]) -> argparse.Namespace: args.jobs = None if args.all else args.job # We dont allow implicit or empty "--jobs", unless "--all" is specified. if args.jobs == []: - parser.error('No jobs specified.') + parser.error("No jobs specified.") return args diff --git a/tools/stats/upload_binary_size_to_scuba.py b/tools/stats/upload_binary_size_to_scuba.py index adf1d5076867..aacaf627ec95 100644 --- a/tools/stats/upload_binary_size_to_scuba.py +++ b/tools/stats/upload_binary_size_to_scuba.py @@ -55,7 +55,9 @@ def build_message(size: int) -> Dict[str, Any]: "build_num": os.environ.get("CIRCLE_BUILD_NUM"), "sha1": os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1")), "branch": os.environ.get("BRANCH", os.environ.get("CIRCLE_BRANCH")), - "workflow_id": os.environ.get("WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID")), + "workflow_id": os.environ.get( + "WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID") + ), }, "int": { "time": int(time.time()), @@ -118,13 +120,17 @@ def gen_messages() -> Generator[Dict[str, Any], None, None]: "pkg_type": "{}/{}/{}".format(android_build_type, arch, lib), "cu_ver": "", # dummy value for derived field `build_name` "py_ver": "", # dummy value for derived field `build_name` - "pr": os.environ.get("PR_NUMBER", os.environ.get("CIRCLE_PR_NUMBER")), + "pr": os.environ.get( + "PR_NUMBER", os.environ.get("CIRCLE_PR_NUMBER") + ), # This is the only place where we use directly CIRCLE_BUILD_NUM, everywhere else CIRCLE_* vars # are used as fallback, there seems to be no direct analogy between circle build number and GHA IDs "build_num": os.environ.get("CIRCLE_BUILD_NUM"), "sha1": os.environ.get("SHA1", os.environ.get("CIRCLE_SHA1")), "branch": os.environ.get("BRANCH", os.environ.get("CIRCLE_BRANCH")), - "workflow_id": os.environ.get("WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID")), + "workflow_id": os.environ.get( + "WORKFLOW_ID", os.environ.get("CIRCLE_WORKFLOW_ID") + ), }, "int": { "time": int(time.time()), diff --git a/tools/stats/upload_test_stats.py b/tools/stats/upload_test_stats.py new file mode 100644 index 000000000000..bdc9c9f319da --- /dev/null +++ b/tools/stats/upload_test_stats.py @@ -0,0 +1,210 @@ +import argparse +import os +import requests +import shutil +import zipfile +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Dict, List, Any + +import rockset # type: ignore[import] +import boto3 # type: ignore[import] + +PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch" +GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] +REQUEST_HEADERS = { + "Accept": "application/vnd.github.v3+json", + "Authorization": "token " + GITHUB_TOKEN, +} +S3_RESOURCE = boto3.resource("s3") +TEMP_DIR = Path(os.environ["RUNNER_TEMP"]) / "tmp-test-stats" + + +def parse_xml_report( + report: Path, workflow_id: int, workflow_run_attempt: int +) -> List[Dict[str, Any]]: + """Convert a test report xml file into a JSON-serializable list of test cases.""" + # [Job id in artifacts] + # Retrieve the job id from the report path. 
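For example, with the report layout described in this comment, the job-id extraction just below boils down to (using the same example path that the comment gives):

    from pathlib import Path
    report = Path("unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml")
    int(report.parts[0].rpartition("_")[2])   # -> 5596745227

And, as a rough sketch of what the process_xml_element helper defined further down in this file produces for a hypothetical JUnit-style element (the element itself is made up):

    import xml.etree.ElementTree as ET
    # made-up element for illustration only
    elem = ET.fromstring(
        '<testcase name="test_foo" classname="test_bar" time="0.001"><foo>hello</foo></testcase>'
    )
    # process_xml_element(elem) returns approximately:
    # {"name": "test_foo", "classname": "test_bar", "time": 0.001, "foo": {"text": "hello"}}
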
In our GHA workflows, we append + # the job id to the end of the report name, so `report` looks like: + # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml + # and we want to get `5596745227` out of it. + job_id = int(report.parts[0].rpartition("_")[2]) + + print(f"Parsing test report: {report}, job id: {job_id}") + root = ET.parse(report) + + test_cases = [] + for test_case in root.iter("testcase"): + case = process_xml_element(test_case) + case["workflow_id"] = workflow_id + case["workflow_run_attempt"] = workflow_run_attempt + case["job_id"] = job_id + test_cases.append(case) + + return test_cases + + +def process_xml_element(element: ET.Element) -> Dict[str, Any]: + """Convert a test suite element into a JSON-serializable dict.""" + ret: Dict[str, Any] = {} + + # Convert attributes directly into dict elements. + # e.g. + # + # becomes: + # {"name": "test_foo", "classname": "test_bar"} + ret.update(element.attrib) + + # By default, all attributes are strings. Apply a few special conversions + # here for well-known attributes so that they are the right type in Rockset. + line = ret.get("line") + if line: + ret["line"] = int(line) + time = ret.get("time") + if time: + ret["time"] = float(time) + + # Convert inner and outer text into special dict elements. + # e.g. + # my_inner_text my_tail + # becomes: + # {"text": "my_inner_text", "tail": " my_tail"} + if element.text and element.text.strip(): + ret["text"] = element.text + if element.tail and element.tail.strip(): + ret["tail"] = element.tail + + # Convert child elements recursively, placing them at a key: + # e.g. + # + # hello + # + # becomes + # {"foo": {"text": "hello"}} + for child in element: + ret[child.tag] = process_xml_element(child) + return ret + + +def get_artifact_urls(workflow_run_id: int) -> Dict[Path, str]: + """Get all workflow artifacts with 'test-report' in the name.""" + response = requests.get( + f"{PYTORCH_REPO}/actions/runs/{workflow_run_id}/artifacts?per_page=100", + ) + artifacts = response.json()["artifacts"] + while "next" in response.links.keys(): + response = requests.get(response.links["next"]["url"], headers=REQUEST_HEADERS) + artifacts.extend(response.json()["artifacts"]) + + artifact_urls = {} + for artifact in artifacts: + if "test-report" in artifact["name"]: + artifact_urls[Path(artifact["name"])] = artifact["archive_download_url"] + return artifact_urls + + +def unzip(p: Path) -> None: + """Unzip the provided zipfile to a similarly-named directory. + + Returns None if `p` is not a zipfile. + + Looks like: /tmp/test-reports.zip -> /tmp/unzipped-test-reports/ + """ + assert p.is_file() + unzipped_dir = p.with_name("unzipped-" + p.stem) + + with zipfile.ZipFile(p, "r") as zip: + zip.extractall(unzipped_dir) + + +def download_and_extract_artifact( + artifact_name: Path, artifact_url: str, workflow_run_attempt: int +) -> None: + # [Artifact run attempt] + # All artifacts on a workflow share a single namespace. However, we can + # re-run a workflow and produce a new set of artifacts. To avoid name + # collisions, we add `-runattempt1-` somewhere in the artifact name. + # + # This code parses out the run attempt number from the artifact name. If it + # doesn't match the one specified on the command line, skip it. + atoms = str(artifact_name).split("-") + for atom in atoms: + if atom.startswith("runattempt"): + found_run_attempt = int(atom[len("runattempt") :]) + if workflow_run_attempt != found_run_attempt: + print( + f"Skipping {artifact_name} as it is an invalid run attempt. 
" + f"Expected {workflow_run_attempt}, found {found_run_attempt}." + ) + + print(f"Downloading and extracting {artifact_name}") + + response = requests.get(artifact_url, headers=REQUEST_HEADERS) + with open(artifact_name, "wb") as f: + f.write(response.content) + unzip(artifact_name) + + +def download_and_extract_s3_reports( + workflow_run_id: int, workflow_run_attempt: int +) -> None: + bucket = S3_RESOURCE.Bucket("gha-artifacts") + objs = bucket.objects.filter( + Prefix=f"pytorch/pytorch/{workflow_run_id}/{workflow_run_attempt}/artifact/test-reports" + ) + + for obj in objs: + p = Path(Path(obj.key).name) + print(f"Downloading and extracting {p}") + with open(p, "wb") as f: + f.write(obj.get()["Body"].read()) + unzip(p) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Upload test stats to Rockset") + parser.add_argument( + "--workflow-run-id", + required=True, + help="id of the workflow to get artifacts from", + ) + parser.add_argument( + "--workflow-run-attempt", + type=int, + required=True, + help="which retry of the workflow this is", + ) + args = parser.parse_args() + + if TEMP_DIR.exists(): + print("rm: ", TEMP_DIR) + shutil.rmtree(TEMP_DIR) + + print("mkdir: ", TEMP_DIR) + TEMP_DIR.mkdir() + print("cd to ", TEMP_DIR) + os.chdir(TEMP_DIR) + + # Download and extract all the reports (both GHA and S3) + download_and_extract_s3_reports(args.workflow_run_id, args.workflow_run_attempt) + artifact_urls = get_artifact_urls(args.workflow_run_id) + for name, url in artifact_urls.items(): + download_and_extract_artifact(Path(name), url, args.workflow_run_attempt) + + # Parse the reports and transform them to JSON + test_cases = [] + for xml_report in Path(".").glob("**/*.xml"): + test_cases.extend( + parse_xml_report( + xml_report, int(args.workflow_run_id), int(args.workflow_run_attempt) + ) + ) + + # Write the JSON to rockset + print(f"Writing {len(test_cases)} test cases to Rockset") + client = rockset.Client( + api_server="api.rs2.usw2.rockset.com", api_key=os.environ["ROCKSET_API_KEY"] + ) + client.Collection.retrieve("test_run").add_docs(test_cases) + print("Done!") diff --git a/tools/test/test_actions_local_runner.py b/tools/test/test_actions_local_runner.py deleted file mode 100644 index ba4e6fd2cdb9..000000000000 --- a/tools/test/test_actions_local_runner.py +++ /dev/null @@ -1,191 +0,0 @@ -# -*- coding: utf-8 -*- - -import textwrap -import unittest -import sys -import contextlib -import io -import os -import subprocess -import multiprocessing -from typing import List, Dict, Any - -from tools import actions_local_runner - - -if sys.version_info >= (3, 8): - # actions_local_runner uses asyncio features not available in 3.6, and - # IsolatedAsyncioTestCase was added in 3.8, so skip testing on - # unsupported systems - class TestRunner(unittest.IsolatedAsyncioTestCase): - def run(self, *args: List[Any], **kwargs: List[Dict[str, Any]]) -> Any: - return super().run(*args, **kwargs) - - def test_step_extraction(self) -> None: - fake_job = { - "steps": [ - {"name": "test1", "run": "echo hi"}, - {"name": "test2", "run": "echo hi"}, - {"name": "test3", "run": "echo hi"}, - ] - } - - actual = actions_local_runner.grab_specific_steps(["test2"], fake_job) - expected = [ - {"name": "test2", "run": "echo hi"}, - ] - self.assertEqual(actual, expected) - - async def test_runner(self) -> None: - fake_step = {"name": "say hello", "run": "echo hi"} - f = io.StringIO() - with contextlib.redirect_stdout(f): - await actions_local_runner.YamlStep(fake_step, "test", True).run() 
- - result = f.getvalue() - self.assertIn("say hello", result) - - class TestEndToEnd(unittest.TestCase): - expected = [ - "cmakelint: Run cmakelint", - "quick-checks: Ensure no direct cub include", - "quick-checks: Ensure no unqualified type ignore", - "quick-checks: Ensure no unqualified noqa", - "quick-checks: Ensure canonical include", - "quick-checks: Ensure no non-breaking spaces", - "quick-checks: Ensure no tabs", - "flake8", - "quick-checks: Ensure correct trailing newlines", - "quick-checks: Ensure no trailing spaces", - "shellcheck: Regenerate workflows", - "shellcheck: Assert that regenerating the workflows didn't change them", - "shellcheck: Extract scripts from GitHub Actions workflows", - "shellcheck: Run ShellCheck", - ] - - def test_lint(self): - cmd = ["make", "lint", "-j", str(multiprocessing.cpu_count())] - proc = subprocess.run( - cmd, cwd=actions_local_runner.REPO_ROOT, stdout=subprocess.PIPE - ) - stdout = proc.stdout.decode() - - for line in self.expected: - self.assertIn(line, stdout) - - self.assertIn("mypy", stdout) - - def test_quicklint(self): - cmd = ["make", "quicklint", "-j", str(multiprocessing.cpu_count())] - proc = subprocess.run( - cmd, cwd=actions_local_runner.REPO_ROOT, stdout=subprocess.PIPE - ) - stdout = proc.stdout.decode() - - for line in self.expected: - self.assertIn(line, stdout) - - # TODO: See https://github.com/pytorch/pytorch/issues/57967 - self.assertIn("mypy (skipped typestub generation)", stdout) - - class TestQuicklint(unittest.IsolatedAsyncioTestCase): - test_files = [ - os.path.join("caffe2", "some_cool_file.py"), - os.path.join("torch", "some_cool_file.py"), - os.path.join("aten", "some_cool_file.py"), - os.path.join("torch", "some_stubs.pyi"), - os.path.join("test.sh"), - ] - test_py_files = [ - f for f in test_files if f.endswith(".py") or f.endswith(".pyi") - ] - test_sh_files = [f for f in test_files if f.endswith(".sh")] - maxDiff = None - - def setUp(self, *args, **kwargs): - for name in self.test_files: - bad_code = textwrap.dedent( - """ - some_variable = '2' - some_variable = None - some_variable = 11.2 - """ - ).rstrip("\n") - - with open(name, "w") as f: - f.write(bad_code) - - def tearDown(self, *args, **kwargs): - for name in self.test_files: - os.remove(name) - - def test_file_selection(self): - files = actions_local_runner.find_changed_files() - for name in self.test_files: - self.assertIn(name, files) - - async def test_flake8(self): - f = io.StringIO() - with contextlib.redirect_stdout(f): - await actions_local_runner.Flake8(self.test_py_files, True).run() - - # Should exclude the caffe2/ file - expected = textwrap.dedent( - """ - x flake8 - torch/some_cool_file.py:4:21: W292 no newline at end of file - aten/some_cool_file.py:4:21: W292 no newline at end of file - """ - ).lstrip("\n") - self.assertEqual(expected, f.getvalue()) - - async def test_shellcheck(self): - f = io.StringIO() - with contextlib.redirect_stdout(f): - await actions_local_runner.ShellCheck(self.test_sh_files, True).run() - - self.assertIn("SC2148: Tips depend on target shell", f.getvalue()) - self.assertIn("SC2283: Remove spaces around = to assign", f.getvalue()) - - async def test_mypy(self): - self.maxDiff = None - f = io.StringIO() - with contextlib.redirect_stdout(f): - # Quicklint assumes this has been run already and doesn't work - # without it - _, _, _ = await actions_local_runner.shell_cmd( - [ - f"{sys.executable}", - "tools/actions_local_runner.py", - "--job", - "mypy", - "--file", - ".github/workflows/lint.yml", - "--step", - "Run 
autogen", - ], - redirect=True, - ) - - await actions_local_runner.Mypy(self.test_py_files, True).run() - - # Should exclude the aten/ file; also, apparently mypy - # typechecks files in reverse order - expected = textwrap.dedent( - """ - x mypy (skipped typestub generation) - torch/some_stubs.pyi:3:17: error: Incompatible types in assignment (expression has type "None", variable has type "str") [assignment] - torch/some_stubs.pyi:4:17: error: Incompatible types in assignment (expression has type "float", variable has type "str") [assignment] - torch/some_cool_file.py:3:17: error: Incompatible types in assignment (expression has type "None", variable has type "str") [assignment] - torch/some_cool_file.py:4:17: error: Incompatible types in assignment (expression has type "float", variable has type "str") [assignment] - caffe2/some_cool_file.py:3:17: error: Incompatible types in assignment (expression has type "None", variable has type "str") [assignment] - caffe2/some_cool_file.py:4:17: error: Incompatible types in assignment (expression has type "float", variable has type "str") [assignment] - """ # noqa: B950 - ).lstrip( - "\n" - ) - self.assertEqual(expected, f.getvalue()) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/test/test_cmake.py b/tools/test/test_cmake.py index ecbce07f52d2..2c4bead6db3b 100644 --- a/tools/test/test_cmake.py +++ b/tools/test/test_cmake.py @@ -9,49 +9,60 @@ import tools.setup_helpers.cmake -T = typing.TypeVar('T') +T = typing.TypeVar("T") class TestCMake(unittest.TestCase): - - @unittest.mock.patch('multiprocessing.cpu_count') + @unittest.mock.patch("multiprocessing.cpu_count") def test_build_jobs(self, mock_cpu_count: unittest.mock.MagicMock) -> None: """Tests that the number of build jobs comes out correctly.""" mock_cpu_count.return_value = 13 cases = [ # MAX_JOBS, USE_NINJA, IS_WINDOWS, want - (( '8', True, False), ['-j', '8']), # noqa: E201,E241 - (( None, True, False), None), # noqa: E201,E241 - (( '7', False, False), ['-j', '7']), # noqa: E201,E241 - (( None, False, False), ['-j', '13']), # noqa: E201,E241 - (( '6', True, True), ['-j', '6']), # noqa: E201,E241 - (( None, True, True), None), # noqa: E201,E241 - (( '11', False, True), ['/p:CL_MPCount=11']), # noqa: E201,E241 - (( None, False, True), ['/p:CL_MPCount=13']), # noqa: E201,E241 + (("8", True, False), ["-j", "8"]), # noqa: E201,E241 + ((None, True, False), None), # noqa: E201,E241 + (("7", False, False), ["-j", "7"]), # noqa: E201,E241 + ((None, False, False), ["-j", "13"]), # noqa: E201,E241 + (("6", True, True), ["-j", "6"]), # noqa: E201,E241 + ((None, True, True), None), # noqa: E201,E241 + (("11", False, True), ["/p:CL_MPCount=11"]), # noqa: E201,E241 + ((None, False, True), ["/p:CL_MPCount=13"]), # noqa: E201,E241 ] for (max_jobs, use_ninja, is_windows), want in cases: - with self.subTest(MAX_JOBS=max_jobs, USE_NINJA=use_ninja, IS_WINDOWS=is_windows): + with self.subTest( + MAX_JOBS=max_jobs, USE_NINJA=use_ninja, IS_WINDOWS=is_windows + ): with contextlib.ExitStack() as stack: - stack.enter_context(env_var('MAX_JOBS', max_jobs)) - stack.enter_context(unittest.mock.patch.object(tools.setup_helpers.cmake, 'USE_NINJA', use_ninja)) - stack.enter_context(unittest.mock.patch.object(tools.setup_helpers.cmake, 'IS_WINDOWS', is_windows)) + stack.enter_context(env_var("MAX_JOBS", max_jobs)) + stack.enter_context( + unittest.mock.patch.object( + tools.setup_helpers.cmake, "USE_NINJA", use_ninja + ) + ) + stack.enter_context( + unittest.mock.patch.object( + 
tools.setup_helpers.cmake, "IS_WINDOWS", is_windows + ) + ) cmake = tools.setup_helpers.cmake.CMake() - with unittest.mock.patch.object(cmake, 'run') as cmake_run: + with unittest.mock.patch.object(cmake, "run") as cmake_run: cmake.build({}) cmake_run.assert_called_once() - call, = cmake_run.mock_calls + (call,) = cmake_run.mock_calls build_args, _ = call.args if want is None: - self.assertNotIn('-j', build_args) + self.assertNotIn("-j", build_args) else: self.assert_contains_sequence(build_args, want) @staticmethod - def assert_contains_sequence(sequence: Sequence[T], subsequence: Sequence[T]) -> None: + def assert_contains_sequence( + sequence: Sequence[T], subsequence: Sequence[T] + ) -> None: """Raises an assertion if the subsequence is not contained in the sequence.""" if len(subsequence) == 0: return # all sequences contain the empty subsequence @@ -63,7 +74,7 @@ def assert_contains_sequence(sequence: Sequence[T], subsequence: Sequence[T]) -> assert len(candidate) == len(subsequence) # sanity check if candidate == subsequence: return # found it - raise AssertionError(f'{subsequence} not found in {sequence}') + raise AssertionError(f"{subsequence} not found in {sequence}") @contextlib.contextmanager diff --git a/tools/test/test_codegen.py b/tools/test/test_codegen.py index 0dded01cc6ed..22b5470f6326 100644 --- a/tools/test/test_codegen.py +++ b/tools/test/test_codegen.py @@ -4,72 +4,77 @@ from tools.autograd import gen_autograd_functions from tools.autograd import load_derivatives -import tools.codegen.model +import torchgen.model -class TestCreateDerivative(unittest.TestCase): +class TestCreateDerivative(unittest.TestCase): def test_named_grads(self) -> None: - schema = tools.codegen.model.FunctionSchema.parse( - 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y)') - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + schema = torchgen.model.FunctionSchema.parse( + "func(Tensor a, Tensor b) -> (Tensor x, Tensor y)" + ) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) derivative = load_derivatives.create_derivative( native_function, - formula='func_backward(grad_x, grad_y)', + formula="func_backward(grad_x, grad_y)", var_names=(), - available_named_gradients=['grad_x', 'grad_y']) - self.assertSetEqual(derivative.named_gradients, {'grad_x', 'grad_y'}) + available_named_gradients=["grad_x", "grad_y"], + ) + self.assertSetEqual(derivative.named_gradients, {"grad_x", "grad_y"}) def test_non_differentiable_output(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) differentiability_info = load_derivatives.create_differentiability_info( - defn={'name': specification, - 'a': 'grads[0]', - 'b': 'grads[2]', - }, + defn={ + "name": specification, + "a": "grads[0]", + "b": "grads[2]", + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), ) - self.assertSequenceEqual(differentiability_info.available_named_gradients, - # grad_y is not present because y is a - # bool and thus not differentiable. 
- ['grad_x', 'grad_z']) + self.assertSequenceEqual( + differentiability_info.available_named_gradients, + # grad_y is not present because y is a + # bool and thus not differentiable. + ["grad_x", "grad_z"], + ) def test_indexed_grads(self) -> None: - schema = tools.codegen.model.FunctionSchema.parse( - 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y)') - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + schema = torchgen.model.FunctionSchema.parse( + "func(Tensor a, Tensor b) -> (Tensor x, Tensor y)" + ) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) derivative = load_derivatives.create_derivative( native_function, - formula='func_backward(grads[0], grads[1])', + formula="func_backward(grads[0], grads[1])", var_names=(), - available_named_gradients=['grad_x', 'grad_y']) + available_named_gradients=["grad_x", "grad_y"], + ) self.assertSetEqual(derivative.named_gradients, set()) def test_named_grads_and_indexed_grads(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, Tensor y)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) - with self.assertRaisesRegex(RuntimeError, - 'illegally mixes use of "grad_RETURN_NAME"'): + with self.assertRaisesRegex( + RuntimeError, 'illegally mixes use of "grad_RETURN_NAME"' + ): load_derivatives.create_differentiability_info( - defn={'name': specification, - # Uh-oh, the derivatives reference gradients by - # name and by index. - 'a': 'grad_x', - 'b': 'grads[1]', - }, + defn={ + "name": specification, + # Uh-oh, the derivatives reference gradients by + # name and by index. + "a": "grad_x", + "b": "grads[1]", + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), @@ -78,60 +83,61 @@ def test_named_grads_and_indexed_grads(self) -> None: class TestGenAutogradFunctions(unittest.TestCase): def test_non_differentiable_output_invalid_type(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, bool y, Tensor z)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) differentiability_info = load_derivatives.create_differentiability_info( - defn={'name': specification, - 'a': 'grad_x', - 'b': 'grad_z', - }, + defn={ + "name": specification, + "a": "grad_x", + "b": "grad_z", + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), ) definition = gen_autograd_functions.process_function( - differentiability_info, - gen_autograd_functions.FUNCTION_DEFINITION) + differentiability_info, gen_autograd_functions.FUNCTION_DEFINITION + ) # grad_z should map to grads[1], not grads[2] because output 1 # (y) is not differentiable. 
- assert 'grad_z = grads[2]' not in definition - assert 'grad_z = grads[1]' in definition - + assert "grad_z = grads[2]" not in definition + assert "grad_z = grads[1]" in definition def test_non_differentiable_output_output_differentiability(self) -> None: - specification = 'func(Tensor a, Tensor b) -> (Tensor x, Tensor y, Tensor z)' - schema = tools.codegen.model.FunctionSchema.parse(specification) - native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, - func=schema) + specification = "func(Tensor a, Tensor b) -> (Tensor x, Tensor y, Tensor z)" + schema = torchgen.model.FunctionSchema.parse(specification) + native_function = dataclasses.replace(DEFAULT_NATIVE_FUNCTION, func=schema) differentiability_info = load_derivatives.create_differentiability_info( - defn={'name': specification, - 'a': 'grad_x', - 'b': 'grad_z', - 'output_differentiability': [True, False, True], - }, + defn={ + "name": specification, + "a": "grad_x", + "b": "grad_z", + "output_differentiability": [True, False, True], + }, functions_by_signature={schema.signature(): [native_function]}, functions_by_schema={specification: native_function}, op_counter=typing.Counter[str](), ) definition = gen_autograd_functions.process_function( - differentiability_info, - gen_autograd_functions.FUNCTION_DEFINITION) + differentiability_info, gen_autograd_functions.FUNCTION_DEFINITION + ) # grad_z should map to grads[1], not grads[2] because output 1 # (y) is not differentiable. - assert 'grad_z = grads[2]' not in definition - assert 'grad_z = grads[1]' in definition + assert "grad_z = grads[2]" not in definition + assert "grad_z = grads[1]" in definition # Represents the most basic NativeFunction. Use dataclasses.replace() # to edit for use. -DEFAULT_NATIVE_FUNCTION, _ = tools.codegen.model.NativeFunction.from_yaml( - {'func': 'func() -> bool'}, - loc=tools.codegen.model.Location(__file__, 1)) +DEFAULT_NATIVE_FUNCTION, _ = torchgen.model.NativeFunction.from_yaml( + {"func": "func() -> bool"}, + loc=torchgen.model.Location(__file__, 1), + valid_tags=set(), +) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_codegen_model.py b/tools/test/test_codegen_model.py new file mode 100644 index 000000000000..710e90697116 --- /dev/null +++ b/tools/test/test_codegen_model.py @@ -0,0 +1,145 @@ +# Owner(s): ["module: codegen"] + +import expecttest +import unittest +import yaml +import textwrap + +from torchgen.model import NativeFunctionsGroup, DispatchKey +import torchgen.dest as dest +import torchgen.gen as gen +from torchgen.gen import LineLoader, parse_native_yaml_struct + + +class TestCodegenModel(expecttest.TestCase): + def assertParseErrorInline(self, yaml_str: str, expect: str) -> None: + es = yaml.load(yaml_str, Loader=LineLoader) + try: + parse_native_yaml_struct(es, set()) + except AssertionError as e: + # hack to strip out the context + msg, _ = str(e).split(" in ", 2) + self.assertExpectedInline("\n".join(textwrap.wrap(msg)), expect, skip=1) + return + self.fail(msg="Did not raise when expected to") + + def assertUfuncErrorInline(self, yaml_str: str, expect: str) -> None: + # parse a single structured group out of the yaml to g + es = yaml.load(yaml_str, Loader=LineLoader) + parsed_yaml = parse_native_yaml_struct(es, set()) + native_functions, backend_indices = ( + parsed_yaml.native_functions, + parsed_yaml.backend_indices, + ) + grouped_native_functions = gen.get_grouped_native_functions(native_functions) + assert len(grouped_native_functions) == 1 + g = 
grouped_native_functions[0] + assert isinstance(g, NativeFunctionsGroup) + assert g.out.ufunc_inner_loop + # this is not ufunc codegen per se, but it does some basic sanity tests for + # ufunc generation + gen.compute_meta_function_declaration(g) + dest.compute_native_function_declaration(g, backend_indices[DispatchKey.CPU]) + dest.compute_native_function_declaration(g, backend_indices[DispatchKey.CUDA]) + try: + # the real kahuna + dest.compute_ufunc_cpu(g) + dest.compute_ufunc_cpu_kernel(g) + dest.compute_ufunc_cuda(g) + except AssertionError as e: + # hack to strip out the context + msg, _ = str(e).split(" in ", 2) + self.assertExpectedInline("\n".join(textwrap.wrap(msg)), expect, skip=1) + return + self.fail(msg="Did not raise when expected to") + + # NB: indent is hardcoded to be two here, so format your yaml accordingly + binop_out = ( + "func: binop.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)" + ) + ti_binop_out = f"""{binop_out} + structured: True + structured_inherits: TensorIteratorBase""" + ti_binop = """func: binop(Tensor self, Tensor other) -> Tensor + structured_delegate: binop.out +""" + + ti_unop_out = """func: unop.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + structured: True + structured_inherits: TensorIteratorBase""" + ti_unop = """func: unop(Tensor self) -> Tensor + structured_delegate: unop.out +""" + + def test_nonstructured_ufunc(self) -> None: + yaml_str = f"""\ +- {self.binop_out} + ufunc_inner_loop: + Generic: binop (Bool) +""" + self.assertParseErrorInline( + yaml_str, + """\ +ufunc must be structured""", + ) + + def test_overlapping_ufunc_and_dispatch(self) -> None: + yaml_str = f"""\ +- {self.ti_binop_out} + ufunc_inner_loop: + Generic: binop (Bool) + dispatch: + CPU: binop_cpu +""" + self.assertParseErrorInline( + yaml_str, + """\ +ufunc should not have explicit dispatch entry for CPU""", + ) + + # See https://github.com/pytorch/pytorch/pull/65851#discussion_r810238456 + @unittest.expectedFailure + def test_scalaronly_shadowed(self) -> None: + yaml_str = f"""\ +- {self.ti_binop_out} + ufunc_inner_loop: + Generic: binop (Bool) + ScalarOnly: binop (Bool) +""" + self.assertParseErrorInline( + yaml_str, + """\ +""", + ) + + def test_conflicting_ufunc(self) -> None: + yaml_str = f"""\ +- {self.ti_binop_out} + ufunc_inner_loop: + Generic: binop (Bool) + ScalarOnly: binop_scalar (Bool) +- {self.ti_binop} +""" + self.assertUfuncErrorInline( + yaml_str, + """\ +ScalarOnly and Generic must have same ufunc name""", + ) + + def test_invalid_cudafunctoronself_for_binary_op(self) -> None: + yaml_str = f"""\ +- {self.ti_unop_out} + ufunc_inner_loop: + Generic: unop (All) + CUDAFunctorOnSelf: unop_self_cuda (All) +- {self.ti_unop} +""" + self.assertUfuncErrorInline( + yaml_str, + """\ +cannot use CUDAFunctorOnSelf on non-binary function""", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/test/test_extract_scripts.py b/tools/test/test_extract_scripts.py deleted file mode 100644 index 3126893c4bb3..000000000000 --- a/tools/test/test_extract_scripts.py +++ /dev/null @@ -1,85 +0,0 @@ -import unittest - -from tools import extract_scripts - -requirements_sh = ''' -#!/usr/bin/env bash -set -eo pipefail -pip install -r requirements.txt -'''.strip() - -hello_sh = ''' -#!/usr/bin/env sh -set -e -echo hello world -'''.strip() - - -class TestExtractScripts(unittest.TestCase): - def test_extract_none(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Checkout PyTorch', - 'uses': 
'zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9', - }), - None, - ) - - def test_extract_run_default_bash(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Install requirements', - 'run': 'pip install -r requirements.txt', - }), - { - 'extension': '.sh', - 'script': requirements_sh, - }, - ) - - def test_extract_run_sh(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Hello world', - 'run': 'echo hello world', - 'shell': 'sh', - }), - { - 'extension': '.sh', - 'script': hello_sh, - }, - ) - - def test_extract_run_py(self) -> None: - self.assertEqual( - extract_scripts.extract({ - 'name': 'Hello world', - 'run': 'print("Hello!")', - 'shell': 'python', - }), - { - 'extension': '.py', - 'script': 'print("Hello!")', - }, - ) - - def test_extract_github_script(self) -> None: - self.assertEqual( - # https://github.com/actions/github-script/tree/v3.1.1#reading-step-results - extract_scripts.extract({ - 'uses': 'actions/github-script@v3', - 'id': 'set-result', - 'with': { - 'script': 'return "Hello!"', - 'result-encoding': 'string', - }, - }), - { - 'extension': '.js', - 'script': 'return "Hello!"', - }, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/test/test_gen_backend_stubs.py b/tools/test/test_gen_backend_stubs.py index ee2ee8a0f0b9..168ae8b1d7c7 100644 --- a/tools/test/test_gen_backend_stubs.py +++ b/tools/test/test_gen_backend_stubs.py @@ -5,233 +5,269 @@ import unittest import expecttest -from tools.codegen.gen_backend_stubs import run -from tools.codegen.gen import _GLOBAL_PARSE_NATIVE_YAML_CACHE # noqa: F401 +from torchgen.gen_backend_stubs import run +from torchgen.gen import _GLOBAL_PARSE_NATIVE_YAML_CACHE # noqa: F401 path = os.path.dirname(os.path.realpath(__file__)) -gen_backend_stubs_path = os.path.join(path, '../tools/codegen/gen_backend_stubs.py') +gen_backend_stubs_path = os.path.join(path, "../torchgen/gen_backend_stubs.py") # gen_backend_stubs.py is an integration point that is called directly by external backends. # The tests here are to confirm that badly formed inputs result in reasonable error messages. class TestGenBackendStubs(expecttest.TestCase): - def setUp(self) -> None: global _GLOBAL_PARSE_NATIVE_YAML_CACHE _GLOBAL_PARSE_NATIVE_YAML_CACHE.clear() - def assert_success_from_gen_backend_stubs(self, yaml_str: str) -> None: - with tempfile.NamedTemporaryFile(mode='w') as fp: + with tempfile.NamedTemporaryFile(mode="w") as fp: fp.write(yaml_str) fp.flush() - run(fp.name, '', True) + run(fp.name, "", True) def get_errors_from_gen_backend_stubs(self, yaml_str: str) -> str: - with tempfile.NamedTemporaryFile(mode='w') as fp: + with tempfile.NamedTemporaryFile(mode="w") as fp: fp.write(yaml_str) fp.flush() try: - run(fp.name, '', True) + run(fp.name, "", True) except AssertionError as e: # Scrub out the temp file name from any error messages to simplify assertions. - return str(e).replace(fp.name, '') - self.fail('Expected gen_backend_stubs to raise an AssertionError, but it did not.') + return str(e).replace(fp.name, "") + self.fail( + "Expected gen_backend_stubs to raise an AssertionError, but it did not." 
+ ) def test_valid_single_op(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: -- abs''' +- abs""" self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_multiple_ops(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor -- abs''' +- abs""" self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_zero_ops(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla -supported:''' +supported:""" self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_zero_ops_doesnt_require_backend_dispatch_key(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: BAD_XLA cpp_namespace: torch_xla -supported:''' +supported:""" # External codegen on a yaml file with no operators is effectively a no-op, # so there's no reason to parse the backend self.assert_success_from_gen_backend_stubs(yaml_str) def test_valid_with_autograd_ops(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - abs autograd: -- add.Tensor''' +- add.Tensor""" # External codegen on a yaml file with no operators is effectively a no-op, # so there's no reason to parse the backend self.assert_success_from_gen_backend_stubs(yaml_str) def test_missing_backend(self) -> None: - yaml_str = '''\ + yaml_str = """\ cpp_namespace: torch_xla supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "backend"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "backend"''' + ) def test_empty_backend(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: cpp_namespace: torch_xla supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "backend"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "backend"''' + ) def test_backend_invalid_dispatch_key(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: NOT_XLA cpp_namespace: torch_xla supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''\ + self.assertExpectedInline( + output_error, + """\ unknown dispatch key NOT_XLA - The provided value for "backend" must be a valid DispatchKey, but got NOT_XLA.''') # noqa: B950 + The provided value for "backend" must be a valid DispatchKey, but got NOT_XLA.""", + ) # noqa: B950 def test_missing_cpp_namespace(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "cpp_namespace"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "cpp_namespace"''' + ) def test_whitespace_cpp_namespace(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace:\t supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide a value for "cpp_namespace"''') + self.assertExpectedInline( + output_error, '''You must provide a value for "cpp_namespace"''' + ) # supported is a single item (it should be a list) def test_nonlist_supported(self) -> None: - yaml_str = '''\ + yaml_str = 
"""\ backend: XLA cpp_namespace: torch_xla -supported: abs''' +supported: abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''expected "supported" to be a list, but got: abs (of type )''') + self.assertExpectedInline( + output_error, + """expected "supported" to be a list, but got: abs (of type )""", + ) # supported contains an op that isn't in native_functions.yaml def test_supported_invalid_op(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: -- abs_BAD''' +- abs_BAD""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Found an invalid operator name: abs_BAD''') + self.assertExpectedInline( + output_error, """Found an invalid operator name: abs_BAD""" + ) # The backend is valid, but doesn't have a valid autograd key. They can't override autograd kernels in that case. # Only using Vulkan here because it has a valid backend key but not an autograd key- if this changes we can update the test. def test_backend_has_no_autograd_key_but_provides_entries(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: Vulkan cpp_namespace: torch_vulkan supported: - add autograd: -- sub''' +- sub""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Found an invalid operator name: add''') # noqa: B950 + self.assertExpectedInline( + output_error, """Found an invalid operator name: add""" + ) # noqa: B950 # in an operator group, currently all operators must either be registered to the backend or autograd kernel. # Here, functional and out mismatch def test_backend_autograd_kernel_mismatch_out_functional(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor autograd: -- add.out''' +- add.out""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add_out is listed under "autograd".''') # noqa: B950 + self.assertExpectedInline( + output_error, + """Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add_out is listed under "autograd".""", # noqa: B950 + ) # in an operator group, currently all operators must either be registered to the backend or autograd kernel. # Here, functional and inplace mismatch def test_backend_autograd_kernel_mismatch_functional_inplace(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor autograd: -- add_.Tensor''' +- add_.Tensor""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! 
add is listed under "supported", but add_ is listed under "autograd".''') # noqa: B950 + self.assertExpectedInline( + output_error, + """Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add_ is listed under "autograd".""", # noqa: B950 + ) # Currently, the same operator can't be listed under both 'supported' and 'autograd', which would # involve registering the same kernel to both the XLA and AutogradXLA keys. # If we need that functionality in the future, we'll need to augment the codegen. def test_op_appears_in_supported_and_autograd_lists(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - add.Tensor autograd: -- add.Tensor''' +- add.Tensor""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add is listed under "autograd".''') # noqa: B950 + self.assertExpectedInline( + output_error, + """Currently, all variants of an op must either be registered to a backend key, or to a backend's autograd key. They cannot be mix and matched. If this is something you need, feel free to create an issue! add is listed under "supported", but add is listed under "autograd".""", # noqa: B950 + ) # unrecognized extra yaml key def test_unrecognized_key(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla supported: - abs -invalid_key: invalid_val''' +invalid_key: invalid_val""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, ''' contains unexpected keys: invalid_key. Only the following keys are supported: backend, cpp_namespace, extra_headers, supported, autograd, full_codegen''') # noqa: B950 + self.assertExpectedInline( + output_error, + """ contains unexpected keys: invalid_key. Only the following keys are supported: backend, class_name, cpp_namespace, extra_headers, supported, autograd, full_codegen""", # noqa: B950 + ) # if use_out_as_primary is provided, it must be a bool def test_use_out_as_primary_non_bool(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla use_out_as_primary: frue supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide either True or False for use_out_as_primary. Provided: frue''') # noqa: B950 + self.assertExpectedInline( + output_error, + """You must provide either True or False for use_out_as_primary. Provided: frue""", + ) # noqa: B950 # if device_guard is provided, it must be a bool def test_device_guard_non_bool(self) -> None: - yaml_str = '''\ + yaml_str = """\ backend: XLA cpp_namespace: torch_xla device_guard: frue supported: -- abs''' +- abs""" output_error = self.get_errors_from_gen_backend_stubs(yaml_str) - self.assertExpectedInline(output_error, '''You must provide either True or False for device_guard. Provided: frue''') # noqa: B950 + self.assertExpectedInline( + output_error, + """You must provide either True or False for device_guard. 
Provided: frue""", + ) # noqa: B950 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_import_test_stats.py b/tools/test/test_import_test_stats.py new file mode 100644 index 000000000000..ea9aad8df40d --- /dev/null +++ b/tools/test/test_import_test_stats.py @@ -0,0 +1,67 @@ +import os +import unittest +from tools.stats.import_test_stats import get_disabled_issues +from typing import List +from unittest.mock import patch + + +class TestGetDisabledIssues(unittest.TestCase): + def run_assert_disabled_issues( + self, pr_body: str, commit_messages: str, expected: List[str] + ) -> None: + with patch.dict( + os.environ, {"PR_BODY": pr_body, "COMMIT_MESSAGES": commit_messages} + ): + disabled_issues = get_disabled_issues() + self.assertEqual(disabled_issues, expected) + + # test variations of close in PR_BODY + def test_closes_pr_body(self) -> None: + pr_body = "closes #123 Close #143 ClOsE #345 closed #10283" + self.run_assert_disabled_issues(pr_body, "", ["123", "143", "345", "10283"]) + + # test variations of fix in COMMIT_MESSAGES + def test_fixes_commit_messages(self) -> None: + commit_messages = "fix #123 FixEd #143 fixes #345 FiXeD #10283" + self.run_assert_disabled_issues( + "", commit_messages, ["123", "143", "345", "10283"] + ) + + # test variations of resolve in PR_BODY and COMMIT_MESSAGES + def test_resolves_pr_commits(self) -> None: + pr_body = "resolve #123 resolveS #143" + commit_messages = "REsolved #345 RESOLVES #10283" + self.run_assert_disabled_issues( + pr_body, commit_messages, ["123", "143", "345", "10283"] + ) + + # test links + def test_issue_links(self) -> None: + pr_body = "closes https://github.com/pytorch/pytorch/issues/75198 fixes https://github.com/pytorch/pytorch/issues/75123" + self.run_assert_disabled_issues(pr_body, "", ["75198", "75123"]) + + # test strange spacing + def test_spacing(self) -> None: + pr_body = "resolve #123,resolveS #143Resolved #345\nRESOLVES #10283" + commit_messages = "Fixed #2348fixes https://github.com/pytorch/pytorch/issues/75123resolveS #2134" + self.run_assert_disabled_issues( + pr_body, + commit_messages, + ["123", "143", "345", "10283", "2348", "75123", "2134"], + ) + + # test bad things + def test_not_accepted(self) -> None: + pr_body = ( + "fixes189 fixeshttps://github.com/pytorch/pytorch/issues/75123 " + "closedhttps://githubcom/pytorch/pytorch/issues/75123" + ) + commit_messages = ( + "fix 234, fixes # 45, fixing #123, close 234, closes#45, closing #123 resolve 234, " + "resolves #45, resolving #123" + ) + self.run_assert_disabled_issues(pr_body, commit_messages, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/test/test_max_tokens_pragma.py b/tools/test/test_max_tokens_pragma.py deleted file mode 100644 index 746b51e39d03..000000000000 --- a/tools/test/test_max_tokens_pragma.py +++ /dev/null @@ -1,132 +0,0 @@ -import unittest -from tools.linter.clang_tidy.max_tokens_pragma import ( - add_max_tokens_pragma, - strip_max_tokens_pragmas, -) - - -def compare_code(a: str, b: str) -> bool: - a_lines = [line.strip() for line in a.splitlines()] - b_lines = [line.strip() for line in b.splitlines()] - return a_lines == b_lines - - -class TestMaxTokensPragma(unittest.TestCase): - def test_no_prior_pragmas(self) -> None: - input = """\ - // File without any prior pragmas - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - - expected = """\ - #pragma clang max_tokens_total 42 - // File without any prior pragmas - - int main() { - for (int i = 0; i < 
10; i++); - return 0; - } - """ - output = add_max_tokens_pragma(input, 42) - self.assertTrue(compare_code(output, expected)) - - output = strip_max_tokens_pragmas(output) - self.assertTrue(compare_code(output, input)) - - def test_single_prior_pragma(self) -> None: - input = """\ - // File with prior pragmas - - #pragma clang max_tokens_total 1 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - - expected = """\ - // File with prior pragmas - - #pragma clang max_tokens_total 42 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - stripped = """\ - // File with prior pragmas - - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - """ - - output = add_max_tokens_pragma(input, 42) - self.assertTrue(compare_code(output, expected)) - - output = strip_max_tokens_pragmas(output) - self.assertTrue(compare_code(output, stripped)) - - def test_multiple_prior_pragmas(self) -> None: - input = """\ - // File with multiple prior pragmas - - #pragma clang max_tokens_total 1 - - // Different pragma; script should ignore this - #pragma clang max_tokens_here 20 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - - #pragma clang max_tokens_total 1 - """ - - expected = """\ - // File with multiple prior pragmas - - #pragma clang max_tokens_total 42 - - // Different pragma; script should ignore this - #pragma clang max_tokens_here 20 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - - #pragma clang max_tokens_total 42 - """ - stripped = """\ - // File with multiple prior pragmas - - - // Different pragma; script should ignore this - #pragma clang max_tokens_here 20 - - int main() { - for (int i = 0; i < 10; i++); - return 0; - } - - """ - - output = add_max_tokens_pragma(input, 42) - self.assertTrue(compare_code(output, expected)) - - output = strip_max_tokens_pragmas(output) - self.assertTrue(compare_code(output, stripped)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/test/test_mypy_wrapper.py b/tools/test/test_mypy_wrapper.py deleted file mode 100644 index df7b0ab9e27d..000000000000 --- a/tools/test/test_mypy_wrapper.py +++ /dev/null @@ -1,158 +0,0 @@ -import unittest - -from tools.linter import mypy_wrapper - - -class TestMypyWrapper(unittest.TestCase): - configs = { - 'foo.ini': { - 'file1.abc', - 'dir2', - 'dir3/file4.xyz', - }, - 'bar/baz.ini': { - 'file1.abc', - 'dir2/dir5/file6.def', - 'dir3/file7.abc', - }, - } - - trie: mypy_wrapper.Trie = { - 'file1.abc': {None: {'foo.ini', 'bar/baz.ini'}}, - 'dir2': { - None: {'foo.ini'}, - 'dir5': {'file6.def': {None: {'bar/baz.ini'}}}, - }, - 'dir3': { - 'file4.xyz': {None: {'foo.ini'}}, - 'file7.abc': {None: {'bar/baz.ini'}}, - }, - } - - def test_config_files(self) -> None: - self.assertEqual(mypy_wrapper.config_files().keys(), { - 'mypy.ini', - 'mypy-strict.ini', - }) - - def test_split_path(self) -> None: - self.assertEqual(mypy_wrapper.split_path('file1.abc'), ['file1.abc']) - self.assertEqual( - mypy_wrapper.split_path('dir3/file4.xyz'), - ['dir3', 'file4.xyz'], - ) - self.assertEqual( - mypy_wrapper.split_path('dir2/dir5/file6.def'), - ['dir2', 'dir5', 'file6.def'], - ) - - def test_make_trie(self) -> None: - self.assertEqual(mypy_wrapper.make_trie(self.configs), self.trie) - - def test_lookup(self) -> None: - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'file1.abc'), - {'foo.ini', 'bar/baz.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir5/file6.def'), - {'foo.ini', 'bar/baz.ini'}, - ) - self.assertEqual( - 
mypy_wrapper.lookup(self.trie, 'dir3/file4.xyz'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir3/file7.abc'), - {'bar/baz.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'file8.xyz'), - set(), - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir9/file10.abc'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir3/file11.abc'), - set(), - ) - - # non-leaves shouldn't ever be passed to lookup in practice, but - # still, good to consider/test these cases - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir5'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir3'), - set(), - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir2/dir9'), - {'foo.ini'}, - ) - self.assertEqual( - mypy_wrapper.lookup(self.trie, 'dir4'), - set(), - ) - - def test_make_plan(self) -> None: - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'file8.xyz', - 'dir3/file11.abc', - ]), - {} - ) - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'file8.xyz', - 'dir2/dir9/file10.abc', - 'dir3/file4.xyz', - 'dir3/file11.abc', - ]), - { - 'foo.ini': ['dir2/dir9/file10.abc', 'dir3/file4.xyz'], - } - ) - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'file8.xyz', - 'dir3/file11.abc', - 'dir3/file7.abc', - ]), - { - 'bar/baz.ini': ['dir3/file7.abc'], - } - ) - self.assertEqual( - mypy_wrapper.make_plan(configs=self.configs, files=[ - 'dir2/dir9/file10.abc', - 'dir2/dir5/file6.def', - 'dir3/file7.abc', - 'file1.abc', - 'dir3/file11.abc', - ]), - { - 'foo.ini': [ - 'dir2/dir9/file10.abc', - 'dir2/dir5/file6.def', - 'file1.abc', - ], - 'bar/baz.ini': [ - 'dir2/dir5/file6.def', - 'dir3/file7.abc', - 'file1.abc', - ], - } - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/test/test_stats.py b/tools/test/test_stats.py index 46ad28748608..2718308f66da 100644 --- a/tools/test/test_stats.py +++ b/tools/test/test_stats.py @@ -3,10 +3,16 @@ from typing import Dict, List from tools.stats import print_test_stats -from tools.stats.s3_stat_parser import (Commit, Report, ReportMetaMeta, - Status, Version1Case, - Version1Report, Version2Case, - Version2Report) +from tools.stats.s3_stat_parser import ( + Commit, + Report, + ReportMetaMeta, + Status, + Version1Case, + Version1Report, + Version2Case, + Version2Report, +) def fakehash(char: str) -> str: @@ -15,14 +21,14 @@ def fakehash(char: str) -> str: def dummy_meta_meta() -> ReportMetaMeta: return { - 'build_pr': '', - 'build_tag': '', - 'build_sha1': '', - 'build_base_commit': '', - 'build_branch': '', - 'build_job': '', - 'build_workflow_id': '', - 'build_start_time_epoch': '', + "build_pr": "", + "build_tag": "", + "build_sha1": "", + "build_base_commit": "", + "build_branch": "", + "build_job": "", + "build_workflow_id": "", + "build_start_time_epoch": "", } @@ -35,202 +41,210 @@ def makecase( skipped: bool = False, ) -> Version1Case: return { - 'name': name, - 'seconds': seconds, - 'errored': errored, - 'failed': failed, - 'skipped': skipped, + "name": name, + "seconds": seconds, + "errored": errored, + "failed": failed, + "skipped": skipped, } def make_report_v1(tests: Dict[str, List[Version1Case]]) -> Version1Report: suites = { suite_name: { - 'total_seconds': sum(case['seconds'] for case in cases), - 'cases': cases, + "total_seconds": sum(case["seconds"] for case in cases), + "cases": 
cases, } for suite_name, cases in tests.items() } return { **dummy_meta_meta(), # type: ignore[misc] - 'total_seconds': sum(s['total_seconds'] for s in suites.values()), - 'suites': suites, + "total_seconds": sum(s["total_seconds"] for s in suites.values()), + "suites": suites, } def make_case_v2(seconds: float, status: Status = None) -> Version2Case: return { - 'seconds': seconds, - 'status': status, + "seconds": seconds, + "status": status, } -def make_report_v2(tests: Dict[str, Dict[str, Dict[str, Version2Case]]]) -> Version2Report: +def make_report_v2( + tests: Dict[str, Dict[str, Dict[str, Version2Case]]] +) -> Version2Report: files = {} for file_name, file_suites in tests.items(): suites = { suite_name: { - 'total_seconds': sum(case['seconds'] for case in cases.values()), - 'cases': cases, + "total_seconds": sum(case["seconds"] for case in cases.values()), + "cases": cases, } for suite_name, cases in file_suites.items() } files[file_name] = { - 'suites': suites, - 'total_seconds': sum(suite['total_seconds'] for suite in suites.values()), + "suites": suites, + "total_seconds": sum(suite["total_seconds"] for suite in suites.values()), # type: ignore[type-var] } return { **dummy_meta_meta(), # type: ignore[misc] - 'format_version': 2, - 'total_seconds': sum(s['total_seconds'] for s in files.values()), - 'files': files, + "format_version": 2, + "total_seconds": sum(s["total_seconds"] for s in files.values()), + "files": files, } + + maxDiff = None + class TestPrintTestStats(unittest.TestCase): - version1_report: Version1Report = make_report_v1({ - # input ordering of the suites is ignored - 'Grault': [ - # not printed: status same and time similar - makecase('test_grault0', 4.78, failed=True), - # status same, but time increased a lot - makecase('test_grault2', 1.473, errored=True), - ], - # individual tests times changed, not overall suite - 'Qux': [ - # input ordering of the test cases is ignored - makecase('test_qux1', 0.001, skipped=True), - makecase('test_qux6', 0.002, skipped=True), - # time in bounds, but status changed - makecase('test_qux4', 7.158, failed=True), - # not printed because it's the same as before - makecase('test_qux7', 0.003, skipped=True), - makecase('test_qux5', 11.968), - makecase('test_qux3', 23.496), - ], - # new test suite - 'Bar': [ - makecase('test_bar2', 3.742, failed=True), - makecase('test_bar1', 50.447), - ], - # overall suite time changed but no individual tests - 'Norf': [ - makecase('test_norf1', 3), - makecase('test_norf2', 3), - makecase('test_norf3', 3), - makecase('test_norf4', 3), - ], - # suite doesn't show up if it doesn't change enough - 'Foo': [ - makecase('test_foo1', 42), - makecase('test_foo2', 56), - ], - }) + version1_report: Version1Report = make_report_v1( + { + # input ordering of the suites is ignored + "Grault": [ + # not printed: status same and time similar + makecase("test_grault0", 4.78, failed=True), + # status same, but time increased a lot + makecase("test_grault2", 1.473, errored=True), + ], + # individual tests times changed, not overall suite + "Qux": [ + # input ordering of the test cases is ignored + makecase("test_qux1", 0.001, skipped=True), + makecase("test_qux6", 0.002, skipped=True), + # time in bounds, but status changed + makecase("test_qux4", 7.158, failed=True), + # not printed because it's the same as before + makecase("test_qux7", 0.003, skipped=True), + makecase("test_qux5", 11.968), + makecase("test_qux3", 23.496), + ], + # new test suite + "Bar": [ + makecase("test_bar2", 3.742, failed=True), + 
makecase("test_bar1", 50.447), + ], + # overall suite time changed but no individual tests + "Norf": [ + makecase("test_norf1", 3), + makecase("test_norf2", 3), + makecase("test_norf3", 3), + makecase("test_norf4", 3), + ], + # suite doesn't show up if it doesn't change enough + "Foo": [ + makecase("test_foo1", 42), + makecase("test_foo2", 56), + ], + } + ) version2_report: Version2Report = make_report_v2( { - 'test_a': { - 'Grault': { - 'test_grault0': make_case_v2(4.78, 'failed'), - 'test_grault2': make_case_v2(1.473, 'errored'), + "test_a": { + "Grault": { + "test_grault0": make_case_v2(4.78, "failed"), + "test_grault2": make_case_v2(1.473, "errored"), + }, + "Qux": { + "test_qux1": make_case_v2(0.001, "skipped"), + "test_qux6": make_case_v2(0.002, "skipped"), + "test_qux4": make_case_v2(7.158, "failed"), + "test_qux7": make_case_v2(0.003, "skipped"), + "test_qux8": make_case_v2(11.968), + "test_qux3": make_case_v2(23.496), }, - 'Qux': { - 'test_qux1': make_case_v2(0.001, 'skipped'), - 'test_qux6': make_case_v2(0.002, 'skipped'), - 'test_qux4': make_case_v2(7.158, 'failed'), - 'test_qux7': make_case_v2(0.003, 'skipped'), - 'test_qux8': make_case_v2(11.968), - 'test_qux3': make_case_v2(23.496), - } }, - 'test_b': { - 'Bar': { - 'test_bar2': make_case_v2(3.742, 'failed'), - 'test_bar1': make_case_v2(50.447), + "test_b": { + "Bar": { + "test_bar2": make_case_v2(3.742, "failed"), + "test_bar1": make_case_v2(50.447), }, # overall suite time changed but no individual tests - 'Norf': { - 'test_norf1': make_case_v2(3), - 'test_norf2': make_case_v2(3), - 'test_norf3': make_case_v2(3), - 'test_norf4': make_case_v2(3), + "Norf": { + "test_norf1": make_case_v2(3), + "test_norf2": make_case_v2(3), + "test_norf3": make_case_v2(3), + "test_norf4": make_case_v2(3), }, }, - 'test_c': { - 'Foo': { - 'test_foo1': make_case_v2(42), - 'test_foo2': make_case_v2(56), + "test_c": { + "Foo": { + "test_foo1": make_case_v2(42), + "test_foo2": make_case_v2(56), }, - } - }) + }, + } + ) def test_simplify(self) -> None: self.assertEqual( { - '': { - 'Bar': { - 'test_bar1': {'seconds': 50.447, 'status': None}, - 'test_bar2': {'seconds': 3.742, 'status': 'failed'}, + "": { + "Bar": { + "test_bar1": {"seconds": 50.447, "status": None}, + "test_bar2": {"seconds": 3.742, "status": "failed"}, }, - 'Foo': { - 'test_foo1': {'seconds': 42, 'status': None}, - 'test_foo2': {'seconds': 56, 'status': None}, + "Foo": { + "test_foo1": {"seconds": 42, "status": None}, + "test_foo2": {"seconds": 56, "status": None}, }, - 'Grault': { - 'test_grault0': {'seconds': 4.78, 'status': 'failed'}, - 'test_grault2': {'seconds': 1.473, 'status': 'errored'}, + "Grault": { + "test_grault0": {"seconds": 4.78, "status": "failed"}, + "test_grault2": {"seconds": 1.473, "status": "errored"}, }, - 'Norf': { - 'test_norf1': {'seconds': 3, 'status': None}, - 'test_norf3': {'seconds': 3, 'status': None}, - 'test_norf2': {'seconds': 3, 'status': None}, - 'test_norf4': {'seconds': 3, 'status': None}, + "Norf": { + "test_norf1": {"seconds": 3, "status": None}, + "test_norf3": {"seconds": 3, "status": None}, + "test_norf2": {"seconds": 3, "status": None}, + "test_norf4": {"seconds": 3, "status": None}, }, - 'Qux': { - 'test_qux1': {'seconds': 0.001, 'status': 'skipped'}, - 'test_qux3': {'seconds': 23.496, 'status': None}, - 'test_qux4': {'seconds': 7.158, 'status': 'failed'}, - 'test_qux5': {'seconds': 11.968, 'status': None}, - 'test_qux6': {'seconds': 0.002, 'status': 'skipped'}, - 'test_qux7': {'seconds': 0.003, 'status': 'skipped'}, + "Qux": { + 
"test_qux1": {"seconds": 0.001, "status": "skipped"}, + "test_qux3": {"seconds": 23.496, "status": None}, + "test_qux4": {"seconds": 7.158, "status": "failed"}, + "test_qux5": {"seconds": 11.968, "status": None}, + "test_qux6": {"seconds": 0.002, "status": "skipped"}, + "test_qux7": {"seconds": 0.003, "status": "skipped"}, }, }, }, - print_test_stats.simplify(self.version1_report) + print_test_stats.simplify(self.version1_report), ) self.assertEqual( { - 'test_a': { - 'Grault': { - 'test_grault0': {'seconds': 4.78, 'status': 'failed'}, - 'test_grault2': {'seconds': 1.473, 'status': 'errored'}, + "test_a": { + "Grault": { + "test_grault0": {"seconds": 4.78, "status": "failed"}, + "test_grault2": {"seconds": 1.473, "status": "errored"}, }, - 'Qux': { - 'test_qux1': {'seconds': 0.001, 'status': 'skipped'}, - 'test_qux3': {'seconds': 23.496, 'status': None}, - 'test_qux4': {'seconds': 7.158, 'status': 'failed'}, - 'test_qux6': {'seconds': 0.002, 'status': 'skipped'}, - 'test_qux7': {'seconds': 0.003, 'status': 'skipped'}, - 'test_qux8': {'seconds': 11.968, 'status': None}, + "Qux": { + "test_qux1": {"seconds": 0.001, "status": "skipped"}, + "test_qux3": {"seconds": 23.496, "status": None}, + "test_qux4": {"seconds": 7.158, "status": "failed"}, + "test_qux6": {"seconds": 0.002, "status": "skipped"}, + "test_qux7": {"seconds": 0.003, "status": "skipped"}, + "test_qux8": {"seconds": 11.968, "status": None}, }, }, - 'test_b': { - 'Bar': { - 'test_bar1': {'seconds': 50.447, 'status': None}, - 'test_bar2': {'seconds': 3.742, 'status': 'failed'}, + "test_b": { + "Bar": { + "test_bar1": {"seconds": 50.447, "status": None}, + "test_bar2": {"seconds": 3.742, "status": "failed"}, }, - 'Norf': { - 'test_norf1': {'seconds': 3, 'status': None}, - 'test_norf2': {'seconds': 3, 'status': None}, - 'test_norf3': {'seconds': 3, 'status': None}, - 'test_norf4': {'seconds': 3, 'status': None}, + "Norf": { + "test_norf1": {"seconds": 3, "status": None}, + "test_norf2": {"seconds": 3, "status": None}, + "test_norf3": {"seconds": 3, "status": None}, + "test_norf4": {"seconds": 3, "status": None}, }, }, - 'test_c': { - 'Foo': { - 'test_foo1': {'seconds': 42, 'status': None}, - 'test_foo2': {'seconds': 56, 'status': None}, + "test_c": { + "Foo": { + "test_foo1": {"seconds": 42, "status": None}, + "test_foo2": {"seconds": 56, "status": None}, }, }, }, @@ -242,95 +256,101 @@ def test_analysis(self) -> None: base_reports: Dict[Commit, List[Report]] = { # bbbb has no reports, so base is cccc instead - fakehash('b'): [], - fakehash('c'): [ - make_report_v1({ - 'Baz': [ - makecase('test_baz2', 13.605), - # no recent suites have & skip this test - makecase('test_baz1', 0.004, skipped=True), - ], - 'Foo': [ - makecase('test_foo1', 43), - # test added since dddd - makecase('test_foo2', 57), - ], - 'Grault': [ - makecase('test_grault0', 4.88, failed=True), - makecase('test_grault1', 11.967, failed=True), - makecase('test_grault2', 0.395, errored=True), - makecase('test_grault3', 30.460), - ], - 'Norf': [ - makecase('test_norf1', 2), - makecase('test_norf2', 2), - makecase('test_norf3', 2), - makecase('test_norf4', 2), - ], - 'Qux': [ - makecase('test_qux3', 4.978, errored=True), - makecase('test_qux7', 0.002, skipped=True), - makecase('test_qux2', 5.618), - makecase('test_qux4', 7.766, errored=True), - makecase('test_qux6', 23.589, failed=True), - ], - }), + fakehash("b"): [], + fakehash("c"): [ + make_report_v1( + { + "Baz": [ + makecase("test_baz2", 13.605), + # no recent suites have & skip this test + makecase("test_baz1", 
0.004, skipped=True), + ], + "Foo": [ + makecase("test_foo1", 43), + # test added since dddd + makecase("test_foo2", 57), + ], + "Grault": [ + makecase("test_grault0", 4.88, failed=True), + makecase("test_grault1", 11.967, failed=True), + makecase("test_grault2", 0.395, errored=True), + makecase("test_grault3", 30.460), + ], + "Norf": [ + makecase("test_norf1", 2), + makecase("test_norf2", 2), + makecase("test_norf3", 2), + makecase("test_norf4", 2), + ], + "Qux": [ + makecase("test_qux3", 4.978, errored=True), + makecase("test_qux7", 0.002, skipped=True), + makecase("test_qux2", 5.618), + makecase("test_qux4", 7.766, errored=True), + makecase("test_qux6", 23.589, failed=True), + ], + } + ), ], - fakehash('d'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo1', 40), - # removed in cccc - makecase('test_foo3', 17), - ], - 'Baz': [ - # not skipped, so not included in stdev - makecase('test_baz1', 3.14), - ], - 'Qux': [ - makecase('test_qux7', 0.004, skipped=True), - makecase('test_qux2', 6.02), - makecase('test_qux4', 20.932), - ], - 'Norf': [ - makecase('test_norf1', 3), - makecase('test_norf2', 3), - makecase('test_norf3', 3), - makecase('test_norf4', 3), - ], - 'Grault': [ - makecase('test_grault0', 5, failed=True), - makecase('test_grault1', 14.325, failed=True), - makecase('test_grault2', 0.31, errored=True), - ], - }), + fakehash("d"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo1", 40), + # removed in cccc + makecase("test_foo3", 17), + ], + "Baz": [ + # not skipped, so not included in stdev + makecase("test_baz1", 3.14), + ], + "Qux": [ + makecase("test_qux7", 0.004, skipped=True), + makecase("test_qux2", 6.02), + makecase("test_qux4", 20.932), + ], + "Norf": [ + makecase("test_norf1", 3), + makecase("test_norf2", 3), + makecase("test_norf3", 3), + makecase("test_norf4", 3), + ], + "Grault": [ + makecase("test_grault0", 5, failed=True), + makecase("test_grault1", 14.325, failed=True), + makecase("test_grault2", 0.31, errored=True), + ], + } + ), ], - fakehash('e'): [], - fakehash('f'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo3', 24), - makecase('test_foo1', 43), - ], - 'Baz': [ - makecase('test_baz2', 16.857), - ], - 'Qux': [ - makecase('test_qux2', 6.422), - makecase('test_qux4', 6.382, errored=True), - ], - 'Norf': [ - makecase('test_norf1', 0.9), - makecase('test_norf3', 0.9), - makecase('test_norf2', 0.9), - makecase('test_norf4', 0.9), - ], - 'Grault': [ - makecase('test_grault0', 4.7, failed=True), - makecase('test_grault1', 13.146, failed=True), - makecase('test_grault2', 0.48, errored=True), - ], - }), + fakehash("e"): [], + fakehash("f"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo3", 24), + makecase("test_foo1", 43), + ], + "Baz": [ + makecase("test_baz2", 16.857), + ], + "Qux": [ + makecase("test_qux2", 6.422), + makecase("test_qux4", 6.382, errored=True), + ], + "Norf": [ + makecase("test_norf1", 0.9), + makecase("test_norf3", 0.9), + makecase("test_norf2", 0.9), + makecase("test_norf4", 0.9), + ], + "Grault": [ + makecase("test_grault0", 4.7, failed=True), + makecase("test_grault1", 13.146, failed=True), + makecase("test_grault2", 0.48, errored=True), + ], + } + ), ], } @@ -344,7 +364,7 @@ def test_analysis(self) -> None: ) self.assertEqual( - '''\ + """\ - class Baz: - # was 15.23s ± 2.30s @@ -402,14 +422,14 @@ class Qux: + def test_bar2: ... 
+ # now 3.742s (failed) -''', +""", print_test_stats.anomalies(analysis), ) def test_graph(self) -> None: # HEAD is on master self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -420,21 +440,21 @@ def test_graph(self) -> None: * dddddddddd 0 reports | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=502.99, base_seconds={ - fakehash('b'): [47.84], - fakehash('c'): [332.50], - fakehash('d'): [], + fakehash("b"): [47.84], + fakehash("c"): [332.50], + fakehash("d"): [], }, on_master=True, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -446,21 +466,21 @@ def test_graph(self) -> None: * dddddddddd 1 report, total time 1234.56s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=9988.77, base_seconds={ - fakehash('b'): [7598.77] * 60 + [7654.32] + [7709.87] * 60, - fakehash('c'): [5308.77] * 10 + [5802.33] * 10, - fakehash('d'): [1234.56], + fakehash("b"): [7598.77] * 60 + [7654.32] + [7709.87] * 60, + fakehash("c"): [5308.77] * 10 + [5802.33] * 10, + fakehash("d"): [1234.56], }, on_master=False, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -474,22 +494,22 @@ def test_graph(self) -> None: * dddddddddd (base) 15 reports, total time 58.92s ± 25.82s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=25.52, base_seconds={ - fakehash('b'): [], - fakehash('c'): [], - fakehash('d'): [52.25] * 14 + [152.26], + fakehash("b"): [], + fakehash("c"): [], + fakehash("d"): [52.25] * 14 + [152.26], }, on_master=False, ancestry_path=5, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -503,22 +523,22 @@ def test_graph(self) -> None: * dddddddddd 3 reports, total time 0.10s ± 0.05s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=0.08, base_seconds={ - fakehash('b'): [], - fakehash('c'): [0.09], - fakehash('d'): [0.05, 0.10, 0.15], + fakehash("b"): [], + fakehash("c"): [0.09], + fakehash("d"): [0.05, 0.10, 0.15], }, on_master=False, other_ancestors=1, - ) + ), ) self.assertEqual( - '''\ + """\ Commit graph (base is most recent master ancestor with at least one S3 report): : (master) @@ -534,24 +554,24 @@ def test_graph(self) -> None: * dddddddddd 10 reports, total time 5.84s ± 0.92s | : -''', +""", print_test_stats.graph( - head_sha=fakehash('a'), + head_sha=fakehash("a"), head_seconds=5.98, base_seconds={ - fakehash('b'): [4.81, 7.23], - fakehash('c'): [], - fakehash('d'): [4.97] * 5 + [6.71] * 5, + fakehash("b"): [4.81, 7.23], + fakehash("c"): [], + fakehash("d"): [4.97] * 5 + [6.71] * 5, }, on_master=False, ancestry_path=1, other_ancestors=7, - ) + ), ) def test_regression_info(self) -> None: self.assertEqual( - '''\ + """\ ----- Historic stats comparison result ------ job: foo_job @@ -571,41 +591,48 @@ def test_regression_info(self) -> None: Removed (across 1 suite) 1 test, totaling - 1.00s Modified (across 1 suite) 1 test, totaling - 41.48s ± 2.12s Added (across 1 suite) 1 test, totaling + 3.00s -''', +""", print_test_stats.regression_info( - head_sha=fakehash('a'), - head_report=make_report_v1({ - 'Foo': [ - makecase('test_foo', 0.02, skipped=True), - 
makecase('test_baz', 3), - ]}), + head_sha=fakehash("a"), + head_report=make_report_v1( + { + "Foo": [ + makecase("test_foo", 0.02, skipped=True), + makecase("test_baz", 3), + ] + } + ), base_reports={ - fakehash('b'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo', 40), - makecase('test_bar', 1), - ], - }), + fakehash("b"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo", 40), + makecase("test_bar", 1), + ], + } + ), ], - fakehash('c'): [ - make_report_v1({ - 'Foo': [ - makecase('test_foo', 43), - ], - }), + fakehash("c"): [ + make_report_v1( + { + "Foo": [ + makecase("test_foo", 43), + ], + } + ), ], }, - job_name='foo_job', + job_name="foo_job", on_master=False, ancestry_path=0, other_ancestors=0, - ) + ), ) def test_regression_info_new_job(self) -> None: self.assertEqual( - '''\ + """\ ----- Historic stats comparison result ------ job: foo_job @@ -629,25 +656,28 @@ def test_regression_info_new_job(self) -> None: Removed (across 0 suites) 0 tests, totaling 0.00s Modified (across 0 suites) 0 tests, totaling 0.00s Added (across 1 suite) 2 tests, totaling + 3.02s -''', +""", print_test_stats.regression_info( - head_sha=fakehash('a'), - head_report=make_report_v1({ - 'Foo': [ - makecase('test_foo', 0.02, skipped=True), - makecase('test_baz', 3), - ]}), + head_sha=fakehash("a"), + head_report=make_report_v1( + { + "Foo": [ + makecase("test_foo", 0.02, skipped=True), + makecase("test_baz", 3), + ] + } + ), base_reports={ - fakehash('b'): [], - fakehash('c'): [], + fakehash("b"): [], + fakehash("c"): [], }, - job_name='foo_job', + job_name="foo_job", on_master=False, ancestry_path=3, other_ancestors=2, - ) + ), ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_test_history.py b/tools/test/test_test_history.py index 8863c24a5d55..7851ca3f510f 100644 --- a/tools/test/test_test_history.py +++ b/tools/test/test_test_history.py @@ -16,36 +16,33 @@ class Example(TypedDict): def parse_block(block: List[str]) -> Optional[Example]: if block: - match = re.match(r'^\$ ([^ ]+) (.*)$', block[0]) + match = re.match(r"^\$ ([^ ]+) (.*)$", block[0]) if match: cmd, first = match.groups() args = [] for i, line in enumerate([first] + block[1:]): - if line.endswith('\\'): + if line.endswith("\\"): args.append(line[:-1]) else: args.append(line) break return { - 'cmd': cmd, - 'args': shlex.split(''.join(args)), - 'lines': block[i + 1:] + "cmd": cmd, + "args": shlex.split("".join(args)), + "lines": block[i + 1 :], } return None def parse_description(description: str) -> List[Example]: examples: List[Example] = [] - for block in description.split('\n\n'): - matches = [ - re.match(r'^ (.*)$', line) - for line in block.splitlines() - ] + for block in description.split("\n\n"): + matches = [re.match(r"^ (.*)$", line) for line in block.splitlines()] if all(matches): lines = [] for match in matches: assert match - line, = match.groups() + (line,) = match.groups() lines.append(line) example = parse_block(lines) if example: @@ -53,6 +50,7 @@ def parse_description(description: str) -> List[Example]: return examples +@unittest.skip("Skipping as this test is fragile, issue #73083") class TestTestHistory(unittest.TestCase): maxDiff = None @@ -61,14 +59,16 @@ def test_help_examples(self) -> None: self.assertEqual(len(examples), 3) for i, example in enumerate(examples): with self.subTest(i=i): - self.assertTrue(test_history.__file__.endswith(example['cmd'])) - expected = example['lines'] - actual = list(itertools.islice( - test_history.run(example['args']), - 
len(expected), - )) + self.assertTrue(test_history.__file__.endswith(example["cmd"])) + expected = example["lines"] + actual = list( + itertools.islice( + test_history.run(example["args"]), + len(expected), + ) + ) self.assertEqual(actual, expected) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_test_selections.py b/tools/test/test_test_selections.py index 5ea6fa8b3c62..b846bb53c0cb 100644 --- a/tools/test/test_test_selections.py +++ b/tools/test/test_test_selections.py @@ -7,37 +7,37 @@ class TestCalculateShards(unittest.TestCase): tests: List[str] = [ - 'super_long_test', - 'long_test1', - 'long_test2', - 'normal_test1', - 'normal_test2', - 'normal_test3', - 'short_test1', - 'short_test2', - 'short_test3', - 'short_test4', - 'short_test5', + "super_long_test", + "long_test1", + "long_test2", + "normal_test1", + "normal_test2", + "normal_test3", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", ] test_times: Dict[str, float] = { - 'super_long_test': 55, - 'long_test1': 22, - 'long_test2': 18, - 'normal_test1': 9, - 'normal_test2': 7, - 'normal_test3': 5, - 'short_test1': 1, - 'short_test2': 0.6, - 'short_test3': 0.4, - 'short_test4': 0.3, - 'short_test5': 0.01, + "super_long_test": 55, + "long_test1": 22, + "long_test2": 18, + "normal_test1": 9, + "normal_test2": 7, + "normal_test3": 5, + "short_test1": 1, + "short_test2": 0.6, + "short_test3": 0.4, + "short_test4": 0.3, + "short_test5": 0.01, } def assert_shards_equal( self, expected_shards: List[Tuple[float, List[str]]], - actual_shards: List[Tuple[float, List[str]]] + actual_shards: List[Tuple[float, List[str]]], ) -> None: for expected, actual in zip(expected_shards, actual_shards): self.assertAlmostEqual(expected[0], actual[0]) @@ -45,53 +45,140 @@ def assert_shards_equal( def test_calculate_2_shards_with_complete_test_times(self) -> None: expected_shards = [ - (60, ['super_long_test', 'normal_test3']), - (58.31, ['long_test1', 'long_test2', 'normal_test1', 'normal_test2', 'short_test1', 'short_test2', - 'short_test3', 'short_test4', 'short_test5']) + (60, ["super_long_test", "normal_test3"]), + ( + 58.31, + [ + "long_test1", + "long_test2", + "normal_test1", + "normal_test2", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", + ], + ), ] - self.assert_shards_equal(expected_shards, calculate_shards(2, self.tests, self.test_times)) + self.assert_shards_equal( + expected_shards, calculate_shards(2, self.tests, self.test_times) + ) + def test_calculate_1_shard_with_complete_test_times(self) -> None: + expected_shards = [ + ( + 118.31, + [ + "super_long_test", + "long_test1", + "long_test2", + "normal_test1", + "normal_test2", + "normal_test3", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", + ], + ), + ] + self.assert_shards_equal( + expected_shards, calculate_shards(1, self.tests, self.test_times) + ) def test_calculate_5_shards_with_complete_test_times(self) -> None: expected_shards = [ - (55.0, ['super_long_test']), - (22.0, ['long_test1', ]), - (18.0, ['long_test2', ]), - (11.31, ['normal_test1', 'short_test1', 'short_test2', 'short_test3', 'short_test4', 'short_test5']), - (12.0, ['normal_test2', 'normal_test3']), + (55.0, ["super_long_test"]), + ( + 22.0, + [ + "long_test1", + ], + ), + ( + 18.0, + [ + "long_test2", + ], + ), + ( + 11.31, + [ + "normal_test1", + "short_test1", + "short_test2", + "short_test3", + "short_test4", + "short_test5", + ], + ), + (12.0, 
["normal_test2", "normal_test3"]), ] - self.assert_shards_equal(expected_shards, calculate_shards(5, self.tests, self.test_times)) - + self.assert_shards_equal( + expected_shards, calculate_shards(5, self.tests, self.test_times) + ) def test_calculate_2_shards_with_incomplete_test_times(self) -> None: - incomplete_test_times = {k: v for k, v in self.test_times.items() if 'test1' in k} + incomplete_test_times = { + k: v for k, v in self.test_times.items() if "test1" in k + } expected_shards = [ - (22.0, ['long_test1', 'long_test2', 'normal_test3', 'short_test3', 'short_test5']), - (10.0, ['normal_test1', 'short_test1', 'super_long_test', 'normal_test2', 'short_test2', 'short_test4']), + ( + 22.0, + [ + "long_test1", + "long_test2", + "normal_test3", + "short_test3", + "short_test5", + ], + ), + ( + 10.0, + [ + "normal_test1", + "short_test1", + "super_long_test", + "normal_test2", + "short_test2", + "short_test4", + ], + ), ] - self.assert_shards_equal(expected_shards, calculate_shards(2, self.tests, incomplete_test_times)) - + self.assert_shards_equal( + expected_shards, calculate_shards(2, self.tests, incomplete_test_times) + ) def test_calculate_5_shards_with_incomplete_test_times(self) -> None: - incomplete_test_times = {k: v for k, v in self.test_times.items() if 'test1' in k} + incomplete_test_times = { + k: v for k, v in self.test_times.items() if "test1" in k + } expected_shards = [ - (22.0, ['long_test1', 'normal_test2', 'short_test5']), - (9.0, ['normal_test1', 'normal_test3']), - (1.0, ['short_test1', 'short_test2']), - (0.0, ['super_long_test', 'short_test3']), - (0.0, ['long_test2', 'short_test4']), + (22.0, ["long_test1", "normal_test2", "short_test5"]), + (9.0, ["normal_test1", "normal_test3"]), + (1.0, ["short_test1", "short_test2"]), + (0.0, ["super_long_test", "short_test3"]), + (0.0, ["long_test2", "short_test4"]), ] - self.assert_shards_equal(expected_shards, calculate_shards(5, self.tests, incomplete_test_times)) + self.assert_shards_equal( + expected_shards, calculate_shards(5, self.tests, incomplete_test_times) + ) def test_calculate_2_shards_against_optimal_shards(self) -> None: for _ in range(100): random.seed(120) random_times = {k: random.random() * 10 for k in self.tests} # all test times except first two - rest_of_tests = [i for k, i in random_times.items() if k != 'super_long_test' and k != 'long_test1'] + rest_of_tests = [ + i + for k, i in random_times.items() + if k != "super_long_test" and k != "long_test1" + ] sum_of_rest = sum(rest_of_tests) - random_times['super_long_test'] = max(sum_of_rest / 2, max(rest_of_tests)) - random_times['long_test1'] = sum_of_rest - random_times['super_long_test'] + random_times["super_long_test"] = max(sum_of_rest / 2, max(rest_of_tests)) + random_times["long_test1"] = sum_of_rest - random_times["super_long_test"] # An optimal sharding would look like the below, but we don't need to compute this for the test: # optimal_shards = [ # (sum_of_rest, ['super_long_test', 'long_test1']), @@ -103,10 +190,12 @@ def test_calculate_2_shards_against_optimal_shards(self) -> None: # The calculated shard should not have a ratio worse than 7/6 for num_shards = 2 self.assertGreaterEqual(7.0 / 6.0, max_shard_time / sum_of_rest) sorted_tests = sorted(self.tests) - sorted_shard_tests = sorted(calculated_shards[0][1] + calculated_shards[1][1]) + sorted_shard_tests = sorted( + calculated_shards[0][1] + calculated_shards[1][1] + ) # All the tests should be represented by some shard self.assertEqual(sorted_tests, sorted_shard_tests) -if __name__ == 
'__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/test/test_trailing_newlines.py b/tools/test/test_trailing_newlines.py deleted file mode 100644 index 4f4b662b1036..000000000000 --- a/tools/test/test_trailing_newlines.py +++ /dev/null @@ -1,49 +0,0 @@ -from tools.linter import trailing_newlines -import unittest -import tempfile - - -def correct_trailing_newlines(file_contents: str) -> bool: - with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp: - filename = tmp.name - tmp.write(file_contents) - return trailing_newlines.correct_trailing_newlines(filename) - - -class TestTrailingNewlines(unittest.TestCase): - def test_empty(self) -> None: - self.assertTrue(correct_trailing_newlines('')) - - def test_single_byte(self) -> None: - self.assertFalse(correct_trailing_newlines('a')) - - def test_single_newline(self) -> None: - self.assertFalse(correct_trailing_newlines('\n')) - - def test_two_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('\n\n')) - - def test_three_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('\n\n\n')) - - def test_hello_world(self) -> None: - self.assertFalse(correct_trailing_newlines('hello world')) - - def test_hello_world_newline(self) -> None: - self.assertTrue(correct_trailing_newlines('hello world\n')) - - def test_hello_world_two_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('hello world\n\n')) - - def test_hello_world_three_newlines(self) -> None: - self.assertFalse(correct_trailing_newlines('hello world\n\n\n')) - - def test_hello_world_multiline(self) -> None: - self.assertFalse(correct_trailing_newlines('hello\nworld')) - - def test_hello_world_multiline_gap(self) -> None: - self.assertTrue(correct_trailing_newlines('hello\n\nworld\n')) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/test/test_translate_annotations.py b/tools/test/test_translate_annotations.py deleted file mode 100644 index 867decc4af1a..000000000000 --- a/tools/test/test_translate_annotations.py +++ /dev/null @@ -1,280 +0,0 @@ -import re -import unittest - -from tools.linter.translate_annotations import parse_annotation, parse_diff, translate - -flake8_regex \ - = r'^(?P.*?):(?P\d+):(?P\d+): (?P\w+\d+) (?P.*)' -clang_tidy_regex \ - = r'^(?P.*?):(?P\d+):(?P\d+): (?P.*?) \[(?P.*)\]' - -# in the below example patch, note that the filenames differ, so the -# translation should reflect that as well as the line numbers - -# $ git clone -b 1.0.2 https://github.com/cscorley/whatthepatch.git -# $ cd whatthepatch/tests/casefiles -# $ git diff --no-index --unified=0 lao tzu -lao_tzu_diff = ''' -diff --git a/lao b/tzu -index 635ef2c..5af88a8 100644 ---- a/lao -+++ b/tzu -@@ -1,2 +0,0 @@ --The Way that can be told of is not the eternal Way; --The name that can be named is not the eternal name. -@@ -4 +2,2 @@ The Nameless is the origin of Heaven and Earth; --The Named is the mother of all things. -+The named is the mother of all things. -+ -@@ -11,0 +11,3 @@ But after they are produced, -+They both may be called deep and profound. -+Deeper and more profound, -+The door of all subtleties! -'''.lstrip() - -sparser_diff = ''' -diff --git a/foo.txt b/bar.txt -index 27a6dad..6fae323 100644 ---- a/foo.txt -+++ b/bar.txt -@@ -4,3 +4,2 @@ lines --lines --lines --lines -+A change!! 
-+Wow -@@ -10,2 +8,0 @@ more lines --even more --even more -'''.lstrip() - -new_file_diff = ''' -diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h -new file mode 100644 -index 0000000000..a81eeae346 ---- /dev/null -+++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h -@@ -0,0 +1,19 @@ -+#pragma once -+ -+#include -+ -+namespace torch { -+namespace jit { -+namespace tensorexpr { -+ -+TORCH_API Tensor* conv2d_depthwise( -+ BufHandle input, -+ BufHandle weight, -+ BufHandle bias, -+ int stride, -+ int pad, -+ int groups); -+ -+} // namespace tensorexpr -+} // namespace jit -+} // namespace torch -'''.lstrip() - -# fun fact, this example fools VS Code's diff syntax highlighter -haskell_diff = ''' -diff --git a/hello.hs b/hello.hs -index ffb8d4ad14..0872ac9db6 100644 ---- a/hello.hs -+++ b/hello.hs -@@ -1 +1 @@ ---- a/hello/world/example -+main = putStrLn "Hello, world!" -'''.lstrip() - - -class TestTranslateAnnotations(unittest.TestCase): - maxDiff = None - - def test_parse_diff_lao_tzu(self) -> None: - self.assertEqual( - parse_diff(lao_tzu_diff), - { - 'old_filename': 'lao', - 'hunks': [ - { - 'old_start': 1, - 'old_count': 2, - 'new_start': 0, - 'new_count': 0, - }, - { - 'old_start': 4, - 'old_count': 1, - 'new_start': 2, - 'new_count': 2, - }, - { - 'old_start': 11, - 'old_count': 0, - 'new_start': 11, - 'new_count': 3, - }, - ], - }, - ) - - def test_parse_diff_new_file(self) -> None: - self.assertEqual( - parse_diff(new_file_diff), - { - 'old_filename': None, - 'hunks': [ - { - 'old_start': 0, - 'old_count': 0, - 'new_start': 1, - 'new_count': 19, - }, - ], - }, - ) - - def test_parse_diff_haskell(self) -> None: - self.assertEqual( - parse_diff(haskell_diff), - { - 'old_filename': 'hello.hs', - 'hunks': [ - { - 'old_start': 1, - 'old_count': 1, - 'new_start': 1, - 'new_count': 1, - }, - ], - }, - ) - - def test_translate_lao_tzu(self) -> None: - # we'll pretend that this diff represents the file lao being - # renamed to tzu and also modified - diff = parse_diff(lao_tzu_diff) - - # line numbers less than 1 are invalid so they map to None - self.assertEqual(translate(diff, -1), None) - self.assertEqual(translate(diff, 0), None) - - # the first two lines of the file were removed, so the first - # line of the new version corresponds to the third line of the - # original - self.assertEqual(translate(diff, 1), 3) - - # the second and third lines of the new file were not present in - # the original version, so they map to None - self.assertEqual(translate(diff, 2), None) - self.assertEqual(translate(diff, 3), None) - - # at this point, we have a stretch of lines that are identical - # in both versions of the file, but the original version of the - # file had 4 lines before this section whereas the new version - # has only 3 lines before this section - self.assertEqual(translate(diff, 4), 5) - self.assertEqual(translate(diff, 5), 6) - self.assertEqual(translate(diff, 6), 7) - self.assertEqual(translate(diff, 7), 8) - self.assertEqual(translate(diff, 8), 9) - self.assertEqual(translate(diff, 9), 10) - self.assertEqual(translate(diff, 10), 11) - - # these three lines were added in the new version of the file, - # so they map to None - self.assertEqual(translate(diff, 11), None) - self.assertEqual(translate(diff, 12), None) - self.assertEqual(translate(diff, 13), None) - - # the diff doesn't say how long the file is, so we keep mapping - # line numbers back; since we can look back at the original - # files, though, we can see that the original 
is two lines - # shorter than the new version, which explains why we are - # subtracting 2 here - self.assertEqual(translate(diff, 14), 12) - self.assertEqual(translate(diff, 15), 13) - - def test_translate_empty(self) -> None: - diff = parse_diff('--- a/foo') - - # again, we start numbering at 1 - self.assertEqual(translate(diff, -1), None) - self.assertEqual(translate(diff, 0), None) - - # this diff says there are no changes, so all line numbers - # greater than zero map to themselves - self.assertEqual(translate(diff, 1), 1) - self.assertEqual(translate(diff, 2), 2) - self.assertEqual(translate(diff, 3), 3) - self.assertEqual(translate(diff, 4), 4) - self.assertEqual(translate(diff, 5), 5) - - def test_translate_sparser(self) -> None: - diff = parse_diff(sparser_diff) - - # again, we start numbering at 1 - self.assertEqual(translate(diff, -1), None) - self.assertEqual(translate(diff, 0), None) - - # the first three lines are unchanged - self.assertEqual(translate(diff, 1), 1) - self.assertEqual(translate(diff, 2), 2) - self.assertEqual(translate(diff, 3), 3) - - # we removed three lines here and added two, so the two lines we - # added don't map back to anything in the original file - self.assertEqual(translate(diff, 4), None) - self.assertEqual(translate(diff, 5), None) - - # we have some unchanged lines here, but in the preceding hunk - # we removed 3 and added only 2, so we have an offset of 1 - self.assertEqual(translate(diff, 6), 7) - self.assertEqual(translate(diff, 7), 8) - - # since the unified diff format essentially subtracts 1 from the - # starting line number when the count is 0, and since we use - # bisect.bisect_right to decide which hunk to look at, an - # earlier version of translate had a bug that caused it to get - # confused because it would look at the second hunk (which lists - # 8 as its start line number) rather than the first hunk - self.assertEqual(translate(diff, 8), 9) - - # after the two lines that we removed in the second hunk, we've - # reduced the total length of the file by 3 lines, so once we - # reach the end of the diff, we just add 3 to every line number - self.assertEqual(translate(diff, 9), 12) - self.assertEqual(translate(diff, 10), 13) - self.assertEqual(translate(diff, 11), 14) - self.assertEqual(translate(diff, 12), 15) - - def test_parse_annotation_flake8(self) -> None: - regex = re.compile(flake8_regex) - self.assertEqual( - parse_annotation(regex, 'README.md:1:3: R100 make a better title'), - { - 'filename': 'README.md', - 'lineNumber': 1, - 'columnNumber': 3, - 'errorCode': 'R100', - 'errorDesc': 'make a better title', - }, - ) - - def test_parse_annotation_clang_tidy(self) -> None: - regex = re.compile(clang_tidy_regex) - self.assertEqual( - parse_annotation(regex, 'README.md:2:1: improve description [R200]'), - { - 'filename': 'README.md', - 'lineNumber': 2, - 'columnNumber': 1, - 'errorCode': 'R200', - 'errorDesc': 'improve description', - }, - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/testing/explicit_ci_jobs.py b/tools/testing/explicit_ci_jobs.py index 5944d226b0bc..3de04e1a18e9 100755 --- a/tools/testing/explicit_ci_jobs.py +++ b/tools/testing/explicit_ci_jobs.py @@ -45,7 +45,13 @@ def add_job( if requires is not None: for requirement in requires: dependency = past_jobs[requirement] - add_job(workflows, dependency["workflow_name"], dependency["type"], dependency["job"], past_jobs) + add_job( + workflows, + dependency["workflow_name"], + dependency["type"], + dependency["job"], + past_jobs, + ) 
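# Editorial note (not part of the patch): in the deleted test_translate_annotations.py
# above, the flake8/clang-tidy regexes lost their named capture groups during text
# extraction (they appear as "(?P.*?)" etc.). Below is a minimal, self-contained sketch
# of what such an annotation parser plausibly looks like, with group names inferred
# from the expected dicts in test_parse_annotation_flake8; the exact regex and group
# names used by tools/linter/translate_annotations.py are assumptions, not the source.
import re
from typing import Any, Dict, Optional

FLAKE8_REGEX = (
    r"^(?P<filename>.*?):(?P<lineNumber>\d+):(?P<columnNumber>\d+): "
    r"(?P<errorCode>\w+\d+) (?P<errorDesc>.*)"
)


def parse_annotation_sketch(regex: "re.Pattern[str]", line: str) -> Optional[Dict[str, Any]]:
    # Return a dict shaped like the one asserted in the deleted tests, or None on no match.
    m = regex.match(line)
    if m is None:
        return None
    return {
        "filename": m.group("filename"),
        "lineNumber": int(m.group("lineNumber")),
        "columnNumber": int(m.group("columnNumber")),
        "errorCode": m.group("errorCode"),
        "errorDesc": m.group("errorDesc"),
    }


# Usage, mirroring the deleted test case:
#   parse_annotation_sketch(re.compile(FLAKE8_REGEX), "README.md:1:3: R100 make a better title")
# returns {"filename": "README.md", "lineNumber": 1, "columnNumber": 3,
#          "errorCode": "R100", "errorDesc": "make a better title"}.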
workflows[workflow_name]["jobs"].append({type: job}) @@ -88,13 +94,16 @@ def get_filtered_circleci_config( def commit_ci(files: List[str], message: str) -> None: # Check that there are no other modified files than the ones edited by this # tool - stdout = subprocess.run(["git", "status", "--porcelain"], stdout=subprocess.PIPE).stdout.decode() + stdout = subprocess.run( + ["git", "status", "--porcelain"], stdout=subprocess.PIPE + ).stdout.decode() for line in stdout.split("\n"): if line == "": continue if line[0] != " ": - raise RuntimeError(f"Refusing to commit while other changes are already staged: {line}") - + raise RuntimeError( + f"Refusing to commit while other changes are already staged: {line}" + ) # Make the commit subprocess.run(["git", "add"] + files) @@ -107,10 +116,12 @@ def commit_ci(files: List[str], message: str) -> None: ) parser.add_argument("--job", action="append", help="job name", default=[]) parser.add_argument( - "--filter-gha", help="keep only these github actions (glob match)", default='' + "--filter-gha", help="keep only these github actions (glob match)", default="" ) parser.add_argument( - "--make-commit", action="store_true", help="add change to git with to a do-not-merge commit" + "--make-commit", + action="store_true", + help="add change to git with to a do-not-merge commit", ) args = parser.parse_args() @@ -118,7 +129,9 @@ def commit_ci(files: List[str], message: str) -> None: with open(CONFIG_YML, "r") as f: config_yml = yaml.safe_load(f.read()) - config_yml["workflows"] = get_filtered_circleci_config(config_yml["workflows"], args.job) + config_yml["workflows"] = get_filtered_circleci_config( + config_yml["workflows"], args.job + ) with open(CONFIG_YML, "w") as f: yaml.dump(config_yml, f) @@ -131,13 +144,15 @@ def commit_ci(files: List[str], message: str) -> None: path.resolve().unlink() if args.make_commit: - jobs_str = '\n'.join([f" * {job}" for job in args.job]) - message = textwrap.dedent(f""" + jobs_str = "\n".join([f" * {job}" for job in args.job]) + message = textwrap.dedent( + f""" [skip ci][do not merge] Edit config.yml to filter specific jobs Filter CircleCI to only run: {jobs_str} See [Run Specific CI Jobs](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md#run-specific-ci-jobs) for details. - """).strip() + """ + ).strip() commit_ci([str(f.relative_to(REPO_ROOT)) for f in touched_files], message) diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index c83b0619f030..67ca627cc2c0 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -6,16 +6,16 @@ from tools.stats.s3_stat_parser import ( get_previous_reports_for_branch, get_previous_reports_for_pr, - Report, Version2Report, - HAVE_BOTO3) -from tools.stats.import_test_stats import ( - get_disabled_tests, - get_slow_tests + Report, + Version2Report, + HAVE_BOTO3, ) +from tools.stats.import_test_stats import get_disabled_tests, get_slow_tests from typing import Any, Dict, List, Optional, Tuple, cast from typing_extensions import TypedDict + class JobTimeJSON(TypedDict): commit: str JOB_BASE_NAME: str @@ -23,50 +23,55 @@ class JobTimeJSON(TypedDict): def _get_stripped_CI_job() -> str: - """E.g. convert 'pytorch_windows_vs2019_py36_cuda10.1_build' to 'pytorch_windows_vs2019_py36_cuda10.1'. 
- """ - job = os.environ.get("JOB_BASE_NAME", "").rstrip('0123456789') - if job.endswith('_slow_test'): - job = job[:len(job) - len('_slow_test')] - elif job.endswith('_test') or job.endswith('-test'): - job = job[:len(job) - len('_test')] - elif job.endswith('_build') or job.endswith('-build'): - job = job[:len(job) - len('_build')] + """E.g. convert 'pytorch_windows_vs2019_py36_cuda10.1_build' to 'pytorch_windows_vs2019_py36_cuda10.1'.""" + job = os.environ.get("JOB_BASE_NAME", "").rstrip("0123456789") + if job.endswith("_slow_test"): + job = job[: len(job) - len("_slow_test")] + elif job.endswith("_test") or job.endswith("-test"): + job = job[: len(job) - len("_test")] + elif job.endswith("_build") or job.endswith("-build"): + job = job[: len(job) - len("_build")] return job def _get_job_times_json(job_times: Dict[str, float]) -> JobTimeJSON: return { - 'commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'], encoding="ascii").strip(), - 'JOB_BASE_NAME': _get_stripped_CI_job(), - 'job_times': job_times, + "commit": subprocess.check_output( + ["git", "rev-parse", "HEAD"], encoding="ascii" + ).strip(), + "JOB_BASE_NAME": _get_stripped_CI_job(), + "job_times": job_times, } def _calculate_job_times(reports: List["Report"]) -> Dict[str, float]: - """Compute test runtime by filename: ("test_file_name" -> (current_avg, # values)) - """ + """Compute test runtime by filename: ("test_file_name" -> (current_avg, # values))""" jobs_to_times: Dict[str, Tuple[float, int]] = dict() for report in reports: v_report = cast(Version2Report, report) - assert 'format_version' in v_report.keys() and v_report.get('format_version') == 2, \ - "S3 format currently handled is version 2 only" - files: Dict[str, Any] = v_report['files'] + assert ( + "format_version" in v_report.keys() and v_report.get("format_version") == 2 + ), "S3 format currently handled is version 2 only" + files: Dict[str, Any] = v_report["files"] for name, test_file in files.items(): if name not in jobs_to_times: - jobs_to_times[name] = (test_file['total_seconds'], 1) + jobs_to_times[name] = (test_file["total_seconds"], 1) else: curr_avg, curr_count = jobs_to_times[name] new_count = curr_count + 1 - new_avg = (curr_avg * curr_count + test_file['total_seconds']) / new_count + new_avg = ( + curr_avg * curr_count + test_file["total_seconds"] + ) / new_count jobs_to_times[name] = (new_avg, new_count) return {job: time for job, (time, _) in jobs_to_times.items()} -def calculate_shards(num_shards: int, tests: List[str], job_times: Dict[str, float]) -> List[Tuple[float, List[str]]]: +def calculate_shards( + num_shards: int, tests: List[str], job_times: Dict[str, float] +) -> List[Tuple[float, List[str]]]: filtered_job_times: Dict[str, float] = dict() - unknown_jobs : List[str] = [] + unknown_jobs: List[str] = [] for test in tests: if test in job_times: filtered_job_times[test] = job_times[test] @@ -75,13 +80,18 @@ def calculate_shards(num_shards: int, tests: List[str], job_times: Dict[str, flo # The following attempts to implement a partition approximation greedy algorithm # See more at https://en.wikipedia.org/wiki/Greedy_number_partitioning - sorted_jobs = sorted(filtered_job_times, key=lambda j: filtered_job_times[j], reverse=True) + sorted_jobs = sorted( + filtered_job_times, key=lambda j: filtered_job_times[j], reverse=True + ) sharded_jobs: List[Tuple[float, List[str]]] = [(0.0, []) for _ in range(num_shards)] for job in sorted_jobs: min_shard_index = sorted(range(num_shards), key=lambda i: sharded_jobs[i][0])[0] curr_shard_time, 
curr_shard_jobs = sharded_jobs[min_shard_index] curr_shard_jobs.append(job) - sharded_jobs[min_shard_index] = (curr_shard_time + filtered_job_times[job], curr_shard_jobs) + sharded_jobs[min_shard_index] = ( + curr_shard_time + filtered_job_times[job], + curr_shard_jobs, + ) # Round robin the unknown jobs starting with the smallest shard index = sorted(range(num_shards), key=lambda i: sharded_jobs[i][0])[0] @@ -94,14 +104,20 @@ def calculate_shards(num_shards: int, tests: List[str], job_times: Dict[str, flo def _pull_job_times_from_S3() -> Dict[str, float]: if HAVE_BOTO3: ci_job_prefix = _get_stripped_CI_job() - s3_reports: List["Report"] = get_previous_reports_for_branch('origin/viable/strict', ci_job_prefix) + s3_reports: List["Report"] = get_previous_reports_for_branch( + "origin/viable/strict", ci_job_prefix + ) else: - print('Uh oh, boto3 is not found. Either it is not installed or we failed to import s3_stat_parser.') - print('If not installed, please install boto3 for automatic sharding and test categorization.') + print( + "Uh oh, boto3 is not found. Either it is not installed or we failed to import s3_stat_parser." + ) + print( + "If not installed, please install boto3 for automatic sharding and test categorization." + ) s3_reports = [] if len(s3_reports) == 0: - print('Gathered no reports from S3. Please proceed without them.') + print("Gathered no reports from S3. Please proceed without them.") return dict() return _calculate_job_times(s3_reports) @@ -116,20 +132,26 @@ def _query_past_job_times(test_times_file: Optional[str] = None) -> Dict[str, fl with open(test_times_file) as file: test_times_json: JobTimeJSON = json.load(file) - curr_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD'], encoding="ascii").strip() - file_commit = test_times_json.get('commit', '') + curr_commit = subprocess.check_output( + ["git", "rev-parse", "HEAD"], encoding="ascii" + ).strip() + file_commit = test_times_json.get("commit", "") curr_ci_job = _get_stripped_CI_job() - file_ci_job = test_times_json.get('JOB_BASE_NAME', 'N/A') + file_ci_job = test_times_json.get("JOB_BASE_NAME", "N/A") if curr_commit != file_commit: - print(f'Current test times file is from different commit {file_commit}.') + print(f"Current test times file is from different commit {file_commit}.") elif curr_ci_job != file_ci_job: - print(f'Current test times file is for different CI job {file_ci_job}.') + print(f"Current test times file is for different CI job {file_ci_job}.") else: - print(f'Found stats for current commit: {curr_commit} and job: {curr_ci_job}. Proceeding with those values.') - return test_times_json.get('job_times', {}) + print( + f"Found stats for current commit: {curr_commit} and job: {curr_ci_job}. Proceeding with those values." 
+ ) + return test_times_json.get("job_times", {}) # Found file, but commit or CI job in JSON doesn't match - print(f'Overwriting current file with stats based on current commit: {curr_commit} and CI job: {curr_ci_job}') + print( + f"Overwriting current file with stats based on current commit: {curr_commit} and CI job: {curr_ci_job}" + ) job_times = export_S3_test_times(test_times_file) @@ -142,21 +164,26 @@ def _query_failure_test_module(reports: List[Tuple["Report", str]]) -> List[str] return test_modules report = reports[0][0] v_report = cast(Version2Report, report) - assert 'format_version' in v_report.keys() and v_report.get('format_version') == 2, \ - "S3 format currently handled is version 2 only" - files: Dict[str, Any] = v_report['files'] + assert ( + "format_version" in v_report.keys() and v_report.get("format_version") == 2 + ), "S3 format currently handled is version 2 only" + files: Dict[str, Any] = v_report["files"] for fname, file in files.items(): contains_failure = any( - any(case['status'] == 'errored' or case['status'] == 'failed' - for _, case in suite['cases'].items()) - for _, suite in file['suites'].items()) + any( + case["status"] == "errored" or case["status"] == "failed" + for _, case in suite["cases"].items() + ) + for _, suite in file["suites"].items() + ) if contains_failure: test_modules.append(fname) return test_modules def _query_changed_test_files() -> List[str]: - cmd = ["git", "diff", "--name-only", "origin/master", "HEAD"] + default_branch = f"origin/{os.environ.get('GIT_DEFAULT_BRANCH', 'master')}" + cmd = ["git", "diff", "--name-only", default_branch, "HEAD"] proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if proc.returncode != 0: @@ -167,14 +194,19 @@ def _query_changed_test_files() -> List[str]: return lines -def get_shard_based_on_S3(which_shard: int, num_shards: int, tests: List[str], test_times_file: str) -> List[str]: - """Get sharded test allocation based on historic S3 data. - """ +# Get sharded test allocation based on historic S3 data. +def get_shard_based_on_S3( + which_shard: int, num_shards: int, tests: List[str], test_times_file: str +) -> List[str]: + # Short circuit and don't do any work if there's only 1 shard + if num_shards == 1: + return tests + jobs_to_times = _query_past_job_times(test_times_file) # Got no stats from S3, returning early to save runtime if len(jobs_to_times) == 0: - print('Gathered no stats from S3. Proceeding with default sharding plan.') + print("Gathered no stats from S3. Proceeding with default sharding plan.") return tests[which_shard - 1 :: num_shards] shards = calculate_shards(num_shards, tests, jobs_to_times) @@ -182,14 +214,15 @@ def get_shard_based_on_S3(which_shard: int, num_shards: int, tests: List[str], t return tests_from_shard -def get_slow_tests_based_on_S3(test_list: List[str], td_list: List[str], slow_test_threshold: int) -> List[str]: - """Get list of slow tests based on historic S3 data. - """ +def get_slow_tests_based_on_S3( + test_list: List[str], td_list: List[str], slow_test_threshold: int +) -> List[str]: + """Get list of slow tests based on historic S3 data.""" jobs_to_times: Dict[str, float] = _query_past_job_times() # Got no stats from S3, returning early to save runtime if len(jobs_to_times) == 0: - print('Gathered no stats from S3. No new slow tests calculated.') + print("Gathered no stats from S3. 
No new slow tests calculated.") return [] slow_tests: List[str] = [] @@ -201,38 +234,42 @@ def get_slow_tests_based_on_S3(test_list: List[str], td_list: List[str], slow_te def get_specified_test_cases(filename: str, tests: List[str]) -> Dict[str, List[str]]: - """Get test cases from a specified test case file. Usually exported manually or through CI system. - """ + """Get test cases from a specified test case file. Usually exported manually or through CI system.""" if not os.path.exists(filename): - print(f'Could not find specified tests file: {filename}. Proceeding with default behavior.') + print( + f"Could not find specified tests file: {filename}. Proceeding with default behavior." + ) return dict() # The below encoding is utf-8-sig because utf-8 doesn't properly handle the byte-order-mark character - with open(filename, mode='r', encoding="utf-8-sig") as csv_file: + with open(filename, mode="r", encoding="utf-8-sig") as csv_file: csv_reader = csv.DictReader(csv_file) line_count = 0 specified_test_case_dict: Dict[str, List[str]] = dict() for row in csv_reader: line_count += 1 if line_count == 1: - if 'test_filename' not in row or 'test_case_name' not in row: - print('Data is missing necessary columns for test specification. Proceeding with default behavior.') + if "test_filename" not in row or "test_case_name" not in row: + print( + "Data is missing necessary columns for test specification. Proceeding with default behavior." + ) return dict() - test_filename = row['test_filename'] - test_case_name = row['test_case_name'] + test_filename = row["test_filename"] + test_case_name = row["test_case_name"] if test_filename not in tests: - print(f'Specified test_filename {test_filename} not found in TESTS. Skipping.') + print( + f"Specified test_filename {test_filename} not found in TESTS. Skipping." + ) continue if test_filename not in specified_test_case_dict: specified_test_case_dict[test_filename] = [] specified_test_case_dict[test_filename].append(test_case_name) - print(f'Processed {line_count} test cases.') + print(f"Processed {line_count} test cases.") return specified_test_case_dict def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str]: - """Get the reordered test filename list based on github PR history or git changed file. - """ + """Get the reordered test filename list based on github PR history or git changed file.""" prioritized_tests = [] # Try using historic stats from PR. 
if is_reordering_by_pr and HAVE_BOTO3: @@ -240,7 +277,8 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str if len(pr_number): ci_job_prefix = _get_stripped_CI_job() s3_reports: List[Tuple["Report", str]] = get_previous_reports_for_pr( - pr_number, ci_job_prefix) + pr_number, ci_job_prefix + ) prioritized_tests = _query_failure_test_module(s3_reports) print("Prioritized test from previous CI info.") @@ -253,9 +291,11 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str return tests prefix = f"test{os.path.sep}" - prioritized_tests = [f for f in changed_files if f.startswith(prefix) and f.endswith(".py")] - prioritized_tests = [f[len(prefix):] for f in prioritized_tests] - prioritized_tests = [f[:-len(".py")] for f in prioritized_tests] + prioritized_tests = [ + f for f in changed_files if f.startswith(prefix) and f.endswith(".py") + ] + prioritized_tests = [f[len(prefix) :] for f in prioritized_tests] + prioritized_tests = [f[: -len(".py")] for f in prioritized_tests] print("Prioritized test from test file changes.") bring_to_front = [] @@ -267,12 +307,16 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str else: the_rest.append(test) if len(tests) == len(bring_to_front) + len(the_rest): - print(f"reordering tests for PR:\n" - f"prioritized: {bring_to_front}\nthe rest: {the_rest}\n") + print( + f"reordering tests for PR:\n" + f"prioritized: {bring_to_front}\nthe rest: {the_rest}\n" + ) return bring_to_front + the_rest else: - print(f"Something went wrong in CI reordering, expecting total of {len(tests)}:\n" - f"but found prioritized: {len(bring_to_front)}\nthe rest: {len(the_rest)}\n") + print( + f"Something went wrong in CI reordering, expecting total of {len(tests)}:\n" + f"but found prioritized: {len(bring_to_front)}\nthe rest: {len(the_rest)}\n" + ) return tests @@ -280,13 +324,13 @@ def get_reordered_tests(tests: List[str], is_reordering_by_pr: bool) -> List[str def export_S3_test_times(test_times_filename: Optional[str] = None) -> Dict[str, float]: test_times: Dict[str, float] = _pull_job_times_from_S3() if test_times_filename is not None: - print(f'Exporting S3 test stats to {test_times_filename}.') + print(f"Exporting S3 test stats to {test_times_filename}.") if os.path.exists(test_times_filename): - print(f'Overwriting existent file: {test_times_filename}') - with open(test_times_filename, 'w+') as file: + print(f"Overwriting existent file: {test_times_filename}") + with open(test_times_filename, "w+") as file: job_times_json = _get_job_times_json(test_times) - json.dump(job_times_json, file, indent=' ', separators=(',', ': ')) - file.write('\n') + json.dump(job_times_json, file, indent=" ", separators=(",", ": ")) + file.write("\n") return test_times diff --git a/tools/ufunc_defs.bzl b/tools/ufunc_defs.bzl new file mode 100644 index 000000000000..4490f05be015 --- /dev/null +++ b/tools/ufunc_defs.bzl @@ -0,0 +1,25 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load(":build_variables.bzl", "aten_ufunc_headers") + +aten_ufunc_names = [ + paths.split_extension(paths.basename(h))[0] + for h in aten_ufunc_headers +] + +def aten_ufunc_generated_cpu_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPU_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + +def aten_ufunc_generated_cpu_kernel_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCPUKernel_{}.cpp".format(n) + for n in aten_ufunc_names + ]] + 
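The Starlark helpers in this new ufunc_defs.bzl differ only in the file-name template they format with each ufunc name, which is derived from the header basenames loaded out of build_variables.bzl. A minimal Python sketch of that derivation, using a hypothetical header path ("add.h") in place of the real aten_ufunc_headers list:

import os

# Hypothetical entry; the real list comes from build_variables.bzl.
aten_ufunc_headers = ["aten/src/ATen/native/ufunc/add.h"]

# paths.split_extension(paths.basename(h))[0] in Starlark maps to the two os.path calls below.
aten_ufunc_names = [os.path.splitext(os.path.basename(h))[0] for h in aten_ufunc_headers]

def aten_ufunc_generated_cpu_sources(gencode_pattern="{}"):
    return [gencode_pattern.format("UfuncCPU_{}.cpp".format(n)) for n in aten_ufunc_names]

print(aten_ufunc_names)                                    # ['add']
print(aten_ufunc_generated_cpu_sources("$(location {})"))  # ['$(location UfuncCPU_add.cpp)']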
+def aten_ufunc_generated_cuda_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncCUDA_{}.cu".format(n) + for n in aten_ufunc_names + ]] diff --git a/tools/update_masked_docs.py b/tools/update_masked_docs.py new file mode 100644 index 000000000000..87ee0830e01b --- /dev/null +++ b/tools/update_masked_docs.py @@ -0,0 +1,61 @@ +"""This script updates the file torch/_masked/_docs.py that contains +the generated doc-strings for various masked operations. The update +should be triggered whenever a new masked operation is introduced to +torch._masked package. Running the script requires that torch package +is functional. +""" + +import os + + +def main() -> None: + + target = os.path.join("torch", "_masked", "_docs.py") + + try: + import torch + except ImportError as msg: + print(f"Failed to import torch required to build {target}: {msg}") + return + + if os.path.isfile(target): + with open(target) as _f: + current_content = _f.read() + else: + current_content = "" + + _new_content = [] + _new_content.append( + """\ +# -*- coding: utf-8 -*- +# This file is generated, do not modify it! +# +# To update this file, run the update masked docs script as follows: +# +# python tools/update_masked_docs.py +# +# The script must be called from an environment where the development +# version of torch package can be imported and is functional. +# +""" + ) + + for func_name in sorted(torch._masked.__all__): + func = getattr(torch._masked, func_name) + func_doc = torch._masked._generate_docstring(func) + _new_content.append(f'{func_name}_docstring = """{func_doc}"""\n') + + new_content = "\n".join(_new_content) + + if new_content == current_content: + print(f"Nothing to update in {target}") + return + + with open(target, "w") as _f: + _f.write(new_content) + + print(f"Successfully updated {target}") + + +if __name__ == "__main__": + main() diff --git a/tools/vscode_settings.py b/tools/vscode_settings.py index 88dbfb4fedf9..5c7fa8740c4f 100755 --- a/tools/vscode_settings.py +++ b/tools/vscode_settings.py @@ -5,17 +5,17 @@ def main() -> None: - folder = Path('.vscode') - recommended = json.loads((folder / 'settings_recommended.json').read_text()) - path = folder / 'settings.json' + folder = Path(".vscode") + recommended = json.loads((folder / "settings_recommended.json").read_text()) + path = folder / "settings.json" try: current = json.loads(path.read_text()) except Exception: current = {} - with open(path, 'w') as f: + with open(path, "w") as f: json.dump({**current, **recommended}, f, indent=2) - f.write('\n') + f.write("\n") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 00892ea09eae..15bad2039451 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -14,7 +14,7 @@ if(NOT BUILD_PYTHON) endif() if(USE_TBB) -include_directories(${TBB_INCLUDE_DIR}) + include_directories(${TBB_INCLUDE_DIR}) endif() set(TORCH_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}") @@ -44,6 +44,9 @@ set(TORCH_PYTHON_SRCS ) append_filelist("libtorch_python_core_sources" TORCH_PYTHON_SRCS) +list(APPEND TORCH_PYTHON_SRCS + ${TORCH_SRC_DIR}/csrc/init_flatbuffer_module.cpp) + # NB: This has to match the condition under which the JIT test directory # is included (at the time of writing that's in caffe2/CMakeLists.txt). 
if(BUILD_TEST) @@ -151,8 +154,8 @@ if(USE_CUDNN OR USE_ROCM) endif() endif() -if(USE_MLCOMPUTE) - list(APPEND TORCH_PYTHON_SRCS ${MLC_PYTHON_SRCS}) +if(USE_MPS) + list(APPEND TORCH_PYTHON_SRCS ${MPS_PYTHON_SRCS}) endif() if(USE_VALGRIND AND NOT WIN32) @@ -190,6 +193,7 @@ add_custom_target(torch_python_stubs DEPENDS "${TORCH_SRC_DIR}/_C/__init__.pyi" "${TORCH_SRC_DIR}/_C/_VariableFunctions.pyi" "${TORCH_SRC_DIR}/nn/functional.pyi" + "${TORCH_SRC_DIR}/utils/data/datapipes/datapipe.pyi" ) add_custom_command( OUTPUT @@ -199,6 +203,7 @@ add_custom_command( COMMAND "${PYTHON_EXECUTABLE}" -mtools.pyi.gen_pyi --native-functions-path "aten/src/ATen/native/native_functions.yaml" + --tags-path "aten/src/ATen/native/tags.yaml" --deprecated-functions-path "tools/autograd/deprecated.yaml" DEPENDS "${TORCH_SRC_DIR}/_C/__init__.pyi.in" @@ -206,10 +211,23 @@ add_custom_command( "${TORCH_SRC_DIR}/nn/functional.pyi.in" "${TOOLS_PATH}/pyi/gen_pyi.py" "${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml" + "${TORCH_ROOT}/aten/src/ATen/native/tags.yaml" "${TORCH_ROOT}/tools/autograd/deprecated.yaml" WORKING_DIRECTORY "${TORCH_ROOT}" ) +file(GLOB_RECURSE datapipe_files "${TORCH_SRC_DIR}/utils/data/datapipes/*.py") +add_custom_command( + OUTPUT + "${TORCH_SRC_DIR}/utils/data/datapipes/datapipe.pyi" + COMMAND + "${PYTHON_EXECUTABLE}" ${TORCH_SRC_DIR}/utils/data/datapipes/gen_pyi.py + DEPENDS + "${TORCH_SRC_DIR}/utils/data/datapipes/datapipe.pyi.in" + ${datapipe_files} + WORKING_DIRECTORY + "${TORCH_ROOT}" +) if(USE_DISTRIBUTED) if(WIN32) append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) @@ -346,7 +364,6 @@ if(USE_NUMPY) target_compile_definitions(torch_python PRIVATE USE_NUMPY) endif() -list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS BUILD_CAFFE2) if(HAVE_SOVERSION) set_target_properties(torch_python PROPERTIES VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) @@ -376,6 +393,9 @@ set_source_files_properties( # Disable certain warnings for GCC-9.X if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) set_source_files_properties(${TORCH_SRC_DIR}/csrc/Module.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties( + ${TORCH_SRC_DIR}/csrc/init_flatbuffer_module.cpp + PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") set_source_files_properties(${TORCH_SRC_DIR}/csrc/autograd/python_variable.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") endif() @@ -403,6 +423,10 @@ target_compile_options(torch_python PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS}) target_include_directories(torch_python PUBLIC ${TORCH_PYTHON_INCLUDE_DIRECTORIES}) +if(BUILD_ONEDNN_GRAPH) + target_compile_definitions(torch_python PRIVATE "-DBUILD_ONEDNN_GRAPH") + target_compile_definitions(torch_cpu PRIVATE "-DBUILD_ONEDNN_GRAPH") +endif() if(NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "") set_target_properties(torch_python PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS}) diff --git a/torch/_C/_VariableFunctions.pyi.in b/torch/_C/_VariableFunctions.pyi.in index 1b3a760c8cbd..75d566f131ab 100644 --- a/torch/_C/_VariableFunctions.pyi.in +++ b/torch/_C/_VariableFunctions.pyi.in @@ -5,13 +5,11 @@ from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, from typing_extensions import Literal from torch._six import inf -from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout +from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout, SymInt +import torch import builtins -# 
REDUNDANT! -${namedtuple_defs} - ${function_hints} ${all_directive} diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index ffa4da59707e..b986f87943b0 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -12,8 +12,8 @@ from typing import ( from typing_extensions import Literal from torch._six import inf -from torch.types import _int, _float, _bool, _dtype, _device, _qscheme, _size, _layout, Device, Number, Storage -from torch.storage import TypedStorage +from torch.types import _int, _float, _bool, _dtype, _device, _qscheme, _size, _layout, Device, Number, Storage, SymInt +from torch.storage import _TypedStorage import builtins @@ -22,6 +22,8 @@ import builtins from . import _nn as _nn from . import _onnx as _onnx from . import _VariableFunctions as _VariableFunctions +from . import _lazy as _lazy +from . import _lazy_ts_backend as _lazy_ts_backend T = TypeVar('T') @@ -52,7 +54,7 @@ class Stream: class Size(Tuple[_int, ...]): # TODO: __reduce__ - @overload + @overload # type: ignore[override] def __getitem__(self: Size, key: _int) -> _int: ... @overload @@ -107,6 +109,9 @@ def DisableTorchFunction(): ... strided : layout = ... sparse_coo : layout = ... sparse_csr : layout = ... +sparse_csc : layout = ... +sparse_bsr : layout = ... +sparse_bsc : layout = ... _mkldnn : layout = ... # Defined in torch/csrc/MemoryFormat.cpp @@ -195,7 +200,7 @@ def _is_tracing() -> _bool: ... def _jit_init() -> _bool: ... def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... def _jit_unflatten(vars: List[Tensor], desc: IODescriptor) -> Any: ... -def _jit_get_operation(op_name: str) -> Callable: ... +def _jit_get_operation(op_name: str) -> Tuple[Callable, List[str]]: ... def _get_operation_overload(op_name: str, op_overload_name: str) -> Callable: ... def _get_schema(op_name: str, overload_name: str) -> FunctionSchema: ... def _jit_pass_optimize_for_mobile(module: 'torch.jit.ScriptModule', @@ -211,9 +216,10 @@ def _jit_pass_metal_optimize_for_mobile(module: 'torch.jit.ScriptModule', def _jit_pass_inline(Graph) -> None: ... def _jit_pass_constant_propagation(Graph) -> None: ... def _jit_pass_propagate_shapes_on_graph(Graph) -> None: ... +def _jit_register_decomposition_for_schema(schema: FunctionSchema, Graph) -> None: ... def _jit_erase_non_input_shape_information(Graph) -> None: ... -def _jit_pass_common_expression_hoisting(Graph) -> None: ... def _jit_get_schemas_for_operator(name :str) -> List[FunctionSchema]: ... +def _jit_get_all_schemas() -> List[FunctionSchema]: ... def _jit_check_alias_annotation(g: Graph, args: Tuple[Any, ...], unqualified_op_name: str): ... def _jit_can_fuse_on_cpu() -> _bool: ... def _jit_can_fuse_on_gpu() -> _bool: ... @@ -222,6 +228,8 @@ def _debug_get_fusion_group_inlining() -> _bool: ... def _debug_set_fusion_group_inlining(enable: _bool): ... def _jit_texpr_fuser_enabled() -> _bool: ... def _jit_nvfuser_enabled() -> _bool: ... +def _jit_llga_enabled() -> _bool: ... +def _jit_set_llga_enabled(enable: _bool): ... def _llvm_enabled() -> _bool: ... def _jit_override_can_fuse_on_cpu(override: _bool): ... def _jit_override_can_fuse_on_gpu(override: _bool): ... @@ -233,7 +241,7 @@ def _jit_set_te_must_use_llvm_cpu(use_llvm: _bool): ... def _jit_set_nvfuser_enabled(enable: _bool) -> _bool: ... def _jit_cat_wo_conditionals(optimize_cat: _bool): ... def _jit_opt_conditionals(opt_conds: _bool): ... -def _jit_pass_canonicalize(graph: Graph): ... +def _jit_pass_canonicalize(graph: Graph, keep_unique_names: _bool = True): ... 
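The ${namedtuple_defs} block dropped from _VariableFunctions.pyi.in above is not lost: the generated named-return-type stubs move into the new torch/_C/return_types.pyi.in added later in this diff. At runtime those structured results are already reachable under torch.return_types; a small illustration (the tensor values are arbitrary):

import torch

out = torch.max(torch.tensor([[1.0, 3.0], [2.0, 0.5]]), dim=1)
print(type(out))                # <class 'torch.return_types.max'>
print(out.values, out.indices)  # tensor([3., 2.]) tensor([1, 0])
values, indices = out           # still unpacks like a plain tuple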
def _jit_pass_erase_shape_information(graph: Graph): ... def _jit_pass_fold_convbn(module: 'torch.jit.ScriptModule'): ... def _jit_pass_insert_observers(module: 'torch.jit.ScriptModule', @@ -260,7 +268,7 @@ ResolutionCallback = Callable[[str], Callable[..., Any]] # Defined in torch/csrc/jit/python/script_init.cpp # and torch/csrc/jit/python/init.cpp -def _create_function_from_graph(qualname: str, graph: Graph) -> Graph: ... +def _create_function_from_graph(qualname: str, graph: Graph) -> ScriptFunction: ... def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... def _jit_assert_is_instance(obj: Any, type: JitType): ... @@ -281,7 +289,7 @@ def _get_model_ops_and_info_from_buffer(buffer: BinaryIO): ... def _get_mobile_model_contained_types(filename: Union[str, Path]): ... def _get_mobile_model_contained_types_from_buffer(buffer: BinaryIO): ... def _logging_set_logger(logger: LoggerBase) -> LoggerBase: ... -def _get_graph_executor_optimize() -> _bool: ... +def _get_graph_executor_optimize(optimize: Optional[_bool] = None) -> _bool: ... def _set_graph_executor_optimize(optimize: _bool): ... def _export_opnames(module: ScriptModule) -> List[str]: ... def _create_function_from_trace( @@ -302,7 +310,7 @@ def _dump_upgraders_map() -> Dict[str, str]: ... def _test_only_populate_upgraders(content: Dict[str, str]) -> None: ... def _test_only_remove_upgraders(content: Dict[str, str]) -> None: ... def merge_type_from_type_comment(decl: Decl, type_annotation_decl: Decl, is_method: _bool) -> Decl: ... -def parse_ir(input: str) -> Graph: ... +def parse_ir(input: str, parse_tensor_constants: _bool) -> Graph: ... def parse_schema(schema: str) -> FunctionSchema: ... def get_device(input: Tensor) -> _int: ... def _resolve_type_from_object(obj: Any, range: SourceRange, rcb: ResolutionCallback) -> JitType: ... @@ -314,11 +322,11 @@ def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> None: ... -def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool = False) -> None: ... +def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], desc: IODescriptor, onnx_shape_inference: _bool, is_script: _bool) -> None: ... def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph, module: Module) -> None: ... def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... -def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... +def _jit_pass_peephole(graph: Graph, disable_shape_peepholes: _bool = False) -> None: ... def _jit_pass_fuse_addmm(graph: Graph) -> None: ... def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... @@ -326,7 +334,8 @@ def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... def _jit_pass_onnx_unpack_quantized_weights( graph: Graph, - paramsDict: Dict[str, IValue] + paramsDict: Dict[str, IValue], + caffe2: _bool ) -> Dict[str, IValue]: ... 
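The _jit_pass_onnx_* stubs updated in this hunk describe the internal graph passes the ONNX exporter drives; only their signatures change here. For context, the public entry point that exercises them is torch.onnx.export, roughly as follows (the model, file name, and opset choice are arbitrary):

import torch

model = torch.nn.Linear(4, 2)
dummy_input = torch.randn(1, 4)
torch.onnx.export(model, dummy_input, "linear.onnx",
                  input_names=["x"], output_names=["y"], opset_version=13)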
def _jit_pass_onnx_quantization_insert_permutes( graph: Graph, @@ -342,8 +351,15 @@ def _jit_pass_onnx_peephole(graph: Graph, opset_version: _int, fixed_batch_size: def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... def _jit_pass_onnx_function_extraction(graph: Graph, module_names : Set[str], param_names : List[str]) -> Dict[Node, Dict[str, str]]: ... +def _jit_pass_onnx_clear_scope_records() -> None: ... +def _jit_pass_onnx_track_scope_attributes(graph: Graph, onnx_attrs: Dict[str, Any]) -> None: ... +def _jit_is_onnx_log_enabled() -> _bool: ... +def _jit_set_onnx_log_enabled(enabled: _bool) -> None: ... +def _jit_set_onnx_log_output_stream(stream_name: str) -> None: ... +def _jit_onnx_log(*args: Any) -> None: ... def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... +def _jit_pass_onnx_deduplicate_initializers(graph: Graph, params_dict: Dict[str, IValue], is_train: _bool) -> Dict[str, IValue]: ... def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... @@ -365,6 +381,10 @@ def _compile_graph_to_code_table(name: str, graph: Graph) -> IValue: ... def _generate_upgraders_graph() -> Dict[str, Graph]: ... +def _calculate_package_version_based_on_upgraders(val: _bool): ... + +def _get_version_calculator_flag() -> _bool: ... + def _jit_script_interface_compile(name: str, class_def: ClassDef, rcb: ResolutionCallback, is_module: _bool): ... def _jit_script_compile_overload( qualname: str, @@ -408,7 +428,7 @@ def _import_ir_module_from_package( ) -> ScriptModule: ... def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... -def _check_onnx_proto(proto: str) -> None: ... +def _check_onnx_proto(proto: str, full_check: _bool = False) -> None: ... def _propagate_and_assign_input_shapes( graph: Graph, inputs: Tuple[Tensor, ...], @@ -426,12 +446,9 @@ class AliasDb: def __str__(self) -> str: ... ... -# Defined in torch/torch/csrc/jit/ir/ir.h -class Graph: - def eraseInput(self, i: _int) -> None: ... - def alias_db(self) -> AliasDb: ... - def inputs(self) -> List[Value]: ... - ... +class _InsertPoint: + def __enter__(self) -> None: ... + def __exit__(self, *args) -> None: ... # Defined in torch/csrc/jit/ir/ir.h class Value: @@ -442,10 +459,33 @@ class Value: # Defined in torch/csrc/jit/ir/ir.h class Block: + def inputs(self) -> List[Value]: ... + def outputs(self) -> List[Value]: ... ... # Defined in torch/csrc/jit/ir/ir.h class Node: + def schema(self) -> str: ... + def output(self) -> Value: ... + def outputs(self) -> List[Value]: ... + def outputsSize(self) -> _int: ... + def blocks(self) -> List[Block]: ... + def mustBeNone(self) -> _bool: ... + def kindOf(self, str) -> str: ... + def __getitem__(self, key: str) -> Any: ... + def namedInput(self, str) -> Value: ... + ... + +# Defined in torch/torch/csrc/jit/ir/ir.h +class Graph: + def eraseInput(self, i: _int) -> None: ... + def alias_db(self) -> AliasDb: ... + def inputs(self) -> List[Value]: ... + def setInsertPoint(self, n: Union[Block, Node]) -> None: ... + def insert_point_guard(self, n: Union[Block, Node]) -> _InsertPoint: ... 
+ def insertPoint(self) -> Node: ... + def insertGraph(self, callee: Graph, inputs: List[Value]) -> List[Value]: ... + def makeMultiOutputIntoTuple(self) -> None: ... ... @@ -460,6 +500,8 @@ class Argument: class FunctionSchema: arguments: List[Argument] returns: List[Argument] + name: str + overload_name: str ... class _UpgraderEntry: @@ -510,6 +552,8 @@ class ConcreteModuleTypeBuilder: def __init__(self, obj: Any) -> None: ... def set_module_dict(self): ... def set_module_list(self): ... + def set_parameter_list(self): ... + def set_parameter_dict(self): ... def add_attribute(self, name: str, ty: JitType, is_param: _bool, is_buffer: _bool): ... def add_module(self, name: str, meta: ConcreteModuleType): ... def add_constant(self, name: str, value: Any): ... @@ -636,6 +680,8 @@ def _get_cudnn_allow_tf32() -> _bool: ... # THPModule_allowTF32CuDNN def _set_cudnn_allow_tf32(arg: _bool) -> None: ... # THPModule_setAllowTF32CuDNN def _get_cublas_allow_tf32() -> _bool: ... # THPModule_allowTF32CuBLAS def _set_cublas_allow_tf32(arg: _bool) -> None: ... # THPModule_setAllowTF32CuBLAS +def _get_float32_matmul_precision() -> str: ... #THPModule_float32MatmulPrecision +def _set_float32_matmul_precision(arg: str) -> None: ... #THPModule_setFloat32MatmulPrecision def _get_cublas_allow_fp16_reduced_precision_reduction() -> _bool: ... #THPModule_allowFP16ReductionCuBLAS def _set_cublas_allow_fp16_reduced_precision_reduction(arg: _bool) -> None: ... #THPModule_setAllowFP16ReductionCuBLAS # NB: There is no Capsule type in typing, see @@ -660,6 +706,7 @@ def _vmapmode_decrement_nesting() -> _int: ... # THPModule_vmapmode_decrement_n def _log_api_usage_once(str) -> None: ... # LogAPIUsageOnceFromPython def _demangle(str) -> str: ... # c10::demangle def _disabled_torch_function_impl(func: Callable, types: Iterable[Type], args: Tuple, kwargs: Dict) -> Any: ... # THPModule_disable_torch_function +def _disabled_torch_dispatch_impl(func: Callable, types: Iterable[Type], args: Tuple, kwargs: Dict) -> Any: ... # THPModule_disable_dispatch_function def _get_linalg_preferred_backend() -> torch._C._LinalgBackend: ... def _set_linalg_preferred_backend(arg: torch._C._LinalgBackend): ... class _LinalgBackend: @@ -674,6 +721,8 @@ def _valgrind_toggle_and_dump_stats() -> None: ... # CALLGRIND_TOGGLE_COLLECT a has_openmp: _bool has_mkl: _bool +has_mps: _bool +_is_mps_available: _bool has_lapack: _bool has_cuda: _bool has_mkldnn: _bool @@ -709,8 +758,20 @@ def __set_forward_AD_enabled(enabled: _bool) -> None: ... def __is_forward_AD_enabled() -> _bool: ... def _register_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ... def _reset_default_hooks() -> None: ... -def _enter_python_mode(cls: Type) -> None: ... -def _exit_python_mode() -> None: ... + +# Defined in torch/overrides.py +class TorchFunctionMode(object): + ... + +def _set_torch_function_mode(cls: Optional[Union[type, TorchFunctionMode]]) -> None: ... +def _get_torch_function_mode() -> Optional[Union[type, TorchFunctionMode]]: ... + +# Defined in torch/utils/_python_dispatch.py +class TorchDispatchMode(object): + ... + +def _set_torch_dispatch_mode(cls: Optional[Union[type, TorchDispatchMode]]) -> None: ... +def _get_torch_dispatch_mode() -> Optional[Union[type, TorchDispatchMode]]: ... class _InferenceMode(object): def __init__(self, mode: _bool) -> None: ... @@ -777,6 +838,12 @@ class Generator(object): def seed(self) -> _int: ... def initial_seed(self) -> _int: ... 
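The TorchFunctionMode and TorchDispatchMode setters above replace the older _enter_python_mode/_exit_python_mode pair; they are private hooks for the __torch_function__ / __torch_dispatch__ mode machinery. A minimal sketch of the related, public __torch_function__ subclass protocol (not of the private setters themselves):

import torch

class LoggingTensor(torch.Tensor):
    # Intercept every torch API call involving this subclass, then defer to the
    # default implementation on the base class.
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        print(f"intercepted {func}")
        return super().__torch_function__(func, types, args, kwargs or {})

t = torch.randn(3).as_subclass(LoggingTensor)
out = t + 1   # prints the intercepted call, then computes normally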
+ +# Defined in torch/csrc/utils/python_dispatch.cpp +def _dispatch_library(kind: str, name: str, dispatch: str, file: str = "", linenum: Any = 0) -> Any: ... +def _dispatch_has_kernel_for_dispatch_key(name: str, dispatch: str) -> _bool: ... +def _dispatch_has_kernel(name: str) -> _bool: ... + # Defined in torch/csrc/utils/init.cpp class BenchmarkConfig(object): num_calling_threads: _int @@ -795,9 +862,6 @@ class ThroughputBenchmark(object): def run_once(self, *args: Any, **kwargs: Any) -> Any: ... def benchmark(self, config: BenchmarkConfig) -> BenchmarkExecutionStats: ... -# IDK if these are actually exposed here, hope they are -${namedtuple_defs} - # Defined in torch/csrc/generic/Storage.cpp ${legacy_storage_base_hints} @@ -869,6 +933,10 @@ def _cuda_memorySnapshot() -> List[Dict[str, Any]]: ... def _cuda_lock_mutex() -> None: ... def _cuda_unlock_mutex() -> None: ... def _cuda_canDeviceAccessPeer(device: _int, peer_device: _int) -> _bool: ... +def _cuda_jiterator_compile_and_launch_kernel(code_string: str, + kernel_name: str, + tensors: Tuple, + kwargs: Dict[str, Union[_int, _float, _bool]]) -> Tensor: ... def _nccl_version() -> _int: ... def _nccl_unique_id() -> bytes: ... def _nccl_init_rank(nranks: _int, comm_id: bytes, rank: _int) -> object: ... @@ -896,6 +964,7 @@ def _nccl_reduce_scatter(input: Sequence[Tensor], op: _int, streams: Optional[Sequence[_CudaStreamBase]], comms: Optional[Sequence[object]]) -> None: ... +def _rocm_is_backward_pass() -> _bool: ... class _CudaDeviceProperties: @@ -957,6 +1026,8 @@ class _CUDAGraph: def reset(self) -> None: ... def pool(self) -> Tuple[_int, _int]: ... +def _cuda_isCurrentStreamCapturing() -> _bool: ... + def _graph_pool_handle() -> Tuple[_int, _int]: ... # Defined in torch/csrc/DataLoader.cpp @@ -997,6 +1068,7 @@ class JitType: def isSubtypeOf(self, other: JitType) -> _bool: ... def with_dtype(self, dtype: _dtype) -> JitType: ... def with_sizes(self, sizes: List[Optional[_int]]) -> JitType: ... + def kind(self) -> str: ... class InferredType: def __init__(self, arg: Union[JitType, str]): ... @@ -1111,6 +1183,9 @@ class TensorType(JitType): def getInferred(cls) -> TensorType: ... def with_sizes(self, other: Optional[List[Optional[_int]]]) -> TensorType: ... def sizes(self) -> Optional[List[_int]]: ... + def strides(self) -> Optional[List[_int]]: ... + def device(self) -> Optional[_device]: ... + def dtype(self) -> Optional[_dtype]: ... @staticmethod def create_from_tensor(t: Tensor) -> TensorType: ... diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi index 38ac7ccaea0c..b2a190c1e96c 100644 --- a/torch/_C/_autograd.pyi +++ b/torch/_C/_autograd.pyi @@ -28,13 +28,21 @@ class DeviceType(Enum): FPGA = ... ORT = ... XLA = ... - MLC = ... + MPS = ... HPU = ... Meta = ... Vulkan = ... Metal = ... ... +class _ExperimentalConfig: + def __init__( + self, + profiler_metrics: List[str] = ..., + profiler_measure_per_kernel: bool = ..., + ) -> None: ... + ... + class ProfilerConfig: def __init__( self, @@ -43,7 +51,8 @@ class ProfilerConfig: profile_memory: bool, with_stack: bool, with_flops: bool, - with_modules: bool + with_modules: bool, + experimental_config: _ExperimentalConfig, ) -> None: ... ... @@ -82,11 +91,15 @@ class _ProfilerResult: class SavedTensor: ... +class ActiveProfilerType: + ... + def _enable_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ... def _prepare_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ... def _disable_profiler() -> _ProfilerResult: ... 
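The ProfilerConfig and _ExperimentalConfig stubs above describe the private bindings behind the autograd profiler; the supported way to drive them is the public torch.profiler API, for example (workload size and sort key chosen arbitrarily):

import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU]) as prof:
    torch.mm(torch.randn(64, 64), torch.randn(64, 64))

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))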
def _profiler_enabled() -> bool: ... def _add_metadata_json(key: str, value: str) -> None: ... +def _kineto_step() -> None: ... def kineto_available() -> bool: ... def _record_function_with_args_enter(name: str, args: List[Any]) -> torch.Tensor: ... def _record_function_with_args_exit(handle: torch.Tensor) -> None: ... @@ -98,3 +111,4 @@ def _pop_saved_tensors_default_hooks() -> None: ... def _enable_profiler_legacy(config: ProfilerConfig) -> None: ... def _disable_profiler_legacy() -> List[List[ProfilerEvent]]: ... +def _profiler_type() -> ActiveProfilerType: ... diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 741d4d5562a1..6192b1f04388 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -52,9 +52,11 @@ class Logger: ): ... ... -def _get_debug_mode(): ... +def get_debug_level(): ... +def set_debug_level(): ... +def set_debug_level_from_env(): ... -class _DistributedDebugLevel(Enum): +class DebugLevel(Enum): OFF = ... INFO = ... DETAIL = ... @@ -132,7 +134,7 @@ class TCPStore(Store): self, host_name: str, port: int, - world_size: int = ..., + world_size: Optional[int] = ..., is_master: bool = ..., timeout: timedelta = ..., wait_for_workers: bool = ..., @@ -393,5 +395,7 @@ def _broadcast_coalesced( ): ... def _test_python_store(store: Store): ... def _verify_params_across_processes( - process_group: ProcessGroup, params: List[Tensor] + process_group: ProcessGroup, + params: List[Tensor], + logger: Optional[Logger], ): ... diff --git a/torch/_C/_distributed_rpc.pyi b/torch/_C/_distributed_rpc.pyi index d89f614123e1..06d7a6fcba3f 100644 --- a/torch/_C/_distributed_rpc.pyi +++ b/torch/_C/_distributed_rpc.pyi @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, overload from datetime import timedelta import enum import torch +from torch.types import Device from . import Future from ._autograd import ProfilerConfig, ProfilerState, ProfilerEvent from ._distributed_c10d import ProcessGroup, Store @@ -32,7 +33,7 @@ class WorkerInfo: def __repr__(self) -> str: ... class RpcAgent: - def join(self, shutdown: bool = False): ... + def join(self, shutdown: bool = False, timeout: float = 0): ... def sync(self): ... def shutdown(self): ... @overload @@ -68,6 +69,7 @@ class PyRRef: class _TensorPipeRpcBackendOptionsBase(RpcBackendOptions): num_worker_threads: int device_maps: Dict[str, Dict[torch.device, torch.device]] + devices: List[torch.device] def __init__( self, num_worker_threads: int, @@ -85,12 +87,12 @@ class TensorPipeAgent(RpcAgent): store: Store, name: str, worker_id: int, - world_size: int, + world_size: Optional[int], opts: _TensorPipeRpcBackendOptionsBase, reverse_device_maps: Dict[str, Dict[torch.device, torch.device]], devices: List[torch.device], ): ... - def join(self): ... + def join(self, shutdown: bool = False, timeout: float = 0): ... def shutdown(self): ... @overload def get_worker_info(self) -> WorkerInfo: ... @@ -100,6 +102,17 @@ class TensorPipeAgent(RpcAgent): def get_worker_info(self, id: int) -> WorkerInfo: ... def get_worker_infos(self) -> List[WorkerInfo]: ... def _get_device_map(self, dst: WorkerInfo) -> Dict[torch.device, torch.device]: ... + def _update_group_membership( + self, + worker_info: WorkerInfo, + my_devices: List[torch.device], + reverse_device_map: Dict[str, Dict[torch.device, torch.device]], + is_join: bool): ... + def _get_backend_options(self) -> _TensorPipeRpcBackendOptionsBase: ... + @property + def is_static_group(self) -> bool: ... 
+ @property + def store(self) -> Store: ... def _is_current_rpc_agent_set() -> bool: ... def _get_current_rpc_agent()-> RpcAgent: ... diff --git a/torch/_C/_lazy.pyi b/torch/_C/_lazy.pyi new file mode 100644 index 000000000000..e86b80837d58 --- /dev/null +++ b/torch/_C/_lazy.pyi @@ -0,0 +1,20 @@ +from typing import List +from torch import Tensor + +#defined in torch/csrc/lazy/python/init.cpp +def _mark_step(device: str, devices: List[str], wait: bool): ... +def _wait_device_ops(devices: List[str]): ... +def _reset_metrics(): ... +def _counter_names() -> List[str]: ... +def _counter_value(name: str) -> int: ... +def _get_graph_hash(tensors: List[Tensor]) -> str: ... +def _sync_multi(tensors: List[Tensor], devices: List[str], wait: bool = True, sync_ltc_data: bool = True): ... +def _get_tensor_id(tensor: Tensor) -> int: ... +def _get_tensors_text(tensors: List[Tensor]) -> str: ... +def _get_tensors_dot(tensors: List[Tensor]) -> str: ... +def _get_tensors_backend(tensors: List[Tensor]) -> str: ... +def _get_force_fallback() -> str: ... +def _set_force_fallback(newval: str): ... +def _clear_ir_cache(): ... +def _dump_ir_cache(filename: str): ... +def _set_reuse_ir(val: bool): ... diff --git a/torch/_C/_lazy_ts_backend.pyi b/torch/_C/_lazy_ts_backend.pyi new file mode 100644 index 000000000000..91575fe939bf --- /dev/null +++ b/torch/_C/_lazy_ts_backend.pyi @@ -0,0 +1,8 @@ +#defined in torch/csrc/lazy/python/init.cpp + +from typing import List, Tuple, Any +from torch import Tensor + +def _init(): ... +def _get_tensors_ts_device_data_node(tensors: List[Tensor]) -> Tuple[List[int], List[Any]]: ... +def _run_cached_graph(hash_str: str, graph_inputs: List[Any]) -> List[Tensor]: ... diff --git a/torch/_C/_nn.pyi.in b/torch/_C/_nn.pyi.in index b2b2bcbbefdd..1198c43da450 100644 --- a/torch/_C/_nn.pyi.in +++ b/torch/_C/_nn.pyi.in @@ -13,6 +13,9 @@ def mkldnn_linear(input: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tens def mkldnn_reorder_conv2d_weight(self: Tensor, padding: List, stride: List, dilatation: List, groups: int) -> Tensor: ... def mkldnn_reorder_conv3d_weight(self: Tensor, padding: List, stride: List, dilatation: List, groups: int) -> Tensor: ... +# Defined in aten/src/ATen/native/mkldnn/Prelu.cpp +def mkldnn_prelu(input: Tensor, weight: Tensor) -> Tensor: ... 
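The new mkldnn_prelu stub above only declares the oneDNN-backed kernel defined in aten/src/ATen/native/mkldnn/Prelu.cpp, presumably serving the public torch.nn.functional.prelu op. A tiny usage sketch of that public op (shapes and the 0.25 slope are arbitrary):

import torch
import torch.nn.functional as F

x = torch.randn(2, 3)
slope = torch.tensor([0.25])   # one shared negative slope
y = F.prelu(x, slope)          # y == x where x >= 0, 0.25 * x elsewhere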
+ # Defined at tools/autograd/templates/python_nn_functions.cpp @overload def _parse_to(device: _device, dtype: _dtype, non_blocking: _bool, copy: _bool, *, diff --git a/torch/_C/build.bzl b/torch/_C/build.bzl new file mode 100644 index 000000000000..230124eb69aa --- /dev/null +++ b/torch/_C/build.bzl @@ -0,0 +1,6 @@ +def define_targets(rules): + rules.filegroup( + name = "pyi.in", + srcs = rules.glob(["*.pyi.in"]), + visibility = ["//visibility:public"], + ) diff --git a/torch/_C/return_types.pyi.in b/torch/_C/return_types.pyi.in new file mode 100644 index 000000000000..aa540ea328b5 --- /dev/null +++ b/torch/_C/return_types.pyi.in @@ -0,0 +1,10 @@ +# ${generated_comment} + +from torch import Tensor, Generator, strided, memory_format, contiguous_format, strided +from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload, Iterator, NamedTuple, Sequence, TypeVar +from typing_extensions import Literal +from torch._six import inf + +from torch.types import _int, _float, _bool, Number, _dtype, _device, _qscheme, _size, _layout + +${namedtuple_defs} diff --git a/torch/_C_flatbuffer/__init__.pyi b/torch/_C_flatbuffer/__init__.pyi new file mode 100644 index 000000000000..3a2ff059b0ed --- /dev/null +++ b/torch/_C_flatbuffer/__init__.pyi @@ -0,0 +1,10 @@ +from torch._C import LiteScriptModule, ScriptModule + +def _load_mobile_module_from_file(filename: str): ... +def _load_mobile_module_from_bytes(bytes_: bytes): ... +def _load_jit_module_from_file(filename: str): ... +def _load_jit_module_from_bytes(bytes_: bytes): ... +def _save_mobile_module(m: LiteScriptModule, filename: str): ... +def _save_jit_module(m: ScriptModule, filename: str): ... +def _save_mobile_module_to_bytes(m: LiteScriptModule) -> bytes: ... +def _save_jit_module_to_bytes(m: ScriptModule) -> bytes: ... diff --git a/torch/__init__.py b/torch/__init__.py index 519ea3e607cd..6c1e5a88ab8f 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -15,6 +15,7 @@ import textwrap import ctypes import warnings +import inspect if sys.version_info < (3,): raise Exception("Python 2 has reached end-of-life and is no longer supported by PyTorch.") @@ -29,7 +30,7 @@ from ._six import string_classes as _string_classes -from typing import Set, Type, TYPE_CHECKING, Union +from typing import Set, Type, TYPE_CHECKING, Union, Callable import builtins __all__ = [ @@ -39,12 +40,14 @@ 'no_grad', 'enable_grad', 'rand', 'randn', 'inference_mode', 'DoubleStorage', 'FloatStorage', 'LongStorage', 'IntStorage', 'ShortStorage', 'CharStorage', 'ByteStorage', 'BoolStorage', + '_TypedStorage', 'DoubleTensor', 'FloatTensor', 'LongTensor', 'IntTensor', 'ShortTensor', 'CharTensor', 'ByteTensor', 'BoolTensor', 'Tensor', 'lobpcg', 'use_deterministic_algorithms', 'are_deterministic_algorithms_enabled', 'is_deterministic_algorithms_warn_only_enabled', 'set_deterministic_debug_mode', 'get_deterministic_debug_mode', + 'set_float32_matmul_precision', 'get_float32_matmul_precision', 'set_warn_always', 'is_warn_always_enabled', ] @@ -227,10 +230,15 @@ def _load_global_deps(): ''').strip()) from None raise # If __file__ is not None the cause is unknown, so just re-raise. 
- -__all__ += [name for name in dir(_C) - if name[0] != '_' and - not name.endswith('Base')] +for name in dir(_C): + if name[0] != '_' and not name.endswith('Base'): + __all__.append(name) + obj = getattr(_C, name) + if (isinstance(obj, Callable) or inspect.isclass(obj)): # type: ignore[arg-type] + if (obj.__module__ != 'torch'): + # TODO: fix their module from C++ side + if name not in ['DisableTorchFunction', 'Generator']: + obj.__module__ = 'torch' if not TYPE_CHECKING: # issue 38137 and python issue 43367. Submodules of a C extension are @@ -562,6 +570,23 @@ def get_deterministic_debug_mode() -> builtins.int: else: return 0 +def get_float32_matmul_precision() -> builtins.str: + r"""Returns the current value of float32 matrix multiplication precision. Refer to + :func:`torch.set_float32_matmul_precision` documentation for more details. + """ + return _C._get_float32_matmul_precision() + +def set_float32_matmul_precision(precision): + r"""Sets the precision of float32 matrix multiplication (one of HIGHEST, HIGH, MEDIUM). + Original RFC: https://github.com/pytorch/pytorch/issues/76440 + Args: + precision(str): default "highest": avoid internally reducing precision with + formats such as TF32. + If "high," allow TF32. + If "medium," allow TF32. + """ + _C._set_float32_matmul_precision(precision) + def set_warn_always(b): r"""When this flag is False (default) then some PyTorch warnings may only appear once per process. This helps avoid excessive warning information. @@ -594,104 +619,105 @@ def is_warn_always_enabled(): ################################################################################ from ._tensor import Tensor -from .storage import _StorageBase, TypedStorage +from .storage import _StorageBase, _TypedStorage, _LegacyStorage # NOTE: New Storage classes should never be added. When adding a new -# dtype, use torch.storage.TypedStorage directly. +# dtype, use torch.storage._TypedStorage directly. 
-class UntypedStorage(_C.ByteStorageBase, _StorageBase): +class _UntypedStorage(_C.ByteStorageBase, _StorageBase): pass -class ByteStorage(TypedStorage): +class ByteStorage(_LegacyStorage): @classproperty def dtype(self): return torch.uint8 -class DoubleStorage(TypedStorage): +class DoubleStorage(_LegacyStorage): @classproperty def dtype(self): return torch.double -class FloatStorage(TypedStorage): +class FloatStorage(_LegacyStorage): @classproperty def dtype(self): return torch.float -class HalfStorage(TypedStorage): +class HalfStorage(_LegacyStorage): @classproperty def dtype(self): return torch.half -class LongStorage(TypedStorage): +class LongStorage(_LegacyStorage): @classproperty def dtype(self): return torch.long -class IntStorage(TypedStorage): +class IntStorage(_LegacyStorage): @classproperty def dtype(self): return torch.int -class ShortStorage(TypedStorage): +class ShortStorage(_LegacyStorage): @classproperty def dtype(self): return torch.short -class CharStorage(TypedStorage): +class CharStorage(_LegacyStorage): @classproperty def dtype(self): return torch.int8 -class BoolStorage(TypedStorage): +class BoolStorage(_LegacyStorage): @classproperty def dtype(self): return torch.bool -class BFloat16Storage(TypedStorage): +class BFloat16Storage(_LegacyStorage): @classproperty def dtype(self): return torch.bfloat16 -class ComplexDoubleStorage(TypedStorage): +class ComplexDoubleStorage(_LegacyStorage): @classproperty def dtype(self): return torch.cdouble -class ComplexFloatStorage(TypedStorage): +class ComplexFloatStorage(_LegacyStorage): @classproperty def dtype(self): return torch.cfloat -class QUInt8Storage(TypedStorage): +class QUInt8Storage(_LegacyStorage): @classproperty def dtype(self): return torch.quint8 -class QInt8Storage(TypedStorage): +class QInt8Storage(_LegacyStorage): @classproperty def dtype(self): return torch.qint8 -class QInt32Storage(TypedStorage): +class QInt32Storage(_LegacyStorage): @classproperty def dtype(self): return torch.qint32 -class QUInt4x2Storage(TypedStorage): +class QUInt4x2Storage(_LegacyStorage): @classproperty def dtype(self): return torch.quint4x2 -class QUInt2x4Storage(TypedStorage): +class QUInt2x4Storage(_LegacyStorage): @classproperty def dtype(self): return torch.quint2x4 _storage_classes = { - UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage, + _UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage, ShortStorage, CharStorage, ByteStorage, HalfStorage, BoolStorage, QUInt8Storage, QInt8Storage, QInt32Storage, BFloat16Storage, ComplexFloatStorage, ComplexDoubleStorage, QUInt4x2Storage, QUInt2x4Storage, + _TypedStorage } # The _tensor_classes set is initialized by the call to _C._initialize_tensor_type_bindings() @@ -715,7 +741,7 @@ def manager_path(): raise RuntimeError("Unable to find torch_shm_manager at " + path) return path.encode('utf-8') -from .autocast_mode import autocast +from torch.amp import autocast # Shared memory manager needs to know the exact location of manager executable _C._initExtension(manager_path()) @@ -740,8 +766,11 @@ def manager_path(): for name in dir(_C._VariableFunctions): if name.startswith('__') or name in PRIVATE_OPS: continue - globals()[name] = getattr(_C._VariableFunctions, name) - __all__.append(name) + obj = getattr(_C._VariableFunctions, name) + obj.__module__ = 'torch' + globals()[name] = obj + if not name.startswith("_"): + __all__.append(name) ################################################################################ # Import interface functions defined in Python 
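For reference, the float32 matmul precision knob introduced in this file is used as shown below; "high" is only an example setting, and "highest" remains the default:

import torch

torch.set_float32_matmul_precision("high")     # allow TF32-style reduced precision
print(torch.get_float32_matmul_precision())    # 'high'
torch.set_float32_matmul_precision("highest")  # restore the default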
@@ -809,6 +838,7 @@ def _assert(condition, message): from torch import distributions as distributions from torch import testing as testing import torch.backends.cuda +import torch.backends.mps import torch.backends.cudnn import torch.backends.mkl import torch.backends.mkldnn @@ -871,6 +901,9 @@ def compiled_with_cxx11_abi(): # information. from . import _masked +# Import removed ops with error message about removal +from ._linalg_utils import solve + def _register_device_module(device_type, module): r"""Register an external runtime module of the specific :attr:`device_type` @@ -889,3 +922,6 @@ def _register_device_module(device_type, module): # expose return_types from . import return_types +if sys.executable != 'torch_deploy': + from . import library + from . import _meta_registrations diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py new file mode 100644 index 000000000000..9d72a832538d --- /dev/null +++ b/torch/_decomp/__init__.py @@ -0,0 +1,105 @@ +import torch +import torch._ops +import torch.library +from typing import Callable, Union, Dict, Sequence, List +from torch.utils._pytree import tree_map +from collections import defaultdict + +__all__ = ["decomposition_table", "register_decomposition", "get_decompositions"] + +# TODO: relax key type here; torch registrations should be possible to; but +# right now this type is accurate +decomposition_table: Dict[torch._ops.OpOverload, Callable] = {} + + +meta_lib = torch.library.Library("aten", "IMPL", "Meta") + + +def register_decomposition(aten_op, registry=None, *, disable_meta: bool = False): + """ + A decorator to register a function as a decomposition to the Python + decomposition table. Use it like this:: + + @register_decomposition(torch.ops.aten.clamp_min) + def clamp_min(x): + return torch.clamp(self, min=min) + + If you are writing a new decomposition, consider contributing it + directly to PyTorch in torch._decomp.decompositions. + + This API is experimental; we are almost certainly going to extend + the API when we make decompositions eligible for use in transforms (e.g., + autograd) and not just backend tracing, where we then need to know if a + decomposition can be used to simulate a transform. + + By default, if the decomposition is for an operator that doesn't have + a Meta implementation, we will register it to the dispatcher. Use + `disable_meta` to disable this behavior. + """ + def decomposition_decorator(f): + nonlocal registry + if registry is None: + registry = decomposition_table + + def add_op_to_table(aten_op): + overloads = [] + if isinstance(aten_op, torch._ops.OpOverload): + overloads.append(aten_op) + else: + assert isinstance(aten_op, torch._ops.OpOverloadPacket) + for ol in aten_op.overloads(): + overloads.append(getattr(aten_op, ol)) + for op_overload in overloads: + if op_overload in registry: + raise RuntimeError(f"duplicate registrations for {op_overload}") + registry[op_overload] = f + # TODO: factor this logic into OpOverload or Library API + name = op_overload._schema.name + if op_overload._schema.overload_name: + name += "." 
+ op_overload._schema.overload_name + if ( + not disable_meta + # TorchScript dumps a bunch of extra nonsense overloads + # which don't have corresponding dispatcher entries, we need + # to filter those out + and torch._C._dispatch_has_kernel(name) + and not torch._C._dispatch_has_kernel_for_dispatch_key(name, 'Meta') + ): + meta_lib.impl(op_overload, f) + + # To handle allowing multiple aten_ops at once + tree_map(add_op_to_table, aten_op) + return f + + return decomposition_decorator + + +def get_decompositions( + aten_ops: Sequence[Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket]] +) -> Dict[torch._ops.OpOverload, Callable]: + """ + Retrieve a dictionary of decompositions corresponding to the list of + operator overloads and overload packets passed as input. Overload + packets will include all decomposed overloads in the packet. If there is + no decomposition for a requested operator, it is silently ignored. + + This API is experimental; we are almost certainly going to give an alternate, + more recommended formulation, where a user provides the set of operators + they know how to implement, and we provide decompositions for everything + not in this set. + """ + packets_to_overloads = defaultdict(list) + for opo in decomposition_table: + packets_to_overloads[opo.overloadpacket].append(opo) + decompositions = {} + for op in aten_ops: + if isinstance(op, torch._ops.OpOverloadPacket) and op in packets_to_overloads: + for op_overload in packets_to_overloads[op]: + decompositions[op_overload] = decomposition_table[op_overload] + elif isinstance(op, torch._ops.OpOverload) and op in decomposition_table: + decompositions[op] = decomposition_table[op] + return decompositions + +# populate the table +import torch._decomp.decompositions +import torch._refs diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py new file mode 100644 index 000000000000..5541506b72a5 --- /dev/null +++ b/torch/_decomp/decompositions.py @@ -0,0 +1,1289 @@ +import torch +from torch import Tensor +from torch._decomp import register_decomposition +from enum import Enum +from typing import Tuple, Optional, List, Callable +import torch.nn.functional as F +import functools +from torch.utils._pytree import tree_map, tree_flatten +import torch._prims.utils as utils + +# None of these functions are publicly accessible; get at them +# from torch._decomps +__all__: List[str] = [] + +aten = torch.ops.aten + + +class Reduction(Enum): + NONE = 0 + MEAN = 1 + SUM = 2 + + +# This wraps a decomposition and performs various type promotion logic within it, depending on the strategy provided +# We're currently re-using ELEMENTWISE_TYPE_PROMOTION_KIND, although some of the usages are on non-elementwise ops +# Will need to validate the non-elementwise uses +def type_casts(f: Callable, type_promotion: utils.ELEMENTWISE_TYPE_PROMOTION_KIND): + @functools.wraps(f) + def inner(*args, **kwargs): + flat_args = [x for x in tree_flatten((args, kwargs))[0] if isinstance(x, Tensor)] + computation_dtype, result_dtype = utils.elementwise_dtypes(*flat_args, + type_promotion_kind=type_promotion) + + # TODO: pretty sure this is not quite right + def increase_prec(x): + if isinstance(x, Tensor): + return x.to(computation_dtype) + else: + return x + + def decrease_prec(x): + if isinstance(x, Tensor): + return x.to(result_dtype) + else: + return x + + r = f(*tree_map(increase_prec, args), **tree_map(increase_prec, kwargs)) + return tree_map(decrease_prec, r) + + return inner + +pw_cast_for_opmath = 
functools.partial(type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +reduction_complex_to_real = functools.partial(type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT) +pw_cast_for_int_to_real = functools.partial(type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) + +# This expands x until x.dim() == dim. Might be useful as an operator +def _unsqueeze_to_dim(x: Tensor, dim: int): + for _ in range(dim - x.dim()): + x = x.unsqueeze(-1) + return x + + +@register_decomposition(aten.tanh_backward) +@pw_cast_for_opmath +def tanh_backward(out_grad: Tensor, y: Tensor): + return out_grad * (1 - y * y).conj_physical() + + +@register_decomposition(aten.sigmoid_backward) +@pw_cast_for_opmath +def sigmoid_backward(out_grad: Tensor, y: Tensor): + return out_grad * (y * (1 - y)).conj_physical() + + +@register_decomposition(aten.softplus_backward) +@pw_cast_for_opmath +def softplus_backward(out_grad: Tensor, x: Tensor, beta: float, threshold: float): + z = (x * beta).exp() + return torch.where((x * beta) > threshold, out_grad, out_grad * z / (z + 1.0)) + + +@register_decomposition(aten.elu) +@pw_cast_for_opmath +def elu( + self: Tensor, alpha: float = 1, scale: float = 1, input_scale: float = 1 +) -> Tensor: + negcoef = alpha * scale + poscoef = scale + negiptcoef = input_scale + return torch.where( + self > 0, self * poscoef, (torch.exp(self * negiptcoef) - 1) * negcoef + ) + + +@register_decomposition(aten.elu_backward) +@pw_cast_for_opmath +def elu_backward( + grad_output: Tensor, + alpha: float, + scale: float, + input_scale: float, + is_result: bool, + self_or_result: Tensor, +): + negcoef = alpha * scale + poscoef = scale + negiptcoef = input_scale + if is_result: + return torch.where( + self_or_result <= 0, + grad_output * negiptcoef * (self_or_result + negcoef), + self_or_result * poscoef, + ) + else: + return torch.where( + self_or_result <= 0, + grad_output * negiptcoef * negcoef * torch.exp(self_or_result * negiptcoef), + grad_output * poscoef, + ) + + +@register_decomposition(aten.hardsigmoid) +@pw_cast_for_opmath +def hardsigmoid(self: Tensor) -> Tensor: + return torch.clamp(torch.clamp(self + 3, min=0), max=6) / 6 + + +@register_decomposition(aten.hardsigmoid_backward) +@pw_cast_for_opmath +def hardsigmoid_backward(grad_output: Tensor, self: Tensor): + return torch.where( + (self > -3.0) & (self < 3.0), + grad_output * (1.0 / 6.0), + grad_output.new_zeros(()), + ) + + +@register_decomposition(aten.hardtanh) +@pw_cast_for_opmath +def hardtanh(self: Tensor, min_val: float = -1, max_val: float = 1) -> Tensor: + return torch.clamp(self, min_val, max_val) + + +@register_decomposition(aten.hardtanh_backward) +@pw_cast_for_opmath +def hardtanh_backward( + grad_output: Tensor, self: Tensor, min_val: float, max_val: float +): + return torch.where( + (self <= min_val) | (self >= max_val), grad_output.new_zeros(()), grad_output + ) + + +@register_decomposition(aten.hardshrink_backward) +@pw_cast_for_opmath +def hardshrink_backward(grad_out: Tensor, self: Tensor, lambd: float): + return torch.where( + (self >= -lambd) & (self <= lambd), grad_out.new_zeros(()), grad_out + ) + + +@register_decomposition(aten.hardswish) +@pw_cast_for_opmath +def hardswish(self: Tensor) -> Tensor: + return self * torch.clamp(torch.clamp(self + 3, min=0), max=6) / 6 + + +@register_decomposition(aten.hardswish_backward) +@pw_cast_for_opmath +def hardswish_backward(grad_output: Tensor, self: Tensor) -> Tensor: + return torch.where( + self < -3, + 
grad_output.new_zeros(()), + torch.where(self <= 3, grad_output * ((self / 3) + 0.5), grad_output), + ) + + +@register_decomposition(aten.threshold_backward) +@pw_cast_for_opmath +def threshold_backward(grad_output: Tensor, self: Tensor, threshold: float): + return torch.where(self <= threshold, grad_output.new_zeros(()), grad_output) + + +@register_decomposition(aten.leaky_relu) +@pw_cast_for_opmath +def leaky_relu(self: Tensor, negative_slope: float = 0.01) -> Tensor: + return torch.where(self > 0, self, self * negative_slope) + + +@register_decomposition(aten.leaky_relu_backward) +@pw_cast_for_opmath +def leaky_relu_backward( + grad_output: Tensor, self: Tensor, negative_slope: float, self_is_result: bool +): + return torch.where(self > 0, grad_output, grad_output * negative_slope) + + + +@register_decomposition(aten.gelu) +@pw_cast_for_opmath +def gelu(self: Tensor, approximate: str = 'none') -> Tensor: + M_SQRT2 = 1.41421356237309504880 + M_SQRT1_2 = 0.70710678118654752440 + M_2_SQRTPI = 1.12837916709551257390 + if approximate == 'tanh': + kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 + kKappa = 0.044715 + x_cube = self * self * self + inner = kBeta * (self + kKappa * x_cube) + return 0.5 * self * (1 + torch.tanh(inner)) + else: + kAlpha = M_SQRT1_2 + return self * 0.5 * (1 + torch.erf(self * kAlpha)) + + +@register_decomposition(aten.gelu_backward) +@pw_cast_for_opmath +def gelu_backward(grad: Tensor, self: Tensor, approximate: str = "none"): + M_SQRT2 = 1.41421356237309504880 + M_SQRT1_2 = 0.70710678118654752440 + M_2_SQRTPI = 1.12837916709551257390 + if approximate == 'tanh': + kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 + kKappa = 0.044715 + x_sq = self * self + x_cube = x_sq * self + inner = kBeta * (self + kKappa * x_cube) + tanh_inner = torch.tanh(inner) + + left = 0.5 * self + right = 1 + tanh_inner + + left_derivative = 0.5 * right + + tanh_derivative = 1 - tanh_inner * tanh_inner + inner_derivative = kBeta * (1 + 3 * kKappa * x_sq) + right_derivative = left * tanh_derivative * inner_derivative + + return grad * (left_derivative + right_derivative) + else: + kAlpha = M_SQRT1_2 + kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5 + cdf = 0.5 * (1 + torch.erf(self * kAlpha)) + pdf = kBeta * torch.exp(self * self * -0.5) + return grad * (cdf + self * pdf) + + +@register_decomposition(aten.mish_backward) +@pw_cast_for_opmath +def mish_backward(grad_output: Tensor, input: Tensor): + input_tanh_softplus = torch.tanh(F.softplus(input)) + input_sigmoid = torch.sigmoid(input) + out = input * input_sigmoid * (1 - input_tanh_softplus * input_tanh_softplus) + return grad_output * (input_tanh_softplus + out) + + +@register_decomposition(aten.silu) +@pw_cast_for_opmath +def silu(self: Tensor) -> Tensor: + return self * torch.sigmoid(self) + + +@register_decomposition(aten.silu_backward) +@pw_cast_for_opmath +def silu_backward(grad_output: Tensor, self: Tensor) -> Tensor: + sigmoid = 1 / (1 + torch.exp(-self)) + return grad_output * sigmoid * (1 + self * (1 - sigmoid)) + + +@register_decomposition(aten.softshrink_backward) +def softshrink_backward(grad_output: Tensor, self: Tensor, lambd: float) -> Tensor: + return torch.where( + (self >= -lambd) & (self <= lambd), grad_output.new_zeros(()), grad_output + ) + + +@register_decomposition(aten.prelu_backward) +@pw_cast_for_opmath +def prelu_backward( + grad_output: Tensor, self: Tensor, weight: Tensor +) -> Tuple[Tensor, Tensor]: + # Logic is more complicated than I would like. 
Basically, weight can either + # be a scalar or a vector of size [C], and in the forward pass it's + # broadcast against [N, C, ...]. So now, we need to do the corresponding + # reduction, which is harder than we'd like... + cur_weight = weight + for _ in range(2, grad_output.dim()): + cur_weight = cur_weight.unsqueeze(-1) + input_grad = torch.where(self > 0, grad_output, cur_weight * grad_output) + weight_grad_collector = torch.where( + self > 0, grad_output.new_zeros(()), self * grad_output + ) + out = weight_grad_collector.sum_to_size(cur_weight.shape) + while out.dim() > weight.dim(): + out = out.squeeze(-1) + return (input_grad, out) + + +@register_decomposition(aten.rrelu_with_noise_backward) +@pw_cast_for_opmath +def rrelu_with_noise_backward( + grad_output: Tensor, + self: Tensor, + noise: Tensor, + lower: float, + upper: float, + training: bool, + self_is_result: bool, +) -> Tensor: + if training and upper - lower > 1e-6: + return grad_output.mul(noise) + else: + negative_slope = (lower + upper) / 2 + return aten.leaky_relu_backward(grad_output, self, negative_slope, self_is_result) + + +@register_decomposition(aten.log_sigmoid_backward) +@pw_cast_for_opmath +def log_sigmoid_backward(grad_output: Tensor, self: Tensor, buffer: Tensor) -> Tensor: + in_negative = self < 0 + max_deriv = torch.where(in_negative, 1, 0) + sign = torch.where(in_negative, 1, -1) + z = torch.exp(-torch.abs(self)) + return grad_output * (max_deriv - sign * (z / (1 + z))) + # CPU has a special formula that uses buffer, but disabled for convenience sake + # return (max_deriv - sign * (buffer / (1 + buffer))) * grad_output + + +def apply_loss_reduction(loss: Tensor, reduction: int): + if reduction == Reduction.MEAN.value: + return torch.mean(loss) + elif reduction == Reduction.SUM.value: + return torch.sum(loss) + else: + return loss + + +def to_real_dtype(dtype: torch.dtype): + if dtype == torch.complex32: + return torch.float16 + elif dtype == torch.complex64: + return torch.float32 + elif dtype == torch.complex128: + return torch.float64 + +# TODO: None of these loss castings are quite correct, see +# https://github.com/pytorch/pytorch/issues/76870. Also, the ATen kernels +# perform the pointwise portion in opmath, but don't maintain it between the +# pointwise portion and the reduction + +@register_decomposition(aten.l1_loss) +def l1_loss( + self: Tensor, target: Tensor, reduction: int = Reduction.MEAN.value +) -> Tensor: + loss = (self - target).abs() + # PyTorch semantics result in the output of l1_loss having the corresponding + # real dtype to self. 
This may not happen without explicit casting if say + # self: complex64 and target: float64, which results in loss: float64 + float_type = to_real_dtype(self.dtype) + return apply_loss_reduction(loss, reduction).to(float_type) + + +@register_decomposition(aten.l1_loss_backward) +@pw_cast_for_opmath +def l1_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + reduction: int = Reduction.MEAN.value, +): + sign = torch.sign(self - target) + + norm = sign / self.numel() if reduction == Reduction.MEAN.value else sign + return grad_output * norm + + +@register_decomposition(aten.mse_loss) +@pw_cast_for_opmath +def mse_loss( + self: Tensor, target: Tensor, reduction: int = Reduction.MEAN.value +) -> Tensor: + loss = (self - target) ** 2 + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.mse_loss_backward) +@pw_cast_for_opmath +def mse_loss_backward( + grad_output: Tensor, input: Tensor, target: Tensor, reduction: int +): + norm = 2.0 / input.numel() if reduction == Reduction.MEAN.value else 2.0 + return norm * (input - target) * grad_output + + +@register_decomposition(aten.huber_loss) +@pw_cast_for_opmath +def huber_loss( + self: Tensor, + target: Tensor, + reduction: int = Reduction.MEAN.value, + delta: float = 1.0, +) -> Tensor: + assert delta > 0, "huber_loss does not support non-positive values for delta." + z = (self - target).abs() + loss = torch.where(z < delta, 0.5 * z * z, delta * (z - 0.5 * delta)) + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.huber_loss_backward) +@pw_cast_for_opmath +def huber_loss_backward( + grad_output: Tensor, self: Tensor, target: Tensor, reduction: int, delta: float +): + norm = 1.0 / self.numel() if reduction == Reduction.MEAN.value else 1.0 + x = self - target + return torch.where( + x < -delta, + -norm * grad_output * delta, + torch.where(x > delta, norm * grad_output * delta, norm * x * grad_output), + ) + + +def _nll_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + channel_dim = 0 if self.dim() < 2 else 1 + if reduction == Reduction.MEAN.value: + grad_output = grad_output / total_weight + + target = target.unsqueeze(channel_dim) + grad_input = torch.zeros_like(self) + grad_input = torch.scatter(grad_input, channel_dim, target, -1.0) + + if grad_input.dim() > grad_output.dim() > 0: + grad_output = grad_output.unsqueeze(channel_dim) + + if weight is not None: + new_shape = [1 for _ in range(self.dim())] + new_shape[channel_dim] = weight.shape[0] + weight = weight.reshape(new_shape) + grad_output = grad_output * weight + + has_ignore_index = ignore_index >= 0 + if has_ignore_index: + ignore_index_mask = target != ignore_index + grad_output = grad_output * ignore_index_mask + + return grad_input * grad_output + +@register_decomposition(aten.nll_loss_backward) +def nll_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + assert 0 <= self.dim() <= 2, "input tensor should be 1D or 2D" + assert ( + target.dim() <= 1 + ), "0D or 1D target tensor expected, multi-target not supported" + + no_batch_dim = self.dim() == 1 and target.dim() == 0 + assert no_batch_dim or ( + self.shape[0] == target.shape[0] + ), f"size mismatch (got input: {self.shape}, target: {target.shape})" + assert total_weight.numel() == 1, ( + "expected 
total_weight to be a single element tensor, got: ", + f"{total_weight.shape} ({total_weight.numel()} elements)", + ) + + assert ( + weight is None or weight.numel() == self.shape[-1] + ), "weight tensor should be defined either for all or no classes" + + if reduction == Reduction.NONE.value and self.dim() == 2: + assert grad_output.dim() == 1 and grad_output.shape[0] == self.shape[0], ( + f"Expected a tensor of dimension 1 and tensor.size[0] == {self.shape[0]} but " + f"got: dimension {grad_output.dim()} and tensor.size[0] == {grad_output.shape[0]}" + ) + else: + assert ( + grad_output.dim() <= 1 and grad_output.numel() == 1 + ), f"Expected a single element grad_output tensor, but got: {grad_output.shape}" + + return _nll_loss_backward(grad_output, self, target, weight, reduction, ignore_index, total_weight) + + +@register_decomposition(aten.nll_loss2d_backward) +def nll_loss2d_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + assert ( + self.dim() == 4 + ), f"only batches of spatial inputs supported (4D tensors), but got input of dimension: {self.dim()}" + + assert ( + target.dim() == 3 + ), f"only batches of spatial targets supported (3D tensors) but got targets of dimension: {target.dim()}" + + assert( + self.shape[0] == target.shape[0] and self.shape[2] == target.shape[1] and self.shape[3] == target.shape[2] + ), f"size mismatch (got input: {self.shape}, target: {target.shape}" + + assert ( + total_weight.numel() == 1 + ), ( + "expected total_weight to be a single element tensor, " + f"got: {total_weight.shape} ( {total_weight.numel()}, elements)" + ) + + return _nll_loss_backward(grad_output, self, target, weight, reduction, ignore_index, total_weight) + + +@register_decomposition(aten.binary_cross_entropy) +@pw_cast_for_opmath +def binary_cross_entropy( + self: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + # We cannot currently model this without introducing data-dependent control flow + # TORCH_CHECK( + # (input_val >= 0) && (input_val <= 1), + # "all elements of input should be between 0 and 1" + # ) + loss = (target - 1) * torch.maximum( + torch.log(1 - self), self.new_full((), -100) + ) - target * torch.maximum(torch.log(self), self.new_full((), -100)) + if weight is not None: + loss = loss * weight + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.binary_cross_entropy_backward) +@pw_cast_for_opmath +def binary_cross_entropy_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + EPSILON = 1e-12 + result = grad_output * (self - target) / torch.clamp(self * (1 - self), min=EPSILON) + if weight is not None: + result = result * weight + if reduction == Reduction.MEAN.value: + result = result / self.numel() + return result + + +@register_decomposition(aten._euclidean_dist) +def _euclidean_dist(x1: Tensor, x2: Tensor) -> Tensor: + x1_norm = x1.pow(2).sum(-1, True) + x1_pad = torch.ones_like(x1_norm, memory_format=torch.contiguous_format) + x2_norm = x2.pow(2).sum(-1, True) + x2_pad = torch.ones_like(x2_norm, memory_format=torch.contiguous_format) + x1_ = torch.cat([x1.mul(-2), x1_norm, x1_pad], -1) + x2_ = torch.cat([x2, x2_pad, x2_norm], -1) + result = x1_.matmul(x2_.mT) + return result.clamp_min(0).sqrt() + + +@register_decomposition(aten.slice_backward) +def 
slice_backward( + grad_output: Tensor, + input_sizes: List[int], + dim: int, + start: int, + end: int, + step: int, +): + grad_input = grad_output.new_zeros(input_sizes) + return torch.slice_scatter(grad_input, grad_output, dim, start, end, step) + + +@register_decomposition(aten.select_backward) +def select_backward(grad_output: Tensor, input_sizes: List[int], dim: int, index: int): + grad_input = grad_output.new_zeros(input_sizes) + return torch.select_scatter(grad_input, grad_output, dim, index) + + +@register_decomposition(aten.diagonal_backward) +def diagonal_backward( + grad_output: Tensor, input_sizes: List[int], offset: int, dim1: int, dim2: int +): + grad_input = grad_output.new_zeros(input_sizes) + return torch.diagonal_scatter(grad_input, grad_output, offset, dim1, dim2) + + +@register_decomposition(aten._softmax_backward_data) +@pw_cast_for_opmath +def _softmax_backward_data( + grad_output: Tensor, output: Tensor, dim: int, input_dtype: int +): + new_grad = grad_output * output + return new_grad - output * torch.sum(new_grad, dim=dim, keepdim=True) + + +@register_decomposition(aten._log_softmax_backward_data) +@pw_cast_for_opmath +def _log_softmax_backward_data( + grad_output: Tensor, output: Tensor, dim: int, input_dtype: int +): + grad_input = grad_output - torch.exp(output) * torch.sum( + grad_output, dim=dim, keepdim=True + ) + return grad_input + + +# TODO: the type annotations on arguments are not quite right + + +@register_decomposition(aten.im2col_backward) +def im2col_backward( + grad_output: Tensor, + input_size: List[int], + kernel_size: List[int], + dilation: List[int], + padding: List[int], + stride: List[int], +) -> Tensor: + return F.fold(grad_output, input_size, kernel_size, dilation, padding, stride) # type: ignore[arg-type] + + +@register_decomposition(aten.col2im_backward) +def col2im_backward( + grad_output: Tensor, + kernel_size: List[int], + dilation: List[int], + padding: List[int], + stride: List[int], +) -> Tensor: + return F.unfold(grad_output, kernel_size, dilation, padding, stride) # type: ignore[arg-type] + + +@register_decomposition(aten.masked_fill.Scalar) +def masked_fill_Scalar(self: Tensor, mask: Tensor, value: float) -> Tensor: + return torch.where(mask, utils.dtype_to_type(self.dtype)(value), self) + + +@register_decomposition(aten.masked_fill.Tensor) +def masked_fill_Tensor(self: Tensor, mask: Tensor, value: Tensor) -> Tensor: + return torch.where(mask, value, self) + + +@register_decomposition(aten.native_dropout_backward) +@pw_cast_for_opmath +def native_dropout_backward(grad_output: Tensor, mask: Tensor, scale: float): + return grad_output * (mask.type_as(grad_output) * scale) + + +@register_decomposition(aten.logit) +@pw_cast_for_int_to_real +def logit(self: Tensor, eps: Optional[float] = None) -> Tensor: + if eps is None: + eps = -1.0 + lo = eps + hi = 1 - eps + self = torch.clamp(self, lo, hi) + return (self / (1 - self)).log() + + +@register_decomposition(aten.logit_backward) +@pw_cast_for_opmath +def logit_backward( + grad_output: Tensor, self: Tensor, eps: Optional[float] = None +) -> Tensor: + if eps is not None: + lo = eps + hi = 1.0 - lo + return torch.where( + torch.logical_and(self >= lo, self <= hi), + grad_output / (self * (1.0 - self)), + self.new_zeros(()), + ) + else: + return torch.where( + torch.logical_and(self >= 0.0, self <= 1.0), + grad_output / (self * (1.0 - self)), + self.new_full((), float("nan")), + ) + + +@register_decomposition(aten.native_dropout) +@pw_cast_for_opmath +def native_dropout(input: Tensor, p: 
float, train: Optional[bool]): + if train: + bool_mask = torch.rand_like(input) < p + res = bool_mask * input * float(1.0 / p) + return (res, bool_mask) + else: + return (input, torch.ones_like(input, dtype=torch.bool)) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten._softmax) +@pw_cast_for_opmath +def _softmax(x: Tensor, dim: int, half_to_float: bool): + x_max = torch.max(x, dim, keepdim=True)[0] + unnormalized = torch.exp(x - x_max) + return unnormalized / torch.sum(unnormalized, dim, keepdim=True) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten._log_softmax) +@pw_cast_for_opmath +def _log_softmax(x: Tensor, dim: int, half_to_float: bool): + x_max = torch.max(x, dim, keepdim=True)[0] + shifted = x - x_max + shifted_logsumexp = torch.log(torch.sum(torch.exp(shifted), dim, keepdim=True)) + return shifted - shifted_logsumexp + + +@register_decomposition(aten.addcdiv) +@pw_cast_for_opmath +def addcdiv(self: Tensor, tensor1: Tensor, tensor2: Tensor, value: float = 1): + return self + value * (tensor1 / tensor2) + + +# Remove special case when https://github.com/pytorch/pytorch/pull/72949 is landed. +@register_decomposition(aten.addcmul) +@pw_cast_for_opmath +def addcmul(self: Tensor, tensor1: Tensor, tensor2: Tensor, value: float = 1): + if self.is_floating_point() or self.is_complex(): + return self + value * tensor1 * tensor2 + else: + return self + int(value) * tensor1 * tensor2 + + +@register_decomposition(aten.rsub.Tensor) +def rsub_Tensor(self: Tensor, other: Tensor, alpha: float = 1) -> Tensor: + return torch.sub(other, self, alpha=alpha) + + +@register_decomposition(aten.rsub.Scalar) +def rsub_Scalar(self: Tensor, other: float, alpha: float = 1) -> Tensor: + return torch.sub(other, self, alpha=alpha) + + +@register_decomposition(aten.embedding) +def embedding( + weight: Tensor, + indices: Tensor, + padding_idx: int = -1, + scale_grad_by_freq: bool = False, + sparse: bool = False, +) -> Tensor: + assert weight.dim() == 2, "'weight' must be 2-D" + # TODO: Assert not ported over yet + # auto indices_arg = TensorArg(indices, "indices", 1); + # checkScalarTypes("embedding", indices_arg, {kLong, kInt}); + + if indices.dim() == 1: + return weight.index_select(0, indices) + + size = list(indices.shape) + for d in weight.shape[1:]: + size.append(d) + + return weight.index_select(0, indices.reshape(-1)).view(size) + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.embedding_dense_backward) +def embedding_dense_backward( + grad_output: Tensor, + indices: Tensor, + num_weights: int, + padding_idx: int, + scale_grad_by_freq: bool, +): + numel = indices.numel() + grad = grad_output.view(numel, grad_output.size(-1)) + grad_weight = grad_output.new_zeros((num_weights, grad_output.shape[-1])) + indices_rank1 = indices.view(numel) + if scale_grad_by_freq: + counts = indices.new_zeros((num_weights,)) + ones = indices.new_ones((numel,)) + counts = counts.index_put([indices_rank1], ones, accumulate=True) + grad_weights_scale = counts[indices_rank1] + grad = grad / grad_weights_scale.unsqueeze(1) + skip_padding = (indices_rank1 != padding_idx).unsqueeze(1) + skip_padding = skip_padding.expand_as(grad) + zero_grad = torch.full_like(grad, 0) + return grad_weight.index_put( + [indices_rank1], torch.where(skip_padding, grad, zero_grad), accumulate=True + ) + + +def prod(x: List[int]): + r = 1 + for i in x: + r *= i + return r + + +@register_decomposition(aten.split_with_sizes) +def split_with_sizes( + self: Tensor, 
split_sizes: List[int], dim: int = 0 +) -> List[Tensor]: + num_splits = len(split_sizes) + splits = [] + start_idx = 0 + for i in range(num_splits): + length = split_sizes[i] + splits.append(self.narrow(dim, start_idx, length)) + start_idx += length + return splits + + +@register_decomposition(aten.split.Tensor) +def split(self: Tensor, split_size: int, dim: int = 0) -> List[Tensor]: + input_sizes = self.shape + dim_size = input_sizes[dim] + if split_size == 0: + assert dim_size == 0 + return [self] + chunks = (dim_size + split_size - 1) // split_size + split_sizes = [split_size for i in range(chunks)] + split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size) + return torch.split(self, split_sizes, dim) + + +# TODO: this doesn't appear to have enough precision in bfloat16 +@register_decomposition(aten.addmm) +@pw_cast_for_opmath +def addmm(self: Tensor, mat1: Tensor, mat2: Tensor, beta: int = 1, alpha: int = 1): + if not self.is_floating_point() and not self.is_complex(): + beta = int(beta) + alpha = int(alpha) + out = alpha * torch.mm(mat1, mat2) + if beta == 0: + return out + return beta * self + out + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.native_layer_norm) +@pw_cast_for_opmath +def native_layer_norm( + input: Tensor, + normalized_shape: List[int], + weight: Optional[Tensor], + bias: Optional[Tensor], + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + input_shape = input.shape + input_ndim = input.dim() + + axis = input_ndim - len(normalized_shape) + M = prod(input_shape[:axis]) # type: ignore[arg-type] + + # Hmm... not sure how I get around this... + # Basically, native_batch_norm doesn't support 0-entry tensors, while + # native_layer_norm does (and is tested by OpInfos!) + if M > 0: + input_reshaped = input.view(1, M, -1) + else: + return (input, input.new_zeros((0,)), input.new_zeros((0,))) + + # Unlike Batch Normalization, which applies scalar scale and bias for each + # entire channel/plane with the affine option, Layer Normalization applies + # per-element scale and bias. E.g. For input {N, C, H, W}, weight for + # batchnorm has shape {C} while weight for layernorm has shape {H, W} or {W}. 
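As a rough standalone sanity check of the reshape trick described above (a minimal sketch with arbitrary shapes, not part of the decomposition itself), layer-normalizing an (N, C, H, W) input over its last two dims matches computing per-row statistics on a (1, N*C, H*W) view:

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 4, 5)  # (N, C, H, W)
eps = 1e-5

# Reference: layer norm over (H, W), no affine parameters.
ref = F.layer_norm(x, normalized_shape=(4, 5), eps=eps)

# Reshape trick: treat every (n, c) slice as one "channel" and normalize it.
xr = x.reshape(1, 2 * 3, -1)
mean = xr.mean(dim=-1, keepdim=True)
rstd = (xr.var(dim=-1, unbiased=False, keepdim=True) + eps).rsqrt()
out = ((xr - mean) * rstd).reshape(x.shape)

print(torch.allclose(ref, out, atol=1e-6))  # True, up to floating-point noise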
+ out, mean, rstd = aten.native_batch_norm( + input_reshaped, + weight=None, + bias=None, + running_mean=None, + running_var=None, + training=True, + momentum=0.0, + eps=eps, + ) + out = out.view(input_shape) + if weight is not None: + out = out * weight + if bias is not None: + out = out + bias + + stat_shape = list(input_shape[:axis]) + for _ in range(axis, input.dim()): + stat_shape.append(1) + mean = mean.view(stat_shape) + rstd = rstd.view(stat_shape) + return (out, mean, rstd) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.native_layer_norm_backward) +@pw_cast_for_opmath +def native_layer_norm_backward( + grad_out: Tensor, + input: Tensor, + normalized_shape: List[int], + mean: Tensor, + rstd: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + output_mask: List[bool], +) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + input_shape = input.shape + input_ndim = input.dim() + + axis = input_ndim - len(normalized_shape) + inner_dims = input_shape[axis:] + outer_dims = input_shape[:axis] + inner_dim_indices: List[int] = [] + outer_dim_indices: List[int] = [] + for i in range(input_ndim): + if i >= axis: + inner_dim_indices.append(i) + else: + outer_dim_indices.append(i) + + N = prod(inner_dims) # type: ignore[arg-type] + M = prod(outer_dims) # type: ignore[arg-type] + if M <= 0 or N <= 0: + return ( + input.new_zeros(input_shape), + input.new_zeros(input_shape[axis:]), + input.new_zeros(input_shape[axis:]), + ) + + x_hat = (input - mean) * rstd + if weight is not None: + grad_x_hat = grad_out * weight + else: + grad_x_hat = grad_out + a = grad_x_hat * N + b = torch.sum(grad_x_hat, inner_dim_indices, True) + c1 = torch.mul(grad_x_hat, x_hat) + c2 = torch.sum(c1, inner_dim_indices, True) + c3 = torch.mul(x_hat, c2) + + inner = a - b - c3 + + if output_mask[0]: + d_input: Optional[Tensor] = (rstd / N) * inner + else: + d_input = None + + if output_mask[1] and weight is not None: + if len(outer_dim_indices) > 0: + d_weight: Optional[Tensor] = torch.sum( + grad_out * x_hat, outer_dim_indices, False + ) + else: + d_weight = grad_out * x_hat + else: + d_weight = None + + if output_mask[2] and bias is not None: + if len(outer_dim_indices) > 0: + d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False) + else: + d_bias = grad_out + else: + d_bias = None + return (d_input, d_weight, d_bias) + + +# TODO: Correct the type promotion semantics +@register_decomposition(aten.native_batch_norm) +@pw_cast_for_opmath +def native_batch_norm( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + reduction_dims = [0] + list(range(2, input.dim())) + if training: + # save_mean = torch.sum(input / (input.shape[0] * input.shape[2]), dim=reduction_dims) + biased_var, save_mean = torch.var_mean( + input, dim=reduction_dims, unbiased=False + ) + save_invstd = 1 / (torch.sqrt(biased_var + eps)) + + if running_mean is not None: + running_mean.copy_(momentum * save_mean + (1 - momentum) * running_mean) + if running_var is not None: + n = input.numel() / input.shape[1] + # This doesn't strictly match eager's numerics, which accumulates var sum and then directly applies the correction + # But... that would require re-implementing var here, for negligible numerics gain on a tensor whose + # numerics probably don't matter. 
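The rescaling applied just below is Bessel's correction from the biased to the unbiased variance estimate; a tiny standalone check (sample values chosen arbitrarily):

import torch

x = torch.tensor([1.0, 2.0, 4.0, 8.0])
n = x.numel()

biased = x.var(unbiased=False)   # divides by n
unbiased = x.var(unbiased=True)  # divides by n - 1

print(torch.allclose(unbiased, biased * n / (n - 1)))  # True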
+ unbiased_var = biased_var * (n / (n - 1)) + running_var.copy_(momentum * unbiased_var + (1 - momentum) * running_var) + mean = save_mean + invstd = save_invstd + else: + assert running_mean is not None and running_var is not None + mean = running_mean + invstd = 1 / (torch.sqrt(running_var + eps)) + # Very annoying inconsistency where CPU and CUDA give different shapes + if input.device.type == "cuda": + save_mean = running_mean + save_invstd = invstd + else: + save_mean = input.new_zeros((0,)) + save_invstd = input.new_zeros((0,)) + + if weight is None: + weight = input.new_ones(()) + + if bias is None: + bias = input.new_zeros(()) + + mean = _unsqueeze_to_dim(mean, input.dim() - 1) + invstd = _unsqueeze_to_dim(invstd, input.dim() - 1) + weight = _unsqueeze_to_dim(weight, input.dim() - 1) + bias = _unsqueeze_to_dim(bias, input.dim() - 1) + output = ((input - mean) * invstd) * weight + bias + return output, save_mean, save_invstd + + +@register_decomposition(aten.clamp_min) +def clamp_min(self: Tensor, min: float): + return torch.clamp(self, min=min) + + +@register_decomposition(aten.clamp_max) +def clamp_max(self: Tensor, max: float): + return torch.clamp(self, max=max) + + +@register_decomposition(aten._fused_dropout) +@pw_cast_for_opmath +def _fused_dropout_decomposition(input, p, generator=None): + mask = (torch.rand_like(input) < p).to(dtype=torch.uint8) + res = mask.type_as(input) * input * (1.0 / p) + return (res, mask) + + +# TODO: these logical decomps are buggy for complex inputs +@register_decomposition(aten.logical_xor) +def logical_xor(self: Tensor, other: Tensor) -> Tensor: + return self.to(dtype=torch.bool) ^ other.to(dtype=torch.bool) + + +@register_decomposition(aten.logical_not) +def logical_not(self: Tensor) -> Tensor: + return ~self.to(dtype=torch.bool) + + +@register_decomposition(aten.xlogy.Tensor) +@pw_cast_for_int_to_real +def xlogy(self: Tensor, other: Tensor) -> Tensor: + return aten.where(aten.isnan(self), + self, + aten.where(self == aten.new_zeros(self, ()), + aten.new_zeros(self, ()), + self * aten.log(other))) + + +@register_decomposition(aten.var.correction) +@reduction_complex_to_real +def var_correction( + x: Tensor, + dims: Optional[List[int]], + correction: Optional[int] = None, + keepdim: bool = False, +): + if dims is None: + dims = [] + + if x.is_complex(): + # For complex, calculate variance of real and imaginary components + # separately then add to get overall variance. + real_in = x.real + var_real = torch.var(real_in, dims, correction=correction, keepdim=keepdim) + imag_in = x.imag + var_imag = torch.var(imag_in, dims, correction=correction, keepdim=keepdim) + return var_real + var_imag + + if correction is None: + correction = 0 + + if len(dims) == 0: + n = prod(x.shape) # type: ignore[arg-type] + else: + n = 1 + for dim in dims: + n *= x.shape[dim] + + mean = torch.mean(x, dims, True) + sub = x - mean + sq = sub * sub + sum = torch.sum(sq, dims, keepdim) + + if correction: + n = n - correction + + return sum / n + + +@register_decomposition(aten.std.correction) +@reduction_complex_to_real +def std_decomposition( + x: Tensor, dims: List[int], correction: int = 0, keepdim: bool = False +): + return torch.sqrt(torch.var(x, dims, correction=correction, keepdim=keepdim)) + + +# Questionable decompositions +# This is only valid if we're running the graph without autograd, such as if the backward pass has been traced. 
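To see why the identity decomposition defined just below is restricted to the no-autograd case, compare it against the real detach on a leaf tensor (a small standalone example, not taken from this patch):

import torch

x = torch.randn(3, requires_grad=True)

y = x.detach()  # real detach: same values, but cut out of the autograd graph
z = x           # what an identity decomposition effectively returns

print(torch.equal(y, x))    # True  - values agree, so the decomposition is fine value-wise
print(y.requires_grad)      # False - real detach drops gradient tracking
print(z.requires_grad)      # True  - the identity version does not, hence the restriction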
+# Note that this decomposition causes issues with in-place ops +@register_decomposition(aten.detach, disable_meta=True) +def detach_decomposition(x): + return x + + +@register_decomposition(aten.cudnn_batch_norm) +def cudnn_batch_norm( + input: Tensor, + weight: Tensor, + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + exponential_average_factor: float, + epsilon: float, +): + a, b, c = aten.native_batch_norm( + input, + weight, + bias, + running_mean, + running_var, + training, + exponential_average_factor, + epsilon, + ) + # Cudnn return running mean and variance when training is True + if training: + return (a, b, c, input.new_zeros((0,), dtype=torch.uint8)) + return (a, input.new_zeros((0,)), input.new_zeros((0,)), input.new_zeros((0,), dtype=torch.uint8)) + + +@register_decomposition(aten.cudnn_batch_norm_backward) +def cudnn_batch_norm_backward( + input: Tensor, + grad_output: Tensor, + weight: Tensor, + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_var: Optional[Tensor], + epsilon: float, + reserveSpace: Tensor, +): + return aten.native_batch_norm_backward( + grad_output, + input, + weight, + running_mean, + running_var, + save_mean, + save_var, + True, + epsilon, + [True, True, True], + ) + + +@register_decomposition(aten.rot90.default) +def rot90(self: Tensor, k: int = 1, dims: List[int] = [0, 1]) -> Tensor: # noqa: B006 + total_dims = self.dim() + total_rot_dims = len(dims) + assert total_rot_dims == 2, f"expected total rotation dims == 2, but got dims = {total_rot_dims}" + assert total_dims >= 2, f"expected total dims >= 2, but got total dims = {total_dims}" + assert dims[0] != dims[1] and abs(dims[0] - dims[1]) != total_dims,\ + f"expected rotation dims to be different, but got dim0 = {dims[0]} and dim1 = {dims[1]}" + assert dims[0] < total_dims and dims[0] >= -total_dims, f"Rotation dim0 out of range, dim0 = {dims[0]}" + assert dims[1] < total_dims and dims[1] >= -total_dims, f"Rotation dim1 out of range, dim1 = {dims[1]}" + k = k % 4 + if k == 1: + return self.flip(dims[1]).transpose(dims[0], dims[1]) + elif k == 2: + return self.flip(dims) + elif k == 3: + return self.flip(dims[0]).transpose(dims[0], dims[1]) + else: + return self.clone(memory_format=torch.contiguous_format) + + +@register_decomposition(aten.transpose.int) +def transpose_int(self: Tensor, dim0: int, dim1: int) -> Tensor: + dim0, dim1 = utils.canonicalize_dims(self.dim(), (dim0, dim1)) # type: ignore[misc] + + if self.dim() <= 1: + return self + + if dim0 == dim1: + return self + perm = list(range(self.dim())) + perm[dim0], perm[dim1] = perm[dim1], perm[dim0] + return torch.permute(self, perm) + + +@register_decomposition(aten.t.default) +def t(self: Tensor) -> Tensor: + return self.transpose(0, 0 if self.dim() < 2 else 1) + + +def check_stack_inputs(tensors: List[Tensor]): + entry_shape = tensors[0].shape + for i in range(1, len(tensors)): + assert tensors[i].shape == entry_shape, (f"stack expects each tensor to be equal size, but got {entry_shape} at entry 0" + f"and {tensors[i].shape} at entry {i}") + + +def get_stack_inputs(tensors: List[Tensor], dim: int): + check_stack_inputs(tensors) + return [t.unsqueeze(dim) for t in tensors] + + +@register_decomposition(aten.stack.default) +def stack(tensors: List[Tensor], dim: int = 0) -> Tensor: + assert len(tensors) > 0, "stack expects a non-empty TensorList" + wrapped_dim = utils.canonicalize_dim(tensors[0].dim() + 1, dim) + if wrapped_dim < 
tensors[0].dim() and not tensors[0].is_sparse: + check_stack_inputs(tensors) + result_sizes = list(tensors[0].shape) + result_sizes.insert(wrapped_dim, len(tensors)) + out = torch.cat(tensors, wrapped_dim) + return out.view(result_sizes) + else: + return torch.cat(get_stack_inputs(tensors, wrapped_dim), dim) + + +def _squeeze_multiple(self: Tensor, dims: List[int]) -> Tensor: + ndim = self.dim() + wrapped_dims = utils.canonicalize_dims(ndim, dims) + assert isinstance(wrapped_dims, tuple) + for idx in range(ndim - 1, -1, -1): + if idx in wrapped_dims: + self = self.squeeze(idx) + return self + + +@register_decomposition(aten.logsumexp.default) +@pw_cast_for_int_to_real +def logsumexp(self: Tensor, dim: List[int], keepdim: bool = False) -> Tensor: + if self.numel() == 0: + return torch.sum(torch.exp(self), dim, keepdim).log() + maxes = torch.amax(self, dim, keepdim=True) + maxes_squeezed = maxes if keepdim else _squeeze_multiple(maxes, dim) + maxes_squeezed = torch.masked_fill(maxes_squeezed, maxes_squeezed.abs() == float('inf'), 0) + result = torch.sum(torch.exp(self - maxes), dim, keepdim) + return result.log().add(maxes_squeezed) + + +@register_decomposition(aten.trace.default) +def trace(self: Tensor) -> Tensor: + return torch.sum(torch.diag(self)) + + +# nb: Should use acc_t, not op_math +@register_decomposition(aten.log_sigmoid_forward.default) +@pw_cast_for_opmath +def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]: + min = torch.minimum(self.new_zeros(()), self) + z = torch.exp(-torch.abs(self)) + if self.is_cuda: + buffer = self.new_zeros((0,)) + else: + buffer = z + return min - torch.log1p(z), buffer diff --git a/torch/_deploy.py b/torch/_deploy.py index 4a27e3753d3d..4cdb6f6f92e1 100644 --- a/torch/_deploy.py +++ b/torch/_deploy.py @@ -17,8 +17,8 @@ def _save_storages(importer, obj): importers = sys_importer def persistent_id(obj): - if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage): - if isinstance(obj, torch.storage.TypedStorage): + if torch.is_storage(obj) or isinstance(obj, torch.storage._TypedStorage): + if isinstance(obj, torch.storage._TypedStorage): # TODO: Once we decide to break serialization FC, we can # remove this case storage = obj._storage @@ -59,10 +59,10 @@ def persistent_load(saved_id): if typename == 'storage': # TODO: Once we decide to break serialization FC, we can - # stop wrapping with TypedStorage + # stop wrapping with _TypedStorage storage = serialized_storages[data[0]] dtype = serialized_dtypes[data[0]] - return torch.storage.TypedStorage( + return torch.storage._TypedStorage( wrap_storage=storage._untyped(), dtype=dtype) @@ -82,7 +82,7 @@ def persistent_load(saved_id): importer = sys_importer unpickler = PackageUnpickler(importer, io.BytesIO(obj_bytes)) - unpickler.persistent_load = persistent_load + unpickler.persistent_load = persistent_load # type: ignore[assignment] result = _deploy_objects[id] = unpickler.load() return result diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 20616a978d45..3c067d5c1c53 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -18,6 +18,7 @@ import typing import io import pickle +import threading # This is needed. `torch._jit_internal` is imported before `torch.distributed.__init__`. # Explicitly ask to import `torch.distributed.__init__` first. # Otherwise, "AttributeError: module 'torch' has no attribute 'distributed'" is raised. @@ -977,7 +978,7 @@ def linear(x): # Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. 
-def _qualified_name(obj) -> str: +def _qualified_name(obj, mangle_name=True) -> str: # This special case allows us to override the qualified name on a type. # It's currently used in conjunction with tracing, where we create a # fake module to filter only supported attributes. However, since this @@ -1026,13 +1027,16 @@ def _qualified_name(obj) -> str: module_name = module_name.replace("<", "_") module_name = module_name.replace(">", "_") - # __main__ is a builtin module, so rewrite it to "__torch__". - if module_name == "__main__": - module_name = "__torch__" - else: - # Everything else gets a "__torch__" prefix to avoid name collisions - # with the names of user values. - module_name = "__torch__." + module_name + # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h + # does not need mangle the python class name. + if mangle_name: + # __main__ is a builtin module, so rewrite it to "__torch__". + if module_name == "__main__": + module_name = "__torch__" + else: + # Everything else gets a "__torch__" prefix to avoid name collisions + # with the names of user values. + module_name = "__torch__." + module_name if "." in name: raise RuntimeError(f"Could not get qualified name for class '{name}': " @@ -1248,6 +1252,8 @@ def persistent_id(self, obj): return "" if isinstance(obj, torch.cuda.Event): return "" + if isinstance(obj, threading.Thread): + return "" return None diff --git a/torch/_lazy/__init__.py b/torch/_lazy/__init__.py new file mode 100644 index 000000000000..ff4e90c0edf2 --- /dev/null +++ b/torch/_lazy/__init__.py @@ -0,0 +1,33 @@ +import torch._C._lazy + + +def mark_step(device: str = "lazy:0", wait=False): + """Triggers a mark step, which amounts to + - collecting a group of 'live' lazy tensors to index into the compilation cache + (lowering/compiling their IR graphs if not cached) + - kicking off execution of the compiled function + - (optionally, wait=True) waiting for cpu-side execution to complete (does not sync the accelerator) + """ + # TODO(whc) expand this to include backend hooks and align with XLA backend needs + torch._C._lazy._mark_step(device, [], wait=wait) + +def wait_device_ops(devices=None): + """Waits for all the async operations on the given devices to complete. + Args: + devices (string..., optional): The devices whose async ops need to be waited + for. If empty, all the local devices will be waited for. + """ + if devices is None: + devices = [] + torch._C._lazy._wait_device_ops(devices=devices) + +def sync_multi(tensors, devices): + """ + Sync the list of lazy tensors so there IR get lowered for the activate backend + and the compiled computation graph get cached. + """ + torch._C._lazy._sync_multi(tensors, devices) + +def get_tensor_id(tensor): + """Return a unique id of the lazy tensor maintained by LTC""" + return torch._C._lazy._get_tensor_id(tensor) diff --git a/torch/_lazy/computation.py b/torch/_lazy/computation.py new file mode 100644 index 000000000000..7dd57cd7238d --- /dev/null +++ b/torch/_lazy/computation.py @@ -0,0 +1,23 @@ +import torch._C._lazy +import torch._C._lazy_ts_backend + +def get_tensors_ts_device_data_node(tensors): + """Return tensor ids and eager tensors for DeviceData nodes in the + IR for the passed in lazy tensors. + + TODO: This API is currently ts backend specific. We are working on + generalizing it to all backends including XLA. 
+ """ + return torch._C._lazy_ts_backend._get_tensors_ts_device_data_node(tensors) + +def get_graph_hash(tensors): + """Return the graph hash for the passed in lazy tensors""" + return torch._C._lazy._get_graph_hash(tensors) + +def run_cached_graph(hash_str, graph_inputs): + """Running the cached computation graph with the given inputs + + TODO: This API is currently ts backend specific. We are working on + generalizing it to all backends including XLA. + """ + return torch._C._lazy_ts_backend._run_cached_graph(hash_str, graph_inputs) diff --git a/torch/_lazy/config.py b/torch/_lazy/config.py new file mode 100644 index 000000000000..c2e72bd7d60b --- /dev/null +++ b/torch/_lazy/config.py @@ -0,0 +1,13 @@ +import torch._C._lazy + +def get_force_fallback(): + """Get the config used to force LTC fallback""" + return torch._C._lazy._get_force_fallback() + +def set_force_fallback(configval): + """Set the config used to force LTC fallback""" + torch._C._lazy._set_force_fallback(configval) + +def set_reuse_ir(val: bool): + """Set the config to reuse IR nodes for faster tracing""" + torch._C._lazy._set_reuse_ir(val) diff --git a/torch/_lazy/debug.py b/torch/_lazy/debug.py new file mode 100644 index 000000000000..882056ca9c0f --- /dev/null +++ b/torch/_lazy/debug.py @@ -0,0 +1,20 @@ +import torch._C._lazy + + +def render_ir_graph(tensors): + """Return a text dump of the LTC IR graph in dot format for the tensors. + The text can be processed by tools like dot to be rendered in pdf,png etc.""" + return torch._C._lazy._get_tensors_dot(tensors) + +def dump_ir(tensors, ir_format): + """Return a dump of the tensors in the specified format. + Valid format are + - text: for LTC IR + - backend: for the activate backend IR + """ + if ir_format == "text": + return torch._C._lazy._get_tensors_text(tensors) + elif ir_format == "backend": + return torch._C._lazy._get_tensors_backend(tensors) + else: + raise RuntimeError(f"Unrecognized IR format: {ir_format}") diff --git a/torch/_lazy/extract_compiled_graph.py b/torch/_lazy/extract_compiled_graph.py new file mode 100644 index 000000000000..37d0e67f31f3 --- /dev/null +++ b/torch/_lazy/extract_compiled_graph.py @@ -0,0 +1,199 @@ +import torch._lazy.metrics as metrics +from torch._lazy.tensor_factory_functions import tensor_factory_functions +from torch._lazy import computation +from torch._lazy import debug as lazy_debug +import torch._lazy as lazy +import dataclasses +from typing import List, Dict, Any, Callable +import copy +from torch import fx +import torch +import itertools +import os + +debug = os.environ.get("debug_extract_compiled_graph") is not None + +@dataclasses.dataclass +class GraphInputMatcher: + """ + The GraphInputMatcher class setup the graph inputs for future calls after lazy tracing. + Specifically, those graph inputs corresponding to method parameters should be replaced with the + arguments for the current call. + + tensor_id_to_arg_idx maps the tensor id to the parameter index. + graph_input_tensor_ids, graph_input_ivalues list the tensor_id and ivalue for each of the + TS/XLA graph inputs. + """ + tensor_id_to_arg_idx: Dict[int, int] + graph_input_tensor_ids: List[int] + # there are 2 categories of graph_input_tensors. + # Category 1: those whose id are not found in tensor_id_to_arg_idx. These are + # most likely const tensors and we can get its content from graph_input_tensors + # Category 2: those whose id are found in tensor_id_to_arg_idx. 
+    # We should get the tensor from the method arguments.
+    graph_input_ivalues: List[Any]
+
+    # get the real graph input tensors
+    def __call__(self, args):
+        real_input = []
+        for tensor_id, traced_ivalue in zip(self.graph_input_tensor_ids, self.graph_input_ivalues):
+            arg_idx = self.tensor_id_to_arg_idx.get(tensor_id, None)
+            if arg_idx is None:
+                inp = traced_ivalue
+            else:
+                inp = args[arg_idx]
+            real_input.append(inp)
+        return real_input
+
+class ReturnValueHandler:
+    r"""
+    When ltc_sync_multi is called on multiple tensors, the compiled graph
+    will contain output only for unique tensors - if a tensor appears multiple
+    times in the input to _ltc_sync_multi, only the first occurrence matters.
+
+    However, at the Python level we still expect multiple tensors to be returned,
+    with duplication, even if the TS graph dedups the output. E.g. for the method:
+
+      def forward(self, a):
+        return a, a
+
+    the TS graph captured by LTC will return a single tensor, but the Python method expects 2.
+
+    This class dedups the lazy tensors first to get the index that will be used
+    to duplicate the eager tensors later.
+    """
+    def __init__(self, lazy_out_list):
+        self.index: List[List[int]] = []
+        self.total_count = len(lazy_out_list)
+
+        tensor_id_to_idx: Dict[int, int] = dict()
+        for dup_idx, lazy_tensor in enumerate(lazy_out_list):
+            uniq_idx = tensor_id_to_idx.get(id(lazy_tensor), None)
+            if uniq_idx is not None:
+                self.index[uniq_idx].append(dup_idx)
+            else:
+                uniq_idx = len(self.index)
+                self.index.append([dup_idx])
+                tensor_id_to_idx[id(lazy_tensor)] = uniq_idx
+
+    def duplicate_eager_tensors(self, eager_tensor_list):
+        duplicated_list = [None] * self.total_count
+        assert len(eager_tensor_list) == len(self.index)
+
+        for uniq_idx, eager_tensor in enumerate(eager_tensor_list):
+            for dup_idx in self.index[uniq_idx]:
+                duplicated_list[dup_idx] = eager_tensor
+        return duplicated_list
+
+def force_lazy_device(model: fx.GraphModule):
+    """
+    Factory methods in an Fx graph may create tensors on a specific eager device.
+    If we take no action, those eager tensors will be mixed with lazy tensors and
+    cause a crash. This method overwrites those eager devices with the lazy device.
+    """
+    def tolazydevice(dev):
+        if isinstance(dev, torch.device):
+            return torch.device("lazy", index=dev.index)
+        return dev
+
+    def hasDeviceArg(args, kwargs):
+        return any(isinstance(arg, torch.device) for arg in itertools.chain(args, kwargs.values()))
+
+    for nd in model.graph.nodes:
+        nd.args = tuple(tolazydevice(arg) for arg in nd.args)
+        nd.kwargs = {k: tolazydevice(v) for k, v in nd.kwargs.items()}
+
+        # For torchbench models like yolov3 and hf_Bart, dynamo generates Fx graphs that
+        # return eager tensors on the default device
+        # (check https://gist.github.com/shunting314/eabdf6c769c59bc384469717b8f9bb7f for yolov3,
+        # and https://gist.github.com/shunting314/8d5e2d9348a3258959d3954186c48814 for hf_Bart).
+        # To force those tensors onto the lazy device, we cannot simply override
+        # the device argument since there is no explicit device argument.
+        # What we do here is: for the list of covered tensor factory methods, we add
+        # a lazy device argument explicitly.
+        #
+        # TODO: This solution is not ideal since we may miss some factory methods. In the
+        # future, when we support lazy mode, this method can be replaced by that.
+        if nd.target in tensor_factory_functions and not hasDeviceArg(nd.args, nd.kwargs):
+            kwargs = dict(nd.kwargs)  # nd.kwargs is immutable; make a mutable copy.
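The device rewrite performed by the surrounding lines can be reproduced on a toy torch.fx module (an illustrative standalone sketch with a made-up module; only torch.ones is checked here instead of the full tensor_factory_functions list, and the graph is printed rather than executed on a lazy device):

import torch
from torch import fx

class M(torch.nn.Module):
    def forward(self, x):
        # Factory call with no explicit device argument.
        return x + torch.ones(2, 2)

gm = fx.symbolic_trace(M())
for nd in gm.graph.nodes:
    if nd.target is torch.ones and "device" not in nd.kwargs:
        new_kwargs = dict(nd.kwargs)  # node kwargs are immutable, copy first
        new_kwargs["device"] = torch.device("lazy")
        nd.kwargs = new_kwargs
gm.recompile()
print(gm.code)  # the torch.ones call in the printed code now carries the lazy device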
+ kwargs["device"] = torch.device("lazy") + nd.kwargs = kwargs + + model.recompile() + +def get_fallback_ops(): + fallback_ops = [] + for opname in metrics.counter_names(): + if "aten::" not in opname: + continue + val = int(metrics.counter_value(opname)) + if val > 0: + fallback_ops.append(f"{opname}={val}") + + return fallback_ops + +def extract_compiled_graph(model: fx.GraphModule, example_inputs) -> Callable: + """ + Optimize an eager model with LTC and returns a wrapper to execute the + compiled graph directly without retracing. It depends on other mechanisms + like TorchDynamo guards to guarantee the returned wrapper is only called + when it's safe. + """ + lazy_args = [arg.to(device="lazy") for arg in example_inputs] + args_tensor_ids = [lazy.get_tensor_id(lazy_arg) for lazy_arg in lazy_args] + tensor_id_to_arg_idx = {tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)} + lazy_model = copy.deepcopy(model).to(device=torch.device("lazy")) + force_lazy_device(lazy_model) + + # This line executes lazy tracing and enable us extracting compiled graph later + metrics.reset() + lazy_out = lazy_model(*lazy_args) + fallback_ops = get_fallback_ops() + metrics.reset() + + if len(fallback_ops) > 0: + raise RuntimeError(f"Fail to extact the compiled graph because of fallback: {','.join(fallback_ops)}") + + if not isinstance(lazy_out, (tuple, list)): + lazy_out = (lazy_out,) + + args_and_out = tuple(lazy_args) + tuple(lazy_out) + return_value_handler = ReturnValueHandler(args_and_out) + if debug: + print("Fx code:\n", model.code) + print("LTC IR:", lazy_debug.dump_ir(args_and_out, "text")) + + # TODO: this part is TS backend specific for now and will be generalized to + # support XLA + graph_input_tensor_ids, graph_input_ivalues = computation.get_tensors_ts_device_data_node(args_and_out) + assert len(graph_input_tensor_ids) == len(graph_input_ivalues) + graph_input_matcher = GraphInputMatcher(tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_ivalues) + + graph_hash = computation.get_graph_hash(args_and_out) + + if debug: + print("graph_hash", graph_hash) + print(f"args_tensor_ids {args_tensor_ids}") + print("tensor ids from device data:", graph_input_tensor_ids) + + # sync the list of output tensors so the computation graph for these + # tensors will be cached. Those computation graphs can be retrieved + # by graph hash later. + lazy.sync_multi(args_and_out, []) + + def optimized_mod(*args): + if len(args_and_out) == 0: + return () + graph_input = graph_input_matcher(args) + res = return_value_handler.duplicate_eager_tensors(computation.run_cached_graph(graph_hash, graph_input)) + + assert len(res) == len(args_and_out) + for i, arg in enumerate(args): + # only copy those tensors that get inplace updated + if arg is not res[i]: + arg.copy_(res[i]) + + # skip the args + return res[len(args):] + + return optimized_mod diff --git a/torch/_lazy/ir_cache.py b/torch/_lazy/ir_cache.py new file mode 100644 index 000000000000..04f1f103d286 --- /dev/null +++ b/torch/_lazy/ir_cache.py @@ -0,0 +1,11 @@ +import torch._C._lazy + +def dump(dot_file_name: str): + """Dump TrieCache in the dot format""" + return torch._C._lazy._dump_ir_cache(dot_file_name) + +def reset(): + """Clear TrieCache. This is needed in testing to avoid + node reusing between different tests. 
+ """ + return torch._C._lazy._clear_ir_cache() diff --git a/torch/_lazy/metrics.py b/torch/_lazy/metrics.py new file mode 100644 index 000000000000..043db981bb71 --- /dev/null +++ b/torch/_lazy/metrics.py @@ -0,0 +1,13 @@ +import torch._C._lazy + +def reset(): + """Resets all metric counters.""" + torch._C._lazy._reset_metrics() + +def counter_names(): + """Retrieves all the currently active counter names.""" + return torch._C._lazy._counter_names() + +def counter_value(name: str): + """Return the value of the counter with the speficied name""" + return torch._C._lazy._counter_value(name) diff --git a/torch/_lazy/tensor_factory_functions.py b/torch/_lazy/tensor_factory_functions.py new file mode 100644 index 000000000000..47aa9c500466 --- /dev/null +++ b/torch/_lazy/tensor_factory_functions.py @@ -0,0 +1,48 @@ +import torch + +""" +tensor_factory_functions defines the list of torch functions that create tensors. +The list is grabbed by searching thru native_functions.yaml by the following +regular expression: + + cat native_functions.yaml | grep 'func:' | grep -v "Tensor.*->" | grep "[-]>.*Tensor" + +It's possible that new tensor factory functions are added making this list stale. +Use at your own risk or regenerate the list. +""" +tensor_factory_functions = ( + torch._cudnn_init_dropout_state, + torch.arange, + torch.bartlett_window, + torch.blackman_window, + torch._empty_affine_quantized, + torch.empty_strided, + torch.eye, + torch.full, + torch.from_file, + torch.hann_window, + torch.hamming_window, + torch.kaiser_window, + torch.linspace, + torch.logspace, + torch.ones, + torch.scalar_tensor, + torch.rand, + torch.randint, + torch.randn, + torch.randperm, + torch.range, + torch._efficientzerotensor, + torch.zeros, + torch.tril_indices, + torch.triu_indices, + # Note: the following functions match the regular expression search above but + # they are not available in the torch module. Comment out. + # torch._sparse_coo_tensor_with_dims, + # torch.fft_fftfreq, + # torch.fft_rfftfreq, +) + ( + # torch.tensor is special since it's not in native_functions.yaml + # add it separately + torch.tensor, +) diff --git a/torch/_lazy/ts_backend.py b/torch/_lazy/ts_backend.py new file mode 100644 index 000000000000..118de2dbefca --- /dev/null +++ b/torch/_lazy/ts_backend.py @@ -0,0 +1,5 @@ +import torch._C._lazy_ts_backend + +def init(): + """Initializes the lazy Torchscript backend""" + torch._C._lazy_ts_backend._init() diff --git a/torch/_linalg_utils.py b/torch/_linalg_utils.py index 568ae8b74aae..faa79f7f0cdb 100644 --- a/torch/_linalg_utils.py +++ b/torch/_linalg_utils.py @@ -99,3 +99,10 @@ def symeig(A: Tensor, largest: Optional[bool] = False) -> Tuple[Tensor, Tensor]: E = torch.flip(E, dims=(-1,)) Z = torch.flip(Z, dims=(-1,)) return E, Z + +# This function was deprecated and removed +# This nice error message can be removed in version 1.13+ +def solve(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]: + raise RuntimeError( + "This function was deprecated since version 1.9 and is now removed. 
Please use the `torch.linalg.solve` function instead.", + ) diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py index 560d9579e61f..cb7a6723683a 100644 --- a/torch/_lobpcg.py +++ b/torch/_lobpcg.py @@ -652,17 +652,16 @@ class LOBPCG(object): """ def __init__(self, - A, # type: Optional[Tensor] - B, # type: Optional[Tensor] - X, # type: Tensor - iK, # type: Optional[Tensor] - iparams, # type: Dict[str, int] - fparams, # type: Dict[str, float] - bparams, # type: Dict[str, bool] - method, # type: str - tracker # type: None - ): - # type: (...) -> None + A: Optional[Tensor], + B: Optional[Tensor], + X: Tensor, + iK: Optional[Tensor], + iparams: Dict[str, int], + fparams: Dict[str, float], + bparams: Dict[str, bool], + method: str, + tracker: None + ) -> None: # constant parameters self.A = A @@ -681,10 +680,10 @@ def __init__(self, self.E = torch.zeros((n, ), dtype=X.dtype, device=X.device) self.R = torch.zeros((m, n), dtype=X.dtype, device=X.device) self.S = torch.zeros((m, 3 * n), dtype=X.dtype, device=X.device) - self.tvars = {} # type: Dict[str, Tensor] - self.ivars = {'istep': 0} # type: Dict[str, int] - self.fvars = {'_': 0.0} # type: Dict[str, float] - self.bvars = {'_': False} # type: Dict[str, bool] + self.tvars: Dict[str, Tensor] = {} + self.ivars: Dict[str, int] = {'istep': 0} + self.fvars: Dict[str, float] = {'_': 0.0} + self.bvars: Dict[str, bool] = {'_': False} def __str__(self): lines = ['LOPBCG:'] @@ -941,17 +940,15 @@ def _get_rayleigh_ritz_transform(self, S): SBS = _utils.qform(B, S) d_row = SBS.diagonal(0, -2, -1) ** -0.5 d_col = d_row.reshape(d_row.shape[0], 1) + # TODO use torch.linalg.cholesky_solve once it is implemented R = torch.linalg.cholesky((SBS * d_row) * d_col, upper=True) - Id = torch.eye(R.size(-1), dtype=R.dtype, device=R.device) - Rinv = torch.triangular_solve(Id, R, upper=True).solution - return Rinv * d_col + return torch.linalg.solve_triangular(R, d_row.diag_embed(), upper=True, left=False) def _get_svqb(self, - U, # Tensor - drop, # bool - tau # float - ): - # type: (Tensor, bool, float) -> Tensor + U: Tensor, # Tensor + drop: bool, # bool + tau: float # float + ) -> Tensor: """Return B-orthonormal U. .. note:: When `drop` is `False` then `svqb` is based on the diff --git a/torch/_masked/__init__.py b/torch/_masked/__init__.py index a1b398cb2f49..d679817c8304 100644 --- a/torch/_masked/__init__.py +++ b/torch/_masked/__init__.py @@ -2,8 +2,10 @@ from typing import Optional, Tuple, List, Union, Any +import warnings import torch from torch import Tensor +from . import _docs # A workaround to support both TorchScript and MyPy: from typing import TYPE_CHECKING @@ -27,6 +29,26 @@ def _apply_docstring_templates(func): """Decorator that applies docstring templates to function docstring and returns the function instance. """ + + doc_string = getattr(_docs, f'{func.__name__}_docstring', None) + if doc_string is None: + warnings.warn( + f'No documentation string available for {func.__name__}.' 
+ ' PyTorch team should run `python tools/update_masked_docs.py`' + ' to generate the missing docstrings.') + else: + func.__doc__ = doc_string + + # Expose function as public symbol + __all__.append(func.__name__) + + return func + + +def _generate_docstring(func): + """A utility function called from tools/update_masked_docs.py + script to update the module torch._masked._docs.py + """ docstring_templates = dict( reduction_signature='''\ {function_name}(input, {operation_args}, *, {operation_kwargs}) -> Tensor''', @@ -139,11 +161,16 @@ def _apply_docstring_templates(func): # be removed in the final documentation string. sum=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), prod=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), + cumsum=(('dim__as_int',), ('dtype=None', 'mask=None')), + cumprod=(('dim__as_int',), ('dtype=None', 'mask=None')), amin=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), amax=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), + argmin=(('dim__as_int',), ('keepdim=False', 'dtype=None', 'mask=None')), + argmax=(('dim__as_int',), ('keepdim=False', 'dtype=None', 'mask=None')), mean=(('dim',), ('keepdim=False', 'dtype=None', 'mask=None')), norm=(('ord', 'dim',), ('keepdim=False', 'dtype=None', 'mask=None')), var=(('dim', 'unbiased'), ('keepdim=False', 'dtype=None', 'mask=None')), + std=(('dim', 'unbiased'), ('keepdim=False', 'dtype=None', 'mask=None')), softmax=(('dim__as_int',), ('dtype=None', 'mask=None')), log_softmax=(('dim__as_int',), ('dtype=None', 'mask=None')), softmin=(('dim__as_int',), ('dtype=None', 'mask=None')), @@ -197,22 +224,35 @@ def _apply_docstring_templates(func): normalize='''\ Let ``x`` be a sequence of unmasked elements of one-dimensional slice of the :attr:`input` tensor. Normalize of i-th element in ``x`` is -defined as ``x[i]/max(norm(x, p), eps)``.''') +defined as ``x[i]/max(norm(x, p), eps)``.''', + cumsum='''\ +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Cumsum of i-th element in ``x`` is +defined as ``sum(x[:i])``.''', + cumprod='''\ +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor.
Cumprod of i-th element in ``x`` is +defined as ``prod(x[:i])``.''') reduction_names = dict( sum='sum', prod='product', amax='maximum', amin='minimum', + argmax='argmax', + argmin='argmin', mean='mean', norm='norm', - var='variance') + var='variance', + std='standard_deviation') normalization_names = dict( softmax='softmax', log_softmax='log_softmax', softmin='softmin', - normalize='normalize') + normalize='normalize', + cumsum='cumulative_sum', + cumprod='cumulative_prod') operation_names = dict() operation_names.update(reduction_names) @@ -226,7 +266,7 @@ def _apply_docstring_templates(func): if func.__name__ in {'norm', 'normalize'}: example_args = (2.0, example_dim) example_input = example_input.to(dtype=torch.float32) - elif func.__name__ in {'var'}: + elif func.__name__ in {'var', 'std'}: example_args = (example_dim, False) else: example_args = (example_dim,) @@ -297,12 +337,7 @@ def _apply_docstring_templates(func): doc_template = '\n\n'.join([f'{{{op_kind}_{sec}}}' for sec in doc_sections]) else: doc_template = func.__doc__ - func.__doc__ = doc_template.format_map(templates) - - # Expose function as public symbol - - __all__.append(func.__name__) - - return func + return doc_template.format_map(templates) def _reduction_identity(op_name: str, input: Tensor, *args): @@ -322,16 +357,16 @@ def _reduction_identity(op_name: str, input: Tensor, *args): dtype: DType = input.dtype device = input.device op_name = op_name.rsplit('.', 1)[-1] # lstrip module name when present - if op_name == 'sum': + if op_name in {'sum', 'cumsum'}: return torch.tensor(0, dtype=dtype, device=device) - elif op_name == 'prod': + elif op_name in {'prod', 'cumprod'}: return torch.tensor(1, dtype=dtype, device=device) - elif op_name == 'amax': + elif op_name in {'amax', 'argmax'}: if torch.is_floating_point(input): return torch.tensor(-torch.inf, dtype=dtype, device=device) elif torch.is_signed(input) or dtype == torch.uint8: return torch.tensor(torch.iinfo(dtype).min, dtype=dtype, device=device) - elif op_name == 'amin': + elif op_name in {'amin', 'argmin'}: if torch.is_floating_point(input): return torch.tensor(torch.inf, dtype=dtype, device=device) elif torch.is_signed(input) or dtype == torch.uint8: @@ -349,7 +384,7 @@ def _reduction_identity(op_name: str, input: Tensor, *args): assert torch.is_floating_point(input), input.dtype return torch.tensor(torch.inf, dtype=dtype, device=device) return torch.tensor(0, dtype=dtype, device=device) - elif op_name == 'var': + elif op_name in {'var', 'std'}: return None raise NotImplementedError(f'identity of {op_name} on {dtype} input') @@ -358,6 +393,12 @@ def _canonical_dim(dim: DimOrDims, ndim: int) -> Tuple[int, ...]: """Return dim argument as a tuple of sorted dim values. """ dims: List[int] = [] + if dim == (): + # Currently, `dim=()` in reduction operations means "reduce + # over all dimensions" while in the future, it will read "no + # reduce". See https://github.com/pytorch/pytorch/issues/29137 + # When gh-29137 is resolved, this if-block must be deleted.
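The `dim=()` workaround in the comment above changes how an empty dim tuple is interpreted: for now it means "reduce over all dimensions" (gh-29137). A self-contained sketch of that canonicalization behaviour follows; the function name and error messages are illustrative, not the actual torch._masked helper:

    from typing import Optional, Tuple, Union

    DimOrDims = Optional[Union[int, Tuple[int, ...]]]

    def canonical_dim_sketch(dim: DimOrDims, ndim: int) -> Tuple[int, ...]:
        if dim == ():        # current behaviour: treat () as "reduce over all dimensions"
            dim = None
        if dim is None:
            return tuple(range(ndim))
        ndim = max(ndim, 1)
        dims = []
        for d in ((dim,) if isinstance(dim, int) else dim):
            if d >= ndim or d < -ndim:
                raise IndexError(f'dim {d} is out of range for a {ndim}-dimensional input')
            d = d % ndim
            if d in dims:
                raise RuntimeError(f'dim {d} appears multiple times')
            dims.append(d)
        return tuple(sorted(dims))

    assert canonical_dim_sketch((), 2) == (0, 1)
    assert canonical_dim_sketch(-1, 3) == (2,)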
+ dim = None if dim is None: return tuple(range(ndim)) ndim = max(ndim, 1) @@ -371,31 +412,347 @@ def _canonical_dim(dim: DimOrDims, ndim: int) -> Tuple[int, ...]: return tuple(sorted(dims)) +def _sparse_coo_flatten_indices(indices: Tensor, shape: tuple): + # Flatten N-D indices to 1-D indices + flat_indices = indices.new_zeros(indices.size(1)) + for d, sz in enumerate(shape): + flat_indices.mul_(sz) + flat_indices.add_(indices[d]) + return flat_indices + + +def _any(input: Tensor, dim: tuple, keepdim: bool): + # Support torch.any with tuple dim argument. + # Workaround of https://github.com/pytorch/pytorch/issues/56586 + r = input + for d in reversed(dim): + r = r.any(dim=d, keepdim=keepdim) + return r + + +def _sparse_coo_where(mask: Tensor, input: Tensor, fill_value: Tensor) -> Tensor: + """Sparse variant of torch.where. Supports sparse COO and hybrid sparse COO tensors. + + _sparse_coo_where implements the following invariant: + + _sparse_coo_where(mask, input, fill_value).to_dense(fill_value) == + torch.where(mask.to_dense(), input.to_dense(), torch.full(input.shape, fill_value)) + + where `a == b` means `assertEqual(a, b)`, mask is boolean sparse + tensor, and `to_dense(fill_value)` is like `to_dense()` except + that the unspecified elements are mapped to `fill_value` rather + than to `0`. + + Returns a sparse COO tensor with the following features: + + - all specified elements correspond to masked-in elements that + have the values of the input tensor. If there exists a masked-in + element (as specified by mask) that is not specified in the + input, in the result tensor, the corresponding element has value + 0. In the dense part of the sparse tensor, the masked-out + elements are replaced with fill_value. + + - all unspecified elements correspond to masked-out elements. + """ + + assert input.layout == torch.sparse_coo + assert mask.layout == input.layout + assert mask.shape == input.shape + assert mask.dense_dim() == input.dense_dim() # TODO: eliminate this restriction + + input = input.coalesce() + + # For set operations on sparse tensor indices, we'll convert + # multi-dimensional indices to 1-D indices for efficiency.
+ input_flat_indices = _sparse_coo_flatten_indices(input.indices(), input.shape[:input.sparse_dim()]) + mask_flat_indices = _sparse_coo_flatten_indices(mask.indices(), mask.shape[:mask.sparse_dim()]) + + # the set of mask flat indices that define masked-in elements: + if mask.dense_dim() > 0: + mask_values = _any(mask.values(), tuple(range(1, input.sparse_dim() + 1)), False) + else: + mask_values = mask.values() + maskin_flat_indices = mask_flat_indices[mask_values.nonzero()[:, 0]] + + def intersection(i1, i2): + union, counts = torch.cat([i1, i2]).unique(return_counts=True) + return union, torch.where(counts.gt(1)) + + def minus(i1, i2): + union, counts = torch.cat([i1, i2]).unique(return_counts=True) + return intersection(union[torch.where(counts.eq(1))], i1) + + def _apply(a): + obj, w = a + return obj[w] + + # the set of input flat indices of specified and masked-in elements: + maskin_input_flat_indices = _apply(intersection(maskin_flat_indices, input_flat_indices)) + _, w = intersection(input_flat_indices, maskin_input_flat_indices) + + # the indices and values of masked-in elements + where_input_indices = input.indices()[(slice(None),) + w] + where_input_values = input.values()[w] + + if mask.dense_dim() > 0: + # apply mask to the dense part of the input values: + _, w1 = intersection(mask_flat_indices, maskin_input_flat_indices) + where_mask_values = mask.values()[w1] + where_input_values = torch.where(where_mask_values, where_input_values, + where_input_values.new_full([], fill_value.item())) + + # the set of flat indices of unspecified input and masked-in elements: + maskin_zero_flat_indices = _apply(minus(maskin_flat_indices, maskin_input_flat_indices)) + + # the indices of masked-in zero elements + _, w = intersection(mask_flat_indices, maskin_zero_flat_indices) + where_zero_indices = mask.indices()[(slice(None),) + w] + + # construct result + n = where_zero_indices.size(1) + if n == 0: + # the input is coalesced, hence input_flat_indices are ordered + # and the result is guaranteed to be coalesced: + result = torch.sparse_coo_tensor(where_input_indices, where_input_values, input.shape) + return result._coalesced_(True) + + where_indices = torch.cat([where_input_indices, where_zero_indices], dim=1) + where_values = torch.cat([where_input_values, where_input_values.new_zeros((n,) + where_input_values.shape[1:])]) + result = torch.sparse_coo_tensor(where_indices, where_values, input.shape) + + # appending zero elements leads to uncoalesced sparse tensor + return result.coalesce() + + +def _sparse_coo_scatter_reduction_helper(op, + mask_input: Tensor, + dims: Tuple[int, ...], + keepdim: bool, + dtype: Optional[DType] = None) -> Tensor: + reduce = op.__name__ + valid_reductions = ['sum', 'prod', 'amax', 'amin'] + if reduce not in valid_reductions: + raise ValueError(f"op must be one of {' '.join(valid_reductions)}, but got {reduce} instead") + + output_dtype = dtype + values, indices = mask_input._values(), mask_input._indices() + input_dims = mask_input.dim() + num_sparse_dims = mask_input.sparse_dim() + reduced_sparse_dims = [] + retained_sparse_dims = [] + reduced_dense_dims = [] + + # promote dtype if specified + if values.dtype != output_dtype: + values = values.to(output_dtype) + + if keepdim: + output_shape = tuple(1 if i in dims else si for (i, si) in enumerate(mask_input.shape)) + else: + output_shape = tuple(si for (i, si) in enumerate(mask_input.shape) if i not in dims) + + for d in dims: + if (d >= input_dims): + continue + + if d < num_sparse_dims: + 
reduced_sparse_dims.append(d) + else: + reduced_dense_dims.append(d + 1 - num_sparse_dims) + + # Reduce dense dimensions + if len(reduced_dense_dims) > 0: + if reduce == "sum": + new_values = values + new_values = op(new_values, dim=reduced_dense_dims, keepdim=bool(keepdim)) + else: + # FIXME: Implement reductions for dense dimensions for ops with non-zero reduction identities + return NotImplemented + else: + new_values = values.clone() + + # Reduce sparse dimensions + if len(reduced_sparse_dims) == num_sparse_dims: + if reduce in {'amax', 'amin'} and new_values.size(0) == 0: + # IndexError: amax(): Expected reduction dim 0 to have non-zero size. + # sum()/prod() return the reduction identity when dim has size 0 but amax()/amin() do not + # See https://github.com/pytorch/pytorch/issues/61901 + new_values = _reduction_identity(reduce, new_values) + else: + new_values = op(new_values, dim=0) + if (keepdim): + for _ in range(num_sparse_dims): + new_values = new_values.unsqueeze(0) + return new_values.to(dtype=output_dtype).to_sparse() + else: + new_indices = indices.clone() + if keepdim: + # zero out reduced sparse dimensions if keepdim = True + # ensures that the call to torch.unique folds duplicated indices together while preserving the dimension + new_indices[reduced_sparse_dims, :] = 0 + else: + # remove reduced sparse dimensions if keepdim = False + if (len(reduced_sparse_dims) > 0): + retained_sparse_dims = [i for i in range(num_sparse_dims) if i not in set(reduced_sparse_dims)] + new_indices = new_indices.index_select(0, torch.tensor(retained_sparse_dims).to(mask_input.device)) + + # Use scatter_reduce to reduce items in the new_values tensor that correspond to the same indices in new_indices + if (new_indices.numel() > 0): + # lexsort indices and get index tensor for scatter reduction + new_indices, inverse_indices = torch.unique(new_indices, return_inverse=True, dim=1) + out_shape = list(new_values.shape) + out_shape[0] = new_indices.shape[1] + for _ in range(new_values.ndim - 1): + inverse_indices = inverse_indices.unsqueeze(-1) + scatter_indices = inverse_indices.expand(new_values.shape) + # FIXME: temporary workaround for issue with bfloat16/float16 remove when acctype is implemented for scatter_reduce + if output_dtype in {torch.bfloat16, torch.float16}: + new_values = new_values.to(torch.float) + out = new_values.new_empty(out_shape) + new_values = out.scatter_reduce_(0, scatter_indices, new_values, reduce=reduce, include_self=False) + new_values = new_values.to(dtype=output_dtype) + else: + out = new_values.new_empty(out_shape) + new_values = out.scatter_reduce_(0, scatter_indices, new_values, reduce=reduce, include_self=False) + + return torch.sparse_coo_tensor(new_indices, new_values, output_shape, dtype=output_dtype, device=mask_input.device) + + +def _sparse_csr_where(mask: Tensor, input: Tensor, fill_value: Tensor) -> Tensor: + """Sparse variant of torch.where. Supports sparse CSR tensors. + """ + # TODO: implement sparse CSR specific where operator for efficiency + return _sparse_coo_where(mask.to_sparse_coo(), input.to_sparse_coo(), fill_value).to_sparse_csr() + + +def _where(mask: Tensor, input: Tensor, fill_value: Tensor) -> Tensor: + """torch.where with sparse inputs support. 
+ + _where implements the following invariant: + + _where(mask, input, fill_value).to_dense(fill_value) == + torch.where(mask.to_dense(), input.to_dense(), torch.full(input.shape, fill_value)) + + where `a == b` means `assertEqual(a, b)`, mask is boolean sparse + tensor, and `to_dense(fill_value)` is like `to_dense()` except + that the unspecified elements are mapped to `fill_value` rather + than to `0`. + + Returns a sparse tensor with the following features: + + - all specified elements correspond to masked-in elements that + have the values of the input tensor. If there exists a masked-in + element (as specified by mask) that is not specified in the + input, in the result tensor, the corresponding element has value + 0. In the dense part of the sparse tensor, the masked-out + elements are replaced with fill_value. + + - all unspecified elements correspond to masked-out elements. + """ + if mask.layout == torch.strided: + if fill_value.dtype == torch.bool: + # Workaround internal assert failure in + # test_nvfuser_correctness__masked_mean_cuda_bool: We + # don't have an op for aten::new_full but it isn't a + # special case. Argument types: Tensor, int[], bool, int, + # int, Device, bool + fill = input.new_full([], int(fill_value.item())).to(dtype=torch.bool) + else: + fill = input.new_full([], fill_value.item()) + return torch.where(mask, input, fill) + elif mask.layout == torch.sparse_coo: + return _sparse_coo_where(mask, input, fill_value) + elif mask.layout == torch.sparse_csr: + return _sparse_csr_where(mask, input, fill_value) + else: + raise ValueError(f'_where expects strided or sparse COO or sparse CSR tensor but got {mask.layout}') + + def _input_mask(input: Tensor, *args, **kwargs) -> Tensor: """Return canonical input mask. - Canonical input mask is a boolean tensor with the same shape as - input and with (broadcasted) content of mask, if specified. + + A canonical input mask is defined as a boolean mask tensor that + shape and layout matches with the shape and the layout of the + input. + + The canonical input mask is computed from the :attr:`mask` tensor + content to meet the following criteria: + + 1. The shape of the canonical input mask is the same as the shape + of :attr:`input` tensor. If the mask tensor has a smaller shape + than the shape of the :attr:`input`, broadcasting rules will be + applied. Downcasting of mask is not supported. + + 2. The layout of the canonical input mask is the same as the + layout of the :attr:`input` tensor. If the mask has different + layout, it will be converted to the expected layout. In the + case of sparse COO layout, the canonical input mask will be + coalesced. + + 3. The dtype of the canonical input mask is torch.bool. If the + mask dtype is not bool then it will be converted to bool dtype + using `.to(dtype=bool)` method call. + + 4. The elements of the canonical input mask have boolean values + copied from the content of the :attr:`mask` tensor (after + possible broadcasting and dtype conversion transforms). In + general, the sparsity pattern of the sparse canonical input + mask need not to be the same as the sparsity pattern of the + sparse :attr:`input` tensor. 
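To make the four criteria above concrete, here is what canonicalization looks like for a strided input with a smaller, non-boolean mask; this is an illustrative use of public ops, not the internal code path (which additionally handles sparse COO/CSR layouts):

    import torch

    input = torch.arange(6.).reshape(2, 3)
    mask = torch.tensor([1, 0, 1])   # smaller shape (criterion 1), non-bool dtype (criterion 3)

    canonical = torch.broadcast_to(mask, input.shape).to(dtype=torch.bool)
    print(canonical)
    # tensor([[ True, False,  True],
    #         [ True, False,  True]])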
+ """ + if input.layout not in {torch.strided, torch.sparse_coo, torch.sparse_csr}: + raise ValueError(f'_input_mask expects strided or sparse COO or sparse CSR tensor but got {input.layout}') + mask = kwargs.get('mask') + + # default mask if mask is None: - inmask = input.new_ones(input.shape, dtype=torch.bool) - elif mask.ndim < input.ndim: - inmask = torch.broadcast_to(mask.clone(), input.shape).to(dtype=torch.bool) - elif mask.ndim > input.ndim: - raise IndexError("_input_mask expected broadcastable mask (got mask dimensionality higher than of the input)") - elif mask.shape != input.shape: - inmask = torch.broadcast_to(mask.clone(), input.shape).to(dtype=torch.bool) - else: - inmask = mask.to(dtype=torch.bool) - return inmask + raise ValueError('_input_mask requires explicit mask') + + # mask shape must match with input shape + if mask.shape != input.shape: + if mask.ndim > input.ndim: + raise IndexError("_input_mask expected broadcastable mask (got mask dimensionality higher than of the input)") + if mask.layout == torch.strided: + mask = torch.broadcast_to(mask.clone(), input.shape).to(dtype=torch.bool) + elif mask.layout == torch.sparse_coo: + mask = torch._sparse_broadcast_to(mask, input.shape) + else: + assert mask.layout == torch.sparse_csr + # Broadcasting of CSR tensors is not implemented. Working + # around by using COO layout. + mask = torch._sparse_broadcast_to(mask.to_sparse(), input.shape).to_sparse_csr() + + # mask layout must match with input layout + if mask.layout != input.layout: + if input.layout == torch.strided: + mask = mask.to_dense() + elif input.layout == torch.sparse_coo: + if mask.layout == torch.strided: + mask = mask.to_sparse(input.sparse_dim()) + else: + mask = mask.to_sparse() + else: + assert input.layout == torch.sparse_csr + mask = mask.to_sparse_csr() + + # sparse mask must be coalesced + if mask.layout == torch.sparse_coo: + mask = mask.coalesce() + + # mask is a boolean tensor + mask = mask.to(dtype=torch.bool) + + return mask def _output_mask(op, input: Tensor, *args, **kwargs) -> Tensor: """Return output mask of masked operation applied to given arguments. """ if callable(op): - is_reduction = op.__name__ in {'sum', 'prod', 'amax', 'amin', 'mean', 'norm', 'var'} - is_normalization = op.__name__ in {'softmax', 'log_softmax', 'softmin', 'normalize'} + is_reduction = op.__name__ in {'sum', 'prod', 'amax', 'amin', 'argmax', 'argmin', 'mean', 'norm', 'var', 'std'} + is_normalization = op.__name__ in {'softmax', 'log_softmax', 'softmin', 'normalize', 'cumsum', 'cumprod'} if is_reduction: if op.__name__ == 'norm': if args: @@ -404,10 +761,7 @@ def _output_mask(op, input: Tensor, *args, **kwargs) -> Tensor: outmask = _input_mask(input, *args, **kwargs) keepdim = kwargs.get('keepdim', False) dim_ = _canonical_dim(dim, input.ndim) - # Workaround https://github.com/pytorch/pytorch/issues/56586 - for d in reversed(dim_): - outmask = outmask.any(dim=d, keepdim=bool(keepdim)) - return outmask + return _any(outmask, dim_, bool(keepdim)) elif is_normalization: return _input_mask(input, *args, **kwargs) else: @@ -416,6 +770,19 @@ def _output_mask(op, input: Tensor, *args, **kwargs) -> Tensor: raise ValueError(f'_output_mask expected masked operation (got {type(op).__name__} object)') +def _combine_input_and_mask(op, input: Tensor, mask, *args) -> Tensor: + """Return input with masked-out elements eliminated for the given operations. 
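Eliminating masked-out elements here concretely means filling them with the reduction identity of the target op (via _reduction_identity and _where in the body that follows), after which the ordinary reduction can be applied. A small equivalent sketch using only public ops:

    import torch

    x = torch.tensor([[1., 2., 3.],
                      [4., 5., 6.]])
    m = torch.tensor([[True, False, True],
                      [False, True, True]])

    # sum: identity 0, so masked-out entries contribute nothing
    print(torch.where(m, x, torch.zeros((), dtype=x.dtype)).sum(dim=1))   # tensor([ 4., 11.])

    # amax: identity -inf, so masked-out entries never win
    print(torch.where(m, x, torch.full((), float('-inf'))).amax(dim=1))   # tensor([3., 6.])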
+ """ + if mask is None: + return input + canonical_mask = _input_mask(input, mask=mask) + if callable(op): + fill_value = _reduction_identity(op.__name__, input, *args) + return _where(canonical_mask, input, fill_value) + else: + raise ValueError(f'_combine_input_and_mask expected masked operation (got {type(op).__name__} object)') + + @_apply_docstring_templates def sum(input: Tensor, dim: DimOrDims = None, @@ -425,16 +792,28 @@ def sum(input: Tensor, mask: Optional[Tensor] = None) -> Tensor: # __doc__ is generated by _apply_docstring_templates decorator if dtype is None: - dtype = input.dtype - # TODO: What follows is a reference implementation of a masked sum - # operation that is to be replaced with an optimized one and - # extended to support other layouts. + # promote integer types to int64 when output dtype is not specified + if input.layout == torch.sparse_csr: + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + # csr.to(dtype=torch.int64) is not implemented, so + # using coo.to on input to ensure the promoted dtype + input = input.to_sparse_coo().to(dtype=torch.int64).to_sparse_csr() + else: + dtype = input.dtype + else: + dtype = input.dtype + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + dtype = torch.int64 + dim_ = _canonical_dim(dim, input.ndim) + mask_input = _combine_input_and_mask(sum, input, mask) if input.layout == torch.strided: - mask_input = input if mask is None else torch.where(mask, input, input.new_zeros([])) - dim_ = _canonical_dim(dim, input.ndim) return torch.sum(mask_input, dim_, bool(keepdim), dtype=dtype) + elif input.layout == torch.sparse_coo: + return _sparse_coo_scatter_reduction_helper(torch.sum, mask_input, dim_, bool(keepdim), dtype) + elif input.layout == torch.sparse_csr: + return torch._sparse_csr_sum(mask_input, dim=list(dim_), keepdim=bool(keepdim), dtype=dtype) else: - raise ValueError(f'masked sum expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked sum expects strided, sparse_coo or sparse_csr tensor (got {input.layout} tensor)') @_apply_docstring_templates @@ -445,19 +824,81 @@ def prod(input: Tensor, dtype: Optional[DType] = None, mask: Optional[Tensor] = None) -> Tensor: # __doc__ is generated by _apply_docstring_templates decorator + if dtype is None: + # promote integer types to int64 when output dtype is not specified + if input.layout == torch.sparse_csr: + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + # csr.to(dtype=torch.int64) is not implemented, so + # using coo.to on input to ensure the promoted dtype + input = input.to_sparse_coo().to(dtype=torch.int64).to_sparse_csr() + else: + dtype = input.dtype + else: + dtype = input.dtype + if input.dtype in {torch.uint8, torch.bool, torch.int8, torch.int16, torch.int32}: + dtype = torch.int64 + dim_ = _canonical_dim(dim, input.ndim) + mask_input = _combine_input_and_mask(prod, input, mask) if input.layout == torch.strided: - mask_input = input if mask is None else torch.where(mask, input, torch.ones_like(input)) - dim_ = _canonical_dim(dim, input.ndim) - # Workaround https://github.com/pytorch/pytorch/issues/56586 result = mask_input + result = result.to(dtype=dtype) for d in reversed(dim_): result = result.prod(dim=d, keepdim=bool(keepdim)) - if dtype is not None: - result = result.to(dtype=dtype) return result + elif input.layout == torch.sparse_coo: + if mask is None: + # See comment in the sparse_csr branch, the same issue arises for sparse_coo 
tensors + raise ValueError('masked prod expects explicit mask for sparse_coo tensor input') + return _sparse_coo_scatter_reduction_helper(torch.prod, mask_input, dim_, bool(keepdim), dtype) + elif input.layout == torch.sparse_csr: + if mask is None: + # mask is None corresponds to all-True mask. The + # unspecified elements in the CSR tensor correspond to + # zero values. Hence, the prod reduction result is + # automatically zero unless all elements are specified. + # A semi-optimal way to take this into account is to use: + # + # masked_prod(csr, ..., mask=None) == torch._sparse_csr_prod(csr, ...) * all(csr.nonzero(), ...) + # + # but that requires implementing `all` and `nonzero` + # support for sparse csr tensors. + raise ValueError('masked prod expects explicit mask for sparse_csr tensor input') + return torch._sparse_csr_prod(mask_input, dim=list(dim_), keepdim=bool(keepdim), dtype=dtype) + else: + raise ValueError(f'masked prod expects strided, sparse_coo or sparse_csr tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def cumsum(input: Tensor, + dim: int, + *, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + if dtype is None: + dtype = input.dtype + dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(sum, input, mask) + if input.layout == torch.strided: + return torch.cumsum(mask_input, dim_, dtype=dtype).to(dtype=dtype) + else: + raise ValueError(f'masked cumsum expects strided tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def cumprod(input: Tensor, + dim: int, + *, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + if dtype is None: + dtype = input.dtype + dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(prod, input, mask) + if input.layout == torch.strided: + return torch.cumprod(mask_input, dim_, dtype=dtype).to(dtype=dtype) else: - raise ValueError(f'masked prod expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked cumprod expects strided tensor (got {input.layout} tensor)') @_apply_docstring_templates @@ -479,16 +920,19 @@ def amax(input: Tensor, {reduction_example}""" if dtype is None: dtype = input.dtype + + mask_input = _combine_input_and_mask(amax, input, mask) + dim_ = _canonical_dim(dim, mask_input.ndim) if input.layout == torch.strided: - if mask is None: - mask_input = input - else: - identity = input.new_full([], _reduction_identity('amax', input)) - mask_input = torch.where(mask, input, identity) - dim_ = _canonical_dim(dim, mask_input.ndim) return torch.amax(mask_input, dim_, bool(keepdim)).to(dtype=dtype) + elif input.layout == torch.sparse_coo: + if mask is None: + # See comment in the sparse_csr branch of prod, a similar issue arises here + # where unspecified elements along a dimension may need to be reduced with the result + raise ValueError('masked amax expects explicit mask for sparse_coo tensor input') + return _sparse_coo_scatter_reduction_helper(torch.amax, mask_input, dim_, bool(keepdim), dtype) else: - raise ValueError(f'masked amax expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked amax expects strided or sparse_coo tensor (got {input.layout} tensor)') @_apply_docstring_templates @@ -510,16 +954,63 @@ def amin(input: Tensor, {reduction_example}""" if dtype is None: dtype = input.dtype + + mask_input = _combine_input_and_mask(amin, input, mask) + dim_ = _canonical_dim(dim, mask_input.ndim) if input.layout == torch.strided: - if mask is 
None: - mask_input = input - else: - identity = input.new_full([], _reduction_identity('amin', input)) - mask_input = torch.where(mask, input, identity) - dim_ = _canonical_dim(dim, mask_input.ndim) return torch.amin(mask_input, dim_, bool(keepdim)).to(dtype=dtype) + elif input.layout == torch.sparse_coo: + if mask is None: + # See comment in the sparse_csr branch of prod, a similar issue arises here + # where unspecified elements along a dimension may need to be reduced with the result + raise ValueError('masked amin expects explicit mask for sparse_coo tensor input') + return _sparse_coo_scatter_reduction_helper(torch.amin, mask_input, dim_, bool(keepdim), dtype) + else: + raise ValueError(f'masked amin expects strided or sparse_coo tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def argmax(input: Tensor, + dim: int = None, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +{reduction_identity_dtype} +{reduction_args} +{reduction_example}""" + if dtype is None: + dtype = input.dtype + mask_input = _combine_input_and_mask(argmax, input, mask) + if input.layout == torch.strided: + return torch.argmax(mask_input, dim, bool(keepdim)).to(dtype=dtype) + else: + raise ValueError(f'masked argmax expects strided tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def argmin(input: Tensor, + dim: int = None, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +{reduction_identity_dtype} +{reduction_args} +{reduction_example}""" + if dtype is None: + dtype = input.dtype + mask_input = _combine_input_and_mask(argmin, input, mask) + if input.layout == torch.strided: + return torch.argmin(mask_input, dim, bool(keepdim)).to(dtype=dtype) else: - raise ValueError(f'masked amin expects strided tensor (got {input.layout} tensor)') @@ -547,9 +1038,14 @@ def mean(input: Tensor, if dtype is None: dtype = input.dtype if input.layout == torch.strided: - inmask = _input_mask(input, mask=mask) - count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=keepdim, mask=inmask) - total = sum(input, dim, keepdim=keepdim, dtype=dtype, mask=inmask) + if mask is None: + # TODO: compute count analytically + count = sum(torch.ones(input.shape, dtype=torch.int64, device=input.device), dim, keepdim=keepdim) + total = sum(input, dim, keepdim=keepdim, dtype=dtype) + else: + inmask = _input_mask(input, mask=mask) + count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=keepdim, mask=inmask) + total = sum(input, dim, keepdim=keepdim, dtype=dtype, mask=inmask) return total / count else: raise ValueError(f'masked sum expects strided tensor (got {input.layout} tensor)') @@ -577,35 +1073,22 @@ def norm(input: Tensor, {reduction_example}""" if dtype is None: dtype = input.dtype + mask_input = _combine_input_and_mask(norm, input, mask, ord) if input.layout == torch.strided: - identity = input.new_full([], _reduction_identity('norm', input, ord)) - mask_input = input if mask is None else torch.where(mask, input, identity) dim_ = _canonical_dim(dim, input.ndim) return torch.linalg.vector_norm(mask_input, ord, dim_, bool(keepdim), dtype=dtype) else: raise ValueError(f'masked norm expects strided tensor (got {input.layout}
tensor)') -@_apply_docstring_templates -def var(input: Tensor, - dim: DimOrDims = None, - unbiased: Optional[bool] = False, - *, - keepdim: Optional[bool] = False, - dtype: Optional[DType] = None, - mask: Optional[Tensor] = None) -> Tensor: - """\ -{reduction_signature} - -{reduction_descr} - -The identity value of sample variance operation is undefined. The -elements of output tensor with strided layout, that correspond to -fully masked-out elements, have ``nan`` values. - -{reduction_args} - -{reduction_example}""" +def std_var(input: Tensor, + dim: DimOrDims = None, + unbiased: Optional[bool] = False, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None, + take_sqrt: Optional[bool] = False) -> Tensor: if dtype is None: dtype = input.dtype if not (dtype.is_floating_point or dtype.is_complex): @@ -614,23 +1097,88 @@ def var(input: Tensor, if not (compute_dtype.is_floating_point or compute_dtype.is_complex): compute_dtype = torch.float32 if input.layout == torch.strided: - inmask = _input_mask(input, mask=mask) - count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=True, mask=inmask) - sample_total = sum(input, dim, keepdim=True, dtype=dtype, mask=inmask) + if mask is None: + # TODO: compute count analytically + count = sum(torch.ones(input.shape, dtype=torch.int64, device=input.device), dim, keepdim=True) + sample_total = sum(input, dim, keepdim=True, dtype=dtype) + else: + inmask = _input_mask(input, mask=mask) + count = sum(inmask.new_ones(input.shape, dtype=torch.int64), dim, keepdim=True, mask=inmask) + sample_total = sum(input, dim, keepdim=True, dtype=dtype, mask=inmask) # TODO: replace torch.subtract/divide/square/maximum with # masked subtract/divide/square/maximum when these will be # available. sample_mean = torch.divide(sample_total, count) x = torch.subtract(input, sample_mean) - total = sum(x * x.conj(), dim, keepdim=keepdim, dtype=compute_dtype, mask=inmask) + if mask is None: + total = sum(x * x.conj(), dim, keepdim=keepdim, dtype=compute_dtype) + else: + total = sum(x * x.conj(), dim, keepdim=keepdim, dtype=compute_dtype, mask=inmask) if not keepdim: count = count.reshape(total.shape) if unbiased: count = torch.subtract(count, 1) count = torch.maximum(count, count.new_zeros([])) - return torch.divide(total, count).to(dtype=dtype) + output = torch.divide(total, count).to(dtype=dtype) + if take_sqrt: + output = torch.sqrt(output) + return output else: - raise ValueError(f'masked var expects strided tensor (got {input.layout} tensor)') + raise ValueError(f'masked std/var expects strided tensor (got {input.layout} tensor)') + + +@_apply_docstring_templates +def var(input: Tensor, + dim: DimOrDims = None, + unbiased: Optional[bool] = False, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +The identity value of sample variance operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. 
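The std_var helper above computes both statistics in one pass: masked std is the square root of the masked variance (take_sqrt=True). A small numeric check of that relationship with public ops only, for unbiased=False and a mask that keeps the first two entries of each row (illustrative, not the internal code path):

    import torch

    x = torch.tensor([[1., 2., 10.],
                      [3., 7., 10.]])
    m = torch.tensor([[True, True, False],
                      [True, True, False]])

    count = m.sum(dim=1, keepdim=True)
    mean = (x * m).sum(dim=1, keepdim=True) / count
    var = (((x - mean) ** 2) * m).sum(dim=1) / count.squeeze(1)
    print(var)          # tensor([0.2500, 4.0000])
    print(var.sqrt())   # tensor([0.5000, 2.0000]), i.e. the masked std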
+{reduction_args} +{reduction_example}""" + return std_var( + input=input, + dim=dim, + unbiased=unbiased, + keepdim=keepdim, + dtype=dtype, + mask=mask, + take_sqrt=False, + ) + + +@_apply_docstring_templates +def std(input: Tensor, + dim: DimOrDims = None, + unbiased: Optional[bool] = False, + *, + keepdim: Optional[bool] = False, + dtype: Optional[DType] = None, + mask: Optional[Tensor] = None) -> Tensor: + """\ +{reduction_signature} +{reduction_descr} +The identity value of sample standard deviation operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. +{reduction_args} +{reduction_example}""" + return std_var( + input=input, + dim=dim, + unbiased=unbiased, + keepdim=keepdim, + dtype=dtype, + mask=mask, + take_sqrt=True + ) @_apply_docstring_templates @@ -642,10 +1190,8 @@ def softmax(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(amax, input, mask) if input.layout == torch.strided: - fill = input.new_full([], _reduction_identity('amax', input)) - inmask = _input_mask(input, mask=mask) - mask_input = torch.where(inmask, input, fill) return torch.nn.functional.softmax(mask_input, dim_, dtype=dtype) else: raise ValueError(f'masked softmax expects strided tensor (got {input.layout} tensor)') @@ -660,10 +1206,8 @@ def log_softmax(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(amax, input, mask) if input.layout == torch.strided: - fill = input.new_full([], _reduction_identity('amax', input)) - inmask = _input_mask(input, mask=mask) - mask_input = torch.where(inmask, input, fill) return torch.nn.functional.log_softmax(mask_input, dim_, dtype=dtype) else: raise ValueError(f'masked log_softmax expects strided tensor (got {input.layout} tensor)') @@ -678,10 +1222,8 @@ def softmin(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + mask_input = _combine_input_and_mask(amin, input, mask) if input.layout == torch.strided: - fill = input.new_full([], _reduction_identity('amin', input)) - inmask = _input_mask(input, mask=mask) - mask_input = torch.where(inmask, input, fill) return torch.nn.functional.softmin(mask_input, dim_, dtype=dtype) else: raise ValueError(f'masked softmin expects strided tensor (got {input.layout} tensor)') @@ -698,13 +1240,12 @@ def normalize(input: Tensor, if dtype is None: dtype = input.dtype dim_ = _canonical_dim(dim, input.ndim)[0] + # TODO: eliminate mask_input as unnecessary when using masked divide. + mask_input = _combine_input_and_mask(sum, input, mask) if input.layout == torch.strided: nrm_ = norm(input, ord, dim, keepdim=True, dtype=dtype, mask=mask) # TODO: replace torch.maximum with masked maximum when available. denom = torch.maximum(nrm_, nrm_.new_full([], eps)) - # TODO: eliminate mask_input as unnecessary when using masked divide. - inmask = _input_mask(input, mask=mask) - mask_input = input if mask is None else torch.where(inmask, input, input.new_zeros([])) # TODO: replace torch.divide with masked divide when available. return torch.divide(mask_input, denom) else: diff --git a/torch/_masked/_docs.py b/torch/_masked/_docs.py new file mode 100644 index 000000000000..0949c5b21c19 --- /dev/null +++ b/torch/_masked/_docs.py @@ -0,0 +1,1046 @@ +# -*- coding: utf-8 -*- +# This file is generated, do not modify it! 
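The softmax, log_softmax, softmin and normalize changes above all route through _combine_input_and_mask, so masked softmax amounts to filling masked-out entries with the amax identity (-inf) and applying a regular softmax. A quick check with public ops, reusing the example data that appears in the generated docstrings below:

    import torch

    input = torch.tensor([[-3., -2., -1.], [0., 1., 2.]])
    mask = torch.tensor([[True, False, True], [False, False, False]])

    filled = torch.where(mask, input, torch.full((), float('-inf')))
    print(torch.softmax(filled, dim=1))
    # tensor([[0.1192, 0.0000, 0.8808],
    #         [   nan,    nan,    nan]])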
+# +# To update this file, run the update masked docs script as follows: +# +# python tools/update_masked_docs.py +# +# The script must be called from an environment where the development +# version of torch package can be imported and is functional. +# + +amax_docstring = """amax(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns maximum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of maximum operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``-inf``, ``0``, and ``-2147483648``, respectively. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in maximum computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of maximum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.amax(input, 1, mask=mask) + tensor([ -1, -9223372036854775808]) +""" + +amin_docstring = """amin(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns minimum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. 
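The dtype-dependent identity values quoted in these docstrings (for example -inf, 0 and -2147483648 for amax above) come directly from the floating point and integer limits used by _reduction_identity. The integer cases can be checked with torch.iinfo:

    import torch

    # identities that start a masked max reduction (amax/argmax)
    print(torch.iinfo(torch.int32).min)   # -2147483648
    print(torch.iinfo(torch.uint8).min)   # 0

    # identities that start a masked min reduction (amin/argmin)
    print(torch.iinfo(torch.int32).max)   # 2147483647
    print(torch.iinfo(torch.uint8).max)   # 255

    # floating-point inputs use -inf / inf instead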
+ +The identity value of minimum operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``inf``, ``255``, and ``2147483647``, respectively. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in minimum computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of minimum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.amin(input, 1, mask=mask) + tensor([ -3, 9223372036854775807]) +""" + +argmax_docstring = """argmax(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor +Returns argmax of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. +The identity value of argmax operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``-inf``, ``0``, and ``-2147483648``, respectively. +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). 
+ +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in argmax computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of argmax operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which argmax is computed. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.argmax(input, 1, mask=mask) + tensor([2, 0]) +""" + +argmin_docstring = """argmin(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor +Returns argmin of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. +The identity value of argmin operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``inf``, ``255``, and ``2147483647``, respectively. +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in argmin computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of argmin operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. 
+ +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which argmin is computed. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.argmin(input, 1, mask=mask) + tensor([0, 0]) +""" + +cumprod_docstring = """cumprod(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns cumulative_prod of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Cumsum of i-th element in ``x`` is +defined as ``prod(x[:i])``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +cumulative_prod computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the cumulative_prod output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which cumulative_prod is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. 
+ +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.cumprod(input, 1, mask=mask) + tensor([[-3., -3., 3.], + [ 1., 1., 1.]]) +""" + +cumsum_docstring = """cumsum(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns cumulative_sum of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Cumsum of i-th element in ``x`` is +defined as ``sum(x[:i])``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +cumulative_sum computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the cumulative_sum output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which cumulative_sum is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.cumsum(input, 1, mask=mask) + tensor([[-3., -3., -4.], + [ 0., 0., 0.]]) +""" + +log_softmax_docstring = """log_softmax(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns log_softmax of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. LogSoftmax of i-th element in ``x`` is +defined as ``log(exp(x[i])/sum(exp(x)))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +log_softmax computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. 
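The cumsum example output above follows from the same identity-fill semantics: masked-out positions are treated as the additive identity 0 before a regular cumulative sum is taken. An equivalent computation with public ops (illustrative, not the internal code path):

    import torch

    input = torch.tensor([[-3., -2., -1.], [0., 1., 2.]])
    mask = torch.tensor([[True, False, True], [False, False, False]])

    filled = torch.where(mask, input, torch.zeros((), dtype=input.dtype))
    print(torch.cumsum(filled, dim=1))
    # tensor([[-3., -3., -4.],
    #         [ 0.,  0.,  0.]])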
+ +The mask of the log_softmax output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which log_softmax is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.log_softmax(input, 1, mask=mask) + tensor([[-2.1269, -inf, -0.1269], + [ nan, nan, nan]]) +""" + +mean_docstring = """mean(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns mean of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +By definition, the identity value of a mean operation is the mean +value of the tensor. If all elements of the input tensor along given +dimension(s) :attr:`dim` are masked-out, the identity value of the +mean is undefined. Due to this ambiguity, the elements of output +tensor with strided layout, that correspond to fully masked-out +elements, have ``nan`` values. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in mean computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of mean operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. 
+ dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.mean(input, 1, mask=mask) + tensor([-2., nan]) +""" + +norm_docstring = """norm(input, ord, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns norm of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of norm operation, which is used to start the +reduction, is ``0.0``, except for ``ord=-inf`` it is +``inf``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in norm computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of norm operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + ord (int, float, optional): the order of vector norm. Default: 2. + See :func:`torch.linalg.vector_norm` for a list of supported norms. + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. 
+ +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.norm(input, 2.0, 1, mask=mask) + tensor([3.1623, 0.0000]) +""" + +normalize_docstring = """normalize(input, ord, dim, *, eps=1e-12, dtype=None, mask=None) -> Tensor + +Returns normalize of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Normalize of i-th element in ``x`` is +defined as ``x[i]/max(norm(x, p), eps)``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +normalize computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the normalize output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + ord (int, float): the order of vector norm. Default: 2. + See :func:`torch.linalg.vector_norm` for a list of supported norms. + dim (int): the dimension along which normalize is computed. + +Keyword args: + eps (float, optional): small value to avoid division by zero. Default: 1e-12. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.normalize(input, 2.0, 1, mask=mask) + tensor([[-0.9487, 0.0000, -0.3162], + [ 0.0000, 0.0000, 0.0000]]) +""" + +prod_docstring = """prod(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns product of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of product operation, which is used to start the reduction, is ``1``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). 
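+
+Because the identity value of product is ``1``, replacing the masked-out
+elements with ``1`` before a regular product gives the same result for
+finite inputs (an informal illustration, not the actual implementation)::
+
+    >>> input = torch.tensor([[-3, -2, -1], [0, 1, 2]])
+    >>> mask = torch.tensor([[True, False, True], [False, False, False]])
+    >>> torch.prod(input.masked_fill(~mask, 1), 1)
+    tensor([3, 1])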
+ +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in product computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of product operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.prod(input, 1, mask=mask) + tensor([3, 1]) +""" + +softmax_docstring = """softmax(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns softmax of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Softmax of i-th element in ``x`` is +defined as ``exp(x[i])/sum(exp(x))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +softmax computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the softmax output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which softmax is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. 
If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.softmax(input, 1, mask=mask) + tensor([[0.1192, 0.0000, 0.8808], + [ nan, nan, nan]]) +""" + +softmin_docstring = """softmin(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns softmin of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Softmin of i-th element in ``x`` is +defined as ``exp(-x[i])/sum(exp(-x))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +softmin computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the softmin output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which softmin is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.softmin(input, 1, mask=mask) + tensor([[0.8808, 0.0000, 0.1192], + [ nan, nan, nan]]) +""" + +std_docstring = """std(input, dim, unbiased, *, keepdim=False, dtype=None, mask=None) -> Tensor +Returns standard_deviation of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. +The identity value of sample standard deviation operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. 
Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in standard_deviation computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of standard_deviation operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + unbiased (bool): when True, use Bessel’s correction, otherwise, compute + the uncorrected sample variance. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.std(input, 1, False, mask=mask) + tensor([1., nan]) +""" + +sum_docstring = """sum(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns sum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of sum operation, which is used to start the reduction, is ``0``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in sum computation, otherwise the element is +ignored. 
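+
+Because the identity value of sum is ``0``, zeroing out the masked-out
+elements before a regular sum gives the same result for finite inputs (an
+informal illustration, not the actual implementation)::
+
+    >>> input = torch.tensor([[-3, -2, -1], [0, 1, 2]])
+    >>> mask = torch.tensor([[True, False, True], [False, False, False]])
+    >>> torch.sum(input * mask, 1)
+    tensor([-4, 0])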
+ +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of sum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.sum(input, 1, mask=mask) + tensor([-4, 0]) +""" + +var_docstring = """var(input, dim, unbiased, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns variance of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of sample variance operation is undefined. The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in variance computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of variance operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. 
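+
+For instance, for the 2x3 mask used in the example below, only the first
+row contains any valid elements::
+
+    >>> mask = torch.tensor([[True, False, True], [False, False, False]])
+    >>> torch.any(torch.broadcast_to(mask, (2, 3)), 1)
+    tensor([ True, False])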
+ +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + unbiased (bool): when True, use Bessel’s correction, otherwise, compute + the uncorrected sample variance. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.var(input, 1, False, mask=mask) + tensor([1., nan]) +""" diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py new file mode 100644 index 000000000000..8fa3661e6714 --- /dev/null +++ b/torch/_meta_registrations.py @@ -0,0 +1,68 @@ +import torch + +meta_lib = torch.library.Library("aten", "IMPL", "Meta") + +def toRealValueType(dtype): + from_complex = { + torch.complex32: torch.half, + torch.cfloat: torch.float, + torch.cdouble: torch.double + } + return from_complex.get(dtype, dtype) + +# Implementations below are taken from https://github.com/albanD/subclass_zoo/blob/main/python_meta_tensor.py +@torch.library.impl(meta_lib, "index_select") +def meta_index_select(self, dim, index): + result_size = list(self.size()) + if self.dim() > 0: + result_size[dim] = index.numel() + return self.new_empty(result_size) + +@torch.library.impl(meta_lib, "index_select.out") +def meta_index_select_out(self, dim, index, out): + torch._resize_output_(out, self.size(), self.device) + return out.copy_(torch.index_select(self, dim, index)) + +@torch.library.impl(meta_lib, "abs") +def meta_abs(self): + if self.is_complex(): + float_type = toRealValueType(self.dtype) + return self.new_empty(self.size(), dtype=float_type) + else: + return self.new_empty(self.size()) + +@torch.library.impl(meta_lib, "abs.out") +def meta_abs_out(self, out): + torch._resize_output_(out, self.size(), self.device) + return out.copy_(torch.abs(self)) + +@torch.library.impl(meta_lib, "max") +def meta_max(self): + return self.new_empty(()) + +@torch.library.impl(meta_lib, "min") +def meta_min(self): + return self.new_empty(()) + +def squareCheckInputs(self, f_name): + assert self.dim() >= 2, f"{f_name}: The input tensor must have at least 2 dimensions." + # TODO: I think the error message has the -2 and -1 swapped. 
If you fix + # it fix the C++ squareCheckInputs too + assert self.size(-1) == self.size(-2), \ + f"{f_name}: A must be batches of square matrices, but they are {self.size(-1)} by {self.size(-2)} matrices" + +def checkUplo(uplo: str): + uplo_uppercase = uplo.upper() + assert len(uplo) == 1 and uplo_uppercase == 'U' or uplo_uppercase == 'L', \ + f"Expected UPLO argument to be 'L' or 'U', but got {uplo}" + +@torch.library.impl(meta_lib, "linalg_eigh") +def meta_linalg_eigh(self, uplo="L"): + squareCheckInputs(self, "linalg_eigh") + checkUplo(uplo) + real_dtype = toRealValueType(self.dtype) + assert self.dim() >= 2 + values = self.new_empty(self.shape, dtype=real_dtype) + values.transpose_(-2, -1) + vectors = self.new_empty(self.shape[:-1]) + return (values, vectors) diff --git a/torch/_ops.py b/torch/_ops.py index 9116d2256c03..9728998c5652 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -32,13 +32,17 @@ def __init__(self, overloadpacket, op, schema): self._op = op self._schema = schema self._overloadpacket = overloadpacket + self._overloadname = 'default' if schema.overload_name == '' else schema.overload_name + self.__name__ = "{}.{}".format(self._schema.name.split("::")[1], self._overloadname) + self.__module__ = overloadpacket.__module__ + op.__module__ = overloadpacket.__module__ # it's a no-op since OpOverload object is immutable and must be unique for a given op overload. def __deepcopy__(self, memo=None): return self - def __str__(self): - return "OpOverload(op='{}.{}', overload='{}')".format(*self._schema.name.split("::"), self.overload_name) + def __repr__(self): + return "".format(*self._schema.name.split("::"), self._overloadname) def __call__(self, *args, **kwargs): return self._op(*args, **kwargs or {}) @@ -46,17 +50,15 @@ def __call__(self, *args, **kwargs): def __getattr__(self, key): return getattr(self._op, key) - # `my_namespace::my_op` - @property - def name(self): - return "{}.{}".format(*self._schema.name.split("::")) + def __hash__(self): + return hash(self._op) - @property - def overload_name(self): - return self._schema.overload_name + # `my_namespace.my_op_name.overload_name` + def __str__(self): + return "{}.{}.{}".format(*self._schema.name.split("::"), self._overloadname) @property - def overload_packet(self): + def overloadpacket(self): return self._overloadpacket @property @@ -68,27 +70,26 @@ def op(self): # OpOverloadPacket class contains pointer to a base unresolved operator that doesn't correspond to a specific operator # You can obtain an OpOverload object through attribute query. class OpOverloadPacket: - def __init__(self, qualified_op_name, op_name, op): + def __init__(self, qualified_op_name, op_name, op, overload_names): # These attributes are accessible on the object through the properties # defined below but are immutable self._qualified_op_name = qualified_op_name - self._op_name = op_name + self.__name__ = op_name self._op = op + self._overload_names = overload_names # it's a no-op since OpOverloadPacket object is immutable and must be unique for a given op. 
def __deepcopy__(self, memo=None): return self - def __str__(self): - return "OpOverloadPacket(op='{}.{}')".format(*self._qualified_op_name.split("::")) + def __repr__(self): + return "".format(*self._qualified_op_name.split("::")) - @property - def qualified_op_name(self): - return "{}.{}".format(*self._qualified_op_name.split("::")) + def __hash__(self): + return hash(self._op) - @property - def op_name(self): - return self._op_name + def __str__(self): + return "{}.{}".format(*self._qualified_op_name.split("::")) @property def op(self): @@ -99,39 +100,67 @@ def __getattr__(self, key): if key == '__file__': return 'torch.ops' + # ensure that query for dunder attributes that does not exist on + # opoverloadpacket but instead exists on the self._op object does not unnecessarily call + # `_get_operation_overload` (which is an expensive operation). + # This is done to prevent any potential slowdown. This list can be extended + # if there exists other attributes like `__name__` that only exist on self._op and not on the + # opoverloadpacket. + # This is ok since we are guaranteed that an overload name for an aten op can't start with '__' try: + if key.startswith('__'): + return getattr(self._op, key) + except AttributeError: + # for consistency because it seems weird to + # throw an attribute error with a message containing + # an object name different from the one the attribute + # query was performed on. + raise AttributeError("'{}' can't have an overload name beginning with '__' and the " + "underlying op {} has no attribute {} either." + .format(str(self), str(self._op), key)) from None + + try: + # This is ok since we are guaranteed that an overload name for an aten op can't be 'default' use_key = '' if key == 'default' else key # TODO: disallow access to overloads registered by JIT - op_ = torch._C._get_operation_overload(self._qualified_op_name, use_key) + op_ = torch._C._get_operation_overload( + self._qualified_op_name, use_key) schema = torch._C._get_schema(self._qualified_op_name, use_key) overload = OpOverload(self, op_, schema) # cache the overload object setattr(self, key, overload) return overload except RuntimeError: - try: - # This is added to maintain bc in case the user queries an attribute that exists on `self._op` - # which used to be returned before instead of the OpOverloadPacket - out = getattr(self._op, key) - return out - except AttributeError: - raise AttributeError("'{}' object has no attribute '{}'".format(str(self), key)) from None + raise AttributeError( + "The underlying op of '{}' has no overload name '{}'".format(str(self), key) + ) from None def __call__(self, *args, **kwargs): - # overloading __call__ to ensure torch.ops.foo.bar() is still callable from JIT - # We save the function ptr as the `op` attribute on OpOverloadPacket to access it here. + # overloading __call__ to ensure torch.ops.foo.bar() + # is still callable from JIT + # We save the function ptr as the `op` attribute on + # OpOverloadPacket to access it here. return self._op(*args, **kwargs or {}) + # TODO: use this to make a __dir__ + def overloads(self): + return [n if n else "default" for n in self._overload_names] + # Resolution of torch.fn is different from torch.ops.aten.fn -# torch.fn uses the Python argparser, matches with the appropriate schema, and calls into the unboxed version of the method -# torch.ops.aten.fn resolution is done via the mechanism defined in JIT. 
JIT creates a stack of all the overloads and -# then tries to match the correct one at runtime and always calls into the boxed version of the method -# Autograd codegen creates VariableType, TracerType, inplace or view type and python bindings -# Aten codegen generates tensor methods for the the tensor class +# torch.fn uses the Python argparser, matches with the +# appropriate schema, and calls into the unboxed version of the method +# torch.ops.aten.fn resolution is done via the mechanism defined in JIT. +# JIT creates a stack of all the overloads and then tries to match the +# correct one at runtime and always calls into the boxed version of the method +# Autograd codegen creates VariableType, TracerType, +# inplace or view type and python bindings. +# Aten codegen generates tensor methods for the the tensor class. # _OpNamespace is a subclass of ModuleType because the torch script # allows attribute lookups on modules only. Since we want torch.ops.foo.bar() # to work from script, we need to ensure ops and foo are modules + + class _OpNamespace(types.ModuleType): """ An op namespace to dynamically bind Operators into Python. @@ -160,23 +189,29 @@ def __getattr__(self, op_name): # It is not a valid op_name when __file__ is passed in if op_name == '__file__': return 'torch.ops' + # Get the op `my_namespace::my_op` if available. This will also check # for overloads and raise an exception if there are more than one. namespace_name = self.name qualified_op_name = '{}::{}'.format(namespace_name, op_name) - op = torch._C._jit_get_operation(qualified_op_name) + try: + op, overload_names = torch._C._jit_get_operation(qualified_op_name) + except RuntimeError as e: + # Turn this into AttributeError so getattr(obj, key, default) + # works (this is called by TorchScript with __origin__) + raise AttributeError(f"'_OpNamespace' object has no attribute '{op_name}'") from e # let the script frontend know that op is identical to the builtin op # with qualified_op_name torch.jit._builtins._register_builtin(op, qualified_op_name) op.__module__ = self.__module__ + "." + namespace_name - # opoverloadpacket = OpOverloadPacket(qualified_op_name, op_name, op) - # opoverloadpacket.__module__ = self.__module__ + "." + namespace_name + opoverloadpacket = OpOverloadPacket(qualified_op_name, op_name, op, overload_names) + opoverloadpacket.__module__ = self.__module__ + "." 
+ namespace_name # cache the opoverloadpacket to ensure that each op corresponds to # a unique OpOverloadPacket object - # setattr(self, op_name, opoverloadpacket) - setattr(self, op_name, op) - return op + setattr(self, op_name, opoverloadpacket) + return opoverloadpacket + class _Ops(types.ModuleType): __file__ = '_ops.py' @@ -220,5 +255,6 @@ def load_library(self, path): ctypes.CDLL(path) self.loaded_libraries.add(path) + # The ops "namespace" ops = _Ops() diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py new file mode 100644 index 000000000000..85242acd9120 --- /dev/null +++ b/torch/_prims/__init__.py @@ -0,0 +1,2020 @@ +import torch +from torch import Tensor, _TypedStorage + +import torch._prims.utils as utils +from torch._prims.utils import ( + TensorLike, + TensorLikeType, + TensorMeta, + ShapeType, + getnvFuserDtype, + DimsType, + DimsSequenceType, + StrideType, + Number, + NumberType, +) +from torch.overrides import has_torch_function, handle_torch_function +import torch.library +from torch.utils._pytree import tree_map + +from typing import Sequence, Optional, Union, Callable, List, Tuple, Any, Type +from functools import reduce, partial +from enum import Enum +import operator +import math + +prim = torch.library.Library("prims", "DEF") +prim_impl = torch.library.Library("prims", "IMPL", "CompositeExplicitAutograd") +prim_meta_impl = torch.library.Library("prims", "IMPL", "Meta") + +# Experimental module containing prototype "primitive" operations. + +__all__ = [ + # + # Common datastructures and helpers + # + "RETURN_TYPE", + # + # Elementwise unary prims + # + "abs", + "acos", + "acosh", + "asin", + "atan", + "cos", + "cosh", + "bessel_i0e", + "bessel_i1e", + "cbrt", + "ceil", + "digamma", + "erf", + "erf_inv", + "erfc", + "exp", + "expm1", + "floor", + "is_finite", + "lgamma", + "log", + "log1p", + "neg", + "reciprocal", + "round", + "sign", + "sin", + "sinh", + "sqrt", + "square", + "tan", + # + # Elementwise binary prims + # + "add", + "atan2", + "bitwise_and", + "bitwise_not", + "bitwise_or", + "bitwise_xor", + # 'complex', # needs custom meta + "div", + "eq", + "ge", + "gt", + "igamma", + "igammac", + "le", + "lt", + "maximum", + "minimum", + "mul", + "ne", + "nextafter", + "pow", + "rsqrt", + "shift_left", + "shift_right_arithmetic", + "shift_right_logical", # not implemented + # + # View prims + # + "as_strided", + "broadcast_in_dim", + "collapse_view", + "expand_dims", + "slice", + "slice_in_dim", # implemented using slice -- make this a ref? + "split_dim", + "squeeze", + "transpose", + "view_of", + # + # Shape prims + # + "collapse", + "concatenate", + "reshape", + "rev", + # + # Conditional prims + # + "select", + # + # Data conversion and movement prims + # + "clone", + "convert_element_type", + "device_put", + "to_dtype", + # + # Inplace prims + # + "copy_to", + "resize", + # "_set", # Commented out, see note below + # + # Reduction prims + # + "all", + "amax", + "amin", + "any", + "prod", + "sum", + # + # Tensor Creation + # + "empty", + "empty_like", + "full", + "full_like", +] + +# +# Common datastructures and helpers +# + +# Describes the return type of the primitive: +# +# - NEW, a new tensor is created +# - VIEW, a view of an input tensor is returned +# - INPLACE, one or more input tensors is modified +# +# these descriptors are mututally exclusive and exhaustive. 
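+#
+# For example, after the registrations below the descriptor can be read off a
+# generated prim (illustrative only):
+#
+#   torch.ops.prims.add.default.return_type         # RETURN_TYPE.NEW
+#   torch.ops.prims.as_strided.default.return_type  # RETURN_TYPE.VIEW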
+class RETURN_TYPE(Enum): + NEW = (0,) + VIEW = (1,) + INPLACE = (2,) + + +def _wrap_tensor_meta(f): + def wrap(t): + if isinstance(t, torch.Tensor): + return TensorMeta(t) + else: + return t + + def unwrap(t): + # TODO: doesn't setup aliasing relation on views correctly + if isinstance(t, TensorMeta): + return torch.empty_strided( + t.shape, t.stride(), dtype=t.dtype, device="meta" + ) + else: + return t + + def wrapper(*args, **kwargs): + wrapped_args = tree_map(wrap, args) + wrapped_kwargs = tree_map(wrap, kwargs) + return tree_map(unwrap, f(*wrapped_args, **wrapped_kwargs)) + + return wrapper + + +def _make_prim( + *, + schema: str, + meta: Callable, + impl_aten: Callable, + impl_nvfuser: Optional[Callable] = None, + return_type: RETURN_TYPE, + doc: str, +): + """ + Creates a primitive operation. + + """ + + prim.define(schema) + + def _prim_impl(*args, **kwargs): + # always run the meta function because aten implementation will + # typically accept more inputs (e.g., it will do promotion and + # broadcasting) which we want to reject + meta(*args, **kwargs) + return impl_aten(*args, **kwargs) + + name = schema.split("(")[0] + prim_impl.impl(name, _prim_impl) + prim_meta_impl.impl(name, _wrap_tensor_meta(meta)) + + _prim = getattr(torch.ops.prims, name).default + + _prim.__doc__ = doc + _prim.meta = meta # type: ignore[attr-defined] + _prim.impl_nvfuser = impl_nvfuser # type: ignore[attr-defined] + _prim.return_type = return_type # type: ignore[attr-defined] + + return _prim + + +class ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND(Enum): + DEFAULT = (0,) + ALWAYS_BOOL = (2,) + COMPLEX_TO_FLOAT = (3,) + + +# TODO: implement dtype validation here, too, or on the corresponding refs +def _elementwise_meta( + *args, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND +) -> TensorMeta: + """ + Meta function for elementwise operations that produce outputs in the same dtype + as their inputs. + + Stride logic is currently incorrect. + """ + + assert len(args) > 0 + + utils.check_same_device(*args, allow_cpu_scalar_tensors=True) + utils.check_same_shape(*args, allow_cpu_scalar_tensors=True) + utils.check_same_dtype(*args) + + strides = utils.compute_elementwise_output_strides(*args) + + tensor = None + scalar_tensor = None + number = None + for arg in args: + if isinstance(arg, TensorLike): + if utils.is_cpu_scalar_tensor(arg) and scalar_tensor is None: + scalar_tensor = arg + if not utils.is_cpu_scalar_tensor(arg) and tensor is None: + tensor = arg + + elif isinstance(arg, Number): + if number is None: + number = arg + + # NOTE: type promotion behavior here is mostly hidden from tests because + # references will typically handle the type promotion properly even if this doesn't + # (but getting it wrong will cause too many casts to be inserted in traces!) 
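+    # A sketch of the selection below: prefer a non-CPU-scalar tensor argument
+    # as the template for the output meta, fall back to a CPU scalar tensor,
+    # and use the plain-number path only when no tensor argument was given.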
+ if tensor is not None or scalar_tensor is not None: + tensor = tensor if tensor is not None else scalar_tensor + assert tensor is not None # appease mypy + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT: + return TensorMeta(tensor, strides=strides) + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL: + return TensorMeta(tensor, strides=strides, dtype=torch.bool) + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT: + if utils.is_complex_dtype(tensor.dtype): + dtype = utils.corresponding_real_dtype(tensor.dtype) + else: + dtype = tensor.dtype + return TensorMeta(tensor, strides=strides, dtype=dtype) + + # Number case + # NOTE: this case is not currently exercised + # TODO: fix number type promotion (bool, complex->float) + return TensorMeta(number) + + +def _make_elementwise_unary_prim( + name: str, *, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND, **kwargs +): + """ + Creates an elementwise unary prim. + """ + + return _make_prim( + schema=f"{name}(Tensor self) -> Tensor", + meta=partial(_elementwise_meta, type_promotion=type_promotion), + return_type=RETURN_TYPE.NEW, + **kwargs, + ) + + +def _make_elementwise_binary_prim( + name: str, *, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND, **kwargs +): + """ + Creates an elementwise binary prim. + """ + + return _make_prim( + schema=f"{name}(Tensor self, Tensor other) -> Tensor", + meta=partial(_elementwise_meta, type_promotion=type_promotion), + return_type=RETURN_TYPE.NEW, + **kwargs, + ) + + +def _not_impl(*args, **kwargs): + raise NotImplementedError + + +# +# Elementwise unary operations +# + +abs = _make_elementwise_unary_prim( + "abs", + impl_aten=torch.abs, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) + +acos = _make_elementwise_unary_prim( + "acos", + impl_aten=torch.acos, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +acosh = _make_elementwise_unary_prim( + "acosh", + impl_aten=torch.acosh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +asin = _make_elementwise_unary_prim( + "asin", + impl_aten=torch.asin, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +atan = _make_elementwise_unary_prim( + "atan", + impl_aten=torch.atan, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +cos = _make_elementwise_unary_prim( + "cos", + impl_aten=torch.cos, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +cosh = _make_elementwise_unary_prim( + "cosh", + impl_aten=torch.cosh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i0e = _make_elementwise_unary_prim( + "bessel_i0e", + impl_aten=torch.special.i0e, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i1e = _make_elementwise_unary_prim( + "bessel_i1e", + impl_aten=torch.special.i1e, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _cbrt_aten(a: torch.Tensor): + return pow(a, (1 / 3)) + + +cbrt = _make_elementwise_unary_prim( + "cbrt", + impl_aten=_cbrt_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +ceil = _make_elementwise_unary_prim( + "ceil", + impl_aten=torch.ceil, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +digamma = _make_elementwise_unary_prim( + "digamma", + impl_aten=torch.digamma, + doc="", + 
type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erf = _make_elementwise_unary_prim( + "erf", + impl_aten=torch.erf, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erf_inv = _make_elementwise_unary_prim( + "erf_inv", + impl_aten=torch.special.erfinv, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erfc = _make_elementwise_unary_prim( + "erfc", + impl_aten=torch.special.erfc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +exp = _make_elementwise_unary_prim( + "exp", + impl_aten=torch.exp, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +expm1 = _make_elementwise_unary_prim( + "expm1", + impl_aten=torch.special.expm1, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +floor = _make_elementwise_unary_prim( + "floor", + impl_aten=torch.floor, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +is_finite = _make_elementwise_unary_prim( + "is_finite", + impl_aten=torch.isfinite, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +lgamma = _make_elementwise_unary_prim( + "lgamma", + impl_aten=torch.lgamma, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log = _make_elementwise_unary_prim( + "log", + impl_aten=torch.log, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log1p = _make_elementwise_unary_prim( + "log1p", + impl_aten=torch.log1p, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +reciprocal = _make_elementwise_unary_prim( + "reciprocal", + impl_aten=torch.reciprocal, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +neg = _make_elementwise_unary_prim( + "neg", + impl_aten=torch.neg, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +round = _make_elementwise_unary_prim( + "round", + impl_aten=torch.round, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sign = _make_elementwise_unary_prim( + "sign", + impl_aten=torch.sign, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sin = _make_elementwise_unary_prim( + "sin", + impl_aten=torch.sin, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sinh = _make_elementwise_unary_prim( + "sinh", + impl_aten=torch.sinh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sqrt = _make_elementwise_unary_prim( + "sqrt", + impl_aten=torch.sqrt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +square = _make_elementwise_unary_prim( + "square", + impl_aten=torch.square, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +tan = _make_elementwise_unary_prim( + "tan", + impl_aten=torch.tan, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# +# Elementwise binary operations +# +# TODO: we should be able to stamp these out but it's a little tricky with FX's name resolution +def _add_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.add(a, b) # type: ignore[attr-defined] + + +add = _make_elementwise_binary_prim( + name="add", + impl_aten=torch.add, + impl_nvfuser=_add_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +atan2 = _make_elementwise_binary_prim( + name="atan2", + impl_aten=torch.atan2, + doc="", + 
type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_and = _make_elementwise_binary_prim( + "bitwise_and", + impl_aten=torch.bitwise_and, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_not = _make_elementwise_binary_prim( + "bitwise_not", + impl_aten=torch.bitwise_not, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_or = _make_elementwise_binary_prim( + "bitwise_or", + impl_aten=torch.bitwise_or, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_xor = _make_elementwise_binary_prim( + "bitwise_xor", + impl_aten=torch.bitwise_xor, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: complex needs a special meta to account for its float -> complex behavior +# complex = _make_elementwise_binary_prim( +# impl_aten=torch.complex, +# doc="", +# ) + +# div prim performs truncation division on integer inputs +# and true division for floating and complex inputs +def _div_aten(a, b): + if isinstance(a, (bool, int)): + return torch.div(a, b, rounding_mode="trunc") + return torch.true_divide(a, b) + + +def _div_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.div(a, b) # type: ignore[attr-defined] + + +div = _make_elementwise_binary_prim( + "div", + impl_aten=_div_aten, + impl_nvfuser=_div_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +eq = _make_elementwise_binary_prim( + "eq", + impl_aten=torch.eq, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _ge_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.ge(a, b) # type: ignore[attr-defined] + + +ge = _make_elementwise_binary_prim( + "ge", + impl_aten=torch.ge, + impl_nvfuser=_ge_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _gt_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.gt(a, b) # type: ignore[attr-defined] + + +gt = _make_elementwise_binary_prim( + "gt", + impl_aten=torch.gt, + impl_nvfuser=_gt_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +igamma = _make_elementwise_binary_prim( + "igamma", + impl_aten=torch.special.gammainc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +igammac = _make_elementwise_binary_prim( + "igammac", + impl_aten=torch.special.gammaincc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _le_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.le(a, b) # type: ignore[attr-defined] + + +le = _make_elementwise_binary_prim( + "le", + impl_aten=torch.le, + impl_nvfuser=_le_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _lt_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.lt(a, b) # type: ignore[attr-defined] + + +lt = _make_elementwise_binary_prim( + "lt", + impl_aten=torch.lt, + impl_nvfuser=_lt_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + +def _wrap_scalar(a: NumberType, *, dtype: torch.dtype = None) -> torch.Tensor: + """ + Wraps a Number into a Tensor of corresponding dtype. + + Note: this should not generally be used, but some torch functions don't + accept scalars, so it's necessary for their prims to do so. 
+ """ + dtype = dtype if dtype is not None else utils.type_to_dtype(type(a)) + return torch.tensor(a, dtype=dtype) + + +# Note: the following impls are because torch.maximum and torch.mininum do not support scalar inputs +def _maximum_aten( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +) -> TensorLikeType: + if isinstance(a, TensorLike) and isinstance(b, Number): + b = _wrap_scalar(b, dtype=a.dtype) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = _wrap_scalar(a, dtype=b.dtype) + + return torch.maximum(a, b) # type: ignore[arg-type] + + +maximum = _make_elementwise_binary_prim( + "maximum", + impl_aten=_maximum_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _minimum_aten( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +) -> TensorLikeType: + if isinstance(a, TensorLike) and isinstance(b, Number): + b = _wrap_scalar(b, dtype=a.dtype) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = _wrap_scalar(a, dtype=b.dtype) + + return torch.minimum(a, b) # type: ignore[arg-type] + + +minimum = _make_elementwise_binary_prim( + "minimum", + impl_aten=_minimum_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _mul_nvfuser(fd: Any, a: TensorLikeType, b: TensorLikeType): + return fd.Ops.mul(a, b) # type: ignore[attr-defined] + + +mul = _make_elementwise_binary_prim( + "mul", + impl_aten=torch.mul, + impl_nvfuser=_mul_nvfuser, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +ne = _make_elementwise_binary_prim( + "ne", + impl_aten=torch.ne, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +nextafter = _make_elementwise_binary_prim( + "nextafter", + impl_aten=torch.nextafter, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +pow = _make_elementwise_binary_prim( + "pow", + impl_aten=torch.pow, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +rsqrt = _make_elementwise_binary_prim( + "rsqrt", + impl_aten=torch.rsqrt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_left = _make_elementwise_binary_prim( + "shift_left", + impl_aten=torch.bitwise_left_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_right_arithmetic = _make_elementwise_binary_prim( + "shift_right_arithmetic", + impl_aten=torch.bitwise_right_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_right_logical = _not_impl + +sub = _make_elementwise_binary_prim( + "sub", + impl_aten=torch.sub, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# +# View operations +# +# TODO: model view relationships +# TODO: model storage +def _as_strided_meta( + a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int +) -> TensorLikeType: + assert len(size) == len(stride) + assert storage_offset >= 0 + utils.validate_strides(stride) + utils.validate_shape(size) + + if reduce(operator.mul, size) == 0: + # NOTE: This special case is to avoid having to acquire the storage below + # as_strided to shapes with no elements are trivially valid, so it's OK + pass + elif isinstance(a, torch.Tensor): + utils.check_in_bounds_for_storage(a.storage(), size, stride, storage_offset) + + return TensorMeta(a, shape=size, strides=stride) + + +def _as_strided_aten( + a: Tensor, size: ShapeType, stride: StrideType, 
storage_offset: int +) -> Tensor: + return torch.as_strided(a, size, stride, storage_offset) + + +_as_strided_doc = """ + Creates a view of the tensor with the given shape (size), strides (stride) and + storage offset (storage_offset). +""" + +as_strided = _make_prim( + schema="as_strided(Tensor(a!) a, int[] size, int[] stride, int storage_offset) -> Tensor(a!)", + meta=_as_strided_meta, + impl_aten=_as_strided_aten, + return_type=RETURN_TYPE.VIEW, + doc=_as_strided_doc, +) + + +def _broadcast_in_dim_meta( + a: TensorLikeType, shape: ShapeType, broadcast_dimensions: Sequence[int] +): + # Type checks + assert isinstance(a, TensorLike) + assert isinstance(shape, Sequence) + assert isinstance(broadcast_dimensions, Sequence) + + # every dimension must be accounted for + assert a.ndim == len(broadcast_dimensions) + + # broadcast shape must have weakly more dimensions + assert len(shape) >= a.ndim + + # broadcast_dimensions must be an ascending sequence + # (no relative reordering of dims) of integers and + # each dimension must be within the new shape + def _greater_than_reduce(acc, x): + assert isinstance(x, int) + assert x > acc + assert x < len(shape) + + return x + + reduce(lambda acc, x: _greater_than_reduce(acc, x), broadcast_dimensions, -1) + + # shape must be broadcastable to + for idx, new_idx in enumerate(broadcast_dimensions): + assert a.shape[idx] == 1 or a.shape[idx] == shape[new_idx] + + new_strides = [] + original_idx = 0 + for idx in range(len(shape)): + if idx in broadcast_dimensions: + new_strides.append(a.stride()[original_idx]) + original_idx = original_idx + 1 + else: + new_strides.append(0) + + return TensorMeta(a, shape=shape, strides=new_strides) + + +def _broadcast_in_dim_aten(a, shape, broadcast_dimensions): + s = list(shape) + for broadcast_dimension in broadcast_dimensions: + s[broadcast_dimension] = -1 + + v = a + for idx, x in enumerate(s): + if x != -1: + v = v.unsqueeze(idx) + + return v.expand(shape) + + +def _broadcast_in_dim_nvfuser( + fd: Any, + a: torch.Tensor, + shape: ShapeType, + broadcast_dimensions: ShapeType, +): + return fd.Ops.broadcast_in_dim(a, shape, broadcast_dimensions) # type: ignore[attr-defined] + + +_broadcast_in_dim_doc = """ + Creates a view of a with the specified shape. + + Allows adding dimensions of any length and broadcasting + dimensions of length one in a to any length. + + The location of the broadcast dimensions must be specified + using the broadcast_dimensions argument. Changing the + relative order of dimensions is not supported. 
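+
+    For example (an illustrative sketch): a tensor of shape (3, 1) can be
+    viewed as shape (2, 3, 4) with broadcast_dimensions=(1, 2); input
+    dimension 0 maps to output dimension 1, the length-one input dimension 1
+    maps to output dimension 2, and a new leading dimension of length 2 is
+    added::
+
+        a = torch.ones(3, 1)
+        b = torch.ops.prims.broadcast_in_dim(a, (2, 3, 4), (1, 2))
+        # b.shape == (2, 3, 4)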
+ """ + +broadcast_in_dim = _make_prim( + schema="broadcast_in_dim(Tensor(a) a, int[] shape, int[] broadcast_dimensions) -> Tensor(a)", + meta=_broadcast_in_dim_meta, + impl_aten=_broadcast_in_dim_aten, + impl_nvfuser=_broadcast_in_dim_nvfuser, + return_type=RETURN_TYPE.VIEW, + doc=_broadcast_in_dim_doc, +) + + +def _collapse_view_helper( + a: TensorLikeType, start: int, end: int +) -> Tuple[Optional[ShapeType], Optional[StrideType]]: + assert isinstance(a, TensorLike) + + # Special-case for zero dimensional tensors + if a.ndim == 0: + shape = (1,) + strides = (1,) + else: + shape = a.shape # type: ignore[assignment] + strides = a.stride() + + utils.validate_idx(len(shape), start) + utils.validate_exclusive_idx(len(shape), end) + + # Verifies end is strictly greater than start + # (Collapse requires a non-empty interval) + if end <= start: + msg = "Attempting to collapse but end, {0}, is less than or equal to start, {1}!".format( + end, start + ) + raise ValueError(msg) + + if a.ndim == 0 or (end - 1 == start): + return shape, strides + + length = shape[end - 1] + stride = strides[end - 1] + for idx in reversed(range(start, end - 1)): + if shape[idx] == 0 or shape[idx + 1] == 0: + length = 0 + stride = 0 + break + + if shape[idx] == 1: + continue + + length = length * shape[idx] + stride = min(stride, strides[idx]) + + if ( + a.numel() > 0 + and shape[idx + 1] != 1 + and not (strides[idx] == strides[idx + 1] * shape[idx + 1]) + ): + return None, None + + new_shape = shape[:start] + (length,) + shape[end:] + new_strides = strides[:start] + (stride,) + strides[end:] + + # NOTE: when the input has no elements it's restrided as if it were contiguous + if a.numel() == 0: + new_strides = utils.make_contiguous_strides_for(new_shape) + + return new_shape, new_strides + + +def _collapse_view_meta(a: TensorLikeType, start: int, end: int) -> TensorLikeType: + new_shape, new_strides = _collapse_view_helper(a, start, end) + + if new_shape is None: + msg = "Attempting to view a collapsed tensor, but no such view exists!" + raise ValueError(msg) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _collapse_view_aten(a: Tensor, start: int, end: int) -> Tensor: + # Special-cases zero-dim tensors + if a.ndim == 0: + shape = (1,) + else: + shape = a.shape # type: ignore[assignment] + + dim_length = 1 + for idx in range(start, end): + dim_length = dim_length * shape[idx] + + new_shape = shape[0:start] + (dim_length,) + shape[end:] + + return a.view(new_shape) + + +_collapse_view_doc = """ + Creates a view of a with the dimensions between + start (inclusive) and end (exclusive) merged into a + single dimension. + + If it's not possible to take such a view then an error + is thrown. See collapse instead. + + The dimensions can be merged if and only if + they are all "nested" with each other. That is, they all + have the property that + + stride[i] = stride[i+1] * shape[i+1] + + for all i in [start, end - 1). + """ + +collapse_view = _make_prim( + schema="collapse_view(Tensor(a) a, int start, int end) -> Tensor(a)", + meta=_collapse_view_meta, + impl_aten=_collapse_view_aten, + return_type=RETURN_TYPE.VIEW, + doc=_collapse_view_doc, +) + + +def expand_dims(a: TensorLikeType, dimensions: DimsSequenceType) -> TensorLikeType: + """ + Creates a view of a with a.ndim + len(dimensions) dimensions, with new + dimensions of length one at the dimensions specified by dimensions. 
+ """ + dims = sorted(utils.canonicalize_dims(a.ndim, dimensions)) # type: ignore[arg-type] + if len(set(dims)) != len(dims): + msg = "Received duplicate dimensions to expand in {0}".format(str(dimensions)) + raise ValueError(msg) + + new_shape = list(a.shape) + for idx in dims: + new_shape.insert(idx, 1) + + broadcast_dimensions = [ + idx for idx in range(len(new_shape)) if idx not in dimensions + ] + return broadcast_in_dim(a, new_shape, broadcast_dimensions) + + +# Note: saves the Python slice object because we're about to clobber its name with the slice prim +pyslice: Type[slice] = slice + + +def _slice_meta( + a: TensorLikeType, + start_indices: DimsSequenceType, + limit_indices: DimsSequenceType, + strides: Optional[StrideType] = None, +) -> TensorLikeType: + _strides = strides if strides is not None else [1] * len(start_indices) + + if a.ndim != len(start_indices): + msg = "Attempting to slice tensor of rank {0} with start_indices of length {1}!".format( + a.ndim, len(start_indices) + ) + raise ValueError(msg) + + if a.ndim != len(limit_indices): + msg = "Attempting to slice tensor of rank {0} with limit_indices of length {1}!".format( + a.ndim, len(limit_indices) + ) + raise ValueError(msg) + + if a.ndim != len(_strides): + msg = ( + "Attempting to slice tensor of rank {0} with strides of length {1}!".format( + a.ndim, len(limit_indices) + ) + ) + raise ValueError(msg) + + for x, y in zip(start_indices, a.shape): + if x < 0: + msg = "Attempting to slice a tensor with a negative start index of {0}!".format( + x + ) + raise ValueError(msg) + if x > y: + msg = ( + "Attempting to slice a tensor but a start index in {0} is greater than" + " the length of its corresponding dimension in shape {1}".format( + start_indices, a.shape + ) + ) + raise ValueError(msg) + + for x, y, z in zip(limit_indices, a.shape, start_indices): + if x < 0: + msg = "Attempting to slice a tensor with a negative stop index of {0}!".format( + x + ) + raise ValueError(msg) + if x > y: + msg = ( + "Attempting to slice a tensor but a stop index in {0} is greater than the length of " + " its corresponding dimension in shape {1}".format( + limit_indices, a.shape + ) + ) + raise ValueError(msg) + if x < z: + msg = ( + "Attempting to slice a tensor but a start index in {0} is greater than " + " its corresponding stop index {1}".format(x, z) + ) + + for x in _strides: + if x <= 0: + msg = ( + "Attempting to slice a tensor with a non-positive step of {0}!".format( + x + ) + ) + raise ValueError(msg) + + new_shape = [] + for x, y, z in zip(start_indices, limit_indices, _strides): + new_shape.append(math.floor((y - x) / z)) + + new_strides = [] + for x, y in zip(a.stride(), _strides): + new_strides.append(x * y) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _slice_aten( + a: Tensor, + start_indices: DimsSequenceType, + limit_indices: DimsSequenceType, + strides: Optional[StrideType] = None, +) -> Tensor: + _strides = strides if strides is not None else [1] * len(start_indices) + + slices = [] + for start, stop, step in zip(start_indices, limit_indices, _strides): + slices.append(pyslice(start, stop, step)) + + return operator.getitem(a, slices) # type: ignore[call-overload] + + +_slice_doc = """ + Creates a view of a "bounding box" within the tensor. + + The bounding box is specified independently in each of the tensor's dimensions. + start_indices and limit_indices describe the box's boundaries for their corresponding + dimensions. 
If strides is specified then they specify the step size between elements + in their corresponding dimension. + + This operation is analogous to slicing in NumPy, but does not permit slices where + the stop indices are less than the start indices. + """ + +slice = _make_prim( + schema="slice(Tensor(a) a, int[] start_indices, int[] limit_indices, int[]? strides=None) -> Tensor(a)", + meta=_slice_meta, + impl_aten=_slice_aten, + return_type=RETURN_TYPE.VIEW, + doc=_slice_doc, +) + + +def _slice_in_dim_meta( + a: TensorLikeType, + start_index: int, + limit_index: int, + stride: int = 1, + axis: int = 0, +) -> TensorLikeType: + if axis < 0: + msg = "slice_in_dim: received a negative axis {0}".format(axis) + raise ValueError(msg) + if axis >= a.ndim: + msg = "slice_in_dim: axis {0} is greater or equal to the rank {1} of the tensor".format( + axis, a.ndim + ) + raise ValueError(msg) + + if start_index < 0: + msg = "slice_in_dim: received a negative start_index {0}".format(start_index) + raise ValueError(msg) + + if start_index > a.shape[axis]: + msg = "slice_in_dim: start_index is greater than the length {0} of dimension {1}".format( + start_index, axis + ) + raise ValueError(msg) + + if limit_index > a.shape[axis]: + msg = "slice_in_dim: limit_index is greater than the length {0} of dimension {1}".format( + limit_index, axis + ) + raise ValueError(msg) + + if limit_index < start_index: + msg = "slice_in_dim: received a limit_index {0} less than the start_index {1}".format( + limit_index, start_index + ) + raise ValueError(msg) + + if stride < 0: + msg = "slice_in_dim: received a non-positive stride of {0}!".format(stride) + raise ValueError(msg) + + start_indices = [0] * a.ndim + limit_indices = list(a.shape) + strides = [1] * a.ndim + + start_indices[axis] = start_index + limit_indices[axis] = limit_index + strides[axis] = stride + + return _slice_meta(a, start_indices, limit_indices, strides) + + +def _slice_in_dim_aten( + a: Tensor, + start_index: int, + limit_index: int, + stride: int = 1, + axis: int = 0, +) -> Tensor: + start_indices = [0] * a.ndim + limit_indices = list(a.shape) + strides = [1] * a.ndim + + start_indices[axis] = start_index + limit_indices[axis] = limit_index + strides[axis] = stride + + return slice(a, start_indices, limit_indices, strides) + + +_slice_in_dim_doc = """ + Convenience wrapper for slicing just one dimension using slice. 
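+ For example, slicing dimension 1 of a tensor of shape (4, 5) with start_index 0 and limit_index 2 produces a view of shape (4, 2).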
+ """ + +slice_in_dim = _make_prim( + schema="slice_in_dim(Tensor(a) a, int start_index, int limit_index, int stride=1, int axis=0) -> Tensor(a)", + meta=_slice_in_dim_meta, + impl_aten=_slice_in_dim_aten, + return_type=RETURN_TYPE.VIEW, + doc=_slice_in_dim_doc, +) + + +def _split_dim_meta(a: TensorLikeType, dim: int, outer_length: int) -> TensorLikeType: + assert isinstance(a, TensorLike) + utils.validate_idx(a.ndim, dim) + utils.validate_dim_length(outer_length) + + # Verifies the dim can be split with the specified lhs_length + _inner_length = a.shape[dim] / outer_length + inner_length: int = int(_inner_length) + + if inner_length != _inner_length: + msg = "Attempting to split dimension of length {0}, but outer length of {1} divides it with a remainder!".format( + a.shape[dim], outer_length + ) + raise ValueError(msg) + + new_shape: List[int] = [] + new_strides: List[int] = [] + for idx in range(a.ndim): + if idx == dim: + new_shape.extend((outer_length, inner_length)) + new_strides.extend((a.stride()[idx] * inner_length, a.stride()[idx])) + else: + new_shape.append(a.shape[idx]) + new_strides.append(a.stride()[idx]) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _split_dim_aten(a: Tensor, dim: int, outer_length: int) -> Tensor: + inner_length = int(a.shape[dim] / outer_length) + new_shape = a.shape[0:dim] + (outer_length, inner_length) + a.shape[dim + 1 :] + + return a.view(new_shape) + + +_split_dim_doc = """ + Creates a view of a with the given dimension (of length l) split + into two dimensions, with the outer of the two having + length outer_length and the inner of the two having computed + length inner_length such outer_length * inner_length = l. + """ + +# TODO: consider renaming split_dim_view +split_dim = _make_prim( + schema="split_dim(Tensor(a) a, int dim, int outer_length) -> Tensor(a)", + meta=_split_dim_meta, + impl_aten=_split_dim_aten, + return_type=RETURN_TYPE.VIEW, + doc=_split_dim_doc, +) + +# Note: allows dimensions to be specified redundantly +def _squeeze_meta(a: TensorLikeType, dimensions: Sequence) -> TensorLikeType: + assert isinstance(a, TensorLike) + + for idx in dimensions: + utils.validate_idx(a.ndim, idx) + assert a.shape[idx] == 1 + + new_shape = [] + new_strides = [] + for idx in range(len(a.shape)): + if idx in dimensions: + continue + + new_shape.append(a.shape[idx]) + new_strides.append(a.stride()[idx]) + + return TensorMeta(a, shape=new_shape, strides=new_strides) + + +def _squeeze_aten(a: Tensor, dimensions: Sequence) -> Tensor: + squeezes = 0 + for idx in dimensions: + a = torch.squeeze(a, dim=(idx - squeezes)) + squeezes = squeezes + 1 + + return a + + +_squeeze_doc = """ + Creates a view of the tensor with the specified dimensions removed. + + The removed dimensions must each have length one. 
+ """ + +squeeze = _make_prim( + schema="squeeze(Tensor(a) a, int[] dimensions) -> Tensor(a)", + meta=_squeeze_meta, + impl_aten=_squeeze_aten, + return_type=RETURN_TYPE.VIEW, + doc=_squeeze_doc, +) + + +def _transpose_meta(a: TensorLikeType, permutation: DimsSequenceType) -> TensorLikeType: + if a.ndim != len(permutation): + msg = "Attempting to permute a tensor of rank {0}, but received a permutation of length {1}!".format( + a.ndim, len(permutation) + ) + raise ValueError(msg) + + if not utils.is_valid_permutation(a.ndim, permutation): + msg = "Received an invalid permutation, {0}!".format(permutation) + raise ValueError(msg) + + new_shape = [0] * a.ndim + new_strides = [0] * a.ndim + for idx, dim in enumerate(permutation): + new_shape[idx] = a.shape[dim] + new_strides[idx] = a.stride()[dim] + + return TensorMeta(a, shape=tuple(new_shape), strides=tuple(new_strides)) + + +def _transpose_aten(a: Tensor, permutation: DimsSequenceType) -> Tensor: + return torch.permute(a, permutation) + + +_transpose_doc = """ + Creates a view of the tensor with its dimensions permuted. + + The length of the permutation must be the rank of the tensor, + and each element of the permutation specifies the new order + for the corresponding dimension. + """ + +transpose = _make_prim( + schema="transpose(Tensor(a) a, int[] permutation) -> Tensor(a)", + meta=_transpose_meta, + impl_aten=_transpose_aten, + return_type=RETURN_TYPE.VIEW, + doc=_transpose_doc, +) + + +def _view_of_meta(a: TensorLikeType) -> TensorLikeType: + return TensorMeta(a) + + +def _view_of_aten(a: Tensor) -> Tensor: + return a.view(a.shape) + + +_view_of_doc = """ + Creates a view of the tensor. + """ + +view_of = _make_prim( + schema="view_of(Tensor(a) a) -> Tensor", + meta=_view_of_meta, + impl_aten=_view_of_aten, + return_type=RETURN_TYPE.VIEW, + doc=_view_of_doc, +) + +# +# Shape operations +# +def collapse(a: Tensor, start: int, end: int) -> Tensor: + """ + Wrapper around reshape that collapses a span of dimensions. + + See collapse_view for the corresponding view operation. + """ + + dim_length = 1 + for idx in range(start, end): + dim_length = dim_length * a.shape[idx] + + new_shape = a.shape[0:start] + (dim_length,) + a.shape[end:] + return reshape(a, new_shape) + + +# TODO: review stride logic +def _concatenate_meta(tensors: Sequence[TensorLikeType], dim: int) -> TensorLikeType: + if len(tensors) == 0: + msg = "concatenate expects at least one tensor, but received zero!" + raise ValueError(msg) + + for tensor in tensors: + assert isinstance(tensor, TensorLike) + + utils.check_same_dtype(*tensors) + utils.check_same_device(*tensors, allow_cpu_scalar_tensors=False) + + shape = tensors[0].shape + utils.validate_idx(tensors[0].ndim, dim) + + # Verifies same shape (except in the concat dimension) + concat_length = 0 + for tensor in tensors: + for idx, (common_length, length) in enumerate(zip(shape, tensor.shape)): + if idx == dim: + concat_length = concat_length + length + else: + assert length == common_length + + new_shape = list(tensors[0].shape).copy() + new_shape[dim] = concat_length + return TensorMeta( + tensors[0], + shape=new_shape, + strides=utils.make_contiguous_strides_for(new_shape), + ) + + +def _concatenate_aten( + tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: int +) -> Tensor: + return torch.cat(tensors, dim) + + +_concatenate_doc = """ + Concatenates tensors along the specified dimension. + + The tensors' shapes must have the same rank and same length for other dimensions. 
+ """ + +concatenate = _make_prim( + schema="concatenate(Tensor[] tensors, int dim) -> Tensor", + meta=_concatenate_meta, + impl_aten=_concatenate_aten, + return_type=RETURN_TYPE.NEW, + doc=_concatenate_doc, +) + + +def _reshape_meta(a: TensorLikeType, shape: ShapeType): + assert isinstance(a, TensorLike) + utils.validate_shape(shape) + + # Validates the tensor and the requested shape have the + # same number of elements + numel = reduce(operator.mul, shape) + if numel != a.numel(): + msg = "Attempting to reshape a tensor with {0} elements to a shape with {1} elements!".format( + a.numel(), numel + ) + raise ValueError(msg) + + return TensorMeta(a, shape=shape, strides=utils.make_contiguous_strides_for(shape)) + + +def _reshape_aten(a: Tensor, shape: ShapeType) -> Tensor: + return a.reshape(shape).contiguous().clone() + + +_reshape_doc = """ + Creates a contiguous tensor with the specified shape + containing a copy of the data in a. + """ +reshape = _make_prim( + schema="reshape(Tensor a, int[] shape) -> Tensor", + meta=_reshape_meta, + impl_aten=_reshape_aten, + return_type=RETURN_TYPE.NEW, + doc=_reshape_doc, +) + + +def _rev_meta(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + utils.validate_dimension_indices(a.ndim, dims) + return TensorMeta(a) + + +_rev_doc = """ + Reverses the order of elements along the given dimensions. + """ + +rev = _make_prim( + schema="rev(Tensor a, int[] dims) -> Tensor", + meta=_rev_meta, + impl_aten=torch.flip, + return_type=RETURN_TYPE.NEW, + doc=_rev_doc, +) + +# +# Conditional prims +# + + +def _select_meta( + pred: TensorLikeType, a: TensorLikeType, b: TensorLikeType +) -> TensorLikeType: + utils.check_same_device(pred, a, b, allow_cpu_scalar_tensors=True) + utils.check_same_shape(pred, a, b, allow_cpu_scalar_tensors=True) + assert pred.dtype is torch.bool + + return _elementwise_meta( + a, b, type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT + ) + + +def _select_aten(pred: Tensor, a: Tensor, b: Tensor) -> Tensor: + return torch.where(pred, a, b) + + +_select_doc = """ + Selects elements from a and b according to pred. + + Where pred is true the result contains the element from a, and + where pred is false the result contains the element from b. + """ + +select = _make_prim( + schema="select(Tensor pred, Tensor a, Tensor b) -> Tensor", + meta=_select_meta, + impl_aten=_select_aten, + return_type=RETURN_TYPE.NEW, + doc=_select_doc, +) + +# +# Type conversions +# +# TODO: model memory format on TensorMeta +# TODO: make clone a reference following its implementation in TensorFactories.cpp +def _clone_meta( + a: TensorLikeType, *, memory_format: torch.memory_format +) -> TensorLikeType: + strides = utils.compute_elementwise_output_strides(a) + return TensorMeta(a, strides=strides) + + +def _clone_aten(a: Tensor, *, memory_format: torch.memory_format) -> Tensor: + return torch.clone(a, memory_format=memory_format) + + +_clone_doc = """ + Creates a copy of a tensors. 
+""" + +clone = _make_prim( + schema="clone(Tensor a, *, MemoryFormat memory_format) -> Tensor", + meta=_clone_meta, + impl_aten=_clone_aten, + return_type=RETURN_TYPE.NEW, + doc=_clone_doc, +) + + +def _convert_element_type_meta(a: TensorLikeType, dtype: torch.dtype) -> TensorLikeType: + # Type checks + assert isinstance(a, TensorLike) + assert isinstance(dtype, torch.dtype) + + strides = utils.compute_elementwise_output_strides(a) + + return TensorMeta(a, strides=strides, dtype=dtype) + + +def _convert_element_type_aten(a: Tensor, dtype: torch.dtype) -> Tensor: + # TODO: update meta objects so this can be acquired directly + try: + requires_grad = a.requires_grad + except Exception as e: + requires_grad = False + + result = empty_like(a, device=a.device, dtype=dtype, requires_grad=requires_grad) + with torch.no_grad(): + return copy_to(result, a) + + +def _convert_element_type_nvfuser(fd: Any, a: Tensor, dtype: torch.dtype) -> Tensor: + nvfuser_dtype = getnvFuserDtype(dtype) + return fd.Ops.cast(nvfuser_dtype, a) # type: ignore[attr-defined] + + +_convert_element_type_doc = """ + Creates a copy of a tensor with the given dtype. + """ + +convert_element_type = _make_prim( + schema="convert_element_type(Tensor a, ScalarType dtype) -> Tensor", + meta=_convert_element_type_meta, + impl_aten=_convert_element_type_aten, + impl_nvfuser=_convert_element_type_nvfuser, + return_type=RETURN_TYPE.NEW, + doc=_convert_element_type_doc, +) + + +def _device_put_meta( + a: TensorLikeType, device: Union[str, torch.device] +) -> TensorLikeType: + assert isinstance(a, TensorLike) + assert isinstance(device, (str, torch.device)) + + return TensorMeta(a, device=utils.wrap_device(device)) + + +def _device_put_aten(a: Tensor, device: Union[str, torch.device]) -> Tensor: + return a.to(device) + + +_device_put_doc = """ + Creates a copy of a tensor on the given device. + """ + +device_put = _make_prim( + schema="device_put(Tensor a, Device device) -> Tensor", + meta=_device_put_meta, + impl_aten=_device_put_aten, + return_type=RETURN_TYPE.NEW, + doc=_device_put_doc, +) + +# TODO: FIXME: strides are incorrect +def _to_dtype_meta(a: TensorLikeType, dtype: torch.dtype) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(a.shape) + return TensorMeta(a, strides=strides, dtype=dtype) + + +def _to_dtype_aten(a: Tensor, dtype: torch.dtype) -> Tensor: + return a.to(dtype) + + +_to_dtype_doc = """ + Creates a contiguous copy of a tensor with the given dtype. +""" + +to_dtype = _make_prim( + schema=("to_dtype(Tensor a, ScalarType dtype) -> Tensor"), + meta=_to_dtype_meta, + impl_aten=_to_dtype_aten, + return_type=RETURN_TYPE.NEW, + doc=_to_dtype_doc, +) + +# +# Inplace operators +# + + +def _copy_to_meta(a: TensorLikeType, b: TensorLikeType): + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + + # Validates the cast is safe + # TODO: move this as an option on the reference + # a_typ = utils.dtype_to_type(a.dtype) + # b_typ = utils.dtype_to_type(b.dtype) + # if a_typ is not utils.get_higher_type(a_typ, b_typ): + # raise RuntimeError(str(b.dtype), " can't be cast safely to ", str(a.dtype), "!") + + # Validates the tensors have the same number of elements + if a.numel() != b.numel(): + msg = "Attempting to copy {0} elements to a tensor with {1} elements!".format( + b.numel(), a.numel() + ) + raise RuntimeError(msg) + + return a + + +def _copy_to_aten(a: Tensor, b: Tensor) -> Tensor: + return a.copy_(b) + + +_copy_to_doc = """ + Copies the data in b to a and returns the modified a. 
+ """ + +# TODO: Remove safe casting and implement on reference instead +copy_to = _make_prim( + schema="copy_to(Tensor(a!) a, Tensor b) -> Tensor(a!)", + meta=_copy_to_meta, + impl_aten=_copy_to_aten, + return_type=RETURN_TYPE.INPLACE, + doc=_copy_to_doc, +) + + +def _resize_meta( + a: TensorLikeType, shape: Union[torch.Size, List[int], Tuple[int, ...]] +): + return TensorMeta(a, shape=shape, strides=utils.make_contiguous_strides_for(shape)) + + +def _resize_aten(a: Tensor, shape: ShapeType) -> Tensor: + return a.resize_(shape) + + +_resize_doc = """ + Gives a tensor with no elements a new shape, returning the modified tensor. + + The tensor's strides are contiguous and its values are unitialized. + """ + +# TODO: review support arbitrary resizes +resize = _make_prim( + schema="resize(Tensor(a!) a, int[] shape) -> Tensor(a!)", + meta=_resize_meta, + impl_aten=_resize_aten, + return_type=RETURN_TYPE.INPLACE, + doc=_resize_doc, +) + + +def _reduction_meta(inp, dims, *, output_dtype=None): + """ + Meta function for single output reduction operations + Stride logic is incorrect + """ + assert isinstance(inp, TensorLike) + if output_dtype is None: + output_dtype = inp.dtype + output_shape = utils.compute_reduction_output_shape(inp.shape, dims) + return TensorMeta( + shape=output_shape, + strides=utils.make_contiguous_strides_for(output_shape), + dtype=output_dtype, + device=inp.device, + ) + + +def _bool_return_reduction_meta(inp, dims): + return _reduction_meta(inp, dims, output_dtype=torch.bool) + + +_sum_doc = """ + Computes the sum of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_amax_doc = """ + Computes the maximum value of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_amin_doc = """ + Computes the minimum value of elements in the input tensor over the list of dimensions + specified in the dim argument + """ + + +def _make_reduction_prim(name: str, impl_aten, doc): + """Creates a reduction prim.""" + return _make_prim( + schema=f"{name}(Tensor inp, int[]? dims, *, ScalarType? output_dtype=None) -> Tensor", + meta=_reduction_meta, + impl_aten=impl_aten, + return_type=RETURN_TYPE.NEW, + doc=doc, + ) + + +def _make_bool_reduction_prim(name: str, impl_aten, doc): + """Creates a reduction prim that reduces to bool.""" + return _make_prim( + schema=f"{name}(Tensor inp, int[]? dims, *, ScalarType? 
output_dtype=None) -> Tensor", + meta=_bool_return_reduction_meta, + impl_aten=impl_aten, + return_type=RETURN_TYPE.NEW, + doc=doc, + ) + + +sum = _make_reduction_prim( + name="sum", + impl_aten=torch.sum, + doc=_sum_doc, +) + +prod = _make_reduction_prim( + name="prod", + impl_aten=torch.prod, + doc=_sum_doc, # TODO: fixme +) + +amax = _make_reduction_prim( + name="amax", + impl_aten=torch.amax, + doc=_amax_doc, +) + +amin = _make_reduction_prim( + name="amin", + impl_aten=torch.amin, + doc=_amin_doc, +) + +all = _make_bool_reduction_prim( + name="all", + impl_aten=torch.all, + doc="", +) + +any = _make_bool_reduction_prim( + name="any", + impl_aten=torch.any, + doc="", +) + +# TODO: layout, pin_memory, memory_format +# TODO: model requires_grad on TensorMeta +def _empty_meta( + shape: ShapeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _empty_aten( + shape: ShapeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> Tensor: + return torch.empty(shape, dtype=dtype, device=device, requires_grad=requires_grad) + + +_empty_doc = """ + Creates a tensor with uninitialized values and the specified shape, dtype, and device. +""" + +empty = _make_prim( + schema="empty(int[] shape, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_empty_meta, + impl_aten=_empty_aten, + return_type=RETURN_TYPE.NEW, + doc=_empty_doc, +) + +# TODO: memory format +def _empty_like_meta( + a: TensorLikeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> TensorLikeType: + strides: Tuple[int, ...] + if a.numel() == 0: + strides = a.stride() + else: + strides = utils.compute_elementwise_output_strides(a) + + return TensorMeta(a, strides=strides, dtype=dtype, device=device) + + +def _empty_like_aten( + a: Tensor, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> Tensor: + return torch.empty_like(a, dtype=dtype, device=device, requires_grad=requires_grad) + + +_empty_like_doc = """ + Creates a tensor with uninitialized values, and the same shape, dtype, and device as the + given tensor by default. The dtype and device settings can be overridden + by specifying them explicitly. +""" + +empty_like = _make_prim( + schema="empty_like(Tensor a, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_empty_like_meta, + impl_aten=_empty_like_aten, + return_type=RETURN_TYPE.NEW, + doc=_empty_like_doc, +) + + +def _full_meta( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _full_aten( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> Tensor: + # Note that Mypy thinks torch.full can't accept a complex fill_value + return torch.full( + shape, fill_value, dtype=dtype, device=device, requires_grad=requires_grad # type: ignore[arg-type] + ) + + +_full_doc = """ + Creates a tensor filled with the given fill value, and with the specified shape, dtype, and device. 
+""" + +# TODO: add layout +full = _make_prim( + schema="full(int[] shape, Scalar fill_value, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_full_meta, + impl_aten=_full_aten, + return_type=RETURN_TYPE.NEW, + doc=_full_doc, +) + + +def _full_like_meta( + a: TensorLikeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + strides = strides = utils.compute_elementwise_output_strides(a) + if a.numel() == 0: + strides = a.stride() + + return TensorMeta(a, strides=strides, dtype=dtype, device=device) + + +def _full_like_aten( + a: Tensor, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> Tensor: + # Note that Mypy thinks torch.full can't accept a complex fill_value + return torch.full_like( + a, fill_value, dtype=dtype, device=device, requires_grad=requires_grad # type: ignore[arg-type] + ) + + +_full_like_doc = """ + Creates a tensor filled with the given fill value, and the same shape, dtype, and device as the + given tensor by default. The dtype and device settings can be overridden + by specifying them explicitly. +""" + +full_like = _make_prim( + schema="full_like(Tensor a, Scalar fill_value, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_full_like_meta, + impl_aten=_full_like_aten, + return_type=RETURN_TYPE.NEW, + doc=_full_like_doc, +) diff --git a/torch/_prims/context.py b/torch/_prims/context.py new file mode 100644 index 000000000000..fee316ff3af9 --- /dev/null +++ b/torch/_prims/context.py @@ -0,0 +1,158 @@ +import string +from typing import Callable, Sequence, Any, Dict +from itertools import chain + + +import torch +from torch.fx.graph import Graph, Node +import torch.overrides + +from torch._prims.utils import TensorMeta +import torch._refs as refs + + +# TODO: automap torch operations to references +# (need to throw a good assertion if the mapping doesn't exist) +_torch_to_reference_map = { + torch.add: refs.add, + # torch.div: refs.div, + torch.mul: refs.mul, + torch.ge: refs.ge, + torch.gt: refs.gt, + torch.le: refs.le, + torch.lt: refs.lt, +} + + +class PrimContext(torch.overrides.TorchFunctionMode): + """ + The prototype prim tracing context. + + Example usage: + + import torch._prims.utils as utils + from torch._prims.context import PrimContext + from torch._prims.executor import execute + from torch.overrides import push_torch_function_mode + + a = torch.randn((2, 2)) + b = torch.randn((2, 2)) + + with push_torch_function_mode(PrimContext): + meta_a = ctx.placeholder(utils.TensorMeta(a)) + meta_b = ctx.placeholder(utils.TensorMeta(b)) + result = torch.add(meta_a, meta_b) + ctx.output(result) + + exc_result = execute(ctx, a, b) + + Currently this only acquires a trace of prims, and + it does not account for control flow. As such, + execute must be called with tensors that have the + same metadata (dtype, device, shape...) as + the tensors used to trace the operations. + + The tracing context's FX graph can be acquired + using its graph attribute. 
+ """ + + def __init__(self): + self.graph = Graph() + + # Private attributes for generating names + self._tensor_name_counter = 0 + self._dim_name_counter = 0 + self._shape_name_counter = 0 + self._lowercase = tuple(string.ascii_lowercase) + self._uppercase = tuple(string.ascii_uppercase) + + @staticmethod + def _create_name(idx, chars): + name = "" + while idx >= len(chars): + name = chars[idx % len(chars)] + name + idx = idx - len(chars) + name = chars[idx] + name + + return name + + def _tensor_name(self): + idx = self._tensor_name_counter + self._tensor_name_counter = self._tensor_name_counter + 1 + + return self._create_name(idx, self._lowercase) + + def _add_user(self, tm: TensorMeta, node: Node) -> None: + assert tm.node is not None + tm.node.users[node] = None + + def placeholder(self, a: Any): + name = self._tensor_name() + node = self.graph.placeholder(name) + + if isinstance(a, TensorMeta): + if a.node is not None: + raise ValueError("Attempting to reuse a TensorMeta in a new trace!") + a.tname = name + a.node = node + + return a + + def output(self, tm: TensorMeta): + # TODO: allow other output types + assert isinstance(tm, TensorMeta) + + node = self.graph.output(tm) + self._add_user(tm, node) + + def __torch_function__( + self, + func: Callable, + types: Sequence, + args: Sequence[Any] = (), + kwargs: Dict = None, + ): + """ + Determines which function to call. The order of which + function is called is determined by: + + - func's "meta" attribute, if it exists + - if func is a torch operation, its corresponding reference + - func + """ + + if kwargs is None: + kwargs = {} + + if hasattr(func, "meta"): + # TODO: add check that all args/kwargs are 'registered' properly + # to this trace + + output = func.meta(*args, **kwargs) # type: ignore[attr-defined] + + # Updates graph + # TODO: handle outputs with multiple tensors + # TODO: handle non-tensor outputs + assert isinstance(output, TensorMeta) + output_name = self._tensor_name() + node = self.graph.create_node( + "call_function", func, name=output_name, args=args, kwargs=kwargs + ) + output.tname = output_name + output.node = node + + # Marks uses + for x in ( + x for x in chain(args, kwargs.values()) if isinstance(x, TensorMeta) + ): + self._add_user(x, node) + + return output + + # Remaps torch operations to their references + if func in _torch_to_reference_map: + fn = _torch_to_reference_map[func] + with torch.overrides.enable_torch_function_mode(self, replace=self.inner): + return fn(*args, **kwargs) # type: ignore[operator] + + return func(*args, **kwargs) diff --git a/torch/_prims/executor.py b/torch/_prims/executor.py new file mode 100644 index 000000000000..4675b520ac1c --- /dev/null +++ b/torch/_prims/executor.py @@ -0,0 +1,113 @@ +from typing import Callable + +import torch + +from torch.fx import GraphModule +from torch._prims.utils import TensorMeta, getnvFuserDtype +from torch._prims.context import PrimContext +import torch.overrides + +if torch.cuda.is_available(): + from torch._C._nvfuser import Fusion, FusionDefinition # type: ignore[import] + + +def execute(ctx: PrimContext, *args, executor: str = "aten", **kwargs): + """ + Prototype ATen executor. + + Just executes the context's graph. + """ + + if executor == "aten": + gm = GraphModule({}, ctx.graph) + return gm.forward(*args, **kwargs) + elif executor == "nvfuser": + if not torch.cuda.is_available(): + raise RuntimeError( + "Attempting to use nvFuser trace executor but CUDA is not available!" 
+ ) + + # PROTOTYPE nvfuser executor + # Only accepts tensor inputs and single tensor outputs + # Does not handle kwargs + # Does not support reusing the same ctx to execute! + assert len(kwargs) == 0 + # TODO: make this a proper trace -> trace transform that + # doesn't mutate the context + graph_fd = ctx.graph.placeholder("fd") + ctx.graph._root.append(graph_fd) + + fusion = Fusion() + with FusionDefinition(fusion) as fd: + # Transforms graph to call nvfuser lowerings + nv_args = [fd] + for arg in args: + if isinstance(arg, torch.Tensor): + x = fd.define_tensor( + arg.size(), arg.stride(), getnvFuserDtype(arg.dtype) + ) + fd.add_input(x) + nv_args.append(x) + else: + nv_args.append(x) + + for x in ctx.graph.nodes: + if x.op == "call_function": + x.target = x.target.impl_nvfuser + x.args = (graph_fd,) + x.args + + gm = GraphModule({}, ctx.graph) + out = gm.forward(*nv_args) + fd.add_output(out) + + return fusion.execute( + tuple(arg for arg in args if isinstance(arg, torch.Tensor)) + )[0] + + msg = "Received unexpected value for 'executor': {0}. Allowed values are: aten, nvfuser.".format( + executor + ) + raise ValueError(msg) + + +def make_traced(fn: Callable): + """ + Returns a function that, when called, will + trace its torch operations to prims and then + execute those prims on the requested trace executor + (possibly lowering them to that trace executor first). + + Only supports the torch operations defined in _torch_to_reference_map + in context.py and operations with positional args. All args must + be tensors and the function must return a single tensor. In the + near future all these restrictions will be lifted. + + Example usage: + + def foo(a, b): + return torch.add(a, b) + + traced_foo = make_traced(foo) + + a = torch.randn((1, 2, 3, 4, 5), device='cuda') + b = torch.randn((1, 2, 3, 4, 5), device='cuda') + result = traced_foo(a, b, executor='nvfuser') + + Executor may be either 'aten' or 'nvfuser'. 
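+ The 'nvfuser' executor requires CUDA to be available.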
+ """ + + def _traced(*args, executor="aten"): + ctx: PrimContext + with torch.overrides.push_torch_function_mode(PrimContext) as ctx: # type: ignore[attr-defined, assignment] + placeholders = [] + for arg in args: + if isinstance(arg, torch.Tensor): + placeholders.append(ctx.placeholder(TensorMeta(arg))) + else: + placeholders.append(ctx.placeholder(arg)) + + result = fn(*placeholders) + ctx.output(result) + return execute(ctx, *args, executor=executor) + + return _traced diff --git a/torch/_prims/utils.py b/torch/_prims/utils.py new file mode 100644 index 000000000000..beb94b2069f4 --- /dev/null +++ b/torch/_prims/utils.py @@ -0,0 +1,1104 @@ +from __future__ import annotations + +from typing import Any, Union, Sequence, Optional, Callable, Dict, Tuple, List +from enum import Enum +from functools import reduce, cmp_to_key +import operator + +import torch + +# nvFuser imports are conditional on CUDA being available +if torch.cuda.is_available(): + from torch._C._nvfuser import DataType # type: ignore[import] + + _torch_dtype_to_nvfuser_dtype_map = { + torch.cdouble: DataType.ComplexDouble, + torch.cfloat: DataType.ComplexFloat, + torch.double: DataType.Double, + torch.float: DataType.Float, + torch.half: DataType.Half, + torch.bfloat16: DataType.BFloat16, + torch.long: DataType.Int, + torch.int: DataType.Int32, + torch.bool: DataType.Bool, + } +else: + _torch_dtype_to_nvfuser_dtype_map = {} + + +def getnvFuserDtype(dtype: torch.dtype): + """ + Translates from torch.dtype to nvFuser's DataType enum + """ + return _torch_dtype_to_nvfuser_dtype_map[dtype] + + +ShapeType = Union[torch.Size, List[int], Tuple[int, ...]] +StrideType = Union[List[int], Tuple[int, ...]] +DimsType = Union[int, List[int], Tuple[int, ...]] +DimsSequenceType = Union[List[int], Tuple[int, ...]] +NumberType = Union[bool, int, float, complex] +Number = (bool, int, float, complex) + + +class TensorMeta(torch.Tensor): + """ + Model tensor metadata. Not a stock meta tensor because device is modeled + as the original device (not meta device), also we have different behavior + for some high level Python bindings + """ + + # Note: this will be an fx Node if it's ever + # populated, but some Meta-internal jobs don't include fx + node: Optional[Any] + tname: str + + @staticmethod + def __new__( + cls, + tensorlike: Optional[Union[TensorMeta, NumberType, torch.Tensor]] = None, + *, + shape: Optional[ShapeType] = None, + strides: Optional[StrideType] = None, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[torch.device, str]] = None, + ): + + if isinstance(tensorlike, Number): + assert not shape and (shape is None or isinstance(shape, Sequence)) + assert not strides and (strides is None or isinstance(strides, Sequence)) + inferred_shape: Tuple[int, ...] = () + inferred_strides: Tuple[int, ...] 
= () + inferred_dtype = type_to_dtype(type(tensorlike)) + inferred_device = torch.device("cpu") + # TODO: This looks wrong, a number that is wrapped into a tensor + # needs to behave differently than a scalar tensor for type + # promotion purposes + elif tensorlike is not None: + assert isinstance(tensorlike, (TensorMeta, torch.Tensor)) + inferred_shape = tuple(tensorlike.shape) + inferred_strides = tuple(tensorlike.stride()) + inferred_dtype = tensorlike.dtype + inferred_device = tensorlike.device + else: + # If no tensorlike "example" is given then all metadata + # must be provided explicitly + assert shape is not None + assert strides is not None + assert dtype is not None + assert device is not None + + shape = inferred_shape if shape is None else tuple(shape) + strides = inferred_strides if strides is None else tuple(strides) + dtype = inferred_dtype if dtype is None else dtype + device = inferred_device if device is None else device + + if isinstance(device, str): + device = torch.device(device) + + r = torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] + cls, + shape, + strides=strides, + storage_offset=0, # TODO: this is inaccurate + dtype=dtype, + device=device, + requires_grad=False, + ) + + r.tname = "" + r.node = None + return r + + @classmethod + def __torch_function__( + cls, + func: Callable, + types: Sequence, + args: Sequence[Any] = (), + kwargs: Optional[Dict] = None, + ): + if kwargs is None: + kwargs = {} + + if func in { + torch.Tensor.ndim.__get__, # type: ignore[attr-defined] + torch.Tensor.numel, + torch.Tensor.stride, + torch.Tensor.dtype.__get__, # type: ignore[attr-defined] + torch.Tensor.shape.__get__, # type: ignore[attr-defined] + torch.Tensor.device.__get__, # type: ignore[attr-defined] + }: + return super().__torch_function__(func, types, args, kwargs) + + if not hasattr(func, "meta"): + raise ValueError(f"Callable {func} has no meta function!") + + return func.meta(*args, **kwargs) # type: ignore[attr-defined] + + @classmethod + def __torch_dispatch__( + cls, + func, + types, + args=(), + kwargs=None, + ): + raise RuntimeError("this should be unreachable") + + # TODO: fx uses dunder repr to print objects in code + def __repr__(self): + return self.tname + # return f"TensorMeta(dtype={self.dtype}, device={self.device}, shape={self.shape}, strides={self.stride()})" + + def __format__(self, format_spec): + return self.tname + + +TensorLikeType = Union[torch.Tensor, TensorMeta] +TensorLike = (torch.Tensor, TensorMeta) +TensorSequenceType = Union[List[TensorLikeType], Tuple[TensorLikeType, ...]] + + +# TODO: look at using torch.testing.assert_close instead with an option +# to just compare metadata +def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType): + """ + Checks that two tensor likes have the same shape, + dtype and device. + + In the future this will validate additional metadata, like + strides. 
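+ Note that significant strides (strides of dimensions with length greater than one) are already compared when either tensor is on a CUDA device.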
+ """ + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + + for x, y in zip(a.shape, b.shape): + if x != y: + msg = "Shapes {0} and {1} are not equal!".format(a.shape, b.shape) + raise AssertionError(msg) + + if a.dtype != b.dtype: + msg = "Dtypes {0} and {1} are not equal!".format(a.dtype, b.dtype) + raise AssertionError(msg) + + if a.device != b.device: + # Handles special cuda:0 vs cuda case + # TODO: we should review why this happens and see about fixing it + if (str(a.device) == "cuda:0" or str(a.device) == "cuda") and ( + str(b.device) == "cuda:0" or str(b.device) == "cuda" + ): + pass + else: + msg = "Devices {0} and {1} are not equal!".format(a.device, b.device) + raise AssertionError(msg) + + same_strides, idx = check_significant_strides(a, b) + if not same_strides: + msg = "Stride mismatch! Strides are {0} and {1} (mismatched at {2})!".format( + a.stride(), b.stride(), idx + ) + raise RuntimeError(msg) + + +def check_significant_strides( + a: TensorLikeType, b: TensorLikeType +) -> Tuple[bool, Optional[int]]: + # NOTE: only on CUDA because CPU elementwise strides are incorrect in PyTorch + # See https://github.com/pytorch/pytorch/issues/77553 + # Only compares strides that are "meaningful" -- strides for dimensions with length > 1 + # and for tensors with more than one element + if (a.device.type == "cuda" or b.device.type == "cuda") and a.numel() > 0: + for idx in range(a.ndim): + if a.stride()[idx] != b.stride()[idx] and a.shape[idx] > 1: + return False, idx + + return True, None + + +def is_contiguous(a: TensorLikeType) -> bool: + """ + Tests whether a tensor is contiguous or not. + + Tensors are contiguous when they have no elements, + or when they have "nested" strides. + """ + if a.numel() == 0: + return True + + expected_stride = 1 + for x, y in reversed(tuple(zip(a.shape, a.stride()))): + # Skips checking strides when a dimension has length 1 + if x == 1: + continue + + if y != expected_stride: + return False + expected_stride = expected_stride * x + + return True + + +# NOTE: Based on the implementation in TensorIterator.cpp, but note that +# the note [Computing output strides] is incorrect, because it +# says that strides will be preserved even if they are not +# "non overlapping and dense", but this is incorrect. The +# output of elementwise operations are always given +# non overlapping and dense strides. +# This is also INCORRECT because it does not model TensorIterator's +# short-circuit, which can cause different strides. +def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]: + """ + Computes the output strides for elementwise operations. + """ + + if len(tensors) == 0: + msg = "Can't compute elementwise output strides for zero tensors!" + raise ValueError(msg) + + check_same_shape(*tensors, allow_cpu_scalar_tensors=True) + + # Filters the tensors to actual tensors + all_tensors = all(isinstance(a, TensorLike) for a in tensors) + tensors = tuple( + a for a in tensors if isinstance(a, TensorLike) and not is_cpu_scalar_tensor(a) + ) + + # Short-circuits for CPU scalar case + if len(tensors) == 0: + return () + + # Short-circuits for shapes with zero or one dimensions + # TODO: are these necessary? 
+ ndim = tensors[0].ndim + if ndim == 0: + return () + if ndim == 1: + return (1,) + + shape = tensors[0].shape + + def _cmp(idx_a, idx_b): + for tensor in tensors: + stride_a = tensor.stride()[idx_a] + stride_b = tensor.stride()[idx_b] + + if stride_a == 0 or stride_b == 0: + continue + + if stride_a < stride_b: + return -1 + + if stride_a > stride_b: + return 1 + + # stride_a == stride_b + if shape[idx_a] > shape[idx_b]: + return 1 + + # NOTE: this case is missing in the C++ impl + if shape[idx_a] < shape[idx_b]: + return -1 + + # Note: this case is hit if all strides are zero, + # or all strides are equal and all dimensions have the same length + return 0 + + perm = tuple(range(ndim)) + perm = tuple(sorted(perm, key=cmp_to_key(_cmp), reverse=True)) + + permuted_shape = [-1] * ndim + for idx, x in enumerate(perm): + permuted_shape[idx] = shape[x] + + new_strides = make_contiguous_strides_for(permuted_shape) + # print(f"new_strides is {new_strides}") + # print(f"shape is {shape}") + # print(f"permuted_shape is {permuted_shape}") + permuted_strides = [-1] * ndim + for idx, x in enumerate(perm): + permuted_strides[x] = new_strides[idx] + + return tuple(permuted_strides) + + +# +# Common helper functions +# + + +def validate_dim_length(length: int): + """ + Validates that an object represents a valid + dimension length. + """ + + assert isinstance(length, int) + assert length >= 0 + + +def validate_shape(shape: ShapeType): + """ + Validates that a sequence represents a valid shape. + """ + + assert isinstance(shape, Sequence) + for l in shape: + validate_dim_length(l) + + +def validate_strides(strides: StrideType): + """ + Verifies the object specifies valid strides. + """ + + assert isinstance(strides, Sequence) + for stride in strides: + assert stride >= 0 + + +def validate_idx(rank: int, idx: int): + """ + Validates that idx is a valid index for the given shape. + Assumes the index is already canonicalized. + """ + + assert isinstance(idx, int) + assert isinstance(rank, int) + + assert idx >= 0 and idx < rank or idx == 0 + + +def validate_dimension_indices(rank: int, indices: DimsSequenceType): + for idx in indices: + validate_idx(rank, idx) + + +def validate_exclusive_idx(rank: int, ex_idx: int): + """ + Validates that ex_idx is a valid exclusive index + for the given shape. + """ + + assert isinstance(ex_idx, int) + assert isinstance(rank, int) + assert ex_idx > 0 and ex_idx <= rank + + +# "Wraps" a dim (up to one time) for the given rank, allowing +# dims to be specified using negative indices +def canonicalize_dim(rank: int, idx: int) -> int: + # TODO: add a comment for why this is + _rank = rank if rank != 0 else 1 + + if idx >= 0 and idx < _rank: + return idx + + if idx < 0: + _idx = idx + _rank + else: + _idx = idx + + if _idx < 0 or _idx > _rank: + msg = "Received out of bounds index {0} for tensor of rank {1}!".format( + idx, rank + ) + raise ValueError(msg) + + return _idx + + +# Takes a dimension or sequence of dimensions and "wraps" them, +# mapping negative offsets to positive ones +def canonicalize_dims(rank: int, indices: DimsType) -> DimsType: + if isinstance(indices, int): + return canonicalize_dim(rank, indices) + + return tuple(canonicalize_dim(rank, x) for x in indices) + + +def is_valid_permutation(rank: int, perm: DimsSequenceType) -> bool: + """ + Validates that perm is a permutation of length rank. 
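+ For example, (2, 0, 1) is a valid permutation for rank 3, while (0, 0, 1) is not.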
+ """ + + if not isinstance(perm, Sequence): + return False + + if not (tuple(sorted(perm)) == tuple(range(0, rank))): + return False + + return True + + +def is_same_shape(a: Sequence, b: Sequence) -> bool: + """ + Compares two shapes a and b, returning True if they are the same + (their ranks and corresponding lengths match) and False otherwise. + """ + + return tuple(a) == tuple(b) + + +def is_cpu_scalar_tensor(a: Any) -> bool: + return isinstance(a, TensorLike) and a.ndim == 0 and a.device.type == "cpu" + + +def check_same_device(*args, allow_cpu_scalar_tensors): + """ + Checks that all Tensors in args have the same device. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensor objects in args have different devices, unless one is a CPU scalar tensor and allow_cpu_scalar_tensors is True + """ + # Short-circuits if all (one or fewer) arguments are trivially on the same device + if len(args) <= 1: + return + + # Note: cannot initialize device to the first arg's device (it may not have one) + device = None + for arg in args: + if isinstance(arg, Number): + continue + elif isinstance(arg, TensorLike): + if allow_cpu_scalar_tensors and is_cpu_scalar_tensor(arg): + continue + + if device is None: + device = arg.device + + if device != arg.device: + msg = ( + "Tensor on device " + + str(arg.device) + + " is not on the expected device " + + str(device) + + "!" + ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same device, " + str(type(arg)) + "!" + ) + raise RuntimeError(msg) + + +# Asserts if any of the following are true: +# - a non-scalar or non-Tensor is given +# - the shape of any tensors is distinct +def check_same_shape(*args, allow_cpu_scalar_tensors): + """ + Checks that all Tensors in args have the same shape. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensor objects in args have different devices + """ + shape = None + + for arg in args: + if isinstance(arg, Number): + continue + elif isinstance(arg, TensorLike): + if allow_cpu_scalar_tensors and is_cpu_scalar_tensor(arg): + continue + + if shape is None: + shape = arg.shape + + if not is_same_shape(shape, arg.shape): + msg = "Shape {0} is not the expected shape {1}!".format( + arg.shape, shape + ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same shape, " + str(type(arg)) + "!" 
+ ) + raise RuntimeError(msg) + + +_integer_dtypes = (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64) +_float_dtypes = (torch.float16, torch.bfloat16, torch.float32, torch.float64) +_complex_dtypes = (torch.complex32, torch.complex64, torch.complex128) + + +def is_boolean_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype is torch.bool + + +def is_integer_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _integer_dtypes + + +def is_float_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _float_dtypes + + +def is_complex_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _complex_dtypes + + +_complex_to_real_dtype_map = { + torch.complex128: torch.float64, + torch.complex64: torch.float32, + torch.complex32: torch.float16, +} + +_real_to_complex_dtype_map = { + torch.float16: torch.complex32, + torch.bfloat16: torch.complex64, + torch.float32: torch.complex64, + torch.float64: torch.complex128, +} + + +def corresponding_real_dtype(dtype: torch.dtype) -> torch.dtype: + return _complex_to_real_dtype_map[dtype] + + +def corresponding_complex_dtype(dtype: torch.dtype) -> torch.dtype: + return _real_to_complex_dtype_map[dtype] + + +def dtype_to_type(dtype: torch.dtype) -> type: + """ + Computes the corresponding Python type (AKA "type kind") for the + given dtype. + """ + assert isinstance(dtype, torch.dtype) + + if dtype is torch.bool: + return bool + if dtype in _integer_dtypes: + return int + if dtype in _float_dtypes: + return float + if dtype in _complex_dtypes: + return complex + + raise ValueError("Invalid dtype!") + + +_type_to_dtype_map = { + bool: torch.bool, + int: torch.int64, + float: torch.float64, + complex: torch.complex128, +} + + +def type_to_dtype(typ: type) -> torch.dtype: + """ + Computes the corresponding dtype for a Number type. + """ + return _type_to_dtype_map[typ] + + +_ordered_types = (bool, int, float, complex) + + +def get_higher_type(a: type, b: type) -> type: + """ + Returns the higher of the two given Number types. + + The types are ordered bool -> int -> float -> complex. + """ + # Type checking + assert a in _ordered_types + assert b in _ordered_types + + if a is b: + return a + + for typ in _ordered_types: + if a is typ: + return b + if b is typ: + return a + + raise ValueError("Unknown Python scalar type!") + + +# Returns the higher of two torch datatypes a and b or, if the two +# are not ordered relative to each other, the next +# higher datatype +def get_higher_dtype( + a: Optional[Union[torch.dtype, TensorLikeType, NumberType]], + b: Optional[Union[torch.dtype, TensorLikeType, NumberType]], +) -> Optional[torch.dtype]: + """ + Computes the "lowest" datatype that is weakly + "higher" than both a and b. 
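+ For example, torch.float16 and torch.bfloat16 are not ordered relative to each other, so for that pair the result is torch.float32.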
+ """ + + # Type checking + assert a is None or isinstance(a, (torch.dtype, TensorLike, Number)) + assert b is None or isinstance(b, (torch.dtype, TensorLike, Number)) + + def _extract_dtype( + x: Optional[Union[torch.dtype, TensorLikeType, NumberType]] + ) -> Optional[torch.dtype]: + if x is None: + return None + if isinstance(x, torch.dtype): + return x + if isinstance(x, TensorLike): + return x.dtype + if isinstance(x, Number): + return type_to_dtype(type(x)) + + raise RuntimeError("Unexpected type given to _extract_dtype!") + + a, b = _extract_dtype(a), _extract_dtype(b) + + if a is b: + return a + + if a is None: + return b + + if b is None: + return a + + ordered_datatypes = ( + (torch.bool,), + (torch.uint8, torch.int8), + (torch.int16,), + (torch.int32,), + (torch.int64,), + (torch.float16, torch.bfloat16), + (torch.float32,), + (torch.float64,), + (torch.complex32,), + (torch.complex64,), + (torch.complex128,), + ) + + for idx, dtypes in enumerate(ordered_datatypes): + if a in dtypes and b in dtypes: + return ordered_datatypes[idx + 1][0] + if a in dtypes: + return b + if b in dtypes: + return a + + raise RuntimeError("Unexpected termination!") + + +# TODO: maybe unify with can_cast_to? +def is_weakly_lesser_type(a: type, b: type) -> bool: + """ + Compares two types, a and b, returning True if a is weakly "less" than b. + + The comparison is determined by the following type ordering: bool, int, float, complex. + """ + ordered_types = ( + bool, + int, + float, + complex, + ) + + assert a in ordered_types + assert b in ordered_types + + for typ in ordered_types: + if a == typ: + return True + if b == typ: + return False + + raise RuntimeError("Unexpected termination!") + + +def can_safe_cast_to(*, cast_to: torch.dtype, cast_from: torch.dtype) -> bool: + for fn in (is_complex_dtype, is_float_dtype, is_integer_dtype, is_boolean_dtype): + if fn(cast_to): + return True + if fn(cast_from): + return False + + raise ValueError("Received unknown dtypes {0}, {1}!".format(cast_to, cast_from)) + + +def check_same_dtype(*args): + """ + Checks that all Tensors in args have the same device and that all Numbers have the + same corresponding Python type. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensors objects in args have different dtypes + - two Number objects in args have different types + - there are Tensors and Numbers in args, and one of those Tensors corresponding + Python types is different from the type of one of those Numbers + """ + full_dtype = None + scalar_type = None + + for arg in args: + if isinstance(arg, Number): + # Scalar type checking is disabled (and may be removed in the future) + continue + # if scalar_type is None: + # scalar_type = type(arg) + + # if scalar_type is not type(arg): + # msg = ( + # "Scalar of type " + # + str(type(arg)) + # + " is not the expected type of " + # + str(scalar_type) + # + "!" + # ) + # raise RuntimeError(msg) + elif isinstance(arg, TensorLike): + if full_dtype is None: + full_dtype = arg.dtype + if scalar_type is None: + scalar_type = dtype_to_type(arg.dtype) + + if full_dtype is not arg.dtype: + msg = ( + "Tensor with dtype " + + str(arg.dtype) + + " is not the expected dtype of " + + str(full_dtype) + + "!" + ) + raise RuntimeError(msg) + + arg_type = dtype_to_type(arg.dtype) + if arg_type is not scalar_type: + msg = ( + "Tensor with corresponding Python type " + + str(arg_type) + + " is not the expected type of " + + str(scalar_type) + + "!" 
+ ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same dtype, " + str(type(arg)) + "!" + ) + raise RuntimeError(msg) + + +# Maps datatypes to their computation types for elementwise operations +_computation_dtype_map = { + torch.bfloat16: torch.float32, + torch.float16: torch.float32, + torch.complex32: torch.complex64, +} + + +def _get_computation_dtype(dtype: torch.dtype) -> torch.dtype: + return _computation_dtype_map.get(dtype, dtype) + + +class ELEMENTWISE_TYPE_PROMOTION_KIND(Enum): + DEFAULT = (0,) + NO_OPMATH = (1,) + INT_TO_FLOAT = (2,) + ALWAYS_BOOL = (3,) + COMPLEX_TO_FLOAT = (4,) + BOOL_TO_LONG = (5,) + + +# TODO: document type promotion kinds +def elementwise_dtypes( + *_args, + type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND, +) -> Tuple[torch.dtype, torch.dtype]: + """ + Computes the computation and result dtypes for elementwise type promotion + on the given arguments and with the given elementwise type promotion kind. + + Note that not all inputs to an elementwise operation necessarily participate in type promotion. + For example, the "alpha" parameter of torch.add does not participate in type promotion, + although it may be cast to the Python type corresponding to the computation dtype that + the type promotion algorithm determines. + + Default elementwise type promotion, which all other type promotion kinds tweak (see below), + first decides which of four ordered types to use: + + bool -> integer -> floating point -> complex + + The selected type is the "lowest" type in the above list such that all number arguments + have a weakly "lower" type and all tensor arguments have a weakly lower corresponding + type for their dtype. + + Once the type is determined, the particular result dtype is found. 
The dtypes are + partially ordered as follows: + + bool -> uint8, int8 -> int16 -> int32 -> int64 -> + float16, bfloat16 -> float32 -> float64 -> complex32 -> complex64 -> complex128 + + The result dtype is selected by: + - if no tensor's dtype has the same corresponding type as the one selected, + then the result dtype is the (default) dtype corresponding to the selected type + (for example, 1.5 + an integer tensor has a result dtype of the default floating point dtype) + - if the result type is complex then the dtype is: + - the default complex dtype if there are no floating point or complex tensors + - if there are floating point or complex tensors with one or more dimensions, then + the complex dtype corresponding to the highest corresponding complex dtype among those tensors + (for example, double + cfloat -> cdouble) + - if there are only floating point or complex tensors with zero dimensions, then + the complex dtype corresponding to the highest corresponding complex dtype among those tensors + - if the first two cases do not apply, the result dtype is the highest dtype among + all tensors with one or more dimensions of the output type, and if there are no such + tensors then it's the highest dtype among all tensors with zero dimensions of the output type + (for example, long + half -> half, even if the half tensor has zero dimensions) + + The "corresponding complex dtypes" are: + float16 -> complex32 + bfloat16 -> complex64 + float32 -> complex64 + float64 -> complex128 + complex32 -> complex32 + complex64 -> complex64 + complex128 -> complex128 + + The DEFAULT type promotion kind computes per above, and then uses the result dtype to pick a computation + dtype by mapping low precision floating point and complex dtypes as follows: + + float16 -> float32 + bfloat16 -> float32 + complex32 -> complex64 + + This is referred to as "op math", and the NO_OPMATH type promotion kind disables this mapping, making the + computation dtype the same as the result dtype when it's selected. NO_OPMATH is appropriate for kernels + which perform no mathematical operations on their tensors (see below for examples). + + The INT_TO_FLOAT type promotion kind maps boolean and integer maps result dtypes to the default floating point dtype, + and computation dtypes to the appropriate op math dtype. + + The COMPLEX_TO_FLOAT type promotion kind maps complex result dtypes to the corresponding float dtype, following this + mapping: + + complex32 -> float16 + complex64 -> float32 + complex128 -> float64 + + Note that COMPLEX_TO_FLOAT derives the computation dtype as the DEFAULT setting does. + + The BOOL_TO_LONG type promotion kind maps boolean computation and result dtypes to long. + + The ALWAYS_BOOL type promotion kind always sets the result dtype to bool. 
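+ For example, under DEFAULT type promotion, adding a float16 tensor and a Python float gives a float32 computation dtype and a float16 result dtype.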
+ + Example operators for each type promotion option: + DEFAULT : add + NO_OPMATH : where, nextafter, cat + INT_TO_FLOAT : sin + COMPLEX_TO_FLOAT : abs + BOOL_TO_LONG : pow + ALWAYS_BOOL : eq + + """ + + args = tuple(x for x in _args if x is not None) + + highest_type: type = bool + for x in args: + if not isinstance(x, (Number, TensorLike)): + msg = ( + "Unexpected type {0} when computing elementwise type promotion!".format( + str(type(x)) + ) + ) + raise ValueError(msg) + + if isinstance(x, Number): + highest_type = get_higher_type(highest_type, type(x)) + else: + # x is a TensorLike + highest_type = get_higher_type(highest_type, dtype_to_type(x.dtype)) + + result_dtype = None + + def _find_highest_dtype_filtered( + args, filter, *, float_as_complex=False, all_tensors_equal=False + ) -> Optional[torch.dtype]: + zero_dim_tensor_dtype = None + one_plus_dim_tensor_dtype = None + for x in args: + if isinstance(x, TensorLike) and filter(x.dtype): + _dtype = x.dtype + if float_as_complex and is_float_dtype(_dtype): + _dtype = corresponding_complex_dtype(_dtype) + if x.ndim == 0 and not all_tensors_equal: + zero_dim_tensor_dtype = get_higher_dtype( + zero_dim_tensor_dtype, _dtype + ) + else: + # x.ndim > 0 or all_tensors_equal + one_plus_dim_tensor_dtype = get_higher_dtype( + one_plus_dim_tensor_dtype, _dtype + ) + + # Prefers dtype of tensors with one or more dimensions + if one_plus_dim_tensor_dtype is not None: + return one_plus_dim_tensor_dtype + + return zero_dim_tensor_dtype + + if highest_type is float: + result_dtype = _find_highest_dtype_filtered(args, is_float_dtype) + result_dtype = ( + torch.get_default_dtype() if result_dtype is None else result_dtype + ) + elif highest_type is complex: + # NOTE: complex x float type promotion is incorrectly implemented in PyTorch today + # it will treat zero dim and non-zero-dim float and complex tensors equally + # unless there's a non-zero-dim complex tensor + # the following captures this oddity + has_one_plus_dim_complex_tensor = False + for x in args: + if isinstance(x, TensorLike) and x.ndim > 0 and is_complex_dtype(x.dtype): + has_one_plus_dim_complex_tensor = True + break + + if has_one_plus_dim_complex_tensor: + result_dtype = _find_highest_dtype_filtered( + args, + lambda x: is_float_dtype(x) or is_complex_dtype(x), + float_as_complex=True, + ) + else: + # no complex tensors of rank 1+ + # NOTE: bugged case where all tensors are equal + result_dtype = _find_highest_dtype_filtered( + args, + lambda x: is_float_dtype(x) or is_complex_dtype(x), + float_as_complex=True, + all_tensors_equal=True, + ) + + if result_dtype is None: + result_dtype = corresponding_complex_dtype(torch.get_default_dtype()) + elif highest_type is int: + result_dtype = _find_highest_dtype_filtered(args, is_integer_dtype) + result_dtype = torch.long if result_dtype is None else result_dtype + else: + # highest_type is bool + result_dtype = torch.bool + + if type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT: + return _get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH: + return result_dtype, result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT: + if is_integer_dtype(result_dtype) or is_boolean_dtype(result_dtype): + result_dtype = torch.get_default_dtype() + return _get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT: + # NOTE: computation can still occur in a complex dtype + 
computation_dtype = _get_computation_dtype(result_dtype) + if is_complex_dtype(result_dtype): + result_dtype = corresponding_real_dtype(result_dtype) + return computation_dtype, result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG: + if is_boolean_dtype(result_dtype): + return torch.long, torch.long + return _get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL: + return _get_computation_dtype(result_dtype), torch.bool + else: + raise ValueError( + "Unknown type promotion kind {0}".format(str(type_promotion_kind)) + ) + + +def wrap_device(d: Union[str, torch.device]) -> torch.device: + """ + Wraps strings into torch.device objects. + + Given torch.device objects are returned unmodified. + """ + + assert isinstance(d, (str, torch.device)) + if isinstance(d, str): + return torch.device(d) + + return d + + +def make_contiguous_strides_for(shape: ShapeType) -> Tuple[int, ...]: + validate_shape(shape) + if not shape: + return () + + multiplier = 1 + strides = [] + for l in reversed(shape): + if l != 0: + strides.append(multiplier) + multiplier = l * multiplier + else: + strides.append(multiplier) + + result = tuple(reversed(strides)) + return result + + +def compute_reduction_output_shape( + shape: ShapeType, dimensions: Sequence +) -> Tuple[int, ...]: + for idx in dimensions: + validate_idx(len(shape), idx) + + new_shape = [] + for idx in range(len(shape)): + if idx in dimensions: + continue + + new_shape.append(shape[idx]) + + return tuple(new_shape) + + +def validate_no_repeating_dims(dims: Sequence): + if len(dims) != len(set(dims)): + raise RuntimeError("duplicate value in the list of dims") + + +def reduction_dims(shape: ShapeType, dims: Optional[Sequence]) -> Tuple[int, ...]: + if dims is None: + return tuple(range(len(shape))) + dims = tuple(canonicalize_dim(len(shape), idx) for idx in dims) + validate_no_repeating_dims(dims) + return dims + + +def check_in_bounds_for_storage( + a: torch._TypedStorage, shape: ShapeType, strides: StrideType, storage_offset: int +): + """ + Determines if the given shape, strides, and offset are valid for the given storage. 
+ """ + + # Short-circuits if the shape has no elements + if reduce(operator.mul, shape) == 0: + return + + length = a.size() - storage_offset + max_offset = 0 + for x, y in zip(shape, strides): + max_offset = max_offset + (x - 1) * y + + if max_offset >= length: + required_length = max_offset + storage_offset + msg = ( + "Can't view a storage of size {0} with an offset of {1}, shape of {2}, and strides of {3}, " + "which requires a storage of size {4}".format( + a.size(), storage_offset, str(shape), str(strides), required_length + ) + ) + raise ValueError(msg) diff --git a/torch/_prims/wrappers.py b/torch/_prims/wrappers.py new file mode 100644 index 000000000000..a4c358954fec --- /dev/null +++ b/torch/_prims/wrappers.py @@ -0,0 +1,195 @@ +import torch +import torch._prims as prims +from torch._prims.utils import ( + Number, + NumberType, + TensorLike, + TensorLikeType, + ELEMENTWISE_TYPE_PROMOTION_KIND, +) +import torch._prims.utils as utils +from torch.utils._pytree import tree_flatten + +from typing import Callable, Sequence, Union +import inspect +from functools import wraps, reduce +import operator +import warnings +from itertools import chain + +# TODO: implement ref.cast with an option to enforce safe casting +def _maybe_convert_to_dtype( + a: Union[TensorLikeType, NumberType, Sequence], dtype: torch.dtype +) -> Union[TensorLikeType, NumberType, Sequence]: + if isinstance(a, TensorLike): + if a.dtype != dtype: + # NOTE: this is incorrect on the CPU + # See https://github.com/pytorch/pytorch/issues/77553 + return prims.convert_element_type(a, dtype) + return a + if isinstance(a, Number): + return utils.dtype_to_type(dtype)(a) + if isinstance(a, Sequence): + return tuple(_maybe_convert_to_dtype(x, dtype) for x in a) + + raise ValueError( + "Received type {0} that is neither a tensor or a number!".format(type(a)) + ) + + +def _maybe_convert_to_type(a: NumberType, typ: type) -> NumberType: + if not isinstance(a, Number): + msg = "Found unknown type {0} when trying to convert scalars!".format(type(a)) + raise ValueError(msg) + if not utils.is_weakly_lesser_type(type(a), typ): + msg = "Scalar {0} of type {1} cannot be safely cast to type {2}!".format( + a, type(a), typ + ) + raise ValueError(msg) + + return typ(a) + + +def _annotation_has_type(*, typ, annotation): + if hasattr(annotation, "__args__"): + for a in annotation.__args__: + if _annotation_has_type(typ=typ, annotation=a): + return True + return False + + return typ is annotation + + +class elementwise_type_promotion_wrapper(object): + """ + Adds elementwise type promotion to a Python reference implementation. + + Takes two kwargs, type_promoting_args and type_promotion_kind. + + type_promoting_args must be a string Sequence specifiying the argument names of all + arguments that participate in type promotion (and should be type promoted). If the + arg specifies a Sequence-type then every element of the Sequence will participate in + type promotion. + + type_promotion_kind must be one of the kinds specified by ELEMENTWISE_TYPE_PROMOTION_KIND. + See its documentation for details. + + Other type promotion behavior, like validating the Python type of scalar arguments, must + be handled separately. 
+ """ + + def __init__( + self, + *, + type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND, + type_promoting_args: Sequence[str] = None, + ): + self.type_promoting_arg_names = type_promoting_args + self.type_promotion_kind = type_promotion_kind + + def __call__(self, fn: Callable) -> Callable: + sig = inspect.signature(fn) + + @wraps(fn) + def _fn(*args, **kwargs): + bound = sig.bind(*args, **kwargs) + type_promoting_args = tuple( + bound.arguments[x] + for x in self.type_promoting_arg_names # type: ignore[union-attr] + if x in bound.arguments.keys() + ) + + flattened_type_promoting_args = tree_flatten(type_promoting_args)[0] + compute_dtype, result_dtype = utils.elementwise_dtypes( + *flattened_type_promoting_args, + type_promotion_kind=self.type_promotion_kind, + ) + + promoted_args = { + x: _maybe_convert_to_dtype(bound.arguments[x], compute_dtype) + for x in self.type_promoting_arg_names # type: ignore[union-attr] + if x in bound.arguments.keys() + } + bound.arguments.update(promoted_args) + + result = fn(**bound.arguments) + + # FIXME?: assumes result is a single tensor + assert isinstance(result, TensorLike) + return _maybe_convert_to_dtype(result, result_dtype) + + _fn.__signature__ = sig # type: ignore[attr-defined] + return _fn + + +# TODO: handle tuples of tensors +def _maybe_resize_out(out: TensorLikeType, shape): + if out.numel() == 0: + return prims.resize(out, shape) + + if out.numel() != reduce(operator.mul, shape, 1): + msg = ( + "An output with one or more elements was resized since it had shape {0} " + "which does not match the required output shape {1}. " + "This behavior is deprecated, and in a future PyTorch release outputs will not " + "be resized unless they have zero elements. " + "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0).".format( + str(out.shape), str(shape) + ) + ) + warnings.warn(msg) + return prims.resize(out, shape) + + return out + + +def _safe_copy_out(*, copy_from: TensorLikeType, copy_to: TensorLikeType): + # Checks same device + if copy_from.device != copy_to.device: + msg = "Attempting to copy from device {0} to device {1}, but cross-device copies are not allowed!".format( + copy_from.device, copy_to.device + ) + raise RuntimeError(msg) + + # Checks safe cast + if not utils.can_safe_cast_to(cast_from=copy_from.dtype, cast_to=copy_to.dtype): + msg = "Attempting to cast from {0} to out tensor with dtype {1}, but this can't be cast because it is not safe!".format( + copy_from.dtype, copy_to.dtype + ) + raise RuntimeError(msg) + + return prims.copy_to(copy_to, copy_from) + + +# FIXME: only supports single tensor out +def out_wrapper(fn: Callable) -> Callable: + """ + Adds the out parameter to a Python reference. + + Note that this currently only supports operations that return a single tensor. 
+ """ + + @wraps(fn) + def _fn(*args, out=None, **kwargs): + result = fn(*args, **kwargs) + if out is not None: + assert isinstance(out, TensorLike) + out = _maybe_resize_out(out, result.shape) + return _safe_copy_out(copy_from=result, copy_to=out) # type: ignore[arg-type] + return out + return result + + sig = inspect.signature(fn) + out_param = inspect.Parameter( + "out", + kind=inspect.Parameter.KEYWORD_ONLY, + default=None, + annotation=TensorLikeType, + ) + params = chain(sig.parameters.values(), (out_param,)) + _fn.__signature__ = inspect.Signature( # type: ignore[attr-defined] + parameters=params, return_annotation=sig.return_annotation # type: ignore[arg-type] + ) + _fn.__annotations__ = fn.__annotations__ + _fn.__annotations__["out"] = TensorLikeType + return _fn diff --git a/torch/_python_dispatcher.py b/torch/_python_dispatcher.py index aa19a18efb3b..ee2c7d279458 100644 --- a/torch/_python_dispatcher.py +++ b/torch/_python_dispatcher.py @@ -15,13 +15,13 @@ - CPU/AutogradCPU: represents in-tree backends which we usually have dedicated inference & autograd kernel in pytorch core library. E.g. CPU, CUDA -- QuantizedCPU/AutogradOther: represents in-tree backends which we usually have backend specific +- FPGA/AutogradOther: represents in-tree backends which we usually have backend specific inference kernels, but they share the same autograd kernel specified in AutogradOther. - E.g. QuantizedCPU, QuantizedCUDA + E.g. FPGA, SparseCsrCPU - XLA/AutogradXLA: represents out-of-tree backends which we don't have either inference or autograd kernel defined in pytorch core library. Backend owner is responsible for registering both inference & autograd kernels in their extensions(e.g. torch-xla) for the operators they support. - E.g. XLA, XPU, MLC + E.g. XLA, XPU, MPS - CompositeExplicitAutograd: alias key mapped to inference kernels of all backends like CPU, CUDA, XLA etc. Kernels registered to this key MUST work for inference for all backends. - Autograd: alias key mapped to autograd of all backends like AutogradCPU, AutogradXLA, AutogradOther. @@ -53,7 +53,7 @@ class PythonDispatcher: name = "foo" runtime_keys = [ "CPU", "AutogradCPU", - "QuantizedCPU", "AutogradOther", + "FPGA", "AutogradOther", "XLA", "AutogradXLA", "Lazy", "AutogradLazy", ] @@ -66,7 +66,7 @@ class PythonDispatcher: def __init__(self): C._dispatch_check_invariants(self.name) # type: ignore[attr-defined] - self.ref = C._dispatch_library("FRAGMENT", self.namespace, "") # type: ignore[attr-defined] + self.ref = C._dispatch_library("FRAGMENT", self.namespace, "") self.ref.def_("foo(Tensor x) -> Tensor") """ diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py new file mode 100644 index 000000000000..894baf3605bc --- /dev/null +++ b/torch/_refs/__init__.py @@ -0,0 +1,1448 @@ +import torch + +import torch._prims as prims +import torch._prims.utils as utils +from torch._prims.utils import ( + DimsType, + ShapeType, + StrideType, + TensorLike, + TensorLikeType, + DimsSequenceType, + TensorSequenceType, + Number, + NumberType, + ELEMENTWISE_TYPE_PROMOTION_KIND, +) +from torch._prims.wrappers import ( + elementwise_type_promotion_wrapper, + out_wrapper, + _maybe_convert_to_dtype, + _maybe_resize_out, +) + +from functools import reduce +from typing import Sequence, Optional, Union, Callable, List, Tuple +import operator +import warnings +import math +from enum import Enum + +# Experimental module containing prototype Python references for existing +# PyTorch operations. 
+ +__all__ = [ + # + # Elementwise Unary References + # + "abs", + "acos", + "acosh", + "asin", + "atan", + # "bessel_i0e", # special.i0e + # "bessel_i1e", # special.i1e + # "cbrt", # No corresponding torch operation + "ceil", + "cos", + "cosh", + "digamma", + "erf", + "erfinv", + "erfc", + "exp", + "expm1", + "floor", + "isfinite", + "isnan", + "lgamma", + "log", + "log1p", + "neg", + "reciprocal", + "round", # TODO: model kwargs + "sign", + "sin", + "sinh", + "sqrt", + "square", + "tan", + # + # Elementwise Binary References + # + "add", + "atan2", + "bitwise_and", + "bitwise_left_shift", + "bitwise_or", + "bitwise_right_shift", + "bitwise_xor", + # "complex", + # 'copysign', # where + # 'div', # need to implement all rounding modes first + "eq", + "float_power", + # 'floor_divide', # requires floor + # 'fmax', # requires where + # 'fmod', + # 'gcd', + "ge", + "gt", + # 'heaviside', + # 'hypot', + "igamma", + "igammac", + "isclose", + # 'lcm', + # 'ldexp', + "le", + "logical_and", + "logical_or", + # 'logical_xor', + "lt", + # 'max', # implement with reductions + "maximum", + # 'min', # implement with reductions + "minimum", + "mul", + "ne", + "nextafter", + # 'polar', # abs, cos, sin + "pow", + # 'remainder', + # 'rsub', # unblocked + # # special.xlog1py + # # special.zeta + "sub", + "true_divide", + # 'xlogy', # where?, log, mul + # + # Conditional references + # + "where", # TODO: add opinfo + # + # Data conversion and movement references + # + "clone", + "copy_to", # TODO: add opinfo + # + # Reduction ops + # + "sum", + "amax", + "amin", + # + # View & Shape Ops + # + "as_strided", + "cat", + "chunk", + "flatten", + "flip", + "narrow", + "permute", + "reshape", + "stack", + "swap_axes", # alias for transpose + "squeeze", + "tensor_split", + "transpose", + "unsqueeze", + "view", + # + # Tensor Creation + # + "empty", + "empty_like", + "full", + "full_like", + "ones_like", +] + +Tensor = torch.Tensor + + +class REDUCTION_OUTPUT_TYPE_KIND(Enum): + SAME = (0,) + SAME_OR_REAL = (1,) # for complex types outputs corresponding real type + OP_MATH = (2,) # keep output in opmath type, needed for mean + ALWAYS_BOOL = (3,) + + +def _broadcast_shapes(*_shapes): + shapes = tuple(filter(lambda x: x is not None, _shapes)) + + # Short-circuits on no input + if len(shapes) == 0: + return None + + # Type checking + # TODO: make common validations available as utils + for shape in shapes: + assert isinstance(shape, Sequence) + + # Computes common shape + common_shape = [ + 1, + ] * reduce(max, (len(shape) for shape in shapes)) + for shape in shapes: + for idx in range(-1, -1 - len(shape), -1): + if common_shape[idx] == 1: + if shape[idx] < 0: + raise ValueError( + "Attempting to broadcast a dimension with negative length!" 
+ ) + common_shape[idx] = shape[idx] + elif shape[idx] != 1: + if common_shape[idx] != shape[idx]: + raise RuntimeError( + "Attempting to broadcast a dimension of length ", + str(shape[idx]), + "!", + ) + + return common_shape + + +def _maybe_broadcast(*args, preserve_cpu_scalar_tensors=True): + # Computes common shape + common_shape = _broadcast_shapes( + *map(lambda t: t.shape if isinstance(t, TensorLike) else None, args) + ) + + def __maybe_broadcast(x, shape): + if x is None: + return None + elif isinstance(x, Number): + return x + elif isinstance(x, TensorLike): + if preserve_cpu_scalar_tensors and utils.is_cpu_scalar_tensor(x): + return x + + if tuple(x.shape) != common_shape: + common_rank = len(common_shape) + 1 + start = common_rank - (len(x.shape) + 1) + dims = tuple(range(start, len(x.shape) + start)) + return prims.broadcast_in_dim(x, common_shape, dims) + else: + raise RuntimeError( + "Unexpected type when broadcasting: " + str(type(x)) + "!" + ) + + return tuple(__maybe_broadcast(x, common_shape) for x in args) + + +# Utilities should come BEFORE this import +from torch._decomp import register_decomposition + +# +# Elementwise unary references +# + +infer_aten_op = object() + +# TODO: add type promotion support +def _make_elementwise_unary_reference( + prim: Callable, + *, + type_promotion_kind, + aten_op=infer_aten_op, + disable_meta=False, + extra_meta=None, +) -> Callable: + @out_wrapper + @elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=type_promotion_kind, + ) + def _ref(a: TensorLikeType) -> TensorLikeType: + if not isinstance(a, TensorLike): + raise RuntimeError( + "Expected a tensor input for an elementwise unary operation!" + ) + + if extra_meta is not None: + extra_meta(a) + + return prim(a) + + if aten_op is infer_aten_op: + aten_op = getattr(torch.ops.aten, prim.__name__.split(".")[0]) + if aten_op is not None: + register_decomposition(aten_op, disable_meta=disable_meta)(_ref) + + return _ref + + +abs = _make_elementwise_unary_reference( + prims.abs, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) + +acos = _make_elementwise_unary_reference( + prims.acos, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +acosh = _make_elementwise_unary_reference( + prims.acosh, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +asin = _make_elementwise_unary_reference( + prims.asin, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +atan = _make_elementwise_unary_reference( + prims.atan, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +ceil = _make_elementwise_unary_reference( + prims.ceil, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +cos = _make_elementwise_unary_reference( + prims.cos, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +cosh = _make_elementwise_unary_reference( + prims.cosh, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +digamma = _make_elementwise_unary_reference( + prims.digamma, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +erf = _make_elementwise_unary_reference( + prims.erf, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +erfinv = _make_elementwise_unary_reference( + prims.erf_inv, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=torch.ops.aten.erfinv, # prim/aten name mismatch +) + +erfc = _make_elementwise_unary_reference( + prims.erfc, + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +exp = _make_elementwise_unary_reference( + prims.exp, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +expm1 = _make_elementwise_unary_reference( + prims.expm1, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +floor = _make_elementwise_unary_reference( + prims.floor, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _isfinite(a: TensorLikeType) -> TensorLikeType: + if utils.is_float_dtype(a.dtype) or utils.is_complex_dtype(a.dtype): + return prims.is_finite(a) + + return ones_like(a, dtype=torch.bool) + + +isfinite = _make_elementwise_unary_reference( + _isfinite, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=None, # CompositeImplicitAutograd +) + + +def _isnan(a: TensorLikeType) -> TensorLikeType: + return prims.ne(a, a) + + +isnan = _make_elementwise_unary_reference( + _isnan, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=torch.ops.aten.isnan, # prim/aten name mismatch +) + +lgamma = _make_elementwise_unary_reference( + prims.lgamma, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +log = _make_elementwise_unary_reference( + prims.log, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +log1p = _make_elementwise_unary_reference( + prims.log1p, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + + +def _neg_meta(a: TensorLikeType): + if a.dtype is torch.bool: + msg = "neg is not supported on bool tensors." + raise RuntimeError(msg) + + +neg = _make_elementwise_unary_reference( + prims.neg, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + extra_meta=_neg_meta, +) + +reciprocal = _make_elementwise_unary_reference( + prims.reciprocal, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +# TODO: round takes additional kwargs +round = _make_elementwise_unary_reference( + prims.round, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + aten_op=None, # TODO: this does need a decomp, but kwarg handling is needed +) + +sign = _make_elementwise_unary_reference( + prims.sign, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +sin = _make_elementwise_unary_reference( + prims.sin, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +sinh = _make_elementwise_unary_reference( + prims.sinh, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +sqrt = _make_elementwise_unary_reference( + prims.sqrt, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + +square = _make_elementwise_unary_reference( + prims.square, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG, + aten_op=None, # CompositeImplicitAutograd, +) + +tan = _make_elementwise_unary_reference( + prims.tan, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) + + +def _make_elementwise_binary_reference( + prim: Callable, + *, + type_promotion_kind, + aten_op=infer_aten_op, + has_out=True, + supports_lhs_python_scalar=True, + supports_rhs_python_scalar=True, + disable_meta=False, +) -> Callable: + @elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=type_promotion_kind, + ) + def _ref( + a: Union[Tensor, NumberType], + b: Union[Tensor, NumberType], + ) -> Tensor: + if not supports_lhs_python_scalar and isinstance(a, Number): + raise ValueError( + "Received a lhs Python scalar to 
an elementwise binary operation that does not accept lhs scalars!" + ) + + if not supports_rhs_python_scalar and isinstance(b, Number): + raise ValueError( + "Received a rhs Python scalar to an elementwise binary operation that does not accept rhs scalars!" + ) + + # TODO: enable this for operations that support it, like add + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!" + ) + + a, b = _maybe_broadcast(a, b) + return prim(a, b) + + if has_out: + _ref = out_wrapper(_ref) + + if aten_op is infer_aten_op: + aten_op = getattr(torch.ops.aten, prim.__name__.split(".")[0]) + if aten_op is not None: + register_decomposition(aten_op, disable_meta=disable_meta)(_ref) + + return _ref + + +# Add has its own implementation because it has an alpha argument +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def add( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + *, + alpha: Optional[NumberType] = None, +): + """ + Reference implementation of torch.add + """ + + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!" + ) + + a, b = _maybe_broadcast(a, b) + + if alpha is not None: + dtype = a.dtype if isinstance(a, TensorLike) else b.dtype # type: ignore[union-attr] + python_type = utils.dtype_to_type(dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = ( + "alpha argument of type {0} cannot be safely cast to type {1}!".format( + type(alpha), python_type + ) + ) + raise ValueError(msg) + b = prims.mul(b, alpha) + + return prims.add(a, b) + + +# TODO: add docstring +atan2 = _make_elementwise_binary_reference( + prims.atan2, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + +# TODO: add docstring +bitwise_and = _make_elementwise_binary_reference( + prims.bitwise_and, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +bitwise_left_shift = _make_elementwise_binary_reference( + prims.shift_left, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + aten_op=torch.ops.aten.bitwise_left_shift, # prim/aten name mismatch +) + +# TODO: add docstring +bitwise_or = _make_elementwise_binary_reference( + prims.bitwise_or, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +bitwise_right_shift = _make_elementwise_binary_reference( + prims.shift_right_arithmetic, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + aten_op=torch.ops.aten.bitwise_right_shift, # prim/aten name mismatch +) + +# TODO: add docstring +bitwise_xor = _make_elementwise_binary_reference( + prims.bitwise_xor, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +# complex = _make_elementwise_binary_reference(prims.complex, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) + +# TODO: add docstring +eq = _make_elementwise_binary_reference( + prims.eq, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +# Float power has its own implementation because it has unique type promotion. 
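+# Editor's illustrative note: float_power always promotes to double precision, so two
+# float32 inputs produce a float64 result (and complex inputs produce complex128),
+# unlike pow, which keeps the promoted input dtype.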
+# NB: aten_op not registered because CompositeExplicitAutograd +@out_wrapper +def float_power( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], +) -> Tensor: + + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!" + ) + + # Handles type promotion + dtype = utils.get_higher_dtype(a, b) + assert dtype is not None + if utils.is_complex_dtype(dtype): + dtype = torch.complex128 + else: + dtype = torch.float64 + + # Float power has the following contiguous cast behavior to be + # consistent with its C++ impl + if isinstance(a, TensorLike) and a.dtype != dtype: + a = prims.to_dtype(a, dtype) + if isinstance(b, TensorLike) and b.dtype != dtype: + b = prims.to_dtype(b, dtype) + + a, b = _maybe_broadcast(a, b) + return prims.pow(a, b) + + +# TODO: add docstring +ge = _make_elementwise_binary_reference( + prims.ge, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +gt = _make_elementwise_binary_reference( + prims.gt, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +igamma = _make_elementwise_binary_reference( + prims.igamma, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + +igammac = _make_elementwise_binary_reference( + prims.igammac, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + + +def isclose( + a: TensorLikeType, + b: TensorLikeType, + rtol: float = 1e-05, + atol: float = 1e-08, + equal_nan: bool = False, +) -> TensorLikeType: + if a.dtype != b.dtype: + msg = "Attempting to compare tensors of different dtypes {0} and {1}!".format( + a.dtype, b.dtype + ) + raise ValueError(msg) + if rtol < 0: + msg = "rtol must be greater than or equal to zero, but got {0}!".format(rtol) + raise ValueError(msg) + if atol < 0: + msg = "atol must be greater than or equal to zero, but got {0}!".format(atol) + raise ValueError(msg) + + close = eq(a, b) + if equal_nan and (utils.is_float_dtype(a.dtype) or utils.is_complex_dtype(a.dtype)): + close = logical_or(close, logical_and(isnan(a), isnan(b))) + + # Note: In case of zero tolerances the closeness inequality degenerates to an equality check. + # In this case, the short-circuit prevents false positives as detailed in the paragraph below. + if atol == 0 and rtol == 0: + return close + + # Note [closeness error computation] + # atol and rtol are provided as doubles, so the computation + # rtol * other will produce a float or complex tensor. + # When the difference (self - other) is compared to it then the + # tensor representing the difference will also be cast to float or complex. + # However, since (self - other) in uint8 is very likely to produce a + # negative value, this moves the cast forward so the difference is + # always computed in a float or complex type. + # If the values of the integer tensors cannot be exactly represented + # by the default scalar type then this may cause an incorrect result.
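+    # Editor's illustrative example: for uint8 inputs a=0 and b=1, computing (a - b)
+    # directly in uint8 would wrap around to 255, so both inputs are cast to the
+    # default floating point dtype first and the difference is computed as -1.0.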
+ if not utils.is_float_dtype(a.dtype) and not utils.is_complex_dtype(a.dtype): + a = prims.convert_element_type(a, torch.get_default_dtype()) + b = prims.convert_element_type(b, torch.get_default_dtype()) + + allowed_error = add(atol, abs(mul(b, rtol))) + actual_error = abs(sub(a, b)) + + # Computes finite closeness + result = logical_or( + close, logical_and(isfinite(actual_error), le(actual_error, allowed_error)) + ) + + return result + + +# TODO: add docstring +le = _make_elementwise_binary_reference( + prims.le, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + + +def _logical_and(a: TensorLikeType, b: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + a = ne(a, 0) + if not utils.is_boolean_dtype(b.dtype): + b = ne(b, 0) + return bitwise_and(a, b) + + +logical_and = _make_elementwise_binary_reference( + _logical_and, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=torch.ops.aten.logical_and, +) + + +def _logical_or(a: TensorLikeType, b: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + a = ne(a, 0) + if not utils.is_boolean_dtype(b.dtype): + b = ne(b, 0) + return bitwise_or(a, b) + + +logical_or = _make_elementwise_binary_reference( + _logical_or, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=torch.ops.aten.logical_or, +) + +# TODO: add docstring +lt = _make_elementwise_binary_reference( + prims.lt, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +maximum = _make_elementwise_binary_reference( + prims.maximum, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +minimum = _make_elementwise_binary_reference( + prims.minimum, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +mul = _make_elementwise_binary_reference( + prims.mul, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: add docstring +ne = _make_elementwise_binary_reference( + prims.ne, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) + +# TODO: add docstring +nextafter = _make_elementwise_binary_reference( + prims.nextafter, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) + +# TODO: add docstring +pow = _make_elementwise_binary_reference( + prims.pow, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG, +) + +# TODO: add docstring +# TODO: consider refactoring this with add impl +# sub has its own implementation because it has an alpha argument +@register_decomposition(torch.ops.aten.sub) +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def sub( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + *, + alpha: Optional[NumberType] = None, +): + """ + Reference implementation of torch.sub + """ + + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!"
+ ) + + a, b = _maybe_broadcast(a, b) + + if alpha is not None: + dtype = a.dtype if isinstance(a, TensorLike) else b.dtype # type: ignore[union-attr] + python_type = utils.dtype_to_type(dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = ( + "alpha argument of type {0} cannot be safely cast to type {1}!".format( + type(alpha), python_type + ) + ) + raise ValueError(msg) + b = prims.mul(b, alpha) + + return prims.sub(a, b) + + +# TODO: add docstring +true_divide = _make_elementwise_binary_reference( + prims.div, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=None, # CompositeImplicitAutograd +) + +# +# Conditional references +# + +# https://pytorch.org/docs/stable/generated/torch.where.html +# TODO: implement alternate where +@register_decomposition(torch.ops.aten.where) +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, +) +def where( + pred: Tensor, + a: Optional[Union[TensorLikeType, NumberType]] = None, + b: Optional[Union[TensorLikeType, NumberType]] = None, +): + """ """ + + if a is None or b is None: + raise NotImplementedError + + pred, a, b = _maybe_broadcast(pred, a, b) + return prims.select(pred, a, b) + + +# +# Data Movement References +# +def clone( + a: TensorLikeType, *, memory_format: torch.memory_format = torch.preserve_format +) -> TensorLikeType: + + return prims.clone(a, memory_format=memory_format) + + +def copy_to(a: Tensor, b: Tensor, *, allow_cross_device=True): + if not allow_cross_device and a.device != b.device: + msg = "Attempting to copy from device {0} to device {1}, but cross-device copies are not allowed!".format( + b.device, a.device + ) + raise RuntimeError(msg) + + return prims.copy_to(a, b) + + +# +# Reduction references +# + + +def _reduction( + a: Tensor, + prim: Callable, + *, + has_identity: bool = True, + accepts_dim_tuple: bool = True, # to handle min/argmin that accept single dim only + dims: Optional[DimsType] = None, + keepdims: bool = False, + dtype: Optional[torch.dtype] = None, # should be specified for ops that support it + out: Optional[Tensor] = None, + output_dtype_kind: REDUCTION_OUTPUT_TYPE_KIND, +): # it is usually SAME, but I want + # ref writers to actually think about what to put here + assert isinstance(a, TensorLike) + if out is not None: + assert isinstance(out, TensorLike) + if dtype is not None: + # TODO - this is true for eager mode currently, but it's wrong behavior for complex norms + if dtype != out.dtype: + raise RuntimeError( + "dtype argument and out dtype must match in reduction" + ) + if not accepts_dim_tuple: + assert dims is None or isinstance(dims, int) + if isinstance(dims, int): + dims = (dims,) # type: ignore[assignment] + dims = utils.reduction_dims(a.shape, dims) + if not has_identity: + valid_shape = all(a.shape[i] for i in range(a.ndim) if i in dims) + if not valid_shape: + raise RuntimeError( + "reducing over zero-size dimension for reduction operation without identity" + ) + # even though some reductions, like amin or amax, don't strictly require type promotion, + # all the math ops (including comparisons) are still defined only for a computation type, + # so promotion will still happen. 
We are doing it explicitly here + inp_dtype = dtype if dtype is not None else a.dtype + computation_dtype = utils._get_computation_dtype(inp_dtype) + a_converted = prims.convert_element_type(a, computation_dtype) + result = prim(a_converted, dims) + + if keepdims: + output_shape = [a.shape[i] if i not in dims else 1 for i in range(a.ndim)] + broadcast_dims = [i for i in range(a.ndim) if i not in dims] + result = prims.broadcast_in_dim(result, output_shape, broadcast_dims) + if out is not None: + if dtype is None: + if output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.SAME: + if out.dtype != a.dtype: + raise RuntimeError("Expected the dtype for input and out to match") + elif output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.ALWAYS_BOOL: + if out.dtype != torch.bool: + raise RuntimeError("Expected the dtype for input and out to match") + out = _maybe_resize_out(out, result.shape) + return copy_to(out, result, allow_cross_device=False) # type: ignore[arg-type] + + if output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.SAME: + result_dtype = dtype if dtype else a.dtype + if result.dtype != result_dtype: + result = prims.convert_element_type(result, result_dtype) + return result + + +# TODO: register decomp after stride logic is fixed +def sum( + a: Tensor, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + dtype=None, + out: Optional[Tensor] = None, +): + if dtype is None: + if utils.is_boolean_dtype(a.dtype) or utils.is_integer_dtype(a.dtype): + dtype = torch.int64 + else: + dtype = a.dtype + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + return _reduction( + a, + prims.sum, + dims=dim, + keepdims=keepdim, + dtype=dtype, + out=out, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def amin( + a: Tensor, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + out: Optional[Tensor] = None, +): + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + + if a.ndim > 64: + raise RuntimeError( + "Received a tensor with {0} dimensions, but only tensors with up to 64 dims are supported!".format( + a.ndim + ) + ) + + return _reduction( + a, + prims.amin, + dims=dim, + keepdims=keepdim, + dtype=None, + out=out, + has_identity=False, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def amax( + a: Tensor, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + out: Optional[Tensor] = None, +): + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + + if a.ndim > 64: + raise RuntimeError( + "Received a tensor with {0} dimensions, only tensors with up to 64 dims are supported!".format( + a.ndim + ) + ) + + return _reduction( + a, + prims.amax, + dims=dim, + keepdims=keepdim, + dtype=None, + out=out, + has_identity=False, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def as_strided( + a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int = 0 +) -> TensorLikeType: + return prims.as_strided(a, size, stride, storage_offset) + + +@out_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("tensors",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, +) +def cat(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType: + _dim = utils.canonicalize_dims(tensors[0].ndim, dim) + return prims.concatenate(tensors, _dim) + + +def chunk(a: TensorLikeType, chunks: int, dim: int = 0) -> Tuple[TensorLikeType, 
...]: + if chunks <= 0: + msg = "Expected at least one chunk, but got {0}!".format(chunks) + raise ValueError(msg) + + dim = utils.canonicalize_dim(a.ndim, dim) + length = a.shape[dim] + chunk_size = math.ceil(length / chunks) + full_chunks = math.floor(length / chunk_size) + tail_chunk_size = length % chunk_size + + result = [] + for i in range(full_chunks): + result.append(narrow(a, dim, i * chunk_size, chunk_size)) + + if tail_chunk_size != 0: + result.append(narrow(a, dim, full_chunks * chunk_size, tail_chunk_size)) + + return tuple(result) + + +# Note: flatten, unlike prim.collapse and prim.collapse_view has an inclusive end_dim +# Note: flatten, unlike other shape operators, returns the input tensor on a no-op (unless +# a 0D tensor is flattened, in which case it's returned in 1D) +def flatten(a: TensorLikeType, start_dim: int = 0, end_dim: int = -1) -> TensorLikeType: + start_dim = utils.canonicalize_dim(a.ndim, start_dim) + end_dim = utils.canonicalize_dim(a.ndim, end_dim) + + # Short-circuits on no-op + if start_dim == end_dim and a.ndim != 0: + return a + + # Tries to take a view + # TODO: we could look at directing collapse_view to skip its meta function here (unsafe_collapse_view) + new_shape, new_strides = prims._collapse_view_helper(a, start_dim, end_dim + 1) + if new_shape is not None: + return prims.collapse_view(a, start_dim, end_dim + 1) + + # Makes a copy if it can't make a view + return prims.collapse(a, start_dim, end_dim + 1) + + +@register_decomposition(torch.ops.aten.flip) +def flip(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + if not isinstance(dims, tuple) and not isinstance(dims, list): + raise ValueError("dims has to be a sequence of ints") + dims = utils.canonicalize_dims(a.ndim, dims) # type: ignore[assignment] + utils.validate_no_repeating_dims(dims) + return prims.rev(a, dims) + + +def narrow(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType: + dim = utils.canonicalize_dim(a.ndim, dim) + return prims.slice_in_dim(a, start, start + length, axis=dim) + + +def permute(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + _permutation = utils.canonicalize_dims(a.ndim, dims) + return prims.transpose(a, _permutation) + + +def _reshape_view_helper( + a: TensorLikeType, shape: ShapeType, *, allow_copy: bool +) -> TensorLikeType: + # NOTE: Reshape may be given a shape with a -1 length + # This indicates that the dimension's length should be inferred + # Creates a valid shape + + for idx in range(len(shape)): + if shape[idx] == -1: + # Verifies there's only one dimension of length -1 in the shape + if shape.count(-1) > 1: + msg = "Can only infer the length of one dimension, but got shape {0}!".format( + str(shape) + ) + raise ValueError(msg) + + # TODO: improve error message + if a.numel() > 0: + length = reduce( + operator.floordiv, (x for x in shape if x != -1), a.numel() + ) + else: + msg = "Cannot reshape a tensor of zero elements into shape {0} because the unspecified length is ambiguous!".format( + str(shape) + ) + raise ValueError(msg) + + shape = list(shape) + shape[idx] = length + break + + # Short-circuits if shape is the same + utils.validate_shape(shape) + if tuple(a.shape) == tuple(shape): + return prims.view_of(a) + + numel = reduce(operator.mul, shape) if len(shape) > 0 else 1 + if a.numel() != numel: + msg = "Attempting to reshape a tensor with shape {0} and {1} elements to a shape {2} with {3} elements!".format( + str(a.shape), a.numel(), str(shape), numel + ) + raise ValueError(msg) + + # 
Special-cases tensors with no elements + if a.numel() == 0: + return as_strided(a, shape, utils.make_contiguous_strides_for(shape)) + + # Special-cases reshaping zero dim tensors + if a.ndim == 0: + _a = a + for length in shape: + assert length == 1 + _a = unsqueeze(_a, -1) + return _a + + # Special-cases reshaping to zero dim tensors + if len(shape) == 0: + _a = a + for length in a.shape: + assert length == 1 + _a = squeeze(_a, -1) + return _a + + # Handles general case: a 1+D tensor reshaped into a distinct 1+D shape + + # NOTE [Reshape Algorithm] + # This algorithm works by attempting to greedily construct the desired dimensions in + # the output shape, left to right. It does this by, conceptually, accumulating + # dimensions of the original tensor, also left to right, until the dimension + # can be constructed using prims.split_dim. + # The algorithm also has special handling for tail squeezes/unsqueezes, like + # if a reshape from (5, 5) to (5, 5, 1) or vice versa. + # + # This algorithm does not flatten the original tensor and then split dims as appropriate + # because that would create copies more often than this algorithm. flatten is the only + # operation below which can create a view or a copy, and while it prefers creating + # views it may sometimes create a copy if the tensor's strides do not permit a view. + # As a result, this algorithm tries to minimize flattening. + # + # Note that a better version of this algorithm may exist. Regions which could be + # flattened without creating a copy can be identified in advance, and that might + # allow fewer flatten calls or faster short-circuiting to make a copy. + idx = 0 + a_ = a + for length in shape: + # Handles tail unsqueezes + if idx >= a_.ndim: + assert length == 1 + last_dim = a_.ndim - 1 + # NOTE: using split_dim instead of unsqueeze may seem silly here, + # but it's necessary to get the strides correct + a_ = prims.split_dim(a_, last_dim, a_.shape[last_dim]) + idx = idx + 1 + continue + + # Skips dimensions that are already the correct length + if length == a_.shape[idx]: + idx = idx + 1 + continue + + # Gathers enough original dimensions such that this new dimension can be created + # Note that this accumulation will terminate because we've verified a and the shape + # specify the same number of elements above + accum = a_.shape[idx] + end = idx + while accum % length != 0: + end = end + 1 + accum = accum * a_.shape[end] + if end != idx: + # NOTE: in this case multiple dimensions must be flatten to create the desired dimension + # This flattening is why reshape sometimes creates a copy -- because flattening + # may return a view of a copy + + # Checks if collapse can be a view and short-circuits to copying reshape if it can't + new_shape, new_strides = prims._collapse_view_helper(a_, idx, end + 1) + if new_shape is None: + if allow_copy: + return prims.reshape(a, shape) + + msg = "Cannot view a tensor with shape {0} and strides {1} as a tensor with shape {2}!".format( + a.shape, a.stride(), shape + ) + raise ValueError(msg) + + a_ = flatten(a_, idx, end) + + # Splits the (possibly flattened) dimension to create the desired dim length + if accum != length: + a_ = prims.split_dim(a_, idx, length) + + idx = idx + 1 + + # Squeezes tail + while idx < a_.ndim: + assert a_.shape[idx] == 1 + a_ = squeeze(a_, idx) + + return a_ + + +def reshape(a: TensorLikeType, shape: ShapeType) -> TensorLikeType: + return _reshape_view_helper(a, shape, allow_copy=True) + + +# update to cat then view instead of unsqueezing each tensor 
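+# Editor's illustrative note: stacking two (2, 3) tensors along dim=0 unsqueezes each
+# to (1, 2, 3) and concatenates them into a (2, 2, 3) result.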
+@out_wrapper +def stack(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType: + tensors = tuple(unsqueeze(a, dim) for a in tensors) + return cat(tensors, dim) + + +# Note: although squeeze is documented as having the out= kwarg it doesn't +def squeeze(a: TensorLikeType, dim: Optional[int] = None) -> TensorLikeType: + if dim is not None: + dim = utils.canonicalize_dim(a.ndim, dim) + # Short-circuits if the tensor has no dimensions + if len(a.shape) == 0: + assert dim == 0 + return prims.view_of(a) + + # Note: squeeze does not modify tensors when the given dim is not a dimension of length 1 + if a.shape[dim] != 1: + return prims.view_of(a) + return prims.squeeze(a, (dim,)) + + dims = tuple(idx for idx in range(len(a.shape)) if a.shape[idx] == 1) + return prims.squeeze(a, dims) + + +# Note: does not work with TensorMetas because of data-dependent control-flow +def tensor_split( + a: TensorLikeType, + indices_or_sections: Union[Tensor, DimsType], + dim: int = 0, +) -> Tuple[TensorLikeType, ...]: + _dim = utils.canonicalize_dim(a.ndim, dim) + if a.ndim == 0: + msg = "tensor_split: received a rank zero tensor, but expected a tensor of rank one or greater!" + raise ValueError(msg) + + # If indices_or_sections is a tensor, it must be a CPU Long tensor + if isinstance(indices_or_sections, TensorLike): + if indices_or_sections.device != torch.device("cpu"): + msg = "tensor_split: if indices_or_sections is a tensor it must be on the CPU, but received one on {0}".format( + indices_or_sections.device + ) + raise ValueError(msg) + if indices_or_sections.dtype != torch.long: + msg = "tensor_split: if indices_or_sections is a tensor it must have long dtype, " + " but received one with dtype {0}".format(indices_or_sections.dtype) + raise ValueError(msg) + + # Case 0 -- indices_or_sections is an integer or a scalar tensor n and a is split along dim into n parts of equal-ish length + if isinstance(indices_or_sections, int) or ( + isinstance(indices_or_sections, TensorLike) and indices_or_sections.ndim == 0 + ): + sections: int = ( + indices_or_sections # type: ignore[assignment] + if isinstance(indices_or_sections, Number) + else indices_or_sections.item() + ) + + if sections <= 0: + msg = "tensor_split: number of sections must be greater than 0, but was {0}".format( + sections + ) + raise ValueError(msg) + + splits = [] + dim_size = a.shape[_dim] + min_split_size = math.floor(dim_size / sections) + num_splits_one_extra = dim_size % sections + start_idx = 0 + for split_idx in range(sections): + split_size = ( + min_split_size + 1 + if (split_idx < num_splits_one_extra) + else min_split_size + ) + s = prims.slice_in_dim(a, start_idx, start_idx + split_size, axis=_dim) + splits.append(s) + start_idx = start_idx + split_size + + return tuple(splits) + # Case 1 -- indices_or_sections is a sequence of integers or a 1D tensor describing the splits + else: + indices = indices_or_sections + if isinstance(indices_or_sections, TensorLike): + if indices_or_sections.ndim != 1: + msg = "tensor_split: non-scalar indices_or_sections tensors must have only one dimension, " + "but received a tensor with {0} dimensions".format( + indices_or_sections.ndim + ) + raise ValueError(msg) + + indices = indices_or_sections.tolist() + + splits = [] + start_idx = 0 + for x in indices: + splits.append(prims.slice_in_dim(a, start_idx, x, axis=_dim)) + start_idx = x + splits.append(prims.slice_in_dim(a, start_idx, a.shape[_dim], axis=_dim)) + return tuple(splits) + + +def transpose(a: TensorLikeType, dim0: int, dim1: int) -> 
TensorLikeType: + _dim0, _dim1 = utils.canonicalize_dims(a.ndim, (dim0, dim1)) # type: ignore[misc] + + if a.ndim <= 1: + return prims.view_of(a) + + _permutation = list(range(0, a.ndim)) + _permutation[_dim0] = _dim1 + _permutation[_dim1] = _dim0 + return prims.transpose(a, _permutation) + + +# Aliases for transpose +swap_axes = transpose + + +def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType: + # Note that unsqueeze canonicalizes with rank + 1 because it allows + # a new innermost dimension to be specified + dim = utils.canonicalize_dim(a.ndim + 1, dim) + return prims.expand_dims(a, (dim,)) + + +def view(a: TensorLikeType, shape: ShapeType) -> TensorLikeType: + return _reshape_view_helper(a, shape, allow_copy=False) + + +@out_wrapper +def empty( + *shape, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = torch.get_default_dtype() if dtype is None else dtype + device = torch.device("cpu") if device is None else device + if len(shape) > 0 and isinstance(shape[0], tuple): + return prims.empty( + *shape, dtype=dtype, device=device, requires_grad=requires_grad + ) + return prims.empty(shape, dtype=dtype, device=device, requires_grad=requires_grad) + + +def empty_like( + a: TensorLikeType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + device = a.device if device is None else device + return prims.empty_like(a, dtype=dtype, device=device, requires_grad=requires_grad) + + +@out_wrapper +def full( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + dtype = torch.get_default_dtype() if dtype is None else dtype + device = torch.device("cpu") if device is None else device + return prims.full( + shape, fill_value, dtype=dtype, device=device, requires_grad=requires_grad + ) + + +def full_like( + a: TensorLikeType, + fill_value: NumberType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + device = a.device if device is None else device + return prims.full_like( + a, fill_value, dtype=dtype, device=device, requires_grad=requires_grad + ) + + +def ones_like( + a: TensorLikeType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + requires_grad: bool = False, +) -> TensorLikeType: + return full_like(a, 1, dtype=dtype, device=device, requires_grad=requires_grad) diff --git a/tools/codegen/api/__init__.py b/torch/_refs/nn/__init__.py similarity index 100% rename from tools/codegen/api/__init__.py rename to torch/_refs/nn/__init__.py diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py new file mode 100644 index 000000000000..12ac19844c95 --- /dev/null +++ b/torch/_refs/nn/functional/__init__.py @@ -0,0 +1,47 @@ +import torch + +import torch._prims.utils as utils +from torch._prims.utils import ( + TensorLikeType, + NumberType, + ELEMENTWISE_TYPE_PROMOTION_KIND, +) +import torch._refs as refs +from torch._prims.wrappers import elementwise_type_promotion_wrapper + +from typing import Optional + +__all__ = [ + "elu", +] + +# elu is implemented specially because it has an alpha argument +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def elu( + a: TensorLikeType, alpha: Optional[NumberType] = None, inplace: bool = False +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.elu + """ + + if inplace: + raise NotImplementedError + + rhs: TensorLikeType + if alpha is not None: + python_type = utils.dtype_to_type(a.dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = ( + "alpha argument of type {0} cannot be safely cast to type {1}!".format( + type(alpha), python_type + ) + ) + raise ValueError(msg) + rhs = refs.mul(alpha, refs.expm1(a)) + else: + rhs = refs.expm1(a) + + return refs.where(refs.gt(a, 0), a, rhs) diff --git a/torch/_refs/special/__init__.py b/torch/_refs/special/__init__.py new file mode 100644 index 000000000000..ff8c92cd8fa4 --- /dev/null +++ b/torch/_refs/special/__init__.py @@ -0,0 +1,23 @@ +import torch + +import torch._prims as prims +import torch._prims.utils as utils +from torch._prims.utils import TensorLikeType +from torch._prims.wrappers import out_wrapper, elementwise_type_promotion_wrapper +from torch._refs import _make_elementwise_unary_reference + +__all__ = [ + "i0e", + "i1e", +] + +i0e = _make_elementwise_unary_reference( + prims.bessel_i0e, + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=torch.ops.aten.special_i0e, +) +i1e = _make_elementwise_unary_reference( + prims.bessel_i1e, + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + aten_op=torch.ops.aten.special_i1e, +) diff --git a/torch/_tensor.py b/torch/_tensor.py index dc2f5c21624d..37383c17af28 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -18,16 +18,17 @@ import torch.utils.hooks as hooks -def _wrap_type_error_to_not_implemented(f): +def _handle_torch_function_and_wrap_type_error_to_not_implemented(f): # functools.wraps doesn't work well with methods in python 2 method_assignments = ('__name__', '__doc__') assigned = functools.WRAPPER_ASSIGNMENTS @functools.wraps(f, assigned=assigned) def wrapped(*args, **kwargs): - if has_torch_function(args): - return handle_torch_function(wrapped, args, *args, **kwargs) try: + # See https://github.com/pytorch/pytorch/issues/75462 + if has_torch_function(args): + return handle_torch_function(wrapped, args, *args, **kwargs) return f(*args, **kwargs) except TypeError: return NotImplemented @@ -46,7 +47,9 @@ def _rebuild_from_type_v2(func, new_type, args, state): if new_type is Tensor: return func(*args) - ret = func(*args).as_subclass(new_type) + ret = func(*args) + if type(ret) is not new_type: + ret = ret.as_subclass(new_type) # Tensor does define __setstate__ even though it doesn't define # __getstate__. 
So only use __setstate__ if it is NOT the one defined # on Tensor @@ -92,8 +95,17 @@ def __deepcopy__(self, memo): # does accurate alias tracking; however, the code below # doesn't work because of # https://github.com/pytorch/pytorch/issues/47442 - if self.is_sparse or self.device.type in ['xla', 'mlc', 'ort', 'meta', 'hpu']: + # Update the test in test_serialization if you remove 'meta' from here + if self.is_sparse or self.device.type in ['lazy', 'xla', 'mps', 'ort', 'meta', 'hpu'] or \ + (type(self) is not Tensor and self.data_ptr() == 0): new_tensor = self.clone() + if type(new_tensor) is not type(self): + raise RuntimeError("The default implementation of __deepcopy__() for wrapper subclasses " + "only works for subclass types that implement clone() and for which " + "cloning returns another instance of the same subclass. You should either " + "properly implement clone() for your subclass or override __deepcopy__() " + "if it is intended behavior for clone() to return an instance of a " + "different type.") else: new_storage = self.storage().__deepcopy__(memo) if self.is_quantized: @@ -109,9 +121,9 @@ def __deepcopy__(self, memo): else: raise RuntimeError(f"Unsupported qscheme {self.qscheme()} in deepcopy") # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage new_tensor = torch._utils._rebuild_qtensor( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=new_storage._untyped(), dtype=self.dtype), self.storage_offset(), @@ -120,19 +132,34 @@ def __deepcopy__(self, memo): quantizer_params, self.requires_grad, self._backward_hooks) + if type(new_tensor) is not type(self): + raise RuntimeError("The default implementation of __deepcopy__() for quantized tensors " + "expects the tensor returned by torch._utils._rebuild_qtensor() to " + "match the type of the instance being copied. If you encounter this, " + "please open an issue on PyTorch's GitHub.") else: new_tensor = self.new_empty([]) + if type(new_tensor) is not type(self): + raise RuntimeError("The default implementation of __deepcopy__() for non-wrapper subclasses " + "only works for subclass types that implement new_empty() and for which " + "that function returns another instance of the same subclass. You should " + "either properly implement new_empty() for your subclass or override " + "__deepcopy__() if it is intended behavior for new_empty() to return " + "an instance of a different type.") new_tensor.set_(new_storage, self.storage_offset(), self.size(), self.stride()) if self.is_conj(): new_tensor = new_tensor.conj_physical() if self.is_neg(): new_tensor = new_tensor.neg() - new_tensor.requires_grad = self.requires_grad + if self.requires_grad: + new_tensor.requires_grad_() if self.grad is not None: new_tensor.grad = self.grad.__deepcopy__(memo) if not type(self) is Tensor: - new_tensor = new_tensor.as_subclass(type(self)) # type: ignore[arg-type] + if type(new_tensor) is not type(self): + raise RuntimeError("Type of deepcopy result does not match the type of the source tensor. 
" + "If you encounter this, please open an issue on PyTorch's GitHub.") # Plain Tensors don't have slots slots_to_save = copyreg._slotnames(self.__class__) # type: ignore[attr-defined] @@ -175,21 +202,14 @@ def storage(self): if has_torch_function_unary(self): return handle_torch_function(Tensor.storage, (self,), self) - if self.dtype not in torch.storage._dtype_to_storage_type_map(): - raise RuntimeError(f'unsupported Storage type: {self.dtype}') - - storage = self._storage() - storage_name = torch.storage._dtype_to_storage_type_map()[self.dtype] - storage_class = eval(type(storage).__module__ + '.' + storage_name) - storage = storage_class(wrap_storage=storage) - return storage + return torch._TypedStorage(wrap_storage=self._storage(), dtype=self.dtype) def _reduce_ex_internal(self, proto): check_serializing_named_tensor(self) # See Note [Don't serialize hooks] torch.utils.hooks.warn_if_has_hooks(self) backward_hooks: Dict[Any, Any] = OrderedDict() - # Note: Numpy array is chosen to be the rebuild component for XLA, ORT, MLC Tensors. + # Note: Numpy array is chosen to be the rebuild component for XLA, ORT Tensors. # We considered a few options: # 1. CPU tensor can't be used here. # Otherwise in torch.load CPU storage is reconstructed with randomly @@ -199,7 +219,7 @@ def _reduce_ex_internal(self, proto): # 2. Python list is not a good fit due to performance reason. # `tolist()` converts every single element in the tensor into python objects # and serialize them one by one. - if self.device.type in ['xla', 'ort', 'mlc']: + if self.device.type in ['xla', 'ort', 'mps', 'hpu']: return (torch._utils._rebuild_device_tensor_from_numpy, (self.cpu().numpy(), self.dtype, str(self.device), @@ -232,9 +252,9 @@ def _reduce_ex_internal(self, proto): else: raise RuntimeError(f"Serialization is not supported for tensors of type {self.qscheme()}") # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage args_qtensor = ( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=self.storage()._untyped(), dtype=self.dtype), self.storage_offset(), @@ -265,11 +285,23 @@ def _reduce_ex_internal(self, proto): raise NotImplementedError( 'sparse csr tensor __reduce_ex__ for layout `%s`' % (self.layout)) return (torch._utils._rebuild_sparse_csr_tensor, args_sparse_csr) + elif self.data_ptr() == 0 and type(self) is not torch.Tensor: + arg_wrapper_subclass = ( + type(self), + self.dtype, + tuple(self.size()), + self.stride(), + self.storage_offset(), + self.layout, + self.device, + self.requires_grad + ) + return (torch._utils._rebuild_wrapper_subclass, arg_wrapper_subclass) else: # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage args = ( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=self.storage()._untyped(), dtype=self.dtype), self.storage_offset(), @@ -298,11 +330,12 @@ def __setstate__(self, state): # See Note [Don't serialize hooks] self.requires_grad, _, self._backward_hooks = state - def __repr__(self): + def __repr__(self, *, tensor_contents=None): if has_torch_function_unary(self): - return handle_torch_function(Tensor.__repr__, (self,), self) + return handle_torch_function(Tensor.__repr__, (self,), self, + tensor_contents=tensor_contents) # All strings are unicode in Python 3. 
- return torch._tensor_str._str(self) + return torch._tensor_str._str(self, tensor_contents=tensor_contents) def backward(self, gradient=None, retain_graph=None, create_graph=False, inputs=None): r"""Computes the gradient of current tensor w.r.t. graph leaves. @@ -497,6 +530,10 @@ def norm(self, p="fro", dim=None, keepdim=False, dtype=None): return handle_torch_function(Tensor.norm, (self,), self, p=p, dim=dim, keepdim=keepdim, dtype=dtype) return torch.norm(self, p, dim, keepdim, dtype=dtype) + def solve(self, other): + from ._linalg_utils import solve + return solve(self, other) + def lu(self, pivot=True, get_infos=False): r"""See :func:`torch.lu`""" # If get_infos is True, then we don't need to check for errors and vice versa @@ -597,47 +634,37 @@ def unique_consecutive(self, return_inverse=False, return_counts=False, dim=None ) return torch.unique_consecutive(self, return_inverse=return_inverse, return_counts=return_counts, dim=dim) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rsub__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rsub__, (self, other), self, other) return _C._VariableFunctions.rsub(self, other) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rdiv__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rdiv__, (self, other), self, other) return self.reciprocal() * other __rtruediv__ = __rdiv__ __itruediv__ = _C._TensorBase.__idiv__ - __pow__ = _wrap_type_error_to_not_implemented(_C._TensorBase.pow) + __pow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented(_C._TensorBase.pow) + __ipow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented(_C._TensorBase.pow_) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rmod__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rmod__, (self, other), self, other) return torch.remainder(other, self) def __format__(self, format_spec): if has_torch_function_unary(self): return handle_torch_function(Tensor.__format__, (self,), self, format_spec) - if self.dim() == 0: + if self.dim() == 0 and not self.is_meta: return self.item().__format__(format_spec) return object.__format__(self, format_spec) - def __ipow__(self, other): # type: ignore[misc] - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__ipow__, (self, other), self, other) - return NotImplemented - - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rpow__(self, other): dtype = torch.result_type(other, self) return torch.tensor(other, dtype=dtype, device=self.device) ** self - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __floordiv__(self, other): warnings.warn("__floordiv__ is deprecated, and its behavior will change in a future version of pytorch. " "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). 
" @@ -646,7 +673,7 @@ def __floordiv__(self, other): "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3) return torch.div(self, other, rounding_mode='trunc') - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rfloordiv__(self, other): warnings.warn("__rfloordiv__ is deprecated, and its behavior will change in a future version of pytorch. " "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). " @@ -655,18 +682,16 @@ def __rfloordiv__(self, other): "or for actual floor division, use torch.div(a, b, rounding_mode='floor').", stacklevel=3) return torch.div(other, self, rounding_mode='trunc') - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rlshift__(self, other): return torch.bitwise_left_shift(other, self) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rrshift__(self, other): return torch.bitwise_right_shift(other, self) - @_wrap_type_error_to_not_implemented + @_handle_torch_function_and_wrap_type_error_to_not_implemented def __rmatmul__(self, other): - if has_torch_function_variadic(self, other): - return handle_torch_function(Tensor.__rmatmul__, (self, other), self, other) return torch.matmul(other, self) __pos__ = _C._TensorBase.positive @@ -830,10 +855,10 @@ def storage_type(self): Returns the type of the underlying storage. """ - # NB: this returns old fashioned TypedStorage, e.g., FloatStorage, as it - # would be pretty pointless otherwise (it would always return - # UntypedStorage) - return type(self.storage()) + if has_torch_function_unary(self): + return handle_torch_function(Tensor.storage_type, (self,), self) + + return self.storage()._get_legacy_storage_class() def refine_names(self, *names): r"""Refines the dimension names of :attr:`self` according to :attr:`names`. @@ -1031,53 +1056,7 @@ def to_sparse_coo(self): 25 """ - if self.is_sparse: - return self - if self.is_sparse_csr: - crow_indices = self.crow_indices() - col_indices = self.col_indices() - indices = torch._convert_indices_from_csr_to_coo(crow_indices, col_indices, - out_int32=crow_indices.dtype == torch.int32) - return torch.sparse_coo_tensor(indices, - self.values(), - size=self.shape, - dtype=self.dtype, - device=self.device) - else: - return self.to_sparse() - - def to_sparse_csr(self): - """ Convert a tensor to compressed row storage format. Only works with 2D tensors. 
- - Examples:: - - >>> dense = torch.randn(5, 5) - >>> sparse = dense.to_sparse_csr() - >>> sparse._nnz() - 25 - - """ - shape = self.size() - fill_value = 0 - if len(shape) != 2: - raise RuntimeError("Only 2D tensors can be converted to the CSR format but got shape: ", shape) - - if self.is_sparse: - coalesced_self = self.coalesce() - row_indices = coalesced_self.indices()[0] - device = coalesced_self.values().device - crow_indices = torch._convert_indices_from_coo_to_csr( - row_indices, self.shape[0], out_int32=row_indices.dtype == torch.int32) - return torch.sparse_csr_tensor(crow_indices, - coalesced_self.indices()[1].contiguous(), - coalesced_self.values(), - size=coalesced_self.shape, - dtype=coalesced_self.dtype, - device=device) - elif self.is_sparse_csr: - return self - else: - return self.to_sparse().to_sparse_csr() + return self.to_sparse() def _update_names(self, names, inplace): if has_torch_function_unary(self): @@ -1145,6 +1124,8 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): else: return _convert(ret, cls) + __torch_dispatch__ = _C._disabled_torch_dispatch_impl + def __dlpack__(self, stream=None): """ Creates a DLpack `capsule https://data-apis.org/array-api/latest/design_topics/data_interchange.html#data-interchange`_ diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 66ffffec87b5..3fb5f706e1a4 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1060,6 +1060,24 @@ def add_docstr_all(method, docstr): {memory_format} """.format(**common_args)) +add_docstr_all('ipu', + r""" +ipu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor + +Returns a copy of this object in IPU memory. + +If this object is already in IPU memory and on the correct device, +then no copy is performed and the original object is returned. + +Args: + device (:class:`torch.device`): The destination IPU device. + Defaults to the current IPU device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. + Otherwise, the argument has no effect. Default: ``False``. + {memory_format} +""".format(**common_args)) + add_docstr_all('xpu', r""" xpu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor @@ -1798,6 +1816,12 @@ def add_docstr_all(method, docstr): length of :attr:`index` (which must be a vector), and all other dimensions must match :attr:`self`, or an error will be raised. +For a 3-D tensor the output is given as:: + + self[index[i], :, :] += alpha * src[i, :, :] # if dim == 0 + self[:, index[i], :] += alpha * src[:, i, :] # if dim == 1 + self[:, :, index[i]] += alpha * src[:, :, i] # if dim == 2 + Note: {forward_reproducibility_note} @@ -1912,6 +1936,73 @@ def add_docstr_all(method, docstr): Out-place version of :meth:`~Tensor.index_put_`. """) +add_docstr_all('index_reduce_', + r""" +index_reduce_(dim, index, source, reduce, *, include_self=True) -> Tensor + +Accumulate the elements of ``source`` into the :attr:`self` +tensor by accumulating to the indices in the order given in :attr:`index` +using the reduction given by the ``reduce`` argument. For example, if ``dim == 0``, +``index[i] == j``, ``reduce == prod`` and ``include_self == True`` then the ``i``\ th +row of ``source`` is multiplied by the ``j``\ th row of :attr:`self`. 
If +:obj:`include_self="True"`, the values in the :attr:`self` tensor are included +in the reduction, otherwise, rows in the :attr:`self` tensor that are accumulated +to are treated as if they were filled with the reduction identites. + +The :attr:`dim`\ th dimension of ``source`` must have the same size as the +length of :attr:`index` (which must be a vector), and all other dimensions must +match :attr:`self`, or an error will be raised. + +For a 3-D tensor with :obj:`reduce="prod"` and :obj:`include_self=True` the +output is given as:: + + self[index[i], :, :] *= src[i, :, :] # if dim == 0 + self[:, index[i], :] *= src[:, i, :] # if dim == 1 + self[:, :, index[i]] *= src[:, :, i] # if dim == 2 + +Note: + {forward_reproducibility_note} + +.. note:: + + This function only supports floating point tensors. + +.. warning:: + + This function is in beta and may change in the near future. + +Args: + dim (int): dimension along which to index + index (Tensor): indices of ``source`` to select from, + should have dtype either `torch.int64` or `torch.int32` + source (FloatTensor): the tensor containing values to accumulate + reduce (str): the reduction operation to apply + (:obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + +Keyword args: + include_self (bool): whether the elements from the ``self`` tensor are + included in the reduction + +Example:: + + >>> x = torch.empty(5, 3).fill_(2) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2, 0]) + >>> x.index_reduce_(0, index, t, 'prod') + tensor([[20., 44., 72.], + [ 2., 2., 2.], + [14., 16., 18.], + [ 2., 2., 2.], + [ 8., 10., 12.]]) + >>> x = torch.empty(5, 3).fill_(2) + >>> x.index_reduce_(0, index, t, 'prod', include_self=False) + tensor([[10., 22., 36.], + [ 2., 2., 2.], + [ 7., 8., 9.], + [ 2., 2., 2.], + [ 4., 5., 6.]]) +""".format(**reproducibility_notes)) + add_docstr_all('index_select', r""" index_select(dim, index) -> Tensor @@ -3374,6 +3465,69 @@ def callable(a, b) -> number """.format(**reproducibility_notes)) +add_docstr_all('scatter_reduce_', r""" +scatter_reduce_(dim, index, src, reduce, *, include_self=True) -> Tensor + +Reduces all values from the :attr:`src` tensor to the indices specified in +the :attr:`index` tensor in the :attr:`self` tensor using the applied reduction +defined via the :attr:`reduce` argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, +:obj:`"amax"`, :obj:`"amin"`). For each value in :attr:`src`, it is reduced to an +index in :attr:`self` which is specified by its index in :attr:`src` for +``dimension != dim`` and by the corresponding value in :attr:`index` for +``dimension = dim``. If :obj:`include_self="True"`, the values in the :attr:`self` +tensor are included in the reduction. + +:attr:`self`, :attr:`index` and :attr:`src` should all have +the same number of dimensions. It is also required that +``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that +``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. +Note that ``index`` and ``src`` do not broadcast. + +For a 3-D tensor with :obj:`reduce="sum"` and :obj:`include_self=True` the +output is given as:: + + self[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + +Note: + {forward_reproducibility_note} + +.. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + +.. 
warning:: + + This function is in beta and may change in the near future. + +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and reduce. + src (Tensor): the source elements to scatter and reduce + reduce (str): the reduction operation to apply for non-unique indices + (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + include_self (bool): whether elements from the :attr:`self` tensor are + included in the reduction + +Example:: + + >>> src = torch.tensor([1., 2., 3., 4., 5., 6.]) + >>> index = torch.tensor([0, 1, 0, 1, 2, 1]) + >>> input = torch.tensor([1., 2., 3., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum") + tensor([5., 14., 8., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum", include_self=False) + tensor([4., 12., 5., 4.]) + >>> input2 = torch.tensor([5., 4., 3., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax") + tensor([5., 6., 5., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax", include_self=False) + tensor([3., 6., 5., 2.]) + + +""".format(**reproducibility_notes)) + add_docstr_all('select', r""" select(dim, index) -> Tensor @@ -3540,13 +3694,6 @@ def callable(a, b) -> number """) -add_docstr_all('solve', - r""" -solve(A) -> Tensor, Tensor - -See :func:`torch.solve` -""") - add_docstr_all('sort', r""" sort(dim=-1, descending=False) -> (Tensor, LongTensor) @@ -3972,6 +4119,16 @@ def callable(a, b) -> number {memory_format} """.format(**common_args)) +add_docstr_all('chalf', + r""" +chalf(memory_format=torch.preserve_format) -> Tensor + +``self.chalf()`` is equivalent to ``self.to(torch.complex32)``. See :func:`to`. + +Args: + {memory_format} + """.format(**common_args)) + add_docstr_all('half', r""" half(memory_format=torch.preserve_format) -> Tensor @@ -4140,6 +4297,35 @@ def callable(a, b) -> number size=(3, 3), nnz=1, layout=torch.sparse_coo) """) +add_docstr_all('to_sparse_csr', + r""" +to_sparse_csr() -> Tensor +Convert a tensor to compressed row storage format. Only works with 2D tensors. + +Example:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_csr() + >>> sparse._nnz() + 25 + +""") + +add_docstr_all('to_sparse_bsr', + r""" +to_sparse_bsr(blocksize) -> Tensor +Convert a CSR tensor to a block sparse row (BSR) storage format of given blocksize. + +Example:: + + >>> dense = torch.randn(10, 10) + >>> sparse = dense.to_sparse_csr() + >>> sparse_bsr = sparse.to_sparse_bsr((5, 5)) + >>> sparse_bsr.col_indices() + tensor([0, 1, 0, 1]) + +""") + add_docstr_all('to_mkldnn', r""" to_mkldnn() -> Tensor @@ -4746,6 +4932,13 @@ def callable(a, b) -> number Out-of-place version of :meth:`torch.Tensor.scatter_add_` """) +add_docstr_all('scatter_reduce', + r""" +scatter_reduce(dim, index, src, reduce, *, include_self=True) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` +""") + add_docstr_all('masked_scatter', r""" masked_scatter(mask, tensor) -> Tensor @@ -4862,6 +5055,11 @@ def callable(a, b) -> number Is ``True`` if the Tensor is stored on the GPU, ``False`` otherwise. """) +add_docstr_all('is_ipu', + r""" +Is ``True`` if the Tensor is stored on the IPU, ``False`` otherwise. +""") + add_docstr_all('is_xpu', r""" Is ``True`` if the Tensor is stored on the XPU, ``False`` otherwise. 
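The new ``scatter_reduce_`` / ``scatter_reduce`` entries above state the reduction as an indexing formula. As an illustrative, hedged check (not part of this diff; it only assumes the out-of-place ``Tensor.scatter_reduce`` signature documented above), the ``dim == 0`` / ``reduce="sum"`` case can be compared against an explicit loop::

    >>> import torch
    >>> src = torch.tensor([[1., 2.], [3., 4.]])
    >>> index = torch.tensor([[0, 1], [0, 0]])
    >>> out = torch.zeros(2, 2).scatter_reduce(0, index, src, reduce="sum")
    >>> expected = torch.zeros(2, 2)
    >>> for i in range(2):
    ...     for j in range(2):
    ...         # documented formula: self[index[i][j]][j] += src[i][j]  (dim == 0)
    ...         expected[index[i][j]][j] += src[i][j]
    >>> torch.equal(out, expected)
    True

With ``include_self=True`` (the default) the zeros in the destination contribute only the additive identity, so both paths agree.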
diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index b0bb6e93aaee..b1c53091bf60 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -80,6 +80,9 @@ def set_printoptions( PRINT_OPTS.linewidth = linewidth PRINT_OPTS.sci_mode = sci_mode +def tensor_totype(t): + dtype = torch.float if t.is_mps else torch.double + return t.to(dtype=dtype) class _Formatter(object): def __init__(self, tensor): @@ -104,9 +107,9 @@ def __init__(self, tensor): return # Convert to double for easy calculation. HalfTensor overflows with 1e8, and there's no div() on CPU. - nonzero_finite_abs = nonzero_finite_vals.abs().double() - nonzero_finite_min = nonzero_finite_abs.min().double() - nonzero_finite_max = nonzero_finite_abs.max().double() + nonzero_finite_abs = tensor_totype(nonzero_finite_vals.abs()) + nonzero_finite_min = tensor_totype(nonzero_finite_abs.min()) + nonzero_finite_max = tensor_totype(nonzero_finite_abs.max()) for value in nonzero_finite_vals: if value != torch.ceil(value): @@ -254,6 +257,9 @@ def _tensor_str(self, indent): if self.dtype is torch.float16 or self.dtype is torch.bfloat16: self = self.float() + if self.dtype is torch.complex32: + self = self.cfloat() + if self.dtype.is_complex: # handle the conjugate bit self = self.resolve_conj() @@ -297,10 +303,19 @@ def get_summarized_data(self): else: return torch.stack([get_summarized_data(x) for x in self]) -def _str_intern(inp): - prefix = 'tensor(' +def _str_intern(inp, *, tensor_contents=None): + is_plain_tensor = type(inp) is torch.Tensor or type(inp) is torch.nn.Parameter + if inp.is_nested: + prefix = "nested_tensor(" + elif is_plain_tensor: + prefix = 'tensor(' + else: + prefix = f"{type(inp).__name__}(" indent = len(prefix) suffixes = [] + custom_contents_provided = tensor_contents is not None + if custom_contents_provided: + tensor_str = tensor_contents # This is used to extract the primal value and thus disable the forward AD # within this function. @@ -315,7 +330,8 @@ def _str_intern(inp): # In other cases, we don't have a way to set them as default yet, # and we should always print out device for them. 
if self.device.type != torch._C._get_default_device()\ - or (self.device.type == 'cuda' and torch.cuda.current_device() != self.device.index): + or (self.device.type == 'cuda' and torch.cuda.current_device() != self.device.index)\ + or (self.device.type == 'mps'): suffixes.append('device=\'' + str(self.device) + '\'') # Tensor printing performs tensor operations like slice, indexing, etc to make it in a @@ -332,40 +348,52 @@ def _str_intern(inp): suffixes.append('nnz=' + str(self._nnz())) if not has_default_dtype: suffixes.append('dtype=' + str(self.dtype)) - indices_prefix = 'indices=tensor(' - indices = self._indices().detach() - indices_str = _tensor_str(indices, indent + len(indices_prefix)) - if indices.numel() == 0: - indices_str += ', size=' + str(tuple(indices.shape)) - values_prefix = 'values=tensor(' - values = self._values().detach() - values_str = _tensor_str(values, indent + len(values_prefix)) - if values.numel() == 0: - values_str += ', size=' + str(tuple(values.shape)) - tensor_str = indices_prefix + indices_str + '),\n' + ' ' * indent + values_prefix + values_str + ')' - elif self.is_sparse_csr: + if not custom_contents_provided: + indices_prefix = 'indices=tensor(' + indices = self._indices().detach() + indices_str = _tensor_str(indices, indent + len(indices_prefix)) + if indices.numel() == 0: + indices_str += ', size=' + str(tuple(indices.shape)) + values_prefix = 'values=tensor(' + values = self._values().detach() + values_str = _tensor_str(values, indent + len(values_prefix)) + if values.numel() == 0: + values_str += ', size=' + str(tuple(values.shape)) + tensor_str = indices_prefix + indices_str + '),\n' + ' ' * indent + values_prefix + values_str + ')' + elif self.layout in {torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc}: suffixes.append('size=' + str(tuple(self.shape))) suffixes.append('nnz=' + str(self._nnz())) if not has_default_dtype: suffixes.append('dtype=' + str(self.dtype)) - crow_indices_prefix = 'crow_indices=tensor(' - crow_indices = self.crow_indices().detach() - crow_indices_str = _tensor_str(crow_indices, indent + len(crow_indices_prefix)) - if crow_indices.numel() == 0: - crow_indices_str += ', size=' + str(tuple(crow_indices.shape)) - col_indices_prefix = 'col_indices=tensor(' - col_indices = self.col_indices().detach() - col_indices_str = _tensor_str(col_indices, indent + len(col_indices_prefix)) - if col_indices.numel() == 0: - col_indices_str += ', size=' + str(tuple(col_indices.shape)) - values_prefix = 'values=tensor(' - values = self.values().detach() - values_str = _tensor_str(values, indent + len(values_prefix)) - if values.numel() == 0: - values_str += ', size=' + str(tuple(values.shape)) - tensor_str = crow_indices_prefix + crow_indices_str + '),\n' + ' ' * indent +\ - col_indices_prefix + col_indices_str + '),\n' + ' ' * indent +\ - values_prefix + values_str + ')' + if not custom_contents_provided: + compressed_indices_method, plain_indices_method = { + torch.sparse_csr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_csc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + torch.sparse_bsr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_bsc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + }[self.layout] + if self.layout in {torch.sparse_csr, torch.sparse_bsr}: + cdimname, pdimname = 'row', 'column' + else: + cdimname, pdimname = 'column', 'row' + compressed_indices_prefix = f'c{cdimname[:3]}_indices=tensor(' + compressed_indices = 
compressed_indices_method(self).detach() + compressed_indices_str = _tensor_str(compressed_indices, indent + len(compressed_indices_prefix)) + if compressed_indices.numel() == 0: + compressed_indices_str += ', size=' + str(tuple(compressed_indices.shape)) + plain_indices_prefix = f'{pdimname[:3]}_indices=tensor(' + plain_indices = plain_indices_method(self).detach() + plain_indices_str = _tensor_str(plain_indices, indent + len(plain_indices_prefix)) + if plain_indices.numel() == 0: + plain_indices_str += ', size=' + str(tuple(plain_indices.shape)) + values_prefix = 'values=tensor(' + values = self.values().detach() + values_str = _tensor_str(values, indent + len(values_prefix)) + if values.numel() == 0: + values_str += ', size=' + str(tuple(values.shape)) + tensor_str = compressed_indices_prefix + compressed_indices_str + '),\n' + ' ' * indent +\ + plain_indices_prefix + plain_indices_str + '),\n' + ' ' * indent +\ + values_prefix + values_str + ')' elif self.is_quantized: suffixes.append('size=' + str(tuple(self.shape))) if not has_default_dtype: @@ -379,7 +407,14 @@ def _str_intern(inp): suffixes.append('scale=' + str(self.q_per_channel_scales())) suffixes.append('zero_point=' + str(self.q_per_channel_zero_points())) suffixes.append('axis=' + str(self.q_per_channel_axis())) - tensor_str = _tensor_str(self.dequantize(), indent) + if not custom_contents_provided: + tensor_str = _tensor_str(self.dequantize(), indent) + elif self.is_nested: + if not custom_contents_provided: + def indented_str(s, indent): + return "\n".join(f" {line}" for line in s.split("\n")) + strs = ",\n".join(indented_str(str(t), indent + 1) for t in torch.ops.aten.unbind.int(self, 0)) + tensor_str = f"[\n{strs}\n]" else: if self.is_meta: suffixes.append('size=' + str(tuple(self.shape))) @@ -387,7 +422,8 @@ def _str_intern(inp): suffixes.append('dtype=' + str(self.dtype)) # TODO: This implies that ellipses is valid syntax for allocating # a meta tensor, which it could be, but it isn't right now - tensor_str = '...' + if not custom_contents_provided: + tensor_str = '...' else: if self.numel() == 0 and not self.is_sparse: # Explicitly print the shape if it is not (0,), to match NumPy behavior @@ -398,15 +434,17 @@ def _str_intern(inp): # should be int64, so it must be shown explicitly. if self.dtype != torch.get_default_dtype(): suffixes.append('dtype=' + str(self.dtype)) - tensor_str = '[]' + if not custom_contents_provided: + tensor_str = '[]' else: if not has_default_dtype: suffixes.append('dtype=' + str(self.dtype)) - if self.layout != torch.strided: - tensor_str = _tensor_str(self.to_dense(), indent) - else: - tensor_str = _tensor_str(self, indent) + if not custom_contents_provided: + if self.layout != torch.strided: + tensor_str = _tensor_str(self.to_dense(), indent) + else: + tensor_str = _tensor_str(self, indent) if self.layout != torch.strided: suffixes.append('layout=' + str(self.layout)) @@ -427,8 +465,17 @@ def _str_intern(inp): if tangent is not None: suffixes.append('tangent={}'.format(tangent)) - return _add_suffixes(prefix + tensor_str, suffixes, indent, force_newline=self.is_sparse) + string_repr = _add_suffixes(prefix + tensor_str, suffixes, indent, force_newline=self.is_sparse) + + # Check if this instance is flagged as a parameter and change the repr accordingly. + # Unfortunately, this function has to be aware of this detail. + # NB: This is currently skipped for plain tensor parameters to maintain BC. In the future, + # this should be done for those as well to produce a valid repr. 
+ if isinstance(self, torch.nn.Parameter) and not is_plain_tensor: + string_repr = f"Parameter({string_repr})" + + return string_repr -def _str(self): +def _str(self, *, tensor_contents=None): with torch.no_grad(): - return _str_intern(self) + return _str_intern(self, tensor_contents=tensor_contents) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 39433bda3482..620e78ad43e3 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -111,6 +111,10 @@ def merge_dicts(*dicts): "tf32_note": """This operator supports :ref:`TensorFloat32`.""" } +rocm_fp16_notes = { + "rocm_fp16_note": """On certain ROCm devices, when using float16 inputs this module will use \ +:ref:`different precision` for backward.""" +} reproducibility_notes = { "forward_reproducibility_note": """This operation may behave nondeterministically when given tensors on \ @@ -225,6 +229,12 @@ def merge_dicts(*dicts): See :meth:`~Tensor.index_add_` for function description. """) +add_docstr(torch.index_reduce, r""" +index_reduce(input, dim, index, source, reduce, *, include_self=True, out=None) -> Tensor + +See :meth:`~Tensor.index_reduce_` for function description. +""") + add_docstr(torch.add, r""" add(input, other, *, alpha=1, out=None) -> Tensor @@ -298,6 +308,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: batch1 (Tensor): the first batch of matrices to be multiplied batch2 (Tensor): the second batch of matrices to be multiplied @@ -317,7 +329,7 @@ def merge_dicts(*dicts): tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.addcdiv, r""" addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor @@ -427,6 +439,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: input (Tensor): matrix to be added mat1 (Tensor): the first matrix to be matrix multiplied @@ -445,7 +459,7 @@ def merge_dicts(*dicts): >>> torch.addmm(M, mat1, mat2) tensor([[-4.8716, 1.4671, -1.3746], [ 0.7573, -3.9555, -2.8681]]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.adjoint, r""" @@ -1031,9 +1045,6 @@ def merge_dicts(*dicts): CPU device, and not share its memory. .. seealso:: - :func:`torch.as_tensor` creates a tensor that always shares memory if the input is a - tensor or a NumPy array, copying otherwise. - :func:`torch.tensor` creates a tensor that always copies the data from the input object. :func:`torch.from_numpy` creates a tensor that always shares memory from NumPy arrays. @@ -1041,7 +1052,7 @@ def merge_dicts(*dicts): :func:`torch.frombuffer` creates a tensor that always shares memory from objects that implement the buffer protocol. - :func:`torch.utils.dlpack.from_dlpack` creates a tensor that always shares memory from + :func:`torch.from_dlpack` creates a tensor that always shares memory from DLPack capsules. 
Args: @@ -1130,6 +1141,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: input (Tensor): the tensor to be added batch1 (Tensor): the first batch of matrices to be multiplied @@ -1147,7 +1160,7 @@ def merge_dicts(*dicts): >>> batch2 = torch.randn(10, 4, 5) >>> torch.baddbmm(M, batch1, batch2).size() torch.Size([10, 3, 5]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.bernoulli, r""" @@ -1284,6 +1297,8 @@ def merge_dicts(*dicts): """ + r""" {tf32_note} +{rocm_fp16_note} + .. note:: This function does not :ref:`broadcast `. For broadcasting matrix products, see :func:`torch.matmul`. @@ -1301,7 +1316,7 @@ def merge_dicts(*dicts): >>> res = torch.bmm(input, mat2) >>> res.size() torch.Size([10, 3, 5]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.bitwise_and, r""" @@ -1374,12 +1389,14 @@ def merge_dicts(*dicts): bitwise_left_shift(input, other, *, out=None) -> Tensor Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. -The result will have the same dtype as :attr:`input`. +The input tensor must be of integral type. This operator supports +:ref:`broadcasting to a common shape ` and +:ref:`type promotion `. The operation applied is: .. math:: - \text{{out}}_i = \text{{input}}_i \times 2 ^ {{\text{{other}}_i}} + \text{{out}}_i = \text{{input}}_i << \text{{other}}_i Args: input (Tensor or Scalar): the first input tensor @@ -1399,12 +1416,14 @@ def merge_dicts(*dicts): bitwise_right_shift(input, other, *, out=None) -> Tensor Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. -The result will have the same dtype as :attr:`input`. +The input tensor must be of integral type. This operator supports +:ref:`broadcasting to a common shape ` and +:ref:`type promotion `. The operation applied is: .. math:: - \text{{out}}_i = \text{{input}}_i / 2 ^ {{\text{{other}}_i}} + \text{{out}}_i = \text{{input}}_i >> \text{{other}}_i Args: input (Tensor or Scalar): the first input tensor @@ -4320,77 +4339,6 @@ def merge_dicts(*dicts): Use :func:`torch.outer` instead. """) -add_docstr(torch.solve, - r""" -torch.solve(input, A, *, out=None) -> (Tensor, Tensor) - -This function returns the solution to the system of linear -equations represented by :math:`AX = B` and the LU factorization of -A, in order as a namedtuple `solution, LU`. - -`LU` contains `L` and `U` factors for LU factorization of `A`. - -`torch.solve(B, A)` can take in 2D inputs `B, A` or inputs that are -batches of 2D matrices. If the inputs are batches, then returns -batched outputs `solution, LU`. - -Supports real-valued and complex-valued inputs. - -.. warning:: - - :func:`torch.solve` is deprecated in favor of :func:`torch.linalg.solve` - and will be removed in a future PyTorch release. - :func:`torch.linalg.solve` has its arguments reversed and does not return the - LU factorization of the input. To get the LU factorization see :func:`torch.lu`, - which may be used with :func:`torch.lu_solve` and :func:`torch.lu_unpack`. - - ``X = torch.solve(B, A).solution`` should be replaced with - - .. code:: python - - X = torch.linalg.solve(A, B) - -.. note:: - - Irrespective of the original strides, the returned matrices - `solution` and `LU` will be transposed, i.e. with strides like - `B.contiguous().mT.stride()` and - `A.contiguous().mT.stride()` respectively. 
- -Args: - input (Tensor): input matrix :math:`B` of size :math:`(*, m, k)` , where :math:`*` - is zero or more batch dimensions. - A (Tensor): input square matrix of size :math:`(*, m, m)`, where - :math:`*` is zero or more batch dimensions. - -Keyword args: - out ((Tensor, Tensor), optional): optional output tuple. - -Example:: - - >>> A = torch.tensor([[6.80, -2.11, 5.66, 5.97, 8.23], - ... [-6.05, -3.30, 5.36, -4.44, 1.08], - ... [-0.45, 2.58, -2.70, 0.27, 9.04], - ... [8.32, 2.71, 4.35, -7.17, 2.14], - ... [-9.67, -5.14, -7.26, 6.08, -6.87]]).t() - >>> B = torch.tensor([[4.02, 6.19, -8.22, -7.57, -3.03], - ... [-1.56, 4.00, -8.67, 1.75, 2.86], - ... [9.81, -4.09, -4.57, -8.61, 8.99]]).t() - >>> X, LU = torch.solve(B, A) - >>> torch.dist(B, torch.mm(A, X)) - tensor(1.00000e-06 * - 7.0977) - - >>> # Batched solver example - >>> A = torch.randn(2, 3, 1, 4, 4) - >>> B = torch.randn(2, 3, 1, 4, 6) - >>> X, LU = torch.solve(B, A) - >>> torch.dist(B, A.matmul(X)) - tensor(1.00000e-06 * - 3.6386) - -""") - add_docstr(torch.get_default_dtype, r""" get_default_dtype() -> torch.dtype @@ -4529,6 +4477,98 @@ def merge_dicts(*dicts): (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) """.format(**common_args)) +add_docstr(torch.histogramdd, + r""" +histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + +Computes a multi-dimensional histogram of the values in a tensor. + +Interprets the elements of an input tensor whose innermost dimension has size N +as a collection of N-dimensional points. Maps each of the points into a set of +N-dimensional bins and returns the number of points (or total weight) in each bin. + +:attr:`input` must be a tensor with at least 2 dimensions. +If input has shape (M, N), each of its M rows defines a point in N-dimensional space. +If input has three or more dimensions, all but the last dimension are flattened. + +Each dimension is independently associated with its own strictly increasing sequence +of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D +tensors. Alternatively, bin edges may be constructed automatically by passing a +sequence of integers specifying the number of equal-width bins in each dimension. + +For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + +:attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + +If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences +of bin edges. Each 1D tensor should contain a strictly increasing sequence with at +least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying +the left and right edges of all bins. Every bin is exclusive of its left edge. Only +the rightmost bin is inclusive of its right edge. + +If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins +in each dimension. By default, the leftmost and rightmost bin edges in each dimension +are determined by the minimum and maximum elements of the input tensor in the +corresponding dimension. 
The :attr:`range` argument can be provided to manually +specify the leftmost and rightmost bin edges in each dimension. + +If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + +.. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + +Args: + {input} + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. +Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. +Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + +Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + +""") +# TODO: Fix via https://github.com/pytorch/pytorch/issues/75798 +torch.histogramdd.__module__ = "torch" + add_docstr(torch.hypot, r""" hypot(input, other, *, out=None) -> Tensor @@ -6686,6 +6726,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + Args: input (Tensor): the first matrix to be matrix multiplied mat2 (Tensor): the second matrix to be matrix multiplied @@ -6700,7 +6742,7 @@ def merge_dicts(*dicts): >>> torch.mm(mat1, mat2) tensor([[ 0.4851, 0.5037, -0.3633], [-0.0760, -3.6705, 2.4784]]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.hspmm, r""" @@ -6752,6 +6794,8 @@ def merge_dicts(*dicts): {tf32_note} +{rocm_fp16_note} + .. note:: The 1-dimensional dot product version of this function does not support an :attr:`out` parameter. 
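The ``histogramdd`` entry above also accepts a single int for ``bins``, a form its examples do not show. A minimal hedged sketch (output shown as expected from the bin-edge conventions described there, not copied from this diff)::

    >>> points = torch.tensor([[0., 0.], [1., 1.]])
    >>> hist, edges = torch.histogramdd(points, bins=2)
    >>> hist
    tensor([[1., 0.],
            [0., 1.]])
    >>> edges[0]
    tensor([0.0000, 0.5000, 1.0000])

Each dimension gets two equal-width bins spanning that dimension's minimum and maximum input values, so each point lands on the diagonal of the 2x2 histogram.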
@@ -6791,7 +6835,7 @@ def merge_dicts(*dicts): >>> torch.matmul(tensor1, tensor2).size() torch.Size([10, 3, 5]) -""".format(**common_args, **tf32_notes)) +""".format(**common_args, **tf32_notes, **rocm_fp16_notes)) add_docstr(torch.mode, r""" @@ -8547,6 +8591,12 @@ def merge_dicts(*dicts): Out-of-place version of :meth:`torch.Tensor.scatter_add_` """) +add_docstr(torch.scatter_reduce, r""" +scatter_reduce(input, dim, index, src, reduce, *, include_self=True) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` +""") + add_docstr(torch.select, r""" select(input, dim, index) -> Tensor @@ -8956,6 +9006,68 @@ def merge_dicts(*dicts): [-0.0881, 0.4370, 0.2275, 1.0284]]) """.format(**common_args)) +add_docstr(torch.sparse_compressed_tensor, + r""" +sparse_compressed_tensor(compressed_indices, plain_indices, values, size=None, + *, dtype=None, layout=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in Compressed Sparse format - CSR, +CSC, BSR, or BSC - ` with specified values at the +given :attr:`compressed_indices` and :attr:`plain_indices`. Sparse +matrix multiplication operations in Compressed Sparse format are +typically faster than that for sparse tensors in COO format. Make you +have a look at :ref:`the note on the data type of the indices +`. + +Args: + compressed_indices (array_like): One-dimensional array of size + size[cdim] + 1 where cdim is 0 or 1 depending on the layout. + The last element is the number of non-zeros. This tensor + encodes the index in values and plain_indices depending on + where the given compressed dimension (row or column) + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + compressed dimension. + plain_indices (array_like): Plain dimension (column or row) + co-ordinates of each element in values. Strictly one + dimensional tensor with the same length as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. For block + sparse formats, the dimensionality of values must be two plus + the dimensionality of plain_indices. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + layout (:class:`torch.layout`, required): the desired layout of + returned tensor: :attr:`torch.sparse_csr`, + :attr:`torch.sparse_csc`, :attr:`torch.sparse_bsr`, or + :attr:`torch.sparse_bsc`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + +Example:: + >>> compressed_indices = [0, 2, 4] + >>> plain_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_compressed_tensor(torch.tensor(compressed_indices, dtype=torch.int64), + ... torch.tensor(plain_indices, dtype=torch.int64), + ... 
torch.tensor(values), dtype=torch.double, layout=torch.sparse_csr) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +""".format(**factory_common_args)) + add_docstr(torch.sparse_csr_tensor, r""" sparse_csr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor @@ -8966,27 +9078,34 @@ def merge_dicts(*dicts): at :ref:`the note on the data type of the indices `. Args: - crow_indices (array_like): One-dimensional array of size size[0] + 1. The last element - is the number of non-zeros. This tensor encodes the index in values and col_indices - depending on where the given row starts. Each successive number in the tensor - subtracted by the number before it denotes the number of elements in a given row. - col_indices (array_like): Column co-ordinates of each element in values. Strictly one - dimensional tensor with the same length as values. - values (array_list): Initial values for the tensor. Can be a list, tuple, NumPy ``ndarray``, scalar, - and other types. - size (list, tuple, :class:`torch.Size`, optional): Size of the sparse tensor. If not provided, the - size will be inferred as the minimum size big enough to hold all non-zero elements. - -Keyword args: - dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. - Default: if None, infers data type from :attr:`values`. - device (:class:`torch.device`, optional): the desired device of returned tensor. - Default: if None, uses the current device for the default tensor type - (see :func:`torch.set_default_tensor_type`). :attr:`device` will be the CPU - for CPU tensor types and the current CUDA device for CUDA tensor types. + crow_indices (array_like): One-dimensional array of size size[0] + 1. + The last element is the number of non-zeros. This tensor + encodes the index in values and col_indices depending on where + the given row starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements in a given row. + col_indices (array_like): Column co-ordinates of each element in + values. Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. 
{requires_grad} -Example :: +Example:: >>> crow_indices = [0, 2, 4] >>> col_indices = [0, 1, 0, 1] >>> values = [1, 2, 3, 4] @@ -8999,6 +9118,173 @@ def merge_dicts(*dicts): dtype=torch.float64, layout=torch.sparse_csr) """.format(**factory_common_args)) +add_docstr(torch.sparse_csc_tensor, + r""" +sparse_csc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in CSC (Compressed Sparse Column) +` with specified values at the given +:attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix +multiplication operations in CSC format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +Args: + ccol_indices (array_like): One-dimensional array of size size[1] + 1. + The last element is the number of non-zeros. This tensor + encodes the index in values and row_indices depending on where + the given column starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements in a given column. + row_indices (array_like): Row co-ordinates of each element in + values. Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + +Example:: + >>> ccol_indices = [0, 2, 4] + >>> row_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +""".format(**factory_common_args)) + +add_docstr(torch.sparse_bsr_tensor, + r""" +sparse_bsr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in BSR (Block Compressed Sparse Row)) +` with specified 2-dimensional blocks at the given +:attr:`crow_indices` and :attr:`col_indices`. Sparse matrix +multiplication operations in BSR format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +Args: + crow_indices (array_like): One-dimensional array of size size[0] + + 1. The last element is the number of non-zeros. This tensor + encodes the index in values and col_indices depending on where + the given row starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + blocks in a given row. + col_indices (array_like): Column co-ordinates of each block in + values. 
Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. The + dimensionality of values must be two plus the dimensionality + of col_indices. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero blocks. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + +Example:: + >>> crow_indices = [0, 1, 2] + >>> col_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 1, 2]), + col_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsr) +""".format(**factory_common_args)) + +add_docstr(torch.sparse_bsc_tensor, + r""" +sparse_bsc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor + +Constructs a :ref:`sparse tensor in BSC (Block Compressed Sparse +Column)) ` with specified 2-dimensional blocks at the +given :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix +multiplication operations in BSC format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +Args: + ccol_indices (array_like): One-dimensional array of size size[1] + + 1. The last element is the number of non-zeros. This tensor + encodes the index in values and row_indices depending on where + the given column starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements in a given column. + row_indices (array_like): Row co-ordinates of each element in + values. Strictly one dimensional tensor with the same length + as values. + values (array_list): Initial blocks for the tensor. Can be a list, + tuple, NumPy ``ndarray``, and other types. The dimensionality + of values must be two plus the dimensionality of row_indices. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor. If not provided, the size will be inferred as + the minimum size big enough to hold all non-zero blocks. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_tensor_type`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. 
+ {requires_grad} + +Example:: + >>> ccol_indices = [0, 1, 2] + >>> row_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 1, 2]), + row_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsc) +""".format(**factory_common_args)) + add_docstr(torch.sparse_coo_tensor, r""" sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor @@ -9747,10 +10033,10 @@ def merge_dicts(*dicts): r""" roll(input, shifts, dims=None) -> Tensor -Roll the tensor along the given dimension(s). Elements that are shifted beyond the -last position are re-introduced at the first position. If a dimension is not -specified, the tensor will be flattened before rolling and then restored -to the original shape. +Roll the tensor :attr:`input` along the given dimension(s). Elements that are +shifted beyond the last position are re-introduced at the first position. If +:attr:`dims` is `None`, the tensor will be flattened before rolling and then +restored to the original shape. Args: {input} @@ -9768,6 +10054,11 @@ def merge_dicts(*dicts): [3, 4], [5, 6], [7, 8]]) + >>> torch.roll(x, 1) + tensor([[8, 1], + [2, 3], + [4, 5], + [6, 7]]) >>> torch.roll(x, 1, 0) tensor([[7, 8], [1, 2], @@ -10760,12 +11051,6 @@ def merge_dicts(*dicts): .. note:: The tensors :attr:`condition`, :attr:`x`, :attr:`y` must be :ref:`broadcastable `. -.. note:: - Currently valid scalar and tensor combination are - 1. Scalar of floating dtype and torch.double - 2. Scalar of integral dtype and torch.long - 3. Scalar of complex dtype and torch.complex128 - Arguments: condition (BoolTensor): When True (nonzero), yield x, otherwise yield y x (Tensor or Scalar): value (if :attr:`x` is a scalar) or values selected at indices @@ -11955,3 +12240,141 @@ def merge_dicts(*dicts): tensor([[2, 3, 5], [2, 3, 5]]) """) + +add_docstr(torch.view_as_real_copy, + r""" +Performs the same operation as :func:`torch.view_as_real`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.view_as_complex_copy, + r""" +Performs the same operation as :func:`torch.view_as_complex`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.as_strided_copy, + r""" +Performs the same operation as :func:`torch.as_strided`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.diagonal_copy, + r""" +Performs the same operation as :func:`torch.diagonal`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.expand_copy, + r""" +Performs the same operation as :func:`torch.expand`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.permute_copy, + r""" +Performs the same operation as :func:`torch.permute`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.select_copy, + r""" +Performs the same operation as :func:`torch.select`, but all output tensors +are freshly created instead of aliasing the input. 
+""") + +add_docstr(torch.detach_copy, + r""" +Performs the same operation as :func:`torch.detach`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.slice_copy, + r""" +Performs the same operation as :func:`torch.slice`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.split_copy, + r""" +Performs the same operation as :func:`torch.split`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.split_with_sizes_copy, + r""" +Performs the same operation as :func:`torch.split_with_sizes`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.squeeze_copy, + r""" +Performs the same operation as :func:`torch.squeeze`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.t_copy, + r""" +Performs the same operation as :func:`torch.t`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.transpose_copy, + r""" +Performs the same operation as :func:`torch.transpose`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.unsqueeze_copy, + r""" +Performs the same operation as :func:`torch.unsqueeze`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.indices_copy, + r""" +Performs the same operation as :func:`torch.indices`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.values_copy, + r""" +Performs the same operation as :func:`torch.values`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.crow_indices_copy, + r""" +Performs the same operation as :func:`torch.crow_indices`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.col_indices_copy, + r""" +Performs the same operation as :func:`torch.col_indices`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.unbind_copy, + r""" +Performs the same operation as :func:`torch.unbind`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.view_copy, + r""" +Performs the same operation as :func:`torch.view`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.unfold_copy, + r""" +Performs the same operation as :func:`torch.unfold`, but all output tensors +are freshly created instead of aliasing the input. +""") + +add_docstr(torch.alias_copy, + r""" +Performs the same operation as :func:`torch.alias`, but all output tensors +are freshly created instead of aliasing the input. 
+""") diff --git a/torch/_utils.py b/torch/_utils.py index 862727731419..e19baa0c2684 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -128,7 +128,7 @@ def _get_async_or_non_blocking(function_name, non_blocking, kwargs): # TODO: Once we decide to break serialization FC, `storage` no longer needs to -# be a TypedStorage +# be a _TypedStorage def _rebuild_tensor(storage, storage_offset, size, stride): # first construct a tensor with the correct dtype/device t = torch.tensor([], dtype=storage.dtype, device=storage._untyped().device) @@ -202,15 +202,20 @@ def _rebuild_device_tensor_from_numpy(data, dtype, device, requires_grad): # Should not be used, only here to be able to load Tensors serialized with older versions of pytorch _rebuild_xla_tensor = _rebuild_device_tensor_from_numpy -_rebuild_mlc_tensor = _rebuild_device_tensor_from_numpy def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad): return torch.empty_strided(size, stride, dtype=dtype, device='meta', requires_grad=requires_grad) +def _rebuild_wrapper_subclass(cls, dtype, size, stride, storage_offset, layout, device, requires_grad): + return torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] + cls, size, strides=stride, storage_offset=storage_offset, layout=layout, + device=device, requires_grad=requires_grad) + + # TODO: Once we decide to break serialization FC, `storage` no longer needs to -# be a TypedStorage +# be a _TypedStorage def _rebuild_qtensor(storage, storage_offset, size, stride, quantizer_params, requires_grad, backward_hooks): qscheme = quantizer_params[0] if qscheme == torch.per_tensor_affine: diff --git a/torch/amp/__init__.py b/torch/amp/__init__.py new file mode 100644 index 000000000000..e4fe09f55632 --- /dev/null +++ b/torch/amp/__init__.py @@ -0,0 +1 @@ +from .autocast_mode import autocast diff --git a/torch/amp/autocast_mode.py b/torch/amp/autocast_mode.py new file mode 100644 index 000000000000..072be3b91859 --- /dev/null +++ b/torch/amp/autocast_mode.py @@ -0,0 +1,276 @@ +import torch +import functools +import warnings + +from typing import Any, Optional +from torch.types import _dtype + +def autocast_decorator(autocast_instance, func): + @functools.wraps(func) + def decorate_autocast(*args, **kwargs): + with autocast_instance: + return func(*args, **kwargs) + decorate_autocast.__script_unsupported = '@autocast() decorator is not supported in script mode' # type: ignore[attr-defined] + return decorate_autocast + +class autocast(object): + r""" + Instances of :class:`autocast` serve as context managers or decorators that + allow regions of your script to run in mixed precision. + + In these regions, ops run in an op-specific dtype chosen by autocast + to improve performance while maintaining accuracy. + See the :ref:`Autocast Op Reference` for details. + + When entering an autocast-enabled region, Tensors may be any type. + You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting. + + :class:`autocast` should wrap only the forward pass(es) of your network, including the loss + computation(s). Backward passes under autocast are not recommended. + Backward ops run in the same type that autocast used for corresponding forward ops. + + Example for CUDA Devices:: + + # Creates model and optimizer in default precision + model = Net().cuda() + optimizer = optim.SGD(model.parameters(), ...) 
+ + for input, target in data: + optimizer.zero_grad() + + # Enables autocasting for the forward pass (model + loss) + with autocast(): + output = model(input) + loss = loss_fn(output, target) + + # Exits the context manager before backward() + loss.backward() + optimizer.step() + + See the :ref:`CUDA Automatic Mixed Precision examples` for usage (along with gradient scaling) + in more complex scenarios (e.g., gradient penalty, multiple models/losses, custom autograd functions). + + :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model:: + + class AutocastModel(nn.Module): + ... + @autocast() + def forward(self, input): + ... + + Floating-point Tensors produced in an autocast-enabled region may be ``float16``. + After returning to an autocast-disabled region, using them with floating-point + Tensors of different dtypes may cause type mismatch errors. If so, cast the Tensor(s) + produced in the autocast region back to ``float32`` (or other dtype if desired). + If a Tensor from the autocast region is already ``float32``, the cast is a no-op, + and incurs no additional overhead. + CUDA Example:: + + # Creates some tensors in default dtype (here assumed to be float32) + a_float32 = torch.rand((8, 8), device="cuda") + b_float32 = torch.rand((8, 8), device="cuda") + c_float32 = torch.rand((8, 8), device="cuda") + d_float32 = torch.rand((8, 8), device="cuda") + + with autocast(): + # torch.mm is on autocast's list of ops that should run in float16. + # Inputs are float32, but the op runs in float16 and produces float16 output. + # No manual casts are required. + e_float16 = torch.mm(a_float32, b_float32) + # Also handles mixed input types + f_float16 = torch.mm(d_float32, e_float16) + + # After exiting autocast, calls f_float16.float() to use with d_float32 + g_float32 = torch.mm(d_float32, f_float16.float()) + + CPU Training Example:: + + # Creates model and optimizer in default precision + model = Net() + optimizer = optim.SGD(model.parameters(), ...) + + for epoch in epochs: + for input, target in data: + optimizer.zero_grad() + + # Runs the forward pass with autocasting. + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + output = model(input) + loss = loss_fn(output, target) + + loss.backward() + optimizer.step() + + + CPU Inference Example:: + + # Creates model in default precision + model = Net().eval() + + with torch.autocast(device_type="cpu", dtype=torch.bfloat16): + for input in data: + # Runs the forward pass with autocasting. + output = model(input) + + CPU Inference Example with Jit Trace:: + + class TestModel(nn.Module): + def __init__(self, input_size, num_classes): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(input_size, num_classes) + def forward(self, x): + return self.fc1(x) + + input_size = 2 + num_classes = 2 + model = TestModel(input_size, num_classes).eval() + + # For now, we suggest to disable the Jit Autocast Pass, + # As the issue: https://github.com/pytorch/pytorch/issues/75956 + torch._C._jit_set_autocast_mode(False) + + with torch.cpu.amp.autocast(cache_enabled=False): + model = torch.jit.trace(model, torch.randn(1, input_size)) + model = torch.jit.freeze(model) + # Models Run + for _ in range(3): + model(torch.randn(1, input_size)) + + Type mismatch errors *in* an autocast-enabled region are a bug; if this is what you observe, + please file an issue. + + ``autocast(enabled=False)`` subregions can be nested in autocast-enabled regions. 
+ Locally disabling autocast can be useful, for example, if you want to force a subregion + to run in a particular ``dtype``. Disabling autocast gives you explicit control over + the execution type. In the subregion, inputs from the surrounding region + should be cast to ``dtype`` before use:: + + # Creates some tensors in default dtype (here assumed to be float32) + a_float32 = torch.rand((8, 8), device="cuda") + b_float32 = torch.rand((8, 8), device="cuda") + c_float32 = torch.rand((8, 8), device="cuda") + d_float32 = torch.rand((8, 8), device="cuda") + + with autocast(): + e_float16 = torch.mm(a_float32, b_float32) + with autocast(enabled=False): + # Calls e_float16.float() to ensure float32 execution + # (necessary because e_float16 was created in an autocasted region) + f_float32 = torch.mm(c_float32, e_float16.float()) + + # No manual casts are required when re-entering the autocast-enabled region. + # torch.mm again runs in float16 and produces float16 output, regardless of input types. + g_float16 = torch.mm(d_float32, f_float32) + + The autocast state is thread-local. If you want it enabled in a new thread, the context manager or decorator + must be invoked in that thread. This affects :class:`torch.nn.DataParallel` and + :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process + (see :ref:`Working with Multiple GPUs`). + + Args: + device_type(string, required): Whether to use 'cuda' or 'cpu' device + enabled(bool, optional, default=True): Whether autocasting should be enabled in the region. + dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16. + cache_enabled(bool, optional, default=True): Whether the weight cache inside autocast should be enabled. + """ + def __init__(self, device_type : str, + dtype : Optional[_dtype] = None, + enabled : bool = True, + cache_enabled : Optional[bool] = None): + if torch._jit_internal.is_scripting(): + self._enabled = enabled + self.device = device_type + self.fast_dtype = dtype + # TODO: support get_autocast_gpu/cpu_dtype + assert dtype is not None + return + self.device = device_type + if self.device == 'cuda': + self.fast_dtype = torch.get_autocast_gpu_dtype() + elif self.device == 'cpu': + self.fast_dtype = torch.get_autocast_cpu_dtype() + elif self.device == 'xpu': + self.fast_dtype = torch.xpu.get_autocast_xpu_dtype() # type: ignore[attr-defined] + else: + raise RuntimeError('User specified autocast device_type must be \'cuda\' or \'cpu\'') + self._cache_enabled = torch.is_autocast_cache_enabled() + if torch.cuda.amp.common.amp_definitely_not_available() and self.device == 'cuda': + warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling') + enabled = False + if dtype is not None: + self.fast_dtype = dtype + if cache_enabled is not None: + self._cache_enabled = cache_enabled + + if self.device == 'cpu': + supported_dtype = [torch.bfloat16] + if self.fast_dtype not in supported_dtype: + error_message = 'In CPU autocast, but the target dtype is not supported. Disabling autocast.\n' + error_message += 'CPU Autocast only supports dtype of torch.bfloat16 currently.' + warnings.warn(error_message) + enabled = False + if self.device == 'xpu': + supported_dtype = [torch.bfloat16, torch.float16] + if self.fast_dtype not in supported_dtype: + error_message = 'In XPU autocast, but the target dtype is not supported. Disabling autocast.\n' + error_message += 'XPU Autocast only supports dtype of torch.bfloat16 currently.' 
+ warnings.warn(error_message) + enabled = False + if self.device == 'cuda': + if self.fast_dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported(): + raise RuntimeError('Current CUDA Device does not support bfloat16. Please switch dtype to float16.') + self._enabled = enabled + + def __enter__(self): + if torch._jit_internal.is_scripting(): + assert self.fast_dtype is not None + return self + + self.prev_cache_enabled = torch.is_autocast_cache_enabled() + if self.device == 'cpu': + self.prev = torch.is_autocast_cpu_enabled() + self.prev_fastdtype = torch.get_autocast_cpu_dtype() + torch.set_autocast_cpu_enabled(self._enabled) + torch.set_autocast_cpu_dtype(self.fast_dtype) # type: ignore[arg-type] + torch.autocast_increment_nesting() + elif self.device == 'xpu': + self.prev = torch.xpu.is_autocast_xpu_enabled() # type: ignore[attr-defined] + self.prev_fastdtype = torch.xpu.get_autocast_xpu_dtype() # type: ignore[attr-defined] + torch.xpu.set_autocast_xpu_enabled(self._enabled) # type: ignore[attr-defined] + torch.xpu.set_autocast_xpu_dtype(self.fast_dtype) # type: ignore[attr-defined] + torch.autocast_increment_nesting() + else: + self.prev = torch.is_autocast_enabled() + self.prev_fastdtype = torch.get_autocast_gpu_dtype() + torch.set_autocast_gpu_dtype(self.fast_dtype) # type: ignore[arg-type] + torch.set_autocast_enabled(self._enabled) + torch.autocast_increment_nesting() + torch.set_autocast_cache_enabled(self._cache_enabled) + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[override] + if torch._jit_internal.is_scripting(): + return + + # Drop the cache when we exit to a nesting level that's outside any instance of autocast. + if self.device == 'cpu': + if torch.autocast_decrement_nesting() == 0: + torch.clear_autocast_cache() + torch.set_autocast_cpu_enabled(self.prev) + torch.set_autocast_cpu_dtype(self.prev_fastdtype) + elif self.device == 'xpu': + if torch.autocast_decrement_nesting() == 0: + torch.clear_autocast_cache() + torch.xpu.set_autocast_xpu_enabled(self.prev) # type: ignore[attr-defined] + torch.xpu.set_autocast_xpu_dtype(self.prev_fastdtype) # type: ignore[attr-defined] + else: + if torch.autocast_decrement_nesting() == 0: + torch.clear_autocast_cache() + torch.set_autocast_enabled(self.prev) + torch.set_autocast_gpu_dtype(self.prev_fastdtype) + torch.set_autocast_cache_enabled(self.prev_cache_enabled) + return False + + def __call__(self, func): + if torch._jit_internal.is_scripting(): + return func + return autocast_decorator(self, func) diff --git a/torch/ao/nn/sparse/quantized/linear.py b/torch/ao/nn/sparse/quantized/linear.py index dde8cd2563a8..c57122fbf411 100644 --- a/torch/ao/nn/sparse/quantized/linear.py +++ b/torch/ao/nn/sparse/quantized/linear.py @@ -169,14 +169,14 @@ def from_float(cls, mod): assert hasattr(mod, 'sparse_params'), \ ('Expecting the Linear to have `sparse_params`. Make sure you have provided arguments ' 'in the `sparsifier.squash_mask(params_to_save=("sparse_block_shape",))` method.') - sparse_block_shape = mod.sparse_params.get('sparse_block_shape', None) + sparse_block_shape = mod.sparse_params.get('sparse_block_shape', None) # type: ignore[operator, union-attr] assert isinstance(sparse_block_shape, (tuple, list)) assert len(sparse_block_shape) == 2 # TODO: Need to add options to qconfig to avoid the calibration. 
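Returning to the ``__enter__``/``__exit__`` bookkeeping in ``torch.amp.autocast`` above: the previous enabled flag, dtype and cache setting are saved on entry and restored on exit, so regions nest cleanly. A minimal sketch on CPU, assuming nothing else has enabled autocast in the current thread::

    import torch

    assert not torch.is_autocast_cpu_enabled()
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        assert torch.is_autocast_cpu_enabled()
        with torch.autocast(device_type="cpu", enabled=False):
            assert not torch.is_autocast_cpu_enabled()  # locally disabled
        assert torch.is_autocast_cpu_enabled()           # outer state restored
    assert not torch.is_autocast_cpu_enabled()           # back to the prior state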
# TODO: Add calibration for the sparsity assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' activation_post_process = mod.activation_post_process - weight_post_process = mod.qconfig.weight() + weight_post_process = mod.qconfig.weight() # type: ignore[operator, union-attr] # Assumption is that the weight is already sparsified by the # `sparsifier.convert` @@ -184,7 +184,7 @@ def from_float(cls, mod): weight_post_process(weight) dtype = weight_post_process.dtype - act_scale, act_zp = activation_post_process.calculate_qparams() + act_scale, act_zp = activation_post_process.calculate_qparams() # type: ignore[operator, union-attr] assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8' w_sc, w_zp = weight_post_process.calculate_qparams() if isinstance(w_zp, torch.Tensor): @@ -193,15 +193,15 @@ def from_float(cls, mod): assert w_zp == 0, 'Weight zero point must map to 0' qweight = _quantize_weight(weight.float(), weight_post_process) - row_block_size = mod.sparse_params['sparse_block_shape'][0] - col_block_size = mod.sparse_params['sparse_block_shape'][1] + row_block_size = mod.sparse_params['sparse_block_shape'][0] # type: ignore[index] + col_block_size = mod.sparse_params['sparse_block_shape'][1] # type: ignore[index] qlinear = cls(mod.in_features, mod.out_features, row_block_size, col_block_size, dtype=dtype) qlinear.set_weight_bias(qweight, mod.bias, - row_block_size, col_block_size) + row_block_size, col_block_size) # type: ignore[arg-type] qlinear.scale = float(act_scale) qlinear.zero_point = int(act_zp) return qlinear diff --git a/torch/ao/ns/_numeric_suite.py b/torch/ao/ns/_numeric_suite.py index 2db70b87a56a..2a54535678b2 100644 --- a/torch/ao/ns/_numeric_suite.py +++ b/torch/ao/ns/_numeric_suite.py @@ -436,6 +436,8 @@ def get_matching_activations( quantized_dict = get_logger_dict(q_module) act_dict: Dict[str, Dict] = {} for key in quantized_dict: + if len(quantized_dict[key]["tensor_val"]) == 0: + continue match_key = _find_match(sorted(float_dict, reverse=True), key, "stats") if match_key is not None: act_dict[key] = {} diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py index 2e65fbec48f4..116b46240105 100644 --- a/torch/ao/ns/_numeric_suite_fx.py +++ b/torch/ao/ns/_numeric_suite_fx.py @@ -131,6 +131,9 @@ class OutputLogger(nn.Module): stats: List[torch.Tensor] stats_rnn: List[RNNReturnType] + # Mark as impure so that calls to it will not be removed during DCE. 
+ _is_impure = True + def __init__( self, ref_node_name: str, diff --git a/torch/ao/ns/fx/graph_passes.py b/torch/ao/ns/fx/graph_passes.py index bcebadc95d5c..23e235c891db 100644 --- a/torch/ao/ns/fx/graph_passes.py +++ b/torch/ao/ns/fx/graph_passes.py @@ -12,6 +12,7 @@ get_target_type_str, get_arg_indices_of_inputs_to_log, get_node_input_qparams, + op_type_supports_shadowing, ) from .ns_types import ( @@ -220,6 +221,8 @@ def _insert_dtype_cast_after_node( """ dtype_cast_op = None dtype_cast_mod_cls = None + dtype_cast_method = None + dtype_cast_method_dtype = None dtype_cast_scale = None dtype_cast_zero_point = None node_input_type_a, _node_output_type_a = \ @@ -257,6 +260,12 @@ def _insert_dtype_cast_after_node( if node_a_input_qparams is not None: dtype_cast_op = torch.quantize_per_tensor # type: ignore[assignment] dtype_cast_scale, dtype_cast_zero_point = node_a_input_qparams + elif ( + node_input_type_a == NodeInputOrOutputType.FP16 and + node_input_type_c == NodeInputOrOutputType.FP32 + ): + dtype_cast_method = 'to' + dtype_cast_method_dtype = torch.float16 else: raise AssertionError( f"dtype cast from {node_input_type_c} {node_c.format_node()} to " + @@ -274,6 +283,10 @@ def _insert_dtype_cast_after_node( return graph_c.create_node( 'call_function', dtype_cast_op, (prev_node_c,), {}, new_dtype_cast_name) + elif dtype_cast_method: + return graph_c.create_node( + 'call_method', dtype_cast_method, + (prev_node_c, dtype_cast_method_dtype), {}, new_dtype_cast_name) else: assert dtype_cast_mod_cls dtype_cast_mod = dtype_cast_mod_cls() @@ -345,7 +358,54 @@ def _copy_node_from_a_to_c( else: raise AssertionError( - f"handling of node with op {node_a.op} is not implemented") + f"handling of node {node_a.format_node()} with op {node_a.op} is not implemented") + +def _can_insert_copy_of_subgraph_a( + subgraph_a: NSSubgraph, + gm_a: GraphModule, + num_non_param_args_node_a: int, +) -> bool: + """ + This function returns `False` if the input subgraph cannot be copied by + `_insert_copy_of_subgraph_a_after_input_node_c`. This usually means + that there is a corner case logic for which copy is not yet implemented. + """ + # populate the list of nodes we need to check + nodes = [] + cur_node = subgraph_a.end_node + while cur_node != subgraph_a.start_node: + nodes.append(cur_node) + cur_node = cur_node.args[0] # type: ignore[assignment] + nodes.append(cur_node) + nodes.reverse() + + def _can_insert(node_a_arg, gm_a): + if isinstance(node_a_arg, Node): + arg_a = return_first_non_observer_node(node_a_arg, gm_a) + if arg_a.op == 'call_method': + return arg_a.target in ('dequantize', 'to') + elif arg_a.op == 'get_attr': + return True + else: + return False + elif isinstance(node_a_arg, (list, tuple)): + for el in node_a_arg: + if not isinstance(el, Node): + return False + return True + + # For each node, check if we handle the copy behavior. This follows the + # logic in `_insert_copy_of_subgraph_a_after_input_node_c`. 
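Referring back to the ``_is_impure = True`` flag added to ``OutputLogger`` above: ``torch.fx`` dead code elimination removes ``call_module`` nodes whose results are unused unless the target module is marked impure. A minimal sketch of that mechanism; ``SideLogger``, ``KeepLeafTracer`` and ``M`` are hypothetical names, not part of the numeric suite::

    import torch
    import torch.fx

    class SideLogger(torch.nn.Module):
        _is_impure = True  # asks Graph.eliminate_dead_code() to keep calls to this module

        def forward(self, x):
            return x

    class M(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.logger = SideLogger()

        def forward(self, x):
            self.logger(x)  # result unused: would normally be dead code
            return x + 1

    class KeepLeafTracer(torch.fx.Tracer):
        def is_leaf_module(self, m, module_qualified_name):
            # keep the logger as a call_module node instead of tracing through it
            return isinstance(m, SideLogger) or super().is_leaf_module(m, module_qualified_name)

    root = M()
    gm = torch.fx.GraphModule(root, KeepLeafTracer().trace(root))
    gm.graph.eliminate_dead_code()
    assert any(n.op == "call_module" and n.target == "logger" for n in gm.graph.nodes)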
+ for node_a_arg in nodes[0].args[num_non_param_args_node_a:]: + if not _can_insert(node_a_arg, gm_a): + return False + + for node in nodes[1:]: + for node_a_arg in node.args[1:]: + if not _can_insert(node_a_arg, gm_a): + return False + + return True def _insert_copy_of_subgraph_a_after_input_node_c( input_node_c: Union[Node, List[Node]], @@ -464,7 +524,7 @@ def _insert_copy_of_node_a_after_input_node_c( arg_a = return_first_non_observer_node(node_a_arg, gm_a) node_a_arg_copy = _copy_node_from_a_to_c(arg_a, gm_a, gm_b, graph_c) new_args.append(node_a_arg_copy) - elif isinstance(node_a_arg, (int, float)): + elif isinstance(node_a_arg, (int, float, torch.dtype)): new_args.append(node_a_arg) elif isinstance(node_a_arg, (list, tuple)): for el in node_a_arg: @@ -589,6 +649,26 @@ def load_arg(a): subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \ end_node_b_to_matched_subgraph_a_and_name[node_b] + if len(node_b.args) == 0: + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', kwargs-only node not handled yet') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + + all_op_types_support_shadowing = ( + op_type_supports_shadowing(subgraph_a.start_node) and + op_type_supports_shadowing(node_b) + ) + if not all_op_types_support_shadowing: + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unsupported') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + # For both start_node and end_node verify that we know how to do # the dtype cast. If we do not, skip. node_input_type_a, node_output_type_a = \ @@ -630,6 +710,16 @@ def load_arg(a): env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) continue + num_non_param_args_node_a = \ + get_number_of_non_param_args(subgraph_a.start_node, gm_a) + if not _can_insert_copy_of_subgraph_a(subgraph_a, gm_a, num_non_param_args_node_a): + print( + f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' + + f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' + + ', unhandled logic in subgraph copy') + env_c[node_b.name] = graph_c.node_copy(node_b, load_arg) + continue + fqn_base_a = _maybe_get_fqn(subgraph_a.base_op_node, gm_a) fqn_base_b = _maybe_get_fqn(subgraph_b.base_op_node, gm_b) diff --git a/torch/ao/ns/fx/mappings.py b/torch/ao/ns/fx/mappings.py index d27c5d165ad8..fc53a24fc53c 100644 --- a/torch/ao/ns/fx/mappings.py +++ b/torch/ao/ns/fx/mappings.py @@ -13,6 +13,10 @@ import torch.nn.intrinsic as nni import torch.nn.qat as nnqat import torch.nn.qat.dynamic as nnqatd +from torch.ao.quantization.backend_config import get_native_backend_config_dict +import torch.ao.quantization.fx._lower_to_native_backend as \ + _lower_to_native_backend +import torch.ao.quantization.quantization_mappings as quantization_mappings from .ns_types import NSNodeTargetType @@ -20,73 +24,35 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: + # note: this set is modified below by items from backend_config_dict sets_of_related_ops: List[Set[NSNodeTargetType]] = [ # conv modules set([ nn.Conv1d, - nnq.Conv1d, - nnqd.Conv1d, - nniqat.ConvBn1d, - nniqat.ConvBnReLU1d, - nniq.ConvReLU1d, - nni.ConvReLU1d, ]), set([ nn.Conv2d, - nnq.Conv2d, - nnqd.Conv2d, - nnqat.Conv2d, - nniqat.ConvBn2d, - nniqat.ConvBnReLU2d, - nniqat.ConvReLU2d, - nniq.ConvReLU2d, - 
nni.ConvReLU2d, ]), set([ nn.Conv3d, - nnq.Conv3d, - nnqd.Conv3d, - nnqat.Conv3d, - nniqat.ConvBn3d, - nniqat.ConvBnReLU3d, - nniqat.ConvReLU3d, - nniq.ConvReLU3d, - nni.ConvReLU3d, ]), # conv functionals set([ F.conv1d, - toq.conv1d, - toq.conv1d_relu, ]), set([ F.conv2d, - toq.conv2d, - toq.conv2d_relu, ]), set([ F.conv3d, - toq.conv3d, - toq.conv3d_relu, ]), # linear modules set([ nn.Linear, - nnq.Linear, - nni.LinearReLU, - nniq.LinearReLU, - nniqd.LinearReLU, - nnqat.Linear, - nnqatd.Linear, - nnqd.Linear, - nniqat.LinearReLU, - nn.modules.linear.NonDynamicallyQuantizableLinear, ]), # linear functionals set([ F.linear, - toq.linear, - toq.linear_relu, ]), # average pool set([ @@ -117,26 +83,20 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: # LSTM set([ nn.LSTM, - nnqd.LSTM, ]), # add set([ torch.add, - toq.add, operator.add, # x + y - toq.add_relu, ]), # cat set([ torch.cat, - toq.cat, ]), # mul set([ torch.mul, - toq.mul, operator.mul, - toq.mul_relu, ]), # relu set([ @@ -170,121 +130,82 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: # BatchNorm set([ nn.BatchNorm2d, - nnq.BatchNorm2d, ]), set([ nn.BatchNorm3d, - nnq.BatchNorm3d, ]), # ConvTranspose set([ nn.ConvTranspose1d, - nnq.ConvTranspose1d, - nnqd.ConvTranspose1d, ]), set([ nn.ConvTranspose2d, - nnq.ConvTranspose2d, - nnqd.ConvTranspose2d, ]), set([ nn.ConvTranspose3d, - nnq.ConvTranspose3d, - nnqd.ConvTranspose3d, - ]), - set([ - nn.ConvTranspose3d, - nnq.ConvTranspose3d, ]), # ELU set([ nn.ELU, - nnq.ELU, ]), # Embedding set([ nn.Embedding, - nnq.Embedding, - nnqat.Embedding, ]), # EmbeddingBag set([ nn.EmbeddingBag, - nnq.EmbeddingBag, - nnqat.EmbeddingBag, ]), # GroupNorm set([ nn.GroupNorm, - nnq.GroupNorm, ]), # Hardswish set([ nn.Hardswish, - nnq.Hardswish, ]), # InstanceNorm set([ nn.InstanceNorm1d, - nnq.InstanceNorm1d, ]), set([ nn.InstanceNorm2d, - nnq.InstanceNorm2d, ]), set([ nn.InstanceNorm3d, - nnq.InstanceNorm3d, ]), # LayerNorm set([ nn.LayerNorm, - nnq.LayerNorm, ]), # LeakyReLU set([ nn.LeakyReLU, - nnq.LeakyReLU, ]), # ReLU6 set([ nn.ReLU6, F.relu6, - nnq.ReLU6, - ]), - # BNReLU2d - set([ - nni.BNReLU2d, - nniq.BNReLU2d, - ]), - set([ - nni.BNReLU3d, - nniq.BNReLU3d, ]), # F.elu set([ F.elu, - toq.elu, ]), # F.hardswish set([ F.hardswish, - toq.hardswish, ]), # F.instance_norm set([ F.instance_norm, - toq.instance_norm, ]), # F.layer_norm set([ F.layer_norm, - toq.layer_norm, ]), # F.leaky_relu set([ F.leaky_relu, - toq.leaky_relu, ]), # F.silu set([ @@ -376,20 +297,116 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: # dropout set([ nn.Dropout, - nnq.Dropout, ]), # F.dropout set([ F.dropout, - toq.dropout, ]), # matmul set([ torch.matmul, - toq.matmul, ]), + # Softmax + set([ + nn.Softmax, + ]), + ] + + # for each floating point op, add versions of the op added by + # backend_config_dict + backend_config_dict = get_native_backend_config_dict() + + new_connections = [ + # technical debt edge case + (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear), ] + for config in backend_config_dict['configs']: + + if 'pattern' not in config: + continue + + # format: (c, (b, a)) + pattern = config['pattern'] + first_element = pattern + # look from the end, because pattern is in reverse order + while isinstance(first_element, (list, tuple)): + first_element = first_element[-1] + + if 'fused_module' in config: + # case 1: pattern fuses a pattern of ops into an op + # example: nn.Conv1d, nn.ReLU fused into 
nni.ConvReLU1d + new_connections.append((first_element, config['fused_module'])) + + if 'qat_module' in config: + # case 2: pattern swaps a module into a QAT module + # example: nni.ConvReLU1d swapped into nniqat.ConvReLU1d + new_connections.append((first_element, config['qat_module'])) + + if 'reference_quantized_module_for_root' in config: + # case 3: reference version of floating point module, such as + # nn.Conv2d and nnqr.Conv2d + new_connections.append( + (first_element, config['reference_quantized_module_for_root']) + ) + + # + # Add reference module swaps from default lowering path + # + + for source_to_target in ( + _lower_to_native_backend.STATIC_LOWER_MODULE_MAP, + _lower_to_native_backend.DYNAMIC_LOWER_MODULE_MAP, + _lower_to_native_backend.WEIGHT_ONLY_LOWER_MODULE_MAP, + _lower_to_native_backend.SPECIAL_PATTERN_LOWER_MODULE_MAP, + ): + for source, target in source_to_target.items(): # type: ignore[attr-defined] + new_connections.append((source, target)) + + for source_to_double_target in ( + _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_MAP, + _lower_to_native_backend.DYNAMIC_LOWER_FUSED_MODULE_MAP, + ): + for source, (target1, target2) in source_to_double_target.items(): # type: ignore[attr-defined] + new_connections.append((source, target1)) + new_connections.append((source, target2)) + + # + # Add function swaps from default lowering path + # + + for source, (target1, target2) in \ + _lower_to_native_backend.STATIC_LOWER_FUNCTIONAL_MAP.items(): + new_connections.append((source, target1)) + new_connections.append((source, target2)) + + for source_to_target in ( + _lower_to_native_backend.QBIN_OP_MAPPING, + _lower_to_native_backend.QBIN_RELU_OP_MAPPING, + quantization_mappings.DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS, + ): + for source, target in source_to_target.items(): + new_connections.append((source, target)) + + # + # Add other swaps, ideally in the future this could be removed + # after the lowering code stops using these. 
+ # + for source_to_target in ( + quantization_mappings.DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, + ): + for source, target in source_to_target.items(): + new_connections.append((source, target)) + + + # add the new connections from backend_config_dict + for item1, item2 in new_connections: + for set_of_related_ops in sets_of_related_ops: + if item1 in set_of_related_ops or item2 in set_of_related_ops: + set_of_related_ops.add(item1) + set_of_related_ops.add(item2) + break + base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]] = {} counter = 0 @@ -446,10 +463,10 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: F.dropout, F.silu, F.mish, - # TODO(future PR): implement shadowing for binary ops and - # uncomment below - # operator.add, - # operator.mul, + operator.add, + torch.add, + operator.mul, + torch.mul, torch.sum, ]) @@ -512,6 +529,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: torch.squeeze, torch.stack, torch.unsqueeze, + operator.add, ]) MODS_IO_TYPE_FP32: Set[NSNodeTargetType] = set([ @@ -523,9 +541,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nn.Conv1d, nn.Conv2d, nn.Conv3d, - nnqd.Conv1d, - nnqd.Conv2d, - nnqd.Conv3d, + nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d, nnqat.Embedding, @@ -540,9 +556,6 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d, - nnqd.ConvTranspose1d, - nnqd.ConvTranspose2d, - nnqd.ConvTranspose3d, nn.ELU, nn.GroupNorm, nn.InstanceNorm1d, @@ -554,12 +567,14 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nn.ReLU6, nn.SiLU, nn.Mish, + nn.Softmax, nni.BNReLU2d, nni.BNReLU3d, nni.ConvReLU1d, nni.ConvReLU2d, nni.ConvReLU3d, nni.LinearReLU, + nni.LinearBn1d, nni.ConvBn1d, nni.ConvBn2d, nni.ConvBn3d, @@ -569,9 +584,11 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nniqat.ConvBnReLU1d, nniqat.ConvBnReLU2d, nniqat.ConvBnReLU3d, + nniqat.ConvReLU1d, nniqat.ConvReLU2d, nniqat.ConvReLU3d, nniqat.LinearReLU, + nniqat.LinearBn1d, nniqd.LinearReLU, ]) @@ -579,26 +596,23 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nnq.Linear, nnq.Conv1d, nnq.Conv2d, - nniq.ConvReLU2d, nnq.Conv3d, nnq.BatchNorm2d, nnq.BatchNorm3d, nnq.Dropout, nnq.ConvTranspose1d, nnq.ConvTranspose2d, - nnq.ConvTranspose3d, nnq.ELU, - nnq.GroupNorm, nnq.InstanceNorm1d, nnq.InstanceNorm2d, nnq.InstanceNorm3d, nnq.LayerNorm, nnq.Hardswish, nnq.LeakyReLU, - nnq.ReLU6, nnq.Embedding, nnq.EmbeddingBag, nnq.Dropout, + nnq.Softmax, nniq.BNReLU2d, nniq.BNReLU3d, nniq.ConvReLU1d, diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py index b0adb5faf95d..b8e6a0ee4dc1 100644 --- a/torch/ao/ns/fx/pattern_utils.py +++ b/torch/ao/ns/fx/pattern_utils.py @@ -8,7 +8,7 @@ from torch.ao.quantization.utils import getattr_from_fqn from .ns_types import NSNodeTargetType -from torch.ao.quantization.fx.pattern_utils import get_default_quant_patterns +from torch.ao.quantization.fx.backend_config_utils import get_native_quant_patterns from torch.ao.quantization import ( ObserverBase, FakeQuantizeBase, @@ -66,9 +66,18 @@ def get_reversed_fusions() -> List[Tuple[NSFusionType, int]]: # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d) # For fusions, we only care about patterns composed of multiple ops. # TODO(future PR): allow customizations from default patterns. 
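As a small aside on the merge step above: each ``(item1, item2)`` pair from ``new_connections`` is folded into the first existing set of related ops that already contains either element. A self-contained sketch with illustrative strings standing in for the real op types::

    sets_of_related_ops = [{"nn.Conv2d"}, {"nn.Linear"}]
    new_connections = [("nn.Conv2d", "nnq.Conv2d"), ("nn.Linear", "nnqat.Linear")]

    for item1, item2 in new_connections:
        for set_of_related_ops in sets_of_related_ops:
            if item1 in set_of_related_ops or item2 in set_of_related_ops:
                set_of_related_ops.update((item1, item2))
                break

    assert sets_of_related_ops == [
        {"nn.Conv2d", "nnq.Conv2d"},
        {"nn.Linear", "nnqat.Linear"},
    ]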
- all_quant_patterns = get_default_quant_patterns() + all_quant_patterns = get_native_quant_patterns() + default_base_op_idx = 0 for quant_pattern, _quant_handler in all_quant_patterns.items(): + # TODO: this is a temporary hack to flatten the patterns from quantization so + # that it works with the ns matcher function, maybe we should use `is_match` + # in torch.ao.quantization.fx.match_utils to match the patterns + if isinstance(quant_pattern, tuple) and len(quant_pattern) == 2 and \ + isinstance(quant_pattern[1], tuple) and len(quant_pattern[1]) == 2: + # flatten the pattern with form (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d)) + quant_pattern = (quant_pattern[0], quant_pattern[1][0], quant_pattern[1][1]) + # Only patterns of multiple ops are fusions, ignore # patterns which contain a single ops (they get matched # without caring about fusions). diff --git a/torch/ao/ns/fx/utils.py b/torch/ao/ns/fx/utils.py index 96a57c438e27..8f1f277aa8c4 100644 --- a/torch/ao/ns/fx/utils.py +++ b/torch/ao/ns/fx/utils.py @@ -60,10 +60,15 @@ def get_node_first_input_and_output_type( elif node.target in FUNS_IO_TYPE_INT8: return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8) elif node.target in FUNS_IO_TYPE_FP32_OR_INT8: - return ( - NodeInputOrOutputType.FP32_OR_INT8, - NodeInputOrOutputType.FP32_OR_INT8, + first_arg = node.args[0] + assert isinstance(first_arg, Node) + ( + _prev_node_input_type, + prev_node_output_type, + ) = get_node_first_input_and_output_type( + first_arg, gm, logger_cls, node_type_to_io_type_map ) + return (prev_node_output_type, prev_node_output_type) else: return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) @@ -71,7 +76,13 @@ def get_node_first_input_and_output_type( assert node.op == "call_module" assert isinstance(node.target, str) mod = getattr_from_fqn(gm, node.target) - if isinstance(mod, (logger_cls, ObserverBase, FakeQuantizeBase)): # type: ignore[arg-type] + is_known_fp32_or_int8_input_module = any( + isinstance(mod, target_type) for target_type in MODS_IO_TYPE_FP32_OR_INT8 # type: ignore[arg-type] + ) + if ( + isinstance(mod, (logger_cls, ObserverBase, FakeQuantizeBase)) # type: ignore[arg-type] + or is_known_fp32_or_int8_input_module + ): # A logger or observer's input and output type is the output # type of the preceding node. 
first_arg = node.args[0] @@ -89,18 +100,10 @@ def get_node_first_input_and_output_type( is_known_int8_input_module = any( isinstance(mod, target_type) for target_type in MODS_IO_TYPE_INT8 # type: ignore[arg-type] ) - is_known_fp32_or_int8_input_module = any( - isinstance(mod, target_type) for target_type in MODS_IO_TYPE_FP32_OR_INT8 # type: ignore[arg-type] - ) if is_known_fp32_input_module: return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32) elif is_known_int8_input_module: return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8) - elif is_known_fp32_or_int8_input_module: - return ( - NodeInputOrOutputType.FP32_OR_INT8, - NodeInputOrOutputType.FP32_OR_INT8, - ) else: return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) @@ -141,10 +144,15 @@ def get_node_first_input_and_output_type( return (prev_node_output_type, NodeInputOrOutputType.FP16) elif node.target in METHS_IO_TYPE_FP32_OR_INT8: - return ( - NodeInputOrOutputType.FP32_OR_INT8, - NodeInputOrOutputType.FP32_OR_INT8, + first_arg = node.args[0] + assert isinstance(first_arg, Node) + ( + _prev_node_input_type, + prev_node_output_type, + ) = get_node_first_input_and_output_type( + first_arg, gm, logger_cls, node_type_to_io_type_map ) + return (prev_node_output_type, prev_node_output_type) return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN) else: @@ -481,3 +489,10 @@ def compute_cosine_similarity(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: x = x.reshape(1, -1) y = y.reshape(1, -1) return torch.nn.functional.cosine_similarity(x, y) + +def op_type_supports_shadowing(node: Node) -> bool: + if node.op == 'call_function': + if node.target in (torch.add, torch.mul, operator.add, operator.mul, torch.cat, torch.stack): + # shadowing for ops with multiple tensor inputs is not implemented yet + return False + return True diff --git a/torch/ao/ns/fx/weight_utils.py b/torch/ao/ns/fx/weight_utils.py index 36e183efe1d8..2020593ddbfb 100644 --- a/torch/ao/ns/fx/weight_utils.py +++ b/torch/ao/ns/fx/weight_utils.py @@ -158,23 +158,27 @@ def get_op_to_type_to_weight_extraction_fn() -> Dict[str, Dict[Callable, Callabl op_to_type_to_weight_extraction_fn: Dict[str, Dict[Callable, Callable]] = { 'call_module': { - # Conv + # Conv1d nn.Conv1d: mod_weight_detach, - nn.Conv2d: mod_weight_detach, - nn.Conv3d: mod_weight_detach, nni.ConvReLU1d: mod_0_weight_detach, - nni.ConvReLU2d: mod_0_weight_detach, - nni.ConvReLU3d: mod_0_weight_detach, nnq.Conv1d: mod_weight_bias_0, + nnqat.Conv1d: mod_weight_detach, nniqat.ConvBn1d: mod_weight_detach, nniqat.ConvBnReLU1d: mod_weight_detach, + nniqat.ConvReLU1d: mod_weight_detach, nniq.ConvReLU1d: mod_weight_bias_0, + # Conv2d + nn.Conv2d: mod_weight_detach, + nni.ConvReLU2d: mod_0_weight_detach, nnq.Conv2d: mod_weight_bias_0, nnqat.Conv2d: mod_weight_detach, nniqat.ConvBn2d: mod_weight_detach, nniqat.ConvBnReLU2d: mod_weight_detach, nniqat.ConvReLU2d: mod_weight_detach, nniq.ConvReLU2d: mod_weight_bias_0, + # Conv3d + nn.Conv3d: mod_weight_detach, + nni.ConvReLU3d: mod_0_weight_detach, nnq.Conv3d: mod_weight_bias_0, nnqat.Conv3d: mod_weight_detach, nniqat.ConvBn3d: mod_weight_detach, @@ -189,6 +193,7 @@ def get_op_to_type_to_weight_extraction_fn() -> Dict[str, Dict[Callable, Callabl nnqat.Linear: mod_weight_detach, nnqd.Linear: mod_weight_bias_0, nniqat.LinearReLU: mod_weight_detach, + nniqat.LinearBn1d: mod_weight_detach, nn.modules.linear.NonDynamicallyQuantizableLinear: mod_weight_detach, # LSTM nn.LSTM: get_lstm_weight, diff --git 
a/torch/ao/quantization/_dbr/auto_trace.py b/torch/ao/quantization/_dbr/auto_trace.py index 86893a1ef4b8..c786c8628a7f 100644 --- a/torch/ao/quantization/_dbr/auto_trace.py +++ b/torch/ao/quantization/_dbr/auto_trace.py @@ -14,6 +14,8 @@ get_torch_function_hook_type, get_module_hook_type, OpQuantizeabilityType, + AutoQuantizationStateModuleDict, + get_fqn_valid_for_module_dict_key, ) from .model_utils import ( pack_weights_for_functionals, @@ -350,6 +352,8 @@ def _patched_module_call(self, *args, **kwargs): for _, child_child in child.named_modules(): leaves.add(child_child) + self._fqn_to_auto_quant_state_map = AutoQuantizationStateModuleDict() + for fqn, v in named_modules: # fqn is the global FQN, i.e. 'foo.bar.baz' @@ -366,14 +370,39 @@ def _patched_module_call(self, *args, **kwargs): if v is self: # for the top level module only, specify input # and output dtypes - v._auto_quant_state = AutoQuantizationState( + auto_quant_state = AutoQuantizationState( qconfig_dict, fqn, input_dtypes, output_dtypes) - pass else: - v._auto_quant_state = AutoQuantizationState( + auto_quant_state = AutoQuantizationState( qconfig_dict, fqn) + # The code below registers the auto_quant_state object + # of the child in the module hierarchy of the parent, + # and adds the auto_quant_state object to the child + # with a raw __setattr__, without registering it in + # the module hierarchy of the child. + # This is solving the problem of both storing extra state + # (observers) as well as not modifying the meaning of user + # code in child modules which iterates over all module + # children. + # + # This narrows down the issue of dynamically adding + # children to only affect the top level module and not + # the children. + + # On the parent, register this module in the FQN map + fqn_to_use_for_key = \ + get_fqn_valid_for_module_dict_key(fqn) + self._fqn_to_auto_quant_state_map[fqn_to_use_for_key] = \ + auto_quant_state + # On the child, manually set the attribute without + # going through the `torch.nn.Module.__setattr__` + # function, to prevent this object from appearing in + # the child's module hierarchy. + object.__setattr__( + v, '_auto_quant_state', auto_quant_state) + global_op_idx[0] = 0 output = super().__call__(*new_args, **new_kwargs) @@ -688,6 +717,6 @@ def rewrite_for_scripting(self): # checking the fix into `torch.nn.Sequential` to avoid the patch. 
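Back to the ``_auto_quant_state`` attachment strategy described above: the state is registered in the parent's ``AutoQuantizationStateModuleDict``, while the child receives it through a raw ``object.__setattr__`` so it never appears in the child's own module hierarchy. A minimal sketch of why that works, with a plain ``nn.Module`` standing in for the real quantization state::

    import torch.nn as nn

    child = nn.Linear(4, 4)
    state = nn.Module()  # stand-in for AutoQuantizationState

    # nn.Module.__setattr__ would register `state` under child._modules and make it
    # visible to named_children()/named_modules(); bypassing it stores the reference
    # in child.__dict__ only.
    object.__setattr__(child, "_auto_quant_state", state)

    assert child._auto_quant_state is state
    assert "_auto_quant_state" in child.__dict__
    assert "_auto_quant_state" not in dict(child.named_children())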
def _nn_sequential_patched_forward(cls, input): for module in cls: - if not isinstance(module, AutoQuantizationState): + if not isinstance(module, AutoQuantizationStateModuleDict): input = module(input) return input diff --git a/torch/ao/quantization/_dbr/auto_trace_rewriter.py b/torch/ao/quantization/_dbr/auto_trace_rewriter.py index 79d19f410c3c..1189dbc879c4 100644 --- a/torch/ao/quantization/_dbr/auto_trace_rewriter.py +++ b/torch/ao/quantization/_dbr/auto_trace_rewriter.py @@ -8,7 +8,10 @@ import torch.fx from .mappings import conv_ops from .quantization_state import AutoQuantizationState -from .utils import get_packable_arg_idxs +from .utils import ( + get_packable_arg_idxs, + AutoQuantizationStateModuleDict, +) class AllModuleTracer(torch.fx.Tracer): """ @@ -40,10 +43,10 @@ def _maybe_update_args_with_quants(self, args, arg_quant_infos, target): new_first_arg.append(args[0][idx]) else: # create a quant node - scale, zp = input_arg_quant_info + scale, zp, dtype = input_arg_quant_info quant = super().create_node( 'call_function', torch.quantize_per_tensor, - (args[0][idx], scale.item(), zp.item(), torch.quint8), {}, None, None) + (args[0][idx], scale.item(), zp.item(), dtype), {}, None, None) new_first_arg.append(quant) new_args = [new_first_arg, *args[1:]] elif target == torch.cat: @@ -58,10 +61,10 @@ def _maybe_update_args_with_quants(self, args, arg_quant_infos, target): new_args.append(args[idx]) else: # create a quant node - scale, zp = input_arg_quant_info + scale, zp, dtype = input_arg_quant_info quant = super().create_node( 'call_function', torch.quantize_per_tensor, - (args[idx], scale.item(), zp.item(), torch.quint8), {}, None, None) + (args[idx], scale.item(), zp.item(), dtype), {}, None, None) new_args.append(quant) args = tuple(new_args) return args @@ -207,7 +210,7 @@ def linear_rewrite_args(input, weight, bias=None): # class. 
# TODO(future): remove the hack def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tuple[Any, ...], kwargs : Dict[str, Any]) -> Any: - if isinstance(m, AutoQuantizationState): + if isinstance(m, AutoQuantizationStateModuleDict): return args[0] return super().call_module(m, forward, args, kwargs) diff --git a/torch/ao/quantization/_dbr/mappings.py b/torch/ao/quantization/_dbr/mappings.py index 1fcad0b61119..89c963f8795a 100644 --- a/torch/ao/quantization/_dbr/mappings.py +++ b/torch/ao/quantization/_dbr/mappings.py @@ -6,6 +6,7 @@ from torch.ao.quantization.quantization_mappings import ( DEFAULT_STATIC_QUANT_MODULE_MAPPINGS, DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, + DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS, ) import operator @@ -67,6 +68,10 @@ set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys()) module_types_supported_by_quantization |= \ set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.values()) +module_types_supported_by_quantization |= \ + set(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS.keys()) +module_types_supported_by_quantization |= \ + set(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS.values()) module_types_supported_by_quantization |= set([ # these are quantizeable modules which do not need swaps nn.ReLU, @@ -144,6 +149,9 @@ for a, b in DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.items(): a_related_to_b.add((a, b)) a_related_to_b.add((b, a)) +for a, b in DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS.items(): + a_related_to_b.add((a, b)) + a_related_to_b.add((b, a)) for a, b in fp32_to_int8_fun_mapping.items(): a_related_to_b.add((a, b)) a_related_to_b.add((b, a)) diff --git a/torch/ao/quantization/_dbr/model_utils.py b/torch/ao/quantization/_dbr/model_utils.py index ca668edce8b7..cd60de8a1ba4 100644 --- a/torch/ao/quantization/_dbr/model_utils.py +++ b/torch/ao/quantization/_dbr/model_utils.py @@ -118,9 +118,9 @@ def attach_scale_zp_values_to_model( if hasattr(module, '_auto_quant_state'): qstate: AutoQuantizationState = module._auto_quant_state # type: ignore[assignment] for tensor_id, observer in qstate.tensor_id_to_observer.items(): - activation_int8_quantized = \ - observer.dtype in [torch.quint8, torch.qint8] - if activation_int8_quantized: + activation_int8_or_int32_quantized = \ + observer.dtype in [torch.quint8, torch.qint8, torch.qint32] + if activation_int8_or_int32_quantized: scale, zp = observer.calculate_qparams() # tensor_id_to_observer is a ModuleDict which has to have string keys # tensor_id_to_scale_zp is a normal dict which can have int keys diff --git a/torch/ao/quantization/_dbr/module_swap_utils.py b/torch/ao/quantization/_dbr/module_swap_utils.py index 59e495ef6760..a95f8210286e 100644 --- a/torch/ao/quantization/_dbr/module_swap_utils.py +++ b/torch/ao/quantization/_dbr/module_swap_utils.py @@ -1,18 +1,23 @@ -from typing import Dict, Callable, Any +from typing import Dict, Callable, Any, Optional import torch from torch.nn.intrinsic import _FusedModule from ..utils import ( activation_is_int8_quantized, + activation_is_int32_quantized, op_is_int8_dynamically_quantized, ) from torch.ao.quantization import swap_module +from torch.ao.quantization.quantization_mappings import ( + DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS, +) def _swap_child_modules( module: torch.nn.Module, static_mappings: Dict[Callable, Any], dynamic_mappings: Dict[Callable, Any], + parent_fqn: Optional[str] = None, ) -> None: """ For each direct child of `module`, swaps it using `static_mappings` @@ -22,26 +27,52 @@ def _swap_child_modules( Recursively calls itself on 
each child. """ + qstate = getattr(module, '_auto_quant_state', None) + reassign = {} - for name, mod in module.named_children(): + for local_fqn, mod in module.named_children(): + if parent_fqn is None: + global_fqn = local_fqn + else: + global_fqn = f"{parent_fqn}.{local_fqn}" # both fused modules and observed custom modules are # swapped as one unit if not isinstance(mod, _FusedModule): - _swap_child_modules(mod, static_mappings, dynamic_mappings) + _swap_child_modules( + mod, static_mappings, dynamic_mappings, global_fqn) qconfig = getattr(mod, 'qconfig', None) if not qconfig: continue activation_int8_quantized = activation_is_int8_quantized(qconfig) op_int8_dynamically_quantized = op_is_int8_dynamically_quantized(qconfig) + activation_int32_quantized = activation_is_int32_quantized(qconfig) + + # Get the output observer from qstate and attach it to the module, + # to match the API for Eager mode module swaps + if qstate is not None: + output_obs = qstate.get_output_observer_from_fqn(global_fqn) + if output_obs is not None: + mod.activation_post_process = output_obs + if activation_int8_quantized: if not type(mod) in static_mappings: continue - reassign[name] = swap_module(mod, static_mappings, {}) + reassign[local_fqn] = swap_module(mod, static_mappings, {}) elif op_int8_dynamically_quantized: if not type(mod) in dynamic_mappings: continue - reassign[name] = swap_module(mod, dynamic_mappings, {}) + reassign[local_fqn] = swap_module(mod, dynamic_mappings, {}) + elif activation_int32_quantized: + # For now, only apply reference logic to modules quantized to + # int32. Do it automatically. + # TODO(future PR): extend this logic to more dtypes, and add + # the is_reference API flag instead of doing this automatically. + # Note: swap modules only does the swap if the mapping for this + # module exists. 
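The ``global_fqn`` bookkeeping above simply rebuilds the fully qualified name as the recursion descends; a tiny sketch with a hypothetical helper name::

    def _compose_fqn(parent_fqn, local_fqn):
        return local_fqn if parent_fqn is None else f"{parent_fqn}.{local_fqn}"

    assert _compose_fqn(None, "conv1") == "conv1"
    assert _compose_fqn("features.0", "conv1") == "features.0.conv1"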
+ reassign[local_fqn] = swap_module( + mod, DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS, {}) + # TODO(future PR): add support for other dtypes for key, value in reassign.items(): diff --git a/torch/ao/quantization/_dbr/quantization_state.py b/torch/ao/quantization/_dbr/quantization_state.py index f7f20b1ce224..db0ab0111d52 100644 --- a/torch/ao/quantization/_dbr/quantization_state.py +++ b/torch/ao/quantization/_dbr/quantization_state.py @@ -30,7 +30,6 @@ iterate_and_apply, get_op_packing_only_uses_module_attributes, get_packable_tensor_kwarg_names, - get_producer_of_seen_q_op_info, clone_detach_tensor_without_dispatch, get_input_args_quant_dequant_info, get_cur_qconfig, @@ -43,11 +42,15 @@ get_seen_q_op_info_of_end_of_fusion, ) +from torch.ao.quantization.utils import ( + activation_is_int32_quantized, +) + OpConvertInfo = Tuple[ # quantized equivalent of original op (None means keep original) Optional[Callable], - # arg_quant_infos, each element is (scale, zp) for quantized and None otherwise - List[Optional[Tuple[float, int]]], + # arg_quant_infos, each element is (scale, zp, dtype) for quantized and None otherwise + List[Optional[Tuple[float, int, torch.dtype]]], # arg_dequant_infos, each element is True if this arg needs a dequant List[bool], # packed param name, if the op has a packed param @@ -455,9 +458,11 @@ def op_convert_before_hook( quant_info = arg_quant_infos[tensor_arg_idx] dequant_info = arg_dequant_infos[tensor_arg_idx] if quant_info is not None: - scale, zp = quant_info - arg = torch.quantize_per_tensor(arg, scale, zp, torch.quint8) - elif dequant_info is True: + scale, zp, dtype = quant_info + arg = torch.quantize_per_tensor(arg, scale, zp, dtype) + if dequant_info is True: + # Note: both quant and dequant paths are taken for + # reference ops. arg = arg.dequantize() new_first_arg.append(arg) tensor_arg_idx += 1 @@ -471,9 +476,11 @@ def op_convert_before_hook( quant_info = arg_quant_infos[tensor_arg_idx] dequant_info = arg_dequant_infos[tensor_arg_idx] if quant_info is not None: - scale, zp = quant_info - arg = torch.quantize_per_tensor(arg, scale, zp, torch.quint8) - elif dequant_info is True: + scale, zp, dtype = quant_info + arg = torch.quantize_per_tensor(arg, scale, zp, dtype) + if dequant_info is True: + # Note: both quant and dequant paths are taken for + # reference ops. arg = arg.dequantize() new_args.append(arg) tensor_arg_idx += 1 @@ -519,10 +526,22 @@ def op_convert_after_hook( global_op_idx: List[int], ) -> Any: """ - This function is called aftern an op call in a converted model. - - TODO: add dequant, if needed + This function is called after an op call in a converted model. 
""" + # TODO(future PR): improve performance by moving this out of the + # path of non-reference ops + seen_q_op_info = self._get_cur_seen_q_op_info() + + if seen_q_op_info.is_reference_op_at_inference: + # given the current reference module design, + # we need to quantize to the target dtype + output_tensor_info = seen_q_op_info.output_tensor_infos[0] + tensor_id, inf_dtype = \ + output_tensor_info.id, output_tensor_info.inf_dtype + scale, zp = self.tensor_id_to_scale_zp[tensor_id] + output = torch.quantize_per_tensor( + output, scale, zp, inf_dtype) + if self.log_op_outputs: output_clone = clone_detach_tensor_without_dispatch(output) seen_q_op_info = self._get_cur_seen_q_op_info() @@ -796,11 +815,15 @@ def _first_call_op_prepare_before_hook_create_subgraphs( op_type_is_module = isinstance(op, torch.nn.Module) op_type = type(op) if op_type_is_module else op # type: ignore[assignment] qconfig = get_cur_qconfig(self.qconfig_dict, fqn, op_type) + # TODO(future PR): use API flag instead of qconfig for is_reference + is_reference_op_at_inference = \ + qconfig is not None and activation_is_int32_quantized(qconfig) self.idx_to_seen_q_op_infos[self.idx] = SeenQOpInfo( self.idx, op_type, op_type_is_module, fqn, arg_tensor_infos, [], packable_tensor_idx_to_name, packable_nontensor_idx_to_arg, packable_tensor_kwarg_name_to_name, - op_packing_only_uses_module_attributes, qconfig, None) + op_packing_only_uses_module_attributes, qconfig, None, + is_reference_op_at_inference) return args, kwargs @@ -826,19 +849,13 @@ def _first_call_op_prepare_after_hook_adjust_subgraphs( seen_q_op_info = self._get_cur_seen_q_op_info() func_output_dtype_type = get_func_output_dtype_type(seen_q_op_info) if func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEPENDS_ON_QCONFIG: - if isinstance(op, torch.nn.Module): - # For now, assume that eager mode convert has attached qconfig - # objects to any leaf module which needs quantization - if hasattr(op, 'activation_post_process'): - dtype_to_use = op.activation_post_process.dtype - else: - dtype_to_use = torch.float + qconfig = get_cur_qconfig( + self.qconfig_dict, seen_q_op_info.fqn, + seen_q_op_info.type) + if qconfig is None: + dtype_to_use = torch.float else: - qconfig = get_cur_qconfig(self.qconfig_dict, seen_q_op_info.fqn, op) - if qconfig is None: - dtype_to_use = torch.float - else: - dtype_to_use = qconfig.activation().dtype + dtype_to_use = qconfig.activation().dtype elif func_output_dtype_type == FuncOutputDTypeType.DTYPE_DEFAULT_BC_UNSUPPORTED_SYNTAX: dtype_to_use = torch.float @@ -939,42 +956,8 @@ def _maybe_insert_output_observers( assert seen_q_op_info.input_tensor_infos[0] is not None first_input_tensor_id = seen_q_op_info.input_tensor_infos[0].id - first_input_obs = None - if str(first_input_tensor_id) in self.tensor_id_to_observer: - first_input_obs = \ - self.tensor_id_to_observer[str(first_input_tensor_id)] - else: - # This observer may be in a module (handled by eager - # convert), in which case it's not in our map. For now, - # copy it from the module. In the future, we could look - # into having a soft link. 
- # TODO: make this handle more cases - # TODO: handle module -> add_scalar -> add_scalar - prev_op = get_producer_of_seen_q_op_info( - self.idx_to_seen_q_op_infos, seen_q_op_info) - assert prev_op is not None - # TODO: the following line needs to only check fqn - # for modules, not for functions - fqn_last_part = prev_op.fqn.split('.')[-1] - if hasattr(root_module, fqn_last_part): - first_input_mod = getattr(root_module, fqn_last_part) - else: - first_input_mod = None - # Currently, both tracing for module fusion and tracing for - # quantization go through this code path. When tracing - # for module fusion, quantizeable modules do not have - # observers yet. For this path to not crash, we create one. - # When tracing for quantization, this will be ignored. - # TODO(future PR): refactor to avoid this. - if first_input_mod and hasattr(first_input_mod, 'activation_post_process'): - first_input_obs = first_input_mod.activation_post_process - else: - # TODO(future PR): check qconfig is None - qconfig = get_cur_qconfig( - self.qconfig_dict, seen_q_op_info.fqn, seen_q_op_info.type) - assert qconfig is not None - first_input_obs = qconfig.activation() - + first_input_obs = \ + self.tensor_id_to_observer[str(first_input_tensor_id)] self.tensor_id_to_observer[str(output_tensor_id)] = first_input_obs def insert_observers(self, root_module: torch.nn.Module): @@ -982,6 +965,15 @@ def insert_observers(self, root_module: torch.nn.Module): self._maybe_insert_input_observers(seen_q_op_info) self._maybe_insert_output_observers(seen_q_op_info, root_module) + def get_output_observer_from_fqn(self, fqn: str) -> Optional[torch.nn.Module]: + for idx, seen_q_op_info in self.idx_to_seen_q_op_infos.items(): + if seen_q_op_info.fqn != fqn: + continue + output_tensor_id = seen_q_op_info.output_tensor_infos[0].id + if str(output_tensor_id) in self.tensor_id_to_observer: + return self.tensor_id_to_observer[str(output_tensor_id)] + return None + # This is a hack to enable nn.Sequential to properly work with # this class. # TODO(future): remove the hack diff --git a/torch/ao/quantization/_dbr/torchscript_utils.py b/torch/ao/quantization/_dbr/torchscript_utils.py new file mode 100644 index 000000000000..2efbbe5fd938 --- /dev/null +++ b/torch/ao/quantization/_dbr/torchscript_utils.py @@ -0,0 +1,15 @@ +import torch +from torch.jit._recursive import wrap_cpp_module + +def remove_redundant_aliases(scripted_module: torch.nn.Module): + """ + Running torch.jit.trace on a model with DBR quantization introduces + extra alias ops, because we use `torch.Tensor.as_subclass` and tracing + through this results in an `aten::alias` function call in TorchScript. + This pass removes these alias calls when it is safe to do so. 
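A possible usage sketch for the helper above; the tiny model here is only a placeholder for a DBR-converted model, and on a plain traced module the pass is expected to be a no-op::

    import torch
    import torch.nn as nn
    from torch.ao.quantization._dbr.torchscript_utils import remove_redundant_aliases

    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU()).eval()
    example_input = torch.randn(1, 4)

    traced = torch.jit.trace(model, example_input)
    traced = remove_redundant_aliases(traced)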
+ """ + module_c = scripted_module._c + module_c = \ + torch._C._jit_pass_dbr_quant_remove_redundant_aliases(module_c) # type: ignore[attr-defined] + scripted_module = wrap_cpp_module(module_c) + return scripted_module diff --git a/torch/ao/quantization/_dbr/utils.py b/torch/ao/quantization/_dbr/utils.py index 4b3465c26150..83b641e80662 100644 --- a/torch/ao/quantization/_dbr/utils.py +++ b/torch/ao/quantization/_dbr/utils.py @@ -102,6 +102,8 @@ class SeenQOpInfo: qconfig: QConfigAny # fusion_info for the op, is None if no fusion is found fusion_info: Optional[FusionInfo] + # True if this op is a reference op during inference + is_reference_op_at_inference: bool def __repr__(self) -> str: s = f"(type): {self.type}\n" @@ -233,9 +235,6 @@ def get_func_output_obs_type( seen_q_op_info: SeenQOpInfo, ) -> FuncOutputObsType: op_type = seen_q_op_info.type - is_module = isinstance(op_type, type(torch.nn.Module)) - if is_module: - return FuncOutputObsType.NONE if seen_q_op_info.qconfig is None: return FuncOutputObsType.NONE @@ -267,6 +266,8 @@ def get_func_output_obs_type( seen_q_op_info.input_tensor_infos[0].inf_dtype in (torch.int32, torch.int64) ): return FuncOutputObsType.NONE + elif op_type in (torch.nn.LSTM,): + return FuncOutputObsType.NONE return FuncOutputObsType.NEW_OBS def converted_func_needs_scale_zp(seen_q_op_info: SeenQOpInfo) -> bool: @@ -583,10 +584,9 @@ def get_torch_function_hook_type( # the direct __dict__ accesses are for performance, because # the default `torch.nn.Module.__getattr__` has overhead. parent_module_has_qstate = parent_module is not None and \ - '_modules' in parent_module.__dict__ and \ - '_auto_quant_state' in parent_module.__dict__['_modules'] + '_auto_quant_state' in parent_module.__dict__ needs_op_hooks = parent_module_has_qstate and \ - parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator] + parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(func) # type: ignore[union-attr, operator] if needs_op_hooks: return HookType.OP_HOOKS @@ -608,17 +608,15 @@ def get_module_hook_type( if cached_hook_type is not None: return cached_hook_type parent_module_has_qstate = parent_module is not None and \ - '_modules' in parent_module.__dict__ and \ - '_auto_quant_state' in parent_module.__dict__['_modules'] + '_auto_quant_state' in parent_module.__dict__ needs_op_hooks = parent_module_has_qstate and \ - parent_module.__dict__['_modules']['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator] + parent_module.__dict__['_auto_quant_state'].cur_op_needs_hooks(cur_module) # type: ignore[union-attr, operator] # We need IO hooks if # * we are calling forward on a module (always True here) # * that module has quant state # * that module does not need op hooks for the parent needs_io_hooks = ( - '_modules' in cur_module.__dict__ and - '_auto_quant_state' in cur_module.__dict__['_modules'] and + '_auto_quant_state' in cur_module.__dict__ and (not needs_op_hooks) ) needs_arg_dequants = parent_module_has_qstate and not needs_op_hooks @@ -652,7 +650,7 @@ def clone_detach_tensor_without_dispatch(x: torch.Tensor) -> torch.Tensor: def get_input_args_quant_dequant_info( seen_q_op_info: SeenQOpInfo, tensor_id_to_scale_zp: Dict[int, Tuple[torch.Tensor, torch.Tensor]], -) -> Tuple[List[Optional[Tuple[float, int]]], List[bool], bool]: +) -> Tuple[List[Optional[Tuple[float, int, torch.dtype]]], List[bool], bool]: """ Returns a list of information about the tensor inputs to the current op. 
@@ -678,7 +676,7 @@ def get_input_args_quant_dequant_info( # dequants [False, False] """ - quant_infos: List[Optional[Tuple[float, int]]] = [] + quant_infos: List[Optional[Tuple[float, int, torch.dtype]]] = [] dequant_infos: List[bool] = [] # determine the expected output dtype @@ -694,12 +692,20 @@ def get_input_args_quant_dequant_info( tensor_id = input_arg.id if input_arg.inf_dtype != output_dtype: any_arg_quant_or_dequant_needed = True - if output_dtype == torch.quint8: + if output_dtype in (torch.quint8, torch.qint32): assert tensor_id in tensor_id_to_scale_zp scale, zp = tensor_id_to_scale_zp[tensor_id] # TODO: return this to the caller - quant_infos.append((scale, zp,)) # type: ignore[arg-type] - dequant_infos.append(False) + quant_infos.append((scale, zp, output_dtype)) # type: ignore[arg-type] + if output_dtype == torch.qint32: + # For now, we treat all qint32 ops as reference, so + # we add a dequant before the op. + # TODO(future PR): extend this to more dtypes + # TODO(future PR): use is_reference flag instead of + # assuming + dequant_infos.append(True) + else: + dequant_infos.append(False) else: quant_infos.append(None) dequant_infos.append(True) @@ -727,3 +733,18 @@ def get_cur_qconfig( qconfig_dict, cur_op_type, cur_fqn, global_qconfig) return qconfig + + +# We store quantization state for all children on the top level module in a +# ModuleDict. In order to properly special case this module from other +# ModuleDict instances, we create a marker class for it. +class AutoQuantizationStateModuleDict(torch.nn.ModuleDict): + pass + +def get_fqn_valid_for_module_dict_key(fqn: str) -> str: + """ + Modifies `fqn` to make it a valid key to a ModuleDict. + """ + if fqn == '': + fqn = ' ' + return fqn.replace('.', ':') diff --git a/torch/ao/quantization/_quantize_dbr.py b/torch/ao/quantization/_quantize_dbr.py index b0e7222a7839..dbcfac60a177 100644 --- a/torch/ao/quantization/_quantize_dbr.py +++ b/torch/ao/quantization/_quantize_dbr.py @@ -73,6 +73,14 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, if len(module_fusion_fqns): model = torch.quantization.fuse_modules(model, module_fusion_fqns) + # Since we are reusing the auto_trace machinery to find fusion + # FQNs, we need to do some surgery to get qconfigs on modules + # after module fusion to be correct. 
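The new `AutoQuantizationStateModuleDict` keeps per-child quantization state on the top-level module, and `nn.ModuleDict` keys may not be empty or contain `'.'`, which is exactly what `get_fqn_valid_for_module_dict_key` works around. A quick demonstration of the escaping:

```python
from torch.ao.quantization._dbr.utils import get_fqn_valid_for_module_dict_key

print(repr(get_fqn_valid_for_module_dict_key('')))              # ' '
print(get_fqn_valid_for_module_dict_key('layer1.0.conv1'))      # layer1:0:conv1
```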
+ for _, child in model.named_modules(): + if isinstance(child, torch.nn.intrinsic._FusedModule): + if hasattr(child[0], 'qconfig'): + child.qconfig = child[0].qconfig + # delete all the DBR state from the model, so add_auto_observation # can start from a clean slate parents_to_delete_auto_quant_state = [] @@ -82,6 +90,15 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, for v in parents_to_delete_auto_quant_state: del v._auto_quant_state + del model._fqn_to_auto_quant_state_map + + for p in model.parameters(): + if hasattr(p, '_qtensor_info'): + del p._qtensor_info + for b in model.buffers(): + if hasattr(b, '_qtensor_info'): + del b._qtensor_info + # the model hierarchy might have changed during fusion, so we # have to delete the cached module hook types for k, v in model.named_modules(): @@ -102,11 +119,10 @@ def prepare(model, qconfig_dict, example_inputs, inplace=False, allow_list=None, child.qconfig = None # type: ignore[assignment] elif isinstance(child, torch.nn.LSTM): # TODO: fix LSTM handling in eager mode static quant and remove this - child.qconfig = None + qconfig_dict['object_type'][torch.nn.LSTM] = None + + # TODO(future PR): do the QAT module swap - model = torch.quantization.prepare( - model, inplace, allow_list, observer_non_leaf_module_list, - prepare_custom_config_dict) assert not inplace model = add_auto_observation( model, qconfig_dict, example_inputs, diff --git a/torch/ao/quantization/_quantize_fx_do_not_use.py b/torch/ao/quantization/_quantize_fx_do_not_use.py deleted file mode 100644 index d39abe299393..000000000000 --- a/torch/ao/quantization/_quantize_fx_do_not_use.py +++ /dev/null @@ -1,34 +0,0 @@ -import torch -from torch.fx import GraphModule -from typing import Dict, Any, Optional -from .quantize_fx import ( - _check_is_graph_module, - check_is_valid_convert_custom_config_dict -) -from .fx._convert_do_not_use import _convert_do_not_use - -def _convert_fx_do_not_use( - graph_module: GraphModule, is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None, - _remove_qconfig: bool = True, - backend_config_dict: Optional[Dict[str, Any]] = None) -> torch.nn.Module: - """ - Please do not use, this is a temporary function to migrate convert_fx - to a new implementation - """ - assert is_reference - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - - _check_is_graph_module(graph_module) - check_is_valid_convert_custom_config_dict(convert_custom_config_dict) - - quantized = _convert_do_not_use( - graph_module, is_reference, convert_custom_config_dict, - False, _remove_qconfig_flag=_remove_qconfig, - backend_config_dict=backend_config_dict) - - preserved_attributes = convert_custom_config_dict.get("preserved_attributes", []) - for attr_name in preserved_attributes: - setattr(quantized, attr_name, getattr(graph_module, attr_name)) - return quantized diff --git a/torch/ao/quantization/fx/backend_config/README.md b/torch/ao/quantization/backend_config/README.md similarity index 100% rename from torch/ao/quantization/fx/backend_config/README.md rename to torch/ao/quantization/backend_config/README.md diff --git a/torch/ao/quantization/backend_config/__init__.py b/torch/ao/quantization/backend_config/__init__.py new file mode 100644 index 000000000000..f62e344423d4 --- /dev/null +++ b/torch/ao/quantization/backend_config/__init__.py @@ -0,0 +1,11 @@ +from .tensorrt import get_tensorrt_backend_config_dict +from .native import get_native_backend_config_dict + +# TODO: add more validations +def 
validate_backend_config_dict(backend_config_dict): + return "configs" in backend_config_dict + +__all__ = [ + "get_native_backend_config_dict", + "get_tensorrt_backend_config_dict", +] diff --git a/torch/ao/quantization/backend_config/native.py b/torch/ao/quantization/backend_config/native.py new file mode 100644 index 000000000000..d1b254e08359 --- /dev/null +++ b/torch/ao/quantization/backend_config/native.py @@ -0,0 +1,722 @@ +from collections import namedtuple +from typing import List, Dict, Any +import operator +import torch +from torch.ao.quantization.backend_config.observation_type import ObservationType +import torch.nn.functional as F +import torch.nn as nn +import torch.nn.intrinsic as nni +import torch.nn.intrinsic.qat as nniqat +import torch.nn.qat as nnqat +import torch.nn.quantized._reference as nnqr +from ..observer import ( + default_fixed_qparams_range_0to1_observer, + default_fixed_qparams_range_neg1to1_observer, +) +from ..fake_quantize import FixedQParamsFakeQuantize +from ..fuser_method_mappings import ( + reverse_sequential_wrapper2, + reverse2, + reverse3, + fuse_conv_bn, + fuse_conv_bn_relu, + fuse_linear_bn, + fuse_convtranspose_bn, +) + +# TODO: rename to be more explict, e.g. qat_conv_relu +_ConvMetadata = namedtuple( + "_ConvMetadata", + ["root", "transpose", "bn", "reference", "transpose_reference", + "fused_conv_relu", "fused_conv_bn", "fused_conv_bn_relu", + "qat", "relu_qat", "bn_qat", "bn_relu_qat", + "func"]) +_Conv1dMetadata = _ConvMetadata( + nn.Conv1d, nn.ConvTranspose1d, nn.BatchNorm1d, nnqr.Conv1d, nnqr.ConvTranspose1d, + nni.ConvReLU1d, nni.ConvBn1d, nni.ConvBnReLU1d, + nnqat.Conv1d, nniqat.ConvReLU1d, nniqat.ConvBn1d, nniqat.ConvBnReLU1d, + F.conv1d) +_Conv2dMetadata = _ConvMetadata( + nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d, nnqr.Conv2d, nnqr.ConvTranspose2d, + nni.ConvReLU2d, nni.ConvBn2d, nni.ConvBnReLU2d, + nnqat.Conv2d, nniqat.ConvReLU2d, nniqat.ConvBn2d, nniqat.ConvBnReLU2d, + F.conv2d) +_Conv3dMetadata = _ConvMetadata( + nn.Conv3d, nn.ConvTranspose3d, nn.BatchNorm3d, nnqr.Conv3d, nnqr.ConvTranspose3d, + nni.ConvReLU3d, nni.ConvBn3d, nni.ConvBnReLU3d, + nnqat.Conv3d, nniqat.ConvReLU3d, nniqat.ConvBn3d, nniqat.ConvBnReLU3d, + F.conv3d) + +# =================== +# | DTYPE CONFIGS | +# =================== + +# weighted op int8 dtype config +# this is config for ops that has quantized weights, like linear, conv +weighted_op_int8_dtype_config = { + # optional, input activation dtype + "input_dtype": torch.quint8, + # optional, weight dtype + "weight_dtype": torch.qint8, + # optional, bias dtype + "bias_dtype": torch.float, + # optional, output activation dtype + "output_dtype": torch.quint8 +} + +default_op_quint8_dtype_config = { + # optional, input activation dtype + "input_dtype": torch.quint8, + # optional, output activation dtype + "output_dtype": torch.quint8, +} + +default_op_fp16_dtype_config = { + # optional, input activation dtype + "input_dtype": torch.float16, + # optional, weight dtype + "weight_dtype": torch.float16, + # optional, bias dtype + "bias_dtype": torch.float16, + # optional, output activation dtype + "output_dtype": torch.float16, +} + +default_dynamic_int8_dtype_config = { + "input_dtype": torch.quint8, + "weight_dtype": torch.qint8, + "bias_dtype": torch.float, + "output_dtype": torch.float, + # currently the dtype check is not yet enabled, so we provided the dtype_configs but + # it is not really used yet, + # we will enable it a bit later after we moved everything to backend_config_dict + "is_dynamic": True, +} + 
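The `weighted_op_int8_dtype_config` above mirrors what a quantized linear kernel actually consumes and produces. A small check of those dtypes on a stock quantized module (assumes a build with the fbgemm or qnnpack engine):

```python
import torch
import torch.nn.quantized as nnq

linear_q = nnq.Linear(4, 4)  # qint8 weight, float bias
x = torch.quantize_per_tensor(torch.randn(2, 4), 0.1, 0, torch.quint8)
y = linear_q(x)
print(linear_q.weight().dtype, linear_q.bias().dtype, x.dtype, y.dtype)
# torch.qint8 torch.float32 torch.quint8 torch.quint8
```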
+default_dynamic_float16_dtype_config = { + "input_dtype": torch.float16, + "weight_dtype": torch.float16, + "bias_dtype": torch.float, + "output_dtype": torch.float, + # currently the dtype check is not yet enabled, so we provided the dtype_configs but + # it is not really used yet, + # we will enable it a bit later after we moved everything to backend_config_dict + "is_dynamic": True, +} + +weight_only_quint8_dtype_config = { + "input_dtype": torch.float, + "weight_dtype": torch.quint8, + "output_dtype": torch.float, +} + +weight_only_quint4x2_dtype_config = { + "input_dtype": torch.float, + "weight_dtype": torch.quint4x2, + "output_dtype": torch.float, +} + +# ====================== +# | OPERATOR CONFIGS | +# ====================== + +def _get_default_op_backend_config(op, dtype_configs): + return { + "pattern": op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": dtype_configs, + } + +_DEFAULT_OP_INT8_CONFIGS = [ + _get_default_op_backend_config(op, [default_op_quint8_dtype_config]) for op in [ + torch.nn.ELU, + torch.nn.LeakyReLU, + torch.nn.Hardswish, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.Dropout, + torch.nn.functional.elu, + torch.nn.functional.hardswish, + torch.nn.functional.instance_norm, + torch.nn.functional.leaky_relu, + torch.nn.functional.dropout, + torch.nn.functional.layer_norm, + ]] + +def _get_linear_configs(dtype_configs): + """ + Return all configs related to linear modules and ops. + """ + observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT + linear_configs = [] + + # (1) Single linear modules/functions + # ------------------------------------- + # linear module + linear_configs.append({ + # Please see README under this folder for pattern format + "pattern": torch.nn.Linear, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + # the root module for the pattern, used to query the reference quantized module + # e.g. 
for a (torch.nn.ReLU, torch.nn.Linear) pattern, the root will be torch.nn.Linear + "root_module": torch.nn.Linear, + # the corresponding reference quantized module for the root module + "reference_quantized_module_for_root": nnqr.Linear, + "qat_module": nnqat.Linear, + }) + # linear qat module + linear_configs.append({ + "pattern": nnqat.Linear, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + }) + # functional linear + linear_configs.append({ + "pattern": torch.nn.functional.linear, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # (2) Linear + relu + # ------------------- + # 2.1 linear module + relu fusion config + # linear relu, linear module + relu module + linear_configs.append({ + "pattern": (torch.nn.ReLU, torch.nn.Linear), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, + }) + # linear relu, linear module + functional relu + linear_configs.append({ + "pattern": (torch.nn.functional.relu, torch.nn.Linear), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, + }) + + # 2.2 linear module + relu, fused module configs + # linear relu, fused module + linear_configs.append({ + "pattern": nni.LinearReLU, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + "qat_module": nniqat.LinearReLU, + }) + # linear relu, qat fused module + linear_configs.append({ + "pattern": nniqat.LinearReLU, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + }) + # 2.3 functional linear + relu configs + # linear relu, functional linear + relu module + linear_configs.append({ + "pattern": (torch.nn.ReLU, F.linear), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + # linear relu, functional linear + functional relu + linear_configs.append({ + "pattern": (F.relu, F.linear), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # (3) Linear + batchnorm + # ------------------------ + # 3.1 linear bn fusion + linear_configs.append({ + "pattern": (nn.BatchNorm1d, nn.Linear), + "dtype_configs": dtype_configs, + "fuser_method": reverse2(fuse_linear_bn), + "fused_module": nni.LinearBn1d, + }) + + # 3.2 linear bn fused + # linear bn, fused module + linear_configs.append({ + "pattern": nni.LinearBn1d, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + "qat_module": nniqat.LinearBn1d, + }) + # linear bn, qat fused module + linear_configs.append({ + "pattern": nniqat.LinearBn1d, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": torch.nn.Linear, + "reference_quantized_module_for_root": nnqr.Linear, + }) + return linear_configs + +def _get_conv_configs(): + """ + Return all configs related to conv modules and ops. 
+ """ + conv_configs = [] + observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT + dtype_configs = [weighted_op_int8_dtype_config] + for convs in [_Conv1dMetadata, _Conv2dMetadata, _Conv3dMetadata]: + + # (1) Single conv modules/functions + # ----------------------------------- + # conv module + conv_configs.append({ + "pattern": convs.root, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + "qat_module": convs.qat, + }) + # conv qat module + conv_configs.append({ + "pattern": convs.qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + # functional conv + conv_configs.append({ + "pattern": convs.func, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # (2) Conv + relu + # ----------------- + # 2.1 conv module + relu fusion configs + # conv relu fusion, conv module + relu module + conv_configs.append({ + "pattern": (torch.nn.ReLU, convs.root), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(convs.fused_conv_relu), + "fused_module": convs.fused_conv_relu, + }) + # conv relu fusion, conv module + functional relu + conv_configs.append({ + "pattern": (F.relu, convs.root), + "dtype_configs": dtype_configs, + "fuser_method": reverse_sequential_wrapper2(convs.fused_conv_relu), + "fused_module": convs.fused_conv_relu, + }) + # 2.2 conv module + relu fused module configs + # conv relu, fused module + conv_configs.append({ + "pattern": convs.fused_conv_relu, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + "qat_module": convs.relu_qat, + }) + # conv relu, qat fused module + conv_configs.append({ + "pattern": convs.relu_qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + # 2.3 functional conv + relu configs + # conv relu, functional conv + relu module + conv_configs.append({ + "pattern": (torch.nn.ReLU, convs.func), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + # conv relu, functional conv + functional relu + conv_configs.append({ + "pattern": (F.relu, convs.func), + "observation_type": observation_type, + "dtype_configs": dtype_configs, + }) + + # fused conv relu + conv_configs.append({ + "pattern": convs.fused_conv_relu, + "dtype_configs": dtype_configs, + "qat_module": convs.relu_qat, + }) + + conv_configs.append({ + "pattern": convs.relu_qat, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + + # (3) Conv + batchnorm (+ relu) + # ------------------------------- + # 3.1 conv bn fusion configs + # conv + bn fusion + conv_configs.append({ + "pattern": (convs.bn, convs.root), + "dtype_configs": dtype_configs, + "fuser_method": reverse2(fuse_conv_bn), + "fused_module": convs.fused_conv_bn, + }) + # conv + bn + relu module fusion + conv_configs.append({ + "pattern": (nn.ReLU, (convs.bn, convs.root)), + "dtype_configs": dtype_configs, + "fuser_method": reverse3(fuse_conv_bn_relu), + "fused_module": convs.fused_conv_bn_relu, + }) + # conv + bn + relu functional fusion + conv_configs.append({ + "pattern": (F.relu, (convs.bn, convs.root)), + 
"dtype_configs": dtype_configs, + "root_module": convs.root, + "fuser_method": reverse3(fuse_conv_bn_relu), + "fused_module": convs.fused_conv_bn_relu, + }) + # TODO: we can add fusion for torch.relu as well + + # 3.2 conv + bn (+ relu) fused module configs + # fused conv bn + conv_configs.append({ + "pattern": convs.fused_conv_bn, + "dtype_configs": dtype_configs, + "qat_module": convs.bn_qat, + }) + + # fused conv bn relu + conv_configs.append({ + "pattern": convs.fused_conv_bn_relu, + "dtype_configs": dtype_configs, + "qat_module": convs.bn_relu_qat, + }) + + # conv bn, qat fused module + conv_configs.append({ + "pattern": convs.bn_qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + # conv bn relu, qat fused module + conv_configs.append({ + "pattern": convs.bn_relu_qat, + "observation_type": observation_type, + "dtype_configs": dtype_configs, + "root_module": convs.root, + "reference_quantized_module_for_root": convs.reference, + }) + + # (4) conv transpose and its fusion + # 4.1 conv transpose config + conv_configs.append({ + "pattern": convs.transpose, + "dtype_configs": dtype_configs, + "root_module": convs.transpose, + "reference_quantized_module_for_root": convs.transpose_reference, + }) + + # 4.2 conv transpose + bn fusion + conv_configs.append({ + "pattern": (convs.bn, convs.transpose), + "dtype_configs": dtype_configs, + "fuser_method": reverse2(fuse_convtranspose_bn), + "root_module": convs.transpose, + "reference_quantized_module_for_root": convs.transpose_reference, + }) + + return conv_configs + +def _get_binary_op_configs(dtype_configs): + binary_op_configs: List[Dict[str, Any]] = [] + num_tensor_args_to_observation_type_mapping = { + # TODO: this is not used right now since we have extra check in prepare + # will need to change this to NO_OBSERVER later after we implemented + # Tensor dtype inference properly + 0: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + 1: ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, + 2: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + } + for op_with_quantized_bop_scalar_variant in [ + operator.add, torch.add, operator.mul, torch.mul]: + binary_op_configs.append({ + "pattern": (torch.nn.ReLU, op_with_quantized_bop_scalar_variant), + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + binary_op_configs.append({ + "pattern": (torch.nn.functional.relu, op_with_quantized_bop_scalar_variant), + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + binary_op_configs.append({ + "pattern": (torch.relu, op_with_quantized_bop_scalar_variant), + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + binary_op_configs.append({ + "pattern": op_with_quantized_bop_scalar_variant, + "num_tensor_args_to_observation_type": num_tensor_args_to_observation_type_mapping, + "dtype_configs": dtype_configs, + }) + return binary_op_configs + + +def _get_fixed_qparams_op_configs(): + fixed_qparams_op_configs = [] + for fixed_qparam_op, output_observer in [ + (torch.nn.Hardsigmoid, default_fixed_qparams_range_0to1_observer), + (torch.nn.functional.hardsigmoid, default_fixed_qparams_range_0to1_observer), + ("hardsigmoid", default_fixed_qparams_range_0to1_observer), + ("hardsigmoid_", 
default_fixed_qparams_range_0to1_observer), + (torch.nn.Sigmoid, default_fixed_qparams_range_0to1_observer), + (torch.sigmoid, default_fixed_qparams_range_0to1_observer), + ("sigmoid", default_fixed_qparams_range_0to1_observer), + ("sigmoid_", default_fixed_qparams_range_0to1_observer), + (torch.nn.Tanh, default_fixed_qparams_range_neg1to1_observer), + (torch.tanh, default_fixed_qparams_range_neg1to1_observer), + ("tanh", default_fixed_qparams_range_neg1to1_observer), + ("tanh_", default_fixed_qparams_range_neg1to1_observer), + (torch.nn.Softmax, default_fixed_qparams_range_0to1_observer), + ]: + fixed_qparams_op_configs.append({ + "pattern": fixed_qparam_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + # TODO: The following two keys are temporary, since we don't want to put observer in the configs + # we expect that it's provided by user + # What we want to put here is the requirement on observers, in this case dtype, + # quant_min, quant_max etc., but we need to first move all configs to + # backend_config_dict to do that, we'll remove these keys after we fully migrated + # everything to use backend_config_dict + "_overwrite_output_fake_quantizer": FixedQParamsFakeQuantize.with_args(observer=output_observer), + "_overwrite_output_observer": output_observer, + "dtype_configs": [ + weighted_op_int8_dtype_config, + default_op_fp16_dtype_config, + ], + }) + return fixed_qparams_op_configs + +_CAT_CONFIG = { + "pattern": torch.cat, + "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, + "dtype_configs": [ + default_op_quint8_dtype_config, + ] +} + +def _get_bn_configs(): + """ Get configs related to batchnorm + """ + bn_configs = [] + bn_to_fused_bn = { + torch.nn.BatchNorm2d: nni.BNReLU2d, + torch.nn.BatchNorm3d: nni.BNReLU3d, + } + for bn in bn_to_fused_bn.keys(): + fused_bn = bn_to_fused_bn[bn] + # bn module + relu module fusion config + bn_configs.append({ + "pattern": (torch.nn.ReLU, bn), + "dtype_configs": [default_op_quint8_dtype_config], + "fuser_method": reverse_sequential_wrapper2(fused_bn), + "fused_module": fused_bn, + }) + # bn module + F.relu fusion config + bn_configs.append({ + "pattern": (torch.nn.functional.relu, bn), + "dtype_configs": [default_op_quint8_dtype_config], + "fuser_method": reverse_sequential_wrapper2(bn_to_fused_bn[bn]), + "fused_module": fused_bn, + }) + bn_configs.append({ + "pattern": bn, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [default_op_quint8_dtype_config], + }) + + # fused bn configs + for fused_bn in bn_to_fused_bn.values(): + bn_configs.append({ + "pattern": fused_bn, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [default_op_quint8_dtype_config], + }) + return bn_configs + +def _get_share_qparams_op_configs(dtype_configs): + """ Get the operator config for the operators that works for both float and quantized input + if input is quantized, the output Tensor shares the same quantization parameter + with input. 
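To see this qparam sharing concretely (a quick check outside of the config machinery): for these ops a quantized input's scale and zero_point carry over to the output unchanged.

```python
import torch
import torch.nn.functional as F

x = torch.quantize_per_tensor(torch.randn(1, 2, 4, 4), 0.05, 64, torch.quint8)
y = F.adaptive_avg_pool2d(x, (1, 1))
z = x.flatten()
print(x.q_scale(), y.q_scale(), z.q_scale())                 # all 0.05
print(x.q_zero_point(), y.q_zero_point(), z.q_zero_point())  # all 64
```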
+ Example operator: avgpool2d, reshape, transpose, maxpool2d + Example observed operator: + observer_0 - avgpool2d - observer_0 (same observer instance as input) + """ + + def _get_share_qprams_op_backend_config(op): + return { + "pattern": op, + "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, + "dtype_configs": dtype_configs, + } + + share_qparams_ops = [ + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + torch.nn.Hardtanh, + torch.nn.Identity, + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.ReLU, + torch.nn.ReLU6, + torch.adaptive_avg_pool1d, + torch.nn.functional.adaptive_avg_pool2d, + torch.nn.functional.adaptive_avg_pool3d, + torch.nn.functional.hardtanh, + torch.nn.functional.hardtanh_, + torch.nn.functional.interpolate, + torch.nn.functional.max_pool1d, + torch.nn.functional.max_pool2d, + torch.nn.functional.max_pool3d, + torch.nn.functional.relu, + torch.nn.functional.relu6, + torch.avg_pool1d, + torch._C._nn.avg_pool2d, + torch._C._nn.avg_pool3d, + torch.clamp, + torch.flatten, + torch.mean, + torch.repeat_interleave, + torch.transpose, + torch.squeeze, + torch.stack, + torch.unsqueeze, + operator.floordiv, + "contiguous", + "clamp", + "detach", + "detach_", + "mean", + "permute", + "repeat", + "repeat_interleave", + "reshape", + "resize_", + "relu", + "relu_", + "shape", + "size", + "squeeze", + "squeeze_", + "transpose", + "unsqueeze", + "unsqueeze_", + "view" + ] + return [_get_share_qprams_op_backend_config(op) for op in share_qparams_ops] + +def _get_rnn_op_configs(): + rnn_op_configs = [] + for rnn_op, ref_rnn_op in [ + (nn.GRUCell, nnqr.GRUCell), + (nn.LSTMCell, nnqr.LSTMCell), + (nn.RNNCell, nnqr.RNNCell), + (nn.LSTM, nnqr.LSTM) + ]: + rnn_op_configs.append({ + "pattern": rnn_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [default_dynamic_int8_dtype_config, default_dynamic_float16_dtype_config], + "root_module": rnn_op, + "reference_quantized_module_for_root": ref_rnn_op, + }) + return rnn_op_configs + +def _get_embedding_op_configs(): + embedding_op_configs = [] + for embedding_op, qat_embedding_op, ref_embedding_op in [ + (nn.Embedding, nnqat.Embedding, nnqr.Embedding), + (nn.EmbeddingBag, nnqat.EmbeddingBag, nnqr.EmbeddingBag), + ]: + embedding_op_configs.append({ + "pattern": embedding_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [ + weight_only_quint8_dtype_config, + weight_only_quint4x2_dtype_config + ], + "qat_module": qat_embedding_op, + "root_module": embedding_op, + "reference_quantized_module_for_root": ref_embedding_op, + # This is temporary, and will be removed soon + "_input_output_observed": False + }) + # config for qat op + embedding_op_configs.append({ + "pattern": qat_embedding_op, + "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, + "dtype_configs": [ + weight_only_quint8_dtype_config, + weight_only_quint4x2_dtype_config + ], + "root_module": embedding_op, + "reference_quantized_module_for_root": ref_embedding_op, + # This is temporary, and will be removed soon + "_input_output_observed": False + }) + return embedding_op_configs + +def get_native_backend_config_dict(): + """ Get backend_config_dict for PyTorch Native backend (fbgemm/qnnpack). 
""" + linear_dtype_configs = [ + weighted_op_int8_dtype_config, + default_dynamic_int8_dtype_config, + default_dynamic_float16_dtype_config, + # TODO: maybe remove this since fbgemm/qnnpack doesn't have kernels for it + default_op_fp16_dtype_config, + ] + binary_op_dtype_configs = [ + weighted_op_int8_dtype_config, + default_op_fp16_dtype_config, + ] + share_qparams_op_dtype_configs = [ + default_op_quint8_dtype_config, + default_op_fp16_dtype_config + ] + return { + # optional + "name": "native", + "configs": [ + *_DEFAULT_OP_INT8_CONFIGS, + *_get_linear_configs(linear_dtype_configs), + *_get_conv_configs(), + *_get_binary_op_configs(binary_op_dtype_configs), + *_get_fixed_qparams_op_configs(), + _CAT_CONFIG, + *_get_bn_configs(), + *_get_share_qparams_op_configs(share_qparams_op_dtype_configs), + *_get_rnn_op_configs(), + *_get_embedding_op_configs(), + ], + } + +__all__ = [ + "get_native_backend_config_dict", +] diff --git a/torch/ao/quantization/fx/backend_config/observation_type.py b/torch/ao/quantization/backend_config/observation_type.py similarity index 100% rename from torch/ao/quantization/fx/backend_config/observation_type.py rename to torch/ao/quantization/backend_config/observation_type.py diff --git a/torch/ao/quantization/fx/backend_config/tensorrt.py b/torch/ao/quantization/backend_config/tensorrt.py similarity index 84% rename from torch/ao/quantization/fx/backend_config/tensorrt.py rename to torch/ao/quantization/backend_config/tensorrt.py index 6504ce7a9331..94895215bb10 100644 --- a/torch/ao/quantization/fx/backend_config/tensorrt.py +++ b/torch/ao/quantization/backend_config/tensorrt.py @@ -3,8 +3,12 @@ import torch.nn.qat as nnqat import torch.nn.intrinsic as nni import torch.nn.intrinsic.qat as nniqat +# TODO: maybe refactor this to a separate util function +from .native import _get_binary_op_configs +from .native import _get_linear_configs +from .native import _get_share_qparams_op_configs -from ...fuser_method_mappings import reverse2 +from ..fuser_method_mappings import reverse_sequential_wrapper2 def get_tensorrt_backend_config_dict(): """ Get the backend config dictionary for tensorrt backend @@ -31,20 +35,6 @@ def get_tensorrt_backend_config_dict(): } # operator (module/functional/torch ops) configs - linear_module_config = { - # Please see README under this folder for pattern format - "pattern": torch.nn.Linear, - "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, - "dtype_configs": [ - weighted_op_qint8_dtype_config, - ], - # the root module for the pattern, used to query the reference quantized module - # e.g. 
for a (torch.nn.ReLU, torch.nn.Linear) pattern, the root will be torch.nn.Linear - "root_module": torch.nn.Linear, - # the corresponding reference quantized module for the root module - "reference_quantized_module_for_root": torch.nn.quantized._reference.Linear, - "qat_module": nnqat.Linear, - } linear_qat_config = { "pattern": nnqat.Linear, "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, @@ -63,7 +53,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.LinearReLU), + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, } linear_relu_mf_config = { "pattern": (torch.nn.functional.relu, torch.nn.Linear), @@ -71,7 +62,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.LinearReLU), + "fuser_method": reverse_sequential_wrapper2(nni.LinearReLU), + "fused_module": nni.LinearReLU, } linear_relu_fused_config = { @@ -156,7 +148,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.ConvReLU2d), + "fuser_method": reverse_sequential_wrapper2(nni.ConvReLU2d), + "fused_module": nni.ConvReLU2d, } conv2d_relu_mm_config = { "pattern": (torch.nn.ReLU, torch.nn.Conv2d), @@ -164,7 +157,8 @@ def get_tensorrt_backend_config_dict(): "dtype_configs": [ weighted_op_qint8_dtype_config, ], - "fuser_method": reverse2(nni.ConvReLU2d), + "fuser_method": reverse_sequential_wrapper2(nni.ConvReLU2d), + "fused_module": nni.ConvReLU2d, } addmm_config = { "pattern": torch.addmm, @@ -186,18 +180,19 @@ def get_tensorrt_backend_config_dict(): non_weighted_op_qint8_dtype_config, ] } - identity_config = { - "pattern": torch.nn.Identity, - "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT, - "dtype_configs": [ - non_weighted_op_qint8_dtype_config, - ] - } + linear_dtype_configs = [ + weighted_op_qint8_dtype_config, + ] + binary_op_dtype_configs = [ + weighted_op_qint8_dtype_config, + ] + share_qparams_op_dtype_configs = [ + non_weighted_op_qint8_dtype_config, + ] return { # optional "name": "tensorrt", "configs": [ - linear_module_config, linear_qat_config, linear_relu_fused_config, linear_relu_qat_config, @@ -215,6 +210,12 @@ def get_tensorrt_backend_config_dict(): # conv3d_relu_fused_config, addmm_config, cat_config, - identity_config, + *_get_linear_configs(linear_dtype_configs), + *_get_binary_op_configs(binary_op_dtype_configs), + *_get_share_qparams_op_configs(share_qparams_op_dtype_configs), ] } + +__all__ = [ + "get_tensorrt_backend_config_dict", +] diff --git a/torch/ao/quantization/backend_config/utils.py b/torch/ao/quantization/backend_config/utils.py new file mode 100644 index 000000000000..95df3bf310c3 --- /dev/null +++ b/torch/ao/quantization/backend_config/utils.py @@ -0,0 +1,202 @@ +from typing import Dict, Any, List, Callable, Union, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from ..quantization_types import Pattern + +def get_pattern_to_dtype_configs( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, List[Dict[str, Any]]]: + pattern_to_dtype_configs: Dict[Pattern, List[Dict[str, torch.dtype]]] = dict() + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + dtype_configs = config["dtype_configs"] + pattern_to_dtype_configs[pattern] = dtype_configs + return pattern_to_dtype_configs + +def get_qat_module_classes( + 
backend_config_dict: Dict[str, Any]) -> Tuple[type, ...]: + qat_module_classes = [] + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + qat_module = config.get("qat_module", None) + if qat_module is not None: + qat_module_classes.append(qat_module) + return tuple(set(qat_module_classes)) + +def get_fused_module_classes( + backend_config_dict: Dict[str, Any]) -> Tuple[type, ...]: + fused_module_classes = [] + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + fused_module = config.get("fused_module", None) + if fused_module is not None: + fused_module_classes.append(fused_module) + return tuple(set(fused_module_classes)) + +def get_pattern_to_input_type_to_index( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Dict[str, int]]: + pattern_to_input_type_to_index: Dict[Pattern, Dict[str, int]] = dict() + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + input_type_to_index = config.get("input_type_to_index", {}) + pattern_to_input_type_to_index[pattern] = input_type_to_index + return pattern_to_input_type_to_index + +def get_root_module_to_quantized_reference_module( + backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: + mapping: Dict[Callable, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "root_module" in config and "reference_quantized_module_for_root" in config: + mapping[config["root_module"]] = config["reference_quantized_module_for_root"] + return mapping + +def get_fuser_method_mapping( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Union[nn.Sequential, Callable]]: + fuser_method_mapping : Dict[Pattern, Union[nn.Sequential, Callable]] = dict() + for config in backend_config_dict.get("configs", []): + if "fuser_method" in config: + pattern = config["pattern"] + fuser_method = config["fuser_method"] + fuser_method_mapping[pattern] = fuser_method + + return fuser_method_mapping + +def get_module_to_qat_module( + backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: + module_to_qat_module: Dict[Callable, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "pattern" in config and "qat_module" in config: + pattern = config["pattern"] + qat_module = config["qat_module"] + module_to_qat_module[pattern] = qat_module + + return module_to_qat_module + +def get_fusion_pattern_to_root_node_getter( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: + """ Get a map from fusion pattern to a function that returns the root node + from the fusion pattern, e.g. the most common one is: + def get_root_node(node_pattern): + while not isinstance(node_pattern[-1], Node): + node_pattern = node_pattern[-1] + return node_pattern[-1] + This can work for all patterns whose root node is the "last node" in the pattern, + e.g. (torch.add, MatchAllNode, (torch.ReLU, torch.Conv2d)) + """ + root_node_getter_mapping: Dict[Pattern, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "root_node_getter" in config: + pattern = config["pattern"] + root_node_getter = config["root_node_getter"] + root_node_getter_mapping[pattern] = root_node_getter + + return root_node_getter_mapping + +def get_fusion_pattern_to_extra_inputs_getter( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: + """ Get a map from fusion pattern to a function that returns extra input nodes + from the fusion pattern, in the order required by the root node. 
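These accessors are plain extractors over `backend_config_dict["configs"]`. A toy single-entry config (not the real native config) is enough to see what the helpers above return:

```python
import torch.nn as nn
import torch.nn.qat as nnqat
import torch.nn.quantized._reference as nnqr
from torch.ao.quantization.backend_config.utils import (
    get_module_to_qat_module,
    get_root_module_to_quantized_reference_module,
    get_fuser_method_mapping,
)

toy_config = {
    "configs": [{
        "pattern": nn.Linear,
        "dtype_configs": [],
        "root_module": nn.Linear,
        "reference_quantized_module_for_root": nnqr.Linear,
        "qat_module": nnqat.Linear,
    }],
}
print(get_module_to_qat_module(toy_config))                       # {Linear: nnqat.Linear}
print(get_root_module_to_quantized_reference_module(toy_config))  # {Linear: nnqr.Linear}
print(get_fuser_method_mapping(toy_config))                       # {} (no "fuser_method" key)
```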
This is optional, + if not specified, we will not copy over any extra inputs for the root node. + Example: + # Let's say we have the pattern (torch.add, MatchAllNode, (torch.nn.BatchNorm2d, torch.nn.Conv2d)) + # and root node is torch.nn.Conv2d, and the node in MatchAllNode would be an extra + # argument to the fused module, we can unpack the pattern and return the node at + # MatchAllNode here + # we can implement extra_inputs_getter as follows: + def extra_inputs_getter(pattern) -> List[Any]: + add, extra_input, conv_pattern = pattern + return [extra_input] + """ + extra_inputs_getter_mapping: Dict[Pattern, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "extra_inputs_getter" in config: + pattern = config["pattern"] + extra_inputs_getter = config["extra_inputs_getter"] + extra_inputs_getter_mapping[pattern] = extra_inputs_getter + + return extra_inputs_getter_mapping + +def remove_boolean_dispatch_from_name(p) -> Any: + """ + Some ops have a default string representation such as + '.fn at 0x7ff1106bf280>', + this function replaces them with the hardcoded function names. + """ + if p is F.fractional_max_pool2d: + return "torch.nn.functional.fractional_max_pool2d" + elif p is F.fractional_max_pool3d: + return "torch.nn.functional.fractional_max_pool3d" + elif p is F.max_pool1d: + return "torch.nn.functional.max_pool1d" + elif p is F.max_pool2d: + return "torch.nn.functional.max_pool2d" + elif p is F.max_pool3d: + return "torch.nn.functional.max_pool3d" + elif p is F.adaptive_max_pool1d: + return "torch.nn.functional.adaptive_max_pool1d" + elif p is F.adaptive_max_pool2d: + return "torch.nn.functional.adaptive_max_pool2d" + elif p is F.adaptive_max_pool3d: + return "torch.nn.functional.adaptive_max_pool3d" + assert "boolean_dispatch" not in str(p), \ + f"{p} does not have a human readable representation in " + \ + "quantization documentation" + return p + +def pattern_to_human_readable(p) -> Any: + if isinstance(p, tuple): + # nested patterns, recurse + return tuple(pattern_to_human_readable(inner_p) for inner_p in p) + elif isinstance(p, str): + # method names are already human readable + return p + else: + p = remove_boolean_dispatch_from_name(p) + return p + +# TODO(future PR): move backend_config_dict to use dataclass and move this logic to +# the corresponding __str__ function +def entry_to_pretty_str(entry) -> str: + """ + Given a backend_config_dict entry, returns a string with the human readable + representation of it. 
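For instance, `pattern_to_human_readable` (defined above) recurses through nested patterns and swaps `boolean_dispatch` wrappers such as `F.max_pool2d` for readable names:

```python
import torch.nn as nn
import torch.nn.functional as F
from torch.ao.quantization.backend_config.utils import pattern_to_human_readable

print(pattern_to_human_readable((nn.ReLU, F.max_pool2d)))
# (<class 'torch.nn.modules.activation.ReLU'>, 'torch.nn.functional.max_pool2d')
```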
+ """ + s = "{\n" + + # always output the pattern first + if "pattern" in entry: + pattern_str = pattern_to_human_readable(entry["pattern"]) + + s += f" 'pattern': {pattern_str},\n" + + # custom output for dtype_configs to make it look nice + if "dtype_configs" in entry: + s += " 'dtype_configs': [\n" + for dtype_config in entry["dtype_configs"]: + s += " {\n" + for k, v in dtype_config.items(): + s += f" '{k}': {v},\n" + s += " },\n" + s += " ],\n" + + # custom output for num_tensor_args_to_observation_type to make it look nice + if "num_tensor_args_to_observation_type" in entry: + s += " 'num_tensor_args_to_observation_type': {\n" + for k, v in entry["num_tensor_args_to_observation_type"].items(): + s += f" {k}: {v},\n" + s += " },\n" + + # output all the other fields + custom_handled_fields = [ + "pattern", + "dtype_configs", + "num_tensor_args_to_observation_type", + ] + for field_name in entry: + if field_name in custom_handled_fields: + continue + s += f" '{field_name}': {entry[field_name]},\n" + + s += "}" + return s diff --git a/torch/ao/quantization/fake_quantize.py b/torch/ao/quantization/fake_quantize.py index 9e49a8392e3e..b4e295fbd4d0 100644 --- a/torch/ao/quantization/fake_quantize.py +++ b/torch/ao/quantization/fake_quantize.py @@ -6,14 +6,12 @@ import torch from torch.nn import Module from torch.ao.quantization.observer import ( - MinMaxObserver, MovingAverageMinMaxObserver, HistogramObserver, MovingAveragePerChannelMinMaxObserver, - PerChannelMinMaxObserver, FixedQParamsObserver, - default_affine_fixed_qparams_observer, - default_symmetric_fixed_qparams_observer, + default_fixed_qparams_range_0to1_observer, + default_fixed_qparams_range_neg1to1_observer, _with_args, ) import re @@ -92,30 +90,23 @@ class FakeQuantize(FakeQuantizeBase): * :attr:`zero_point` specifies the quantized value to which 0 in floating point maps to - * :attr:`quant_min` specifies the minimum allowable quantized value. - - * :attr:`quant_max` specifies the maximum allowable quantized value. - * :attr:`fake_quant_enabled` controls the application of fake quantization on tensors, note that statistics can still be updated. * :attr:`observer_enabled` controls statistics collection on tensors * :attr:`dtype` specifies the quantized dtype that is being emulated with fake-quantization, - allowable values are torch.qint8 and torch.quint8. The values of quant_min and - quant_max should be chosen to be consistent with the dtype + allowable values are torch.qint8 and torch.quint8. Args: observer (module): Module for observing statistics on input tensors and calculating scale and zero-point. - quant_min (int): The minimum allowable quantized value. - quant_max (int): The maximum allowable quantized value. observer_kwargs (optional): Arguments for the observer module Attributes: - observer (Module): User provided module that collects statistics on the input tensor and + activation_post_process (Module): User provided module that collects statistics on the input tensor and provides a method to calculate scale and zero-point. 
""" @@ -123,15 +114,27 @@ class FakeQuantize(FakeQuantizeBase): scale: torch.Tensor zero_point: torch.Tensor - def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, **observer_kwargs): + def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=None, quant_max=None, **observer_kwargs): super().__init__() - assert quant_min <= quant_max, \ - 'quant_min must be less than or equal to quant_max' - self.quant_min = quant_min - self.quant_max = quant_max + # Populate quant_min/quant_max to observer_kwargs if valid + if quant_min is not None and quant_max is not None: + assert quant_min <= quant_max, \ + 'quant_min must be less than or equal to quant_max' + dtype = observer_kwargs.get("dtype", torch.quint8) + if hasattr(observer, "p"): + # In case observer is _PartialWrapper, dtype can be stored in + # observer.p.keywords["dtype"] + dtype = getattr(getattr(observer, "p", {}), "keywords", {}).get( + "dtype", dtype + ) + assert torch.iinfo(dtype).min <= quant_min, 'quant_min out of bound' + assert quant_max <= torch.iinfo(dtype).max, 'quant_max out of bound' + observer_kwargs.update({"quant_min": quant_min, "quant_max": quant_max}) self.activation_post_process = observer(**observer_kwargs) - assert torch.iinfo(self.activation_post_process.dtype).min <= quant_min, 'quant_min out of bound' - assert quant_max <= torch.iinfo(self.activation_post_process.dtype).max, 'quant_max out of bound' + # TODO: keeping self.quant_min/max for BC; remove after a couple releases + # Users should use self.activation_post_process.quant_min + self.quant_min = self.activation_post_process.quant_min + self.quant_max = self.activation_post_process.quant_max if _is_float_qparams(self.activation_post_process.qscheme): zero_point_dtype = torch.float else: @@ -167,11 +170,11 @@ def forward(self, X): if self.is_per_channel: X = torch.fake_quantize_per_channel_affine( X, self.scale, self.zero_point, - self.ch_axis, self.quant_min, self.quant_max) + self.ch_axis, self.activation_post_process.quant_min, self.activation_post_process.quant_max) else: X = torch.fake_quantize_per_tensor_affine( X, self.scale, self.zero_point, - self.quant_min, self.quant_max) + self.activation_post_process.quant_min, self.activation_post_process.quant_max) return X @torch.jit.export @@ -180,7 +183,7 @@ def extra_repr(self): 'quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, ' \ 'scale={}, zero_point={}'.format( self.fake_quant_enabled, self.observer_enabled, - self.quant_min, self.quant_max, + self.activation_post_process.quant_min, self.activation_post_process.quant_max, self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point) def _save_to_state_dict(self, destination, prefix, keep_vars): @@ -233,8 +236,6 @@ def __init__(self, observer): assert type(self.activation_post_process) == FixedQParamsObserver,\ "%s's observer must be a %s" % (self.__class__.__name__, FixedQParamsObserver.__name__) self._observer_ctr = observer - self.quant_min = self.activation_post_process.quant_min - self.quant_max = self.activation_post_process.quant_max self.scale = self.activation_post_process.scale self.zero_point = self.activation_post_process.zero_point assert _is_per_tensor(self.qscheme), 'Only per tensor quantization is supported' + \ @@ -250,7 +251,7 @@ def extra_repr(self): 'dtype={}, quant_min={}, quant_max={}, qscheme={}'.format( self.fake_quant_enabled, self.observer_enabled, self.scale, self.zero_point, self.dtype, - self.quant_min, self.quant_max, self.qscheme) + 
self.activation_post_process.quant_min, self.activation_post_process.quant_max, self.qscheme) class FusedMovingAvgObsFakeQuantize(FakeQuantize): @@ -279,14 +280,10 @@ def __init__( super().__init__(observer, quant_min, quant_max, **observer_kwargs) assert isinstance(self.activation_post_process, (MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver)),\ "Fused observer+fake_quant module only works with MovingAverageMinMaxObserver" - self.quant_min: int = quant_min - self.quant_max: int = quant_max self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.long)) self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.long)) self.is_symmetric_quant = _is_symmetric_quant(self.activation_post_process.qscheme) - self.quant_min, self.quant_max = self.activation_post_process.quant_min, self.activation_post_process.quant_max - @torch.jit.export def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]: return self.activation_post_process.calculate_qparams() @@ -301,8 +298,8 @@ def extra_repr(self) -> str: self.scale, self.zero_point, self.dtype, - self.quant_min, - self.quant_max, + self.activation_post_process.quant_min, + self.activation_post_process.quant_max, self.qscheme, self.activation_post_process.reduce_range, ) @@ -318,8 +315,8 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: self.scale, self.zero_point, self.activation_post_process.averaging_constant, - self.quant_min, - self.quant_max, + self.activation_post_process.quant_min, + self.activation_post_process.quant_max, self.ch_axis, self.is_per_channel, self.is_symmetric_quant, @@ -335,16 +332,24 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, reduce_range=False) """ Default fake_quant for weights. +Observer is memoryless since averaging_constant is 1. """ -default_dynamic_fake_quant = FakeQuantize.with_args(observer=MinMaxObserver, quant_min=0, quant_max=255, - dtype=torch.quint8, memoryless=True) +default_dynamic_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, + dtype=torch.quint8, averaging_constant=1) """ Default dynamic fake_quant for activations. """ -default_symmetric_fixed_qparams_fake_quant = FixedQParamsFakeQuantize.with_args(observer=default_symmetric_fixed_qparams_observer) -default_affine_fixed_qparams_fake_quant = FixedQParamsFakeQuantize.with_args(observer=default_affine_fixed_qparams_observer) +default_fixed_qparams_range_neg1to1_fake_quant = ( + FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_neg1to1_observer) +) +default_fixed_qparams_range_0to1_fake_quant = ( + FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_0to1_observer) +) +# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases +default_symmetric_fixed_qparams_fake_quant = default_fixed_qparams_range_neg1to1_fake_quant +default_affine_fixed_qparams_fake_quant = default_fixed_qparams_range_0to1_fake_quant default_per_channel_weight_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver, quant_min=-128, @@ -355,23 +360,25 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: ch_axis=0) """ Default fake_quant for per-channel weights. +Observer is memoryless since averaging_constant is 1. 
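"Memoryless since averaging_constant is 1" means the moving average fully overwrites the previous range on every call, so the observer only ever reflects the most recent batch:

```python
import torch
from torch.ao.quantization.observer import MovingAverageMinMaxObserver

obs = MovingAverageMinMaxObserver(averaging_constant=1)
obs(torch.tensor([-10.0, 10.0]))
obs(torch.tensor([-1.0, 1.0]))
print(obs.min_val, obs.max_val)  # tensor(-1.) tensor(1.), earlier range forgotten
```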
""" -default_embedding_fake_quant = FakeQuantize.with_args(observer=PerChannelMinMaxObserver, +default_embedding_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver, qscheme=torch.per_channel_affine_float_qparams, dtype=torch.quint8, quant_min=0, quant_max=255, ch_axis=0, - memoryless=True) + averaging_constant=1) """ Default fake_quant for embeddings. +Observer is memoryless since averaging_constant is 1. """ -default_embedding_fake_quant_4bit = FakeQuantize.with_args(observer=PerChannelMinMaxObserver, +default_embedding_fake_quant_4bit = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0, dtype=torch.quint4x2, - memoryless=True) + averaging_constant=1) default_histogram_fake_quant = FakeQuantize.with_args(observer=HistogramObserver, quant_min=0, @@ -411,6 +418,27 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: Fused version of `default_per_channel_weight_fake_quant`, with improved performance. """ +fused_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-127, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_tensor_symmetric, + eps=2 ** -12) +""" +Fused version of `default_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128. +""" + +fused_per_channel_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-127, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_channel_symmetric, + eps=2 ** -12) +""" +Fused version of `default_per_channel_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128. +""" + + def _is_fake_quant_script_module(mod): ''' Returns true if given mod is an instance of FakeQuantize script module. ''' diff --git a/torch/ao/quantization/fuse_modules.py b/torch/ao/quantization/fuse_modules.py index f276eea3c871..1f7027f5c8d5 100644 --- a/torch/ao/quantization/fuse_modules.py +++ b/torch/ao/quantization/fuse_modules.py @@ -7,6 +7,7 @@ # for backward compatiblity from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn # noqa: F401 from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn_relu # noqa: F401 +from torch.nn.utils.parametrize import type_before_parametrizations from typing import List, Optional @@ -41,7 +42,7 @@ def fuse_known_modules(mod_list, is_qat, additional_fuser_method_mapping=None): For these sequences, the first element in the output module list performs the fused operation. 
The rest of the elements are set to nn.Identity() """ - types = tuple(type(m) for m in mod_list) + types = tuple(type_before_parametrizations(m) for m in mod_list) fuser_method = get_fuser_method(types, additional_fuser_method_mapping) if fuser_method is None: raise NotImplementedError("Cannot fuse modules: {}".format(types)) diff --git a/torch/ao/quantization/fuser_method_mappings.py b/torch/ao/quantization/fuser_method_mappings.py index 23e5a1f4c35a..a2882f136047 100644 --- a/torch/ao/quantization/fuser_method_mappings.py +++ b/torch/ao/quantization/fuser_method_mappings.py @@ -5,7 +5,8 @@ from torch.ao.quantization.utils import Pattern from torch.ao.quantization.utils import get_combined_dict - +from torch.ao.quantization.utils import MatchAllNode +import itertools def fuse_conv_bn(is_qat, conv, bn): r"""Given the conv and bn modules, fuses them and returns the fused module @@ -32,8 +33,6 @@ def fuse_conv_bn(is_qat, conv, bn): } if is_qat: - # TODO: remove the assert later - assert conv.training, "qat is only supported when conv.training is True currently" assert bn.num_features == conv.out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' assert bn.affine, 'Only support fusing BatchNorm2d with affine set to True' assert bn.track_running_stats, 'Only support fusing BatchNorm2d with tracking_running_stats set to True' @@ -65,8 +64,6 @@ def fuse_conv_bn_relu(is_qat, conv, bn, relu): "Conv and BN both must be in the same mode (train or eval)." fused_module : Optional[Type[nn.Sequential]] = None if is_qat: - # TODO: remove the assert later - assert conv.training, "qat is only supported when conv.training is True currently" map_to_fused_module_train = { nn.Conv1d: nni.ConvBnReLU1d, nn.Conv2d: nni.ConvBnReLU2d, @@ -112,9 +109,12 @@ def fuse_linear_bn(is_qat, linear, bn): "Linear and BN both must be in the same mode (train or eval)." if is_qat: - # TODO: remove the assert later - assert linear.training, "qat is only supported when linear.training is True currently" - raise Exception("Fusing Linear+BatchNorm not yet supported in training.") + assert bn.num_features == linear.out_features,\ + "Output features of Linear must match num_features of BatchNorm1d" + assert bn.affine, "Only support fusing BatchNorm1d with affine set to True" + assert bn.track_running_stats,\ + "Only support fusing BatchNorm1d with tracking_running_stats set to True" + return nni.LinearBn1d(linear, bn) else: return nn.utils.fusion.fuse_linear_bn_eval(linear, bn) @@ -136,8 +136,7 @@ def fuse_convtranspose_bn(is_qat, convt, bn): "ConvTranspose and BN both must be in the same mode (train or eval)." if is_qat: - assert convt.training, "qat is only supported when convt.training is True currently" - raise Exception("Fusing ConvTranspose+BatchNorm not yet supported in training.") + raise Exception("Fusing ConvTranspose+BatchNorm not yet supported in QAT.") else: return nn.utils.fusion.fuse_conv_bn_eval(convt, bn, transpose=True) @@ -221,6 +220,37 @@ def reversed(is_qat, x, w): (nn.BatchNorm3d, nn.ConvTranspose3d): reverse2(fuse_convtranspose_bn), } +def get_valid_patterns(op_pattern): + """ + Returns a list of valid patterns generated from the op_pattern, + since MatchAllNode can match all types of nodes, + e.g. 
pattern (torch.nn.Conv2d, torch.add) should also be able to match keys like + (MatchAllNode, torch.add) and (torch.nn.Conv2d, MatchAllNode) + + Example Input: + (torch.add, (torch.nn.ReLU, torch.nn.Conv2d)) + + Example Output: + [(torch.add, (torch.nn.ReLU, torch.nn.Conv2d)), + (torch.add, (torch.nn.ReLU, MatchAllNode)), + (torch.add, (MatchAllNode, torch.nn.Conv2d)), + (torch.add, (MatchAllNode, MatchAllNode)), + (MatchAllNode, (torch.nn.ReLU, torch.nn.Conv2d)), + (MatchAllNode, (torch.nn.ReLU, MatchAllNode)), + (MatchAllNode, (MatchAllNode, torch.nn.Conv2d)), + (MatchAllNode, (MatchAllNode, MatchAllNode)), + ] + """ + result = [] + if isinstance(op_pattern, (tuple, list)): + sub_combs = [] + for sub_pattern in op_pattern: + sub_combs.append(get_valid_patterns(sub_pattern)) + result = list(itertools.product(*sub_combs)) + else: + result = [op_pattern, MatchAllNode] + return result + def get_fuser_method_new( op_pattern: Pattern, fuser_method_mapping: Optional[Dict[Pattern, Union[nn.Sequential, Callable]]] = None): @@ -230,6 +260,11 @@ def get_fuser_method_new( if fuser_method_mapping is None: fuser_method_mapping = DEFAULT_PATTERN_TO_FUSER_METHOD - fuser_method = fuser_method_mapping.get(op_pattern, None) + op_patterns = get_valid_patterns(op_pattern) + fuser_method = None + for op_pattern in op_patterns: + fuser_method = fuser_method_mapping.get(op_pattern, None) + if fuser_method is not None: + break assert fuser_method is not None, "did not find fuser method for: {} ".format(op_pattern) return fuser_method diff --git a/torch/ao/quantization/fx/__init__.py b/torch/ao/quantization/fx/__init__.py index 08d613fae771..0e37eaaded97 100644 --- a/torch/ao/quantization/fx/__init__.py +++ b/torch/ao/quantization/fx/__init__.py @@ -1,4 +1,3 @@ from .prepare import prepare from .convert import convert -from .fuse import Fuser -from .backend_config import get_tensorrt_backend_config_dict +from .fuse import fuse diff --git a/torch/ao/quantization/fx/_convert_do_not_use.py b/torch/ao/quantization/fx/_convert_do_not_use.py deleted file mode 100644 index 29e8b71c2a93..000000000000 --- a/torch/ao/quantization/fx/_convert_do_not_use.py +++ /dev/null @@ -1,316 +0,0 @@ -from typing import Any, Dict, List, Optional -import torch -from torch.fx import ( - GraphModule, -) -from torch.fx.graph import ( - Graph, - Node, -) -from ..qconfig import QConfigAny -from ..utils import ( - activation_is_int8_quantized, - weight_is_statically_quantized, - get_qparam_dict, - _parent_name, -) -from .backend_config.utils import get_quantized_reference_module_mapping - -from .graph_module import ( - QuantizedGraphModule, - is_observed_standalone_module, -) -from ._equalize import update_obs_for_equalization, convert_eq_obs -from .utils import ( - get_custom_module_class_keys, - get_quantize_node_info, - create_getattr_from_value, -) - -from torch.ao.quantization.quantize import ( - _remove_qconfig, - is_activation_post_process, -) - -from .convert import restore_state - -# these are tuples so that they can work with isinstance(module, tuple_of_classes) -FUSED_MODULE_CLASSES = ( - torch.nn.intrinsic.LinearReLU, - torch.nn.intrinsic.ConvReLU1d, - torch.nn.intrinsic.ConvReLU2d, - torch.nn.intrinsic.ConvReLU3d, -) - -QAT_MODULE_CLASSES = ( - torch.nn.qat.Linear, - torch.nn.qat.Conv2d, - torch.nn.qat.Conv3d, - torch.nn.intrinsic.qat.LinearReLU, - torch.nn.intrinsic.qat.ConvBn2d, - torch.nn.intrinsic.qat.ConvBnReLU2d, - torch.nn.intrinsic.qat.ConvReLU2d, - torch.nn.intrinsic.qat.ConvBn3d, - torch.nn.intrinsic.qat.ConvBnReLU3d, - 
torch.nn.intrinsic.qat.ConvReLU3d -) - -def insert_dequantize_node( - node: Node, - graph: Graph): - """ Inserts dequantize node for `node` in `graph` - """ - with graph.inserting_after(node): - dequantize_node = graph.call_method("dequantize", (node,)) - for user_node in dict(node.users): - if user_node is not dequantize_node: - user_node.replace_input_with(node, dequantize_node) - -def _convert_do_not_use( - model: GraphModule, is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None, - is_standalone_module: bool = False, - _remove_qconfig_flag: bool = True, - backend_config_dict: Optional[Dict[str, Any]] = None) -> torch.nn.Module: - """ - We will convert an observed model (a module with observer calls) to a reference - quantized model, the rule is simple: - 1. for each observer module call in the graph, we'll convert it to calls to - quantize and dequantize functions based on the observer instance - 2. for weighted operations like linear/conv, we need to convert them to reference - quantized module, this requires us to know whether the dtype configured for the - weight is supported in the backend, this is done in prepare step and the result - is stored in observed_node_names, we can decide whether we need to swap the - module based on this set - - standalone_module means it a submodule that is not inlined in - parent module, and will be quantized separately as one unit. - - Returns a quantized standalone module, whether input/output is quantized is - specified by prepare_custom_config_dict, with - input_quantized_idxs, output_quantized_idxs, please - see docs for prepare_fx for details - """ - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - patterns, node_name_to_scope, prepare_custom_config_dict, observed_node_names = restore_state(model) - qconfig_map: Dict[str, QConfigAny] = model._qconfig_map # type: ignore[assignment] - - assert is_reference, "_convert_do_not_use only supports reference option" - - # mapping from fully qualified module name to module instance - # for example, - # { - # '': Model(...), - # 'linear': Linear(...), - # 'linear.weight_fake_quant': PerChannelMinMaxObserver(...), - # } - # We use remove_duplicate=False here because torch.cat uses - # the same activation_post_process module instance but different names - modules = dict(model.named_modules(remove_duplicate=False)) - - custom_module_classes = get_custom_module_class_keys( - convert_custom_config_dict, - "observed_to_quantized_custom_module_class") - - if model._equalization_qconfig_map is not None: - # If we want to do equalization then do the following: - # Calculate the equalization scale, update the observers with the scaled - # inputs, and scale the weight - weight_eq_obs_dict = update_obs_for_equalization(model, modules) - convert_eq_obs(model, modules, weight_eq_obs_dict) - - graph_inputs: List[str] = [] - for node in model.graph.nodes: - if node.op == 'placeholder': - graph_inputs.append(node.name) - - def replace_observer_with_quantize_dequantize_node(graph: Graph, node: Node, modules: Dict[str, torch.nn.Module]) -> None: - """ Replace activation_post_process module call node with quantize and - dequantize node - - Before: - ... -> observer_0(x) -> ... - After: - ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ... 
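The deleted insert_dequantize_node helper is a small torch.fx graph rewrite: create a dequantize call after a node and reroute the node's existing users through it. Below is a standalone sketch of the same rewrite on a toy traced module, using only the public torch.fx API; the module and tensor values are made up for illustration and this is not the current convert path.

import torch
from torch import fx

class M(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)

gm = fx.symbolic_trace(M())

def insert_dequantize_after(node, graph):
    # reroute every existing user of `node` through a new dequantize call,
    # mirroring the deleted insert_dequantize_node helper
    with graph.inserting_after(node):
        dq = graph.call_method("dequantize", (node,))
    for user in list(node.users):
        if user is not dq:
            user.replace_input_with(node, dq)

placeholder = next(n for n in gm.graph.nodes if n.op == "placeholder")
insert_dequantize_after(placeholder, gm.graph)
gm.recompile()

x_q = torch.quantize_per_tensor(torch.randn(3), 0.1, 0, torch.quint8)
print(gm.code)   # relu now consumes x.dequantize()
print(gm(x_q))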
- """ - assert modules is not None - assert isinstance(node.target, str) - observer_module = modules[node.target] - root_module = modules[""] - if observer_module.dtype == torch.float32: - # remove the node for now - # TODO: support dynamic quant - with graph.inserting_before(node): - node.replace_all_uses_with(node.args[0]) - graph.erase_node(node) - elif observer_module.dtype in [torch.quint8, torch.qint8, torch.float16]: - node_type, quantize_op, qparams = get_quantize_node_info(observer_module) - # replace observer node with quant - dequant node - with graph.inserting_before(node): - input_node = node.args[0] - inputs = [input_node] - for key, value in qparams.items(): - if key in ['_scale_', '_zero_point_']: - # For scale and zero_point values we register them as buffers in the root module. - # TODO: maybe need more complex attr name here - qparam_node = create_getattr_from_value(root_module, graph, key, value) - inputs.append(qparam_node) - else: - # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph. - inputs.append(value) - - quantized_node = graph.create_node(node_type, quantize_op, tuple(inputs), {}) - dequantized_node = graph.call_method("dequantize", args=(quantized_node,)) - node.replace_all_uses_with(dequantized_node) - graph.erase_node(node) - - - # additional state to override inputs to be quantized, if specified - # by the user - placeholder_node_seen_cnt = 0 - output_node_seen_cnt = 0 - input_quantized_idxs: List[int] = prepare_custom_config_dict.get( - "input_quantized_idxs", []) - output_quantized_idxs: List[int] = prepare_custom_config_dict.get( - "output_quantized_idxs", []) - - if backend_config_dict is None: - backend_config_dict = {} - quantized_reference_module_mapping = get_quantized_reference_module_mapping(backend_config_dict) - # convert tuples so that it can work with isinstance(module, tuple_of_classes) - weighted_module_classes = tuple(quantized_reference_module_mapping.keys()) - - for node in list(model.graph.nodes): - if node.op == 'placeholder': - cur_placeholder_node_idx = placeholder_node_seen_cnt - placeholder_node_seen_cnt += 1 - if cur_placeholder_node_idx in input_quantized_idxs: - # Inputs are assumed to be quantized if the user specifid the - # input_quantized_idxs override. - # we need to dequantize the inputs since all operators took - # floating point inputs in reference quantized models - insert_dequantize_node(node, model.graph) - elif node.op == "output": - cur_output_node_idx = output_node_seen_cnt - output_node_seen_cnt += 1 - if cur_output_node_idx in output_quantized_idxs: - # Result are kept quantized if the user specified the - # output_quantized_idxs override. 
- # Remove the dequantize operator in the end - maybe_dequantize_node = node.args[0] - if isinstance(maybe_dequantize_node, Node) and \ - maybe_dequantize_node.op == "call_method" and \ - maybe_dequantize_node.target == "dequantize": - quantize_node = maybe_dequantize_node.args[0] - maybe_dequantize_node.replace_all_uses_with(quantize_node) - model.graph.erase_node(maybe_dequantize_node) - elif node.op == "call_module": - if is_activation_post_process(modules[node.target]): - replace_observer_with_quantize_dequantize_node(model.graph, node, modules) - elif is_observed_standalone_module(modules[node.target]): - # TODO: move this to a separate function - convert = torch.ao.quantization._quantize_fx_do_not_use._convert_do_not_use # type: ignore[attr-defined] - # We know that observed standalone module is a GraphModule since - # it's produced by us - observed_standalone_module : GraphModule = modules[str(node.target)] # type: ignore[assignment] - sm_input_quantized_idxs = \ - observed_standalone_module \ - ._standalone_module_input_quantized_idxs\ - .tolist() # type: ignore[operator] - # remove the dequantize nodes for inputs - args = list(node.args) - for idx in range(len(args)): - if idx in sm_input_quantized_idxs: - arg = args[idx] - if arg.op == "call_method" and arg.target == "dequantize": - quantize_node = arg.args[0] - node.replace_input_with(arg, quantize_node) - if len(arg.users) == 0: - model.graph.erase_node(arg) - # add dequantize node for output - sm_output_quantized_idxs = \ - observed_standalone_module \ - ._standalone_module_output_quantized_idxs \ - .tolist() # type: ignore[operator] - if len(sm_output_quantized_idxs) > 0: - assert sm_output_quantized_idxs[0] == 0, "Currently only quantized" - "output idxs = [0] is supported" - - # if it's non-empty, then it means the output is kept in quantized form - # we'll just add a dequantize node after this node - insert_dequantize_node(node, model.graph) - - # TODO: allow convert_custom_config_dict to override backend_config_dict - # for standalone module - quantized_standalone_module = convert( - observed_standalone_module, - is_reference=True, - backend_config_dict=backend_config_dict) - parent_name, name = _parent_name(node.target) - # update the modules dict - setattr(modules[parent_name], name, quantized_standalone_module) - modules[str(node.target)] = quantized_standalone_module - elif type(modules[node.target]) in set( - weighted_module_classes).union(QAT_MODULE_CLASSES).union(FUSED_MODULE_CLASSES): - # TODO: refactor this part to a function - original_module = modules[node.target] - qconfig = original_module.qconfig - - is_observed = node.name in observed_node_names - is_activation_quantized = activation_is_int8_quantized(qconfig) - is_weight_quantized = weight_is_statically_quantized(qconfig) - # TODO: rename weight_is_statically_quantized to weight_is_int8_quantized - if qconfig is None or \ - not is_observed or \ - not is_weight_quantized or \ - not is_activation_quantized: - continue - - float_module = original_module - fused_module = None - if isinstance( - original_module, - QAT_MODULE_CLASSES): - # case 1. 
converting qat module to - # a float module, we need to attch - # weight fake_quant to the module, - # weight fake_quant is assumed to be run during - # QAT so we don't need to run it again here - float_module = original_module.to_float() # type: ignore[operator] - # change qat conv to conv - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, float_module) - if isinstance(float_module, torch.nn.intrinsic._FusedModule): - fused_module = float_module - float_module = fused_module[0] - weight_post_process = original_module.weight_fake_quant - else: - # case 2. converting a float module/fused float module - # to float module, we need to attach - # weight observer to the conv module and run it - # with conv weight - if isinstance(original_module, torch.nn.intrinsic._FusedModule): - fused_module = original_module - float_module = fused_module[0] # type: ignore[index] - assert qconfig is not None - weight_post_process = qconfig.weight() - # run weight observer - weight_post_process(float_module.weight) # type: ignore[operator] - weight_qparams = get_qparam_dict(weight_post_process) - # TODO: may need to change the mapping when we support dynamic quantization - ref_qmodule_cls = quantized_reference_module_mapping.get(type(float_module), None) - assert ref_qmodule_cls is not None, f"No reference quantized module class configured for {type(float_module)}" - ref_qmodule = ref_qmodule_cls.from_float(float_module, weight_qparams) # type: ignore[attr-defined] - if fused_module is not None: - fused_module[0] = ref_qmodule - else: - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, ref_qmodule) - - # removes qconfig and activation_post_process modules - if _remove_qconfig_flag: - _remove_qconfig(model) - preserved_attributes = set(convert_custom_config_dict.get("preserved_attributes", [])) - model = QuantizedGraphModule(model, model.graph, preserved_attributes) - return model diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py index cc8cfa1cb01c..41fbb366934e 100644 --- a/torch/ao/quantization/fx/_lower_to_native_backend.py +++ b/torch/ao/quantization/fx/_lower_to_native_backend.py @@ -1,115 +1,854 @@ import torch +from torch.fx import map_arg, Node +from torch.fx.graph import Graph import torch.nn as nn +import torch.nn.functional as F import torch.nn.intrinsic as nni import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.quantized.dynamic as nniqd import torch.nn.quantized as nnq +import torch.nn.quantized.dynamic as nnqd import torch.nn.quantized._reference as nnqr -from torch.nn.quantized.modules.utils import ReferenceableQuantizedModule -from . 
import subgraph_rewriter_FORKED_DO_NOT_USE +from torch.nn.quantized.modules.utils import WeightedQuantizedModule from .graph_module import QuantizedGraphModule -from .quantized_fusion_patterns_and_replacements import get_fbgemm_patterns_and_replacements -from .match_utils import is_match -from .match_utils import MatchAllNode -from ..utils import _parent_name, check_node -from typing import Dict, Tuple, Type, List -from torch.fx import Node - -# Mapping from reference module class to the replacement quantized module class for lowering -LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[ReferenceableQuantizedModule]] = { +from .utils import ( + collect_producer_nodes, + get_linear_prepack_op_for_dtype, + get_new_attr_name_with_prefix, + get_qconv_prepack_op, + graph_module_from_producer_nodes, +) +from ..utils import _parent_name +from ..qconfig import QConfigAny +from ..quantization_mappings import get_quantized_operator +from .utils import create_node_from_old_node_preserve_meta +from typing import Dict, Tuple, Type, List, Callable, Any, Union, Set, Optional +import operator + +QOP_TO_ARG_NAMES_TO_SKIP = { + torch._ops.ops.quantized.hardswish: ['inplace'], + torch._ops.ops.quantized.elu: ['inplace'], + torch._ops.ops.quantized.dropout: ['inplace'], + torch._ops.ops.quantized.instance_norm: + ['running_mean', 'running_var', 'use_input_stats', 'momentum'], +} + +def _is_node_in_list(node, modules, func_list, method_list, module_type_list): + is_call_function = node.op == "call_function" and node.target in func_list + is_call_method = node.op == "call_method" and node.target in method_list + is_call_module = node.op == "call_module" and type(modules[str(node.target)]) in module_type_list + return is_call_function, is_call_method, is_call_module + +def is_fixed_qparams_node(node, modules): + func_list = [ + torch.nn.functional.hardsigmoid, + torch.nn.functional.sigmoid, + torch.sigmoid, + torch.tanh, + ] + method_list = [ + "hardsigmoid", + "hardsigmoid_", + "sigmoid", + "sigmoid_", + "tanh", + "tanh_", + ] + module_type_list = [ + torch.nn.Hardsigmoid, + torch.nn.Sigmoid, + torch.nn.Tanh, + torch.nn.Softmax, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_default_node(node, modules): + func_list = [ + torch.nn.functional.elu, + torch.nn.functional.hardswish, + torch.nn.functional.instance_norm, + torch.nn.functional.layer_norm, + torch.nn.functional.leaky_relu, + torch.nn.functional.dropout, + ] + method_list: List[Any] = [] + module_type_list = [ + nnqr.ConvTranspose1d, + nnqr.ConvTranspose2d, + torch.nn.ELU, + torch.nn.LeakyReLU, + torch.nn.Hardswish, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.Dropout, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.intrinsic.BNReLU2d, + torch.nn.intrinsic.BNReLU3d, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_copy_node(node, modules): + func_list = [ + torch.adaptive_avg_pool1d, + torch.nn.functional.adaptive_avg_pool2d, + torch.nn.functional.adaptive_avg_pool3d, + torch.nn.functional.hardtanh, + torch.nn.functional.hardtanh_, + torch.nn.functional.interpolate, + torch.nn.functional.max_pool1d, + torch.nn.functional.max_pool2d, + torch.nn.functional.max_pool3d, + torch.nn.functional.relu, + torch.nn.functional.relu6, + torch.avg_pool1d, + torch._C._nn.avg_pool2d, + torch._C._nn.avg_pool3d, + torch.clamp, + torch.flatten, + torch.mean, + operator.floordiv, + ] + method_list = [ + 
"clamp", + "mean", + "relu", + "relu_", + ] + module_type_list = [ + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + torch.nn.Hardtanh, + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.ReLU, + torch.nn.ReLU6, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_general_tensor_shape_node(node, modules): + func_list = [ + torch.transpose, + torch.repeat_interleave, + torch.squeeze, + torch.stack, + torch.unsqueeze, + ] + method_list = [ + "contiguous", + "detach", + "detach_", + "permute", + "repeat", + "repeat_interleave", + "reshape", + "resize_", + "shape", + "size", + "squeeze", + "squeeze_", + "transpose", + "unsqueeze", + "unsqueeze_", + "view", + ] + module_type_list = [ + torch.nn.Identity, + ] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_other_node(node, modules): + func_list = [ + torch.cat, + ] + method_list: List[Any] = [] + module_type_list: List[Any] = [] + return _is_node_in_list(node, modules, func_list, method_list, module_type_list) + +def is_special_pattern_node(node, modules): + res_function, res_method, res_module = False, False, False + for checker in [is_fixed_qparams_node, is_default_node, is_copy_node, is_general_tensor_shape_node, is_other_node]: + is_call_function, is_call_method, is_call_module = checker(node, modules) + res_function = res_function or is_call_function + res_method = res_method or is_call_method + res_module = res_module or is_call_module + return res_function, res_method, res_module + +def is_dequantize_node(node): + return isinstance(node, Node) and node.op == "call_method" and node.target == "dequantize" + +def is_getattr_tensor_metadata_node(node): + return node.op == "call_function" and \ + node.target == getattr and \ + node.args[1] in ["shape"] + +def should_skip_lowering(op: torch.fx.node.Node, qconfig_map: Dict[str, QConfigAny]): + """ + Return True if the op is configured with a None qconfig, False otherwise. + Note: maybe need to generalize this to also check for the dtype, and we + only lower when dtype matches, but right now fbgemm/qnnpack only support + a single dtype, so it is OK for now. 
+ """ + return op.name in qconfig_map and qconfig_map[op.name] is None + +# Mapping from reference module class to the replacement static quantized module class for lowering +STATIC_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[WeightedQuantizedModule]] = { nnqr.Linear: nnq.Linear, nnqr.Conv1d: nnq.Conv1d, nnqr.Conv2d: nnq.Conv2d, nnqr.Conv3d: nnq.Conv3d, } +# Mapping from reference module class to the replacement dynamic quantized module class for lowering +DYNAMIC_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[nn.Module]] = { + nnqr.Linear: nnqd.Linear, + nnqr.GRUCell: nnqd.GRUCell, + nnqr.LSTMCell: nnqd.LSTMCell, + nnqr.RNNCell: nnqd.RNNCell, + nnqr.LSTM: nnqd.LSTM, +} + +# Mapping from reference module class to the replacement weight only quantized module class for lowering +# TODO: correct the namespace for these modules +WEIGHT_ONLY_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[nn.Module]] = { + nnqr.Embedding: nnq.Embedding, + nnqr.EmbeddingBag: nnq.EmbeddingBag, +} + +# TODO: merge with STATIC_LOWER_MODULE_MAP after we merge +# _lower_static_weighted_ref_module and special_pattern_replacement +SPECIAL_PATTERN_LOWER_MODULE_MAP = { + nn.BatchNorm2d: nnq.BatchNorm2d, + nn.BatchNorm3d: nnq.BatchNorm3d, + nnqr.ConvTranspose1d: nnq.ConvTranspose1d, + nnqr.ConvTranspose2d: nnq.ConvTranspose2d, + nn.ELU: nnq.ELU, + nn.LeakyReLU: nnq.LeakyReLU, + nn.Hardswish: nnq.Hardswish, + nn.InstanceNorm1d: nnq.InstanceNorm1d, + nn.InstanceNorm2d: nnq.InstanceNorm2d, + nn.InstanceNorm3d: nnq.InstanceNorm3d, + nn.LayerNorm: nnq.LayerNorm, + nn.Dropout: nnq.Dropout, + nn.Softmax: nnq.Softmax, + nni.BNReLU2d: nniq.BNReLU2d, + nni.BNReLU3d: nniq.BNReLU3d, +} + # Mapping from fused module class to a 2-tuple of: # 1) The inner reference module class -# 2) The replacement quantized module class for lowering -LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[ReferenceableQuantizedModule]]] = { - nni.LinearReLU: (nnqr.Linear, nniq.LinearReLU) +# 2) The replacement static quantized module class for lowering +STATIC_LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[WeightedQuantizedModule]]] = { + nni.LinearReLU: (nnqr.Linear, nniq.LinearReLU), + nni.ConvReLU1d: (nnqr.Conv1d, nniq.ConvReLU1d), + nni.ConvReLU2d: (nnqr.Conv2d, nniq.ConvReLU2d), + nni.ConvReLU3d: (nnqr.Conv3d, nniq.ConvReLU3d), +} + +# Mapping from fused module class to a 2-tuple of: +# 1) The inner reference module class +# 2) The replacement dynamic quantized module class for lowering +DYNAMIC_LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[nn.Module]]] = { + nni.LinearReLU: (nnqr.Linear, nniqd.LinearReLU), +} + +# Mapping from a functional to lower to a 2-tuple of +# 1) The quantized version of the op +# 2) The quantized version of the op fused with relu, if it exists, else None +STATIC_LOWER_FUNCTIONAL_MAP: Dict[Callable, Tuple[Callable, Callable]] = { + F.linear: (torch.ops.quantized.linear, torch.ops.quantized.linear_relu), + F.conv1d: (torch.ops.quantized.conv1d, torch.ops.quantized.conv1d_relu), + F.conv2d: (torch.ops.quantized.conv2d, torch.ops.quantized.conv2d_relu), + F.conv3d: (torch.ops.quantized.conv3d, torch.ops.quantized.conv3d_relu), +} + +WEIGHT_PREPACK_OPS: Set[Callable] = { + torch._ops.ops.quantized.linear_prepack, + torch._ops.ops.quantized.linear_prepack_fp16, + torch._ops.ops.quantized.conv1d_prepack, + torch._ops.ops.quantized.conv2d_prepack, + torch._ops.ops.quantized.conv3d_prepack, +} + +# Mapping from a functional to a dictionary, where the key is a 
2-tuple of +# (activation_compute_dtype, weight_dtype) and the value is a 2-tuple of +# 1) The dynamically quantized version of the op +# 2) The dynamically quantized version of the op fused with relu, if it exists, else None +DYNAMIC_LOWER_FUNCTIONAL_MAP: Dict[Callable, Dict[Tuple[torch.dtype, torch.dtype], Tuple[Callable, Optional[Callable]]]] = { + F.linear: { + (torch.quint8, torch.qint8): (torch.ops.quantized.linear_dynamic, + torch.ops.quantized.linear_relu_dynamic), + (torch.float16, torch.float16): (torch.ops.quantized.linear_dynamic_fp16, + torch.ops.quantized.linear_relu_dynamic_fp16) + }, + # dynamic conv + relu is not available yet + F.conv1d: { + (torch.quint8, torch.qint8): (torch.ops.quantized.conv1d_dynamic, None), + }, + F.conv2d: { + (torch.quint8, torch.qint8): (torch.ops.quantized.conv2d_dynamic, None), + }, + F.conv3d: { + (torch.quint8, torch.qint8): (torch.ops.quantized.conv3d_dynamic, None), + }, +} + +CONV_FUNCTIONAL_OPS: Set[Callable] = { + F.conv1d, + F.conv2d, + F.conv3d, } -def _lower_weighted_ref_module(model: QuantizedGraphModule) -> QuantizedGraphModule: +QBIN_OP_MAPPING: Dict[Union[Callable, str], Callable] = { + operator.add: torch.ops.quantized.add, + torch.add: torch.ops.quantized.add, + operator.mul: torch.ops.quantized.mul, + torch.mul: torch.ops.quantized.mul, + torch.matmul: torch.ops.quantized.matmul, +} +QBIN_RELU_OP_MAPPING: Dict[Union[Callable, str], Callable] = { + operator.add: torch.ops.quantized.add_relu, + torch.add: torch.ops.quantized.add_relu, + operator.mul: torch.ops.quantized.mul_relu, + torch.mul: torch.ops.quantized.mul_relu, +} + +def fold_weight( + quantized: QuantizedGraphModule, + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: + """ + Trace back from the weight node util we hit getattr, reconstruct the + graph module with the traced nodes and run the graph module to pack the + weight. then replace the original chain of ops with the packed weight. 
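DYNAMIC_LOWER_FUNCTIONAL_MAP is a nested dict keyed first by the functional op and then by the (activation_compute_dtype, weight_dtype) pair, and the lowering pass skips combinations that are absent. Here is a toy lookup with placeholder string values standing in for the real quantized ops (the names and helper are illustrative only, not PyTorch API).

import torch

# toy stand-in for DYNAMIC_LOWER_FUNCTIONAL_MAP: op name -> dtype pair -> (q_op, q_relu_op)
lowering_map = {
    "linear": {
        (torch.quint8, torch.qint8): ("linear_dynamic", "linear_relu_dynamic"),
        (torch.float16, torch.float16): ("linear_dynamic_fp16", "linear_relu_dynamic_fp16"),
    },
}

def pick_dynamic_op(op, activation_compute_dtype, weight_dtype, has_relu):
    entry = lowering_map.get(op, {}).get((activation_compute_dtype, weight_dtype))
    if entry is None:
        return None                  # unsupported dtype combination: the pass skips the node
    q_op, q_relu_op = entry
    return q_relu_op if has_relu else q_op

print(pick_dynamic_op("linear", torch.quint8, torch.qint8, has_relu=True))    # linear_relu_dynamic
print(pick_dynamic_op("linear", torch.float16, torch.qint8, has_relu=False))  # None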
+ """ + packed_weights = dict() + # map from folded node name to the prepacked weight name + folded_nodes = dict() + # get packed weights + for node in quantized.graph.nodes: + if node.op == 'call_function' and node.target in WEIGHT_PREPACK_OPS: + nodes_to_fold = collect_producer_nodes(node) + if nodes_to_fold is not None: + for node_to_fold in nodes_to_fold: + folded_nodes[node_to_fold.name] = node + + prepacking_module = graph_module_from_producer_nodes( + quantized, nodes_to_fold) + packed_weight = prepacking_module() + packed_weights[node.name] = packed_weight + + # remove folded nodes and replace the prepacking node with getattr + folded_graph = Graph() + env: Dict[Any, Any] = {} + + def load_arg(a): + return map_arg(a, lambda node: env[node.name]) + quantized_root = quantized + quantized_graph = quantized.graph + + for node in quantized_graph.nodes: + prepack_node = folded_nodes.get(node.name, None) + if prepack_node is node: + packed_weight = packed_weights[node.name] + # add a prepacked attribute to root + op_node = list(prepack_node.users)[0] + module_path, _ = node_name_to_scope[op_node.name] + get_new_packed_weight_name = \ + get_new_attr_name_with_prefix(module_path + '_packed_weight_') + packed_weight_name = get_new_packed_weight_name(quantized_root) + setattr(quantized_root, packed_weight_name, packed_weight) + # replace prepack node with a getattr node + env[node.name] = folded_graph.create_node( + 'get_attr', packed_weight_name, (), {}) + elif prepack_node is not None: + # remove the foled node + continue + else: + # copy other nodes + env[node.name] = folded_graph.node_copy(node, load_arg) + return QuantizedGraphModule(quantized_root, folded_graph, quantized_root.preserved_attr_names) + +def _get_module(node: Node, modules: Dict[str, nn.Module]) -> Optional[nn.Module]: + """ + Return the `torch.nn.Module` that corresponds to the specified node's target. + If no such node exists, return None. + """ + if node.op == "call_module" and str(node.target) in modules: + return modules[str(node.target)] + else: + return None + +def _match_static_pattern( + node: Node, + modules: Dict[str, nn.Module], + qconfig_map: Dict[str, QConfigAny], + matching_modules_or_ops: List[Callable], + dequantize_node_arg_indices: List[int] +) -> Union[Tuple[Node, Node, Node], Tuple[None, None, None]]: + """ + Match the pattern (dequantize - ref node - quantize) against the node provided. + + If there is a match, return a 3-tuple of: + 1) q_node: the quantize node, + 2) relu_node: a relu node wrapping the ref_node, and + 3) ref_node: a reference module or functional node to replace with its quantized counterpart + Otherwise, if there is no match, return a 3-tuple of (None, None, None). + + Parameters: + node: The `torch.fx.Node` to match against. + modules: A mapping from node names to modules in the model graph, used for module lookup. + qconfig_map: A mapping from node names to the qconfigs associated with the nodes. + If the corresponding qconfig for the reference node is None, then return no match. + matching_modules_or_ops: Either a list of functions or a list of `torch.nn.Module`s. + If the reference node is not in this list, then return no match. + dequantize_node_arg_indices: A list of indices in the reference node args where dequantize + nodes may be present. An empty list means skipping the check for dequantize nodes. 
+ """ + SKIP_LOWERING_VALUE = (None, None, None) + + # Match quantize node + if node.op != "call_function" or node.target != torch.quantize_per_tensor: + return SKIP_LOWERING_VALUE + q_node = node + ref_node = q_node.args[0] + assert(isinstance(ref_node, Node)) + + # Handle cases where the node is wrapped in a ReLU + if (ref_node.op == "call_function" and ref_node.target in (F.relu, torch.relu)) or\ + (ref_node.op == "call_module" and type(_get_module(ref_node, modules)) == nn.ReLU): + relu_node = ref_node + ref_node = relu_node.args[0] + assert(isinstance(ref_node, Node)) + else: + relu_node = None + if should_skip_lowering(ref_node, qconfig_map): + return SKIP_LOWERING_VALUE + + # Match reference module or functional + if isinstance(matching_modules_or_ops[0], type) and issubclass(matching_modules_or_ops[0], nn.Module): + expected_op = "call_module" + match_key = type(_get_module(ref_node, modules)) + else: + expected_op = "call_function" + match_key = ref_node.target + if ref_node.op != expected_op or match_key not in matching_modules_or_ops: + return SKIP_LOWERING_VALUE + + # Match dequantize node(s). Both of the following conditions must pass: + # (1) All `torch.fx.Node`s at the matching indices must be a dequantize node + # (2) There must be at least one dequantize node + matched_dequantize = False + for i in dequantize_node_arg_indices: + assert i < len(ref_node.args),\ + "Dequantize index %s exceeded reference node's arg length %s" % (i, len(ref_node.args)) + arg = ref_node.args[i] + if is_dequantize_node(arg): + matched_dequantize = True + elif isinstance(arg, Node): + return SKIP_LOWERING_VALUE + if not matched_dequantize: + return SKIP_LOWERING_VALUE + + return (q_node, relu_node, ref_node) + +def _lower_static_weighted_ref_module( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): """ Traverse the graph and find dequantize - ref module - quantize patterns and replace them with the quantized version of the ref module. """ - for ref_class in list(LOWER_MODULE_MAP.keys()) + list(LOWER_FUSED_MODULE_MAP.keys()): - pattern = (torch.quantize_per_tensor, - (ref_class, "dequantize"), - MatchAllNode, MatchAllNode, MatchAllNode) - modules = dict(model.named_modules(remove_duplicate=False)) - nodes = list(model.graph.nodes) - # TODO: maybe orgnize this better (e.g. 
break down to more functions) - # to make this function more readable - for n in model.graph.nodes: - if not is_match(modules, n, pattern): - continue - q_node = n - ref_node = q_node.args[0] - dq_node = ref_node.args[0] - # get output scale/zero_point/dtype from the quantize node - scale_node = q_node.args[1] - zero_point_node = q_node.args[2] - dtype = q_node.args[3] - - # this can be removed if we add support for "get_attr" in is_match - if scale_node.op != "get_attr" or zero_point_node.op != "get_attr": - print("Find the pattern but scale_node and zero_point node are not `get_attr`," - f"got: {scale_node.format_node} {zero_point_node.format_node()}") + modules = dict(model.named_modules(remove_duplicate=False)) + nodes = list(model.graph.nodes) + for n in model.graph.nodes: + # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize) + matching_modules = list(STATIC_LOWER_MODULE_MAP.keys()) + list(STATIC_LOWER_FUSED_MODULE_MAP.keys()) + (q_node, relu_node, ref_node) = _match_static_pattern( + n, modules, qconfig_map, matching_modules, dequantize_node_arg_indices=[0]) # type: ignore[arg-type] + if q_node is None: + continue + assert(ref_node is not None) + (_, scale_node, zero_point_node, _) = q_node.args + ref_module = _get_module(ref_node, modules) + ref_class = type(ref_module) + assert(isinstance(scale_node, Node)) + assert(isinstance(zero_point_node, Node)) + assert(issubclass(ref_class, nn.Module)) + + # Step 1: Change this pattern to use the corresponding quantized module + # For fused modules, we also check whether the inner module is a reference module + # If so, we replace the entire fused module with the corresponding quantized module + if ref_class in STATIC_LOWER_FUSED_MODULE_MAP: + inner_ref_class, q_class = STATIC_LOWER_FUSED_MODULE_MAP[ref_class] + if type(ref_module[0]) != inner_ref_class: # type: ignore[index] continue + else: + q_class = STATIC_LOWER_MODULE_MAP[ref_class] + output_scale = getattr(model, scale_node.target) + output_zero_point = getattr(model, zero_point_node.target) + q_module = q_class.from_reference(ref_module, output_scale, output_zero_point) + # replace reference module with quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(modules[parent_name], module_name, q_module) + + # Step 2: Remove dq_node, q_node and its args + dq_node = ref_node.args[0] + assert(isinstance(dq_node, Node)) + dq_node.replace_all_uses_with(dq_node.args[0]) + model.graph.erase_node(dq_node) + q_node.replace_all_uses_with(ref_node) + model.graph.erase_node(q_node) + model.graph.erase_node(scale_node) + model.graph.erase_node(zero_point_node) + +def _lower_dynamic_weighted_ref_module(model: QuantizedGraphModule): + """ + Traverse the graph and find quantize_per_tensor_dynamic - dequantize - ref_module patterns + and replace them with the dynamically quantized version of the ref module. 
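The module swap in Step 1 is plain attribute surgery: split the node target into a parent path and an attribute name, then setattr the quantized replacement onto the parent. Below is a self-contained sketch of that idiom using get_submodule, with an nn.Identity standing in for the quantized module; the helper is my own illustration, not the _parent_name utility itself.

import torch.nn as nn

def swap_submodule(model: nn.Module, target: str, new_module: nn.Module):
    # split "block.linear" into parent path "block" and attribute name "linear"
    *parent_path, name = target.split(".")
    parent = model.get_submodule(".".join(parent_path)) if parent_path else model
    setattr(parent, name, new_module)

model = nn.Sequential()
model.add_module("block", nn.Sequential())
model.block.add_module("linear", nn.Linear(4, 4))

swap_submodule(model, "block.linear", nn.Identity())  # Identity stands in for the quantized module
print(model)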
+ """ + named_modules = dict(model.named_modules(remove_duplicate=False)) + for n in model.graph.nodes: + if n.op != "call_module" or \ + type(named_modules[str(n.target)]) not in \ + set(DYNAMIC_LOWER_MODULE_MAP.keys()).union( + set(DYNAMIC_LOWER_FUSED_MODULE_MAP.keys())): + continue + ref_node = n + dq_node = ref_node.args[0] + if dq_node.op != "call_method" or dq_node.target != "dequantize": + continue + # don't support lowering the pattern when the result of dequantize is used by + # multiple nodes + if len(dq_node.users) > 1: + continue + + input_dynamic_q_node = dq_node.args[0] + # don't support lowering the pattern when the result of quantize is used by + # multiple nodes + if len(input_dynamic_q_node.users) > 1: + continue - # this can be removed if we add support for constants in is_match - if dtype != torch.quint8: - print(f"Only qint8 output for quantized op is supported, got: {dtype}") + if input_dynamic_q_node.op != "call_function" or \ + input_dynamic_q_node.target != torch.quantize_per_tensor_dynamic: + continue + + activation_compute_dtype = input_dynamic_q_node.args[1] + is_fp16 = activation_compute_dtype == torch.float16 + is_int8 = activation_compute_dtype in [torch.quint8, torch.qint8] + if not is_int8 and not is_fp16: + continue + + ref_module = named_modules[str(ref_node.target)] + ref_class = type(ref_module) + if ref_class in DYNAMIC_LOWER_FUSED_MODULE_MAP: + inner_ref_class, q_class = DYNAMIC_LOWER_FUSED_MODULE_MAP[ref_class] + if type(ref_module[0]) != inner_ref_class: continue + else: + q_class = DYNAMIC_LOWER_MODULE_MAP.get(ref_class) # type: ignore[assignment] + # TODO: maybe define a WeightedDynamicallyQuantizedModule + q_module = q_class.from_reference(ref_module) # type: ignore[attr-defined] - # change this pattern to use the corresponding quantized module - ref_module = modules[ref_node.target] - output_scale = getattr(model, scale_node.target) - output_zero_point = getattr(model, zero_point_node.target) - # For fused modules, we also check whether the inner module is a reference module - # If so, we replace the entire fused module with the corresponding quantized module - if ref_class in LOWER_FUSED_MODULE_MAP: - inner_ref_class, q_class = LOWER_FUSED_MODULE_MAP[ref_class] - if type(ref_module[0]) != inner_ref_class: - continue - else: - q_class = LOWER_MODULE_MAP[type(ref_module)] - assert issubclass(q_class, ReferenceableQuantizedModule) # suppress mypy warnings - q_module = q_class.from_reference(ref_module, output_scale, output_zero_point) - - # replace reference module with quantized module - parent_name, module_name = _parent_name(ref_node.target) - setattr(modules[parent_name], module_name, q_module) - # remove dq node: - dq_node_input = dq_node.args[0] - - dq_node.replace_all_uses_with(dq_node_input) - model.graph.erase_node(dq_node) + # replace reference moduel with dynamically quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(named_modules[parent_name], module_name, q_module) - # remove q node and args: - q_node.replace_all_uses_with(ref_node) - model.graph.erase_node(q_node) - model.graph.erase_node(scale_node) - model.graph.erase_node(zero_point_node) - model.recompile() - return model + # remove q - dq node + dq_node.replace_all_uses_with(input_dynamic_q_node) + model.graph.erase_node(dq_node) + input_dynamic_q_node.replace_all_uses_with(input_dynamic_q_node.args[0]) + model.graph.erase_node(input_dynamic_q_node) -def special_pattern_replacement(model: QuantizedGraphModule) -> QuantizedGraphModule: +def 
_lower_weight_only_weighted_ref_module(model: QuantizedGraphModule): + """ + Traverse the graph and find ref_module patterns + and replace them with the weight only quantized version of the ref module. + """ + named_modules = dict(model.named_modules(remove_duplicate=False)) + for n in model.graph.nodes: + if n.op != "call_module" or \ + type(named_modules[str(n.target)]) not in \ + set(WEIGHT_ONLY_LOWER_MODULE_MAP.keys()): + continue + ref_node = n + ref_module = named_modules[str(ref_node.target)] + ref_class = type(ref_module) + q_class = WEIGHT_ONLY_LOWER_MODULE_MAP.get(ref_class) + # TODO: WeightedQuantizedModule is currently assuming static quant apis + # with output_scale, output_zero_point in from_reference, we may want to + # relax that, or rename this + # TODO: maybe define a WeightedWeightOnlyQuantizedModule + q_module = q_class.from_reference(ref_module) # type: ignore[union-attr] + + # replace reference moduel with dynamically quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(named_modules[parent_name], module_name, q_module) + +def _lower_static_weighted_ref_functional( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): + """ + Traverse the graph and replace functional reference patterns with their quantized versions. + """ modules = dict(model.named_modules(remove_duplicate=False)) nodes = list(model.graph.nodes) + for n in model.graph.nodes: + # Step 0: Find nodes that match this pattern (dequantize - functional op - quantize) + matching_ops = list(STATIC_LOWER_FUNCTIONAL_MAP.keys()) + (q_node, relu_node, func_node) = _match_static_pattern( + n, modules, qconfig_map, matching_ops, dequantize_node_arg_indices=[0, 1]) + if q_node is None: + continue + assert(func_node is not None) + (_, output_scale_node, output_zp_node, _) = q_node.args + (input_dq_node, weight_dq_node, *remaining_func_args) = func_node.args + assert(isinstance(output_zp_node, Node)) + assert(isinstance(input_dq_node, Node)) + assert(isinstance(weight_dq_node, Node)) + quantized_weight = weight_dq_node.args[0] + assert(isinstance(quantized_weight, Node)) + if quantized_weight.op != "call_function" or\ + quantized_weight.target not in (torch.quantize_per_tensor, torch.quantize_per_channel): + continue + + # Step 1: Replace quantized weights with packed weights, which will be folded later + # Use the right prepack op and prepare the corresponding args + # Linear prepack args: (quantized weights[, bias]) + # Conv prepack args: (quantized weights[, bias, stride, padding, dilation, groups]) + prepack_args = [quantized_weight] + remaining_func_args + if func_node.target == F.linear: + weight_dtype = quantized_weight.args[-1] + prepack_op = get_linear_prepack_op_for_dtype(weight_dtype) + elif func_node.target in CONV_FUNCTIONAL_OPS: + prepack_op = get_qconv_prepack_op(func_node.target) # type: ignore[arg-type] + # For conv1d, the stride, padding, and dilation args may be ints, + # in which case we need to convert them to tuples + if func_node.target == F.conv1d: + for i in [2, 3, 4]: + if len(prepack_args) > i and isinstance(prepack_args[i], int): + prepack_args[i] = (prepack_args[i],) + else: + raise ValueError("Lowering is not supported for op '%s'" % func_node.target) + with model.graph.inserting_before(output_scale_node): + packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), {}) + + # Step 2: Replace reference pattern with the corresponding quantized op + (q_func, q_relu_func) = 
STATIC_LOWER_FUNCTIONAL_MAP[func_node.target] # type: ignore[index] + func_node.target = q_relu_func if relu_node is not None else q_func + func_node.args = (input_dq_node.args[0], packed_weight, output_scale_node, output_zp_node) + q_node.replace_all_uses_with(func_node) + # Move func_node after output_zp_node in the graph + output_zp_node.append(func_node) + + # Clean up: Remove dequantize and quantize nodes, and the relu node if it exists + for dqn in [input_dq_node, weight_dq_node]: + dqn_input = dqn.args[0] + dqn.replace_all_uses_with(dqn_input) + model.graph.erase_node(dqn) + model.graph.erase_node(q_node) + if relu_node is not None: + model.graph.erase_node(relu_node) + +def _lower_dynamic_weighted_ref_functional( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): + """ + Traverse the graph and replace functional reference patterns with their dynamically + quantized versions. + Examples: + quantize_per_tensor_dynamic - dequantize - functional linear --> linear_dynamic + to(torch.float16) - dequantize - functional linear --> linear_dynamic_fp16 + """ + modules = dict(model.named_modules(remove_duplicate=False)) + nodes = list(model.graph.nodes) + # we want to search in reserved order so that we can match the larger patterns first + # e.g. we want to match linear - relu before linear. + for n in reversed(model.graph.nodes): + + # Step 0: Find nodes that match this pattern + # (quantize_per_tensor_dynamic - dequantize - dynamically quantized op) + # We search for the pattern backwards, starting with the quantize node + # Quantize node args: (func, scale, zp, dtype) + func_node = n + # Handle cases where the functional op is wrapped in a ReLU + if func_node.op == "call_function" and func_node.target == F.relu or \ + func_node.op == "call_module" and \ + type(modules[str(func_node.target)]) == torch.nn.ReLU: + relu_node = func_node + func_node = relu_node.args[0] + else: + relu_node = None + if should_skip_lowering(func_node, qconfig_map): + continue + # Linear args: (dequantized inputs, dequantized weights[, bias]) + # Conv args: (dequantized inputs, dequantized weights[, bias, stride, padding, dilation, groups]) + if func_node.op != "call_function" or func_node.target not in DYNAMIC_LOWER_FUNCTIONAL_MAP: + continue + (input_dq_node, weight_dq_node, *remaining_func_args) = func_node.args + if input_dq_node.op != "call_method" or input_dq_node.target != "dequantize" or \ + weight_dq_node.op != "call_method" or weight_dq_node.target != "dequantize": + continue + + input_dynamic_q_node = input_dq_node.args[0] + # don't support lowering the pattern when the result of quantize is used by + # multiple nodes + if len(input_dynamic_q_node.users) > 1: + continue + + if input_dynamic_q_node.op != "call_function" or \ + input_dynamic_q_node.target != torch.quantize_per_tensor_dynamic: + continue + + reduce_range_node = None + (pattern_input, activation_compute_dtype, reduce_range_node) = input_dynamic_q_node.args + is_fp16 = activation_compute_dtype == torch.float16 + is_int8 = activation_compute_dtype in [torch.quint8, torch.qint8] + if not is_int8 and not is_fp16: + continue + + quantized_weight = weight_dq_node.args[0] + weight_dtype = quantized_weight.args[-1] + + # Step 1: Try to select reference pattern with the corresponding quantized op + dynamic_quant_dtype_key = (activation_compute_dtype, weight_dtype) + if dynamic_quant_dtype_key not in DYNAMIC_LOWER_FUNCTIONAL_MAP[func_node.target]: + print(f"Didn't find dtype combination {dynamic_quant_dtype_key} during " + 
f"dynamic quantized op lowering for {func_node.target}") + continue + (q_func, q_relu_func) = DYNAMIC_LOWER_FUNCTIONAL_MAP[func_node.target][dynamic_quant_dtype_key] + + if q_func is None or q_relu_func is None: + print("Didn't find corresponding quantized function or quantized relu function " + f"for {func_node.target}, {dynamic_quant_dtype_key}") + continue + + # Step 2: Replace quantized weights with packed weights, which will be folded later + # Use the right prepack op and prepare the corresponding args + # Linear prepack args: (quantized weights[, bias]) + # Conv prepack args: (quantized weights[, bias, stride, padding, dilation, groups]) + prepack_args = [quantized_weight] + remaining_func_args + if func_node.target == F.linear: + prepack_op = get_linear_prepack_op_for_dtype(weight_dtype) + elif func_node.target in CONV_FUNCTIONAL_OPS: + prepack_op = get_qconv_prepack_op(func_node.target) + # For conv1d, the stride, padding, and dilation args may be ints, + # in which case we need to convert them to tuples + if func_node.target == F.conv1d: + for i in [2, 3, 4]: + if len(prepack_args) > i and isinstance(prepack_args[i], int): + prepack_args[i] = (prepack_args[i],) + else: + raise ValueError("Lowering is not supported for op '%s'" % func_node.target) + with model.graph.inserting_before(func_node): + packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), {}) + + # Step 3: Replace reference pattern with the corresponding quantized op + func_node.target = q_relu_func if relu_node is not None else q_func + if is_int8: + func_node.args = (pattern_input, packed_weight, reduce_range_node) + else: + func_node.args = (pattern_input, packed_weight) + + if relu_node is not None: + relu_node.replace_all_uses_with(func_node) + + # Step 4: Remove dequantize and quantize nodes, and the relu node if it exists + for dqn in [input_dq_node, weight_dq_node]: + dqn_input = dqn.args[0] + dqn.replace_all_uses_with(dqn_input) + model.graph.erase_node(dqn) + model.graph.erase_node(input_dynamic_q_node) + if relu_node is not None: + model.graph.erase_node(relu_node) + +def _lower_quantized_binary_op( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny]): + binary_ops_to_lower: List[Callable] = [operator.add, torch.add, operator.mul, torch.mul, torch.matmul] + modules = dict(model.named_modules(remove_duplicate=False)) + for n in model.graph.nodes: + # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize) + (q_node, relu_node, bop_node) = _match_static_pattern( + n, modules, qconfig_map, binary_ops_to_lower, dequantize_node_arg_indices=[0, 1]) + if q_node is None: + continue + assert(bop_node is not None) + (_, scale_node, zero_point_node, _) = q_node.args + + # Step 1: Remove dequant nodes + num_dq_nodes = 0 + for arg in bop_node.args: + if not is_dequantize_node(arg): + continue + dq_node = arg + assert(isinstance(dq_node, Node)) + dn_input = dq_node.args[0] + dq_node.replace_all_uses_with(dn_input) + model.graph.erase_node(dq_node) + num_dq_nodes += 1 + assert(num_dq_nodes > 0) + + # Step 2: Swap binary op to quantized binary op + assert bop_node.target in QBIN_OP_MAPPING + binop_to_qbinop = QBIN_OP_MAPPING if relu_node is None else QBIN_RELU_OP_MAPPING + qbin_op = binop_to_qbinop[bop_node.target] + # prepare the args for quantized bianry op + # (x, y) + qop_node_args = list(bop_node.args) + # (x, y, scale, zero_point) + # add scale and zero_point arguments for Tensor - Tensor operation + if num_dq_nodes == 2: + 
qop_node_args.extend([scale_node, zero_point_node]) + # insert a call to quantized binary op and remove the original binary op + with model.graph.inserting_after(q_node): + qop_node = create_node_from_old_node_preserve_meta( + model.graph, + ("call_function", qbin_op, tuple(qop_node_args), {}), + bop_node) + q_node.replace_all_uses_with(qop_node) + + # Step 3: Remove quantize node, binary op node, and relu node if any + model.graph.erase_node(q_node) + if relu_node is not None: + model.graph.erase_node(relu_node) + model.graph.erase_node(bop_node) + +def special_pattern_replacement(model: QuantizedGraphModule): + modules = dict(model.named_modules(remove_duplicate=False)) for n in model.graph.nodes: q_node = n - if not (q_node.target == torch.quantize_per_tensor or - (q_node.op == "call_method" and q_node.target == "to" and q_node.args[1] == torch.float16)): + is_quantize = q_node.target == torch.quantize_per_tensor + is_to_fp16 = q_node.op == "call_method" and q_node.target == "to" and \ + len(q_node.args) == 2 and q_node.args[1] == torch.float16 + if not (is_quantize or is_to_fp16): continue ref_node = q_node.args[0] # get output scale/zero_point/dtype from the quantize node # ref_node, scale_node, zero_point_node, dtype = q_node.args # TODO: add safety checks that users for the ref_node and dq_node needs to be one + is_call_function, is_call_method, is_call_module = is_fixed_qparams_node(ref_node, modules) + if is_to_fp16 and (is_call_function or is_call_method or is_call_module): + # TODO: add a warning or error out here? (bc-breaking if error out) + # warnings.warn( + # "Only reference patterns are currently supported for {dtype} dtype with {op} op" + # "".format(dtype=dtypes, op=ref_node)) + continue + + is_call_function, is_call_method, is_call_module = is_default_node(ref_node, modules) + if is_to_fp16 and (is_call_function or is_call_method or is_call_module): + # TODO: add a warning or error out here? 
(bc-breaking if error out) + continue - is_call_function, is_call_method, is_call_module = check_node(ref_node, modules) + # This check includes all supported ops + is_call_function, is_call_method, is_call_module = is_special_pattern_node(ref_node, modules) if not (is_call_module or is_call_function or is_call_method): continue dq_node_or_nodes = ref_node.args[0] @@ -127,12 +866,19 @@ def special_pattern_replacement(model: QuantizedGraphModule) -> QuantizedGraphMo continue # TODO: enable we have patterns that needs to swap the modules - # if is_call_module: - # ref_module = modules[ref_node.target] - # # change this pattern to use the corresponding quantized module - # # replace reference module with quantized module - # parent_name, module_name = _parent_name(ref_node.target) - # setattr(modules[parent_name], module_name, ref_module) + if is_call_module: + ref_module = modules[ref_node.target] + if type(ref_module) in SPECIAL_PATTERN_LOWER_MODULE_MAP and is_quantize: + qmodule_cls = SPECIAL_PATTERN_LOWER_MODULE_MAP.get(type(ref_module)) + scale_node = q_node.args[1] + zero_point_node = q_node.args[2] + output_scale = getattr(model, scale_node.target) + output_zero_point = getattr(model, zero_point_node.target) + + qmodule = qmodule_cls.from_reference(ref_module, output_scale, output_zero_point) # type:ignore[union-attr] + # replace reference module with quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(modules[parent_name], module_name, qmodule) # remove dq node: dq_nodes: List[Node] = [] @@ -147,30 +893,75 @@ def special_pattern_replacement(model: QuantizedGraphModule) -> QuantizedGraphMo model.graph.erase_node(dq_node) # store q node args - q_node_args = list(q_node.args)[1:] - + qnode_qparams = list(q_node.args)[1:] # replace uses of q node with input and remove q node q_node_input = q_node.args[0] q_node.replace_all_uses_with(q_node_input) model.graph.erase_node(q_node) - # remove q node args - for n in q_node_args: - if isinstance(n, Node): - model.graph.erase_node(n) - + is_call_function, is_call_method, is_call_module = is_default_node(ref_node, modules) + if is_call_function: + # pass scale/zer_point arguments from quantize_per_tensor to the default node operator + # insert an op after the zero_point node so that the scale/zero_point + # nodes are is available + qop = get_quantized_operator(ref_node.target) + args = list(ref_node.args) + kwargs = dict(ref_node.kwargs) + if qop in QOP_TO_ARG_NAMES_TO_SKIP: + args_to_skip = QOP_TO_ARG_NAMES_TO_SKIP[qop] + for arg in args_to_skip: + if arg in kwargs: + kwargs.pop(arg) + kwargs["output_scale"] = qnode_qparams[0] + kwargs["output_zero_point"] = qnode_qparams[1] + with model.graph.inserting_after(qnode_qparams[1]): + qop_node = create_node_from_old_node_preserve_meta( + model.graph, + ("call_function", qop, tuple(args), kwargs), + ref_node) + ref_node.replace_all_uses_with(qop_node) + model.graph.erase_node(ref_node) + else: + # remove scale/zero_point node for quantize node + for n in qnode_qparams: + if isinstance(n, Node): + model.graph.erase_node(n) - model.recompile() return model -def _lower_to_native_backend(model: QuantizedGraphModule) -> QuantizedGraphModule: +def _lower_getattr_tensor_metadta_op(model: QuantizedGraphModule): + """ Modified the graph of the model inplace, to skip extra dequantize op before + the general tensor shape ops when possible + """ + for n in model.graph.nodes: + if is_getattr_tensor_metadata_node(n): + maybe_dq = n.args[0] + if maybe_dq.op != "call_method" or 
maybe_dq.target != "dequantize": + continue + # skip the dequantize node + args = list(n.args) + args[0] = n.args[0].args[0] + n.args = tuple(args) + +def _lower_to_native_backend( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny], + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: """ Lower a quantized reference model (with reference quantized operator patterns) to the native backend in PyTorch (fbgemm/qnnpack), both backends shares the same operator signature so they can be lowered with the same function """ - model = _lower_weighted_ref_module(model) - for pattern, replacement in get_fbgemm_patterns_and_replacements(): - subgraph_rewriter_FORKED_DO_NOT_USE.replace_pattern(model, pattern, replacement) + _lower_static_weighted_ref_module(model, qconfig_map) + _lower_dynamic_weighted_ref_module(model) + _lower_weight_only_weighted_ref_module(model) + _lower_static_weighted_ref_functional(model, qconfig_map) + _lower_dynamic_weighted_ref_functional(model, qconfig_map) + _lower_quantized_binary_op(model, qconfig_map) + _lower_getattr_tensor_metadta_op(model) special_pattern_replacement(model) + model = fold_weight(model, node_name_to_scope) + model.graph.eliminate_dead_code() + model.recompile() model.graph.lint() return model diff --git a/torch/ao/quantization/fx/backend_config/__init__.py b/torch/ao/quantization/fx/backend_config/__init__.py deleted file mode 100644 index b595b660344e..000000000000 --- a/torch/ao/quantization/fx/backend_config/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .tensorrt import get_tensorrt_backend_config_dict - -# TODO: add more validations -def validate_backend_config_dict(backend_config_dict): - return "configs" in backend_config_dict diff --git a/torch/ao/quantization/fx/backend_config/fuse_handler.py b/torch/ao/quantization/fx/backend_config/fuse_handler.py deleted file mode 100644 index f98a40fa51e6..000000000000 --- a/torch/ao/quantization/fx/backend_config/fuse_handler.py +++ /dev/null @@ -1,5 +0,0 @@ -from ..fusion_patterns import DefaultFuseHandler - -# TODO: move DefaultFuseHandler -def get_fuse_handler_cls(): - return DefaultFuseHandler diff --git a/torch/ao/quantization/fx/backend_config/quantize_handler.py b/torch/ao/quantization/fx/backend_config/quantize_handler.py deleted file mode 100644 index fe932e31bd21..000000000000 --- a/torch/ao/quantization/fx/backend_config/quantize_handler.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch -from typing import Dict -from torch.fx.graph import Node -from .observation_type import ObservationType -from ..quantization_patterns import QuantizeHandler - -def get_quantize_handler_cls(observation_type, dtype_configs): - - class ConfigurableQuantizeHandler(QuantizeHandler): - def __init__(self, node: Node, modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.observation_type = observation_type - self.dtype_configs = dtype_configs - - def is_general_tensor_value_op(self) -> bool: - return observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT - - return ConfigurableQuantizeHandler diff --git a/torch/ao/quantization/fx/backend_config/utils.py b/torch/ao/quantization/fx/backend_config/utils.py deleted file mode 100644 index 7affd58476ee..000000000000 --- a/torch/ao/quantization/fx/backend_config/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import torch -import torch.nn as nn -from .quantize_handler import get_quantize_handler_cls -from .fuse_handler import get_fuse_handler_cls -from typing import Dict, Any, List, Callable, Union -from 
..quantization_types import Pattern, QuantizerCls - -def get_pattern_to_quantize_handlers( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, QuantizerCls]: - """ - Note: Quantize handler is just a holder for some check methods like - (should_insert_observer_for_output), maybe this can be a enum as well, - we can refactor this after we convert the path for fbgemm/qnnpack fully to the - new path, this is not exposed to backend developers - """ - pattern_to_quantize_handlers = dict() - for config in backend_config_dict.get("configs", []): - pattern = config["pattern"] - observation_type = config["observation_type"] - dtype_configs = config["dtype_configs"] - pattern_to_quantize_handlers[pattern] = \ - get_quantize_handler_cls(observation_type, dtype_configs) - - return pattern_to_quantize_handlers - -def get_pattern_to_dtype_configs( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, List[Dict[str, torch.dtype]]]: - pattern_to_dtype_configs: Dict[Pattern, List[Dict[str, torch.dtype]]] = dict() - for config in backend_config_dict.get("configs", []): - pattern = config["pattern"] - dtype_configs = config["dtype_configs"] - pattern_to_dtype_configs[pattern] = dtype_configs - return pattern_to_dtype_configs - -def get_pattern_to_input_type_to_index( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Dict[str, int]]: - pattern_to_input_type_to_index: Dict[Pattern, Dict[str, int]] = dict() - for config in backend_config_dict.get("configs", []): - pattern = config["pattern"] - input_type_to_index = config.get("input_type_to_index", {}) - pattern_to_input_type_to_index[pattern] = input_type_to_index - return pattern_to_input_type_to_index - -def get_quantized_reference_module_mapping( - backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: - mapping: Dict[Callable, Callable] = dict() - for config in backend_config_dict.get("configs", []): - if "root_module" in config and "reference_quantized_module_for_root" in config: - mapping[config["root_module"]] = config["reference_quantized_module_for_root"] - return mapping - -def get_fusion_pattern_to_fuse_handler_cls( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: - fusion_pattern_to_fuse_handlers = dict() - for config in backend_config_dict.get("configs", []): - if "fuser_method" in config: - pattern = config["pattern"] - fusion_pattern_to_fuse_handlers[pattern] = \ - get_fuse_handler_cls() - - return fusion_pattern_to_fuse_handlers - -def get_fuser_method_mapping( - backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Union[nn.Sequential, Callable]]: - fuser_method_mapping : Dict[Pattern, Union[nn.Sequential, Callable]] = dict() - for config in backend_config_dict.get("configs", []): - if "fuser_method" in config: - pattern = config["pattern"] - fuser_method = config["fuser_method"] - fuser_method_mapping[pattern] = fuser_method - - return fuser_method_mapping - -def get_module_to_qat_module( - backend_config_dict: Dict[str, Any]) -> Dict[Callable, Callable]: - module_to_qat_module: Dict[Callable, Callable] = dict() - for config in backend_config_dict.get("configs", []): - if "pattern" in config and "qat_module" in config: - pattern = config["pattern"] - qat_module = config["qat_module"] - module_to_qat_module[pattern] = qat_module - - return module_to_qat_module diff --git a/torch/ao/quantization/fx/backend_config_utils.py b/torch/ao/quantization/fx/backend_config_utils.py new file mode 100644 index 000000000000..68a4823823e5 --- /dev/null +++ b/torch/ao/quantization/fx/backend_config_utils.py @@ -0,0 
+1,141 @@ +import torch +from torch.ao.quantization.fx.pattern_utils import get_default_quant_patterns, sorted_patterns_dict +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from torch.ao.quantization.backend_config.observation_type import ObservationType +from torch.ao.quantization.quantization_types import ( + Pattern, + NodePattern, + QuantizerCls, +) +from torch.ao.quantization.utils import ( + activation_dtype, + get_combined_dict, +) + +from .quantization_patterns import QuantizeHandler +from .fusion_patterns import DefaultFuseHandler + +from typing import Dict, Any, Callable, Optional + +def get_quantize_handler_cls( + observation_type, + dtype_configs, + num_tensor_args_to_observation_type, + overwrite_output_fake_quantizer, + overwrite_output_observer, + input_output_observed): + + class ConfigurableQuantizeHandler(QuantizeHandler): + def __init__( + self, + node_pattern: NodePattern, + modules: Dict[str, torch.nn.Module], + root_node_getter: Callable = None): + super().__init__(node_pattern, modules, root_node_getter) + if num_tensor_args_to_observation_type: + assert self.num_tensor_args in num_tensor_args_to_observation_type, \ + f"Must provide observation_type config for tensor number {self.num_tensor_args}" \ + f" in num_tensor_args_to_observation_type for {node_pattern}" + self.observation_type = num_tensor_args_to_observation_type[self.num_tensor_args] + else: + self.observation_type = observation_type + self.dtype_configs = dtype_configs + self.overwrite_output_fake_quantizer = overwrite_output_fake_quantizer + self.overwrite_output_observer = overwrite_output_observer + self.input_output_observed_ = input_output_observed + + def is_general_tensor_value_op(self) -> bool: + return self.observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT + + # TODO: change this to output activation + def get_activation_ctr( + self, + qconfig: Any, + pattern: Pattern, + is_training: bool, + ) -> Optional[Callable]: + """ + Returns the constructor for the activation observer which should be + used for the pattern matched to this handler. Some handlers override + this to a different value than what is specified in the qconfig. 
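# Illustrative sketch of the override rule the docstring above describes and
# that the body just below implements: prefer a backend-mandated fake-quant /
# observer constructor for quint8 activations, otherwise fall back to
# qconfig.activation.  The FakeQConfig namedtuple is a simplified stand-in,
# not the real torch.ao.quantization.QConfig.
from collections import namedtuple
import torch

FakeQConfig = namedtuple("FakeQConfig", ["activation", "weight"])

def pick_activation_ctr(qconfig, act_dtype, is_training,
                        overwrite_fake_quant=None, overwrite_observer=None):
    if is_training:
        if act_dtype == torch.quint8 and overwrite_fake_quant is not None:
            return overwrite_fake_quant
    else:
        if act_dtype == torch.quint8 and overwrite_observer is not None:
            return overwrite_observer
    return qconfig.activation

qconfig = FakeQConfig(activation=lambda: "default_observer", weight=lambda: "weight_observer")
ctr = pick_activation_ctr(qconfig, torch.quint8, is_training=False,
                          overwrite_observer=lambda: "fixed_qparams_observer")
assert ctr() == "fixed_qparams_observer"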
+ """ + act_dtype = activation_dtype(qconfig) + # TODO: change to is_qat + if is_training: + if act_dtype == torch.quint8 and self.overwrite_output_fake_quantizer is not None: + return self.overwrite_output_fake_quantizer + else: + if act_dtype == torch.quint8 and self.overwrite_output_observer is not None: + return self.overwrite_output_observer + return qconfig.activation + + # This is temporary, and will be removed soon + def input_output_observed(self): + return self.input_output_observed_ + + + return ConfigurableQuantizeHandler + +def get_pattern_to_quantize_handlers( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, QuantizerCls]: + """ + Note: Quantize handler is just a holder for some check methods like + (should_insert_observer_for_output), maybe this can be a enum as well, + we can refactor this after we convert the path for fbgemm/qnnpack fully to the + new path, this is not exposed to backend developers + """ + pattern_to_quantize_handlers = dict() + for config in backend_config_dict.get("configs", []): + pattern = config["pattern"] + observation_type = config.get("observation_type", None) + dtype_configs = config["dtype_configs"] + num_tensor_args_to_observation_type = config.get("num_tensor_args_to_observation_type", {}) + overwrite_fake_quantizer = config.get("_overwrite_output_fake_quantizer", None) + overwrite_observer = config.get("_overwrite_output_observer", None) + input_output_observed = config.get("_input_output_observed", True) + pattern_to_quantize_handlers[pattern] = \ + get_quantize_handler_cls( + observation_type, + dtype_configs, + num_tensor_args_to_observation_type, + overwrite_fake_quantizer, + overwrite_observer, + input_output_observed) + + return pattern_to_quantize_handlers + +def get_fusion_pattern_to_fuse_handler_cls( + backend_config_dict: Dict[str, Any]) -> Dict[Pattern, Callable]: + fusion_pattern_to_fuse_handlers: Dict[Pattern, Callable] = dict() + for config in backend_config_dict.get("configs", []): + if "fuser_method" in config: + pattern = config["pattern"] + fusion_pattern_to_fuse_handlers[pattern] = DefaultFuseHandler + + return fusion_pattern_to_fuse_handlers + +# TODO: remove when all uses are changed to backend_config_dict +def get_native_quant_patterns(additional_quant_patterns: Dict[Pattern, QuantizerCls] = None) -> Dict[Pattern, QuantizerCls]: + """ + Return a map from pattern to quantize handlers based on the default patterns and the native backend_config_dict. + The returned map is sorted such that longer patterns will be encountered first when iterating through it. 
+ """ + patterns = get_default_quant_patterns() + if additional_quant_patterns is not None: + patterns = get_combined_dict(patterns, additional_quant_patterns) + # TODO: currently we just extend the quantize handlers generated from + # `get_native_backend_config_dict` + # in the future we can just assign backend_config_dict when everything is defined + for pattern, quantize_handler in get_pattern_to_quantize_handlers(get_native_backend_config_dict()).items(): + patterns[pattern] = quantize_handler + return sorted_patterns_dict(patterns) + +get_fusion_pattern_to_fuse_handler_cls.__module__ = "torch.ao.quantization.fx.backend_config_utils" +get_native_quant_patterns.__module__ = "torch.ao.quantization.fx.backend_config_utils" +get_pattern_to_quantize_handlers.__module__ = "torch.ao.quantization.fx.backend_config_utils" + +__all__ = [ + "get_fusion_pattern_to_fuse_handler_cls", + "get_native_quant_patterns", + "get_pattern_to_quantize_handlers", +] diff --git a/torch/ao/quantization/fx/common_quantization_patterns.py b/torch/ao/quantization/fx/common_quantization_patterns.py index a6e687cc6e91..a863c18a383e 100644 --- a/torch/ao/quantization/fx/common_quantization_patterns.py +++ b/torch/ao/quantization/fx/common_quantization_patterns.py @@ -1,73 +1,8 @@ -import torch -from torch.fx.graph import ( - Node, - Graph, -) - -from ..utils import ( - get_qconfig_dtypes, - activation_dtype, -) - -from .utils import ( - quantize_node, -) - from .quantization_patterns import ( QuantizeHandler, ) - -from ..qconfig import QConfigAny - -from typing import Any, Callable, Dict, Tuple - +# TODO: remove class CommonQuantizeHandler(QuantizeHandler): """ Common quantized op, first input and first output will be quantized """ - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - if node.op == "call_function" or node.op == "call_method": - self.op = node.target - elif node.op == "call_module": - self.op = type(modules[str(node.target)]) - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not self.all_node_args_are_tensors: - return NotImplemented - assert node.op in ['call_module', 'call_function'], 'Only call_module and ' + \ - 'call_function are handled in DefaultNode' - assert is_reference - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - - dtypes = get_qconfig_dtypes(qconfig) - # We can produce reference for a dtypes including - # (torch.quint8, torch.qint8, torch.qint32, torch.float16) - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = load_arg(quantized=torch.float)(node.args) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) + pass diff --git 
a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py index b27b68bed8b3..04d7a76fdbf7 100644 --- a/torch/ao/quantization/fx/convert.py +++ b/torch/ao/quantization/fx/convert.py @@ -1,29 +1,25 @@ -from typing import Any, Dict, Tuple, List, Callable, Optional, Union, Set -from collections import defaultdict -import copy +from typing import Any, Dict, List, Optional, Set, Callable, Tuple import torch +import copy +import warnings from torch.fx import ( GraphModule, - Proxy, - map_arg ) from torch.fx.graph import ( Graph, Node, + Argument, ) -from torch.fx.node import Argument -from .quantization_types import Pattern -from ..qconfig import QConfigAny, qconfig_equals -from .match_utils import ( - find_matches, -) -from .graph_module import ( - is_observed_module, - is_observed_standalone_module, - QuantizedGraphModule, +from ..utils import ( + activation_is_statically_quantized, + weight_is_quantized, + get_qparam_dict, + _parent_name, + get_swapped_custom_module_class, ) -from .quantization_patterns import ( - QuantizeHandler, +from ..qconfig import ( + QConfigAny, + qconfig_equals ) from ..qconfig_dict_utils import ( convert_dict_to_ordered_dict, @@ -33,16 +29,27 @@ generate_qconfig_map, compare_prepare_convert_qconfig_dict, update_qconfig_for_fusion, + is_qconfig_supported_by_dtype_configs, +) +from torch.ao.quantization.backend_config.utils import ( + get_root_module_to_quantized_reference_module, + get_pattern_to_dtype_configs, + get_fused_module_classes, + get_qat_module_classes, +) +from torch.ao.quantization.backend_config import get_native_backend_config_dict +from .graph_module import ( + QuantizedGraphModule, + is_observed_module, + is_observed_standalone_module, ) from ._equalize import update_obs_for_equalization, convert_eq_obs from .utils import ( - is_get_tensor_info_node, - node_return_type_is_int, - quantize_node, - get_new_attr_name_with_prefix, + get_custom_module_class_keys, + get_quantize_node_info, + create_getattr_from_value, collect_producer_nodes, graph_module_from_producer_nodes, - get_custom_module_class_keys, WEIGHT_INDEX_DICT, ) @@ -50,114 +57,65 @@ _remove_qconfig, is_activation_post_process, ) -from ..utils import ( - activation_is_statically_quantized, - activation_dtype, -) - from .lower_to_fbgemm import lower_to_fbgemm -from ..quantization_mappings import ( - DEFAULT_QAT_MODULE_MAPPINGS, -) -# weight prepacking ops -WEIGHT_PREPACK_OPS = { - torch._ops.ops.quantized.linear_prepack, - torch._ops.ops.quantized.linear_prepack_fp16, - torch._ops.ops.quantized.conv1d_prepack, - torch._ops.ops.quantized.conv2d_prepack, - torch._ops.ops.quantized.conv3d_prepack, -} +def restore_state( + observed: torch.nn.Module +) -> Tuple[Dict[str, Tuple[str, type]], + Dict[str, Any], + Set[str]]: + assert is_observed_module(observed), \ + 'incoming model must be produced by prepare_fx' + prepare_custom_config_dict: Dict[str, Any] = \ + observed._prepare_custom_config_dict # type: ignore[assignment] + node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope # type: ignore[assignment] + observed_node_names: Set[str] = observed._observed_node_names # type: ignore[assignment] + return node_name_to_scope, prepare_custom_config_dict, observed_node_names + +def has_none_qconfig(node: Argument, qconfig_map: Dict[str, QConfigAny]) -> bool: + """ Check if a node has a qconfig of None, i.e. 
user requested to not quantize + the node + """ + return isinstance(node, Node) and node.name in qconfig_map and qconfig_map[node.name] is None def run_weight_observers(observed: GraphModule) -> None: - r''' Extract the subgraph that produces the weight for dynamic quant + """ Extract the subgraph that produces the weight for dynamic quant or weight only quant node and run the subgraph to observe the weight. Note that the observers of dynamic quant or weight only quant ops are run during the convert step. - ''' - for node in observed.graph.nodes: - if node.op == 'call_function' and node.target in WEIGHT_INDEX_DICT: - for i, node_arg in enumerate(node.args): - if i in WEIGHT_INDEX_DICT[node.target]: - # node_arg is weight - weight_observer_nodes = collect_producer_nodes(node_arg) - if weight_observer_nodes is not None: - weight_observer_module = \ - graph_module_from_producer_nodes( - observed, weight_observer_nodes) - # run the weight observer - weight_observer_module() - -def fold_weight( - quantized: QuantizedGraphModule, - node_name_to_scope: Dict[str, Tuple[str, type]]) -> QuantizedGraphModule: """ - Trace back from the weight node util we hit getattr, reconstruct the - graph module with the traced nodes and run the graph module to pack the - weight. then replace the original chain of ops with the packed weight. - """ - packed_weights = dict() - # map from folded node name to the prepacked weight name - folded_nodes = dict() - # get packed weights - for node in quantized.graph.nodes: - if node.op == 'call_function' and node.target in WEIGHT_PREPACK_OPS: - nodes_to_fold = collect_producer_nodes(node) - if nodes_to_fold is not None: - for node_to_fold in nodes_to_fold: - folded_nodes[node_to_fold.name] = node - - prepacking_module = graph_module_from_producer_nodes( - quantized, nodes_to_fold) - packed_weight = prepacking_module() - packed_weights[node.name] = packed_weight - - # remove folded nodes and replace the prepacking node with getattr - folded_graph = Graph() - env: Dict[Any, Any] = {} - - def load_arg(a): - return map_arg(a, lambda node: env[node.name]) - quantized_root = quantized - quantized_graph = quantized.graph - - for node in quantized_graph.nodes: - prepack_node = folded_nodes.get(node.name, None) - if prepack_node is node: - packed_weight = packed_weights[node.name] - # add a prepacked attribute to root - op_node = list(prepack_node.users)[0] - module_path, _ = node_name_to_scope[op_node.name] - get_new_packed_weight_name = \ - get_new_attr_name_with_prefix(module_path + '_packed_weight_') - packed_weight_name = get_new_packed_weight_name(quantized_root) - setattr(quantized_root, packed_weight_name, packed_weight) - # replace prepack node with a getattr node - env[node.name] = folded_graph.create_node( - 'get_attr', packed_weight_name, (), {}) - elif prepack_node is not None: - # remove the foled node + for node in observed.graph.nodes: + if node.op != 'call_function' or node.target not in WEIGHT_INDEX_DICT: continue - else: - # copy other nodes - env[node.name] = folded_graph.node_copy(node, load_arg) - quantized = QuantizedGraphModule(quantized_root, folded_graph, quantized_root.preserved_attr_names) - return quantized - -def remove_quant_dequant_pairs(quantized: QuantizedGraphModule) -> QuantizedGraphModule: + for i, node_arg in enumerate(node.args): + if i not in WEIGHT_INDEX_DICT[node.target]: + continue + # node_arg is weight + weight_observer_nodes = collect_producer_nodes(node_arg) + if weight_observer_nodes is None: + continue + weight_observer_module = \ + 
graph_module_from_producer_nodes( + observed, weight_observer_nodes) + # run the weight observer + weight_observer_module() + +# this method is temporary will be removed soon +def duplicate_quantize_dynamic_node(quantized: QuantizedGraphModule) -> QuantizedGraphModule: quantized_root = quantized for node in quantized.graph.nodes: - if node.op == "call_function" and node.target in [torch.quantize_per_tensor, torch.quantize_per_channel]: + if (node.op == "call_function" and node.target == torch.quantize_per_tensor_dynamic): users = list(node.users) - user = users[0] if users else None - if len(users) == 1 and user.op == "call_method" and user.target == "dequantize": - user.replace_all_uses_with(node.args[0]) - quantized.graph.erase_node(user) - orig_args = list(node.args) + if len(users) > 1: + for user in users: + with quantized.graph.inserting_before(node): + new_node = quantized.graph.create_node( + "call_function", + torch.quantize_per_tensor_dynamic, + node.args, + node.kwargs) + user.replace_input_with(node, new_node) quantized.graph.erase_node(node) - for arg in orig_args: - if isinstance(arg, Node) and len(list(arg.users)) == 0: - quantized.graph.erase_node(arg) quantized = QuantizedGraphModule(quantized_root, quantized.graph, quantized_root.preserved_attr_names) return quantized @@ -204,28 +162,371 @@ def remove_extra_dequantize(quantized: QuantizedGraphModule) -> QuantizedGraphMo quantized = QuantizedGraphModule(quantized_root, quantized.graph, quantized_root.preserved_attr_names) return quantized +def remove_quant_dequant_pairs(quantized: QuantizedGraphModule) -> QuantizedGraphModule: + quantized_root = quantized + for node in quantized.graph.nodes: + if node.op == "call_function" and node.target in [torch.quantize_per_tensor, torch.quantize_per_channel]: + users = list(node.users) + user = users[0] if users else None + if len(users) == 1 and user.op == "call_method" and user.target == "dequantize": + user.replace_all_uses_with(node.args[0]) + quantized.graph.erase_node(user) + orig_args = list(node.args) + quantized.graph.erase_node(node) + for arg in orig_args: + if isinstance(arg, Node) and len(list(arg.users)) == 0: + quantized.graph.erase_node(arg) -def restore_state( - observed: torch.nn.Module -) -> Tuple[Dict[Pattern, QuantizeHandler], - Dict[str, Tuple[str, type]], - Dict[str, Any], - Set[str]]: - assert is_observed_module(observed), \ - 'incoming model must be produced by prepare_fx' - prepare_custom_config_dict: Dict[str, Any] = \ - observed._prepare_custom_config_dict # type: ignore[assignment] - node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope # type: ignore[assignment] - patterns: Dict[Pattern, QuantizeHandler] = observed._patterns # type: ignore[assignment] - observed_node_names: Set[str] = observed._observed_node_names # type: ignore[assignment] - return patterns, node_name_to_scope, prepare_custom_config_dict, observed_node_names - -def convert(model: GraphModule, is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None, - is_standalone_module: bool = False, - _remove_qconfig_flag: bool = True, - convert_qconfig_dict: Dict[str, Any] = None) -> torch.nn.Module: - """ standalone_module means it a submodule that is not inlined in + quantized = QuantizedGraphModule(quantized_root, quantized.graph, quantized_root.preserved_attr_names) + return quantized + +def maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph): + """ If the arg is a dequantize Node, or a list/tuple/dict of dequantize Node, + we'll 
recursively remove the dequantize Node + """ + if isinstance(arg, Node) and \ + arg.op == "call_method" and \ + arg.target == "dequantize": + quantize_node = arg.args[0] + # we only replace the specific use since dequantize could be used by other nodes + # as well + node.replace_input_with(arg, quantize_node) + elif isinstance(arg, (list, tuple)): + for arg_element in arg: + maybe_recursive_remove_dequantize(arg_element, node, graph) + elif isinstance(arg, dict): + for arg_element in arg.values(): + maybe_recursive_remove_dequantize(arg_element, node, graph) + else: + warnings.warn(f"Unsupported node type in recursive remove dequantize: {type(arg)}") + +def get_module_path_and_prefix( + obs_node: Node, + node_name_to_scope: Dict[str, Tuple[str, type]], + qconfig_map: Dict[str, QConfigAny]): + """ Given and observer node, get the `Scope` or the fully qualified name for + the submodule containing the observed node, also return a prefix of "_input" + when the observed node is an input of a F.linear op, and not the output of another + quantized op. + TODO: this logic is hacky, we should think about how to remove it or make it more + general + """ + observed_node = obs_node.args[0] + # an observer can be inserted for both input of the next operator or output of the previous + # operator (they can be the same) + # this flag identifies if the observer is inserted only because the observed node is + # the input of the next operator + assert isinstance(observed_node, Node), \ + f"Expecting observed node to be a Node, but got {observed_node}" + is_input_observer_only = qconfig_map[observed_node.name] is None if observed_node.name in qconfig_map else None + if is_input_observer_only: + # if the quantize function is at the input of op, then we find the first user of the observer_node + # to get the path. If a linear call_function is in the user list, we return the first instance + # of linear node to get the FQN. + users = list(obs_node.users) + first_linear_use_or_first_use = users[0] if users else None + linear_node = None + for n in users: + if n.op == "call_function" and n.target == torch.nn.functional.linear: + linear_node = n + break + if linear_node: + first_linear_use_or_first_use = linear_node + prefix = "_input" + else: + # if the quantize function is at the output of the op, we use the observer input node to get the path + first_linear_use_or_first_use = observed_node + prefix = "" + + if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope: + module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name] + else: + # TODO: it's not used, so actually we can skip quantization + # but this requires changing return type of quantize_node + # we can fix it later if needed + module_path = "" + return module_path, prefix + +def insert_dequantize_node( + node: Node, + graph: Graph): + """ Inserts dequantize node for `node` in `graph` + """ + with graph.inserting_after(node): + dequantize_node = graph.call_method("dequantize", (node,)) + for user_node in dict(node.users): + if user_node is not dequantize_node: + user_node.replace_input_with(node, dequantize_node) + +def maybe_get_observer_for_node( + node: Node, + modules: Dict[str, torch.nn.Module] +) -> Optional[torch.nn.Module]: + """ + If the node is observed, return the observer + instance. Otherwise, return None. 
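# Illustrative sketch of the torch.fx graph-surgery idiom that
# insert_dequantize_node above relies on (inserting_after + call_method +
# replace_input_with), shown on a toy module so it runs on plain float
# tensors; "relu" stands in for "dequantize" here.
import operator
import torch
import torch.fx

class Toy(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

gm = torch.fx.symbolic_trace(Toy())
for node in list(gm.graph.nodes):
    if node.op == "call_function" and node.target is operator.add:
        with gm.graph.inserting_after(node):
            post = gm.graph.call_method("relu", (node,))
        # redirect every other user of `node` to the newly inserted node
        for user in dict(node.users):
            if user is not post:
                user.replace_input_with(node, post)
gm.recompile()
assert torch.equal(gm(torch.tensor([-3.0])), torch.tensor([0.0]))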
+ """ + for maybe_obs_node, _ in node.users.items(): + if maybe_obs_node.op == 'call_module': + maybe_obs = modules[str(maybe_obs_node.target)] + if is_activation_post_process(maybe_obs): + return maybe_obs + return None + +def convert_standalone_module( + node: Node, + modules: Dict[str, torch.nn.Module], + model: torch.fx.GraphModule, + is_reference: bool, + backend_config_dict: Optional[Dict[str, Any]]): + """ Converts a observed standalone module to a quantized standalone module by calling + the fx convert api, currently using the same `is_reference` flag as parent, but we may + changing this behavior in the future (e.g. separating quantization and lowering for + standalone module as well) + + Args: + - node: The call_module node of the observed standalone module + - modules: named_module of original model + - model: original model + - is_reference: a flag from parent provided by user to decide if we want to + produce a reference model or a fbgemm/qnnpack model + - backend_config_dict: backend configuration of the target backend of quantization + """ + convert = torch.ao.quantization.quantize_fx.convert_fx # type: ignore[attr-defined] + # We know that observed standalone module is a GraphModule since + # it's produced by us + observed_standalone_module : GraphModule = modules[str(node.target)] # type: ignore[assignment] + sm_input_quantized_idxs = \ + observed_standalone_module \ + ._standalone_module_input_quantized_idxs\ + .tolist() # type: ignore[operator] + # remove the dequantize nodes for inputs + args = list(node.args) + for idx in range(len(args)): + if idx in sm_input_quantized_idxs: + arg = args[idx] + if arg.op == "call_method" and arg.target == "dequantize": # type: ignore[union-attr] + quantize_node = arg.args[0] # type: ignore[union-attr] + node.replace_input_with(arg, quantize_node) + if len(arg.users) == 0: # type: ignore[union-attr] + model.graph.erase_node(arg) + # add dequantize node for output + sm_output_quantized_idxs = \ + observed_standalone_module \ + ._standalone_module_output_quantized_idxs \ + .tolist() # type: ignore[operator] + if len(sm_output_quantized_idxs) > 0: + assert sm_output_quantized_idxs[0] == 0, "Currently only quantized" + "output idxs = [0] is supported" + + # if it's non-empty, then it means the output is kept in quantized form + # we'll just add a dequantize node after this node + insert_dequantize_node(node, model.graph) + + # TODO: allow convert_custom_config_dict to override backend_config_dict + # for standalone module + # TODO: think about how to handle `is_reference` here + quantized_standalone_module = convert( + observed_standalone_module, + is_reference=is_reference, + backend_config_dict=backend_config_dict) + parent_name, name = _parent_name(node.target) + # update the modules dict + setattr(modules[parent_name], name, quantized_standalone_module) + modules[str(node.target)] = quantized_standalone_module + +def convert_weighted_module( + node: Node, + modules: Dict[str, torch.nn.Module], + observed_node_names: Set[str], + qconfig_map: Dict[str, QConfigAny], + backend_config_dict: Dict[str, Any]): + """ Convert a weighted module to reference quantized module in the model + If the QConfig of a QAT module is not set, the module will still be converted to + a float module. 
+ + Args: + - node: The call_module node of the observed standalone module + - modules: named_module of original model + - observed_node_names: names for the set of observed fx node, we can skip + this conversion if the node is not observed + """ + original_module = modules[str(node.target)] + qconfig: QConfigAny = original_module.qconfig # type: ignore[assignment] + weight_post_process = None + qat_module_classes = get_qat_module_classes(backend_config_dict) + + if isinstance( + original_module, + qat_module_classes): + # Converting qat module to a float module, we need to attch + # weight fake_quant to the module, weight fake_quant is assumed to be run during + # QAT so we don't need to run it again here + weight_post_process = original_module.weight_fake_quant + original_module = original_module.to_float() # type: ignore[operator] + # change qat module to float module + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, original_module) + + is_observed = node.name in observed_node_names + # If a qconfig is not defined for this node, then skip converting to a reference module + if qconfig is None or has_none_qconfig(node, qconfig_map) or not is_observed: + return + + # skip converting to reference quantized module if the qconfig is not supported + pattern_to_dtype_configs = get_pattern_to_dtype_configs(backend_config_dict) + dtype_configs = pattern_to_dtype_configs.get(type(original_module), []) + if not is_qconfig_supported_by_dtype_configs(qconfig, dtype_configs): + return + + # TODO: rename weight_is_statically_quantized to weight_is_int8_quantized + is_weight_quantized = weight_is_quantized(qconfig) + + # the condition for swapping the module to reference quantized module is: + # weights need to be quantized + if not is_weight_quantized: + return + + fused_module = None + float_module = original_module + # extract the inidividual float_module and fused module + if isinstance(original_module, torch.nn.intrinsic._FusedModule): + fused_module = float_module + float_module = fused_module[0] # type: ignore[index] + + # TODO: move this to the reference quantized module + # weight_qparams or weight_qparams dict + wq_or_wq_dict = {} + if isinstance(float_module, torch.nn.RNNCellBase): + weight_post_process_ih = qconfig.weight() # type: ignore[union-attr, operator] + weight_post_process_hh = qconfig.weight() # type: ignore[union-attr, operator] + weight_post_process_ih(float_module.weight_ih) + weight_post_process_hh(float_module.weight_hh) + weight_qparams_ih = get_qparam_dict(weight_post_process_ih) + weight_qparams_hh = get_qparam_dict(weight_post_process_hh) + wq_or_wq_dict = { + "weight_ih": weight_qparams_ih, + "weight_hh": weight_qparams_hh, + } + elif isinstance(float_module, torch.nn.LSTM): + # format for wq_or_wq_dict (flattened attributes): + # {"weight_ih_l0_scale": ..., "weight_ih_l0_qscheme": ..., ...} + for wn in float_module._flat_weights_names: + if hasattr(float_module, wn) and wn.startswith("weight"): + weight = getattr(float_module, wn) + weight_post_process = qconfig.weight() # type: ignore[union-attr, operator] + if weight_post_process.dtype == torch.qint8: # type: ignore[union-attr] + weight_post_process(weight) # type: ignore[operator, misc] + wq_or_wq_dict[wn] = get_qparam_dict(weight_post_process) + else: + # weight_post_process is None means the original module is not a QAT module + # we need to get weight_post_process from qconfig in this case + if weight_post_process is None: + weight_post_process = qconfig.weight() # type: 
ignore[union-attr, operator] + # run weight observer + # TODO: This is currently a hack for QAT to get the right shapes for scale and zero point. + # In the future, we should require the user to calibrate the model after calling prepare + # Issue: https://github.com/pytorch/pytorch/issues/73941 + weight_post_process(float_module.weight) # type: ignore[operator] + wq_or_wq_dict = get_qparam_dict(weight_post_process) + + # We use the same reference module for all modes of quantization: static, dynamic, weight_only + # root_module_to_quantized_reference_module: module mapping from root (floating point) module class + # to quantized reference module class, e.g. nn.Conv2d to nn.quantized._reference.Conv2d + root_module_to_quantized_reference_module = get_root_module_to_quantized_reference_module(backend_config_dict) + ref_qmodule_cls = root_module_to_quantized_reference_module.get(type(float_module), None) + assert ref_qmodule_cls is not None, f"No reference quantized module class configured for {type(float_module)}" + ref_qmodule = ref_qmodule_cls.from_float(float_module, wq_or_wq_dict) # type: ignore[attr-defined] + if fused_module is not None: + fused_module[0] = ref_qmodule # type: ignore[operator] + else: + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, ref_qmodule) + +def convert_custom_module( + node: Node, + graph: Graph, + modules: Dict[str, torch.nn.Module], + custom_module_class_mapping: Dict[Callable, Callable], + statically_quantized_custom_module_nodes: Set[Node]): + """ Converts an observed custom module to a quantized custom module based on + `custom_module_class_mapping` + For static quantization, we'll also remove the previous `dequantize` node and + attach the observer node for output to the module, the observer for the node + will be converted to a dequantize node instead of quantize-dequantize pairs + later in the graph. In the end we would have a quantized custom module that + has the same interface as a default quantized module in nn.quantized namespace, + i.e. quantized input and quantized output. + + Args: + - node: The call_module node of the observed standalone module + - graph: The graph containing the node + - modules: named_module of original model + - custom_module_class_mapping: mapping from observed custom module class to + quantized custom module class, used to swap custom modules + - statically_quantized_custom_module_nodes: we'll add the custom module node + if we find it is statically quantized, this will be used later when converting + observers to quant/dequant node pairs, if the observed node is a statically + quantized custom module nodes, we'll convert the observer to a dequantize node, + this is to keep the interface the same as the default quantized module. + TODO: maybe we want to redesign this part to align with reference model design + as well, but there has been some discussions around the interface, so we can do + it later. 
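# Illustrative sketch of the interface the custom-module conversion above aims
# for: the swapped-in module consumes the quantized tensor directly, so the
# intermediate dequantize can be dropped (quantize -> dequantize -> module
# becomes quantize -> module).  The two "modules" are plain functions standing
# in for real custom modules; only core quantized-tensor ops are used.
import torch

x = torch.randn(2, 3)
x_q = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.quint8)

def float_module(t):            # stand-in for a module that needs float input
    return t * 2.0

def quantized_module(t_q):      # stand-in for the swapped-in quantized module
    return float_module(t_q.dequantize())    # dequantize absorbed internally

before = float_module(x_q.dequantize())      # explicit dequantize in the graph
after = quantized_module(x_q)                # dequantize hidden in the module
assert torch.allclose(before, after)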
+ """ + observed_custom_module = modules[str(node.target)] + maybe_obs = maybe_get_observer_for_node(node, modules) + qconfig = observed_custom_module.qconfig + if activation_is_statically_quantized(qconfig): + statically_quantized_custom_module_nodes.add(node) + # remove the previous dequant node + prev_node = node.args[0] + # expecting the input node for a custom module node to be a Node + assert isinstance(prev_node, Node), \ + f"Expecting the argument for custom module node to be a Node, but got {prev_node}" + if prev_node.op == "call_method" and prev_node.target == "dequantize": + # change the connection for custom module, we'll change the input + # of custom module node to quantize node: + # Before: quantize - dequantize - custom - module + # After: quantize - custom - module + # \ - dequantize + node.replace_input_with(prev_node, prev_node.args[0]) + + # Remove the dequantize node if it doesn't have other users + if len(prev_node.users) == 0: + graph.erase_node(prev_node) + + # absorb the following observer into the module conversion + activation_post_process = maybe_get_observer_for_node(node, modules) + assert activation_post_process is not None + observed_custom_module.activation_post_process = activation_post_process + + # swap the observed custom module to quantized custom module + quantized_custom_module_class = get_swapped_custom_module_class( + observed_custom_module, custom_module_class_mapping, qconfig) + quantized_custom_module = \ + quantized_custom_module_class.from_observed(observed_custom_module) + parent_name, name = _parent_name(node.target) + setattr(modules[parent_name], name, quantized_custom_module) + +def convert( + model: GraphModule, is_reference: bool = False, + convert_custom_config_dict: Dict[str, Any] = None, + is_standalone_module: bool = False, + _remove_qconfig_flag: bool = True, + convert_qconfig_dict: Dict[str, Any] = None, + backend_config_dict: Optional[Dict[str, Any]] = None) -> torch.nn.Module: + """ + We will convert an observed model (a module with observer calls) to a reference + quantized model, the rule is simple: + 1. for each observer module call in the graph, we'll convert it to calls to + quantize and dequantize functions based on the observer instance + 2. for weighted operations like linear/conv, we need to convert them to reference + quantized module, this requires us to know whether the dtype configured for the + weight is supported in the backend, this is done in prepare step and the result + is stored in observed_node_names, we can decide whether we need to swap the + module based on this set + + standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. Returns a quantized standalone module, whether input/output is quantized is @@ -235,7 +536,7 @@ def convert(model: GraphModule, is_reference: bool = False, """ if convert_custom_config_dict is None: convert_custom_config_dict = {} - patterns, node_name_to_scope, prepare_custom_config_dict, _ = restore_state(model) + node_name_to_scope, prepare_custom_config_dict, observed_node_names = restore_state(model) qconfig_map: Dict[str, QConfigAny] = model._qconfig_map # type: ignore[assignment] # TODO this should be removed now that gpu support for quantization is being supported. 
@@ -264,9 +565,7 @@ def convert(model: GraphModule, is_reference: bool = False, modules_copy = copy.deepcopy(modules) convert_dict_to_ordered_dict(convert_qconfig_dict) if model._is_qat: - additional_qat_module_mapping = prepare_custom_config_dict.get( - "additional_qat_module_mapping", {}) - convert_qconfig_dict = update_qconfig_for_qat(convert_qconfig_dict, additional_qat_module_mapping) + convert_qconfig_dict = update_qconfig_for_qat(convert_qconfig_dict, {}) convert_qconfig_dict = update_qconfig_for_fusion(model, convert_qconfig_dict) compare_prepare_convert_qconfig_dict(prepare_qconfig_dict, convert_qconfig_dict) # type: ignore[arg-type] @@ -283,10 +582,7 @@ def convert(model: GraphModule, is_reference: bool = False, custom_module_classes = get_custom_module_class_keys( convert_custom_config_dict, "observed_to_quantized_custom_module_class") - matches = find_matches( - model.graph, modules, patterns, - qconfig_map, - custom_module_classes=custom_module_classes) + custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) if model._equalization_qconfig_map is not None: # If we want to do equalization then do the following: @@ -299,354 +595,168 @@ def convert(model: GraphModule, is_reference: bool = False, # for dynamic quant ops or weight only quant ops run_weight_observers(model) - quantized_graph = Graph() - env: Dict[str, Dict[Optional[torch.dtype], Node]] = defaultdict(lambda: defaultdict(Node)) # type: ignore[arg-type] - graph_inputs: List[str] = [] for node in model.graph.nodes: if node.op == 'placeholder': graph_inputs.append(node.name) - def load_non_quantized(n: Node) -> Node: - assert n.name in env, \ - 'trying to load float node but did not find ' + \ - 'node:' + n.name + \ - ' in env: ' + \ - str(env) - dtype_to_node = env[n.name] - if torch.float in dtype_to_node: - return dtype_to_node[torch.float] - elif None in dtype_to_node: - return dtype_to_node[None] - else: - quantized_node = None - for dtype in [torch.quint8, torch.qint8, torch.float16]: - if dtype in dtype_to_node: - quantized_node = dtype_to_node[dtype] - break - assert quantized_node is not None, "Did not find a supported quantized dtype:{}".format(dtype_to_node) - env[n.name][torch.float] = Proxy(quantized_node).dequantize().node - return env[n.name][torch.float] - - def load_quantized(dtype: torch.dtype): - def load_quantized_impl(n: Node): - assert n.name in env, \ - 'trying to load quantized node but did not find node:' + \ - n.name + ' in environment:' + str(env) - dtype_to_node = env[n.name] - local_dtype : Optional[torch.dtype] = dtype - if local_dtype == torch.float and local_dtype not in dtype_to_node: - local_dtype = None - if local_dtype in [torch.float, None]: - return load_non_quantized(n) - assert local_dtype in dtype_to_node, f'Expecting {dtype} in {dtype_to_node}' - return dtype_to_node[local_dtype] - - return load_quantized_impl - - def load_x(n: Node) -> Node: - assert n.name in env, \ - 'node ' + n.name + ' does not exist in environment' - dtype_to_node = env[n.name] - dtypes = [torch.quint8, torch.qint8, torch.float16, torch.float32, None] - for dtype in dtypes: - if dtype in dtype_to_node: - return dtype_to_node[dtype] - raise Exception(f'dtype {dtype} not found in environment: {dtype_to_node} for node {n.name}') - - def load_arg( - quantized: Optional[Union[List[int], Dict[int, torch.dtype], torch.dtype, Tuple[int, ...]]] - ) -> Callable[[Node], Argument]: - """ - Input: quantized, which can be None, torch.dtype, list or tuple - - if 
quantized is None, then we'll load the node as long as it - exists - - if quantized is a dtype, then all args will be - quantized to the specific dtype - - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=torch.float) - - if quantized is a list or tuple, then arg should be a list and - the args with corresponding indexes will be quantized to torch.quint8 - - - Output: fn which takes arg_or_args, and loads them from the - corresponding environment depending on the value of quantized. + # TODO: move this outside of this function + def replace_observer_with_quantize_dequantize_node( + model: torch.nn.Module, + graph: Graph, + node: Node, + modules: Dict[str, torch.nn.Module], + node_name_to_scope: Dict[str, Tuple[str, type]], + qconfig_map: Dict[str, QConfigAny]) -> None: + """ Replace activation_post_process module call node with quantize and + dequantize node + + Before: + ... -> observer_0(x) -> ... + After: + ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ... """ - assert quantized is None or \ - isinstance(quantized, (tuple, list, dict, torch.dtype)), type(quantized) - if isinstance(quantized, (tuple, list, dict)) and len(quantized) == 0: - # empty tuple or list means nothing is quantized - quantized = torch.float - - def load_arg_impl(arg_or_args): - # we'll update the format of `quantized` - # to better match arg_or_args - updated_quantized: Optional[Union[List[int], torch.dtype, Dict[int, torch.dtype], Tuple[int, ...]]] = quantized - - if isinstance(quantized, (tuple, list)) and \ - len(quantized) == 1 and isinstance(arg_or_args, Node): - # when argument is one Node instead of tuple, we just need to check - # 0 is in the quantized list - if 0 in quantized: - updated_quantized = torch.quint8 - - if updated_quantized is None: - return map_arg(arg_or_args, load_x) - if isinstance(updated_quantized, torch.dtype): - return map_arg( - arg_or_args, - load_quantized(updated_quantized)) - elif isinstance(updated_quantized, (tuple, list)): - assert isinstance(arg_or_args, (tuple, list)), arg_or_args - loaded_args = [] - # for now, we only support quantizing positional arguments - for i, a in enumerate(arg_or_args): - if i in updated_quantized: - # Currently it's hardcoded to torch.quint8, we can extend this - # in the future to support all quantized - # dtypes - loaded_args.append(map_arg(a, load_quantized(torch.quint8))) - else: - loaded_args.append(map_arg(a, load_non_quantized)) - return type(arg_or_args)(loaded_args) - elif isinstance(updated_quantized, dict): - loaded_args = [] - for i, a in enumerate(arg_or_args): - if i in updated_quantized: - loaded_args.append(map_arg(a, load_quantized(updated_quantized[i]))) - else: - loaded_args.append(map_arg(a, load_non_quantized)) - return type(arg_or_args)(loaded_args) - return load_arg_impl - - def node_arg_is_quantized(node_arg: Any) -> bool: - if isinstance(node_arg, Node): - assert node_arg.name in env, \ - 'Expecting node_arg to be in the environment' - if node_arg.name in env: - dtype_to_node = env[node_arg.name] - return any([x in dtype_to_node for x in [torch.quint8, torch.qint8, torch.float16]]) - else: - return False - elif isinstance(node_arg, list): - quantized = map(node_arg_is_quantized, node_arg) - if all(quantized): - return True - elif not any(quantized): - return False - else: - raise Exception( - "partially quantized inputs in list not handled yet") - else: - return False - - def is_output_quantized( - node: Node, obj: QuantizeHandler, qconfig: QConfigAny, - modules: Dict[str, 
torch.nn.Module]) -> bool: - """ Check if output node is quantized or not """ - assert modules is not None - # for some ops the output is quantized only when `is_reference` is True - # and when `is_reference` is False, it has limited qconfig - # support, for example `add` - # ideally this check should not happen here, it should happen either in - # prepare or during lowering, we don't need this check - # after the default path is changed to produce reference patterns - quantized = obj.is_output_quantized(qconfig) - - # Need to get correct quantized/non-quantized state forn the output - # of FixedQParamsQuantizeHandler - # TODO: we may want to try to remove the special case here - # as well - if obj.should_mark_output_quantized_from_input_quantized_status(qconfig): - assert node.op in [ - 'call_module', - 'call_function', - 'call_method'], \ - 'FixedQParamsQuantizeHandler of type ' + node.op + ' is not handled' - # TODO: need to extend this to consider all relevant args instead of just arg[0] - quantized = node_arg_is_quantized(node.args[0]) - - # the output is unquantized if the node is not a CopyNode - # or the activation is not statically quantized - if not activation_is_statically_quantized(qconfig) or \ - not obj.input_output_observed(): - quantized = False - if node_return_type_is_int(node): - quantized = False - - return quantized - - def insert_quantize_node(node: Node, modules: Dict[str, torch.nn.Module]) -> None: - """ Given a activation_post_process module call node, insert a - quantize node""" assert modules is not None assert isinstance(node.target, str) + module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, qconfig_map) observer_module = modules[node.target] - prev_node = node.args[0] - if observer_module.dtype == torch.float32: - # copy the observer for fp32 dtype - env[node.name][torch.float] = quantized_graph.node_copy( - node, load_non_quantized) - elif isinstance(prev_node, Node) and prev_node.name in env: - # if previous node is already quantized, we'll just remove the - # activation_post_process - prev_dtype_to_node: Dict[Optional[torch.dtype], Node] = env[prev_node.name] - current_dtype: Optional[torch.dtype] = observer_module.dtype # type: ignore[assignment] - if current_dtype in prev_dtype_to_node: - env[node.name][current_dtype] = prev_dtype_to_node[current_dtype] - else: - root_module = modules[""] - assert isinstance(prev_node, Node) - observer_dtype: torch.dtype = observer_module.dtype # type: ignore[assignment] - env[node.name][observer_dtype] = \ - quantize_node( - load_non_quantized(prev_node), - observer_module, node, modules, quantized_graph, - node_name_to_scope, is_input=True) + maybe_quantize_node_info = get_quantize_node_info(observer_module) + # Skip replacing observers to quant/dequant nodes if the qconfigs of all + # consumers and producers of this observer are None + skip_replacement = all([ + has_none_qconfig(n, qconfig_map) for n in + list(node.args) + list(node.users.keys())]) + if skip_replacement or maybe_quantize_node_info is None: + # didn't find correponding quantize op and info for the observer_module + # so we just remove the observer + with graph.inserting_before(node): + node.replace_all_uses_with(node.args[0]) + graph.erase_node(node) else: - # replace activation post process with quantization ops - root_module = modules[""] - assert isinstance(node.args[0], Node) - dtype: torch.dtype = observer_module.dtype # type: ignore[assignment] - env[node.name][dtype] = \ - quantize_node( - load_non_quantized(node.args[0]), - 
observer_module, node, modules, - quantized_graph, - node_name_to_scope, is_input=True) + # otherwise, we can convert the observer moduel call to quantize/dequantize node + node_type, quantize_op, qparams = maybe_quantize_node_info + # replace observer node with quant - dequant node + with graph.inserting_before(node): + input_node = node.args[0] + inputs = [input_node] + for key, value in qparams.items(): + # TODO: we can add the information of whether a value needs to + # be registered as an attribute in qparams dict itself + if key in ['_scale_', '_zero_point_']: + # For scale and zero_point values we register them as buffers in the root module. + # TODO: maybe need more complex attr name here + qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value) + inputs.append(qparam_node) + else: + # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph. + inputs.append(value) + + quantized_node = graph.create_node(node_type, quantize_op, tuple(inputs), {}) + dequantized_node = graph.call_method("dequantize", args=(quantized_node,)) + node.replace_all_uses_with(dequantized_node) + graph.erase_node(node) + + # this is a temporary hack for custom module, we may want to implement + # this properly after the custom module class design is finalized + def replace_observer_with_dequantize_node(node: Node, graph: Graph): + call_custom_module_node = node.args[0] + assert isinstance(call_custom_module_node, Node), \ + f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}" + node.replace_all_uses_with(call_custom_module_node) + graph.erase_node(node) + insert_dequantize_node(call_custom_module_node, graph) # additional state to override inputs to be quantized, if specified # by the user placeholder_node_seen_cnt = 0 - output_node_seen_cnt = 0 input_quantized_idxs: List[int] = prepare_custom_config_dict.get( "input_quantized_idxs", []) output_quantized_idxs: List[int] = prepare_custom_config_dict.get( "output_quantized_idxs", []) - for node in model.graph.nodes: - if node.op == "output": - cur_output_node_idx = output_node_seen_cnt - output_node_seen_cnt += 1 - if cur_output_node_idx in output_quantized_idxs: - # Result are kept quantized if the user specified the - # output_quantized_idxs override. - graph_output = map_arg(node.args[0], load_x) - else: - graph_output = map_arg(node.args[0], load_non_quantized) - quantized_graph.output(graph_output) - continue - root_node, matched, matched_pattern, obj, qconfig = \ - matches.get(node.name, (None, None, None, None, None)) - if root_node is node: - is_observed_standalone_module_node = ( - node.op == 'call_module' and - is_observed_standalone_module( - modules[node.target]) - ) - if qconfig is None and not is_observed_standalone_module_node: - result = quantized_graph.node_copy( - node, load_non_quantized) - quantized = False - # If there are QAT swapped modules in the graph that we don't want to quantize, rever them back to FP32 ones. 
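# Illustrative sketch of what the quantize/dequantize pair emitted by
# replace_observer_with_quantize_dequantize_node computes at runtime for one
# observer: read the observer's qparams and replace the observer call with
# quantize_per_tensor followed by dequantize.  MinMaxObserver is used as a
# concrete observer; the tolerance is a loose bound on the quantization error.
import torch
from torch.ao.quantization.observer import MinMaxObserver

obs = MinMaxObserver(dtype=torch.quint8)
x = torch.randn(16)
obs(x)                                      # what the prepared model does during calibration
scale, zero_point = obs.calculate_qparams()
x_q = torch.quantize_per_tensor(x, float(scale), int(zero_point), torch.quint8)
x_dq = x_q.dequantize()                     # the pair that replaces observer(x)
assert torch.allclose(x, x_dq, atol=2 * float(scale))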
- if node.op == 'call_module' and type(modules[node.target]) in DEFAULT_QAT_MODULE_MAPPINGS.values(): - float_mod = modules[node.target].to_float() - setattr(model, node.name, float_mod) - with model.graph.inserting_before(node): - new_float_node = model.graph.create_node('call_module', node.name, node.args, node.kwargs) - else: - assert obj is not None - # We will get whether the output is quantized or not before - # convert for standalone module and after convert - # for non-standalone module, since _standalone_module_output_quantized_idxs - # is only available in observed standalone module - if is_observed_standalone_module_node: - out_quant_idxs = modules[node.target]._standalone_module_output_quantized_idxs.tolist() # noqa: B950 - assert len(out_quant_idxs) <= 1, "Currently standalone only support one output" - quantized = 0 in out_quant_idxs - - qconfig = qconfig_map[node.name] - # Note: load_arg can be overwritten in the convert method when used to - # create Node in graph - result = obj.convert( - node, qconfig, modules, quantized_graph, node_name_to_scope, load_arg, is_reference=is_reference, - convert_custom_config_dict=convert_custom_config_dict) - if not is_observed_standalone_module_node: - quantized = is_output_quantized(node, obj, qconfig, modules) - - if quantized: - env[node.name][activation_dtype(qconfig)] = result - else: - env[node.name][torch.float] = result - continue - elif root_node is not None: - if qconfig is None: - # This branch is hit if all of these conditions are met: - # 1. we are in a fusion pattern of multiple nodes (i.e. add-relu) - # 2. the current node is not the "root_node" of the pattern - # 3. quantization for this pattern is disabled - # - # In this case, we need to make sure to populate the env with - # intermediate nodes manually, because the QuantizeHandler.convert - # function will not be called. 
- result = quantized_graph.node_copy( - node, load_non_quantized) - env[node.name][torch.float] = result - continue + if backend_config_dict is None: + backend_config_dict = get_native_backend_config_dict() + root_module_to_quantized_reference_module = get_root_module_to_quantized_reference_module(backend_config_dict) + # convert tuples so that it can work with isinstance(module, tuple_of_classes) + root_module_classes = tuple(root_module_to_quantized_reference_module.keys()) + qat_module_classes = get_qat_module_classes(backend_config_dict) + fused_module_classes = get_fused_module_classes(backend_config_dict) + statically_quantized_custom_module_nodes: Set[Node] = set() - # handle activation post process calls - if node.op == 'call_module' and \ - is_activation_post_process(modules[node.target]): - insert_quantize_node(node, modules) - elif node.op == 'placeholder': + for node in list(model.graph.nodes): + if node.op == 'placeholder': cur_placeholder_node_idx = placeholder_node_seen_cnt placeholder_node_seen_cnt += 1 if cur_placeholder_node_idx in input_quantized_idxs: - env[node.name][torch.quint8] = quantized_graph.node_copy( - node, load_non_quantized) - else: - env[node.name][torch.float] = \ - quantized_graph.node_copy(node, load_non_quantized) - else: - # copy quantized or non-quantized node - # get_tensor_info_node like shape works for both - # quantized and non-quantized input and output a non-Tensor - # (we use None for dtype currently for non-Tensors) - if is_get_tensor_info_node(node): - env[node.name][None] = \ - quantized_graph.node_copy(node, load_x) + # Inputs are assumed to be quantized if the user specifid the + # input_quantized_idxs override. + # we need to dequantize the inputs since all operators took + # floating point inputs in reference quantized models + insert_dequantize_node(node, model.graph) + elif node.op == "output": + # If the argument is empty we don't need to do anything + if len(output_quantized_idxs) == 0: + continue + # Result are kept quantized if the user specified the + # output_quantized_idxs override. 
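# Illustrative sketch of the recursive walk that maybe_recursive_remove_dequantize
# performs for the output handling just below: visit Nodes as well as
# list/tuple elements and dict values.  Strings stand in for fx Nodes and the
# visit callback stands in for the replace_input_with rewrite.
from typing import Any, Callable

def walk(arg: Any, visit: Callable[[Any], None]) -> None:
    if isinstance(arg, (list, tuple)):
        for item in arg:
            walk(item, visit)
    elif isinstance(arg, dict):
        for item in arg.values():
            walk(item, visit)
    else:
        visit(arg)      # in convert.py this is where the dequantize producer is spliced out

seen = []
walk(["a", ("b", {"k": "c"})], seen.append)
assert seen == ["a", "b", "c"]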
+ # Remove the dequantize operator for the node in the end if any + return_node = node + output = node.args[0] + # outputs can be Node, list, tuple, dict, other cases are not supported yet + if isinstance(output, (list, tuple)): + for idx in output_quantized_idxs: + maybe_recursive_remove_dequantize(output[idx], return_node, model.graph) + elif isinstance(output, (Node, dict)): + # we treat dict as a single argument currently, but it can be extended + # to support {"key": dtype} after we change output_quantized_idxs to + # dict + if 0 in output_quantized_idxs: + maybe_recursive_remove_dequantize(output, return_node, model.graph) else: - env[node.name][torch.float] = \ - quantized_graph.node_copy(node, load_non_quantized) - - # remove activation post process - act_post_process_removed_graph = Graph() - remove_env: Dict[str, Node] = {} + warnings.warn(f"Unsupported node type for output_quantized_idxs: {type(output)}") + elif node.op == "call_module": + if is_activation_post_process(modules[node.target]): + observed_node = node.args[0] + if observed_node in statically_quantized_custom_module_nodes: + replace_observer_with_dequantize_node(node, model.graph) + else: + replace_observer_with_quantize_dequantize_node( + model, model.graph, node, modules, node_name_to_scope, + qconfig_map) + elif is_observed_standalone_module(modules[node.target]): + convert_standalone_module( + node, modules, model, is_reference, backend_config_dict) + elif type(modules[node.target]) in set( + root_module_classes).union(qat_module_classes).union(fused_module_classes): + # extra check for fused module classes to make sure they are fused module classes + # of target modules + if type(modules[node.target]) in fused_module_classes and \ + type(modules[node.target][0]) not in root_module_classes: + continue + convert_weighted_module( + node, modules, observed_node_names, qconfig_map, backend_config_dict) + elif type(modules[node.target]) in custom_module_classes: + convert_custom_module( + node, model.graph, modules, custom_module_class_mapping, + statically_quantized_custom_module_nodes) - def load_arg_remove(a: Argument) -> Argument: - return map_arg(a, lambda node: remove_env[node.name]) + preserved_attributes = set(convert_custom_config_dict.get("preserved_attributes", [])) + model = QuantizedGraphModule(model, copy.deepcopy(model.graph), preserved_attributes) - for node in quantized_graph.nodes: - if node.op == 'output': - act_post_process_removed_graph.output( - map_arg(node.args[0], load_arg_remove)) - continue - if node.op == 'call_module' and \ - is_activation_post_process(modules[node.target]): - # remove activation post process node - remove_env[node.name] = remove_env[node.args[0].name] - else: - remove_env[node.name] = act_post_process_removed_graph.node_copy( - node, load_arg_remove) + # remove deadcode after converting observers to quant/dequant ops + model.graph.eliminate_dead_code() + model.recompile() - # removes qconfig and activation_post_process modules - if _remove_qconfig_flag: - _remove_qconfig(model) - preserved_attributes = set(convert_custom_config_dict.get("preserved_attributes", [])) - model = QuantizedGraphModule(model, act_post_process_removed_graph, preserved_attributes) + # TODO: maybe move this to quantize_fx.py if not is_reference: model = duplicate_dequantize_node(model) - model = fold_weight(model, node_name_to_scope) - model = lower_to_fbgemm(model) + model = duplicate_quantize_dynamic_node(model) + model = lower_to_fbgemm(model, qconfig_map, node_name_to_scope) model = 
remove_quant_dequant_pairs(model) model = remove_extra_dequantize(model) + # TODO: this looks hacky, we want to check why we need this and see if we can + # remove this + # removes qconfig and activation_post_process modules + if _remove_qconfig_flag: + _remove_qconfig(model) return model diff --git a/torch/ao/quantization/fx/fuse.py b/torch/ao/quantization/fx/fuse.py index 60e7ccd28a59..0736f8273541 100644 --- a/torch/ao/quantization/fx/fuse.py +++ b/torch/ao/quantization/fx/fuse.py @@ -4,111 +4,130 @@ map_arg ) from torch.fx.graph import Graph -from ..utils import ( - get_combined_dict -) from .graph_module import ( FusedGraphModule ) -from .match_utils import is_match +from .match_utils import ( + is_match, + MatchAllNode, +) from .pattern_utils import ( - get_default_fusion_patterns, + sorted_patterns_dict, ) -from .backend_config.utils import get_fusion_pattern_to_fuse_handler_cls -from .backend_config.utils import get_fuser_method_mapping +from ..backend_config.utils import get_fuser_method_mapping +from ..backend_config.utils import get_fusion_pattern_to_root_node_getter +from ..backend_config.utils import get_fusion_pattern_to_extra_inputs_getter +from ..backend_config import get_native_backend_config_dict +from .backend_config_utils import get_fusion_pattern_to_fuse_handler_cls from .fusion_patterns import * # noqa: F401,F403 from typing import Callable, Tuple, Dict, Any, Optional, List -from .quantization_types import Pattern, NodePattern - -class Fuser: - def fuse( - self, - model: GraphModule, - is_qat: bool, - fuse_custom_config_dict: Optional[Dict[str, Any]] = None, - backend_config_dict: Optional[Dict[str, Any]] = None, - ) -> GraphModule: - if fuse_custom_config_dict is None: - fuse_custom_config_dict = {} - - input_root = model - input_graph = model.graph - self.modules = dict(input_root.named_modules()) - - if backend_config_dict is None: - additional_fusion_patterns = \ - fuse_custom_config_dict.get("additional_fusion_pattern", {}) - fusion_pattern_to_fuse_handler_cls = get_combined_dict( - get_default_fusion_patterns(), additional_fusion_patterns) - fuser_method_mapping = None +from torch.ao.quantization.quantization_types import Pattern, NodePattern + +def fuse( + model: GraphModule, + is_qat: bool, + fuse_custom_config_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None, +) -> GraphModule: + if fuse_custom_config_dict is None: + fuse_custom_config_dict = {} + + input_root = model + input_graph = model.graph + named_modules = dict(input_root.named_modules()) + + if backend_config_dict is None: + backend_config_dict = get_native_backend_config_dict() + + fusion_pattern_to_fuse_handler_cls = sorted_patterns_dict(get_fusion_pattern_to_fuse_handler_cls(backend_config_dict)) + fuser_method_mapping = get_fuser_method_mapping(backend_config_dict) + fusion_pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config_dict) + fusion_pattern_to_extra_inputs_getter = get_fusion_pattern_to_extra_inputs_getter(backend_config_dict) + + # find fusion + fusion_pairs = _find_matches( + input_root, input_graph, fusion_pattern_to_fuse_handler_cls) + fused_graph = Graph() + env: Dict[Any, Any] = {} + + def load_arg(a): + return map_arg(a, lambda node: env[node.name]) + + def default_root_node_getter(node_pattern): + while not isinstance(node_pattern[-1], Node): + node_pattern = node_pattern[-1] + return node_pattern[-1] + + for node in input_graph.nodes: + maybe_last_node, pattern, matched_node_pattern, obj, 
node_to_subpattern = \ + fusion_pairs.get(node.name, (None, None, None, None, None)) + # get the corresponding subpattern for the current node + if node_to_subpattern is not None: + node_subpattern = node_to_subpattern.get(node, None) else: - fusion_pattern_to_fuse_handler_cls = get_fusion_pattern_to_fuse_handler_cls(backend_config_dict) - fuser_method_mapping = get_fuser_method_mapping(backend_config_dict) - # find fusion - fusion_pairs = self._find_matches( - input_root, input_graph, fusion_pattern_to_fuse_handler_cls) - self.fused_graph = Graph() - env: Dict[Any, Any] = {} - - def load_arg(a): - return map_arg(a, lambda node: env[node.name]) - - def get_root_node(node_pattern): - while not isinstance(node_pattern[-1], Node): - node_pattern = node_pattern[-1] - return node_pattern[-1] - - for node in input_graph.nodes: - maybe_last_node, pattern, matched_node_pattern, obj = \ - fusion_pairs.get(node.name, (None, None, None, None)) - if maybe_last_node is node: - assert obj is not None - # TODO: currently we hard code the root node, which only works for - # a sequence of ops and assume the root node is the last node, - # we want to make this more general to support more complex patterns - root_node = get_root_node(matched_node_pattern) # type: ignore[index] - env[node.name] = obj.fuse( - self, load_arg, root_node, matched_node_pattern, # type: ignore[arg-type] - fuse_custom_config_dict, fuser_method_mapping, is_qat) - elif maybe_last_node is None: - env[node.name] = self.fused_graph.node_copy(node, load_arg) - # node matched in patterns and is not root is removed here - - preserved_attributes = set(fuse_custom_config_dict.get("preserved_attributes", [])) - model = FusedGraphModule(input_root, self.fused_graph, preserved_attributes) - return model - - def _find_matches( - self, root: GraphModule, graph: Graph, - patterns: Dict[Pattern, Callable] - ) -> Dict[str, Tuple[Node, Pattern, NodePattern, FuseHandler]]: - modules = dict(root.named_modules()) - match_map : Dict[str, Tuple[Node, Pattern, NodePattern, FuseHandler]] = {} # node name -> (root_node, match_value) - - def apply_match(pattern, node, match, matched_node_pattern): - if isinstance(pattern, tuple): - s, *args = pattern - current_node_pattern: List[Node] = [] - apply_match(s, node, match, current_node_pattern) - for subpattern, arg in zip(args, node.args): - apply_match(subpattern, arg, match, current_node_pattern) - matched_node_pattern.append(tuple(current_node_pattern)) - else: - # the first pattern matches will take precedence - if node.name not in match_map: - matched_node_pattern.append(node) - root_node, pattern, handler = match - match_map[node.name] = (root_node, pattern, matched_node_pattern, handler) + node_subpattern = None + if maybe_last_node is node: + assert obj is not None + root_node_getter = fusion_pattern_to_root_node_getter.get(pattern, default_root_node_getter) + root_node = root_node_getter(matched_node_pattern) # type: ignore[index] + extra_inputs_getter = fusion_pattern_to_extra_inputs_getter.get(pattern, None) + extra_inputs = [] + if extra_inputs_getter is not None: + extra_inputs = extra_inputs_getter(matched_node_pattern) + # TODO: add validation that root_node is a module and has the same type + # as the root_module in the configuration + env[node.name] = obj.fuse( + load_arg, named_modules, fused_graph, root_node, extra_inputs, matched_node_pattern, # type: ignore[arg-type] + fuse_custom_config_dict, fuser_method_mapping, is_qat) + elif maybe_last_node is None or node_subpattern is MatchAllNode: + 
env[node.name] = fused_graph.node_copy(node, load_arg) + # node matched in patterns and is not root is removed here + + preserved_attributes = set(fuse_custom_config_dict.get("preserved_attributes", [])) + model = FusedGraphModule(input_root, fused_graph, preserved_attributes) + return model - for node in reversed(graph.nodes): +def _find_matches( + root: GraphModule, graph: Graph, + patterns: Dict[Pattern, Callable] +) -> Dict[str, Tuple[Node, Pattern, NodePattern, FuseHandler, Dict[Node, Any]]]: + modules = dict(root.named_modules()) + # node name -> (root_node, match_value) + match_map : Dict[ + str, Tuple[Node, Pattern, NodePattern, FuseHandler, Dict[Node, Any]]] = {} + # a map from node to the matched subpattern + node_to_subpattern: Dict[Node, Any] = {} + + # TODO: dedup with quantization matching function in match_utils.py + def apply_match(pattern, node, match, matched_node_pattern, node_to_subpattern): + if isinstance(pattern, tuple): + s, *args = pattern + current_node_pattern: List[Node] = [] + apply_match(s, node, match, current_node_pattern, node_to_subpattern) + for subpattern, arg in zip(args, node.args): + apply_match(subpattern, arg, match, current_node_pattern, node_to_subpattern) + matched_node_pattern.append(tuple(current_node_pattern)) + else: + # the first pattern matches will take precedence if node.name not in match_map: - for pattern, value in patterns.items(): - matched_node_pattern: List[Node] = [] - if is_match(modules, node, pattern): - apply_match(pattern, node, (node, pattern, value(self, node)), matched_node_pattern) + matched_node_pattern.append(node) + # MatchAllNode here is actually MatchAllInputNode which should not + # be added to match_map + if pattern is not MatchAllNode: + node_to_subpattern[node] = pattern + root_node, pattern, handler = match + match_map[node.name] = (root_node, pattern, matched_node_pattern, handler, node_to_subpattern) + + for node in reversed(graph.nodes): + if node.name not in match_map: + for pattern, value in patterns.items(): + matched_node_pattern: List[Node] = [] + if is_match(modules, node, pattern): + apply_match(pattern, node, (node, pattern, value(node)), matched_node_pattern, node_to_subpattern) + break - return match_map + return match_map diff --git a/torch/ao/quantization/fx/fusion_patterns.py b/torch/ao/quantization/fx/fusion_patterns.py index 2a0b9ff6f1e5..95b0c96693a5 100644 --- a/torch/ao/quantization/fx/fusion_patterns.py +++ b/torch/ao/quantization/fx/fusion_patterns.py @@ -1,10 +1,7 @@ import torch -from torch.fx.graph import Node -from .pattern_utils import ( - register_fusion_pattern, -) +from torch.fx.graph import Node, Graph from ..utils import _parent_name -from .quantization_types import QuantizerCls, NodePattern, Pattern +from torch.ao.quantization.quantization_types import NodePattern, Pattern from ..fuser_method_mappings import get_fuser_method_new from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Optional, Union, List @@ -18,97 +15,76 @@ class FuseHandler(ABC): """ Base handler class for the fusion patterns """ - def __init__(self, quantizer: QuantizerCls, node: Node): + def __init__(self, node: Node): pass @abstractmethod def fuse(self, - quantizer: QuantizerCls, load_arg: Callable, + named_modules: Dict[str, torch.nn.Module], + fused_graph: Graph, root_node: Node, + extra_inputs: List[Any], matched_node_pattern: NodePattern, fuse_custom_config_dict: Dict[str, Any], fuser_method_mapping: Optional[Dict[Pattern, Union[torch.nn.Sequential, Callable]]], is_qat: bool) -> Node: 
pass -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv1d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv2d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv3d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv1d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Conv3d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.Linear)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.Linear)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm2d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm2d)) -@register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm3d)) -@register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm3d)) -@register_fusion_pattern((torch.nn.BatchNorm1d, torch.nn.Conv1d)) -@register_fusion_pattern((torch.nn.BatchNorm2d, torch.nn.Conv2d)) -@register_fusion_pattern((torch.nn.BatchNorm3d, torch.nn.Conv3d)) -@register_fusion_pattern((torch.nn.BatchNorm1d, torch.nn.Linear)) -@register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) -@register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) -@register_fusion_pattern((torch.nn.ReLU, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -@register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) -@register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) -@register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -@register_fusion_pattern((torch.nn.BatchNorm1d, torch.nn.ConvTranspose1d)) -@register_fusion_pattern((torch.nn.BatchNorm2d, torch.nn.ConvTranspose2d)) -@register_fusion_pattern((torch.nn.BatchNorm3d, torch.nn.ConvTranspose3d)) +# TODO: move this to backend_config.fuse_handler class DefaultFuseHandler(FuseHandler): def __init__( self, - quantizer: QuantizerCls, node: Node): - super().__init__(quantizer, node) + super().__init__(node) def fuse(self, - quantizer: QuantizerCls, load_arg: Callable, + named_modules: Dict[str, torch.nn.Module], + fused_graph: Graph, root_node: Node, + extra_inputs: List[Any], matched_node_pattern: NodePattern, fuse_custom_config_dict: Dict[str, Any], fuser_method_mapping: Optional[Dict[Pattern, Union[torch.nn.Sequential, Callable]]], is_qat: bool) -> Node: - additional_fuser_method_mapping = fuse_custom_config_dict.get("additional_fuser_method_mapping", {}) assert root_node.op == "call_module", "Expecting module node to be a call_module Node" - root_module = quantizer.modules[root_node.target] - assert len(additional_fuser_method_mapping) == 0, "Fusion implementation is " - "undergoing changes, additoinal_fuser_method_mapping is not supported currently." - def get_modules(pattern, modules): + root_module = named_modules[str(root_node.target)] + + def get_modules(pattern): """ Given a node pattern, extract the corresponding modules e.g. 
input: (relu_node, (bn_node, conv_node)) output: (relu_module, (bn_module, conv_module)) """ if isinstance(pattern, (tuple, list)): n, *args = pattern - get_modules(n, modules) - arg_modules: List[torch.nn.Module] = [] + modules: List[torch.nn.Module] = [] + modules.append(get_modules(n)) for a in args: - get_modules(a, arg_modules) - arg_modules = tuple(arg_modules) if len(arg_modules) > 1 else arg_modules[0] # type: ignore[assignment] - modules.append(arg_modules) + modules.append(get_modules(a)) + return tuple(modules) else: n = pattern if n.op == "call_module": - modules.append(quantizer.modules[n.target]) + return named_modules[n.target] elif n.op == "call_function" and n.target == torch.nn.functional.relu: relu = torch.nn.ReLU() relu.training = root_module.training - modules.append(relu) + return relu + elif n.op == "call_function" or n.op == "call_method": + return n.target else: - modules.append(MatchAllNode) - return tuple(modules) + return MatchAllNode # since relu can be used multiple times, we'll need to create a relu module for each match - matched_modules = get_modules(matched_node_pattern, []) + matched_modules = get_modules(matched_node_pattern) def get_matched_types(m): if isinstance(m, tuple): return tuple(map(get_matched_types, m)) - return type(m) + if isinstance(m, torch.nn.Module): + return type(m) + return m matched_module_types = get_matched_types(matched_modules) module_parent_name, module_name = _parent_name(root_node.target) @@ -116,6 +92,12 @@ def get_matched_types(m): # TODO: change the signature for fuser_method to take matched module patterns # as input fused_module = fuser_method(is_qat, *matched_modules) - # TODO: maybe add a pass to cleanup bn modules? - setattr(quantizer.modules[module_parent_name], module_name, fused_module) - return quantizer.fused_graph.node_copy(root_node, load_arg) + setattr(named_modules[module_parent_name], module_name, fused_module) + extra_args = [] + for input in extra_inputs: + extra_args.append(load_arg(input)) + node = fused_graph.node_copy(root_node, load_arg) + args = list(node.args) + args.extend(extra_args) + node.args = tuple(args) + return node diff --git a/torch/ao/quantization/fx/graph_module.py b/torch/ao/quantization/fx/graph_module.py index ef43a42d030f..2e37e4a557e4 100644 --- a/torch/ao/quantization/fx/graph_module.py +++ b/torch/ao/quantization/fx/graph_module.py @@ -18,7 +18,7 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, p def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return FusedGraphModule(fake_mod, self.graph, self.preserved_attr_names) + return FusedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) class ObservedGraphModule(GraphModule): @@ -45,7 +45,7 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, p def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return ObservedGraphModule(fake_mod, self.graph, self.preserved_attr_names) + return ObservedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) @@ -60,7 +60,7 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, p def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return ObservedStandaloneGraphModule(fake_mod, 
self.graph, self.preserved_attr_names) + return ObservedStandaloneGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) def is_observed_standalone_module(module: Any) -> bool: return isinstance(module, ObservedStandaloneGraphModule) @@ -104,4 +104,4 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) - return QuantizedGraphModule(fake_mod, self.graph, self.preserved_attr_names) + return QuantizedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names)) diff --git a/torch/ao/quantization/fx/lower_to_fbgemm.py b/torch/ao/quantization/fx/lower_to_fbgemm.py index fc76d135ee80..c8c413cacfee 100644 --- a/torch/ao/quantization/fx/lower_to_fbgemm.py +++ b/torch/ao/quantization/fx/lower_to_fbgemm.py @@ -1,8 +1,14 @@ from ._lower_to_native_backend import _lower_to_native_backend from .graph_module import QuantizedGraphModule +from ..qconfig import QConfigAny +from typing import Dict, Tuple -def lower_to_fbgemm(model: QuantizedGraphModule) -> QuantizedGraphModule: +def lower_to_fbgemm( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny], + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: """ Lower a quantized reference model (with reference quantized operator patterns) to fbgemm """ - return _lower_to_native_backend(model) + return _lower_to_native_backend(model, qconfig_map, node_name_to_scope) diff --git a/torch/ao/quantization/fx/lower_to_qnnpack.py b/torch/ao/quantization/fx/lower_to_qnnpack.py index 0a0ea9cd248c..e79de696e5e0 100644 --- a/torch/ao/quantization/fx/lower_to_qnnpack.py +++ b/torch/ao/quantization/fx/lower_to_qnnpack.py @@ -1,8 +1,14 @@ from ._lower_to_native_backend import _lower_to_native_backend from .graph_module import QuantizedGraphModule +from ..qconfig import QConfigAny +from typing import Dict, Tuple -def lower_to_qnnpack(model: QuantizedGraphModule) -> QuantizedGraphModule: +def lower_to_qnnpack( + model: QuantizedGraphModule, + qconfig_map: Dict[str, QConfigAny], + node_name_to_scope: Dict[str, Tuple[str, type]] +) -> QuantizedGraphModule: """ Lower a quantized reference model (with reference quantized operator patterns) to qnnpack """ - return _lower_to_native_backend(model) + return _lower_to_native_backend(model, qconfig_map, node_name_to_scope) diff --git a/torch/ao/quantization/fx/match_utils.py b/torch/ao/quantization/fx/match_utils.py index e759583b44a8..46f7b515e860 100644 --- a/torch/ao/quantization/fx/match_utils.py +++ b/torch/ao/quantization/fx/match_utils.py @@ -4,15 +4,16 @@ Graph, Node, ) -from .quantization_types import Pattern +from torch.ao.quantization.quantization_types import Pattern from .quantization_patterns import ( QuantizeHandler, - CustomModuleQuantizeHandler, - StandaloneModuleQuantizeHandler, ) from ..qconfig import ( QConfigAny, ) +from ..utils import ( + MatchAllNode +) from .graph_module import ( is_observed_standalone_module, ) @@ -22,12 +23,6 @@ MatchResult = Tuple[Node, List[Node], Optional[Pattern], QuantizeHandler, QConfigAny] -# TODO: maybe rename this to MatchInputNode -class MatchAllNode: - """ A node pattern that matches all nodes - """ - pass - # Note: The order of patterns is important! match function will take whatever is matched first, so we'll # need to put the fusion patterns before single patterns. For example, add_relu should be registered come before relu. 
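The ordering requirement in the note above is what the sorted_patterns_dict helper (added to pattern_utils.py later in this patch) enforces: candidate patterns are ranked by their total node count so that a fused pattern is tried before any of its sub-patterns. A small self-contained sketch of the same idea, with placeholder handler strings standing in for the real QuantizeHandler classes:

    import torch
    from collections import OrderedDict

    def pattern_len(pattern):
        # count every leaf entry of a (possibly nested) pattern tuple
        if isinstance(pattern, tuple):
            return sum(pattern_len(p) for p in pattern)
        return 1

    patterns = {
        torch.nn.ReLU: "relu_handler",
        (torch.nn.ReLU, torch.nn.Conv2d): "conv_relu_handler",
        (torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d)): "conv_bn_relu_handler",
    }
    # longer (fused) patterns first, mirroring sorted_patterns_dict
    ordered = OrderedDict(
        sorted(patterns.items(),
               key=lambda kv: -pattern_len(kv[0]) if isinstance(kv[0], tuple) else 1))
    # iteration order is now conv+bn+relu, then conv+relu, then bare relu, so
    # the matcher sees fused patterns before the single-op pattern
    print(list(ordered))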
# decorators are applied in the reverse order we see. Also when we match the nodes in the graph with these patterns, @@ -79,6 +74,7 @@ def find_matches( graph: Graph, modules: Dict[str, torch.nn.Module], patterns: Dict[Pattern, QuantizeHandler], + root_node_getter_mapping: Dict[Pattern, Callable], qconfig_map: Dict[str, QConfigAny], standalone_module_names: List[str] = None, standalone_module_classes: List[Callable] = None, @@ -117,29 +113,80 @@ def find_matches( match_map: Dict[str, MatchResult] = {} all_matched : Set[str] = set() - def record_match(pattern, node, matched): + def _recursive_record_node_in_match_map( + last_node, + match_map, + node_pattern, + matched_node_pattern, + pattern, + match_value, + qconfig): + if isinstance(node_pattern, Node): + match_map[node_pattern.name] = ( + last_node, matched_node_pattern, pattern, match_value, qconfig) + else: + for n in node_pattern: + _recursive_record_node_in_match_map(last_node, match_map, n, matched_node_pattern, pattern, match_value, qconfig) + + # TODO: 1. merge with fuse matcher 2. document the code + def record_match( + pattern, + node, + last_node, + matched_node_pattern, + match_map): if isinstance(pattern, tuple): s, *args = pattern - record_match(s, node, matched) + current_node_pattern: List[Node] = [] + record_match( + s, + node, + last_node, + matched_node_pattern, + match_map) if pattern[0] is not getattr: for subpattern, arg in zip(args, node.args): - record_match(subpattern, arg, matched) + record_match( + subpattern, + arg, + node, + current_node_pattern, + match_map) + if len(current_node_pattern) > 1: + matched_node_pattern.append(tuple(current_node_pattern)) + else: + matched_node_pattern.append(current_node_pattern[0]) else: - matched.append(node) + matched_node_pattern.append(node) - cache_for_no_tensor_check: Dict[Node, bool] = dict() for node in reversed(graph.nodes): if node.name not in match_map and node.name not in all_matched: - for pattern, value in patterns.items(): - if is_match(modules, node, pattern): - matched: List[Any] = [] - record_match(pattern, node, matched) - for n in matched: - match_map[n.name] = ( - node, matched, pattern, value(node, modules), # type: ignore[operator] - qconfig_map[n.name]) - all_matched.add(n.name) - # break after finding the first match + for pattern, quantize_handler_cls in patterns.items(): + root_node_getter = root_node_getter_mapping.get(pattern, None) + if is_match(modules, node, pattern) and node.name not in match_map: + matched_node_pattern: List[Node] = [] + record_match( + pattern, + node, + node, + matched_node_pattern, + match_map) + quantize_handler = quantize_handler_cls( # type: ignore[operator] + matched_node_pattern, + modules, + root_node_getter) + last_node = node + # record the match for all nodes in the pattern + _recursive_record_node_in_match_map( + last_node, + match_map, + # we need to record all nodes in the matched pattern in the match_map + matched_node_pattern, + # this is a part of the value corresponding to the node + matched_node_pattern, + pattern, + quantize_handler, + qconfig_map[node.name]) break # add custom module instances to the match result @@ -149,7 +196,7 @@ def record_match(pattern, node, matched): type(modules[node.target]) in custom_module_classes: custom_module_qconfig = qconfig_map[node.name] match_map[node.name] = ( - node, [node], None, CustomModuleQuantizeHandler(node, modules), + node, node, None, QuantizeHandler(node, modules, is_custom_module=True), custom_module_qconfig) def is_standalone_module(node_target: str, 
modules: Dict[str, torch.nn.Module]): @@ -165,10 +212,10 @@ def is_standalone_module(node_target: str, modules: Dict[str, torch.nn.Module]): (is_standalone_module(node.target, modules) or is_observed_standalone_module(modules[node.target])): # add node to matched nodes - custom_module_qconfig = qconfig_map[node.name] + standalone_module_qconfig = qconfig_map[node.name] match_map[node.name] = ( - node, [node], None, - StandaloneModuleQuantizeHandler(node, modules), - custom_module_qconfig) + node, node, None, + QuantizeHandler(node, modules, is_standalone_module=True), + standalone_module_qconfig) return match_map diff --git a/torch/ao/quantization/fx/pattern_utils.py b/torch/ao/quantization/fx/pattern_utils.py index bba17d730d6a..e7c4d70fc7f3 100644 --- a/torch/ao/quantization/fx/pattern_utils.py +++ b/torch/ao/quantization/fx/pattern_utils.py @@ -3,12 +3,12 @@ from torch.fx.graph import ( Node, ) -from .quantization_types import Pattern +from torch.ao.quantization.quantization_types import Pattern from ..qconfig import QConfigAny from ..fake_quantize import FixedQParamsFakeQuantize # from .quantization_patterns import BinaryOpQuantizeHandler from ..observer import ObserverBase - +import copy # TODO(future PR): fix the typing on QuantizeHandler (currently a circular dependency) QuantizeHandler = Any @@ -25,13 +25,13 @@ def insert(fn): return insert def get_default_fusion_patterns() -> Dict[Pattern, QuantizeHandler]: - return DEFAULT_FUSION_PATTERNS + return copy.copy(DEFAULT_FUSION_PATTERNS) DEFAULT_QUANTIZATION_PATTERNS = OrderedDict() # Mapping from pattern to activation_post_process(observer/fake_quant) constructor for output activation # e.g. pattern: torch.sigmoid, -# output_activation_post_process: default_affine_fixed_qparams_fake_quant +# output_activation_post_process: default_fixed_qparams_range_0to1_fake_quant DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP = dict() DEFAULT_OUTPUT_OBSERVER_MAP = dict() @@ -47,15 +47,15 @@ def insert(fn): # Get patterns for both static quantization and qat def get_default_quant_patterns() -> Dict[Pattern, QuantizeHandler]: - return DEFAULT_QUANTIZATION_PATTERNS + return copy.copy(DEFAULT_QUANTIZATION_PATTERNS) # a map from pattern to output activation post process constructor # e.g. torch.sigmoid -> default_affine_fixed_qparam_fake_quant def get_default_output_activation_post_process_map(is_training) -> Dict[Pattern, ObserverBase]: if is_training: - return DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP + return copy.copy(DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP) else: - return DEFAULT_OUTPUT_OBSERVER_MAP + return copy.copy(DEFAULT_OUTPUT_OBSERVER_MAP) # Example use of register pattern function: # @register_fusion_pattern(torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) @@ -63,3 +63,27 @@ def get_default_output_activation_post_process_map(is_training) -> Dict[Pattern, # def __init__(...): # ... # + +def sorted_patterns_dict(patterns_dict: Dict[Pattern, QuantizeHandler]) -> Dict[Pattern, QuantizeHandler]: + """ + Return a sorted version of the patterns dictionary such that longer patterns are matched first, + e.g. match (F.relu, F.linear) before F.relu. + This works for current use cases, but we may need to have a more clever way to sort + things to address more complex patterns + """ + + def get_len(pattern): + """ this will calculate the length of the pattern by counting all the entries + in the pattern. 
+ this will make sure (nn.ReLU, (nn.BatchNorm, nn.Conv2d)) comes before + (nn.BatchNorm, nn.Conv2d) so that we can match the former first + """ + len = 0 + if isinstance(pattern, tuple): + for item in pattern: + len += get_len(item) + else: + len += 1 + return len + + return OrderedDict(sorted(patterns_dict.items(), key=lambda kv: -get_len(kv[0]) if isinstance(kv[0], tuple) else 1)) diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py index d0d951ce7aa3..086b65e13c90 100644 --- a/torch/ao/quantization/fx/prepare.py +++ b/torch/ao/quantization/fx/prepare.py @@ -30,11 +30,12 @@ from .quantization_patterns import ( QuantizeHandler, - CustomModuleQuantizeHandler, - StandaloneModuleQuantizeHandler, ) -from .quantization_types import Pattern +from torch.ao.quantization.quantization_types import ( + Pattern, + NodePattern +) from ._equalize import ( is_equalization_observer, @@ -48,7 +49,7 @@ from .pattern_utils import ( MatchResult, - get_default_quant_patterns, + sorted_patterns_dict, ) from .match_utils import ( @@ -60,40 +61,44 @@ get_custom_module_class_keys, all_node_args_have_no_tensors, assert_and_get_unique_device, - node_bool_tensor_arg_indexes, + get_non_observable_arg_indexes_and_types, get_new_attr_name_with_prefix, NON_QUANTIZABLE_WEIGHT_OPS, WEIGHT_INDEX_DICT, BIAS_INDEX_DICT, ) -from ..quantization_mappings import ( - get_default_qat_module_mappings, -) - from torch.ao.quantization.quantize import ( is_activation_post_process, convert ) from ..utils import ( - get_combined_dict, get_qconfig_dtypes, get_swapped_custom_module_class, activation_is_statically_quantized, activation_is_int8_quantized, ) -from .backend_config.utils import ( - get_pattern_to_quantize_handlers, +from ..backend_config.utils import ( get_pattern_to_dtype_configs, get_pattern_to_input_type_to_index, get_module_to_qat_module, + get_fusion_pattern_to_root_node_getter, +) +from ..backend_config import ( + get_native_backend_config_dict, +) +from .backend_config_utils import ( + get_pattern_to_quantize_handlers, ) from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Set from collections import defaultdict +# list of dtypes to not add observers to +DO_NOT_OBS_DTYPE_LIST = [int, float, torch.bool, None] + def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool: return isinstance(node, torch.fx.Node) and node.op == "call_module" and \ is_activation_post_process(modules[str(node.target)]) @@ -125,7 +130,7 @@ def node_arg_is_bias(node: Node, arg: Any) -> bool: def is_input_arg_dtype_supported_by_backend( arg: Argument, node: Node, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], dtype_config: Dict[str, torch.dtype], ) -> bool: """ Check if the configured qconfig for the argument @@ -140,9 +145,17 @@ def is_input_arg_dtype_supported_by_backend( is_bias = node_arg_is_bias(node, arg) is_activation = not is_weight and not is_bias if is_activation: - input_activation_dtype = dtype_config.get("input_activation_dtype", None) - return input_activation_dtype is None or \ - node_name_to_target_dtype[node.name]["input_activation_dtype"] == input_activation_dtype + is_dynamic = dtype_config.get("is_dynamic", False) + if is_dynamic: + input_activation_dtype = dtype_config.get("input_dtype", None) + # TODO: change this after the is_dynamic refactor is landed + compute_dtype = 
node_name_to_target_dtype[node.name].get("input_activation_compute_dtype", None) + return input_activation_dtype is None or \ + compute_dtype == input_activation_dtype + else: + input_activation_dtype = dtype_config.get("input_dtype", None) + return input_activation_dtype is None or \ + node_name_to_target_dtype[node.name]["input_activation_dtype"] == input_activation_dtype elif is_weight: weight_dtype = dtype_config.get("weight_dtype", None) return weight_dtype is None or node_name_to_target_dtype[node.name]["weight_dtype"] == weight_dtype @@ -152,7 +165,7 @@ def is_input_arg_dtype_supported_by_backend( def is_output_dtype_supported_by_backend( node: Node, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], dtype_config: Dict[str, torch.dtype], ) -> bool: """ Check if the configured qconfig for the output @@ -162,10 +175,22 @@ def is_output_dtype_supported_by_backend( return output_dtype is None or \ output_dtype == node_name_to_target_dtype[node.name]["output_activation_dtype"] +def is_observer_in_same_graph(node, modules, node_name_to_target_dtype): + """ Check if observer in same graph + when the node output is not fp32 and input is 'placeholder' + the input is assumed to be quantized, so it is observed + in a different place rather than not observed. + """ + node_output_dtype = get_arg_target_dtype_as_output(node, modules, node_name_to_target_dtype) + if len(node.args) > 0 and isinstance(node.args[0], Node): + if node_output_dtype == torch.quint8 and node.args[0].op == 'placeholder': + return False + return True + def is_pattern_dtype_config_supported_by_backend( pattern: Optional[Pattern], - matched_nodes: Optional[List[Node]], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + matched_node_pattern: Optional[NodePattern], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], backend_config_dict: Optional[Dict[str, Any]] ) -> bool: """ Check is the dtype configuration of a pattern is supported by @@ -173,14 +198,15 @@ def is_pattern_dtype_config_supported_by_backend( """ if backend_config_dict is None or pattern is None: return True - assert matched_nodes is not None and len(matched_nodes) >= 1 + assert matched_node_pattern is not None and len(matched_node_pattern) >= 1 pattern_to_dtype_configs = get_pattern_to_dtype_configs(backend_config_dict) - dtype_configs: List[Dict[str, torch.dtype]] = pattern_to_dtype_configs.get(pattern, []) + dtype_configs: List[Dict[str, Any]] = pattern_to_dtype_configs.get(pattern, []) - # TODO: this only checks one input and one output, need to generalize to multiple + # TODO: this only works for one input and one output patterns, need to generalize to multiple # inputs/output - input_node = matched_nodes[-1] - output_node = matched_nodes[0] + root_node = _default_root_node_getter(matched_node_pattern) + input_node = root_node + output_node = matched_node_pattern[0] for dtype_config in dtype_configs: # check if arg dtype are supported supported = True @@ -231,10 +257,21 @@ def qat_swap_modules( module_to_qat_module: Dict[Callable, Callable]) -> None: convert(root, mapping=module_to_qat_module, inplace=True, remove_qconfig=False) -# TODO: remove observed_op, looks like it's not used +def add_matched_node_name_to_set(matched_node_pattern: NodePattern, s: Set[str]): + if isinstance(matched_node_pattern, Node): + s.add(matched_node_pattern.name) + elif isinstance(matched_node_pattern, (list, 
tuple)): + for maybe_node in matched_node_pattern: + add_matched_node_name_to_set(maybe_node, s) + +# this is temporary, will be removed soon +def _default_root_node_getter(node_pattern): + while not isinstance(node_pattern, Node): + node_pattern = node_pattern[-1] + return node_pattern + def insert_observer( node: Node, - observed_op: Node, observer: ObserverBase, model: torch.nn.Module, modules: Dict[str, torch.nn.Module], @@ -271,7 +308,7 @@ def get_target_activation_dtype_for_node( qhandler: Optional[QuantizeHandler], modules: Dict[str, torch.nn.Module], cache_for_no_tensor_check: Dict[Node, bool], -) -> Dict[str, Optional[torch.dtype]]: +) -> Dict[str, Optional[Union[torch.dtype, type]]]: """ Returns the expected dtype of the input and output of this node after convert. If the value is not None, it represents the dtype of the @@ -317,7 +354,7 @@ def get_target_activation_dtype_for_node( # get qconfig to determine the eventual dtype of this node if qconfig is not None: - if qhandler is not None and qhandler.input_output_observed() and qhandler.is_output_quantized(qconfig): + if qhandler is not None and qhandler.input_output_observed(): act_dtype, weight_dtype, act_compute_dtype = \ get_qconfig_dtypes(qconfig) bias_dtype = torch.float16 \ @@ -325,6 +362,7 @@ def get_target_activation_dtype_for_node( else torch.float return { "input_activation_dtype": act_dtype, + "input_activation_compute_dtype": act_compute_dtype, "weight_dtype": weight_dtype, "bias_dtype": bias_dtype, "output_activation_dtype": act_dtype, @@ -360,8 +398,8 @@ def get_target_activation_dtype_for_node( def get_arg_target_dtype_as_output( arg: Node, modules: Dict[str, torch.nn.Module], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], -) -> Optional[torch.dtype]: + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], +) -> Optional[Union[torch.dtype, type]]: """ Get the target output activation dtype for the argumnet in the original graph, skipping inserted observers We are assuming that the observers are inserted correctly, and the dtype for @@ -379,8 +417,8 @@ def get_arg_target_dtype_as_input_to_node( arg: Node, node: Node, modules: Dict[str, torch.nn.Module], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], -) -> Optional[torch.dtype]: + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], +) -> Optional[Union[torch.dtype, type]]: """ Get the target argument dtype for the argument `arg`, as input to node `node` """ @@ -398,6 +436,24 @@ def get_arg_target_dtype_as_input_to_node( else: return node_name_to_target_dtype[node.name]["bias_dtype"] +def get_arg_target_compute_dtype_as_input_to_node( + arg: Node, + node: Node, + modules: Dict[str, torch.nn.Module], + node_name_to_target_dtype: Dict[str, Dict[str, Union[torch.dtype, type, None]]], +) -> Union[torch.dtype, type, None]: + """ Get the target argument dtype for the argument `arg`, as input + to node `node` + """ + assert isinstance(arg, Node) + is_weight = node_arg_is_weight(node, arg) + is_bias = node_arg_is_bias(node, arg) + is_activation = not is_weight and not is_bias + if is_activation and \ + "input_activation_compute_dtype" in node_name_to_target_dtype[node.name]: + return node_name_to_target_dtype[node.name]["input_activation_compute_dtype"] + else: + return None def maybe_insert_input_observer_for_arg_or_kwarg( node: Union[Node, Any], @@ -406,7 +462,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg( model: torch.nn.Module, modules: Dict[str, 
torch.nn.Module], graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qhandler: Optional[QuantizeHandler], prepare_custom_config_dict: Dict[str, Any], backend_config_dict: Optional[Dict[str, Any]], @@ -435,8 +491,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg( # default (no observer) new_arg = arg - is_standalone_module = qhandler is not None and \ - isinstance(qhandler, StandaloneModuleQuantizeHandler) + is_standalone_module = qhandler is not None and qhandler.is_standalone_module() assert qconfig is not None if not is_standalone_module: # regular flow for most nodes, except standalone modules @@ -449,6 +504,9 @@ def maybe_insert_input_observer_for_arg_or_kwarg( arg_as_output_target_dtype = get_arg_target_dtype_as_output(arg, modules, node_name_to_target_dtype) arg_as_input_target_dtype = get_arg_target_dtype_as_input_to_node(arg, node, modules, node_name_to_target_dtype) + arg_as_input_target_compute_dtype = \ + get_arg_target_compute_dtype_as_input_to_node( + arg, node, modules, node_name_to_target_dtype) needs_obs = ( # if the dtypes are different, we need an observer (arg_as_output_target_dtype != arg_as_input_target_dtype) and @@ -457,10 +515,16 @@ def maybe_insert_input_observer_for_arg_or_kwarg( # TODO(future PR): change this so a placeholder is inserted for # future dequants, to make the logic easier to understand (arg_as_input_target_dtype != torch.float) and - # if arg is a bool tensor or not a tensor, do not insert observer - (arg_as_output_target_dtype not in (torch.bool, None)) and + # if arg output dtype is in DO_NOT_OBS_DTYPE_LIST do not insert observer + (arg_as_output_target_dtype not in DO_NOT_OBS_DTYPE_LIST) and # if qconfig is reuse_input qconfig, we won't insert extra observer for input - not is_reuse_input_qconfig_ + not is_reuse_input_qconfig_ or + # need to add input observer for dynamic quantization + # only add observer for first input for now, we may need to extend + # qconfig_dict and backend_config_dict to support more general configurations + # of dynamic quantization, e.g. dynamically quantizing second input, third + # input etc. 
+ (arg_as_input_target_compute_dtype in [torch.quint8, torch.int8, torch.float16]) and arg is node.args[0] ) else: @@ -517,7 +581,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg( if existing_obs_node is None: new_obs_node = insert_observer( - arg, node, new_obs_mod, model, modules, graph) + arg, new_obs_mod, model, modules, graph) # override this arg to be the observed arg new_arg = new_obs_node else: @@ -532,7 +596,7 @@ def maybe_insert_input_observers_for_node( model: torch.nn.Module, modules: Dict[str, torch.nn.Module], graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qhandler: Optional[QuantizeHandler], prepare_custom_config_dict: Dict[str, Any], backend_config_dict: Optional[Dict[str, Any]], @@ -587,7 +651,7 @@ def maybe_insert_input_equalization_observers_for_node( model: torch.nn.Module, modules: Dict[str, torch.nn.Module], graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], is_branch: bool, ) -> None: """ @@ -618,7 +682,7 @@ def maybe_insert_input_equalization_observers_for_node( new_eq_obs_mod = act_eq_process_ctr() new_eq_obs_node = insert_observer( - arg, node, new_eq_obs_mod, model, modules, graph) + arg, new_eq_obs_mod, model, modules, graph) new_args.append(new_eq_obs_node) @@ -631,7 +695,7 @@ def maybe_insert_output_observer_for_node( modules: Dict[str, torch.nn.Module], graph: Graph, matches: Dict[str, MatchResult], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], matched_pattern: Any, qhandler: Optional[QuantizeHandler], is_qat: bool, @@ -642,7 +706,7 @@ def maybe_insert_output_observer_for_node( If `node` does not need an output observer, returns None. 
""" - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + root_node, _, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) if qhandler is None: @@ -651,13 +715,10 @@ def maybe_insert_output_observer_for_node( assert qconfig is not None assert node.op != 'output', 'observer insertion for outputs is handled elsewhere' - is_standalone_module = qhandler is not None and \ - isinstance(qhandler, StandaloneModuleQuantizeHandler) + is_standalone_module = qhandler is not None and qhandler.is_standalone_module() dtype = node_name_to_target_dtype[node.name]["output_activation_dtype"] - should_insert_observer = \ - qhandler.should_insert_observer_for_output( - qconfig, is_qat) and dtype not in (torch.bool, None, torch.float) + should_insert_observer = dtype not in DO_NOT_OBS_DTYPE_LIST + [torch.float] # TODO(future PR): move the following logic to # should_insert_observer_for_output should_insert_observer = should_insert_observer and \ @@ -676,7 +737,7 @@ def maybe_insert_output_observer_for_node( matched_pattern, is_qat) observer = act_post_process_ctr() - new_obs = insert_observer(node, node, observer, model, modules, graph) + new_obs = insert_observer(node, observer, model, modules, graph) return new_obs else: return None @@ -684,7 +745,7 @@ def maybe_insert_output_observer_for_node( def maybe_insert_observers_before_graph_output( graph_output_node: Node, output_quantized_idxs: List[int], - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qconfig_map: Dict[str, QConfigAny], model: torch.nn.Module, modules: Dict[str, torch.nn.Module], @@ -713,7 +774,7 @@ def maybe_insert_observers_before_graph_output( def _recursive_maybe_replace_node_with_obs( maybe_node: Argument, target_dtype: torch.dtype, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], qconfig_map: Dict[str, QConfigAny], model: torch.nn.Module, modules: Dict[str, torch.nn.Module], @@ -748,7 +809,7 @@ def _recursive_maybe_replace_node_with_obs( 'Quantizing the output node without a qconfig is not supported' observer_mod = qconfig.activation() observer_node = insert_observer( - maybe_node, maybe_node, observer_mod, model, modules, graph) + maybe_node, observer_mod, model, modules, graph) return observer_node else: return maybe_node @@ -784,8 +845,8 @@ def _recursive_maybe_replace_node_with_obs( def maybe_propagate_dtype_for_node( node: Node, - target_dtype: torch.dtype, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + target_dtype: Union[torch.dtype, type], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], matches: Dict[str, MatchResult], ) -> None: """ @@ -797,9 +858,9 @@ def maybe_propagate_dtype_for_node( node_name_to_target_dtype[node.name]["input_activation_dtype"] = target_dtype node_name_to_target_dtype[node.name]["output_activation_dtype"] = target_dtype # if this is a copy node, propagate to first arg - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + root_node, _, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) - if qhandler is not None and qhandler.is_general_tensor_shape_op(): + if qhandler is not None and qhandler.is_general_tensor_value_op(): prev_node = node.args[0] if isinstance(prev_node, Node): maybe_propagate_dtype_for_node( @@ -807,7 
+868,7 @@ def maybe_propagate_dtype_for_node( def propagate_dtypes_for_known_nodes( graph: Graph, - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]], + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]], matches: Dict[str, MatchResult], ) -> None: """ @@ -821,11 +882,26 @@ def propagate_dtypes_for_known_nodes( replace this with a better way to reason about dtypes of tensors. """ for node in graph.nodes: - bool_arg_idxs = node_bool_tensor_arg_indexes(node) - for bool_arg_idx in bool_arg_idxs: - cur_node = node.args[bool_arg_idx] - maybe_propagate_dtype_for_node( - cur_node, torch.bool, node_name_to_target_dtype, matches) + non_observable_arg_dict = get_non_observable_arg_indexes_and_types(node) + + for arg_type in non_observable_arg_dict: + non_observable_indices = non_observable_arg_dict[arg_type](node) + + for index in non_observable_indices: + arg = node.args[index] + + # when an argument is a tuple, it does not show up as another node so we need to go through + # all elements of the tuple manually + if isinstance(arg, tuple) or isinstance(arg, list): + arg_list = list(arg) + else: + arg_list = [arg] + + for cur_arg in arg_list: + # hard coded arguments show up but aren't `Node` typed and do not need dtype propgated + if isinstance(cur_arg, torch.fx.node.Node): + maybe_propagate_dtype_for_node( + cur_arg, arg_type, node_name_to_target_dtype, matches) def maybe_make_input_output_share_observers( node: Node, @@ -900,6 +976,9 @@ def maybe_make_input_output_share_observers( continue iteration_guard = 0 while not is_activation_post_process_node(input_arg, modules): + # failed to trace back since no input arg for the current node + if len(input_arg.args) < 1: + return False input_arg = input_arg.args[0] iteration_guard += 1 if iteration_guard > 10000: @@ -1009,7 +1088,7 @@ def insert_observers_for_model( # } # # TODO: rename this to node_name_to_target_dtype_info - node_name_to_target_dtype: Dict[str, Dict[str, Optional[torch.dtype]]] = defaultdict(dict) + node_name_to_target_dtype: Dict[str, Dict[str, Optional[Union[torch.dtype, type]]]] = defaultdict(dict) cache_for_no_tensor_check: Dict[Node, bool] = dict() inputs_seen_counter = 0 @@ -1021,7 +1100,7 @@ def insert_observers_for_model( # other nodes output dtype is specified by the qconfig modules = dict(model.named_modules(remove_duplicate=False)) for node in model.graph.nodes: - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + root_node, _, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) node_name_to_target_dtype[node.name] = get_target_activation_dtype_for_node( node, qconfig, inputs_seen_counter, outputs_seen_counter, @@ -1062,7 +1141,7 @@ def insert_observers_for_model( elif node.op in ('call_module', 'call_method', 'call_function', 'output'): # check for matches - root_node, matched_nodes, pattern, qhandler, qconfig = matches.get( + last_node, matched_node_pattern, pattern, qhandler, qconfig = matches.get( node.name, (None, None, None, None, None)) equalization_qconfig = equalization_config_map.get(node.name, None) @@ -1081,15 +1160,14 @@ def insert_observers_for_model( ) is_supported_by_backend = is_pattern_dtype_config_supported_by_backend( - pattern, matched_nodes, node_name_to_target_dtype, backend_config_dict) + pattern, matched_node_pattern, node_name_to_target_dtype, backend_config_dict) if not skip_inserting_observers and is_supported_by_backend: modules = dict(model.named_modules(remove_duplicate=False)) if node.op 
!= 'output': - assert matched_nodes is not None + assert matched_node_pattern is not None # add matched nodes to the observed node name set - for n in matched_nodes: - observed_node_names.add(n.name) + add_matched_node_name_to_set(matched_node_pattern, observed_node_names) # This is currently only used for equalization. # Checks if the current node is in a branch in which the two @@ -1116,26 +1194,28 @@ def insert_observers_for_model( if user != node and is_user_quantized: is_quantized_branch = True - # this modifies node inplace - maybe_insert_input_observers_for_node( - node, qconfig, model, modules, graph, - node_name_to_target_dtype, - qhandler, - prepare_custom_config_dict, - backend_config_dict) - - # Insert equalization input observers if needed - maybe_insert_input_equalization_observers_for_node( - node, equalization_qconfig, model, modules, graph, - node_name_to_target_dtype, is_quantized_branch) - - is_last_node_of_pattern = root_node is node + # TODO: this only works for sequential fusion right now, extend it + # it to automatically detect all input nodes based on the pattern + # need to change find_matches function to return this information + root_node = _default_root_node_getter(matched_node_pattern) + is_input_node_of_the_pattern = node is root_node + if is_input_node_of_the_pattern: + # this modifies node inplace + maybe_insert_input_observers_for_node( + node, qconfig, model, modules, graph, + node_name_to_target_dtype, + qhandler, + prepare_custom_config_dict, + backend_config_dict) + + # Insert equalization input observers if needed + maybe_insert_input_equalization_observers_for_node( + node, equalization_qconfig, model, modules, graph, + node_name_to_target_dtype, is_quantized_branch) + + is_last_node_of_pattern = node is last_node is_general_tensor_value_op = \ (qhandler is not None and qhandler.is_general_tensor_value_op()) - - is_general_tensor_shape_op = \ - (qhandler is not None and qhandler.is_general_tensor_shape_op()) - is_reuse_input_qconfig_ = is_reuse_input_qconfig(qconfig) if is_last_node_of_pattern: @@ -1165,14 +1245,17 @@ def insert_observers_for_model( continue user_node.replace_input_with(node, maybe_output_obs_node) + is_observer_in_same_graph_ = is_observer_in_same_graph(node, modules, node_name_to_target_dtype) + # for general tensor value ops, we modify the graph # to make all inputs and outputs use the first input's # observer - if is_general_tensor_value_op or is_general_tensor_shape_op or is_reuse_input_qconfig_: + if (is_general_tensor_value_op and is_observer_in_same_graph_) or \ + is_reuse_input_qconfig_: if not maybe_make_input_output_share_observers(node, model, modules): remove_output_observer(node, model, modules) - if isinstance(qhandler, CustomModuleQuantizeHandler): + if qhandler is not None and qhandler.is_custom_module(): swap_custom_module_to_observed(node, qconfig, modules, prepare_custom_config_dict) else: # output @@ -1211,11 +1294,11 @@ def run_prepare_fx_on_standalone_modules( """ for ( node_name, - (root_node, matched_nodes, pattern, qhandler, qconfig), + (root_node, _, pattern, qhandler, qconfig), ) in matches.items(): if qhandler is None: continue - elif not isinstance(qhandler, StandaloneModuleQuantizeHandler): + elif not qhandler.is_standalone_module(): continue sm_qconfig_dict, sm_prepare_config_dict, sm_backend_config_dict = \ @@ -1246,14 +1329,12 @@ def save_state( observed: GraphModule, qconfig_map: Dict[str, QConfigAny], node_name_to_scope: Dict[str, Tuple[str, type]], - patterns: Dict[Pattern, QuantizeHandler], 
prepare_custom_config_dict: Dict[str, Any], equalization_qconfig_map: Dict[str, Any], qconfig_dict: Dict[str, Dict[Any, Any]], is_qat: bool, observed_node_names: Set[str], ) -> None: - observed._patterns = patterns # type: ignore[assignment] observed._qconfig_map = qconfig_map # type: ignore[assignment] observed._prepare_custom_config_dict = \ prepare_custom_config_dict # type: ignore[assignment] @@ -1297,8 +1378,6 @@ def prepare( if equalization_qconfig_dict is None: equalization_qconfig_dict = {} - additional_quant_patterns = \ - prepare_custom_config_dict.get("additional_quant_pattern", {}) # mapping from a tuple of nodes in reverse order to uninitialized # QuantizeHandler subclass. For example, # { @@ -1309,31 +1388,33 @@ def prepare( # ((, ): # ), # } + # TODO: rename to pattern_to_quantize_handler patterns: Dict[Pattern, QuantizeHandler] = {} if backend_config_dict is None: - quant_patterns = get_default_quant_patterns() - patterns = get_combined_dict( - quant_patterns, additional_quant_patterns) - else: - patterns = get_pattern_to_quantize_handlers(backend_config_dict) - - # TODO: make WEIGHT_INDEX_DICT and BIAS_INDEX_DICT an argument to the functions that needs them - # TODO: refactor this part to return WEIGHT_INDEX_DICT and BIAS_INDEX_DICT - pattern_to_input_type_to_index = get_pattern_to_input_type_to_index(backend_config_dict) - for pattern, input_type_to_index in pattern_to_input_type_to_index.items(): - for input_type, index in input_type_to_index.items(): - index_dicts = { - "weight": WEIGHT_INDEX_DICT, - "bias": BIAS_INDEX_DICT, - "input": {} # not used right now - } - assert input_type in index_dicts.keys(), \ - f"input type must be one of {index_dicts.keys()} but got: {input_type}" - index_dict = index_dicts[input_type] - if pattern in index_dict: # type: ignore[operator] - index_dict[pattern].append(index) # type: ignore[index] - else: - index_dict[pattern] = [index] # type: ignore[index] + backend_config_dict = get_native_backend_config_dict() + patterns = get_pattern_to_quantize_handlers(backend_config_dict) + patterns = sorted_patterns_dict(patterns) + + # TODO: make WEIGHT_INDEX_DICT and BIAS_INDEX_DICT an argument to the functions that needs them + # TODO: refactor this part to return WEIGHT_INDEX_DICT and BIAS_INDEX_DICT + pattern_to_input_type_to_index = get_pattern_to_input_type_to_index(backend_config_dict) + for pattern, input_type_to_index in pattern_to_input_type_to_index.items(): + for input_type, index in input_type_to_index.items(): + index_dicts = { + "weight": WEIGHT_INDEX_DICT, + "bias": BIAS_INDEX_DICT, + "input": {} # not used right now + } + assert input_type in index_dicts.keys(), \ + f"input type must be one of {index_dicts.keys()} but got: {input_type}" + index_dict = index_dicts[input_type] + if pattern in index_dict: # type: ignore[operator] + index_dict[pattern].append(index) # type: ignore[index] + else: + index_dict[pattern] = [index] # type: ignore[index] + + root_node_getter_mapping = \ + get_fusion_pattern_to_root_node_getter(backend_config_dict) convert_dict_to_ordered_dict(qconfig_dict) convert_dict_to_ordered_dict(equalization_qconfig_dict) @@ -1341,21 +1422,12 @@ def prepare( equalization_qconfig_dict = update_qconfig_for_fusion(model, equalization_qconfig_dict) flattened_qconfig_dict = get_flattened_qconfig_dict(qconfig_dict) # TODO: support regex as well - propagate_qconfig_(model, flattened_qconfig_dict) + propagate_qconfig_(model, flattened_qconfig_dict, prepare_custom_config_dict) if is_qat: - additional_qat_module_mapping = 
prepare_custom_config_dict.get( - "additional_qat_module_mapping", {}) - # this path will be deprecated after we fully migrate the convert path - # of fbgemm/qnnpack to use the reference path, it will stay - # here for a few months - if backend_config_dict is None: - module_to_qat_module = get_combined_dict( - get_default_qat_module_mappings(), additional_qat_module_mapping) - else: - module_to_qat_module = get_module_to_qat_module(backend_config_dict) + module_to_qat_module = get_module_to_qat_module(backend_config_dict) qat_swap_modules(model, module_to_qat_module) - qconfig_dict = update_qconfig_for_qat(qconfig_dict, additional_qat_module_mapping) + qconfig_dict = update_qconfig_for_qat(qconfig_dict, {}) # mapping from fully qualified module name to module instance # for example, @@ -1381,8 +1453,8 @@ def prepare( custom_module_classes = get_custom_module_class_keys( prepare_custom_config_dict, "float_to_observed_custom_module_class") matches = find_matches( - model.graph, modules, patterns, qconfig_map, standalone_module_names, - standalone_module_classes, custom_module_classes) + model.graph, modules, patterns, root_node_getter_mapping, qconfig_map, + standalone_module_names, standalone_module_classes, custom_module_classes) input_quantized_idxs: List[int] = prepare_custom_config_dict.get( "input_quantized_idxs", []) @@ -1407,7 +1479,7 @@ def prepare( observed_node_names, is_qat) - save_state(model, qconfig_map, node_name_to_scope, patterns, + save_state(model, qconfig_map, node_name_to_scope, prepare_custom_config_dict, equalization_qconfig_map, qconfig_dict, is_qat, observed_node_names) preserved_attributes = set(prepare_custom_config_dict.get("preserved_attributes", [])) diff --git a/torch/ao/quantization/fx/qconfig_utils.py b/torch/ao/quantization/fx/qconfig_utils.py index 80afa562a10f..4884ef08d0d6 100644 --- a/torch/ao/quantization/fx/qconfig_utils.py +++ b/torch/ao/quantization/fx/qconfig_utils.py @@ -1,6 +1,7 @@ import torch from collections import defaultdict -from typing import Callable, Any, Dict, Tuple, Set, Optional +from typing import Callable, Any, Dict, Tuple, Set, Optional, List +from torch.ao.quantization import QConfig from torch.ao.quantization.qconfig import add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals from torch.ao.quantization.quantize import ( is_activation_post_process, @@ -13,7 +14,10 @@ ) from torch.nn.intrinsic import _FusedModule -from ..utils import _parent_name +from ..utils import ( + _parent_name, + get_qconfig_dtypes, +) from ..qconfig_dict_utils import ( get_object_type_qconfig, maybe_adjust_qconfig_for_module_type_or_name, @@ -213,10 +217,6 @@ def check_is_valid_prepare_custom_config_dict(prepare_custom_config_dict: Option "float_to_observed_custom_module_class", "non_traceable_module_name", "non_traceable_module_class", - "additional_fuser_method_mapping", - "additional_qat__module_mapping", - "additional_fusion_pattern", - "additional_quant_pattern", "input_quantized_idxs", "output_quantized_idxs", "preserved_attributes"} @@ -234,8 +234,7 @@ def check_is_valid_convert_custom_config_dict(convert_custom_config_dict: Option if not convert_custom_config_dict: return - convert_custom_config_dict_allowed_keys = {"additional_object_mapping", - "observed_to_quantized_custom_module_class", + convert_custom_config_dict_allowed_keys = {"observed_to_quantized_custom_module_class", "preserved_attributes"} check_is_valid_config_dict(convert_custom_config_dict, convert_custom_config_dict_allowed_keys, "convert_custom_config_dict") @@ -250,8 +249,7 
@@ def check_is_valid_fuse_custom_config_dict(fuse_custom_config_dict: Optional[Dic if not fuse_custom_config_dict: return - fuse_custom_config_dict_allowed_keys = {"additional_fuser_method_mapping", - "preserved_attributes"} + fuse_custom_config_dict_allowed_keys = {"preserved_attributes"} check_is_valid_config_dict(fuse_custom_config_dict, fuse_custom_config_dict_allowed_keys, "fuse_custom_config_dict") @@ -284,6 +282,34 @@ def compare_prepare_convert_qconfig_dict(prepare_qconfig_dict: Dict[str, Dict[An else: assert "Unsupported key in convert_qconfig_dict {}".format(k) + +def is_qconfig_supported_by_dtype_configs(qconfig: QConfig, dtype_configs: List[Dict[str, Any]]): + for dtype_config in dtype_configs: + is_dynamic = dtype_config.get("is_dynamic", False) + input_dtype = dtype_config.get("input_dtype", torch.float) + weight_dtype = dtype_config.get("weight_dtype", torch.float) + bias_dtype = dtype_config.get("bias_dtype", torch.float) + output_dtype = dtype_config.get("output_dtype", torch.float) + qconfig_activation_dtype, qconfig_weight_dtype, qconfig_compute_dtype = \ + get_qconfig_dtypes(qconfig) + qconfig_bias_dtype = torch.float16 \ + if qconfig_activation_dtype == torch.float16 and \ + qconfig_weight_dtype == torch.float16 \ + else torch.float + + if is_dynamic: + is_match = input_dtype == qconfig_compute_dtype and \ + output_dtype == torch.float and \ + weight_dtype == qconfig_weight_dtype + else: + is_match = input_dtype == qconfig_activation_dtype and \ + output_dtype == qconfig_activation_dtype and \ + weight_dtype == qconfig_weight_dtype and \ + bias_dtype == qconfig_bias_dtype + if is_match: + return True + return False + # TODO: rename this file to config_utils def get_standalone_module_configs( module_name: str, diff --git a/torch/ao/quantization/fx/quantization_patterns.py b/torch/ao/quantization/fx/quantization_patterns.py index ccb2ae98f9f8..bacec65d0337 100644 --- a/torch/ao/quantization/fx/quantization_patterns.py +++ b/torch/ao/quantization/fx/quantization_patterns.py @@ -1,58 +1,25 @@ import torch -from torch.fx import GraphModule from torch.fx.graph import ( Node, - Graph, -) -from ..observer import ( - default_affine_fixed_qparams_observer, - default_symmetric_fixed_qparams_observer, -) - -from ..quantization_mappings import ( - get_static_quant_module_class, - get_dynamic_quant_module_class, - get_quantized_operator, -) -from ..utils import ( - get_swapped_custom_module_class, - activation_is_statically_quantized, - activation_is_int8_quantized, - weight_is_statically_quantized, - get_qconfig_dtypes, - activation_dtype, - get_qparam_dict, - check_node, -) - -from torch.ao.quantization.quantize import ( - is_activation_post_process, ) -from .pattern_utils import ( - register_quant_pattern, - get_default_output_activation_post_process_map, - Pattern, -) -from ..utils import _parent_name from .utils import ( all_node_args_have_no_tensors, - quantize_node, - get_per_tensor_qparams, - get_linear_prepack_op_for_dtype, - create_qparam_nodes, - get_qconv_prepack_op, - get_qconv_op, - create_node_from_old_node_preserve_meta, ) - -from ..qconfig import QConfigAny +from torch.ao.quantization.quantization_types import ( + Pattern, + NodePattern, +) from abc import ABC -import operator -import warnings +from typing import Any, Callable, Dict, Optional -from typing import Any, Callable, Dict, Union, Optional, Tuple, List +def _default_root_node_getter(node_pattern): + if node_pattern is None: + return node_pattern + while not isinstance(node_pattern, Node): + node_pattern = 
node_pattern[-1] + return node_pattern # ------------------------- # Pattern Registrations @@ -64,33 +31,37 @@ class QuantizeHandler(ABC): """ Base handler class for the quantizer patterns """ - def __init__(self, node: Node, modules: Dict[str, torch.nn.Module]): + def __init__( + self, + node_pattern: NodePattern, + modules: Dict[str, torch.nn.Module], + root_node_getter: Callable = None, + is_custom_module=False, + is_standalone_module=False): """ Records pattern information in __init__, which will be used in convert """ - # this is an indicator of whether all the inputs are Node or not - # since some op might be quantized differently depending on whether - # all inputs are tensors or not, e.g. add/mul - self.num_tensor_args = len(node.args) - self.all_node_args_are_tensors = True - # the last node of the matched pattern - self.last_node = node - - def _maybe_get_last_node_only_observer( - self, - modules: Dict[str, torch.nn.Module] - ) -> Optional[torch.nn.Module]: - """ - If the last node of the pattern is observed, return the observer - instance. Otherwise, return None. - """ - for maybe_obs_node, _ in self.last_node.users.items(): - if maybe_obs_node.op == 'call_module': - maybe_obs = modules[str(maybe_obs_node.target)] - if is_activation_post_process(maybe_obs): - return maybe_obs - return None - + self.node_pattern = node_pattern + self.modules = modules + if root_node_getter is None: + root_node_getter = _default_root_node_getter + self.root_node = root_node_getter(node_pattern) + self.is_custom_module_ = is_custom_module + self.is_standalone_module_ = is_standalone_module + self.num_tensor_args = 0 + # determine how many of the first two args are Tensors (versus scalars) + # this distinguishes things like "x + y" from "x + 2" or "2 + x" + if isinstance(self.root_node, Node): + cache_for_no_tensor_check: Dict[Node, bool] = dict() + for arg_idx in range(len(self.root_node.args)): + arg = self.root_node.args[arg_idx] + if isinstance(arg, Node) and ( + not all_node_args_have_no_tensors( + arg, self.modules, cache_for_no_tensor_check)): + self.num_tensor_args += 1 + + # TODO: can remove after the is_dynamic flag is defined, so that we can + # move embedding op to backend_config_dict def input_output_observed(self) -> bool: """ Returns True if the pattern matched to this qhandler could be @@ -102,44 +73,16 @@ def is_general_tensor_value_op(self) -> bool: """ Returns True if the operator works for both floating point and quantized input, and does some computation based on the input Tensor, + or the ops that only re-arranges the Tensor values or query some metadata + about the Tensor so we need to insert observer/fake_quant for the output of the - operator since the distribution of values is different for input and output - Tensors (for HistogramObserver) - while they share the same quantization parameters - Example: avgpool2d - """ - return False - - def is_general_tensor_shape_op(self) -> bool: - """ Similar to is_general_tensor_value_op, this is a check - for ops that works for both floating point and quantized input, - that only re-arranges the Tensor values or query some metadata about the Tensor - We don't insert observer/fake_quant for the output of these operators - Example: reshape, transpose, maxpool2d - """ - return False - - def should_insert_observer_for_output( - self, - qconfig: Any, - model_is_training: bool, - ) -> bool: - """ - Returns true if an observer should be inserted for the output of - the pattern matched to this QuantizeHandler instance during the - prepare 
step. - """ - # TODO(future PR): potentially clean up and deduplicate these - # mappings. - return self.all_node_args_are_tensors and self.input_output_observed() - - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - """ - Returns true if after convert, the output of the matched pattern is - quantized iff the first input is also quantized. + operator (same observer instance as input) + since the distribution of values is different for input and output + Tensors (for HistogramObserver) while they share the same quantization + parameters + Example operator: avgpool2d, reshape, transpose, maxpool2d + Example observed operator: + observer_0 - avgpool2d - observer_0 (same observer instance as input) """ return False @@ -156,1633 +99,62 @@ def get_activation_ctr( """ return qconfig.activation - def is_output_quantized(self, qconfig): - """ Returns true if the output node of convert is quantized - when is_reference is False, we would return float node when a certain dtype - combination is not supported (since fbgemm/qnnpack only support certain dtype - combinations), so the output may be float, but when is_reference is True, - we support all dtype combinations so the output will always be quantized. - - TODO: This is fragile, whether output is quantized should not depend on `is_reference` since - we want to make sure whether a Tensor is quantized - should be the same in prepare and convert and is_reference - is only available in convert currently + def is_custom_module(self): + return self.is_custom_module_ - """ - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - """ Convert the given node to a quantized node and insert - it to the quantized graph - """ - return NotImplemented + def is_standalone_module(self): + return self.is_standalone_module_ - -# Binary op configs - -# Supported combinations are: -# quant_type | activation (compute_type) | weight -# static quint8 qint8 - -# tuple (activation_dtype, weight_dtype, compute_dtype) -# these are supported types for common binary ops like add/mul etc. 
-all_dtypes = [ - (torch.qint8, torch.qint8, None), - (torch.quint8, torch.qint8, None), - (torch.float16, torch.float16, None), -] -fp16_dtypes = [ - (torch.float16, torch.float16, None) -] -int8_dtypes = [ - (torch.qint8, torch.qint8, None), - (torch.quint8, torch.qint8, None), -] -binary_op_supported_dtypes : Dict[Union[Callable, str], List[Tuple[torch.dtype, torch.dtype, None]]] = { - operator.add: all_dtypes, - torch.add: all_dtypes, - operator.mul: all_dtypes, - torch.mul: all_dtypes, - torch.bmm: fp16_dtypes, - torch.sub: fp16_dtypes, - operator.sub: fp16_dtypes, - torch.div: fp16_dtypes, - operator.truediv: fp16_dtypes, - torch.matmul: int8_dtypes, -} - -default_op_supported_dtypes = { - torch.nn.ConvTranspose1d: int8_dtypes, - torch.nn.ConvTranspose2d: int8_dtypes, - torch.nn.ELU: int8_dtypes, - torch.nn.LeakyReLU: int8_dtypes, - torch.nn.Hardswish: int8_dtypes, - torch.nn.InstanceNorm1d: int8_dtypes, - torch.nn.InstanceNorm2d: int8_dtypes, - torch.nn.InstanceNorm3d: int8_dtypes, - torch.nn.LayerNorm: all_dtypes, - torch.nn.SiLU: fp16_dtypes, - torch.nn.Mish: fp16_dtypes, - torch.nn.GELU: int8_dtypes, - torch.nn.Dropout: int8_dtypes, - torch.nn.Softmax: int8_dtypes, - torch.nn.functional.elu: int8_dtypes, - torch.nn.functional.hardswish: int8_dtypes, - torch.nn.functional.instance_norm: int8_dtypes, - torch.nn.functional.layer_norm: all_dtypes, - torch.nn.functional.leaky_relu: int8_dtypes, - torch.nn.functional.silu: fp16_dtypes, - torch.nn.functional.mish: fp16_dtypes, - torch.nn.functional.gelu: int8_dtypes, - torch.nn.functional.softmax: int8_dtypes, - torch.nn.functional.dropout: int8_dtypes, - torch.sum: fp16_dtypes, -} - -QAT_CONV_MODULE_CLASSES = \ - (torch.nn.qat.Conv2d, - torch.nn.qat.Conv3d, - torch.nn.intrinsic.qat.ConvBn2d, - torch.nn.intrinsic.qat.ConvBnReLU2d, - torch.nn.intrinsic.qat.ConvReLU2d, - torch.nn.intrinsic.qat.ConvBn3d, - torch.nn.intrinsic.qat.ConvBnReLU3d, - torch.nn.intrinsic.qat.ConvReLU3d) - - -########################## -# Helper Functions -########################## - -def _load_weight_qparams( - self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - key = prefix + "_weight_qparams" - if key in state_dict: - self._weight_qparams = state_dict[key] - state_dict.pop(key) - -def _save_weight_qparams(self, destination, prefix, keep_vars): - for attr_name in dir(self): - if "_weight_qparams" == attr_name and \ - isinstance(getattr(self, attr_name), dict): - weight_qparams = getattr(self, attr_name) - destination[prefix + attr_name] = weight_qparams - - -def _to_reference(float_module, weight_qparams): - """ Make a weighted float module (e.g. 
conv and linear )a reference module by - attaching _weight_qparams that records the qparams for weight - and change the name for the module so that it's recognized - when people print the model - """ - float_module._weight_qparams = weight_qparams - float_module._register_state_dict_hook(_save_weight_qparams) - float_module._register_load_state_dict_pre_hook(_load_weight_qparams, with_module=True) - - float_module_name = float_module._get_name() - - def _get_name(): - return float_module_name + "(Reference)" - - float_module._get_name = _get_name - -@register_quant_pattern(operator.add) -@register_quant_pattern(operator.sub) -@register_quant_pattern(operator.mul) -@register_quant_pattern(operator.truediv) -@register_quant_pattern(torch.add) -@register_quant_pattern(torch.sub) -@register_quant_pattern(torch.mul) -@register_quant_pattern(torch.div) -@register_quant_pattern(torch.bmm) -@register_quant_pattern((torch.nn.ReLU, operator.add)) -@register_quant_pattern((torch.nn.ReLU, operator.mul)) -@register_quant_pattern((torch.nn.ReLU, torch.add)) -@register_quant_pattern((torch.nn.ReLU, torch.mul)) -@register_quant_pattern((torch.nn.functional.relu, operator.add)) -@register_quant_pattern((torch.nn.functional.relu, operator.mul)) -@register_quant_pattern((torch.nn.functional.relu, torch.add)) -@register_quant_pattern((torch.nn.functional.relu, torch.mul)) -@register_quant_pattern((torch.relu, operator.add)) -@register_quant_pattern((torch.relu, operator.mul)) -@register_quant_pattern(torch.matmul) +# TODO: remove this class, this is still exposed in torch.quantization +# but we should be able to break bc class BinaryOpQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.relu_node = None - if ( - node.op == 'call_function' and - node.target in (torch.nn.functional.relu, torch.relu) - ) or ( - node.op == 'call_module' and - isinstance(modules[str(node.target)], torch.nn.ReLU) - ): - self.relu_node = node - node = node.args[0] # type: ignore[assignment] - self.binary_op_node = node - self.binary_op = node.target - - # determine how many of the first two args are Tensors (versus scalars) - # this distinguishes things like "x + y" from "x + 2" or "2 + x" - self.num_tensor_args = 0 - cache_for_no_tensor_check: Dict[Node, bool] = dict() - for arg_idx in range(len(self.binary_op_node.args)): - arg = self.binary_op_node.args[arg_idx] - if isinstance(arg, Node) and (not all_node_args_have_no_tensors(arg, modules, cache_for_no_tensor_check)): - self.num_tensor_args += 1 - self.all_node_args_are_tensors = \ - (self.num_tensor_args == len(self.binary_op_node.args)) - - qbin_op_mapping: Dict[Union[Callable, str], Callable] = { - operator.add: torch.ops.quantized.add, - torch.add: torch.ops.quantized.add, - operator.mul: torch.ops.quantized.mul, - torch.mul: torch.ops.quantized.mul, - torch.matmul: torch.ops.quantized.matmul, - } - qbin_relu_op_mapping: Dict[Union[Callable, str], Callable] = { - operator.add: torch.ops.quantized.add_relu, - torch.add: torch.ops.quantized.add_relu, - operator.mul: torch.ops.quantized.mul_relu, - torch.mul: torch.ops.quantized.mul_relu, - } - # corresponding quantized op - self.quantized_binary_op: Optional[Callable] = None - if self.binary_op in qbin_op_mapping: - self.quantized_binary_op = qbin_relu_op_mapping[self.binary_op] \ - if self.relu_node is not None \ - else qbin_op_mapping[self.binary_op] - - def should_insert_observer_for_output( - self, - qconfig: Any, - 
model_is_training: bool, - ) -> bool: - """ - Returns true if an observer should be inserted for the output of - the pattern matched to this QuantizeHandler instance during the - prepare step. - """ - dtypes = get_qconfig_dtypes(qconfig) - if not (self.binary_op in binary_op_supported_dtypes and dtypes in binary_op_supported_dtypes[self.binary_op]): - return False - if self.num_tensor_args == 1: - return True - elif self.all_node_args_are_tensors and self.input_output_observed(): - return True - else: - return False - - def is_general_tensor_value_op(self) -> bool: - return self.num_tensor_args == 1 - - def input_output_observed(self): - # for x + y where x and y are scalars, we do not observe anything - return self.num_tensor_args > 0 - - def is_output_quantized(self, qconfig): - dtypes = get_qconfig_dtypes(qconfig) - return self.binary_op in binary_op_supported_dtypes and \ - dtypes in binary_op_supported_dtypes[self.binary_op] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - - if self.num_tensor_args == 0: - # example: x + y, when x and y are scalars - return quantized_graph.node_copy( - node, load_arg(quantized=None)) - - dtypes = get_qconfig_dtypes(qconfig) - - if is_reference: - act_dtype = activation_dtype(qconfig) - dtypes = get_qconfig_dtypes(qconfig) - if act_dtype == torch.float or \ - not (self.binary_op in binary_op_supported_dtypes and dtypes in binary_op_supported_dtypes[self.binary_op]): - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - else: - if self.num_tensor_args == 2: - # make sure both inputs are quantized to act_dtype - load_arg(quantized={0: act_dtype, 1: act_dtype})(self.binary_op_node.args) - args = load_arg(quantized=torch.float)(self.binary_op_node.args) - kwargs = load_arg(quantized=torch.float)(self.binary_op_node.kwargs) - op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float)) - - def modified_load_arg(n: Node): - if n.name == self.binary_op_node.name: - return op_out - else: - return load_arg(quantized=torch.float)(n) - - if self.relu_node: - op_out = quantized_graph.node_copy(self.relu_node, modified_load_arg) - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - elif not is_reference and self.binary_op in binary_op_supported_dtypes and \ - dtypes in binary_op_supported_dtypes[self.binary_op]: - if dtypes in [(torch.quint8, torch.qint8, None)]: - assert self.quantized_binary_op is not None - if self.num_tensor_args == 1: - # add/mul scalar - first_arg = self.binary_op_node.args[0] - cache_for_no_tensor_check: Dict[Node, bool] = dict() - if isinstance(first_arg, Node) and ( - not all_node_args_have_no_tensors( - first_arg, modules, cache_for_no_tensor_check)): - quantized_index = 0 - else: - quantized_index = 1 + pass - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_function', self.quantized_binary_op, - load_arg(quantized=[quantized_index])(self.binary_op_node.args), - self.binary_op_node.kwargs - ), - self.binary_op_node) - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert 
activation_post_process is not None - scale, zero_point = activation_post_process.calculate_qparams() # type: ignore[operator] - scale = float(scale) - zero_point = int(zero_point) - scale_arg, zero_point_arg = \ - create_qparam_nodes( - node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) - kwargs = {**self.binary_op_node.kwargs} - add_args = (*load_arg(quantized=activation_dtype(qconfig))(self.binary_op_node.args), scale_arg, zero_point_arg) - op = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_function', self.quantized_binary_op, add_args, kwargs), - self.binary_op_node) - return op - else: - assert dtypes == (torch.float16, torch.float16, None) - # TODO (refactor) this is duplicated, maybe have a helper function - if self.relu_node: - op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16,), {} - ) - else: - # leave the op unquantized if the dtype,reference combination is not supported - warnings.warn( - "dtype combination: {} is not " - "supported by {} for is_reference={}. " - "Supported non-reference dtype combinations are: {} " - "".format(dtypes, - self.binary_op, - is_reference, - binary_op_supported_dtypes[self.binary_op] - ) - ) - if self.relu_node: - op_out = quantized_graph.node_copy(self.binary_op_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - - -@register_quant_pattern(torch.cat) class CatQuantizeHandler(QuantizeHandler): - def is_general_tensor_value_op(self) -> bool: - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not self.all_node_args_are_tensors: - return NotImplemented - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the first argument is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = list(load_arg(quantized=torch.float)(node.args)) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, - activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) + pass -# handle conv, maybe followed by relu 
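# The registrations just below write fused patterns outermost-op-first and are
# matched in reverse, e.g. (torch.nn.ReLU, torch.nn.Conv2d) matches relu(conv(x))
# and the root of the match is the last element (consistent with
# _default_root_node_getter added earlier in this diff). A small illustrative
# helper (assumed name, not part of the diff) that flattens such a reversed
# pattern back into execution order:
import torch

def pattern_execution_order(pattern):
    # (ReLU, Conv2d)                -> [Conv2d, ReLU]
    # (ReLU, (BatchNorm2d, Conv2d)) -> [Conv2d, BatchNorm2d, ReLU]
    if not isinstance(pattern, tuple):
        return [pattern]
    outer, *inputs = pattern
    order = []
    for sub in inputs:
        order.extend(pattern_execution_order(sub))
    order.append(outer)
    return order

assert pattern_execution_order(
    (torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))
) == [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU]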
-# NB: matching order is reversed, that is we match from the bottom of this list to the beginning -@register_quant_pattern(torch.nn.Conv1d) -@register_quant_pattern(torch.nn.Conv2d) -@register_quant_pattern(torch.nn.Conv3d) -@register_quant_pattern(torch.nn.functional.conv1d) -@register_quant_pattern(torch.nn.functional.conv2d) -@register_quant_pattern(torch.nn.functional.conv3d) -# TODO: add qat.Conv1d -@register_quant_pattern(torch.nn.qat.Conv2d) -@register_quant_pattern(torch.nn.qat.Conv3d) -@register_quant_pattern(torch.nn.intrinsic.ConvReLU1d) -@register_quant_pattern(torch.nn.intrinsic.ConvReLU2d) -@register_quant_pattern(torch.nn.intrinsic.ConvReLU3d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBn1d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBn2d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBn3d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU1d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU2d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvBnReLU3d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU2d) -@register_quant_pattern(torch.nn.intrinsic.qat.ConvReLU3d) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv1d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.conv3d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv1d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv2d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.conv3d)) -# just for error checks -@register_quant_pattern((torch.nn.ReLU, torch.nn.Conv1d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.Conv2d)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.Conv3d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv2d)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.Conv3d)) -# TODO: rename Relu -> ReLU to be more consistent with other classes +# TODO: remove this class class ConvReluQuantizeHandler(QuantizeHandler): - def __init__(self, node: Node, modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.relu_node = None - if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ - (node.op == 'call_module' and isinstance(modules[str(node.target)], torch.nn.ReLU)): - self.relu_node = node - node = node.args[0] # type: ignore[assignment] - self.conv_node = node - if node.op == "call_module": - self.conv = modules[str(self.conv_node.target)] - elif node.op == "call_function": - self.conv = node.target # type: ignore[assignment] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # Supported combinations are: - # quant_type | activation (compute_type) | weight - # static quint8 qint8 - - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.quint8, torch.qint8, None), - ] - - # TODO: is_reference option for conv module - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if not is_reference and dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Conv " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - if 
self.relu_node: - conv_out = quantized_graph.node_copy(self.conv_node, load_arg(quantized=torch.float)) - relu_args = [conv_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - - activation_int8_quantized = activation_is_int8_quantized(qconfig) - - if self.conv_node.op == 'call_module': - # note that relu should already be fused into conv module in the fusion step - assert self.relu_node is None, 'conv module and relu fusion is not executed, ' \ - 'please make sure to run fusion before prepare' - output_activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert output_activation_post_process is not None + pass - # We'll always produce reference pattern for torch.nn.Conv*d, - # will remove the else branch after we migrated all use cases - if is_reference or \ - type(self.conv) in [torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d] and \ - dtypes in [(torch.quint8, torch.qint8, None)]: - # produce dequant - float_op - quant pattern - dtype = torch.float - if activation_int8_quantized: - dtype = activation_dtype(qconfig) - activation = load_arg(quantized=dtype)(self.conv_node.args[0]) - args = load_arg(quantized=torch.float)(self.conv_node.args) - # Get the float conv and attach quantization scheme and quantization - # parameters of weight to the module - # and qparam is a dictionary of - # {"qscheme": ..., "scale": ..., "zero_point": ...} for per tensor quantization or - # {"qscheme": ..., "scale": ..., "zero_point": ..., "axis": ...} for per channel quantization - float_conv = self.conv - fused_conv = None - if isinstance( - float_conv, - QAT_CONV_MODULE_CLASSES): - # case 1. converting qat conv module to - # a float conv module, we need to attch - # weight fake_quant to the conv module, - # weight fake_quant is assumed to be run during - # QAT so we don't need to run it again here - float_conv = self.conv.to_float() # type: ignore[operator] - # change qat conv to conv - parent_name, name = _parent_name(self.conv_node.target) - setattr(modules[parent_name], name, float_conv) - if isinstance(float_conv, torch.nn.intrinsic._FusedModule): - fused_conv = float_conv - float_conv = float_conv[0] - weight_post_process = self.conv.weight_fake_quant - else: - # case 2. 
converting a conv module/fused conv module - # to float conv module, we need to attach - # weight observer to the conv module and run it - # with conv weight - if isinstance(float_conv, torch.nn.intrinsic._FusedModule): - fused_conv = float_conv - float_conv = float_conv[0] # type: ignore[index] - assert qconfig is not None - weight_post_process = qconfig.weight() - # run weight observer - weight_post_process(float_conv.weight) # type: ignore[operator] - weight_qparams = get_qparam_dict(weight_post_process) - # hardcoded for now, TODO: expose the api to user, - # we can have a map from module to reference module - # and allow user to register new ones - qconv_cls = get_static_quant_module_class( - type(float_conv), is_reference=True) - ref_conv = qconv_cls.from_float(float_conv, weight_qparams) # type: ignore[attr-defined] - # if the parent is a fused conv (Sequential), we can replace the first - # item to ref conv, otherwise we can update - # the conv instance in the module tree - if fused_conv is not None: - fused_conv[0] = ref_conv - else: - parent_name, name = _parent_name(self.conv_node.target) - setattr(modules[parent_name], name, ref_conv) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_module', self.conv_node.target, args, {}), - self.conv_node) - if output_activation_post_process: - op_out = quantize_node( - op_out, - output_activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - return op_out - else: - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - # 1. attach activation post process to module - self.conv.activation_post_process = output_activation_post_process - # 2. 
select quantized class - qconv_cls = get_static_quant_module_class( - type(self.conv), additional_static_quant_mapping, is_reference=is_reference) - quantized = qconv_cls.from_float(self.conv) - parent_name, name = _parent_name(self.conv_node.target) - setattr(modules[parent_name], name, quantized) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - self.conv_node.target, - (load_arg(quantized=torch.quint8)(self.conv_node.args[0]),), - {}, - ), - self.conv_node) - else: # call_function - assert self.conv_node.op == "call_function" - if is_reference: - # make sure the input and weight are quantized to torch.quint8, torch.qint8, respectively - load_arg(quantized={0: torch.quint8, 1: torch.qint8})(self.conv_node.args) - args = load_arg(quantized=torch.float)(self.conv_node.args) - kwargs = load_arg(quantized=torch.float)(self.conv_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", self.conv, args, kwargs), - self.conv_node) - if self.relu_node: - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - - if activation_int8_quantized: - root_module = modules[''] - act_post_process_name = self.relu_node.name if self.relu_node else self.conv_node.name - act_post_process_node = self.relu_node if self.relu_node else self.conv_node - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - return quantize_node( - op_out, - activation_post_process, - act_post_process_node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - else: - # output for dynamically quantized conv op is not quantized - return op_out - else: - assert len(self.conv_node.args) >= 7, \ - "only conv2d calls with all arguments specified is supported right now in is_reference=False option" - # make sure the input and weight are quantized to torch.quint8, torch.qint8, respectively - args = load_arg(quantized={0: torch.quint8, 1: torch.qint8})(self.conv_node.args) - # pack weight - weight = load_arg(quantized=torch.qint8)(self.conv_node.args[1]) - other_args = load_arg(quantized=torch.float)(self.conv_node.args[2:]) - bias, stride, padding, dilation, groups = other_args - if self.conv == torch.nn.functional.conv1d: - # F.conv1d can take `int` as well as `list[int]` for stride, - # padding, dilation, but the prepack op cannot. Convert - # these to lists if needed. 
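# Tiny illustrative helper (assumed name, not part of the diff) for the
# normalization described in the comment above: F.conv1d accepts either an int
# or a list of ints for stride/padding/dilation, while the prepack op only
# accepts lists.
from typing import List, Union

def _as_int_list(value: Union[int, List[int]]) -> List[int]:
    return [value] if isinstance(value, int) else value

# _as_int_list(1)   -> [1]
# _as_int_list([2]) -> [2]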
- stride = [stride] if isinstance(stride, int) else stride - padding = [padding] if isinstance(padding, int) else padding - dilation = [dilation] if isinstance(dilation, int) else dilation - prepack_args = (weight, bias, stride, padding, dilation, groups) - prepack_op = get_qconv_prepack_op(self.conv) - packed_weight = quantized_graph.create_node( - "call_function", prepack_op, prepack_args, {}) - assert activation_int8_quantized, \ - "currently only static quantization is supported for conv" - # construct conv input - if activation_int8_quantized: - qconv_op = get_qconv_op(self.conv, self.relu_node is not None) - conv_input = load_arg(quantized=torch.quint8)(self.conv_node.args[0]) - - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - - scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) - scale_node, zero_point_node = \ - create_qparam_nodes( - self.conv_node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) - qconv_args = (conv_input, packed_weight, scale_node, zero_point_node) - kwargs = load_arg(quantized=torch.float)(self.conv_node.kwargs) - op = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_function', qconv_op, qconv_args, kwargs), - self.conv_node) - # Store the name of the fused op to get the path of node after fusion as well. - # TODO: may need to change the key to Node regenerate the map in each transformation, - # since we might not be able to rely on the name - node_name_to_scope[op.name] = node_name_to_scope[self.conv_node.name] - return op - else: - # conv2d_dyanmic branch - raise Exception("Only static quant is supported for conv") - -@register_quant_pattern(torch.nn.Linear) -@register_quant_pattern(torch.nn.functional.linear) -@register_quant_pattern(torch.nn.qat.Linear) -@register_quant_pattern(torch.nn.intrinsic.LinearReLU) -@register_quant_pattern(torch.nn.intrinsic.qat.LinearReLU) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.functional.linear)) -@register_quant_pattern((torch.nn.ReLU, torch.nn.functional.linear)) -# for error checks -@register_quant_pattern((torch.nn.ReLU, torch.nn.Linear)) -@register_quant_pattern((torch.nn.functional.relu, torch.nn.Linear)) +# TODO: remove this class class LinearReLUQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.relu_node = None - if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ - (node.op == 'call_module' and isinstance(modules[str(node.target)], torch.nn.ReLU)): - self.relu_node = node - node = node.args[0] # type: ignore[assignment] - self.linear_node = node - if node.op == 'call_module': - self.linear = modules[str(self.linear_node.target)] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - # Supported combinations are: - # quant_type | activation (compute_type) | weight - # static quint8 qint8 - # dynamic float32 (quint8) qint8 - # weight_only float32 float16 - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.quint8, torch.qint8, None), - (torch.float32, torch.qint8, torch.quint8), - 
(torch.float32, torch.float16, None), - # static float16 quantization - (torch.float16, torch.float16, None), - ] - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if not is_reference and dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Linear " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - if self.relu_node: - op_out = quantized_graph.node_copy(self.linear_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - return quantized_graph.node_copy(node, load_arg(quantized=None)) - - activation_int8_quantized = activation_is_int8_quantized(qconfig) - activation_statically_quantized = activation_is_statically_quantized(qconfig) - weight_dtype = dtypes[1] - if self.linear_node.op == 'call_module': - - output_activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) + pass - # note that relu should already be fused into linear modul in the fusion step - assert self.relu_node is None, 'linear module and relu fusion is not executed, ' \ - 'please make sure to run fusion before prepare' - # we'll always produce reference pattern for the following modules - # will remove the else branch after we migrated all use cases - module_allowlist = [ - torch.nn.Linear, - torch.nn.qat.Linear, - torch.nn.intrinsic.modules.fused.LinearReLU, - torch.nn.intrinsic.qat.modules.linear_relu.LinearReLU - ] - if is_reference or type(self.linear) in module_allowlist and dtypes in [(torch.quint8, torch.qint8, None)]: - # produce dequant - float_op - quant pattern - dtype = torch.float - if activation_int8_quantized: - dtype = activation_dtype(qconfig) - activation = load_arg(quantized=dtype)(self.linear_node.args[0]) - args = load_arg(quantized=torch.float)(self.linear_node.args) - - # Get the float linear and attach qscheme and qparams the the module - float_linear = self.linear - fused_linear = None - if isinstance(float_linear, (torch.nn.qat.Linear, torch.nn.intrinsic.qat.LinearReLU)): - float_linear = float_linear.to_float() - # change qat linear to linear - parent_name, name = _parent_name(self.linear_node.target) - setattr(modules[parent_name], name, float_linear) - # Attach weight fake quant to the linear module - if isinstance(float_linear, torch.nn.intrinsic.LinearReLU): - fused_linear = float_linear - float_linear = float_linear[0] - weight_post_process = self.linear.weight_fake_quant - else: - if isinstance(float_linear, torch.nn.intrinsic.LinearReLU): - fused_linear = float_linear - float_linear = self.linear[0] # type: ignore[index] - # Attach the weight observer to the module - weight_post_process = qconfig.weight() # type: ignore[union-attr] - - # Run weight observer - # TODO: This is currently a hack for QAT to get the right shapes for scale and zero point. 
- # In the future, we should require the user to calibrate the model after calling prepare - weight_post_process(float_linear.weight) # type: ignore[operator] - - weight_qparams = get_qparam_dict(weight_post_process) - # TODO: include the configuration in backend_config_dict - # we can have a map from module to reference module - # and allow user to register new ones - qlinear_cls = get_static_quant_module_class( - type(float_linear), is_reference=True) - ref_linear = qlinear_cls.from_float(float_linear, weight_qparams) - - # if the parent is a fused linear (Sequential), we can replace the first - # item to ref linear, otherwise we can update - # the linear instance in the module tree - if fused_linear is not None: - fused_linear[0] = ref_linear - else: - parent_name, name = _parent_name(self.linear_node.target) - setattr(modules[parent_name], name, ref_linear) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ('call_module', self.linear_node.target, args, {}), - self.linear_node) - if output_activation_post_process: - op_out = quantize_node( - op_out, - output_activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - return op_out - # non-reference option - else: - # 1. attach output activation post process to linear module - if output_activation_post_process: - self.linear.activation_post_process = output_activation_post_process - - # 2. select corresponding quantized linear class for the float linear class - if activation_int8_quantized: - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - qlinear = get_static_quant_module_class( - type(self.linear), additional_static_quant_mapping) - else: - assert dtypes in [ - (torch.float32, torch.qint8, torch.quint8), - (torch.float32, torch.float16, None), - ], f"dtype {dtypes} not supported yet" - additional_dynamic_quant_mapping = convert_custom_config_dict.get("dynamic", {}) - qlinear = get_dynamic_quant_module_class(type(self.linear), additional_dynamic_quant_mapping) - - quantized = qlinear.from_float(self.linear) - parent_name, name = _parent_name(self.linear_node.target) - setattr(modules[parent_name], name, quantized) - # activation needs to be quantized for static quantization - dtype = torch.float - if activation_int8_quantized: - dtype = activation_dtype(qconfig) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - self.linear_node.target, - (load_arg(quantized=dtype)(self.linear_node.args[0]),), {}, - ), - self.linear_node) - else: # call_function - assert self.linear_node.op == 'call_function' - if is_reference: - quantized_input_dtypes = [torch.float, torch.float] - if activation_int8_quantized: - quantized_input_dtypes[0] = torch.quint8 - if weight_is_statically_quantized(qconfig): - quantized_input_dtypes[1] = torch.qint8 - args = load_arg(quantized=quantized_input_dtypes)(self.linear_node.args) - args = load_arg(quantized=torch.float)(self.linear_node.args) - kwargs = load_arg(quantized=torch.float)(self.linear_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.linear, args, kwargs), - self.linear_node) - if self.relu_node: - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, 
tuple(relu_args), relu_kwargs), - self.relu_node) - - if activation_statically_quantized: - # quantize output for statically quantized linear op - root_module = modules[''] - act_post_process_name = self.relu_node.name if self.relu_node else self.linear_node.name - act_post_process_node = self.relu_node if self.relu_node else self.linear_node - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - return quantize_node( - op_out, - activation_post_process, - act_post_process_node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - else: - # output for dynamically quantized linear op is not quantized - return op_out - else: # non-reference option - # prepacking weights for static int8 quant and dynamic quant - if dtypes != (torch.float16, torch.float16, None): - # linear args - # (x, weight, bias, ...) - # TODO: the name should be weight is int8 quantized - weight_quantized = weight_is_statically_quantized(qconfig) - dtype = weight_dtype if weight_quantized else torch.float - linear_weight = load_arg(quantized=dtype)(self.linear_node.args[1]) - - # get other arguments - kwargs = {**load_arg(quantized=torch.float)(self.linear_node.kwargs)} - # all args after bias, including bias - other_args = load_arg(quantized=torch.float)(self.linear_node.args[2:]) - # bias might be either positional, or a keyword argument - if len(self.linear_node.args) > 2: - bias = load_arg(quantized=torch.float)(self.linear_node.args[2]) - other_args = other_args[1:] # remove the bias argument - else: - bias = kwargs.pop('bias', None) - - prepack_args = (linear_weight, bias) - prepack_op = get_linear_prepack_op_for_dtype(weight_dtype) - packed_weight = quantized_graph.create_node( - 'call_function', prepack_op, prepack_args, {}) - # construct linear input - if activation_int8_quantized: - qlinear_op = torch.ops.quantized.linear_relu if self.relu_node else torch.ops.quantized.linear - linear_input = load_arg(quantized=torch.quint8)(self.linear_node.args[0]) - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - scale, zero_point, _ = get_per_tensor_qparams(activation_post_process) - scale_node, zero_point_node = \ - create_qparam_nodes( - self.linear_node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) - - qlinear_args = (linear_input, packed_weight, scale_node, zero_point_node) - op = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", qlinear_op, qlinear_args, kwargs), - self.linear_node) - # Store the name of the fused op to get the path of node after fusion as well. 
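# A minimal sketch (hypothetical names and values) of the bookkeeping mentioned
# in the comment above: node_name_to_scope maps a node name to the fully
# qualified module path and module type it came from, and a node created to
# replace an existing one simply inherits that entry so later passes can still
# resolve its path after fusion.
from typing import Dict, Tuple

import torch

node_name_to_scope: Dict[str, Tuple[str, type]] = {
    "linear_1": ("encoder.fc", torch.nn.Linear),  # hypothetical traced entry
}

def inherit_scope(new_node_name: str, old_node_name: str) -> None:
    node_name_to_scope[new_node_name] = node_name_to_scope[old_node_name]

inherit_scope("quantized_linear_1", "linear_1")
assert node_name_to_scope["quantized_linear_1"] == ("encoder.fc", torch.nn.Linear)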
- # TODO: may need to change the key to Node regenerate the map in each transformation, - # since we might not be able to rely on the name - node_name_to_scope[op.name] = node_name_to_scope[self.linear_node.name] - return op - elif dtypes in [(torch.float32, torch.qint8, torch.quint8), - (torch.float32, torch.float16, None)]: - # choose linear dynamic or linear dynamic fp16 op based on weight dtype - if weight_dtype == torch.qint8: - if self.relu_node: - qlinear_op = torch.ops.quantized.linear_relu_dynamic - else: - qlinear_op = torch.ops.quantized.linear_dynamic - else: - if self.relu_node: - qlinear_op = torch.ops.quantized.linear_relu_dynamic_fp16 - else: - qlinear_op = torch.ops.quantized.linear_dynamic_fp16 - - linear_input = load_arg(quantized=torch.float)(self.linear_node.args[0]) - qlinear_args = (linear_input, packed_weight) # type: ignore[assignment] - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", qlinear_op, qlinear_args, kwargs), - self.linear_node) - # Store the name of the dynamic op to get the path of node after replacement as well. - # TODO: may need to change the key to Node regenerate the map in each transformation, - # since we might not be able to rely on the name - node_name_to_scope[op_out.name] = node_name_to_scope[self.linear_node.name] - return op_out - else: - assert dtypes == (torch.float16, torch.float16, None) - # TODO (refactor) this is duplicated, maybe have a helper function - if self.relu_node: - op_out = quantized_graph.node_copy(self.linear_node, load_arg(quantized=torch.float)) - relu_args = [op_out] - relu_args.extend(load_arg(quantized=torch.float)(self.relu_node.args[1:])) - relu_kwargs = load_arg(quantized=torch.float)(self.relu_node.kwargs) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", torch.nn.functional.relu, tuple(relu_args), relu_kwargs), - self.relu_node) - else: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16), {}) - -@register_quant_pattern(torch.nn.BatchNorm2d) -@register_quant_pattern(torch.nn.BatchNorm3d) -@register_quant_pattern(torch.nn.intrinsic.BNReLU2d) -@register_quant_pattern(torch.nn.intrinsic.BNReLU3d) +# TODO: remove this class class BatchNormQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - assert node.op == 'call_module' - self.bn_node = node - self.bn = modules[str(self.bn_node.target)] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - # 1. 
attach activation post process to module - output_activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert output_activation_post_process is not None - if is_reference: - # produce dequant - float_op - quant pattern - dtype = activation_dtype(qconfig) - activation = load_arg(quantized=dtype)(self.bn_node.args[0]) - args = load_arg(quantized=torch.float)(self.bn_node.args) - op_out = create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_module", self.bn_node.target, args, {}), - self.bn_node) - if output_activation_post_process: - op_out = quantize_node( - op_out, - output_activation_post_process, - node, - modules, - quantized_graph, - node_name_to_scope, - is_input=False) - return op_out - else: - self.bn.activation_post_process = output_activation_post_process - qbn_cls = get_static_quant_module_class(type(self.bn), additional_static_quant_mapping) - quantized = qbn_cls.from_float(self.bn) - parent_name, name = _parent_name(self.bn_node.target) - setattr(modules[parent_name], name, quantized) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - self.bn_node.target, - load_arg(quantized=[0])(self.bn_node.args), - load_arg(quantized=torch.float)(self.bn_node.kwargs), - ), - self.bn_node) + pass -@register_quant_pattern(torch.nn.qat.Embedding) -@register_quant_pattern(torch.nn.qat.EmbeddingBag) -@register_quant_pattern(torch.nn.Embedding) -@register_quant_pattern(torch.nn.EmbeddingBag) +# TODO: remove this class class EmbeddingQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - - def input_output_observed(self) -> bool: - return False - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # Supported combinations are: - # quant_type | activation | weight | activation_compute_type - # weight_only | float32 | quint8 | None - # weight_only | float32 | quint4x2 | None - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.float32, torch.quint8, None), - (torch.float32, torch.quint4x2, None), - ] - assert node.op == 'call_module' - emb_node = node - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Embedding/EmbeddingBag, " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - return quantized_graph.node_copy(node, load_arg(quantized=None)) - - emb = modules[str(emb_node.target)] - qemb = get_static_quant_module_class(type(emb)) - quantized = qemb.from_float(emb) - parent_name, name = _parent_name(emb_node.target) - setattr(modules[parent_name], name, quantized) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - emb_node.target, - load_arg(quantized=torch.float)(emb_node.args), - load_arg(quantized=torch.float)(emb_node.kwargs), - ), - emb_node) + pass -# TODO (maybe): merge with embedding quantize handler -@register_quant_pattern(torch.nn.GRUCell) -@register_quant_pattern(torch.nn.LSTMCell) -@register_quant_pattern(torch.nn.RNNCell) -@register_quant_pattern(torch.nn.LSTM) +# TODO: remove this class class 
RNNDynamicQuantizeHandler(QuantizeHandler): - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - - def input_output_observed(self) -> bool: - return False - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # Supported combinations are: - # quant_type | activation | weight | activation_compute_type - # dynamic | float32 | qint8 | quint8 - # dynamic | float32 | float16 | None - # tuple (activation_dtype, weight_dtype, compute_dtype) - supported_dtypes = [ - (torch.float32, torch.qint8, torch.quint8), - (torch.float32, torch.float16, None), - ] - assert node.op == 'call_module' - dtypes = get_qconfig_dtypes(qconfig) - # leave the op unquantized if the dtype combination is not supported - if dtypes not in supported_dtypes: - warnings.warn( - "dtype combination: {} is not " - "supported by Embedding/EmbeddingBag, " - "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) - return quantized_graph.node_copy(node, load_arg(quantized=None)) - - module = modules[str(node.target)] - qmodule_cls = get_dynamic_quant_module_class(type(module)) - qmodule = qmodule_cls.from_float(module) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, qmodule) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - node.target, - load_arg(quantized=torch.float)(node.args), - load_arg(quantized=torch.float)(node.kwargs), - ), - node) + pass -ARGS_TO_SKIP = { - torch._ops.ops.quantized.hardswish: ['inplace'], - torch._ops.ops.quantized.elu: ['inplace'], - torch._ops.ops.quantized.dropout: ['inplace'], - torch._ops.ops.quantized.instance_norm: - ['running_mean', 'running_var', 'use_input_stats', 'momentum'], -} -@register_quant_pattern(torch.nn.ConvTranspose1d) -@register_quant_pattern(torch.nn.ConvTranspose2d) -@register_quant_pattern(torch.nn.ELU) -@register_quant_pattern(torch.nn.LeakyReLU) -@register_quant_pattern(torch.nn.Hardswish) -@register_quant_pattern(torch.nn.InstanceNorm1d) -@register_quant_pattern(torch.nn.InstanceNorm2d) -@register_quant_pattern(torch.nn.InstanceNorm3d) -@register_quant_pattern(torch.nn.LayerNorm) -@register_quant_pattern(torch.nn.SiLU) -@register_quant_pattern(torch.nn.Mish) -@register_quant_pattern(torch.nn.Dropout) -# we currently only support reference patterns for these ops so they have been removed -# until they receive a proper fp16 kernel. To use the reference pattern, use a custom qconfig -# @register_quant_pattern(torch.nn.GELU) -# @register_quant_pattern(torch.nn.Softmax) -@register_quant_pattern(torch.nn.functional.elu) -@register_quant_pattern(torch.nn.functional.hardswish) -@register_quant_pattern(torch.nn.functional.instance_norm) -@register_quant_pattern(torch.nn.functional.layer_norm) -@register_quant_pattern(torch.nn.functional.leaky_relu) -@register_quant_pattern(torch.nn.functional.silu) -@register_quant_pattern(torch.nn.functional.mish) -@register_quant_pattern(torch.nn.functional.dropout) -# we currently only support reference patterns for these ops so they have been removed -# until they receive a proper fp16 kernel. 
To use the reference pattern, use a custom qconfig -# @register_quant_pattern(torch.nn.functional.gelu) -# @register_quant_pattern(torch.nn.functional.softmax) -@register_quant_pattern(torch.sum) +# TODO: remove this class class DefaultNodeQuantizeHandler(QuantizeHandler): """ Common quantized op, first input and first output will be quantized """ - def __init__( - self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - if node.op == "call_function" or node.op == "call_method": - self.op = node.target - elif node.op == "call_module": - self.op = type(modules[str(node.target)]) - - def is_output_quantized(self, qconfig): - dtypes = get_qconfig_dtypes(qconfig) - return self.op in default_op_supported_dtypes and \ - dtypes in default_op_supported_dtypes[self.op] - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not self.all_node_args_are_tensors: - return NotImplemented - assert node.op in ['call_module', 'call_function'], 'Only call_module and ' + \ - 'call_function are handled in DefaultNode' - if convert_custom_config_dict is None: - convert_custom_config_dict = {} - additional_static_quant_mapping = convert_custom_config_dict.get("static", {}) - - dtypes = get_qconfig_dtypes(qconfig) - if not is_reference and dtypes not in default_op_supported_dtypes[self.op]: - warnings.warn( - "dtype combination: {} is not " - "supported by {} " - "supported dtype combinations are: {}".format(dtypes, self.op, default_op_supported_dtypes[self.op])) - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - # TODO: make helper functions for (torch.quint8, torch.qint8, None) - if not is_reference: - if dtypes in [(torch.quint8, torch.qint8, None)]: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - if node.op == 'call_module': - module = modules[str(node.target)] - module.activation_post_process = activation_post_process - quantized_module_cls = get_static_quant_module_class( - type(module), additional_static_quant_mapping) - quantized_module = quantized_module_cls.from_float(module) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, quantized_module) - return create_node_from_old_node_preserve_meta( - quantized_graph, - ( - 'call_module', - node.target, - load_arg(quantized=[0])(node.args), - load_arg(quantized=torch.float)(node.kwargs), - ), - node) - else: - assert node.op == "call_function" - # call_function - scale, zero_point = activation_post_process.calculate_qparams() # type: ignore[operator] - scale = float(scale) - zero_point = int(zero_point) - scale_arg, zero_point_arg = \ - create_qparam_nodes( - node.name, scale, zero_point, modules, - quantized_graph, node_name_to_scope) + pass - assert not isinstance(node.target, str), "Expecting node.target for " - "call_function to be a function instead of a string" - quantized_op = get_quantized_operator(node.target) - args = load_arg(quantized=[0])(node.args) - kwargs = {**load_arg(quantized=torch.float)(node.kwargs), "output_scale": scale_arg, - "output_zero_point": zero_point_arg} - if quantized_op in ARGS_TO_SKIP: - args_to_skip = ARGS_TO_SKIP[quantized_op] - for arg in args_to_skip: - if arg in kwargs: - kwargs.pop(arg) - return 
create_node_from_old_node_preserve_meta( - quantized_graph, - ("call_function", quantized_op, args, kwargs), # type: ignore[arg-type] - node) - else: - assert dtypes in [(torch.float16, torch.float16, None)] - # Generally fp16 kernels don't exist for fp16 ops - warnings.warn( - "Only reference patterns are currently supported for {dtype} dtype with {op} op" - "".format(dtype=dtypes, op=self.op)) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16), {}) - else: - assert is_reference - # We can produce reference for a dtypes including - # (torch.quint8, torch.qint8, torch.qint32, torch.float16) - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = load_arg(quantized=torch.float)(node.args) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - -@register_quant_pattern(torch.nn.Hardsigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.nn.functional.hardsigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern('hardsigmoid', default_affine_fixed_qparams_observer) -@register_quant_pattern('hardsigmoid_', default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.nn.Sigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.sigmoid, default_affine_fixed_qparams_observer) -@register_quant_pattern('sigmoid', default_affine_fixed_qparams_observer) -@register_quant_pattern('sigmoid_', default_affine_fixed_qparams_observer) -@register_quant_pattern(torch.nn.Tanh, default_symmetric_fixed_qparams_observer) -@register_quant_pattern(torch.tanh, default_symmetric_fixed_qparams_observer) -@register_quant_pattern('tanh', default_symmetric_fixed_qparams_observer) -@register_quant_pattern('tanh_', default_symmetric_fixed_qparams_observer) +# TODO: remove this class class FixedQParamsOpQuantizeHandler(QuantizeHandler): - def __init__(self, - node: Node, - modules: Dict[str, torch.nn.Module]): - super().__init__(node, modules) - self.node = node - - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - # FixQParamOps are the same as CopyNode in int8 quantization - return activation_dtype(qconfig) in [torch.quint8, torch.qint8] - - # some qhandlers override the activations constructor - def get_activation_ctr(self, qconfig, pattern, is_training) -> Optional[Callable]: - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.quint8: - return get_default_output_activation_post_process_map(is_training).get( - pattern, qconfig.activation) - else: - return qconfig.activation + pass - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - if not is_reference: - dtypes = get_qconfig_dtypes(qconfig) - if dtypes 
== (torch.float16, torch.float16, None): - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantized_graph.create_node( - "call_method", "to", (op_out, torch.float16,), {} - ) - else: - return quantized_graph.node_copy(node, load_arg(quantized=None)) - else: - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = load_arg(quantized=torch.float)(node.args) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - -@register_quant_pattern(torch.nn.AdaptiveAvgPool1d) -@register_quant_pattern(torch.nn.AdaptiveAvgPool2d) -@register_quant_pattern(torch.nn.AdaptiveAvgPool3d) -@register_quant_pattern(torch.nn.AvgPool1d) -@register_quant_pattern(torch.nn.AvgPool2d) -@register_quant_pattern(torch.nn.AvgPool3d) -@register_quant_pattern(torch.nn.Hardtanh) -@register_quant_pattern(torch.nn.MaxPool1d) -@register_quant_pattern(torch.nn.MaxPool2d) -@register_quant_pattern(torch.nn.MaxPool3d) -@register_quant_pattern(torch.nn.ReLU) -@register_quant_pattern(torch.nn.ReLU6) -@register_quant_pattern(torch.adaptive_avg_pool1d) -@register_quant_pattern(torch.nn.functional.adaptive_avg_pool2d) -@register_quant_pattern(torch.nn.functional.adaptive_avg_pool3d) -@register_quant_pattern(torch.nn.functional.hardtanh) -@register_quant_pattern(torch.nn.functional.hardtanh_) -@register_quant_pattern(torch.nn.functional.interpolate) -@register_quant_pattern(torch.nn.functional.max_pool1d) -@register_quant_pattern(torch.nn.functional.max_pool2d) -@register_quant_pattern(torch.nn.functional.max_pool3d) -@register_quant_pattern(torch.nn.functional.relu) -@register_quant_pattern(torch.nn.functional.relu6) -@register_quant_pattern(torch.avg_pool1d) -@register_quant_pattern(torch._C._nn.avg_pool2d) -@register_quant_pattern(torch._C._nn.avg_pool3d) -@register_quant_pattern(torch.clamp) -@register_quant_pattern(torch.flatten) -@register_quant_pattern(torch.mean) -@register_quant_pattern(operator.floordiv) -@register_quant_pattern('clamp') -@register_quant_pattern('mean') -@register_quant_pattern('relu') -@register_quant_pattern('relu_') +# TODO: remove class CopyNodeQuantizeHandler(QuantizeHandler): - """ Operators that works on both float and quantized input - if input is quantized, the output Tensor shares - the same quantization parameter with input. - These ops will do computation on the input Tensor, e.g. average pool, so we will - insert extra observer/fake_quant for the output of these operators. 
- TODO: maybe rename this to TensorValueOpQuantizeHandler - """ - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - return True - - def is_general_tensor_value_op(self) -> bool: - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: + pass - is_call_function, is_call_method, is_call_module = check_node(node, modules) - if is_reference or (is_call_function or is_call_method or is_call_module): - # when activation dtype is torch.float, the node does not require - # observation - # e.g. dynamic quantization or weight_only quantization - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - # make sure the input is quantized to act_dtype - load_arg(quantized={0: act_dtype})(node.args) - args = list(load_arg(quantized=torch.float)(node.args)) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, - activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - else: - return quantized_graph.node_copy(node, load_arg(quantized=None)) - -class CustomModuleQuantizeHandler(QuantizeHandler): - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - """ Convert a float custom module to quantized custom module - """ - assert node.op == 'call_module' - assert convert_custom_config_dict is not None - custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", None) - assert custom_module_class_mapping is not None - observed_custom_module = modules[str(node.target)] - if activation_is_statically_quantized(qconfig): - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - assert activation_post_process is not None - observed_custom_module.activation_post_process = activation_post_process - quantized_custom_module_class = get_swapped_custom_module_class( - observed_custom_module, custom_module_class_mapping, qconfig) - quantized_custom_module = \ - quantized_custom_module_class.from_observed(observed_custom_module) - parent_name, name = _parent_name(node.target) - setattr(modules[parent_name], name, quantized_custom_module) - # hardcoded the quntized input to be None (take whatever is in the environemnt), - # we can extend this - # if there is a need, e.g. 
get the indexes of quantized inputs from some - # module attribute like module._QUANTIZED_INPUT_INDEXES - return quantized_graph.node_copy(node, load_arg(quantized=None)) - -@register_quant_pattern(torch.nn.Identity) -@register_quant_pattern(torch.transpose) -@register_quant_pattern(torch.repeat_interleave) -@register_quant_pattern(torch.squeeze) -@register_quant_pattern(torch.stack) -@register_quant_pattern(torch.unsqueeze) -@register_quant_pattern('contiguous') -@register_quant_pattern('detach') -@register_quant_pattern('detach_') -@register_quant_pattern('permute') -@register_quant_pattern('repeat') -@register_quant_pattern('repeat_interleave') -@register_quant_pattern('reshape') -@register_quant_pattern('resize_') -@register_quant_pattern('shape') -@register_quant_pattern('size') -@register_quant_pattern('squeeze') -@register_quant_pattern('squeeze_') -@register_quant_pattern('transpose') -@register_quant_pattern('unsqueeze') -@register_quant_pattern('unsqueeze_') -@register_quant_pattern('view') +# TODO: remove class GeneralTensorShapeOpQuantizeHandler(QuantizeHandler): - """ Operators that works on both float and quantized input - if input is quantized, the output Tensor shares - the same quantization parameter with input. - These ops only do rearrangement of Tensor values, for - example reshape, or just query the information about Tensor - e.g. size, and we do not insert extra observer/fake_quant - for the output of the operator. - """ - def is_general_tensor_shape_op(self) -> bool: - return True + pass - def should_mark_output_quantized_from_input_quantized_status( - self, - qconfig: QConfigAny - ) -> bool: - return True - - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - # when activation dtype is torch.float, the node does not require - # observation - # e.g. dynamic quantization or weight_only quantization - act_dtype = activation_dtype(qconfig) - if act_dtype == torch.float: - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return op_out - else: - activation_post_process = \ - self._maybe_get_last_node_only_observer(modules) - if activation_post_process is not None: - args = list(load_arg(quantized=torch.float)(node.args)) - kwargs = load_arg(quantized=torch.float)(node.kwargs) - op_out = quantized_graph.node_copy(node, load_arg(quantized=torch.float)) - return quantize_node( - op_out, - activation_post_process, - node, modules, quantized_graph, node_name_to_scope, is_input=False) - else: - return quantized_graph.node_copy(node, load_arg(quantized=torch.float)) +# TODO: not used, can be removed after torch.quantization namespace is deprecated +class CustomModuleQuantizeHandler(QuantizeHandler): + pass +# TODO: not used, can be removed after torch.quantization namespace is deprecated class StandaloneModuleQuantizeHandler(QuantizeHandler): - """ Converts an observed standalone module to quantized standalone module - by calling convert_fx on the observed standalone module. 
- """ - def convert(self, - node: Node, - qconfig: QConfigAny, - modules: Dict[str, torch.nn.Module], - quantized_graph: Graph, - node_name_to_scope: Dict[str, Tuple[str, type]], - load_arg: Callable, - is_reference: bool = False, - convert_custom_config_dict: Dict[str, Any] = None) -> Node: - assert node.op == 'call_module' - convert = torch.ao.quantization.quantize_fx._convert_standalone_module_fx # type: ignore[attr-defined] - # We know that observed standalone module is a GraphModule since - # it's produced by us - observed_standalone_module : GraphModule = modules[str(node.target)] # type: ignore[assignment] - input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs.tolist() # type: ignore[operator] - quantized_standalone_module = convert(observed_standalone_module, is_reference=is_reference) - parent_name, name = _parent_name(node.target) - # update the modules dict - setattr(modules[parent_name], name, quantized_standalone_module) - modules[str(node.target)] = quantized_standalone_module - return quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) + pass diff --git a/torch/ao/quantization/fx/quantization_types.py b/torch/ao/quantization/fx/quantization_types.py deleted file mode 100644 index 859f4b2d456a..000000000000 --- a/torch/ao/quantization/fx/quantization_types.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import Any, Tuple, Union -from torch.fx import Node -from ..utils import Pattern # noqa: F401 - -NodePattern = Union[Tuple[Node, Node], Tuple[Node, Tuple[Node, Node]], Any] - -# This is the Quantizer class instance from torch/quantization/fx/quantize.py. -# Define separately to prevent circular imports. -# TODO(future PR): improve this. -QuantizerCls = Any diff --git a/torch/ao/quantization/fx/quantized_fusion_patterns_and_replacements.py b/torch/ao/quantization/fx/quantized_fusion_patterns_and_replacements.py deleted file mode 100644 index ce23f17db71d..000000000000 --- a/torch/ao/quantization/fx/quantized_fusion_patterns_and_replacements.py +++ /dev/null @@ -1,152 +0,0 @@ -import torch - -def relu_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu(x, inplace=True) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_non_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu(x, inplace=False) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_replacement(x, scale, zero_point): - x = torch.nn.functional.relu(x) - return x - -def relu_method_pattern(x, scale, zero_point): - x = x.dequantize() - x = x.relu() - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_method_replacement(x, scale, zero_point): - x = x.relu() - return x - -def relu_inplace_method_pattern(x, scale, zero_point): - x = x.dequantize() - x = x.relu_() - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu_inplace_method_replacement(x, scale, zero_point): - x = x.relu_() - return x - -def relu6_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu6(x, inplace=True) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu6_non_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.relu6(x, inplace=False) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def relu6_replacement(x, scale, zero_point): - 
x = torch.nn.functional.relu6(x) - return x - - -def hardtanh_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.hardtanh(x, inplace=True) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def hardtanh_non_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.hardtanh(x, inplace=False) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def hardtanh_replacement(x, scale, zero_point): - x = torch.nn.functional.hardtanh(x) - return x - -def hardtanh_inplace_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.nn.functional.hardtanh_(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def hardtanh_inplace_replacement(x, scale, zero_point): - x = torch.nn.functional.hardtanh_(x) - return x - -def min_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.min(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def min_replacement(x, scale, zero_point): - x = torch.min(x) - return x - -def max_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.max(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def max_replacement(x, scale, zero_point): - x = torch.max(x) - return x - -def mean_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.mean(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def mean_replacement(x, scale, zero_point): - x = torch.mean(x) - return x - -def mean_method_pattern(x, scale, zero_point): - x = x.dequantize() - x = x.mean() - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def mean_method_replacement(x, scale, zero_point): - x = x.mean() - return x - -def flatten_pattern(x, scale, zero_point): - x = x.dequantize() - x = torch.flatten(x) - x = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8) - return x - -def flatten_replacement(x, scale, zero_point): - x = torch.flatten(x) - return x - -def _get_all_patterns_and_replacements(): - return [ - (relu_inplace_pattern, relu_replacement), - (relu_non_inplace_pattern, relu_replacement), - (relu_method_pattern, relu_method_replacement), - (relu_inplace_method_pattern, relu_inplace_method_replacement), - (relu6_inplace_pattern, relu6_replacement), - (relu6_non_inplace_pattern, relu6_replacement), - (hardtanh_pattern, hardtanh_replacement), - (hardtanh_non_inplace_pattern, hardtanh_replacement), - (hardtanh_inplace_pattern, hardtanh_inplace_replacement), - (mean_pattern, mean_replacement), - (mean_method_pattern, mean_method_replacement), - ] - - -def get_fbgemm_patterns_and_replacements(): - return _get_all_patterns_and_replacements() - -def get_qnnpack_patterns_and_replacements(): - return _get_all_patterns_and_replacements() diff --git a/torch/ao/quantization/fx/subgraph_rewriter_FORKED_DO_NOT_USE.py b/torch/ao/quantization/fx/subgraph_rewriter_FORKED_DO_NOT_USE.py deleted file mode 100644 index a64b537173a9..000000000000 --- a/torch/ao/quantization/fx/subgraph_rewriter_FORKED_DO_NOT_USE.py +++ /dev/null @@ -1,445 +0,0 @@ -from torch.fx.graph_module import GraphModule -from torch.fx.graph import Graph -from torch.fx.node import Node -from torch.fx._symbolic_trace import symbolic_trace -from torch.fx._compatibility import compatibility - -import copy -from typing import Callable, Dict, List, NamedTuple, Optional, Set -import torch - -@compatibility(is_backward_compatible=True) -class 
Match(NamedTuple): - # Node from which the match was found - anchor: Node - # Maps nodes in the pattern subgraph to nodes in the larger graph - nodes_map: Dict[Node, Node] - -class _SubgraphMatcher: - def __init__(self, pattern: Graph) -> None: - self.pattern = pattern - if len(pattern.nodes) == 0: - raise ValueError("_SubgraphMatcher cannot be initialized with an " - "empty pattern") - # `self.pattern_anchor` is the output Node in `pattern` - self.pattern_anchor = next(iter(reversed(pattern.nodes))) - # Ensure that there is only a single output value in the pattern - # since we don't support multiple outputs - assert len(self.pattern_anchor.all_input_nodes) == 1, \ - "Pattern matching on multiple outputs is not supported" - # Maps nodes in the pattern subgraph to nodes in the larger graph - self.nodes_map: Dict[Node, Node] = {} - - def matches_subgraph_from_anchor(self, anchor: Node) -> bool: - """ - Checks if the whole pattern can be matched starting from - ``anchor`` in the larger graph. - - Pattern matching is done by recursively comparing the pattern - node's use-def relationships against the graph node's. - """ - self.nodes_map = {} - return self._match_nodes(self.pattern_anchor, anchor) - - # Compare the pattern node `pn` against the graph node `gn` - def _match_nodes(self, pn: Node, gn: Node) -> bool: - - # Check if we've already matched these nodes in the current - # traversal - if pn in self.nodes_map: - return self.nodes_map[pn] == gn - - def attributes_are_equal(pn: Node, gn: Node) -> bool: - # Use placeholder and output nodes as wildcards. The - # only exception is that an output node can't match - # a placeholder - if (pn.op == "placeholder" - or (pn.op == "output" and gn.op != "placeholder")): - return True - return pn.op == gn.op and pn.target == gn.target - - # Terminate early if the node attributes are not equal - if not attributes_are_equal(pn, gn): - return False - - # Optimistically mark `pn` as a match for `gn` - self.nodes_map[pn] = gn - - # Traverse the use-def relationships to ensure that `pn` is a true - # match for `gn` - if pn.op == "placeholder": - return True - if (pn.op != "output" - and len(pn.all_input_nodes) != len(gn.all_input_nodes)): - return False - if pn.op == "output": - match_found = any(self._match_nodes(pn.all_input_nodes[0], gn_) - for gn_ in gn.all_input_nodes) - else: - match_found = (len(pn.all_input_nodes) == len(gn.all_input_nodes) - and all(self._match_nodes(pn_, gn_) for pn_, gn_ - in zip(pn.all_input_nodes, gn.all_input_nodes))) - if not match_found: - self.nodes_map.pop(pn) - return False - - return True - - -def _replace_submodules(gm: GraphModule, replacement: torch.nn.Module) -> None: - gm.delete_all_unused_submodules() - - if isinstance(replacement, GraphModule): - replacement.graph.lint() - - def try_get_submodule(mod: torch.nn.Module, target: str) -> Optional[torch.nn.Module]: - try: - mod_match = mod.get_submodule(target) - return mod_match - except AttributeError: - return None - - for node in gm.graph.nodes: - if node.op == "call_module" or node.op == "get_attr": - - gm_submod = try_get_submodule(gm, node.target) - - replacement_submod = try_get_submodule(replacement, node.target) - - # CASE 1: This target already exists as a submodule in our - # result GraphModule. Whether or not it exists in - # `replacement`, the existing submodule takes precedence. - if gm_submod is not None: - continue - - # CASE 2: The target exists as a submodule in `replacement` - # only, so we need to copy it over. 
- elif replacement_submod is not None: - new_submod = copy.deepcopy(getattr(replacement, node.target)) - gm.add_submodule(node.target, new_submod) - - # CASE 3: The target doesn't exist as a submodule in `gm` - # or `replacement` - else: - raise RuntimeError("Attempted to create a \"", node.op, - "\" node during subgraph rewriting " - f"with target {node.target}, but " - "the referenced submodule does not " - "exist in either the original " - "GraphModule `gm` or the replacement" - " GraphModule `replacement`") - - gm.graph.lint() - -@compatibility(is_backward_compatible=True) -def replace_pattern(gm: GraphModule, pattern: Callable, replacement: Callable) -> List[Match]: - """ - Matches all possible non-overlapping sets of operators and their - data dependencies (``pattern``) in the Graph of a GraphModule - (``gm``), then replaces each of these matched subgraphs with another - subgraph (``replacement``). - - Args: - ``gm``: The GraphModule that wraps the Graph to operate on - ``pattern``: The subgraph to match in ``gm`` for replacement - ``replacement``: The subgraph to replace ``pattern`` with - - Returns: - List[Match]: A list of ``Match`` objects representing the places - in the original graph that ``pattern`` was matched to. The list - is empty if there are no matches. ``Match`` is defined as: - - .. code-block:: python - - class Match(NamedTuple): - # Node from which the match was found - anchor: Node - # Maps nodes in the pattern subgraph to nodes in the larger graph - nodes_map: Dict[Node, Node] - - Examples: - - .. code-block:: python - - import torch - from torch.fx import symbolic_trace, subgraph_rewriter - - class M(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, w1, w2): - m1 = torch.cat([w1, w2]).sum() - m2 = torch.cat([w1, w2]).sum() - return x + torch.max(m1) + torch.max(m2) - - def pattern(w1, w2): - return torch.cat([w1, w2]).sum() - - def replacement(w1, w2): - return torch.stack([w1, w2]) - - traced_module = symbolic_trace(M()) - - subgraph_rewriter.replace_pattern(traced_module, pattern, replacement) - - The above code will first match ``pattern`` in the ``forward`` - method of ``traced_module``. Pattern-matching is done based on - use-def relationships, not node names. For example, if you had - ``p = torch.cat([a, b])`` in ``pattern``, you could match - ``m = torch.cat([a, b])`` in the original ``forward`` function, - despite the variable names being different (``p`` vs ``m``). - - The ``return`` statement in ``pattern`` is matched based on its - value only; it may or may not match to the ``return`` statement in - the larger graph. In other words, the pattern doesn't have to extend - to the end of the larger graph. - - When the pattern is matched, it will be removed from the larger - function and replaced by ``replacement``. If there are multiple - matches for ``pattern`` in the larger function, each non-overlapping - match will be replaced. In the case of a match overlap, the first - found match in the set of overlapping matches will be replaced. - ("First" here being defined as the first in a topological ordering - of the Nodes' use-def relationships. In most cases, the first Node - is the parameter that appears directly after ``self``, while the - last Node is whatever the function returns.) - - One important thing to note is that the parameters of the - ``pattern`` Callable must be used in the Callable itself, - and the parameters of the ``replacement`` Callable must match - the pattern. 
The first rule is why, in the above code block, the - ``forward`` function has parameters ``x, w1, w2``, but the - ``pattern`` function only has parameters ``w1, w2``. ``pattern`` - doesn't use ``x``, so it shouldn't specify ``x`` as a parameter. - As an example of the second rule, consider replacing - - .. code-block:: python - - def pattern(x, y): - return torch.neg(x) + torch.relu(y) - - with - - .. code-block:: python - - def replacement(x, y): - return torch.relu(x) - - In this case, ``replacement`` needs the same number of parameters - as ``pattern`` (both ``x`` and ``y``), even though the parameter - ``y`` isn't used in ``replacement``. - - After calling ``subgraph_rewriter.replace_pattern``, the generated - Python code looks like this: - - .. code-block:: python - - def forward(self, x, w1, w2): - stack_1 = torch.stack([w1, w2]) - sum_1 = stack_1.sum() - stack_2 = torch.stack([w1, w2]) - sum_2 = stack_2.sum() - max_1 = torch.max(sum_1) - add_1 = x + max_1 - max_2 = torch.max(sum_2) - add_2 = add_1 + max_2 - return add_2 - """ - # Get the graphs for `gm`, `pattern`, `replacement` - original_graph = gm.graph - pattern_graph = symbolic_trace(pattern).graph - replacement_graph = symbolic_trace(replacement).graph - - # Find all possible pattern matches in original_graph. Note that - # pattern matches may overlap with each other. - matcher = _SubgraphMatcher(pattern_graph) - matches: List[Match] = [] - - # Consider each node as an "anchor" (deepest matching graph node) - for anchor in original_graph.nodes: - - if matcher.matches_subgraph_from_anchor(anchor): - - def pattern_is_contained(nodes_map: Dict[Node, Node]) -> bool: - # `lookup` represents all the nodes in `original_graph` - # that are part of `pattern` - lookup: Dict[Node, Node] = {v: k for k, v in nodes_map.items()} - for n in lookup.keys(): - - # Nodes that can "leak"... 
- - # Placeholders (by definition) - if n.op == "placeholder": - continue - # Pattern output (acts as a container) - if lookup[n].op == "output": - continue - # Result contained by pattern output (what we'll - # hook in to the new Graph, thus what we'll - # potentially use in other areas of the Graph as - # an input Node) - if (len(lookup[n].users) == 1 - and list(lookup[n].users.keys())[0].op == "output"): - continue - - for user in n.users: - # If this node has users that were not in - # `lookup`, then it must leak out of the - # pattern subgraph - if user not in lookup: - return False - return True - - # It's not a match if the pattern leaks out into the rest - # of the graph - if pattern_is_contained(matcher.nodes_map): - # Shallow copy nodes_map - matches.append(Match(anchor=anchor, - nodes_map=copy.copy({ - key: value - for key, value in matcher.nodes_map.items() - }))) - - # The set of all nodes in `original_graph` that we've seen thus far - # as part of a pattern match - replaced_nodes: Set[Node] = set() - # As we progressively replace nodes, we'll need to keep track of how the match results should change - match_changed_node: Dict[Node, Node] = dict() - - # Return True if one of the nodes in the current match has already - # been used as part of another match - def overlaps_with_prev_match(match: Match) -> bool: - for pn, gn in match.nodes_map.items(): - if pn.op in ["placeholder", "output"]: - continue - if gn in replaced_nodes and gn.op != "placeholder": - return True - return False - - for match in matches: - # Skip overlapping matches - if overlaps_with_prev_match(match): - continue - - # Map replacement graph nodes to their copy in `original_graph` - val_map: Dict[Node, Node] = {} - - pattern_placeholders = [n for n in pattern_graph.nodes - if n.op == "placeholder"] - assert len(pattern_placeholders) > 0 - replacement_placeholders = [n for n in replacement_graph.nodes - if n.op == "placeholder"] - assert len(pattern_placeholders) == len(replacement_placeholders) - placeholder_map = {r: p for r, p - in zip(replacement_placeholders, pattern_placeholders)} - - # node from `original_graph` that matched with the output node - # in `pattern` - subgraph_output: Node = match.anchor - - def mark_node_as_replaced(n: Node) -> None: - if n not in match.nodes_map.values(): - return - for n_ in n.all_input_nodes: - mark_node_as_replaced(n_) - replaced_nodes.add(n) - - for input_node in subgraph_output.all_input_nodes: - mark_node_as_replaced(input_node) - - # Initialize `val_map` with mappings from placeholder nodes in - # `replacement` to their corresponding node in `original_graph` - for replacement_node in replacement_placeholders: - # Get the `original_graph` placeholder node - # corresponding to the current `replacement_node` - pattern_node = placeholder_map[replacement_node] - original_graph_node = match_changed_node.get(match.nodes_map[pattern_node], match.nodes_map[pattern_node]) - - # Populate `val_map` - val_map[replacement_node] = original_graph_node - - # Copy the stack trace from the original graph to the replacement graph. - # Currently this is using a naive strategy: - # 1. find the first node with non-null stack trace in the original graph - # 2. 
if found, copy this stack trace to every node in the replacement graph - first_stack_trace = None - for pn, gn in match.nodes_map.items(): - if gn.stack_trace is not None: - first_stack_trace = gn.stack_trace - break - if first_stack_trace is not None: - for node in replacement_graph.nodes: - node.stack_trace = first_stack_trace - - # Copy the replacement graph over - with original_graph.inserting_before(subgraph_output): - copied_output = original_graph.graph_copy(replacement_graph, - val_map) - - # Clear out stack traces to prevent interference with next match - for node in replacement_graph.nodes: - node.stack_trace = None - - # Hook the output Node of the replacement subgraph in to the - # original Graph at the correct location - - # CASE 1: We need to hook the replacement subgraph in somewhere - # in the middle of the graph. We replace the Node in the - # original graph that corresponds to the end of the pattern - # subgraph - if subgraph_output.op != "output": - pattern_outputs = [n for n in pattern_graph.nodes - if n.op == "output"] - assert len(pattern_outputs) > 0 - replacement_outputs = [n for n in replacement_graph.nodes - if n.op == "output"] - assert len(replacement_outputs) == len(pattern_outputs) - outputs_map = {p: r for r, p - in zip(replacement_outputs, pattern_outputs)} - - for pn, gn in match.nodes_map.items(): - if gn.op == "placeholder": - continue - - # Search for the node corresponding to the output of the pattern - if pn.op != "output": - continue - assert subgraph_output == gn - - # Update all anchor inputs to the new nodes - rn = outputs_map[pn] - for pn_input, rn_input in zip(pn.all_input_nodes, rn.all_input_nodes): - gn_input = match.nodes_map[pn_input] - rn_input_in_original_graph = val_map[rn_input] - gn_input.replace_all_uses_with(rn_input_in_original_graph) - # We store the updated node point in case other nodes want to use it - match_changed_node[gn_input] = rn_input_in_original_graph - - assert subgraph_output.op != "output" - # CASE 2: The pattern subgraph match extends to the end of the - # original graph, so we need to change the current graph's - # output Node to reflect the insertion of the replacement graph. 
- # We'll keep the current output Node, but update its args and - # `_input_nodes` as necessary - else: - subgraph_output.args = ((copied_output,)) - if isinstance(copied_output, Node): - subgraph_output._input_nodes = {copied_output: None} - - assert isinstance(copied_output, Node) - # Erase the `pattern` nodes - for node in reversed(original_graph.nodes): - if len(node.users) == 0 and node.op != "output": - original_graph.erase_node(node) - - # Update the passed-in GraphModule to reflect the new state of - # `original_graph` - gm.recompile() - - # If `replacement` was an nn.Module, we'll need to make sure that - # all the submodules have been copied over correctly - if isinstance(replacement, torch.nn.Module): - _replace_submodules(gm, replacement) - - return matches diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py index 83b0caf5e531..70b852395ca9 100644 --- a/torch/ao/quantization/fx/utils.py +++ b/torch/ao/quantization/fx/utils.py @@ -12,7 +12,9 @@ ) from typing import Callable, Optional, List, Dict, Any, Set, Tuple, Union, Type +from collections import namedtuple import operator +import warnings # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { @@ -111,12 +113,15 @@ def get_per_tensor_qparams(activation_post_process): dtype = activation_post_process.dtype return scale, zero_point, dtype -def get_quantize_node_info(activation_post_process: Callable) -> Tuple[str, Union[Callable, str], Dict[str, Any]]: +def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[str, Union[Callable, str], Dict[str, Any]]]: ''' Given an activation_post_process module, return node_type(e.g. call_function), quantize op(e.g. quantize_per_tensor) and a dictionary of extracted qparams from the module ''' dtype = activation_post_process.dtype # type: ignore[attr-defined] + compute_dtype = None + if hasattr(activation_post_process, "compute_dtype"): + compute_dtype = activation_post_process.compute_dtype # type: ignore[attr-defined] quantize_op : Optional[Union[Callable, str]] = None if dtype in [torch.quint8, torch.qint8]: node_type = "call_function" @@ -134,9 +139,17 @@ def get_quantize_node_info(activation_post_process: Callable) -> Tuple[str, Unio node_type = "call_method" quantize_op = "to" qparams = {"_dtype_": dtype} + elif dtype == torch.float32 and compute_dtype in [torch.quint8, torch.qint8, torch.float16]: + # dynamic quantization + node_type = "call_function" + quantize_op = torch.quantize_per_tensor_dynamic + # TODO: get reduce range from observer + # reduce_range = activation_post_process.reduce_range + reduce_range = torch.backends.quantized.engine == "fbgemm" + qparams = {"_dtype_": compute_dtype, "_reduce_range_": reduce_range} else: - raise Exception("Unsupported dtype in get_quantize_node_info:" + str(dtype)) - assert quantize_op is not None + warnings.warn(f"Unsupported activation_post_process in get_quantize_node_info: {activation_post_process}") + return None return node_type, quantize_op, qparams def quantize_node( @@ -146,7 +159,8 @@ def quantize_node( modules: Dict[str, torch.nn.Module], quantized_graph: Graph, node_name_to_scope: Dict[str, Tuple[str, type]], - is_input: bool) -> Node: + is_input: bool, + output_prefix: str = "_output") -> Node: ''' Add quantization nodes (eg. quantize_per_tensor/per_channel) for given node to graph with the qparams calculated from activation_post_process (obs_module). The observer node (obs_node) is used to find the FQN of the user of act_post_process. 
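The `get_quantize_node_info` hunks above now return `None` for unsupported observers (callers warn and leave the node unquantized) and add a dynamic-quantization branch: a float32 activation observer that carries a `compute_dtype` maps to `torch.quantize_per_tensor_dynamic`. A minimal sketch of that dispatch, with an illustrative function name and qparam extraction omitted (the real helper also pulls scale/zero_point, per-tensor or per-channel, out of the observer):

```python
import torch

def sketch_quantize_node_info(observer):
    """Hypothetical re-statement of the dispatch in get_quantize_node_info."""
    dtype = observer.dtype
    compute_dtype = getattr(observer, "compute_dtype", None)
    if dtype in (torch.quint8, torch.qint8):
        # Static quantization; the real helper also extracts scale/zero_point
        # (and switches to quantize_per_channel for per-channel qschemes).
        return "call_function", torch.quantize_per_tensor, {"_dtype_": dtype}
    if dtype == torch.float16:
        # fp16 "quantization" is just a cast, emitted as Tensor.to(torch.float16).
        return "call_method", "to", {"_dtype_": dtype}
    if dtype == torch.float32 and compute_dtype in (torch.quint8, torch.qint8, torch.float16):
        # New dynamic-quantization branch: qparams are computed at runtime.
        reduce_range = torch.backends.quantized.engine == "fbgemm"
        return ("call_function", torch.quantize_per_tensor_dynamic,
                {"_dtype_": compute_dtype, "_reduce_range_": reduce_range})
    # Unsupported observer: the patched helper warns and returns None so the
    # caller can leave the node unquantized.
    return None

# For example, the dynamic activation observer defined in observer.py
# (PlaceholderObserver.with_args(dtype=torch.float, compute_dtype=torch.quint8))
# would be routed to torch.quantize_per_tensor_dynamic by this dispatch.
```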
@@ -173,7 +187,7 @@ def quantize_node( else: # if the quantize function is at the output of the op, we use the observer input node to get the path first_linear_use_or_first_use = in_node - prefix = "_output" + prefix = output_prefix if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope: module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name] @@ -184,7 +198,10 @@ def quantize_node( module_path = "" root_module = modules[''] graph = quantized_graph - node_type, quantize_op, qparams = get_quantize_node_info(obs_module) + maybe_quantize_node_info = get_quantize_node_info(obs_module) + assert maybe_quantize_node_info is not None, \ + f"Expecting quantize node info not to be None, observer: {obs_module}" + node_type, quantize_op, qparams = maybe_quantize_node_info inputs = [in_node] for key, value in qparams.items(): @@ -455,6 +472,74 @@ def all_node_args_have_no_tensors(node: Node, modules: Dict[str, torch.nn.Module cache[node] = result return result +def all_node_args_except_first(node: Node) -> List[int]: + """ + Returns all node arg indices after first + """ + return list(range(1, len(node.args))) + +def return_arg_list(arg_indices: List[int]) -> Callable[[Node], List[int]]: + """ + Constructs a function that takes a node as arg and returns the arg_indices + that are valid for node.args + """ + def arg_indices_func(node: Node) -> List[int]: + return [i for i in arg_indices if i < len(node.args)] + return arg_indices_func + +NodeInfo = namedtuple("NodeInfo", "op target") + +# this dict identifies which indices of a node are non tensors +# so that they can be propagated correctly since inserting observers +# for them would cause errors + +NON_OBSERVABLE_ARG_DICT: Dict[NodeInfo, Dict[Union[type, torch.dtype], Callable[[Node], List[int]]]] = { + NodeInfo("call_method", "masked_fill") : { + torch.bool: return_arg_list([1]), + float: return_arg_list([2]) + }, + NodeInfo("call_method", "permute") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "repeat") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "reshape") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "size") : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", "transpose") : { + int: all_node_args_except_first + }, + NodeInfo("call_method", torch.transpose) : { + int: all_node_args_except_first + }, + NodeInfo("call_method", "unsqueeze") : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", "unsqueeze_") : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", torch.unsqueeze) : { + int: return_arg_list([1]) + }, + NodeInfo("call_method", "view") : { + int: all_node_args_except_first + }, +} + +EMPTY_ARG_DICT: Dict[Union[type, torch.dtype], Callable[[Node], List[int]]] = {} + +def get_non_observable_arg_indexes_and_types(node: Node) -> Dict[Union[type, torch.dtype], Callable[[Node], List[int]]]: + """ + Returns a dict with of non float tensor types as keys and values which correspond to a + function to retrieve the list (which takes the node as an argument) + """ + info = NodeInfo(node.op, node.target) + + return NON_OBSERVABLE_ARG_DICT.get(info, EMPTY_ARG_DICT) def node_return_type_is_int(node: Node) -> bool: """ @@ -463,13 +548,6 @@ def node_return_type_is_int(node: Node) -> bool: """ return node.op == 'call_method' and node.target == 'size' -def node_bool_tensor_arg_indexes(node: Node) -> List[int]: - """ - Returns indexes of boolean Tensor args - """ - if node.op == "call_method" and node.target 
== "masked_fill": - return [1] - return [] def is_get_tensor_info_node(node: Node) -> bool: """ Returns True if this node is a node that takes a Tensor as input and output some diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py index 4263f4e40b68..7e86a39f1b17 100644 --- a/torch/ao/quantization/observer.py +++ b/torch/ao/quantization/observer.py @@ -8,7 +8,7 @@ from abc import ABCMeta, abstractmethod from collections import OrderedDict from functools import partial -from typing import Any, List, Tuple, Optional, Dict, Union +from typing import Any, List, Tuple, Optional, Dict import torch import torch.nn as nn @@ -114,12 +114,9 @@ def calculate_qparams(self, **kwargs): with_callable_args = classmethod(_with_callable_args) -class _ObserverBase(ObserverBase): - r"""Internal common base for all qint/quint8 observers. - - This base is for commonly used parameters used internally. - Users should use `~torch.ao.quantization.observer.ObserverBase` as a base class - for custom observers. +class UniformQuantizationObserverBase(ObserverBase): + r"""Common base for all observers using uniform quantization to calculate + scale and zero_point. Args: dtype: Quantized data type. @@ -128,6 +125,7 @@ class _ObserverBase(ObserverBase): This is sometimes required to avoid instruction overflow. quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. .. warning:: @@ -169,9 +167,10 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, ) -> None: factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) - super(_ObserverBase, self).__init__(dtype=dtype) + super().__init__(dtype=dtype) self.qscheme = qscheme if reduce_range: warnings.warn( @@ -180,7 +179,7 @@ def __init__( ) self.reduce_range = reduce_range self.register_buffer( - "eps", torch.tensor([torch.finfo(torch.float32).eps], **factory_kwargs) + "eps", torch.tensor([eps], **factory_kwargs) ) assert self.qscheme in ( torch.per_tensor_affine, @@ -195,6 +194,7 @@ def __init__( torch.qint8, torch.quint8, torch.quint4x2, + torch.qint32, ), "Default Observer only works for qint8, quint8 and quint4x2 data type" self.has_customized_qrange = (quant_min is not None) and (quant_max is not None) if self.has_customized_qrange: @@ -331,7 +331,13 @@ def reset_min_max_vals(self): raise NotImplementedError("Cannot reset min/max values in the given observer.") -class MinMaxObserver(_ObserverBase): +# Originally, this class was called `_ObserverBase`. Keeping the old name around +# for backwards compatibility. +# TODO(after v1.13): delete this +_ObserverBase = UniformQuantizationObserverBase + + +class MinMaxObserver(UniformQuantizationObserverBase): r"""Observer module for computing the quantization parameters based on the running min and max values. @@ -345,8 +351,7 @@ class MinMaxObserver(_ObserverBase): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. - memoryless: Boolean that controls whether observer removes old data when a new input is seen. - This is most useful for simulating dynamic quantization, especially during QAT. 
+ eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. Given running min/max as :math:`x_\text{min}` and :math:`x_\text{max}`, scale :math:`s` and zero point :math:`z` are computed as: @@ -405,7 +410,7 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, - memoryless=False, + eps=torch.finfo(torch.float32).eps, ) -> None: # For x86 quantized kernels, we need to ensure that the vpmaddubsw @@ -421,8 +426,8 @@ def __init__( quant_min=quant_min, quant_max=quant_max, factory_kwargs=factory_kwargs, + eps=eps, ) - self.memoryless = memoryless factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) @@ -440,8 +445,6 @@ def forward(self, x_orig): r"""Records the running minimum and maximum of ``x``.""" if x_orig.numel() == 0: return x_orig - elif self.memoryless: - self.reset_min_max_vals() x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val_cur, max_val_cur = torch.aminmax(x) @@ -482,6 +485,7 @@ class MovingAverageMinMaxObserver(MinMaxObserver): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. The moving average min/max is computed as follows @@ -518,6 +522,7 @@ def __init__( reduce_range=False, quant_min=None, quant_max=None, + eps=torch.finfo(torch.float32).eps, **kwargs ) -> None: self.averaging_constant = averaging_constant @@ -527,6 +532,7 @@ def __init__( reduce_range=reduce_range, quant_min=quant_min, quant_max=quant_max, + eps=eps, **kwargs ) @@ -548,7 +554,7 @@ def forward(self, x_orig): return x_orig -class PerChannelMinMaxObserver(_ObserverBase): +class PerChannelMinMaxObserver(UniformQuantizationObserverBase): r"""Observer module for computing the quantization parameters based on the running per channel min and max values. @@ -564,8 +570,7 @@ class PerChannelMinMaxObserver(_ObserverBase): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. - memoryless: Boolean that controls whether observer removes old data when a new input is seen. - This is most useful for simulating dynamic quantization, especially during QAT. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. 
The quantization parameters are computed the same way as in :class:`~torch.ao.quantization.observer.MinMaxObserver`, with the difference @@ -587,7 +592,7 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, - memoryless=False, + eps=torch.finfo(torch.float32).eps, ) -> None: super(PerChannelMinMaxObserver, self).__init__( dtype=dtype, @@ -596,8 +601,8 @@ def __init__( quant_min=quant_min, quant_max=quant_max, factory_kwargs=factory_kwargs, + eps=eps, ) - self.memoryless = memoryless factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) self.ch_axis = ch_axis self.register_buffer("min_val", torch.tensor([], **factory_kwargs)) @@ -630,7 +635,7 @@ def _forward(self, x_orig): # are done in place and types need to match for comparisons y = y.to(self.min_val.dtype) y = torch.flatten(y, start_dim=1) - if min_val.numel() == 0 or max_val.numel() == 0 or self.memoryless: + if min_val.numel() == 0 or max_val.numel() == 0: min_val, max_val = torch.aminmax(y, dim=1) else: min_val_cur, max_val_cur = torch.aminmax(y, dim=1) @@ -651,7 +656,7 @@ def extra_repr(self): def _load_from_state_dict( self, - state_dict: Union[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], + state_dict: Dict[str, Any], prefix: str, local_metadata: Dict[str, torch.Tensor], strict: bool, @@ -707,7 +712,7 @@ def _load_from_state_dict( def _load_from_state_dict_script( self, - state_dict: Union[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], + state_dict: Dict[str, Any], prefix: str, local_metadata: Dict[str, torch.Tensor], strict: bool, @@ -750,6 +755,7 @@ class MovingAveragePerChannelMinMaxObserver(PerChannelMinMaxObserver): reduce_range: Reduces the range of the quantized data type by 1 bit quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup. quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup. + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. The quantization parameters are computed the same way as in :class:`~torch.ao.quantization.observer.MovingAverageMinMaxObserver`, with the @@ -769,6 +775,7 @@ def __init__( reduce_range=False, quant_min=None, quant_max=None, + eps=torch.finfo(torch.float32).eps, **kwargs ) -> None: super(MovingAveragePerChannelMinMaxObserver, self).__init__( @@ -778,6 +785,7 @@ def __init__( reduce_range=reduce_range, quant_min=quant_min, quant_max=quant_max, + eps=eps, **kwargs ) self.averaging_constant = averaging_constant @@ -809,7 +817,7 @@ def forward(self, x_orig): return x_orig -class HistogramObserver(_ObserverBase): +class HistogramObserver(UniformQuantizationObserverBase): r""" The module records the running histogram of tensor values along with min/max values. ``calculate_qparams`` will calculate scale and zero_point. @@ -821,6 +829,7 @@ class HistogramObserver(_ObserverBase): dtype: Quantized data type qscheme: Quantization scheme to be used reduce_range: Reduces the range of the quantized data type by 1 bit + eps: Epsilon value for float32, Defaults to `torch.finfo(torch.float32).eps`. The scale and zero point are computed as follows: @@ -847,6 +856,7 @@ def __init__( quant_min=None, quant_max=None, factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, ) -> None: # bins: The number of bins used for histogram calculation. 
super(HistogramObserver, self).__init__( @@ -856,6 +866,7 @@ def __init__( quant_min=quant_min, quant_max=quant_max, factory_kwargs=factory_kwargs, + eps=eps, ) factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) self.bins = bins @@ -1258,7 +1269,7 @@ def calculate_qparams(self): ) -class RecordingObserver(_ObserverBase): +class RecordingObserver(ObserverBase): r""" The module is mainly for debug and records the tensor values during runtime. @@ -1269,8 +1280,8 @@ class RecordingObserver(_ObserverBase): """ __annotations__ = {"tensor_val": List[Optional[torch.Tensor]]} - def __init__(self, **kwargs): - super(RecordingObserver, self).__init__(**kwargs) + def __init__(self, dtype=torch.quint8, **kwargs): + super(RecordingObserver, self).__init__(dtype=dtype, **kwargs) # type: ignore[call-arg] self.tensor_val = [] def forward(self, x): @@ -1434,6 +1445,13 @@ def load_observer_state_dict(mod, obs_dict): Default weight observer. """ +weight_observer_range_neg_127_to_127 = MinMaxObserver.with_args( + dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, + quant_min=-127, quant_max=127, eps=2 ** -12) +""" +Symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128. +""" + default_histogram_observer = HistogramObserver.with_args(quant_min=0, quant_max=127) """ Default histogram observer, usually used for PTQ. @@ -1447,6 +1465,13 @@ def load_observer_state_dict(mod, obs_dict): weight quantization is supported, such as `fbgemm`. """ +per_channel_weight_observer_range_neg_127_to_127 = MinMaxObserver.with_args( + dtype=torch.qint8, qscheme=torch.per_channel_symmetric, + quant_min=-127, quant_max=127, eps=2 ** -12) +""" +Per-channel, symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128. +""" + default_dynamic_quant_observer = PlaceholderObserver.with_args( dtype=torch.float, compute_dtype=torch.quint8 ) @@ -1470,10 +1495,14 @@ def load_observer_state_dict(mod, obs_dict): # TODO(future PR): remove these defaults and enforce activation functions # to explicitly specify their output range -default_symmetric_fixed_qparams_observer = FixedQParamsObserver.with_args( +default_fixed_qparams_range_neg1to1_observer = FixedQParamsObserver.with_args( scale=2.0 / 256.0, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255) -default_affine_fixed_qparams_observer = FixedQParamsObserver.with_args( +default_fixed_qparams_range_0to1_observer = FixedQParamsObserver.with_args( scale=1.0 / 256.0, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255) +# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases +default_symmetric_fixed_qparams_observer = default_fixed_qparams_range_neg1to1_observer +default_affine_fixed_qparams_observer = default_fixed_qparams_range_0to1_observer + """ Default observers for fixed qparams operations. 
""" diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py index bf8d185cfdb0..c093d71a6b00 100644 --- a/torch/ao/quantization/qconfig.py +++ b/torch/ao/quantization/qconfig.py @@ -16,6 +16,8 @@ default_fused_per_channel_wt_fake_quant, default_embedding_fake_quant, default_embedding_fake_quant_4bit, + fused_wt_fake_quant_range_neg_127_to_127, + fused_per_channel_wt_fake_quant_range_neg_127_to_127, ) from .observer import ( @@ -32,6 +34,8 @@ default_per_channel_weight_observer, default_placeholder_observer, default_weight_observer, + weight_observer_range_neg_127_to_127, + per_channel_weight_observer_range_neg_127_to_127, default_reuse_input_observer, ) import warnings @@ -113,7 +117,7 @@ def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity): Default dynamic qconfig. """ -float16_dynamic_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float32), +float16_dynamic_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float32, compute_dtype=torch.float16), weight=PlaceholderObserver.with_args(dtype=torch.float16)) """ Dynamic qconfig with weights quantized to `torch.float16`. @@ -179,28 +183,71 @@ def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity): Default qconfig for operators that reuse the observers from input Tensor, e.g. reshape """ -def get_default_qconfig(backend='fbgemm'): +def get_default_qconfig(backend='fbgemm', version=0): """ Returns the default PTQ qconfig for the specified backend. Args: - * `backend`: a string representing the target backend. Currently supports `fbgemm` - and `qnnpack`. + * `backend`: a string representing the target backend. Currently supports `fbgemm`, + `qnnpack` and `onednn`. Return: qconfig """ - - if backend == 'fbgemm': - qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True), - weight=default_per_channel_weight_observer) - elif backend == 'qnnpack': - qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False), - weight=default_weight_observer) + if version == 0: + if backend == 'fbgemm': + qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True), + weight=default_per_channel_weight_observer) + elif backend == 'qnnpack': + qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False), + weight=default_weight_observer) + elif backend == 'onednn': + qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False), + weight=default_per_channel_weight_observer) + else: + qconfig = default_qconfig else: - qconfig = default_qconfig + raise AssertionError("Version number: " + str(version) + + " in get_default_qconfig is not supported. Version number must be 0") + return qconfig +""" +Default, symmetric PTQ qconfig for the specified backend. And a per_channel +variant of the same. + +Symmetric here applies to signed weights with zero point = 0, and additional +value restrictions. The activations are also signed 8-bit integers with this +qconfig. + + * Once this change is merged [as of 3/17/22], with backend or qengine = + 'qnnpack', some quantized operators with this symmetric qconfig may use + operators from xnnpack library. + + ** Support to use xnnpack ops with `qnnpack` backed for asymmetric + qconfig (returned by get_default_qconfig()) is not available yet. + + * This qconfig uses signed activations and weights. Weights have added + restrictions such as zero point is forced to be 0, making the weights + symmetric, hence the name. 
And the 8-bit quantized values are + restricting to to [-127, +127], excluding -128. + + * xnnpack has a requantization scale value restriction, 0x1p-32 <= + requantization_scale < 256.0 where, `requantization_scale = (input_scale + * kernel_scale) / (output_scale)`. Using this eps (w/ assumed max value + of 256) is to prevent requantization_scale to go below xnnpack lower + threshold. +""" +default_symmetric_qnnpack_qconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=weight_observer_range_neg_127_to_127) + +default_per_channel_symmetric_qnnpack_qconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=per_channel_weight_observer_range_neg_127_to_127) + default_embedding_qat_qconfig = QConfig(activation=NoopObserver.with_args(dtype=torch.float32), weight=default_embedding_fake_quant) @@ -212,15 +259,15 @@ def get_default_qat_qconfig(backend='fbgemm', version=1): Returns the default QAT qconfig for the specified backend. Args: - * `backend`: a string representing the target backend. Currently supports `fbgemm` - and `qnnpack`. + * `backend`: a string representing the target backend. Currently supports `fbgemm`, + `qnnpack` and `onednn`. * `version`: version, for backwards compatibility. Can be `None` or `1`. Return: qconfig """ # Histogram observer is too slow for quantization aware training - if version is None: + if version == 0: if backend == 'fbgemm': qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, @@ -233,10 +280,15 @@ def get_default_qat_qconfig(backend='fbgemm', version=1): quant_max=255, reduce_range=False), weight=default_weight_fake_quant) + elif backend == 'onednn': + qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255), + weight=default_per_channel_weight_fake_quant) else: qconfig = default_qat_qconfig - # Use the fused observer + fake_quant modules for doing QAT. - if version == 1: + # Use the fused observe + fake_quant modules for doing QAT. + elif version == 1: if backend == 'fbgemm': qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, @@ -249,23 +301,86 @@ def get_default_qat_qconfig(backend='fbgemm', version=1): quant_max=255, reduce_range=False), weight=default_fused_wt_fake_quant) + elif backend == 'onednn': + qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255), + weight=default_fused_per_channel_wt_fake_quant) else: qconfig = default_qat_qconfig_v2 + else: + raise AssertionError("Version number: " + str(version) + + "in get_default_qat_qconfig is not supported. Version number must be 0 or 1") + return qconfig -def get_default_qconfig_dict(backend='fbgemm', version=0): - qconfig = get_default_qconfig(backend) +""" +Default symmetric QAT qconfig for qnnpack. And its per channel weight variant. 
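The eps=2 ** -12 used by the qnnpack-symmetric qconfigs above can be checked against the xnnpack constraint quoted in the note; a small illustrative calculation, taking the stated 256.0 as the assumed upper bound on a single scale:

eps = 2 ** -12      # lower bound enforced on every observer scale by these qconfigs
max_scale = 256.0   # assumed maximum for output_scale, per the note above

# requantization_scale = (input_scale * kernel_scale) / output_scale,
# so the smallest value it can take under these bounds is:
worst_case = (eps * eps) / max_scale
assert worst_case >= 2 ** -32   # 0x1p-32, xnnpack's lower threshold
print(worst_case)               # 2.3283064365386963e-10, i.e. exactly 2 ** -32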
+""" +default_symmetric_qnnpack_qat_qconfig = QConfig( + activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=fused_wt_fake_quant_range_neg_127_to_127) + +default_per_channel_symmetric_qnnpack_qat_qconfig = QConfig( + activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + reduce_range=False, + eps=2 ** -12), + weight=fused_per_channel_wt_fake_quant_range_neg_127_to_127) + +def _get_default_qconfig_dict_helper(qconfig, qconfig_transpose): return { "": qconfig, - "object_type": [("reshape", default_reuse_input_qconfig)] - } + "object_type": [("reshape", default_reuse_input_qconfig), + (torch.nn.Conv1d, qconfig), + (torch.nn.Conv2d, qconfig), + (torch.nn.Conv3d, qconfig), + (torch.nn.ConvTranspose1d, qconfig_transpose), + (torch.nn.ConvTranspose2d, qconfig_transpose), + (torch.nn.ConvTranspose3d, qconfig_transpose), + (torch.nn.Linear, qconfig), + (torch.nn.functional.conv1d, qconfig), + (torch.nn.functional.conv2d, qconfig), + (torch.nn.functional.conv3d, qconfig), + (torch.nn.functional.conv_transpose1d, qconfig_transpose), + (torch.nn.functional.conv_transpose2d, qconfig_transpose), + (torch.nn.functional.conv_transpose3d, qconfig_transpose), + (torch.nn.functional.linear, qconfig), + (torch.nn.ReLU, qconfig), + (torch.nn.functional.relu, qconfig), + (torch.relu, qconfig), + (torch.nn.BatchNorm1d, qconfig), + (torch.nn.BatchNorm2d, qconfig), + (torch.nn.BatchNorm3d, qconfig)]} + +def get_default_qconfig_dict(backend='fbgemm', version=0): + qconfig = get_default_qconfig(backend, version) + qconfig_transpose = qconfig + # default_per_channel_weight_observer is not currently compatible with fbgemm backend + # so we have to modify the weight observer to default_weight_observer or another + # per tensor supported observer. + # see https://github.com/pytorch/pytorch/issues/47535 + if backend == "fbgemm": + qconfig_transpose = QConfig(activation=qconfig.activation, weight=default_weight_observer) + return _get_default_qconfig_dict_helper(qconfig, qconfig_transpose) def get_default_qat_qconfig_dict(backend='fbgemm', version=1): - qconfig = get_default_qat_qconfig(backend, version=version) - return { - "": qconfig, - "object_type": [("reshape", default_reuse_input_qconfig)] - } + qconfig = get_default_qat_qconfig(backend, version) + qconfig_transpose = qconfig + # default_per_channel_weight_observer is not currently compatible with fbgemm backend + # so we have to modify the weight observer to default_weight_observer or another + # per tensor supported observer + # see https://github.com/pytorch/pytorch/issues/47535 + if backend == "fbgemm": + qconfig_transpose = QConfig(activation=qconfig.activation, weight=default_weight_fake_quant) + return _get_default_qconfig_dict_helper(qconfig, qconfig_transpose) def assert_valid_qconfig(qconfig: Optional[QConfig], mod: torch.nn.Module) -> None: @@ -369,9 +484,10 @@ def partial_equals(p1, p2): def activation_is_memoryless(qconfig: QConfig): """ Return whether the observer for activations defined in the given QConfig is memoryless. + This means a MovingAverage observer with averaging constant equal to 1. 
""" def _is_memoryless(observer): - return hasattr(observer, "memoryless") and observer.memoryless + return hasattr(observer, "averaging_constant") and observer.averaging_constant == 1 act = qconfig.activation() if isinstance(act, FakeQuantizeBase) and hasattr(act, "activation_post_process"): return _is_memoryless(act.activation_post_process) diff --git a/torch/ao/quantization/quantization_mappings.py b/torch/ao/quantization/quantization_mappings.py index 3f3ce8fff5df..ebaa693c7477 100644 --- a/torch/ao/quantization/quantization_mappings.py +++ b/torch/ao/quantization/quantization_mappings.py @@ -19,17 +19,29 @@ import torch.ao.nn as ao_nn from torch.ao.quantization.stubs import QuantStub, DeQuantStub from torch.ao.quantization.fake_quantize import ( - default_affine_fixed_qparams_fake_quant, - default_symmetric_fixed_qparams_fake_quant, + default_fixed_qparams_range_0to1_fake_quant, + default_fixed_qparams_range_neg1to1_fake_quant, ) from torch.ao.quantization.utils import get_combined_dict +from torch.nn.utils.parametrize import type_before_parametrizations # Default map for swapping float module to reference quantized modules DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = { + QuantStub: nnq.Quantize, + DeQuantStub: nnq.DeQuantize, nn.Linear: nnqr.Linear, nn.Conv1d: nnqr.Conv1d, nn.Conv2d: nnqr.Conv2d, nn.Conv3d: nnqr.Conv3d, + nn.ConvTranspose1d: nnqr.ConvTranspose1d, + nn.ConvTranspose2d: nnqr.ConvTranspose2d, + nn.ConvTranspose3d: nnqr.ConvTranspose3d, + nn.Embedding: nnqr.Embedding, + nn.EmbeddingBag: nnqr.EmbeddingBag, + nn.GRUCell: nnqr.GRUCell, + nn.LSTMCell: nnqr.LSTMCell, + nn.RNNCell: nnqr.RNNCell, + nn.LSTM: nnqr.LSTM, } # Default map for swapping float module to quantized ones @@ -77,6 +89,7 @@ nniqat.ConvReLU2d: nniq.ConvReLU2d, nniqat.ConvReLU3d: nniq.ConvReLU3d, nniqat.LinearReLU: nniq.LinearReLU, + nniqat.LinearBn1d: nnq.Linear, # QAT modules: nnqat.Linear: nnq.Linear, nnqat.Conv2d: nnq.Conv2d, @@ -99,6 +112,7 @@ nni.ConvReLU2d: nniqat.ConvReLU2d, nni.ConvReLU3d: nniqat.ConvReLU3d, nni.LinearReLU: nniqat.LinearReLU, + nni.LinearBn1d: nniqat.LinearBn1d, } # Default map for swapping dynamic modules @@ -142,9 +156,10 @@ # mapping from module to output activation post process class DEFAULT_MODULE_TO_ACT_POST_PROCESS : Dict[Callable, Callable] = { - nn.Hardsigmoid: default_affine_fixed_qparams_fake_quant, - nn.Sigmoid: default_affine_fixed_qparams_fake_quant, - nn.Tanh: default_symmetric_fixed_qparams_fake_quant, + nn.Hardsigmoid: default_fixed_qparams_range_0to1_fake_quant, + nn.Sigmoid: default_fixed_qparams_range_0to1_fake_quant, + nn.Softmax: default_fixed_qparams_range_0to1_fake_quant, + nn.Tanh: default_fixed_qparams_range_neg1to1_fake_quant, } # Default map for swapping float module to static sparse quantized ones @@ -170,6 +185,11 @@ def get_default_static_quant_module_mappings() -> Dict[Callable, Any]: ''' return copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS) +def get_default_static_quant_reference_module_mappings() -> Dict[Callable, Any]: + ''' Get reference module mapping for post training static quantization + ''' + return copy.deepcopy(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS) + def get_embedding_static_quant_module_mappings() -> Dict[Callable, Any]: ''' Get module mapping, including mapping for embedding QAT ''' @@ -288,7 +308,7 @@ def _get_special_act_post_process(module: torch.nn.Module) -> Optional[Callable] input: torch.nn.Sigmoid output: default_affine_fixed_qparam_fake_quant """ - return 
DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(type(module), None) + return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(type_before_parametrizations(module), None) def _has_special_act_post_process(module: torch.nn.Module) -> bool: return module.training and type(module) in DEFAULT_MODULE_TO_ACT_POST_PROCESS diff --git a/torch/ao/quantization/quantization_types.py b/torch/ao/quantization/quantization_types.py new file mode 100644 index 000000000000..b6cb5bef434e --- /dev/null +++ b/torch/ao/quantization/quantization_types.py @@ -0,0 +1,18 @@ +# TODO: the name of this file is probably confusing, remove this file and move the type +# definitions to somewhere else, e.g. to .utils +from typing import Any, Tuple, Union +from torch.fx import Node +from .utils import Pattern # noqa: F401 + +NodePattern = Union[Tuple[Node, Node], Tuple[Node, Tuple[Node, Node]], Any] + +# This is the Quantizer class instance from torch/quantization/fx/quantize.py. +# Define separately to prevent circular imports. +# TODO(future PR): improve this. +QuantizerCls = Any + +__all__ = [ + "Pattern", + "NodePattern", + "QuantizerCls", +] diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index 5afff09b64b8..f5aa195c94dd 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -10,13 +10,14 @@ from torch.ao.quantization.quantization_mappings import ( get_default_dynamic_quant_module_mappings, get_default_static_quant_module_mappings, + get_default_static_quant_reference_module_mappings, get_default_qat_module_mappings, get_default_qconfig_propagation_list, no_observer_set, _has_special_act_post_process, _get_special_act_post_process, ) - +from .utils import get_qparam_dict, has_no_children_ignoring_parametrizations from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper from torch.ao.quantization.qconfig import ( add_module_to_qconfig_obs_ctr, @@ -25,6 +26,7 @@ float_qparams_weight_only_qconfig, float_qparams_weight_only_qconfig_4bit, activation_is_memoryless) +from torch.nn.utils.parametrize import type_before_parametrizations def is_activation_post_process(module): return (isinstance(module, torch.ao.quantization.ObserverBase) or @@ -32,7 +34,7 @@ def is_activation_post_process(module): def _propagate_qconfig_helper(module, qconfig_dict, - qconfig_parent=None, prefix=''): + qconfig_parent=None, prefix='', prepare_custom_config_dict=None): r"""This is a helper function for `propagate_qconfig_` Args: @@ -44,12 +46,14 @@ def _propagate_qconfig_helper(module, qconfig_dict, module prefix: corresponding prefix of the current module, used as key in qconfig_dict + prepare_custom_config_dict: dictionary for custom handling of modules + see docs for :func:`~torch.ao.quantization.prepare_fx` Return: None, module is modified inplace with qconfig attached """ - module_qconfig = qconfig_dict.get(type(module), qconfig_parent) + module_qconfig = qconfig_dict.get(type_before_parametrizations(module), qconfig_parent) module_qconfig = qconfig_dict.get(prefix, module_qconfig) module_qconfig = getattr(module, 'qconfig', module_qconfig) @@ -60,10 +64,16 @@ def _propagate_qconfig_helper(module, qconfig_dict, for name, child in module.named_children(): module_prefix = prefix + '.' 
+ name if prefix else name - _propagate_qconfig_helper(child, qconfig_dict, - qconfig_with_device_check, module_prefix) + # do no not propagate qconfig to child if child is non traceable + if prepare_custom_config_dict is None or not ( + name in prepare_custom_config_dict.get("non_traceable_module_name", []) + or type(child) in prepare_custom_config_dict.get("non_traceable_module_class", []) + ): + _propagate_qconfig_helper( + child, qconfig_dict, qconfig_with_device_check, module_prefix + ) -def propagate_qconfig_(module, qconfig_dict=None): +def propagate_qconfig_(module, qconfig_dict=None, prepare_custom_config_dict=None): r"""Propagate qconfig through the module hierarchy and assign `qconfig` attribute on each leaf module @@ -73,13 +83,17 @@ def propagate_qconfig_(module, qconfig_dict=None): quantization configuration, qconfig applies to all submodules of a given module unless qconfig for the submodules are specified (when the submodule already has qconfig attribute) + prepare_custom_config_dict: dictionary for custom handling of modules + see docs for :func:`~torch.ao.quantization.prepare_fx` Return: None, module is modified inplace with qconfig attached """ if qconfig_dict is None: qconfig_dict = {} - _propagate_qconfig_helper(module, qconfig_dict) + if prepare_custom_config_dict is None: + prepare_custom_config_dict = {} + _propagate_qconfig_helper(module, qconfig_dict, prepare_custom_config_dict=prepare_custom_config_dict) def _observer_forward_hook(self, input, output): r"""Forward hook that calls observer on the output @@ -157,9 +171,9 @@ def insert_activation_post_process(m, special_act_post_process=None): for name, child in module.named_children(): # TODO remove Dropout special after codebase stable - if type(child) in [nn.Dropout]: + if type_before_parametrizations(child) in [nn.Dropout]: continue - elif type(child) in [nnq.FloatFunctional, nnq.QFunctional]: + elif type_before_parametrizations(child) in [nnq.FloatFunctional, nnq.QFunctional]: if needs_observation(child): child.activation_post_process = get_activation_post_process(child.qconfig, device) elif isinstance(child, _FusedModule): @@ -169,23 +183,23 @@ def insert_activation_post_process(m, special_act_post_process=None): elif _has_special_act_post_process(child): special_act_post_process = _get_special_act_post_process(child) insert_activation_post_process(child, special_act_post_process) - elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: + elif non_leaf_module_list is not None and type_before_parametrizations(child) in non_leaf_module_list: if needs_observation(child): insert_activation_post_process(child) - elif needs_observation(child) and type(child) in custom_module_class_mapping: - observed_child = custom_module_class_mapping[type(child)].from_float(child) + elif needs_observation(child) and type_before_parametrizations(child) in custom_module_class_mapping: + observed_child = custom_module_class_mapping[type_before_parametrizations(child)].from_float(child) setattr(module, name, observed_child) # TODO: These are the modules that cannot be observed # Once there are more, we should move them to a separate list - if custom_module_class_mapping[type(child)] not in no_observer_set(): + if custom_module_class_mapping[type_before_parametrizations(child)] not in no_observer_set(): insert_activation_post_process(observed_child) else: add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) # Insert observers only for leaf nodes, note that 
this observer is for # the output of the module, for input QuantStub will observe them - if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ - and type(module) in qconfig_propagation_list: + if has_no_children_ignoring_parametrizations(module) and not isinstance(module, torch.nn.Sequential) \ + and type_before_parametrizations(module) in qconfig_propagation_list: insert_activation_post_process(module) def get_unique_devices_(module): @@ -207,7 +221,7 @@ def add_quant_dequant(module): wraps the input module, the latter case only happens when the input module is a leaf module and we want to quantize it. """ - if len(module._modules) == 0 and hasattr(module, 'qconfig') and module.qconfig: + if has_no_children_ignoring_parametrizations(module) and hasattr(module, 'qconfig') and module.qconfig: return QuantWrapper(module) for name, child in module.named_children(): @@ -472,7 +486,7 @@ def quantize_qat(model, run_fn, run_args, inplace=False): def convert( module, mapping=None, inplace=False, remove_qconfig=True, - convert_custom_config_dict=None): + is_reference=False, convert_custom_config_dict=None): r"""Converts submodules in input module to a different module according to `mapping` by calling `from_float` method on the target module class. And remove qconfig at the end if remove_qconfig is set to True. @@ -503,7 +517,7 @@ def convert( if not inplace: module = copy.deepcopy(module) _convert( - module, mapping, inplace=True, + module, mapping, inplace=True, is_reference=is_reference, convert_custom_config_dict=convert_custom_config_dict) if remove_qconfig: _remove_qconfig(module) @@ -511,7 +525,7 @@ def convert( def _convert( module, mapping=None, inplace=False, - convert_custom_config_dict=None): + is_reference=False, convert_custom_config_dict=None): r"""Converts submodules in input module to a different module according to `mapping` by calling `from_float` method on the target module class @@ -522,10 +536,12 @@ def _convert( Modules inplace: carry out model transformations in-place, the original module is mutated + is_reference: a flag to enable quantized reference module """ if mapping is None: - mapping = get_default_static_quant_module_mappings() + mapping = get_default_static_quant_reference_module_mappings() if is_reference \ + else get_default_static_quant_module_mappings() if convert_custom_config_dict is None: convert_custom_config_dict = {} custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {}) @@ -537,9 +553,9 @@ def _convert( # both fused modules and observed custom modules are # swapped as one unit if not isinstance(mod, _FusedModule) and \ - type(mod) not in custom_module_class_mapping: + type_before_parametrizations(mod) not in custom_module_class_mapping: _convert(mod, mapping, True, # inplace - convert_custom_config_dict) + is_reference, convert_custom_config_dict) reassign[name] = swap_module(mod, mapping, custom_module_class_mapping) for key, value in reassign.items(): @@ -561,11 +577,19 @@ def swap_module(mod, mapping, custom_module_class_mapping): new_mod = mod if hasattr(mod, 'qconfig') and mod.qconfig is not None: swapped = False - if type(mod) in custom_module_class_mapping: - new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) + if type_before_parametrizations(mod) in custom_module_class_mapping: + new_mod = custom_module_class_mapping[type_before_parametrizations(mod)].from_observed(mod) swapped = True - elif type(mod) in mapping: - new_mod = 
mapping[type(mod)].from_float(mod) + elif type_before_parametrizations(mod) in mapping: + qmod = mapping[type_before_parametrizations(mod)] + if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE: + assert mod.qconfig is not None + weight_post_process = mod.qconfig.weight() + weight_post_process(mod.weight) + weight_qparams = get_qparam_dict(weight_post_process) + new_mod = qmod.from_float(mod, weight_qparams) + else: + new_mod = qmod.from_float(mod) swapped = True if swapped: diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py index 07a1eb6755b2..64de11818bd1 100644 --- a/torch/ao/quantization/quantize_fx.py +++ b/torch/ao/quantization/quantize_fx.py @@ -5,9 +5,10 @@ from torch.fx._symbolic_trace import Tracer from torch.fx.node import Target, Node, Argument from torch.nn.intrinsic import _FusedModule -from .fx import Fuser # noqa: F401 -from .fx import prepare, convert # noqa: F401 -from .fx import get_tensorrt_backend_config_dict # noqa: F401 +from .fx import fuse # noqa: F401 +from .fx import prepare # noqa: F401 +from .fx.convert import convert +from .backend_config import get_tensorrt_backend_config_dict # noqa: F401 from .fx.graph_module import ObservedGraphModule from .fx.qconfig_utils import ( check_is_valid_convert_custom_config_dict, @@ -57,9 +58,8 @@ def _fuse_fx( graph_module: GraphModule object from symbolic tracing (torch.fx.symbolic_trace) """ _check_is_graph_module(graph_module) - fuser = Fuser() - return fuser.fuse( - graph_module, is_qat, fuse_custom_config_dict, backend_config_dict) + return fuse( + graph_module, is_qat, fuse_custom_config_dict, backend_config_dict) # type: ignore[operator] class Scope(object): @@ -251,7 +251,7 @@ def _prepare_fx( equalization_qconfig_dict=equalization_qconfig_dict, backend_config_dict=backend_config_dict, is_standalone_module=is_standalone_module, - ) + ) # type: ignore[operator] for attr_name in preserved_attributes: setattr(prepared, attr_name, getattr(model, attr_name)) @@ -298,7 +298,8 @@ def _prepare_standalone_module_fx( def fuse_fx( - model: torch.nn.Module, fuse_custom_config_dict: Optional[Dict[str, Any]] = None + model: torch.nn.Module, fuse_custom_config_dict: Optional[Dict[str, Any]] = None, + backend_config_dict: Optional[Dict[str, Any]] = None, ) -> GraphModule: r""" Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode. 
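The fuse_fx entry point, whose signature gains an optional backend_config_dict here, is typically driven as below; a minimal sketch with a toy module (the module itself is illustrative, not taken from the patch):

import torch
from torch.ao.quantization.quantize_fx import fuse_fx

class ConvBnReLU(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 3, 3)
        self.bn = torch.nn.BatchNorm2d(3)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

m = ConvBnReLU().eval()
fused = fuse_fx(m)        # conv + bn + relu collapse into a single fused module
print(fused)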
Fusion rules are defined in torch.quantization.fx.fusion_pattern.py @@ -309,10 +310,6 @@ def fuse_fx( * `fuse_custom_config_dict`: Dictionary for custom configurations for fuse_fx, e.g.:: fuse_custom_config_dict = { - "additional_fuser_method_mapping": { - (Module1, Module2): fuse_module1_module2 - } - # Attributes that are not used in forward function will # be removed when constructing GraphModule, this is a list of attributes # to preserve as an attribute of the GraphModule even when they are @@ -328,7 +325,6 @@ def fuse_fx( """ torch._C._log_api_usage_once("quantization_api.quantize_fx.fuse_fx") - assert not model.training, "fuse_fx only works on models in eval mode" check_is_valid_fuse_custom_config_dict(fuse_custom_config_dict) graph_module = torch.fx.symbolic_trace(model) preserved_attributes: Set[str] = set() @@ -338,7 +334,7 @@ def fuse_fx( ) for attr_name in preserved_attributes: setattr(graph_module, attr_name, getattr(model, attr_name)) - return _fuse_fx(graph_module, False, fuse_custom_config_dict) + return _fuse_fx(graph_module, False, fuse_custom_config_dict, backend_config_dict) def prepare_fx( @@ -439,27 +435,6 @@ def prepare_fx( NonTraceableModule ], - # Additional fuser_method mapping - "additional_fuser_method_mapping": { - (torch.nn.Conv2d, torch.nn.BatchNorm2d): fuse_conv_bn - }, - - # Additioanl module mapping for qat - "additional_qat_module_mapping": { - torch.nn.intrinsic.ConvBn2d: torch.nn.qat.ConvBn2d - }, - - # Additional fusion patterns - "additional_fusion_pattern": { - (torch.nn.BatchNorm2d, torch.nn.Conv2d): ConvReluFusionhandler - }, - - # Additional quantization patterns - "additional_quant_pattern": { - torch.nn.Conv2d: ConvReluQuantizeHandler, - (torch.nn.ReLU, torch.nn.Conv2d): ConvReluQuantizeHandler, - } - # By default, inputs and outputs of the graph are assumed to be in # fp32. Providing `input_quantized_idxs` will set the inputs with the # corresponding indices to be quantized. 
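With the additional_* hooks removed from the docstrings above, the remaining core FX flow is compact enough to restate as a sketch; this uses the qconfig_dict form current at the time of this patch (later releases move to QConfigMapping), and the toy model is illustrative:

import torch
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
qconfig_dict = {"": get_default_qconfig("fbgemm")}

prepared = prepare_fx(model, qconfig_dict)       # insert observers
prepared(torch.randn(1, 3, 32, 32))              # calibrate on representative data
quantized = convert_fx(prepared)                 # swap to quantized modules/ops
print(quantized(torch.randn(1, 3, 32, 32)).shape)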
Providing @@ -511,7 +486,6 @@ def calibrate(model, data_loader): """ torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx") - assert not model.training, "prepare_fx only works for models in " + "eval mode" return _prepare_fx( model, qconfig_dict, @@ -560,7 +534,6 @@ def train_loop(model, train_data): """ torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx") - assert model.training, "prepare_qat_fx only works for models in " + "train mode" return _prepare_fx( model, qconfig_dict, @@ -577,6 +550,7 @@ def _convert_fx( is_standalone_module: bool = False, _remove_qconfig: bool = True, qconfig_dict: Dict[str, Any] = None, + backend_config_dict: Dict[str, Any] = None, ) -> torch.nn.Module: """ `is_standalone_module`: see docs in :func:`~torch.ao.quantization.prepare_standalone_module_fx` """ @@ -593,6 +567,7 @@ def _convert_fx( is_standalone_module, _remove_qconfig_flag=_remove_qconfig, convert_qconfig_dict=qconfig_dict, + backend_config_dict=backend_config_dict, ) preserved_attributes = convert_custom_config_dict.get("preserved_attributes", []) @@ -607,6 +582,7 @@ def convert_fx( convert_custom_config_dict: Optional[Dict[str, Any]] = None, _remove_qconfig: bool = True, qconfig_dict: Dict[str, Any] = None, + backend_config_dict: Dict[str, Any] = None, ) -> torch.nn.Module: r""" Convert a calibrated or trained model to a quantized model @@ -618,20 +594,6 @@ def convert_fx( * `convert_custom_config_dict`: dictionary for custom configurations for convert function:: convert_custom_config_dict = { - - # additional object (module/operator) mappings that will overwrite the default - # module mappinng - "additional_object_mapping": { - "static": { - FloatModule: QuantizedModule, - float_op: quantized_op - }, - "dynamic": { - FloatModule: DynamicallyQuantizedModule, - float_op: dynamically_quantized_op - }, - }, - # user will manually define the corresponding quantized # module class which has a from_observed class method that converts # observed custom module to quantized custom module @@ -677,6 +639,11 @@ def convert_fx( ], } + * `backend_config_dict`: A configuration for the backend which describes how + operators should be quantized in the backend, this includes quantization + mode support (static/dynamic/weight_only), dtype support (quint8/qint8 etc.), + observer placement for each operators and fused operators. Detailed + documentation can be found in torch/ao/quantization/backend_config/README.md Return: A quantized model (GraphModule) @@ -694,6 +661,7 @@ def convert_fx( convert_custom_config_dict, _remove_qconfig=_remove_qconfig, qconfig_dict=qconfig_dict, + backend_config_dict=backend_config_dict, ) diff --git a/torch/ao/quantization/stubs.py b/torch/ao/quantization/stubs.py index 1f4c462e56e2..7ae526a8921e 100644 --- a/torch/ao/quantization/stubs.py +++ b/torch/ao/quantization/stubs.py @@ -21,9 +21,15 @@ def forward(self, x): class DeQuantStub(nn.Module): r"""Dequantize stub module, before calibration, this is same as identity, this will be swapped as `nnq.DeQuantize` in `convert`. 
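The stub change above lets DeQuantStub carry its own qconfig, mirroring QuantStub; a minimal eager-mode module using both (passing a qconfig to DeQuantStub assumes this patch):

import torch
from torch.ao.quantization import QuantStub, DeQuantStub, get_default_qconfig

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        qconfig = get_default_qconfig("fbgemm")
        self.quant = QuantStub(qconfig)
        self.dequant = DeQuantStub(qconfig)   # accepting a qconfig is new in this change
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.dequant(self.linear(self.quant(x)))

m = M().eval()
print(m(torch.randn(2, 4)).shape)   # stubs act as identity until prepare/convert run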
+ + Args: + qconfig: quantization configuration for the tensor, + if qconfig is not provided, we will get qconfig from parent modules """ - def __init__(self): + def __init__(self, qconfig=None): super(DeQuantStub, self).__init__() + if qconfig: + self.qconfig = qconfig def forward(self, x): return x @@ -48,7 +54,7 @@ def __init__(self, module): super(QuantWrapper, self).__init__() qconfig = module.qconfig if hasattr(module, 'qconfig') else None self.add_module('quant', QuantStub(qconfig)) - self.add_module('dequant', DeQuantStub()) + self.add_module('dequant', DeQuantStub(qconfig)) self.add_module('module', module) self.train(module.training) diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index e81c88993c4c..f42b5c1ce723 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -6,12 +6,20 @@ import torch from torch.ao.quantization.quant_type import QuantType, quant_type_to_str from typing import Tuple, Any, Union, Callable +from torch.nn.utils.parametrize import is_parametrized # Type for fusion patterns, it can be more complicated than the following actually, # see pattern.md for docs # TODO: not sure if typing supports recursive data types Pattern = Union[Callable, Tuple[Callable, Callable], Tuple[Callable, Tuple[Callable, Callable]], Any] +# TODO: maybe rename this to MatchInputNode +class MatchAllNode: + """ A node pattern that matches all nodes, used in defining + fusion patterns in FX Graph Mode Quantization + """ + pass + module_type_list = { torch.nn.ReLU, torch.nn.ReLU6, @@ -25,21 +33,37 @@ torch.nn.MaxPool2d, torch.nn.MaxPool3d, torch.nn.Identity, + torch.nn.Hardsigmoid, + torch.nn.Sigmoid, + torch.nn.Tanh, } func_list = { torch.nn.functional.adaptive_avg_pool1d, torch.nn.functional.adaptive_avg_pool2d, torch.nn.functional.adaptive_avg_pool3d, + torch.nn.functional.elu, + torch.nn.functional.hardswish, + torch.nn.functional.instance_norm, + torch.nn.functional.layer_norm, + torch.nn.functional.leaky_relu, + torch.nn.functional.silu, + torch.nn.functional.mish, + torch.nn.functional.dropout, torch.nn.functional.max_pool1d, torch.nn.functional.max_pool2d, torch.nn.functional.max_pool3d, torch.nn.functional.relu, torch.nn.functional.hardtanh, torch.nn.functional.hardtanh_, + torch.nn.functional.hardsigmoid, + torch.nn.functional.sigmoid, torch.transpose, torch.repeat_interleave, + torch.sigmoid, torch.squeeze, torch.stack, + torch.sum, + torch.tanh, torch.unsqueeze, torch.cat, } @@ -50,15 +74,21 @@ 'contiguous', 'detach', 'detach_', + 'hardsigmoid', + 'hardsigmoid_', 'permute', 'repeat', 'repeat_interleave', 'reshape', 'resize_', 'shape', + 'sigmoid', + 'sigmoid_', 'size', 'squeeze', 'squeeze_', + 'tanh', + 'tanh_', 'transpose', 'unsqueeze', 'unsqueeze_', @@ -66,6 +96,7 @@ } def check_node(node, modules): + # TODO: reuse is_fixed_qparam_node after we move this function to _lower_to_native_backend.py is_call_function = node.op == "call_function" and node.target in func_list is_call_method = node.op == "call_method" and node.target in method_list is_call_module = node.op == "call_module" and type(modules[str(node.target)]) in module_type_list @@ -154,17 +185,33 @@ def activation_is_statically_quantized(qconfig): """ return activation_dtype(qconfig) in [torch.quint8, torch.qint8, torch.float16] +def activation_is_dynamically_quantized(qconfig): + """ Given a qconfig, decide if the activation needs to be + dynamically quantized or not, this includes dynamically quantizing to + quint8, qint8 and float16 + """ + activation_dtype, 
_, activation_compute_dtype = \ + get_qconfig_dtypes(qconfig) + return activation_dtype == torch.float and \ + activation_compute_dtype in [torch.quint8, torch.qint8, torch.float16] + def activation_is_int8_quantized(qconfig): """ Given a qconfig, decide if the activation needs to be quantized to int8 or not, this includes quantizing to quint8, qint8 """ return activation_dtype(qconfig) in [torch.quint8, torch.qint8] +def activation_is_int32_quantized(qconfig): + """ Given a qconfig, decide if the activation needs to be + quantized to int32 or not + """ + return activation_dtype(qconfig) == torch.qint32 + def weight_is_quantized(qconfig): """ Given a qconfig, decide if the weight needs to be quantized or not """ - return weight_dtype(qconfig) in [torch.quint8, torch.qint8, torch.float16] + return weight_dtype(qconfig) in [torch.quint8, torch.qint8, torch.float16, torch.quint4x2] def weight_is_statically_quantized(qconfig): """ Given a qconfig, decide if the weight needs to be statically @@ -199,7 +246,7 @@ def get_quant_type(qconfig): assert qconfig is not None activation = qconfig.activation() weight = qconfig.weight() - static_dtypes = [torch.quint8, torch.qint8] + static_dtypes = [torch.quint8, torch.qint8, torch.quint4x2] if weight.dtype in static_dtypes: if activation.dtype in static_dtypes: return QuantType.STATIC @@ -253,11 +300,15 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b r"""Calculates actual qmin and qmax based on the quantization range, observer datatype and if range is reduced. """ + # TODO(jerryzh): Figure out why custom quant_min/quant_max are still adjusted. if has_customized_qrange: # This initialization here is to be resolve TorchScript compilation issues and allow # using of refinement to decouple initial_qmin and initial_qmax from quantization range. # The actual values of initial_qmin and initial_qmax will be reset below. - initial_quant_min, initial_quant_max = 0, 255 + if dtype == torch.qint32: + initial_quant_min, initial_quant_max = 0, 2**31 - 1 + else: + initial_quant_min, initial_quant_max = 0, 255 # The following assignment of self.qmin and self.qmax to the local variables and the if check refine the # attribute from Optional valid integers for use, based on TorchScript's requirements. custom_quant_min, custom_quant_max = quant_min, quant_max @@ -268,13 +319,14 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b ) qrange_len = initial_quant_max - initial_quant_min + 1 - assert ( - 0 < qrange_len <= 256 - ), "quantization range should be positive and not exceed the maximum bit range (=256)." if dtype == torch.qint8: - quant_min, quant_max = -qrange_len // 2, qrange_len // 2 - 1 - else: - quant_min, quant_max = 0, qrange_len - 1 + assert ( + 0 < qrange_len <= 256 + ), "quantization range should be positive and not exceed the maximum bit range (=256)." + elif dtype == torch.qint32: + assert ( + 0 < qrange_len <= 2**31 + ), "quantization range should be positive and not exceed the maximum bit range (=4294967296)." 
if reduce_range: quant_min, quant_max = quant_min // 2, quant_max // 2 else: @@ -289,6 +341,8 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b quant_min, quant_max = 0, 127 else: quant_min, quant_max = 0, 255 + elif dtype == torch.qint32: + quant_min, quant_max = -1 * (2 ** 31), (2 ** 31) - 1 else: quant_min, quant_max = 0, 15 return quant_min, quant_max @@ -303,3 +357,16 @@ def _parent_name(target): return '', r[0] else: return r[0], r[1] + +def has_no_children_ignoring_parametrizations(module): + """ + Checks if module._modules is empty or + if module is a parametrization, checks that module._modules only has + the 'parametrizations' module + """ + if len(module._modules) == 0: + return True + elif is_parametrized(module): + return len(module._modules) == 1 and 'parametrizations' in module._modules + else: + return False diff --git a/torch/autocast_mode.py b/torch/autocast_mode.py deleted file mode 100644 index daf2a34383fb..000000000000 --- a/torch/autocast_mode.py +++ /dev/null @@ -1,222 +0,0 @@ -import torch -import functools -import warnings - -from typing import Any, Optional -from .types import _dtype - -def autocast_decorator(autocast_instance, func): - @functools.wraps(func) - def decorate_autocast(*args, **kwargs): - with autocast_instance: - return func(*args, **kwargs) - decorate_autocast.__script_unsupported = '@autocast() decorator is not supported in script mode' # type: ignore[attr-defined] - return decorate_autocast - -class autocast(object): - r""" - Instances of :class:`autocast` serve as context managers or decorators that - allow regions of your script to run in mixed precision. - - In these regions, ops run in an op-specific dtype chosen by autocast - to improve performance while maintaining accuracy. - See the :ref:`Autocast Op Reference` for details. - - When entering an autocast-enabled region, Tensors may be any type. - You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting. - - :class:`autocast` should wrap only the forward pass(es) of your network, including the loss - computation(s). Backward passes under autocast are not recommended. - Backward ops run in the same type that autocast used for corresponding forward ops. - - Example for CUDA Devices:: - - # Creates model and optimizer in default precision - model = Net().cuda() - optimizer = optim.SGD(model.parameters(), ...) - - for input, target in data: - optimizer.zero_grad() - - # Enables autocasting for the forward pass (model + loss) - with autocast(): - output = model(input) - loss = loss_fn(output, target) - - # Exits the context manager before backward() - loss.backward() - optimizer.step() - - See the :ref:`Automatic Mixed Precision examples` for usage (along with gradient scaling) - in more complex scenarios (e.g., gradient penalty, multiple models/losses, custom autograd functions). - - :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model:: - - class AutocastModel(nn.Module): - ... - @autocast() - def forward(self, input): - ... - - Floating-point Tensors produced in an autocast-enabled region may be ``float16``. - After returning to an autocast-disabled region, using them with floating-point - Tensors of different dtypes may cause type mismatch errors. If so, cast the Tensor(s) - produced in the autocast region back to ``float32`` (or other dtype if desired). - If a Tensor from the autocast region is already ``float32``, the cast is a no-op, - and incurs no additional overhead. 
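Several hunks above (in quantize.py and utils.py) replace type(...) with type_before_parametrizations(...) and len(module._modules) == 0 with has_no_children_ignoring_parametrizations(...). The behaviour they compensate for is easy to reproduce; an illustrative sketch with a toy symmetric parametrization:

import torch
from torch import nn
from torch.nn.utils import parametrize
from torch.nn.utils.parametrize import type_before_parametrizations

class Symmetric(nn.Module):
    def forward(self, w):
        return (w + w.t()) / 2          # toy parametrization: keep the weight symmetric

lin = nn.Linear(4, 4)
parametrize.register_parametrization(lin, "weight", Symmetric())

print(type(lin).__name__)                  # ParametrizedLinear, a generated subclass
print(type_before_parametrizations(lin))   # <class 'torch.nn.modules.linear.Linear'>
print(list(lin._modules))                  # ['parametrizations'], now a child module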
- CUDA Example:: - - # Creates some tensors in default dtype (here assumed to be float32) - a_float32 = torch.rand((8, 8), device="cuda") - b_float32 = torch.rand((8, 8), device="cuda") - c_float32 = torch.rand((8, 8), device="cuda") - d_float32 = torch.rand((8, 8), device="cuda") - - with autocast(): - # torch.mm is on autocast's list of ops that should run in float16. - # Inputs are float32, but the op runs in float16 and produces float16 output. - # No manual casts are required. - e_float16 = torch.mm(a_float32, b_float32) - # Also handles mixed input types - f_float16 = torch.mm(d_float32, e_float16) - - # After exiting autocast, calls f_float16.float() to use with d_float32 - g_float32 = torch.mm(d_float32, f_float16.float()) - - CPU Example:: - - # Creates some tensors in default dtype (here assumed to be float32) - a_float32 = torch.rand((8, 8), device="cpu") - b_float32 = torch.rand((8, 8), device="cpu") - c_float32 = torch.rand((8, 8), device="cpu") - d_float32 = torch.rand((8, 8), device="cpu") - - with autocast(dtype=torch.bfloat16, device_type="cpu"): - # torch.mm is on autocast's list of ops that should run in bfloat16. - # Inputs are float32, but the op runs in bfloat16 and produces bfloat16 output. - # No manual casts are required. - e_bfloat16 = torch.mm(a_float32, b_float32) - # Also handles mixed input types - f_bfloat16 = torch.mm(d_float32, e_bfloat16) - - # After exiting autocast, calls f_float16.float() to use with d_float32 - g_float32 = torch.mm(d_float32, f_bfloat16.float()) - - Type mismatch errors *in* an autocast-enabled region are a bug; if this is what you observe, - please file an issue. - - ``autocast(enabled=False)`` subregions can be nested in autocast-enabled regions. - Locally disabling autocast can be useful, for example, if you want to force a subregion - to run in a particular ``dtype``. Disabling autocast gives you explicit control over - the execution type. In the subregion, inputs from the surrounding region - should be cast to ``dtype`` before use:: - - # Creates some tensors in default dtype (here assumed to be float32) - a_float32 = torch.rand((8, 8), device="cuda") - b_float32 = torch.rand((8, 8), device="cuda") - c_float32 = torch.rand((8, 8), device="cuda") - d_float32 = torch.rand((8, 8), device="cuda") - - with autocast(): - e_float16 = torch.mm(a_float32, b_float32) - with autocast(enabled=False): - # Calls e_float16.float() to ensure float32 execution - # (necessary because e_float16 was created in an autocasted region) - f_float32 = torch.mm(c_float32, e_float16.float()) - - # No manual casts are required when re-entering the autocast-enabled region. - # torch.mm again runs in float16 and produces float16 output, regardless of input types. - g_float16 = torch.mm(d_float32, f_float32) - - The autocast state is thread-local. If you want it enabled in a new thread, the context manager or decorator - must be invoked in that thread. This affects :class:`torch.nn.DataParallel` and - :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process - (see :ref:`Working with Multiple GPUs`). - - Args: - device_type(string, required): Whether to use 'cuda' or 'cpu' device - enabled(bool, optional, default=True): Whether autocasting should be enabled in the region. - dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16. - cache_enabled(bool, optional, default=True): Whether the weight cache inside autocast should be enabled. 
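Although this diff deletes torch/autocast_mode.py outright (the implementation presumably moves elsewhere), the behaviour the removed docstring describes stays reachable through the public torch.autocast; a minimal CPU sketch, using bfloat16 since that is the only CPU dtype the deleted check accepts:

import torch

a = torch.rand(8, 8)
b = torch.rand(8, 8)

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    c = torch.mm(a, b)      # mm is on the lower-precision cast list, so c is bfloat16

print(c.dtype)              # torch.bfloat16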
- """ - def __init__(self, device_type : str, - dtype : Optional[_dtype] = None, - enabled : bool = True, - cache_enabled : Optional[bool] = None): - if torch._jit_internal.is_scripting(): - self._enabled = enabled - self.device = device_type - self.fast_dtype = dtype - # TODO: support get_autocast_gpu/cpu_dtype - assert dtype is not None - return - self.device = device_type - if self.device == 'cuda': - self.fast_dtype = torch.get_autocast_gpu_dtype() - elif self.device == 'cpu': - self.fast_dtype = torch.get_autocast_cpu_dtype() - else: - raise RuntimeError('User specified autocast device_type must be \'cuda\' or \'cpu\'') - self._cache_enabled = torch.is_autocast_cache_enabled() - if torch.cuda.amp.common.amp_definitely_not_available() and self.device == 'cuda': - warnings.warn('User provided device_type of \'cuda\', but CUDA is not available. Disabling') - enabled = False - if dtype is not None: - self.fast_dtype = dtype - if cache_enabled is not None: - self._cache_enabled = cache_enabled - - if self.device == 'cpu': - supported_dtype = [torch.bfloat16] - if self.fast_dtype not in supported_dtype: - error_message = 'In CPU autocast, but the target dtype is not supported. Disabling autocast.\n' - error_message += 'CPU Autocast only supports dtype of torch.bfloat16 currently.' - warnings.warn(error_message) - enabled = False - if self.device == 'cuda': - if self.fast_dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported(): - raise RuntimeError('Current CUDA Device does not support bfloat16. Please switch dtype to float16.') - self._enabled = enabled - - def __enter__(self): - if torch._jit_internal.is_scripting(): - assert self.fast_dtype is not None - return self - - self.prev_cache_enabled = torch.is_autocast_cache_enabled() - if self.device == 'cpu': - self.prev = torch.is_autocast_cpu_enabled() - self.prev_fastdtype = torch.get_autocast_cpu_dtype() - torch.set_autocast_cpu_enabled(self._enabled) - torch.set_autocast_cpu_dtype(self.fast_dtype) # type: ignore[arg-type] - torch.autocast_increment_nesting() - else: - self.prev = torch.is_autocast_enabled() - self.prev_fastdtype = torch.get_autocast_gpu_dtype() - torch.set_autocast_gpu_dtype(self.fast_dtype) # type: ignore[arg-type] - torch.set_autocast_enabled(self._enabled) - torch.autocast_increment_nesting() - torch.set_autocast_cache_enabled(self._cache_enabled) - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any): # type: ignore[override] - if torch._jit_internal.is_scripting(): - return - - # Drop the cache when we exit to a nesting level that's outside any instance of autocast. 
- if self.device == 'cpu': - if torch.autocast_decrement_nesting() == 0: - torch.clear_autocast_cache() - torch.set_autocast_cpu_enabled(self.prev) - torch.set_autocast_cpu_dtype(self.prev_fastdtype) - else: - if torch.autocast_decrement_nesting() == 0: - torch.clear_autocast_cache() - torch.set_autocast_enabled(self.prev) - torch.set_autocast_gpu_dtype(self.prev_fastdtype) - torch.set_autocast_cache_enabled(self.prev_cache_enabled) - return False - - def __call__(self, func): - if torch._jit_internal.is_scripting(): - return func - return autocast_decorator(self, func) diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index 28eb729ffcba..3fb02767efba 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -10,14 +10,14 @@ import warnings from torch.types import _TensorOrTensors -from typing import Any, Callable, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, cast from .variable import Variable from .function import Function, NestedIOFunction from .gradcheck import gradcheck, gradgradcheck from .grad_mode import no_grad, enable_grad, set_grad_enabled, inference_mode from .anomaly_mode import detect_anomaly, set_detect_anomaly -from ..overrides import has_torch_function, handle_torch_function +from ..overrides import has_torch_function, handle_torch_function, is_tensor_like from . import functional from . import forward_ad from . import graph @@ -235,20 +235,21 @@ def grad( to show any performance warnings and file an issue on github if warnings exist for your use case. Defaults to ``False``. """ - outputs = (outputs,) if isinstance(outputs, torch.Tensor) else tuple(outputs) - inputs = (inputs,) if isinstance(inputs, torch.Tensor) else tuple(inputs) - overridable_args = outputs + inputs + t_outputs = cast(Tuple[torch.Tensor, ...], (outputs,) if is_tensor_like(outputs) else tuple(outputs)) + t_inputs = cast(Tuple[torch.Tensor, ...], (inputs,) if is_tensor_like(inputs) else tuple(inputs)) + overridable_args = t_outputs + t_inputs if has_torch_function(overridable_args): return handle_torch_function( grad, overridable_args, - outputs, - inputs, + t_outputs, + t_inputs, grad_outputs=grad_outputs, retain_graph=retain_graph, create_graph=create_graph, only_inputs=only_inputs, allow_unused=allow_unused, + is_grads_batched=is_grads_batched, ) if not only_inputs: @@ -256,8 +257,8 @@ def grad( "(defaults to True). 
To accumulate gradient for other " "parts of the graph, please use torch.autograd.backward.") - grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(outputs)) - grad_outputs_ = _make_grads(outputs, grad_outputs_, is_grads_batched=is_grads_batched) + grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(t_outputs)) + grad_outputs_ = _make_grads(t_outputs, grad_outputs_, is_grads_batched=is_grads_batched) if retain_graph is None: retain_graph = create_graph @@ -268,12 +269,12 @@ def grad( if is_grads_batched: def vjp(gO): return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass - outputs, gO, retain_graph, create_graph, inputs, + t_outputs, gO, retain_graph, create_graph, t_inputs, allow_unused, accumulate_grad=False) # Calls into the C++ engine to run the backward pass - return _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(grad_outputs) + return _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(grad_outputs_) else: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass - outputs, grad_outputs_, retain_graph, create_graph, inputs, + t_outputs, grad_outputs_, retain_graph, create_graph, t_inputs, allow_unused, accumulate_grad=False) # Calls into the C++ engine to run the backward pass @@ -295,8 +296,13 @@ def _is_checkpoint_valid(): def variable(*args, **kwargs): - warnings.warn("torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead") - return torch.tensor(*args, **kwargs) + raise RuntimeError("torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead") + +# Monkey patching variable.Variable to fix FX codegen. FX generates a call by roughly doing +# f"{fn.__module__}.{fn.__name__}(...). This yields torch.autograd.variable.Variable(...) in the +# output of an FX graph. Unfortunately the module name torch.autograd.variable is shadowed by the +# deprecated function - variable(...). +variable.Variable = Variable # type: ignore[attr-defined] if not torch._C._autograd_init(): raise RuntimeError("autograd initialization failed") @@ -309,7 +315,7 @@ def variable(*args, **kwargs): _supported_activities, _add_metadata_json, SavedTensor, _push_saved_tensors_default_hooks, _pop_saved_tensors_default_hooks) -from torch._C._autograd import (_ProfilerResult, _KinetoEvent, +from torch._C._autograd import (_ProfilerResult, _KinetoEvent, _kineto_step, _prepare_profiler, _enable_profiler, _disable_profiler) from . import profiler diff --git a/torch/autograd/anomaly_mode.py b/torch/autograd/anomaly_mode.py index f6ec3612674c..cca0ece338d0 100644 --- a/torch/autograd/anomaly_mode.py +++ b/torch/autograd/anomaly_mode.py @@ -3,6 +3,8 @@ from typing import Any +__all__ = ["detect_anomaly", "set_detect_anomaly"] + class detect_anomaly(object): r"""Context-manager that enable anomaly detection for the autograd engine. 
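One of the grad() fixes above forwards is_grads_batched through handle_torch_function and feeds the normalized grad_outputs_ into the vmapped path. The flag itself is used as in this small sketch (independent of the diff; the vmapped path may emit an experimental-feature warning):

import torch

x = torch.randn(3, requires_grad=True)
y = x ** 2

# Each row of the identity is one cotangent; is_grads_batched computes all three
# vector-Jacobian products in a single batched backward call.
(jac,) = torch.autograd.grad(y, x, grad_outputs=torch.eye(3), is_grads_batched=True)
print(jac)    # diag(2 * x), the Jacobian of the elementwise square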
diff --git a/torch/autograd/forward_ad.py b/torch/autograd/forward_ad.py index 08f633d1fb75..b12b6c646276 100644 --- a/torch/autograd/forward_ad.py +++ b/torch/autograd/forward_ad.py @@ -4,6 +4,8 @@ from typing import Any +__all__ = ["UnpackedDualTensor", "enter_dual_level", "exit_dual_level", "make_dual", "unpack_dual", "dual_level"] + # Global variable used to make the python API simpler to use _current_level = -1 @@ -72,7 +74,12 @@ def make_dual(tensor, tangent, *, level=None): return torch._VF._make_dual(tensor, tangent, level=level) -UnpackedDualTensor = namedtuple('UnpackedDualTensor', ['primal', 'tangent']) +_UnpackedDualTensor = namedtuple('_UnpackedDualTensor', ['primal', 'tangent']) + +class UnpackedDualTensor(_UnpackedDualTensor): + r"""Namedtuple returned by :func:`unpack_dual` containing the primal and tangent components of the dual tensor. + See :func:`unpack_dual` for more details.""" + pass def unpack_dual(tensor, *, level=None): r"""Unpacks a "dual tensor" to get both its Tensor value and its forward AD gradient. diff --git a/torch/autograd/functional.py b/torch/autograd/functional.py index 6fe0b5ee09f3..1b941967875f 100644 --- a/torch/autograd/functional.py +++ b/torch/autograd/functional.py @@ -416,11 +416,12 @@ def _construct_standard_basis_for(tensors: Tuple[torch.Tensor, ...], tensor_nume assert len(tensors) == len(tensor_numels) assert len(tensors) > 0 total_numel = sum(tensor_numels) - diag_start_indices = (0, *torch.tensor(tensor_numels).cumsum(dim=0)[:-1].neg().unbind()) chunks = tuple(tensor.new_zeros(total_numel, tensor_numel) for tensor, tensor_numel in zip(tensors, tensor_numels)) - for chunk, diag_start_idx in zip(chunks, diag_start_indices): + diag_start_idx = 0 + for chunk, numel in zip(chunks, tensor_numels): chunk.diagonal(diag_start_idx).fill_(1) + diag_start_idx -= numel return chunks @@ -685,7 +686,7 @@ def vjp(grad_output): raise RuntimeError(msg) jac_i_el.append(torch.zeros_like(inp_el)) - jacobian += (tuple(torch.stack(jac_i_el, dim=0).view(out.size() + jacobian += (tuple(torch.stack(jac_i_el, dim=0).view(out.size() # type: ignore[operator] + inputs[el_idx].size()) for (el_idx, jac_i_el) in enumerate(jac_i)), ) jacobian = _grad_postprocess(jacobian, create_graph) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index c57a16f80d76..552afa4e5243 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -111,7 +111,7 @@ class no_grad(_DecoratorContextManager): Example:: - >>> x = torch.tensor([1], requires_grad=True) + >>> x = torch.tensor([1.], requires_grad=True) >>> with torch.no_grad(): ... y = x * 2 >>> y.requires_grad @@ -123,12 +123,12 @@ class no_grad(_DecoratorContextManager): >>> z.requires_grad False """ - def __init__(self): + def __init__(self) -> None: if not torch._jit_internal.is_scripting(): super().__init__() self.prev = False - def __enter__(self): + def __enter__(self) -> None: self.prev = torch.is_grad_enabled() torch.set_grad_enabled(False) @@ -206,7 +206,7 @@ class set_grad_enabled(_DecoratorContextManager): Example:: - >>> x = torch.tensor([1], requires_grad=True) + >>> x = torch.tensor([1.], requires_grad=True) >>> is_train = False >>> with torch.set_grad_enabled(is_train): ... 
y = x * 2 diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py index 3aac0fca5f6f..3ae3e208978f 100644 --- a/torch/autograd/gradcheck.py +++ b/torch/autograd/gradcheck.py @@ -9,9 +9,14 @@ from torch._vmap_internals import vmap, _vmap import functools +# Note: `get_*_jacobian` functions are added here even though we didn't intend to make them public +# since they have been exposed from before we added `__all__` and we already maintain BC for them +# We should eventually deprecate them and remove them from `__all__` +__all__ = ["gradcheck", "gradgradcheck", "GradcheckError", "get_numerical_jacobian", + "get_analytical_jacobian", "get_numerical_jacobian_wrt_specific_input"] class GradcheckError(RuntimeError): - # Custom error so that user errors are not caught in the gradcheck's try-catch + r"""Error raised by :func:`gradcheck` and :func:`gradgradcheck`""" pass @@ -257,7 +262,7 @@ def _prepare_input(input: torch.Tensor, maybe_perturbed_input: Optional[torch.Te return input -def check_outputs_same_dtype_and_shape(output1, output2, eps, idx=None) -> None: +def _check_outputs_same_dtype_and_shape(output1, output2, eps, idx=None) -> None: # Check that the returned outputs don't have different dtype or shape when you # perturb the input on_index = "on index {idx} " if idx is not None else "" @@ -284,7 +289,7 @@ def get_numerical_jacobian_wrt_specific_input(fn, input_idx, inputs, outputs, ep for x, idx, d_idx in _iter_tensor(input): wrapped_fn = _with_prepare_inputs(fn, inputs, input_idx, x) input_to_perturb = x[idx] - nbhd_checks_fn = functools.partial(check_outputs_same_dtype_and_shape, idx=idx, eps=eps) + nbhd_checks_fn = functools.partial(_check_outputs_same_dtype_and_shape, idx=idx, eps=eps) jvp_fn = _get_numerical_jvp_fn(wrapped_fn, input_to_perturb, eps, nbhd_checks_fn) jacobian_cols[d_idx] = _compute_numerical_jvps_wrt_specific_input(jvp_fn, eps, x.is_complex(), is_forward_ad) return _combine_jacobian_cols(jacobian_cols, outputs, input, input.numel()) @@ -428,7 +433,7 @@ def _get_numerical_jvp_wrt_specific_input(fn, input_idx, inputs, u, eps, is_forw input = inputs[input_idx] input_to_perturb = _get_input_to_perturb(input) wrapped_fn = _with_prepare_inputs(fn, inputs, input_idx, input_to_perturb, True) - nbhd_checks_fn = functools.partial(check_outputs_same_dtype_and_shape, eps=eps) + nbhd_checks_fn = functools.partial(_check_outputs_same_dtype_and_shape, eps=eps) jvp_fn = _get_numerical_jvp_fn(wrapped_fn, input_to_perturb, eps, nbhd_checks_fn) u = _reshape_tensor_or_tuple(u, input_to_perturb.shape) u = _mul_tensor_or_tuple(u, eps) @@ -504,7 +509,7 @@ def _stack_and_check_tensors(list_of_list_of_tensors, inputs, If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `nondet_tol=` as a keyword argument. -- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `gradcheck_nondet_tol=`. 
- is a Module test (e.g., in common_nn.py), then modify the corresponding module_test entry to have `gradcheck_nondet_tol=` @@ -637,7 +642,7 @@ def _get_analytical_vjps_wrt_specific_output(vjp_fn, sample_output, v) -> List[L def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool: - if not check_sparse_nnz and any(t.is_sparse for t in tupled_inputs if isinstance(t, torch.Tensor)): + if not check_sparse_nnz and any(t.is_sparse or t.is_sparse_csr for t in tupled_inputs if isinstance(t, torch.Tensor)): raise GradcheckError('gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False.') # Make sure that gradients are saved for at least one input any_input_requiring_grad = False @@ -649,7 +654,12 @@ def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool: 'is not a double precision floating point or complex. ' 'This check will likely fail if all the inputs are ' 'not of double precision floating point or complex. ') - content = inp._values() if inp.is_sparse else inp + if inp.is_sparse: + content = inp._values() + elif inp.is_sparse_csr: + content = inp.values() + else: + content = inp # TODO: To cover more problematic cases, replace stride = 0 check with # "any overlap in memory" once we have a proper function to check it. if content.layout is not torch._mkldnn: # type: ignore[attr-defined] @@ -712,7 +722,7 @@ def _check_no_differentiable_outputs_fast(func, func_out, all_inputs, inputs_ind If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `check_batched_grad=False` as a keyword argument. -- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `check_batched_grad=False` and/or `check_batched_gradgrad=False`. If you're modifying an existing operator that supports batched grad computation, @@ -738,7 +748,7 @@ def _check_no_differentiable_outputs_fast(func, func_out, all_inputs, inputs_ind If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `check_batched_forward_grad=False` as a keyword argument. -- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `check_batched_forward_grad=False` """ @@ -1191,7 +1201,7 @@ def _adjusted_atol(atol, u, v): If the test - manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck with `fast_mode=False` as a keyword argument. 
-- is OpInfo-based (e.g., in test_ops.py), then modify the OpInfo for the test +- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test to have `gradcheck_fast_mode=False` - is a Module test (e.g., in common_nn.py), then modify the corresponding module_test entry to have `gradcheck_fast_mode=False` @@ -1521,16 +1531,21 @@ def gradgradcheck( tupled_inputs = _as_tuple(inputs) if grad_outputs is None: - # If grad_outputs is not specified, create random Tensors of the same - # shape, type, and device as the outputs - def randn_like(x): - y = torch.randn_like( - x if (x.is_floating_point() or x.is_complex()) else x.double(), memory_format=torch.legacy_contiguous_format) - if gen_non_contig_grad_outputs: - y = torch.testing.make_non_contiguous(y) - return y.requires_grad_() + # If grad_outputs is not specified, create random Tensors of the same shape, type, and device as the outputs + outputs = _as_tuple(func(*tupled_inputs)) - tupled_grad_outputs = tuple(randn_like(x) for x in outputs) + tupled_grad_outputs = tuple( + torch.testing.make_tensor( + x.shape, + dtype=x.dtype if x.is_floating_point() or x.is_complex() else torch.double, + device=x.device, + low=-1, + high=1, + requires_grad=True, + noncontiguous=gen_non_contig_grad_outputs, + ) + for x in outputs + ) else: tupled_grad_outputs = _as_tuple(grad_outputs) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 91c8d40c0cd1..eb8c46f8f124 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -6,8 +6,9 @@ from torch.autograd import ( DeviceType, ProfilerActivity, ProfilerConfig, ProfilerState, kineto_available, _ProfilerResult, _disable_profiler, _enable_profiler, - _prepare_profiler, _supported_activities + _prepare_profiler, _supported_activities, _kineto_step, ) +from torch._C._autograd import _ExperimentalConfig import torch import torch.cuda from torch.futures import Future @@ -83,6 +84,10 @@ class profile(object): use_cpu (bool, optional): profile CPU events; setting to ``False`` requires ``use_kineto=True`` and can be used to lower the overhead for GPU-only profiling. + experimental_config (_ExperimentalConfig) : A set of experimental options + used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed. + + .. warning: Enabling memory profiling or source attribution incurs additional profiler overhead @@ -127,7 +132,8 @@ def __init__( with_stack=False, with_modules=False, use_kineto=False, - use_cpu=True): + use_cpu=True, + experimental_config=None): self.enabled: bool = enabled if not self.enabled: return @@ -141,6 +147,9 @@ def __init__( self.with_stack = with_stack self.with_modules = with_modules self.use_cpu = use_cpu + if experimental_config is None: + experimental_config = _ExperimentalConfig() + self.experimental_config = experimental_config self.kineto_results: Optional[_ProfilerResult] = None if not self.use_cpu: @@ -175,7 +184,8 @@ def config(self): self.profile_memory, self.with_stack, self.with_flops, - self.with_modules) + self.with_modules, + self.experimental_config) def __enter__(self): if not self.enabled: @@ -569,7 +579,8 @@ def __enter__(self): False, False, False, - False), + False, + _ExperimentalConfig()), set() ) return self @@ -664,3 +675,10 @@ def parse_nvprof_trace(path): functions.sort(key=lambda evt: evt.time_range.start) return functions + + +def kineto_step(): + """ Notify kineto so it is aware of iteration boundaries for asynchronous + trace requests. 
+ """ + _kineto_step() diff --git a/torch/autograd/profiler_legacy.py b/torch/autograd/profiler_legacy.py index 445decf333e5..0211ec8a2809 100644 --- a/torch/autograd/profiler_legacy.py +++ b/torch/autograd/profiler_legacy.py @@ -55,7 +55,10 @@ def config(self): self.profile_memory, self.with_stack, self.with_flops, - self.with_modules) + self.with_modules, + # avoid exposing _ExperimentalConfig this in legacy public API + torch._C._autograd._ExperimentalConfig(), + ) def __enter__(self): if not self.enabled: diff --git a/torch/autograd/profiler_util.py b/torch/autograd/profiler_util.py index 6062c097b253..dc505fbc210a 100644 --- a/torch/autograd/profiler_util.py +++ b/torch/autograd/profiler_util.py @@ -642,6 +642,7 @@ def _filter_name(name): filtered_out_names = [ MEMORY_EVENT_NAME, # used only for the top-level memory events "profiler::_record_function_enter", + "profiler::_record_function_enter_new", "profiler::_record_function_exit", "aten::is_leaf", "aten::output_nr", diff --git a/torch/backends/_coreml/preprocess.py b/torch/backends/_coreml/preprocess.py index 7f27e60e5acb..3884058cd0ec 100644 --- a/torch/backends/_coreml/preprocess.py +++ b/torch/backends/_coreml/preprocess.py @@ -1,7 +1,6 @@ import hashlib import json -from dataclasses import dataclass, astuple, field -from typing import Dict, Tuple, List +from typing import Dict, Tuple import coremltools as ct # type: ignore[import] import torch @@ -35,86 +34,56 @@ class CoreMLComputeUnit: ALL = "all" -@dataclass -class _TensorSpec: - shape: List[int] = field(default_factory=List[int]) - dtype: int = ScalarType.Float - - -def TensorSpec(*args, **kwargs): - """ - TensorSpec specifies the tensor information. The default dtype is float32 - Example: - ts = TensorSpec( - shape = [1, 3, 224, 224], - dtype = ScalarType.Float - ) - """ - return astuple(_TensorSpec(*args, **kwargs)) - - -@dataclass -class _CompileSpec: - inputs: Tuple[_TensorSpec] = () # type: ignore[assignment] - outputs: Tuple[_TensorSpec] = () # type: ignore[assignment] - backend: str = CoreMLComputeUnit.CPU - allow_low_precision: bool = True - - -def CompileSpec(*args, **kwargs): - """ - CompileSpec specifies the model information. 
- Example: - cs = CompileSpec( - inputs=( - TensorSpec( - shape=[1, 3, 224, 224], - ), - ), - outputs=( - TensorSpec( - shape=[1, 1000], - ), - ), - backend=CoreMLComputeUnit.CPU, - allow_low_precision=True, - ), - """ - return astuple(_CompileSpec(*args, **kwargs)) - - -def _convert_to_mil_type(spec: _TensorSpec, name: str): - ml_type = TensorType(shape=spec.shape, dtype=torch_to_mil_types[spec.dtype]) +def TensorSpec(shape, dtype=ScalarType.Float): + return (shape, dtype) + + +def CompileSpec(inputs, outputs, backend=CoreMLComputeUnit.CPU, allow_low_precision=True): + return (inputs, outputs, backend, allow_low_precision) + + +def _check_enumerated_shape(shape): + for s in shape: + if not isinstance(s, (list, tuple)): + return False + return True + + +def _convert_to_mil_type(shape, dtype, name: str): + mil_shape = shape + if _check_enumerated_shape(shape): + mil_shape = ct.EnumeratedShapes(shape) + ml_type = TensorType(shape=mil_shape, dtype=torch_to_mil_types[dtype]) ml_type.name = name return ml_type def preprocess(script_module: torch._C.ScriptObject, compile_spec: Dict[str, Tuple]): spec = compile_spec["forward"] - forward_spec = _CompileSpec(*spec) + input_specs, output_specs, backend, allow_low_precision = spec mil_inputs = [] inputs = [] - for index, input_spec in enumerate(forward_spec.inputs): - input_spec = _TensorSpec(*input_spec) # type: ignore[misc] + for index, input in enumerate(input_specs): + shape, dtype = input name = "input_" + str(index) - inputs.append([name, str(input_spec.dtype), str(input_spec.shape)]) - ml_type = _convert_to_mil_type(input_spec, name) + inputs.append([name, str(dtype), str(shape)]) + ml_type = _convert_to_mil_type(shape, dtype, name) mil_inputs.append(ml_type) model = torch.jit.RecursiveScriptModule._construct(script_module, lambda x: None) mlmodel = ct.convert(model, inputs=mil_inputs) spec = mlmodel.get_spec() - output_specs = forward_spec.outputs assert len(spec.description.output) == len(output_specs) # type: ignore[attr-defined] outputs = [] - for index, output_spec in enumerate(output_specs): - output_spec = _TensorSpec(*output_spec) # type: ignore[misc] + for index, output in enumerate(output_specs): + shape, dtype = output name = spec.description.output[index].name # type: ignore[attr-defined] - outputs.append([name, str(output_spec.dtype), str(output_spec.shape)]) + outputs.append([name, str(dtype), str(shape)]) mlmodel = ct.models.model.MLModel(spec) + print(mlmodel) config = { "spec_ver": str(spec.specificationVersion), # type: ignore[attr-defined] - "backend": forward_spec.backend, - "allow_low_precision": str(forward_spec.allow_low_precision), + "backend": backend, + "allow_low_precision": str(allow_low_precision), } metadata = { "coremltool_ver": mlmodel.user_defined_metadata[CT_METADATA_VERSION], diff --git a/torch/backends/_nnapi/serializer.py b/torch/backends/_nnapi/serializer.py index d29b5987295c..4bbf9b5e8530 100644 --- a/torch/backends/_nnapi/serializer.py +++ b/torch/backends/_nnapi/serializer.py @@ -1549,11 +1549,28 @@ def add_adaptive_avg_pool2d(self, node): self.add_operation(NNAPI_OperationCode.AVERAGE_POOL_2D, inputs, outputs) def add_upsample_nearest2d(self, node): - assert node.inputsSize() == 3 + assert node.inputsSize() == 3 or node.inputsSize() == 4 assert node.outputsSize() == 1 - image, size_jit, scale_jit = node.inputs() + if node.inputsSize() == 3: + image, size_jit, scale_jit = node.inputs() + else: + image, size_jit, scale_h_jit, scale_w_jit = node.inputs() size_ctype, size_arg = 
self.get_constant_value(size_jit) - scale_ctype, scale_arg = self.get_constant_value(scale_jit) + + if node.inputsSize() == 3: + scale_ctype, scale_arg = self.get_constant_value(scale_jit) + else: + scale_h_ctype, scale_h_arg = self.get_constant_value(scale_h_jit) + scale_w_ctype, scale_w_arg = self.get_constant_value(scale_w_jit) + + # The only way for the 4-argument overload of upsample_nearest2d to + # have been added to the graph without error is if the scale_h and + # scale_w arguments are None + assert scale_h_ctype.kind() == "NoneType" + assert scale_w_ctype.kind() == "NoneType" + + scale_ctype = scale_h_ctype + scale_arg = scale_h_arg image_id, image_oper = self.get_tensor_operand_by_jitval(image) assert len(image_oper.shape) == 4 diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py index 4f05e06225bd..d89049b5f3ca 100644 --- a/torch/backends/cudnn/__init__.py +++ b/torch/backends/cudnn/__init__.py @@ -133,3 +133,4 @@ def __init__(self, m, name): enabled: bool deterministic: bool benchmark: bool +allow_tf32: bool diff --git a/torch/backends/mps/__init__.py b/torch/backends/mps/__init__.py new file mode 100644 index 000000000000..b0b9f90ac77a --- /dev/null +++ b/torch/backends/mps/__init__.py @@ -0,0 +1,13 @@ +import sys +import torch + +def is_built(): + r"""Returns whether PyTorch is built with MPS support. Note that this + doesn't necessarily mean MPS is available; just that if this PyTorch + binary were run on a machine with working MPS drivers and devices, we + would be able to use it.""" + return torch._C.has_mps + +def is_available(): + r"""Returns a bool indicating if MPS is currently available.""" + return torch._C._is_mps_available diff --git a/torch/backends/quantized/__init__.py b/torch/backends/quantized/__init__.py index a24d88bcc6e6..6f7d479e90c4 100644 --- a/torch/backends/quantized/__init__.py +++ b/torch/backends/quantized/__init__.py @@ -11,6 +11,8 @@ def _get_qengine_id(qengine: str) -> int: ret = 1 elif qengine == 'qnnpack': ret = 2 + elif qengine == 'onednn': + ret = 3 else: ret = -1 raise RuntimeError("{} is not a valid value for quantized engine".format(qengine)) @@ -18,7 +20,7 @@ # This function should correspond to the enums present in c10/core/QEngine.h def _get_qengine_str(qengine: int) -> str: - all_engines = {0 : 'none', 1 : 'fbgemm', 2 : 'qnnpack'} + all_engines = {0 : 'none', 1 : 'fbgemm', 2 : 'qnnpack', 3 : 'onednn'} return all_engines.get(qengine, '*undefined') class _QEngineProp(object): diff --git a/torch/cpu/amp/autocast_mode.py b/torch/cpu/amp/autocast_mode.py index 49ffb5c11b42..03cbcdcda0fc 100644 --- a/torch/cpu/amp/autocast_mode.py +++ b/torch/cpu/amp/autocast_mode.py @@ -1,7 +1,7 @@ import torch from typing import Any -class autocast(torch.autocast_mode.autocast): +class autocast(torch.amp.autocast_mode.autocast): r""" See :class:`torch.autocast`.
``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)`` diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index a2bf143aede5..502bb0fa29b0 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -66,7 +66,7 @@ PyTypeObject* getPyTypeObject(const at::Storage& storage) { scalarType); auto it = attype_to_py_storage_type.find(attype); TORCH_INTERNAL_ASSERT(it != attype_to_py_storage_type.end(), - "Failed to get the Python type of `UntypedStorage`."); + "Failed to get the Python type of `_UntypedStorage`."); return it->second; } } // namespace @@ -115,10 +115,10 @@ PyTypeObject* loadTypedStorageTypeObject() { PyObject* storage_module = PyImport_ImportModule("torch.storage"); TORCH_INTERNAL_ASSERT(storage_module && PyModule_Check(storage_module)); - PyObject* typed_storage_obj = PyObject_GetAttrString(storage_module, "TypedStorage"); + PyObject* typed_storage_obj = PyObject_GetAttrString(storage_module, "_TypedStorage"); TORCH_INTERNAL_ASSERT(typed_storage_obj && PyType_Check(typed_storage_obj)); return reinterpret_cast( - PyObject_GetAttrString(storage_module, "TypedStorage")); + PyObject_GetAttrString(storage_module, "_TypedStorage")); } PyTypeObject* getTypedStorageTypeObject() { @@ -169,7 +169,7 @@ at::Storage createStorageGetType(PyObject* obj, at::ScalarType& scalar_type, boo } if (obj_type == storage_type) { auto& type = *item.second; - // UntypedStorage should always be interpreted with byte dtype + // _UntypedStorage should always be interpreted with byte dtype scalar_type = at::kByte; return type.unsafeStorageFromTH(((THPVoidStorage*)obj)->cdata, true); } diff --git a/torch/csrc/Exceptions.cpp b/torch/csrc/Exceptions.cpp index 8bf89ae7cbd7..2eb8ae41898e 100644 --- a/torch/csrc/Exceptions.cpp +++ b/torch/csrc/Exceptions.cpp @@ -43,26 +43,6 @@ could not be completed because the input matrix is singular.", PyExc_RuntimeErro namespace torch { -static bool compute_cpp_stack_traces_enabled() { - auto envar = std::getenv("TORCH_SHOW_CPP_STACKTRACES"); - if (envar) { - if (strcmp(envar, "0") == 0) { - return false; - } - if (strcmp(envar, "1") == 0) { - return true; - } - TORCH_WARN("ignoring invalid value for TORCH_SHOW_CPP_STACKTRACES: ", envar, - " valid values are 0 or 1."); - } - return false; -} - -bool get_cpp_stacktraces_enabled() { - static bool enabled = compute_cpp_stack_traces_enabled(); - return enabled; -} - void replaceAll(std::string & str, const std::string & old_str, const std::string & new_str) { diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h index 348dba3de064..4a644a0b45e8 100644 --- a/torch/csrc/Exceptions.h +++ b/torch/csrc/Exceptions.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -105,6 +106,11 @@ static inline void PyErr_SetString(PyObject* type, const std::string& message) { auto msg = torch::processErrorMsg(e.what()); \ PyErr_SetString(PyExc_TimeoutError, msg); \ retstmnt; \ + } \ + catch (const c10d::C10dError& e) { \ + auto msg = torch::processErrorMsg(e.what()); \ + PyErr_SetString(PyExc_RuntimeError, msg); \ + retstmnt; \ } #else #define CATCH_C10D_ERRORS(retstmnt) @@ -275,8 +281,6 @@ TORCH_PYTHON_API void translate_exception_to_python(const std::exception_ptr &); TORCH_PYTHON_API std::string processErrorMsg(std::string str); -TORCH_PYTHON_API bool get_cpp_stacktraces_enabled(); - // Abstract base class for exceptions which translate to specific Python types struct PyTorchError : public std::exception { // 
NOLINTNEXTLINE(modernize-pass-by-value) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index faf132753c25..186dab63c4f1 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -55,16 +55,15 @@ #include #include #include -#include #include #include +#include #include #include #include #include #include #include -#include #include #ifdef USE_DISTRIBUTED @@ -76,10 +75,6 @@ #endif #endif -#if defined(USE_MLCOMPUTE) -#include -#endif - #if defined(USE_VALGRIND) #include #endif @@ -398,6 +393,27 @@ PyObject *THPModule_allowTF32CuDNN(PyObject *_unused, PyObject *noargs) else Py_RETURN_FALSE; } +PyObject *THPModule_setFloat32MatmulPrecision(PyObject *_unused, PyObject *arg) +{ + THPUtils_assert(THPUtils_checkString(arg), "set_float32_matmul_precision expects a str, " + "but got %s", THPUtils_typename(arg)); + std::string s = THPUtils_unpackString(arg); + at::globalContext().setFloat32MatmulPrecision(s); + Py_RETURN_NONE; +} + +PyObject *THPModule_float32MatmulPrecision(PyObject *_unused, PyObject *noargs) +{ + std::string s = "highest"; + auto p = at::globalContext().float32MatmulPrecision(); + if (p == at::Float32MatmulPrecision::HIGH) { + s = "high"; + } else if (p == at::Float32MatmulPrecision::MEDIUM) { + s = "medium"; + } + return THPUtils_packString(s); +} + PyObject *THPModule_setUserEnabledCuDNN(PyObject *_unused, PyObject *arg) { THPUtils_assert(PyBool_Check(arg), "set_enabled_cudnn expects a bool, " @@ -588,11 +604,10 @@ PyObject *THPModule_supportedQEngines(PyObject *_unused, PyObject *noargs) { auto qengines = at::globalContext().supportedQEngines(); auto list = THPObjectPtr(PyList_New(qengines.size())); + if (!list) return nullptr; for (const auto i : c10::irange(qengines.size())) { PyObject *i64 = THPUtils_packInt64(static_cast(qengines[i])); - if (!i64) { - throw python_error(); - } + if (!i64) return nullptr; PyList_SET_ITEM(list.get(), i, i64); } return list.release(); @@ -606,22 +621,18 @@ PyObject *THPModule_isEnabledXNNPACK(PyObject *_unused, PyObject *noargs) PyObject *THPModule_setDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs) { - try { - at::globalContext().setDefaultMobileCPUAllocator(); - } catch (c10::Error& e) { - THPUtils_setError(e.what()); - } + HANDLE_TH_ERRORS + at::globalContext().setDefaultMobileCPUAllocator(); Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } PyObject *THPModule_unsetDefaultMobileCPUAllocator(PyObject *_unused, PyObject *noargs) { - try { - at::globalContext().unsetDefaultMobileCPUAllocator(); - } catch (c10::Error& e) { - THPUtils_setError(e.what()); - } + HANDLE_TH_ERRORS + at::globalContext().unsetDefaultMobileCPUAllocator(); Py_RETURN_NONE; + END_HANDLE_TH_ERRORS } static PyObject * THPModule_vmapmode_increment_nesting(PyObject* _unused, PyObject *arg) { @@ -696,6 +707,8 @@ static PyMethodDef TorchMethods[] = { {"_set_warnAlways", THPModule_setWarnAlways, METH_O, nullptr}, {"_get_cublas_allow_tf32", THPModule_allowTF32CuBLAS, METH_NOARGS, nullptr}, {"_set_cublas_allow_tf32", THPModule_setAllowTF32CuBLAS, METH_O, nullptr}, + {"_get_float32_matmul_precision", THPModule_float32MatmulPrecision, METH_NOARGS, nullptr}, + {"_set_float32_matmul_precision", THPModule_setFloat32MatmulPrecision, METH_O, nullptr}, {"_get_cublas_allow_fp16_reduced_precision_reduction", THPModule_allowFP16ReductionCuBLAS, METH_NOARGS, nullptr}, {"_set_cublas_allow_fp16_reduced_precision_reduction", THPModule_setAllowFP16ReductionCuBLAS, METH_O, nullptr}, {"_vmapmode_increment_nesting", THPModule_vmapmode_increment_nesting, METH_NOARGS, 
nullptr}, @@ -715,6 +728,7 @@ static PyMethodDef TorchMethods[] = { {"_unset_default_mobile_cpu_allocator", THPModule_unsetDefaultMobileCPUAllocator, METH_NOARGS, nullptr}, {"_is_torch_function_enabled", THPModule_isEnabledTorchFunction, METH_NOARGS, nullptr}, {"_disabled_torch_function_impl", THPModule_disable_torch_function, METH_VARARGS, nullptr}, + {"_disabled_torch_dispatch_impl", THPModule_disable_torch_dispatch, METH_VARARGS, nullptr}, {"_has_torch_function", THPModule_has_torch_function, METH_O, nullptr}, {"_has_torch_function_unary", THPModule_has_torch_function_unary, METH_O, nullptr}, {"_has_torch_function_variadic", MAYBE_WRAP_FASTCALL(THPModule_has_torch_function_variadic), MAYBE_METH_FASTCALL, nullptr}, @@ -735,15 +749,6 @@ void initModule(PyObject *module); }} // namespace torch::cuda #endif -#ifdef USE_MLCOMPUTE -PyMethodDef* ModuleMLC_methods(); -namespace torch { namespace mlc { - -void initBindings(PyObject *module); - -}} // namespace torch::mlc -#endif - bool THDPByteStorage_init(PyObject *module); static std::vector methods; @@ -780,6 +785,9 @@ TORCH_API PyObject* initModule(); // separate decl and defn for msvc error C2491 PyObject* initModule() { HANDLE_TH_ERRORS + + c10::initLogging(); + at::internal::lazy_init_num_threads(); C10_LOG_API_USAGE_ONCE("torch.python.import"); @@ -794,9 +802,6 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#ifdef USE_MLCOMPUTE - THPUtils_addPyMethodDefs(methods, ModuleMLC_methods()); -#endif #if defined(USE_DISTRIBUTED) && defined(USE_C10D) THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); #ifndef _WIN32 @@ -836,7 +841,6 @@ PyObject* initModule() { torch::monitor::initMonitorBindings(module); torch::impl::dispatch::initDispatchBindings(module); torch::throughput_benchmark::initThroughputBenchmarkBindings(module); - torch::crash_handler::initCrashHandlerBindings(module); torch::autograd::initReturnTypes(module); torch::autograd::initNNFunctions(module); torch::autograd::initFFTFunctions(module); @@ -845,11 +849,9 @@ PyObject* initModule() { torch::autograd::initSpecialFunctions(module); torch::autograd::init_legacy_variable(module); torch::python::init_bindings(module); + torch::lazy::initLazyBindings(module); #ifdef USE_CUDA torch::cuda::initModule(module); -#endif -#ifdef USE_MLCOMPUTE - torch::mlc::init_bindings(module); #endif ASSERT_TRUE(THPByteStorage_init(module)); @@ -892,10 +894,6 @@ PyObject* initModule() { // Automatically translate errors thrown from pybind11 functions py::register_exception_translator([](std::exception_ptr e) { // NOLINT - if (torch::crash_handler::is_enabled_on_exceptions()) { - torch::crash_handler::write_minidump(); - } - try { if (e) { std::rethrow_exception(e); @@ -1019,15 +1017,16 @@ Call this whenever a new thread is created in order to propagate values from #else PyObject *has_cuda = Py_False; #endif -#ifdef USE_MLCOMPUTE - PyObject *has_mlc = Py_True; + +#ifdef USE_MPS + PyObject *has_mps = Py_True; #else - PyObject *has_mlc = Py_False; + PyObject *has_mps = Py_False; #endif - ASSERT_TRUE(set_module_attr("has_mlc", has_mlc)); - ASSERT_TRUE(set_module_attr("has_cuda", has_cuda)); + ASSERT_TRUE(set_module_attr("has_mps", has_mps)); + ASSERT_TRUE(set_module_attr("_is_mps_available", at::hasMPS() ? Py_True : Py_False)); ASSERT_TRUE(set_module_attr("has_mkldnn", at::hasMKLDNN() ? 
Py_True : Py_False)); @@ -1060,6 +1059,13 @@ Call this whenever a new thread is created in order to propagate values from #endif #undef SET_STR_DEFINE + py_module.def("_set_conj", [](const at::Tensor & x, bool conj) { + x._set_conj(conj); + }); + py_module.def("_set_neg", [](const at::Tensor & x, bool neg) { + x._set_neg(neg); + }); + const auto& defaultGenerator = at::detail::getDefaultCPUGenerator(); THPDefaultCPUGenerator = (THPGenerator*)THPGenerator_initDefaultGenerator(defaultGenerator); // This reference is meant to be given away, so no need to incref here. @@ -1067,6 +1073,8 @@ Call this whenever a new thread is created in order to propagate values from ASSERT_TRUE(set_module_attr("DisableTorchFunction", (PyObject*)THPModule_DisableTorchFunctionType(), /* incref= */ false)); torch::set_disabled_torch_function_impl(PyObject_GetAttrString(module, "_disabled_torch_function_impl")); ASSERT_TRUE(torch::disabled_torch_function_impl() != nullptr); + torch::set_disabled_torch_dispatch_impl(PyObject_GetAttrString(module, "_disabled_torch_dispatch_impl")); + ASSERT_TRUE(torch::disabled_torch_dispatch_impl() != nullptr); return module; END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/THCGenerateByteType.h b/torch/csrc/THCGenerateByteType.h index e2c5f35d9118..23648de8025f 100644 --- a/torch/csrc/THCGenerateByteType.h +++ b/torch/csrc/THCGenerateByteType.h @@ -1,5 +1,5 @@ #ifndef THC_GENERIC_FILE -#error "You must define THC_GENERIC_FILE before including THGenerateByteType.h" +#error "You must define THC_GENERIC_FILE before including THCGenerateByteType.h" #endif #define scalar_t uint8_t diff --git a/torch/csrc/TypeInfo.cpp b/torch/csrc/TypeInfo.cpp index b75f4fee4236..08fd03236428 100644 --- a/torch/csrc/TypeInfo.cpp +++ b/torch/csrc/TypeInfo.cpp @@ -21,7 +21,7 @@ PyObject* THPFInfo_New(const at::ScalarType& type) { if (!self) throw python_error(); auto self_ = reinterpret_cast(self.get()); - self_->type = c10::toValueType(type); + self_->type = c10::toRealValueType(type); return self.release(); } diff --git a/torch/csrc/api/include/torch/data/example.h b/torch/csrc/api/include/torch/data/example.h index b43ef2ca1955..f302cdd9ed87 100644 --- a/torch/csrc/api/include/torch/data/example.h +++ b/torch/csrc/api/include/torch/data/example.h @@ -25,7 +25,7 @@ namespace example { using NoTarget = void; } // namespace example -/// A specialization for `Example` that does not have have a target. +/// A specialization for `Example` that does not have a target. /// /// This class exists so that code can be written for a templated `Example` /// type, and work both for labeled and unlabeled datasets. 
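Aside: the hunks above wire up an MPS query surface on both sides of the binding, with torch.backends.mps.is_built() / is_available() in Python (see the new torch/backends/mps/__init__.py earlier in this diff) backed by torch._C._is_mps_available and the has_mps attribute registered in Module.cpp. A minimal usage sketch, assuming a build that includes these changes; the "mps" device string is an assumption for illustration and is not introduced by this diff:

import torch

# Prefer MPS when this build has it compiled in and a usable device is present;
# otherwise fall back to CPU. Both helpers come from torch.backends.mps above.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device("mps")  # assumed device string, for illustration only
else:
    device = torch.device("cpu")

x = torch.randn(4, 4, device=device)  # tensor allocated on the selected device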
diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 23ecbf1be0c6..71a3146c990f 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -44,7 +44,7 @@ inline Tensor ifft(const Tensor& self, /// torch::fft::fft2(t); /// ``` inline Tensor fft2(const Tensor& self, - c10::optional s=c10::nullopt, + OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_fft2(self, s, dim, norm); @@ -59,7 +59,7 @@ inline Tensor fft2(const Tensor& self, /// torch::fft::ifft2(t); /// ``` inline Tensor ifft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_ifft2(self, s, dim, norm); @@ -74,8 +74,8 @@ inline Tensor ifft2(const Tensor& self, /// torch::fft::fftn(t); /// ``` inline Tensor fftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_fftn(self, s, dim, norm); } @@ -89,8 +89,8 @@ inline Tensor fftn(const Tensor& self, /// torch::fft::ifftn(t); /// ``` inline Tensor ifftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_ifftn(self, s, dim, norm); } @@ -138,7 +138,7 @@ inline Tensor irfft(const Tensor& self, /// torch::fft::rfft2(t); /// ``` inline Tensor rfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_rfft2(self, s, dim, norm); @@ -153,7 +153,7 @@ inline Tensor rfft2(const Tensor& self, /// torch::fft::irfft2(t); /// ``` inline Tensor irfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_irfft2(self, s, dim, norm); @@ -168,8 +168,8 @@ inline Tensor irfft2(const Tensor& self, /// torch::fft::rfftn(t); /// ``` inline Tensor rfftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_rfftn(self, s, dim, norm); } @@ -183,8 +183,8 @@ inline Tensor rfftn(const Tensor& self, /// torch::fft::irfftn(t); /// ``` inline Tensor irfftn(const Tensor& self, - c10::optional s=c10::nullopt, - c10::optional dim=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, + at::OptionalIntArrayRef dim=c10::nullopt, c10::optional norm=c10::nullopt) { return torch::fft_irfftn(self, s, dim, norm); } @@ -238,7 +238,7 @@ inline Tensor ihfft(const Tensor& self, /// assert(T.is_floating_point() && T.numel() == 128 * 128); /// ``` inline Tensor hfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_hfft2(self, s, dim, norm); @@ -256,7 +256,7 @@ inline Tensor hfft2(const Tensor& self, /// assert(t.is_complex() && t.size(1) == 65); /// ``` inline Tensor ihfft2(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional 
norm=c10::nullopt) { return torch::fft_ihfft2(self, s, dim, norm); @@ -274,7 +274,7 @@ inline Tensor ihfft2(const Tensor& self, /// assert(T.is_floating_point() && T.numel() == 128 * 128); /// ``` inline Tensor hfftn(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_hfftn(self, s, dim, norm); @@ -292,7 +292,7 @@ inline Tensor hfftn(const Tensor& self, /// assert(t.is_complex() && t.size(1) == 65); /// ``` inline Tensor ihfftn(const Tensor& self, - c10::optional s=c10::nullopt, + at::OptionalIntArrayRef s=c10::nullopt, IntArrayRef dim={-2, -1}, c10::optional norm=c10::nullopt) { return torch::fft_ihfftn(self, s, dim, norm); @@ -341,7 +341,7 @@ inline Tensor rfftfreq(int64_t n, const TensorOptions& options) { /// auto x = torch::randn({127, 4}); /// auto centred_fft = torch::fft::fftshift(torch::fft::fftn(x)); /// ``` -inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::nullopt) { +inline Tensor fftshift(const Tensor& x, at::OptionalIntArrayRef dim=c10::nullopt) { return torch::fft_fftshift(x, dim); } @@ -356,7 +356,7 @@ inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::null /// auto unshift = torch::fft::ifftshift(shift); /// assert(torch::allclose(x, unshift)); /// ``` -inline Tensor ifftshift(const Tensor& x, c10::optional dim=c10::nullopt) { +inline Tensor ifftshift(const Tensor& x, at::OptionalIntArrayRef dim=c10::nullopt) { return torch::fft_ifftshift(x, dim); } diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index e16c1f61e503..fe015c8320f3 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -76,6 +76,14 @@ inline std::tuple lu_factor_out(Tensor& LU, Tensor& pivots, co return torch::linalg_lu_factor_out(LU, pivots, self, pivot); } +inline std::tuple lu(const Tensor& self, const bool pivot) { + return torch::linalg_lu(self, pivot); +} + +inline std::tuple lu_out(Tensor& P, Tensor& L, Tensor& U, const Tensor& self, const bool pivot) { + return torch::linalg_lu_out(P, L, U, self, pivot); +} + inline std::tuple lstsq(const Tensor& self, const Tensor& b, c10::optional cond, c10::optional driver) { return torch::linalg_lstsq(self, b, cond, driver); } @@ -84,27 +92,27 @@ inline Tensor matrix_exp(const Tensor& self) { return torch::linalg_matrix_exp(self); } -inline Tensor norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor norm(const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm(self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor norm(const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, 
optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor vector_norm(const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor vector_norm(const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_vector_norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return torch::linalg_vector_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } @@ -228,11 +236,11 @@ inline Tensor& tensorinv_out(Tensor& result,const Tensor& self, int64_t ind) { return torch::linalg_tensorinv_out(result, self, ind); } -inline Tensor tensorsolve(const Tensor& self, const Tensor& other, optional dims) { +inline Tensor tensorsolve(const Tensor& self, const Tensor& other, OptionalIntArrayRef dims) { return torch::linalg_tensorsolve(self, other, dims); } -inline Tensor& tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& other, optional dims) { +inline Tensor& tensorsolve_out(Tensor& result, const Tensor& self, const Tensor& other, OptionalIntArrayRef dims) { return torch::linalg_tensorsolve_out(result, self, other, dims); } @@ -354,26 +362,26 @@ inline Tensor matrix_exp(const Tensor& input) { } // C10_DEPRECATED_MESSAGE("linalg_norm is deprecated, use norm instead.") -inline Tensor linalg_norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor linalg_norm(const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, opt_ord, opt_dim, keepdim, opt_dtype); } // C10_DEPRECATED_MESSAGE("linalg_norm is deprecated, use norm instead.") -inline Tensor linalg_norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor linalg_norm(const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, ord, opt_dim, keepdim, opt_dtype); } // C10_DEPRECATED_MESSAGE("linalg_norm_out is deprecated, use norm_out instead.") -inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } // C10_DEPRECATED_MESSAGE("linalg_norm_out is deprecated, use norm_out instead.") -inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, c10::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } -/// Computes the pivoted LU factorization +/// Computes the LU factorization with partial pivoting /// /// See 
https://pytorch.org/docs/master/linalg.html#torch.linalg.lu_factor inline std::tuple lu_factor(const Tensor& input, const bool pivot=true) { @@ -384,28 +392,39 @@ inline std::tuple lu_factor_out(Tensor& LU, Tensor& pivots, co return detail::lu_factor_out(LU, pivots, self, pivot); } -inline Tensor norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +/// Computes the LU factorization with partial pivoting +/// +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.lu +inline std::tuple lu(const Tensor& input, const bool pivot=true) { + return detail::lu(input, pivot); +} + +inline std::tuple lu_out(Tensor& P, Tensor& L, Tensor& U, const Tensor& self, const bool pivot=true) { + return detail::lu_out(P, L, U, self, pivot); +} + +inline Tensor norm(const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor norm(const Tensor& self, std::string ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor norm(const Tensor& self, std::string ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, const optional& opt_ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& norm_out(Tensor& result, const Tensor& self, std::string ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& norm_out(Tensor& result, const Tensor& self, std::string ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } /// See https://pytorch.org/docs/master/linalg.html#torch.linalg.vector_norm -inline Tensor vector_norm(const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor vector_norm(const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::vector_norm(self, ord, opt_dim, keepdim, opt_dtype); } -inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, optional opt_dim, bool keepdim, optional opt_dtype) { +inline Tensor& vector_norm_out(Tensor& result, const Tensor& self, Scalar ord, OptionalIntArrayRef opt_dim, bool keepdim, optional opt_dtype) { return detail::vector_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } @@ -501,6 +520,48 @@ inline std::tuple qr_out(Tensor& Q, Tensor& R, const Tensor& i return detail::qr_out(Q, R, input, mode); } +/// Computes the LDL decomposition +/// +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.ldl_factor_ex +inline std::tuple ldl_factor_ex( + const Tensor& input, + bool hermitian, + bool check_errors) { + return torch::linalg_ldl_factor_ex(input, hermitian, check_errors); +} + +inline std::tuple ldl_factor_ex_out( + Tensor& LD, + Tensor& pivots, + Tensor& info, + const Tensor& input, + bool hermitian, + bool check_errors) { + return torch::linalg_ldl_factor_ex_out( + LD, pivots, info, input, hermitian, check_errors); +} + +/// Solve a system of linear equations using the LDL decomposition +/// +/// See 
https://pytorch.org/docs/master/linalg.html#torch.linalg.ldl_solve +inline Tensor ldl_solve( + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian) { + return torch::linalg_ldl_solve(LD, pivots, B, hermitian); +} + +inline Tensor& ldl_solve_out( + Tensor& result, + const Tensor& LD, + const Tensor& pivots, + const Tensor& B, + bool hermitian) { + return torch::linalg_ldl_solve_out( + result, LD, pivots, B, hermitian); +} + /// Computes a tensor `x` such that `matmul(input, x) = other`. /// /// See https://pytorch.org/docs/master/linalg.html#torch.linalg.solve @@ -574,11 +635,11 @@ inline Tensor& tensorinv_out(Tensor& result, const Tensor& self, int64_t ind) { /// auto b = torch::randn(2*3, 4); /// auto x = torch::linalg::tensorsolve(a, b); /// ``` -inline Tensor tensorsolve(const Tensor& input, const Tensor& other, optional dims) { +inline Tensor tensorsolve(const Tensor& input, const Tensor& other, OptionalIntArrayRef dims) { return detail::tensorsolve(input, other, dims); } -inline Tensor& tensorsolve_out(Tensor& result, const Tensor& input, const Tensor& other, optional dims) { +inline Tensor& tensorsolve_out(Tensor& result, const Tensor& input, const Tensor& other, OptionalIntArrayRef dims) { return detail::tensorsolve_out(result, input, other, dims); } diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h index b038f1bce6ba..2258dd0c4317 100644 --- a/torch/csrc/api/include/torch/nn/functional/activation.h +++ b/torch/csrc/api/include/torch/nn/functional/activation.h @@ -336,8 +336,16 @@ inline Tensor glu(const Tensor& input, const GLUFuncOptions& options = {}) { // ============================================================================ -inline Tensor gelu(const Tensor& input) { - return torch::gelu(input); +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace detail { +inline Tensor gelu(const Tensor& input, string approximate) { + return torch::gelu(input, approximate); +} +} // namespace detail +#endif /* DOXYGEN_SHOULD_SKIP_THIS */ + +inline Tensor gelu(const Tensor& input, const GELUFuncOptions& options = {}) { + return detail::gelu(input, options.approximate()); } // ============================================================================ diff --git a/torch/csrc/api/include/torch/nn/functional/batchnorm.h b/torch/csrc/api/include/torch/nn/functional/batchnorm.h index bb8bddfcf83c..5603ec189e91 100644 --- a/torch/csrc/api/include/torch/nn/functional/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/functional/batchnorm.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -20,7 +21,7 @@ inline Tensor batch_norm(const Tensor& input, if (training) { auto size = input.sizes(); int64_t size_prods = size[0]; - for (size_t i = 0; i < size.size() - 2; i++) { + for (const auto i : c10::irange(size.size() - 2)) { size_prods *= size[i + 2]; } TORCH_CHECK(size_prods != 1, diff --git a/torch/csrc/api/include/torch/nn/functional/padding.h b/torch/csrc/api/include/torch/nn/functional/padding.h index 611f407d9b7a..1b2f77626cdb 100644 --- a/torch/csrc/api/include/torch/nn/functional/padding.h +++ b/torch/csrc/api/include/torch/nn/functional/padding.h @@ -1,83 +1,36 @@ #pragma once #include +#include namespace torch { namespace nn { namespace functional { -inline Tensor _narrow_with_range(const Tensor& input, int64_t dim, int64_t start, int64_t end) { - return input.narrow(dim, start, end - start); -} - -inline Tensor _pad_circular(Tensor input, IntArrayRef padding) { - int padding_size = 
padding.size(); - input = torch::cat({input, _narrow_with_range(input, 2, 0, padding[-1 + padding_size])}, /*dim=*/2); - input = torch::cat({_narrow_with_range(input, 2, -(padding[-1 + padding_size] + padding[-2 + padding_size]), -padding[-1 + padding_size]), input}, /*dim=*/2); - - if (padding_size > 2) { - input = torch::cat({input, _narrow_with_range(input, 3, 0, padding[-3 + padding_size])}, /*dim=*/3); - input = torch::cat({_narrow_with_range(input, 3, -(padding[-3 + padding_size] + padding[-4 + padding_size]), -padding[-3 + padding_size]), input}, /*dim=*/3); - } - - if (padding_size > 4) { - input = torch::cat({input, _narrow_with_range(input, 4, 0, padding[-5 + padding_size])}, /*dim=*/4); - input = torch::cat({_narrow_with_range(input, 4, -(padding[-5 + padding_size] + padding[-6 + padding_size]), -padding[-5 + padding_size]), input}, /*dim=*/4); - } - - return input; -} - #ifndef DOXYGEN_SHOULD_SKIP_THIS namespace detail { inline Tensor pad(const Tensor& input, IntArrayRef pad, PadFuncOptions::mode_t mode, double value) { - TORCH_CHECK(pad.size() % 2 == 0, "Padding length must be divisible by 2"); - TORCH_CHECK(((int64_t)(pad.size() / 2)) <= input.dim(), "Padding length too large"); - if (c10::get_if(&mode)) { - return torch::constant_pad_nd(input, pad, value); - } else { - TORCH_CHECK( - value == 0, - "Padding mode \"", - torch::enumtype::get_enum_name(mode), - "\" doesn't take in value argument"); - if (pad.size() == 2 && (input.dim() == 2 || input.dim() == 3)) { - if (c10::get_if(&mode)) { - return torch::reflection_pad1d(input, pad); - } else if (c10::get_if(&mode)) { - return torch::replication_pad1d(input, pad); - } else if (c10::get_if(&mode)) { - return _pad_circular(input, pad); - } else { - TORCH_CHECK(false, "NotImplementedError"); - } - } else if(pad.size() == 4 && (input.dim() == 3 || input.dim() == 4)) { - if (c10::get_if(&mode)) { - return torch::reflection_pad2d(input, pad); - } else if (c10::get_if(&mode)) { - return torch::replication_pad2d(input, pad); - } else if (c10::get_if(&mode)) { - return _pad_circular(input, pad); - } else { - TORCH_CHECK(false, "NotImplementedError"); - } - } else if (pad.size() == 6 && (input.dim() == 4 || input.dim() == 5)) { - if (c10::get_if(&mode)) { - return torch::reflection_pad3d(input, pad); - } else if (c10::get_if(&mode)) { - return torch::replication_pad3d(input, pad); - } else if (c10::get_if(&mode)) { - return _pad_circular(input, pad); - } else { - TORCH_CHECK(false, "NotImplementedError"); - } - } else { - TORCH_CHECK(false, "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); + const auto mode_enum = [&] { + if (c10::get_if(&mode)) { + return at::padding_mode::constant; + } else if (c10::get_if(&mode)) { + return at::padding_mode::reflect; + } else if (c10::get_if(&mode)) { + return at::padding_mode::replicate; + } else if (c10::get_if(&mode)) { + return at::padding_mode::circular; } + TORCH_CHECK(false, "Unrecognised padding mode"); + }(); + + c10::optional fill_value; + if (value != 0.0) { + fill_value = value; } + return at::_pad_enum(input, pad, static_cast(mode_enum), fill_value); } } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ diff --git a/torch/csrc/api/include/torch/nn/functional/pooling.h b/torch/csrc/api/include/torch/nn/functional/pooling.h index 9da99e9fa33c..ae325fc8113e 100644 --- a/torch/csrc/api/include/torch/nn/functional/pooling.h +++ b/torch/csrc/api/include/torch/nn/functional/pooling.h @@ -770,8 +770,8 @@ inline std::tuple fractional_max_pool2d_with_indices( 
c10::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); - output_size_ = {(int64_t)(input.size(-2) * (*output_ratio.value())[0]), - (int64_t)(input.size(-1) * (*output_ratio.value())[1])}; + output_size_ = {(int64_t)(static_cast(input.size(-2)) * (*output_ratio.value())[0]), + (int64_t)(static_cast(input.size(-1)) * (*output_ratio.value())[1])}; } Tensor _random_samples_ = _random_samples; @@ -849,9 +849,9 @@ inline std::tuple fractional_max_pool3d_with_indices( c10::optional> output_size_ = output_size; if (output_size_ == c10::nullopt) { TORCH_INTERNAL_ASSERT(output_ratio != c10::nullopt); - output_size_ = {(int64_t)(input.size(-3) * (*output_ratio.value())[0]), - (int64_t)(input.size(-2) * (*output_ratio.value())[1]), - (int64_t)(input.size(-1) * (*output_ratio.value())[2])}; + output_size_ = {(int64_t)(static_cast(input.size(-3)) * (*output_ratio.value())[0]), + (int64_t)(static_cast(input.size(-2)) * (*output_ratio.value())[1]), + (int64_t)(static_cast(input.size(-1)) * (*output_ratio.value())[2])}; } Tensor _random_samples_ = _random_samples; diff --git a/torch/csrc/api/include/torch/nn/functional/upsampling.h b/torch/csrc/api/include/torch/nn/functional/upsampling.h index faa6e73368a8..fac6a9c6239b 100644 --- a/torch/csrc/api/include/torch/nn/functional/upsampling.h +++ b/torch/csrc/api/include/torch/nn/functional/upsampling.h @@ -64,7 +64,7 @@ inline std::vector _interp_output_size( std::vector ret; for (const auto i : c10::irange(dim)) { - ret.emplace_back(static_cast(floor(input.size(i + 2) * scale_factors[i]))); + ret.emplace_back(static_cast(floor(static_cast(input.size(i + 2)) * scale_factors[i]))); } return ret; } diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h index 28225ee0f68b..e4fc02f310d5 100644 --- a/torch/csrc/api/include/torch/nn/modules/activation.h +++ b/torch/csrc/api/include/torch/nn/modules/activation.h @@ -570,12 +570,17 @@ TORCH_MODULE(GLU); // NOLINTNEXTLINE(bugprone-exception-escape) class TORCH_API GELUImpl : public torch::nn::Cloneable { public: + explicit GELUImpl(GELUOptions options_ = {}); + Tensor forward(const Tensor& input); void reset() override; /// Pretty prints the `GELU` module into the given `stream`. void pretty_print(std::ostream& stream) const override; + + /// The options with which this `Module` was constructed. + GELUOptions options; }; /// A `ModuleHolder` subclass for `GELUImpl`. 
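Aside: the activation hunks above thread an approximate option through the C++ GELU frontend (GELUImpl now stores GELUOptions and forwards to F::detail::gelu(input, approximate), on top of the widened torch::gelu(input, approximate) ATen signature). A short sketch of the same knob from the Python side, assuming the matching approximate= keyword on torch.nn.functional.gelu and that "tanh" selects the tanh approximation; neither is shown in this diff:

import torch
import torch.nn.functional as F

x = torch.randn(8)
# "none" requests the exact erf-based GELU (the default), mirroring
# GELUOptions().approximate("none") in the C++ frontend above.
y_exact = F.gelu(x, approximate="none")
# "tanh" requests the tanh approximation instead.
y_tanh = F.gelu(x, approximate="tanh")
print((y_exact - y_tanh).abs().max())  # small numerical difference between the two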
diff --git a/torch/csrc/api/include/torch/nn/modules/container/modulelist.h b/torch/csrc/api/include/torch/nn/modules/container/modulelist.h index 067da4094c6f..dde24f2230dd 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/modulelist.h +++ b/torch/csrc/api/include/torch/nn/modules/container/modulelist.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -207,8 +208,10 @@ class ModuleListImpl : public Cloneable { modules_.begin() + Iterator::difference_type(index), std::move(module)); - for (size_t i = index; i < size() - 1; ++i) + for (const auto i : c10::irange(index, size() - 1)) { + (void)i; // Suppress unused variable warning replace_module(c10::to_string(index), modules_[index]); + } register_module(c10::to_string(size() - 1), modules_.back()); } } diff --git a/torch/csrc/api/include/torch/nn/options/activation.h b/torch/csrc/api/include/torch/nn/options/activation.h index 651c800a84cb..16ab0245fbb6 100644 --- a/torch/csrc/api/include/torch/nn/options/activation.h +++ b/torch/csrc/api/include/torch/nn/options/activation.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -95,6 +96,33 @@ using GLUFuncOptions = GLUOptions; // ============================================================================ +/// Options for the `GELU` module. +/// +/// Example: +/// ``` +/// GELU model(GELUOptions().approximate("none")); +/// ``` +struct TORCH_API GELUOptions { + /// Specifies the approximation to apply to the output. + TORCH_ARG(std::string, approximate) = "none"; +}; + +namespace functional { +/// Options for `torch::nn::functional::gelu`. +/// +/// See the documentation for `torch::nn::GELUOptions` class to learn what +/// arguments are supported. +/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::gelu(input, F::GELUFuncOptions().approximate("none")); +/// ``` +using GELUFuncOptions = GELUOptions; +} // namespace functional + +// ============================================================================ + /// Options for the `Hardshrink` module. /// /// Example: diff --git a/torch/csrc/api/include/torch/special.h b/torch/csrc/api/include/torch/special.h index 6e0ecc0fbcad..d667e094f993 100644 --- a/torch/csrc/api/include/torch/special.h +++ b/torch/csrc/api/include/torch/special.h @@ -215,6 +215,15 @@ inline Tensor& logsumexp_out(Tensor& result, const Tensor& self, IntArrayRef dim return torch::special_logsumexp_out(result, self, dims, keepdim); } +/// Computes the argument, x, for which the area under the Gaussian probability density +/// function (integrated from minus infinity to x) is equal to input, elementwise. 
+/// See https://pytorch.org/docs/master/special.html#torch.special.ndtri +/// +/// Example: +/// ``` +/// auto t = torch::rand(128, dtype=kDouble); +/// torch::special::ndtri(t); +/// ``` inline Tensor ndtri(const Tensor& self) { return torch::special_ndtri(self); } @@ -223,6 +232,23 @@ inline Tensor& ndtri_out(Tensor& result, const Tensor& self) { return torch::special_ndtri_out(result, self); } +/// Computes the log of area under the standard Gaussian probability density function, +/// integrated from minus infinity to :attr:`input`, elementwise +/// See https://pytorch.org/docs/master/special.html#torch.special.log_ndtr +/// +/// Example: +/// ``` +/// auto t = torch::randn(128, dtype=kDouble); +/// torch::special::log_ndtr(t); +/// ``` +inline Tensor log_ndtr(const Tensor& self) { + return torch::special_log_ndtr(self); +} + +inline Tensor& log_ndtr_out(Tensor& result, const Tensor& self) { + return torch::special_log_ndtr_out(result, self); +} + /// Computes the logit of input, elementwise. /// See https://pytorch.org/docs/master/special.html#torch.special.logit. /// diff --git a/torch/csrc/api/include/torch/utils.h b/torch/csrc/api/include/torch/utils.h index f664074deb03..3bb6363a4ced 100644 --- a/torch/csrc/api/include/torch/utils.h +++ b/torch/csrc/api/include/torch/utils.h @@ -5,7 +5,6 @@ #include #include #include -#include #include namespace torch { diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index 677c9e1cc836..001199e98edd 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -284,8 +284,10 @@ void GLUImpl::pretty_print(std::ostream& stream) const { // ============================================================================ +GELUImpl::GELUImpl(GELUOptions options_) : options(std::move(options_)) {} + Tensor GELUImpl::forward(const Tensor& input) { - return F::gelu(input); + return F::detail::gelu(input, options.approximate()); } void GELUImpl::reset() {} diff --git a/torch/csrc/api/src/nn/modules/adaptive.cpp b/torch/csrc/api/src/nn/modules/adaptive.cpp index 1f28d0c82816..6842b14550cd 100644 --- a/torch/csrc/api/src/nn/modules/adaptive.cpp +++ b/torch/csrc/api/src/nn/modules/adaptive.cpp @@ -94,7 +94,7 @@ ASMoutput AdaptiveLogSoftmaxWithLossImpl::forward(const Tensor& input_, const Te auto cutoff_values = cutoffs; cutoff_values.insert(cutoff_values.begin(), 0); - for (size_t i = 0; i < cutoff_values.size() - 1; ++i) { + for (const auto i : c10::irange(cutoff_values.size() - 1)) { int64_t low_idx = cutoff_values[i]; int64_t high_idx = cutoff_values[i + 1]; @@ -148,7 +148,7 @@ Tensor AdaptiveLogSoftmaxWithLossImpl::_get_full_log_prob(const Tensor& input, c out.index_put_({Slice(), Slice(None, shortlist_size)}, head_logprob.index({Slice(), Slice(None, shortlist_size)})); - for (size_t i = 0; i < cutoffs.size() - 1; ++i) { + for (const auto i : c10::irange(cutoffs.size() - 1)) { int64_t start_idx = cutoffs[i]; int64_t stop_idx = cutoffs[i+1]; const Tensor cluster_output = tail[i]->as()->forward(input); diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index d7d8dd002eb8..d3143b07ccdd 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -232,7 +232,6 @@ std::tuple _strong_wolfe(const Function& obj_fu auto d_norm = val(d.abs().max()); g = g.clone(at::MemoryFormat::Contiguous); // evaluate objective and gradient using initial step - auto obj_func_res = obj_func(x, t, d); // 
NOLINTNEXTLINE(cppcoreguidelines-init-variables) double f_new; Tensor g_new; @@ -285,7 +284,6 @@ std::tuple _strong_wolfe(const Function& obj_fu f_prev = f_new; g_prev = g_new.clone(at::MemoryFormat::Contiguous); gtd_prev = gtd_new; - obj_func_res = obj_func(x, t, d); std::tie(f_new, g_new) = obj_func(x, t, d); ls_func_evals += 1; gtd_new = g_new.dot(d); @@ -335,9 +333,7 @@ std::tuple _strong_wolfe(const Function& obj_fu } // Evaluate new point - obj_func_res = obj_func(x, t, d); - f_new = std::get<0>(obj_func_res); - g_new = std::get<1>(obj_func_res); + std::tie(f_new, g_new) = obj_func(x, t, d); ls_func_evals += 1; gtd_new = g_new.dot(d); ls_iter += 1; diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index b4bcc4e4316c..bcafabea3b4b 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +190,12 @@ Tensor norm_backward(const Tensor& grad, const Tensor& self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { +Tensor norm_backward( + Tensor grad, const Tensor& self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { + // NB: We mask fill the NaNs in the output to be zero but still do float division + // by zero, which ASAN complains about. One way to appease ASAN is to fill the problematic + // values with something arbitrary before the division, but we decide not to due to + // the perf hit. Instead we just silence ASAN where necessary size_t ndim = self.sizes().size(); double p = p_.value_or(2.0).toDouble(); Tensor self_scaled; @@ -204,34 +211,104 @@ Tensor norm_backward(Tensor grad, const Tensor& self, const optional & p } else if (p == 1.0) { return self.sgn() * grad; } else if (p == 2.0) { - self_scaled = self; - scale_v = grad / norm; + return self * (grad / norm).masked_fill_(norm == 0, 0); } else if (std::isinf(p)) { const auto self_isnan = self.isnan(); const auto norm_isnan = norm.isnan(); const auto& self_and_norm_isnan = areAnyTensorSubclassLike({self, norm}) ? 
self_isnan.logical_and(norm_isnan) : self_isnan.logical_and_(norm_isnan); - Tensor is_eq_max = (self.abs() == norm).logical_or_(self_and_norm_isnan).type_as(self); + auto is_eq_max = (self.abs() == norm).logical_or_(self_and_norm_isnan).type_as(self); self_scaled = self.sgn() * is_eq_max; - Tensor nb_max = is_eq_max.count_nonzero(dim); + auto nb_max = is_eq_max.count_nonzero(dim); if (self.dim() != 0) { nb_max = unsqueeze_multiple(nb_max, dim, ndim); } scale_v = grad / nb_max; + return self_scaled * scale_v; + } else if (p < 1.0) { + self_scaled = self.sgn() * self.abs().pow_(p - 1).masked_fill_(self == 0, 0); + return self_scaled * grad * norm.pow(1 - p); } else if (p < 2.0) { - self_scaled = self.sgn() * self.abs().pow(p - 1); + self_scaled = self.sgn() * self.abs().pow_(p - 1); scale_v = grad / norm.pow(p - 1); + scale_v.masked_fill_(norm == 0, 0); + return self_scaled * scale_v; } else { - self_scaled = self * self.abs().pow(p - 2); + self_scaled = self * self.abs().pow_(p - 2); scale_v = grad / norm.pow(p - 1); + scale_v.masked_fill_(norm == 0, 0); + return self_scaled * scale_v; + } +} + +// See norm_backward above for a note on ignoring the sanitizer +Tensor norm_jvp( + const Tensor& self_p, const Tensor& self_t, + const optional & p_, + Tensor norm, + IntArrayRef dim, + bool keepdim +) { + // NB: currently norm_jvp is also reused for dist's jvp (which haas two differentiable inputs) + // but self_t still cannot be a ZT because that would require both self_t and other_t to be ZT + TORCH_INTERNAL_ASSERT(!self_t._is_zerotensor()); + size_t ndim = self_p.dim(); // composite compliance? + double p = p_.value_or(2.0).toDouble(); + + if (p == 0.0) { + return at::zeros_like(norm); + } else if (p == 1.0) { + auto result = self_p.sgn(); + result = areAnyTensorSubclassLike({self_t}) ? result.mul(self_t.conj()) : result.mul_(self_t.conj()); + result = at::real(result); + return result.sum(dim, keepdim); + } else if (p == 2.0) { + auto result = self_p.mul(self_t.conj()); + result = at::real(result); + result = result.sum(dim, keepdim); + return result.div_(norm).masked_fill_(norm == 0, 0); + } else if (std::isinf(p)) { + if (!keepdim && self_p.dim() != 0) { + norm = unsqueeze_multiple(norm, dim, ndim); + } + const auto self_isnan = self_p.isnan(); + const auto norm_isnan = norm.isnan(); + const auto& self_and_norm_isnan = areAnyTensorSubclassLike({norm}) ? 
+ self_isnan.logical_and(norm_isnan) : + self_isnan.logical_and_(norm_isnan); + const auto is_eq_max = (self_p.abs() == norm).logical_or_(self_and_norm_isnan).type_as(norm); + auto nb_max = is_eq_max.count_nonzero(dim); + if (self_p.dim() != 0) { + nb_max = unsqueeze_multiple(nb_max, dim, ndim); + } + return (at::real(self_p.sgn() * self_t.conj()) * is_eq_max / nb_max).sum(dim, keepdim); + } else if (p < 1.0) { + auto sumpow_t = (self_p.abs().pow_(p - 1).masked_fill_(self_p == 0, 0) * at::real(self_p.sgn() * self_t.conj())).sum(dim, keepdim); + return sumpow_t * norm.pow(1 - p); + } else if (p < 2.0) { + auto sumpow_t = (self_p.abs().pow_(p - 1) * at::real(self_p.sgn() * self_t.conj())).sum(dim, keepdim); + auto out = sumpow_t / norm.pow(p - 1); + return out.masked_fill_(norm == 0, 0); + } else { + auto sumpow_t = (self_p.abs().pow_(p - 2) * at::real(self_p * self_t.conj())).sum(dim, keepdim); + auto out = sumpow_t / norm.pow(p - 1); + return out.masked_fill_(norm == 0, 0); } - // handle case at 0 where we return a subgradient containing 0 - scale_v.masked_fill_(norm == 0, 0); - return self_scaled * scale_v; } -Tensor linalg_vector_norm_backward(Tensor grad, const Tensor& self, const Scalar& scalar_ord, Tensor norm, const optional& opt_dim, bool keepdim) { +Tensor norm_jvp(const Tensor& self_p, const Tensor& self_t, const optional & p_, Tensor norm) { + return norm_jvp(self_p, self_t, p_, norm, {}, true); +} + +Tensor linalg_vector_norm_jvp(const Tensor& self_p, const Tensor& self_t, const Scalar& scalar_ord, Tensor norm, const at::OptionalIntArrayRef& opt_dim, bool keepdim) { + // No need to handle the dtype arg as it's handled via broadcasting in the function + auto dim = opt_dim.value_or(IntArrayRef({})); + return norm_jvp(self_p, self_t, scalar_ord, norm, dim, keepdim); +} + +Tensor linalg_vector_norm_backward(Tensor grad, const Tensor& self, const Scalar& scalar_ord, Tensor norm, const at::OptionalIntArrayRef& opt_dim, bool keepdim) { + // No need to handle the dtype arg as it's handled via broadcasting in the function auto dim = opt_dim.value_or(IntArrayRef({})); return norm_backward(grad, self, scalar_ord, norm, dim, keepdim); } @@ -513,6 +590,7 @@ Tensor solve_backward_self(const Tensor & grad, const Tensor & self, const Tenso } Tensor solve_backward_A(const Tensor & grad, const Tensor & self, const Tensor & A, const Tensor & solution) { + at::NoTF32Guard disable_tf32; Tensor grad_self = solve_backward_self(grad, self, A); if (self.ndimension() == 2 && A.ndimension() == 2) { return -at::mm(grad_self, solution.mH()); @@ -548,7 +626,7 @@ Tensor logcumsumexp_backward(Tensor grad, const Tensor & self, Tensor result, in // Reference: https://github.com/tensorflow/tensorflow/blob/ // 2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863 - return AT_DISPATCH_FLOATING_TYPES( + return AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, at::typeMetaToScalarType(grad.dtype()), "logcumsumexp_backward", [grad, self, result, dim]() { @@ -715,6 +793,22 @@ std::tuple clamp_backward_min_max( return ret; } +at::Tensor clamp_jvp( + const Tensor& self_p, const Tensor& self_t, + const Tensor& min_p, const Tensor& min_t, + const Tensor& max_p, const Tensor& max_t +) { + if (min_p.defined() && max_p.defined()) { + return where(min_p > max_p, max_t, where(self_p < min_p, min_t, where(self_p > max_p, max_t, self_t))); + } else if (min_p.defined()) { + return where(self_p > min_p, self_t, min_t); + } else if (max_p.defined()) { + return where(self_p < max_p, self_t, 
max_t); + } else { + return self_t; + } +} + Tensor convolution_jvp( const Tensor& input_p, const Tensor& input_t, const Tensor& weight_p, const Tensor& weight_t, @@ -762,7 +856,7 @@ Tensor convolution_backward_jvp_grad_bias( } else { TORCH_INTERNAL_ASSERT( false, - "convolution_backward_jvp_grad_bias expected dim of grad_out_t to be 3, 4, or 4, but got: ", + "convolution_backward_jvp_grad_bias expected dim of grad_out_t to be 3, 4, or 5, but got: ", grad_out_t.dim()); } } @@ -795,46 +889,51 @@ at::IntArrayRef strides_or_error(const Tensor & input, c10::string_view const & "Please either use a strided tensor or set requires_grad=False for '", input_name, "'"); if (input.is_mkldnn()) return IntArrayRef({}); + if (input.is_sparse_csr()) return IntArrayRef({}); return input.strides(); } else { return IntArrayRef({}); } } -Tensor mm_mat1_backward(const Tensor & grad, const Tensor & mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, const Scalar & alpha) { - // if input was column-major, return grad as column-order for efficiency - if (mat1_strides[0] == 1 && mat1_strides[1] == mat1_sizes[0]) { - return maybe_multiply(mat2.conj().mm(grad.t()).t(), alpha.conj()); - } else { - return maybe_multiply(grad.mm(mat2.t().conj()), alpha.conj()); - } -} - -Tensor mm_mat2_backward(const Tensor & grad, const Tensor & mat1, IntArrayRef sizes, IntArrayRef strides, const Scalar & alpha) { - // if input was column-major, return grad as column-order for efficiency - if (strides[0] == 1 && strides[1] == sizes[0]) { - if (mat1.is_sparse()) { - // Since mm(dense, sparse) doesn't exist, - // pass a transposed output matrix to the underlying "addmm" - // function directly. - int64_t out_rows = mat1.size(1); - int64_t out_cols = grad.size(1); - Tensor t = at::zeros({}, grad.options()).expand({out_rows, out_cols}, true); - Tensor r = at::empty({out_cols, out_rows}, grad.options()).t(); - at::addmm_out(r, t, mat1.t(), grad, alpha, 1); - return r; +Tensor mm_mat1_backward(const Tensor& grad, const Tensor& mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, c10::Layout mat1_layout, const Scalar& alpha) { + if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1_layout == c10::kStrided) { + // if input was column-major, return grad as column-order for efficiency + if (mat1_strides[0] == 1 && mat1_strides[1] == mat1_sizes[0]) { + return maybe_multiply(mat2.conj().mm(grad.t()).t(), alpha.conj()); } - return maybe_multiply(grad.t().mm(mat1.conj()).t(), alpha.conj()); - } else { - return maybe_multiply(mat1.t().conj().mm(grad), alpha.conj()); } + + // General fallback, should work for any layout + return maybe_multiply(grad.mm(mat2.t().conj()), alpha.conj()); } -Tensor _sparse_addmm_sparse_backward(const Tensor& grad, const Tensor& sparse_, const Tensor& dense, const Scalar& alpha) { - AT_ASSERT(sparse_.is_sparse()); - auto sparse = sparse_.coalesce(); - Tensor grad_sparse = maybe_multiply(grad.mm(dense.conj().t()), alpha); - return grad_sparse.sparse_mask(sparse); +Tensor mm_mat2_backward(const Tensor& grad, const Tensor& mat1, IntArrayRef mat2_sizes, IntArrayRef mat2_strides, c10::Layout mat2_layout, const Scalar& alpha) { + if (grad.layout() == c10::kStrided && mat1.layout() == c10::kStrided && mat2_layout == c10::kStrided) { + // if input was column-major, return grad as column-order for efficiency + if (mat2_strides[0] == 1 && mat2_strides[1] == mat2_sizes[0]) { + return maybe_multiply(grad.t().mm(mat1.conj()).t(), alpha.conj()); + } + } + + // General fallback, should work 
for any layout + return maybe_multiply(mat1.t().conj().mm(grad), alpha.conj()); +} + +Tensor mm_mat1_sparse_backward(const Tensor& grad, const Tensor& mat1, const Tensor& mat2, const Scalar& alpha) { + if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1.is_sparse()) { + auto sparse = mat1.coalesce(); + Tensor grad_sparse = maybe_multiply(grad.mm(mat2.conj().t()), alpha); + return grad_sparse.sparse_mask(sparse); + } else if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1.is_sparse_csr()) { + return at::sparse_sampled_addmm(at::zeros_like(mat1, mat1.options()), grad, mat2.mH(), 1.0, alpha); + } else if (grad.layout() == c10::kStrided && mat2.layout() == c10::kStrided && mat1.layout() == c10::kStrided) { + return maybe_multiply(grad.mm(mat2.mH()), alpha); + } + TORCH_CHECK(false, "sparse_addmm_sparse_backward: unsupported combination of layouts", + ", grad: ", grad.layout(), + ", mat1: ", mat1.layout(), + ", mat2: ", mat2.layout()); } // This function return a new SparseTensor with values from Tensor `input` filtered by indices of `mask` @@ -907,7 +1006,7 @@ Tensor renorm_backward(const Tensor & grad, const Tensor & self, const Scalar& p self, p, reduce_dims, /*keepdim=*/true); } - const auto real_acc_type = c10::toValueType(acc_type); + const auto real_acc_type = c10::toRealValueType(acc_type); auto grad_output = (self.conj() * grad); // vector_norm output is real, so grad_output must also be real if (real_acc_type != acc_type) { @@ -915,8 +1014,7 @@ Tensor renorm_backward(const Tensor & grad, const Tensor & self, const Scalar& p } grad_output = grad_output.sum( reduce_dims, /*keepdim=*/true, /*dtype=*/real_acc_type); - auto nb = linalg_vector_norm_backward( - grad_output, self, p, norm, reduce_dims, /*keepdim=*/true); + auto nb = norm_backward(grad_output, self, p, norm, reduce_dims, /*keepdim=*/true); auto invnorm = (norm + 1e-7).reciprocal(); auto grad_norm = maxnorm * invnorm * (grad - invnorm * nb); @@ -1048,7 +1146,7 @@ static Tensor var_backward(const Tensor & grad, const Tensor & self, int64_t cor return (2.0 / (self.numel() - correction)) * grad * (self - self.mean()); } -Tensor var_backward(Tensor grad, const Tensor& self, c10::optional dim_opt, +Tensor var_backward(Tensor grad, const Tensor& self, at::OptionalIntArrayRef dim_opt, c10::optional correction_opt, bool keepdim) { auto correction = correction_opt.value_or(1); if (self.dim() == 0 || !dim_opt.has_value()) { @@ -1063,7 +1161,7 @@ Tensor var_backward(Tensor grad, const Tensor& self, c10::optional return (2.0 / dof) * grad * (self - self.mean(dim, /*keepdim=*/true)); } -Tensor var_jvp(const Tensor& self_t, const Tensor& self_p, const Tensor& result, c10::optional dim_opt, +Tensor var_jvp(const Tensor& self_t, const Tensor& self_p, const Tensor& result, at::OptionalIntArrayRef dim_opt, c10::optional correction_opt, bool keepdim) { auto correction = correction_opt.value_or(1); if (self_p.dim() == 0 || !dim_opt.has_value()) { @@ -1076,7 +1174,7 @@ Tensor var_jvp(const Tensor& self_t, const Tensor& self_p, const Tensor& result, Tensor std_backward( const Tensor& result, const Tensor& grad, const Tensor& self, - c10::optional dim, c10::optional correction, bool keepdim) { + at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim) { auto grad_var = (grad / (result * 2)).masked_fill_(result == 0, 0); return var_backward(grad_var, self, dim, correction, keepdim); } @@ -1091,7 +1189,7 @@ Tensor mean_backward(Tensor grad, const IntArrayRef sizes, int64_t numel) { static 
Tensor mean_backward( const Tensor& grad, const IntArrayRef sizes, int64_t numel, - c10::optional dim, bool keepdim) { + at::OptionalIntArrayRef dim, bool keepdim) { if (dim.has_value()) { return mean_backward(grad, sizes, *dim, keepdim); } else { @@ -1101,7 +1199,7 @@ static Tensor mean_backward( Tensor var_std_mean_backward( const variable_list& grads, const Tensor& self, const Tensor& r1, - const Tensor& r2, c10::optional dim, + const Tensor& r2, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, bool is_std) { Tensor grad; if (grads[0].defined()) { @@ -1131,59 +1229,88 @@ Tensor masked_scatter_backward(const Tensor & grad, const Tensor & mask, IntArra return mask_selected.view(sizes); } -Tensor cholesky_jvp(const Tensor& input_tangent, const Tensor& L, bool upper) { - // Differentiation of the Cholesky decomposition, Iain Murray - // https://arxiv.org/abs/1602.07527 - // equation 8 - auto input_tangent_ = upper ? input_tangent.mH() : input_tangent; +Tensor cholesky_jvp(const Tensor& dA, const Tensor& L, bool upper) { + at::NoTF32Guard disable_tf32; + // Let A = LL^H + // dA = dLL^H + L(dL)^H + // L^{-1}dA(L^{-H}) = L^{-1}dL + (L^{-1}dL)^H + // = sym(L^{-1}dL) + // where sym(X) = X + X^H + // A short computaiton gives that the inverse of sym is given by + // \pi(X) = X.tril() - 0.5*diag(X) + // so + // dL = L\pi(L^{-1}dA(L^{-H})) + + // Precondition: dA is symmetric/Hermitian auto L_ = upper ? L.mH() : L; - - auto L_inverse = at::linalg_solve_triangular(L_, at::eye(L.size(-1), L.options()), /*upper=*/false); - auto phi = at::matmul(at::matmul(L_inverse, input_tangent_), L_inverse.mH()); - phi.tril_().diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).mul_(0.5); - auto L_tangent = L_.matmul(phi); - return upper ? L_tangent.mH() : L_tangent; + auto dL = at::linalg_solve_triangular(L_, dA, /*upper=*/false, /*left=*/true); + dL = at::linalg_solve_triangular(L_.mH(), dL, /*upper=*/true, /*left=*/false); + dL = dL.tril() - dL.diagonal(0, -2, -1).mul(0.5).diag_embed(); + dL = L_.matmul(dL); + return upper ? dL.mH() : dL; } -Tensor cholesky_backward(Tensor grad, bool upper, Tensor L) { - // cf. Iain Murray (2016); arXiv 1602.07527 - // This gradient is symmetric, and not triangular. - // Cholesky additionally assumes that the input is symmetric, which is a subspace of - // R^{n x n}, and hence the derivative is not well-defined for off-diagonal - // elements. We resolve this by taking the gradient of the functionally independent - // elements of the matrix (i.e., the lower triangular portion of the input) and then - // reflect it on the upper triangular portion, thereby symmetrizing the gradient of - // the cholesky operation. The motivation behind this choice is that symmetric gradient - // leads to stable gradient updates, and retains symmetry of the updated matrix if it - // were updated by a gradient based algorithm. 
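Restated in standard notation (nothing beyond what the comment in the hunk already says, under its precondition that dA is Hermitian), the identity implemented by the new cholesky_jvp is:

```latex
% Cholesky JVP identity (restated from the comment above; dA assumed Hermitian)
A = L L^{H}, \qquad dA = dL\,L^{H} + L\,dL^{H}
\;\Longrightarrow\;
L^{-1}\,dA\,L^{-H} = \Phi + \Phi^{H} =: \operatorname{sym}(\Phi), \qquad \Phi = L^{-1}dL .
% \Phi is lower triangular, and on that subspace sym is inverted by
% \pi(X) = \operatorname{tril}(X) - \tfrac{1}{2}\operatorname{diag}(X), so
dL = L\,\pi\!\bigl(L^{-1}\,dA\,L^{-H}\bigr).
```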
- if (upper) { - L = L.mH(); - grad = grad.mH(); - } - auto L_inverse = at::linalg_solve_triangular(L, at::eye(L.size(-1), L.options()), /*upper=*/false); - auto phi = at::matmul(L.mH(), grad); - phi.tril_().diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).mul_(0.5); - - auto grad_input = at::matmul(at::matmul(L_inverse.mH(), phi), L_inverse); - return grad_input.add(grad_input.mH()).mul_(0.5); // Symmetrizing the gradient +Tensor cholesky_backward(const Tensor& gL, bool upper, const Tensor& L) { + at::NoTF32Guard disable_tf32; + // From cholesky_jvp we have that + // dL = L\pi(L^{-1}dA(L^-H)) + // + // Let gL be the projection into the lower-triangular gradient wrt L. Taking adjoints we have + // gA = L^{-H}\pi^*((L^HgL).tril())L^{-1} + // where \pi^*(X) = 0.5 * (X + X^H - diag(X)) + // The only non-standard point of this derivation is noting that the adjoint to multiplying + // on the left by a lower triangular matrix L is multiplying by L^H and then projecting back to + // the lower triangular matrices (hence the .tril() projection) + // Note that the gradient is symmetric and not triangular. + auto L_ = upper ? L.mH() : L; + auto gL_ = upper ? gL.mH() : gL; + + // Nb. We don't need to compute gL_ = gL.tril() as + // tril(L^H gL) = tril(L^H (triu(gL, 1) + tril(gL))) + // = tril(L^H tril(gL)) + tril(L^H triu(gL, 1)) + // = tril(L^H tril(gL)) + // since tril(L^H triu(gL, 1)) = 0, as L^H triu(gL, 1) is upper triangular + auto gA = L_.mH().matmul(gL_).tril(); + // Equivalent to 0.5 * (gA + gA^H - diag(gA)) + gA = 0.5 * (gA + gA.tril(-1).mH()); + gA = at::linalg_solve_triangular(L_.mH(), gA, /*upper=*/true, /*left=*/true); + gA = at::linalg_solve_triangular(L_, gA, /*upper=*/false, /*left=*/false); + return gA; } Tensor cholesky_inverse_backward(Tensor grad, Tensor L, bool upper, Tensor inverse) { + at::NoTF32Guard disable_tf32; Tensor grad_L; if (grad.defined()) { - Tensor common_term = grad + grad.mT(); + Tensor common_term = grad + grad.mH(); common_term = at::matmul(inverse, at::matmul(common_term, inverse)); if (upper) { grad_L = -at::matmul(L, common_term); } else { grad_L = -at::matmul(common_term, L); } - } else { - grad_L = at::zeros({1}, L.options()).expand_as(L); } + return grad_L; } +// If X = (L L^H)^{-1} with L lower-triangular with a real positive diagonal, +// then dX = K^H + K, where +// K = L^{-H} dL^{-1} [dL^{-1} = -L^{-1} dL L^{-1}] +// = -L^{-H} L^{-1} dL L^{-1} [L^{-H} L^{-1} = X] +// = -X dL L^{-1} [X = X^H = L^{-H} L^{-1} = L^{-1} L^{-H}] +// = -X dL X L^{H}. +// If X = (U^H U)^{-1} with U upper-triangular with a real positive diagonal, +// then K becomes +// K = -X dU^H X U +Tensor cholesky_inverse_jvp(const Tensor& F, const Tensor& dF, const Tensor& X, bool upper) { + at::NoTF32Guard disable_tf32; + const auto CF = upper ? F : F.mH(); + const auto dCF = upper ? dF.mH() : dF; + const auto partial_dX = -X.matmul(dCF).matmul(X).matmul(CF); + return partial_dX + partial_dX.mH(); +} + // The formula for forward AD is adapted from // // Golub, Gene H., and Victor Pereyra. 
"The Differentiation of Pseudo-Inverses and Nonlinear @@ -1308,11 +1435,18 @@ Tensor split_backward(const std::vector &grads, Tensor max_pool_double_backward(const Tensor & grad, const Tensor & indices, int dim) { AT_ASSERT(indices.dim() >= dim); - auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); - size.push_back(-1); - auto indices_view = indices.view(size); - const auto memory_format = indices.suggest_memory_format(); - return grad.contiguous(memory_format).view(size).gather(-1, indices_view).view(indices.sizes()); + // handle non-empty inputs + if (indices.numel()) { + auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); + size.push_back(-1); + auto indices_view = indices.view(size); + const auto memory_format = indices.suggest_memory_format(); + return grad.contiguous(memory_format).view(size).gather(-1, indices_view).view(indices.sizes()); + } + // handle empty inputs + else { + return at::empty_like(indices, grad.options()); + } } Tensor glu_double_backward(const Tensor & grad, const Tensor & grad_output, const Tensor & input, int64_t dim) { @@ -1445,6 +1579,45 @@ Tensor binary_cross_entropy_target_backward( return grad_target; } +Tensor binary_cross_entropy_double_backward_target( + const Tensor& grad, + const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + const c10::optional& weight, + int64_t reduction +) { + auto res = -grad * grad_output; + + if (isDefined(weight)) { + res = isTensorSubclassLike(weight.value()) + ? res.mul(weight.value()) + : res.mul_(weight.value()); + } + + auto neg_self = 1 - self; + auto denom = isTensorSubclassLike(self) + ? neg_self.mul(self) + : neg_self.mul_(self); + { + at::NoGradGuard guard; + // Default eps in binary_cross_entropy for ALL dtypes + // TODO: probably change this to a dtype-dependent value + double eps = 1e-12; + denom.clamp_min_(eps); + } + + res = isTensorSubclassLike(denom) + ? res.div(denom) + : res.div_(denom); + + if (reduction == at::Reduction::Mean) { + res.div_(target.numel()); + } + + return res; +} + Tensor binary_cross_entropy_with_logits_target_backward(const Tensor& grad_output, const Tensor& self, const Tensor& target, const c10::optional& weight, const c10::optional& pos_weight, int64_t reduction) { Tensor grad_target; @@ -1497,8 +1670,8 @@ Tensor binary_cross_entropy_with_logits_jvp(const Tensor& input_t, const Tensor& } if (weight.defined()) { - grad_input.mul_(weight); - grad_target.mul_(weight); + grad_input = grad_input.mul(weight); + grad_target = grad_target.mul(weight); } return apply_loss_reduction(grad_target + grad_input, reduction); } @@ -2249,6 +2422,22 @@ std::tuple atan2_backward(const Tensor& grad, const Tensor& self output_mask[1] ? grad * -self * recip : Tensor() }; } +Tensor prelu_jvp(const Tensor& x, const Tensor& dx, const Tensor& w, const Tensor& dw) { + const auto ndim = x.dim(); + auto as_nd = [ndim](const Tensor& t) { + std::vector sizes(ndim, 1), strides(ndim, 0); + if (ndim >= 2) { + sizes[1] = t.dim() == 1 ? t.sizes()[0] : 1; + strides[1] = t.dim() == 1 ? t.strides()[0] : 0; + return t.as_strided(sizes, strides); + } + return t.as_strided(sizes, strides); + }; + auto w_ = as_nd(w); + auto dw_ = as_nd(dw); + return at::where(x >= 0, dx, w_ * dx + dw_ * x); +} + // TODO: Seriously consider writing the derivative formulas for // each output separately; there is not all that much sharing // of computation going on here. 
@@ -2338,6 +2527,47 @@ std::tuple prelu_double_backward( } } +Tensor gelu_double_backward( + const Tensor & ggI, + const Tensor & gO, + const Tensor & input, + c10::string_view approximate) { + //if (at::native::get_gelutype_enum(approximate) == at::native::GeluType::Tanh) { + if (approximate == "tanh") { + constexpr auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + constexpr auto kKappa = 0.044715; + + auto inner = kBeta * (input + kKappa * pow(input, 3)); + auto tanh_inner = tanh(inner); + auto sech_inner = 1 / cosh(inner); + + auto f = 0.5 * input; + auto g = 1 - tanh_inner * tanh_inner; + auto h = kBeta * (1 + 3 * kKappa * input * input); + + auto f_prime_gh = 0.5 * g * h; + + auto g_prime = (2 * sech_inner) * (-sech_inner * tanh_inner) * h; + auto g_prime_fh = f * h * g_prime; + + auto h_prime = 6 * kKappa * input * kBeta; + auto h_prime_fg = f * g * h_prime; + + // left_derivative = f_prime_gh + // right_derivative = f_prime_gh + g_prime_fh + h_prime_fg + // dgrad_dX = left_derivative + right_derivative + auto gI = ggI * gO * (2 * f_prime_gh + g_prime_fh + h_prime_fg); + return gI; + } else { + constexpr auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + auto input_sq = input * input; + auto pdf = kBeta * at::exp(-0.5 * input_sq); + auto dgrad_dInput = 2 * pdf - input_sq * pdf; + auto gI = ggI * gO * dgrad_dInput; + return gI; + } +} + Tensor elu_double_backward( const Tensor& grad, const Tensor& grad_output, @@ -2372,6 +2602,7 @@ std::tuple linalg_svd_jvp(const Tensor& dA, const Tensor& S, const Tensor& Vh_, const bool full_matrices) { + at::NoTF32Guard disable_tf32; // See svd_backward for the derivation // With sym(X) = X + X^H, we implement // dU = U (sym(dX S) / E + i Im(diag(dX)) / (2S)) @@ -2475,6 +2706,7 @@ Tensor svd_backward(const Tensor& gU, const Tensor& U, const Tensor& S, const Tensor& Vh) { + at::NoTF32Guard disable_tf32; // Throughout both the real and complex case we assume A has distinct singular values. // Furthermore, if A is rectangular or complex, we assume it's full-rank. // @@ -2684,6 +2916,7 @@ Tensor svd_backward(const Tensor& gU, // See the details below. Tensor eig_backward(const std::vector &grads, const Tensor& self, bool is_eigvec_tensor_nonempty, const Tensor& eigenvalues, const Tensor& eigenvectors) { + at::NoTF32Guard disable_tf32; TORCH_CHECK(is_eigvec_tensor_nonempty, "eig_backward: torch.eig(eigenvalues=False) is not differentiable. ", "Please use torch.linalg.eigvals"); @@ -2823,6 +3056,7 @@ Tensor linalg_eig_backward(const Tensor& gL, const Tensor& V, const bool is_hermitian, const bool symeig_eigenvectors) { + at::NoTF32Guard disable_tf32; // https://arxiv.org/pdf/1701.00392.pdf Eq 4.77 // For A = VLV^{-1}, denoting the gradients gA, gV and gL, we have // gA = V^{-H}(diag_embed(gL) + (V^H gV -V^HV diag(real(V^H gV))) / E*)V^H @@ -2905,6 +3139,7 @@ std::tuple linalg_eig_jvp(const Tensor& dA, const Tensor& L, const Tensor& V, const bool is_hermitian) { + at::NoTF32Guard disable_tf32; // https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf // see also https://arxiv.org/pdf/1701.00392.pdf Eqs. 
(4.60) and (4.63) // Note that neither of the formulas in these pdfs are correct, as they do not assume that @@ -2917,12 +3152,7 @@ std::tuple linalg_eig_jvp(const Tensor& dA, // E_{ij} = L_j - L_i if i != j // 1 otherwise - // Note: The Hermitian case is a simplification of this formula using that V^{-1} = V^H and that L is real - if (is_hermitian) { - TORCH_CHECK(at::allclose(dA, dA.mH(), /*rtol=*/1e-2, /*atol=*/1e-2), - "linalg_eig_jvp: The tangent part of the matrix A should also be ", (dA.is_complex() ? "Hermitian" : "symmetric.")); - } - + // Precondition: if is_hermitian == true, then dA is Hermitian const auto to_complex = [](const Tensor& A){ return A.to(c10::toComplexType(A.scalar_type())); }; const auto dP = is_hermitian ? at::matmul(at::matmul(V.mH(), dA), V) @@ -2952,6 +3182,7 @@ Tensor linalg_lstsq_jvp( const Tensor& dA, const Tensor& dB ) { + at::NoTF32Guard disable_tf32; auto pinvA = at::linalg_pinv(A); auto dpinvA = pinv_jvp(A, pinvA, dA); auto dX = dpinvA.matmul(B) + pinvA.matmul(dB); @@ -2966,6 +3197,7 @@ std::tuple linalg_lstsq_backward( const c10::optional driver, const std::array& grad_input_mask ) { + at::NoTF32Guard disable_tf32; Tensor A_grad, B_grad; if (!grad.defined()) { return std::make_tuple(A_grad, B_grad); @@ -2997,168 +3229,179 @@ std::tuple linalg_lstsq_backward( std::tuple linalg_qr_jvp( const Tensor& dA, const Tensor& Q, - const Tensor& R + const Tensor& R, + const c10::string_view mode ) { - auto m = dA.size(-2); - auto n = dA.size(-1); - auto k = std::min(m, n); - - auto dA1 = dA.narrow(-1, 0, k); - auto R1 = R.narrow(-1, 0, k); - - // dB1 = Q^H dA1 R1^{-1} - auto dB1 = at::linalg_solve_triangular(R1, Q.mH().matmul(dA1), /*upper=*/true, /*left=*/false); - - // dC1 = (dB1 + dB1^H).triu(-1) + (dB1 + dB1^H) * 0.5 I - auto dC1 = (dB1 + dB1.mH()).triu(); - dC1.diagonal(0, -2, -1).mul_(0.5); + // dA = dQR + QdR + // + // Case m >= n + // We can put dQ in terms of dR + // dQ = dAR^{-1} - QdRR^{-1} + // Then we have + // Q^H dA R^{-1} = Q^HdQ + dRR^{-1} + // where Q^HdQ is skew Hermitian and dRR^{-1} is upper triangular + // Define sym(X) = X + X^H + // sym(dRR^{-1}) = sym(Q^H dA R^{-1}) + // and define syminv(X) = triu(X) - 0.5 * diag(X) the inverse of + // sym : Triu(k, diag \in \mathbb{R}) -> Her(k) to give + // dR = syminv(sym(Q^H dA R^{-1}))R + // + // Case m < n + // Put dR as a function of dQ + // dR = Q^H dA - Q^H dQ R + // Let X_1 be the main m x m submatrix of a matrix X \in C^{m x n} + // Q^H A_1 R_1^{-1} = Q^H dQ + dR_1 R_1^{-1} + // Define trilIm(X) = X.tril(-1) + i * Im diag(X) + // trilIm(Q^H dQ) = trilIm(Q^H A_1 R_1^{-1}) + // and define trilIminv(X) = X - X^H - i*Im diag(X). This is the inverse of + // trilIm : Skew_C(m) -> Tril(m, imaginary diag) + // Note that it is just the inverse when the inputs are skew-Hermitian, not necessarily + // when the inputs are arbitrary matrices. We then get + // dQ = Q trilImInv(trilIm(Q^H A_1 R_1^{-1})) + at::NoTF32Guard disable_tf32; - auto dR1 = dC1.matmul(R1); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool compute_q, reduced; + std::tie(compute_q, reduced) = at::native::_parse_qr_mode(mode); - // dQ = (dA1 - Q dR1) R1^{-1} - auto dQ = at::linalg_solve_triangular(R1, dA1 - Q.matmul(dR1), /*upper=*/true, /*left=*/false); + TORCH_CHECK(compute_q, "The derivative of linalg.qr depends on Q, which is not computed when " + "mode='r'. 
Please use linalg.qr(A, mode='reduced') if you are " + "going to differentiate through linalg.qr."); + auto m = dA.size(-2); + auto n = dA.size(-1); - Tensor dR; + TORCH_CHECK(reduced || m <= n, "The QR decomposition is not differentiable when " + "mode='complete' and nrows > ncols."); if (m >= n) { - dR = dR1; - } - else { - auto dA2 = dA.narrow(-1, k, n - k); - auto R2 = R.narrow(-1, k, n - k); - auto dR2 = Q.mH().matmul(dA2 - dQ.matmul(R2)); - dR = at::cat({dR1, dR2}, -1); - } - - return std::make_tuple(dQ, dR); -} + const auto sym = [](const Tensor& X) { return X + X.mH(); }; + const auto syminv = [](const Tensor& X) { + auto ret = X.triu(); + ret.diagonal(0, -2, -1).mul_(0.5); + return ret; + }; + auto dARinv = at::linalg_solve_triangular(R, dA, /*upper=*/true, /*left=*/false); + auto dR = syminv(sym(Q.mH().matmul(dARinv))); + auto dQ = dARinv - Q.matmul(dR); + dR = dR.matmul(R); + return std::make_tuple(std::move(dQ), std::move(dR)); + } else { + const auto trilim = [](const Tensor& X) { + if (X.is_complex()) { + auto ret = X.tril(); + at::real(ret.diagonal(0, -2, -1)).zero_(); + return ret; + } else { + return X.tril(-1); + } + }; + const auto triliminv = [](const Tensor& X) { + if (X.is_complex()) { + auto ret = X - X.mH(); + ret.diagonal(0, -2, -1).mul_(0.5); + return ret; + } else { + return X - X.mT() ; + } + }; -Tensor linalg_qr_jvp_Q( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -) { - return std::get<0>(linalg_qr_jvp(dA, Q, R)); + auto QHdA = Q.mH().matmul(dA); + auto QHdA1Rinv = at::linalg_solve_triangular(R.narrow(-1, 0, m), QHdA.narrow(-1, 0, m), /*upper=*/true, /*left=*/false); + auto dQ = triliminv(trilim(QHdA1Rinv)); + auto dR = QHdA - dQ.matmul(R); + dQ = Q.matmul(dQ); + return std::make_tuple(std::move(dQ), std::move(dR)); + } } -Tensor linalg_qr_jvp_R( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -) { - return std::get<1>(linalg_qr_jvp(dA, Q, R)); -} +Tensor linalg_qr_backward(const Tensor& gQ, const Tensor& gR, + const Tensor& Q, const Tensor& R, + const c10::string_view mode) { + // Nb. We won't be too formal below, as writing this proof formaly is a pain + // We'll link here a formal writing of all this at some point in the future + // + // Case m >= n + // dQ = dAR^{-1} - Qsyminv(sym(Q^H dA R^{-1})) + // dR = syminv(sym(Q^H dA R^{-1}))R + // + // With the notation from the JVP formla, the only two computations that we need are + // syminv*(R) = 0.5 * (R.triu() + R.triu()^H - Re diag(R)) + // sym*(X) = 2 * X + // Using these, after a few simplifications we get that + // gA = (gQ + syminvadj(triu(gR R^H - Q^H gQ)))R^{-H} + // + // Case m < n + // dR = Q^H dA - Q^H dQ R + // dQ = Q trilImInv(trilIm(Q^H A_1 R_1^{-1})) + // + // In this case trilIm*(X) = X (it's the trivial embedding) + // while trilImInv*(X) = tril(Y) - 0.5 * diag(Y) + // with Y = X - X^H + // + // We also have that if X \in C^{m, n} an dpi(X) = X_1, + // projects X into its leading m x m submatrix, + // pi*(X) = cat(X, 0_{m,n-m}, dim=-1) + // + // Using this, we get that + // gA = QgR + pi*(Q trilImInv*(Q^H gQ - gR R^H)R_1^{-H}) + at::NoTF32Guard disable_tf32; -Tensor linalg_qr_backward(const std::vector &grads, const Tensor& self, - c10::string_view mode, const Tensor& q, const Tensor& r){ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) bool compute_q, reduced; std::tie(compute_q, reduced) = at::native::_parse_qr_mode(mode); - TORCH_CHECK(compute_q, "The derivative of qr is not implemented when mode='r'. 
" - "Please use torch.linalg.qr(..., mode='reduced')"); - - auto square_deep_case_backward = [](const Tensor& grad_Q, - const Tensor& grad_R, - const Tensor& A, - const Tensor& Q, - const Tensor& R) -> Tensor { - // For square and deep (tall) case we refer: - // Matthias Seeger, Asmus Hetzel, Zhenwen Dai, Eric Meissner, Neil D. Lawrence (2018). Auto-Differentiating Linear Algebra. - // https://arxiv.org/abs/1710.08717 Section 4.3 LQ Decomposition (Note that LQ decomposition is the transpose of QR decomposition) - // Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable Programming Tensor Networks. - // https://arxiv.org/abs/1903.09650 Section 3. QR factorization - // For derivations of complex-valued input case, see https://giggleliu.github.io/2019/04/02/einsumbp.html - - // Compute R grad_R^H - Tensor R_term; - if (grad_R.defined()) { - R_term = at::matmul(R, grad_R.mH()); - } else { - // R is ... x N x N, grad_R is ... x N x N and grad_R.T is ... x N x N - R_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - // Compute grad_Q^H Q - Tensor Q_term; - if (grad_Q.defined()) { - Q_term = at::matmul(grad_Q.mH(), Q); - } else { - // Q is ... x M x N, Q.T is ... x N x M and grad_Q is ... x M x N - Q_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } + TORCH_CHECK(compute_q, "The derivative of linalg.qr depends on Q, which is not computed when " + "mode='r'. Please use linalg.qr(A, mode='reduced') if you are " + "going to differentiate through linalg.qr."); - Tensor M = R_term - Q_term; + auto m = Q.size(-2); + auto n = R.size(-1); - // Compute M = (tril(M) + tril(M).mH()) * 0.5 Identity - Tensor M_tril = at::tril(M); - M = M_tril + M_tril.mH(); - M.diagonal(0, -2, -1).mul_(0.5); + TORCH_CHECK(reduced || m <= n, "The QR decomposition is not differentiable when " + "mode='complete' and nrows > ncols."); - Tensor rhs_term; - if (grad_Q.defined()) { - rhs_term = grad_Q + at::matmul(Q, M); + if (!gQ.defined() && !gR.defined()) { + return {}; + } + + Tensor gA; + if (gQ.defined()) { + if (gR.defined()) { + gA = gR.matmul(R.mH()) - Q.mH().matmul(gQ); } else { - rhs_term = at::matmul(Q, M); + gA = -Q.mH().matmul(gQ); } - - // Compute rhs_term @ R^{-H} - Tensor grad_A = at::linalg_solve_triangular( - R.transpose(-2, -1).conj(), - rhs_term, - /*upper=*/false, - /*left=*/false, - /*unitriangular=*/false); - - return grad_A; - }; - - auto m = self.size(-2); - auto n = self.size(-1); - - TORCH_CHECK( - ((m <= n && (!reduced)) || reduced), - "The derivative of qr is not implemented when mode='complete' and nrows > ncols."); - - auto grad_Q = grads[0]; - auto grad_R = grads[1]; - - if (m >= n) { - return square_deep_case_backward(grad_Q, grad_R, self, q, r); } else { - // For wide (m < n) input matrices A, partition A = [X|Y] and R = [U|V] - // X and U are square full rank matrices. We will partition grads, - // grad_R = [grad_U | grad_V] and grad_A = [grad_X | grad_Y]. - // To obtain grad_X we reuse the gradient formula from the square case. - // Formulae: grad_X = square_case_grad(grad_Q_prime, grad_U, Q, U), - // where grad_Q_prime = grad_Q + Y @ grad_V^H - // and grad_Y = Q @ grad_V. - // Then concatenate grads to get grad_A = [grad_X | grad_Y]. 
- - auto Y = self.narrow(-1, m, n - m); - auto U = r.narrow(-1, 0, m); - Tensor grad_Y, grad_X, grad_V, grad_Q_prime; - - if (grad_R.defined()) { - grad_V = grad_R.narrow(-1, m, n - m); - // reuse grad_R to store grad_U - grad_R = grad_R.narrow(-1, 0, m); - // grad_Q_prime starts with the value of Y @ grad_V^H - grad_Q_prime = at::matmul(Y, grad_V.mH()); - } else { - // when grad_R is not defined then grad_V and grad_Q_prime - // get initialized with zeros - grad_V = at::zeros_like(Y, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - grad_Q_prime = at::zeros_like(q, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + gA = gR.matmul(R.mH()); + } + if (m >= n) { + const auto syminvadj = [](const Tensor& X) { + auto ret = X + X.mH(); + at::real(ret.diagonal(0, -2, -1)).mul_(0.5); + return ret; + }; + gA = Q.matmul(syminvadj(gA.triu())); + if (gQ.defined()) { + gA = gA + gQ; } - - if (grad_Q.defined()) { - // add the grad_Q term into grad_Q_prime when defined o/w is 0 - grad_Q_prime = grad_Q_prime + grad_Q; + gA = at::linalg_solve_triangular(R.mH(), gA, /*upper*/false, /*left*/false); + return gA; + } else { + auto trilImInvAdjSkew = [](const Tensor& X) { + auto ret = (X - X.mH()).tril(); + if (X.is_complex()) { + at::imag(ret.diagonal(0, -2, -1)).mul_(0.5); + } + return ret; + }; + gA = Q.matmul(trilImInvAdjSkew(-gA)); + gA = at::linalg_solve_triangular(R.narrow(-1, 0, m).mH(), gA, /*upper*/false, /*left*/false); + auto shape = R.sizes().vec(); + shape.end()[-1] = n - m; + gA = at::cat({gA, gA.new_zeros(shape)}, /*dim=*/-1); + if (gR.defined()) { + gA = gA + Q.matmul(gR); } - // Calculate grad_X using the helper. Grad_R contains the grad_U value - grad_X = square_deep_case_backward(grad_Q_prime, grad_R, self, q, U); - grad_Y = at::matmul(q, grad_V); - // Concatenate grad_X and grad_Y to get grad_A. 
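Many of the hunks that follow prepend `at::NoTF32Guard disable_tf32;` to linear-algebra backward/JVP formulas. A minimal sketch of the pattern, with a hypothetical `example_backward` standing in for any of them; the rationale given in the comment (keeping matmul-heavy gradients at full float32 precision) is an inference rather than something spelled out in the diff.

```cpp
#include <ATen/ATen.h>

// Hypothetical backward formula illustrating the RAII guard used in the hunks below:
// TF32 is disabled for the lifetime of `disable_tf32`, so the matmul runs at full
// float32 precision; the previous TF32 setting is restored on scope exit.
at::Tensor example_backward(const at::Tensor& grad, const at::Tensor& A) {
  at::NoTF32Guard disable_tf32;
  return at::matmul(grad, A.mH());
}
```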
- return at::cat({grad_X, grad_Y}, -1); + return gA; } } @@ -3239,7 +3482,7 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) return svd_backward(u_grad, s_grad, vh_grad, u, s, vh); }; - auto eps = at::native::_get_epsilon(c10::toValueType(self.scalar_type())); + auto eps = at::native::_get_epsilon(c10::toRealValueType(self.scalar_type())); auto singular_det_cutoff = eps * at::linalg_matrix_norm(self); if (self.dim() == 2) { @@ -3440,6 +3683,7 @@ std::tuple triangular_solve_backward( const Tensor & b, const Tensor & a, const Tensor & x, const bool upper, const bool transpose, const bool unitriangular, std::array output_mask) { + at::NoTF32Guard disable_tf32; Tensor grad_b, grad_a; if (grad_x.defined() || grad_m.defined()) { if (grad_x.defined()) { @@ -3489,6 +3733,7 @@ Tensor linalg_solve_triangular_forward_AD( const bool upper, const bool left, const bool unitriangular) { + at::NoTF32Guard disable_tf32; // The forward AD formula (for left = true) is A^{-1}(B_t - A_tX) // For the derivation see: // [Note: Forward / Backward AD solve_triangular] @@ -3506,6 +3751,7 @@ std::tuple linalg_solve_triangular_backward( const bool left, const bool unitriangular, std::array output_mask) { + at::NoTF32Guard disable_tf32; const bool A_requires_grad = output_mask[0]; const bool B_requires_grad = output_mask[1]; // [Note: Forward / Backward AD solve_triangular] @@ -3556,6 +3802,7 @@ std::tuple linalg_solve_triangular_backward( std::tuple cholesky_solve_backward( const Tensor& grad_x, const Tensor& self, const Tensor& input2, const Tensor& result, const bool upper) { + at::NoTF32Guard disable_tf32; Tensor grad_self, grad_input2; if (grad_x.defined()) { grad_self = grad_x.cholesky_solve(input2, /*upper=*/upper); @@ -3579,6 +3826,7 @@ Tensor cholesky_solve_jvp( const Tensor& dB, const bool upper ) { + at::NoTF32Guard disable_tf32; auto dK = upper ? dU.mH().matmul(U) : dU.matmul(U.mH()); auto dA = dK + dK.mH(); @@ -3649,7 +3897,7 @@ Tensor fft_r2c_backward(const Tensor& grad, IntArrayRef dim, int64_t normalizati new_grad_shape[last_dim] = last_dim_size; const auto zero_length = last_dim_size - grad.size(dim.back()); - auto complex_full_grad = zero_length > 0 ? at::zeros(new_grad_shape, grad.options()) : grad; + auto complex_full_grad = zero_length > 0 ? 
grad.new_zeros(new_grad_shape) : grad; if (zero_length > 0) { complex_full_grad.slice(last_dim, 0, half_sizes[last_dim]).copy_(grad); } @@ -4505,6 +4753,7 @@ std::tuple lu_solve_backward( const Tensor& LU_data, const Tensor& LU_pivots, const std::array& grad_input_mask) { + at::NoTF32Guard disable_tf32; const bool B_requires_grad = grad_input_mask[0]; const bool LU_data_requires_grad = grad_input_mask[1]; if (!grad.defined() || (!B_requires_grad && !LU_data_requires_grad)) { @@ -4572,6 +4821,7 @@ Tensor lu_solve_jvp( const Tensor& dB, const Tensor& LU_pivots ) { + at::NoTF32Guard disable_tf32; Tensor L, U, dL, dU; std::tie(std::ignore, L, U) = at::lu_unpack(LU_data, LU_pivots, /*unpack_data=*/true, /*unpack_pivots=*/false); dL = dLU_data.tril(-1); @@ -4596,35 +4846,50 @@ Tensor lu_solve_jvp( } Tensor lu_unpack_backward( - const variable_list& grads, - const Tensor& LU_data, - bool unpack_data + const Tensor& L_grad, + const Tensor& U_grad, + const int64_t m, + const int64_t n ) { - auto L_grad = grads[1]; - auto U_grad = grads[2]; - - auto m = LU_data.size(-2); - auto n = LU_data.size(-1); - auto k = std::min(m, n); - - TORCH_CHECK(unpack_data, "lu_unpack_backward: cannot compute gradients unless unpack_data=True"); + if (!L_grad.defined() && !U_grad.defined()) { + return {}; + } + const auto k = std::min(m, n); - auto res = at::zeros(LU_data.sizes(), LU_data.options()); + // Getters for the principal and complementary part of the matrices + const auto get_L1 = [m, k](const Tensor& L) { return m == k ? L.tril(-1) : L.narrow(-2, 0, k).tril(-1); }; + const auto get_L2 = [m, k](const Tensor& L) { return L.narrow(-2, k, m - k); }; + const auto get_U1 = [n, k](const Tensor& U) { return n == k ? U.triu() : U.narrow(-1, 0, k).triu(); }; + const auto get_U2 = [n, k](const Tensor& U) { return U.narrow(-1, k, n - k); }; - Tensor L_grad_contrib; if (L_grad.defined()) { - L_grad_contrib = L_grad.tril(); - L_grad_contrib.diagonal(0, -2, -1).fill_(0); - res.narrow(-2, 0, m).narrow(-1, 0, k).add_(L_grad_contrib); - } - - Tensor U_grad_contrib; - if (U_grad.defined()) { - U_grad_contrib = U_grad.triu(); - res.narrow(-2, 0, k).narrow(-1, 0, n).add_(U_grad_contrib); + if (U_grad.defined()) { + if (m == n) { + return L_grad.tril(-1) + U_grad.triu(); + } else { + auto A1_grad = get_L1(L_grad) + get_U1(U_grad); + auto A2_grad = m > n ? get_L2(L_grad) : get_U2(U_grad); + const auto dim = m > n ? -2 : -1; + return at::cat({std::move(A1_grad), std::move(A2_grad)}, /*dim=*/dim); + } + } else { + if (m >= n) { + return L_grad.tril(-1); + } else { + auto size = L_grad.sizes().vec(); + size.end()[-1] = n - m; + return at::cat({L_grad.tril(-1), at::zeros(size, L_grad.options())}, /*dim=*/-1); + } + } + } else { + if (n >= m) { + return U_grad.triu(); + } else { + auto size = U_grad.sizes().vec(); + size.end()[-2] = m - n; + return at::cat({U_grad.triu(), at::zeros(size, U_grad.options())}, /*dim=*/-2); + } } - - return res; } Tensor cat_jvp(at::TensorList tensors, int64_t dim) { @@ -4639,7 +4904,7 @@ Tensor cat_jvp(at::TensorList tensors, int64_t dim) { std::vector fw_grads; for (auto& t: tensors) { - fw_grads.push_back(isFwGradDefined(t)? t._fw_grad(/*level*/ 0): at::zeros_like(t)); + fw_grads.push_back(isFwGradDefined(t)? t._fw_grad(/*level*/ 0): at::_efficientzerotensor(t.sizes(), t.options())); } out_fw_grad = at::cat(fw_grads, dim); @@ -4662,7 +4927,7 @@ Tensor stack_jvp(at::TensorList tensors, int64_t dim) { std::vector fw_grads; for (auto& t: tensors) { - fw_grads.push_back(isFwGradDefined(t)? 
t._fw_grad(/*level*/ 0): at::zeros_like(t)); + fw_grads.push_back(isFwGradDefined(t)? t._fw_grad(/*level*/ 0): at::_efficientzerotensor(t.sizes(), t.options())); } out_fw_grad = at::stack(fw_grads, dim); } @@ -4799,6 +5064,9 @@ Tensor batch_norm_jvp( TORCH_INTERNAL_ASSERT( running_mean.has_value() && running_var.has_value(), "Expect running_mean and running_var to have value when train=false"); + TORCH_CHECK( + !running_mean.value()._fw_grad(/*level=*/0).defined() && !running_var.value()._fw_grad(/*level=*/0).defined(), + "batch_norm is not differentiable wrt running_mean and running_var, they cannot have forward grad defined"); mean_p = running_mean.value().view(view_size); invstd_p = (1 / at::sqrt(running_var.value() + at::Scalar(eps))).view(view_size); result_t = input_t * invstd_p; @@ -4885,7 +5153,6 @@ Tensor group_norm_jvp( Tensor group_norm_mean_jvp( const Tensor& input_t, const Tensor& mean_p, int64_t groups) { int64_t N = input_t.size(0); - int64_t C = input_t.size(1); std::array view_shape = {1, N * groups, N ? -1 : 1}; auto input_t_reshaped = input_t.view(view_shape); return input_t_reshaped.mean({2}, false).view_as(mean_p); @@ -4896,7 +5163,6 @@ Tensor group_norm_invstd_jvp( const Tensor& mean_p, const Tensor& invstd_p, int64_t groups) { int64_t N = input_p.size(0); - int64_t C = input_p.size(1); std::vector view_shape = {1, N * groups, N ? -1 : 1}; @@ -4921,8 +5187,8 @@ Tensor gather_with_keepdimed_indices(const Tensor& input, int64_t dim, const Ten return out_fw_grad; } -// Let X in \C^{m \times n}, then its pivoted LU decomposition is -// X = P L U, where P is a permutation matrix. +// Let A in \C^{m \times n}, then its pivoted LU decomposition is +// A = P L U, where P is a permutation matrix. // // Useful notation: // Let o denote the elementwise, or Hadamard, product. @@ -4934,159 +5200,193 @@ Tensor gather_with_keepdimed_indices(const Tensor& input, int64_t dim, const Ten // // Below we derive the backward algorithm for the case when m <= n. // The case m > n could be obtained using the same idea. -// Since we assume m <= n, the LU decomposition of X could be written as -// X = (X1 | X2) = P L (U1 | U2) where X1, U1 in \C^{m \times m}, X2, U2 in \C^{m, n - m} +// Since we assume m <= n, the LU decomposition of A could be written as +// A = (A1 | A2) = P L (U1 | U2) where A1, U1 in \C^{m \times m}, A2, U2 in \C^{m, n - m} // // Forward AD: // -// dX = P dL U + P L dU => [left-multiply P^T] -// (P^T dX1 | P^T dX2) = (dL U1 + L dU1 | dL U2 + L dU2) (*) +// dA = P dL U + P L dU => [left-multiply P^T] +// (P^T dA1 | P^T dA2) = (dL U1 + L dU1 | dL U2 + L dU2) (*) // From (*): -// P^T dX1 = dL U1 + L dU1 => [left-multiply by L^{-1}, right-multiply by U1^{-1}] -// L^{-1} P^T dX1 U1^{-1} = L^{-1} dL + dU1 U1^{-1} (**). +// P^T dA1 = dL U1 + L dU1 => [left-multiply by L^{-1}, right-multiply by U1^{-1}] +// L^{-1} P^T dA1 U1^{-1} = L^{-1} dL + dU1 U1^{-1} (**). // Note, L is lower-triangular, and so is its inverse, hence L^{-1} dL is lower-triangular. // Also, since the diagonal of L (all ones) is never exposed explicity (packed representation), // the diagonal of dL is zero, and hence diag(L^{-1} dL) = 0. // Assuming that U1 is full-rank, similarly, dU1 U1^{-1} is upper-triangular. // Combining these observations we conclude: // -// L^{-1} dL = (L^{-1} P^T dX1 U1^{-1}) o 1_L, -// dU1 U1^{-1} = (L^{-1} P^T dX1 U1^{-1}) o 1_U. +// L^{-1} dL = (L^{-1} P^T dA1 U1^{-1}) o 1_L, +// dU1 U1^{-1} = (L^{-1} P^T dA1 U1^{-1}) o 1_U. 
// // Hence, -// dL = L [(L^{-1} P^T dX1 U1^{-1}) o 1_L], -// dU1 = [(L^{-1} P^T dX1 U1^{-1}) o 1_U] U1. +// dL = L [(L^{-1} P^T dA1 U1^{-1}) o 1_L], +// dU1 = [(L^{-1} P^T dA1 U1^{-1}) o 1_U] U1. // As for dU2, from (*) it follows -// P^T dX2 = dL U2 + L dU2 => -// dU2 = L^{-1} (P^T dX2 - dL U2). +// P^T dA2 = dL U2 + L dU2 => +// dU2 = L^{-1} (P^T dA2 - dL U2). // // Backward AD: // // The following equality comes very handy: // Tr(A (B o C)) = Tr((A o B^T) C) (!) +// or in other words, given that X -> B o X is a pointwise operation +// its Jacobian is diagonal, so its differential is self-adjoint +// = // -// Tr(X_grad^H dX) = Tr(L_grad^H dL) + Tr(U_grad^H dU), then +// Tr(A_grad^H dA) = Tr(L_grad^H dL) + Tr(U_grad^H dU), then // -// Tr(L_grad^H dL) = Tr(L_grad^H L [(L^{-1} P^T dX1 U1^{-1}) o 1_L] = [using (!)] -// = Tr((L_grad^H L o 1_L^T) L^{-1} P^T dX1 U1^{-1}) = [using the cyclic property of Tr] -// = Tr(U1^{-1} (L_grad^H L o 1_L^T) L^{-1} P^T dX1) +// Tr(L_grad^H dL) = Tr(L_grad^H L [(L^{-1} P^T dA1 U1^{-1}) o 1_L] = [using (!)] +// = Tr((L_grad^H L o 1_L^T) L^{-1} P^T dA1 U1^{-1}) = [using the cyclic property of Tr] +// = Tr(U1^{-1} (L_grad^H L o 1_L^T) L^{-1} P^T dA1) // // Similar, using (!) and the cyclic property of the trace operator: // Tr(U_grad^H dU) = Tr(U1_grad^H dU1) + Tr(U2_grad^H dU2) -// = Tr(U1^{-1} (U1 U1_grad^H o 1_U^T) L^{-1} P^T dX1) -// + Tr(U2_grad^H L^{-1} P^T dX2) -// - Tr(U1^{-1} (U2 U2_grad^H o 1_L^T) L^{-1} P^T dX1) +// = Tr(U1^{-1} (U1 U1_grad^H o 1_U^T) L^{-1} P^T dA1) +// + Tr(U2_grad^H L^{-1} P^T dA2) +// - Tr(U1^{-1} (U2 U2_grad^H o 1_L^T) L^{-1} P^T dA1) // -// By combining the matrices to the left from dX1 and dX2 and then applying conjugate transposition, +// By combining the matrices to the left from dA1 and dA2 and then applying conjugate transposition, // we finally arrive at: // -// X1_grad = P L^{-H} [L^H L_grad o 1_L + U1_grad U1^H o 1_U - U2_grad U2^H o 1_L] U1^{-H}, -// X2_grad = P L^{-H} U2_grad -Tensor plu_backward_base( - const variable_list& grads, - const Tensor& self, +// A1_grad = P L^{-H} [L^H L_grad o 1_L + U1_grad U1^H o 1_U - U2_grad U2^H o 1_L] U1^{-H}, +// A2_grad = P L^{-H} U2_grad +Tensor linalg_lu_backward( + const Tensor& L_grad, + const Tensor& U_grad, const Tensor& P, const Tensor& L, - const Tensor& U) { - auto L_grad = grads[0]; - auto U_grad = grads[1]; + const Tensor& U, + const bool pivot) { + at::NoTF32Guard disable_tf32; + // Return early if there's nothing to do + if (!L_grad.defined() && !U_grad.defined()) { + return {}; + } - auto m = self.size(-2); - auto n = self.size(-1); + // L.shape == (..., m, k) + // U.shape == (..., k, n) + auto m = L.size(-2); + auto n = U.size(-1); auto k = std::min(m, n); - auto L_principal = L.narrow(-2, 0, k).narrow(-1, 0, k); - auto L_principal_H = L_principal.mH(); - auto L_grad_principal = L_grad.narrow(-2, 0, k).narrow(-1, 0, k); - auto U_principal = U.narrow(-2, 0, k).narrow(-1, 0, k); - auto U_principal_H = U_principal.mH(); - auto U_grad_principal = U_grad.narrow(-2, 0, k).narrow(-1, 0, k); + if (m == n) { + // A_grad = P L^{-H} [L^H L_grad o 1_L + U_grad U^H o 1_U] U^{-H}, + auto A_grad = L_grad.defined() ? L.mH().matmul(L_grad).tril(-1) : Tensor{}; + if (U_grad.defined()) { + A_grad = A_grad.defined() ? 
A_grad + U_grad.matmul(U.mH()).triu() + : U_grad.matmul(U.mH()).triu(); + } + A_grad = at::linalg_solve_triangular(U.mH(), A_grad, + /*upper=*/false, + /*left=*/false); + A_grad = at::linalg_solve_triangular(L.mH(), A_grad, + /*upper=*/true, + /*left=*/true, + /*unitriangular=*/true); + + return pivot ? P.matmul(std::move(A_grad)) : A_grad; + } else if (m < n) { + // Wide case + // A1_grad = P L^{-H} [U1_grad + (L^H L_grad o 1_L - U_grad U^H o 1_U) U1^{-H}) U^{-H}] + // A2_grad = P L^{-H} U2_grad + const auto get_U1 = [n, k] (const Tensor& U) { return n == k ? U : U.narrow(-1, 0, k); }; + const auto get_U2 = [n, k] (const Tensor& U) { return U.narrow(-1, k, n - k); }; + + auto A_grad = L_grad.defined() ? L.mH().matmul(L_grad) : Tensor{}; + if (U_grad.defined()) { + A_grad = A_grad.defined() ? A_grad - U_grad.triu().matmul(U.mH()) + : - U_grad.triu().matmul(U.mH()); + } + A_grad = at::linalg_solve_triangular(get_U1(U).mH(), A_grad.tril(-1), + /*upper=*/false, + /*left=*/false); - auto phi_L = L_principal_H.matmul(L_grad_principal).tril(-1); - auto phi_U = U_grad_principal.matmul(U_principal_H).triu(); + if (U_grad.defined()) { + A_grad = at::cat({A_grad + get_U1(U_grad).triu(), get_U2(U_grad)}, /*dim=*/-1); + } - auto phi = phi_L + phi_U; + A_grad = at::linalg_solve_triangular(L.mH(), A_grad, + /*upper=*/true, + /*left=*/true, + /*unitriangular=*/true); - Tensor self_grad; - if (m <= n) { - auto U_complement = U.narrow(-2, 0, k).narrow(-1, k, n - k); - auto U_grad_complement = U_grad.narrow(-2, 0, k).narrow(-1, k, n - k); - - auto phi_complement = U_grad_complement.matmul(U_complement.mH()).tril(-1); - - // recall the result for X1_grad and X2_grad from above. - // It can be rewritten as - // (X1_grad | X2_grad) = P L^{-H} psi, where - // psi = (psi1 | psi2) - // = ([L^H L_grad o 1_L + U1_grad U1^H o 1_U - U2_grad U2^H o 1_L] U1^{-H} | U2_grad), - // so it is filled in parts. - - // solve for psi1 to avoid the inversion of U1^H - auto psi_principal = at::linalg_solve_triangular(U_principal_H, phi - phi_complement, - /*upper=*/false, - /*left=*/false, - /*unitriangular=*/false); - auto psi = at::cat({psi_principal, U_grad_complement}, /*dim=*/-1); - - self_grad = P.matmul(at::linalg_solve_triangular(L_principal_H, psi, - /*upper=*/true, - /*left=*/true, - /*unitriangular=*/true)); - } - else { - // variables psi and phi carry the same meaning as in the case (m <= n), - // albeit they are differently defined. - auto L_complement = L.narrow(-2, k, m - k).narrow(-1, 0, k); - auto L_grad_complement = L_grad.narrow(-2, k, m - k).narrow(-1, 0, k); + if (!U_grad.defined()) { + A_grad = at::cat({A_grad, at::zeros_like(get_U2(U))}, /*dim=*/-1); + } + if (pivot) { + A_grad = P.matmul(A_grad); + } + return A_grad; + } else { + // Tall case + // A1_grad = P [L1_grad + L^{-H} (U_grad U^H o 1_U - L^H L_grad o 1_L)]U^{-H} + // A2_grad = P L2_grad U^{-H} - auto phi_complement = L_complement.mH().matmul(L_grad_complement).triu(); + const auto get_L1 = [m, k] (const Tensor& L) { return m == k ? L : L.narrow(-2, 0, k); }; + const auto get_L2 = [m, k] (const Tensor& L) { return L.narrow(-2, k, m - k); }; + auto A_grad = U_grad.defined() ? U_grad.matmul(U.mH()) : Tensor{}; + if (L_grad.defined()) { + A_grad = A_grad.defined() ? 
A_grad - L.mH().matmul(L_grad.tril(-1)) + : - L.mH().matmul(L_grad.tril(-1)); + } + A_grad = at::linalg_solve_triangular(get_L1(L).mH(), A_grad.triu(), + /*upper=*/true, + /*left=*/true, + /*unitriangular=*/true); - auto psi_principal = at::linalg_solve_triangular(L_principal_H, phi - phi_complement, - /*upper=*/true, - /*left=*/true, - /*unitriangular=*/true); - auto psi = at::cat({psi_principal, L_grad_complement}, -2); + if (L_grad.defined()) { + A_grad = at::cat({A_grad + get_L1(L_grad).tril(-1), get_L2(L_grad)}, /*dim=*/-2); + } - self_grad = at::linalg_solve_triangular(U_principal_H, P.matmul(psi), - /*upper=*/false, - /*left=*/false, - /*unitriangular=*/false); - } + A_grad = at::linalg_solve_triangular(U.mH(), A_grad, + /*upper=*/false, + /*left=*/false); - return self_grad; + if (!L_grad.defined()) { + A_grad = at::cat({A_grad, at::zeros_like(get_L2(L))}, /*dim=*/-2); + } + if (pivot) { + A_grad = P.matmul(A_grad); + } + return A_grad; + } } Tensor lu_factor_ex_backward( const Tensor& grad, - const Tensor& self, const Tensor& LU, - const Tensor& pivs) { + const Tensor& pivs, + const bool pivot) { Tensor P, L, U; - std::tie(P, L, U) = at::lu_unpack(LU, pivs); - // Note that packed LU could be represented as - // LU = L + U - I, hence - // L_grad = LU_grad, - // U_grad = LU_grad. - return plu_backward_base({/*L_grad=*/grad, /*U_grad=*/grad}, self, P, L, U); + std::tie(P, L, U) = at::lu_unpack(LU, pivs, /*unpack_data=*/true, /*unpack_pivots*/pivot); + + // L.shape == (..., m, k) + // U.shape == (..., k, n) + const auto m = LU.size(-2); + const auto n = LU.size(-1); + const auto k = std::min(m, n); + const auto L_grad = grad.narrow(-1, 0, k); + const auto U_grad = grad.narrow(-2, 0, k); + return linalg_lu_backward(/*L_grad=*/L_grad, /*U_grad=*/U_grad, P, L, U, pivot); } -Tensor lu_factor_ex_jvp( +// This function is based on the forward AD derivations outlined +// in the description to the linalg_lu_backward function. +std::tuple linalg_lu_jvp( const Tensor& dA, - const Tensor& LU, - const Tensor& pivs -) { - // This function is based on the forward AD derivations outlined - // in the description to the plu_backward_base function. - - Tensor P, L, U; - std::tie(P, L, U) = at::lu_unpack(LU, pivs); + const Tensor& P, + const Tensor& L, + const Tensor& U, + const bool pivot) { + at::NoTF32Guard disable_tf32; - auto m = LU.size(-2); - auto n = LU.size(-1); + auto m = dA.size(-2); + auto n = dA.size(-1); auto k = std::min(m, n); - auto PdA = P.mT().matmul(dA); + auto PdA = pivot ? P.mT().matmul(dA) : dA; // similar to the backward implementation, we also consider block structures such as: // for a matrix A of size m x n we decompose it as @@ -5096,40 +5396,79 @@ Tensor lu_factor_ex_jvp( auto L1 = L.narrow(-2, 0, k).narrow(-1, 0, k); auto U1 = U.narrow(-2, 0, k).narrow(-1, 0, k); - // dK = L1^{-1} PdA1 + // We form using two triangular_solve the matrix, the second one in place + // dK = L1^{-1} PdA1 U2^{-1} auto dK = at::linalg_solve_triangular(L1, PdA1, /*upper=*/false, /*left=*/true, /*unitriangular*/true); - // dK <- dK U1^{-1} + + // TODO We should be able to do this in-place. At the moment it raises: + // RuntimeError: linalg_solve_triangular(): functions with out=... + // arguments don't support automatic differentiation, but one of the arguments requires grad. 
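The square-case computation in linalg_lu_jvp above can be summarized as follows (a restatement of the derivation in the long comment earlier in this file; the wide and tall cases add the dU2/dL2 blocks handled just below):

```latex
% Square case of the LU JVP: A = P L U with unit-diagonal L
dA = P\,(dL\,U + L\,dU)
\;\Longrightarrow\;
K := L^{-1} P^{T} dA\, U^{-1} = L^{-1} dL + dU\, U^{-1},
% L^{-1} dL is strictly lower triangular (dL has zero diagonal) and dU U^{-1}
% is upper triangular, so the two summands are recovered by masking:
dL = L \cdot \operatorname{tril}_{-1}(K), \qquad dU = \operatorname{triu}(K)\cdot U .
```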
+ + // at::linalg_solve_triangular_out(dK, U1, dK, /*upper=*/true, /*left=*/false); dK = at::linalg_solve_triangular(U1, dK, /*upper=*/true, /*left=*/false); auto dL1 = L1.matmul(dK.tril(-1)); auto dU1 = dK.triu().matmul(U1); - // since LU = L + U - I, we have that dLU = dL + dU - // if LU is of size m x n, we always have - // dLU1 = dL1 + dU1, where the block indexing follows the rules - // outlined above. if (m == n) { - return dL1 + dU1; + return std::make_tuple(std::move(dL1), std::move(dU1)); + } else if (m < n) { + // we only need to update dU2 defined as + // dU2 := L1^{-1} PdA2 - dK.tril(-1) U2) + const auto PdA2 = PdA.narrow(-1, k, n - k); + const auto U2 = U.narrow(-1, k, n - k); + auto dU2 = at::linalg_solve_triangular(L1, PdA2, /*upper=*/false, /*left=*/true, /*unitriangular*/true) - dK.tril(-1).matmul(U2); + return std::make_tuple(std::move(dL1), at::cat({dU1, dU2}, /*dim=*/-1)); + } else { + // we only need to update dL2 defined as + // dL2 := PdA2 U^{-1} - L2 dK.triu() + const auto PdA2 = PdA.narrow(-2, k, m - k); + const auto L2 = L.narrow(-2, k, m - k); + auto dL2 = at::linalg_solve_triangular(U1, PdA2, /*upper=*/true, /*left=*/false) - L2.matmul(dK.triu()); + return std::make_tuple(at::cat({dL1, dL2}, /*dim=*/-2), std::move(dU1)); } - else { - auto dLU1 = dL1 + dU1; - - if (m < n) { - // we only need to update dLU2 defined as - // dLU2 := L1^{-1} PdA2 - dK.tril(-1) U2 - auto PdA2 = PdA.narrow(-1, k, n - k); - auto U2 = U.narrow(-1, k, n - k); - auto dLU2 = at::linalg_solve_triangular(L1, PdA2, /*upper=*/false, /*left=*/true, /*unitriangular*/true) - dK.tril(-1).matmul(U2); - return at::cat({dLU1, dLU2}, /*dim=*/-1); - } - else { - // we only need to update dLU2 defined as - // dLU2 := PdA2 U1^{-1} - L2 dK.triu() - auto PdA2 = PdA.narrow(-2, k, m - k); - auto L2 = L.narrow(-2, k, m - k); - auto dLU2 = at::linalg_solve_triangular(U1, PdA2, /*upper=*/true, /*left=*/false) - L2.matmul(dK.triu()); - return at::cat({dLU1, dLU2}, /*dim=*/-2); - } +} + +Tensor lu_factor_ex_jvp( + const Tensor& dA, + const Tensor& LU, + const Tensor& pivs, + const bool pivot +) { + Tensor dL, dU; + { + Tensor P, L, U; + std::tie(P, L, U) = at::lu_unpack(LU, pivs, /*unpack_data=*/true, /*unpack_pivots=*/pivot); + std::tie(dL, dU) = linalg_lu_jvp(dA, P, L, U, pivot); + } + + auto m = dA.size(-2); + auto n = dA.size(-1); + if (m >= n) { + dL.narrow(-2, 0, n).add_(dU); + return dL; + } else { + dU.narrow(-1, 0, m).add_(dL); + return dU; + } +} + +Tensor logsumexp_jvp(const Tensor& self_p, const Tensor& self_t, IntArrayRef dim, bool keepdim) { + // NB: for simplicitly, we recompute some values that can be reused from forward + auto self_p_exp = (self_p - at::amax(self_p, dim, true)).exp(); // Use the exp-normalize trick + auto sumexp_p = self_p_exp.sum(dim, keepdim); + + // NB: it's OK for logsumexp_jvp to be reused for formulas like softmax/log_softmax + // that only have one differentiable input, because that means self_t are never zerotensors + TORCH_INTERNAL_ASSERT(!self_t._is_zerotensor()) + if (areAnyTensorSubclassLike({self_p, self_t})) { + auto result = (self_p_exp * self_t).sum(dim, keepdim); + result /= sumexp_p; + return result; + } else { + self_p_exp *= self_t; + auto sumexp_t = self_p_exp.sum(dim, keepdim); + return sumexp_t /= sumexp_p; } } @@ -5157,41 +5496,157 @@ std::tuple _cudnn_convolution_backward( return result; } -Tensor scatter_reduce_backward(const Tensor & grad, - const Tensor& input, - int dim, - const Tensor & index, - c10::string_view reduce, - const Tensor & result){ - 
Tensor grad_input; - +std::tuple scatter_reduce_backward( + const Tensor& grad, + const Tensor& self, + int dim, + const Tensor& index, + const Tensor& src, + c10::string_view reduce, + bool include_self, + const Tensor& result) { + Tensor grad_self, grad_src; + + // FIXME: complex gradients not handled correctly + // For now this is ok as scatter_reduce isn't added to the whitelist + // in tools/autograd/gen_variable_type.py - // TODO: gather doesn't support broadcasting of input and index - // currently this works because scatter_reduce doesn't support broadcasting yet but - // this needs to be fixed when scatter_reduce is upgraded to support broadcasting - // by broadcasting index here too. + if (!grad.defined()) { + return std::make_tuple(grad_self, grad_src); + } if (reduce == "sum") { - grad_input = grad.gather(dim, index); + grad_self = grad; + grad_src = grad.gather(dim, index); } else if (reduce == "prod") { - grad_input = (grad * result).gather(dim, index) / input; - // handle nans in above computation when input = 0, we know result = 0 (0 / 0 -> nan) - // so just replace with 0 - grad_input.masked_fill_(input == 0, 0); + // Explicitly compute exclusive prod for elements in self/src that are 0 + Tensor masked_self = self.masked_fill(self == 0, 1); + Tensor masked_self_result = masked_self.scatter_reduce(dim, index, src, reduce, include_self); + grad_self = grad * masked_self_result / masked_self; + Tensor src_zero = src == 0; + Tensor src_num_zeros = zeros_like(self).scatter_add(dim, index, src_zero.to(self.dtype())).gather(dim, index); + Tensor src_single_zero = bitwise_and(src_zero, src_num_zeros == 1); + // For src positions with src_single_zero, grad * result.gather(dim,index) / src.masked_fill(src_zero, 1) + // would incorrectly propagate zeros as the gradient + Tensor masked_src = src.masked_fill(src_single_zero, 1); + Tensor masked_src_result = self.scatter_reduce(dim, index, masked_src, reduce, include_self); + Tensor grad_src1 = where(src_single_zero, + (grad * masked_src_result).gather(dim, index), + (grad * result).gather(dim, index) / src.masked_fill(src_zero, 1)); + if ((src_num_zeros > 1).any().item()) { + auto node = std::make_shared( + "scatter_reduce(): Double backward is unsupported for src when >1 zeros in src are scattered to the same position in self", + /* num inputs */ 1); + auto result = node->apply({ grad_src1 }); + grad_src = result[0]; + } else { + grad_src = grad_src1; + } } else if (reduce == "mean") { - Tensor N = zeros_like(grad); - N.scatter_add_(dim, index, ones_like(input)); - Tensor N_input = N.gather(dim, index); - grad_input = grad.gather(dim, index) / N_input; - grad_input.masked_fill_(N_input == 0, 0); + Tensor N = include_self ? 
ones_like(grad) : zeros_like(grad); + N = N.scatter_add(dim, index, ones_like(src)); + N.masked_fill_(N == 0, 1); + grad_self = grad / N; + Tensor N_src = N.gather(dim, index); + grad_src = grad.gather(dim, index) / N_src; } else if (reduce == "amax" || reduce == "amin") { + // Evenly distribute gradient when there are multiple max/mins Tensor value = result.gather(dim, index); - grad_input = (input == value) * grad.gather(dim, index); + Tensor self_is_result = (self == result).to(self.scalar_type()); + Tensor src_is_result = (src == value).to(self.scalar_type()); + Tensor N_to_distribute = self_is_result.scatter_add(dim, index, src_is_result); + Tensor grad_distributed = grad / N_to_distribute; + grad_self = (self == result) * grad_distributed; + grad_src = (src == value) * grad_distributed.gather(dim, index); } else { AT_ERROR("Expected 'reduce' to be one of 'sum', 'prod', 'mean', 'amax', 'amin' but got ", reduce, "."); } - return grad_input; + if (!include_self) { + grad_self = grad_self.scatter(dim, index, 0); + } + + return std::make_tuple(grad_self, grad_src); + +} + +Tensor _to_copy_backward(const Tensor &grad_, const c10::TensorOptions &self_options) { + // Handle R->C copies without raising a warning + const auto self_type = self_options.dtype().toScalarType(); + auto grad = c10::MaybeOwned::borrowed(grad_); + if (!c10::isComplexType(self_type) && grad->is_complex()) { + grad = c10::MaybeOwned::owned(at::real(grad_)); + } + + return grad->to(self_options, /*non_blocking=*/false, /*copy=*/false); +} + +std::tuple index_reduce_backward( + const Tensor& grad, + const Tensor& self, + int dim, + const Tensor& index, + const Tensor& source, + c10::string_view reduce, + bool include_self, + const Tensor& result) { + Tensor grad_self, grad_src; + + // FIXME: index_add's backward formula has a special case for source.dim == 0 + // but this case seems to throw the error "IndexError: dimension specified as 0 but tensor has no dimensions" + // look into whether this case is reachable and should be covered here + + if (!grad.defined()) { + return std::make_tuple(grad_self, grad_src); + } + + if (reduce == "prod") { + Tensor masked_self = self.masked_fill(self == 0, 1); + Tensor masked_self_result = masked_self.index_reduce(dim, index, source, reduce, include_self); + grad_self = grad * masked_self_result / masked_self; + Tensor src_zero = source == 0; + Tensor src_num_zeros = zeros_like(self).index_add(dim, index, src_zero.to(self.dtype())).index_select(dim, index); + Tensor src_single_zero = bitwise_and(src_zero, src_num_zeros == 1); + // For src positions with src_single_zero, (grad * result).index_select(dim,index) / source.masked_fill(src_zero, 1) + // would incorrectly propagate zeros as the gradient + Tensor masked_src = source.masked_fill(src_single_zero, 1); + Tensor masked_src_result = self.index_reduce(dim, index, masked_src, reduce, include_self); + Tensor grad_src1 = where(src_single_zero, + (grad * masked_src_result).index_select(dim, index), + (grad * result).index_select(dim, index) / source.masked_fill(src_zero, 1)); + if ((src_num_zeros > 1).any().item()) { + auto node = std::make_shared( + "index_reduce(): Double backward is unsupported for source when >1 zeros in source are scattered to the same position in self", + /* num inputs */ 1); + auto result = node->apply({ grad_src1 }); + grad_src = result[0]; + } else { + grad_src = grad_src1; + } + } else if (reduce == "mean") { + Tensor N = include_self ? 
ones_like(grad) : zeros_like(grad); + N = N.index_add(dim, index, ones_like(source)); + N.masked_fill_(N == 0, 1); + grad_self = grad / N; + Tensor N_src = N.index_select(dim, index); + grad_src = grad.index_select(dim, index) / N_src; + } else if (reduce == "amax" || reduce == "amin") { + Tensor value = result.index_select(dim, index); + Tensor self_is_result = (self == result).to(self.scalar_type()); + Tensor source_is_result = (source == value).to(self.scalar_type()); + Tensor N_to_distribute = self_is_result.index_add(dim, index, source_is_result); + Tensor grad_distributed = grad / N_to_distribute; + grad_self = self_is_result * grad_distributed; + grad_src = source_is_result * grad_distributed.index_select(dim, index); + } else { + AT_ERROR("Expected 'reduce' to be one of 'prod', 'amax', 'amin' or 'mean' but got ", reduce, "."); + } + + if (!include_self) { + grad_self = grad_self.index_fill(dim, index, 0); + } + + return std::make_tuple(grad_self, grad_src); } diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 739b44b4d62f..3f8f162ad5b1 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -49,7 +49,16 @@ Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim Tensor scale_grad_by_count(const Tensor &grad, const Tensor &mask, IntArrayRef dims); at::Tensor norm_backward(const at::Tensor & grad, const at::Tensor & self, const optional & p_, const at::Tensor & norm); at::Tensor norm_backward(at::Tensor grad, const at::Tensor & self, const optional & p_, at::Tensor norm, at::IntArrayRef dim, bool keepdim); -at::Tensor linalg_vector_norm_backward(at::Tensor grad, const at::Tensor & self, const at::Scalar & ord, at::Tensor norm, const c10::optional & opt_dim, bool keepdim); +Tensor norm_jvp( + const Tensor& self_p, const Tensor& self_t, + const optional & p_, + Tensor norm, + IntArrayRef dim, + bool keepdim +); +Tensor norm_jvp(const Tensor& grad, const Tensor& self, const optional & p_, Tensor norm); +Tensor linalg_vector_norm_jvp(const Tensor& self_p, const Tensor& self_t, const Scalar& scalar_ord, Tensor norm, const at::OptionalIntArrayRef& opt_dim, bool keepdim); +at::Tensor linalg_vector_norm_backward(at::Tensor grad, const at::Tensor & self, const at::Scalar & ord, at::Tensor norm, const at::OptionalIntArrayRef & opt_dim, bool keepdim); at::Tensor pow_backward(at::Tensor grad, const at::Tensor & self, const at::Scalar & exponent_); at::Tensor pow_backward_self(at::Tensor grad, const at::Tensor & self, const at::Tensor & exponent); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Tensor& self, const at::Tensor& exponent, at::Tensor result); @@ -77,6 +86,7 @@ at::Tensor solve_backward_self(const at::Tensor & grad, const at::Tensor & self, at::Tensor solve_backward_A(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & A, const at::Tensor & solution); at::Tensor cumsum_backward(const at::Tensor & grad, int64_t dim); at::Tensor logsumexp_backward(at::Tensor grad, const at::Tensor & self, at::Tensor result, at::IntArrayRef dim, bool keepdim); +at::Tensor logsumexp_jvp(const at::Tensor& self_p, const at::Tensor& self_t, IntArrayRef dim, bool keepdim); at::Tensor logcumsumexp_backward(at::Tensor grad, const at::Tensor & self, at::Tensor result, int64_t dim); at::Tensor unbind_backward(const variable_list& grads, int64_t dim); at::Tensor unsqueeze_to(const at::Tensor & self, at::IntArrayRef sizes); @@ -85,10 +95,15 @@ std::vector 
cat_tensors_backward(const at::Tensor & grad, const std: at::Tensor clamp_backward(const at::Tensor & grad, const at::Tensor &self, const optional& min, const optional& max); at::Tensor clamp_backward(const at::Tensor & grad, const at::Tensor &self, const at::Tensor& min, const at::Tensor& max); std::tuple clamp_backward_min_max(const at::Tensor& grad, const at::Tensor& self, const at::Tensor& min, const at::Tensor& max, const std::array&); +at::Tensor clamp_jvp( + const Tensor& self_p, const Tensor& self_t, + const Tensor& min_p, const Tensor& min_t, + const Tensor& max_p, const Tensor& max_t +); at::IntArrayRef strides_or_error(const Tensor & input, c10::string_view const & input_name); -at::Tensor mm_mat1_backward(const Tensor & grad, const Tensor & mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, const Scalar & alpha); -at::Tensor mm_mat2_backward(const at::Tensor & grad, const at::Tensor & mat1, at::IntArrayRef sizes, at::IntArrayRef strides, const at::Scalar & alpha); -at::Tensor _sparse_addmm_sparse_backward(const at::Tensor& grad, const at::Tensor& sparse_, const at::Tensor& dense, const at::Scalar& alpha); +at::Tensor mm_mat1_backward(const Tensor & grad, const Tensor & mat2, at::IntArrayRef mat1_sizes, at::IntArrayRef mat1_strides, c10::Layout mat1_layout, const Scalar & alpha); +at::Tensor mm_mat2_backward(const at::Tensor & grad, const at::Tensor & mat1, at::IntArrayRef sizes, at::IntArrayRef strides, c10::Layout layout, const at::Scalar & alpha); +at::Tensor mm_mat1_sparse_backward(const at::Tensor& grad, const at::Tensor& mat1, const at::Tensor& mat2, const at::Scalar& alpha); at::Tensor sparse_sparse_matmul_backward(const at::Tensor& grad, const at::Tensor& mat1, const at::Tensor& mat2,int64_t grad_order); at::Tensor renorm_backward(const at::Tensor & grad, const at::Tensor & self, const at::Scalar& p, int64_t dim, const at::Scalar& maxnorm); at::Tensor repeat_backward(at::Tensor grad, at::IntArrayRef repeats, at::IntArrayRef input_shape); @@ -97,16 +112,17 @@ at::Tensor infinitely_differentiable_native_dropout_backward(const at::Tensor& g at::Tensor native_dropout_double_backward(const at::Tensor& ggI, const at::Tensor& grad, const at::Tensor& mask, double scale); at::Tensor evenly_distribute_backward(at::Tensor grad, const at::Tensor & input, const at::Tensor & value); at::Tensor sgn_backward(Tensor result, Tensor grad, Tensor self); -at::Tensor var_backward(at::Tensor grad, const at::Tensor& self, c10::optional dim, c10::optional correction, bool keepdim); -at::Tensor var_jvp(const at::Tensor& self_t, const at::Tensor& self_p, const at::Tensor& result, c10::optional dim_opt, c10::optional correction_opt, bool keepdim); -at::Tensor std_backward(const at::Tensor& result, const at::Tensor& grad, const at::Tensor& self, c10::optional dim, c10::optional correction, bool keepdim); +at::Tensor var_backward(at::Tensor grad, const at::Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim); +at::Tensor var_jvp(const at::Tensor& self_t, const at::Tensor& self_p, const at::Tensor& result, at::OptionalIntArrayRef dim_opt, c10::optional correction_opt, bool keepdim); +at::Tensor std_backward(const at::Tensor& result, const at::Tensor& grad, const at::Tensor& self, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim); at::Tensor mean_backward(at::Tensor grad, const at::IntArrayRef sizes, at::IntArrayRef dim, bool keepdim); at::Tensor mean_backward(at::Tensor grad, const at::IntArrayRef sizes, int64_t numel); -at::Tensor 
var_std_mean_backward(const variable_list& grads, const at::Tensor& self, const at::Tensor& r1, const at::Tensor& r2, c10::optional dim, c10::optional correction, bool keepdim, bool is_std); +at::Tensor var_std_mean_backward(const variable_list& grads, const at::Tensor& self, const at::Tensor& r1, const at::Tensor& r2, at::OptionalIntArrayRef dim, c10::optional correction, bool keepdim, bool is_std); at::Tensor masked_scatter_backward(const at::Tensor & grad, const at::Tensor & mask, at::IntArrayRef sizes); -at::Tensor cholesky_backward(at::Tensor grad, bool upper, at::Tensor L); +at::Tensor cholesky_backward(const at::Tensor& grad, bool upper, const at::Tensor& L); at::Tensor cholesky_jvp(const at::Tensor& input_tangent, const at::Tensor& L, bool upper); at::Tensor cholesky_inverse_backward(at::Tensor grad, at::Tensor L, bool upper, at::Tensor inverse); +at::Tensor cholesky_inverse_jvp(const at::Tensor& F, const at::Tensor& dF, const at::Tensor& X, bool upper); Tensor pinv_jvp( const Tensor& A, const Tensor& pinvA, @@ -133,6 +149,14 @@ Tensor binary_cross_entropy_target_backward( const Tensor& target, const c10::optional& weight, int64_t reduction); +Tensor binary_cross_entropy_double_backward_target( + const Tensor& grad, + const Tensor& grad_output, + const Tensor& self, + const Tensor& target, + const c10::optional& weight, + int64_t reduction +); at::Tensor binary_cross_entropy_with_logits_target_backward(const at::Tensor& grad_output, const at::Tensor& self, const at::Tensor& target, const c10::optional& weight, const c10::optional& pos_weight, int64_t reduction); at::Tensor binary_cross_entropy_with_logits_jvp(const Tensor& input_t, const Tensor& target_t, const Tensor& input_p, const Tensor& target_p, const c10::optional& weight_opt, const c10::optional& pos_weight_opt, int64_t reduction); at::Tensor log_sigmoid_double_backward(const at::Tensor & grad, const at::Tensor & input); @@ -226,23 +250,9 @@ std::tuple linalg_solve_triangular_backward( std::tuple _trilinear_backward(const Tensor& grad_out, const Tensor& i1, const Tensor& i2, const Tensor& i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, IntArrayRef sumdim, std::array grad_mask); -std::tuple linalg_qr_jvp( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -); -Tensor linalg_qr_jvp_Q( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -); -Tensor linalg_qr_jvp_R( - const Tensor& dA, - const Tensor& Q, - const Tensor& R -); -Tensor linalg_qr_backward(const std::vector &grads, const Tensor& self, - c10::string_view mode, const Tensor& Q, const Tensor& R); +std::tuple linalg_qr_jvp(const Tensor& dA, const Tensor& Q, const Tensor& R, + const c10::string_view mode); +Tensor linalg_qr_backward(const Tensor& gQ, const Tensor& gR, const Tensor& Q, const Tensor& R, const c10::string_view mode); Tensor eig_backward(const std::vector &grads, const Tensor& self, bool eigenvectors, const Tensor& lambda, const Tensor& v); Tensor linalg_matrix_exp_differential(const Tensor& self, const Tensor& grad, bool adjoint); @@ -297,12 +307,18 @@ infinitely_differentiable_native_group_norm_backward( int64_t group, double eps, std::array grad_input_mask); +Tensor prelu_jvp(const Tensor& x, const Tensor& dx, const Tensor& w, const Tensor& dw); std::tuple prelu_double_backward( const Tensor & grad_grad_input, const Tensor & grad_grad_weight, const Tensor & grad_out, const Tensor & input_, const Tensor & weight_); +Tensor gelu_double_backward( + const Tensor & ggI, + const Tensor & gO, + const Tensor & input, + 
c10::string_view approximate); Tensor as_strided_backward(Tensor grad, TensorGeometry input_geometry, IntArrayRef sizes, IntArrayRef strides, optional storage_offset_); std::tuple atan2_backward(const Tensor& grad, const Tensor& self, const Tensor& other, std::array output_mask); std::tuple layer_norm_double_backward( @@ -351,9 +367,10 @@ Tensor lu_solve_jvp( const Tensor& LU_pivots ); Tensor lu_unpack_backward( - const variable_list& grads, - const Tensor& LU_data, - bool unpack_data + const Tensor& L_grad, + const Tensor& U_grad, + const int64_t m, + const int64_t n ); Tensor _det_lu_based_helper_backward( @@ -373,23 +390,32 @@ std::tuple linalg_lstsq_backward( const std::array& grad_input_mask ); -Tensor lu_backward_base( - const variable_list& grads, - const Tensor& self, +Tensor linalg_lu_backward( + const Tensor& L_grad, + const Tensor& U_grad, const Tensor& P, const Tensor& L, - const Tensor& U -); + const Tensor& U, + const bool pivot); + +std::tuple linalg_lu_jvp( + const Tensor& dA, + const Tensor& P, + const Tensor& L, + const Tensor& U, + const bool pivot); + Tensor lu_factor_ex_backward( const Tensor& grad, - const Tensor& self, const Tensor& LU, - const Tensor& pivs + const Tensor& pivs, + const bool pivot ); Tensor lu_factor_ex_jvp( const Tensor& dX, const Tensor& LU, - const Tensor& pivs + const Tensor& pivs, + const bool pivot ); Tensor batch_norm_jvp( @@ -460,15 +486,29 @@ std::tuple _cudnn_convolution_backward( at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, bool transposed, int64_t groups, ::std::array output_mask); -Tensor scatter_reduce_backward( +std::tuple scatter_reduce_backward( const Tensor& grad, - const Tensor& input, + const Tensor& self, int dim, const Tensor& index, + const Tensor& src, c10::string_view reduce, + bool include_self, const Tensor& result ); +Tensor _to_copy_backward(const Tensor &grad, const c10::TensorOptions &self_options); + +std::tuple index_reduce_backward( + const Tensor& grad, + const Tensor& self, + int dim, + const Tensor& index, + const Tensor& source, + c10::string_view reduce, + bool include_self, + const Tensor& result +); } // namespace details } // namespace generated diff --git a/torch/csrc/autograd/TraceTypeManual.cpp b/torch/csrc/autograd/TraceTypeManual.cpp index 031b50215d8c..a96fa42abd17 100644 --- a/torch/csrc/autograd/TraceTypeManual.cpp +++ b/torch/csrc/autograd/TraceTypeManual.cpp @@ -283,7 +283,9 @@ void general_trace_function( AT_ASSERT(iter->isObject()); tracer::addOutput(node, iter->toObject()); } else { - throw std::runtime_error("unsupported output type: " + type->str()); + throw std::runtime_error( + "unsupported output type: " + type->str() + + ", from operator: " + toString(op.operator_name())); } } } diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp index b3bb488c9641..bf8cccf62ea7 100644 --- a/torch/csrc/autograd/autograd_meta.cpp +++ b/torch/csrc/autograd/autograd_meta.cpp @@ -90,7 +90,7 @@ namespace { if (base.sizes()[i] != other.sizes()[i]) { return false; } - if (base.strides()[i] != other.strides()[i] && base.sizes()[i] != 1) { + if (base.strides()[i] != other.strides()[i] && base.sizes()[i] != 1 && base.sizes()[i] != 0) { return false; } } diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index 4acebe1266e7..553e8aa67470 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -112,7 +112,7 @@ void _process_forward_mode_AD(const 
variable_list &inputs, const auto num_forward_grads = forward_grads.size(); // contrary to backward mode, we don't allow returning too many gradients TORCH_CHECK(num_forward_grads == num_outputs, "Function's jvp returned " - "an invalid number of of forward gradients (expected ", num_outputs, + "an invalid number of forward gradients (expected ", num_outputs, " but got ", num_forward_grads, ")"); for (const auto i : c10::irange(num_outputs)) { diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 22f138e2a14f..401f679d3d89 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -50,12 +50,20 @@ static void forked_autograd_child() { in_bad_autograd_fork = true; } // Should be called before unsafe for forks (thread pool) calls static void track_bad_autograd_forks() { -#if !defined(WIN32) && !defined(__XROS__) +#if !defined(WIN32) static std::once_flag flag; std::call_once( flag, [&] { pthread_atfork(nullptr, nullptr, forked_autograd_child); }); #endif } + +inline bool should_run_in_cpu_ready_queue(c10::DeviceType device) { + if (device == c10::kCPU || device == c10::kMeta || device == c10::kLazy) { + return true; + } else { + return false; + } +} } // Threads spawned by the engine are assigned a 'worker_device' specifying @@ -92,9 +100,10 @@ C10_DEFINE_TLS_static(std::shared_ptr, tls_current_graph_task); // Engine::init_local_ready_queue() call in each corresponding thread before execution. // // The CUDA, XLA threads are shared among all invocations of backwards via -// device_ready_queues_, while CPU threads are dedicated to processing CPU work for -// the backward they invoked. So any given graph task maintains its own cpu_ready_queue_ -// where you should send work for it to be done +// device_ready_queues_, while the caller thread is dedicated to processing work for +// devices returning true in should_run_in_cpu_ready_queue (most notably the CPU device). +// So any given graph task maintains its own cpu_ready_queue_ where you should send work +// for it to be done. // // For reentrant backward calls, if we spawn new thread from the current thread // because we reached the maximum depth, the new thread will just reuse the same @@ -380,6 +389,11 @@ auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { // backwards, user thread), this function is expected to exit once that // graph_task complete. +#ifdef USE_ROCM + // Keep track of backward pass for rocblas. + at::ROCmBackwardPassGuard in_backward; +#endif + // local_ready_queue should already been initialized when we get into thread_main TORCH_INTERNAL_ASSERT(local_ready_queue != nullptr); while (graph_task == nullptr || !graph_task->future_result_->completed()) { @@ -424,7 +438,7 @@ auto Engine::thread_main(const std::shared_ptr& graph_task) -> void { c10::str( "autograd::engine::evaluate_function: ", task.fn_.get()->name()), - std::vector()); + c10::ArrayRef()); evaluate_function( local_graph_task, task.fn_.get(), @@ -706,7 +720,8 @@ void validate_outputs( // In future, there will be an oppportunity to support more combinations of layouts if they are composable // (example., operations like addition etc., are well defined between tensors of different layouts.), // as well as all parts of autograd like AccumulateGrad correctly handle this. 
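The at::ROCmBackwardPassGuard added to Engine::thread_main above is an RAII wrapper around a thread-local flag, so code much deeper in the call stack (the rocblas dispatch) can ask whether it is running inside a backward pass. A minimal, self-contained sketch of that pattern, using hypothetical names (BackwardPassGuard, is_backward_pass) that are not part of the real API:

#include <cassert>

namespace sketch {
// Thread-local flag set for the lifetime of the guard.
thread_local bool in_backward_pass = false;

struct BackwardPassGuard {
  bool prev_;
  BackwardPassGuard() : prev_(in_backward_pass) { in_backward_pass = true; }
  ~BackwardPassGuard() { in_backward_pass = prev_; }
};

bool is_backward_pass() { return in_backward_pass; }
}  // namespace sketch

int main() {
  assert(!sketch::is_backward_pass());
  {
    sketch::BackwardPassGuard guard;     // analogous to entering thread_main
    assert(sketch::is_backward_pass());  // visible to anything called from here
  }
  assert(!sketch::is_backward_pass());   // restored on scope exit
}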
- if (!grad.is_sparse()) { + // We allow grad to be Strided when metadata is SparseCsr + if (!grad.is_sparse() && !(grad.layout() == at::kStrided && metadata.layout() == at::kSparseCsr)) { std::stringstream ss; ss << "invalid gradient at index " << i << " - expected layout "; ss << metadata.layout() << " but got " << grad.layout(); @@ -1044,7 +1059,6 @@ auto Engine::execute(const edge_list& roots, } void Engine::initialize_device_threads_pool() { - track_bad_autograd_forks(); TORCH_CHECK(!in_bad_autograd_fork, "Unable to handle autograd's threading in combination with fork-based multiprocessing. " "See https://github.com/pytorch/pytorch/wiki/Autograd-and-Fork"); @@ -1167,23 +1181,14 @@ void Engine::init_local_ready_queue(std::shared_ptr ready_queue) { } } -size_t Engine::ready_queue_size(const std::shared_ptr& graph_task, at::Device device) { - if (device_ready_queues_.empty()) { - // The vector device_ready_queues_ is initialized in start_device_threads, but this method - // can be called before start_device_threads. Adding this check to avoid index - // out of bound error. - return 0; - } - return ready_queue(graph_task->cpu_ready_queue_, device)->size(); -} - // CPU ready queue is per GraphTask, but CUDA device ready queues are shared across all graph tasks auto Engine::ready_queue(std::shared_ptr cpu_ready_queue, at::Device device) -> std::shared_ptr{ - if (device.type() == at::kCPU || device.type() == at::DeviceType::Meta) { + if (should_run_in_cpu_ready_queue(device.type())) { // return the cpu ready queue passed in TORCH_INTERNAL_ASSERT(cpu_ready_queue); return cpu_ready_queue; } else { + TORCH_INTERNAL_ASSERT(0 <= device.index() && device.index() < static_cast(device_ready_queues_.size())); // See Note [Allocating GPUs to autograd threads] return device_ready_queues_.at(device.index()); } @@ -1195,8 +1200,7 @@ auto Engine::ready_queue_by_index(std::shared_ptr cpu_ready_queue, i TORCH_INTERNAL_ASSERT(cpu_ready_queue); return cpu_ready_queue; } else { - // Static cast is ok here as the number of device should never overflow an int. - TORCH_INTERNAL_ASSERT(0 <= device_index && device_index < static_cast(device_ready_queues_.size())); + TORCH_INTERNAL_ASSERT(0 <= device_index && device_index < static_cast(device_ready_queues_.size())); // See Note [Allocating GPUs to autograd threads] // NB: This function would become obsolete if we truly allocated a CPU thread // per device, rather than colocate. @@ -1205,15 +1209,29 @@ auto Engine::ready_queue_by_index(std::shared_ptr cpu_ready_queue, i } auto Engine::start_device_threads() -> void { + // First always initialize the thread pool for re-entrant threads + thread_pool_shared_ = std::make_shared(); + + // Second, create special threads for each non-CPU device // See Note [Allocating GPUs to autograd threads] c10::DeviceIndex num_devices = 0; for (const auto& impl_atomic : c10::impl::device_guard_impl_registry) { auto* impl = impl_atomic.load(); - if (impl) { + // Only record the number of devices for device that don't run on the + // cpu ready queue. 
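The routing rule introduced here can be summarized in a small standalone sketch: work for CPU-like devices (CPU, Meta, Lazy) stays on the per-GraphTask queue owned by the calling thread, while every other device index maps to a shared worker queue. The types below (ReadyQueue, route) are hypothetical stand-ins for illustration, not the engine's real classes:

#include <memory>
#include <vector>

enum class DeviceType { CPU, CUDA, Meta, Lazy };
struct ReadyQueue { /* work items would live here */ };

// Mirrors the should_run_in_cpu_ready_queue predicate in the diff.
bool runs_on_cpu_ready_queue(DeviceType t) {
  return t == DeviceType::CPU || t == DeviceType::Meta || t == DeviceType::Lazy;
}

std::shared_ptr<ReadyQueue> route(
    const std::shared_ptr<ReadyQueue>& cpu_ready_queue,             // owned by the graph task
    const std::vector<std::shared_ptr<ReadyQueue>>& device_queues,  // shared, one per device index
    DeviceType type,
    size_t index) {
  return runs_on_cpu_ready_queue(type) ? cpu_ready_queue
                                       : device_queues.at(index);   // throws if out of range
}

int main() {
  auto cpu_q = std::make_shared<ReadyQueue>();
  std::vector<std::shared_ptr<ReadyQueue>> device_qs = {std::make_shared<ReadyQueue>()};
  bool ok = route(cpu_q, device_qs, DeviceType::Lazy, 0) == cpu_q &&
            route(cpu_q, device_qs, DeviceType::CUDA, 0) == device_qs[0];
  return ok ? 0 : 1;
}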
+ if (impl && !should_run_in_cpu_ready_queue(impl->type())) { num_devices = std::max(num_devices, impl->deviceCount()); } } + // If there are no device except cpu, no need to create worker threads + if (num_devices == 0) { + return; + } + + // Since we're about to create threads, forking is not possible anymore + track_bad_autograd_forks(); + // allocate one thread for every GPU device (but colocate GPUs of different // types), and pre-allocate the device_ready_queues_ to ensure safe reading on it. device_ready_queues_ = std::vector>(num_devices); @@ -1221,8 +1239,6 @@ auto Engine::start_device_threads() -> void { queue = std::make_shared(); } - thread_pool_shared_ = std::make_shared(); - for (const auto i : c10::irange(num_devices)) { std::thread t(&Engine::thread_init, this, i, device_ready_queues_[i], true); t.detach(); @@ -1246,6 +1262,8 @@ void Engine::add_thread_pool_task(const std::weak_ptr& graph_task) { // Don't need to be holding the lock while actually creating the thread lck.unlock(); if (create_thread) { + // If we're creating a new thread, forking is not allowed anymore + track_bad_autograd_forks(); std::thread t(&Engine::reentrant_thread_init, this); t.detach(); } diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index ae0b32932184..6aae048432ce 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -340,8 +340,6 @@ struct TORCH_API Engine { bool is_checkpoint_valid(); - size_t ready_queue_size(const std::shared_ptr& graph_task, at::Device device); - // Should be called after fork to notify that worker threads are gone void release_workers(); diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index cc5fa59e9ed6..dfeb1c973df5 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -151,24 +151,21 @@ struct TORCH_API Node : std::enable_shared_from_this { // probably operate with names. 
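track_bad_autograd_forks, now invoked only when worker threads are actually about to be created, boils down to registering a pthread_atfork child handler exactly once. A minimal POSIX-only sketch of that idiom; the flag and function names are illustrative rather than the engine's:

#include <mutex>
#include <pthread.h>

namespace sketch {
bool in_bad_fork = false;                 // set in the child process after fork()

void child_handler() { in_bad_fork = true; }

void track_bad_forks() {
  static std::once_flag flag;
  std::call_once(flag, [] {
    // Only the child callback is needed; prepare/parent handlers stay null.
    pthread_atfork(/*prepare=*/nullptr, /*parent=*/nullptr, /*child=*/child_handler);
  });
}
}  // namespace sketch

int main() {
  // Called right before spawning workers, mirroring start_device_threads()
  // and add_thread_pool_task() in the diff.
  sketch::track_bad_forks();
  return sketch::in_bad_fork ? 1 : 0;
}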
at::NoNamesGuard no_names_guard; - bool pre_sampled = false; - if (at::shouldRunRecordFunction(&pre_sampled)) { - // Using RecordFunction to trigger observers in the backward pass - at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION, pre_sampled); - if (guard.isActive()) { - // Using sequence number and thread id to correlate with - // the forward pass function - guard.setForwardThreadId(thread_id_); - if (guard.needsInputs()) { - guard.before( - name(), - std::vector(inputs.begin(), inputs.end()), - sequence_nr()); - } else { - guard.before(name(), sequence_nr()); - } + auto step_callbacks = at::getStepCallbacks(at::RecordScope::BACKWARD_FUNCTION); + if (!step_callbacks.empty()) { + at::RecordFunction guard(std::move(step_callbacks)); + // Using sequence number and thread id to correlate with + // the forward pass function + guard.setForwardThreadId(thread_id_); + if (guard.needsInputs()) { + std::vector inputs_vec(inputs.begin(), inputs.end()); + guard.before( + name(), + c10::ArrayRef(inputs_vec.data(), inputs_vec.size()), + sequence_nr()); + } else { + guard.before(name(), sequence_nr()); } - // keeping stack guard object alive during the call return apply(std::move(inputs)); } else { return apply(std::move(inputs)); diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 20074c57008d..4e269d9f4e55 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -18,7 +18,7 @@ namespace torch { namespace autograd { #define CHECK_RESULT(RESULT, VAR) \ - if (!(RESULT.is_sparse() || VAR.is_sparse())) { \ + if (!(RESULT.is_sparse() || VAR.is_sparse() || RESULT.is_sparse_csr() || VAR.is_sparse_csr())) { \ if (!utils::obeys_layout_contract(RESULT, VAR)) { \ TORCH_WARN_ONCE("grad and param do not obey the gradient layout contract. 
" \ "This is not an error, but may impair performance.\n" \ @@ -105,7 +105,8 @@ struct TORCH_API AccumulateGrad : public Node { const T& update_grad) { if (!variable_grad.defined()) { if (!GradMode::is_enabled() && - !new_grad.is_sparse() && + !new_grad.is_sparse() && !new_grad.is_sparse_csr() && + !(variable.is_sparse_csr() && new_grad.layout() == at::kStrided) && new_grad.use_count() <= num_expected_refs && (new_grad.is_mkldnn() || utils::obeys_layout_contract(new_grad, variable))) { // we aren't setting up for double-backward @@ -139,7 +140,7 @@ struct TORCH_API AccumulateGrad : public Node { new_grad.sizes(), new_grad.options())); } else { - if (new_grad.is_sparse()) { + if (new_grad.is_sparse() || new_grad.is_sparse_csr()) { update_grad(new_grad.clone()); } else { if (new_grad.is_mkldnn()) { diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 890b7f715eae..2a5ec74f26e4 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -8,8 +9,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -18,22 +19,18 @@ #include #include #include -#include #include +#include #include #include +#include #include #include -struct DisableTorchDispatch { - DisableTorchDispatch() : guard_(c10::DispatchKey::Python) { - } - c10::impl::ExcludeDispatchKeyGuard guard_; -}; - PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { using namespace torch::autograd::profiler; + using namespace torch::profiler::impl; auto tensor_module = THPObjectPtr(PyImport_ImportModule("torch._tensor")); if (!tensor_module) return nullptr; @@ -75,17 +72,67 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("KINETO", ProfilerState::KINETO) .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK); + using torch::profiler::impl::ActiveProfilerType; + py::enum_(m, "ActiveProfilerType") + .value("NONE", ActiveProfilerType::NONE) + .value("LEGACY", ActiveProfilerType::LEGACY) + .value("KINETO", ActiveProfilerType::KINETO) + .value("NVTX", ActiveProfilerType::NVTX); + py::enum_(m, "ProfilerActivity") .value("CPU", ActivityType::CPU) .value("CUDA", ActivityType::CUDA); + py::class_(m, "_ExperimentalConfig") + .def(py::init< + std::vector /* profiler_metrics */, + bool /* profiler_measure_per_kernel */ + >(), + "An experimental config for Kineto features. 
Please note that" + "backward compatibility is not guaranteed.\n" + " profiler_metrics : a list of CUPTI profiler metrics used\n" + " to measure GPU performance events.\n" + " If this list contains values Kineto runs in CUPTI profiler mode\n" + " profiler_measure_per_kernel (bool) : whether to profile metrics per kernel\n" + " or for the entire measurement duration.", + py::arg("profiler_metrics") = std::vector(), + py::arg("profiler_measure_per_kernel") = false) + .def(py::pickle( + [](const ExperimentalConfig &p) { // __getstate__ + py::list py_metrics; + for (const auto& metric : p.profiler_metrics) { + py::bytes mbytes(metric); + py_metrics.append(mbytes); + } + /* Return a tuple that fully encodes the state of the config */ + return py::make_tuple( + py_metrics, p.profiler_measure_per_kernel); + }, + [](py::tuple t) { // __setstate__ + if (t.size() != 2) { + throw std::runtime_error("Expected 2 values in state"); + } + + py::list py_metrics = t[0].cast(); + std::vector metrics{py_metrics.size()}; + + for (const auto& py_metric : py_metrics) { + metrics.push_back(py::str(py_metric)); + } + + return ExperimentalConfig(std::move(metrics), t[1].cast()); + } + )); + + py::class_(m, "ProfilerConfig") .def(py::init()); py::class_(m, "ProfilerEvent") @@ -123,7 +170,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("ORT", c10::DeviceType::ORT) .value("XLA", c10::DeviceType::XLA) .value("Lazy", c10::DeviceType::Lazy) - .value("MLC", c10::DeviceType::MLC) + .value("MPS", c10::DeviceType::MPS) .value("HPU", c10::DeviceType::HPU) .value("Meta", c10::DeviceType::Meta) .value("Vulkan", c10::DeviceType::Vulkan) @@ -238,6 +285,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { m.def("_disable_profiler", disableProfiler); m.def("_prepare_profiler", prepareProfiler); m.def("_add_metadata_json", addMetadataJson); // Only if `USE_KINETO` is set + m.def("_kineto_step", profilerStep); // Only if `USE_KINETO` is set m.def("kineto_available", []() { return torch::profiler::kKinetoAvailable; }); // NOTICE: These record functions are not torch operators and may not show up @@ -246,28 +294,33 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { // Creates a new profiling scope using RecordFunction and invokes its starting // callbacks. m.def("_record_function_with_args_enter", [](const std::string& name, py::args args) { - auto rec = std::make_unique(at::RecordScope::USER_SCOPE); + using torch::autograd::profiler::PythonRecordFunction; + auto python_rec = c10::make_intrusive(at::RecordScope::USER_SCOPE); + auto *rec = &python_rec->record; if (rec->isActive()) { if (rec->needsInputs()) { auto iv_inputs = std::vector(); for (const auto& arg : args) { iv_inputs.push_back(torch::jit::toTypeInferredIValue(arg)); } - rec->before(name, iv_inputs); + rec->before(name, c10::ArrayRef(iv_inputs.data(), iv_inputs.size())); } else { rec->before(name); } } - return at::cpp_custom_type_hack::create(std::move(rec), at::TensorOptions()); + return torch::jit::toPyObject(std::move(python_rec)); }); // Ends the profiling scope created with record_function_with_param_enter. - m.def("_record_function_with_args_exit", [](const at::Tensor& handle) { - // We don't actually need to do anything with handle just need to persist the - // lifetime until now. 
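The _ExperimentalConfig binding above relies on pybind11's py::pickle, which takes a __getstate__ lambda packing the C++ fields into a tuple and a __setstate__ lambda rebuilding the object from it. A standalone example of the same idiom on a hypothetical Config struct (not part of the PyTorch API):

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace py = pybind11;

struct Config {
  std::vector<std::string> metrics;
  bool per_kernel;
  Config(std::vector<std::string> m, bool p) : metrics(std::move(m)), per_kernel(p) {}
};

PYBIND11_MODULE(pickle_example, m) {
  py::class_<Config>(m, "Config")
      .def(py::init<std::vector<std::string>, bool>(),
           py::arg("metrics") = std::vector<std::string>(),
           py::arg("per_kernel") = false)
      .def(py::pickle(
          [](const Config& c) {          // __getstate__: encode everything in a tuple
            return py::make_tuple(c.metrics, c.per_kernel);
          },
          [](py::tuple t) {              // __setstate__: validate and rebuild
            if (t.size() != 2) {
              throw std::runtime_error("Expected 2 values in state");
            }
            return Config(t[0].cast<std::vector<std::string>>(),
                          t[1].cast<bool>());
          }));
}

From Python, pickle.dumps/loads (or copy.deepcopy) on a Config instance then round-trips through exactly these two lambdas.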
- auto& rec = at::cpp_custom_type_hack::cast(handle); - rec.end(); - }); + m.def("_record_function_with_args_exit", + [](const py::object &obj) { + using torch::autograd::profiler::PythonRecordFunction; + auto python_record = torch::jit::toCustomClass(obj); + + // We don't actually need to do anything with handle just need to persist the + // lifetime until now. + python_record->record.end(); + }); m.def("_supported_activities", []() { std::set activities {ActivityType::CPU}; @@ -287,6 +340,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { disableProfilerLegacy, py::arg("profiler_disable_options") = ProfilerDisableOptions()); m.def("_profiler_enabled", profilerEnabled); + m.def("_profiler_type", torch::profiler::impl::profilerType); m.def("_enable_record_function", [](bool enable) { at::enableRecordFunction(enable); }); @@ -318,7 +372,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { py::class_(_C_m, "_InferenceMode") .def(py::init()); - py::class_(_C_m, "_DisableTorchDispatch") + py::class_(_C_m, "_RestorePythonTLSSnapshot") + .def(py::init<>()); + + // TODO: line up this binding with DisableTorchFunction + py::class_(_C_m, "_DisableTorchDispatch") .def(py::init<>()); py::class_(m, "SavedTensor") @@ -544,20 +602,57 @@ static PyObject * python_exit_dual_level(PyObject* _unused, PyObject* args, PyOb END_HANDLE_TH_ERRORS } -static PyObject * enter_python_mode(PyObject* _unused, PyObject* arg) { +static PyObject* set_torch_dispatch_mode(PyObject* _unused, PyObject* arg) { HANDLE_TH_ERRORS - PythonMode::enter(arg); + if (arg == Py_None) { + at::impl::TorchDispatchModeTLS::set_state(nullptr); + } else { + Py_INCREF(arg); + at::impl::TorchDispatchModeTLS::set_state( + std::make_shared(arg, getPyInterpreter())); + } Py_RETURN_NONE; END_HANDLE_TH_ERRORS } -static PyObject * exit_python_mode(PyObject* _unused, PyObject* arg) { +static PyObject* get_torch_dispatch_mode(PyObject* _unused, PyObject* _unused2) { HANDLE_TH_ERRORS - PythonMode::exit(); + const auto& mode = at::impl::TorchDispatchModeTLS::get_state(); + if (!mode) { + Py_RETURN_NONE; + } else { + auto* r = mode->ptr(getPyInterpreter()); + Py_INCREF(r); + return r; + } + END_HANDLE_TH_ERRORS +} + +static PyObject * set_torch_function_mode(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + if (arg == Py_None) { + at::impl::PythonTorchFunctionTLS::set_mode(nullptr); + } else { + Py_INCREF(arg); + at::impl::PythonTorchFunctionTLS::set_mode(std::make_shared(arg, getPyInterpreter())); + } Py_RETURN_NONE; END_HANDLE_TH_ERRORS } +static PyObject * get_torch_function_mode(PyObject* _unused, PyObject* _unused2) { + HANDLE_TH_ERRORS + const auto& mode = at::impl::PythonTorchFunctionTLS::get_mode(); + if (!mode) { + Py_RETURN_NONE; + } else { + auto* r = mode->ptr(getPyInterpreter()); + Py_INCREF(r); + return r; + } + END_HANDLE_TH_ERRORS +} + // autograd methods on torch._C static PyMethodDef methods[] = { // NOLINT {"_set_grad_enabled", set_grad_enabled, METH_O, nullptr}, @@ -580,8 +675,10 @@ static PyMethodDef methods[] = { // NOLINT {"is_anomaly_enabled", is_anomaly_mode_enabled, METH_NOARGS, nullptr}, {"_enter_dual_level", python_enter_dual_level, METH_NOARGS, nullptr}, {"_exit_dual_level", castPyCFunctionWithKeywords(python_exit_dual_level), METH_VARARGS | METH_KEYWORDS, nullptr}, - {"_enter_python_mode", enter_python_mode, METH_O, nullptr}, - {"_exit_python_mode", exit_python_mode, METH_NOARGS, nullptr}, + {"_set_torch_dispatch_mode", set_torch_dispatch_mode, METH_O, nullptr}, 
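_set_torch_dispatch_mode and _get_torch_dispatch_mode follow a common CPython-extension pattern: a METH_O setter that clears the slot when given None and otherwise stores an owned reference, plus a METH_NOARGS getter that returns either None or a new reference. A stripped-down, hypothetical module showing just the reference-count discipline; the real implementation additionally wraps the object in a SafePyObject tagged with the interpreter and stores it in ATen's dispatch-mode TLS rather than a bare thread_local:

#include <Python.h>

static thread_local PyObject* tls_mode = nullptr;

static PyObject* set_mode(PyObject* /*self*/, PyObject* arg) {
  Py_XDECREF(tls_mode);                  // drop the previously stored mode, if any
  if (arg == Py_None) {
    tls_mode = nullptr;
  } else {
    Py_INCREF(arg);                      // keep the new mode alive while stored
    tls_mode = arg;
  }
  Py_RETURN_NONE;
}

static PyObject* get_mode(PyObject* /*self*/, PyObject* /*unused*/) {
  if (tls_mode == nullptr) {
    Py_RETURN_NONE;
  }
  Py_INCREF(tls_mode);                   // caller receives a new reference
  return tls_mode;
}

static PyMethodDef mode_methods[] = {
    {"set_mode", set_mode, METH_O, nullptr},
    {"get_mode", get_mode, METH_NOARGS, nullptr},
    {nullptr, nullptr, 0, nullptr}};

static struct PyModuleDef mode_module = {
    PyModuleDef_HEAD_INIT, "mode_example", nullptr, -1, mode_methods};

PyMODINIT_FUNC PyInit_mode_example(void) { return PyModule_Create(&mode_module); }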
+ {"_get_torch_dispatch_mode", get_torch_dispatch_mode, METH_NOARGS, nullptr}, + {"_set_torch_function_mode", set_torch_function_mode, METH_O, nullptr}, + {"_get_torch_function_mode", get_torch_function_mode, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr} }; diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp index 3663e1ee9915..71cc6e06d4d6 100644 --- a/torch/csrc/autograd/input_buffer.cpp +++ b/torch/csrc/autograd/input_buffer.cpp @@ -34,11 +34,14 @@ namespace { } else { switch (var.layout()) { case c10::kSparseCsr: + case c10::kSparseCsc: + case c10::kSparseBsr: + case c10::kSparseBsc: { auto* impl = at::sparse_csr::get_sparse_csr_impl(var); guard.recordDataPtrOnStream(impl->values().storage().data_ptr(), stream); - guard.recordDataPtrOnStream(impl->crow_indices().storage().data_ptr(), stream); - guard.recordDataPtrOnStream(impl->col_indices().storage().data_ptr(), stream); + guard.recordDataPtrOnStream(impl->compressed_indices().storage().data_ptr(), stream); + guard.recordDataPtrOnStream(impl->plain_indices().storage().data_ptr(), stream); break; } case c10::kSparse: diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index b8bba50c4063..db48e94e079f 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -4,11 +4,13 @@ #include #include #include +#include +#include +#include -#include -#include -#include #include +#include +#include #include #include @@ -116,56 +118,132 @@ void _push_reverse_order(PyTraceEvent* e, std::vector& names) { namespace { using torch::profiler::impl::ProfilerThreadLocalStateBase; using torch::profiler::impl::ActiveProfilerType; +using torch::profiler::impl::Result; +using torch::profiler::impl::kineto::annotation_t; +using torch::profiler::impl::shapesToStr; +using torch::profiler::impl::dtypesToStr; +using torch::profiler::impl::stacksToStr; + +struct MemoryEventData { + torch::profiler::impl::approx_time_t start_time; + void* ptr; + int64_t alloc_size; + int64_t total_allocated; + int64_t total_reserved; + uint64_t threadID; + torch::profiler::impl::kineto::DeviceAndResource kineto_info; + c10::DeviceType device_type; + c10::DeviceIndex device_index; +}; +static_assert(std::is_pod::value, "Non-POD member of MemoryEventData."); + +struct EventFieldsVisitor { + EventFieldsVisitor(const Result& result, KinetoEvent& kineto_event) + : result_{result}, kineto_event_{kineto_event} { + handleJIT(result_.get().jit_stack_, result_.get().jit_modules_); + c10::visit(*this, result.event_); + } + + void operator()(const torch::profiler::impl::OpEvent& op_event) { + kineto_event_.get() + .endThreadId(op_event.end_thread_id_) + .scope(op_event.record_function_scope_) + .setAsync(op_event.is_async_) + .debugHandle(op_event.debug_handle_); + + auto& shapes = result_.get().inputs_.shapes_; + if (!shapes.empty()) { + kineto_event_.get().shapes(shapes); + annotations_.emplace_back("Input Dims", shapesToStr(shapes)); + } + + auto& dtypes = result_.get().inputs_.dtypes_; + if (!dtypes.empty()) { + kineto_event_.get().dtypes(dtypes); + annotations_.emplace_back("Input type", dtypesToStr(dtypes)); + } + + if (!result_.get().extra_args_.empty()) { + kineto_event_.get().flops( + computeFlops(result_.get().name(), result_.get().extra_args_)); + } + kineto_event_.get().cuda_event_start_ = + result_.get().gpu_fallback_.cuda_event_start_; + kineto_event_.get().cuda_event_end_ = + result_.get().gpu_fallback_.cuda_event_end_; + + // add information about an 
associated forward op, if a sequence number + // is available (e.g. during training) + if (op_event.sequence_number_ >= 0) { + kineto_event_.get() + .sequenceNr(op_event.sequence_number_) + .fwdThreadId(op_event.forward_thread_id_); + annotations_.emplace_back( + "Fwd thread id", std::to_string(op_event.forward_thread_id_)); + annotations_.emplace_back( + "Sequence number", std::to_string(op_event.sequence_number_)); + } + } + + void operator()(const torch::profiler::impl::BackendEvent& backend_event) { + kineto_event_.get() + .endThreadId(result_.get().start_tid_) + .scope(backend_event.record_function_scope_) + .debugHandle(backend_event.debug_handle_) + .backend(backend_event.backend_); + + if (!backend_event.backend_.empty()) { + annotations_.emplace_back( + "Backend", "\"" + backend_event.backend_ + "\""); + } + } -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -struct OpEventData { - // POD members - int64_t start_us_; - int64_t end_us_; - uint64_t correlation_id_; - uint64_t start_thread_id_; - uint64_t end_thread_id_; - int64_t sequence_number_; - uint64_t forward_thread_id_; - uint8_t record_function_scope_; - bool is_async_; - int64_t debug_handle_; - torch::profiler::impl::kineto::DeviceAndResource kineto_info_; - - std::string name_; - - // report_input_shapes - std::vector> shapes_; - std::vector dtypes_; - - // with_stack - std::vector stack_; - - // with_modules - c10::optional> module_hierarchy_; - - // with_flops - std::unordered_map extra_args_; - - // reportBackendEventToActiveKinetoProfiler - c10::optional backend_; - - // ProfilerState::KINETO_GPU_FALLBACK - torch::profiler::impl::CUDAEventStub cuda_event_start_ = nullptr; - torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr; + void handleJIT( + const std::vector& jit_stack, + const std::vector& jit_modules) { + if (!jit_stack.empty()) { + // NB: This is only for the JIT stack. The python stack (if applicable) + // is constructed later. + kineto_event_.get().stack(jit_stack); + annotations_.emplace_back( + "Call stack", torch::profiler::impl::stacksToStr(jit_stack, ";")); + } + + if (!jit_modules.empty()) { + kineto_event_.get().moduleHierarchy(jit_modules); + annotations_.emplace_back( + "Module Hierarchy", + torch::profiler::impl::stacksToStr(jit_modules, ".")); + } + } + + std::reference_wrapper result_; + std::reference_wrapper kineto_event_; + annotation_t annotations_; }; +auto getAnnotations(const MemoryEventData& event) { + torch::profiler::impl::kineto::annotation_t out{ + {"Device Type", std::to_string((int8_t)event.device_type)}, + {"Device Id", std::to_string(event.device_index)}, + {"Addr", std::to_string(reinterpret_cast(event.ptr))}, + {"Bytes", std::to_string(event.alloc_size)}}; + + if (event.total_allocated >= 0) { + out.emplace_back("Total Allocated", std::to_string(event.total_allocated)); + } + if (event.total_reserved >= 0) { + out.emplace_back("Total Reserved", std::to_string(event.total_reserved)); + } + return out; +} + // Assumption: Total threads number will not exceed 2^16-1, and total ops will // not exceed 2^48 -1. 
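EventFieldsVisitor dispatches on the event payload via c10::visit with one operator() per alternative (OpEvent versus BackendEvent). The same shape in standard C++17, as a self-contained example with simplified, hypothetical event structs:

#include <cstdint>
#include <iostream>
#include <string>
#include <variant>

struct OpEvent { int64_t sequence_number; };
struct BackendEvent { std::string backend; };
using Event = std::variant<OpEvent, BackendEvent>;

// One overload per alternative; std::visit picks the right one at runtime.
struct FieldsVisitor {
  void operator()(const OpEvent& e) const {
    std::cout << "op event, seq=" << e.sequence_number << "\n";
  }
  void operator()(const BackendEvent& e) const {
    std::cout << "backend event, backend=" << e.backend << "\n";
  }
};

int main() {
  Event a = OpEvent{42};
  Event b = BackendEvent{"XNNPACK"};
  std::visit(FieldsVisitor{}, a);
  std::visit(FieldsVisitor{}, b);
}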
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) { return (((tid) << 48) | ((seqNr) & (((uint64_t)1 << 48) - 1))); } -struct KinetoObserverContext : public at::ObserverContext { - explicit KinetoObserverContext(OpEventData* data) : data_(data) {} - OpEventData* data_; -}; - struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { explicit KinetoThreadLocalState( const ProfilerConfig& config, @@ -173,6 +251,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { : ProfilerThreadLocalStateBase(config), start_time_(getTimeUs()), activities_(std::move(activities)), + record_queue_(config), cpu_trace_(start_time_, "PyTorch Profiler") {} ~KinetoThreadLocalState() override = default; @@ -191,12 +270,6 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { return config().with_stack && activities_.count(ActivityType::CPU); } - std::unique_ptr newOpEvent() { - std::lock_guard guard(state_mutex_); - op_events_.emplace_back(); - return std::make_unique(&op_events_.back()); - } - void reportMemoryUsage( void* ptr, int64_t alloc_size, @@ -205,38 +278,24 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { c10::Device device) override { if (config_.profile_memory && config_.state != ProfilerState::Disabled) { std::lock_guard guard(state_mutex_); - auto start_time = getTimeUs(); - if (cpu_trace_) { - torch::profiler::impl::kineto::recordThreadInfo(); - cpu_trace_.addMemoryUsageActivity( - kMemoryEventName, - torch::profiler::impl::kineto::kineto_ids(), - start_time, - device, - ptr, - alloc_size, - total_allocated, - total_reserved); - } - - kineto_events_.emplace_back(); - auto& evt = kineto_events_.back(); - evt.name(kMemoryEventName) - .startUs(start_time) - .deviceIndex(device.index()) - .deviceType(device.type()) - .nBytes(alloc_size) - .startThreadId(at::RecordFunction::currentThreadId()); + memory_events_.emplace_back( + torch::profiler::impl::getApproximateTime(), + ptr, + alloc_size, + total_allocated, + total_reserved, + at::RecordFunction::currentThreadId(), + torch::profiler::impl::kineto::kineto_ids(), + device.type(), + device.index()); } } - const std::function&)>& - getEventPostProcessingCallback() const { + const post_process_t& getEventPostProcessingCallback() const { return event_post_process_cb_; } - void setEventPostProcessingCallback( - std::function&)>&& cb) { + void setEventPostProcessingCallback(post_process_t&& cb) { event_post_process_cb_ = std::move(cb); } @@ -244,82 +303,90 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { auto end_time = getTimeUs(); materializeOpEvents(); - // Call events post processing callback before finalizing trace, if there is - // one. 
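getForwardThreadKey puts the start thread id in the top 16 bits and the sequence number in the low 48 bits of a single uint64_t, which is exactly where the 2^16-1 and 2^48-1 assumptions in the comment come from. A small self-contained check of that packing (the helper names are illustrative):

#include <cassert>
#include <cstdint>

constexpr uint64_t kSeqMask = (uint64_t(1) << 48) - 1;

constexpr uint64_t pack(uint64_t tid, uint64_t seq) { return (tid << 48) | (seq & kSeqMask); }
constexpr uint64_t unpack_tid(uint64_t key) { return key >> 48; }
constexpr uint64_t unpack_seq(uint64_t key) { return key & kSeqMask; }

int main() {
  constexpr uint64_t key = pack(/*tid=*/7, /*seq=*/123456789);
  static_assert(unpack_tid(key) == 7, "tid round-trips");
  static_assert(unpack_seq(key) == 123456789, "seq round-trips");
  // A tid at or above 2^16 silently collides with a smaller one, hence the assumption.
  assert(unpack_tid(pack(0x1FFFF, 0)) == 0xFFFF);
}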
- if (getEventPostProcessingCallback()) { - getEventPostProcessingCallback()(kineto_events_); - } - finalizeCPUTrace(cpu_trace_.get()); { std::lock_guard guard(state_mutex_); cpu_trace_.transferCpuTrace(end_time); } - auto trace = torch::profiler::impl::kineto::stopTrace(); - TORCH_CHECK(trace || !torch::profiler::kKinetoAvailable); - addTraceEvents(trace); - return trace; + if (config().state != ProfilerState::KINETO_ONDEMAND) { + auto trace = torch::profiler::impl::kineto::stopTrace(); + TORCH_CHECK(trace || !torch::profiler::kKinetoAvailable); + addTraceEvents(trace); + return trace; + } else { + return torch::profiler::impl::kineto::ActivityTraceWrapper(); + } } void materializeOpEvents() { std::lock_guard guard(state_mutex_); - for (const auto& e : op_events_) { - if (e.end_us_ < e.start_us_) { + auto converter = clock_converter_.makeConverter(); + + for (const auto& e : memory_events_) { + auto start_time_us = converter(e.start_time) / 1000; + cpu_trace_.addCPUActivity( + kMemoryEventName, + torch::profiler::impl::kineto::KinetoActivityType::CPU_INSTANT_EVENT, + e.kineto_info, + /*correlation_id=*/0, + start_time_us, + start_time_us, + getAnnotations(e)); + + kineto_events_.emplace_back(); + auto& evt = kineto_events_.back(); + evt.name(kMemoryEventName) + .startUs(start_time_us) + .deviceIndex(e.device_index) + .deviceType(e.device_type) + .nBytes(e.alloc_size) + .startThreadId(e.threadID); + } + memory_events_.clear(); + + for (auto& e : record_queue_.getRecords(converter)) { + // `take_data` handles time conversion. + int64_t start_us = e.start_time_us_; + int64_t end_us = e.end_time_us_; + + if (end_us < start_us) { // We initialize end_us_ to the smallest int64_t, so this means that // the op did not finish before we stopped profiling. continue; } - cpu_trace_.addCPUActivity( - e.name_, - e.kineto_info_, - e.correlation_id_, - e.start_us_, - e.end_us_); + // Call events post processing callback before finalizing trace, if there + // is one. + if (getEventPostProcessingCallback()) { + getEventPostProcessingCallback()( + c10::visit([](const auto& i) { return i.debug_handle_; }, e.event_), + e.jit_stack_, + e.jit_modules_); + } kineto_events_.emplace_back(); kineto_events_.back() - .name(e.name_) - .startUs(e.start_us_) - .durationUs(e.end_us_ - e.start_us_) - .correlationId(e.correlation_id_) + .name(e.name()) + .startUs(start_us) + .durationUs(end_us - start_us) + .correlationId(e.correlation_id()) .deviceType(c10::DeviceType::CPU) - .startThreadId(e.start_thread_id_) - .endThreadId(e.end_thread_id_) - .sequenceNr(e.sequence_number_) - .fwdThreadId(e.forward_thread_id_) - .scope(e.record_function_scope_) - .setAsync(e.is_async_) - .debugHandle(e.debug_handle_); - - if (!e.shapes_.empty()) { - kineto_events_.back().shapes(e.shapes_); - } - - if (!e.dtypes_.empty()) { - kineto_events_.back().dtypes(e.dtypes_); - } + .startThreadId(e.start_tid_); - if (!e.stack_.empty()) { - kineto_events_.back().stack(e.stack_); - } + // NB: also sets fields on `kineto_events_.back()`. 
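materializeOpEvents drops any record whose end time is still earlier than its start: end_time_ is initialized to the smallest int64_t, so such a record never received an exit callback before profiling stopped. A tiny sketch of that sentinel check, with a hypothetical RawEvent struct:

#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

struct RawEvent {
  int64_t start_us;
  int64_t end_us;
};

int main() {
  const int64_t kUnfinished = std::numeric_limits<int64_t>::min();  // the sentinel
  std::vector<RawEvent> events = {{10, 25}, {30, kUnfinished}};
  for (const auto& e : events) {
    if (e.end_us < e.start_us) {
      continue;  // profiler stopped before this op completed; skip it
    }
    std::cout << "materialize [" << e.start_us << ", " << e.end_us << "] us\n";
  }
}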
+ auto annotations = + EventFieldsVisitor(e, kineto_events_.back()).annotations_; - if (e.module_hierarchy_) { - kineto_events_.back().moduleHierarchy(*e.module_hierarchy_); - } - - if (!e.extra_args_.empty()) { - kineto_events_.back().flops( - computeFlops(std::string(e.name_), e.extra_args_)); - } - if (e.backend_) { - kineto_events_.back().backend(*e.backend_); - } - kineto_events_.back().cuda_event_start_ = e.cuda_event_start_; - kineto_events_.back().cuda_event_end_ = e.cuda_event_end_; + cpu_trace_.addCPUActivity( + e.name(), + e.kinetoType(), + e.kineto_info_, + e.correlation_id(), + start_us, + end_us, + annotations); } - op_events_.clear(); } void finalizeCPUTrace(std::unique_ptr& cpu_trace) { @@ -331,46 +398,29 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { // startThreadId_seqNum to pointer of activity. // Low-16bits of startThreadId and low-48bits seqNum are concatenated into // one uint64_t variable as key. + + // From the time being, we need disable the forward/backward correlation feature to + // workaround the crash bug. + // TODO: by Mike Guo + // reenable the forward/backward correlation when kineto fix the following raw pointer + // GenericTraceActivity.flow.linkedActivity + + /* std::unordered_map tidSeq2activity; - uint64_t fwd_bwd_link_id = 1; for (const auto idx : c10::irange(cpu_trace->activities.size())) { auto& kineto_event = kineto_events_[idx]; auto& activity = cpu_trace->activities[idx]; - if (kineto_event.hasShapes()) { - activity.addMetadata("Input Dims", torch::profiler::impl::shapesToStr(kineto_event.shapes())); - } - if (kineto_event.hasStack()) { - // NB: This is only for the JIT stack. The python stack (if applicable) - // is constructed later. - activity.addMetadata( - "Call stack", torch::profiler::impl::stacksToStr(kineto_event.stack(), ";")); - } - if (kineto_event.hasModuleHierarchy()) { - activity.addMetadata( - "Module Hierarchy", - torch::profiler::impl::stacksToStr(kineto_event.moduleHierarchy(), ".")); - } - if (kineto_event.hasTypes()) { - activity.addMetadata("Input type", torch::profiler::impl::dtypesToStr(kineto_event.dtypes())); - } - if (!kineto_event.backend().empty()) { - activity.addMetadata("Backend", "\"" + kineto_event.backend() + "\""); - } - // add information about an associated forward op, if a sequence number // is available (e.g. during training) if (kineto_event.sequenceNr() >= 0) { - activity.addMetadata( - "Fwd thread id", std::to_string(kineto_event.fwdThreadId())); - activity.addMetadata( - "Sequence number", std::to_string(kineto_event.sequenceNr())); generateForwardBackwardLink( kineto_event, fwd_bwd_link_id, activity, tidSeq2activity); } } + */ addPythonEvents(cpu_trace); } @@ -406,7 +456,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { py_event_indices_{ { nullptr, std::string("null") }}; - for (size_t i = 0; i < py_events.size(); i++) { + for (const auto i : c10::irange(py_events.size())) { py_event_indices_.insert({py_events[i].get(), std::to_string(i)}); } @@ -450,7 +500,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { op_py_map.insert({t, py_stack.size() ? 
py_stack.back() : nullptr}); } - auto activities = std::move(cpu_trace->activities); + std::vector py_activities; auto py_events_it = py_events.begin(); auto py_device = libkineto::processId(); auto main_thread = libkineto::systemThreadId(); @@ -471,13 +521,13 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { op.addMetadata("Python module id", module_id_map_.at(e->module_id_)); } - cpu_trace->activities.push_back(op); + py_activities.push_back(op); py_events_it++; }; - TORCH_INTERNAL_ASSERT(activities.size() == kineto_events_.size()); - for (const auto idx : c10::irange(activities.size())) { - auto& activity = activities[idx]; + TORCH_INTERNAL_ASSERT(cpu_trace->activities.size() == kineto_events_.size()); + for (const auto idx : c10::irange(cpu_trace->activities.size())) { + auto& activity = cpu_trace->activities[idx]; // Add any python events that occurred between this Kineto event and the // previous Kineto event. @@ -498,14 +548,14 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { kineto_events_[idx].stack(py_names); activity.addMetadata("Call stack", torch::profiler::impl::stacksToStr(py_names, ";")); } - - cpu_trace->activities.push_back(activity); } // Add any Python events which finish after the last Kineto event. while (py_events_it != py_events.end()) { push_py_event(); } + + cpu_trace->activities.insert(cpu_trace->activities.end(), py_activities.begin(), py_activities.end()); } void generateForwardBackwardLink( @@ -521,12 +571,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { auto iter = tidSeq2activity.find(key); if (iter != tidSeq2activity.end()) { libkineto::GenericTraceActivity* fwd = iter->second; -#ifdef USE_KINETO_UPDATED fwd->flow.start = true; -#else - activity.flow.linkedActivity = fwd; // Only destination side set this, - // to distinguish with start side. -#endif activity.flow.id = fwd->flow.id = fwd_bwd_link_id; activity.flow.type = fwd->flow.type = libkineto::kLinkFwdBwd; ++fwd_bwd_link_id; @@ -558,6 +603,9 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { #ifdef USE_KINETO const auto& events = *(trace.get()->activities()); for (const auto& ev_ptr : events) { + if (ev_ptr == nullptr) { + continue; + } const auto& activity = *ev_ptr; // These events are already processed if (activity.type() != libkineto::ActivityType::CPU_OP && @@ -583,101 +631,100 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { } uint64_t start_time_; + torch::profiler::impl::ApproximateClockToUnixTimeConverter clock_converter_; std::set activities_; - std::deque op_events_; + torch::profiler::impl::RecordQueue record_queue_; + torch::profiler::impl::AppendOnlyList memory_events_; torch::profiler::impl::kineto::TraceWrapper cpu_trace_; std::vector kineto_events_; // Optional, if event post-processing is enabled. - std::function&)> event_post_process_cb_; + post_process_t event_post_process_cb_; }; +static std::unique_ptr globalStatePtr; + +template +static void initGlobalState(Args... 
args) { + if (globalStatePtr) { + LOG(WARNING) << "GlobalStatePtr already exists!"; + } else { + globalStatePtr = std::make_unique(std::forward(args)...); + } +} + +static void resetGlobalState() { + TORCH_INTERNAL_ASSERT(globalStatePtr != nullptr, "Global state ptr cannot be null before resetting"); + globalStatePtr.reset(); +} + +template +static KinetoThreadLocalState* getStatePtr() { + return c10::guts::if_constexpr( + [] { return globalStatePtr.get(); }, + [] { return KinetoThreadLocalState::getTLS(); }); +} + +template +std::unique_ptr onFunctionEnter(const at::RecordFunction& fn) { + auto state_ptr = getStatePtr(); + if (!state_ptr) { + return nullptr; + } + auto corr_id = next_correlation_id(); + if (fn.scope() == at::RecordScope::USER_SCOPE) { + torch::profiler::impl::kineto::pushUserCorrelationId(corr_id); + } else { + torch::profiler::impl::kineto::pushCorrelationId(corr_id); + } + return state_ptr->record_queue_.getSubqueue()->begin_op(fn, corr_id); +} + +// @lint-ignore CLANGTIDY clang-diagnostic-unused-parameter +template +void onFunctionExit(const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { + auto state_ptr = getStatePtr(); + if (!state_ptr) { + return; + } + const auto& config = state_ptr->config(); + auto* kineto_ctx_ptr = + static_cast(ctx_ptr); + TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); + kineto_ctx_ptr->event_->end_time_ = torch::profiler::impl::getApproximateTime(); + kineto_ctx_ptr->event_->end_thread_id_ = at::RecordFunction::currentThreadId(); + if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { + try { + auto fallback = kineto_ctx_ptr->fallback_; + TORCH_INTERNAL_ASSERT(fallback != nullptr); + torch::profiler::impl::cudaStubs()->record( + nullptr, &fallback->cuda_event_end_, nullptr); + } catch (const std::exception& e) { + LOG(WARNING) << "Failed to record CUDA event. 
" << e.what(); + } + } + + if (fn.scope() == at::RecordScope::USER_SCOPE) { + torch::profiler::impl::kineto::popUserCorrelationId(); + } else { + torch::profiler::impl::kineto::popCorrelationId(); + } +} + +template void pushProfilingCallbacks(const std::unordered_set& scopes) { - auto registration_state_ptr = KinetoThreadLocalState::getTLS(); + auto registration_state_ptr = getStatePtr(); TORCH_INTERNAL_ASSERT(registration_state_ptr, "Expected profiler state set"); - auto handle = at::addThreadLocalCallback( + auto recordFunctionCallback = at::RecordFunctionCallback( - [](const at::RecordFunction& fn) - -> std::unique_ptr { - auto state_ptr = KinetoThreadLocalState::getTLS(); - if (!state_ptr) { - return nullptr; - } - const auto& config = state_ptr->config(); - auto corr_id = next_correlation_id(); - torch::profiler::impl::kineto::pushCorrelationId(corr_id); - - auto ctx_ptr = state_ptr->newOpEvent(); - auto data_ptr = ctx_ptr->data_; - - data_ptr->end_us_ = std::numeric_limits::min(); - data_ptr->correlation_id_ = corr_id; - data_ptr->start_thread_id_ = fn.threadId(); - data_ptr->sequence_number_ = fn.seqNr(); - data_ptr->forward_thread_id_ = fn.forwardThreadId(); - data_ptr->record_function_scope_ = (uint8_t)fn.scope(); - data_ptr->is_async_ = fn.isAsync(); - data_ptr->debug_handle_ = fn.debugHandle(); - data_ptr->kineto_info_ = torch::profiler::impl::kineto::kineto_ids(); - data_ptr->name_ = fn.name(); - if (config.report_input_shapes) { - data_ptr->shapes_ = torch::profiler::impl::inputSizes(fn); - data_ptr->dtypes_ = torch::profiler::impl::inputTypes(fn); - } -#if !defined BUILD_LITE_INTERPRETER && !defined C10_MOBILE - // backward nodes source range corresponds to the forward node - // TODO: consider using C++ stack trace - if (config.with_stack && - fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = torch::profiler::impl::prepareCallstack(jit::currentCallstack()); - data_ptr->stack_ = callstackStr(cs); - } - if (config.with_modules && - fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - data_ptr->module_hierarchy_ = jit::currentModuleHierarchy(); - } -#endif - if (config.with_flops) { - data_ptr->extra_args_ = torch::profiler::impl::saveExtraArgs(fn); - } - data_ptr->start_us_ = getTimeUs(); - - if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { - try { - torch::profiler::impl::cudaStubs()->record( - nullptr, &data_ptr->cuda_event_start_, nullptr); - } catch (const std::exception& e) { - LOG(WARNING) << "Failed to record CUDA event. " << e.what(); - } - } - return ctx_ptr; - }, - [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { - auto state_ptr = KinetoThreadLocalState::getTLS(); - if (!state_ptr) { - return; - } - const auto& config = state_ptr->config(); - auto* kineto_ctx_ptr = - static_cast(ctx_ptr); - TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); - auto data_ptr = kineto_ctx_ptr->data_; - data_ptr->end_us_ = getTimeUs(); - data_ptr->end_thread_id_ = at::RecordFunction::currentThreadId(); - - if (config.state == ProfilerState::KINETO_GPU_FALLBACK) { - try { - torch::profiler::impl::cudaStubs()->record( - nullptr, &data_ptr->cuda_event_end_, nullptr); - } catch (const std::exception& e) { - LOG(WARNING) << "Failed to record CUDA event. 
" << e.what(); - } - } - - torch::profiler::impl::kineto::popCorrelationId(); - torch::profiler::impl::kineto::recordThreadInfo(); - }) + onFunctionEnter, + onFunctionExit) .needsInputs(registration_state_ptr->config().report_input_shapes) - .scopes(scopes)); + .scopes(scopes); + + auto handle = c10::guts::if_constexpr( + [&] { return at::addGlobalCallback(recordFunctionCallback); }, + [&] { return at::addThreadLocalCallback(recordFunctionCallback); + }); registration_state_ptr->setCallbackHandle(handle); } @@ -690,26 +737,21 @@ void reportBackendEventToActiveKinetoProfiler( const at::RecordScope scope, const std::string& event_name, const std::string& backend_name) { + TORCH_INTERNAL_ASSERT(globalStatePtr == nullptr, "On-demand profiling does not support post processing callback"); + auto state_ptr = KinetoThreadLocalState::getTLS(); if (!state_ptr) { return; } - auto ctx_ptr = state_ptr->newOpEvent(); - auto data_ptr = ctx_ptr->data_; - data_ptr->start_us_ = start_time_us; - data_ptr->end_us_ = end_time_us; - data_ptr->correlation_id_ = std::numeric_limits::max(); - data_ptr->start_thread_id_ = at::RecordFunction::currentThreadId(); - data_ptr->end_thread_id_ = data_ptr->start_thread_id_; - data_ptr->sequence_number_ = -1; - data_ptr->forward_thread_id_ = data_ptr->start_thread_id_; - data_ptr->record_function_scope_ = (uint8_t)scope; - data_ptr->is_async_ = false; - data_ptr->debug_handle_ = debug_handle; - data_ptr->kineto_info_ = torch::profiler::impl::kineto::kineto_ids(); - data_ptr->name_ = event_name; - data_ptr->backend_ = backend_name; + state_ptr->record_queue_.getSubqueue()->emplace_backend_event( + torch::profiler::impl::BackendEvent { + start_time_us, + end_time_us, + (uint8_t)scope, + debug_handle, + event_name, + backend_name}); /* no support for input shapes now? 
if (config.report_input_shapes) { @@ -717,8 +759,6 @@ void reportBackendEventToActiveKinetoProfiler( ctx_ptr->dtypes = inputTypes(fn); } */ - - torch::profiler::impl::kineto::recordThreadInfo(); } void prepareProfiler( @@ -732,17 +772,19 @@ void prepareProfiler( config.state == ProfilerState::KINETO_GPU_FALLBACK, "Supported only in Kineto profiler"); torch::profiler::impl::kineto::prepareTrace( - /*cpuOnly=*/!at::hasCUDA(), activities); + /*cpuOnly=*/!at::hasCUDA(), activities, config.experimental_config); } void enableProfilerWithEventPostProcess( const torch::profiler::impl::ProfilerConfig& config, const std::set& activities, - std::function&)>&& cb, + post_process_t&& cb, const std::unordered_set& scopes) { TORCH_CHECK( config.state != ProfilerState::NVTX, "NVTX does not support post processing callback."); + TORCH_INTERNAL_ASSERT(globalStatePtr == nullptr, "On-demand profiling does not support post processing callback"); + enableProfiler(config, activities, scopes); auto state_ptr = KinetoThreadLocalState::getTLS(); state_ptr->setEventPostProcessingCallback(std::move(cb)); @@ -760,36 +802,44 @@ void enableProfiler( TORCH_CHECK( config.state == ProfilerState::KINETO || - config.state == ProfilerState::KINETO_GPU_FALLBACK); + config.state == ProfilerState::KINETO_GPU_FALLBACK || + config.state == ProfilerState::KINETO_ONDEMAND); TORCH_CHECK( !activities.empty(), "No activities specified for Kineto profiler"); - auto state = std::make_shared(config, activities); - c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + if (config.state == ProfilerState::KINETO || + config.state == ProfilerState::KINETO_GPU_FALLBACK) { + auto state = std::make_shared(config, activities); + c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - if (state->tracePython()) { - python_tracer::call(python_tracer::Command::kStartOne); - } + if (state->tracePython()) { + python_tracer::call(python_tracer::Command::kStartOne); + } - if (activities.count(ActivityType::CPU)) { - pushProfilingCallbacks(scopes); + if (activities.count(ActivityType::CPU)) { + pushProfilingCallbacks(scopes); + } + torch::profiler::impl::kineto::startTrace(); } - torch::profiler::impl::kineto::startTrace(); + if (config.state == ProfilerState::KINETO_ONDEMAND) { + initGlobalState(config, activities); + + TORCH_INTERNAL_ASSERT(activities.count(ActivityType::CPU), "Ondemand profiling must enable CPU tracing"); + pushProfilingCallbacks(scopes); + } } std::unique_ptr disableProfiler() { - // all the DebugInfoBase objects are scope based and supposed to use - // DebugInfoGuard - auto state = - c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + auto state_ptr = static_cast( + (globalStatePtr == nullptr) ? 
getStatePtr() : getStatePtr()); - auto state_ptr = static_cast(state.get()); const auto& config = state_ptr->config(); TORCH_CHECK( state_ptr && (config.state == ProfilerState::KINETO || config.state == ProfilerState::KINETO_GPU_FALLBACK || + config.state == ProfilerState::KINETO_ONDEMAND || config.state == ProfilerState::NVTX), "Can't disable Kineto profiler when it's not running"); @@ -797,24 +847,42 @@ std::unique_ptr disableProfiler() { at::removeCallback(state_ptr->callbackHandle()); } - if (state_ptr->config().state == ProfilerState::NVTX) { + // Traces are converged via libkineto automatically for ondemand flow + if (state_ptr->config().state == ProfilerState::KINETO_ONDEMAND) { + auto kineto_state_ptr = static_cast(state_ptr); + auto trace = kineto_state_ptr->finalizeTrace(); + resetGlobalState(); return std::make_unique(); } - auto kineto_state_ptr = static_cast(state_ptr); - if (kineto_state_ptr->tracePython()) { - python_tracer::call(python_tracer::Command::kStop); + // Shared among NVTX, KINETO, KINETO_GPU_FALLBACK + std::unique_ptr result; + if (state_ptr->config().state == ProfilerState::NVTX) { + result = std::make_unique(); } - auto trace = kineto_state_ptr->finalizeTrace(); - if (kineto_state_ptr->tracePython()) { - python_tracer::call(python_tracer::Command::kClear); + if (config.state == ProfilerState::KINETO || + config.state == ProfilerState::KINETO_GPU_FALLBACK) { + auto kineto_state_ptr = static_cast(state_ptr); + if (kineto_state_ptr->tracePython()) { + python_tracer::call(python_tracer::Command::kStop); + } + + auto trace = kineto_state_ptr->finalizeTrace(); + if (kineto_state_ptr->tracePython()) { + python_tracer::call(python_tracer::Command::kClear); + } + + result = std::make_unique( + kineto_state_ptr->start_time_, + std::move(kineto_state_ptr->kineto_events_), + std::move(trace)); } - return std::make_unique( - kineto_state_ptr->start_time_, - std::move(kineto_state_ptr->kineto_events_), - std::move(trace)); + // Disable thread-local profiler. We can't pop until the very end as it would invalidate + // the `state_ptr` reference which we need to process the traces. + (void)c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + return result; } int64_t KinetoEvent::cudaElapsedUs() const { diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index c7b130c9c250..c98009631766 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -344,10 +344,14 @@ TORCH_API void enableProfiler( * callback, via enableProfilerWithEventPostProcess, that takes these debug handles * and generates stack trace and module hierarchy information, once profiling is done. 
*/ +using post_process_t = std::function&, + /*jit_modules */ std::vector&)>; TORCH_API void enableProfilerWithEventPostProcess( const torch::profiler::impl::ProfilerConfig& config, const std::set& activities, - std::function&)>&& cb, + post_process_t&& cb, const std::unordered_set& scopes = {}); TORCH_API std::unique_ptr disableProfiler(); diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index b0ebc3649cd9..5dad7de1a250 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -122,15 +122,15 @@ struct TORCH_API LegacyEvent { double cpuElapsedUs(const LegacyEvent& e) const { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) - return (e.cpu_ns_ - cpu_ns_)/(1000.0); + return static_cast(e.cpu_ns_ - cpu_ns_)/(1000.0); } void setCpuUs(int64_t cpu_us) { - cpu_ns_ = cpu_us * 1000.0; + cpu_ns_ = static_cast(cpu_us) * 1000.0; } double cpuUs() const { - return cpu_ns_ / (1000.0); + return static_cast(cpu_ns_) / (1000.0); } double cudaElapsedUs(const LegacyEvent& e) const; diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp index 8ca06a3674bb..6c1675f121b0 100644 --- a/torch/csrc/autograd/profiler_python.cpp +++ b/torch/csrc/autograd/profiler_python.cpp @@ -31,69 +31,69 @@ namespace { // It is passed as the second argument when enabling tracing via // `PyEval_SetProfile`. struct TraceContext { - PyObject_HEAD - - // It is wasteful to store an entire PyThreadState* in RawEvent. So - // instead, we map thread ids down to a compact space that we can store in - // a single byte. - uint8_t thread_id_; - PyThreadState* thread_state_; - - // Likewise, int64_t is more precision than we need. By tracking when the - // profiler starts we can store "time since profile begin" which can fit - // into less space. - int64_t initial_us_; - - // TODO: - // Wall time is actually fairly expensive to compute. Empirically, it - // takes ~600 ns to call `now()`. This puts a hard lower bound on the - // overhead of the tracer. If we collected wall time less frequently, and - // used TSC (e.g. through __rdtsc) to interpolate it should be possible - // to reduce time spent on timestamps while retaining the same level of - // accuracy. + PyObject_HEAD + + // It is wasteful to store an entire PyThreadState* in RawEvent. So + // instead, we map thread ids down to a compact space that we can store in + // a single byte. + uint8_t thread_id_; + PyThreadState* thread_state_; + + // Likewise, int64_t is more precision than we need. By tracking when the + // profiler starts we can store "time since profile begin" which can fit + // into less space. + int64_t initial_us_; + + // TODO: + // Wall time is actually fairly expensive to compute. Empirically, it + // takes ~600 ns to call `now()`. This puts a hard lower bound on the + // overhead of the tracer. If we collected wall time less frequently, and + // used TSC (e.g. through __rdtsc) to interpolate it should be possible + // to reduce time spent on timestamps while retaining the same level of + // accuracy. }; // CPython boilerplate to define `TraceContext` as a proper python object. 
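The TraceContext comments above describe storing per-event times as small offsets from `initial_us_` rather than full 64-bit timestamps. A self-contained sketch of that compaction, using illustrative names only (a 32-bit microsecond offset covers roughly 71 minutes of tracing), not part of the patch:

    #include <chrono>
    #include <cstdint>
    #include <iostream>
    #include <limits>

    int main() {
      using namespace std::chrono;
      // Stand-in for TraceContext::initial_us_: the profile start time in microseconds.
      const std::int64_t initial_us =
          duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();

      // Encode: an event only stores the 32-bit offset from the profile start.
      const std::int64_t event_us = initial_us + 1234;  // pretend the event fired 1234 us later
      const auto offset = static_cast<std::uint32_t>(event_us - initial_us);

      // Decode: the full timestamp is recovered during post processing.
      const std::int64_t recovered_us = initial_us + offset;

      // A 32-bit microsecond offset covers roughly 71.6 minutes of tracing.
      const double max_minutes =
          std::numeric_limits<std::uint32_t>::max() / 1e6 / 60.0;

      std::cout << "offset=" << offset << " recovered=" << recovered_us
                << " max_session_minutes=" << max_minutes << "\n";
    }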
static PyTypeObject TraceContextType = { - PyVarObject_HEAD_INIT(nullptr, 0) - "TraceContext", /* tp_name */ - sizeof(TraceContext), /* tp_basicsize */ - 0, /* tp_itemsize */ - nullptr, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ // NOLINT: modernize-use-nullptr - nullptr, /* tp_getattr */ - nullptr, /* tp_setattr */ - nullptr, /* tp_reserved */ - nullptr, /* tp_repr */ - nullptr, /* tp_as_number */ - nullptr, /* tp_as_sequence */ - nullptr, /* tp_as_mapping */ - nullptr, /* tp_hash */ - nullptr, /* tp_call */ - nullptr, /* tp_str */ - nullptr, /* tp_getattro */ - nullptr, /* tp_setattro */ - nullptr, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - "Python tracer TLS", /* tp_doc */ - nullptr, /* tp_traverse */ - nullptr, /* tp_clear */ - nullptr, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - nullptr, /* tp_iter */ - nullptr, /* tp_iternext */ - nullptr, /* tp_methods */ - nullptr, /* tp_members */ - nullptr, /* tp_getset */ - nullptr, /* tp_base */ - nullptr, /* tp_dict */ - nullptr, /* tp_descr_get */ - nullptr, /* tp_descr_set */ - 0, /* tp_dictoffset */ - nullptr, /* tp_init */ - nullptr, /* tp_alloc */ - PyType_GenericNew, /* tp_new */ - nullptr /* tp_free */ + PyVarObject_HEAD_INIT(nullptr, 0) + "TraceContext", /* tp_name */ + sizeof(TraceContext), /* tp_basicsize */ + 0, /* tp_itemsize */ + nullptr, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ // NOLINT: modernize-use-nullptr + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Python tracer TLS", /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + nullptr, /* tp_methods */ + nullptr, /* tp_members */ + nullptr, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + PyType_GenericNew, /* tp_new */ + nullptr /* tp_free */ }; // CPython has a more expressive set of events for tracing / profiling: @@ -105,12 +105,7 @@ static PyTypeObject TraceContextType = { // our replay stack), and we are not interested in `PyTrace_LINE` or // `PyTrace_OPCODE`. To simplify things we store our own enum when tracefunc is // called, and then use for all subsequent processing. -enum TraceTag { - kPy_Call = 0, - kPy_Return, - kC_Call, - kC_Return -}; +enum TraceTag { kPy_Call = 0, kPy_Return, kC_Call, kC_Return }; // When we are tracing a Python program, the general procedure is to record // every time we enter or exit a function and later replay these events during @@ -156,78 +151,76 @@ enum TraceTag { // `RawEvent` would grow to three words. (Not just 50% bigger, but also less // cache friendly.) 
struct RawEvent { - RawEvent(TraceTag tag, int lasti, TraceContext* ctx) - : tag_(static_cast(tag)), - thread_id_(ctx->thread_id_), - lasti_(static_cast(lasti)), - misc_() { - int64_t t = now() - ctx->initial_us_; - t_ = static_cast(t); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(lasti <= std::numeric_limits::max()); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t <= std::numeric_limits::max()); - } - - RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyCodeObject* f_code) - : RawEvent(tag, lasti, ctx) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kPy_Call); - misc_.f_code_ = f_code; - } - - RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyObject* arg) - : RawEvent(tag, lasti, ctx) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kC_Call); - misc_.arg_ = arg; - } - - uint8_t tag_; - uint8_t thread_id_; - uint16_t lasti_; - uint32_t t_; - union { - // TraceTag::kPy_Call - PyCodeObject* f_code_; - - // TraceTag::kC_Call - PyObject* arg_; - - // TraceTag::kPy_Return - // TraceTag::kC_Return - // ** Unused (placeholder) ** - void* null_; - } misc_; - - C10_NODISCARD TraceTag tag() const { - return static_cast(tag_); - } - - C10_NODISCARD int lasti() const { - // f_lasti is positive, with one exception: CPython intializes frames - // with `f_lasti = -1`. We don't want to give up half of the range by - // switching to int16_t. So instead we do the fast (underflowing) cast - // in the ctor, and rectify the value in this accessor which should - // only be called during trace post processing. - return lasti_ == std::numeric_limits::max() - ? (int)(-1) - : (int)lasti_; - } + RawEvent(TraceTag tag, int lasti, TraceContext* ctx) + : tag_(static_cast(tag)), + thread_id_(ctx->thread_id_), + lasti_(static_cast(lasti)), + misc_() { + int64_t t = now() - ctx->initial_us_; + t_ = static_cast(t); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + lasti <= std::numeric_limits::max()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t <= std::numeric_limits::max()); + } + + RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyCodeObject* f_code) + : RawEvent(tag, lasti, ctx) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kPy_Call); + misc_.f_code_ = f_code; + } + + RawEvent(TraceTag tag, int lasti, TraceContext* ctx, PyObject* arg) + : RawEvent(tag, lasti, ctx) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tag == TraceTag::kC_Call); + misc_.arg_ = arg; + } + + uint8_t tag_; + uint8_t thread_id_; + uint16_t lasti_; + uint32_t t_; + union { + // TraceTag::kPy_Call + PyCodeObject* f_code_; + + // TraceTag::kC_Call + PyObject* arg_; + + // TraceTag::kPy_Return + // TraceTag::kC_Return + // ** Unused (placeholder) ** + void* null_; + } misc_; + + C10_NODISCARD TraceTag tag() const { + return static_cast(tag_); + } + + C10_NODISCARD int lasti() const { + // f_lasti is positive, with one exception: CPython intializes frames + // with `f_lasti = -1`. We don't want to give up half of the range by + // switching to int16_t. So instead we do the fast (underflowing) cast + // in the ctor, and rectify the value in this accessor which should + // only be called during trace post processing. + return lasti_ == std::numeric_limits::max() ? (int)(-1) + : (int)lasti_; + } }; // Make sure the bit packing that we do in RawEvent actually results in the // desired size reduction. static_assert(sizeof(RawEvent) <= 16, "RawEvent is too large"); - // std::hash doesn't have a specialization for pairs so we have to define one. // A simple XOR is good enough for our purposes. 
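A standalone illustration of the `f_lasti` narrowing used by `RawEvent::lasti()` above, with illustrative names; it only demonstrates the wrap-and-rectify round trip, not the surrounding event machinery:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Mirrors the rectify step in RawEvent::lasti(): -1 wraps to 0xFFFF when
    // narrowed to uint16_t, and the accessor maps that sentinel back to -1.
    int rectify(std::uint16_t stored) {
      return stored == std::numeric_limits<std::uint16_t>::max()
          ? -1
          : static_cast<int>(stored);
    }

    int main() {
      const int f_lasti = -1;  // CPython initializes frames with f_lasti = -1
      const auto stored = static_cast<std::uint16_t>(f_lasti);  // wraps to 65535
      std::cout << stored << " -> " << rectify(stored) << "\n";                      // 65535 -> -1
      std::cout << 42 << " -> " << rectify(static_cast<std::uint16_t>(42)) << "\n";  // 42 -> 42
    }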
struct hash_pair { - template - size_t operator() (const std::pair& pair) const { - return std::hash()(pair.first) ^ std::hash()(pair.second); - } + template + size_t operator()(const std::pair& pair) const { + return std::hash()(pair.first) ^ std::hash()(pair.second); + } }; - // ============================================================================ // == Tracing implementation ================================================== // ============================================================================ @@ -235,211 +228,221 @@ constexpr size_t max_py_threads = std::numeric_limits::max() + 1; class PythonTracer final { public: - // Static methods serve as external interfaces (which expect raw pointers) - // and handle forwarding to the singleton. - static void call(Command c); + // Static methods serve as external interfaces (which expect raw pointers) + // and handle forwarding to the singleton. + static void call(Command c); - static int pyProfileFn( - PyObject* obj, - PyFrameObject* frame, - int what, - PyObject* arg); + static int pyProfileFn( + PyObject* obj, + PyFrameObject* frame, + int what, + PyObject* arg); private: - PythonTracer(); - static PythonTracer& singleton(); - friend class PyTraceReplay; - - void start(size_t max_threads = max_py_threads); - void stop(); - void clear(); - - void recordPyCall(TraceContext* ctx, PyFrameObject* frame); - void recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg); - void recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag); - - void storeDescription(PyFrameObject* frame); - void trackModule(PyFrameObject* frame); - - // It is imperitive that we do not store strings for each python function, - // as that would do terrible things to our profiling overhead. So instead - // we store the much cheaper pair of `PyCodeObject*` and `int` which we can - // pack into `RawEvent`, and then store a mapping to the full strings the - // first time we see a function. - // - // TODO: - // In theory we should be able to use a combination of Py_INCREF on - // `f_code` and string interning to skip this step. (Effectively reusing - // work that the CPython interpreter has already done.) However it tends - // to segfault and simply caching the strings is inexpensive. - struct CodeDescription { - CodeDescription(int line_no, std::string filename, std::string funcname) - : line_no_(line_no), - filename_(std::move(filename)), - funcname_(std::move(funcname)) {} - int line_no_; - std::string filename_; - std::string funcname_; - }; - - struct ModuleForward { - ModuleForward(size_t event_index, PyObject* self) - : event_index_(event_index), self_(self) {} - size_t event_index_; - - // NB: - // This is a non-owning reference to keep `ModuleForward` POD; - // `PythonTracer` owns the contents instead. We Py_INCREF in - // `trackModule`, and `reset` is responsible for calling Py_DECREF - // when clearing `module_calls_`. 
- PyObject* self_; - }; - - bool active_; - PyObject* module_call_code_; - std::vector path_prefixes_; - std::vector trace_contexts_; - - std::vector events_; - std::vector module_calls_; - - using DescriptionKey = std::pair; - ska::flat_hash_map code_descriptions_; - ska::flat_hash_map c_function_reprs_; + PythonTracer(); + static PythonTracer& singleton(); + friend class PyTraceReplay; + + void start(size_t max_threads = max_py_threads); + void stop(); + void clear(); + + void recordPyCall(TraceContext* ctx, PyFrameObject* frame); + void recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg); + void recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag); + + void storeDescription(PyFrameObject* frame); + void trackModule(PyFrameObject* frame); + + // It is imperitive that we do not store strings for each python function, + // as that would do terrible things to our profiling overhead. So instead + // we store the much cheaper pair of `PyCodeObject*` and `int` which we can + // pack into `RawEvent`, and then store a mapping to the full strings the + // first time we see a function. + // + // TODO: + // In theory we should be able to use a combination of Py_INCREF on + // `f_code` and string interning to skip this step. (Effectively reusing + // work that the CPython interpreter has already done.) However it tends + // to segfault and simply caching the strings is inexpensive. + struct CodeDescription { + CodeDescription(int line_no, std::string filename, std::string funcname) + : line_no_(line_no), + filename_(std::move(filename)), + funcname_(std::move(funcname)) {} + int line_no_; + std::string filename_; + std::string funcname_; + }; + + struct ModuleForward { + ModuleForward(size_t event_index, PyObject* self) + : event_index_(event_index), self_(self) {} + size_t event_index_; + + // NB: + // This is a non-owning reference to keep `ModuleForward` POD; + // `PythonTracer` owns the contents instead. We Py_INCREF in + // `trackModule`, and `reset` is responsible for calling Py_DECREF + // when clearing `module_calls_`. 
+ PyObject* self_; + }; + + bool active_; + PyObject* module_call_code_; + std::vector path_prefixes_; + std::vector trace_contexts_; + + std::vector events_; + std::vector module_calls_; + + using DescriptionKey = std::pair; + ska::flat_hash_map + code_descriptions_; + ska::flat_hash_map c_function_reprs_; }; PythonTracer& PythonTracer::singleton() { - static PythonTracer singleton_; - return singleton_; + static PythonTracer singleton_; + return singleton_; } PythonTracer::PythonTracer() : active_(false) { - path_prefixes_ = py::module::import("torch.profiler.python_tracer") - .attr("_prefix_regex")().cast>(); - - module_call_code_ = py::module::import("torch.nn") - .attr("Module") - .attr("__call__") - .attr("__code__") - .ptr(); + path_prefixes_ = py::module::import("torch.profiler.python_tracer") + .attr("_prefix_regex")().cast>(); + + module_call_code_ = py::module::import("torch.nn") + .attr("Module") + .attr("__call__") + .attr("__code__") + .ptr(); } void PythonTracer::start(size_t max_threads) { - TORCH_CHECK(!active_, "PythonTracer is already active") - TORCH_CHECK(!trace_contexts_.size(), "PythonTracer should not have active contexts"); - TORCH_CHECK(max_threads > 0, "max_threads must be positive, got ", max_threads); - TORCH_CHECK( - max_threads <= max_py_threads, - "max_threads must be less than or equal to ", max_py_threads); - - pybind11::gil_scoped_acquire gil; - auto t0 = now(); - - // Loop over all threads within the current interpreter. We will need to - // register a trace function with each thread. We set the current thread to - // position zero to ensure that it is traced, and so we can restore the - // thread state after registration. - std::vector thread_states { PyThreadState_Get() }; - if (max_threads > 1) { - auto thread_state = thread_states[0]; - while (thread_state != nullptr) { - if (thread_state != thread_states[0]) { - thread_states.push_back(thread_state); - } - thread_state = PyThreadState_Next(thread_state); - } - - if (thread_states.size() > max_threads) { - std::cout << "Warning: can only trace " << max_threads << " threads. " - << thread_states.size() << " are currently active." << std::endl; - thread_states.resize(max_threads); - } + TORCH_CHECK(!active_, "PythonTracer is already active") + TORCH_CHECK( + !trace_contexts_.size(), "PythonTracer should not have active contexts"); + TORCH_CHECK( + max_threads > 0, "max_threads must be positive, got ", max_threads); + TORCH_CHECK( + max_threads <= max_py_threads, + "max_threads must be less than or equal to ", + max_py_threads); + + pybind11::gil_scoped_acquire gil; + auto t0 = now(); + + // Loop over all threads within the current interpreter. We will need to + // register a trace function with each thread. We set the current thread to + // position zero to ensure that it is traced, and so we can restore the + // thread state after registration. + std::vector thread_states{PyThreadState_Get()}; + if (max_threads > 1) { + auto thread_state = thread_states[0]; + while (thread_state != nullptr) { + if (thread_state != thread_states[0]) { + thread_states.push_back(thread_state); + } + thread_state = PyThreadState_Next(thread_state); } - // Register the tracer in each thread. 
- for (const auto i : c10::irange(thread_states.size())) { - PyThreadState* thread_state = thread_states[i]; - PyThreadState_Swap(thread_state); - - auto ctx = (TraceContext*) TraceContextType.tp_alloc(&TraceContextType, 0); - ctx->thread_id_ = (uint8_t)i; - ctx->thread_state_ = thread_state; - ctx->initial_us_ = t0; - trace_contexts_.push_back(ctx); - - // When we begin profiling there are already frames on the Python - // interpreter stack. To ensure a complete trace, we must push calls - // to all the prior frames onto our event stack. (We stop at depth=128) - std::vector current_stack; - auto frame = PyEval_GetFrame(); - size_t depth = 0; // Make sure we can't infinite loop. - while (frame != nullptr && depth <= 128) { - current_stack.push_back(frame); - frame = frame->f_back; - depth++; - } - for (auto it = current_stack.rbegin(); it != current_stack.rend(); it++) { - recordPyCall(ctx, *it); - } - - // Note: - // This profile will not compose with other CPython profilers, and - // cannot be round tripped via `sys.settrace(sys.gettrace())` - PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx); + if (thread_states.size() > max_threads) { + std::cout << "Warning: can only trace " << max_threads << " threads. " + << thread_states.size() << " are currently active." + << std::endl; + thread_states.resize(max_threads); + } + } + + // Register the tracer in each thread. + for (const auto i : c10::irange(thread_states.size())) { + PyThreadState* thread_state = thread_states[i]; + PyThreadState_Swap(thread_state); + + auto ctx = (TraceContext*)TraceContextType.tp_alloc(&TraceContextType, 0); + ctx->thread_id_ = (uint8_t)i; + ctx->thread_state_ = thread_state; + ctx->initial_us_ = t0; + trace_contexts_.push_back(ctx); + + // When we begin profiling there are already frames on the Python + // interpreter stack. To ensure a complete trace, we must push calls + // to all the prior frames onto our event stack. (We stop at depth=128) + std::vector current_stack; + auto frame = PyEval_GetFrame(); + size_t depth = 0; // Make sure we can't infinite loop. + while (frame != nullptr && depth <= 128) { + current_stack.push_back(frame); + frame = frame->f_back; + depth++; + } + for (auto it = current_stack.rbegin(); it != current_stack.rend(); it++) { + recordPyCall(ctx, *it); } - // Restore the thread state to its initial value. - PyThreadState_Swap(thread_states[0]); + // Note: + // This profile will not compose with other CPython profilers, and + // cannot be round tripped via `sys.settrace(sys.gettrace())` + PyEval_SetProfile(PythonTracer::pyProfileFn, (PyObject*)ctx); + } - active_ = true; + // Restore the thread state to its initial value. 
+ PyThreadState_Swap(thread_states[0]); + + active_ = true; }; void PythonTracer::stop() { - TORCH_INTERNAL_ASSERT(active_, "PythonTracer is not running.") + TORCH_INTERNAL_ASSERT(active_, "PythonTracer is not running.") - pybind11::gil_scoped_acquire gil; + pybind11::gil_scoped_acquire gil; - PyThreadState* initial_thread_state = PyThreadState_Get(); - for (const auto i : trace_contexts_) { - PyThreadState_Swap(i->thread_state_); - PyEval_SetProfile(nullptr, nullptr); - } - PyThreadState_Swap(initial_thread_state); - active_ = false; + PyThreadState* initial_thread_state = PyThreadState_Get(); + for (const auto i : trace_contexts_) { + PyThreadState_Swap(i->thread_state_); + PyEval_SetProfile(nullptr, nullptr); + } + PyThreadState_Swap(initial_thread_state); + active_ = false; } - void PythonTracer::clear() { - TORCH_CHECK(!active_, "Cannot clear state while PythonTracer is active."); - for (auto i : trace_contexts_) { - Py_DECREF((PyObject*) i); - } - trace_contexts_.clear(); - events_.clear(); - code_descriptions_.clear(); - c_function_reprs_.clear(); - for (auto& i : module_calls_) { - Py_DECREF(i.self_); - } - module_calls_.clear(); + TORCH_CHECK(!active_, "Cannot clear state while PythonTracer is active."); + for (auto i : trace_contexts_) { + Py_DECREF((PyObject*)i); + } + trace_contexts_.clear(); + events_.clear(); + code_descriptions_.clear(); + c_function_reprs_.clear(); + for (auto& i : module_calls_) { + Py_DECREF(i.self_); + } + module_calls_.clear(); } void PythonTracer::recordPyCall(TraceContext* ctx, PyFrameObject* frame) { - events_.emplace_back(TraceTag::kPy_Call, frame->f_lasti, ctx, frame->f_code); - storeDescription(frame); - trackModule(frame); + events_.emplace_back(TraceTag::kPy_Call, frame->f_lasti, ctx, frame->f_code); + storeDescription(frame); + trackModule(frame); } -void PythonTracer::recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject* arg) { - events_.emplace_back(TraceTag::kC_Call, frame->f_lasti, ctx, arg); - const auto& it = c_function_reprs_.find(arg); - if C10_UNLIKELY(it == c_function_reprs_.end()) { - c_function_reprs_[arg] = py::repr(arg); - } +void PythonTracer::recordCCall( + TraceContext* ctx, + PyFrameObject* frame, + PyObject* arg) { + events_.emplace_back(TraceTag::kC_Call, frame->f_lasti, ctx, arg); + const auto& it = c_function_reprs_.find(arg); + if C10_UNLIKELY (it == c_function_reprs_.end()) { + c_function_reprs_[arg] = py::repr(arg); + } } -void PythonTracer::recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag) { - events_.emplace_back(tag, frame->f_lasti, ctx); +void PythonTracer::recordReturn( + TraceContext* ctx, + PyFrameObject* frame, + TraceTag tag) { + events_.emplace_back(tag, frame->f_lasti, ctx); } // NB: @@ -448,272 +451,265 @@ void PythonTracer::recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTa // call rather than the return. (Otherwise we would get the line with the // return stmt.) 
void PythonTracer::storeDescription(PyFrameObject* frame) { - const auto& it = code_descriptions_.find({ frame->f_code, frame->f_lasti }); - if C10_UNLIKELY(it == code_descriptions_.end()) { - code_descriptions_.insert({ - { frame->f_code, frame->f_lasti }, - { - /*line_no=*/ PyCode_Addr2Line(frame->f_code, frame->f_lasti), - /*filename=*/ THPUtils_unpackString(frame->f_code->co_filename), - /*funcname=*/ THPUtils_unpackString(frame->f_code->co_name) - } - }); - } + const auto& it = code_descriptions_.find({frame->f_code, frame->f_lasti}); + if C10_UNLIKELY (it == code_descriptions_.end()) { + code_descriptions_.insert( + {{frame->f_code, frame->f_lasti}, + {/*line_no=*/PyCode_Addr2Line(frame->f_code, frame->f_lasti), + /*filename=*/THPUtils_unpackString(frame->f_code->co_filename), + /*funcname=*/THPUtils_unpackString(frame->f_code->co_name)}}); + } } void PythonTracer::trackModule(PyFrameObject* frame) { - if ((PyObject*)(frame->f_code) == module_call_code_) { - // By default, CPython stores locals in a "fast" format, with an array - // of names and an array of values. Consequently, frame->f_locals is - // NULL since the interpreter has no need to populate it. - // - // If these arrays were part of the public API then we could very - // quickly access `self`. Unfortunately they are not, and moreover are - // not stable across versions. As a result, we are forced to call - // `PyFrame_FastToLocals` which forces the interpreter to materialize - // the full dict of locals. - PyFrame_FastToLocals(frame); - auto self = PyDict_GetItemString(frame->f_locals, "self"); - Py_INCREF(self); - module_calls_.emplace_back( - /*event_index=*/events_.size() - 1, - /*self=*/self - ); - PyFrame_LocalsToFast(frame, 0); - } + if ((PyObject*)(frame->f_code) == module_call_code_) { + // By default, CPython stores locals in a "fast" format, with an array + // of names and an array of values. Consequently, frame->f_locals is + // NULL since the interpreter has no need to populate it. + // + // If these arrays were part of the public API then we could very + // quickly access `self`. Unfortunately they are not, and moreover are + // not stable across versions. As a result, we are forced to call + // `PyFrame_FastToLocals` which forces the interpreter to materialize + // the full dict of locals. 
+ PyFrame_FastToLocals(frame); + auto self = PyDict_GetItemString(frame->f_locals, "self"); + Py_INCREF(self); + module_calls_.emplace_back( + /*event_index=*/events_.size() - 1, + /*self=*/self); + PyFrame_LocalsToFast(frame, 0); + } }; - // ============================================================================ // == Post processing ========================================================= // ============================================================================ class PyTraceReplay { public: - static std::vector> getEvents() { - return PyTraceReplay().replayStack(); - } + static std::vector> getEvents() { + return PyTraceReplay().replayStack(); + } private: - PyTraceReplay(); - std::vector> replayStack() const; + PyTraceReplay(); + std::vector> replayStack() const; - struct ReplayFrame { - std::unique_ptr event_; - size_t id_; - size_t parent_id_; - }; + struct ReplayFrame { + std::unique_ptr event_; + size_t id_; + size_t parent_id_; + }; - ska::flat_hash_map module_self_map_; - ska::flat_hash_map module_name_map_; + ska::flat_hash_map module_self_map_; + ska::flat_hash_map module_name_map_; }; PyTraceReplay::PyTraceReplay() { - ska::flat_hash_map module_names; - for (const auto& call : PythonTracer::singleton().module_calls_) { - if (module_names.find(call.self_) == module_names.end()) { - std::stringstream name_stream; - auto py_class_name = py::handle(call.self_) - .attr("__class__") - .attr("__name__"); - name_stream << "nn.Module: " << py::str(py_class_name); - module_names.insert({ call.self_, name_stream.str() }); - } - - module_self_map_.insert({ call.event_index_, call.self_ }); - module_name_map_.insert({ call.event_index_, module_names.at(call.self_) }); + ska::flat_hash_map module_names; + for (const auto& call : PythonTracer::singleton().module_calls_) { + if (module_names.find(call.self_) == module_names.end()) { + std::stringstream name_stream; + auto py_class_name = + py::handle(call.self_).attr("__class__").attr("__name__"); + name_stream << "nn.Module: " << py::str(py_class_name); + module_names.insert({call.self_, name_stream.str()}); } -} + module_self_map_.insert({call.event_index_, call.self_}); + module_name_map_.insert({call.event_index_, module_names.at(call.self_)}); + } +} // TODO: Use re2. void trimPrefix(std::string& s, const std::vector& prefixes) { - for (const auto& p : prefixes) { - if (s.compare(0, p.size(), p) == 0) { - s.erase(0, p.size()); - return; - } + for (const auto& p : prefixes) { + if (s.compare(0, p.size(), p) == 0) { + s.erase(0, p.size()); + return; } + } } - std::vector> PyTraceReplay::replayStack() const { - const auto& tracer = PythonTracer::singleton(); - - // We want to prune paths to a sensible prefix. For example - // `/foo/bar/baz/site-packages/torch/__init__.py` -> `torch/__init__.py` - // Pruning the path prefix is somewhat expensive, so we cache it. - ska::flat_hash_map filename_map; - for (const auto& i : tracer.code_descriptions_) { - if (filename_map.find(i.second.filename_) == filename_map.end()) { - std::string s(i.second.filename_); - trimPrefix(s, tracer.path_prefixes_); - filename_map[i.second.filename_] = s; - } + const auto& tracer = PythonTracer::singleton(); + + // We want to prune paths to a sensible prefix. For example + // `/foo/bar/baz/site-packages/torch/__init__.py` -> `torch/__init__.py` + // Pruning the path prefix is somewhat expensive, so we cache it. 
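One possible shape for the `// TODO: Use re2.` above, sketched with `std::regex` as a stand-in since re2 itself is not pulled in here; it assumes the configured prefixes are valid regex fragments and keeps the strip-first-match behaviour of `trimPrefix`:

    #include <cstddef>
    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    // Joins the prefixes into one anchored alternation and strips the first match.
    std::string trimPrefixRe(const std::string& s, const std::regex& prefix_re) {
      return std::regex_replace(
          s, prefix_re, "", std::regex_constants::format_first_only);
    }

    int main() {
      const std::vector<std::string> prefixes = {
          "/foo/bar/baz/site-packages/", "/usr/lib/python3/dist-packages/"};

      std::string pattern = "^(?:";
      for (std::size_t i = 0; i < prefixes.size(); ++i) {
        pattern += (i ? "|" : "") + prefixes[i];
      }
      pattern += ")";
      const std::regex prefix_re(pattern);

      std::cout << trimPrefixRe("/foo/bar/baz/site-packages/torch/__init__.py", prefix_re)
                << "\n";  // torch/__init__.py
    }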
+ ska::flat_hash_map filename_map; + for (const auto& i : tracer.code_descriptions_) { + if (filename_map.find(i.second.filename_) == filename_map.end()) { + std::string s(i.second.filename_); + trimPrefix(s, tracer.path_prefixes_); + filename_map[i.second.filename_] = s; } - - auto py_name = [&](const RawEvent& e) { - const auto& desc_it = tracer.code_descriptions_.find({e.misc_.f_code_, e.lasti()}); - if (desc_it != tracer.code_descriptions_.end()) { - std::stringstream name_stream; - name_stream << filename_map.at(desc_it->second.filename_) << "(" - << desc_it->second.line_no_ << "): " << desc_it->second.funcname_; - return name_stream.str(); - } - return std::string("Python: ???"); - }; - - size_t id_counter = 0; - std::vector> stacks(tracer.trace_contexts_.size()); - std::vector results; - - // Match calls and returns. - size_t event_idx = 0; - for (auto& raw_event : tracer.events_) { - auto& stack = stacks[raw_event.thread_id_]; - auto ctx = tracer.trace_contexts_[raw_event.thread_id_]; - auto t = static_cast(raw_event.t_) + ctx->initial_us_; - - auto push_frame = [&](std::string name, CallType call_type, size_t module_id = 0) { - stack.push_back(ReplayFrame { - /*event_=*/ std::make_unique(PyTraceEvent{ - /*startTime_=*/ t, - /*endTime_=*/ -1, // Placeholder - /*name_=*/ name, - /*thread_id_=*/ raw_event.thread_id_, - /*parent_=*/ nullptr, // Placeholder - /*call_type_=*/ call_type, - /*module_id_=*/ module_id, - /*call_idx_=*/ event_idx, - /*return_idx_=*/ 0 // Placeholder - }), - /*id_=*/ id_counter++, - /*parent_id_=*/ stack.size() ? stack.back().id_ : 0, - }); - }; - - switch (raw_event.tag()) { - case TraceTag::kPy_Call: - if (module_name_map_.find(event_idx) != module_name_map_.end()) { - push_frame( - module_name_map_.at(event_idx), - CallType::kPyModuleCall, - reinterpret_cast(module_self_map_.at(event_idx))); - } else { - push_frame(py_name(raw_event), CallType::kPyCall); - } - break; - - case TraceTag::kC_Call: - push_frame(tracer.c_function_reprs_.at(raw_event.misc_.arg_), CallType::kCCall); - break; - - case TraceTag::kPy_Return: - case TraceTag::kC_Return: - TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty.") - stack.back().event_->endTime_ = t; - stack.back().event_->return_idx_ = event_idx; - results.push_back(std::move(stack.back())); - stack.pop_back(); - break; - } - event_idx++; + } + + auto py_name = [&](const RawEvent& e) { + const auto& desc_it = + tracer.code_descriptions_.find({e.misc_.f_code_, e.lasti()}); + if (desc_it != tracer.code_descriptions_.end()) { + std::stringstream name_stream; + name_stream << filename_map.at(desc_it->second.filename_) << "(" + << desc_it->second.line_no_ + << "): " << desc_it->second.funcname_; + return name_stream.str(); } + return std::string("Python: ???"); + }; + + size_t id_counter = 0; + std::vector> stacks(tracer.trace_contexts_.size()); + std::vector results; + + // Match calls and returns. 
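Before the matching loop below, a minimal, self-contained sketch of the same call/return replay idea, reduced to one thread and synthetic events (names here are illustrative and not part of the tracer):

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Calls push a frame; returns pop it and keep the parent as an id so pointers
    // can be resolved after every frame has been materialized. Ids start at 1 so
    // 0 can serve purely as the "no parent" sentinel.
    enum class Tag { kCall, kReturn };

    struct Frame {
      std::string name;
      std::size_t id;
      std::size_t parent_id;
    };

    int main() {
      const std::vector<std::pair<Tag, std::string>> events = {
          {Tag::kCall, "outer"}, {Tag::kCall, "inner"},
          {Tag::kReturn, ""},    {Tag::kReturn, ""}};

      std::size_t id_counter = 0;
      std::vector<Frame> stack;
      std::vector<Frame> results;

      for (const auto& e : events) {
        if (e.first == Tag::kCall) {
          stack.push_back(
              Frame{e.second, ++id_counter, stack.empty() ? 0 : stack.back().id});
        } else {
          results.push_back(stack.back());
          stack.pop_back();
        }
      }

      for (const auto& f : results) {
        std::cout << f.name << ": id=" << f.id << " parent=" << f.parent_id << "\n";
      }
      // Prints "inner: id=2 parent=1" then "outer: id=1 parent=0".
    }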
+ size_t event_idx = 0; + for (auto& raw_event : tracer.events_) { + auto& stack = stacks[raw_event.thread_id_]; + auto ctx = tracer.trace_contexts_[raw_event.thread_id_]; + auto t = static_cast(raw_event.t_) + ctx->initial_us_; + + auto push_frame = + [&](std::string name, CallType call_type, size_t module_id = 0) { + stack.push_back(ReplayFrame{ + /*event_=*/std::make_unique(PyTraceEvent{ + /*startTime_=*/t, + /*endTime_=*/-1, // Placeholder + /*name_=*/name, + /*thread_id_=*/raw_event.thread_id_, + /*parent_=*/nullptr, // Placeholder + /*call_type_=*/call_type, + /*module_id_=*/module_id, + /*call_idx_=*/event_idx, + /*return_idx_=*/0 // Placeholder + }), + /*id_=*/id_counter++, + /*parent_id_=*/stack.size() ? stack.back().id_ : 0, + }); + }; - // Cleanup by feining return to close out the stack. This is needed so - // frames above the one that called the profiler still appear in the trace. - const auto t_final = now(); - for (auto& stack : stacks) { - while (stack.size()) { - stack.back().event_->endTime_ = t_final; - stack.back().event_->return_idx_ = event_idx; - results.push_back(std::move(stack.back())); - stack.pop_back(); - event_idx++; + switch (raw_event.tag()) { + case TraceTag::kPy_Call: + if (module_name_map_.find(event_idx) != module_name_map_.end()) { + push_frame( + module_name_map_.at(event_idx), + CallType::kPyModuleCall, + reinterpret_cast(module_self_map_.at(event_idx))); + } else { + push_frame(py_name(raw_event), CallType::kPyCall); } + break; + + case TraceTag::kC_Call: + push_frame( + tracer.c_function_reprs_.at(raw_event.misc_.arg_), + CallType::kCCall); + break; + + case TraceTag::kPy_Return: + case TraceTag::kC_Return: + TORCH_INTERNAL_ASSERT(stack.size(), "Python replay stack is empty.") + stack.back().event_->endTime_ = t; + stack.back().event_->return_idx_ = event_idx; + results.push_back(std::move(stack.back())); + stack.pop_back(); + break; } - - // Convert to `PyTraceEvent`, and map id to pointer. - ska::flat_hash_map event_id_map {{0, nullptr}}; - std::vector> out; - for (auto& r : results) { - out.push_back(std::move(r.event_)); - event_id_map.insert({r.id_, out.back().get()}); + event_idx++; + } + + // Cleanup by feining return to close out the stack. This is needed so + // frames above the one that called the profiler still appear in the trace. + const auto t_final = now(); + for (auto& stack : stacks) { + while (stack.size()) { + stack.back().event_->endTime_ = t_final; + stack.back().event_->return_idx_ = event_idx; + results.push_back(std::move(stack.back())); + stack.pop_back(); + event_idx++; } - - // Link parents to children. - for (const auto i : c10::irange(results.size())) { - out[i]->parent_ = event_id_map.at(results[i].parent_id_); - } - return out; + } + + // Convert to `PyTraceEvent`, and map id to pointer. + ska::flat_hash_map event_id_map{{0, nullptr}}; + std::vector> out; + for (auto& r : results) { + out.push_back(std::move(r.event_)); + event_id_map.insert({r.id_, out.back().get()}); + } + + // Link parents to children. 
+ for (const auto i : c10::irange(results.size())) { + out[i]->parent_ = event_id_map.at(results[i].parent_id_); + } + return out; } - // ============================================================================ // == API ===================================================================== // ============================================================================ - int PythonTracer::pyProfileFn( - PyObject* obj, - PyFrameObject* frame, - int what, - PyObject* arg) { - auto ctx = reinterpret_cast(obj); - switch (what) { - case PyTrace_CALL: - PythonTracer::singleton().recordPyCall(ctx, frame); - break; - - case PyTrace_C_CALL: - PythonTracer::singleton().recordCCall(ctx, frame, arg); - break; - - case PyTrace_EXCEPTION: - case PyTrace_RETURN: - PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kPy_Return); - break; - - case PyTrace_C_EXCEPTION: - case PyTrace_C_RETURN: - PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kC_Return); - break; - } - return 0; + PyObject* obj, + PyFrameObject* frame, + int what, + PyObject* arg) { + auto ctx = reinterpret_cast(obj); + switch (what) { + case PyTrace_CALL: + PythonTracer::singleton().recordPyCall(ctx, frame); + break; + + case PyTrace_C_CALL: + PythonTracer::singleton().recordCCall(ctx, frame, arg); + break; + + case PyTrace_EXCEPTION: + case PyTrace_RETURN: + PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kPy_Return); + break; + + case PyTrace_C_EXCEPTION: + case PyTrace_C_RETURN: + PythonTracer::singleton().recordReturn(ctx, frame, TraceTag::kC_Return); + break; + } + return 0; } void PythonTracer::call(Command c) { - switch (c) { - case Command::kStartOne: - PythonTracer::singleton().start(1); - break; - - case Command::kStartAll: - PythonTracer::singleton().start(); - break; - - case Command::kStop: - PythonTracer::singleton().stop(); - break; - - case Command::kClear: - PythonTracer::singleton().clear(); - break; - - default: - break; - } + switch (c) { + case Command::kStartOne: + PythonTracer::singleton().start(1); + break; + + case Command::kStartAll: + PythonTracer::singleton().start(); + break; + + case Command::kStop: + PythonTracer::singleton().stop(); + break; + + case Command::kClear: + PythonTracer::singleton().clear(); + break; + + default: + break; + } }; -} // namespace +} // namespace void init() { - pybind11::gil_scoped_acquire gil; - TORCH_CHECK(PyType_Ready(&TraceContextType) == 0); + pybind11::gil_scoped_acquire gil; + TORCH_CHECK(PyType_Ready(&TraceContextType) == 0); - registerFunctions( - /*call=*/&PythonTracer::call, - /*get_events=*/&PyTraceReplay::getEvents - ); + registerFunctions( + /*call=*/&PythonTracer::call, + /*get_events=*/&PyTraceReplay::getEvents); } - }}}} // namespace torch::autograd::profiler::python_tracer diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 9a6221130ed0..43911fe18b99 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -167,10 +167,16 @@ auto PyNode::is_traceable() -> bool { } auto PyNode::release_variables() -> void { - pybind11::gil_scoped_acquire gil; - auto f = (THPFunction*) obj; - f->saved_variables.clear(); - f->has_freed_buffers = 1; + // This function is called as part of the Node destructor! + // Since this object might be kept alive by C++, it is possible + // that the python interpreter is already dead here. In that case + // we just leak the saved objects. 
+ if (Py_IsInitialized()) { + pybind11::gil_scoped_acquire gil; + auto f = (THPFunction*) obj; + f->saved_variables.clear(); + f->has_freed_buffers = 1; + } } auto PyNode::name() const -> std::string { @@ -564,6 +570,11 @@ static void _trace_post_record( } node->i_(jit::attr::inplace, is_inplace); + if (PyObject* module_name = PyDict_GetItemString(((PyTypeObject*)op_obj)->tp_dict, "__module__")) { + if (auto ptr = PyUnicode_AsUTF8(module_name)) { + node->s_(jit::attr::module, std::string(ptr)); + } + } // Isolate C variable ptrs in a vector int num_outputs = PyTuple_GET_SIZE(output_objects); @@ -671,10 +682,19 @@ PyObject* THPFunction_name(PyObject *self, PyObject* noargs) { PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) { HANDLE_TH_ERRORS + + // save a local copy of seq_id before it gets incremented + int seq_id = at::sequence_number::peek(); + auto info_pair = unpack_input(inputs); + UnpackedInput& unpacked_input = info_pair.first; + InputFlags& input_info = info_pair.second; + + // Call record function after all the inputs have been decoded, but + // before context has been allocated. RECORD_FUNCTION( ((PyTypeObject*)cls)->tp_name, - std::vector(), - at::sequence_number::peek()); + std::vector(unpacked_input.input_vars.begin(), unpacked_input.input_vars.end()), + seq_id); // Temporary hack to improve functorch UX. We'll find a better solution. const auto& functorch_tls = at::functorch::functorchTLSAccessor(); @@ -691,11 +711,6 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) auto cdata = std::shared_ptr(new PyNode(std::move(ctx_obj)), deleteNode); ctx->cdata = cdata; - // Prepare inputs and allocate context (grad fn) - auto info_pair = unpack_input(inputs); - UnpackedInput& unpacked_input = info_pair.first; - InputFlags& input_info = info_pair.second; - // Record input nodes if tracing auto* node = _trace_pre_record(cls, inputs, unpacked_input.input_vars); @@ -705,6 +720,7 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs) ctx->needs_input_grad = input_info.needs_input_grad.release(); ctx->is_variable_input = std::move(input_info.is_variable_input); + // Prepend ctx to input_tuple, in preparation for static method call auto num_args = PyTuple_GET_SIZE(inputs); THPObjectPtr ctx_input_tuple(PyTuple_New(num_args + 1)); diff --git a/torch/csrc/autograd/python_mode.cpp b/torch/csrc/autograd/python_mode.cpp deleted file mode 100644 index cda38bdb7dff..000000000000 --- a/torch/csrc/autograd/python_mode.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include -#include -#include -#include -#include - -namespace torch { namespace autograd { - -void PythonMode::enter(PyObject* type) { - if (at::impl::PythonModeTLS::get_state()) { - TORCH_CHECK( - false, - "python mode has already been set. We do not yet support nested python ", - "mode. Please file us an issue and reset it before setting it again.") - } - // TorchDispatchTypeObject steals a reference, See NOTE [What is TorchDispatchTypeObject?] 
- Py_INCREF(type); - auto state = std::make_shared(type, getPyInterpreter()); - at::impl::PythonModeTLS::set_state(state); -} - -void PythonMode::exit() { - TORCH_INTERNAL_ASSERT(at::impl::PythonModeTLS::get_state(), "exiting Python Mode but it wasn't set!"); - at::impl::PythonModeTLS::reset_state(); -} - -}} diff --git a/torch/csrc/autograd/python_mode.h b/torch/csrc/autograd/python_mode.h deleted file mode 100644 index 03da51c1c49e..000000000000 --- a/torch/csrc/autograd/python_mode.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include -#include - -namespace torch { namespace autograd { - -struct TORCH_API PythonMode { - // Enter python mode, causing all operators to dispatch to the type's __torch_dispatch__. - // `type` is the type of a Tensor subclass that has __torch_dispatch__. - static void enter(PyObject* type); - - // Exit the current python mode. - static void exit(); -}; - -}} diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp index 5af6f3cc640b..e35af63ccf2e 100644 --- a/torch/csrc/autograd/python_torch_functions_manual.cpp +++ b/torch/csrc/autograd/python_torch_functions_manual.cpp @@ -359,8 +359,21 @@ static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "as_tensor(PyObject* data, *, ScalarType dtype=None, Device? device=None)", + }); + + ParsedArgs<3> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch.as_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::as_tensor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::as_tensor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } @@ -394,35 +407,90 @@ static std::vector dispatch_nonzero_numpy(const Tensor & self) { static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* kwargs); -static PyObject * THPVariable_sparse_csr_tensor(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch.sparse_csr_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::sparse_csr_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - -static PyObject * THPVariable__sparse_csr_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - jit::tracer::warn("torch._sparse_csr_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::_sparse_csr_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} +#define THPVARIABLE_SPARSE_COMPRESSED_CTOR(NAME, NARGS, SIGNATURES) \ +static PyObject * THPVariable_ ## NAME(PyObject* self, PyObject* args, PyObject* kwargs) \ +{ \ + HANDLE_TH_ERRORS \ + static PythonArgParser parser SIGNATURES ; \ + ParsedArgs parsed_args; \ + auto r = parser.parse(args, kwargs, parsed_args); \ + if (r.has_torch_function()) { \ + return handle_torch_function(r, nullptr, args, kwargs, THPVariableFunctionsModule, 
"torch"); \ + } \ + jit::tracer::warn("torch." # NAME, jit::tracer::WARN_CONSTRUCTOR); \ + return THPVariable_Wrap(torch::utils::NAME ## _ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), r)); \ + END_HANDLE_TH_ERRORS \ +} + +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_compressed_tensor, 9, + ({"sparse_compressed_tensor(PyObject* compressed_indices, PyObject* plain_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_compressed_tensor(PyObject* compressed_indices, PyObject* plain_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_csr_tensor, 9, + ({"sparse_csr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_csr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_csc_tensor, 9, + ({"sparse_csc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_csc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_bsr_tensor, 9, + ({"sparse_bsr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_bsr_tensor(PyObject* crow_indices, PyObject* col_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(sparse_bsc_tensor, 9, + ({"sparse_bsc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)", + "sparse_bsc_tensor(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)"})) + + +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_compressed_tensor_unsafe, 8, + ({"_sparse_compressed_tensor_unsafe(PyObject* compressed_indices, PyObject* plain_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Layout? layout=None, Device? device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_csr_tensor_unsafe, 7, + ({"_sparse_csr_tensor_unsafe(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_csc_tensor_unsafe, 7, + ({"_sparse_csc_tensor_unsafe(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? 
device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_bsr_tensor_unsafe, 7, + ({"_sparse_bsr_tensor_unsafe(PyObject* crow_indices, PyObject* col_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)"})) +THPVARIABLE_SPARSE_COMPRESSED_CTOR(_sparse_bsc_tensor_unsafe, 7, + ({"_sparse_bsc_tensor_unsafe(PyObject* ccol_indices, PyObject* row_indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)"})) static PyObject * THPVariable_sparse_coo_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "sparse_coo_tensor(PyObject* indices, PyObject* values, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + "sparse_coo_tensor(PyObject* indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + "sparse_coo_tensor(IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + }); + + ParsedArgs<6> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch.sparse_coo_tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::sparse_coo_tensor_ctor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "_sparse_coo_tensor_unsafe(PyObject* indices, PyObject* values, IntArrayRef size, *, ScalarType dtype=None, Device? device=None, bool requires_grad=False)", + }); + + ParsedArgs<6> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch._sparse_coo_tensor_unsafe", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::_sparse_coo_tensor_unsafe_ctor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } @@ -431,8 +499,22 @@ static PyObject * THPVariable__sparse_coo_tensor_unsafe(PyObject* self, PyObject static PyObject * THPVariable_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({ + "tensor(PyObject* data, *, ScalarType dtype=None, Device? device=None, bool pin_memory=False, bool requires_grad=False, DimnameList? 
names=None)", + }); + + constexpr int ctor_num_args = 6; + ParsedArgs parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.has_torch_function()) { + return handle_torch_function( + r, nullptr, args, kwargs, THPVariableFunctionsModule, "torch"); + } jit::tracer::warn("torch.tensor", jit::tracer::WARN_CONSTRUCTOR); - return THPVariable_Wrap(torch::utils::tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs)); + return THPVariable_Wrap(torch::utils::tensor_ctor( + torch::tensors::get_default_dispatch_key(), + torch::tensors::get_default_scalar_type(), + r)); END_HANDLE_TH_ERRORS } @@ -661,7 +743,19 @@ static PyObject * THPVariable__sync(PyObject *self, PyObject* args, PyObject* kw static PyObject * THPVariable__enable_functionalization(PyObject *self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS + static PythonArgParser parser({"_enable_functionalization(*, bool reapply_views=False)"}, /*traceable=*/true); + ParsedArgs<1> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + const auto reapply_views = r.toBool(0); + + if (c10::impl::tls_is_dispatch_key_included(at::DispatchKey::Functionalize)) { + TORCH_INTERNAL_ASSERT(false, "multiple layers of mode-style functionalization nesting is not" + " currently supported, outside of the functionalize() transform"); + } c10::impl::tls_set_dispatch_key_included(at::DispatchKey::Functionalize, true); + if (reapply_views) { + at::functionalization::impl::setFunctionalizationReapplyViewsTLS(true); + } Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -670,6 +764,7 @@ static PyObject * THPVariable__disable_functionalization(PyObject *self, PyObjec { HANDLE_TH_ERRORS c10::impl::tls_set_dispatch_key_included(at::DispatchKey::Functionalize, false); + at::functionalization::impl::setFunctionalizationReapplyViewsTLS(false); Py_RETURN_NONE; END_HANDLE_TH_ERRORS } @@ -701,8 +796,16 @@ static PyMethodDef torch_functions_manual[] = { {"range", castPyCFunctionWithKeywords(THPVariable_range), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"sparse_coo_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_coo_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"_sparse_coo_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_coo_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_compressed_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_compressed_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_compressed_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_compressed_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"sparse_csr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_csc_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_csc_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_bsr_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_bsr_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"sparse_bsc_tensor", castPyCFunctionWithKeywords(THPVariable_sparse_bsc_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"_sparse_csr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_csc_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_csc_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, 
nullptr}, + {"_sparse_bsr_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_bsr_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, + {"_sparse_bsc_tensor_unsafe", castPyCFunctionWithKeywords(THPVariable__sparse_bsc_tensor_unsafe), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"tensor", castPyCFunctionWithKeywords(THPVariable_tensor), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"get_device", castPyCFunctionWithKeywords(THPVariable_get_device), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, {"numel", castPyCFunctionWithKeywords(THPVariable_numel), METH_VARARGS | METH_KEYWORDS | METH_STATIC, nullptr}, diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index eca3fce4a1da..cf3e263bb365 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -1,44 +1,43 @@ -#include - -#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include #include +#include #include #include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include #include #include +#include +#include #include -#include #include -#include #include -#include #include +#include #include -#include -#include -#include -#include -#include - +#include +#include +#include +#include #include #include -#include #include -#include #include #include @@ -53,6 +52,116 @@ using namespace at; using namespace torch; using namespace torch::autograd; +std::pair parseIValuesToPyArgsKwargs(const c10::OperatorHandle& op, const std::vector& arguments) { + TORCH_CHECK(PyGILState_Check(), "GIL must be held before you call parseIValuesToPyArgsKwargs"); + const auto& schema = op.schema(); + py::dict kwargs; + // About all the pointers: + // + // f(int x, int y = 0, *, int z = 0) + // ^- arguments.size() + // ^- kwarg_only_start + // ^- positional_default_start + // ^- 0 + + // Find the split point between kwarg-only and regular. Since most functions + // don't have kwarg-only arguments, it is more efficient to scan from the + // right (but ideally, this would just be precomputed in FunctionSchema + // itself). 
(NB: minus one in the loop is because we're testing if the + // *next* argument is kwarg-only before we advance the starting index) + int64_t kwarg_only_start = arguments.size(); + for (; kwarg_only_start > 0; kwarg_only_start--) { + const auto& arg = schema.arguments()[kwarg_only_start - 1]; + if (!arg.kwarg_only()) { + break; + } + } + + // Find the first positional argument that isn't defaulted + auto is_default = [&](int64_t idx) -> bool { + const auto& arg = schema.arguments()[idx]; + if (!arg.default_value().has_value()) { + return false; + } + const auto& default_ivalue = *arg.default_value(); + const auto& ivalue = arguments[idx]; + if (default_ivalue != ivalue) { + return false; + } + return true; + }; + + int64_t positional_default_start = kwarg_only_start; + for (; positional_default_start > 0; positional_default_start--) { + if (!is_default(positional_default_start - 1)) { + break; + } + } + + auto args = py::reinterpret_steal(PyTuple_New(positional_default_start)); + + auto schemaAwareToPyObject = [&](int64_t idx) -> py::object { + const auto& arg = schema.arguments()[idx]; + auto match = [&](c10::TypeKind kind) { + const auto& t = arg.real_type(); + if (t->kind() == kind) return true; + if (auto opt_t = t->cast()) { + if (opt_t->getElementType()->kind() == kind) return true; + } + return false; + }; + if (arguments[idx].isNone()) { + return py::none(); + } else if (match(c10::ScalarTypeType::Kind)) { + auto* obj = getTHPDtype(static_cast(arguments[idx].toInt())); + return py::reinterpret_borrow(reinterpret_cast(obj)); + } else if (match(c10::LayoutType::Kind)) { + auto* obj = getTHPLayout(static_cast(arguments[idx].toInt())); + return py::reinterpret_borrow(reinterpret_cast(obj)); + } else if (match(c10::MemoryFormatType::Kind)) { + return torch::utils::getTHPMemoryFormat(static_cast(arguments[idx].toInt())); + } else { + return torch::jit::toPyObject(arguments[idx]); + } + }; + + // Populate positional arguments + for (const auto idx : c10::irange(positional_default_start)) { + PyTuple_SET_ITEM(args.ptr(), idx, schemaAwareToPyObject(idx).release().ptr()); + } + + // Populate keyword arguments + for (const auto idx : c10::irange(kwarg_only_start, arguments.size())) { + // But don't populate default keyword arguments + if (is_default(idx)) continue; + const auto& arg = schema.arguments()[idx]; + kwargs[py::cast(arg.name())] = schemaAwareToPyObject(idx); + } + return std::make_pair(std::move(args), std::move(kwargs)); +} + +void pushPyOutToStack( + const c10::OperatorHandle& op, + torch::jit::Stack* stack, + py::object out, + const char* msg) { + TORCH_CHECK(PyGILState_Check(), "GIL must be held before you call pushPyOutToStack"); + auto schema_returns = op.schema().returns(); + const auto num_returns = schema_returns.size(); + if (num_returns == 0) { + // Check that we got a None return from Python. Anything else is an error. 
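+  // (Concretely: a schema with zero declared returns requires a Python None;
+  // exactly one declared return is converted directly to an IValue of that
+  // return's type; multiple declared returns are unpacked element-by-element
+  // from the returned Python sequence against the schema's return types.)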
+ TORCH_CHECK(out.is(py::none()), "Expected ", msg, " for ", op.operator_name(), + " to return None but it returned something else instead."); + } else if (num_returns == 1) { + torch::jit::push(stack, torch::jit::toIValue(out.ptr(), schema_returns[0].type())); + } else { + auto outs = py::cast(out); + for (const auto idx : c10::irange(outs.size())) { + torch::jit::push(stack, torch::jit::toIValue(outs[idx].ptr(), schema_returns[idx].type())); + } + } +} + namespace { std::string concrete_name_fn(const c10::impl::PyInterpreter* self) { @@ -104,7 +213,8 @@ void concrete_dispatch_fn( const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack, - const std::shared_ptr& type); + const std::shared_ptr& type); +bool concrete_is_contiguous_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self); class PyInterpreterHolder { public: @@ -113,7 +223,8 @@ class PyInterpreterHolder { &concrete_name_fn, &concrete_decref_fn, &concrete_detach_fn, - &concrete_dispatch_fn)) {} + &concrete_dispatch_fn, + &concrete_is_contiguous_fn)) {} // NB: intentionally leaks the memory ~PyInterpreterHolder() { impl_->disarm(); @@ -133,8 +244,6 @@ c10::impl::PyInterpreter* getPyInterpreter() { return self_interpreter.get(); } -namespace py = pybind11; - PyObject *THPVariableClass = nullptr; PyObject *ParameterClass = nullptr; @@ -151,10 +260,12 @@ static const char* VOLATILE_WARNING = static bool check_has_torch_dispatch(PyObject *obj) { PyTypeObject *tp = Py_TYPE(obj); + py::object attr = PyObject_FastGetAttrString(obj, "__torch_dispatch__"); return ( !THPVariable_CheckTypeExact(tp) && // TODO: test if Python key is disabled - PyObject_FastGetAttrString(obj, "__torch_dispatch__").ptr() != nullptr + attr.ptr() != nullptr && + attr.ptr() != torch::disabled_torch_dispatch_impl() ); } @@ -236,7 +347,106 @@ PyObject * THPVariable_Wrap(at::TensorBase var) (PyTypeObject*)THPVariableClass, std::move(var), status); } +bool isResurrectable(THPVariable* self) { + // We want to divide this check into 2 cases. + + // 1. C++ owns PyObject (in this case, self->cdata.unsafeIsBorrowed() is + // true). You might think that in this case, it is impossible for tp_clear to + // be called: surely the C++ reference to the PyObject is keeping it live? And + // you'd be right! In fact, when C++ owns the PyObject, we have an invariant + // that the refcount on the PyObject should be precisely one (because if you + // take out another reference to the PyObject, we're supposed to flip the + // ownership pointer back). In reality, you can violate this invariant + // temporarily with weak references, so we don't test for it in asserts. + + // 2. PyObject owns C++ (in this case, self->cdata.unsafeIsBorrowed() is + // false). In this case, tp_clear can get called if the PyObject is referenced + // from a dead cycle, and nowhere else. But if resurrection did not occur, + // then the reference to C++ from the PyObject must be the ONLY reference to + // the C++ object. + if (self->cdata.unsafeIsBorrowed()) { + return false; + } + auto const& tensor = THPVariable_Unpack(self); + if (!tensor.defined() || tensor.use_count() <= 1) { + return false; + } + return true; +} + +// returns true if successfully rezzed; if so, cancel the +// rest of deallocation +static bool THPVariable_tryResurrect(THPVariable* self) { + const auto& tensor = THPVariable_Unpack(self); + + if (!isResurrectable(self)) { + return false; + } + + // At this point, we are definitely going to resurrect the tensor. 
So, the + // tensor better be defined :) + TORCH_INTERNAL_ASSERT(tensor.defined()); + + // There are other C++ owners of the tensor. Flip ownership + // so that C++ owns this Python object, and cancel deallocation. + TORCH_INTERNAL_ASSERT(!tensor.unsafeGetTensorImpl()->owns_pyobj()); + + tensor.unsafeGetTensorImpl()->set_owns_pyobj(true); + +// Resurrect the Python object. This is something CPython does +// internally occasionally, see +// https://github.com/python/cpython/blob/b98eba5bc2ffbe7a0ed49d540ebc4f756ae61985/Objects/object.c#L248-L259 +// so we just copy the pattern here. Note that we don't have to worry +// about saving and restoring the refcount (as the quoted code does) +// because we actually DO need to reset the refcount to one here, we +// can't assume that some other code has taken care of it. +// NB: this will overreport _Py_RefTotal but based on inspection of object.c +// there is no way to avoid this +#ifdef Py_TRACE_REFS + _Py_AddToAllObjects(reinterpret_cast(self), 1); +#endif + Py_INCREF(self); + + // Flip THPVariable to be non-owning + // (near use-after-free miss here: fresh MaybeOwned is created breaking + // reference on Tensor in struct BEFORE we overwrite the old one) + self->cdata = MaybeOwned::borrowed(tensor); + + // NB: At this point, tensor *could* be dead (e.g., some other C++ thread + // decrefed it.) At this point, it is probably waiting on the GIL to + // deallocate the Python object and will kill self, BUT NOT YET. + + return true; +} + + static int THPVariable_clear(THPVariable* self) { + // Is it OK for an object to still be live after running + // tp_clear? Yes. When Python is breaking reference cycles, it can't assume + // that an object will dealloc after it's cleared. The source code explicitly + // handles this case: + // https://github.com/python/cpython/blob/4e661cd69164318c1f871faa476c68a04092ddc4/Modules/gcmodule.c#L1010-L1025 + + // Note that we don't need to actually resurrect here. There are 2 cases: + // 1. The PyObject is not part of a reference cycle. In this case, we don't + // need to do anything. The GC will move on to try and break the reference + // cycle on another object, which will eventually trigger tp_dealloc (and thus + // resurrection). + + // 2. The PyObject is part of a reference cycle. This case should not actually + // be possible, due to the logic in our tp_traverse (THPVariable_subclass_traverse). + + // In fact, resurrecting here breaks the invariant that "C++ owns Python only + // when PyObject's refcount would otherwise be 0". Most immediately, as we're + // merely breaking reference cycles here, there can be other references to the + // PyObject. *However*, if other objects in the refcycle resurrect, then we + // will be in a state where the PyObject has multiple Python references, yet + // C++ owns the PyObject. + + // See https://github.com/pytorch/pytorch/pull/75933 for more discussion. + if (isResurrectable((THPVariable*)self)) { + return 0; + } Py_CLEAR(self->backward_hooks); const auto& tensor = THPVariable_Unpack(self); if (tensor.defined()) { @@ -289,54 +499,11 @@ static int THPVariable_clear(THPVariable* self) { } } } + TORCH_INTERNAL_ASSERT(!isResurrectable((THPVariable*)self)); self->cdata = MaybeOwned(); return 0; } -// returns true if successfully rezzed; if so, cancel the -// rest of deallocation -static bool THPVariable_tryResurrect(THPVariable* self) { - const auto& tensor = THPVariable_Unpack(self); - - // Is this true or not??? 
Triggered by TestAutograd.test_variable_traverse - // TORCH_INTERNAL_ASSERT(tensor.defined()); - - // Check if there are other C++ owners - if (tensor.use_count() <= 1) { - return false; - } - - // There are other C++ owners of the tensor. Flip ownership - // so that C++ owns this Python object, and cancel deallocation. - TORCH_INTERNAL_ASSERT(!tensor.unsafeGetTensorImpl()->owns_pyobj()); - - tensor.unsafeGetTensorImpl()->set_owns_pyobj(true); - -// Resurrect the Python object. This is something CPython does -// internally occasionally, see -// https://github.com/python/cpython/blob/b98eba5bc2ffbe7a0ed49d540ebc4f756ae61985/Objects/object.c#L248-L259 -// so we just copy the pattern here. Note that we don't have to worry -// about saving and restoring the refcount (as the quoted code does) -// because we actually DO need to reset the refcount to one here, we -// can't assume that some other code has taken care of it. -// NB: this will overreport _Py_RefTotal but based on inspection of object.c -// there is no way to avoid this -#ifdef Py_TRACE_REFS - _Py_AddToAllObjects(reinterpret_cast(self), 1); -#endif - Py_INCREF(self); - - // Flip THPVariable to be non-owning - // (near use-after-free miss here: fresh MaybeOwned is created breaking - // reference on Tensor in struct BEFORE we overwrite the old one) - self->cdata = MaybeOwned::borrowed(tensor); - - // NB: At this point, tensor *could* be dead (e.g., some other C++ thread - // decrefed it.) At this point, it is probably waiting on the GIL to - // deallocate the Python object and will kill self, BUT NOT YET. - - return true; -} PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs); @@ -369,9 +536,9 @@ static PyObject* THPVariable_as_subclass(PyObject* _self, PyObject* args, PyObje static PyObject* THPVariable_make_subclass(PyObject* _ignored, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS static PythonArgParser parser({ - "_make_subclass(PyObject* cls, Tensor data, bool require_grad=False)", + "_make_subclass(PyObject* cls, Tensor data, bool require_grad=False, *, bool dispatch_strides=False)", }); - ParsedArgs<3> parsed_args{}; + ParsedArgs<4> parsed_args{}; auto r = parser.parse(args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); if (!PyType_Check(cls)) { @@ -390,6 +557,9 @@ static PyObject* THPVariable_make_subclass(PyObject* _ignored, PyObject* args, P // ``` data.unsafeGetTensorImpl()->set_allow_tensor_metadata_change(true); data.set_requires_grad(r.toBool(2)); + if (r.toBool(3)) { + data.unsafeGetTensorImpl()->set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomStrides); + } return THPVariable_NewWithVar( (PyTypeObject*)cls, std::move(data), @@ -402,9 +572,9 @@ static PyObject* THPVariable_make_wrapper_subclass(PyObject*, PyObject* args, Py // NB: pin_memory doesn't actually do anything // TODO: strides variant? static PythonArgParser parser({ - "_make_wrapper_subclass(PyObject* cls, IntArrayRef size, *, IntArrayRef? strides=None, int64_t? storage_offset=None, MemoryFormat? memory_format=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False)", + "_make_wrapper_subclass(PyObject* cls, IntArrayRef size, *, IntArrayRef? strides=None, int64_t? storage_offset=None, MemoryFormat? 
memory_format=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool pin_memory=False, bool requires_grad=False, bool dispatch_strides=False)", }); - ParsedArgs<10> parsed_args{}; + ParsedArgs<11> parsed_args{}; auto r = parser.parse(args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); @@ -414,7 +584,12 @@ static PyObject* THPVariable_make_wrapper_subclass(PyObject*, PyObject* args, Py // to continue on to the underlying CPU/CUDA kernel advertised by the dispatch // key, which will immediately segfault because the data pointer is null. By // forcing users to define __torch_dispatch__ we ensure this does not happen - TORCH_CHECK_TYPE(PyObject_FastGetAttrString(cls, "__torch_dispatch__").ptr() != nullptr, + // TODO: This check is not complete; because the user can disable torch + // dispatch and then go again, triggering segfault. TBH I'm thinking I want + // to delete this function entirely + py::object attr = PyObject_FastGetAttrString(cls, "__torch_dispatch__"); + TORCH_CHECK_TYPE(attr.ptr() != nullptr && attr.ptr() != torch::disabled_torch_dispatch_impl() +, ((PyTypeObject*)cls)->tp_name, " must define __torch_dispatch__"); const auto options = TensorOptions() @@ -439,6 +614,10 @@ static PyObject* THPVariable_make_wrapper_subclass(PyObject*, PyObject* args, Py .make_tensor(); data.set_requires_grad(r.toBool(9)); + if (r.toBool(10)) { + data.unsafeGetTensorImpl()->set_sizes_strides_policy(c10::TensorImpl::SizesStridesPolicy::CustomStrides); + } + return THPVariable_NewWithVar( (PyTypeObject*)cls, std::move(data), @@ -894,6 +1073,16 @@ PyObject *THPVariable_is_cuda(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } +PyObject* THPVariable_is_ipu(THPVariable* self, void* unused) { + HANDLE_TH_ERRORS + if (check_has_torch_function((PyObject*)self)) { + return handle_torch_function_getter(self, "is_ipu"); + } + auto& self_ = THPVariable_Unpack(self); + return torch::autograd::utils::wrap(self_.is_ipu()); + END_HANDLE_TH_ERRORS +} + PyObject* THPVariable_is_xpu(THPVariable* self, void* unused) { HANDLE_TH_ERRORS if (check_has_torch_function((PyObject*)self)) { @@ -937,14 +1126,14 @@ PyObject *THPVariable_is_mkldnn(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } -PyObject *THPVariable_is_mlc(THPVariable *self, void *unused) +PyObject *THPVariable_is_mps(THPVariable *self, void *unused) { HANDLE_TH_ERRORS if (check_has_torch_function((PyObject *)self)) { - return handle_torch_function_getter(self, "is_mlc"); + return handle_torch_function_getter(self, "is_mps"); } auto& self_ = THPVariable_Unpack(self); - return torch::autograd::utils::wrap(self_.is_mlc()); + return torch::autograd::utils::wrap(self_.is_mps()); END_HANDLE_TH_ERRORS } @@ -1003,6 +1192,17 @@ PyObject *THPVariable_is_complex(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } +PyObject *THPVariable_is_nested(THPVariable *self, void *unused) +{ + HANDLE_TH_ERRORS + if (check_has_torch_function((PyObject *)self)) { + return handle_torch_function_getter(self, "is_nested"); + } + auto& self_ = THPVariable_Unpack(self); + return torch::autograd::utils::wrap(self_.is_nested()); + END_HANDLE_TH_ERRORS +} + static PyObject *THPVariable_dtype(THPVariable *self, void *unused) { HANDLE_TH_ERRORS @@ -1057,28 +1257,28 @@ PyObject *THPVariable_get_imag(THPVariable* self, void *unused) END_HANDLE_TH_ERRORS } -int THPVariable_set_real(THPVariable *self, THPVariable *real, void *unused) +int THPVariable_set_real(PyObject* self, PyObject* real, void *unused) { HANDLE_TH_ERRORS auto& self_ = 
THPVariable_Unpack(self); - auto& real_ = THPVariable_Unpack(real); + auto self_real = at::real(self_); + auto real_ = valueToTensor(self_real.options(), real, self_real.device()); { pybind11::gil_scoped_release no_gil; - auto self_real = at::real(self_); self_real.copy_(real_); return 0; } END_HANDLE_TH_ERRORS_RET(-1) } -int THPVariable_set_imag(THPVariable* self, THPVariable *imag, void *unused) +int THPVariable_set_imag(PyObject* self, PyObject* imag, void *unused) { HANDLE_TH_ERRORS auto& self_ = THPVariable_Unpack(self); - auto& imag_ = THPVariable_Unpack(imag); + auto self_imag = at::imag(self_); + auto imag_ = valueToTensor(self_imag.options(), imag, self_imag.device()); { pybind11::gil_scoped_release no_gil; - auto self_imag = at::imag(self_); self_imag.copy_(imag_); return 0; } @@ -1112,15 +1312,17 @@ static struct PyGetSetDef THPVariable_properties[] = { {"shape", (getter)THPVariable_get_shape, nullptr, nullptr, nullptr}, {"is_cuda", (getter)THPVariable_is_cuda, nullptr, nullptr, nullptr}, {"is_xpu", (getter)THPVariable_is_xpu, nullptr, nullptr, nullptr}, + {"is_ipu", (getter)THPVariable_is_ipu, nullptr, nullptr, nullptr}, {"is_sparse", (getter)THPVariable_is_sparse, nullptr, nullptr, nullptr}, {"is_sparse_csr", (getter)THPVariable_is_sparse_csr, nullptr, nullptr, nullptr}, {"is_mkldnn", (getter)THPVariable_is_mkldnn, nullptr, nullptr, nullptr}, - {"is_mlc", (getter)THPVariable_is_mlc, nullptr, nullptr, nullptr}, + {"is_mps", (getter)THPVariable_is_mps, nullptr, nullptr, nullptr}, {"is_ort", (getter)THPVariable_is_ort, nullptr, nullptr, nullptr}, {"is_vulkan", (getter)THPVariable_is_vulkan, nullptr, nullptr, nullptr}, {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr}, {"is_quantized", (getter)THPVariable_is_quantized, nullptr, nullptr, nullptr}, {"is_meta", (getter)THPVariable_is_meta, nullptr, nullptr, nullptr}, + {"is_nested", (getter)THPVariable_is_nested, nullptr, nullptr, nullptr}, {"dtype", (getter)THPVariable_dtype, nullptr, nullptr, nullptr}, {"layout", (getter)THPVariable_layout, nullptr, nullptr, nullptr}, {"device", (getter)THPVariable_device, nullptr, nullptr, nullptr}, @@ -1260,7 +1462,7 @@ PyObject *THPVariable_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs HANDLE_TH_ERRORS TORCH_CHECK(type != &THPVariableType, "Cannot directly construct _TensorBase; subclass it and then construct that"); jit::tracer::warn("torch.Tensor", jit::tracer::WARN_CONSTRUCTOR); - auto tensor = torch::utils::legacy_tensor_ctor(torch::tensors::get_default_dispatch_key(), torch::tensors::get_default_scalar_type(), args, kwargs); + auto tensor = torch::utils::base_tensor_ctor(args, kwargs); // WARNING: tensor is NOT guaranteed to be a fresh tensor; e.g., if it was // given a raw pointer that will refcount bump return THPVariable_NewWithVar( @@ -1515,10 +1717,8 @@ static int THPVariable_subclass_traverse( // self is live, and nothing will get GC'ed anyway (resurrection cannot happen // if the C++ objects owns the PyObject) THPVariable* var = reinterpret_cast(self); - if (!var->cdata.unsafeIsBorrowed()) { - const auto& tensor = THPVariable_Unpack(self); - if (tensor.defined() && tensor.use_count() > 1) - return 0; + if (isResurrectable(var)) { + return 0; } // Crappy version of subtype_traverse; same deal as @@ -1655,10 +1855,38 @@ bool isPythonTensor(const Tensor& tensor) { return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python); } + +py::object torchDispatchFromTensorImpl(const c10::TensorImpl* self, const char* func_name, PyObject* 
torch_api_function, const char* module_name) { + TORCH_CHECK(PyGILState_Check(), "GIL must be held before you call parseIValuesToPyArgsKwargs"); + + // Setup the arguments expected for the detach call + std::vector overloaded_args; + // TODO: there should be a shorter way to spell this + // TODO: fix the constness of target + Tensor self_t = Tensor(c10::intrusive_ptr::unsafe_reclaim_from_nonowning(const_cast(self))); + auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); + TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); + append_overloaded_tensor(&overloaded_args, self_p.ptr()); + auto args = py::reinterpret_steal(PyTuple_New(1)); + PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr()); + + py::dict kwargs; + + return py::reinterpret_steal( + handle_torch_function_no_python_arg_parser( + overloaded_args, + args.ptr(), + kwargs.ptr(), + func_name, + torch_api_function, + module_name, + TorchFunctionName::TorchDispatch)); +} + // NOTE [dispatch_fn's type argument] -// `type` is nullable and represents the PythonMode going on. -// Right now we only support a single PythonMode, but in the future we could -// change this to a stack of PythonModes. +// `type` is nullable and represents the TorchDispatchMode going on. +// Right now we only support a single TorchDispatchMode, but in the future we could +// change this to a stack of TorchDispatchModes. // // If `type` isn't null, then we consider the type for dispatch by prepending // it to the overloaded_args list. `handle_torch_funciton_no_python_arg_parser` @@ -1667,16 +1895,15 @@ void concrete_dispatch_fn( const c10::impl::PyInterpreter*, const c10::OperatorHandle& op, torch::jit::Stack* stack, - const std::shared_ptr& type) { + const std::shared_ptr& type) { const auto& schema = op.schema(); - const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); auto arguments = torch::jit::pop(*stack, num_arguments); // Parse the name into namespace and name (no overload_name) // TODO: put this into the library const auto& qualified_name = op.operator_name().name; + const auto& overload_name = schema.overload_name(); auto pos = qualified_name.find("::"); TORCH_INTERNAL_ASSERT(pos != std::string::npos, qualified_name); // Make me some null terminated strings @@ -1693,59 +1920,17 @@ void concrete_dispatch_fn( py::gil_scoped_acquire g; std::vector overloaded_args; - // For now, overloads get coalesced. Might be easier for users if they get - // overload resolution but is more complicated (need to expose separate - // functions per overload) py::handle torch_api_function = py::module::import("torch").attr("ops").attr(ns).attr(func_name); - std::string module_name_str = "torch.ops." + ns_str; - - // About all the pointers: - // - // f(int x, int y = 0, *, int z = 0) - // ^- arguments.size() - // ^- kwarg_only_start - // ^- positional_default_start - // ^- 0 - - // Find the split point between kwarg-only and regular. Since most functions - // don't have kwarg-only arguments, it is more efficient to scan from the - // right (but ideally, this would just be precomputed in FunctionSchema - // itself). 
(NB: minus one in the loop is because we're testing if the - // *next* argument is kwarg-only before we advance the starting index) - int64_t kwarg_only_start = arguments.size(); - for (; kwarg_only_start > 0; kwarg_only_start--) { - const auto& arg = schema.arguments()[kwarg_only_start - 1]; - if (!arg.kwarg_only()) { - break; - } - } - - // Find the first positional argument that isn't defaulted - auto is_default = [&](int64_t idx) -> bool { - const auto& arg = schema.arguments()[idx]; - if (!arg.default_value().has_value()) { - return false; - } - const auto& default_ivalue = *arg.default_value(); - const auto& ivalue = arguments[idx]; - if (default_ivalue != ivalue) { - return false; - } - return true; - }; - - int64_t positional_default_start = kwarg_only_start; - for (; positional_default_start > 0; positional_default_start--) { - if (!is_default(positional_default_start - 1)) { - break; - } + py::handle torch_api_function_overload; + if (overload_name == "") { + torch_api_function_overload = torch_api_function.attr("default"); + } else { + torch_api_function_overload = torch_api_function.attr(overload_name.c_str()); } - - auto args = py::reinterpret_steal(PyTuple_New(positional_default_start)); - py::dict kwargs; + std::string module_name_str = "torch.ops." + ns_str; if (type) { - append_overloaded_type(&overloaded_args, type->ptr()); + append_overloaded_type(&overloaded_args, type->ptr(getPyInterpreter())); } // Find overloaded tensors @@ -1770,72 +1955,59 @@ void concrete_dispatch_fn( } } - // Populate positional arguments - for (const auto idx : c10::irange(positional_default_start)) { - PyTuple_SET_ITEM(args.ptr(), idx, torch::jit::toPyObject(std::move(arguments[idx])).release().ptr()); - } - - // Populate keyword arguments - for (const auto idx : c10::irange(kwarg_only_start, arguments.size())) { - // But don't populate default keyword arguments - if (is_default(idx)) continue; - const auto& arg = schema.arguments()[idx]; - kwargs[py::cast(arg.name())] = torch::jit::toPyObject(std::move(arguments[idx])); - } - - auto out = py::reinterpret_steal(handle_torch_function_no_python_arg_parser( - overloaded_args, - args.ptr(), - kwargs.ptr(), - func_name, - torch_api_function.ptr(), - module_name_str.c_str(), - "__torch_dispatch__" - )); + auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments); + auto args = std::move(args_kwargs.first); + auto kwargs = std::move(args_kwargs.second); - if (num_returns == 0) { - // Check that we got a None return from Python. Anything else is an error. 
- TORCH_CHECK(out.is(py::none()), "Expected __torch_dispatch__ for ", op.operator_name(), - " to return None but it returned something else instead."); - } else if (num_returns == 1) { - torch::jit::push(stack, torch::jit::toIValue(out.ptr(), op.schema().returns()[0].type())); - } else { - auto outs = py::cast(out); - for (const auto idx : c10::irange(outs.size())) { - torch::jit::push(stack, torch::jit::toIValue(outs[idx].ptr(), op.schema().returns()[idx].type())); - } - } + PyObject* obj = handle_torch_function_no_python_arg_parser( + overloaded_args, + args.ptr(), + kwargs.ptr(), + func_name, + torch_api_function_overload.ptr(), + module_name_str.c_str(), + TorchFunctionName::TorchDispatch); + pushPyOutToStack(op, stack, py::reinterpret_steal(obj), "__torch_dispatch__"); } c10::intrusive_ptr concrete_detach_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self) { pybind11::gil_scoped_acquire gil; - - // Setup the arguments expected for the detach call - std::vector overloaded_args; - // TODO: there should be a shorter way to spell this - // TODO: fix the constness of target - Tensor self_t = Tensor(c10::intrusive_ptr::unsafe_reclaim_from_nonowning(const_cast(self))); - auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); - TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); - append_overloaded_tensor(&overloaded_args, self_p.ptr()); - auto args = py::reinterpret_steal(PyTuple_New(1)); - PyTuple_SET_ITEM(args.ptr(), 0, self_p.release().ptr()); - - py::dict kwargs; - - auto out = py::reinterpret_steal(handle_torch_function_no_python_arg_parser( - overloaded_args, - args.ptr(), - kwargs.ptr(), - "detach", - py::module::import("torch").attr("ops").attr("aten").attr("detach").ptr(), - "torch.ops.aten", - "__torch_dispatch__" - )); + at::impl::MaybeSetTLSOnEntryGuard guard; + + auto out = torchDispatchFromTensorImpl( + self, + "detach", + py::module::import("torch") + .attr("ops") + .attr("aten") + .attr("detach") + .attr("default") + .ptr(), + "torch.ops.aten"); TORCH_CHECK(THPVariable_Check(out.ptr()), "detach returned invalid type ", py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())), ", expected Tensor"); const Tensor& res_t = THPVariable_Unpack(out.ptr()); return res_t.getIntrusivePtr(); } +bool concrete_is_contiguous_fn(const c10::impl::PyInterpreter*, const c10::TensorImpl* self) { + pybind11::gil_scoped_acquire gil; + at::impl::MaybeSetTLSOnEntryGuard guard; + + auto out = torchDispatchFromTensorImpl( + self, + "is_contiguous", + py::module::import("torch") + .attr("ops") + .attr("aten") + .attr("is_contiguous") + .attr("default") + .ptr(), + "torch.ops.aten"); + + TORCH_CHECK(PyBool_Check(out.ptr()), "is_contiguous returned invalid type ", py::detail::get_fully_qualified_tp_name(Py_TYPE(out.ptr())), ", expected bool"); + + return PyObject_IsTrue(out.ptr()); +} + } // anonymous namespace diff --git a/torch/csrc/autograd/python_variable.h b/torch/csrc/autograd/python_variable.h index b55e5c05127d..c4856cdd4d12 100644 --- a/torch/csrc/autograd/python_variable.h +++ b/torch/csrc/autograd/python_variable.h @@ -7,6 +7,10 @@ #include #include #include +#include +#include + +namespace py = pybind11; // Python object that backs torch.autograd.Variable // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) @@ -62,3 +66,7 @@ inline const at::Tensor& THPVariable_Unpack(PyObject* obj) { } TORCH_PYTHON_API c10::impl::PyInterpreter* getPyInterpreter(); + +std::pair parseIValuesToPyArgsKwargs(const c10::OperatorHandle& op, const std::vector& arguments); + +void 
pushPyOutToStack(const c10::OperatorHandle& op, torch::jit::Stack* stack, py::object out, const char* msg); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 8faa07066ead..27016f4edcc6 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -88,7 +87,7 @@ static inline Variable sequenceToVariable(c10::TensorOptions options, PyObject* return torch::utils::indexing_tensor_from_data(options, kLong, c10::nullopt, seq); } -static inline Variable valueToTensor(c10::TensorOptions options, PyObject* value, const at::Device& device) { +inline Variable valueToTensor(c10::TensorOptions options, PyObject* value, const at::Device& device) { if (THPVariable_Check(value)) { return THPVariable_Unpack(value); } @@ -369,7 +368,7 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) { } const auto& self_ = THPVariable_Unpack(self); - if (self_.is_sparse()) + if (self_.is_sparse() || self_.is_sparse_csr()) { throw TypeError("Cannot assign to a sparse tensor"); } diff --git a/torch/csrc/autograd/python_variable_indexing.h b/torch/csrc/autograd/python_variable_indexing.h index 398b77293810..027bffb6dc8a 100644 --- a/torch/csrc/autograd/python_variable_indexing.h +++ b/torch/csrc/autograd/python_variable_indexing.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace torch { namespace autograd { @@ -8,4 +9,6 @@ Py_ssize_t THPVariable_length(PyObject* self); PyObject* THPVariable_getitem(PyObject* self, PyObject* index); int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* value); +Variable valueToTensor(c10::TensorOptions options, PyObject* value, const at::Device& device); + }} // namespace torch::autograd diff --git a/torch/csrc/autograd/record_function_ops.cpp b/torch/csrc/autograd/record_function_ops.cpp index 2cf427e04f60..f5f09b3fe940 100644 --- a/torch/csrc/autograd/record_function_ops.cpp +++ b/torch/csrc/autograd/record_function_ops.cpp @@ -1,8 +1,10 @@ +#include #include #include #include -#include +#include +#include namespace caffe2 { // Required for cpp_custom_type_hack to work @@ -16,47 +18,68 @@ namespace profiler { // Creates a new profiling scope using RecordFunction and invokes its starting // callbacks. 
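+// The scope-starting logic is factored out below so it can be shared by two
+// entry points: the legacy one, which keeps the RecordFunction alive behind a
+// cpp_custom_type_hack Tensor handle, and the new one, which holds it in the
+// custom class registered as profiler::_RecordFunction.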
-at::Tensor record_function_enter( +void record_function_enter( const std::string& name, - const c10::optional& args) { - auto rec = std::make_unique(at::RecordScope::USER_SCOPE); - if (rec->isActive()) { - if (rec->needsInputs() && args.has_value()) { - rec->before(name, std::vector{c10::IValue{args.value()}}); + const c10::optional& args, + at::RecordFunction &rec) { + if (rec.isActive()) { + if (rec.needsInputs() && args.has_value()) { + rec.before(name, c10::ArrayRef{c10::IValue{args.value()}}); } else { - rec->before(name); + rec.before(name); } } +} + +// Legacy signature using cpp_custom_type_hack +at::Tensor record_function_enter_legacy( + const std::string& name, + const c10::optional& args) { + auto rec = std::make_unique(at::RecordScope::USER_SCOPE); + record_function_enter(name, args, *rec); return at::cpp_custom_type_hack::create(std::move(rec), at::TensorOptions()); } +// New signature using custom_class +c10::intrusive_ptr record_function_enter_new( + const std::string &name, const c10::optional &args) { + auto rec = c10::make_intrusive(at::RecordScope::USER_SCOPE); + record_function_enter(name, args, rec->record); + return rec; +} + at::RecordFunction& getRecordFunctionFromTensor(const at::Tensor& handle) { auto& rec = at::cpp_custom_type_hack::cast(handle); return rec; } // Ends the profiling scope created with record_function_enter. -void record_function_exit(const at::Tensor& handle) { +void record_function_exit(at::RecordFunction &rec) { + rec.end(); +} + +// Legacy signature using cpp_custom_type_hack +void record_function_exit_legacy(const at::Tensor &handle) { // We don't actually need to do anything with handle just need to persist the // lifetime until now. auto& rec = getRecordFunctionFromTensor(handle); - rec.end(); + record_function_exit(rec); +} + +// New signature using custom_class +void record_function_exit_new(const c10::intrusive_ptr &record) { + record_function_exit(record->record); } +template c10::intrusive_ptr _call_end_callbacks_on_fut( - const at::Tensor& handle, + Func get_record, const c10::intrusive_ptr& fut) { // Profiling callback that ends the associated record_function // and returns the value of the passed in future. std::function futureProfilingFunc = - [handle](c10::ivalue::Future& fut) { - TORCH_INTERNAL_ASSERT( - handle.defined(), - "Undefined RecordFunction handle. This can happen if the handle is " - "not correctly persisted and is destroyed before the future is " - "realized."); - - auto& rec = getRecordFunctionFromTensor(handle); + [get_record = std::move(get_record)](c10::ivalue::Future& fut) { + auto& rec = get_record(); rec.end(); // Note: this future is returned to the user to ensure that a call to wait() // ensures that profiling callbacks have ran. To ensure that this is @@ -67,36 +90,74 @@ c10::intrusive_ptr _call_end_callbacks_on_fut( }; // Define a future that completes after the profiling callbacks are run. auto profiledFut = fut->then(at::wrapPropagateTLSState( - futureProfilingFunc), + std::move(futureProfilingFunc)), fut->elementType() ); return profiledFut; } -// Internal only, do not use directly, use Python's record_function() -TORCH_LIBRARY_FRAGMENT(profiler, m) { - m.def("_record_function_enter(str name, str? 
args=None) -> Tensor", &record_function_enter); - m.def("_record_function_exit", &record_function_exit); +// Legacy signature using cpp_custom_type_hack +c10::intrusive_ptr _call_end_callbacks_on_fut_legacy( + const at::Tensor &handle, + const c10::intrusive_ptr& fut) { + return _call_end_callbacks_on_fut( + [handle] () -> at::RecordFunction& { + TORCH_INTERNAL_ASSERT( + handle.defined(), + "Undefined RecordFunction handle. This can happen if the handle is " + "not correctly persisted and is destroyed before the future is " + "realized."); + + return getRecordFunctionFromTensor(handle); + }, + fut + ); } -// Needed to register JIT operator in operator registry below -c10::AliasAnalysisKind aliasAnalysisFromSchema() { - return c10::AliasAnalysisKind::FROM_SCHEMA; +// New signature using custom_class +c10::intrusive_ptr _call_end_callbacks_on_fut_new( + const c10::intrusive_ptr &record, + const c10::intrusive_ptr& fut) { + return _call_end_callbacks_on_fut( + [record] () -> at::RecordFunction& { return record->record; }, fut); } -jit::RegisterOperators reg_fut_ops({ - jit::Operator( +// Internal only, do not use directly, use Python's record_function() +TORCH_LIBRARY_FRAGMENT(profiler, m) { + m.class_("_RecordFunction"); + + m.def("_record_function_enter(str name, str? args=None) -> Tensor", + &record_function_enter_legacy); + m.def("_record_function_enter_new(str name, str? args=None) -> " + "__torch__.torch.classes.profiler._RecordFunction", + &record_function_enter_new); + m.def("_record_function_exit", &record_function_exit_legacy); + m.def("_record_function_exit._RecordFunction", &record_function_exit_new); + + torch::jit::registerOperator(torch::jit::Operator( "profiler::_call_end_callbacks_on_jit_fut(Tensor x, Future(t) y) -> Future(t)", [](jit::Stack& stack) { // Pop inputs, which should be a future and a tensor auto fut = jit::pop(stack).toFuture(); auto tensor = jit::pop(stack).toTensor(); - auto profiledFut = _call_end_callbacks_on_fut(tensor, fut); + auto profiledFut = _call_end_callbacks_on_fut_legacy(tensor, fut); // return future that completes when profiling callbacks have run. jit::push(stack, std::move(profiledFut)); }, - aliasAnalysisFromSchema()), -}); + c10::AliasAnalysisKind::FROM_SCHEMA)); + torch::jit::registerOperator(torch::jit::Operator( + "profiler::_call_end_callbacks_on_jit_fut._RecordFunction(" + "__torch__.torch.classes.profiler._RecordFunction x, Future(t) y) -> Future(t)", + [](c10::Stack &stack) { + // Pop inputs, which should be a future and a PythonRecordFunction + auto fut = torch::jit::pop(stack).toFuture(); + auto tensor = torch::jit::pop(stack).toCustomClass(); + auto profiledFut = _call_end_callbacks_on_fut_new(tensor, fut); + // return future that completes when profiling callbacks have run. 
+ torch::jit::push(stack, std::move(profiledFut)); + }, + c10::AliasAnalysisKind::FROM_SCHEMA)); +} } // namespace profiler } // namespace autograd diff --git a/torch/csrc/autograd/record_function_ops.h b/torch/csrc/autograd/record_function_ops.h index 9042537aeabc..2c074f2dfe5b 100644 --- a/torch/csrc/autograd/record_function_ops.h +++ b/torch/csrc/autograd/record_function_ops.h @@ -1,17 +1,28 @@ #pragma once #include #include +#include namespace torch { namespace autograd { namespace profiler { + +struct PythonRecordFunction : public torch::CustomClassHolder { + at::RecordFunction record; + + explicit PythonRecordFunction( + at::RecordScope scope = at::RecordScope::FUNCTION) + : record(scope) {} +}; + // Creates a new profiling scope using RecordFunction and invokes its starting // callbacks. -TORCH_API at::Tensor record_function_enter(const std::string& name, const c10::optional& args = c10::nullopt); +TORCH_API c10::intrusive_ptr record_function_enter_new( + const std::string &name, const c10::optional &args = c10::nullopt); // Schedules RecordFunction's end callbacks to be run on completion of a future. -TORCH_API c10::intrusive_ptr _call_end_callbacks_on_fut( - const at::Tensor& handle, +TORCH_API c10::intrusive_ptr _call_end_callbacks_on_fut_new( + const c10::intrusive_ptr &record, const c10::intrusive_ptr& fut); } // namespace profiler diff --git a/torch/csrc/autograd/utils/grad_layout_contract.h b/torch/csrc/autograd/utils/grad_layout_contract.h index 4d1787d55c79..c7e1bad9fb8a 100644 --- a/torch/csrc/autograd/utils/grad_layout_contract.h +++ b/torch/csrc/autograd/utils/grad_layout_contract.h @@ -7,15 +7,39 @@ namespace autograd { namespace utils { // Helper functions to enforce the "Gradient Layout Contract" described in -// torch/csrc/autograd/AccumulateGrad.h. +// torch/csrc/autograd/functions/accumulate_grad.h. // Checks if grad obeys the contract with variable. inline bool obeys_layout_contract(const at::Tensor& grad, const at::Tensor& variable) { TORCH_INTERNAL_ASSERT(!grad.is_sparse()); TORCH_INTERNAL_ASSERT(!variable.is_sparse()); - return variable.is_non_overlapping_and_dense() ? - (grad.strides() == variable.strides()) : - grad.is_contiguous(at::MemoryFormat::Contiguous); + TORCH_INTERNAL_ASSERT(!grad.is_sparse_csr()); + TORCH_INTERNAL_ASSERT(!variable.is_sparse_csr()); + if (variable.is_non_overlapping_and_dense()) { + // Only look at stride for dimensions that are not of size 1. + const auto& grad_sizes = grad.sizes(); + const auto& grad_strides = grad.strides(); + const auto& variable_strides = variable.strides(); + for (const auto idx : c10::irange(grad_sizes.size())) { + if (grad_sizes[idx] != 1) { + if (grad_strides[idx] != variable_strides[idx]) { + return false; + } + } else { + // This should not be needed but we don't check if a Tensor has views + // before stashing it. And 0-strided Tensors of size 1 are actually views + // for ops like cat. + // TODO: Actually detect views in the accumulateGrad function so that this + // Tensor is not considered at all. + if (grad_strides[idx] == 0) { + return false; + } + } + } + return true; + } else { + return grad.is_contiguous(at::MemoryFormat::Contiguous); + } } // Creates a clone of new_grad that obeys the contract with variable. 
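For reference, the relaxed layout check above reduces to the following standalone sketch (illustrative only: the helper name is invented here, there are no torch dependencies, and sizes and strides are assumed to have equal length, as they do for the dense tensors this branch handles):

#include <cstddef>
#include <cstdint>
#include <vector>

// Strides must agree wherever a dimension has extent > 1; a size-1 dimension
// may carry any stride except 0, which is treated as the mark of a view-like
// gradient (e.g. one stashed by cat) and is therefore rejected.
bool strides_match_ignoring_size_one_dims(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& grad_strides,
    const std::vector<int64_t>& variable_strides) {
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] != 1) {
      if (grad_strides[i] != variable_strides[i]) {
        return false;
      }
    } else if (grad_strides[i] == 0) {
      return false;
    }
  }
  return true;
}

Variables that are not non-overlapping-and-dense keep the old fallback: the gradient only has to be contiguous in the default memory format.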
diff --git a/torch/csrc/autograd/utils/wrap_outputs.h b/torch/csrc/autograd/utils/wrap_outputs.h index 10439553fcc5..114b53487368 100644 --- a/torch/csrc/autograd/utils/wrap_outputs.h +++ b/torch/csrc/autograd/utils/wrap_outputs.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -77,117 +78,6 @@ inline PyObject* wrap(at::QScheme qscheme) { return thp_qscheme; } -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(2)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - return r.release(); -} - -inline PyObject* wrap(PyTypeObject *type, std::tuple tensors) { - auto r = THPObjectPtr{PyStructSequence_New(type)}; - if (!r) throw python_error(); - PyStructSequence_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(3)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(PyTypeObject *type, std::tuple tensors) { - auto r = THPObjectPtr{PyStructSequence_New(type)}; - if (!r) throw python_error(); - PyStructSequence_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 2, wrap(std::get<2>(tensors))); - return r.release(); -} - -inline PyObject* wrap(PyTypeObject *type, std::tuple tensors) { - auto r = THPObjectPtr{PyStructSequence_New(type)}; - if (!r) throw python_error(); - PyStructSequence_SET_ITEM(r.get(), 0, wrap(std::get<0>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 1, wrap(std::get<1>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 2, wrap(std::get<2>(tensors))); - PyStructSequence_SET_ITEM(r.get(), 3, wrap(std::get<3>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(4)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::get<3>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(4)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(5)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - PyTuple_SET_ITEM(r.get(), 4, 
wrap(std::get<4>(tensors))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(5)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 4, wrap(std::move(std::get<4>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(4)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - return r.release(); -} - -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(5)}; - if (!r) throw python_error(); - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); - PyTuple_SET_ITEM(r.get(), 2, wrap(std::move(std::get<2>(tensors)))); - PyTuple_SET_ITEM(r.get(), 3, wrap(std::move(std::get<3>(tensors)))); - PyTuple_SET_ITEM(r.get(), 4, wrap(std::move(std::get<4>(tensors)))); - return r.release(); -} - inline PyObject* wrap(at::TensorList tl) { auto r = THPObjectPtr{PyTuple_New(tl.size())}; if (!r) throw python_error(); @@ -206,13 +96,38 @@ inline PyObject* wrap(at::IntArrayRef list) { return r.release(); } -inline PyObject* wrap(std::tuple tensors) { - auto r = THPObjectPtr{PyTuple_New(2)}; +namespace detail { +template +void apply_with_idx_impl(const F &f, Tuple &t, std::index_sequence /*indices*/) { + (void)std::initializer_list { + (f(std::get(t), Is), 0)... 
+ }; +} + +// For tuple(a, b, c), calls f(a, 0), f(b, 1), f(c, 2) +template +void apply_with_idx(const F & f, std::tuple &t) { + apply_with_idx_impl(f, t, std::index_sequence_for{}); +} +} // namespace detail + +template +PyObject* wrap(std::tuple values) { + auto r = THPObjectPtr{PyTuple_New(sizeof...(Ts))}; + if (!r) throw python_error(); + detail::apply_with_idx([&](auto &value, size_t idx) { + PyTuple_SET_ITEM(r.get(), idx, wrap(std::move(value))); + }, values); + return r.release(); +} + +template +PyObject* wrap(PyTypeObject *type, std::tuple values) { + auto r = THPObjectPtr{PyStructSequence_New(type)}; if (!r) throw python_error(); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 0, wrap(std::move(std::get<0>(tensors)))); - // NOLINTNEXTLINE(performance-move-const-arg) - PyTuple_SET_ITEM(r.get(), 1, wrap(std::move(std::get<1>(tensors)))); + detail::apply_with_idx([&](auto &value, size_t idx) { + PyStructSequence_SET_ITEM(r.get(), idx, wrap(std::move(value))); + }, values); return r.release(); } diff --git a/torch/csrc/cuda/Event.cpp b/torch/csrc/cuda/Event.cpp index 20821636a774..4312b3aaf7b0 100644 --- a/torch/csrc/cuda/Event.cpp +++ b/torch/csrc/cuda/Event.cpp @@ -119,7 +119,7 @@ static PyObject * THCPEvent_wait(PyObject *_self, PyObject *_stream) { { auto self = (THCPEvent*)_self; auto stream = (THCPStream*)_stream; - pybind11::gil_scoped_release no_gil; + pybind11::gil_scoped_release no_gil{}; self->cuda_event.block(stream->cuda_stream); } Py_RETURN_NONE; @@ -145,7 +145,7 @@ static PyObject * THCPEvent_synchronize(PyObject *_self, PyObject *noargs) { HANDLE_TH_ERRORS { auto self = (THCPEvent*)_self; - pybind11::gil_scoped_release no_gil; + pybind11::gil_scoped_release no_gil{}; self->cuda_event.synchronize(); } Py_RETURN_NONE; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index 0b76eefe92c4..6c00332c21a4 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -4,8 +4,10 @@ #include #include #include +#include #include #include +#include #ifdef USE_NCCL #include #endif @@ -217,6 +219,71 @@ PyObject * THCPModule_cudaCachingAllocator_raw_alloc(PyObject *_unused, PyObject END_HANDLE_TH_ERRORS } +// Unpack a PyObject to at::Scalar, throw an exception if it fails +at::Scalar as_scalar(PyObject* arg) { + // Zero-dim tensors are converted to Scalars as-is. Note this doesn't currently + // handle most NumPy scalar types except np.float64. 
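// Illustrative sketch (not part of the patch): the std::index_sequence expansion
// that the new variadic wrap(std::tuple<...>) overloads above rely on, applied to
// a plain std::tuple with std::printf instead of PyTuple_SET_ITEM. The names here
// are illustrative only.
#include <cstdio>
#include <initializer_list>
#include <string>
#include <tuple>
#include <utility>

template <typename F, typename Tuple, size_t... Is>
void for_each_with_index_impl(const F& f, Tuple& t, std::index_sequence<Is...>) {
  // Comma-operator trick: evaluates f(element, index) once per pack member.
  (void)std::initializer_list<int>{(f(std::get<Is>(t), Is), 0)...};
}

template <typename F, typename... Ts>
void for_each_with_index(const F& f, std::tuple<Ts...>& t) {
  for_each_with_index_impl(f, t, std::index_sequence_for<Ts...>{});
}

int main() {
  std::tuple<int, double, std::string> values{7, 2.5, "grad"};
  for_each_with_index(
      [](const auto& v, size_t idx) {
        // One generic lambda replaces N nearly identical hand-written overloads.
        std::printf("slot %zu\n", idx);
        (void)v;
      },
      values);
}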
+ if (THPVariable_Check(arg)) { + return THPVariable_Unpack(arg).item(); + } + + if (THPUtils_checkLong(arg)) { + return at::Scalar(static_cast(THPUtils_unpackLong(arg))); + } + + if (PyBool_Check(arg)) { + return at::Scalar(THPUtils_unpackBool(arg)); + } + + if (PyComplex_Check(arg)) { + return at::Scalar(THPUtils_unpackComplexDouble(arg)); + } + return at::Scalar(THPUtils_unpackDouble(arg)); +} + +// Entrypoint for the callable created by torch.cuda.jiterator +// See jiterator.py for more details +PyObject * THCPModule_cudaJiteratorCompileAndLaunchKernel(PyObject *_unused, PyObject *args){ + HANDLE_TH_ERRORS + + PyObject* code_string_o = nullptr; + PyObject* kernel_name_o = nullptr; + PyObject* tensors_o = nullptr; + PyObject* kwargs_o = nullptr; + if(!PyArg_ParseTuple(args, "OOO|O", &code_string_o, &kernel_name_o, &tensors_o, &kwargs_o)) { + return nullptr; + } + + std::string code_string = THPUtils_unpackString(code_string_o); + std::string kernel_name = THPUtils_unpackString(kernel_name_o); + + THPUtils_assert(PyTuple_Check(tensors_o), "tensors argument is expected to " + "be a tuple, but got %s", THPUtils_typename(tensors_o)); + Py_ssize_t num_tensors = PyTuple_GET_SIZE(tensors_o); + + std::vector tensors; + for(const auto i : c10::irange(num_tensors)) { + PyObject *_tensor = PyTuple_GET_ITEM(tensors_o, i); + THPUtils_assert(THPVariable_Check(_tensor), "element %d of tensors " + "tuple is not a Tensor", i); + + tensors.emplace_back(THPVariable_Unpack(_tensor)); + } + + std::vector extra_args; + PyObject *key = nullptr; + PyObject *value = nullptr; + Py_ssize_t pos = 0; + while (PyDict_Next(kwargs_o, &pos, &key, &value)) { + extra_args.emplace_back(as_scalar(value)); + } + + at::Tensor output = at::cuda::CompileAndLaunchKernel(code_string, kernel_name, tensors, extra_args); + + return THPVariable_Wrap(output); + END_HANDLE_TH_ERRORS +} + PyObject * THCPModule_cudaCachingAllocator_raw_delete(PyObject *_unused, PyObject *obj){ HANDLE_TH_ERRORS void* mem_ptr = PyLong_AsVoidPtr(obj); @@ -564,6 +631,35 @@ PyObject * THCPModule_getCurrentBlasHandle_wrap(PyObject *self, PyObject *noargs END_HANDLE_TH_ERRORS } +PyObject * THCPModule_rocm_is_backward_pass(PyObject *_unused, PyObject *noargs) +{ + HANDLE_TH_ERRORS +#if USE_ROCM + if (at::ROCmBackwardPassGuard::is_backward_pass()) { + Py_RETURN_TRUE; + } + else { + Py_RETURN_FALSE; + } +#else + Py_RETURN_FALSE; +#endif + END_HANDLE_TH_ERRORS +} + +static PyObject * THCPModule_isCurrentStreamCapturing_wrap(PyObject *self, PyObject *noargs) +{ + HANDLE_TH_ERRORS + // If there's no cuda context, at::cuda::currentStreamCaptureStatus returns + // CaptureStatus::None without initializing a context. 
+ if (at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } + END_HANDLE_TH_ERRORS +} + // NOLINTNEXTLINE(modernize-avoid-c-arrays, cppcoreguidelines-avoid-non-const-global-variables, cppcoreguidelines-avoid-c-arrays) static struct PyMethodDef _THCPModule_methods[] = { {"_cuda_init", THCPModule_initExtension, METH_NOARGS, nullptr}, @@ -578,6 +674,7 @@ static struct PyMethodDef _THCPModule_methods[] = { {"_cuda_getDefaultStream", THCPModule_getDefaultStream_wrap, METH_O, nullptr}, {"_cuda_getCurrentBlasHandle", THCPModule_getCurrentBlasHandle_wrap, METH_NOARGS, nullptr}, + {"_cuda_isCurrentStreamCapturing", THCPModule_isCurrentStreamCapturing_wrap, METH_NOARGS, nullptr}, {"_cuda_setStream", THCPModule_setStream_wrap, METH_O, nullptr}, {"_cuda_getCompiledVersion", THCPModule_getCompiledVersion, METH_NOARGS, nullptr}, {"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr}, @@ -597,6 +694,7 @@ static struct PyMethodDef _THCPModule_methods[] = { {"_cuda_unlock_mutex", THCPModule_cudaUnlockMutex, METH_NOARGS, nullptr}, {"_cuda_set_sync_debug_mode", THCPModule_cudaSetSyncDebugMode, METH_O, nullptr}, {"_cuda_get_sync_debug_mode", THCPModule_cudaGetSyncDebugMode, METH_NOARGS, nullptr}, + {"_cuda_jiterator_compile_and_launch_kernel", THCPModule_cudaJiteratorCompileAndLaunchKernel, METH_VARARGS, nullptr}, #ifdef USE_NCCL {"_nccl_version", THCPModule_nccl_version, METH_NOARGS, nullptr}, {"_nccl_unique_id", THCPModule_nccl_unique_id, METH_NOARGS, nullptr}, @@ -607,6 +705,7 @@ static struct PyMethodDef _THCPModule_methods[] = { {"_nccl_all_gather", THCPModule_nccl_all_gather, METH_VARARGS, nullptr}, {"_nccl_reduce_scatter", THCPModule_nccl_reduce_scatter, METH_VARARGS, nullptr}, #endif + {"_rocm_is_backward_pass", THCPModule_rocm_is_backward_pass, METH_NOARGS, nullptr}, {nullptr} }; diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index ae61392ab542..5817449c1a49 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -650,6 +650,9 @@ void all2all_single_equal_split(at::Tensor& input, const auto* sendbuff = reinterpret_cast(input.data_ptr()); auto* recvbuff = reinterpret_cast(output.data_ptr()); auto comm = to_nccl_comm(_comm); +#if defined(USE_ROCM) && ROCM_VERSION >= 50000 + NCCL_CHECK(ncclAllToAll(sendbuff , recvbuff , count, type, comm, stream)); +#else NCCL_CHECK(ncclCommCount(comm, &numranks)); NCCL_CHECK(ncclGroupStart()); for(const auto r : c10::irange(numranks)) { @@ -661,6 +664,7 @@ void all2all_single_equal_split(at::Tensor& input, } } NCCL_CHECK(ncclGroupEnd()); +#endif #else AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0"); #endif @@ -833,8 +837,7 @@ void gather( if (cur_rank == root) { - for (int r = 0; r < numranks; r++) - { + for (const auto r : c10::irange(numranks)) { if (r != root) { auto* recvbuff = reinterpret_cast(outputs[r].data_ptr()); NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream)); @@ -874,8 +877,7 @@ void scatter( NCCL_CHECK(ncclGroupStart()); if (cur_rank == root) { - for (int r = 0; r < numranks; r++) - { + for (const auto r : c10::irange(numranks)) { if (r != root) { size_t send_count = inputs[r].numel(); auto send_type = to_nccl_data_type(inputs[r]); diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp index b93d921a16a9..b0af4c0884e9 100644 --- a/torch/csrc/cuda/shared/cudart.cpp +++ b/torch/csrc/cuda/shared/cudart.cpp @@ -49,8 +49,8 @@ void initCudartBindings(PyObject* module) { #endif 
cudart.def("cuda" "MemGetInfo", [](int device) -> std::pair { c10::cuda::CUDAGuard guard(device); - size_t device_free; - size_t device_total; + size_t device_free = 0; + size_t device_total = 0; cudaMemGetInfo(&device_free, &device_total); return {device_free, device_total}; }); diff --git a/torch/csrc/deploy/CMakeLists.txt b/torch/csrc/deploy/CMakeLists.txt index f8aa997eb109..61fe8c1bb892 100644 --- a/torch/csrc/deploy/CMakeLists.txt +++ b/torch/csrc/deploy/CMakeLists.txt @@ -1,6 +1,13 @@ set(DEPLOY_DIR "${CMAKE_CURRENT_SOURCE_DIR}") add_subdirectory(interpreter) +if(DEFINED GLIBCXX_USE_CXX11_ABI) + if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") + set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=1") + endif() +endif() + # we do not want to have torch_deployinterpreter linked against libstdc++ or libc because # when loading it with RTLD_DEEPBIND it will resolve std::cout/stdout to the copy in libc++/libc instead of the # ones in the main process (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42679). @@ -33,10 +40,23 @@ caffe2_interface_library(torch_deploy_internal torch_deploy) set(INTERPRETER_TEST_SOURCES ${DEPLOY_DIR}/test_deploy.cpp ) +set(INTERPRETER_TEST_SOURCES_GPU + ${DEPLOY_DIR}/test_deploy_gpu.cpp +) + add_executable(test_deploy ${INTERPRETER_TEST_SOURCES}) target_compile_definitions(test_deploy PUBLIC TEST_CUSTOM_LIBRARY) target_include_directories(test_deploy PRIVATE ${PYTORCH_ROOT}/torch) -target_link_libraries(test_deploy PUBLIC "-Wl,--no-as-needed" gtest dl torch_deploy) +target_link_libraries(test_deploy + PUBLIC "-Wl,--no-as-needed -rdynamic" gtest dl torch_deploy +) + +add_executable(test_deploy_gpu ${INTERPRETER_TEST_SOURCES_GPU}) +target_compile_definitions(test_deploy_gpu PUBLIC TEST_CUSTOM_LIBRARY) +target_include_directories(test_deploy_gpu PRIVATE ${PYTORCH_ROOT}/torch) +target_link_libraries(test_deploy_gpu + PUBLIC "-Wl,--no-as-needed -rdynamic" gtest dl torch_deploy +) add_library(test_deploy_lib SHARED test_deploy_lib.cpp) add_dependencies(test_deploy_lib cpython) @@ -45,14 +65,19 @@ target_link_libraries(test_deploy_lib PRIVATE pybind::pybind11) add_executable(deploy_benchmark ${DEPLOY_DIR}/example/benchmark.cpp) target_include_directories(deploy_benchmark PRIVATE ${PYTORCH_ROOT}/torch) -target_link_libraries(deploy_benchmark PUBLIC "-Wl,--no-as-needed" torch_deploy) +target_link_libraries(deploy_benchmark + PUBLIC "-Wl,--no-as-needed -rdynamic" torch_deploy +) add_executable(interactive_embedded_interpreter ${DEPLOY_DIR}/interactive_embedded_interpreter.cpp) target_include_directories(interactive_embedded_interpreter PRIVATE ${PYTORCH_ROOT}/torch) -target_link_libraries(interactive_embedded_interpreter PUBLIC "-Wl,--no-as-needed" torch_deploy) +target_link_libraries(interactive_embedded_interpreter + PUBLIC "-Wl,--no-as-needed -rdynamic" torch_deploy +) if(INSTALL_TEST) install(TARGETS test_deploy DESTINATION bin) + install(TARGETS test_deploy_gpu DESTINATION bin) endif() install(TARGETS torch_deploy DESTINATION lib) diff --git a/torch/csrc/deploy/Exception.h b/torch/csrc/deploy/Exception.h new file mode 100644 index 000000000000..f4311debeebc --- /dev/null +++ b/torch/csrc/deploy/Exception.h @@ -0,0 +1,47 @@ +#ifndef MULTIPY_EXCEPTION_H +#define MULTIPY_EXCEPTION_H + +#include + +#define MULTIPY_INTERNAL_ASSERT_WITH_MESSAGE(condition, message) \ + if (!(condition)) { \ + throw std::runtime_error( \ + "Internal Assertion failed: (" + std::string(#condition) + "), " + \ + "function " + __FUNCTION__ 
+ ", file " + __FILE__ + ", line " + \ + std::to_string(__LINE__) + ".\n" + "Please report bug to Pytorch.\n" + \ + message + "\n"); \ + } + +#define MULTIPY_INTERNAL_ASSERT_NO_MESSAGE(condition) \ + MULTIPY_INTERNAL_ASSERT_WITH_MESSAGE(#condition, "") + +#define MULTIPY_INTERNAL_ASSERT_(x, condition, message, FUNC, ...) FUNC + +#define MULTIPY_INTERNAL_ASSERT(...) \ + MULTIPY_INTERNAL_ASSERT_( \ + , \ + ##__VA_ARGS__, \ + MULTIPY_INTERNAL_ASSERT_WITH_MESSAGE(__VA_ARGS__), \ + MULTIPY_INTERNAL_ASSERT_NO_MESSAGE(__VA_ARGS__)); + +#define MULTIPY_CHECK_WITH_MESSAGE(condition, message) \ + if (!(condition)) { \ + throw std::runtime_error( \ + "Check failed: (" + std::string(#condition) + "), " + "function " + \ + __FUNCTION__ + ", file " + __FILE__ + ", line " + \ + std::to_string(__LINE__) + ".\n" + message + "\n"); \ + } + +#define MULTIPY_CHECK_NO_MESSAGE(condition) \ + MULTIPY_CHECK_WITH_MESSAGE(#condition, "") + +#define MULTIPY_CHECK_(x, condition, message, FUNC, ...) FUNC + +#define MULTIPY_CHECK(...) \ + MULTIPY_CHECK_( \ + , \ + ##__VA_ARGS__, \ + MULTIPY_CHECK_WITH_MESSAGE(__VA_ARGS__), \ + MULTIPY_CHECK_NO_MESSAGE(__VA_ARGS__)); + +#endif // MULTIPY_EXCEPTION_H diff --git a/torch/csrc/deploy/README.md b/torch/csrc/deploy/README.md index 43f6f2c85fc5..dfe436ba79fa 100644 --- a/torch/csrc/deploy/README.md +++ b/torch/csrc/deploy/README.md @@ -20,3 +20,8 @@ Because CPython builds successfully when optional dependencies are missing, the To be safe, install the [complete list of dependencies for CPython](https://devguide.python.org/setup/#install-dependencies) for your platform, before trying to build torch with USE_DEPLOY=1. If you already built CPython without all the dependencies and want to fix it, just blow away the CPython folder under torch/csrc/deploy/third_party, install the missing system dependencies, and re-attempt the pytorch build command. + +# Example + +Read the [getting started guide](https://github.com/pytorch/pytorch/blob/master/docs/source/deploy.rst) for an +example on how to use `torch::deploy`. diff --git a/torch/csrc/deploy/benchmark.cpp b/torch/csrc/deploy/benchmark.cpp new file mode 100644 index 000000000000..82296a5e1a1d --- /dev/null +++ b/torch/csrc/deploy/benchmark.cpp @@ -0,0 +1,336 @@ +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef void (*function_type)(const char*); + +bool cuda = false; + +constexpr auto latency_p = { + 25., + 50., + 95.}; //{1., 5., 25., 50., 75., 90., 95., 99., 99.25, 99.5, 99.75, 99.9}; + +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) +struct Report { + std::string benchmark; + std::string strategy; + size_t n_threads; + size_t items_completed; + double work_items_per_second; + std::vector latencies; + static void report_header(std::ostream& out) { + out << "benchmark, strategy, n_threads, work_items_completed, work_items_per_second"; + for (double l : latency_p) { + out << ", p" << l << "_latency"; + } + out << ", device\n"; + } + void report(std::ostream& out) { + out << benchmark << ", " << strategy << ", " << n_threads << ", " + << items_completed << ", " << work_items_per_second; + for (double l : latencies) { + out << ", " << l; + } + out << ", " << (cuda ? 
"cuda" : "cpu") << "\n"; + } +}; + +const int min_items_to_complete = 1; + +struct RunPython { + static torch::deploy::ReplicatedObj load_and_wrap( + torch::deploy::Package& package) { + auto I = package.acquireSession(); + auto obj = I.self.attr("load_pickle")({"model", "model.pkl"}); + if (cuda) { + obj = I.global("gpu_wrapper", "GPUWrapper")({obj}); + } + return I.createMovable(obj); + } + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + RunPython( + torch::deploy::Package& package, + std::vector eg, + const torch::deploy::Interpreter* interps) + : obj_(load_and_wrap(package)), eg_(std::move(eg)), interps_(interps) {} + void operator()(int i) { + auto I = obj_.acquireSession(); + if (cuda) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector eg2 = {i}; + eg2.insert(eg2.end(), eg_.begin(), eg_.end()); + I.self(eg2); + } else { + I.self(eg_); + } + } + torch::deploy::ReplicatedObj obj_; + std::vector eg_; + const torch::deploy::Interpreter* interps_; +}; + +// def to_device(i, d): +// if isinstance(i, torch.Tensor): +// return i.to(device=d) +// elif isinstance(i, (tuple, list)): +// return tuple(to_device(e, d) for e in i) +// else: +// raise RuntimeError('inputs are weird') + +static torch::IValue to_device(const torch::IValue& v, torch::Device to); + +static std::vector to_device_vec( + at::ArrayRef vs, + torch::Device to) { + std::vector results; + for (const torch::IValue& v : vs) { + results.push_back(to_device(v, to)); + } + return results; +} + +static torch::IValue to_device(const torch::IValue& v, torch::Device to) { + if (v.isTensor()) { + return v.toTensor().to(to); + } else if (v.isTuple()) { + auto tup = v.toTuple(); + return c10::ivalue::Tuple::create(to_device_vec(tup->elements(), to)); + } else if (v.isList()) { + auto converted = to_device_vec(v.toListRef(), to); + torch::List result(v.toList().elementType()); + for (const torch::IValue& v : converted) { + result.push_back(v); + } + return result; + } else { + MULTIPY_INTERNAL_ASSERT(false, "cannot to_device"); + } +} + +static bool exists(const std::string& fname) { + std::fstream jit_file(fname); + return jit_file.good(); +} + +struct RunJIT { + RunJIT(const std::string& file_to_run, std::vector eg) + : eg_(std::move(eg)) { + if (!cuda) { + models_.push_back(torch::jit::load(file_to_run + "_jit")); + } else { + for (const auto i : c10::irange(2)) { + auto d = torch::Device(torch::DeviceType::CUDA, i); + std::stringstream qualified; + qualified << file_to_run << "_jit_" << i; + auto loaded = exists(qualified.str()) + ? 
torch::jit::load(qualified.str(), d) + : torch::jit::load(file_to_run + "_jit", d); + loaded.to(d); + models_.push_back(loaded); + } + } + } + void operator()(int i) { + if (cuda) { + const auto device_id = i % models_.size(); + auto d = torch::Device(torch::DeviceType::CUDA, device_id); + to_device( + models_[device_id].forward(to_device_vec(eg_, d)), + torch::DeviceType::CPU); + } else { + models_[0].forward(eg_); + } + } + std::vector eg_; + std::vector models_; +}; + +struct Benchmark { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + Benchmark( + torch::deploy::InterpreterManager& manager, + size_t n_threads, + std::string strategy, + // NOLINTNEXTLINE(modernize-pass-by-value) + std::string file_to_run, + size_t n_seconds = 5) + : manager_(manager), + n_threads_(n_threads), + strategy_(strategy), + file_to_run_(file_to_run), + n_seconds_(n_seconds), + should_run_(true), + items_completed_(0), + reached_min_items_completed_(0) { + // NOLINTNEXTLINE(bugprone-branch-clone) + if (strategy == "one_python") { + manager.debugLimitInterpreters(1); + } else if (strategy == "multi_python") { + manager.debugLimitInterpreters(n_threads_); + } + } + + Report run() { + pthread_barrier_init(&first_run_, nullptr, n_threads_ + 1); + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + torch::deploy::Package package = manager_.loadPackage(file_to_run_); + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector eg; + { + auto I = package.acquireSession(); + + eg = I.global("builtins", "tuple")( + I.self.attr("load_pickle")({"model", "example.pkl"})) + .toIValue() + .toTupleRef() + .elements(); + } + + // NOLINTNEXTLINE(bugprone-branch-clone) + if (strategy_ == "jit") { + run_one_work_item = RunJIT(file_to_run_, std::move(eg)); + } else { + run_one_work_item = + RunPython(package, std::move(eg), manager_.allInstances().data()); + } + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector> latencies(n_threads_); + + for (const auto i : c10::irange(n_threads_)) { + threads_.emplace_back([this, &latencies, i] { + torch::NoGradGuard guard; + // do initial work + run_one_work_item(i); + + pthread_barrier_wait(&first_run_); + size_t local_items_completed = 0; + while (should_run_) { + auto begin = std::chrono::steady_clock::now(); + run_one_work_item(i); + auto end = std::chrono::steady_clock::now(); + double work_seconds = + std::chrono::duration(end - begin).count(); + latencies[i].push_back(work_seconds); + local_items_completed++; + if (local_items_completed == min_items_to_complete) { + reached_min_items_completed_++; + } + } + items_completed_ += local_items_completed; + }); + } + + pthread_barrier_wait(&first_run_); + auto begin = std::chrono::steady_clock::now(); + auto try_stop_at = begin + std::chrono::seconds(n_seconds_); + std::this_thread::sleep_until(try_stop_at); + for (int i = 0; reached_min_items_completed_ < n_threads_; ++i) { + std::this_thread::sleep_until( + begin + (i + 2) * std::chrono::seconds(n_seconds_)); + } + should_run_ = false; + for (std::thread& thread : threads_) { + thread.join(); + } + auto end = std::chrono::steady_clock::now(); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + double total_seconds = std::chrono::duration(end - begin).count(); + Report report; + report.benchmark = file_to_run_; + report.strategy = strategy_; + report.n_threads = n_threads_; + report.items_completed = items_completed_; + report.work_items_per_second = items_completed_ / total_seconds; + reportLatencies(report.latencies, latencies); + 
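// Illustrative sketch (not part of the patch): the percentile computation used
// by the benchmark's reportLatencies helper, on a fixed set of latencies so the
// p25/p50/p95 indexing is easy to follow. Standard library only.
#include <algorithm>
#include <cstdio>
#include <initializer_list>
#include <vector>

int main() {
  std::vector<double> latencies = {0.9, 0.2, 0.4, 0.8, 0.1, 0.3, 0.7, 0.5, 0.6, 1.0};
  std::sort(latencies.begin(), latencies.end());
  for (double p : {25., 50., 95.}) {
    // Same indexing as reportLatencies: size * p / 100, clamped to the last element.
    size_t idx = static_cast<size_t>(latencies.size() * p / 100.0);
    double value = latencies.empty()
        ? 0.0
        : latencies.at(std::min(latencies.size() - 1, idx));
    std::printf("p%.0f latency: %.2f s\n", p, value);
  }
}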
run_one_work_item = nullptr; + return report; + } + + private: + void reportLatencies( + std::vector& results, + const std::vector>& latencies) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector flat_latencies; + for (const auto& elem : latencies) { + flat_latencies.insert(flat_latencies.end(), elem.begin(), elem.end()); + } + std::sort(flat_latencies.begin(), flat_latencies.end()); + for (double target : latency_p) { + size_t idx = size_t(flat_latencies.size() * target / 100.0); + double time = flat_latencies.size() == 0 + ? 0 + : flat_latencies.at(std::min(flat_latencies.size() - 1, idx)); + results.push_back(time); + } + } + torch::deploy::InterpreterManager& manager_; + size_t n_threads_; + std::string strategy_; + std::string file_to_run_; + size_t n_seconds_; + pthread_barrier_t first_run_; + std::atomic should_run_; + std::atomic items_completed_; + std::atomic reached_min_items_completed_; + std::vector threads_; + std::function run_one_work_item; +}; + +// NOLINTNEXTLINE(bugprone-exception-escape) +int main(int argc, char* argv[]) { + int max_thread = atoi(argv[1]); + cuda = std::string(argv[2]) == "cuda"; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + bool jit_enable = std::string(argv[3]) == "jit"; + Report::report_header(std::cout); + torch::deploy::InterpreterManager manager(max_thread); + + // make sure gpu_wrapper.py is in the import path + for (auto& interp : manager.allInstances()) { + auto I = interp.acquireSession(); + I.global("sys", "path").attr("append")({"torch/csrc/deploy/example"}); + } + + auto n_threads = {1, 2, 4, 8, 16, 32, 40}; + for (const auto i : c10::irange(4, argc)) { + std::string model_file = argv[i]; + for (int n_thread : n_threads) { + if (n_thread > max_thread) { + continue; + } + for (std::string strategy : {"one_python", "multi_python", "jit"}) { + if (strategy == "jit") { + if (!jit_enable) { + continue; + } + if (!exists(model_file + "_jit")) { + continue; + } + } + Benchmark b(manager, n_thread, strategy, model_file); + Report r = b.run(); + r.report(std::cout); + } + } + } + return 0; +} diff --git a/torch/csrc/deploy/deploy.cpp b/torch/csrc/deploy/deploy.cpp index 647c9a4e810b..680b8541873f 100644 --- a/torch/csrc/deploy/deploy.cpp +++ b/torch/csrc/deploy/deploy.cpp @@ -1,6 +1,8 @@ -#include +#include #include #include +#include + #include #include @@ -54,12 +56,13 @@ static bool writeDeployInterpreter(FILE* dst) { std::ifstream("/proc/self/cmdline") >> exePath; ElfFile elfFile(exePath.c_str()); for (const auto& s : pythonInterpreterSection) { - at::optional

<Section> payloadSection = elfFile.findSection(s.sectionName); - if (payloadSection != at::nullopt) { + multipy::optional<Section>
payloadSection = + elfFile.findSection(s.sectionName); + if (payloadSection != multipy::nullopt) { payloadStart = payloadSection->start; customLoader = s.customLoader; size = payloadSection->len; - TORCH_CHECK(payloadSection.has_value(), "Missing the payload section"); + MULTIPY_CHECK(payloadSection.has_value(), "Missing the payload section"); break; } } @@ -74,10 +77,10 @@ static bool writeDeployInterpreter(FILE* dst) { break; } } - TORCH_CHECK( + MULTIPY_CHECK( libStart != nullptr && libEnd != nullptr, - "torch::deploy requires a build-time dependency on embedded_interpreter or embedded_interpreter_cuda, neither of which were found. torch::cuda::is_available()=", - torch::cuda::is_available()); + "torch::deploy requires a build-time dependency on embedded_interpreter or embedded_interpreter_cuda, neither of which were found. torch::cuda::is_available()=" + + std::to_string(torch::cuda::is_available())); size = libEnd - libStart; payloadStart = libStart; @@ -91,6 +94,8 @@ InterpreterManager::InterpreterManager( size_t nInterp, std::shared_ptr env) : resources_(nInterp) { + C10_LOG_API_USAGE_ONCE("torch.deploy.InterpreterManager"); + TORCH_DEPLOY_TRY for (const auto i : c10::irange(nInterp)) { instances_.emplace_back(this, env); @@ -99,12 +104,12 @@ InterpreterManager::InterpreterManager( // can be used for balancing work across GPUs I.global("torch", "version").attr("__setattr__")({"interp", int(i)}); instances_.back().pImpl_->setFindModule( - [this](const std::string& name) -> at::optional { + [this](const std::string& name) -> multipy::optional { auto it = registeredModuleSource_.find(name); if (it != registeredModuleSource_.end()) { return it->second; } else { - return at::nullopt; + return multipy::nullopt; } }); } @@ -189,11 +194,11 @@ void ReplicatedObj::unload(const Interpreter* onThisInterpreter) { ReplicatedObj InterpreterSession::createMovable(Obj obj) { TORCH_DEPLOY_TRY - TORCH_CHECK( + MULTIPY_CHECK( manager_, "Can only create a movable object when the session was created from an interpreter that is part of a InterpreterManager"); - TORCH_CHECK( + MULTIPY_CHECK( impl_->isOwner(obj), "Cannot create movable from an object that lives in different session"); @@ -214,6 +219,11 @@ using dlopen_t = void* (*)(const char*, int); // function. static dlopen_t find_real_dlopen() { void* libc = dlopen("libdl.so.2", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL); + // libdl is gone on some newer systems. + if (!libc) { + // libc.so won't open with dlopen because it's a linker script. 
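// Illustrative sketch (not part of the patch): the libdl -> libc fallback
// described in the comments above, as a standalone program. On newer systems
// libdl may be unavailable, so the lookup falls back to the versioned libc
// soname (dlopen("libc.so", ...) would fail because that file is a linker
// script). Link with -ldl on older glibc.
#include <dlfcn.h>
#include <cstdio>

typedef void* (*dlopen_t)(const char*, int);

int main() {
  void* handle = dlopen("libdl.so.2", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL);
  if (!handle) {
    handle = dlopen("libc.so.6", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL);
  }
  if (!handle) {
    std::fprintf(stderr, "could not locate an already-loaded libdl/libc\n");
    return 1;
  }
  auto real_dlopen = reinterpret_cast<dlopen_t>(dlsym(handle, "dlopen"));
  std::printf("dlopen symbol resolved: %s\n", real_dlopen ? "yes" : "no");
  return 0;
}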
+ libc = dlopen("libc.so.6", RTLD_NOLOAD | RTLD_LAZY | RTLD_LOCAL); + } TORCH_INTERNAL_ASSERT(libc); auto dlopen_ = (dlopen_t)dlsym(libc, "dlopen"); TORCH_INTERNAL_ASSERT(dlopen_); @@ -293,8 +303,7 @@ int LoadBalancer::acquire() { size_t minusers = SIZE_MAX; int minIdx = 0; for (size_t i = 0; i < n_; ++i, ++last) { - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - if (last >= n_) { + if (last >= static_cast(n_)) { last = 0; } uint64_t prev = 0; diff --git a/torch/csrc/deploy/deploy.h b/torch/csrc/deploy/deploy.h index c6a4794a932d..b986093ed020 100644 --- a/torch/csrc/deploy/deploy.h +++ b/torch/csrc/deploy/deploy.h @@ -1,7 +1,7 @@ #pragma once -#include #include #include +#include #include #include #include @@ -95,7 +95,7 @@ struct TORCH_API LoadBalancer { } void setResourceLimit(size_t n) { TORCH_DEPLOY_TRY - TORCH_INTERNAL_ASSERT(n <= allocated_); + MULTIPY_INTERNAL_ASSERT(n <= allocated_); n_ = n; TORCH_DEPLOY_SAFE_CATCH_RETHROW } diff --git a/torch/csrc/deploy/elf_file.cpp b/torch/csrc/deploy/elf_file.cpp index 85eaaa19cc26..ca1e749868e5 100644 --- a/torch/csrc/deploy/elf_file.cpp +++ b/torch/csrc/deploy/elf_file.cpp @@ -1,5 +1,7 @@ #include +#include #include +#include namespace torch { namespace deploy { @@ -13,7 +15,7 @@ ElfFile::ElfFile(const char* filename) : memFile_(filename) { shdrList_ = (Elf64_Shdr*)(fileData + ehdr_->e_shoff); auto strtabSecNo = ehdr_->e_shstrndx; - TORCH_CHECK( + MULTIPY_CHECK( strtabSecNo >= 0 && strtabSecNo < numSections_, "e_shstrndx out of range"); @@ -25,9 +27,9 @@ ElfFile::ElfFile(const char* filename) : memFile_(filename) { } } -at::optional
<Section> ElfFile::findSection(const char* name) const { - TORCH_CHECK(name != nullptr, "Null name"); - at::optional<Section> found = at::nullopt; +multipy::optional<Section> ElfFile::findSection(const char* name) const { + MULTIPY_CHECK(name != nullptr, "Null name"); + multipy::optional<Section>
found = multipy::nullopt; for (const auto& section : sections_) { if (strcmp(name, section.name) == 0) { found = section; @@ -40,13 +42,13 @@ at::optional
ElfFile::findSection(const char* name) const { void ElfFile::checkFormat() const { // check the magic numbers - TORCH_CHECK( + MULTIPY_CHECK( (ehdr_->e_ident[EI_MAG0] == ELFMAG0) && (ehdr_->e_ident[EI_MAG1] == ELFMAG1) && (ehdr_->e_ident[EI_MAG2] == ELFMAG2) && (ehdr_->e_ident[EI_MAG3] == ELFMAG3), "Unexpected magic numbers"); - TORCH_CHECK( + MULTIPY_CHECK( ehdr_->e_ident[EI_CLASS] == ELFCLASS64, "Only support 64bit ELF file"); } diff --git a/torch/csrc/deploy/elf_file.h b/torch/csrc/deploy/elf_file.h index e27750c01139..31ea7976af88 100644 --- a/torch/csrc/deploy/elf_file.h +++ b/torch/csrc/deploy/elf_file.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include #include #include @@ -30,7 +31,7 @@ struct Section { class ElfFile { public: explicit ElfFile(const char* filename); - at::optional
<Section> findSection(const char* name) const; + multipy::optional<Section>
findSection(const char* name) const; private: Section toSection(Elf64_Shdr* shdr) { @@ -40,7 +41,7 @@ class ElfFile { const char* name = ""; if (strtabSection_) { - TORCH_CHECK(nameOff >= 0 && nameOff < strtabSection_.len); + MULTIPY_CHECK(nameOff >= 0 && nameOff < strtabSection_.len); name = strtabSection_.start + nameOff; } const char* start = memFile_.data() + shOff; @@ -48,7 +49,7 @@ class ElfFile { } [[nodiscard]] const char* str(size_t off) const { - TORCH_CHECK(off < strtabSection_.len, "String table index out of range"); + MULTIPY_CHECK(off < strtabSection_.len, "String table index out of range"); return strtabSection_.start + off; } void checkFormat() const; diff --git a/torch/csrc/deploy/environment.h b/torch/csrc/deploy/environment.h index 4485a4e1d031..433ce6bcb3f6 100644 --- a/torch/csrc/deploy/environment.h +++ b/torch/csrc/deploy/environment.h @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -27,7 +28,7 @@ class Environment { // load the zipped torch modules constexpr const char* ZIPPED_TORCH_NAME = ".torch_python_modules"; auto zippedTorchSection = elfFile.findSection(ZIPPED_TORCH_NAME); - TORCH_CHECK( + MULTIPY_CHECK( zippedTorchSection.has_value(), "Missing the zipped torch section"); const char* zippedTorchStart = zippedTorchSection->start; auto zippedTorchSize = zippedTorchSection->len; @@ -35,7 +36,7 @@ class Environment { std::string zipArchive = std::string(pythonAppDir) + "/torch_python_modules.zip"; auto zippedFile = fopen(zipArchive.c_str(), "wb"); - TORCH_CHECK( + MULTIPY_CHECK( zippedFile != nullptr, "Fail to create file: ", strerror(errno)); fwrite(zippedTorchStart, 1, zippedTorchSize, zippedFile); fclose(zippedFile); diff --git a/torch/csrc/deploy/example/examples.py b/torch/csrc/deploy/example/examples.py index 25bb54a0c606..73eeb2149b54 100644 --- a/torch/csrc/deploy/example/examples.py +++ b/torch/csrc/deploy/example/examples.py @@ -146,8 +146,7 @@ class MultiReturn(torch.nn.Module): def __init__(self): super(MultiReturn, self).__init__() - def forward(self, t): - # type: (Tuple[Tensor, Tensor]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]] + def forward(self, t: Tuple[Tensor, Tensor]) -> Tuple[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor]]: a, b = t result = ((a.masked_fill_(b, 0.1), b), (torch.ones_like(a), b)) return result diff --git a/torch/csrc/deploy/interpreter/CMakeLists.txt b/torch/csrc/deploy/interpreter/CMakeLists.txt index 7f808335c82d..33b71e348396 100644 --- a/torch/csrc/deploy/interpreter/CMakeLists.txt +++ b/torch/csrc/deploy/interpreter/CMakeLists.txt @@ -1,8 +1,11 @@ SET(INTERPRETER_DIR "${DEPLOY_DIR}/interpreter" ) SET(INTERPRETER_DIR "${DEPLOY_DIR}/interpreter" PARENT_SCOPE) - SET(PYTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../../../") +if(NOT TORCH_INSTALL_LIB_DIR) + set(TORCH_INSTALL_LIB_DIR lib) +endif() + # Build cpython SET(PYTHON_INSTALL_DIR "${INTERPRETER_DIR}/cpython") SET(PYTHON_INC_DIR "${PYTHON_INSTALL_DIR}/include/python3.8") @@ -109,3 +112,6 @@ target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_LIB} ${PYTHON_STD target_link_libraries(torch_deployinterpreter PRIVATE fmt::fmt-header-only protobuf::libprotobuf-lite) target_link_libraries(torch_deployinterpreter PRIVATE ${PYTHON_INSTALL_DIR}/lib/libssl.a ${PYTHON_INSTALL_DIR}/lib/libcrypto.a) target_link_libraries(torch_deployinterpreter PRIVATE pybind::pybind11) + +# expose torch_python_static for multipy +install(TARGETS torch_python_static DESTINATION "${TORCH_INSTALL_LIB_DIR}") diff --git 
a/torch/csrc/deploy/interpreter/Optional.hpp b/torch/csrc/deploy/interpreter/Optional.hpp new file mode 100644 index 000000000000..92b73d7f6fbb --- /dev/null +++ b/torch/csrc/deploy/interpreter/Optional.hpp @@ -0,0 +1,1107 @@ +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// Source: https://github.com/akrzemi1/Optional + +#ifndef ___OPTIONAL_HPP___ +#define ___OPTIONAL_HPP___ + +#include +#include +#include +#include +#include +#include +#include + +#define TR2_OPTIONAL_REQUIRES(...) \ + typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +#if defined __GNUC__ // NOTE: GNUC is also defined for Clang +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +#endif + +#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +#endif + +#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#elif (__GNUC__ > 4) +#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#endif +#endif + +#if defined __clang_major__ +#if (__clang_major__ == 3 && __clang_minor__ >= 5) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#elif (__clang_major__ > 3) +#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#endif +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#elif ( \ + __clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +#endif +#endif + +#if defined _MSC_VER +#if (_MSC_VER >= 1900) +#define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#endif +#endif + +#if defined __clang__ +#if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif +#elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +#else +#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +#endif + +#if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +#define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +#else +#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +#define OPTIONAL_CONSTEXPR_INIT_LIST +#endif + +#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && \ + (__cplusplus != 201103L) +#define OPTIONAL_HAS_MOVE_ACCESSORS 1 +#else +#define OPTIONAL_HAS_MOVE_ACCESSORS 0 +#endif + +// In C++11 constexpr implies const, so we need to make non-const members also +// non-constexpr +#if (defined __cplusplus) && (__cplusplus == 201103L) +#define OPTIONAL_MUTABLE_CONSTEXPR +#else +#define OPTIONAL_MUTABLE_CONSTEXPR constexpr +#endif + +namespace multipy { + +// BEGIN workaround for missing std::is_trivially_destructible +#if defined TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +// leave it: it is already there +#elif defined TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +// leave it: it is already there +#elif defined 
TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +// leave it: it is already there +#elif defined TR2_OPTIONAL_DISABLE_EMULATION_OF_TYPE_TRAITS +// leave it: the user doesn't want it +#else +template +using std::is_trivially_destructible = std::has_trivial_destructor; +#endif +// END workaround for missing std::is_trivially_destructible + +#if (defined TR2_OPTIONAL_GCC_4_7_AND_HIGHER___) +// leave it; our metafunctions are already defined. +#elif defined TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +// leave it; our metafunctions are already defined. +#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +// leave it: it is already there +#elif defined TR2_OPTIONAL_DISABLE_EMULATION_OF_TYPE_TRAITS +// leave it: the user doesn't want it +#else + +// workaround for missing traits in GCC and CLANG +template +struct std::is_nothrow_move_constructible { + constexpr static bool value = std::is_nothrow_constructible::value; +}; + +template +struct is_assignable { + template + constexpr static bool has_assign(...) { + return false; + } + + template < + class X, + class Y, + size_t S = sizeof((std::declval() = std::declval(), true))> + // the comma operator is necessary for the cases where operator= returns void + constexpr static bool has_assign(bool) { + return true; + } + + constexpr static bool value = has_assign(true); +}; + +template +struct std::is_nothrow_move_assignable { + template + struct has_nothrow_move_assign { + constexpr static bool value = false; + }; + + template + struct has_nothrow_move_assign { + constexpr static bool value = + noexcept(std::declval() = std::declval()); + }; + + constexpr static bool value = + has_nothrow_move_assign::value>::value; +}; +// end workaround + +#endif + +// 20.5.4, optional for object types +template +class optional; + +// 20.5.5, optional for lvalue reference types +template +class optional; + +// workaround: std utility functions aren't constexpr yet +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type& t) noexcept { + return static_cast(t); +} + +template +inline constexpr T&& constexpr_forward( + typename std::remove_reference::type&& t) noexcept { + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template +inline constexpr typename std::remove_reference::type&& constexpr_move( + T&& t) noexcept { + return static_cast::type&&>(t); +} + +#if defined NDEBUG +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) \ + ((CHECK) ? (EXPR) : ([] { assert(!#CHECK); }(), (EXPR))) +#endif + +namespace detail_ { + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof { + template + constexpr static bool has_overload(...) 
{ + return false; + } + + template ().operator&())> + constexpr static bool has_overload(bool) { + return true; + } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) { + return &ref; +} + +template )> +T* static_addressof(T& ref) { + return std::addressof(ref); +} + +// the call to convert(b) has return type A and converts b to type A iff b +// decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { + return v; +} + +namespace swap_ns { +using std::swap; + +template +void adl_swap(T& t, T& u) noexcept(noexcept(swap(t, u))) { + swap(t, u); +} + +} // namespace swap_ns + +} // namespace detail_ + +constexpr struct trivial_init_t { +} trivial_init{}; + +// 20.5.6, In-place construction +constexpr struct in_place_t { +} in_place{}; + +// 20.5.7, Disengaged state indicator +struct nullopt_t { + struct init {}; + constexpr explicit nullopt_t(init) {} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { + public: + explicit bad_optional_access(const std::string& what_arg) + : std::logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) + : std::logic_error{what_arg} {} +}; + +template +union storage_t { + unsigned char dummy_; + T value_; + + constexpr storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~storage_t() {} +}; + +template +union constexpr_storage_t { + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t(trivial_init_t) noexcept : dummy_(){}; + + template + constexpr constexpr_storage_t(Args&&... args) + : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + +template +struct optional_base { + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init){}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + explicit optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { + if (init_) + storage_.value_.T::~T(); + } +}; + +template +struct constexpr_optional_base { + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept + : init_(false), storage_(trivial_init){}; + + explicit constexpr constexpr_optional_base(const T& v) + : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) + : init_(true), storage_(constexpr_move(v)) {} + + template + explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base( + in_place_t, + std::initializer_list il, + Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type>>::type; + +template +class optional : private OptionalBase { + static_assert( + !std::is_same::type, nullopt_t>::value, + "bad T"); + static_assert( + !std::is_same::type, in_place_t>::value, + "bad T"); + + constexpr bool initialized() const noexcept { + return OptionalBase::init_; + } + typename std::remove_const::type* dataptr() { + return std::addressof(OptionalBase::storage_.value_); + } + constexpr const T* dataptr() const { + return detail_::static_addressof(OptionalBase::storage_.value_); + } + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { + return OptionalBase::storage_.value_; + } +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { + return OptionalBase::storage_.value_; + } +#else + T& contained_val() & { + return OptionalBase::storage_.value_; + } + T&& contained_val() && { + return std::move(OptionalBase::storage_.value_); + } +#endif +#else + constexpr const T& contained_val() const { + return OptionalBase::storage_.value_; + } + T& contained_val() { + return OptionalBase::storage_.value_; + } +#endif + + void clear() noexcept { + if (initialized()) + dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept( + noexcept(T(std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept( + noexcept(T(il, std::forward(args)...))) { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + + public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase(){}; + constexpr optional(nullopt_t) noexcept : OptionalBase(){}; + + optional(const optional& rhs) : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept( + std::is_nothrow_move_constructible::value) + : OptionalBase() { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template < + class U, + class... Args, + TR2_OPTIONAL_REQUIRES(std::is_constructible>)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional( + in_place_t, + std::initializer_list il, + Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) noexcept( + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value) { + if (initialized() == true && rhs.initialized() == false) + clear(); + else if (initialized() == false && rhs.initialized() == true) + initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) + contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) -> typename std::enable_if< + std::is_same::type, T>::value, + optional&>::type { + if (initialized()) { + contained_val() = std::forward(v); + } else { + initialize(std::forward(v)); + } + return *this; + } + + template + void emplace(Args&&... args) { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept( + std::is_nothrow_move_constructible::value&& noexcept( + detail_::swap_ns::adl_swap(std::declval(), std::declval()))) { + if (initialized() == true && rhs.initialized() == false) { + rhs.initialize(std::move(**this)); + clear(); + } else if (initialized() == false && rhs.initialized() == true) { + initialize(std::move(*rhs)); + rhs.clear(); + } else if (initialized() == true && rhs.initialized() == true) { + using std::swap; + swap(**this, *rhs); + } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { + return initialized(); + } + constexpr bool has_value() const noexcept { + return initialized(); + } + + constexpr T const* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator*() & { + assert(initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator*() && { + assert(initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) + throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +#else + + T* operator->() { + assert(initialized()); + return dataptr(); + } + + constexpr T const& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + T& operator*() { + assert(initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() + ? 
contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() + ? contained_val() + : (throw bad_optional_access("bad optional access"), contained_val()); + } + +#endif + +#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#else + + template + T value_or(V&& v) && { + return *this + ? constexpr_move(const_cast&>(*this).contained_val()) + : detail_::convert(constexpr_forward(v)); + } + +#endif + +#else + + template + constexpr T value_or(V&& v) const { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +#endif + + // 20.6.3.6, modifiers + void reset() noexcept { + clear(); + } +}; + +template +class optional { + static_assert(!std::is_same::value, "bad T"); + static_assert(!std::is_same::value, "bad T"); + T* ref; + + public: + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept + : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + std::is_same::type, optional>::value, + optional&>::type { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept -> typename std::enable_if< + !std::is_same::type, optional>::value, + optional&>::type = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + void swap(optional& rhs) noexcept { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref + : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const { + return *this ? **this + : detail_::convert::type>( + constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { + ref = nullptr; + } +}; + +template +class optional { + static_assert(sizeof(T) == 0, "optional rvalue references disallowed"); +}; + +// 20.5.8, Relational operators +template +constexpr bool operator==(const optional& x, const optional& y) { + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template +constexpr bool operator!=(const optional& x, const optional& y) { + return !(x == y); +} + +template +constexpr bool operator<(const optional& x, const optional& y) { + return (!y) ? false : (!x) ? true : *x < *y; +} + +template +constexpr bool operator>(const optional& x, const optional& y) { + return (y < x); +} + +template +constexpr bool operator<=(const optional& x, const optional& y) { + return !(y < x); +} + +template +constexpr bool operator>=(const optional& x, const optional& y) { + return !(x < y); +} + +// 20.5.9, Comparison with nullopt +template +constexpr bool operator==(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator==(nullopt_t, const optional& x) noexcept { + return (!x); +} + +template +constexpr bool operator!=(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator!=(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<(const optional&, nullopt_t) noexcept { + return false; +} + +template +constexpr bool operator<(nullopt_t, const optional& x) noexcept { + return bool(x); +} + +template +constexpr bool operator<=(const optional& x, nullopt_t) noexcept { + return (!x); +} + +template +constexpr bool operator<=(nullopt_t, const optional&) noexcept { + return true; +} + +template +constexpr bool operator>(const optional& x, nullopt_t) noexcept { + return bool(x); +} + +template +constexpr bool operator>(nullopt_t, const optional&) noexcept { + return false; +} + +template +constexpr bool operator>=(const optional&, nullopt_t) noexcept { + return true; +} + +template +constexpr bool operator>=(nullopt_t, const optional& x) noexcept { + return (!x); +} + +// 20.5.10, Comparison with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? 
*x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template +constexpr bool operator==(const optional& x, const T& v) { + return bool(x) ? *x == v : false; +} + +template +constexpr bool operator==(const T& v, const optional& x) { + return bool(x) ? v == *x : false; +} + +template +constexpr bool operator!=(const optional& x, const T& v) { + return bool(x) ? *x != v : true; +} + +template +constexpr bool operator!=(const T& v, const optional& x) { + return bool(x) ? v != *x : true; +} + +template +constexpr bool operator<(const optional& x, const T& v) { + return bool(x) ? *x < v : true; +} + +template +constexpr bool operator>(const T& v, const optional& x) { + return bool(x) ? v > *x : true; +} + +template +constexpr bool operator>(const optional& x, const T& v) { + return bool(x) ? *x > v : false; +} + +template +constexpr bool operator<(const T& v, const optional& x) { + return bool(x) ? v < *x : false; +} + +template +constexpr bool operator>=(const optional& x, const T& v) { + return bool(x) ? *x >= v : false; +} + +template +constexpr bool operator<=(const T& v, const optional& x) { + return bool(x) ? v <= *x : false; +} + +template +constexpr bool operator<=(const optional& x, const T& v) { + return bool(x) ? *x <= v : true; +} + +template +constexpr bool operator>=(const T& v, const optional& x) { + return bool(x) ? v >= *x : true; +} + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) { + x.swap(y); +} + +template +constexpr optional::type> make_optional(T&& v) { + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) { + return optional(v.get()); +} + +} // namespace multipy + +namespace std { +template +struct hash> { + typedef typename hash::result_type result_type; + typedef multipy::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } +}; + +template +struct hash> { + typedef typename hash::result_type result_type; + typedef multipy::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } +}; +} // namespace std + +#undef TR2_OPTIONAL_REQUIRES +#undef TR2_OPTIONAL_ASSERTED_EXPRESSION + +#endif //___OPTIONAL_HPP___ diff --git a/torch/csrc/deploy/interpreter/builtin_registry.cpp b/torch/csrc/deploy/interpreter/builtin_registry.cpp index a34768c2a009..611def2e7490 100644 --- a/torch/csrc/deploy/interpreter/builtin_registry.cpp +++ b/torch/csrc/deploy/interpreter/builtin_registry.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include namespace torch { @@ -44,7 +45,7 @@ BuiltinRegistryItem::BuiltinRegistryItem( fprintf( stderr, - "torch::deploy builtin %s contains %d modules\n", + "torch::deploy builtin %s contains %u modules\n", name, numModules); } @@ -65,6 +66,7 @@ void BuiltinRegistry::runPreInitialization() { const char* metaPathSetupTemplate = R"PYTHON( import sys +from importlib.metadata import DistributionFinder, Distribution # We need to register a custom meta path finder because we are registering # `torch._C` as a builtin module. # @@ -73,12 +75,36 @@ import sys # are top-level imports. Since `torch._C` is a submodule of `torch`, the # BuiltinImporter skips it. class F: + MODULES = {<<>>} + def find_spec(self, fullname, path, target=None): - if fullname in [<<>>]: + if fullname in self.MODULES: # Load this module using `BuiltinImporter`, but set `path` to None # in order to trick it into loading our module. return sys.meta_path[1].find_spec(fullname, path=None, target=None) return None + + def find_distributions(self, context=DistributionFinder.Context()): + modules = {"torch"} | self.MODULES + # Insert dummy distribution records for each builtin module so + # importlib.metadata.version(...) works. + if context.name is None: + for name in modules: + yield DummyDistribution(name) + if context.name in modules: + yield DummyDistribution(context.name) + +class DummyDistribution(Distribution): + def __init__(self, name): + self._metadata = { + "Name": name, + "Version": "0.0.1+fake_multipy", + } + + @property + def metadata(self): + return self._metadata + sys.meta_path.insert(0, F()) )PYTHON"; @@ -86,9 +112,9 @@ void BuiltinRegistry::runPostInitialization() { TORCH_INTERNAL_ASSERT(Py_IsInitialized()); std::string metaPathSetupScript(metaPathSetupTemplate); std::string replaceKey = "<<>>"; - auto itr = metaPathSetupScript.find(replaceKey); - if (itr != std::string::npos) { - metaPathSetupScript.replace(itr, replaceKey.size(), getBuiltinModulesCSV()); + size_t pos = metaPathSetupScript.find(replaceKey); + if (pos != std::string::npos) { + metaPathSetupScript.replace(pos, replaceKey.size(), getBuiltinModulesCSV()); } int r = PyRun_SimpleString(metaPathSetupScript.c_str()); TORCH_INTERNAL_ASSERT(r == 0); @@ -109,8 +135,8 @@ BuiltinRegistryItem* BuiltinRegistry::getItem(const std::string& name) { : get()->items_[itr->second].get(); } -int BuiltinRegistry::totalNumModules() { - int tot = 0; +unsigned BuiltinRegistry::totalNumModules() { + unsigned tot = 0; for (const auto& itemptr : get()->items_) { tot += itemptr->numModules; } @@ -119,7 +145,7 @@ int BuiltinRegistry::totalNumModules() { struct _frozen* BuiltinRegistry::getAllFrozenModules() { /* Allocate new memory for the combined table */ - int totNumModules = totalNumModules(); + size_t totNumModules = totalNumModules(); struct _frozen* p = nullptr; if (totNumModules > 0 && totNumModules <= SIZE_MAX / sizeof(struct _frozen) - 1) { @@ -134,7 +160,7 @@ struct _frozen* BuiltinRegistry::getAllFrozenModules() { memset(&p[0], 0, sizeof(p[0])); /* Copy the tables into the new 
memory */ - int off = 0; + unsigned off = 0; for (const auto& itemptr : items()) { if (itemptr->numModules > 0) { memcpy( diff --git a/torch/csrc/deploy/interpreter/builtin_registry.h b/torch/csrc/deploy/interpreter/builtin_registry.h index da7eb372de84..5f2726db67b6 100644 --- a/torch/csrc/deploy/interpreter/builtin_registry.h +++ b/torch/csrc/deploy/interpreter/builtin_registry.h @@ -22,7 +22,7 @@ * BuiltinRegisterer object. The constructor of BuiltinRegisterer does the real * registration work. */ -#include +#include #include #include #include @@ -49,7 +49,7 @@ struct BuiltinRegistryItem { std::vector>&& _builtinModules); const char* name; const struct _frozen* frozenModules; - int numModules; + unsigned numModules; std::vector> builtinModules; }; @@ -77,7 +77,7 @@ class BuiltinRegistry { static const std::vector>& items() { return get()->items_; } - static int totalNumModules(); + static unsigned totalNumModules(); static BuiltinRegistry* get(); static BuiltinRegistryItem* getItem(const std::string& name); static std::vector> getAllBuiltinModules(); diff --git a/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp b/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp index b8af5de3db20..2a89a96c623d 100644 --- a/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp +++ b/torch/csrc/deploy/interpreter/import_find_sharedfuncptr.cpp @@ -1,4 +1,5 @@ #include +#include #include using torch::deploy::CustomLibrary; diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.cpp b/torch/csrc/deploy/interpreter/interpreter_impl.cpp index 1ff30f0afbb0..2af33582aa6d 100644 --- a/torch/csrc/deploy/interpreter/interpreter_impl.cpp +++ b/torch/csrc/deploy/interpreter/interpreter_impl.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -219,8 +220,8 @@ struct __attribute__((visibility("hidden"))) ConcreteInterpreterImpl } void setFindModule( - std::function(const std::string&)> find_module) - override { + std::function(const std::string&)> + find_module) override { std::function wrapped_find_module = [=](const std::string& name) -> py::object { auto r = find_module(name); diff --git a/torch/csrc/deploy/interpreter/interpreter_impl.h b/torch/csrc/deploy/interpreter/interpreter_impl.h index 10a1489740ec..a2dd57e9beeb 100644 --- a/torch/csrc/deploy/interpreter/interpreter_impl.h +++ b/torch/csrc/deploy/interpreter/interpreter_impl.h @@ -3,6 +3,7 @@ #include #include #include +#include /* Torch Deploy intentionally embeds multiple copies of c++ libraries providing python bindings necessary for torch::deploy users in the same @@ -15,8 +16,8 @@ the client application. It is safe to throw exception types that are defined once in - the context of the client application, such as c10::Error, which is defined - in libtorch, which isn't duplicated in torch::deploy interpreters. + the context of the client application, such as std::runtime_error, + which isn't duplicated in torch::deploy interpreters. ==> Use TORCH_DEPLOY_TRY, _SAFE_CATCH_RETHROW around _ALL_ torch::deploy APIs @@ -30,20 +31,17 @@ */ #define TORCH_DEPLOY_TRY try { -#define TORCH_DEPLOY_SAFE_CATCH_RETHROW \ - } \ - catch (std::exception & err) { \ - throw c10::Error( \ - std::string( \ - "Exception Caught inside torch::deploy embedded library: \n") + \ - err.what(), \ - ""); \ - } \ - catch (...) 
{ \ - throw c10::Error( \ - std::string( \ - "Unknown Exception Caught inside torch::deploy embedded library"), \ - ""); \ +#define TORCH_DEPLOY_SAFE_CATCH_RETHROW \ + } \ + catch (std::exception & err) { \ + throw std::runtime_error( \ + std::string( \ + "Exception Caught inside torch::deploy embedded library: \n") + \ + err.what()); \ + } \ + catch (...) { \ + throw std::runtime_error(std::string( \ + "Unknown Exception Caught inside torch::deploy embedded library")); \ } namespace torch { namespace deploy { @@ -132,7 +130,7 @@ struct InterpreterSessionImpl { struct InterpreterImpl { virtual InterpreterSessionImpl* acquireSession() = 0; virtual void setFindModule( - std::function(const std::string&)> + std::function(const std::string&)> find_module) = 0; virtual ~InterpreterImpl() = default; // this will uninitialize python }; diff --git a/torch/csrc/deploy/loader.cpp b/torch/csrc/deploy/loader.cpp index f03a2d299a55..ab4d0c7c329e 100644 --- a/torch/csrc/deploy/loader.cpp +++ b/torch/csrc/deploy/loader.cpp @@ -53,8 +53,8 @@ // Get PAGE_SIZE and PAGE_MASK. #include -#include #include +#include #include #include @@ -300,15 +300,15 @@ struct __attribute__((visibility("hidden"))) SystemLibraryImpl SystemLibraryImpl(void* handle, bool steal) : handle_(handle), own_handle_(steal && handle != RTLD_DEFAULT) {} - at::optional sym(const char* name) const override { + multipy::optional sym(const char* name) const override { void* r = dlsym(handle_, name); if (!r) { - return at::nullopt; + return multipy::nullopt; } return (Elf64_Addr)r; } - at::optional tls_sym(const char* name) const override; + multipy::optional tls_sym(const char* name) const override; ~SystemLibraryImpl() override { if (own_handle_) { @@ -534,11 +534,11 @@ struct ElfDynamicInfo { } } - at::optional sym( + multipy::optional sym( const char* name, GnuHash* precomputed_hash = nullptr) const { if (!gnu_bucket_) { - return at::nullopt; // no hashtable was loaded + return multipy::nullopt; // no hashtable was loaded } GnuHash hash_obj = precomputed_hash ? *precomputed_hash : GnuHash(name); auto hash = hash_obj.hash; @@ -551,12 +551,12 @@ struct ElfDynamicInfo { const uint32_t h2 = (hash >> gnu_shift2_) % kBloomMaskBits; if ((1 & (bloom_word >> h1) & (bloom_word >> h2)) != 1) { - return at::nullopt; + return multipy::nullopt; } uint32_t sym_idx = gnu_bucket_[hash % gnu_nbucket_]; if (sym_idx == 0) { - return at::nullopt; + return multipy::nullopt; } uint32_t chain_value = 0; @@ -574,12 +574,12 @@ struct ElfDynamicInfo { ((ELF64_ST_TYPE(sym->st_info) == STT_TLS) ? 0 : load_bias_); } // symbol isn't defined - return at::nullopt; + return multipy::nullopt; } } ++sym_idx; } while ((chain_value & 1) == 0); - return at::nullopt; + return multipy::nullopt; } }; @@ -613,7 +613,7 @@ struct AlreadyLoadedSymTable { dyninfo_.initialize_from_dynamic_section(name, dynamic, load_bias, true); } - at::optional sym(const char* name) { + multipy::optional sym(const char* name) { return dyninfo_.sym(name); } }; @@ -626,8 +626,8 @@ static int iterate_cb(struct dl_phdr_info* info, size_t size, void* data) { // with a normal dlsym call. Instead we iterate through all loaded libraries and // check their symbol tables for the symbol. The value of the symbol is the TLS // offset. When we find the library we also get the module id. 
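// Illustrative sketch (not part of the patch): the dl_iterate_phdr() walk that
// the comment above relies on, reduced to a standalone example. The callback
// runs once per object the process has mapped; returning non-zero stops the
// iteration, so a real symbol lookup would return 1 as soon as a match is
// found. Assumes glibc (<link.h>); the names below are hypothetical.
#include <link.h>
#include <cstdio>

static int printLoadedObject(struct dl_phdr_info* info, size_t /*size*/, void* /*data*/) {
  std::printf(
      "loaded object: %s (load bias %#lx, %u program headers)\n",
      info->dlpi_name,
      static_cast<unsigned long>(info->dlpi_addr),
      static_cast<unsigned>(info->dlpi_phnum));
  return 0; // keep iterating over the remaining loaded objects
}

void listLoadedObjects() {
  dl_iterate_phdr(printLoadedObject, /*data=*/nullptr);
}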
-at::optional slow_find_tls_symbol_offset(const char* sym_name) { - at::optional result = at::nullopt; +multipy::optional slow_find_tls_symbol_offset(const char* sym_name) { + multipy::optional result = multipy::nullopt; std::function cb = [&](struct dl_phdr_info* info, size_t size) { // std::cout << "SEARCHING .. " << info->dlpi_name << "\n"; @@ -650,10 +650,11 @@ at::optional slow_find_tls_symbol_offset(const char* sym_name) { return result; } -at::optional SystemLibraryImpl::tls_sym(const char* name) const { +multipy::optional SystemLibraryImpl::tls_sym(const char* name) const { if (!sym(name)) { - return at::nullopt; // before we do a bunch of slow lookups to find the - // module_id, check that this even defines the symbol + return multipy::nullopt; // before we do a bunch of slow lookups to find the + // module_id, check that this even defines the + // symbol } if (handle_ == RTLD_DEFAULT) { return slow_find_tls_symbol_offset(name); @@ -675,7 +676,7 @@ at::optional SystemLibraryImpl::tls_sym(const char* name) const { "failed to query dlinfo for module_id"); return TLSIndex{module_id, *r}; } - return at::nullopt; + return multipy::nullopt; } // dlopen does not accept additional search paths as an argument. @@ -966,7 +967,7 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl dyninfo_.needed_); } - at::optional lookup_symbol(Elf64_Xword r_info) { + multipy::optional lookup_symbol(Elf64_Xword r_info) { const uint32_t r_type = ELF64_R_TYPE(r_info); const uint32_t r_sym = ELF64_R_SYM(r_info); @@ -999,10 +1000,10 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl name_.c_str(), sym_name); } - return at::nullopt; + return multipy::nullopt; } - at::optional tls_lookup_symbol(Elf64_Xword r_info) { + multipy::optional tls_lookup_symbol(Elf64_Xword r_info) { const uint32_t r_sym = ELF64_R_SYM(r_info); if (r_sym == 0) { @@ -1030,7 +1031,7 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl name_.c_str(), sym_name); } - return at::nullopt; + return multipy::nullopt; } void relocate_one(const Elf64_Rela& reloc) { @@ -1177,16 +1178,16 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl f(argc_, argv_, environ); } - at::optional sym(const char* name) const override { + multipy::optional sym(const char* name) const override { return dyninfo_.sym(name); } - at::optional tls_sym(const char* name) const override { + multipy::optional tls_sym(const char* name) const override { auto r = dyninfo_.sym(name); if (r) { return TLSIndex{module_id(), *r}; } - return at::nullopt; + return multipy::nullopt; } void* tls_addr(size_t offset) { diff --git a/torch/csrc/deploy/loader.h b/torch/csrc/deploy/loader.h index eeff1a30174e..9e5a7fd4571d 100644 --- a/torch/csrc/deploy/loader.h +++ b/torch/csrc/deploy/loader.h @@ -1,7 +1,7 @@ #pragma once -#include #include #include +#include #include namespace torch { @@ -19,8 +19,8 @@ struct TLSIndex { struct SymbolProvider { SymbolProvider() = default; - virtual at::optional sym(const char* name) const = 0; - virtual at::optional tls_sym(const char* name) const = 0; + virtual multipy::optional sym(const char* name) const = 0; + virtual multipy::optional tls_sym(const char* name) const = 0; SymbolProvider(const SymbolProvider&) = delete; SymbolProvider& operator=(const SymbolProvider&) = delete; virtual ~SymbolProvider() = default; diff --git a/torch/csrc/deploy/mem_file.h b/torch/csrc/deploy/mem_file.h index c50889f8353b..df4fe941ca58 100644 --- a/torch/csrc/deploy/mem_file.h +++ b/torch/csrc/deploy/mem_file.h @@ -1,9 
+1,9 @@ #pragma once -#include #include #include #include +#include #include #include #include @@ -20,18 +20,21 @@ namespace deploy { struct MemFile { explicit MemFile(const char* filename_) : fd_(0), mem_(nullptr), n_bytes_(0) { fd_ = open(filename_, O_RDONLY); - TORCH_CHECK(fd_ != -1, "failed to open {}: {}", filename_, strerror(errno)); + MULTIPY_CHECK( + fd_ != -1, "failed to open {}: {}" + filename_ + strerror(errno)); // NOLINTNEXTLINE struct stat s; if (-1 == fstat(fd_, &s)) { close(fd_); // destructors don't run during exceptions - TORCH_CHECK(false, "failed to stat {}: {}", filename_, strerror(errno)); + MULTIPY_CHECK( + false, "failed to stat {}: {}" + filename_ + strerror(errno)); } n_bytes_ = s.st_size; mem_ = mmap(nullptr, n_bytes_, PROT_READ, MAP_SHARED, fd_, 0); if (MAP_FAILED == mem_) { close(fd_); - TORCH_CHECK(false, "failed to mmap {}: {}", filename_, strerror(errno)); + MULTIPY_CHECK( + false, "failed to mmap {}: {}" + filename_ + strerror(errno)); } } MemFile(const MemFile&) = delete; diff --git a/torch/csrc/deploy/remove_dt_needed.cpp b/torch/csrc/deploy/remove_dt_needed.cpp index 5f4bb28c7c29..8b1cad535814 100644 --- a/torch/csrc/deploy/remove_dt_needed.cpp +++ b/torch/csrc/deploy/remove_dt_needed.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #define ERROR(msg_fmt, ...) \ @@ -47,7 +48,7 @@ int main(int argc, const char** argv) { auto program_headers = (Elf64_Phdr*)(data + header->e_phoff); auto n_program_headers = header->e_phnum; Elf64_Dyn* dynamic = nullptr; - for (size_t i = 0; i < n_program_headers; ++i) { + for (const auto i : c10::irange(n_program_headers)) { const Elf64_Phdr* phdr = &program_headers[i]; if (phdr->p_type == PT_DYNAMIC) { dynamic = reinterpret_cast(data + phdr->p_offset); diff --git a/torch/csrc/deploy/test_deploy.cpp b/torch/csrc/deploy/test_deploy.cpp index 840720cc01f8..780937a51e7c 100644 --- a/torch/csrc/deploy/test_deploy.cpp +++ b/torch/csrc/deploy/test_deploy.cpp @@ -182,13 +182,14 @@ TEST(TorchpyTest, ErrorsReplicatingObj) { auto obj = session1.fromMovable(replicatedObj); // should throw an error when trying to access obj from different session // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(session2.createMovable(obj), c10::Error); + EXPECT_THROW(session2.createMovable(obj), std::runtime_error); try { session2.createMovable(obj); - } catch (c10::Error& error) { + } catch (std::runtime_error& error) { EXPECT_TRUE( - error.msg().find( - "Cannot create movable from an object that lives in different session") != + std::string(error.what()) + .find( + "Cannot create movable from an object that lives in different session") != std::string::npos); } } @@ -197,15 +198,15 @@ TEST(TorchpyTest, ThrowsSafely) { // See explanation in deploy.h torch::deploy::InterpreterManager manager(3); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(manager.loadPackage("some garbage path"), c10::Error); + EXPECT_THROW(manager.loadPackage("some garbage path"), std::runtime_error); torch::deploy::Package p = manager.loadPackage(path("SIMPLE", simple)); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(p.loadPickle("some other", "garbage path"), c10::Error); + EXPECT_THROW(p.loadPickle("some other", "garbage path"), std::runtime_error); auto model = p.loadPickle("model", "model.pkl"); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(model(at::IValue("unexpected input")), c10::Error); + EXPECT_THROW(model(at::IValue("unexpected 
input")), std::runtime_error); } TEST(TorchpyTest, AcquireMultipleSessionsInTheSamePackage) { @@ -238,7 +239,7 @@ TEST(TorchpyTest, TensorSharingNotAllowed) { auto t = obj.toIValue().toTensor(); // try to feed it to the other interpreter, should error // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - ASSERT_THROW(I1.global("torch", "sigmoid")({t}), c10::Error); + ASSERT_THROW(I1.global("torch", "sigmoid")({t}), std::runtime_error); } TEST(TorchpyTest, TaggingRace) { @@ -259,7 +260,7 @@ TEST(TorchpyTest, TaggingRace) { try { I.fromIValue(t); success++; - } catch (const c10::Error& e) { + } catch (const std::runtime_error& e) { failed++; } } @@ -279,7 +280,7 @@ TEST(TorchpyTest, DisarmHook) { torch::deploy::InterpreterManager m(1); auto I = m.acquireOne(); // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - ASSERT_THROW(I.fromIValue(t), c10::Error); // NOT a segfault + ASSERT_THROW(I.fromIValue(t), std::runtime_error); // NOT a segfault } TEST(TorchpyTest, RegisterModule) { @@ -291,6 +292,7 @@ TEST(TorchpyTest, RegisterModule) { } } +#ifdef FBCODE_CAFFE2 TEST(TorchpyTest, FxModule) { size_t nthreads = 3; torch::deploy::InterpreterManager manager(nthreads); @@ -317,6 +319,7 @@ TEST(TorchpyTest, FxModule) { ASSERT_TRUE(ref_output.equal(outputs[i])); } } +#endif // Moving a tensor between interpreters should share the underlying storage. TEST(TorchpyTest, TensorSerializationSharing) { @@ -448,6 +451,18 @@ result = torch.Tensor([1,2,3]) EXPECT_TRUE(w_grad0.equal(w_grad1)); } +TEST(TorchpyTest, ImportlibMetadata) { + torch::deploy::InterpreterManager m(1); + m.registerModuleSource("importlib_test", R"PYTHON( +from importlib.metadata import version + +result = version("torch") +)PYTHON"); + auto I = m.allInstances()[0].acquireSession(); + auto ver = I.global("importlib_test", "result").toIValue().toString(); + ASSERT_EQ(ver->string(), "0.0.1+fake_multipy"); +} + // OSS build does not have bultin numpy support yet. Use this flag to guard the // test case. #if HAS_NUMPY @@ -479,6 +494,42 @@ TEST(TorchpyTest, TestPyYAML) { } #endif +TEST(TorchpyTest, PrintInstruction) { + const auto jit_script_with_print = R"JIT( + def forward(self, a): + print(a) + return a + a + )JIT"; + + auto input = torch::autograd::make_variable(at::randn({2, 3})); + auto expected_forward = input + input; + + auto module = std::make_shared( + "Module", std::make_shared()); + module->define(jit_script_with_print); + + std::vector inputs{at::IValue(input)}; + + // Checking that a module containing prim::Print() works fine. + auto result1 = (*module)(inputs); + EXPECT_TRUE(result1.toTensor().equal(expected_forward)); + + { + auto interpreterManager = + std::make_shared(1); + + // Checking that a module containing prim::Print() still works fine + // after Python environment was created. + auto result2 = (*module)(inputs); + EXPECT_TRUE(result2.toTensor().equal(expected_forward)); + } + + // Checking that a module containing prim::Print() still works fine + // after Python environment was created and then destroyed. 
+ auto result3 = (*module)(inputs); + EXPECT_TRUE(result3.toTensor().equal(expected_forward)); +} + int main(int argc, char* argv[]) { ::testing::InitGoogleTest(&argc, argv); int rc = RUN_ALL_TESTS(); diff --git a/torch/csrc/deploy/test_deploy_gpu.cpp b/torch/csrc/deploy/test_deploy_gpu.cpp index 8fa154b80709..48660c79fefa 100644 --- a/torch/csrc/deploy/test_deploy_gpu.cpp +++ b/torch/csrc/deploy/test_deploy_gpu.cpp @@ -67,6 +67,7 @@ TEST(TorchDeployGPUTest, UsesDistributed) { } } +#ifdef FBCODE_CAFFE2 TEST(TorchDeployGPUTest, TensorRT) { if (!torch::cuda::is_available()) { GTEST_SKIP(); @@ -85,6 +86,7 @@ TEST(TorchDeployGPUTest, TensorRT) { output.allclose(model(at::IValue{input}).toIValue().toTensor())); } } +#endif // OSS build does not have bultin numpy support yet. Use this flag to guard the // test case. diff --git a/torch/csrc/deploy/test_deploy_missing_interpreter.cpp b/torch/csrc/deploy/test_deploy_missing_interpreter.cpp index 8ac602a3f2fc..b47f4556ad78 100644 --- a/torch/csrc/deploy/test_deploy_missing_interpreter.cpp +++ b/torch/csrc/deploy/test_deploy_missing_interpreter.cpp @@ -10,5 +10,5 @@ int main(int argc, char* argv[]) { TEST(TorchDeployMissingInterpreter, Throws) { // NOLINTNEXTLINE(hicpp-avoid-goto,cppcoreguidelines-avoid-goto) - EXPECT_THROW(torch::deploy::InterpreterManager(1), c10::Error); + EXPECT_THROW(torch::deploy::InterpreterManager(1), std::runtime_error); } diff --git a/torch/csrc/deploy/unity/xar_environment.cpp b/torch/csrc/deploy/unity/xar_environment.cpp index 3ff233b0c420..4bb764374525 100644 --- a/torch/csrc/deploy/unity/xar_environment.cpp +++ b/torch/csrc/deploy/unity/xar_environment.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,7 @@ bool _fileExists(const std::string& filePath) { } void XarEnvironment::setupPythonApp() { - TORCH_CHECK( + MULTIPY_CHECK( !alreadySetupPythonApp_, "Already setup the python application. It should only been done once!"); @@ -67,7 +68,8 @@ void XarEnvironment::setupPythonApp() { constexpr const char* SECTION_NAME = ".torch_deploy_payload.unity"; ElfFile elfFile(exePath_.c_str()); auto payloadSection = elfFile.findSection(SECTION_NAME); - TORCH_CHECK(payloadSection != at::nullopt, "Missing the payload section"); + MULTIPY_CHECK( + payloadSection != multipy::nullopt, "Missing the payload section"); const char* pythonAppPkgStart = payloadSection->start; auto pythonAppPkgSize = payloadSection->len; LOG(INFO) << "Embedded binary size " << pythonAppPkgSize; @@ -107,23 +109,26 @@ void XarEnvironment::setupPythonApp() { * past runs. It should be pretty safe to discard them. 
*/ std::string rmCmd = fmt::format("rm -rf {}", pythonAppDir_); - TORCH_CHECK(system(rmCmd.c_str()) == 0, "Fail to remove the directory."); + MULTIPY_CHECK(system(rmCmd.c_str()) == 0, "Fail to remove the directory."); // recreate the directory auto r = mkdir(pythonAppDir_.c_str(), 0777); - TORCH_CHECK(r == 0, "Failed to create directory: ", strerror(errno)); + MULTIPY_CHECK(r == 0, "Failed to create directory: " + strerror(errno)); std::string pythonAppArchive = std::string(pythonAppDir_) + "/python_app.xar"; auto fp = fopen(pythonAppArchive.c_str(), "wb"); - TORCH_CHECK(fp != nullptr, "Fail to create file: ", strerror(errno)); + MULTIPY_CHECK(fp != nullptr, "Fail to create file: " + strerror(errno)); auto written = fwrite(pythonAppPkgStart, 1, pythonAppPkgSize, fp); - TORCH_CHECK(written == pythonAppPkgSize, "Expected written == size"); + MULTIPY_CHECK(written == pythonAppPkgSize, "Expected written == size"); fclose(fp); std::string extractCommand = fmt::format( "unsquashfs -o 4096 -d {} {}", pythonAppRoot_, pythonAppArchive); r = system(extractCommand.c_str()); - TORCH_CHECK(r == 0, "Fail to extract the python package"); + MULTIPY_CHECK( + r == 0, + "Fail to extract the python package" + std::to_string(r) + + extractCommand.c_str()); alreadySetupPythonApp_ = true; } @@ -143,12 +148,9 @@ void XarEnvironment::preloadSharedLibraries() { << " does not exist in the python app root, skip loading it"; continue; } - TORCH_CHECK( + MULTIPY_CHECK( dlopen(preloadList[i], RTLD_GLOBAL | RTLD_LAZY) != nullptr, - "Fail to open the shared library ", - preloadList[i], - ": ", - dlerror()); + "Fail to open the shared library " + preloadList[i] + ": " + dlerror()); } } diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp index 369f9f1242b3..c82d940cf3d3 100644 --- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp +++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -121,9 +122,9 @@ std::unique_ptr RpcWithProfilingResp::fromMessage( int profiledEventsSize = tupleElements[2].toInt(); std::vector remoteEvents; remoteEvents.reserve(profiledEventsSize); - for (int i = kProfileEventsStartIdx; - i < kProfileEventsStartIdx + profiledEventsSize; - ++i) { + for (const auto i : c10::irange( + kProfileEventsStartIdx, + kProfileEventsStartIdx + profiledEventsSize)) { // NOLINTNEXTLINE(clang-diagnostic-sign-compare) TORCH_CHECK(i < tupleElements.size()); // Reconstruct remote event from the ivalues. diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 0c1dd97a1468..568c23ef7a20 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -36,9 +36,12 @@ std::string getNcclVersion() { if (status != ncclSuccess || version < 100) { versionString = "Unknown NCCL version"; } else { - auto ncclMajor = version / 1000; - auto ncclMinor = (version % 1000) / 100; - auto ncclPatch = version % (ncclMajor * 1000 + ncclMinor * 100); + // NCCL changed version coding starting 2.9 + const int majorBase = version < 2900 ? 1000 : 10000; + const int minorBase = 100; + auto ncclMajor = version / majorBase; + auto ncclMinor = (version % majorBase) / minorBase; + auto ncclPatch = version % (ncclMajor * majorBase + ncclMinor * minorBase); versionString = std::to_string(ncclMajor) + "." + std::to_string(ncclMinor) + "." 
+ std::to_string(ncclPatch); } diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 9dabc0c8c3fc..7ca54d167ead 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -25,7 +25,8 @@ const inline char* getNcclErrorDetailStr(ncclResult_t error, c10::optional #include +#include namespace c10d { @@ -49,7 +50,8 @@ std::string opTypeToString(OpType opType) { return "UNKNOWN"; } -bool isP2POp(OpType opType) { +bool isP2POp(OpType opType, bool batchP2P /*= false*/) { + if (batchP2P) return false; return opType == OpType::SEND || opType == OpType::RECV || opType == OpType::RECVANYSOURCE; } @@ -76,7 +78,7 @@ ProcessGroup::Work::Work( inputs.emplace_back(tensor); } } - recordingFunction->before(profilingTitle, inputs); + recordingFunction->before(profilingTitle, c10::ArrayRef(inputs.data(), inputs.size())); std::function end_handler = [recordingFunction]() { recordingFunction->end(); }; @@ -174,10 +176,14 @@ void ProcessGroup::Work::finishAndThrow(std::exception_ptr exception) { } ProcessGroup::ProcessGroup(int rank, int size) - : rank_(rank), size_(size), dist_debug_level_(parseDistDebugLevel()) { + : rank_(rank), size_(size), dist_debug_level_(debug_level()) { C10_LOG_API_USAGE_ONCE("c10d.process_group"); } ProcessGroup::~ProcessGroup() {} +void ProcessGroup::init() { + C10_LOG_API_USAGE_ONCE(fmt::format("c10d.process_group_{}", getBackendName())); +} + } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 999189b1fe90..af97bdc9bd8a 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -12,6 +12,7 @@ #include #include +#include #include // ************************************************************************* @@ -54,7 +55,7 @@ enum class OpType : std::uint8_t { TORCH_API std::string opTypeToString(OpType opType); // Whether or not an OP is an p2p op (SEND, RECV, RECVANYSOURCE) -TORCH_API bool isP2POp(OpType opType); +TORCH_API bool isP2POp(OpType opType, bool batchP2P = false); // ProcessGroup is a base class that captures collective and point to // point communication in a fixed set of processes. @@ -426,13 +427,17 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { } protected: + // Implementations of this interface need to call this to setup + // appropriate logging etc. + void init(); + const int rank_; const int size_; // Optional sequence number structure for matching collectives. c10::optional sequenceNum_ = c10::nullopt; // Debug level setting. It is parsed once when ProcessGroup is constructed and // remains the same across use of this process group. 
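// Illustrative sketch (not part of the patch): querying the cached debug level
// through the c10d::DebugLevel API that this patch introduces in
// torch/csrc/distributed/c10d/debug.h, instead of re-parsing the
// TORCH_DISTRIBUTED_DEBUG environment variable on every call. The include
// path is an assumption about how the new header is reached.
#include <c10d/debug.h>

void maybeLogCollectiveDetails() {
  if (c10d::debug_level() == c10d::DebugLevel::Detail) {
    // emit the extra per-collective diagnostics only in DETAIL mode
  }
}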
- DistributedDebugLevel dist_debug_level_; + DebugLevel dist_debug_level_; }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index d95afa32ec8e..f2b553ba1cc8 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -490,7 +490,7 @@ inline void ProcessGroupGloo::AsyncWork::recordAsyncWorkProfilingInfo( inputs.emplace_back(tensor); } } - recordingFunction->before(profilingTitle, inputs); + recordingFunction->before(profilingTitle, c10::ArrayRef(inputs.data(), inputs.size())); }; recordFunctionBeforeCallback_ = at::wrapPropagateTLSState(before_handler); std::function end_handler = [recordingFunction]() { @@ -763,6 +763,8 @@ ProcessGroupGloo::ProcessGroupGloo( for(const auto i : c10::irange(threads_.size())) { threads_[i] = std::thread(&ProcessGroupGloo::runLoop, this, i); } + + init(); } ProcessGroupGloo::~ProcessGroupGloo() { @@ -2814,7 +2816,7 @@ void ProcessGroupGloo::monitoredBarrier( TORCH_INTERNAL_ASSERT(!failedRanks.empty()); const std::string ranksStr = c10::Join(", ", failedRanks); const std::string error = c10::str( - "Ranks ", + "[Rank 0]: Ranks ", ranksStr, " failed to pass monitoredBarrier in ", monitoredBarrierTimeout.count(), @@ -2834,8 +2836,9 @@ void ProcessGroupGloo::monitoredBarrier( waitLoop(sendWorkMap); - auto elapsedTime = std::chrono::duration_cast( - std::chrono::steady_clock::now() - startTime); + using namespace std::chrono; + C10_UNUSED auto elapsedTime = duration_cast( + steady_clock::now() - startTime); } void ProcessGroupGloo::setSequenceNumberForGroup() { diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp index 714f3a84deb6..556ab1388712 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp @@ -310,6 +310,8 @@ ProcessGroupMPI::ProcessGroupMPI(int rank, int size, MPI_Comm pgComm) // Start the worker thread accepting MPI calls workerThread_ = std::thread(&ProcessGroupMPI::runLoop, this); + + init(); } ProcessGroupMPI::~ProcessGroupMPI() { @@ -695,7 +697,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( "Tensor's dim 0 does not divide equally across group size"); std::function&)> runFunc = - [opts, this](std::unique_ptr& entry) { + [this](std::unique_ptr& entry) { auto srcdata = (entry->src)[0]; auto dstdata = (entry->dst)[0]; c10::DeviceGuard guard(srcdata.device()); @@ -722,7 +724,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall_base( c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_); c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_); std::function&)> runFunc = - [opts, this, inputSplitSizes, outputSplitSizes]( + [this, inputSplitSizes, outputSplitSizes]( std::unique_ptr& entry) { auto srcdata = (entry->src)[0]; auto dstdata = (entry->dst)[0]; @@ -769,7 +771,7 @@ c10::intrusive_ptr ProcessGroupMPI::alltoall( outputTensors.size() == size_, "Number of output tensors are not equal to group size"); std::function&)> runFunc = - [opts, this](std::unique_ptr& entry) { + [this](std::unique_ptr& entry) { std::vector send_lengths(size_); std::vector recv_lengths(size_); std::vector send_offsets(size_); diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index a48435c8f5a3..8d248b0571bc 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1,4 +1,5 @@ 
#include +#include #include #ifdef USE_C10D_NCCL @@ -30,8 +31,6 @@ constexpr const char* const kNCCLAbortedCommStoreKey = "NCCLABORTEDCOMM"; namespace { -constexpr int kBytes = 8; - // RAII helper class to manage NCCL group API and CUDA free mutex. // The destructor is allowed to throw since this helper class only // manages group and lock lifetimes. @@ -376,11 +375,20 @@ bool ProcessGroupNCCL::WorkNCCL::startedGPUExecutionInternal() const { } bool ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const { - for (const auto i : c10::irange(devices_.size())) { - // Checking the work's corresponding CUDA events' status - if (!(*ncclEndEvents_)[i].query()) { - return false; + try { + for (const auto i : c10::irange(devices_.size())) { + // Checking the work's corresponding CUDA events' status + if (!(*ncclEndEvents_)[i].query()) { + return false; + } + } + } catch (const std::exception& e) { + if (std::string(e.what()).find("driver shutting down") == std::string::npos) { + throw; } + LOG(INFO) << "[Rank " << rank_ + << "] Event query failed with exception: " + << e.what(); } return true; } @@ -430,10 +438,6 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal( // In case of blocking, wait for the operation to complete. if (blockingWait_) { - // Use the passed in timeout if provided, otherwise use the default - // opTimeout for each WorkNCCL object. - std::chrono::milliseconds workTimeout = - timeout == kNoTimeout ? opTimeout_ : timeout; // Wait for the operation to complete. while (!isCompleted()) { if (timedOut()) { @@ -579,12 +583,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( workCleanupThread_ = std::thread(&ProcessGroupNCCL::workCleanupLoop, this); } - const char* ncclDebugLevel = std::getenv("NCCL_DEBUG"); - - if (!ncclDebugLevel) { - ncclDebugLevel = "UNSET"; - } - + init(); LOG(INFO) << "[Rank " << rank_ << "] ProcessGroupNCCL initialized with following options:" << "\nNCCL_ASYNC_ERROR_HANDLING: " << asyncErrorHandling_ @@ -592,8 +591,27 @@ ProcessGroupNCCL::ProcessGroupNCCL( << "\nNCCL_BLOCKING_WAIT: " << blockingWait_ << "\nTIMEOUT(ms): " << options_->timeout.count() << "\nUSE_HIGH_PRIORITY_STREAM: " - << options_->is_high_priority_stream - << "\nNCCL_DEBUG: " << ncclDebugLevel; + << options_->is_high_priority_stream; + +#ifdef USE_NCCL_WITH_UCC + static std::once_flag initialize_ucc_lib_flag; + std::call_once(initialize_ucc_lib_flag, [&]{ + uccLib_ = loadTorchUCC(); + if (uccLib_ != nullptr) { + LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded"; + } + }); + + if (uccLib_ != nullptr) { + LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded"; + typedef c10::intrusive_ptr fn(const c10::intrusive_ptr& store, int rank, int size); + auto createProcessGroupUCC = reinterpret_cast(uccLib_->sym("createProcessGroupUCC")); + if (createProcessGroupUCC != nullptr) { + uccPG_ = createProcessGroupUCC(store, rank_, size_); + LOG(INFO) << "[Rank " << rank_ << "] ProcessGroupUCC created."; + } + } +#endif } void ProcessGroupNCCL::runHealthCheck() { @@ -983,7 +1001,7 @@ std::exception_ptr ProcessGroupNCCL::checkForNCCLErrorsInternal( void ProcessGroupNCCL::broadcastUniqueNCCLID( ncclUniqueId* ncclID, - OpType opType, + bool isSingleP2POp, const std::string& p2pKey, int p2pRank) { // For collective operations: @@ -993,7 +1011,7 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( // retrieving the contents of that key. A single process group // may create multiple NCCL communicators, so we use a sequence // number to differentiate between them. 
- // For point-to-point operations: + // For single point-to-point operations: // The sequence number will only be increased on 2 out of all the // processes in a Process Group. So all following collective // operations will see different sequence numbers which will cause @@ -1001,12 +1019,12 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( // of sequence number for p2p communications. std::string storeKey; - if (!isP2POp(opType)) { + if (!isSingleP2POp) { storeKey = std::to_string(ncclCommCounter_++); } else { storeKey = p2pKey; } - if (rank_ == 0 || (isP2POp(opType) && p2pRank == 0)) { + if (rank_ == 0 || (isSingleP2POp && p2pRank == 0)) { auto vec = std::vector( reinterpret_cast(ncclID), reinterpret_cast(ncclID) + NCCL_UNIQUE_ID_BYTES); @@ -1097,15 +1115,18 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // Create the unique NCCL ID and broadcast it ncclUniqueId ncclID; + // For batch_isend_irecv, ncclGroupStart() would be called upfront + bool batchP2P = ncclActiveGroupCounter_ > 0; + bool singleP2POp = isP2POp(opType, batchP2P); // For point-to-point communication, lower rank of the two will get unique id. - if (rank_ == 0 || (isP2POp(opType) && p2pRank == 0)) { + if (rank_ == 0 || (singleP2POp && p2pRank == 0)) { C10D_NCCL_CHECK(ncclGetUniqueId(&ncclID), c10::nullopt); } // For point-to-point communication on the same process, don't need broadcast. if (!isSendRecvSelf) { // Broadcast so that each process can have a unique NCCL ID - broadcastUniqueNCCLID(&ncclID, opType, devicesKey, p2pRank); + broadcastUniqueNCCLID(&ncclID, singleP2POp, devicesKey, p2pRank); } at::cuda::OptionalCUDAGuard gpuGuard; @@ -1141,7 +1162,8 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // GPU world size and GPU rank int numRanks, rank; - if (!isP2POp(opType)) { + if (!singleP2POp) { + // Collective, all-to-all, or batch P2P numRanks = getSize() * devices.size(); rank = getRank() * devices.size() + i; } else if (isSendRecvSelf) { @@ -1149,7 +1171,7 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( numRanks = 1; rank = 0; } else { - // For point-to-point operation, there are only 2 processes involved so + // For single point-to-point operation, there are only 2 processes involved so // the GPU rank is either 0 or 1. numRanks = 2; rank = p2pRank; @@ -1168,6 +1190,12 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // [Note 2 ] C10D_NCCL_CHECK(ncclGroupEnd(), c10::nullopt); + // At this point NCCL should have been initialized, hence we can accurately get + // the env value even if NCCL sets it by reading from nccl.conf file + if (getRank() == 0) { + LOG(INFO) << "NCCL_DEBUG: " << parse_env("NCCL_DEBUG"); + } + // See [Group Start/End Note] for (const auto i : c10::irange(ncclActiveGroupCounter_)) { (void)i; @@ -1503,9 +1531,25 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( PostProcess post, const char* profilingTitle) { const auto devices = getDeviceList(tensors); - const auto key = getKeySendRecv(rank_, peer); - int p2pRank = rank_ <= peer ? 
0 : 1; - auto isSendRecvSelf = rank_ == peer; + std::string key; + int p2pRank = 0, p2pTargetRank = 0; + bool isSendRecvSelf = false; + // For batch_isend_irecv, ncclGroupStart() would be called upfront + bool batchP2P = ncclActiveGroupCounter_ > 0; + if (batchP2P) { + // For batch P2P, we need to treat it like a collective when selecting + // communicator, because other ranks can call into this batch other than my + // rank and my peer + key = getKeyFromDevices(devices); + p2pRank = rank_; + p2pTargetRank = peer; + } else { + // For single P2P, preserve the old two-rank behavior (to avoid perf diff) + key = getKeySendRecv(rank_, peer); + p2pRank = rank_ <= peer ? 0 : 1; + isSendRecvSelf = rank_ == peer; + p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; + } auto& ncclComms = getNCCLComm(key, devices, opType, p2pRank, isSendRecvSelf); // First let NCCL streams wait for input tensors allocation streams @@ -1557,9 +1601,6 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( for (const auto i : c10::irange(tensors.size())) { gpuGuard.set_index(devices[i].index()); at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; - // For point-to-point communication, NCCL ranks can only - // be 0 or 1. - int p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank; C10D_NCCL_CHECK(fn( tensors[i], ncclComms[i]->getNcclComm(), ncclStream, p2pTargetRank), ncclComms[i]->getNcclCommFailureReason()); } @@ -2262,6 +2303,9 @@ c10::intrusive_ptr ProcessGroupNCCL::gather( invalidArgument("requires empty output on non-root"); } outputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + outputs.emplace_back(); } return collective( @@ -2337,6 +2381,9 @@ c10::intrusive_ptr ProcessGroupNCCL::scatter( invalidArgument("requires empty input on non-root"); } inputs = {}; + // append a empty tensor to the list, we don't use it but the + // `collective` template function requires it to invoke its function + inputs.emplace_back(); } return collective( @@ -2408,6 +2455,18 @@ c10::intrusive_ptr ProcessGroupNCCL::_allgather_base( "nccl:_all_gather_base"); } +#ifdef USE_NCCL_WITH_UCC +std::shared_ptr ProcessGroupNCCL::uccLib_ = nullptr; +#endif + +bool ProcessGroupNCCL::isUCCAvailable() const { +#ifdef USE_NCCL_WITH_UCC + return (uccPG_ != nullptr); +#else + return false; +#endif +} + } // namespace c10d #endif // USE_C10D_NCCL diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 77d9bb3dd596..f86cf5e9d576 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -12,7 +12,9 @@ #include #include #include +#include +#include #include #include #include @@ -368,11 +370,14 @@ class TORCH_API ProcessGroupNCCL : public ProcessGroup { // may indicate that there is some sort of collective desynchronization. uint64_t getSequenceNumberForGroup() override; + // Tests if the UCC fallback path is available + bool isUCCAvailable() const; + protected: // Helper that broadcasts nccl unique ID to all ranks through the store void broadcastUniqueNCCLID( ncclUniqueId* ncclID, - OpType opType, + bool isSingleP2POp, const std::string& devicesKey, int p2pRank); @@ -623,6 +628,12 @@ class TORCH_API ProcessGroupNCCL : public ProcessGroup { // Counting for the sequential number of NCCL collective call. 
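// Illustrative sketch (not part of the patch): the rank arithmetic that the
// updated pointToPoint() above applies to a non-batched send/recv. Both
// endpoints derive the same dedicated two-rank communicator: the lower global
// rank becomes NCCL rank 0, the higher one rank 1, and each targets the other
// (self send/recv collapses to a single rank 0). The helper name is
// hypothetical.
#include <utility>

inline std::pair<int, int> singleP2PRanks(int myRank, int peer) {
  const bool isSendRecvSelf = (myRank == peer);
  const int p2pRank = myRank <= peer ? 0 : 1;                  // my rank inside the 2-rank comm
  const int p2pTargetRank = isSendRecvSelf ? 0 : 1 - p2pRank;  // the peer's rank inside it
  return {p2pRank, p2pTargetRank};
}
// e.g. singleP2PRanks(1, 3) == {0, 1} on rank 1 and singleP2PRanks(3, 1) == {1, 0}
// on rank 3; in the batched case the global rank and peer are used directly.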
uint64_t seq_{0}; + +#ifdef USE_NCCL_WITH_UCC + // ProcessGroupUCC shared library handle and ProcessGroup pointer + static std::shared_ptr uccLib_; + c10::intrusive_ptr uccPG_; +#endif }; } // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp index a2f03f84501e..118ee3e19c3b 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp @@ -69,11 +69,11 @@ struct CollectiveFingerPrint { // Create output tensor data structure to pass into allgather. std::vector> output_tensors; output_tensors.reserve(tensors_to_verify.size()); - for (auto& tensor_shape : tensors_to_verify) { + for (const auto& tensor_shape : tensors_to_verify) { std::vector outputs; outputs.reserve(pg->getSize()); for (const auto i : c10::irange(pg->getSize())) { - (void)i; //Suppress unused variable warning + (void)i; // Suppress unused variable warning outputs.emplace_back(at::zeros_like(tensor_shape)); } output_tensors.emplace_back(outputs); @@ -143,12 +143,12 @@ std::ostream& operator<<( std::vector dtype_strs; std::vector device_type_strs; for (const auto& tensor_dtype : collective_fingerprint.tensor_dtypes_) { - dtype_strs.push_back( + dtype_strs.emplace_back( c10::toString(static_cast(tensor_dtype))); } for (const auto& tensor_device_type : collective_fingerprint.tensor_device_types_) { - device_type_strs.push_back( + device_type_strs.emplace_back( c10::toString(static_cast(tensor_device_type))); } diff --git a/torch/csrc/distributed/c10d/TCPStore.cpp b/torch/csrc/distributed/c10d/TCPStore.cpp index 111701738684..46dc29ec0f65 100644 --- a/torch/csrc/distributed/c10d/TCPStore.cpp +++ b/torch/csrc/distributed/c10d/TCPStore.cpp @@ -128,6 +128,7 @@ void BackgroundThread::closeStopSignal() { void BackgroundThread::stop() { if (controlPipeFd_[1] != -1) { + ::write(controlPipeFd_[1], "\0", 1); // close the write end of the pipe ::close(controlPipeFd_[1]); controlPipeFd_[1] = -1; @@ -534,8 +535,16 @@ void TCPStoreMasterDaemon::run() { void TCPStoreMasterDaemon::run() { std::vector fds; tcputil::addPollfd(fds, storeListenSocket_.handle(), POLLIN); - // Push the read end of the pipe to signal the stopping of the daemon run - tcputil::addPollfd(fds, controlPipeFd_[0], POLLHUP); + // Although we haven't found any documentation or literature describing this, + // we've seen cases that, under certain circumstances, the read end of the + // pipe won't receive POLLHUP when the write end is closed. However, under + // the same circumstances, writing to the pipe will guarantee POLLIN to be + // received on the read end. + // + // For more reliable termination, the main thread will write a byte to the + // pipe before closing it, and the background thread will poll for both + // POLLIN and POLLHUP. 
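// Illustrative sketch (not part of the patch): the shutdown handshake the
// comment above describes, reduced to plain POSIX calls. Writing one byte
// before closing the write end guarantees the poller wakes up with POLLIN,
// even when closing the pipe alone fails to deliver POLLHUP.
#include <poll.h>
#include <unistd.h>

inline void pipeShutdownExample() {
  int pipeFd[2]; // [0] = read end polled by the background thread, [1] = write end
  if (pipe(pipeFd) != 0) {
    return;
  }

  // Main thread, at shutdown time:
  (void)write(pipeFd[1], "\0", 1); // guarantees POLLIN on the read end
  close(pipeFd[1]);

  // Background thread's poll loop (a single iteration shown):
  struct pollfd pfd;
  pfd.fd = pipeFd[0];
  pfd.events = POLLIN | POLLHUP;
  pfd.revents = 0;
  if (poll(&pfd, 1, /*timeout ms=*/1000) > 0 &&
      (pfd.revents & (POLLIN | POLLHUP)) != 0) {
    // stop signal received: exit the loop and let the thread be joined
  }
  close(pipeFd[0]);
}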
+ tcputil::addPollfd(fds, controlPipeFd_[0], POLLIN | POLLHUP); // receive the queries bool finished = false; @@ -564,8 +573,9 @@ void TCPStoreMasterDaemon::run() { // The pipe receives an event which tells us to shutdown the daemon if (fds[1].revents != 0) { - // Will be POLLUP when the pipe is closed - if (fds[1].revents ^ POLLHUP) { + // The main thread will write a byte to the pipe then close it before + // joining the background thread + if (fds[1].revents & ~(POLLIN | POLLHUP)) { throw std::system_error( ECONNABORTED, std::system_category(), @@ -700,7 +710,16 @@ void TCPStoreWorkerDaemon::run() { #else void TCPStoreWorkerDaemon::run() { std::vector fds; - tcputil::addPollfd(fds, controlPipeFd_[0], POLLHUP); + // Although we haven't found any documentation or literature describing this, + // we've seen cases that, under certain circumstances, the read end of the + // pipe won't receive POLLHUP when the write end is closed. However, under + // the same circumstances, writing to the pipe will guarantee POLLIN to be + // received on the read end. + // + // For more reliable termination, the main thread will write a byte to the + // pipe before closing it, and the background thread will poll for both + // POLLIN and POLLHUP. + tcputil::addPollfd(fds, controlPipeFd_[0], POLLIN | POLLHUP); tcputil::addPollfd(fds, storeListenSocket_.handle(), POLLIN); while (true) { @@ -709,8 +728,9 @@ void TCPStoreWorkerDaemon::run() { // Check control and exit early if triggered // The pipe receives an event which tells us to shutdown the listener thread if (fds[0].revents != 0) { - // Will be POLLUP when the pipe is closed - if (fds[0].revents ^ POLLHUP) { + // The main thread will write a byte to the pipe then close it before + // joining the background thread + if (fds[0].revents & ~(POLLIN | POLLHUP)) { throw std::system_error( ECONNABORTED, std::system_category(), @@ -893,7 +913,9 @@ void TCPClient::setTimeout(std::chrono::milliseconds value) { static_cast((value.count() % 1000) * 1000)}; #else struct timeval timeoutTV = { - .tv_sec = value.count() / 1000, .tv_usec = (value.count() % 1000) * 1000}; + .tv_sec = value.count() / 1000, + .tv_usec = static_cast((value.count() % 1000) * 1000), + }; #endif SYSCHECK_ERR_RETURN_NEG1(::setsockopt( socket_.handle(), diff --git a/torch/csrc/distributed/c10d/UCCForNCCL.hpp b/torch/csrc/distributed/c10d/UCCForNCCL.hpp new file mode 100644 index 000000000000..ce38894faebc --- /dev/null +++ b/torch/csrc/distributed/c10d/UCCForNCCL.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace c10d { + +inline std::shared_ptr loadTorchUCC() { + const char *path = std::getenv("TORCH_UCC_LIBRARY_PATH"); + if (path != nullptr) { + try { + return std::make_shared(path); + } catch (const c10::DynamicLibraryError &e) { + TORCH_WARN("TORCH_UCC_LIBRARY_PATH is set, " + "but the loading of torch_ucc.so failed with:", e.msg()); + } + } + return nullptr; +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/Utils.cpp b/torch/csrc/distributed/c10d/Utils.cpp index f8a38c8625d0..924d0a233682 100644 --- a/torch/csrc/distributed/c10d/Utils.cpp +++ b/torch/csrc/distributed/c10d/Utils.cpp @@ -8,11 +8,6 @@ namespace c10d { -const char* kDistDebugEnvVar = "TORCH_DISTRIBUTED_DEBUG"; -const char* kDistDebugDetailLogLevel = "DETAIL"; -const char* kDistDebugInfoLogLevel = "INFO"; -const char* kDistDebugOffLogLevel = "OFF"; - std::string parse_env(const char* env_var_name) { char* stringValue = std::getenv(env_var_name); std::string res = 
"N/A"; @@ -22,65 +17,15 @@ std::string parse_env(const char* env_var_name) { return res; } -DistributedDebugLevel parseDistDebugLevel() { - std::string debugLevel = parse_env(kDistDebugEnvVar); - const char* levelStr{nullptr}; - if (debugLevel.compare("N/A") == 0) { - levelStr = kDistDebugOffLogLevel; - } else { - levelStr = debugLevel.c_str(); - TORCH_CHECK( - strncmp( - levelStr, - kDistDebugDetailLogLevel, - strlen(kDistDebugDetailLogLevel)) == 0 || - strncmp( - levelStr, - kDistDebugInfoLogLevel, - strlen(kDistDebugInfoLogLevel)) == 0 || - strncmp( - levelStr, - kDistDebugOffLogLevel, - strlen(kDistDebugOffLogLevel)) == 0, - c10::str( - "Expected environment variable TORCH_DISTRIBUTED_DEBUG to be one of ", - kDistDebugDetailLogLevel, - " ", - kDistDebugInfoLogLevel, - " ", - kDistDebugOffLogLevel, - " ")); - C10_LOG_FIRST_N(INFO, 1) - << "TORCH_DISTRIBUTED_DEBUG level parsed as " << levelStr; - } - - static std::unordered_map mapping = { - {kDistDebugOffLogLevel, DistributedDebugLevel::OFF}, - {kDistDebugInfoLogLevel, DistributedDebugLevel::INFO}, - {kDistDebugDetailLogLevel, DistributedDebugLevel::DETAIL}}; - - auto it = mapping.find(levelStr); - TORCH_CHECK( - it != mapping.end(), - "Invalid string value for distributed debug mode: ", - levelStr); - return it->second; -} - std::vector getTensorShapes( const std::vector& tensors) { std::vector shapeTensors; shapeTensors.reserve(tensors.size()); for (const auto& tensor : tensors) { - auto shapesVec = tensor.sizes().vec(); - int64_t shapes_size = shapesVec.size(); - // Need to clone here otherwise the shapesVec.data() memory is not copied - // and can be released under the hood. - at::Tensor shapesTensor = at::from_blob( - shapesVec.data(), - {shapes_size}, - at::TensorOptions().dtype(at::kLong)) - .clone(); + // Use `at::tensor()` to copy the data underlying `sizes()` since it may be + // released elsewhere. + at::Tensor shapesTensor = + at::tensor(tensor.sizes(), at::TensorOptions().dtype(at::kLong)); shapeTensors.emplace_back(std::move(shapesTensor)); } return shapeTensors; diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index a8e5b1a83052..501993a728b7 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -32,23 +32,8 @@ typedef SSIZE_T ssize_t; namespace c10d { -// Distributed c10d debug levels -enum DistributedDebugLevel { - OFF = 0, - DETAIL = 1, - INFO = 2, -}; - -// String debug log levels -extern const char* kDistDebugEnvVar; -extern const char* kDistDebugDetailLogLevel; -extern const char* kDistDebugInfoLogLevel; -extern const char* kDistDebugOffLogLevel; - TORCH_API std::string parse_env(const char* env_var_name); -TORCH_API DistributedDebugLevel parseDistDebugLevel(); - // Retrieve tensor shapes from a given tensor. TORCH_API std::vector getTensorShapes(const std::vector& tensors); @@ -422,7 +407,7 @@ inline void checkSplitSizes( "Tensor's dim 0 does not divide equally across group size"); } else { TORCH_CHECK( - split_sizes.size() == group_size, + split_sizes.size() == static_cast(group_size), "Number of tensor splits not equal to group size"); const auto sum = c10::sum_integers(split_sizes); TORCH_CHECK( diff --git a/torch/csrc/distributed/c10d/debug.cpp b/torch/csrc/distributed/c10d/debug.cpp new file mode 100644 index 000000000000..a22f322576cd --- /dev/null +++ b/torch/csrc/distributed/c10d/debug.cpp @@ -0,0 +1,73 @@ +// Copyright (c) Meta Platforms, Inc. and its affiliates. +// All rights reserved. 
+// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include +#include +#include +#include + +#include +#include + +namespace c10d { +namespace detail { +namespace { + +DebugLevel loadDebugLevelFromEnvironment() { + char* env_value = std::getenv("TORCH_DISTRIBUTED_DEBUG"); + + if (env_value == nullptr) { + return DebugLevel::Off; + } + + DebugLevel level{}; + + std::string level_str{env_value}; + + std::transform(level_str.begin(), level_str.end(), level_str.begin(), + [](unsigned char c) { + return toupper(c); + }); + + if (level_str == "OFF") { + level = DebugLevel::Off; + } else if (level_str == "INFO") { + level = DebugLevel::Info; + } else if (level_str == "DETAIL") { + level = DebugLevel::Detail; + } else { + throw C10dError{"The value of TORCH_DISTRIBUTED_DEBUG must be OFF, INFO, or DETAIL."}; + } + + C10D_INFO("The debug level is set to {}.", level_str); + + return level; +} + +} // namespace +} // namespace detail + +namespace { + +DebugLevel g_debug_level = DebugLevel::Off; + +} // namespace + +void setDebugLevel(DebugLevel level) { + g_debug_level = level; +} + +void setDebugLevelFromEnvironment() { + g_debug_level = detail::loadDebugLevelFromEnvironment(); +} + +DebugLevel debug_level() noexcept { + return g_debug_level; +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/debug.h b/torch/csrc/distributed/c10d/debug.h new file mode 100644 index 000000000000..ecfb49448295 --- /dev/null +++ b/torch/csrc/distributed/c10d/debug.h @@ -0,0 +1,27 @@ +// Copyright (c) Meta Platforms, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include + +namespace c10d { + +enum class DebugLevel { + Off, + Info, + Detail +}; + +TORCH_API void setDebugLevel(DebugLevel level); + +// Sets the debug level based on the value of the `TORCH_DISTRIBUTED_DEBUG` +// environment variable. +TORCH_API void setDebugLevelFromEnvironment(); + +TORCH_API DebugLevel debug_level() noexcept; + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 0084e4523a98..873b6b35f168 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -449,6 +450,7 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO py::arg("output_device"), py::arg("broadcast_buffers"), py::arg("has_sync_bn"), + py::arg("static_graph"), py::call_guard()) .def( "set_runtime_stats_and_log", @@ -478,20 +480,24 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO &::c10d::Logger::set_static_graph, py::call_guard()); - py::enum_<::c10d::DistributedDebugLevel>(module, "_DistributedDebugLevel", R"( - An enum whose values correspond to different debug settings of the - torch.distributed package. Currently supporting settings are OFF, INFO, - and DETAIL, which can be set via the TORCH_DISTRIBUTED_DEBUG environment - variable. + py::enum_<::c10d::DebugLevel>(module, "DebugLevel", R"( + An enum whose values correspond to different debug levels of the + torch.distributed package. Currently supporting OFF, INFO, and DETAIL, + which can be set via the TORCH_DISTRIBUTED_DEBUG environment variable + or via ``set_debug_level()`` function. 
)") - .value("OFF", ::c10d::DistributedDebugLevel::OFF) - .value("INFO", ::c10d::DistributedDebugLevel::INFO) - .value("DETAIL", ::c10d::DistributedDebugLevel::DETAIL); + .value("OFF", ::c10d::DebugLevel::Off) + .value("INFO", ::c10d::DebugLevel::Info) + .value("DETAIL", ::c10d::DebugLevel::Detail); - module.def( - "_get_debug_mode", - &::c10d::parseDistDebugLevel, - py::call_guard()); + module + .def("get_debug_level", ::c10d::debug_level, + R"(Gets the debug level of the torch.distributed package.)") + .def("set_debug_level", ::c10d::setDebugLevel, + R"(Sets the debug level of the torch.distributed package.)") + .def("set_debug_level_from_env", ::c10d::setDebugLevelFromEnvironment, + R"(Sets the debug level of the torch.distributed package from the + ``TORCH_DISTRIBUTED_DEBUG`` environment variable.)"); py::enum_<::c10d::ReduceOp>(module, "ReduceOp", R"( An enum-like class for available reduction operations: ``SUM``, ``AVG``, @@ -649,11 +655,13 @@ Example:: .def( "get", [](::c10d::Store& store, const std::string& key) -> py::bytes { - auto value = store.get(key); + auto value = [&]() { + py::gil_scoped_release guard; + return store.get(key); + }(); return py::bytes( reinterpret_cast(value.data()), value.size()); }, - py::call_guard(), R"( Retrieves the value associated with the given ``key`` in the store. If ``key`` is not present in the store, the function will wait for ``timeout``, which is defined @@ -887,7 +895,7 @@ the server to establish a connection. Arguments: host_name (str): The hostname or IP Address the server store should run on. port (int): The port on which the server store should listen for incoming requests. - world_size (int, optional): The total number of store users (number of clients + 1 for the server). Default is -1 (a negative value indicates a non-fixed number of store users). + world_size (int, optional): The total number of store users (number of clients + 1 for the server). Default is None (None indicates a non-fixed number of store users). is_master (bool, optional): True when initializing the server store and False for client stores. Default is False. timeout (timedelta, optional): Timeout used by the store during initialization and for methods such as :meth:`~torch.distributed.store.get` and :meth:`~torch.distributed.store.wait`. Default is timedelta(seconds=300) wait_for_worker (bool, optional): Whether to wait for all the workers to connect with the server store. This is only applicable when world_size is a fixed value. Default is True. @@ -906,14 +914,14 @@ Example:: .def( py::init([](const std::string& host, uint16_t port, - int worldSize, + c10::optional worldSize, bool isServer, std::chrono::milliseconds timeout, bool waitWorkers, bool multiTenant) { c10::optional numWorkers = c10::nullopt; - if (worldSize > -1) { - numWorkers = static_cast(worldSize); + if (worldSize.has_value() && worldSize.value() > -1) { + numWorkers = static_cast(worldSize.value()); } ::c10d::TCPStoreOptions opts{ @@ -923,7 +931,7 @@ Example:: }), py::arg("host_name"), py::arg("port"), - py::arg("world_size") = -1, + py::arg("world_size") = py::none(), // using noconvert() requires this argument to be True or False // prevents accidental implicit conversion to bool py::arg("is_master").noconvert() = false, @@ -1423,7 +1431,9 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). 
py::arg("timeout") = kProcessGroupDefaultTimeout, py::call_guard()) .def_property_readonly( - "options", &::c10d::ProcessGroupNCCL::getOptions); + "options", &::c10d::ProcessGroupNCCL::getOptions) + .def_property_readonly( + "is_ucc_available", &::c10d::ProcessGroupNCCL::isUCCAvailable); intrusive_ptr_class_<::c10d::ProcessGroupNCCL::Options>( processGroupNCCL, diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index bd6de0cfee81..93e8d05f2655 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -21,7 +22,7 @@ std::ostream& operator<<(std::ostream& output, const Logger& logger) { auto& ddp_logging_data = (*logger.ddp_logging_data_); std::string loggerInfo = fmt::format( - "[Rank {} / {}] [iteration {}] Training {} unused_parameter_size={} \n " + "[Rank {} / {}] [before iteration {}] Training {} unused_parameter_size={} \n " "Avg forward compute time: {} \n Avg backward compute time: {} \n" "Avg backward comm. time: {} \n Avg backward comm/comp overlap time: {}", ddp_logging_data.ints_map["rank"], @@ -124,11 +125,11 @@ std::vector> Logger::get_per_bucket_variable_indices() { return per_bucket_variable_indices; } -std::vector Logger::get_bucket_sizes() { - std::vector bucket_sizes; +std::vector Logger::get_bucket_sizes() { + std::vector bucket_sizes; for (const auto& bucket : reducer_->buckets_) { - const auto& variables = bucket.replicas[0].variables; - int bucket_size = 0; + const auto& variables = bucket.variables; + int64_t bucket_size = 0; for (const auto& v : variables) { bucket_size += v.numel() * v.element_size(); } @@ -137,14 +138,6 @@ std::vector Logger::get_bucket_sizes() { return bucket_sizes; } -std::vector Logger::get_bucket_size_limits() { - std::vector bucket_size_limits; - for (const auto& bucket : reducer_->buckets_) { - bucket_size_limits.push_back(bucket.bucket_size_limit); - } - return bucket_size_limits; -} - // Communication hook. Empty string if not set, in which case it will not be // logged. void Logger::set_comm_hook(const std::string& hook) { @@ -167,9 +160,13 @@ void Logger::set_construction_data_and_log( const std::vector& device_ids, int output_device, bool broadcast_buffers, - bool has_sync_bn) { + bool has_sync_bn, + bool static_graph) { // No lock is needed, as it will be called in DistributedDataParallel // constructor. 
+ if (static_graph) { + set_static_graph(); + } ddp_logging_data_->strs_map["module_name"] = module_name; ddp_logging_data_->ints_map["world_size"] = reducer_->process_group_->getSize(); @@ -185,9 +182,6 @@ void Logger::set_construction_data_and_log( // A list of bucket sizes (Bytes) calculated during construction time ddp_logging_data_->strs_map["bucket_sizes"] = c10::Join(", ", get_bucket_sizes()); - // A list of bucket size limits (bytes) specified during construction time - ddp_logging_data_->strs_map["initial_bucket_size_limits"] = - c10::Join(", ", get_bucket_size_limits()); set_env_variables(); // DistributedDataParallel constructor input parameters @@ -203,7 +197,7 @@ void Logger::set_construction_data_and_log( ddp_logging_data_->strs_map["backend_name"] = reducer_->process_group_->getBackendName(); - if (parseDistDebugLevel() != DistributedDebugLevel::OFF) { + if (debug_level() != DebugLevel::Off) { std::string initInfo = fmt::format( "[Rank {}]: DDP Initialized with: \n", ddp_logging_data_->ints_map["rank"]); @@ -294,8 +288,6 @@ void Logger::set_runtime_stats_and_log() { reducer_->has_rebuilt_bucket_; ddp_logging_data_->strs_map["rebuilt_bucket_sizes"] = c10::Join(", ", get_bucket_sizes()); - ddp_logging_data_->strs_map["rebuilt_bucket_size_limits"] = - c10::Join(", ", get_bucket_size_limits()); // Log per-bucket variable indices std::vector per_bucket_variable_indices; auto indices = get_per_bucket_variable_indices(); @@ -324,6 +316,14 @@ void Logger::set_runtime_stats_and_log() { ); return; } + if (!reducer_->params_[0].is_cuda() && !reducer_->params_[0].is_cpu()) { + TORCH_WARN_ONCE( + "Time stats are currently only collected for CPU and CUDA devices. " + "Please refer to CpuTimer or CudaTimer for how to register timer " + "for other device type." + ); + return; + } TORCH_INTERNAL_ASSERT(reducer_->timer_); calculate_avg_time( ddp_logging_data_->ints_map["avg_forward_compute_time"], @@ -377,7 +377,7 @@ void Logger::set_runtime_stats_and_log() { ); // Log runtime stats to stderr if TORCH_DISTRIBUTED_DEBUG=DETAIL is enabled. - if (parseDistDebugLevel() == DistributedDebugLevel::DETAIL) { + if (debug_level() == DebugLevel::Detail) { LOG(INFO) << *this; } diff --git a/torch/csrc/distributed/c10d/logger.hpp b/torch/csrc/distributed/c10d/logger.hpp index d47157805660..cd32c573a21e 100644 --- a/torch/csrc/distributed/c10d/logger.hpp +++ b/torch/csrc/distributed/c10d/logger.hpp @@ -15,7 +15,9 @@ class TORCH_API Logger { const std::vector& device_ids, int output_device, bool broadcast_buffers, - bool has_sync_bn); + bool has_sync_bn, + bool static_graph + ); void set_static_graph(); @@ -39,9 +41,7 @@ class TORCH_API Logger { // Set parameters stats. void set_parameter_stats(); // Get size of each bucket (Bytes). - std::vector get_bucket_sizes(); - // Get bucket size limits specified during DDP construction. - std::vector get_bucket_size_limits(); + std::vector get_bucket_sizes(); // Get variable indices for each bucket. std::vector> get_per_bucket_variable_indices(); // Set comm. hook, if used diff --git a/torch/csrc/distributed/c10d/logging.cpp b/torch/csrc/distributed/c10d/logging.cpp new file mode 100644 index 000000000000..c079906b878a --- /dev/null +++ b/torch/csrc/distributed/c10d/logging.cpp @@ -0,0 +1,39 @@ +// Copyright (c) Meta Platforms, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +namespace c10d { +namespace detail { + +bool isLogLevelEnabled(LogLevel level) noexcept { + // c10 logger does not support debug and trace levels. In order to map higher + // levels we adjust our ordinal value. + int level_int = static_cast(level) - 2; + + if (level_int >= 0) { + return FLAGS_caffe2_log_level <= level_int; + } + + // Debug and trace levels are only enabled when c10 log level is set to INFO. + if (FLAGS_caffe2_log_level != 0) { + return false; + } + + if (level_int == -1) { + return debug_level() != DebugLevel::Off; + } + if (level_int == -2) { + return debug_level() == DebugLevel::Detail; + } + + return false; +} + +} // namespace detail +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/logging.h b/torch/csrc/distributed/c10d/logging.h index 9e6d328d324c..57ee974a0d35 100644 --- a/torch/csrc/distributed/c10d/logging.h +++ b/torch/csrc/distributed/c10d/logging.h @@ -1,4 +1,4 @@ -// Copyright (c) Facebook, Inc. and its affiliates. +// Copyright (c) Meta Platforms, Inc. and its affiliates. // All rights reserved. // // This source code is licensed under the BSD-style license found in the @@ -6,24 +6,49 @@ #pragma once -#include +#include +#include #include +#include namespace c10d { namespace detail { + +enum class LogLevel { + Trace, + Debug, + Info, + Warning, + Error +}; + +TORCH_API bool isLogLevelEnabled(LogLevel level) noexcept; + template -std::string log_vformat(fmt::string_view fmt, T&&... args) { +std::string formatLogMessage(fmt::string_view fmt, T&&... args) { return fmt::vformat(fmt, fmt::make_format_args(args...)); } -} // namespace detail -} // namespace c10d + +} // namespace detail +} // namespace c10d #define C10D_ERROR(...)\ - LOG_IF(ERROR, FLAGS_caffe2_log_level <= 2) << c10d::detail::log_vformat(__VA_ARGS__) + LOG_IF(ERROR, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Error))\ + << "[c10d] " << c10d::detail::formatLogMessage(__VA_ARGS__) #define C10D_WARNING(...)\ - LOG_IF(WARNING, FLAGS_caffe2_log_level <= 1) << c10d::detail::log_vformat(__VA_ARGS__) + LOG_IF(WARNING, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Warning))\ + << "[c10d] " << c10d::detail::formatLogMessage(__VA_ARGS__) #define C10D_INFO(...)\ - LOG_IF(INFO, FLAGS_caffe2_log_level <= 0) << c10d::detail::log_vformat(__VA_ARGS__) + LOG_IF(INFO, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Info))\ + << "[c10d] " << c10d::detail::formatLogMessage(__VA_ARGS__) + +#define C10D_DEBUG(...)\ + LOG_IF(INFO, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Debug))\ + << "[c10d - debug] " << c10d::detail::formatLogMessage(__VA_ARGS__) + +#define C10D_TRACE(...)\ + LOG_IF(INFO, c10d::detail::isLogLevelEnabled(c10d::detail::LogLevel::Trace))\ + << "[c10d - trace] " << c10d::detail::formatLogMessage(__VA_ARGS__) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 00b1b5cb3c0e..31d376b13a24 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -80,7 +80,7 @@ Reducer::Reducer( int64_t bucket_bytes_cap, bool find_unused_parameters, bool gradient_as_bucket_view, - std::unordered_map paramNames, + std::unordered_map param_names, int64_t first_bucket_bytes_cap) : params_(std::move(params)), process_group_(std::move(process_group)), @@ -99,14 +99,14 @@ Reducer::Reducer( div_factor_(kUnsetDivFactor), static_graph_(false), comm_hook_(nullptr), - ddp_debug_level_(parseDistDebugLevel()), - param_names_(std::move(paramNames)), + 
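// How isLogLevelEnabled() above maps the five c10d levels onto the three
// glog-style c10 levels (enum ordinal minus 2), with Debug/Trace additionally
// gated on the distributed debug level:
//
//   c10d level | enabled when
//   -----------+----------------------------------------------------------
//   Error      | FLAGS_caffe2_log_level <= 2
//   Warning    | FLAGS_caffe2_log_level <= 1
//   Info       | FLAGS_caffe2_log_level <= 0
//   Debug      | FLAGS_caffe2_log_level == 0 && debug_level() != DebugLevel::Off
//   Trace      | FLAGS_caffe2_log_level == 0 && debug_level() == DebugLevel::Detail
//
// So the new C10D_DEBUG()/C10D_TRACE() macros defined in logging.h above only
// produce output when the process logs at INFO and TORCH_DISTRIBUTED_DEBUG is
// INFO/DETAIL (Debug) or DETAIL (Trace); both still log at INFO severity,
// distinguished by their "[c10d - debug]" / "[c10d - trace]" prefixes.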
ddp_debug_level_(debug_level()), + param_names_(std::move(param_names)), first_bucket_bytes_cap_(first_bucket_bytes_cap) { C10_LOG_API_USAGE_ONCE("torch.distributed.ddp.reducer"); TORCH_INTERNAL_ASSERT( params_.size() >= 1, "Expected at least one parameter."); - if (ddp_debug_level_ != c10d::DistributedDebugLevel::OFF) { + if (ddp_debug_level_ != c10d::DebugLevel::Off) { LOG(INFO) << "Reducer initialized with bucket_bytes_cap: " << bucket_bytes_cap_ << " first_bucket_bytes_cap: " << first_bucket_bytes_cap; @@ -143,8 +143,7 @@ Reducer::Reducer( // This can be reinitialized later after capturing runtime information. { std::lock_guard lock(mutex_); - initialize_buckets( - std::move(bucket_indices), std::move(per_bucket_size_limits)); + initialize_buckets(std::move(bucket_indices)); } // All variables are expected to have their `grad_fn` set to the gradient @@ -332,17 +331,16 @@ void Reducer::check_grad_layout( } void Reducer::mark_variable_ready_dense(size_t variable_index) { - const auto replica_index = 0; const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[replica_index]; - auto& variable = replica.variables[bucket_index.intra_bucket_index]; - auto& bucket_view = replica.bucket_views_in[bucket_index.intra_bucket_index]; - - // Copy contents of gradient tensor to bucket tensor. - // If the gradient is not set, we assume it wasn't computed - // as part of the current backwards pass, and zero the part - // of the bucket it would otherwise hold. + auto& variable = bucket.variables[bucket_index.intra_bucket_index]; + auto& bucket_view = bucket.bucket_views_in[bucket_index.intra_bucket_index]; + + // Copy the contents of the gradient tensor to the corresponding part of the + // bucket's flattened gradient tensor. + // If the gradient is not set, we assume it wasn't computed as part of the + // current backwards pass, and we zero the part of the bucket it would + // otherwise hold. runGradCallbackForVariable(variable, [&](auto& grad) { if (grad.defined()) { this->check_grad_layout(grad, bucket_view); @@ -414,11 +412,9 @@ void Reducer::mark_variable_ready_dense(size_t variable_index) { } void Reducer::mark_variable_ready_sparse(size_t variable_index) { - const auto replica_index = 0; const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[replica_index]; - auto& variable = replica.variables[bucket_index.intra_bucket_index]; + auto& variable = bucket.variables[bucket_index.intra_bucket_index]; runGradCallbackForVariable(variable, [&](auto& grad) { REDUCER_CHECK( @@ -428,17 +424,16 @@ void Reducer::mark_variable_ready_sparse(size_t variable_index) { logger_, "Expected variable to have sparse gradient."); - // Sparse tensors cannot be grouped together with other sparse tensors - // in a single reduction operation like we can for dense tensors. - // Therefore, the `offsets` and `lengths` vectors in the bucket replica - // struct are empty, and there is no pre-existing accumulation tensor. - // Directly assign the sparse tensor to the `contents` field. - replica.contents = grad; - // If no DDP comm hook is registered, - // the allreduce only sums up the value, and a separate division is - // required. + // Sparse tensors cannot be grouped together with other sparse tensors in a + // single reduction operation like we can for dense tensors. 
Therefore, the + // `offsets` and `lengths` vectors in the bucket struct are empty, and + // there is no pre-existing accumulation tensor. + // Directly assign the sparse tensor to the `gradients` field. + bucket.gradients = grad; + // If no DDP comm hook is registered, the allreduce only sums up the + // value, and a separate division is required. if (comm_hook_ == nullptr) { - replica.contents.div_(div_factor_); + bucket.gradients.div_(div_factor_); } // The grad is modified in place and needs to be written back. return true; @@ -456,11 +451,11 @@ std::vector Reducer::get_grad_buckets( gradBuckets.emplace_back( i, buckets_.size(), - return_zero_tensors ? at::zeros_like(bucket.replicas[0].contents) - : bucket.replicas[0].contents, - bucket.replicas[0].offsets, - bucket.replicas[0].lengths, - bucket.replicas[0].sizes_vec, + return_zero_tensors ? at::zeros_like(bucket.gradients) + : bucket.gradients, + bucket.offsets, + bucket.lengths, + bucket.sizes_vec, variables_for_bucket); } return gradBuckets; @@ -693,16 +688,15 @@ void Reducer::all_reduce_local_used_map() { at::Tensor& Reducer::get_param_from_index(size_t index) { const auto& bucket_index = variable_locators_[index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[0]; - // Cannot simply access variable via replicas_[0][variable_index] since return - // value is used in runGradCallbackForVariable which does not accept const - // tensors. - auto& variable = replica.variables[bucket_index.intra_bucket_index]; + // Cannot simply access variable via `bucket.variables[variable_index]` since + // return value is used in `runGradCallbackForVariable()` which does not + // accept const tensors. + auto& variable = bucket.variables[bucket_index.intra_bucket_index]; return variable; } void Reducer::checkAndRaiseMarkedTwiceError(size_t index) { - // Something is wrong if all variables contained in this bucket replica have + // Something is wrong if all variables contained in this bucket have // already been marked as ready. // We don't expect the same variable to be marked ready twice. bool marked_twice = @@ -714,7 +708,7 @@ void Reducer::checkAndRaiseMarkedTwiceError(size_t index) { auto param_name = param_names_.find(index); const bool found_param_name = param_name != param_names_.end(); TORCH_INTERNAL_ASSERT( - ddp_debug_level_ == c10d::DistributedDebugLevel::OFF || + ddp_debug_level_ == c10d::DebugLevel::Off || found_param_name, "Expected to find parameter name in debug mode."); std::string paramInfo = c10::str( @@ -790,7 +784,6 @@ void Reducer::mark_variable_ready(size_t variable_index) { const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; - auto& replica = bucket.replicas[0]; set_divide_factor(); @@ -802,16 +795,13 @@ void Reducer::mark_variable_ready(size_t variable_index) { // TODO(@pietern): Make this work for both CPU/CUDA tensors. // When using CPU tensors we don't need to do this. - // // Record event so that we can wait for all of them. - // auto& event = replica.events[bucket_index.intra_bucket_index]; + // Record event so that we can wait for all of them. + // auto& event = bucket.events[bucket_index.intra_bucket_index]; // event.record(); // Check if this was the final gradient for this bucket. - if (--replica.pending == 0) { - // Kick off reduction if all replicas for this bucket are ready. 
- if (--bucket.pending == 0) { - mark_bucket_ready(bucket_index.bucket_index); - } + if (--bucket.pending == 0) { + mark_bucket_ready(bucket_index.bucket_index); } // Run finalizer function and kick off reduction for local_used_map once the @@ -849,31 +839,24 @@ c10::intrusive_ptr Reducer::run_comm_hook( } void Reducer::all_reduce_bucket(Bucket& bucket) { - std::vector tensors; - tensors.reserve(bucket.replicas.size()); - for (const auto& replica : bucket.replicas) { - // TODO(@pietern): Ensure proper synchronization with the CUDA events - // that recorded copies into this contents tensor. If these copies are - // executed on non-default streams, the current stream for the device - // that holds the contents tensor must wait on these events. - // - // As long as autograd uses the default stream for every device, - // these operations are implicitly sequenced, and we don't need to - // do any extra synchronization here. - // - tensors.push_back(replica.contents); - } - auto variables_for_bucket = get_variables_for_bucket(next_bucket_, bucket); + // TODO(@pietern): Ensure proper synchronization with the CUDA events + // that recorded copies into this `gradients` tensor. If these copies are + // executed on non-default streams, the current stream for the device + // that holds the `gradients` tensor must wait on these events. + // + // As long as autograd uses the default stream for every device, + // these operations are implicitly sequenced, and we don't need to + // do any extra synchronization here. + const auto& tensor = bucket.gradients; + GradBucket grad_bucket( next_bucket_, buckets_.size(), - tensors[0], - // Since we only support single-process single-device - // mode, there is always only one replica in the bucket. - bucket.replicas[0].offsets, - bucket.replicas[0].lengths, - bucket.replicas[0].sizes_vec, + tensor, + bucket.offsets, + bucket.lengths, + bucket.sizes_vec, variables_for_bucket); bucket.future_work = run_comm_hook(grad_bucket); } @@ -890,12 +873,11 @@ std::vector Reducer::get_variables_for_bucket( std::vector variables_for_bucket; variables_for_bucket.reserve(bucket.variable_indices.size()); for (const auto& variable_index : bucket.variable_indices) { - auto& replica = bucket.replicas[0]; // Grab bucket index where gradient is located using variable_locators_. auto& bucket_index_for_variable = variable_locators_[variable_index]; // Grab the actual model parameter. auto& variable = - replica.variables[bucket_index_for_variable.intra_bucket_index]; + bucket.variables[bucket_index_for_variable.intra_bucket_index]; variables_for_bucket.emplace_back(variable); } @@ -945,9 +927,7 @@ void Reducer::install_futures(c10::List> } } -void Reducer::initialize_buckets( - std::vector> bucket_indices, - std::vector per_bucket_sizes) { +void Reducer::initialize_buckets(std::vector> bucket_indices) { // If initialize_buckets is called inside DDP constructor, then // it does not matter rpc context ptr is nullptr or not, as grad // will not be mutated. @@ -977,10 +957,8 @@ void Reducer::initialize_buckets( // Iterate over buckets. const auto bucket_count = bucket_indices.size(); buckets_.reserve(bucket_count); - TORCH_INTERNAL_ASSERT(bucket_count == per_bucket_sizes.size()); for (const auto bucket_index : c10::irange(bucket_count)) { Bucket bucket; - bucket.bucket_size_limit = per_bucket_sizes[bucket_index]; // TODO(@pietern): Validate indices. // Must be non-empty, unique, and unique across buckets. 
@@ -1004,24 +982,23 @@ void Reducer::initialize_buckets( } } - BucketReplica replica; if (bucket.expect_sparse_gradient) { const auto variable_index = bucket_indices[bucket_index].front(); const auto& variable = params_[variable_index]; TORCH_INTERNAL_ASSERT(bucket_indices[bucket_index].size() == 1); - replica.variables = {variable}; + bucket.variables = {variable}; } else { at::TensorOptions options; // The start index of the variable in the flattened tensor. size_t offset = 0; - // Reserve enough space for the per-variable fields stored in bucket - // replica for efficiency. + // Reserve enough space for the per-variable fields stored in the bucket + // for efficiency. const size_t num_variables = bucket_indices[bucket_index].size(); - replica.variables.reserve(num_variables); - replica.offsets.reserve(num_variables); - replica.lengths.reserve(num_variables); - replica.sizes_vec.reserve(num_variables); + bucket.variables.reserve(num_variables); + bucket.offsets.reserve(num_variables); + bucket.lengths.reserve(num_variables); + bucket.sizes_vec.reserve(num_variables); // Iterate over bucket variables. for (const auto variable_index : bucket_indices[bucket_index]) { @@ -1047,29 +1024,29 @@ void Reducer::initialize_buckets( "All parameters in a bucket must have the same dtype."); } const auto length = variable.numel(); - replica.variables.push_back(variable); - replica.offsets.push_back(offset); - replica.lengths.push_back(length); - replica.sizes_vec.push_back(variable.sizes()); + bucket.variables.push_back(variable); + bucket.offsets.push_back(offset); + bucket.lengths.push_back(length); + bucket.sizes_vec.push_back(variable.sizes()); offset += length; } - // Allocate bucket contents tensor. - replica.contents = at::empty({static_cast(offset)}, options); + // Allocate the bucket's flattened `gradients` tensor. + bucket.gradients = at::empty({static_cast(offset)}, options); // Note: "Gradient Layout Contract" // - // Here, create views into the contents tensor for each variable's grad. - // Views serve as entry points to copy_ each grad's data in/out of the - // flat contents tensor. + // Here, create views into the `gradients` tensor for each variable's + // grad. Views serve as entry points to `copy_()` each grad's data in/out + // of the flattened `gradients` tensor. // // Gradients may have dense memory but non-row-major-contiguous strides // (e.g. channels_last or channels_last_3d). For coalesced accesses // during copy_s, it's beneficial for each view's layout to match its // grad's layout. // - // Specifically, we expect torch/csrc/autograd/AccumulateGrad.h produces - // grads that obey there "Gradient Layout Contract": + // Specifically, we expect torch/csrc/autograd/functions/accumulate_grad.h + // produces grads that obey the "Gradient Layout Contract": // (1) if variable.is_non_overlapping_and_dense(), the stashed grad's // strides match variable. // (2) else, stashed grad is rowmajor contiguous. @@ -1095,14 +1072,10 @@ void Reducer::initialize_buckets( // Checking just once won't catch if someone messes with // param layouts over time, but not messing with params after DDP // construction is already a documented constraint. - initialize_bucket_views(replica, replica.contents); + initialize_bucket_views(bucket); } - // Add bucket replica to enclosing bucket. - bucket.replicas.push_back(std::move(replica)); - // Map participating variables to this bucket. - // This is identical across replicas so we only need to do this once. 
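// Illustrative sketch of the offset/length bookkeeping performed by
// initialize_buckets() above: variables are laid out back-to-back and a single
// flat tensor sized to the running offset is allocated with the bucket's
// common dtype/device. (FlatBucket is a stand-in, not the real Bucket struct.)
#include <vector>
#include <ATen/ATen.h>

struct FlatBucket {
  at::Tensor gradients;                  // flattened 1-D storage
  std::vector<size_t> offsets, lengths;  // per-variable slice bookkeeping
};

FlatBucket flatten(const std::vector<at::Tensor>& variables) {
  FlatBucket bucket;
  size_t offset = 0;
  for (const auto& v : variables) {
    bucket.offsets.push_back(offset);
    bucket.lengths.push_back(v.numel());
    offset += v.numel();
  }
  // One allocation large enough to hold every gradient in the bucket.
  bucket.gradients =
      at::empty({static_cast<int64_t>(offset)}, variables.front().options());
  return bucket;
}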
size_t intra_bucket_index = 0; for (const auto variable_index : bucket_indices[bucket_index]) { TORCH_INTERNAL_ASSERT( @@ -1118,29 +1091,28 @@ void Reducer::initialize_buckets( } // (see Note: "Gradient Layout Contract" in initialize_buckets). -void Reducer::initialize_bucket_views( - Reducer::BucketReplica& replica, - at::Tensor& contents) { - for (const auto i : c10::irange(replica.variables.size())) { - auto& v = replica.variables[i]; - const auto offset = replica.offsets[i]; - const auto length = replica.lengths[i]; +void Reducer::initialize_bucket_views(Reducer::Bucket& bucket) { + const auto& gradients = bucket.gradients; + for (const auto i : c10::irange(bucket.variables.size())) { + auto& v = bucket.variables[i]; + const auto offset = bucket.offsets[i]; + const auto length = bucket.lengths[i]; if (v.is_non_overlapping_and_dense()) { // If the param's memory is dense, match its layout, anticipating // the autograd engine (AccumulateGrad) will also create gradients // matching its layout. - replica.bucket_views_in.push_back( - contents.as_strided(v.sizes(), v.strides(), offset)); + bucket.bucket_views_in.push_back( + gradients.as_strided(v.sizes(), v.strides(), offset)); } else { // Fall back to a C-style contiguous view, again anticipating // AccumulateGrad will do the same when stashing grads for non-dense // params. - replica.bucket_views_in.push_back( - contents.narrow(0, offset, length).view(v.sizes())); + bucket.bucket_views_in.push_back( + gradients.narrow(0, offset, length).view(v.sizes())); } // By default `bucket_views_out` and `bucket_views_in` are // essentially the same thing. - replica.bucket_views_out = replica.bucket_views_in; + bucket.bucket_views_out = bucket.bucket_views_in; // If gradient_as_bucket_view_ is set as true, then there are two cases to // handle: initialize_bucket_views could be called inside initialize_buckets @@ -1152,7 +1124,7 @@ void Reducer::initialize_bucket_views( // bucket_view, because grads should be kept as being undefined for globally // unused parameters. if (gradient_as_bucket_view_) { - auto& bucket_view = replica.bucket_views_in.back(); + auto& bucket_view = bucket.bucket_views_in.back(); runGradCallbackForVariable(v, [&](auto& grad) { if (grad.defined() && !grad.is_alias_of(bucket_view)) { bucket_view.copy_(grad); @@ -1169,24 +1141,24 @@ void Reducer::initialize_bucket_views( // (see Note: "Gradient Layout Contract" in initialize_buckets). void Reducer::populate_bucket_views_out( - Reducer::BucketReplica& replica, + Reducer::Bucket& bucket, at::Tensor& tensor) { - replica.bucket_views_out.clear(); - for (const auto i : c10::irange(replica.variables.size())) { - const auto& v = replica.variables[i]; - const auto offset = replica.offsets[i]; - const auto length = replica.lengths[i]; + bucket.bucket_views_out.clear(); + for (const auto i : c10::irange(bucket.variables.size())) { + const auto& v = bucket.variables[i]; + const auto offset = bucket.offsets[i]; + const auto length = bucket.lengths[i]; if (v.is_non_overlapping_and_dense()) { // If the param's memory is dense, match its layout, anticipating // the autograd engine (AccumulateGrad) will also create gradients // matching its layout. - replica.bucket_views_out.push_back( + bucket.bucket_views_out.push_back( tensor.as_strided(v.sizes(), v.strides(), offset)); } else { // Fall back to a C-style contiguous view, again anticipating // AccumulateGrad will do the same when stashing grads for non-dense // params. 
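// Illustrative sketch of the per-variable view choice made by
// initialize_bucket_views() above: match the parameter's own layout when it is
// non-overlapping and dense (per the "Gradient Layout Contract"), otherwise
// fall back to a row-major contiguous slice.
#include <ATen/ATen.h>

at::Tensor make_grad_view(
    const at::Tensor& flat_gradients,
    const at::Tensor& param,
    int64_t offset) {
  if (param.is_non_overlapping_and_dense()) {
    // copy_() between this view and the grad produced by AccumulateGrad is
    // then a coalesced, layout-preserving copy.
    return flat_gradients.as_strided(param.sizes(), param.strides(), offset);
  }
  return flat_gradients.narrow(0, offset, param.numel()).view(param.sizes());
}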
- replica.bucket_views_out.push_back( + bucket.bucket_views_out.push_back( tensor.narrow(0, offset, length).view(v.sizes())); } } @@ -1207,10 +1179,7 @@ void Reducer::reset_bucket_counting() { num_buckets_ready_ = 0; for (auto& bucket : buckets_) { - for (auto& replica : bucket.replicas) { - replica.pending = replica.variables.size(); - } - bucket.pending = bucket.replicas.size(); + bucket.pending = bucket.variables.size(); } if (static_graph_) { @@ -1260,7 +1229,7 @@ void Reducer::search_unused_parameters( // If the accumulator function is present in the graph, we know // a gradient will be computed for the corresponding parameter. if (seen.count(it.first) == 0) { - if (ddp_debug_level_ == c10d::DistributedDebugLevel::DETAIL) { + if (ddp_debug_level_ == c10d::DebugLevel::Detail) { const auto param_info = param_names_.find(it.second); TORCH_INTERNAL_ASSERT( param_info != param_names_.end(), @@ -1339,16 +1308,16 @@ void Reducer::prepare_for_backward( void Reducer::copy_bucket_to_grad( at::Tensor& variable, - Reducer::BucketReplica& replica, + Reducer::Bucket& bucket, size_t intra_bucket_index, bool global_unused) { - const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; + const auto& bucket_view = bucket.bucket_views_out[intra_bucket_index]; runGradCallbackForVariable(variable, [&](auto& grad) { // If a parameter is globally unused, we keep its grad untouched. if (!global_unused) { if (!grad.defined()) { // Creates grad according to the "Gradient Layout Contract" - // (see torch/csrc/grad/AccumulateGrad.h) + // (see torch/csrc/autograd/functions/accumulate_grad.h) grad = torch::autograd::utils::clone_obey_contract(bucket_view, variable); } else { @@ -1387,10 +1356,8 @@ std::vector Reducer::getUnmarkedParamIndicesForIteration() { // A bucket with one or more dense tensors needs to be unflattened. void Reducer::finalize_bucket_dense(Bucket& bucket) { - size_t replica_index = 0; - auto& replica = bucket.replicas[replica_index]; - for (const auto intra_bucket_index : c10::irange(replica.variables.size())) { - auto& variable = replica.variables[intra_bucket_index]; + for (const auto intra_bucket_index : c10::irange(bucket.variables.size())) { + auto& variable = bucket.variables[intra_bucket_index]; bool global_unused = false; // See Note [Skip allreducing local_used_map_dev] @@ -1434,15 +1401,14 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { RECORD_FUNCTION( "torch.distributed.ddp.reducer::copy_bucket_to_grad", std::vector({variable})); - copy_bucket_to_grad(variable, replica, intra_bucket_index, global_unused); + copy_bucket_to_grad(variable, bucket, intra_bucket_index, global_unused); } else { const auto& bucket_view_out = - replica.bucket_views_out[intra_bucket_index]; - auto& bucket_view_in = replica.bucket_views_in[intra_bucket_index]; - // If communication_hook is registered, bucket_view_out stores - // allreduced results in a newly allocated tensor, copy bucket_view_out - // back to bucket_view_in that referring to replica.content tensor and - // grad. + bucket.bucket_views_out[intra_bucket_index]; + auto& bucket_view_in = bucket.bucket_views_in[intra_bucket_index]; + // If a communication hook is registered, then `bucket_view_out` stores + // the allreduced results in a newly allocated tensor, so we copy + // `bucket_view_out` back to `bucket_view_in` for this gradient. 
if (!bucket_view_in.is_alias_of(bucket_view_out)) { bucket_view_in.copy_(bucket_view_out); } @@ -1484,7 +1450,8 @@ void Reducer::finalize_backward() { TORCH_INTERNAL_ASSERT(require_finalize_); require_finalize_ = false; - // Wait for asynchronous reduction to complete and unflatten contents. + // Wait for asynchronous reduction to complete, and unflatten the bucket's + // flattened `gradients` tensor. for (auto& bucket : buckets_) { // See Note [DDP Communication Hook] TORCH_INTERNAL_ASSERT( @@ -1495,13 +1462,12 @@ void Reducer::finalize_backward() { auto future_result = comm_hook_ == nullptr ? detail::parseCppCommHookResult(bucket.future_work->value()) : comm_hook_->parseHookResult(bucket.future_work->value()); - auto& replica = bucket.replicas[0]; if (bucket.expect_sparse_gradient) { - replica.contents.copy_(future_result); + bucket.gradients.copy_(future_result); } else { // Reinitialize only `bucket_views_out` with the future_result by // following the same logic in `initialize_buckets`. - populate_bucket_views_out(replica, future_result); + populate_bucket_views_out(bucket, future_result); } // Unset allreduce division factor, as it may change in next backwards pass @@ -1708,7 +1674,7 @@ bool Reducer::rebuild_buckets() { std::reverse(per_bucket_size_limits.begin(), per_bucket_size_limits.end()); } - if (ddp_debug_level_ != c10d::DistributedDebugLevel::OFF) { + if (ddp_debug_level_ != c10d::DebugLevel::Off) { TORCH_INTERNAL_ASSERT( rebuilt_bucket_indices.size() == per_bucket_size_limits.size()) LOG(INFO) << rebuilt_bucket_indices.size() @@ -1726,8 +1692,7 @@ bool Reducer::rebuild_buckets() { rebuilt_params_.clear(); rebuilt_param_indices_.clear(); - initialize_buckets( - std::move(rebuilt_bucket_indices), std::move(per_bucket_size_limits)); + initialize_buckets(std::move(rebuilt_bucket_indices)); return true; } @@ -1833,7 +1798,7 @@ void Reducer::ensure_prior_reduction_finished() { ": ", unmarked_param_indices); - if (ddp_debug_level_ == DistributedDebugLevel::OFF) { + if (ddp_debug_level_ == DebugLevel::Off) { // Without debug mode, log unmarked_param_indices, as well as // recommendation to use debug mode to print parameter names. kBaseErrorMsg += unmarked_param_indices_info; @@ -1932,7 +1897,7 @@ namespace { // composite key of a tensor's type identifier and its device. struct BucketKey { BucketKey(c10::ScalarType type, c10::Device device) - : type(std::move(type)), device(std::move(device)) {} + : type(type), device(device) {} const c10::ScalarType type; const c10::Device device; @@ -2068,7 +2033,7 @@ compute_bucket_assignment_by_size( bucket_indices.reserve(result.size()); std::vector per_bucket_size_limits; per_bucket_size_limits.reserve(result.size()); - for (const auto & bucket_indices_with_size : result) { + for (const auto& bucket_indices_with_size : result) { bucket_indices.emplace_back(std::get<0>(bucket_indices_with_size)); per_bucket_size_limits.emplace_back(std::get<1>(bucket_indices_with_size)); } @@ -2081,6 +2046,47 @@ void verify_params_across_processes( const c10::intrusive_ptr& process_group, const std::vector& params, const c10::optional>& logger) { + + // First verify number of parameters to avoid inconsistent inputs into + // broadcast which can cause a crash. 
+ // See https://github.com/pytorch/pytorch/issues/73547 + + at::TensorOptions param_size_options; + param_size_options = param_size_options.dtype(at::kLong); + param_size_options = param_size_options.device(params[0].device()); + // Note: Not using tensor building API because of + // https://github.com/pytorch/pytorch/issues/74114 + at::Tensor param_size_tensor = at::tensor( + {static_cast(params.size())}, param_size_options); + + // Allgather and verify parameter size. + std::vector> param_size_output_tensors; + param_size_output_tensors.emplace_back(std::vector{}); + auto world_size = process_group->getSize(); + for (size_t i = 0 ; i < world_size ; ++i) { + param_size_output_tensors.front().emplace_back( + at::empty_like(param_size_tensor) + ); + } + + std::vector param_size_vec{param_size_tensor}; + process_group->allgather(param_size_output_tensors, param_size_vec)->wait(); + auto result_size_tensors = param_size_output_tensors.front(); + for (size_t i = 0; i < world_size ; ++i ) { + auto param_size_for_rank = result_size_tensors[i][0].item(); + TORCH_CHECK( + param_size_for_rank == params.size(), + c10::str( + "DDP expects same model across all ranks, but Rank ", + process_group->getRank(), + " has ", params.size(), " params, while rank ", i, + " has inconsistent ", param_size_for_rank, + " params." + ) + ); + } + + // Continue with parameter shape verification. size_t i = 0; for (const auto& t : params) { i += 2 * t.dim(); @@ -2114,10 +2120,9 @@ void verify_params_across_processes( i = 0; for (const auto p : c10::irange(params.size())) { const auto& t = params[p]; - // I'd like to include which process we are in the message, - // but ProcessGroup::getRank is not public! for (const auto& sz : t.sizes()) { - auto msg = c10::str("params[", p, "] in this process", + auto msg = c10::str("[", process_group->getRank(), + "]: params[", p, "] in this process", " with sizes ", t.sizes(), " appears not to match sizes of the same param in process 0."); diff --git a/torch/csrc/distributed/c10d/reducer.hpp b/torch/csrc/distributed/c10d/reducer.hpp index 541e2c0802a8..cc14a1eb2be6 100644 --- a/torch/csrc/distributed/c10d/reducer.hpp +++ b/torch/csrc/distributed/c10d/reducer.hpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,77 +29,10 @@ constexpr int kDefaultFirstBucketBytes = int(1024 * 1024); constexpr int kDefaultBucketBytesCap = int(25 * 1024 * 1024); // Collect runtime stats once for every kDDPRuntimeLoggingSampleRate iterations. constexpr int kDDPRuntimeLoggingSampleRate = 100; -constexpr int kUnsetTime = -1; - -inline int64_t current_time_in_nanos() { - return torch::profiler::impl::getTime(); -} // Forward declaration class Logger; -class TORCH_API Timer { - private: - // The timestamp of forward call start time in each iteration. - int64_t forward_start_time = kUnsetTime; - // The timestamp of backward computation start and end time in each - // iteration. - int64_t backward_compute_start_time = kUnsetTime; - int64_t backward_compute_end_time = kUnsetTime; - // The timestamp of first communication call start time in each iteration. - int64_t backward_comm_start_time = kUnsetTime; - // The timestamp of last communication call end time in each iteration. - int64_t backward_comm_end_time = kUnsetTime; - public: - enum class Event { - kForwardStart, - kBackwardComputeStart, - kBackwardComputeEnd, - kBackwardCommStart, - kBackwardCommEnd, - }; - - // Record the current event, i.e., mark it as having occurred now. 
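// Distilled sketch of the parameter-count verification added above: every
// rank allgathers its local count as a one-element kLong tensor and checks
// that all counts agree before the per-parameter shape broadcast runs.
// (The include path and the default CPU device here are assumptions; the real
// check places the tensor on params[0].device().)
#include <vector>
#include <ATen/ATen.h>
#include <c10d/ProcessGroup.hpp>

void check_param_count_consistency(
    const c10::intrusive_ptr<c10d::ProcessGroup>& pg,
    int64_t local_count) {
  auto options = at::TensorOptions().dtype(at::kLong);
  at::Tensor local = at::tensor({local_count}, options);

  std::vector<std::vector<at::Tensor>> outputs(1);
  for (int i = 0; i < pg->getSize(); ++i) {
    outputs.front().push_back(at::empty_like(local));
  }
  std::vector<at::Tensor> inputs{local};
  pg->allgather(outputs, inputs)->wait();

  for (int i = 0; i < pg->getSize(); ++i) {
    TORCH_CHECK(
        outputs.front()[i].item<int64_t>() == local_count,
        "Rank ", i, " reports a different number of parameters.");
  }
}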
Default - // CPU implementation. - virtual void record(Event event) { - getTimeRef(event) = current_time_in_nanos(); - } - - // Return the difference between when two events occurred, in nanoseconds. - // Or nullopt if one of them hasn't been recorded. - virtual c10::optional measureDifference(Event start, Event end) = 0; - - virtual ~Timer() = default; - - // Return host-side timestamp, or nullopt if it has not yet been recorded. - c10::optional getTimestamp(Event event) { - auto time = getTimeRef(event); - if (time == kUnsetTime) { - return c10::nullopt; - } else { - return time; - } - } - - // Return host-side time member variable corresponding to the given event. - int64_t& getTimeRef(Event event) { - switch (event) { - case Event::kForwardStart: - return forward_start_time; - case Event::kBackwardComputeStart: - return backward_compute_start_time; - case Event::kBackwardComputeEnd: - return backward_compute_end_time; - case Event::kBackwardCommStart: - return backward_comm_start_time; - case Event::kBackwardCommEnd: - return backward_comm_end_time; - default: - TORCH_INTERNAL_ASSERT(false); - } - } -}; - // Local accumulator type for a single bucket. struct BucketAccumulator { std::vector indices; @@ -105,14 +40,13 @@ struct BucketAccumulator { size_t size_limit = 0; }; -C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device); - class TORCH_API Reducer { public: - // The constructor takes a list of variables for every model replica. - // The bucket assignment for this reducer is specified as a list of - // buckets, each of which is specified as a list of indices into the - // variables list for **a single replica** (i.e. `variables[0]`). + // The constructor takes a list of variables (i.e. parameters) for this + // process's single model replica (as DDP assumes single-process + // single-device). The bucket assignment for this reducer, `bucket_indices`, + // is specified as a list of buckets, each of which is specified as a list of + // indices into the bucket's `variables` list. explicit Reducer( std::vector params, std::vector> bucket_indices, @@ -122,18 +56,16 @@ class TORCH_API Reducer { int64_t bucket_bytes_cap, bool find_unused_parameters, bool gradient_as_bucket_view, - std::unordered_map paramNames, + std::unordered_map param_names, int64_t first_bucket_bytes_cap); ~Reducer() noexcept(false); - // To (re-)initialize bucket assignment, pass a list of buckets, each - // of which is specified by a list of indices in the variables list. + // To (re-)initialize bucket assignment, pass a list of buckets, each of + // which is specified by a list of indices in the bucket's `variables` list. // This function performs validation that the variables within a bucket // all live on the same device and have the same dimensionality. - void initialize_buckets( - std::vector> bucket_indices, - std::vector per_bucket_sizes); + void initialize_buckets(std::vector> bucket_indices); // This function is called when the forward function has produced an output, // and the user wishes to reduce gradients in the backwards pass. 
@@ -311,7 +243,7 @@ class TORCH_API Reducer { void mark_bucket_ready(size_t bucket_index); - void finalize_bucket_dense(Bucket& replica); + void finalize_bucket_dense(Bucket& bucket); void finalize_backward(); @@ -344,117 +276,102 @@ class TORCH_API Reducer { #endif void runGradCallbackForVariable(at::Tensor& variable, GradCallback&& cb); - // A bucket replica represents [1..N] gradients to be reduced, - // with the same dtype, on the same device. - // - // Batching gradients together before reducing them can result in lower - // overhead and/or faster time to completion. Only gradients of the same type - // and on the same device can be batched. The tensor that represents the - // flattened gradient uses the same type and is placed on the same device. - // Buckets are filled as the gradients they hold are computed (triggered by - // autograd hooks). Buckets are reduced in a predetermined order that is - // identical across processes. - struct BucketReplica { - // Flattened (1 dimensional) contents of bucket. - at::Tensor contents; - - // Views into contents for each grad. Each view will be created with - // layout (sizes + strides) matching the grad's expected layout - // ("Gradient Layout Contract" in torch/csrc/autograd/AccumulateGrad.h). - // `bucket_views_in[i].copy_(grad)` and - // `grad.copy_(bucket_views_out[i])` - // provide convenient ways to move grad data in/out of contents. - // The reason we keep two states for bucket_views is that if DDP - // communication hook was registered, `bucket_views_out` could be - // re-initialized with the value of hook's `future_work`. We still need to - // keep a separate view reference to replica's original contents for - // `bucket_views_in[i].copy_(grad)` call. - std::vector bucket_views_in; - std::vector bucket_views_out; - - // Variables that contribute to this bucket replica. Use refcounted value - // here so that we can easily unflatten the bucket contents into the - // participating variables after reduction has completed. - std::vector variables; - - // Per-variable offset/length into the flat bucket contents tensor and grad - // bucket. - std::vector offsets; - std::vector lengths; - - // Per-variable sizes into the grad bucekt. - std::vector sizes_vec; - - // Number of tensors to be added before this bucket is complete. - // This is reset to `variables.size()` every iteration. - size_t pending; - - // TODO(@pietern) - // Memory copies from gradient tensors into the bucket are potentially - // done on different CUDA streams. We record an event for every copy - // so that we can synchronize with them prior to kicking off the reduction. - // std::vector events; - }; - - // This function is called inside `initialize_buckets`, it initializes both - // bucket_views_in and bucket_views_out into the contents tensor for each - // variable's grad. Views serve as entry points to copy_ each grad's data - // in/out of the flat contents tensor. - void initialize_bucket_views(BucketReplica& replica, at::Tensor& contents); + // This function is called inside `initialize_buckets()`. It initializes both + // `bucket_views_in` and `bucket_views_out` with views for each variable's + // gradient into the bucket's flattened `gradients` tensor. Views serve as + // entry points to `copy_()` each grad's data in/out of the flattened + // `gradients` tensor. 
+ void initialize_bucket_views(Bucket& bucket); // This function is called inside `finalize_backward`, it happens only if // DDP communication hook was registered to recreate just bucket_views_out // with the result of `future_work`. - void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + void populate_bucket_views_out(Bucket& bucket, at::Tensor& tensor); // If gradient_as_bucket_view_ is false, after allreduce buckets, // copy bucket results back to grads. void copy_bucket_to_grad( at::Tensor& variable, - Reducer::BucketReplica& replica, + Reducer::Bucket& bucket, size_t intra_bucket_index, bool global_unused); // Check layout of grad and bucket_view before copying the grad to bucket. void check_grad_layout(const at::Tensor& grad, const at::Tensor& bucket_view); - // A bucket holds N bucket replicas (1 per model replica). - // - // If every bucket in this struct is ready, the reduction can be kicked off. - // One bucket per replica. Reduction is kicked off when every bucket is ready. - // + // A bucket contains [1..N] gradients to be reduced, where the gradients + // have the same dtype and device. + // Coalescing gradients together before reducing can result in lower overhead + // and/or faster time to completion. Coalescing requires the constituent + // gradients to have the same dtype and device, and the resulting flattened + // tensor uses that common dtype and device. The flattened tensor is filled + // as the corresponding gradients are computed (triggered by autograd hooks), + // and the buckets are reduced in a predetermined order consistent across + // processes. struct Bucket { - std::vector replicas; + // Gradients of the bucket flattened into a 1-dimensional tensor + at::Tensor gradients; + + // Views into the `gradients` tensor for each individual gradient + // Each view is created with layout (size and stride) matching the + // gradient's expected layout (see the "Gradient Layout Contract" in + // torch/csrc/autograd/functions/accumulate_grad.h). + // `bucket_views_in[i].copy_(grad)` and `grad.copy_(bucket_views_out[i])` + // provide convenient ways to copy gradient data in/out of `gradients`, + // respectively. + // We keep both `bucket_views_in` and `bucket_views_out` because + // registering a DDP communication hook may re-initialize + // `bucket_views_out` with the value of the hook's `future_work` but we + // still need separate views into the bucket's original flattened gradient + // to copy in gradient data. + std::vector bucket_views_in; + std::vector bucket_views_out; - // Global indices of participating variables in the bucket - std::vector variable_indices; + // Variables whose gradients are held in this bucket + // We use refcounted tensors here so that we can easily unflatten the + // bucket's flattened `gradients` tensor into the participating variables + // after reduction has completed. + std::vector variables; - // Number of replicas to be marked done before this bucket is ready. + // Per-variable offset/length into the flattened `gradients` tensor and + // the corresponding `GradBucket` instance for communication hooks + std::vector offsets; + std::vector lengths; + + // Per-variable sizes slicing into the bucket's `gradients` tensor + std::vector sizes_vec; + + // Number of gradients left to be computed before the bucket is ready to + // be reduced size_t pending; - // Keep future work handle around DDP comm hook. - // If no hook is registered, a temporary vanilla allreduce hook will be - // used. 
+ // Global indices of participating variables in the bucket + std::vector variable_indices; + + // Future work handle for DDP communication hook + // If no hook is registered, a temporary vanilla allreduce hook is used. c10::intrusive_ptr future_work; - // If this bucket should expect a single sparse gradient. - // Implies: replicas[i].variables.size() == 1. + // If this bucket should expect a single sparse gradient + // If `true`, then this implies that `bucket.variables.size() == 1`. bool expect_sparse_gradient = false; - // "Limit" of cumulative parameter sizes that this bucket manages. It is - // actually a soft limit because we don't shard parameters across buckets - // so a single parameter may push it over the cap. - size_t bucket_size_limit; + + // TODO(@pietern) + // Memory copies from gradient tensors into the bucket are potentially + // done on different CUDA streams. We record an event for every copy + // so that we can synchronize with them prior to kicking off the reduction. + // std::vector events; + }; std::vector buckets_; - // A variable locator locates a particular variable in the bucket - // structure. The `bucket_index` field points to the bucket in the `buckets_` - // vector. The `intra_bucket_index` field points to the index of the variable - // in any of the vector fields in the bucket replica. + // A variable locator locates a particular variable in the reducer's buckets struct VariableLocator { - // Index into the `buckets_` variable. + // Index of the bucket containing the variable in the `buckets_` vector size_t bucket_index; - // Index of parameter in single bucket replica. + // Index of the variable in the bucket, which may be used consistently + // across `bucket_views_in`, `bucket_views_out`, `variables`, `offsets`, + // `lengths`, `sizes_vec`, and `variable_indices` in `Bucket` size_t intra_bucket_index; VariableLocator() = default; @@ -568,7 +485,7 @@ class TORCH_API Reducer { std::unique_ptr comm_hook_; // Debug level setting. It is parsed once when Reducer is constructed, and // remains the same across a single invocation of DDP training. - DistributedDebugLevel ddp_debug_level_; + DebugLevel ddp_debug_level_; // Mapping of variable index to fully qualified name of model to notify users // about errors when certain parameters do not get gradient. std::unordered_map param_names_; diff --git a/torch/csrc/distributed/c10d/reducer_cuda.cpp b/torch/csrc/distributed/c10d/reducer_cuda.cpp index b836cddd8017..a1c570da5d59 100644 --- a/torch/csrc/distributed/c10d/reducer_cuda.cpp +++ b/torch/csrc/distributed/c10d/reducer_cuda.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/torch/csrc/distributed/c10d/reducer_timer.hpp b/torch/csrc/distributed/c10d/reducer_timer.hpp new file mode 100644 index 000000000000..ba696383b88e --- /dev/null +++ b/torch/csrc/distributed/c10d/reducer_timer.hpp @@ -0,0 +1,75 @@ +#pragma once +#include + +namespace c10d { +constexpr int kUnsetTime = -1; + +inline int64_t current_time_in_nanos() { + return torch::profiler::impl::getTime(); +} + +class TORCH_API Timer { + private: + // The timestamp of forward call start time in each iteration. + int64_t forward_start_time = kUnsetTime; + // The timestamp of backward computation start and end time in each + // iteration. + int64_t backward_compute_start_time = kUnsetTime; + int64_t backward_compute_end_time = kUnsetTime; + // The timestamp of first communication call start time in each iteration. 
+ int64_t backward_comm_start_time = kUnsetTime; + // The timestamp of last communication call end time in each iteration. + int64_t backward_comm_end_time = kUnsetTime; + + public: + enum class Event { + kForwardStart, + kBackwardComputeStart, + kBackwardComputeEnd, + kBackwardCommStart, + kBackwardCommEnd, + }; + + // Record the current event, i.e., mark it as having occurred now. Default + // CPU implementation. + virtual void record(Event event) { + getTimeRef(event) = current_time_in_nanos(); + } + + // Return the difference between when two events occurred, in nanoseconds. + // Or nullopt if one of them hasn't been recorded. + virtual c10::optional measureDifference(Event start, Event end) = 0; + + virtual ~Timer() = default; + + // Return host-side timestamp, or nullopt if it has not yet been recorded. + c10::optional getTimestamp(Event event) { + auto time = getTimeRef(event); + if (time == kUnsetTime) { + return c10::nullopt; + } else { + return time; + } + } + + // Return host-side time member variable corresponding to the given event. + int64_t& getTimeRef(Event event) { + switch (event) { + case Event::kForwardStart: + return forward_start_time; + case Event::kBackwardComputeStart: + return backward_compute_start_time; + case Event::kBackwardComputeEnd: + return backward_compute_end_time; + case Event::kBackwardCommStart: + return backward_comm_start_time; + case Event::kBackwardCommEnd: + return backward_comm_end_time; + default: + TORCH_INTERNAL_ASSERT(false); + } + } +}; + +C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device); +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp index c99950f85895..acd819ab631c 100644 --- a/torch/csrc/distributed/c10d/socket.cpp +++ b/torch/csrc/distributed/c10d/socket.cpp @@ -1,4 +1,4 @@ -// Copyright (c) Facebook, Inc. and its affiliates. +// Copyright (c) Meta Platforms, Inc. and its affiliates. // All rights reserved. 
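// Rough sketch of registering a device-specific Timer with the TimerRegistry
// declared in reducer_timer.hpp above, assuming the usual c10 typed-registry
// macros (C10_DEFINE_TYPED_REGISTRY elsewhere plus C10_REGISTER_TYPED_CLASS
// here). MyDeviceTimer and the PrivateUse1 key are hypothetical.
#include <torch/csrc/distributed/c10d/reducer_timer.hpp>

namespace c10d {
namespace {

class MyDeviceTimer : public Timer {
 public:
  explicit MyDeviceTimer(c10::Device /* device */) {}

  c10::optional<int64_t> measureDifference(Event start, Event end) override {
    auto start_ts = getTimestamp(start);
    auto end_ts = getTimestamp(end);
    if (!start_ts || !end_ts) {
      return c10::nullopt;  // one of the two events was never recorded
    }
    return *end_ts - *start_ts;
  }
};

} // namespace

// Keyed by device type; the reducer can then create a timer for parameters
// living on that device via the registry.
C10_REGISTER_TYPED_CLASS(
    TimerRegistry, c10::DeviceType::PrivateUse1, MyDeviceTimer);

} // namespace c10d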
// // This source code is licensed under the BSD-style license found in the @@ -273,7 +273,7 @@ std::unique_ptr SocketImpl::accept() const { addr.ai_addr = addr_ptr; addr.ai_addrlen = addr_len; - C10D_INFO("The server socket on {} has accepted a connection from {}.", *this, addr); + C10D_DEBUG("The server socket on {} has accepted a connection from {}.", *this, addr); auto impl = std::make_unique(hnd); @@ -414,17 +414,17 @@ SocketListenOp::SocketListenOp(std::uint16_t port, const SocketOptions& opts) std::unique_ptr SocketListenOp::run() { if (opts_->prefer_ipv6()) { - C10D_INFO("The server socket will attempt to listen on an IPv6 address."); + C10D_DEBUG("The server socket will attempt to listen on an IPv6 address."); if (tryListen(AF_INET6)) { return std::move(socket_); } - C10D_INFO("The server socket will attempt to listen on an IPv4 address."); + C10D_DEBUG("The server socket will attempt to listen on an IPv4 address."); if (tryListen(AF_INET)) { return std::move(socket_); } } else { - C10D_INFO("The server socket will attempt to listen on an IPv4 or IPv6 address."); + C10D_DEBUG("The server socket will attempt to listen on an IPv4 or IPv6 address."); if (tryListen(AF_UNSPEC)) { return std::move(socket_); } @@ -459,7 +459,7 @@ bool SocketListenOp::tryListen(int family) { addrinfo_ptr result{naked_result}; for (::addrinfo* addr = naked_result; addr != nullptr; addr = addr->ai_next) { - C10D_INFO("The server socket is attempting to listen on {}.", *addr); + C10D_DEBUG("The server socket is attempting to listen on {}.", *addr); if (tryListen(*addr)) { return true; } @@ -534,8 +534,7 @@ class SocketConnectOp { enum class ConnectResult { Success, Error, - Retry, - TimeOut + Retry }; public: @@ -550,6 +549,8 @@ class SocketConnectOp { ConnectResult tryConnectCore(const ::addrinfo& addr); + [[noreturn]] void throwTimeoutError() const; + template void recordError(fmt::string_view format, Args&&... args) { auto msg = fmt::vformat(format, fmt::make_format_args(args...)); @@ -576,25 +577,25 @@ SocketConnectOp::SocketConnectOp(const std::string& host, std::unique_ptr SocketConnectOp::run() { if (opts_->prefer_ipv6()) { - C10D_INFO("The client socket will attempt to connect to an IPv6 address of ({}, {}).", - host_, - port_); + C10D_DEBUG("The client socket will attempt to connect to an IPv6 address of ({}, {}).", + host_, + port_); if (tryConnect(AF_INET6)) { return std::move(socket_); } - C10D_INFO("The client socket will attempt to connect to an IPv4 address of ({}, {}).", - host_, - port_); + C10D_DEBUG("The client socket will attempt to connect to an IPv4 address of ({}, {}).", + host_, + port_); if (tryConnect(AF_INET)) { return std::move(socket_); } } else { - C10D_INFO("The client socket will attempt to connect to an IPv4 or IPv6 address of ({}, {}).", - host_, - port_); + C10D_DEBUG("The client socket will attempt to connect to an IPv4 or IPv6 address of ({}, {}).", + host_, + port_); if (tryConnect(AF_UNSPEC)) { return std::move(socket_); @@ -612,58 +613,66 @@ std::unique_ptr SocketConnectOp::run() { } bool SocketConnectOp::tryConnect(int family) { - ::addrinfo hints{}, *naked_result = nullptr; - + ::addrinfo hints{}; hints.ai_flags = AI_V4MAPPED | AI_ALL | AI_NUMERICSERV; hints.ai_family = family; hints.ai_socktype = SOCK_STREAM; - int r = ::getaddrinfo(host_, port_.c_str(), &hints, &naked_result); - if (r != 0) { - const char* gai_err = ::gai_strerror(r); - - recordError("The {}network addresses of ({}, {}) cannot be retrieved (gai error: {} - {}).", - family == AF_INET ? 
"IPv4 " : family == AF_INET6 ? "IPv6 " : "", - host_, - port_, - r, - gai_err); - - return false; - } - - addrinfo_ptr result{naked_result}; - deadline_ = Clock::now() + opts_->connect_timeout(); + std::size_t retry_attempt = 1; + bool retry; // NOLINT(cppcoreguidelines-init-variables) do { retry = false; errors_.clear(); - for (::addrinfo* addr = naked_result; addr != nullptr; addr = addr->ai_next) { - C10D_INFO("The client socket is attempting to connect to {}.", *addr); + ::addrinfo *naked_result = nullptr; + // patternlint-disable cpp-dns-deps + int r = ::getaddrinfo(host_, port_.c_str(), &hints, &naked_result); + if (r != 0) { + const char* gai_err = ::gai_strerror(r); + + recordError("The {}network addresses of ({}, {}) cannot be retrieved (gai error: {} - {}).", + family == AF_INET ? "IPv4 " : family == AF_INET6 ? "IPv6 " : "", + host_, + port_, + r, + gai_err); + retry = true; + } else { + addrinfo_ptr result{naked_result}; + + for (::addrinfo* addr = naked_result; addr != nullptr; addr = addr->ai_next) { + C10D_TRACE("The client socket is attempting to connect to {}.", *addr); + + ConnectResult cr = tryConnect(*addr); + if (cr == ConnectResult::Success) { + return true; + } - ConnectResult cr = tryConnect(*addr); - if (cr == ConnectResult::Success) { - return true; + if (cr == ConnectResult::Retry) { + retry = true; + } } + } - if (cr == ConnectResult::TimeOut) { - auto msg = fmt::format( - "The client socket has timed out after {} while trying to connect to ({}, {}).", - opts_->connect_timeout(), - host_, - port_); + if (retry) { + if (Clock::now() < deadline_ - delay_duration_) { + // Prevent our log output to be too noisy, warn only every 30 seconds. + if (retry_attempt == 30) { + C10D_INFO("No socket on ({}, {}) is listening yet, will retry.", host_, port_); - C10D_ERROR(msg); + retry_attempt = 0; + } - throw TimeoutError{msg}; - } + // Wait one second to avoid choking the server. + delay(delay_duration_); - if (cr == ConnectResult::Retry) { - retry = true; + retry_attempt++; + } else { + throwTimeoutError(); } } } while (retry); @@ -673,7 +682,7 @@ bool SocketConnectOp::tryConnect(int family) { SocketConnectOp::ConnectResult SocketConnectOp::tryConnect(const ::addrinfo& addr) { if (Clock::now() >= deadline_) { - return ConnectResult::TimeOut; + throwTimeoutError(); } SocketImpl::Handle hnd = ::socket(addr.ai_family, addr.ai_socktype, addr.ai_protocol); @@ -698,16 +707,9 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnect(const ::addrinfo& add // Retry if the server is not yet listening or if its backlog is exhausted. if (err == std::errc::connection_refused || err == std::errc::connection_reset) { - C10D_WARNING("The server socket on {} is not yet listening {}, will retry.", addr, err); + C10D_TRACE("The server socket on {} is not yet listening {}, will retry.", addr, err); - if (Clock::now() < deadline_ - delay_duration_) { - // Wait a little to avoid choking the server. - delay(delay_duration_); - - return ConnectResult::Retry; - } else { - return ConnectResult::TimeOut; - } + return ConnectResult::Retry; } else { recordError("The client socket has failed to connect to {} {}.", addr, err); @@ -715,10 +717,6 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnect(const ::addrinfo& add } } - if (cr == ConnectResult::TimeOut) { - return cr; - } - socket_->closeOnExec(); // TODO: Remove once we fully migrate to non-blocking mode. 
@@ -750,7 +748,7 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnectCore(const ::addrinfo& Duration remaining = deadline_ - Clock::now(); if (remaining <= Duration::zero()) { - return ConnectResult::TimeOut; + throwTimeoutError(); } ::pollfd pfd{}; @@ -761,7 +759,7 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnectCore(const ::addrinfo& r = pollFd(&pfd, 1, static_cast(ms.count())); if (r == 0) { - return ConnectResult::TimeOut; + throwTimeoutError(); } if (r == -1) { return ConnectResult::Error; @@ -785,6 +783,18 @@ SocketConnectOp::ConnectResult SocketConnectOp::tryConnectCore(const ::addrinfo& } } +void SocketConnectOp::throwTimeoutError() const { + auto msg = fmt::format( + "The client socket has timed out after {} while trying to connect to ({}, {}).", + opts_->connect_timeout(), + host_, + port_); + + C10D_ERROR(msg); + + throw TimeoutError{msg}; +} + } // namespace void Socket::initialize() { diff --git a/torch/csrc/distributed/c10d/socket.h b/torch/csrc/distributed/c10d/socket.h index e247a2a0816a..c26900760fbe 100644 --- a/torch/csrc/distributed/c10d/socket.h +++ b/torch/csrc/distributed/c10d/socket.h @@ -1,4 +1,4 @@ -// Copyright (c) Facebook, Inc. and its affiliates. +// Copyright (c) Meta Platforms, Inc. and its affiliates. // All rights reserved. // // This source code is licensed under the BSD-style license found in the diff --git a/torch/csrc/distributed/rpc/agent_utils.cpp b/torch/csrc/distributed/rpc/agent_utils.cpp index 45ffb2903bb0..dae9c162fe9d 100644 --- a/torch/csrc/distributed/rpc/agent_utils.cpp +++ b/torch/csrc/distributed/rpc/agent_utils.cpp @@ -41,6 +41,113 @@ std::unordered_map collectNames( return nameToId; } +std::vector splitString( + const std::string& s, + const std::string& delim) { + std::vector tokens; + size_t start = 0; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + size_t end; + // Iterate through each delimiter + while ((end = s.find(delim, start)) != std::string::npos) { + tokens.emplace_back(s.substr(start, end - start)); + start = end + delim.length(); + } + tokens.emplace_back(s.substr(start)); + return tokens; +} + +const std::string allWorkerInfosKey = "_ALL_WORKER_INFOS"; + +std::unordered_map collectCurrentNames( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName) { + std::vector selfNameVector( + (uint8_t*)selfName.c_str(), + (uint8_t*)selfName.c_str() + selfName.length()); + + // Check that ID does not already exist and set {ID : NAME} + std::vector resultVector = store.compareSet( + c10::to_string(selfId), std::vector(), selfNameVector); + TORCH_CHECK( + resultVector == selfNameVector, + "RPC worker id ", + selfId, + " is not unique. Worker ", + resultVector, + " and already has ID and ", + selfNameVector, + " cannot be added."); + + store.set(c10::to_string(selfId), selfNameVector); + + std::unordered_map nameToId; + nameToId.emplace(selfName, selfId); + + // Check to see if there is list of worker names in the store + bool worker_names_available = + store.check(std::vector{allWorkerInfosKey}); + std::string allWorkerInfos; + if (worker_names_available) { + // Get the current list of workers + std::vector allWorkerInfosKeyVector = store.get(allWorkerInfosKey); + allWorkerInfos = std::string( + (char*)allWorkerInfosKeyVector.data(), allWorkerInfosKeyVector.size()); + // workerInfos are comma separated with a comma at the end (e.g. + // "Name1-Rank1,Name2-Rank2,Name3-Rank2,") parse list of workers. 
+ if (!allWorkerInfos.empty()) { + for (const std::string& workerInfoString : splitString( + allWorkerInfos.substr(0, allWorkerInfos.size() - 1), ",")) { + auto workerInfoVec = splitString(workerInfoString, "-"); + std::string workerName = workerInfoVec.at(0); + int workerId = std::stoi(workerInfoVec.at(1)); + + TORCH_CHECK( + nameToId.find(workerName) == nameToId.end(), + "RPC worker name ", + workerName, + " is not unique. Workers ", + nameToId.find(workerName)->second, + " and ", + workerId, + " share the same name."); + + nameToId.emplace(workerName, workerId); + } + } + } + // Add own name to worker list + allWorkerInfos = fmt::format("{}{}-{},", allWorkerInfos, selfName, selfId); + std::vector allWorkerInfosVector( + (uint8_t*)allWorkerInfos.c_str(), + (uint8_t*)allWorkerInfos.c_str() + allWorkerInfos.length()); + store.set(allWorkerInfosKey, allWorkerInfosVector); + + return nameToId; +} + +void removeCurrentName( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName) { + // Get current list of names/ranks + std::vector allWorkerInfosKeyVector = store.get(allWorkerInfosKey); + std::string allWorkerInfos = std::string( + (char*)allWorkerInfosKeyVector.data(), allWorkerInfosKeyVector.size()); + + // Remove the current name and rank + std::string str_to_erase = fmt::format("{}-{},", selfName, selfId); + int start_position_to_erase = allWorkerInfos.find(str_to_erase); + allWorkerInfos.erase(start_position_to_erase, str_to_erase.length()); + + // Set the new data + std::vector newAllWorkerInfosVector( + (uint8_t*)allWorkerInfos.c_str(), + (uint8_t*)allWorkerInfos.c_str() + allWorkerInfos.length()); + store.set(allWorkerInfosKey, newAllWorkerInfosVector); +} + const string storeKeyBarrierId = "_ID_"; const string storeKeyProcessCount = "PROCESS_COUNT"; const string storeKeyActiveCallCount = "ACTIVE_CALLS"; diff --git a/torch/csrc/distributed/rpc/agent_utils.h b/torch/csrc/distributed/rpc/agent_utils.h index befa26b86037..0288e0c063bb 100644 --- a/torch/csrc/distributed/rpc/agent_utils.h +++ b/torch/csrc/distributed/rpc/agent_utils.h @@ -16,6 +16,24 @@ std::unordered_map collectNames( const std::string& selfName, const int worldSize); +// Ranks in dynamic RPC groups will initially call into this to establish the +// name-to-id mapping for the current peers in the group. The current rank will +// put its own worker info in the store and discover all the ranks that came +// before it. NOTE: This needs to be called with the Dynamic RPC group +// membership management token held. +std::unordered_map collectCurrentNames( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName); + +// Remove name frmo Store, used in dynamic RPC groups. +// NOTE: This needs to be called with the Dynamic RPC group +// membership management token held. +void removeCurrentName( + ::c10d::PrefixStore store, + const worker_id_t selfId, + const std::string& selfName); + // This performs a synchronization of all call counts by using store. // All RPC peers wait for others to join to exit at the same time. int syncCallCount( diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index fd6f3aca9485..7b8a2d1f18da 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -110,11 +110,26 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { // c10::hash, so we need to use the qualified name // py::detail::hash, which unfortunately is in a detail namespace. 
.def(py::detail::hash(py::self)) // NOLINT - .def("__repr__", [](const WorkerInfo& workerInfo) { - std::ostringstream os; - os << workerInfo; - return os.str(); - }); + .def( + "__repr__", + [](const WorkerInfo& workerInfo) { + std::ostringstream os; + os << workerInfo; + return os.str(); + }) + .def(py::pickle( + /* __getstate__ */ + [](const WorkerInfo& workerInfo) { + return py::make_tuple(workerInfo.name_, workerInfo.id_); + }, + /* __setstate__ */ + [](py::tuple t) { + TORCH_CHECK(t.size() == 2, "Invalid WorkerInfo state."); + + WorkerInfo info( + t[0].cast(), t[1].cast()); + return info; + })); auto rpcAgent = shared_ptr_class_(module, "RpcAgent") @@ -122,7 +137,8 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { "join", &RpcAgent::join, py::call_guard(), - py::arg("shutdown") = false) + py::arg("shutdown") = false, + py::arg("timeout") = 0) .def( "sync", &RpcAgent::sync, py::call_guard()) .def( @@ -561,7 +577,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { [](const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, - int worldSize, + optional worldSize, TensorPipeRpcBackendOptions opts, std::unordered_map reverseDeviceMaps, std::vector devices) { @@ -588,7 +604,8 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { "join", &TensorPipeAgent::join, py::call_guard(), - py::arg("shutdown") = false) + py::arg("shutdown") = false, + py::arg("timeout") = 0) .def( "shutdown", &TensorPipeAgent::shutdown, @@ -617,7 +634,17 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { "_get_device_map", (DeviceMap(TensorPipeAgent::*)(const WorkerInfo& dst) const) & TensorPipeAgent::getDeviceMap, - py::call_guard()); + py::call_guard()) + .def( + "_get_backend_options", + &TensorPipeAgent::getBackendOptions, + py::call_guard()) + .def( + "_update_group_membership", + &TensorPipeAgent::updateGroupMembership, + py::call_guard()) + .def_readonly("is_static_group", &TensorPipeAgent::isStaticGroup_) + .def_property_readonly("store", &TensorPipeAgent::getStore); #endif // USE_TENSORPIPE diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index e50100e331f4..cd427d4a90ea 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -213,7 +213,7 @@ class TORCH_API RpcAgent { // Call sync and join all internal threads. This method should be called // before every RPC process exits. - virtual void join(bool shutdown = false) = 0; + virtual void join(bool shutdown = false, float timeout = 0) = 0; // Synchronize the this process with other ``RpcAgent`` processes. Block until // all ``RpcAgent``s reach this method and send all pending messages. diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index aaaf3c673f75..7426eb20807a 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -41,7 +41,7 @@ std::vector getDevicesForTensors( const std::string& remoteName) { // If the deviceMap is overridden, use that instead. const auto errStr = c10::str( - "TensorPipe RPC backend only supports CPU tensors by default, please " + "TensorPipe RPC backend only supports CPU and Meta tensors by default, please " "move your tensors to CPU before sending them over RPC, or call " "`set_device_map` on `TensorPipeRpcBackendOptions` to explicitly " "configure device mapping. 
", @@ -51,7 +51,9 @@ std::vector getDevicesForTensors( devices.reserve(tensors.size()); bool hasMappedDevice = false; for (const auto& t : tensors) { - if (t.device().is_cpu()) { + if (t.device().is_meta()) { + devices.emplace_back(c10::kMeta); + } else if (t.device().is_cpu()) { const auto deviceIter = deviceMap.find(c10::kCPU); if (deviceIter == deviceMap.end()) { devices.emplace_back(c10::kCPU); @@ -113,7 +115,7 @@ std::vector getDevicesOfTensors( size_t deviceCount = 0; std::vector indexBitset; for (const torch::Tensor& tensor : tensors) { - if (!tensor.is_cpu()) { + if (!tensor.is_cpu() && !tensor.is_meta()) { c10::Device device = tensor.device(); if (!impl.has_value()) { impl.emplace(device.type()); @@ -342,9 +344,15 @@ void TensorPipeAgent::removeFromTimeoutMap(uint64_t messageId) { } } -void TensorPipeAgent::prepareNames() { - auto nameToId = collectNames( - rankToNameStore_, workerInfo_.id_, workerInfo_.name_, worldSize_); +void TensorPipeAgent::prepareNames(bool isStaticGroup) { + std::unordered_map nameToId; + if (isStaticGroup) { + nameToId = collectNames( + rankToNameStore_, workerInfo_.id_, workerInfo_.name_, worldSize_); + } else { + nameToId = collectCurrentNames( + rankToNameStore_, workerInfo_.id_, workerInfo_.name_); + } for (const auto& entry : nameToId) { const auto& workerName = entry.first; @@ -354,11 +362,35 @@ void TensorPipeAgent::prepareNames() { } } +void TensorPipeAgent::checkAndSetStaticGroup( + const c10::intrusive_ptr<::c10d::Store>& store) { + std::string isStaticGroupKey("rpcIsStaticGroup"); + + std::string isStaticGroupStr = isStaticGroup_ ? "true" : "false"; + std::vector isStaticGroupVec( + (uint8_t*)isStaticGroupStr.c_str(), + (uint8_t*)isStaticGroupStr.c_str() + isStaticGroupStr.length()); + std::vector returnedVec; + returnedVec = store->compareSet( + isStaticGroupKey, std::vector(), isStaticGroupVec); + std::string returnedVal = std::string(returnedVec.begin(), returnedVec.end()); + // In both cases, the returned value should be the value of isStaticGroupStr, + // otherwise there is a discrepency with initialization among one of the + // members + TORCH_CHECK( + returnedVal == isStaticGroupStr, + fmt::format( + "RPC group mixes statically and dynamically initialized members which is not supported. 
", + "Static group property is initialized as {} and is trying to be set as {} ", + isStaticGroup_, + returnedVal)); +} + TensorPipeAgent::TensorPipeAgent( const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, - int worldSize, + optional worldSize, TensorPipeRpcBackendOptions opts, std::unordered_map reverseDeviceMaps, std::vector devices, @@ -368,6 +400,8 @@ TensorPipeAgent::TensorPipeAgent( std::move(cb), std::chrono::milliseconds( (long)(opts.rpcTimeoutSeconds * kSecToMsConversion))), + isStaticGroup_(worldSize.has_value()), + store_(store), opts_(std::move(opts)), reverseDeviceMaps_(std::move(reverseDeviceMaps)), devices_(std::move(devices)), @@ -376,10 +410,16 @@ TensorPipeAgent::TensorPipeAgent( tensorpipe::ContextOptions().name(workerInfo_.name_))), rankToNameStore_("names", store), nameToAddressStore_("addrs", store), - shutdownStore_("shutdown", store), - worldSize_(worldSize) { + shutdownStore_("shutdown", store) { + if (isStaticGroup_) { + worldSize_ = worldSize.value(); + } + + // check the static group attribute against store + checkAndSetStaticGroup(store); + // collect worker names - prepareNames(); + prepareNames(isStaticGroup_); // Initialize the time-series metrics tracking map timeSeriesMetrics_.emplace(kGilAverageWaitTime, TimeSeriesMetricsTracker()); @@ -524,7 +564,11 @@ void TensorPipeAgent::pipeRead( return; } - std::vector streams = getStreamsFromPoolForDevices(devices_); + std::vector streams; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + streams = getStreamsFromPoolForDevices(devices_); + } tensorpipe::Allocation tpAllocation; TensorpipeReadBuffers tpBuffers; std::tie(tpAllocation, tpBuffers) = @@ -604,24 +648,26 @@ void TensorPipeAgent::sendCompletedResponseMessage( for (const auto& tensor : responseMessage->tensors()) { const auto device = tensor.device(); - if (!device.is_cpu() && - std::find(devices_.begin(), devices_.end(), device) == - devices_.end()) { - std::ostringstream oss; - std::copy( - devices_.begin(), - devices_.end(), - std::ostream_iterator(oss, ", ")); - responseMessage = createExceptionResponse( - c10::str( - "RPC detected that a user-function output tensor on device ", - device, - ". This device is not one of the input tensor devices: ", - oss.str(), - "which is not yet supported. Please file a feature request " - "issue in PyTorch GitHub repo."), - messageId); - break; + if (!device.is_cpu() && !device.is_meta()) { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + if (std::find(devices_.begin(), devices_.end(), device) == + devices_.end()) { + std::ostringstream oss; + std::copy( + devices_.begin(), + devices_.end(), + std::ostream_iterator(oss, ", ")); + responseMessage = createExceptionResponse( + c10::str( + "RPC detected that a user-function output tensor on device ", + device, + ". This device is not one of the input tensor devices: ", + oss.str(), + "which is not yet supported. 
Please file a feature request " + "issue in PyTorch GitHub repo."), + messageId); + break; + } } } @@ -784,7 +830,12 @@ c10::intrusive_ptr TensorPipeAgent::send( } ClientPipe& clientPipe = it->second; - auto futureResponseMessage = std::make_shared(devices_); + std::shared_ptr + futureResponseMessage; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + futureResponseMessage = std::make_shared(devices_); + } uint64_t messageId = nextMessageID_++; requestMessage->setId(messageId); @@ -844,7 +895,11 @@ c10::intrusive_ptr TensorPipeAgent::send( VLOG(1) << "RPC agent for " << workerInfo_.name_ << " is sending request #" << messageId << " to " << clientPipe.pipe_->getRemoteName(); - std::vector streams = getStreamsFromPoolForDevices(devices_); + std::vector streams; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + streams = getStreamsFromPoolForDevices(devices_); + } makeStreamsWaitOnOthers( streams, getCurrentStreamsForDevices( @@ -1011,9 +1066,27 @@ void TensorPipeAgent::pollTimeoutRpcs() { } } +void TensorPipeAgent::leaveGroup() { + std::unique_lock lock(callCountMutex_); + // local worker ActiveCallCount is 0 at this point and we will shutdown + // (any future calls will be dropped) + callCountCV_.wait(lock, [this] { return clientActiveCalls_ == 0; }); + + // Remove this agent's WorkerInfo from store + removeCurrentName(rankToNameStore_, workerInfo_.id_, workerInfo_.name_); + + // Set internal variable to be used during destructor + shuttingDown_ = true; +} + // TODO: Remove join() -void TensorPipeAgent::join(bool shutdown) { +void TensorPipeAgent::join(bool shutdown, float /* unused */) { VLOG(1) << "RPC agent for " << workerInfo_.name_ << " is joining"; + if (!isStaticGroup_) { + leaveGroup(); + return; + } + // This method behaves like a barrier, as it can only return once all workers // have no more requests pending, including "nested" requests (triggered from // within the remote code of another call) and "follow-up" requests (triggered @@ -1024,6 +1097,7 @@ void TensorPipeAgent::join(bool shutdown) { // It is enough to wait for there to be no more active client calls, since // each server call corresponds to a client call for some other worker. callCountCV_.wait(lock, [this] { return clientActiveCalls_ == 0; }); + // We'd like to immediately proceed with the allreduce, but it's a call // that may block for some time, as it waits for other workers to also // complete all their active client calls. 
While we call allreduce we must @@ -1096,16 +1170,34 @@ void TensorPipeAgent::shutdownImpl() { const WorkerInfo& TensorPipeAgent::getWorkerInfo( const std::string& workerName) const { - const auto& it = workerNameToInfo_.find(workerName); + std::unordered_map::const_iterator it; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + it = workerNameToInfo_.find(workerName); + } TORCH_CHECK( - it != workerNameToInfo_.end(), "Unknown destination worker ", workerName); + it != workerNameToInfo_.end(), + fmt::format( + "name:{},rank:{} could not find destination name {}", + workerInfo_.name_, + workerInfo_.id_, + workerName)); return it->second; } const WorkerInfo& TensorPipeAgent::getWorkerInfo(worker_id_t workerId) const { - const auto& it = workerIdToInfo_.find(workerId); + std::unordered_map::const_iterator it; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + it = workerIdToInfo_.find(workerId); + } TORCH_CHECK( - it != workerIdToInfo_.end(), "Unknown destination worker ", workerId); + it != workerIdToInfo_.end(), + fmt::format( + "name:{},rank:{} could not find destination id {}", + workerInfo_.name_, + workerInfo_.id_, + workerId)); return it->second; } @@ -1119,12 +1211,74 @@ std::vector TensorPipeAgent::getWorkerInfos() const { const std::string& TensorPipeAgent::findWorkerURL( const WorkerInfo& worker) const { - const auto it = workerNameToURL_.find(worker.name_); + std::unordered_map::const_iterator it; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + it = workerNameToURL_.find(worker.name_); + } TORCH_CHECK( - it != workerNameToURL_.end(), "Unknown worker name: ", worker.name_); + it != workerNameToURL_.end(), + fmt::format( + "name:{},rank:{} could not find destination url for name {}", + workerInfo_.name_, + workerInfo_.id_, + worker.name_)); return it->second; } +void TensorPipeAgent::updateGroupMembership( + const WorkerInfo& workerInfo, + const std::vector devices, + const std::unordered_map reverseDeviceMaps, + bool isJoin) { + std::string name = workerInfo.name_; + worker_id_t id = workerInfo.id_; + // Rank with workerInfo is joining the group, update internal mappings + if (isJoin) { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + workerIdToInfo_.emplace(id, workerInfo); + workerNameToInfo_.emplace(name, workerInfo); + + // TODO: we should get nodeAddrStr in the joining process, then pass in as + // an argument rather than getting from store each time + auto nodeAddrData = nameToAddressStore_.get(name); + auto nodeAddrStr = + std::string((const char*)nodeAddrData.data(), nodeAddrData.size()); + workerNameToURL_.insert({name, nodeAddrStr}); + + for (const auto& it : reverseDeviceMaps) { + if (reverseDeviceMaps_.find(it.first) == reverseDeviceMaps_.end()) { + reverseDeviceMaps_[it.first] = it.second; + } + } + // TODO: clean up mutex for devices_ usage + // Add devices that have not been added yet + for (const auto& it : devices) { + if (std::find(devices_.begin(), devices_.end(), it) == devices_.end()) { + devices_.push_back(it); + } + } + } else { + workerIdToInfo_.erase(id); + workerNameToInfo_.erase(name); + workerNameToURL_.erase(name); + + for (const auto& it : reverseDeviceMaps_) { + if (reverseDeviceMaps.find(it.first) == reverseDeviceMaps.end()) { + reverseDeviceMaps_.erase(it.first); + } + } + + auto iter = devices_.begin(); + while (iter != devices_.end()) { + if (std::find(devices.begin(), devices.end(), *iter) == devices.end()) { + iter = 
devices_.erase(iter); + } else { + iter++; + } + } + } +} std::unordered_map TensorPipeAgent::getMetrics() { std::unordered_map metrics; metrics[kThreadPoolSize] = c10::to_string(threadPool_.size()); @@ -1252,11 +1406,14 @@ void TensorPipeAgent::markFutureWithError( std::vector TensorPipeAgent::getDevicesForRemote( const std::string& remoteName, const Message& message) const { - const auto& deviceMaps = - message.isRequest() ? opts_.deviceMaps : reverseDeviceMaps_; + std::unordered_map deviceMaps; + { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); + deviceMaps = message.isRequest() ? opts_.deviceMaps : reverseDeviceMaps_; + } const auto errStr = c10::str( - "TensorPipe RPC backend only supports CPU tensors by default, please " + "TensorPipe RPC backend only supports CPU and Meta tensors by default, please " "move your tensors to CPU before sending them over RPC, or call " "`set_device_map` on `TensorPipeRpcBackendOptions` to explicitly " "configure device mapping. ", @@ -1268,7 +1425,7 @@ std::vector TensorPipeAgent::getDevicesForRemote( if (iter == deviceMaps.end()) { for (const auto& t : message.tensors()) { TORCH_CHECK( - t.device().is_cpu(), + t.device().is_cpu() || t.device().is_meta(), errStr, ", but found tensor on device: ", t.device()); @@ -1287,7 +1444,16 @@ DeviceMap TensorPipeAgent::getDeviceMap(const WorkerInfo& dst) const { return it->second; } +const c10::intrusive_ptr<::c10d::Store> TensorPipeAgent::getStore() const { + return store_; +} + +TensorPipeRpcBackendOptions TensorPipeAgent::getBackendOptions() const { + return opts_; +} + const std::vector& TensorPipeAgent::getDevices() const { + GroupMembershipLockGuard guard(groupMembershipMutex_, isStaticGroup_); return devices_; } diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h index b76e1a099beb..2ad3ef6a0d75 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h @@ -165,7 +165,7 @@ class TORCH_API TensorPipeAgent : public RpcAgent { const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, - int worldSize, + optional worldSize, TensorPipeRpcBackendOptions opts, std::unordered_map reverseDeviceMaps, std::vector devices, @@ -182,7 +182,7 @@ class TORCH_API TensorPipeAgent : public RpcAgent { // join() and sync() would be deprecated - // https://github.com/pytorch/pytorch/issues/27647 - void join(bool shutdown = false) override; + void join(bool shutdown = false, float timeout = 0) override; void sync() override{}; void startImpl() override; void shutdownImpl() override; @@ -192,11 +192,20 @@ class TORCH_API TensorPipeAgent : public RpcAgent { const WorkerInfo& getWorkerInfo(const std::string& workerName) const override; const WorkerInfo& getWorkerInfo(worker_id_t workerId) const override; std::vector getWorkerInfos() const override; + void updateGroupMembership( + const WorkerInfo& workerInfo, + const std::vector devices, + const std::unordered_map reverseDeviceMaps, + bool isJoin); std::unordered_map getMetrics() override; void addGilWaitTime(const std::chrono::microseconds gilWaitTime) override; + TensorPipeRpcBackendOptions getBackendOptions() const; + + const c10::intrusive_ptr<::c10d::Store> getStore() const; + DeviceMap getDeviceMap(const WorkerInfo& dest) const override; const std::vector& getDevices() const override; @@ -216,6 +225,8 @@ class TORCH_API TensorPipeAgent : public RpcAgent { size_t numPendingResponses(); size_t 
messageIdToTimeoutMapSize(); + const bool isStaticGroup_; + protected: // TensorPipe write function that could be used to write response // messages by server, and write request messages by client. This @@ -233,10 +244,16 @@ class TORCH_API TensorPipeAgent : public RpcAgent { void removeFromTimeoutMap(uint64_t messageId); // Populates workerIdToInfo_ and workerNameToInfo_ using addressStore_ - void prepareNames(); + void prepareNames(bool isStaticGroup); + + // Check the static group attribute with the value set in store + void checkAndSetStaticGroup(const c10::intrusive_ptr<::c10d::Store>& store); const std::string& findWorkerURL(const WorkerInfo& worker) const; + // Only use for Dynamic RPC groups, method to have worker leave group + void leaveGroup(); + // TensorPipe read function that could be used to read response messages // by client, and read request messages by server. void pipeRead( @@ -307,12 +324,16 @@ class TORCH_API TensorPipeAgent : public RpcAgent { pendingResponseMessage_; }; + const c10::intrusive_ptr<::c10d::Store> store_; + const TensorPipeRpcBackendOptions opts_; - const std::unordered_map reverseDeviceMaps_; + // For dynamic RPC, the reverse device maps are updated whenever a new rank + // joins or leaves the group + std::unordered_map reverseDeviceMaps_; // Local devices used by this agent. If application didn't specify this // field, it will be initialized using corresponding local devices in // opts_.deviceMaps and reverseDeviceMaps_; - const std::vector devices_; + std::vector devices_; ThreadPool threadPool_; std::shared_ptr context_; @@ -331,8 +352,7 @@ class TORCH_API TensorPipeAgent : public RpcAgent { // Store keys that will used to count joined processes and active calls during // the shutdown process ::c10d::PrefixStore shutdownStore_; - const int worldSize_; - + int worldSize_ = 0; std::atomic nextMessageID_{0}; // Metadata used for tracking of whether certain RPCs have timed out or not. @@ -410,6 +430,31 @@ class TORCH_API TensorPipeAgent : public RpcAgent { // Mutex to guard timeSeriesMetrics_ std::mutex metricsMutex_; + // Custom lock guard used to check if the RPC group is dynamic and lock the + // mutex if so + struct GroupMembershipLockGuard { + GroupMembershipLockGuard(std::mutex& mutex, bool isStaticGroup) + : ref_(mutex), isStaticGroup_(isStaticGroup) { + if (isStaticGroup_) { + ref_.lock(); + } + } + + ~GroupMembershipLockGuard() { + if (isStaticGroup_) { + ref_.unlock(); + } + } + + private: + GroupMembershipLockGuard(const GroupMembershipLockGuard&); + std::mutex& ref_; + bool isStaticGroup_; + }; + // Mutex to guard access to group membership data + // e.g. updates to (workerIdToInfo_, workerNameToInfo_, workerNameToURL_) + mutable std::mutex groupMembershipMutex_; + // Map to Track Network Data NetworkDataDict networkData_; // Mutex to guard networkData_ diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index ee66f3108e52..e59ba06044d4 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -163,12 +163,20 @@ std::tuple tensorpipeSerialize( buffers.tensors = cloneSparseTensors(rpcMessage->tensors()).vec(); } + // The corresponding unpickler in `tensorpipeDeserialize` uses tensor id as an index in `buffers.tensors` + // (see tensorReadFunc). 
Meta tensors don't have data and are not present in buffers.tensors, so to skip + // meta tensors `non_meta_idx` is used to generate consecutive indices for non-meta tensors in `buffers.tensors`. + // The `meta_idx` is used to generate unique ids for the remaining meta tensors, but they are unused. + int non_meta_idx = 0; + int meta_idx = std::count_if(buffers.tensors.begin(), buffers.tensors.end(), [](auto& t) { return !t.is_meta(); }); torch::jit::Pickler pickler([&](const void* buf, size_t sz) -> size_t { buffers.pickle.insert( buffers.pickle.end(), static_cast(buf), static_cast(buf) + sz); return sz; + }, nullptr, nullptr, nullptr, [&](const at::Tensor& t) -> std::string { + return std::to_string(!t.is_meta() ? non_meta_idx++ : meta_idx++); }); pickler.protocol(); pickler.pushIValue(buffers.tensors); @@ -177,10 +185,19 @@ std::tuple tensorpipeSerialize( tpMessage.payloads.push_back(tensorpipe::Message::Payload{ buffers.pickle.data(), buffers.pickle.size()}); const std::vector& tensorDataVec = pickler.tensorData(); - tpMessage.tensors.reserve(tensorDataVec.size()); + // meta tensors don't have data and are not serialized to tpMessage.tensors + int nonMetaTensorsSize = std::count_if(tensorDataVec.begin(), tensorDataVec.end(), + [](auto& t) { return !t.is_meta(); }); + tpMessage.tensors.reserve(nonMetaTensorsSize); + int metaTensorsCounter = 0; for (const auto i : c10::irange(tensorDataVec.size())) { const torch::Tensor& tensor = tensorDataVec[i]; + if (tensor.is_meta()) { + metaTensorsCounter++; + continue; + } + const TensorpipeDeviceTypeConverter* converter = getDeviceTypeConverter(tensor.device().type()); TORCH_CHECK( @@ -188,11 +205,11 @@ std::tuple tensorpipeSerialize( "Attempting to send a Tensor with unexpected device type ", tensor.device()); - TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i); + TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i - metaTensorsCounter); c10::optional> maybeCopiedTensor = converter->prepareTensorForSending( tensor.storage(), streams, tpMessage); - TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i + 1); + TORCH_INTERNAL_ASSERT(tpMessage.tensors.size() == i + 1 - metaTensorsCounter); tensorpipe::Device targetDevice = devices.empty() || devices[i].is_cpu() ? 
tensorpipe::Device{tensorpipe::kCpuDeviceType, 0} @@ -311,8 +328,13 @@ c10::intrusive_ptr tensorpipeDeserialize( tensors.emplace_back(std::move(t)); } - for (const auto i : c10::irange(tpDescriptor.tensors.size())) { - auto& tensor = tpDescriptor.tensors[i]; + int metaTensorsCounter = 0; + for (const auto i : c10::irange(tensors.size())) { + if (tensors[i].is_meta()) { + metaTensorsCounter++; + continue; + } + auto& tensor = tpDescriptor.tensors[i - metaTensorsCounter]; if (tensor.targetDevice.has_value() && tensor.targetDevice->type == tensorpipe::kCudaDeviceType) { TORCH_INTERNAL_ASSERT( diff --git a/torch/csrc/distributed/rpc/testing/init.cpp b/torch/csrc/distributed/rpc/testing/init.cpp index ae40f0897ce0..fc2dc156f7d5 100644 --- a/torch/csrc/distributed/rpc/testing/init.cpp +++ b/torch/csrc/distributed/rpc/testing/init.cpp @@ -98,7 +98,8 @@ PyObject* faulty_agent_init(PyObject* _unused, PyObject* noargs) { "join", &TensorPipeAgent::join, py::call_guard(), - py::arg("shutdown") = false) + py::arg("shutdown") = false, + py::arg("timeout") = 0) .def( "shutdown", &TensorPipeAgent::shutdown, diff --git a/torch/csrc/distributed/rpc/torchscript_functions.cpp b/torch/csrc/distributed/rpc/torchscript_functions.cpp index 464a290de1dc..8afbc8135914 100644 --- a/torch/csrc/distributed/rpc/torchscript_functions.cpp +++ b/torch/csrc/distributed/rpc/torchscript_functions.cpp @@ -21,10 +21,7 @@ c10::intrusive_ptr rpcTorchscript( std::vector& stack, const float rpcTimeoutSeconds, const bool isAsyncExecution) { - // This dummy tensor holds an at::RecordFunction when profiling is enabled. - // This is because at::RecordFunction is not yet registered as a TorchScript - // custom class (https://github.com/pytorch/pytorch/issues/35026) - at::Tensor handle = at::zeros(1); + c10::intrusive_ptr record; auto shouldProfile = torch::autograd::profiler::profilerEnabled() && !torch::distributed::rpc::RemoteProfilerManager::getInstance() .isCurrentKeySet(); @@ -35,7 +32,8 @@ c10::intrusive_ptr rpcTorchscript( .qualifiedName(), /* name of torchscript function being run */ RpcAgent::getCurrentRpcAgent()->getWorkerInfo().name_, dstWorkerName); - handle = torch::autograd::profiler::record_function_enter(rpcAsyncJitKey); + record = + torch::autograd::profiler::record_function_enter_new(rpcAsyncJitKey); auto& remoteProfilerManager = torch::distributed::rpc::RemoteProfilerManager::getInstance(); remoteProfilerManager.setCurrentKey(rpcAsyncJitKey); @@ -75,7 +73,8 @@ c10::intrusive_ptr rpcTorchscript( })); if (shouldProfile) { auto profiledFutPtr = - torch::autograd::profiler::_call_end_callbacks_on_fut(handle, futPtr); + torch::autograd::profiler::_call_end_callbacks_on_fut_new( + record, futPtr); return profiledFutPtr; } return futPtr; diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 539c01cad245..4743ba1a8627 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -144,7 +144,7 @@ static PyObject * THPStorage_(get)(THPStorage *self, PyObject *index) int64_t nindex = THPUtils_unpackLong(index); if (nindex < 0) nindex += (self->cdata->nbytes() / sizeof(scalar_t)); - if (nindex < 0 || nindex >= (self->cdata->nbytes() / sizeof(scalar_t))) { + if (nindex < 0 || nindex >= static_cast(self->cdata->nbytes() / sizeof(scalar_t))) { PyErr_SetString(PyExc_IndexError, fmt::format( "index {} out of range for storage of size {}", nindex, self->cdata->nbytes() / sizeof(scalar_t))); @@ -344,7 +344,7 @@ bool THPStorage_(init)(PyObject *module) void 
THPStorage_(postInit)(PyObject *module) { - THPStorageClass = PyObject_GetAttrString(module, "UntypedStorage"); + THPStorageClass = PyObject_GetAttrString(module, "_UntypedStorage"); if (!THPStorageClass) throw python_error(); at::Backend backend = at::Backend::CPU; diff --git a/torch/csrc/generic/StorageSharing.cpp b/torch/csrc/generic/StorageSharing.cpp index 01cd5c49998b..701df7daaa0c 100644 --- a/torch/csrc/generic/StorageSharing.cpp +++ b/torch/csrc/generic/StorageSharing.cpp @@ -282,13 +282,9 @@ static PyObject * THPStorage_(shareCuda)(PyObject *_self, PyObject *noargs) // NOLINTNEXTLINE(cppcoreguidelines-init-variables) cudaIpcEventHandle_t ipc_event_handle; -#if !defined(USE_ROCM) if (sent_data->event_sync_required_) { C10_CUDA_CHECK(cudaIpcGetEventHandle(&ipc_event_handle, sent_data->event_)); } -#else - // ipc_event_handle unused in storage receiver, we can leave it uninitialized. -#endif _event_handle = PyBytes_FromStringAndSize((char *)&ipc_event_handle, CUDA_IPC_HANDLE_SIZE); _event_sync_required = PyBool_FromLong(sent_data->event_sync_required_); @@ -400,7 +396,6 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) int64_t device = THPUtils_unpackLong(_device); at::cuda::CUDAGuard device_guard(device); -#if !defined(USE_ROCM) if (PyObject_IsTrue(_event_sync_required)) { // Ensure that producer prepared all tensor's data std::string s_ipc_event_handle = @@ -413,9 +408,6 @@ static PyObject * THPStorage_(newSharedCuda)(PyObject *_unused, PyObject *args) AT_CUDA_CHECK( cudaStreamWaitEvent(c10::cuda::getCurrentCUDAStream(device), event, 0)); } -#else - // Already synchronized inside producer stream -#endif std::string s_handle = THPStorage_(bytesAsHandleString)(_handle); std::shared_ptr basePtr = c10::cuda::CUDACachingAllocator::getIpcDevPtr(s_handle); diff --git a/torch/csrc/init_flatbuffer_module.cpp b/torch/csrc/init_flatbuffer_module.cpp new file mode 100644 index 000000000000..77bb302423fe --- /dev/null +++ b/torch/csrc/init_flatbuffer_module.cpp @@ -0,0 +1,116 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include // NOLINT +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; + +static std::shared_ptr copyStr(const std::string& bytes) { + size_t size = (bytes.size() / FLATBUFFERS_MAX_ALIGNMENT + 1) * + FLATBUFFERS_MAX_ALIGNMENT; +#ifdef _WIN32 + std::shared_ptr bytes_copy( + static_cast(_aligned_malloc(size, FLATBUFFERS_MAX_ALIGNMENT)), + _aligned_free); +#elif defined(__APPLE__) + void* p; + ::posix_memalign(&p, FLATBUFFERS_MAX_ALIGNMENT, size); + TORCH_INTERNAL_ASSERT(p, "Could not allocate memory for flatbuffer"); + std::shared_ptr bytes_copy(static_cast(p), free); +#else + std::shared_ptr bytes_copy( + static_cast(aligned_alloc(FLATBUFFERS_MAX_ALIGNMENT, size)), free); +#endif + memcpy(bytes_copy.get(), bytes.data(), bytes.size()); + return bytes_copy; +} + +extern "C" +#ifdef _WIN32 + __declspec(dllexport) +#endif + PyObject* initModuleFlatbuffer() { + using namespace torch::jit; + PyMethodDef m[] = {{nullptr, nullptr, 0, nullptr}}; // NOLINT + static struct PyModuleDef torchmodule = { + PyModuleDef_HEAD_INIT, + "torch._C_flatbuffer", + nullptr, + -1, + m, + }; // NOLINT + PyObject* module = PyModule_Create(&torchmodule); + auto pym = py::handle(module).cast(); + pym.def("_load_mobile_module_from_file", [](const std::string& filename) { + return torch::jit::load_mobile_module_from_file(filename); + }); + pym.def("_load_mobile_module_from_bytes", 
[](const std::string& bytes) { + auto bytes_copy = copyStr(bytes); + return torch::jit::parse_and_initialize_mobile_module( + bytes_copy, bytes.size()); + }); + pym.def("_load_jit_module_from_file", [](const std::string& filename) { + ExtraFilesMap extra_files = ExtraFilesMap(); + return torch::jit::load_jit_module_from_file(filename, extra_files); + }); + pym.def("_load_jit_module_from_bytes", [](const std::string& bytes) { + auto bytes_copy = copyStr(bytes); + ExtraFilesMap extra_files = ExtraFilesMap(); + return torch::jit::parse_and_initialize_jit_module( + bytes_copy, bytes.size(), extra_files); + }); + pym.def( + "_save_mobile_module", + [](const torch::jit::mobile::Module& module, + const std::string& filename) { + return torch::jit::save_mobile_module(module, filename); + }); + pym.def( + "_save_jit_module", + [](const torch::jit::Module& module, const std::string& filename) { + return torch::jit::save_jit_module(module, filename); + }); + pym.def( + "_save_mobile_module_to_bytes", + [](const torch::jit::mobile::Module& module) { + auto detached_buffer = torch::jit::save_mobile_module_to_bytes(module); + return py::bytes( + reinterpret_cast(detached_buffer.data()), + detached_buffer.size()); + }); + pym.def("_save_jit_module_to_bytes", [](const torch::jit::Module& module) { + auto detached_buffer = torch::jit::save_jit_module_to_bytes(module); + return py::bytes( + reinterpret_cast(detached_buffer.data()), + detached_buffer.size()); + }); + pym.def("_get_module_info_from_flatbuffer", [](std::string flatbuffer_content) { + py::gil_scoped_acquire acquire; + py::dict result; + mobile::ModuleInfo minfo = torch::jit::get_module_info_from_flatbuffer( + &flatbuffer_content[0]); + result["bytecode_version"] = minfo.bytecode_version; + result["operator_version"] = minfo.operator_version; + result["function_names"] = minfo.function_names; + result["type_names"] = minfo.type_names; + result["opname_to_num_args"] = minfo.opname_to_num_args; + return result; + }); + + return module; +} diff --git a/torch/csrc/jit/api/function_impl.cpp b/torch/csrc/jit/api/function_impl.cpp index 774136f3f455..356a67b9dfe9 100644 --- a/torch/csrc/jit/api/function_impl.cpp +++ b/torch/csrc/jit/api/function_impl.cpp @@ -88,6 +88,9 @@ const c10::FunctionSchema& GraphFunction::getSchema() const { } GraphFunction::SpecializationKey GraphFunction::currentSpecialization() const { + if (force_no_amp_) { + return SpecializationKey::AutocastOff; + } #ifdef C10_MOBILE // disabling autodiff pass for mobile build since autocast APIs don't exist return SpecializationKey::AutocastOff; @@ -105,7 +108,7 @@ GraphFunction::SpecializationKey GraphFunction::currentSpecialization() const { #endif } -void preoptimizeGraph(std::shared_ptr& graph) { +void preoptimizeGraph(std::shared_ptr& graph, bool disable_autocast) { Inline(*graph); // Peephole Optimize cleans up many "is None" checks and creates constant prop @@ -125,7 +128,9 @@ void preoptimizeGraph(std::shared_ptr& graph) { // of the any optimizations // 2. 
AMP transformations would benefit from followup passes's cleanup // - Autocast(graph); + if (!disable_autocast) { + Autocast(graph); + } #endif ConstantPooling(graph); diff --git a/torch/csrc/jit/api/function_impl.h b/torch/csrc/jit/api/function_impl.h index c92e46a352e3..fb68e3e648d0 100644 --- a/torch/csrc/jit/api/function_impl.h +++ b/torch/csrc/jit/api/function_impl.h @@ -13,10 +13,14 @@ struct TORCH_API GraphFunction : public Function { GraphFunction( c10::QualifiedName name, std::shared_ptr graph, - std::function function_creator) + std::function function_creator, + c10::optional executor_execution_mode = + c10::nullopt) : name_(std::move(name)), graph_(std::move(graph)), - function_creator_(std::move(function_creator)) {} + function_creator_(std::move(function_creator)) { + executor_execution_mode_ = executor_execution_mode; + } bool isGraphFunction() const override { return true; @@ -44,7 +48,7 @@ struct TORCH_API GraphFunction : public Function { } optimized_graph = graph_->copy(); if (getGraphExecutorOptimize()) { - preoptimizeGraph(*optimized_graph); + preoptimizeGraph(*optimized_graph, force_no_amp_); } return *optimized_graph; } @@ -53,6 +57,19 @@ struct TORCH_API GraphFunction : public Function { return name_; } + // private/unstable api. sets the initial execution mode + // will not affect executor if there is an existing executor + // created for this function + void _set_initial_executor_execution_mode(ExecutorExecutionMode mode) { + executor_execution_mode_ = mode; + } + // private/unstable api. sets flag of whether or not to ignore amp. + // will not affect executor if there is an existing executor + // created for this function + void _set_ignore_amp(bool ignore_amp) { + force_no_amp_ = ignore_amp; + } + // if this isn't yet defined, run its method_creator function void ensure_defined() override; @@ -92,14 +109,20 @@ struct TORCH_API GraphFunction : public Function { return *executor; } check_single_output(); - executor = GraphExecutor(optimized_graph(), name_.name()); + const std::string& name = name_.name(); + std::shared_ptr opt_graph = optimized_graph(); + if (!executor_execution_mode_) { + executor = GraphExecutor(opt_graph, name); + } else { + executor = GraphExecutor(opt_graph, name, *executor_execution_mode_); + } return *executor; } using Function::call; bool call( Stack& stack, - size_t bailOut, + c10::optional bailOut, c10::function_ref f) override { f(get_executor().getPlanFor(stack, bailOut).code); return true; @@ -128,6 +151,13 @@ struct TORCH_API GraphFunction : public Function { // The original, non-optimized graph std::shared_ptr graph_; // for debugging and for inlining + // allows users to specify Simple/Profiling Executor for function + // TODO: add more executors + mutable c10::optional executor_execution_mode_; + + // if invoked on a graph that has already traced through amp + // don't invoke amp pass + mutable bool force_no_amp_ = false; // Optimized graph, computed lazily. Used for inlining. 
mutable std::array< c10::optional>, diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index c2506c6a9ecb..a6aa49278cbe 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -223,12 +223,14 @@ struct TORCH_API Module : public Object { void _save_for_mobile( std::ostream& out, const ExtraFilesMap& extra_files = ExtraFilesMap(), - bool save_mobile_debug_info = false) const; + bool save_mobile_debug_info = false, + bool use_flatbuffer = false) const; void _save_for_mobile( const std::string& filename, const ExtraFilesMap& extra_files = ExtraFilesMap(), - bool save_mobile_debug_info = false) const; + bool save_mobile_debug_info = false, + bool use_flatbuffer = false) const; Module copy() const; @@ -265,6 +267,10 @@ struct TORCH_API Module : public Object { return _ivalue() == y._ivalue(); } + void set_delete_memory(std::shared_ptr delete_mem) { + mem_to_delete_ = delete_mem; + } + private: Module clone_impl( std::unordered_map& type_remap, @@ -286,6 +292,9 @@ struct TORCH_API Module : public Object { const c10::optional& device, const c10::optional& dtype, bool non_blocking); + + // Extra handle for the module to delete when itself is deleted + std::shared_ptr mem_to_delete_; }; // C++ equivalent api of `torch.jit.freeze`. See documentation there for @@ -301,6 +310,45 @@ TORCH_API Module optimize_for_inference( Module& module, const std::vector& other_methods = {}); +enum class FusionBehavior { STATIC, DYNAMIC }; + +using FusionStrategy = std::vector>; +// clang-format off +/* +Sets the type and number of specializations that can occur during fusion. + +Usage: provide a list of pairs (type, depth) where type is one of STATIC or DYNAMIC +and depth is an integer. + +Behavior - static vs dynamic: + In STATIC fusion, fused ops are compiled to have fixed input shapes. The shape is determined + based on some initial profiling runs. + In DYNAMIC fusion, fused ops are compiled to have variable input shapes, so that multiple + shapes are possible. + +In both cases, we also recompile on new striding behavior, device, or dtype. + +Behavior - fallback functions & depth: + When an input doesn't match the format required by the specialized compiled op, it will run + a fallback function. Fallback functions are recursively be compiled and specialized based + on the observed tensor shapes. Since compilation can be slow, the "depth" parameter is provided to + limit the number of specializations that can be compiled, before giving up on recompiling and + falling back to a completely un-fused, un-specialized implementation. + +The list of (type, depth) pairs controls the type of specializations and the number of +specializations. For example: [(STATIC, 2), (DYNAMIC, 2)] indicates that the first +two specializations will use static fusions, the following two specializations will use +dynamic fusion, and any inputs that satisfy none of the 4 options will run an +unfused implementation. + +NB: in the future, if more as more fusion backends are added there may be more granular +apis for specific fusers. 
+*/ +// clang-format on +TORCH_API FusionStrategy getFusionStrategy(); +// returns previous strategy +TORCH_API FusionStrategy setFusionStrategy(FusionStrategy& fusion_strategy); + namespace detail { struct TORCH_API SlotCursor { diff --git a/torch/csrc/jit/api/module_save.cpp b/torch/csrc/jit/api/module_save.cpp index c8afa5efaf35..912c38612c35 100644 --- a/torch/csrc/jit/api/module_save.cpp +++ b/torch/csrc/jit/api/module_save.cpp @@ -16,25 +16,29 @@ void Module::save(const std::string& filename, const ExtraFilesMap& extra_files) void Module::_save_for_mobile( std::ostream& out, const ExtraFilesMap& extra_files, - bool save_mobile_debug_info) const { + bool save_mobile_debug_info, + bool use_flatbuffer) const { ExportModule( *this, out, extra_files, true /* bytecode_format */, - save_mobile_debug_info); + save_mobile_debug_info, + use_flatbuffer); } void Module::_save_for_mobile( const std::string& filename, const ExtraFilesMap& extra_files, - bool save_mobile_debug_info) const { + bool save_mobile_debug_info, + bool use_flatbuffer) const { ExportModule( *this, filename, extra_files, true /* bytecode_format */, - save_mobile_debug_info); + save_mobile_debug_info, + use_flatbuffer); } } // namespace jit diff --git a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm index 8492e1608b21..e395326e28ca 100644 --- a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm +++ b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm @@ -136,7 +136,7 @@ struct API_AVAILABLE(ios(11.0), macos(10.13)) CoreMLExecutorWrapper inputs_(inputs), outputs_(outputs), config_(config) {} - c10::List execute(c10::impl::GenericList inputs) { + c10::List execute(const c10::impl::GenericList& inputs) { std::vector inputSpecs; std::vector outputSpecs; int inputSpecIndex = 0; @@ -144,7 +144,7 @@ struct API_AVAILABLE(ios(11.0), macos(10.13)) CoreMLExecutorWrapper for (int i = 0; i < inputs.size(); ++i) { auto val = inputs.get(i); if (val.isTuple()) { - auto tuples = val.toTupleRef().elements(); + auto& tuples = val.toTupleRef().elements(); for (auto& ival : tuples) { TORCH_CHECK(ival.isTensor()); auto tensor = ival.toTensor(); diff --git a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm index ab79bbbd8995..fbb7abe87b52 100644 --- a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm +++ b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLExecutor.mm @@ -7,10 +7,31 @@ #import #endif +// Observer +#import + #include #include #include +// This is a utility macro that can be used to throw an exception when a CoreML +// API function produces a NSError. The exception will contain a message with +// useful info extracted from the NSError. 
+#define COREML_THROW_IF_ERROR(error, preamble) \ + do { \ + if C10_LIKELY(error) { \ + throw c10::Error( \ + {__func__, __FILE__, static_cast(__LINE__)}, \ + c10::str( \ + preamble, \ + " Error details: ", \ + " Localized_description: ", error.localizedDescription.UTF8String, \ + " Domain: ", error.domain.UTF8String, \ + " Code: ", error.code, \ + " User Info: ", error.userInfo.description.UTF8String)); \ + } \ + } while (false) + @implementation PTMCoreMLFeatureProvider { NSUInteger _coremlVersion; std::vector _specs; @@ -68,6 +89,14 @@ @implementation PTMCoreMLExecutor { MLModel* _mlModel; NSURL* _modelPath; NSURL* _compiledModelPath; + + int32_t _model_load_id; + int32_t _inferences; + + int32_t _sample_thresh; + int32_t _sample_every; + + size_t _init_mem_limit; } + (void)setModelCacheDirectory:(NSString*)dir { @@ -110,6 +139,24 @@ - (BOOL)compileMLModel:(const std::string&)modelSpecs [self _saveModel:modelSpecs]; NSError* error = nil; _compiledModelPath = [self _compiledModelFilePath:_modelPath.path]; + + // Get observer and create an instance key + PTMCoreMLObserver* observer = coreMLObserverConfig().getCoreMLObserver(); + int32_t instance_key = std::rand(); + _model_load_id = std::rand(); + _inferences = 0; + + _init_mem_limit = 0; + + _sample_thresh = + static_cast(1.0 / 1000.0 * static_cast(RAND_MAX)); + _sample_every = 500; + + if (observer) { + _init_mem_limit = observer->getRemainingMemory(); + observer->onEnterCompileModel(instance_key, _model_load_id); + } + // Compile the model when OS version changes if ([self _shouldRecompileModel]) { if (@available(iOS 11.0, macOS 10.13, *)) { @@ -128,17 +175,24 @@ - (BOOL)compileMLModel:(const std::string&)modelSpecs } } } else { + // Always log on failure + if (observer) { + observer->onExitCompileModel(instance_key, false, true); + } TORCH_CHECK(false, "CoreML is not available on your deivce"); } } if (error) { + // Always log on failure + if (observer) { + observer->onExitCompileModel(instance_key, false, true); + } + // remove cached models if compalition failed. 
[self cleanup]; - TORCH_CHECK( - false, - "Error compiling the MLModel", - [error localizedDescription].UTF8String); + + COREML_THROW_IF_ERROR(error, "Error compiling the MLModel file!"); return NO; } if (@available(iOS 12.0, macOS 10.14, *)) { @@ -158,40 +212,72 @@ - (BOOL)compileMLModel:(const std::string&)modelSpecs _mlModel = [MLModel modelWithContentsOfURL:_compiledModelPath error:&error]; } if (error || !_mlModel) { - TORCH_CHECK( - false, - "Error loading the MLModel", - error.localizedDescription.UTF8String); + // Always log on failure + if (observer) { + observer->onExitCompileModel(instance_key, false, true); + } + + COREML_THROW_IF_ERROR(error, "Error loading the MLModel file!"); + } + + if (observer) { + bool should_log = _model_load_id < _sample_thresh; + observer->onExitCompileModel(instance_key, true, should_log); } + return YES; } - (id)forwardWithInputs: (const std::vector&)inputs { - NSError* error = nil; - PTMCoreMLFeatureProvider* inputFeature = [[PTMCoreMLFeatureProvider alloc] - initWithFeatureSpecs:inputs - CoreMLVersion:self.coreMLVersion]; - if (inputFeature == nil) { - return nil; - } - if (@available(iOS 11.0, macOS 10.13, *)) { - MLPredictionOptions* options = [[MLPredictionOptions alloc] init]; - id outputFeature = - [_mlModel predictionFromFeatures:inputFeature - options:options - error:&error]; - if (error) { - TORCH_CHECK( - false, - "Error running the prediction", - error.localizedDescription.UTF8String); + @autoreleasepool { + // Get observer and create an instance key + PTMCoreMLObserver* observer = coreMLObserverConfig().getCoreMLObserver(); + int32_t instance_key = std::rand(); + + if (observer) { + observer->onEnterExecuteModel( + instance_key, _model_load_id, _init_mem_limit, _inferences); } - return outputFeature; - } else { - TORCH_CHECK(false, "Core ML is not available on your device"); - return nil; + NSError* error = nil; + PTMCoreMLFeatureProvider* inputFeature = [[PTMCoreMLFeatureProvider alloc] + initWithFeatureSpecs:inputs + CoreMLVersion:self.coreMLVersion]; + if (inputFeature == nil) { + return nil; + } + if (@available(iOS 11.0, macOS 10.13, *)) { + MLPredictionOptions* options = [[MLPredictionOptions alloc] init]; + id outputFeature = + [_mlModel predictionFromFeatures:inputFeature + options:options + error:&error]; + + COREML_THROW_IF_ERROR(error, "Error running CoreML inference!"); + + ++_inferences; + if (observer) { + // Check if this inference session is being logged. 
+ // If so, only log every N inferences + bool should_log = _model_load_id < _sample_thresh && _inferences > 1; + if (should_log) { + should_log = _inferences % _sample_every == 0; + } + observer->onExitExecuteModel( + instance_key, _inferences, true, should_log); + } + + return outputFeature; + } else { + // Always log on failure + if (observer) { + observer->onExitExecuteModel(instance_key, _inferences, true, true); + } + + TORCH_CHECK(false, "Core ML is not available on your device"); + return nil; + } } } diff --git a/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.h b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.h new file mode 100644 index 000000000000..57d11527ac9c --- /dev/null +++ b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.h @@ -0,0 +1,47 @@ +#include + +class PTMCoreMLObserver { + public: + virtual ~PTMCoreMLObserver() = default; + + virtual size_t getRemainingMemory() { + return 0; + } + + virtual void onEnterCompileModel(const int32_t, const int32_t) {} + virtual void onExitCompileModel(const int32_t, bool, bool) {} + + virtual void onEnterExecuteModel( + const int32_t, + const int32_t, + const size_t, + const int32_t) {} + virtual void onExitExecuteModel(const int32_t, const int32_t, bool, bool) {} +}; + +class PTMCoreMLObserverConfig { + public: + PTMCoreMLObserverConfig(); + + // Do not allow copying/moving. + // There should be only one global instance of this class. + PTMCoreMLObserverConfig(const PTMCoreMLObserverConfig&) = delete; + PTMCoreMLObserverConfig& operator=(const PTMCoreMLObserverConfig&) = delete; + + PTMCoreMLObserverConfig(PTMCoreMLObserverConfig&&) = delete; + PTMCoreMLObserverConfig& operator=(PTMCoreMLObserverConfig&&) = delete; + + private: + std::unique_ptr observer_; + + public: + void setCoreMLObserver(std::unique_ptr observer) { + observer_ = std::move(observer); + } + + PTMCoreMLObserver* getCoreMLObserver() { + return observer_.get(); + } +}; + +PTMCoreMLObserverConfig& coreMLObserverConfig(); diff --git a/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm new file mode 100644 index 000000000000..372fc53622f7 --- /dev/null +++ b/torch/csrc/jit/backends/coreml/observer/PTMCoreMLObserver.mm @@ -0,0 +1,8 @@ +#import + +PTMCoreMLObserverConfig::PTMCoreMLObserverConfig() : observer_{nullptr} {} + +PTMCoreMLObserverConfig& coreMLObserverConfig() { + static PTMCoreMLObserverConfig global_instance; + return global_instance; +} diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp index 7d9dc18c1258..ba4a2b25c23a 100644 --- a/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_lib.cpp @@ -31,7 +31,7 @@ class NnapiBackend : public PyTorchBackendInterface { c10::impl::GenericDict compile( c10::IValue processed, c10::impl::GenericDict method_compile_spec) override { - // Wrap procesed in dictionary: {"forward": processed} + // Wrap processed in dictionary: {"forward": processed} auto dict = processed.toGenericDict(); c10::Dict handles( c10::StringType::get(), c10::AnyType::get()); @@ -64,7 +64,7 @@ class NnapiBackend : public PyTorchBackendInterface { auto inp_mem_fmts = dict.at("inp_mem_fmts").toIntList(); TORCH_CHECK(tensorInp.size() == inp_mem_fmts.size()); std::vector fixed_inputs; - for (int i = 0; i < tensorInp.size(); i++) { + for (auto i = 0U; i < tensorInp.size(); i++) { int fmt = inp_mem_fmts[i]; // These constants match the 
values in DimOrder in serializer.py // 0: NCHW, 1: NHWC @@ -84,7 +84,7 @@ class NnapiBackend : public PyTorchBackendInterface { // Adjust output memory formats auto out_mem_fmts = dict.at("out_mem_fmts").toIntList(); TORCH_CHECK(outputs.size() == out_mem_fmts.size()); - for (int i = 0; i < outputs.size(); i++) { + for (auto i = 0U; i < outputs.size(); i++) { int fmt = out_mem_fmts[i]; // These constants match the values in DimOrder in serializer.py // 0: NCHW, 1: NHWC diff --git a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp index be0dbe18d90d..a787ecc6cbfd 100644 --- a/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp +++ b/torch/csrc/jit/backends/nnapi/nnapi_backend_preprocess.cpp @@ -96,7 +96,7 @@ c10::IValue preprocess( // transform Python lists to C++ c10::List c10::List weights( py::cast>(nnapi_processed[2])); - for (int i = 0; i < weights.size(); i++) { + for (auto i = 0U; i < weights.size(); i++) { weights.set(i, weights.get(i).contiguous()); } c10::List inp_mem_fmts( diff --git a/torch/csrc/jit/codegen/cuda/README.md b/torch/csrc/jit/codegen/cuda/README.md new file mode 100644 index 000000000000..0ea084905cc1 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/README.md @@ -0,0 +1,239 @@ +# NVFuser - A Fusion Code Generator for NVIDIA GPUs +_NVFuser is integrated as a backend for TorchScript's Profiling Graph Executor_ + +## Enabling NVFuser +_NVFuser is not currently the default fuser for NVIDIA GPUs._ + +**Fusions will only show up around the ~3rd iteration of execution; the exact number depends on the profiling executor's optimization phases.** + +### Enable by Context Manager + +``` +jit_model = torch.jit.script(model) + +with torch.jit.fuser("fuser2") : + for _ in range(5) : + outputs = jit_model(inputs) +``` + +### Enable by Specific Functions + +1. Disable CPU/GPU fusion for the native/nnc fusers +``` +torch._C._jit_override_can_fuse_on_cpu(False) +torch._C._jit_override_can_fuse_on_gpu(False) +``` +2. Disable the nnc fuser +``` +torch._C._jit_set_texpr_fuser_enabled(False) +``` +3. Enable nvfuser +``` +torch._C._jit_set_nvfuser_enabled(True) +``` + +## Simple knobs to change fusion behavior + +1. Allow single-node fusion: `torch._C._jit_set_nvfuser_single_node_mode(True)` +A fusion group is normally only created when two or more compatible ops are grouped together. Turning on single-node fusion allows the fusion pass to create a fusion group with a single node; this is very handy for testing and can be useful when a single-node generated kernel outperforms the framework's native CUDA kernels. + +2. Allow horizontal fusion: `torch._C._jit_set_nvfuser_horizontal_mode(True)` +The fusion pass fuses producers to consumers; horizontal mode additionally allows sibling nodes that share a tensor input to be fused together, which can save input memory bandwidth. + +3. Turn off the guard for fusion: `torch._C._jit_set_nvfuser_guard_mode(False)` +This disables the runtime check of fusion-group pre-assumptions (tensor meta information / constant inputs / profiled constants). It is really only meant for testing, since we want to ensure generated kernels are indeed checked, and you should avoid using it in training scripts. + +4. Turn off fusion for certain node kinds: `torch._C._jit_set_nvfuser_skip_node_kind("aten::add", True)` +This disables fusion for certain nodes, but allows other nodes to continue being fused. The first parameter is the node kind, and the second parameter is whether to toggle the node on or off in fusion. A combined usage sketch is shown after this list.
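+
+The snippet below is a minimal sketch of how the knobs above can be combined in a script. It assumes a CUDA-capable device; the toy `forward` function and the tensor shape are illustrative only, and only the functions listed in this section are used.
+
+```
+import torch
+
+def forward(x):
+    return (x + 1.0).relu()
+
+jit_model = torch.jit.script(forward)
+inputs = torch.rand(2, 32, 128, 512, device="cuda")
+
+# Allow fusion groups that contain a single node (handy for testing).
+torch._C._jit_set_nvfuser_single_node_mode(True)
+# Allow sibling nodes that share a tensor input to be fused horizontally.
+torch._C._jit_set_nvfuser_horizontal_mode(True)
+
+with torch.jit.fuser("fuser2"):
+    # Run several iterations so the profiling executor's optimizations kick in.
+    for _ in range(5):
+        outputs = jit_model(inputs)
+
+# Inspect the optimized graph and look for prim::CudaFusionGroup nodes.
+print(jit_model.graph_for(inputs))
+```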
+ +## Fusion Debugging + +Given the following script as an example: + +``` +import torch + +def forward(x): + o = x + 1.0 + o = o.relu() + return o + +shape = (2, 32, 128, 512) +input = torch.rand(*shape).cuda() +t = torch.jit.script(forward) + +with torch.jit.fuser("fuser2"): + for k in range(4): + o = t(input) +``` + +### TorchScript Based Debugging + +#### 1. TorchScript IR Graph + +##### Usage + +There are two easy ways to check fusion for a graph. The first is to print out the graph in the Python script after a few runs (so that optimization has kicked in). + +`print(t.graph_for(input))` + +The second is to turn on graph dumping in the profiling executor via the command line below: + +``` +PYTORCH_JIT_LOG_LEVEL="profiling_graph_executor_impl" python your_script.py +``` + +##### Example Output + +The graph printout is straightforward: look for `prim::CudaFusionGroup_X` for fused kernels. The profiling executor dumps many things, but the most important part is the `Optimized Graph`. In this example it shows a Fusion Group, which is an indication that fusion is happening and you should expect a fused kernel! + +``` + Optimized Graph: + graph(%x.1 : Tensor): + %12 : bool = prim::CudaFusionGuard[types=[Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)]](%x.1) + %11 : Tensor = prim::If(%12) + block0(): + %o.8 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%x.1) + -> (%o.8) + block1(): + %18 : Function = prim::Constant[name="fallback_function", fallback=1]() + %19 : (Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)) = prim::CallFunction(%18, %x.1) + %20 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = prim::TupleUnpack(%19) + -> (%20) + return (%11) + with prim::CudaFusionGroup_0 = graph(%2 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)): + %4 : int = prim::Constant[value=1]() + %3 : float = prim::Constant[value=1.]() # test.py:6:12 + %o.1 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::add(%2, %3, %4) # test.py:6:8 + %o.5 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.1) # test.py:7:8 + return (%o.5) +``` + +Note that one thing that can prevent fusion when you are running training is autodiff. The fusion pass only runs within `prim::DifferentiableGraph`, so the first thing to check is that the targeted ops are inside differentiable graph subgraphs. +The graph dump can be quite confusing to look at, since it naively dumps all graphs executed by the profiling executor, and differentiable graphs are executed via a nested graph executor. So for each graph, you might see a few segmented `Optimized Graph` sections, each of which corresponds to a differentiable node in the original graph. + +#### 2. Cuda Fusion Graphs + +##### Usage + +The CUDA fusion dump gives the input and output graphs of the fusion pass. This is a good place to check the fusion pass logic. + +``` +PYTORCH_JIT_LOG_LEVEL="graph_fuser" python your_script.py +``` + +##### Example Output + +Running the same script as above, look for two graphs in the log: `Before Fusion` shows the subgraph the fusion pass runs on; `Before Compilation` shows the graph sent to the codegen backend, where each `CudaFusionGroup` triggers the codegen runtime system to generate kernel(s) to execute the subgraph.
+ +``` + Before Fusion: + graph(%x.1 : Tensor): + %2 : float = prim::Constant[value=1.]() + %1 : int = prim::Constant[value=1]() + %3 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%x.1) + %o.10 : Tensor = aten::add(%3, %2, %1) # test.py:6:8 + %5 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%o.10) + %o.7 : Tensor = aten::relu(%5) # test.py:7:8 + %7 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%o.7) + %8 : Tensor = prim::profile[profiled_type=Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)](%o.7) + return (%7, %8) + + Before Compilation: + graph(%x.1 : Tensor): + %13 : bool = prim::CudaFusionGuard[types=[Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)]](%x.1) + %12 : Tensor = prim::If(%13) + block0(): + %o.11 : Tensor = prim::CudaFusionGroup_0(%x.1) + -> (%o.11) + block1(): + %o.7 : Tensor = prim::FallbackGraph_1(%x.1) + -> (%o.7) + return (%12, %12) + with prim::CudaFusionGroup_0 = graph(%2 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)): + %4 : int = prim::Constant[value=1]() + %3 : float = prim::Constant[value=1.]() + %o.10 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::add(%2, %3, %4) # test.py:6:8 + %o.7 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.10) # test.py:7:8 + return (%o.7) + with prim::FallbackGraph_1 = graph(%x.1 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0)): + %1 : int = prim::Constant[value=1]() + %2 : float = prim::Constant[value=1.]() + %o.10 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::add(%x.1, %2, %1) # test.py:6:8 + %o.7 : Float(2, 32, 128, 512, strides=[2097152, 65536, 512, 1], requires_grad=0, device=cuda:0) = aten::relu(%o.10) # test.py:7:8 + return (%o.7) +``` + +### General ideas of debugging no-fusion + +Currently there are a few consumers that use nvfuser by lowering computations to TorchScript and executing them through a ProfilingExecutor. + +Without going into too much detail about how the integration is done, here are a few notes on debugging no-fusion with the ProfilingExecutor: + +1. Run the TorchScript module multiple times (5 could be a lucky number) to enable fusion. + Because the ProfilingExecutor takes the first (few) runs for profiling, later optimization (including the fusion pass that enables nvfuser) relies on profiling information, so your initial runs are not going to trigger fused kernels. + Note that the number of profiling runs depends on your model. + +2. Fused kernels show up in TorchScript IR as `prim::CudaFusionGroup`. You can look at your optimized TorchScript graph to see if fusion is happening: `jit_model.graph_for(*inputs)`. + +3. If your scripted model has inputs requiring gradients, fusion only happens for graphs inside `prim::DifferentiableGraph`. + There are many reasons why your graph might not be autodiff-able. Take a look at `/torch/csrc/jit/runtime/symbolic_scripts.cpp`, which lists all autodiff-able ops (note that this is a different list from autograd-supported ops).
There's also a threshold where tiny autodiff graphs are inlined/reverted; this can be disabled via `torch._C._debug_set_autodiff_subgraph_inlining(False)`. + +### General ideas of debugging nvfuser malfunctions + +Assuming things on the ProfilingExecutor side worked out properly, that is, you see a region that is supposed to be fused but did not end up in a fused kernel, here are some ways to dig deeper: + +1. Dump the fusion pass result: + `PYTORCH_JIT_LOG_LEVEL=graph_fuser python your_script.py &> log` + + Look for the graphs dumped with `Before Fusion` & `Before Compilation`, which show the portion of the graph the fusion pass runs on and the result of fusion (`CudaFusionGroup`). + +2. Check which ops are not fused and roughly why: + `PYTORCH_JIT_LOG_LEVEL=">partition:graph_fuser" python your_script.py &> log` + + Enabling GRAPH_UPDATE from partition.cpp dumps a log when a given node is rejected by fusion. + +3. Disable the FALLBACK path: + If you see a warning that a FALLBACK path has been taken while executing your model with nvfuser enabled, it indicates that either codegen or the fusion pass has failed unexpectedly. This is likely to cause a regression in model performance, even though the result is still functionally correct. We recommend disabling the FALLBACK path so the error is reported properly and you can open an informative issue. + + `PYTORCH_NVFUSER_DISABLE=fallback python your_script.py &> log` + +4. Pinpoint the kernel/fusion pattern that's causing the error: + With a larger model that includes multiple fusion patterns, it can be tricky to figure out which exact fusion is causing the FALLBACK and to build a minimal Python repro. + One quick thing to try is to run the example with a few knobs turned on: + + ``` + PYTORCH_NVFUSER_DISABLE=fallback \ + PYTORCH_JIT_LOG_LEVEL=">partition:graph_fuser:>>kernel_cache" \ + python your_script.py &> log + ``` + + This logs all TorchScript IR parsed to codegen IR as well as the kernels generated and executed by nvfuser. Since the fallback path is disabled, the last log entry is likely to indicate the failing fusion. + + Hint: look for the last `Before Compilation:`, which indicates a parsing failure, or `running GraphCache: xxxxx`, which indicates a jit compilation/execution failure (also search for the GraphCache address, which should have dumped a TorchScript IR earlier). + +### Query nvfuser codegen kernels + +There are a few debug dumps that can be turned on via environment variables. Look for `PYTORCH_NVFUSER_DUMP` inside `[pytorch_source_path]/torch/csrc/jit/codegen/cuda/utils.cpp`. A few useful ones are: +1. `dump_eff_bandwidth`: prints the effective bandwidth of each generated kernel. This naively measures the kernel time divided by the I/O buffer size, and is a good, simple metric of performance for bandwidth-bound kernels +2. `cuda_kernel`: prints the generated CUDA kernels +3. `launch_param`: prints the launch configuration of generated kernels +4. `print_args`: prints the input and output tensors of executed codegen kernels + +### FAQs + +1. There's a regression after turning on nvfuser. + +The first thing is to check that you have fused kernels running properly. Try running your model with the fallback disabled, via `export PYTORCH_NVFUSER_DISABLE=fallback`, to see if you hit any errors that caused the fallback.
+ +If turning on NVFuser produces unexpected outputs, set the `PYTORCH_NVFUSER_DISABLE` environment variable to disable some of the optional features, e.g.: +- `fma`: disable using FMA instructions +- `index_hoist`: disable the optimization that hoists common index expressions +- `predicate_elimination`: disable the optimization that eliminates redundant predicates +- `unroll_with_rng`: disable unrolling when RNG is used + +For example, `export PYTORCH_NVFUSER_DISABLE=fma,index_hoist` would disable FMA and index hoisting. + +2. I didn't see any speedup with nvfuser. + +Check whether there is fusion in your scripted model. Run your script with `PYTORCH_JIT_LOG_LEVEL="graph_fuser"`; you should see some log dump of the before/after graphs for the fusion pass. If nothing shows up in the log, something in TorchScript is not right and the fusion pass is not executed. Check [General ideas of debugging no-fusion] for more details. diff --git a/torch/csrc/jit/codegen/cuda/arith.cpp b/torch/csrc/jit/codegen/cuda/arith.cpp index 2c9925cf8933..d8a9fc9751b9 100644 --- a/torch/csrc/jit/codegen/cuda/arith.cpp +++ b/torch/csrc/jit/codegen/cuda/arith.cpp @@ -1,8 +1,11 @@ #include +#include #include +#include #include #include +#include #include #include #include @@ -23,14 +26,18 @@ Val* newScalar(ValType vtype, DataType dtype) { case (ValType::Scalar): switch (dtype) { case DataType::Bool: - return new Bool(); + return IrBuilder::create(); case DataType::Double: case DataType::Float: case DataType::Half: case DataType::BFloat16: - return new Double(); + return IrBuilder::create(); + case DataType::Int32: case DataType::Int: - return new Int(); + return IrBuilder::create(); + case DataType::ComplexFloat: + case DataType::ComplexDouble: + return IrBuilder::create(); default: break; } @@ -103,10 +110,10 @@ TensorView* newOutputTV(const std::vector& vals, DataType dtype) { } for (const auto dim_i : c10::irange(out_domain.size())) { if (extent_vals[dim_i] != nullptr) { - out_domain[dim_i] = new IterDomain( - new Int(start_offsets[dim_i]), + out_domain[dim_i] = IrBuilder::create( + IrBuilder::create(start_offsets[dim_i]), extent_vals[dim_i], - new Int(stop_offsets[dim_i]), + IrBuilder::create(stop_offsets[dim_i]), ParallelType::Serial, iter_types[dim_i]); } else { @@ -121,13 +128,17 @@ TensorView* newOutputTV(const std::vector& vals, DataType dtype) { break; } } - out_domain[dim_i] = - new IterDomain(new Int(0), new Int(1), ParallelType::Serial, itype); + out_domain[dim_i] = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal(), + ParallelType::Serial, + itype); } } - return new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + return IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), dtype); } @@ -177,11 +188,84 @@ Val* newValLike(Val* val, DataType dtype) { return newScalar(vtype, dtype); } +// returns the minimum init value for reduction: +// -inf for floating type; +// lowest value for integer type; +// false for bool.
+Val* getMinimumValue(DataType v) { + switch (v) { + case (DataType::Double): + return IrBuilder::create( + -std::numeric_limits::infinity()); + break; + case (DataType::Float): + return IrBuilder::create(-std::numeric_limits::infinity()); + break; + case (DataType::Half): + return IrBuilder::create( + static_cast(-std::numeric_limits::infinity())); + break; + case DataType::BFloat16: + return IrBuilder::create( + static_cast(-std::numeric_limits::infinity())); + break; + case (DataType::Int): + return IrBuilder::create(std::numeric_limits::lowest()); + break; + case (DataType::Int32): + return IrBuilder::create(std::numeric_limits::lowest()); + break; + case (DataType::Bool): + return IrBuilder::create(false); + break; + default: + TORCH_CHECK( + false, "Could not generate a min op for tensor with type: ", v); + } + return nullptr; +} + +// returns the maximum init value for reduction: +// inf for floating type; +// highest value for integer type; +// true for bool. +Val* getMaximumValue(DataType v) { + switch (v) { + case (DataType::Double): + return IrBuilder::create(std::numeric_limits::infinity()); + break; + case (DataType::Float): + return IrBuilder::create(std::numeric_limits::infinity()); + break; + case (DataType::Half): + return IrBuilder::create( + static_cast(std::numeric_limits::infinity())); + break; + case DataType::BFloat16: + return IrBuilder::create( + static_cast(std::numeric_limits::infinity())); + break; + case (DataType::Int): + return IrBuilder::create(std::numeric_limits::max()); + break; + case (DataType::Int32): + return IrBuilder::create(std::numeric_limits::max()); + break; + case (DataType::Bool): + return IrBuilder::create(true); + break; + default: + TORCH_CHECK( + false, "Could not generate a max op for tensor with type: ", v); + } + return nullptr; +} + } // namespace Val* castOp(DataType dtype, Val* v1) { if (v1->getDataType().value() == dtype) { - return v1; + return set(v1); } if (cast_func_str(std::make_pair(v1->getDataType().value(), dtype)) == @@ -195,7 +279,7 @@ Val* castOp(DataType dtype, Val* v1) { } Val* out = newValLike(v1, dtype); - new UnaryOp(UnaryOpType::Cast, out, v1); + IrBuilder::create(UnaryOpType::Cast, out, v1); return out; } @@ -203,6 +287,24 @@ TensorView* castOp(DataType dtype, TensorView* v1) { return castOp(dtype, v1->as())->as(); } +Val* bitCastOp(DataType dtype, Val* v1) { + if (v1->getDataType().value() == dtype) { + return v1; + } + + TORCH_CHECK( + dataTypeSize(v1->getDataType().value()) == dataTypeSize(dtype), + "BitCast only works for types of the same size"); + + Val* out = newValLike(v1, dtype); + IrBuilder::create(UnaryOpType::BitCast, out, v1); + return out; +} + +TensorView* bitCastOp(DataType dtype, TensorView* v1) { + return bitCastOp(dtype, v1->as())->as(); +} + Val* unaryOp(UnaryOpType type, Val* v1) { TORCH_INTERNAL_ASSERT( type != UnaryOpType::Address, @@ -219,7 +321,7 @@ Val* unaryOp(UnaryOpType type, Val* v1) { // } Val* out = newValLike(v1, v1->getDataType().value()); - new UnaryOp(type, out, v1); + IrBuilder::create(type, out, v1); return out; } @@ -227,17 +329,27 @@ TensorView* unaryOp(UnaryOpType type, TensorView* v1) { return unaryOp(type, v1->as())->as(); } +Val* unaryIsOp(UnaryOpType type, Val* v) { + Val* out = newValLike(v, DataType::Bool); + IrBuilder::create(type, out, v); + return out; +} + +TensorView* unaryIsOp(UnaryOpType type, TensorView* v) { + return unaryOp(type, v->asVal())->as(); +} + Val* unaryOp(UnaryOpType type, Val* v1, const TypePromotionConfig& config) { - auto casted_v1 = 
promoteValues(config, {v1}).front(); - return unaryOp(type, casted_v1); + auto cast_v1 = promoteValues(config, {v1}).front(); + return unaryOp(type, cast_v1); } TensorView* unaryOp( UnaryOpType type, TensorView* v1, const TypePromotionConfig& config) { - auto casted_v1 = promoteValues(config, {v1}).front(); - return unaryOp(type, casted_v1)->as(); + auto cast_v1 = promoteValues(config, {v1}).front(); + return unaryOp(type, cast_v1)->as(); } // UNARY OPERATIONS @@ -252,12 +364,9 @@ TensorView* unaryOp( NVFUSER_DEFINE_UNARY_OP(set, Set) NVFUSER_DEFINE_UNARY_OP(randlike, RandLike) -NVFUSER_DEFINE_UNARY_OP(abs, Abs) -NVFUSER_DEFINE_UNARY_OP(notOp, Not) NVFUSER_DEFINE_UNARY_OP(ceil, Ceil) NVFUSER_DEFINE_UNARY_OP(floor, Floor) NVFUSER_DEFINE_UNARY_OP(frac, Frac) -NVFUSER_DEFINE_UNARY_OP(gelu, Gelu) NVFUSER_DEFINE_UNARY_OP(neg, Neg) NVFUSER_DEFINE_UNARY_OP(relu, Relu) NVFUSER_DEFINE_UNARY_OP(round, Round) @@ -265,6 +374,41 @@ NVFUSER_DEFINE_UNARY_OP(silu, Silu) NVFUSER_DEFINE_UNARY_OP(trunc, Trunc) #undef NVFUSER_DEFINE_UNARY_OP +Val* bitwise_not(Val* v) { + TORCH_CHECK( + isIntegralType(v->dtype()) || v->dtype() == DataType::Bool, + "input must have integral or boolean type, but got ", + v->dtype()); + return unaryOp(UnaryOpType::Not, v); +} + +TensorView* bitwise_not(TensorView* tv) { + TORCH_CHECK( + isIntegralType(tv->dtype()) || tv->dtype() == DataType::Bool, + "input must have integral or boolean type, but got ", + tv->dtype()); + return unaryOp(UnaryOpType::Not, tv); +} + +// The output of abs(complex_tensor) are real numbers +Val* abs(Val* v) { + if (v->getDataType() == DataType::ComplexDouble) { + Val* out = newValLike(v, DataType::Double); + IrBuilder::create(UnaryOpType::Abs, out, v); + return out; + } + if (v->getDataType() == DataType::ComplexFloat) { + Val* out = newValLike(v, DataType::Float); + IrBuilder::create(UnaryOpType::Abs, out, v); + return out; + } + return unaryOp(UnaryOpType::Abs, v); +} + +TensorView* abs(TensorView* tv) { + return abs(tv->as())->as(); +} + // UNARY FLOAT CAST OPERATIONS #define NVFUSER_DEFINE_UNARY_FLOAT_OP(op_name, op_type) \ @@ -300,6 +444,22 @@ NVFUSER_DEFINE_UNARY_FLOAT_OP(tan, Tan) NVFUSER_DEFINE_UNARY_FLOAT_OP(tanh, Tanh) #undef NVFUSER_DEFINE_UNARY_FLOAT_OP +#define NVFUSER_DEFINE_UNARY_IS_OP(op_name, op_type) \ + Val* op_name(Val* v) { \ + return unaryIsOp(UnaryOpType::op_type, v); \ + } \ + TensorView* op_name(TensorView* tv) { \ + return unaryIsOp(UnaryOpType::op_type, tv); \ + } + +NVFUSER_DEFINE_UNARY_IS_OP(isfinite, IsFinite) +NVFUSER_DEFINE_UNARY_IS_OP(isinf, IsInf) +NVFUSER_DEFINE_UNARY_IS_OP(isnan, IsNan) +NVFUSER_DEFINE_UNARY_IS_OP(isneginf, IsNegInf) +NVFUSER_DEFINE_UNARY_IS_OP(isposinf, IsPosInf) +NVFUSER_DEFINE_UNARY_IS_OP(isreal, IsReal) +#undef NVFUSER_DEFINE_UNARY_IS_OP + // BINARY OPERATIONS namespace { @@ -379,7 +539,7 @@ Val* binaryOp(BinaryOpType type, Val* v1, Val* v2, DataType common_dtype) { } else { out = newScalar(out_vtype, out_dtype); } - new BinaryOp(type, out, vals[0], vals[1]); + IrBuilder::create(type, out, vals[0], vals[1]); return out; } @@ -414,9 +574,8 @@ Val* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); - return binaryOp( - type, casted_values.front(), casted_values.back(), common_dtype); + auto cast_values = promoteValues(operands, common_dtype); + return binaryOp(type, cast_values.front(), cast_values.back(), common_dtype); } TensorView* binaryOp( @@ -426,11 
+585,11 @@ TensorView* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); + auto cast_values = promoteValues(operands, common_dtype); return binaryOp( type, - casted_values.front()->as(), - casted_values.back(), + cast_values.front()->as(), + cast_values.back(), common_dtype); } @@ -441,11 +600,11 @@ TensorView* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); + auto cast_values = promoteValues(operands, common_dtype); return binaryOp( type, - casted_values.front(), - casted_values.back()->as(), + cast_values.front(), + cast_values.back()->as(), common_dtype); } @@ -456,11 +615,11 @@ TensorView* binaryOp( const TypePromotionConfig& config) { std::vector operands = {v1, v2}; auto common_dtype = computeTypes(config, operands); - auto casted_values = promoteValues(operands, common_dtype); + auto cast_values = promoteValues(operands, common_dtype); return binaryOp( type, - casted_values.front()->as(), - casted_values.back()->as(), + cast_values.front()->as(), + cast_values.back()->as(), common_dtype); } @@ -507,20 +666,111 @@ NVFUSER_DEFINE_BINARY_FLOAT_OP(atan2, Atan2) // Integer binary ops NVFUSER_DEFINE_BINARY_CAST_OP(mod, Mod) NVFUSER_DEFINE_BINARY_CAST_OP(ceilDiv, CeilDiv) - NVFUSER_DEFINE_BINARY_CAST_OP(add, Add) NVFUSER_DEFINE_BINARY_CAST_OP(fmod, Fmod) NVFUSER_DEFINE_BINARY_CAST_OP(mul, Mul) NVFUSER_DEFINE_BINARY_CAST_OP(pow, Pow) NVFUSER_DEFINE_BINARY_CAST_OP(remainder, Remainder) NVFUSER_DEFINE_BINARY_CAST_OP(sub, Sub) -NVFUSER_DEFINE_BINARY_CAST_OP(lshift, Lshift) -NVFUSER_DEFINE_BINARY_CAST_OP(rshift, Rshift) -NVFUSER_DEFINE_BINARY_CAST_OP(andOp, And) -NVFUSER_DEFINE_BINARY_CAST_OP(orOp, Or) -NVFUSER_DEFINE_BINARY_CAST_OP(xorOp, Xor) #undef NVFUSER_DEFINE_BINARY_CAST_OP +#define NVFUSER_DEFINE_BITWISE_OP(op_name, op_type) \ + Val* op_name(Val* v1, Val* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, Val* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(Val* v1, TensorView* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, TensorView* v2) { \ + TORCH_CHECK( \ + (isIntegralType(v1->dtype()) || v1->dtype() == DataType::Bool) && \ + (isIntegralType(v2->dtype()) || v2->dtype() == DataType::Bool), \ + "input must have integral or boolean type, but got ", \ + v1->dtype(), \ + " 
and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } + +NVFUSER_DEFINE_BITWISE_OP(bitwise_and, And) +NVFUSER_DEFINE_BITWISE_OP(bitwise_or, Or) +NVFUSER_DEFINE_BITWISE_OP(bitwise_xor, Xor) +#undef NVFUSER_DEFINE_BITWISE_OP + +#define NVFUSER_DEFINE_BITWISE_SHIFT_OP(op_name, op_type) \ + Val* op_name(Val* v1, Val* v2) { \ + TORCH_CHECK( \ + isIntegralType(v1->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, Val* v2) { \ + TORCH_CHECK( \ + isIntegralType(v1->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(Val* v1, TensorView* v2) { \ + TORCH_CHECK( \ + isIntegralType(v2->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } \ + TensorView* op_name(TensorView* v1, TensorView* v2) { \ + TORCH_CHECK( \ + isIntegralType(v1->dtype()) && isIntegralType(v2->dtype()), \ + "input must have integral type, but got ", \ + v1->dtype(), \ + " and ", \ + v2->dtype()); \ + return binaryOp( \ + BinaryOpType::op_type, v1, v2, TypePromotion::default_op_config); \ + } + +NVFUSER_DEFINE_BITWISE_SHIFT_OP(bitwise_left_shift, Lshift) +NVFUSER_DEFINE_BITWISE_SHIFT_OP(bitwise_right_shift, Rshift) +#undef NVFUSER_DEFINE_BITWISE_SHIFT_OP + #define NVFUSER_DEFINE_BINARY_COMPARE_OP(op_name, op_type) \ Val* op_name(Val* v1, Val* v2) { \ return binaryOp( \ @@ -589,7 +839,7 @@ static TensorView* newForReduction( " of tensor ", tv); - new_domain.push_back(new IterDomain( + new_domain.push_back(IrBuilder::create( id->start(), id->extent(), id->stopOffset(), @@ -597,12 +847,12 @@ static TensorView* newForReduction( isReduction ? IterType::Reduction : id->getIterType())); } - TensorDomain* td = - new TensorDomain(new_domain, std::vector(new_domain.size(), true)); + TensorDomain* td = IrBuilder::create( + new_domain, std::vector(new_domain.size(), true)); data_type = data_type == DataType::Null ? 
tv->getDataType().value() : data_type; - return new TensorView(td, data_type); + return IrBuilder::create(td, data_type); } TensorView* reductionOp( @@ -610,7 +860,8 @@ TensorView* reductionOp( const std::vector& axes, Val* init, TensorView* tv, - bool keep_dim /*=false*/) { + bool keep_dim /*=false*/, + DataType dtype /* DataType::Null */) { TORCH_CHECK( init->isConstScalar(), "Cannot create a reduction operation where the initial value is not a const scalar."); @@ -641,21 +892,22 @@ TensorView* reductionOp( uint_axes.push_back((unsigned int)axis); } - TensorView* out = newForReduction(tv, uint_axes); + TensorView* out = newForReduction(tv, uint_axes, dtype); const auto out_type = out->getDataType().value(); const auto init_type = init->getDataType().value(); TORCH_CHECK( (isFloatingPointType(out_type) && isFloatingPointType(init_type)) || + (isComplexType(out_type) && isComplexType(init_type)) || (isIntegralType(out_type) && isIntegralType(init_type)) || - (out_type == DataType::Bool && init_type == DataType::Bool), + (isBooleanType(out_type) && isBooleanType(init_type)), "Types should match for reduction ops but received: ", out_type, " and ", init_type); - new ReductionOp(reduction_op_type, init, out, tv); + IrBuilder::create(reduction_op_type, init, out, tv); if (keep_dim) { - auto tv_root = TensorDomain::noReductions(tv->getRootDomain()); + auto tv_root = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); std::vector is_broadcast(tv_root.size(), false); for (auto axis : uint_axes) { is_broadcast.at(axis) = true; @@ -669,45 +921,44 @@ TensorView* reductionOp( TensorView* sum( TensorView* v1, const std::vector& axes, - bool keep_dim /*=false*/) { + bool keep_dim /*=false*/, + DataType dtype /* DataType::Null */) { + if (dtype == DataType::Null) { + auto initial_v1_dtype = v1->getDataType().value(); + if (isBooleanType(initial_v1_dtype) || isIntegralType(initial_v1_dtype)) { + dtype = DataType::Int; + } + } + + // Cast input tensor to dtype before the operation is performed + if (dtype != DataType::Null) { + v1 = optionalCastStrict(dtype, v1)->as(); + } + Val* init = nullptr; - auto dtype = v1->getDataType().value(); - if (isFloatingPointType(dtype)) { - init = new Double(0.0); - } else if (isIntegralType(dtype)) { - init = new Int(0); + auto v1_dtype = v1->getDataType().value(); + if (isFloatingPointType(v1_dtype)) { + init = IrBuilder::create(0.0); + } else if (isComplexType(v1_dtype)) { + init = IrBuilder::create(c10::complex(0.0, 0.0)); + } else if (isIntegralType(v1_dtype)) { + init = FusionGuard::getCurFusion()->zeroVal(); + } else if (isBooleanType(v1_dtype)) { + init = IrBuilder::create(false); } else { TORCH_CHECK( - false, - "Could not generate a sum op for tensor with type: ", - v1->getDataType().value()); + false, "Could not generate a sum op for tensor with type: ", v1_dtype); } - return reductionOp(BinaryOpType::Add, axes, init, v1, keep_dim); + return reductionOp(BinaryOpType::Add, axes, init, v1, keep_dim, dtype); } TensorView* max( TensorView* v1, const std::vector& axes, bool keep_dim /*=false*/) { - Val* init = nullptr; - switch (v1->getDataType().value()) { - case (DataType::Double): - init = new Double(std::numeric_limits::lowest()); - break; - case (DataType::Float): - init = new Double(std::numeric_limits::lowest()); - break; - case (DataType::Int): - init = new Int(INT_MIN); - break; - default: - TORCH_CHECK( - false, - "Could not generate a max op for tensor with type: ", - v1->getDataType().value()); - } - + Val* init = 
getMinimumValue(v1->getDataType().value()); + TORCH_CHECK(init != nullptr, "Missing initial value"); return reductionOp(BinaryOpType::Max, axes, init, v1, keep_dim); } @@ -715,24 +966,8 @@ TensorView* min( TensorView* v1, const std::vector& axes, bool keep_dim /*=false*/) { - Val* init = nullptr; - switch (v1->getDataType().value()) { - case (DataType::Double): - init = new Double(DBL_MAX); - break; - case (DataType::Float): - init = new Double(FLT_MAX); - break; - case (DataType::Int): - init = new Int(INT_MAX); - break; - default: - TORCH_CHECK( - false, - "Could not generate a min op for tensor with type: ", - v1->getDataType().value()); - } - + Val* init = getMaximumValue(v1->getDataType().value()); + TORCH_CHECK(init != nullptr, "Missing initial value"); return reductionOp(BinaryOpType::Min, axes, init, v1, keep_dim); } @@ -742,9 +977,12 @@ TensorView* broadcast( auto nBCastDims = is_broadcast_dim.size(); // Validate is_broadcast_dim unsigned int n_broadcasts = 0; - for (auto ent : is_broadcast_dim) - if (ent) + for (auto ent : is_broadcast_dim) { + if (ent) { n_broadcasts++; + } + } + TORCH_CHECK( nBCastDims - n_broadcasts == TensorDomain::noReductions(inp->getMaybeRFactorDomain()).size(), @@ -767,22 +1005,28 @@ TensorView* broadcast( size_t iinp = 0, ibdim = 0; while (ibdim < is_broadcast_dim.size()) { if (is_broadcast_dim[ibdim]) { - out_domain.push_back(new IterDomain( - new Int(0), - new Int(1), + out_domain.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal(), ParallelType::Serial, IterType::BroadcastWithoutStride)); } else { - out_domain.push_back(inp_domain[iinp]->clone()); + out_domain.push_back(IrBuilder::create( + inp_domain[iinp]->start(), + inp_domain[iinp]->extent(), + inp_domain[iinp]->stopOffset(), + inp_domain[iinp]->getParallelType(), + inp_domain[iinp]->getIterType())); iinp++; } ibdim++; } - TensorView* out_tensor = new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + TensorView* out_tensor = IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), inp->getDataType().value()); - new BroadcastOp(out_tensor, inp, is_broadcast_dim); + IrBuilder::create(out_tensor, inp, is_broadcast_dim); return out_tensor; } @@ -799,6 +1043,10 @@ WelfordResult Welford( TORCH_CHECK(tv->nDims() > 0, "Tried to reduce a 0-dim tensor"); TORCH_CHECK(axes.size() > 0, "No reduction axis specified"); + if (init_N == nullptr) { + init_N = FusionGuard::getCurFusion()->zeroVal(); + } + // Initial values for welford op are tensors, so their dims have to match the // output dim, // i.e. 
original_dims - dims_to_be_reduced @@ -819,8 +1067,8 @@ WelfordResult Welford( init_avg_val = init_avg; init_var_val = init_var; } else { - init_avg_val = new Double(0); - init_var_val = new Double(0); + init_avg_val = IrBuilder::create(0); + init_var_val = IrBuilder::create(0); } // Check and collect reduction axes @@ -845,9 +1093,9 @@ WelfordResult Welford( // Create tensor outputs TensorView* out_avg = newForReduction(tv, uint_axes); TensorView* out_var = newForReduction(tv, uint_axes); - TensorView* out_N = newForReduction(tv, uint_axes, DataType::Int); + TensorView* out_N = newForReduction(tv, uint_axes, DataType::Index); - new WelfordOp( + IrBuilder::create( out_avg, out_var, out_N, /*out var/avg/count */ @@ -855,8 +1103,8 @@ WelfordResult Welford( init_var_val, init_N, /*init var/avg/count */ tv, - nullptr, - new Int(1)); /*in var/avg/count */ + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal()); /*in var/avg/count */ return WelfordResult(out_avg, out_var, out_N); } @@ -872,26 +1120,28 @@ WelfordResult::WelfordResult( WelfordResult WelfordResult::rFactor(const std::vector& axes) { auto o_tv = avg->definition()->as()->out()->as(); - return o_tv->rFactor(axes, avg, var_sum, n); + auto rf_tvs = o_tv->rFactor(axes, std::vector{avg, var_sum, n}); + return WelfordResult{rf_tvs.at(0), rf_tvs.at(1), rf_tvs.at(2)}; } TensorView* transpose( TensorView* inp, const std::unordered_map& old2new) { - auto inp_domain = TensorDomain::noReductions(inp->getRootDomain()); + auto inp_domain = TensorDomain::noReductions(inp->getMaybeRFactorDomain()); std::vector out_domain(inp_domain.size()); auto new2old = ir_utils::normalizeOld2New(old2new, inp_domain.size()); for (const auto i : c10::irange(out_domain.size())) { auto in_id = inp_domain[new2old[i]]; - out_domain[i] = in_id->clone(); + out_domain[i] = in_id->cloneWithoutRFactor(); } - TensorView* out_tensor = new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + TensorView* out_tensor = IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), inp->getDataType().value()); - new TransposeOp(out_tensor, inp, new2old); + IrBuilder::create(out_tensor, inp, new2old); return out_tensor; } @@ -904,7 +1154,10 @@ Val* add_alpha(Val* v1, Val* v2, Val* s) { "Alpha value should be a Scalar Valtype and not ", s->getValType().value()); - auto vals = maybeBroadcast({v1, v2, s}); + std::vector operands = {v1, v2}; + auto common_dtype = computeTypes(TypePromotion::default_op_config, operands); + auto cast_values = promoteValues({v1, v2, s}, common_dtype); + auto vals = maybeBroadcast(cast_values); Val* intrm = mul(vals[1], vals[2]); return add(vals[0], intrm); } @@ -924,7 +1177,10 @@ Val* sub_alpha(Val* v1, Val* v2, Val* s) { "Alpha value should be a Scalar Valtype and not ", s->getValType().value()); - auto vals = maybeBroadcast({v1, v2, s}); + std::vector operands = {v1, v2}; + auto common_dtype = computeTypes(TypePromotion::default_op_config, operands); + auto cast_values = promoteValues({v1, v2, s}, common_dtype); + auto vals = maybeBroadcast(cast_values); Val* intrm = mul(vals[1], vals[2]); return sub(vals[0], intrm); } @@ -938,11 +1194,29 @@ TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* v3) { return arithOpOverloads(sub_alpha, v1, v2, v3); } // lerp -TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight) { +Val* lerp(Val* start, Val* end, Val* weight) { + auto cast_values = + promoteValues(TypePromotion::default_op_config, {start, end, weight}); 
+ start = cast_values[0]; + end = cast_values[1]; + weight = cast_values[2]; + + auto out_dtype = + promote_type(start->getDataType().value(), end->getDataType().value()); + auto out_vtype = + promote_type(start->getValType().value(), end->getValType().value()); + auto vals = maybeBroadcast({start, end, weight}); - Val* intrm1 = sub(vals[1], vals[0]); - Val* intrm2 = mul(vals[2], intrm1); - return add(vals[0], intrm2); + Val* out = nullptr; + if (out_vtype == ValType::TensorView) { + out = newOutputTV(vals, out_dtype); + } else { + out = newScalar(out_vtype, out_dtype); + } + + IrBuilder::create( + TernaryOpType::Lerp, out, vals[0], vals[1], vals[2]); + return out; } TensorView* lerp(TensorView* v1, Val* v2, Val* v3) { return arithOpOverloads(lerp, v1, v2, v3); @@ -972,7 +1246,10 @@ Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s) { "Alpha value should be a Scalar Valtype and not ", s->getValType().value()); - auto vals = maybeBroadcast({v1, v2, v3, s}); + std::vector operands = {v1, v2, v3}; + auto common_dtype = computeTypes(TypePromotion::default_op_config, operands); + auto cast_values = promoteValues({v1, v2, v3, s}, common_dtype); + auto vals = maybeBroadcast(cast_values); Val* intrm1 = mul(vals[2], vals[3]); Val* intrm2 = mul(vals[1], intrm1); return add(vals[0], intrm2); @@ -1007,10 +1284,9 @@ Val* where(Val* c, Val* v1, Val* v2) { "Condition should be of DataType Bool, not ", c->getDataType().value()); - auto casted_values = - promoteValues(TypePromotion::default_op_config, {v1, v2}); - v1 = casted_values[0]; - v2 = casted_values[1]; + auto cast_values = promoteValues(TypePromotion::default_op_config, {v1, v2}); + v1 = cast_values[0]; + v2 = cast_values[1]; TORCH_CHECK(c->getDataType().value() == DataType::Bool); auto out_dtype = @@ -1024,7 +1300,8 @@ Val* where(Val* c, Val* v1, Val* v2) { } else { out = newScalar(out_vtype, out_dtype); } - new TernaryOp(TernaryOpType::Where, out, vals[0], vals[1], vals[2]); + IrBuilder::create( + TernaryOpType::Where, out, vals[0], vals[1], vals[2]); return out; } @@ -1064,7 +1341,8 @@ Val* threshold(Val* in, Val* thresh, Val* value) { value = optionalCast(in->getDataType().value(), value); Val* out = newValLike(in, in->getDataType().value()); - new TernaryOp(TernaryOpType::Threshold, out, in, thresh, value); + IrBuilder::create( + TernaryOpType::Threshold, out, in, thresh, value); return out; } @@ -1074,17 +1352,25 @@ TensorView* threshold(TensorView* in, Val* thresh, Val* value) { Val* clamp(Val* in, Val* min_val, Val* max_val) { TORCH_CHECK( - (min_val->getValType().value() == ValType::Scalar || + (min_val == nullptr || min_val->getValType().value() == ValType::Scalar || min_val->getValType().value() == ValType::NamedScalar) && - (max_val->getValType().value() == ValType::Scalar || + (max_val == nullptr || + max_val->getValType().value() == ValType::Scalar || max_val->getValType().value() == ValType::NamedScalar), "For Clamp operation: Min and Max values should be Scalars."); - min_val = optionalCast(in->getDataType().value(), min_val); - max_val = optionalCast(in->getDataType().value(), max_val); - Val* out = newValLike(in, in->getDataType().value()); + min_val = (min_val == nullptr) + ? getMinimumValue(in->getDataType().value()) + : optionalCast(in->getDataType().value(), min_val); + TORCH_CHECK(min_val != nullptr, "Missing minimum value"); - new TernaryOp(TernaryOpType::Clamp, out, in, min_val, max_val); + max_val = (max_val == nullptr) + ? 
getMaximumValue(in->getDataType().value()) + : optionalCast(in->getDataType().value(), max_val); + TORCH_CHECK(max_val != nullptr, "Missing maximum value"); + + Val* out = newValLike(in, in->getDataType().value()); + IrBuilder::create(TernaryOpType::Clamp, out, in, min_val, max_val); return out; } @@ -1095,7 +1381,7 @@ TensorView* clamp(TensorView* in, Val* min_val, Val* max_val) { // sum_to operator TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { - const auto& root = TensorDomain::noReductions(in->getRootDomain()); + const auto& root = TensorDomain::noReductions(in->getMaybeRFactorDomain()); TORCH_CHECK( root.size() >= sum_to_size.size(), @@ -1141,7 +1427,7 @@ TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { } TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { - const auto& root = TensorDomain::noReductions(in->getRootDomain()); + const auto& root = TensorDomain::noReductions(in->getMaybeRFactorDomain()); TORCH_CHECK( root.size() >= sum_to_size.size(), @@ -1186,125 +1472,157 @@ TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { } TensorView* shift(TensorView* inp, const std::vector& offsets, bool pad) { + // When pad is false, no padding is given. When it is true, padding + // sizes are set so that output domains have the same extents as + // input domains. + std::vector pad_width(offsets.size(), 0); + if (pad) { + for (const auto i : c10::irange(offsets.size())) { + pad_width[i] = std::abs(offsets[i]); + } + } + return shift(inp, offsets, pad_width); +} + +TensorView* shift( + TensorView* inp, + const std::vector& offsets, + const std::vector& pad_width_param) { + auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); + const auto ndims = inp_dom.size(); + + auto pad_width = pad_width_param; + // Default padding is set so that the extent is kept unchanged + if (pad_width.empty()) { + pad_width = offsets; + for (auto& p : pad_width) { + p = std::abs(p); + } + } + TORCH_CHECK( - TensorDomain::noReductions(inp->getRootDomain()).size() == offsets.size(), + ndims == offsets.size(), "Invalid shift offsets, number of entries in offsets expected to be ", - TensorDomain::noReductions(inp->getRootDomain()).size(), + ndims, " but received ", offsets.size()); + TORCH_CHECK( + ndims == pad_width.size(), + "Invalid padding width list, number of entries in pad_width expected to be ", + ndims, + " but received ", + pad_width.size()); + + std::for_each(pad_width.begin(), pad_width.end(), [](const auto& pad) { + TORCH_CHECK(pad >= 0, "Padding width must be >= 0: ", pad); + }); + TensorView* out = nullptr; - if (pad) { - out = newValLike(inp, inp->getDataType().value())->as(); - } else { - auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); - const auto ndims = inp_dom.size(); - std::vector out_dom; - for (const auto i : c10::irange(ndims)) { - const auto inp_axis = inp_dom[i]; - const auto offset = offsets[i]; - if (offset == 0) { - out_dom.push_back(inp_axis->clone()); - continue; - } + std::vector out_dom; + for (const auto i : c10::irange(ndims)) { + const auto inp_axis = inp_dom[i]; + const auto offset = offsets[i]; + const auto pad = pad_width[i]; - Int* current_start_offset = dynamic_cast(inp_axis->start()); - TORCH_INTERNAL_ASSERT( - current_start_offset != nullptr && current_start_offset->isConst(), - "Invalid IterDomain start value:", - current_start_offset); + if (offset == 0) { + out_dom.push_back(inp_axis->cloneWithoutRFactor()); + continue; + } - Int* current_stop_offset = 
dynamic_cast(inp_axis->stopOffset()); - TORCH_INTERNAL_ASSERT( - current_stop_offset != nullptr && current_stop_offset->isConst(), - "Invalid IterDomain stop offset value:", - current_stop_offset); - - const auto cur_start_offset_value = current_start_offset->value().value(); - const auto cur_stop_offset_value = current_stop_offset->value().value(); - - Val* out_start_offset = nullptr; - Val* out_stop_offset = nullptr; - - if (offset > 0) { - // shift to right; extent remains the same, start and stop - // positions are moved right - out_start_offset = new Int(cur_start_offset_value + offset); - out_stop_offset = - new Int(std::max(cur_stop_offset_value - offset, int64_t(0))); - } else { - // shift to left; extent remains the same, start and stop - // positions are moved left - out_start_offset = - new Int(std::max(cur_start_offset_value + offset, int64_t(0))); - out_stop_offset = new Int(cur_stop_offset_value - offset); - } + Int* current_start_offset = dynamic_cast(inp_axis->start()); + TORCH_INTERNAL_ASSERT( + current_start_offset != nullptr && current_start_offset->isConst(), + "Invalid IterDomain start value:", + current_start_offset); - out_dom.push_back(new IterDomain( - out_start_offset, - inp_axis->extent(), - out_stop_offset, - ParallelType::Serial, - inp_axis->getIterType())); + Int* current_stop_offset = dynamic_cast(inp_axis->stopOffset()); + TORCH_INTERNAL_ASSERT( + current_stop_offset != nullptr && current_stop_offset->isConst(), + "Invalid IterDomain stop offset value:", + current_stop_offset); + + const auto cur_start_offset_value = current_start_offset->value().value(); + const auto cur_stop_offset_value = current_stop_offset->value().value(); + + int64_t out_start_offset = 0; + int64_t out_stop_offset = 0; + + if (offset > 0) { + // shift to right; extent remains the same, start and stop + // positions are moved right + out_start_offset = cur_start_offset_value + offset - pad; + out_stop_offset = std::max(cur_stop_offset_value - offset, int64_t(0)); + // If pad > offset, the extent of the output ID could be larger than the + // input, and the start offset of the output domain could become + // negative, which is not supported. + TORCH_CHECK( + out_start_offset >= 0, + "Invalid shift offset and padding. Padding must not be larger than the absolute extent of shift offset. Padding: ", + pad, + ". Shift: ", + offset, + "."); + } else { + // shift to left; extent remains the same, start and stop + // positions are moved left + out_start_offset = std::max(cur_start_offset_value + offset, int64_t(0)); + out_stop_offset = cur_stop_offset_value - offset - pad; + // Similar to the above case whwere offset is positive, if pad > + // -offset (note offset is negative), the extent of the output + // ID could be larger than the input, and the stop offset of the + // output domain could become negative. + TORCH_CHECK( + out_stop_offset >= 0, + "Invalid shift offset and padding. Padding must not be larger than the absolute extent of shift offset. Padding: ", + pad, + ". 
Shift: ", + offset, + "."); } - out = new TensorView( - new TensorDomain(out_dom, std::vector(out_dom.size(), true)), - inp->getDataType().value()); + out_dom.push_back(IrBuilder::create( + IrBuilder::create(out_start_offset), + inp_axis->extent(), + IrBuilder::create(out_stop_offset), + ParallelType::Serial, + inp_axis->getIterType())); } - new ShiftOp(out, inp, offsets, pad); - return out; -} - -namespace { -std::vector convertToIntVector(const std::vector& x) { - std::vector converted; - std::transform(x.begin(), x.end(), std::back_inserter(converted), [](int x) { - return new Int(x); - }); - return converted; -} -} // namespace + out = IrBuilder::create( + IrBuilder::create( + out_dom, std::vector(out_dom.size(), true)), + inp->getDataType().value()); -TensorView* gather( - TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides) { - std::vector window_shape_int = convertToIntVector(window_shape); - std::vector> pad_width_int; - std::transform( - pad_width.begin(), - pad_width.end(), - std::back_inserter(pad_width_int), - [](const std::vector& x) { return convertToIntVector(x); }); - return gather(inp, window_shape_int, pad_width_int, strides); + IrBuilder::create(out, inp, offsets, pad_width); + return out; } namespace { -// Return a new TensorDomain with given root domains. Apply strides if -// necessary. With non-unit strides, strided domains become an rfactor -// domain. +// Return a new TensorDomain with given root domains. Apply +// strides if necessary. With non-unit strides, strided domains become an +// rfactor domain. TensorDomain* generateTensorDomainWithStrides( const std::vector& root_domains, - const std::vector& strides) { + const std::vector& strides, + bool skip_unit_stride) { std::vector strided_domains; // If strides are just unit strides, don't apply striding - if (strides.empty() || std::all_of(strides.begin(), strides.end(), [](int s) { - return s == 1; - })) { - return new TensorDomain( + if (strides.empty() || + (skip_unit_stride && + std::all_of( + strides.begin(), strides.end(), [](int s) { return s == 1; }))) { + return IrBuilder::create( root_domains, std::vector(root_domains.size(), true)); } for (const auto i : c10::irange(root_domains.size())) { auto root_dom = root_domains.at(i); - if (i >= strides.size() || strides[i] == 1) { + if (i >= strides.size() || (skip_unit_stride && strides[i] == 1)) { strided_domains.push_back(root_dom); continue; } @@ -1317,7 +1635,7 @@ TensorDomain* generateTensorDomainWithStrides( auto contig_vector_size = strided_domains.size(); - auto strided_td = new TensorDomain( + auto strided_td = IrBuilder::create( root_domains, strided_domains, strided_domains, @@ -1330,10 +1648,11 @@ TensorDomain* generateTensorDomainWithStrides( TensorView* gather( TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides) { - auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); + const std::vector& window_shape, + const std::vector>& pad_width, + const std::vector& strides, + bool trim_out_of_bounds) { + auto inp_dom = TensorDomain::noReductions(inp->getMaybeRFactorDomain()); const auto ndims = inp_dom.size(); TORCH_CHECK( @@ -1343,6 +1662,10 @@ TensorView* gather( " but received ", window_shape.size()); + std::for_each(window_shape.begin(), window_shape.end(), [](const auto& w) { + TORCH_CHECK(w > 0, "Window size must be > 0: ", w); + }); + TORCH_CHECK( ndims == pad_width.size(), "Invalid pad width: number of entries 
expected to be ", @@ -1354,6 +1677,10 @@ TensorView* gather( TORCH_CHECK( p.size() == 2, "Each entry of pad_width must have two non-negative integers."); + std::for_each(p.begin(), p.end(), [](const auto& p_left_or_right) { + TORCH_CHECK( + p_left_or_right >= 0, "Padding must be >= 0: ", p_left_or_right); + }); }); TORCH_CHECK( @@ -1363,6 +1690,10 @@ TensorView* gather( " but received ", strides.size()); + std::for_each(strides.begin(), strides.end(), [](const auto& s) { + TORCH_CHECK(s > 0, "Stride must be > 0: ", s); + }); + std::vector out_root_domains; std::vector out_gather_dom; @@ -1371,43 +1702,225 @@ TensorView* gather( const auto window_dim = window_shape[i]; const auto pad_left = pad_width[i][0]; const auto pad_right = pad_width[i][1]; + // This may be over-conservative TORCH_INTERNAL_ASSERT(inp_axis->start()->isZeroInt()); - Val* out_axis_dim = nullptr; - if (window_dim->isConst() && pad_left->isConst() && pad_right->isConst()) { - const int64_t extent_adjustment = - -(-window_dim->value().value() + 1 + pad_left->value().value() + - pad_right->value().value()); - out_axis_dim = extent_adjustment == 0 - ? inp_axis->extent() - : sub(inp_axis->extent(), new Int(extent_adjustment)); - } else { - out_axis_dim = - add(add(sub(inp_axis->extent(), window_dim), new Int(1)), - add(pad_left, pad_right)); - } - // TODO: out_axis_dim is assumed to be the same as the extent of - // the input domain. Throw an error if it isn't the case. - out_root_domains.push_back(new IterDomain( - new Int(0), - out_axis_dim, + const auto inp_stop_offset = inp_axis->stopOffset()->getInt(); + TORCH_INTERNAL_ASSERT( + inp_stop_offset.has_value(), + "Dynamic stop offset not supported: ", + inp_axis); + const auto extent_adjustment = window_dim - 1 - pad_left - pad_right; + TORCH_CHECK( + extent_adjustment >= 0, + "Invalid gather window and padding as output extent would be larger than input.", + " Window: ", + window_dim, + ". Padding left: ", + pad_left, + ". Padding right: ", + pad_right); + const auto out_stop_offset = inp_stop_offset.value() + extent_adjustment; + out_root_domains.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + inp_axis->extent(), + IrBuilder::create(out_stop_offset), ParallelType::Serial, inp_axis->getIterType())); // create a new axis for the gathered domain - out_gather_dom.push_back(new IterDomain( - new Int(0), window_dim, ParallelType::Serial, IterType::Gather)); + out_gather_dom.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + IrBuilder::create(window_dim), + ParallelType::Serial, + IterType::Gather)); } out_root_domains.insert( out_root_domains.end(), out_gather_dom.begin(), out_gather_dom.end()); - auto out_td = generateTensorDomainWithStrides(out_root_domains, strides); + TensorDomain* out_td = nullptr; - auto out_tv = new TensorView(out_td, inp->getDataType().value()); + if (trim_out_of_bounds) { + // If no stride vector is given, just use stride 1. It does not do + // any striding effect, but out-of-bounds values are trimmed. + auto s = strides.empty() ? 
std::vector(ndims, 1) : strides; + out_td = generateTensorDomainWithStrides(out_root_domains, strides, false); + } else { + out_td = generateTensorDomainWithStrides(out_root_domains, strides, true); + } + + auto out_tv = + IrBuilder::create(out_td, inp->getDataType().value()); - new GatherOp(out_tv, inp, window_shape, pad_width); + IrBuilder::create(out_tv, inp, window_shape, pad_width); return out_tv; } +TORCH_CUDA_CU_API TensorView* viewAsScalar(TensorView* inp) { + auto inp_type = inp->getDataType().value(); + TORCH_CHECK( + isVectorType(inp_type), + "Invalid type to viewAsScalar. A vector type is expected but ", + inp_type, + " is given."); + int vec_size = getVectorSizeFromType(inp_type); + auto out_type = getTypeFromVectorType(inp_type); + + std::vector out_domain; + auto inp_domain = TensorDomain::noReductions(inp->getMaybeRFactorDomain()); + out_domain.reserve(inp_domain.size()); + for (auto d : inp_domain) { + out_domain.push_back(d->cloneWithoutRFactor()); + } + + IterDomain* id = IrBuilder::create( + inp_domain[0]->container(), + inp_domain[0]->container()->zeroVal(), + IrBuilder::create(vec_size), + ParallelType::Serial, + IterType::VectorComponent, + false); + out_domain.push_back(id); + + auto out = IrBuilder::create( + inp->container(), + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), + out_type); + + IrBuilder::create(inp->container(), out, inp, id); + + return out; +} + +namespace { + +//! Create new output for mma +static TensorView* newForMma( + TensorView* tv_a, + TensorView* tv_b, + const std::vector& axes, + DataType data_type = DataType::Float) { + auto orig_domain_a = + TensorDomain::noReductions(tv_a->getMaybeRFactorDomain()); + auto orig_domain_b = + TensorDomain::noReductions(tv_b->getMaybeRFactorDomain()); + + TORCH_INTERNAL_ASSERT( + orig_domain_a.size() == orig_domain_b.size(), + "MMA op: need matching dim input"); + + std::set axes_set(axes.begin(), axes.end()); + std::vector new_domain; + + TORCH_INTERNAL_ASSERT( + !axes_set.empty(), + "Asked for ouput of reduction, but no reduction axis provided."); + + TORCH_INTERNAL_ASSERT( + (*(axes_set.rbegin())) < orig_domain_a.size(), + "Error setting up reduction, reduction axis (", + *(axes_set.rbegin()), + ") is outside nDims (", + orig_domain_a.size(), + "). Keep in mind reductions are relative to root domains, not modified views."); + + auto axis_iter = axes_set.begin(); + for (const auto dim : c10::irange(orig_domain_a.size())) { + bool isReduction = false; + if (axis_iter != axes_set.end() && *axis_iter == dim) { + isReduction = true; + axis_iter++; + } + + const IterDomain* id = orig_domain_a[dim]->isBroadcast() + ? orig_domain_b[dim] + : orig_domain_a[dim]; + + TORCH_CHECK( + !(isReduction && id->isBroadcast() && !id->isImplicitBroadcast()), + "Cannot reduce an axis that is marked as broadcasted as it has an undetermined size. Tried to reduce ID = ", + id, + " of tensor ", + tv_a, + "and", + tv_b); + + new_domain.push_back(IrBuilder::create( + id->start(), + id->extent(), + id->stopOffset(), + ParallelType::Serial, + isReduction ? 
IterType::Reduction : id->getIterType())); + } + + TensorDomain* td = IrBuilder::create( + new_domain, std::vector(new_domain.size(), true)); + + return IrBuilder::create(td, data_type); +} + +} // namespace + +TensorView* fusedMultiplySum( + TensorView* tv_a, + TensorView* tv_b, + const std::vector& axes, + Val* init) { + if (init == nullptr) { + init = IrBuilder::create(0); + } + + // TODO: + // We will want to support initialize and rfactor with + // mma as well, for maybe fusing bias in prolog. + // TODO: check init type if given a tv, + // not supported currently though. + TORCH_CHECK( + init->isConstScalar(), + "Cannot create a reduction operation where the initial value is not a const scalar."); + + // TODO: + // Validate axis relationships between a and b + TORCH_CHECK(tv_a->nDims() > 0, "Tried to reduce a 0-dim tensor"); + + // TODO: + // Add tf32 and other mma data types + // Add fallback path for non-mma data types. + TORCH_CHECK(tv_a->getDataType().value() == DataType::Half); + TORCH_CHECK(tv_b->getDataType().value() == DataType::Half); + + TORCH_CHECK(axes.size() > 0, "No reduction axis specified"); + + // TODO: + // will lift this in a follow up when we have a + // more generic axes matching. + TORCH_CHECK( + axes.size() == 1, "Single axis reduction only for mma op instantiation.") + + std::vector uint_axes; + const int ndims = tv_a->domain()->noReductions().size(); + for (int axis : axes) { + if (axis < 0) { + axis += ndims; + } + + TORCH_CHECK( + axis >= 0 && axis < ndims, + "Reduction on invalid axis, recieved: ", + axis, + " however tensor view only has ", + ndims, + " non-reduction dims."); + + uint_axes.push_back((unsigned int)axis); + } + + TensorView* out = newForMma(tv_a, tv_b, uint_axes); + IrBuilder::create(out, tv_a, tv_b, init); + + return out; +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/arith.h b/torch/csrc/jit/codegen/cuda/arith.h index 5652d68eab8e..53efba8f7301 100644 --- a/torch/csrc/jit/codegen/cuda/arith.h +++ b/torch/csrc/jit/codegen/cuda/arith.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -24,10 +24,14 @@ namespace cuda { TORCH_CUDA_CU_API Val* castOp(DataType dtype, Val* v1); TORCH_CUDA_CU_API TensorView* castOp(DataType dtype, TensorView* v1); +TORCH_CUDA_CU_API Val* bitCastOp(DataType dtype, Val* v1); +TORCH_CUDA_CU_API TensorView* bitCastOp(DataType dtype, TensorView* v1); + // Perform unary op type and return the output TORCH_CUDA_CU_API Val* unaryOp(UnaryOpType type, Val* v1); TORCH_CUDA_CU_API TensorView* unaryOp(UnaryOpType type, TensorView* v1); - +TORCH_CUDA_CU_API Val* unaryIsOp(UnaryOpType type, Val* v1); +TORCH_CUDA_CU_API TensorView* unaryIsOp(UnaryOpType type, TensorView* v1); TORCH_CUDA_CU_API Val* unaryOp( UnaryOpType type, Val* v1, @@ -88,7 +92,8 @@ TORCH_CUDA_CU_API TensorView* reductionOp( const std::vector& axes, Val* init, TensorView* v1, - bool keep_dim = false); + bool keep_dim = false, + DataType dtype = DataType::Null); //! Auxiliary Struct holding result of //! a single welford op in ternsorview @@ -114,7 +119,9 @@ TORCH_CUDA_CU_API WelfordResult Welford( const std::vector& axes, TensorView* init_avg = nullptr, TensorView* init_var = nullptr, - Int* init_N = new Int(0)); + // Initializes to 0 in function definition, doing this so we don't have to + // import IrBuilder just for this one interface. 
+ Int* init_N = nullptr); // UNARY OPERATIONS // abs @@ -159,9 +166,6 @@ TORCH_CUDA_CU_API TensorView* floor(TensorView*); // frac TORCH_CUDA_CU_API Val* frac(Val*); TORCH_CUDA_CU_API TensorView* frac(TensorView*); -// gelu -TORCH_CUDA_CU_API Val* gelu(Val*); -TORCH_CUDA_CU_API TensorView* gelu(TensorView*); // silu TORCH_CUDA_CU_API Val* silu(Val*); TORCH_CUDA_CU_API TensorView* silu(TensorView*); @@ -222,9 +226,27 @@ TORCH_CUDA_CU_API TensorView* tanh(TensorView*); // trunc TORCH_CUDA_CU_API Val* trunc(Val*); TORCH_CUDA_CU_API TensorView* trunc(TensorView*); -// not -TORCH_CUDA_CU_API Val* notOp(Val*); -TORCH_CUDA_CU_API TensorView* notOp(TensorView*); +// bitwise_not +TORCH_CUDA_CU_API Val* bitwise_not(Val*); +TORCH_CUDA_CU_API TensorView* bitwise_not(TensorView*); +// isfinite +TORCH_CUDA_CU_API Val* isfinite(Val*); +TORCH_CUDA_CU_API TensorView* isfinite(TensorView*); +// isinf +TORCH_CUDA_CU_API Val* isinf(Val*); +TORCH_CUDA_CU_API TensorView* isinf(TensorView*); +// isnan +TORCH_CUDA_CU_API Val* isnan(Val*); +TORCH_CUDA_CU_API TensorView* isnan(TensorView*); +// isneginf +TORCH_CUDA_CU_API Val* isneginf(Val*); +TORCH_CUDA_CU_API TensorView* isneginf(TensorView*); +// isposinf +TORCH_CUDA_CU_API Val* isposinf(Val*); +TORCH_CUDA_CU_API TensorView* isposinf(TensorView*); +// isreal +TORCH_CUDA_CU_API Val* isreal(Val*); +TORCH_CUDA_CU_API TensorView* isreal(TensorView*); // Broadcasts v1 based on bool vector. Size of broadcast bool vector should be // the number of dims desired in the broadcasted tensor. This vector should be @@ -298,16 +320,36 @@ TORCH_CUDA_CU_API Val* ceilDiv(Val* v1, Val* v2); TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, Val* v2); TORCH_CUDA_CU_API TensorView* ceilDiv(Val* v1, TensorView* v2); TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, TensorView* v2); -// lshift -TORCH_CUDA_CU_API Val* lshift(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* lshift(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* lshift(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* lshift(TensorView* v1, TensorView* v2); -// rshift -TORCH_CUDA_CU_API Val* rshift(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* rshift(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* rshift(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* rshift(TensorView* v1, TensorView* v2); +// Bitwise binary ops +// bitwise_and +TORCH_CUDA_CU_API Val* bitwise_and(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_and(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, TensorView* v2); +// bitwise_left_shift +TORCH_CUDA_CU_API Val* bitwise_left_shift(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_left_shift(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_left_shift(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_left_shift( + TensorView* v1, + TensorView* v2); +// bitwise_right_shift +TORCH_CUDA_CU_API Val* bitwise_right_shift(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_right_shift(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_right_shift(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_right_shift( + TensorView* v1, + TensorView* v2); +// bitwise_or +TORCH_CUDA_CU_API Val* bitwise_or(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_or(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_or(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* 
bitwise_or(TensorView* v1, TensorView* v2); +// bitwise_xor +TORCH_CUDA_CU_API Val* bitwise_xor(Val* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, Val* v2); +TORCH_CUDA_CU_API TensorView* bitwise_xor(Val* v1, TensorView* v2); +TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, TensorView* v2); // Logical binary ops // eq TORCH_CUDA_CU_API Val* eq(Val* v1, Val* v2); @@ -340,27 +382,12 @@ TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, Val* v2); TORCH_CUDA_CU_API TensorView* ne(Val* v1, TensorView* v2); TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, TensorView* v2); -// andOp -TORCH_CUDA_CU_API Val* andOp(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* andOp(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* andOp(TensorView* v1, TensorView* v2); -// orOp -TORCH_CUDA_CU_API Val* orOp(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* orOp(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* orOp(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* orOp(TensorView* v1, TensorView* v2); -// xorOp -TORCH_CUDA_CU_API Val* xorOp(Val* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* xorOp(TensorView* v1, Val* v2); -TORCH_CUDA_CU_API TensorView* xorOp(Val* v1, TensorView* v2); -TORCH_CUDA_CU_API TensorView* xorOp(TensorView* v1, TensorView* v2); - // REDUCTION OPERATIONS TORCH_CUDA_CU_API TensorView* sum( TensorView* v1, const std::vector& reduction_axes, - bool keep_dim = false); + bool keep_dim = false, + DataType dtype = DataType::Null); TORCH_CUDA_CU_API TensorView* max( TensorView* v1, @@ -484,19 +511,27 @@ TORCH_CUDA_CU_API TensorView* sum_to( //! t1[i, j] = 0, otherwise //! //! The pad option controls how out-of-boundary accesses are -//! handled. When pad is true, shifting works as if the source tensor -//! is padded by zero. Otherwise, it does not modify the output tensor -//! region whose source coordinates are out-of-boundry. In both cases, -//! the size of output tensor does not change. However, when pad is -//! false, the start or stop value of the shifted axis is adjusted -//! accordingly. For example, when a shift offset is one, the axis start -//! value would be incremented by one. +//! handled. It specifies how many zeros are logically padded. If no +//! pad option is given, it automatically pads the input tensor so +//! that the output tensor has the same extent for each axis. //! -//! \param pad If true, out-of-boundary access returns zero. +//! When a padding value is smaller than the absolute value of a shift +//! offset, the output axis still has the same extent but its start or +//! stop offset is moved inward to signify those outside of the offset +//! are invalid. +//! +//! It is not allowed to use padding values that are larger than shift +//! offsets, which would mean output extentes would be larger than +//! input extents +TORCH_CUDA_CU_API TensorView* shift( + TensorView* inp, + const std::vector& offsets, + const std::vector& pad_width = {}); + TORCH_CUDA_CU_API TensorView* shift( TensorView* inp, const std::vector& offsets, - bool pad = true); + bool pad); //! Gather a window of nearby elements for each element. //! @@ -508,8 +543,13 @@ TORCH_CUDA_CU_API TensorView* shift( //! implemented with strided split, whose outer output domain becomes //! the root domain for subsequent consumers. The inner output domain //! becomes a Stride domain, which is ignored by subsequent consumers. +//! Only valid input ranges are fed into strided splits. //! -//! Example: +//! 
When trim_out_of_bounds is true, the values at the first and last +//! ends that are outside of the start and stop offsets are +//! effetively trimmed by partial split by 1. +//! +//! Example 1: //! t0: 2D tensor of [N, M] //! t1 = gather(t0, {1, 3}, {{0, 0}, {1, 1}}); //! @@ -517,23 +557,61 @@ TORCH_CUDA_CU_API TensorView* shift( //! t1: [N, M, 1, 3] //! t1[i, j, k, l] = The value at the window position of [k, l] //! for t0[i, j] +//! +//! Example 2.1 (without trimming): +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}); +//! +//! then: +//! t1: [N (stop offset: 1), M (stop offset: 1, 2, 2)] +//! +//! Example 2.1 (with trimming) +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}, true); +//! +//! then: +//! t1: [ceilDiv(N - 1, 1), ceilDiv(M - 1, 1), 2, 2] +//! +//! Example 3: +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {3, 3}, {{0, 0}, {0, 0}}, {3, 3}); +//! +//! then: +//! t1: [ceilDiv(N - 2, 3), ceilDiv(M - 2, 3), 2, 2] +//! TORCH_CUDA_CU_API TensorView* gather( TensorView* inp, const std::vector& window_shape, const std::vector>& pad_width, - const std::vector& strides = {}); - -//! Gather a window of nearby elements for each element. + const std::vector& strides = {}, + bool trim_out_of_bounds = false); + +// Append a new IterDomain to the end of a TenorView to allow +// iterating on a vector type. The input tensor must have +// vector dtype. +TORCH_CUDA_CU_API TensorView* viewAsScalar(TensorView* inp); + +//! A fused pointwise multiply and sum +//! operator that instantiates the following +//! fused pattern: +//! c = mul(tv_a, tv_b); +//! return sum(c, axes) //! -//! Same as the another gather interface but with Int* parameters. +//! \param tv_a first multiply operand +//! \param tv_b second multiply operand +//! \param axes axes to sum over +//! \param init sum initial value //! -//! TODO: Remove this interface as we do not intend to support dynamic -//! window shapes at this moment. -TORCH_CUDA_CU_API TensorView* gather( - TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides = {}); +//! Note & TODO: +//! currently only support lowering to a mma op +//! through this interface and only support fp16 inputs. +//! will support converting back to multiply and reduce in +//! a follow up. +TORCH_CUDA_CU_API TensorView* fusedMultiplySum( + TensorView* tv_a, + TensorView* tv_b, + const std::vector& axes, + Val* init = nullptr); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp index 709c810efe3e..ef223bae6d5b 100644 --- a/torch/csrc/jit/codegen/cuda/codegen.cpp +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -19,7 +21,106 @@ namespace codegen { namespace { -class CudaKernelGenerator : private kir::IrVisitor { +std::string ptrType(DataType dt) { + std::stringstream ss; + ss << dt << "*"; + return ss.str(); +} + +std::string refType(DataType dt) { + std::stringstream ss; + ss << dt << "&"; + return ss.str(); +} + +//! Utility class to build an argument list +class ArgumentBuilder { + public: + //! Build an argument list where each argument is separated with a comma + ArgumentBuilder() = default; + + //! 
Build an argument list where each argument has its own line + ArgumentBuilder(int indent_level, const char* tab) { + std::stringstream ss; + for (const auto i : c10::irange(indent_level)) { + (void)i; // Suppress unused variable warning + ss << tab; + } + sep_ = ",\n" + ss.str(); + } + + //! Add a new argument + template + ArgumentBuilder& arg(const T& x) { + addSeparator(); + return append(x); + } + + //! Append to the last argument + template + ArgumentBuilder& append(const T& arg) { + ss_ << arg; + return *this; + } + + //! Get a string of the argument list + std::string str() const { + return ss_.str(); + } + + friend std::ostream& operator<<(std::ostream& os, const ArgumentBuilder& ab) { + return os << ab.str(); + } + + private: + void addSeparator() { + if (ss_.tellp() != 0) { + ss_ << sep_; + } + } + + private: + std::string sep_ = ", "; + std::stringstream ss_; +}; + +//! Append to the last argument +template <> +ArgumentBuilder& ArgumentBuilder::append(const bool& arg) { + ss_ << (arg ? "true" : "false"); + return *this; +} + +//! Returns "template_name" +template +std::string genTemplate( + const TemplateNameT& template_name, + const TemplateArgT& template_arg) { + std::stringstream ss; + ss << template_name << "<" << template_arg << ">"; + return ss.str(); +} + +//! Returns "func_name(func_arg)" +template +std::string genCall(const FuncNameT& func_name, const FuncArgT& func_arg) { + std::stringstream ss; + ss << func_name << "(" << func_arg << ")"; + return ss.str(); +} + +//! Returns "func_name(func_arg)" +template +std::string genCall( + const FuncNameT& func_name, + const TemplateArgT& template_arg, + const FuncArgT& func_arg) { + std::stringstream ss; + ss << func_name << "<" << template_arg << ">(" << func_arg << ")"; + return ss.str(); +} + +class CudaKernelGenerator : private OptOutConstDispatch { static constexpr const char* kTab = " "; public: @@ -45,48 +146,70 @@ class CudaKernelGenerator : private kir::IrVisitor { code_ << "__global__ void " << kernel_name << "("; - std::vector params; + std::unordered_set unique_args; + + std::vector params; // Inputs & Outputs for (auto val : kernel_->inputs()) { params.push_back(val); } for (auto val : kernel_->outputs()) { + TORCH_INTERNAL_ASSERT( + !val->isScalar(), "No scalar output is allowed: ", val->toString()); params.push_back(val); } // Generate parameter declarations - for (kir::Val* val : params) { - if (const auto tv = dynamic_cast(val)) { - code_ << "Tensor<" << val->dtype() << ", " - << TensorDomain::noReductions( - tv->fuserTv()->getMaybeRFactorDomain()) - .size() - << "> " << varName(tv); + unsigned int duplicate_counter = 0; + for (auto i : c10::irange(params.size())) { + std::stringstream var_name_ss; + if (params[i]->isA()) { + var_name_ss << varName(params[i]->as()); + } else { + var_name_ss << gen(params[i]); + } + + // If value is duplicate in arguments change the name to avoid name + // conflicts in args. 
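A minimal, self-contained sketch of the duplicate-argument renaming used here, assuming the same "_duplicate_<n>" suffix scheme as above but operating on plain strings instead of kir::Val* parameters (the helper name renameDuplicates is illustrative only):

#include <string>
#include <unordered_set>
#include <vector>

// Rename repeated parameter names so the generated kernel signature never
// declares two parameters with the same identifier, mirroring the
// unique_args.emplace(...) check in the generator.
std::vector<std::string> renameDuplicates(const std::vector<std::string>& params) {
  std::unordered_set<std::string> seen;
  std::vector<std::string> renamed;
  unsigned int duplicate_counter = 0;
  for (const auto& name : params) {
    // emplace(...).second is false when the name was already inserted,
    // i.e. the same value is passed to the kernel more than once.
    if (seen.emplace(name).second) {
      renamed.push_back(name);
    } else {
      renamed.push_back(name + "_duplicate_" + std::to_string(duplicate_counter++));
    }
  }
  return renamed;
}

// renameDuplicates({"T0", "T1", "T0"}) yields {"T0", "T1", "T0_duplicate_0"}.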
+ if (!unique_args.emplace(params[i]).second) { + var_name_ss << "_duplicate_" << duplicate_counter++; + } + + if (const auto tv = dynamic_cast(params[i])) { + if (tv->isCpuScalar()) { + code_ << " CpuScalarTensor<" << params[i]->dtype() << "> " + << var_name_ss.str(); + } else { + code_ + << "Tensor<" << params[i]->dtype() << ", " + << TensorDomain::noReductions(tv->getMaybeRFactorDomain()).size() + << "> " << var_name_ss.str(); + } } else { - TORCH_INTERNAL_ASSERT(val->isScalar()); // NOLINT (LLVM bug 48525) - TORCH_INTERNAL_ASSERT(val->definition() == nullptr); - code_ << val->dtype() << " " << gen(val); + TORCH_INTERNAL_ASSERT(params[i]->isScalar()); // NOLINT (LLVM bug 48525) + TORCH_INTERNAL_ASSERT(params[i]->definition() == nullptr); + code_ << params[i]->dtype() << " " << var_name_ss.str(); } - if (val != params.back()) { + if (i + 1 != params.size()) { code_ << ", "; } } // Global buffers for (auto allocate : kernel_summary.global_allocations) { - TORCH_INTERNAL_ASSERT(allocate->buffer()->isA()); - const auto tv = allocate->buffer()->as(); + TORCH_INTERNAL_ASSERT(allocate->buffer()->isA()); + const auto tv = allocate->buffer()->as(); const auto& maybe_rfactor_domain = tv->domain()->hasRFactor() - ? tv->domain()->rfactorDomain() - : tv->domain()->rootDomain(); + ? tv->domain()->getRFactorDomain() + : tv->domain()->getRootDomain(); const auto nDims = std::count_if( maybe_rfactor_domain.begin(), maybe_rfactor_domain.end(), - [](const kir::IterDomain* id) { + [](const IterDomain* id) { return !id->isReduction() && - id->iterType() != IterType::BroadcastWithoutStride; + id->getIterType() != IterType::BroadcastWithoutStride; }); code_ << ", Tensor<" << tv->dtype() << ", " << nDims << "> " << varName(tv); @@ -129,7 +252,7 @@ class CudaKernelGenerator : private kir::IrVisitor { if (has_dynamic_smem || has_reductions || has_parallel_welford) { indent() << "alignas(" #ifndef __HIP_PLATFORM_HCC__ - << dataTypeSize(kernel_summary.largest_smem_data_type) + << 16 // always align to 16B for any shared mem allocation #else << 8 // for HIP, we want 8-aligned even for smaller datatypes #endif @@ -177,7 +300,7 @@ class CudaKernelGenerator : private kir::IrVisitor { void genBody() { for (auto expr : kernel_->topLevelExprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } @@ -204,139 +327,182 @@ class CudaKernelGenerator : private kir::IrVisitor { return code_; } - std::string gen(const kir::Node* node) { + std::string gen(const Statement* stmt) { std::stringstream tmp_code; std::swap(tmp_code, code_); - auto replacement = replacement_map_.find(node); - if (replacement != replacement_map_.end()) { - node = replacement->second; - } - node->accept(this); + OptOutConstDispatch::handle(stmt); std::swap(tmp_code, code_); return tmp_code.str(); } - // TODO(kir): consider automatic var naming - std::string varName(const kir::Val* val) { - std::string prefix = ""; - if (val->isA()) { - prefix = "T"; + std::string varName(const Val* val) { + std::stringstream name; + if (val->isA()) { + name << "T"; } else { - prefix = typePrefix(val->dtype()); + name << typePrefix(val->dtype()); } - - std::stringstream value_name; - if (val->name() != kInvalidStmName) { - value_name << prefix << val->name(); - } else { - value_name << "k" << prefix << val->id(); - } - return value_name.str(); + name << val->name(); + return name.str(); } - std::string genInline(const kir::Node* node) { + std::string genInline(const Statement* stmt) { const bool saved_inline = print_inline_; print_inline_ = true; - auto 
result = gen(node); + auto result = gen(stmt); print_inline_ = saved_inline; // NOLINTNEXTLINE(performance-no-automatic-move) return result; } - void visit(const kir::Predicate* node) final { - TORCH_INTERNAL_ASSERT(node->hasValue()); - code_ << gen(node->value()); + void handle(const kir::Predicate* pred) final { + TORCH_INTERNAL_ASSERT(pred->hasValue()); + code_ << gen(pred->value()); } - void visit(const kir::Bool* node) final { - const auto def = node->definition(); - if (print_inline_ && def != nullptr) { + void handle(const Bool* pred) final { + const auto def = pred->definition(); + const bool has_alloc = alloc_map_.find(pred) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - code_ << (*node->value() ? "true" : "false"); + } else if (pred->isConst()) { + code_ << (*pred->value() ? "true" : "false"); } else { - code_ << varName(node); + code_ << varName(pred); } } - void visit(const kir::Double* node) final { - const auto def = node->definition(); - if (print_inline_ && def != nullptr) { + void handle(const Double* d) final { + const auto def = d->definition(); + const bool has_alloc = alloc_map_.find(d) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - const int digits = std::numeric_limits::max_digits10; - code_ << std::setprecision(digits) << *node->value(); + } else if (d->isConst()) { + auto val = *d->value(); + // note: default inf/nan doesn't work and should be replaced with macros + // `NAN`, `POS_INFINITY` and `NEG_INFINITY` instead. + if (std::isinf(val)) { + if (val > 0) { + code_ << "POS_INFINITY"; + } else { + code_ << "NEG_INFINITY"; + } + } else if (std::isnan(val)) { + code_ << "NAN"; + } else { + const int digits = + std::numeric_limits::max_digits10; + code_ << std::setprecision(digits) << val; + } } else { - code_ << varName(node); + code_ << varName(d); } } - void visit(const kir::Int* node) final { - const auto def = node->definition(); - if (print_inline_ && def != nullptr) { + void handle(const Int* i) final { + const auto def = i->definition(); + const bool has_alloc = alloc_map_.find(i) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { + code_ << "(" << genInline(def) << ")"; + } else if (i->isConst()) { + code_ << *i->value(); + } else { + code_ << varName(i); + } + } + + void handle(const ComplexDouble* c) final { + const auto def = c->definition(); + const bool has_alloc = alloc_map_.find(c) != alloc_map_.end(); + if (def != nullptr && !has_alloc) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - code_ << *node->value(); + } else if (c->isConst()) { + const int digits = std::numeric_limits::max_digits10; + code_ << "std::complex" << std::setprecision(digits) + << *c->value(); } else { - code_ << varName(node); + code_ << varName(c); } } - void visit(const kir::NamedScalar* node) final { + void handle(const NamedScalar* ns) final { // dim3 components are unsigned int. 
Cast to signed integer to // support negative indexing - if (node->getParallelIndex().has_value() || - node->getParallelDim().has_value()) { - code_ << "((nvfuser_index_t)" << node->name() << ")"; + if (ns->getParallelIndex().has_value() || + ns->getParallelDim().has_value()) { + code_ << "((nvfuser_index_t)" << ns->name() << ")"; } else { - code_ << node->name(); + code_ << ns->name(); } } - void visit(const kir::TensorIndex* node) final { - code_ << varName(node->view()) << "["; - + void handle(const kir::TensorIndex* ti) final { bool first = true; - for (auto* ind : node->indices()) { + std::stringstream index; + for (auto* ind : ti->indices()) { if (!ind->isZeroInt()) { if (!first) { - code_ << " + "; + index << " + "; } - code_ << genInline(ind); + index << genInline(ind); first = false; } } if (first) { - code_ << "0"; + index << "0"; } + bool is_volatile = ti->view()->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map.needsRawSync(ti->view()).hasBID(); + if (is_volatile) { + code_ << "*(volatile " << ti->getDataType().value() << "*)&"; + } + code_ << varName(ti->view()) << "[" << index.str() << "]"; + } - code_ << "]"; + void handle(const ViewAsScalar* sv) final { + indent() << gen(sv->output(0)) << " = " << gen(sv->input(0)) << "[" + << gen(sv->index()) << "];\n"; } - void visit(const kir::IterDomain* node) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const IterDomain*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::TensorDomain* node) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const TensorDomain*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::TensorView* tv) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const TensorView*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::UnaryOp* node) final { + void handle(const UnaryOp* uop) final { bool is_vector_op = false; size_t vector_word_size = 1; - if (vectorize_scope_ && node->out()->isA()) { - auto ti = node->out()->as(); + if (uop->out()->isA()) { + auto out_tv = uop->out()->as()->view(); + if (std::any_of( + out_tv->domain()->domain().begin(), + out_tv->domain()->domain().end(), + [&](IterDomain* id) { return id->isMma(); })) { + auto mma = dynamic_cast( + uop->out()->as()->view()->definition()); + TORCH_INTERNAL_ASSERT( + mma != nullptr, "CodeGen: mma op not in mma loop"); + genMmaInitialization(mma, uop); + return; + } + } + + if (vectorize_scope_ && uop->out()->isA()) { + auto ti = uop->out()->as(); bool vectorize_op = false; bool misaligned_op = false; - for (auto id : ti->view()->fuserTv()->domain()->domain()) { + for (auto id : ti->view()->domain()->domain()) { if (!isParallelTypeVectorize(id->getParallelType())) { continue; } @@ -358,84 +524,135 @@ class CudaKernelGenerator : private kir::IrVisitor { if (vectorize_op) { TORCH_INTERNAL_ASSERT( - node->operation() == UnaryOpType::Set, + uop->getUnaryOpType() == UnaryOpType::Set, "Cannot vectorize operations that are not sets. 
", - "Use cache_before and cache_after to store/load with vectorized reads into buffers."); + "Use cacheBefore and cacheAfter to store/load with vectorized reads into buffers."); is_vector_op = true; } if (misaligned_op) { - is_vector_op = (node->operation() == UnaryOpType::Set); + is_vector_op = (uop->getUnaryOpType() == UnaryOpType::Set); } - if (is_vector_op && !node->in()->isScalar()) { + if (is_vector_op && !uop->in()->isScalar()) { TORCH_INTERNAL_ASSERT( - node->out()->dtype() == node->in()->dtype(), + uop->out()->dtype() == uop->in()->dtype(), "Vectorized store/load requires input and output datatypes match."); } - } - if (is_vector_op) { - if (node->in()->isScalar()) { - indent() << "reinterpret_cast<" - << "Array<" << node->out()->dtype() << ", " << vector_word_size - << ">*>" - << "(&" << gen(node->out()) << ")->set(" << gen(node->in()) - << ");\n"; - } else { - indent() << "*reinterpret_cast<" - << "Array<" << node->out()->dtype() << ", " << vector_word_size - << ">*>" - << "(&" << gen(node->out()) << ")" - << " = *reinterpret_cast<" - << "Array<" << node->in()->dtype() << ", " << vector_word_size - << ">*>" - << "(&" << gen(node->in()) << ");\n"; + if (is_vector_op) { + auto out_tv = uop->out()->as()->view(); + if (uop->in()->isScalar()) { + // Note: + // Double buffered local tensors need indexed initialization, + // so will need to use `arraySet` option. + if (out_tv->getMemoryType() == MemoryType::Local && + !out_tv->isDoubleBuffered()) { + // Vectorized initialization + indent() << varName(out_tv) << ".set(" << gen(uop->in()) << ");\n"; + } else { + // Note: currently arraySet option is not vectorized, so it will + // rely on auto vectorization pass of cuda compiler. + indent() << "arraySet<" << out_tv->getDataType().value() << ", " + << vector_word_size << ">(&" << gen(uop->out()) << ", " + << "(" << out_tv->getDataType().value() << ")" + << gen(uop->in()) << ");\n"; + } + } else { + // Vectorized load + TORCH_INTERNAL_ASSERT( + uop->in()->isA(), + "Invalid input to unary op with tensor output, found: ", + uop->in()->toString()); + + auto in_tv = uop->in()->as()->view(); + bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local; + + bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global; + + bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Global; + + bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map.needsRawSync(out_tv).hasBID(); + + bool is_volatile_from = + in_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map.needsRawSync(in_tv).hasBID(); + + if (localToGlobal) { + indent() << "loadLocalToGlobal<" << uop->out()->dtype() << ", " + << vector_word_size << ", " + << (is_volatile_to ? "true" : "false") << ">("; + code_ << " &" << gen(uop->out()) << ", &" << gen(uop->in()) + << ");\n"; + } else if (globalToLocal) { + indent() << "loadGlobalToLocal<" << uop->out()->dtype() << ", " + << vector_word_size << ", " + << (is_volatile_from ? "true" : "false") << ">(&" + << gen(uop->out()) << ", "; + code_ << " &" << gen(uop->in()) << ");\n"; + } else if (globalToGlobal) { + indent() << "loadGlobalToGlobal<" << uop->out()->dtype() << ", " + << vector_word_size << ", " + << (is_volatile_to ? "true" : "false") << ", " + << (is_volatile_from ? 
"true" : "false") << ">("; + code_ << " &" << gen(uop->out()) << ", "; + code_ << " &" << gen(uop->in()) << ");\n"; + } else { + indent() << "loadGeneric<" << uop->out()->dtype() << ", " + << vector_word_size << ">("; + code_ << " &" << gen(uop->out()) << ", "; + code_ << " &" << gen(uop->in()) << ");\n"; + } + } + return; } - return; } - if (node->out()->isA()) { - const auto op_type = node->operation(); + if (uop->out()->isA()) { + const auto op_type = uop->getUnaryOpType(); if (auto op = inline_op_str(op_type)) { - indent() << gen(node->out()) << " = " << *op << genInline(node->in()) + indent() << gen(uop->out()) << " = " << *op << genInline(uop->in()) << ";\n"; } return; } if (!print_inline_) { - indent() << gen(node->out()); - if (!node->out()->isScalar() && !node->in()->isScalar()) { + indent() << gen(uop->out()); + if (!uop->out()->isScalar() && !uop->in()->isScalar()) { code_ << "\n"; indent() << kTab; } code_ << " = "; } - const auto op_type = node->operation(); + const auto op_type = uop->getUnaryOpType(); if (auto op = inline_op_str(op_type)) { if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - code_ << stringifyBooleanOp(op_type) << gen(node->in()); + uop->out()->dtype() == DataType::Bool) { + code_ << stringifyBooleanOp(op_type) << gen(uop->in()); } else { - code_ << *op << gen(node->in()); + code_ << *op << gen(uop->in()); } } else { if (op_type == UnaryOpType::Cast) { const auto cast_str = - cast_func_str({node->in()->dtype(), node->out()->dtype()}); + cast_func_str({uop->in()->dtype(), uop->out()->dtype()}); TORCH_INTERNAL_ASSERT( cast_str.has_value(), "Invalid cast. Input type: ", - node->in()->dtype(), + uop->in()->dtype(), ", output type: ", - node->out()->dtype()); + uop->out()->dtype()); code_ << cast_str.value(); } else { code_ << op_type; if (needFloatSuffix(op_type) && - node->out()->dtype() == DataType::Float) { + uop->out()->dtype() == DataType::Float) { code_ << "f"; } } @@ -444,7 +661,7 @@ class CudaKernelGenerator : private kir::IrVisitor { if (op_type == UnaryOpType::RandLike) { code_ << "rnd"; } else { - code_ << gen(node->in()); + code_ << gen(uop->in()); } code_ << ")"; } @@ -456,25 +673,28 @@ class CudaKernelGenerator : private kir::IrVisitor { std::string genBinaryOp( BinaryOpType op_type, - kir::Val* out, + DataType data_type, const std::string& lhs, const std::string& rhs) { std::stringstream expr; if (auto op = inline_op_str(op_type)) { expr << lhs << " "; - if (alsoBooleanOperator(op_type) && out->dtype() == DataType::Bool) { + if (alsoBooleanOperator(op_type) && data_type == DataType::Bool) { expr << stringifyBooleanOp(op_type); } else { expr << *op; } expr << " " << rhs; } else { - if (integer_op_str(op_type) && isIntegralType(out->dtype())) { + if (integer_op_str(op_type) && isIntegralType(data_type)) { auto int_op = integer_op_str(op_type); expr << *int_op; + } else if (bool_op_str(op_type) && isBooleanType(data_type)) { + auto bool_op = bool_op_str(op_type); + expr << *bool_op; } else { expr << op_type; - if (needFloatSuffix(op_type) && out->dtype() == DataType::Float) { + if (needFloatSuffix(op_type) && data_type == DataType::Float) { expr << "f"; } } @@ -485,7 +705,7 @@ class CudaKernelGenerator : private kir::IrVisitor { // If one argument is a tensorview and the other is a scalar, make sure we // cast the scalar to the tensorview type - std::string scalarCast(kir::Val* lhs, kir::Val* rhs) { + std::string scalarCast(Val* lhs, Val* rhs) { // If neither are scalars return if (!((lhs->isScalar() || rhs->isScalar()) 
&& (lhs->isA() || rhs->isA()))) { @@ -520,18 +740,18 @@ class CudaKernelGenerator : private kir::IrVisitor { } // If possible, replace pow with mul. Return true when successful. - bool genPowerWithMul(const kir::BinaryOp* node) { - if (node->operation() != BinaryOpType::Pow) { + bool genPowerWithMul(const BinaryOp* bop) { + if (bop->getBinaryOpType() != BinaryOpType::Pow) { return false; } - auto rhs = node->rhs(); + auto rhs = bop->rhs(); c10::optional exponent; - if (auto val_int = dynamic_cast(rhs)) { + if (auto val_int = dynamic_cast(rhs)) { if (val_int->isConst()) { exponent = val_int->value().value(); } - } else if (auto val_float = dynamic_cast(rhs)) { + } else if (auto val_float = dynamic_cast(rhs)) { if (val_float->isConst()) { auto fp_exp = val_float->value().value(); double int_exp = 0; @@ -550,7 +770,7 @@ class CudaKernelGenerator : private kir::IrVisitor { return false; } - auto lhs = gen(node->lhs()); + auto lhs = gen(bop->lhs()); if (print_inline_) { code_ << lhs << " * " << lhs; @@ -558,8 +778,8 @@ class CudaKernelGenerator : private kir::IrVisitor { code_ << " * " << lhs; } } else { - indent() << gen(node->out()); - if (node->out()->isScalar()) { + indent() << gen(bop->out()); + if (bop->out()->isScalar()) { code_ << " = " << lhs << " * " << lhs; if (exponent.value() == 3) { code_ << " * " << lhs; @@ -579,24 +799,27 @@ class CudaKernelGenerator : private kir::IrVisitor { return true; } - void visit(const kir::BinaryOp* node) final { + void handle(const BinaryOp* bop) final { // Try replacing pow with mul - if (genPowerWithMul(node)) { + if (genPowerWithMul(bop)) { return; } - const auto op_type = node->operation(); + const auto op_type = bop->getBinaryOpType(); if (print_inline_) { // Inline expression: `lhs op rhs` code_ << genBinaryOp( - op_type, node->out(), gen(node->lhs()), gen(node->rhs())); + op_type, bop->out()->dtype(), gen(bop->lhs()), gen(bop->rhs())); } else { - indent() << gen(node->out()); - if (node->out()->isScalar()) { + indent() << gen(bop->out()); + if (bop->out()->isScalar()) { // Single line: `out = lhs op rhs;` code_ << " = " << genBinaryOp( - op_type, node->out(), gen(node->lhs()), gen(node->rhs())); + op_type, + bop->out()->dtype(), + gen(bop->lhs()), + gen(bop->rhs())); } else { // Split TensorView expressions across multiple lines: // @@ -605,64 +828,68 @@ class CudaKernelGenerator : private kir::IrVisitor { // op rhs; // - auto cast = scalarCast(node->lhs(), node->rhs()); + auto cast = scalarCast(bop->lhs(), bop->rhs()); if (auto op = inline_op_str(op_type)) { code_ << "\n"; - indent() << kTab << "= " << (node->lhs()->isScalar() ? cast : "") - << gen(node->lhs()) << "\n"; + indent() << kTab << "= " << (bop->lhs()->isScalar() ? cast : "") + << gen(bop->lhs()) << "\n"; indent() << kTab; if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { + bop->out()->dtype() == DataType::Bool) { code_ << stringifyBooleanOp(op_type); } else { code_ << *op; } - code_ << " " << (node->rhs()->isScalar() ? cast : "") - << gen(node->rhs()); + code_ << " " << (bop->rhs()->isScalar() ? 
cast : "") + << gen(bop->rhs()); } else { - if (integer_op_str(op_type) && isIntegralType(node->out()->dtype())) { + if (integer_op_str(op_type) && isIntegralType(bop->out()->dtype())) { auto int_op = integer_op_str(op_type); code_ << " = " << *int_op << "(\n"; + } else if ( + bool_op_str(op_type) && isBooleanType(bop->out()->dtype())) { + auto bool_op = bool_op_str(op_type); + code_ << " = " << *bool_op << "(\n"; } else { std::stringstream op_str; op_str << op_type; if (needFloatSuffix(op_type) && - node->out()->dtype() == DataType::Float) { + bop->out()->dtype() == DataType::Float) { op_str << "f"; } code_ << " = " << op_str.str() << "(\n"; } - indent() << kTab << (node->lhs()->isScalar() ? cast : "") - << gen(node->lhs()) << ",\n"; - indent() << kTab << (node->rhs()->isScalar() ? cast : "") - << gen(node->rhs()) << ")"; + indent() << kTab << (bop->lhs()->isScalar() ? cast : "") + << gen(bop->lhs()) << ",\n"; + indent() << kTab << (bop->rhs()->isScalar() ? cast : "") + << gen(bop->rhs()) << ")"; } } code_ << ";\n"; } } - void visit(const kir::TernaryOp* node) final { + void handle(const TernaryOp* top) final { if (!print_inline_) { - indent() << gen(node->out()); - if (!node->out()->isScalar()) { + indent() << gen(top->out()); + if (!top->out()->isScalar()) { code_ << "\n"; indent() << kTab; } code_ << " = "; } - code_ << node->operation() << "(" << gen(node->in1()) << ", "; + code_ << top->getTernaryOpType() << "(" << gen(top->in1()) << ", "; // Make sure the two operands of where has the same // type. Note that compiling "where(0.0f, 0.0)" fails because of // the overloading ambiguity. - if (node->operation() == TernaryOpType::Where) { - auto cast = scalarCast(node->in2(), node->in3()); - code_ << (node->in2()->isScalar() ? cast : "") << gen(node->in2()) << ", " - << (node->in3()->isScalar() ? cast : "") << gen(node->in3()) << ")"; + if (top->getTernaryOpType() == TernaryOpType::Where) { + auto cast = scalarCast(top->in2(), top->in3()); + code_ << (top->in2()->isScalar() ? cast : "") << gen(top->in2()) << ", " + << (top->in3()->isScalar() ? 
cast : "") << gen(top->in3()) << ")"; } else { - code_ << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + code_ << gen(top->in2()) << ", " << gen(top->in3()) << ")"; } if (!print_inline_) { @@ -670,56 +897,134 @@ class CudaKernelGenerator : private kir::IrVisitor { } } - std::string genReductionOp(BinaryOpType op_type, kir::Val* out) { + std::string genArchString(MmaOptions options) { + std::stringstream ss; + if (isVolta(options.macro)) { + ss << "Volta"; + } else if (isTuring(options.macro)) { + ss << "Turing"; + } else if (isAmpere(options.macro)) { + ss << "Ampere"; + } else { + TORCH_INTERNAL_ASSERT(false, "mma macro unknown arch"); + } + return ss.str(); + } + + std::string genMmaOp(const MmaOp* mma, bool init = false) { + std::stringstream ss; + auto options = mma->options(); + ss << genArchString(options) << "::"; + if (init) { + ss << "init"; + } + ss << toString(options.macro) << toString(options.operand_layout); + // TODO: additional parameter could be removed by swizzling iterdomain + auto acc_stride = mma->accStride(); + TORCH_INTERNAL_ASSERT(acc_stride > 0); + ss << "<" << acc_stride << ">"; + return ss.str(); + } + + void genMmaOperands(const MmaOp* mma) { + std::stringstream ss; + auto options = mma->options(); + auto in_a = mma->inA()->as()->view(); + auto dtype = in_a->getDataType().value(); + indent() << kTab << "reinterpret_cast*>(&" + << gen(mma->inA()) << "),\n"; + indent() << kTab << "reinterpret_cast*>(&" + << gen(mma->inB()) << ")"; + } + + void genMmaInitialization(const MmaOp* mma, const UnaryOp* uop) { + auto options = mma->options(); + + indent() << genMmaOp(mma, true) << "(reinterpret_castout()->getDataType().value() << "," + << getOutputRegisterSize(options.macro) << "," + << getOutputRegisterSize(options.macro) << ">*>" + << "(&" << gen(uop->out()) << "));\n"; + } + + void handle(const MmaOp* mma) final { + auto options = mma->options(); + auto out = mma->out()->as(); + indent() << genMmaOp(mma) << "(\n"; + indent() << kTab << "reinterpret_castview()->getDataType().value() << "," + << getOutputRegisterSize(options.macro) << "," + << getOutputRegisterSize(options.macro) << ">*>(&" + << gen(mma->out()) << "),\n"; + genMmaOperands(mma); + code_ << ");\n"; + } + + std::string genReductionOp(BinaryOpType op_type, DataType data_type) { std::stringstream lambda; - DataType data_type = out->dtype(); lambda << "[](" << data_type << " &a, " << data_type << " b) " - << "{ a = " << genBinaryOp(op_type, out, "a", "b") << "; }"; + << "{ a = " << genBinaryOp(op_type, data_type, "a", "b") << "; }"; return lambda.str(); } - void visit(const kir::BroadcastOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); - const auto tensor_index = node->out()->as(); - - const ParallelTypeBitmap domains = - kernel_->predicateMap().getParallelBroadcastDomains( - tensor_index->view()->fuserTv()); + void handle(const BroadcastOp* stmt) final { + TORCH_INTERNAL_ASSERT(stmt->out()->isA()); - const bool thread_x = domains.get(ParallelType::TIDx); - const bool thread_y = domains.get(ParallelType::TIDy); - const bool thread_z = domains.get(ParallelType::TIDz); - const bool block_x = domains.get(ParallelType::BIDx); - const bool block_y = domains.get(ParallelType::BIDy); - const bool block_z = domains.get(ParallelType::BIDz); + const ParallelTypeBitmap parallel_types = + kernel_->summary().broadcast_parallel_types.at(stmt); - const bool grid_broadcast_needed = block_x || block_y || block_z; - const bool block_broadcast_needed = thread_x || thread_y || thread_z; + if 
(parallel_types.none()) { + // Not parallelized + indent() << gen(stmt->out()) << "\n"; + indent() << kTab << " = " << gen(stmt->in()) << ";\n"; + return; + } TORCH_INTERNAL_ASSERT( - !grid_broadcast_needed, - "Parallel broadcast across blocks not supported"); - - if (block_broadcast_needed) { - const auto data_type = node->out()->dtype(); - indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") - << ", " << (thread_y ? "true" : "false") << ", " - << (thread_z ? "true" : "false") << ">(\n"; - indent() << kTab << gen(node->out()) << ",\n"; - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; - TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ");\n"; - } else { - indent() << gen(node->out()) << "\n"; - indent() << kTab << " = " << gen(node->in()) << ";\n"; + !parallel_types.hasBID(), + "Parallel broadcast across blocks should have been translated to a GridBroadcast IR node"); + + std::stringstream flags_str; + for (const ParallelType pt : kParallelTypeTIDs) { + const bool parallel_bcast = parallel_types.get(pt); + if (pt != kParallelTypeTIDs[0]) { + flags_str << ", "; + } + flags_str << (parallel_bcast ? "true" : "false"); } + + const auto data_type = stmt->out()->dtype(); + indent() << "broadcast::blockBroadcast<" << flags_str.str() << ">(\n"; + indent() << kTab << gen(stmt->out()) << ",\n"; + indent() << kTab << gen(stmt->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + TORCH_INTERNAL_ASSERT( + stmt->predicate() != nullptr && stmt->predicate()->hasValue()); + indent() << kTab << genInline(stmt->predicate()) << ");\n"; + } + + void genSerialReduction( + const kir::TensorIndex* output, + const Val* input, + BinaryOpType reduction_op_type) { + const auto gen_out = gen(output); + indent() << gen_out << " = " + << genBinaryOp( + reduction_op_type, output->dtype(), gen_out, gen(input)) + << ";\n"; + return; } - void genWarpReductionOp( - const kir::ReductionOp* node, - const IterDomain* reduction_id) { + void genWarpReduction( + const kir::TensorIndex* output, + const kir::TensorIndex* input, + const Val* init, + BinaryOpType reduction_op_type, + kir::Predicate* read_pred) { bool is_single_warp = kernel_->getWarpPaddedParallelInfo().is_tidx_single_warp; @@ -729,43 +1034,27 @@ class CudaKernelGenerator : private kir::IrVisitor { } else { code_ << "(\n"; } - indent() << kTab << gen(node->out()) << ",\n"; - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << genReductionOp(node->operation(), node->out()) << ",\n"; + indent() << kTab << gen(output) << ",\n"; + indent() << kTab << gen(input) << ",\n"; + indent() << kTab << genReductionOp(reduction_op_type, output->dtype()) + << ",\n"; indent() << kTab << "threadIdx,\n"; indent() << kTab << "blockDim,\n"; - indent() << kTab << "static_cast<" << node->out()->dtype() + indent() << kTab << "static_cast<" << output->dtype() << "*>(shared_mem),\n"; - TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ",\n"; - indent() << kTab << node->out()->dtype() << "(" << genInline(node->init()) - << "));\n"; + TORCH_INTERNAL_ASSERT(read_pred != nullptr && read_pred->hasValue()); + indent() << kTab << genInline(read_pred) << ",\n"; + indent() << kTab << output->dtype() << "(" << genInline(init) << "));\n"; } - void visit(const 
kir::ReductionOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); - - const auto out = node->out()->as(); - const auto domain = out->view()->domain(); - - const bool has_block_reduce = domain->hasBlockReduction(); - const bool has_grid_reduce = domain->hasGridReduction(); - - if (!has_block_reduce && !has_grid_reduce) { - const auto gen_out = gen(out); - const auto op_type = node->operation(); - indent() << gen_out << " = " - << genBinaryOp(op_type, out, gen_out, gen(node->in())) << ";\n"; - return; - } - - if (auto reduction_id = ir_utils::getMaybeWarpReductionDim(node)) { - genWarpReductionOp(node, reduction_id.value()); - return; - } - - const auto par_domains = ir_utils::getParallelDomains(node->out()); + void genBlockReduction( + const kir::TensorIndex* output, + const kir::TensorIndex* input, + const Val* init, + BinaryOpType reduction_op_type, + kir::Predicate* read_pred, + kir::Predicate* write_pred) { + const auto par_domains = ir_utils::getParallelDomains(output); // Get parallel reduction domains const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end() && @@ -777,59 +1066,80 @@ class CudaKernelGenerator : private kir::IrVisitor { par_domains.find(ParallelType::TIDz) != par_domains.end() && par_domains.at(ParallelType::TIDz)->isReduction(); - const auto data_type = node->out()->dtype(); - const auto op_type = node->operation(); + const auto data_type = output->dtype(); - if (has_block_reduce) { - if (has_grid_reduce) { - indent() << data_type << " " - << "block_result_" << block_reduce_name_ << "=" - << gen(node->init()) << ";\n"; - } - indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " - << (tidy ? "true" : "false") << ", " << (tidz ? "true" : "false") - << ">(\n"; - if (has_grid_reduce) { - indent() << kTab << "block_result_" << block_reduce_name_ << ",\n"; - } else { - indent() << kTab << gen(node->out()) << ",\n"; - } - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << genReductionOp(op_type, node->out()) << ",\n"; - indent() << kTab << "threadIdx,\n"; - indent() << kTab << "blockDim,\n"; - indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; - TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); - indent() << kTab << read_pred << ",\n"; - // Pass the write predicate if available and different from the - // default predicate. The blockReduce runtime function uses the - // default predicate for both read and write when only the - // default one is given. - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); - indent() << kTab << write_pred << ",\n"; - } - indent() << kTab << data_type << "(" << genInline(node->init()) - << "));\n"; + indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " + << (tidy ? "true" : "false") << ", " << (tidz ? "true" : "false") + << ">(\n"; + indent() << kTab << gen(output) << ",\n"; + indent() << kTab << gen(input) << ",\n"; + indent() << kTab << genReductionOp(reduction_op_type, output->dtype()) + << ",\n"; + indent() << kTab << "threadIdx,\n"; + indent() << kTab << "blockDim,\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + TORCH_INTERNAL_ASSERT(read_pred != nullptr && read_pred->hasValue()); + indent() << kTab << genInline(read_pred) << ",\n"; + // Pass the write predicate if available and different from the + // default predicate. 
The blockReduce runtime function uses the + // default predicate for both read and write when only the + // default one is given. + if (write_pred != nullptr) { + TORCH_INTERNAL_ASSERT(write_pred->hasValue()); + indent() << kTab << genInline(write_pred) << ",\n"; + } + indent() << kTab << data_type << "(" << genInline(init) << "));\n"; + } + + void handle(const ReductionOp* rop) final { + TORCH_INTERNAL_ASSERT(rop->out()->isA()); + + const auto output = rop->out()->as(); + const auto input = rop->in()->as(); + const auto domain = output->view()->domain(); + const auto op_type = rop->getReductionOpType(); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + TORCH_INTERNAL_ASSERT( + !has_grid_reduce, + "ReductionOp does not support block parallelization. GridReductionOp must be used. ", + rop->toString()); + + if (!has_block_reduce) { + genSerialReduction(output, input, op_type); + } else if ( + auto reduction_id = ir_utils::getMaybeWarpReductionDim(output, input)) { + genWarpReduction(output, input, rop->init(), op_type, rop->predicate()); + } else { + genBlockReduction( + output, + input, + rop->init(), + op_type, + rop->predicate(), + rop->writePredicate()); } } - void visit(const kir::WelfordOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); + void handle(const WelfordOp* wop) final { + TORCH_INTERNAL_ASSERT(wop->out()->isA()); - const auto out = node->out()->as(); + const auto out = wop->out()->as(); const auto domain = out->view()->domain(); - const auto out_var = node->outVar(); - const auto out_avg = node->outAvg(); - const auto out_N = node->outN(); + const auto out_var = wop->outVar(); + const auto out_avg = wop->outAvg(); + const auto out_N = wop->outN(); + + const auto in_var = wop->inVar(); + const auto in_avg = wop->inAvg(); + const auto in_N = wop->inN(); - const auto in_var = node->inVar(); - const auto in_avg = node->inAvg(); - const auto in_N = node->inN(); + // inVar was allowed to be nullptr. Make sure it isn't. 
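The serial path emitted just below calls welfordCombine on an (avg, var, N) triplet. As a hedged sketch, assuming the standard Chan et al. combination that such a helper typically implements (the struct and function names here are illustrative, not the runtime function itself):

// Merge partial aggregate b into a, where M2 is the running sum of squared
// deviations (the "var" buffer before normalization) and N is the count.
struct WelfordTriplet {
  double avg = 0.0;
  double M2 = 0.0;
  long long N = 0;
};

void welfordCombineSketch(WelfordTriplet& a, const WelfordTriplet& b) {
  if (b.N == 0) {
    return; // nothing to merge
  }
  const long long n = a.N + b.N;
  const double delta = b.avg - a.avg;
  a.avg += delta * static_cast<double>(b.N) / static_cast<double>(n);
  a.M2 += b.M2 +
      delta * delta * static_cast<double>(a.N) * static_cast<double>(b.N) /
          static_cast<double>(n);
  a.N = n;
}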
+ TORCH_INTERNAL_ASSERT( + in_var != nullptr, "Welford var input nullptr not allowed"); const bool has_block_reduce = domain->hasBlockReduction(); const bool has_grid_reduce = domain->hasGridReduction(); @@ -838,21 +1148,17 @@ class CudaKernelGenerator : private kir::IrVisitor { if (!has_block_reduce && !has_grid_reduce) { indent() << "welfordCombine (" << "\n"; - indent() << " " << gen(out_avg) << ",\n"; - indent() << " " << gen(out_var) << ",\n"; - indent() << " " << gen(out_N) << ",\n"; - indent() << " " << gen(in_avg) << ",\n"; - if (in_var) { - indent() << " " << gen(in_var) << ",\n"; - } else { - indent() << " (" << in_avg->dtype() << ") 0" - << ",\n"; - } - indent() << " (" << out_N->dtype() << ")" << gen(in_N) << ");\n"; + indent() << kTab << gen(out_avg) << ",\n"; + indent() << kTab << gen(out_var) << ",\n"; + indent() << kTab << gen(out_N) << ",\n"; + indent() << kTab << gen(in_avg) << ",\n"; + indent() << kTab << "(" << out_avg->dtype() << ")" << gen(in_var) + << ",\n"; + indent() << kTab << "(" << out_N->dtype() << ")" << gen(in_N) << ");\n"; return; } - const auto par_domains = ir_utils::getParallelDomains(node->out()); + const auto par_domains = ir_utils::getParallelDomains(wop->out()); // Get parallel reduction domains const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end() && @@ -864,57 +1170,52 @@ class CudaKernelGenerator : private kir::IrVisitor { par_domains.find(ParallelType::TIDz) != par_domains.end() && par_domains.at(ParallelType::TIDz)->isReduction(); - const auto data_type = node->out()->dtype(); + const auto data_type = wop->out()->dtype(); if (has_block_reduce) { if (has_grid_reduce) { // allocate block result indent() << data_type << " " << "block_result_avg_" << block_reduce_name_ << " = " - << gen(node->initAvg()) << ";\n"; + << gen(wop->initAvg()) << ";\n"; indent() << data_type << " " << "block_result_var_" << block_reduce_name_ << " = " - << gen(node->initVar()) << ";\n"; - indent() << DataType::Int << " " + << gen(wop->initVar()) << ";\n"; + indent() << out_N->dtype() << " " << "block_result_n_" << block_reduce_name_ << " = " - << gen(node->initN()) << ";\n"; + << gen(wop->initN()) << ";\n"; } indent() << "blockWelford<" << (tidx ? "true" : "false") << ", " << (tidy ? "true" : "false") << ", " << (tidz ? 
"true" : "false") << ">(\n"; if (has_grid_reduce) { - indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n" - << kTab << "block_result_var_" << block_reduce_name_ << ",\n" - << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_var_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; } else { - indent() << kTab << gen(node->outAvg()) << ",\n"; - indent() << kTab << gen(node->outVar()) << ",\n"; - indent() << kTab << gen(node->outN()) << ",\n"; + indent() << kTab << gen(wop->outAvg()) << ",\n"; + indent() << kTab << gen(wop->outVar()) << ",\n"; + indent() << kTab << gen(wop->outN()) << ",\n"; } - indent() << " " << gen(in_avg) << ",\n"; - if (in_var) { - indent() << " " << gen(in_var) << ",\n"; - } else { - indent() << " (" << in_avg->dtype() << ") 0" - << ",\n"; - } - indent() << out_N->dtype() << "(" << gen(in_N) << "),\n"; + indent() << kTab << gen(in_avg) << ",\n"; + indent() << kTab << out_avg->dtype() << "(" << gen(in_var) << "),\n"; + indent() << kTab << out_N->dtype() << "(" << gen(in_N) << "),\n"; indent() << kTab << "threadIdx,\n"; indent() << kTab << "blockDim,\n"; indent() << kTab << "reinterpret_cast<" << data_type << "*>(shared_mem_avg),\n"; indent() << kTab << "reinterpret_cast<" << data_type << "*>(shared_mem_var),\n"; - indent() << kTab << "reinterpret_cast<" << DataType::Int + indent() << kTab << "reinterpret_cast<" << out_N->dtype() << "*>(shared_mem_n),\n"; - TORCH_INTERNAL_ASSERT(node->predicate() != nullptr); + TORCH_INTERNAL_ASSERT(wop->predicate() != nullptr); TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + wop->predicate() != nullptr && wop->predicate()->hasValue()); + auto read_pred = genInline(wop->predicate()); indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (wop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(wop->writePredicate()->hasValue()); + auto write_pred = genInline(wop->writePredicate()); indent() << kTab << write_pred << ",\n"; } indent() << kTab << data_type << "(0));\n"; @@ -926,8 +1227,12 @@ class CudaKernelGenerator : private kir::IrVisitor { std::string generateGridReduceTemplateFlags( const REDUCTION_OP* rop, const ParallelTypeBitmap& thread_pred) { + TORCH_INTERNAL_ASSERT( + !rop->isAllreduce(), + "This is not for the allreduce reduction kernel\n"); + const auto par_domains = ir_utils::getParallelDomains(rop->outputs()[0]); - std::stringstream flags; + ArgumentBuilder flags; for (const ParallelType pt : kParallelTypeThreads) { const bool parallel_reduction = par_domains.find(pt) != par_domains.end() && @@ -946,94 +1251,324 @@ class CudaKernelGenerator : private kir::IrVisitor { } else { flag = !pred && !parallel_reduction; } - if (pt != kParallelTypeThreads[0]) { - flags << ", "; + flags.arg(flag); + } + return flags.str(); + } + + // TODO: This should replace generateGridReduceTemplateFlags once + // GridWelford is refactored as GridReduction. 
+ template + std::string generateGridReduceTemplateFlags2( + const REDUCTION_OP* rop, + const ParallelTypeBitmap& thread_pred) { + TORCH_INTERNAL_ASSERT( + !rop->isAllreduce(), + "This is not for the allreduce reduction kernel\n"); + + const auto par_domains = + ir_utils::getParallelDomains(ir_utils::getTvOutput(rop)); + ArgumentBuilder flags; + for (const ParallelType pt : kParallelTypeThreads) { + const bool parallel_reduction = + par_domains.find(pt) != par_domains.end() && + par_domains.at(pt)->isReduction(); + const bool pred = thread_pred.get(pt); + TORCH_INTERNAL_ASSERT( + !(parallel_reduction && pred), "Cannot reduce predicated axis: ", pt); + // Currently assumed that no dimensions parallelized with blocks + // are predicated. This assumption may be lifted, but + // gridReduction would need some changes. + if (isParallelTypeBlockDim(pt)) { + TORCH_INTERNAL_ASSERT( + !pred, "Predication on block dimensions not allowed: ", pt); } - flags << (flag ? "true" : "false"); + flags.arg(parallel_reduction); } return flags.str(); } - void visit(const kir::GridReduction* node) final { - const auto rop = node->reduction_op(); - TORCH_INTERNAL_ASSERT(rop->out()->isA()); + void handle(const kir::GridReduction* grop) final { + TORCH_INTERNAL_ASSERT(grop->out()->isA()); - const auto out = rop->out()->as(); + const auto out = grop->out()->as(); const auto domain = out->view()->domain(); TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); - const auto data_type = rop->out()->dtype(); - const auto op_type = rop->operation(); + const auto data_type = grop->out()->dtype(); + const auto op_type = grop->getReductionOpType(); TORCH_INTERNAL_ASSERT( - node->reduction_buffer()->buffer()->isA()); + grop->reduction_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA()); + const auto work_buffer = + grop->reduction_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); + + if (grop->isAllreduce()) { + generateGridAllreduce(grop); + return; + } + + const std::string flags_str = + generateGridReduceTemplateFlags2(grop, grop->threadPredicate()); + + const bool persistent_sync = + kernel_->summary().has_cooperative_grid_reduction; + + // Since block-level reduction is already done, those dimensions + // with tidx/y/z being true do not participate in the grid + // reduction. 
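The grid reduction call assembled below passes both a read predicate and a write predicate; when no separate write predicate exists the read predicate is reused, and out-of-range threads contribute the init value instead of reading. A minimal sketch of that convention, with illustrative names only (the real reduction::gridReduce runtime entry point takes a much longer argument list):

// Sketch of the read/write-predicate convention used when assembling the
// reduction::gridReduce call below. Names are illustrative assumptions.
template <typename T, typename Func>
void predicatedReduceSketch(
    T& out,
    T in,
    T init,
    bool read_pred,   // false for threads that would read out of bounds
    bool write_pred,  // false for threads that must not write the result
    Func reduction_op) {
  // Threads whose read predicate is false contribute the init value, so they
  // do not perturb the reduction; this mirrors the init argument passed below.
  T val = read_pred ? in : init;
  // (the cross-thread and cross-block combination itself happens in the runtime)
  if (write_pred) {
    reduction_op(out, val); // only in-bounds threads publish the result
  }
}
// When grop->writePredicate() is null, the generated call simply passes the
// read predicate twice, as the handler below does.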
+ ArgumentBuilder template_args; + template_args.arg(flags_str).arg(persistent_sync); + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + func_args.arg(gen(grop->out())); + func_args.arg(gen(grop->in())); + func_args.arg(genReductionOp(op_type, out->dtype())); + func_args.arg("&").append(varName(work_buffer)).append("[0]"); + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + func_args.arg(genCall("static_cast", ptrType(data_type), "shared_mem")); + // read and write predicates TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + grop->predicate() != nullptr && grop->predicate()->hasValue()); + const auto read_pred = genInline(grop->predicate()); + func_args.arg(read_pred); + if (grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grop->writePredicate()->hasValue()); + func_args.arg(genInline(grop->writePredicate())); + } else { + func_args.arg(read_pred); + } + // Init val + func_args.arg(genCall(data_type, genInline(grop->init()))); + func_args.arg(genInline(grop->entrance_index())); + func_args.arg(genInline(grop->entrances())); + + indent() << "reduction::gridReduce<" << template_args << ">(\n"; + indent() << kTab << func_args << ");\n"; + } + + std::string genFusedReductionName(const TensorView* reduction_out) { + return varName(reduction_out) + "_reduction"; + } + + void generateGridAllreduce(const kir::GridReduction* grop) { + TORCH_INTERNAL_ASSERT(grop->isAllreduce()); + + const auto out = grop->out()->as(); + + const auto data_type = grop->out()->dtype(); + const auto op_type = grop->getReductionOpType(); + const auto work_buffer = - node->reduction_buffer()->buffer()->as(); + grop->reduction_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); + + const auto reduction_name = genFusedReductionName(out->view()); + + // template + // __device__ __inline__ void reduce( + // RefTuple out, + // const LocalTuple& inp, + // VolatilePtrTuple global_work_buffer, + // int64_t* global_sync_buffer, // Allocated as product of all + // // non-participating Grid dimension + // PtrTuple shared_buf, + // bool read_pred, // Prevent reading from out of bounds memory + // bool write_pred, // Prevent from writing out of bounds + // const LocalTuple& init_val, + // Func reduction_op); + + indent() << reduction_name << ".reduce(\n"; + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + // out + func_args.arg(genCall("RefTuple", data_type, gen(grop->out()))); + // inp + func_args.arg(genCall("ConstRefTuple", data_type, gen(grop->in()))); + // global_work_buffer + func_args.arg(genCall( + "VolatilePtrTuple", data_type, "&" + varName(work_buffer) + "[0]")); + // global_sync_buffer + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + // shared_buf + func_args.arg(genCall( + "PtrTuple", + data_type, + genCall("static_cast", ptrType(data_type), "shared_mem"))); + // read and write predicates + TORCH_INTERNAL_ASSERT( + grop->predicate() != nullptr && grop->predicate()->hasValue()); + const auto read_pred = genInline(grop->predicate()); + auto write_pred = read_pred; + if (grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grop->writePredicate()->hasValue()); + write_pred = genInline(grop->writePredicate()); + } + func_args.arg(read_pred).arg(write_pred); + // init_val + func_args.arg(genCall("LocalTuple", data_type, genInline(grop->init()))); + // reduction_op + func_args.arg(genReductionOp(op_type, out->dtype())); + + indent() << kTab << func_args << ");\n"; + } + + void handle(const 
kir::GroupedGridReduction* grouped_grop) final { + const auto out = ir_utils::getTvOutput(grouped_grop); + const auto domain = out->domain(); + TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); + + TORCH_INTERNAL_ASSERT( + grouped_grop->sync_buffer()->buffer()->isA()); const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + grouped_grop->sync_buffer()->buffer()->as(); - const std::string flags_str = - generateGridReduceTemplateFlags(rop, node->threadPredicate()); + TORCH_INTERNAL_ASSERT( + grouped_grop->numReductions() == 2, + "Only grouping of 2 reductions is supported. ", + grouped_grop->toString()); + + if (grouped_grop->isAllreduce()) { + generateGridAllreduce(grouped_grop); + return; + } + + const std::string flags_str = generateGridReduceTemplateFlags2( + grouped_grop, grouped_grop->threadPredicate()); const bool persistent_sync = kernel_->summary().has_cooperative_grid_reduction; // Since block-level reduction is already done, those dimensions - // with tidx/y/z being true do not participate in the grid reduction. - indent() << "reduction::gridReduce<" << flags_str << ", " - << (persistent_sync ? "true" : "false") << ">(\n"; - indent() << kTab << gen(rop->out()) << ",\n"; - if (domain->hasBlockReduction()) { - indent() << kTab << "block_result_" << block_reduce_name_ << ",\n"; - block_reduce_name_++; + // with tidx/y/z being true do not participate in the grid + // reduction. + ArgumentBuilder template_args; + template_args.arg(flags_str).arg(persistent_sync); + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + + // Apped arguments for each reduction + for (const auto i : c10::irange(grouped_grop->numReductions())) { + TORCH_INTERNAL_ASSERT( + grouped_grop->reduction_buffers().at(i)->buffer()->isA()); + const auto work_buffer = + grouped_grop->reduction_buffers().at(i)->buffer()->as(); + + func_args.arg(gen(grouped_grop->output(i))); + func_args.arg(gen(grouped_grop->input(i))); + func_args.arg(genCall( + grouped_grop->output(i)->dtype(), + genInline(grouped_grop->initVal(i)))); + func_args.arg(genReductionOp( + grouped_grop->getReductionOpType(i), + grouped_grop->output(i)->dtype())); + func_args.arg("&").append(varName(work_buffer)).append("[0]"); + } + + // The rest of the arguments are common between the reductions + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + func_args.arg("shared_mem"); + // read and write predicates + TORCH_INTERNAL_ASSERT( + grouped_grop->predicate() != nullptr && + grouped_grop->predicate()->hasValue()); + const auto read_pred = genInline(grouped_grop->predicate()); + func_args.arg(read_pred); + if (grouped_grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grouped_grop->writePredicate()->hasValue()); + func_args.arg(genInline(grouped_grop->writePredicate())); } else { - indent() << kTab << gen(rop->in()) << ",\n"; + func_args.arg(read_pred); } - indent() << kTab << genReductionOp(op_type, out) << ",\n"; - indent() << kTab << "&" << varName(work_buffer) << "[0],\n"; - indent() << kTab << varName(sync_buffer) << ",\n"; - indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + + indent() << "reduction::gridReduceGroup<" << template_args << ">(\n"; + indent() << kTab << func_args << ");\n"; + } + + void generateGridAllreduce(const kir::GroupedGridReduction* grouped_grop) { + TORCH_INTERNAL_ASSERT(grouped_grop->isAllreduce()); + + // First, build a list of function arguments + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + + for (const auto i : 
c10::irange(grouped_grop->numReductions())) { + const auto data_type = grouped_grop->outputs().at(i)->dtype(); + TORCH_INTERNAL_ASSERT( + grouped_grop->reduction_buffers().at(i)->buffer()->isA()); + + // out + func_args.arg( + genCall("RefTuple", data_type, gen(grouped_grop->outputs().at(i)))); + + // inp + func_args.arg(genCall( + "ConstRefTuple", data_type, gen(grouped_grop->inputs().at(i)))); + + // global_work_buffer + const auto work_buffer = + grouped_grop->reduction_buffers().at(i)->buffer()->as(); + func_args.arg(genCall( + "VolatilePtrTuple", data_type, "&" + varName(work_buffer) + "[0]")); + + // init + func_args.arg(genCall( + "LocalTuple", data_type, genInline(grouped_grop->initVal(i)))); + + // reduction op + func_args.arg(genReductionOp( + grouped_grop->getReductionOpType(i), + grouped_grop->output(i)->dtype())); + } + + // global_sync_buffer + const auto sync_buffer = + grouped_grop->sync_buffer()->buffer()->as(); + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + + // shared_buf + func_args.arg("shared_mem"); + + // read and write predicates TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); - indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); - indent() << kTab << write_pred << ",\n"; + grouped_grop->predicate() != nullptr && + grouped_grop->predicate()->hasValue()); + const auto read_pred = genInline(grouped_grop->predicate()); + func_args.arg(read_pred); + if (grouped_grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grouped_grop->writePredicate()->hasValue()); + func_args.arg(genInline(grouped_grop->writePredicate())); } else { - indent() << kTab << read_pred << ",\n"; + func_args.arg(read_pred); } - indent() << kTab << data_type << "(" - << genInline(node->reduction_op()->init()) << "));\n"; + + indent() << genFusedReductionName(ir_utils::getTvOutput(grouped_grop)) + << ".reduceGroup(\n"; + indent() << kTab << func_args << ");\n"; } - void visit(const kir::GridBroadcast* node) final { - const auto bop = node->broadcast_op(); + void handle(const kir::GridBroadcast* grop) final { + const auto bop = grop->broadcast_op(); TORCH_INTERNAL_ASSERT(bop->out()->isA()); - const auto out = bop->out()->as(); - const auto domain = out->view()->domain(); - TORCH_INTERNAL_ASSERT(domain->hasGridBroadcast()); - - const auto data_type = bop->out()->dtype(); + const ParallelTypeBitmap parallel_types = + kernel_->summary().broadcast_parallel_types.at(bop); TORCH_INTERNAL_ASSERT( - node->broadcast_buffer()->buffer()->isA()); + parallel_types.hasBID(), + "GridBroadcast needs to be used with a broadcast op that is parallelized with the BID parallel types"); + TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + grop->broadcast_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA()); const auto work_buffer = - node->broadcast_buffer()->buffer()->as(); - const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + grop->broadcast_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); - const auto par_domains = ir_utils::getParallelDomains(out); std::stringstream flags_str; for (const ParallelType pt : kParallelTypeThreads) { - const bool parallel_bcast = par_domains.find(pt) != par_domains.end() && - par_domains.at(pt)->isBroadcast(); + const bool 
parallel_bcast = parallel_types.get(pt); if (pt != kParallelTypeThreads[0]) { flags_str << ", "; } @@ -1041,7 +1576,7 @@ class CudaKernelGenerator : private kir::IrVisitor { } // Since block-level broadcast has not necessarily been performed before - // this function call, so grid broadcast may be broadcasting across both + // this function call, so grid broadcast may be broadcasting across both // the grid and the block level. indent() << "grid_broadcast::broadcast<" << flags_str.str() << ">(\n"; indent() << kTab << gen(bop->out()) << ",\n"; @@ -1049,12 +1584,12 @@ class CudaKernelGenerator : private kir::IrVisitor { indent() << kTab << "&" << varName(work_buffer) << "[0],\n"; indent() << kTab << varName(sync_buffer) << ",\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ");\n"; + grop->predicate() != nullptr && grop->predicate()->hasValue()); + indent() << kTab << genInline(grop->predicate()) << ");\n"; } - void visit(const kir::GridWelford* node) final { - const auto wop = node->welford_op(); + void handle(const kir::GridWelford* gwop) final { + const auto wop = gwop->welford_op(); TORCH_INTERNAL_ASSERT(wop->outAvg()->isA()); const auto out = wop->out()->as(); @@ -1063,41 +1598,43 @@ class CudaKernelGenerator : private kir::IrVisitor { const auto data_type = out->dtype(); - TORCH_INTERNAL_ASSERT(node->var_buffer()->buffer()->isA()); - TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(gwop->var_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(gwop->sync_buffer()->buffer()->isA()); - const auto avg_buffer = node->avg_buffer()->buffer()->as(); - const auto var_buffer = node->var_buffer()->buffer()->as(); - const auto n_buffer = node->N_buffer()->buffer()->as(); - const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + const auto avg_buffer = gwop->avg_buffer()->buffer()->as(); + const auto var_buffer = gwop->var_buffer()->buffer()->as(); + const auto n_buffer = gwop->N_buffer()->buffer()->as(); + const auto sync_buffer = gwop->sync_buffer()->buffer()->as(); + + if (wop->isAllreduce()) { + generateGridAllreduce(gwop); + return; + } const bool persistent_sync = kernel_->summary().has_cooperative_grid_reduction; const std::string flags_str = - generateGridReduceTemplateFlags(wop, node->threadPredicate()); + generateGridReduceTemplateFlags(wop, gwop->threadPredicate()); // Since block-level reduction is already done, those dimensions // with tidx/y/z being true do not participate in the grid reduction. indent() << "welford::gridWelford<" << flags_str << ", " << (persistent_sync ? 
"true" : "false") << ">(\n"; - indent() << kTab << gen(wop->outAvg()) << ",\n" - << kTab << gen(wop->outVar()) << ",\n" - << kTab << gen(wop->outN()) << ",\n"; + indent() << kTab << gen(wop->outAvg()) << ",\n"; + indent() << kTab << gen(wop->outVar()) << ",\n"; + indent() << kTab << gen(wop->outN()) << ",\n"; if (domain->hasBlockReduction()) { - indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n" - << kTab << "block_result_var_" << block_reduce_name_ << ",\n" - << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_avg_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_var_" << block_reduce_name_ << ",\n"; + indent() << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; block_reduce_name_++; } else { indent() << kTab << gen(wop->inAvg()) << ",\n"; - if (wop->inVar() == nullptr) { - indent() << kTab << "(" << data_type << ") 0,\n"; - } else { - indent() << kTab << gen(wop->inVar()) << ",\n"; - } + TORCH_INTERNAL_ASSERT( + wop->inVar() != nullptr, "Welford var input nullptr not allowed"); + indent() << kTab << "(" << wop->outVar()->dtype() << ")" + << gen(wop->inVar()) << ",\n"; indent() << kTab << "(" << wop->outN()->dtype() << ")" << gen(wop->inN()) << ",\n"; } @@ -1112,112 +1649,291 @@ class CudaKernelGenerator : private kir::IrVisitor { indent() << kTab << "reinterpret_cast<" << wop->outN()->dtype() << "*>(shared_mem_n),\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + gwop->predicate() != nullptr && gwop->predicate()->hasValue()); + auto read_pred = genInline(gwop->predicate()); indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (gwop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(gwop->writePredicate()->hasValue()); + auto write_pred = genInline(gwop->writePredicate()); indent() << kTab << write_pred << ",\n"; } else { indent() << kTab << read_pred << ",\n"; } // TODO : init value support or remove. 
- indent() << kTab << data_type << "(0));\n"; + indent() << kTab << data_type << "(0),\n"; + indent() << kTab << genInline(gwop->entrance_index()) << ",\n"; + indent() << kTab << genInline(gwop->entrances()); + code_ << ");\n"; + } + + void generateGridAllreduce(const kir::GridWelford* gwop) { + const auto wop = gwop->welford_op(); + TORCH_INTERNAL_ASSERT(wop->isAllreduce()); + + const auto out = wop->out()->as(); + + const auto data_type = wop->outAvg()->dtype(); + const auto index_type = wop->outN()->dtype(); + TORCH_INTERNAL_ASSERT(wop->outAvg()->dtype() == wop->outVar()->dtype()); + + ArgumentBuilder data_type_args; + data_type_args.arg(data_type).arg(data_type).arg(index_type); + + const auto sync_buffer = gwop->sync_buffer()->buffer()->as(); + + const auto reduction_name = genFusedReductionName(out->view()); + + // template + // __device__ __inline__ void reduce( + // RefTuple out, + // const LocalTuple& inp, + // VolatilePtrTuple global_work_buffer, + // int64_t* global_sync_buffer, // Allocated as product of all + // // non-participating Grid dimension + // PtrTuple shared_buf, + // bool read_pred, // Prevent reading from out of bounds memory + // bool write_pred, // Prevent from writing out of bounds + // const LocalTuple& init_val, + // Func reduction_op); + + ArgumentBuilder out_args; + out_args.arg(gen(wop->outAvg())); + out_args.arg(gen(wop->outVar())); + out_args.arg(gen(wop->outN())); + + ArgumentBuilder in_args; + in_args.arg(gen(wop->inAvg())); + if (wop->inVar() != nullptr) { + in_args.arg(gen(wop->inVar())); + } else { + in_args.arg("(").append(data_type).append(")0"); + } + in_args.arg(gen(wop->inN())); + + ArgumentBuilder init_args; + init_args.arg(gen(wop->initAvg())); + init_args.arg(gen(wop->initVar())); + init_args.arg(gen(wop->initN())); + + ArgumentBuilder work_buffer_args; + work_buffer_args.arg("&") + .append(varName(gwop->avg_buffer()->buffer()->as())) + .append("[0]"); + work_buffer_args.arg("&") + .append(varName(gwop->var_buffer()->buffer()->as())) + .append("[0]"); + work_buffer_args.arg("&") + .append(varName(gwop->N_buffer()->buffer()->as())) + .append("[0]"); + + ArgumentBuilder smem_buffer_args; + smem_buffer_args.arg( + genCall("reinterpret_cast", ptrType(data_type), "shared_mem_avg")); + smem_buffer_args.arg( + genCall("reinterpret_cast", ptrType(data_type), "shared_mem_var")); + smem_buffer_args.arg( + genCall("reinterpret_cast", ptrType(index_type), "shared_mem_n")); + + ArgumentBuilder func_args(block_nest_level_ + 1, kTab); + // out + func_args.arg(genCall("RefTuple", data_type_args, out_args)); + // inp + func_args.arg(genCall("ConstRefTuple", data_type_args, in_args)); + // global_work_buffer + func_args.arg( + genCall("VolatilePtrTuple", data_type_args, work_buffer_args)); + // global_sync_buffer + func_args.arg("&").append(varName(sync_buffer)).append("[0]"); + // shared_buf + func_args.arg(genCall("PtrTuple", data_type_args, smem_buffer_args)); + // read and write predicates + TORCH_INTERNAL_ASSERT( + gwop->predicate() != nullptr && gwop->predicate()->hasValue()); + const auto read_pred = genInline(gwop->predicate()); + auto write_pred = read_pred; + if (gwop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(gwop->writePredicate()->hasValue()); + write_pred = genInline(gwop->writePredicate()); + } + func_args.arg(read_pred).arg(write_pred); + // init_val + func_args.arg(genCall("LocalTuple", data_type_args, init_args)); + // reduction_op + func_args.arg(genTemplate( + "welfordCombine", 
ArgumentBuilder().arg(data_type).arg(index_type))); + + indent() << reduction_name << ".reduce(\n"; + indent() << kTab << func_args << ");\n"; + } + + void handle(const kir::AllocateFusedReduction* alloc_fused_reduction) final { + // See the runtime file of the fused reduction + enum class ReductionParallelTypeState { Reduce, Iter, Pred, Inactive }; + + using ReductionParallelTypeStateArray = + ParallelTypeMap; + + ReductionParallelTypeStateArray states( + ReductionParallelTypeState::Inactive); + + for (const ParallelType pt : kParallelTypeThreads) { + // It may be better to predicate grid reductions on dimensions they don't + // actively use, however since that should generally be discouraged (they + // should be part of the iter portion of the operation, or they should be + // predciated out) we're just going to assume they're part of the iter + // dimension. This would cause more communication than strictly necessary + // but should not be a common use case. + auto pt_dim = kernel_->summary().parallel_dimension_map_.get(pt); + if (pt_dim == nullptr || pt_dim->isOneInt()) { + continue; + } + // Initialize pt_dim if used to an iter dimension. It may change to a + // reduction or predicated dimension later. + states[pt] = ReductionParallelTypeState::Iter; + } + + for (auto id : alloc_fused_reduction->out()->view()->domain()->domain()) { + auto pt = id->getParallelType(); + if (isParallelTypeThread(pt)) { + auto state = id->isReduction() ? ReductionParallelTypeState::Reduce + : ReductionParallelTypeState::Iter; + states[pt] = state; + } + } + + for (const auto predicated_pt : alloc_fused_reduction->threadPredicate()) { + auto& state = states[predicated_pt]; + TORCH_INTERNAL_ASSERT( + state != ReductionParallelTypeState::Reduce, + "Invalid thread predication: ", + predicated_pt); + state = ReductionParallelTypeState::Pred; + } + + ArgumentBuilder flags; + for (auto pt : kParallelTypeThreads) { + flags.arg(static_cast(states[pt])); + } + + // Persistent + flags.arg(true); + + // Broadcast is fused + flags.arg(true); + + const auto reduction_name = + genFusedReductionName(alloc_fused_reduction->out()->view()); + + indent() << genTemplate("fused_reduction::ParallelReduce", flags) << " " + << reduction_name << ";\n"; } void handleScope(const kir::Scope& scope) { for (auto expr : scope.exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } - void visit(const kir::ForLoop* node) final { - // TODO(kir): handle this during lowering - if (node->iter_domain()->isBroadcast()) { - handleScope(node->body()); - return; - } else if (node->vectorize()) { - vectorize_scope_ = node->vectorize(); - handleScope(node->body()); + void handleTrivialLoop(const kir::ForLoop* loop) { + if (loop->vectorize()) { + vectorize_scope_ = loop->vectorize(); + } + handleScope(loop->body()); + if (loop->vectorize()) { vectorize_scope_ = false; - return; - } else if (node->iter_domain()->isStride()) { - // A stride domain only executes the loop body with the loop - // index being zero. - indent() << "constexpr " - << "nvfuser_index_t" - << " " << gen(node->index()) << " = 0;\n"; - handleScope(node->body()); - return; } + } - // By default, a parallelized loop would look like: - // - // for (int x = threadIdx.x; x < stop; x += blockDim.x) { - // do_some_comp(x); - // } - // - // When stop is guaranteed to be smaller or equal to the number of - // threads, the for-loop is not necessary. 
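The AllocateFusedReduction handler above classifies every thread/block parallel type as reducing, iterating, predicated, or inactive and passes those states, plus the persistence and fused-broadcast flags, as template parameters of fused_reduction::ParallelReduce. A simplified sketch of just that encoding step (the six-entry array standing in for the BIDx/y/z and TIDx/y/z states is an assumption about the parallel-type count):

#include <array>
#include <ios>
#include <sstream>
#include <string>

// Mirrors the flag encoding in the AllocateFusedReduction handler above: one
// integer state per parallel type, followed by the persistent and
// fused-broadcast booleans. Simplified sketch; the real states come from the
// parallel dimension map and the output tensor's leaf domains.
enum class ReductionParallelTypeStateSketch { Reduce, Iter, Pred, Inactive };

std::string parallelReduceFlagsSketch(
    const std::array<ReductionParallelTypeStateSketch, 6>& states,
    bool persistent,
    bool broadcast_fused) {
  std::stringstream flags;
  flags << std::boolalpha;
  for (const auto& state : states) {
    flags << static_cast<int>(state) << ", ";
  }
  flags << persistent << ", " << broadcast_fused;
  // e.g. "0, 1, 3, 3, 3, 3, true, true" becomes the template parameter list
  // of the fused_reduction::ParallelReduce object declared in the kernel.
  return flags.str();
}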
In the above case, we - // would just generate the loop body without the for clause but - // references to the loop index replaced by the loop start value. - // - // When the loop end is the same as the IterDomain extent, the - // assumption can be safely made. This is more conservative than - // necessary since the loop stop value just needs to be <= the - // IterDomain extent. However, at this point, this conservative - // analysis seems sufficient. - if (node->stop() == node->iter_domain()->extent() && - node->iter_domain()->isThread()) { - // Register a replacement of references to the loop index with - // the loop start value. - replacement_map_.insert({node->index(), node->start()}); - handleScope(node->body()); - replacement_map_.erase(node->index()); - return; + void handle(const GroupedReductionOp* grouped_rop) final { + for (const auto i : c10::irange(grouped_rop->numReductions())) { + TORCH_INTERNAL_ASSERT(grouped_rop->output(i)->isA()); + + const auto output = grouped_rop->output(i)->as(); + const auto input = grouped_rop->input(i)->as(); + const auto domain = output->view()->domain(); + const auto op_type = grouped_rop->getReductionOpType(i); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + TORCH_INTERNAL_ASSERT( + !has_grid_reduce, + "GroupedReductionOp does not support block parallelization. GroupedGridReductionOp must be used. ", + grouped_rop->toString()); + + if (!has_block_reduce) { + genSerialReduction(output, input, op_type); + } else if ( + auto reduction_id = + ir_utils::getMaybeWarpReductionDim(output, input)) { + genWarpReduction( + output, + input, + grouped_rop->initVal(i), + op_type, + grouped_rop->predicate()); + } else { + genBlockReduction( + output, + input, + grouped_rop->initVal(i), + op_type, + grouped_rop->predicate(), + grouped_rop->writePredicate()); + } } + } - if (node->start()->isZeroInt() && node->stop()->isOneInt()) { - indent() << "constexpr " - << "nvfuser_index_t" - << " " << gen(node->index()) << " = 0;\n"; - handleScope(node->body()); + void handle(const kir::ForLoop* loop) final { + if (loop->isTrivial()) { + handleTrivialLoop(loop); return; } - const auto gen_index = gen(node->index()); - const auto gen_start = genInline(node->start()); - const auto gen_stop = genInline(node->stop()); - const auto gen_step = genInline(node->step()); + const auto gen_index = gen(loop->index()); + const auto gen_start = genInline(loop->start()); + const auto gen_stop = genInline(loop->stop()); + const auto gen_step = genInline(loop->step()); std::stringstream step_code; - if (node->step()->isOneInt()) { + if (loop->step()->isOneInt()) { step_code << "++" << gen_index; } else { step_code << gen_index << " += " << gen_step; } - if (node->isUnrolled()) { + if (loop->isUnrolled()) { indent() << "#pragma unroll\n"; } else { indent() << "#pragma unroll 1\n"; } - indent() << "for(nvfuser_index_t " << gen_index << " = " << gen_start - << "; " << gen_index << " < " << gen_stop << "; " - << step_code.str() << ") "; + + indent() << "for(nvfuser_index_t " << gen_index; + if (loop->iter_domain()->isParallelized()) { + code_ << " = " << gen_start << "; "; + } else { + // Do not start at the start of the ID when not parallelized. Instead, + // start at 0. Predicates will protect buffers between 0 and ID->start(), + // however if we started at ID->start and extent == ID->start, we could + // have a "degenerate" loop (loop with no iterations). 
It may not be an + // issue to have a 0-sized loop, but all potential consequences haven't + // been covered. One example is WAR analysis which could incorrectly think + // a barrier inside a 0-sized loop actually provides protection. + code_ << " = 0; "; + } + code_ << gen_index << " < " << gen_stop << "; " << step_code.str() << ") "; startBlock(true); - handleScope(node->body()); + handleScope(loop->body()); endBlock(); } - void visit(const kir::IfThenElse* node) final { - auto conditional = node->predicate()->value(); + void handle(const kir::IfThenElse* ite) final { + auto conditional = ite->predicate()->value(); if (conditional->isConst()) { // If the conditional is a constant, then the IfThenElse is not required if (conditional->value().value()) { - handleScope(node->thenBody()); + handleScope(ite->thenBody()); } else { - handleScope(node->elseBody()); + handleScope(ite->elseBody()); } return; } @@ -1226,73 +1942,77 @@ class CudaKernelGenerator : private kir::IrVisitor { // "then" block startBlock(true); - handleScope(node->thenBody()); + handleScope(ite->thenBody()); // "else" block (optional) - if (node->hasElse()) { + if (ite->hasElse()) { endBlock(" else "); startBlock(true); - handleScope(node->elseBody()); + handleScope(ite->elseBody()); } endBlock(); } - // TODO(kir): fold initialization into Allocate - void visit(const kir::Allocate* node) final { - const auto buffer_dtype = node->buffer()->dtype(); + void handle(const kir::Allocate* alloc) final { + const auto buffer_dtype = alloc->buffer()->dtype(); - if (!node->buffer()->isA()) { - indent() << buffer_dtype << " " << gen(node->buffer()) << ";\n"; + TORCH_INTERNAL_ASSERT(alloc->buffer() != nullptr); + alloc_map_.emplace(alloc->buffer(), alloc); + + if (!alloc->buffer()->isA()) { + indent() << buffer_dtype << " " << gen(alloc->buffer()) << ";\n"; return; } - const auto tv = node->buffer()->as(); + const auto tv = alloc->buffer()->as(); - const auto size = node->size(); + const auto size = alloc->size(); TORCH_INTERNAL_ASSERT(size != nullptr); - if (node->alias() != nullptr) { - // Allocate alias another Allocate node - const auto alias_tv = node->alias()->buffer()->as(); - indent() << "// Alias Allocation - " << node->memoryType() << "\n"; - indent() << buffer_dtype << "* " << varName(tv) << " = " - << varName(alias_tv) << ";\n"; + if (alloc->alias() != nullptr) { + // Allocate alias another Allocate stmt + const auto alias_tv = alloc->alias()->buffer()->as(); + indent() << "// Alias Allocation - " << alloc->memoryType() << "\n"; + indent() << "auto& " << varName(tv) << " = " << varName(alias_tv) + << ";\n"; + } else { // Standard Memory Allocation - switch (tv->memoryType()) { + switch (tv->getMemoryType()) { case MemoryType::Global: indent() << "// Allocate global tensor " << varName(tv) << "\n"; break; case MemoryType::Shared: - if (kir::ExpressionEvaluator::isConst(size)) { - // Static shared memory - indent() << "__shared__ " << buffer_dtype << " " << varName(tv) - << "[" << genInline(size) << "];\n"; + // Align Offset Position + indent() << "offset = alignBufferSize(offset, " + // Always align to 128b / 16B + << 16 << ");\n"; + // Shared Memory Pointer + indent() << buffer_dtype << "* " << varName(tv) + << " = reinterpret_cast<" << buffer_dtype << "*>" + << "(array + offset);\n"; + // Increment Offset Position + indent() << "offset += (" << genInline(size) << " * sizeof(" + << buffer_dtype << "));\n"; + break; + case MemoryType::Local: { + auto va = kernel_->summary().vectorized_accesses; + if (va.find(tv) != va.end()) 
{ + indent() << "Array<" << buffer_dtype << ", " << genInline(size) + << ", " << va.at(tv) << "> " << varName(tv) << ";\n"; } else { - // Align Offset Position - indent() << "offset = alignBufferSize(offset," - << dataTypeSize(buffer_dtype) << ");\n"; - // Shared Memory Pointer - indent() << buffer_dtype << "* " << varName(tv) - << " = reinterpret_cast<" << buffer_dtype << "*>" - << "(array + offset);\n"; - // Increment Offset Position - indent() << "offset += (" << genInline(size) << " * sizeof(" - << buffer_dtype << "));\n"; + indent() << buffer_dtype << " " << varName(tv) << "[" + << genInline(size) << "];\n"; } - break; - case MemoryType::Local: - indent() << buffer_dtype << " " << varName(tv) << "[" - << genInline(size) << "];\n"; - break; + } break; default: TORCH_INTERNAL_ASSERT(false, "Unexpected memory type"); } } } - void visit(const kir::Sync* node) final { + void handle(const kir::BlockSync*) final { // Use a custom synchronization method if enabled if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { indent() << "block_sync::sync();\n"; @@ -1301,11 +2021,43 @@ class CudaKernelGenerator : private kir::IrVisitor { } } - void visit(const kir::InitMagicZero* node) final { + void handle(const kir::GridSync* sync) final { + // Use a custom synchronization method if enabled + bool bidx = sync->syncDims().get(ParallelType::BIDx); + bool bidy = sync->syncDims().get(ParallelType::BIDy); + bool bidz = sync->syncDims().get(ParallelType::BIDz); + + ArgumentBuilder sync_call_template_parms; + sync_call_template_parms.arg(bidx).arg(bidy).arg(bidz).arg(true); + + auto sync_idx = genCall( + "index_utils::maskedOffset", + ArgumentBuilder().arg(!bidx).arg(!bidy).arg(!bidz), + ArgumentBuilder().arg("blockIdx").arg("gridDim")); + + auto sync_segment_size = genCall( + "index_utils::maskedSize", + ArgumentBuilder().arg(bidx).arg(bidy).arg(bidz), + ArgumentBuilder().arg("gridDim")); + + ArgumentBuilder sync_call_args; + sync_call_args.arg(varName(sync->syncBuffer())) + .append("[") + .append(sync_idx) + .append("]"); + sync_call_args.arg(sync_segment_size); + + auto sync_call = + genCall("grid_sync::sync", sync_call_template_parms, sync_call_args); + + indent() << sync_call << ";\n"; + } + + void handle(const kir::InitMagicZero*) final { indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; } - void visit(const kir::UpdateMagicZero* node) final { + void handle(const kir::UpdateMagicZero*) final { indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; } @@ -1314,15 +2066,14 @@ class CudaKernelGenerator : private kir::IrVisitor { const kir::Kernel* kernel_; int block_nest_level_ = 0; int block_reduce_name_ = 0; - - // TODO(kir): replace with explicit assignment statements bool print_inline_ = false; // Mark when we are inside of a vectorized for-loop bool vectorize_scope_ = false; - //! Holds active replacement mappings during codegen - std::unordered_map replacement_map_; + //! Keep track of Allocate node for Val. Used to determine if Val + //! should be inlined. 
+ std::unordered_map alloc_map_; }; } // namespace diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h index 2ffbb872155a..31e4fb707363 100644 --- a/torch/csrc/jit/codegen/cuda/codegen.h +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 45f744d7e2f1..77fc51363829 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -59,14 +59,8 @@ bool validateDomain(TensorView* tv, TensorDomain* new_td) { unsigned int getReplayablePosPasC( TensorView* producer, TensorView* consumer, - const ComputeAtRootDomainMap& root_map_, + const std::unordered_set& unmappable_producer_dims, ComputeAtMode mode) { - // Grab dimensions in producer and consumer that are mappable to eachother - // based on the computeAtRootDomainMap. This will tell us which dimensions - // can be inlined based on avoiding trying to inline reduction structures. - auto mappable_roots = - root_map_.getMappableDims(producer->domain(), consumer->domain()); - // Check if any consumer dimensions are marked as vectorize as producer can // not be inlined to vectorized dimensions in consumer. auto c_dom = consumer->domain()->domain(); @@ -124,9 +118,14 @@ unsigned int getReplayablePosPasC( if (std::any_of( consumer_root_dim_ids.begin(), consumer_root_dim_ids.end(), - [&mappable_roots, &c2p_root_map](IterDomain* root_id) { - return mappable_roots.find(root_id) == mappable_roots.end() && - c2p_root_map.find(root_id) != c2p_root_map.end(); + [&unmappable_producer_dims, &c2p_root_map](IterDomain* c_root_id) { + auto p_root_id_it = c2p_root_map.find(c_root_id); + if (p_root_id_it == c2p_root_map.end()) { + return false; + } + auto p_id = p_root_id_it->second; + return unmappable_producer_dims.find(p_id) != + unmappable_producer_dims.end(); })) { continue; } @@ -146,14 +145,8 @@ unsigned int getReplayablePosPasC( unsigned int getReplayablePosCasP( TensorView* consumer, TensorView* producer, - const ComputeAtRootDomainMap& root_map_, + const std::unordered_set& unmappable_producer_dims, ComputeAtMode mode) { - // Grab dimensions in producer and consumer that are mappable to eachother - // based on the computeAtRootDomainMap. This will tell us which dimensions - // can be inlined based on avoiding trying to inline reduction structures. 
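The GridSync and shared-memory Allocate handlers above lean on a few small runtime helpers: index_utils::maskedOffset and index_utils::maskedSize select the grid dimensions that do or do not take part in a sync, and alignBufferSize rounds the running shared-memory offset up to a 16-byte boundary. The sketch below shows the semantics implied by those call sites; the exact implementations, and the dimension ordering in maskedOffset, are assumptions, since the real versions live in the codegen's CUDA runtime headers:

#include <cstdint>

// Assumed semantics of three runtime helpers referenced by the generated code
// above; inferred from the call sites, not quoted from the runtime headers.
struct Dim3Sketch { // stand-in for blockIdx/gridDim style triples
  int64_t x, y, z;
};

// maskedOffset<X, Y, Z>(blockIdx, gridDim): linear index over the flagged
// dimensions. Above it is called with the negated BID flags, so it indexes
// the sync buffer by the dimensions that do NOT participate in the sync.
// The x/y/z ordering here is an assumption.
template <bool X, bool Y, bool Z>
int64_t maskedOffsetSketch(Dim3Sketch idx, Dim3Sketch dim) {
  int64_t offset = 0;
  if (Z) offset = offset * dim.z + idx.z;
  if (Y) offset = offset * dim.y + idx.y;
  if (X) offset = offset * dim.x + idx.x;
  return offset;
}

// maskedSize<X, Y, Z>(gridDim): number of blocks participating in the sync,
// the product of the flagged dimensions.
template <bool X, bool Y, bool Z>
int64_t maskedSizeSketch(Dim3Sketch dim) {
  return (X ? dim.x : 1) * (Y ? dim.y : 1) * (Z ? dim.z : 1);
}

// alignBufferSize(offset, 16): round the running shared-memory offset up to
// the next multiple of `align` before carving out the next buffer.
inline int64_t alignBufferSizeSketch(int64_t offset, int64_t align) {
  return (offset + align - 1) / align * align;
}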
- auto mappable_roots = - root_map_.getMappableDims(producer->domain(), consumer->domain()); - auto p_dom = producer->domain()->domain(); auto first_reduction = std::find_if(p_dom.begin(), p_dom.end(), [](IterDomain* id) { @@ -208,10 +201,11 @@ unsigned int getReplayablePosCasP( if (std::any_of( producer->getMaybeRFactorDomain().begin(), producer->getMaybeRFactorDomain().end(), - [&mappable_roots, &all_vals](IterDomain* root_id) { - return std::find(all_vals.begin(), all_vals.end(), root_id) != + [&unmappable_producer_dims, &all_vals](IterDomain* p_root_id) { + return std::find(all_vals.begin(), all_vals.end(), p_root_id) != all_vals.end() && - mappable_roots.find(root_id) == mappable_roots.end(); + unmappable_producer_dims.find(p_root_id) != + unmappable_producer_dims.end(); })) { continue; } @@ -446,7 +440,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( FUSER_PERF_SCOPE("backwardComputeAt_impl"); auto max_consumer_compute_at_pos = - getReplayablePosPasC(producer, consumer, root_map_, mode_); + getReplayablePosPasC(producer, consumer, unmappable_dims_, mode_); + if (mode_ == ComputeAtMode::BestEffort) { consumer_compute_at_pos = std::min(consumer_compute_at_pos, max_consumer_compute_at_pos); @@ -477,7 +472,10 @@ unsigned int ComputeAt::backwardComputeAt_impl( } auto replay_producer_pair = TransformReplay::replayPasC( - producer, consumer, (int)consumer_compute_at_pos, root_map_); + producer, + consumer, + (int)consumer_compute_at_pos, + PairwiseRootDomainMap(producer, consumer)); if (replay_producer_pair.second == 0) { return 0; @@ -517,7 +515,7 @@ unsigned int ComputeAt::forwardComputeAt_impl( FUSER_PERF_SCOPE("forwardComputeAt_impl"); auto max_producer_compute_at_pos = - getReplayablePosCasP(consumer, producer, root_map_, mode_); + getReplayablePosCasP(consumer, producer, unmappable_dims_, mode_); if (mode_ == ComputeAtMode::BestEffort) { producer_compute_at_pos = @@ -549,7 +547,10 @@ unsigned int ComputeAt::forwardComputeAt_impl( } auto replay_consumer_pair = TransformReplay::replayCasP( - consumer, producer, (int)producer_compute_at_pos, root_map_); + consumer, + producer, + (int)producer_compute_at_pos, + PairwiseRootDomainMap(producer, consumer)); if (producer_compute_at_pos > producer->getComputeAtPosition()) { if (!producer->isFusionInput()) { @@ -657,7 +658,6 @@ void ComputeAt::traverseBackward() { running_consumer = running_producer; running_producer = tv_chain.back(); tv_chain.pop_back(); - running_consumer_pos = backwardComputeAt_impl( running_producer, running_consumer, running_consumer_pos); } @@ -790,16 +790,14 @@ void ComputeAt::updateSiblings() { id->parallelize(sibling_id->getParallelType()); } } - if (tv->getComputeAtPosition() > sibling_tv->getComputeAtPosition()) { - auto sibling_domain = TransformReplay::fullSelfReplay( - sibling_tv->domain(), tv->domain()); - validateDomain(sibling_tv, sibling_domain); - sibling_tv->setDomain(sibling_domain); - sibling_tv->setComputeAt(tv->getComputeAtPosition()); - sibling_tv->setMaxProducer(tv->getMaxProducerPosition()); - auto consumer_tvs = ir_utils::consumerTvsOf(sibling_tv); - consumers_to_update.insert(consumer_tvs.begin(), consumer_tvs.end()); - } + auto sibling_domain = + TransformReplay::fullSelfReplay(sibling_tv->domain(), tv->domain()); + validateDomain(sibling_tv, sibling_domain); + sibling_tv->setDomain(sibling_domain); + sibling_tv->setComputeAt(tv->getComputeAtPosition()); + sibling_tv->setMaxProducer(tv->getMaxProducerPosition()); + auto consumer_tvs = ir_utils::consumerTvsOf(sibling_tv); + 
consumers_to_update.insert(consumer_tvs.begin(), consumer_tvs.end()); } } @@ -865,6 +863,27 @@ void ComputeAt::runPass() { } } +void ComputeAt::buildUnmappableDims() { + auto all_tvs = ir_utils::allTvs(producer_->fusion()); + for (auto tv : all_tvs) { + auto consumers = ir_utils::consumerTvsOf(tv); + for (auto consumer : consumers) { + // Grab dimensions in producer and consumer that are mappable to eachother + // based on the computeAtRootDomainMap. This will tell us which dimensions + // can be inlined based on avoiding trying to inline non-trivial + // reduction structures. + auto mappable_roots = + root_map_.getMappableDims(tv->domain(), consumer->domain()); + for (auto tv_root_id : tv->getMaybeRFactorDomain()) { + if (mappable_roots.find(tv_root_id) == mappable_roots.end() && + !tv_root_id->isTrivialReduction()) { + unmappable_dims_.emplace(tv_root_id); + } + } + } + } +} + ComputeAt::ComputeAt( TensorView* _producer, TensorView* _consumer, @@ -903,6 +922,8 @@ ComputeAt::ComputeAt( setCommonConsumer(); root_map_.build(); + + buildUnmappableDims(); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 391225218db9..75fca5705ed9 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -2,11 +2,12 @@ #include +#include #include -#include #include #include +#include #include namespace torch { @@ -68,6 +69,10 @@ class ComputeAt { // call. void setCommonConsumer(); + // Iterate through all TVs and collect the dimensions of each TV that don't + // map to all its consumer TVs. + void buildUnmappableDims(); + // Propagate backward from consumer to producer, check if it increase // computeAt position on tensors, if so take it! void traverseBackward(); @@ -106,6 +111,9 @@ class ComputeAt { // Producer use chains set in, used in a few spots. std::deque> producer_use_chains_; + // Root domains in producer that's unmappable to any of its consumers + std::unordered_set unmappable_dims_; + ComputeAt( TensorView* _producer, TensorView* _consumer, diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp index 6671fc375463..43382f865d43 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp @@ -1,7 +1,7 @@ #include +#include #include -#include #include #include #include @@ -12,255 +12,87 @@ namespace fuser { namespace cuda { namespace { -//! Class to figure out how many non-broadcast axes and how many broadcast axes -//! were used to produce an iter domain. This is important for figuring out what -//! the correct broadcasted extent is of an iteration domain. -//! -//! When GpuLower is available, trivial reductions are not counted as -//! concrete domains so that they should not be used to generate -//! for-loops. -class InputDomainCounter : public IterVisitor { - public: - // Returns number of {non-braodcast non-reduction iteration domains, broadcast - // and trivial reduction domains} used to generate the iteration domains in - // provided target domain. 
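buildUnmappableDims above walks every tensor, asks the root-domain map which of its root/rfactor IDs can be mapped into each consumer, and records those that cannot (skipping trivial reductions) so the compute-at passes refuse to inline past them. A simplified, self-contained analogue of that collection step, with plain strings standing in for IterDomain pointers and a precomputed mappable set standing in for the ComputeAtRootDomainMap query:

#include <string>
#include <unordered_set>
#include <vector>

// Simplified analogue of buildUnmappableDims: any producer root ID that the
// root-domain map cannot map to the consumer, and that is not a trivial
// reduction, is recorded as un-inlinable. Types and names are placeholders.
std::unordered_set<std::string> collectUnmappableDimsSketch(
    const std::vector<std::string>& producer_root_ids,
    const std::unordered_set<std::string>& mappable,
    const std::unordered_set<std::string>& trivial_reductions) {
  std::unordered_set<std::string> unmappable;
  for (const auto& id : producer_root_ids) {
    if (mappable.count(id) == 0 && trivial_reductions.count(id) == 0) {
      unmappable.insert(id);
    }
  }
  return unmappable;
}
// A reduction root axis of a producer has no counterpart in its consumer, so
// it lands in the unmappable set and caps how far the producer can be inlined.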
- static std::unordered_map> produceCounts( - const std::vector& domain, - GpuLower* gpu_lower) { - if (domain.empty()) { - return std::unordered_map>(); - } - - InputDomainCounter counter(domain); - - std::unordered_map> count_map; - for (const auto& entry : counter.domain_set_) { - auto id = entry.first; - auto input_id_set = entry.second; - int concrete_counts = 0; - int broadcast_counts = 0; - for (auto input_id : input_id_set) { - if (input_id->isBroadcast() || - (gpu_lower && - gpu_lower->trivialReductionInfo().isDerived(input_id))) { - broadcast_counts++; - } else { - concrete_counts++; - } - } - count_map[id] = {concrete_counts, broadcast_counts}; - } - - // Inputs may be root domains which wouldn't have any entries if no exprs - // were traversed, so manually insert their count - for (auto id : domain) { - if (count_map.find(id) == count_map.end()) { - count_map[id] = - (id->isBroadcast() || - (gpu_lower && gpu_lower->trivialReductionInfo().isDerived(id))) - ? std::make_pair(0, 1) - : std::make_pair(1, 0); - } - } - return count_map; - } - - private: - InputDomainCounter(const std::vector& domain_) { - traverseFrom( - domain_[0]->fusion(), - std::vector(domain_.begin(), domain_.end())); - } - - private: - std::unordered_set& getEntry(IterDomain* id) { - auto domain_set_it = domain_set_.find(id); - if (domain_set_it == domain_set_.end()) { - domain_set_it = - domain_set_ - .emplace(std::make_pair(id, std::unordered_set())) - .first; - domain_set_it->second.emplace(id); - } - - return domain_set_it->second; - } - - void handle(Expr* expr) override { - // If we end up moving swizzle to an Expr it would be identity here, instead - // of outputs being a function of all inputs - switch (expr->getExprType().value()) { - case (ExprType::Split): - case (ExprType::Merge): - break; - default: - TORCH_INTERNAL_ASSERT( - false, "Invalid expr type found in transform traversal."); - } - - // Gather all non-broadcast input domains - std::unordered_set resulting_set; - for (auto input_id : ir_utils::filterByType(expr->inputs())) { - auto input_entry = getEntry(input_id); - resulting_set.insert(input_entry.begin(), input_entry.end()); - } - for (auto output_id : ir_utils::filterByType(expr->outputs())) { - domain_set_.emplace(std::make_pair(output_id, resulting_set)); - } - } - - std::unordered_map> domain_set_; -}; - -// Only used once, consider removing. 
-template -std::deque deduplicateDeque(const std::deque& deque) { - std::unordered_set used; - std::deque deduped; - for (auto entry : deque) { - if (used.find(entry) == used.end()) { - deduped.push_back(entry); - used.emplace(entry); - } - } - return deduped; +// Is the provided IterDomain an Leaf of provided TensorView and within its +// computeAtPosition +bool idIsAComputeAtLeafDomain(IterDomain* id, TensorView* tv) { + auto begin = tv->domain()->domain().begin(); + auto end = tv->domain()->domain().begin() + tv->getComputeAtPosition(); + return std::find(begin, end, id) != end; } -void assertLowered(bool lowered) { - TORCH_INTERNAL_ASSERT( - lowered, - "Tried to accessed lowered values of compute at map,", - " however a valid lowering was not set when compute at map was created."); +// Is the provided IterDomain an Leaf of provided TensorView +bool idIsALeafDomain(IterDomain* id, TensorView* tv) { + auto begin = tv->domain()->domain().begin(); + auto end = tv->domain()->domain().end(); + return std::find(begin, end, id) != end; } } // namespace -void ComputeAtMap::mapIds(IterDomain* id0, IterDomain* id1) { - auto set_it_0 = disjoint_iter_set_maps_.find(id0); - auto set_it_1 = disjoint_iter_set_maps_.find(id1); - if (set_it_0 == disjoint_iter_set_maps_.end() && - set_it_1 == disjoint_iter_set_maps_.end()) { - // Neither iter domain has been mapped, so make a new disjoint set - auto new_set = std::make_shared>(); - new_set.get()->push_back(id0); - new_set.get()->push_back(id1); - disjoint_iter_set_maps_.emplace(std::make_pair(id0, new_set)); - disjoint_iter_set_maps_.emplace(std::make_pair(id1, new_set)); - disjoint_iter_sets_.push_back(new_set); - - // Update parallel type map - if (mapping_mode_ == MappingMode::PARALLEL) { - if (id0->isParallelized() && id1->isParallelized()) { - // Both are parallelized, make sure they're the same, set entry for - // parallel map - TORCH_INTERNAL_ASSERT( - id0->getParallelType() == id1->getParallelType(), - "Parallel type of ", - id0, - " should match ", - id1); - parallel_type_map_[new_set] = id0->getParallelType(); - } else if (id0->isParallelized() || id1->isParallelized()) { - // Only one is parallelized, set entry for parallel map - parallel_type_map_[new_set] = id0->isParallelized() - ? 
id0->getParallelType() - : id1->getParallelType(); - } - } - - } else if ( - set_it_0 != disjoint_iter_set_maps_.end() && - set_it_1 != disjoint_iter_set_maps_.end()) { - // Both iter domains have been mapped, so join their sets together - auto set0_ptr = set_it_0->second; - auto set1_ptr = set_it_1->second; - - // If the sets are already the same, do nothing - if (set0_ptr == set1_ptr) { - return; - } - - // Place everything in set1 into set0 and remap all ID's in set1 to set0 - auto& set1 = *set1_ptr; - for (auto id : set1) { - set0_ptr->push_back(id); - disjoint_iter_set_maps_[id] = set0_ptr; - } - - // set1 no longer needed as its IDs are copied into set0 - disjoint_iter_sets_.erase(std::find( - disjoint_iter_sets_.begin(), disjoint_iter_sets_.end(), set1_ptr)); - - // Update parallel type map - if (mapping_mode_ == MappingMode::PARALLEL) { - auto parallel_type_0_it = parallel_type_map_.find(set0_ptr); - auto parallel_type_1_it = parallel_type_map_.find(set1_ptr); - if (parallel_type_0_it != parallel_type_map_.end() && - parallel_type_1_it != parallel_type_map_.end()) { - // If both sets had a parallel type associated with them, make sure they - // are the same - TORCH_INTERNAL_ASSERT( - parallel_type_0_it->second == parallel_type_1_it->second); - } else if (parallel_type_1_it != parallel_type_map_.end()) { - // Set 1 has a parallel type, set 0 does not, set parallel entry - parallel_type_map_[set0_ptr] = parallel_type_1_it->second; - } - // Else set 0 already has the right parallel type set in the map, if at - // all - - // Remove set1 from the parallel type map as it shouldn't exist anymore - parallel_type_map_.erase(set1_ptr); - } +IterDomainGraph::IterDomainGraph(Fusion* fusion) { + build(fusion); +} - } else { - auto existing_set = set_it_0 != disjoint_iter_set_maps_.end() - ? set_it_0->second - : set_it_1->second; - auto missing_id = set_it_0 != disjoint_iter_set_maps_.end() ? id1 : id0; - existing_set->push_back(missing_id); - disjoint_iter_set_maps_[missing_id] = existing_set; - - // Update parallel type map - if (mapping_mode_ == MappingMode::PARALLEL) { - auto parallel_type_it = parallel_type_map_.find(existing_set); - if (parallel_type_it != parallel_type_map_.end() && - missing_id->isParallelized()) { - // existing_set has a parallel type already and missing_id has a - // parallel type, make sure they match. 
No need to update map - TORCH_INTERNAL_ASSERT( - parallel_type_it->second == missing_id->getParallelType()); - } else if ( - parallel_type_it == parallel_type_map_.end() && - id1->isParallelized()) { - // Set parallel type of existing_set as the newly added missing_id is - // parallel - parallel_type_map_[existing_set] = missing_id->getParallelType(); +void IterDomainGraph::build(Fusion* fusion) { + // Initialize a node for every iteration domain + for (auto tv : ir_utils::allTvs(fusion)) { + const auto& root_domain = tv->getRootDomain(); + const auto& domain = tv->domain()->domain(); + + // Grab all values in the history of the tensor view's domain + auto all_vals = DependencyCheck::getAllValsBetween( + {root_domain.begin(), root_domain.end()}, + {domain.begin(), domain.end()}); + + // Filter so we only have iteration domains (ignore Ints used in split) + auto all_ids = ir_utils::filterByType(all_vals); + + // Check is this domain is a consumer of a view-like operation + bool view_like_domain = tv->domain()->hasViewLikeRFactor(); + + for (auto id : all_ids) { + // Check if this id is a view like rfactor id + bool is_view_rfactor_id = false; + if (view_like_domain && id->isRFactorProduct()) { + // If the tensor domain is a view like domain, and the iteration domain + // is marked as an rfactor product and is in the rfactor domain, it's a + // view like rfactor iteration domain + const auto& rfactor_domain = tv->domain()->getMaybeRFactorDomain(); + if (std::find(rfactor_domain.begin(), rfactor_domain.end(), id) != + rfactor_domain.end()) { + is_view_rfactor_id = true; + } } + bool is_leaf_id = + std::find(domain.begin(), domain.end(), id) != domain.end(); + initializeId(id, is_view_rfactor_id, is_leaf_id); } } -} -void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { - // Consumers can only show up once in an expression, keep track of all of them - std::vector consumer_tvs; + // All ID's are initialized, start connecting them on the permissive, exact, + // and loop dimensions. for (auto expr : fusion->exprs()) { - if (!expr->outputs()[0]->isA()) { + if (!ir_utils::isTvOp(expr)) { continue; } auto tv_outputs = ir_utils::filterByType(expr->outputs()); TensorView* first_output_tv = nullptr; - for (auto c_tv : tv_outputs) { - consumer_tvs.push_back(c_tv); + for (auto c_tv : tv_outputs) { if (first_output_tv == nullptr) { first_output_tv = c_tv; } else { - // Map multi outputs of an expression to eachother. c is current output, - // and f as first output. Keep consistent with the later section of - // producer and consumers. Which here producer is now "first output", - // and consumer is still consumer. + // Map multi outputs of an expression to each other. c is current + // output, and f as first output. Keep consistent with the later section + // of producer and consumers. Which here producer is now "first output", + // and consumer is still consumer. One exception is how the + // domains left of CA positions are handled in the Parallel + // map. Those domains are not mapped in producer and consumer + // mappings as they do not share loops, but are mapped in the + // case of mapping multiple outputs since they do share the + // same loops. 
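The permissive_nodes_, exact_nodes_ and loop_nodes_ containers populated below expose an initializeSet/mapEntries interface that behaves like a disjoint-set (union-find) structure keyed by IterDomain*: initializeSet creates a singleton set and mapEntries unions the sets of its two arguments. A minimal union-find sketch of that pattern (simplified; the real container in this codebase also keeps the ordered membership of each set, which the disjointSets() iteration later relies on):

#include <unordered_map>

// Minimal union-find sketch of the initializeSet()/mapEntries() pattern used
// by the permissive/exact/loop node maps below. Method names other than those
// two are illustrative only.
template <typename T>
class DisjointSetsSketch {
 public:
  void initializeSet(T entry) {
    parent_.emplace(entry, entry); // each entry starts in its own set
  }
  void mapEntries(T a, T b) {
    T root_a = find(a);
    T root_b = find(b);
    parent_[root_a] = root_b; // union the two sets
  }
  bool areMapped(T a, T b) {
    return find(a) == find(b); // same representative, same set
  }

 private:
  T find(T entry) {
    auto it = parent_.find(entry);
    if (it == parent_.end()) {
      parent_.emplace(entry, entry);
      return entry;
    }
    if (it->second == entry) {
      return entry;
    }
    T root = find(it->second); // path compression on the way back up
    parent_[entry] = root;
    return root;
  }
  std::unordered_map<T, T> parent_;
};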
TORCH_INTERNAL_ASSERT( c_tv->getRootDomain().size() == @@ -275,7 +107,10 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { c_tv->getRootDomain()[i], first_output_tv->getRootDomain()[i])); } - // Multi output mapping + // Multi output mapping, outputs are required to have the same domain + // and same transformations, so they can be mapped in permissive/exact, + // and when within compute at position of domain()->domain() in the + // parallel map. auto replay_FasC = BestEffortReplay( first_output_tv->domain()->domain(), c_tv->domain()->domain(), @@ -283,35 +118,19 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { auto c2f_map = replay_FasC.getReplay(); - // If we're creating parallel map, only map the leaf - // axes. Also, the producer axis must be left of the CA - // point. - // Otherwise, map the entire replay map. - if (mapping_mode_ == MappingMode::PARALLEL) { - // Mark axes left of compute at point for parallel type tracking - std::unordered_set producer_axes_to_map( - first_output_tv->domain()->domain().begin(), - first_output_tv->domain()->domain().begin() + - first_output_tv->getComputeAtPosition()); - - for (auto c_id : c_tv->domain()->domain()) { - auto it = c2f_map.find(c_id); - if (it == c2f_map.end()) { - continue; - } - auto f_id = it->second; - if (producer_axes_to_map.find(f_id) == producer_axes_to_map.end()) { - continue; - } - mapIds(f_id, c_id); - } - } else { - for (auto entry : c2f_map) { - auto c_id = entry.first; - auto f_id = entry.second; - // Map the id's together - mapIds(f_id, c_id); + // Map the entire replay map between the multiple + // consumers even for the Parallel map as they share the same + // loop. + for (auto entry : c2f_map) { + auto c_id = entry.first; + auto f_id = entry.second; + // Map the id's together + permissive_nodes_.mapEntries(f_id, c_id); + exact_nodes_.mapEntries(f_id, c_id); + if (idIsALeafDomain(f_id, first_output_tv)) { + loop_nodes_.mapEntries(f_id, c_id); } + sibling_sets_.mapEntries(f_id, c_id); } } @@ -322,24 +141,9 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { // consumer/producer as their thread mappings could change as long as // it's across shared/global memory. auto pairwise_map = PairwiseRootDomainMap(p_tv, c_tv); - auto c2p_root_map = + const auto& permissive_c2p_root_map = pairwise_map.mapConsumerToProducer(c_tv->domain(), p_tv->domain()); - // For index map do not map any broadcast dimensions to non-broadcast - // dimensions - if (mapping_mode_ == MappingMode::INDEX) { - // Prevent any broadcasted axes being mapped to non-broadcasted axes. - for (auto it = c2p_root_map.begin(); it != c2p_root_map.end();) { - auto c_id = it->first; - auto p_id = it->second; - if (p_id->isBroadcast() != c_id->isBroadcast()) { - it = c2p_root_map.erase(it); - } else { - ++it; - } - } - } - // Look for matching ID transformations in producer and consumer, replay // producer as consumer. We want to replay producer as consumer instead // of the other way around since consumer may have some broadcasted axes @@ -348,304 +152,354 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { // mapping. If we're using this map for indexing, we do not want to // propagate broadcast mismatches. If we're using it to identify loop // nests, we do want to propagate mismatches. - auto replay_PasC = mapping_mode_ == MappingMode::LOOP || - mapping_mode_ == MappingMode::PARALLEL - ? 
BestEffortReplay::replayPasC(p_tv, c_tv, -1, pairwise_map) - : BestEffortReplay( - p_tv->domain()->domain(), - c_tv->domain()->domain(), - c2p_root_map); - - auto c2p_map = replay_PasC.getReplay(); - - // If we're creating parallel map, only map the leaf - // axes. Also, the producer axis must be left of the CA - // point. - // Otherwise, map the entire replay map. - if (mapping_mode_ == MappingMode::PARALLEL) { - // Mark axes left of compute at point for parallel type tracking - std::unordered_set producer_axes_to_map( - p_tv->domain()->domain().begin(), - p_tv->domain()->domain().begin() + p_tv->getComputeAtPosition()); - - for (auto c_id : c_tv->domain()->domain()) { - auto it = c2p_map.find(c_id); - if (it == c2p_map.end()) { - continue; - } - auto p_id = it->second; - if (producer_axes_to_map.find(p_id) == producer_axes_to_map.end()) { - continue; - } - mapIds(p_id, c_id); - } - } else { - for (auto entry : c2p_map) { - auto c_id = entry.first; - auto p_id = entry.second; - // Map the id's together - mapIds(p_id, c_id); - } + auto permissive_replay_PasC = + BestEffortReplay::replayPasC(p_tv, c_tv, -1, pairwise_map); + + const auto& permissive_c2p_map = permissive_replay_PasC.getReplay(); + + // For exact mapings do not map any broadcast dimensions to + // non-broadcast dimensions. Prevent any broadcasted axes being mapped + // to non-broadcasted axes. + auto exact_c2p_root_map = + PairwiseRootDomainMap(p_tv, c_tv, true) + .mapConsumerToProducer(c_tv->domain(), p_tv->domain()); + + // Same as permissive above but for exact + auto exact_replay_PasC = BestEffortReplay( + p_tv->domain()->domain(), + c_tv->domain()->domain(), + exact_c2p_root_map); + + const auto& exact_c2p_map = exact_replay_PasC.getReplay(); - // Make sure we always get root mapping for the loop map. Because of - // forwarding we could otherwise miss some root mappings. - if (mapping_mode_ == MappingMode::LOOP) { - for (auto entry : c2p_root_map) { - auto c_id = entry.first; - auto p_id = entry.second; - // Map the id's together - mapIds(p_id, c_id); - } + for (auto entry : exact_c2p_map) { + auto c_id = entry.first; + auto p_id = entry.second; + exact_nodes_.mapEntries(c_id, p_id); + consumers_.at(p_id).pushBack(c_id); + producers_.at(c_id).pushBack(p_id); + } + + for (auto entry : permissive_c2p_map) { + auto c_id = entry.first; + auto p_id = entry.second; + if (idIsAComputeAtLeafDomain(p_id, p_tv)) { + loop_nodes_.mapEntries(c_id, p_id); } + permissive_nodes_.mapEntries(c_id, p_id); + consumers_.at(p_id).pushBack(c_id); + producers_.at(c_id).pushBack(p_id); + } + + // Make sure we always get root mapping for the permissive map. Because + // of forwarding we could otherwise miss some root mappings. + for (auto entry : permissive_c2p_root_map) { + auto c_id = entry.first; + auto p_id = entry.second; + // Map the id's together + permissive_nodes_.mapEntries(c_id, p_id); + consumers_.at(p_id).pushBack(c_id); + producers_.at(c_id).pushBack(p_id); } } } } +} - // deduplicate iter domain entries in each set - for (const auto& iter_set : disjoint_iter_sets_) { - *iter_set = deduplicateDeque(*iter_set); +void IterDomainGraph::initializeId( + IterDomain* id, + bool is_view_rfactor_id, + bool is_leaf_id) { + permissive_nodes_.initializeSet(id); + exact_nodes_.initializeSet(id); + if (is_leaf_id) { + loop_nodes_.initializeSet(id); } + consumers_[id] = {}; + producers_[id] = {}; + sibling_sets_.initializeSet(id); - // For each IterDomain set we will track how many concrete root domains were - // used to generate the IterDomain. 
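The comment above describes the heuristic that both the removed map and the new computeConcreteId below rely on: within a disjoint set, the concrete ID is the candidate covering the most non-broadcast root domains, with the broadcast-root count only used to break ties. A minimal standalone sketch of just that selection rule; CandidateId and pickConcrete are hypothetical names for illustration, not part of this codebase:

```cpp
// Simplified stand-in for the concrete-ID selection rule: each candidate is
// summarized by how many iteration roots and broadcast roots it covers; the
// winner maximizes the iteration-root count, ties broken by broadcast count.
#include <cassert>
#include <vector>

struct CandidateId {
  int iter_root_count;   // non-broadcast root domains reachable from this ID
  int bcast_root_count;  // broadcast (or trivially reduced) root domains
};

// Returns the index of the candidate that should act as the concrete ID.
int pickConcrete(const std::vector<CandidateId>& candidates) {
  assert(!candidates.empty());
  int best = 0;
  for (int i = 1; i < (int)candidates.size(); ++i) {
    const auto& c = candidates[i];
    const auto& b = candidates[best];
    if (c.iter_root_count > b.iter_root_count ||
        (c.iter_root_count == b.iter_root_count &&
         c.bcast_root_count > b.bcast_root_count)) {
      best = i;
    }
  }
  return best;
}

int main() {
  // {i0*i1} covers two iteration roots and beats {i0*b1}, which covers one
  // iteration root plus one broadcast root.
  std::vector<CandidateId> candidates = {{1, 1}, {2, 0}};
  assert(pickConcrete(candidates) == 1);
  return 0;
}
```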
Used to populate conrete_id_map. Concrete - // ID has maximum of concrete ids, ties are decided based on n_broadcast_ids. - // Refer to AdvancedLowering5 for why we need to split ties with broadcast - // dims. - std::unordered_map n_concrete_ids_; - std::unordered_map n_broadcast_ids_; - - for (auto c_tv : consumer_tvs) { - auto counts = - InputDomainCounter::produceCounts(c_tv->domain()->domain(), gpu_lower); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_concrete_ids_, n_concrete_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.first); - }); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_broadcast_ids_, n_broadcast_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.second); - }); - } + all_ids_.pushBack(id); - for (auto inp_tv : ir_utils::filterByType(fusion->inputs())) { - auto counts = InputDomainCounter::produceCounts( - inp_tv->domain()->domain(), gpu_lower); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_concrete_ids_, n_concrete_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.first); - }); - std::transform( - counts.begin(), - counts.end(), - std::inserter(n_broadcast_ids_, n_broadcast_ids_.end()), - [](auto counts_entry) { - return std::make_pair(counts_entry.first, counts_entry.second.second); - }); + if (is_view_rfactor_id) { + view_rfactor_ids_.emplace(id); } +} - // Populate concrete id map - for (const auto& set : disjoint_iter_sets_) { - int max_concrete_count = -1; - int max_broadcast_count = -1; - IterDomain* concrete_id = nullptr; - for (auto id : *set) { - int concrete_count = n_concrete_ids_.at(id); - if (concrete_count >= max_concrete_count) { - int broadcast_count = n_broadcast_ids_.at(id); - if (concrete_count > max_concrete_count || - broadcast_count > max_broadcast_count) { - max_concrete_count = concrete_count; - max_broadcast_count = broadcast_count; - concrete_id = id; - } - } +ComputeAtMap::ComputeAtMap(Fusion* fusion) : id_graph_(fusion) { + build(fusion); +} + +void ComputeAtMap::build(Fusion* fusion) { + trivial_reduction_info_.build(fusion); + buildConcreteIds(); +} + +void ComputeAtMap::validateAndPropagatePType() { + for (const auto& loop_disjoint_set : id_graph_.loopNodes().disjointSets()) { + ParallelType common_ptype = ParallelType::Serial; + for (auto id : loop_disjoint_set->vector()) { + auto id_ptype = id->getParallelType(); + TORCH_INTERNAL_ASSERT( + id_ptype == common_ptype || id_ptype == ParallelType::Serial || + common_ptype == ParallelType::Serial, + "Issue validating parallel type disjoint ptype is, ", + common_ptype, + " but found in the set the id: ", + id->toString()); + common_ptype = + common_ptype == ParallelType::Serial ? 
id_ptype : common_ptype; } - TORCH_INTERNAL_ASSERT( - concrete_id != nullptr, "Could not concretize an IterDomain set."); - - for (auto id : *set) { - concrete_id_map_[id] = concrete_id; - if (mapping_mode_ == MappingMode::PARALLEL) { - auto parallel_map_it = parallel_type_map_.find(set); - // Parallelize all IterDomains to simplify lowering and codegen - if (parallel_map_it != parallel_type_map_.end()) { - // Don't propogate vectorize like other parallel types - if (parallel_map_it->second != ParallelType::Vectorize) { - id->parallelize(parallel_map_it->second); - } - } - } + for (auto id : loop_disjoint_set->vector()) { + id->parallelize(common_ptype); } } +} - if (gpu_lower != nullptr) { - convertToKir(fusion, gpu_lower); - } +bool ComputeAtMap::areMapped( + IterDomain* id0, + IterDomain* id1, + IdMappingMode mode) const { + return disjointSetOf(id0, mode)->has(id1); } -void ComputeAtMap::convertToKir(Fusion* fusion, GpuLower* gpu_lower) { - TORCH_INTERNAL_ASSERT(fusion != nullptr); - TORCH_INTERNAL_ASSERT(gpu_lower != nullptr); - - has_lowered_kir_ = true; - - std::unordered_map< - std::shared_ptr>, - std::shared_ptr>> - disjoint_set_2_kir; - - for (const auto& disjoint_iter_set : disjoint_iter_set_maps_) { - auto fusion_set = disjoint_iter_set.second; - auto kir_set_it = disjoint_set_2_kir.find(fusion_set); - std::shared_ptr> kir_set; - if (kir_set_it == disjoint_set_2_kir.end()) { - kir_set = std::make_shared>(); - std::transform( - fusion_set->begin(), - fusion_set->end(), - std::inserter(*kir_set, kir_set->begin()), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); - disjoint_set_2_kir.emplace(std::make_pair(fusion_set, kir_set)); - } else { - kir_set = kir_set_it->second; - } - kir_disjoint_iter_set_maps_.emplace(std::make_pair( - gpu_lower->lowerValue(disjoint_iter_set.first)->as(), - kir_set)); +IterDomain* ComputeAtMap::computeConcreteId( + IterDomain* id, + IdMappingMode mode) { + const auto& disjoint_set_shared_ptr = disjointSetOf(id, mode); + + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Empty disjoint set found for ", + id->toString()); + + if (disjoint_set_shared_ptr->vector().size() == 1) { + return disjoint_set_shared_ptr->vector().front(); } - for (auto entry : concrete_id_map_) { - kir_concrete_id_map_.emplace(std::make_pair( - gpu_lower->lowerValue(entry.first)->as(), - gpu_lower->lowerValue(entry.second)->as())); + VectorOfUniqueEntries maybe_concrete_ids; + for (auto id : disjoint_set_shared_ptr->vector()) { + bool id_output = true; + for (auto consumer_id : id_graph_.consumers().at(id).vector()) { + if (disjoint_set_shared_ptr->has(consumer_id)) { + id_output = false; + break; + } + } + if (id_output) { + maybe_concrete_ids.pushBack(id); + } } - for (const auto& entry : disjoint_iter_set_maps_) { - kir_2_fusion_[gpu_lower->lowerValue(entry.first)->as()] = - entry.first; + TORCH_INTERNAL_ASSERT( + maybe_concrete_ids.vector().size(), + "No potential concrete_id's found for ", + id->toString()); + + if (maybe_concrete_ids.vector().size() == 1) { + return maybe_concrete_ids.vector().front(); } - // Make sure we have all IterDomains that could be used to generate a ForLoop - for (auto expr : fusion->exprs()) { - if (!expr->outputs()[0]->isA()) { - continue; - } + IterDomain* concrete_id = nullptr; + int max_iter_root_count = 0; + int max_bcast_root_count = 0; + + for (auto maybe_concrete_id : maybe_concrete_ids.vector()) { + std::unordered_set root_ids; + std::deque to_visit; + + 
to_visit.push_back(maybe_concrete_id); + while (to_visit.size()) { + auto current_id = to_visit.front(); + to_visit.pop_front(); + if (isViewRfactor(current_id)) { + root_ids.emplace(current_id); + continue; + } - auto tv_outputs = ir_utils::filterByType(expr->outputs()); + // push back producer IterDomains or add root if they don't exist + auto producer_vals = ir_utils::producerValsOf(current_id); + auto producer_ids = ir_utils::filterByType(producer_vals); - for (auto out : tv_outputs) { - for (auto entry : out->domain()->domain()) { - kir_2_fusion_[gpu_lower->lowerValue(entry)->as()] = - entry; + if (producer_ids.empty()) { + root_ids.emplace(current_id); + } else { + to_visit.insert( + to_visit.end(), producer_ids.begin(), producer_ids.end()); } } - } -} -bool ComputeAtMap::areMapped(IterDomain* id0, IterDomain* id1) const { - if (id0 == id1) { - return true; - } - auto set0_it = disjoint_iter_set_maps_.find(id0); - auto set1_it = disjoint_iter_set_maps_.find(id1); - if (set0_it == disjoint_iter_set_maps_.end() || - set1_it == disjoint_iter_set_maps_.end()) { - return false; - } - return (set0_it->second.get() == set1_it->second.get()); + int bcast_root_count = std::count_if( + root_ids.begin(), root_ids.end(), [&](IterDomain* root_id) { + return root_id->isBroadcast() + // TODO: This shouldn't have a negative impact, but (emperically) + // might not be necessary + || trivial_reduction_info_.isDerived(root_id); + }); + int iter_root_count = (int)root_ids.size() - bcast_root_count; + if (iter_root_count > max_iter_root_count || + (iter_root_count == max_iter_root_count && + bcast_root_count > max_bcast_root_count)) { + max_iter_root_count = iter_root_count; + max_bcast_root_count = bcast_root_count; + concrete_id = maybe_concrete_id; + } + } // end maybe_concrete_id + TORCH_INTERNAL_ASSERT( + concrete_id != nullptr, + "Something went wrong, could not find a concrete id."); + + return concrete_id; } -bool ComputeAtMap::areMapped(kir::IterDomain* id0, kir::IterDomain* id1) const { - assertLowered(has_lowered_kir_); - if (id0 == id1) { - return true; - } - auto set0_it = kir_disjoint_iter_set_maps_.find(id0); - auto set1_it = kir_disjoint_iter_set_maps_.find(id1); - if (set0_it == kir_disjoint_iter_set_maps_.end() || - set1_it == kir_disjoint_iter_set_maps_.end()) { - return false; +void ComputeAtMap::buildConcreteIds() { + for (const auto& disjoint_set_shared_ptr : + id_graph_.permissiveNodes().disjointSets()) { + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Cannot compute concrete id of empty set."); + auto first_id = disjoint_set_shared_ptr->vector().front(); + auto concrete_id = computeConcreteId(first_id, IdMappingMode::PERMISSIVE); + concrete_id_cache_[disjoint_set_shared_ptr] = concrete_id; } - return (set0_it->second.get() == set1_it->second.get()); -} -IterDomain* ComputeAtMap::getConcreteMappedID(IterDomain* id) const { - auto it = concrete_id_map_.find(id); - if (it != concrete_id_map_.end()) { - return it->second; + for (const auto& disjoint_set_shared_ptr : + id_graph_.exactNodes().disjointSets()) { + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Cannot compute concrete id of empty set."); + auto first_id = disjoint_set_shared_ptr->vector().front(); + auto concrete_id = computeConcreteId(first_id, IdMappingMode::EXACT); + concrete_id_cache_[disjoint_set_shared_ptr] = concrete_id; } - return id; -} -kir::IterDomain* ComputeAtMap::getConcreteMappedID(kir::IterDomain* id) const { - assertLowered(has_lowered_kir_); - auto it = 
kir_concrete_id_map_.find(id); - if (it != kir_concrete_id_map_.end()) { - return it->second; + for (const auto& disjoint_set_shared_ptr : + id_graph_.loopNodes().disjointSets()) { + TORCH_INTERNAL_ASSERT( + disjoint_set_shared_ptr->vector().size(), + "Cannot compute concrete id of empty set."); + auto first_id = disjoint_set_shared_ptr->vector().front(); + auto concrete_id = computeConcreteId(first_id, IdMappingMode::LOOP); + concrete_id_cache_[disjoint_set_shared_ptr] = concrete_id; } - return id; } -IterDomain* ComputeAtMap::toFusion(kir::IterDomain* kir) const { - assertLowered(has_lowered_kir_); - auto kir_2_fusion_it = kir_2_fusion_.find(kir); +IterDomain* ComputeAtMap::getConcreteMappedID( + IterDomain* id, + IdMappingMode mode) const { + auto disjoint_set_shared_ptr = disjointSetOf(id, mode); + TORCH_INTERNAL_ASSERT( - kir_2_fusion_it != kir_2_fusion_.end(), - "Kernel ir is not guarneteed to be reversible into fusion ir, could not find fusion entry. ", - kir::toString(kir, false)); - return kir_2_fusion_it->second; -} + disjoint_set_shared_ptr->vector().size() > 0, + "Empty disjoint set found for ", + id->toString()); -std::string ComputeAtMap::toString() const { - std::stringstream ss; + auto cache_it = concrete_id_cache_.find(disjoint_set_shared_ptr); - // We may not have cleaned up non active sets as this is intended for debug, - // so first grab unique entries and iterate over them. - std::unordered_set>> disjoint_sets; + TORCH_INTERNAL_ASSERT( + cache_it != concrete_id_cache_.end(), + "Could not find concrete id for: ", + id->toString(), + " with mode ", + mode); - for (const auto& entry : disjoint_iter_set_maps_) { - disjoint_sets.emplace(entry.second); - } + return cache_it->second; +} - for (const auto& disjoint_set : disjoint_sets) { - ss << " disjoint_set{ "; - TORCH_INTERNAL_ASSERT(disjoint_set->size() > 0); - auto concrete_id = concrete_id_map_.at(disjoint_set->front()); - for (auto it = disjoint_set->begin(); it != disjoint_set->end(); it++) { - if (it != disjoint_set->begin()) { - ss << ", "; - } - ss << (*it); - if (*it == concrete_id) { +namespace { + +std::string idGraphNodesToString( + const ComputeAtMap& ca_map, + IdMappingMode mode) { + std::stringstream ss; + const auto& disjoint_sets = ca_map.getIdSets(mode); + for (const auto& s_ptr : disjoint_sets.disjointSets()) { + const auto& set = *s_ptr; + IterDomain* concrete_id = nullptr; + if (!set.empty()) { + auto id = set.front(); + concrete_id = ca_map.getConcreteMappedID(id, mode); + } + ss << " {"; + for (auto entry : set.vector()) { + ss << abstractToString(entry); + if (entry == concrete_id) { ss << "*"; } - } - ss << " }"; - if (mapping_mode_ == MappingMode::PARALLEL) { - if (parallel_type_map_.find(disjoint_set) != parallel_type_map_.end()) { - ss << " -> " << parallel_type_map_.at(disjoint_set); - } else { - ss << " -> " << ParallelType::Serial; + if (entry != set.back()) { + ss << "; "; } } - ss << "\n"; + ss << " }\n"; } return ss.str(); } +} // namespace + +std::string ComputeAtMap::toString() const { + std::stringstream ss; + ss << "Compute at map { \n"; + ss << "Permissive map:\n" + << idGraphNodesToString(*this, IdMappingMode::PERMISSIVE); + ss << "Exact map:\n" << idGraphNodesToString(*this, IdMappingMode::EXACT); + ss << "Loop map:\n" << idGraphNodesToString(*this, IdMappingMode::LOOP); + ss << "Consumer maps:\n"; + for (auto entry : id_graph_.consumers()) { + ss << " " << entry.first->toString() << " :: " << entry.second.toString() + << "\n"; + } + + ss << "Producer maps:\n"; + for (auto entry : 
id_graph_.producers()) { + ss << " " << entry.first->toString() << " :: " << entry.second.toString() + << "\n"; + } + + ss << "Sibling map:\n" << id_graph_.siblings().toString() << "\n"; + + ss << "} compute at map" << std::endl; + return ss.str(); +} + +bool ComputeAtMap::isViewRfactor(IterDomain* ref_id) const { + return id_graph_.viewRfactorIds().find(ref_id) != + id_graph_.viewRfactorIds().end(); +} + +std::vector ComputeAtMap::getViewRfactorDomainsOfIdGroup( + IterDomain* ref_id, + IdMappingMode mode) const { + auto disjoint_set = disjointSetOf(ref_id, mode); + std::vector rfactor_ids; + for (auto disjoint_id : disjoint_set->vector()) { + if (id_graph_.viewRfactorIds().find(disjoint_id) != + id_graph_.viewRfactorIds().end()) { + rfactor_ids.push_back(disjoint_id); + } + } + return rfactor_ids; +} + +const std::shared_ptr>& ComputeAtMap:: + disjointSetOf(IterDomain* id, IdMappingMode mode) const { + return getIdSets(mode).disjointSetMap().at(id); +} + +const DisjointSets& ComputeAtMap::getIdSets( + IdMappingMode mode) const { + switch (mode) { + case IdMappingMode::PERMISSIVE: + return id_graph_.permissiveNodes(); + case IdMappingMode::EXACT: + return id_graph_.exactNodes(); + case IdMappingMode::LOOP: + return id_graph_.loopNodes(); + } + TORCH_INTERNAL_ASSERT(false, "Error with mapping mode provided."); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.h b/torch/csrc/jit/codegen/cuda/compute_at_map.h index b2b70f8997d4..54bb7537a3f1 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at_map.h +++ b/torch/csrc/jit/codegen/cuda/compute_at_map.h @@ -1,7 +1,9 @@ #pragma once +#include #include #include +#include #include #include @@ -11,126 +13,171 @@ namespace jit { namespace fuser { namespace cuda { -class GpuLower; - -class TORCH_CUDA_CU_API ComputeAtMap { +// There's three modes of these iter domain mappings all uniquely important in +// the lowering process. +// +// For EXACT/PERMISSIVE mode consider: +// +// consumer[i0, b1] = producer[i0] +// consumer->merge(0) (consumer will now be [i0 * b1]) +// When producer is replayed as consumer (the direction we use for mapping) +// with BestEffortReplay forward_bcast_mismatch = True the producer to +// consumer map will have both a mapping of consumer(i0) to producer(i0) as +// well as consumer(i0*b1) to producer(i0). This latter mapping is important +// for loop nest mappings as the consumer will generate a loop based on i0*b1 +// and the producer may be computeAt inside this loop nest. However, for +// indexing we do not want these two maps as producer may be indexed as i0*i1 +// depending on the loop nest structure and how it was built. Therefore we +// really need to carry (at least) two sets of maps around for lowering. +// +// LOOP mode is important if we have something like: +// consumer[i0o, threadIdx.x{i0i}] = producer[i0o, threadIdx.y{i0i}](computeAt +// = 1) which can easily happen when using shared memory. We want to make sure +// that the iteration domain used for loop construction (concreteId) has the +// proper parallelization strategy. In parallel mode we do typical iteration +// domain mapping, however we remove from it any iteration domains outside the +// computeAt of producer when mapping. This guarentees we won't map +// IterDomains that could have different parallelization strategies. We also +// propagate the parallel strategy in parallel mode so all mapped IDs that +// must have the same parallel type, do. 
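A sketch of how the consumer[i0, b1] = producer[i0] example above could be exercised against the new ComputeAtMap interface. FusionGuard, makeSymbolicTensor, and broadcast are the usual nvFuser test helpers and are assumed here rather than taken from this diff; the expected results in the comments follow the EXACT/PERMISSIVE description above rather than a verified run:

```cpp
// Sketch only: assumes the usual nvFuser test helpers and the
// torch::jit::fuser::cuda namespace; results are illustrative.
Fusion fusion;
FusionGuard fg(&fusion);

auto producer = makeSymbolicTensor(1);               // producer[i0]
fusion.addInput(producer);
auto consumer = broadcast(producer, {false, true});  // consumer[i0, b1]
fusion.addOutput(consumer);

consumer->merge(0);  // consumer is now [i0 * b1]

ComputeAtMap ca_map(&fusion);

auto* p_i0 = producer->getRootDomain()[0];
auto* c_merged = consumer->axis(0);

// Broadcasts are forwarded in the permissive map, so consumer(i0*b1) maps to
// producer(i0) there, but never in the exact map, which refuses to map
// broadcast to non-broadcast dimensions.
bool permissive =
    ca_map.areMapped(c_merged, p_i0, IdMappingMode::PERMISSIVE);  // true
bool exact =
    ca_map.areMapped(c_merged, p_i0, IdMappingMode::EXACT);       // false
```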
+// +// IdMappingMode::LOOP +// Only maps leaf axes to left of compute at +// Forward broadcast axes in replay +// IdMappingMode::PERMISSIVE +// Forward broadcast axes in replay +// Map all iteration domains +// Always contain root mappings (otherwise they could have been forwarded in +// broadcast) +// IdMappingMode::EXACT +// Don't map any broadcast axes to non-broadcast axes +// Do not forward through any broadcast IDs +class TORCH_CUDA_CU_API IterDomainGraph { public: - // There's three modes of these iter domain mappings. For indexing, for loop - // nest mapping/generation, and to figure out the parallelization strategy. - // - // For index/loop mode consider: - // - // consumer[i0, b1] = producer[i0] - // consumer->merge(0) (consumer will now be [i0 * b1]) - // When producer is replayed as consumer (the direction we use for mapping) - // with BestEffortReplay forward_bcast_mismatch = True the producer to - // consumer map will have both a mapping of consumer(i0) to producer(i0) as - // well as consumer(i0*b1) to producer(i0). This latter mapping is important - // for loop nest mappings as the consumer will generate a loop based on i0*b1 - // and the producer may be computeAt inside this loop nest. However, for - // indexing we do not want these two maps as producer may be indexed as i0*i1 - // depending on the loop nest structure and how it was built. Therefore we - // really need to carry two sets of maps around for lowering. - // - // Parallel mode is important if we have something like: - // consumer[i0o, threadIdx.x{i0i}] = producer[i0o, threadIdx.y{i0i}](computeAt - // = 1) which can easily happen when using shared memory. We want to make sure - // that the iteration domain used for loop construction (concreteId) has the - // proper parallelization strategy. In parallel mode we do typical iteration - // domain mapping, however we remove from it any iteration domains outside the - // computeAt of producer when mapping. This guarentees we won't map - // IterDomains that could have different parallelization strategies. We also - // propagate the parallel strategy in parallel mode so all mapped IDs that - // must have the same parallel type, do. - // - // MappingMode::PARALLEL - // Only maps leaf axes to left of compute at - // Forward broadcast axes in replay - // MappingMode::LOOP - // Forward broadcast axes in replay - // Map all iteration domains - // Always contain root mappings (otherwise they could have been forwarded in - // broadcast) - // MappingMode::INDEX - // Don't map any broadcast axes to non-broadcast axes - // Do not forward through any broadcast IDs - enum class MappingMode { PARALLEL, LOOP, INDEX }; - - ComputeAtMap() = default; - ComputeAtMap(MappingMode mapping_mode) : mapping_mode_(mapping_mode) {} - - //! Builds all valid mappings. When gpu_lower is not nullptr, - //! equivalent mappings for KIR are also created. - void build(Fusion* fusion, GpuLower* gpu_lower = nullptr); - - //! Returns if id0 and id1 are mapped to eachother, meaning they represent the - //! 
same loop nest in the lowered code - bool areMapped(IterDomain* id0, IterDomain* id1) const; - - bool areMapped(kir::IterDomain* id0, kir::IterDomain* id1) const; + IterDomainGraph(Fusion* fusion); + + const DisjointSets& permissiveNodes() const { + return permissive_nodes_; + } + const DisjointSets& exactNodes() const { + return exact_nodes_; + } + const DisjointSets& loopNodes() const { + return loop_nodes_; + } + + // Consumers and producers is not symmetric like the other sets + const std::unordered_map>& + consumers() const { + return consumers_; + } + const std::unordered_map>& + producers() const { + return producers_; + } + + const DisjointSets& siblings() const { + return sibling_sets_; + } + + const VectorOfUniqueEntries& allIds() const { + return all_ids_; + } + + const std::unordered_set& viewRfactorIds() const { + return view_rfactor_ids_; + } - //! Returns an iter domain that is the maximum expanded size of all iter - //! domains the one provided maps to. Useful for opening loops to the correct - //! iteration size. Not guarenteed to return the same ID every call, but is - //! guarenteed to return iter domains in the same disjoint set. - IterDomain* getConcreteMappedID(IterDomain* id) const; + private: + void build(Fusion* fusion); - kir::IterDomain* getConcreteMappedID(kir::IterDomain* id) const; + void initializeId(IterDomain* id, bool is_view_rfactor_id, bool is_leaf_id); - // TODO: Would be great if we didn't need this, but we have nice functionality - // in iter_visitor that isn't moved over. Use of this is limited to indexing - // and this should definitely be removed by building out kernel ir to have - // better parity with fusion ir. - IterDomain* toFusion(kir::IterDomain* kir) const; + DisjointSets permissive_nodes_; + DisjointSets exact_nodes_; + DisjointSets loop_nodes_; - // Prints mapping information via Fusion IR - std::string toString() const; + // Consumers and producers is not symmetric like the other sets + std::unordered_map> + consumers_; + std::unordered_map> + producers_; - private: - bool has_lowered_kir_ = false; + DisjointSets sibling_sets_; - void mapIds(IterDomain* id0, IterDomain* id1); + VectorOfUniqueEntries all_ids_; - //! Convert everything to lowered structures (kernel ir), as we will use - //! this class frequently during lowering. - void convertToKir(Fusion* fusion, GpuLower* gpu_lower); + std::unordered_set view_rfactor_ids_; +}; - private: - MappingMode mapping_mode_ = MappingMode::LOOP; +class TrivialReductionInfo; - // This is actually only used when mapping mode == LOOP. Only used in expr - // sorting, it's actually maximum position where a loop is shared across any - // neighbor. - std::unordered_map produce_at_map_; +class TORCH_CUDA_CU_API ComputeAtMap { + public: + ComputeAtMap() = delete; + ComputeAtMap(Fusion* fusion); - // Disjoint sets of iter domains, only defined if iter domain is within - // compute at of a tensor view. Maps these iter domains to a set containing - // all other iter domains in the fusion that map to the same loop nest. - std::unordered_map>> - disjoint_iter_set_maps_; + //! Run through disjoint sets in the LOOP map, make sure there's only one + //! non-serial parallel type in each disjoint set, set the parallel type of + //! all IterDomains in the disjoint set to that PType. + void validateAndPropagatePType(); - std::unordered_map< - kir::IterDomain*, - std::shared_ptr>> - kir_disjoint_iter_set_maps_; + //! 
Returns if id0 and id1 are mapped to eachother with provided IdMappingMode + bool areMapped(IterDomain* id0, IterDomain* id1, IdMappingMode mode) const; - // Keep a list of disjoint_iter_sets that's deterministic to iterate over - std::deque>> disjoint_iter_sets_; + //! Returns an iter domain that is the maximum expanded size of all iter + //! domains the one provided maps to. Useful for opening loops to the correct + //! iteration size. Not guarenteed to return the same ID every call, but is + //! guarenteed to return iter domains in the same disjoint set. + IterDomain* getConcreteMappedID(IterDomain* id, IdMappingMode mode) const; - // Tracks if there's a parallel iter domain associated a disjoint iter domain - // set - std::unordered_map>, ParallelType> - parallel_type_map_; + // Prints mapping information, forwards to an internal IterDomainGraph + std::string toString() const; + + // Returns if the provided ID is a view like rfactor id + bool isViewRfactor(IterDomain* ref_id) const; + + // Returns all rfactor domains in rfactor_concrete_count_reset_domains_ that + // are in the disjoint set of the provided IterDomain. This will be every view + // like rfactor ID the provided ID "depends" on in the map. + std::vector getViewRfactorDomainsOfIdGroup( + IterDomain* ref_id, + IdMappingMode mode) const; - // For each IterDomain set we will track how many concrete root domains were - // used to generate the IterDomain - std::unordered_map concrete_id_map_; + const IterDomainGraph& idGraph() const { + return id_graph_; + } - std::unordered_map kir_concrete_id_map_; + //! Get the ID sets for a provided IdMappingMode + const DisjointSets& getIdSets(IdMappingMode mode) const; - // Map kir::IterDomain* back to the fusion IR IterDomain*. - // TODO: Would be great if we didn't need this. - std::unordered_map kir_2_fusion_; + private: + // Build id_graph_ + void build(Fusion* fusion); + + // Build concrete_id_cache_ + // Build a single entry in concrete_cache_id_ + IterDomain* computeConcreteId(IterDomain* id, IdMappingMode mode); + void buildConcreteIds(); + + // Produce the disjoint set containing provided id with mapping mode. + const std::shared_ptr>& disjointSetOf( + IterDomain* id, + IdMappingMode mode) const; + + // Should be built once and never modified again. + const IterDomainGraph id_graph_; + TrivialReductionInfo trivial_reduction_info_; + + // Prevent needing to recompute concrete_id's in compute at map. + // VectorOfUniqueEntries is unique across mapping modes, so don't need to use + // mapping mode directly in this cache. const + // VectorOfUniqueEntries& is what's returned by + // ComputeAtMap::disjointSetOf which can be used directly. 
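A small standalone illustration of why the cache described above does not need the mapping mode in its key: an unordered_map over a shared_ptr key hashes and compares the pointer itself, so sets owned by different maps can never collide even if their contents happen to be equal. IdSet is a hypothetical stand-in for the VectorOfUniqueEntries used in the real cache:

```cpp
// Demonstrates that std::shared_ptr keys compare by pointer identity, not by
// the pointee's contents, which is what makes the per-set cache mode-agnostic.
#include <cassert>
#include <memory>
#include <unordered_map>
#include <vector>

int main() {
  using IdSet = std::vector<int>;  // stand-in for a disjoint set of IDs
  auto exact_set = std::make_shared<IdSet>(IdSet{1, 2});
  auto loop_set = std::make_shared<IdSet>(IdSet{1, 2});  // same contents

  std::unordered_map<std::shared_ptr<IdSet>, int> concrete_cache;
  concrete_cache[exact_set] = 10;
  concrete_cache[loop_set] = 20;

  // Two distinct entries: keys hash and compare by pointer identity.
  assert(concrete_cache.size() == 2);
  assert(concrete_cache.at(exact_set) == 10);
  return 0;
}
```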
+ std::unordered_map< + std::shared_ptr>, + IterDomain*> + concrete_id_cache_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/contiguity.cpp b/torch/csrc/jit/codegen/cuda/contiguity.cpp new file mode 100644 index 000000000000..dbcc160bb8c6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/contiguity.cpp @@ -0,0 +1,207 @@ +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +ContigIDs::ContigIDs( + const std::vector& ids, + const std::vector& root_domain, + const std::vector& root_contiguity, + std::unordered_map concrete_to_ref, + std::unordered_map p2c_id_map, + bool ignore_halo_constraint, + bool ignore_indexability) + : root_domain_(root_domain), + root_contiguity_(root_contiguity), + concrete_to_ref_(std::move(concrete_to_ref)), + p2c_id_map_(std::move(p2c_id_map)), + ignore_indexability_(ignore_indexability) { + if (ids.empty()) { + return; + } + + TORCH_INTERNAL_ASSERT( + root_domain_.size() == root_contiguity_.size(), + "Arguments don't match ", + root_domain_.size(), + " != ", + root_contiguity_.size()); + + // GpuLower is required to honor halo constraints + if (!ignore_halo_constraint) { + TORCH_INTERNAL_ASSERT(GpuLower::hasCurrent(), "GpuLower not found"); + } + + for (const auto i : c10::irange(root_domain_.size())) { + auto root_domain_i = root_domain_[i]->as(); + root_to_indexed_id_[root_domain_i] = root_domain_i; + // Initialize to false + is_contig_root_[root_domain_i] = false; + // If a root domain has halo, can't use merged domain even if + // both inputs are contiguous. HaloInfo is also initialized for + // rfactor root domains, which should just return "zero" + // RootAxisInfo. This should be safe as no rfactor tensor should + // need halo. + if (root_contiguity_[i] && + (ignore_halo_constraint || + !GpuLower::current() + ->haloInfo() + .getRootAxisInfo(root_domain_i) + .hasHalo())) { + contig_ids_.emplace(root_domain_i); + is_contig_root_[root_domain_i] = true; + within_contig_ids_[root_domain_i] = std::unordered_set(); + } + } + + if (!contig_ids_.empty()) { + auto exprs = StmtSort::getExprs(ids[0]->fusion(), {ids.begin(), ids.end()}); + for (auto expr : exprs) { + handle(expr); + } + } +} + +void ContigIDs::handle(Merge* merge) { + // If either input is non-contiguous so is output. + const auto inner = merge->inner(); + const auto outer = merge->outer(); + const auto out = merge->out(); + + if (!isContig(inner) || !isContig(outer)) { + return; + } + + // Stop contig merging if the merge output is not indexable. + if (!ignore_indexability_ && !isIndexable(out)) { + return; + } + + // Grab inputs, make sure they're in root domain, check if they're + // contiguous. 
+ + auto lhs_inputs = + ir_utils::iterDomainInputsOfOrderedAs({outer}, root_domain_); + auto rhs_inputs = + ir_utils::iterDomainInputsOfOrderedAs({inner}, root_domain_); + + TORCH_INTERNAL_ASSERT( + inRoot(lhs_inputs) && inRoot(rhs_inputs), + "Found an invalid merge operation, inputs of its arguments are not in the root domain."); + + std::deque ordered_inputs(lhs_inputs.begin(), lhs_inputs.end()); + ordered_inputs.insert( + ordered_inputs.end(), rhs_inputs.begin(), rhs_inputs.end()); + + // If any root input is not contig, output is not contig + if (!(std::all_of( + ordered_inputs.begin(), ordered_inputs.end(), [this](IterDomain* id) { + // Allow reduction tensors in contiguity check since we're using + // this to check contiguous vectors of reference tensors in + // schedulers (to set vectorization sizes), those reference tensors + // may have reduction dims, don't bail on contiguity just because + // it's a reduction dimension. + return is_contig_root_.at(id); + }))) { + return; + } + + std::deque root_copy(root_domain_.begin(), root_domain_.end()); + + // Forward to first matching argument + while (!root_copy.empty() && !ordered_inputs.empty()) { + if (root_copy.front() != ordered_inputs.front()) { + root_copy.pop_front(); + } else { + break; + } + } + + // Forward through all matching arguments + while (!root_copy.empty() && !ordered_inputs.empty()) { + if (root_copy.front() == ordered_inputs.front()) { + root_copy.pop_front(); + ordered_inputs.pop_front(); + } else if ( + root_copy.front()->isReduction() || root_copy.front()->isBroadcast()) { + // This was a cause of an error with + // ReductionSchedulerMultiDimNonFastest. The test no longer + // fails. + root_copy.pop_front(); + } else { + break; + } + } + + // If we matched all inputs, the output is contiguous. Only want to keep the + // top contig ID, lower ids should be placed in the "within_contig_ids" map + // of top id. 
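A minimal standalone sketch of the contiguity rule this handler implements: a merge output is indexable as one flat domain only when its root inputs are marked contiguous and are adjacent in the root domain, with broadcast or reduction axes allowed as ignorable gaps between them. RootAxis and mergeIsContiguous are hypothetical illustration-only names; the real ContigIDs additionally tracks merge history, halo, and indexability:

```cpp
// Simplified contiguous-merge check over a flat description of the root
// domain; not the ContigIDs class itself.
#include <cassert>
#include <vector>

struct RootAxis {
  bool contiguous;
  bool broadcast_or_reduction;  // gap axes that don't affect the layout
};

bool mergeIsContiguous(
    const std::vector<RootAxis>& root,
    int outer,
    int inner) {
  // Both merge inputs must themselves be contiguous root axes.
  if (!root[outer].contiguous || !root[inner].contiguous) {
    return false;
  }
  // The outer input must be to the left of the inner input, and every axis
  // between them must be a broadcast or reduction axis.
  if (outer >= inner) {
    return false;
  }
  for (int i = outer + 1; i < inner; ++i) {
    if (!root[i].broadcast_or_reduction) {
      return false;
    }
  }
  return true;
}

int main() {
  // [i0, b1, i2] with i0 and i2 contiguous: merging i0 with i2 is still
  // treated as contiguous because the broadcast b1 is an ignorable gap.
  std::vector<RootAxis> root = {{true, false}, {false, true}, {true, false}};
  assert(mergeIsContiguous(root, 0, 2));
  // A non-contiguous input breaks the chain.
  root[2].contiguous = false;
  assert(!mergeIsContiguous(root, 0, 2));
  return 0;
}
```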
+ if (ordered_inputs.empty()) { + if (contig_ids_.find(inner) != contig_ids_.end()) { + contig_ids_.erase(inner); + } + + if (contig_ids_.find(outer) != contig_ids_.end()) { + contig_ids_.erase(outer); + } + + contig_ids_.emplace(out); + + std::unordered_set within_out; + within_out.emplace(inner); + if (within_contig_ids_.find(inner) != within_contig_ids_.end()) { + auto in_inner = within_contig_ids_.at(inner); + within_out.insert(in_inner.begin(), in_inner.end()); + within_contig_ids_.erase(inner); + } + + within_out.emplace(outer); + if (within_contig_ids_.find(outer) != within_contig_ids_.end()) { + auto in_outer = within_contig_ids_.at(outer); + within_out.insert(in_outer.begin(), in_outer.end()); + within_contig_ids_.erase(outer); + } + + within_contig_ids_[out] = within_out; + + for (auto root : lhs_inputs) { + root_to_indexed_id_[root] = out; + } + for (auto root : rhs_inputs) { + root_to_indexed_id_[root] = out; + } + } +} + +IterDomain* ContigIDs::getMappedId(IterDomain* id) const { + auto it = p2c_id_map_.find(id); + if (it != p2c_id_map_.end()) { + return it->second; + } else { + return id; + } +} + +IterDomain* ContigIDs::getCAIndexConcreteId(IterDomain* id) const { + TORCH_INTERNAL_ASSERT( + GpuLower::current() != nullptr, "GpuLower is not found"); + + auto c_id = GpuLower::current()->caMap()->getConcreteMappedID( + getMappedId(id), IdMappingMode::EXACT); + return c_id; +} + +bool ContigIDs::isIndexable(IterDomain* id) const { + auto c_id = getCAIndexConcreteId(id); + return concrete_to_ref_.find(c_id) != concrete_to_ref_.end(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/contiguity.h b/torch/csrc/jit/codegen/cuda/contiguity.h new file mode 100644 index 000000000000..24f0ffa6c7e5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/contiguity.h @@ -0,0 +1,130 @@ +#pragma once + +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// A merge is contiguous if: +// Inputs of outer are to the left in the root domain of the inputs of RHS. +// All inputs are contiguous in the root domain: +// - All marked as contiguous +// - Only gaps between inputs are broadcast or reductoin dims +// There are no split transformations performed on outer or inner +// All transformations on outer or inner are contiguous merges +// If this criteria holds, then we can index the input root domains of this +// merge with the indexing provided to the output of the merge in the backward +// index pass + +class ContigIDs : public OptInDispatch { + public: + ContigIDs() = delete; + + //! Check through the history of ids whose inputs map to root_domain with + //! contiguity root_contiguity. Return unordered_set of all merges that are + //! contiguous. Ignore root order is primarily used for predicate generation. + //! In this case we can linearize indexing of any ID that only consists of + //! merge operations. + //! + //! Mapping information from CA Index concrete to reference domains + //! is used to find if merged output domains can be indexed. If there's + //! no mapping to a reference domain, there's no corresponding + //! index, so it isn't marked as conting merge. + //! + //! p2c_id_map can be used when replayed producer domains are + //! analyzed, in which case producer-to-consumer maps should be + //! passed. + //! + //! If ignore_indexability and ignore_halo_constraint are true, + //! ignore the constraint on indexing and halo, respectively. It is + //! 
the caller that is responsible for its correctness. + //! + //! The function interface with many parameters looks ugly, but it + //! is also important to make ignore_indexability and + //! ignore_halo_constraint explicit to avoid any surprise. + //! + //! Not really sure why but clang-tidy only complains about + //! std::unordered_map if passed as a const reference. + ContigIDs( + const std::vector& ids, + const std::vector& root_domain, + const std::vector& root_contiguity, + std::unordered_map concrete_to_ref, + std::unordered_map p2c_id_map = {}, + bool ignore_indexability = false, + bool ignore_halo_constraint = false); + + const std::unordered_set& contigIDs() const { + return contig_ids_; + } + + const std::unordered_map>& + withinContigIDs() const { + return within_contig_ids_; + } + + const std::unordered_map& rootToIndexedID() const { + return root_to_indexed_id_; + } + + private: + using OptInDispatch::handle; + + bool inRoot(const std::vector& ids) { + return std::all_of(ids.begin(), ids.end(), [this](IterDomain* id) { + return is_contig_root_.find(id) != is_contig_root_.end(); + }); + } + + bool isContig(IterDomain* id) { + return contig_ids_.find(id) != contig_ids_.end(); + } + + // Split outputs are not contiguous, don't need to do anything. + void handle(Split*) override {} + + void handle(Merge* merge) override; + + IterDomain* getCAIndexConcreteId(IterDomain* id) const; + + //! True if an ID is indexable. + //! E.g., a merged domain with broadcast may not be indexable when + //! its corresponding reference tensor has non-broadcast domains. + bool isIndexable(IterDomain* id) const; + + //! Return an ID mapped with id_map_ or itself + IterDomain* getMappedId(IterDomain* id) const; + + private: + //! Root domains to analyze contiguity + const std::vector& root_domain_; + //! Contiguity of root_domain_ + const std::vector& root_contiguity_; + //! Mapping of concrete to reference domains. If a concrete domain + //! is not mapped, it is not indexable as there's no mapped index. + const std::unordered_map concrete_to_ref_; + //! Producer-to-consumer index map in the case of analyzing replayed + //! producer tensors + const std::unordered_map p2c_id_map_; + const bool ignore_indexability_ = false; + + //! Mapping of root domain to bool indicating contiguity + std::unordered_map is_contig_root_; + // Mark if ids are result of contigous merges + std::unordered_set contig_ids_; + // Given contiguous domain, return all iter domains within its history. + std::unordered_map> + within_contig_ids_; + //! Mapping of root domain to the actual indexed domain, which can + //! be itself or a contig merged domain if found. + std::unordered_map root_to_indexed_id_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/disjoint_set.h b/torch/csrc/jit/codegen/cuda/disjoint_set.h index 99647a05496f..2b4dea404d74 100644 --- a/torch/csrc/jit/codegen/cuda/disjoint_set.h +++ b/torch/csrc/jit/codegen/cuda/disjoint_set.h @@ -3,169 +3,278 @@ #include #include +#include #include #include #include +// For printing of the set when using a Statement as the type for the set +#include + namespace torch { namespace jit { namespace fuser { namespace cuda { -//! Container class DisjointSet models equivalence relationships -//! -//! Each instance of this class keeps a set of equivalent classes -//! DisjointSet::join(a,b) makes the full class of a and b equivalent -//! 
DisjointSet::areEqual(a,b) checks if a and b belong same class +namespace { + +template +std::string abstractToString(T* ptr) { + return ptr->toString(); +} + +template +std::string abstractToString(T ref) { + return ref.toString(); +} + +} // namespace + +// Vector like class that will prevent adding duplicate entries by also +// maintaing a set template > -class DisjointSet { +class VectorOfUniqueEntries { public: - DisjointSet() = default; - - //! Joins the equivalent class that a and b belong to - //! areEqual(a',b') will be true for each a'=a and b'=b - //! - //! \param a An element from a equivalent class - //! will create a new equivalent class if a does - //! not belong to any - //! \param b An element from another equivalent class - //! will create a new equivalent class if b does - //! not belong to any - void join(T a, T b) { - // cases where either of the quiv class doesn't exist - if (!entry_map.count(a) && !entry_map.count(b)) { - createPoint(a); - entry_map[b] = fixedPoint(a); - } else if (!entry_map.count(a)) { - entry_map[a] = fixedPoint(b); - } else if (!entry_map.count(b)) { - entry_map[b] = fixedPoint(a); - } else { - // case where both equiv classes exist and need to join - const int i0 = fixedPoint(a); - const int i1 = fixedPoint(b); - int new_parent = 0; - int new_child = 0; - - // Either order here is correct but joining larger class to smaller class - // tend to be faster - std::tie(new_parent, new_child) = (weights[i0] < weights[i1]) - ? std::make_pair(i0, i1) - : std::make_pair(i1, i0); - weights[new_parent] += weights[new_child]; - set_map[new_child] = new_parent; + VectorOfUniqueEntries() = default; + + VectorOfUniqueEntries(const std::initializer_list& x) + : vector_(x), set_(x) {} + + // Returns if a node was actually added + bool pushBack(T entry) { + if (set_.emplace(entry).second) { + vector_.push_back(entry); + return true; } + return false; } - //! Checks if a and b belong to the same equivalent class - //! - //! \param a An element from a equivalent class - //! \param b An element from another equivalent class - //! \returns Boolean value representing if a and b are - //! recorded to be in the same equivalent class - //! will return false if any of a or b doesn't - //! have an equivalent class recorded - bool areEquivalent(T a, T b) const { - if (!entry_map.count(a) || !entry_map.count(b)) { - return false; + // Returns if any node was added + bool pushBack(const VectorOfUniqueEntries& other) { + bool any_added = false; + for (auto entry : other) { + any_added = any_added | pushBack(entry); } - return fixedPoint(a) == fixedPoint(b); + return any_added; } - //! Queries if an element exists in this set - bool contains(T a) const { - return entry_map.count(a) > 0; + // Returns a const vector useful for iterating on + const std::vector& vector() const { + return vector_; } - //! Returns all elements added to this set - std::vector getAllElements() const { - std::vector elms(entry_map.size()); - std::transform( - entry_map.begin(), - entry_map.end(), - elms.begin(), - [](const auto& entry_map_kv) { return entry_map_kv.first; }); - return elms; + // Returns first element in vector + T front() const { + return vector_.front(); } - //! Clears the equivalence relationships - void clear() { - set_map.clear(); - weights.clear(); - entry_map.clear(); - next_index_ = 0; - } - - //! 
Dumps the equivalent relationships - std::ostream& print(std::ostream& os) const { - std::unordered_map> fixedPointMap; - for (const auto& kv : entry_map) { - int fixed_point = fixedPoint(kv.first); - auto it = fixedPointMap.find(fixed_point); - if (it == fixedPointMap.end()) { - it = fixedPointMap.insert({fixed_point, {}}).first; - } - it->second.insert(kv.first); - } - os << "{\n"; - for (const auto& kv : fixedPointMap) { - os << "\t{ "; - for (const auto& val : kv.second) { - os << toString(val) << " "; + // Returns last element in vector + T back() const { + return vector_.back(); + } + + // Remove and returns the last element in vector + T popBack() { + T v = vector_.back(); + set_.erase(v); + vector_.pop_back(); + return v; + } + + // Returns if this container is empty + bool empty() const { + return vector_.empty(); + } + + // Returns if entry is in this vector + bool has(T entry) const { + return set_.find(entry) != set_.end(); + } + + std::string toString() { + std::stringstream ss; + ss << "{ "; + for (auto entry : vector()) { + ss << abstractToString(entry); + if (entry != vector().back()) { + ss << "; "; } - os << "}\n"; } - os << "}\n"; - return os; + ss << " }"; + return ss.str(); } private: - // Internal fixed point implementation: - // Returns the equivalent class that e belongs to - int getFixedPointForClass(int e) const { - TORCH_INTERNAL_ASSERT(static_cast(set_map.size()) > e); - while (set_map[e] != e) { - // Chasing to fixed point - e = set_map[e]; + std::vector vector_; + std::unordered_set set_; +}; + +//! Container class DisjointSet models equivalence relationships +//! +//! Each instance of this class keeps equivalence sets +//! DisjointSet::mapEntries(a,b) makes the full set of a and b equivalent +//! DisjointSet::*AreMapped(a,b) checks if a and b belong to the same disjoint +//! set +template > +class DisjointSets { + public: + DisjointSets() = default; + + // Warning: returned values should never be modified. This accessor isn't + // strictly safe as VectorOfUniqueEntries is not returned as a const. + const std:: + unordered_map>, Hash>& + disjointSetMap() const { + return disjoint_set_maps_; + } + + // Warning: returned values should never be modified. This accessor isn't + // strictly safe as VectorOfUniqueEntries is not returned as a const. + const std::vector>>& + disjointSets() const { + return disjoint_sets_; + } + + // Return the entire disjoint set of provided entry + const VectorOfUniqueEntries& getDisjointSetOf(T entry) const { + auto set_it = disjoint_set_maps_.find(entry); + TORCH_INTERNAL_ASSERT( + set_it != disjoint_set_maps_.end(), + "Could not find entry for ", + entry->toString()); + return *(set_it->second); + } + + // Initializes a new set for provided entry + // + // TODO: Return iterator + void initializeSet(T entry) { + disjoint_sets_.push_back( + std::make_shared>()); + disjoint_sets_.back()->pushBack(entry); + disjoint_set_maps_.emplace(std::make_pair(entry, disjoint_sets_.back())); + } + + // Adds all of the disjoint set belonging to entry1 to the disjoint set + // belonging to entry0, maps all entries of disjoint set belonging to entry1 + // to entry0, removes original disjoint set belonging to entry1. 
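A hedged usage sketch of the DisjointSets API declared in this header. The Id struct and the include path are assumptions made for illustration; in lowering the template parameter is IterDomain*:

```cpp
// Usage sketch: mapEntries unions sets (creating them on demand),
// strictAreMapped asserts the first entry exists, permissiveAreMapped
// tolerates entries that were never added.
#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>  // assumed include path

#include <iostream>
#include <string>

using namespace torch::jit::fuser::cuda;

struct Id {
  std::string name;
  std::string toString() const {
    return name;
  }
};

int main() {
  Id a{"a"}, b{"b"}, c{"c"}, d{"d"};

  DisjointSets<Id*> sets;
  sets.mapEntries(&a, &b);  // {a, b}
  sets.mapEntries(&c, &d);  // {a, b} {c, d}
  sets.mapEntries(&b, &c);  // {a, b, c, d}

  // a and d are now in the same disjoint set.
  std::cout << std::boolalpha << sets.strictAreMapped(&a, &d) << "\n";  // true

  // permissiveAreMapped returns false instead of asserting for unseen entries.
  Id e{"e"};
  std::cout << sets.permissiveAreMapped(&e, &a) << "\n";  // false

  std::cout << sets.toString() << std::endl;
  return 0;
}
```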
+ void mapEntries(T entry0, T entry1) { + auto set_it_0 = disjoint_set_maps_.find(entry0); + auto set_it_1 = disjoint_set_maps_.find(entry1); + + // Track if we need to reset iterators, optimize for case where both entries + // exist + bool invalid_iterators = false; + if (set_it_0 == disjoint_set_maps_.end()) { + initializeSet(entry0); + invalid_iterators = true; + } + + if (set_it_1 == disjoint_set_maps_.end()) { + initializeSet(entry1); + invalid_iterators = true; } - return e; + + // TODO: We can avoid refinding one iterator if initialize set returns an + // iterator, though if we insert entry1 we'd have to refind entry0 as it + // could invalidate all iterators + if (invalid_iterators) { + set_it_0 = disjoint_set_maps_.find(entry0); + set_it_1 = disjoint_set_maps_.find(entry1); + } + + auto set0_shared_ptr = set_it_0->second; + auto set1_shared_ptr = set_it_1->second; + + // If the sets are already the same, do nothing + if (set0_shared_ptr == set1_shared_ptr) { + return; + } + + // Place everything in set1 into set0 and remap all entries in set1 to set0 + for (auto entry : set1_shared_ptr->vector()) { + set0_shared_ptr->pushBack(entry); + disjoint_set_maps_[entry] = set0_shared_ptr; + } + + // set1 no longer needed as its entries are copied into set0 + disjoint_sets_.erase(std::find( + disjoint_sets_.begin(), disjoint_sets_.end(), set1_shared_ptr)); } - //! Utility to check the class e belongs to: - //! - //! \param e element e to find the equiv class for - //! \returns the equivalent class that e belongs to - //! - int fixedPoint(T e) const { - // Handles case when i doesn't have an equivalence class - TORCH_INTERNAL_ASSERT(entry_map.count(e)); + // Will assert if provided entry0 is not in any disjoint set, otherwise + // returns if entry0 and entry1 are in the same disjoint set. + bool strictAreMapped(T entry0, T entry1) const { + auto entry_it = disjointSetMap().find(entry0); + TORCH_INTERNAL_ASSERT( + entry_it != disjointSetMap().end(), + "Strict mapping failed on element: ", + abstractToString(entry0), + " either an error occured, or non strict mapping should have been used."); + return entry_it->second->has(entry1); + } + + // If entry0 doesn't have a disjoint set returns false, otherwise returns if + // entry0 and entry1 are in the same disjoint set. + bool permissiveAreMapped(T entry0, T entry1) const { + auto entry_it = disjointSetMap().find(entry0); + if (entry_it == disjointSetMap().end()) { + return false; + } + return entry_it->second->has(entry1); + } - // Use fixed point as a representation for the equiv class - return getFixedPointForClass(entry_map.at(e)); + // Returns if a set exists with provided entry + bool mappingExists(T entry) const { + return disjoint_set_maps_.find(entry) != disjoint_set_maps_.end(); } - //! Utility to create a new equiv class for i + // Returns a deterministic list of all entries that have been added to any + // disjoint set. // - //! \param i Element i to create the equiv class for - void createPoint(T i) { - entry_map[i] = next_index_; - set_map.push_back(next_index_++); - weights.push_back(1); + // Warning: constructed on every call, consider caching result. 
+ VectorOfUniqueEntries getAllElements() const { + VectorOfUniqueEntries all_elements; + for (auto set : disjoint_sets_) { + for (auto entry : set->vector()) { + all_elements.pushBack(entry); + } + } + return all_elements; + } + + // Completely clears all disjoint sets + void clear() { + disjoint_set_maps_.clear(); + disjoint_sets_.clear(); + } + + std::string toString() const { + std::stringstream ss; + ss << "disjoint sets{\n"; + for (auto s_ptr : disjoint_sets_) { + auto& set = *s_ptr; + ss << " { "; + for (auto entry : set.vector()) { + ss << abstractToString(entry); + // DomainKey defines == but not != + if (!(entry == set.back())) { + ss << "; "; + } + } + ss << " }\n"; + } + ss << "}"; + return ss.str(); } private: - // Internal representation of the equivalence class as integers - // set_map implements the "parent" relationship - std::vector set_map; - // Weights is used for preliminary perf optimization - std::vector weights; - - // Map the input of type T to its equivalence class - std::unordered_map entry_map; - - // Running counter for generating new index when - // Creating new equiv classes - int next_index_ = 0; + // Disjoint sets + std::unordered_map>, Hash> + disjoint_set_maps_; + + // Keep a list of disjoint_sets that's deterministic to iterate over + std::vector>> disjoint_sets_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/dispatch.cpp b/torch/csrc/jit/codegen/cuda/dispatch.cpp index cea8b24e7ff7..1306440d915e 100644 --- a/torch/csrc/jit/codegen/cuda/dispatch.cpp +++ b/torch/csrc/jit/codegen/cuda/dispatch.cpp @@ -37,7 +37,7 @@ T* ptr(T* obj) { * } * * And therefore dispatch should never call: - * ptr(mutator)->handle(this->as()); + * ptr(mutator)->mutate(this->as()); */ template @@ -52,12 +52,22 @@ void Val::dispatch(T handler, Val* val) { ptr(handler)->handle(val->as()); return; case DataType::Int: + case DataType::Int32: + // Dispatch to Int even with Int32 as we don't have Int32 IR + // node. 
ptr(handler)->handle(val->as()); return; + case DataType::ComplexDouble: + ptr(handler)->handle(val->as()); + return; default: break; } break; + case ValType::NamedScalar: + ptr(handler)->handle(val->as()); + return; + case ValType::IterDomain: ptr(handler)->handle(val->as()); return; @@ -67,8 +77,11 @@ void Val::dispatch(T handler, Val* val) { case ValType::TensorView: ptr(handler)->handle(val->as()); return; - case ValType::NamedScalar: - ptr(handler)->handle(val->as()); + case ValType::Predicate: + ptr(handler)->handle(val->as()); + return; + case ValType::TensorIndex: + ptr(handler)->handle(val->as()); return; default: break; @@ -79,12 +92,6 @@ void Val::dispatch(T handler, Val* val) { template void Expr::dispatch(T handler, Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - ptr(handler)->handle(expr->as()); - return; - case ExprType::Merge: - ptr(handler)->handle(expr->as()); - return; case ExprType::UnaryOp: ptr(handler)->handle(expr->as()); return; @@ -97,12 +104,25 @@ void Expr::dispatch(T handler, Expr* expr) { case ExprType::ReductionOp: ptr(handler)->handle(expr->as()); return; + case ExprType::GroupedReductionOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::WelfordOp: ptr(handler)->handle(expr->as()); return; + case ExprType::MmaOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::BroadcastOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Split: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Merge: + ptr(handler)->handle(expr->as()); + return; case ExprType::TransposeOp: ptr(handler)->handle(expr->as()); return; @@ -112,9 +132,49 @@ void Expr::dispatch(T handler, Expr* expr) { case ExprType::GatherOp: ptr(handler)->handle(expr->as()); return; + case ExprType::ViewAsScalar: + ptr(handler)->handle(expr->as()); + return; case ExprType::ViewOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Allocate: + ptr(handler)->handle(expr->as()); + return; + case ExprType::BlockSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::ForLoop: + ptr(handler)->handle(expr->as()); + return; + case ExprType::IfThenElse: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GroupedGridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridWelford: + ptr(handler)->handle(expr->as()); + return; + case ExprType::AllocateFusedReduction: + ptr(handler)->handle(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } @@ -142,12 +202,22 @@ void Val::constDispatch(T handler, const Val* val) { ptr(handler)->handle(val->as()); return; case DataType::Int: + case DataType::Int32: + // Dispatch to Int even with Int32 as we don't have Int32 IR + // node. 
ptr(handler)->handle(val->as()); return; + case DataType::ComplexDouble: + ptr(handler)->handle(val->as()); + return; default: break; } break; + case ValType::NamedScalar: + ptr(handler)->handle(val->as()); + return; + case ValType::IterDomain: ptr(handler)->handle(val->as()); return; @@ -157,8 +227,11 @@ void Val::constDispatch(T handler, const Val* val) { case ValType::TensorView: ptr(handler)->handle(val->as()); return; - case ValType::NamedScalar: - ptr(handler)->handle(val->as()); + case ValType::Predicate: + ptr(handler)->handle(val->as()); + return; + case ValType::TensorIndex: + ptr(handler)->handle(val->as()); return; default: break; @@ -169,12 +242,6 @@ void Val::constDispatch(T handler, const Val* val) { template void Expr::constDispatch(T handler, const Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - ptr(handler)->handle(expr->as()); - return; - case ExprType::Merge: - ptr(handler)->handle(expr->as()); - return; case ExprType::UnaryOp: ptr(handler)->handle(expr->as()); return; @@ -187,12 +254,25 @@ void Expr::constDispatch(T handler, const Expr* expr) { case ExprType::ReductionOp: ptr(handler)->handle(expr->as()); return; + case ExprType::GroupedReductionOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::WelfordOp: ptr(handler)->handle(expr->as()); return; + case ExprType::MmaOp: + ptr(handler)->handle(expr->as()); + return; case ExprType::BroadcastOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Split: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Merge: + ptr(handler)->handle(expr->as()); + return; case ExprType::TransposeOp: ptr(handler)->handle(expr->as()); return; @@ -202,9 +282,49 @@ void Expr::constDispatch(T handler, const Expr* expr) { case ExprType::GatherOp: ptr(handler)->handle(expr->as()); return; + case ExprType::ViewAsScalar: + ptr(handler)->handle(expr->as()); + return; case ExprType::ViewOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Allocate: + ptr(handler)->handle(expr->as()); + return; + case ExprType::BlockSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridSync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::ForLoop: + ptr(handler)->handle(expr->as()); + return; + case ExprType::IfThenElse: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GroupedGridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridWelford: + ptr(handler)->handle(expr->as()); + return; + case ExprType::AllocateFusedReduction: + ptr(handler)->handle(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } @@ -232,28 +352,45 @@ void Statement::constDispatch(T handler, const Statement* stmt) { * ptr(mutator)->mutate(this->as()); */ template -Statement* Val::mutatorDispatch(T mutator, Val* val) { +void Val::mutatorDispatch(T mutator, Val* val) { switch (*(val->getValType())) { case ValType::Scalar: switch (*(val->getDataType())) { case DataType::Bool: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case DataType::Double: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case DataType::Int: - return 
ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; + case DataType::ComplexDouble: + ptr(mutator)->mutate(val->as()); + return; default: break; } break; + case ValType::NamedScalar: + ptr(mutator)->mutate(val->as()); + return; + case ValType::IterDomain: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case ValType::TensorDomain: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case ValType::TensorView: - return ptr(mutator)->mutate(val->as()); - case ValType::NamedScalar: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; + case ValType::Predicate: + ptr(mutator)->mutate(val->as()); + return; + case ValType::TensorIndex: + ptr(mutator)->mutate(val->as()); + return; default: break; } @@ -261,44 +398,105 @@ Statement* Val::mutatorDispatch(T mutator, Val* val) { } template -Statement* Expr::mutatorDispatch(T mutator, Expr* expr) { +void Expr::mutatorDispatch(T mutator, Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - return ptr(mutator)->mutate(expr->as()); - case ExprType::Merge: - return ptr(mutator)->mutate(expr->as()); case ExprType::UnaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::BinaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::TernaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ReductionOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GroupedReductionOp: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::WelfordOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::MmaOp: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::BroadcastOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + + case ExprType::Split: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::Merge: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::TransposeOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ShiftOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::GatherOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::ViewAsScalar: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ViewOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + + case ExprType::Allocate: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::BlockSync: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridSync: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::ForLoop: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::IfThenElse: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridReduction: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GroupedGridReduction: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridWelford: + ptr(mutator)->mutate(expr->as()); 
+ return; + case ExprType::AllocateFusedReduction: + ptr(mutator)->mutate(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } } template -Statement* Statement::mutatorDispatch(T mutator, Statement* stmt) { +void Statement::mutatorDispatch(T mutator, Statement* stmt) { if (stmt->isVal()) { - return ptr(mutator)->mutate(stmt->as()); + ptr(mutator)->mutate(stmt->as()); + return; } if (stmt->isExpr()) { - return ptr(mutator)->mutate(stmt->as()); + ptr(mutator)->mutate(stmt->as()); + return; } TORCH_INTERNAL_ASSERT(false, "Unknown stmttype in dispatch!"); } @@ -308,11 +506,11 @@ Statement* Statement::mutatorDispatch(T mutator, Statement* stmt) { * classes. Actual visitors/mutators should inhereit from these classes and call * ->dispatch(this) to avoid needing an explicit instantiation. */ -template void Statement::dispatch(OptOutDispatch, Statement*); +template void Statement::dispatch(OptOutDispatch&, Statement*); template void Statement::dispatch(OptOutDispatch*, Statement*); -template void Val::dispatch(OptOutDispatch, Val*); +template void Val::dispatch(OptOutDispatch&, Val*); template void Val::dispatch(OptOutDispatch*, Val*); -template void Expr::dispatch(OptOutDispatch, Expr*); +template void Expr::dispatch(OptOutDispatch&, Expr*); template void Expr::dispatch(OptOutDispatch*, Expr*); template void Statement::dispatch(OptInDispatch, Statement*); @@ -322,33 +520,26 @@ template void Val::dispatch(OptInDispatch*, Val*); template void Expr::dispatch(OptInDispatch, Expr*); template void Expr::dispatch(OptInDispatch*, Expr*); -template void Statement::constDispatch(OptOutConstDispatch, const Statement*); +template void Statement::constDispatch(OptOutConstDispatch&, const Statement*); template void Statement::constDispatch(OptOutConstDispatch*, const Statement*); -template void Val::constDispatch(OptOutConstDispatch, const Val*); +template void Val::constDispatch(OptOutConstDispatch&, const Val*); template void Val::constDispatch(OptOutConstDispatch*, const Val*); -template void Expr::constDispatch(OptOutConstDispatch, const Expr*); +template void Expr::constDispatch(OptOutConstDispatch&, const Expr*); template void Expr::constDispatch(OptOutConstDispatch*, const Expr*); -template void Statement::constDispatch(OptInConstDispatch, const Statement*); +template void Statement::constDispatch(OptInConstDispatch&, const Statement*); template void Statement::constDispatch(OptInConstDispatch*, const Statement*); -template void Val::constDispatch(OptInConstDispatch, const Val*); +template void Val::constDispatch(OptInConstDispatch&, const Val*); template void Val::constDispatch(OptInConstDispatch*, const Val*); -template void Expr::constDispatch(OptInConstDispatch, const Expr*); +template void Expr::constDispatch(OptInConstDispatch&, const Expr*); template void Expr::constDispatch(OptInConstDispatch*, const Expr*); -template Statement* Statement::mutatorDispatch(OptOutMutator, Statement*); -template Statement* Statement::mutatorDispatch(OptOutMutator*, Statement*); -template Statement* Val::mutatorDispatch(OptOutMutator, Val*); -template Statement* Val::mutatorDispatch(OptOutMutator*, Val*); -template Statement* Expr::mutatorDispatch(OptOutMutator, Expr*); -template Statement* Expr::mutatorDispatch(OptOutMutator*, Expr*); - -template Statement* Statement::mutatorDispatch(OptInMutator, Statement*); -template Statement* Statement::mutatorDispatch(OptInMutator*, Statement*); -template Statement* Val::mutatorDispatch(OptInMutator, Val*); -template Statement* 
Val::mutatorDispatch(OptInMutator*, Val*); -template Statement* Expr::mutatorDispatch(OptInMutator, Expr*); -template Statement* Expr::mutatorDispatch(OptInMutator*, Expr*); +template void Statement::mutatorDispatch(OptOutMutator&, Statement*); +template void Statement::mutatorDispatch(OptOutMutator*, Statement*); +template void Val::mutatorDispatch(OptOutMutator&, Val*); +template void Val::mutatorDispatch(OptOutMutator*, Val*); +template void Expr::mutatorDispatch(OptOutMutator&, Expr*); +template void Expr::mutatorDispatch(OptOutMutator*, Expr*); void OptOutDispatch::handle(Statement* s) { Statement::dispatch(this, s); @@ -362,18 +553,6 @@ void OptOutDispatch::handle(Val* v) { Val::dispatch(this, v); } -void OptInDispatch::handle(Statement* s) { - Statement::dispatch(this, s); -} - -void OptInDispatch::handle(Expr* e) { - Expr::dispatch(this, e); -} - -void OptInDispatch::handle(Val* v) { - Val::dispatch(this, v); -} - void OptOutConstDispatch::handle(const Statement* s) { Statement::constDispatch(this, s); } @@ -386,46 +565,266 @@ void OptOutConstDispatch::handle(const Val* v) { Val::constDispatch(this, v); } -void OptInConstDispatch::handle(const Statement* s) { - Statement::constDispatch(this, s); +void OptInConstDispatch::unhandled(const Statement* stmt) { + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getExprType().value(), "."); + } else if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getValType().value(), "."); + } else { + TORCH_INTERNAL_ASSERT(false, "Unrecognized statement type."); + } } -void OptInConstDispatch::handle(const Expr* e) { - Expr::constDispatch(this, e); +void OptInDispatch::unhandled(Statement* stmt) { + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getExprType().value(), "."); + } else if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getValType().value(), "."); + } else { + TORCH_INTERNAL_ASSERT(false, "Unrecognized statement type."); + } } -void OptInConstDispatch::handle(const Val* v) { - Val::constDispatch(this, v); +// Vals +void OptOutConstDispatch::handle(const Bool* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const Double* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const Int* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ComplexDouble* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const NamedScalar* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const IterDomain* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TensorDomain* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TensorView* stmt) { + unhandled(stmt); +} + +void OptOutConstDispatch::handle(const kir::Predicate* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::TensorIndex* stmt) { + unhandled(stmt); +} + +// Exprs +void OptOutConstDispatch::handle(const UnaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const BinaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TernaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ReductionOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const GroupedReductionOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const WelfordOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const MmaOp* stmt) { + 
unhandled(stmt); +} +void OptOutConstDispatch::handle(const BroadcastOp* stmt) { + unhandled(stmt); } -Statement* OptInMutator::mutate(Statement* s) { - return Statement::mutatorDispatch(this, s); +void OptOutConstDispatch::handle(const Split* stmt) { + unhandled(stmt); } +void OptOutConstDispatch::handle(const Merge* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TransposeOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ShiftOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const GatherOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ViewAsScalar* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ViewOp* stmt) { + unhandled(stmt); +} + +void OptOutConstDispatch::handle(const kir::Allocate* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::BlockSync* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridSync* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::InitMagicZero* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::UpdateMagicZero* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::ForLoop* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::IfThenElse* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridReduction* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GroupedGridReduction* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridBroadcast* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridWelford* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::AllocateFusedReduction* stmt) { + unhandled(stmt); +} + +void OptOutDispatch::unhandled(Statement*) {} -Statement* OptInMutator::mutate(Expr* e) { - return Expr::mutatorDispatch(this, e); +// Vals +void OptOutDispatch::handle(Bool* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Double* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Int* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ComplexDouble* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(NamedScalar* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(IterDomain* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TensorDomain* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TensorView* stmt) { + unhandled(stmt); } -Statement* OptInMutator::mutate(Val* v) { - // If value is already mutated, return the mutation - if (mutations.find(v) != mutations.end()) - return mutations[v]; - return Val::mutatorDispatch(this, v); +void OptOutDispatch::handle(kir::Predicate* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::TensorIndex* stmt) { + unhandled(stmt); } -Statement* OptOutMutator::mutate(Statement* s) { - return Statement::mutatorDispatch(this, s); +// Exprs +void OptOutDispatch::handle(UnaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(BinaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TernaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ReductionOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(GroupedReductionOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(WelfordOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(MmaOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(BroadcastOp* stmt) { + unhandled(stmt); } 
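A minimal usage sketch of the reworked mutator API that the next hunks introduce (hypothetical helper and value names, assuming the registerMutation/maybeMutated interface declared later in this patch in dispatch.h; the mutate entry points now return void instead of Statement*):

// Register a single Val replacement, then walk statements in topological
// order so producers are rewritten before the expressions that consume them.
class ReplaceVal : public OptOutMutator {
 public:
  ReplaceVal(Val* old_val, Val* new_val) {
    registerMutation(old_val, new_val);
  }
};

// Hypothetical driver (old_extent, new_extent, topo_sorted_statements are
// placeholders, not names from this patch):
//   ReplaceVal replacer(old_extent, new_extent);
//   for (Statement* stmt : topo_sorted_statements) {
//     replacer.mutate(stmt);  // returns void now; results land in replacer.mutations
//   }
//   Val* updated = replacer.maybeMutated(old_extent);  // == new_extent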
-Statement* OptOutMutator::mutate(Expr* e) { - return Expr::mutatorDispatch(this, e); +void OptOutDispatch::handle(Split* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Merge* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TransposeOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ShiftOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(GatherOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ViewAsScalar* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ViewOp* stmt) { + unhandled(stmt); } -Statement* OptOutMutator::mutate(Val* v) { - // If value is already mutated, return the mutation - if (mutations.find(v) != mutations.end()) - return mutations[v]; - return Val::mutatorDispatch(this, v); +void OptOutDispatch::handle(kir::Allocate* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::BlockSync* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridSync* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::InitMagicZero* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::UpdateMagicZero* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::ForLoop* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::IfThenElse* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridReduction* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GroupedGridReduction* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridBroadcast* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridWelford* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::AllocateFusedReduction* stmt) { + unhandled(stmt); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/dispatch.h b/torch/csrc/jit/codegen/cuda/dispatch.h index c1be76eb950e..8c0b78702217 100644 --- a/torch/csrc/jit/codegen/cuda/dispatch.h +++ b/torch/csrc/jit/codegen/cuda/dispatch.h @@ -1,9 +1,9 @@ #pragma once -#include - +#include #include -#include + +#include #include @@ -48,7 +48,7 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { - +class IrContainer; class Fusion; // Hierarchal dispatch functions for handle @@ -60,28 +60,56 @@ class Val; class IterDomain; class TensorDomain; class TensorView; + class Bool; class Double; class Int; +class ComplexDouble; class NamedScalar; // Exprs -class Split; -class Merge; class UnaryOp; class BinaryOp; class TernaryOp; class ReductionOp; +class GroupedReductionOp; class WelfordOp; +class MmaOp; class BroadcastOp; class TransposeOp; class ShiftOp; class GatherOp; +class ViewAsScalar; class ViewOp; +// Exprs +class Split; +class Merge; + +namespace kir { +class Predicate; +class TensorIndex; + +class Allocate; +class BlockSync; +class GridSync; +class ForLoop; +class IfThenElse; +class GridReduction; +class GroupedGridReduction; +class GridBroadcast; +class GridWelford; +class AllocateFusedReduction; +class InitMagicZero; +class UpdateMagicZero; +} // namespace kir + // By default, all IR nodes are handled in this dispatch, and will call an empty // function on all nodes. 
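For instance (a hypothetical visitor sketched here for illustration, not code from this patch), a pass that only cares about TensorView nodes can override that single overload and let every other node type fall through to the no-op unhandled():

class CountTensorViews : public OptOutConstDispatch {
 public:
  size_t count = 0;

  using OptOutConstDispatch::handle;  // keep the other handle() overloads visible
  void handle(const TensorView*) override {
    // Only TensorView nodes are counted; all other node types reach
    // OptOutConstDispatch::unhandled(), which does nothing.
    ++count;
  }
};

// Hypothetical usage: counter.handle(val) goes through Val::constDispatch and
// lands in the override above only when val is a TensorView.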
class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase { + protected: + virtual void unhandled(const Statement*) {} + public: // Hierarchal dispatch functions for handle virtual void handle(const Statement*); @@ -89,30 +117,54 @@ class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase { virtual void handle(const Val*); // Vals - virtual void handle(const IterDomain*) {} - virtual void handle(const TensorDomain*) {} - virtual void handle(const TensorView*) {} - virtual void handle(const Bool*) {} - virtual void handle(const Double*) {} - virtual void handle(const Int*) {} - virtual void handle(const NamedScalar*) {} + virtual void handle(const IterDomain* stmt); + virtual void handle(const TensorDomain* stmt); + virtual void handle(const TensorView* stmt); + virtual void handle(const Bool* stmt); + virtual void handle(const Double* stmt); + virtual void handle(const Int* stmt); + virtual void handle(const ComplexDouble* stmt); + virtual void handle(const NamedScalar* stmt); + + virtual void handle(const kir::Predicate*); + virtual void handle(const kir::TensorIndex*); // Exprs - virtual void handle(const Split*) {} - virtual void handle(const Merge*) {} - virtual void handle(const UnaryOp*) {} - virtual void handle(const BinaryOp*) {} - virtual void handle(const TernaryOp*) {} - virtual void handle(const ReductionOp*) {} - virtual void handle(const WelfordOp*) {} - virtual void handle(const BroadcastOp*) {} - virtual void handle(const TransposeOp*) {} - virtual void handle(const ShiftOp*) {} - virtual void handle(const GatherOp*) {} - virtual void handle(const ViewOp*) {} + virtual void handle(const UnaryOp* stmt); + virtual void handle(const BinaryOp* stmt); + virtual void handle(const TernaryOp* stmt); + virtual void handle(const ReductionOp* stmt); + virtual void handle(const GroupedReductionOp* stmt); + virtual void handle(const WelfordOp* stmt); + virtual void handle(const MmaOp* stmt); + virtual void handle(const BroadcastOp* stmt); + + virtual void handle(const Split* stmt); + virtual void handle(const Merge* stmt); + virtual void handle(const TransposeOp* stmt); + virtual void handle(const ShiftOp* stmt); + virtual void handle(const GatherOp* stmt); + virtual void handle(const ViewAsScalar* stmt); + virtual void handle(const ViewOp* stmt); + + virtual void handle(const kir::Allocate*); + virtual void handle(const kir::BlockSync*); + virtual void handle(const kir::GridSync*); + virtual void handle(const kir::InitMagicZero*); + virtual void handle(const kir::UpdateMagicZero*); + virtual void handle(const kir::ForLoop*); + virtual void handle(const kir::IfThenElse*); + virtual void handle(const kir::GridReduction*); + virtual void handle(const kir::GroupedGridReduction*); + virtual void handle(const kir::GridBroadcast*); + virtual void handle(const kir::GridWelford*); + virtual void handle(const kir::AllocateFusedReduction*); }; class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase { + protected: + virtual void unhandled(Statement*); + public: // Hierarchal dispatch functions for handle virtual void handle(Statement*); @@ -120,190 +172,95 @@ class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase { virtual void handle(Val*); // Vals - virtual void handle(IterDomain*) {} - virtual void handle(TensorDomain*) {} - virtual void handle(TensorView*) {} - virtual void handle(Bool*) {} - virtual void handle(Double*) {} - virtual void handle(Int*) {} - virtual void handle(NamedScalar*) {} + virtual void handle(Bool* stmt); + virtual void 
handle(Double* stmt); + virtual void handle(Int* stmt); + virtual void handle(ComplexDouble* stmt); + virtual void handle(NamedScalar* stmt); + virtual void handle(IterDomain* stmt); + virtual void handle(TensorDomain* stmt); + virtual void handle(TensorView* stmt); + + virtual void handle(kir::Predicate*); + virtual void handle(kir::TensorIndex*); // Exprs - virtual void handle(Split*) {} - virtual void handle(Merge*) {} - virtual void handle(UnaryOp*) {} - virtual void handle(BinaryOp*) {} - virtual void handle(TernaryOp*) {} - virtual void handle(ReductionOp*) {} - virtual void handle(WelfordOp*) {} - virtual void handle(BroadcastOp*) {} - virtual void handle(TransposeOp*) {} - virtual void handle(ShiftOp*) {} - virtual void handle(GatherOp*) {} - virtual void handle(ViewOp*) {} + virtual void handle(UnaryOp* stmt); + virtual void handle(BinaryOp* stmt); + virtual void handle(TernaryOp* stmt); + virtual void handle(ReductionOp* stmt); + virtual void handle(GroupedReductionOp* stmt); + virtual void handle(WelfordOp* stmt); + virtual void handle(MmaOp* stmt); + virtual void handle(BroadcastOp* stmt); + + virtual void handle(Split* stmt); + virtual void handle(Merge* stmt); + virtual void handle(TransposeOp* stmt); + virtual void handle(ShiftOp* stmt); + virtual void handle(GatherOp* stmt); + virtual void handle(ViewAsScalar* stmt); + virtual void handle(ViewOp* stmt); + + virtual void handle(kir::Allocate* stmt); + virtual void handle(kir::BlockSync* stmt); + virtual void handle(kir::GridSync* stmt); + virtual void handle(kir::InitMagicZero* stmt); + virtual void handle(kir::UpdateMagicZero* stmt); + virtual void handle(kir::ForLoop* stmt); + virtual void handle(kir::IfThenElse* stmt); + virtual void handle(kir::GridReduction* stmt); + virtual void handle(kir::GroupedGridReduction* stmt); + virtual void handle(kir::GridBroadcast* stmt); + virtual void handle(kir::GridWelford* stmt); + virtual void handle(kir::AllocateFusedReduction* stmt); }; -class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase { +class TORCH_CUDA_CU_API OptInConstDispatch : public OptOutConstDispatch { public: - // Hierarchal dispatch functions for handle - virtual void handle(const Statement*); - virtual void handle(const Expr*); - virtual void handle(const Val*); - - // Vals - virtual void handle(const IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for IterDomain."); - } - virtual void handle(const TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorDomain."); - } - virtual void handle(const TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorView."); - } - virtual void handle(const Bool*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool."); - } - virtual void handle(const Double*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double."); - } - virtual void handle(const Int*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int."); - } - virtual void handle(const NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for NamedScalar."); - } + using OptOutConstDispatch::handle; - // Exprs - virtual void handle(const Split*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Split."); - } - virtual void handle(const Merge*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Merge."); - } - virtual void handle(const UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for UnaryOp."); - } - virtual void handle(const BinaryOp*) { - 
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp."); - } - virtual void handle(const WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp."); - } - virtual void handle(const TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp."); - } - virtual void handle(const ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp."); - } - virtual void handle(const BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp."); - } - virtual void handle(const TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp."); - } - virtual void handle(const ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp."); - } - virtual void handle(const GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp."); - } - virtual void handle(const ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ViewOp."); - } + protected: + virtual void unhandled(const Statement* stmt) final; }; -class TORCH_CUDA_CU_API OptInDispatch : public PolymorphicBase { +class TORCH_CUDA_CU_API OptInDispatch : public OptOutDispatch { public: - // Hierarchal dispatch functions for handle - virtual void handle(Statement* s); - virtual void handle(Expr* e); - virtual void handle(Val* v); + using OptOutDispatch::handle; - // Vals - virtual void handle(IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for IterDomain."); - } - virtual void handle(TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorDomain."); - } - virtual void handle(TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorView."); - } - virtual void handle(Bool*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool."); - } - virtual void handle(Double*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double."); - } - virtual void handle(Int*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int."); - } - virtual void handle(NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for NamedScalar."); - } - - // Exprs - virtual void handle(Split*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Split."); - } - virtual void handle(Merge*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Merge."); - } - virtual void handle(UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for UnaryOp."); - } - virtual void handle(BinaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp."); - } - virtual void handle(TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp."); - } - virtual void handle(ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp."); - } - virtual void handle(WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp."); - } - virtual void handle(BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp."); - } - virtual void handle(TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp."); - } - virtual void handle(ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp."); - } - virtual void handle(GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp."); - } - virtual void handle(ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ViewOp."); - } + protected: + virtual void unhandled(Statement* stmt) final; 
}; +// Class to perform mutations on Fusion IR. Exprs can simply be redefined, but +// when mutating values they have to be registered through registerMutation so +// that exprs can detect there's been a muatation and know to modify all +// instances of that Val. This means each Val should be mutated "consistently". +// Otherwise behavior may be difficult to understand as it depends on which +// order mutate is called in. This class expects user to topologically call the +// statments of interest so inputs are called and mutated before exprs depending +// on them. +// +// Warning: TensorViews need to be treated carefully. As we don't generally +// register their mutation when their tensor domains only change. If a TV needs +// to be swapped out, it needs to be registered as a "proper" mutation like +// other vals, on top of TensorDomain being updated in the mutated TensorView. +// // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase { public: // Hierarchal dispatch functions for handle - virtual Statement* mutate(Statement* s); - virtual Statement* mutate(Expr* e); - virtual Statement* mutate(Val* v); - - // We always want to dispatch through a Val, so we can capture and dispatch - // correctly members of nodes like Split->TensorDomain If we don't call the - // below function or manually cast to use mutate(Val* v) we can't intercept - // and mutate by capturing mutate(Val* v), which is what we do when we want to - // replace all instances of a value. - Statement* mutateAsVal(Val* v) { - return mutate(v); - } + virtual void mutate(Statement* s); + virtual void mutate(Expr* e); + virtual void mutate(Val* v); + + void registerMutation(Val* val, Val* mutation); - void registerMutation(Val* val, Val* mutation) { - TORCH_INTERNAL_ASSERT( - mutations.find(val) == mutations.end(), - " The same value is incorrectly being mutated twice.", - " One mutation per mutation pass is allowed."); - mutations[val] = mutation; + Val* maybeMutated(Val* val) { + if (mutations.find(val) == mutations.end()) { + return val; + } + return mutations.at(val); } std::unordered_map mutations; @@ -311,105 +268,51 @@ class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase { //****Functions below defined in mutator.cpp***** // Vals - virtual Statement* mutate(IterDomain*); - virtual Statement* mutate(TensorDomain*); - virtual Statement* mutate(TensorView*); - virtual Statement* mutate(Bool*); - virtual Statement* mutate(Double*); - virtual Statement* mutate(Int*); - virtual Statement* mutate(NamedScalar*); + virtual void mutate(Bool*); + virtual void mutate(Double*); + virtual void mutate(Int*); + virtual void mutate(ComplexDouble*); + virtual void mutate(NamedScalar*); + virtual void mutate(IterDomain*); + virtual void mutate(TensorDomain*); + virtual void mutate(TensorView*); + + virtual void mutate(kir::Predicate*); + virtual void mutate(kir::TensorIndex*); // Exprs - virtual Statement* mutate(Split*); - virtual Statement* mutate(Merge*); - virtual Statement* mutate(UnaryOp*); - virtual Statement* mutate(BinaryOp*); - virtual Statement* mutate(TernaryOp*); - virtual Statement* mutate(ReductionOp*); - virtual Statement* mutate(WelfordOp*); - virtual Statement* mutate(BroadcastOp*); - virtual Statement* mutate(TransposeOp*); - virtual Statement* mutate(ShiftOp*); - virtual Statement* mutate(GatherOp*); - virtual Statement* mutate(ViewOp*); -}; - -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API OptInMutator : public 
PolymorphicBase { - public: - std::unordered_map mutations; - - public: - void registerMutation(Val* val, Val* mutation) { - TORCH_INTERNAL_ASSERT( - mutations.find(val) == mutations.end(), - " The same value is incorrectly being mutated twice.", - " One mutation per mutation pass is allowed."); - mutations[val] = mutation; - } - - // Hierarchal dispatch functions for mutate - virtual Statement* mutate(Statement*); - virtual Statement* mutate(Expr*); - virtual Statement* mutate(Val*); - - // Vals - virtual Statement* mutate(IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for IterDomain."); - } - virtual Statement* mutate(TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TensorDomain."); - } - virtual Statement* mutate(TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TensorView."); - } - virtual Statement* mutate(Bool*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Bool."); - } - virtual Statement* mutate(Int*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Int."); - } - virtual Statement* mutate(NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for NamedScalar."); - } - - // Exprs - virtual Statement* mutate(Split*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Split."); - } - virtual Statement* mutate(Merge*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Merge."); - } - virtual Statement* mutate(UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for UnaryOp."); - } - virtual Statement* mutate(BinaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BinaryOp."); - } - virtual Statement* mutate(TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TernaryOp."); - } - virtual Statement* mutate(ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ReductionOp."); - } - virtual Statement* mutate(WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for WelfordOp."); - } - virtual Statement* mutate(BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BroadcastOp."); - } - virtual Statement* mutate(TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TransposeOp."); - } - virtual Statement* mutate(ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ShiftOp."); - } - virtual Statement* mutate(GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for GatherOp."); - } - virtual Statement* mutate(ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ViewOp."); - } + virtual void mutate(UnaryOp*); + virtual void mutate(BinaryOp*); + virtual void mutate(TernaryOp*); + virtual void mutate(ReductionOp*); + virtual void mutate(GroupedReductionOp*); + virtual void mutate(WelfordOp*); + virtual void mutate(MmaOp*); + virtual void mutate(BroadcastOp*); + + virtual void mutate(Split*); + virtual void mutate(Merge*); + virtual void mutate(TransposeOp*); + virtual void mutate(ShiftOp*); + virtual void mutate(GatherOp*); + virtual void mutate(ViewAsScalar*); + virtual void mutate(ViewOp*); + + virtual void mutate(kir::Allocate*); + virtual void mutate(kir::BlockSync*); + virtual void mutate(kir::GridSync*); + virtual void mutate(kir::InitMagicZero*); + virtual void mutate(kir::UpdateMagicZero*); + virtual void mutate(kir::ForLoop*); + virtual void mutate(kir::IfThenElse*); + virtual void mutate(kir::GridReduction*); + virtual void mutate(kir::GroupedGridReduction*); + virtual void mutate(kir::GridBroadcast*); + virtual 
void mutate(kir::GridWelford*); + virtual void mutate(kir::AllocateFusedReduction*); + + protected: + void removeExpr(IrContainer*, Expr*); }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp index 288dbb198b00..83107569dc54 100644 --- a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp +++ b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp @@ -1,9 +1,11 @@ -#include #include #include +#include #include #include +#include + namespace torch { namespace jit { namespace fuser { @@ -68,8 +70,8 @@ std::vector makeSortedEvaluationList(std::vector input) { //! Kernel IR utility, collects all the symbolic integers //! used in allocation nodes. void collectBufferSizes( - std::vector& into, - const std::vector& exprs) { + std::vector& into, + const std::vector& exprs) { for (auto expr : exprs) { if (auto allocate = dynamic_cast(expr)) { into.push_back(allocate->size()); @@ -82,56 +84,44 @@ void collectBufferSizes( } } -//! Kernel IR utility, collects all the kir symbolic +//! Kernel IR utility, collects all the kernel symbolic //! integers we will need at runtime, i.e. after the //! generated cuda kernel has already been compiled. //! The values are to be used for runtime logic, like //! `computeLaunchparams`. -std::vector collectRuntimeUsedIntegers( - Fusion* fusion, - GpuLower* lower) { - std::vector ret; - +std::vector collectRuntimeUsedIntegers(kir::Kernel* kernel) { + std::vector ret; + auto all_tvs = ir_utils::allTvs(kernel); // Collect extent and integer inputs - for (auto val : fusion->usedMathVals()) { - auto kir_val = lower->lowerValue(val); - if (auto kir_tv = dynamic_cast(kir_val)) { - for (auto id : kir_tv->domain()->domain()) { - ret.push_back(id->extent()); - } - } else if (val->isFusionInput()) { - if (kir_val->isA()) { - ret.push_back(kir_val); - } + for (auto tv : all_tvs) { + for (auto id : tv->domain()->domain()) { + ret.push_back(id->extent()); + } + } + for (auto inp : kernel->inputs()) { + if (inp->isA()) { + ret.push_back(inp); } } - // Collect allocation sizes: - collectBufferSizes(ret, lower->kernel()->topLevelExprs()); - + collectBufferSizes(ret, kernel->topLevelExprs()); return makeSortedEvaluationList(ret); } -//! Fusion IR utility, collects all the fusionIR symbolic -//! integers we will need at runtime, i.e. after the -//! generated cuda kernel has already been compiled. -//! The values are to be used for runtime logic, like -//! `canSchedule` in heuristic look up. 
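A rough usage sketch of the kernel-side workspace collected above (hypothetical helper name, assuming the KernelPrecomputedIntegers interface declared in evaluator_common.h further down in this patch, and that getMaybeValueFor yields an optional 64-bit integer as the int64_t bindings in this file suggest):

// Builds the workspace once from the lowered kernel, binds concrete sizes from
// the runtime inputs, runs the integer machine, then queries a single extent.
int64_t evaluateExtentOrDefault(
    kir::Kernel* kernel,
    const at::ArrayRef<c10::IValue>& aten_inputs,
    Val* extent,
    int64_t fallback) {
  KernelPrecomputedIntegers precomputed(kernel);
  precomputed.bindKernelInputs(kernel, aten_inputs);
  precomputed.evaluate();
  auto maybe_value = precomputed.getMaybeValueFor(extent);
  return maybe_value.has_value() ? maybe_value.value() : fallback;
}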
+ std::vector collectRuntimeUsedIntegers(Fusion* fusion) { std::vector ret; - + auto all_tvs = ir_utils::allTvs(fusion); // Collect extent and integer inputs - for (auto val : fusion->usedMathVals()) { - if (auto tv = dynamic_cast(val)) { - for (auto id : tv->domain()->domain()) { - ret.push_back(id->extent()); - } - } else if (val->isFusionInput()) { - if (val->isA()) { - ret.push_back(val); - } + for (auto tv : all_tvs) { + for (auto id : tv->domain()->domain()) { + ret.push_back(id->extent()); + } + } + for (auto inp : fusion->inputs()) { + if (inp->isA()) { + ret.push_back(inp); } } - return makeSortedEvaluationList(ret); } @@ -140,7 +130,7 @@ std::vector collectRuntimeUsedIntegers(Fusion* fusion) { template void PrecomputedIntegersBase::initializeValueList( typename IRContext::EVALUATOR_TYPE& const_evaluator, - const std::vector& sorted_value_list) { + const std::vector& sorted_value_list) { // Initialize workspace num_of_values_ = sorted_value_list.size(); defined_ = std::vector(num_of_values_, false); @@ -161,7 +151,7 @@ void PrecomputedIntegersBase::initializeValueList( template c10::optional PrecomputedIntegersBase::getMaybeValueFor( - const IR_VAL* val) { + const Val* val) { auto index = val->evaluatorIndex(); if (index < 0) { return c10::nullopt; @@ -172,6 +162,17 @@ c10::optional PrecomputedIntegersBase::getMaybeValueFor( return values_[index]; } +template +void PrecomputedIntegersBase::print() const { + std::cout << "Precomputed Integers:\n"; + for (auto i : c10::irange(symbols_.size())) { + if (defined_[i]) { + std::cout << symbols_[i]->toInlineString() << " = " << values_[i] + << std::endl; + } + } +} + template void PrecomputedIntegersBase::evaluate() { FUSER_PERF_SCOPE("PrecomputedIntegers::Evaluate"); @@ -208,10 +209,9 @@ NaiveIntegerMachine::NaiveIntegerMachine( for (auto val : precomputed_integers_.symbols_) { auto def = val->definition(); if (def) { - if (auto uop = dynamic_cast(def)) { + if (auto uop = dynamic_cast(def)) { makeUnaryOp(uop); - } else if ( - auto bop = dynamic_cast(def)) { + } else if (auto bop = dynamic_cast(def)) { makeBinaryOp(bop); } else { TORCH_INTERNAL_ASSERT(false, "Unsupported expr"); @@ -234,8 +234,7 @@ void NaiveIntegerMachine::run() { } template -void NaiveIntegerMachine::makeUnaryOp( - typename IRContext::UNARY_OP_TYPE* uop) { +void NaiveIntegerMachine::makeUnaryOp(UnaryOp* uop) { int in = uop->inputs()[0]->evaluatorIndex(); int out = uop->outputs()[0]->evaluatorIndex(); TORCH_INTERNAL_ASSERT(in >= 0, "Integer Machine: unknown input: ", uop); @@ -249,8 +248,7 @@ void NaiveIntegerMachine::makeUnaryOp( } template -void NaiveIntegerMachine::makeBinaryOp( - typename IRContext::BINARY_OP_TYPE* bop) { +void NaiveIntegerMachine::makeBinaryOp(BinaryOp* bop) { int in0 = bop->inputs()[0]->evaluatorIndex(); int in1 = bop->inputs()[1]->evaluatorIndex(); int out = bop->outputs()[0]->evaluatorIndex(); @@ -377,11 +375,8 @@ void NaiveIntegerMachine::runBinaryOp(int index) { precomputed_integers_.defined_[dest_index] = true; } -KernelPrecomputedIntegers::KernelPrecomputedIntegers( - Fusion* fusion, - GpuLower& lower) - : lower_(&lower) { - loadSymbols(collectRuntimeUsedIntegers(fusion, lower_)); +KernelPrecomputedIntegers::KernelPrecomputedIntegers(kir::Kernel* kernel) { + loadSymbols(collectRuntimeUsedIntegers(kernel)); kir::ExpressionEvaluator evaluator; initializeValueList(evaluator, symbols()); initializeNamedScalars(); @@ -389,11 +384,11 @@ KernelPrecomputedIntegers::KernelPrecomputedIntegers( } void KernelPrecomputedIntegers::bindTensorMetaData( - 
kir::TensorView* tv, + TensorView* tv, const at::Tensor& at_tensor) { - std::vector> ret; + std::vector> ret; const auto root_domain = - kir::TensorDomain::noReductions(tv->domain()->rootDomain()); + TensorDomain::noReductions(tv->domain()->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( at_tensor.ndimension() == static_cast(root_domain.size()), "Something went wrong configuring launch. Inputs do not match."); @@ -411,7 +406,7 @@ namespace { //! and returns the corresponding parallel type if a match //! is found. c10::optional getMaybeThreadSizeParallelType( - kir::NamedScalar* named_scalar) { + NamedScalar* named_scalar) { auto& var_name = named_scalar->name(); for (auto ptype : kParallelTypeThreads) { if (var_name == stringifyThreadSize(ptype)) { @@ -425,7 +420,7 @@ c10::optional getMaybeThreadSizeParallelType( void KernelPrecomputedIntegers::initializeNamedScalars() { for (auto val : symbols()) { - if (auto named_scalar = dynamic_cast(val)) { + if (auto named_scalar = dynamic_cast(val)) { auto maybe_parallel_type = getMaybeThreadSizeParallelType(named_scalar); if (maybe_parallel_type.has_value()) { auto& index_list = @@ -440,17 +435,17 @@ void KernelPrecomputedIntegers::initializeNamedScalars() { } void KernelPrecomputedIntegers::bindKernelInputs( + kir::Kernel* kernel, const at::ArrayRef& aten_inputs) { if (hasValidValues()) { invalidate(); } - auto kernel = lower_->kernel(); const auto& inputs = kernel->inputs(); for (const auto i : c10::irange(inputs.size())) { const auto input = inputs[i]; - if (auto tensor_input = dynamic_cast(input)) { + if (auto tensor_input = dynamic_cast(input)) { const auto aten_tensor = aten_inputs[i].toTensor(); bindTensorMetaData(tensor_input, aten_tensor); } else if (input->isScalar() && input->dtype() == DataType::Int) { diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.h b/torch/csrc/jit/codegen/cuda/evaluator_common.h index 0c16e2a8b046..7cbe37c602b9 100644 --- a/torch/csrc/jit/codegen/cuda/evaluator_common.h +++ b/torch/csrc/jit/codegen/cuda/evaluator_common.h @@ -35,18 +35,14 @@ class ExpressionEvaluator; //! Context for using generic logic on FusionIR class FusionIRContext { public: - using VAL_TYPE = Val; - using EXPR_TYPE = Expr; using TV_TYPE = TensorView; using EVALUATOR_TYPE = ExpressionEvaluator; - using BINARY_OP_TYPE = BinaryOp; - using UNARY_OP_TYPE = UnaryOp; - static BinaryOpType getOpType(BINARY_OP_TYPE* bop) { + static BinaryOpType getOpType(BinaryOp* bop) { return bop->getBinaryOpType(); } - static UnaryOpType getOpType(UNARY_OP_TYPE* uop) { + static UnaryOpType getOpType(UnaryOp* uop) { return uop->getUnaryOpType(); } }; @@ -54,19 +50,14 @@ class FusionIRContext { //! Context for using generic logic on KernelIR class KernelIRContext { public: - using VAL_TYPE = kir::Val; - using EXPR_TYPE = kir::Expr; - using TV_TYPE = kir::TensorView; using EVALUATOR_TYPE = kir::ExpressionEvaluator; - using BINARY_OP_TYPE = kir::BinaryOp; - using UNARY_OP_TYPE = kir::UnaryOp; - static BinaryOpType getOpType(BINARY_OP_TYPE* bop) { - return bop->operation(); + static BinaryOpType getOpType(BinaryOp* bop) { + return bop->getBinaryOpType(); } - static UnaryOpType getOpType(UNARY_OP_TYPE* uop) { - return uop->operation(); + static UnaryOpType getOpType(UnaryOp* uop) { + return uop->getUnaryOpType(); } }; @@ -97,10 +88,10 @@ class NaiveIntegerMachine { private: //! Convert an unary IR expr to an instruction - void makeUnaryOp(typename IRContext::UNARY_OP_TYPE* uop); + void makeUnaryOp(UnaryOp* uop); //! 
Convert an binary IR expr to an instruction - void makeBinaryOp(typename IRContext::BINARY_OP_TYPE* bop); + void makeBinaryOp(BinaryOp* bop); //! Create an empty instruction with all default values //! and place it at the end of the instruction buffer. @@ -169,11 +160,6 @@ class NaiveIntegerMachine { //! integers and store them in the workspace ahead of time. template class PrecomputedIntegersBase { - using IR_UNARY_OP = typename IRContext::UNARY_OP_TYPE; - using IR_BINARY_OP = typename IRContext::BINARY_OP_TYPE; - using IR_VAL = typename IRContext::VAL_TYPE; - using IR_EXPR = typename IRContext::EXPR_TYPE; - using IR_TV = typename IRContext::TV_TYPE; using INTEGER_MACHINE = NaiveIntegerMachine; public: @@ -190,7 +176,10 @@ class PrecomputedIntegersBase { //! Returns value for the given IR node if it's stored //! in the workspace and has been evaluated. - c10::optional getMaybeValueFor(const IR_VAL* val); + c10::optional getMaybeValueFor(const Val* val); + + //! Debugging helper, prints all the currently known values + void print() const; protected: //! Initialize the workspace before first use. @@ -198,7 +187,7 @@ class PrecomputedIntegersBase { //! been topologically sorted. void initializeValueList( typename IRContext::EVALUATOR_TYPE& evaluator, - const std::vector& sorted_value_list); + const std::vector& sorted_value_list); //! Bind concrete value to the given index //! if the index is valid. @@ -215,12 +204,12 @@ class PrecomputedIntegersBase { void invalidate(); //! Interface for subclasses to access symbols_ - void loadSymbols(std::vector symbols) { + void loadSymbols(std::vector symbols) { symbols_ = std::move(symbols); } //! Interface for subclasses to access symbols_ - std::vector& symbols() { + std::vector& symbols() { return symbols_; } @@ -267,7 +256,7 @@ class PrecomputedIntegersBase { std::vector values_; //! Stores the IR nodes corresponding to each index. - std::vector symbols_; + std::vector symbols_; //! An internal log to keep track of all the bindings //! used in each evaluation cycle. To be used for @@ -308,12 +297,14 @@ class KernelPrecomputedIntegers public: using ParallelExtentMap = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; - KernelPrecomputedIntegers(Fusion* fusion, GpuLower& lower); + KernelPrecomputedIntegers(kir::Kernel* kernel); //! Bind concrete values from fusion runtime inputs - void bindKernelInputs(const at::ArrayRef& aten_inputs); + void bindKernelInputs( + kir::Kernel* kernel, + const at::ArrayRef& aten_inputs); //! Bind concrete values from launch constraints void bindParallelExtents( @@ -326,7 +317,7 @@ class KernelPrecomputedIntegers void bindConcreteParallelTypeValue(ParallelType pt, int64_t value); private: - void bindTensorMetaData(kir::TensorView* tv, const at::Tensor& at_tensor); + void bindTensorMetaData(TensorView* tv, const at::Tensor& at_tensor); //! Iterate through all the named scalars corresponding //! to thread sizes and pre-group them by their parallel @@ -334,8 +325,6 @@ class KernelPrecomputedIntegers void initializeNamedScalars(); private: - GpuLower* lower_ = nullptr; - //! Contains all the named scalars correspond //! to thread size of each parallel type. 
std::unordered_map>, TypeHash> diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 647cf4ec0e2f..98bbb9e0324e 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -1,3 +1,4 @@ + #include #include @@ -8,21 +9,12 @@ #include #include #include -#include #include #include #include +#include #include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#endif - #include #include #include @@ -65,6 +57,18 @@ typedef unsigned long long int uint64_t; )"; } +static const std::string& defineComplexTypes() { + static std::string result = std::string(R"ESCAPE( +#define POS_INFINITY __int_as_float(0x7f800000) +#define INFINITY POS_INFINITY +#define NEG_INFINITY __int_as_float(0xff800000) +#define NAN __int_as_float(0x7fffffff) +)ESCAPE") + + at::cuda::get_traits_string() + at::cuda::get_complex_body_string() + + at::cuda::get_cmath_string() + at::cuda::get_complex_math_string(); + return result; +} + } // namespace std::string FusionExecutor::getStructuredCode(const std::string& kernel) { @@ -79,7 +83,7 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) { #endif code += std::string("namespace ") + FusionExecutor::kernelNamespace() + " {\n" + defineIntegerTypes() + defineIndexMode(options_.index_mode) + - executor_utils::kernelPreamble() + kernel + "}\n"; + defineComplexTypes() + executor_utils::kernelPreamble() + kernel + "}\n"; if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) { std::cout << "\n======= Codegen output for kernel: " << kernelName() @@ -108,8 +112,6 @@ void FusionExecutor::debugCompileFusionFromStr( const std::string& name, int id, CompileOptions options) { - fusion_ = *fusion; - FusionGuard fg(&fusion_); options_ = options; if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) { @@ -126,11 +128,12 @@ void FusionExecutor::debugCompileFusionFromStr( << std::endl; } - setUsedTVs(); + lowered_ = std::make_unique(fusion); + const auto kernel = lowered_->kernel(); + fusion_ = lowered_->kernel(); fusion_id_ = id; - lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.kernel(); + setUsedTVs(); if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { kernel->print(); @@ -144,20 +147,21 @@ void FusionExecutor::debugCompileFusionFromStr( const auto static_smem_size = computeSharedMemory( static_evaluator, kernel_summary.static_smem_allocations); TORCH_INTERNAL_ASSERT( - static_smem_size < max_device_smem, + static_smem_size < max_static_smem_, "The static shared memory allocation is larger than available memory."); } - compiled_kernel_ = executor_utils::nvrtcCompile(code, name, fusion_id_); + std::tie(compiled_kernel_, last_compiler_log_) = + executor_utils::nvrtcCompile(code, name, fusion_id_); TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "assign a fusion_id_ <= 0 is not accepted."); } void FusionExecutor::compileFusion( Fusion* fusion, - CompileOptions options, const at::ArrayRef& inputs, - const LaunchParams& launch_constraints) { + const LaunchParams& launch_constraints, + CompileOptions options) { FUSER_PERF_SCOPE("compileFusion"); TORCH_INTERNAL_ASSERT( @@ -175,40 +179,50 @@ void FusionExecutor::compileFusion( fusion->printMath(); } - // Clone the fusion so we can store it - fusion_ = *fusion; - FusionGuard fg(&fusion_); options_ = options; c10::DeviceGuard dg(options_.device); TORCH_INTERNAL_ASSERT( - options.device.is_cuda(), "Provided device to CUDA fuser is the CPU."); - auto properties = 
at::cuda::getDeviceProperties(options.device.index()); - max_device_smem = properties->sharedMemPerBlock; + options_.device.is_cuda(), "Provided device to CUDA fuser is the CPU."); + auto properties = at::cuda::getDeviceProperties(options_.device.index()); + configured_device_smem_ = properties->sharedMemPerBlock; +#ifndef __HIP_PLATFORM_HCC__ + device_smem_limit_ = properties->sharedMemPerBlockOptin; +#else + // don't know if rocm supports opt-in shared memroy reconfiguration + device_smem_limit_ = properties->sharedMemPerBlock; +#endif warp_size_ = properties->warpSize; - setUsedTVs(); + lowered_ = std::make_unique( + fusion, + options_.index_mode == KernelIndexMode::INT64 ? DataType::Int + : DataType::Int32); + const auto kernel = lowered_->kernel(); + fusion_ = lowered_->kernel()->as(); fusion_id_ = ++fusion_id_counter_; - lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.kernel(); + setUsedTVs(); if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { kernel->print(); } - const auto kernel_code = codegen::generateCudaKernel(kernel, kernelName()); - const auto structured_code = getStructuredCode(kernel_code); + kernel_code_ = codegen::generateCudaKernel(kernel, kernelName()); + const auto structured_code = getStructuredCode(kernel_code_); const auto& kernel_summary = kernel->summary(); + // We currently shouldn't allocate any more shared mem + // tensors statically but could keep this path if + // needed in later development. if (!kernel_summary.static_smem_allocations.empty()) { kir::ExpressionEvaluator static_evaluator; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) const auto static_smem_size = computeSharedMemory( static_evaluator, kernel_summary.static_smem_allocations); TORCH_INTERNAL_ASSERT( - static_smem_size < max_device_smem, + static_smem_size < max_static_smem_, "The static shared memory allocation is larger than available memory."); } @@ -216,7 +230,7 @@ void FusionExecutor::compileFusion( std::stringstream ss; ss << "Allocations must be based on constant integers for local memory. However, found: "; for (auto alloc : kernel_summary.dynamic_lmem_allocations) { - ss << toString(alloc->buffer(), false) << ", "; + ss << alloc->buffer()->toString() << ", "; } ss << " have dynamic allocations but are placed in local memory."; TORCH_INTERNAL_ASSERT(false, ss.str()); @@ -233,20 +247,32 @@ void FusionExecutor::compileFusion( block_size > 0, "launch param inferred block size < 0"); } - compiled_kernel_ = executor_utils::nvrtcCompile( + block_size_high_water_mark = + block_size.has_value() ? block_size.value() : block_size_high_water_mark; + std::tie(compiled_kernel_, last_compiler_log_) = executor_utils::nvrtcCompile( structured_code, (kernelNamespace() + "::" + kernelName()).c_str(), fusion_id_, block_size); TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "failed to assign a fusion_id_ after compilation."); + +#ifndef __HIP_PLATFORM_HCC__ + // The driver API call requires an int argument. 
+ int max_dynamic_smem = 0; + AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncGetAttribute( + &max_dynamic_smem, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + compiled_kernel_.function)); + maybe_available_dynamic_smem_ = max_dynamic_smem; +#endif } namespace { at::Tensor inferAndAlloc( - const kir::TensorView* tv, - const std::vector& sizes, + const TensorView* tv, + const std::vector& sizes, kir::ExpressionEvaluator& expr_eval, const CompileOptions& options, bool zero_init = false) { @@ -260,9 +286,11 @@ at::Tensor inferAndAlloc( TORCH_INTERNAL_ASSERT( inferred_val.has_value(), "Could not launch kernel as program could not infer ", - kir::toString(size), - " for the buffer ", - kir::toString(tv)); + size->toString(), + "(", + size->name(), + ") for the buffer ", + tv->toString()); inferred_sizes.push_back(inferred_val.value()); } @@ -283,19 +311,20 @@ at::Tensor inferAndAlloc( } at::Tensor inferAndAllocOutput( - const kir::TensorView* tv, + const TensorView* tv, kir::ExpressionEvaluator& expr_eval, const CompileOptions& options, bool zero_init = false) { const auto domain = tv->domain(); - const auto maybe_rfactor_domain = - domain->hasRFactor() ? domain->rfactorDomain() : domain->rootDomain(); + const auto maybe_rfactor_domain = domain->hasRFactor() + ? domain->getRFactorDomain() + : domain->getRootDomain(); - std::vector sizes; + std::vector sizes; for (const auto id : maybe_rfactor_domain) { if (id->isReduction() || id->isStride() || - id->iterType() == IterType::BroadcastWithoutStride) { + id->getIterType() == IterType::BroadcastWithoutStride) { continue; } sizes.push_back(id->extent()); @@ -321,7 +350,8 @@ uint64_t FusionExecutor::computeSharedMemory( const uint64_t data_size = dataTypeSize(smem_alloc->buffer()->dtype()); // Add padding to align dynamic shared memory if (align_padding) { - total = ceilDiv(total, data_size) * data_size; + const int align_size = 16; // always align to 16B/128b. 
+ total = ceilDiv(total, align_size) * align_size; } total += inferred_val.value() * data_size; } else { @@ -348,8 +378,7 @@ LaunchParams FusionExecutor::computeLaunchParams( auto data_cache = compileTimeDataCache(); - auto& lower = lowered_; - + auto lower = lowered_.get(); auto& used_tvs = getUsedTVs(); auto parallel_binding_ids_entry = executor_utils::caching::ExecutorCompileTimeEntry< @@ -364,9 +393,8 @@ LaunchParams FusionExecutor::computeLaunchParams( auto parallel_iter_extent_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::ParallelIterExtentMap>( - data_cache, [¶llel_binding_ids, &lower]() { - return executor_utils::getParallelIterExtents( - lower, parallel_binding_ids); + data_cache, [¶llel_binding_ids]() { + return executor_utils::getParallelIterExtents(parallel_binding_ids); }); auto& parallel_iter_extents = parallel_iter_extent_entry.get(); @@ -385,7 +413,7 @@ LaunchParams FusionExecutor::computeLaunchParams( executor_utils::caching::WarpPaddedParallelExtents>( data_cache, [¶llel_binding_ids, &lower]() { return executor_utils::getWarpPaddedExtentsInfo( - lower, parallel_binding_ids); + lower->kernel(), parallel_binding_ids); }); auto& warp_padded_extent_set = warp_padded_parallel_entry.get().warp_padded_extent_set; @@ -446,7 +474,9 @@ LaunchParams FusionExecutor::computeLaunchParams( auto val = expr_eval.evaluate(extent); TORCH_INTERNAL_ASSERT( val.has_value(), - "Tried to evaluate the extent of ", + "Tried to evaluate the extent, ", + extent->toInlineString(), + " for the ptype: ", p_type, " to set launch bounds but could not."); @@ -471,8 +501,12 @@ LaunchParams FusionExecutor::computeLaunchParams( } maximum_value = std::max(maximum_value, *val); } - expr_eval.bind(p_type, maximum_value); - launch_params.bind(maximum_value, p_type); + // Protect for size-0 tensors, they still have a value so would prefer to + // bind nothing than 0 + if (maximum_value > 0) { + expr_eval.bind(p_type, maximum_value); + launch_params.bind(maximum_value, p_type); + } } // Re-run the integer machine with all @@ -481,14 +515,15 @@ LaunchParams FusionExecutor::computeLaunchParams( expr_eval.precomputedIntegers()->evaluate(); } - const auto kernel = lowered_.kernel(); + const auto kernel = lowered_->kernel(); const auto& kernel_summary = kernel->summary(); // Calculate Dynamic Shared Memory Size // Add workspace for reduction and broadcast uint64_t reduction_broadcast_workspace = 0; const bool has_workspace = kernel_summary.has_block_reductions || - kernel_summary.has_grid_reductions || kernel_summary.has_block_broadcasts; + kernel_summary.has_grid_reductions || + kernel_summary.has_block_broadcasts || kernel_summary.has_grid_broadcasts; if (has_workspace && kernel_summary.largest_smem_data_type != DataType::Null) { // Not using nThreads here since it does not handle uninitialized value @@ -511,19 +546,35 @@ LaunchParams FusionExecutor::computeLaunchParams( true, reduction_broadcast_workspace); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - const uint64_t static_smem_size = - computeSharedMemory(expr_eval, kernel_summary.static_smem_allocations); + // Check that requested smem size can be dynamically allocated. + // This check is only done once a kernel has been compiled, since + // maybe_available_dynamic_smem_ needs to be evaluated on + // a compiled kernel. + if (maybe_available_dynamic_smem_.has_value()) { + // Dynamic shared memory space that we can allocate without + // carving more space from L1. 
+ const uint64_t available_dynamic_smem_without_reconfiguration = + maybe_available_dynamic_smem_.value(); + // Maximum additional shared memory size we could request + // if we do re-configuration. + const uint64_t additional_dynamic_smem_available_through_reconfiguration = + device_smem_limit_ - configured_device_smem_; + + TORCH_INTERNAL_ASSERT( + (dynamic_smem_size) < + (available_dynamic_smem_without_reconfiguration + + additional_dynamic_smem_available_through_reconfiguration), + "The total shared memory allocation is larger than available memory.", + " Dynamic size: ", + dynamic_smem_size, + ". Available size: ", + maybe_available_dynamic_smem_.value(), + ". Configured smem size: ", + configured_device_smem_, + ". Device limit size: ", + device_smem_limit_); + } - TORCH_INTERNAL_ASSERT( - (dynamic_smem_size + static_smem_size) < max_device_smem, - "The total shared memory allocation is larger than available memory.", - " Dynamic size: ", - dynamic_smem_size, - ". Static size: ", - static_smem_size, - ". Available size: ", - max_device_smem); launch_params.setSmem(dynamic_smem_size); return launch_params; @@ -533,14 +584,14 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( kir::ExpressionEvaluator& expr_eval) { FUSER_PERF_SCOPE("FusionExecutor::AllocGlobalVals"); GlobalBuffers global_buffers; - const auto kernel = lowered_.kernel(); - const auto& kernel_summary = lowered_.kernel()->summary(); + const auto kernel = lowered_->kernel(); + const auto& kernel_summary = kernel->summary(); for (auto alloc : kernel_summary.global_allocations) { TORCH_INTERNAL_ASSERT( - alloc->buffer()->isA(), + alloc->buffer()->isA(), "Cannot allocate global buffers that are not tensors."); - auto tv = alloc->buffer()->as(); - if (kernel->isOutput(tv)) { + auto tv = alloc->buffer()->as(); + if (tv->isFusionOutput()) { continue; } if (alloc->zeroInit()) { @@ -558,30 +609,48 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( } std::vector FusionExecutor::allocOutputs( + const at::ArrayRef& inputs, kir::ExpressionEvaluator& expr_eval, const std::unordered_set& alias_indices) { FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs"); - const auto kernel = lowered_.kernel(); + const auto kernel = lowered_->kernel(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector outputs; - for (const auto i : c10::irange(kernel->outputs().size())) { - TORCH_INTERNAL_ASSERT( - kernel->outputs()[i]->isA(), - "Cannot allocate outputs that are not tensors."); - auto output = kernel->outputs()[i]->as(); - if (alias_indices.count(i) == 0) { - outputs.push_back( - inferAndAllocOutput(output, expr_eval, options_, false)); + for (const auto out_i : c10::irange(kernel->outputs().size())) { + // Dummy output. 
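The assertion above compares the requested dynamic shared memory against two budgets: what the compiled kernel can already receive, and what could still be gained by reconfiguring the shared-memory/L1 split. A condensed standalone sketch of that check is shown below; the parameter names are hypothetical, and in the patch the corresponding values come from cuFuncGetAttribute and the device properties (their initialization is not part of this hunk):

#include <cstdint>

// True when `requested` dynamic smem fits within what the kernel can use now
// plus what a smem/L1 reconfiguration could still provide. Assumes
// device_smem_limit >= configured_device_smem.
bool fitsDynamicSmemBudget(
    uint64_t requested,
    uint64_t available_without_reconfiguration, // queried from the compiled kernel
    uint64_t configured_device_smem,            // current per-block smem configuration
    uint64_t device_smem_limit) {               // absolute per-block smem limit
  const uint64_t additional_through_reconfiguration =
      device_smem_limit - configured_device_smem;
  return requested <
      available_without_reconfiguration + additional_through_reconfiguration;
}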
+ if (kernel->outputs()[out_i]->isFusionInput()) { + for (auto inp_i : c10::irange(kernel->inputs().size())) { + if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) { + TORCH_INTERNAL_ASSERT( + inp_i < inputs.size(), + "Issue with an input showing up as output, couldn't find input."); + TORCH_INTERNAL_ASSERT( + inputs[inp_i].isTensor(), + "Cannot register a scalar as an output in a fusion."); + outputs.push_back(inputs[inp_i].toTensor()); + break; + } + } } else { - // aliasing to inputs, no need to allocate real output - outputs.push_back(inferAndAlloc(output, {}, expr_eval, options_, false)); + TORCH_INTERNAL_ASSERT( + kernel->outputs()[out_i]->isA(), + "Cannot allocate outputs that are not tensors."); + auto output = kernel->outputs()[out_i]->as(); + if (alias_indices.count(out_i) == 0) { + outputs.push_back( + inferAndAllocOutput(output, expr_eval, options_, false)); + } else { + // aliasing to inputs, no need to allocate real output + outputs.push_back( + inferAndAlloc(output, {}, expr_eval, options_, false)); + } } } return outputs; } void FusionExecutor::setUsedTVs() { - auto used_vals = fusion_.usedMathVals(); + auto used_vals = fusion_->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); used_tvs_.clear(); @@ -595,24 +664,43 @@ std::vector FusionExecutor::runFusion( const LaunchParams& launch_constraints, const c10::optional& opt_code) { FUSER_PERF_SCOPE("FusionExecutor::RunFusion"); - + TORCH_INTERNAL_ASSERT(compiled()); TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "Cannot run fusion, it was not compiled."); TORCH_INTERNAL_ASSERT( !opt_code.has_value() || outputs.empty(), "short cut input cache is not compatible with pre-allocated output"); + if (isDebugDumpEnabled(DebugDumpOption::FusionArgs)) { + std::cout << "Arguments for fusion" << fusion_id_ << ":" << std::endl + << "Inputs:" << std::endl; + for (const auto& input : inputs) { + if (input.isTensor()) { + const auto& input_tensor = input.toTensor(); + std::cout << " " << input_tensor.scalar_type() << " " + << input.toTensor().sizes() + << " (strides = " << input.toTensor().strides() << ")" + << std::endl; + } + } + std::cout << "Outputs:" << std::endl; + for (const auto& output : outputs) { + std::cout << " " << output.scalar_type() << " " << output.sizes() + << " (strides = " << output.strides() << ")" << std::endl; + } + std::cout << launch_constraints.toString(); + } + ExecutorEntry* executor_entry = nullptr; if (opt_code.has_value()) { executor_entry = &executor_entry_lookup_[*opt_code]; } - FusionGuard fg(&fusion_); c10::DeviceGuard dg(options_.device); auto stream = at::cuda::getCurrentCUDAStream(); executor_utils::initializeCudaContext(); - - LaunchParams launch_params; + TORCH_INTERNAL_ASSERT(lowered_); + launch_params_ = LaunchParams(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector allocated_outputs = outputs; GlobalBuffers global_buffers; @@ -623,7 +711,7 @@ std::vector FusionExecutor::runFusion( // context manager to disable auto grad for `empty_cuda` calls later at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; // take the short-cut for launch if we see a recorded input set again - launch_params = executor_entry->launch_params; + launch_params_ = executor_entry->launch_params; // only allocate outputs when not given if (outputs.empty()) { FUSER_PERF_SCOPE("ExecutorRunFusion::OutputAlloc"); @@ -642,7 +730,7 @@ std::vector FusionExecutor::runFusion( } } else { TORCH_INTERNAL_ASSERT( - outputs.size() == fusion_.outputs().size(), + outputs.size() == fusion_->outputs().size(), 
__func__, " provided number of outputs does match fusion output"); } @@ -672,38 +760,55 @@ std::vector FusionExecutor::runFusion( // code path to take when either: // 1. no opt_code is provided or // 2. `executor_entry` is not initialized - executor_utils::validateKernelInputs(&fusion_, inputs, options_.device); + executor_utils::validateKernelInputs(fusion_, inputs, options_.device); if (!evaluator_precomputed_integers_) { evaluator_precomputed_integers_ = - std::make_unique(&fusion_, lowered_); + std::make_unique(lowered_->kernel()); } kir::ExpressionEvaluator expr_eval; - evaluator_precomputed_integers_->bindKernelInputs(inputs); + evaluator_precomputed_integers_->bindKernelInputs( + lowered_->kernel(), inputs); expr_eval.precomputedIntegers() = evaluator_precomputed_integers_.get(); - launch_params = + launch_params_ = computeLaunchParams(launch_constraints, expr_eval, warp_size_); + // Recompile the kernel if the number of threads in the block has increased + if (launch_params_.nThreads() > block_size_high_water_mark) { + const auto kernel = lowered_->kernel(); + kernel_code_ = codegen::generateCudaKernel(kernel, kernelName()); + const auto structured_code = getStructuredCode(kernel_code_); + block_size_high_water_mark = launch_params_.nThreads(); + + std::tie(compiled_kernel_, last_compiler_log_) = + executor_utils::nvrtcCompile( + structured_code, + (kernelNamespace() + "::" + kernelName()).c_str(), + fusion_id_, + block_size_high_water_mark); + } + if (kernel()->summary().has_cooperative_grid_reduction) { #ifndef __HIP_PLATFORM_HCC__ int num_blocks_per_SM = -1; at::globalContext().getNVRTC().cuOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks_per_SM, compiled_kernel_.function, - (int)(launch_params.bdimx() * launch_params.bdimy() * launch_params.bdimz()), - (size_t)launch_params.smem()); + (int)(launch_params_.bdimx() * launch_params_.bdimy() * launch_params_.bdimz()), + (size_t)launch_params_.smem()); TORCH_INTERNAL_ASSERT( (int64_t)( num_blocks_per_SM * at::cuda::getDeviceProperties(options_.device.index()) - ->multiProcessorCount) >= launch_params.gdimx() * - launch_params.gdimy() * launch_params.gdimz(), + ->multiProcessorCount) >= launch_params_.gdimx() * + launch_params_.gdimy() * launch_params_.gdimz(), "Wanted to launch a cooperative kernel, however the number of blocks is greater than ", "what can be resident on the GPU at once. 
Need: ", - launch_params.gdimx() * launch_params.gdimy() * launch_params.gdimz(), + launch_params_.gdimx() * launch_params_.gdimy() * + launch_params_.gdimz(), " but limited to ", num_blocks_per_SM, " * ", @@ -716,16 +821,18 @@ std::vector FusionExecutor::runFusion( } executor_utils::validateVectorizedTensors( - &fusion_, inputs, outputs, lowered_, compileTimeDataCache(), expr_eval); - - auto& fusion = fusion_; + lowered_.get()->kernel(), + inputs, + outputs, + compileTimeDataCache(), + expr_eval); auto alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::InputAliasIndices>( - compileTimeDataCache(), [&fusion]() { + compileTimeDataCache(), [&]() { return std::make_unique>>( - fusion.getInputAliasIndices()); + fusion_->getInputAliasIndices()); }); auto& alias_indices = alias_indices_entry.get(); @@ -736,14 +843,14 @@ std::vector FusionExecutor::runFusion( auto output_alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::OutputAliasIndices>( - compileTimeDataCache(), [&fusion]() { + compileTimeDataCache(), [&]() { return std::make_unique>( - fusion.getOutputAliasIndices()); + fusion_->getOutputAliasIndices()); }); auto& output_alias_indices = output_alias_indices_entry.get(); - allocated_outputs = allocOutputs(expr_eval, output_alias_indices); + allocated_outputs = allocOutputs(inputs, expr_eval, output_alias_indices); for (const auto& entry : alias_indices) { TORCH_INTERNAL_ASSERT( @@ -753,7 +860,7 @@ std::vector FusionExecutor::runFusion( } else { // TODO: Update this as well; executor_utils::validateKernelOutputs( - &fusion_, allocated_outputs, options_.device); + fusion_, allocated_outputs, options_.device); } global_buffers = allocGlobalVals(expr_eval); @@ -768,7 +875,7 @@ std::vector FusionExecutor::runFusion( rand_offset = 4 * (std::ceil( allocated_outputs[0].numel() / - (4.0 * 128 * launch_params.gdimx())) + // NOLINT + (4.0 * 128 * launch_params_.gdimx())) + // NOLINT 1); } @@ -777,7 +884,7 @@ std::vector FusionExecutor::runFusion( if (executor_entry) { FUSER_PERF_SCOPE("ExecutorRunFusion::FillCacheEntry"); // record the the short-cut executor entry for the given input set; - executor_entry->launch_params = launch_params; + executor_entry->launch_params = launch_params_; executor_entry->io_alias_indices = alias_indices; for (const auto& output : allocated_outputs) { executor_entry->output_sizes.push_back(output.sizes().vec()); @@ -802,28 +909,31 @@ std::vector FusionExecutor::runFusion( kernel_arguments.push(inputs); kernel_arguments.push(allocated_outputs); kernel_arguments.push(global_buffers.buffers); - if (lowered_.kernel()->summary().is_stochastic) { + if (lowered_->kernel()->summary().is_stochastic) { kernel_arguments.appendPhiloxRNGSeed(rand_offset); } } if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { - launch_params.print(); + launch_params_.print(); } - if (isDebugDumpEnabled(DebugDumpOption::PrintRuntimeArgs)) { + if (isDebugDumpEnabled(DebugDumpOption::KernelArgs)) { std::cout << "Arguments for kernel" << fusion_id_ << ":" << std::endl << "Inputs:" << std::endl; for (const auto& input : inputs) { if (input.isTensor()) { - std::cout << input.toTensor().scalar_type() << " " - << input.toTensor().sizes() << std::endl; + const auto& input_tensor = input.toTensor(); + std::cout << " " << input_tensor.scalar_type() << " " + << input.toTensor().sizes() + << " (strides = " << input.toTensor().strides() << ")" + << std::endl; } } std::cout << "Outputs:" << std::endl; for (const auto& 
output : allocated_outputs) { std::cout << " " << output.scalar_type() << " " << output.sizes() - << std::endl; + << " (strides = " << output.strides() << ")" << std::endl; } std::cout << "Reduction and semaphore buffers:" << std::endl; for (const auto& buffer : global_buffers.buffers) { @@ -836,24 +946,38 @@ std::vector FusionExecutor::runFusion( cudaEvent_t finish_event = {}; if (measure_kernel_time_ || - isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { + isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) || + isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { cudaEventCreate(&start_event); cudaEventCreate(&finish_event); cudaEventRecord(start_event); } if (execute_kernel_) { + if (maybe_available_dynamic_smem_.has_value() && + launch_params_.smem() > maybe_available_dynamic_smem_.value()) { +#ifndef __HIP_PLATFORM_HCC__ + // Increase limit of dynamic shared memory if needed. + AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuFuncSetAttribute( + compiled_kernel_.function, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + launch_params_.smem())); +#else + TORCH_INTERNAL_ASSERT( + false, "cuFuncSetAttribute not supported with HIP."); +#endif + } if (!kernel()->summary().has_cooperative_grid_reduction) { FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel"); AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuLaunchKernel( compiled_kernel_.function, - launch_params.gdimx(), - launch_params.gdimy(), - launch_params.gdimz(), - launch_params.bdimx(), - launch_params.bdimy(), - launch_params.bdimz(), - launch_params.smem(), + launch_params_.gdimx(), + launch_params_.gdimy(), + launch_params_.gdimz(), + launch_params_.bdimx(), + launch_params_.bdimy(), + launch_params_.bdimz(), + launch_params_.smem(), stream, kernel_arguments.getBuffer(), nullptr)); @@ -863,13 +987,13 @@ std::vector FusionExecutor::runFusion( AT_CUDA_DRIVER_CHECK( at::globalContext().getNVRTC().cuLaunchCooperativeKernel( compiled_kernel_.function, - launch_params.gdimx(), - launch_params.gdimy(), - launch_params.gdimz(), - launch_params.bdimx(), - launch_params.bdimy(), - launch_params.bdimz(), - launch_params.smem(), + launch_params_.gdimx(), + launch_params_.gdimy(), + launch_params_.gdimz(), + launch_params_.bdimx(), + launch_params_.bdimy(), + launch_params_.bdimz(), + launch_params_.smem(), stream, kernel_arguments.getBuffer())); #else @@ -880,7 +1004,8 @@ std::vector FusionExecutor::runFusion( } if (measure_kernel_time_ || - isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { + isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) || + isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { cudaEventRecord(finish_event); cudaEventSynchronize(start_event); cudaEventSynchronize(finish_event); @@ -888,21 +1013,23 @@ std::vector FusionExecutor::runFusion( cudaEventDestroy(start_event); cudaEventDestroy(finish_event); - if (isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { - size_t bytes = 0; - // Figure how many bytes are inputs, outputs, and temporary buffers - for (auto input : inputs) { - if (input.isTensor()) { - bytes += input.toTensor().numel() * - dataTypeSize(aten_to_data_type(input.toTensor().scalar_type())); - } - } - for (const auto& output : allocated_outputs) { - bytes += output.numel() * - dataTypeSize(aten_to_data_type(output.scalar_type())); + bytes_processed_ = 0; + // Figure how many bytes are inputs, outputs, and temporary buffers + for (auto input : inputs) { + if (input.isTensor()) { + bytes_processed_ += input.toTensor().numel() * + 
dataTypeSize(aten_to_data_type(input.toTensor().scalar_type())); } + } + for (const auto& output : allocated_outputs) { + bytes_processed_ += output.numel() * + dataTypeSize(aten_to_data_type(output.scalar_type())); + } + + if (isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth)) { double gb_per_s = - ((double)bytes / ((double)kernel_time_ms_ / 1000)) / (double)1.0e9; + ((double)bytes_processed_ / ((double)kernel_time_ms_ / 1000)) / + (double)1.0e9; std::cout << "kernel" << fusion_id_ << " run in " << kernel_time_ms_ << " ms, achieved: " << gb_per_s << " GB/s" << std::endl; } @@ -924,7 +1051,9 @@ void FusionExecutor::compileRtc( } fusion_id_ = 1; options_ = CompileOptions(); - compiled_kernel_ = executor_utils::nvrtcCompile(scode, name, fusion_id_); + + std::tie(compiled_kernel_, last_compiler_log_) = + executor_utils::nvrtcCompile(scode, name, fusion_id_); } void FusionExecutor::runRtc( diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 523f2aa0e4b2..ab5175b22a13 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -35,9 +35,9 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { void compileFusion( Fusion* fusion, - CompileOptions options = CompileOptions(), const at::ArrayRef& inputs = {}, - const LaunchParams& launch_constraints = LaunchParams()); + const LaunchParams& launch_constraints = LaunchParams(), + CompileOptions options = CompileOptions()); std::vector runFusion( const at::ArrayRef& inputs, @@ -55,7 +55,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { // function to query whether a `FusionExecutor` has a compiled kernel to // execute bool compiled() const { - return fusion_id_ != -1; + return fusion_id_ != -1 && lowered_; }; void evictCache(size_t cache_id) { @@ -85,7 +85,8 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { executor_utils::caching::ExecutorCompileTimeInfoCache; kir::Kernel* kernel() const { - return lowered_.kernel(); + TORCH_INTERNAL_ASSERT(lowered_); + return lowered_->kernel(); } //! Internal knob used for debugging/profiling only @@ -107,6 +108,32 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { return measure_kernel_time_ ? kernel_time_ms_ : 0; } + //! Returns the number of bytes processed last kernel execution + int64_t bytesProcessed() const { + return bytes_processed_; + } + + //! Returns the launch parameters from the last kernel execution + LaunchParams lastLaunchParams() const { + return launch_params_; + } + + //! Returns the string of the compiled kernel + std::string kernelString() const { + return kernel_code_; + } + + //! Returns the latest compile log + std::string compilerLog() const { + return last_compiler_log_; + } + + std::string kernelName() const { + std::stringstream ss; + ss << "kernel" << fusion_id_; + return ss.str(); + } + //! Internal tests only. Compiles CUDA code with NVRTC directly from //! string. This util provides a path to test runtime code, i.e. the resource //! strings. 
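Two driver-API patterns used in the launch path above are worth spelling out: raising a kernel's dynamic shared-memory limit only when the requested size exceeds the attribute queried at compile time, and bounding a cooperative launch by the number of blocks that can be resident at once. The sketch below uses cuda.h directly with plain error returns, whereas the patch routes the same calls through at::globalContext().getNVRTC() and AT_CUDA_DRIVER_CHECK; the helper names here are illustrative:

#include <cstdint>
#include <cuda.h>

// Opt in to more dynamic shared memory only when a launch actually needs it.
CUresult maybeRaiseDynamicSmemLimit(CUfunction func, int requested_smem_bytes) {
  int current_limit = 0;
  CUresult err = cuFuncGetAttribute(
      &current_limit, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, func);
  if (err != CUDA_SUCCESS || requested_smem_bytes <= current_limit) {
    return err;
  }
  return cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, requested_smem_bytes);
}

// Upper bound on resident blocks for a cooperative launch: occupancy per SM
// times the SM count must cover the whole grid, or the launch is rejected.
CUresult maxResidentBlocks(
    CUfunction func,
    int block_size,
    size_t dynamic_smem_bytes,
    int sm_count,
    int64_t* out_max_blocks) {
  int blocks_per_sm = 0;
  CUresult err = cuOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, func, block_size, dynamic_smem_bytes);
  *out_max_blocks = static_cast<int64_t>(blocks_per_sm) * sm_count;
  return err;
}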
@@ -132,12 +159,6 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { std::vector zero_init; }; - std::string kernelName() const { - std::stringstream ss; - ss << "kernel" << fusion_id_; - return ss.str(); - } - static std::string kernelNamespace() { return "CudaCodeGen"; } @@ -164,6 +185,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { // skip allocating real storage for those, but still maintain its spot to // maintain the indexing from output aliases to inputs std::vector allocOutputs( + const at::ArrayRef& inputs, kir::ExpressionEvaluator& expr_eval, const std::unordered_set& alias_indices = {}); @@ -178,10 +200,24 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { } private: - Fusion fusion_; - CompileOptions options_; - size_t max_device_smem = std::numeric_limits().max(); + + //! Current configured total shared mem size from cudaDeviceProp + size_t configured_device_smem_ = std::numeric_limits().max(); + + //! Available shared memory space for dynamic allocation for the current + //! compiled kernel at the current shared memory/L1 configuration + c10::optional maybe_available_dynamic_smem_ = c10::nullopt; + + //! Absolute limit of all available shared mem space from cudaDeviceProp + size_t device_smem_limit_ = std::numeric_limits().max(); + + // Assuming sm70 or above: + // limit of statically allocated smem is 48 KB: + // See: + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x + const int max_static_smem_ = 48 << 10; int warp_size_ = 0; executor_utils::NvrtcFunction compiled_kernel_; @@ -192,12 +228,28 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { int fusion_id_ = -1; static int fusion_id_counter_; - GpuLower lowered_; + std::unique_ptr lowered_; + // Copy of lowered_->kernel() + Fusion* fusion_ = nullptr; + + // Track the block size this kernel was compiled with. If the block size + // increases, recompile to adjust maxregister count. + int64_t block_size_high_water_mark = 1; // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. std::unordered_map executor_entry_lookup_; + // Compile time information caching. This is used for shape inference + // support. The cache stores graph information that are available + // without shape information so that each shape inference call will + // not need to re-compute them. + ExecutorCompileTimeInfoCache compile_time_info_cache_; + + // Cached expr eval + std::unique_ptr evaluator_precomputed_integers_ = + nullptr; + // Profiling support: knob to control wheter we actually execute the // kernel on the GPU or not bool execute_kernel_ = true; @@ -205,21 +257,24 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { // Profiling support: knob to enable measuring kernel execution time bool measure_kernel_time_ = false; - // The last kernel execution time, if measure_kernel_time_ is true + // Profiling support: the last kernel execution time, if measure_kernel_time_ + // is true float kernel_time_ms_ = 0; + // Profiling support: the last kernel Bytes processed + int64_t bytes_processed_ = 0; + + // Profiling support: the last launch param used + LaunchParams launch_params_; + // Profiling support: knob to disable caching of launch params bool disable_parameter_cache_ = false; - // Compile time information caching. This is used for shape inference - // support. 
The cache stores graph information that are available - // without shape information so that each shape inference call will - // not need to re-compute them. - ExecutorCompileTimeInfoCache compile_time_info_cache_; + // Profiling support: kept copy of the cuda kernel + std::string kernel_code_; - // Cached expr eval - std::unique_ptr evaluator_precomputed_integers_ = - nullptr; + // Profiling support: nvrtc log for debugging + std::string last_compiler_log_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp index 968570c1086d..da5667f9facc 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp @@ -1,4 +1,3 @@ -#include #include // Extract size and strides @@ -65,7 +64,7 @@ std::unique_ptr getTensorArg(int nDims) { false, "Tried to generate a tensor to run a generated kernel with ", nDims, - " dimensions, however it must be a size 0 to 8 dimensional tensor."); + " dimensions, however only 0 to 8 dimensional tensor are supported."); } return nullptr; } @@ -89,6 +88,10 @@ std::unique_ptr getTensorArg( return getTensorArg(nDims); case c10::ScalarType::Int: return getTensorArg(nDims); + case c10::ScalarType::ComplexFloat: + return getTensorArg, INDEX_MODE>(nDims); + case c10::ScalarType::ComplexDouble: + return getTensorArg, INDEX_MODE>(nDims); default: TORCH_CHECK( false, @@ -98,8 +101,6 @@ std::unique_ptr getTensorArg( } } -} // namespace - std::unique_ptr getTensorArg( c10::ScalarType dtype, int nDims, @@ -117,20 +118,73 @@ std::unique_ptr getTensorArg( return nullptr; } +} // namespace + // Push a tensor to the arguments void KernelArgumentHolder::push(const at::Tensor& tensor) { changed_ = true; - int nDims = tensor.ndimension(); - - c10::ScalarType dtype = tensor.scalar_type(); - std::unique_ptr tensor_arg = - getTensorArg(dtype, nDims, index_mode_); - tensor_arg->setPointer(tensor.data_ptr()); - for (const auto i : c10::irange(nDims)) { - tensor_arg->setSize(i, tensor.sizes()[i]); - tensor_arg->setStride(i, tensor.strides()[i]); + if (is_cpu_scalar(tensor)) { + switch (tensor.scalar_type()) { + case c10::ScalarType::Double: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Float: + arguments_.push_back( + std::make_unique>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Half: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::BFloat16: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Bool: + arguments_.push_back( + std::make_unique>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Long: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Int: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + default: + TORCH_CHECK( + false, + "Dtype: ", + tensor.scalar_type(), + " not currently supported in code generated kernels."); + } + } else { + int nDims = tensor.ndimension(); + + c10::ScalarType dtype = tensor.scalar_type(); + std::unique_ptr tensor_arg = + getTensorArg(dtype, nDims, index_mode_); + tensor_arg->setPointer(tensor.data_ptr()); + for (const auto i : c10::irange(nDims)) { + tensor_arg->setSize(i, tensor.sizes()[i]); + tensor_arg->setStride(i, 
tensor.strides()[i]); + } + arguments_.push_back(std::move(tensor_arg)); } - arguments_.push_back(std::move(tensor_arg)); } // Push a scalar or integer to the arguments @@ -143,6 +197,10 @@ void KernelArgumentHolder::push(const IValue& val) { auto scalar_val = val.toScalar(); switch (scalar_val.type()) { // NOLINTNEXTLINE(bugprone-branch-clone) + case c10::ScalarType::ComplexDouble: + arguments_.push_back( + std::make_unique(scalar_val.toComplexDouble())); + return; case c10::ScalarType::Double: arguments_.push_back(std::make_unique(scalar_val.toDouble())); return; diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h index d306683c43dc..c135328a3acc 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -18,10 +19,8 @@ struct TensorArgCodegen { }; T* data; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) - nvfuser_index_t size[N]; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) - nvfuser_index_t stride[N]; + std::array size; + std::array stride; constexpr int nDims() { return N; } @@ -33,6 +32,7 @@ struct TensorArgCodegen { } }; +// 0-Dim GPU based tensor template struct TensorArgCodegen { T& operator[](nvfuser_index_t ind) { @@ -51,6 +51,17 @@ struct TensorArgCodegen { } }; +// Specialization for 0-dim case that's easy to pass in a CPU based tensor +// without memcpy +template +struct CpuScalarTensorCodegen { + T& operator[](int) { + return data; + }; + + T data; +}; + struct ArgAbstract { virtual ~ArgAbstract() = default; virtual void* arg() = 0; @@ -59,35 +70,39 @@ struct ArgAbstract { struct PhiloxCudaStateArg : public ArgAbstract { at::PhiloxCudaState val_; PhiloxCudaStateArg(at::PhiloxCudaState _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + void* arg() override { return &val_; } }; struct LongArg : public ArgAbstract { int64_t val_; - explicit LongArg(int64_t _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + explicit LongArg(int64_t _val) : val_(_val) {} + void* arg() override { return &val_; } }; struct DoubleArg : public ArgAbstract { double val_; - explicit DoubleArg(double _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + explicit DoubleArg(double _val) : val_(_val) {} + void* arg() override { + return &val_; + } +}; + +struct ComplexDoubleArg : public ArgAbstract { + c10::complex val_; + explicit ComplexDoubleArg(c10::complex _val) : val_(_val) {} + void* arg() override { return &val_; } }; struct BoolArg : public ArgAbstract { bool val_; - explicit BoolArg(bool _val) : val_(_val){}; - // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) - void* arg() { + explicit BoolArg(bool _val) : val_(_val) {} + void* arg() override { return &val_; } }; @@ -119,9 +134,20 @@ struct TensorArg : public TensorArgAbstract { } }; -std::unique_ptr getTensorArg( - c10::ScalarType dtype, - int nDims); +template +struct CpuScalarTensorArg : public ArgAbstract { + CPU_TENSOR_TYPE instance_; + + CpuScalarTensorArg() = delete; + + explicit CpuScalarTensorArg(decltype(CPU_TENSOR_TYPE::data) _data) { + instance_.data = _data; + } + + void* arg() override { + return 
&instance_; + } +}; class KernelArgumentHolder { public: diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 13cdc29099ed..ef3d48aeb234 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -5,21 +5,24 @@ #include #include +#include #include #include #include #include -#include +#include #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -27,6 +30,9 @@ #include #include #include +#include +#include +#include #include #include @@ -69,9 +75,12 @@ std::string kernelPreamble() { // Base classes and helpers ss << nvfuser_resources::tensor_cu; + ss << nvfuser_resources::type_traits_cu; + ss << nvfuser_resources::array_cu; ss << nvfuser_resources::random_numbers_cu; ss << nvfuser_resources::helpers_cu; ss << nvfuser_resources::index_utils_cu; + ss << nvfuser_resources::tuple_cu; // Synchronization classes if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { @@ -88,6 +97,8 @@ ss << nvfuser_resources::broadcast_cu; ss << nvfuser_resources::welford_cu; ss << nvfuser_resources::warp_cu; + ss << nvfuser_resources::tensorcore_cu; + ss << nvfuser_resources::fused_reduction_cu; // Random utilities ss << nvfuser_resources::PhiloxCudaStateRaw_cu; @@ -110,13 +121,23 @@ bool validateKernelArgTensor( return false; } + if (is_cpu_scalar(arg) && !param->as()->isCpuScalar()) { + msg << "Argument is CPU Scalar Tensor, but parameter is not.\n"; + return false; + } + + if (!is_cpu_scalar(arg) && !arg.is_cuda()) { + msg << "Argument is a CPU tensor which is not supported in fusions.\n"; + return false; + } + // Check the rank of the tensors. size_t arg_dim = arg.dim(); // Note: This requires current Fusion to be active. // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t param_dim = - TensorDomain::noReductions(param->as()->getRootDomain()) - .size(); + size_t param_dim = TensorDomain::noReductions( + param->as()->getMaybeRFactorDomain()) + .size(); // see [Note - broadcast support in integration] // Because of broadcasting support handled in integration, we relax the rank // check as necessary. @@ -126,7 +147,7 @@ return false; } - if (arg.device() != device) { + if (!is_cpu_scalar(arg) && arg.device() != device) { msg << "Argument is on device that is not compiled for." << "\n"; return false; @@ -157,6 +178,12 @@ case at::ScalarType::Bool: match = param_data_type == DataType::Bool; break; + case at::ScalarType::ComplexFloat: + match = param_data_type == DataType::ComplexFloat; + break; + case at::ScalarType::ComplexDouble: + match = param_data_type == DataType::ComplexDouble; + break; default: msg << "Argument element type, " << arg_data_type << ", is not supported."
<< "\n"; @@ -184,6 +211,10 @@ bool validateKernelArgScalar( case c10::ScalarType::Long: match = param_type == DataType::Int || param_type == DataType::Int32; break; + case c10::ScalarType::ComplexDouble: + match = param_type == DataType::ComplexDouble || + param_type == DataType::ComplexFloat; + break; case c10::ScalarType::Double: match = param_type == DataType::Double || param_type == DataType::Float || param_type == DataType::Half || param_type == DataType::BFloat16; @@ -245,6 +276,10 @@ bool checkSameStride(const std::vector& tensors) { // Return true if all the tensors are contiguous and have the same striding bool checkSameContiguity(const std::vector& tensors) { + if (tensors.size() < 2) { + return true; + } + auto reference = tensors.front(); if (!reference.isTensor()) { return false; } @@ -277,6 +312,7 @@ bool checkValidMisalignedTensors( // Only check input tensors return checkSameStride(inp_tensors); } else if (!out_tv.empty() && out_tensors.empty()) { + // out_tensors is empty unless outputs are given to runFusion. // Assume out tensors are contiguous return checkSameContiguity(inp_tensors); } else { @@ -339,146 +375,289 @@ void validateKernelOutputs( !mismatch, "Found one or more invalid arguments: ", msg.str()); } -bool canVectorize(const IValue& aten_val, int word_size) { - if (!aten_val.isTensor()) { - return false; - } +namespace { - const auto& aten_tensor = aten_val.toTensor(); +// Finds a fusion input or output tensor to validate its strides +// for vectorization. +// Returns pairs, each consisting of a flag indicating it's a fusion input +// and an integer position within the input or output tensor list. +std::vector> getVectorizedFusionInputOutput( + TensorView* producer_tv, + TensorView* consumer_tv, + Fusion* fusion) { + std::vector> vectorized_input_output; - if (reinterpret_cast(aten_tensor.data_ptr()) % - (word_size * aten_tensor.dtype().itemsize()) != - 0) { - return false; - } + // When the producer is a fusion input, validate only the producer + // and assume the consumer is contiguous. Similarly, when the + // consumer is a fusion output, validate the consumer and assume the + // producer is contiguous. - for (size_t i = aten_tensor.ndimension(); i > 0; i--) { - if (aten_tensor.size(i - 1) != 1) { - if (aten_tensor.size(aten_tensor.ndimension() - 1) % word_size != 0 || - aten_tensor.stride(aten_tensor.ndimension() - 1) != 1) { - return false; - } - break; - } + if (producer_tv->isFusionInput()) { + auto producer_it = std::find( + fusion->inputs().begin(), fusion->inputs().end(), producer_tv); + TORCH_INTERNAL_ASSERT( + producer_it != fusion->inputs().end(), + "Could not find ", + producer_tv, + " in fusion inputs."); + auto pos = std::distance(fusion->inputs().begin(), producer_it); + vectorized_input_output.push_back( + std::make_pair(true, static_cast(pos))); + } else { + // If not fusion input, assume it's fully contiguous, so nothing + // to check with respect to strides.
+ TORCH_INTERNAL_ASSERT( + std::all_of( + producer_tv->domain()->contiguity().begin(), + producer_tv->domain()->contiguity().end(), + [](bool contig) { return contig; }), + "Unsupported pattern of vectorization: ", + consumer_tv->definition()->toString()); } - for (auto stride : aten_tensor.strides()) { - if (stride != 1 && stride % word_size != 0) { - return false; - } + if (consumer_tv->isFusionOutput()) { + auto consumer_it = std::find( + fusion->outputs().begin(), fusion->outputs().end(), consumer_tv); + TORCH_INTERNAL_ASSERT( + consumer_it != fusion->outputs().end(), + "Could not find ", + consumer_tv, + " in fusion outputs."); + auto pos = std::distance(fusion->outputs().begin(), consumer_it); + vectorized_input_output.push_back( + std::make_pair(false, static_cast(pos))); + } else { + // If not fusion input, assume it's fully contiguous, so nothing + // to check with respect to strides. + TORCH_INTERNAL_ASSERT( + std::all_of( + consumer_tv->domain()->contiguity().begin(), + consumer_tv->domain()->contiguity().end(), + [](bool contig) { return contig; }), + "Unsupported pattern of vectorization: ", + consumer_tv->definition()->toString()); } - return true; + return vectorized_input_output; } -bool canVectorize( - TensorView* fusion_tv, - int word_size, - GpuLower& lower, - kir::ExpressionEvaluator& expr_eval) { - IterDomain* last_root_dim = nullptr; - // TODO: Should this be rfactor instead of root?? - for (size_t i = fusion_tv->getRootDomain().size(); i > 0; i--) { - auto r_id = fusion_tv->getRootDomain()[i - 1]; - if (r_id->isReduction() || r_id->isBroadcast()) { +//! Returns the information of vectorized input/output tensors +//! in the given fusion. +std::unique_ptr getVectorizedTensorValidationInfo( + kir::Kernel* kernel) { + auto vectorized_tensor_info_ptr = + std::make_unique(); + + for (const auto& vector_info : kernel->summary().vectorized_set_info) { + auto consumer_tv = vector_info.consumer_tv; + auto producer_tv = vector_info.producer_tv; + + auto vector_dim = vector_info.vectorized_leaf_id; + const auto is_aligned = + vector_dim->getParallelType() == ParallelType::Vectorize; + + // Find fusion inputs and outputs that are used with misaligned + // vectorization. + if (!is_aligned) { + TORCH_INTERNAL_ASSERT( + producer_tv->isFusionInput() || consumer_tv->isFusionOutput(), + "MisalignedVectorize is assumed to be used with either input or output tensor"); + if (consumer_tv->getMemoryType() == MemoryType::Global && + producer_tv->getMemoryType() == MemoryType::Local) { + vectorized_tensor_info_ptr->global_out_misaligned_tv.insert( + consumer_tv); + } else if ( + producer_tv->getMemoryType() == MemoryType::Global && + consumer_tv->getMemoryType() == MemoryType::Local) { + vectorized_tensor_info_ptr->global_inp_misaligned_tv.insert( + producer_tv); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Unsupported memory configuration for misaligned vectorization."); + } + } + + // Collect information on corresponding fusion input and output + // tensors to verify strides. + auto inp_or_out_info = + getVectorizedFusionInputOutput(producer_tv, consumer_tv, kernel); + + // If both producer and consumer are contig and intermediate, + // nothing to validate with respect to strides. 
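As a side note on the classification above: misaligned vectorization is only recognized for a global producer read into registers, or a register buffer written to a global output; any other memory configuration is rejected. A hypothetical standalone classifier mirroring that rule (the enums here are local to the sketch, not the fuser's own types):

#include <stdexcept>

enum class MemType { Local, Shared, Global };
enum class MisalignedVecKind { GlobalInput, GlobalOutput };

// Mirrors the producer/consumer memory-type rule for misaligned vectorization.
inline MisalignedVecKind classifyMisalignedVectorize(
    MemType producer, MemType consumer) {
  if (producer == MemType::Global && consumer == MemType::Local) {
    return MisalignedVecKind::GlobalInput; // global -> registers
  }
  if (producer == MemType::Local && consumer == MemType::Global) {
    return MisalignedVecKind::GlobalOutput; // registers -> global
  }
  throw std::runtime_error(
      "Unsupported memory configuration for misaligned vectorization.");
}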
+ if (inp_or_out_info.empty()) { continue; } - last_root_dim = r_id; - break; - } - if (last_root_dim == nullptr) { - return false; - } + // Misaligned vectorize only allows from input to local or local + // to output + if (!is_aligned) { + TORCH_INTERNAL_ASSERT(inp_or_out_info.size() == 1); + } - auto last_dim_size = - expr_eval.evaluate(lower.lowerValue(last_root_dim->extent())); + for (const auto& inp_or_out : inp_or_out_info) { + const bool is_input = inp_or_out.first; + const int pos = inp_or_out.second; - if (!last_dim_size.has_value()) { - return false; + if (is_aligned) { + auto& pos_list = is_input + ? vectorized_tensor_info_ptr->aligned_vectorized_inp_tensor_pos + : vectorized_tensor_info_ptr->aligned_vectorized_out_tensor_pos; + pos_list.push_back(pos); + } else { + auto& map = is_input + ? vectorized_tensor_info_ptr->inp_misaligned_tensors_pos + : vectorized_tensor_info_ptr->out_misaligned_tensors_pos; + map.emplace_back(pos); + } + } } - if (last_dim_size.value() % word_size != 0) { - return false; + return vectorized_tensor_info_ptr; +} + +// Make sure the root domain(s) comprising the vectorized leaf domain +// have the (merged) extent that is divisible by the vectorization +// word size. +void validateAlignedVectorizeExtents( + const VectorizedSetInfo& info, + kir::ExpressionEvaluator& expr_eval) { + TORCH_INTERNAL_ASSERT( + !info.contig_root_ids.empty(), + "No root ID found for vectorization with ", + info.consumer_tv->toString(), + " and ", + info.producer_tv->toString()); + + int64_t vectorized_merged_domain_extent = 1; + for (auto id : info.contig_root_ids) { + auto extent_val = expr_eval.evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + extent_val.has_value(), + "Error vectorizing, ", + info.consumer_tv->toString(), + " as the extent of a vectorized root domain, ", + id->toString(), + ", is unknown."); + vectorized_merged_domain_extent *= extent_val.value(); } - return true; + TORCH_INTERNAL_ASSERT( + vectorized_merged_domain_extent % info.word_size == 0, + "Error vectorizing, ", + info.consumer_tv->toString(), + " as the extent of the indexed domain, ", + vectorized_merged_domain_extent, + ", is not divisible by vector word size ", + info.word_size); } -namespace { +void validateAlignedVectorizedFusionInputOutput( + const IValue& aten_val, + int word_size, + TensorView* tv) { + TORCH_INTERNAL_ASSERT(aten_val.isTensor()); -// Check if there's any split that is non-divisible and vectorized. If -// found, Vectorize is illegal. -void validateVectorizedSplits( - kir::Kernel* kernel, - kir::ExpressionEvaluator& expr_eval) { - for (const auto& extent_factor : kernel->summary().splits_to_validate) { - auto input_extent = expr_eval.evaluate(extent_factor.first); - auto split_factor = expr_eval.evaluate(extent_factor.second); - TORCH_INTERNAL_ASSERT( - input_extent.has_value(), - "Could not check if a split with vectorization is divisible because the extent, ", - kir::toString(extent_factor.first), - ", is not possible to evaluate."); - TORCH_INTERNAL_ASSERT( - input_extent.has_value(), - "Could not check if a split with vectorization is divisible because the split factor, ", - kir::toString(extent_factor.second), - ", is not possible to evaluate."); + const auto& aten_tensor = aten_val.toTensor(); + + TORCH_INTERNAL_ASSERT( + reinterpret_cast(aten_tensor.data_ptr()) % + (word_size * aten_tensor.dtype().itemsize()) == + 0, + "Vectorization of ", + tv->toString(), + " not possible as the memory address is not aligned. 
", + "Address: ", + aten_tensor.data_ptr(), + ", vector word size: ", + word_size, + ", data type: ", + aten_tensor.dtype()); + + // Traverse strides from the right-most domains. The rightmost + // domain must have stride 1. + int64_t cur_contig_stride = 1; + bool still_rightmost = true; + for (auto i = aten_tensor.ndimension() - 1; i >= 0; --i) { + const auto stride = aten_tensor.strides().at(i); + // If this domain is contiguous, then not necessary to check the + // stride. Otherwise, stride must be 1 if it's rightmost or + // divisible by word_size. TORCH_INTERNAL_ASSERT( - input_extent.value() % split_factor.value() == 0, - "Non-divisible split with vectorization is detected. ", - "Extent: ", - input_extent.value(), - ". Factor: ", - split_factor.value()); + stride == cur_contig_stride || (still_rightmost && stride == 1) || + (!still_rightmost && stride % word_size == 0), + "Vectorization of ", + tv->toString(), + " with word size ", + word_size, + " not possible due to invalid stride.", + " Domain: ", + tv->axis(i)->toString(), + ", stride: ", + stride) + // If the domain is size-1, the next domain is still considered + // rightmost. + const auto size = aten_tensor.sizes().at(i); + still_rightmost = still_rightmost && size == 1; + cur_contig_stride = stride * size; } } -} // namespace - -// Misaligned vectorization check. Currently misaligned vectorization is limited -// to global-register and register-global load/store patterns. However, this -// could be improved to include shared memory. -void validateVectorizedTensors( - Fusion* fusion, +void validateAlignedVectorizedTensors( + kir::Kernel* kernel, const at::ArrayRef& inputs, const std::vector& outputs, - GpuLower& lower, caching::ExecutorCompileTimeInfoCache* data_cache, kir::ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); - auto tensor_vectorization_validation_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::VectorizedTensorValidation>( - data_cache, [fusion, &lower]() { - return executor_utils::getVectorizedTensorValidationInfo( - fusion, lower); + data_cache, [kernel]() { + return executor_utils::getVectorizedTensorValidationInfo(kernel); }); - // Validate all the canVectorizes: - for (auto it : tensor_vectorization_validation_entry.get() - .inp_pos_to_word_size_map_to_verify) { - TORCH_INTERNAL_ASSERT( - canVectorize(inputs[it.first], it.second), - "Error vectorizing, ", - fusion->inputs()[it.first], - " as input provided does not allowed vectorization by word size, ", - it.second); + // Verify extents of aligned vectorized tensors + for (const auto& vec_info : kernel->summary().vectorized_set_info) { + if (vec_info.vectorized_leaf_id->getParallelType() == + ParallelType::Vectorize) { + validateAlignedVectorizeExtents(vec_info, expr_eval); + } } - if (outputs.size() > 0) { - for (auto it : tensor_vectorization_validation_entry.get() - .out_pos_to_word_size_map_to_verify) { - TORCH_INTERNAL_ASSERT( - canVectorize(outputs[it.first], it.second), - "Error vectorizing, ", - fusion->outputs()[it.first], - " as output provided does not allowed vectorization by word size, ", - it.second); + // Validate input and output tensors with aligend + // vectorization. 
+ for (auto pos : tensor_vectorization_validation_entry.get() + .aligned_vectorized_inp_tensor_pos) { + auto tv = kernel->inputs().at(pos)->as(); + auto word_size = kernel->summary().vectorized_accesses.at(tv); + validateAlignedVectorizedFusionInputOutput(inputs[pos], word_size, tv); + } + + if (!outputs.empty()) { + for (auto pos : tensor_vectorization_validation_entry.get() + .aligned_vectorized_out_tensor_pos) { + auto tv = kernel->outputs().at(pos)->as(); + auto word_size = kernel->summary().vectorized_accesses.at(tv); + validateAlignedVectorizedFusionInputOutput(outputs[pos], word_size, tv); } } +} + +// Misaligned vectorization check. Currently misaligned vectorization is limited +// to global-register and register-global load/store patterns. However, this +// could be improved to include shared memory. +void validateMisalignedVectorizedTensors( + kir::Kernel* kernel, + const at::ArrayRef& inputs, + const std::vector& outputs, + caching::ExecutorCompileTimeInfoCache* data_cache, + kir::ExpressionEvaluator& expr_eval) { + auto tensor_vectorization_validation_entry = + executor_utils::caching::ExecutorCompileTimeEntry< + executor_utils::caching::VectorizedTensorValidation>( + data_cache, [kernel]() { + return executor_utils::getVectorizedTensorValidationInfo(kernel); + }); std::vector inp_misaligned_tensors; std::vector out_misaligned_tensors; @@ -510,8 +689,53 @@ void validateVectorizedTensors( inp_misaligned_tensors, out_misaligned_tensors), "All global tensors must have the same stride for misaligned vectorization."); +} - validateVectorizedSplits(lower.kernel(), expr_eval); +// Check if there's any split that is non-divisible and vectorized. If +// found, Vectorize is illegal. +void validateVectorizedSplits( + kir::Kernel* kernel, + kir::ExpressionEvaluator& expr_eval) { + for (const auto& extent_factor : kernel->summary().splits_to_validate) { + auto input_extent = expr_eval.evaluate(extent_factor.first); + auto split_factor = expr_eval.evaluate(extent_factor.second); + TORCH_INTERNAL_ASSERT( + input_extent.has_value(), + "Could not check if a split with vectorization is divisible because the extent, ", + extent_factor.first->toString(), + ", is not possible to evaluate."); + TORCH_INTERNAL_ASSERT( + input_extent.has_value(), + "Could not check if a split with vectorization is divisible because the split factor, ", + extent_factor.second->toString(), + ", is not possible to evaluate."); + TORCH_INTERNAL_ASSERT( + input_extent.value() % split_factor.value() == 0, + "Non-divisible split with vectorization is detected. ", + "Extent: ", + input_extent.value(), + ". 
Factor: ", + split_factor.value()); + } +} + +} // namespace + +void validateVectorizedTensors( + kir::Kernel* kernel, + const at::ArrayRef& inputs, + const std::vector& outputs, + caching::ExecutorCompileTimeInfoCache* data_cache, + kir::ExpressionEvaluator& expr_eval) { + FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); + + validateAlignedVectorizedTensors( + kernel, inputs, outputs, data_cache, expr_eval); + + validateMisalignedVectorizedTensors( + kernel, inputs, outputs, data_cache, expr_eval); + + validateVectorizedSplits(kernel, expr_eval); } kir::ExpressionEvaluator bindKernelInputs( @@ -530,15 +754,15 @@ kir::ExpressionEvaluator bindKernelInputs( for (const auto i : c10::irange(inputs.size())) { const auto input = inputs[i]; - if (auto tensor_input = dynamic_cast(input)) { + if (auto tensor_input = dynamic_cast(input)) { TORCH_INTERNAL_ASSERT( aten_inputs[i].isTensor(), "Something went wrong configuring launch. Inputs no longer match at index:", i); const auto aten_tensor = aten_inputs[i].toTensor(); - const auto root_domain = - kir::TensorDomain::noReductions(tensor_input->domain()->rootDomain()); + const auto root_domain = TensorDomain::noReductions( + tensor_input->domain()->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( aten_tensor.ndimension() == static_cast(root_domain.size()), "Something went wrong configuring launch. Inputs no longer match."); @@ -546,6 +770,11 @@ kir::ExpressionEvaluator bindKernelInputs( for (const auto dim : c10::irange(root_domain.size())) { const auto extent = root_domain[dim]->extent(); const auto value = aten_tensor.sizes()[dim]; + if (value == 0 && tensor_input->uses().empty()) { + // If there's no uses, ignore there's a size-0 dimension. + continue; + } + TORCH_INTERNAL_ASSERT(value != 0, "Cannot handle size-0 dimensions"); bool should_bind = true; if (check_consistency) { const auto prev_value = expr_eval.evaluate(extent); @@ -553,7 +782,7 @@ kir::ExpressionEvaluator bindKernelInputs( TORCH_CHECK( *prev_value == value, "Attempting to bind ", - kir::toString(extent), + extent->toString(), " to ", value, "but it's already set to ", @@ -561,14 +790,16 @@ kir::ExpressionEvaluator bindKernelInputs( should_bind = false; } } - if (should_bind && !extent->isConst()) { + if (should_bind && !extent->isConstScalar()) { expr_eval.bind(extent, value); } } // NOLINTNEXTLINE: https://bugs.llvm.org/show_bug.cgi?id=48525 } else if (input->isScalar() && input->dtype() == DataType::Int) { TORCH_INTERNAL_ASSERT( - aten_inputs[i].type()->kind() == c10::TypeKind::IntType); + aten_inputs[i].type()->kind() == c10::TypeKind::IntType, + "kernel expected Scalar Int inputs, but found", + aten_inputs[i].type()->str()); expr_eval.bind(input, aten_inputs[i].toInt()); } } @@ -599,14 +830,19 @@ ExpressionEvaluator bindFusionInputs( "Something went wrong configuring launch. Inputs do not match."); auto aten_tensor = aten_inputs[i].toTensor(); - auto root_dom = TensorDomain::noReductions(cg_tensor->getRootDomain()); + auto root_dom = + TensorDomain::noReductions(cg_tensor->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( aten_tensor.ndimension() == (int64_t)root_dom.size(), "Something went wrong configuring launch. Inputs do not match."); - for (const auto dim : c10::irange(root_dom.size())) { const auto extent = root_dom[dim]->extent(); const auto value = aten_tensor.sizes()[dim]; + if (value == 0 && cg_tensor->uses().empty()) { + // If there's no uses, ignore there's a size-0 dimension. 
+ continue; + } + TORCH_INTERNAL_ASSERT(value != 0, "Cannot handle size-0 dimensions"); const auto prev_value = evaluator.evaluate(extent); if (prev_value.has_value()) { TORCH_CHECK( @@ -625,7 +861,9 @@ ExpressionEvaluator bindFusionInputs( inputs[i]->getValType().value() == ValType::Scalar && inputs[i]->getDataType().value() == DataType::Int) { TORCH_INTERNAL_ASSERT( - aten_inputs[i].type()->kind() == c10::TypeKind::IntType); + aten_inputs[i].type()->kind() == c10::TypeKind::IntType, + "fusion expected Scalar Int inputs, but found", + aten_inputs[i].type()->str()); evaluator.bind(inputs[i], aten_inputs[i].toInt()); } } @@ -644,7 +882,7 @@ void initializeCudaContext() { } } -NvrtcFunction nvrtcCompile( +std::pair nvrtcCompile( const std::string& code, const std::string& func_name, int id, @@ -652,6 +890,8 @@ NvrtcFunction nvrtcCompile( FUSER_PERF_SCOPE("executor_utils::NVRTC"); initializeCudaContext(); + std::stringstream ptxas_log; + const auto prop = at::cuda::getCurrentDeviceProperties(); int major = 0, minor = 0; @@ -697,24 +937,19 @@ NvrtcFunction nvrtcCompile( "--std=c++14", compute.c_str(), "-default-device"}; #endif - const char* disable_fastmath = getenv("PYTORCH_NVFUSER_DISABLE_FASTMATH"); - if (!disable_fastmath || (atoi(disable_fastmath) == 0)) { - args.push_back("--use_fast_math"); - } else { - TORCH_WARN_ONCE( - "fast math disabled in nvfuser, try set `PYTORCH_NVFUSER_DISABLE_FASTMATH=0`"); - } - - const char* disable_fma = getenv("PYTORCH_NVFUSER_DISABLE_FMA"); - // int disable_fma_flag = disable_fma ? atoi(disable_fma) : 0; - if (disable_fma && atoi(disable_fma)) { + const bool disable_fma = isDisabled(DisableOption::Fma); #ifdef __HIP_PLATFORM_HCC__ + if (disable_fma) { TORCH_WARN_ONCE( "PYTORCH_CUDA_FUSER_DISABLE_FMA is not supported on ROCm, ignoring"); + } #else + if (disable_fma) { args.push_back("--fmad=false"); -#endif + } else { + args.push_back("--fmad=true"); } +#endif #ifndef NDEBUG // Add line info to generated kernels @@ -734,7 +969,8 @@ NvrtcFunction nvrtcCompile( std::vector info_log; unsigned int log_size = 8196; - if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog)) { + if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog) || + isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { // show register usage in compilation log if (compile_to_sass) { args.push_back("--ptxas-options"); @@ -796,14 +1032,20 @@ NvrtcFunction nvrtcCompile( // The maximum possible count allowed by ptxas is 255 max_register = static_cast( std::min(effective_max_reg_per_warp / warp_size, 255)); - if (compile_to_sass) { max_register_usage += std::to_string(max_register); + args.push_back("--ptxas-options"); args.push_back(max_register_usage.c_str()); } else { options.push_back(CU_JIT_MAX_REGISTERS); option_vals.push_back((void*)(intptr_t)max_register); } + + ptxas_log << "\nCompile options: "; + for (auto arg : args) { + ptxas_log << arg << " "; + } + ptxas_log << " ; block size=" << opt_block_size.value() << "\n"; } #endif @@ -816,26 +1058,21 @@ NvrtcFunction nvrtcCompile( const auto result = at::globalContext().getNVRTC().nvrtcCompileProgram( program, args.size(), args.data()); - if (result != NVRTC_SUCCESS) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t logsize; - at::globalContext().getNVRTC().nvrtcGetProgramLogSize(program, &logsize); - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - std::vector log(logsize); - at::globalContext().getNVRTC().nvrtcGetProgramLog(program, log.data()); + size_t logsize = 0; + 
at::globalContext().getNVRTC().nvrtcGetProgramLogSize(program, &logsize); + std::vector log(logsize); + at::globalContext().getNVRTC().nvrtcGetProgramLog(program, log.data()); + + if (result != NVRTC_SUCCESS) { TORCH_INTERNAL_ASSERT( false, code.c_str(), "\nCUDA NVRTC compile error: ", log.data()); - } else if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog)) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - size_t logsize; - at::globalContext().getNVRTC().nvrtcGetProgramLogSize(program, &logsize); - std::vector log(logsize); - at::globalContext().getNVRTC().nvrtcGetProgramLog(program, log.data()); + } + ptxas_log << log.data() << std::endl; + if (isDebugDumpEnabled(DebugDumpOption::PrintPtxasLog)) { std::cout << log.data() << std::endl; } - AT_CUDA_NVRTC_CHECK(result); } @@ -976,7 +1213,7 @@ NvrtcFunction nvrtcCompile( compiled_kernel_.module, lowered_kernel_name)); - return compiled_kernel_; + return {compiled_kernel_, ptxas_log.str()}; } namespace caching { @@ -1037,7 +1274,7 @@ template class ExecutorCompileTimeEntry; } // namespace caching std::vector getParallelBindingsIterDomains( - GpuLower& lower, + GpuLower* lower, const std::vector& used_tvs) { std::vector parallel_ids; for (auto tv : used_tvs) { @@ -1047,8 +1284,8 @@ std::vector getParallelBindingsIterDomains( // Want to keep the broadcast dimensions if they are not resolved // TODO: piping down the parallel dimension map here would // be helpful - auto& parallel_map = lower.caParallelMap(); - if (parallel_map.getConcreteMappedID(id) == id) { + if (lower->caMap()->getConcreteMappedID(id, IdMappingMode::LOOP) == + id) { parallel_ids.push_back(id); } } else { @@ -1062,46 +1299,46 @@ std::vector getParallelBindingsIterDomains( return parallel_ids; } +namespace { + void insertParallelExtent( - GpuLower& lower, IterDomain* binding_id, const std::unique_ptr& parallel_iter_extents_ptr) { - auto kir_extent = lower.lowerValue(binding_id->extent()); + auto extent = binding_id->extent(); const auto it = parallel_iter_extents_ptr->find(binding_id->getParallelType()); if (it != parallel_iter_extents_ptr->end()) { - it->second.push_back(kir_extent); + it->second.push_back(extent); } else { parallel_iter_extents_ptr->operator[](binding_id->getParallelType()) = { - kir_extent}; + extent}; } } +} // namespace + std::unique_ptr getParallelIterExtents( - GpuLower& lower, std::vector& parallel_binding_ids) { auto parallel_iter_extents_ptr = std::make_unique(); for (auto id : parallel_binding_ids) { - insertParallelExtent(lower, id, parallel_iter_extents_ptr); + insertParallelExtent(id, parallel_iter_extents_ptr); } return parallel_iter_extents_ptr; } std::unique_ptr getSimplifiedParallelIterExtents( - GpuLower& lower, + GpuLower* lower, std::vector& parallel_binding_ids) { auto parallel_iter_extents_ptr = std::make_unique(); - auto& parallel_map = lower.caParallelMap(); + const auto& ca_map = lower->caMap(); std::vector mapped; - bool is_tidx_warp_padded = lower.getWarpPaddedParallelInfo().is_tidx_padded; + bool is_tidx_warp_padded = lower->getWarpPaddedParallelInfo().is_tidx_padded; for (auto id : parallel_binding_ids) { if (std::any_of( - mapped.begin(), - mapped.end(), - [id, ¶llel_map](IterDomain* mapped_id) { - return parallel_map.areMapped(mapped_id, id); + mapped.begin(), mapped.end(), [id, &ca_map](IterDomain* mapped_id) { + return ca_map->areMapped(mapped_id, id, IdMappingMode::LOOP); })) { if (id->getParallelType() != ParallelType::TIDx || !is_tidx_warp_padded) { continue; @@ -1109,7 +1346,8 @@ std::unique_ptr 
getSimplifiedParallelIterExtents( } insertParallelExtent( - lower, parallel_map.getConcreteMappedID(id), parallel_iter_extents_ptr); + ca_map->getConcreteMappedID(id, IdMappingMode::LOOP), + parallel_iter_extents_ptr); mapped.push_back(id); } @@ -1117,7 +1355,7 @@ std::unique_ptr getSimplifiedParallelIterExtents( } std::unique_ptr getWarpPaddedExtentsInfo( - GpuLower& lower, + kir::Kernel* kernel, std::vector& parallel_binding_ids) { auto warp_padded_extent_info_ptr = std::make_unique(); @@ -1125,7 +1363,6 @@ std::unique_ptr getWarpPaddedExtentsInfo( warp_padded_extent_info_ptr->warp_padded_extent_set; auto& warp_padded_constant = warp_padded_extent_info_ptr->warp_padded_constant; - auto kernel = lower.kernel(); bool has_warp_reduction = kernel->getWarpPaddedParallelInfo().has_warp_reduction; @@ -1135,11 +1372,11 @@ std::unique_ptr getWarpPaddedExtentsInfo( if (has_warp_reduction) { if (id->hasPaddingToMultipleOfWarp() || kernel->isParallelTypePadded(id->getParallelType())) { - auto kir_extent = lower.lowerValue(id->extent()); - warp_padded_extent_set.insert(kir_extent); + auto extent = id->extent(); + warp_padded_extent_set.insert(extent); auto padded_value = id->getMaybeSizeAfterPadding(); if (padded_value.has_value()) { - warp_padded_constant[kir_extent] = padded_value.value(); + warp_padded_constant[extent] = padded_value.value(); } } } @@ -1147,122 +1384,6 @@ std::unique_ptr getWarpPaddedExtentsInfo( return warp_padded_extent_info_ptr; } -std::unique_ptr getVectorizedTensorValidationInfo( - Fusion* fusion, - GpuLower& lower) { - auto vectorized_tensor_info_ptr = - std::make_unique(); - auto& tv_to_vector_word_size = - vectorized_tensor_info_ptr->tv_to_vector_word_size; - auto& global_inp_misaligned_tv = - vectorized_tensor_info_ptr->global_inp_misaligned_tv; - auto& global_out_misaligned_tv = - vectorized_tensor_info_ptr->global_out_misaligned_tv; - - kir::ExpressionEvaluator expr_eval; - - // Find all vectorized tensors and their word size - for (auto expr : fusion->exprs()) { - if (!expr->isA() || - expr->as()->getUnaryOpType() != UnaryOpType::Set) { - continue; - } - auto uop = expr->as(); - if (!uop->out()->isA() || !uop->in()->isA()) { - continue; - } - auto out_tv = uop->out()->as(); - auto in_tv = uop->in()->as(); - IterDomain* vector_dim = nullptr; - for (auto id : out_tv->domain()->domain()) { - if (id->getParallelType() == ParallelType::Vectorize || - id->getParallelType() == ParallelType::MisalignedVectorize) { - TORCH_INTERNAL_ASSERT( - vector_dim == nullptr, - "Found multiple vectorized dimensions on tensor ", - out_tv); - vector_dim = id; - } - } - if (vector_dim == nullptr) { - continue; - } - auto vector_word_size = - expr_eval.evaluate(lower.lowerValue(vector_dim->extent())); - TORCH_INTERNAL_ASSERT( - vector_word_size.has_value(), - "Non constant vector dimension found in ", - out_tv); - tv_to_vector_word_size[out_tv] = vector_word_size.value(); - tv_to_vector_word_size[in_tv] = vector_word_size.value(); - - if (vector_dim->getParallelType() == ParallelType::MisalignedVectorize) { - if (out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Local) { - global_out_misaligned_tv.insert(out_tv); - } else if ( - in_tv->getMemoryType() == MemoryType::Global && - out_tv->getMemoryType() == MemoryType::Local) { - global_inp_misaligned_tv.insert(in_tv); - } else { - TORCH_INTERNAL_ASSERT( - false, - "Unsupported memory configuration for misaligned vectorization."); - } - } - } - - // Check striding information on input and outputs as well as 
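The de-duplication in getSimplifiedParallelIterExtents above keeps one extent per loop-mapped equivalence class. A generic, standalone sketch of that idiom, with an arbitrary predicate standing in for ca_map->areMapped(...) (placeholder types, illustrative names):

#include <algorithm>
#include <vector>

// Keep one representative per equivalence class, where equivalence is decided
// by a caller-supplied predicate equiv(a, b).
template <typename T, typename Equiv>
std::vector<T> uniqueByEquivalence(const std::vector<T>& in, Equiv equiv) {
  std::vector<T> reps;
  for (const T& x : in) {
    const bool seen = std::any_of(
        reps.begin(), reps.end(), [&](const T& r) { return equiv(r, x); });
    if (!seen) {
      reps.push_back(x);
    }
  }
  return reps;
}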
size information - // of all - auto& inp_misaligned_tensors_pos = - vectorized_tensor_info_ptr->inp_misaligned_tensors_pos; - auto& out_misaligned_tensors_pos = - vectorized_tensor_info_ptr->out_misaligned_tensors_pos; - auto& inp_pos_to_word_size_map_to_verify = - vectorized_tensor_info_ptr->inp_pos_to_word_size_map_to_verify; - auto& out_pos_to_word_size_map_to_verify = - vectorized_tensor_info_ptr->out_pos_to_word_size_map_to_verify; - - for (auto entry : tv_to_vector_word_size) { - auto tv = entry.first; - auto word_size = entry.second; - if (tv->isFusionInput()) { - auto inp_it = - std::find(fusion->inputs().begin(), fusion->inputs().end(), tv); - TORCH_INTERNAL_ASSERT( - inp_it != fusion->inputs().end(), - "Could not find ", - tv, - " in fusion inputs."); - auto inp_pos = std::distance(fusion->inputs().begin(), inp_it); - - if (global_inp_misaligned_tv.find(tv) != global_inp_misaligned_tv.end()) { - inp_misaligned_tensors_pos.emplace_back(inp_pos); - } else { - // Shouldn't visit same pos twice here, assert ? - inp_pos_to_word_size_map_to_verify[inp_pos] = word_size; - } - } else if (tv->isFusionOutput()) { - auto out_it = - std::find(fusion->outputs().begin(), fusion->outputs().end(), tv); - TORCH_INTERNAL_ASSERT( - out_it != fusion->outputs().end(), - "Could not find ", - tv, - " in provided fusion outputs."); - auto out_pos = std::distance(fusion->outputs().begin(), out_it); - - if (global_out_misaligned_tv.find(tv) != global_out_misaligned_tv.end()) { - out_misaligned_tensors_pos.emplace_back(out_pos); - } else { - out_pos_to_word_size_map_to_verify[out_pos] = word_size; - } - } - } - - return vectorized_tensor_info_ptr; -} - } // namespace executor_utils } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index d851be48991f..37817838f386 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -28,28 +28,16 @@ namespace executor_utils { // Include all the functions we might need in generated code std::string kernelPreamble(); -// TODO(kir): rewrite in terms of Kernel inputs void validateKernelInputs( Fusion* fusion, const at::ArrayRef& inputs, const c10::Device& device); -// TODO(kir): rewrite in terms of Kernel outputs void validateKernelOutputs( Fusion* fusion, const std::vector& outputs, const c10::Device& device); -// Returns if vectorizing the aten value by word size is possible -bool canVectorize(const IValue& aten_val, int word_size); - -// Returns if vectorizing the aten value by word size is possible -bool canVectorize( - TensorView* fusion_tv, - int word_size, - GpuLower& lower, - kir::ExpressionEvaluator& expr_eval); - //! 
Bind kernel input values to runtime values kir::ExpressionEvaluator bindKernelInputs( const at::ArrayRef& aten_inputs, @@ -67,7 +55,8 @@ struct NvrtcFunction { void initializeCudaContext(); -NvrtcFunction nvrtcCompile( +// Returns executable function and the ptxas log from compilation +std::pair nvrtcCompile( const std::string& code, const std::string& func_name, int id, @@ -112,7 +101,7 @@ class ParallelBindingIterDomains { class ParallelIterExtentMap { public: using DataType = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; static const CompileTimeEntryType EntryType = CompileTimeEntryType::PARALLEL_ITER_EXTENT_MAP; }; @@ -133,7 +122,7 @@ class ParallelIterExtentMap { class SimplifiedParallelIterExtentMap { public: using DataType = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; static const CompileTimeEntryType EntryType = CompileTimeEntryType::SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP; }; @@ -141,8 +130,8 @@ class SimplifiedParallelIterExtentMap { //! WarpPaddedExtentsInfo: //! Auxiliary data type for entry class WarpPaddedParallelExtents struct WarpPaddedExtentsInfo { - std::unordered_set warp_padded_extent_set; - std::unordered_map warp_padded_constant; + std::unordered_set warp_padded_extent_set; + std::unordered_map warp_padded_constant; }; //! Compile-time info to be cached in each FusionExecutor: @@ -159,13 +148,18 @@ class WarpPaddedParallelExtents { //! VectorizedTensorInfo: //! Auxiliary data type for entry class VectorizedTensorValidation struct VectorizedTensorInfo { + //! Aligned vectorized fusion inputs + std::vector aligned_vectorized_inp_tensor_pos; + //! Aligned vectorized fusion outputs + std::vector aligned_vectorized_out_tensor_pos; + //! Misaligned vectorized input tensors std::unordered_set global_inp_misaligned_tv; + //! Misaligned vectorized output tensors std::unordered_set global_out_misaligned_tv; - std::unordered_map tv_to_vector_word_size; + //! Positions of misaligned input tensors std::vector inp_misaligned_tensors_pos; + //! Positions of misaligned output tensors std::vector out_misaligned_tensors_pos; - std::unordered_map inp_pos_to_word_size_map_to_verify; - std::unordered_map out_pos_to_word_size_map_to_verify; }; //! Compile-time info to be cached in each FusionExecutor: @@ -284,42 +278,33 @@ class ExecutorCompileTimeEntry { //! Returns the vector of tensorviews that will be used to bind parallel //! dimensions. std::vector getParallelBindingsIterDomains( - GpuLower& lower, + GpuLower* lower, const std::vector& used_tvs); using ParallelExtentMap = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; //! Returns the extents of all parallel binding iterdomains corresponding //! to each parallel type. std::unique_ptr getParallelIterExtents( - GpuLower& lower, std::vector& parallel_binding_ids); //! Returns the simplified set of extents necessary for launch parameter //! binding. std::unique_ptr getSimplifiedParallelIterExtents( - GpuLower& lower, + GpuLower* lower, std::vector& parallel_binding_ids); //! Returns the symbolic or constant extetns of warp padded parallel //! iterdomains in the given vector. std::unique_ptr getWarpPaddedExtentsInfo( - GpuLower& lower, + kir::Kernel* lower, std::vector& parallel_binding_ids); -//! Returns the position information of vectorized input/output tensors -//! in the given fusion. 
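Since the header now declares nvrtcCompile as returning both the executable function and the ptxas log, call sites can unpack the two with structured bindings. A self-contained sketch of that return convention (placeholder types, not the real NvrtcFunction):

#include <iostream>
#include <string>
#include <utility>

struct CompiledKernel {
  void* function = nullptr; // stands in for the CUfunction handle
};

// Return the compiled artifact together with its build log.
std::pair<CompiledKernel, std::string> compileWithLog(const std::string& src) {
  CompiledKernel kernel;
  std::string log =
      "ptxas info    : compiled " + std::to_string(src.size()) + " bytes\n";
  return {kernel, log};
}

int main() {
  auto [kernel, log] = compileWithLog("__global__ void k() {}");
  (void)kernel; // only the log is used in this sketch
  std::cout << log;
  return 0;
}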
-std::unique_ptr getVectorizedTensorValidationInfo( - Fusion* fusion, - GpuLower& lower); - -// TODO(kir): rewrite in terms of Kernel tensors void validateVectorizedTensors( - Fusion* fusion, + kir::Kernel* kernel, const at::ArrayRef& inputs, const std::vector& outputs, - GpuLower& lower, caching::ExecutorCompileTimeInfoCache* data_cache, kir::ExpressionEvaluator& expr_eval); diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.h b/torch/csrc/jit/codegen/cuda/expr_evaluator.h index ced4b59a7831..5630743b6f69 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index d9d71e53c414..33cf499bc18b 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -8,10 +8,9 @@ #include #include #include +#include #include -#include - namespace torch { namespace jit { namespace fuser { @@ -31,19 +30,16 @@ FusionGuard::~FusionGuard() { Fusion* FusionGuard::getCurFusion() { return ACTIVE_FUSION; } +void FusionGuard::setCurFusion(Fusion* fusion) { + ACTIVE_FUSION = fusion; +} void swap(Fusion& a, Fusion& b) noexcept { FUSER_PERF_SCOPE("Fusion swap"); using std::swap; - // Swap the content - swap(a.val_set_, b.val_set_); - swap(a.expr_set_, b.expr_set_); - swap(a.val_deque_, b.val_deque_); - - swap(a.val_type_name_map_, b.val_type_name_map_); - swap(a.expr_name_counter_, b.expr_name_counter_); + swap(static_cast(a), static_cast(b)); swap(a.inputs_, b.inputs_); swap(a.outputs_, b.outputs_); @@ -51,27 +47,6 @@ void swap(Fusion& a, Fusion& b) noexcept { swap(a.io_alias_, b.io_alias_); swap(a.permuted_input_map_, b.permuted_input_map_); swap(a.permuted_output_map_, b.permuted_output_map_); - - // Fixup the Statement::fusion_ links for a - for (auto val : a.val_set_) { - val->fusion_ = &a; - } - for (auto expr : a.expr_set_) { - expr->fusion_ = &a; - } - - // Fixup the Statement::fusion_ links for b - for (auto val : b.val_set_) { - val->fusion_ = &b; - } - for (auto expr : b.expr_set_) { - expr->fusion_ = &b; - } -} - -Fusion::Fusion(const Fusion& other) { - FUSER_PERF_SCOPE("Fusion copy"); - Fusion::copy(&other, this); } std::unique_ptr Fusion::segment( @@ -82,30 +57,21 @@ std::unique_ptr Fusion::segment( IrCloner Fusion::copy(const Fusion* from, Fusion* to) { to->clear(); - IrCloner ir_cloner(to); + auto ir_cloner = IrContainer::copy(from, to); - for (auto val : from->val_set_) { - to->val_set_.insert(ir_cloner.clone(val)); - } - - for (auto expr : from->expr_set_) { - to->expr_set_.insert(ir_cloner.clone(expr)); - } - - for (auto val : from->val_deque_) { - to->val_deque_.push_back(ir_cloner.clone(val)); - } - - for (auto val : from->val_set_) { + for (auto val : from->vals_) { ir_cloner.clone(val)->setDefinition(ir_cloner.clone(val->definition_)); ir_cloner.clone(val)->setUses(ir_cloner.clone(val->uses_)); } - to->val_type_name_map_ = from->val_type_name_map_; - to->expr_name_counter_ = from->expr_name_counter_; - to->inputs_ = ir_cloner.clone(from->inputs_); to->outputs_ = ir_cloner.clone(from->outputs_); + for (auto inp : to->inputs_) { + inp->setIsFusionInput(true); + } + for (auto out : to->outputs_) { + out->setIsFusionOutput(true); + } // TODO: put this into ir_cloner instead for (const auto& entry : from->io_alias_) { @@ -117,9 +83,22 @@ IrCloner Fusion::copy(const Fusion* from, Fusion* to) { to->permuted_input_map_ = from->permuted_input_map_; 
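The rewritten swap(Fusion&, Fusion&) above forwards the base subobject (presumably via static_cast to IrContainer&) before swapping Fusion's own members. A minimal standalone sketch of that base-class swap idiom:

#include <utility>
#include <vector>

struct Base {
  std::vector<int> owned_nodes;
  friend void swap(Base& a, Base& b) noexcept {
    std::swap(a.owned_nodes, b.owned_nodes);
  }
};

struct Derived : Base {
  std::vector<int> inputs;
  std::vector<int> outputs;
  friend void swap(Derived& a, Derived& b) noexcept {
    // Swap the Base subobjects first, then the Derived-only members.
    swap(static_cast<Base&>(a), static_cast<Base&>(b));
    std::swap(a.inputs, b.inputs);
    std::swap(a.outputs, b.outputs);
  }
};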
to->permuted_output_map_ = from->permuted_output_map_; + to->all_tv_uses_valid_ = from->all_tv_uses_valid_; + // This should never be true on copy, but copying for completeness. + to->is_during_update_uses_ = from->is_during_update_uses_; + return ir_cloner; } +// Clang tidy complains when using default constructor for IrContainer instead +// of copy constructor. Fusion::copy has a call to IrContainer::copy, so it's +// redundant to use the IrContainer copy constructor, but it is harmless since +// Fusion::copy starts by calling clear(). +Fusion::Fusion(const Fusion& other) : IrContainer(other) { + FUSER_PERF_SCOPE("Fusion copy"); + Fusion::copy(&other, this); +} + Fusion::Fusion(Fusion&& other) noexcept { FUSER_PERF_SCOPE("Fusion move"); swap(*this, other); @@ -147,36 +126,22 @@ Fusion::~Fusion() { void Fusion::clear() noexcept { FUSER_PERF_SCOPE("Fusion clear"); - // Free the owned values - for (auto ptr : val_set_) { - delete ptr; - } - - // Free the owned expressions - for (auto ptr : expr_set_) { - delete ptr; - } - - val_set_.clear(); - val_deque_.clear(); - expr_set_.clear(); - - for (auto& kv : val_type_name_map_) { - kv.second = 0; - } - - expr_name_counter_ = 0; + IrContainer::clear(); inputs_.clear(); outputs_.clear(); io_alias_.clear(); + permuted_input_map_.clear(); permuted_output_map_.clear(); + + all_tv_uses_valid_ = false; + is_during_update_uses_ = false; } void Fusion::removeExpr(Expr* expr) { - assertInFusion(expr, "Cannot remove expr "); + assertInContainer(expr, "Cannot remove expr "); // If we hit this error too frequently, we could lighten the restrictions so // that removing something that doesn't exist simply does nothing. For now, // we're going with the strictest model which errors. @@ -194,13 +159,11 @@ void Fusion::removeExpr(Expr* expr) { } } - expr_set_.erase(expr); - - delete expr; + IrContainer::removeExpr(expr); } void Fusion::removeVal(Val* val) { - assertInFusion(val, "Cannot remove val "); + assertInContainer(val, "Cannot remove val "); TORCH_CHECK( !val->isFusionInput(), @@ -213,26 +176,26 @@ void Fusion::removeVal(Val* val) { if (orig != nullptr) removeExpr(val->definition()); - for (Expr* use : unordered_uses(val)) + for (Expr* use : unordered_uses(val)) { removeExpr(use); - - val_set_.erase(val); - - for (auto it = val_deque_.begin(); it != val_deque_.end(); it++) - if (*it == val) { - val_deque_.erase(it); - break; - } - - delete val; + } + IrContainer::removeVal(val); } void Fusion::addInput(Val* input) { - assertInFusion(input, "Cannot register input "); + assertInContainer(input, "Cannot register input "); + + TORCH_INTERNAL_ASSERT( + input->getDataType() != DataType::Index, + "Data type Index is a local compile time data type only, it cannot be used as an input in case it was generated from another kernel."); if (input->getValType().value() == ValType::TensorView) { auto tv = input->as(); tv->setMemoryType(MemoryType::Global); + } else if (input->getValType().value() == ValType::Scalar) { + TORCH_CHECK( + !input->isConst(), + "Immediate scalar value cannot be added as an input. It is not necessary to pass it as an input."); } inputs_.push_back(input); @@ -242,7 +205,20 @@ void Fusion::addInput(Val* input) { } void Fusion::addOutput(Val* output) { - assertInFusion(output, "Cannot register output "); + // We currently don't support explicitly outputing aliased inputs. This is + // because they are already marked as output for in-place update. 
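Fusion::copy above clones every Val through the IrCloner returned by IrContainer::copy and then rewires each clone's definition and uses through the old-to-new mapping. A simplified, standalone sketch of that two-pass clone-and-remap pattern (placeholder node type; the destination vector owns the raw clones):

#include <unordered_map>
#include <vector>

struct Node {
  Node* definition = nullptr;
  std::vector<Node*> uses;
};

// Pass 1: allocate a clone per source node. Pass 2: remap internal pointers so
// clones reference other clones rather than the source graph. Assumes every
// referenced definition/use is itself part of `src`.
std::unordered_map<const Node*, Node*> cloneGraph(
    const std::vector<Node*>& src,
    std::vector<Node*>& dst /* takes ownership of the clones */) {
  std::unordered_map<const Node*, Node*> old_to_new;
  for (const Node* n : src) {
    dst.push_back(new Node{});
    old_to_new[n] = dst.back();
  }
  for (const Node* n : src) {
    Node* copy = old_to_new[n];
    copy->definition =
        n->definition != nullptr ? old_to_new[n->definition] : nullptr;
    for (Node* use : n->uses) {
      copy->uses.push_back(old_to_new[use]);
    }
  }
  return old_to_new;
}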
It's tricky + // to allow marking them explicitly as real output, since that requires us to + // register/identify output not only by `Val*` pointer, but also by indices; + // it also requires us to magically arrange `outputs_` entries in proper order + // ^^^ this doesn't look intuitive on `outputs_` in fusion. + // I think we can solve this by marking addOutput on io_alias_ keys after + // fusion is fully defined. Tracking this in #1488 + // Apparently we can't do this neither at the time. I think segmentation + // unfortunately would call addOutput after we marked io_alias_ map. + // TORCH_CHECK(io_alias_.count(output) == 0, + // "can't register aliased output as real output"); + + assertInContainer(output, "Cannot register output "); if (output->getValType().value() == ValType::TensorView) { auto tv = output->as(); tv->setMemoryType(MemoryType::Global); @@ -285,7 +261,11 @@ void Fusion::replaceOutput(Val* output, Val* replacement) { TORCH_CHECK(find_output != outputs_.end(), "Unable to find output in Fusion"); if (find_output != outputs_.end()) { - *find_output = replacement; + std::replace_if( + outputs_.begin(), + outputs_.end(), + [&output](Val* v) { return v == output; }, + replacement); if (replacement->getValType().value() == ValType::TensorView) { replacement->setIsFusionOutput(true); @@ -307,27 +287,8 @@ void Fusion::replaceOutput(Val* output, Val* replacement) { } } -bool Fusion::inFusion(const Statement* stmt) const { - bool in_fusion = stmt->fusion() == this; - Statement* nonconst_stmt = const_cast(stmt); // NOLINT - - if (stmt->isExpr()) { - in_fusion &= expr_set_.find(nonconst_stmt->as()) != expr_set_.end(); - } - if (stmt->isVal()) { - in_fusion &= val_set_.find(nonconst_stmt->as()) != val_set_.end(); - } - - return in_fusion; -} - -void Fusion::assertInFusion(const Statement* stmt, const std::string& msg) - const { - TORCH_CHECK(inFusion(stmt), msg, " it was not found in the active fusion."); -} - std::vector Fusion::exprs() { - return ExprSort::getExprs(this); + return StmtSort::getExprs(this); } std::vector Fusion::inputsOf(Val* val) { @@ -341,12 +302,24 @@ void Fusion::validateInputs() { all_inputs.insert(input); } } + + std::unordered_set input_dims; + auto inp_tvs = ir_utils::filterByType(inputs()); + for (auto tv : inp_tvs) { + for (auto id : tv->getMaybeRFactorDomain()) { + input_dims.emplace(id->extent()); + } + } for (Val* input : all_inputs) { if (!input->isConstScalar()) { TORCH_CHECK( - hasInput(input) || inFusion(input), + input->isFusionInput() || + // TODO: Switch: + inContainer(input), + // to: input_dims.find(input) != input_dims.end(), + // https://github.com/csarofeen/pytorch/issues/1365 "Could not figure out how ", - input, + input->toString(), " is generated, however it was not specified as an input."); } } @@ -365,9 +338,13 @@ void Fusion::print() { std::cout << "}\n\n"; } -void Fusion::printKernel() { +void Fusion::printKernel(DataType index_type) { FUSER_PERF_SCOPE("Fusion::printKernel"); - std::cout << codegen::generateCudaKernel(GpuLower(this).kernel()); + TORCH_INTERNAL_ASSERT( + !this->isA(), + "Cannot \"print kernel\" of a kernel container. 
", + "This would require lowering during lowering."); + std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); } void Fusion::printMath(bool from_outputs_only) { @@ -394,7 +371,7 @@ void Fusion::printMath(bool from_outputs_only) { leaf_vals.push_back(val); } } - exprs_for_print = ExprSort::getExprs(this, leaf_vals); + exprs_for_print = StmtSort::getExprs(this, leaf_vals); } std::cout << "\n%kernel_math {\n"; @@ -412,33 +389,36 @@ void Fusion::printTransforms() { t_exprs.handle(this); } -StmtNameType Fusion::registerVal(Val* val) { +void Fusion::registerVal(Val* val) { + if (inContainer(val)) { + return; + } + if (val->fusion()) { - if (val->fusion() != this) { - TORCH_CHECK(false, val, " was not found in the active fusion."); - } - if (inFusion(val)) { - return val->name(); - } + TORCH_CHECK( + val->fusion() == this, val, " was not found in the active fusion."); } - val_set_.emplace(val); - val_deque_.push_back(val); - return getValName(*(val->getValType())); + IrContainer::registerVal(val); } -StmtNameType Fusion::registerExpr(Expr* expr) { +void Fusion::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + if (expr->fusion()) { - if (expr->fusion() != this) { - TORCH_CHECK(false, expr, " was not found in the active fusion."); - } - if (inFusion(expr)) { - return expr->name(); - } + TORCH_CHECK( + expr->fusion() == this, expr, " was not found in the active fusion."); } + IrContainer::registerExpr(expr); + + bool has_tv = false; + for (Val* input : expr->inputs()) { - assertInFusion(input, "Input to expr is invalid, "); + has_tv = has_tv || input->isA(); + assertInContainer(input, "Input to expr is invalid, "); auto uses_copy = input->uses(); if (std::find(uses_copy.begin(), uses_copy.end(), expr) == uses_copy.end()) { @@ -447,34 +427,25 @@ StmtNameType Fusion::registerExpr(Expr* expr) { } } + // Kernel is the only container type that is non-ssa. This is mainly (maybe + // only) because of initialization expressions which would overwrite tensor + // view definitions. + bool is_ssa = !this->isA(); + for (Val* output : expr->outputs()) { - assertInFusion(output, "Output to expr is invalid, "); - if (output->definition() != nullptr) { + has_tv = has_tv || output->isA(); + assertInContainer(output, "Output to expr is invalid, "); + if (output->definition() != nullptr && is_ssa) { removeExpr(output->definition()); } - output->setDefinition(expr); + if (is_ssa || (!is_ssa && output->definition() == nullptr)) { + output->setDefinition(expr); + } } - expr_set_.emplace(expr); - - resetTvUses(); - return getExprName(); -} - -StmtNameType Fusion::registerStatement(Statement* stmt) { - if (inFusion(stmt)) - return stmt->name(); - - if (stmt->isVal()) { - return registerVal(stmt->as()); - } else if (stmt->isExpr()) { - return registerExpr(stmt->as()); + if (has_tv) { + resetTvUses(); } - - TORCH_INTERNAL_ASSERT( - false, - "Could not register statement as Fusion could not recognize its type."); - return kInvalidStmName; } void Fusion::resetTvUses() { @@ -484,8 +455,8 @@ void Fusion::resetTvUses() { // getExprs only uses definition, so even if we've modified uses already to // remove dead exprs, this could reinsert them. getExprs is also boundeds by // inputs as registered inputs will return nullptr as their definition. 
- const auto all_tvs = ir_utils::filterByType(val_set_); - const auto used_exprs = ExprSort::getExprs(this); + const auto all_tvs = ir_utils::filterByType(vals_); + const auto used_exprs = StmtSort::getExprs(this); for (auto tv : all_tvs) { tv->setUses({}); @@ -507,14 +478,6 @@ void Fusion::resetTvUses() { is_during_update_uses_ = false; } -const std::unordered_set& Fusion::vals() const noexcept { - return val_set_; -} - -const std::deque& Fusion::deterministic_vals() const noexcept { - return val_deque_; -} - std::vector Fusion::usedMathVals() { // Note that using fusion->inputs() as the argument for the first // parameter of getAllValsBetween does not grab all used vals as @@ -553,37 +516,15 @@ std::vector Fusion::usedMathVals() { return used_math_vals; } -const std::unordered_set& Fusion::unordered_exprs() const noexcept { - return expr_set_; -} - std::unordered_set Fusion::unordered_uses(Val* val) const { return std::unordered_set(val->uses().begin(), val->uses().end()); } Expr* Fusion::definition(const Val* val) const { - assertInFusion(val, "Cannot detect the definition of val, "); + assertInContainer(val, "Cannot detect the definition of val, "); return val->definition(); } -bool Fusion::hasInput(const Val* val) const { - assertInFusion(val, "Cannot check if val is an input, "); - return val->isFusionInput(); -} - -bool Fusion::hasOutput(const Val* val) const { - assertInFusion(val, "Cannot check if val is an output, "); - return val->isFusionOutput(); -} - -StmtNameType Fusion::getValName(ValType vtype) { - return val_type_name_map_[vtype]++; -} - -StmtNameType Fusion::getExprName() { - return expr_name_counter_++; -} - // Indicate to kernel to set itself up to generate random numbers bool Fusion::isStochastic() { for (auto expr : exprs()) @@ -593,28 +534,6 @@ bool Fusion::isStochastic() { return false; } -bool Fusion::hasReduction() { - FUSER_PERF_SCOPE("Fusion::hasReduction"); - - for (auto expr : exprs()) - for (auto out : expr->outputs()) - if (out->getValType() == ValType::TensorView) - if (out->as()->hasReduction()) - return true; - - return false; -} - -bool Fusion::hasWelford() { - FUSER_PERF_SCOPE("Fusion::hasWelford"); - for (auto expr : exprs()) { - if (expr->isA()) { - return true; - } - } - return false; -} - std::vector Fusion::getTerminatingOutputs() { FUSER_PERF_SCOPE("getTerminatingOutputs"); @@ -682,6 +601,33 @@ bool Fusion::isAliasCompatible(Val* left, Val* right) { } void Fusion::aliasOutputToInput(Val* output, Val* input) { + // Because we could cast output when input is cast. 
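resetTvUses above rebuilds use lists purely from definitions: every tensor's uses are cleared, then each live expression re-registers itself as a use of its inputs. A standalone sketch of that rebuild pass (placeholder types):

#include <vector>

struct Expr;
struct Tensor {
  std::vector<Expr*> uses;
};
struct Expr {
  std::vector<Tensor*> inputs;
  std::vector<Tensor*> outputs;
};

// Wipe every use list, then walk the live expressions and re-add each
// expression as a use of its inputs.
void resetUses(std::vector<Tensor*>& all_tvs, std::vector<Expr*>& live_exprs) {
  for (Tensor* tv : all_tvs) {
    tv->uses.clear();
  }
  for (Expr* e : live_exprs) {
    for (Tensor* in : e->inputs) {
      in->uses.push_back(e);
    }
  }
}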
+ TORCH_INTERNAL_ASSERT( + !output->isFusionOutput(), + "Do NOT add aliased output to fusion output outside of `aliasOutputToInput"); + + if (!input->isFusionInput()) { + auto input_expr = input->definition(); + // TORCH_INTERNAL_ASSERT(input_def.etype() == ExprType::UnaryOp, "expected + // unary op for aliased input"); + TORCH_INTERNAL_ASSERT( + input_expr->isA(), "expected unary op for aliased input"); + auto input_uop = input_expr->as(); + TORCH_INTERNAL_ASSERT( + input_uop->getUnaryOpType() == UnaryOpType::Cast, + "expected aliased input to be output of cast op"); + input = input_uop->in(); + } + TORCH_INTERNAL_ASSERT( + input->getDataType().has_value() && output->getDataType().has_value(), + "requires DataType to be available for aliased output to input"); + + if (input->getDataType().value() != output->getDataType().value()) { + output = castOp(input->getDataType().value(), output); + } + // TODO: output should be marked at the end of fusion definition #1488 + addOutput(output); + TORCH_INTERNAL_ASSERT( isAliasCompatible(input, output), "The input and output values are not alias-compatible."); diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index c892bd8171c8..d67d0e2fea9a 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -1,10 +1,11 @@ #pragma once #include +#include #include -#include #include +#include #include #include @@ -65,18 +66,19 @@ class TORCH_CUDA_CU_API FusionGuard { ~FusionGuard(); static Fusion* getCurFusion(); + static void setCurFusion(Fusion* fusion); }; //! Fusion is mutable but unique. Nodes cannot be copied in any way from one //! Fusion to another. If anything like that is desired, it would require -//! duplicating all associated values and exprs. Fusion is considered to SSA, +//! duplicating all associated values and exprs. Fusion is considered to be SSA, //! though this could also change in the future if there is a good reason to do //! so. //! //! The Fusion owns the whole IR graph (Vals and Exprs) //! // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API Fusion final { +class TORCH_CUDA_CU_API Fusion : public IrContainer { typedef std::unordered_map> PermutationMap; public: @@ -96,45 +98,30 @@ class TORCH_CUDA_CU_API Fusion final { //! Break dependency chains associated with Expr, remove references to expr //! delete expr - void removeExpr(Expr* expr); + void removeExpr(Expr* expr) override; //! Completely remove val from the fusion, break all dependencies associated //! with it - void removeVal(Val* val); + void removeVal(Val* val) override; //! Register input as an input of the fusion - // TODO: Rename to register void addInput(Val* input); //! Register output as an output of the fusion - // TODO: Rename to register void addOutput(Val* output); //! Register output as an output of the fusion - // TODO: Rename to register void addOutput(WelfordResult& output); //! Deregister input as an input of the fusion - // TODO: Rename to register void removeInput(Val* input); //! Deregister output as an output of the fusion - // TODO: Rename to register void removeOutput(Val* output); //! Replace output with another value void replaceOutput(Val* output, Val* replacement); - //! Clear Expr's from TV uses that are not required to produce outputs from - //! inputs - void resetTvUses(); - - //! Check if stmt is properly registered with this fusion - bool inFusion(const Statement* stmt) const; - - //! 
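The new aliasOutputToInput logic above unwraps a cast on the aliased input and, if the data types still differ, casts the output to match before registering it. A simplified standalone sketch of that reconciliation (placeholder types; castTo is a hypothetical helper, not the nvfuser castOp):

enum class DType { Float, Half };

struct Value {
  DType dtype;
  Value* cast_source = nullptr; // non-null if this value is a cast of another
};

// Hypothetical cast insertion: produce a new value of the requested dtype.
Value* castTo(DType dtype, Value* v) {
  return new Value{dtype, v};
}

// Returns the value that should actually be registered as the fusion output.
Value* reconcileAliasPair(Value*& input, Value* output) {
  if (input->cast_source != nullptr) {
    input = input->cast_source; // alias against the pre-cast input value
  }
  if (input->dtype != output->dtype) {
    output = castTo(input->dtype, output); // match the (unwrapped) input dtype
  }
  return output;
}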
Throw an error if stmt is not in this fusion - void assertInFusion(const Statement* stmt, const std::string& msg = "") const; - //! Assert that all leaves found from outputs are registered as an input void validateInputs(); @@ -149,18 +136,7 @@ class TORCH_CUDA_CU_API Fusion final { void printTransforms(); //! Lower the fusion and print a kernel - void printKernel(); - - //! Register the Val with this fusion - StmtNameType registerVal(Val* val); - - //! Register expr with this fusion. - //! When we register an expression, we want to update the dependency tracking - //! of Vals. We add expr to our general expr_set_, - StmtNameType registerExpr(Expr* expr); - - //! Register stmt with this fusion - StmtNameType registerStatement(Statement* stmt); + void printKernel(DataType index_type = DataType::Int); //! Return a list of topologically sorted expressions. This only includes //! exprs required to genereate registered outputs. @@ -169,12 +145,6 @@ class TORCH_CUDA_CU_API Fusion final { //! Return a vector of fusion inputs that feed this Val std::vector inputsOf(Val* val); - //! Return the set of Vals registered with this fusion - const std::unordered_set& vals() const noexcept; - - //! Return in insertion order - const std::deque& deterministic_vals() const noexcept; - //! Return all Vals in math expressions that cannot be eliminated. //! //! It is generally equivalent to vals that are used to generate @@ -183,11 +153,6 @@ class TORCH_CUDA_CU_API Fusion final { //! also included as they must show up in the final code. std::vector usedMathVals(); - //! Return the set of Exprs registered with this fusion. Warning: This will - //! return exprs outside inputs/outputs, so can be unsafe for use with - //! segmented fusions. - const std::unordered_set& unordered_exprs() const noexcept; - //! Return all Exprs that use val std::unordered_set unordered_uses(Val* val) const; @@ -197,12 +162,6 @@ class TORCH_CUDA_CU_API Fusion final { //! Indicate to kernel to set itself up to generate random numbers bool isStochastic(); - //! Indicate that the fusion contains reduction operations - bool hasReduction(); - - //! Indicate that the fusion contains welford operations - bool hasWelford(); - //! Run fusion segmentation algorithm to create a segmented fusion std::unique_ptr segment( const at::ArrayRef& inputs); @@ -217,9 +176,6 @@ class TORCH_CUDA_CU_API Fusion final { std::vector getTerminatingOutputs(); - bool hasInput(const Val* val) const; - bool hasOutput(const Val* val) const; - // Aliasing output to input value, this is a WAR to allow inplace update on // input tensor. // Note: this is not always safe and should be used with extra caution. @@ -262,36 +218,40 @@ class TORCH_CUDA_CU_API Fusion final { return is_during_update_uses_; } + const auto& ioAlias() const { + return io_alias_; + } + protected: friend SegmentCandidateFinder; friend SegmentedFusion; friend class TranslateApplicableWelford; + friend Val; static IrCloner copy(const Fusion* from, Fusion* to); - private: - // Return an int that monotonically increases for each val/expr, some are - // explicitly incremented by type. - StmtNameType getValName(ValType vtype); - StmtNameType getExprName(); + //! Register the Val with this fusion + virtual void registerVal(Val* val) override; + + //! Register expr with this fusion. + //! When we register an expression, we want to update the dependency tracking + //! of Vals. If this container is a not a Kernel, it will remove previous + //! definitions of outputs and register this Expr as the definition. 
Otherwise + //! will update definition if not previously set, but will not remove old + //! definitions. + virtual void registerExpr(Expr* expr) override; + //! Clear Expr's from TV uses that are not required to produce outputs from + //! inputs. Only other place this is used (other than Fusion) is in + //! Val::uses() + void resetTvUses(); + + private: // Determine if the two values are compatible for aliasing // Same DataType, ValType, and number of dimensions bool isAliasCompatible(Val* left, Val* right); private: - // Sets of all Vals/Exprs registered with this fusion - // (val_deque_ is not owning the objects) - std::unordered_set val_set_; - std::deque val_deque_; - std::unordered_set expr_set_; - - // Values names counters - std::unordered_map val_type_name_map_; - - // Expression names counter - StmtNameType expr_name_counter_ = 0; - // Fusion inputs and outputs std::vector inputs_; std::vector outputs_; diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp index 9ff257808141..1138af0ca363 100644 --- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -322,7 +323,7 @@ void SegmentedFusion::draw() { for (auto group : groups()) { for (auto expr : group->exprs()) { - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { expr_color_map[expr] = group_index; } } @@ -559,7 +560,10 @@ std::vector groupExprPrintSorting(const std::vector& exprs) { std::unordered_set exprs_to_print_set(exprs.begin(), exprs.end()); std::unordered_set exprs_visited; std::vector sorted_list; - while (sorted_list.size() != exprs_to_print.size()) { + while (!std::all_of( + exprs_to_print.begin(), + exprs_to_print.end(), + [&exprs_visited](auto expr) { return exprs_visited.count(expr); })) { bool expr_added_to_sorted_list = false; for (auto expr : exprs_to_print) { if (!exprs_visited.count(expr)) { @@ -652,15 +656,15 @@ TensorView* castIntermediateValueInCompleteFusion( // Keep broadcast axes and remove reduction axes size_t i = 0; auto no_reduction_root_domain = - TensorDomain::noReductions(original_tv->getRootDomain()); + TensorDomain::noReductions(original_tv->getMaybeRFactorDomain()); std::vector new_root_domain(no_reduction_root_domain.size()); for (const auto& dom : no_reduction_root_domain) { - new_root_domain[i++] = dom->clone(); + new_root_domain[i++] = dom->cloneWithoutRFactor(); } // Create the actual domain and tv. - return new TensorView( - new TensorDomain( + return IrBuilder::create( + IrBuilder::create( new_root_domain, std::vector(new_root_domain.size(), true)), data_type); }; @@ -680,8 +684,8 @@ TensorView* castIntermediateValueInCompleteFusion( } // Insert the cast ops. - new UnaryOp(UnaryOpType::Cast, half_precision_tv, original_tv); - new UnaryOp(UnaryOpType::Cast, fp32_tv, half_precision_tv); + IrBuilder::create(UnaryOpType::Cast, half_precision_tv, original_tv); + IrBuilder::create(UnaryOpType::Cast, fp32_tv, half_precision_tv); // Return the new tv to replace original tv with // on the segmented edges. @@ -721,7 +725,7 @@ void SegmentedFusion::finalize() { // \ -> half2float -> other uses in group // The conversion back and forth from half precision can hurt numerics. // Collect expressions that use the edge value of concern within the from - // group to avoid replacing with the casted tensor. + // group to avoid replacing with the cast tensor. 
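The reworked loop condition in groupExprPrintSorting above terminates once std::all_of reports that every expression has been visited. A generic standalone sketch of that "emit whatever is ready until everything is visited" pattern (assumes the dependency graph is acyclic, otherwise the outer loop would not terminate):

#include <algorithm>
#include <unordered_set>
#include <vector>

struct Item {
  std::vector<Item*> deps;
};

std::vector<Item*> sortByDeps(const std::vector<Item*>& items) {
  std::unordered_set<Item*> visited;
  std::vector<Item*> sorted;
  while (!std::all_of(items.begin(), items.end(), [&](Item* it) {
    return visited.count(it) != 0;
  })) {
    for (Item* it : items) {
      if (visited.count(it) != 0) {
        continue;
      }
      // Ready once every dependency has already been emitted.
      const bool ready = std::all_of(
          it->deps.begin(), it->deps.end(), [&](Item* d) {
            return visited.count(d) != 0;
          });
      if (ready) {
        visited.insert(it);
        sorted.push_back(it);
      }
    }
  }
  return sorted;
}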
std::unordered_set uses_in_from_group; // All expressions in the from group of the edge @@ -1125,6 +1129,7 @@ std::ostream& operator<<( return group_order.at(edge_a->from) < group_order.at(edge_b->from); }); + os << "Segmented_Fusion Dump: -- fusion segments:\n"; os << "Segmented_Fusion{ \n"; os << "groups: \n"; for (const auto g : sorted_groups_to_print) { @@ -1143,6 +1148,9 @@ std::ostream& operator<<( } void SegmentedFusion::print() const { + std::cout << "Segmented_Fusion Dump: -- Re-written complete fusion:{\n"; + completeFusion()->printMath(); + std::cout << "} // {Re-written complete fusion}\n"; std::cout << this << "\n"; } @@ -1170,14 +1178,24 @@ std::unique_ptr SegmentedFusion::makeFusion(SegmentedGroup* sg) { fusion_segment->removeOutput(out); } + std::vector view_tvs; for (auto inp : getAllInputs(sg)) { - fusion_segment->addInput(complete_to_segment_map.clone(inp)); + auto clone_tv = complete_to_segment_map.clone(inp); + fusion_segment->addInput(clone_tv); + if (inp->isDefinitionType(ExprType::ViewOp)) { + TORCH_INTERNAL_ASSERT(clone_tv != nullptr && clone_tv->isA()); + view_tvs.push_back(clone_tv->as()); + } } for (auto out : getAllOutputs(sg)) { fusion_segment->addOutput(complete_to_segment_map.clone(out)); } + for (auto tv : view_tvs) { + tv->convertRfactorToRootDomain(); + } + return fusion_segment; } @@ -1570,6 +1588,8 @@ c10::optional tryMerge( SegmentedGroup* b = nullptr) { FusionSegmentGuard fsg(fusion, getAllInputs(a, b), getAllOutputs(a, b)); + scheduler_debug_utils::canScheduleMessage( + "\n**Segmenter** Considering fusion:\n", fusion); return SchedulerEntry::proposeHeuristics(fusion, runtime_info); } @@ -1581,6 +1601,8 @@ c10::optional tryMerge( fusion, allInputsIfTrueElseOutputs(segmented_groups, true), allInputsIfTrueElseOutputs(segmented_groups, false)); + scheduler_debug_utils::canScheduleMessage( + "\n**Segmenter** Considering fusion:\n", fusion); return SchedulerEntry::proposeHeuristics(fusion, runtime_info); } @@ -1740,9 +1762,10 @@ TranslateApplicableWelford::TranslateApplicableWelford( Fusion* fusion, const at::ArrayRef& runtime_inputs) : runtime_inputs_(runtime_inputs) { + auto exprs = fusion->exprs(); std::vector orignal_welfords( - ir_utils::filterByType(fusion->unordered_exprs()).begin(), - ir_utils::filterByType(fusion->unordered_exprs()).end()); + ir_utils::filterByType(exprs).begin(), + ir_utils::filterByType(exprs).end()); if (wouldTranslateToPersistent(orignal_welfords)) { for (auto welford : orignal_welfords) { @@ -1829,6 +1852,14 @@ bool TranslateApplicableWelford::wouldTranslateToPersistent( [&original_to_test_map](auto welford) { return original_to_test_map.clone(welford); }); + // Copied welfords will be invalidated on translation, but Vals will be + // reused, keep a reference to them. + std::vector welford_avgs; + std::vector welford_vars; + for (auto welford : copied_welfords) { + welford_avgs.push_back(welford->outAvg()); + welford_vars.push_back(welford->outVar()); + } // Translate the welford ops for (auto welford_to_translate : copied_welfords) { @@ -1860,6 +1891,21 @@ bool TranslateApplicableWelford::wouldTranslateToPersistent( return original_to_test_map.clone(out); }); + // If only average is used from welford, we should still translate, but we + // might not detect persistence if variance isn't actually used/marked as an + // output in the test. 
+ for (auto outs_i : c10::irange(welford_avgs.size())) { + auto avg = welford_avgs[outs_i]; + auto var = welford_vars[outs_i]; + if (avg->uses().empty()) { + test_group_outputs_.push_back(avg); + } + + if (var->uses().empty()) { + test_group_outputs_.push_back(var); + } + } + // Temporarily localize test copy around // the group boundary FusionSegmentGuard fsg( @@ -1891,29 +1937,40 @@ void TranslateApplicableWelford::translateSingleWelford(WelfordOp* welford) { auto out_N = welford->outN()->as(); fusion->removeExpr(welford); + // Not safe to use welford anymore + welford = nullptr; // Create normalization based welford graph // largely taken from batchnorm cpp benchmark - auto& in_root = in_val->getRootDomain(); - auto& out_root = out_avg->getRootDomain(); + const auto& in_root = + TensorDomain::noReductions(in_val->getMaybeRFactorDomain()); + const auto& out_root = out_avg->getRootDomain(); std::vector red_axes; + TORCH_INTERNAL_ASSERT( + in_root.size() == out_root.size(), + "Invalid root domains of Welford input and output.", + " Input: ", + ir_utils::toString(in_root), + ". Output: ", + ir_utils::toString(out_root)); + // Create scalar version of the feature element // counting. - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(1); std::vector broadcast_mask(in_root.size(), false); for (const auto i : c10::irange(in_root.size())) { - if (out_root[i]->isReduction()) { + if (out_root.at(i)->isReduction()) { red_axes.push_back(i); broadcast_mask[i] = true; - num_features = mul(num_features, out_root[i]->extent()); + num_features = mul(num_features, out_root.at(i)->extent()); } } // Build a normalization expression group that is // equivalent to a welford operation. auto x_sum = sum(in_val, red_axes); - new BinaryOp(BinaryOpType::Div, out_avg, x_sum, num_features); + IrBuilder::create(BinaryOpType::Div, out_avg, x_sum, num_features); // welford.avg may be broadcast. Reuse it if found. TensorView* x_avg_bcast = nullptr; for (auto& use_expr : out_avg->uses()) { @@ -1949,8 +2006,12 @@ void TranslateApplicableWelford::translateSingleWelford(WelfordOp* welford) { } auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); - new ReductionOp(BinaryOpType::Add, new Double(0.0), out_var, x_mean_sub_pow); - new UnaryOp(UnaryOpType::Set, out_N, num_features); + IrBuilder::create( + BinaryOpType::Add, + IrBuilder::create(0.0), + out_var, + x_mean_sub_pow); + IrBuilder::create(UnaryOpType::Set, out_N, num_features); // out_avg, out_N are now outputs of a pointwise ops and we // need to clear out its reduction domains. 
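Numerically, the translated graph above computes the same triple a WelfordOp produces: the mean, the sum of squared deviations from the mean (assuming, as the translation suggests, the variance output is kept unnormalized), and the element count. A plain standalone sketch of that two-pass equivalent over a flat buffer:

#include <tuple>
#include <vector>

// Returns (avg, var_sum, n). Assumes x is non-empty.
std::tuple<double, double, double> twoPassWelford(const std::vector<double>& x) {
  const double n = static_cast<double>(x.size()); // "num_features"
  double sum = 0.0;
  for (double v : x) {
    sum += v;
  }
  const double avg = sum / n; // BinaryOpType::Div in the translated graph
  double var_sum = 0.0;
  for (double v : x) {
    const double d = v - avg; // x - broadcast(avg)
    var_sum += d * d;         // ReductionOp(Add) over (x - avg)^2
  }
  return {avg, var_sum, n};
}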
@@ -2584,7 +2645,8 @@ void SegmentCandidateFinder::findSegments() { while (!to_visit.empty()) { auto expr = to_visit.front(); to_visit.pop_front(); - if (expr->getExprType().value() != ExprType::UnaryOp) { + if (expr->getExprType().value() != ExprType::UnaryOp || + expr->output(0)->isFusionOutput()) { continue; } @@ -2687,14 +2749,20 @@ void SegmentCandidateFinder::findSegments() { } } + auto reduction_ops = ir_utils::getReductionOps( + segmented_fusion_->completeFusion(), true /* ignore_trivial */); + auto welford_ops = ir_utils::filterByType(reduction_ops); + if (options_.run_translate_welford && - segmented_fusion_->completeFusion()->hasWelford()) { + (welford_ops.begin() != welford_ops.end())) { TranslateApplicableWelford::run(segmented_fusion_.get(), runtime_inputs_); } for (auto group : groups()) { - // Set heuristics in case single reduction kernels were left out - group->setHeuristic(deriveHeuristic(group)); + if (!group->outputs().empty()) { + // Set heuristics in case single reduction kernels were left out + group->setHeuristic(deriveHeuristic(group)); + } } // Remove all scalar edges since they do not represent actual @@ -2764,12 +2832,12 @@ void SegmentCandidateFinder::findSegments() { if (options_.run_final_merge) { // TODO: consider interleaving herrmman merge and bruteforce merge, as - // bruteforce merge can introduce - // opportunities for more herrmann merge + // bruteforce merge can introduce opportunities for more herrmann merge finalMerge(); } finalize(); + if (isDebugDumpEnabled(DebugDumpOption::FusionSegmentsDrawing)) { segmented_fusion_->draw(); } @@ -2913,7 +2981,7 @@ void SegmentCandidateFinder::resolveInputsInGroup(SegmentedGroup* group) { group->input_vals = IterVisitor::getInputsTo(group->inputs()); // Grab all expressions needed to produce to_visit - auto input_exprs = ExprSort::getExprs(completeFusion(), to_visit); + auto input_exprs = StmtSort::getExprs(completeFusion(), to_visit); // Insert those expressions at the beginning of the group group->exprs_.insert( @@ -2978,6 +3046,7 @@ void SegmentCandidateFinder::finalize() { // Finalize each group, fill in the missing inputs, i.e. tensor dims. for (auto g : groups()) { + g->setHeuristic(deriveHeuristic(g)); g->finalize(); } } @@ -3102,8 +3171,7 @@ void SegmentedFusion::annotateFP16IntermediateTensors() { } } -TORCH_CUDA_CU_API std::string toString( - const SegmentCandidateFinderOptions& segment_options) { +std::string toString(const SegmentCandidateFinderOptions& segment_options) { std::stringstream ss; ss << "segmentation phases {\n"; if (segment_options.run_combine_reductions) { diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h b/torch/csrc/jit/codegen/cuda/fusion_segmenter.h index 61fa966348e3..d9c4dfbd86af 100644 --- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h +++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.h @@ -129,7 +129,7 @@ class TORCH_CUDA_CU_API SegmentedGroup { int group_id_ = -1; //! The scheduler to use for compiling this group - ScheduleHeuristic heuristic_ = ScheduleHeuristic::PointWise; + ScheduleHeuristic heuristic_ = ScheduleHeuristic::None; //! Exprs that make up the group std::vector exprs_; @@ -275,7 +275,7 @@ class TORCH_CUDA_CU_API SegmentedFusion { } //! 
Returns the original un-segmented fusion - Fusion* completeFusion() { + Fusion* completeFusion() const { return complete_fusion_.get(); } @@ -288,11 +288,11 @@ class TORCH_CUDA_CU_API SegmentedFusion { } Val* findAlias(Val* val) const { - Val* alias_val = nullptr; - if (complete_fusion_->io_alias_.count(val) != 0) { - alias_val = complete_fusion_->io_alias_[val]; + auto alias_it = complete_fusion_->ioAlias().find(val); + if (alias_it != complete_fusion_->ioAlias().end()) { + return alias_it->second; } - return alias_val; + return nullptr; } //! Make a clone of the group and convert to fusion @@ -442,7 +442,8 @@ class TORCH_CUDA_CU_API SegmentCandidateFinder { SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) { auto fusion_copy = std::make_unique(*fusion); if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) { - std::cout << "Segment the fusion: " << std::endl; + std::cout << "Segment the fusion (Original Fusion Un-modified): " + << std::endl; fusion_copy->printMath(); } SegmentCandidateFinder scf(std::move(fusion_copy), inputs, options); @@ -456,7 +457,8 @@ class TORCH_CUDA_CU_API SegmentCandidateFinder { SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) { SegmentCandidateFinder scf(std::move(fusion), inputs, options); if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) { - std::cout << "Segment the fusion: " << std::endl; + std::cout << "Segment the fusion (Original Fusion Un-modified): " + << std::endl; scf.completeFusion()->printMath(); } return std::move(scf.segmented_fusion_); @@ -606,6 +608,7 @@ class TORCH_CUDA_CU_API SegmentCandidateFinder { const at::ArrayRef& runtime_inputs_; }; +// TODO: Make as member functions on classes instead of global scope TORCH_CUDA_CU_API std::string toString(const SegmentedGroup* group); TORCH_CUDA_CU_API std::string toString(const SegmentedEdge* edge); TORCH_CUDA_CU_API std::string toString(const SegmentedFusion* segmented_fusion); diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp index 08d3e89d21c5..c6ca212ccc29 100644 --- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -17,8 +19,10 @@ #include #include #include +#include #include +#include #include #include @@ -46,6 +50,13 @@ bool usedOnlyInDtype(Value* v) { Value* broadcastSizes(at::ArrayRef sizes) { AT_ASSERT(!sizes.empty()); Graph* graph = sizes[0]->owningGraph(); + Node* insertion_point = sizes[0]->node()->next(); + for (size_t i = 1; i < sizes.size(); i++) { + if (insertion_point->isBefore(sizes[i]->node()->next())) { + insertion_point = sizes[i]->node()->next(); + } + } + WithInsertPoint guard(insertion_point); Node* broadcast_n = graph->insertNode(graph->create(prim::BroadcastSizes, sizes)); broadcast_n->output()->setType(ListType::ofInts()); @@ -66,9 +77,13 @@ Value* createConditionalConstant(Node* profile_ivalue) { auto int_list = profile_ivalue->is(Symbol::attr("profiled_bool_list")); std::vector bool_list(int_list.begin(), int_list.end()); val = IValue(bool_list); - } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_size"))) { + } else if (profile_ivalue->hasAttribute( + Symbol::attr("profiled_reduction_size"))) { // int[] - val = IValue(profile_ivalue->is(Symbol::attr("profiled_size"))); + val = IValue(profile_ivalue->is(Symbol::attr("profiled_reduction_size"))); + } else if 
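findAlias above switches from count() plus operator[] to a single find(): find() does one lookup, never inserts, and works against the const ioAlias() accessor, whereas operator[] requires a non-const map and inserts a default value for a missing key. A tiny standalone illustration of the same lookup shape:

#include <string>
#include <unordered_map>

// Look up an alias without mutating the map; returns nullptr when absent.
const std::string* findAlias(
    const std::unordered_map<std::string, std::string>& io_alias,
    const std::string& key) {
  auto it = io_alias.find(key);
  if (it != io_alias.end()) {
    return &it->second;
  }
  return nullptr;
}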
(profile_ivalue->hasAttribute(Symbol::attr("profiled_view_size"))) { + // int[] + val = IValue(profile_ivalue->is(Symbol::attr("profiled_view_size"))); } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_bool"))) { // bool val = IValue( @@ -77,6 +92,13 @@ Value* createConditionalConstant(Node* profile_ivalue) { // int val = IValue( static_cast(profile_ivalue->i(Symbol::attr("profiled_int")))); + } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_str"))) { + // str + val = IValue(static_cast( + profile_ivalue->s(Symbol::attr("profiled_str")))); + } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_ival"))) { + // ival + val = IValue(profile_ivalue->ival(Symbol::attr("profiled_ival"))); } else { GRAPH_DEBUG("profile_ivalue: ", *profile_ivalue); TORCH_WARN( @@ -97,6 +119,7 @@ struct CudaGraphFuser { std::unique_ptr aliasDb_; std::shared_ptr graph_; Symbol kind_ = prim::CudaFusionGroup; + std::unordered_map fusion_value_to_runtime_shape_; // nvrtc has a limit on the number of arguments allowed in a CUDA kernel. // The specific limit is a function of constant memory size, amount available @@ -556,7 +579,7 @@ struct CudaGraphFuser { Value* producer_for_chunk = *it; size_t producer_index = it - chunk->inputs().begin(); - // all uses of the chunk must be in in this consumer + // all uses of the chunk must be in this consumer for (auto s : chunk->outputs()) { for (auto u : s->uses()) { if (u.user != consumer) @@ -644,7 +667,7 @@ struct CudaGraphFuser { auto input_c_strides = input_strides.concrete_sizes().value(); auto output_c_sizes = producer_output_sizes.concrete_sizes().value(); int output_index = int(output_c_sizes.size()) - 1; - strides.resize(output_index); + strides.resize(output_index + 1); AT_ASSERT(output_index >= int(input_c_sizes.size()) - 1); for (int input_index = int(input_c_sizes.size()) - 1; input_index >= 0; input_index--, output_index--) { @@ -760,9 +783,11 @@ struct CudaGraphFuser { // longer valid so we rescan the new FusionGroup for more fusions... return std::make_pair(fusion_group.value()->reverseIterator(), true); } - // horizontal fusion only applies on tensor inputs + + // horizontal fusion only applies on non-scalar tensor inputs if (getHorizontalFusion() && - producer->type()->isSubtypeOf(*TensorType::get())) { + producer->type()->isSubtypeOf(*TensorType::get()) && + !is_cpu_scalar(*producer->type()->cast())) { // fusing nodes sharing inputs, this could save memory bandwidth by // reducing number of tensor read. for (const auto& u : producer->uses()) { @@ -834,6 +859,7 @@ struct CudaGraphFuser { // Builds up expressions that compute shapes of all intermediates (and // outputs) of the fusion group, based on the sizes of inputs. You should run // DCE to remove those that you end up not using. 
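The strides.resize fix above is a plain off-by-one: output_index is the index of the last output dimension, so the vector needs output_index + 1 entries. A tiny self-contained check of that arithmetic:

#include <cassert>
#include <vector>

int main() {
  const int output_rank = 4;
  const int output_index = output_rank - 1; // index of the last dimension
  std::vector<long> strides;
  strides.resize(output_index + 1); // resize(output_index) would drop one slot
  assert(static_cast<int>(strides.size()) == output_rank);
  return 0;
}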
+ // TODO: Add shape support for view, reshape, unsqueeze, and squeeze std::unordered_map buildShapeExpressions(Node* fusion_group) { WithInsertPoint insert_guard{fusion_group->next()}; std::unordered_map shape_of; @@ -846,7 +872,9 @@ struct CudaGraphFuser { AT_ASSERT(inputs.size() == sinputs.size()); for (const auto i : c10::irange(inputs.size())) { if (inputs[i]->type()->isSubtypeOf(*TensorType::get())) { - shape_of[sinputs[i]] = graph->insert(aten::size, {inputs[i]}); + auto sinput_value = graph->insert(aten::size, {inputs[i]}); + shape_of[sinputs[i]] = sinput_value; + sinput_value->node()->moveBefore(fusion_group); } } @@ -865,6 +893,26 @@ struct CudaGraphFuser { } } + // Place all the shape expressions for intermediates in fusion + // before the CudaFusionGroup + graph->setInsertPoint(fusion_group); + + // hmmm, do I need to setInsertPoint... + const auto map_inputs = [&](Value* v) -> Value* { + // if constant ever has an input, it has to come from + // profile_ivalue dependency + if (v->node()->kind() == prim::Param && + fusion_group->input(v->offset())->node()->kind() == + prim::profile_ivalue) { + // we need to map it along profile_ivalue dependency + return fusion_group->input(v->offset()); + } else { + throw std::runtime_error( + std::string("unexpected input from node") + + v->node()->kind().toDisplayString()); + } + }; + for (Node* n : subgraph->nodes()) { // XXX: Use of shape_of.emplace is crucial to the output shape // optimization! @@ -900,7 +948,11 @@ struct CudaGraphFuser { // extended shape expression support to reduction operations // TODO: `aten::sum` is too flexible, we should restrict for a better // match - if (n->kind() == aten::sum) { + // TODO: Add python tests where we check for existing ops and their + // shape expression logic. + static std::unordered_set reduction_ops( + {aten::sum, aten::mean, aten::var, aten::std}); + if (reduction_ops.find(n->kind()) != reduction_ops.end()) { // TODO: expand support to wire non-constant inputs, this is currently // blocked by profiling executor not capable of profiling scalar inputs. TORCH_INTERNAL_ASSERT( @@ -908,21 +960,6 @@ struct CudaGraphFuser { n->input(2)->node()->kind() == prim::Constant, "only supports reduction axes and keepdim being constant"); - // hmmm, do I need to setInsertPoint... 
- const auto map_inputs = [&](Value* v) -> Value* { - // if constant ever has an input, it has to come from - // profile_ivalue dependency - if (v->node()->kind() == prim::Param && - fusion_group->input(v->offset())->node()->kind() == - prim::profile_ivalue) { - // we need to map it along profile_ivalue dependency - return fusion_group->input(v->offset()); - } else { - throw std::runtime_error( - std::string("unexpected input from node") + - v->node()->kind().toDisplayString()); - } - }; Node* in1_const = graph->createClone(n->input(1)->node(), map_inputs); graph->insertNode(in1_const); Node* in2_const = graph->createClone(n->input(2)->node(), map_inputs); @@ -996,6 +1033,57 @@ struct CudaGraphFuser { } continue; } + if (n->kind() == aten::native_dropout) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + shape_of.emplace(n->output(0), shape_of.at(n->input(0))); + shape_of.emplace(n->output(1), shape_of.at(n->input(0))); + continue; + } + if (n->kind() == prim::unsqueeze_copy) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant, + "only supports unsqueeze axes being constant"); + Node* dim_const = graph->createClone(n->input(1)->node(), map_inputs); + graph->insertNode(dim_const); + std::vector inputs = { + shape_of.at(n->input(0)), dim_const->output()}; + Node* size_node = graph->insertNode(graph->create( + Symbol::fromQualString("prim::infer_unsqueeze_size"), inputs, 1)); + Value* size = size_node->output(0); + size->setType(ListType::ofInts()); + shape_of.emplace(n->output(), size); + continue; + } + if (n->kind() == prim::squeeze_copy) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + TORCH_INTERNAL_ASSERT( + n->inputs().size() == 2 || n->inputs().size() == 1, + "prim::squeeze_copy expects one or two inputs"); + std::vector inputs = {shape_of.at(n->input(0))}; + + if (n->inputs().size() == 2) { + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant, + "only supports squeeze axes being constant"); + Node* dim_const = graph->createClone(n->input(1)->node(), map_inputs); + graph->insertNode(dim_const); + inputs.push_back(dim_const->output()); + } + Node* size_node = graph->insertNode(graph->create( + Symbol::fromQualString("prim::infer_squeeze_size"), inputs, 1)); + Value* size = size_node->output(0); + size->setType(ListType::ofInts()); + shape_of.emplace(n->output(), size); + continue; + } + auto tensor_inputs = filter(n->inputs(), [](Value* v) { return v->type()->isSubtypeOf(*TensorType::get()); }); @@ -1021,8 +1109,10 @@ struct CudaGraphFuser { // TODO: failure in buildShapeExpressions should not break fusion execution, // we can add a try/catch here to bailout from removeOutputsUsedOnlyInSize. GRAPH_DEBUG("before build shape expression: ", *graph_); - auto shape_of = buildShapeExpressions(fusion_group); + auto shape_map = buildShapeExpressions(fusion_group); + fusion_value_to_runtime_shape_.insert(shape_map.begin(), shape_map.end()); GRAPH_DEBUG("after build shape expression: ", *graph_); + auto outputs = fusion_group->outputs().vec(); auto soutputs = subgraph->outputs().vec(); // XXX: Iterating in this order is not only good for performance reasons! 
@@ -1031,12 +1121,12 @@ struct CudaGraphFuser { for (int64_t i = static_cast(outputs.size()) - 1; i >= 0; --i) { auto output = outputs[i]; auto soutput = soutputs[i]; - if (usedOnlyInDtypeAndSize(output) && shape_of.count(soutput) > 0) { + if (usedOnlyInDtypeAndSize(output) && shape_map.count(soutput) > 0) { bool has_dtype = usedInDtype(output); auto uses = output->uses(); for (Use u : uses) { if (u.user->matches("aten::size(Tensor self) -> int[]")) { - u.user->output()->replaceAllUsesWith(shape_of.at(soutput)); + u.user->output()->replaceAllUsesWith(shape_map.at(soutput)); u.user->destroy(); } else if (u.user->matches("prim::dtype(Tensor a) -> int")) { continue; @@ -1126,7 +1216,12 @@ struct CudaGraphFuser { for (Node* node : block_->nodes()) { for (Block* sub_block : node->blocks()) { - CudaGraphFuser(sub_block, graph_).run(); + CudaGraphFuser sub_block_cfg(sub_block, graph_); + sub_block_cfg.run(); + // Accumulate runtime shapes for all sub-blocks + fusion_value_to_runtime_shape_.insert( + sub_block_cfg.fusion_value_to_runtime_shape_.begin(), + sub_block_cfg.fusion_value_to_runtime_shape_.end()); } } } @@ -1282,6 +1377,55 @@ void PeepholeOptimizeShapeExpressions(Block* block) { } } +// view_sizes_runtime is the profiled-ivalue argument for view-size. +// view_sizes_constant_list is the constant list recorded during profiling runs. +Value* guardView( + Node* fusion, + std::unordered_map& fusion_value_to_runtime_size, + Node* versioning_if, + Node* view, + Value* view_sizes_runtime) { + // 1. Get self tensor sizes and view_sizes + auto self_value = view->inputs().front(); + auto self_type = self_value->type()->cast(); + auto self_sizes_constant_list = getTensorSizes(self_type); + + auto view_sizes_constant_list = + constant_as>(view->inputs().back()); + TORCH_INTERNAL_ASSERT(view_sizes_constant_list.has_value()); + + // 2. Get constraints for self tensor and view_sizes + auto constraints = analyzeViewConstraint( + self_sizes_constant_list, view_sizes_constant_list->vec()); + + // 3. Add constraints as constant to graph + auto self_tensor_constraint = fusion->owningGraph()->insertConstant( + IValue(constraints.original_constraint)); + self_tensor_constraint->node()->moveBefore(versioning_if); + auto view_sizes_constraint = + fusion->owningGraph()->insertConstant(IValue(constraints.new_constraint)); + view_sizes_constraint->node()->moveBefore(versioning_if); + + // 4. Create CudaFusionViewGuard using input tensor, profile_ivalue + // for view_sizes list, and constraints + TORCH_INTERNAL_ASSERT( + fusion_value_to_runtime_size.find(self_value) != + fusion_value_to_runtime_size.end(), + "Failed to find runtime size for fusion value:\t", + self_value->node()->kind().toDisplayString()); + Node* viewcheck_node = + fusion->owningGraph() + ->create( + c10::Symbol::fromQualString("prim::CudaFusionViewGuard"), + {fusion_value_to_runtime_size.at(self_value), + view_sizes_runtime, + self_tensor_constraint, + view_sizes_constraint}, + 1) + ->insertBefore(versioning_if); + return viewcheck_node->output(); +} + //! [ Note -- CudaFusionGuard implementation ] //! //! shamelessly copying code from NNC (tensorexpr_fuser) with very little @@ -1320,7 +1464,9 @@ void PeepholeOptimizeShapeExpressions(Block* block) { //! //! TODO: we also need to assert/check reduction axes and replace it with //! 
constants in `CudaFusionGroup` -void guardFusionGroup(Node* fusion) { +void guardFusionGroup( + Node* fusion, + std::unordered_map& fusion_value_to_runtime_size) { // Fixup types of the subgraph inputs std::vector guard_types; std::vector tensor_inputs_to_check; @@ -1371,10 +1517,12 @@ void guardFusionGroup(Node* fusion) { versioning_if->insertAfter(typecheck_node); + auto fusion_graph = fusion->g(attr::Subgraph); + std::vector check_flags = {}; + // Fill in the false block. It should contain the unoptimized // copy of the fused subgraph, unless we have conditional constants from // profiled_ivalue; - auto fusion_graph = fusion->g(attr::Subgraph); std::shared_ptr fb_graph; // resource holder; // Restore the dependency for constant introduced by profiled_ivalue within // the graph. @@ -1421,11 +1569,10 @@ void guardFusionGroup(Node* fusion) { // 2. REMOVE conditional constant dependency in fusion group size_t compensation = 0; - // get a constant false, which is used by `and` pattern later + // get a constant true, which is used by `and` pattern later auto const_true = fusion->owningGraph()->insertConstant(IValue(true)); const_true->node()->moveBefore(versioning_if); - std::vector check_flags = {}; for (const auto& original_offset : profiled_ivalue_indices) { size_t offset = original_offset - compensation; @@ -1453,7 +1600,7 @@ void guardFusionGroup(Node* fusion) { ->insertBefore(versioning_if) ->output(); } else if (fusion->input(offset)->node()->hasAttribute( - Symbol::attr("profiled_size"))) { + Symbol::attr("profiled_reduction_size"))) { // TODO(profile_size): check sizes here with special size comparison op // TORCH_INTERNAL_ASSERT(false, "not implemented yet"); ivalue_check = @@ -1464,6 +1611,40 @@ void guardFusionGroup(Node* fusion) { 1) ->insertBefore(versioning_if) ->output(); + } else if (fusion->input(offset)->node()->hasAttribute( + Symbol::attr("profiled_view_size"))) { + // TODO: Add support for dynamic split to view guard + + // Path from profile-ivalue to prim::view_copy operation + // profile-ivalue -> Constant -> CudaFusionGroup + // Get argument position in CudaFusionGroup + // Get argument in subgraph for CudaFusionGroup + // CudaFusionGroup argument -> Constant List -> prim::view_copy + auto subgraph_arg = fusion_graph->inputs()[offset]; + auto constant = subgraph_arg->uses().front().user->output(); + + TORCH_INTERNAL_ASSERT(!constant->uses().empty()); + auto view = constant->uses().front().user; + TORCH_INTERNAL_ASSERT( + view->kind() == prim::view_copy || + view->kind() == prim::reshape_copy); + + ivalue_check = guardView( + fusion, + fusion_value_to_runtime_size, + versioning_if, + view, + profiled_ival); + } else if (fusion->input(offset)->node()->hasAttribute( + Symbol::attr("profiled_ival"))) { + ivalue_check = + fusion->owningGraph() + ->create( + c10::Symbol::fromQualString("prim::CudaFusionIvalGuard"), + {profiled_ival, const_o}, + 1) + ->insertBefore(versioning_if) + ->output(); } else { ivalue_check = fusion->owningGraph() ->create(aten::eq, {profiled_ival, const_o}, 1) @@ -1491,22 +1672,24 @@ void guardFusionGroup(Node* fusion) { fusion_graph->eraseInput(offset); compensation++; } - - if (!check_flags.empty()) { - // attaching output from CudaFusionGuard to profile ivalue checks - check_flags.emplace_back(typecheck_result); - auto graph = fusion->owningGraph(); - auto bool_list_node = - graph->insertNode(graph->createList(BoolType::get(), check_flags)); - bool_list_node->moveBefore(versioning_if); - Value* bool_list = bool_list_node->output(); - // new 
typecheck_result - typecheck_result = graph->insert(aten::all, {bool_list}); - typecheck_result->node()->moveBefore(versioning_if); - } // update graph in fusion node fusion->g_(attr::Subgraph, fusion_graph); - } else { + } + + if (!check_flags.empty()) { + // attaching output from CudaFusionGuard to profile ivalue checks + check_flags.emplace_back(typecheck_result); + auto graph = fusion->owningGraph(); + auto bool_list_node = + graph->insertNode(graph->createList(BoolType::get(), check_flags)); + bool_list_node->moveBefore(versioning_if); + Value* bool_list = bool_list_node->output(); + // new typecheck_result + typecheck_result = graph->insert(aten::all, {bool_list}); + typecheck_result->node()->moveBefore(versioning_if); + } + + if (profiled_ivalue_indices.empty()) { WithInsertPoint guard(false_block->return_node()); const auto subgraph_outputs = insertGraph(*fusion->owningGraph(), *fusion_graph, fusion->inputs()); @@ -1532,11 +1715,13 @@ void guardFusionGroup(Node* fusion) { } } -void guardFusionGroups(Block* block) { +void guardFusionGroups( + Block* block, + std::unordered_map& fusion_value_to_runtime_size) { std::vector fusions; for (Node* n : block->nodes()) { for (Block* b : n->blocks()) { - guardFusionGroups(b); + guardFusionGroups(b, fusion_value_to_runtime_size); } if (n->kind() == prim::CudaFusionGroup) { fusions.push_back(n); @@ -1546,7 +1731,18 @@ void guardFusionGroups(Block* block) { // step 1: a. add prim::CudaFusionGuard and fallback logic // b. insert guard logic of profile_ivalue with if block // c. restore conditional constant to non-constant for fallback - guardFusionGroup(fusion); + guardFusionGroup(fusion, fusion_value_to_runtime_size); + } +} + +void dumpFusionGroups(std::shared_ptr& g) { + DepthFirstGraphNodeIterator it(g); + Node* n = nullptr; + GRAPH_DEBUG("Exporting all NVFuser fusions:"); + while ((n = it.next()) != nullptr) { + if (n->kind() == prim::FallbackGraph) { + GRAPH_EXPORT("", n->g(attr::Subgraph)); + } } } @@ -1840,23 +2036,6 @@ void ExtractProfileIValue(Node* profile_ivalue) { } } -void traverseProfileIValues( - Block* block, - const std::function& func) { - std::vector profile_ivalues; - for (Node* n : block->nodes()) { - for (Block* b : n->blocks()) { - traverseProfileIValues(b, func); - } - if (n->kind() == prim::profile_ivalue) { - profile_ivalues.push_back(n); - } - } - for (Node* profile_ivalue : profile_ivalues) { - func(profile_ivalue); - } -} - // break `linear` layer into `matmul` and `add_optional`. This allows us to fuse // the binary operation without supporting gemm. // Note that we are not breaking `linear` layer without bias. @@ -1866,7 +2045,7 @@ void decomposeLinearOps(Block* block) { for (Block* b : n->blocks()) { decomposeLinearOps(b); } - // only decompose `linear` layer with bias. 
+ // only decompose `linear` layer with bias if (n->kind() == aten::linear && !n->input(2)->type()->isSubtypeOf( static_cast(NoneType::get()))) { @@ -1881,16 +2060,30 @@ void decomposeLinearOps(Block* block) { auto matmul = graph->insertNode( graph->create(aten::matmul, {n->input(0), weight_t->output()}, 1)); auto input_tensor_type = n->input(0)->type()->cast(); + if (!input_tensor_type) { + TORCH_WARN_ONCE( + "linear input 0 is required to be tensor for linear decompose"); + continue; + } auto mat0_size = input_tensor_type->sizes().concrete_sizes(); auto mat1_size = n->input(1)->type()->cast()->sizes().concrete_sizes(); - // TODO: The assert is not necessary when we can handle matmul, right now we - // are splitting the linear between matmul & bias_add. Our fuser can only - // take the second half and we would need the size information. - TORCH_INTERNAL_ASSERT( - mat0_size.has_value() && mat1_size.has_value(), - "concrete shape for linear input & weight are required"); + // TODO: Continuing here is not necessary when we can handle matmul, right + // now we are splitting the linear between matmul & bias_add. Our fuser can + // only take the second half and we would need the size information. + if (!mat0_size.has_value() || !mat1_size.has_value()) { + TORCH_WARN_ONCE( + "concrete shape for linear input & weight are required to decompose into matmul + bias"); + continue; + } + + // only decompose for input with nDims >= 4. since lower rank linear eager + // is already fused + if (mat0_size->size() < 4) { + continue; + } + auto out_size = mat0_size.value(); TORCH_INTERNAL_ASSERT( mat1_size->size() == 2 || mat1_size->size() == 1, @@ -1914,6 +2107,101 @@ void decomposeLinearOps(Block* block) { } } +// Replace 'operation' with 'operation_copy' to guard alias operations. 
+// Supports View, Reshape, Squeeze, and Unsqueeze +void replaceAliasOpsWithCopy(std::shared_ptr& graph, Block* block) { + static std::unordered_map alias_to_copy_mapping; + // TODO: revert disabled aten::view + // ({{aten::view, prim::view_copy}, + // {aten::reshape, prim::reshape_copy}, + // {aten::squeeze, prim::squeeze_copy}, + // {aten::unsqueeze, prim::unsqueeze_copy}, + // {aten::flatten, prim::flatten_copy}}); + + std::vector maybe_safe_alias_nodes; + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + replaceAliasOpsWithCopy(graph, b); + } + if (alias_to_copy_mapping.find(n->kind()) != alias_to_copy_mapping.end()) { + maybe_safe_alias_nodes.push_back(n); + } + } + + auto alias_db = std::make_unique(graph); + + auto safeToChangeAliasToCopy = [&alias_db](Node* n) { + return !alias_db->hasWriters(n->input(0)) && + !alias_db->hasWriters(n->output(0)); + }; + + auto replaceAliasWithCopy = [&graph, &alias_db](Node* n) { + WithInsertPoint guard(n); + auto copy_op = graph->insertNode( + graph->create(alias_to_copy_mapping[n->kind()], n->inputs(), 1)); + copy_op->output()->setType(n->output(0)->type()); + + // adding newly created value into alias_db; + alias_db->createValue(copy_op->output()); + + n->output()->replaceAllUsesWith(copy_op->output()); + n->destroy(); + }; + + for (Node* n : maybe_safe_alias_nodes) { + if (!safeToChangeAliasToCopy(n)) { + continue; + } + replaceAliasWithCopy(n); + } +} + +// Revert all 'operation_copy' with 'operation' except in CudaFusionGroup +// e.g., Any non-fused alias operation including within the prim::FallbackGraph +// Supports View, Reshape, Squeeze, and Unsqueeze +void revertAliasCopyOps(std::shared_ptr& graph, Block* block) { + static std::unordered_map copy_to_alias_mapping; + // TODO: revert disabled aten::view + // ({{prim::view_copy, aten::view}, + // {prim::flatten_copy, aten::flatten}, + // {prim::reshape_copy, aten::reshape}, + // {prim::squeeze_copy, aten::squeeze}, + // {prim::unsqueeze_copy, aten::unsqueeze}}); + + std::vector alias_copy_ops; + for (Node* n : block->nodes()) { + // Allow alias copy ops in CudaFusionGroup + if (n->kind() == prim::CudaFusionGroup) { + continue; + } + // Revert alias copy ops within FallbackGraph + if (n->kind() == prim::FallbackGraph) { + auto subgraph = n->g(attr::Subgraph); + revertAliasCopyOps(subgraph, subgraph->block()); + } + for (Block* b : n->blocks()) { + revertAliasCopyOps(graph, b); + } + // Revert any non-fused alias copy ops + if (copy_to_alias_mapping.find(n->kind()) != copy_to_alias_mapping.end()) { + alias_copy_ops.push_back(n); + } + } + + auto replaceCopyWithAlias = [&graph](Node* n) { + WithInsertPoint guard(n); + auto alias_op = graph->insertNode( + graph->create(copy_to_alias_mapping[n->kind()], n->inputs(), 1)); + alias_op->output()->setType(n->output(0)->type()); + n->output()->replaceAllUsesWith(alias_op->output()); + n->destroy(); + }; + + for (Node* n : alias_copy_ops) { + replaceCopyWithAlias(n); + } +} + // break `conv2d` layer into `conv2d` and `add_optional`. This allows us to fuse // the binary operation without supporting gemm. // Note that we are not breaking `conv2d` layer without bias. 
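// The decomposition that decomposeLinearOps / decomposeConvOps rely on is the
// numerical identity sketched below: the bias term is only a broadcast add,
// which the fuser can take even when it cannot fuse the gemm / conv itself.
// This is an illustrative standalone check, not code from this patch.
#include <torch/torch.h>

void convBiasDecompositionIdentity() {
  auto x = torch::randn({2, 3, 8, 8});
  auto w = torch::randn({4, 3, 3, 3});
  auto b = torch::randn({4});

  auto fused = torch::conv2d(x, w, b);
  // conv without bias, then a broadcast add over the channel dimension; the
  // second half corresponds to the add_optional node the pass emits.
  auto decomposed = torch::conv2d(x, w) + b.view({1, 4, 1, 1});

  TORCH_CHECK(torch::allclose(fused, decomposed, /*rtol=*/1e-5, /*atol=*/1e-6));
}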
@@ -1941,9 +2229,11 @@ void decomposeConvOps(Block* block) {
       auto bias_tensor_type = n->input(2)->type()->cast<TensorType>();
       auto bias_size_opt = bias_tensor_type->sizes().concrete_sizes();
-      TORCH_INTERNAL_ASSERT(
-          bias_size_opt.has_value(),
-          "concrete shape for bias input to conv2d are required");
+      if (!bias_size_opt.has_value()) {
+        TORCH_WARN_ONCE(
+            "concrete shape for bias input is required to decompose into conv + bias");
+        continue;
+      }
       // bias shape (C)
       auto bias_size = bias_size_opt.value();
 
@@ -1994,6 +2284,67 @@ bool removeInplaceOperations(const std::shared_ptr<Graph>& graph) {
       graph, [&](Node* node) { return inplace_ops.count(node->kind()) != 0; });
 }
 
+// Recursively traverse blocks, gather all nodes with given symbol,
+// and then apply mutator function.
+void mutateNode(
+    Block* block,
+    Symbol symbol,
+    const std::function<void(Node*)>& func) {
+  // Recursively call mutateNode on blocks
+  // Gather all nodes with given symbol
+  std::vector<Node*> nodes;
+  for (Node* n : block->nodes()) {
+    for (Block* b : n->blocks()) {
+      mutateNode(b, symbol, func);
+    }
+    if (n->kind() == symbol) {
+      nodes.push_back(n);
+    }
+  }
+
+  // Apply mutator function to every node
+  for (Node* n : nodes) {
+    func(n);
+  }
+}
+
+// For the given CudaFusionGroup, separate nested views and remove any unused,
+// intermediate views
+void separateNestedViews(Node* cuda_fusion_group) {
+  TORCH_INTERNAL_ASSERT(cuda_fusion_group->kind() == prim::CudaFusionGroup);
+
+  auto isView = [](Node* node) {
+    static std::unordered_set<Symbol> alias_op_set(
+        {prim::view_copy, prim::reshape_copy});
+    return alias_op_set.find(node->kind()) != alias_op_set.end();
+  };
+
+  // node -> input / output values
+  auto isNestedView = [&isView](Node* node) {
+    return isView(node) && isView(node->input(0)->node());
+  };
+
+  auto subgraph = cuda_fusion_group->g(attr::Subgraph);
+  for (auto node : subgraph->block()->nodes()) {
+    if (isNestedView(node)) {
+      // grandparent -> (view / reshape) parent -> (view / reshape) node
+      auto parent_value = node->input(0);
+      auto parent = parent_value->node();
+
+      auto grandparent_value = parent->input(0);
+      C10_UNUSED auto grandparent = grandparent_value->node();
+
+      // Before: gp -> x -> n
+      // After: gp -> x / gp -> n
+      // Delete x if no more uses
+      node->replaceInputWith(parent_value, grandparent_value);
+      if (!parent->hasUses()) {
+        parent->destroy();
+      }
+    }
+  }
+}
+
 } // anonymous namespace
 
 void CudaFuseGraph(std::shared_ptr<Graph>& graph) {
@@ -2004,7 +2355,7 @@ void CudaFuseGraph(std::shared_ptr<Graph>& graph) {
   // I don't know how to store edge/node in attribute. so let's abuse data flow
   // dependency and add inputs to conditional constant generated by
   // aten::profile_ivalue
-  traverseProfileIValues(graph->block(), ExtractProfileIValue);
+  mutateNode(graph->block(), prim::profile_ivalue, ExtractProfileIValue);
   GRAPH_DEBUG("insert conditional constant from profile_ivalue: ", *graph);
 
   // TODO: we need to properly restore shape information after fusion.
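// mutateNode above is a generic gather-then-apply traversal; this patch uses
// it for ExtractProfileIValue / RemoveProfileIValue and for
// separateNestedViews. A minimal sketch of another possible use, assuming the
// same translation unit; the helper name dumpAllFusionGroupSubgraphs is
// hypothetical.
void dumpAllFusionGroupSubgraphs(std::shared_ptr<torch::jit::Graph>& graph) {
  using namespace torch::jit;
  mutateNode(graph->block(), prim::CudaFusionGroup, [](Node* fusion) {
    // Nodes are gathered before the functor runs, so the functor may freely
    // restructure the block it visits.
    GRAPH_DEBUG("CudaFusionGroup subgraph: ", *fusion->g(attr::Subgraph));
  });
}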
@@ -2026,12 +2377,16 @@ void CudaFuseGraph(std::shared_ptr& graph) { decomposeConvOps(graph->block()); GRAPH_DEBUG("After decompose decompose Conv Ops by nvfuser: ", *graph); - CudaGraphFuser(graph->block(), graph).run(); + replaceAliasOpsWithCopy(graph, graph->block()); + GRAPH_DEBUG("replace alias_op with alias_copy by nvfuser: ", *graph); + + CudaGraphFuser cgf(graph->block(), graph); + cgf.run(); GRAPH_DEBUG("After Fusion: ", *graph); // guard input types as well as conditional constants from // aten::profile_ivalue - guardFusionGroups(graph->block()); + guardFusionGroups(graph->block(), cgf.fusion_value_to_runtime_shape_); GRAPH_DEBUG("After Guard Fusion: ", *graph); // mutate `aten::_batch_norm_impl_index` and @@ -2040,7 +2395,7 @@ void CudaFuseGraph(std::shared_ptr& graph) { alterBatchNormImpls(graph->block()); GRAPH_DEBUG("After _batch_norm_impl_index: ", *graph); - traverseProfileIValues(graph->block(), RemoveProfileIValue); + mutateNode(graph->block(), prim::profile_ivalue, RemoveProfileIValue); GRAPH_DEBUG("Before remove missing profiling: ", *graph); removeFusionWithMissingProfilingInformation(graph->block()); @@ -2049,6 +2404,16 @@ void CudaFuseGraph(std::shared_ptr& graph) { // optimization targeting AMP removeOutputUsedOnlyInDtype(graph->block()); GRAPH_DEBUG("After removeOutputUsedOnlyInDtype: ", *graph); + + mutateNode(graph->block(), prim::CudaFusionGroup, separateNestedViews); + GRAPH_DEBUG( + "separate nested and delete redundant views in CudaFusionGroup:", *graph); + + revertAliasCopyOps(graph, graph->block()); + GRAPH_DEBUG("revert alias_copy ops by nvfuser: ", *graph); + + dumpFusionGroups(graph); + // After FuseGraph some common subexpressions may come back EliminateCommonSubexpression(graph); // We might have emitted a fair amount of useless shape propagating code, so diff --git a/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp b/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp new file mode 100644 index 000000000000..5931eb3427aa --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp @@ -0,0 +1,210 @@ +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Return if ref and other are transformed in the same way. +bool hasMatchingTransformations(TensorView* ref, TensorView* other) { + std::unordered_map ref_2_other; + for (const auto i : c10::irange(ref->getRootDomain().size())) { + ref_2_other.emplace( + ref->getRootDomain().at(i), other->getRootDomain().at(i)); + } + + auto replay = + BestEffortReplay( + other->domain()->domain(), ref->domain()->domain(), ref_2_other) + .getReplay(); + + for (const auto i : c10::irange(ref->nDims())) { + auto ref_id = ref->axis(i); + auto other_id = other->axis(i); + auto it = replay.find(ref_id); + if (it == replay.end() || it->second != other_id) { + return false; + } + } + + return true; +} + +// Validate grouping of reductions and return a new max producer position +unsigned int validateReductionGrouping( + const std::vector& inputs, + const std::vector& outputs) { + TORCH_INTERNAL_ASSERT(inputs.size() == outputs.size()); + TORCH_INTERNAL_ASSERT(!inputs.empty()); + + auto fusion = dynamic_cast(outputs[0]->container()); + TORCH_INTERNAL_ASSERT( + fusion != nullptr, "Grouping of reductions must be done within a Fusion"); + + ExactRootDomainMap exact_map(fusion); + + // Pick the first output TV as a reference and compare it with the + // rest. Do not allow grouping if any mismatch is detected. 
+ auto ref_tv = outputs[0]->as(); + const auto ref_domain = ref_tv->getRootDomain(); + const auto num_root_dims = ref_domain.size(); + const auto num_dims = ref_tv->nDims(); + const auto ref_ca_pos = ref_tv->getComputeAtPosition(); + auto max_producer_pos = ref_tv->getMaxProducerPosition(); + for (const auto i : c10::irange(inputs.size())) { + auto output_tv = outputs.at(i)->as(); + const auto& output_domain = output_tv->getRootDomain(); + if (ref_tv == output_tv) { + continue; + } + TORCH_INTERNAL_ASSERT( + output_domain.size() == num_root_dims, + "Invalid grouped reduction due to mismatched number of root dimensions. " + "Expected: ", + num_root_dims, + ". Detected: ", + output_domain.size(), + ". Invalid output tensor: ", + output_tv->toString()); + TORCH_INTERNAL_ASSERT( + output_tv->nDims() == num_dims, + "Invalid grouped reduction due to mismatched number of dimensions. " + "Expected: ", + num_dims, + ". Detected: ", + output_tv->nDims(), + ". Invalid output tensor: ", + output_tv->toString()); + for (const auto i : c10::irange(num_root_dims)) { + auto ref_id = ref_domain.at(i); + auto output_id = output_domain.at(i); + // If an IterDomain is broadcast, require the other + // corresponding IterDomains are also broadcast. This may not be + // necessary but not completely certain. + TORCH_INTERNAL_ASSERT( + ref_id->isBroadcast() == output_id->isBroadcast(), + "Invalid grouped reduction due to mismatched broadcast root domains. ", + "Reference domain: ", + ref_id->toString(), + ". Mismatched domain: ", + output_id->toString(), + ". Invalid tensor: ", + output_tv->toString()); + if (ref_id->isBroadcast()) { + continue; + } + TORCH_INTERNAL_ASSERT( + ref_id->isReduction() == output_id->isReduction(), + "Invalid grouped reduction due to mismatched reduction root domains. ", + "Reference domain: ", + ref_id->toString(), + ". Mismatched domain: ", + output_id->toString(), + ". Invalid tensor: ", + output_tv->toString()); + TORCH_INTERNAL_ASSERT( + exact_map.areMapped(ref_id, output_id) || ref_id->sameAs(output_id), + "Invalid grouped reduction due to mismatched root domains. ", + "Reference domain: ", + ref_id->toString(), + ". Mismatched domain: ", + output_id->toString(), + ". Invalid tensor: ", + output_tv->toString()); + } + + TORCH_INTERNAL_ASSERT( + hasMatchingTransformations(ref_tv, output_tv), + "Invalid grouped reduction due to mismatched transformations. ", + "Reference tensor: ", + ref_tv->toString(), + ". Mismatched tensor: ", + output_tv->toString()); + + // Must have the same computeAt position + TORCH_INTERNAL_ASSERT( + output_tv->getComputeAtPosition() == ref_ca_pos, + "Invalid grouped reduction due to mismatched computeAt position. ", + "Reference tensor: ", + ref_tv->toString(), + ". 
Mismatched tensor: ", + output_tv->toString()); + + max_producer_pos = + std::max(max_producer_pos, output_tv->getMaxProducerPosition()); + } + + // Must not have any data dependency from outputs to inputs + const auto all_dep_vals = DependencyCheck::getAllValsBetween( + {outputs.begin(), outputs.end()}, inputs); + if (!all_dep_vals.empty()) { + std::stringstream ss; + ss << "Invalid dependency:"; + for (auto val : all_dep_vals) { + ss << " " << val->toString(); + } + TORCH_INTERNAL_ASSERT(all_dep_vals.empty(), ss.str()); + } + + return max_producer_pos; +} + +} // namespace + +void groupReductions(const std::vector& reduction_outputs) { + TORCH_CHECK(!reduction_outputs.empty(), "No tensor is given"); + + auto container = reduction_outputs[0]->container(); + + const auto num_reductions = reduction_outputs.size(); + + std::vector op_types(num_reductions); + std::vector init_vals(num_reductions); + std::vector outputs(num_reductions); + std::vector inputs(num_reductions); + + for (const auto i : c10::irange(num_reductions)) { + auto reduction_out = reduction_outputs.at(i); + TORCH_CHECK( + reduction_out->definition() != nullptr, + "Invalid tensor to group: ", + reduction_out->toString(), + ". Definition not found"); + auto rop = dynamic_cast(reduction_out->definition()); + TORCH_CHECK( + rop != nullptr, + "Invalid tensor to group: ", + reduction_out->toString(), + ". Not an output of a ReductionOp: ", + reduction_out->definition()->toString()); + // Fused reduction is only enabled during the lowering, so at this + // point it should be false. + TORCH_INTERNAL_ASSERT( + !rop->isAllreduce(), "Invalid ReductionOp: ", rop->toString()); + op_types.at(i) = rop->getReductionOpType(); + init_vals.at(i) = rop->init(); + outputs.at(i) = rop->out(); + inputs.at(i) = rop->in(); + } + + auto max_producer_pos = validateReductionGrouping(inputs, outputs); + + for (auto output : ir_utils::filterByType(outputs)) { + output->setMaxProducer(max_producer_pos); + } + + IrBuilder::create( + container, op_types, init_vals, outputs, inputs); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/grouped_reduction.h b/torch/csrc/jit/codegen/cuda/grouped_reduction.h new file mode 100644 index 000000000000..39e6e0850e67 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/grouped_reduction.h @@ -0,0 +1,37 @@ +#pragma once + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Horizontally fuse multiple reductions. +//! +//! Given a list of tensors produced by ReductionOp, create a new +//! GroupedReductionOp expression that takes the input tensors of the +//! original reductions and produces the given tensors, replacing +//! their defining expressions. +//! +//! GroupedReductionOp works just like ReductionOp with a potential +//! benefit of aggregating synchronizations across individual +//! reductions. See the reduction::gridReduce2 runtime function for a +//! two-input version of grid reduction. +//! +//! The grouped reductions must follow several constraints, which +//! include: +//! - There must not exist any data dependency between individual +//! reductions. +//! - All reduction output tensors must have the same number of +//! dimensions, the same transformations and the same axes to +//! reduce. +//! +//! 
\param reduction_outputs Tensors produced by ReductionOp +TORCH_CUDA_CU_API void groupReductions( + const std::vector& reduction_outputs); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index 39176a60c537..a000dca87a15 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,13 +11,13 @@ #include #include #include -#include -#include #include +#include #include #include #include #include +#include #include #include #include @@ -28,212 +29,6 @@ namespace cuda { namespace { -// A merge is contiguous if: -// Inputs of outer are to the left in the root domain of the inputs of RHS. -// All inputs are contiguous in the root domain: -// - All marked as contiguous -// - Only gaps between inputs are broadcast or reductoin dims -// There are no split transformations performed on outer or inner -// All transformations on outer or inner are contiguous merges -// If this criteria holds, then we can index the input root domains of this -// merge with the indexing provided to the output of the merge in the backward -// index pass - -class ContigIDs : public OptInDispatch { - private: - using OptInDispatch::handle; - - // Mark if ids are result of contigous merges - std::unordered_set contig_ids; - // Given contiguous domain, return all iter domains within its history. - std::unordered_map> - within_contig_ids; - const std::vector& root_domain_; - const std::vector& root_contiguity_; - std::unordered_map is_contig_root; - - bool inRoot(const std::vector& ids) { - return std::all_of(ids.begin(), ids.end(), [this](IterDomain* id) { - return is_contig_root.find(id) != is_contig_root.end(); - }); - } - - bool isContig(kir::IterDomain* id) { - return contig_ids.find(id) != contig_ids.end(); - } - - // Split outputs are not contiguous, don't need to do anything. - void handle(Split*) override {} - - void handle(Merge* merge) override { - const auto gpu_lower = GpuLower::current(); - - // If either input is non-contiguous so is output. - const auto inner = merge->inner(); - const auto outer = merge->outer(); - - if ((!isContig(gpu_lower->lowerValue(inner)->as()) || - !isContig(gpu_lower->lowerValue(outer)->as()))) { - return; - } - - // Grab inputs, make sure they're in root domain, check if they're - // contiguous. 
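// Returning to the groupReductions API declared in grouped_reduction.h above,
// a rough usage sketch under the stated constraints: two reductions of the
// same producer over the same axis, with no dependency between them. The
// TensorViewBuilder helper and the include set are assumed from the nvfuser
// C++ API and may differ in detail.
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>

using namespace torch::jit::fuser::cuda;

void groupTwoReductionsOfSameInput() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // A single symbolic 2-D float input.
  TensorView* tv0 =
      TensorViewBuilder().ndims(2).dtype(DataType::Float).build();
  fusion.addInput(tv0);

  // Two independent reductions of the same producer over the same axis; their
  // root domains are exactly mapped and neither depends on the other.
  TensorView* tv1 = sum(tv0, {1});
  TensorView* tv2 = sum(tv0, {1});
  fusion.addOutput(tv1);
  fusion.addOutput(tv2);

  // Replaces the two ReductionOps with a single GroupedReductionOp so their
  // synchronizations can be aggregated during lowering.
  groupReductions({tv1, tv2});
}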
- - auto lhs_inputs = - ir_utils::iterDomainInputsOfOrderedAs({outer}, root_domain_); - auto rhs_inputs = - ir_utils::iterDomainInputsOfOrderedAs({inner}, root_domain_); - - TORCH_INTERNAL_ASSERT( - inRoot(lhs_inputs) && inRoot(rhs_inputs), - "Found an invalid merge operation, inputs of its arguments are not in the root domain."); - - std::deque ordered_inputs( - lhs_inputs.begin(), lhs_inputs.end()); - ordered_inputs.insert( - ordered_inputs.end(), rhs_inputs.begin(), rhs_inputs.end()); - - // If any root input is not contig, output is not contig - if (!(std::all_of( - ordered_inputs.begin(), - ordered_inputs.end(), - [this](IterDomain* id) { - return is_contig_root.at(id) && !id->isBroadcast() && - !id->isReduction(); - }))) { - return; - } - - std::deque root_copy(root_domain_.begin(), root_domain_.end()); - - // Forward to first matching argument - while (!root_copy.empty() && !ordered_inputs.empty()) { - if (root_copy.front() != ordered_inputs.front()) { - root_copy.pop_front(); - } else { - break; - } - } - - // Forward through all matching arguments - while (!root_copy.empty() && !ordered_inputs.empty()) { - if (root_copy.front() == ordered_inputs.front()) { - root_copy.pop_front(); - ordered_inputs.pop_front(); - // This is no longer causing an error in: - // ReductionSchedulerMultiDimNonFastest TODO: test reenablement to make - // sure it does what's expected - // } else if ( - // root_copy.front()->isReduction() || - // root_copy.front()->isBroadcast()) { - // root_copy.pop_front(); - } else { - break; - } - } - - // If we matched all inputs, the output is contiguous. Only want to keep the - // top contig ID, lower ids should be placed in the "within_contig_ids" map - // of top id. - auto kir_inner = - gpu_lower->lowerValue(merge->inner())->as(); - auto kir_outer = - gpu_lower->lowerValue(merge->outer())->as(); - auto kir_out = gpu_lower->lowerValue(merge->out())->as(); - if (ordered_inputs.empty()) { - if (contig_ids.find(kir_inner) != contig_ids.end()) { - contig_ids.erase(kir_inner); - } - - if (contig_ids.find(kir_outer) != contig_ids.end()) { - contig_ids.erase(kir_outer); - } - - contig_ids.emplace(kir_out); - - std::unordered_set within_out; - within_out.emplace(kir_inner); - if (within_contig_ids.find(kir_inner) != within_contig_ids.end()) { - auto in_inner = within_contig_ids.at(kir_inner); - within_out.insert(in_inner.begin(), in_inner.end()); - within_contig_ids.erase(kir_inner); - } - - within_out.emplace(kir_outer); - if (within_contig_ids.find(kir_outer) != within_contig_ids.end()) { - auto in_outer = within_contig_ids.at(kir_outer); - within_out.insert(in_outer.begin(), in_outer.end()); - within_contig_ids.erase(kir_outer); - } - - within_contig_ids[kir_out] = within_out; - } - } - - public: - ContigIDs() = delete; - - // Check through the history of ids whose inputs map to root_domain with - // contiguity root_contiguity. Return unordered_set of all merges that are - // contiguous. Ignore root order is primarily used for predicate generation. - // In this case we can linearize indexing of any ID that only consists of - // merge operations. 
- ContigIDs( - const std::vector& ids, - const std::vector& root_domain, - const std::vector& root_contiguity) - : root_domain_(root_domain), root_contiguity_(root_contiguity) { - if (ids.empty()) { - return; - } - - TORCH_INTERNAL_ASSERT( - root_domain_.size() == root_contiguity_.size(), - "Arguments don't match ", - root_domain_.size(), - " != ", - root_contiguity_.size()); - - const auto gpu_lower = GpuLower::current(); - - for (const auto i : c10::irange(root_domain_.size())) { - // If a root domain has halo, can't use merged domain even if - // both inputs are contiguous. HaloInfo is also initialized for - // rfactor root domains, which should just return "zero" - // RootAxisInfo. This should be safe as no rfactor tensor should - // need halo. - if (root_contiguity_[i] && - !gpu_lower->haloInfo().getRootAxisInfo(root_domain_[i]).hasHalo()) { - auto kir_root_domain_i = - gpu_lower->lowerValue(root_domain_[i])->as(); - contig_ids.emplace(kir_root_domain_i); - within_contig_ids[kir_root_domain_i] = - std::unordered_set(); - is_contig_root[root_domain_[i]] = true; - } else { - is_contig_root[root_domain_[i]] = false; - } - } - - auto exprs = ExprSort::getExprs(ids[0]->fusion(), {ids.begin(), ids.end()}); - - for (auto expr : exprs) { - handle(expr); - } - } - - const std::unordered_set contigIDs() const { - return contig_ids; - } - - const std:: - unordered_map> - withinContigIDs() const { - return within_contig_ids; - } -}; - // Update the HaloInfo mappings for a reference tensor by propagating // the halo information from the consumer tensor. void updateHaloInfoForReference( @@ -248,8 +43,8 @@ void updateHaloInfoForReference( // First, propagate the halo information of the consumer root domain // to the reference root domain. for (auto consumer_root_id : consumer_tv->getRootDomain()) { - auto consumer_index_concrete_id = - gpu_lower->caIndexMap().getConcreteMappedID(consumer_root_id); + auto consumer_index_concrete_id = gpu_lower->caMap()->getConcreteMappedID( + consumer_root_id, IdMappingMode::EXACT); auto reference_it = reference.concrete_to_id.find(consumer_index_concrete_id); if (reference_it == reference.concrete_to_id.end()) { @@ -276,21 +71,18 @@ void updateHaloInfoForReference( // // ref_map: ref-to-consumer in consumer indexing; ref-to-producer in // producer indexing -std::unordered_map getReferenceHaloExtentMap( +std::unordered_map getReferenceHaloExtentMap( const ReferenceTensor& reference, const std::unordered_map& index_map_from_ref) { - const auto gpu_lower = GpuLower::current(); - - const auto& halo_info = gpu_lower->haloInfo(); + const auto& halo_info = GpuLower::current()->haloInfo(); - std::unordered_map reference_halo_extent_map; + std::unordered_map reference_halo_extent_map; // Propagate halo extents of the reference to the consumer or // producer tensor for (auto kv : index_map_from_ref) { - auto ref_id = gpu_lower->lowerValue(kv.first)->as(); - auto producer_or_consumer_id = - gpu_lower->lowerValue(kv.second)->as(); + auto ref_id = kv.first; + auto producer_or_consumer_id = kv.second; auto extent = halo_info.getExtent(ref_id); if (extent != nullptr) { reference_halo_extent_map[producer_or_consumer_id] = extent; @@ -302,7 +94,7 @@ std::unordered_map getReferenceHaloExtentMap( //! Offset of an index of a producer axis with respect to its //! 
corresponding consumer index -kir::Val* getProducerHaloOffset( +int getProducerHaloOffset( const TensorView* producer_tv, size_t producer_axis, const TensorView* consumer_tv) { @@ -325,41 +117,31 @@ kir::Val* getProducerHaloOffset( const auto p_pad = halo_map.getRootAxisInfo(producer_id).width(0); const auto c_pad = halo_map.getRootAxisInfo(consumer_id).width(0); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - kir::Val* offset = (p_pad->isConst() && c_pad->isConst()) - ? ir_builder.create( - p_pad->value().value() - c_pad->value().value()) - : ir_builder.subExpr(p_pad, c_pad); + auto offset = p_pad - c_pad; // If the consumer is a result of shifting the producer, adjust the // producer index per the offsets argument of the shift op. if (auto shift_op = dynamic_cast(consumer_tv->definition())) { - offset = ir_builder.subExpr( - offset, ir_builder.create(shift_op->offset(producer_axis))); + offset -= shift_op->offset(producer_axis); } return offset; } //! Offset producer index when necessary -kir::Val* getProducerIndexWithHalo( +Val* getProducerIndexWithHalo( const TensorView* producer_tv, size_t producer_axis, - kir::Val* producer_index, + Val* producer_index, const TensorView* consumer_tv) { const auto offset = getProducerHaloOffset(producer_tv, producer_axis, consumer_tv); - if (offset->isZeroInt()) { + if (offset == 0) { return producer_index; } - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - producer_index = ir_builder.addExpr(producer_index, offset); + producer_index = SimplifyingIrBuilder::addExpr(producer_index, offset); return producer_index; } @@ -368,58 +150,58 @@ kir::Val* getProducerIndexWithHalo( //! //! \param consumer_root_axis Position of corresponding consumer axis //! \param consumer_tv Consumer TensorView +//! \param index_map Mappings from consumer or reference to indices +//! \param use_reference_map True when index_map maps reference domains //! \param concrete_to_ref_map Mappings from concrete to reference domains -//! \param ref_index_map Mappings from reference domains to indices -kir::Val* getProducerOffsetWithGather( +Val* getProducerOffsetWithGather( size_t consumer_root_axis, const TensorView* consumer_tv, - const std::unordered_map& concrete_to_ref_map, - const std::unordered_map& ref_index_map) { + const std::unordered_map& index_map, + bool use_reference_map = false, + const std::unordered_map& concrete_to_ref_map = + {}) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); const auto gather_expr = dynamic_cast(consumer_tv->definition()); if (gather_expr == nullptr) { - return ir_builder.zeroVal(); + return gpu_lower->kernel()->zeroVal(); } // If the window extent is one, no specific offsetting // is necessary if (consumer_root_axis >= gather_expr->windowShape().size() || - gather_expr->windowShape()[consumer_root_axis]->isOneInt()) { - return ir_builder.zeroVal(); + gather_expr->windowShape()[consumer_root_axis] == 1) { + return gpu_lower->kernel()->zeroVal(); } // Basically, the goal is to build an expression of producer_index + // window_index, so we first need to locate the index expression // that corresponds to the window axis of this producer axis. 
- // Locate the root IterDomain of the reference that corresponds to the gather - // axis const auto window_axis = gather_expr->gatherAxis(consumer_root_axis); auto window_id = consumer_tv->getRootDomain().at(window_axis); - auto concrete_window_id = - gpu_lower->caIndexMap().getConcreteMappedID(window_id); - auto concrete_2_ref_it = concrete_to_ref_map.find(concrete_window_id); - TORCH_INTERNAL_ASSERT(concrete_2_ref_it != concrete_to_ref_map.end()); - IterDomain* reference_root_of_gather_axis = concrete_2_ref_it->second; - - // Now that reference_root_of_gather_axis is the IterDomain for the - // window axis, take its corresponding index from the index map - auto window_idx = - ref_index_map.at(gpu_lower->lowerValue(reference_root_of_gather_axis) - ->as()); - - // Positive (or negative) padding at offset zero means the indexing - // shifted to the negative (or positive) direction. + + // When index_map maps a reference tensor, find the corresponding + // reference ID of window_id. + if (use_reference_map) { + auto concrete_window_id = gpu_lower->caMap()->getConcreteMappedID( + window_id, IdMappingMode::EXACT); + auto concrete_2_ref_it = concrete_to_ref_map.find(concrete_window_id); + TORCH_INTERNAL_ASSERT(concrete_2_ref_it != concrete_to_ref_map.end()); + window_id = concrete_2_ref_it->second; + } + + auto window_idx = index_map.at(window_id); + + // Positive padding at offset zero means the indexing shifted to the + // negative direction. auto pad_width = gather_expr->padWidth()[consumer_root_axis][0]; // producer offset: window_index - padding - auto producer_offset = - ir_builder.subExpr(window_idx, ir_builder.create(pad_width)); + auto producer_offset = SimplifyingIrBuilder::subExpr( + window_idx, SimplifyingIrBuilder::create(pad_width)); return producer_offset; - ; } //! Offset a producer index of a gather expression @@ -428,13 +210,13 @@ kir::Val* getProducerOffsetWithGather( //! expression that accesses a window position that the current loop //! structure refers to. Use getGatherProducerOffset to create an //! offset Val. -kir::Val* getProducerIndexWithGather( - kir::Val* producer_index, +Val* getProducerIndexWithGather( + Val* producer_index, size_t producer_root_axis, const TensorView* producer_tv, const TensorView* consumer_tv, const std::unordered_map& concrete_to_ref_map, - const std::unordered_map& ref_index_map) { + const std::unordered_map& ref_index_map) { auto gather_op = dynamic_cast(consumer_tv->definition()); // Just return the producer index as is if this is not a gather @@ -460,22 +242,18 @@ kir::Val* getProducerIndexWithGather( ", producer_axis: ", producer_root_axis); - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); auto offset = getProducerOffsetWithGather( - consumer_axis, consumer_tv, concrete_to_ref_map, ref_index_map); - return ir_builder.addExpr(producer_index, offset); + consumer_axis, consumer_tv, ref_index_map, true, concrete_to_ref_map); + return SimplifyingIrBuilder::addExpr(producer_index, offset); } // Adjusts a global consumer index when its root domain is partially // split. Note that non-global consumer indices don't need any // adjustment. 
-kir::Val* getGlobalConsumerOffsetWithPartialSplit(kir::IterDomain* root_id) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto offset = gpu_lower->partialSplitMap().getStartOffset(root_id); +Val* getGlobalConsumerOffsetWithPartialSplit(IterDomain* root_id) { + auto offset = GpuLower::current()->partialSplitMap().getStartOffset(root_id); if (offset == nullptr) { - return ir_builder.zeroVal(); + return GpuLower::current()->kernel()->zeroVal(); } else { return offset; } @@ -488,13 +266,12 @@ kir::Val* getGlobalConsumerOffsetWithPartialSplit(kir::IterDomain* root_id) { // it needs to be added to the index. Also, when the producer itself // also has a non-zero split offset, that needs to be subtracted from // the index. -kir::Val* getProducerIndexWithPartialSplit( - kir::Val* producer_index, +Val* getProducerIndexWithPartialSplit( + Val* producer_index, IterDomain* producer_root_id, const TensorView* producer_tv, const TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); auto p2c = PairwiseRootDomainMap(producer_tv, consumer_tv) @@ -509,31 +286,29 @@ kir::Val* getProducerIndexWithPartialSplit( auto consumer_offset = gpu_lower->partialSplitMap().getStartOffset(consumer_root_id); - auto consumer_offset_kir = consumer_offset == nullptr - ? ir_builder.zeroVal() - : gpu_lower->lowerValue(consumer_offset); + consumer_offset = consumer_offset == nullptr ? gpu_lower->kernel()->zeroVal() + : consumer_offset; auto producer_offset = gpu_lower->partialSplitMap().getStartOffset(producer_root_id); - auto producer_offset_kir = producer_offset == nullptr - ? ir_builder.zeroVal() - : gpu_lower->lowerValue(producer_offset); + producer_offset = producer_offset == nullptr ? gpu_lower->kernel()->zeroVal() + : producer_offset; // If the producer is on global memory, it's always allocated // without trimming the out-of-bounds region, so the consumer offset // should be added to the index. if (producer_tv->getMemoryType() == MemoryType::Global) { - if (consumer_offset_kir->isZeroInt()) { + if (consumer_offset->isZeroInt()) { return producer_index; } else { - return ir_builder.addExpr(producer_index, consumer_offset_kir); + return SimplifyingIrBuilder::addExpr(producer_index, consumer_offset); } } // Non-global case. Difference of the split offsets must be // accounted. 
- auto diff = ir_builder.subExpr(consumer_offset_kir, producer_offset_kir); + auto diff = SimplifyingIrBuilder::subExpr(consumer_offset, producer_offset); kir::ExpressionEvaluator ee; auto diff_eval = ee.evaluate(diff); // We currently only allow constant offsetting @@ -543,19 +318,16 @@ kir::Val* getProducerIndexWithPartialSplit( return producer_index; } - return ir_builder.addExpr( - producer_index, ir_builder.create(diff_eval.value())); + return SimplifyingIrBuilder::addExpr( + producer_index, SimplifyingIrBuilder::create(diff_eval.value())); } } // namespace void IndexCompute::handle(Split* split) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto in_id = gpu_lower->lowerValue(split->in())->as(); - auto outer_id = gpu_lower->lowerValue(split->outer())->as(); - auto inner_id = gpu_lower->lowerValue(split->inner())->as(); + auto in_id = split->in()->as(); + auto outer_id = split->outer()->as(); + auto inner_id = split->inner()->as(); auto outer_it = index_map_.find(outer_id); auto inner_it = index_map_.find(inner_id); @@ -588,8 +360,8 @@ void IndexCompute::handle(Split* split) { } if (isZero(in_id)) { - index_map_[in_id] = ir_builder.create(0); - extent_map_[in_id] = ir_builder.create(0); + index_map_[in_id] = GpuLower::current()->kernel()->zeroVal(); + extent_map_[in_id] = GpuLower::current()->kernel()->zeroVal(); } else if (zero_merged_in && outer_zero) { index_map_[in_id] = inner_ind; extent_map_[in_id] = getExtent(inner_id); @@ -597,24 +369,22 @@ void IndexCompute::handle(Split* split) { index_map_[in_id] = outer_ind; extent_map_[in_id] = getExtent(outer_id); } else { - index_map_[in_id] = ir_builder.addExpr( - ir_builder.mulExpr(outer_ind, getExtent(inner_id)), inner_ind); + index_map_[in_id] = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr(outer_ind, getExtent(inner_id)), + inner_ind); // The extent should be updated only when its allocation is // partial, i.e., zero_merged_in is true. See PR #1270. if (zero_merged_in) { - extent_map_[in_id] = - ir_builder.mulExpr(getExtent(outer_id), getExtent(inner_id)); + extent_map_[in_id] = SimplifyingIrBuilder::mulExpr( + getExtent(outer_id), getExtent(inner_id)); } } } void IndexCompute::handle(Merge* merge) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto out_id = gpu_lower->lowerValue(merge->out())->as(); - auto outer_id = gpu_lower->lowerValue(merge->outer())->as(); - auto inner_id = gpu_lower->lowerValue(merge->inner())->as(); + auto out_id = merge->out(); + auto outer_id = merge->outer(); + auto inner_id = merge->inner(); auto out_it = index_map_.find(out_id); if (out_it == index_map_.end()) { @@ -622,7 +392,7 @@ void IndexCompute::handle(Merge* merge) { } auto out_ind = out_it->second; - auto zero = ir_builder.zeroVal(); + auto zero = GpuLower::current()->kernel()->zeroVal(); if (isZero(out_id)) { index_map_[outer_id] = zero; @@ -634,7 +404,7 @@ void IndexCompute::handle(Merge* merge) { return; } - if (!hasZeroMerged(out_id) && contig_ids.find(out_id) != contig_ids.end()) { + if (!hasZeroMerged(out_id) && contig_ids_.find(out_id) != contig_ids_.end()) { // Contiguous indexing path auto input_ids = ir_utils::iterDomainInputsOfOrderedAs( {merge->out()}, td_->getMaybeRFactorDomain()); @@ -642,18 +412,40 @@ void IndexCompute::handle(Merge* merge) { // Shouldn't hit this, but don't want to segfault if somehow we do. 
TORCH_INTERNAL_ASSERT(!input_ids.empty()); + // Try to find the last non broadcast entry to put the index in if it's a + // contiguous merge. This isn't strictly necessary but there's implicit + // assumptions in the indexing logic that assume broadcasted root domains + // can be ignored. This logic is just to try and match that logic. + // Initialize everything to zero. for (auto root_id : input_ids) { - index_map_[gpu_lower->lowerValue(root_id)->as()] = zero; + index_map_[root_id] = zero; + } + + // If all are broadcast we can just send the index to the last entry. + if (std::all_of(input_ids.begin(), input_ids.end(), [](IterDomain* id) { + // I don't think reductions can be in here, but strictly matching the + // logic in the indexing functions like + // getNonGlobalConsumerStridedIndices + return id->isBroadcast() || id->isReduction() || id->isStride(); + })) { + index_map_[*(input_ids.end() - 1)] = out_ind; + } else { + for (auto id_it = input_ids.rbegin(); id_it != input_ids.rend(); + id_it++) { + auto id = *id_it; + if (id->isBroadcast() || id->isReduction() || id->isStride()) { + continue; + } else { + index_map_[id] = out_ind; + break; + } + } } - index_map_[gpu_lower - ->lowerValue(*(input_ids.end() - 1)) - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - ->as()] = out_ind; return; } - kir::Val* inner_extent = getExtent(inner_id); + Val* inner_extent = getExtent(inner_id); // When the reference has halo extent for inner_id, that extent needs to // be used to un-merge @@ -718,8 +510,8 @@ void IndexCompute::handle(Merge* merge) { zero_merged_in_.emplace(inner_id); zero_merged_in_.emplace(outer_id); } else { - index_map_[outer_id] = ir_builder.divExpr(out_ind, inner_extent); - index_map_[inner_id] = ir_builder.modExpr(out_ind, inner_extent); + index_map_[outer_id] = SimplifyingIrBuilder::divExpr(out_ind, inner_extent); + index_map_[inner_id] = SimplifyingIrBuilder::modExpr(out_ind, inner_extent); } } @@ -735,17 +527,37 @@ void IndexCompute::handle(Expr* e) { BackwardVisitor::handle(e); } -// Otherwise warning on runBackward as it hides an overloaded virtual -// using TransformIter::runBackward; IndexCompute::IndexCompute( const TensorDomain* _td, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in, - const std::vector& root_contiguity, - std::unordered_set preferred_paths, - std::unordered_map reference_halo_extent_map) + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in, + std::unordered_set preferred_paths, + std::unordered_map reference_halo_extent_map) + : IndexCompute( + _td, + std::move(initial_index_map), + std::move(extent_map), + std::move(zero_domains), + std::move(zero_merged_in), + ContigIDs( + _td->domain(), + _td->getMaybeRFactorDomain(), + std::vector(_td->getMaybeRFactorDomain().size(), false), + {}), + std::move(preferred_paths), + std::move(reference_halo_extent_map)) {} + +IndexCompute::IndexCompute( + const TensorDomain* _td, + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in, + const ContigIDs& contig_finder, + std::unordered_set preferred_paths, + std::unordered_map reference_halo_extent_map) : td_(_td), index_map_(std::move(initial_index_map)), extent_map_(std::move(extent_map)), @@ -757,20 +569,15 @@ IndexCompute::IndexCompute( // Make sure we recompute any indices we can 
that map to a contiguous access // in physical memory. - if (std::any_of(root_contiguity.begin(), root_contiguity.end(), [](bool b) { - return b; - })) { - ContigIDs contig_finder( - td_->domain(), td_->getMaybeRFactorDomain(), root_contiguity); - contig_ids = contig_finder.contigIDs(); - auto within_contig = contig_finder.withinContigIDs(); - for (auto contig_id : contig_ids) { - if (index_map_.find(contig_id) != index_map_.end()) { - TORCH_INTERNAL_ASSERT( - within_contig.find(contig_id) != within_contig.end()); - for (auto id : within_contig.at(contig_id)) { - index_map_.erase(id); - } + contig_ids_ = contig_finder.contigIDs(); + root_to_indexed_id_ = contig_finder.rootToIndexedID(); + const auto& within_contig = contig_finder.withinContigIDs(); + for (auto contig_id : contig_ids_) { + if (index_map_.find(contig_id) != index_map_.end()) { + TORCH_INTERNAL_ASSERT( + within_contig.find(contig_id) != within_contig.end()); + for (auto id : within_contig.at(contig_id)) { + index_map_.erase(id); } } } @@ -783,7 +590,7 @@ void IndexCompute::run() { traverseFrom(td_->fusion(), domain_vals, false); } -kir::Val* IndexCompute::getExtent(kir::IterDomain* id) { +Val* IndexCompute::getExtent(IterDomain* id) const { // Pick from extent_map_ if available. Previously parallel // dimensions were ued (e.g., blockDim.x), however, it would result // in out-of-bounds errors when the extent of IterDomain is smaller @@ -795,34 +602,30 @@ kir::Val* IndexCompute::getExtent(kir::IterDomain* id) { } } -bool IndexCompute::hasZeroMerged(kir::IterDomain* id) const { +bool IndexCompute::hasZeroMerged(IterDomain* id) const { return zero_merged_in_.find(id) != zero_merged_in_.end() || isZero(id); } -bool IndexCompute::isZero(kir::IterDomain* id) const { +bool IndexCompute::isZero(IterDomain* id) const { return zero_domains_.find(id) != zero_domains_.end(); } IndexCompute IndexCompute::updateIndexCompute( const TensorDomain* new_td, const std::unordered_map& id_map, - const std::vector& root_contiguity, - const std::unordered_map& - reference_halo_extent_map) { + const ContigIDs& contig_finder, + const std::unordered_map& reference_halo_extent_map) + const { FUSER_PERF_SCOPE("GpuLower::Lower::updateIndexCompute"); - const auto gpu_lower = GpuLower::current(); - - std::unordered_map updated_index_map; - std::unordered_map updated_extent_map; - std::unordered_set updated_zero_domains; - std::unordered_set updated_zero_merged_in; + std::unordered_map updated_index_map; + std::unordered_map updated_extent_map; + std::unordered_set updated_zero_domains; + std::unordered_set updated_zero_merged_in; for (auto id_entry : id_map) { - kir::IterDomain* prev_id = - gpu_lower->lowerValue(id_entry.first)->as(); - kir::IterDomain* new_id = - gpu_lower->lowerValue(id_entry.second)->as(); + IterDomain* prev_id = id_entry.first; + IterDomain* new_id = id_entry.second; if (index_map_.find(prev_id) != index_map_.end()) { updated_index_map[new_id] = index_map_.at(prev_id); @@ -845,7 +648,7 @@ IndexCompute IndexCompute::updateIndexCompute( updated_extent_map, updated_zero_domains, updated_zero_merged_in, - root_contiguity, + contig_finder, {}, reference_halo_extent_map); updated_index_compute.run(); @@ -859,8 +662,8 @@ class UpdateLeafIndices : public IterVisitor { public: UpdateLeafIndices( const TensorDomain* td, - std::unordered_map initial_index_map, - std::unordered_map extent_map) + std::unordered_map initial_index_map, + std::unordered_map extent_map) : td_(td), index_map_(std::move(initial_index_map)), 
extent_map_(std::move(extent_map)) { @@ -870,11 +673,11 @@ class UpdateLeafIndices : public IterVisitor { traverseFrom(td_->fusion(), domain_vals, false); } - const std::unordered_map& indexMap() const { + const std::unordered_map& indexMap() const { return index_map_; } - const std::unordered_map& extentMap() const { + const std::unordered_map& extentMap() const { return extent_map_; } @@ -882,13 +685,9 @@ class UpdateLeafIndices : public IterVisitor { using IterVisitor::handle; void handle(Split* split) override { - const auto gpu_lower = GpuLower::current(); - - auto in_id = gpu_lower->lowerValue(split->in())->as(); - auto outer_id = - gpu_lower->lowerValue(split->outer())->as(); - auto inner_id = - gpu_lower->lowerValue(split->inner())->as(); + auto in_id = split->in(); + auto outer_id = split->outer(); + auto inner_id = split->inner(); // Nothing need to be done when mappings for the output axes // already exist. @@ -899,22 +698,20 @@ class UpdateLeafIndices : public IterVisitor { return; } - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto factor = gpu_lower->lowerValue(split->factor()); - index_map_[inner_id] = ir_builder.modExpr(index_map_[in_id], factor); + auto factor = split->factor(); + index_map_[inner_id] = + SimplifyingIrBuilder::modExpr(index_map_[in_id], factor); extent_map_[inner_id] = factor; - index_map_[outer_id] = ir_builder.divExpr(index_map_[in_id], factor); - extent_map_[outer_id] = ir_builder.ceilDivExpr(getExtent(in_id), factor); + index_map_[outer_id] = + SimplifyingIrBuilder::divExpr(index_map_[in_id], factor); + extent_map_[outer_id] = + SimplifyingIrBuilder::ceilDivExpr(getExtent(in_id), factor); } void handle(Merge* merge) override { - const auto gpu_lower = GpuLower::current(); - - auto out_id = gpu_lower->lowerValue(merge->out())->as(); - auto outer_id = - gpu_lower->lowerValue(merge->outer())->as(); - auto inner_id = - gpu_lower->lowerValue(merge->inner())->as(); + auto out_id = merge->out(); + auto outer_id = merge->outer(); + auto inner_id = merge->inner(); // Nothing need to be done when mappings for the output axes // already exist. @@ -927,17 +724,17 @@ class UpdateLeafIndices : public IterVisitor { TORCH_INTERNAL_ASSERT( index_map_.find(inner_id) != index_map_.end(), "Inner ID not found"); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - index_map_[out_id] = ir_builder.mulExpr( + index_map_[out_id] = SimplifyingIrBuilder::mulExpr( index_map_[inner_id], - ir_builder.mulExpr(index_map_[outer_id], getExtent(inner_id))); + SimplifyingIrBuilder::mulExpr( + index_map_[outer_id], getExtent(inner_id))); extent_map_[out_id] = - ir_builder.mulExpr(getExtent(outer_id), getExtent(inner_id)); + SimplifyingIrBuilder::mulExpr(getExtent(outer_id), getExtent(inner_id)); } // return extent_map_[id] if exists, else return id->extent() - kir::Val* getExtent(kir::IterDomain* id) { + Val* getExtent(IterDomain* id) { if (extent_map_.find(id) != extent_map_.end()) { return extent_map_.at(id); } else { @@ -947,25 +744,21 @@ class UpdateLeafIndices : public IterVisitor { private: const TensorDomain* td_; - std::unordered_map index_map_; - std::unordered_map extent_map_; + std::unordered_map index_map_; + std::unordered_map extent_map_; }; // Returns halo-extended extent if id has halo. Otherwise, just // returns id->extent. 
-kir::Val* getHaloExtentOfRootAxis( - IterDomain* id, - kir::Val* normal_extent = nullptr) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - +Val* getHaloExtentOfRootAxis(IterDomain* id, Val* normal_extent = nullptr) { if (normal_extent == nullptr) { - normal_extent = gpu_lower->lowerValue(id->extent()); + normal_extent = id->extent(); } - const auto& halo = gpu_lower->haloInfo().getRootAxisInfo(id); + const auto& halo = GpuLower::current()->haloInfo().getRootAxisInfo(id); if (halo.hasHalo()) { - auto halo_extent = ir_builder.addExpr(normal_extent, halo.width()); + auto halo_extent = SimplifyingIrBuilder::addExpr( + normal_extent, SimplifyingIrBuilder::create(halo.width())); return halo_extent; } else { return normal_extent; @@ -976,17 +769,16 @@ kir::Val* getHaloExtentOfRootAxis( IndexSwizzle::IndexSwizzle( const TensorView* tv, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in) + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in) : IndexCompute( tv->domain(), std::move(initial_index_map), std::move(extent_map), std::move(zero_domains), - std::move(zero_merged_in), - std::vector(tv->getRootDomain().size(), false)), + std::move(zero_merged_in)), tv_(tv), swizzle_type_(tv->swizzleType()), ids_to_swizzle_(tv->axesToSwizzle()) {} @@ -996,8 +788,6 @@ void IndexSwizzle::run() { swizzle_type_ == SwizzleType::NoSwizzle || swizzle_type_ == SwizzleType::Transpose, "Invalid swizzle type"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); if (swizzle_type_ == SwizzleType::Transpose) { // Shifts the second axis by the first axis as ((idx_1 + idx_2) % // ext). Alternatively, ((idx_1 - idx_2) & (ext - 1)) would also @@ -1013,20 +803,16 @@ void IndexSwizzle::run() { IterDomain* id_to_swizzle_i = ids_to_swizzle_.at(0); IterDomain* id_to_swizzle_j = ids_to_swizzle_.at(1); - kir::IterDomain* id_to_swizzle_i_kir = - gpu_lower->lowerValue(id_to_swizzle_i)->as(); - kir::IterDomain* id_to_swizzle_j_kir = - gpu_lower->lowerValue(id_to_swizzle_j)->as(); - - if (indexMap().find(id_to_swizzle_i_kir) != indexMap().end() && - indexMap().find(id_to_swizzle_j_kir) != indexMap().end()) { - auto idx_to_swizzle_i = indexMap().at(id_to_swizzle_i_kir); - auto idx_to_swizzle_j = indexMap().at(id_to_swizzle_j_kir); - - auto swizzled_idx = ir_builder.modExpr( - ir_builder.addExpr(idx_to_swizzle_i, idx_to_swizzle_j), - id_to_swizzle_j_kir->extent()); - index_map_[id_to_swizzle_j_kir] = swizzled_idx; + + if (indexMap().find(id_to_swizzle_i) != indexMap().end() && + indexMap().find(id_to_swizzle_j) != indexMap().end()) { + auto idx_to_swizzle_i = indexMap().at(id_to_swizzle_i); + auto idx_to_swizzle_j = indexMap().at(id_to_swizzle_j); + + auto swizzled_idx = SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr(idx_to_swizzle_i, idx_to_swizzle_j), + id_to_swizzle_j->extent()); + index_map_[id_to_swizzle_j] = swizzled_idx; swizzled_ids_.insert(id_to_swizzle_j); IndexCompute::run(); } @@ -1055,18 +841,14 @@ namespace { // to loop indices as well as a set of loops that do not contribute to // indexing. 
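IndexSwizzle::run above rewrites the second swizzled index as (idx_i + idx_j) % extent_j. A standalone sketch (tile extents hypothetical) showing that, for each fixed idx_i, the rewrite is a permutation of the columns, so no two elements of a row collide:

#include <cassert>
#include <vector>

int main() {
  const int extent_i = 8, extent_j = 8;
  for (int i = 0; i < extent_i; ++i) {
    std::vector<bool> seen(extent_j, false);
    for (int j = 0; j < extent_j; ++j) {
      const int swizzled_j = (i + j) % extent_j;  // the rewrite applied above
      assert(!seen[swizzled_j]);                  // each swizzled column is hit exactly once
      seen[swizzled_j] = true;
    }
  }
  return 0;
}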
std::pair< - std::unordered_map, + std::unordered_map, std::unordered_set> indexMapFromTV( const TensorView* tv, const std::vector& loops, - const std::pair& alloc_point, - bool as_consumer) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto alloc_loop = alloc_point.first; - + kir::ForLoop* alloc_loop, + bool as_consumer, + kir::ForLoop* double_buffer_loop = nullptr) { bool within_alloc = false; if (alloc_loop == nullptr) { within_alloc = true; @@ -1076,7 +858,14 @@ indexMapFromTV( const bool is_shared = tv->getMemoryType() == MemoryType::Shared; const bool is_local = tv->getMemoryType() == MemoryType::Local; - std::unordered_map loop_to_ind_map; + std::unordered_map loop_to_ind_map; + + // Check if the current op has an implicit loop implemented + // within an mma instruction. + bool within_mma_loops = + std::any_of(loops.begin(), loops.end(), [](kir::ForLoop* fl) { + return fl->iter_domain()->isMma(); + }); // When indexed as a producer, the parallel types of the the // producer domains may not be the same as those of the loops, but @@ -1085,17 +874,18 @@ indexMapFromTV( // with zero isn't valid. That's only valid when there's a matching // IterDomain in the producer tensor that has the same parallel // type. - auto find_matching_parallel_domain = [tv](kir::IterDomain* id) -> bool { + auto find_matching_parallel_domain = [tv](IterDomain* id) -> bool { const auto gpu_lower = GpuLower::current(); auto it = std::find_if( tv->domain()->domain().begin(), tv->domain()->domain().end(), [&](IterDomain* tv_id) { - auto kir_tv_id = gpu_lower->lowerValue(tv_id)->as(); // Matching is done using the index and loop maps. See // validateParallelize as well. - return gpu_lower->caIndexMap().areMapped(id, kir_tv_id) || - (gpu_lower->caLoopMap().areMapped(id, kir_tv_id) && + return gpu_lower->caMap()->areMapped( + id, tv_id, IdMappingMode::EXACT) || + (GpuLower::current()->caMap()->areMapped( + id, tv_id, IdMappingMode::PERMISSIVE) && ir_utils::derivedFromRootCAAxes(tv, tv_id)); }); if (it == tv->domain()->domain().end()) { @@ -1103,7 +893,7 @@ indexMapFromTV( } auto corresponding_domain = *it; - return corresponding_domain->getParallelType() == id->parallelType(); + return corresponding_domain->getParallelType() == id->getParallelType(); }; // Track domains that do not contibute to the resulting @@ -1113,9 +903,16 @@ indexMapFromTV( std::unordered_set zero_loops; for (auto loop : loops) { - kir::Val* idx = nullptr; - const auto same_parallel_type = - as_consumer || find_matching_parallel_domain(loop->iter_domain()); + Val* idx = nullptr; + const auto same_parallel_type = as_consumer || + find_matching_parallel_domain(loop->iter_domain()) || + // Note && TODO: + // mma swizzled lane_id does not map naturally from producer + // to consumer but they should still be detected as same + // parallel type. In a follow up may want to extent + // find_matching_parallel_domain to cover this case. + (within_mma_loops && + loop->iter_domain()->getParallelType() == ParallelType::TIDx); // See also LoopNestGenerator::pushAlloc. 
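indexMapFromTV above decides, per for-loop, whether the loop index contributes to a tensor's offset or is replaced with zero. Ignoring the allocation-point, MMA and matching-parallel-type refinements handled in the hunk, the core policy can be sketched as follows (helper name and signature are hypothetical, not part of the diff):

enum class MemoryType { Global, Shared, Local };

// Whether a parallelized loop's index should appear in the tensor's offset.
bool loopIndexContributes(MemoryType mt, bool is_block_dim, bool is_thread_dim) {
  switch (mt) {
    case MemoryType::Global:
      return true;                             // global offsets use all loop indices
    case MemoryType::Shared:
      return !is_block_dim;                    // each block owns its own shared buffer
    case MemoryType::Local:
      return !is_block_dim && !is_thread_dim;  // registers are private per thread
  }
  return true;
}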
// NOLINTNEXTLINE(bugprone-branch-clone) if (!within_alloc) { @@ -1123,7 +920,7 @@ indexMapFromTV( (loop->iter_domain()->isThread() && is_global)) { idx = loop->index(); } else { - idx = ir_builder.zeroVal(); + idx = GpuLower::current()->kernel()->zeroVal(); zero_loops.insert(loop); } } else if ( @@ -1143,16 +940,24 @@ indexMapFromTV( // Similarly for local memory tensors, zero replacement can be // only done when there's a matching domain with the same // parallel type - (loop->iter_domain()->isThread() && is_local && same_parallel_type) || - loop->vectorize()) { - idx = ir_builder.zeroVal(); - if (!loop->vectorize()) { - zero_loops.insert(loop); - } + (loop->iter_domain()->isThread() && is_local && same_parallel_type)) { + idx = GpuLower::current()->kernel()->zeroVal(); + zero_loops.insert(loop); } else { idx = loop->index(); } + // If the loop is trivial, the loop index can only be the loop + // start value. + if (idx == loop->index() && loop->isTrivial()) { + idx = loop->start(); + } + + if (loop == double_buffer_loop) { + idx = SimplifyingIrBuilder::addExpr( + idx, GpuLower::current()->kernel()->oneVal()); + } + loop_to_ind_map[loop] = idx; if (!within_alloc && loop == alloc_loop) { @@ -1184,8 +989,6 @@ void ensureStaticIndexing( within_alloc = true; } - const auto gpu_lower = GpuLower::current(); - for (auto loop : loops) { if (!within_alloc) { if (loop == alloc_loop) { @@ -1193,7 +996,7 @@ void ensureStaticIndexing( } continue; } - kir::IterDomain* loop_id = loop->iter_domain(); + IterDomain* loop_id = loop->iter_domain(); if (loop->vectorize() || loop_id->isThread()) { continue; } @@ -1203,7 +1006,7 @@ void ensureStaticIndexing( auto it = std::find_if( tv->domain()->domain().begin(), tv->domain()->domain().end(), - [loop_id, gpu_lower, &id_map](IterDomain* id) { + [loop_id, &id_map](IterDomain* id) { if (id->isBroadcast() || id->isReduction() || id->isStride()) { return false; } @@ -1211,8 +1014,8 @@ void ensureStaticIndexing( if (id_replacement != id_map.end()) { id = id_replacement->second; } - auto kir_id = gpu_lower->lowerValue(id)->as(); - return gpu_lower->caLoopMap().areMapped(loop_id, kir_id); + return GpuLower::current()->caMap()->areMapped( + loop_id, id, IdMappingMode::PERMISSIVE); }); if (it != tv->domain()->domain().end()) { loop->requireUnroll(); @@ -1229,7 +1032,7 @@ void ensureStaticIndexing( // operation. 
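When the loop is the double-buffer loop, the hunk above indexes the producer with (loop index + 1): the next stage is loaded while the current one is consumed. A plain C++ sketch of that pipeline shape, with hypothetical buffers and a trivial compute step; it assumes in is non-empty and out.size() <= in.size():

#include <vector>

void pipelined_copy(const std::vector<float>& in, std::vector<float>& out) {
  float staged[2];                        // two copies of the staged element
  staged[0] = in[0];                      // prologue: load stage 0
  for (size_t i = 0; i < out.size(); ++i) {
    if (i + 1 < in.size()) {
      staged[(i + 1) % 2] = in[i + 1];    // producer read indexed with (loop index + 1)
    }
    out[i] = staged[i % 2] * 2.0f;        // consumer uses the current stage
  }
}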
std::unordered_map indexMapReferenceTo( const TensorView* tv, - const ComputeAtMap& ca_map, + const std::unique_ptr& ca_map, const std::unordered_map& reference_concrete_to_id_map, bool root_only = false) { @@ -1237,7 +1040,8 @@ std::unordered_map indexMapReferenceTo( auto gen_map = [&](const auto& pids) { for (auto p_id : pids) { - auto concrete_id = ca_map.getConcreteMappedID(p_id); + auto concrete_id = + ca_map->getConcreteMappedID(p_id, IdMappingMode::EXACT); auto ref_id_it = reference_concrete_to_id_map.find(concrete_id); if (ref_id_it != reference_concrete_to_id_map.end()) { index_map_ref_to_producer[ref_id_it->second] = p_id; @@ -1258,18 +1062,153 @@ std::unordered_map indexMapReferenceTo( return index_map_ref_to_producer; } +Val* hoistConsumerIndex( + IterDomain* consumer_root_id, + const TensorView* consumer_tv, + const IndexCompute& consumer_indexing, + TensorDomain* ref_td, + const IndexCompute& ref_indexing, + const std::vector& loops, + Val* index) { + // If index has no defining expression, there's nothing to hoist + if (isDisabled(DisableOption::IndexHoist) || index->definition() == nullptr) { + return index; + } + + // The old swizzle interface, which should be deprecated, is not + // supported. + if (consumer_tv->swizzleType() != SwizzleType::NoSwizzle) { + return index; + } + + // auto indexed_consumer_id = consumer_root_id; + // Find the true indexed domain, which can be a merged contiguous domain. + auto contig_id_it = consumer_indexing.rootToContigID().find(consumer_root_id); + TORCH_INTERNAL_ASSERT( + contig_id_it != consumer_indexing.rootToContigID().end(), + "Consumer indexed ID not found: ", + consumer_root_id->toString()); + auto indexed_consumer_id = contig_id_it->second; + // Make sure this contig ID is indeed indexed + TORCH_INTERNAL_ASSERT( + consumer_indexing.indexMap().find(contig_id_it->second) != + consumer_indexing.indexMap().end(), + "Invalid contig index: ", + contig_id_it->second->toString()); + + // Insert the index into the common index map. A previously inserted + // val can be returned. + auto common_index = GpuLower::current() + ->commonIndexMap() + .insert( + indexed_consumer_id, + consumer_tv->domain(), + ref_td, + ref_indexing.indexMap(), + loops, + index) + .first; + + return common_index; +} + +std::unordered_map invertOneToOneMap( + const std::unordered_map& map) { + std::unordered_map inverted; + for (const auto& kv : map) { + bool inserted = inverted.emplace(kv.second, kv.first).second; + TORCH_INTERNAL_ASSERT( + inserted, + "Multiple mappings to the same value detected: ", + kv.second->toString()); + } + return inverted; +} + +Val* hoistProducerIndex( + IterDomain* producer_root_id, + const TensorView* producer_tv, + const IndexCompute& producer_indexing, + const TensorView* consumer_tv, + const std::unordered_map& p2c_map, + TensorDomain* ref_td, + const IndexCompute& ref_indexing, + const std::vector& loops, + Val* index) { + // If index has no defining expression, there's nothing to hoist + if (isDisabled(DisableOption::IndexHoist) || index->definition() == nullptr) { + return index; + } + + // The old swizzle interface, which should be deprecated, is not + // supported. 
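hoistConsumerIndex above funnels the per-root index through GpuLower's common index map so that an identical index expression built earlier can be reused instead of re-emitted. A much-simplified standalone sketch of that memoization idea, with strings standing in for IR values (struct and member names are hypothetical):

#include <string>
#include <unordered_map>
#include <utility>

struct CommonIndexCache {
  // Key identifying the indexed domain and loop structure -> cached expression.
  std::unordered_map<std::string, std::string> cache;

  // Returns the expression to use and whether a previously inserted one was reused.
  std::pair<std::string, bool> insert(const std::string& key, std::string index_expr) {
    auto it = cache.find(key);
    if (it != cache.end()) {
      return {it->second, true};   // hoisted: reuse the earlier expression
    }
    cache.emplace(key, index_expr);
    return {std::move(index_expr), false};
  }
};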
+ if (producer_tv->swizzleType() != SwizzleType::NoSwizzle) { + return index; + } + + // auto indexed_producer_id = producer_root_id; + auto contig_id_it = producer_indexing.rootToContigID().find(producer_root_id); + TORCH_INTERNAL_ASSERT( + contig_id_it != producer_indexing.rootToContigID().end(), + "Producer indexed ID not found: ", + producer_root_id->toString()); + auto indexed_producer_id = contig_id_it->second; + // Make sure this contig ID is indeed indexed + TORCH_INTERNAL_ASSERT( + producer_indexing.indexMap().find(indexed_producer_id) != + producer_indexing.indexMap().end(), + "Invalid contig id: ", + indexed_producer_id->toString()); + + // Use the corresponding consumer domain to find matching + // for-loops. Note that there's no CA mapping with the producer + // domains as the producer TensorDomain is a temporary replay + // domain. + + auto indexed_consumer_id_it = p2c_map.find(indexed_producer_id); + + // There can be no corresponding consumer ID. For example, consider: + // consumer: [b1, i2, i3] + // producer: [i2, i3]. + // Suppose the consumer is transformed as: + // consumer: [(b1*i2)*i3] + // Then the producer would be transformed when indexed: + // producer: [i2*i3] + // Assuming i2 and i3 are contiguous, the producer indexing is done + // with the mreged i2*i3 domain, but there's no domain in the + // cosumer that maps with the producer indexed domain. + // It seems non-trivial to support patterns like this. Skip for now. + if (indexed_consumer_id_it == p2c_map.end()) { + return index; + } + + IterDomain* indexed_consumer_id = indexed_consumer_id_it->second; + + auto common_index = GpuLower::current() + ->commonIndexMap() + .insert( + indexed_consumer_id, + consumer_tv->domain(), + ref_td, + ref_indexing.indexMap(), + loops, + index) + .first; + + return common_index; +} + } // namespace -std::vector Index::getGlobalProducerStridedIndices( +std::vector Index::getGlobalProducerStridedIndices( TensorView* producer_tv, const TensorView* consumer_tv, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalProducerIndex"); const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; @@ -1286,19 +1225,24 @@ std::vector Index::getGlobalProducerStridedIndices( // Map everything we can from reference to producer using compute at index // map. Use consumer as a proxy between producer and the generated reference. std::unordered_map index_map_ref_to_producer; - { - // This replay has to be consistent with compute at index map. - BestEffortReplay replay_producer_as_consumer( - producer_tv->domain()->domain(), - consumer_tv->domain()->domain(), - pairwise_map.mapConsumerToProducer( - consumer_tv->domain(), producer_tv->domain())); - const auto& c2p_map = replay_producer_as_consumer.getReplay(); + // Map sent to best effort replay needs to match the exact incantation for + // compute_at_mode.cpp with MappingMode::Index + auto c2p_root_map = + PairwiseRootDomainMap(producer_tv, consumer_tv, true) + .mapConsumerToProducer(consumer_tv->domain(), producer_tv->domain()); + // This replay has to be consistent with compute at index map. 
+ BestEffortReplay replay_producer_as_consumer( + producer_tv->domain()->domain(), + consumer_tv->domain()->domain(), + c2p_root_map); + + const auto& c2p_map = replay_producer_as_consumer.getReplay(); + const auto p2c_map = invertOneToOneMap(c2p_map); + { std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); for (auto entry : index_map_ref_to_consumer) { auto r_id = entry.first; @@ -1311,9 +1255,12 @@ std::vector Index::getGlobalProducerStridedIndices( } } + kir::ForLoop* db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); + // Index into the reference tensor. Reference indexing will handle vectorized // dims where index should be set to 0 - auto ref_compute = getReferenceIndexing(loops, reference_domain); + auto ref_compute = getReferenceIndexing(loops, reference_domain, db_loop); // Forward vectorized IDs to index into producer correctly // We want p_id to be vectorized like consumer just for the indexing, then we @@ -1338,11 +1285,18 @@ std::vector Index::getGlobalProducerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_producer); + ContigIDs contig_finder( + producer_tv->domain()->domain(), + producer_tv->getMaybeRFactorDomain(), + producer_tv->domain()->contiguity(), + reference_id_map, + p2c_map); + // Index into producer using reference indexing auto producer_indexing = ref_compute.updateIndexCompute( producer_tv->domain(), index_map_ref_to_producer, - producer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); // Revert p_ids @@ -1355,25 +1309,25 @@ std::vector Index::getGlobalProducerStridedIndices( auto root_dom = producer_tv->getMaybeRFactorDomain(); // TODO: Abstract stride logic to reuse with consumer indexing - auto zero = ir_builder.create(0); - std::vector strides(root_dom.size(), nullptr); + std::vector strides(root_dom.size(), nullptr); { int stride_i = 0; for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->getIterType() == IterType::BroadcastWithoutStride) { - strides[i] = zero; + strides[i] = GpuLower::current()->kernel()->oneVal(); continue; } std::stringstream ss; ss << "T" << producer_tv->name() << ".stride[" << stride_i++ << "]"; - strides[i] = ir_builder.create(ss.str(), DataType::Int); + strides[i] = + SimplifyingIrBuilder::create(ss.str(), DataType::Int); } } TORCH_INTERNAL_ASSERT( root_dom.size() == producer_tv->domain()->contiguity().size()); - kir::Val* cur_contig_stride = ir_builder.create(1); + Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal(); for (const auto i : c10::irange(root_dom.size())) { auto dim = root_dom.size() - i - 1; if (root_dom[dim]->isReduction()) { @@ -1383,24 +1337,26 @@ std::vector Index::getGlobalProducerStridedIndices( continue; } - kir::Val* root_ind = nullptr; - auto kir_root_dom = - gpu_lower->lowerValue(root_dom[dim])->as(); - if (producer_indexing.indexMap().find(kir_root_dom) != + Val* root_ind = nullptr; + if (producer_indexing.indexMap().find(root_dom[dim]) != producer_indexing.indexMap().end()) { - root_ind = producer_indexing.indexMap().at(kir_root_dom); + root_ind = producer_indexing.indexMap().at(root_dom[dim]); } else if (root_dom[dim]->getIterType() == IterType::BroadcastWithStride) { - root_ind = zero; + root_ind = GpuLower::current()->kernel()->zeroVal(); } TORCH_INTERNAL_ASSERT( root_ind != 
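The p2c map above is produced by inverting the replay's c2p map. A templated standalone version of that inversion, mirroring the invertOneToOneMap helper added earlier in this diff (assert stands in for TORCH_INTERNAL_ASSERT):

#include <cassert>
#include <unordered_map>

template <typename K, typename V>
std::unordered_map<V, K> invertOneToOneMap(const std::unordered_map<K, V>& map) {
  std::unordered_map<V, K> inverted;
  for (const auto& kv : map) {
    const bool inserted = inverted.emplace(kv.second, kv.first).second;
    assert(inserted && "multiple mappings to the same value detected");
    (void)inserted;  // silence unused-variable warnings when asserts are compiled out
  }
  return inverted;
}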
nullptr, - "Couldn't find root mapping for TV", - producer_tv->name(), + "Couldn't find root mapping for ", + producer_tv->toString(), " dim: ", - i, + dim, " id: ", - root_dom[dim]); + root_dom[dim]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); if (producer_tv->domain()->contiguity()[dim]) { // If contig, used the stored stride which may be the previous @@ -1410,12 +1366,13 @@ std::vector Index::getGlobalProducerStridedIndices( // by extent of this dimension auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); cur_contig_stride = - ir_builder.mulExpr(cur_contig_stride, root_dim_extent); + SimplifyingIrBuilder::mulExpr(cur_contig_stride, root_dim_extent); } else { // If non contiguous dimension, keep local stride information, set cur // stride to local stride * local raw extent auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); - cur_contig_stride = ir_builder.mulExpr(strides[dim], root_dim_extent); + cur_contig_stride = + SimplifyingIrBuilder::mulExpr(strides[dim], root_dim_extent); } } @@ -1423,7 +1380,8 @@ std::vector Index::getGlobalProducerStridedIndices( loops.empty() ? nullptr : loops.back()->vectorize_shift(); // Global striding - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { // If the domain is derived from a trivial reduction, no indexing // to create. @@ -1434,20 +1392,33 @@ std::vector Index::getGlobalProducerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - producer_indexing.indexMap().find(kir_root_dom_i) != + producer_indexing.indexMap().find(root_dom[i]) != producer_indexing.indexMap().end(), "Couldn't find root mapping for TV", producer_tv->name(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); + + auto root_ind = producer_indexing.indexMap().at(root_dom[i]); - auto root_ind = producer_indexing.indexMap().at(kir_root_dom_i); + // index hoist must be done before the adjustments for halo + root_ind = hoistProducerIndex( + root_dom[i], + producer_tv, + producer_indexing, + consumer_tv, + p2c_map, + reference.domain, + ref_compute, + loops, + root_ind); root_ind = getProducerIndexWithHalo(producer_tv, i, root_ind, consumer_tv); @@ -1465,28 +1436,71 @@ std::vector Index::getGlobalProducerStridedIndices( if (root_ind->isZeroInt()) { continue; } else { - auto strided_ind = ir_builder.mulExpr(root_ind, strides[i]); + auto strided_ind = SimplifyingIrBuilder::mulExpr(root_ind, strides[i]); if (i == root_dom.size() - 1 && vectorize_shift != nullptr) { - strided_inds[i] = ir_builder.addExpr(strided_ind, vectorize_shift); + strided_inds[i] = + SimplifyingIrBuilder::addExpr(strided_ind, vectorize_shift); } else { strided_inds[i] = strided_ind; } } } + // Save indexing info necessary for validating vectorization at launch time + fillProducerVectorizedContigRootDomains( + producer_tv, consumer_tv, c2p_map, contig_finder); + return strided_inds; } +namespace { + +// Maps all producer domains to consumer with broadcast +// forwarding. Used to find the allocation position. 
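The stride loop above walks root domains from the innermost outwards, carrying cur_contig_stride so that runs of contiguous dimensions collapse onto one accumulated stride while non-contiguous dimensions fall back to the runtime T.stride[...] value. A plain-integer sketch of the same walk (extents, contiguity flags and runtime strides are hypothetical inputs):

#include <vector>

std::vector<long> resolveStrides(const std::vector<long>& extents,
                                 const std::vector<bool>& contiguity,
                                 const std::vector<long>& runtime_strides) {
  const size_t n = extents.size();
  std::vector<long> strides(n, 1);
  long cur_contig_stride = 1;
  for (size_t i = 0; i < n; ++i) {
    const size_t dim = n - 1 - i;  // innermost to outermost
    if (contiguity[dim]) {
      strides[dim] = cur_contig_stride;         // contiguous: use the accumulated product
      cur_contig_stride *= extents[dim];
    } else {
      strides[dim] = runtime_strides[dim];      // keep the runtime stride
      cur_contig_stride = runtime_strides[dim] * extents[dim];
    }
  }
  return strides;
}

For a fully contiguous tensor this reduces to the familiar row-major strides.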
+std::unordered_map mapAllProducerDomainsToConsumer( + TensorView* producer_tv, + const TensorView* consumer_tv) { + // This map has forwarded broadcast axes, it should only be used to compute + // the allocation position of the producer, and to figure out which producer + // indices are mapped to consumer trivial reductions. + std::unordered_map p2c_alloc_map; + + // We want to replay producer as consumer instead of the other way around + // since consumer may have some broadcasted axes producer doesn't have + // merged into loops producer may use. If we did consumer as producer we + // wouldn't have this information in the mapping. + auto replay_PasC = BestEffortReplay::replayPasC( + producer_tv, + consumer_tv, + -1, + PairwiseRootDomainMap(producer_tv, consumer_tv)); + + // Grab consumer domain entries and reverse replay map. TODO: Maybe + // TransformReplay::replayPasC could return this map + for (auto id : consumer_tv->domain()->domain()) { + const auto& c2p_map = replay_PasC.getReplay(); + auto c2p_it = c2p_map.find(id); + if (c2p_it != c2p_map.end()) { + auto c_id = c2p_it->first; + auto p_id = c2p_it->second; + p2c_alloc_map[p_id] = c_id; + } + } + + return p2c_alloc_map; +} + +} // namespace + // Producer index for either shared or local memory -std::vector Index::getNonGlobalProducerStridedIndices( +std::vector Index::getNonGlobalProducerStridedIndices( TensorView* producer_tv, const TensorView* consumer_tv, const std::vector& loops) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; @@ -1500,57 +1514,38 @@ std::vector Index::getNonGlobalProducerStridedIndices( ir_utils::TVDomainGuard domain_guard( producer_tv, producer_replayed_as_consumer); - // This map has forwarded broadcast axes, it should only be used to compute - // the allocation position of the producer, and to figure out which producer - // indices are mapped to consumer trivial reductions. - std::unordered_map p2c_alloc_map; - { - // We want to play producer as consumer instead of the other way around - // since consumer may have some broadcasted axes producer doesn't have - // merged into loops producer may use. If we did consumer as producer we - // wouldn't have this information in the mapping. - auto replay_PasC = BestEffortReplay::replayPasC( - producer_tv, consumer_tv, -1, pairwise_map); - - auto c2p_map = replay_PasC.getReplay(); - - // Grab consumer domain entries and reverse replay map. TODO: Maybe - // TransformReplay::replayPasC could return this map - for (auto id : consumer_tv->domain()->domain()) { - auto c2p_it = c2p_map.find(id); - if (c2p_it != c2p_map.end()) { - auto c_id = c2p_it->first; - auto p_id = c2p_it->second; - p2c_alloc_map[p_id] = c_id; - } - } - } + const auto p2c_alloc_map = + mapAllProducerDomainsToConsumer(producer_tv, consumer_tv); + + kir::ForLoop* consumer_db_loop = + gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); // Find allocation point of producer relative to loop nests. P2C map is // required because producer was replayed as consumer, so we can't use the // regular compute at maps to line up its iter domains with the for loops. 
- auto alloc_point = - loop_utils::getAllocPoint(producer_tv, loops, p2c_alloc_map, true); - std::unordered_map loop_to_ind_map; + auto alloc_info = + loop_utils::getAllocInformation(producer_tv, loops, p2c_alloc_map, true); + std::unordered_map loop_to_ind_map; std::unordered_set zero_loops; - std::tie(loop_to_ind_map, zero_loops) = - indexMapFromTV(producer_tv, loops, alloc_point, false); + std::tie(loop_to_ind_map, zero_loops) = indexMapFromTV( + producer_tv, loops, alloc_info.init_for_loop, false, consumer_db_loop); - ensureStaticIndexing(producer_tv, alloc_point.first, loops, p2c_alloc_map); + ensureStaticIndexing( + producer_tv, alloc_info.init_for_loop, loops, p2c_alloc_map); // Map loop nests to indicies, zeroing out those not used due to locality of // memory - std::unordered_map ref_id_to_ind_map; + std::unordered_map ref_id_to_ind_map; // Track which domains are not used - std::unordered_set ref_zero_domains; + std::unordered_set ref_zero_domains; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure, ignore IterDomains that aren't present in the loop nest when // indexing reference. TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); for (const auto loop_i : c10::irange(loops.size())) { - auto ref_axis = gpu_lower->lowerValue(reference_domain->axis(loop_i)) - ->as(); + auto ref_axis = reference_domain->axis(loop_i); ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loops[loop_i]]; if (zero_loops.count(loops[loop_i]) > 0) { ref_zero_domains.insert(ref_axis); @@ -1563,25 +1558,32 @@ std::vector Index::getNonGlobalProducerStridedIndices( // more conservative approach, which is to use the consumer as a proxy between // producer to reference. std::unordered_map index_map_ref_to_producer; + std::unordered_map c2p_index_map; + std::unordered_map p2c_index_map; { + // Map sent to best effort replay needs to match the exact incantation for + // compute_at_mode.cpp with MappingMode::Index + auto c2p_root_map = PairwiseRootDomainMap(producer_tv, consumer_tv, true) + .mapConsumerToProducer( + consumer_tv->domain(), producer_tv->domain()); + // This replay has to be consistent with compute at index map. 
BestEffortReplay replay_producer_as_consumer( producer_tv->domain()->domain(), consumer_tv->domain()->domain(), - pairwise_map.mapConsumerToProducer( - consumer_tv->domain(), producer_tv->domain())); + c2p_root_map); - const auto& c2p_map = replay_producer_as_consumer.getReplay(); + c2p_index_map = replay_producer_as_consumer.getReplay(); + p2c_index_map = invertOneToOneMap(c2p_index_map); std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); for (auto entry : index_map_ref_to_consumer) { auto r_id = entry.first; auto c_id = entry.second; - auto c2p_it = c2p_map.find(c_id); - if (c2p_it != c2p_map.end()) { + auto c2p_it = c2p_index_map.find(c_id); + if (c2p_it != c2p_index_map.end()) { auto p_id = c2p_it->second; index_map_ref_to_producer[r_id] = p_id; } @@ -1637,10 +1639,17 @@ std::vector Index::getNonGlobalProducerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_producer); + ContigIDs contig_finder( + producer_tv->domain()->domain(), + producer_tv->getMaybeRFactorDomain(), + producer_tv->domain()->contiguity(), + reference_id_map, + p2c_index_map); + auto producer_indexing = ref_compute.updateIndexCompute( producer_tv->domain(), index_map_ref_to_producer, - producer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); // Revert p_ids @@ -1677,8 +1686,7 @@ std::vector Index::getNonGlobalProducerStridedIndices( } // Already an entry for this root domain, continue - if (index_map.find(gpu_lower->lowerValue(root_id)->as()) != - index_map.end()) { + if (index_map.find(root_id) != index_map.end()) { continue; } @@ -1690,25 +1698,39 @@ std::vector Index::getNonGlobalProducerStridedIndices( } } - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { if (skip_indexing.count(root_dom[i])) { continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_i) != index_map.end(), - "Couldn't find root mapping for TV", - producer_tv->name(), + index_map.find(root_dom[i]) != index_map.end(), + "Couldn't find root mapping for ", + producer_tv->toString(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); + + auto root_ind_i = index_map.at(root_dom[i]); - auto root_ind_i = index_map.at(kir_root_dom_i); + // index hoist must be done before the adjustments for halo + root_ind_i = hoistProducerIndex( + root_dom[i], + producer_tv, + producer_indexing, + consumer_tv, + p2c_index_map, + reference.domain, + ref_compute, + loops, + root_ind_i); root_ind_i = getProducerIndexWithHalo(producer_tv, i, root_ind_i, consumer_tv); @@ -1729,66 +1751,85 @@ std::vector Index::getNonGlobalProducerStridedIndices( } // Compute striding for this index. 
- kir::Val* stride = nullptr; + Val* stride = nullptr; for (const auto j : c10::irange(i + 1, root_dom.size())) { if (skip_indexing.count(root_dom[j])) { continue; } - auto kir_root_dom_j = - gpu_lower->lowerValue(root_dom[j])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_j) != index_map.end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + index_map.find(root_dom[j]) != index_map.end(), + "Couldn't find root mapping for ", + producer_tv->name(), " dim: ", - i, + j, " id: ", - root_dom[i]); + root_dom[j]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - auto root_ext_j = extent_map.find(kir_root_dom_j) == extent_map.end() - ? kir_root_dom_j->extent() - : extent_map.at(kir_root_dom_j); + auto root_ext_j = extent_map.find(root_dom[j]) == extent_map.end() + ? root_dom[j]->extent() + : extent_map.at(root_dom[j]); root_ext_j = getHaloExtentOfRootAxis(root_dom[j], root_ext_j); - if (zero_domain_map.count(kir_root_dom_j) == 0) { + if (zero_domain_map.count(root_dom[j]) == 0) { if (stride == nullptr) { stride = root_ext_j; } else { - stride = ir_builder.mulExpr(stride, root_ext_j); + stride = SimplifyingIrBuilder::mulExpr(stride, root_ext_j); } } } if (stride != nullptr) { - strided_inds[i] = ir_builder.mulExpr(root_ind_i, stride); + strided_inds[i] = SimplifyingIrBuilder::mulExpr(root_ind_i, stride); } else { strided_inds[i] = root_ind_i; } } + if (producer_tv->isDoubleBuffered()) { + auto db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + producer_tv, loops, true); + if (db_loop != nullptr) { + auto loop_index = + db_loop->isTrivial() ? db_loop->start() : db_loop->index(); + auto db_switch_index = SimplifyingIrBuilder::modExpr( + loop_index, SimplifyingIrBuilder::create(2)); + auto original_alloc_size = + gpu_lower->doubleBufferInfo().getOriginalAllocSize(producer_tv); + auto db_strided_index = + SimplifyingIrBuilder::mulExpr(db_switch_index, original_alloc_size); + strided_inds.push_back(db_strided_index); + } + } + + // Save indexing info necessary for validating vectorization at launch time + fillProducerVectorizedContigRootDomains( + producer_tv, consumer_tv, c2p_index_map, contig_finder); + return strided_inds; } -std::vector Index::getGlobalConsumerStridedIndices( +std::vector Index::getGlobalConsumerStridedIndices( const TensorView* consumer_tv, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalConsumerIndex"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; // Map everything we can from reference to consumer using compute at index // map. std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); // Index into the reference tensor. 
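For shared and local tensors, the stride of root index i above is the product of the (halo-adjusted) extents of the inner root domains that still contribute; zeroed, broadcast and trivially reduced domains are skipped. A plain-integer sketch of the resulting offset computation (all inputs hypothetical):

#include <vector>

long stridedOffset(const std::vector<long>& indices,
                   const std::vector<long>& extents,
                   const std::vector<bool>& contributes) {
  long offset = 0;
  for (size_t i = 0; i < indices.size(); ++i) {
    if (!contributes[i]) {
      continue;  // e.g. broadcast, trivial reduction, or zero-merged domain
    }
    long stride = 1;
    for (size_t j = i + 1; j < indices.size(); ++j) {
      if (contributes[j]) {
        stride *= extents[j];  // only inner contributing dims grow the stride
      }
    }
    offset += indices[i] * stride;
  }
  return offset;
}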
Reference indexing will handle vectorized // dims where index should be set to 0 @@ -1802,10 +1843,16 @@ std::vector Index::getGlobalConsumerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_consumer); + ContigIDs contig_finder( + consumer_tv->domain()->domain(), + consumer_tv->getMaybeRFactorDomain(), + consumer_tv->domain()->contiguity(), + reference_id_map); + auto consumer_indexing = ref_compute.updateIndexCompute( consumer_tv->domain(), index_map_ref_to_consumer, - consumer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); // Indices should now be mapped onto IterDomains in consumer, so just grab @@ -1813,26 +1860,27 @@ std::vector Index::getGlobalConsumerStridedIndices( auto root_dom = consumer_tv->getMaybeRFactorDomain(); // TODO: Abstract stride logic to reuse with producer indexing - auto zero = ir_builder.zeroVal(); - std::vector strides(root_dom.size(), zero); + std::vector strides( + root_dom.size(), GpuLower::current()->kernel()->oneVal()); { int stride_i = 0; for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->getIterType() == IterType::BroadcastWithoutStride || root_dom[i]->isStride()) { - strides[i] = zero; + strides[i] = GpuLower::current()->kernel()->oneVal(); continue; } std::stringstream ss; ss << "T" << consumer_tv->name() << ".stride[" << stride_i++ << "]"; - strides[i] = ir_builder.create(ss.str(), DataType::Int); + strides[i] = + SimplifyingIrBuilder::create(ss.str(), DataType::Int); } } TORCH_INTERNAL_ASSERT( root_dom.size() == consumer_tv->domain()->contiguity().size()); - kir::Val* cur_contig_stride = ir_builder.oneVal(); + Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal(); for (const auto i : c10::irange(root_dom.size())) { auto dim = root_dom.size() - i - 1; if (root_dom[dim]->isReduction() || root_dom[dim]->isStride()) { @@ -1842,24 +1890,26 @@ std::vector Index::getGlobalConsumerStridedIndices( continue; } - kir::Val* root_ind = nullptr; - auto kir_root_dom = - gpu_lower->lowerValue(root_dom[dim])->as(); - if (consumer_indexing.indexMap().find(kir_root_dom) != + Val* root_ind = nullptr; + if (consumer_indexing.indexMap().find(root_dom[dim]) != consumer_indexing.indexMap().end()) { - root_ind = consumer_indexing.indexMap().at(kir_root_dom); + root_ind = consumer_indexing.indexMap().at(root_dom[dim]); } else if (root_dom[dim]->getIterType() == IterType::BroadcastWithStride) { - root_ind = zero; + root_ind = GpuLower::current()->kernel()->zeroVal(); } TORCH_INTERNAL_ASSERT( root_ind != nullptr, - "Couldn't find root mapping for TV", - consumer_tv->name(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", - i, + dim, " id: ", - root_dom[dim]); + root_dom[dim]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); if (consumer_tv->domain()->contiguity()[dim]) { // If contig, used the stored stride which may be the previous @@ -1869,11 +1919,11 @@ std::vector Index::getGlobalConsumerStridedIndices( // by extent of this dimension auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); cur_contig_stride = - ir_builder.mulExpr(cur_contig_stride, root_dim_extent); + SimplifyingIrBuilder::mulExpr(cur_contig_stride, root_dim_extent); } else { // If non contiguous dimension, keep local stride information, set cur // stride to local stride * local raw extent - cur_contig_stride = ir_builder.mulExpr( + 
cur_contig_stride = SimplifyingIrBuilder::mulExpr( strides[dim], getHaloExtentOfRootAxis(root_dom[dim])); } } @@ -1882,7 +1932,8 @@ std::vector Index::getGlobalConsumerStridedIndices( loops.empty() ? nullptr : loops.back()->vectorize_shift(); // Global striding - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { // See a comment in indexing to root domains in getGlobalProducerIndex. if (root_dom[i]->isReduction() || @@ -1893,71 +1944,87 @@ std::vector Index::getGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - consumer_indexing.indexMap().find(kir_root_dom_i) != + consumer_indexing.indexMap().find(root_dom[i]) != consumer_indexing.indexMap().end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - auto root_ind = consumer_indexing.indexMap().at(kir_root_dom_i); + auto root_ind = consumer_indexing.indexMap().at(root_dom[i]); - root_ind = ir_builder.addExpr( - root_ind, getGlobalConsumerOffsetWithPartialSplit(kir_root_dom_i)); + // index hoist must be done before the adjustments for halo + root_ind = hoistConsumerIndex( + root_dom[i], + consumer_tv, + consumer_indexing, + reference.domain, + ref_compute, + loops, + root_ind); + + root_ind = SimplifyingIrBuilder::addExpr( + root_ind, getGlobalConsumerOffsetWithPartialSplit(root_dom[i])); if (root_ind->isZeroInt()) { continue; } else { - auto strided_ind = ir_builder.mulExpr(root_ind, strides[i]); + auto strided_ind = SimplifyingIrBuilder::mulExpr(root_ind, strides[i]); if (i == root_dom.size() - 1 && vectorize_shift != nullptr) { - strided_inds[i] = ir_builder.addExpr(strided_ind, vectorize_shift); + strided_inds[i] = + SimplifyingIrBuilder::addExpr(strided_ind, vectorize_shift); } else { strided_inds[i] = strided_ind; } } } + TORCH_INTERNAL_ASSERT( + strided_inds.size() == consumer_tv->getMaybeRFactorDomain().size()); + + fillConsumerVectorizedContigRootDomains(consumer_tv, contig_finder); + return strided_inds; } // Consumer index for either shared or local memory -std::vector Index::getNonGlobalConsumerStridedIndices( +std::vector Index::getNonGlobalConsumerStridedIndices( const TensorView* consumer_tv, const std::vector& loops) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure - auto reference = IndexReferenceReplay::getReference(loops); + auto reference = IndexReferenceReplay::getReference(loops, consumer_tv); + auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; - auto alloc_point = loop_utils::getAllocPoint(consumer_tv, loops); - std::unordered_map loop_to_ind_map; + auto alloc_info = loop_utils::getAllocInformation(consumer_tv, loops); + std::unordered_map loop_to_ind_map; std::unordered_set zero_loops; std::tie(loop_to_ind_map, zero_loops) = - indexMapFromTV(consumer_tv, loops, alloc_point, true); + indexMapFromTV(consumer_tv, loops, alloc_info.init_for_loop, true); - ensureStaticIndexing(consumer_tv, alloc_point.first, loops); + 
ensureStaticIndexing(consumer_tv, alloc_info.init_for_loop, loops); // Map loop nests to indicies, zeroing out those not used due to locality of // memory - std::unordered_map ref_id_to_ind_map; - std::unordered_set ref_zero_domains; + std::unordered_map ref_id_to_ind_map; + std::unordered_set ref_zero_domains; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure, ignore IterDomains that aren't present in the loop nest when // indexing reference. TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); for (const auto loop_i : c10::irange(loops.size())) { - auto ref_axis = gpu_lower->lowerValue(reference_domain->axis(loop_i)) - ->as(); + auto ref_axis = reference_domain->axis(loop_i); ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loops[loop_i]]; if (zero_loops.count(loops[loop_i]) > 0) { ref_zero_domains.insert(ref_axis); @@ -1967,8 +2034,7 @@ std::vector Index::getNonGlobalConsumerStridedIndices( // Map everything we can from reference to consumer using compute at index // map. std::unordered_map index_map_ref_to_consumer = - indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference_id_map); + indexMapReferenceTo(consumer_tv, gpu_lower->caMap(), reference_id_map); // Grab roots that map into consumer and save them into the preferred roots // set for references indexing @@ -1999,11 +2065,17 @@ std::vector Index::getNonGlobalConsumerStridedIndices( const auto reference_halo_extent_map = getReferenceHaloExtentMap(reference, index_map_ref_to_consumer); + ContigIDs contig_finder( + consumer_tv->domain()->domain(), + consumer_tv->getMaybeRFactorDomain(), + consumer_tv->domain()->contiguity(), + reference_id_map); + // Index into consumer using reference indexing auto consumer_indexing = ref_compute.updateIndexCompute( consumer_tv->domain(), index_map_ref_to_consumer, - consumer_tv->domain()->contiguity(), + contig_finder, reference_halo_extent_map); IndexSwizzle index_swizzle( @@ -2022,7 +2094,8 @@ std::vector Index::getNonGlobalConsumerStridedIndices( // Indices should now be mapped onto IterDomains in consumer, so just grab // and use them. auto root_dom = consumer_tv->getMaybeRFactorDomain(); - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->isBroadcast() || gpu_lower->trivialReductionInfo().isDerived(root_dom[i]) || @@ -2030,25 +2103,36 @@ std::vector Index::getNonGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_i) != index_map.end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + index_map.find(root_dom[i]) != index_map.end(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - const auto root_ind_i = index_map.at(kir_root_dom_i); + auto root_ind_i = index_map.at(root_dom[i]); if (root_ind_i->isZeroInt()) { continue; } + // index hoist must be done before the adjustments for halo + root_ind_i = hoistConsumerIndex( + root_dom[i], + consumer_tv, + consumer_indexing, + reference.domain, + ref_compute, + loops, + root_ind_i); + // Compute striding for this index. 
- kir::Val* stride = nullptr; + Val* stride = nullptr; for (const auto j : c10::irange(i + 1, root_dom.size())) { if (root_dom[j]->isBroadcast() || root_dom[j]->isReduction() || gpu_lower->trivialReductionInfo().isDerived(root_dom[j]) || @@ -2056,57 +2140,81 @@ std::vector Index::getNonGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_j = - gpu_lower->lowerValue(root_dom[j])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_j) != index_map.end(), - "Couldn't find root mapping for TV", - consumer_tv->name(), + index_map.find(root_dom[j]) != index_map.end(), + "Couldn't find root mapping for ", + consumer_tv->toString(), " dim: ", - i, + j, " id: ", - root_dom[i]); + root_dom[j]->toString(), + ", reference domain: ", + reference_domain->toString(), + ", reference root: ", + ir_utils::toString(reference_domain->getRootDomain())); - auto root_ext_j = extent_map.find(kir_root_dom_j) == extent_map.end() - ? kir_root_dom_j->extent() - : extent_map.at(kir_root_dom_j); + auto root_ext_j = extent_map.find(root_dom[j]) == extent_map.end() + ? root_dom[j]->extent() + : extent_map.at(root_dom[j]); root_ext_j = getHaloExtentOfRootAxis(root_dom[j], root_ext_j); - if (zero_domain_map.count(kir_root_dom_j) == 0) { + if (zero_domain_map.count(root_dom[j]) == 0) { if (stride == nullptr) { stride = root_ext_j; } else { - stride = ir_builder.mulExpr(stride, root_ext_j); + stride = SimplifyingIrBuilder::mulExpr(stride, root_ext_j); } } } if (stride != nullptr) { - strided_inds[i] = ir_builder.mulExpr(root_ind_i, stride); + strided_inds[i] = SimplifyingIrBuilder::mulExpr(root_ind_i, stride); } else { strided_inds[i] = root_ind_i; } } + // This check was originally done in getConsumerStridedIndices, but + // the number of strided index values depends on the loop where the + // consumer tensor is located. If it's double buffered and not in + // the prologue loop, strided_inds ends up having one more + // index, so it's just much simpler to check here before adding the + // additional index for double buffering. 
+ TORCH_INTERNAL_ASSERT( + strided_inds.size() == consumer_tv->getMaybeRFactorDomain().size()); + + if (consumer_tv->isDoubleBuffered()) { + auto db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); + if (db_loop != nullptr) { + auto db_switch_index = SimplifyingIrBuilder::subExpr( + gpu_lower->kernel()->oneVal(), + SimplifyingIrBuilder::modExpr( + db_loop->index(), SimplifyingIrBuilder::create(2))); + auto original_alloc_size = + gpu_lower->doubleBufferInfo().getOriginalAllocSize(consumer_tv); + auto db_strided_index = + SimplifyingIrBuilder::mulExpr(db_switch_index, original_alloc_size); + strided_inds.push_back(db_strided_index); + } + } + return strided_inds; } -std::vector Index::getProducerStridedIndices( +std::vector Index::getProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getProducerStridedIndices"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (producer->domain()->noReductions().size() == 0) { - return std::vector( - producer->getMaybeRFactorDomain().size(), ir_builder.zeroVal()); + return std::vector( + producer->getMaybeRFactorDomain().size(), + GpuLower::current()->kernel()->zeroVal()); } - std::vector strided_indices; + std::vector strided_indices; if (producer->getMemoryType() == MemoryType::Global) { strided_indices = getGlobalProducerStridedIndices(producer, consumer, loops); @@ -2116,7 +2224,9 @@ std::vector Index::getProducerStridedIndices( } TORCH_INTERNAL_ASSERT( - strided_indices.size() == producer->getMaybeRFactorDomain().size()); + strided_indices.size() == + producer->getMaybeRFactorDomain().size() + + (producer->isDoubleBuffered() ? 1 : 0)); return strided_indices; } @@ -2126,35 +2236,28 @@ kir::TensorIndex* Index::getProducerIndex( TensorView* producer, const TensorView* consumer, const std::vector& loops) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto strided_indices = getProducerStridedIndices(producer, consumer, loops); - return ir_builder.create(producer, strided_indices); + return SimplifyingIrBuilder::create( + producer, strided_indices); } -std::vector Index::getConsumerStridedIndices( +std::vector Index::getConsumerStridedIndices( const TensorView* consumer, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getConsumerStridedIndices"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (consumer->domain()->noReductions().size() == 0) { - return std::vector( - consumer->getMaybeRFactorDomain().size(), ir_builder.zeroVal()); + return std::vector( + consumer->getMaybeRFactorDomain().size(), + GpuLower::current()->kernel()->zeroVal()); } - std::vector strided_indices; + std::vector strided_indices; if (consumer->getMemoryType() == MemoryType::Global) { strided_indices = getGlobalConsumerStridedIndices(consumer, loops); } else { strided_indices = getNonGlobalConsumerStridedIndices(consumer, loops); } - TORCH_INTERNAL_ASSERT( - strided_indices.size() == consumer->getMaybeRFactorDomain().size()); - return strided_indices; } @@ -2162,11 +2265,9 @@ std::vector Index::getConsumerStridedIndices( kir::TensorIndex* Index::getConsumerIndex( const TensorView* consumer, const std::vector& loops) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto strided_indices = getConsumerStridedIndices(consumer, loops); 
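The consumer-side switch above is 1 - (index % 2), the complement of the producer-side index % 2 used earlier, so within one iteration the read and the write land in different halves of the doubled allocation. A small plain-integer check of that invariant (allocation size and trip count hypothetical):

#include <cassert>

int main() {
  const long original_alloc_size = 256;  // one stage of the doubled buffer
  for (long i = 0; i < 8; ++i) {
    const long read_offset = (i % 2) * original_alloc_size;          // producer-side switch
    const long write_offset = (1 - (i % 2)) * original_alloc_size;   // consumer-side switch
    assert(read_offset != write_offset);  // the two halves never alias in one iteration
  }
  return 0;
}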
- return ir_builder.create(consumer, strided_indices); + return SimplifyingIrBuilder::create( + consumer, strided_indices); } namespace { @@ -2184,37 +2285,23 @@ struct PredicateDomainInfo { bool is_non_divisible_split = false; }; -// Find iteration domains in the history of reference comprised only of -// merge operations. Only return iteration domains that are subsequently fed -// into a split, or are in the provided domain. In other words, we don't want to -// return every IterDomain that's contiguous, just the one closest to the -// leaves. Predicates are not associated with physical memory so we can treat -// all of them as contiguous merges. +// Find iteration domains in the history of a consumer to predicate comprised +// only of merge operations. Only return iteration domains that are subsequently +// fed into a split, or are in the provided domain. In other words, we don't +// want to return every IterDomain that's contiguous, just the one closest to +// the leaves. Predicates are not associated with physical memory so we can +// treat all of them as contiguous merges. +// +// TODO: This seems to have a large overlap with ContigIDs. Consider +// refactoring. std::vector getPredicateContigIds( - const ReferenceTensor& reference, TensorView* consumer_tv, - const std::unordered_map& ref_2_consumer) { + const std::unordered_map& consumer_index_map) { const auto gpu_lower = GpuLower::current(); - std::vector reference_predicated_root_domain; - for (const auto consumer_root : consumer_tv->getRootDomain()) { - if (consumer_root->isBroadcast()) { - continue; - } - auto consumer_root_concrete = - gpu_lower->caIndexMap().getConcreteMappedID(consumer_root); - auto it = reference.concrete_to_id.find(consumer_root_concrete); - // When initializing a reduction buffer, the reduction axis - // doesn't have a loop, so the reference tensor doesn't have a - // mapped domain. The reduction axis can be safely ignored. - if (it == reference.concrete_to_id.end()) { - continue; - } - auto reference_root = it->second; - reference_predicated_root_domain.emplace_back(reference_root); - } + const auto& consumer_root_domain = consumer_tv->getRootDomain(); - std::vector contiguous_ids = reference_predicated_root_domain; + std::vector contiguous_ids = consumer_root_domain; if (contiguous_ids.empty()) { return std::vector(); @@ -2227,20 +2314,25 @@ std::vector getPredicateContigIds( // about halo to do correct predication, so they must be excluded. std::unordered_set excluded_ids; - for (auto reference_predicated_id : reference_predicated_root_domain) { - if (GpuLower::current() - ->haloInfo() - .getRootAxisInfo(reference_predicated_id) - .hasHalo()) { + for (auto consumer_root_id : consumer_root_domain) { + if (gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id).hasHalo()) { + excluded_ids.insert(consumer_root_id); continue; } - auto it = ref_2_consumer.find(reference_predicated_id); - if (it == ref_2_consumer.end()) { + if (consumer_root_id->maybePartial()) { + excluded_ids.insert(consumer_root_id); continue; } - auto consumer_root_id = it->second; - if (consumer_root_id->maybePartial()) { - excluded_ids.insert(reference_predicated_id); + // When consumer_root_id is a broadcast domain, do not allow contig + // predication as the merged output is not mapped with the + // reference unless the concrete domain is also a broadcast + // domain. 
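getPredicateContigIds above treats root domains as contiguous for predication and looks for merge-only outputs closest to the leaves, because a single bound on a merged domain already implies the per-root bounds. A plain-integer check of that implication (extents hypothetical):

#include <cassert>

int main() {
  const int e0 = 3, e1 = 5;
  for (int i = 0; i < e0 * e1; ++i) {    // the single merged predicate
    assert(i / e1 >= 0 && i / e1 < e0);  // outer root stays in range
    assert(i % e1 >= 0 && i % e1 < e1);  // inner root stays in range
  }
  return 0;
}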
+ if (consumer_root_id->isBroadcast() && + !GpuLower::current() + ->caMap() + ->getConcreteMappedID(consumer_root_id, IdMappingMode::PERMISSIVE) + ->isBroadcast()) { + excluded_ids.insert(consumer_root_id); continue; } // Shifted or gathered axes need to be predicated at the root domain @@ -2252,15 +2344,16 @@ std::vector getPredicateContigIds( auto consumer_root_pos = consumer_tv->domain()->rootPosOf(consumer_root_id); if ((shift_expr && shift_expr->offset(consumer_root_pos) != 0) || (gather_expr && consumer_root_pos < gather_expr->windowShape().size() && - !gather_expr->windowShape().at(consumer_root_pos)->isOneInt())) { - excluded_ids.insert(reference_predicated_id); + gather_expr->windowShape().at(consumer_root_pos) != 1)) { + excluded_ids.insert(consumer_root_id); } } // Run through iteration domain history - auto exprs = ExprSort::getExprs( + auto exprs = StmtSort::getExprs( consumer_tv->fusion(), - {reference.domain->domain().begin(), reference.domain->domain().end()}); + {consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end()}); for (auto expr : exprs) { // If not a merge, output is not contiguous @@ -2276,6 +2369,13 @@ std::vector getPredicateContigIds( continue; } + // Do not try to predicate the merge output domain if the output + // domain has not a predicate that is mapped from the reference. + // See FusionContigPredicate_CUDA for a concrete example. + if (consumer_index_map.find(merge->out()) == consumer_index_map.end()) { + continue; + } + if (inner_contig_it != contiguous_ids.end() && outer_contig_it != contiguous_ids.end()) { // If inner and outer are contiguous, out must be contiguous. Remove @@ -2296,8 +2396,7 @@ std::vector getPredicateContigIds( // reference_predicated_root_domain. auto contig_root_vals = IterVisitor::getInputsTo( {contig_id}, - {reference_predicated_root_domain.begin(), - reference_predicated_root_domain.end()}); + {consumer_root_domain.begin(), consumer_root_domain.end()}); auto contig_root_ids = ir_utils::filterByType(contig_root_vals); PredicateDomainInfo contig_id_info; contig_id_info.id = contig_id; @@ -2312,8 +2411,8 @@ IterDomain* getMappedReferenceDomain( IterDomain* id, const ReferenceTensor& reference) { // Partially overlaps with getPredicateContigIds() - const auto gpu_lower = GpuLower::current(); - auto concrete_id = gpu_lower->caIndexMap().getConcreteMappedID(id); + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); auto it = reference.concrete_to_id.find(concrete_id); if (it == reference.concrete_to_id.end()) { return nullptr; @@ -2321,9 +2420,8 @@ IterDomain* getMappedReferenceDomain( return it->second; } -std::vector getNonDivisibleReferenceDomainsToPredicate( - TensorView* consumer_tv, - const ReferenceTensor& reference) { +std::vector getNonDivisibleConsumerDomainsToPredicate( + TensorView* consumer_tv) { const auto& non_divisible_split_info = GpuLower::current()->nonDivisibleSplitInfo(); @@ -2337,11 +2435,7 @@ std::vector getNonDivisibleReferenceDomainsToPredicate( const auto& splits_to_predicate = it->second; for (auto split : splits_to_predicate) { - auto ref_id = getMappedReferenceDomain(split->in(), reference); - if (ref_id == nullptr) { - continue; - } - PredicateDomainInfo info{ref_id, {ref_id}, true}; + PredicateDomainInfo info{split->in(), {split->in()}, true}; pred_info_vec.emplace_back(info); } @@ -2352,9 +2446,8 @@ bool needsPadding(TensorView* tv) { auto shift_expr = dynamic_cast(tv->definition()); auto gather_expr = 
dynamic_cast(tv->definition()); - // Padding is only necessary for padded shift and - // gather - return (shift_expr != nullptr && shift_expr->pad()) || gather_expr != nullptr; + return (shift_expr != nullptr && shift_expr->hasPadding()) || + (gather_expr != nullptr && gather_expr->hasPadding()); } // Get an additional offset of a stop index when building a predicate @@ -2364,11 +2457,10 @@ bool needsPadding(TensorView* tv) { // compared with each other by just looking at the additional offsets. // // consumer_root_id: the domain for which a stop predicate is being built. -kir::Val* getUnswitchStopOffset( +int getUnswitchStopOffset( IterDomain* consumer_root_id, TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id); @@ -2376,7 +2468,7 @@ kir::Val* getUnswitchStopOffset( // If the consumer root domain to predicate does not have halo, no // adjustment is required. if (!halo_info.hasHalo()) { - return ir_builder.zeroVal(); + return 0; } // Find if this contig_id is used in the unswitched domains @@ -2400,22 +2492,14 @@ kir::Val* getUnswitchStopOffset( })) { return halo_info.width(); } else { - return ir_builder.zeroVal(); + return 0; } } -// Get offsets for the start and stop predicates. Similar to the -// gather case, but it's a little simpler as it does not (yet) -// dynamic shifting. -void adjustStartAndStopOffsetsForShift( - std::vector& start_offsets, - std::vector& stop_offsets, +std::pair getStartAndStopOffsetsForShift( TensorView* consumer_tv, IterDomain* consumer_id, bool padding_predicate) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - TORCH_INTERNAL_ASSERT(consumer_id != nullptr); auto shift_expr = dynamic_cast(consumer_tv->definition()); @@ -2423,105 +2507,124 @@ void adjustStartAndStopOffsetsForShift( // Adjustment is not necessary if not shift. // Even so, padding predicate does not need any adjustment. if (shift_expr == nullptr || padding_predicate) { - return; + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } const auto root_axis_pos = consumer_tv->domain()->rootPosOf(consumer_id); - // Assume this adjustment is done first, so start and stop offsets - // just contain zeroVal. - TORCH_INTERNAL_ASSERT( - start_offsets.size() == 1 && start_offsets[0]->isZeroInt() && - stop_offsets.size() == 1 && stop_offsets[0]->isZeroInt()); - start_offsets.clear(); - stop_offsets.clear(); - - // The consumer offset is zero. - auto consumer_offset = 0; - // The producer offset is based off the consumer offset. - auto producer_offset = 0; - - // When the shift operation is not padded, the start and stop positions of the - // consumer axis, i.e., consumer_id->start and - // consumer_id->stop_ofset, are adjusted accordingly, which includes - // the effect of the shift offset, so using the consumer offset is - // sufficient as the only predicate is sufficient. - - if (shift_expr->pad()) { - // Positive shift offset means shifting the input tensor to the - // positive direction, so the producer offset becomes negative. - auto shift_offset = shift_expr->offset(root_axis_pos); - producer_offset = -shift_offset; - } - - // Since shift doesn't allow dynamic offsets, we can statically - // choose more restrictive offsets between the producer and consumer - // offsets. 
The start predicate uses greater-than, so using the - // smaller offset is sufficient. Similarly, for the stop predicate, - // using the larger offset is sufficient. - auto start_offset = std::min(consumer_offset, producer_offset); - auto stop_offset = std::max(consumer_offset, producer_offset); - - start_offsets.push_back(ir_builder.create(start_offset)); - stop_offsets.push_back(ir_builder.create(stop_offset)); + // The first or last N elements, where N is the padding width, + // correspond to the padding predicate. + + const auto shift_offset = shift_expr->offset(root_axis_pos); + const auto pad_width = shift_expr->padWidth().at(root_axis_pos); + + int start_offset = 0; + int stop_offset = 0; + + if (shift_offset > 0) { + start_offset = -pad_width; + } else if (shift_offset < 0) { + stop_offset = pad_width; + } + + return { + SimplifyingIrBuilder::create(start_offset), + SimplifyingIrBuilder::create(stop_offset)}; } -// Get offsets for the start and stop predicates. There can be two -// offsets because the shift offset is determined by a loop index. -void adjustStartAndStopOffsetsForGather( - std::vector& start_offsets, - std::vector& stop_offsets, +std::pair getStartAndStopOffsetsForGather( TensorView* consumer_tv, IterDomain* consumer_id, - const ReferenceTensor& reference, - const std::unordered_map& ref_start_index_map, - const std::unordered_map& ref_stop_index_map, + const std::unordered_map& ref_start_index_map, + const std::unordered_map& ref_stop_index_map, bool padding_predicate) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - TORCH_INTERNAL_ASSERT(consumer_id != nullptr); // Adjustment is not necessary if not gather. Even so, padding // predicate does not need any adjustment. if (!consumer_tv->definition()->isA() || padding_predicate) { - return; + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } const auto root_axis_pos = consumer_tv->domain()->rootPosOf(consumer_id); - // Assume this adjustment is done first, so start and stop offsets - // just contain zeroVal. - TORCH_INTERNAL_ASSERT( - start_offsets.size() == 1 && start_offsets[0]->isZeroInt() && - stop_offsets.size() == 1 && stop_offsets[0]->isZeroInt()); - start_offsets.clear(); - stop_offsets.clear(); - auto producer_start_offset = getProducerOffsetWithGather( - root_axis_pos, - consumer_tv, - reference.concrete_to_id, - ref_start_index_map); + root_axis_pos, consumer_tv, ref_start_index_map); auto producer_stop_offset = getProducerOffsetWithGather( - root_axis_pos, consumer_tv, reference.concrete_to_id, ref_stop_index_map); + root_axis_pos, consumer_tv, ref_stop_index_map); + + auto consumer_start_offset = GpuLower::current()->kernel()->zeroVal(); + auto consumer_stop_offset = GpuLower::current()->kernel()->zeroVal(); - // The producer and consumer accesses must be predicated as it is - // not statically determined which is more restrictive. + if (producer_start_offset->isZeroInt() && producer_stop_offset->isZeroInt()) { + return {consumer_start_offset, consumer_stop_offset}; + } + + Val* start_offset = nullptr; + Val* stop_offset = nullptr; - // Consumer offsets are just zero. - start_offsets.push_back(ir_builder.zeroVal()); - stop_offsets.push_back(ir_builder.zeroVal()); + // In the normal case, take the minimum of the start and the + // maximum of the stop offsets. If there's no padding, the producer + // offset must be always larger than the consumer + // offset. 
So, the consumer and producer offsets can always be used + // for the start and stop offsets, respectively. + const auto pad_left = + consumer_tv->definition()->as()->padWidth()[root_axis_pos][0]; + const auto pad_right = + consumer_tv->definition()->as()->padWidth()[root_axis_pos][1]; + const auto window_size = + consumer_tv->definition()->as()->windowShape()[root_axis_pos]; - // Adds producer offsets if they are not zero. - if (!producer_start_offset->isZeroInt()) { - start_offsets.push_back(producer_start_offset); + // consumer index: index + // producer index: index + window_index - pad_left + // + // consumer extent: ext + // producer extent: ext + window_size - 1 - pad_left - pad_right + // + // consumer stop pred: index < ext + // producer stop pred: index + window_index - pad_left < ext + window_size - 1 + // - pad_left - pad_right + // -> index + window_index - pad_left - (window_size - 1 - + // pad_left - pad_right) < ext + // -> index + window_index - (window_size - 1 - pad_right) < + // ext + // + // consumer start pred: index >= 0 + // producer start pred: index + window_index - pad_left >= 0 + + const auto producer_ext_adj = window_size - 1 - pad_left - pad_right; + producer_stop_offset = SimplifyingIrBuilder::subExpr( + producer_stop_offset, + SimplifyingIrBuilder::create(producer_ext_adj)); + + // As commented above, when pad_left is zero, the consumer predicate + // is always more restrictive than the producer predicate. + if (pad_left == 0) { + start_offset = consumer_start_offset; + } else { + start_offset = SimplifyingIrBuilder::minExpr( + consumer_start_offset, producer_start_offset); } - if (!producer_stop_offset->isZeroInt()) { - stop_offsets.push_back(producer_stop_offset); + // As commented above, when pad_right is zero, the consumer + // predicate is always more restrictive than the producer + // predicate. + if (pad_right == 0) { + stop_offset = consumer_stop_offset; + } else { + stop_offset = SimplifyingIrBuilder::maxExpr( + consumer_stop_offset, producer_stop_offset); } + + TORCH_INTERNAL_ASSERT(start_offset != nullptr); + TORCH_INTERNAL_ASSERT(stop_offset != nullptr); + + return {start_offset, stop_offset}; } // Get the start and stop limit offsets that define the valid range to @@ -2530,18 +2633,16 @@ void adjustStartAndStopOffsetsForGather( // stop that's different from extent. Also, when IterDomain has halo, // the actual offsets of the logical start and stop positions are // shifted.
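The derivation in the comment above can be traced with plain integers. The sketch below is only an illustration: window_index stands in for the loop index that the real lowering keeps as an IR expression, and the min/max selection mirrors the pad_left/pad_right special cases rather than calling any nvfuser API.

#include <algorithm>
#include <iostream>

struct GatherPredOffsets {
  int start_offset;
  int stop_offset;
};

GatherPredOffsets gatherOffsets(
    int window_index, int window_size, int pad_left, int pad_right) {
  // Producer offsets relative to the consumer index.
  const int producer_start_offset = window_index - pad_left;
  const int producer_ext_adj = window_size - 1 - pad_left - pad_right;
  const int producer_stop_offset = producer_start_offset - producer_ext_adj;
  // Consumer offsets are zero. When a side has no padding, the consumer
  // predicate is already the more restrictive one on that side.
  const int start_offset =
      pad_left == 0 ? 0 : std::min(0, producer_start_offset);
  const int stop_offset =
      pad_right == 0 ? 0 : std::max(0, producer_stop_offset);
  return {start_offset, stop_offset};
}

int main() {
  // 3-wide window, one element of padding on each side, first window position.
  const auto off = gatherOffsets(/*window_index=*/0, /*window_size=*/3,
                                 /*pad_left=*/1, /*pad_right=*/1);
  // Final predicates have the form:
  //   index + start_offset >= 0 && index + stop_offset < extent
  std::cout << "start_offset=" << off.start_offset
            << " stop_offset=" << off.stop_offset << "\n";
  return 0;
}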
-std::pair getStartAndStopLimitOffsets( +std::pair getStartAndStopLimitOffsets( IterDomain* consumer_id, bool padding_predicate, bool non_divisible_pred) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); TORCH_INTERNAL_ASSERT(consumer_id != nullptr); - kir::Val* start_limit = gpu_lower->lowerValue(consumer_id->start()); - kir::Val* stop_limit = - ir_builder.negExpr(gpu_lower->lowerValue(consumer_id->stopOffset())); + Val* start_limit = consumer_id->start(); + Val* stop_limit = SimplifyingIrBuilder::negExpr(consumer_id->stopOffset()); if (!non_divisible_pred) { AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_id); @@ -2554,12 +2655,14 @@ std::pair getStartAndStopLimitOffsets( // [0, left halo)[start_limit, stop_limit)[0, right halo) // if (!padding_predicate) { - start_limit = ir_builder.addExpr(start_limit, halo_info.width(0)); - stop_limit = ir_builder.addExpr(stop_limit, halo_info.width(0)); + start_limit = + SimplifyingIrBuilder::addExpr(start_limit, halo_info.width(0)); + stop_limit = + SimplifyingIrBuilder::addExpr(stop_limit, halo_info.width(0)); } else { // In case of the padding predicate, the whole range, including both left // and right halo regions, is computed. - stop_limit = ir_builder.addExpr(stop_limit, halo_info.width()); + stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo_info.width()); } } else { // For non-divisible predicates, the index must be predicated such @@ -2568,28 +2671,26 @@ std::pair getStartAndStopLimitOffsets( // isn't a root domain. if (gpu_lower->haloInfo().hasHaloWidth(consumer_id)) { auto halo = gpu_lower->haloInfo().getHaloWidth(consumer_id); - stop_limit = ir_builder.addExpr(stop_limit, halo); + stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo); } } return {start_limit, stop_limit}; } -// Return an index map for a predicate reference tensor. Two different +// Return an IndexCompute for a predicate reference tensor. Two different // maps are used when generating predicates for unswitched expressions // as start and stop conditions need to use different loop-to-index // mappings. -std::unordered_map getPredicateReferenceIndexing( +auto getPredicateReferenceIndexing( const std::vector& loops, const ReferenceTensor& reference, kir::ForLoop* unswitch_or_vec_loop, + IterDomain* double_buffer_axis, bool start) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - auto reference_domain = reference.domain; - std::unordered_map loop_to_ind_map; + std::unordered_map loop_to_ind_map; std::transform( loops.begin(), @@ -2606,7 +2707,7 @@ std::unordered_map getPredicateReferenceIndexing( // vectorized loop should be like this. 
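For the unswitch and vectorize cases handled next, the loop variable cannot appear in the predicate, so the most conservative index is substituted: zero for the start predicate and the last possible value for the stop predicate (extent minus one, or the parallel dimension minus one for thread-parallel loops). A minimal sketch of that selection, with an invented Loop descriptor rather than kir::ForLoop, might look like this.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical, simplified loop descriptor.
struct Loop {
  std::string name;
  int extent = 0;
  bool thread_parallel = false;
  int parallel_dim = 0;  // launch-time dimension for thread-parallel loops
};

// Pick the index to use for each unswitched loop when building either the
// start (lower-bound) or stop (upper-bound) predicate.
std::unordered_map<std::string, int> unswitchIndices(
    const std::vector<Loop>& loops, bool for_start_predicate) {
  std::unordered_map<std::string, int> loop_to_ind;
  for (const auto& loop : loops) {
    if (for_start_predicate) {
      loop_to_ind[loop.name] = 0;
    } else if (loop.thread_parallel) {
      // The whole parallel dimension participates, so use its last index.
      loop_to_ind[loop.name] = loop.parallel_dim - 1;
    } else {
      loop_to_ind[loop.name] = loop.extent - 1;
    }
  }
  return loop_to_ind;
}

int main() {
  const std::vector<Loop> loops = {{"i", 16, false, 0}, {"tx", 8, true, 32}};
  for (const auto& kv : unswitchIndices(loops, /*for_start_predicate=*/false)) {
    std::cout << kv.first << " -> " << kv.second << "\n";
  }
  return 0;
}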
bool vectorized_pred = - unswitch_or_vec_loop->iter_domain()->parallelType() == + unswitch_or_vec_loop->iter_domain()->getParallelType() == ParallelType::Vectorize; TORCH_INTERNAL_ASSERT( @@ -2614,12 +2715,11 @@ std::unordered_map getPredicateReferenceIndexing( "Invalid reference generated."); bool within_unswitch = false; - const auto one = ir_builder.oneVal(); for (const auto loop_i : c10::irange(loops.size())) { auto loop = loops[loop_i]; auto loop_id = loop->iter_domain(); - auto loop_pt = loop_id->parallelType(); + auto loop_pt = loop_id->getParallelType(); auto ref_id = reference_domain->axis(loop_i); if (loop == unswitch_or_vec_loop) { @@ -2668,20 +2768,21 @@ std::unordered_map getPredicateReferenceIndexing( if (loop->stop() == loop_id->extent()) { loop_to_ind_map[loop] = loop->start(); } else if (start) { - loop_to_ind_map[loop] = ir_builder.zeroVal(); + loop_to_ind_map[loop] = GpuLower::current()->kernel()->zeroVal(); } else { // Note that the parallel dimension is used rather than // loop-stop(). See the above comment. - loop_to_ind_map[loop] = ir_builder.subExpr( - gpu_lower->parallelDimensionMap().get(loop_pt), - ir_builder.create(1)); + loop_to_ind_map[loop] = SimplifyingIrBuilder::subExpr( + GpuLower::current()->parallelDimensionMap().get(loop_pt), + GpuLower::current()->kernel()->oneVal()); } } else if (start) { - loop_to_ind_map[loop] = ir_builder.zeroVal(); + loop_to_ind_map[loop] = GpuLower::current()->kernel()->zeroVal(); } else { // Similar to the above, loop_id()->extent() is // used here instead of loop->stop(). See the above comment. - loop_to_ind_map[loop] = ir_builder.subExpr(loop_id->extent(), one); + loop_to_ind_map[loop] = SimplifyingIrBuilder::subExpr( + loop_id->extent(), GpuLower::current()->kernel()->oneVal()); } } @@ -2693,9 +2794,36 @@ std::unordered_map getPredicateReferenceIndexing( } } + for (const auto loop : loops) { + auto& idx = loop_to_ind_map.at(loop); + // If the loop is trivial, the loop index can only be the loop + // start value. + if (idx == loop->index() && loop->isTrivial()) { + idx = loop->start(); + } + } + + if (double_buffer_axis != nullptr) { + auto db_loop = GpuLower::current()->doubleBufferInfo().getDoubleBufferLoop( + double_buffer_axis, loops, true); + if (db_loop != nullptr) { + auto loop_to_ind_map_it = loop_to_ind_map.find(db_loop); + TORCH_INTERNAL_ASSERT(loop_to_ind_map_it != loop_to_ind_map.end()); + auto cur_index = loop_to_ind_map_it->second; + // if cur_index is not the same as the index of db_loop, it must + // be true that that index has been modified to support + // unswitch. In that case, it is not necessary to move ahead the + // index for double buffering.
+ if (cur_index == db_loop->index()) { + loop_to_ind_map[db_loop] = SimplifyingIrBuilder::addExpr( + cur_index, GpuLower::current()->kernel()->oneVal()); + } + } + } + // Add magic zero to a loop pretty far inside in indexing - kir::IterDomain* magic_zero_loop = nullptr; - std::unordered_map ref_id_to_ind_map; + IterDomain* magic_zero_loop = nullptr; + std::unordered_map ref_id_to_ind_map; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); @@ -2703,19 +2831,19 @@ std::unordered_map getPredicateReferenceIndexing( auto loop = loops[loop_i]; auto ind = loop_to_ind_map[loops[loop_i]]; auto ref_axis = reference_domain->axis(loop_i); - auto kir_ref_axis = gpu_lower->lowerValue(ref_axis)->as(); if (Index::protectWithMagicZero(loop, ref_axis, ind)) { - magic_zero_loop = kir_ref_axis; + magic_zero_loop = ref_axis; } - ref_id_to_ind_map[kir_ref_axis] = loop_to_ind_map[loop]; + ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loop]; } if (ref_id_to_ind_map.count(magic_zero_loop)) { auto& ind = ref_id_to_ind_map[magic_zero_loop]; if (!ind->isConstScalar()) { - ind = ir_builder.addExpr(ind, ir_builder.magicZeroVal()); + ind = SimplifyingIrBuilder::addExpr( + ind, GpuLower::current()->kernel()->magicZeroVal()); } } @@ -2729,7 +2857,7 @@ std::unordered_map getPredicateReferenceIndexing( ref_self_map.insert({id, id}); }); - std::unordered_map reference_halo_extent_map = + std::unordered_map reference_halo_extent_map = getReferenceHaloExtentMap(reference, ref_self_map); // Index into the reference tensor @@ -2741,64 +2869,55 @@ std::unordered_map getPredicateReferenceIndexing( {}, reference_halo_extent_map); - return index_compute.indexMap(); + return index_compute; } // Get the offsets for the start and stop predicates. The offsets // are to be added to the index. -std::pair, std::vector> getStartAndStopOffsets( +std::pair getStartAndStopOffsets( IterDomain* consumer_id, TensorView* consumer_tv, const ReferenceTensor& reference, - const std::unordered_map& ref_start_index_map, - const std::unordered_map& ref_stop_index_map, + const std::unordered_map& consumer_start_index_map, + const std::unordered_map& consumer_stop_index_map, bool padding_predicate, bool unswitch, bool non_divisible_pred) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - // By default, the offsets for the start and stop predicates are - // just zero. - std::vector start_offsets{ir_builder.zeroVal()}; - std::vector stop_offsets{ir_builder.zeroVal()}; - - if (consumer_id == nullptr) { - return {start_offsets, stop_offsets}; + // just zero. All halo-related adjustments are done at root domains, + // so consumer_id is not a root domain, no adjustment is required. 
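The double-buffer adjustment above boils down to a simple rule: advance the predicate index of the double-buffered loop by one, but only if that index is still the plain loop variable and has not already been replaced for unswitch. A standalone sketch of the rule, with hypothetical types, could be:

#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-ins: a loop identified by name, its current predicate
// index (here just an int), and whether that index is still the plain loop
// variable (i.e. was not overridden for unswitch).
struct PredIndex {
  int value = 0;
  bool is_plain_loop_index = true;
};

void advanceDoubleBufferIndex(
    std::map<std::string, PredIndex>& loop_to_ind,
    const std::string& double_buffer_loop) {
  auto it = loop_to_ind.find(double_buffer_loop);
  if (it == loop_to_ind.end()) {
    return;  // no double-buffered loop in this loop nest
  }
  // Only advance when the index is still the loop variable itself; an
  // unswitch-adjusted index must not be shifted again.
  if (it->second.is_plain_loop_index) {
    it->second.value += 1;
  }
}

int main() {
  std::map<std::string, PredIndex> loop_to_ind = {
      {"i_outer", {0, true}}, {"i_db", {0, true}}};
  advanceDoubleBufferIndex(loop_to_ind, "i_db");
  std::cout << "i_db predicate index offset: " << loop_to_ind["i_db"].value
            << "\n";
  return 0;
}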
+ if (consumer_id->definition() != nullptr && !non_divisible_pred) { + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } auto consumer_def = consumer_tv->definition(); + Val* start_offset = GpuLower::current()->kernel()->zeroVal(); + Val* stop_offset = GpuLower::current()->kernel()->zeroVal(); + // These adjustments are not required when predicating non-divisible splits if (!non_divisible_pred) { if (consumer_def->isA()) { - adjustStartAndStopOffsetsForShift( - start_offsets, - stop_offsets, - consumer_tv, - consumer_id, - padding_predicate); + std::tie(start_offset, stop_offset) = getStartAndStopOffsetsForShift( + consumer_tv, consumer_id, padding_predicate); } else if (consumer_def->isA()) { - adjustStartAndStopOffsetsForGather( - start_offsets, - stop_offsets, + std::tie(start_offset, stop_offset) = getStartAndStopOffsetsForGather( consumer_tv, consumer_id, - reference, - ref_start_index_map, - ref_stop_index_map, + consumer_start_index_map, + consumer_stop_index_map, padding_predicate); } // Adjustment for partial split - auto partial_split_offset = getGlobalConsumerOffsetWithPartialSplit( - gpu_lower->lowerValue(consumer_id)->as()); - for (auto& start_offset : start_offsets) { - start_offset = ir_builder.addExpr(start_offset, partial_split_offset); - } - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.addExpr(stop_offset, partial_split_offset); - } + auto partial_split_offset = + getGlobalConsumerOffsetWithPartialSplit(consumer_id); + start_offset = + SimplifyingIrBuilder::addExpr(start_offset, partial_split_offset); + stop_offset = + SimplifyingIrBuilder::addExpr(stop_offset, partial_split_offset); // If generating a predicate for unswitch, adjust the stop offset to // accommodate the addition of halo to the loop stop. See the @@ -2808,9 +2927,8 @@ std::pair, std::vector> getStartAndStopOffsets !padding_predicate, "Unswitch should not use the padding predicate"); auto stop_unswitch_offset = getUnswitchStopOffset(consumer_id, consumer_tv); - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.addExpr(stop_offset, stop_unswitch_offset); - } + stop_offset = + SimplifyingIrBuilder::addExpr(stop_offset, stop_unswitch_offset); } } @@ -2830,39 +2948,48 @@ std::pair, std::vector> getStartAndStopOffsets // index + (start_offset - start_limit) >= 0 // index + (stop_offset - stop_limit) < extent - for (auto& start_offset : start_offsets) { - start_offset = ir_builder.subExpr(start_offset, limits.first); - } - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.subExpr(stop_offset, limits.second); - } + start_offset = SimplifyingIrBuilder::subExpr(start_offset, limits.first); + stop_offset = SimplifyingIrBuilder::subExpr(stop_offset, limits.second); - return {start_offsets, stop_offsets}; + return {start_offset, stop_offset}; } -bool canOmitStartPredicate(kir::Val* start_offset) { +// A partial value of a start offset is returned if determined to be +// safe. Nullptr is returned if it can be omitted completely. +Val* simplifyStartOffset(Val* start_offset) { // Start predicate can be omitted when start_offset >= 0. - auto offset_val = start_offset->as()->value(); - return offset_val.has_value() && offset_val.value() >= 0; + auto offset_val = start_offset->as()->value(); + if (offset_val.has_value() && offset_val.value() >= 0) { + return nullptr; + } + + // start_offset may look like min(0, window_index - pad). Then, can + // remove min and leave the rhs only. 
+ auto def = dynamic_cast(start_offset->definition()); + if (def != nullptr && def->getBinaryOpType() == BinaryOpType::Min && + def->lhs()->isZeroInt()) { + return def->rhs(); + } + + return start_offset; } bool canOmitStopPredicate( - kir::Val* stop_index, - kir::Val* stop_offset, - kir::IterDomain* kir_contig_id) { + Val* stop_index, + Val* stop_offset, + IterDomain* contig_id) { bool index_simple = stop_index->definition() == nullptr; // The definition may be just adding the magic zero, which can be // effectively considered "simple" if (!index_simple && isProtectedWithMagicZero(stop_index)) { // Make sure the lhs of stop_index is simple. - auto lhs = stop_index->definition()->as()->lhs(); + auto lhs = stop_index->definition()->as()->lhs(); if (lhs->definition() == nullptr) { index_simple = true; } } - // Omit only when both the index and extent are "simple". - if (!(index_simple && kir_contig_id->extent()->definition() == nullptr)) { + if (!index_simple) { return false; } @@ -2873,33 +3000,38 @@ bool canOmitStopPredicate( // omitted if extent + halo + stop_offset < extent, i.e., halo + // stop_offset <= 0. - auto stop_offset_val = stop_offset->as()->value(); - - auto halo_ext = - gpu_lower->haloInfo().getRootAxisInfo(kir_contig_id).width()->value(); + auto stop_offset_val = stop_offset->as()->value(); // If they are not compile-time constant, can't prove the // condition. - if (!stop_offset_val.has_value() || !halo_ext.has_value()) { + if (!stop_offset_val.has_value()) { return false; } - if (halo_ext.value() + stop_offset_val.value() > 0) { + // Note that when a root domain is halo extended, it is the domain + // to be predicated, not its merged contig id even if it exists. So, + // if contig_id does not have root axis info, contig_id is + // guaranteed to have no halo. + auto halo_ext = gpu_lower->haloInfo().hasRootAxisInfo(contig_id) + ? gpu_lower->haloInfo().getRootAxisInfo(contig_id).width() + : 0; + + if (halo_ext + stop_offset_val.value() > 0) { return false; } // When the domain is parallelized, the parallel dimension must be // exact. Otherwise, there would be extra threads/blocks that need // to be predicated out. - if (isParallelTypeThread(kir_contig_id->parallelType())) { + if (isParallelTypeThread(contig_id->getParallelType())) { if (!gpu_lower->parallelDimensionMap().isExact( - kir_contig_id->parallelType())) { + contig_id->getParallelType())) { return false; } // If the domain has halo, the loop is expanded by the halo // extent, so we can't prove the loop extent is the same as the // parallel dimension. 
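Taken together, the checks above mean the stop predicate can be omitted only when the index is a bare loop variable (possibly protected by magic zero), the halo width plus the stop offset is a known non-positive constant, and any thread-parallel domain has an exact, halo-free parallel dimension. A small boolean sketch of that decision, with the inputs flattened to plain values, is shown below.

#include <iostream>
#include <optional>

// Flattened inputs for the decision; in the real pass these come from the
// index expression, halo info, and the parallel dimension map.
struct StopPredInputs {
  bool index_is_simple = false;    // bare loop index (or index + magic zero)
  std::optional<int> stop_offset;  // compile-time constant if known
  int halo_width = 0;
  bool thread_parallel = false;
  bool parallel_dim_exact = false;
};

bool canOmitStopPredicate(const StopPredInputs& in) {
  if (!in.index_is_simple) {
    return false;
  }
  // Without a constant offset we cannot prove halo + offset <= 0.
  if (!in.stop_offset.has_value()) {
    return false;
  }
  if (in.halo_width + *in.stop_offset > 0) {
    return false;
  }
  if (in.thread_parallel) {
    // Extra threads would need predication unless the dimension is exact,
    // and halo expands the loop beyond the parallel dimension.
    if (!in.parallel_dim_exact || in.halo_width != 0) {
      return false;
    }
  }
  return true;
}

int main() {
  StopPredInputs in;
  in.index_is_simple = true;
  in.stop_offset = 0;
  std::cout << std::boolalpha << canOmitStopPredicate(in) << "\n";
  return 0;
}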
- if (!(halo_ext.has_value() && halo_ext.value() == 0)) { + if (halo_ext != 0) { return false; } } @@ -2907,55 +3039,143 @@ bool canOmitStopPredicate( return true; } +std::pair hoistPredicates( + Val* start_index, + Val* stop_index, + const std::vector& loops, + kir::ForLoop* unswitch_or_vec_loop, + IterDomain* predicated_consumer_id, + TensorView* predicated_consumer_tv, + TensorDomain* ref_td, + const std::unordered_map& ref_start_index_map, + const std::unordered_map& ref_stop_index_map) { + const std::pair same_indices{start_index, stop_index}; + + if (isDisabled(DisableOption::IndexHoist)) { + return same_indices; + } + + const auto start_is_same_as_stop = stop_index == start_index; + + Val* hoisted_stop_index = nullptr; + + if (stop_index->definition() == nullptr) { + // If the index doens't have an expression, nothing to hoist + hoisted_stop_index = stop_index; + } else { + bool inserted = false; + std::tie(hoisted_stop_index, inserted) = + GpuLower::current()->commonIndexMap().insert( + predicated_consumer_id, + predicated_consumer_tv->domain(), + ref_td, + ref_stop_index_map, + loops, + stop_index); + } + + Val* hoisted_start_index = nullptr; + if (start_is_same_as_stop) { + hoisted_start_index = hoisted_stop_index; + } else if (start_index->definition() == nullptr) { + hoisted_start_index = start_index; + } else { + bool inserted = false; + std::tie(hoisted_start_index, inserted) = + GpuLower::current()->commonIndexMap().insert( + predicated_consumer_id, + predicated_consumer_tv->domain(), + ref_td, + ref_start_index_map, + loops, + start_index); + } + + return {hoisted_start_index, hoisted_stop_index}; +} + } // namespace // Returns predicates and the concrete (by loop map) root domains they cover std::pair, ReferenceTensor> Index:: getReferenceRootPredicates( - const kir::TensorView* kir_consumer_tv, + TensorView* consumer_tv, const std::vector& loops, kir::ForLoop* unswitch_or_vec_loop, bool shift_padding) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getReferenceRootPredicates"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); + + const bool is_unswitch = unswitch_or_vec_loop != nullptr; // Nothing needs to be done when padding is not required. - if (shift_padding && !needsPadding(kir_consumer_tv->fuserTv())) { + if (shift_padding && !needsPadding(consumer_tv)) { return {{RootPredicateInfo::getFalseInfo()}, ReferenceTensor{}}; } - auto consumer_tv = kir_consumer_tv->fuserTv(); - // Get a reference tensor replayed as existing loop structure - ReferenceTensor reference = IndexReferenceReplay::getReference(loops); + ReferenceTensor reference = + IndexReferenceReplay::getReference(loops, consumer_tv); // Generate halo information for reference. updateHaloInfoForReference(reference, consumer_tv); + const auto ref_2_consumer = indexMapReferenceTo( + consumer_tv, gpu_lower->caMap(), reference.concrete_to_id); + + const auto reference_halo_extent_map = + getReferenceHaloExtentMap(reference, ref_2_consumer); + + auto db_axis = gpu_lower->doubleBufferInfo().getDoubleBufferAxis(consumer_tv); + + // Indexing is done without considering contig merging. Actual + // predicated domains are determined by considering contiguity. + const ContigIDs contig_finder( + consumer_tv->domain()->domain(), + consumer_tv->getMaybeRFactorDomain(), + std::vector(consumer_tv->getMaybeRFactorDomain().size(), false), + {}); + // Both start and stop positions may need to be predicated. Indexing // differs when generating predicates for unswitch. 
// NOTE: If we could find-and-replace KIR nodes, we could just // generate one index map, clone it and replace the loop-to-index // mappings of unswitched loops for the start predicate. - const auto ref_stop_index_map = getPredicateReferenceIndexing( - loops, reference, unswitch_or_vec_loop, false); - // If not unswitch, share the same indexing map as the stop index map - const auto& ref_start_index_map = unswitch_or_vec_loop != nullptr - ? getPredicateReferenceIndexing( - loops, reference, unswitch_or_vec_loop, true) - : ref_stop_index_map; + auto ref_stop_indexing = getPredicateReferenceIndexing( + loops, reference, unswitch_or_vec_loop, db_axis, false); + const auto consumer_stop_indexing = ref_stop_indexing.updateIndexCompute( + consumer_tv->domain(), + ref_2_consumer, + contig_finder, + reference_halo_extent_map); + const auto& consumer_stop_index_map = consumer_stop_indexing.indexMap(); - auto ref_2_consumer = indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference.concrete_to_id); + // If not unswitch, share the same indexing map as the stop index + // map + const auto& ref_start_indexing = is_unswitch + ? getPredicateReferenceIndexing( + loops, reference, unswitch_or_vec_loop, db_axis, true) + : ref_stop_indexing; + + std::unordered_map consumer_start_index_map; + if (is_unswitch) { + const auto consumer_start_indexing = ref_start_indexing.updateIndexCompute( + consumer_tv->domain(), + ref_2_consumer, + contig_finder, + reference_halo_extent_map); + consumer_start_index_map = consumer_start_indexing.indexMap(); + } else { + consumer_start_index_map = consumer_stop_index_map; + } // Get the contiguous ids we need to generate predicates for auto contig_id_infos = - getPredicateContigIds(reference, consumer_tv, ref_2_consumer); + getPredicateContigIds(consumer_tv, consumer_stop_index_map); auto non_divisible_splits = - getNonDivisibleReferenceDomainsToPredicate(consumer_tv, reference); + getNonDivisibleConsumerDomainsToPredicate(consumer_tv); contig_id_infos.insert( contig_id_infos.end(), non_divisible_splits.begin(), @@ -2972,52 +3192,22 @@ std::pair, ReferenceTensor> Index:: } auto root_ids = contig_id_entry.covered_ids; - auto kir_contig_id = - gpu_lower->lowerValue(contig_id)->as(); - const auto ref_stop_indexing_it = ref_stop_index_map.find(kir_contig_id); + const auto consumer_stop_indexing_it = + consumer_stop_index_map.find(contig_id); - // First condition below is due to broadcasts in consumers of consumer that - // are not in consumer there can be unresolved indexing in the reference - // tensor. This can happen when we have something like: TV3[i1o*i2, i1i] and - // TV1[i2] where tv3 and tv1 share their outer dimension. i1 will be part of - // reference tensors root domain, but when indexing into TV1 there aren't - // enough indices to resolve it. - // - // The condition also happens with Misaligned predicates, where + // First condition below happens with Misaligned predicates, where // inner-most vectorized loops are not included in the loops // parameter. Predicates involving vectorized loops are separately // generated in lower_misaligned_vectorization. // - // It can also happens with rfactored reductions. The reference - // tensor may include rfactored domains, so the contig id may be - // a root domain of the reference, not a rfactor root. Since - // there is no loop for rfactor domains, there's no indexing - // mapping for root domains. This seems safe as it can only happen - // with rfactor and rfactored tensors do not need predicates. 
- // // Second condition is simply to avoid predication on broadcasting axes as // it's not required. - if (ref_stop_indexing_it == ref_stop_index_map.end() || - ref_stop_indexing_it->second->isZeroInt()) { + if (consumer_stop_indexing_it == consumer_stop_index_map.end() || + consumer_stop_indexing_it->second->isZeroInt()) { continue; } - // Find a corresponding consumer root id if exists. Used to - // support shift. If a contig_id is a merged non-root domain, nothing - // is required to do for shift as shift-related domains are - // excluded from contig domains. - IterDomain* consumer_id = nullptr; - if (contig_id->definition() == nullptr || - contig_id_entry.is_non_divisible_split) { - auto it = ref_2_consumer.find(contig_id); - if (it != ref_2_consumer.end()) { - consumer_id = it->second; - } else { - continue; - } - } - RootPredicateInfo info; // Compute offsets for start and stop predicate. For non-shift, @@ -3032,53 +3222,61 @@ std::pair, ReferenceTensor> Index:: // The final predicates will look like: // (index + start_offset) >= 0 && (index + stop_offset) < extent. - std::tie(info.start_offsets_, info.stop_offsets_) = getStartAndStopOffsets( - consumer_id, + std::tie(info.start_offset_, info.stop_offset_) = getStartAndStopOffsets( + contig_id, consumer_tv, reference, - ref_start_index_map, - ref_stop_index_map, + consumer_start_index_map, + consumer_stop_index_map, shift_padding, unswitch_or_vec_loop != nullptr, contig_id_entry.is_non_divisible_split); - auto stop_index = ref_stop_indexing_it->second; - auto start_index = ref_start_index_map.at(kir_contig_id); + auto stop_index = consumer_stop_indexing_it->second; + auto start_index = consumer_start_index_map.at(contig_id); + + std::tie(start_index, stop_index) = hoistPredicates( + start_index, + stop_index, + loops, + unswitch_or_vec_loop, + contig_id, + consumer_tv, + reference.domain, + ref_start_indexing.indexMap(), + ref_stop_indexing.indexMap()); // Build predicates for start positions as: // start_index + start_offset >= 0 - for (auto start_offset : info.start_offsets_) { - if (canOmitStartPredicate(start_offset)) { - info.start_predicates_.push_back(ir_builder.trueVal()); - continue; - } + auto start_offset = simplifyStartOffset(info.start_offset_); + if (start_offset == nullptr) { + info.start_predicate_ = GpuLower::current()->kernel()->trueVal(); + } else { auto offsetted_start_index = - ir_builder.addExpr(start_index, start_offset); - auto pred = - ir_builder.geExpr(offsetted_start_index, ir_builder.zeroVal()) - ->as(); - info.start_predicates_.push_back(pred); + SimplifyingIrBuilder::addExpr(start_index, start_offset); + auto start_pred = + SimplifyingIrBuilder::geExpr( + offsetted_start_index, GpuLower::current()->kernel()->zeroVal()) + ->as(); + info.start_predicate_ = start_pred; } // Build predicates for stop positions as: // stop_index + stop_offset < IterDomain::extent - for (auto stop_offset : info.stop_offsets_) { - if (canOmitStopPredicate(stop_index, stop_offset, kir_contig_id)) { - info.stop_predicates_.push_back(ir_builder.trueVal()); - continue; - } - auto offsetted_stop_index = ir_builder.addExpr(stop_index, stop_offset); - auto pred = - ir_builder.ltExpr(offsetted_stop_index, kir_contig_id->extent()) - ->as(); - info.stop_predicates_.push_back(pred); + auto stop_offset = info.stop_offset_; + if (canOmitStopPredicate(stop_index, stop_offset, contig_id)) { + info.stop_predicate_ = GpuLower::current()->kernel()->trueVal(); + } else { + auto offsetted_stop_index = + SimplifyingIrBuilder::addExpr(stop_index, 
stop_offset); + auto stop_pred = SimplifyingIrBuilder::ltExpr( + offsetted_stop_index, contig_id->extent()) + ->as(); + info.stop_predicate_ = stop_pred; } - // Transform ids from reference to concrete and consumer domains - // (based on loop compute at map) - for (auto ref_id : contig_id_entry.covered_ids) { - info.root_ids_.insert(reference.id_to_concrete.at(ref_id)); - info.consumer_ids_.insert(ref_2_consumer.at(ref_id)); + for (auto consumer_id : contig_id_entry.covered_ids) { + info.root_ids_.insert(consumer_id); } pred_info_vec.emplace_back(info); } @@ -3089,7 +3287,7 @@ std::pair, ReferenceTensor> Index:: bool Index::protectWithMagicZero( kir::ForLoop* loop, IterDomain* reference_domain, - kir::Val* ind) { + Val* ind) { bool ref_dom_simple = (reference_domain == nullptr ? true : reference_domain->definition() != nullptr); @@ -3100,16 +3298,9 @@ bool Index::protectWithMagicZero( } RootPredicateInfo RootPredicateInfo::getFalseInfo() { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - RootPredicateInfo info; - info.start_predicates_.push_back(ir_builder.falseVal()); - info.stop_predicates_.push_back(ir_builder.falseVal()); - // These are just placeholder. When the predicate is false, the - // offset should not be used. - info.start_offsets_.push_back(nullptr); - info.stop_offsets_.push_back(nullptr); + info.start_predicate_ = GpuLower::current()->kernel()->falseVal(); + info.stop_predicate_ = GpuLower::current()->kernel()->falseVal(); return info; } diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h index 83536067c19e..1a88b00fa25c 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.h +++ b/torch/csrc/jit/codegen/cuda/index_compute.h @@ -60,6 +60,8 @@ namespace jit { namespace fuser { namespace cuda { +class ContigIDs; + class IndexCompute : public BackwardVisitor { protected: using BackwardVisitor::handle; @@ -69,30 +71,30 @@ class IndexCompute : public BackwardVisitor { void handle(Expr*) override; // return extent_map_[id] if exists, else return id->extent() - kir::Val* getExtent(kir::IterDomain* id); + Val* getExtent(IterDomain* id) const; //! True if a domain is not used to index - bool isZero(kir::IterDomain* id) const; + bool isZero(IterDomain* id) const; //! True if any dependent of a domain is not used to index - bool hasZeroMerged(kir::IterDomain* id) const; + bool hasZeroMerged(IterDomain* id) const; // Tensor domain we're mapping back to root const TensorDomain* td_; // NOLINT // Map we update as we propagate backward, containing all IDs in the // propagation. Initial indices are mapped with this map at tv->domain() - // and are back propagated to tv->rootDomain(). This index_map_ keeps the + // and are back propagated to tv->getRootDomain(). This index_map_ keeps the // indices at intermediate IterDomain's in that back propagation. - std::unordered_map index_map_; // NOLINT + std::unordered_map index_map_; // NOLINT // Map from IterDomain to their broadcasted extent. If a TV has I0*I1 but its // producer has B0*I1 this map will contain a mapping from the ID{B0*I1} to // the extent I0*I1. Also contains updated extents if we merge in a 0 index. // See zero_merged_in_. 
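Stepping back, every predicate assembled in getReferenceRootPredicates reduces to the same shape, (index + start_offset) >= 0 and (index + stop_offset) < extent, with either half replaced by true when it is proven redundant. A tiny sketch evaluating that form on plain integers (the offsets here are made up, e.g. a padded shift contributing a -1 start offset):

#include <iostream>

// Evaluate the canonical root-domain predicate on plain integers. In the
// lowering these are IR expressions; true is substituted for a half that
// can be statically omitted.
bool rootPredicate(
    int index,
    int extent,
    int start_offset,
    int stop_offset,
    bool omit_start = false,
    bool omit_stop = false) {
  const bool start_ok = omit_start || (index + start_offset >= 0);
  const bool stop_ok = omit_stop || (index + stop_offset < extent);
  return start_ok && stop_ok;
}

int main() {
  std::cout << std::boolalpha;
  // Example: a -1 start offset marks the first element as the padded region.
  for (int i = 0; i < 4; ++i) {
    std::cout << "i=" << i << " in-bounds="
              << rootPredicate(i, /*extent=*/4, /*start_offset=*/-1,
                               /*stop_offset=*/0)
              << "\n";
  }
  return 0;
}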
- std::unordered_map extent_map_; // NOLINT + std::unordered_map extent_map_; // NOLINT // Keeps track of domains that do not contribute to indexing - std::unordered_set zero_domains_; // NOLINT + std::unordered_set zero_domains_; // NOLINT // This set keeps track of IterDomain's that have had a zero index merged into // them. This happens if we do something like tv->axis(0)->split(4) then @@ -100,56 +102,71 @@ class IndexCompute : public BackwardVisitor { // indexing would be (0, i) then when we do the backward computation that zero // and i would attempt to be merged together. We handle indices like these // specially. - std::unordered_set zero_merged_in_; + std::unordered_set zero_merged_in_; // IDs that are a result of contiguous merges - std::unordered_set contig_ids; + std::unordered_set contig_ids_; + + // Map from root to indexed domains + std::unordered_map root_to_indexed_id_; // Mentions if we should propagate an index down a particular IterDomain path // if there's an option - std::unordered_set preferred_paths_; + std::unordered_set preferred_paths_; // Map from IterDomains to halo-extended extents in corresponding // reference tensor - std::unordered_map reference_halo_extent_map_; + std::unordered_map reference_halo_extent_map_; public: - const std::unordered_map& indexMap() const { + const std::unordered_map& indexMap() const { return index_map_; } - const std::unordered_map& extentMap() const { + const std::unordered_map& extentMap() const { return extent_map_; } - const std::unordered_set& zeroDomains() const { + const std::unordered_set& zeroDomains() const { return zero_domains_; } - const std::unordered_set& zeroMergedIn() const { + const std::unordered_set& zeroMergedIn() const { return zero_merged_in_; } + const std::unordered_map& rootToContigID() const { + return root_to_indexed_id_; + } + // Propagate back from _td using initial_index_map IndexCompute( const TensorDomain* _td, - std::unordered_map initial_index_map, - std::unordered_map _extent_map, - std::unordered_set zero_domains, - std::unordered_set _zero_merged_in, - const std::vector& _root_contiguity, - std::unordered_set preferred_paths = {}, - std::unordered_map - reference_halo_extent_map = {}); + std::unordered_map initial_index_map, + std::unordered_map _extent_map, + std::unordered_set zero_domains, + std::unordered_set _zero_merged_in, + std::unordered_set preferred_paths = {}, + std::unordered_map reference_halo_extent_map = {}); + + IndexCompute( + const TensorDomain* _td, + std::unordered_map initial_index_map, + std::unordered_map _extent_map, + std::unordered_set zero_domains, + std::unordered_set _zero_merged_in, + const ContigIDs& contig_finder, + std::unordered_set preferred_paths = {}, + std::unordered_map reference_halo_extent_map = {}); // Updates index_map, extent_map, and zero_merged_in based on id_map and // returns a new IndexCompute ready to be used. 
IndexCompute updateIndexCompute( const TensorDomain* new_td, const std::unordered_map& id_map, - const std::vector& _root_contiguity, - const std::unordered_map& - reference_halo_extent_map = {}); + const ContigIDs& contig_finder, + const std::unordered_map& reference_halo_extent_map = + {}) const; virtual void run(); }; @@ -159,10 +176,10 @@ class IndexSwizzle : public IndexCompute { public: IndexSwizzle( const TensorView* tv, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in); + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in); void run() override; @@ -183,51 +200,45 @@ class RootPredicateInfo { friend class Index; public: - const auto& startPredicates() const { - return start_predicates_; + const auto& startPredicate() const { + return start_predicate_; } - auto& startPredicates() { - return start_predicates_; + auto& startPredicate() { + return start_predicate_; } - const auto& startOffsets() const { - return start_offsets_; + const auto& startOffset() const { + return start_offset_; } - const auto& stopPredicates() const { - return stop_predicates_; + const auto& stopPredicate() const { + return stop_predicate_; } - const auto& stopOffsets() const { - return stop_offsets_; + const auto& stopOffset() const { + return stop_offset_; } const auto& rootIds() const { return root_ids_; } - const auto& consumerIds() const { - return consumer_ids_; - } - //! Return a false RootPredicateInfo, i.e., both start and stop //! predicates are false. static RootPredicateInfo getFalseInfo(); private: - // prdicates for lower end - std::vector start_predicates_; - // prdicates for upper end - std::vector stop_predicates_; - // Offsets of the start predicate - std::vector start_offsets_; - // Offsets of the stop predicate - std::vector stop_offsets_; + // prdicate for lower end + Bool* start_predicate_ = nullptr; + // prdicate for upper end + Bool* stop_predicate_ = nullptr; + // Offset of the start predicate + Val* start_offset_ = nullptr; + // Offset of the stop predicate + Val* stop_offset_ = nullptr; // Track which roots have been handled by the generated predicates std::unordered_set root_ids_; - // Consumer IDs that correspond to root_ids_ - std::unordered_set consumer_ids_; }; // Simple interface for IndexCompute @@ -236,24 +247,24 @@ class RootPredicateInfo { class Index { private: // Producer indexing if it's in shared or local memory - static std::vector getNonGlobalProducerStridedIndices( + static std::vector getNonGlobalProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); // Consumer indexing if it's in shared or local memory - static std::vector getNonGlobalConsumerStridedIndices( + static std::vector getNonGlobalConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); // Producer if it's in global memory - static std::vector getGlobalProducerStridedIndices( + static std::vector getGlobalProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); // Consumer indexing if it's in global memory - static std::vector getGlobalConsumerStridedIndices( + static std::vector getGlobalConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); @@ -276,7 +287,7 @@ class Index { //! root domain of a producer tensor. The size of the returned //! 
vector is guaranteed to be equal to the number of axes of the //! indexing root domain. - static std::vector getProducerStridedIndices( + static std::vector getProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); @@ -285,7 +296,7 @@ class Index { //! root domain of a consumer tensor. The size of the returned //! vector is guaranteed to be equal to the number of axes of the //! indexing root domain. - static std::vector getConsumerStridedIndices( + static std::vector getConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); @@ -313,7 +324,7 @@ class Index { //! vectorized loop. static std::pair, ReferenceTensor> getReferenceRootPredicates( - const kir::TensorView* kir_consumer_tv, + TensorView* consumer_tv, const std::vector& loops, kir::ForLoop* unswitch_or_vec_loop, bool padding_predicate); @@ -328,7 +339,7 @@ class Index { static bool protectWithMagicZero( kir::ForLoop* loop, IterDomain* reference_domain = nullptr, - kir::Val* ind = nullptr); + Val* ind = nullptr); }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp b/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp index fcd0a8937ed8..bdb334ab044a 100644 --- a/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp +++ b/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp @@ -1,11 +1,11 @@ #include +#include #include +#include #include #include #include -#include -#include namespace torch { namespace jit { @@ -41,18 +41,15 @@ IterDomain* IndexReferenceReplay::idCopy(IterDomain* id) { // reduction. All we care about are the transformations, and trying to make // sure we track correctly a replaying with consistent reduction/broadcast // domains is challenging and unnecessary. - auto copied_id = - new IterDomain(id->start(), id->extent(), id->getParallelType()); + auto copied_id = SimplifyingIrBuilder::create( + id->container(), id->start(), id->extent(), id->getParallelType()); replayed_ids_.emplace_back(copied_id); return copied_id; } -IterDomain* IndexReferenceReplay::toFusionID(kir::IterDomain* kir_id) { - return ca_map_.toFusion(kir_id); -} - IterDomain* IndexReferenceReplay::toConcrete(IterDomain* id) { - return ca_map_.getConcreteMappedID(id); + return GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); } void IndexReferenceReplay::handle(Split* split) { @@ -64,13 +61,14 @@ void IndexReferenceReplay::handle(Split* split) { // Don't produce the same values multiple times auto ref_outer = concreteToRefId(toConcrete(split->outer())); auto ref_inner = concreteToRefId(toConcrete(split->inner())); - if (ref_id_produced_.find(ref_outer) != ref_id_consumed_.end() || - ref_id_produced_.find(ref_inner) != ref_id_consumed_.end()) { + if (ref_id_produced_.find(ref_outer) != ref_id_produced_.end() || + ref_id_produced_.find(ref_inner) != ref_id_produced_.end()) { return; } // Replay the provided split operation and add it to the reference DAG - new Split( + SimplifyingIrBuilder::create( + split->container(), ref_outer, ref_inner, ref_in, @@ -96,12 +94,13 @@ void IndexReferenceReplay::handle(Merge* merge) { // Don't produce the same values multiple times auto ref_out = concreteToRefId(toConcrete(merge->out())); - if (ref_id_produced_.find(ref_out) != ref_id_consumed_.end()) { + if (ref_id_produced_.find(ref_out) != ref_id_produced_.end()) { return; } // Replay the provided merge operation and add it to the reference DAG - new Merge(ref_out, ref_outer, ref_inner); + SimplifyingIrBuilder::create( + 
merge->container(), ref_out, ref_outer, ref_inner); // Mark producers and consumers ref_id_consumed_.emplace(ref_outer); @@ -122,6 +121,56 @@ void IndexReferenceReplay::handle(Expr* e) { OptInDispatch::handle(e); } +namespace { + +bool isMappedWithAny(IterDomain* id, const std::vector& ids) { + return std::any_of(ids.begin(), ids.end(), [&](Val* val) { + return val->isA() && + GpuLower::current()->caMap()->areMapped( + id, val->as(), IdMappingMode::PERMISSIVE); + }); +} + +// Get an rfactor IterDomain that is mapped with an IterDomain. If +// multiple such IDs exist, select one whose input IDs are mapped with +// the consumer IDs. This is to ensure the path from the leaf +// IterDomains to the root matches with the consumer tensor. +IterDomain* getRfactorIDToTraverse( + IterDomain* id, + const std::vector& consumer_all_ids) { + const auto& rfactor_ids = + GpuLower::current()->caMap()->getViewRfactorDomainsOfIdGroup( + id, IdMappingMode::PERMISSIVE); + + if (rfactor_ids.empty()) { + return nullptr; + } + + for (auto rfactor_id : rfactor_ids) { + auto def = rfactor_id->definition(); + if (def == nullptr) { + continue; + } + + auto rfactor_id_inputs = ir_utils::filterByType(def->inputs()); + if (std::all_of( + rfactor_id_inputs.begin(), + rfactor_id_inputs.end(), + [&](IterDomain* rfactor_id_input) { + return isMappedWithAny(rfactor_id_input, consumer_all_ids); + })) { + return rfactor_id; + } + } + + // No mapped ID found, which means the consumer is a post-view + // tensor. In that case, it shouldn't matter which view path to + // traverse, so just return the first one. + return rfactor_ids.at(0); +} + +} // namespace + TensorDomain* IndexReferenceReplay::computeReplay() { // Throw an error when two loops are mapped with each other, which // violates an assumption that unique mappings between concrete @@ -139,7 +188,10 @@ TensorDomain* IndexReferenceReplay::computeReplay() { ++it_i) { for (auto it_j = it_i + 1; it_j != loop_structure_.end(); ++it_j) { TORCH_INTERNAL_ASSERT( - !ca_map_.areMapped((*it_i)->iter_domain(), (*it_j)->iter_domain()), + !GpuLower::current()->caMap()->areMapped( + (*it_i)->iter_domain(), + (*it_j)->iter_domain(), + IdMappingMode::EXACT), "Unsupported loop structure. Two loops are mapped together."); } } @@ -149,7 +201,13 @@ TensorDomain* IndexReferenceReplay::computeReplay() { loop_structure_.begin(), loop_structure_.end(), std::back_inserter(domain_ids), - [this](kir::ForLoop* fl) { return toFusionID(fl->iter_domain()); }); + [](kir::ForLoop* fl) { return fl->iter_domain(); }); + + const auto consumer_all_ids = DependencyCheck::getAllValsBetween( + {consumer_tv_->getRootDomain().begin(), + consumer_tv_->getRootDomain().end()}, + {consumer_tv_->domain()->domain().begin(), + consumer_tv_->domain()->domain().end()}); // IterVisitor based traversals don't work because we don't have all outputs. // backward traversal's traverseFrom(domain_ids) will throw "Invalid backward @@ -161,13 +219,21 @@ TensorDomain* IndexReferenceReplay::computeReplay() { // so their broadcast dimensions are "more" resolved than those towards the // inner most loops. 
std::deque to_visit(domain_ids.begin(), domain_ids.end()); - std::unordered_set visited; + std::unordered_set visited_exprs; + std::unordered_set visited_ids; while (!to_visit.empty()) { auto out_id = to_visit.front(); to_visit.pop_front(); + if (!visited_ids.emplace(out_id).second) { + continue; + } auto expr = out_id->definition(); + if (auto rfactor_id = getRfactorIDToTraverse(out_id, consumer_all_ids)) { + to_visit.emplace_front(rfactor_id); + } + // ID's will be copied for the reference as we replay transformations. If // there was no transformations on an iteration domain, a copy of the // iteration domain for the reference is made here. @@ -179,7 +245,7 @@ TensorDomain* IndexReferenceReplay::computeReplay() { continue; } - if (!visited.emplace(expr).second) { + if (!visited_exprs.emplace(expr).second) { continue; } @@ -194,14 +260,14 @@ TensorDomain* IndexReferenceReplay::computeReplay() { // Construct a tensor that's representitive of the replayed loop structure. std::vector loops_replayed_domain; for (auto loop : loop_structure_) { - auto loop_id = toFusionID(loop->iter_domain()); + auto loop_id = loop->iter_domain(); // Map to loops with the loop map, but make sure the replayed id is actually // a leaf in the replay. auto ref_id_it = std::find_if( replayed_ids_.begin(), replayed_ids_.end(), [&](IterDomain* ref_id) { return ref_id->uses().empty() && - GpuLower::current()->caLoopMap().areMapped( - refIdToConcrete(ref_id), loop_id); + GpuLower::current()->caMap()->areMapped( + refIdToConcrete(ref_id), loop_id, IdMappingMode::PERMISSIVE); }); TORCH_INTERNAL_ASSERT( @@ -216,16 +282,16 @@ TensorDomain* IndexReferenceReplay::computeReplay() { ref_id->parallelize(loop_id->getParallelType()); } + TensorDomain* domain = nullptr; // If no domains were replayed to make the reference, just return the root // domain. if (std::none_of( loops_replayed_domain.begin(), loops_replayed_domain.end(), [](IterDomain* id) { return id->definition() != nullptr; })) { - auto domain = new TensorDomain( + domain = SimplifyingIrBuilder::create( // If there was no replay only return a domain with a root domain. loops_replayed_domain); - return domain; } else { // Construct the root domain as the inputs of the replayed domain auto loops_replayed_domain_vals = @@ -257,35 +323,83 @@ TensorDomain* IndexReferenceReplay::computeReplay() { } // Create and return the reference. - auto domain = new TensorDomain( - {root_domain_ids.begin(), root_domain_ids.end()}, + domain = SimplifyingIrBuilder::create( + std::vector( + root_domain_ids.begin(), root_domain_ids.end()), loops_replayed_domain); - return domain; + } + + cleanUpMappingsOfUnusedDomains(domain); + return domain; +} + +void IndexReferenceReplay::cleanUpMappingsOfUnusedDomains( + TensorDomain* ref_domain) { + // The ref-to-concrete and concrete-to-ref maps can have mappings of + // domains that do not end up being used in the final reference + // domain. Drop them as they are not really part of reference + // tensor. 
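The cleanup described above is essentially an erase-if over the two maps, keeping only entries whose reference ID is reachable between the final domain's roots and leaves. A generic sketch of that pruning, independent of the IR types, might be:

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>

// Drop every map entry whose reference ID is not among the IDs actually used
// by the final reference domain.
template <typename Map, typename GetId>
void pruneUnused(
    Map& map, const std::unordered_set<std::string>& used_ids, GetId get_id) {
  for (auto it = map.begin(); it != map.end();) {
    if (used_ids.count(get_id(*it)) == 0) {
      it = map.erase(it);
    } else {
      ++it;
    }
  }
}

int main() {
  const std::unordered_set<std::string> used = {"r0", "r1"};
  std::unordered_map<std::string, std::string> ref_to_concrete = {
      {"r0", "c0"}, {"r1", "c1"}, {"r2", "c2"}};
  std::unordered_map<std::string, std::string> concrete_to_ref = {
      {"c0", "r0"}, {"c1", "r1"}, {"c2", "r2"}};
  // ref-to-concrete is keyed by the reference ID; concrete-to-ref stores it
  // as the value.
  pruneUnused(ref_to_concrete, used, [](const auto& kv) { return kv.first; });
  pruneUnused(concrete_to_ref, used, [](const auto& kv) { return kv.second; });
  std::cout << ref_to_concrete.size() << " " << concrete_to_ref.size() << "\n";
  return 0;
}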
+ + const auto all_vals = DependencyCheck::getAllValsBetween( + {ref_domain->getRootDomain().begin(), ref_domain->getRootDomain().end()}, + {ref_domain->domain().begin(), ref_domain->domain().end()}); + + const std::unordered_set all_id_set( + ir_utils::filterByType(all_vals).begin(), + ir_utils::filterByType(all_vals).end()); + for (auto it = ref_id_to_concrete_.begin(); + it != ref_id_to_concrete_.end();) { + IterDomain* ref_id = it->first; + if (all_id_set.find(ref_id) == all_id_set.end()) { + it = ref_id_to_concrete_.erase(it); + } else { + ++it; + } + } + + for (auto it = concrete_to_ref_id_.begin(); + it != concrete_to_ref_id_.end();) { + IterDomain* ref_id = it->second; + if (all_id_set.find(ref_id) == all_id_set.end()) { + it = concrete_to_ref_id_.erase(it); + } else { + ++it; + } } } IndexCompute getReferenceIndexing( const std::vector& loop_structure, - TensorDomain* reference_tensor) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - + TensorDomain* reference_tensor, + kir::ForLoop* double_buffer_loop) { // Create a simple index mapping from loop iter domains to their local index. // This is only applicable to global memory buffers. - std::unordered_map initial_index_map; + std::unordered_map initial_index_map; TORCH_INTERNAL_ASSERT(loop_structure.size() <= reference_tensor->nDims()); int magic_zero_loop = -1; for (const auto loop_i : c10::irange(loop_structure.size())) { auto ref_axis = reference_tensor->axis(loop_i); - auto kir_ref_axis = gpu_lower->lowerValue(ref_axis)->as(); auto loop = loop_structure[loop_i]; auto ind = loop->index(); - ; - initial_index_map[kir_ref_axis] = ind; - if (loop->vectorize()) { - initial_index_map[kir_ref_axis] = ir_builder.create(0); + // If the loop is trivial, only the start value is used + if (loop->isTrivial()) { + initial_index_map[ref_axis] = loop->start(); + } else { + initial_index_map[ref_axis] = ind; + } + + if (double_buffer_loop == loop) { + TORCH_INTERNAL_ASSERT( + !loop->isTrivial(), "The double buffer loop must be materialized"); + // This version of getReferenceIndexing is only used for + // indexing global tensors. When indexing global producers, the + // index for a double buffered loop needs to be incremented. The + // parameter double_buffer_loop should be nullptr when indexing + // global consumers tensors. 
+ initial_index_map[ref_axis] = SimplifyingIrBuilder::addExpr( + initial_index_map[ref_axis], GpuLower::current()->kernel()->oneVal()); } if (Index::protectWithMagicZero(loop, ref_axis, ind)) { @@ -295,10 +409,9 @@ IndexCompute getReferenceIndexing( // Add magic zero to a fairly inner most index if (magic_zero_loop >= 0) { - auto ref_id = gpu_lower->lowerValue(reference_tensor->axis(magic_zero_loop)) - ->as(); - initial_index_map[ref_id] = ir_builder.addExpr( - initial_index_map[ref_id], ir_builder.magicZeroVal()); + auto ref_id = reference_tensor->axis(magic_zero_loop); + initial_index_map[ref_id] = SimplifyingIrBuilder::addExpr( + initial_index_map[ref_id], FusionGuard::getCurFusion()->magicZeroVal()); } // Send to the other version of reference indexing that directly takes the @@ -310,19 +423,17 @@ IndexCompute getReferenceIndexing( IndexCompute getReferenceIndexing( const std::vector& loop_structure, TensorDomain* reference_tensor, - std::unordered_map index_map, - std::unordered_set zero_domains, + std::unordered_map index_map, + std::unordered_set zero_domains, std::unordered_set preferred_paths, - std::unordered_map halo_extent_map) { - auto gpu_lower = GpuLower::current(); - + std::unordered_map halo_extent_map) { // I thought this might be necesasry, but turns out it's not. I think it's // because of the root ordering above, however leaving it in case we find // out it is necessary in some cases. At the time of commiting, cuda-memcheck // passed without this. // - // std::unordered_map reference_extent_map; for (auto loop : loop_structure) { + // std::unordered_map reference_extent_map; for (auto loop : loop_structure) { // // If there's a broadcast merged in the for loop ID we want to track its // // extent // auto inputs = InputsOf::outputs( @@ -342,15 +453,13 @@ IndexCompute getReferenceIndexing( // } // } - // Convert to preferred_path to kir::IterDomain for IndexCompute - std::unordered_set kir_preferred_path; - std::transform( - preferred_paths.begin(), - preferred_paths.end(), - std::inserter(kir_preferred_path, kir_preferred_path.begin()), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); + // No contig indexing is done in reference indexing + ContigIDs contig_finder( + reference_tensor->domain(), + reference_tensor->getMaybeRFactorDomain(), + std::vector( + reference_tensor->getMaybeRFactorDomain().size(), false), + {}); IndexCompute compute( reference_tensor, @@ -359,9 +468,9 @@ IndexCompute getReferenceIndexing( // in this function {}, zero_domains, - std::unordered_set(), - reference_tensor->contiguity(), - kir_preferred_path, + std::unordered_set(), + contig_finder, + preferred_paths, halo_extent_map); compute.run(); diff --git a/torch/csrc/jit/codegen/cuda/index_reference_replay.h b/torch/csrc/jit/codegen/cuda/index_reference_replay.h index c4626213e76b..144b295faa7e 100644 --- a/torch/csrc/jit/codegen/cuda/index_reference_replay.h +++ b/torch/csrc/jit/codegen/cuda/index_reference_replay.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -17,9 +17,10 @@ namespace cuda { class IndexReferenceReplay : public OptInDispatch { private: - IndexReferenceReplay(const std::vector& loop_structure) - : loop_structure_(loop_structure), - ca_map_(GpuLower::current()->caIndexMap()) {} + IndexReferenceReplay( + const std::vector& loop_structure, + const TensorView* consumer_tv) + : loop_structure_(loop_structure), consumer_tv_(consumer_tv) {} // Generate the replay. 
TensorDomain* computeReplay(); @@ -34,13 +35,13 @@ class IndexReferenceReplay : public OptInDispatch { // Make a new id for the reference replay based on the provided id IterDomain* idCopy(IterDomain* id); - // Use the compute at map to get the fusion IterDomain from the - // kir::IterDomain - IterDomain* toFusionID(kir::IterDomain* kir_id); - // Return the concrete entry of the non-reference id IterDomain* toConcrete(IterDomain* id); + //! Remove mappings of reference IDs that do not end up being used + //! in the final reference domain + void cleanUpMappingsOfUnusedDomains(TensorDomain* reference_domain); + using OptInDispatch::handle; void handle(Split* split) override; @@ -50,9 +51,8 @@ class IndexReferenceReplay : public OptInDispatch { private: // Hold the loop structure we're generating a reference for. const std::vector& loop_structure_; - - // Hold the compute at map used for the replay (index map) - const ComputeAtMap& ca_map_; + // The indexed or predicated consumer tensor + const TensorView* consumer_tv_ = nullptr; // Keep a vector of all iteration domains used in the reference (includes all // transformations) @@ -73,8 +73,9 @@ class IndexReferenceReplay : public OptInDispatch { public: // Generate the reference of the provided loop nest structure static ReferenceTensor getReference( - const std::vector& loop_structure) { - auto replay = IndexReferenceReplay(loop_structure); + const std::vector& loop_structure, + const TensorView* consumer_tv) { + auto replay = IndexReferenceReplay(loop_structure, consumer_tv); ReferenceTensor ref; ref.domain = replay.computeReplay(); ref.concrete_to_id = replay.concrete_to_ref_id_; @@ -87,16 +88,17 @@ class IndexReferenceReplay : public OptInDispatch { IndexCompute getReferenceIndexing( const std::vector& loop_structure, TensorDomain* reference_domain, - std::unordered_map index_map, - std::unordered_set zero_domains, + std::unordered_map index_map, + std::unordered_set zero_domains, std::unordered_set preferred_path, - std::unordered_map halo_extent_map = {}); + std::unordered_map halo_extent_map = {}); // Short cut for global TVs. Index into the reference based on all loop indicies // in the loop structure. IndexCompute getReferenceIndexing( const std::vector& loop_structure, - TensorDomain* reference_domain); + TensorDomain* reference_domain, + kir::ForLoop* double_buffer_loop = nullptr); // When indexing there are sometimes an option to propagate an index down // multiple paths. 
This will return the IterDomains in the history of the diff --git a/torch/csrc/jit/codegen/cuda/instrumentation.cpp b/torch/csrc/jit/codegen/cuda/instrumentation.cpp index 52e16b3a7afe..16b7f33a8e3a 100644 --- a/torch/csrc/jit/codegen/cuda/instrumentation.cpp +++ b/torch/csrc/jit/codegen/cuda/instrumentation.cpp @@ -1,6 +1,6 @@ #include -#include +#include #ifdef _WIN32 #include @@ -32,7 +32,7 @@ Trace::Trace() { logEvent('I', "TRACE_START"); } - if (getenv("PYTORCH_NVFUSER_DISABLE_NVTX")) { + if (isDisabled(DisableOption::Nvtx)) { record_nvtx_range_ = false; } } diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp index bd54d30811dd..b6a1c4ab84da 100644 --- a/torch/csrc/jit/codegen/cuda/interface.cpp +++ b/torch/csrc/jit/codegen/cuda/interface.cpp @@ -15,13 +15,132 @@ C10_DEFINE_bool( C10_DEFINE_bool( torch_jit_nvfuser_horizontal_fusion, true, - "enable single node fusion for nvfuser"); + "enable horizontal fusion for nvfuser"); namespace torch { namespace jit { namespace fuser { namespace cuda { +static std::atomic cuda_fusion_guard_mode{true}; + +// There are 3 sources of information on whether to enable nvfuser: +// 1. assigned value from setEnabled() - takes precendence if it has been set +// 2. value from environment variable - only used if setEnabled() is unset +// 3. default value - used if both 1 and 2 are unset. +// +// If 1 or 2 tries to enable nvfuser when it cannot be enabled (e.g. cuda not +// available), then an error will be thrown. The default will not error. +class NVFuserEnabler { + private: + c10::optional runtime_assigned_fuser_enabled_ = c10::nullopt; + std::once_flag enabled_check_flag_; + std::mutex mutex_; + + static bool nvfuserCanBeEnabled() { +#ifdef USE_ROCM + return false; +#else + return at::globalContext().hasCUDA() && + NVFuserPassManager::isRegistered() && getExecutorMode(); +#endif + } + + static void assertFuserCanBeEnabled(bool is_enabled) { + if (!is_enabled) { + return; + } + TORCH_CHECK( + nvfuserCanBeEnabled(), + "Running CUDA fuser is only supported on CUDA builds."); + } + + static c10::optional getFuserEnabledEnvVar() { + static const char* enable_c_str = std::getenv("PYTORCH_JIT_ENABLE_NVFUSER"); + if (!enable_c_str) { + return c10::nullopt; + } + std::string enable(enable_c_str); + if (enable == "0" || enable == "OFF") { + return false; + } + return true; + } + + static c10::optional getCachedFuserEnabledEnvVar() { + static c10::optional default_enabled = getFuserEnabledEnvVar(); + return default_enabled; + } + + static bool getNNCNotNVFuser() { + static const char* env_c_str = + std::getenv("PYTORCH_JIT_USE_NNC_NOT_NVFUSER"); + if (!env_c_str) { + return false; + } + std::string env(env_c_str); + if (env == "1" || env == "ON") { + return true; + } + return false; + } + + static bool getCachedNNCNotNVFuser() { + static bool force_disable = getNNCNotNVFuser(); + return force_disable; + } + + bool isEnabledImpl() { + // 0. opportunity to force disable NVFuser + if (getCachedNNCNotNVFuser()) { + return false; + } + std::call_once(enabled_check_flag_, [&]() { + // if environment variable is setting the value, we must + if (!runtime_assigned_fuser_enabled_.has_value() && + getCachedFuserEnabledEnvVar().has_value()) { + assertFuserCanBeEnabled(*getCachedFuserEnabledEnvVar()); + } + }); + // 1. if user has explicitly assigned fuser value, that value takes + // precedence. + if (runtime_assigned_fuser_enabled_.has_value()) { + return *runtime_assigned_fuser_enabled_; + } + // 2. 
next precedence is any value assigned by + if (getCachedFuserEnabledEnvVar().has_value()) { + return *getCachedFuserEnabledEnvVar(); + } + // 3. default value (if you switch this to true, make sure + // to check nvfuserCanBeEnabled()) + return false; + } + + public: + bool setEnabled(bool is_enabled) { + std::lock_guard lock(mutex_); + assertFuserCanBeEnabled(is_enabled); + bool old_value = isEnabledImpl(); + runtime_assigned_fuser_enabled_ = is_enabled; + return old_value; + } + + bool isEnabled() { + std::lock_guard lock(mutex_); + return isEnabledImpl(); + } +}; + +static NVFuserEnabler nvfuser_enabler; + +bool isEnabled() { + return nvfuser_enabler.isEnabled(); +} + +bool setEnabled(bool is_enabled) { + return nvfuser_enabler.setEnabled(is_enabled); +} + bool getSingletonFusion() { return FLAGS_torch_jit_nvfuser_singleton_fusion; } @@ -42,8 +161,6 @@ bool setHorizontalFusion(bool value) { return old_value; } -static std::atomic cuda_fusion_guard_mode{true}; - std::atomic& getCudaFusionGuardMode() { return cuda_fusion_guard_mode; } @@ -68,6 +185,10 @@ void runFusionGroup(const Node* fusion_node, Stack& stack) { } void fuseGraph(std::shared_ptr& graph) { + if (!isEnabled()) { + return; + } + TORCH_CHECK( getFuserInterface()->fn_fuse_graph != nullptr, "Running the CUDA fuser requires a CUDA build."); @@ -90,6 +211,11 @@ bool profileNode(const Node* node) { getFuserInterface()->fn_profile_n(node); } +bool skipNode(const std::string& symbol_str, bool flip) { + return getFuserInterface()->fn_skip_n != nullptr && + getFuserInterface()->fn_skip_n(symbol_str, flip); +} + //! [ Note -- type guard logic in CudaFusionGuard ] //! //! CudaFusionGuard is used to Guard input tensor to `CudaFusionGroup` so that @@ -117,11 +243,15 @@ bool profileNode(const Node* node) { //! extra attention should be paid to contiguity across size-1 //! dimensions. //! c. size check: +//! c.1 broadcast check: //! making sure that broadcast semantics are identical. So we want to //! make sure a given dimension either are both size-1 for `tensor` & //! `guard_tensor_type`, or are both non-size-1. //! This is due to the fact that we specialize size-1 dimension as //! broadcasted dimension while translating PyTorch tensor to Fusion IR. +//! c.1 size-0 check: +//! we don't specialize this on codegen, but we do specialize fusion +//! logic for size-0 on reductoins, hence the check //! bool complyWith( const at::Tensor& tensor, @@ -133,13 +263,19 @@ bool complyWith( // check a. if num_dimension check fails or scalar type check fails if (*guard_tensor_type->dim() != static_cast(tensor.ndimension()) || (guard_tensor_type->scalarType().has_value() && - (guard_tensor_type->scalarType().value() != tensor.scalar_type()))) { + (guard_tensor_type->scalarType().value() != tensor.scalar_type())) || + (guard_tensor_type->device().has_value() && + (guard_tensor_type->device().value() != tensor.device())) || + (guard_tensor_type->requiresGrad().has_value() && + guard_tensor_type->requiresGrad().value() != + (tensor.requires_grad() && at::GradMode::is_enabled()))) { return false; } // TODO: should we get symbolic_size instead and check for size // consistency across tensors as well? 
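+  // Editor's illustrative note (not part of the original change): a guard
+  // recorded for a 3-D float CUDA tensor with requires_grad == false now also
+  // rejects inputs on a different device and, while grad mode is enabled,
+  // inputs whose requires_grad flag differs, in addition to the existing rank
+  // and dtype checks.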
const auto& sizes = guard_tensor_type->sizes(); + // see [ Note -- stirde_properties in tensor type ] const auto& stride_properties = guard_tensor_type->stride_properties(); const auto& t_sizes = tensor.sizes(); @@ -207,12 +343,18 @@ bool complyWith( } } - // check c, we go along semantic ordered dimensions + // check c.1, we go along semantic ordered dimensions // check broadcast / size-1: bool guard_bcast = sizes[j].has_value() && sizes[j].value() == 1; if (guard_bcast != (t_sizes[j] == 1)) { return false; } + + // check c.2, check for size-0 + bool guard_size_0 = sizes[j].has_value() && sizes[j].value() == 0; + if (guard_size_0 != (t_sizes[j] == 0)) { + return false; + } } return true; @@ -329,6 +471,238 @@ RegisterOperators reg_guard({ aliasAnalysisFromSchema()), }); +// Infer dynamic axis (-1) in view_sizes given tensor_sizes +bool inferViewShape( + c10::List tensor_sizes, + c10::List view_sizes) { + int64_t dynamic_index = -1; + size_t view_size_num_elements = 1; + for (size_t idx = 0; idx < view_sizes.size(); ++idx) { + if (view_sizes[idx] == -1) { + TORCH_INTERNAL_ASSERT( + dynamic_index == -1, "Only one dimension can by inferred.") + dynamic_index = idx; + } else { + TORCH_INTERNAL_ASSERT(view_sizes[idx] > 0); + view_size_num_elements *= view_sizes[idx]; + } + } + const size_t kNumElements = std::accumulate( + tensor_sizes.begin(), tensor_sizes.end(), 1, std::multiplies<>()); + + if (kNumElements % view_size_num_elements != 0) { + return false; + } + + if (dynamic_index != -1) { + view_sizes[dynamic_index] = kNumElements / view_size_num_elements; + } + + return true; +} + +//! [ Note -- type guard logic in CudaFusionViewGuard ] +//! +//! CudaFusionViewGuard is used to guard input tensors to a `CudaFusionGroup` +//! that contains view operations, so that we would not feed inputs that +//! violate the graph defined in `GraphCache`. +//! +//! output = view(self, view-sizes) +//! +//! View Guard Inputs: +//! 1. self tensor_sizes - dynamic size List[Int] +//! 2. view_sizes - profile_ivalue List[Int] +//! 3. tensor_constraint - Constant List[Int] +//! 4. view_sizes_constraint - Constant List[Int] +//! +//! Things that we check: +//! 1. The #dimensions are the same for self tensor and its constraint +//! 2. The #dimensions are the same for view-sizes and its constraint +//! 3. Self tensor does not violate its constraint +//! a. Queue unrestricted sizes +//! b. Calculate #elements in self tensor +//! 4. view-sizes does not violate its constraint +//! a. Pop unrestricted sizes from queue +//! b. Calculate #elements in view-sizes +//! 5. The #elements is the same for self tensor and view-sizes +//! +//! Constraints: +//! A restricted axis creates a graph constraint, so its sizes is static. +//! An unrestricted axis is allowed to have a dynamic size, if it is consistent +//! between self tensor and view-sizes. It is marked with -1 in the constraint. +//! Only iterDomains with the Keep transform are dynamic. All other transforms +//! create a static constraint. +//! +bool checkViewGuard( + c10::List tensor_sizes, + c10::List view_sizes, + c10::List tensor_constraint, + c10::List view_sizes_constraint) { + // 1: Num Dimensions Check + if (tensor_constraint.size() != tensor_sizes.size() || + view_sizes_constraint.size() != view_sizes.size()) { + return false; + } + + // If axis allows dynamic sizes, then add tensor size to this queue. + // For dynamic axes in view_sizes, check that it is consistent with + // the corresponding tensor size. + std::queue dynamic_axis_queue; + + // 2. 
Tensor Static Check + int64_t tensor_size_product = 1; + for (const auto idx : c10::irange(tensor_sizes.size())) { + if (tensor_constraint[idx] == -1) { + dynamic_axis_queue.push(tensor_sizes[idx]); + } else if (tensor_constraint[idx] != tensor_sizes[idx]) { + return false; + } + tensor_size_product *= tensor_sizes[idx]; + } + + // 3. View-Sizes Static Check + int64_t view_size_product = 1; + for (const auto idx : c10::irange(view_sizes.size())) { + auto dynamic_size = (view_sizes_constraint[idx] == -1) + ? dynamic_axis_queue.front() + : view_sizes_constraint[idx]; + if (dynamic_size != view_sizes[idx]) { + return false; + } + view_size_product *= dynamic_size; + if (view_sizes_constraint[idx] == -1) { + dynamic_axis_queue.pop(); + } + } + + // 4. Check view invariant + // The number of elements in the input and output tensors are the same. + return tensor_size_product == view_size_product; +} + +//! +//! CudaFusionViewGuard Example Graph: +//! +//! graph(%self : __torch__.BiasViewRelu, +//! %inputs.1 : Tensor): +//! %2 : int = prim::Constant[value=-1]() # dynamic_bvg.py:50:40 +//! %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25 +//! %4 : NoneType = prim::Constant() +//! %5 : int[] = prim::Constant[value=[2, 3]]() +//! %6 : int[] = aten::size(%inputs.1) # dynamic_bvg.py:50:25 +//! %7 : int[] = aten::slice(%6, %4, %2, %3) # dynamic_bvg.py:50:25 +//! %view_shape.1 : int[] = aten::add(%7, %5) # dynamic_bvg.py:50:25 +//! %bias : Tensor = prim::GetAttr[name="bias"](%self) +//! %10 : int[] = aten::size(%bias) +//! %11 : int[] = prim::BroadcastSizes(%6, %10) +//! %12 : bool = prim::CudaFusionGuard[types=[...]](%inputs.1, %bias) +//! %13 : int[] = prim::Constant[value=[-1, -1, -1, 6]]() +//! %14 : int[] = prim::Constant[value=[-1, -1, -1, 2, 3]]() +//! %15 : bool = prim::CudaFusionViewGuard(%11, %view_shape.1, %13, %14) +//! %16 : bool[] = prim::ListConstruct(%15, %12) +//! %17 : bool = aten::all(%16) +//! %18 : Tensor = prim::If(%17) +//! block0(): +//! %19 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%inputs.1, %bias) +//! -> (%19) +//! block1(): +//! %20 : Function = prim::Constant[name="fallback_fn", fallback=1]() +//! %21 : (...) = prim::CallFunction(%20, %inputs.1, %bias, %view_shape.1) +//! %22 : Float(...) = prim::TupleUnpack(%21) +//! -> (%22) +//! return (%18) +//! with prim::CudaFusionGroup_0 = graph(%0 : Float(...), +//! %1 : Float(...)): +//! %2 : int[] = prim::Constant[value=[2, 3, 4, 2, 3]]() +//! %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25 +//! %o.1 : Float(...) = aten::add(%0, %1, %3) # dynamic_bvg.py:51:16 +//! %5 : Float(...) = prim::view_copy(%o.1, %2) +//! %6 : Float(...) = aten::relu(%5) # dynamic_bvg.py:53:19 +//! return (%6) +//! +RegisterOperators view_guard({ + Operator( + "prim::CudaFusionViewGuard(...) -> bool", + // prim::CudaFusionViewGuard returns a fresh Boolean type without + // aliasing. if we would ever return refined tensor, which would change + // aliasing analysis, we should update aliasdb pass. 
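+        // Editor's illustrative example (not part of the original change):
+        // tensor_sizes = [8, 4, 6] with tensor_constraint = [-1, -1, 6] and
+        // view_sizes = [8, 4, 2, 3] with view_sizes_constraint = [-1, -1, 2, 3]
+        // pass the guard: the dynamic sizes 8 and 4 are queued from the tensor
+        // and matched in order against the view, the static sizes equal their
+        // constraints, and 8 * 4 * 6 == 8 * 4 * 2 * 3.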
+ [](const Node* node) -> Operation { + return [](Stack& stack) { + // view_sizes_constraint - Constant List[Int] + at::ArrayRef inputs = last(stack, 4); + + // tensor_sizes is the runtime size for the self tensor + // tensor_sizes - dynamic size List[Int] + TORCH_INTERNAL_ASSERT( + inputs[0].isIntList(), "tensor_sizes needs to be Int List"); + auto tensor_sizes = inputs[0].toIntList(); + + // profiled_view_sizes is the runtime view size + // profiled_view_sizes - profile_ivalue List[Int] + TORCH_INTERNAL_ASSERT( + inputs[1].isIntList(), + "profiled_view_sizes needs to be Int list"); + auto profiled_view_sizes = inputs[1].toIntList(); + + // tensor_constraint is a constant List[Int] + // used to guard tensor_sizes + TORCH_INTERNAL_ASSERT( + inputs[2].isIntList(), + "tensor constraint needs to be Int List"); + auto tensor_constraint = inputs[2].toIntList(); + + // view_sizes_constraint is a constant List[Int] + // used to guard profiled_view_sizes + TORCH_INTERNAL_ASSERT( + inputs[3].isIntList(), + "view_sizes constraint needs to be Int List"); + auto view_sizes_constraint = inputs[3].toIntList(); + + // Drop after gather all input arguments + // If an argument is moved, it is destroyed when dropped from stack + drop(stack, 4); + + auto status = inferViewShape(tensor_sizes, profiled_view_sizes); + if (!status) { + push(stack, IValue(false)); + return; + } + + if (!fuser::cuda::getCudaFusionGuardMode()) { + push(stack, IValue(true)); + return; + } + + auto guard_status = checkViewGuard( + tensor_sizes, + profiled_view_sizes, + tensor_constraint, + view_sizes_constraint); + push(stack, IValue(guard_status)); + return; + }; + }, + aliasAnalysisFromSchema()), +}); + +RegisterOperators ivalue_guard({ + Operator( + "prim::CudaFusionIvalGuard(...) -> bool", + [](const Node* node) -> Operation { + return [](Stack& stack) { + at::ArrayRef inputs = last(stack, 2); + drop(stack, 2); + if (!fuser::cuda::getCudaFusionGuardMode()) { + push(stack, IValue(true)); + return; + } + push(stack, inputs[0].equals(inputs[1])); + return; + }; + }, + aliasAnalysisFromSchema()), +}); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) RegisterOperators reg_add_optional({ Operator( @@ -346,6 +720,181 @@ RegisterOperators reg_add_optional({ }, aliasAnalysisFromSchema()), }); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_view_copy({ + Operator( + "prim::view_copy(Tensor self, int[] size) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "view_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, size; + pop(stack, self, size); + push(stack, at::native::view(self.toTensor(), size.toIntVector())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_flatten_copy({ + Operator( + "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "flatten_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, start_dim, end_dim; + pop(stack, self, start_dim, end_dim); + push( + stack, + at::native::flatten( + self.toTensor(), start_dim.toInt(), end_dim.toInt())); + }; + }, + 
aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_reshape_copy({ + Operator( + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "reshape_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, shape; + pop(stack, self, shape); + push( + stack, + at::native::reshape(self.toTensor(), shape.toIntVector())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_squeeze_copy({ + Operator( + "prim::squeeze_copy(Tensor self) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "squeeze_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self; + pop(stack, self); + push(stack, at::squeeze(self.toTensor())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_squeeze_dim_copy({ + Operator( + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "squeeze_dim_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, dim; + pop(stack, self, dim); + push(stack, at::squeeze(self.toTensor(), dim.toInt())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_unsqueeze_copy({ + Operator( + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "unsqueeze_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, dim; + pop(stack, self, dim); + push(stack, at::unsqueeze(self.toTensor(), dim.toInt())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_unsqueeze_size({ + Operator( + "prim::infer_unsqueeze_size(int[] a, int dim) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto dim = pop(stack).toInt(); + auto size = pop(stack).toIntVector(); + if (dim < 0) { + dim = dim + 1 + size.size(); + } + auto it = size.begin() + dim; + size.insert(it, 1); + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_squeeze_dim_size({ + Operator( + "prim::infer_squeeze_size.dim(int[] a, int dim) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto dim = pop(stack).toInt(); + auto size = pop(stack).toIntVector(); + if (dim < 0) { + dim = dim + size.size(); + } + auto it = size.begin() + dim; + if (*it == 1) { + size.erase(it); + } + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_squeeze_size({ + Operator( + 
"prim::infer_squeeze_size(int[] a) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto size = pop(stack).toIntVector(); + + for (auto it = size.begin(); it != size.end(); it++) { + if (*it == 1) { + auto pre = it - 1; + size.erase(it); + it = pre; + } + } + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + } // namespace } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/interface.h b/torch/csrc/jit/codegen/cuda/interface.h index 1ab9e6d80086..61daad880c4c 100644 --- a/torch/csrc/jit/codegen/cuda/interface.h +++ b/torch/csrc/jit/codegen/cuda/interface.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include /* @@ -19,10 +20,10 @@ namespace cuda { TORCH_API std::atomic& getCudaFusionGuardMode(); -C10_EXPORT bool getSingletonFusion(); -C10_EXPORT bool setSingletonFusion(bool value); -C10_EXPORT bool getHorizontalFusion(); -C10_EXPORT bool setHorizontalFusion(bool value); +TORCH_API bool getSingletonFusion(); +TORCH_API bool setSingletonFusion(bool value); +TORCH_API bool getHorizontalFusion(); +TORCH_API bool setHorizontalFusion(bool value); // dummy struct to allow API registration struct CudaFuserInterface { @@ -32,22 +33,44 @@ struct CudaFuserInterface { bool (*fn_can_fuse_n)(const Node*) = nullptr; void (*fn_insert_profile_inodes)(ProfilingRecord* pr) = nullptr; bool (*fn_profile_n)(const Node*) = nullptr; + bool (*fn_skip_n)(const std::string&, bool flip) = nullptr; }; // Get interface, this is used by registration and user facing API internally -C10_EXPORT CudaFuserInterface* getFuserInterface(); +TORCH_API CudaFuserInterface* getFuserInterface(); -C10_EXPORT void compileFusionGroup(Node* fusion_node); -C10_EXPORT void runFusionGroup(const Node* fusion_node, Stack& stack); -C10_EXPORT void fuseGraph(std::shared_ptr&); -C10_EXPORT bool canFuseNode(const Node* node); -C10_EXPORT void InsertProfileNodesForCUDAFuser(ProfilingRecord* pr); -C10_EXPORT bool profileNode(const Node* node); +TORCH_API void compileFusionGroup(Node* fusion_node); +TORCH_API void runFusionGroup(const Node* fusion_node, Stack& stack); +TORCH_API void fuseGraph(std::shared_ptr&); +TORCH_API bool canFuseNode(const Node* node); +TORCH_API void InsertProfileNodesForCUDAFuser(ProfilingRecord* pr); +TORCH_API bool profileNode(const Node* node); -C10_EXPORT bool complyWith( +TORCH_API bool skipNode(const std::string& symbol_str, bool flip = true); + +TORCH_API bool complyWith( const at::Tensor& tensor, const c10::TensorTypePtr& guard_tensor_type); +TORCH_API bool isEnabled(); +TORCH_API bool setEnabled(bool is_enabled); + +struct TORCH_API NVFuserPassManager : public PassManager { + static bool registerPass(bool enabled) { + bool old_value = PassManager::isRegistered(); + if (enabled) { + PassManager::registerPass(fuseGraph); + } else { + PassManager::clearPass(); + } + return old_value; + } + + static bool isRegistered() { + return PassManager::isRegistered(); + } +}; + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp index cf3d9c7a8c75..0d67f780886b 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp @@ -1,8 +1,12 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include @@ -20,16 +24,20 @@ namespace jit { namespace fuser { namespace cuda { +Statement::Statement(IrBuilderPasskey passkey) { + ir_container_ = 
passkey.ir_container_; +} + Statement::Statement(const Statement* src, IrCloner* ir_cloner) { - // IRCloner when cloning to a new fusion will copy the names of the original - // fusion. If we're cloning into the same fusion, we let Val and Expr get - // their names as usual by registering with the current fusion in their - // constructors, so don't overwrite that here. - if (src->fusion() != ir_cloner->fusion()) { - name_ = src->name_; - } - fusion_ = ir_cloner->fusion(); - ir_cloner->registerClone(src, this); + ir_container_ = ir_cloner->container(); +} + +void Statement::setName(IrContainerPasskey, StmtNameType name) { + name_ = name; +} + +void Statement::setName(IrBuilderPasskey, StmtNameType name) { + name_ = name; } Val* Statement::asVal() { @@ -42,24 +50,37 @@ Expr* Statement::asExpr() { return this->as(); } -void Statement::print() const { - IrPrinter ir_printer(std::cout); +std::string Statement::toString() const { + std::stringstream ss; + IrPrinter ir_printer(ss); ir_printer.handle(this); - std::cout << std::endl; + return ss.str(); } -// When we create a Val we immediately register them with the active fusion. -Val::Val(ValType _vtype, DataType _dtype, bool register_val) - : vtype_(_vtype), dtype_(_dtype) { - Fusion* fusion = FusionGuard::getCurFusion(); - TORCH_CHECK( - fusion != nullptr, "No active fusion group found when creating a Val."); - fusion_ = fusion; - if (register_val) { - name_ = fusion_->registerVal(this); - } +std::string Statement::toInlineString() const { + std::stringstream ss; + IrPrinter ir_printer(ss); + ir_printer.print_inline(this); + return ss.str(); +} + +Fusion* Statement::fusion() const { + TORCH_INTERNAL_ASSERT( + ir_container_->isA(), "Statement does not belong to a fusion."); + return ir_container_->as(); } +kir::Kernel* Statement::kernel() const { + TORCH_INTERNAL_ASSERT( + ir_container_->isA(), + "Statement does not belong to a kernel."); + return ir_container_->as(); +} + +// When we create a Val we immediately register them with the active fusion. +Val::Val(IrBuilderPasskey passkey, ValType _vtype, DataType _dtype) + : Statement(passkey), vtype_(_vtype), dtype_(_dtype) {} + // NOTE: we don't clone the definition_ and uses_ here // since they may introduce cloning cycles. Instead, we copy // the original pointers and we'll fix them up later part of the @@ -67,16 +88,7 @@ Val::Val(ValType _vtype, DataType _dtype, bool register_val) // this constructor now leaving them to be resolved by later stages // Val::Val(const Val* src, IrCloner* ir_cloner) - : Statement(src, ir_cloner), - vtype_(src->vtype_), - dtype_(src->dtype_), - is_fusion_input_(src->is_fusion_input_), - is_fusion_output_(src->is_fusion_output_) { - // If we're "cloning" into the same fusion, register with the fusion - if (src->fusion() == ir_cloner->fusion()) { - name_ = src->fusion()->registerVal(this); - } -} + : Statement(src, ir_cloner), vtype_(src->vtype_), dtype_(src->dtype_) {} const std::vector& Val::uses() const { if (vtype_ == ValType::TensorView) { @@ -87,38 +99,59 @@ const std::vector& Val::uses() const { return uses_; } +// Converts the data type of TensorView or Scalar representing index +// values. The data type of the original input should be +// DataType::Index, but DataType::Int is also allowed as it is used +// for index expressions. +void Val::resolveIndexDtype() { + TORCH_INTERNAL_ASSERT( + vtype_ == ValType::TensorView || vtype_ == ValType::Scalar, + "Resolving index type is currently only supported on tensor view or scalar values. 
" + "Value type: ", + vtype_); + TORCH_INTERNAL_ASSERT( + dtype_ == DataType::Index || dtype_ == DataType::Int, + "Can only resolve index type if a Val has an Index or Int DataType. ", + "Data type: ", + dtype_); + TORCH_INTERNAL_ASSERT( + container()->isA(), + "Index type can only be resolved at compile time."); + dtype_ = container()->as()->indexType(); +} + namespace { // Traverse definition of all values involved in constructing the provided val. // Check if all values involved are constant values, meaning the provided // val is also a constant value. -class ConstCheck : OptOutConstDispatch { +class ConstCheck : private OptOutConstDispatch { private: bool is_const_ = true; - void handle(const Bool* b) override { + void handle(const Bool* b) final { is_const_ = is_const_ && b->isConst(); } - void handle(const Double* d) override { + void handle(const Double* d) final { is_const_ = is_const_ && d->isConst(); } - void handle(const Int* i) override { + void handle(const Int* i) final { is_const_ = is_const_ && i->isConst(); } - void handle(const NamedScalar* ns) override { + void handle(const NamedScalar* ns) final { is_const_ = is_const_ && false; } - void handle(const Expr* expr) override { + void handle(const Expr* expr) final { for (auto inp : expr->inputs()) { handle(inp); } } - void handle(const Val* val) override { + void handle(const Val* val) final { if (val->definition() != nullptr) { handle(val->definition()); } else { @@ -137,15 +170,18 @@ class ConstCheck : OptOutConstDispatch { } // namespace bool Val::isConstScalar() const { - if (!isScalar()) + if (!isScalar()) { return false; + } return ConstCheck::isConst(this); } c10::optional Val::getInt() const { if (isConstScalar() && isAnInt()) { if (this->getValType() == ValType::Scalar) { - return this->as()->value(); + if (this->isA()) { + return this->as()->value(); + } } } return c10::optional(); @@ -161,6 +197,16 @@ bool Val::isOneInt() const { return int_val.has_value() && int_val.value() == 1; } +bool Val::isDefinitionType(ExprType expression_type) const { + if (definition() != nullptr) { + auto def_expr_type = definition()->getExprType(); + if (def_expr_type.has_value() && def_expr_type.value() == expression_type) { + return true; + } + } + return false; +} + c10::optional Val::getDataType() const { TORCH_INTERNAL_ASSERT( dtype_ != DataType::Null, "Value does not have a data type."); @@ -169,7 +215,7 @@ c10::optional Val::getDataType() const { bool Val::isProducerOf(const Val* other) const { TORCH_INTERNAL_ASSERT(other != nullptr); - TORCH_INTERNAL_ASSERT(fusion() == other->fusion()); + TORCH_INTERNAL_ASSERT(container() == other->container()); if (definition() == nullptr) { return false; @@ -186,23 +232,14 @@ bool Val::isConsumerOf(const Val* other) const { // We don't register with the active fusion in Expr as this needs to be done // after inputs and outputs are registered with the Expr -Expr::Expr(ExprType type) : type_{type} { - Fusion* fusion = FusionGuard::getCurFusion(); - if (fusion == nullptr) - TORCH_CHECK(false, "No active fusion group found when creating an Expr."); - fusion_ = fusion; -} +Expr::Expr(IrBuilderPasskey passkey, ExprType etype) + : Statement(passkey), etype_{etype} {} Expr::Expr(const Expr* src, IrCloner* ir_cloner) : Statement(src, ir_cloner), - type_(src->type_), + etype_(src->etype_), inputs_(ir_cloner->clone(src->inputs_)), - outputs_(ir_cloner->clone(src->outputs_)) { - // If we're "cloning" into the same fusion, register with the fusion - if (src->fusion() == ir_cloner->fusion()) { - name_ = 
src->fusion()->registerExpr(this); - } -} + outputs_(ir_cloner->clone(src->outputs_)) {} bool Expr::sameAs(const Statement* other) const { if (this == other) { @@ -227,6 +264,30 @@ bool Expr::sameAs(const Statement* other) const { return true; } +kir::Predicate* Expr::predicate() const { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + return predicate_; +} + +void Expr::setPredicate(kir::Predicate* predicate) { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + predicate_ = predicate; +} + +kir::Predicate* Expr::writePredicate() const { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + return write_predicate_; +} + +void Expr::setWritePredicate(kir::Predicate* write_predicate) { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + write_predicate_ = write_predicate; +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h index 2e0fa0885bd6..70f0b8f80fe5 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include #include -#include #include #include @@ -35,6 +35,8 @@ namespace jit { namespace fuser { namespace cuda { +using ValueId = int32_t; + using StmtNameType = unsigned int; constexpr StmtNameType kInvalidStmName = @@ -48,6 +50,22 @@ class UnaryOp; class BinaryOp; class IterDomain; class IrCloner; +class IrContainer; +class IrBuilderPasskey; +class IrContainerPasskey; + +namespace kir { +class Kernel; +class Predicate; +} // namespace kir + +// Passkey for container to register names with statements +class ExprPasskey { + friend class Expr; + + private: + explicit ExprPasskey() {} +}; TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept; @@ -60,12 +78,12 @@ TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept; //! is also important for the design to have a dispatch system for a Statment. //! Basically beinng able to succienctly traverse down the inhereitance stack of //! a Statment at runtime. This is currently implemented in dispatch.h -//! class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { friend void swap(Fusion&, Fusion&) noexcept; + friend void swap(IrContainer& a, IrContainer& b) noexcept; public: - Statement() = default; + Statement() = delete; // Cloning constructor Statement(const Statement* src, IrCloner* ir_cloner); @@ -78,7 +96,7 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { static void constDispatch(T handler, const Statement* const); template - static Statement* mutatorDispatch(T mutator, Statement*); + static void mutatorDispatch(T mutator, Statement*); // Accessor functions to types. 
Vals always have a DataType, Exprs never do virtual c10::optional getValType() const { @@ -106,8 +124,14 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { Expr* asExpr(); // Return the fusion this statement belongs to - Fusion* fusion() const { - return fusion_; + Fusion* fusion() const; + + // Return the kernel this statement belongs to + kir::Kernel* kernel() const; + + // Return the container this statement belongs to + IrContainer* container() const { + return ir_container_; } // Return the int that represents its name @@ -115,6 +139,13 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { return name_; } + // Set the statements' name. Typically the container will set the name, + // however if we're dealing with cloning, IrBuilder will set the name, this + // maybe should be from IrCloner, however I didn't want to add another + // passkey. + void setName(IrContainerPasskey, StmtNameType name); + void setName(IrBuilderPasskey, StmtNameType name); + virtual bool sameType(const Statement* const other) { if (isVal() && other->isVal()) return getValType().value() == other->getValType().value(); @@ -129,13 +160,17 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { return this == other; } - void print() const; + std::string toString() const; + std::string toInlineString() const; protected: + Statement(IrBuilderPasskey); + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) StmtNameType name_ = kInvalidStmName; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) - Fusion* fusion_ = nullptr; + IrContainer* ir_container_ = nullptr; }; //! A Val represents a "value." These are objects, like tensors, scalars, and @@ -169,34 +204,43 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { //! class TORCH_CUDA_CU_API Val : public Statement { public: - // We may not want to register this value during Val's constructor. The reason - // for this is that if we register the val, then in a derived constructor try - // to throw, fusion's destructor will get called, but the pointer to this Val - // will be invalid. When fusion tries to delete this value it will cause a seg - // fault, instead of showing the thrown error. explicit Val( + IrBuilderPasskey, ValType _vtype, - DataType _dtype = DataType::Null, - bool register_val = true); + DataType _dtype = DataType::Null); Val(const Val* src, IrCloner* ir_cloner); - // TODO: why is this optional? - // + // Dispatch functions, definitions in dispatch.cpp + template + static void dispatch(T handler, Val*); + + template + static void constDispatch(T handler, const Val* const); + + template + static void mutatorDispatch(T mutator, Val*); + c10::optional getValType() const override { return vtype_; } + ValType vtype() const { + return vtype_; + } + + DataType dtype() const { + return dtype_; + } + // Throws if no DataType is found. Vals must have a DataType - // - // TODO: why is this optional? - // c10::optional getDataType() const override; bool isScalar() const { return vtype_ == ValType::Scalar || vtype_ == ValType::NamedScalar; } + // Returns if all dependencies are constant scalars bool isConstScalar() const; bool isAnInt() const { @@ -205,6 +249,11 @@ class TORCH_CUDA_CU_API Val : public Statement { c10::optional getInt() const; + // Returns if no dependencies and is a constant scalar. 
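+  // (Editor's note: contrast with isConstScalar() above, which also returns
+  // true for a Val that has a definition as long as every dependency folds to
+  // a constant; isConst() requires this Val itself to be a literal constant.)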
+ virtual bool isConst() const { + return false; + } + bool isZeroInt() const; bool isOneInt() const; @@ -217,6 +266,9 @@ class TORCH_CUDA_CU_API Val : public Statement { return definition_; } + // Determine if value definition matches given expression type + bool isDefinitionType(ExprType expression_type) const; + const std::vector& uses() const; bool isFusionInput() const { @@ -254,42 +306,41 @@ class TORCH_CUDA_CU_API Val : public Statement { return evaluator_index_; } - // Dispatch functions, definitions in dispatch.cpp - template - static void dispatch(T handler, Val*); - - template - static void constDispatch(T handler, const Val* const); + // Following is managed by Fusion (or kirIrBuilder) and can change. + // TODO: Protect with a passkey. + void setDefinition(Expr* expr) { + definition_ = expr; + } - template - static Statement* mutatorDispatch(T mutator, Val*); + void resolveIndexDtype(); protected: friend Fusion; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) const ValType vtype_; - // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) - const DataType dtype_; - - // Following is managed by Fusion and can change. - void setDefinition(Expr* expr) { - definition_ = expr; - } + // TODO: Add fusion passkey for this void setIsFusionInput(bool is_fusion_input) { is_fusion_input_ = is_fusion_input; } + // TODO: Add fusion passkey for this void setIsFusionOutput(bool is_fusion_output) { is_fusion_output_ = is_fusion_output; } + // TODO: Add fusion or container passkey for this void setUses(const std::vector& uses) { uses_ = uses; } private: + // There's only one instance where dtype can change, and that's through + // resolving the index data type from nvfuser to either Int or Int32 for + // welford operations. + DataType dtype_; + // Following is managed by Fusion and can change. bool is_fusion_input_ = false; bool is_fusion_output_ = false; @@ -297,6 +348,7 @@ class TORCH_CUDA_CU_API Val : public Statement { Expr* definition_ = nullptr; std::vector uses_; + // Expr evaluator idx; int evaluator_index_ = -1; }; @@ -342,15 +394,16 @@ class TORCH_CUDA_CU_API Val : public Statement { //! 
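+//! Construction sketch (editor's illustration, based on the IrBuilder API that
+//! this change introduces): Exprs are no longer allocated with new directly;
+//! for example, IrBuilder::create<BinaryOp>(BinaryOpType::Add, out, lhs, rhs)
+//! builds the node with an IrBuilderPasskey and registers it with the active
+//! IrContainer.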
class TORCH_CUDA_CU_API Expr : public Statement { public: - explicit Expr(ExprType type); + explicit Expr(IrBuilderPasskey, ExprType type); + Expr(const Expr* src, IrCloner* ir_cloner); c10::optional getExprType() const override { - return type_; + return etype_; } - ExprType type() const { - return type_; + ExprType etype() const { + return etype_; } bool sameAs(const Statement* other) const override; @@ -380,23 +433,46 @@ class TORCH_CUDA_CU_API Expr : public Statement { static void constDispatch(T handler, const Expr* const); template - static Statement* mutatorDispatch(T mutator, Expr*); + static void mutatorDispatch(T mutator, Expr*); + + // TODO: Protect based on being in kernel container + kir::Predicate* predicate() const; + + // TODO: Protect based on being in kernel container + void setPredicate(kir::Predicate* predicate); + + // TODO: Protect based on being in kernel container + kir::Predicate* writePredicate() const; + + // TODO: Protect based on being in kernel container + void setWritePredicate(kir::Predicate* write_predicate); protected: + // TODO: Add Fusion passkey void addInput(Val* input) { TORCH_INTERNAL_ASSERT(input != nullptr); inputs_.push_back(input); } + // TODO: Add Fusion passkey void addOutput(Val* output) { TORCH_INTERNAL_ASSERT(output != nullptr); outputs_.push_back(output); } + ExprPasskey exprPasskey() { + return ExprPasskey(); + } + private: - ExprType type_ = ExprType::Invalid; + ExprType etype_ = ExprType::Invalid; std::vector inputs_; std::vector outputs_; + + kir::Predicate* predicate_ = nullptr; + + // Only used for reduction-related expressions + kir::Predicate* write_predicate_ = nullptr; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.cpp b/torch/csrc/jit/codegen/cuda/ir_builder.cpp new file mode 100644 index 000000000000..6b990a2ea7be --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_builder.cpp @@ -0,0 +1,429 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Clone an IR node, forwarding the arguments to the IrCloner constructor. +template +T* IrBuilder::clone(const T* src, IrCloner* ir_cloner) { + TORCH_INTERNAL_ASSERT( + ir_cloner != nullptr, + "Cannot use create when a cloner object is set. 
Use clone."); + + TORCH_INTERNAL_ASSERT( + ir_cloner->container() != nullptr, + "Cloner doesn't have a valid container to store cloned object."); + + T* dest = new T(src, ir_cloner); + const Statement* src_stmt = dynamic_cast(src); + Statement* dest_stmt = dynamic_cast(dest); + + auto dest_container = ir_cloner->container(); + auto src_container = src_stmt->container(); + + dest_container->registerStmt(IrBuilderPasskey(dest_container), dest_stmt); + + if (src_container != dest_container) { + dest_stmt->setName(IrBuilderPasskey(dest_container), src_stmt->name()); + } + + ir_cloner->registerClone(src_stmt, dest_stmt); + + return dest; +} + +#define IR_BUILDER_INSTANTIATE(T) \ + template T* IrBuilder::clone(const T* src, IrCloner* ir_cloner); + +// Vals +IR_BUILDER_INSTANTIATE(IterDomain) +IR_BUILDER_INSTANTIATE(TensorDomain) +IR_BUILDER_INSTANTIATE(TensorView) +IR_BUILDER_INSTANTIATE(Bool) +IR_BUILDER_INSTANTIATE(Double) +IR_BUILDER_INSTANTIATE(Int) +IR_BUILDER_INSTANTIATE(ComplexDouble) +IR_BUILDER_INSTANTIATE(NamedScalar) + +// Exprs +IR_BUILDER_INSTANTIATE(Split) +IR_BUILDER_INSTANTIATE(Merge) +IR_BUILDER_INSTANTIATE(TransposeOp) +IR_BUILDER_INSTANTIATE(ShiftOp) +IR_BUILDER_INSTANTIATE(GatherOp) +IR_BUILDER_INSTANTIATE(ViewAsScalar) +IR_BUILDER_INSTANTIATE(ViewOp) +IR_BUILDER_INSTANTIATE(UnaryOp) +IR_BUILDER_INSTANTIATE(BinaryOp) +IR_BUILDER_INSTANTIATE(TernaryOp) +IR_BUILDER_INSTANTIATE(ReductionOp) +IR_BUILDER_INSTANTIATE(GroupedReductionOp) +IR_BUILDER_INSTANTIATE(WelfordOp) +IR_BUILDER_INSTANTIATE(MmaOp) +IR_BUILDER_INSTANTIATE(BroadcastOp) + +Val* IrBuilder::newResult(DataType dtype) { + switch (dtype) { + case DataType::Bool: + return IrBuilder::create(c10::nullopt); + case DataType::Double: + return IrBuilder::create(c10::nullopt); + case DataType::Int: + return IrBuilder::create(c10::nullopt); + default: + TORCH_CHECK(false, "Unexpected data type"); + } +} + +Val* IrBuilder::newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { + TORCH_CHECK( + lhs->dtype() == rhs->dtype(), + "Incompatible operand types: ", + lhs->dtype(), + " and ", + rhs->dtype()); + auto result = newResult(lhs->dtype()); + IrBuilder::create(op_type, result, lhs, rhs); + // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) + return result; +} + +Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { + auto result = IrBuilder::create(c10::nullopt); + IrBuilder::create(op_type, result, lhs, rhs); + // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) + return result; +} + +Val* IrBuilder::whereExpr(Val* pred, Val* lhs, Val* rhs) { + TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); + auto result = newResult(lhs->dtype()); + IrBuilder::create(TernaryOpType::Where, result, pred, lhs, rhs); + return result; +} + +Val* IrBuilder::negExpr(Val* val) { + auto result = newResult(val->dtype()); + IrBuilder::create(UnaryOpType::Neg, result, val); + return result; +} + +Val* IrBuilder::notExpr(Val* val) { + auto result = newResult(val->dtype()); + IrBuilder::create(UnaryOpType::Not, result, val); + return result; +} + +Val* IrBuilder::setExpr(Val* val) { + auto result = newResult(val->dtype()); + IrBuilder::create(UnaryOpType::Set, result, val); + return result; +} + +Val* IrBuilder::setExprNamedScalar(const std::string& name, Val* val) { + auto result = IrBuilder::create(name, val->dtype()); + IrBuilder::create(UnaryOpType::Set, result, val); + return result; +} + +Val* IrBuilder::addressExprNamedScalar(const std::string& name, Val* val) { + auto result = 
IrBuilder::create(name, DataType::Int); + IrBuilder::create(UnaryOpType::Address, result, val); + return result; +} + +Val* IrBuilder::andExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::And, lhs, rhs); +} + +Val* IrBuilder::eqExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::Eq, lhs, rhs); +} + +Val* IrBuilder::gtExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::GT, lhs, rhs); +} + +Val* IrBuilder::ltExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::LT, lhs, rhs); +} + +Val* IrBuilder::leExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::LE, lhs, rhs); +} + +Val* IrBuilder::geExpr(Val* lhs, Val* rhs) { + return newLogicExpr(BinaryOpType::GE, lhs, rhs); +} + +Val* IrBuilder::addExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Add, lhs, rhs); +} + +Val* IrBuilder::subExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Sub, lhs, rhs); +} + +Val* IrBuilder::mulExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Mul, lhs, rhs); +} + +Val* IrBuilder::divExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Div, lhs, rhs); +} + +Val* IrBuilder::ceilDivExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::CeilDiv, lhs, rhs); +} + +Val* IrBuilder::modExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Mod, lhs, rhs); +} + +Val* IrBuilder::maxExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Max, lhs, rhs); +} + +Val* IrBuilder::minExpr(Val* lhs, Val* rhs) { + return newArithmeticExpr(BinaryOpType::Min, lhs, rhs); +} + +Val* SimplifyingIrBuilder::negExpr(Val* val) { + if (auto int_val = dynamic_cast(val)) { + if (int_val->isConst()) { + return IrBuilder::create(-int_val->value().value()); + } + } + return IrBuilder::negExpr(val); +} + +Val* SimplifyingIrBuilder::notExpr(Val* val) { + if (auto bool_val = dynamic_cast(val)) { + if (bool_val->isConst()) { + if (bool_val->value().value()) { + return FusionGuard::getCurFusion()->falseVal(); + } else { + return FusionGuard::getCurFusion()->trueVal(); + } + } + } + return IrBuilder::notExpr(val); +} + +Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int::ScalarType rhs) { + if (rhs == 0) { + return lhs; + } else if (lhs == nullptr) { + return IrBuilder::IrBuilder::create(rhs); + } else if (lhs->isConst()) { + return IrBuilder::IrBuilder::create(lhs->value().value() + rhs); + } else if (rhs > 0) { + return IrBuilder::addExpr(lhs, IrBuilder::IrBuilder::create(rhs)); + } else { + return IrBuilder::subExpr(lhs, IrBuilder::IrBuilder::create(-rhs)); + } +} + +Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int* rhs) { + if (rhs == nullptr) { + return lhs; + } else if (lhs == nullptr) { + return rhs; + } else if (lhs->isConst()) { + return addExpr(rhs, lhs->value().value()); + } else if (rhs->isConst()) { + return addExpr(lhs, rhs->value().value()); + } else { + return IrBuilder::addExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::addExpr(Val* lhs, Val* rhs) { + TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); + if (lhs == nullptr || lhs->isZeroInt()) { + return rhs; + } else if (rhs == nullptr || rhs->isZeroInt()) { + return lhs; + } + auto lhs_int = dynamic_cast(lhs); + auto rhs_int = dynamic_cast(rhs); + if (lhs_int != nullptr && rhs_int != nullptr) { + return addExpr(lhs_int, rhs_int); + } else { + return IrBuilder::addExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::addExpr(Val* lhs, Int::ScalarType rhs) { + auto lhs_int = dynamic_cast(lhs); + if (lhs_int != nullptr) { + 
return addExpr(lhs_int, rhs); + } else { + return addExpr(lhs, IrBuilder::create(rhs)); + } +} + +Val* SimplifyingIrBuilder::subExpr(Val* lhs, Val* rhs) { + return addExpr(lhs, negExpr(rhs)); +} + +Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int::ScalarType rhs) { + if (rhs == 0) { + return lhs->container()->zeroVal(); + } else if (rhs == 1) { + return lhs; + } else if (lhs == nullptr) { + return IrBuilder::create(rhs); + } else if (lhs->isConst()) { + return IrBuilder::create(lhs->value().value() * rhs); + } else { + return IrBuilder::mulExpr(lhs, IrBuilder::create(rhs)); + } +} + +Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Int::ScalarType rhs) { + auto lhs_int = dynamic_cast(lhs); + if (lhs_int != nullptr) { + return mulExpr(lhs_int, rhs); + } else { + return IrBuilder::mulExpr(lhs, IrBuilder::create(rhs)); + } +} + +Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int* rhs) { + if (rhs == nullptr) { + return lhs; + } else if (lhs == nullptr) { + return rhs; + } else if (lhs->isConst()) { + return mulExpr(rhs, lhs->value().value()); + } else if (rhs->isConst()) { + return mulExpr(lhs, rhs->value().value()); + } else { + return IrBuilder::mulExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Val* rhs) { + TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); + if (lhs == nullptr || lhs->isOneInt()) { + return rhs; + } else if (rhs == nullptr || rhs->isOneInt()) { + return lhs; + } else if (lhs->isZeroInt() || rhs->isZeroInt()) { + return lhs->container()->zeroVal(); + } + auto lhs_int = dynamic_cast(lhs); + auto rhs_int = dynamic_cast(rhs); + if (lhs_int != nullptr && rhs_int != nullptr) { + return mulExpr(lhs_int, rhs_int); + } else { + return IrBuilder::mulExpr(lhs, rhs); + } +} + +Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) { + TORCH_INTERNAL_ASSERT(!(lhs == nullptr && rhs == nullptr)); + + if (lhs == nullptr) { + return rhs; + } else if (rhs == nullptr) { + return lhs; + } + + bool lhs_definitely_true = false; + bool lhs_definitely_false = false; + auto lhs_bool = dynamic_cast(lhs); + if (lhs_bool && lhs_bool->isConst()) { + lhs_definitely_true = lhs_bool->value().value(); + lhs_definitely_false = !lhs_bool->value().value(); + } + auto rhs_bool = dynamic_cast(rhs); + bool rhs_definitely_true = false; + bool rhs_definitely_false = false; + if (rhs_bool && rhs_bool->isConst()) { + rhs_definitely_true = rhs_bool->value().value(); + rhs_definitely_false = !rhs_bool->value().value(); + } + + if (lhs_definitely_true && rhs_definitely_true) { + return FusionGuard::getCurFusion()->trueVal(); + } else if (lhs_definitely_false || rhs_definitely_false) { + return FusionGuard::getCurFusion()->falseVal(); + } else if (lhs_definitely_true) { + return rhs; + } else if (rhs_definitely_true) { + return lhs; + } + + return IrBuilder::andExpr(lhs, rhs); +} + +namespace { + +template +Val* minOrMaxExpr( + Int* lhs, + Int* rhs, + IrBuilderFunc ir_builder_func, + IntFunc int_func) { + if (rhs == nullptr) { + return lhs; + } else if (lhs == nullptr) { + return rhs; + } else if (lhs->isConst() && rhs->isConst()) { + return IrBuilder::create( + int_func(lhs->value().value(), rhs->value().value())); + } else { + return ir_builder_func(lhs, rhs); + } +} + +template +Val* minOrMaxExpr( + Val* lhs, + Val* rhs, + IrBuilderFunc ir_builder_func, + IntFunc int_func) { + TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); + if (lhs == nullptr) { + return rhs; + } else if (rhs == nullptr || lhs == rhs) { + return lhs; + } + auto lhs_int = dynamic_cast(lhs); + auto rhs_int = 
dynamic_cast(rhs); + if (lhs_int != nullptr && rhs_int != nullptr) { + return minOrMaxExpr(lhs_int, rhs_int, ir_builder_func, int_func); + } else { + return ir_builder_func(lhs, rhs); + } +} + +} // namespace + +Val* SimplifyingIrBuilder::maxExpr(Val* lhs, Val* rhs) { + return minOrMaxExpr( + lhs, + rhs, + [](Val* lhs, Val* rhs) { return IrBuilder::maxExpr(lhs, rhs); }, + [](int64_t lhs, int64_t rhs) { return std::max(lhs, rhs); }); +} + +Val* SimplifyingIrBuilder::minExpr(Val* lhs, Val* rhs) { + return minOrMaxExpr( + lhs, + rhs, + [](Val* lhs, Val* rhs) { return IrBuilder::minExpr(lhs, rhs); }, + [](int64_t lhs, int64_t rhs) { return std::min(lhs, rhs); }); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.h b/torch/csrc/jit/codegen/cuda/ir_builder.h new file mode 100644 index 000000000000..f122232f8fb8 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_builder.h @@ -0,0 +1,131 @@ +#pragma once + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace kir { +class Kernel; +} + +class IrCloner; + +// Passkey for builder to register properties with statements, and to call +// functions in IrContainer +class TORCH_CUDA_CU_API IrBuilderPasskey { + friend class IrBuilder; + + public: + // TODO: Collapse ir_container and Kernel once Kernel inherits from + // IrContainer + IrContainer* const ir_container_ = nullptr; + + private: + explicit IrBuilderPasskey(IrContainer* ir_container); +}; + +//! IR builder interface +class TORCH_CUDA_CU_API IrBuilder { + public: + //! Allocate a new IR node, forwarding the arguments to the appropriate + //! constructor and registering with the container + template + static T* create(Args&&... args) { + auto container = FusionGuard::getCurFusion(); + // return create(container, std::forward(args)...); + TORCH_INTERNAL_ASSERT( + container != nullptr, "Need an active container to build IR."); + T* node = new T(IrBuilderPasskey(container), std::forward(args)...); + + container->registerStmt(IrBuilderPasskey(container), node); + + return node; + } + + //! Allocate a new IR node, forwarding the arguments to the appropriate + //! constructor and registering with the container + template + static T* create(IrContainer* container, Args&&... args) { + TORCH_INTERNAL_ASSERT( + container != nullptr, "Need an active container to build IR."); + T* node = new T(IrBuilderPasskey(container), std::forward(args)...); + + container->registerStmt(IrBuilderPasskey(container), node); + + return node; + } + + //! Clone an IR node, forwarding the arguments to the IrCloner constructor. + //! Register clones with IrCloner's target container. 
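+  //! Usage sketch (editor's illustration, assuming the API defined in this
+  //! file and in ir_cloner.cpp; the names are placeholders): given
+  //! IrCloner cloner(dst_container), calling IrBuilder::clone(src_tv, &cloner)
+  //! allocates the copy, registers it with cloner.container(), and, when the
+  //! source lives in a different container, carries the source statement's
+  //! name over to the clone.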
+ template + static T* clone(const T* src, IrCloner* ir_cloner); + + // Unary operations + static Val* negExpr(Val* val); + static Val* notExpr(Val* val); + static Val* setExpr(Val* val); + static Val* setExprNamedScalar(const std::string& name, Val* val); + static Val* addressExprNamedScalar(const std::string& name, Val* val); + + // Binary operations + static Val* andExpr(Val* lhs, Val* rhs); + static Val* eqExpr(Val* lhs, Val* rhs); + static Val* gtExpr(Val* lhs, Val* rhs); + static Val* ltExpr(Val* lhs, Val* rhs); + static Val* leExpr(Val* lhs, Val* rhs); + static Val* geExpr(Val* lhs, Val* rhs); + static Val* addExpr(Val* lhs, Val* rhs); + static Val* subExpr(Val* lhs, Val* rhs); + static Val* mulExpr(Val* lhs, Val* rhs); + static Val* divExpr(Val* lhs, Val* rhs); + static Val* ceilDivExpr(Val* lhs, Val* rhs); + static Val* modExpr(Val* lhs, Val* rhs); + static Val* maxExpr(Val* lhs, Val* rhs); + static Val* minExpr(Val* lhs, Val* rhs); + + // Ternary operations + static Val* whereExpr(Val* pred, Val* lhs, Val* rhs); + + private: + static Val* newResult(DataType dtype); + static Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs); + static Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs); +}; + +//! A wrapper builder with static expression simplification +//! +//! Example: +//! - addExpr(new Int(1), new Int(2)) -> Int(3) +//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo") +//! +//! Designed to be used to simplify predicate and index expressions in +//! generated code. Also, the shift validation may fail without +//! this simplification. +class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder { + public: + static Val* negExpr(Val* val); + static Val* notExpr(Val* val); + + static Val* addExpr(Int* lhs, Int::ScalarType rhs); + static Val* addExpr(Val* lhs, Int::ScalarType rhs); + static Val* addExpr(Int* lhs, Int* rhs); + static Val* addExpr(Val* lhs, Val* rhs); + static Val* subExpr(Val* lhs, Val* rhs); + static Val* mulExpr(Int* lhs, Int::ScalarType rhs); + static Val* mulExpr(Val* lhs, Int::ScalarType rhs); + static Val* mulExpr(Int* lhs, Int* rhs); + static Val* mulExpr(Val* lhs, Val* rhs); + static Val* andExpr(Val* lhs, Val* rhs); + static Val* maxExpr(Val* lhs, Val* rhs); + static Val* minExpr(Val* lhs, Val* rhs); +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp index 7e5a9cfa8bc3..5ad17fbe1930 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp @@ -2,12 +2,15 @@ #include #include +#include namespace torch { namespace jit { namespace fuser { namespace cuda { +IrCloner::IrCloner(IrContainer* container) : ir_container_(container) {} + Statement* IrCloner::clone(const Statement* statement) { if (statement == nullptr) { return nullptr; @@ -30,7 +33,6 @@ Statement* IrCloner::clone(const Statement* statement) { // that something went horribly wrong. 
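// [Editor's note] Illustrative sketch only, not part of this patch: the
// typical way clone() is driven. `src` and `dst` are hypothetical
// containers; the cloned statement ends up owned by the cloner's target
// container rather than by the original one.
//
//   Fusion src;
//   Fusion dst;
//   Double* d = IrBuilder::create<Double>(&src, 1.0);
//   IrCloner cloner(&dst);
//   Statement* d_copy = cloner.clone(d);
//   TORCH_INTERNAL_ASSERT(d_copy->container() == &dst);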
TORCH_INTERNAL_ASSERT(new_node != nullptr); TORCH_INTERNAL_ASSERT(clones_map_[statement] == new_node); - TORCH_INTERNAL_ASSERT(new_node->fusion() == fusion_); return new_node; } @@ -39,7 +41,6 @@ Statement* IrCloner::clone(const Statement* statement) { void IrCloner::registerClone(const Statement* src, Statement* clone) { TORCH_CHECK(src != nullptr); TORCH_CHECK(clone != nullptr); - TORCH_CHECK(clone->fusion() == fusion_); TORCH_CHECK(clones_map_.insert({src, clone}).second); } @@ -56,79 +57,95 @@ void IrCloner::handle(const Expr* e) { } void IrCloner::handle(const TensorDomain* td) { - clone_ = new TensorDomain(td, this); + clone_ = IrBuilder::clone(td, this); } void IrCloner::handle(const IterDomain* id) { - clone_ = new IterDomain(id, this); + clone_ = IrBuilder::clone(id, this); } void IrCloner::handle(const Bool* b) { - clone_ = new Bool(b, this); + clone_ = IrBuilder::clone(b, this); } void IrCloner::handle(const Double* d) { - clone_ = new Double(d, this); + clone_ = IrBuilder::clone(d, this); } void IrCloner::handle(const Int* i) { - clone_ = new Int(i, this); + clone_ = IrBuilder::clone(i, this); +} + +void IrCloner::handle(const ComplexDouble* c) { + clone_ = IrBuilder::clone(c, this); } void IrCloner::handle(const NamedScalar* named_scalar) { - clone_ = new NamedScalar(named_scalar, this); + clone_ = IrBuilder::clone(named_scalar, this); } void IrCloner::handle(const TensorView* tv) { - clone_ = new TensorView(tv, this); + clone_ = IrBuilder::clone(tv, this); } void IrCloner::handle(const UnaryOp* op) { - clone_ = new UnaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const BinaryOp* op) { - clone_ = new BinaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const TernaryOp* op) { - clone_ = new TernaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const BroadcastOp* op) { - clone_ = new BroadcastOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ReductionOp* op) { - clone_ = new ReductionOp(op, this); + clone_ = IrBuilder::clone(op, this); +} + +void IrCloner::handle(const GroupedReductionOp* op) { + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const WelfordOp* op) { - clone_ = new WelfordOp(op, this); + clone_ = IrBuilder::clone(op, this); +} + +void IrCloner::handle(const MmaOp* op) { + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const TransposeOp* op) { - clone_ = new TransposeOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ShiftOp* op) { - clone_ = new ShiftOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const GatherOp* op) { - clone_ = new GatherOp(op, this); + clone_ = IrBuilder::clone(op, this); +} + +void IrCloner::handle(const ViewAsScalar* op) { + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ViewOp* op) { - clone_ = new ViewOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const Split* split) { - clone_ = new Split(split, this); + clone_ = IrBuilder::clone(split, this); } void IrCloner::handle(const Merge* merge) { - clone_ = new Merge(merge, this); + clone_ = IrBuilder::clone(merge, this); } TensorView* RecomputeTv::recompute(TensorView* tv) { @@ -141,7 +158,7 @@ TensorView* RecomputeTv::recompute(TensorView* tv) { "Cannot recompute buffers that are inputs of the fusion."); // Grab all the expressions used to generate the TensorView - auto exprs = ExprSort::getExprs(tv->fusion(), {tv}); + auto exprs = 
StmtSort::getExprs(tv->fusion(), {tv}, false); // Run the replicator RecomputeTv replicator(tv->fusion(), exprs); @@ -161,7 +178,7 @@ TensorView* RecomputeTv::recompute(TensorView* tv) { } RecomputeTv::RecomputeTv(Fusion* fusion, std::vector exprs) - : IrCloner(fusion) { + : IrCloner(fusion), fusion_(fusion) { // Add inputs to the clones map to prevent cloning them. for (const auto inp : fusion->inputs()) { clones_map_[inp] = inp; @@ -183,7 +200,7 @@ void RecomputeTv::handle(const TensorDomain* td) { // Make sure to recompute the history of the iteration domains, explicitly go // through the expressions and send them to IrCloner. auto exprs = - ExprSort::getExprs(fusion(), {td->domain().begin(), td->domain().end()}); + StmtSort::getExprs(fusion_, {td->domain().begin(), td->domain().end()}); for (auto expr : exprs) { IrCloner::handle(expr); diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.h b/torch/csrc/jit/codegen/cuda/ir_cloner.h index ac83d9edb097..5b70b0fd048f 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.h +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include #include @@ -11,7 +12,7 @@ namespace jit { namespace fuser { namespace cuda { -class Fusion; +class IrContainer; //! Clones nodes from an exiting Fusion //! @@ -21,10 +22,11 @@ class Fusion; //! class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { friend class Statement; + friend class IrBuilder; public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - explicit IrCloner(Fusion* new_fusion) : fusion_(new_fusion) {} + explicit IrCloner(IrContainer* container); Statement* clone(const Statement* statement); @@ -45,8 +47,8 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { return copy; } - Fusion* fusion() const { - return fusion_; + IrContainer* container() const { + return ir_container_; } protected: @@ -63,6 +65,7 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { void handle(const Bool*) override; void handle(const Double*) override; void handle(const Int*) override; + void handle(const ComplexDouble*) override; void handle(const NamedScalar*) override; void handle(const UnaryOp*) override; @@ -70,10 +73,13 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { void handle(const TernaryOp*) override; void handle(const BroadcastOp*) override; void handle(const ReductionOp*) override; + void handle(const GroupedReductionOp*) override; void handle(const WelfordOp*) override; + void handle(const MmaOp*) override; void handle(const TransposeOp*) override; void handle(const ShiftOp*) override; void handle(const GatherOp*) override; + void handle(const ViewAsScalar*) override; void handle(const ViewOp*) override; void handle(const Split*) override; @@ -86,12 +92,15 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { private: // The destination Fusion container - Fusion* fusion_ = nullptr; + IrContainer* ir_container_ = nullptr; // The dispatch interface doesn't allow returning values from // individual `handle()` methods, so they are storing the // result here Statement* clone_ = nullptr; + + // Builder to make all the new nodes + IrBuilder builder_; }; // Replicates all expressions used to generate the provided TensorView. 
Does not
@@ -105,7 +114,9 @@ class RecomputeTv : private IrCloner {
  private:
   RecomputeTv(Fusion* fusion, std::vector<Expr*> exprs);
-  void handle(const TensorDomain*) override;
+  void handle(const TensorDomain*) final;
+
+  Fusion* fusion_;
 };
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/ir_container.cpp b/torch/csrc/jit/codegen/cuda/ir_container.cpp
new file mode 100644
index 000000000000..e84418eb9733
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/ir_container.cpp
@@ -0,0 +1,279 @@
+#include
+#include
+#include
+#include
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+void swap(IrContainer& a, IrContainer& b) noexcept {
+  FUSER_PERF_SCOPE("Fusion swap");
+
+  using std::swap;
+
+  // Swap the content
+  swap(a.vals_up_, b.vals_up_);
+  swap(a.vals_, b.vals_);
+
+  swap(a.exprs_up_, b.exprs_up_);
+  swap(a.exprs_, b.exprs_);
+
+  swap(a.raw_ptrs_, b.raw_ptrs_);
+
+  swap(a.val_type_name_map_, b.val_type_name_map_);
+  swap(a.expr_name_counter_, b.expr_name_counter_);
+
+  // Fixup the Statement::fusion_ links for a
+  for (auto val : a.vals_) {
+    val->ir_container_ = &a;
+  }
+  for (auto expr : a.exprs_) {
+    expr->ir_container_ = &a;
+  }
+
+  // Fixup the Statement::fusion_ links for b
+  for (auto val : b.vals_) {
+    val->ir_container_ = &b;
+  }
+  for (auto expr : b.exprs_) {
+    expr->ir_container_ = &b;
+  }
+}
+
+IrCloner IrContainer::copy(const IrContainer* from, IrContainer* to) {
+  to->clear();
+  IrCloner ir_cloner(to);
+
+  for (auto val : from->vals_) {
+    to->vals_.insert(ir_cloner.clone(val));
+  }
+
+  for (auto expr : from->exprs_) {
+    to->exprs_.insert(ir_cloner.clone(expr));
+  }
+
+  to->val_type_name_map_ = from->val_type_name_map_;
+  to->expr_name_counter_ = from->expr_name_counter_;
+
+  return ir_cloner;
+}
+
+IrContainer::IrContainer() = default;
+
+IrContainer::IrContainer(const IrContainer& other) {
+  FUSER_PERF_SCOPE("IrContainer copy");
+  IrContainer::copy(&other, this);
+}
+
+IrContainer::IrContainer(IrContainer&& other) noexcept {
+  FUSER_PERF_SCOPE("IrContainer move");
+  swap(*this, other);
+}
+
+IrContainer& IrContainer::operator=(const IrContainer& other) {
+  FUSER_PERF_SCOPE("IrContainer copy assign");
+  IrContainer copy(other);
+  clear();
+  swap(*this, copy);
+  return *this;
+}
+
+IrContainer& IrContainer::operator=(IrContainer&& other) noexcept {
+  FUSER_PERF_SCOPE("IrContainer move assign");
+  clear();
+  swap(*this, other);
+  return *this;
+}
+
+IrContainer::~IrContainer() {
+  clear();
+}
+
+//! Register the Statement with this container
+void IrContainer::registerStmt(IrBuilderPasskey, Statement* stmt) {
+  if (stmt->isVal()) {
+    registerVal(stmt->asVal());
+  } else {
+    registerExpr(stmt->asExpr());
+  }
+}
+
+//! Register the Val with this container
+void IrContainer::registerVal(IrBuilderPasskey, Val* val) {
+  registerVal(val);
+}
+
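// [Editor's note] Illustrative sketch only, not part of this patch: the
// passkey idiom used by IrBuilderPasskey, ExprPasskey and IrContainerPasskey
// around these registration entry points. The names below are generic
// placeholders, not APIs from this codebase.
//
//   class Builder;                 // the only class allowed to mint keys
//   class BuilderKey {
//     friend class Builder;
//     BuilderKey() = default;      // private to everyone except Builder
//   };
//   class Container {
//    public:
//     // Callable by anyone, but only Builder can construct a BuilderKey,
//     // so effectively only Builder can register nodes.
//     void registerNode(BuilderKey, void* node);
//   };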
+//! Register expr with this container.
+void IrContainer::registerExpr(IrBuilderPasskey, Expr* expr) {
+  registerExpr(expr);
+}
+
+void IrContainer::registerExpr(ExprPasskey, Expr* expr) {
+  registerExpr(expr);
+}
+
+void IrContainer::removeExpr(Expr* expr) {
+  TORCH_INTERNAL_ASSERT(
+      exprs_.find(expr) != exprs_.end(),
+      "Wanted to remove an expression but it doesn't exist in this container.");
+  auto expr_in_deque = std::find_if(
+      exprs_up_.begin(),
+      exprs_up_.end(),
+      [expr](std::unique_ptr<Expr>& expr_up) { return expr_up.get() == expr; });
+
+  TORCH_INTERNAL_ASSERT(
+      expr_in_deque != exprs_up_.end(),
+      "Wanted to remove an expression but its unique ptr is missing.");
+
+  exprs_.erase(expr);
+  exprs_up_.erase(expr_in_deque);
+  raw_ptrs_.erase((void*)expr);
+}
+
+//! Completely remove val from the fusion, break all dependencies associated
+//! with it
+void IrContainer::removeVal(Val* val) {
+  // Don't remove shortcuts
+  if (val == true_val_.get() || val == false_val_.get() ||
+      val == one_val_.get() || val == zero_val_.get() ||
+      val == magic_zero_val_.get()) {
+    return;
+  }
+
+  TORCH_INTERNAL_ASSERT(
+      vals_.find(val) != vals_.end(),
+      "Wanted to remove a value but it doesn't exist in this container.");
+  auto val_in_deque = std::find_if(
+      vals_up_.begin(), vals_up_.end(), [val](std::unique_ptr<Val>& val_up) {
+        return val_up.get() == val;
+      });
+
+  TORCH_INTERNAL_ASSERT(
+      val_in_deque != vals_up_.end(),
+      "Wanted to remove a value but its unique ptr is missing.");
+
+  vals_.erase(val);
+  vals_up_.erase(val_in_deque);
+  raw_ptrs_.erase((void*)val);
+}
+
+//! Register the Val with this container
+void IrContainer::registerVal(Val* val) {
+  if (inContainer(val)) {
+    return;
+  }
+
+  vals_up_.emplace_back(std::unique_ptr<Val>(val));
+  vals_.emplace(vals_up_.back().get());
+  val->setName(IrContainerPasskey(), getValName(vals_up_.back()->vtype()));
+  raw_ptrs_.emplace((void*)vals_up_.back().get());
+}
+
+//! Register expr with this container.
+void IrContainer::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + exprs_up_.emplace_back(std::unique_ptr(expr)); + exprs_.emplace(exprs_up_.back().get()); + expr->setName(IrContainerPasskey(), getExprName()); + raw_ptrs_.emplace((void*)exprs_up_.back().get()); +} + +void IrContainer::clear() noexcept { + FUSER_PERF_SCOPE("IrContainer clear"); + vals_.clear(); + vals_up_.clear(); + exprs_.clear(); + exprs_up_.clear(); + raw_ptrs_.clear(); + + val_type_name_map_.clear(); + expr_name_counter_ = 0; +} + +bool IrContainer::inContainer(const Statement* stmt) const { + const void* const_void = (const void*)(stmt); + void* nonconst_void = const_cast(const_void); // NOLINT + if (raw_ptrs_.find(nonconst_void) == raw_ptrs_.end()) { + return false; + } + + TORCH_INTERNAL_ASSERT( + stmt->container() == this, + "Container claims to own stmt, but stmt disagrees."); + + Statement* nonconst_stmt = const_cast(stmt); // NOLINT + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + exprs_.find(nonconst_stmt->as()) != exprs_.end(), + "Somehow container claims to and not to own an Expr."); + } + if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + vals_.find(nonconst_stmt->as()) != vals_.end(), + "Somehow container claims to and not to own an Val."); + } + + return true; +} + +// Shortcuts for frequently used vals +Int* IrContainer::zeroVal() { + if (!zero_val_) { + auto zero_val = IrBuilder::create(this, 0); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == zero_val); + zero_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return zero_val_.get(); +} + +Int* IrContainer::oneVal() { + if (!one_val_) { + auto one_val = IrBuilder::create(this, 1); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == one_val); + one_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return one_val_.get(); +} + +Bool* IrContainer::falseVal() { + if (!false_val_) { + auto false_val = IrBuilder::create(this, false); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == false_val); + false_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return false_val_.get(); +} + +Bool* IrContainer::trueVal() { + if (!true_val_) { + auto true_val = IrBuilder::create(this, true); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == true_val); + true_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return true_val_.get(); +} + +NamedScalar* IrContainer::magicZeroVal() { + if (!magic_zero_val_) { + auto magic_zero = + IrBuilder::create(kMagicZeroName, DataType::Int); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == magic_zero); + magic_zero_val_ = std::unique_ptr( + vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return magic_zero_val_.get(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_container.h b/torch/csrc/jit/codegen/cuda/ir_container.h new file mode 100644 index 000000000000..fb1aaeaf383c --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_container.h @@ -0,0 +1,174 @@ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class IrBuilderPasskey; +class ExprPasskey; +class OptOutMutator; + +class Int; +class Bool; +class NamedScalar; + +// Passkey for container to register names with statements +class IrContainerPasskey { + friend class IrContainer; + + private: + explicit IrContainerPasskey() {} 
+}; + +class TORCH_CUDA_CU_API IrContainer : public PolymorphicBase { + public: + IrContainer(); + + IrContainer(const IrContainer& other); + IrContainer(IrContainer&& other) noexcept; + + IrContainer& operator=(const IrContainer& other); + IrContainer& operator=(IrContainer&& other) noexcept; + + virtual ~IrContainer(); + + bool inContainer(const Statement* stmt) const; + + void assertInContainer(const Statement* stmt, const std::string& msg) const { + TORCH_CHECK( + inContainer(stmt), msg, " it was not found in the active container."); + } + + //! Return in insertion order + const std::deque deterministic_vals() const noexcept { + std::deque vals_deque; + std::transform( + vals_up_.begin(), + vals_up_.end(), + std::back_inserter(vals_deque), + [](const std::unique_ptr& val_up) { return val_up.get(); }); + return vals_deque; + } + + //! Register the Statement with this container + virtual void registerStmt(IrBuilderPasskey, Statement* stmt); + + //! Register the Val with this container + virtual void registerVal(IrBuilderPasskey, Val* val); + + //! Register expr with this container. + virtual void registerExpr(IrBuilderPasskey, Expr* expr); + + //! Allow expr's to register themselves with a container, this is only used + //! for broadcastOp so it can register itself in its constructor so root maps + //! can be built. + virtual void registerExpr(ExprPasskey, Expr* expr); + + //! Return the set of Exprs registered with this fusion. Warning: This will + //! return exprs outside inputs/outputs, so can be unsafe for use with + //! segmented fusions. + const std::unordered_set& unordered_exprs() const noexcept { + return exprs_; + } + + //! Return the set of Vals registered with this fusion + const std::unordered_set& vals() const noexcept { + return vals_; + } + + // Shortcuts for frequently used vals + Int* zeroVal(); + Int* oneVal(); + Bool* falseVal(); + Bool* trueVal(); + NamedScalar* magicZeroVal(); + + protected: + static IrCloner copy(const IrContainer* from, IrContainer* to); + + friend void swap(IrContainer& a, IrContainer& b) noexcept; + + // Let mutator remove Exprs. + friend OptOutMutator; + + virtual void removeExpr(Expr* expr); + + //! Completely remove val from the fusion, break all dependencies associated + //! with it + virtual void removeVal(Val* val); + + //! Register the Val with this container + virtual void registerVal(Val* val); + + //! Register expr with this container. + virtual void registerExpr(Expr* expr); + + StmtNameType getValName(ValType vtype) { + if (val_type_name_map_.find(vtype) == val_type_name_map_.end()) { + val_type_name_map_[vtype] = 0; + } + return val_type_name_map_[vtype]++; + } + + StmtNameType getExprName() { + return expr_name_counter_++; + } + + void clear() noexcept; + + // Deque of unique pointer is the memory owning data structure + std::deque> vals_up_; + + // A convenient set to return when we just need an unordered set to do + // something like check if a Val is in this container + std::unordered_set vals_; + + // Deque of unique pointer is the memory owning data structure + std::deque> exprs_up_; + + // A convenient set to return when we just need an unordered set to do + // something like check if an Expr is in this container + std::unordered_set exprs_; + + // Used to implement a generic "inContainer" that can be passed an invalid + // pointer. Specifically a pointer to a Statement owned by another container + // that has been freed. 
We can't check normally with the unordered_sets we + // already have because it would require a const_cast from a constant + // expr/val, or a dynamic cast from a Statement. + std::unordered_set raw_ptrs_; + + // Values names counters + std::unordered_map val_type_name_map_; + + // Expression names counter + StmtNameType expr_name_counter_ = 0; + + // Manually store some persistent, frequently used nodes. It's very + // challenging to do this anything but manually as detecting when a container + // may or may not have one of these vals is tricky. Specifically because if + // the container doesn't own it, it's hard to understand from the outside if + // the node may have been removed then re-registered. It could also be tricky + // to know when we're using a different container as in FusionCopy_test + // demonstrates deleting then creating containers can result in the same + // pointer for the container. + std::unique_ptr true_val_; + std::unique_ptr false_val_; + std::unique_ptr one_val_; + std::unique_ptr zero_val_; + std::unique_ptr magic_zero_val_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp index 5ca8d54aaa9d..941bf22dea76 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -303,13 +304,13 @@ void IrGraphGenerator::generateScheduleGraph() { // Maybe not the best way to handle the root domain, but should be okay addArc( tv, - new TensorDomain(tv->getRootDomain()), + IrBuilder::create(tv->getRootDomain()), "[style=dashed, color=green, arrowhead=none]"); if (tv->domain()->hasRFactor()) addArc( tv, - new TensorDomain(tv->domain()->getRFactorDomain()), + IrBuilder::create(tv->domain()->getRFactorDomain()), "[style=dashed, color=green, arrowhead=none]"); } } @@ -370,6 +371,10 @@ void IrGraphGenerator::handle(const Int* i) { printValue(i, IrNodeLabel::gen(i, detail_level_)); } +void IrGraphGenerator::handle(const ComplexDouble* i) { + printValue(i, IrNodeLabel::gen(i, detail_level_)); +} + void IrGraphGenerator::handle(const NamedScalar* i) { printValue(i, IrNodeLabel::gen(i, detail_level_)); } diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.h b/torch/csrc/jit/codegen/cuda/ir_graphviz.h index 1144d95eb152..e5bbcac9157d 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.h +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -79,6 +79,7 @@ class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch { void handle(const Bool*) override; void handle(const Double*) override; void handle(const Int*) override; + void handle(const ComplexDouble*) override; void handle(const NamedScalar*) override; void handle(const UnaryOp*) override; diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h index 02c319d36653..0584e2f33743 100644 --- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -19,6 +19,9 @@ namespace cuda { class WelfordResult; class ViewTransform; +class IrCloner; +class IrBuilderPasskey; + //! A Bool value //! //! This value can be a symbolic value (defined after the kernel @@ -26,17 +29,18 @@ class ViewTransform; //! 
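// [Editor's note] Illustrative sketch only, not part of this patch: with the
// passkey-taking constructors declared below, scalar values are no longer
// created with bare `new` but through IrBuilder inside an active container
// (see the earlier sketches for the container setup).
//
//   Bool* flag = IrBuilder::create<Bool>(true);       // constant
//   Double* pi = IrBuilder::create<Double>(3.14159);  // constant
//   Int* n = IrBuilder::create<Int>();                // symbolic, no value
//   TORCH_INTERNAL_ASSERT(n->isSymbolic() && !n->isConst());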
class TORCH_CUDA_CU_API Bool : public Val { public: - Bool() : Val(ValType::Scalar, DataType::Bool), maybe_value_{c10::nullopt} {} + Bool(IrBuilderPasskey passkey); + + explicit Bool(IrBuilderPasskey passkey, bool value); - explicit Bool(bool value) - : Val(ValType::Scalar, DataType::Bool), maybe_value_{value} {} + explicit Bool(IrBuilderPasskey passkey, c10::optional value); Bool(const Bool* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -49,25 +53,25 @@ class TORCH_CUDA_CU_API Bool : public Val { const c10::optional maybe_value_; }; -//! A Float64 value. For now we don't have any other type besides -//! Float64. This value can be a symbolic value (defined after the kernel -//! is compiled) or a constant value (inlined into the kernel definition). +//! A Float64 value. This value can be a symbolic value (defined after the +//! kernel is compiled) or a constant value (inlined into the kernel +//! definition). class TORCH_CUDA_CU_API Double : public Val { public: using ScalarType = double; - Double() - : Val(ValType::Scalar, DataType::Double), maybe_value_{c10::nullopt} {} + Double(IrBuilderPasskey passkey); - explicit Double(ScalarType value) - : Val(ValType::Scalar, DataType::Double), maybe_value_{value} {} + explicit Double(IrBuilderPasskey passkey, ScalarType value); + + explicit Double(IrBuilderPasskey passkey, c10::optional value); Double(const Double* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -86,17 +90,51 @@ class TORCH_CUDA_CU_API Int : public Val { public: using ScalarType = int64_t; - Int() : Val(ValType::Scalar, DataType::Int), maybe_value_{c10::nullopt} {} + Int(IrBuilderPasskey passkey); + + explicit Int(IrBuilderPasskey passkey, ScalarType value); - explicit Int(ScalarType value) - : Val(ValType::Scalar, DataType::Int), maybe_value_{value} {} + explicit Int(IrBuilderPasskey passkey, c10::optional value); Int(const Int* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { + return maybe_value_.has_value(); + } + c10::optional value() const { + return maybe_value_; + } + + bool sameAs(const Statement* other) const override; + + private: + const c10::optional maybe_value_; +}; + +//! An c10::complex value. This value can be a symbolic value (defined +//! after the kernel is compiled) or a constant value (inlined into the kernel +//! definition). 
+class TORCH_CUDA_CU_API ComplexDouble : public Val { + public: + using ScalarType = c10::complex; + + ComplexDouble(IrBuilderPasskey passkey); + + explicit ComplexDouble(IrBuilderPasskey passkey, ScalarType value); + + explicit ComplexDouble( + IrBuilderPasskey passkey, + c10::optional value); + + ComplexDouble(const ComplexDouble* src, IrCloner* ir_cloner); + + bool isSymbolic() const { + return !(maybe_value_.has_value()); + } + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -152,14 +190,18 @@ class TVDomainGuard; class TORCH_CUDA_CU_API TensorView : public Val { public: TensorView( + IrBuilderPasskey passkey, TensorDomain* domain, DataType dtype, MemoryType mtype = MemoryType::Local); - explicit TensorView(const std::shared_ptr& tensor_type); + explicit TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& tensor_type); - explicit TensorView(const std::shared_ptr& jit_value) - : TensorView(jit_value->type()->cast()) {} + explicit TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& jit_value); TensorView(const TensorView* src, IrCloner* ir_cloner); @@ -167,6 +209,13 @@ class TORCH_CUDA_CU_API TensorView : public Val { return domain_; } + //! This is for a TensorView with an rFactor domain that is an input to a + //! fusion segment. We convert the rfactor domain into a new root domain. + //! Any dynamic-sized rfactor iterDomains are given a new symbolic extent. + //! Concrete integer extents are kept. Output TensorViews of any subsequent + //! expressions that use this TensorView are also updated. + void convertRfactorToRootDomain(); + void setContiguity(const std::vector& contig) { domain()->setContiguity(contig); } @@ -187,6 +236,16 @@ class TORCH_CUDA_CU_API TensorView : public Val { //! trivial reductions bool hasAnyReduction() const; + //! Returns true if this tensor is zero dimensional, + //! i.e. a wrapped scalar or an empty placeholder. + bool isZeroDim() const { + return nDims() == 0; + } + + //! Returns true if this tensor does not contain + //! any value. + bool isEmptyTensor() const; + c10::optional getReductionAxis() const; const std::vector& getRootDomain() const; @@ -210,6 +269,24 @@ class TORCH_CUDA_CU_API TensorView : public Val { size_t nDims() const; + // sets cpu_scalar_ value, which is special handling for CPU based zero-dim + // tensors (i.e. CPU Tensors that only have one value). This is only used if + // on an input value, otherwise ignored. This is important as special handling + // because these "scalars" should be type promoted as a tensor, but we want to + // avoid explicit copying of the data, so we want to pass the data value as a + // standard kernel argument value. + void setCpuScalar(bool is_cpu_scalar); + + // returns cpu_scalar_ value, which is special handling for CPU based zero-dim + // tensors (i.e. CPU Tensors that only have one value). This is only used if + // on an input value, otherwise ignored. This is important as special handling + // because these "scalars" should be type promoted as a tensor, but we want to + // avoid explicit copying of the data, so we want to pass the data value as a + // standard kernel argument value. + bool isCpuScalar() const { + return cpu_scalar_; + } + // Returns the position that this tensor is produced at relative to its axes. unsigned int getComputeAtPosition() const { return compute_at_pos_; @@ -318,29 +395,27 @@ class TORCH_CUDA_CU_API TensorView : public Val { // TensorView* rFactor(const std::vector& axes); - //! 
Welford Version of rFactor, semantically similar with - //! the reduction version except that the rfactor is done - //! in a multi-output scan pattern - WelfordResult rFactor( + //! Multi-output version of rFactor, semantically similar with + //! the reduction version except that the rfactor is done + //! for all outputs in a consistent way + std::vector rFactor( const std::vector& axes, - TensorView* avg, - TensorView* var, - TensorView* n); + const std::vector& tvs); // Create a TensorView before the original tensor. A common use case is to // write results into shared memory or registers before moving to global // memory. Analogous to TVM Cache_Write - TensorView* cache_before(); + TensorView* cacheBefore(); // Create a TensorView after the original tensor. A common use case is to // read tensor into shared memory or registers. Analogous to TVM Cache_Read - TensorView* cache_after(); + TensorView* cacheAfter(); // For a fusion output with other uses, we want to avoid writing to global // memory and then reading the output again. We write to global memory // separately after an operation. We replace this fusion output with the // direct write TensorView. - TensorView* cache_fork(); + TensorView* cacheFork(); MemoryType getMemoryType() const { return memory_type_; @@ -356,12 +431,38 @@ class TORCH_CUDA_CU_API TensorView : public Val { return axes_to_swizzle_; } + // Apply double buffering transformation + void doubleBuffer(); + + bool isDoubleBuffered() const { + return is_double_buffered_; + } + + //! Fill in mma options in scheduling time. + //! Each mma op in Fusion IR must be configured once before lowering. + //! Mma options are configuration parameters used in lowering to mma + //! instrinsics, mainly the type of mma macro to use and input data layout + //! etc. + //! + //! TODO: This step will very likely be removed in a follow up PR. All of + //! the options configured here could actually be inferred from fusion IR + //! once we are feature complete. + void configureMma(MmaOptions options); + + //! Transforms the innermost iterdomains according to the given mma swizzle, + //! this should be used on the tvs that are either inputs/outputs of an + //! MmaOp, or any tv's that are involved in prolog/epilog fusions and need to + //! have a matching thread swizzle with the mma operand/result. + //! More detail on usage see [WarpMmaSwizzler] in scheduler/mma_utils.h . + void applyMmaSwizzle(MmaOptions options); + friend TORCH_CUDA_CU_API TransformPropagator; friend TORCH_CUDA_CU_API TransformReplay; friend TORCH_CUDA_CU_API OptOutMutator; friend ComputeAt; - friend void adjustMemoryTypes(Fusion* fusion); friend class ir_utils::TVDomainGuard; + friend TORCH_CUDA_CU_API void groupReductions( + const std::vector&); protected: void setDomain(TensorDomain* td) { @@ -380,9 +481,9 @@ class TORCH_CUDA_CU_API TensorView : public Val { return pos; } - //! A helper function to maintain the consistency of welford output - //! schedules when doing rfactor on welford ops. - TensorView* welfordRfactorHelper( + //! A helper function to maintain the consistency of schedules of + //! multiple outputs wheen doing rfactor on multi-output reduction ops. 
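// [Editor's note] Illustrative sketch only, not part of this patch: typical
// use of the renamed caching entry points and of the double buffering added
// above. `input_tv` is a hypothetical TensorView read by this fusion.
//
//   TensorView* staged = input_tv->cacheAfter();   // was cache_after()
//   staged->setMemoryType(MemoryType::Shared);
//   staged->doubleBuffer();                        // new in this patch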
+ TensorView* multiOutputRfactorHelper( TensorView* tv, const std::vector& axes); @@ -393,6 +494,14 @@ class TORCH_CUDA_CU_API TensorView : public Val { MemoryType memory_type_ = MemoryType::Local; SwizzleType swizzle_type_ = SwizzleType::NoSwizzle; std::vector axes_to_swizzle_; + bool is_double_buffered_ = false; + // special handling for CPU based zero-dim tensors (i.e. CPU Tensors that only + // have one value). This is only used if on an input value, otherwise ignored. + // This is important as special handling because these "scalars" should be + // type promoted as a tensor, but we want to avoid explicit copying of the + // data, so we want to pass the data value as a standard kernel argument + // value. + bool cpu_scalar_ = false; }; //! A simple TensorView builder diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h index 8fd4475d2ddc..bf9d37867ee3 100644 --- a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h @@ -1,10 +1,12 @@ #pragma once -#include +#include #include #include #include +#include +#include //! Nodes in here should generally not be used by users. They should be behind //! the scenes and users shouldn't have to be aware of what they do to use the @@ -20,6 +22,8 @@ namespace fuser { namespace cuda { class ViewTransform; +class Scope; +class IrCloner; //! Returns true if both v1 and v2 are scalars, are the same type of scalars, //! and dispatches to the inherited Val type's `->sameAs` call. e.g. if both @@ -34,7 +38,7 @@ bool areEqualScalars(Val* v1, Val* v2); //! 4) split/merge class TORCH_CUDA_CU_API UnaryOp : public Expr { public: - UnaryOp(UnaryOpType type, Val* out, Val* in); + UnaryOp(IrBuilderPasskey, UnaryOpType type, Val* out, Val* in); UnaryOp(const UnaryOp* src, IrCloner* ir_cloner); @@ -63,7 +67,7 @@ class TORCH_CUDA_CU_API UnaryOp : public Expr { //! 2) LT (A < B) class TORCH_CUDA_CU_API BinaryOp : public Expr { public: - BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs); + BinaryOp(IrBuilderPasskey, BinaryOpType type, Val* out, Val* lhs, Val* rhs); BinaryOp(const BinaryOp* src, IrCloner* ir_cloner); @@ -97,7 +101,11 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { //! \param out The output tensor //! \param in The input tensor //! \param is_broadcast_dims True when output dim is a new broadcast domain - BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims); + BroadcastOp( + IrBuilderPasskey, + Val* out, + Val* in, + std::vector is_broadcast_dims); BroadcastOp(const BroadcastOp* src, IrCloner* ir_cloner); @@ -138,7 +146,14 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { //! non-reduction/non-broadcast dimensions. class TORCH_CUDA_CU_API ReductionOp : public Expr { public: - ReductionOp(BinaryOpType reduction_op_type, Val* init, Val* out, Val* in); + ReductionOp( + IrBuilderPasskey, + BinaryOpType reduction_op_type, + Val* init, + Val* out, + Val* in, + bool is_allreduce = false, + ExprType expr_type = ExprType::ReductionOp); ReductionOp(const ReductionOp* src, IrCloner* ir_cloner); @@ -156,6 +171,10 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr { return reduction_op_type_; } + bool isAllreduce() const { + return is_allreduce_; + } + bool sameAs(const Statement* other) const override; private: @@ -163,12 +182,67 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr { Val* const init_ = nullptr; Val* const out_ = nullptr; Val* const in_ = nullptr; + //! 
True if broadcast is fused + bool is_allreduce_ = false; +}; + +//! Grouped reduction operation for horizontal fusions. It works like +//! batched GEMMs in the sense that multiple independent reductions are +//! performed together. The main benefit is when reducing tensors across thread +//! blocks, a single grid sync can be done for all individual +//! reductions. As grid sync is very expensive, this can be a +//! significant performance impact. +class TORCH_CUDA_CU_API GroupedReductionOp : public Expr { + public: + GroupedReductionOp( + IrBuilderPasskey, + std::vector reduction_op_type, + std::vector init, + std::vector out, + std::vector in, + bool is_allreduce = false, + ExprType expr_type = ExprType::GroupedReductionOp); + + GroupedReductionOp(const GroupedReductionOp* src, IrCloner* ir_cloner); + + size_t numReductions() const { + return reduction_op_types_.size(); + } + + const std::vector& initVals() const { + return init_vals_; + } + + Val* initVal(size_t index) const { + return init_vals_.at(index); + } + + const std::vector& getReductionOpTypes() const { + return reduction_op_types_; + } + + BinaryOpType getReductionOpType(size_t index) const { + return reduction_op_types_.at(index); + } + + bool isAllreduce() const { + return is_allreduce_; + } + + bool sameAs(const Statement* other) const override; + + private: + const std::vector reduction_op_types_; + const std::vector init_vals_; + //! True if using the fused reduction kernel + bool is_allreduce_ = false; }; //! Welford Scan operation. class TORCH_CUDA_CU_API WelfordOp : public Expr { public: WelfordOp( + IrBuilderPasskey, Val* out_avg, Val* out_var, Val* out_N, @@ -177,7 +251,8 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { Val* init_N, Val* in_avg, Val* in_var, - Val* in_N); + Val* in_N, + bool is_fused = false); WelfordOp(const WelfordOp* src, IrCloner* ir_cloner); @@ -189,10 +264,6 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { return in_avg_; } - Val* init() const { - return init_avg_; - } - bool sameAs(const Statement* const other) const override; // Welford Accessors @@ -241,6 +312,12 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { return !init_N_->isZeroInt(); } + bool isAllreduce() const { + return is_allreduce_; + } + + std::vector getInitVals() const; + private: Val* const out_avg_; Val* const out_var_; @@ -251,11 +328,72 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { Val* const in_avg_; Val* const in_var_; Val* const in_N_; + //! True if using the fused reduction kernel (not implemented yet) + bool is_allreduce_ = false; +}; + +//! 
Fused Matmul operation +class TORCH_CUDA_CU_API MmaOp : public Expr { + public: + MmaOp(IrBuilderPasskey, Val* out, Val* in_a, Val* in_b, Val* init); + + MmaOp( + IrBuilderPasskey, + Val* out, + Val* in_a, + Val* in_b, + Val* init, + MmaOptions options); + + MmaOp(const MmaOp* src, IrCloner* ir_cloner); + + Val* out() const { + return out_; + } + + Val* inA() const { + return in_a_; + } + + Val* inB() const { + return in_b_; + } + + Val* init() const { + return init_; + } + + const auto& options() const { + TORCH_INTERNAL_ASSERT(options_.has_value(), "MmaOp not configured:", this); + return options_.value(); + } + + bool sameAs(const Statement* const other) const override; + + auto accStride() const { + TORCH_INTERNAL_ASSERT(options_.has_value(), "MmaOp not configured:", this); + return options_->accumulator_stride; + } + + void configureOptions(MmaOptions options) { + options_ = options; + } + + private: + Val* const out_ = nullptr; + Val* const in_a_ = nullptr; + Val* const in_b_ = nullptr; + Val* const init_ = nullptr; + c10::optional options_ = c10::nullopt; }; class TORCH_CUDA_CU_API TransposeOp : public Expr { public: - TransposeOp(TensorView* out, TensorView* in, std::vector new2old); + TransposeOp( + IrBuilderPasskey, + TensorView* out, + TensorView* in, + std::vector new2old); TransposeOp(const TransposeOp* src, IrCloner* ir_cloner); @@ -279,7 +417,13 @@ class TORCH_CUDA_CU_API TransposeOp : public Expr { class TORCH_CUDA_CU_API TernaryOp : public Expr { public: - TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3); + TernaryOp( + IrBuilderPasskey, + TernaryOpType type, + Val* out, + Val* in1, + Val* in2, + Val* in3); TernaryOp(const TernaryOp* src, IrCloner* ir_cloner); @@ -317,7 +461,12 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { //! \param out //! \param in //! \param offsets - ShiftOp(Val* out, Val* in, std::vector offsets, bool pad); + ShiftOp( + IrBuilderPasskey, + Val* out, + Val* in, + std::vector offsets, + std::vector pad_width); ShiftOp(const ShiftOp* src, IrCloner* ir_cloner); @@ -336,8 +485,14 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { return offsets_; } - bool pad() const { - return pad_; + const std::vector& padWidth() const { + return pad_width_; + } + + bool hasPadding() const { + return std::any_of(pad_width_.begin(), pad_width_.end(), [](const auto p) { + return p > 0; + }); } bool sameAs(const Statement* other) const override; @@ -349,17 +504,18 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { //! offsets_. The sign of each value indicates the direction of //! shifting. const std::vector offsets_; - const bool pad_; + const std::vector pad_width_; }; //! Gather a window around each element. class TORCH_CUDA_CU_API GatherOp : public Expr { public: GatherOp( + IrBuilderPasskey, Val* out, Val* in, - std::vector window_shape, - std::vector> pad_width); + std::vector window_shape, + std::vector> pad_width); GatherOp(const GatherOp* src, IrCloner* ir_cloner); @@ -381,20 +537,64 @@ class TORCH_CUDA_CU_API GatherOp : public Expr { return pad_width_; } + bool hasPadding() const { + return std::any_of(pad_width_.begin(), pad_width_.end(), [](const auto& p) { + return p[0] > 0 || p[1] > 0; + }); + } + bool sameAs(const Statement* other) const override; private: Val* const out_ = nullptr; Val* const in_ = nullptr; //! Shape of a window gathered for each element. - std::vector window_shape_; + std::vector window_shape_; //! The size of zero-padding of each axis. 
- std::vector> pad_width_; + std::vector> pad_width_; +}; + +class TORCH_CUDA_CU_API ViewAsScalar : public Expr { + public: + ViewAsScalar( + IrBuilderPasskey, + Val* out, + Val* in, + IterDomain* vector_id, + Val* index = nullptr); + + ViewAsScalar(const ViewAsScalar* src, IrCloner* ir_cloner); + + Val* out() const { + return out_; + } + + Val* in() const { + return in_; + } + + IterDomain* vector_id() const { + return vector_id_; + } + + Val* index() const { + return index_; + } + + private: + Val* const out_ = nullptr; + Val* const in_ = nullptr; + + // The IterDomain of type VectorComponent newly appended to the output + IterDomain* vector_id_ = nullptr; + + // The index that vector_id_ is lowered into + Val* index_ = nullptr; }; class TORCH_CUDA_CU_API ViewOp : public Expr { public: - ViewOp(TensorView* out, TensorView* in); + ViewOp(IrBuilderPasskey, TensorView* out, TensorView* in); ViewOp(const ViewOp* src, IrCloner* ir_cloner); @@ -422,39 +622,37 @@ class IndexReferenceReplay; class TORCH_CUDA_CU_API IterDomain : public Val { public: IterDomain( + IrBuilderPasskey, Val* start, Val* extent, ParallelType parallel_type = ParallelType::Serial, IterType iter_type = IterType::Iteration, - bool is_rfactor_domain = false); + bool is_rfactor_domain = false, + bool is_padded_dimension = false, + c10::optional padded_to_size_ = c10::nullopt, + bool is_mma_swizzled = false); + // Same as the above but can set the offset of the stop point IterDomain( + IrBuilderPasskey, Val* start, Val* extent, Val* stop_offset, ParallelType parallel_type = ParallelType::Serial, IterType iter_type = IterType::Iteration, - bool is_rfactor_domain = false); + bool is_rfactor_domain = false, + bool is_padded_dimension = false, + c10::optional padded_to_size_ = c10::nullopt, + bool is_mma_swizzled = false); IterDomain(const IterDomain* src, IrCloner* ir_cloner); bool sameAs(const Statement* other) const override; - // Returns a new IterDomain matching properties of this - // TODO: parallel_method->getParallelType - IterDomain* clone() const { - auto cloned = new IterDomain( - start(), - extent(), - stopOffset(), - getParallelType(), - getIterType(), - isRFactorProduct()); - - cloned->is_padded_dimension_ = is_padded_dimension_; - cloned->padded_to_size_ = padded_to_size_; - return cloned; - } + //! Returns a new IterDomain matching properties of this + //! + //! This does NOT copy the is_rfactor_domain flag. + IterDomain* cloneWithoutRFactor() const; //! Clone a vector domains static std::vector clone( @@ -504,6 +702,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val { return getIterType() == IterType::Stride; } + bool isVectorComponent() const { + return getIterType() == IterType::VectorComponent; + } + bool isParallelized() const { return getParallelType() != ParallelType::Serial; } @@ -631,6 +833,55 @@ class TORCH_CUDA_CU_API IterDomain : public Val { //! domain. std::pair stridedSplit(int factor); + // TODO: Remove + bool isSimple() const { + return definition() == nullptr; + } + + //! Marks that this id represents a + //! instruction loop, mma use only. + //! + //! An instruction loop can be considered a generalization of + //! vectorization. It also represents a loop that's implemented + //! by an instruction and should not be realized by codegen and + //! cannot be inlined with. + //! As an example, if a mma macro, call it mma_eg implements: + //! for m in M + //! for n in N + //! for k in K + //! C[m,n] += A[m,k]*B[k,n], + //! But the generated code should simply be: + //! mma_eg(C,A,B) + //! 
without the 3 level loopnest, i.e. they're instruction loops. + //! + //! In the actual mma macros, the loopnests it implements is a + //! transformed version of above to match the mma swizzle. + //! So it's different implicit loopnest for different macros. + //! WarpMmaSwizzler will label the instruction loops case-by-case. + bool isMma() const { + return parallel_type_ == ParallelType::Mma; + } + + bool isMmaSwizzled() const { + return is_mma_swizzled_; + } + + //! Used by WarpMmaSwizzler, this is an utility for WarpMmaSwizzler + //! to lock the thread swizzled iterdomains. + //! Only true for the iterdomains produced by WarpMmaSwizzler. + //! Mma ops require specific swizzle patterns + //! and this label utility is to prevent any further transform on the + //! iterdomains involved in the swizzle so that the pattern remain correct in + //! generated code. + //! + //! Note: + //! Used only through WarpMmaSwizzler only and mma validation relies on + //! this + //! flag being set on the correct iterdomains. + void toMmaSwizzled() { + is_mma_swizzled_ = true; + } + protected: friend TensorDomain; friend ReplayTransformations; @@ -647,6 +898,15 @@ class TORCH_CUDA_CU_API IterDomain : public Val { bool is_rfactor_domain_ = false; bool is_padded_dimension_ = false; c10::optional padded_to_size_ = c10::nullopt; + + // TODO: Remove only used in kernel IR because IterDomains don't maintain + // definitions of split/merge. + bool is_simple_ = true; + + //! Tracks if this id represents a thread swizzled loop or + //! models an implicit loop within instructions. Should not make + //! any changes once an id is warp mapped. + bool is_mma_swizzled_ = false; }; //! TensorDomain holds a vector of IterDomains. It holds an IterDomain for every @@ -666,15 +926,18 @@ class TORCH_CUDA_CU_API IterDomain : public Val { class TORCH_CUDA_CU_API TensorDomain : public Val { public: explicit TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector contiguity = std::vector()); TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector domain, std::vector contiguity = std::vector()); TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector rfactor_domain, std::vector domain, @@ -718,8 +981,14 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { bool hasReduction() const; bool hasBlockReduction() const; bool hasGridReduction() const; + bool hasBlockBroadcast() const; + bool hasGridBroadcast() const; bool hasBroadcast() const; bool hasRFactor() const; + + // Returns if rfactor domain only consists of id's of iter type. + bool hasViewLikeRFactor() const; + bool hasVectorize() const; c10::optional getReductionAxis() const; @@ -786,6 +1055,8 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { TensorDomain* view( const std::vector>& transforms); + TensorDomain* flatten(int64_t start_dim, int64_t end_dim); + static std::vector orderedAs( const std::vector& td, const std::unordered_map& old2new); @@ -821,6 +1092,7 @@ class TORCH_CUDA_CU_API Split : public Expr { // start_offset and stop_offset are distance from the left end and // right ends, respectively. Split( + IrBuilderPasskey, IterDomain* outer, IterDomain* inner, IterDomain* in, @@ -881,12 +1153,13 @@ class TORCH_CUDA_CU_API Split : public Expr { //! dictate which will be traversed first (inner). Both IterDomains must be of //! the same iter or reduction type, as well as the same parallelization //! strategy if there is one -//! -//! \todo Should this be a unary op type? -//! 
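// [Editor's note] Illustrative sketch only, not part of this patch: Split and
// Merge expressions are normally not constructed directly; they are recorded
// on a TensorView's domain by the scheduling calls. `tv` is a hypothetical
// two-dimensional TensorView with root domain [I0, I1].
//
//   tv->merge(0, 1);   // leaves a Merge expr:  I0, I1  ->  I0*I1
//   tv->split(0, 32);  // leaves a Split expr:  I0*I1   ->  I0*I1/32, 32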
class TORCH_CUDA_CU_API Merge : public Expr { public: - Merge(IterDomain* out, IterDomain* outer, IterDomain* inner); + Merge( + IrBuilderPasskey, + IterDomain* out, + IterDomain* outer, + IterDomain* inner); Merge(const Merge* src, IrCloner* ir_cloner); @@ -918,9 +1191,7 @@ class TORCH_CUDA_CU_API Merge : public Expr { //! class TORCH_CUDA_CU_API NamedScalar : public Val { public: - // NOLINTNEXTLINE(modernize-pass-by-value) - NamedScalar(std::string name, DataType dtype) - : Val(ValType::NamedScalar, dtype), name_(name) {} + NamedScalar(IrBuilderPasskey passkey, std::string name, DataType dtype); NamedScalar(const NamedScalar* src, IrCloner* ir_cloner); @@ -931,9 +1202,11 @@ class TORCH_CUDA_CU_API NamedScalar : public Val { bool sameAs(const Statement* other) const override; //! Return the named scalar extent of a parallel dimension (e.g. blockDim.x) + //! WARNING: Only works with Fusion container at the moment static NamedScalar* getParallelDim(ParallelType p_type); //! Return the named scalar index of a parallel dimension (e.g. threadIdx.x) + //! WARNING: Only works with Fusion container at the moment static NamedScalar* getParallelIndex(ParallelType p_type); //! Return the parallel type of this NamedScalar if it is an extent of a diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index a553c59fc2b0..0b83e07f784b 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -14,6 +15,23 @@ namespace jit { namespace fuser { namespace cuda { +namespace { +const char* boolLiteral(bool value) { + return value ? "true" : "false"; +} + +std::string varName(const Val* val) { + std::stringstream value_name; + if (val == nullptr) { + value_name << "$nullptr"; + } else { + value_name << val->name(); + } + return value_name.str(); +} + +} // namespace + // Make sure we can inline something, before we attempt to. 
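// [Editor's note] Illustrative sketch only, not part of this patch: IrPrinter
// is usually reached indirectly. Assuming the stream operators declared in
// ir_iostream.h and a populated Fusion `fusion` with an output TensorView
// `tv` (both hypothetical here):
//
//   fusion.printMath();           // prints the whole fusion via IrPrinter
//   std::cout << tv << "\n";      // prints a single statement
//   IrPrinter printer(std::cout);
//   printer.handle(&fusion);      // equivalent explicit form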
static void checkInlineable(const Expr* expr) { for (auto input : expr->inputs()) { @@ -49,55 +67,52 @@ void IrPrinter::handle(Fusion* fusion) { } } -void IrPrinter::handle(const TensorDomain* td) { - if (td->nDims() == 0) { - os_ << "[ 0 ]"; - return; +void IrPrinter::handle(const kir::Kernel* kernel) { + TORCH_CHECK(kernel != nullptr); + + // kernel declaration + os_ << "\nKERNEL ("; + for (auto in : kernel->inputs()) { + handle(in); + if (in != kernel->inputs().back()) { + os_ << ", "; + } } - os_ << "[ "; - for (const auto i : c10::irange(td->nDims())) { - handle(td->axis(i)); - if (i != td->nDims() - 1) + os_ << ") -> ("; + for (auto out : kernel->outputs()) { + handle(out); + if (out != kernel->outputs().back()) { os_ << ", "; + } } - os_ << " ]"; + os_ << ") :\n"; + + // kernel body + indent_size_++; + for (auto expr : kernel->topLevelExprs()) { + handle(expr); + } + indent_size_--; + os_ << "END.\n\n"; } -void IrPrinter::handle(const TensorView* tv) { - if (tv->nDims() == 0) { - os_ << typePrefix(tv->getDataType().value()) << tv->name(); - } else { - os_ << "T" << tv->name(); - switch (tv->getMemoryType()) { - case MemoryType::Global: - os_ << "_g"; - break; - case MemoryType::Shared: - os_ << "_s"; - break; - case MemoryType::Local: - os_ << "_l"; - break; - } - handle(tv->domain()); +void IrPrinter::handle(kir::Kernel& kernel) { + handle(&kernel); +} - if (tv->getComputeAtPosition() > 0) { - os_ << " ca_pos( "; - os_ << tv->getComputeAtPosition(); - os_ << " )"; - } - if (tv->getMaxProducerPosition() > 0) { - os_ << " produce_pos( "; - os_ << tv->getMaxProducerPosition(); - os_ << ")"; - } +void IrPrinter::handleScope(const kir::Scope& scope) { + // Save the uses of the parent scope + indent_size_++; + for (auto expr : scope.exprs()) { + handle(expr); } + indent_size_--; } void IrPrinter::handle(const IterDomain* id) { os_ << id->getIterType(); os_ << id->getParallelType(); - os_ << id->name(); + os_ << varName(id); os_ << "{"; if (!id->start()->isZeroInt()) { print_inline(id->start()); @@ -116,6 +131,47 @@ void IrPrinter::handle(const IterDomain* id) { } } +void IrPrinter::handle(const TensorDomain* td) { + if (td->nDims() == 0) { + os_ << "[ 0 ]"; + return; + } + os_ << "[ "; + for (const auto i : c10::irange(td->nDims())) { + handle(td->axis(i)); + if (i != td->nDims() - 1) + os_ << ", "; + } + os_ << " ]"; +} + +void IrPrinter::handle(const TensorView* tv) { + os_ << "T" << varName(tv); + switch (tv->getMemoryType()) { + case MemoryType::Global: + os_ << "_g"; + break; + case MemoryType::Shared: + os_ << "_s"; + break; + case MemoryType::Local: + os_ << "_l"; + break; + } + handle(tv->domain()); + + if (tv->getComputeAtPosition() > 0) { + os_ << " ca_pos( "; + os_ << tv->getComputeAtPosition(); + os_ << " )"; + } + if (tv->getMaxProducerPosition() > 0) { + os_ << " produce_pos( "; + os_ << tv->getMaxProducerPosition(); + os_ << ")"; + } +} + void IrPrinter::handle(const Bool* b) { if (print_inline_ && b->definition() != nullptr) { os_ << "( "; @@ -124,10 +180,9 @@ void IrPrinter::handle(const Bool* b) { return; } - if (b->isSymbolic()) { - os_ << "b" << b->name(); - } else { - os_ << "bool(" << *(b->value()) << ")"; + os_ << "b" << varName(b); + if (b->isConst()) { + os_ << "(" << (b->value().value() ? 
"true" : "false") << ")"; } } @@ -140,7 +195,7 @@ void IrPrinter::handle(const Double* d) { } if (d->isSymbolic()) { - os_ << "d" << d->name(); + os_ << "d" << varName(d); } else { os_ << "double(" << std::setprecision( @@ -160,30 +215,39 @@ void IrPrinter::handle(const Int* i) { } if (i->isSymbolic()) { - os_ << "i" << i->name(); + os_ << "i" << varName(i); } else { os_ << *(i->value()); } } -void IrPrinter::handle(const NamedScalar* i) { - os_ << i->name(); -} +void IrPrinter::handle(const ComplexDouble* c) { + if (print_inline_) { + if (auto def = c->definition()) { + os_ << "( "; + handle(def); + os_ << " )"; + return; + } + } -static bool isTV(const Val* val) { - return val->getValType().value() == ValType::TensorView; + if (c->isSymbolic()) { + os_ << "c" << varName(c); + } else { + os_ << "std::complex" + << std::setprecision(std::numeric_limits::max_digits10) + << *(c->value()); + } } -// Check if we're a TensorView op that we can generate code for. -static bool isTVOp(const Expr* expr) { - return expr->outputs().size() == 1 && isTV(expr->outputs().front()); +void IrPrinter::handle(const NamedScalar* ns) { + os_ << ns->name(); } void IrPrinter::handle(const UnaryOp* uop) { - bool istvop = isTVOp(uop); + bool istvop = ir_utils::isTvOp(uop); if (!print_inline_) { - indent(); - os_ << uop->out(); + indent() << uop->out(); if (istvop) { os_ << "\n"; indent_size_++; @@ -230,10 +294,9 @@ void IrPrinter::handle(const UnaryOp* uop) { } void IrPrinter::handle(const BinaryOp* bop) { - bool istvop = isTVOp(bop); + bool istvop = ir_utils::isTvOp(bop); if (!print_inline_) { - indent(); - os_ << bop->out(); + indent() << bop->out(); // tensor operations tend to be long, break them up into multiple lines if (istvop) { @@ -286,7 +349,7 @@ void IrPrinter::handle(const BinaryOp* bop) { } void IrPrinter::handle(const TernaryOp* top) { - bool istvop = isTVOp(top); + bool istvop = ir_utils::isTvOp(top); if (!print_inline_) { indent(); os_ << top->out(); @@ -327,18 +390,32 @@ void IrPrinter::handle(const TernaryOp* top) { } void IrPrinter::handle(const ReductionOp* rop) { - indent(); - os_ << rop->out() << " = reduction( " << rop->in() - << ", op = " << rop->getReductionOpType() - << ", initial value = " << rop->init() << " )\n"; + indent() << rop->out() << "\n"; + indent() << " = reduction( " << rop->in() + << ", op = " << rop->getReductionOpType() + << ", initial value = " << rop->init() + << ", allreduce = " << rop->isAllreduce() << " )\n"; +} + +void IrPrinter::handle(const GroupedReductionOp* grouped_rop) { + indent() << "Grouped reduction(\n"; + ++indent_size_; + for (const auto i : c10::irange(grouped_rop->numReductions())) { + indent() << grouped_rop->output(i) << " = reduction( " + << grouped_rop->input(i) + << ", op = " << grouped_rop->getReductionOpType(i) + << ", initial value = " << grouped_rop->initVal(i) << " )\n"; + } + indent() << "allreduce = " << (grouped_rop->isAllreduce() ? 
"true" : "false") + << " )\n"; + --indent_size_; } void IrPrinter::handle(const WelfordOp* wop) { - indent(); - os_ << wop->outAvg() << "(Avg),\n" - << wop->outVar() << "(Var),\n" - << wop->outN() << "(Count)" - << "\n = Welford ( "; + indent() << wop->outAvg() << "(Avg),\n" + << wop->outVar() << "(Var),\n" + << wop->outN() << "(Count)" + << "\n = Welford ( "; if (wop->singleValue()) { os_ << wop->inAvg() << "(Avg), "; } else { @@ -349,28 +426,59 @@ void IrPrinter::handle(const WelfordOp* wop) { os_ << "\n initial value = " << wop->initAvg() << "(Avg)\n " << wop->initVar() << "(Var)\n " << wop->initN() << "(N)"; } + os_ << "\n allreduce = " << wop->isAllreduce(); os_ << " )\n"; } void IrPrinter::handle(const BroadcastOp* bop) { - indent(); - os_ << bop->out() << " = broadcast( " << bop->in() << " )\n"; + indent() << bop->out() << "\n"; + indent() << " = broadcast( " << bop->in() << " )\n"; +} + +void IrPrinter::handle(const Split* s) { + os_ << (s->innerSplit() ? "Split: " : "Outer split: "); + handle(s->in()); + os_ << " by factor " << s->factor() << " -> "; + handle(s->outer()); + os_ << ", "; + handle(s->inner()); + if (s->startOffset()) { + os_ << ", start offset: "; + handle(s->startOffset()); + } + if (s->stopOffset()) { + os_ << ", stop offset: "; + handle(s->stopOffset()); + } + os_ << "\n"; +} + +void IrPrinter::handle(const Merge* m) { + os_ << "Merge: "; + handle(m->outer()); + os_ << " and "; + handle(m->inner()); + os_ << " -> "; + handle(m->out()); + os_ << "\n"; } void IrPrinter::handle(const TransposeOp* top) { - indent(); - os_ << top->out() << " = transpose( " << top->in() << " )\n"; + indent() << top->out() << " = transpose( " << top->in() << " )\n"; } void IrPrinter::handle(const ShiftOp* sop) { - indent(); - os_ << sop->out() << " = shift( " << sop->in() << ", {" << sop->offsets() - << "}, padding = " << (sop->pad() ? 
"true" : "false") << " )\n"; + indent() << sop->out() << " = shift( " << sop->in() << ", {" << sop->offsets() + << "}, {" << sop->padWidth() << "} )\n"; +} + +void IrPrinter::handle(const MmaOp* mma) { + indent() << mma->out() << " = mma(" << mma->inA() << "," << mma->inB(); + os_ << ")\n"; } void IrPrinter::handle(const GatherOp* op) { - indent(); - os_ << op->out() << " = gather( " << op->in() << ", {"; + indent() << op->out() << " = gather( " << op->in() << ", {"; bool no_comma = true; for (const auto& s : op->windowShape()) { if (!no_comma) { @@ -391,37 +499,261 @@ void IrPrinter::handle(const GatherOp* op) { os_ << "} )\n"; } +void IrPrinter::handle(const ViewAsScalar* top) { + indent() << top->out() << " = view_as_scalar( " << top->in() << ", " + << top->vector_id() << " )\n"; +} + void IrPrinter::handle(const ViewOp* top) { + indent() << top->out() << " = view( " << top->in() << " )\n"; +} + +void IrPrinter::handle(const kir::Predicate* node) { + switch (node->predicate_type()) { + case PredicateType::Manual: { + os_ << node->value(); + break; + } + default: + os_ << node->predicate_type(); + break; + } +} + +void IrPrinter::handle(const kir::TensorIndex* ti) { + os_ << "T" << varName(ti); + switch (ti->view()->getMemoryType()) { + case MemoryType::Global: + os_ << "_g"; + break; + case MemoryType::Shared: + os_ << "_s"; + break; + case MemoryType::Local: + os_ << "_l"; + break; + } + os_ << "["; + for (auto index : ti->indices()) { + print_inline(index); + if (index != ti->indices().back()) { + os_ << ", "; + } + } + os_ << "]"; + os_ << " view( T" << varName(ti->view()) << " )"; +} + +void IrPrinter::handle(const kir::Allocate* node) { indent(); - os_ << top->out() << " = view( " << top->in() << " )\n"; + handle(node->buffer()); + os_ << " = ALLOCATE(" + << "mem_type=" << node->memoryType() << ", " + << "size="; + print_inline(node->size()); + os_ << ", " + << "zero_init=" << boolLiteral(node->zeroInit()) << ")\n"; + if (node->alias() != nullptr) { + indent() << kTab << ".alias="; + handle(node->alias()->buffer()); + os_ << "\n"; + } } -void IrPrinter::handle(const Split* s) { - os_ << (s->innerSplit() ? 
"Split: " : "Outer split: "); - handle(s->in()); - os_ << " by factor " << s->factor() << " -> "; - handle(s->outer()); - os_ << ", "; - handle(s->inner()); - if (s->startOffset()) { - os_ << ", start offset: "; - handle(s->startOffset()); +void IrPrinter::handle(const kir::BlockSync* node) { + indent() << "BLOCKSYNC(war_hazard=" << boolLiteral(node->isWarHazardSync()) + << ")\n"; +} + +void IrPrinter::handle(const kir::GridSync* node) { + indent() << "GRIDSYNC(" << node->syncDims().toString() << ", "; + handle(node->syncBuffer()); + os_ << ")\n"; +} + +void IrPrinter::handle(const kir::ForLoop* node) { + indent() << "FOR "; + handle(node->index()); + os_ << " in "; + handle(node->iter_domain()); + os_ << ":\n"; + handleScope(node->body()); +} + +void IrPrinter::handle(const kir::IfThenElse* node) { + indent() << "IF "; + handle(node->predicate()); + os_ << ":\n"; + handleScope(node->thenBody()); + if (node->hasElse()) { + indent() << "ELSE:\n"; + handleScope(node->elseBody()); } - if (s->stopOffset()) { - os_ << ", stop offset: "; - handle(s->stopOffset()); +} + +void IrPrinter::handle(const kir::GridBroadcast* node) { + const auto* broadcast_op = node->broadcast_op(); + indent(); + handle(broadcast_op->out()); + os_ << " = " + << "GRID_BROADCAST(in="; + handle(broadcast_op->in()); + os_ << ")\n"; + indent() << kTab << ".broadcast_buffer="; + handle(node->broadcast_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); + os_ << "\n"; +} + +void IrPrinter::handle(const kir::GridReduction* node) { + indent(); + handle(node->out()); + os_ << " = " + << "GRID_REDUCTION(op='" << node->getReductionOpType() << "'" + << ", in="; + handle(node->in()); + os_ << ", init="; + handle(node->init()); + os_ << ", read_pred="; + if (node->predicate() != nullptr) { + handle(node->predicate()); + } else { + os_ << "nullptr"; + } + os_ << ")\n"; + os_ << ", write_pred="; + if (node->writePredicate() != nullptr) { + handle(node->writePredicate()); + } else { + os_ << "nullptr"; } + os_ << ")\n"; + indent() << kTab << ".reduction_buffer="; + handle(node->reduction_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); os_ << "\n"; } -void IrPrinter::handle(const Merge* m) { - os_ << "Merge: "; - handle(m->outer()); - os_ << " and "; - handle(m->inner()); - os_ << " -> "; - handle(m->out()); +void IrPrinter::handle(const kir::GroupedGridReduction* node) { + indent() << "Grouped grid reduction(\n"; + ++indent_size_; + for (const auto i : c10::irange(node->numReductions())) { + indent(); + handle(node->output(i)); + os_ << " = " + << "reduction(op='" << node->getReductionOpType(i) << "'" + << ", in="; + handle(node->input(i)); + os_ << ", init="; + handle(node->initVal(i)); + os_ << "\n"; + } + indent() << kTab << ".read_pred="; + if (node->predicate() != nullptr) { + handle(node->predicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; + indent() << kTab << ".write_pred="; + if (node->writePredicate() != nullptr) { + handle(node->writePredicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; + for (const auto i : c10::irange(node->numReductions())) { + indent() << kTab << ".reduction_buffer="; + handle(node->reduction_buffers().at(i)->buffer()); + os_ << "\n"; + } + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); + os_ << "\n"; +} + +void IrPrinter::handle(const kir::GridWelford* node) { + const auto* welford_op = node->welford_op(); + indent(); + 
handle(welford_op->outVar()); + os_ << ","; + handle(welford_op->outAvg()); + os_ << ","; + handle(welford_op->outN()); + os_ << " = " + << "GRID_WELFORD(" + << "inAvg="; + handle(welford_op->inAvg()); + if (!welford_op->inN()->isOneInt()) { + indent() << ", inVar="; + handle(welford_op->inVar()); + } + indent() << ", inN="; + handle(welford_op->inN()); + if (!welford_op->initN()->isZeroInt()) { + indent() << ", initVar="; + handle(welford_op->initVar()); + os_ << " initAvg="; + handle(welford_op->initAvg()); + os_ << " initN="; + handle(welford_op->initN()); + } + indent() << ", read_pred="; + if (welford_op->predicate() != nullptr) { + handle(welford_op->predicate()); + } else { + os_ << "nullptr"; + } + os_ << ")\n"; + indent() << ", write_pred="; + if (welford_op->writePredicate() != nullptr) { + handle(welford_op->writePredicate()); + } else { + os_ << "nullptr"; + } + os_ << ")\n"; + indent() << kTab << ".var_buffer="; + handle(node->var_buffer()->buffer()); + os_ << ".avg_buffer="; + handle(node->avg_buffer()->buffer()); + os_ << ".n_buffer="; + handle(node->N_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); os_ << "\n"; + indent() << kTab << ".grid_read_pred="; + if (node->predicate() != nullptr) { + handle(node->predicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; + indent() << kTab << ".grid_write_pred="; + if (node->writePredicate() != nullptr) { + handle(node->writePredicate()); + } else { + os_ << "nullptr"; + } + os_ << "\n"; +} + +void IrPrinter::handle(const kir::InitMagicZero* node) { + indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; +} + +void IrPrinter::handle(const kir::UpdateMagicZero* node) { + indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; +} + +void IrPrinter::handle(const kir::AllocateFusedReduction* node) { + indent() << "AllocateFusedReduction(reduction buffer="; + handle(node->out()); + os_ << ")\n"; } void IrTransformPrinter::handle(Fusion* f) { @@ -450,7 +782,7 @@ void IrTransformPrinter::printTransforms(TensorView* tv) { os() << ")\n"; for (auto exp : all_exp) { - os() << " "; + os() << " "; IrPrinter::handle(exp); } } diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.h b/torch/csrc/jit/codegen/cuda/ir_iostream.h index c080c3f8f993..f5ccf6fc5ac9 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.h +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -13,21 +13,30 @@ namespace jit { namespace fuser { namespace cuda { +class Fusion; +namespace kir { +class Kernel; +class Scope; +} // namespace kir + //! Define pretty printing functions for IR nodes //! //! This class is intended for debug printing, so it attempts //! to handle invalid states as well. //! 
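Concretely, the printer declared just below is meant to be constructed over any std::ostream and then pointed at the IR to dump. A minimal sketch of that use (not taken from this diff; it assumes a populated Fusion named `fusion` and that <iostream> is available):

    #include <iostream>

    void debugDump(Fusion& fusion) {
      IrPrinter printer(std::cout);
      printer.handle(&fusion);  // pretty-prints the fusion math IR
      // After lowering, the kir::Kernel* overload added in this change
      // accepts a kernel pointer in the same way.
    }
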
class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { + static constexpr char const* kTab = " "; + public: explicit IrPrinter(std::ostream& os) : os_(os) {} // Indent the generated code - void indent() { + std::ostream& indent() { for (const auto i : c10::irange(indent_size_)) { (void)i; // Suppress unused variable warning os_ << " "; } + return os_; } void resetIndent() { @@ -38,6 +47,8 @@ class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { return print_inline_; } + using OptInConstDispatch::handle; + virtual void handle(Fusion* f); // handle calls some non const fusion ops, @@ -52,30 +63,57 @@ class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { handle(&f); } - void handle(const Statement* s) override; - void handle(const Val* v) override; - void handle(const Expr* e) override; - - void handle(const TensorDomain*) override; - void handle(const TensorView*) override; - void handle(const IterDomain*) override; - - void handle(const Bool*) override; - void handle(const Double*) override; - void handle(const Int*) override; - void handle(const NamedScalar*) override; - - void handle(const UnaryOp*) override; - void handle(const BinaryOp*) override; - void handle(const TernaryOp*) override; - void handle(const ReductionOp*) override; - void handle(const WelfordOp*) override; - void handle(const BroadcastOp*) override; - void handle(const TransposeOp*) override; - void handle(const ShiftOp*) override; - void handle(const GatherOp*) override; - void handle(const ViewOp*) override; - + virtual void handle(const kir::Kernel* kernel); + virtual void handle(kir::Kernel& kernel); + + void handleScope(const kir::Scope& scope); + + void handle(const Statement* s) final; + void handle(const Val* v) final; + void handle(const Expr* e) final; + + void handle(const IterDomain*) final; + void handle(const TensorDomain*) final; + void handle(const TensorView*) final; + + void handle(const Bool*) final; + void handle(const Double*) final; + void handle(const Int*) final; + void handle(const ComplexDouble*) final; + void handle(const NamedScalar*) final; + + void handle(const UnaryOp*) final; + void handle(const BinaryOp*) final; + void handle(const TernaryOp*) final; + void handle(const ReductionOp*) final; + void handle(const GroupedReductionOp*) final; + void handle(const WelfordOp*) final; + void handle(const MmaOp*) final; + void handle(const BroadcastOp*) final; + void handle(const TransposeOp*) final; + void handle(const ShiftOp*) final; + void handle(const GatherOp*) final; + void handle(const ViewAsScalar*) final; + void handle(const ViewOp*) final; + + void handle(const kir::Predicate*) final; + void handle(const kir::TensorIndex*) final; + + void handle(const kir::GridBroadcast*) final; + void handle(const kir::GridReduction*) final; + void handle(const kir::GroupedGridReduction*) final; + void handle(const kir::GridWelford*) final; + void handle(const kir::ForLoop*) final; + void handle(const kir::IfThenElse*) final; + void handle(const kir::Allocate*) final; + void handle(const kir::BlockSync*) final; + void handle(const kir::GridSync*) final; + void handle(const kir::InitMagicZero*) final; + void handle(const kir::UpdateMagicZero*) final; + void handle(const kir::AllocateFusedReduction*) final; + + // IR math printer overrides these to prevent them from printing, keep + // override void handle(const Split*) override; void handle(const Merge*) override; diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp index 
1465a88bef32..543fdd0941fa 100644 --- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -38,19 +40,19 @@ class ScalarCheck : OptInConstDispatch { } private: - void handle(const Bool* b) override { + void handle(const Bool* b) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const Double* d) override { + void handle(const Double* d) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const Int* i) override { + void handle(const Int* i) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const NamedScalar* ns) override { + void handle(const NamedScalar* ns) final { same_ = v1_->as()->sameAs(v2_->as()); } @@ -70,6 +72,16 @@ bool areEqualScalars(Val* v1, Val* v2) { return ScalarCheck::sameAs(v1, v2); } +Bool::Bool(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Bool), + maybe_value_{c10::nullopt} {} + +Bool::Bool(IrBuilderPasskey passkey, bool value) + : Val(passkey, ValType::Scalar, DataType::Bool), maybe_value_{value} {} + +Bool::Bool(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Bool), maybe_value_{value} {} + Bool::Bool(const Bool* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -87,6 +99,16 @@ bool Bool::sameAs(const Statement* other) const { return false; } +Double::Double(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Double), + maybe_value_{c10::nullopt} {} + +Double::Double(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::Double), maybe_value_{value} {} + +Double::Double(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Double), maybe_value_{value} {} + Double::Double(const Double* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -103,6 +125,16 @@ bool Double::sameAs(const Statement* other) const { return false; } +Int::Int(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Int), + maybe_value_{c10::nullopt} {} + +Int::Int(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::Int), maybe_value_{value} {} + +Int::Int(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Int), maybe_value_{value} {} + Int::Int(const Int* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -120,11 +152,43 @@ bool Int::sameAs(const Statement* other) const { return false; } -UnaryOp::UnaryOp(UnaryOpType type, Val* out, Val* in) - : Expr(ExprType::UnaryOp), unary_op_type_{type}, out_{out}, in_{in} { +ComplexDouble::ComplexDouble(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::ComplexDouble), + maybe_value_{c10::nullopt} {} + +ComplexDouble::ComplexDouble(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::ComplexDouble), + maybe_value_{value} {} + +ComplexDouble::ComplexDouble( + IrBuilderPasskey passkey, + c10::optional value) + : Val(passkey, ValType::Scalar, DataType::ComplexDouble), + maybe_value_{value} {} + +ComplexDouble::ComplexDouble(const ComplexDouble* src, IrCloner* ir_cloner) + : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} + +bool ComplexDouble::sameAs(const Statement* other) const { + if (this == other) { + return true; + } + if (!other->isA()) { + return false; + } + const auto 
other_complex = other->as(); + if (isConst() && other_complex->isConst()) + return *value() == *(other_complex->value()); + return false; +} + +UnaryOp::UnaryOp(IrBuilderPasskey passkey, UnaryOpType type, Val* out, Val* in) + : Expr(passkey, ExprType::UnaryOp), + unary_op_type_{type}, + out_{out}, + in_{in} { addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } UnaryOp::UnaryOp(const UnaryOp* src, IrCloner* ir_cloner) @@ -146,8 +210,13 @@ bool UnaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) - : Expr(ExprType::BinaryOp), +BinaryOp::BinaryOp( + IrBuilderPasskey passkey, + BinaryOpType type, + Val* out, + Val* lhs, + Val* rhs) + : Expr(passkey, ExprType::BinaryOp), binary_op_type_{type}, out_{out}, lhs_{lhs}, @@ -155,7 +224,6 @@ BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) addOutput(out); addInput(lhs); addInput(rhs); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } BinaryOp::BinaryOp(const BinaryOp* src, IrCloner* ir_cloner) @@ -178,8 +246,14 @@ bool BinaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3) - : Expr(ExprType::TernaryOp), +TernaryOp::TernaryOp( + IrBuilderPasskey passkey, + TernaryOpType type, + Val* out, + Val* in1, + Val* in2, + Val* in3) + : Expr(passkey, ExprType::TernaryOp), ternary_op_type_{type}, out_{out}, in1_{in1}, @@ -189,7 +263,6 @@ TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3) addInput(in1); addInput(in2); addInput(in3); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } TernaryOp::TernaryOp(const TernaryOp* src, IrCloner* ir_cloner) @@ -213,8 +286,12 @@ bool TernaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -BroadcastOp::BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims) - : Expr(ExprType::BroadcastOp), +BroadcastOp::BroadcastOp( + IrBuilderPasskey passkey, + Val* out, + Val* in, + std::vector is_broadcast_dims) + : Expr(passkey, ExprType::BroadcastOp), out_(out), in_(in), is_broadcast_dims_(std::move(is_broadcast_dims)) { @@ -226,12 +303,18 @@ BroadcastOp::BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims) auto in_type = in->getValType().value(); TORCH_INTERNAL_ASSERT( - out_type == ValType::TensorView && in_type == ValType::TensorView, + (out_type == ValType::TensorView && in_type == ValType::TensorView) || + (out_type == ValType::TensorIndex && in_type == ValType::TensorIndex), "Cannot braodcast a non-tensor object."); addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); + + if (!out->isA() || !in->isA()) { + return; + } + + passkey.ir_container_->registerExpr(exprPasskey(), this); // This is a generic check that root dims of a consumer and producer match. // Maybe we shouldn't relegate it to this constructor. 
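The same refactoring repeats through the rest of this file: constructors gain an IrBuilderPasskey parameter and the trailing `name_ = FusionGuard::getCurFusion()->registerExpr(this);` lines disappear, because registration now happens when a node is created through the builder. A rough sketch of what creation looks like after this change (hypothetical values; it assumes an active FusionGuard and already-created Vals `out`, `lhs`, `rhs`):

    // Previously: auto* add = new BinaryOp(BinaryOpType::Add, out, lhs, rhs);
    // Now the builder supplies the passkey and registers the expression:
    auto* add = IrBuilder::create<BinaryOp>(BinaryOpType::Add, out, lhs, rhs);
    // An explicit container can also be passed as the first argument, as the
    // ir_utils.cpp hunks later in this diff do with expr->container().
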
@@ -294,37 +377,100 @@ bool BroadcastOp::sameAs(const Statement* other) const { } ReductionOp::ReductionOp( + IrBuilderPasskey passkey, BinaryOpType reduction_op_type, Val* init, Val* out, - Val* in) - : Expr(ExprType::ReductionOp), + Val* in, + bool is_allreduce, + ExprType expr_type) + : Expr(passkey, expr_type), reduction_op_type_(reduction_op_type), init_(init), out_(out), - in_(in) { - TORCH_CHECK(out->getValType().value() == ValType::TensorView); + in_(in), + is_allreduce_(is_allreduce) { + TORCH_CHECK( + out->getValType().value() == ValType::TensorView || + out->getValType().value() == ValType::TensorIndex); TORCH_INTERNAL_ASSERT( - in->getValType() == ValType::TensorView && - out->getValType() == ValType::TensorView, + (in->getValType() == ValType::TensorView && + out->getValType() == ValType::TensorView) || + (in->getValType() == ValType::TensorIndex && + out->getValType() == ValType::TensorIndex), "Reduction operation was created that does not have tensor inputs and outputs."); - TORCH_INTERNAL_ASSERT( - TensorDomain::noReductions(in->as()->getMaybeRFactorDomain()) - .size() == out->as()->getRootDomain().size(), - "Reduction operation created with mismatched domains."); - + if (in->isA()) { + TORCH_INTERNAL_ASSERT( + TensorDomain::noReductions( + in->as()->getMaybeRFactorDomain()) + .size() == out->as()->getRootDomain().size(), + "Reduction operation created with mismatched domains."); + } TORCH_INTERNAL_ASSERT( init->isConstScalar(), "Tried to create a reduction operation whith an initial value that isn't a constant."); addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); +} + +GroupedReductionOp::GroupedReductionOp( + IrBuilderPasskey passkey, + std::vector reduction_op_types, + std::vector init_vals, + std::vector outputs, + std::vector inputs, + bool is_fused, + ExprType expr_type) + : Expr(passkey, expr_type), + reduction_op_types_(std::move(reduction_op_types)), + init_vals_(std::move(init_vals)), + is_allreduce_(is_fused) { + for (auto out : outputs) { + addOutput(out); + } + + for (auto in : inputs) { + addInput(in); + } +} + +GroupedReductionOp::GroupedReductionOp( + const GroupedReductionOp* src, + IrCloner* ir_cloner) + : Expr(src, ir_cloner), + reduction_op_types_(src->reduction_op_types_), + init_vals_(ir_cloner->clone(src->init_vals_)), + is_allreduce_(src->is_allreduce_) {} + +bool GroupedReductionOp::sameAs(const Statement* other) const { + if (this == other) { + return true; + } + + auto grouped_rop = dynamic_cast(other); + if (grouped_rop == nullptr) { + return false; + } + + if (!Expr::sameAs(other) || + getReductionOpTypes() != grouped_rop->getReductionOpTypes()) { + return false; + } + + for (const auto i : c10::irange(numReductions())) { + if (!initVal(i)->sameAs(grouped_rop->initVal(i))) { + return false; + } + } + + return true; } WelfordOp::WelfordOp( + IrBuilderPasskey passkey, Val* out_avg, Val* out_var, Val* out_N, @@ -333,8 +479,9 @@ WelfordOp::WelfordOp( Val* init_N, Val* in_avg, Val* in_var, - Val* in_N) - : Expr(ExprType::WelfordOp), + Val* in_N, + bool is_fused) + : Expr(passkey, ExprType::WelfordOp), out_avg_(out_avg), out_var_(out_var), out_N_(out_N), @@ -342,12 +489,19 @@ WelfordOp::WelfordOp( init_var_(init_var), init_N_(init_N), in_avg_(in_avg), - in_var_(in_var), - in_N_(in_N) { + in_var_(in_var == nullptr ? 
in_avg->container()->zeroVal() : in_var), + in_N_(in_N), + is_allreduce_(is_fused) { // Check output type - TORCH_INTERNAL_ASSERT(out_avg->getValType().value() == ValType::TensorView); - TORCH_INTERNAL_ASSERT(out_var->getValType().value() == ValType::TensorView); - TORCH_INTERNAL_ASSERT(out_N->getValType().value() == ValType::TensorView); + TORCH_INTERNAL_ASSERT( + out_avg->getValType().value() == ValType::TensorView || + out_avg->getValType().value() == ValType::TensorIndex); + TORCH_INTERNAL_ASSERT( + out_var->getValType().value() == ValType::TensorView || + out_var->getValType().value() == ValType::TensorIndex); + TORCH_INTERNAL_ASSERT( + out_N->getValType().value() == ValType::TensorView || + out_N->getValType().value() == ValType::TensorIndex); // check initial value TORCH_INTERNAL_ASSERT(init_N->getValType().value() == ValType::Scalar); @@ -356,36 +510,48 @@ WelfordOp::WelfordOp( // initial value with a count of 1 is un-common enough that I'll push // the responsibility of creating all-zero var tensors to the user TORCH_INTERNAL_ASSERT( - init_avg && init_avg->getValType().value() == ValType::TensorView); + init_avg && + (init_avg->getValType().value() == ValType::TensorView || + init_avg->getValType().value() == ValType::TensorIndex)); TORCH_INTERNAL_ASSERT( - init_var && init_var->getValType().value() == ValType::TensorView); + init_var && + (init_var->getValType().value() == ValType::TensorView || + init_var->getValType().value() == ValType::TensorIndex)); } TORCH_INTERNAL_ASSERT( - in_avg && in_avg->getValType().value() == ValType::TensorView); + in_avg && + (in_avg->getValType().value() == ValType::TensorView || + in_avg->getValType().value() == ValType::TensorIndex), + in_avg->getValType().value()); // check input TORCH_INTERNAL_ASSERT( in_N->getValType().value() == ValType::Scalar || - in_N->getValType().value() == ValType::TensorView); + in_N->getValType().value() == ValType::TensorView || + in_N->getValType().value() == ValType::TensorIndex); if (!in_N->isOneInt()) { // when input is only one value, only the value is required through avg // input the var part is implicitly 0 and codegen will handle that. TORCH_INTERNAL_ASSERT( - in_var && in_var->getValType().value() == ValType::TensorView); + in_var && + (in_var->getValType().value() == ValType::TensorView || + in_var->getValType().value() == ValType::TensorIndex)); + } else { + TORCH_INTERNAL_ASSERT( + in_var == nullptr || in_var->isZeroInt(), + "Invalid var input, which must be either nullptr or scalar zero when the N input is one."); } - addOutput(out_avg); - addOutput(out_var); - addOutput(out_N); + addOutput(out_avg_); + addOutput(out_var_); + addOutput(out_N_); - addInput(in_avg); - // Conditionally adding this input? - if (!in_N->isOneInt()) { - addInput(in_var); - } - addInput(in_N); - - name_ = FusionGuard::getCurFusion()->registerExpr(this); + addInput(in_avg_); + // Previously in_var_ was allowed to be null + TORCH_INTERNAL_ASSERT( + in_var_ != nullptr, "Welford var input nullptr not allowed"); + addInput(in_var_); + addInput(in_N_); } WelfordOp::WelfordOp(const WelfordOp* src, IrCloner* ir_cloner) @@ -398,7 +564,8 @@ WelfordOp::WelfordOp(const WelfordOp* src, IrCloner* ir_cloner) init_N_(ir_cloner->clone(src->init_N_)), in_avg_(ir_cloner->clone(src->in_avg_)), in_var_(src->in_var_ ? 
ir_cloner->clone(src->in_var_) : nullptr), - in_N_(ir_cloner->clone(src->in_N_)) {} + in_N_(ir_cloner->clone(src->in_N_)), + is_allreduce_(src->is_allreduce_) {} namespace { inline bool sameOptionalVal(Val* a, Val* b) { @@ -421,12 +588,80 @@ bool WelfordOp::sameAs(const Statement* other) const { return false; } +std::vector WelfordOp::getInitVals() const { + std::vector init_vals({init_avg_, init_var_, init_N_}); + return init_vals; +} + +MmaOp::MmaOp( + IrBuilderPasskey passkey, + Val* out, + Val* in_a, + Val* in_b, + Val* init) + : Expr(passkey, ExprType::MmaOp), + out_(out), + in_a_(in_a), + in_b_(in_b), + init_(init) { + // Check output type + TORCH_INTERNAL_ASSERT( + out->getValType().value() == ValType::TensorView || + out->getValType().value() == ValType::TensorIndex); + + TORCH_INTERNAL_ASSERT( + in_a->getValType().value() == ValType::TensorView || + in_a->getValType().value() == ValType::TensorIndex, + in_a->getValType().value()); + + TORCH_INTERNAL_ASSERT( + in_b->getValType().value() == ValType::TensorView || + in_b->getValType().value() == ValType::TensorIndex, + in_b->getValType().value()); + + addOutput(out); + addInput(in_a); + addInput(in_b); +} + +MmaOp::MmaOp( + IrBuilderPasskey passkey, + Val* out, + Val* in_a, + Val* in_b, + Val* init, + MmaOptions options) + : MmaOp(passkey, out, in_a, in_b, init) { + options_ = options; +} + +MmaOp::MmaOp(const MmaOp* src, IrCloner* ir_cloner) + : Expr(src, ir_cloner), + out_(ir_cloner->clone(src->out_)), + in_a_(ir_cloner->clone(src->in_a_)), + in_b_(ir_cloner->clone(src->in_b_)), + init_(ir_cloner->clone(src->init_)), + options_(src->options_) {} + +bool MmaOp::sameAs(const Statement* other) const { + if (this == other) { + return true; + } + if (auto other_mma = dynamic_cast(other)) { + return out_->sameAs(other_mma->out_) && in_a_->sameAs(other_mma->in_a_) && + in_b_->sameAs(other_mma->in_b_) && init_->sameAs(other_mma->init_) && + options_ == other_mma->options_; + } + return false; +} + ReductionOp::ReductionOp(const ReductionOp* src, IrCloner* ir_cloner) : Expr(src, ir_cloner), reduction_op_type_(src->reduction_op_type_), init_(ir_cloner->clone(src->init_)), out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) {} + in_(ir_cloner->clone(src->in_)), + is_allreduce_(src->is_allreduce_) {} bool ReductionOp::sameAs(const Statement* other) const { if (this == other) { @@ -444,10 +679,11 @@ bool ReductionOp::sameAs(const Statement* other) const { } TransposeOp::TransposeOp( + IrBuilderPasskey passkey, TensorView* out, TensorView* in, std::vector new2old) - : Expr(ExprType::TransposeOp), + : Expr(passkey, ExprType::TransposeOp), out_(out), in_(in), new2old_(std::move(new2old)) { @@ -481,7 +717,6 @@ TransposeOp::TransposeOp( addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } TransposeOp::TransposeOp(const TransposeOp* src, IrCloner* ir_cloner) @@ -490,12 +725,17 @@ TransposeOp::TransposeOp(const TransposeOp* src, IrCloner* ir_cloner) in_(ir_cloner->clone(src->in_)), new2old_(src->new2old_) {} -ShiftOp::ShiftOp(Val* out, Val* in, std::vector offsets, bool pad) - : Expr(ExprType::ShiftOp), +ShiftOp::ShiftOp( + IrBuilderPasskey passkey, + Val* out, + Val* in, + std::vector offsets, + std::vector pad_width) + : Expr(passkey, ExprType::ShiftOp), out_(out), in_(in), offsets_(std::move(offsets)), - pad_(pad) { + pad_width_(std::move(pad_width)) { // clang-tidy complains about out_ that it may be null. 
TORCH_INTERNAL_ASSERT(out_ != nullptr); TORCH_INTERNAL_ASSERT(in_ != nullptr); @@ -514,9 +754,15 @@ ShiftOp::ShiftOp(Val* out, Val* in, std::vector offsets, bool pad) "Invalid offset vector: ", offsets_); + TORCH_INTERNAL_ASSERT( + pad_width_.size() == + TensorDomain::noReductions(in_->as()->getRootDomain()) + .size(), + "Invalid padding width vector: ", + pad_width_); + addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } ShiftOp::ShiftOp(const ShiftOp* src, IrCloner* ir_cloner) @@ -524,7 +770,7 @@ ShiftOp::ShiftOp(const ShiftOp* src, IrCloner* ir_cloner) out_(ir_cloner->clone(src->out_)), in_(ir_cloner->clone(src->in_)), offsets_(src->offsets_), - pad_(src->pad_) {} + pad_width_(src->pad_width_) {} bool ShiftOp::sameAs(const Statement* other) const { if (this == other) { @@ -541,11 +787,12 @@ bool ShiftOp::sameAs(const Statement* other) const { } GatherOp::GatherOp( + IrBuilderPasskey passkey, Val* out, Val* in, - std::vector window_shape, - std::vector> pad_width) - : Expr(ExprType::GatherOp), + std::vector window_shape, + std::vector> pad_width) + : Expr(passkey, ExprType::GatherOp), out_(out), in_(in), window_shape_(std::move(window_shape)), @@ -578,28 +825,14 @@ GatherOp::GatherOp( addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } GatherOp::GatherOp(const GatherOp* src, IrCloner* ir_cloner) : Expr(src, ir_cloner), out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) { - std::transform( - src->window_shape_.begin(), - src->window_shape_.end(), - std::back_inserter(window_shape_), - [&ir_cloner](const auto& x) { return ir_cloner->clone(x); }); - for (const auto& pad : src->pad_width_) { - std::vector pad_clone; - std::transform( - pad.begin(), - pad.end(), - std::back_inserter(pad_clone), - [&ir_cloner](const auto& x) { return ir_cloner->clone(x); }); - pad_width_.push_back(pad_clone); - } -} + in_(ir_cloner->clone(src->in_)), + window_shape_(src->window_shape_), + pad_width_(src->pad_width_) {} bool GatherOp::sameAs(const Statement* other) const { if (this == other) { @@ -609,23 +842,10 @@ bool GatherOp::sameAs(const Statement* other) const { return false; } const auto other_op = other->as(); - if (windowShape().size() != other_op->windowShape().size()) { - return false; - } - for (const auto i : c10::irange(windowShape().size())) { - if (!windowShape()[i]->sameAs(other_op->windowShape()[i])) { - return false; - } - } - if (padWidth().size() != other_op->padWidth().size()) { + if (windowShape() != other_op->windowShape() || + padWidth() != other_op->padWidth()) { return false; } - for (const auto i : c10::irange(padWidth().size())) { - if (!padWidth()[i][0]->sameAs(other_op->padWidth()[i][0]) || - !padWidth()[i][1]->sameAs(other_op->padWidth()[i][1])) { - return false; - } - } return Expr::sameAs(other); } @@ -638,11 +858,32 @@ int GatherOp::gatherAxis(int axis) const { return int(windowShape().size()) + axis; } -ViewOp::ViewOp(TensorView* out, TensorView* in) - : Expr(ExprType::ViewOp), out_(out), in_(in) { +ViewAsScalar::ViewAsScalar( + IrBuilderPasskey passkey, + Val* out, + Val* in, + IterDomain* vector_id, + Val* index) + : Expr(passkey, ExprType::ViewAsScalar), + out_(out), + in_(in), + vector_id_(vector_id), + index_(index) { + addOutput(out); + addInput(in); +} + +ViewAsScalar::ViewAsScalar(const ViewAsScalar* src, IrCloner* ir_cloner) + : Expr(src, ir_cloner), + out_(ir_cloner->clone(src->out_)), + in_(ir_cloner->clone(src->in_)), + vector_id_(ir_cloner->clone(src->vector_id_)), + 
index_(ir_cloner->clone(src->index_)) {} + +ViewOp::ViewOp(IrBuilderPasskey passkey, TensorView* out, TensorView* in) + : Expr(passkey, ExprType::ViewOp), out_(out), in_(in) { addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } ViewOp::ViewOp(const ViewOp* src, IrCloner* ir_cloner) @@ -651,33 +892,50 @@ ViewOp::ViewOp(const ViewOp* src, IrCloner* ir_cloner) in_(ir_cloner->clone(src->in_)) {} IterDomain::IterDomain( + IrBuilderPasskey passkey, Val* start, Val* extent, ParallelType parallel_type, IterType iter_type, - bool is_rfactor_domain) + bool is_rfactor_domain, + bool is_padded_dimension, + c10::optional padded_to_size, + bool is_mma_swizzled) : IterDomain( + passkey, start, extent, nullptr, parallel_type, iter_type, - is_rfactor_domain) {} + is_rfactor_domain, + is_padded_dimension, + padded_to_size, + is_mma_swizzled) {} IterDomain::IterDomain( + IrBuilderPasskey passkey, Val* start, Val* extent, Val* stop_offset, ParallelType parallel_type, IterType iter_type, - bool is_rfactor_domain) - : Val(ValType::IterDomain, DataType::Int, false), + bool is_rfactor_domain, + bool is_padded_dimension, + c10::optional padded_to_size, + bool is_mma_swizzled) + : Val(passkey, ValType::IterDomain, DataType::Int), start_(start), extent_(extent), - stop_offset_(stop_offset == nullptr ? new Int(0) : stop_offset), + stop_offset_( + stop_offset == nullptr ? passkey.ir_container_->zeroVal() + : stop_offset), parallel_type_(parallel_type), iter_type_(iter_type), - is_rfactor_domain_(is_rfactor_domain) { + is_rfactor_domain_(is_rfactor_domain), + is_padded_dimension_(is_padded_dimension), + padded_to_size_(padded_to_size), + is_mma_swizzled_(is_mma_swizzled) { TORCH_CHECK( !(isRFactorProduct() && isBroadcast()), "IterDomain cannot be both a broadcast and rfactor domain."); @@ -693,8 +951,6 @@ IterDomain::IterDomain( "Cannot create an iter domain with a start that is not an int but received ", start, " ."); - - name_ = fusion_->registerVal(this); } IterDomain::IterDomain(const IterDomain* src, IrCloner* ir_cloner) @@ -706,7 +962,8 @@ IterDomain::IterDomain(const IterDomain* src, IrCloner* ir_cloner) iter_type_(src->iter_type_), is_rfactor_domain_(src->is_rfactor_domain_), is_padded_dimension_(src->is_padded_dimension_), - padded_to_size_(src->padded_to_size_) {} + padded_to_size_(src->padded_to_size_), + is_mma_swizzled_(src->is_mma_swizzled_) {} bool IterDomain::sameAs(const Statement* other) const { if (other == this) { @@ -720,7 +977,8 @@ bool IterDomain::sameAs(const Statement* other) const { const IterDomain* other_id = other->as(); bool is_same = isReduction() == other_id->isReduction() && - getParallelType() == other_id->getParallelType(); + getParallelType() == other_id->getParallelType() && + isVectorComponent() == other_id->isVectorComponent(); is_same = is_same && ScalarCheck::sameAs(extent(), other_id->extent()); is_same = is_same && ScalarCheck::sameAs(start(), other_id->start()); is_same = @@ -729,6 +987,24 @@ bool IterDomain::sameAs(const Statement* other) const { return is_same; } +// Returns a new IterDomain matching properties of this except for +// is_rfactor_domain_ +IterDomain* IterDomain::cloneWithoutRFactor() const { + auto cloned = IrBuilder::create( + ir_container_, + start(), + extent(), + stopOffset(), + getParallelType(), + getIterType(), + false, + is_padded_dimension_, + padded_to_size_, + is_mma_swizzled_); + + return cloned; +} + std::vector IterDomain::clone( const std::vector& domains) { std::vector cloned_domains; @@ -736,7 +1012,7 @@ 
std::vector IterDomain::clone( domains.begin(), domains.end(), std::back_inserter(cloned_domains), - [](auto id) { return id->clone(); }); + [](auto id) { return id->cloneWithoutRFactor(); }); return cloned_domains; } @@ -781,14 +1057,15 @@ IterDomain* IterDomain::merge(IterDomain* outer, IterDomain* inner) { itype = IterType::Iteration; } - IterDomain* merged_id = new IterDomain( - new Int(0), + IterDomain* merged_id = IrBuilder::create( + outer->container(), + outer->container()->zeroVal(), merged_id_size->as(), outer->getParallelType(), itype, outer->isRFactorProduct() || inner->isRFactorProduct()); - new Merge(merged_id, outer, inner); + IrBuilder::create(outer->container(), merged_id, outer, inner); return merged_id; } @@ -811,7 +1088,8 @@ std::pair IterDomain::split( if (factor->getValType() == ValType::Scalar) { TORCH_CHECK( factor->isConstScalar() || - FusionGuard::getCurFusion()->hasInput(factor), + (FusionGuard::getCurFusion() == factor->fusion() && + factor->isFusionInput()), factor, " is not a constant nor an input. It must be one or the other to be used in a split.", " If you want a symbolic split based on a thread dimension please use IterDomain::split(IterDomain*, ParallelType);"); @@ -832,24 +1110,33 @@ std::pair IterDomain::split( in->definition() == nullptr, "Partial split is only allowed with root domains"); } - // outer loop IterDomain - IterDomain* ido = new IterDomain( - new Int(0), + IterDomain* ido = IrBuilder::create( + in->container(), + in->container()->zeroVal(), inner_split ? remainder->as() : factor, in->getParallelType(), in->getIterType(), in->isRFactorProduct()); // inner loop IterDomain - IterDomain* idi = new IterDomain( - new Int(0), + IterDomain* idi = IrBuilder::create( + in->container(), + in->container()->zeroVal(), inner_split ? factor : remainder->as(), in->getParallelType(), in->getIterType(), in->isRFactorProduct()); - new Split(ido, idi, in, factor, inner_split, start_offset, stop_offset); + IrBuilder::create( + in->container(), + ido, + idi, + in, + factor, + inner_split, + start_offset, + stop_offset); return {ido, idi}; } @@ -864,7 +1151,9 @@ std::pair IterDomain::split( } std::pair IterDomain::stridedSplit(int factor) { - auto split_out = IterDomain::split(this, new Int(factor), true); + // Use partial split so that only valid values are retained + auto split_out = IterDomain::split( + this, IrBuilder::create(container(), factor), true, true); split_out.second->iter_type_ = IterType::Stride; split_out.first->is_rfactor_domain_ = true; @@ -877,7 +1166,11 @@ std::pair IterDomain::stridedSplit(int factor) { // vectorize to the left of the computeAt domain, and could allow us to do some // simple validation of vectorize as it's inputs are right most and contiguous. void IterDomain::parallelize(ParallelType t) { - parallel_type_ = t; + if (parallel_type_ == t) { + // No op, don't do any more checks, it was already set to this value. 
+ return; + } + if (t == ParallelType::Unroll || isParallelTypeVectorize(t)) { TORCH_CHECK( start()->isZeroInt() && extent()->isConstScalar(), @@ -888,6 +1181,14 @@ void IterDomain::parallelize(ParallelType t) { extent(), " ."); } + + if (isMmaSwizzled()) { + TORCH_CHECK( + t == ParallelType::Vectorize, + "Parallel type other than vectorize not allowed for warp mapped ids"); + } + + parallel_type_ = t; } bool IterDomain::maybePartial() const { @@ -907,9 +1208,10 @@ Val* IterDomain::stop() const { } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), contiguity_( contiguity.empty() ? std::vector(root_domain_.size(), false) @@ -925,14 +1227,14 @@ TensorDomain::TensorDomain( has_nontrivial_reduction_ = false; domain_ = root_domain_; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), domain_(std::move(domain)), contiguity_( @@ -963,15 +1265,15 @@ TensorDomain::TensorDomain( // Just due to clang-tidy, correct value set in resetDomains has_nontrivial_reduction_ = false; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector rfactor_domain, std::vector domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), domain_(std::move(domain)), rfactor_domain_(std::move(rfactor_domain)), @@ -1013,7 +1315,6 @@ TensorDomain::TensorDomain( // Just due to clang-tidy, correct value set in resetDomains has_nontrivial_reduction_ = false; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) @@ -1026,6 +1327,30 @@ TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) contiguity_(src->contiguity()), has_nontrivial_reduction_(src->has_nontrivial_reduction_) {} +namespace { +std::vector lowerIterDomains( + const std::vector& domains) { + std::vector lowered_domains; + lowered_domains.reserve(domains.size()); + for (const auto iter_domain : domains) { + lowered_domains.push_back(iter_domain); + } + return lowered_domains; +}; +} // namespace + +bool TensorDomain::hasBlockBroadcast() const { + return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { + return id->isBroadcast() && id->isThreadDim(); + }); +} + +bool TensorDomain::hasGridBroadcast() const { + return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { + return id->isBroadcast() && id->isBlockDim(); + }); +} + bool TensorDomain::operator==(const TensorDomain& other) const { // Checks equality of each class field. 
Should not be necessary to // check no_bcast_domain_ and no_reduction_domain_ as they are just @@ -1123,6 +1448,22 @@ bool TensorDomain::hasRFactor() const { return !rfactor_domain_.empty(); } +bool TensorDomain::hasViewLikeRFactor() const { + if (!hasRFactor()) { + // Can't have view like rfactor if there is no rfactor domain + return false; + } + + // If there's an rfactor domain and no rfactor product is a reduction, this is + // a view like rfactor + return std::none_of( + getMaybeRFactorDomain().begin(), + getMaybeRFactorDomain().end(), + [](IterDomain* id) { + return id->isReduction() && id->isRFactorProduct(); + }); +} + bool TensorDomain::hasVectorize() const { return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { return id->getParallelType() == ParallelType::Vectorize || @@ -1200,6 +1541,10 @@ void TensorDomain::split( "Partial split is only allowed with root domains"); } + TORCH_INTERNAL_ASSERT( + !id->isMmaSwizzled(), + "Further transformation on warp mapped id's not allowed."); + auto split_ids = IterDomain::split(id, factor, inner_split, trim_out_of_bounds); domain_.erase(domain_.begin() + axis_); @@ -1235,6 +1580,10 @@ void TensorDomain::merge(int axis_o, int axis_i) { IterDomain* first = axis(axis_o); IterDomain* second = axis(axis_i); + TORCH_INTERNAL_ASSERT( + !first->isMmaSwizzled() && !second->isMmaSwizzled(), + "Further transformation on warp mapped id's not allowed."); + IterDomain* merged_id = IterDomain::merge(first, second); domain_.erase(domain_.begin() + axis_i); @@ -1339,6 +1688,52 @@ TensorDomain* TensorDomain::view( return transformView(this, transforms); } +TensorDomain* TensorDomain::flatten(int64_t start_dim, int64_t end_dim) { + if (start_dim < 0) { + start_dim += nDims(); + } + if (end_dim < 0) { + end_dim += nDims(); + } + + std::vector new_root_domain; + auto inp_domain = noReductions(getMaybeRFactorDomain()); + new_root_domain.reserve(inp_domain.size()); + for (auto id : inp_domain) { + new_root_domain.push_back(id->cloneWithoutRFactor()); + } + + std::vector rfactor_domain; + rfactor_domain.reserve(new_root_domain.size() - (end_dim - start_dim)); + for (auto i : c10::irange(start_dim)) { + rfactor_domain.push_back(new_root_domain[i]); + } + + IterDomain* merged_id = new_root_domain[start_dim]; + for (auto i : c10::irange(start_dim + 1, end_dim + 1)) { + IterDomain* new_merged_id = IrBuilder::create( + merged_id->container(), + merged_id->container()->zeroVal(), + mul(merged_id->extent(), new_root_domain[i]->extent()), + ParallelType::Serial, + IterType::Iteration, + true); + IrBuilder::create(new_merged_id, merged_id, new_root_domain[i]); + merged_id = new_merged_id; + } + rfactor_domain.push_back(merged_id); + + for (auto i : c10::irange(end_dim + 1, nDims())) { + rfactor_domain.push_back(new_root_domain[i]); + } + + return IrBuilder::create( + new_root_domain, + rfactor_domain, + rfactor_domain, + std::vector(rfactor_domain.size(), true)); +} + // TODO: Rfactor a Welford // pair is in order where second is the consumer of first @@ -1389,6 +1784,7 @@ std::pair TensorDomain::rFactor( } Split::Split( + IrBuilderPasskey passkey, IterDomain* outer, IterDomain* inner, IterDomain* in, @@ -1396,14 +1792,18 @@ Split::Split( bool inner_split, Val* start_offset, Val* stop_offset) - : Expr(ExprType::Split), + : Expr(passkey, ExprType::Split), outer_{outer}, inner_{inner}, in_{in}, factor_{factor}, inner_split_{inner_split}, - start_offset_{start_offset != nullptr ? start_offset : new Int(0)}, - stop_offset_{stop_offset != nullptr ? 
stop_offset : new Int(0)} { + start_offset_{ + start_offset != nullptr ? start_offset + : passkey.ir_container_->zeroVal()}, + stop_offset_{ + stop_offset != nullptr ? stop_offset + : passkey.ir_container_->zeroVal()} { TORCH_INTERNAL_ASSERT( factor_->isAnInt(), "Attempted to create a Split node with a non-integer factor."); @@ -1412,7 +1812,6 @@ Split::Split( addInput(in); // TODO add factor as an input, need to check Split::Split during validation // and need to check BestEffortReplay::findFirstMismatchedID addInput(factor); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } Split::Split(const Split* src, IrCloner* ir_cloner) @@ -1453,12 +1852,15 @@ bool Split::sameAs(const Statement* other) const { stopOffset()->sameAs(other->as()->stopOffset()); } -Merge::Merge(IterDomain* out, IterDomain* outer, IterDomain* inner) - : Expr(ExprType::Merge), out_{out}, outer_{outer}, inner_{inner} { +Merge::Merge( + IrBuilderPasskey passkey, + IterDomain* out, + IterDomain* outer, + IterDomain* inner) + : Expr(passkey, ExprType::Merge), out_{out}, outer_{outer}, inner_{inner} { addOutput(out); addInput(outer); addInput(inner); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } Merge::Merge(const Merge* src, IrCloner* ir_cloner) @@ -1477,6 +1879,12 @@ bool Merge::sameAs(const Statement* other) const { return Expr::sameAs(other); } +NamedScalar::NamedScalar( + IrBuilderPasskey passkey, + std::string name, + DataType dtype) + : Val(passkey, ValType::NamedScalar, dtype), name_(std::move(name)) {} + NamedScalar::NamedScalar(const NamedScalar* src, IrCloner* ir_cloner) : Val(src, ir_cloner), name_(src->name_) {} @@ -1495,13 +1903,15 @@ NamedScalar* NamedScalar::getParallelDim(ParallelType p_type) { isParallelTypeThread(p_type), "Cannot get parallel dim of non thread type, received: ", p_type); + TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); std::string parallel_dim = stringifyThreadSize(p_type); - return new NamedScalar(parallel_dim, DataType::Int); + return IrBuilder::create(parallel_dim, DataType::Int); } NamedScalar* NamedScalar::getParallelIndex(ParallelType p_type) { + TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); std::string parallel_ind = stringifyThread(p_type); - return new NamedScalar(parallel_ind, DataType::Int); + return IrBuilder::create(parallel_ind, DataType::Int); } c10::optional NamedScalar::getParallelDim() const { diff --git a/torch/csrc/jit/codegen/cuda/ir_printer.h b/torch/csrc/jit/codegen/cuda/ir_printer.h index a2c14386147e..91d07b76b805 100644 --- a/torch/csrc/jit/codegen/cuda/ir_printer.h +++ b/torch/csrc/jit/codegen/cuda/ir_printer.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.cpp b/torch/csrc/jit/codegen/cuda/ir_utils.cpp index 5bf05b0f516f..6415733ba39e 100644 --- a/torch/csrc/jit/codegen/cuda/ir_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_utils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -140,7 +141,8 @@ struct SubstituteInExpr : public OptInDispatch { reference_->sameAs(unary_expr->in()) ? substitute_ : unary_expr->in(); auto out = reference_->sameAs(unary_expr->out()) ? substitute_ : unary_expr->out(); - expr_ = new UnaryOp(unary_expr->getUnaryOpType(), out, in); + expr_ = IrBuilder::create( + unary_expr->container(), unary_expr->getUnaryOpType(), out, in); } void handle(BinaryOp* binary_expr) final { @@ -151,7 +153,12 @@ struct SubstituteInExpr : public OptInDispatch { auto out = reference_->sameAs(binary_expr->out()) ? 
substitute_ : binary_expr->out(); - expr_ = new BinaryOp(binary_expr->getBinaryOpType(), out, lhs, rhs); + expr_ = IrBuilder::create( + binary_expr->container(), + binary_expr->getBinaryOpType(), + out, + lhs, + rhs); } void handle(TernaryOp* ternary_expr) final { @@ -163,7 +170,13 @@ struct SubstituteInExpr : public OptInDispatch { : ternary_expr->in3(); auto out = reference_->sameAs(ternary_expr->out()) ? substitute_ : ternary_expr->out(); - expr_ = new TernaryOp(ternary_expr->getTernaryOpType(), out, in1, in2, in3); + expr_ = IrBuilder::create( + ternary_expr->container(), + ternary_expr->getTernaryOpType(), + out, + in1, + in2, + in3); } void handle(ReductionOp* reduction_expr) final { @@ -176,8 +189,42 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(reduction_expr->in()) ? substitute_ : reduction_expr->in(); - expr_ = - new ReductionOp(reduction_expr->getReductionOpType(), init, out, in); + expr_ = IrBuilder::create( + reduction_expr->container(), + reduction_expr->getReductionOpType(), + init, + out, + in); + } + + void handle(GroupedReductionOp* grouped_reduction_expr) final { + std::vector outputs; + std::transform( + grouped_reduction_expr->outputs().begin(), + grouped_reduction_expr->outputs().end(), + std::back_inserter(outputs), + [&](Val* val) { return reference_->sameAs(val) ? substitute_ : val; }); + + std::vector inputs; + std::transform( + grouped_reduction_expr->inputs().begin(), + grouped_reduction_expr->inputs().end(), + std::back_inserter(inputs), + [&](Val* val) { return reference_->sameAs(val) ? substitute_ : val; }); + + std::vector init_vals; + std::transform( + grouped_reduction_expr->initVals().begin(), + grouped_reduction_expr->initVals().end(), + std::back_inserter(init_vals), + [&](Val* val) { return reference_->sameAs(val) ? substitute_ : val; }); + + expr_ = IrBuilder::create( + grouped_reduction_expr->container(), + grouped_reduction_expr->getReductionOpTypes(), + init_vals, + outputs, + inputs); } void handle(BroadcastOp* broadcast_expr) final { @@ -187,7 +234,11 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(broadcast_expr->in()) ? substitute_ : broadcast_expr->in(); - expr_ = new BroadcastOp(out, in, broadcast_expr->getBroadcastDimFlags()); + expr_ = IrBuilder::create( + broadcast_expr->container(), + out, + in, + broadcast_expr->getBroadcastDimFlags()); } void handle(TransposeOp* transpose_expr) final { @@ -201,7 +252,8 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(transpose_expr->in()) ? substitute_->as() : transpose_expr->in(); - expr_ = new TransposeOp(out, in, transpose_expr->new2old()); + expr_ = IrBuilder::create( + transpose_expr->container(), out, in, transpose_expr->new2old()); } void handle(ShiftOp* shift_expr) final { @@ -210,7 +262,12 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(shift_expr->in()) ? substitute_ : shift_expr->in(); - expr_ = new ShiftOp(out, in, shift_expr->offsets(), shift_expr->pad()); + expr_ = IrBuilder::create( + shift_expr->container(), + out, + in, + shift_expr->offsets(), + shift_expr->padWidth()); } void handle(GatherOp* gather_expr) final { @@ -219,8 +276,25 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(gather_expr->in()) ? 
substitute_ : gather_expr->in(); - expr_ = new GatherOp( - out, in, gather_expr->windowShape(), gather_expr->padWidth()); + expr_ = IrBuilder::create( + gather_expr->container(), + out, + in, + gather_expr->windowShape(), + gather_expr->padWidth()); + } + + void handle(ViewAsScalar* expr) final { + TORCH_INTERNAL_ASSERT( + substitute_->isA(), + "All args to view must be TensorView, but received a non-TensorView for replacement: ", + substitute_); + auto in = reference_->sameAs(expr->in()) ? substitute_->as() + : expr->in(); + auto out = reference_->sameAs(expr->out()) ? substitute_->as() + : expr->out(); + expr_ = IrBuilder::create( + expr->container(), out, in, expr->vector_id(), expr->index()); } void handle(ViewOp* view_expr) final { @@ -234,7 +308,7 @@ struct SubstituteInExpr : public OptInDispatch { auto out = reference_->sameAs(view_expr->out()) ? substitute_->as() : view_expr->out(); - expr_ = new ViewOp(out, in); + expr_ = IrBuilder::create(view_expr->container(), out, in); } void handle(WelfordOp* welford_expr) final { @@ -268,7 +342,8 @@ struct SubstituteInExpr : public OptInDispatch { welford_expr->initN() && reference_->sameAs(welford_expr->initN()) ? substitute_ : welford_expr->initN(); - expr_ = new WelfordOp( + expr_ = IrBuilder::create( + welford_expr->container(), out_avg, out_var, out_N, @@ -277,7 +352,29 @@ struct SubstituteInExpr : public OptInDispatch { init_N, in_avg, in_var, - in_N); + in_N, + welford_expr->isAllreduce()); + } + + void handle(MmaOp* mma_expr) final { + TORCH_INTERNAL_ASSERT( + substitute_->isA(), + "All args to MmaOp must be TensorView, but received a non-TensorView for replacement: ", + substitute_); + auto in_a = reference_->sameAs(mma_expr->inA()) + ? substitute_->as() + : mma_expr->inA(); + auto in_b = reference_->sameAs(mma_expr->inB()) + ? substitute_->as() + : mma_expr->inB(); + auto out = reference_->sameAs(mma_expr->out()) + ? substitute_->as() + : mma_expr->out(); + auto init = reference_->sameAs(mma_expr->init()) + ? 
substitute_->as() + : mma_expr->init(); + expr_ = IrBuilder::create( + mma_expr->container(), out, in_a, in_b, init, mma_expr->options()); } private: @@ -307,23 +404,24 @@ TensorView* rfactorHelper( auto w_var = welford->outVar()->as(); auto w_n = welford->outN()->as(); - WelfordResult rtvs = reduction_tv->rFactor(axes, w_avg, w_var, w_n); + auto rtvs = + reduction_tv->rFactor(axes, std::vector{w_avg, w_var, w_n}); if (reduction_tv == w_n) { - return rtvs.n; + return rtvs.at(2); } else if (reduction_tv == w_var) { - return rtvs.var_sum; + return rtvs.at(1); } else { - return rtvs.avg; + return rtvs.at(0); } } namespace { -std::vector uniqueEntries( - const std::vector& tv_deuqe) { - std::vector unique_entries; - std::unordered_set inserted; +template +std::vector uniqueEntries(const std::vector& tv_deuqe) { + std::vector unique_entries; + std::unordered_set inserted; for (auto tv_entry : tv_deuqe) { if (inserted.emplace(tv_entry).second) { unique_entries.emplace_back(tv_entry); @@ -334,13 +432,59 @@ std::vector uniqueEntries( } // namespace +// Return immediate producers of val +TORCH_CUDA_CU_API std::vector producerValsOf(Val* val) { + if (val->definition() == nullptr) { + return {}; + } + auto producer_vals = val->definition()->inputs(); + return uniqueEntries({producer_vals.begin(), producer_vals.end()}); +} + +// Return immediate consumers of val +TORCH_CUDA_CU_API std::vector consumerValsOf(Val* val) { + std::vector consumer_vals; + for (auto use_expr : val->uses()) { + auto outputs = use_expr->outputs(); + consumer_vals.insert(consumer_vals.end(), outputs.begin(), outputs.end()); + } + return uniqueEntries(consumer_vals); +} + +// Return immediate producers of val +TORCH_CUDA_CU_API std::vector producerValsOf( + const std::vector& vals) { + std::vector all_producer_vals; + for (auto val : vals) { + auto producer_vals = producerValsOf(val); + all_producer_vals.insert( + all_producer_vals.end(), producer_vals.begin(), producer_vals.end()); + } + + return uniqueEntries(all_producer_vals); +} + +// Return immediate consumers of val +TORCH_CUDA_CU_API std::vector consumerValsOf( + const std::vector& vals) { + std::vector all_consumer_vals; + for (auto val : vals) { + auto consumer_vals = consumerValsOf(val); + all_consumer_vals.insert( + all_consumer_vals.end(), consumer_vals.begin(), consumer_vals.end()); + } + + return uniqueEntries(all_consumer_vals); +} + std::vector producerTvsOf(TensorView* tv) { if (tv->definition() == nullptr) { return {}; } auto producer_vals = ir_utils::filterByType(tv->definition()->inputs()); - return uniqueEntries({producer_vals.begin(), producer_vals.end()}); + return uniqueEntries( + {producer_vals.begin(), producer_vals.end()}); } std::vector consumerTvsOf(TensorView* tv) { @@ -349,7 +493,7 @@ std::vector consumerTvsOf(TensorView* tv) { auto outputs = ir_utils::filterByType(use_expr->outputs()); consumer_tvs.insert(consumer_tvs.end(), outputs.begin(), outputs.end()); } - return uniqueEntries(consumer_tvs); + return uniqueEntries(consumer_tvs); } std::vector producerTvsOf(const std::vector& tvs) { @@ -360,7 +504,7 @@ std::vector producerTvsOf(const std::vector& tvs) { all_producer_tvs.end(), producer_tvs.begin(), producer_tvs.end()); } - return uniqueEntries(all_producer_tvs); + return uniqueEntries(all_producer_tvs); } std::vector consumerTvsOf(const std::vector& tvs) { @@ -371,7 +515,7 @@ std::vector consumerTvsOf(const std::vector& tvs) { all_consumer_tvs.end(), consumer_tvs.begin(), consumer_tvs.end()); } - return uniqueEntries(all_consumer_tvs); + 
return uniqueEntries(all_consumer_tvs); } std::vector inputTvsOf(TensorView* tv) { @@ -386,29 +530,177 @@ std::vector inputTvsOf(std::vector tvs) { auto inp_vals = IterVisitor::getInputsTo({tvs.begin(), tvs.end()}); auto filtered = ir_utils::filterByType(inp_vals); std::vector inp_tvs(filtered.begin(), filtered.end()); - return uniqueEntries(inp_tvs); + return uniqueEntries(inp_tvs); } std::vector outputTvsOf(std::vector tvs) { auto out_vals = DependencyCheck::getAllOutputsOf({tvs.begin(), tvs.end()}); auto filtered = ir_utils::filterByType(out_vals); std::vector out_tvs(filtered.begin(), filtered.end()); - return uniqueEntries(out_tvs); + return uniqueEntries(out_tvs); } std::vector allTvs(Fusion* fusion) { auto used_vals = fusion->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); - return uniqueEntries({used_tvs.begin(), used_tvs.end()}); + + // This shouldn't be necessary but FusionSegmentIoAlias_CUDA due to aliasing + // is having an input disconnected from outputs, and these iter domains are + // being checked in compute at maps in scheduling logic. This shouldn't hurt + // AFAICT. + auto tv_inputs = ir_utils::filterByType(fusion->inputs()); + + std::vector all_tvs({used_tvs.begin(), used_tvs.end()}); + // Sometimes inputs are not connected to outputs, however, we still include + // them when returning allTvs because they are registered as an input. + all_tvs.insert(all_tvs.end(), tv_inputs.begin(), tv_inputs.end()); + + // all_tvs has duplicates, to deduplicate it and return + return uniqueEntries(all_tvs); } -std::vector historyOf(TensorDomain* td) { - return ExprSort::getExprs( - td->fusion(), {td->domain().begin(), td->domain().end()}); +std::vector getReductionOps(Fusion* fusion, bool ignore_trivial) { + std::vector red_ops; + + auto isReduction = [&ignore_trivial](Val* out_val) { + if (out_val == nullptr || !out_val->isA()) { + return false; + } + auto out_tv = out_val->as(); + return std::any_of( + out_tv->getRootDomain().begin(), + out_tv->getRootDomain().end(), + [&ignore_trivial](IterDomain* id) { + return id->isReduction() && + !(ignore_trivial && id->isTrivialReduction()); + }); + }; + + for (auto expr : fusion->exprs()) { + bool is_reduction = false; + if (expr->isA()) { + is_reduction = isReduction(expr->as()->out()); + } else if (expr->isA()) { + is_reduction = std::any_of( + expr->as()->outputs().begin(), + expr->as()->outputs().end(), + isReduction); + } else if (expr->isA()) { + is_reduction = isReduction(expr->as()->outAvg()); + } + if (is_reduction) { + red_ops.push_back(expr); + } + } + + return red_ops; } -std::vector historyOf(TensorView* tv) { - return historyOf(tv->domain()); +namespace { + +class ValReplacementMutator : private OptOutMutator { + public: + ValReplacementMutator( + Fusion* fusion, + const std::unordered_map& replacement_map) + : replacement_map_(replacement_map) { + FusionGuard fg(fusion); + + // Welford makes this a little annoying since it holds a count which is + // typically not used by anything else. If we don't grab that count, then it + // would be a tensorview that doesn't get updated extents. Therefore, first + // grab all leaves towards outputs and grab stmts from there. 
+ auto stmts = StmtSort::getStmts(fusion, allLeafOuts(fusion), true); + for (auto stmt : stmts) { + mutate(stmt); + } + } + + private: + using OptOutMutator::mutate; + void mutate(Val* val) final { + if (replacement_map_.find(val) == replacement_map_.end()) { + return OptOutMutator::mutate(val); + } + auto replaced_val = replacement_map_.at(val); + registerMutation(val, replaced_val); + } + + std::vector allLeafOuts(Fusion* fusion) { + auto exprs = StmtSort::getExprs(fusion, true); + std::unordered_set inputs; + std::unordered_set outputs; + std::vector ordered_outputs; + for (auto expr : exprs) { + inputs.insert(expr->inputs().begin(), expr->inputs().end()); + outputs.insert(expr->outputs().begin(), expr->outputs().end()); + ordered_outputs.insert( + ordered_outputs.end(), + expr->outputs().begin(), + expr->outputs().end()); + } + for (auto input : inputs) { + outputs.erase(input); + } + + std::vector ordered_leaf_outs; + for (auto out : ordered_outputs) { + if (outputs.find(out) != outputs.end()) { + ordered_leaf_outs.push_back(out); + } + } + return ordered_leaf_outs; + } + + const std::unordered_map& replacement_map_; +}; + +} // namespace + +void replaceValue( + Fusion* fusion, + const std::unordered_map& replacement_map) { + ValReplacementMutator(fusion, replacement_map); +} + +Val* getReductionInitValOf(TensorView* tv) { + auto def = tv->definition(); + if (def == nullptr) { + return nullptr; + } + + Val* init = nullptr; + if (auto rop = dynamic_cast(def)) { + init = rop->init(); + } else if (auto grop = dynamic_cast(def)) { + int output_idx = -1; + for (const auto i : c10::irange(grop->numReductions())) { + if (tv == grop->output(i)) { + output_idx = static_cast(i); + break; + } + } + TORCH_INTERNAL_ASSERT( + output_idx >= 0, + "Matching output not found for GroupedReductionOp: ", + tv->toString(), + ". Defined by: ", + def->toString()); + init = grop->initVal(output_idx); + } else if (auto wop = dynamic_cast(def)) { + if (tv == wop->outAvg()) { + init = wop->initAvg(); + } else if (tv == wop->outVar()) { + init = wop->initVar(); + } else { + TORCH_INTERNAL_ASSERT(tv == wop->outN()); + init = wop->initN(); + } + } else if (auto mma = dynamic_cast(def)) { + init = mma->init(); + } + + return init; } } // namespace ir_utils diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.h b/torch/csrc/jit/codegen/cuda/ir_utils.h index c8dc2e6f6796..0b05b6fb5e86 100644 --- a/torch/csrc/jit/codegen/cuda/ir_utils.h +++ b/torch/csrc/jit/codegen/cuda/ir_utils.h @@ -10,8 +10,14 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { + namespace ir_utils { +// Replace values in fusion using ValReplacementMutator +void replaceValue( + Fusion*, + const std::unordered_map& replacement_map); + template class FilterIterator { public: @@ -110,6 +116,9 @@ auto filterByType(InputIt first, InputIt last) { return FilteredView(first, last); } +template +auto filterByType(const ContainerType&& inputs) = delete; + template auto filterByType(const ContainerType& inputs) { return filterByType(inputs.cbegin(), inputs.cend()); @@ -144,17 +153,87 @@ TORCH_CUDA_CU_API TensorView* rfactorHelper( TensorView* red_tv, const std::vector& axes); -// Return immediate producers of tv +// Return immediate producers of val, this function can be used on any Val and +// will return producers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. 
This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector producerValsOf(Val* val); + +// Return immediate consumers of val, this function can be used on any Val and +// will return consumers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector consumerValsOf(Val* val); + +// Return immediate producers of vals, this function can be used on any vals and +// will return producers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector producerValsOf( + const std::vector& vals); + +// Return immediate consumers of vals, this function can be used on any vals and +// will return consumers through Exprs. +// +// Warning: returned val's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses val->definition() or val->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. +TORCH_CUDA_CU_API std::vector consumerValsOf( + const std::vector& vals); + +// Return immediate producers of tv, this function will return all immediate +// producers of tv through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. TORCH_CUDA_CU_API std::vector producerTvsOf(TensorView* tv); -// Return immediate consumers of tv +// Return immediate consumers of tv, this function will return all immediate +// consumers of tv through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. TORCH_CUDA_CU_API std::vector consumerTvsOf(TensorView* tv); -// Return immediate producers of tvs (can return tvs input) +// Return immediate producers of tvs, this function will return all immediate +// producers of tvs through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. 
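A minimal usage sketch of the producer/consumer helpers documented above (illustrative only, not part of the patch): the element types elided by the diff are assumed to be Val* / TensorView*, `tv` is a hypothetical TensorView*, and the code assumes the relevant headers are included inside namespace torch::jit::fuser::cuda.

// Hypothetical example: walk one Expr step around a TensorView `tv`.
// producerValsOf() follows tv->definition()->inputs(); consumerValsOf()
// follows tv->uses() and collects each use's outputs; both deduplicate.
void inspectNeighbors(TensorView* tv) {
  std::vector<Val*> producer_vals = ir_utils::producerValsOf(tv);
  std::vector<Val*> consumer_vals = ir_utils::consumerValsOf(tv);
  // TensorView-only variants of the same one-step traversal:
  std::vector<TensorView*> producer_tvs = ir_utils::producerTvsOf(tv);
  std::vector<TensorView*> consumer_tvs = ir_utils::consumerTvsOf(tv);
}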
TORCH_CUDA_CU_API std::vector producerTvsOf( const std::vector& tvs); -// Return immediate consumers of tvs (can return tvs input) +// Return immediate consumers of tvs, this function will return all immediate +// consumers of tvs through Exprs. +// +// Warning: returned tv's are not guaranteed to be between fusion inputs and +// outputs. This function simply uses tv->definition() or tv->uses() which is +// limited to not go through fusion inputs/outputs, but if on a path that isn't +// strictly between fusion inputs/outputs, it could effectively return dead +// code. TORCH_CUDA_CU_API std::vector consumerTvsOf( const std::vector& tvs); @@ -175,11 +254,24 @@ TORCH_CUDA_CU_API std::vector outputTvsOf( // returns all tensor views in fusion that are used between outputs and inputs. TORCH_CUDA_CU_API std::vector allTvs(Fusion* fusion); -// Returns the history of expressions applied to the domains of td -TORCH_CUDA_CU_API std::vector historyOf(TensorDomain* td); - -// Returns the history of expressions applied to the domains of tv -TORCH_CUDA_CU_API std::vector historyOf(TensorView* tv); +TORCH_CUDA_CU_API std::vector getReductionOps( + Fusion* fusion, + bool ignore_trivial = true); + +// Returns the initialization value of tv or nullptr if not initialized. +TORCH_CUDA_CU_API Val* getReductionInitValOf(TensorView* tv); + +template +std::string toString(const T& nodes) { + std::stringstream ss; + for (Statement* stmt : nodes) { + if (ss.tellp() != 0) { + ss << ", "; + } + ss << stmt->toString(); + } + return ss.str(); +} } // namespace ir_utils } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp index 344df98f5a75..6ae4e7374df5 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace torch { @@ -31,21 +32,98 @@ void remove_visited( } } +// Return all dependencies of a node including members of the node. 
+class RecursiveDependencies : public OptInDispatch { + public: + static std::vector next(Statement* stmt) { + RecursiveDependencies find_next(stmt); + return find_next.next_stmts_; + } + + private: + RecursiveDependencies() = default; + + RecursiveDependencies(Statement* stmt) { + handle(stmt); + } + + using OptInDispatch::handle; + + void handle(Expr* expr) final { + FusionGuard::getCurFusion()->assertInContainer( + expr, + "IterVisitor.cpp::RecursiveDependencies::handle(Expr*) Cannot traverse expr, "); + next_stmts_.insert( + next_stmts_.end(), expr->inputs().begin(), expr->inputs().end()); + } + + void handle(Val* val) final { + FusionGuard::getCurFusion()->assertInContainer( + val, + "IterVisitor.cpp::RecursiveDependencies::handle(Val*) Cannot traverse val, "); + OptInDispatch::handle(val); + } + + void simpleVal(Val* val) { + if (val->definition() == nullptr) { + return; + } + next_stmts_.push_back(val->definition()); + } + + void handle(Bool* stmt) final { + simpleVal(stmt); + } + + void handle(Double* stmt) final { + simpleVal(stmt); + } + + void handle(Int* stmt) final { + simpleVal(stmt); + } + + void handle(ComplexDouble* stmt) final { + simpleVal(stmt); + } + + void handle(NamedScalar* stmt) final { + simpleVal(stmt); + } + + void handle(IterDomain* stmt) final { + next_stmts_.push_back(stmt->start()); + next_stmts_.push_back(stmt->extent()); + next_stmts_.push_back(stmt->stopOffset()); + simpleVal(stmt); + } + + void handle(TensorDomain* stmt) final { + next_stmts_.insert( + next_stmts_.end(), stmt->domain().begin(), stmt->domain().end()); + simpleVal(stmt); + } + + void handle(TensorView* tv) final { + next_stmts_.push_back(tv->domain()); + simpleVal(tv); + } + + std::vector next_stmts_; +}; + } // namespace std::vector IterVisitor::next(Statement* stmt) { if (stmt->isVal()) { return next(stmt->as()); - } else if (stmt->isExpr()) { - return next(stmt->as()); } else { - TORCH_INTERNAL_ASSERT( - false, "IterVisitor could not detect type in next_dispatch."); + return next(stmt->as()); } } std::vector IterVisitor::next(Val* v) { - FusionGuard::getCurFusion()->assertInFusion(v, "Cannot traverse val, "); + FusionGuard::getCurFusion()->assertInContainer(v, "Cannot traverse val, "); if (v->definition() != nullptr) { return {v->definition()}; } @@ -53,7 +131,8 @@ std::vector IterVisitor::next(Val* v) { } std::vector IterVisitor::next(Expr* expr) { - FusionGuard::getCurFusion()->assertInFusion(expr, "Cannot traverse expr, "); + FusionGuard::getCurFusion()->assertInContainer( + expr, "Cannot traverse expr, "); std::vector next_stmts{ expr->inputs().begin(), expr->inputs().end()}; return next_stmts; @@ -93,7 +172,8 @@ void IterVisitor::handle(Val* v) { void IterVisitor::traverseFrom( Fusion* fusion, const std::vector& from, - bool traverseAllPaths) { + bool traverseAllPaths, + bool traverseIntoMembers) { FusionGuard fg(fusion); std::unordered_set visited; @@ -137,7 +217,8 @@ void IterVisitor::traverseFrom( } else { // We're not ready to process this node, so add all its inputs to be // checked Visit input nodes. - auto next_stmts = next(stmt); + auto next_stmts = + traverseIntoMembers ? RecursiveDependencies::next(stmt) : next(stmt); // We may want to retraverse nodes, in that case revisit everything! if (!traverseAllPaths) { // If we don't want to retraverse, remove nodes we already visisted. @@ -180,17 +261,29 @@ namespace { // expressions. class Inputs : public IterVisitor { private: - //! Optional list of all input vals. If empty, vals with no defining - //! 
expression are considered as inputs. + //! Optional list of input vals. While traversing to inputs if a value in the + //! all_inputs list is found, that value will be added to the inputs_ and + //! traversal will not go into its definition. Otherwise traversal follows + //! definition paths until hitting a definition that is a nullptr (i.e. a + //! terminating input). const std::vector& all_inputs_; std::vector inputs_; Inputs(const std::vector& all_inputs) : all_inputs_(all_inputs) {} + std::vector next(Val* v) override { + if (std::find(inputs_.begin(), inputs_.end(), v) != inputs_.end()) { + return {}; + } + return IterVisitor::next(v); + } + void handle(Val* val) override { - if ((all_inputs_.empty() && val->definition() == nullptr) || + // If there's no definition to val, or val is within the provided inputs + if (val->definition() == nullptr || std::find(all_inputs_.begin(), all_inputs_.end(), val) != all_inputs_.end()) { + // if not already placed in the inputs if (std::find(inputs_.begin(), inputs_.end(), val) == inputs_.end()) { inputs_.push_back(val); } @@ -308,7 +401,7 @@ void BackwardVisitor::traverseFrom( auto vals = AllVals::get(fusion, from); - auto exprs = ExprSort::getExprs(fusion, from); + auto exprs = StmtSort::getExprs(fusion, from); { size_t pos = 0; @@ -516,6 +609,9 @@ class DependentVals : public IterVisitor { std::unordered_set outs_; // Boundary where we want to stop searching beyond + // TODO: Based on the todo below, shouldn't we stop just at the definition of? + // If we really wanted to make this traverse left, wouldn't we first check + // which outputs are outputs dependent on of? std::unordered_set boundary_; std::vector next(Val* v) override { @@ -539,6 +635,11 @@ class DependentVals : public IterVisitor { } // optimization to limit search path + // TODO: Is this valid? Couldn't something like: + // out0 = of + val0 + // out1 = out0 + val1 + // out2 = TernaryOp(out1, val0, of) + // Hide the dep of out1 on of? 
void createBoundary() { for (auto v_of : of_) { for (auto v_expr : v_of->uses()) { @@ -704,22 +805,41 @@ std::unordered_set DependencyCheck::getAllDependentVals( return DependentVals::getAllDependentVals(of); } -void ExprSort::handle(Expr* expr) { - exprs.push_back(expr); +void StmtSort::handle(Statement* stmt) { + stmts.push_back(stmt); } -std::vector ExprSort::getExprs(Fusion* fusion) { - ExprSort es; - es.traverse(fusion); - return es.exprs; +std::vector StmtSort::getExprs(Fusion* fusion, bool traverse_members) { + auto terminating_outputs = fusion->getTerminatingOutputs(); + return StmtSort::getExprs(fusion, terminating_outputs, traverse_members); } -std::vector ExprSort::getExprs( +std::vector StmtSort::getExprs( Fusion* fusion, - const std::vector& from) { - ExprSort es; - es.traverseFrom(fusion, from, false); - return es.exprs; + const std::vector& from, + bool traverse_members) { + StmtSort es; + es.traverseFrom(fusion, from, false, traverse_members); + auto stmts = StmtSort::getStmts(fusion, from, traverse_members); + auto filter = ir_utils::filterByType(stmts.begin(), stmts.end()); + std::vector exprs(filter.begin(), filter.end()); + return exprs; +} + +std::vector StmtSort::getStmts( + Fusion* fusion, + bool traverse_members) { + auto terminating_outputs = fusion->getTerminatingOutputs(); + return StmtSort::getStmts(fusion, terminating_outputs, traverse_members); +} + +std::vector StmtSort::getStmts( + Fusion* fusion, + const std::vector& from, + bool traverse_members) { + StmtSort es; + es.traverseFrom(fusion, from, false, traverse_members); + return es.stmts; } void InputsOf::handle(Val* v) { diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.h b/torch/csrc/jit/codegen/cuda/iter_visitor.h index d4aa56ea2fef..2447933d7373 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.h +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -83,18 +83,21 @@ class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch { void traverseHelper(Fusion* fusion, bool traverse_all_paths = false); public: - // Starts at nodes provided in from, traverses from these nodes to inputs. - // Calls handle on all Statement*s in topological sorted order. - // traverseAllPaths = false only call handle on each Statement* once - // traverseAllPaths = true traverses all paths from nodes in from to inputs. - // Handle on a Statement* for every path from "from" nodes, to inputs. - // to argument allows specification of nodes to stop at if we want to stop - // beffore we hit all leaf nodes. This can be helpful when we want to traverse - // from TensorView::domain(), to the rfactor domain, instead of root domain. + //! Starts at nodes provided in from, traverses from these nodes to inputs. + //! Calls handle on all Statement*s in topological sorted order. + //! \param traverseAllPaths = false only call handle on each Statement* once + //! traverseAllPaths = true traverses all paths from nodes in from to + //! inputs. Calls handle on a Statement* for every path from "from" nodes, + //! to inputs. + //! \param traverseIntoMembers = When hitting nodes like TensorView, + //! TensorDomain, or IterDomain where there are members of the nodes that are + //! Val's a value of "true" will also traverse into those member Val's, a + //! value of "false" will not traverse into the members. 
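A brief sketch of the new traverse_members flag threaded through traverseFrom and StmtSort above (illustrative only, not part of the patch; `fusion` is a hypothetical Fusion* and the return element types elided by the diff are assumed to be Expr* / Statement*).

// Hypothetical example: topological sorts with and without member traversal.
// With traverse_members == true, RecursiveDependencies::next() is used, so
// IterDomains, extents, and their defining expressions are also visited.
void sortedStatements(Fusion* fusion) {
  auto exprs_only = StmtSort::getExprs(fusion, /*traverse_members=*/false);
  auto with_members = StmtSort::getStmts(fusion, /*traverse_members=*/true);
  // with_members additionally contains TensorDomain/IterDomain/extent nodes.
}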
void traverseFrom( Fusion* fusion, const std::vector& from, - bool traverseAllPaths = false); + bool traverseAllPaths = false, + bool traverseIntoMembers = false); // Iterates from terminating outputs registered with the fusion. Terminating // means value is not used to generate any other value used in producing @@ -246,18 +249,40 @@ class TORCH_CUDA_CU_API DependencyCheck { // Expr sort will take a fusion and return a topologically sorted list of // expressions. -class ExprSort : public IterVisitor { +class StmtSort : public IterVisitor { protected: - std::vector exprs; + std::vector stmts; - void handle(Expr* expr) override; + void handle(Statement* stmt) override; public: - static std::vector getExprs(Fusion* fusion); + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc + static std::vector getExprs( + Fusion* fusion, + bool traverse_members = false); + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc static std::vector getExprs( Fusion* fusion, - const std::vector& from); + const std::vector& from, + bool traverse_members = false); + + // If traverse_members it will also extract all member nodes in the sorted + // statement list in the fusion. i.e. all IterDomains, extents, and associated + // expressions of them + static std::vector getStmts( + Fusion* fusion, + bool traverse_members = false); + + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all IterDomains, extents, and associated + // expressions of them + static std::vector getStmts( + Fusion* fusion, + const std::vector& from, + bool traverse_members = false); }; class InputsOf : public IterVisitor { diff --git a/torch/csrc/jit/codegen/cuda/kernel.cpp b/torch/csrc/jit/codegen/cuda/kernel.cpp index d3ef9eeb95d5..cbbc4f53462e 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel.cpp @@ -1,7 +1,8 @@ #include +#include #include #include -#include +#include #include #include @@ -11,22 +12,24 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { + +IrBuilderPasskey::IrBuilderPasskey(IrContainer* ir_container) + : ir_container_(ir_container) {} + namespace kir { namespace { //! Scan all primary expressions in the Kernel IR and build //! 
lists of specialized nodes and other interesting information -class KernelIrScanner : private kir::IrVisitor { +class KernelIrScanner : private IrVisitor { public: explicit KernelIrScanner(const Kernel* kernel) { - for (const auto& ir_node : kernel->irNodes()) { - ir_node->accept(this); - } + IrVisitor::handle(kernel->topLevelExprs()); const auto gpu_lower = GpuLower::current(); for (auto split : gpu_lower->nonDivisibleSplitInfo().splitsToValidate()) { - auto extent = gpu_lower->lowerValue(split->in()->extent()); - auto factor = gpu_lower->lowerValue(split->factor()); + auto extent = split->in()->extent(); + auto factor = split->factor(); summary_.splits_to_validate.emplace_back(extent, factor); } } @@ -36,7 +39,17 @@ class KernelIrScanner : private kir::IrVisitor { } private: - void visit(const kir::Sync* sync) final { + using IrVisitor::handle; + void handle(Expr* expr) final { + IrVisitor::handle(expr); + for (auto inp : expr->inputs()) { + handle(inp); + } + for (auto out : expr->outputs()) { + handle(out); + } + } + void handle(BlockSync* sync) final { // TODO: Move to a dedicated validation pass // which is not on the common execution/compilation path if (sync->isWarHazardSync()) { @@ -44,17 +57,17 @@ class KernelIrScanner : private kir::IrVisitor { } } - void visit(const kir::Allocate* allocate) final { + void handle(GridSync* sync) final { + summary_.has_cooperative_grid_reduction = true; + } + + void handle(Allocate* allocate) final { switch (allocate->memoryType()) { case MemoryType::Global: summary_.global_allocations.push_back(allocate); break; case MemoryType::Shared: - if (ExpressionEvaluator::isConst(allocate->size())) { - summary_.static_smem_allocations.push_back(allocate); - } else { - summary_.dynamic_smem_allocations.push_back(allocate); - } + summary_.dynamic_smem_allocations.push_back(allocate); break; case MemoryType::Local: if (!ExpressionEvaluator::isConst(allocate->size())) { @@ -65,28 +78,23 @@ class KernelIrScanner : private kir::IrVisitor { } } - void visit(const kir::UnaryOp* unary_op) final { - if (unary_op->operation() == UnaryOpType::RandLike) { + void handle(UnaryOp* unary_op) final { + if (unary_op->getUnaryOpType() == UnaryOpType::RandLike) { // This kernel is using random numbers summary_.is_stochastic = true; } } - void visit(const kir::TensorIndex* tensor_index) final { + void handle(TensorIndex* tensor_index) final { const auto tv = tensor_index->view(); const auto domain = tv->domain(); - // Do we have any reductions? summary_.has_block_reductions = summary_.has_block_reductions || domain->hasBlockReduction(); - // Do we have block broadcasts? 
- summary_.has_block_broadcasts = - summary_.has_block_broadcasts || domain->hasBlockBroadcast(); - // Update the largest smem data type if (domain->hasBlockReduction() || domain->hasGridReduction() || - tv->memoryType() == MemoryType::Shared) { + tv->getMemoryType() == MemoryType::Shared) { const auto data_type = tv->dtype(); const size_t type_size = dataTypeSize(data_type); if (type_size > max_smem_type_size_) { @@ -94,38 +102,54 @@ class KernelIrScanner : private kir::IrVisitor { summary_.largest_smem_data_type = data_type; } } + } - // Update Welford - if (tensor_index->definition() != nullptr && - tensor_index->definition()->isA()) { - summary_.has_welford = true; - summary_.has_block_welford = - summary_.has_block_welford || domain->hasBlockReduction(); - summary_.has_grid_welford = - summary_.has_grid_welford || domain->hasGridReduction(); + void handle(WelfordOp* welford_op) final { + summary_.has_welford = true; + TORCH_INTERNAL_ASSERT(welford_op->outAvg()->isA()); + auto out_dom = welford_op->outAvg()->as()->view()->domain(); + summary_.has_block_welford = + summary_.has_block_welford || out_dom->hasBlockReduction(); + } + + void handle(GridWelford* grid_welford) final { + summary_.has_welford = true; + summary_.has_grid_welford = true; + summary_.has_grid_reductions = true; + if (grid_welford->welford_op()->isAllreduce()) { + summary_.has_cooperative_grid_reduction = true; } } - void visit(const kir::GridWelford* grid_welford) final { - const auto dom = grid_welford->welford_op() - ->out() - ->as() - ->view() - ->domain(); - updateGridReductionInLoop(dom); + void handle(GridReduction* grid_reduction) final { + summary_.has_grid_reductions = true; + if (grid_reduction->isAllreduce()) { + summary_.has_cooperative_grid_reduction = true; + } } - void visit(const kir::GridReduction* grid_reduction) final { - const auto dom = grid_reduction->reduction_op() - ->out() - ->as() - ->view() - ->domain(); + void handle(GroupedGridReduction* grid_reduction) final { + summary_.has_grid_reductions = true; + const auto dom = ir_utils::getTvOutput(grid_reduction)->domain(); updateGridReductionInLoop(dom); } - void visit(const kir::GridBroadcast*) final { + void handle(GridBroadcast* grid_broadcast) final { summary_.has_cooperative_grid_reduction = true; + handle(grid_broadcast->broadcast_op()); + } + + void handle(BroadcastOp* bop) final { + const ParallelTypeBitmap parallel_types = + GpuLower::current()->threadPredMap().getParallelBroadcastDomains( + bop->out()->as()->view()); + summary_.broadcast_parallel_types.emplace(bop, parallel_types); + // Do we have block broadcasts? + summary_.has_block_broadcasts = + summary_.has_block_broadcasts || parallel_types.hasTID(); + // Do we have grid broadcasts? + summary_.has_grid_broadcasts = + summary_.has_grid_broadcasts || parallel_types.hasBID(); } private: @@ -134,12 +158,9 @@ class KernelIrScanner : private kir::IrVisitor { private: void updateGridReductionInLoop(TensorDomain* dom) { - summary_.has_grid_reductions = true; - - const auto gpu_lower = GpuLower::current(); for (const auto i : c10::irange(dom->nDims())) { - const auto id = - gpu_lower->caParallelMap().getConcreteMappedID(dom->domain()[i]); + const auto id = GpuLower::current()->caMap()->getConcreteMappedID( + dom->domain()[i], IdMappingMode::LOOP); summary_.has_cooperative_grid_reduction = summary_.has_cooperative_grid_reduction || @@ -169,7 +190,7 @@ class KernelIrScanner : private kir::IrVisitor { //! MemoryType::Global for tensors parallelized with blockIdx), it is //! 
assumed that allocation is properly extended for the iteration //! count. -class ValidateAllocation : private kir::IrVisitor { +class ValidateAllocation : private OptOutConstDispatch { public: static void validate(const Kernel* kernel) { ValidateAllocation validate_allocation(kernel); @@ -178,14 +199,14 @@ class ValidateAllocation : private kir::IrVisitor { private: explicit ValidateAllocation(const Kernel* kernel) { live_allocations_.emplace_back(std::vector()); - for (const auto& ir_node : kernel->topLevelExprs()) { - ir_node->accept(this); + for (const auto& expr : kernel->topLevelExprs()) { + OptOutConstDispatch::handle(expr); } live_allocations_.pop_back(); TORCH_INTERNAL_ASSERT(live_allocations_.empty()); } - void visit(const kir::Allocate* allocate) final { + void handle(const Allocate* allocate) final { TORCH_INTERNAL_ASSERT(!live_allocations_.empty()); live_allocations_.back().push_back(allocate); } @@ -195,53 +216,53 @@ class ValidateAllocation : private kir::IrVisitor { // during in the allocation lowering if it's thread-parallel and not // allocated on shared or global memories, or if it's block-parallel // ando not allocated on global memory. - void validate(const kir::ForLoop* for_loop) { + void validate(const ForLoop* for_loop) { const auto loop_id = for_loop->iter_domain(); - const auto gpu_lower = GpuLower::current(); for (const auto& allocations : live_allocations_) { for (const auto& allocate : allocations) { - const auto tv = dynamic_cast(allocate->buffer()); + const auto tv = dynamic_cast(allocate->buffer()); if (tv == nullptr) { continue; } for (const auto& axis : tv->domain()->domain()) { - if (!gpu_lower->caParallelMap().areMapped(loop_id, axis)) { + if (!GpuLower::current()->caMap()->areMapped( + loop_id, axis, IdMappingMode::LOOP)) { continue; } - if (isParallelTypeThreadDim(loop_id->parallelType())) { + if (isParallelTypeThreadDim(loop_id->getParallelType())) { TORCH_INTERNAL_ASSERT( - tv->memoryType() == MemoryType::Shared || - tv->memoryType() == MemoryType::Global, + tv->getMemoryType() == MemoryType::Shared || + tv->getMemoryType() == MemoryType::Global, "Tensor t", tv->name(), " must be allocated on SMEM or GMEM."); - } else if (isParallelTypeBlockDim(loop_id->parallelType())) { - TORCH_INTERNAL_ASSERT(tv->memoryType() == MemoryType::Global); + } else if (isParallelTypeBlockDim(loop_id->getParallelType())) { + TORCH_INTERNAL_ASSERT(tv->getMemoryType() == MemoryType::Global); } } } } } - void visit(const kir::ForLoop* for_loop) final { + void handle(const ForLoop* for_loop) final { if (for_loop->stop() != for_loop->iter_domain()->extent() && - isParallelTypeThread(for_loop->iter_domain()->parallelType())) { + isParallelTypeThread(for_loop->iter_domain()->getParallelType())) { validate(for_loop); } live_allocations_.emplace_back(std::vector()); for (const auto& expr : for_loop->body().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } live_allocations_.pop_back(); } - void visit(const kir::IfThenElse* ite) final { + void handle(const IfThenElse* ite) final { for (const auto& expr : ite->thenBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } for (const auto& expr : ite->elseBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } @@ -252,14 +273,18 @@ class ValidateAllocation : private kir::IrVisitor { } // namespace // TODO(kir): Kernel IR validation -void Kernel::finalize(std::vector top_level_exprs) { - TORCH_CHECK(top_level_exprs_.empty()); +void Kernel::finalize(std::vector 
top_level_exprs) { + TORCH_INTERNAL_ASSERT(top_level_exprs_.empty()); top_level_exprs_ = std::move(top_level_exprs); - predicate_map_ = std::make_unique( - GpuLower::current()->threadPredMap()); warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo(); ValidateAllocation::validate(this); analyze(); + // Make sure this is after analyze as it sets summary_ + summary_.vectorized_accesses = GpuLower::current()->vectorizedAccesses(); + summary_.vectorized_set_info = GpuLower::current()->vectorizedSetInfo(); + summary_.sync_map = GpuLower::current()->syncMap(); + summary_.parallel_dimension_map_ = + GpuLower::current()->parallelDimensionMap(); } void Kernel::analyze() { @@ -270,8 +295,67 @@ void Kernel::analyze() { } void Kernel::print() const { - kir::IrPrinter ir_printer(std::cout); - ir_printer.printKernel(this); + IrPrinter ir_printer(std::cout); + ir_printer.handle(this); +} + +//! Register the Val with this fusion +void Kernel::registerVal(Val* val) { + if (inContainer(val)) { + return; + } + if (val->kernel()) { + TORCH_CHECK( + val->kernel() == this, + val->toString(), + " was not found in the active kernel."); + } + + Fusion::registerVal(val); +} + +//! Register expr with this fusion. +//! When we register an expression, we want to update the dependency tracking +//! of Vals. We add expr to our general expr_set_, +void Kernel::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + + if (expr->kernel()) { + TORCH_CHECK( + expr->kernel() == this, + expr->toString(), + " was not found in the active kernel."); + } + + for (Val* input : expr->inputs()) { + TORCH_INTERNAL_ASSERT( + inContainer(input), + "Input\n", + input->toString(), + " to expr,\n", + expr->toString(), + ",\n is invalid because it is not in the same kernel."); + } + + for (Val* output : expr->outputs()) { + TORCH_INTERNAL_ASSERT( + inContainer(output), + "Output\n", + output->toString(), + " to expr,\n", + expr->toString(), + ",\n is invalid because it is not in the same kernel."); + } + + // Register expr is explicitly non-SSA when coming from a kernel. This is + // detected inside Fusion::registerExpr + Fusion::registerExpr(expr); +} + +std::vector& KernelInternalProxy::topLevelExprs() { + return kernel_->top_level_exprs_; } } // namespace kir diff --git a/torch/csrc/jit/codegen/cuda/kernel.h b/torch/csrc/jit/codegen/cuda/kernel.h index b273324e1e24..4930da1a2872 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.h +++ b/torch/csrc/jit/codegen/cuda/kernel.h @@ -1,12 +1,18 @@ #pragma once -#include -#include -#include +#include + +#include +#include +#include +#include #include +#include #include +#include #include +#include #include #include @@ -47,6 +53,9 @@ struct KernelSummary { //! Do we have any block broadcasts? bool has_block_broadcasts = false; + //! Do we have any grid broadcasts? + bool has_grid_broadcasts = false; + //! Do we have any welford op? bool has_welford = false; @@ -67,85 +76,69 @@ struct KernelSummary { std::vector dynamic_lmem_allocations; //! ceilDiv extents that must be divisible - std::vector> splits_to_validate; + std::vector> splits_to_validate; + + //! Effective ParallelTypes of broadcast ops + std::unordered_map + broadcast_parallel_types; + + //! Track which tensor views are inputs or outputs of a vectorized operation + //! and their maximum vectorized access size + std::unordered_map vectorized_accesses; + + // Sync map is needed to figure out if global memory buffers need to be marked + // as volatile because they're used for communication. 
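A hedged sketch of how the expanded KernelSummary might be consumed after lowering (illustrative only, not part of the patch; `kernel` is a hypothetical kir::Kernel* and the fields are the ones added above).

// Hypothetical example: query the summary built by KernelIrScanner.
void reportKernelTraits(const kir::Kernel* kernel) {
  const auto& summary = kernel->summary();
  if (summary.has_cooperative_grid_reduction) {
    // Allreduce-style grid reductions/welfords require a cooperative launch.
  }
  if (summary.has_block_broadcasts || summary.has_grid_broadcasts) {
    // Effective parallel types per BroadcastOp are recorded in
    // summary.broadcast_parallel_types.
  }
}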
+ SyncMap sync_map; + + // Parallel dimension map needed to set the correct properties of grid buffers + // (is a dim inactive) + ParallelDimensionMap parallel_dimension_map_; + + //! Track information on vectorized set operations for runtime validation + std::vector vectorized_set_info; }; +class KernelInternalProxy; + //! Container for a lowered Kernel IR //! -//! TODO(kir): currently, it is just pointing to nodes owned -//! by a Fusion object. The goal is to have the Kernel object -//! own the Kernel IR nodes -//! // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API Kernel final : public NonCopyable { +class TORCH_CUDA_CU_API Kernel final : public Fusion { + friend KernelInternalProxy; + public: - Kernel() = default; + // Kernel starts by grabbing all the nodes from the provided fusion. + // Kernel is not SSA, if a definition is not set, we should update it, but + // not remove previous definition if it is set. This is primarily because when + // we do something like generate an initialization statement for a reduction + // TV, we may want to continue to do fusion like analysis on the original + // expression. + // TODO: Assert index type is int or int32 + Kernel(Fusion* fusion, DataType index_type = DataType::Int) + : Fusion(*fusion), index_type_(index_type) {} + + Kernel() = delete; + + // No move or copy semantics + Kernel(const Kernel&) = delete; + Kernel& operator=(const Kernel&) = delete; //! Finalize a kernel definition //! //! At this point we have a complete kernel definition and we can - //! run analysis passes to build a KernelSummary - //! - void finalize(std::vector top_level_exprs); - - //! Register input as an input of the kernel - void addInput(Val* input) { - inputs_.push_back(input); - input_set_.insert(input); - } - - //! Register output as an output of the kernel - void addOutput(Val* output) { - outputs_.push_back(output); - output_set_.insert(output); - } - - const auto& inputs() const { - return inputs_; - } - - const auto& outputs() const { - return outputs_; - } - - bool isInput(Val* val) const { - return input_set_.find(val) != input_set_.end(); - } - - bool isOutput(Val* val) const { - return output_set_.find(val) != output_set_.end(); - } + //! run analysis passes to build a KernelSummary. + void finalize(std::vector top_level_exprs); - const auto& topLevelExprs() const { + const std::vector& topLevelExprs() const { return top_level_exprs_; } - const auto& irNodes() const { - return ir_nodes_; - } - const KernelSummary& summary() const { return summary_; } - const ThreadPredicateMap& predicateMap() const { - return *predicate_map_; - } - - //! Register a new Kernel IR node - //! - //! \note This is a specialized helper for kir::IrBuilder, not - //! intendted for general use - //! - void registerIrNode(kir::Passkey passkey, std::unique_ptr node) { - TORCH_CHECK(passkey.kernel == this); - ir_nodes_.push_back(std::move(node)); - } - - //! Allocates a new value identifier - kir::ValueId newValueId(kir::Passkey passkey) { - TORCH_CHECK(passkey.kernel == this); - return next_value_id_++; + DataType indexType() const { + return index_type_; } //! Checks if parallel type is padded @@ -161,35 +154,45 @@ class TORCH_CUDA_CU_API Kernel final : public NonCopyable { //! Debug dump of the Kernel IR void print() const; + protected: + //! Register the Val with this fusion + void registerVal(Val* val) override; + + //! Register expr with this fusion. + //! When we register an expression, we want to update the dependency tracking + //! of Vals. 
We add expr to our general expr_set_, + void registerExpr(Expr* expr) override; + private: // Analyze the kernel IR and caches the summary of interesting data void analyze(); - private: - // Kernel IR nodes - std::vector> ir_nodes_; - // Top level statements - std::vector top_level_exprs_; - - // Kernel inputs and outputs - std::vector inputs_; - std::vector outputs_; - std::unordered_set input_set_; - std::unordered_set output_set_; - - // Used to allocate unique value IDs - kir::ValueId next_value_id_ = 1; + std::vector top_level_exprs_; // Summary of interesting kernel data KernelSummary summary_; - // Predicate map - // TODO(kir): consider a simpler, kernel IR based version - std::unique_ptr predicate_map_; + // Is this kernel being compiled with int32 or int64 indexing. This + // information is required to resolve DataType::Index + DataType index_type_ = DataType::Int; + WarpPaddedParallelInfo warp_padded_parallel_info_; }; +//! A special debugging proxy for Kernel. +//! +//! Should not be used for other than testing and debugging. +class TORCH_CUDA_CU_API KernelInternalProxy { + public: + KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {} + + std::vector& topLevelExprs(); + + private: + Kernel* kernel_ = nullptr; +}; + } // namespace kir } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 39350876bd2b..ccdbb2eb1d9b 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -3,10 +3,13 @@ #include #include #include +#include #include +#include #include #include +#include namespace torch { namespace jit { @@ -25,6 +28,10 @@ int getCommonDeviceCUDA(const at::ArrayRef& inputs) { continue; } const auto& device = input.toTensor().device(); + // skip cpu scalar tensor as they'll be promoted to scalar later + if (device.is_cpu() && is_cpu_scalar(input.toTensor())) { + continue; + } TORCH_CHECK(device.is_cuda(), "nvfuser only supports cuda device"); auto cur_index = device.index(); if (index != -1 && index != cur_index) { @@ -72,6 +79,11 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( encodeBuffer(stride, encoding_); encoding_.push_back(' '); } + encoding_.push_back('a'); + encodeBuffer( + SchedulerRuntimeInfo::computeAlignmentSize( + (size_t)input_tensor.data_ptr()), + encoding_); encoding_.push_back('d'); encodeBuffer(input_tensor.device().index(), encoding_); } else { @@ -80,9 +92,6 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( } encoding_.push_back(';'); } - if (additional_info) { - encodeBuffer(additional_info->getCommonAlignmentSize(), encoding_); - } auto& entry = encoding_lookup_[encoding_]; @@ -113,7 +122,11 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( } FusionExecutorCache::FusionExecutorCache(std::unique_ptr fusion) - : fusion_(std::move(fusion)) {} + : fusion_(std::move(fusion)) { + for (const auto& indices : fusion_->getOutputAliasIndices()) { + aliased_output_indices_.insert(indices); + } +} // Note [ Permutation support in nvfuser ] // @@ -182,6 +195,12 @@ std::vector FusionExecutorCache::runFusionWithInputs( outputs[pair.first] = outputs[pair.first].permute(pair.second); } + int offset = 0; + for (const auto& v : aliased_output_indices_) { + outputs.erase(outputs.begin() + v - offset); + offset++; + } + return outputs; } @@ -202,9 +221,9 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( } // Access kernels associated with the common device id - auto dev_id = 
getCommonDeviceCUDA(inputs); - TORCH_INTERNAL_ASSERT(dev_id >= 0); - auto& kernel_runtimes = kernel_runtimes_[dev_id]; + auto device_index = getCommonDeviceCUDA(inputs); + TORCH_CHECK(device_index >= 0, "device is not coherent for fusion inputs"); + auto& kernel_runtimes = kernel_runtimes_[device_index]; // Check for re-use hit case // a kernel runtime is re-usable if all the compiled @@ -258,6 +277,8 @@ FusionKernelRuntime::FusionKernelRuntime( std::make_unique(fusion_copy.get()); //! Try to schedule the complete fusion + scheduler_debug_utils::canScheduleMessage( + "***Runtime***: Try to schedule fusion un-segmented:\n"); const auto maybe_complete_fusion_heuristic = SchedulerEntry::proposeHeuristics(fusion_copy.get(), runtime_info); @@ -277,14 +298,6 @@ FusionKernelRuntime::FusionKernelRuntime( } else { auto complete_fusion_heuristic = maybe_complete_fusion_heuristic.value(); - // Translate welfords if apply - if (fusion_copy->hasWelford()) { - bool translated = SegmentCandidateFinder::TranslateWelfordInFusion( - fusion_copy.get(), inputs); - if (translated) { - complete_fusion_heuristic = ScheduleHeuristic::Persistent; - } - } // Take ownership of the transformed fusion single_kernel_fusion_ = std::move(fusion_copy); @@ -358,7 +371,7 @@ std::vector FusionKernelRuntime::runKernelWithInput( launch_params = scheduler_entry->pointwiseParams().lparams; } executors_[group_id].compileFusion( - fusion_to_run.get(), options, inputs, launch_params); + fusion_to_run.get(), inputs, launch_params, options); } else { // Load launch params for reduction and normalization kernels if (scheduler_entry->hasReductionParam()) { @@ -370,7 +383,6 @@ std::vector FusionKernelRuntime::runKernelWithInput( if (profiling_) { most_recent_executor_log_.fusion_executor = &executors_[group_id]; - most_recent_executor_log_.launch_constraints = launch_params; if (scheduler_entry->hasReductionParam()) { most_recent_executor_log_.reduction_params = scheduler_entry->reductionParams(); @@ -380,7 +392,49 @@ std::vector FusionKernelRuntime::runKernelWithInput( } } - return executors_[group_id].runFusion(inputs, launch_params, input_id); + auto& executor = executors_[group_id]; + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + executor.setMeasureKernelTimeFlag(true); + } + + auto outputs = executor.runFusion(inputs, launch_params, input_id); + + // Print relevant information all at once for easy debuging of perf + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + std::cout << "\nRun kernel:\n"; + if (sg) { + segmented_fusion_->makeFusion(sg)->printMath(); + } else { + single_kernel_fusion_->printMath(); + } + std::cout << "With inputs:\n"; + for (auto inp : inputs) { + if (inp.isTensor()) { + auto inp_tensor = inp.toTensor(); + std::cout << " " << inp_tensor.dtype() << " " << inp_tensor.sizes() + << " " << inp_tensor.strides() << "\n"; + } else { + std::cout << " " << inp << "\n"; + } + } + std::cout << "Compiler log: " << executor.compilerLog() << "\n"; + if (scheduler_entry->hasReductionParam()) { + std::cout << scheduler_entry->reductionParams().toString() << "\n"; + } else { + std::cout << scheduler_entry->pointwiseParams().toString() << "\n"; + } + std::cout << "With arguments: " << executor.lastLaunchParams().toString(); + std::cout << executor.kernelName() << " " << executor.bytesProcessed() + << " bytes/ " << std::setprecision(3) << executor.kernelTimeMs() + << " ms " + << ((double)executor.bytesProcessed() / + ((double)executor.kernelTimeMs() / 1000)) / + (double)1.0e9 + << " GB/s" << 
std::endl; + executor.setMeasureKernelTimeFlag(false); + } + + return outputs; } void FusionKernelRuntime::prepareRuntimeOrder() { @@ -443,7 +497,9 @@ void FusionKernelRuntime::prepareRuntimeOrder() { std::vector FusionKernelRuntime::runWithInput( const at::ArrayRef& inputs, size_t input_id) { - if (is_segmented_) { + if (!is_segmented_) { + return runKernelWithInput(inputs, input_id); + } else { FUSER_PERF_SCOPE("FusionKernelRuntime::runMultiKernelWithInput"); TORCH_INTERNAL_ASSERT( @@ -453,6 +509,7 @@ std::vector FusionKernelRuntime::runWithInput( " inputs but expecting ", segmented_fusion_->inputs().size()); + c10::Device device(c10::DeviceType::CUDA, 0); int extent_index_ = 0; // Bind input in the tensor_map for (const auto i : c10::irange(inputs.size())) { @@ -466,6 +523,7 @@ std::vector FusionKernelRuntime::runWithInput( // more convenient and safer than replication if (inputs[i].isTensor()) { auto aten_tensor = inputs[i].toTensor(); + device = aten_tensor.device(); for (auto dim_size : aten_tensor.sizes()) { runtime_workspace_.tensor_map.emplace( runtime_workspace_.group_extent_binding_order[extent_index_++], @@ -474,6 +532,10 @@ std::vector FusionKernelRuntime::runWithInput( } } + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + std::cout << "=================RUNNING FUSION SEGMENTS=================" + << std::endl; + } for (auto group_to_run : runtime_workspace_.group_run_order) { // Prepare input vector for (auto input : group_to_run->inputs()) { @@ -497,6 +559,10 @@ std::vector FusionKernelRuntime::runWithInput( runtime_workspace_.group_runtime_outputs.clear(); } + if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { + std::cout << "=============FINISHED RUNNING FUSION SEGMENTS============" + << std::endl; + } // Produce final global output std::vector fusion_outputs; for (auto output : segmented_fusion_->outputs()) { @@ -504,14 +570,36 @@ std::vector FusionKernelRuntime::runWithInput( if (iter != runtime_workspace_.tensor_map.end()) { fusion_outputs.push_back(iter->second); } else { + bool empty_type_check = output->getDataType().has_value() && + output->getDataType().value() == DataType::Float; + + // Only support two cases of empty tensor here, since + // this is hot path. + auto out_tv = output->as(); + + // TODO: should be only one of the two once the "empty" + // definition has been unified throughout the ops. + bool empty_tensor_check = + out_tv->isZeroDim() || out_tv->isEmptyTensor(); + // This is the check for an empty tensor; TORCH_INTERNAL_ASSERT( - output->as()->nDims() == 0 && - output->getDataType().has_value() && - output->getDataType().value() == DataType::Float, - "Non empty tensor cannot be found at tensor_map in ", + empty_tensor_check && empty_type_check, + "Is empty tensor? ", + !empty_tensor_check, + " Is empty type check? ", + !empty_type_check, + " Output empty tensor check failed for tensor: ", + out_tv->toString(), + " In function: ", __FUNCTION__); - fusion_outputs.emplace_back(at::Tensor()); + + // TODO: would need to clean up this part when + // we have a unified and consistent way to generate + // size-0 tensors. 
+ const auto tensor_options = + at::TensorOptions().dtype(at::kFloat).device(device); + fusion_outputs.emplace_back(at::empty({0}, tensor_options)); } } @@ -529,8 +617,6 @@ std::vector FusionKernelRuntime::runWithInput( runtime_workspace_.tensor_map.clear(); return fusion_output_tensors; - } else { - return runKernelWithInput(inputs, input_id); } } @@ -619,6 +705,8 @@ void GraphCache::createFusion(const std::shared_ptr& graph) { fusion_executor_cache_ = std::make_unique(parseJitIR(graph)); + + num_of_outputs_ = graph->outputs().size(); } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) @@ -627,6 +715,8 @@ GraphCache::GraphCache(const std::shared_ptr& graph) { TORCH_INTERNAL_ASSERT( IsNewExecutorEnabled(), "legacy executor is not supported by nvfuser"); + GRAPH_DEBUG("GraphCache constructor: ", this); + GRAPH_DUMP("GraphCache created for graph", graph); createFusion(graph); } @@ -634,7 +724,16 @@ std::vector GraphCache::runGraphWithInputs( const at::ArrayRef& inputs) { FUSER_PERF_SCOPE("GraphCache::runGraphWithInputs"); - return fusion_executor_cache_->runFusionWithInputs(inputs); + GRAPH_DEBUG("running GraphCache: ", this); + auto outputs = fusion_executor_cache_->runFusionWithInputs(inputs); + TORCH_INTERNAL_ASSERT( + outputs.size() == num_of_outputs_, + "FusionExecutorCache returned ", + outputs.size(), + " outputs, doesn't match computational graph, which requires ", + num_of_outputs_); + + return outputs; } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/torch/csrc/jit/codegen/cuda/kernel_cache.h index ae84c25e4f23..2958822a2f81 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.h +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.h @@ -7,8 +7,8 @@ #include #include +#include #include -#include #include #include @@ -27,7 +27,6 @@ class SchedulerRuntimeInfo; struct ExecutorLog { c10::optional reduction_params = c10::nullopt; c10::optional pointwise_params = c10::nullopt; - c10::optional launch_constraints = c10::nullopt; FusionExecutor* fusion_executor = nullptr; }; @@ -127,9 +126,8 @@ class TORCH_CUDA_CU_API FusionKernelRuntime { private: //! Interface to run a single kernel, either one kernel for single-kernel - //! fusions, - //! or a kernel for a segmentedGrouup in a segmented fusion. Returns the - //! kernel outputs. + //! fusions, or a kernel for a segmentedGrouup in a segmented fusion. Returns + //! the kernel outputs. std::vector runKernelWithInput( const at::ArrayRef& inputs, size_t input_id, @@ -410,6 +408,11 @@ class TORCH_CUDA_CU_API FusionExecutorCache { //! TODO: this can be largely expanded to look at complete //! caching profiles. Currently it just makes it easier to test FusionKernelRuntime* most_recent_runtime_ = nullptr; + + //! indices of fusion outputs that are aliased to inputs. These are used only + //! to support in-place update and should have been dropped before pushing + //! outputs to stack. + std::set aliased_output_indices_; }; class GraphCache { @@ -426,15 +429,15 @@ class GraphCache { const at::ArrayRef& inputs); private: - //! Computation graph; - std::shared_ptr graph_; - //! construct FusionExecutorCache void createFusion(const std::shared_ptr& graph); private: //! FusionExecutorCache that performs schedule and kernel execution; std::unique_ptr fusion_executor_cache_; + + //! 
num of outputs + size_t num_of_outputs_ = 0; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp index 7421d2e235a6..3605f7a4155f 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp @@ -1,7 +1,6 @@ #include #include -#include #include @@ -16,11 +15,11 @@ void ExpressionEvaluator::bind( Int::ScalarType concrete_value) { TORCH_CHECK(value->isScalar()); TORCH_CHECK(value->dtype() == DataType::Int); - TORCH_CHECK(!value->isConst(), "Tried to bind to a constant value"); + TORCH_CHECK(!value->isConstScalar(), "Tried to bind to a constant value"); TORCH_CHECK( value->definition() == nullptr, "Tried to bind to a value that is computed in the kernel IR: ", - toString(value), + value->toString(), " with ", concrete_value); known_values_[value] = concrete_value; @@ -41,14 +40,18 @@ void ExpressionEvaluator::bind( c10::optional ExpressionEvaluator::evaluate(const Val* value) { if (precomputed_integers_ && precomputed_integers_->ready()) { - return precomputed_integers_->getMaybeValueFor(value); - } else if (value->isScalar() && value->isConst()) { + if (precomputed_integers_->getMaybeValueFor(value).has_value()) { + return precomputed_integers_->getMaybeValueFor(value); + } + } + + if (value->isScalar() && value->isConst()) { return value->as()->value(); } else { FUSER_PERF_SCOPE("kir::ExpressionEvaluator::evaluate"); - TORCH_CHECK(value->isScalar()); - TORCH_CHECK(value->dtype() == DataType::Int); + TORCH_CHECK(value->isScalar(), value->toString()); + TORCH_CHECK(value->dtype() == DataType::Int, value->toString()); // Is the value known (either explicit binding or memoized)? const auto pre_eval_it = known_values_.find(value); @@ -56,7 +59,7 @@ c10::optional ExpressionEvaluator::evaluate(const Val* value) { return pre_eval_it->second; } - value->accept(this); + OptOutConstDispatch::handle(value); const auto post_eval_it = known_values_.find(value); return post_eval_it != known_values_.end() @@ -74,24 +77,23 @@ void ExpressionEvaluator::print() const { std::cout << "\nEvaluation context\n"; std::cout << "--------------------\n"; for (const auto& kv : known_values_) { - std::cout << toString(kv.first) << " = " << kv.second << "\n"; + std::cout << kv.first->toString() << " = " << kv.second << "\n"; + } + std::cout << "\nPre-computed Values\n"; + if (precomputed_integers_ != nullptr) { + precomputed_integers_->print(); } std::cout << "--------------------\n\n"; } -void ExpressionEvaluator::unhandled(const void*) { - TORCH_INTERNAL_ASSERT( - false, "Kernel IR expression evaluation reached an unsupported node"); -} - -void ExpressionEvaluator::visit(const Int* value) { +void ExpressionEvaluator::handle(const Int* value) { TORCH_INTERNAL_ASSERT(!value->isConst()); if (auto def = value->definition()) { - def->accept(this); + OptOutConstDispatch::handle(def); } } -void ExpressionEvaluator::visit(const NamedScalar* named_scalar) { +void ExpressionEvaluator::handle(const NamedScalar* named_scalar) { const auto& name = named_scalar->name(); for (auto pt : kParallelTypeThreads) { auto pt_val_it = known_parallel_dimensions_.find(pt); @@ -105,10 +107,10 @@ void ExpressionEvaluator::visit(const NamedScalar* named_scalar) { } } -void ExpressionEvaluator::visit(const UnaryOp* unary_op) { +void ExpressionEvaluator::handle(const UnaryOp* unary_op) { const auto in = evaluate(unary_op->in()); if (in.has_value()) { - switch (unary_op->operation()) { + switch 
(unary_op->getUnaryOpType()) { case UnaryOpType::Neg: known_values_[unary_op->out()] = -*in; break; @@ -121,11 +123,11 @@ void ExpressionEvaluator::visit(const UnaryOp* unary_op) { } } -void ExpressionEvaluator::visit(const BinaryOp* binary_op) { +void ExpressionEvaluator::handle(const BinaryOp* binary_op) { const auto lhs = evaluate(binary_op->lhs()); const auto rhs = evaluate(binary_op->rhs()); if (lhs.has_value() && rhs.has_value()) { - switch (binary_op->operation()) { + switch (binary_op->getBinaryOpType()) { case BinaryOpType::Add: known_values_[binary_op->out()] = *lhs + *rhs; break; diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h index 647913875430..63586857ad85 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h @@ -1,7 +1,9 @@ #pragma once -#include +#include + +#include #include #include @@ -34,7 +36,7 @@ namespace kir { //! } //! ``` //! -class TORCH_CUDA_CU_API ExpressionEvaluator : private IrVisitor { +class TORCH_CUDA_CU_API ExpressionEvaluator : private OptInConstDispatch { public: //! Set a concrete value for a symbolic value void bind(const Val* value, Int::ScalarType concrete_value); @@ -56,11 +58,10 @@ class TORCH_CUDA_CU_API ExpressionEvaluator : private IrVisitor { } private: - void unhandled(const void*) final; - void visit(const Int* value) final; - void visit(const NamedScalar* named_scalar) final; - void visit(const UnaryOp* unary_op) final; - void visit(const BinaryOp* binary_op) final; + void handle(const Int* value) final; + void handle(const NamedScalar* named_scalar) final; + void handle(const UnaryOp* unary_op) final; + void handle(const BinaryOp* binary_op) final; private: std::unordered_map known_values_; diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp index eebfd41729cd..35537f7a4fcb 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp @@ -1,8 +1,7 @@ +#include #include #include #include -#include -#include #include #include #include @@ -15,369 +14,52 @@ namespace fuser { namespace cuda { namespace kir { -void Node::print() const { - std::cout << "\n"; - IrPrinter(std::cout).printNode(this); - std::cout << "\n"; -} - -Val::Val(Passkey passkey, DataType dtype) : Node(passkey), dtype_(dtype) { - // NOLINTNEXTLINE: https://bugs.llvm.org/show_bug.cgi?id=48534 - id_ = passkey.kernel->newValueId(passkey); -} - -namespace { - -// Traverse definition of all values involved in constructing the provided val. -// Check if all values involved are constant values, meaning the provided -// val is also a constant value. 
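// The kernel_expr_evaluator.cpp/.h hunks above swap the old IrVisitor
// accept() dispatch for OptOutConstDispatch::handle(), but the evaluation
// strategy itself is unchanged: bind concrete integers to symbolic scalars,
// then fold expressions bottom-up while memoizing results. A minimal sketch
// of that strategy follows; the Toy* types are illustrative stand-ins, not
// nvFuser's actual classes.
#include <cstdint>
#include <optional>
#include <unordered_map>

enum class ToyOpType { Add, Mul };

struct ToyBinaryOp;

struct ToyVal {
  std::optional<int64_t> constant;          // set for literal scalars
  const ToyBinaryOp* definition = nullptr;  // producing expression, if any
};

struct ToyBinaryOp {
  ToyOpType op;
  const ToyVal* lhs;
  const ToyVal* rhs;
};

class ToyExpressionEvaluator {
 public:
  // Mirrors ExpressionEvaluator::bind: attach a concrete value to a symbol.
  void bind(const ToyVal* v, int64_t value) {
    known_values_[v] = value;
  }

  // Mirrors ExpressionEvaluator::evaluate: constants and previously bound or
  // memoized values are returned directly, everything else is folded through
  // its definition (cf. the switch on getBinaryOpType() above).
  std::optional<int64_t> evaluate(const ToyVal* v) {
    if (v->constant.has_value()) {
      return v->constant;
    }
    auto it = known_values_.find(v);
    if (it != known_values_.end()) {
      return it->second;
    }
    if (v->definition == nullptr) {
      return std::nullopt;
    }
    auto lhs = evaluate(v->definition->lhs);
    auto rhs = evaluate(v->definition->rhs);
    if (!lhs.has_value() || !rhs.has_value()) {
      return std::nullopt;
    }
    const int64_t result =
        v->definition->op == ToyOpType::Add ? *lhs + *rhs : *lhs * *rhs;
    known_values_[v] = result;  // memoize, like known_values_ in the real class
    return result;
  }

 private:
  std::unordered_map<const ToyVal*, int64_t> known_values_;
};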
-class ConstCheck : IrVisitor { - private: - bool is_const_ = true; - - using IrVisitor::visit; - - void visit(const Bool* b) override { - is_const_ = is_const_ && b->isConst(); - } - - void visit(const Double* d) override { - is_const_ = is_const_ && d->isConst(); - } - - void visit(const Int* i) override { - is_const_ = is_const_ && i->isConst(); - } - - void visit(const NamedScalar* ns) override { - is_const_ = is_const_ && false; - } - - void visit(const Expr* expr) { - for (auto inp : expr->inputs()) { - visit(inp); - } - } - - void visit(const Val* val) { - if (val->definition() != nullptr) { - visit(val->definition()); - } else { - val->accept(this); - } - } - - public: - static bool isConst(const Val* val) { - ConstCheck cc; - cc.visit(val); - return cc.is_const_; - } -}; - -} // namespace - -bool Val::isConstScalar() const { - if (!isScalar()) - return false; - return ConstCheck::isConst(this); -} - -Expr* Expr::parentScope() const { - if (scope()) { - return scope()->owner(); - } else { - return nullptr; - } -} - -NamedScalar* NamedScalar::getParallelDim(ParallelType p_type) { - std::string parallel_dim = stringifyThreadSize(p_type); - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - return ir_builder.create(parallel_dim, DataType::Int); -} - -NamedScalar* NamedScalar::getParallelIndex(ParallelType p_type) { - std::string parallel_ind = stringifyThread(p_type); - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - return ir_builder.create(parallel_ind, DataType::Int); -} - -c10::optional NamedScalar::getParallelDim() const { - if (stringifyThreadSize(ParallelType::TIDx).compare(name()) == 0) { - return c10::optional(ParallelType::TIDx); - } else if (stringifyThreadSize(ParallelType::TIDy).compare(name()) == 0) { - return c10::optional(ParallelType::TIDy); - } else if (stringifyThreadSize(ParallelType::TIDz).compare(name()) == 0) { - return c10::optional(ParallelType::TIDz); - } else if (stringifyThreadSize(ParallelType::BIDx).compare(name()) == 0) { - return c10::optional(ParallelType::BIDx); - } else if (stringifyThreadSize(ParallelType::BIDy).compare(name()) == 0) { - return c10::optional(ParallelType::BIDy); - } else if (stringifyThreadSize(ParallelType::BIDz).compare(name()) == 0) { - return c10::optional(ParallelType::BIDz); - } - return c10::nullopt; -} - -c10::optional NamedScalar::getParallelIndex() const { - if (stringifyThread(ParallelType::TIDx).compare(name()) == 0) { - return c10::optional(ParallelType::TIDx); - } else if (stringifyThread(ParallelType::TIDy).compare(name()) == 0) { - return c10::optional(ParallelType::TIDy); - } else if (stringifyThread(ParallelType::TIDz).compare(name()) == 0) { - return c10::optional(ParallelType::TIDz); - } else if (stringifyThread(ParallelType::BIDx).compare(name()) == 0) { - return c10::optional(ParallelType::BIDx); - } else if (stringifyThread(ParallelType::BIDy).compare(name()) == 0) { - return c10::optional(ParallelType::BIDy); - } else if (stringifyThread(ParallelType::BIDz).compare(name()) == 0) { - return c10::optional(ParallelType::BIDz); - } - return c10::nullopt; -} - -IterDomain::IterDomain(Passkey passkey, Val* start, Val* extent) - : Val(passkey, DataType::Int), - start_(start), - stop_(extent), - extent_(extent) {} - -IterDomain::IterDomain( - Passkey passkey, - const fuser::cuda::IterDomain* iter_domain) - : Val(passkey, iter_domain->getDataType().value()), - start_(GpuLower::current()->lowerValue(iter_domain->start())), - stop_(GpuLower::current()->lowerValue(iter_domain->stop())), - 
extent_(GpuLower::current()->lowerValue(iter_domain->extent())), - parallel_type_(iter_domain->getParallelType()), - iter_type_(iter_domain->getIterType()), - is_rfactor_domain_(iter_domain->isRFactorProduct()), - is_simple_(iter_domain->definition() == nullptr), - is_padded_dimension_(iter_domain->hasPaddingToMultipleOfWarp()) { - // preserve the fusion node's name - setName(iter_domain->name()); -} - -//! Note that the parallel dimension, if available, may be different -//! from the actual extent of this IterDomain as the parallel -//! dimension is determined by the largest extent of IterDomains -//! sharing the same loop. -Val* IterDomain::extent() const { - TORCH_INTERNAL_ASSERT(extent_ != nullptr); - return extent_; -} - -TensorDomain::TensorDomain(Passkey passkey, std::vector domain) - : Val(passkey, DataType::Null), root_domain_(std::move(domain)) { - domain_ = root_domain_; - resetDomains(); -} - -TensorDomain::TensorDomain( - Passkey passkey, - const fuser::cuda::TensorDomain* tensor_domain) - : Val(passkey, DataType::Null), contiguity_(tensor_domain->contiguity()) { - // preserve the fusion node's name - setName(tensor_domain->name()); - - const auto lowerIterDomains = - [](const std::vector& domains) { - std::vector lowered_domains; - lowered_domains.reserve(domains.size()); - for (const auto iter_domain : domains) { - lowered_domains.push_back( - GpuLower::current()->lowerValue(iter_domain)->as()); - } - return lowered_domains; - }; - - root_domain_ = lowerIterDomains(tensor_domain->getRootDomain()); - domain_ = lowerIterDomains(tensor_domain->domain()); - no_bcast_domain_ = lowerIterDomains(tensor_domain->noBroadcasts()); - no_reduction_domain_ = lowerIterDomains(tensor_domain->noReductions()); - rfactor_domain_ = lowerIterDomains(tensor_domain->getRFactorDomain()); -} - -bool TensorDomain::hasReduction() const { - return no_reduction_domain_.size() != domain_.size(); -} - -bool TensorDomain::hasBlockReduction() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isReduction() && id->isThreadDim(); - }); -} - -bool TensorDomain::hasGridReduction() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isReduction() && id->isBlockDim(); - }); -} - -bool TensorDomain::hasBlockBroadcast() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isBroadcast() && id->isThreadDim(); - }); -} - -bool TensorDomain::hasGridBroadcast() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isBroadcast() && id->isBlockDim(); - }); -} - -bool TensorDomain::hasBroadcast() const { - return no_bcast_domain_.size() != domain_.size(); -} - -bool TensorDomain::hasRFactor() const { - return !rfactor_domain_.empty(); -} - -bool TensorDomain::hasVectorize() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->parallelType() == ParallelType::Vectorize || - id->parallelType() == ParallelType::MisalignedVectorize; - }); -} - -IterDomain* TensorDomain::axis(int i) const { - TORCH_INTERNAL_ASSERT(i >= 0 && i < int(domain_.size())); - return domain_[i]; -} - -std::vector TensorDomain::noReductions( - const std::vector& td) { - std::vector no_reduction_domains; - for (auto id : td) { - if (!id->isReduction()) { - no_reduction_domains.push_back(id); - } - } - return no_reduction_domains; -} - -std::vector TensorDomain::noBroadcasts( - const std::vector& td) { - std::vector no_broadcast_domains; - for (auto 
id : td) { - if (!id->isBroadcast()) { - no_broadcast_domains.push_back(id); - } - } - return no_broadcast_domains; -} - -TensorView::TensorView(Passkey passkey, const fuser::cuda::TensorView* tv) - : Val(passkey, tv->getDataType().value()), fuser_tv_(tv) { - setName(tv->name()); - domain_ = GpuLower::current()->lowerValue(tv->domain())->as(); - memory_type_ = tv->getMemoryType(); -} - -TensorView::TensorView( - Passkey passkey, - DataType dtype, - TensorDomain* domain, - MemoryType memory_type) - : Val(passkey, dtype), domain_(domain), memory_type_(memory_type) {} - -UnaryOp::UnaryOp(Passkey passkey, UnaryOpType operation, Val* out, Val* in) - : Expr(passkey), operation_(operation), out_(out), in_(in) { - addOutput(out); - addInput(in); -} - -BinaryOp::BinaryOp( - Passkey passkey, - BinaryOpType operation, - Val* out, - Val* lhs, - Val* rhs) - : Expr(passkey), operation_(operation), out_(out), lhs_(lhs), rhs_(rhs) { - addOutput(out); - addInput(lhs); - addInput(rhs); +Predicate::Predicate( + IrBuilderPasskey passkey, + PredicateType ptype, + const Expr* expr, + Bool* thread_pred) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(ptype), + expr_(expr), + thread_pred_(thread_pred) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT( + ptype != PredicateType::Unswitch && ptype != PredicateType::Manual); } -TernaryOp::TernaryOp( - Passkey passkey, - TernaryOpType operation, - Val* out, - Val* in1, - Val* in2, - Val* in3) - : Expr(passkey), - operation_(operation), - out_(out), - in1_(in1), - in2_(in2), - in3_(in3) { - addOutput(out); - addInput(in1); - addInput(in2); - addInput(in3); -} - -ReductionOp::ReductionOp( - Passkey passkey, - BinaryOpType operation, - Val* init, - Val* out, - Val* in) - : Expr(passkey), operation_(operation), init_(init), out_(out), in_(in) { - addOutput(out); - addInput(in); -} - -WelfordOp::WelfordOp( - Passkey passkey, - Val* out_var, - Val* out_avg, - Val* out_N, - Val* init_var, - Val* init_avg, - Val* init_N, - Val* in_var, - Val* in_avg, - Val* in_N) - : Expr(passkey), - out_var_(out_var), - out_avg_(out_avg), - out_N_(out_N), - init_var_(init_var), - init_avg_(init_avg), - init_N_(init_N), - in_var_(in_var), - in_avg_(in_avg), - in_N_(in_N) { - addOutput(out_avg); - addOutput(out_var); - addOutput(out_N); - - if (!in_N->isOneInt()) { - addInput(in_var); - } - addInput(in_avg); - addInput(in_N); +Predicate::Predicate(IrBuilderPasskey passkey, ForLoop* unrolled_loop) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(PredicateType::Unswitch), + unrolled_loop_(unrolled_loop) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT(unrolled_loop != nullptr); } -BroadcastOp::BroadcastOp(Passkey passkey, Val* out, Val* in) - : Expr(passkey), out_(out), in_(in) { - TORCH_CHECK(in->isA() || in->isA()); - TORCH_CHECK(out->isA() || out->isA()); - addOutput(out); - addInput(in); +Predicate::Predicate(IrBuilderPasskey passkey, Bool* value) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(PredicateType::Manual), + value_(value) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT(value != nullptr); } TensorIndex::TensorIndex( - Passkey passkey, - const fuser::cuda::TensorView* view, + IrBuilderPasskey passkey, + const TensorView* view, std::vector indices) - : Val(passkey, view->getDataType().value()), - 
view_(GpuLower::current()->lowerValue(view)->as()), + : Val(passkey, ValType::TensorIndex, view->getDataType().value()), + view_(view), indices_(indices) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); TORCH_INTERNAL_ASSERT( std::all_of( indices.begin(), @@ -392,20 +74,41 @@ TensorIndex::TensorIndex( indices_.end()); // If indices becomes empty, just put one ZeroInt if (indices_.empty()) { - indices_.push_back(kir::IrBuilder(GpuLower::current()->kernel()).zeroVal()); + indices_.push_back(FusionGuard::getCurFusion()->zeroVal()); } } -Sync::Sync(Passkey passkey, bool war_sync) - : Expr(passkey), war_sync_(war_sync) {} +BlockSync::BlockSync(IrBuilderPasskey passkey, bool war_sync) + : Expr(passkey, ExprType::BlockSync), war_sync_(war_sync) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +GridSync::GridSync( + IrBuilderPasskey passkey, + ParallelTypeBitmap sync_dims, + Val* sync_buffer) + : Expr(passkey, ExprType::GridSync), + sync_dims_(sync_dims), + sync_buffer_(sync_buffer) {} -InitMagicZero::InitMagicZero(Passkey passkey) : Expr(passkey) {} +InitMagicZero::InitMagicZero(IrBuilderPasskey passkey) + : Expr(passkey, ExprType::InitMagicZero) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} -UpdateMagicZero::UpdateMagicZero(Passkey passkey) : Expr(passkey) {} +UpdateMagicZero::UpdateMagicZero(IrBuilderPasskey passkey) + : Expr(passkey, ExprType::UpdateMagicZero) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} void Scope::insert(std::vector::const_iterator pos, Expr* expr) { exprs_.insert(pos, expr); - expr->setScope(this); } void Scope::insert_before(Expr* ref, Expr* expr) { @@ -439,12 +142,7 @@ void Scope::insert(size_t pos, Expr* expr) { void Scope::erase(std::vector::const_iterator pos) { // Remove the scope of the expr if this is the scope - auto expr = *pos; - TORCH_INTERNAL_ASSERT( - expr->scope() == this, - "Inconsistent scoping of expression detected: ", - kir::toString(expr)); - expr->setScope(nullptr); + C10_UNUSED auto expr = *pos; exprs_.erase(pos); } @@ -470,7 +168,7 @@ void Scope::clear() { } ForLoop::ForLoop( - Passkey passkey, + IrBuilderPasskey passkey, IterDomain* iter_domain, Val* index, Val* start, @@ -479,7 +177,7 @@ ForLoop::ForLoop( bool vectorize, Val* vectorize_shift, bool unroll_required) - : Expr(passkey), + : Expr(passkey, ExprType::ForLoop), iter_domain_{iter_domain}, index_(index), start_(start), @@ -489,43 +187,43 @@ ForLoop::ForLoop( vectorize_shift_(vectorize_shift), unroll_required_(unroll_required), body_(this) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); TORCH_INTERNAL_ASSERT(index->dtype() == DataType::Int); addInput(index); addInput(iter_domain); if (start_ == nullptr && iter_domain->isThread()) { - start_ = - IrBuilder(GpuLower::current()->kernel()) - .create( - stringifyThread(iter_domain->parallelType()), DataType::Int); + start_ = NamedScalar::getParallelIndex(iter_domain->getParallelType()); } if (step_ == nullptr) { if (iter_domain->isThread()) { - step_ = IrBuilder(GpuLower::current()->kernel()) - .create( - stringifyThreadSize(iter_domain->parallelType()), - DataType::Int); + step_ = NamedScalar::getParallelDim(iter_domain->getParallelType()); } else { - step_ = IrBuilder(GpuLower::current()->kernel()).oneVal(); + step_ = 
FusionGuard::getCurFusion()->oneVal(); } } } -ForLoop::ForLoop(Passkey passkey, IterDomain* iter_domain) +ForLoop::ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain) : ForLoop( passkey, iter_domain, - iter_domain->isBroadcast() - ? IrBuilder(GpuLower::current()->kernel()).zeroVal() - : IrBuilder(GpuLower::current()->kernel()) - .create(c10::nullopt), + iter_domain->isBroadcast() ? FusionGuard::getCurFusion()->zeroVal() + : IrBuilder::create(c10::nullopt), nullptr, nullptr, nullptr, - isParallelTypeVectorize(iter_domain->parallelType()), + !iter_domain->isBroadcast() && + isParallelTypeVectorize(iter_domain->getParallelType()), nullptr, - false) {} + false) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} -ForLoop::ForLoop(Passkey passkey, const ForLoop* other) +ForLoop::ForLoop(IrBuilderPasskey passkey, const ForLoop* other) : ForLoop( passkey, other->iter_domain(), @@ -535,7 +233,11 @@ ForLoop::ForLoop(Passkey passkey, const ForLoop* other) other->step(), other->vectorize(), other->vectorize_shift(), - other->isUnrollRequired()) {} + other->isUnrollRequired()) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} bool ForLoop::isUnrollable() const { // Start and stop must be constant, must not be a broadcast @@ -550,7 +252,7 @@ bool ForLoop::isUnrolled() const { if (isUnrollRequired() && !isUnrollable()) { TORCH_WARN( "Unroll required but not possible. Register allocation disabled. Loop index: ", - kir::toString(index_)); + index_->toString()); return false; } @@ -570,7 +272,7 @@ bool ForLoop::isUnrolled() const { } // Unrolling is technically possible but avoided - if (iter_domain()->parallelType() == ParallelType::Unswitch) { + if (iter_domain()->getParallelType() == ParallelType::Unswitch) { // Use ParallelType::Unroll if unrolling is desired. Note that // unswitched size-one loops are not unrolled as they are not // materialized as actual for-loops. @@ -605,8 +307,53 @@ Val* ForLoop::step() const { return step_; } -IfThenElse::IfThenElse(Passkey passkey, Predicate* cond) - : Expr(passkey), then_body_(this), else_body_(this) { +bool ForLoop::isTrivial() const { + // These loops are not materialized + if (vectorize() || iter_domain()->isBroadcast() || + iter_domain()->isStride() || iter_domain()->isMma()) { + return true; + } + + // By default, a parallelized loop would look like: + // + // for (int x = threadIdx.x; x < stop; x += blockDim.x) { + // do_some_comp(x); + // } + // + // When stop is guaranteed to be smaller or equal to the number of + // threads, the for-loop is not necessary. In the above case, we + // would just generate the loop body without the for clause but + // references to the loop index replaced by the loop start value. + // + // When the loop end is the same as the IterDomain extent, the + // assumption can be safely made. This is more conservative than + // necessary since the loop stop value just needs to be <= the + // IterDomain extent. However, at this point, this conservative + // analysis seems sufficient. 
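  // As a concrete example (blockDim.x == 128 and stop == 128 are assumed
  // values, and do_some_comp is the placeholder from the comment above): the
  // parallelized loop
  //
  //   for (int x = threadIdx.x; x < 128; x += blockDim.x) {
  //     do_some_comp(x);
  //   }
  //
  // is trivial and can be emitted as just
  //
  //   do_some_comp(threadIdx.x);
  //
  // with the loop index replaced by the loop start value.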
+ if (stop() == iter_domain()->extent() && iter_domain()->isThread()) { + return true; + } + + // Extent-1 loop: for (int i = 0; i < 1; ++i) { + if (start()->isZeroInt() && stop()->isOneInt() && step()->isOneInt()) { + return true; + } + + // Another extent-1 loop: for (int i = N - 1; i < N; ++i) { + if (start()->definition() != nullptr && + start()->definition()->isA() && + start()->definition()->as()->getBinaryOpType() == + BinaryOpType::Sub && + start()->definition()->as()->lhs() == stop() && + start()->definition()->as()->rhs()->isOneInt()) { + return true; + } + + return false; +} + +IfThenElse::IfThenElse(IrBuilderPasskey passkey, Predicate* cond) + : Expr(passkey, ExprType::IfThenElse), then_body_(this), else_body_(this) { setPredicate(cond); addInput(cond); } @@ -621,17 +368,19 @@ Val* TensorIndex::index(int i) const { } Allocate::Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, std::vector shape, bool zero_init) - : Expr(passkey), + : Expr(passkey, ExprType::Allocate), buffer_(buffer), memory_type_(memory_type), shape_(std::move(shape)), zero_init_(zero_init) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); if (!shape_.empty()) { TORCH_INTERNAL_ASSERT( (shape_.size() == 1 && shape_[0]->isOneInt()) || @@ -639,7 +388,7 @@ Allocate::Allocate( } else { TORCH_INTERNAL_ASSERT(buffer_->isA()); TORCH_INTERNAL_ASSERT( - buffer_->as()->memoryType() == memory_type_); + buffer_->as()->getMemoryType() == memory_type_); const auto domain = buffer_->as()->domain(); for (auto axis : domain->noReductions()) { shape_.push_back(axis->extent()); @@ -650,19 +399,19 @@ Allocate::Allocate( if (size_ == nullptr) { size_ = s; } else { - size_ = ir_builder.mulExpr(size_, s); + size_ = IrBuilder::mulExpr(size_, s); } } if (size_ == nullptr) { - size_ = ir_builder.oneVal(); + size_ = FusionGuard::getCurFusion()->oneVal(); } addInput(size_); } Allocate::Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, Val* size, @@ -672,31 +421,158 @@ Allocate::Allocate( buffer, memory_type, size == nullptr ? 
std::vector{} : std::vector{size}, - zero_init) {} + zero_init) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} GridReduction::GridReduction( - Passkey passkey, - ReductionOp* reduction_op, + IrBuilderPasskey passkey, + BinaryOpType reduction_op_type, + Val* init, + Val* out, + Val* in, Allocate* reduction_buffer, - Allocate* sync_buffer) - : Expr(passkey), - reduction_op_(reduction_op), + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances, + bool is_allreduce) + : ReductionOp( + passkey, + reduction_op_type, + init, + out, + in, + is_allreduce, + ExprType::GridReduction), reduction_buffer_(reduction_buffer), - sync_buffer_(sync_buffer) {} + sync_buffer_(sync_buffer), + entrance_index_(entrance_index), + entrances_(entrances) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +GroupedGridReduction::GroupedGridReduction( + IrBuilderPasskey passkey, + std::vector reduction_op_types, + std::vector init_vals, + std::vector outputs, + std::vector inputs, + std::vector reduction_buffers, + Allocate* sync_buffer, + bool is_fused) + : GroupedReductionOp( + passkey, + std::move(reduction_op_types), + std::move(init_vals), + std::move(outputs), + std::move(inputs), + is_fused, + ExprType::GroupedGridReduction), + reduction_buffers_(std::move(reduction_buffers)), + sync_buffer_(sync_buffer) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +GridBroadcast::GridBroadcast( + IrBuilderPasskey passkey, + BroadcastOp* broadcast_op, + Allocate* broadcast_buffer, + Allocate* sync_buffer) + : Expr(passkey, ExprType::GridBroadcast), + broadcast_op_(broadcast_op), + broadcast_buffer_(broadcast_buffer), + sync_buffer_(sync_buffer) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} GridWelford::GridWelford( - Passkey passkey, + IrBuilderPasskey passkey, WelfordOp* welford_op, Allocate* var_buffer, Allocate* avg_buffer, Allocate* n_buffer, - Allocate* sync_buffer) - : Expr(passkey), + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances) + : Expr(passkey, ExprType::GridWelford), welford_op_(welford_op), var_buffer_(var_buffer), avg_buffer_(avg_buffer), n_buffer_(n_buffer), - sync_buffer_(sync_buffer) {} + sync_buffer_(sync_buffer), + entrance_index_(entrance_index), + entrances_(entrances) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +AllocateFusedReduction::AllocateFusedReduction( + IrBuilderPasskey passkey, + GridReduction* grid_reduction) + : Expr(passkey, ExprType::AllocateFusedReduction), + grid_expr_(grid_reduction) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +AllocateFusedReduction::AllocateFusedReduction( + IrBuilderPasskey passkey, + GridWelford* grid_welford) + : Expr(passkey, ExprType::AllocateFusedReduction), + grid_expr_(grid_welford) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +AllocateFusedReduction::AllocateFusedReduction( + IrBuilderPasskey passkey, + GroupedGridReduction* grouped_grid_reduction) + : Expr(passkey, ExprType::AllocateFusedReduction), + grid_expr_(grouped_grid_reduction) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +TensorIndex* AllocateFusedReduction::out() const { 
+ TORCH_INTERNAL_ASSERT(grid_expr_ != nullptr); + if (grid_expr_->isA() || + grid_expr_->isA()) { + return grid_expr_->outputs().at(0)->as(); + } else if (auto grid_welford = dynamic_cast(grid_expr_)) { + return grid_welford->welford_op()->out()->as(); + } else { + TORCH_INTERNAL_ASSERT( + false, "Invalid grid expression: ", grid_expr_->toString()); + } +} + +const ParallelTypeBitmap& AllocateFusedReduction::threadPredicate() const { + TORCH_INTERNAL_ASSERT(grid_expr_ != nullptr); + if (auto grid_reduction = dynamic_cast(grid_expr_)) { + return grid_reduction->threadPredicate(); + } else if (auto grid_welford = dynamic_cast(grid_expr_)) { + return grid_welford->threadPredicate(); + } else if ( + auto grouped_grid_reduction = + dynamic_cast(grid_expr_)) { + return grouped_grid_reduction->threadPredicate(); + } else { + TORCH_INTERNAL_ASSERT( + false, "Invalid grid expression: ", grid_expr_->toString()); + } +} } // namespace kir } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.h b/torch/csrc/jit/codegen/cuda/kernel_ir.h index c1ac6052783d..99ebdba5bab3 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.h +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.h @@ -1,1163 +1,154 @@ #pragma once -#include -#include - -// TODO(kir): remove these once the Kernel IR is separated from Fusion IR -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -class IrBuilder; -class Kernel; - -// Abstract nodes -class Node; -class Val; -class Expr; - -// Values -class NamedScalar; -class Predicate; -class Bool; -class Double; -class Int; -class IterDomain; -class TensorDomain; -class TensorView; -class TensorIndex; - -// Expressions -class UnaryOp; -class BinaryOp; -class TernaryOp; -class ReductionOp; -class WelfordOp; -class BroadcastOp; - -// Statements -class Allocate; -class Sync; -class InitMagicZero; -class UpdateMagicZero; -class ForLoop; -class IfThenElse; -class GridReduction; -class GridBroadcast; -class GridWelford; - -// Expr container -class Scope; - -using ValueId = int32_t; - -//! Token used to restrict the access to Kernel IR creation -//! -//! A token is associated with a kernel, which is passed with the key -//! (Passkey::kernel) -//! -//! It is a "granular friendship" token, used to implement the "passkey" idiom: -//! https://www.spiria.com/en/blog/desktop-software/passkey-idiom-and-better-friendship-c -//! https://arne-mertz.de/2016/10/passkey-idiom -//! -class Passkey { - friend class IrBuilder; - - public: - Kernel* const kernel = nullptr; - - private: - explicit Passkey(Kernel* kernel) : kernel(kernel) {} -}; - -//! 
Kernel IR visitor interface -class TORCH_CUDA_CU_API IrVisitor : public PolymorphicBase { - public: - // TODO(kir): use Node* instead of void* - virtual void unhandled(const void* node) {} - - // Values - virtual void visit(const NamedScalar* named_scalar) { - unhandled(named_scalar); - } - virtual void visit(const Predicate* value) { - unhandled(value); - } - virtual void visit(const Bool* value) { - unhandled(value); - } - virtual void visit(const Double* value) { - unhandled(value); - } - virtual void visit(const Int* value) { - unhandled(value); - } - virtual void visit(const IterDomain* iter_domain) { - unhandled(iter_domain); - } - virtual void visit(const TensorDomain* tensor_domain) { - unhandled(tensor_domain); - } - virtual void visit(const TensorView* tensor_view) { - unhandled(tensor_view); - } - virtual void visit(const TensorIndex* tensor_index) { - unhandled(tensor_index); - } - - // Expressions - virtual void visit(const UnaryOp* node) { - unhandled(node); - } - virtual void visit(const BinaryOp* node) { - unhandled(node); - } - virtual void visit(const TernaryOp* node) { - unhandled(node); - } - virtual void visit(const ReductionOp* node) { - unhandled(node); - } - virtual void visit(const WelfordOp* node) { - unhandled(node); - } - virtual void visit(const BroadcastOp* node) { - unhandled(node); - } - - // Statements - virtual void visit(const Allocate* node) { - unhandled(node); - } - virtual void visit(const Sync* node) { - unhandled(node); - } - virtual void visit(const InitMagicZero* node) { - unhandled(node); - } - virtual void visit(const UpdateMagicZero* node) { - unhandled(node); - } - virtual void visit(const ForLoop* node) { - unhandled(node); - } - virtual void visit(const IfThenElse* node) { - unhandled(node); - } - virtual void visit(const GridReduction* node) { - unhandled(node); - } - virtual void visit(const GridBroadcast* node) { - unhandled(node); - } - virtual void visit(const GridWelford* node) { - unhandled(node); - } -}; - -//! 
Kernel IR visitor interface -class TORCH_CUDA_CU_API MutableIrVisitor : public PolymorphicBase { - public: - // TODO(kir): use Node* instead of void* - virtual void unhandled(const void*) {} - - // Values - virtual void visit(NamedScalar* named_scalar) { - unhandled(named_scalar); - } - virtual void visit(Predicate* value) { - unhandled(value); - } - virtual void visit(Bool* value) { - unhandled(value); - } - virtual void visit(Double* value) { - unhandled(value); - } - virtual void visit(Int* value) { - unhandled(value); - } - virtual void visit(IterDomain* iter_domain) { - unhandled(iter_domain); - } - virtual void visit(TensorDomain* tensor_domain) { - unhandled(tensor_domain); - } - virtual void visit(TensorView* tensor_view) { - unhandled(tensor_view); - } - virtual void visit(TensorIndex* tensor_index) { - unhandled(tensor_index); - } - - // Expressions - virtual void visit(UnaryOp* node) { - unhandled(node); - } - virtual void visit(BinaryOp* node) { - unhandled(node); - } - virtual void visit(TernaryOp* node) { - unhandled(node); - } - virtual void visit(ReductionOp* node) { - unhandled(node); - } - virtual void visit(BroadcastOp* node) { - unhandled(node); - } - - virtual void visit(WelfordOp* node) { - unhandled(node); - } - - // Statements - virtual void visit(Allocate* node) { - unhandled(node); - } - virtual void visit(Sync* node) { - unhandled(node); - } - virtual void visit(InitMagicZero* node) { - unhandled(node); - } - virtual void visit(UpdateMagicZero* node) { - unhandled(node); - } - virtual void visit(ForLoop* node) { - unhandled(node); - } - virtual void visit(IfThenElse* node) { - unhandled(node); - } - virtual void visit(GridReduction* node) { - unhandled(node); - } - virtual void visit(GridBroadcast* node) { - unhandled(node); - } - virtual void visit(GridWelford* node) { - unhandled(node); - } -}; - -//! Base class for Kernel IR nodes -class TORCH_CUDA_CU_API Node : public NonCopyable, public PolymorphicBase { - public: - explicit Node(Passkey) {} - - //! IR Visitor double-dispatch interface - //! (https://en.wikipedia.org/wiki/Visitor_pattern) - virtual void accept(IrVisitor* visitor) const = 0; - - //! Non constant IR Visitor - virtual void accept(MutableIrVisitor* visitor) = 0; - - //! Debug helper, prints the textual representation of an IR node - void print() const; -}; - -//! Generic value (scalar or tensor) -class TORCH_CUDA_CU_API Val : public Node { - public: - Val(Passkey passkey, DataType dtype); - - // TODO(kir): consider renaming - StmtNameType name() const { - return name_; - } - - void setName(StmtNameType name) { - name_ = name; - } - - ValueId id() const { - return id_; - } - - DataType dtype() const { - return dtype_; - } - - Expr* definition() const { - return definition_; - } - - void setDefinition(Expr* expr) { - // TODO(kir): extra checks on changing existing definitions? 
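// The Passkey token deleted above (and the IrBuilderPasskey that replaces it
// throughout this diff) implements the "passkey" idiom the old comment links
// to: node constructors are public but require a key type that only the
// builder can mint. A minimal sketch of the pattern with illustrative names
// (MyIrBuilder / MyNode / MyPasskey are not nvFuser classes):
class MyIrBuilder;

class MyPasskey {
  friend class MyIrBuilder;  // only the builder may construct a key
 private:
  MyPasskey() {}  // user-provided so aggregate-init cannot bypass the check
};

class MyNode {
 public:
  // Publicly callable, but callers must present a MyPasskey, which only
  // MyIrBuilder can create, so node creation is funneled through the builder.
  explicit MyNode(MyPasskey) {}
};

class MyIrBuilder {
 public:
  MyNode makeNode() {
    return MyNode(MyPasskey());
  }
};
// Compared with making every node class a friend of the builder, the key type
// keeps the friendship "granular", as the deleted comment puts it.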
- definition_ = expr; - } - - virtual bool isScalar() const { - return false; - } - - bool isConstScalar() const; - - virtual bool isConst() const { - return false; - } - - // TODO(kir): revisit and find a better interface - virtual bool isZeroInt() const { - return false; - } - - virtual bool isOneInt() const { - return false; - } - - void setEvaluatorIndex(int to) { - TORCH_INTERNAL_ASSERT(evaluator_index_ == -1); - evaluator_index_ = to; - } - - int evaluatorIndex() const { - return evaluator_index_; - } - - private: - const DataType dtype_; - - // The expression which defines this value, or nullptr - Expr* definition_ = nullptr; - - // This is a value name preserved from the Fusion IR (optional) - StmtNameType name_ = kInvalidStmName; - - // All Kernel IR values have IDs (unique within the same Kernel) - ValueId id_ = -1; - - // Expr evaluator idx; - int evaluator_index_ = -1; -}; - -//! Base class for expressions and statements -//! -//! Expressions consume inputs and produce outputs (depending on the context -//! this may imply assignments). Currently some of the expressions -//! don't actually produce any outputs (ForLoop, IfThenElse) and they -//! model statements to be executed. -//! -//! TODO(kir): split the expressions, assignments and statements? -//! -class TORCH_CUDA_CU_API Expr : public Node { - public: - explicit Expr(Passkey passkey) : Node(passkey) {} - - const auto& inputs() const { - return inputs_; - } - - const auto& outputs() const { - return outputs_; - } - - Scope* scope() const { - return scope_; - } - - //! Set the current scope - void setScope(Scope* scope) { - scope_ = scope; - } - - Expr* parentScope() const; - - Predicate* predicate() const { - return predicate_; - } - - void setPredicate(Predicate* predicate) { - predicate_ = predicate; - } - - Predicate* writePredicate() const { - return write_predicate_; - } - - void setWritePredicate(Predicate* write_predicate) { - write_predicate_ = write_predicate; - } - - protected: - // TODO(kir): try to avoid this protected interface - void addInput(Val* input) { - inputs_.push_back(input); - } - - void addOutput(Val* output) { - output->setDefinition(this); - outputs_.push_back(output); - } - - private: - // TODO(kir): can we avoid this? - std::vector inputs_; - std::vector outputs_; - - // TODO(kir): revisit scope/nesting data structures - Scope* scope_ = nullptr; - - Predicate* predicate_ = nullptr; - // Only used for reduction-related expressions - Predicate* write_predicate_ = nullptr; -}; - -class TORCH_CUDA_CU_API NamedScalar final : public Val { - public: - // NOLINTNEXTLINE(modernize-pass-by-value) - NamedScalar(Passkey passkey, std::string name, DataType dtype) - : Val(passkey, dtype), name_(name) {} - - explicit NamedScalar(Passkey passkey, const fuser::cuda::NamedScalar* node) - : Val(passkey, node->getDataType().value()) { - name_ = node->name(); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - // TODO(kir): this is hiding and redefining Val::name() - const std::string& name() const { - return name_; - } - - // Return the named scalar extent of a parallel dimension (e.g. blockDim.x) - static NamedScalar* getParallelDim(ParallelType p_type); - - // Return the named scalar index of a parallel dimension (e.g. 
threadIdx.x) - static NamedScalar* getParallelIndex(ParallelType p_type); - - // Return the parallel type of this NamedScalar if it is an extent of a - // parallel dimension - c10::optional getParallelDim() const; - - // Return the parallel type of this NamedScalar if it is an index of a - // parallel dimension - c10::optional getParallelIndex() const; - - private: - std::string name_; -}; - -class TORCH_CUDA_CU_API Predicate final : public Val { - public: - explicit Predicate( - Passkey passkey, - PredicateType ptype, - const Expr* expr = nullptr, - Bool* thread_pred = nullptr) - : Val(passkey, DataType::Bool), - ptype_(ptype), - expr_(expr), - thread_pred_(thread_pred) { - TORCH_INTERNAL_ASSERT( - ptype != PredicateType::Unswitch && ptype != PredicateType::Manual); - } - - explicit Predicate(Passkey passkey, ForLoop* unrolled_loop) - : Val(passkey, DataType::Bool), - ptype_(PredicateType::Unswitch), - unrolled_loop_(unrolled_loop) { - TORCH_INTERNAL_ASSERT(unrolled_loop != nullptr); - } - - explicit Predicate(Passkey passkey, Bool* value) - : Val(passkey, DataType::Bool), - ptype_(PredicateType::Manual), - value_(value) { - TORCH_INTERNAL_ASSERT(value != nullptr); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - PredicateType predicate_type() const { - return ptype_; - } - - const Expr* expr() const { - TORCH_INTERNAL_ASSERT( - ptype_ != PredicateType::Unswitch && - ptype_ != PredicateType::Vectorize && ptype_ != PredicateType::Manual); - return expr_; - } - - Bool* thread_pred() { - TORCH_INTERNAL_ASSERT( - ptype_ == PredicateType::Inline || - ptype_ == PredicateType::Misaligned || ptype_ == PredicateType::Shift || - ptype_ == PredicateType::Padding || - ptype_ == PredicateType::ReductionWrite); - return thread_pred_; - } - - ForLoop* unrolled_loop() const { - TORCH_INTERNAL_ASSERT(ptype_ == PredicateType::Unswitch); - return unrolled_loop_; - } - - bool hasValue() const { - return value_ != nullptr; - } - - Bool* value() const { - TORCH_INTERNAL_ASSERT( - value_ != nullptr, - "The conditional expression for this Predicate is invalid."); - return value_; - } - - void setValue(Bool* value) { - TORCH_INTERNAL_ASSERT(value != nullptr, "The Bool expression is invalid."); - value_ = value; - } - - private: - PredicateType ptype_ = PredicateType::Manual; - - // For PredicateCompute::getInlinePredicate, - // ShiftPredicateInserter::getShiftPredicate and getPaddingPredicate - const Expr* expr_ = nullptr; - - // For PredicateCompute::getInlinePredicate - Bool* thread_pred_ = nullptr; - - // For ParallelType::Unswitch - UnswitchPredicate::get - ForLoop* unrolled_loop_ = nullptr; - - // The Bool conditional value - // The value is nullptr until lower_predicate pass - Bool* value_ = nullptr; -}; - -class TORCH_CUDA_CU_API Bool final : public Val { - public: - explicit Bool(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Bool), maybe_value_(value) {} - - explicit Bool(Passkey passkey, const fuser::cuda::Bool* node) - : Val(passkey, DataType::Bool), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - c10::optional value() const { - return maybe_value_; - } - 
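// The NamedScalar helpers above name parallel extents and indices after the
// CUDA built-in variables ("blockDim.x", "threadIdx.x", ...). A rough sketch
// of that mapping; ToyParallelType and the two free functions are hypothetical
// illustrations of what stringifyThreadSize / stringifyThread provide, and the
// gridDim.* strings for the BID extents are assumed.
#include <string>

enum class ToyParallelType { BIDx, BIDy, BIDz, TIDx, TIDy, TIDz };

// Index of the parallel dimension (cf. NamedScalar::getParallelIndex).
std::string toyParallelIndex(ToyParallelType pt) {
  switch (pt) {
    case ToyParallelType::TIDx: return "threadIdx.x";
    case ToyParallelType::TIDy: return "threadIdx.y";
    case ToyParallelType::TIDz: return "threadIdx.z";
    case ToyParallelType::BIDx: return "blockIdx.x";
    case ToyParallelType::BIDy: return "blockIdx.y";
    case ToyParallelType::BIDz: return "blockIdx.z";
  }
  return "";
}

// Extent of the parallel dimension (cf. NamedScalar::getParallelDim).
std::string toyParallelExtent(ToyParallelType pt) {
  switch (pt) {
    case ToyParallelType::TIDx: return "blockDim.x";
    case ToyParallelType::TIDy: return "blockDim.y";
    case ToyParallelType::TIDz: return "blockDim.z";
    case ToyParallelType::BIDx: return "gridDim.x";
    case ToyParallelType::BIDy: return "gridDim.y";
    case ToyParallelType::BIDz: return "gridDim.z";
  }
  return "";
}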
- private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API Double final : public Val { - public: - using ScalarType = double; - - explicit Double(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Double), maybe_value_(value) {} - - explicit Double(Passkey passkey, const fuser::cuda::Double* node) - : Val(passkey, DataType::Double), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - c10::optional value() const { - return maybe_value_; - } - - private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API Int final : public Val { - public: - using ScalarType = int64_t; - - explicit Int(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Int), maybe_value_(value) {} - - // SFINAE constructor to avoid 0 constant pointer ambiguity - template < - typename T, - typename = typename std::enable_if< - std::is_pointer::value && - std::is_convertible::value>::type> - explicit Int(Passkey passkey, T node) - : Val(passkey, DataType::Int), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - bool isZeroInt() const override { - return maybe_value_.has_value() && *maybe_value_ == 0; - } - - bool isOneInt() const override { - return maybe_value_.has_value() && *maybe_value_ == 1; - } - - c10::optional value() const { - return maybe_value_; - } - - private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API IterDomain final : public Val { - public: - IterDomain(Passkey passkey, Val* start, Val* extent); - - explicit IterDomain(Passkey, const fuser::cuda::IterDomain* iter_domain); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isReduction() const { - return iterType() == IterType::Reduction; - } - - bool isRFactorProduct() const { - return is_rfactor_domain_; - } - - bool isBroadcast() const { - return iterType() == IterType::BroadcastWithStride || - iterType() == IterType::BroadcastWithoutStride; - } - - bool isGather() const { - return iterType() == IterType::Gather; - } - - bool isStride() const { - return iterType() == IterType::Stride; - } - - bool isParallelized() const { - return parallelType() != ParallelType::Serial; - } - - // Return if this iter domain is mapped to a grid dimension - bool isBlockDim() const { - return parallelType() == ParallelType::BIDz || - parallelType() == ParallelType::BIDy || - parallelType() == ParallelType::BIDx; - } - - // Return if this iter domain is mapped to a block dimension - bool isThreadDim() const { - return parallelType() == ParallelType::TIDz || - parallelType() == ParallelType::TIDy || - parallelType() == ParallelType::TIDx; - } - - // Return if this iter domain is either mapped to a block or grid dimension - bool isThread() const { - return isBlockDim() || isThreadDim(); - } - - ParallelType parallelType() const { - return parallel_type_; - } - - 
IterType iterType() const { - return iter_type_; - } - - Val* start() const { - return start_; - } - - Val* stop() const { - return stop_; - } - - Val* extent() const; - - bool isSimple() const { - return is_simple_; - } - - bool hasPaddingToMultipleOfWarp() const { - return is_padded_dimension_; - } - - private: - Val* const start_ = nullptr; - Val* const stop_ = nullptr; - Val* const extent_ = nullptr; - ParallelType parallel_type_ = ParallelType::Serial; - IterType iter_type_ = IterType::Iteration; - bool is_rfactor_domain_ = false; - - // An IterDomain is "simple" if the original Fusion IterDomain - // doesn't have a definition ("definition" expression) - // - // TODO(kir): this feels like a hack, revisit - // - bool is_simple_ = true; - - //! Indicates if this iterdomain is a padded parallel dimension - bool is_padded_dimension_ = false; -}; - -// TODO(kir): is this really a value? -class TORCH_CUDA_CU_API TensorDomain final : public Val { - public: - explicit TensorDomain(Passkey, std::vector domain); - - explicit TensorDomain( - Passkey passkey, - const fuser::cuda::TensorDomain* tensor_domain); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - std::vector::size_type nDims() const { - return domain_.size(); - } - - // TODO(kir): rename this - const std::vector& domain() const { - return domain_; - } - - const std::vector& contiguity() const { - return contiguity_; - } - - std::string getContiguityString() const { - std::stringstream ss; - for (auto b : contiguity()) { - ss << (b ? "t" : "f"); - } - return ss.str(); - } - - bool hasReduction() const; - bool hasBlockReduction() const; - bool hasGridReduction() const; - bool hasBlockBroadcast() const; - bool hasGridBroadcast() const; - bool hasBroadcast() const; - bool hasRFactor() const; - bool hasVectorize() const; - - const std::vector& noReductions() const { - return no_reduction_domain_; - } - - const std::vector& noBroadcasts() const { - return no_bcast_domain_; - } - - const std::vector& rootDomain() const { - return root_domain_; - }; - - const std::vector& rfactorDomain() const { - return rfactor_domain_; - }; - - void resetDomains() { - no_reduction_domain_ = noReductions(domain_); - no_bcast_domain_ = noBroadcasts(domain_); - } - - IterDomain* axis(int i) const; - - // TODO(kir): overloading non-static and static methods is not a good idea - static std::vector noReductions(const std::vector&); - static std::vector noBroadcasts(const std::vector&); - - private: - std::vector root_domain_; - std::vector domain_; - std::vector no_bcast_domain_; - std::vector no_reduction_domain_; - std::vector rfactor_domain_; - const std::vector contiguity_; -}; - -class TORCH_CUDA_CU_API TensorView final : public Val { - public: - explicit TensorView(Passkey, const fuser::cuda::TensorView* tv); - - TensorView( - Passkey, - DataType dtype, - TensorDomain* domain, - MemoryType memory_type); - - TensorDomain* domain() const { - return domain_; - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - MemoryType memoryType() const { - return memory_type_; - } - - fuser::cuda::TensorView* fuserTv() const { - TORCH_INTERNAL_ASSERT(fuser_tv_ != nullptr); - // TODO(kir): remove the need for const_cast - return const_cast(fuser_tv_); // NOLINT - } - - private: - TensorDomain* domain_ = nullptr; - MemoryType memory_type_ = 
MemoryType::Local; - - // TODO(kir): remove temporary hack - const fuser::cuda::TensorView* fuser_tv_ = nullptr; -}; - -class TORCH_CUDA_CU_API UnaryOp final : public Expr { - public: - UnaryOp(Passkey passkey, UnaryOpType operation, Val* out, Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - UnaryOpType operation() const { - return operation_; - } - - private: - const UnaryOpType operation_; - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; - -class TORCH_CUDA_CU_API BinaryOp final : public Expr { - public: - BinaryOp( - Passkey passkey, - BinaryOpType operation, - Val* out, - Val* lhs, - Val* rhs); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* lhs() const { - return lhs_; - } +#include +#include +#include +#include +#include - Val* rhs() const { - return rhs_; - } +#include +#include - BinaryOpType operation() const { - return operation_; - } +#include +#include +#include +#include - private: - const BinaryOpType operation_; - Val* const out_ = nullptr; - Val* const lhs_ = nullptr; - Val* const rhs_ = nullptr; -}; +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { -class TORCH_CUDA_CU_API TernaryOp final : public Expr { - public: - TernaryOp( - Passkey passkey, - TernaryOpType operation, - Val* out, - Val* in1, - Val* in2, - Val* in3); +class IrBuilderPasskey; - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } +// Abstract nodes +class Val; +class Expr; - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } +// Values +class Bool; +class Double; +class Int; +class NamedScalar; - Val* out() const { - return out_; - } +class IterDomain; +class TensorDomain; +class TensorView; - Val* in1() const { - return in1_; - } +// Expressions +class UnaryOp; +class BinaryOp; +class TernaryOp; +class ReductionOp; +class WelfordOp; +class BroadcastOp; - Val* in2() const { - return in2_; - } +namespace kir { +class Kernel; - Val* in3() const { - return in3_; - } +// Values +class Predicate; +class TensorIndex; - TernaryOpType operation() const { - return operation_; - } +// Expressions +class Allocate; +class BlockSync; +class GridSync; +class InitMagicZero; +class UpdateMagicZero; +class ForLoop; +class IfThenElse; +class GridReduction; +class GroupedGridReduction; +class GridBroadcast; +class GridWelford; +class AllocateFusedReduction; - private: - const TernaryOpType operation_; - Val* const out_ = nullptr; - Val* const in1_ = nullptr; - Val* const in2_ = nullptr; - Val* const in3_ = nullptr; -}; +// Expr container +class Scope; -class TORCH_CUDA_CU_API ReductionOp final : public Expr { +class TORCH_CUDA_CU_API Predicate final : public Val { public: - ReductionOp( - Passkey passkey, - BinaryOpType operation, - Val* init, - Val* out, - Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - Val* init() const { - return init_; - } - - BinaryOpType operation() const { - return operation_; - } - - private: - const BinaryOpType operation_; - Val* const 
init_ = nullptr; - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; + explicit Predicate( + IrBuilderPasskey passkey, + PredicateType ptype, + const Expr* expr = nullptr, + Bool* thread_pred = nullptr); -class TORCH_CUDA_CU_API WelfordOp final : public Expr { - public: - WelfordOp( - Passkey passkey, - Val* out_var, - Val* out_avg, - Val* out_N, - Val* init_var, - Val* init_avg, - Val* init_N, - Val* in_var, - Val* in_avg, - Val* in_N); + explicit Predicate(IrBuilderPasskey passkey, ForLoop* unrolled_loop); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } + explicit Predicate(IrBuilderPasskey passkey, Bool* value); - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); + PredicateType predicate_type() const { + return ptype_; } - Val* out() const { - return out_avg_; + const Expr* expr() const { + TORCH_INTERNAL_ASSERT( + ptype_ != PredicateType::Unswitch && + ptype_ != PredicateType::Vectorize && ptype_ != PredicateType::Manual); + return expr_; } - Val* in() const { - return in_avg_; + Bool* thread_pred() { + TORCH_INTERNAL_ASSERT( + ptype_ == PredicateType::Inline || + ptype_ == PredicateType::Misaligned || ptype_ == PredicateType::Shift || + ptype_ == PredicateType::Padding || + ptype_ == PredicateType::ReductionWrite); + return thread_pred_; } - // Welford Specific accessors - // Almost wanted to add a new struct for {var, avg, N} - Val* outVar() const { - return out_var_; + ForLoop* unrolled_loop() const { + TORCH_INTERNAL_ASSERT(ptype_ == PredicateType::Unswitch); + return unrolled_loop_; } - Val* outAvg() const { - return out_avg_; + bool hasValue() const { + return value_ != nullptr; } - Val* outN() const { - return out_N_; + Bool* value() const { + TORCH_INTERNAL_ASSERT( + value_ != nullptr, + "The conditional expression for this Predicate is invalid."); + return value_; } - Val* initVar() const { - return init_var_; + void setValue(Bool* value) { + TORCH_INTERNAL_ASSERT(value != nullptr, "The Bool expression is invalid."); + value_ = value; } - Val* initAvg() const { - return init_avg_; + bool isConst() const final { + return hasValue() && value_->isConst(); } - Val* initN() const { - return init_N_; - } + private: + PredicateType ptype_ = PredicateType::Manual; - Val* inVar() const { - return in_var_; - } + // For PredicateCompute::getInlinePredicate, + // ShiftPredicateInserter::getShiftPredicate and getPaddingPredicate + const Expr* expr_ = nullptr; - Val* inAvg() const { - return in_avg_; - } + // For PredicateCompute::getInlinePredicate + Bool* thread_pred_ = nullptr; - Val* inN() const { - return in_N_; - } + // For ParallelType::Unswitch - UnswitchPredicate::get + ForLoop* unrolled_loop_ = nullptr; - private: - Val* const out_var_; - Val* const out_avg_; - Val* const out_N_; - Val* const init_var_; - Val* const init_avg_; - Val* const init_N_; - Val* const in_var_; - Val* const in_avg_; - Val* const in_N_; + // The Bool conditional value + // The value is nullptr until lower_predicate pass + Bool* value_ = nullptr; }; class TORCH_CUDA_CU_API TensorIndex final : public Val { public: TensorIndex( - Passkey, - const fuser::cuda::TensorView* view, + IrBuilderPasskey, + const TensorView* view, std::vector indices); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - std::vector::size_type nDims() const { return indices_.size(); } @@ -1170,8 +161,7 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val { 
TensorView* view() const { TORCH_INTERNAL_ASSERT(view_ != nullptr); - // TODO(kir): remove the need for const_cast - return const_cast(view_); // NOLINT + return const_cast(view_); // NOLINT } private: @@ -1179,46 +169,17 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val { std::vector indices_; }; -class TORCH_CUDA_CU_API BroadcastOp final : public Expr { - public: - BroadcastOp(Passkey passkey, Val* out, Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - private: - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; - //! Allocate is a lower level Node that describes a buffer of memory that //! is required as an intermediate within a kernel. The extent is the expression //! of the size of the buffer that is generated from the TensorView that //! describes the output of an operation. -//! -//! TODO(kir): The components of Allocate like Type and Name could be separated -//! from the the assocated TensorView. Perhaps that is more appropriate? -//! class TORCH_CUDA_CU_API Allocate final : public Expr { public: //! Allocation of a multi-dimensional buffer //! //! param shape Size of each dimension explicit Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, std::vector shape = {}, @@ -1228,20 +189,12 @@ class TORCH_CUDA_CU_API Allocate final : public Expr { //! //! param size Size of allocation explicit Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, Val* size, bool zero_init = false); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - Val* buffer() const { return buffer_; } @@ -1290,17 +243,9 @@ class TORCH_CUDA_CU_API Allocate final : public Expr { // // TODO(kir): change name to SyncThreads as we could have other barriers. // -class TORCH_CUDA_CU_API Sync final : public Expr { +class TORCH_CUDA_CU_API BlockSync final : public Expr { public: - explicit Sync(Passkey passkey, bool war_sync = false); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit BlockSync(IrBuilderPasskey passkey, bool war_sync = false); bool isWarHazardSync() const { return war_sync_; @@ -1311,34 +256,40 @@ class TORCH_CUDA_CU_API Sync final : public Expr { bool war_sync_ = false; }; -// Simply prints "DEFINE_MAGIC_ZERO" in the code in accordance with magic_zero -// in helpers.cu -class TORCH_CUDA_CU_API InitMagicZero final : public Expr { +// Synchronize all blocks in device, implies cooperative group launch is +// required. 
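// The GridSync comment above describes a device-wide barrier, which requires
// a cooperative kernel launch. A minimal sketch of such a barrier using CUDA
// cooperative groups (illustrative only, not what nvFuser emits; it assumes
// the kernel is launched via cudaLaunchCooperativeKernel on supporting
// hardware):
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void grid_sync_sketch(float* buf, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    buf[i] *= 2.0f;        // phase 1: every block updates its elements
  }
  cg::this_grid().sync();  // device-wide barrier across all blocks
  if (i == 0) {
    buf[0] += buf[n - 1];  // phase 2: safe to read other blocks' results
  }
}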
+class TORCH_CUDA_CU_API GridSync final : public Expr { public: - explicit InitMagicZero(Passkey passkey); + explicit GridSync( + IrBuilderPasskey passkey, + ParallelTypeBitmap sync_dims, + Val* sync_buffer); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); + ParallelTypeBitmap syncDims() const { + return sync_dims_; } - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); + Val* syncBuffer() const { + return sync_buffer_; } + + private: + ParallelTypeBitmap sync_dims_; + Val* sync_buffer_ = nullptr; +}; + +// Simply prints "DEFINE_MAGIC_ZERO" in the code in accordance with magic_zero +// in helpers.cu +class TORCH_CUDA_CU_API InitMagicZero final : public Expr { + public: + explicit InitMagicZero(IrBuilderPasskey passkey); }; // Simply prints "UPDATE_MAGIC_ZERO" in the code in accordance with magic_zero // in helpers.cu class TORCH_CUDA_CU_API UpdateMagicZero final : public Expr { public: - explicit UpdateMagicZero(Passkey passkey); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit UpdateMagicZero(IrBuilderPasskey passkey); }; // TODO(kir): promote to IR node @@ -1377,7 +328,6 @@ class TORCH_CUDA_CU_API Scope { void push_back(Expr* e) { exprs_.push_back(e); - e->setScope(this); } // Erase expr at pos @@ -1425,7 +375,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { //! //! TODO: cleaner way to set options? ForLoop( - Passkey passkey, + IrBuilderPasskey passkey, IterDomain* iter_domain, Val* index, Val* start, @@ -1435,17 +385,9 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { Val* vectorize_shift, bool unroll_required); - ForLoop(Passkey passkey, IterDomain* iter_domain); - - ForLoop(Passkey passkey, const ForLoop* other); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } + ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain); - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + ForLoop(IrBuilderPasskey passkey, const ForLoop* other); Val* index() const { return index_; @@ -1465,6 +407,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { return iter_domain_; } + // TODO: Return pointer instead of reference to be more consistent Scope& body() { return body_; } @@ -1490,6 +433,9 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { unroll_required_ = true; } + //! True if no actual for-loop is materialized + bool isTrivial() const; + private: //! Returns if a loop could be unrolled. bool isUnrollable() const; @@ -1524,15 +470,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { //! class TORCH_CUDA_CU_API IfThenElse final : public Expr { public: - explicit IfThenElse(Passkey passkey, Predicate* cond); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit IfThenElse(IrBuilderPasskey passkey, Predicate* cond); Scope& thenBody() { return then_body_; @@ -1565,28 +503,75 @@ class TORCH_CUDA_CU_API IfThenElse final : public Expr { //! //! This node provides FusionExecutor the information it needs to allocate the //! reduction and sync buffers. 
-class TORCH_CUDA_CU_API GridReduction final : public Expr { +class TORCH_CUDA_CU_API GridReduction final : public ReductionOp { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); + GridReduction( + IrBuilderPasskey passkey, + BinaryOpType reduction_op_type, + Val* init, + Val* out, + Val* in, + Allocate* reduction_buffer, + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances, + bool is_fused = false); + + Allocate* reduction_buffer() const { + return reduction_buffer_; } - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); + Allocate* sync_buffer() const { + return sync_buffer_; } - GridReduction( - Passkey passkey, - ReductionOp* reduction_op, - Allocate* reduction_buffer, - Allocate* sync_buffer); + // Which instance of entering this grid reduction is this iteration? + Val* entrance_index() const { + return entrance_index_; + } - ReductionOp* reduction_op() const { - return reduction_op_; + // How many times will this grid reduction be entered + Val* entrances() const { + return entrances_; } - Allocate* reduction_buffer() const { - return reduction_buffer_; + const ParallelTypeBitmap& threadPredicate() const { + return thread_predicate_; + } + + void setThreadPredicate(const ParallelTypeBitmap& thread_predicate) { + thread_predicate_ = thread_predicate; + } + + private: + Allocate* reduction_buffer_ = nullptr; + Allocate* sync_buffer_ = nullptr; + // gridReduce has template flags for thread predicates. In order to + // use them, the thread predicate is held here separately from + // Expr::predicate_. + ParallelTypeBitmap thread_predicate_; + Val* entrance_index_ = nullptr; + Val* entrances_ = nullptr; +}; + +class TORCH_CUDA_CU_API GroupedGridReduction final : public GroupedReductionOp { + public: + GroupedGridReduction( + IrBuilderPasskey passkey, + std::vector reduction_op_type, + std::vector init, + std::vector out, + std::vector in, + std::vector reduction_buffers, + Allocate* sync_buffer, + bool is_allreduce = false); + + const std::vector& reduction_buffers() const { + return reduction_buffers_; + } + + Allocate* reduction_buffer(size_t i) const { + return reduction_buffers_.at(i); } Allocate* sync_buffer() const { @@ -1602,8 +587,7 @@ class TORCH_CUDA_CU_API GridReduction final : public Expr { } private: - ReductionOp* reduction_op_ = nullptr; - Allocate* reduction_buffer_ = nullptr; + std::vector reduction_buffers_; Allocate* sync_buffer_ = nullptr; // gridReduce has template flags for thread predicates. In order to // use them, the thread predicate is held here separately from @@ -1620,23 +604,11 @@ class TORCH_CUDA_CU_API GridReduction final : public Expr { //! broadcast and sync buffers. class TORCH_CUDA_CU_API GridBroadcast final : public Expr { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - GridBroadcast( - Passkey passkey, + IrBuilderPasskey passkey, BroadcastOp* broadcast_op, Allocate* broadcast_buffer, - Allocate* sync_buffer) - : Expr(passkey), - broadcast_op_(broadcast_op), - broadcast_buffer_(broadcast_buffer), - sync_buffer_(sync_buffer){}; + Allocate* sync_buffer); BroadcastOp* broadcast_op() const { return broadcast_op_; @@ -1665,21 +637,15 @@ class TORCH_CUDA_CU_API GridBroadcast final : public Expr { //! reduction and sync buffers. 
class TORCH_CUDA_CU_API GridWelford final : public Expr { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - GridWelford( - Passkey passkey, + IrBuilderPasskey passkey, WelfordOp* welford_op, Allocate* var_buffer, Allocate* avg_buffer, Allocate* n_buffer, - Allocate* sync_buffer); + Allocate* sync_buffer, + Val* entrance_index, + Val* entrances); WelfordOp* welford_op() const { return welford_op_; @@ -1701,6 +667,16 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr { return sync_buffer_; } + // Which instance of entering this grid reduction is this iteration? + Val* entrance_index() const { + return entrance_index_; + } + + // How many times will this grid reduction be entered + Val* entrances() const { + return entrances_; + } + const ParallelTypeBitmap& threadPredicate() const { return thread_predicate_; } @@ -1715,12 +691,42 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr { Allocate* avg_buffer_ = nullptr; Allocate* n_buffer_ = nullptr; Allocate* sync_buffer_ = nullptr; + Val* entrance_index_ = nullptr; + Val* entrances_ = nullptr; // gridReduce has template flags for thread predicates. In order to // use them, the thread predicate is held here separately from // Expr::predicate_. ParallelTypeBitmap thread_predicate_; }; +// Allocate an instance of the fused reduction class. +class TORCH_CUDA_CU_API AllocateFusedReduction final : public Expr { + public: + explicit AllocateFusedReduction( + IrBuilderPasskey passkey, + GridReduction* grid_reduction); + + explicit AllocateFusedReduction( + IrBuilderPasskey passkey, + GridWelford* grid_welford); + + explicit AllocateFusedReduction( + IrBuilderPasskey passkey, + GroupedGridReduction* grouped_grid_reduction); + + Expr* gridExpr() const { + return grid_expr_; + } + + TensorIndex* out() const; + + const ParallelTypeBitmap& threadPredicate() const; + + private: + //! 
GridReduction, GridWelford or GroupedGridReduction + Expr* grid_expr_ = nullptr; +}; + } // namespace kir } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp deleted file mode 100644 index ce3e17d74d22..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp +++ /dev/null @@ -1,276 +0,0 @@ -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -Val* IrBuilder::newResult(DataType dtype) { - switch (dtype) { - case DataType::Bool: - return create(c10::nullopt); - case DataType::Double: - return create(c10::nullopt); - case DataType::Int: - return create(c10::nullopt); - default: - TORCH_CHECK(false, "Unexpected data type"); - } -} - -Val* IrBuilder::newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { - TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); - auto result = newResult(lhs->dtype()); - create(op_type, result, lhs, rhs); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - return result; -} - -Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { - auto result = create(c10::nullopt); - create(op_type, result, lhs, rhs); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - return result; -} - -Val* IrBuilder::whereExpr(Val* pred, Val* lhs, Val* rhs) { - TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); - auto result = newResult(lhs->dtype()); - create(TernaryOpType::Where, result, pred, lhs, rhs); - return result; -} - -Val* IrBuilder::negExpr(Val* val) { - auto result = newResult(val->dtype()); - create(UnaryOpType::Neg, result, val); - return result; -} - -Val* IrBuilder::notExpr(Val* val) { - auto result = newResult(val->dtype()); - create(UnaryOpType::Not, result, val); - return result; -} - -Val* IrBuilder::setExpr(Val* val) { - auto result = newResult(val->dtype()); - create(UnaryOpType::Set, result, val); - return result; -} - -Val* IrBuilder::setExprNamedScalar(const std::string& name, Val* val) { - auto result = create(name, val->dtype()); - create(UnaryOpType::Set, result, val); - return result; -} - -Val* IrBuilder::addressExprNamedScalar(const std::string& name, Val* val) { - auto result = create(name, DataType::Int); - create(UnaryOpType::Address, result, val); - return result; -} - -Val* IrBuilder::andExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::And, lhs, rhs); -} - -Val* IrBuilder::eqExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::Eq, lhs, rhs); -} - -Val* IrBuilder::gtExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::GT, lhs, rhs); -} - -Val* IrBuilder::ltExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::LT, lhs, rhs); -} - -Val* IrBuilder::leExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::LE, lhs, rhs); -} - -Val* IrBuilder::geExpr(Val* lhs, Val* rhs) { - return newLogicExpr(BinaryOpType::GE, lhs, rhs); -} - -Val* IrBuilder::addExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Add, lhs, rhs); -} - -Val* IrBuilder::subExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Sub, lhs, rhs); -} - -Val* IrBuilder::mulExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Mul, lhs, rhs); -} - -Val* IrBuilder::divExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Div, lhs, rhs); -} - -Val* IrBuilder::ceilDivExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::CeilDiv, lhs, 
rhs); -} - -Val* IrBuilder::modExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Mod, lhs, rhs); -} - -Val* IrBuilder::maxExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Max, lhs, rhs); -} - -Val* IrBuilder::minExpr(Val* lhs, Val* rhs) { - return newArithmeticExpr(BinaryOpType::Min, lhs, rhs); -} - -Int* IrBuilder::zeroVal() { - if (zero_ == nullptr) { - zero_ = create(0); - } - return zero_; -} - -Int* IrBuilder::oneVal() { - if (one_ == nullptr) { - one_ = create(1); - } - return one_; -} - -Bool* IrBuilder::falseVal() { - if (false_ == nullptr) { - false_ = create(false); - } - return false_; -} - -Bool* IrBuilder::trueVal() { - if (true_ == nullptr) { - true_ = create(true); - } - return true_; -} - -NamedScalar* IrBuilder::magicZeroVal() { - if (magic_zero_ == nullptr) { - magic_zero_ = create(kMagicZeroName, DataType::Int); - } - return magic_zero_; -} - -Val* SimplifyingIrBuilder::negExpr(Val* val) { - if (auto int_val = dynamic_cast(val)) { - if (int_val->isConst()) { - return create(-int_val->value().value()); - } - } - return IrBuilder::negExpr(val); -} - -Val* SimplifyingIrBuilder::notExpr(Val* val) { - if (auto bool_val = dynamic_cast(val)) { - if (bool_val->isConst()) { - if (bool_val->value().value()) { - return falseVal(); - } else { - return trueVal(); - } - } - } - return IrBuilder::notExpr(val); -} - -Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int::ScalarType rhs) { - if (rhs == 0) { - return lhs; - } else if (lhs == nullptr) { - return IrBuilder::create(rhs); - } else if (lhs->isConst()) { - return IrBuilder::create(lhs->value().value() + rhs); - } else if (rhs > 0) { - return IrBuilder::addExpr(lhs, IrBuilder::create(rhs)); - } else { - return IrBuilder::subExpr(lhs, IrBuilder::create(-rhs)); - } -} - -Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int* rhs) { - if (rhs == nullptr) { - return lhs; - } else if (lhs == nullptr) { - return rhs; - } else if (lhs->isConst()) { - return addExpr(rhs, lhs->value().value()); - } else if (rhs->isConst()) { - return addExpr(lhs, rhs->value().value()); - } else { - return IrBuilder::addExpr(lhs, rhs); - } -} - -Val* SimplifyingIrBuilder::addExpr(Val* lhs, Val* rhs) { - TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); - if (lhs == nullptr || lhs->isZeroInt()) { - return rhs; - } else if (rhs == nullptr || rhs->isZeroInt()) { - return lhs; - } - auto lhs_int = dynamic_cast(lhs); - auto rhs_int = dynamic_cast(rhs); - if (lhs_int != nullptr && rhs_int != nullptr) { - return addExpr(lhs_int, rhs_int); - } else { - return IrBuilder::addExpr(lhs, rhs); - } -} - -Val* SimplifyingIrBuilder::subExpr(Val* lhs, Val* rhs) { - return addExpr(lhs, negExpr(rhs)); -} - -Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) { - TORCH_INTERNAL_ASSERT(!(lhs == nullptr && rhs == nullptr)); - - if (lhs == nullptr) { - return rhs; - } else if (rhs == nullptr) { - return lhs; - } - - bool lhs_definitely_true = false; - bool lhs_definitely_false = false; - auto lhs_bool = dynamic_cast(lhs); - if (lhs_bool && lhs_bool->isConst()) { - lhs_definitely_true = lhs_bool->value().value(); - lhs_definitely_false = !lhs_bool->value().value(); - } - auto rhs_bool = dynamic_cast(rhs); - bool rhs_definitely_true = false; - bool rhs_definitely_false = false; - if (rhs_bool && rhs_bool->isConst()) { - rhs_definitely_true = rhs_bool->value().value(); - rhs_definitely_false = !rhs_bool->value().value(); - } - - if (lhs_definitely_true && rhs_definitely_true) { - return trueVal(); - } else if (lhs_definitely_false || 
rhs_definitely_false) { - return falseVal(); - } else if (lhs_definitely_true) { - return rhs; - } else if (rhs_definitely_true) { - return lhs; - } - - return IrBuilder::andExpr(lhs, rhs); -} - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h b/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h deleted file mode 100644 index 17a095baf120..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h +++ /dev/null @@ -1,131 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -//! Kernel IR builder interface -//! -//! The only way to create new Kernel IR nodes is through the -//! kir::IrBuilder interface. An IrBuilder instance is attached to a -//! particular Kernel instance and it provides methods for creating -//! single nodes (kir::IrBuilder::create()) or basic composite expressions -//! (ex. kir::IrBuilder::addExpr()). -//! -//! If the Kernel object is readily available, an IrBuilder can be "wrapped" -//! around it directly: -//! -//! kir::IrBuilder ir_builder(kernel); -//! -//! During lowering, another option is to create an IrBuilder for the -//! kernel that is being created: -//! -//! kir::IrBuilder ir_builder(GpuLower::current()->kernel()); -//! -//! Once we have an IR builder instance, creating nodes looks like: -//! -//! auto new_node = ir_builder.create(1)); -//! auto result = ir_builder.mulExpr(lhs, rhs); -//! -class TORCH_CUDA_CU_API IrBuilder { - public: - explicit IrBuilder(Kernel* kernel) : kernel_(kernel) {} - - //! Allocate a new Kernel IR node, forwarding the arguments - //! to the appropriate constructor - template - T* create(Args&&... args) { - const kir::Passkey passkey(kernel_); - const auto node = new T(passkey, std::forward(args)...); - kernel_->registerIrNode(passkey, std::unique_ptr(node)); - return node; - } - - // Unary operations - Val* negExpr(Val* val); - Val* notExpr(Val* val); - Val* setExpr(Val* val); - Val* setExprNamedScalar(const std::string& name, Val* val); - Val* addressExprNamedScalar(const std::string& name, Val* val); - - // Binary operations - Val* andExpr(Val* lhs, Val* rhs); - Val* eqExpr(Val* lhs, Val* rhs); - Val* gtExpr(Val* lhs, Val* rhs); - Val* ltExpr(Val* lhs, Val* rhs); - Val* leExpr(Val* lhs, Val* rhs); - Val* geExpr(Val* lhs, Val* rhs); - Val* addExpr(Val* lhs, Val* rhs); - Val* subExpr(Val* lhs, Val* rhs); - Val* mulExpr(Val* lhs, Val* rhs); - Val* divExpr(Val* lhs, Val* rhs); - Val* ceilDivExpr(Val* lhs, Val* rhs); - Val* modExpr(Val* lhs, Val* rhs); - Val* maxExpr(Val* lhs, Val* rhs); - Val* minExpr(Val* lhs, Val* rhs); - - // Ternary operations - Val* whereExpr(Val* pred, Val* lhs, Val* rhs); - - // Shortcuts for frequently used vals - Int* zeroVal(); - Int* oneVal(); - Bool* falseVal(); - Bool* trueVal(); - - NamedScalar* magicZeroVal(); - - private: - Val* newResult(DataType dtype); - Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs); - Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs); - - private: - // Non-owning pointer to the kernel to be modified - Kernel* kernel_ = nullptr; - // Frequently used constant vals - Int* zero_ = nullptr; - Int* one_ = nullptr; - Bool* false_ = nullptr; - Bool* true_ = nullptr; - - // Magic zero corresponds to runtime/helpers.cu magic_zero - NamedScalar* magic_zero_ = nullptr; -}; - -//! A wrapper builder with static expression simplification -//! 
-//! Example: -//! - addExpr(new Int(1), new Int(2)) -> Int(3) -//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo") -//! -//! Designed to be used to simplify predicate and index expressions in -//! generated code. Also, the shift validation may fail without -//! this simplification. -class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder { - public: - explicit SimplifyingIrBuilder(Kernel* kernel) : IrBuilder(kernel) {} - - Val* negExpr(Val* val); - Val* notExpr(Val* val); - - Val* addExpr(Int* lhs, Int::ScalarType rhs); - Val* addExpr(Int* lhs, Int* rhs); - Val* addExpr(Val* lhs, Val* rhs); - Val* subExpr(Val* lhs, Val* rhs); - Val* andExpr(Val* lhs, Val* rhs); -}; - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp new file mode 100644 index 000000000000..a64b07da4a05 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp @@ -0,0 +1,213 @@ +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { +namespace kir { +std::vector IrVisitor::handle(const std::vector& exprs) { + exprs_ = std::vector(exprs); + for (auto expr : exprs) { + handle(expr); + } + return exprs_; +} + +void IrVisitor::handle(ForLoop* fl) { + for_loops_.push_back(fl); + scope_.push_back(&fl->body()); + scope_exprs_.push_back(fl); + auto body_exprs = std::vector(fl->body().exprs()); + for (auto expr : body_exprs) { + handle(expr); + } + scope_exprs_.pop_back(); + scope_.pop_back(); + for_loops_.pop_back(); +} + +void IrVisitor::handle(IfThenElse* ite) { + scope_exprs_.push_back(ite); + scope_.push_back(&ite->thenBody()); + auto then_exprs = std::vector(ite->thenBody().exprs()); + for (auto expr : then_exprs) { + handle(expr); + } + scope_.pop_back(); + + scope_.push_back(&ite->elseBody()); + auto else_exprs = std::vector(ite->elseBody().exprs()); + for (auto expr : else_exprs) { + handle(expr); + } + scope_.pop_back(); + scope_exprs_.pop_back(); +} + +std::vector ExprMutator::mutate(bool reverse_order) { + if (insertions_.empty() && replacements_.empty() && removal_.empty()) { + return exprs_; + } + + auto run_insertion = [&](MutationInformation info) { + if (info.scope == nullptr) { + // If reference is nullptr and there are no expressions, simply insert the + // expr + if (exprs_.empty() && info.reference == nullptr) { + exprs_.push_back(info.new_expr); + return; + } + auto pos_it = std::find(exprs_.begin(), exprs_.end(), info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), + "Issue finding reference expression for insertion."); + if (info.mode == MutationMode::BEFORE) { + exprs_.insert(pos_it, info.new_expr); + } else { + exprs_.insert(pos_it + 1, info.new_expr); + } + } else { + // If reference is nullptr and there are no expressions, simply insert the + // expr + if (info.scope->exprs().empty() && info.reference == nullptr) { + info.scope->push_back(info.new_expr); + return; + } + if (info.mode == MutationMode::BEFORE) { + info.scope->insert_before(info.reference, info.new_expr); + } else { + info.scope->insert_after(info.reference, info.new_expr); + } + } + }; + + if (reverse_order) { + for (auto it = insertions_.rbegin(); it != insertions_.rend(); ++it) { + run_insertion(*it); + } + } else { + for (auto insertion_info : insertions_) { + run_insertion(insertion_info); + } + } + + for (auto replacement_info : replacements_) { + if 
(replacement_info.scope == nullptr) { + auto pos_it = + std::find(exprs_.begin(), exprs_.end(), replacement_info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), + "Issue finding reference expression for replacement."); + exprs_.insert(pos_it, replacement_info.new_expr); + // iterator can be invalidated from insertion + pos_it = + std::find(exprs_.begin(), exprs_.end(), replacement_info.reference); + exprs_.erase(pos_it); + } else { + replacement_info.scope->insert_before( + replacement_info.reference, replacement_info.new_expr); + replacement_info.scope->erase(replacement_info.reference); + } + } + + for (auto removal_info : removal_) { + if (removal_info.scope == nullptr) { + auto pos_it = + std::find(exprs_.begin(), exprs_.end(), removal_info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), "Issue finding expression to remove."); + exprs_.erase(pos_it); + } else { + TORCH_INTERNAL_ASSERT( + removal_info.scope->contains(removal_info.reference), + "Expression to remove is not found in the given scope: ", + removal_info.reference->toString()); + removal_info.scope->erase(removal_info.reference); + } + } + + insertions_.clear(); + replacements_.clear(); + + return exprs_; +} + +std::vector ExprMutator::traverseAndInsert( + const std::vector& exprs, + bool reverse_order) { + IrVisitor::handle(exprs); + return mutate(reverse_order); +} + +void ExprMutator::registerMutation( + Expr* reference, + Expr* new_expr, + Scope* scope, + MutationMode mode) { + MutationInformation mutation; + mutation.reference = reference; + mutation.new_expr = new_expr; + mutation.scope = scope; + mutation.mode = mode; + if (mode == MutationMode::BEFORE || mode == MutationMode::AFTER) { + insertions_.push_back(mutation); + } else if (mode == MutationMode::REPLACE) { + replacements_.push_back(mutation); + } else if (mode == MutationMode::REMOVE) { + removal_.push_back(mutation); + } else { + TORCH_INTERNAL_ASSERT(false, "Invalid mutation type"); + } +} + +void ExprMutator::registerInsertBefore( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::BEFORE); +} + +void ExprMutator::registerInsertAfter( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::AFTER); +} + +void ExprMutator::registerReplace( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::REPLACE); +} + +void ExprMutator::registerRemove(Expr* expr_to_remove, Scope* scope) { + registerMutation(expr_to_remove, nullptr, scope, MutationMode::REMOVE); +} + +void ExprMutator::registerInsertBefore(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerInsertBefore(reference, new_expr, scope); +} + +void ExprMutator::registerInsertAfter(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerInsertAfter(reference, new_expr, scope); +} + +void ExprMutator::registerReplace(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerReplace(reference, new_expr, scope); +} + +void ExprMutator::registerRemove(Expr* expr_to_remove) { + Scope* scope = scope_.empty() ? 
nullptr : scope_.back(); + registerRemove(expr_to_remove, scope); +} + +} // namespace kir +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h new file mode 100644 index 000000000000..d665c4a6fdf5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h @@ -0,0 +1,126 @@ +#pragma once + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class Expr; + +namespace kir { +class Predicate; +class TensorIndex; +class ForLoop; +class IfThenElse; +class Scope; + +// Base visitor class that visits all nodes in provided vector. +// +// Includes visiting through scopes like IfThenElse and ForLoop, and tracks +// them in scopes_ and for_loops_. +// +// Makes a copy of exprs at exprs_, which can be modified and returned. +// +// When traversing through ITE/FLs it will use a copy +// of the provided expressions to make it safe to insert/delete nodes. +// +// Provides a simple base class to inherit from for typical lowering passes on +// Expr list +class TORCH_CUDA_CU_API IrVisitor : public OptOutDispatch { + public: + std::vector handle(const std::vector& expr); + + protected: + using OptOutDispatch::handle; + + virtual void handle(ForLoop*) override; + virtual void handle(IfThenElse*) override; + + protected: + std::vector for_loops_; + std::vector scope_; + std::vector scope_exprs_; + std::vector exprs_; +}; + +// Base Expr Mutator class that visits all nodes with IrVisitor, and then +// inserts new expressions, replaces expressions based on insertion/replace +// maps provided or removes existing expressions. These replacement +// maps are expected to accumulate during an initial traversal, then +// runs an insertion based on them after the overloaded traversal. +// +// Order of mutations may be important; mutations are ordered according to the +// following rules: +// Before/After insertions are ordered as registered when reverse_order == +// false, +// +// Before/After insertions are in reverse order as registered when +// reverse_order == true, +// +// Before/After insertions are done before Expr replacements, so reference for +// insertions must be on pre-replaced Exprs +// +// Removal of expressions is done after replacements. +// +// To place in a scope that is empty, simply provide a nullptr reference +// Since insertions are done in order, it's possible to insert an expression in +// an empty scope, and then use that inserted scope as a reference for +// subsequent mutations. +class ExprMutator : public IrVisitor { + protected: + std::vector traverseAndInsert( + const std::vector& expr, + bool reverse_order = false); + + std::vector mutate(bool reverse_order = false); + + using IrVisitor::handle; + // Registration functions which *don't* need to be called "in place" during + // visiting. + void registerInsertBefore(Expr* reference, Expr* new_expr, Scope* scope); + void registerInsertAfter(Expr* reference, Expr* new_expr, Scope* scope); + void registerReplace(Expr* reference, Expr* new_expr, Scope* scope); + void registerRemove(Expr* expr_to_remove, Scope* scope); + + // Registration functions which need to be called "in place" during visiting. + // I.E. + // if you want to insert before/after or replace an Expr, you must register + // when in handle(Expr*) of that expr.
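  // Illustrative sketch (not taken from this diff; the pass name
  // "InsertSyncAfterLoops" and the IrBuilder::create<kir::BlockSync>() call are
  // assumptions about the surrounding builder API): a pass built on
  // ExprMutator typically overrides handle(), registers mutations "in place"
  // while visiting, and lets traverseAndInsert() apply them afterwards, e.g.
  //
  //   class InsertSyncAfterLoops : private kir::ExprMutator {
  //    public:
  //     std::vector<Expr*> run(const std::vector<Expr*>& exprs) {
  //       return traverseAndInsert(exprs);
  //     }
  //
  //    protected:
  //     using kir::ExprMutator::handle;
  //     void handle(kir::ForLoop* fl) final {
  //       // Recurse into the loop body first, then register an insertion
  //       // while fl is being visited, as described above.
  //       kir::ExprMutator::handle(fl);
  //       registerInsertAfter(fl, IrBuilder::create<kir::BlockSync>(false));
  //     }
  //   };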
+ void registerInsertBefore(Expr* reference, Expr* new_expr); + void registerInsertAfter(Expr* reference, Expr* new_expr); + void registerReplace(Expr* reference, Expr* new_expr); + void registerRemove(Expr* expr_to_remove); + + private: + enum class MutationMode { BEFORE, AFTER, REPLACE, REMOVE }; + + void registerMutation( + Expr* ref, + Expr* new_expr, + Scope* scope, + MutationMode mode); + + struct MutationInformation { + Expr* reference = nullptr; + Expr* new_expr = nullptr; + Scope* scope = nullptr; + MutationMode mode = MutationMode::BEFORE; + }; + + // Track insertions as they're registered + std::vector insertions_; + + // Track replacements as they're registered + std::vector replacements_; + + // Track removal as they're registered + std::vector removal_; +}; + +} // namespace kir +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp deleted file mode 100644 index e00da31423c1..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp +++ /dev/null @@ -1,451 +0,0 @@ -#include -#include - -#include -#include - -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -namespace { - -const char* boolLiteral(bool value) { - return value ? "true" : "false"; -} - -std::string varName(const kir::Val* val, const char* prefix) { - std::stringstream value_name; - if (val == nullptr) { - value_name << "$nullptr"; - } else if (val->name() != kInvalidStmName) { - value_name << prefix << val->name(); - } else { - value_name << "k" << prefix << val->id(); - } - return value_name.str(); -} - -} // namespace - -void IrPrinter::printNode(const kir::Node* node) { - os_ << gen(node, true); -} - -void IrPrinter::printKernel(const Kernel* kernel) { - TORCH_CHECK(kernel != nullptr); - - // kernel declaration - os_ << "\nKERNEL ("; - for (auto in : kernel->inputs()) { - os_ << gen(in); - if (in != kernel->inputs().back()) { - os_ << ", "; - } - } - os_ << ") -> ("; - for (auto out : kernel->outputs()) { - os_ << gen(out); - if (out != kernel->outputs().back()) { - os_ << ", "; - } - } - os_ << ") :\n"; - - // kernel body - startBlock(); - for (auto expr : kernel->topLevelExprs()) { - os_ << gen(expr, true); - } - endBlock(); - os_ << "END.\n\n"; -} - -std::ostream& IrPrinter::indent() { - for (const auto i : c10::irange(indent_level_)) { - (void)i; // Suppress unused variable warning - ir_str_ << kTab; - } - ir_str_ << margin_; - return ir_str_; -} - -std::string IrPrinter::gen(const kir::Node* node, bool top_level) { - if (node == nullptr) { - return "$nullptr"; - } - - // If we're generatign a top level statement we expect to start - // with an empty set of uses - TORCH_INTERNAL_ASSERT(!implicit_definition_ || uses_.empty() || !top_level); - - // Mark the node as generated - visited_.insert(node); - - // Generate the node itself - std::stringstream node_str; - std::swap(node_str, ir_str_); - node->accept(this); - std::swap(node_str, ir_str_); - - if (!implicit_definition_) { - return node_str.str(); - } - - if (top_level) { - // Implicitly mark top level nodes as used, so we - // get their definitions printed (useful for debugging) - if (auto val = dynamic_cast(node)) { - uses_.insert(val); - } - - // Make a copy of the node uses (and reset global state) - const auto node_uses = uses_; - uses_.clear(); - - std::stringstream top_level_str; - - // Hoist implicit definitions - for (auto use : 
node_uses) { - const auto def = use->definition(); - if (def && visited_.find(def) == visited_.end()) { - margin_ = "~ "; - top_level_str << gen(def, true); - margin_ = ""; - } - } - - top_level_str << node_str.str(); - return top_level_str.str(); - } else { - return node_str.str(); - } -} - -std::string IrPrinter::use(const kir::Val* val) { - if (val != nullptr) { - uses_.insert(val); - } - return gen(val); -} - -void IrPrinter::startBlock() { - ++indent_level_; -} - -void IrPrinter::endBlock() { - TORCH_CHECK(indent_level_ > 0); - --indent_level_; -} - -void IrPrinter::handleBlock(const kir::Scope& scope) { - // Save the uses of the parent scope - decltype(uses_) outer_uses; - std::swap(uses_, outer_uses); - - startBlock(); - for (auto expr : scope.exprs()) { - ir_str_ << gen(expr, true); - } - endBlock(); - - // Restore parent's uses - std::swap(uses_, outer_uses); -} - -void IrPrinter::visit(const kir::Bool* node) { - if (node->isConst()) { - ir_str_ << boolLiteral(*node->value()); - } else { - ir_str_ << varName(node, "b"); - } -} - -void IrPrinter::visit(const kir::Double* node) { - if (node->isConst()) { - const int digits = std::numeric_limits::max_digits10; - ir_str_ << "double(" << std::setprecision(digits) << *node->value() << ")"; - } else { - ir_str_ << varName(node, "d"); - } -} - -void IrPrinter::visit(const kir::Int* node) { - if (node->isConst()) { - ir_str_ << *node->value(); - } else { - ir_str_ << varName(node, "i"); - } -} - -void IrPrinter::visit(const kir::NamedScalar* node) { - ir_str_ << node->name(); -} - -void IrPrinter::visit(const kir::Predicate* node) { - switch (node->predicate_type()) { - case PredicateType::Inline: { - ir_str_ << "Inline"; - break; - } - case PredicateType::Manual: { - ir_str_ << node->value(); - break; - } - case PredicateType::Misaligned: { - ir_str_ << "Misaligned"; - break; - } - case PredicateType::Padding: { - ir_str_ << "Padding"; - break; - } - case PredicateType::Shift: { - ir_str_ << "Shift"; - break; - } - case PredicateType::Unswitch: { - ir_str_ << "Unswitch"; - break; - } - case PredicateType::Vectorize: { - ir_str_ << "Vectorize"; - break; - } - default: - break; - } -} - -void IrPrinter::visit(const kir::TensorIndex* node) { - ir_str_ << gen(node->view()) << "["; - for (auto index : node->indices()) { - ir_str_ << use(index); - if (index != node->indices().back()) { - ir_str_ << ", "; - } - } - ir_str_ << "]"; -} - -void IrPrinter::visit(const kir::IterDomain* node) { - ir_str_ << varName(node, "id") << "["; - if (node->isRFactorProduct()) { - ir_str_ << "rfactor."; - } - ir_str_ << node->parallelType() << "." << node->iterType() << "(" - << use(node->start()) << " .. " << use(node->extent()) << ")]"; -} - -void IrPrinter::visit(const kir::TensorDomain*) { - // TODO(kir): print Tensor shapes? - ir_str_ << "kir::TensorDomain"; -} - -void IrPrinter::visit(const kir::TensorView* node) { - // TODO(kir): print memory type too? 
- ir_str_ << varName(node, "T"); -} - -void IrPrinter::visit(const kir::UnaryOp* node) { - indent() << gen(node->out()) << " = "; - - auto op_type = node->operation(); - - if (auto op = inline_op_str(op_type)) { - if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - ir_str_ << stringifyBooleanOp(op_type) << gen(node->in()); - } else { - ir_str_ << *op << gen(node->in()); - } - } else { - if (op_type == UnaryOpType::Cast) { - const auto cast_str = - cast_func_str({node->in()->dtype(), node->out()->dtype()}); - ir_str_ << cast_str.value(); - } else { - ir_str_ << op_type; - if (needFloatSuffix(op_type) && node->out()->dtype() == DataType::Float) { - ir_str_ << "f"; - } - } - - if (op_type == UnaryOpType::RandLike) { - ir_str_ << "(RND"; - } else { - ir_str_ << "("; - ir_str_ << use(node->in()); - } - ir_str_ << ")"; - } - - ir_str_ << "\n"; -} - -void IrPrinter::visit(const kir::BinaryOp* node) { - indent() << gen(node->out()) << " = "; - - const auto op_type = node->operation(); - const auto lhs = use(node->lhs()); - const auto rhs = use(node->rhs()); - - if (auto op = inline_op_str(op_type)) { - ir_str_ << lhs << " "; - if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - ir_str_ << stringifyBooleanOp(op_type); - } else { - ir_str_ << *op; - } - ir_str_ << " " << rhs; - } else { - ir_str_ << op_type; - if (needFloatSuffix(op_type) && node->out()->dtype() == DataType::Float) { - ir_str_ << "f"; - } - ir_str_ << "(" << lhs << ", " << rhs << ")"; - } - - ir_str_ << "\n"; -} - -void IrPrinter::visit(const kir::TernaryOp* node) { - indent() << gen(node->out()) << " = " << node->operation() << "(" - << use(node->in1()) << ", " << use(node->in2()) << ", " - << use(node->in3()) << ")\n"; -} - -void IrPrinter::visit(const kir::ReductionOp* node) { - indent() << gen(node->out()) << " = " - << "REDUCTION(op='" << node->operation() << "'" - << ", in=" << use(node->in()) << ", init=" << use(node->init()) - << ", pred=" << use(node->predicate()) << ")\n"; -} - -void IrPrinter::visit(const kir::WelfordOp* node) { - indent() << gen(node->outVar()) << "," << gen(node->outAvg()) << "," - << gen(node->outN()) << " = " - << "Welford( inAvg=" << use(node->inAvg()); - if (!node->inN()->isOneInt()) { - indent() << " inVar=" << use(node->inVar()); - } - indent() << " inN=" << use(node->inN()); - if (!node->initN()->isZeroInt()) { - indent() << ", initVar=" << use(node->initVar()) - << " initAvg=" << use(node->initAvg()) - << " initN=" << use(node->initN()); - } - indent() << ", pred=" << use(node->predicate()) << ")\n"; -} - -void IrPrinter::visit(const kir::GridReduction* node) { - const auto* reduction_op = node->reduction_op(); - indent() << gen(reduction_op->out()) << " = " - << "GRID_REDUCTION(op='" << reduction_op->operation() << "'" - << ", in=" << use(reduction_op->in()) - << ", init=" << use(reduction_op->init()) - << ", pred=" << use(reduction_op->predicate()) << ")\n"; - indent() << kTab << kTab - << ".reduction_buffer=" << use(node->reduction_buffer()->buffer()) - << "\n"; - indent() << kTab << kTab - << ".sync_buffer=" << use(node->sync_buffer()->buffer()) << "\n"; - indent() << kTab << kTab << ".grid_pred=" << use(node->predicate()) << "\n"; -} - -void IrPrinter::visit(const kir::GridWelford* node) { - const auto* welford_op = node->welford_op(); - indent() << gen(welford_op->outVar()) << "," << gen(welford_op->outAvg()) - << "," << gen(welford_op->outN()) << " = " - << "GRID_WELFORD(" - << "inAvg=" << use(welford_op->inAvg()); - if 
(!welford_op->inN()->isOneInt()) { - indent() << ", inVar=" << use(welford_op->inVar()); - } - indent() << ", inN=" << use(welford_op->inN()); - if (!welford_op->initN()->isZeroInt()) { - indent() << ", initVar=" << use(welford_op->initVar()) - << " initAvg=" << use(welford_op->initAvg()) - << " initN=" << use(welford_op->initN()); - } - indent() << ", pred=" << use(welford_op->predicate()) << ")\n"; - indent() << kTab << kTab - << ".var_buffer=" << use(node->var_buffer()->buffer()) - << ".avg_buffer=" << use(node->avg_buffer()->buffer()) - << ".n_buffer=" << use(node->N_buffer()->buffer()) << "\n"; - indent() << kTab << kTab - << ".sync_buffer=" << use(node->sync_buffer()->buffer()) << "\n"; - indent() << kTab << kTab << ".grid_pred=" << use(node->predicate()) << "\n"; -} - -void IrPrinter::visit(const kir::BroadcastOp* node) { - indent() << gen(node->out()) << " = BROADCAST(" << use(node->in()) << ")\n"; -} - -void IrPrinter::visit(const kir::ForLoop* node) { - indent() << "FOR " << gen(node->index()) << " in " << gen(node->iter_domain()) - << ":\n"; - handleBlock(node->body()); -} - -void IrPrinter::visit(const kir::IfThenElse* node) { - indent() << "IF " << use(node->predicate()) << ":\n"; - handleBlock(node->thenBody()); - if (node->hasElse()) { - indent() << "ELSE:\n"; - handleBlock(node->elseBody()); - } -} - -void IrPrinter::visit(const kir::Allocate* node) { - indent() << gen(node->buffer()) << " = ALLOCATE(" - << "mem_type=" << node->memoryType() << ", " - << "size=" << use(node->size()) << ", " - << "zero_init=" << boolLiteral(node->zeroInit()) << ")\n"; - if (node->alias() != nullptr) { - indent() << kTab << kTab << ".alias=" << gen(node->alias()->buffer()) - << "\n"; - } -} - -void IrPrinter::visit(const kir::Sync* node) { - indent() << "SYNC(war_hazard=" << boolLiteral(node->isWarHazardSync()) - << ")\n"; -} - -void IrPrinter::visit(const kir::InitMagicZero* node) { - indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; -} - -void IrPrinter::visit(const kir::UpdateMagicZero* node) { - indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; -} - -std::string toString(const kir::Node* stmt, bool implicit_definitions) { - std::stringstream ss; - IrPrinter ir_printer(ss, implicit_definitions); - ir_printer.printNode(stmt); - return ss.str(); -} - -std::string toString( - const std::vector& exprs, - bool implicit_definitions) { - std::stringstream ss; - IrPrinter ir_printer(ss, implicit_definitions); - for (auto expr : exprs) { - ir_printer.printNode(expr); - } - return ss.str(); -} - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h b/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h deleted file mode 100644 index 115901a031a9..000000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h +++ /dev/null @@ -1,129 +0,0 @@ -#pragma once - -#include - -#include -#include - -#include -#include -#include -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -//! Define pretty printing functions for Kernel IR nodes -//! -//! This class is intended for debug printing, so it attempts -//! to handle invalid IR states as much as possible. -//! -//! implicit_definition_ = true will recurisvely print the definition of all -//! inputs to an expression if they haven't been printed. -class TORCH_CUDA_CU_API IrPrinter : private kir::IrVisitor { - static constexpr char const* kTab = " "; - - public: - //! 
Constructs a new IrPrinter which outputs to the specified stream - explicit IrPrinter(std::ostream& os, bool implicit_definition = true) - : os_(os), implicit_definition_(implicit_definition) {} - - //! Print a single Kernel IR node - void printNode(const kir::Node* node); - - //! Print a complete Kernel definition - void printKernel(const Kernel* kernel); - - private: - // Generates a string representation of an IR node - // - // If `top_level` is true, all the value uses are tracked and - // their definitions are implicitly printed before the node itself - // - std::string gen(const kir::Node* node, bool top_level = false); - - // Generate a string representation of an used value - // (this helps automatically tracking the value uses) - std::string use(const kir::Val* val); - - std::ostream& indent(); - - void startBlock(); - void endBlock(); - void handleBlock(const kir::Scope& scope); - - void visit(const kir::Bool*) final; - void visit(const kir::Double*) final; - void visit(const kir::Int*) final; - void visit(const kir::NamedScalar*) final; - void visit(const kir::Predicate*) final; - - void visit(const kir::TensorIndex*) final; - void visit(const kir::IterDomain*) final; - void visit(const kir::TensorDomain*) final; - void visit(const kir::TensorView*) final; - - void visit(const kir::UnaryOp*) final; - void visit(const kir::BinaryOp*) final; - void visit(const kir::TernaryOp*) final; - void visit(const kir::ReductionOp*) final; - void visit(const kir::WelfordOp*) final; - void visit(const kir::BroadcastOp*) final; - - void visit(const kir::GridReduction*) final; - void visit(const kir::GridWelford*) final; - void visit(const kir::ForLoop*) final; - void visit(const kir::IfThenElse*) final; - void visit(const kir::Allocate*) final; - void visit(const kir::Sync*) final; - void visit(const kir::InitMagicZero*) final; - void visit(const kir::UpdateMagicZero*) final; - - private: - std::ostream& os_; - - // Current indentation level - int indent_level_ = 0; - - // Internal IR generation stream - std::stringstream ir_str_; - - // Tracks the set of nodes which have been printed - std::unordered_set visited_; - - // Optional left margin printed after the indentation - const char* margin_ = ""; - - // The set of values used by the current top-level IR node - std::unordered_set uses_; - - // If the definition of all inputs to an expression haven't been printed - // already implicit_definition_ = true will print them before printing the - // requested node. - bool implicit_definition_ = true; -}; - -//! Returns the string representation of a Kernel IR node. If the definition of -//! all inputs to an expression haven't been printed already -//! implicit_definition_ = true will print them before printing the requested -//! node. -TORCH_CUDA_CU_API std::string toString( - const kir::Node* stmt, - bool implicit_definitions = true); - -//! Returns the string representation of a vector of kir::Expr, convenient -//! debugm echanism during lowering. If the definition of all inputs to an -//! expression haven't been printed already implicit_definition_ = true will -//! print them before printing the requested node. 
-TORCH_CUDA_CU_API std::string toString( - const std::vector& exprs, - bool implicit_definitions = true); - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 036eee58206a..3e644fc9a44d 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -6,18 +6,19 @@ #include #include #include -#include #include #include +#include #include +#include #include #include #include #include #include #include +#include #include -#include #include #include #include @@ -33,152 +34,15 @@ namespace jit { namespace fuser { namespace cuda { -// TODO(kir): revisit this thread_local GpuLower* active_gpu_lower = nullptr; // NOLINT namespace { -// Going to generate a map of tensor view root domain extents to reduce the -// number used during lowering. For example if we have: -// -// T2[i0, i1] = T1[i0, i1] + T2[i2, i3] -// -// We know it would be safe to use: -// -// T2[i0, i1] = T1[i0, i1] + T2[i0, i1] -// -// And that way we don't generate T2.size[0] and T2.size[1], instead we will -// reuse T1.size[0] and T1.size[1] -// This is important when doing CSE as T2 and T1 would otherwise look like -// they're using different values, even though we know they're the same -// -// There's some duplicate logic here that's in computeAt map, but it's not so -// concice there to pull out. May want to consider making this mapping its own -// class especially as it may be useful during scheduling. -std::unordered_map getSimplificationMap(Fusion* fusion) { - std::list> disjoint_root_sets; - std::unordered_map*> - id_to_disjoint_root_set; - - auto map_root_ids = [&disjoint_root_sets, &id_to_disjoint_root_set]( - IterDomain* id0, IterDomain* id1) { - if (id0->isBroadcast() || id1->isBroadcast()) { - return; - } - - auto disjoint_set_0_it = id_to_disjoint_root_set.find(id0); - auto disjoint_set_1_it = id_to_disjoint_root_set.find(id1); - bool set_0_found = disjoint_set_0_it != id_to_disjoint_root_set.end(); - bool set_1_found = disjoint_set_1_it != id_to_disjoint_root_set.end(); - - if (set_0_found && set_1_found) { - if (disjoint_set_0_it->second == disjoint_set_1_it->second) { - return; - } - // merge second disjoint set into first - auto* set_0 = disjoint_set_0_it->second; - auto* set_1 = disjoint_set_1_it->second; - for (auto id : *set_1) { - set_0->emplace(id); - id_to_disjoint_root_set[id] = set_0; - } - // remove second set from disjoint_root_sets - disjoint_root_sets.erase(std::find( - disjoint_root_sets.begin(), disjoint_root_sets.end(), *set_1)); - } else if (set_0_found || set_1_found) { - auto existing_set = - set_0_found ? disjoint_set_0_it->second : disjoint_set_1_it->second; - auto to_add_id = set_0_found ? 
id1 : id0; - existing_set->emplace(to_add_id); - id_to_disjoint_root_set[to_add_id] = existing_set; - // add entry into existing set - } else { - // create new set entry - disjoint_root_sets.emplace_back(std::unordered_set()); - auto* new_set = &disjoint_root_sets.back(); - new_set->emplace(id0); - new_set->emplace(id1); - id_to_disjoint_root_set[id0] = new_set; - id_to_disjoint_root_set[id1] = new_set; - } - }; - - auto fusion_vals = fusion->usedMathVals(); - for (auto producer_tv : ir_utils::filterByType(fusion_vals)) { - auto consumer_tvs = ir_utils::consumerTvsOf(producer_tv); - for (auto consumer_tv : consumer_tvs) { - auto pairwise_map = PairwiseRootDomainMap(producer_tv, consumer_tv); - auto c2p_root_map = pairwise_map.mapConsumerToProducer( - consumer_tv->domain(), producer_tv->domain()); - for (auto entry : c2p_root_map) { - auto c_id = entry.first; - auto p_id = entry.second; - map_root_ids(p_id, c_id); - } - } - } - - // Map each set to an input ID (if it exists) that has the smallest ->name() - // entry value - std::unordered_map*, IterDomain*> - set_to_input_id; - - // Loop over the root domains, of the inputs to the fusion. Pick an input ID - // to use as the representative ID of the collected sets. Only consider inputs - // as those are the ones that map to values like "T0.size[1]". They are he - // ID's that propagated their extents into the problem. We could also check - // the outputs as we do have C++ examples of using output dimensions for the - // problem size instead of inputs. However, we don't do anything where we can - // translate to those kinds of kernels integrated into PyTorch. - for (auto input_tv : ir_utils::filterByType(fusion->inputs())) { - for (auto id : - TensorDomain::noReductions(input_tv->getMaybeRFactorDomain())) { - auto id_set_it = id_to_disjoint_root_set.find(id); - if (id_set_it == id_to_disjoint_root_set.end()) { - continue; - } - auto* id_set = id_set_it->second; - if (set_to_input_id.find(id_set) == set_to_input_id.end()) { - set_to_input_id[id_set] = id; - } else { - auto input_id_of_set = set_to_input_id.at(id_set); - // Swap id's if new name is less than previously set - bool swap_ids = id->name() < input_id_of_set->name(); - // If new id is a const scalar but previously was'nt use the const - // scalar - swap_ids = swap_ids || - (id->extent()->isConstScalar() && - !input_id_of_set->extent()->isConstScalar()); - // If previous scalar was const and new isn't, don't swap - swap_ids = swap_ids && - !(input_id_of_set->extent()->isConstScalar() && - !id->extent()->isConstScalar()); - - if (swap_ids) { - set_to_input_id[id_set] = id; - } - } - } - } - - // Finally make map from ID extents to the representitive ID extent. - std::unordered_map extent_to_min_input_id_extent; - for (auto entry : set_to_input_id) { - auto* set = entry.first; - auto input_id = entry.second; - for (auto id : *set) { - extent_to_min_input_id_extent[id->extent()] = input_id->extent(); - } - } - return extent_to_min_input_id_extent; -} - -class KIRCleaner : public kir::MutableIrVisitor { +class KIRCleaner : public OptOutDispatch { public: //! 
Remove nop IR nodes - static std::vector cleanUp( - const std::vector& loop_nests) { + static std::vector cleanUp(const std::vector& loop_nests) { KIRCleaner cleaner; - std::vector out_loop_nests; + std::vector out_loop_nests; for (auto loop_nest : loop_nests) { cleaner.handle(loop_nest); // No need to keep the loop nest if it's determined to be nop @@ -190,16 +54,17 @@ class KIRCleaner : public kir::MutableIrVisitor { } private: - void handle(kir::Expr* expr) { + using OptOutDispatch::handle; + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + OptOutDispatch::handle(expr); } else { // Any non-scoping expr is not considered nop is_nop_ = false; } } - void visit(kir::ForLoop* fl) final { + void handle(kir::ForLoop* fl) final { auto exprs = fl->body().exprs(); fl->body().clear(); for (auto expr : exprs) { @@ -213,7 +78,7 @@ class KIRCleaner : public kir::MutableIrVisitor { is_nop_ = fl->body().empty(); } - void visit(kir::IfThenElse* ite) final { + void handle(kir::IfThenElse* ite) final { const auto conditional = ite->predicate()->value(); // Visit the then block @@ -248,9 +113,8 @@ class KIRCleaner : public kir::MutableIrVisitor { // conditional and move the exprs in the else block to the then // block. if (then_nop && !else_nop) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - kir::Bool* pred = ite->predicate()->value(); - kir::Bool* not_pred = ir_builder.notExpr(pred)->as(); + Bool* pred = ite->predicate()->value(); + Bool* not_pred = SimplifyingIrBuilder::notExpr(pred)->as(); ite->predicate()->setValue(not_pred); for (auto expr : ite->elseBody().exprs()) { ite->thenBody().push_back(expr); @@ -269,84 +133,6 @@ class KIRCleaner : public kir::MutableIrVisitor { } // namespace -void GpuLower::replaceSymbolicSizes() { - FUSER_PERF_SCOPE("GpuLower::Lower::replaceSymbolicSizes"); - - kir::IrBuilder ir_builder(kernel()); - - // Grab inputs and outputs - std::vector inputs_and_outputs; - for (auto val : fusion_->inputs()) { - if (ir_utils::isTV(val)) { - inputs_and_outputs.push_back(val->as()); - } - } - // Symbolic size is necessary for outputs if there are no inputs. - // Otherwise infer output sizes from the inputs via expression evaluation. - if (fusion_->inputs().empty()) { - for (auto val : fusion_->outputs()) { - if (ir_utils::isTV(val)) { - inputs_and_outputs.push_back(val->as()); - } - } - } - - // Generate map for all tensorview root domain values to map them to symbolic - // values. i.e. T0->getRootDomain()[0] would map to a named scalar - // "T0.size[0]". This map will be used when lowering fusion ir to kernel ir. - for (TensorView* tv : inputs_and_outputs) { - // Replace the domain with one based on Ti.size[j] - const std::vector& root_td = tv->getRootDomain(); - - size_t dim = 0; - for (auto id : root_td) { - const Val* orig_size = id->extent(); - - // Output sizes could have reduction axes, which isn't what gets output. 
- // NOLINTNEXTLINE(bugprone-branch-clone) - if (id->isReduction() || - (id->getIterType() == IterType::BroadcastWithoutStride)) { - continue; - } else if ( - id->isRFactorProduct() || - // NOLINTNEXTLINE(bugprone-branch-clone) - (id->getIterType() == IterType::BroadcastWithStride) || - orig_size->isConstScalar()) { - dim++; - continue; - } - - // TODO(kir): consider a different implementation which doesn't - // hijack the kir_val_map_ - // Currently turn off this part for inputs of segmented fusion, - // since FusionKernelRuntime will provide these as integer inputs - if (kir_val_map_.find(orig_size) == kir_val_map_.end() && - !orig_size->isFusionInput() && !orig_size->isConstScalar()) { - std::stringstream ss; - ss << "T" << tv->name() << ".size[" << dim++ << "]"; - kir_val_map_[orig_size] = ir_builder.create( - ss.str(), orig_size->getDataType().value()); - } else { - dim++; - } - } - } - - // Use a minimal number of sizes from provided tensors. - auto extent_simplification_map = getSimplificationMap(fusion_); - for (auto extent_entry : extent_simplification_map) { - auto orig_extent = extent_entry.first; - auto simplified_extent = extent_entry.second; - if (kir_val_map_.count(orig_extent)) { - if (kir_val_map_.count(simplified_extent)) { - kir_val_map_[orig_extent] = kir_val_map_[simplified_extent]; - } else { - kir_val_map_[orig_extent] = lowerValue(simplified_extent); - } - } - } -} - void GpuLower::collectPaddedParallelDims() { ExpressionEvaluator ee(fusion_); bool can_be_single_warp = true; @@ -357,8 +143,11 @@ void GpuLower::collectPaddedParallelDims() { for (auto tv : ir_utils::filterByType(used_vals)) { for (auto id : tv->domain()->domain()) { if (tv->definition()) { + // TODO: Support GroupedReductionOp if (auto reduction = dynamic_cast(tv->definition())) { - if (ir_utils::getMaybeWarpReductionDim(reduction).has_value()) { + if (ir_utils::getMaybeWarpReductionDim( + reduction->out(), reduction->in()) + .has_value()) { warp_pad_info_.has_warp_reduction = true; } } @@ -398,14 +187,12 @@ void GpuLower::collectPaddedParallelDims() { } } -void GpuLower::lower() { +void GpuLower::lower(Fusion* fusion, DataType index_type) { FUSER_PERF_SCOPE("GpuLower::lower"); - - TORCH_INTERNAL_ASSERT(fusion_ != nullptr); + TORCH_INTERNAL_ASSERT(fusion != nullptr); TORCH_INTERNAL_ASSERT( active_gpu_lower == nullptr, "Nested lowering passes are not supported"); - // TODO(kir): revisit this struct LowerGuard { LowerGuard(GpuLower* gpu_lower) { active_gpu_lower = gpu_lower; @@ -414,116 +201,158 @@ void GpuLower::lower() { active_gpu_lower = nullptr; } } lower_guard(this); + // Copy fusion into a new kernel for processing + kernel_ = std::make_unique(fusion, index_type); + // Alias the fusion kernel caries around as a view of itself. + fusion_ = kernel_.get(); + + // Convert tensor views of DataType::Index type to either Int or Int32 + for (auto tv : ir_utils::allTvs(fusion_)) { + if (tv->dtype() == DataType::Index) { + tv->resolveIndexDtype(); + } + } FusionGuard fg(fusion_); - - // Start with a fresh kernel - kernel_ = std::make_unique(); - // prepare for lowering validateIr(fusion_); - replaceSymbolicSizes(); + + // Checks if any TIDx dim is marked as padded to a warp. Also checks if we can + // determine the padding is explicitly a single warp. collectPaddedParallelDims(); - trivial_reduction_info_.build(fusion_, this); - // In the future we may directly use this map, but for now it will propagate - // and validate (to some extent) the parallelization strategy. 
- // This is the first time nodes will be lowered to kir nodes. Since for now we - // propagate the parallel strategy in some instances, we need to do it before - // lowering. - ca_parallel_map_ = ComputeAtMap(ComputeAtMap::MappingMode::PARALLEL); - ca_parallel_map_.build(fusion_, current()); + // Replaces integers that are tensor sizes by named scalars as "T0.size[0]" + replaceSymbolicSizes(fusion_); + + // Traverse through reductions and termine if any iteration domains are + // trivial reductions. Add these iteration domains to trivial_reduction_info_ + // which simply holds a map of which axes are trivial and which are not. + trivial_reduction_info_.build(fusion_); + // Replaces trivial reduction expressions (all id's being reduced are trivial) + // with set unary op + trivialReductionReplacement(fusion_, trivial_reduction_info_); + + // Build what's refered to as the compute at map. This map contains the + // mappings of all iteration domains across the fusion. There are three types + // of mappings Permissive, Exact, and Loop, see compute_at_map.h/cpp for more + // information. + compute_at_map_ = std::make_unique(fusion_); - // Want to run this after parallel map is created - validateVectorize(fusion_); + if (isDebugDumpEnabled(DebugDumpOption::ComputeAtMap)) { + std::cout << compute_at_map_->toString() << std::endl; + } - // Generate mappings to generate indices - ca_index_map_ = ComputeAtMap(ComputeAtMap::MappingMode::INDEX); - ca_index_map_.build(fusion_, current()); + compute_at_map_->validateAndPropagatePType(); - // Generate mappings to generate and map to loop nests - ca_loop_map_ = ComputeAtMap(ComputeAtMap::MappingMode::LOOP); - ca_loop_map_.build(fusion_, current()); + // Used in parallel dimension map + concretized_broadcast_domains_.build(fusion_); parallelDimensionMap().build(fusion_); if (isDebugDumpEnabled(DebugDumpOption::ParallelDimensions)) { - std::cout << parallelDimensionMap().toString(); + std::cout << "Parallel dimension map:" << std::endl; + std::cout << parallel_dimension_map_.toString() << std::endl; } + // Validate mma data format and compatibility if any on the fusion. + validateMma(fusion_); + // Compute thread predicates. Depends on parallel_dimension_map_ thread_pred_map_.build(fusion_); - // Depends on thread_pred_map_ - validateParallelize(fusion_); + // Fuse cetain patterns of reductions, such as a grid reduction + // followed by a grid broadcast. Only depends on parallelization and + // thread predicate map. + fuseReductionsAndBroadcasts(fusion_); // Scan the whole fusion and build mappings about halo extensions of // all IterDomains haloInfo().build(fusion_); + // Want to run this after parallel map and halo info map are + // created. vectorized_accesses_ and vectorized_set_info_ are filled. + validateAndCollectVectorizeInfo(fusion_); + + // Depends on thread_pred_map_, validates parallelization collects which + // tensor views need WAR or RAW syncs + sync_map_.build(fusion_); + partialSplitMap().build(fusion_); validatePartialSplit(fusion_); - // Detects all exprssions that don't need predicates - predicateElimination().build(fusion_); - nonDivisibleSplitInfo().build(fusion_); - // Set the kernel inputs & outputs - for (auto input : fusion_->inputs()) { - kernel_->addInput(GpuLower::lowerValue(input)); - } + // Detects all exprssions that don't need predicates. Depends on + // nonDivisibleSplitInfo. 
+ predicateElimination().build(fusion_); - for (auto output : fusion_->outputs()) { - kernel_->addOutput(GpuLower::lowerValue(output)); - } + doubleBufferInfo().build(fusion_); // Run our passes keeping the lowered expressions and forwarding // them // Reorder expressions for loop-nest generation respecting computeAt // relationships - auto sorted_exprs = reorderExprsForComputeAt(); + const auto exprs_sorted = reorderExprsForComputeAt(); // Generate loop-nests and place each expression at its // corresponding loop - const auto lowered_exprs = LoopNestGenerator::loweredExprs(sorted_exprs); + const auto exprs_lowered = LoopNestGenerator::loweredExprs(exprs_sorted); + + // Replace trivial reductions, Transpose, Shift, Gather, and View ops with + // unary ops since they're not separately processed in lowering. + const auto exprs_unary_replaced = unarySetOpInserter(exprs_lowered); // Insert allocations - const auto alloced_exprs = insertAllocations(lowered_exprs); + const auto exprs_alloced = insertAllocations(exprs_unary_replaced); // Insert read after write smem syncs - const auto raw_sync_exprs = insertRawThreadSynchronization(alloced_exprs); + const auto exprs_raw_sync = insertRawThreadSynchronization(exprs_alloced); // Reuse memory locations - const auto reuse_mem_exprs = reuseMemoryAllocations(raw_sync_exprs); + const auto exprs_reuse_mem = reuseMemoryAllocations(exprs_raw_sync); - // Inserts predicates after this, need to be careful in later passes when - // inserting in loop nest structure as insertions could be on if then else - // instead of directly on a for loop - const auto unrolled_loops = UnrollPass::runPass(fusion_, reuse_mem_exprs); + // Insert SyncThreads at end of for-loop to avoid WAR race condition + const auto exprs_war_sync = insertWarThreadSynchronization(exprs_reuse_mem); - const auto unrolled_mv_loops = - processMisalignedVectorization(fusion_, unrolled_loops); + const auto exprs_double_buffered = DoubleBufferPass::run(exprs_war_sync); - // Insert SyncThreads at end of for-loop to avoid WAR race condition - const auto war_sync_exprs = insertWarThreadSynchronization(unrolled_mv_loops); + // This pass inserts predicates as well as branches in the code. Up until now + // the code is explicitly single shot for loop based. Need to be careful in + // later passes when doing any kind of insertions in loop nest structure as + // insertions could be on if then or else instead of directly on a for loop. + const auto exprs_unrolled_loops = + UnrollPass::runPass(fusion_, exprs_double_buffered); - const auto indexed_loops = IndexLowering::getIndexedExprs(war_sync_exprs); + const auto exprs_unrolled_mv_loops = + processMisalignedVectorization(exprs_unrolled_loops); - const auto exprs_with_fused_broadcast = fuseWarpReduce(indexed_loops); + const auto exprs_indexed_loops = + IndexLowering::getIndexedExprs(exprs_unrolled_mv_loops); - const auto conditional_loops = - generateConditionalFromPredicate(fusion_, exprs_with_fused_broadcast); + // TODO: It seems this type of optimization would be far easier to implement + // on fusion ir than kernel ir. We should likely refactor this to at least run + // before allocation insertion. 
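Each lowering step in this pipeline follows the same functional shape: it consumes the current std::vector<Expr*> and returns a new one, so the passes compose by straightforward chaining inside GpuLower::lower(). Below is a minimal sketch of how an additional pass would slot in; the pass name and body are hypothetical, and Expr is the IR expression type provided by this codebase's headers.

#include <vector>

// Hypothetical pass following the same signature as insertAllocations(),
// reuseMemoryAllocations(), insertMagicZero(), etc.
std::vector<Expr*> myExtraPass(const std::vector<Expr*>& exprs) {
  std::vector<Expr*> out;
  out.reserve(exprs.size());
  for (auto expr : exprs) {
    // Inspect or rewrite expr here; this sketch passes everything through.
    out.push_back(expr);
  }
  return out;
}

// In GpuLower::lower() it would chain like the existing passes, e.g.:
//   const auto exprs_extra = myExtraPass(exprs_register_adjusted);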
+ const auto exprs_with_fused_broadcast = fuseWarpReduce(exprs_indexed_loops); + + const auto exprs_conditional_loops = + generateConditionalFromPredicate(exprs_with_fused_broadcast); + + const auto exprs_common_index_allocated = + allocateCommonIndices(exprs_conditional_loops); // Insert fake zero updates to make sure nvrtc doesn't blow out register use // on index and predicate reuse - const auto register_adjusted = insertMagicZero(conditional_loops); + const auto exprs_register_adjusted = + insertMagicZero(exprs_common_index_allocated); - const auto cleaned_up_loops = KIRCleaner::cleanUp(register_adjusted); + const auto exprs_cleaned_up_loops = + KIRCleaner::cleanUp(exprs_register_adjusted); - // We now have the lowered expressions, finalize the kernel IR - kernel_->finalize(cleaned_up_loops); + // We now have the lowered expressions, finalize the kernel IR. This function + // will also copy over some relevant information for code generation from + // GpuLower. + kernel_->finalize(exprs_cleaned_up_loops); } kir::Kernel* GpuLower::kernel() const { @@ -531,214 +360,18 @@ kir::Kernel* GpuLower::kernel() const { return kernel_.get(); } -// Maps Fusion IR nodes to the Kernel IR counterparts -class GpuLower::KernelIrMapper : private OptInConstDispatch { - public: - explicit KernelIrMapper(GpuLower* gpu_lower) - : gpu_lower_(gpu_lower), ir_builder_(gpu_lower->kernel()) {} - - kir::Val* lowerValue(const Val* value) { - const auto it = gpu_lower_->kir_val_map_.find(value); - if (it != gpu_lower_->kir_val_map_.end()) { - return it->second; - } else { - handle(value); - const auto kir_value = gpu_lower_->kir_val_map_[value]; - TORCH_CHECK(kir_value != nullptr); - - // Lower the value definition, if any - if (value->isScalar()) { - if (auto def = value->definition()) { - const auto kir_def = lowerExpr(def); - TORCH_INTERNAL_ASSERT(kir_value->definition() == kir_def); - } - } - - return kir_value; - } - } - - kir::Expr* lowerExpr(const Expr* expr) { - const auto it = gpu_lower_->kir_expr_map_.find(expr); - if (it != gpu_lower_->kir_expr_map_.end()) { - return it->second; - } else { - handle(expr); - const auto lowered_node = gpu_lower_->kir_expr_map_[expr]; - TORCH_CHECK(lowered_node != nullptr); - return lowered_node; - } - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - } - - private: - void handle(const Statement* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const Val* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const Expr* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const TensorDomain* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const IterDomain* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const TensorView* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Bool* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Double* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Int* node) final { - const auto lowered_node = 
ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const NamedScalar* node) final { - const auto lowered_node = ir_builder_.create( - node->name(), node->getDataType().value()); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const UnaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getUnaryOpType(), - lowerValue(node->out()), - lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const BinaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getBinaryOpType(), - lowerValue(node->out()), - lowerValue(node->lhs()), - lowerValue(node->rhs())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const TernaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getTernaryOpType(), - lowerValue(node->out()), - lowerValue(node->in1()), - lowerValue(node->in2()), - lowerValue(node->in3())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ReductionOp* node) final { - auto out_tv = node->out()->as(); - // If trivial reduction operation lower to set operation. - if (std::all_of( - out_tv->domain()->domain().begin(), - out_tv->domain()->domain().end(), - [&](IterDomain* id) { - // If id is a reduction axis, is it a trivial reduction? - if (id->isReduction()) { - return gpu_lower_->trivialReductionInfo().isDerived(id); - } else { - return true; - } - })) { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK( - gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - return; - } - - const auto lowered_node = ir_builder_.create( - node->getReductionOpType(), - lowerValue(node->init()), - lowerValue(node->out()), - lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const WelfordOp* node) final { - auto lowerOptional = [&](Val* v) { return v ? 
lowerValue(v) : nullptr; }; - const auto lowered_node = ir_builder_.create( - lowerValue(node->outVar()), - lowerValue(node->outAvg()), - lowerValue(node->outN()), - lowerValue(node->initVar()), - lowerValue(node->initAvg()), - lowerValue(node->initN()), - lowerOptional(node->inVar()), - lowerValue(node->inAvg()), - lowerValue(node->inN())); - - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const BroadcastOp* node) final { - const auto lowered_node = ir_builder_.create( - lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const TransposeOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ShiftOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const GatherOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ViewOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - private: - GpuLower* gpu_lower_ = nullptr; - kir::IrBuilder ir_builder_; -}; - -kir::Val* GpuLower::lowerValue(const Val* val) { - KernelIrMapper kir_mapper(this); - return kir_mapper.lowerValue(val); +GpuLower* GpuLower::current() { + TORCH_INTERNAL_ASSERT( + active_gpu_lower != nullptr, "No active GpuLower available"); + return active_gpu_lower; } -kir::Expr* GpuLower::lowerExpr(const Expr* expr) { - KernelIrMapper kir_mapper(this); - return kir_mapper.lowerExpr(expr); +bool GpuLower::hasCurrent() { + return active_gpu_lower != nullptr; } -GpuLower* GpuLower::current() { - return active_gpu_lower; +void GpuLower::propagateExprInfo(const Expr* old_expr, const Expr* new_expr) { + pred_elimination_.propagateRemovalInfo(old_expr, new_expr); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index b807bb4d480a..686b5db1ebd5 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -1,23 +1,33 @@ #pragma once -#include +#include #include #include #include #include #include +#include +#include +#include #include +#include #include +#include +#include +#include #include #include #include #include #include #include +#include #include #include +#include +#include namespace torch { namespace jit { @@ -29,46 +39,48 @@ namespace cuda { // container for this information that we can reuse. Would be nice to generate // such a structure and propagate it through lowering. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API GpuLower { +class TORCH_CUDA_CU_API GpuLower : public NonCopyable { class KernelIrMapper; public: - GpuLower() = default; + GpuLower() = delete; + // GpuLower lowers the provided fusion into a kernel which can be translated + // into cuda code. 
index_type allows to compile the kernel based on int32 + // indexing instead of int64 for additional performance. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - explicit GpuLower(Fusion* fusion) : fusion_(fusion) { - lower(); + explicit GpuLower(Fusion* fusion, DataType index_type = DataType::Int) { + lower(fusion, index_type); } kir::Kernel* kernel() const; - //! Converts a Fusion IR value into the Kernel IR equivalent - kir::Val* lowerValue(const Val* val); + //! Returns the currently active lowering object. + //! It's an error if no lowering is in progress. + static GpuLower* current(); - //! Converts a Fusion IR expression into the Kernel IR equivalent - kir::Expr* lowerExpr(const Expr* expr); + //! Query if lowering is in progress + static bool hasCurrent(); - //! Returns the currently active lowering object - //! (or nullptr if no lowering is in progress) - static GpuLower* current(); + ConcretizedBroadcastDomains& concretizedBroadcastDomains() { + return concretized_broadcast_domains_; + } const ThreadPredicateMap& threadPredMap() const { return thread_pred_map_; } - const ComputeAtMap& caLoopMap() const { - return ca_loop_map_; - } - - const ComputeAtMap& caIndexMap() const { - return ca_index_map_; + // Returns non-const reference. Necessary to reset a predicate flag + // when a broadcast expression is fused into a reduction. + ThreadPredicateMap& threadPredMap() { + return thread_pred_map_; } - const ComputeAtMap& caParallelMap() const { - return ca_parallel_map_; + const std::unique_ptr& caMap() const { + return compute_at_map_; } - const auto& trivialReductionInfo() const { + const TrivialReductionInfo& trivialReductionInfo() const { return trivial_reduction_info_; } @@ -120,16 +132,53 @@ class TORCH_CUDA_CU_API GpuLower { return non_divisible_split_info_; } - private: - void lower(); + DoubleBufferInfo& doubleBufferInfo() { + return double_buffer_info_; + } + + CommonIndexMap& commonIndexMap() { + return common_index_map_; + } - // TensorViews are all based on symbolic sizes. When we first initialize them - // we don't know if they're inputs or outputs which would mean that they have - // runtime shapes. Intermediate tensors (those not going to global memory) do - // not have this information. Since we need to have the correct information in - // the kernel being fetched for shapes, we want to replace input and output - // tensors to reference the runtime structure containing sizes. - void replaceSymbolicSizes(); + const auto& vectorizedAccesses() const { + return vectorized_accesses_; + } + + auto& vectorizedAccesses() { + return vectorized_accesses_; + } + + const auto& vectorizedSetInfo() const { + return vectorized_set_info_; + } + + auto& vectorizedSetInfo() { + return vectorized_set_info_; + } + + FusedReductionInfo& fusedReductionInfo() { + return fused_reduction_info_; + } + + const SyncMap& syncMap() const { + return sync_map_; + } + + // This is an interface to propagate information after expression + // replacement on the kernel IR. E.g.: + // for ... + // c = a + b (expr 0) + // after any pass that does replacement: + // for ... + // c1 = a1 + b1 (expr1) + // The previous analysis that was performed on expr0 might still + // be valid on expr1 but that info would be lost after replacement. + // This function provides an interface to manually update the info + // in any pass that performs replacement. 
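To make the comment above concrete, here is a hedged sketch of the call site such a replacement pass would contain. The helper name finalizeReplacement is hypothetical; only GpuLower::current() and propagateExprInfo() come from this header.

// Hypothetical helper a replacement pass might invoke once it has built
// new_expr as a semantically equivalent rewrite of old_expr (e.g. via a
// kir::ExprMutator-based pass).
void finalizeReplacement(const Expr* old_expr, const Expr* new_expr) {
  // Keep GpuLower's cached analyses (currently the predicate-elimination
  // removal info) consistent with the rewritten kernel IR.
  GpuLower::current()->propagateExprInfo(old_expr, new_expr);
}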
+ void propagateExprInfo(const Expr* old_expr, const Expr* new_expr); + + private: + void lower(Fusion* fusion, DataType index_type); // Goes through the parallelized iterdomains of the used TVs and find // the parallel dimensions that need to be padded to a multiples of @@ -140,16 +189,15 @@ class TORCH_CUDA_CU_API GpuLower { // Lowered Kernel IR std::unique_ptr kernel_; - // Fusion IR node to Kernel IR node mapping - std::unordered_map kir_val_map_; - std::unordered_map kir_expr_map_; - // Some stateful information during lowering + // TODO: A lot of this information uses a define class then call build. It + // would be safer to wrap all of these in unique pointers and remove the build + // interface and default constructor. That way they couldn't be accessed + // without being initialized. + ConcretizedBroadcastDomains concretized_broadcast_domains_; ThreadPredicateMap thread_pred_map_; PredicateElimination pred_elimination_; - ComputeAtMap ca_loop_map_; - ComputeAtMap ca_index_map_; - ComputeAtMap ca_parallel_map_; + std::unique_ptr compute_at_map_; TrivialReductionInfo trivial_reduction_info_; HaloInfo halo_info_; LocalAllocationInfoMap local_allocation_info_map_; @@ -157,6 +205,17 @@ class TORCH_CUDA_CU_API GpuLower { ParallelDimensionMap parallel_dimension_map_; PartialSplitMap partial_split_map_; NonDivisibleSplitInfo non_divisible_split_info_; + DoubleBufferInfo double_buffer_info_; + CommonIndexMap common_index_map_; + FusedReductionInfo fused_reduction_info_; + SyncMap sync_map_; + + // Track which tensor views are inputs or outputs of a vectorized operation + // and their maximum vectorized access size + // std::unordered_map vectorized_accesses_; + std::unordered_map vectorized_accesses_; + // Info on each vectorized set op + std::vector vectorized_set_info_; Fusion* fusion_ = nullptr; }; diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp index 80e2e58c9cf2..ac1272c929af 100644 --- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp @@ -1,10 +1,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -22,40 +22,42 @@ namespace { //! Get string representation of Allocate size for symbolic comparison //! //! 
TODO: Some expr simplifications could also be helpful -class SymbolicSizePrinter : private kir::IrVisitor { +class SymbolicSizePrinter : private OptOutConstDispatch { public: static std::string printSize(const kir::Allocate* allocate) { SymbolicSizePrinter printer; - allocate->size()->accept(&printer); + printer.handle(allocate->size()); return printer.os_.str(); } private: - void visit(const kir::Int* node) final { + using OptOutConstDispatch::handle; + + void handle(const Int* node) final { if (auto def = node->definition()) { - def->accept(this); + OptOutConstDispatch::handle(def); } else if (node->isConst()) { os_ << *node->value(); } else { - os_ << "ki" << node->id(); + os_ << "ki" << node->name(); } } - void visit(const kir::NamedScalar* named_scalar) final { + void handle(const NamedScalar* named_scalar) final { os_ << "@" << named_scalar->name(); } - void visit(const kir::UnaryOp* unary_op) final { - os_ << unary_op->operation() << "("; - unary_op->in()->accept(this); + void handle(const UnaryOp* unary_op) final { + os_ << unary_op->getUnaryOpType() << "("; + OptOutConstDispatch::handle(unary_op); os_ << ")"; } - void visit(const kir::BinaryOp* binary_op) final { - os_ << binary_op->operation() << "("; - binary_op->lhs()->accept(this); + void handle(const BinaryOp* binary_op) final { + os_ << binary_op->getBinaryOpType() << "("; + OptOutConstDispatch::handle(binary_op->lhs()); os_ << ","; - binary_op->rhs()->accept(this); + OptOutConstDispatch::handle(binary_op->rhs()); os_ << ")"; } @@ -74,11 +76,11 @@ class BufferReuseDebugPrinter { DebugLineType line_type = DebugLineType::EXPR; }; - using DebugEntry = std::pair; + using DebugEntry = std::pair; using DebugEntryPtr = std::unique_ptr; public: - BufferReuseDebugPrinter() : ir_printer_(os_, false){}; + BufferReuseDebugPrinter() : ir_printer_(os_){}; std::string dumpDebugInfo() { os_.clear(); @@ -105,7 +107,7 @@ class BufferReuseDebugPrinter { private: friend class BufferUseDefInfo; - void pushBack(int lineno, kir::Expr* expr) { + void pushBack(int lineno, Expr* expr) { makeExprEntry(lineno, expr); } @@ -117,7 +119,7 @@ class BufferReuseDebugPrinter { makeScopeEntry(DebugLineType::END_BLOCK); } - void makeExprEntry(int lineno, kir::Expr* expr) { + void makeExprEntry(int lineno, Expr* expr) { auto debug_entry_ptr = std::make_unique(); debug_entry_ptr->first.lineno = lineno; debug_entry_ptr->second = expr; @@ -134,14 +136,14 @@ class BufferReuseDebugPrinter { debug_info_.emplace_back(std::move(debug_entry_ptr)); } - void handle(const kir::Expr* node) { + void handle(const Expr* node) { if (auto for_loop = dynamic_cast(node)) { handle(for_loop); } else if (auto ite = dynamic_cast(node)) { handle(ite); } else { indent(); - ir_printer_.printNode(node); + ir_printer_.handle(node); } if (auto alloc = dynamic_cast(node)) { printAllocInfo(alloc); @@ -151,9 +153,9 @@ class BufferReuseDebugPrinter { void handle(const kir::ForLoop* node) { indent(); os_ << "FOR "; - ir_printer_.printNode(node->index()); + ir_printer_.handle(node->index()); os_ << " in "; - ir_printer_.printNode(node->iter_domain()); + ir_printer_.handle(node->iter_domain()); os_ << ":\n"; } @@ -186,7 +188,7 @@ class BufferReuseDebugPrinter { private: std::stringstream os_; - kir::IrPrinter ir_printer_; + IrPrinter ir_printer_; int indent_level_ = 0; std::vector debug_info_; @@ -340,7 +342,7 @@ class BufferUseDefInfo { static constexpr long kRegisterSizeThreshold = 1; BufferUseDefInfo( - const std::vector& exprs, + const std::vector& exprs, BufferReuseDebugPrinter* debug_printer 
= nullptr) : debug_printer_(debug_printer) { if (debug_printer) { @@ -410,7 +412,7 @@ class BufferUseDefInfo { } private: - void handle(kir::Expr* expr) { + void handle(Expr* expr) { current_pos_++; if (debug_printer_) { debug_printer_->pushBack(current_pos_, expr); @@ -426,7 +428,7 @@ class BufferUseDefInfo { } } - void handleScope(const std::vector& exprs) { + void handleScope(const std::vector& exprs) { if (debug_printer_) { debug_printer_->pushScope(); } @@ -460,15 +462,15 @@ class BufferUseDefInfo { return; } - auto kir_tv = dynamic_cast(alloc->buffer()); - if (!kir_tv) { + auto tv = dynamic_cast(alloc->buffer()); + if (!tv) { return; } // Collect the allocate info data // Collect memory type, skip global buffers - auto mem_type = kir_tv->memoryType(); + auto mem_type = tv->getMemoryType(); if (mem_type != MemoryType::Local && mem_type != MemoryType::Shared) { return; } @@ -487,12 +489,12 @@ class BufferUseDefInfo { } } - auto data_type = kir_tv->dtype(); + auto data_type = tv->dtype(); auto size_print = SymbolicSizePrinter::printSize(alloc); // Make sure we don't have conflicting information on record TORCH_INTERNAL_ASSERT(!map_allocate_to_info_.count(alloc)); - TORCH_INTERNAL_ASSERT(!map_tv_to_allocations_.count(kir_tv->name())); + TORCH_INTERNAL_ASSERT(!map_tv_to_allocations_.count(tv->name())); // make AllocationUseDefInfo: auto alloc_info = makeUseDefInfo(); @@ -505,10 +507,10 @@ class BufferUseDefInfo { // record short cuts map_allocate_to_info_[alloc] = alloc_info; - map_tv_to_allocations_[kir_tv->name()] = alloc_info; + map_tv_to_allocations_[tv->name()] = alloc_info; } - void collectScopeUseDefInfo(const std::vector& exprs) { + void collectScopeUseDefInfo(const std::vector& exprs) { // Reset position pointer resetExprCounter(); TORCH_INTERNAL_ASSERT(global_scope_info_ != nullptr); @@ -516,14 +518,14 @@ class BufferUseDefInfo { handleScope(exprs); } - void collectScopeInfo(const std::vector& exprs) { + void collectScopeInfo(const std::vector& exprs) { // Reset position pointer resetExprCounter(); collectScopeInfoWithinLoop(exprs, nullptr); } void collectScopeInfoWithinLoop( - const std::vector& exprs, + const std::vector& exprs, kir::ForLoop* current_loop) { auto loop_info = makeScopeInfo(current_loop); for (auto expr : exprs) { @@ -584,22 +586,20 @@ class BufferUseDefInfo { // Iterate over the inputs and outputs of exprs and update // the liveness info of local buffers if applicaable. - void collectLivenessInfo(const kir::Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + void collectLivenessInfo(const Expr* expr) { + if (!ir_utils::isTvOp(expr)) { return; } - auto out_tv = expr->outputs()[0]->as(); - auto fuser_out_tv = out_tv->fuserTv(); + auto out_tv = expr->outputs()[0]->as(); // Collect all tv's that resolves broadcast in this // expr. The current analysis isn't enough to capture // their liveness range. 
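A short aside on why the line-number based inner liveness is not enough once a broadcast is resolved serially: the broadcast producer is re-read on every iteration of the serial loop, so its buffer must stay live for the whole loop rather than only up to its last textual use. A standalone C++ analogue follows (illustration only, with made-up names; it is neither the pass nor kernel IR).

#include <array>

constexpr int kN = 8;

float serialBroadcastResolution(const std::array<float, kN>& t2) {
  float t1 = 1.0f;               // broadcast-like value, written once up front
  float acc = 0.0f;
  for (int i = 0; i < kN; ++i) { // serial loop resolving the broadcast:
    acc += t1 + t2[i];           // t1 is re-read on every iteration, so its
  }                              // storage must remain live across the loop
  return acc;
}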
- for (auto input_tv : - ir_utils::filterByType(expr->inputs())) { + for (auto input_tv : ir_utils::filterByType(expr->inputs())) { auto maybe_alloc_info = getMaybeAllocInfoFromTV(input_tv); if (maybe_alloc_info.has_value()) { - if (isSerialBroadcastResolution(input_tv->fuserTv(), fuser_out_tv)) { + if (isSerialBroadcastResolution(input_tv, out_tv)) { maybe_alloc_info.value()->inner_live_interval->markRead(current_pos_); } else { // Disable inner alias info for this buffer, since line number based @@ -621,8 +621,7 @@ class BufferUseDefInfo { } } } - for (auto output_tv : - ir_utils::filterByType(expr->outputs())) { + for (auto output_tv : ir_utils::filterByType(expr->outputs())) { auto maybe_alloc_info = getMaybeAllocInfoFromTV(output_tv); if (maybe_alloc_info.has_value()) { maybe_alloc_info.value()->inner_live_interval->markWrite(current_pos_); @@ -675,8 +674,7 @@ class BufferUseDefInfo { return nullptr; } - c10::optional getMaybeAllocInfoFromTV( - kir::TensorView* tv) { + c10::optional getMaybeAllocInfoFromTV(TensorView* tv) { auto alloc_it = map_tv_to_allocations_.find(tv->name()); if (alloc_it == map_tv_to_allocations_.end()) { return c10::nullopt; @@ -810,11 +808,11 @@ void BufferReuseDebugPrinter::printAllocInfo(const kir::Allocate* alloc) { //! Reuse Allocation nodes via pointer aliasing class AllocateReuseModifier { public: - static void modify(const std::vector& exprs) { + static void modify(const std::vector& exprs) { AllocateReuseModifier modifier(exprs); } - static void debugPrint(const std::vector& exprs) { + static void debugPrint(const std::vector& exprs) { BufferReuseDebugPrinter debug_printer; AllocateReuseModifier modifier(exprs, &debug_printer); std::cout << debug_printer.dumpDebugInfo(); @@ -822,7 +820,7 @@ class AllocateReuseModifier { private: AllocateReuseModifier( - const std::vector& exprs, + const std::vector& exprs, BufferReuseDebugPrinter* debug_printer_ = nullptr) : buffer_info_(exprs, debug_printer_) { // Perform in-place sharing first and then outer liveness @@ -922,6 +920,31 @@ class AllocateReuseModifier { continue; } + if (alloc_info->alloc_expr->buffer()->isA()) { + if (!alloc_info->alloc_expr->buffer()->isA()) { + continue; + } + auto this_tv = alloc_info->alloc_expr->buffer()->as(); + auto reuse_tv = alloc_info->alloc_expr->buffer()->as(); + // Check that either both tv's are vectorized acceses, or neither are. 
+ // Vectorized allocations require correct alignment so they can only + // alias with other allocations with the right alignment + const auto& va = GpuLower::current()->vectorizedAccesses(); + if ((va.find(this_tv) == va.end()) != + (va.find(reuse_tv) == va.end())) { + return false; + } + + // Shared memory is all aligned to 128 bits, local memory might not be + if (this_tv->getMemoryType() == MemoryType::Local && + va.find(this_tv) != va.end()) { + // Make sure alignment matches + if (va.at(this_tv) != va.at(reuse_tv)) { + return false; + } + } + } + // TODO: // Outer interval based sharing supports arbitrary re-indexing into // the same buffer and would require additional syncs if fully @@ -941,7 +964,7 @@ class AllocateReuseModifier { return false; } - void handle(kir::Expr* expr) { + void handle(Expr* expr) { if (auto ite = dynamic_cast(expr)) { handle(ite); } else if (auto for_loop = dynamic_cast(expr)) { @@ -961,7 +984,7 @@ class AllocateReuseModifier { "lower_alias_memory: IfThenElse before unrolling is not yet supported"); } - void handleScope(const std::vector& exprs) { + void handleScope(const std::vector& exprs) { current_visible_buffer_stack_.emplace_back( std::make_unique()); for (auto expr : exprs) { @@ -990,10 +1013,8 @@ class AllocateReuseModifier { } // Assume inputs are TV allocations, which should have been checked // before reaching this point. - auto this_tv = - alloc_info->alloc_expr->buffer()->as()->fuserTv(); - auto reuse_tv = - to_reuse->alloc_expr->buffer()->as()->fuserTv(); + auto this_tv = alloc_info->alloc_expr->buffer()->as(); + auto reuse_tv = to_reuse->alloc_expr->buffer()->as(); // Check the values in between the two buffers. auto vals_between_this_and_reuse = @@ -1055,7 +1076,7 @@ class AllocateReuseModifier { if (!tv_def) { continue; } - if (!isPointwiseTvOp(tv_def) && !isReductionTvOp(tv_def)) { + if (!isPointwiseTvOp(tv_def) && !ir_utils::isReductionTvOp(tv_def)) { if (isBroadcastTvOp(tv_def)) { info.has_broadcast_between = true; } else { @@ -1068,8 +1089,8 @@ class AllocateReuseModifier { } bool allocationDomainsIndexMapped( - std::vector& alloc_domains, - std::vector& reuse_domains) { + std::vector& alloc_domains, + std::vector& reuse_domains) { // Require that the allocated domains are exactly mapped. if (alloc_domains.size() != reuse_domains.size()) { return false; @@ -1077,8 +1098,10 @@ class AllocateReuseModifier { // Check index map for the corresponding axes. for (const auto id_it : c10::irange(alloc_domains.size())) { - if (!GpuLower::current()->caIndexMap().areMapped( - alloc_domains[id_it], reuse_domains[id_it])) { + if (!GpuLower::current()->caMap()->areMapped( + alloc_domains[id_it], + reuse_domains[id_it], + IdMappingMode::EXACT)) { return false; } } @@ -1099,24 +1122,16 @@ class AllocateReuseModifier { // Do we have a true pointwise op? // (ie. 
a TV op, excluding direct assignments and reductions) bool isPointwiseTvOp(const Expr* expr) { - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { return expr->isA() || expr->isA() || expr->isA(); } return false; } - // Utility to capture reduction ops - bool isReductionTvOp(const Expr* expr) { - if (!ir_utils::isTVOp(expr)) { - return false; - } - return expr->isA() || expr->isA(); - } - // Utility to capture reduction ops bool isBroadcastTvOp(const Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } return expr->isA(); @@ -1138,8 +1153,7 @@ class AllocateReuseModifier { } // namespace -std::vector reuseMemoryAllocations( - const std::vector& exprs) { +std::vector reuseMemoryAllocations(const std::vector& exprs) { FUSER_PERF_SCOPE("reuseMemoryAllocations"); bool debug_print = isDebugDumpEnabled(DebugDumpOption::BufferReuseInfo); if (debug_print) { diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h b/torch/csrc/jit/codegen/cuda/lower_alias_memory.h index 26b33b6d5dc7..0d144b9f2f40 100644 --- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h +++ b/torch/csrc/jit/codegen/cuda/lower_alias_memory.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -28,8 +28,7 @@ namespace cuda { //! is not used after this op: //! then alias output Allocate to input Allocate. //! -std::vector reuseMemoryAllocations( - const std::vector& exprs); +std::vector reuseMemoryAllocations(const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp index 2f70c2758328..85d09e4ca080 100644 --- a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp @@ -1,10 +1,8 @@ -#include #include #include #include #include -#include -#include +#include #include #include @@ -17,8 +15,12 @@ namespace cuda { namespace { -class AllocationInserter : public kir::MutableIrVisitor { +class AllocationInserter : public kir::ExprMutator { private: + using kir::ExprMutator::handle; + + // Expanded version of BasicAllocInfo in lower_utils.h helps to track + // additional information struct AllocationInformation { // The for loop that the initialization of this allocation must be // placed in, nullptr if not within a loop @@ -26,7 +28,7 @@ class AllocationInserter : public kir::MutableIrVisitor { // The expression that the initialization of this allocation must // be placed before - kir::Expr* init_place_before = nullptr; + Expr* init_place_before = nullptr; // Keep track of the actual allocation loop. This can be different // from init_for_loop only with unswitched shared memory allocations, @@ -37,143 +39,93 @@ class AllocationInserter : public kir::MutableIrVisitor { // The expression that this allocation must be placed // before. Similar to alloc_for_loop, this is different from // init_place_before only with unswitched shared memory allocations. 
- kir::Expr* alloc_place_before = nullptr; + Expr* alloc_place_before = nullptr; // The allocation position relative to buffer size_t alloc_pos = 0; // The buffer this allocation is for - kir::TensorView* buffer = nullptr; - - // The allocation expression - kir::Allocate* alloc_expr = nullptr; - - // Initialization - kir::Expr* init_expr = nullptr; + TensorView* buffer = nullptr; // Info to transfer to GPU lower bool has_halo = false; // Local Iterdomains that this allocation covers - std::unique_ptr> allocation_domains; + std::unique_ptr> allocation_domains; }; // Find allocation point - void findAllocationPosition(AllocationInformation& info, kir::Expr* expr) { - size_t alloc_pos = 0; - kir::ForLoop* init_for_loop = nullptr; - auto fuser_tv = info.buffer->fuserTv(); - size_t fl_idx_next = 0; - - bool outer_alloc_found = false; - kir::ForLoop* alloc_for_loop = nullptr; - size_t alloc_fl_idx_next = 0; - - for (auto fl : for_loops) { - if (alloc_pos == fuser_tv->getComputeAtPosition()) { - break; - } - - if (fuser_tv->axis(alloc_pos)->isReduction()) { - const auto outputs = - FusionGuard::getCurFusion()->getTerminatingOutputs(); - TORCH_INTERNAL_ASSERT( - std::find(outputs.begin(), outputs.end(), fuser_tv) != - outputs.end(), - "Invalid computeAt of T", - fuser_tv->name(), - ". A reducation axis is detected within computeAt axes even though it is not an output tensor."); - break; - } - - auto fl_id = fl->iter_domain(); - - if (fl_id->parallelType() == ParallelType::Unroll) { - break; - } - - // Shared memory must be allocated outside of unswitched - // domains. See issue #1133. - if (fl_id->parallelType() == ParallelType::Unswitch && - fuser_tv->getMemoryType() == MemoryType::Shared) { - outer_alloc_found = true; - } - - auto local_id = gpu_lower->lowerValue(fuser_tv->axis(alloc_pos)) - ->as(); - - if (gpu_lower->caLoopMap().areMapped(local_id, fl_id)) { - alloc_pos++; - } - - init_for_loop = fl; - ++fl_idx_next; - - if (!outer_alloc_found) { - alloc_for_loop = fl; - ++alloc_fl_idx_next; + // Fills info.buffer, info.alloc_pos, info.init_for_loop, + // info.init_place_before, info.alloc_for_loop, info.alloc_place_before + void fillAllocationInformation(AllocationInformation& info, Expr* expr) { + auto loop_alloc_info = + loop_utils::getAllocInformation(info.buffer, for_loops_); + + info.init_for_loop = loop_alloc_info.init_for_loop; + info.alloc_for_loop = loop_alloc_info.alloc_for_loop; + info.alloc_pos = loop_alloc_info.alloc_pos; + + auto next_fl = [](kir::ForLoop* fl, const std::vector fls) { + for (auto i : c10::irange(fls.size())) { + if (fl == fls[i]) { + if (i + 1 < fls.size()) { + return fls[i + 1]; + } + } } - } - - info.alloc_pos = alloc_pos; - info.init_for_loop = init_for_loop; + TORCH_INTERNAL_ASSERT(false, "Could not find desired loop."); + }; if (info.init_for_loop == nullptr) { - info.init_place_before = for_loops.size() > 0 ? for_loops[0] : expr; + info.init_place_before = for_loops_.size() > 0 ? 
for_loops_[0] : expr; } else { - if (info.init_for_loop == for_loops.back()) { + if (info.init_for_loop == for_loops_.back()) { // Inline allocation, place before expr info.init_place_before = expr; } else { // Place allocation after the last computeAt axis // TODO: may be more efficient to place before the first non-computeAt // axis - info.init_place_before = for_loops.at(fl_idx_next); + info.init_place_before = next_fl(info.init_for_loop, for_loops_); } } // Set the allocation loop and the place_before expression in the // same way as the initialization loop and place_before expression - if (!outer_alloc_found) { + if (info.alloc_for_loop == info.init_for_loop) { info.alloc_for_loop = info.init_for_loop; info.alloc_place_before = info.init_place_before; } else { - info.alloc_for_loop = alloc_for_loop; if (info.alloc_for_loop == nullptr) { - info.alloc_place_before = for_loops.size() > 0 ? for_loops[0] : expr; + info.alloc_place_before = for_loops_.size() > 0 ? for_loops_[0] : expr; } else { // Since there must be an inner unswitched domain, // alloc_for_loop should never be the inner-most loop. - TORCH_INTERNAL_ASSERT(info.alloc_for_loop != for_loops.back()); - info.alloc_place_before = for_loops.at(alloc_fl_idx_next); + TORCH_INTERNAL_ASSERT(info.alloc_for_loop != for_loops_.back()); + info.alloc_place_before = next_fl(info.alloc_for_loop, for_loops_); } } } // Create initialization expression if init_val is non-null. - void createInitExpr(AllocationInformation& info, kir::Val* init_val) { + Expr* createInitExpr(AllocationInformation& info, Val* init_val) { if (init_val == nullptr) { - info.init_expr = nullptr; - return; + return nullptr; } - auto fuser_tv = info.buffer->fuserTv(); - - std::vector init_dims; - for (const auto axis_i : c10::irange(info.alloc_pos, fuser_tv->nDims())) { - if (info.buffer->fuserTv()->axis(axis_i)->isReduction() || - info.buffer->fuserTv()->axis(axis_i)->isBroadcast()) { + std::vector init_dims; + for (const auto axis_i : + c10::irange(info.alloc_pos, info.buffer->nDims())) { + if (info.buffer->axis(axis_i)->isReduction() || + info.buffer->axis(axis_i)->isBroadcast()) { continue; } - auto concrete_id = - gpu_lower - ->lowerValue(gpu_lower->caParallelMap().getConcreteMappedID( - fuser_tv->axis(axis_i))) - ->as(); + auto concrete_id = gpu_lower->caMap()->getConcreteMappedID( + info.buffer->axis(axis_i), IdMappingMode::LOOP); init_dims.push_back(concrete_id); } - kir::Expr* init_expr = ir_builder.create( - UnaryOpType::Set, info.buffer, init_val); + Expr* init_expr = + IrBuilder::create(UnaryOpType::Set, info.buffer, init_val); for (auto init_loop_it = init_dims.rbegin(); init_loop_it != init_dims.rend(); ++init_loop_it) { @@ -181,9 +133,9 @@ class AllocationInserter : public kir::MutableIrVisitor { kir::ForLoop* new_loop = nullptr; auto extent_with_halo = gpu_lower->haloInfo().getExtent(id); if (extent_with_halo) { - new_loop = ir_builder.create( + new_loop = IrBuilder::create( id, - ir_builder.create(c10::nullopt), + IrBuilder::create(c10::nullopt), nullptr, extent_with_halo, nullptr, @@ -191,31 +143,33 @@ class AllocationInserter : public kir::MutableIrVisitor { nullptr, false); } else { - new_loop = ir_builder.create(id); + new_loop = IrBuilder::create(id); } new_loop->body().push_back(init_expr); init_expr = new_loop; } - info.init_expr = init_expr; + return init_expr; } - std::vector getGlobalAllocationSizes(AllocationInformation& info) { + std::vector getGlobalAllocationSizes(AllocationInformation& info) { const auto& domain = info.buffer->domain(); - 
const auto& maybe_rfactor_domain = - domain->hasRFactor() ? domain->rfactorDomain() : domain->rootDomain(); + const auto& maybe_rfactor_domain = domain->hasRFactor() + ? domain->getRFactorDomain() + : domain->getRootDomain(); - std::vector alloc_dims; + std::vector alloc_dims; for (const auto id : maybe_rfactor_domain) { if (id->isReduction() || id->isStride() || - id->iterType() == IterType::BroadcastWithoutStride) { + id->getIterType() == IterType::BroadcastWithoutStride) { continue; } auto extent = id->extent(); // Use halo-extended extent if found auto halo_extent = gpu_lower->haloInfo().getRootAxisInfo(id); if (halo_extent.hasHalo()) { - extent = ir_builder.addExpr(extent, halo_extent.width()); + extent = IrBuilder::addExpr( + extent, IrBuilder::create(halo_extent.width())); } alloc_dims.push_back(extent); } @@ -244,7 +198,7 @@ class AllocationInserter : public kir::MutableIrVisitor { // fall back to the leaf-based allocation. // // See the FusionShiftDoubleSplit test for an example case. - std::vector getNonGlobalAllocExprWithHalo( + std::vector getNonGlobalAllocExprWithHalo( TensorView* tv, const std::vector& alloc_domains) { std::vector start_vals; @@ -255,18 +209,18 @@ class AllocationInserter : public kir::MutableIrVisitor { [](IterDomain* dom) { return dom->as(); }); // Get all exprs involved in generating the allocation IDs - auto exprs = ExprSort::getExprs(tv->fusion(), start_vals); + auto exprs = StmtSort::getExprs(tv->fusion(), start_vals); // Get the halo extent if found auto getExtent = [this](IterDomain* id) { auto extent = gpu_lower->haloInfo().getExtent(id); if (extent == nullptr) { - extent = gpu_lower->lowerValue(id->extent()); + extent = id->extent(); } return extent; }; - std::unordered_map known_extents; + std::unordered_map known_extents; // IterDomains that are allocated fully. 
For example, if an ID is // split and only one of them is used for allocation, that's not @@ -314,7 +268,7 @@ class AllocationInserter : public kir::MutableIrVisitor { } else { known_extents.insert( {split->in(), - ir_builder.mulExpr(outer_it->second, inner_it->second)}); + IrBuilder::mulExpr(outer_it->second, inner_it->second)}); } known_extents.erase(inner_it); known_extents.erase(outer_it); @@ -330,7 +284,7 @@ class AllocationInserter : public kir::MutableIrVisitor { } } - std::vector alloc_dims; + std::vector alloc_dims; for (auto root_axis : tv->getRootDomain()) { auto it = known_extents.find(root_axis); @@ -355,24 +309,22 @@ class AllocationInserter : public kir::MutableIrVisitor { return alloc_dims; } - std::vector getNonGlobalAllocExpr(AllocationInformation& info) { - auto fuser_tv = info.buffer->fuserTv(); - const auto memory_type = info.buffer->memoryType(); + std::vector getNonGlobalAllocExpr(AllocationInformation& info) { + const auto memory_type = info.buffer->getMemoryType(); TORCH_INTERNAL_ASSERT( memory_type != MemoryType::Global, "Invalid memory type: ", memory_type); - std::vector alloc_dims; + std::vector alloc_dims; bool has_halo = false; std::vector alloc_domains; - info.allocation_domains = std::make_unique>(); + info.allocation_domains = std::make_unique>(); - for (const auto axis_i : c10::irange(fuser_tv->nDims())) { - const auto local_id = - gpu_lower->lowerValue(fuser_tv->axis(axis_i))->as(); + for (const auto axis_i : c10::irange(info.buffer->nDims())) { + const auto local_id = info.buffer->axis(axis_i); // Don't use reduction/stride/broadcast axis in the allocation // computation @@ -381,16 +333,14 @@ class AllocationInserter : public kir::MutableIrVisitor { continue; } - auto concrete_id = - gpu_lower - ->lowerValue(gpu_lower->caParallelMap().getConcreteMappedID( - fuser_tv->axis(axis_i))) - ->as(); + auto concrete_id = gpu_lower->caMap()->getConcreteMappedID( + info.buffer->axis(axis_i), IdMappingMode::LOOP); const bool is_block_dim = - isParallelTypeBlockDim(concrete_id->parallelType()); + isParallelTypeBlockDim(concrete_id->getParallelType()); const bool is_thread_dim = - isParallelTypeThreadDim(concrete_id->parallelType()); - const bool is_thread = isParallelTypeThread(concrete_id->parallelType()); + isParallelTypeThreadDim(concrete_id->getParallelType()); + const bool is_thread = + isParallelTypeThread(concrete_id->getParallelType()); if (axis_i < info.alloc_pos) { // Even when the axis is outside the allocation position, if the @@ -403,7 +353,7 @@ class AllocationInserter : public kir::MutableIrVisitor { (memory_type == MemoryType::Global && is_thread))) { continue; } - alloc_domains.push_back(fuser_tv->axis(axis_i)); + alloc_domains.push_back(info.buffer->axis(axis_i)); } else { if ( // If shared memory, don't use any IDs bound to a grid dimension @@ -413,12 +363,13 @@ class AllocationInserter : public kir::MutableIrVisitor { (memory_type == MemoryType::Local && is_thread)) { continue; } - alloc_domains.push_back(fuser_tv->axis(axis_i)); + alloc_domains.push_back(info.buffer->axis(axis_i)); } auto extent = concrete_id->extent(); - if (gpu_lower->haloInfo().getExtent(fuser_tv->axis(axis_i)) != nullptr) { + if (gpu_lower->haloInfo().getExtent(info.buffer->axis(axis_i)) != + nullptr) { has_halo = true; } @@ -430,20 +381,19 @@ class AllocationInserter : public kir::MutableIrVisitor { // the halo extents from leaf IDs to root IDs if (has_halo) { info.has_halo = true; - return getNonGlobalAllocExprWithHalo(fuser_tv, alloc_domains); + return 
getNonGlobalAllocExprWithHalo(info.buffer, alloc_domains); } return alloc_dims; } - void createAllocExpr(AllocationInformation& info, bool is_output) { + kir::Allocate* createAllocExpr(AllocationInformation& info, bool is_output) { if (is_output) { - info.alloc_expr = nullptr; - return; + return nullptr; } - std::vector alloc_dims; - const MemoryType memory_type = info.buffer->memoryType(); + std::vector alloc_dims; + const MemoryType memory_type = info.buffer->getMemoryType(); if (memory_type == MemoryType::Global) { alloc_dims = getGlobalAllocationSizes(info); @@ -453,60 +403,82 @@ class AllocationInserter : public kir::MutableIrVisitor { if (alloc_dims.size() == 0 && info.buffer->domain()->noReductions().size() != 0) { - alloc_dims.push_back(ir_builder.create(1)); + alloc_dims.push_back(info.buffer->container()->oneVal()); + } + + // Double the allocation size if double-buffered. Record the + // original size for indexing. + if (info.buffer->isDoubleBuffered()) { + Val* original_alloc_size = nullptr; + for (auto alloc_dim : alloc_dims) { + if (original_alloc_size == nullptr) { + original_alloc_size = alloc_dim; + } else { + original_alloc_size = + IrBuilder::mulExpr(original_alloc_size, alloc_dim); + } + } + GpuLower::current()->doubleBufferInfo().setOriginalAllocSize( + info.buffer, original_alloc_size); + alloc_dims.push_back(IrBuilder::create(2)); } // Create the allocation node - info.alloc_expr = ir_builder.create( - info.buffer, info.buffer->memoryType(), alloc_dims); + return IrBuilder::create( + info.buffer, info.buffer->getMemoryType(), alloc_dims); } - void handle(kir::Expr* expr) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { - expr->accept(this); + void handle(Expr* expr) override { + if (!ir_utils::isTvOp(expr) || expr->isA()) { + ExprMutator::handle(expr); return; } // // Found where the allocation needs to be inserted - for (auto out : expr->outputs()) { - if (!out->isA()) { + for (const auto i : c10::irange(expr->outputs().size())) { + auto out = expr->output(i); + if (!out->isA()) { continue; } - auto out_tv = out->as(); - auto default_val = - gpu_lower->predicateElimination().getInitValue(out_tv->fuserTv()); + auto out_tv = out->as(); + auto default_val = gpu_lower->predicateElimination().getInitValue(out_tv); - kir::Val* init = nullptr; - if (expr->isA() && out_tv->fuserTv()->hasReduction()) { + Val* init = nullptr; + if (expr->isA() && out_tv->hasReduction()) { TORCH_INTERNAL_ASSERT( default_val == nullptr, "Reduction should not have a default initialization value for predicate elimination."); - init = expr->as()->init(); - } else if (expr->isA()) { + init = expr->as()->init(); + } else if (expr->isA() && out_tv->hasReduction()) { + TORCH_INTERNAL_ASSERT( + default_val == nullptr, + "Reduction should not have a default initialization value for predicate elimination."); + init = expr->as()->initVal(i); + } else if (expr->isA()) { + init = expr->as()->init(); + } else if (expr->isA()) { TORCH_INTERNAL_ASSERT( default_val == nullptr, "Welford should not have a default initialization value for predicate elimination."); - const auto welford = expr->as(); - if (out->id() == welford->outVar()->id()) { - init = welford->initVar() == nullptr - ? ir_builder.create(0) - : welford->initVar(); - } else if (out->id() == welford->outAvg()->id()) { - init = welford->initAvg() == nullptr - ? ir_builder.create(0) - : welford->initAvg(); + const auto welford = expr->as(); + if (out->name() == welford->outVar()->name()) { + init = welford->initVar() == nullptr ? 
IrBuilder::create(0) + : welford->initVar(); + } else if (out->name() == welford->outAvg()->name()) { + init = welford->initAvg() == nullptr ? IrBuilder::create(0) + : welford->initAvg(); } else { TORCH_INTERNAL_ASSERT( - out->id() == welford->outN()->id(), "Unreachable"); + out->name() == welford->outN()->name(), "Unreachable"); init = welford->initN(); } } else if (default_val != nullptr) { init = default_val; } - const bool is_output = gpu_lower->kernel()->isOutput(out); + const bool is_output = out->isFusionOutput(); // Don't need to alloc outputs, and if we don't need to initialize we're // done. @@ -516,150 +488,91 @@ class AllocationInserter : public kir::MutableIrVisitor { AllocationInformation allocation; allocation.buffer = out_tv; - findAllocationPosition(allocation, expr); - createAllocExpr(allocation, is_output); - createInitExpr(allocation, init); + fillAllocationInformation(allocation, expr); + + auto alloc_expr = createAllocExpr(allocation, is_output); + auto init_expr = createInitExpr(allocation, init); // Write information to GPULower - writeInfoToGPULower(allocation); + writeInfoToGPULower(allocation, alloc_expr); + + // Register allocations before initializations to keep them in the right + // order + if (alloc_expr != nullptr) { + if (allocation.buffer->getMemoryType() == MemoryType::Shared) { + // Shared allocations go at the begining of scope + TORCH_INTERNAL_ASSERT(!exprs_.empty()); + registerInsertBefore(exprs_[0], alloc_expr, nullptr); + } else { + TORCH_INTERNAL_ASSERT(allocation.alloc_place_before != nullptr); + kir::Scope* scope = allocation.alloc_for_loop == nullptr + ? nullptr + : &allocation.alloc_for_loop->body(); + registerInsertBefore( + allocation.alloc_place_before, alloc_expr, scope); + } + } - allocs.push_back(std::move(allocation)); + if (init_expr != nullptr) { + TORCH_INTERNAL_ASSERT(allocation.init_place_before != nullptr); + kir::Scope* scope = allocation.init_for_loop == nullptr + ? nullptr + : &allocation.init_for_loop->body(); + registerInsertBefore(allocation.init_place_before, init_expr, scope); + } } } - void writeInfoToGPULower(const AllocationInformation& allocation) { + // Sends alloc_expr, info.has_halo, info.allocation_domains to GpuLower + void writeInfoToGPULower( + const AllocationInformation& allocation, + kir::Allocate* alloc_expr) { auto& lower_alloc_info_map = GpuLower::current()->localAllocationInfoMap(); - if (allocation.alloc_expr == nullptr) { + if (alloc_expr == nullptr) { // Skip output allocation. 
return; } TORCH_INTERNAL_ASSERT( - !lower_alloc_info_map.count(allocation.alloc_expr), + !lower_alloc_info_map.count(alloc_expr), "duplicated allocation info entry"); // Create info entry for GPULower auto lower_alloc_info_ptr = std::make_unique(); - lower_alloc_info_ptr->alloc_expr = allocation.alloc_expr; + lower_alloc_info_ptr->alloc_expr = alloc_expr; lower_alloc_info_ptr->has_halo = allocation.has_halo; if (allocation.allocation_domains) { lower_alloc_info_ptr->alloc_domains = *(allocation.allocation_domains); } // Write entry to the stored map - lower_alloc_info_map[allocation.alloc_expr] = - std::move(lower_alloc_info_ptr); - } - - void visit(kir::ForLoop* fl) final { - for_loops.push_back(fl); - // Modifying in place, make a copy of the vector - const std::vector exprs = fl->body().exprs(); - for (auto expr : exprs) { - handle(expr); - } - for_loops.pop_back(); + lower_alloc_info_map[alloc_expr] = std::move(lower_alloc_info_ptr); } - void visit(kir::IfThenElse*) final { + void handle(kir::IfThenElse*) final { TORCH_INTERNAL_ASSERT( false, "Pass does not support conditional statements, ", "this pass should be run before any conditionals are placed in code."); } - AllocationInserter(std::vector _loop_nests) - : loop_nests_(std::move(_loop_nests)), - gpu_lower(GpuLower::current()), - ir_builder(gpu_lower->kernel()) { - // Compute all allocations - const std::vector exprs = loop_nests_; - for (auto expr : exprs) { - handle(expr); - } - - // First, place allocations of dynamic smem tensors at the very - // beginning of the expr list. Traverse backward as they should be - // placed in topological order. - for (auto it = allocs.rbegin(); it != allocs.rend(); ++it) { - const auto& alloc = *it; - if (alloc.alloc_expr == nullptr) { - continue; - } - // Dynamic smem exprs need to be at the begining of the kernel outside for - // loops - if (alloc.buffer->memoryType() == MemoryType::Shared && - !kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) { - loop_nests_.insert(loop_nests_.begin(), alloc.alloc_expr); - } - } - - // Place the remaining allocations. - for (const auto& alloc : allocs) { - if (alloc.alloc_expr == nullptr) { - continue; - } - if (alloc.buffer->memoryType() == MemoryType::Shared && - !kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) { - continue; - } - if (alloc.alloc_for_loop == nullptr) { - auto place_before_it = std::find( - loop_nests_.begin(), loop_nests_.end(), alloc.alloc_place_before); - TORCH_INTERNAL_ASSERT( - place_before_it != loop_nests_.end(), - "Could not figure out where to place allocation. 
", - "Use of the buffer, ", - toString(alloc.buffer), - ", could not be found.", - toString(alloc.alloc_place_before)); - loop_nests_.insert(place_before_it, alloc.alloc_expr); - } else { - alloc.alloc_for_loop->body().insert_before( - alloc.alloc_place_before, alloc.alloc_expr); - } - } - - // Now that allocations are in place, place the initializations - for (const auto& alloc : allocs) { - if (alloc.init_expr == nullptr) { - continue; - } - if (alloc.init_for_loop == nullptr) { - auto place_before_it = std::find( - loop_nests_.begin(), loop_nests_.end(), alloc.init_place_before); - // Don't need a check here as if the allocation placement succeeded - // this will too - loop_nests_.insert(place_before_it, alloc.init_expr); - } else { - alloc.init_for_loop->body().insert_before( - alloc.init_place_before, alloc.init_expr); - } - } + AllocationInserter(const std::vector& exprs) + : gpu_lower(GpuLower::current()) { + kir::ExprMutator::traverseAndInsert(exprs); } private: - std::deque allocs; - - std::vector for_loops; - - std::vector loop_nests_; - GpuLower* gpu_lower; - kir::IrBuilder ir_builder; - public: - static std::vector insert( - const std::vector& loop_nests) { - AllocationInserter inserter(loop_nests); - return inserter.loop_nests_; + static std::vector insert(const std::vector& exprs) { + AllocationInserter inserter(exprs); + return inserter.exprs_; } }; } // namespace -std::vector insertAllocations( - const std::vector& exprs) { +std::vector insertAllocations(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertAllocations"); return AllocationInserter::insert(exprs); } diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.h b/torch/csrc/jit/codegen/cuda/lower_allocation.h index bc0344ca19f6..45ebeac03f77 100644 --- a/torch/csrc/jit/codegen/cuda/lower_allocation.h +++ b/torch/csrc/jit/codegen/cuda/lower_allocation.h @@ -1,8 +1,7 @@ #pragma once -#include +#include -#include #include #include @@ -17,7 +16,7 @@ namespace cuda { //! logic duplication struct LocalAllocationInfo { kir::Allocate* alloc_expr = nullptr; - std::vector alloc_domains; + std::vector alloc_domains; bool has_halo = false; }; @@ -25,7 +24,7 @@ using LocalAllocationInfoMap = std::unordered_map>; //! Insert buffer allocations -std::vector insertAllocations(const std::vector& exprs); +std::vector insertAllocations(const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp b/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp new file mode 100644 index 000000000000..b069f4cc8ebc --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp @@ -0,0 +1,506 @@ +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +unsigned int getDoubleBufferAxisPosition(const TensorView* tv) { + // Double-buffering prefetches the next subregion of the tensor by + // doubling the allocation. The subregion is defined by the axes + // at the CA position till the inner-most position. There must be + // at least one axis that is outside (left) of the CA position, + // which defines the loop where prefetching is applied. Therefore, + // the CA position must be larger than 0. 
+ + TORCH_INTERNAL_ASSERT(tv->getComputeAtPosition() > 0); + + // Unroll must not exist outside of double-buffer axis + auto first_unroll_it = std::find_if( + tv->domain()->domain().begin(), + tv->domain()->domain().end(), + [](const auto axis) { + return axis->getParallelType() == ParallelType::Unroll; + }); + + const int first_unroll_pos = + std::distance(tv->domain()->domain().begin(), first_unroll_it); + + const int unroll_or_ca_pos = + std::min((int)tv->getComputeAtPosition(), first_unroll_pos); + + TORCH_INTERNAL_ASSERT( + unroll_or_ca_pos > 0, + "Invalid tensor to double-buffer. Valid double buffer axis not found due to Unroll. ", + tv->toString()); + + int valid_pos = -1; + // Skip parallelized or broadcast axes + for (int i = unroll_or_ca_pos - 1; i >= 0; --i) { + auto pt = tv->axis(i)->getParallelType(); + if (!isParallelTypeThread(pt) && !tv->axis(i)->isBroadcast()) { + valid_pos = i; + break; + } + } + + TORCH_INTERNAL_ASSERT( + valid_pos >= 0, + "Invalid tensor to double-buffer. Valid double buffer axis not found. ", + tv->toString()); + + return valid_pos; +} + +IterDomain* getDoubleBufferAxis(const TensorView* tv) { + return tv->axis((int)getDoubleBufferAxisPosition(tv)); +} + +void validateDoubleBufferedTensor(const TensorView* tv) { + auto double_buffer_pos = getDoubleBufferAxisPosition(tv); + + // Like vectorization, only UnaryOp::Set with another TensorView is + // considered. + auto def = tv->definition(); + TORCH_INTERNAL_ASSERT( + def->isA() && + def->as()->getUnaryOpType() == UnaryOpType::Set, + "Invalid tensor to double-buffer. Only tensor defined by UnaryOp::Set is supported: ", + def->toString()); + + TORCH_INTERNAL_ASSERT( + def->as()->in()->isA(), + "Invalid tensor to double-buffer. Only tensor defined by UnaryOp::Set with TensorView is supported: ", + def->toString()); + + // Require the producer tensor to have been computed entirely for + // the double-buffering loop. Otherwise, the producer itself would + // also need to be double-buffered. + auto producer = def->as()->in()->as(); + TORCH_INTERNAL_ASSERT( + producer->getComputeAtPosition() <= double_buffer_pos, + "Invalid tensor to double-buffer. The computeAt position of the producer tensor must be moved left: ", + producer->toString()); + + // Not strictly necessary, but only gmem -> smem or local and smem -> local + // are allowed. + const auto p_mem_type = producer->getMemoryType(); + const auto c_mem_type = tv->getMemoryType(); + TORCH_INTERNAL_ASSERT( + (p_mem_type == MemoryType::Global && + (c_mem_type == MemoryType::Shared || c_mem_type == MemoryType::Local)) || + (p_mem_type == MemoryType::Shared && c_mem_type == MemoryType::Local), + "Invalid tensor to double-buffer: ", + tv->toString(), + ". Producer memory type: ", + p_mem_type, + ". 
Consumer memory type: ", + c_mem_type); + + return; +} + +namespace { + +// Initial inspection of a fusion to find and validate double buffered tensors +class DoubleBufferFusionInspector : private IterVisitor { + public: + DoubleBufferFusionInspector(Fusion* fusion, DoubleBufferInfo& db_info) + : db_info_(db_info) { + traverse(fusion); + } + + private: + using IterVisitor::handle; + + void handle(TensorView* tv) final { + if (!tv->isDoubleBuffered()) { + return; + } + + validateDoubleBufferedTensor(tv); + + auto db_axis = getDoubleBufferAxis(tv); + + db_info_.setDoubleBufferAxis(tv, db_axis); + } + + private: + DoubleBufferInfo& db_info_; +}; + +// The type of replicated double-buffer loops +enum class LoopType { Prologue, Main, Epilogue }; + +// The epilogue loop is only created when the producer of a double +// buffer tensor is on smem, in which case it would otherwise require +// an additional predicate to guard buffer overruns. When it's on +// gmem, that isn't the case, so it does not need to create an +// epilogue loop. +bool requireEpilogue(const std::vector& exprs) { + return std::any_of(exprs.begin(), exprs.end(), [](const UnaryOp* uop) { + return uop->in()->as()->getMemoryType() == MemoryType::Shared; + }); +} + +// Replicates double buffer loops for Prologue, Main, and +// Epilogue. Prologue only copies the load expressions of double +// buffered tensors, whereas Epilogue does any expression other than +// the loads. Main copies everything. +class DoubleBufferLoopCloner : public kir::IrVisitor { + public: + static kir::ForLoop* clone( + kir::ForLoop* double_buffer_loop, + const std::vector& double_buffer_load_exprs, + LoopType loop_type) { + DoubleBufferLoopCloner cloner( + double_buffer_loop, double_buffer_load_exprs, loop_type); + cloner.clone(); + return cloner.cloned_top_level_loop_; + } + + private: + DoubleBufferLoopCloner( + kir::ForLoop* double_buffer_loop, + const std::vector& double_buffer_load_exprs, + LoopType loop_type) + : double_buffer_loop_(double_buffer_loop), + double_buffer_load_exprs_(double_buffer_load_exprs), + loop_type_(loop_type) {} + + using kir::IrVisitor::handle; + + void clone() { + const auto gpu_lower = GpuLower::current(); + + // Cloning the double buffer loop as follows: + // + // Prologue: 0 to 1 + // Main: 0 to (extent-1) + // Epilogue: (extent-1) to extent + + auto index = IrBuilder::create(c10::nullopt); + auto start = double_buffer_loop_->start(); + auto stop = double_buffer_loop_->stop(); + + if (loop_type_ == LoopType::Prologue) { + TORCH_INTERNAL_ASSERT(start->isZeroInt()); + stop = gpu_lower->kernel()->oneVal(); + } else if ( + loop_type_ == LoopType::Main && + requireEpilogue(double_buffer_load_exprs_)) { + stop = IrBuilder::subExpr( + double_buffer_loop_->stop(), gpu_lower->kernel()->oneVal()); + } else if (loop_type_ == LoopType::Epilogue) { + TORCH_INTERNAL_ASSERT(requireEpilogue(double_buffer_load_exprs_)); + start = IrBuilder::subExpr( + double_buffer_loop_->stop(), gpu_lower->kernel()->oneVal()); + } + + cloned_top_level_loop_ = IrBuilder::create( + double_buffer_loop_->iter_domain(), + index, + start, + stop, + gpu_lower->kernel()->oneVal(), + false, + nullptr, + double_buffer_loop_->isUnrollRequired()); + + handle(double_buffer_loop_); + } + + void handle(kir::ForLoop* fl) final { + kir::ForLoop* cloned_loop = fl == double_buffer_loop_ + ? 
cloned_top_level_loop_ + : IrBuilder::create(fl); + + cloned_scopes_.push_back(&cloned_loop->body()); + + kir::IrVisitor::handle(fl); + + cloned_scopes_.pop_back(); + + // Add the cloned loop into the parent loop body only when the + // cloned loop contains expressions. + if (!cloned_loop->body().empty() && !cloned_scopes_.empty()) { + cloned_scopes_.back()->push_back(cloned_loop); + } + } + + void handle(kir::IfThenElse* ite) final { + TORCH_INTERNAL_ASSERT(false, "No IfThenElse should exist yet"); + } + + void handle(Expr* expr) final { + if (expr->isA() || expr->isA()) { + kir::IrVisitor::handle(expr); + return; + } + + TORCH_INTERNAL_ASSERT(!cloned_scopes_.empty()); + + if (loop_type_ == LoopType::Main) { + cloned_scopes_.back()->push_back(expr); + return; + } + + // In Prologue and Epilogue, either load expressions or anything + // else are copied. Note that there can be multiple exprs defining + // double buffered TVs (e.g., buffer initialization). + + auto out_tv = ir_utils::getTvOutput(expr); + const auto is_double_buffer_load_expr = std::any_of( + double_buffer_load_exprs_.begin(), + double_buffer_load_exprs_.end(), + [out_tv](const auto load_expr) { + auto double_buffer_tv = ir_utils::getTvOutput(load_expr); + TORCH_INTERNAL_ASSERT(double_buffer_tv != nullptr); + return out_tv == double_buffer_tv; + }); + if ((loop_type_ == LoopType::Prologue && is_double_buffer_load_expr) || + (loop_type_ == LoopType::Epilogue && !is_double_buffer_load_expr)) { + cloned_scopes_.back()->push_back(expr); + } + } + + private: + kir::ForLoop* double_buffer_loop_ = nullptr; + const std::vector& double_buffer_load_exprs_; + const LoopType loop_type_; + + kir::ForLoop* cloned_top_level_loop_ = nullptr; + std::deque cloned_scopes_; +}; + +using InsertionInfo = std::unordered_map>; + +// Traverse lowered loop-nests and find all double buffer loops and +// associated load expressions. 
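// Editorial sketch of the filtering rule implemented by
// DoubleBufferLoopCloner::handle(Expr*) above, collapsed into a single
// helper for readability. keepExprInClone is an invented name and is not
// part of this change; it reuses the LoopType enum defined earlier.
bool keepExprInClone(LoopType loop_type, bool is_double_buffer_load) {
  switch (loop_type) {
    case LoopType::Main:
      return true; // Main copies every expression
    case LoopType::Prologue:
      return is_double_buffer_load; // Prologue copies only the loads
    case LoopType::Epilogue:
      return !is_double_buffer_load; // Epilogue copies everything else
  }
  return false;
}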
+class DoubleBufferLoopNestInspector : private kir::IrVisitor { + public: + static InsertionInfo run(const std::vector& exprs) { + DoubleBufferLoopNestInspector inspector(exprs); + return inspector.insertion_info_; + } + + private: + DoubleBufferLoopNestInspector(const std::vector& exprs) { + handle(exprs); + } + + using kir::IrVisitor::handle; + + void handle(UnaryOp* uop) final { + const auto gpu_lower = GpuLower::current(); + + auto out_tv = ir_utils::getTvOutput(uop); + + if (out_tv == nullptr) { + return; + } + + // Ignore init loop + if (!out_tv->isDoubleBuffered() || !uop->in()->isA()) { + return; + } + + auto double_buffer_loop = + gpu_lower->doubleBufferInfo().getDoubleBufferLoop(out_tv, for_loops_); + + TORCH_INTERNAL_ASSERT( + double_buffer_loop != nullptr, + "No double buffer loop found for a double buffered tensor: ", + out_tv->toString()); + + validateDoubleBufferLoop(double_buffer_loop); + + insertion_info_[double_buffer_loop].push_back(uop); + } + + static void validateDoubleBufferLoop(kir::ForLoop* loop) { + TORCH_INTERNAL_ASSERT( + loop->start()->isZeroInt(), "Unsupported loop: ", loop->toString()); + TORCH_INTERNAL_ASSERT( + loop->step()->isOneInt(), "Unsupported loop: ", loop->toString()); + TORCH_INTERNAL_ASSERT( + !loop->vectorize(), + "Vectorized loop should not be the allocation loop for double-buffered tensor: ", + loop->toString()); + TORCH_INTERNAL_ASSERT( + !loop->vectorize_shift(), + "Vectorize shift loop should not be the allocation loop for double-buffered tensor: ", + loop->toString()); + } + + InsertionInfo insertion_info_; +}; + +// Apply double buffering transformations +class DoubleBufferInserter : private kir::ExprMutator { + public: + // When there exist multiple double buffer loops, apply + // transformations to inner-most loops first. A single ExprMutator + // pass can only process one loop. + static std::vector run( + const std::vector& exprs, + InsertionInfo insertion_info) { + auto inserted_exprs = exprs; + while (!insertion_info.empty()) { + DoubleBufferInserter inserter(inserted_exprs, insertion_info); + inserted_exprs = inserter.exprs_; + } + return inserted_exprs; + } + + private: + DoubleBufferInserter( + const std::vector& exprs, + InsertionInfo& insertion_info) + : insertion_info_(insertion_info) { + auto num_double_buffer_loops = insertion_info.size(); + traverseAndInsert(exprs); + TORCH_INTERNAL_ASSERT(processed_loop_ != nullptr); + TORCH_INTERNAL_ASSERT(insertion_info.size() == num_double_buffer_loops - 1); + } + + using kir::ExprMutator::handle; + + void handle(kir::ForLoop* loop) final { + kir::ExprMutator::handle(loop); + + // If another loop is already taken care of, no more loop should + // be done in the same pass + if (processed_loop_ != nullptr) { + return; + } + + auto it = insertion_info_.find(loop); + if (it == insertion_info_.end()) { + return; + } + + insert(loop, it->second); + processed_loop_ = loop; + insertion_info_.erase(loop); + } + + void insert( + kir::ForLoop* double_buffer_loop, + const std::vector& loads) { + auto prologue_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Prologue); + registerInsertBefore(double_buffer_loop, prologue_loop); + + auto write_to_smem = + std::any_of(loads.begin(), loads.end(), [](const UnaryOp* uop) { + return uop->out()->as()->getMemoryType() == + MemoryType::Shared; + }); + + // RAW sync is not inserted for double buffered tensors. The only + // exception is the prologue load. 
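// In other words: if any of the loads writes to shared memory, a sync is
// registered immediately before the loop being replaced, so the first
// iteration of the Main loop only reads shared-memory values that the
// Prologue has completely written. Within the Main and Epilogue loops
// themselves no RAW sync is emitted for the double-buffered tensors.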
+ if (write_to_smem) { + auto sync = IrBuilder::create(); + registerInsertBefore(double_buffer_loop, sync); + } + + auto main_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Main); + registerReplace(double_buffer_loop, main_loop); + + if (requireEpilogue(loads)) { + auto epilogue_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Epilogue); + registerInsertAfter(double_buffer_loop, epilogue_loop); + } + } + + private: + InsertionInfo& insertion_info_; + kir::ForLoop* processed_loop_ = nullptr; +}; + +} // namespace + +void DoubleBufferInfo::build(Fusion* fusion) { + DoubleBufferFusionInspector inspector(fusion, *this); +} + +DoubleBufferInfo::TvInfo& DoubleBufferInfo::getTvInfo(const TensorView* tv) { + TORCH_INTERNAL_ASSERT( + tv->isDoubleBuffered(), "Not a double-buffered tensor: ", tv->toString()); + return map_[tv]; +} + +void DoubleBufferInfo::setDoubleBufferAxis( + const TensorView* tv, + IterDomain* axis) { + getTvInfo(tv).double_buffer_axis = axis; +} + +IterDomain* DoubleBufferInfo::getDoubleBufferAxis(const TensorView* tv) { + if (!tv->isDoubleBuffered()) { + return nullptr; + } + + return getTvInfo(tv).double_buffer_axis; +} + +kir::ForLoop* DoubleBufferInfo::getDoubleBufferLoop( + IterDomain* axis, + const std::vector& loops, + bool ignore_prologue) { + auto loop_it = std::find_if(loops.begin(), loops.end(), [&](const auto loop) { + return GpuLower::current()->caMap()->areMapped( + loop->iter_domain(), axis, IdMappingMode::EXACT) && + (!ignore_prologue || !loop->stop()->isOneInt()); + }); + + if (loop_it != loops.end()) { + return *loop_it; + } else { + return nullptr; + } +} + +kir::ForLoop* DoubleBufferInfo::getDoubleBufferLoop( + const TensorView* tv, + const std::vector& loops, + bool ignore_prologue) { + auto axis = getDoubleBufferAxis(tv); + + if (axis == nullptr) { + return nullptr; + } + + return getDoubleBufferLoop(axis, loops, ignore_prologue); +} + +void DoubleBufferInfo::setOriginalAllocSize( + const TensorView* tv, + Val* original_alloc_size) { + getTvInfo(tv).original_alloc_size = original_alloc_size; +} + +Val* DoubleBufferInfo::getOriginalAllocSize(const TensorView* tv) { + if (!tv->isDoubleBuffered()) { + return nullptr; + } + + return getTvInfo(tv).original_alloc_size; +} + +std::vector DoubleBufferPass::run(const std::vector& exprs) { + auto insertion_info = DoubleBufferLoopNestInspector::run(exprs); + return DoubleBufferInserter::run(exprs, insertion_info); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.h b/torch/csrc/jit/codegen/cuda/lower_double_buffer.h new file mode 100644 index 000000000000..96bc247f4ff6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_double_buffer.h @@ -0,0 +1,142 @@ +#pragma once + +#include + +#include +#include +#include + +// Double buffering a tensor doubles its allocation size and uses two +// buffers to facilitate computation and memory access +// overlapping. The basic form of code looks like as follows: +// +// Before: +// for i +// x[S]; // allocation +// for j: +// x[j] = y[i, j] +// for j: +// ... = x[j] +// +// After: +// X[S * 2]; // allocation +// for i in 0 to 1: // Prologue +// for j: +// x[j] = y[i, j] +// +// for i in 0 to N-1: // Main +// for j: +// x[j + (1 - i % 2) * S] = y[i + 1, j] +// for j: +// ... = x[j + (i % 2) * S] +// +// for i in N-1 to N: // Epilogue +// for j: +// ... = x[j + (i % 2) * S] +// +// Here, S is the original size of tensor x. 
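// As a concrete walk-through of the indexing above (illustrative sizes
// only): with S = 4 and N = 3, the Prologue loads y[0, *] into the first
// half, x[0..3]. Main iteration i = 0 prefetches y[1, *] into the second
// half, x[4..7], while reading y[0, *] from x[0..3]; iteration i = 1
// prefetches y[2, *] into x[0..3] while reading y[1, *] from x[4..7]. The
// Epilogue iteration i = 2 then only reads x[0..3], which holds y[2, *].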
+// +// The i loop is the double buffer loop of tensor x, where double +// buffering is applied to the tensor. The first step of lowering is +// to find the double buffering axis for each double buffered +// tensor. It must not be parallelized as it isn't possible to double +// buffer parallelized loops. Also, an unrolled axis expands the +// allocation and is intended to make the loop completely unrolled, +// which also conflicts with double buffering. So, basically, the double +// buffering axis is the inner-most axis within the axes left +// of the CA position. However, when it is parallelized or unrolled, a +// further left axis is picked. +// +// Once the double buffer axis is determined, the main task is to +// replicate the corresponding double buffer loop as illustrated +// above. The Prologue loop is to just fetch the first element to +// populate the buffer. The main loop is mostly the same as the +// original loop, except for the indexing change to switch the two +// buffers. When used as a consumer, an offset of (1 - i % 2) * S is +// added, whereas (i % 2) * S is added when used as a producer. Here, +// i is the index of the double buffer loop. The Epilogue loop is just +// for the last iteration of the loop. Since the main loop reads one +// element ahead of the producer of the double buffered tensor, it +// would require an additional guard to prevent buffer overruns with +// the producer if the main loop were also used for the last +// iteration. However, the value loaded by the invalid load would not +// be used, so instead of adding the additional predicate, the Epilogue +// loop is replicated from the original loop, except for the load +// expression since it's not used. Note that this overrun does not +// happen when the producer is on gmem, so in that case, this +// additional replication is not done. +// +// When creating those three types of loops, additional care must be +// taken when multiple tensors are double buffered. When multiple +// tensors use the same loop as their double buffer loop, one pass of +// replication takes care of them at once, meaning the same Prologue, +// Main, Epilogue loops are used for the multiple tensors. +// +// Other tasks to do for a double buffer tensor include: +// - Move allocation to outside of the double buffer loop +// - Double the allocation size +// - Omit the RAW sync in the Main and Epilogue loops + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +unsigned int getDoubleBufferAxisPosition(const TensorView* tv); + +IterDomain* getDoubleBufferAxis(const TensorView* tv); + +void validateDoubleBufferedTensor(const TensorView* tv); + +class TORCH_CUDA_CU_API DoubleBufferPass { + public: + //! Apply double buffering transformations + static std::vector run(const std::vector& exprs); +}; + +class TORCH_CUDA_CU_API DoubleBufferInfo { + // Lowering information of double buffered tensors. + struct TvInfo { + IterDomain* double_buffer_axis = nullptr; + Val* original_alloc_size = nullptr; + }; + + public: + void build(Fusion* fusion); + + void setDoubleBufferAxis(const TensorView* tv, IterDomain* id); + + IterDomain* getDoubleBufferAxis(const TensorView* tv); + + //! Get a loop that matches with a given double-buffer axis. If + //! ignore_prologue is true, a matched loop is ignored if it's a + //! prologue loop. + static kir::ForLoop* getDoubleBufferLoop( + IterDomain* axis, + const std::vector& loops, + bool ignore_prologue = false); + + //! Get a loop that matches with the double-buffer axis of a given + //! 
double-buffered tensor. If ignore_prologue is true, a matched + //! loop is ignored if it's a prologue loop. + kir::ForLoop* getDoubleBufferLoop( + const TensorView* tv, + const std::vector& loops, + bool ignore_prologue = false); + + void setOriginalAllocSize(const TensorView* tv, Val* size); + + Val* getOriginalAllocSize(const TensorView* tv); + + private: + TvInfo& getTvInfo(const TensorView* tv); + + private: + //! Keeps track of information for lowering double buffered tensors + std::unordered_map map_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp index 2353ea9bbf50..281fa05bb2bd 100644 --- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp @@ -305,7 +305,7 @@ class ExprSegmentationSorter { std::deque to_visit_; - std::unordered_set to_merge_; + std::vector> to_merge_; Fusion* fusion_; @@ -541,7 +541,7 @@ ExprGroup* ExprSegmentationSorter::makeEmptyGroup() { ExprGroup* ExprSegmentationSorter::makeEmptyGroup(Expr* expr) { auto group = makeEmptyGroup(); group->exprs().push_back(expr); - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { auto out_tv = expr->outputs()[0]->as(); // Grab all id's that are shared with other tensors. for (const auto tv_i : c10::irange(out_tv->getComputeAtPosition())) { @@ -649,68 +649,8 @@ ExprGroup* getProducer(ExprGroup* sg1, ExprGroup* sg2) { return nullptr; } -// Go through all expressions and compute a local ordering of loops. operator< -// is implemented based on the concrete_id_dependencies analysis done. If -// there's no dependency between two IDs then order doesn't mater, otherwise we -// can tell which is inner most by checking if there's any dependency -// relationships. -// -// Dependency relationships in concrete_id_dependencies has a "global" view in -// the fusion, so it can resolve ordering by only looking at id's and the -// dependency map. -// -// For example two expressions may have domains: [I0], [I1] Yet we -// won't know the ordering unless we see a domain with: [I0, I1]. This happened -// in advancedIndexing9 (also see AdvancedLowering6) test when merging T5 with -// the group containing T10 (cache of T5, which is post broadcasted output) and -// T6(pre broadcasted output). -// T5 had the domain [0, 1, 2, 3, 4] produce at 3 -// T6 had the domain [0, 3, 4] compute at 3 -// Merging [0, 1, 2] and [0, 3, 4] resulted in the domain [0, 3, 4, 1, 2] -// -// If ID's are not in filter, we don't care about their ordering and ignore -// them. This is because we're only focused on loops we will have to merge -// across groups. If the domain is not in a produce at position in the producer -// edges, or a compute at position in the consumer edges, the expressions we -// look at may not have a unique ordering. 
- -struct LocalDomainSorter { - LocalDomainSorter( - const std::unordered_map>& - concrete_id_dependencies) - : concrete_id_dependencies_(concrete_id_dependencies) {} - - // Return if id0 should be before id1 - inline bool operator()(IterDomain* id0, IterDomain* id1) { - auto concrete_id_0 = - GpuLower::current()->caLoopMap().getConcreteMappedID(id0); - auto concrete_id_1 = - GpuLower::current()->caLoopMap().getConcreteMappedID(id1); - - if (concrete_id_dependencies_.find(concrete_id_0) != - concrete_id_dependencies_.end()) { - const auto& dependencies_0 = concrete_id_dependencies_.at(concrete_id_0); - // if id0 depends on id1 it means id1 is inside id0, so id0 < id1 - return dependencies_0.count(concrete_id_1); - } - - if (concrete_id_dependencies_.find(concrete_id_1) != - concrete_id_dependencies_.end()) { - const auto& dependencies_1 = concrete_id_dependencies_.at(concrete_id_1); - // if id1 depends on id0 it means id0 is inside id1, so id1 < id0 - return !dependencies_1.count(concrete_id_0); - } - - return true; - } - - const std::unordered_map>& - concrete_id_dependencies_; -}; - std::vector getLocalDomainOrdering( const std::vector& exprs, - const ComputeAtMap& map, const std::unordered_set filter, const std::unordered_map>& concrete_id_dependencies) { @@ -718,10 +658,12 @@ std::vector getLocalDomainOrdering( return std::vector(); } + const auto& ca_map = GpuLower::current()->caMap(); + std::unordered_set domains; for (auto expr : exprs) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { continue; } @@ -736,14 +678,17 @@ std::vector getLocalDomainOrdering( tv_input->getComputeAtPosition(), tv_input->getMaxProducerPosition()), std::back_inserter(domain), - [&map](IterDomain* id) { return map.getConcreteMappedID(id); }); + [&ca_map](IterDomain* id) { + return ca_map->getConcreteMappedID(id, IdMappingMode::LOOP); + }); domain.erase( std::remove_if( domain.begin(), domain.end(), - [&filter, &map](IterDomain* id) { - return filter.find(map.getConcreteMappedID(id)) == filter.end(); + [&filter, &ca_map](IterDomain* id) { + return filter.find(ca_map->getConcreteMappedID( + id, IdMappingMode::LOOP)) == filter.end(); }), domain.end()); @@ -755,7 +700,8 @@ std::vector getLocalDomainOrdering( std::sort( merged_domain.begin(), merged_domain.end(), - LocalDomainSorter(concrete_id_dependencies)); + IterDomainDependencySorter( + concrete_id_dependencies, GpuLower::current()->caMap())); return merged_domain; } } // namespace @@ -840,8 +786,8 @@ ExprGroup* ExprSegmentationSorter::makeMergedNode( if (producer_of_consumer_edge->isA()) { auto tv = producer_of_consumer_edge->as(); for (const auto tv_i : c10::irange(tv->getComputeAtPosition())) { - ca_ids.emplace(GpuLower::current()->caLoopMap().getConcreteMappedID( - tv->axis(tv_i))); + ca_ids.emplace(GpuLower::current()->caMap()->getConcreteMappedID( + tv->axis(tv_i), IdMappingMode::LOOP)); } } } @@ -855,8 +801,8 @@ ExprGroup* ExprSegmentationSorter::makeMergedNode( if (consumer_of_producer_edge->isA()) { auto tv = consumer_of_producer_edge->as(); for (const auto tv_i : c10::irange(tv->getMaxProducerPosition())) { - pa_ids.emplace(GpuLower::current()->caLoopMap().getConcreteMappedID( - tv->axis(tv_i))); + pa_ids.emplace(GpuLower::current()->caMap()->getConcreteMappedID( + tv->axis(tv_i), IdMappingMode::LOOP)); } } } @@ -865,10 +811,7 @@ ExprGroup* ExprSegmentationSorter::makeMergedNode( all_ca_pa_ids.insert(pa_ids.begin(), pa_ids.end()); auto ordered_ids = getLocalDomainOrdering( - joined_groups->exprs(), - 
GpuLower::current()->caLoopMap(), - all_ca_pa_ids, - concrete_id_dependencies); + joined_groups->exprs(), all_ca_pa_ids, concrete_id_dependencies); for (auto id : ordered_ids) { if (ca_ids.count(id)) { @@ -914,8 +857,8 @@ bool canReducePA(ExprGroup* group) { // it can't decide if it can be reduced bool has_matching_pa = false; for (const auto i : c10::irange(consumer_tv->getMaxProducerPosition())) { - if (GpuLower::current()->caLoopMap().areMapped( - consumer_tv->axis(i), group_pa_last_id)) { + if (GpuLower::current()->caMap()->areMapped( + consumer_tv->axis(i), group_pa_last_id, IdMappingMode::LOOP)) { has_matching_pa = true; break; } @@ -931,8 +874,10 @@ bool canReducePA(ExprGroup* group) { static_cast(producer_tv->getComputeAtPosition()); producer_pos_i > 0; producer_pos_i--) { - if (GpuLower::current()->caLoopMap().areMapped( - producer_tv->axis(producer_pos_i - 1), group_pa_last_id)) { + if (GpuLower::current()->caMap()->areMapped( + producer_tv->axis(producer_pos_i - 1), + group_pa_last_id, + IdMappingMode::LOOP)) { return false; } } @@ -990,10 +935,12 @@ void ExprSegmentationSorter::mergeNodes() { std::unordered_set clean_up_edges; while (!to_merge_.empty()) { - auto group1 = *to_merge_.begin(); - auto group2 = group1->payload()->merge_with; - to_merge_.erase(group1); - to_merge_.erase(group2); + ExprGroup *group1 = nullptr, *group2 = nullptr; + std::tie(group1, group2) = to_merge_.back(); + to_merge_.pop_back(); + TORCH_INTERNAL_ASSERT( + group2 == group1->payload()->merge_with, + "Expression Sorter: inconsistent to_merge packing"); clean_up_groups.emplace(group1); clean_up_groups.emplace(group2); makeMergedNode(group1, group2); @@ -1026,8 +973,8 @@ void ExprSegmentationSorter::initializeForLoopDependencies() { tv_id_i > 0; tv_id_i--) { auto tv_id = tv->axis((int)(tv_id_i - 1)); - auto concrete_id = - GpuLower::current()->caLoopMap().getConcreteMappedID(tv_id); + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + tv_id, IdMappingMode::LOOP); if (concrete_id_dependencies.find(concrete_id) == concrete_id_dependencies.end()) { @@ -1038,8 +985,8 @@ void ExprSegmentationSorter::initializeForLoopDependencies() { } // Loops after tv_id are dependent on tv_id - dependencies.emplace( - GpuLower::current()->caLoopMap().getConcreteMappedID(tv_id)); + dependencies.emplace(GpuLower::current()->caMap()->getConcreteMappedID( + tv_id, IdMappingMode::LOOP)); } } @@ -1067,27 +1014,62 @@ void ExprSegmentationSorter::initializeForLoopDependencies() { std::back_inserter(to_visit), [](const auto& concrete_dep_entry) { return concrete_dep_entry.first; }); + size_t inf_loop_counter = to_visit.size(); + bool failed = false; + while (!to_visit.empty()) { auto id = to_visit.front(); to_visit.pop_front(); + if (inf_loop_counter-- == 0) { + failed = true; + break; + } + auto& dependencies = concrete_id_dependencies.at(id); - bool ready = std::all_of( - dependencies.begin(), dependencies.end(), [&visited](IterDomain* id) { - return visited.count(id); - }); + bool ready = dependencies.empty() || + std::all_of(dependencies.begin(), + dependencies.end(), + [&visited](IterDomain* id) { return visited.count(id); }); if (!ready) { to_visit.push_back(id); continue; } + inf_loop_counter = to_visit.size(); + for (auto dependency : dependencies) { auto dep_of_dep = concrete_id_dependencies.at(dependency); dependencies.insert(dep_of_dep.begin(), dep_of_dep.end()); } visited.emplace(id); } + if (failed) { + std::cerr + << "ERROR: Iteration domain sorting has failed, infinite loop detected." 
+ << std::endl; + std::cerr << "Failed to sort out: " << std::endl; + for (auto entry : to_visit) { + std::cerr << entry->toString(); + if (entry != to_visit.back()) { + std::cerr << ", "; + } + } + + std::cerr << "Depdencies: " << std::endl; + for (const auto& dep_entry : concrete_id_dependencies) { + std::cerr << " Deps of " << dep_entry.first->toString() << std::endl + << " "; + + for (auto dep : dep_entry.second) { + std::cerr << dep->toString() << ", "; + } + std::cerr << std::endl; + } + + TORCH_INTERNAL_ASSERT(false); + } } // Checks if the for loop associated with the concrete ID is ready to be @@ -1145,8 +1127,6 @@ bool ExprSegmentationSorter::supportedMerge(ExprGroup* sg1, ExprGroup* sg2) { return false; } - const auto& loop_map = GpuLower::current()->caLoopMap(); - // If inner loop dependencies have not been resolved, cannot merge. if (!loopReady(producer_ca_domain.back()) || !loopReady(consumer_pa_domain.back())) { @@ -1182,11 +1162,13 @@ bool ExprSegmentationSorter::supportedMerge(ExprGroup* sg1, ExprGroup* sg2) { continue; } - if (!loop_map.areMapped(compute_at_dim, producer_ca_domain.back())) { + if (!GpuLower::current()->caMap()->areMapped( + compute_at_dim, producer_ca_domain.back(), IdMappingMode::LOOP)) { continue; } - if (loop_map.areMapped(compute_at_dim, consumer_pa_domain.back())) { + if (GpuLower::current()->caMap()->areMapped( + compute_at_dim, consumer_pa_domain.back(), IdMappingMode::LOOP)) { return true; } } @@ -1297,8 +1279,7 @@ void ExprSegmentationSorter::sort() { continue; } - to_merge_.emplace(group.get()); - to_merge_.emplace(*candidate_it); + to_merge_.emplace_back(std::make_pair(group.get(), *candidate_it)); group->payload()->merged = true; group->payload()->merge_with = *candidate_it; @@ -1350,8 +1331,7 @@ void ExprSegmentationSorter::sort() { if (testStillDag(group.get(), *candidate_it)) { // Mark in same style as default algorithm for convenience even // though we will only merge once with the fallback - to_merge_.emplace(group.get()); - to_merge_.emplace(*candidate_it); + to_merge_.emplace_back(std::make_pair(group.get(), *candidate_it)); group->payload()->merged = true; group->payload()->merge_with = *candidate_it; diff --git a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp new file mode 100644 index 000000000000..213abda029a6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp @@ -0,0 +1,344 @@ +#include +#include +#include +#include + +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +//! An instance of reduction patterns to fuse +class FusedReductionBroadcastInfo : public PolymorphicBase { + public: + FusedReductionBroadcastInfo(ReductionOp* reduction, bool with_broadcast) + : reductions_({reduction}), with_broadcast_({with_broadcast}) {} + + FusedReductionBroadcastInfo(WelfordOp* welford, bool with_broadcast) + : reductions_({welford}), with_broadcast_({with_broadcast}) {} + + FusedReductionBroadcastInfo( + GroupedReductionOp* grouped_rop, + bool with_broadcast) + : reductions_({grouped_rop}), with_broadcast_({with_broadcast}) {} + + const std::vector& reductions() const { + return reductions_; + } + + const std::vector& withBroadcast() const { + return with_broadcast_; + } + + private: + // Holds ReductionOp, WelfordOp or GroupedReductionOp. + std::vector reductions_; + // True each reduction also broadcasts + std::vector with_broadcast_; +}; + +//! 
Inspect a fusion to detect eligible sequences of expressions to +//! use the fused reduction kernel +class FusionInspector : private IterVisitor { + public: + static std::vector run(Fusion* fusion) { + FusionInspector inspector(fusion); + return inspector.fusion_list_; + } + + private: + FusionInspector(Fusion* fusion) { + traverse(fusion); + } + + using IterVisitor::handle; + + void handle(ReductionOp* rop) final { + /// If it's a grid reduction, keep track of tensors that depend on + /// this reduction. + // Only consider when out is on register as that is assumed in the + // fused reduction kernel. + auto out = ir_utils::getTvOutput(rop); + if (out->getMemoryType() == MemoryType::Local && + out->domain()->hasGridReduction()) { + reduction_dep_[out].insert(rop); + } + } + + void handle(WelfordOp* wop) final { + /// If it's a grid reduction, keep track of tensors that depend on + /// this reduction. + // Only consider when out is on register as that is assumed in the + // fused reduction kernel. + auto out = ir_utils::getTvOutput(wop); + if (out->getMemoryType() == MemoryType::Local && + out->domain()->hasGridReduction()) { + reduction_dep_[out].insert(wop); + } + } + + void handle(GroupedReductionOp* grouped_rop) final { + auto out = ir_utils::getTvOutput(grouped_rop); + if (out->getMemoryType() == MemoryType::Local && + out->domain()->hasGridReduction()) { + reduction_dep_[out].insert(grouped_rop); + } + } + + void handle(Expr* expr) final { + IterVisitor::handle(expr); + for (auto in_tv : ir_utils::filterByType(expr->inputs())) { + for (auto reduction_op : reduction_dep_[in_tv]) { + if (fused_exprs_.find(reduction_op) != fused_exprs_.end()) { + continue; + } + for (auto out_tv : + ir_utils::filterByType(expr->outputs())) { + reduction_dep_[out_tv].insert(reduction_op); + } + } + } + } + + // In the case of welford, use the fused broadcast reduction when at + // least one of the outputs is broadcast. + void handle(BroadcastOp* bop) final { + // Detect a pattern where a reduction is followed by a broadcast + auto bop_out = bop->out()->as(); + auto bop_in = bop->in()->as(); + + for (Expr* preceding_expr : reduction_dep_[bop_in]) { + auto parallel_reduction_axes = + getReductionParallelTypeStates(preceding_expr); + + // If not matching, propagate the reduction further down to + // subsequent expressions + if (!isBroadcastFuseable(bop_out, parallel_reduction_axes)) { + continue; + } + + if (fused_exprs_.find(preceding_expr) != fused_exprs_.end()) { + // Already added to the fusion list. This can happen with + // welford as there can be multiple broadcast consumer + // expressions. 
+ continue; + } + + if (preceding_expr->isA()) { + fusion_list_.emplace_back(preceding_expr->as(), true); + } else if (preceding_expr->isA()) { + fusion_list_.emplace_back( + preceding_expr->as(), true); + } else if (preceding_expr->isA()) { + fusion_list_.emplace_back(preceding_expr->as(), true); + } else { + TORCH_INTERNAL_ASSERT( + false, "Invalid preceding expr: ", preceding_expr->toString()); + } + + fused_exprs_.insert(preceding_expr); + } + } + + ParallelTypeBitmap getReductionParallelTypeStates(Expr* expr) { + ParallelTypeBitmap parallel_reduction_axes; + + for (auto id : ir_utils::getTvOutput(expr)->domain()->domain()) { + auto pt = id->getParallelType(); + if (id->isReduction() && isParallelTypeThread(pt)) { + parallel_reduction_axes.set(pt); + } + } + + return parallel_reduction_axes; + } + + // Requires reduction parallel dimensions to exactly match parallel broadcast + // dimensions + bool isBroadcastFuseable( + TensorView* broadcast_out, + const ParallelTypeBitmap& parallel_reduction_axes) { + const auto broadcast_parallel_types = + GpuLower::current()->threadPredMap().getParallelBroadcastDomains( + broadcast_out); + + // If no parallel broadcast, nothing to fuse + if (broadcast_parallel_types.none()) { + return false; + } + + // Make sure the broadcast parallel types are the types reduced by + // the preceding reduction op + for (auto id : broadcast_out->domain()->domain()) { + auto pt = id->getParallelType(); + if (!isParallelTypeThread(pt)) { + continue; + } + // Parallel broadcast must be included in reduction_states + if (id->isBroadcast() && broadcast_parallel_types.get(pt)) { + if (!parallel_reduction_axes.get(pt)) { + return false; + } + } + } + + return true; + } + + private: + //! List of expression sequences to fuse + std::vector fusion_list_; + //! Keep track of fused reduction/welford exprs to avoid duplication + std::unordered_set fused_exprs_; + //! Keep track of ReductionOp/WelfordOp expressions that are + //! (indirectly) input to a tensor + std::unordered_map> reduction_dep_; +}; + +//! Transform a fusion to use the fused reduction kernel. +class FusionTransformer { + public: + static void run( + Fusion* fusion, + const std::vector& fusion_list) { + FusionTransformer transformer(fusion, fusion_list); + } + + private: + FusionTransformer( + Fusion* fusion, + const std::vector& fusion_list) + : fusion_(fusion), fusion_list_(fusion_list) { + transform(); + } + + void transform() { + for (const auto& info : fusion_list_) { + transform(info); + } + // If the thread predicate map is modified, rebuild the + // map. build() only updates mappings that need to be updated. 
+ if (thread_pred_map_modified_) { + GpuLower::current()->threadPredMap().build(fusion_); + } + } + + void transform(const FusedReductionBroadcastInfo& info) { + TORCH_INTERNAL_ASSERT( + info.reductions().size() == 1, "Horizontal fusion not supported yet"); + + for (const auto i : c10::irange(info.reductions().size())) { + const auto expr = info.reductions().at(i); + const auto with_broadcast = info.withBroadcast().at(i); + Expr* fused_expr = nullptr; + + if (auto reduction = dynamic_cast(expr)) { + TORCH_INTERNAL_ASSERT(!reduction->isAllreduce()); + + auto red_op_type = reduction->getReductionOpType(); + auto init = reduction->init(); + auto out = reduction->out(); + auto in = reduction->in(); + + fusion_->removeExpr(reduction); + + fused_expr = + IrBuilder::create(red_op_type, init, out, in, true); + } else if (auto welford = dynamic_cast(expr)) { + TORCH_INTERNAL_ASSERT(!welford->isAllreduce()); + + auto out_avg = welford->outAvg(); + auto out_var = welford->outVar(); + auto out_n = welford->outN(); + auto init_avg = welford->initAvg(); + auto init_var = welford->initVar(); + auto init_n = welford->initN(); + auto in_avg = welford->inAvg(); + auto in_var = welford->inVar(); + auto in_n = welford->inN(); + + fusion_->removeExpr(welford); + + fused_expr = IrBuilder::create( + out_avg, + out_var, + out_n, + init_avg, + init_var, + init_n, + in_avg, + in_var, + in_n, + true); + } else if (auto grouped_rop = dynamic_cast(expr)) { + TORCH_INTERNAL_ASSERT(!grouped_rop->isAllreduce()); + + auto op_types = grouped_rop->getReductionOpTypes(); + auto init_vals = grouped_rop->initVals(); + auto outputs = grouped_rop->outputs(); + auto inputs = grouped_rop->inputs(); + + fusion_->removeExpr(grouped_rop); + + fused_expr = IrBuilder::create( + op_types, init_vals, outputs, inputs, true); + } else { + TORCH_INTERNAL_ASSERT(false, "Invalid expr: ", expr->toString()); + } + + TORCH_INTERNAL_ASSERT(fused_expr != nullptr); + + // Do not just remove the broadcast but just reset the thread + // predicate of the broadcast op. Since fusion is applied only + // when all parallel broadcast domains are to be parallel + // reduction, all parallel types can be reset. + if (with_broadcast) { + // It may be just fine to remove the broadcast expr, but + // technically speaking that would violate the root domain mapping + // as broadcast domains would appear in the consumer of the + // broadcast output tensor without a broadcast expression. 
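// Concretely (hypothetical fusion, for illustration only): for
// tv1 = sum(tv0) with the reduction axes bound to TIDx and BIDx, followed
// by a broadcast of tv1 across those same parallel types, the reduction is
// re-created above with the allreduce flag set, the parallel reduction
// domains are marked as allreduce below, and the broadcast keeps its
// expression but loses its thread predicate, effectively becoming a set.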
+ for (auto reduction_out : + ir_utils::filterByType(fused_expr->outputs())) { + for (auto id : reduction_out->domain()->domain()) { + if (id->isReduction()) { + GpuLower::current()->fusedReductionInfo().markAsAllreduce(id); + GpuLower::current()->threadPredMap().markAsUpdated(reduction_out); + thread_pred_map_modified_ = true; + } + } + } + } + } + } + + private: + Fusion* fusion_ = nullptr; + const std::vector& fusion_list_; + bool thread_pred_map_modified_ = false; +}; + +} // namespace + +void fuseReductionsAndBroadcasts(Fusion* fusion) { + auto fusion_list = FusionInspector::run(fusion); + FusionTransformer::run(fusion, fusion_list); +} + +void FusedReductionInfo::markAsAllreduce(IterDomain* id) { + allreduce_ids_.insert(id); +} + +bool FusedReductionInfo::isAllreduce(IterDomain* id) const { + return allreduce_ids_.find(id) != allreduce_ids_.end(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h new file mode 100644 index 000000000000..4307a30bc512 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fused_reduction.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Keep track of certain patterns of reductions. +//! +//! - Allreduce IterDomain: reduced and broadcast domain. +class FusedReductionInfo { + public: + void markAsAllreduce(IterDomain* id); + + bool isAllreduce(IterDomain* id) const; + + private: + // Reduction IterDomains that are also broadcast + std::unordered_set allreduce_ids_; +}; + +//! Detect reductions and broadcasts that are eligible for the fused +//! reduction kernel. When found, the predicate flags of the broadcast +//! is unset, which effectively makes the broadcast just a unary set +//! op. +//! TODO: Consider moving the warp-based fused reduction here. +void fuseReductionsAndBroadcasts(Fusion*); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp new file mode 100644 index 000000000000..b3e9b1776acf --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp @@ -0,0 +1,149 @@ +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Replace trivial reductions with unary ops. +class TrivialReductionReplacement : private OptOutMutator { + public: + TrivialReductionReplacement( + Fusion* fusion, + const TrivialReductionInfo& trivial_reduction_info) + : trivial_reduction_info_(trivial_reduction_info) { + FusionGuard fg(fusion); + auto exprs = StmtSort::getExprs(fusion); + for (auto expr : exprs) { + mutate(expr); + } + } + + private: + using OptOutMutator::mutate; + void mutate(ReductionOp* rop) final { + if (ir_utils::isTvOp(rop)) { + auto out_tv = ir_utils::getTvOutput(rop); + if (std::all_of( + out_tv->domain()->domain().begin(), + out_tv->domain()->domain().end(), + [&](IterDomain* id) { + // If id is a reduction axis, is it a trivial reduction? 
+ if (id->isReduction()) { + return trivial_reduction_info_.isDerived(id); + } else { + return true; + } + })) { + auto out = rop->out(); + auto in = rop->in(); + auto container = out->container(); + removeExpr(container, rop); + IrBuilder::create(container, UnaryOpType::Set, out, in); + } + } + } + + void mutate(GroupedReductionOp* grouped_rop) final { + if (ir_utils::isTvOp(grouped_rop)) { + // The inputs and outputs are all uniform in grouped reductions, + // so just checking one of the input and output pair should be + // sufficient. + auto out_tv = ir_utils::getTvOutput(grouped_rop); + if (std::all_of( + out_tv->domain()->domain().begin(), + out_tv->domain()->domain().end(), + [&](IterDomain* id) { + // If id is a reduction axis, is it a trivial reduction? + if (id->isReduction()) { + return trivial_reduction_info_.isDerived(id); + } else { + return true; + } + })) { + auto outputs = grouped_rop->outputs(); + auto inputs = grouped_rop->inputs(); + auto container = out_tv->container(); + removeExpr(container, grouped_rop); + for (const auto i : c10::irange(outputs.size())) { + IrBuilder::create( + container, UnaryOpType::Set, outputs.at(i), inputs.at(i)); + } + } + } + } + + const TrivialReductionInfo& trivial_reduction_info_; +}; + +// Replaces Transpose, Shift, Gather, and View Ops with Unary Ops. +class UnaryOpInserter : private kir::ExprMutator { + public: + static std::vector insert(const std::vector& exprs) { + UnaryOpInserter inserter(exprs); + return inserter.exprs_; + } + + private: + using kir::ExprMutator::handle; + + UnaryOpInserter(const std::vector& exprs) { + kir::ExprMutator::traverseAndInsert(exprs); + } + + void handle(TransposeOp* top) final { + auto out = top->out(); + auto in = top->in(); + auto container = out->container(); + registerReplace( + top, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(ShiftOp* sop) final { + auto out = sop->out(); + auto in = sop->in(); + auto container = out->container(); + registerReplace( + sop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(GatherOp* gop) final { + auto out = gop->out(); + auto in = gop->in(); + auto container = out->container(); + registerReplace( + gop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(ViewOp* vop) final { + auto out = vop->out(); + auto in = vop->in(); + auto container = out->container(); + registerReplace( + vop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } +}; + +} // namespace + +void trivialReductionReplacement( + Fusion* fusion, + const TrivialReductionInfo& trivial_reduction_info) { + TrivialReductionReplacement replacement(fusion, trivial_reduction_info); +} + +// Transpose, Shift, Gather, and View Ops with Unary Set Ops +std::vector unarySetOpInserter(const std::vector& exprs) { + return UnaryOpInserter::insert(exprs); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h new file mode 100644 index 000000000000..e18f4a8f0778 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// Replaces trivial reductions with Unary Set Ops +void trivialReductionReplacement(Fusion*, const TrivialReductionInfo&); + +// Transpose, Shift, Gather, and 
View Ops with Unary Set Ops +std::vector unarySetOpInserter(const std::vector& exprs); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_index.cpp b/torch/csrc/jit/codegen/cuda/lower_index.cpp index d92dd279b179..a1a658f580a0 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_index.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -13,30 +13,24 @@ namespace jit { namespace fuser { namespace cuda { -IndexLowering::IndexLowering() : ir_builder_(GpuLower::current()->kernel()) {} - -kir::Val* IndexLowering::lowerSrcIndex(kir::Val* src, kir::Val* dst) const { - if (auto tv = dynamic_cast(src)) { - TORCH_INTERNAL_ASSERT(dst->isA()); - return Index::getProducerIndex( - tv->fuserTv(), - dst->as()->fuserTv(), - scope_utils::getLoops(active_scope_expr_)); +Val* IndexLowering::lowerSrcIndex(Val* src, Val* dst) const { + if (auto tv = dynamic_cast(src)) { + TORCH_INTERNAL_ASSERT(dst->isA()); + return Index::getProducerIndex(tv, dst->as(), for_loops_); } else { return src; } } -kir::Val* IndexLowering::lowerDstIndex(kir::Val* dst) const { - if (auto tv = dynamic_cast(dst)) { - return Index::getConsumerIndex( - tv->fuserTv(), scope_utils::getLoops(active_scope_expr_)); +Val* IndexLowering::lowerDstIndex(Val* dst) const { + if (auto tv = dynamic_cast(dst)) { + return Index::getConsumerIndex(tv, for_loops_); } else { return dst; } } -void IndexLowering::pushBack(kir::Expr* expr) { +void IndexLowering::pushBack(Expr* expr) { if (active_scope_ == nullptr) { lowered_exprs_.push_back(expr); } else { @@ -44,78 +38,113 @@ void IndexLowering::pushBack(kir::Expr* expr) { } } -void IndexLowering::visit(const kir::IfThenElse* ite) { - const auto prev_scope_expr = active_scope_expr_; +Expr* IndexLowering::back() const { + if (active_scope_ == nullptr) { + TORCH_INTERNAL_ASSERT( + !lowered_exprs_.empty(), "IndexLowering::back: empty scope."); + return lowered_exprs_.back(); + } + TORCH_INTERNAL_ASSERT( + !active_scope_->empty(), "IndexLowering::back: empty scope."); + return active_scope_->exprs().back(); +} + +void IndexLowering::insertAtTopLevel(Expr* expr) { + TORCH_INTERNAL_ASSERT(!lowered_exprs_.empty()); + lowered_exprs_.insert(lowered_exprs_.end() - 1, expr); +} + +void IndexLowering::handle(const kir::IfThenElse* ite) { const auto prev_scope = active_scope_; - // TODO(kir): try to avoid recreating new nodes and leaving old ones around - auto new_ite = ir_builder_.create(ite->predicate()); + auto new_ite = IrBuilder::create(ite->predicate()); pushBack(new_ite); - active_scope_expr_ = new_ite; active_scope_ = &new_ite->thenBody(); for (auto expr : ite->thenBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } active_scope_ = &new_ite->elseBody(); for (auto expr : ite->elseBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } active_scope_ = prev_scope; - active_scope_expr_ = prev_scope_expr; } -void IndexLowering::visit(const kir::ForLoop* for_loop) { - const auto prev_scope_expr = active_scope_expr_; +void IndexLowering::handle(const kir::ForLoop* for_loop) { const auto prev_scope = active_scope_; - auto new_for_loop = ir_builder_.create(for_loop); + auto new_for_loop = IrBuilder::create(for_loop); pushBack(new_for_loop); - active_scope_expr_ = new_for_loop; active_scope_ = &new_for_loop->body(); + for_loops_.push_back(new_for_loop); for (auto expr : for_loop->body().exprs()) { - 
expr->accept(this); + OptOutConstDispatch::handle(expr); } + for_loops_.pop_back(); active_scope_ = prev_scope; - active_scope_expr_ = prev_scope_expr; } -void IndexLowering::visit(const kir::UnaryOp* uop) { +void IndexLowering::handle(const UnaryOp* uop) { const auto in = lowerSrcIndex(uop->in(), uop->out()); const auto out = lowerDstIndex(uop->out()); - pushBack(ir_builder_.create(uop->operation(), out, in)); + pushBack(IrBuilder::create(uop->getUnaryOpType(), out, in)); + GpuLower::current()->propagateExprInfo(uop, back()); } -void IndexLowering::visit(const kir::BinaryOp* bop) { +void IndexLowering::handle(const BinaryOp* bop) { const auto lhs = lowerSrcIndex(bop->lhs(), bop->out()); const auto rhs = lowerSrcIndex(bop->rhs(), bop->out()); const auto out = lowerDstIndex(bop->out()); - pushBack(ir_builder_.create(bop->operation(), out, lhs, rhs)); + pushBack(IrBuilder::create(bop->getBinaryOpType(), out, lhs, rhs)); + GpuLower::current()->propagateExprInfo(bop, back()); } -void IndexLowering::visit(const kir::TernaryOp* top) { +void IndexLowering::handle(const TernaryOp* top) { const auto in1 = lowerSrcIndex(top->in1(), top->out()); const auto in2 = lowerSrcIndex(top->in2(), top->out()); const auto in3 = lowerSrcIndex(top->in3(), top->out()); const auto out = lowerDstIndex(top->out()); - pushBack( - ir_builder_.create(top->operation(), out, in1, in2, in3)); + pushBack(IrBuilder::create( + top->getTernaryOpType(), out, in1, in2, in3)); + GpuLower::current()->propagateExprInfo(top, back()); +} + +void IndexLowering::handle(const ViewAsScalar* uop) { + const auto in = lowerSrcIndex(uop->in(), uop->out()); + const auto out = lowerDstIndex(uop->out()); + for (auto loop : for_loops_) { + if (GpuLower::current()->caMap()->areMapped( + loop->iter_domain(), + uop->vector_id()->as(), + IdMappingMode::LOOP)) { + Val* index = loop->index(); + pushBack( + IrBuilder::create(out, in, uop->vector_id(), index)); + GpuLower::current()->propagateExprInfo(uop, back()); + return; + } + } + TORCH_INTERNAL_ASSERT(false, "Can not find index for vector dim"); } namespace { // Get the size of the temporary work buffer for grid communication, this can be // grid reduction, broadcast, or grid welford. -kir::Val* getGridCommWorkBufferSize( - kir::IrBuilder& ir_builder, - const kir::TensorDomain* td) { +// expansion_factor can be optionally passed to expand the allocation +// size. For example, FusedReduction should double the work buffer size. +Val* getGridCommWorkBufferSize( + const TensorDomain* td, + const std::vector& for_loops = {}, + int expansion_factor = 1) { // The buffer size is the number of thread blocks multiplied by the // number of threads not used for reduction domains. // Note: Previously it was calculated based on the shape of the @@ -125,7 +154,11 @@ kir::Val* getGridCommWorkBufferSize( // size if the parallel dimensions are exact, but otherwise, just // computing the buffer size based on the tensor shape isn't // sufficient since there could be extra threads/blocks. - kir::Val* buffer_size = ir_builder.create(1); + TORCH_INTERNAL_ASSERT( + expansion_factor >= 1, "Invalid expansion factor: ", expansion_factor); + Val* buffer_size = expansion_factor == 1 + ? 
GpuLower::current()->kernel()->oneVal() + : IrBuilder::create(expansion_factor); for (auto pt : kParallelTypeThreads) { auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); if (pt_dim == nullptr || pt_dim->isOneInt()) { @@ -133,172 +166,415 @@ kir::Val* getGridCommWorkBufferSize( } if (isParallelTypeThreadDim(pt) && std::any_of(td->domain().begin(), td->domain().end(), [&](auto out_id) { - return out_id->parallelType() == pt && + return out_id->getParallelType() == pt && (out_id->isReduction() || out_id->isBroadcast()); })) { continue; } - buffer_size = ir_builder.mulExpr(buffer_size, pt_dim); + buffer_size = SimplifyingIrBuilder::mulExpr(buffer_size, pt_dim); + } + + // All iteration domains require a separate entry in the buffer for re-entrant + // grid reductions. + for (auto fl : for_loops) { + if (fl->isTrivial()) { + continue; + } + if (fl->iter_domain()->isThread()) { + // already accounted for. + continue; + } + buffer_size = + SimplifyingIrBuilder::mulExpr(buffer_size, fl->iter_domain()->extent()); } + return buffer_size; } -kir::Val* getGridSyncBufferSize( - kir::IrBuilder& ir_builder, - const kir::TensorDomain* td) { +Val* getGridSyncBufferSize( + const TensorDomain* td, + const std::vector& for_loops = {}) { // See the comment above for getGridCommWorkBufferSize. - kir::Val* buffer_size = ir_builder.create(1); + Val* buffer_size = GpuLower::current()->kernel()->oneVal(); for (auto pt : kParallelTypeBIDs) { auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); if (pt_dim == nullptr || pt_dim->isOneInt()) { continue; } if (std::any_of(td->domain().begin(), td->domain().end(), [&](auto out_id) { - return out_id->parallelType() == pt && + return out_id->getParallelType() == pt && (out_id->isReduction() || out_id->isBroadcast()); })) { continue; } - buffer_size = ir_builder.mulExpr(buffer_size, pt_dim); + buffer_size = SimplifyingIrBuilder::mulExpr(buffer_size, pt_dim); + } + + // All iteration domains require a separate semaphore for re-entrant grid + // reductions + for (auto fl : for_loops) { + if (fl->isTrivial()) { + continue; + } + if (fl->iter_domain()->isThread()) { + // already accounted for. + continue; + } + + buffer_size = + SimplifyingIrBuilder::mulExpr(buffer_size, fl->iter_domain()->extent()); } + return buffer_size; } -// Allocate global buffer for a grid communication calls, i.e. grid reduce, grid -// welford reduce, grid broadcast. -kir::Allocate* allocGlobalBufferForGridComm( - kir::IrBuilder& ir_builder, - kir::Val* buffer_size, - DataType dtype, - bool zero_init) { - const std::vector new_buffer_ids = { - ir_builder.create(ir_builder.zeroVal(), buffer_size)}; - const auto buffer_domain = - ir_builder.create(new_buffer_ids); - const auto buffer_tv = ir_builder.create( - dtype, buffer_domain, MemoryType::Global); - return ir_builder.create( - buffer_tv, buffer_tv->memoryType(), nullptr, zero_init); +Val* getEntranceCountGridReduce(std::vector& for_loops) { + Val* grid_reduction_entrances = GpuLower::current()->kernel()->oneVal(); + + for (const auto loop : for_loops) { + if (loop->isTrivial()) { + continue; + } + if (loop->iter_domain()->isThread()) { + // already accounted for. + continue; + } + // TODO: Does this work for shift/gather? 
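// Worked example for this helper and getEntranceLinIndGridReduce below
// (illustrative extents only): with two non-trivial serial loops of
// extents 4 and 3 and indices i0 and i1 enclosing the grid reduction, the
// entrance count is 4 * 3 = 12 and the linear entrance index is
// i0 * 3 + i1, so every entrance addresses its own slice of the grid work
// and sync buffers, whose sizes are scaled by the same extents above.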
+ grid_reduction_entrances = SimplifyingIrBuilder::mulExpr( + grid_reduction_entrances, loop->iter_domain()->extent()); + } + return grid_reduction_entrances; +} + +// Linear indexing of for loops for multiple entrances into grid reduce +// TODO: What happens if there's a broadcast that's resolved (not present in the +// grid reduce) but the global buffer isn't expanded? +Val* getEntranceLinIndGridReduce(std::vector& for_loops) { + Val* linear_index = GpuLower::current()->kernel()->zeroVal(); + + for (const auto loop : for_loops) { + if (loop->isTrivial()) { + continue; + } + if (loop->iter_domain()->isThread()) { + // already accounted for. + continue; + } + // TODO: Does this work for shift/gather? + linear_index = SimplifyingIrBuilder::addExpr( + SimplifyingIrBuilder::mulExpr( + linear_index, loop->iter_domain()->extent()), + loop->index()); + } + return linear_index; } } // namespace -void IndexLowering::visit(const kir::ReductionOp* rop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(rop)); +void IndexLowering::handle(const ReductionOp* rop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(rop)); - const auto out_tv = rop->out()->as(); + const auto out_tv = rop->out()->as(); const auto out_domain = out_tv->domain(); - const bool is_block_reduce = out_domain->hasBlockReduction(); - const bool is_grid_reduce = out_domain->hasGridReduction(); + const bool has_block_reduce = out_domain->hasBlockReduction(); + const bool has_grid_reduce = out_domain->hasGridReduction(); + + const auto out = lowerDstIndex(rop->out()); + const auto in = lowerSrcIndex(rop->in(), rop->out()); + + if (has_grid_reduce) { + handleGridReduction(rop, out, in); + } else if (has_block_reduce) { + handleBlockReduction(rop, out, in); + } else { + pushBack( + IrBuilder::create(rop->getReductionOpType(), out, out, in)); + GpuLower::current()->propagateExprInfo(rop, back()); + } +} + +void IndexLowering::handleBlockReduction( + const ReductionOp* rop, + Val* out, + Val* in) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(rop)); + + ReductionOp* indexed_rop = IrBuilder::create( + rop->getReductionOpType(), rop->init(), out, in, rop->isAllreduce()); + if (rop->predicate()) { + indexed_rop->setPredicate(rop->predicate()); + } + if (rop->writePredicate()) { + indexed_rop->setWritePredicate(rop->writePredicate()); + } + + pushBack(indexed_rop); + GpuLower::current()->propagateExprInfo(rop, back()); +} + +void IndexLowering::handleGridReduction( + const ReductionOp* rop, + Val* out, + Val* in) { + const auto out_tv = out->as()->view(); + const auto out_domain = out_tv->domain(); + + TORCH_INTERNAL_ASSERT(out_domain->hasGridReduction()); // If we do a grid reduction we can't have a reduction axis that is not bound - // to a grid or block dim () - if (is_grid_reduce) { - TORCH_INTERNAL_ASSERT( - std::none_of( - out_domain->domain().begin(), - out_domain->domain().end(), - [](kir::IterDomain* id) { - return !id->isThread() && id->isReduction() && - !id->extent()->isOneInt(); - }), - "Found a reduction stage that has both a non-parallelized ", - "reduction and a grid reduction. This is not supported, ", - "please use rfactor to do the serialized reduction first, ", - "then the grid reduction."); + // to a grid or block dim. + TORCH_INTERNAL_ASSERT( + std::none_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { + return !id->isThread() && id->isReduction() && + !id->extent()->isOneInt(); + }), + "Found a reduction stage that has both a non-parallelized ", + "reduction and a grid reduction. 
This is not supported, ", + "please use rfactor to do the serialized reduction first, ", + "then the grid reduction."); + + // When using the fused reduction in a loop, the global work buffer + // is double buffered to save global synchronizations. + auto is_within_a_loop = std::any_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { return !isTrivialIterDomain(id); }); + + // Use a unique buffer for work and sync flag when called within a + // loop unless it's persistent. Grid all reduce means persistence is + // required. However, not being a grid all reduce does not mean + // non-persistence. Currently, if a cooperative grid reduction is + // required anywhere in the kernel, all grid reducitons are done in + // a persistent manner, so all grid reductions should be consulted. + // TODO: fix this + const bool privatize_buffer = !rop->isAllreduce(); + + const auto reduce_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridCommWorkBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector(), + rop->isAllreduce() && is_within_a_loop ? 2 : 1), + out->dtype(), + false); + + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector()), + DataType::Int, + true); + + const auto entrance_ind = privatize_buffer + ? getEntranceLinIndGridReduce(for_loops_) + : GpuLower::current()->kernel()->zeroVal(); + const auto n_entrances = privatize_buffer + ? getEntranceCountGridReduce(for_loops_) + : GpuLower::current()->kernel()->oneVal(); + + // The thread predicate for GridReduction needs to be set + // separately from the main predicate. Do not combine them like + // other expressions. + const auto& thread_pred = + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); + + auto grid_reduction = IrBuilder::create( + rop->getReductionOpType(), + rop->init(), + out, + in, + reduce_buffer, + sync_buffer, + entrance_ind, + n_entrances, + rop->isAllreduce()); + + grid_reduction->setThreadPredicate(thread_pred); + + if (rop->predicate()) { + grid_reduction->setPredicate(rop->predicate()); + } + if (rop->writePredicate()) { + grid_reduction->setWritePredicate(rop->writePredicate()); } - const auto out = lowerDstIndex(rop->out()); - const auto in = lowerSrcIndex(rop->in(), rop->out()); + pushBack(reduce_buffer); + pushBack(sync_buffer); + pushBack(grid_reduction); + GpuLower::current()->propagateExprInfo(rop, back()); + + if (rop->isAllreduce()) { + // When using the fused reduction, allocate the reduction object at + // the outer-most scope + auto fused_reduction_alloc_reduction = + IrBuilder::create(grid_reduction); + insertAtTopLevel(fused_reduction_alloc_reduction); + } +} - kir::ReductionOp* block_reduction_op = nullptr; +void IndexLowering::handle(const GroupedReductionOp* grouped_rop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(grouped_rop)); - if (is_block_reduce) { - block_reduction_op = ir_builder_.create( - rop->operation(), rop->init(), out, in); - if (rop->predicate()) { - block_reduction_op->setPredicate(rop->predicate()); - } - if (rop->writePredicate()) { - block_reduction_op->setWritePredicate(rop->writePredicate()); - } - pushBack(block_reduction_op); - } - - if (is_grid_reduce) { - const auto reduce_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridCommWorkBufferSize(ir_builder_, out_domain), - out->dtype(), - false); - - const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - 
getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); - - const auto grid_reduction_op = (block_reduction_op == nullptr) - ? ir_builder_.create( - rop->operation(), rop->init(), out, in) - : block_reduction_op; - - // The thread predicate for GridReduction needs to be set - // separately from the main predicate. Do not combine them like - // other expressions. - const auto& thread_pred = - GpuLower::current()->threadPredMap().getPredicatedParallelTypes( - out_tv->fuserTv()); - auto grid_reduction = ir_builder_.create( - grid_reduction_op, reduce_buffer, sync_buffer); - grid_reduction->setThreadPredicate(thread_pred); - - if (rop->predicate()) { - // If preceded by a blockReduce, all thread blocks should have - // valid inputs to gridReduce. In fact, using the original - // predicate does not work when the write predicate of the - // blockReduce is different from the read predicate. - if (is_block_reduce) { - grid_reduction->setPredicate( - ir_builder_.create(ir_builder_.trueVal())); - } else { - grid_reduction->setPredicate(rop->predicate()); - } - } + const auto out_tv = ir_utils::getTvOutput(grouped_rop); + const auto out_domain = out_tv->domain(); + + const bool has_block_reduce = out_domain->hasBlockReduction(); + const bool has_grid_reduce = out_domain->hasGridReduction(); + + std::vector indexed_outputs(grouped_rop->numReductions()); + std::vector indexed_inputs(grouped_rop->numReductions()); + + for (const auto i : c10::irange(grouped_rop->numReductions())) { + indexed_outputs.at(i) = lowerDstIndex(grouped_rop->output(i)); + indexed_inputs.at(i) = + lowerSrcIndex(grouped_rop->input(i), grouped_rop->output(i)); + } - if (rop->writePredicate()) { - grid_reduction->setWritePredicate(rop->writePredicate()); + if (has_grid_reduce) { + handleGridReduction(grouped_rop, indexed_outputs, indexed_inputs); + } else if (has_block_reduce) { + handleBlockReduction(grouped_rop, indexed_outputs, indexed_inputs); + } else { + for (const auto i : c10::irange(grouped_rop->numReductions())) { + pushBack(IrBuilder::create( + grouped_rop->getReductionOpType(i), + indexed_outputs.at(i), + indexed_outputs.at(i), + indexed_inputs.at(i))); } + } +} +void IndexLowering::handleBlockReduction( + const GroupedReductionOp* grouped_rop, + const std::vector& outputs, + const std::vector& inputs) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(grouped_rop)); + + GroupedReductionOp* indexed_rop = IrBuilder::create( + grouped_rop->getReductionOpTypes(), + grouped_rop->initVals(), + outputs, + inputs, + grouped_rop->isAllreduce()); + if (grouped_rop->predicate()) { + indexed_rop->setPredicate(grouped_rop->predicate()); + } + if (grouped_rop->writePredicate()) { + indexed_rop->setWritePredicate(grouped_rop->writePredicate()); + } + + pushBack(indexed_rop); + GpuLower::current()->propagateExprInfo(grouped_rop, back()); +} + +void IndexLowering::handleGridReduction( + const GroupedReductionOp* grouped_rop, + const std::vector& outputs, + const std::vector& inputs) { + const auto out_tv = ir_utils::getTvOutput(grouped_rop); + const auto out_domain = out_tv->domain(); + + TORCH_INTERNAL_ASSERT(out_domain->hasGridReduction()); + + // If we do a grid reduction we can't have a reduction axis that is not bound + // to a grid or block dim. 
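As a rough standalone illustration of the constraint stated in the comment above, a check along these lines rejects stages that mix a serial reduction axis with a grid reduction; the Axis struct and validateGridReduction below are simplified stand-ins, not the real IterDomain API.

#include <algorithm>
#include <stdexcept>
#include <vector>

struct Axis {
  bool is_thread = false;    // bound to threadIdx/blockIdx
  bool is_reduction = false;
  long extent = 1;
};

// A grid reduction cannot coexist with a serial (non-parallelized) reduction
// axis of extent > 1 in the same stage; such an axis has to be rfactor'ed out
// so the serial part runs before the grid part.
void validateGridReduction(const std::vector<Axis>& axes) {
  const bool has_serial_reduction = std::any_of(
      axes.begin(), axes.end(), [](const Axis& a) {
        return !a.is_thread && a.is_reduction && a.extent != 1;
      });
  if (has_serial_reduction) {
    throw std::runtime_error(
        "Serial reduction mixed with a grid reduction; rfactor the serial "
        "reduction first.");
  }
}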
+ TORCH_INTERNAL_ASSERT( + std::none_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { + return !id->isThread() && id->isReduction() && + !id->extent()->isOneInt(); + }), + "Found a reduction stage that has both a non-parallelized ", + "reduction and a grid reduction. This is not supported, ", + "please use rfactor to do the serialized reduction first, ", + "then the grid reduction."); + + // When using the fused reduction in a loop, the global work buffer + // is double buffered to save global synchronizations. + auto is_within_a_loop = std::any_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { return !isTrivialIterDomain(id); }); + + std::vector reduce_buffers; + std::transform( + outputs.begin(), + outputs.end(), + std::back_inserter(reduce_buffers), + [&](Val* output) { + return ir_utils::allocGlobalBufferForGridComm( + getGridCommWorkBufferSize( + out_domain, + for_loops_, + (grouped_rop->isAllreduce() && is_within_a_loop ? 2 : 1)), + output->dtype(), + false); + }); + + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize(out_domain, for_loops_), DataType::Int, true); + + // The thread predicate for GridReduction needs to be set + // separately from the main predicate. Do not combine them like + // other expressions. + const auto& thread_pred = + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); + + auto grid_reduction = IrBuilder::create( + grouped_rop->getReductionOpTypes(), + grouped_rop->initVals(), + outputs, + inputs, + reduce_buffers, + sync_buffer, + grouped_rop->isAllreduce()); + + grid_reduction->setThreadPredicate(thread_pred); + + if (grouped_rop->predicate()) { + grid_reduction->setPredicate(grouped_rop->predicate()); + } + if (grouped_rop->writePredicate()) { + grid_reduction->setWritePredicate(grouped_rop->writePredicate()); + } + + for (auto reduce_buffer : reduce_buffers) { pushBack(reduce_buffer); - pushBack(sync_buffer); - pushBack(grid_reduction); } + pushBack(sync_buffer); + pushBack(grid_reduction); + GpuLower::current()->propagateExprInfo(grouped_rop, back()); - if (!is_block_reduce && !is_grid_reduce) { - // TODO(kir): this breaks our "SSA" form - pushBack(ir_builder_.create(rop->operation(), out, out, in)); + if (grouped_rop->isAllreduce()) { + auto fused_reduction_alloc_reduction = + IrBuilder::create(grid_reduction); + insertAtTopLevel(fused_reduction_alloc_reduction); } } -void IndexLowering::visit(const kir::WelfordOp* wop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(wop)); +void IndexLowering::handle(const WelfordOp* wop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(wop)); - const auto out_tv = wop->outAvg()->as(); + const auto out_tv = wop->outAvg()->as(); const auto out_domain = out_tv->domain(); - const bool is_block_reduce = out_domain->hasBlockReduction(); - const bool is_grid_reduce = out_domain->hasGridReduction(); + const bool has_block_reduce = out_domain->hasBlockReduction(); + const bool has_grid_reduce = out_domain->hasGridReduction(); // If we do a grid reduction we can't have a reduction axis that is not bound // to a grid or block dim () - if (is_grid_reduce) { + if (has_grid_reduce) { TORCH_INTERNAL_ASSERT( std::none_of( out_domain->domain().begin(), out_domain->domain().end(), - [](kir::IterDomain* id) { + [](IterDomain* id) { return !id->isThread() && id->isReduction(); }), "Found a reduction stage that has both a non-parallelized ", @@ -322,96 +598,159 @@ void IndexLowering::visit(const kir::WelfordOp* 
wop) { auto out_var = lowerDstIndex(wop->outVar()); auto out_N = lowerDstIndex(wop->outN()); - kir::WelfordOp* welford_op = ir_builder_.create( - out_var, + WelfordOp* indexed_wop = IrBuilder::create( out_avg, + out_var, out_N, - wop->initVar(), wop->initAvg(), + wop->initVar(), wop->initN(), - in_var, in_avg, - in_N); + in_var, + in_N, + wop->isAllreduce()); - kir::WelfordOp* block_welford_op = nullptr; + if (wop->predicate()) { + indexed_wop->setPredicate(wop->predicate()); + } + if (wop->writePredicate()) { + indexed_wop->setWritePredicate(wop->writePredicate()); + } - if (is_block_reduce) { - block_welford_op = welford_op; - if (wop->predicate()) { - block_welford_op->setPredicate(wop->predicate()); - } - if (wop->writePredicate()) { - block_welford_op->setWritePredicate(wop->writePredicate()); - } - pushBack(block_welford_op); - } - - if (is_grid_reduce) { - // Buffer allocation - const auto work_buffer_size = - getGridCommWorkBufferSize(ir_builder_, out_domain); - - const auto out_var_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_var->dtype(), false); - const auto out_avg_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_avg->dtype(), false); - const auto out_N_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_N->dtype(), false); - - const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); - - // Grid Welford instantiation - const auto grid_welford_op = - (block_welford_op == nullptr) ? welford_op : block_welford_op; - - // The thread predicate for GridReduction needs to be set - // separately from the main predicate. Do not combine them like - // other expressions. - const auto& thread_pred = - GpuLower::current()->threadPredMap().getPredicatedParallelTypes( - out_tv->fuserTv()); - - auto grid_welford = ir_builder_.create( - grid_welford_op, - out_var_buffer, - out_avg_buffer, - out_N_buffer, - sync_buffer); - - grid_welford->setThreadPredicate(thread_pred); - - if (wop->predicate()) { - grid_welford->setPredicate(wop->predicate()); + // Serial welford + if (!has_block_reduce && !has_grid_reduce) { + pushBack(indexed_wop); + GpuLower::current()->propagateExprInfo(wop, back()); + return; + } + + // Block-only welford + if (!has_grid_reduce) { + pushBack(indexed_wop); + GpuLower::current()->propagateExprInfo(wop, back()); + return; + } + + handleGridWelford(indexed_wop); +} + +void IndexLowering::handleGridWelford(WelfordOp* indexed_wop) { + const auto out_tv = indexed_wop->out()->as()->view(); + const auto out_domain = out_tv->domain(); + + // Buffer allocation + // When using the fused reduction in a loop, the global work buffer + // is double buffered to save global synchronizations. + auto is_within_a_loop = std::any_of( + out_domain->domain().begin(), + out_domain->domain().end(), + [](IterDomain* id) { return !isTrivialIterDomain(id); }); + + // TODO: See the comment on the same variable in handleGridReduction + const bool privatize_buffer = !indexed_wop->isAllreduce(); + + const auto work_buffer_size = getGridCommWorkBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector(), + indexed_wop->isAllreduce() && is_within_a_loop ? 
2 : 1); + + const auto out_var_buffer = ir_utils::allocGlobalBufferForGridComm( + work_buffer_size, indexed_wop->outVar()->dtype(), false); + const auto out_avg_buffer = ir_utils::allocGlobalBufferForGridComm( + work_buffer_size, indexed_wop->outAvg()->dtype(), false); + const auto out_N_buffer = ir_utils::allocGlobalBufferForGridComm( + work_buffer_size, indexed_wop->outN()->dtype(), false); + + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize( + out_domain, + privatize_buffer ? for_loops_ : std::vector()), + DataType::Int, + true); + + const auto entrance_ind = privatize_buffer + ? getEntranceLinIndGridReduce(for_loops_) + : GpuLower::current()->kernel()->zeroVal(); + const auto n_entrances = privatize_buffer + ? getEntranceCountGridReduce(for_loops_) + : GpuLower::current()->kernel()->oneVal(); + + // The thread predicate for GridReduction needs to be set + // separately from the main predicate. Do not combine them like + // other expressions. + const auto& thread_pred = + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); + + auto grid_welford = IrBuilder::create( + indexed_wop, + out_var_buffer, + out_avg_buffer, + out_N_buffer, + sync_buffer, + entrance_ind, + n_entrances); + + grid_welford->setThreadPredicate(thread_pred); + + const bool block_reduce_separated = + out_domain->hasBlockReduction() && !indexed_wop->isAllreduce(); + + if (indexed_wop->predicate()) { + if (block_reduce_separated) { + grid_welford->setPredicate(IrBuilder::create( + GpuLower::current()->kernel()->trueVal())); + } else { + grid_welford->setPredicate(indexed_wop->predicate()); } + } - pushBack(out_var_buffer); - pushBack(out_avg_buffer); - pushBack(out_N_buffer); - pushBack(sync_buffer); - pushBack(grid_welford); + if (indexed_wop->writePredicate()) { + grid_welford->setWritePredicate(indexed_wop->writePredicate()); } - if (!is_block_reduce && !is_grid_reduce) { - pushBack(welford_op); + if (block_reduce_separated) { + pushBack(indexed_wop); + GpuLower::current()->propagateExprInfo(indexed_wop, back()); + } + + pushBack(out_var_buffer); + pushBack(out_avg_buffer); + pushBack(out_N_buffer); + pushBack(sync_buffer); + pushBack(grid_welford); + GpuLower::current()->propagateExprInfo(indexed_wop, back()); + + if (indexed_wop->isAllreduce()) { + // When using the fused reduction, allocate the reduction object at + // the outer-most scope + auto fused_reduction_alloc_reduction = + IrBuilder::create(grid_welford); + insertAtTopLevel(fused_reduction_alloc_reduction); } } -void IndexLowering::visit(const kir::BroadcastOp* bop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(bop)); +void IndexLowering::handle(const MmaOp* mma) { + const auto a = lowerSrcIndex(mma->inA(), mma->out()); + const auto b = lowerSrcIndex(mma->inB(), mma->out()); + const auto out = lowerDstIndex(mma->out()); + auto mma_indexed = + IrBuilder::create(out, a, b, mma->init(), mma->options()); + pushBack(mma_indexed); + GpuLower::current()->propagateExprInfo(mma, back()); +} - const auto out_tv = bop->out()->as(); +void IndexLowering::handle(const BroadcastOp* bop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(bop)); + + const auto out_tv = bop->out()->as(); const auto out = lowerDstIndex(bop->out()); const auto in = lowerSrcIndex(bop->in(), bop->out()); - auto indexed_expr = ir_builder_.create(out, in); + auto indexed_expr = + IrBuilder::create(out, in, bop->getBroadcastDimFlags()); const ParallelTypeBitmap parallel_bitmap = - GpuLower::current()->threadPredMap().getParallelBroadcastDomains( - 
out_tv->fuserTv()); + GpuLower::current()->threadPredMap().getParallelBroadcastDomains(out_tv); const bool block_x = parallel_bitmap.get(ParallelType::BIDx); const bool block_y = parallel_bitmap.get(ParallelType::BIDy); @@ -424,24 +763,19 @@ void IndexLowering::visit(const kir::BroadcastOp* bop) { const bool grid_broadcast_needed = block_x || block_y || block_z; if (!grid_broadcast_needed) { pushBack(indexed_expr); + GpuLower::current()->propagateExprInfo(bop, back()); return; } // Grid broadcast const auto out_domain = out_tv->domain(); - const auto broadcast_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridCommWorkBufferSize(ir_builder_, out_domain), - out->dtype(), - false); + const auto broadcast_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridCommWorkBufferSize(out_domain), out->dtype(), false); - const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); + const auto sync_buffer = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize(out_domain), DataType::Int, true); - auto grid_broadcast = ir_builder_.create( + auto grid_broadcast = IrBuilder::create( indexed_expr, broadcast_buffer, sync_buffer); if (bop->predicate()) { @@ -451,21 +785,27 @@ void IndexLowering::visit(const kir::BroadcastOp* bop) { pushBack(broadcast_buffer); pushBack(sync_buffer); pushBack(grid_broadcast); + GpuLower::current()->propagateExprInfo(bop, back()); } -void IndexLowering::visit(const kir::Allocate* allocate) { +void IndexLowering::handle(const kir::Allocate* allocate) { // TODO(kir): remove the need for const_cast pushBack(const_cast(allocate)); // NOLINT } -void IndexLowering::visit(const kir::Sync* sync) { +void IndexLowering::handle(const kir::BlockSync* sync) { + // TODO(kir): remove the need for const_cast + pushBack(const_cast(sync)); // NOLINT +} + +void IndexLowering::handle(const kir::GridSync* sync) { // TODO(kir): remove the need for const_cast - pushBack(const_cast(sync)); // NOLINT + pushBack(const_cast(sync)); // NOLINT } -void IndexLowering::generate(const std::vector& exprs) { +void IndexLowering::generate(const std::vector& exprs) { for (auto expr : exprs) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/lower_index.h b/torch/csrc/jit/codegen/cuda/lower_index.h index 5eb27c78f283..dfb14933770e 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.h +++ b/torch/csrc/jit/codegen/cuda/lower_index.h @@ -1,10 +1,10 @@ #pragma once -#include +#include #include #include -#include +#include #include #include @@ -14,10 +14,11 @@ namespace jit { namespace fuser { namespace cuda { -class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { +// TODO: Replace with mutator as IndexLowering is replacing expr's with +// versions that are doing indexing +class TORCH_CUDA_CU_API IndexLowering : private OptOutConstDispatch { public: - static std::vector getIndexedExprs( - std::vector incoming_exprs) { + static std::vector getIndexedExprs(std::vector incoming_exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::IndexLowering::getIndexedExprs"); IndexLowering il; il.generate(incoming_exprs); @@ -25,28 +26,56 @@ class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { } private: - IndexLowering(); + IndexLowering() = default; - void pushBack(kir::Expr*); + void pushBack(Expr*); - void visit(const kir::ForLoop*) final; - void visit(const kir::IfThenElse*) final; - void visit(const kir::UnaryOp*) final; - void visit(const 
kir::BinaryOp*) final; - void visit(const kir::TernaryOp*) final; - void visit(const kir::ReductionOp*) final; - void visit(const kir::WelfordOp*) final; - void visit(const kir::BroadcastOp*) final; - void visit(const kir::Allocate*) final; - void visit(const kir::Sync*) final; + // Return the most recently inserted + // expression in the current active + // scope or global scope. + Expr* back() const; - void generate(const std::vector& exprs); + // Insert an expression before the current top-level expression. + void insertAtTopLevel(Expr* expr); - kir::Val* lowerSrcIndex(kir::Val* val, kir::Val* dst) const; - kir::Val* lowerDstIndex(kir::Val* dst) const; + void handle(const ViewAsScalar*) final; + void handle(const UnaryOp*) final; + void handle(const BinaryOp*) final; + void handle(const TernaryOp*) final; + void handle(const ReductionOp*) final; + void handle(const GroupedReductionOp*) final; + void handle(const WelfordOp*) final; + void handle(const MmaOp*) final; + void handle(const BroadcastOp*) final; + + void handle(const kir::ForLoop*) final; + void handle(const kir::IfThenElse*) final; + void handle(const kir::Allocate*) final; + void handle(const kir::BlockSync*) final; + void handle(const kir::GridSync*) final; + + void generate(const std::vector& exprs); + + Val* lowerSrcIndex(Val* val, Val* dst) const; + + Val* lowerDstIndex(Val* dst) const; + + void handleBlockReduction(const ReductionOp* rop, Val* out, Val* in); + void handleGridReduction(const ReductionOp* rop, Val* out, Val* in); + + void handleBlockReduction( + const GroupedReductionOp* rop, + const std::vector& outputs, + const std::vector& inputs); + void handleGridReduction( + const GroupedReductionOp* rop, + const std::vector& outputs, + const std::vector& inputs); + + void handleGridWelford(WelfordOp* new_wop); private: - std::vector lowered_exprs_; + std::vector lowered_exprs_; // This is a slight work around as scope has a couple definitions, we have the // Scope that's in ForLoop/IfThenElse which is really just a wrapper around @@ -55,9 +84,10 @@ class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { // could be either the body or else body of the IfThenElse. However, we want // to understand the nesting of IfThenElse/ForLoop nodes. kir::Scope* active_scope_ = nullptr; - kir::Expr* active_scope_expr_ = nullptr; - kir::IrBuilder ir_builder_; + // Track for loops to send to indexing. Similar to what's done in + // kir::IrVisitor + std::vector for_loops_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp b/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp new file mode 100644 index 000000000000..309867477924 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp @@ -0,0 +1,338 @@ +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Return leaf domains of a given domain. 
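The helper that follows queries which leaf domains of a tensor a given domain feeds into. A toy model of that reachability query, with plain integers and an adjacency map standing in for IterDomains and the real DependencyCheck utility, might look like this.

#include <unordered_map>
#include <unordered_set>
#include <vector>

using Id = int;

// Collect the leaf ids reachable from indexed_id through the transformation
// graph (id -> ids derived from it).
std::unordered_set<Id> usedLeafIds(
    Id indexed_id,
    const std::unordered_map<Id, std::vector<Id>>& produces,
    const std::unordered_set<Id>& leaf_ids) {
  std::unordered_set<Id> used;
  std::vector<Id> stack{indexed_id};
  std::unordered_set<Id> visited;
  while (!stack.empty()) {
    Id id = stack.back();
    stack.pop_back();
    if (!visited.insert(id).second) {
      continue;
    }
    if (leaf_ids.count(id)) {
      used.insert(id);
    }
    auto it = produces.find(id);
    if (it != produces.end()) {
      stack.insert(stack.end(), it->second.begin(), it->second.end());
    }
  }
  return used;
}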
+std::unordered_set getUsedLeafIds( + IterDomain* id, + TensorDomain* td) { + const auto all_vals_between = DependencyCheck::getAllValsBetween( + {id}, {td->domain().begin(), td->domain().end()}); + + std::unordered_set used_leaf_ids; + + for (const auto leaf : td->domain()) { + if (std::find(all_vals_between.begin(), all_vals_between.end(), leaf) != + all_vals_between.end()) { + used_leaf_ids.insert(leaf); + } + } + + TORCH_INTERNAL_ASSERT( + !used_leaf_ids.empty(), + "No used id found: ", + id->toString(), + ", ", + td->toString()); + + return used_leaf_ids; +} + +} // namespace + +CommonIndexKey::CommonIndexKey( + IterDomain* consumer_indexed_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops) { + auto gpu_lower = GpuLower::current(); + + concrete_indexed_id_ = gpu_lower->caMap()->getConcreteMappedID( + consumer_indexed_id, IdMappingMode::EXACT); + + const auto consumer_leaf_ids = + getUsedLeafIds(consumer_indexed_id, consumer_td); + + // Convert to Parallel concrete IDs to find matching loops. + std::unordered_set concrete_leaf_ids; + for (auto& id : consumer_leaf_ids) { + concrete_leaf_ids.insert( + gpu_lower->caMap()->getConcreteMappedID(id, IdMappingMode::LOOP)); + } + + // Find used loops and their index vals + for (const auto i : c10::irange(loops.size())) { + auto loop = loops.at(i); + auto loop_id = gpu_lower->caMap()->getConcreteMappedID( + loop->iter_domain(), IdMappingMode::LOOP); + auto it = concrete_leaf_ids.find(loop_id); + if (it != concrete_leaf_ids.end()) { + // This leaf reference id is used for indexing the consumer id + used_loops_.push_back(loop); + auto index_it = ref_index_map.find(ref_td->axis(i)); + TORCH_INTERNAL_ASSERT( + index_it != ref_index_map.end(), + "Index not found for leaf ID, ", + ref_td->axis(i)->toString()); + loop_index_vals_.push_back(index_it->second); + } + } + + TORCH_INTERNAL_ASSERT( + !used_loops_.empty(), + "No loop used for indexing found. ", + consumer_indexed_id->toString()); + + TORCH_INTERNAL_ASSERT( + consumer_leaf_ids.size() == used_loops_.size(), + "consumer_leaf_ids.size() = ", + consumer_leaf_ids.size(), + ", used_loops_.size() == ", + used_loops_.size(), + ", loops.size() == ", + loops.size()); +} + +bool CommonIndexKey::operator==(const CommonIndexKey& other) const { + auto gpu_lower = GpuLower::current(); + + if (concrete_indexed_id_ != other.concrete_indexed_id_) { + return false; + } + + if (used_loops_.size() != other.used_loops_.size()) { + return false; + } + + // Check if both CommonIndexKeys use the same loops. If not, it's + // still valid to share the same hoisted index as long as: 1) each + // loop pair is mapped with the CA index map, and 2) they are not + // instantiated as actual loops. + for (const auto i : c10::irange(used_loops_.size())) { + auto lhs_loop = used_loops_.at(i); + auto rhs_loop = other.used_loops_.at(i); + if (lhs_loop == rhs_loop) { + continue; + } + if (gpu_lower->caMap()->areMapped( + lhs_loop->iter_domain(), + rhs_loop->iter_domain(), + IdMappingMode::EXACT) && + lhs_loop->isTrivial() && rhs_loop->isTrivial()) { + continue; + } + return false; + } + + for (const auto i : c10::irange(loop_index_vals_.size())) { + auto lhs_index = loop_index_vals_.at(i); + auto rhs_index = other.loop_index_vals_.at(i); + if (lhs_index == rhs_index) { + continue; + } + // Initial index variables can have some additions such as magic + // zero and "1" when used in producer indexing for double buffered + // tensors. 
Thus, the initial variables themselves may be + // different, and its components need to be examined. An easy way + // is to flatten them to strings as follows. + auto lhs_str = loop_index_vals_.at(i)->toInlineString(); + auto rhs_str = other.loop_index_vals_.at(i)->toInlineString(); + if (lhs_str == rhs_str) { + continue; + } + + return false; + } + + return true; +} + +std::string CommonIndexKey::toString() const { + TORCH_INTERNAL_ASSERT(concrete_indexed_id_ != nullptr); + std::stringstream ss; + ss << "CommonIndexKey: " << concrete_indexed_id_->toString(); + ss << ", { "; + for (auto loop : used_loops_) { + ss << loop->iter_domain()->toString() << " "; + } + ss << "}"; + ss << ", { "; + for (auto val : loop_index_vals_) { + ss << val->toString() << " "; + } + ss << "}"; + return ss.str(); +} + +std::pair CommonIndexMap::insert( + IterDomain* indexed_consumer_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops, + Val* index) { + if (index->definition() == nullptr) { + // Only expression is eligible to hoist + return {index, false}; + } + + const CommonIndexKey key( + indexed_consumer_id, consumer_td, ref_td, ref_index_map, loops); + + Val* hoisted_index = nullptr; + bool new_index_inserted = false; + + // If already mapped, return the previously mapped index + auto it = common_index_map_.find(key); + if (it != common_index_map_.end()) { + hoisted_index = it->second; + new_index_inserted = false; + ++use_counts_.at(key); + } else { + common_index_map_.emplace(key, index); + hoisted_index = index; + new_index_inserted = true; + use_counts_[key] = 1; + } + + return {hoisted_index, new_index_inserted}; +} + +namespace { + +//! Insertion point of allocation +struct CommonIndexInsertionInfo { + Expr* ref = nullptr; + kir::Scope* scope = nullptr; +}; + +// Inserts allocations of hoisted indices +class CommonIndexInserter : private kir::ExprMutator { + public: + static std::vector run( + const std::vector& exprs, + const CommonIndexMap& common_indices) { + CommonIndexInserter inserter(exprs, common_indices); + return inserter.exprs_; + } + + private: + CommonIndexInserter( + const std::vector& exprs, + const CommonIndexMap& common_index_map) + : common_index_map_(common_index_map) { + // Create a map to keys from loops where they should be inserted + for (const auto& kv : common_index_map.commonIndexMap()) { + const auto& key = kv.first; + // Only consider indices used multiple times + if (!usedMultipleTimes(key)) { + continue; + } + TORCH_INTERNAL_ASSERT(!key.usedLoops().empty()); + auto insertion_loop = key.usedLoops().back(); + innermost_used_loop_map_[insertion_loop].push_back(key); + } + + traverseAndInsert(exprs); + } + + CommonIndexInsertionInfo findInsertionPoint( + const CommonIndexKey& key, + kir::ForLoop* current_loop) const { + CommonIndexInsertionInfo info; + + // Allocation must be inside any used non-trivial loop. Since the + // loop index value is constant if a loop is trivial, allocation + // does not need to be inside trivial loops. + for (const auto loop : key.usedLoops()) { + if (!loop->isTrivial()) { + info.ref = loop->body()[0]; + info.scope = &(loop->body()); + } + } + + // If no non-trivial used loop is found, insert at the top-level + // scope just before the outer-most loop. + if (info.ref == nullptr) { + info.ref = scope_exprs_.empty() ? 
current_loop : scope_exprs_.at(0); + info.scope = nullptr; + } + + return info; + } + + using kir::ExprMutator::handle; + + void handle(kir::ForLoop* loop) final { + auto innermost_loop_map_it = innermost_used_loop_map_.find(loop); + if (innermost_loop_map_it == innermost_used_loop_map_.end()) { + kir::ExprMutator::handle(loop); + return; + } + + for (const auto& key : innermost_loop_map_it->second) { + auto common_index = common_index_map_.commonIndexMap().at(key); + + // Insert only when the index is used multiple times and is not + // yet inserted. + if (inserted_indices_.find(common_index) != inserted_indices_.end()) { + continue; + } + + // Make the type of the hoisted index be the index type of the + // kernel, which can be either int64_t or int. Not very clean, + // but this seems to be the quickest way to use the index type + // as we don't have a scalar IR node for the index type. + common_index->resolveIndexDtype(); + + auto alloc = IrBuilder::create( + common_index, + MemoryType::Local, + GpuLower::current()->kernel()->oneVal()); + const auto common_index_def = common_index->definition(); + TORCH_INTERNAL_ASSERT( + common_index_def != nullptr, + "Hoisted index must have a definition. ", + common_index->toString()); + + const auto insertion_info = findInsertionPoint(key, loop); + registerInsertBefore(insertion_info.ref, alloc, insertion_info.scope); + registerInsertBefore( + insertion_info.ref, common_index_def, insertion_info.scope); + + // Track inserted index + inserted_indices_.emplace(common_index); + } + + kir::ExprMutator::handle(loop); + } + + bool usedMultipleTimes(const CommonIndexKey& key) { + auto it = common_index_map_.useCounts().find(key); + TORCH_INTERNAL_ASSERT( + it != common_index_map_.useCounts().end(), + "Key not found in the use-count map: ", + key.toString()); + return it->second > 1; + } + + private: + const CommonIndexMap& common_index_map_; + //! Map to CommonIndexKeys from their innermost used loops + std::unordered_map> + innermost_used_loop_map_; + //! Keep track of inserted indices + std::unordered_set inserted_indices_; +}; + +} // namespace + +std::vector allocateCommonIndices(const std::vector& exprs) { + return CommonIndexInserter::run(exprs, GpuLower::current()->commonIndexMap()); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_index_hoist.h b/torch/csrc/jit/codegen/cuda/lower_index_hoist.h new file mode 100644 index 000000000000..5e0256f9e844 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_index_hoist.h @@ -0,0 +1,121 @@ +#pragma once + +#include + +#include +#include +#include + +// Hoisting common index subexpressions +// +// Class CommonIndexMap is updated during the lowering as new indices +// are inserted. An index is uniquely identified with CommonIndexKey, +// which consists of the concrete ID of the indexed/predicated domain, +// the for-loops used in the index, and the index vals of the used +// for-loops. +// +// Once all indices are inserted to CommonIndexMap, allocations of +// the hoisted indices are inserted by allocateCommonIndices. Note +// that this assumes that the CUDA code generator does not inline a +// scalar Val with allocation (PR #1434). + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Class to represent unique indexed domains for index +//! hoisting. Uniqueness is determined with the indexed domain +//! itself, the for-loops and their index values. 
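A toy model of the hoisting map sketched in the comments above, where a key hashes only on the indexed domain and insert() either returns the previously registered index or records a new one; Key, KeyHash and IndexHoistMap are illustrative stand-ins, not the real classes.

#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>

struct Key {
  int concrete_indexed_id;           // concrete domain being indexed
  std::string loop_index_signature;  // flattened loop/index-val identity
  bool operator==(const Key& o) const {
    return concrete_indexed_id == o.concrete_indexed_id &&
        loop_index_signature == o.loop_index_signature;
  }
};

struct KeyHash {
  std::size_t operator()(const Key& k) const {
    // Hash only on the indexed id; keys sharing an id are disambiguated by
    // operator== on the remaining fields.
    return std::hash<int>{}(k.concrete_indexed_id);
  }
};

class IndexHoistMap {
 public:
  // Returns {hoisted index, true if newly inserted}.
  std::pair<std::string, bool> insert(const Key& key, std::string index) {
    auto it = map_.find(key);
    if (it != map_.end()) {
      ++use_counts_[key];
      return {it->second, false};
    }
    map_.emplace(key, index);
    use_counts_[key] = 1;
    return {index, true};
  }

 private:
  std::unordered_map<Key, std::string, KeyHash> map_;
  std::unordered_map<Key, int, KeyHash> use_counts_;
};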
+class CommonIndexKey { + friend struct CommonIndexKeyHash; + + public: + //! \param consumer_indexed_id Indexed consumer domain + //! \param consumer_td TensorDomain of consumer_indexed_id + //! \param ref_td Reference domain at the time of indexing + //! \param ref_index_map Index map of the reference domain + //! \param loops Loop structure where this id is indexed + CommonIndexKey( + IterDomain* consumer_indexed_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops); + + const IterDomain* concreteIndexedId() const { + return concrete_indexed_id_; + } + + const std::vector& usedLoops() const { + return used_loops_; + } + + const std::vector& loopIndexVals() const { + return loop_index_vals_; + } + + bool operator==(const CommonIndexKey& other) const; + + std::string toString() const; + + private: + //! Concrete domain of indexed domain + IterDomain* concrete_indexed_id_ = nullptr; + //! Loops used for the index + std::vector used_loops_; + //! Loop index vals for the used loops + std::vector loop_index_vals_; +}; + +struct CommonIndexKeyHash { + std::size_t operator()(const CommonIndexKey& key) const { + auto h = std::hash{}(key.concrete_indexed_id_); + // NOTE: do not use other fields as the pointers can be different + // even when two keys can share the same index + return h; + } +}; + +//! Map to hold hoisted common indices +class TORCH_CUDA_CU_API CommonIndexMap { + public: + //! Register an indexd consumer domain to hoist + //! + //! Returns a corresponding hoisted index and a flag indicating if a + //! new index is inserted. + //! + //! Consumer domains are used even for producer indexing since + //! producer domains in producer indexing are temporary replay + //! domains. + std::pair insert( + IterDomain* indexed_consumer_id, + TensorDomain* consumer_td, + TensorDomain* ref_td, + const std::unordered_map& ref_index_map, + const std::vector& loops, + Val* index); + + const auto& commonIndexMap() const { + return common_index_map_; + } + + const auto& useCounts() const { + return use_counts_; + } + + private: + //! Map to hold hoisted common indices + std::unordered_map + common_index_map_; + std::unordered_map use_counts_; +}; + +//! Insert allocations of hoisted indices. Must be called after +//! collecting all common indices. +std::vector allocateCommonIndices(const std::vector& exprs); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp index 0947ef0f5790..34f3068d0699 100644 --- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp @@ -1,8 +1,8 @@ #include #include +#include #include -#include -#include +#include #include #include @@ -33,8 +33,8 @@ class SmemAllocMap { public: //! Insert a new node if it's a SMEM allocation void insert(kir::Allocate* alloc) { - if (auto tv = dynamic_cast(alloc->buffer())) { - if (tv->memoryType() == MemoryType::Shared) { + if (auto tv = dynamic_cast(alloc->buffer())) { + if (tv->getMemoryType() == MemoryType::Shared) { // Note that a TensorView can have two allocations due to // unswitch. auto p = map_.insert({tv, alloc}); @@ -50,290 +50,313 @@ class SmemAllocMap { } } - //! Get the buffer that is actually allocated for a given TV - kir::TensorView* getRealBuffer(kir::TensorView* tv) const { + //! 
Run through aliases to get the buffer that is actually allocated for a + //! given TV + TensorView* getRealBuffer(TensorView* tv) const { auto it = map_.find(tv); TORCH_INTERNAL_ASSERT( - it != map_.end(), "Allocation not found for ", kir::toString(tv)); + it != map_.end(), "Allocation not found for ", tv->toString()); const kir::Allocate* alloc = it->second; while (alloc->alias()) { alloc = alloc->alias(); } auto buf = alloc->buffer(); - TORCH_INTERNAL_ASSERT(buf->isA()); - return buf->as(); + TORCH_INTERNAL_ASSERT(buf->isA()); + return buf->as(); } private: - std::unordered_map map_; + std::unordered_map map_; }; -//! Insert WAR sync for a given ForLoop -class LocalSyncInserterForLoop { - using TvSet = std::unordered_set; +struct WarMemoryInfo { + // True if there's a sync after the last read within the alloc loop. + bool sync_after_read = false; - public: - //! Insert Sync nodes at the end of a given for-loop when a WAR - //! hazard may happen. - LocalSyncInserterForLoop(kir::ForLoop* fl, SmemAllocMap& alloc_map) - : alloc_map_(alloc_map) { - for (auto expr : fl->body().exprs()) { - handle(expr); - } + // True if there's a sync before the first write. There can be multiple writes + // from memory aliasing. + bool sync_before_write = false; - // No need to insert sync when the loop is not actually generated - if (fl->iter_domain()->isThread() || fl->iter_domain()->isBroadcast()) { - return; - } - - // Determine if any smem TV is written to at beginning of the for-loop - // and whether that smem TV is read from at the end of the for-loop - // Insert new SyncThreads at end of for-loop to prevent WAR race condition - // - // TODO: replace __syncthreads with __threadfence for alias ops - // - if (detectIntersection(initial_, final_) && - !fl->body().exprs().back()->isA() && !is_last_op_sync_) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - fl->body().push_back(ir_builder.create(true)); - initial_sync_ = true; - is_last_op_sync_ = true; - final_.clear(); - } - } + // Has there been a read of this memory location + bool read_hit = false; - const auto& initial() const { - return initial_; - } + // Has there been *the* write to this memory location, assumes single write + // instruction (needs to be before conditionals added to code) + bool write_hit = false; - const auto& final() const { - return final_; - } + // For loop this TV is compute_at'ed in. + kir::ForLoop* ca_loop = nullptr; +}; - const auto& all_smem_inputs() const { - return all_smem_inputs_; +// To prevent shared memory from being over written before it is read, a +// synchronization point has to be inserted either between the allocation of an +// SMEM buffer and where we write into it, or after the buffer's last read +// before exiting the allocation's scope. +// +// e.g. +// for i: +// "alloc A" in shared memory - This is really marked by the compute_at point +// sync_loc_0 +// for j: +// sync_loc_1 +// for k: +// sync_loc_2 +// A = ... +// for k: +// ... = ... A +// for j: +// for k: +// ... = ... A +// sync_loc_3 +// sync_loc_4 +// sync_loc_5 +// +// All sync locations here provide valid protection that memory in A is finished +// being read before it is over written in the next iteration +// +// Insertion of sync threads will be done from the inner most position to the +// outer most. If a sync protecting the buffer is not already placed, the +// location prefered for the sync threads is the last possible position. 
One +// future optimization could be to not sync on the last iteration of the loop +// the sync is placed in. +class WarSyncInserter : private kir::ExprMutator { + public: + static std::vector insert(const std::vector& exprs) { + WarSyncInserter inserter(exprs); + return inserter.exprs_; } - const auto& all_smem_outputs() const { - return all_smem_outputs_; + private: + //! Insert Sync nodes at the end of a given for-loop when a WAR + //! hazard may happen. + WarSyncInserter(const std::vector& exprs) { + auto& lower_alloc_info_map = GpuLower::current()->localAllocationInfoMap(); + for (const auto& entry : lower_alloc_info_map) { + alloc_map_.insert(entry.first); + } + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::Expr* expr) { - if (ir_utils::isTVOp(expr)) { - is_last_op_sync_ = false; - - // For this SyncInserter - if (initial_sync_) { - addInputSmemTvs(expr, final_); - } else { - addInputSmemTvs(expr, final_); - addOutputSmemTvs(expr, initial_); + void handle(kir::IfThenElse* ite) final { + TORCH_INTERNAL_ASSERT( + ite->elseBody().empty(), + "Pass does not support conditional flow,", + " needs to be done before conditional execution is lowered."); + kir::ExprMutator::handle(ite); + } + + void handle(kir::BlockSync* sync) final { + // Register the sync for the active for loop + sync_hit_.back() = true; + // Run through the active allocations, if a read was hit, register there was + // a sync after the read. If there's subsequent reads on this buffer the + // sync_after_read will be cleared. + for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.back().read_hit) { + alloc_stack.back().sync_after_read = true; } - - // For parent SyncInserter - addOutputSmemTvs(expr, all_smem_outputs_); - addInputSmemTvs(expr, all_smem_inputs_); - } else if (auto sync = dynamic_cast(expr)) { - handle(sync); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto alloc = dynamic_cast(expr)) { - alloc_map_.insert(alloc); } } - void handle(kir::Sync* sync) { - is_last_op_sync_ = true; - initial_sync_ = true; - final_.clear(); - } - - void handle(kir::IfThenElse* ite) { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + void handle(kir::GridSync* sync) final { + // Register the sync for the active for loop + sync_hit_.back() = true; + // Run through the active allocations, if a read was hit, register there was + // a sync after the read. If there's subsequent reads on this buffer the + // sync_after_read will be cleared. + for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.back().read_hit) { + alloc_stack.back().sync_after_read = true; + } } } - void handle(kir::ForLoop* fl) { - LocalSyncInserterForLoop child_sync_inserter(fl, alloc_map_); - - const auto& child_inputs = child_sync_inserter.all_smem_inputs(); - const auto& child_outputs = child_sync_inserter.all_smem_outputs(); - const bool maybe_skipped = !fl->start()->isZeroInt() && - !isParallelTypeThread(fl->iter_domain()->parallelType()); - - // Default - Track all smem inputs / outputs - all_smem_inputs_.insert(child_inputs.begin(), child_inputs.end()); - all_smem_outputs_.insert(child_outputs.begin(), child_outputs.end()); - - // Propagate the last_op_sync flag from the child loop. 
If the - // child is deterministically executed at least once, just set the - // flag with the child flag. Otherwise, conservatively set the - // flag, i.e., if the current flag is true and the child flag is - // also true, we can say the last op is still sync. - if (!maybe_skipped) { - is_last_op_sync_ = child_sync_inserter.is_last_op_sync_; - } else { - is_last_op_sync_ = - is_last_op_sync_ && child_sync_inserter.is_last_op_sync_; + // Checks if fl or loops within it have hit a sync + bool syncWithin(kir::ForLoop* fl) { + // If outer most scope check the first sync_hit_ position + if (fl == nullptr) { + return sync_hit_[0]; } - // When the child is not guaranteed to have sync. - if (!child_sync_inserter.initial_sync_) { - // If no sync is yet found, add the child outputs to - // initial. - if (!initial_sync_) { - initial_.insert(child_outputs.begin(), child_outputs.end()); - } - // Add the child inputs to final even when inital_sync is false, - // which only means sync may not be found yet. - final_.insert(child_inputs.begin(), child_inputs.end()); - } else { - // Similar to the above case, but here, the child is guaranteed - // to have sync, so we only need to look at initial and final. - if (!initial_sync_) { - initial_.insert( - child_sync_inserter.initial().begin(), - child_sync_inserter.initial().end()); - } - if (!maybe_skipped) { - initial_sync_ = true; - final_.clear(); - } - final_.insert( - child_sync_inserter.final().begin(), - child_sync_inserter.final().end()); - } - } + // Find the for loop we want to look within + auto fl_it = std::find(for_loops_.begin(), for_loops_.end(), fl); - static bool detectIntersection(const TvSet& left, const TvSet& right) { - for (auto item : left) { - if (right.find(item) != right.end()) { + // Convert it to an index, but add one for the outer most scope + auto fl_i = std::distance(for_loops_.begin(), fl_it) + 1; + + // Start at that index and see if there's syncs within that for loop + for (auto i : c10::irange(fl_i, sync_hit_.size())) { + if (sync_hit_[i]) { return true; } } return false; } - void addOutputSmemTvs(const kir::Expr* expr, TvSet& set) { - for (auto out : expr->outputs()) { - if (auto tv = dynamic_cast(out)) { - if (tv->memoryType() == MemoryType::Shared) { - auto real_tv = alloc_map_.getRealBuffer(tv); - set.insert(real_tv); - } - } + void handle(Expr* expr) final { + // If not a tensor view expression continue with dispatch + if (!ir_utils::isTvOp(expr)) { + kir::ExprMutator::handle(expr); + return; } - } - void addInputSmemTvs(const kir::Expr* expr, TvSet& set) { - for (auto in : expr->inputs()) { - if (auto tv = dynamic_cast(in)) { - if (tv->memoryType() == MemoryType::Shared) { - auto real_tv = alloc_map_.getRealBuffer(tv); - set.insert(real_tv); - } + // Mark write has been hit for all output tvs + auto out_tvs = ir_utils::filterByType(expr->outputs()); + for (auto out_tv : out_tvs) { + if (out_tv->getMemoryType() != MemoryType::Shared || + GpuLower::current()->syncMap().needsRawSync(out_tv).none()) { + continue; } - } - } - private: - //! Allocation map of SMEM buffers - SmemAllocMap& alloc_map_; + auto& entry = getMemInfo(out_tv); - //! Track Shared Memory Inputs (Reads) for parent for-loop - TvSet all_smem_inputs_; + // If this is the first write and there's a sync in one of the loops after + // the compute at loop, then this buffer is protected. + if (syncWithin(entry.ca_loop) && !entry.write_hit) { + entry.sync_before_write = true; + } + entry.write_hit = true; + } - //! 
Track Shared Memory Outputs (Writes) for parent for-loop - TvSet all_smem_outputs_; + // Mark read was hit, if sync_after_read was set, clear it. + auto inp_tvs = ir_utils::filterByType(expr->inputs()); + for (auto inp_tv : inp_tvs) { + if (inp_tv->getMemoryType() != MemoryType::Shared || + GpuLower::current()->syncMap().needsRawSync(inp_tv).none()) { + continue; + } - //! Shared Memory Writes at beginning of the for-loop - //! before first SyncThreads - TvSet initial_; + auto& entry = getMemInfo(inp_tv); + entry.read_hit = true; + // Clear the sync_after_read if it was set because there was another write + entry.sync_after_read = false; + } + } - //! Shared Memory Reads at end of the for-loop - //! Cleared after each SyncThreads - TvSet final_; + void handle(kir::ForLoop* for_loop) final { + // Push loop scope information + auto prev_within_iter_loop_ = within_iter_loop_; + sync_hit_.push_back(false); - //! Track first sync deterministically found in for-loop. Even when a - //! child loop has a sync, if it may not be executed due to non-zero - //! start value, this flag remains false. - bool initial_sync_ = false; + // If there is no real iterating loop WAR syncs aren't necessary + within_iter_loop_ = within_iter_loop_ || !for_loop->isTrivial(); - //! Track if last op is sync - bool is_last_op_sync_ = false; -}; + // Process the expressions in the for loop + kir::ExprMutator::handle(for_loop); -class LocalSyncInserter { - public: - //! Write-After-Read race conditions are only found within for-loops. - //! Sync nodes are inserted directly into the for-loops. - //! The expressions are modified in-place and exprs is const. - static void insertSyncs(const std::vector& exprs) { - LocalSyncInserter inserter; - inserter.insert(exprs); - } + // Sync analysis and cleanup: + // + // Pop for loop stack inside WarMemoryInfo structs if they match this one. + // Erase empty entries so we don't continue to search over them + // + // Insert sync at end of this for loop if any of the entries require + std::vector to_erase; + bool insert_sync = false; + for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.size() && alloc_stack.back().ca_loop == for_loop) { + if (!alloc_stack.back().sync_after_read && + !alloc_stack.back().sync_before_write) { + insert_sync = within_iter_loop_; + } - private: - void insert(const std::vector& exprs) { - for (auto expr : exprs) { - if (auto fl = dynamic_cast(expr)) { - LocalSyncInserterForLoop sync_inserter(fl, alloc_map_); - } else if (auto ite = dynamic_cast(expr)) { - insert(ite->thenBody().exprs()); - insert(ite->elseBody().exprs()); - } else if (auto alloc = dynamic_cast(expr)) { - alloc_map_.insert(alloc); + alloc_stack.pop_back(); + if (alloc_stack.empty()) { + to_erase.push_back(entry.first); + } } } - } - private: + for (auto tv : to_erase) { + smem_allocations_.erase(tv); + } + + // WAR Sync is necessary in this loop, register its insertion. 
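A compact model of the decision made just above, i.e. whether a trailing block sync must be emitted for the loop that owns a shared-memory buffer; BufferState stands in for WarMemoryInfo and none of the real kir classes are used.

#include <vector>

struct BufferState {
  bool sync_after_read = false;   // a sync was seen after the last read
  bool sync_before_write = false; // a sync was seen before the first write
};

// Returns true if a __syncthreads() has to be placed at the end of the loop.
bool needsWarSyncAtLoopEnd(
    const std::vector<BufferState>& buffers_owned_by_loop,
    bool within_iterating_loop) {
  if (!within_iterating_loop) {
    return false;  // no real iteration around the buffer, nothing gets overwritten
  }
  for (const auto& b : buffers_owned_by_loop) {
    if (!b.sync_after_read && !b.sync_before_write) {
      return true;  // unprotected buffer: next iteration could overwrite it
    }
  }
  return false;
}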
+ if (insert_sync) { + auto sync_expr = IrBuilder::create(true); + kir::ExprMutator::registerInsertAfter( + for_loop->body().exprs().back(), sync_expr, &for_loop->body()); + handle(sync_expr); + } + + // Pop for loop scope information + sync_hit_.pop_back(); + within_iter_loop_ = prev_within_iter_loop_; + } + + // Create a new WarMemoryInfo entry if required and return a reference to it, + // else return the WarMemoryInfo associated with tv + WarMemoryInfo& getMemInfo(TensorView* tv) { + auto maybe_aliased_tv = alloc_map_.getRealBuffer(tv); + auto alloc_it = smem_allocations_.find(maybe_aliased_tv); + auto ca_loop = + loop_utils::getAllocInformation(tv, for_loops_).init_for_loop; + if (alloc_it == smem_allocations_.end()) { + WarMemoryInfo mem_info; + mem_info.ca_loop = ca_loop; + auto entry_it = + smem_allocations_ + .insert(std::make_pair( + maybe_aliased_tv, std::vector({mem_info}))) + .first; + return entry_it->second.back(); + } else if ( + maybe_aliased_tv != tv && alloc_it->second.back().ca_loop != ca_loop) { + WarMemoryInfo mem_info; + mem_info.ca_loop = ca_loop; + auto& alloc_stack = alloc_it->second; + alloc_stack.push_back(mem_info); + return alloc_stack.back(); + } + return alloc_it->second.back(); + } + + //! Allocation map of SMEM buffers. Needed because of SMEM buffer aliasing, + //! need to track the root of the alias to properly insert WAR hazard syncs SmemAllocMap alloc_map_; + + //! Is there a loop nest that has a non-trivial iteration (extent != 1) and + //! not bound to a block/thread. This indicates if a WAR sync is necessary, + //! otherwise the Expr is not in an iterating for loop. + bool within_iter_loop_ = false; + + // Track which loops have hit a sync. Used to see if there's a sync before + // write. + std::vector sync_hit_ = {false}; + + // Keep track of the active allocations we need to protect. Key is the + // "getRealBuffer", not the raw tv. There can be multiple WarMemoryInfo's + // because of aliasing. If the "getRealBuffer" tv has a compute at outside the + // alias tv, each aliased tv in a unique ca_loop has to be tracked separately + // for WAR insertion. + std::unordered_map> smem_allocations_; }; class ExprFlattener : private kir::IrVisitor { private: - void handle(kir::Expr* expr) { + using kir::IrVisitor::handle; + + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + kir::IrVisitor::handle(expr); } else { - exprs_.push_back(expr); - } - } - - void visit(const kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - handle(expr); - } - } - - void visit(const kir::IfThenElse* ite) final { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + flat_exprs_.push_back(expr); } } private: - std::vector exprs_; + std::vector flat_exprs_; public: //! Flattens scopes extracting out a single ordered list of exprs. - static std::vector flatten( - const std::vector& loop_nests) { + static std::vector flatten(const std::vector& loop_nests) { ExprFlattener flattener; for (auto expr : loop_nests) { flattener.handle(expr); } - return flattener.exprs_; + return flattener.flat_exprs_; } }; @@ -342,53 +365,70 @@ class ValidatePlacementAfterWrites : private kir::IrVisitor { //! 
Validate no expr in writes found under loop static void validate( kir::ForLoop* loop, - const std::unordered_set& writes) { + const std::unordered_set& writes) { ValidatePlacementAfterWrites validator(writes); validator.handle(loop); } private: - ValidatePlacementAfterWrites(const std::unordered_set& writes) + using kir::IrVisitor::handle; + + ValidatePlacementAfterWrites(const std::unordered_set& writes) : writes_(writes) {} - void handle(kir::Expr* expr) { + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + kir::IrVisitor::handle(expr); } else { TORCH_INTERNAL_ASSERT( writes_.find(expr) == writes_.end(), "Block sync must be placed after ", - kir::toString(expr)); - } - } - - void visit(const kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - handle(expr); - } - } - - void visit(const kir::IfThenElse* ite) final { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + expr->toString()); } } private: - const std::unordered_set& writes_; + const std::unordered_set& writes_; }; -class ReadAfterWriteSyncs : public kir::MutableIrVisitor { +namespace { + +Val* getGridSyncBufferSize(const ParallelTypeBitmap& ptb) { + // See the comment above for getGridCommWorkBufferSize. + TORCH_INTERNAL_ASSERT( + ptb.hasBID(), + "Detected needing a grid sync but no grid bits set in bitmap."); + Val* buffer_size = GpuLower::current()->kernel()->oneVal(); + for (auto pt : kParallelTypeBIDs) { + // Synchronized within pt, so all blocks of this PT use the same + // sync buffer location, and thus no need to expand the sync + // buffer size. + if (ptb.get(pt)) { + continue; + } + auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); + if (pt_dim == nullptr || pt_dim->isOneInt()) { + continue; + } + buffer_size = IrBuilder::mulExpr(buffer_size, pt_dim); + } + return buffer_size; +} + +} // namespace + +class ReadAfterWriteSyncs : public kir::ExprMutator { private: + using kir::ExprMutator::handle; + //! Traverse up the loop stack from loops_it and if a halo loop is //! found, place a given sync expr before the outer-most halo loop. + // TODO: What needs to be done here for gmem comm? 
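The halo handling described in the comment above can be modeled roughly as follows: walk from the loop the sync would normally go into toward the outer loops and, if any of them is halo-extended, hoist the sync to just before the outer-most such loop. Loop and haloHoistPosition are hypothetical stand-ins, not the real kir::ForLoop API.

#include <cstddef>
#include <optional>
#include <vector>

struct Loop {
  bool halo_extended = false;
};

// Returns the index of the outer-most halo-extended loop at or outside
// `start`, if any; the sync is then placed immediately before that loop
// instead of inside it. loop_stack is ordered outer -> inner.
std::optional<std::size_t> haloHoistPosition(
    const std::vector<Loop>& loop_stack, std::size_t start) {
  for (std::size_t i = 0; i <= start && i < loop_stack.size(); ++i) {
    if (loop_stack[i].halo_extended) {
      return i;  // first hit from the outside is the outer-most halo loop
    }
  }
  return std::nullopt;
}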
bool insertBeforeHaloLoop( std::vector::iterator loops_it, - kir::Sync* sync_expr, - const std::unordered_set& writes) { + Expr* sync_expr, + Expr* maybe_alloc, + const std::unordered_set& writes) { std::vector::iterator halo_loop_it; bool halo_loop_found = false; @@ -420,131 +460,159 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { if (halo_loop_it == for_loops_.begin()) { // place in global scope - auto place_before_it = - std::find(loop_nests_.begin(), loop_nests_.end(), halo_loop); - TORCH_INTERNAL_ASSERT(place_before_it != loop_nests_.end()); - loop_nests_.insert(place_before_it, sync_expr); + auto place_before_it = std::find(exprs_.begin(), exprs_.end(), halo_loop); + TORCH_INTERNAL_ASSERT(place_before_it != exprs_.end()); + exprs_.insert(place_before_it, sync_expr); } else { auto place_in = *(halo_loop_it - 1); - place_in->body().insert_before(halo_loop, sync_expr); + kir::ExprMutator::registerInsertBefore( + halo_loop, sync_expr, &place_in->body()); + if (maybe_alloc != nullptr) { + kir::ExprMutator::registerInsertBefore( + halo_loop, maybe_alloc, &place_in->body()); + } } return true; } - void handle(kir::Expr* expr) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { - expr->accept(this); + void handle(Expr* expr) final { + if (!ir_utils::isTvOp(expr) || expr->isA()) { + kir::ExprMutator::handle(expr); return; } - if (sync_after_.size() > 0 && sync_after_.front() == expr) { - sync_after_.pop_front(); + if (sync_before_.size() > 0 && sync_before_.front().first == expr) { + auto sync_bitmap = sync_before_.front().second; + sync_before_.pop_front(); auto last_writes = last_writes_.front(); last_writes_.pop_front(); // Found that a sync is needed - TORCH_INTERNAL_ASSERT(expr->outputs()[0]->isA()); - auto out_tv = expr->outputs()[0]->as(); // Find where a sync needs to be inserted // This is very similar to how allocations are placed, simply place sync - // after the expression instead of placing like allocation where it goes - // before. - // TODO: This may be a common operation, could be worth making a utility - // out of or saving state for tensor view ID -> for loop + // before the expression at the common alloc point of producers (really + // last_writes because we may have other exprs we're syncing besides the + // producers of this one) // TODO: Explicitly test the 3 cases below + Expr* sync_expr = nullptr; + kir::Allocate* maybe_alloc = nullptr; + if (sync_bitmap.hasBID()) { + maybe_alloc = ir_utils::allocGlobalBufferForGridComm( + getGridSyncBufferSize(sync_bitmap), DataType::Int, true); + sync_expr = IrBuilder::create( + sync_bitmap, maybe_alloc->buffer()); + } else { + sync_expr = IrBuilder::create(); + } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto sync_expr = ir_builder.create(); - if (out_tv->fuserTv()->getComputeAtPosition() == 0) { - // Sync should be placed at global scope, after its outer most loop if - // it has one. - kir::Expr* place_after = for_loops_.size() > 0 ? for_loops_[0] : expr; - // Find location in loop_nests_ - auto place_after_it = - std::find(loop_nests_.begin(), loop_nests_.end(), place_after); + // The expressions in last_writes are those we're protecting the read + // from. To figure out which loop we need a syncthread in, take the inner + // most compute at for loop of all the outputs of the last writes. 
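A small stand-in for the placement rule described in the comment above: each last write contributes the loop matching its compute-at position, and the sync is nested in the deepest such loop present in the current loop nest, falling back to global scope when there is none. Indices into the loop nest play the role of kir::ForLoop pointers here.

#include <cstddef>
#include <optional>
#include <vector>

// Returns the index (into the current loop nest) of the loop the sync must be
// nested in, or std::nullopt for global scope.
std::optional<std::size_t> syncPlacementDepth(
    const std::vector<std::optional<std::size_t>>& producer_ca_loop_indices) {
  std::optional<std::size_t> deepest;
  for (const auto& idx : producer_ca_loop_indices) {
    if (!idx.has_value()) {
      continue;  // producer computed at global scope, imposes no nesting
    }
    if (!deepest.has_value() || *idx > *deepest) {
      deepest = idx;
    }
  }
  return deepest;
}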
+ std::unordered_set sync_within; + + for (auto last_write : last_writes) { + auto write_out_tv = ir_utils::getTvOutput(last_write); TORCH_INTERNAL_ASSERT( - place_after_it != loop_nests_.end(), - "Could not figure out where to place synchronization. ", - "Tried to place after, ", - toString(place_after), - ", but could not find this expression at the global scope."); - loop_nests_.insert(place_after_it + 1, sync_expr); - } else { - // Find the last loop in computeAt of out_tv, this is the loop where we - // would place an allocation for out_tv - auto fuser_tv = out_tv->fuserTv(); - auto lowered_local_id = - GpuLower::current() - ->lowerValue(fuser_tv->axis( - (int)out_tv->fuserTv()->getComputeAtPosition() - 1)) - ->as(); + write_out_tv != nullptr, + "Error in RAW sync insertion, expecting a TV expr, but didn't find one."); + if (write_out_tv->getComputeAtPosition() == 0) { + continue; + } + + auto local_id = + write_out_tv->axis((int)write_out_tv->getComputeAtPosition() - 1); auto loops_it = std::find_if( for_loops_.begin(), for_loops_.end(), - [&lowered_local_id](const auto& loop) { - return GpuLower::current()->caLoopMap().areMapped( - loop->iter_domain(), lowered_local_id) || - loop->iter_domain()->parallelType() == ParallelType::Unroll; + [&local_id](const auto& loop) { + return GpuLower::current()->caMap()->areMapped( + loop->iter_domain(), local_id, IdMappingMode::PERMISSIVE); }); - TORCH_INTERNAL_ASSERT(loops_it != for_loops_.end()); + TORCH_INTERNAL_ASSERT( + loops_it != for_loops_.end(), + "Could not find loop associated with the alloc position of ", + write_out_tv->toString()); + + sync_within.emplace(*loops_it); + } + + // The for loop the sync needs to be in + kir::ForLoop* sync_within_fl = nullptr; + for (auto fl : for_loops_) { + if (sync_within.count(fl)) { + sync_within_fl = fl; + } + } + + if (sync_within_fl == nullptr) { + // Sync should be placed at global scope, after its outer most loop if + // it has one. + Expr* place_before = for_loops_.size() > 0 ? for_loops_[0] : expr; + // Find location in exprs_ + auto place_before_it = + std::find(exprs_.begin(), exprs_.end(), place_before); + TORCH_INTERNAL_ASSERT( + place_before_it != exprs_.end(), + "Could not figure out where to place synchronization. 
", + "Tried to place after, ", + place_before->toString(), + ", but could not find this expression at the global scope."); + if (maybe_alloc != nullptr) { + registerInsertBefore(place_before, maybe_alloc, nullptr); + } + registerInsertBefore(*(place_before_it), sync_expr, nullptr); + } else { + auto sync_within_loop_it = + std::find(for_loops_.begin(), for_loops_.end(), sync_within_fl); // block sync must be placed before halo-extended loops - if (insertBeforeHaloLoop(loops_it, sync_expr, last_writes)) { + if (insertBeforeHaloLoop( + sync_within_loop_it, sync_expr, maybe_alloc, last_writes)) { return; } - auto place_in = *loops_it; - kir::Expr* place_after = nullptr; + auto place_in = *sync_within_loop_it; + Expr* place_before = nullptr; - if (loops_it + 1 == for_loops_.end()) { - // Inline allocation, place after expr - place_after = expr; + if (sync_within_loop_it + 1 == for_loops_.end()) { + // Inline, place before expr + place_before = expr; } else { - // Place allocation after the last computeAt axis - // TODO: may be more efficient to place after the first non-computeAt - // axis - place_after = *(loops_it + 1); + place_before = *(sync_within_loop_it + 1); } - place_in->body().insert_after(place_after, sync_expr); + registerInsertBefore(place_before, sync_expr, &place_in->body()); + if (maybe_alloc != nullptr) { + registerInsertBefore(place_before, maybe_alloc, &place_in->body()); + } } } } - void visit(kir::ForLoop* fl) final { - for_loops_.push_back(fl); - // Modifying in place, make a copy of the vector - const std::vector exprs = fl->body().exprs(); - for (auto expr : exprs) { - handle(expr); - } - for_loops_.pop_back(); - } - - void visit(kir::IfThenElse*) final { + void handle(kir::IfThenElse*) final { TORCH_INTERNAL_ASSERT( false, "Pass does not support conditional statements, ", "this pass should be run before any conditionals are placed in code."); } - // Clear the modify status for all shared memory buffers - static void cleanSharedMemory( - std::unordered_map& smem) { - smem.clear(); - } - // Return a set of expressions that modify shared-memory // tensors. Expressions are excluded when syncthreads are already // placed. 
- std::unordered_set isModifiedSharedMemory( - const std::unordered_map& smem, - const std::vector& tvs) const { - std::unordered_set last_writes; - for (auto tv : tvs) { + std::unordered_set isModifiedSharedMemory( + const std::unordered_map& smem, + const std::vector& tvs) const { + std::unordered_set last_writes; + for (auto tv : ir_utils::filterByType(tvs)) { + if (GpuLower::current()->syncMap().needsRawSync(tv).none()) { + continue; + } + if (tv->getMemoryType() != MemoryType::Shared) { + continue; + } auto it = smem.find(tv); if (it != smem.end()) { last_writes.insert(it->second); @@ -553,93 +621,140 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { return last_writes; } - ReadAfterWriteSyncs(std::vector _loop_nests) - : loop_nests_(std::move(_loop_nests)) { + std::unordered_set isModifiedGlobalMemory( + const std::unordered_map& gmem, + const std::vector& tvs) const { + std::unordered_set last_writes; + for (auto tv : ir_utils::filterByType(tvs)) { + if (GpuLower::current()->syncMap().needsRawSync(tv).none()) { + continue; + } + auto it = gmem.find(tv); + if (it != gmem.end()) { + last_writes.insert(it->second); + } + } + return last_writes; + } + + ReadAfterWriteSyncs(const std::vector& _exprs) { // Fusion shared_memory values // Tracks if shared memory is modified - std::unordered_map smem; + std::unordered_map smem; + std::unordered_map gmem; // Flatten all the expressions - auto flattened_exprs = ExprFlattener::flatten(loop_nests_); + auto flattened_exprs = ExprFlattener::flatten(_exprs); - kir::Expr* prev_tv_expr = nullptr; + Expr* prev_tv_expr = nullptr; for (auto expr : flattened_exprs) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { + if (!ir_utils::isTvOp(expr) || expr->isA()) { continue; } - auto last_writes = isModifiedSharedMemory(smem, expr->inputs()); - if (!last_writes.empty()) { + auto last_gmem_writes = isModifiedGlobalMemory(gmem, expr->inputs()); + if (!last_gmem_writes.empty()) { TORCH_INTERNAL_ASSERT( prev_tv_expr != nullptr, "Can't require sync on inputs, however, detected it's needed."); - sync_after_.push_back(prev_tv_expr); - last_writes_.push_back(last_writes); - cleanSharedMemory(smem); + ParallelTypeBitmap bitmap; + for (auto entry : gmem) { + TORCH_INTERNAL_ASSERT(entry.first->isA()); + auto sync_bits = GpuLower::current()->syncMap().needsRawSync( + entry.first->as()); + bitmap |= sync_bits; + } + + sync_before_.emplace_back(std::make_pair(expr, bitmap)); + last_writes_.push_back(last_gmem_writes); + gmem.clear(); } - for (auto out : expr->outputs()) { - if (out->isA()) { - if (out->as()->memoryType() == MemoryType::Shared) { - smem[out] = expr; + auto last_smem_writes = isModifiedSharedMemory(smem, expr->inputs()); + if (!last_smem_writes.empty()) { + TORCH_INTERNAL_ASSERT( + prev_tv_expr != nullptr, + "Can't require sync on inputs, however, detected it's needed."); + ParallelTypeBitmap bitmap; + bitmap.set(ParallelType::TIDx); + bitmap.set(ParallelType::TIDy); + bitmap.set(ParallelType::TIDz); + sync_before_.emplace_back(std::make_pair(expr, bitmap)); + + // Before clearing `smem`, put all the currently pending smem writes + // in last_writes_. This will make sure all the smem writes will + // be taken into consideration when deciding which loopnest level + // to insert the block sync. see FusionRAWSyncInsertionPlace4. + std::unordered_set smem_writes; + for (auto it : smem) { + // No need to keep track of shared mem writes that does not + // require a RAW block sync. 
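// Standalone sketch (hypothetical types, simplified from the constructor above) of the
// scan ReadAfterWriteSyncs performs: remember the last unsynchronized write to each
// shared/global buffer and, when a later expression reads such a buffer, record a
// "sync before this expression" with a block- or grid-level marker, then clear the
// pending writes that sync will cover. Unlike the real pass, a grid sync is assumed
// here to also cover pending shared-memory writes, to keep the sketch short.
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

enum class Mem { Local, Shared, Global };
struct Buffer { std::string name; Mem mem; };
struct FlatExpr {
  std::vector<const Buffer*> inputs;
  std::vector<const Buffer*> outputs;
};
enum class SyncKind { Block, Grid };
struct SyncBefore { std::size_t expr_index; SyncKind kind; };

std::vector<SyncBefore> findRawSyncs(const std::vector<FlatExpr>& exprs) {
  std::unordered_map<const Buffer*, std::size_t> pending_smem, pending_gmem;
  std::vector<SyncBefore> syncs;
  for (std::size_t i = 0; i < exprs.size(); ++i) {
    const FlatExpr& e = exprs[i];
    bool needs_block = false, needs_grid = false;
    for (const Buffer* in : e.inputs) {
      needs_block |= pending_smem.count(in) != 0;
      needs_grid |= pending_gmem.count(in) != 0;
    }
    if (needs_grid) {
      syncs.push_back({i, SyncKind::Grid});
      pending_gmem.clear();
      pending_smem.clear(); // simplification: grid sync also orders shared memory
    } else if (needs_block) {
      syncs.push_back({i, SyncKind::Block});
      pending_smem.clear(); // the block sync covers all pending shared writes
    }
    for (const Buffer* out : e.outputs) {
      if (out->mem == Mem::Shared) pending_smem[out] = i;
      if (out->mem == Mem::Global) pending_gmem[out] = i;
    }
  }
  return syncs;
}

int main() {
  Buffer smem{"T_shared", Mem::Shared};
  std::vector<FlatExpr> exprs = {
      {{}, {&smem}}, // expr 0 writes shared memory
      {{&smem}, {}}, // expr 1 reads it: a block sync must go before expr 1
  };
  for (const auto& s : findRawSyncs(exprs)) {
    std::cout << "sync before expr " << s.expr_index
              << (s.kind == SyncKind::Grid ? " (grid)" : " (block)") << "\n";
  }
  return 0;
}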
+ if (GpuLower::current() + ->syncMap() + .needsRawSync(it.first->as()) + .hasTID()) { + smem_writes.insert(it.second); } } + last_writes_.push_back(smem_writes); + smem.clear(); + } + + for (auto tv : ir_utils::filterByType(expr->outputs())) { + // Double buffered tensors do not need RAW sync to be inserted + // here, except for the initial load part, which is taken care + // separately by DoubleBufferInserter. + if (tv->getMemoryType() == MemoryType::Shared && + !tv->isDoubleBuffered()) { + smem[tv] = expr; + } + if (tv->getMemoryType() == MemoryType::Global) { + gmem[tv] = expr; + } } prev_tv_expr = expr; } - // Insert read after write syncs - const std::vector exprs = loop_nests_; - for (auto expr : exprs) { - handle(expr); - } + kir::ExprMutator::traverseAndInsert(_exprs); TORCH_INTERNAL_ASSERT( - sync_after_.empty(), "Didn't place all required syncs."); + sync_before_.empty(), "Didn't place all required syncs."); } private: //! Keep track of expressions that must be followed by syncthreads - std::deque sync_after_; + std::deque> sync_before_; //! Keep track of write expressions that must be placed before //! syncthreads. //! - //! syncthreads is placed after for each expression of - //! sync_after_. However, if it's inside a loop with halo, it must + //! syncthreads is placed before for each expression of + //! sync_before_. However, if it's inside a loop with halo, it must //! be placed before that. last_writes_ keeps track of expressions //! modifying the smem buffer each syncthreads is used for so that //! it is not placed before those write expressions. - std::deque> last_writes_; - - //! Keep track of for loops while inserting syncthreads - std::vector for_loops_; - - //! Loop-nests where syncthreads are inserted - std::vector loop_nests_; + std::deque> last_writes_; public: - static std::vector insert( - const std::vector& loop_nests) { + static std::vector insert(const std::vector& loop_nests) { ReadAfterWriteSyncs inserter(loop_nests); - return inserter.loop_nests_; + return inserter.exprs_; } }; } // namespace -std::vector insertRawThreadSynchronization( - const std::vector& exprs) { +std::vector insertRawThreadSynchronization( + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertRawThreadSynchronization"); return ReadAfterWriteSyncs::insert(exprs); } -std::vector insertWarThreadSynchronization( - const std::vector& exprs) { +std::vector insertWarThreadSynchronization( + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertWarThreadSynchronization"); - LocalSyncInserter::insertSyncs(exprs); - return exprs; + return WarSyncInserter::insert(exprs); } } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h index 506183734484..756462f0bd7c 100644 --- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h +++ b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -16,40 +16,14 @@ namespace cuda { //! //! WAR race condition occurs when the next iteration of the loop overwrites //! shared memory value before a previous operation has finished reading it. -//! -//! WAR Race Check: -//! Track all output shared memory TVs before first sync -//! Track all input shared memory TVs after last sync -//! If the intersection is non-empty, then there is a WAR race condition. -//! Recursively check each nested for-loop -//! -//! Parent-Child For-Loop Recursive Relationship -//! Notation: -//! 
None - Zero Syncs -//! 1+ - One or more Syncs -//! End - Sync is last op in for-loop to prevent WAR race condition -//! -//! Default: Track all shared memory inputs and outputs -//! -//! Parent - None -//! Child - None => Append All Child Outputs to Parent Initial -//! Child - 1+ => Parent first sync => Inherit Child Initial + Final -//! Child - End => Parent first sync => Keep Child Initial / Clear Parent Final -//! -//! Parent - 1+ -//! Child - None => Append All Child to Parent Last -//! Child - 1+ => Child Final to Parent Final / Discard Child Initial -//! Child - End => Clear Parent Last / Discard Child Initial -//! -//! If Child - End and Parent has zero remaining operations, then -//! Parent inherits Child End. -//! -std::vector insertWarThreadSynchronization( - const std::vector& exprs); +std::vector insertWarThreadSynchronization( + const std::vector& exprs); //! Insert syncs between writing to shared memory and then reading it. -std::vector insertRawThreadSynchronization( - const std::vector& exprs); +//! RAW pass is run before indexing, unrolling (loop duplication), memory +//! aliasing, and index (grid/block bcast/reduction) +std::vector insertRawThreadSynchronization( + const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp index e4396f9a864b..aa0ff1a44469 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -19,7 +18,7 @@ namespace jit { namespace fuser { namespace cuda { -std::vector LoopNestGenerator::loweredExprs( +std::vector LoopNestGenerator::loweredExprs( const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::LoopNestGenerator::loweredExprs"); TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); @@ -33,22 +32,20 @@ LoopNestGenerator::LoopNestGenerator(const std::vector& exprs) { namespace { -kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto extent_with_halo = gpu_lower->haloInfo().getExtent(kir_id); +kir::ForLoop* openForHelper(kir::ForLoop* scope, IterDomain* id) { + auto extent_with_halo = GpuLower::current()->haloInfo().getExtent(id); kir::ForLoop* new_scope = nullptr; if (extent_with_halo) { // When an axis is extended with halo, unrolling and vectorization // are assumed to not be used for now. TORCH_INTERNAL_ASSERT( - kir_id->parallelType() != ParallelType::Unroll && - !isParallelTypeVectorize(kir_id->parallelType())); + id->getParallelType() != ParallelType::Unroll && + !isParallelTypeVectorize(id->getParallelType())); // Use the extent that's extended by halo - new_scope = ir_builder.create( - kir_id, - kir_id->isBroadcast() ? ir_builder.zeroVal() - : ir_builder.create(c10::nullopt), + new_scope = IrBuilder::create( + id, + id->isBroadcast() ? 
GpuLower::current()->kernel()->zeroVal() + : IrBuilder::create(c10::nullopt), nullptr, extent_with_halo, nullptr, @@ -56,7 +53,7 @@ kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { nullptr, false); } else { - new_scope = ir_builder.create(kir_id); + new_scope = IrBuilder::create(id); } if (scope != nullptr) { scope->body().insert(0, new_scope); @@ -66,13 +63,13 @@ kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { } // namespace -void LoopNestGenerator::openFor(kir::IterDomain* kir_iter_domain) { +void LoopNestGenerator::openFor(IterDomain* id) { if (for_loops_.size() > 0) { - const auto new_scope = openForHelper(for_loops_.back(), kir_iter_domain); + const auto new_scope = openForHelper(for_loops_.back(), id); // for_loop_allocations_.insert({new_scope, 0}); for_loops_.push_back(new_scope); } else { - for_loops_.push_back(openForHelper(nullptr, kir_iter_domain)); + for_loops_.push_back(openForHelper(nullptr, id)); lowered_exprs_.insert(lowered_exprs_.begin(), for_loops_.back()); } } @@ -82,7 +79,7 @@ void LoopNestGenerator::closeFor() { for_loops_.pop_back(); } -void LoopNestGenerator::pushFront(kir::Expr* expr) { +void LoopNestGenerator::pushFront(Expr* expr) { if (for_loops_.size() == 0) { lowered_exprs_.insert(lowered_exprs_.begin(), expr); } else { @@ -91,18 +88,15 @@ void LoopNestGenerator::pushFront(kir::Expr* expr) { } void LoopNestGenerator::handle(Expr* expr) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - // Check if it's a tensor view expression we need to place in the loop nest // structure - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { // Close all the loops, scalar operations cannot be inside for loops based // on expr sorting. while (!for_loops_.empty()) { closeFor(); } - pushFront(gpu_lower->lowerExpr(expr)); + pushFront(expr); for (auto out : expr->outputs()) { TORCH_INTERNAL_ASSERT( @@ -112,10 +106,8 @@ void LoopNestGenerator::handle(Expr* expr) { " cannot lower ", out->getValType().value()); - pushFront(ir_builder.create( - gpu_lower->lowerValue(out), - MemoryType::Local, - ir_builder.create(1))); + pushFront(IrBuilder::create( + out, MemoryType::Local, GpuLower::current()->kernel()->oneVal())); } return; } @@ -130,27 +122,19 @@ void LoopNestGenerator::handle(Expr* expr) { // Figure out what the entire loop structure should look like. std::vector loop_structure = loop_structures_.at(out_tv); - std::vector kir_loop_structure; - std::transform( - loop_structure.begin(), - loop_structure.end(), - std::back_inserter(kir_loop_structure), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); // Ordering of loop_structure is global, so simply close loops we don't need, // and open the ones we do. 
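// Standalone sketch (loops modeled as plain strings, not the real generator) of the
// open/close step described in the comment above: because loop ordering is globally
// consistent, the generator only has to pop open loops that are absent from the
// expression's target loop structure and then push whichever target loops are missing.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

void matchLoopStructure(
    std::vector<std::string>& open_loops, // current loop stack, outermost first
    const std::vector<std::string>& target) {
  // Close inner loops until the innermost open loop belongs to the target structure.
  while (!open_loops.empty() &&
         std::find(target.begin(), target.end(), open_loops.back()) == target.end()) {
    open_loops.pop_back();
  }
  // Open any target loop that is not already on the stack, in target order.
  for (const auto& id : target) {
    if (std::find(open_loops.begin(), open_loops.end(), id) == open_loops.end()) {
      open_loops.push_back(id);
    }
  }
}

int main() {
  std::vector<std::string> open_loops = {"i0", "i1", "i2"};
  // The next expression wants {i0, i3}: i1 and i2 are closed, i3 is opened under i0.
  matchLoopStructure(open_loops, {"i0", "i3"});
  assert((open_loops == std::vector<std::string>{"i0", "i3"}));
  return 0;
}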
while (!for_loops_.empty() && std::find( - kir_loop_structure.begin(), - kir_loop_structure.end(), - for_loops_.back()->iter_domain()) == kir_loop_structure.end()) { + loop_structure.begin(), + loop_structure.end(), + for_loops_.back()->iter_domain()) == loop_structure.end()) { closeFor(); } - for (auto loop : kir_loop_structure) { + for (auto loop : loop_structure) { auto find_it = std::find_if( for_loops_.begin(), for_loops_.end(), [loop](kir::ForLoop* fl) { return fl->iter_domain() == loop; @@ -160,47 +144,9 @@ void LoopNestGenerator::handle(Expr* expr) { } } - pushFront(gpu_lower->lowerExpr(expr)); + pushFront(expr); } -namespace { -// Copied verbatim from lower_expr_sort EXCEPT map is parallel map, not loop -// map, and direction is reversed -struct LocalDomainSorter { - LocalDomainSorter( - const std::unordered_map>& - concrete_id_dependencies) - : concrete_id_dependencies_(concrete_id_dependencies) {} - - // Return if id0 should be before id1 - inline bool operator()(IterDomain* id0, IterDomain* id1) { - auto concrete_id_0 = - GpuLower::current()->caParallelMap().getConcreteMappedID(id0); - auto concrete_id_1 = - GpuLower::current()->caParallelMap().getConcreteMappedID(id1); - - if (concrete_id_dependencies_.find(concrete_id_0) != - concrete_id_dependencies_.end()) { - const auto& dependencies_0 = concrete_id_dependencies_.at(concrete_id_0); - // if id0 depends on id1 it means id1 is outside id0, so id1 < id0 - return !dependencies_0.count(concrete_id_1); - } - - if (concrete_id_dependencies_.find(concrete_id_1) != - concrete_id_dependencies_.end()) { - const auto& dependencies_1 = concrete_id_dependencies_.at(concrete_id_1); - // if id1 depends on id0 it means id1 is inside id0, so id0 < id1 - return dependencies_1.count(concrete_id_0); - } - - return true; - } - - const std::unordered_map>& - concrete_id_dependencies_; -}; -} // namespace - // Generate the loop nest structure and place it in lowered_exprs_ void LoopNestGenerator::generate(const std::vector& exprs) { TORCH_INTERNAL_ASSERT(lowered_exprs_.empty()); @@ -209,11 +155,10 @@ void LoopNestGenerator::generate(const std::vector& exprs) { // for an example why see FusionAdvancedLowering6 // Grab iteration domain dependencies, similar to the logic in - // lower_expr_sort, EXCEPT it is based on parallel map not loop map, and - // dependencies are in opposite order, inner loops are dependant on outer - // loops. + // lower_expr_sort, EXCEPT dependencies are in opposite order, + // inner loops are dependant on outer loops. 
- const auto& parallel_map = GpuLower::current()->caParallelMap(); + const auto& ca_map = GpuLower::current()->caMap(); std::unordered_map> concrete_id_dependencies; @@ -221,7 +166,8 @@ void LoopNestGenerator::generate(const std::vector& exprs) { std::unordered_set dependencies; for (auto tv_id : tv->domain()->domain()) { - auto concrete_id = parallel_map.getConcreteMappedID(tv_id); + auto concrete_id = + ca_map->getConcreteMappedID(tv_id, IdMappingMode::LOOP); if (concrete_id_dependencies.find(concrete_id) == concrete_id_dependencies.end()) { @@ -232,7 +178,7 @@ void LoopNestGenerator::generate(const std::vector& exprs) { } // Loops after tv_id are dependent on tv_id - dependencies.emplace(parallel_map.getConcreteMappedID(tv_id)); + dependencies.emplace(concrete_id); } } @@ -290,8 +236,8 @@ void LoopNestGenerator::generate(const std::vector& exprs) { continue; } - auto last_id_concrete = - parallel_map.getConcreteMappedID(tv->axis((int)(tv->nDims() - 1))); + auto last_id_concrete = ca_map->getConcreteMappedID( + tv->axis((int)(tv->nDims() - 1)), IdMappingMode::LOOP); auto all_loops_it = concrete_id_dependencies.find(last_id_concrete); TORCH_INTERNAL_ASSERT( all_loops_it != concrete_id_dependencies.end(), @@ -301,10 +247,13 @@ void LoopNestGenerator::generate(const std::vector& exprs) { // Dependencies of last domain doesn't include last domain, include it // manually loop_structure.emplace_back(last_id_concrete); + // reverse sort (rbegin & rend) since we want the reverse of the order + // given by IterDomainDependencySorter std::sort( - loop_structure.begin(), - loop_structure.end(), - LocalDomainSorter(concrete_id_dependencies)); + loop_structure.rbegin(), + loop_structure.rend(), + IterDomainDependencySorter( + concrete_id_dependencies, GpuLower::current()->caMap())); loop_structures_[tv] = loop_structure; } diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.h b/torch/csrc/jit/codegen/cuda/lower_loops.h index fbbdf079e89c..9b480d7eb6f8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.h +++ b/torch/csrc/jit/codegen/cuda/lower_loops.h @@ -1,13 +1,12 @@ #pragma once -#include +#include #include #include #include #include -#include #include namespace torch { @@ -30,20 +29,20 @@ namespace cuda { //! nests to initialize reduction buffers. class TORCH_CUDA_CU_API LoopNestGenerator { public: - static std::vector loweredExprs(const std::vector& exprs); + static std::vector loweredExprs(const std::vector& exprs); private: LoopNestGenerator(const std::vector& exprs); // Open a new inner most for loop, track which TV it was constructed from // according to the computeAt chain. 
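// Standalone sketch (hypothetical names) of the ordering rule used by the sort above:
// the dependency map records, for each loop domain, the set of domains that must be
// outside it ("inner depends on outer"), and the loop structure is sorted so that a
// domain comes after everything in its dependency set. The comparator is only a valid
// strict weak ordering when the sorted domains form one nesting chain, as they do here.
#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Id = std::string;

void sortByDependencies(
    std::vector<Id>& loop_structure,
    const std::map<Id, std::set<Id>>& deps) { // deps[x] = domains outer to x
  std::sort(
      loop_structure.begin(), loop_structure.end(), [&](const Id& a, const Id& b) {
        auto it = deps.find(b);
        // a goes before b exactly when b depends on a, i.e. a is outside b.
        return it != deps.end() && it->second.count(a) != 0;
      });
}

int main() {
  // i0 is outermost, i1 is nested inside i0, i2 is nested inside both.
  std::map<Id, std::set<Id>> deps = {
      {"i0", {}}, {"i1", {"i0"}}, {"i2", {"i0", "i1"}}};
  std::vector<Id> loop_structure = {"i2", "i0", "i1"};
  sortByDependencies(loop_structure, deps);
  for (const auto& id : loop_structure) std::cout << id << " "; // prints: i0 i1 i2
  std::cout << "\n";
  return 0;
}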
- void openFor(kir::IterDomain*); + void openFor(IterDomain*); // Close the inner most for loop void closeFor(); // Appends an expression to the current scope - void pushFront(kir::Expr* expr); + void pushFront(Expr* expr); void handle(Expr* expr); @@ -52,7 +51,7 @@ class TORCH_CUDA_CU_API LoopNestGenerator { private: // Lowered exprs to return - std::vector lowered_exprs_; + std::vector lowered_exprs_; // Keep all for loops conveniently to make unrolling easier, basically just a // stack of the active for_loops diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp b/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp index f5f5c72676a6..f17f91806d61 100644 --- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace torch { @@ -12,11 +12,11 @@ namespace cuda { namespace { -class MagicZeroInserter : public kir::MutableIrVisitor { +class MagicZeroInserter : public kir::ExprMutator { public: - static std::vector insert(const std::vector& exprs) { + static std::vector insert(const std::vector& exprs) { MagicZeroInserter inserter(exprs); - return inserter.loop_nests_; + return inserter.exprs_; } private: @@ -25,94 +25,43 @@ class MagicZeroInserter : public kir::MutableIrVisitor { kir::ForLoop* fl = nullptr; }; - MagicZeroInserter(const std::vector& exprs) - : loop_nests_(exprs), ir_builder(GpuLower::current()->kernel()) { - loop_nests_.insert( - loop_nests_.begin(), ir_builder.create()); - for (auto expr : exprs) { - handle(expr); - } - insertAll(); - } - - void handle(kir::Expr* expr) { - if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } - } - - void handle(kir::IfThenElse* ite) { - scope_nest_.push_back(&ite->thenBody()); - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); - scope_nest_.push_back(&ite->elseBody()); - for (auto expr : ite->elseBody().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); + MagicZeroInserter(const std::vector& exprs) { + TORCH_INTERNAL_ASSERT(exprs.size()); + kir::ExprMutator::registerInsertBefore( + exprs.front(), IrBuilder::create(), nullptr); + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::ForLoop* fl) { + void handle(kir::ForLoop* fl) final { if (fl->isUnrolled()) { - kir::Scope* scope = nullptr; - if (!scope_nest_.empty()) { - scope = scope_nest_.back(); - } - insertion_list_.push_back({scope, fl}); - } else { - scope_nest_.push_back(&fl->body()); - for (auto expr : fl->body().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); - } - } - - void insertAll() { - for (const auto& info : insertion_list_) { - auto fl = info.fl; - auto scope = info.scope; - if (scope == nullptr) { - // place in global scope - auto loop_it = std::find(loop_nests_.begin(), loop_nests_.end(), fl); - TORCH_INTERNAL_ASSERT(loop_it != loop_nests_.end()); - // Place after the loop - loop_it++; - loop_nests_.insert(loop_it, ir_builder.create()); + if (scope_.empty()) { + kir::ExprMutator::registerInsertAfter( + fl, IrBuilder::create()); } else { - scope->insert_after(fl, ir_builder.create()); + TORCH_INTERNAL_ASSERT( + scope_.back()->exprs().size(), "Not expecting an empty loop."); + kir::ExprMutator::registerInsertAfter( + fl, IrBuilder::create(), scope_.back()); } + } else { + kir::ExprMutator::handle(fl); } } - //! 
Keep track for loop structure - std::vector scope_nest_; - - // Keep a copy of the expressions provided - std::vector loop_nests_; - - kir::IrBuilder ir_builder; - std::vector insertion_list_; }; } // namespace -std::vector insertMagicZero(const std::vector& exprs) { +std::vector insertMagicZero(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertMagicZero"); // Check if magic zero was even used, if not we don't have to define it or // update it. const auto gpu_lower = GpuLower::current(); auto kernel = gpu_lower->kernel(); - const bool has_magic_zero = std::any_of( - kernel->irNodes().begin(), - kernel->irNodes().end(), - [](const std::unique_ptr& ir_node) { - return ir_node->isA() && isMagicZero(ir_node->as()); + const bool has_magic_zero = + std::any_of(kernel->vals().begin(), kernel->vals().end(), [](Val* val) { + return isMagicZero(val); }); if (!has_magic_zero) { @@ -122,19 +71,21 @@ std::vector insertMagicZero(const std::vector& exprs) { return MagicZeroInserter::insert(exprs); } -bool isMagicZero(kir::Val* val) { - auto ns = dynamic_cast(val); - if (ns == nullptr) { +bool isMagicZero(const Val* val) { + if (!val->isA()) { return false; } + auto ns = val->as(); return ns->dtype() == DataType::Int && ns->name() == std::string(kMagicZeroName); } -bool isProtectedWithMagicZero(kir::Val* val) { - auto def = dynamic_cast(val->definition()); - return def && def->operation() == BinaryOpType::Add && - isMagicZero(def->rhs()); +bool isProtectedWithMagicZero(const Val* val) { + if (val->definition() == nullptr || !val->definition()->isA()) { + return false; + } + auto bop = val->definition()->as(); + return bop->getBinaryOpType() == BinaryOpType::Add && isMagicZero(bop->rhs()); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h b/torch/csrc/jit/codegen/cuda/lower_magic_zero.h index 03a37a46813c..942a33028017 100644 --- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h +++ b/torch/csrc/jit/codegen/cuda/lower_magic_zero.h @@ -14,15 +14,15 @@ namespace cuda { //! zero update after every (outer most) loop nest with a compile time extent. //! //! This will make sure nvrtc does not aggressively save predicate and indices. -std::vector insertMagicZero(const std::vector& exprs); +std::vector insertMagicZero(const std::vector& exprs); //! Check if val is a reference to the magic zero variable -bool isMagicZero(kir::Val* val); +bool isMagicZero(const Val* val); //! Check if val is protected with magic zero. //! //! Specifically, this returns true if val is defined as "x + magic_zero". 
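// Standalone sketch (a hypothetical mini-IR, not the real kernel IR) of the magic-zero
// idiom handled above: a named scalar that is always zero at runtime but opaque to the
// compiler is added to an index, producing "x + nvfuser_zero", and a value counts as
// protected exactly when its definition is such an add with the magic zero on the rhs.
#include <cassert>
#include <memory>
#include <string>

struct Val;
struct AddExpr {
  const Val* lhs;
  const Val* rhs;
};
struct Val {
  std::string name;                    // non-empty for named scalars
  std::unique_ptr<AddExpr> definition; // set when the value is produced by an add
};

constexpr const char* kMagicZeroName = "nvfuser_zero";

bool isMagicZero(const Val* v) {
  return v != nullptr && v->name == kMagicZeroName;
}

bool isProtectedWithMagicZero(const Val* v) {
  return v != nullptr && v->definition != nullptr && isMagicZero(v->definition->rhs);
}

// Builds "index + nvfuser_zero".
std::unique_ptr<Val> protectIndex(const Val* index, const Val* magic_zero) {
  auto result = std::make_unique<Val>();
  result->definition = std::make_unique<AddExpr>(AddExpr{index, magic_zero});
  return result;
}

int main() {
  Val magic_zero{kMagicZeroName, nullptr};
  Val index{"i", nullptr};
  auto protected_index = protectIndex(&index, &magic_zero);
  assert(isMagicZero(&magic_zero));
  assert(!isProtectedWithMagicZero(&index));
  assert(isProtectedWithMagicZero(protected_index.get()));
  return 0;
}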
-bool isProtectedWithMagicZero(kir::Val* val); +bool isProtectedWithMagicZero(const Val* val); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp index b94c12c27c83..66b405ac8e2f 100644 --- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp @@ -5,8 +5,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -18,85 +17,64 @@ namespace cuda { namespace { -class MisalignedVectorizationModifier { +class MisalignedVectorizationModifier : public kir::ExprMutator { public: - void process(const std::vector& exprs) { - FUSER_PERF_SCOPE( - "GpuLower::Lower::MisalignedVectorizationModifier::process"); - // Run through loop nests - // Find for-loops with misaligned vectorization domains - for (auto* expr : exprs) { - handle(expr); - } - } + MisalignedVectorizationModifier() = delete; - const std::unordered_map& replacementMap() const { - return expr_replacement_map_; + static std::vector processMisalignedVectorization( + const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::processMisalignedVectorization"); + MisalignedVectorizationModifier mvm(exprs); + return mvm.exprs_; } private: - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } + MisalignedVectorizationModifier(const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::MisalignedVectorizationModifier"); + // Run through loop nests + // Find for-loops with misaligned vectorization domains + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::ForLoop* fl) { - for_loops_structure_.push_back(fl); - - // Make copy of exprs because we replace them inplace in fl - const auto exprs_copy = fl->body().exprs(); - + void handle(kir::ForLoop* fl) final { + kir::Scope* scope = scope_.empty() ? 
nullptr : scope_.back(); if (containsAnyDirectChildMisalignedVectorize(fl)) { - auto new_fl = handleMisalignedVectorize(for_loops_structure_, fl); - expr_replacement_map_.insert({fl, new_fl}); - } else { - for (auto expr : exprs_copy) { - handle(expr); - } - } + for_loops_.push_back(fl); + auto new_fl = handleMisalignedVectorize(for_loops_, fl); + for_loops_.pop_back(); - for_loops_structure_.pop_back(); - } - - void handle(kir::IfThenElse* ite) { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + kir::ExprMutator::registerReplace(fl, new_fl, scope); + } else { + kir::ExprMutator::handle(fl); } } struct ReferenceTensors { // Input TensorView to Vectorize Set operation - kir::TensorView* in_tv = nullptr; + TensorView* in_tv = nullptr; // Output TensorView to Vectorize Set operation - kir::TensorView* out_tv = nullptr; + TensorView* out_tv = nullptr; // TensorView in global memory - kir::TensorView* global_tv = nullptr; + TensorView* global_tv = nullptr; // TensorView with vectorize IterDomain and not in global memory - kir::TensorView* vec_tv = nullptr; + TensorView* vec_tv = nullptr; }; - ReferenceTensors getReferenceTensors(kir::Expr* vectorized_expr) { + ReferenceTensors getReferenceTensors(Expr* vectorized_expr) { TORCH_INTERNAL_ASSERT(vectorized_expr != nullptr); TORCH_INTERNAL_ASSERT( - vectorized_expr->outputs().front()->isA()); - TORCH_INTERNAL_ASSERT( - vectorized_expr->inputs().front()->isA()); + vectorized_expr->outputs().front()->isA()); + TORCH_INTERNAL_ASSERT(vectorized_expr->inputs().front()->isA()); - auto in_tv = vectorized_expr->inputs().front()->as(); - auto out_tv = vectorized_expr->outputs().front()->as(); + auto in_tv = vectorized_expr->inputs().front()->as(); + auto out_tv = vectorized_expr->outputs().front()->as(); const bool global_vectorize_write_op = - (out_tv->memoryType() == MemoryType::Global && - in_tv->memoryType() == MemoryType::Local); + (out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local); const bool global_vectorize_read_op = - (out_tv->memoryType() == MemoryType::Local && - in_tv->memoryType() == MemoryType::Global); + (out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global); TORCH_INTERNAL_ASSERT( global_vectorize_write_op || global_vectorize_read_op, "Unsupported vectorize memory configuration detected."); @@ -104,25 +82,26 @@ class MisalignedVectorizationModifier { // TensorView on global memory. This is the tensor that may have // a non-aligned base address. auto global_tv = - (out_tv->memoryType() == MemoryType::Global) ? out_tv : in_tv; + (out_tv->getMemoryType() == MemoryType::Global) ? out_tv : in_tv; // TensorView with the misaligned vec iterDomain. It is the consumer // of vectorized load or the producer of vectorized store. It is // assumed that when the output TV is not on global memory, this // expression is a vectorized load, so the output TV is vec_tv. - auto vec_tv = (out_tv->memoryType() != MemoryType::Global) ? out_tv : in_tv; + auto vec_tv = + (out_tv->getMemoryType() != MemoryType::Global) ? 
out_tv : in_tv; return {in_tv, out_tv, global_tv, vec_tv}; } struct VectorizeData { - kir::Val* vector_size = nullptr; - kir::Val* shift = nullptr; - kir::Val* extent = nullptr; - kir::Val* remainder = nullptr; - kir::Val* extent_minus_remainder = nullptr; - kir::Val* last_root_domain_index = nullptr; - kir::Val* last_root_domain_index_shift = nullptr; + Val* vector_size = nullptr; + Val* shift = nullptr; + Val* extent = nullptr; + Val* remainder = nullptr; + Val* extent_minus_remainder = nullptr; + Val* last_root_domain_index = nullptr; + Val* last_root_domain_index_shift = nullptr; }; // Create constants for handling misaligned addresses @@ -130,48 +109,43 @@ class MisalignedVectorizationModifier { const std::vector& for_loop_structure, const ReferenceTensors& tensors, kir::IfThenElse* parent_scope_ite) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - // Generate vectorize index - auto indices = (tensors.out_tv->memoryType() == MemoryType::Global) - ? Index::getConsumerStridedIndices( - tensors.out_tv->fuserTv(), for_loop_structure) + auto indices = (tensors.out_tv->getMemoryType() == MemoryType::Global) + ? Index::getConsumerStridedIndices(tensors.out_tv, for_loop_structure) : Index::getProducerStridedIndices( - tensors.in_tv->fuserTv(), - tensors.out_tv->fuserTv(), - for_loop_structure); + tensors.in_tv, tensors.out_tv, for_loop_structure); // >>>>>>>>>>>>> // Number of elements in vectorize access auto vector_size = - tensors.vec_tv->domain()->domain().back()->extent()->as(); + tensors.vec_tv->domain()->domain().back()->extent()->as(); // Size of memory type for the elements - kir::Int* data_size_in_bytes = - ir_builder.create(dataTypeSize(tensors.vec_tv->dtype())); + Int* data_size_in_bytes = + IrBuilder::create(dataTypeSize(tensors.vec_tv->dtype())); // The number of bytes in the vectorize access auto vector_size_in_bytes = - ir_builder.mulExpr(vector_size, data_size_in_bytes); + IrBuilder::mulExpr(vector_size, data_size_in_bytes); - auto index = ir_builder.create( - tensors.global_tv->fuserTv(), indices); + auto index = + IrBuilder::create(tensors.global_tv, indices); auto address = createNamedScalarFromValue( parent_scope_ite->thenBody(), index, "address", true); // offset_size = (address % vector_size_bytes) / data_type_size_bytes // shift_init = vector_size - offset_size - auto a = ir_builder.modExpr(address, vector_size_in_bytes); - auto b = ir_builder.divExpr(a, data_size_in_bytes); - auto c = ir_builder.subExpr(vector_size, b); + auto a = IrBuilder::modExpr(address, vector_size_in_bytes); + auto b = IrBuilder::divExpr(a, data_size_in_bytes); + auto c = IrBuilder::subExpr(vector_size, b); auto shift_init = createNamedScalarFromValue( parent_scope_ite->thenBody(), c, "shift_val"); // shift = (shift_init == vector_size) ? 
0 : shift_init // The number of elements until the first aligned address - auto shift_pred = ir_builder.eqExpr(shift_init, vector_size); - auto shift_val = - ir_builder.whereExpr(shift_pred, ir_builder.zeroVal(), shift_init); + auto shift_pred = IrBuilder::eqExpr(shift_init, vector_size); + auto shift_val = IrBuilder::whereExpr( + shift_pred, GpuLower::current()->kernel()->zeroVal(), shift_init); // >>>>>>>>>>>>> auto shift = createNamedScalarFromValue( @@ -183,13 +157,13 @@ class MisalignedVectorizationModifier { // remainder = (extent - shift) % vector_size // The number of elements remaining not accessed by vectorized operations - auto remaining_extent = ir_builder.subExpr(extent, shift); - auto remainder_val = ir_builder.modExpr(remaining_extent, vector_size); + auto remaining_extent = IrBuilder::subExpr(extent, shift); + auto remainder_val = IrBuilder::modExpr(remaining_extent, vector_size); auto remainder = createNamedScalarFromValue( parent_scope_ite->thenBody(), remainder_val, "remainder"); // (extent - remainder) is the upper-bound for the vectorize section - auto extent_remainder_val = ir_builder.subExpr(extent, remainder); + auto extent_remainder_val = IrBuilder::subExpr(extent, remainder); // >>>>>>>>>>>>> auto extent_minus_remainder = createNamedScalarFromValue( @@ -203,7 +177,7 @@ class MisalignedVectorizationModifier { // >>>>>>>>>>>>> auto last_root_domain_index_shift = - ir_builder.addExpr(last_root_domain_index, shift); + IrBuilder::addExpr(last_root_domain_index, shift); return { vector_size, @@ -220,20 +194,18 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createVectorizeSection( const std::vector& child_loops, const VectorizeData& params) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto vectorized_child_loops = cloneForLoops( child_loops, params.vector_size, nullptr, true, params.shift); // Vectorize Range: [shift - (extent-remainder)) // (last_root_domain_index + shift) < (extent - remainder) - kir::Val* vectorize_cond = ir_builder.ltExpr( + Val* vectorize_cond = IrBuilder::ltExpr( params.last_root_domain_index_shift, params.extent_minus_remainder); kir::Predicate* vectorize_pred = - ir_builder.create(vectorize_cond->as()); + IrBuilder::create(vectorize_cond->as()); kir::IfThenElse* vectorize_ite = - ir_builder.create(vectorize_pred); + IrBuilder::create(vectorize_pred); for (auto cloned_loop : vectorized_child_loops) { vectorize_ite->thenBody().push_back(cloned_loop); @@ -247,20 +219,19 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createInitialSection( const std::vector& child_loops, const VectorizeData& params) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto pre_child_loops = cloneForLoops( child_loops, params.vector_size, params.shift, false, nullptr); // Initial Range: [0 - shift) // last_root_domain_index == 0 - kir::Val* initial_cond = - ir_builder.eqExpr(params.last_root_domain_index, ir_builder.zeroVal()); + Val* initial_cond = IrBuilder::eqExpr( + params.last_root_domain_index, + GpuLower::current()->kernel()->zeroVal()); kir::Predicate* initial_pred = - ir_builder.create(initial_cond->as()); + IrBuilder::create(initial_cond->as()); kir::IfThenElse* initial_ite = - ir_builder.create(initial_pred); + IrBuilder::create(initial_pred); for (auto cloned_loop : pre_child_loops) { initial_ite->thenBody().push_back(cloned_loop); @@ -274,23 +245,21 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createRemainderSection( const std::vector& child_loops, const VectorizeData& params) { - 
kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto post_child_loops = cloneForLoops( child_loops, params.vector_size, params.remainder, false, params.shift); // Remainder Range: [(extent-remainder) - extent) // (extent - remainder) <= last_root_domain_index + shift < extent - kir::Val* lower_bound = ir_builder.geExpr( + Val* lower_bound = IrBuilder::geExpr( params.last_root_domain_index_shift, params.extent_minus_remainder); - kir::Val* upper_bound = - ir_builder.ltExpr(params.last_root_domain_index_shift, params.extent); - kir::Val* remainder_cond = ir_builder.andExpr(lower_bound, upper_bound); + Val* upper_bound = + IrBuilder::ltExpr(params.last_root_domain_index_shift, params.extent); + Val* remainder_cond = IrBuilder::andExpr(lower_bound, upper_bound); kir::Predicate* remainder_pred = - ir_builder.create(remainder_cond->as()); + IrBuilder::create(remainder_cond->as()); kir::IfThenElse* remainder_ite = - ir_builder.create(remainder_pred); + IrBuilder::create(remainder_pred); for (auto cloned_loop : post_child_loops) { remainder_ite->thenBody().push_back(cloned_loop); @@ -302,8 +271,6 @@ class MisalignedVectorizationModifier { kir::ForLoop* handleMisalignedVectorize( std::vector for_loop_structure, const kir::ForLoop* parent_for_loop) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto child_loops = findChildForLoops(parent_for_loop); // Assumption: All vectorize operations have the same shift @@ -315,17 +282,19 @@ class MisalignedVectorizationModifier { // The parent_for_loop contains allocate, read, compute, write operations const auto new_parent_for_loop = - ir_builder.create(parent_for_loop); + IrBuilder::create(parent_for_loop); // Transfer all expressions except for-loops to new parent for-loop // All expressions are placed at the beginning of the new for-loop - moveExprsExceptForLoops(parent_for_loop, new_parent_for_loop); + copyExprsExceptForLoops(parent_for_loop, new_parent_for_loop); // Get the predicate for all but the last root domain - auto pred_except_last_root_domain = ir_builder.create( - PredicateType::Misaligned, vectorized_expr, ir_builder.trueVal()); + auto pred_except_last_root_domain = IrBuilder::create( + PredicateType::Misaligned, + vectorized_expr, + GpuLower::current()->kernel()->trueVal()); kir::IfThenElse* pred_ite = - ir_builder.create(pred_except_last_root_domain); + IrBuilder::create(pred_except_last_root_domain); new_parent_for_loop->body().push_back(pred_ite); auto constants = createVectorizeConstants( @@ -351,17 +320,17 @@ class MisalignedVectorizationModifier { // Determine that the expression is UnaryOpType::Set AND // the output TensorView domain is vectorized - bool isVectorizeSetOp(kir::ForLoop* fl, kir::Expr* expr) { - if (fl->iter_domain()->parallelType() != + bool isVectorizeSetOp(kir::ForLoop* fl, Expr* expr) { + if (fl->iter_domain()->getParallelType() != ParallelType::MisalignedVectorize) { return false; } - if (expr->isA()) { - auto unaryOp = expr->as(); - if (unaryOp->out()->isA()) { - auto out_tv = unaryOp->out()->as(); - return unaryOp->operation() == UnaryOpType::Set && + if (expr->isA()) { + auto unaryOp = expr->as(); + if (unaryOp->out()->isA()) { + auto out_tv = unaryOp->out()->as(); + return unaryOp->getUnaryOpType() == UnaryOpType::Set && out_tv->domain()->hasVectorize(); } } @@ -374,15 +343,14 @@ class MisalignedVectorizationModifier { // vectorize flag - Do not generate for loop header // shift value - Add shift to global indices generated within for loop std::vector cloneForLoops( - const 
std::vector& for_loops, - kir::Val* loop_stop, - kir::Val* pred_stop, + const std::vector& for_loops_, + Val* loop_stop, + Val* pred_stop, bool vectorize, - kir::Val* vectorize_shift) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); + Val* vectorize_shift) { std::vector cloned_for_loops; - for (auto fl : for_loops) { + for (auto fl : for_loops_) { auto first_expr = fl->body().exprs().front(); bool has_vectorize_op = isVectorizeSetOp(fl, first_expr); @@ -391,12 +359,12 @@ class MisalignedVectorizationModifier { TORCH_INTERNAL_ASSERT( !has_vectorize_op || fl->body().exprs().size() == 1); - const auto new_loop = ir_builder.create( + const auto new_loop = IrBuilder::create( fl->iter_domain(), fl->index(), - ir_builder.zeroVal(), + GpuLower::current()->kernel()->zeroVal(), loop_stop, - ir_builder.oneVal(), + GpuLower::current()->kernel()->oneVal(), vectorize && has_vectorize_op, vectorize_shift, fl->isUnrollRequired()); @@ -406,9 +374,9 @@ class MisalignedVectorizationModifier { // Predicate the loop body if pred_stop is not null. This is to // make sure the loop itself is completely unrollable. if (pred_stop != nullptr) { - auto body_pred = ir_builder.create( - ir_builder.ltExpr(new_loop->index(), pred_stop)->as()); - auto body_ite = ir_builder.create(body_pred); + auto body_pred = IrBuilder::create( + IrBuilder::ltExpr(new_loop->index(), pred_stop)->as()); + auto body_ite = IrBuilder::create(body_pred); body->push_back(body_ite); body = &body_ite->thenBody(); } @@ -423,7 +391,7 @@ class MisalignedVectorizationModifier { } // Add all expressions except for loops to new parent for loop - void moveExprsExceptForLoops( + void copyExprsExceptForLoops( const kir::ForLoop* for_loop, kir::ForLoop* new_loop) { std::vector loops; @@ -448,10 +416,10 @@ class MisalignedVectorizationModifier { // Find the first vectorize set - either read or write // Add child For-Loop to for_loop_structure // Enable vectorize flag in child For-Loop - kir::Expr* findFirstVectorizedSetOp( + Expr* findFirstVectorizedSetOp( std::vector& for_loop_structure, - const std::vector& for_loops) { - for (auto fl : for_loops) { + const std::vector& for_loops_) { + for (auto fl : for_loops_) { auto first_expr = fl->body().exprs().front(); bool has_vectorize_op = isVectorizeSetOp(fl, first_expr); if (has_vectorize_op) { @@ -463,38 +431,31 @@ class MisalignedVectorizationModifier { } // Get full extent for the inner-most, merged root domain - kir::Val* getVectorizeExtent( - kir::TensorView* producer_tv, - kir::TensorView* consumer_tv) { + Val* getVectorizeExtent(TensorView* producer_tv, TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto consumer_fuser_tv = consumer_tv->fuserTv(); - auto producer_fuser_tv = producer_tv->fuserTv(); - auto p2c = - PairwiseRootDomainMap(producer_fuser_tv, consumer_fuser_tv) - .mapProducerToConsumer( - producer_fuser_tv->domain(), consumer_fuser_tv->domain()); + auto p2c = PairwiseRootDomainMap(producer_tv, consumer_tv) + .mapProducerToConsumer( + producer_tv->domain(), consumer_tv->domain()); auto consumer_root_right_of_ca_domains = IterVisitor::getInputsTo( - {consumer_fuser_tv->domain()->domain().begin() + - consumer_fuser_tv->getComputeAtPosition(), - consumer_fuser_tv->domain()->domain().end()}); + {consumer_tv->domain()->domain().begin() + + consumer_tv->getComputeAtPosition(), + consumer_tv->domain()->domain().end()}); auto producer_root_right_of_ca_domains = IterVisitor::getInputsTo( - 
{producer_fuser_tv->domain()->domain().begin() + - producer_fuser_tv->getComputeAtPosition(), - producer_fuser_tv->domain()->domain().end()}); + {producer_tv->domain()->domain().begin() + + producer_tv->getComputeAtPosition(), + producer_tv->domain()->domain().end()}); - const auto& consumer_contig = consumer_fuser_tv->domain()->contiguity(); - const auto& producer_contig = producer_fuser_tv->domain()->contiguity(); + const auto& consumer_contig = consumer_tv->domain()->contiguity(); + const auto& producer_contig = producer_tv->domain()->contiguity(); - auto producer_root_domain = producer_fuser_tv->getMaybeRFactorDomain(); + auto producer_root_domain = producer_tv->getMaybeRFactorDomain(); // Calculate extent of merged root domains - kir::Val* extent = nullptr; + Val* extent = nullptr; auto consumer_root_idx = - int(consumer_fuser_tv->getMaybeRFactorDomain().size()) - 1; + int(consumer_tv->getMaybeRFactorDomain().size()) - 1; for (int i = int(producer_root_domain.size()) - 1; i >= 0; --i) { auto producer_root_id = producer_root_domain.at(i); @@ -533,11 +494,10 @@ class MisalignedVectorizationModifier { // We now know it's safe to extend the vectorization domain to these // axes. It shouldn't matter whether producer or consumer is used. - auto consumer_extent = gpu_lower->lowerValue(consumer_root_id->extent()); if (extent == nullptr) { - extent = consumer_extent; + extent = consumer_root_id->extent(); } else { - extent = ir_builder.mulExpr(extent, consumer_extent); + extent = IrBuilder::mulExpr(extent, consumer_root_id->extent()); } // If it's not contiguous, extending the vectorization domain @@ -554,57 +514,37 @@ class MisalignedVectorizationModifier { return extent; } - kir::Val* createNamedScalarFromValue( + Val* createNamedScalarFromValue( kir::Scope& body, - kir::Val* val, + Val* val, const std::string& name, bool address = false) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto namedScalar = (address) ? ir_builder.addressExprNamedScalar(name, val) - : ir_builder.setExprNamedScalar(name, val); + auto namedScalar = (address) ? 
IrBuilder::addressExprNamedScalar(name, val) + : IrBuilder::setExprNamedScalar(name, val); TORCH_INTERNAL_ASSERT(namedScalar->definition() != nullptr); - auto alloc = ir_builder.create( - namedScalar, MemoryType::Local, ir_builder.oneVal()); + auto alloc = IrBuilder::create( + namedScalar, + MemoryType::Local, + GpuLower::current()->kernel()->oneVal()); body.push_back(alloc); body.push_back(namedScalar->definition()); return namedScalar; } - - private: - // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // A depth-first ordering of nested for loops - // It is used for indexing and predicate generation - std::vector for_loops_structure_; }; } // namespace -std::vector processMisalignedVectorization( - Fusion* fusion, - const std::vector& exprs) { - FUSER_PERF_SCOPE("GpuLower::Lower::processMisalignedVectorization"); - - MisalignedVectorizationModifier mvm; - mvm.process(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(mvm.replacementMap(), expr)); - } - - return mutated_exprs; +std::vector processMisalignedVectorization( + const std::vector& exprs) { + return MisalignedVectorizationModifier::processMisalignedVectorization(exprs); } bool containsAnyDirectChildMisalignedVectorize(const kir::ForLoop* fl) { for (auto expr : fl->body().exprs()) { if (expr->isA()) { auto child_fl = expr->as(); - if (child_fl->iter_domain()->parallelType() == + if (child_fl->iter_domain()->getParallelType() == ParallelType::MisalignedVectorize) { return true; } diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h index 588d3787752b..bd7ae19d93a8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h +++ b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include @@ -106,9 +106,8 @@ namespace cuda { //! } //! } //! 
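// Standalone host-side sketch of the address arithmetic the misaligned vectorization
// pass above generates: given a possibly unaligned base pointer, "shift" is the number
// of scalar elements before the first vector-aligned address and "remainder" is what is
// left at the tail, so the loop splits into an initial section [0, shift), a
// vectorizable middle [shift, extent - remainder), and a remainder section
// [extent - remainder, extent). Values and names are illustrative only.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct Sections {
  int64_t shift;
  int64_t remainder;
};

Sections misalignedSections(const void* base, int64_t extent, int64_t vector_size,
                            int64_t data_size_bytes) {
  const int64_t vector_size_in_bytes = vector_size * data_size_bytes;
  const auto address = reinterpret_cast<uintptr_t>(base);
  // Elements until the next aligned address; equals vector_size when already aligned.
  const int64_t shift_init = vector_size -
      static_cast<int64_t>(address % vector_size_in_bytes) / data_size_bytes;
  const int64_t shift = (shift_init == vector_size) ? 0 : shift_init;
  const int64_t remainder = (extent - shift) % vector_size;
  return {shift, remainder};
}

int main() {
  alignas(16) static float storage[1026];
  float* base = storage + 1; // deliberately misaligned by one element
  const int64_t extent = 1024;
  const int64_t vector_size = 4; // e.g. float4 accesses
  auto s = misalignedSections(base, extent, vector_size, sizeof(float));
  // The first vectorized element lands on an aligned address ...
  assert(reinterpret_cast<uintptr_t>(base + s.shift) % (vector_size * sizeof(float)) == 0);
  // ... and the middle section is a whole number of vectors.
  assert((extent - s.shift - s.remainder) % vector_size == 0);
  std::printf("initial [0,%lld), vectorized [%lld,%lld), remainder [%lld,%lld)\n",
              (long long)s.shift, (long long)s.shift, (long long)(extent - s.remainder),
              (long long)(extent - s.remainder), (long long)extent);
  return 0;
}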
-std::vector processMisalignedVectorization( - Fusion* fusion, - const std::vector& exprs); +std::vector processMisalignedVectorization( + const std::vector& exprs); bool containsAnyDirectChildMisalignedVectorize(const kir::ForLoop* fl); diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp index 838d5d85d9e4..cda210989f17 100644 --- a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp @@ -1,16 +1,13 @@ #include #include -#include #include #include #include #include #include -#include -#include +#include #include -#include #include #include #include @@ -23,37 +20,65 @@ namespace cuda { namespace { -class ConditionalFromPredicateModifier { +class ConditionalFromPredicateModifier : public kir::IrVisitor { public: - ConditionalFromPredicateModifier(const std::vector& exprs) { + ConditionalFromPredicateModifier() = delete; + + static std::vector fillPredicates(const std::vector& exprs) { + ConditionalFromPredicateModifier cfpm(exprs); + return cfpm.exprs_; + } + + private: + ConditionalFromPredicateModifier(const std::vector& exprs) { FUSER_PERF_SCOPE( "GpuLower::Lower::ConditionalFromPredicateModifier::process"); - for (auto* expr : exprs) { - handle(expr); - } + kir::IrVisitor::handle(exprs); } - const std::unordered_map& replacementMap() const { - return expr_replacement_map_; - } + using kir::IrVisitor::handle; - private: - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (expr != nullptr && expr->predicate() != nullptr) { + void handle(Expr* expr) final { + if (expr != nullptr && expr->predicate() != nullptr) { // Replace expr predicate with bool conditional auto conditional = generateConditional(expr->predicate()); + if (expr->predicate()->predicate_type() == PredicateType::Vectorize) { + // TODO: This logic doesn't seem to fit well here, for unswitch the + // logic is in the unroll loop to set the thread predicate to the expr. + // I didn't have a quick way to do that so placing this here for now. 
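// Standalone sketch (hypothetical structs) of the two-phase predication scheme the
// modifier above implements: earlier passes attach an empty predicate placeholder that
// records only its type, and this later pass fills in the concrete boolean condition,
// additionally AND-ing in a thread predicate for vectorized predicates. The real pass
// only adds the thread predicate when the vectorized input is not a constant scalar;
// that detail is dropped here for brevity.
#include <cassert>
#include <functional>
#include <optional>
#include <string>
#include <vector>

enum class PredicateType { Inline, Vectorize, Unswitch, Manual };

struct Predicate {
  PredicateType type;
  std::optional<std::string> condition; // filled in by the pass below
};

struct LoweredExpr {
  std::string text;
  Predicate* predicate = nullptr;
};

// Fill every unset predicate using a caller-provided generator, loosely mirroring
// generateConditional() above.
void fillPredicates(std::vector<LoweredExpr>& exprs,
                    const std::function<std::string(PredicateType)>& generate,
                    const std::string& thread_pred) {
  for (auto& e : exprs) {
    if (e.predicate == nullptr || e.predicate->condition.has_value()) {
      continue; // no placeholder, or already materialized
    }
    std::string cond = generate(e.predicate->type);
    if (e.predicate->type == PredicateType::Vectorize) {
      cond = "(" + cond + ") && (" + thread_pred + ")";
    }
    e.predicate->condition = cond;
  }
}

int main() {
  Predicate inline_pred{PredicateType::Inline, std::nullopt};
  Predicate vec_pred{PredicateType::Vectorize, std::nullopt};
  std::vector<LoweredExpr> exprs = {{"T1 = T0", &inline_pred},
                                    {"T3 = T2 (vectorized)", &vec_pred}};
  fillPredicates(
      exprs, [](PredicateType) { return std::string("i < T0.size"); },
      "threadIdx.x == 0");
  assert(*vec_pred.condition == "(i < T0.size) && (threadIdx.x == 0)");
  return 0;
}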
+ TORCH_INTERNAL_ASSERT( + expr->isA(), + "Predicate handling expects ITE statement."); + auto ite = expr->as(); + + TORCH_INTERNAL_ASSERT( + ite->thenBody().size() == 1, + "Expecting predicated body to only have one vectorized expression."); + auto vec_expr = ite->thenBody()[0]; + TORCH_INTERNAL_ASSERT( + vec_expr->isA(), + "Vectorize predicate exprs only supported on set operations."); + TORCH_INTERNAL_ASSERT( + ir_utils::isTvOp(vec_expr), + "Vectorize predicate exprs only supported on tensor view operations."); + if (!vec_expr->inputs()[0]->isConstScalar()) { + conditional = SimplifyingIrBuilder::andExpr( + conditional, + GpuLower::current()->threadPredMap().getPredicate( + ir_utils::getTvOutput(vec_expr))) + ->as(); + } + } TORCH_INTERNAL_ASSERT(conditional != nullptr); expr->predicate()->setValue(conditional); TORCH_INTERNAL_ASSERT(expr->predicate()->value() != nullptr); setWritePredicate(expr, conditional); } + + kir::IrVisitor::handle(expr); } - void setWritePredicate(kir::Expr* expr, kir::Bool* read_cond) { + void setWritePredicate(Expr* expr, Bool* read_cond) { if (expr->writePredicate() != nullptr) { auto write_cond = generateConditional(expr->writePredicate()); if (write_cond) { @@ -66,46 +91,25 @@ class ConditionalFromPredicateModifier { } } - void handle(kir::ForLoop* fl) { - for_loops_structure_.push_back(fl); - - const auto exprs_copy = fl->body().exprs(); - for (auto expr : exprs_copy) { - handle(expr); - } - - for_loops_structure_.pop_back(); - } - - void handle(kir::IfThenElse* ite) { + void handle(kir::IfThenElse* ite) final { TORCH_INTERNAL_ASSERT(ite->predicate() != nullptr); // If ite already has Bool conditional, handle internal expressions // Otherwise, generate conditional and update predicate - if (ite->predicate()->hasValue()) { - const auto then_exprs_copy = ite->thenBody().exprs(); - for (auto expr : then_exprs_copy) { - handle(expr); - } - - const auto else_exprs_copy = ite->elseBody().exprs(); - for (auto expr : else_exprs_copy) { - handle(expr); - } - } else { + if (!ite->predicate()->hasValue()) { auto conditional = generateConditional(ite->predicate()); TORCH_INTERNAL_ASSERT(conditional != nullptr); - TORCH_INTERNAL_ASSERT(conditional->isA()); + TORCH_INTERNAL_ASSERT(conditional->isA()); // Update bool conditional in-place ite->predicate()->setValue(conditional); - handle(ite); TORCH_INTERNAL_ASSERT(ite->predicate()->value() != nullptr); } + kir::IrVisitor::handle(ite); } // Generate conditional according to PredicateType - kir::Bool* generateConditional(kir::Predicate* pred) { + Bool* generateConditional(kir::Predicate* pred) { switch (pred->predicate_type()) { case PredicateType::Inline: case PredicateType::ReductionWrite: @@ -114,15 +118,16 @@ class ConditionalFromPredicateModifier { case PredicateType::Padding: { return PredicateCompute::getInlinePredicate( pred->expr(), - for_loops_structure_, + for_loops_, pred->thread_pred(), pred->predicate_type()); } case PredicateType::Vectorize: { std::vector outer_loops; kir::ForLoop* vectorized_loop = nullptr; - for (auto loop : for_loops_structure_) { - if (loop->iter_domain()->parallelType() == ParallelType::Vectorize) { + for (auto loop : for_loops_) { + if (loop->iter_domain()->getParallelType() == + ParallelType::Vectorize) { vectorized_loop = loop; break; } else { @@ -134,8 +139,7 @@ class ConditionalFromPredicateModifier { return UnswitchPredicate::get(outer_loops, vectorized_loop); } case PredicateType::Unswitch: { - return UnswitchPredicate::get( - for_loops_structure_, pred->unrolled_loop()); + 
return UnswitchPredicate::get(for_loops_, pred->unrolled_loop()); } case PredicateType::Manual: { return pred->value(); @@ -145,429 +149,13 @@ class ConditionalFromPredicateModifier { } return nullptr; } - - private: - // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // A depth-first ordering of nested for loops - // It is used for indexing and predicate generation - std::vector for_loops_structure_; -}; - -} // namespace - -std::vector generateConditionalFromPredicate( - Fusion* fusion, - const std::vector& exprs) { - FUSER_PERF_SCOPE("GpuLower::Lower::generateConditionalFromPredicate"); - - ConditionalFromPredicateModifier p2cm(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(p2cm.replacementMap(), expr)); - } - - return mutated_exprs; -} - -namespace { - -class PredicateAnalyzer : public OptOutDispatch { - public: - //! Checks if a predicate is needed to avoid out-of-bound accesses. - //! - //! Due to the way we allocate local-memory tensors, there should - //! never be out-of-bound accesses with consumer tensors when allocated on - //! local memory. However, accessing producer tensors still may - //! result in out-of-bound as they are replayed as consumers. - static bool needsPredicate(TensorView* producer, TensorView* consumer) { - // Both tensors must be on local memory. Global tensors must be - // predicated as allocation is done based on root domains. Smem - // and local tensors are allocated based on leaf domains, however, - // smem tensors are parallelized, which is highly likely, the size - // of the parallelized axis is the actual size of the axis, not - // the number of threads. Since the number of threads can be - // larger than the axis size, it's not safe to skip predication - if (!(producer->getMemoryType() == MemoryType::Local && - consumer->getMemoryType() == MemoryType::Local)) { - return true; - } - - auto pairwise_map = PairwiseRootDomainMap(producer, consumer); - auto c2p = - BestEffortReplay::replayPasC(producer, consumer, -1, pairwise_map) - .getReplay(); - - PredicateAnalyzer analyzer(c2p); - - for (auto id : consumer->domain()->domain()) { - if (analyzer.needsPredicate(id)) { - return true; - } - } - - return false; - } - - private: - PredicateAnalyzer(const std::unordered_map& c2p_map) - : c2p_map_(c2p_map) {} - - // Returns true if no out-of-bound accesses could occur with a - // producer - bool needsPredicate(IterDomain* consumer_id) { - needs_predicate_ = false; - handle(consumer_id); - return needs_predicate_; - } - - using OptOutDispatch::handle; - - void handle(IterDomain* consumer_id) override { - // The traversal should have ended if needs_predicate_ was true - TORCH_INTERNAL_ASSERT(!needs_predicate_); - - // If consumer_id is not going to be materialized as a loop (e.g., - // broadcast), no need to predicate - const auto gpu_lower = GpuLower::current(); - if (consumer_id->isBroadcast() || - gpu_lower->trivialReductionInfo().isDerived(consumer_id)) { - return; - } - - // If the producer has a matching domain, it should not cause - // out-of-bound accesses - if (c2p_map_.find(consumer_id) != c2p_map_.end()) { - return; - } - - // If no definition exists, stop traversing - if (consumer_id->definition() == nullptr) { - return; - } - - handle(consumer_id->definition()); - } - - // If it splits the input axis evenly, proceeds to check the input - // axis. 
Otherwise, we can't skip predication as it might cause - // out-bound accesses with the producer tensor - void handle(Split* split) override { - auto factor = split->factor()->getInt(); - if (!factor.has_value()) { - needs_predicate_ = true; - return; - } - - ExpressionEvaluator ee(split->fusion()); - const auto in_extent = ee.evaluate(split->in()->extent()); - - if (!in_extent.has_value() || ((in_extent.value() % factor.value()) != 0)) { - needs_predicate_ = true; - return; - } - - handle(split->in()); - } - - void handle(Merge* merge) override { - handle(merge->inner()); - if (needs_predicate_) { - return; - } - handle(merge->outer()); - } - - private: - //! BestEffort map from consumer IDs to producer IDs - const std::unordered_map& c2p_map_; - bool needs_predicate_ = false; }; } // namespace -bool PredicateElimination::needsPredicate(Expr* expr) const { - if (!ir_utils::isTVOp(expr)) { - return false; - } - - std::vector> filters; - - // Always predicate integer division and related ops as we don't - // know what values are in the out-of-bound region and they may - // cause exceptions - filters.emplace_back([](Expr* expr) { - auto dt = expr->outputs()[0]->getDataType().value(); - return ( - (dt == DataType::Int || dt == DataType::Int32) && - expr->isA() && - (expr->as()->getBinaryOpType() == BinaryOpType::Div || - expr->as()->getBinaryOpType() == BinaryOpType::Mod || - expr->as()->getBinaryOpType() == BinaryOpType::Remainder || - expr->as()->getBinaryOpType() == BinaryOpType::CeilDiv)); - }); - - // Skip if MisalignedVectorize is involved for now. This could be - // relaxed. - filters.emplace_back([](Expr* expr) { - std::vector*> inputs_and_outputs = { - &(expr->inputs()), &(expr->outputs())}; - for (const auto& inputs_or_outputs : inputs_and_outputs) { - for (auto tv : ir_utils::filterByType(*inputs_or_outputs)) { - if (std::any_of( - tv->domain()->domain().begin(), - tv->domain()->domain().end(), - [](IterDomain* axis) { - return axis->getParallelType() == - ParallelType::MisalignedVectorize; - })) { - return true; - } - } - } - return false; - }); - - // Shift is not supported yet. - filters.emplace_back([](Expr* expr) { - auto& halo_info = GpuLower::current()->haloInfo(); - auto input_tvs = ir_utils::filterByType(expr->inputs()); - return halo_info.needsShiftPredicate(expr) || - std::any_of(input_tvs.begin(), input_tvs.end(), [&](auto input_tv) { - return input_tv->definition() != nullptr && - halo_info.needsShiftPredicate(input_tv->definition()); - }); - }); - - // Predicates the expression if any producer-consumer pair of the - // expression needs to be predicated - filters.emplace_back([](Expr* expr) { - for (auto output : ir_utils::filterByType(expr->outputs())) { - for (auto input : ir_utils::filterByType(expr->inputs())) { - if (PredicateAnalyzer::needsPredicate(input, output)) { - return true; - } - } - } - return false; - }); - - // Predicates Welford ops - filters.emplace_back([](Expr* expr) { return expr->isA(); }); - - // If this is a reduction, and if we omit the predicate for the - // input, the input may have a garbabe value, which must not be used - // for this reduction. However, if the input is also an output of - // another reduction with the same binary op, which is a common - // pattern with rfactor, the input should be safe to use with no - // predication. 
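Both the removed needsPredicate above and its replacement in lower_predicate_elimination.cpp answer the same question: does any one of several independent conditions force the expression to keep its predicate? A standalone sketch of that shape, with made-up condition names (the real checks inspect IterDomains, halo info, memory types, and so on):

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

struct FakeExpr {
  bool is_int_division = false;
  bool uses_misaligned_vectorize = false;
  bool has_halo = false;
};

// Each filter returns true if the expression must keep its predicate.
bool needsPredicate(const FakeExpr& expr) {
  std::vector<std::function<bool(const FakeExpr&)>> filters;
  filters.emplace_back([](const FakeExpr& e) { return e.is_int_division; });
  filters.emplace_back([](const FakeExpr& e) { return e.uses_misaligned_vectorize; });
  filters.emplace_back([](const FakeExpr& e) { return e.has_halo; });

  // The expression needs a predicate if any single check fires; elimination is
  // only safe when every filter returns false.
  return std::any_of(filters.begin(), filters.end(),
                     [&](const auto& f) { return f(expr); });
}

int main() {
  FakeExpr safe;
  FakeExpr div;
  div.is_int_division = true;
  std::cout << std::boolalpha << needsPredicate(safe) << " "
            << needsPredicate(div) << "\n";  // false true
}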
- filters.emplace_back([this](Expr* expr) { - if (expr->isA()) { - auto input = expr->inputs()[0]->as(); - auto input_def = input->definition(); - // When input_def is null, input must be an input to the fusion, - // so that must be allocated on global memory. Since we don't omit - // predication for expressions involving global memory, this - // should never occur. - TORCH_INTERNAL_ASSERT( - input_def != nullptr, "Inconsistent input found: ", input); - - if (non_predicated_exprs_.find(input_def) != - non_predicated_exprs_.end() && - !(input_def->isA() && - (expr->as()->getReductionOpType() == - input_def->as()->getReductionOpType()))) { - return true; - } - } - return false; - }); - - // If any of the filters returns true, predicate must be used. - return std::any_of(filters.begin(), filters.end(), [expr](auto filter) { - return filter(expr); - }); -} - -void PredicateElimination::handle(Expr* expr) { - if (!ir_utils::isTVOp(expr)) { - return; - } - - if (needsPredicate(expr)) { - return; - } - - non_predicated_exprs_.insert(expr); - - // Ensure all inputs have some values set at the out-of-bound - // regions - for (auto input : ir_utils::filterByType(expr->inputs())) { - auto input_def = input->definition(); - // When input_def is null, input must be an input to the fusion, - // so that must be allocated on global memory. Since we don't omit - // predication for expressions involving global memory, this - // should never occur. - std::stringstream ss; - ss << input; - TORCH_INTERNAL_ASSERT( - input_def != nullptr, "Inconsistent input found: ", ss.str()); - - // If input is an output of reduction, it should be fully - // initialied as it's allocated on local memory. - if (input_def->isA() || input_def->isA()) { - continue; - } - - // If this expr is reduction, always initilize the input with the - // default value. NOTE: This can be done more - // intelligently. A garbage value can only cause a problem when - // it's reduced with non-garbage values, so if the non-reduction - // axes do not have any garbage, it should be just fine without - // explicit initialization. However, initialization cost should be - // cheap, so that further optimization should not make a large - // difference. - if (expr->isA()) { - setReductionInitValue(input, expr->as()->init()); - continue; - } - - // If an input does not need a predicate either, then it should - // have some value, so no need to set a default value - if (non_predicated_exprs_.find(input_def) != non_predicated_exprs_.end()) { - continue; - } - - // Make sure input is initialized - setDefaultInitValue(input); - } -} - -bool PredicateElimination::setDefaultInitValue(TensorView* tv) { - auto it = init_value_map_.find(tv); - // If there's already a mapping for tv, it should be mapped to a - // zero val or a reduction init. Either case, no need to modify - // the existing mapping. - if (it == init_value_map_.end()) { - init_value_map_.insert({tv, nullptr}); - } - return true; -} - -bool PredicateElimination::setReductionInitValue( - TensorView* tv, - Val* reduction_init) { - auto it = init_value_map_.find(tv); - if (it == init_value_map_.end()) { - init_value_map_.insert({tv, reduction_init}); - return true; - } - - auto existing_val = it->second; - if (existing_val == nullptr) { - // If the existing mapping returns nullptr, it means that a - // default init was set before. Overwrite with the reduction - // init val. 
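The setDefaultInitValue/setReductionInitValue pair above maintains a map in which nullptr means "any zero-like default is acceptable" and a non-null value pins the tensor to a specific reduction identity; a default may later be upgraded to a reduction init, but two different reduction inits are an error. A standalone sketch of that bookkeeping, with std::optional<int> standing in for Val* (names and types here are illustrative only):

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>
#include <unordered_map>

// nullopt plays the role of the nullptr "default init" marker in the map.
class InitValueRegistry {
 public:
  void setDefault(const std::string& tv) {
    // Only insert if nothing is recorded yet; never downgrade a reduction init.
    init_map_.emplace(tv, std::nullopt);
  }

  void setReductionInit(const std::string& tv, int init) {
    auto it = init_map_.find(tv);
    if (it == init_map_.end() || !it->second.has_value()) {
      init_map_[tv] = init;  // new entry, or upgrade of a default
    } else if (*it->second != init) {
      throw std::runtime_error("Inconsistent init value for " + tv);
    }
  }

  std::optional<int> get(const std::string& tv) const {
    auto it = init_map_.find(tv);
    return it == init_map_.end() ? std::nullopt : it->second;
  }

 private:
  std::unordered_map<std::string, std::optional<int>> init_map_;
};

int main() {
  InitValueRegistry reg;
  reg.setDefault("T1");
  reg.setReductionInit("T1", 0);  // upgrades the default to the add identity
  reg.setReductionInit("T1", 0);  // same value again: fine
  std::cout << reg.get("T1").value() << "\n";  // 0
}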
- init_value_map_[tv] = reduction_init; - return true; - } else if (existing_val->sameAs(reduction_init)) { - return true; - } else { - TORCH_INTERNAL_ASSERT( - false, - "Incosistent setting of initialization value for t", - tv->name(), - ". Prev: ", - existing_val, - ", New: ", - reduction_init); - return false; - } -} - -bool PredicateElimination::canOmitPredicate(const Expr* expr) const { - TORCH_INTERNAL_ASSERT(expr != nullptr); - const auto out_tv = ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); - // No need to predicate local tensors to which a scalar is assigned - if (out_tv->getMemoryType() == MemoryType::Local) { - if (auto uop = dynamic_cast(expr)) { - if (uop->getUnaryOpType() == UnaryOpType::Set && uop->in()->isScalar()) { - return true; - } - } - } - if (non_predicated_exprs_.find(expr) != non_predicated_exprs_.end()) { - return true; - } - - return false; -} - -bool PredicateElimination::canOmitPredicate(const kir::Expr* kir_expr) const { - TORCH_INTERNAL_ASSERT(kir_expr != nullptr); - const auto out_tv = ir_utils::getTVOutput(kir_expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); - // No need to predicate local tensors to which a scalar is assigned - if (out_tv->memoryType() == MemoryType::Local) { - if (auto uop = dynamic_cast(kir_expr)) { - if (uop->operation() == UnaryOpType::Set && uop->in()->isScalar()) { - return true; - } - } - } - const auto fuser_tv = out_tv->fuserTv(); - if (fuser_tv == nullptr) { - return false; - } - return canOmitPredicate(fuser_tv->definition()); -} - -kir::Val* PredicateElimination::getInitValue(TensorView* tv) const { - auto it = init_value_map_.find(tv); - if (it == init_value_map_.end()) { - return nullptr; - } - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto init_val = it->second; - if (init_val == nullptr) { - // No reduction restriction. Just use zero - return ir_builder.zeroVal(); - } else { - return gpu_lower->lowerValue(init_val); - } -} - -void PredicateElimination::build(Fusion* fusion) { - traverseFrom(fusion, fusion->outputs()); -} - -std::string PredicateElimination::toString() const { - std::stringstream ss; - ss << "Tensors that do not need predication:"; - for (auto expr : non_predicated_exprs_) { - for (auto out : expr->outputs()) { - TORCH_INTERNAL_ASSERT(out->isA()); - ss << " T" << out->name(); - } - } - ss << "\n"; - ss << "Init values:"; - for (auto kv : init_value_map_) { - ss << " T" << kv.first->name() << "->"; - if (kv.second == nullptr) { - ss << ""; - } else { - ss << kv.second; - } - } - ss << "\n"; - return ss.str(); +std::vector generateConditionalFromPredicate( + const std::vector& exprs) { + return ConditionalFromPredicateModifier::fillPredicates(exprs); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.h b/torch/csrc/jit/codegen/cuda/lower_predicate.h index 393d0fa5c184..7f4926dad917 100644 --- a/torch/csrc/jit/codegen/cuda/lower_predicate.h +++ b/torch/csrc/jit/codegen/cuda/lower_predicate.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -13,49 +13,8 @@ namespace cuda { //! Update predicates with valid bool conditionals //! -std::vector generateConditionalFromPredicate( - Fusion* fusion, - const std::vector& exprs); - -class TORCH_CUDA_CU_API PredicateElimination : public IterVisitor { - public: - void build(Fusion* fusion); - - //! True if expr does not need a predicate - //! - //! 
\param expr Tensor expression - bool canOmitPredicate(const Expr* expr) const; - - //! True if expr does not need a predicate - //! - //! \param expr KIR tensor expr - bool canOmitPredicate(const kir::Expr* expr) const; - - //! Value to initialize out-of-bound regions - kir::Val* getInitValue(TensorView* tv) const; - - //! Dump to string for debugging - std::string toString() const; - - private: - using IterVisitor::handle; - - void handle(Expr* expr) override; - - //! Set a value to initialize out-of-bound regions - bool setDefaultInitValue(TensorView* tv); - //! Set a value to initialize out-of-bound regions of reduction tensors - bool setReductionInitValue(TensorView* tv, Val* reduction_init); - - //! Check if expr needs to be predicated - bool needsPredicate(Expr* expr) const; - - private: - //! Expressions that are found to be safe without predicates - std::unordered_set non_predicated_exprs_; - //! Tensors and their initialization values - std::unordered_map init_value_map_; -}; +std::vector generateConditionalFromPredicate( + const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp new file mode 100644 index 000000000000..53fccbdfc5c6 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp @@ -0,0 +1,715 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Warp primitives are currently limited to un-predicated usage, +// predicating these ops will require extra steps to ensure that +// the whole warp will get the same value. +void assertOnWarpOps(const Expr* expr) { + TORCH_INTERNAL_ASSERT( + !expr->isA(), + "Mma op: cannot eliminate predicate for mma op, tiling not valid. ", + expr->toString()); +} + +} // namespace + +namespace { + +class PredicateAnalyzer : public OptOutDispatch { + public: + //! Checks if a predicate is needed to avoid out-of-bound accesses. + //! + //! Due to the way we allocate local-memory tensors, there should + //! never be out-of-bound accesses with consumer tensors when allocated on + //! local memory. However, accessing producer tensors still may + //! result in out-of-bound as they are replayed as consumers. + static bool needsPredicate(TensorView* producer, TensorView* consumer) { + // Both tensors must be on local memory. Global tensors must be + // predicated as allocation is done based on root domains. Smem + // and local tensors are allocated based on leaf domains, however, + // smem tensors are parallelized, which is highly likely, the size + // of the parallelized axis is the actual size of the axis, not + // the number of threads. 
Since the number of threads can be + // larger than the axis size, it's not safe to skip predication + + // Check that parallel dimension will not generate out of bound index + if (!(producer->getMemoryType() == MemoryType::Local && + consumer->getMemoryType() == MemoryType::Local)) { + return true; + } + + auto pairwise_map = PairwiseRootDomainMap(producer, consumer); + auto c2p = + BestEffortReplay::replayPasC(producer, consumer, -1, pairwise_map) + .getReplay(); + + PredicateAnalyzer analyzer(c2p); + + for (auto id : consumer->domain()->domain()) { + if (analyzer.needsPredicate(id)) { + return true; + } + } + + return false; + } + + private: + PredicateAnalyzer(const std::unordered_map& c2p_map) + : c2p_map_(c2p_map) {} + + // Returns true if no out-of-bound accesses could occur with a + // producer + bool needsPredicate(IterDomain* consumer_id) { + needs_predicate_ = false; + handle(consumer_id); + return needs_predicate_; + } + + void handle(IterDomain* consumer_id) override { + // The traversal should have ended if needs_predicate_ was true + TORCH_INTERNAL_ASSERT(!needs_predicate_); + + // If consumer_id is not going to be materialized as a loop (e.g., + // broadcast), no need to predicate + if (consumer_id->isBroadcast() || + GpuLower::current()->trivialReductionInfo().isDerived(consumer_id)) { + return; + } + + // If the producer has a matching domain, it should not cause + // out-of-bound accesses + if (c2p_map_.find(consumer_id) != c2p_map_.end()) { + return; + } + + // If no definition exists, stop traversing + if (consumer_id->definition() == nullptr) { + return; + } + + OptOutDispatch::handle(consumer_id->definition()); + } + + // If it splits the input axis evenly, proceeds to check the input + // axis. Otherwise, we can't skip predication as it might cause + // out-bound accesses with the producer tensor + void handle(Split* split) override { + auto factor = split->factor()->getInt(); + if (!factor.has_value()) { + needs_predicate_ = true; + return; + } + + ExpressionEvaluator ee(split->fusion()); + const auto in_extent = ee.evaluate(split->in()->extent()); + + if (!in_extent.has_value() || ((in_extent.value() % factor.value()) != 0)) { + needs_predicate_ = true; + return; + } + + handle(split->in()); + } + + void handle(Merge* merge) override { + handle(merge->inner()); + if (needs_predicate_) { + return; + } + handle(merge->outer()); + } + + private: + //! BestEffort map from consumer IDs to producer IDs + const std::unordered_map& c2p_map_; + bool needs_predicate_ = false; +}; + +class PredicateChcker : public IterVisitor { + public: + static bool needsPredicate( + Expr* expr, + const std::unordered_set& non_predicated_exprs) { + if (!ir_utils::isTvOp(expr)) { + return false; + } + + PredicateChcker checker(non_predicated_exprs); + checker.handle(expr); + return checker.needs_predicate_; + } + + private: + PredicateChcker(const std::unordered_set& non_predicated_exprs) + : non_predicated_exprs_(non_predicated_exprs) {} + + using IterVisitor::handle; + + void handle(Expr* expr) final { + needs_predicate_ = predicateIntDiv(expr) || + predicateMisalignedVectorize(expr) || predicateShift(expr) || + predicateProducerConsumerPair(expr) || + predicateNonDivisibleRootDomains(expr) || + predicateNonDivisibleSplit(expr); + + if (needs_predicate_) { + return; + } + + // Check ExprType-specific conditions + IterVisitor::handle(expr); + } + + // All "predicateXYZ" functions return true if an expr needs to be + // predicated. 
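PredicateAnalyzer::handle(Split*) below only allows predication to be skipped when the split factor is a known constant that evenly divides an also-known input extent; anything unknown or non-divisible is treated conservatively. A minimal numeric sketch of that rule, using std::optional where the real code consults ExpressionEvaluator:

#include <iostream>
#include <optional>

// Returns true if the split may produce out-of-bound producer accesses and
// therefore needs a predicate; unknown extents or factors are handled
// conservatively.
bool splitNeedsPredicate(std::optional<long> in_extent,
                         std::optional<long> factor) {
  if (!factor.has_value() || !in_extent.has_value()) {
    return true;  // cannot prove divisibility at compile time
  }
  return (*in_extent % *factor) != 0;
}

int main() {
  std::cout << std::boolalpha;
  std::cout << splitNeedsPredicate(128, 32) << "\n";           // false: 128 % 32 == 0
  std::cout << splitNeedsPredicate(100, 32) << "\n";           // true: non-divisible
  std::cout << splitNeedsPredicate(std::nullopt, 32) << "\n";  // true: unknown extent
}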
+ + // Always predicate integer division and related ops as we don't + // know what values are in the out-of-bound region and they may + // cause exceptions + bool predicateIntDiv(Expr* expr) const { + auto dt = expr->outputs()[0]->getDataType().value(); + return ( + (dt == DataType::Int || dt == DataType::Int32) && + expr->isA() && + (expr->as()->getBinaryOpType() == BinaryOpType::Div || + expr->as()->getBinaryOpType() == BinaryOpType::Mod || + expr->as()->getBinaryOpType() == BinaryOpType::Remainder || + expr->as()->getBinaryOpType() == BinaryOpType::CeilDiv)); + } + + // Skip if MisalignedVectorize is involved for now. This could be + // relaxed. + bool predicateMisalignedVectorize(Expr* expr) const { + std::vector*> inputs_and_outputs = { + &(expr->inputs()), &(expr->outputs())}; + for (const auto& inputs_or_outputs : inputs_and_outputs) { + for (auto tv : ir_utils::filterByType(*inputs_or_outputs)) { + if (std::any_of( + tv->domain()->domain().begin(), + tv->domain()->domain().end(), + [](IterDomain* axis) { + return axis->getParallelType() == + ParallelType::MisalignedVectorize; + })) { + return true; + } + } + } + return false; + } + + // Shift is not supported yet. + bool predicateShift(Expr* expr) const { + auto& halo_info = GpuLower::current()->haloInfo(); + auto input_tvs = ir_utils::filterByType(expr->inputs()); + return halo_info.needsShiftPredicate(expr) || + std::any_of(input_tvs.begin(), input_tvs.end(), [&](auto input_tv) { + return input_tv->definition() != nullptr && + halo_info.needsShiftPredicate(input_tv->definition()); + }); + } + + // Predicates the expression if any producer-consumer pair of the + // expression needs to be predicated + bool predicateProducerConsumerPair(Expr* expr) const { + for (auto output : ir_utils::filterByType(expr->outputs())) { + for (auto input : ir_utils::filterByType(expr->inputs())) { + if (PredicateAnalyzer::needsPredicate(input, output)) { + return true; + } + } + } + return false; + } + + // An index can exceed the logical extent of the indexed domain if + // it's split. It can cause a reduction op to reduce the same value + // multiple times. Even a pointwise op can be a problem if the + // consumer is an alias of the producer. This check excludes such + // expressions from predicate elimination. + // + // This is not an issue if the index includes a zero domain (as defined in + // index_compute.cpp), the extent is calculated by multiplying the + // split output domains, so it never cross the domain boundary. + // So, if a root domain is split and none of its descendants is a + // zero domain, the expr needs to be predicated. See + // FusionPredicateElimination6 for a concrete example. + // + // It would be also possible to avoid register aliasing instead of + // giving up predicate elimination. Since this condition should be + // rather uncommon, either would be fine as long as correctness is + // provided. 
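The comment above is easier to see with numbers: splitting a root domain of extent 10 by a factor of 4 yields ceilDiv(10, 4) = 3 outer iterations, so the loop nest visits 3 * 4 = 12 index values and the last two fall outside the logical extent; without a predicate a reduction would fold those stale register values into the result. A small standalone sketch of that count, purely illustrative:

#include <iostream>

int main() {
  const long extent = 10;  // logical size of the split root domain
  const long factor = 4;   // split factor

  const long outer = (extent + factor - 1) / factor;  // ceilDiv(extent, factor)
  const long visited = outer * factor;                // indices the loop nest produces
  const long out_of_bounds = visited - extent;        // indices past the logical extent

  std::cout << "outer=" << outer << " visited=" << visited
            << " out_of_bounds=" << out_of_bounds << "\n";
  // outer=3 visited=12 out_of_bounds=2
}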
+ bool predicateNonDivisibleRootDomains(Expr* expr) const { + for (auto output : ir_utils::filterByType(expr->outputs())) { + const auto all_exprs = DependencyCheck::getAllExprsBetween( + {output->getMaybeRFactorDomain().begin(), + output->getMaybeRFactorDomain().end()}, + {output->domain()->domain().begin(), + output->domain()->domain().end()}); + std::unordered_set split_root; + std::copy_if( + output->getMaybeRFactorDomain().begin(), + output->getMaybeRFactorDomain().end(), + std::inserter(split_root, split_root.end()), + [&](auto rf_root) { + if (rf_root->isBroadcast() || + GpuLower::current()->trivialReductionInfo().isDerived( + rf_root)) { + return false; + } + for (Expr* use : rf_root->uses()) { + if (std::find(all_exprs.begin(), all_exprs.end(), use) == + all_exprs.end()) { + continue; + } + return use->isA(); + } + return false; + }); + // If no root domain is split, no need to predicate + if (split_root.empty()) { + continue; + } + TORCH_INTERNAL_ASSERT( + output->getMemoryType() == MemoryType::Local, + "Local memory tensor is assumed: ", + output->toString()); + std::vector zero_leaf_ids; + for (const auto i : c10::irange(output->nDims())) { + auto leaf_id = output->axis(i); + if (i < output->getComputeAtPosition() || leaf_id->isThread() || + leaf_id->isMma()) { + zero_leaf_ids.push_back(leaf_id); + } + } + if (zero_leaf_ids.empty()) { + return true; + } + const auto vals = + DependencyCheck::getAllValsBetween(split_root, zero_leaf_ids); + if (std::any_of( + split_root.begin(), + split_root.end(), + [&vals](auto split_root_id) { + return std::find(vals.begin(), vals.end(), split_root_id) == + vals.end(); + })) { + return true; + } + } + return false; + } + + // Always predicate if non-divisible split is found. It may be + // possible to make it less conservative. + // See FusionPredicateElimination7 for a concrete example. + bool predicateNonDivisibleSplit(Expr* expr) const { + const auto& non_divisible_split_info = + GpuLower::current()->nonDivisibleSplitInfo(); + for (auto output : ir_utils::filterByType(expr->outputs())) { + if (non_divisible_split_info.splitsToPredicate().find(output) != + non_divisible_split_info.splitsToPredicate().end()) { + return true; + } + } + return false; + } + + // If this is a reduction, and if we omit the predicate for the + // input, the input may have a garbabe value, which must not be used + // for this reduction. However, it is still legal to omit its + // predicate when: 1) the predicate of the input is not omitted and + // 2) the input can be initialized to the init value of this + // reduction. When the input is the output of another reduciton, the + // input is initialized to the init value of the reduction, so the + // two reductions must use the same init value. + // See FusionPredicateElimination3 and FusionPredicateElimination4 + // for concrete examples. + void handle(ReductionOp* rop) final { + auto input = rop->inputs()[0]->as(); + auto input_def = input->definition(); + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + // The input needs to be initialized to the init value to omit + // the predicate, so if the input has its own init value, i.e., + // produced by another reduction, they must use the same init + // value. 
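The requirement below that a non-predicated reduction input be initialized to the reduction's own identity can be seen with a padded sum: if the out-of-bounds slots hold the additive identity the result is unchanged, while any other value corrupts it. A standalone numeric sketch (not the kernel code itself, just the arithmetic it protects against):

#include <iostream>
#include <vector>

// Sum the first `valid` entries of a buffer padded to a multiple of 4,
// but (like a predicate-free kernel) accumulate over the whole buffer.
long paddedSum(std::vector<long> buf, long valid, long pad_value) {
  for (size_t i = valid; i < buf.size(); ++i) {
    buf[i] = pad_value;  // whatever the out-of-bounds region happens to contain
  }
  long sum = 0;
  for (long v : buf) {
    sum += v;
  }
  return sum;
}

int main() {
  std::vector<long> data = {1, 2, 3, 4, 5, 0, 0, 0};  // 5 valid values, padded to 8
  std::cout << paddedSum(data, 5, 0) << "\n";  // 15: identity padding is harmless
  std::cout << paddedSum(data, 5, 7) << "\n";  // 36: garbage padding corrupts the sum
}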
+ Val* input_init = ir_utils::getReductionInitValOf(input); + if (input_init != nullptr && !rop->init()->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + // If input is not predicated, out-of-bound value may be + // overwritten by a garbage value. However, it doesn't matter if + // the input is also produced by another reduction. If the preceding + // reduction omits the predicate, it means its input must be + // initialized to its init value, so no predicate should be + // needed in both of the two reduction ops if they use the same + // init value, which is guaranteed by the above check, and the + // same reduction op. + if (auto input_def_rop = dynamic_cast(input_def)) { + if (rop->getReductionOpType() != input_def_rop->getReductionOpType() && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } else if ( + non_predicated_exprs_.find(input_def) != non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } + + // Welford. See FusionPredicateElimination5. + void handle(WelfordOp* wop) final { + for (const auto i : c10::irange(3)) { + auto init = wop->getInitVals()[i]; + + // Welford input can be a scalar. Predicate is required unless + // the scalar value is equal to the init value. + auto input = wop->inputs().at(i); + if (input->isScalar()) { + if (!input->sameAs(init)) { + needs_predicate_ = true; + return; + } + continue; + } + + auto input_tv = dynamic_cast(input); + TORCH_INTERNAL_ASSERT(input_tv != nullptr); + + auto input_def = input->definition(); + + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + // The input needs to be initialized to the init value to omit + // the predicate, so if the input has its own init value, i.e., + // produced by another reduction, they must use the same init + // value. + Val* input_init = ir_utils::getReductionInitValOf(input_tv); + if (input_init != nullptr && !init->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + // If input is not predicated, out-of-bound value may be + // overwritten by a garbage value. However, it doesn't matter if + // the input is also produced by another welford. + if (!input_def->isA() && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + } + } + } + + void handle(GroupedReductionOp* grouped_rop) final { + for (const auto i : c10::irange(grouped_rop->numReductions())) { + auto input = grouped_rop->input(i)->as(); + auto input_def = input->definition(); + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + // The input needs to be initialized to the init value to omit + // the predicate, so if the input has its own init value, i.e., + // produced by another reduction, they must use the same init + // value. 
+ Val* input_init = ir_utils::getReductionInitValOf(input); + if (input_init != nullptr && + !grouped_rop->initVal(i)->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + // If input is not predicated, out-of-bound value may be + // overwritten by a garbage value. However, it doesn't matter if + // the input is also produced by another reduction. If the preceding + // reduction omits the predicate, it means its input must be + // initialized to its init value, so no predicate should be + // needed in both of the two reduction ops if they use the same + // init value, which is guaranteed by the above check, and the + // same reduction op. + if (auto input_def_rop = dynamic_cast(input_def)) { + if (grouped_rop->getReductionOpType(i) != + input_def_rop->getReductionOpType() && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } else if ( + auto input_def_grouped_rop = + dynamic_cast(input_def)) { + auto input_index_as_output = std::distance( + input_def_grouped_rop->outputs().begin(), + std::find( + input_def_grouped_rop->outputs().begin(), + input_def_grouped_rop->outputs().end(), + input)); + if (grouped_rop->getReductionOpType(i) != + input_def_grouped_rop->getReductionOpType( + input_index_as_output) && + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } else if ( + non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } + } + + // Similar to the above reduction constraint but for MMA + void handle(MmaOp* mma) final { + for (auto input : ir_utils::filterByType(mma->inputs())) { + auto input_def = input->definition(); + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input); + + Val* input_init = ir_utils::getReductionInitValOf(input); + if (input_init != nullptr && !mma->init()->sameAs(input_init)) { + needs_predicate_ = true; + return; + } + + if (non_predicated_exprs_.find(input_def) != + non_predicated_exprs_.end()) { + needs_predicate_ = true; + return; + } + } + } + + private: + const std::unordered_set& non_predicated_exprs_; + bool needs_predicate_ = false; +}; + +} // namespace + +bool PredicateElimination::needsPredicate(Expr* expr) const { + return PredicateChcker::needsPredicate(expr, non_predicated_exprs_); +} + +void PredicateElimination::handle(Expr* expr) { + if (!ir_utils::isTvOp(expr)) { + return; + } + + if (needsPredicate(expr)) { + assertOnWarpOps(expr); + return; + } + + non_predicated_exprs_.insert(expr); + + // Ensure all inputs have some values set at the out-of-bound + // regions + for (const auto i : c10::irange(expr->inputs().size())) { + auto input = dynamic_cast(expr->inputs()[i]); + if (input == nullptr) { + continue; + } + auto input_def = input->definition(); + // When input_def is null, input must be an input to the fusion, + // so that must be allocated on global memory. Since we don't omit + // predication for expressions involving global memory, this + // should never occur. + TORCH_INTERNAL_ASSERT( + input_def != nullptr, "Inconsistent input found: ", input->toString()); + + // If input is an output of reduction, it should be fully + // initialied as it's allocated on local memory. 
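For background on the init values referenced throughout these handlers: the lowering code reads the init value stored on the ReductionOp, GroupedReductionOp, WelfordOp, or MmaOp node itself, but conceptually each reduction type has a familiar identity element. A small illustrative sketch of those identities (the enum and function names here are made up, not nvfuser API):

#include <iostream>
#include <limits>

enum class ReductionOpType { Add, Mul, Max, Min };

// Typical identity elements; nvfuser takes the init value recorded on the
// reduction expression rather than recomputing it like this.
double reductionIdentity(ReductionOpType op) {
  switch (op) {
    case ReductionOpType::Add:
      return 0.0;
    case ReductionOpType::Mul:
      return 1.0;
    case ReductionOpType::Max:
      return std::numeric_limits<double>::lowest();
    case ReductionOpType::Min:
      return std::numeric_limits<double>::max();
  }
  return 0.0;  // unreachable
}

int main() {
  std::cout << reductionIdentity(ReductionOpType::Add) << " "
            << reductionIdentity(ReductionOpType::Max) << "\n";
}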
+ if (ir_utils::isReductionOp(input_def)) { + continue; + } + + if (expr->isA()) { + setReductionInitValue(input, expr->as()->init()); + continue; + } else if (expr->isA()) { + setReductionInitValue(input, expr->as()->initVal(i)); + continue; + } else if (auto wop = dynamic_cast(expr)) { + Val* init = wop->getInitVals().at(i); + setReductionInitValue(input, init); + continue; + } else if (expr->isA()) { + setReductionInitValue(input, expr->as()->init()); + continue; + } else if ( + non_predicated_exprs_.find(input_def) != non_predicated_exprs_.end()) { + // If an input does not need a predicate either, then it should + // have some value, so no need to set a default value + continue; + } else { + // Make sure input is initialized + setDefaultInitValue(input); + } + } +} + +bool PredicateElimination::setDefaultInitValue(TensorView* tv) { + auto it = init_value_map_.find(tv); + // If there's already a mapping for tv, it should be mapped to a + // zero val or a reduction init. Either case, no need to modify + // the existing mapping. + if (it == init_value_map_.end()) { + init_value_map_.insert({tv, nullptr}); + } + return true; +} + +bool PredicateElimination::setReductionInitValue( + TensorView* tv, + Val* reduction_init) { + TORCH_INTERNAL_ASSERT(tv != nullptr); + + auto it = init_value_map_.find(tv); + if (it == init_value_map_.end()) { + init_value_map_.insert({tv, reduction_init}); + return true; + } + + auto existing_val = it->second; + if (existing_val == nullptr) { + // If the existing mapping returns nullptr, it means that a + // default init was set before. Overwrite with the reduction + // init val. + init_value_map_[tv] = reduction_init; + return true; + } else if (existing_val->sameAs(reduction_init)) { + return true; + } else { + TORCH_INTERNAL_ASSERT( + false, + "Incosistent setting of initialization value for t", + tv->name(), + ". Prev: ", + existing_val, + ", New: ", + reduction_init); + return false; + } +} + +bool PredicateElimination::canOmitPredicate(const Expr* expr) const { + // Predicate elimination can be disabled with + // PYTORCH_NVFUSER_DISABLE=predicate_elimination + if (isDisabled(DisableOption::PredicateElimination)) { + assertOnWarpOps(expr); + return false; + } + + TORCH_INTERNAL_ASSERT(expr != nullptr); + const auto out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); + // No need to predicate local tensors to which a scalar is assigned + if (out_tv->getMemoryType() == MemoryType::Local) { + if (auto uop = dynamic_cast(expr)) { + if (uop->getUnaryOpType() == UnaryOpType::Set && uop->in()->isScalar()) { + return true; + } + } + } + if (non_predicated_exprs_.find(expr) != non_predicated_exprs_.end()) { + return true; + } + + assertOnWarpOps(expr); + return false; +} + +void PredicateElimination::propagateRemovalInfo( + const Expr* from, + const Expr* to) { + if (non_predicated_exprs_.count(from)) { + non_predicated_exprs_.insert(to); + } +} + +Val* PredicateElimination::getInitValue(TensorView* tv) const { + auto it = init_value_map_.find(tv); + if (it == init_value_map_.end()) { + return nullptr; + } + auto init_val = it->second; + if (init_val == nullptr) { + // No reduction restriction. 
Just use zero + return GpuLower::current()->kernel()->zeroVal(); + } else { + return init_val; + } +} + +void PredicateElimination::build(Fusion* fusion) { + traverseFrom(fusion, fusion->outputs()); +} + +std::string PredicateElimination::toString() const { + std::stringstream ss; + ss << "Tensors that do not need predication:"; + for (auto expr : non_predicated_exprs_) { + for (auto out : expr->outputs()) { + TORCH_INTERNAL_ASSERT(out->isA()); + ss << " T" << out->name(); + } + } + ss << "\n"; + ss << "Init values:"; + for (auto kv : init_value_map_) { + ss << " T" << kv.first->name() << "->"; + if (kv.second == nullptr) { + ss << ""; + } else { + ss << kv.second; + } + } + ss << "\n"; + return ss.str(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h new file mode 100644 index 000000000000..557796ce9d4d --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.h @@ -0,0 +1,64 @@ +#pragma once +#include + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class TORCH_CUDA_CU_API PredicateElimination : public IterVisitor { + public: + void build(Fusion* fusion); + + //! True if expr does not need a predicate + //! + //! \param expr Tensor expression + bool canOmitPredicate(const Expr* expr) const; + + //! Value to initialize out-of-bound regions + Val* getInitValue(TensorView* tv) const; + + //! Dump to string for debugging + std::string toString() const; + + // A utility to set removal info of `to` the same as `from`. + // See issue #1641 + // We build predicate info before lowering but more expressions + // are created during lowering that this class also need to + // keep track of to make sure correct predicate removal is + // applied. + // This utility is a quick patch for the missing information + // since it might be better just to recompute predicate info + // if all expressions were mutated, but that'd take much more + // global info to reliably track. + void propagateRemovalInfo(const Expr* from, const Expr* to); + + private: + using IterVisitor::handle; + + void handle(Expr* expr) final; + + //! Set a value to initialize out-of-bound regions + bool setDefaultInitValue(TensorView* tv); + //! Set a value to initialize out-of-bound regions of reduction tensors + bool setReductionInitValue(TensorView* tv, Val* reduction_init); + + //! Check if expr needs to be predicated + bool needsPredicate(Expr* expr) const; + + private: + //! Expressions that are found to be safe without predicates + std::unordered_set non_predicated_exprs_; + //! Tensors and their initialization values + std::unordered_map init_value_map_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp b/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp new file mode 100644 index 000000000000..beec550e537f --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp @@ -0,0 +1,233 @@ +#include +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { +// Going to generate a map of tensor view root domain extents to reduce the +// number used during lowering. 
For example if we have: +// +// T2[i0, i1] = T1[i0, i1] + T2[i2, i3] +// +// We know it would be safe to use: +// +// T2[i0, i1] = T1[i0, i1] + T2[i0, i1] +// +// And that way we don't generate T2.size[0] and T2.size[1], instead we will +// reuse T1.size[0] and T1.size[1] +// This is important when doing CSE as T2 and T1 would otherwise look like +// they're using different values, even though we know they're the same +// +// There's some duplicate logic here that's in computeAt map, but it's not so +// concice there to pull out. May want to consider making this mapping its own +// class especially as it may be useful during scheduling. +std::unordered_map getSimplificationMap(Fusion* fusion) { + std::list> disjoint_root_sets; + std::unordered_map*> + id_to_disjoint_root_set; + + auto map_root_ids = [&disjoint_root_sets, &id_to_disjoint_root_set]( + IterDomain* id0, IterDomain* id1) { + if (id0->isBroadcast() || id1->isBroadcast()) { + return; + } + + auto disjoint_set_0_it = id_to_disjoint_root_set.find(id0); + auto disjoint_set_1_it = id_to_disjoint_root_set.find(id1); + bool set_0_found = disjoint_set_0_it != id_to_disjoint_root_set.end(); + bool set_1_found = disjoint_set_1_it != id_to_disjoint_root_set.end(); + + if (set_0_found && set_1_found) { + if (disjoint_set_0_it->second == disjoint_set_1_it->second) { + return; + } + // merge second disjoint set into first + auto* set_0 = disjoint_set_0_it->second; + auto* set_1 = disjoint_set_1_it->second; + for (auto id : *set_1) { + set_0->emplace(id); + id_to_disjoint_root_set[id] = set_0; + } + // remove second set from disjoint_root_sets + disjoint_root_sets.erase(std::find( + disjoint_root_sets.begin(), disjoint_root_sets.end(), *set_1)); + } else if (set_0_found || set_1_found) { + auto existing_set = + set_0_found ? disjoint_set_0_it->second : disjoint_set_1_it->second; + auto to_add_id = set_0_found ? id1 : id0; + existing_set->emplace(to_add_id); + id_to_disjoint_root_set[to_add_id] = existing_set; + // add entry into existing set + } else { + // create new set entry + disjoint_root_sets.emplace_back(std::unordered_set()); + auto* new_set = &disjoint_root_sets.back(); + new_set->emplace(id0); + new_set->emplace(id1); + id_to_disjoint_root_set[id0] = new_set; + id_to_disjoint_root_set[id1] = new_set; + } + }; + + auto fusion_vals = fusion->usedMathVals(); + for (auto producer_tv : ir_utils::filterByType(fusion_vals)) { + auto consumer_tvs = ir_utils::consumerTvsOf(producer_tv); + for (auto consumer_tv : consumer_tvs) { + auto pairwise_map = PairwiseRootDomainMap(producer_tv, consumer_tv); + auto c2p_root_map = pairwise_map.mapConsumerToProducer( + consumer_tv->domain(), producer_tv->domain()); + for (auto entry : c2p_root_map) { + auto c_id = entry.first; + auto p_id = entry.second; + map_root_ids(p_id, c_id); + } + } + } + + // Map each set to an input ID (if it exists) that has the smallest ->name() + // entry value + std::unordered_map*, IterDomain*> + set_to_input_id; + + // Loop over the root domains, of the inputs to the fusion. Pick an input ID + // to use as the representative ID of the collected sets. Only consider inputs + // as those are the ones that map to values like "T0.size[1]". They are he + // ID's that propagated their extents into the problem. We could also check + // the outputs as we do have C++ examples of using output dimensions for the + // problem size instead of inputs. However, we don't do anything where we can + // translate to those kinds of kernels integrated into PyTorch. 
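The map_root_ids lambda above unions root IterDomains that a producer-consumer root map proves equal, keeping a list of disjoint sets plus a pointer from each ID into its set. A standalone sketch of just that merge step, using ints as stand-in IDs (illustrative only; the real code keys on IterDomain pointers):

#include <iostream>
#include <list>
#include <unordered_map>
#include <unordered_set>

using Id = int;

std::list<std::unordered_set<Id>> disjoint_sets;
std::unordered_map<Id, std::unordered_set<Id>*> id_to_set;

// Record that two root IDs are known to have the same extent.
void mapIds(Id a, Id b) {
  auto it_a = id_to_set.find(a);
  auto it_b = id_to_set.find(b);
  const bool a_found = it_a != id_to_set.end();
  const bool b_found = it_b != id_to_set.end();

  if (a_found && b_found) {
    auto* set_a = it_a->second;
    auto* set_b = it_b->second;
    if (set_a == set_b) {
      return;  // already in the same set
    }
    // Merge b's set into a's, then drop the now-redundant set.
    for (Id id : *set_b) {
      set_a->insert(id);
      id_to_set[id] = set_a;
    }
    disjoint_sets.remove_if(
        [&](const std::unordered_set<Id>& s) { return &s == set_b; });
  } else if (a_found || b_found) {
    auto* existing = a_found ? it_a->second : it_b->second;
    Id to_add = a_found ? b : a;
    existing->insert(to_add);
    id_to_set[to_add] = existing;
  } else {
    disjoint_sets.emplace_back(std::unordered_set<Id>{a, b});
    auto* new_set = &disjoint_sets.back();
    id_to_set[a] = new_set;
    id_to_set[b] = new_set;
  }
}

int main() {
  mapIds(0, 1);  // {0, 1}
  mapIds(2, 3);  // {0, 1} {2, 3}
  mapIds(1, 2);  // merged into {0, 1, 2, 3}
  std::cout << disjoint_sets.size() << " set(s), first has "
            << disjoint_sets.front().size() << " ids\n";  // 1 set(s), first has 4 ids
}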
+ for (auto input_tv : ir_utils::filterByType(fusion->inputs())) { + for (auto id : + TensorDomain::noReductions(input_tv->getMaybeRFactorDomain())) { + auto id_set_it = id_to_disjoint_root_set.find(id); + if (id_set_it == id_to_disjoint_root_set.end()) { + continue; + } + auto* id_set = id_set_it->second; + if (set_to_input_id.find(id_set) == set_to_input_id.end()) { + set_to_input_id[id_set] = id; + } else { + auto input_id_of_set = set_to_input_id.at(id_set); + // Swap id's if new name is less than previously set + bool swap_ids = id->name() < input_id_of_set->name(); + // If new id is a const scalar but previously was'nt use the const + // scalar + swap_ids = swap_ids || + (id->extent()->isConstScalar() && + !input_id_of_set->extent()->isConstScalar()); + // If previous scalar was const and new isn't, don't swap + swap_ids = swap_ids && + !(input_id_of_set->extent()->isConstScalar() && + !id->extent()->isConstScalar()); + + if (swap_ids) { + set_to_input_id[id_set] = id; + } + } + } + } + + // Finally make map from ID extents to the representitive ID extent. + std::unordered_map extent_to_min_input_id_extent; + for (auto entry : set_to_input_id) { + auto* set = entry.first; + auto input_id = entry.second; + for (auto id : *set) { + extent_to_min_input_id_extent[id->extent()] = input_id->extent(); + } + } + return extent_to_min_input_id_extent; +} + +} // namespace + +void replaceSymbolicSizes(Fusion* fusion) { + FUSER_PERF_SCOPE("GpuLower::Lower::replaceSymbolicSizes"); + std::unordered_map tensor_dim_map; + + // Grab inputs and outputs + std::vector inputs_and_outputs; + for (auto val : fusion->inputs()) { + if (ir_utils::isTV(val)) { + inputs_and_outputs.push_back(val->as()); + } + } + // Symbolic size is necessary for outputs if there are no inputs. + // Otherwise infer output sizes from the inputs via expression evaluation. + if (fusion->inputs().empty()) { + for (auto val : fusion->outputs()) { + if (ir_utils::isTV(val)) { + inputs_and_outputs.push_back(val->as()); + } + } + } + + // Generate map for all tensorview root domain values to map them to symbolic + // values. i.e. T0->getRootDomain()[0] would map to a named scalar + // "T0.size[0]". This map will be used when lowering fusion ir to kernel ir. + for (TensorView* tv : inputs_and_outputs) { + // Replace the domain with one based on Ti.size[j] + const std::vector& root_td = tv->getRootDomain(); + + size_t dim = 0; + for (auto id : root_td) { + Val* orig_size = id->extent(); + + // Output sizes could have reduction axes, which isn't what gets output. + // NOLINTNEXTLINE(bugprone-branch-clone) + if (id->isReduction() || + (id->getIterType() == IterType::BroadcastWithoutStride)) { + continue; + } else if ( + id->isRFactorProduct() || + // NOLINTNEXTLINE(bugprone-branch-clone) + (id->getIterType() == IterType::BroadcastWithStride) || + orig_size->isConstScalar()) { + dim++; + continue; + } + + // Currently turn off this part for inputs of segmented fusion, + // since FusionKernelRuntime will provide these as integer inputs + if (tensor_dim_map.find(orig_size) == tensor_dim_map.end() && + !orig_size->isFusionInput() && !orig_size->isConstScalar()) { + std::stringstream ss; + ss << "T" << tv->name() << ".size[" << dim++ << "]"; + tensor_dim_map[orig_size] = IrBuilder::create( + ss.str(), orig_size->getDataType().value()); + } else { + dim++; + } + } + } + + // Use a minimal number of sizes from provided tensors. 
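replaceSymbolicSizes below gives every remaining symbolic extent a stable name of the form "T<tensor>.size[<dim>]" so the generated kernel can read runtime shapes from its tensor arguments. A standalone sketch of just that naming step (the real pass also skips reduction axes, broadcasts, constants, and fusion inputs):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Produce the placeholder names used for a tensor's root-domain extents.
std::vector<std::string> symbolicSizeNames(int tensor_name, size_t ndims) {
  std::vector<std::string> names;
  for (size_t dim = 0; dim < ndims; ++dim) {
    std::stringstream ss;
    ss << "T" << tensor_name << ".size[" << dim << "]";
    names.push_back(ss.str());
  }
  return names;
}

int main() {
  for (const auto& name : symbolicSizeNames(0, 3)) {
    std::cout << name << "\n";  // T0.size[0], T0.size[1], T0.size[2]
  }
}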
+ auto extent_simplification_map = getSimplificationMap(fusion); + for (auto extent_entry : extent_simplification_map) { + auto orig_extent = extent_entry.first; + auto simplified_extent = extent_entry.second; + if (tensor_dim_map.count(orig_extent)) { + if (tensor_dim_map.count(simplified_extent)) { + tensor_dim_map[orig_extent] = tensor_dim_map[simplified_extent]; + } else { + tensor_dim_map[orig_extent] = simplified_extent; + } + } + } + + // Run mutation on the fusion with the tensor_dim_map + ir_utils::replaceValue(fusion, tensor_dim_map); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.h b/torch/csrc/jit/codegen/cuda/lower_replace_size.h new file mode 100644 index 000000000000..81cee9f6ffe0 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_replace_size.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// TensorViews are all based on symbolic sizes. When we first initialize them +// we don't know if they're inputs or outputs which would mean that they have +// runtime shapes. Intermediate tensors (those not going to global memory) do +// not have this information. Since we need to have the correct information in +// the kernel being fetched for shapes, we want to replace input and output +// tensors to reference the runtime structure containing sizes. +void replaceSymbolicSizes(Fusion*); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.cpp b/torch/csrc/jit/codegen/cuda/lower_shift.cpp index 8a4f6980e015..913b246e71ac 100644 --- a/torch/csrc/jit/codegen/cuda/lower_shift.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_shift.cpp @@ -5,8 +5,6 @@ #include #include #include -#include -#include #include #include #include @@ -19,19 +17,17 @@ namespace fuser { namespace cuda { void ShiftPredicateInserter::insert( - kir::Expr* expr, + Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, bool within_unswitch) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - kir::TensorView* out_tv = ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + TensorView* out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); - TensorView* out_fuser_tv = out_tv->fuserTv(); const bool needs_shift_predicate = - gpu_lower->haloInfo().needsShiftPredicate(out_fuser_tv->definition()); + gpu_lower->haloInfo().needsShiftPredicate(out_tv->definition()); if (!needs_shift_predicate) { return; } @@ -48,12 +44,12 @@ void ShiftPredicateInserter::insert( kir::Predicate* thread_pred_expr = nullptr; if (within_unswitch) { - thread_pred_expr = ir_builder.create(thread_pred); + thread_pred_expr = IrBuilder::create(thread_pred); } kir::Predicate* shift_pred = within_unswitch ? 
thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Shift, expr, thread_pred); // If the expr involves a thread-block barrier, set the predicate of @@ -64,7 +60,7 @@ void ShiftPredicateInserter::insert( return; } - auto shift_ite = ir_builder.create(shift_pred); + auto shift_ite = IrBuilder::create(shift_pred); auto& scope = loops.back()->body(); @@ -83,56 +79,33 @@ void ShiftPredicateInserter::insert( } // Padding by zero - kir::Predicate* padding_pred = ir_builder.create( + kir::Predicate* padding_pred = IrBuilder::create( PredicateType::Padding, expr, thread_pred); - auto bounds_ite = ir_builder.create(padding_pred); + auto bounds_ite = IrBuilder::create(padding_pred); const int pad_value = 0; - auto pad_expr = ir_builder.create( - UnaryOpType::Set, out_tv, ir_builder.create(pad_value)); + auto pad_expr = IrBuilder::create( + UnaryOpType::Set, out_tv, IrBuilder::create(pad_value)); bounds_ite->thenBody().push_back(pad_expr); // Insert the else block shift_ite->elseBody().push_back(bounds_ite); } -AxisHaloInfo::AxisHaloInfo() { - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - setWidth(0, ir_builder.zeroVal()); - setWidth(1, ir_builder.zeroVal()); -} - -kir::Int* AxisHaloInfo::width() const { - auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - return ir_builder.addExpr(width(0), width(1))->as(); +int AxisHaloInfo::width() const { + return width(0) + width(1); } -kir::Int* AxisHaloInfo::width(int pos) const { +int AxisHaloInfo::width(int pos) const { TORCH_INTERNAL_ASSERT(pos >= 0 && pos < 2); - TORCH_INTERNAL_ASSERT(widths_[pos] != nullptr); return widths_[pos]; } -void AxisHaloInfo::setWidth(int pos, kir::Int* width) { +void AxisHaloInfo::setWidth(int pos, int width) { TORCH_INTERNAL_ASSERT(pos >= 0 && pos < 2); widths_[pos] = width; } -void AxisHaloInfo::merge(int pos, kir::Int* other) { - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto cur = width(pos); - kir::Int* new_width = nullptr; - if (cur->isConst() && other->isConst()) { - new_width = ir_builder.create( - std::max(cur->value().value(), other->value().value())); - } else if (cur->isZeroInt()) { - new_width = other; - } else if (other->isZeroInt()) { - new_width = cur; - } else { - new_width = ir_builder.maxExpr(width(pos), other)->as(); - } +void AxisHaloInfo::merge(int pos, int other) { + auto new_width = std::max(width(pos), other); setWidth(pos, new_width); } @@ -144,13 +117,12 @@ void AxisHaloInfo::merge(const AxisHaloInfo& other) { bool AxisHaloInfo::hasHalo() const { return std::any_of( - widths_.begin(), widths_.end(), [](auto w) { return !w->isZeroInt(); }); + widths_.begin(), widths_.end(), [](auto w) { return w != 0; }); } std::string AxisHaloInfo::toString() const { std::stringstream ss; - ss << "<" << kir::toString(width(0)) << ", " << kir::toString(width(1)) - << ">"; + ss << "<" << width(0) << ", " << width(1) << ">"; return ss.str(); } @@ -158,38 +130,21 @@ bool HaloInfo::hasRootAxisInfo(IterDomain* id) const { return root_axis_map_.find(id) != root_axis_map_.end(); } -bool HaloInfo::hasRootAxisInfo(kir::IterDomain* id) const { - return kir_root_axis_map_.find(id) != kir_root_axis_map_.end(); -} - const AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) const { + // TODO: Enable this check, was failing in many tests + // TORCH_INTERNAL_ASSERT( + // id->definition() == nullptr || id->isRFactorProduct(), + // "Invalid IterDomain: ", + // id); auto 
it = root_axis_map_.find(id); TORCH_INTERNAL_ASSERT( - it != root_axis_map_.end(), "Halo root axis info not found for ", id); - return it->second; -} - -AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - return const_cast( - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(this)->getRootAxisInfo(id)); -} - -const AxisHaloInfo& HaloInfo::getRootAxisInfo(kir::IterDomain* id) const { - TORCH_INTERNAL_ASSERT( - id->definition() == nullptr || id->isRFactorProduct(), - "Invalid IterDomain: ", - id); - auto it = kir_root_axis_map_.find(id); - TORCH_INTERNAL_ASSERT( - it != kir_root_axis_map_.end(), + it != root_axis_map_.end(), "Halo root axis info not found for ", - kir::toString(id)); + id->toString()); return it->second; } -AxisHaloInfo& HaloInfo::getRootAxisInfo(kir::IterDomain* id) { +AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) return const_cast( // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) @@ -200,9 +155,6 @@ void HaloInfo::setRootAxisInfo( IterDomain* id, const AxisHaloInfo& root_axis_info) { root_axis_map_[id] = root_axis_info; - kir_root_axis_map_ - [GpuLower::current()->lowerValue(id)->as()] = - root_axis_info; initializeFromRootAxisInfo(id); return; @@ -283,9 +235,6 @@ void HaloInfo::propagateRootAxisInfo( const auto& c_root = consumer->getRootDomain(); - auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - for (const auto i : c10::irange(c_root.size())) { auto c_id = c_root[i]; auto it = c2p.find(c_id); @@ -332,31 +281,19 @@ void HaloInfo::propagateRootAxisInfo( p_info.merge(c_info); } else { int pos = (offset > 0) ? 0 : 1; - p_info.merge( - pos, - ir_builder.addExpr(c_info.width(pos), std::abs(offset)) - ->as()); + p_info.merge(pos, c_info.width(pos) + std::abs(offset)); } } else if (auto gather_op = dynamic_cast(expr)) { - const auto window_dim = - gpu_lower->lowerValue(gather_op->windowShape()[i]); - if (window_dim->isOneInt()) { + const auto window_dim = gather_op->windowShape()[i]; + if (window_dim == 1) { p_info.merge(c_info); continue; } - const auto& pad_dim = gather_op->padWidth()[i]; - const auto pad_dim0 = gpu_lower->lowerValue(pad_dim[0])->as(); - p_info.merge( - 0, ir_builder.addExpr(c_info.width(0), pad_dim0)->as()); + const auto pad_dim0 = gather_op->padWidth()[i][0]; + p_info.merge(0, c_info.width(0) + pad_dim0); // The right-side halo is propagated as: // consumer_right_halo + (window_dim - 1 - left_padding) - p_info.merge( - 1, - ir_builder - .subExpr( - ir_builder.addExpr(c_info.width(1), window_dim), - ir_builder.addExpr(pad_dim0, 1)) - ->as()); + p_info.merge(1, c_info.width(1) + window_dim - 1 - pad_dim0); } else { p_info.merge(c_info); } @@ -389,31 +326,28 @@ void HaloInfo::insertToInheritanceMap( void HaloInfo::initializeFromRootAxisInfo(IterDomain* id) { TORCH_INTERNAL_ASSERT(hasRootAxisInfo(id)); - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - const auto& halo_info = getRootAxisInfo(id); auto halo_width = halo_info.width(); if (!halo_info.hasHalo()) { - halo_width_map_[id] = ir_builder.zeroVal(); + setHaloWidth(id, 0); return; } auto expanded_extent = - ir_builder.addExpr(gpu_lower->lowerValue(id->extent()), halo_width); - kir_extent_map_[gpu_lower->lowerValue(id)->as()] = - expanded_extent; + IrBuilder::addExpr(id->extent(), IrBuilder::create(halo_width)); + extent_map_[id] = expanded_extent; 
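With AxisHaloInfo now carrying plain ints, the shift/gather propagation above reduces to simple integer arithmetic: halo widths merge by taking the per-side maximum, a shift adds |offset| to one side, a gather window adds the left padding to the left side and (window - 1 - left padding) to the right side, and the expanded extent is the original extent plus both widths. A standalone sketch with made-up numbers (the real code merges each contribution via max; with a single consumer, as here, a plain add is equivalent):

#include <algorithm>
#include <iostream>

struct AxisHalo {
  int left = 0;
  int right = 0;

  int width() const { return left + right; }
  void merge(const AxisHalo& other) {
    left = std::max(left, other.left);
    right = std::max(right, other.right);
  }
};

// Producer halo induced by shift(consumer, offset) along this axis.
AxisHalo propagateShift(AxisHalo consumer, int offset) {
  AxisHalo p = consumer;
  if (offset > 0) {
    p.left += offset;
  } else if (offset < 0) {
    p.right += -offset;
  }
  return p;
}

// Producer halo induced by gather with `window` elements and `pad_left` padding.
AxisHalo propagateGather(AxisHalo consumer, int window, int pad_left) {
  if (window == 1) {
    return consumer;
  }
  AxisHalo p;
  p.left = consumer.left + pad_left;
  p.right = consumer.right + window - 1 - pad_left;
  return p;
}

int main() {
  AxisHalo c;  // consumer with no halo
  auto shift_halo = propagateShift(c, -2);      // right halo of 2
  auto gather_halo = propagateGather(c, 3, 1);  // <1, 1> halo for a 3-wide window
  shift_halo.merge(gather_halo);                // per-side max: <1, 2>

  const int extent = 16;
  std::cout << "halo <" << shift_halo.left << ", " << shift_halo.right
            << ">, expanded extent = " << extent + shift_halo.width() << "\n";
  // halo <1, 2>, expanded extent = 19
}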
halo_width_map_[id] = halo_width; inheritance_map_[id] = {id}; } +void HaloInfo::setHaloWidth(IterDomain* id, int halo_width) { + halo_width_map_[id] = halo_width; +} + // Propagate extent information from root axes to descendants void HaloInfo::build(TensorDomain* td) { - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto exprs = DependencyCheck::getAllExprsBetween( {td->getMaybeRFactorDomain().begin(), td->getMaybeRFactorDomain().end()}, {td->domain().begin(), td->domain().end()}); @@ -459,33 +393,29 @@ void HaloInfo::build(TensorDomain* td) { auto in_id = split->in(); - const auto& halo_width_it = halo_width_map_.find(in_id); - // If no halo info is found, nothing needs to be done. This ID // must be an ancestor of a domain set by setRootAxisInfo. - if (halo_width_it == halo_width_map_.end()) { + if (!hasHaloWidth(in_id)) { continue; } - const auto halo_width = halo_width_it->second; + const auto halo_width = getHaloWidth(in_id); - if (halo_width->isZeroInt()) { - halo_width_map_.insert({split->outer(), halo_width}); - halo_width_map_.insert({split->inner(), halo_width}); + if (halo_width == 0) { + setHaloWidth(split->outer(), 0); + setHaloWidth(split->inner(), 0); continue; } // propagate to inner domain auto out_id = split->inner(); - auto expanded_extent = ir_builder.addExpr( - gpu_lower->lowerValue(out_id->extent()), halo_width); - kir_extent_map_.insert( - {gpu_lower->lowerValue(out_id)->as(), - expanded_extent}); + auto expanded_extent = + SimplifyingIrBuilder::addExpr(out_id->extent(), halo_width); + extent_map_.insert({out_id, expanded_extent}); - halo_width_map_.insert({split->outer(), ir_builder.zeroVal()}); - halo_width_map_.insert({split->inner(), halo_width}); + setHaloWidth(split->outer(), 0); + setHaloWidth(split->inner(), halo_width); insertToInheritanceMap(td, in_id, split->inner()); } else if (auto merge = dynamic_cast(expr)) { @@ -495,25 +425,24 @@ void HaloInfo::build(TensorDomain* td) { auto outer_extent = getExtent(merge->outer()); if (inner_extent != nullptr || outer_extent != nullptr) { if (inner_extent == nullptr) { - inner_extent = gpu_lower->lowerValue(merge->inner()->extent()); + inner_extent = merge->inner()->extent(); } else { insertToInheritanceMap(td, merge->inner(), merge->out()); } if (outer_extent == nullptr) { - outer_extent = gpu_lower->lowerValue(merge->outer()->extent()); + outer_extent = merge->outer()->extent(); } else { insertToInheritanceMap(td, merge->outer(), merge->out()); } - auto expanded_extent = ir_builder.mulExpr(outer_extent, inner_extent); - kir_extent_map_.insert( - {gpu_lower->lowerValue(merge->out())->as(), - expanded_extent}); + auto expanded_extent = + SimplifyingIrBuilder::mulExpr(outer_extent, inner_extent); + extent_map_.insert({merge->out(), expanded_extent}); // Splitting the output of this merge is not allowed, so // remember it merged_shifted_ids.insert(merge->out()); // Note that halo_width_map_ is not updated } else { - halo_width_map_.insert({merge->out(), ir_builder.zeroVal()}); + setHaloWidth(merge->out(), 0); } } else { TORCH_INTERNAL_ASSERT(false, "Unsupported expr: ", expr); @@ -540,12 +469,11 @@ void HaloInfo::build(TensorDomain* td) { //! vectorization. Vectorization should be eventually supported but //! needs further work. 
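// Illustrative standalone sketch (not code from this patch) of the Split/Merge
// propagation in build() above: halo moves to the inner output of a split,
// whose extent is widened by the halo width, the outer output gets halo 0, and
// a merge multiplies the (possibly widened) extents. Plain ints stand in for
// the symbolic extents the real pass keeps as Vals.
#include <cassert>

struct SplitHalo {
  int inner_extent;  // split factor widened by the halo
  int outer_halo;    // always 0 after the split
  int inner_halo;    // inherits the input halo
};

SplitHalo propagateThroughSplit(int split_factor, int in_halo) {
  return {split_factor + in_halo, 0, in_halo};
}

int propagateThroughMerge(int outer_extent, int inner_extent) {
  // Note: in the pass above halo_width_map_ is not updated for the merge
  // output; only the expanded extent is recorded.
  return outer_extent * inner_extent;
}

int main() {
  // A root axis with halo 2, split by 32: the inner loop covers 34 elements.
  auto s = propagateThroughSplit(/*split_factor=*/32, /*in_halo=*/2);
  assert(s.inner_extent == 34 && s.outer_halo == 0 && s.inner_halo == 2);
  // Merging that inner axis with a 4-wide axis gives a 4 * 34 = 136 extent.
  assert(propagateThroughMerge(4, s.inner_extent) == 136);
}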
void HaloInfo::validate(TensorView* tv) const { - const auto& par_map = GpuLower::current()->caParallelMap(); - const auto& loop_map = GpuLower::current()->caLoopMap(); const auto mem_type = tv->getMemoryType(); for (auto axis : tv->domain()->domain()) { - auto concrete_id = par_map.getConcreteMappedID(axis); + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + axis, IdMappingMode::LOOP); // The extent is assumed to be the same TORCH_INTERNAL_ASSERT( @@ -579,7 +507,7 @@ void HaloInfo::validate(TensorView* tv) const { bool shared_mem_needed = false; for (auto use : tv->uses()) { - if (!ir_utils::isTVOp(use)) { + if (!ir_utils::isTvOp(use)) { continue; } if (use->isA() || use->isA()) { @@ -592,7 +520,8 @@ void HaloInfo::validate(TensorView* tv) const { consumer->domain()->domain().begin(), consumer->domain()->domain().end(), [&](IterDomain* consumer_axis) { - return loop_map.areMapped(axis, consumer_axis); + return GpuLower::current()->caMap()->areMapped( + axis, consumer_axis, IdMappingMode::PERMISSIVE); }); if (it == consumer->domain()->domain().end()) { continue; @@ -629,21 +558,16 @@ void HaloInfo::validate(TensorView* tv) const { return; } -kir::Val* HaloInfo::getExtent(IterDomain* id) const { - auto kir_id = GpuLower::current()->lowerValue(id)->as(); - return getExtent(kir_id); -} - -kir::Val* HaloInfo::getExtent(kir::IterDomain* id) const { - auto it = kir_extent_map_.find(id); - if (it != kir_extent_map_.end()) { +Val* HaloInfo::getExtent(IterDomain* id) const { + auto it = extent_map_.find(id); + if (it != extent_map_.end()) { return it->second; } else { return nullptr; } } -kir::Int* HaloInfo::getHaloWidth(IterDomain* id) const { +int HaloInfo::getHaloWidth(IterDomain* id) const { auto it = halo_width_map_.find(id); TORCH_INTERNAL_ASSERT(it != halo_width_map_.end()); return it->second; @@ -699,7 +623,8 @@ bool extentCompare( Cmp cmp) { auto gpu_lower = GpuLower::current(); TORCH_INTERNAL_ASSERT( - gpu_lower->caLoopMap().areMapped(id1, id2), "Invalid axes to compare"); + gpu_lower->caMap()->areMapped(id1, id2, IdMappingMode::PERMISSIVE), + "Invalid axes to compare"); // It's invalid to compare two axes and when only either of them has // halo. @@ -736,63 +661,11 @@ bool extentCompare( } // namespace bool HaloInfo::extentLessEqual(IterDomain* id1, IterDomain* id2) const { - auto cmp = [](kir::Int* x, kir::Int* y) { - if (x == y) { - return true; - } - auto xv = x->value(); - auto yv = y->value(); - return xv.has_value() && yv.has_value() && xv.value() <= yv.value(); - }; - return extentCompare(*this, id1, id2, cmp); + return extentCompare(*this, id1, id2, std::less_equal<>()); } bool HaloInfo::extentEqual(IterDomain* id1, IterDomain* id2) const { - // Returns true only when x and y are proven to be the same. The - // analysis is not comprehensive and can prove in rather trivial - // cases only. Specifically: - // - x and y are the same pointers - // - Both have static values and they are the same - // - Both are defined by the same expression and the inputs are - // proven to be equal - std::function cmp = [&](kir::Int* x, - kir::Int* y) { - if (x == y) { - return true; - } - - auto xv = x->value(); - auto yv = y->value(); - if (xv.has_value() && yv.has_value() && xv.value() == yv.value()) { - return true; - } - - // Check if both are defined by an expression of the same type. If - // so, recursively check the input operands. 
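// Illustrative aside (not code from this patch): with halo-extended extents
// compared as plain ints, the hand-written kir::Int comparison lambdas being
// removed here collapse to the standard transparent functors passed to
// extentCompare by extentLessEqual / extentEqual.
#include <cassert>
#include <functional>

template <typename Cmp>
bool compareWidths(int a, int b, Cmp cmp) {
  return cmp(a, b);
}

int main() {
  assert(compareWidths(3, 5, std::less_equal<>()));
  assert(!compareWidths(3, 5, std::equal_to<>()));
  assert(compareWidths(5, 5, std::equal_to<>()));
}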
- auto x_def = x->definition(); - auto y_def = y->definition(); - if (x_def && y_def && - ((x_def->isA() && y_def->isA() && - x_def->as()->operation() == - y_def->as()->operation()) || - (x_def->isA() && y_def->isA() && - x_def->as()->operation() == - y_def->as()->operation()))) { - for (const auto i : c10::irange(x_def->inputs().size())) { - auto x_input = dynamic_cast(x_def->inputs()[i]); - auto y_input = dynamic_cast(y_def->inputs()[i]); - // Both must be kir::Int - TORCH_INTERNAL_ASSERT(x_input && y_input); - if (!cmp(x_input, y_input)) { - return false; - } - } - return true; - } - - return false; - }; - return extentCompare(*this, id1, id2, cmp); + return extentCompare(*this, id1, id2, std::equal_to<>()); } std::string HaloInfo::toString() const { @@ -822,16 +695,19 @@ std::string HaloInfo::toString() const { } bool HaloInfo::needsShiftPredicate(Expr* expr) const { - auto consumer_td = ir_utils::getTVOutput(expr)->domain(); - auto shift_expr = dynamic_cast(expr); - auto gather_expr = dynamic_cast(expr); + // In lowering shift and gather turn into a unary op. We really need the shift + // expr. Do a round about trick to grab it: + auto tv_out = ir_utils::getTvOutput(expr); + auto consumer_td = tv_out->domain(); + auto shift_expr = dynamic_cast(tv_out->definition()); + auto gather_expr = dynamic_cast(tv_out->definition()); for (const auto i : c10::irange(consumer_td->getRootDomain().size())) { auto consumer_id = consumer_td->getRootDomain()[i]; const auto consumer_halo_info = getRootAxisInfo(consumer_id); if (consumer_halo_info.hasHalo() || (shift_expr != nullptr && shift_expr->offset(i) != 0 && !consumer_id->isBroadcast()) || - (gather_expr != nullptr && !gather_expr->windowShape()[i]->isOneInt() && + (gather_expr != nullptr && gather_expr->windowShape()[i] != 1 && !consumer_id->isBroadcast())) { return true; } @@ -839,13 +715,6 @@ bool HaloInfo::needsShiftPredicate(Expr* expr) const { return false; } -bool HaloInfo::needsShiftPredicate(kir::Expr* expr) const { - const auto out_tv = expr->outputs()[0]->as(); - auto fuser_expr = out_tv->fuserTv()->definition(); - TORCH_INTERNAL_ASSERT(fuser_expr != nullptr); - return needsShiftPredicate(fuser_expr); -} - } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.h b/torch/csrc/jit/codegen/cuda/lower_shift.h index 378709ca4430..c0fea8c1eadd 100644 --- a/torch/csrc/jit/codegen/cuda/lower_shift.h +++ b/torch/csrc/jit/codegen/cuda/lower_shift.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -16,16 +16,14 @@ namespace cuda { //! Auxiliary class to represent information about halo of an axis class AxisHaloInfo { public: - AxisHaloInfo(); - //! Width of halo. //! //! pos is either 0 or 1. The width of halo at offset zero is set //! when pos is 0. - kir::Int* width(int pos) const; + int width(int pos) const; //! Sum of the widths of both widths - kir::Int* width() const; + int width() const; const auto& widths() const { return widths_; @@ -34,10 +32,10 @@ class AxisHaloInfo { //! Set the halo width of either side. //! pos is either 0 or 1. The width of halo at offset zero is set //! when pos is 0. - void setWidth(int pos, kir::Int* width); + void setWidth(int pos, int width); //! Extend the halo width to account for another axis. - void merge(int pos, kir::Int* other); + void merge(int pos, int other); //! Extend the halo width to account for another axis. void merge(const AxisHaloInfo& other); @@ -53,7 +51,7 @@ class AxisHaloInfo { //! 
widths_[0] is non-zero and designates the size of the //! halo. Similarly, non-zero widths_[1] means the axis has halo at //! the other end of the axis. - std::array widths_ = {nullptr, nullptr}; + std::array widths_ = {0, 0}; }; //! Helper class for lowering tensors with halo. Only valid at the @@ -77,7 +75,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! Returns true if id has the root halo information set by //! setRootAxisInfo. bool hasRootAxisInfo(IterDomain* id) const; - bool hasRootAxisInfo(kir::IterDomain* id) const; //! Returns the registed AxisHaloInfo of a root axis. //! @@ -85,9 +82,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! non-root axes. const AxisHaloInfo& getRootAxisInfo(IterDomain* id) const; AxisHaloInfo& getRootAxisInfo(IterDomain* id); - //! KIR version - const AxisHaloInfo& getRootAxisInfo(kir::IterDomain* id) const; - AxisHaloInfo& getRootAxisInfo(kir::IterDomain* id); //! Query if an axis has a halo width. //! @@ -98,12 +92,11 @@ class TORCH_CUDA_CU_API HaloInfo { //! //! It's an error if queried for an axis with no halo width //! information. - kir::Int* getHaloWidth(IterDomain* id) const; + int getHaloWidth(IterDomain* id) const; //! Returns an extent if id is extended for halo. Nullptr is //! returned otherwise. - kir::Val* getExtent(IterDomain* id) const; - kir::Val* getExtent(kir::IterDomain* id) const; + Val* getExtent(IterDomain* id) const; //! Returns all child domains of a root domain that inherits the //! halo of the root domain. @@ -135,7 +128,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! interior and another for padding. Predicate insertion is done in //! the ShiftPredicateInserter class below. bool needsShiftPredicate(Expr* expr) const; - bool needsShiftPredicate(kir::Expr* expr) const; std::string toString() const; @@ -166,14 +158,14 @@ class TORCH_CUDA_CU_API HaloInfo { //! Validate shift usage void validate(TensorView* td) const; + void setHaloWidth(IterDomain* id, int halo_width); + private: //! Halo information of root axes std::unordered_map root_axis_map_; - //! KIR version - std::unordered_map kir_root_axis_map_; //! Halo-extended extents. No mapping for axes without halo extension - std::unordered_map kir_extent_map_; + std::unordered_map extent_map_; //! The halo width of an axis. //! @@ -209,7 +201,7 @@ class TORCH_CUDA_CU_API HaloInfo { //! inner axis is merged with another axis of extent M, we know that //! the extent of the resulting output axis is 5*M, but we don't //! create its mapping. - std::unordered_map halo_width_map_; + std::unordered_map halo_width_map_; //! Mappings from root domains to child domains that inherit halo std::unordered_map> @@ -224,9 +216,9 @@ class ShiftPredicateInserter { //! the usual predicated expression, so the insertion is also done //! here. static void insert( - kir::Expr* expr, + Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, bool within_unswitch); }; diff --git a/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp b/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp new file mode 100644 index 000000000000..5f3eebceb303 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp @@ -0,0 +1,483 @@ + +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Validate parallelization of a single tensor +void validateParallelizationOfTensor(TensorView* tv) { + // Each ParallelType can be used only once. 
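// Illustrative standalone sketch (not code from this patch) of the two checks
// performed by validateParallelizationOfTensor() in this new file: (1) each
// thread/block parallel type may appear on at most one axis of a tensor, and
// (2) a type the tensor is predicated on must not also parallelize one of its
// axes. A std::bitset and a plain enum stand in for ParallelTypeBitmap.
#include <bitset>
#include <cassert>
#include <vector>

enum ParallelType { TIDx, TIDy, TIDz, BIDx, BIDy, BIDz, kNumTypes };

bool parallelizationIsValid(const std::vector<ParallelType>& axis_types,
                            const std::bitset<kNumTypes>& limited_types) {
  std::bitset<kNumTypes> used;
  for (auto pt : axis_types) {
    if (used.test(pt)) {
      return false;  // same parallel type bound to two axes of one tensor
    }
    used.set(pt);
  }
  // Types the tensor is predicated on must not be used to parallelize it.
  return (used & limited_types).none();
}

int main() {
  std::bitset<kNumTypes> pred;  // no thread predication
  assert(parallelizationIsValid({TIDx, BIDx}, pred));
  assert(!parallelizationIsValid({TIDx, TIDx}, pred));  // TIDx used twice
  pred.set(TIDx);
  assert(!parallelizationIsValid({TIDx}, pred));  // parallelized on predicated type
}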
+ ParallelTypeBitmap pt_map; + for (size_t i = 0; i < tv->nDims(); ++i) { + auto axis = tv->axis(i); + auto ptype = axis->getParallelType(); + if (!isParallelTypeThread(ptype)) { + continue; + } + + // It doesn't matter if this axis is a non-concretized broadcast + // TODO: merging broadcast and non-broadcast + if (axis->isBroadcast() && + !GpuLower::current()->concretizedBroadcastDomains().isConcretized( + axis)) { + continue; + } + + TORCH_INTERNAL_ASSERT( + !pt_map.get(ptype), + "Multiple use of ", + ptype, + " in tensor t", + tv->name(), + ": ", + tv); + pt_map.set(ptype); + } + + // If this tensor is predicated by a paralel type, it should not be + // used to parallelize any domain of this tensor + + const auto thread_pred = + GpuLower::current()->threadPredMap().getPredicateInfo(tv); + + auto predicated_parallel_types = pt_map & thread_pred.limited_types; + + TORCH_INTERNAL_ASSERT( + predicated_parallel_types.none(), + "Invalid parallelization of tensor t", + tv->name(), + ". The tensor is parallelized with ", + predicated_parallel_types.toString(), + ", but it's invalid to use the types as the tensor is also predicated with them.", + ", thread pred: ", + thread_pred.limited_types.toString()); +} + +//! Return true if axis is derived from a root axis that is an input +//! to a CA leaf axis. +bool derivedFromRootCAAxes(TensorView* tv, IterDomain* axis) { + std::vector ca_axes( + tv->domain()->domain().begin(), + tv->domain()->domain().begin() + tv->getComputeAtPosition()); + + auto ca_root_vals = IterVisitor::getInputsTo( + std::vector(ca_axes.begin(), ca_axes.end())); + + auto root_vals = IterVisitor::getInputsTo({axis}); + + return std::any_of( + root_vals.begin(), root_vals.end(), [&ca_root_vals](auto root) { + return std::find(ca_root_vals.begin(), ca_root_vals.end(), root) != + ca_root_vals.end(); + }); +} + +} // namespace + +void SyncMap::build(Fusion* fusion) { + FUSER_PERF_SCOPE("GpuLower::Lower::validateParallelize"); + FusionGuard fg(fusion); + + const auto& ca_map = GpuLower::current()->caMap(); + const auto& pred_map = GpuLower::current()->threadPredMap(); + + auto exprs = StmtSort::getExprs(fusion); + + // Run through expressions and check for communication across threads/blocks + // occuring from producer to consumer of the expression + for (auto expr : exprs) { + if (!ir_utils::isTvOp(expr)) { + continue; + } + + // Validate parallelization of each consumer by itself + for (auto consumer : ir_utils::filterByType(expr->outputs())) { + validateParallelizationOfTensor(consumer); + } + + // It's probably enough to just check all producers to one consumer as + // multi-consumers are guaranteed to be transformed/parallelized the same, + // but to be conservative for now checking every producer <-> consumer + // relationship. + for (auto producer : ir_utils::filterByType(expr->inputs())) { + // Parallelization on input tensors have no effect. + if (producer->isFusionInput()) { + continue; + } + + ParallelTypeBitmap raw_dims; + + const auto parallel_bcast_doms = + pred_map.getParallelBroadcastDomains(producer); + + // Stash information about parallelized producer iteration domains + std::vector producer_parallel_ids( + ParallelTypeBitmap::kNumParallelTypes, nullptr); + ParallelTypeBitmap producer_parallel_bitmap; + + // Tracking for quick check later + std::unordered_set producer_within_compute_at; + + // Get the parallel types that producer will be predicated off in producer + // writes. 
+ // In this case we need a sync whether the producer-consumer axes are + // mapped or not since the predicate pass will generate pattern like + // below to eliminate redundant writes: if(threadIdx.x == 0) + // shared[threadIdx.x + i] = ... + // We will need a raw sync after this pattern for correctness. + auto producer_redundant_types = GpuLower::current() + ->threadPredMap() + .getPredicateInfo(producer) + .redundant_types; + + for (const auto producer_i : c10::irange(producer->nDims())) { + auto producer_axis = producer->axis(producer_i); + auto producer_ptype = + ca_map->getConcreteMappedID(producer_axis, IdMappingMode::LOOP) + ->getParallelType(); + + if (!isParallelTypeThread(producer_ptype)) { + continue; + } + + // Producer reductions shouldn't map to consumers + if (producer_axis->isReduction()) { + continue; + } + + if (producer_i < producer->getComputeAtPosition()) { + producer_within_compute_at.emplace(producer_axis); + } + + producer_parallel_bitmap.set(producer_ptype); + producer_parallel_ids[getParallelTypeBitMapOffset(producer_ptype)] = + producer_axis; + } + + for (auto consumer : + ir_utils::filterByType(expr->outputs())) { + // Stash information about parallelized consumer iteration domains + std::vector consumer_parallel_ids( + ParallelTypeBitmap::kNumParallelTypes, nullptr); + ParallelTypeBitmap consumer_parallel_bitmap; + + for (const auto consumer_i : c10::irange(consumer->nDims())) { + auto consumer_axis = consumer->axis(consumer_i); + auto consumer_ptype = + ca_map->getConcreteMappedID(consumer_axis, IdMappingMode::LOOP) + ->getParallelType(); + + if (!isParallelTypeThread(consumer_ptype)) { + continue; + } + + // When the consumer axis is a broadcast, it is not really + // parallelized unless thread-predicated and eventually concretized + if (consumer_axis->isBroadcast() && + (!parallel_bcast_doms.get(consumer_ptype) || + !GpuLower::current() + ->concretizedBroadcastDomains() + .isConcretized(consumer_axis))) { + continue; + } + + consumer_parallel_bitmap.set(consumer_ptype); + consumer_parallel_ids[getParallelTypeBitMapOffset(consumer_ptype)] = + consumer_axis; + } + + // At this point each parallel type that's present in the consumer or + // the producer will be present in their corresponding `_parallel_ids` + // map going from parallel index type (only size 6 for grid/block dims) + // to the iteration domain of that parallel type. + for (auto parallel_type : kParallelTypeThreads) { + // TIDx is reserved for lane_id in the case of mma ops. + // It is swizzled and handled separately in validateMma. + if (parallel_type == ParallelType::TIDx && expr->isA()) { + continue; + } + + auto parallel_type_i = getParallelTypeBitMapOffset(parallel_type); + + auto p_id = producer_parallel_ids[parallel_type_i]; + auto c_id = consumer_parallel_ids[parallel_type_i]; + + // If consumer is parallelized with this type but producer is + // predicated redundant on this type. This parallel dimension + // is a RAW dimension. See test: FusionSeriaSmemWriteParallelRead1/2 + // + // Even if consumer is not parallelized with this type, would still + // need a raw sync unless all use chain of the producer end with an + // output with the same redundant type. + // TODO: need a separate pass to detect the case where no raw sync + // is needed in this case, i.e. all use-def chains are redundant. 
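// Illustrative standalone sketch (not code from this patch) of the rule the
// comment above describes: when the producer's write is predicated (redundant)
// on a parallel type -- e.g. the "if (threadIdx.x == 0) shared[...] = ..."
// pattern -- the consumer cannot rely on every thread having written its
// element, so a RAW sync is recorded for that type regardless of how the axes
// map. The enum and bitmap are simplified stand-ins.
#include <bitset>
#include <cassert>

enum ParallelType { TIDx, TIDy, TIDz, BIDx, BIDy, BIDz, kNumParallelTypes };
using Bitmap = std::bitset<kNumParallelTypes>;

Bitmap collectRawDims(const Bitmap& producer_redundant_types,
                      const Bitmap& unmatched_parallel_types) {
  Bitmap raw_dims;
  raw_dims |= producer_redundant_types;  // redundant writes always force a sync
  raw_dims |= unmatched_parallel_types;  // so do producer/consumer mismatches
  return raw_dims;
}

int main() {
  Bitmap redundant, unmatched;
  redundant.set(TIDx);  // producer write guarded by threadIdx.x == 0
  Bitmap raw = collectRawDims(redundant, unmatched);
  assert(raw.test(TIDx) && raw.count() == 1);
}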
+ if (producer_redundant_types.get(parallel_type)) { + raw_dims.set(parallel_type); + continue; + } + + if (p_id == nullptr && c_id == nullptr) { + continue; + } else if (p_id != nullptr && c_id != nullptr) { + if (GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE)) { + const auto halo_info = GpuLower::current()->haloInfo(); + + if (halo_info.hasHaloWidth(p_id) != + halo_info.hasHaloWidth(c_id) || + (halo_info.hasHaloWidth(p_id) && + halo_info.hasHaloWidth(c_id) && + halo_info.getHaloWidth(p_id) != + halo_info.getHaloWidth(c_id))) { + raw_dims.set(parallel_type); + continue; + } + } + } else { + if (p_id != nullptr) { + auto it = std::find_if( + consumer->domain()->domain().begin(), + consumer->domain()->domain().end(), + [&](IterDomain* c_id) { + return GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE); + }); + + // If there isn't a mapping from producer to a consumer domain, + // need to assume there's communication across this parallel + // dimension. + c_id = it == consumer->domain()->domain().end() ? nullptr : *it; + // i.e. if producer is parallelized across threadIdx.x in a + // certain split, if the consumer doesn't map to this split, + // then we need to assume it has to be in smem with proper + // syncs. + } else { + auto it = std::find_if( + producer->domain()->domain().begin(), + producer->domain()->domain().end(), + [&](IterDomain* p_id) { + return GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE); + }); + if (it == producer->domain()->domain().end()) { + // Can't infer anything if producer doesn't have a matching axis + // to parallel consumer dim. + continue; + } + p_id = *it; + } + } + + // Comm pattern options (when parallel types don't have matching + // axes) and required memory, Chart is producer parallel type, + // consumer parallel type Parallel types are Serial(S), + // threadIdx(T), blockIdx(B), Memory required for the producer is + // Local(L), Shared(S), Global(G), Sync is None (N/A), blockSync(B), + // grid_sync(G) + // + // P C Mem Req Sync Type + // S S L N/A + // S T L N/A + // S B L N/A + // T S S B + // T T S B + // T B S B + // B S G G + // B T G G + // B B G G + + auto producer_ptype = + ca_map->getConcreteMappedID(p_id, IdMappingMode::LOOP) + ->getParallelType(); + auto consumer_ptype = c_id == nullptr + ? ParallelType::Serial + : ca_map->getConcreteMappedID(c_id, IdMappingMode::LOOP) + ->getParallelType(); + + if (!p_id->isBroadcast() && isParallelTypeThread(producer_ptype) && + !(isParallelTypeThread(consumer_ptype) && + parallel_bcast_doms.get(consumer_ptype)) && + // Being in compute at means consumer and producer rely on the + // same loop size + !producer_within_compute_at.count(p_id) && + // For usage of derivedFromRootCAAxes check + // NVFuserTest.FusionAdvancedIndexing1_CUDA + (c_id == nullptr || !derivedFromRootCAAxes(producer, p_id))) { + // There must be a consumer axis that uses the same indexing + // with the same parallel type as the producer axis. The index + // map is used to to find such an axis. In addition, even when + // no mapped axis is found in the index map, but when an mapped + // axis exists in the loop map, the producer and consumer axes + // may still use the same indexing. That only happens when the + // producer is derived from a root axis that is an input to any + // leaf CA axes. 
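// Illustrative encoding (not code from this patch) of the producer/consumer
// chart in the comment above, so the memory and sync requirements are
// explicit: in that table the requirement is determined solely by how the
// producer axis is parallelized; the consumer column does not change the row.
#include <cassert>

enum class Par { Serial, Thread, Block };   // S, T, B in the chart
enum class Mem { Local, Shared, Global };   // memory required for the producer
enum class Sync { None, BlockSync, GridSync };

struct Requirement { Mem mem; Sync sync; };

Requirement required(Par producer) {
  switch (producer) {
    case Par::Serial: return {Mem::Local, Sync::None};
    case Par::Thread: return {Mem::Shared, Sync::BlockSync};
    case Par::Block:  return {Mem::Global, Sync::GridSync};
  }
  return {Mem::Local, Sync::None};
}

int main() {
  assert(required(Par::Thread).sync == Sync::BlockSync);
  assert(required(Par::Block).mem == Mem::Global);
  assert(required(Par::Serial).sync == Sync::None);
}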
In such a case, the axis in the reference + // tensor that maps to the producer axis is created based on the + // consumer, so both the producer and consumer axes should have + // the same indexing. See issue #995 as well as the + // FusionValidateParallelize6 test for a concrete example. + auto it = std::find_if( + consumer->domain()->domain().begin(), + consumer->domain()->domain().end(), + [&](IterDomain* c_id_) { + return ca_map->areMapped(p_id, c_id_, IdMappingMode::EXACT); + }); + if (it == consumer->domain()->domain().end()) { + if (isParallelTypeThread(producer_ptype)) { + raw_dims.set(producer_ptype); + } + if (isParallelTypeThread(consumer_ptype)) { + raw_dims.set(consumer_ptype); + } + } + } + + // In shift or gather operations, if a thread or block + // domain's root ID is shifted or gathered, it can overlap + // in shared or global memory. This doesn't + // require a RAW sync since each thread would still write every value + // it would read, but it can require a WAR sync for Shared Memory. + // Since there isn't a separate structure for WAR than RAW for now + // we'll flag it on RAW which will trigger the WAR. + // See test FusionValidateParallelizeShift_CUDA for a + // concrete example where this sync is required. + if ((expr->getExprType() == ExprType::GatherOp || + expr->getExprType() == ExprType::ShiftOp) && + producer->getMemoryType() == MemoryType::Shared && + isParallelTypeThreadDim(producer_ptype)) { + std::unordered_set shifted_rfactor_ids; + if (expr->getExprType() == ExprType::GatherOp) { + auto gather_op = expr->as(); + for (auto root_i : + c10::irange(producer->getMaybeRFactorDomain().size())) { + auto rfactor_id = producer->getMaybeRFactorDomain()[root_i]; + // If the window shape is 1, it just copies the + // producer to the consumer + if (gather_op->windowShape()[root_i] != 1) { + shifted_rfactor_ids.insert(rfactor_id); + } + } + } else if (expr->getExprType() == ExprType::ShiftOp) { + auto shift_op = expr->as(); + for (auto root_i : + c10::irange(producer->getMaybeRFactorDomain().size())) { + auto rfactor_id = producer->getMaybeRFactorDomain()[root_i]; + // If the shift offset is 0, it doesn't actually shift + if (shift_op->offsets()[root_i] != 0) { + shifted_rfactor_ids.insert(rfactor_id); + } + } + } + + // Grab all values between shifted rfactor domains and p_id so we + // can identify which rfactor domains are inputs to the p_id + auto p_id_dep_vals = + DependencyCheck::getAllValsBetween(shifted_rfactor_ids, {p_id}); + // If this shifted rfactor domain is an input to p_id, we + // must have a WAR sync. Mark raw sync so it will be generated. + if (!p_id_dep_vals.empty()) { + raw_dims.set(producer_ptype); + } + } + + // If same parallel type and mapped, no need for syncs unless + // producer is in smem, producer parallel type is a thread + // dimension, and consumer concretizes the dimension. This sync is + // due to the redundant predicate omission in lower thread + // predicate. 
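// Illustrative standalone sketch (not code from this patch) of the shift/
// gather check described just below: if any shifted or gathered rfactor
// domain of a shared-memory producer feeds the thread-parallel axis p_id,
// neighbouring threads may touch overlapping smem locations, so a sync is
// flagged on the producer's parallel type. String ids and a precomputed
// dependency set stand in for the IR dependency check.
#include <cassert>
#include <set>
#include <string>

bool shiftedInputFeedsParallelAxis(
    const std::set<std::string>& shifted_rfactor_ids,
    const std::set<std::string>& roots_feeding_p_id) {
  for (const auto& id : roots_feeding_p_id) {
    if (shifted_rfactor_ids.count(id) != 0) {
      return true;  // overlap possible -> flag the sync for this producer axis
    }
  }
  return false;
}

int main() {
  std::set<std::string> shifted = {"I0"};           // root axis moved by ShiftOp
  std::set<std::string> feeds_tidx = {"I0", "I1"};  // p_id derives from I0, I1
  assert(shiftedInputFeedsParallelAxis(shifted, feeds_tidx));
  assert(!shiftedInputFeedsParallelAxis({"I2"}, feeds_tidx));
}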
+ auto redundant_preds = GpuLower::current() + ->threadPredMap() + .getPredicateInfo(producer) + .redundant_types; + + if (p_id->isBroadcast() && + GpuLower::current()->concretizedBroadcastDomains().isConcretized( + p_id) && + producer->getMemoryType() == MemoryType::Shared && + redundant_preds.hasTID()) { + redundant_preds.clearAllBID(); + raw_dims |= redundant_preds; + continue; + } + + // When the producer axis is a broadcast, it is not really + // parallelized unless thread-predicated and concretized + if (isParallelTypeThread(producer_ptype) && p_id->isBroadcast() && + (!parallel_bcast_doms.get(producer_ptype) || + !GpuLower::current() + ->concretizedBroadcastDomains() + .isConcretized(p_id))) { + continue; + } + + // If matching dims and matching parallel types, no comm is necessary. + if (producer_ptype == consumer_ptype && + GpuLower::current()->caMap()->areMapped( + p_id, c_id, IdMappingMode::PERMISSIVE)) { + continue; + } + + // Set parallel dimensions that communication is occuring over. + if (isParallelTypeThread(producer_ptype)) { + raw_dims.set(producer_ptype); + } + } // end for ptypes + + if (raw_dims.hasBID()) { + TORCH_INTERNAL_ASSERT( + producer->getMemoryType() == MemoryType::Global, + "Inconsistent parallelization found between TV", + producer->name(), + " (", + producer->toString(), + ") and TV", + consumer->name(), + "(", + consumer->toString(), + "). Producer is required to be in Global Memory based on parallelization strategy."); + } else if (raw_dims.hasTID()) { + TORCH_INTERNAL_ASSERT( + producer->getMemoryType() == MemoryType::Global || + producer->getMemoryType() == MemoryType::Shared, + "Inconsistent parallelization found between TV", + producer->name(), + " (", + producer->toString(), + ") and TV", + consumer->name(), + "(", + consumer->toString(), + "). Producer is required to be in Global or Shared Memory based on parallelization strategy."); + } + + } // end for consumers + + if (raw_dims.any()) { + needs_raw_sync_[producer] = raw_dims; + } + + } // end producer + } +} + +std::string SyncMap::toString() const { + std::stringstream ss; + ss << "TVs requiring RAW:" << std::endl; + for (auto entry : needs_raw_sync_) { + ss << " " << entry.first->toString() << " :: " << entry.second.toString() + << std::endl; + } + return ss.str(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_sync_information.h b/torch/csrc/jit/codegen/cuda/lower_sync_information.h new file mode 100644 index 000000000000..09fcf9eabd7f --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_sync_information.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class SyncMap { + public: + std::string toString() const; + + //! Validates all tensors are consistently parallelized. Basically, + //! when a producer axis is threaded, either with threadIdx or + //! blockIdx, there must be a mapped consumer axis with the + //! same ParallelType with some exceptions. + //! + //! This function assumes Loop and Parallel ComputeAtMaps are already + //! built as they are used to validate consistency. + //! + //! Fills needs_raw_sync with output TVs if they need a raw sync if on smem or + //! gmem. The second entry in this map is the parallel dimensions being + //! communicated across. 
+ void build(Fusion* fusion); + + ParallelTypeBitmap needsRawSync(TensorView* tv) const { + auto it = needs_raw_sync_.find(tv); + if (it != needs_raw_sync_.end()) { + return it->second; + } + return ParallelTypeBitmap(); + } + + private: + std::unordered_map needs_raw_sync_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp index a7f8768883d0..3769c9c9d974 100644 --- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -17,55 +16,49 @@ namespace cuda { namespace { -kir::Bool* getPredicatePerParallelType( +Bool* getPredicatePerParallelType( ParallelType pt, const ThreadPredicateMap::PredicateInfo& pred_info) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); // If pt is not used or is proven to be one, no need to predicate. if (pt_dim == nullptr || pt_dim->isOneInt()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - // When BID needs to be predicated, that means it's an output of a grid // reduction and only the last block index in that dimension has the right // value from the grid reduce. if (isParallelTypeBlockDim(pt) && pred_info.limited_types.get(pt)) { - return ir_builder - .eqExpr( - kir::NamedScalar::getParallelIndex(pt), - ir_builder.subExpr( - kir::NamedScalar::getParallelDim(pt), ir_builder.oneVal())) - ->as(); + return SimplifyingIrBuilder::eqExpr( + NamedScalar::getParallelIndex(pt), + SimplifyingIrBuilder::subExpr( + NamedScalar::getParallelDim(pt), + GpuLower::current()->kernel()->oneVal())) + ->as(); } // Otherwise, only thread of index 0 executes the computation - return ir_builder - .eqExpr(kir::NamedScalar::getParallelIndex(pt), ir_builder.zeroVal()) - ->as(); + return SimplifyingIrBuilder::eqExpr( + NamedScalar::getParallelIndex(pt), + GpuLower::current()->kernel()->zeroVal()) + ->as(); } } // namespace -kir::Bool* ThreadPredicateMap::getPredicateFromPredicateInfo( +Bool* ThreadPredicateMap::getPredicateFromPredicateInfo( const ThreadPredicateMap::PredicateInfo& pred_info) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - const auto pred_types = pred_info.limited_types | pred_info.redundant_types; if (pred_types.none()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - kir::Bool* pred = nullptr; - + Bool* pred = nullptr; for (const auto pt : pred_types) { const auto tp = getPredicatePerParallelType(pt, pred_info); - pred = ir_builder.andExpr(pred, tp)->as(); + pred = SimplifyingIrBuilder::andExpr(pred, tp)->as(); } - TORCH_INTERNAL_ASSERT(pred != nullptr); return pred; @@ -79,22 +72,44 @@ ParallelTypeBitmap avoidRedundantWrites(const TensorView* out_tv) { // If the memory type is Local, it's fine to write into it always as // it's thread local. If it's Global, it's also fine to let each // thread do its own write, unless out_tv is an output of a - // reduction. Reduction reads from and writes to the tensor, so the - // result would be incorrect if the buffer is shared by redundant - // threads. Correctness issues here come from smem aliasing or grid reductions - // because the reduction itself performs an update to a value, not just a set. 
- const bool is_reduction = out_tv->definition()->isA() || - out_tv->definition()->isA(); + // reduction. Standard reductions (forget gridReduce for the sake of this + // argument) directly into global memory buffers accumulate into the global + // memory buffer. If this is done redundantly then it could lead to incorrect + // results. Correctness issues here can come from smem aliasing, smem + // reductions or gmem reductions because the reduction itself performs an + // update to a value, not just a set. For performance it's safe to ommit the + // redundant writes to gmem or smem, this comment is just specifying it's not + // always just a performance optimization, but can also be a correctness + // requirement. + // + // For now this is enabled for shared memory buffers, global memory buffers + // undergoing a reduction, and global memory buffers with terminating outputs. + // This could be extended to all global memory buffer transactions, but in the + // test AdvancedIndexing11 there's a case where an intermediate global buffer + // is set and used to perform a broadcast. At the moment a grid sync is not + // being inserted here, and it's generally safe since it's just a set. We + // could enable this more generally for global memory buffers, but would have + // to insert a sync or a grid broadcast in that example. For now the + // approach is to only do this on a grid buffer (not undergoing a reduction) + // if there are no other uses in the kernel. + // + // TODO: Revisit if something like AdvancedIndexing11 could be happening at + // the same time of a global reduction in a way that could produce an + // incorrect result. + const bool is_reduction = ir_utils::isReductionOp(out_tv->definition()); if (!(out_tv->getMemoryType() == MemoryType::Shared || - (out_tv->getMemoryType() == MemoryType::Global && is_reduction))) { + (out_tv->getMemoryType() == MemoryType::Global && is_reduction) || + (out_tv->getMemoryType() == MemoryType::Global && + out_tv->uses().empty()))) { return ParallelTypeBitmap(); } + ParallelTypeBitmap pred; // Track which TID types are not used to find redundant parallel - // types. Only TID types are checked as the tensor is on shared - // memory. + // types. Only TID types are checked if the tensor is on shared + // memory otherwise on global memory all TID and BID types are checked. ParallelTypeBitmap unused_types; - // Initially all types are conservatively assumed to be used. + // Initially all types are conservatively assumed to not be used. unused_types = ~unused_types; for (auto out_tv_id : out_tv->domain()->domain()) { auto pt = out_tv_id->getParallelType(); @@ -104,8 +119,22 @@ ParallelTypeBitmap avoidRedundantWrites(const TensorView* out_tv) { // If the axis is a broadcast domain and is parallelized by TID, // it is sufficient to use just one thread since the tensor is on // shared memory. - if (out_tv->getMemoryType() == MemoryType::Shared && - out_tv_id->isBroadcast() && isParallelTypeThreadDim(pt)) { + if ((out_tv->getMemoryType() == MemoryType::Shared && + out_tv_id->isBroadcast() && isParallelTypeThreadDim(pt)) || + // Protect against global memory and is_reduction as we don't want to + // predicate grid dimensions as codegen will complain predication on + // block dimensions is not allowed in grid reductions. 
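// Illustrative standalone restatement (not code from this patch) of when the
// rewritten avoidRedundantWrites() computes redundant parallel types at all:
// shared-memory tensors, global tensors produced by a reduction, and global
// tensors with no further uses (terminating outputs). Everything else returns
// an empty bitmap. The enum and booleans are simplified stand-ins.
#include <cassert>

enum class MemoryType { Local, Shared, Global };

bool eligibleForRedundantWriteElision(MemoryType mem,
                                      bool defined_by_reduction,
                                      bool has_uses) {
  return mem == MemoryType::Shared ||
      (mem == MemoryType::Global && defined_by_reduction) ||
      (mem == MemoryType::Global && !has_uses);
}

int main() {
  // A plain global intermediate that is still consumed (e.g. the broadcast in
  // the AdvancedIndexing11 case mentioned above) is deliberately left alone.
  assert(!eligibleForRedundantWriteElision(MemoryType::Global, false, true));
  assert(eligibleForRedundantWriteElision(MemoryType::Global, true, true));
  assert(eligibleForRedundantWriteElision(MemoryType::Shared, false, true));
  assert(eligibleForRedundantWriteElision(MemoryType::Global, false, false));
}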
The old + // grid reduction runtime kernel does not differentiate + // non-reduction and predicated parallel types, so the sync + // integer buffer would need to be expanded even for + // predicated parallel types, which is not what + // getGridSyncBufferSize does. The right thing here is either: + // retire the old grid reduction kernel, or update the kernel + // to propertly ignore predicated types. The new kernel is + // significantly complex and has not been tested, so the + // latter option seems more reasonable for now. See #1671. + (!is_reduction && out_tv->getMemoryType() == MemoryType::Global && + out_tv_id->isBroadcast() && isParallelTypeThread(pt))) { pred.set(pt); } unused_types.clear(pt); @@ -138,7 +167,7 @@ ParallelTypeBitmap getReductionPredicateForUnusedParallelTypes( const TensorView* tv, const ThreadPredicateMap::PredicateInfo& pred_info) { auto tv_def = tv->definition(); - if (!(tv_def && (tv_def->isA() || tv_def->isA()) && + if (!(tv_def && ir_utils::isReductionOp(tv_def) && tv->getMemoryType() == MemoryType::Global)) { return {}; } @@ -153,6 +182,21 @@ ParallelTypeBitmap getReductionPredicateForUnusedParallelTypes( void ThreadPredicateMap::updateBitSet(const Expr* expr) { FUSER_PERF_SCOPE("GpuLower::Lower::ThreadPredicateMap::updateBitSet"); + // If all of the inputs are not updated and all of the outputs have + // already mappings, don't do anything + if (std::all_of( + ir_utils::filterByType(expr->inputs()).begin(), + ir_utils::filterByType(expr->inputs()).end(), + [this](TensorView* tv) { + return updated_tvs_.find(tv) == updated_tvs_.end(); + }) && + std::all_of( + ir_utils::filterByType(expr->outputs()).begin(), + ir_utils::filterByType(expr->outputs()).end(), + [this](TensorView* tv) { return find(tv) != end(); })) { + return; + } + // Which predicates were set for the inputs ParallelTypeBitmap input_preds; @@ -188,10 +232,13 @@ void ThreadPredicateMap::updateBitSet(const Expr* expr) { for (auto id : tv_inp->domain()->domain()) { if (id->isThread()) { id_ptypes.set(id->getParallelType()); - if (id->isReduction()) { + if (id->isReduction() && + !GpuLower::current()->fusedReductionInfo().isAllreduce(id)) { id_reductions.set(id->getParallelType()); } - if (id->isBroadcast()) { + if (id->isBroadcast() && + GpuLower::current()->concretizedBroadcastDomains().isConcretized( + id)) { id_bcasts.set(id->getParallelType()); } } @@ -233,9 +280,8 @@ void ThreadPredicateMap::updateBitSet(const Expr* expr) { // Run through outputs and set bitset predicates for (auto* out_tv : ir_utils::filterByType(expr->outputs())) { - TORCH_INTERNAL_ASSERT(find(out_tv) == end()); auto redundant_types = avoidRedundantWrites(out_tv); - insert(out_tv, output_preds, redundant_types); + update(out_tv, output_preds, redundant_types); } } @@ -245,12 +291,13 @@ void ThreadPredicateMap::build(Fusion* fusion) { // Initialize mapping for input tensors for (auto inp : fusion->inputs()) { if (auto tv = dynamic_cast(inp)) { - insert(tv, ParallelTypeBitmap(), ParallelTypeBitmap()); + update(tv, ParallelTypeBitmap(), ParallelTypeBitmap()); } } for (auto expr : fusion->exprs()) { updateBitSet(expr); } + updated_tvs_.clear(); } ThreadPredicateMap::const_iterator ThreadPredicateMap::find( @@ -289,20 +336,34 @@ ParallelTypeBitmap ThreadPredicateMap::getPredicatedParallelTypes( return pred_info.limited_types | pred_info.redundant_types; } -void ThreadPredicateMap::insert( +bool ThreadPredicateMap::update( const TensorView* tv, - const ParallelTypeBitmap& valid_types, + const ParallelTypeBitmap& limited_types, 
const ParallelTypeBitmap& redundant_types) { - insert(tv, {valid_types, redundant_types}); + return update(tv, {limited_types, redundant_types}); } -void ThreadPredicateMap::insert( +bool ThreadPredicateMap::update( const TensorView* tv, const PredicateInfo& pred_info) { - thread_predicates_.insert({tv, pred_info}); + auto existing_mapping_it = thread_predicates_.find(tv); + if (existing_mapping_it != end()) { + PredicateInfo& existing_info = existing_mapping_it->second; + if (existing_info == pred_info) { + return false; + } else { + existing_info = pred_info; + markAsUpdated(tv); + return true; + } + } else { + thread_predicates_.insert({tv, pred_info}); + markAsUpdated(tv); + return true; + } } -kir::Bool* ThreadPredicateMap::getPredicate(const TensorView* tv) const { +Bool* ThreadPredicateMap::getPredicate(const TensorView* tv) const { TORCH_INTERNAL_ASSERT(find(tv) != end(), "Couldn't find ", tv); auto pred_info = getPredicateInfo(tv); return getPredicateFromPredicateInfo(pred_info); @@ -326,7 +387,8 @@ ParallelTypeBitmap ThreadPredicateMap::getParallelBroadcastDomains( const bool output_smem = tv->getMemoryType() == MemoryType::Shared; for (auto id : iter_domains) { - if (!id->isBroadcast()) { + if (!id->isBroadcast() || + !GpuLower::current()->concretizedBroadcastDomains().isConcretized(id)) { continue; } if (id->isBlockDim() || (!output_smem && id->isThreadDim())) { @@ -337,6 +399,10 @@ ParallelTypeBitmap ThreadPredicateMap::getParallelBroadcastDomains( return parallel_broadcast & at(tv).limited_types; } +void ThreadPredicateMap::markAsUpdated(const TensorView* tv) { + updated_tvs_.insert(tv); +} + void ThreadPredicateMap::print() const { std::cout << "\nThreadPredicateMap\n"; std::cout << "--------------------------------\n"; diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h index 256e0385aeb1..2fb115953c6e 100644 --- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h +++ b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h @@ -1,7 +1,7 @@ #pragma once -#include +#include #include #include @@ -48,6 +48,10 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { ParallelTypeBitmap limited_types; // Parallel types where only one thread/block is enough. ParallelTypeBitmap redundant_types; + bool operator==(const PredicateInfo& other) const { + return limited_types == other.limited_types && + redundant_types == other.redundant_types; + } }; using MapType = std::unordered_map; @@ -69,7 +73,7 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { ParallelTypeBitmap getPredicatedParallelTypes(const TensorView* tv) const; //! Returns a Bool predicate for a given TensorView. - kir::Bool* getPredicate(const TensorView* tv) const; + Bool* getPredicate(const TensorView* tv) const; //! Returns a ParallelTypeBitmap representing which domain needs //! blockBroadcast. @@ -78,10 +82,14 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { //! blockBroadcast unless it is predicated by limited_types_ ParallelTypeBitmap getParallelBroadcastDomains(const TensorView* tv) const; + //! Mark tv as updated so that rebuilding the map should recompute + //! its predicates and those of its dependents. + void markAsUpdated(const TensorView* tv); + void print() const; //! Generate a Bool value from PredicateInfo. 
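// Illustrative standalone sketch (not code from this patch) of the insert ->
// update change above: the map now reports whether an entry actually changed
// and remembers changed keys, so a later rebuild can skip expressions whose
// inputs are untouched. Generic key/value types stand in for TensorView and
// PredicateInfo.
#include <cassert>
#include <string>
#include <unordered_map>
#include <unordered_set>

template <typename K, typename V>
class IncrementalMap {
 public:
  // Returns true only when the stored value was inserted or modified.
  bool update(const K& key, const V& value) {
    auto it = map_.find(key);
    if (it != map_.end() && it->second == value) {
      return false;  // nothing changed; dependents need no recomputation
    }
    map_[key] = value;
    updated_.insert(key);
    return true;
  }
  bool wasUpdated(const K& key) const { return updated_.count(key) != 0; }
  void clearUpdated() { updated_.clear(); }

 private:
  std::unordered_map<K, V> map_;
  std::unordered_set<K> updated_;
};

int main() {
  IncrementalMap<std::string, int> preds;
  assert(preds.update("tv0", 1));   // new entry -> changed
  assert(!preds.update("tv0", 1));  // same value -> no work for dependents
  assert(preds.update("tv0", 2));   // value changed -> dependents recompute
  assert(preds.wasUpdated("tv0"));
}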
- static kir::Bool* getPredicateFromPredicateInfo( + static Bool* getPredicateFromPredicateInfo( const ThreadPredicateMap::PredicateInfo& pred_info); private: @@ -94,17 +102,19 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { const PredicateInfo& at(const TensorView* tv) const; PredicateInfo& at(const TensorView* tv); - //! Insert a new mapping - void insert( + //! Update a mapping + bool update( const TensorView* tv, - const ParallelTypeBitmap& valid_types, + const ParallelTypeBitmap& limited_types, const ParallelTypeBitmap& redundant_types); - //! Insert a new mapping - void insert(const TensorView* tv, const PredicateInfo& pred_and_src); + //! Update a mapping + bool update(const TensorView* tv, const PredicateInfo& pred_and_src); private: MapType thread_predicates_; + //! Keep track of updated tensors that need predicates to be computed + std::unordered_set updated_tvs_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp new file mode 100644 index 000000000000..ab62530591ab --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +void ConcretizedBroadcastDomains::build(Fusion* fusion) { + // Initialize the origin map with input broadcast domains + for (const auto fusion_input_tv : + ir_utils::filterByType(fusion->inputs())) { + for (auto root_id : fusion_input_tv->getRootDomain()) { + if (root_id->isBroadcast()) { + broadcast_origin_map_.emplace( + root_id, std::unordered_set({root_id})); + } + } + } + traverse(fusion); +} + +bool ConcretizedBroadcastDomains::isConcretized(IterDomain* id) const { + auto it = concretized_domains_.find(id); + return it != concretized_domains_.end(); +} + +void ConcretizedBroadcastDomains::handle(BroadcastOp* bop) { + // Create a new entry for each of new broadcast domains + auto out = bop->out()->as(); + for (const auto i : c10::irange(out->getRootDomain().size())) { + if (bop->getBroadcastDimFlags().at(i)) { + auto new_bcast_id = out->getRootDomain().at(i); + broadcast_origin_map_.emplace( + new_bcast_id, std::unordered_set({new_bcast_id})); + } + } +} + +void ConcretizedBroadcastDomains::handle(Expr* expr) { + IterVisitor::handle(expr); + + // Propagate broadcast origin info from producers to consumers + for (auto producer : ir_utils::filterByType(expr->inputs())) { + std::unordered_set producer_broadcasts; + // This assumes there's no merged broadcast axes between root and rfactor + // domains which is not possible at the moment. If this assumption is ever + // invalidated we would need to manaually propagate root IDs to rfactor IDs. 
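// Illustrative standalone sketch (not code from this patch) of the broadcast
// concretization bookkeeping in the new ConcretizedBroadcastDomains pass
// around this point: every broadcast domain carries the set of root broadcast
// domains it came from, and when it maps to a non-broadcast consumer domain
// all of those origins are marked concretized. Plain strings and sets stand in
// for IterDomains and the IR maps.
#include <cassert>
#include <set>
#include <string>
#include <unordered_map>

using Id = std::string;
using IdSet = std::set<Id>;

struct ConcretizationSketch {
  std::unordered_map<Id, IdSet> origin;  // broadcast domain -> root broadcasts
  IdSet concretized;

  void propagate(const Id& producer_id, const Id& consumer_id,
                 bool consumer_is_broadcast) {
    const IdSet& producer_origins = origin.at(producer_id);
    if (!consumer_is_broadcast) {
      // Concretized: every origin root broadcast is resolved here.
      concretized.insert(producer_origins.begin(), producer_origins.end());
    } else {
      // Still a broadcast: forward the origin info to the consumer domain.
      IdSet& consumer_origins = origin[consumer_id];
      consumer_origins.insert(producer_origins.begin(), producer_origins.end());
      consumer_origins.insert(consumer_id);
    }
  }
};

int main() {
  ConcretizationSketch s;
  s.origin["b0"] = {"b0"};  // broadcast introduced on a fusion input
  s.propagate("b0", "b1", /*consumer_is_broadcast=*/true);   // still broadcast
  s.propagate("b1", "i0", /*consumer_is_broadcast=*/false);  // resolved later
  assert(s.concretized.count("b0") == 1);
}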
+ for (auto producer_id : producer->getMaybeRFactorDomain()) { + if (producer_id->isBroadcast()) { + producer_broadcasts.insert(producer_id); + } + } + if (producer_broadcasts.empty()) { + continue; + } + + for (auto consumer : ir_utils::filterByType(expr->outputs())) { + auto p2c_map = + PairwiseRootDomainMap(producer, consumer) + .mapProducerToConsumer( + producer->domain(), consumer->domain(), producer_broadcasts); + for (const auto& kv : p2c_map) { + auto p_id = kv.first; + auto c_id = kv.second; + const bool is_concretized = !c_id->isBroadcast(); + auto it = broadcast_origin_map_.find(p_id); + TORCH_INTERNAL_ASSERT( + it != broadcast_origin_map_.end(), + "Broadcast origin info not found for producer broadcast domain: ", + p_id->toString(), + " of ", + producer->toString()); + const auto& producer_origins = it->second; + if (is_concretized) { + // Keep track of all the origin domains as concretized + for (auto origin : producer_origins) { + // concretized_root_domains_.insert(origin); + markAsConcretized(origin); + } + } else { + // Not concretized yet. Propagate forward the origin info. + auto& consumer_origins = broadcast_origin_map_[c_id]; + for (auto origin : producer_origins) { + consumer_origins.insert(origin); + } + consumer_origins.insert(c_id); + } + } + } + } +} + +void ConcretizedBroadcastDomains::markAsConcretized(IterDomain* root_domain) { + std::deque child_domains({root_domain}); + while (!child_domains.empty()) { + auto child = child_domains.front(); + child_domains.pop_front(); + if (!concretized_domains_.emplace(child).second) { + continue; + } + const auto& child_uses = child->uses(); + for (auto child_use : child_uses) { + for (auto out_id : + ir_utils::filterByType(child_use->outputs())) { + child_domains.push_back(out_id); + } + } + } +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h new file mode 100644 index 000000000000..9dd50e8afc1d --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h @@ -0,0 +1,51 @@ +#pragma once + +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Traverse and collect all concretized broadcast domains. +//! +//! The traversal first initializes the origin map with broadcast +//! domains in input tensors. Then, a new entry is added to the origin +//! map when a broadcast op is encountered during a forward traversal +//! of the given fusion. For non-broadcast ops, mappings are just +//! propagated forward using PairwiseRootDomainMap. +//! +//! When the mapped consumer domain is not broadcast, it means the +//! producer broadcast domain is concretized, and its origin broadcast +//! domains are marked as concretized. +class TORCH_CUDA_CU_API ConcretizedBroadcastDomains : private IterVisitor { + public: + void build(Fusion* fusion); + + bool isConcretized(IterDomain* id) const; + + private: + using IterVisitor::handle; + + void handle(BroadcastOp* bop) final; + + void handle(Expr* expr) final; + + void markAsConcretized(IterDomain* root_domain); + + private: + //! Maps each broadcast domain to its original broadcast + //! domains. Their can be multiple original domains due to, e.g., + //! binary ops with broadcast domains in both inputs. + std::unordered_map> + broadcast_origin_map_; + //! 
Set of all concretized original domains + std::unordered_set concretized_domains_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp index 33651785d43c..9922b243e4ee 100644 --- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp @@ -18,6 +18,7 @@ namespace { bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id); +// Checks the producer of tv to see if the bool traverseToRFactorTensor(TensorView* tv, IterDomain* root_id) { TORCH_INTERNAL_ASSERT( root_id->definition() == nullptr, "Not root IterDomain: ", root_id); @@ -29,6 +30,7 @@ bool traverseToRFactorTensor(TensorView* tv, IterDomain* root_id) { const auto& inputs = tv->definition()->inputs(); + // Check the reduction expression that produces tv if (inputs.size() != 1 || !inputs[0]->isA() || (tv->definition()->getExprType() != ExprType::ReductionOp && tv->definition()->getExprType() != ExprType::WelfordOp)) { @@ -63,8 +65,10 @@ bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id) { continue; } // If not possible to prove the root ID is trivial, see if the ID - // is derived from a rfactor tensor and, if so, continue the - // analysis at the rfactor tensor. + // is derived from a rfactor tensor. This may mean that the iteration domain + // was merged or split in another expression through rfactor. Trace back + // through rfactor expressions to find original roots and determine there if + // trivial. if (!traverseToRFactorTensor(tv, root_id)) { return false; } @@ -74,7 +78,7 @@ bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id) { } // namespace -void TrivialReductionInfo::build(Fusion* fusion, GpuLower* gpu_lower) { +void TrivialReductionInfo::build(Fusion* fusion) { auto used_vals = fusion->usedMathVals(); for (auto tv : ir_utils::filterByType(used_vals)) { @@ -99,20 +103,6 @@ void TrivialReductionInfo::build(Fusion* fusion, GpuLower* gpu_lower) { } } } - - buildKir(fusion, gpu_lower); -} - -void TrivialReductionInfo::buildKir(Fusion* fusion, GpuLower* gpu_lower) { - for (auto id : domains_) { - auto kir_trivial_id = gpu_lower->lowerValue(id)->as(); - kir_domains_.insert(kir_trivial_id); - } - - for (auto id : domains_derived_from_root_) { - auto kir_trivial_id = gpu_lower->lowerValue(id)->as(); - kir_domains_derived_from_root_.insert(kir_trivial_id); - } } bool TrivialReductionInfo::isDerived(IterDomain* id) const { @@ -124,15 +114,6 @@ bool TrivialReductionInfo::isDerivedFromRoot(IterDomain* id) const { domains_derived_from_root_.end(); } -bool TrivialReductionInfo::isDerived(kir::IterDomain* id) const { - return kir_domains_.find(id) != kir_domains_.end(); -} - -bool TrivialReductionInfo::isDerivedFromRoot(kir::IterDomain* id) const { - return kir_domains_derived_from_root_.find(id) != - kir_domains_derived_from_root_.end(); -} - } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h index c16439ed4f03..655d64a04179 100644 --- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -13,23 +13,16 @@ namespace jit { namespace fuser { namespace cuda { -class GpuLower; - //! 
Detect almost all IterDomains that are derived from trivial //! reductons. class TORCH_CUDA_CU_API TrivialReductionInfo { public: - void build(Fusion* fusion, GpuLower* gpu_lower); + void build(Fusion* fusion); bool isDerived(IterDomain* id) const; - bool isDerivedFromRoot(IterDomain* id) const; - - bool isDerived(kir::IterDomain* id) const; - bool isDerivedFromRoot(kir::IterDomain* id) const; - private: - //! Convert the sets to KIR sets - void buildKir(Fusion* fusion, GpuLower* gpu_lower); + // TODO: Not used, cleanup + bool isDerivedFromRoot(IterDomain* id) const; private: //! IterDomains that are derived only from trivial @@ -48,9 +41,6 @@ class TORCH_CUDA_CU_API TrivialReductionInfo { //! trivial reductions. These domains do not need to manifest as //! for-loops. std::unordered_set domains_derived_from_root_; - - std::unordered_set kir_domains_; - std::unordered_set kir_domains_derived_from_root_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp index 08f91ba59bd7..434d1711d9c8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include #include @@ -22,8 +20,7 @@ namespace { // Provide a new for loop matching the one provided kir::ForLoop* cloneLoopNest(const kir::ForLoop* for_loop) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - const auto new_loop = ir_builder.create(for_loop); + const auto new_loop = IrBuilder::create(for_loop); for (auto expr : for_loop->body().exprs()) { if (auto nested_for_loop = dynamic_cast(expr)) { expr = cloneLoopNest(nested_for_loop); @@ -35,20 +32,20 @@ kir::ForLoop* cloneLoopNest(const kir::ForLoop* for_loop) { // Returns true if expr is an expression that initializes a reduction // buffer. -bool isReductionInitExpr(const kir::Expr* expr) { +bool isReductionInitExpr(const Expr* expr) { // False if its output isn't a TensorView - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } // False if it doesn't have any reduction axis - const auto out_tv = expr->outputs()[0]->as(); + const auto out_tv = expr->outputs()[0]->as(); if (!out_tv->domain()->hasReduction()) { return false; } // False if it has have TensorView inputs as initialization should // never use TensorViews const auto tv_filter_inp_view = - ir_utils::filterByType(expr->inputs()); + ir_utils::filterByType(expr->inputs()); if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { return false; } @@ -57,28 +54,27 @@ bool isReductionInitExpr(const kir::Expr* expr) { } // namespace -void UnrollPass::handle(kir::Expr* expr) { - if (ir_utils::isTVOp(expr)) { +void UnrollPass::handle(Expr* expr) { + if (ir_utils::isTvOp(expr)) { // If tv op, predicate it - const auto out_tv = ir_utils::getTVOutput(expr); + const auto out_tv = ir_utils::getTvOutput(expr); const bool should_predicate = !for_loops_.empty() || - out_tv->memoryType() == MemoryType::Global || - out_tv->memoryType() == MemoryType::Shared; + out_tv->getMemoryType() == MemoryType::Global || + out_tv->getMemoryType() == MemoryType::Shared; if (!should_predicate) { return; } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); const auto thread_pred = isReductionInitExpr(expr) - ? ir_builder.trueVal() - : GpuLower::current()->threadPredMap().getPredicate(out_tv->fuserTv()); + ? 
GpuLower::current()->kernel()->trueVal() + : GpuLower::current()->threadPredMap().getPredicate(out_tv); // When this expr is in an unswitched block, only attach the // thread predicate to the expr as thread predicates are not // grouped to the unswitch predicate. kir::Predicate* thread_pred_expr = nullptr; if (unswitched_loop_) { - thread_pred_expr = ir_builder.create(thread_pred); + thread_pred_expr = IrBuilder::create(thread_pred); } non_trivial_pred_found_ = true; @@ -95,7 +91,7 @@ void UnrollPass::handle(kir::Expr* expr) { if (!isReductionInitExpr(expr) && out_tv->domain()->hasReduction()) { const auto write_pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::ReductionWrite, expr, thread_pred); expr->setWritePredicate(write_pred); } @@ -105,7 +101,7 @@ void UnrollPass::handle(kir::Expr* expr) { if (ir_utils::hasBlockSync(expr, GpuLower::current()->threadPredMap())) { const auto pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Inline, expr, thread_pred); expr->setPredicate(pred); return; @@ -116,28 +112,28 @@ void UnrollPass::handle(kir::Expr* expr) { if (!unswitched_loop_ && std::any_of( for_loops_.begin(), for_loops_.end(), [](const kir::ForLoop* fl) { - return fl->iter_domain()->parallelType() == + return fl->iter_domain()->getParallelType() == ParallelType::Vectorize; })) { - pred = ir_builder.create(PredicateType::Vectorize); + pred = IrBuilder::create(PredicateType::Vectorize); } if (pred == nullptr) { pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Inline, expr, thread_pred); } // If we need a predicate, put expr inside an if then else - kir::IfThenElse* inline_ite = ir_builder.create(pred); + kir::IfThenElse* inline_ite = IrBuilder::create(pred); if (for_loops_.empty()) { // Special handling for top level output expressions that still // need predicates. One motivating example is a reduction op that // reduces to a scalar (issue #491) - expr_replacement_map_.insert({expr, inline_ite}); + kir::ExprMutator::registerReplace(expr, inline_ite, nullptr); } else { - for_loops_.back()->body().insert_before(expr, inline_ite); - for_loops_.back()->body().erase(expr); + kir::ExprMutator::registerReplace( + expr, inline_ite, &for_loops_.back()->body()); } inline_ite->thenBody().push_back(expr); } else if (auto for_loop = dynamic_cast(expr)) { @@ -150,8 +146,8 @@ void UnrollPass::handle(kir::Expr* expr) { void UnrollPass::handle(kir::ForLoop* fl) { // Setup for loop scoping const bool is_unroll = - fl->iter_domain()->parallelType() == ParallelType::Unroll || - fl->iter_domain()->parallelType() == ParallelType::Unswitch; + fl->iter_domain()->getParallelType() == ParallelType::Unroll || + fl->iter_domain()->getParallelType() == ParallelType::Unswitch; // If we're not looking for an unroll loop, or didn't find one, process as // normal. 
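// Illustrative sketch, not part of the patch above. This hunk moves UnrollPass
// from hand-maintaining an expr -> replacement map to the kir::ExprMutator
// registerReplace() interface. The toy model below shows one way such a
// deferred-replacement interface can work: replacements are only recorded
// while the expression list is walked and applied afterwards, so the list is
// never mutated mid-traversal. Whether the real pass applies changes per scope
// or after the whole traversal is not shown in this hunk, and none of the
// names below are nvfuser APIs.

#include <unordered_map>
#include <vector>

struct ToyExpr {};

class ToyExprMutator {
 public:
  // Traverse, let handle() record replacements, then apply them in one pass.
  std::vector<ToyExpr*> traverseAndMutate(std::vector<ToyExpr*> exprs) {
    for (ToyExpr* e : exprs) {
      handle(e);
    }
    for (ToyExpr*& e : exprs) {
      auto it = replacements_.find(e);
      if (it != replacements_.end()) {
        e = it->second;
      }
    }
    return exprs;
  }

 protected:
  virtual void handle(ToyExpr*) {}

  // Analogous in spirit to registerReplace(old, new, scope) used above,
  // minus the scope bookkeeping.
  void registerReplace(ToyExpr* old_expr, ToyExpr* new_expr) {
    replacements_[old_expr] = new_expr;
  }

 private:
  std::unordered_map<ToyExpr*, ToyExpr*> replacements_;
};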
@@ -172,10 +168,9 @@ void UnrollPass::handle(kir::ForLoop* fl) { return; } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto unroll_pred = ir_builder.create(fl); + auto unroll_pred = IrBuilder::create(fl); - kir::IfThenElse* unroll_ite = ir_builder.create(unroll_pred); + kir::IfThenElse* unroll_ite = IrBuilder::create(unroll_pred); // Get the loop nest for the unrolled path kir::ForLoop* unrolled_loop_nest = cloneLoopNest(fl); @@ -199,12 +194,18 @@ void UnrollPass::handle(kir::ForLoop* fl) { handle(inlined_loop); look_for_unroll_ = true; if (!non_trivial_pred_found_) { - expr_replacement_map_.insert({fl, inlined_loop}); + kir::ExprMutator::registerReplace( + fl, + inlined_loop, + for_loops_.empty() ? nullptr : &for_loops_.back()->body()); } else { if (!canOmitElseClause(fl)) { unroll_ite->elseBody().push_back(inlined_loop); } - expr_replacement_map_.insert({fl, unroll_ite}); + kir::ExprMutator::registerReplace( + fl, + unroll_ite, + for_loops_.empty() ? nullptr : &for_loops_.back()->body()); } } @@ -221,31 +222,22 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) { // If there's any expression that requires barrier // synchronization, the else part can't be omitted for (auto expr : loop->body().exprs()) { - if (expr->isA()) { - const ParallelTypeBitmap domains = pred_map.getParallelBroadcastDomains( - expr->outputs()[0]->as()->fuserTv()); - if (domains.any()) { - return false; - } - } else if (expr->isA() || expr->isA()) { - auto td = ir_utils::getTVOutput(expr)->domain(); - if (td->hasBlockReduction() || td->hasGridReduction()) { - return false; - } + if (ir_utils::hasBlockSync(expr, pred_map)) { + return false; } } // If the number of visits of the loop body per thread is one, the // unswitch predicate is sufficient. // When the loop stop is the same as the extent of its IterDomain, // the per-thread visit count is guaranteed to be one at most (see - // CudaKernelGenerator::visit(kir::ForLoop*) as well. Also, when a + // CudaKernelGenerator::handle(kir::ForLoop*) as well. Also, when a // loop is vectorized (not misaligned), the count must be one at // most. Even if not parallelized nor vectoirzed, it is also // sufficient if the loop stop is in fact one. 
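// Illustrative sketch, not from the patch: the "visit once" criterion
// described in the canOmitElseClause comment above can be restated as a
// standalone predicate over plain booleans (stand-in names, not nvfuser
// APIs). The else clause of the unswitched section can be omitted only when
// each loop body is guaranteed to run at most once per thread.
inline bool visitsBodyAtMostOncePerThread(
    bool id_is_thread_parallel, // loop domain bound to threadIdx/blockIdx
    bool stop_equals_extent, // loop stop == extent of its IterDomain
    bool id_is_vectorized, // ParallelType::Vectorize (not misaligned)
    bool stop_is_one) { // loop stop is literally one
  return (id_is_thread_parallel && stop_equals_extent) || id_is_vectorized ||
      stop_is_one;
}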
bool visit_once = false; auto id = loop->iter_domain(); if ((id->isThread() && (loop->stop() == id->extent())) || - id->parallelType() == ParallelType::Vectorize) { + id->getParallelType() == ParallelType::Vectorize) { visit_once = true; } if (!visit_once) { @@ -273,30 +265,18 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) { } // Generate the loop nest structure and place it in lowered_exprs -UnrollPass::UnrollPass(const std::vector& exprs) { +UnrollPass::UnrollPass(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::UnrollPass::computeMap"); - - // Run through loop nests and further lower the expressions - for (auto* expr : exprs) { - handle(expr); - } + kir::ExprMutator::traverseAndInsert(exprs); } -std::vector UnrollPass::runPass( +std::vector UnrollPass::runPass( Fusion* fusion, - const std::vector& exprs) { + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::UnrollPass::runPass"); UnrollPass unroll_pass(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(unroll_pass.replacementMap(), expr)); - } - - return mutated_exprs; + return unroll_pass.exprs_; } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.h b/torch/csrc/jit/codegen/cuda/lower_unroll.h index bec4966dd946..14725c405b77 100644 --- a/torch/csrc/jit/codegen/cuda/lower_unroll.h +++ b/torch/csrc/jit/codegen/cuda/lower_unroll.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include #include #include @@ -51,33 +52,32 @@ namespace cuda { //! predicate still in the inner most loop, making sure that we cover edges and //! corners. //! -class TORCH_CUDA_CU_API UnrollPass { +class TORCH_CUDA_CU_API UnrollPass : kir::ExprMutator { public: // Take the incoming exprs and run loop unrolling, returning the new IR - static std::vector runPass( + static std::vector runPass( Fusion* fusion, - const std::vector& exprs); + const std::vector& exprs); static bool canOmitElseClause(kir::ForLoop* fl); private: // Generate the for Expr replacement map - UnrollPass(const std::vector& exprs); + UnrollPass(const std::vector& exprs); - const std::unordered_map& replacementMap() const { + const std::unordered_map& replacementMap() const { return expr_replacement_map_; } - void handle(kir::ForLoop* fl); + using OptOutDispatch::handle; - void handle(kir::Expr* expr); + void handle(kir::ForLoop* fl) final; + + void handle(Expr* expr) final; private: // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // Keep all for loops conveniently to make unrolling easier - std::vector for_loops_; + std::unordered_map expr_replacement_map_; // keep track if we're within an unrolled loop bool look_for_unroll_ = true; diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.cpp b/torch/csrc/jit/codegen/cuda/lower_utils.cpp index 5d015c450d9f..620d38fd04b5 100644 --- a/torch/csrc/jit/codegen/cuda/lower_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_utils.cpp @@ -6,8 +6,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -23,38 +22,14 @@ namespace cuda { namespace scope_utils { -std::vector getLoops(kir::Expr* scope) { - std::vector loops; - while (scope != nullptr) { - if (auto loop = dynamic_cast(scope)) { - loops.push_back(loop); - } - scope = scope->parentScope(); - } - std::reverse(loops.begin(), loops.end()); - return loops; -} - -void insertBefore(kir::Expr* scope, 
kir::Expr* ref, kir::Expr* expr) { - if (auto ite = dynamic_cast(scope)) { - ite->thenBody().insert_before(ref, expr); - } else if (auto for_loop = dynamic_cast(scope)) { - for_loop->body().insert_before(ref, expr); - } else { - TORCH_INTERNAL_ASSERT(false, "Unexpected scope expression"); - } -} - //! Create an **empty** Forloop and copy the metadata. -kir::ForLoop* cloneForLoop(kir::IrBuilder& ir_builder, kir::ForLoop* for_loop) { - return ir_builder.create(for_loop); +kir::ForLoop* cloneForLoop(kir::ForLoop* for_loop) { + return IrBuilder::create(for_loop); } //! Create an **empty** IfThenElse and copy the metadata. -kir::IfThenElse* cloneIfThenElse( - kir::IrBuilder& ir_builder, - kir::IfThenElse* ite) { - return ir_builder.create(ite->predicate()); +kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite) { + return IrBuilder::create(ite->predicate()); } } // namespace scope_utils @@ -103,46 +78,53 @@ std::vector iterDomainInputsOfOrderedAs( } bool isTV(const Val* val) { - return val->getValType().value() == ValType::TensorView; + return val->getValType().value() == ValType::TensorView || + val->getValType().value() == ValType::TensorIndex; } // Check if we're a TensorView op that we can generate code for. -bool isTVOp(const Expr* expr) { +bool isTvOp(const Expr* expr) { if (std::any_of( expr->outputs().begin(), expr->outputs().end(), [](Val* v) { return isTV(v); }) && - (expr->getExprType().value() == ExprType::BinaryOp || - expr->getExprType().value() == ExprType::UnaryOp || + (expr->getExprType().value() == ExprType::UnaryOp || + expr->getExprType().value() == ExprType::BinaryOp || expr->getExprType().value() == ExprType::TernaryOp || expr->getExprType().value() == ExprType::ReductionOp || + expr->getExprType().value() == ExprType::GroupedReductionOp || expr->getExprType().value() == ExprType::WelfordOp || + expr->getExprType().value() == ExprType::MmaOp || expr->getExprType().value() == ExprType::BroadcastOp || expr->getExprType().value() == ExprType::TransposeOp || expr->getExprType().value() == ExprType::ShiftOp || expr->getExprType().value() == ExprType::GatherOp || - expr->getExprType().value() == ExprType::ViewOp)) { + expr->getExprType().value() == ExprType::ViewAsScalar || + expr->getExprType().value() == ExprType::ViewOp || + expr->getExprType().value() == ExprType::GridReduction || + expr->getExprType().value() == ExprType::GridBroadcast || + expr->getExprType().value() == ExprType::GridWelford)) { return true; } return false; } -bool isTVOp(const kir::Expr* expr) { - const auto& outputs = expr->outputs(); - return outputs.size() >= 1 && outputs[0]->isA(); +TensorView* getTv(Val* val) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return const_cast(getTv(const_cast(val))); } -kir::TensorView* getTv(kir::Val* val) { - if (auto tv = dynamic_cast(val)) { - return tv; - } else if (auto ti = dynamic_cast(val)) { - return ti->view(); +const TensorView* getTv(const Val* val) { + if (val->isA()) { + return val->as(); + } else if (val->isA()) { + return val->as()->view(); } return nullptr; } -std::vector getTvs(const std::vector& vals) { - std::vector tvs; +std::vector getTvs(const std::vector& vals) { + std::vector tvs; for (auto val : vals) { auto tv = ir_utils::getTv(val); if (tv) { @@ -152,32 +134,7 @@ std::vector getTvs(const std::vector& vals) { return tvs; } -kir::TensorView* asTv(kir::Val* val) { - auto tv = getTv(val); - TORCH_INTERNAL_ASSERT(tv != nullptr, "Neigher TensorView nor TensorIndex"); - return tv; -} - -std::vector asTvs(const std::vector vals) { - 
std::vector tvs; - for (auto val : vals) { - auto tv = ir_utils::asTv(val); - tvs.emplace_back(tv); - } - return tvs; -} - -// TODO: why do we assume there's a single TV output? -TensorView* getTVOutput(const Expr* expr) { - for (auto out : expr->outputs()) { - if (out->getValType().value() == ValType::TensorView) { - return out->as(); - } - } - return nullptr; -} - -kir::TensorView* getTVOutput(const kir::Expr* expr) { +TensorView* getTvOutput(const Expr* expr) { for (auto out : expr->outputs()) { if (auto tv = getTv(out)) { return tv; @@ -186,6 +143,16 @@ kir::TensorView* getTVOutput(const kir::Expr* expr) { return nullptr; } +bool isReductionOp(const Expr* expr) { + // Note that GridReduction inherits ReductionOp + return expr->isA() || expr->isA() || + expr->isA() || expr->isA(); +} + +bool isReductionTvOp(const Expr* expr) { + return isTvOp(expr) && isReductionOp(expr); +} + bool isScalarOp(const Expr* expr) { for (auto out : expr->outputs()) if (!out->isScalar()) @@ -193,25 +160,21 @@ bool isScalarOp(const Expr* expr) { return true; } -Expr* asExpr(Statement* stmt) { - TORCH_INTERNAL_ASSERT(stmt->isExpr()); - return stmt->as(); -} - -TensorView* asTV(Val* val) { - TORCH_INTERNAL_ASSERT(isTV(val)); - return val->as(); -} - bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) { - if (!isTVOp(expr)) { + if (!isTvOp(expr)) { return false; } - auto tv = getTVOutput(expr); + if (!(isReductionOp(expr) || expr->isA() || + expr->isA())) { + return false; + } - if ((expr->isA() || expr->isA()) && - (tv->hasBlockReduction() || tv->hasGridReduction())) { + // GroupedReductionOp can have multiple output TVs, but they must be + // parallelized in the same way, so just checking one of them is enough. + auto tv = getTvOutput(expr); + + if (tv->hasBlockReduction() || tv->hasGridReduction()) { return true; } else if (expr->isA()) { const ParallelTypeBitmap pt_map = @@ -222,64 +185,23 @@ bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) { return false; } -bool hasBlockSync(const kir::Expr* expr, const ThreadPredicateMap& pred_map) { - if (expr->isA() || expr->isA() || - expr->isA() || expr->isA() || - expr->isA() || expr->isA()) { - auto fuser_tv = getTVOutput(expr)->fuserTv(); - auto fuser_expr = fuser_tv->definition(); - TORCH_INTERNAL_ASSERT(fuser_expr != nullptr); - return hasBlockSync(fuser_expr, pred_map); - } - - return false; -} - -kir::Expr* applyReplacements( - const std::unordered_map& expr_replacement_map, - kir::Expr* expr) { - auto handle_scope = [&](kir::Scope& scope) { - for (const auto i : c10::irange(scope.size())) { - scope[i] = applyReplacements(expr_replacement_map, scope[i]); - } - }; - - const auto it = expr_replacement_map.find(expr); - if (it != expr_replacement_map.end()) { - return it->second; - } else { - if (auto for_loop = dynamic_cast(expr)) { - handle_scope(for_loop->body()); - } else if (auto ite = dynamic_cast(expr)) { - handle_scope(ite->thenBody()); - handle_scope(ite->elseBody()); - } - return expr; - } -} - c10::optional getMaybeWarpReductionDim( - const kir::ReductionOp* node) { - auto kir_tv = ir_utils::getTVOutput(node); - if (!kir_tv) { + const Val* output, + const Val* input) { + auto tv_out = getTv(output); + if (tv_out == nullptr) { return c10::nullopt; } - auto fuser_reduction = kir_tv->fuserTv()->definition()->as(); - return getMaybeWarpReductionDim(fuser_reduction); -} - -c10::optional getMaybeWarpReductionDim(const ReductionOp* node) { - auto fuser_tv_out = node->out()->as(); - auto fuser_tv_in = node->in()->as(); 
+ auto tv_in = getTv(input); // only support reducing to registers for now. - if (fuser_tv_in->getMemoryType() != MemoryType::Local || - fuser_tv_out->getMemoryType() != MemoryType::Local) { + if (tv_in->getMemoryType() != MemoryType::Local || + tv_out->getMemoryType() != MemoryType::Local) { return c10::nullopt; } IterDomain* reduction_on_xdim = nullptr; - for (auto id : fuser_tv_out->domain()->domain()) { + for (auto id : tv_out->domain()->domain()) { // Currently warp reduction only allows // serial and block.x parallel reductions if (id->isReduction() && id->isParallelized()) { @@ -302,7 +224,7 @@ c10::optional getMaybeWarpReductionDim(const ReductionOp* node) { return c10::optional(reduction_on_xdim); } - if (reduction_on_xdim->extent()->isConstScalar()) { + if (reduction_on_xdim->extent()->isConst()) { auto extent_value = reduction_on_xdim->extent()->getInt().value(); if (extent_value % at::cuda::warp_size() == 0) { return c10::optional(reduction_on_xdim); @@ -329,54 +251,98 @@ bool derivedFromRootCAAxes(const TensorView* tv, IterDomain* axis) { }); } -std::unordered_map getParallelDomains( - kir::Val* val) { - kir::TensorView* kir_tv = nullptr; - if (val->isA()) { - kir_tv = val->as(); +std::unordered_map getParallelDomains( + const Val* val) { + const TensorView* tv = nullptr; + if (val->isA()) { + tv = val->as(); } else if (val->isA()) { - kir_tv = val->as()->view(); + tv = val->as()->view(); } else { TORCH_INTERNAL_ASSERT( false, "Provided val is not TensorIndex or TensorView."); } - std::unordered_map parallel_domains; - for (auto d : kir_tv->domain()->domain()) { + std::unordered_map parallel_domains; + for (auto d : tv->domain()->domain()) { if (d->isThread()) { - parallel_domains.insert(std::make_pair(d->parallelType(), d)); + parallel_domains.insert(std::make_pair(d->getParallelType(), d)); } } return parallel_domains; } +kir::Allocate* allocGlobalBufferForGridComm( + Val* buffer_size, + DataType dtype, + bool zero_init) { + const std::vector new_buffer_ids = { + IrBuilder::create( + GpuLower::current()->kernel()->zeroVal(), buffer_size)}; + const auto buffer_domain = IrBuilder::create(new_buffer_ids); + const auto buffer_tv = + IrBuilder::create(buffer_domain, dtype, MemoryType::Global); + return IrBuilder::create( + buffer_tv, buffer_tv->getMemoryType(), nullptr, zero_init); +} + } // namespace ir_utils namespace loop_utils { -// TODO: Clean this up, Naoya added a mechanism we should be able to reuse. -std::pair getAllocPoint( +BasicAllocInfo getAllocInformation( const TensorView* tv, - const std::vector& loops, + const std::vector& for_loops, const std::unordered_map& id_map, bool use_id_map) { - const auto gpu_lower = GpuLower::current(); + BasicAllocInfo info; + auto gpu_lower = GpuLower::current(); - // If in global memory, it can be all the way outside the loops. - if (tv->getMemoryType() == MemoryType::Global) { - return {nullptr, 0}; - } + bool outer_alloc_found = false; - // Figure out where we want to place alloc/reduction initialization. We want - // outside an unroll loop, or inside our computeAt point. - kir::ForLoop* alloc_loop = nullptr; + for (auto fl : for_loops) { + if (info.alloc_pos == tv->getComputeAtPosition()) { + break; + } + + if (tv->axis(info.alloc_pos)->isReduction()) { + const auto outputs = FusionGuard::getCurFusion()->getTerminatingOutputs(); + TORCH_INTERNAL_ASSERT( + std::find(outputs.begin(), outputs.end(), tv) != outputs.end(), + "Invalid computeAt of T", + tv->name(), + ". 
A reducation axis is detected outside computeAt point even though it is not an output tensor."); + break; + } + + auto fl_id = fl->iter_domain(); - auto loops_it = loops.begin(); - // Look at each axis individually in out's domain - for (const auto tv_i : c10::irange((int64_t)tv->getComputeAtPosition())) { - // Grab the axis ID + if (fl_id->getParallelType() == ParallelType::Unroll) { + break; + } + + // Shared memory must be allocated outside of unswitched + // domains. See issue #1133. + if (fl_id->getParallelType() == ParallelType::Unswitch && + tv->getMemoryType() == MemoryType::Shared) { + outer_alloc_found = true; + } + + // Assume global memory is allocated at outer most scope. + if (tv->getMemoryType() == MemoryType::Global) { + outer_alloc_found = true; + } + + // Allocation of a double buffered tensor is placed outside its + // double buffer axis. + if (tv->isDoubleBuffered() && + tv->axis(info.alloc_pos) == + gpu_lower->doubleBufferInfo().getDoubleBufferAxis(tv)) { + outer_alloc_found = true; + } + + auto local_id = tv->axis(info.alloc_pos); - auto local_id = tv->axis(tv_i); if (use_id_map) { auto id_it = id_map.find(local_id); if (id_it != id_map.end()) { @@ -384,91 +350,46 @@ std::pair getAllocPoint( } } - if (gpu_lower->trivialReductionInfo().isDerivedFromRoot(local_id)) { - continue; + if (GpuLower::current()->caMap()->areMapped( + local_id, fl_id, IdMappingMode::PERMISSIVE)) { + info.alloc_pos++; } - auto lowered_local_id = - gpu_lower->lowerValue(local_id)->as(); - loops_it = std::find_if( - loops_it, loops.end(), [&lowered_local_id](const auto& loop) { - return GpuLower::current()->caLoopMap().areMapped( - lowered_local_id, loop->iter_domain()) || - loop->iter_domain()->parallelType() == ParallelType::Unroll; - }); + info.init_for_loop = fl; - TORCH_INTERNAL_ASSERT( - loops_it != loops.end(), - "Could not find all required axes for indexing when trying to index into ", - tv); - if ((*loops_it)->iter_domain()->parallelType() == ParallelType::Unroll) { - return {alloc_loop, tv_i}; + if (!outer_alloc_found) { + info.alloc_for_loop = fl; } - - alloc_loop = *loops_it; - ++loops_it; } - return {alloc_loop, (int64_t)tv->getComputeAtPosition()}; -} - -std::pair getAllocPoint( - const TensorView* tv, - const std::vector& loops) { - return getAllocPoint(tv, loops, {}, false); + return info; } } // namespace loop_utils namespace { -class ReplaceExprInput : public kir::MutableIrVisitor { +class ReplaceExprInput : private kir::ExprMutator { public: - static kir::Expr* replace( - kir::Expr* expr, - const std::unordered_map& replacement_map) { - ReplaceExprInput replacer(expr, replacement_map); - TORCH_INTERNAL_ASSERT(expr != nullptr); - expr->accept(&replacer); - TORCH_INTERNAL_ASSERT(replacer.replaced_expr_ != nullptr); - auto ret_expr = replacer.replaced_expr_; - - // Copy predicates if the original expr is predicated - if (ret_expr != expr) { - ret_expr->setPredicate(expr->predicate()); - ret_expr->setWritePredicate(expr->writePredicate()); - } - return ret_expr; - } - - static std::vector replace( - const std::vector& scope, - const std::unordered_map& replacement_map) { - std::vector ret_expr; - ret_expr.reserve(scope.size()); - - for (auto expr : scope) { - ret_expr.push_back(replace(expr, replacement_map)); - } - - return ret_expr; + static std::vector replace( + const std::vector& exprs, + const std::unordered_map& replacement_map) { + ReplaceExprInput replacer(replacement_map); + replacer.traverseAndInsert(exprs); + return replacer.exprs_; } private: - ReplaceExprInput( - 
kir::Expr* expr, - const std::unordered_map& replacement_map) - : gpu_lower_(GpuLower::current()), - ir_builder_(gpu_lower_->kernel()), - replacement_map_(replacement_map) { - replaced_expr_ = expr; - } + ReplaceExprInput(const std::unordered_map& replacement_map) + : replacement_map_(replacement_map) {} + + using kir::ExprMutator::handle; - c10::optional> - getMaybeInputReplacementMap(kir::Expr* expr) { + c10::optional> getMaybeInputReplacementMap( + Expr* expr) { bool need_replacement = false; - std::unordered_map replaced_val; + std::unordered_map replaced_val; for (auto in : expr->inputs()) { auto replace_it = replacement_map_.find(in); if (replace_it != replacement_map_.end()) { @@ -479,98 +400,103 @@ class ReplaceExprInput : public kir::MutableIrVisitor { } } if (need_replacement) { - return c10::optional>( - replaced_val); + return c10::optional>(replaced_val); } else { return c10::nullopt; } } - // IR visitor interface - void visit(kir::ForLoop* for_loop) final { - auto new_for_loop = ir_builder_.create(for_loop); - - auto replaced_loop_body = - replace(for_loop->body().exprs(), replacement_map_); - - for (auto new_expr : replaced_loop_body) { - new_for_loop->body().push_back(new_expr); - } - replaced_expr_ = new_for_loop; + // Copy predicates and register expression replacement + void registerReplaceWithPredicate(Expr* old_expr, Expr* new_expr) { + new_expr->setPredicate(old_expr->predicate()); + new_expr->setWritePredicate(old_expr->writePredicate()); + registerReplace(old_expr, new_expr); } - void visit(kir::IfThenElse* ite) final { - auto new_ite = ir_builder_.create(ite->predicate()); - auto replaced_then_body = - replace(ite->thenBody().exprs(), replacement_map_); - for (auto new_expr : replaced_then_body) { - new_ite->thenBody().push_back(new_expr); - } - if (ite->hasElse()) { - auto replaced_else_body = - replace(ite->elseBody().exprs(), replacement_map_); - for (auto new_expr : replaced_else_body) { - new_ite->elseBody().push_back(new_expr); - } - } - replaced_expr_ = new_ite; - } - - void visit(kir::UnaryOp* node) final { + void handle(UnaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getUnaryOpType(), node->out(), replaced_inputs.value().at(node->in())); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::BinaryOp* node) final { + + void handle(BinaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getBinaryOpType(), node->out(), replaced_inputs.value().at(node->lhs()), replaced_inputs.value().at(node->rhs())); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::TernaryOp* node) final { + void handle(TernaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getTernaryOpType(), node->out(), replaced_inputs.value().at(node->in1()), replaced_inputs.value().at(node->in2()), replaced_inputs.value().at(node->in3())); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::ReductionOp* node) final { + void handle(ReductionOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if 
(replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + auto replacement = IrBuilder::create( + node->getReductionOpType(), node->init(), node->out(), - replaced_inputs.value().at(node->in())); + replaced_inputs.value().at(node->in()), + node->isAllreduce()); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::BroadcastOp* node) final { + void handle(GroupedReductionOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->out(), replaced_inputs.value().at(node->in())); + const auto& map = replaced_inputs.value(); + auto inputs = node->inputs(); + for (auto& input : inputs) { + auto it = map.find(input); + if (it != map.end()) { + input = it->second; + } + } + auto replacement = IrBuilder::create( + node->getReductionOpTypes(), + node->initVals(), + node->outputs(), + inputs, + node->isAllreduce()); + registerReplaceWithPredicate(node, replacement); + } + } + void handle(BroadcastOp* node) final { + auto replaced_inputs = getMaybeInputReplacementMap(node); + if (replaced_inputs.has_value()) { + auto replacement = IrBuilder::create( + node->out(), + replaced_inputs.value().at(node->in()), + node->getBroadcastDimFlags()); + registerReplaceWithPredicate(node, replacement); } } - void visit(kir::WelfordOp* node) final { + void handle(WelfordOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( + auto replacement = IrBuilder::create( node->outAvg(), node->outVar(), node->outN(), @@ -580,24 +506,44 @@ class ReplaceExprInput : public kir::MutableIrVisitor { replaced_inputs.value().at(node->inAvg()), replaced_inputs.value().at(node->inVar()), replaced_inputs.value().at(node->inN())); + registerReplaceWithPredicate(node, replacement); + } + } + + void handle(MmaOp* node) final { + auto replaced_inputs = getMaybeInputReplacementMap(node); + if (replaced_inputs.has_value()) { + auto replacement = IrBuilder::create( + node->out(), + replaced_inputs.value().at(node->inA()), + replaced_inputs.value().at(node->inB()), + node->init(), + node->options()); + registerReplaceWithPredicate(node, replacement); } } private: - GpuLower* gpu_lower_; - kir::IrBuilder ir_builder_; - kir::Expr* replaced_expr_ = nullptr; - const std::unordered_map& replacement_map_; + const std::unordered_map& replacement_map_; }; } // namespace -std::vector replaceInputsInExpr( - const std::vector& exprs, - const std::unordered_map& replacement_map) { +std::vector replaceInputsInExpr( + const std::vector& exprs, + const std::unordered_map& replacement_map) { return ReplaceExprInput::replace(exprs, replacement_map); } +bool isTrivialIterDomain(IterDomain* id) { + auto pt = id->getParallelType(); + return id->isReduction() || id->isBroadcast() || id->isStride() || + (id->extent()->isOneInt() && id->start()->isZeroInt()) || + pt == ParallelType::Vectorize || + (isParallelTypeThread(pt) && + !GpuLower::current()->haloInfo().hasHaloWidth(id)); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.h b/torch/csrc/jit/codegen/cuda/lower_utils.h index 1c8a0df5cd79..50cce7d96b9e 100644 --- a/torch/csrc/jit/codegen/cuda/lower_utils.h +++ b/torch/csrc/jit/codegen/cuda/lower_utils.h @@ -1,8 +1,9 @@ #pragma once -#include +#include +#include #include #include #include @@ -19,27 +20,15 @@ namespace cuda { class ThreadPredicateMap; 
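// Illustrative sketch, not from the patch. The ReplaceExprInput pass defined
// above rewrites each expression whose inputs appear in a replacement map via
// getMaybeInputReplacementMap. The handlers above call .at() on every input of
// the expression, which suggests unreplaced inputs map to themselves; that
// reading is assumed in the toy helper below, and all names are stand-ins
// rather than nvfuser APIs.

#include <optional>
#include <unordered_map>
#include <vector>

struct ToyVal {};

// Returns a complete input -> value map when at least one input has a
// replacement, std::nullopt when the expression can be left untouched.
std::optional<std::unordered_map<ToyVal*, ToyVal*>> maybeInputReplacements(
    const std::vector<ToyVal*>& inputs,
    const std::unordered_map<ToyVal*, ToyVal*>& replacement_map) {
  bool need_replacement = false;
  std::unordered_map<ToyVal*, ToyVal*> replaced;
  for (ToyVal* in : inputs) {
    auto it = replacement_map.find(in);
    if (it != replacement_map.end()) {
      need_replacement = true;
      replaced.emplace(in, it->second);
    } else {
      // Identity entry so a caller can look up any input, replaced or not.
      replaced.emplace(in, in);
    }
  }
  if (!need_replacement) {
    return std::nullopt;
  }
  return replaced;
}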
-using IterDomainMap = std::unordered_map; +using IterDomainMap = std::unordered_map; namespace scope_utils { -//! Returns the list of nesting loops starting at `scope` -// Primarily used in indexing, maybe could be moved there -std::vector getLoops(kir::Expr* scope); - -//! Insert expr in scope before ref -//! -//! \warning for kir::IfThenElse we implicitly insert in the "then" branch! -//! -void insertBefore(kir::Expr* scope, kir::Expr* ref, kir::Expr* expr); - //! Create an **empty** Forloop and copy the metadata. -kir::ForLoop* cloneForLoop(kir::IrBuilder& ir_builder, kir::ForLoop* for_loop); +kir::ForLoop* cloneForLoop(kir::ForLoop* for_loop); //! Create an **empty** IfThenElse and copy the metadata. -kir::IfThenElse* cloneIfThenElse( - kir::IrBuilder& ir_builder, - kir::IfThenElse* ite); +kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite); } // namespace scope_utils @@ -74,107 +63,157 @@ std::vector iterDomainInputsOfOrderedAs( const std::vector& of, const std::vector& order); +// Returns if Val is a TensorView or TensorIndex bool isTV(const Val* const); -TORCH_CUDA_CU_API bool isTVOp(const Expr*); - -bool isTVOp(const kir::Expr* expr); - -TensorView* getTVOutput(const Expr*); -kir::TensorView* getTVOutput(const kir::Expr*); - -bool isScalarOp(const Expr*); - -// TODO(kir): remove -Expr* asExpr(Statement*); +// Returns if Expr is a TensorView or TensorIndex Expr. +TORCH_CUDA_CU_API bool isTvOp(const Expr*); -// TODO(kir): Remove in favor of ->as() -TensorView* asTV(Val*); +// Returns the first output of Expr that is a TensorView +TensorView* getTvOutput(const Expr*); -//! Get kir::TensorView potentially via kir::TensorIndex. Returns nullptr if -//! cast fails. -kir::TensorView* getTv(kir::Val*); - -//! Get only kir::TensorView potentially via kir::TensorIndex. -std::vector getTvs(const std::vector& vals); +// Returns if Expr is a reduction op +TORCH_CUDA_CU_API bool isReductionOp(const Expr*); -//! Get kir::TensorView potentially via kir::TensorIndex. Error if cast fails. -kir::TensorView* asTv(kir::Val*); - -//! Get kir::TensorView potentially via kir::TensorIndex. Error if cast fails. -std::vector asTvs(const std::vector& vals); +// Returns if Expr is a reduction op with TensorView or TensorIndex +TORCH_CUDA_CU_API bool isReductionTvOp(const Expr*); bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map); -bool hasBlockSync(const kir::Expr* expr, const ThreadPredicateMap& pred_map); -// expr_replacement_map maps an expression to its replacement. -// -// The applyReplacement function serves two purposes. -// -// 1. If expr is found in expr_replacement_map, return the value for expr key. -// Otherwise, return the original expression. -// -// 2. If a replacement is not found and the expression is a ForLoop or an -// IfThenElse, it modifies the expressions in its scope by running the -// handle_scope function -// -// The handle_scope function iterates over the expressions in the scope. -// For each expression, it updates the expression the value returned by -// applyReplacement. -kir::Expr* applyReplacements( - const std::unordered_map& expr_replacement_map, - kir::Expr* expr); - -//! Returns the Fuser iterdomain that maps to the thread dimension grouped +//! Returns the iterdomain that maps to the thread dimension grouped //! to warps. Returns nullopt if the reduction is not to be lowered to //! a warp reduction. 
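// Illustrative sketch, not from the patch: one eligibility condition visible
// in the getMaybeWarpReductionDim implementation earlier in this diff is that
// a threadIdx.x reduction domain with a constant extent qualifies only when
// that extent is a multiple of the warp size. Restated over plain integers;
// the 32-thread default is an assumption of the example, while the real code
// queries at::cuda::warp_size().
inline bool constantExtentAllowsWarpReduction(
    long long extent_value,
    int warp_size = 32) {
  return extent_value % warp_size == 0;
}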
c10::optional getMaybeWarpReductionDim( - const kir::ReductionOp* node); + const Val* output, + const Val* input); -c10::optional getMaybeWarpReductionDim(const ReductionOp* node); +bool isScalarOp(const Expr*); + +//! Get TensorView potentially via kir::TensorIndex. Returns nullptr if +//! cast fails. +TensorView* getTv(Val*); +const TensorView* getTv(const Val*); + +//! Get only TensorView potentially via kir::TensorIndex. +std::vector getTvs(const std::vector& vals); //! Return true if axis is derived from a root axis that is an input //! to a CA leaf axis. bool derivedFromRootCAAxes(const TensorView* tv, IterDomain* axis); -std::unordered_map getParallelDomains( - kir::Val* val); +std::unordered_map getParallelDomains( + const Val* val); + +// Allocate global buffer for a grid communication calls, i.e. grid reduce, grid +// welford reduce, grid broadcast. +kir::Allocate* allocGlobalBufferForGridComm( + Val* buffer_size, + DataType dtype, + bool zero_init); } // namespace ir_utils namespace loop_utils { -// I wanted to make the tv's in these util functions constant, but that started -// a long const-ness project going into TensorView (making functions const -// there) then into lower_loops where we sort exprs. -// TODO: We should fix this when we have some time. - -// Figure out which loop the allocation needs to be in. Returns nullptr if -// outside the first loop in loops. Also find out which index in tv the -// first dimension that needs to be allocated is. Meaning we need to allocate -// that local axis and above. -// TODO: Only remaining use of this is in index compute, remove use from there, -// or refactor and use in lower_allocation -std::pair getAllocPoint( - const TensorView* tv, - const std::vector& loops, - const std::unordered_map& id_map, - bool use_id_map); +struct BasicAllocInfo { + // The for loop that the initialization of this allocation must be + // placed in, nullptr if not within a loop + kir::ForLoop* init_for_loop = nullptr; + + // Keep track of the actual allocation loop. This can be different + // from init_for_loop only with unswitched shared memory allocations, + // which are moved outer loops to avoid duplicated allocations. This means + // that the alloc position may be outside what's expected. Most applications + // outside lower_allocation is likely looking for init_for_loop which is + // more directly related to how large an allocation is and how it's used. + // (see issue #1133). + kir::ForLoop* alloc_for_loop = nullptr; + + // The allocation position relative to buffer IDs, it could be outside the + // compute at position if it's shared memory with a compute at inside an + // unswitch + size_t alloc_pos = 0; +}; -std::pair getAllocPoint( +// Fill the above allocation struct based on provided information. id_map is +// used if we're looking at a producer tensor but loops on a consumer tensor. +BasicAllocInfo getAllocInformation( const TensorView* tv, - const std::vector& loops); + const std::vector& loops, + const std::unordered_map& id_map = {}, + bool use_id_map = false); } // namespace loop_utils // Replace value pass on Kernel IR. 
-// Replace each use of any kir::Val* that apears in the given `replacement_map` +// Replace each use of any Val* that apears in the given `replacement_map` // Keeps the predicate carried by each expr // // Warning: Blindly replaces all use based on pointer // Warning: May invalidate indexing if replacing uses of allocated values -std::vector replaceInputsInExpr( - const std::vector& exprs, - const std::unordered_map& replacement_map); +std::vector replaceInputsInExpr( + const std::vector& exprs, + const std::unordered_map& replacement_map); + +// True if an IterDomain does not materialize a loop +bool isTrivialIterDomain(IterDomain* id); + +// Go through all expressions and compute a local ordering of loops. operator< +// is implemented based on the concrete_id_dependencies analysis done. If +// there's no dependency between two IDs then order doesn't mater, otherwise we +// can tell which is inner most by checking if there's any dependency +// relationships. +// +// Dependency relationships in concrete_id_dependencies has a "global" view in +// the fusion, so it can resolve ordering by only looking at id's and the +// dependency map. +// +// For example two expressions may have domains: [I0], [I1] Yet we +// won't know the ordering unless we see a domain with: [I0, I1]. This happened +// in advancedIndexing9 (also see AdvancedLowering6) test when merging T5 with +// the group containing T10 (cache of T5, which is post broadcasted output) and +// T6(pre broadcasted output). +// T5 had the domain [0, 1, 2, 3, 4] produce at 3 +// T6 had the domain [0, 3, 4] compute at 3 +// Merging [0, 1, 2] and [0, 3, 4] resulted in the domain [0, 3, 4, 1, 2] +// +// If ID's are not in filter, we don't care about their ordering and ignore +// them. This is because we're only focused on loops we will have to merge +// across groups. If the domain is not in a produce at position in the producer +// edges, or a compute at position in the consumer edges, the expressions we +// look at may not have a unique ordering. + +struct TORCH_CUDA_CU_API IterDomainDependencySorter { + IterDomainDependencySorter( + const std::unordered_map>& + concrete_id_dependencies, + const std::unique_ptr& compute_at_map) + : concrete_id_dependencies_(concrete_id_dependencies), + compute_at_map_(compute_at_map) {} + + // Return true if id0 should be before id1 + // Orders such that if x maps to {y}, x comes before y in final ordering. 
+ inline bool operator()(IterDomain* id0, IterDomain* id1) { + auto concrete_id_0 = + compute_at_map_->getConcreteMappedID(id0, IdMappingMode::LOOP); + auto concrete_id_1 = + compute_at_map_->getConcreteMappedID(id1, IdMappingMode::LOOP); + + if (concrete_id_dependencies_.find(concrete_id_0) != + concrete_id_dependencies_.end()) { + const auto& dependencies_0 = concrete_id_dependencies_.at(concrete_id_0); + // if id0 depends on id1 it means id1 is inside id0, so id0 < id1 + if (dependencies_0.count(concrete_id_1)) { + return true; + } + } + + return false; + } + + const std::unordered_map>& + concrete_id_dependencies_; + const std::unique_ptr& compute_at_map_; +}; } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.cpp b/torch/csrc/jit/codegen/cuda/lower_validation.cpp index 0579e44dcd6b..241e45f3eaaa 100644 --- a/torch/csrc/jit/codegen/cuda/lower_validation.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_validation.cpp @@ -1,16 +1,18 @@ #include +#include #include #include #include #include #include -#include #include #include +#include #include #include +#include #include namespace torch { @@ -20,23 +22,80 @@ namespace cuda { namespace { -//! A parallel type validation pass to make sure all the outputs of -//! welford ops are parallelized the same way. Will infer and modify serial -//! parallel types if other output/s are parallelized, so that -//! user wouldn't have to specify the same parallelization -//! 3 times. Will throw if conflicts are detected, i.e. -//! TIDx vs BIDx etc. -class ValidateParallelType : public IterVisitor { +//! Validate multiple output tensors of the same expression, i.e., +//! siblings, have valid domains and parallel types. Since siblings +//! are placed in the same loop nest, they must be parallelized the +//! same way. Will infer and modify serial parallel types if other +//! output/s are parallelized, so that user wouldn't have to specify +//! the same parallelization 3 times. Will throw if conflicts are +//! detected, i.e. TIDx vs BIDx etc. +class ValidateSiblings : public IterVisitor { public: static void validate(Fusion* fusion) { - ValidateParallelType VPT; - VPT.traverse(fusion); + ValidateSiblings validator; + validator.traverse(fusion); } private: using IterVisitor::handle; + + void handle(Expr* expr) final { + if (!ir_utils::isTvOp(expr) || expr->outputs().size() < 2) { + IterVisitor::handle(expr); + return; + } + + auto ref_output = expr->outputs().at(0)->as(); + auto ref_ndims = ref_output->nDims(); + const auto& ref_root = ref_output->getRootDomain(); + std::unordered_map id_map; + + for (const auto sibling : + ir_utils::filterByType(expr->outputs())) { + if (ref_output == sibling) { + continue; + } + + TORCH_INTERNAL_ASSERT( + sibling->nDims() == ref_ndims, + "Mismatched dimensionality detected. Expr: ", + expr->toString(), + "Ref output: ", + ref_output->toString(), + ". Sibling: ", + sibling->toString()); + + for (const auto i : c10::irange(ref_ndims)) { + validateParallelTypes(ref_output->axis(i), sibling->axis(i)); + } + + for (const auto i : c10::irange(ref_root.size())) { + id_map[ref_root[i]] = sibling->getRootDomain().at(i); + } + + BestEffortReplay replay( + sibling->domain()->domain(), ref_output->domain()->domain(), id_map); + for (const auto i : c10::irange(ref_ndims)) { + auto it = replay.getReplay().find(ref_output->axis(i)); + TORCH_INTERNAL_ASSERT( + it != replay.getReplay().end(), + "Matching sibling ID not found. 
Expr: ", + expr->toString(), + "Ref ID: ", + ref_output->axis(i)->toString()); + auto sibling_id = it->second; + TORCH_INTERNAL_ASSERT( + sibling->axis(i) == sibling_id, + "Invalid matching sinbling ID detected. Expr: ", + expr->toString(), + "Sibling ID: ", + sibling_id->toString()); + } + } + } + // Parallelize id1 and id0 consistently if one is serial and the other isn't - void convertIterDomain(IterDomain* id0, IterDomain* id1) { + void validateParallelTypes(IterDomain* id0, IterDomain* id1) { const auto ptype0 = id0->getParallelType(); const auto ptype1 = id1->getParallelType(); @@ -64,20 +123,6 @@ class ValidateParallelType : public IterVisitor { } } } - - void handle(WelfordOp* wop) override { - auto out_avg = wop->outAvg()->as(); - auto out_var = wop->outVar()->as(); - auto out_n = wop->outN()->as(); - TORCH_INTERNAL_ASSERT(out_avg->nDims() == out_var->nDims()); - TORCH_INTERNAL_ASSERT(out_avg->nDims() == out_n->nDims()); - for (const auto i : c10::irange(out_avg->nDims())) { - // TODO: can be cleaner. - convertIterDomain(out_avg->axis(i), out_var->axis(i)); - convertIterDomain(out_avg->axis(i), out_n->axis(i)); - convertIterDomain(out_n->axis(i), out_var->axis(i)); - } - } }; // Make sure all IterDomains are only used for a unique @@ -151,7 +196,7 @@ void validateIr(Fusion* fusion) { } // Validate Parallelization - ValidateParallelType::validate(fusion); + ValidateSiblings::validate(fusion); validateIterDomainUsage(fusion); } @@ -261,6 +306,35 @@ class VectorizeValidator : public OptInDispatch { domains_.insert(m->inner()); } + // For the producer tensor, it's indexed first by transformed like + // the consumer. So, to find its contig merged domain, use the + // consumer TensorDomain with the producer contiguity info. + static std::vector mapProducerContiguity( + TensorView* producer_tv, + TensorView* consumer_tv) { + const auto c2p = PairwiseRootDomainMap(producer_tv, consumer_tv) + .mapConsumerToProducer( + consumer_tv->domain(), producer_tv->domain()); + + std::vector producer_contiguity; + + for (auto consumer_root_id : consumer_tv->getRootDomain()) { + auto producer_root_id = c2p.at(consumer_root_id); + auto producer_root_it = std::find( + producer_tv->getMaybeRFactorDomain().begin(), + producer_tv->getMaybeRFactorDomain().end(), + producer_root_id); + TORCH_INTERNAL_ASSERT( + producer_root_it != producer_tv->getMaybeRFactorDomain().end()); + auto producer_root_id_offset = std::distance( + producer_tv->getMaybeRFactorDomain().begin(), producer_root_it); + producer_contiguity.push_back( + producer_tv->domain()->contiguity().at(producer_root_id_offset)); + } + + return producer_contiguity; + } + private: std::unordered_set domains_; IterDomain* vectorized_id_ = nullptr; @@ -285,8 +359,10 @@ class VectorizeValidator : public OptInDispatch { } } - // If no vectorized id's found simply return; - if (v_id == nullptr) { + // If no vectorized ids found simply return. 
If vectorized access is + // broadcast, it won't generate an actual vector instruction, so can safely + // be ignore + if (v_id == nullptr || v_id->isBroadcast()) { return; } @@ -319,7 +395,10 @@ class VectorizeValidator : public OptInDispatch { vector_size, " however, vector sizes only upto and including 16 bytes are supported."); - auto replay_exprs = ExprSort::getExprs(fusion, {v_id}); + auto replay_exprs = DependencyCheck::getAllExprsBetween( + {tv->getMaybeRFactorDomain().begin(), + tv->getMaybeRFactorDomain().end()}, + {v_id}); VectorizeValidator validator(v_id); @@ -377,12 +456,54 @@ class VectorizeValidator : public OptInDispatch { "Vectorized dim has to be from a contiguous inner most position: ", tv, "\n"); + + // Save info required to lowering and runtime validation + auto consumer_word_size_it = + GpuLower::current()->vectorizedAccesses().find(tv); + if (consumer_word_size_it != + GpuLower::current()->vectorizedAccesses().end()) { + consumer_word_size_it->second = std::max( + (int)vector_size_optional.value(), consumer_word_size_it->second); + } else { + GpuLower::current()->vectorizedAccesses().emplace( + tv, (int)vector_size_optional.value()); + } + auto producer_tv = tv->definition()->inputs().at(0)->as(); + auto producer_word_size_it = + GpuLower::current()->vectorizedAccesses().find(producer_tv); + if (producer_word_size_it != + GpuLower::current()->vectorizedAccesses().end()) { + producer_word_size_it->second = std::max( + (int)vector_size_optional.value(), producer_word_size_it->second); + } else { + GpuLower::current()->vectorizedAccesses().emplace( + producer_tv, (int)vector_size_optional.value()); + } + + VectorizedSetInfo vectorized_set_info; + vectorized_set_info.consumer_tv = tv; + vectorized_set_info.producer_tv = producer_tv; + // Note that VectorizedSetInfo is about each instance of + // vectorized set operations, so the word size is the size of this + // specific vectorized set. + vectorized_set_info.word_size = (int)vector_size_optional.value(); + vectorized_set_info.vectorized_leaf_id = v_id; + vectorized_set_info.vectorized_root_id = validator.vectorized_id_; + // For aligned vectorize, the extent of a vectorized domain must + // be divisible by the vector word size. The domain is usually + // just one of the root domains, but can be a merged domain of + // contiguous domains. Those domains are saved in + // VectorizedSetInfo.contig_root_ids at the time of indexing. + GpuLower::current()->vectorizedSetInfo().emplace_back(vectorized_set_info); } }; } // namespace -void validateVectorize(Fusion* fusion) { +// Uses ContigIDs to find root contig domains that a vectorized domain +// depends on. As ContigIDs depends on HaloInfo, this must be done +// after HaloInfo is created. +void validateAndCollectVectorizeInfo(Fusion* fusion) { FUSER_PERF_SCOPE("GpuLower::Lower::validateVectorize"); FusionGuard fg(fusion); @@ -403,7 +524,8 @@ void validateVectorize(Fusion* fusion) { for (const auto i : c10::irange(tv->nDims())) { IterDomain* id = tv->axis(i); IterDomain* concrete_id = - GpuLower::current()->caParallelMap().getConcreteMappedID(id); + GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::LOOP); auto ptype = concrete_id->getParallelType(); @@ -444,6 +566,10 @@ void validateVectorize(Fusion* fusion) { "TensorView: ", tv); } + // Validate the vectorized domain maps to the innermost domain of + // tv. Note that we don't need to validate its producer tv as + // both Vectorize and MisalignedVectorize can only be used with + // UnaryOp::Set. 
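// Illustrative sketch, not from the patch: the surrounding validation states
// two numeric constraints on an aligned vectorized access, a 16-byte cap on
// the vector width and divisibility of the (possibly contig-merged)
// vectorized extent by the word size. The helper below just restates them
// over plain integers with stand-in names.

#include <cstdint>

bool vectorAccessIsValid(
    int64_t vector_word_size, // elements accessed per vector op
    int64_t element_size_bytes, // sizeof the tensor's data type
    int64_t vectorized_extent) { // extent of the contiguous vectorized domain
  const int64_t kMaxVectorBytes = 16; // e.g. a 128-bit load/store
  return vector_word_size * element_size_bytes <= kMaxVectorBytes &&
      vectorized_extent % vector_word_size == 0;
}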
if (has_vectorize_dim || has_misaligned_vectorize_dim) { VectorizeValidator::validate(tv); } @@ -452,164 +578,106 @@ void validateVectorize(Fusion* fusion) { namespace { -// Validate parallelization of a single tensor -void validateParallelizationOfTensor(TensorView* tv) { - // Each ParallelType can be used only once. - ParallelTypeBitmap pt_map; - for (size_t i = 0; i < tv->nDims(); ++i) { - auto axis = tv->axis(i); - auto ptype = axis->getParallelType(); - if (!isParallelTypeThread(ptype)) { - continue; - } +void fillVectorizedContigRootDomains( + const TensorView* tv, + const ContigIDs& contig_finder, + IterDomain* vectorized_root_id, + VectorizedSetInfo& info) { + const auto& root_dom = tv->getMaybeRFactorDomain(); + + // Find the root domains that are dependency of the merged contig + // domain. + auto consumer_indexed_it = + contig_finder.rootToIndexedID().find(vectorized_root_id); + TORCH_INTERNAL_ASSERT( + consumer_indexed_it != contig_finder.rootToIndexedID().end(), + "Contiguity information not found for root domain: ", + vectorized_root_id->toString()); + auto consumer_indexed_id = consumer_indexed_it->second; + + // Actual indexed root domains for this root domain. If + // contig merge is done, multiple root domains are included. + std::unordered_set indexed_root_ids; + + if (consumer_indexed_id == vectorized_root_id) { + // Indexed domain is equal to the root domain, meaning no contig + // merge is involved. + indexed_root_ids.insert(vectorized_root_id); + } else { + auto consumer_within_contig_it = + contig_finder.withinContigIDs().find(consumer_indexed_id); TORCH_INTERNAL_ASSERT( - !pt_map.get(ptype), - "Multiple use of ", - ptype, - " in tensor t", - tv->name(), - ": ", - tv); - pt_map.set(ptype); + consumer_within_contig_it != contig_finder.withinContigIDs().end()); + const auto& within_ids = consumer_within_contig_it->second; + std::copy_if( + root_dom.begin(), + root_dom.end(), + std::inserter(indexed_root_ids, indexed_root_ids.end()), + [&](IterDomain* root_id) { + return within_ids.find(root_id) != within_ids.end(); + }); } - // If this tensor is predicated by a paralel type, it should not be - // used to parallelize any domain of this tensor + // Store the contig merged root domains. If it is already set, pick + // the smaller one as it is used for validating divisibility of the + // merged extent. + if (info.contig_root_ids.empty() || + indexed_root_ids.size() < info.contig_root_ids.size()) { + info.contig_root_ids = indexed_root_ids; + } +} - const auto thread_pred = - GpuLower::current()->threadPredMap().getPredicateInfo(tv); +} // namespace - auto predicated_parallel_types = pt_map & thread_pred.limited_types; +void fillConsumerVectorizedContigRootDomains( + const TensorView* consumer_tv, + const ContigIDs& contig_finder) { + auto& info_vector = GpuLower::current()->vectorizedSetInfo(); + auto it = std::find_if( + info_vector.begin(), info_vector.end(), [&consumer_tv](auto& info) { + return info.consumer_tv == consumer_tv; + }); + if (it == info_vector.end()) { + return; + } - TORCH_INTERNAL_ASSERT( - predicated_parallel_types.none(), - "Invalid parallelization of tensor t", - tv->name(), - ". The tensor is parallelized with ", - predicated_parallel_types.toString(), - ", but it's invalid to use the types as the tensor is also predicated with them.", - ", thread prd: ", - thread_pred.limited_types.toString()); + VectorizedSetInfo& info = *it; + + // info.vectorized_root_id is validated at this point to be the + // last concrete root domain in consumer. 
+ auto consumer_root_id = info.vectorized_root_id; + + fillVectorizedContigRootDomains( + consumer_tv, contig_finder, consumer_root_id, info); } -} // namespace +void fillProducerVectorizedContigRootDomains( + const TensorView* producer_tv, + const TensorView* consumer_tv, + const std::unordered_map& c2p_map, + const ContigIDs& contig_finder) { + auto& info_vector = GpuLower::current()->vectorizedSetInfo(); + auto it = std::find_if( + info_vector.begin(), + info_vector.end(), + [&producer_tv, &consumer_tv](auto& info) { + return info.consumer_tv == consumer_tv && + info.producer_tv == producer_tv; + }); + if (it == info_vector.end()) { + return; + } -void validateParallelize(Fusion* fusion) { - FUSER_PERF_SCOPE("GpuLower::Lower::validateParallelize"); - FusionGuard fg(fusion); + VectorizedSetInfo& info = *it; - const auto& par_map = GpuLower::current()->caParallelMap(); - const auto& loop_map = GpuLower::current()->caLoopMap(); - const auto& pred_map = GpuLower::current()->threadPredMap(); + // info.vectorized_root_id is validated at this point to be the + // last concrete root domain in consumer. + auto consumer_root_id = info.vectorized_root_id; - auto exprs = ExprSort::getExprs(fusion); + auto root_id = c2p_map.at(consumer_root_id); - for (auto expr : exprs) { - if (!ir_utils::isTVOp(expr)) { - continue; - } - // Validate parallelization of each consumer by itself - for (auto consumer : ir_utils::filterByType(expr->outputs())) { - validateParallelizationOfTensor(consumer); - } - // Validate parallelization between a producer and a consumer - for (auto producer : ir_utils::filterByType(expr->inputs())) { - // Parallelization on input tensors have no effect. - if (producer->isFusionInput()) { - continue; - } - const auto parallel_bcast_doms = - pred_map.getParallelBroadcastDomains(producer); - for (const auto i : c10::irange(producer->nDims())) { - // If a producer axis is threaded, either with threadIdx or - // blockIdx, there must be a mapped consumer axis with the - // same ParallelType. An exception is when the producer is - // allocated on shared memory and its parallelized with - // threadIdx. In that case, there is no parallelization - // constraint on the consumer as syncthreads will be inserted - // when necessary. - auto producer_axis = producer->axis(i); - auto producer_ptype = - par_map.getConcreteMappedID(producer_axis)->getParallelType(); - if (!isParallelTypeThread(producer_ptype)) { - continue; - } - // When the producer axis is a broadcast, it is not really - // parallelized unless thread-predicated - if (producer_axis->isBroadcast() && - !parallel_bcast_doms.get(producer_ptype)) { - continue; - } - // No constraint on the consumer tensor when the producer - // axis is parallelized with threadIdx and allocates on - // shared memory - if (isParallelTypeThreadDim(producer_ptype) && - producer->getMemoryType() == MemoryType::Shared) { - continue; - } - // There should be also nothing to validate when the producer - // axis is reduction. - if (producer_axis->isReduction()) { - continue; - } - // There must be a consumer axis that uses the same indexing - // with the same parallel type as the producer axis. The loop - // map is used to to find such an axis. Broadcast forwarding - // does not cause any inconsistent parallelization as indexing - // takes care of the forwarding. 
- for (auto consumer : - ir_utils::filterByType(expr->outputs())) { - auto it = std::find_if( - consumer->domain()->domain().begin(), - consumer->domain()->domain().end(), - [&](IterDomain* consumer_axis) { - return loop_map.areMapped(producer_axis, consumer_axis); - }); - TORCH_INTERNAL_ASSERT( - it != consumer->domain()->domain().end(), - "Inconsistent parallelization found between TV", - producer->name(), - " (", - producer, - ") and TV", - consumer->name(), - "(", - consumer, - "). ", - "TV", - consumer->name(), - " does not have a matching axis for parallelized producer axis, ", - producer_axis, - ". CA Map: ", - loop_map.toString()); - auto consumer_axis = *it; - auto consumer_ptype = - par_map.getConcreteMappedID(consumer_axis)->getParallelType(); - TORCH_INTERNAL_ASSERT( - producer_ptype == consumer_ptype, - "Inconsistent parallelization found between TV", - producer->name(), - " (", - producer, - ") and TV", - consumer->name(), - "(", - consumer, - "). " - "Producer axis, ", - producer_axis, - " is parallelized with ", - stringifyThread(producer_ptype), - ", but the parallel type of its matching consumer axis, ", - consumer_axis, - " is ", - stringifyThread(consumer_ptype), - "."); - } - } - } - } + fillVectorizedContigRootDomains(producer_tv, contig_finder, root_id, info); } namespace { @@ -630,7 +698,7 @@ namespace { // each tensor that needs to be computed. std::unordered_map> getLiveRangeOffsets( Fusion* fusion) { - auto exprs = ExprSort::getExprs(fusion); + auto exprs = StmtSort::getExprs(fusion); std::unordered_map> map; @@ -760,7 +828,9 @@ void validatePartialSplit(Fusion* fusion) { auto range_info = getLiveRangeOffsets(fusion); for (auto tv : ir_utils::allTvs(fusion)) { - auto exprs = ir_utils::historyOf(tv); + auto exprs = StmtSort::getExprs( + tv->fusion(), + {tv->domain()->domain().begin(), tv->domain()->domain().end()}); for (auto split : ir_utils::filterByType(exprs)) { // When the start and stop offsets are not zero, make sure the // range defined by the split includes the required range to @@ -793,6 +863,95 @@ void validatePartialSplit(Fusion* fusion) { } } +namespace { + +//! Utility to make sure targeted gpu capability is +//! higher than provided major.minor. +void validateMinimumArch(int major, int minor) { + auto prop = at::cuda::getCurrentDeviceProperties(); + TORCH_INTERNAL_ASSERT(prop->major >= major); + if (prop->major == major) { + TORCH_INTERNAL_ASSERT(prop->minor >= minor); + } +} + +//! Validates that the operand and result tensors +//! of mma ops are swizzled and also validates +//! specialization of tidx as lane id. +void validateMmaTensors(MmaOp* mma) { + bool tidx_validated = false; + std::vector to_validate = { + mma->inA()->as(), + mma->inB()->as(), + mma->out()->as()}; + + for (auto tv : to_validate) { + for (auto id : tv->domain()->domain()) { + auto ptype = id->getParallelType(); + if (ptype == ParallelType::TIDx) { + TORCH_INTERNAL_ASSERT( + id->isMmaSwizzled(), + "TIDx for mma input/output must be set by WarpMmaSwizzler", + id, + tv); + if (!tidx_validated) { + // Check that TIDx is exact lane_id + const auto& paralel_dim_map = + GpuLower::current()->parallelDimensionMap(); + TORCH_INTERNAL_ASSERT( + paralel_dim_map.isExact(ptype) && + paralel_dim_map.get(ptype)->getInt().has_value() && + paralel_dim_map.get(ptype)->getInt().value() == + at::cuda::warp_size(), + "TIDx is reserved for lane id in mma kernels, and it needs to be exactly a warp"); + tidx_validated = true; + } + } + } + } + + // Note: this check will be relaxed in a follow up. 
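// Illustrative sketch, not from the patch: validateMinimumArch above asserts
// that the current device's compute capability is at least a required
// major.minor pair. As a standalone boolean, that comparison looks like the
// toy helper below (not an nvfuser API): strictly greater major always
// passes, equal major requires at least the given minor.
inline bool meetsMinimumArch(
    int device_major,
    int device_minor,
    int required_major,
    int required_minor) {
  if (device_major != required_major) {
    return device_major > required_major;
  }
  return device_minor >= required_minor;
}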
+ auto validate_operand_ids = [](const TensorView* tv) { + TORCH_INTERNAL_ASSERT( + std::all_of( + tv->domain()->domain().begin() + tv->getComputeAtPosition(), + tv->domain()->domain().end(), + [](IterDomain* id) { + return id->isMmaSwizzled() || + (id->isBroadcast() && + id->getParallelType() == ParallelType::Serial); + }), + "All id's on the right of CA pos needs to be mma-swizzled by WarpMmaSwizzler\n", + tv); + }; + + validate_operand_ids(mma->inA()->as()); + validate_operand_ids(mma->inB()->as()); +} + +} // namespace + +//! Validate data format and GPU arch compatibility of scheduled +//! mma operators on the fusion. +void validateMma(Fusion* fusion) { + auto exprs = StmtSort::getExprs(fusion); + + for (auto expr : exprs) { + if (auto mma = dynamic_cast(expr)) { + validateMmaTensors(mma); + + switch (mma->options().macro) { + case MmaOptions::MacroType::Volta_16_16_4: + validateMinimumArch(7, 0); + break; + default: + TORCH_INTERNAL_ASSERT(false, "validate mma: unsupported macro"); + break; + } + } + } +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.h b/torch/csrc/jit/codegen/cuda/lower_validation.h index 89de85026ee7..d8c95d8d1f05 100644 --- a/torch/csrc/jit/codegen/cuda/lower_validation.h +++ b/torch/csrc/jit/codegen/cuda/lower_validation.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -9,18 +9,28 @@ namespace jit { namespace fuser { namespace cuda { -void validateIr(Fusion* fusion); +class ContigIDs; -void validateVectorize(Fusion* fusion); +void validateIr(Fusion* fusion); -//! Validates all tensors are consistently parallelized. Basically, -//! when a producer axis is threaded, either with threadIdx or -//! blockIdx, there must be a mapped consumer axis with the -//! same ParallelType with some exceptions. -//! -//! This function assumes Loop and Parallel ComputeAtMaps are already -//! built as they are used to validate consistency. -void validateParallelize(Fusion* fusion); +//! Validate vectorization and collect information on vectorization +//! used in code generation as well as runtime validation. +void validateAndCollectVectorizeInfo(Fusion* fusion); + +//! Find the contig root domains that a vectorized leaf domain +//! of a consumer TV depends on. Required for runtime validation. +void fillConsumerVectorizedContigRootDomains( + const TensorView* consumer_tv, + const ContigIDs& contig_finder); + +//! Find the contig root domains that a vectorized leaf domain +//! of a producer TV depends on. Required for runtime validation. +//! Producer must be transformed as consumer. +void fillProducerVectorizedContigRootDomains( + const TensorView* producer_tv, + const TensorView* consumer_tv, + const std::unordered_map& c2p_map, + const ContigIDs& contig_finder); //! Validates partial split expressions. Partial split only uses an //! inner subdomain specified by start and stop offsets, ignoring the @@ -30,6 +40,10 @@ void validateParallelize(Fusion* fusion); //! calculated that are necessary for output values. void validatePartialSplit(Fusion* fusion); +//! Validate data format and GPU arch compatibility of scheduled +//! mma operators on the fusion. 
+void validateMma(Fusion* fusion); + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp index eaddf7faea32..1d87790c014f 100644 --- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -13,25 +13,63 @@ namespace cuda { namespace { +//! A helper class for EliminateDeadBroadcastAndAllocate. Eliminate +//! dead Allocate and Broadcast detected by EliminateDeadBroadcastAndAllocate. +class DeadTvEliminator : private kir::ExprMutator { + public: + static std::vector run( + const std::vector& exprs, + const std::unordered_set& dead_tvs) { + return DeadTvEliminator(exprs, dead_tvs).exprs_; + } + + private: + DeadTvEliminator( + const std::vector& exprs, + const std::unordered_set& dead_tvs) + : dead_tvs_(dead_tvs) { + traverseAndInsert(exprs); + } + + using kir::ExprMutator::handle; + + void handle(kir::Allocate* allocate) final { + if (auto buffer_tv = dynamic_cast(allocate->buffer())) { + if (dead_tvs_.count(buffer_tv)) { + registerRemove(allocate); + } + } + } + + void handle(BroadcastOp* broadcast) final { + if (auto out_ti = dynamic_cast(broadcast->out())) { + if (dead_tvs_.count(out_ti->view())) { + registerRemove(broadcast); + } + } + } + + private: + const std::unordered_set& dead_tvs_; +}; + //! A simple DCE for eliminating the //! parallel broadcasts that has been fused //! and their corresponding allocations class EliminateDeadBroadcastAndAllocate { public: - static std::vector run(const std::vector& exprs) { + static std::vector run(const std::vector& exprs) { EliminateDeadBroadcastAndAllocate dce(exprs); - return dce.result_exprs_; + return DeadTvEliminator::run(exprs, dce.dead_tvs_); } private: - EliminateDeadBroadcastAndAllocate(const std::vector& exprs) - : ir_builder_(GpuLower::current()->kernel()) { + EliminateDeadBroadcastAndAllocate(const std::vector& exprs) { findLiveTvs(exprs); findDeadTvs(); - eliminateDeadCode(exprs); } - void findLiveTvs(const std::vector& exprs) { + void findLiveTvs(const std::vector& exprs) { for (auto expr : exprs) { if (auto for_loop = dynamic_cast(expr)) { findLiveTvs(for_loop->body().exprs()); @@ -44,11 +82,10 @@ class EliminateDeadBroadcastAndAllocate { if (auto allocate = dynamic_cast(expr)) { if (allocate->memoryType() == MemoryType::Local) { - if (auto kir_tv = - dynamic_cast(allocate->buffer())) { + if (auto tv = dynamic_cast(allocate->buffer())) { // We know only tvs that we'd want to consider are broadcast outputs - if (kir_tv->fuserTv()->definition()->isA()) { - candidate_tv_set_.insert(kir_tv); + if (tv->definition()->isA()) { + candidate_tv_set_.insert(tv); } } } @@ -72,95 +109,10 @@ class EliminateDeadBroadcastAndAllocate { } } - void eliminateDeadCode(const std::vector& exprs) { - result_exprs_ = eliminateDeadCodeInScope(exprs); - } - - bool shouldEliminate(kir::Expr* expr) { - if (auto allocate = dynamic_cast(expr)) { - if (auto buffer_tv = dynamic_cast(allocate->buffer())) { - if (dead_tvs_.count(buffer_tv)) { - return true; - } - } - } else if (auto broadcast = dynamic_cast(expr)) { - if (auto out_ti = dynamic_cast(broadcast->out())) { - if (dead_tvs_.count(out_ti->view())) { - return true; - } - } - } - return false; - } - - //! Returns a new vector of exprs with dead exprs - //! eliminated. 
- std::vector eliminateDeadCodeInScope( - const std::vector& exprs) { - std::vector result_exprs; - - for (auto expr : exprs) { - auto result_expr = expr; - if (auto for_loop = dynamic_cast(expr)) { - result_expr = eliminateDeadCode(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - result_expr = eliminateDeadCode(ite); - } else { - if (shouldEliminate(expr)) { - result_expr = nullptr; - } - } - - // Push the result expr if not eliminated - if (result_expr) { - result_exprs.push_back(result_expr); - } - } - - return result_exprs; - } - - kir::ForLoop* eliminateDeadCode(kir::ForLoop* for_loop) { - auto new_loop_body = eliminateDeadCodeInScope(for_loop->body().exprs()); - if (new_loop_body.empty()) { - return nullptr; - } - - // TODO: we will need a kernel_ir cloner to make this - // kind of logic re-usable. - auto new_loop = scope_utils::cloneForLoop(ir_builder_, for_loop); - - for (auto expr : new_loop_body) { - new_loop->body().push_back(expr); - } - return new_loop; - } - - kir::IfThenElse* eliminateDeadCode(kir::IfThenElse* ite) { - auto new_then_body = eliminateDeadCodeInScope(ite->thenBody().exprs()); - auto new_else_body = eliminateDeadCodeInScope(ite->elseBody().exprs()); - if (new_then_body.empty() && new_else_body.empty()) { - return nullptr; - } - - auto new_ite = scope_utils::cloneIfThenElse(ir_builder_, ite); - - for (auto expr : new_then_body) { - new_ite->thenBody().push_back(expr); - } - for (auto expr : new_else_body) { - new_ite->elseBody().push_back(expr); - } - return new_ite; - } - private: - std::unordered_set live_tvs_; - std::unordered_set dead_tvs_; - std::unordered_set candidate_tv_set_; - - std::vector result_exprs_; - kir::IrBuilder ir_builder_; + std::unordered_set live_tvs_; + std::unordered_set dead_tvs_; + std::unordered_set candidate_tv_set_; }; //! A pass to eliminate redundant parallel broadcasts that are consumers @@ -189,9 +141,9 @@ class EliminateDeadBroadcastAndAllocate { //! //! 3. EliminateDeadBroadcastAndAllocate removes the broadcast ops //! and corresponding allocations if they're un-used after step 2. -class FuseBroadcastWithWarpReduce { +class FuseBroadcastWithWarpReduce : private kir::IrVisitor { public: - static std::vector fuse(const std::vector& exprs) { + static std::vector fuse(const std::vector& exprs) { FuseBroadcastWithWarpReduce fuse_broadcast_map(exprs); const auto replaced_inputs = replaceInputsInExpr(exprs, fuse_broadcast_map.val_replacement_map_); @@ -199,70 +151,51 @@ class FuseBroadcastWithWarpReduce { } private: - FuseBroadcastWithWarpReduce(const std::vector& exprs) { + FuseBroadcastWithWarpReduce(const std::vector& exprs) { // open stack space for global scope - // The scope stack for kir_tv_to_allocate wouldn't be needed + // The scope stack for tv_to_allocate wouldn't be needed // if the allocations are guaranteed to be once and unique, // which can currently be assumed but this pass tries not // to rely on this assumption. 
- running_kir_tv_to_allocate_map_.emplace_back( - std::make_unique< - std::unordered_map>()); + running_tv_to_allocate_map_.emplace_back( + std::make_unique>()); running_visible_allocation_stack_.emplace_back( std::make_unique>()); - - for (auto expr : exprs) { - handle(expr); - } + kir::IrVisitor::handle(exprs); } - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - return; - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - return; - } - - // Process expr inputs if needs replacement - for (auto inp : expr->inputs()) { - if (auto input_ti = dynamic_cast(inp)) { - auto replace = findMaybeReplacedTensorIndex(input_ti); - if (replace.has_value()) { - val_replacement_map_[input_ti] = replace.value(); + void handle(Expr* expr) final { + if (ir_utils::isTvOp(expr)) { + // Process expr inputs if needs replacement + for (auto inp : expr->inputs()) { + if (auto input_ti = dynamic_cast(inp)) { + auto replace = findMaybeReplacedTensorIndex(input_ti); + if (replace.has_value()) { + val_replacement_map_[input_ti] = replace.value(); + } } } } - - // Handle reduction definitions - if (auto reduction = dynamic_cast(expr)) { - handle(reduction); - } else if (auto broadcast = dynamic_cast(expr)) { - handle(broadcast); - } else if (auto allocate = dynamic_cast(expr)) { - handle(allocate); - } + kir::IrVisitor::handle(expr); } - bool openLoopNestLevel(kir::IterDomain* id) { - if (id->isThread() || id->parallelType() == ParallelType::Unswitch) { + bool openLoopNestLevel(IterDomain* id) { + if (id->isThread() || id->getParallelType() == ParallelType::Unswitch) { return false; } - if (id->parallelType() == ParallelType::Serial || - id->parallelType() == ParallelType::Unroll) { + if (id->getParallelType() == ParallelType::Serial || + id->getParallelType() == ParallelType::Unroll) { return !id->isBroadcast(); } return true; } - void handle(kir::ForLoop* for_loop) { + void handle(kir::ForLoop* for_loop) final { // Keep track of visible reduction outputs bool open_nest_level = openLoopNestLevel(for_loop->iter_domain()); if (open_nest_level) { - running_kir_tv_to_allocate_map_.emplace_back( - std::make_unique< - std::unordered_map>()); + running_tv_to_allocate_map_.emplace_back( + std::make_unique>()); running_visible_allocation_stack_.emplace_back( std::make_unique>()); } @@ -270,12 +203,12 @@ class FuseBroadcastWithWarpReduce { handle(expr); } if (open_nest_level) { - running_kir_tv_to_allocate_map_.pop_back(); + running_tv_to_allocate_map_.pop_back(); running_visible_allocation_stack_.pop_back(); } } - void handle(kir::IfThenElse* ite) { + void handle(kir::IfThenElse* ite) final { running_visible_allocation_stack_.emplace_back( std::make_unique>()); for (auto expr : ite->thenBody().exprs()) { @@ -292,15 +225,14 @@ class FuseBroadcastWithWarpReduce { //! Place this allocate on the list of currently visible allocations, //! organized by loop nest level. - void handle(kir::Allocate* allocate) { + void handle(kir::Allocate* allocate) final { if (allocate->memoryType() != MemoryType::Local) { return; } - if (auto kir_tv = dynamic_cast(allocate->buffer())) { - auto fuser_tv = kir_tv->fuserTv(); - if (fuser_tv->definition()) { - if (fuser_tv->definition()->isA() || - fuser_tv->definition()->isA()) { + if (auto tv = dynamic_cast(allocate->buffer())) { + if (tv->definition()) { + if (tv->definition()->isA() || + tv->definition()->isA()) { running_visible_allocation_stack_.back()->push_back(allocate); } } @@ -311,18 +243,18 @@ class FuseBroadcastWithWarpReduce { //! 
returns the replaced TensorIndex if so. c10::optional findMaybeReplacedTensorIndex( kir::TensorIndex* tensor_index) { - auto kir_tv = tensor_index->view(); - auto tensor_index_it = running_tv_replacement_map_.find(kir_tv); + auto tv = tensor_index->view(); + auto tensor_index_it = running_tv_replacement_map_.find(tv); if (tensor_index_it != running_tv_replacement_map_.end()) { return tensor_index_it->second; } return c10::nullopt; } - //! Iteratve backwards on the currently visible loop scopes + //! Iterate backwards on the currently visible loop scopes //! and find the first allocation corresponding to the //! given tv. - kir::Allocate* getActiveAllocateFor(kir::TensorView* tv) { + kir::Allocate* getActiveAllocateFor(TensorView* tv) { for (auto frame_it = running_visible_allocation_stack_.rbegin(); frame_it != running_visible_allocation_stack_.rend(); frame_it++) { @@ -340,19 +272,10 @@ class FuseBroadcastWithWarpReduce { return nullptr; } - Expr* getFuserTVExpr(kir::Expr* expr) { - auto out = expr->outputs()[0]; - auto out_ti = dynamic_cast(out); - if (!out_ti) { - return nullptr; - } - return out_ti->view()->fuserTv()->definition(); - } - - bool isOpInputRegisterTV(kir::Expr* expr) { + bool isOpInputRegisterTV(Expr* expr) { for (auto inp : expr->inputs()) { if (auto inp_ti = dynamic_cast(inp)) { - if (inp_ti->view()->memoryType() != MemoryType::Local) { + if (inp_ti->view()->getMemoryType() != MemoryType::Local) { return false; } } @@ -361,10 +284,10 @@ class FuseBroadcastWithWarpReduce { return true; } - bool isOpOutputRegisterTV(kir::Expr* expr) { + bool isOpOutputRegisterTV(Expr* expr) { for (auto out : expr->outputs()) { if (auto out_ti = dynamic_cast(out)) { - if (out_ti->view()->memoryType() != MemoryType::Local) { + if (out_ti->view()->getMemoryType() != MemoryType::Local) { return false; } } @@ -374,8 +297,8 @@ class FuseBroadcastWithWarpReduce { } //! Updates map of serially visible reduction tvs, see comment on - //! running_kir_tv_to_allocate_map_. - void handle(kir::ReductionOp* reduction) { + //! running_tv_to_allocate_map_. + void handle(ReductionOp* reduction) final { if (!isOpOutputRegisterTV(reduction)) { return; } @@ -386,11 +309,11 @@ class FuseBroadcastWithWarpReduce { // keep track of which reduction buffer this expr writes into auto reduction_allocate = getActiveAllocateFor(reduction_ti_out->view()); - running_kir_tv_to_allocate_map_.back()->operator[]( - reduction_ti_out->view()) = reduction_allocate; + running_tv_to_allocate_map_.back()->operator[](reduction_ti_out->view()) = + reduction_allocate; } - void handle(kir::BroadcastOp* broadcast) { + void handle(BroadcastOp* broadcast) final { if (!isOpInputRegisterTV(broadcast) || !isOpOutputRegisterTV(broadcast)) { return; } @@ -400,9 +323,9 @@ class FuseBroadcastWithWarpReduce { //! Detects if this broadcast can be fused with the producer reduction. //! adds the output of broadcast to replacement map if all above mentioned //! conditions check. 
- void tryAddOutputToReplaceMap(kir::BroadcastOp* broadcast) { + void tryAddOutputToReplaceMap(BroadcastOp* broadcast) { if (auto in_ti = dynamic_cast(broadcast->in())) { - if (!in_ti->view()->fuserTv()->definition()->isA()) { + if (!in_ti->view()->definition()->isA()) { return; } auto out_ti = broadcast->out()->as(); @@ -410,15 +333,14 @@ class FuseBroadcastWithWarpReduce { // check reduction-broadcast mapping: if (!canFuseBroadcastWithWarpReduction( - out_tv->fuserTv()->definition()->as())) { + out_tv->definition()->as())) { return; } // check buffers are size-1 auto reduction_allocate_it = - running_kir_tv_to_allocate_map_.back()->find(in_ti->view()); - if (reduction_allocate_it == - running_kir_tv_to_allocate_map_.back()->end()) { + running_tv_to_allocate_map_.back()->find(in_ti->view()); + if (reduction_allocate_it == running_tv_to_allocate_map_.back()->end()) { // The producer reduction is not in the serially visible scope, // as defined in openLoopNestLevel. There still could be some // cases that we could fuse but disabled for simplicity. @@ -444,7 +366,7 @@ class FuseBroadcastWithWarpReduce { return; } - // Write the kir_tv in to the replacement map + // Write the tv in to the replacement map // so the future uses of this tv will put // the tensorIndex's in the actual replacement map. running_tv_replacement_map_[out_tv] = in_ti; @@ -515,7 +437,7 @@ class FuseBroadcastWithWarpReduce { //! could need some extension for more precise scope based analysis in the //! future especially if we have more complex IfThenElse blocks than //! predicates and unroll. - std::unordered_map + std::unordered_map running_tv_replacement_map_; //! Keeps track of the allocated buffers that the exprs will write/read @@ -531,21 +453,20 @@ class FuseBroadcastWithWarpReduce { //! visibility on the generated kernel. The model of IfThenElse assumes the //! only ITE's we have are predicates and unrolls, which might need to be //! more precise. - std::vector< - std::unique_ptr>> - running_kir_tv_to_allocate_map_; + std::vector>> + running_tv_to_allocate_map_; //! This map is the final output of this pass and a val replacement map will //! be run using //! it. All keys and values are TensorIndex's, and before this pass each //! TensorIndex is uniquely generated by lower_index pass for each access of - //! a kir_tv. - std::unordered_map val_replacement_map_; + //! a tv. + std::unordered_map val_replacement_map_; }; } // namespace -std::vector fuseWarpReduce(const std::vector exprs) { +std::vector fuseWarpReduce(const std::vector exprs) { return FuseBroadcastWithWarpReduce::fuse(exprs); } diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h index 785c0b59122e..7480809c7dce 100644 --- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h +++ b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h @@ -13,7 +13,7 @@ struct WarpPaddedParallelInfo { bool has_warp_reduction = false; }; -std::vector fuseWarpReduce(const std::vector exprs); +std::vector fuseWarpReduce(const std::vector exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/manager.cpp b/torch/csrc/jit/codegen/cuda/manager.cpp index ee1bea815359..4fef32286c8e 100644 --- a/torch/csrc/jit/codegen/cuda/manager.cpp +++ b/torch/csrc/jit/codegen/cuda/manager.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -51,6 +53,38 @@ namespace cuda { namespace { +// TODO remove this (75983): +// we don't need this any more. 
I think we can use revertAliasCopyOps. +// Similar refactor should be done infallback graph used by fusion guard. +// implementation of xxxx_copy ops should be removed. +// +// Mark string attribute in alias-copy nodes to enable its implementation +// in the fallback path. +void enableAliasCopyNodes(const std::shared_ptr& graph, Block* block) { + static std::unordered_set alias_copy_op( + {prim::view_copy, + prim::reshape_copy, + prim::squeeze_copy, + prim::unsqueeze_copy}); + + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + enableAliasCopyNodes(graph, b); + } + if (alias_copy_op.find(n->kind()) != alias_copy_op.end()) { + n->s_(attr::name, "CudaFusionGroup"); + } + } +} + +static std::unique_ptr createFallbackCode(const Node* fusion_node) { + auto copied_graph = fusion_node->g(attr::Subgraph)->copy(); + EraseShapeInformation(copied_graph); + enableAliasCopyNodes(copied_graph, copied_graph->block()); + auto code = std::make_unique(copied_graph, "fallback_cuda_fuser"); + return code; +} + // CudaFusionManager is not thread safe! // TODO: we should make the tradeoff here to use thread_local instead of global // singleton; @@ -68,8 +102,6 @@ class CudaFusionManager { // have identical contiguity information! (So identical stride + shape // is even more restricting in a good way) int32_t registerOrGetCacheId(std::shared_ptr& graph) { - std::lock_guard guard(mutex_); - // prepare graph for lowering; // We should not call `EraseShapeInformation(graph);`, graph representation // does not incorporate static sizes, but just rank of input tensors, which @@ -77,6 +109,7 @@ class CudaFusionManager { auto canonical_graph = Canonicalize(graph, false); auto repr = canonical_graph->toString(false); + std::lock_guard guard(mutex_); // create new graph_cache_ids_ entry if none existed yet; if (graph_cache_ids_.count(repr) == 0) { int32_t kernel_id = getNextUniqueID(); @@ -88,6 +121,12 @@ class CudaFusionManager { return graph_cache_ids_[repr]; }; + // get fallback kernel id + int32_t getFallbackKernelId() { + std::lock_guard guard(mutex_); + return getNextUniqueID(); + } + void unregisterCacheId(std::shared_ptr& graph) { auto canonical_graph = Canonicalize(graph, false); auto repr = canonical_graph->toString(false); @@ -109,6 +148,27 @@ class CudaFusionManager { return graph_cache_[kernel_id]->runGraphWithInputs(inputs); } + bool hasFallbackCode(int32_t kernel_id) { + std::lock_guard guard(mutex_); + return fallback_cache_.count(kernel_id); + } + + Code* getFallbackCode(int32_t kernel_id, const Node* fusion_node) { + { + std::lock_guard guard(mutex_); + auto it = fallback_cache_.find(kernel_id); + if (it != fallback_cache_.end()) { + return it->second.get(); + } + } + + std::unique_ptr code = createFallbackCode(fusion_node); + + std::lock_guard guard(mutex_); + auto it = fallback_cache_.insert({kernel_id, std::move(code)}).first; + return it->second.get(); + } + private: // TODO: Dimension collapsing should be abstracted out and integrated into // graph caching. @@ -137,6 +197,7 @@ class CudaFusionManager { std::unordered_map graph_cache_ids_; std::unordered_map> graph_cache_; + std::unordered_map> fallback_cache_; int32_t next_unique_id_ = 0; }; @@ -163,7 +224,6 @@ void compileCudaFusionGroup(Node* fusion_node) { // node only insert meta information after itself). 
PropagateShapesOnGraph(graph); TypePropagate(graph); - PropagateShapesOnGraph(graph); int32_t fusion_cache_id = CudaFusionManager::getManager().registerOrGetCacheId(graph); @@ -175,38 +235,66 @@ void compileCudaFusionGroup(Node* fusion_node) { compile_fusion(); } catch (...) { TORCH_WARN( - "FALLBACK path has been taken. This is an indication that codegen" - "Failed for some reason. To debug try disable codegen fallback path" - "via setting the env variable" - "`export PYTORCH_NVFUSER_DISABLE_FALLBACK=1`"); + "FALLBACK path has been taken inside: ", + __FUNCTION__, + ". This is an indication that codegen Failed for some reason.\n" + "To debug try disable codegen fallback path via setting the env" + " variable `export PYTORCH_NVFUSER_DISABLE=fallback`\n" + "To report the issue, try enable logging via setting the env" + "variable ` export PYTORCH_JIT_LOG_LEVEL=manager.cpp`\n"); + GRAPH_DUMP("`compile_fusion` hits fallback on graph\n", graph); CudaFusionManager::getManager().unregisterCacheId(graph); } } else { compile_fusion(); } + + // Assigning a cache_id to facilitate graph execution and fallback + if (!fusion_node->hasAttribute(attr::cache_id)) { + int32_t fusion_cache_id = + CudaFusionManager::getManager().getFallbackKernelId(); + fusion_node->i_(attr::cache_id, fusion_cache_id); + } } void runCudaFusionGroup(const Node* fusion_node, Stack& stack) { FUSER_PERF_SCOPE("nvFuser::Manager::runCudaFusionGroup"); + TORCH_CHECK( + fusion_node->hasAttribute(attr::cache_id), + "node prim::CudaFusionGroup has not been compiled yet"); // Fallback to use if anything goes wrong - auto take_fallback = [&]() { - // copying graph here since we are eliminating shape information; - auto copied_graph = fusion_node->g(attr::Subgraph)->copy(); - EraseShapeInformation(copied_graph); - InterpreterState{Code(copied_graph, "fallback_cuda_fuser")}.run(stack); + auto take_fallback = [&](Stack& stack) { + std::unique_ptr fallback_code_unique; + Code* fallback_code; + int32_t kernel_id = fusion_node->i(attr::cache_id); + fallback_code = + CudaFusionManager::getManager().getFallbackCode(kernel_id, fusion_node); + InterpreterState{*fallback_code}.run(stack); }; + c10::optional stack_copy; + auto compare_callback = getCudaFuserComparisonCallback(); + if (compare_callback.run_fallback) { + // make a copy of the stack + int64_t inputs_size = + static_cast(fusion_node->g(attr::Subgraph)->inputs().size()); + TORCH_INTERNAL_ASSERT(stack.size() >= inputs_size); + stack_copy = Stack(); + stack_copy->insert( + stack_copy->end(), stack.begin(), stack.end() - inputs_size); + // deepcopy the last (inputs_size) stack items + std::transform( + stack.end() - inputs_size, + stack.end(), + std::back_inserter(*stack_copy), + [](const c10::IValue& ivalue) { return ivalue.deepcopy(); }); + } + auto run_fusion = [&]() { TORCH_CHECK( fusion_node->kind() == prim::CudaFusionGroup, "prim::CudaFusionGroup expected"); - // TODO: should we support runtime compilation with updated dynamic shape; - // shape inference would be needed so we can allocate output; - TORCH_CHECK( - fusion_node->hasAttribute(attr::cache_id), - "node prim::CudaFusionGroup has not been compiled yet"); - int32_t kernel_id = fusion_node->i(attr::cache_id); // Currently we just construct I/O tensors for static graph; @@ -226,18 +314,63 @@ void runCudaFusionGroup(const Node* fusion_node, Stack& stack) { if (useFallback()) { try { - run_fusion(); + // if fusion failed once, it's likely to fail again; and failures are + // slow. 
So if the fusion fails, then record the failure and always use + // the fallback instead + int32_t kernel_id = fusion_node->i(attr::cache_id); + bool force_fallback = + CudaFusionManager::getManager().hasFallbackCode(kernel_id); + if (force_fallback) { + take_fallback(stack); + } else { + run_fusion(); + } } catch (...) { TORCH_WARN( - "FALLBACK path has been taken. This is an indication that codegen" - "Failed for some reason. To debug try disable codegen fallback path" - "via setting the env variable" - "`export PYTORCH_NVFUSER_DISABLE_FALLBACK=1`"); - take_fallback(); + "FALLBACK path has been taken inside: ", + __FUNCTION__, + ". This is an indication that codegen Failed for some reason.\n" + "To debug try disable codegen fallback path via setting the env" + " variable `export PYTORCH_NVFUSER_DISABLE=fallback`\n"); + take_fallback(stack); } } else { run_fusion(); } + + if (compare_callback.callback != nullptr) { + Stack fused_outputs; + Stack fallback_outputs; + int64_t output_count = + static_cast(fusion_node->g(attr::Subgraph)->outputs().size()); + TORCH_CHECK( + output_count <= stack.size(), + "Expected ", + output_count, + " outputs but found only ", + stack.size(), + " items on the stack"); + + fused_outputs.insert( + fused_outputs.begin(), stack.end() - output_count, stack.end()); + + if (stack_copy) { + take_fallback(*stack_copy); + TORCH_CHECK( + stack_copy->size() == stack.size(), + "Fused graph returns stack with ", + stack.size(), + " items, compared to ", + stack_copy->size(), + " from unfused graph"); + fallback_outputs.insert( + fallback_outputs.begin(), + stack_copy->end() - output_count, + stack_copy->end()); + } + auto graph_str = fusion_node->g(attr::Subgraph)->toString(); + compare_callback.callback(fused_outputs, fallback_outputs, graph_str); + } } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/manager.h b/torch/csrc/jit/codegen/cuda/manager.h index 39c97478effe..4b725cd80bc6 100644 --- a/torch/csrc/jit/codegen/cuda/manager.h +++ b/torch/csrc/jit/codegen/cuda/manager.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include /* diff --git a/torch/csrc/jit/codegen/cuda/mma_type.cpp b/torch/csrc/jit/codegen/cuda/mma_type.cpp new file mode 100644 index 000000000000..3751cdea6bcf --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/mma_type.cpp @@ -0,0 +1,139 @@ +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +MmaBuilder::MmaBuilder( + MmaOptions::MacroType macro, + MatMulTileOptions gemm_tile) { + option_.macro = macro; + // Calculate accumulator stride, will be removed once transpose swizzle ready + int outer_stride = gemm_tile.warp_tile.n / gemm_tile.instruction_tile.n; + switch (macro) { + // Numbers depend on actual output layout of mma instruction + case MmaOptions::MacroType::Volta_16_16_4: + option_.accumulator_stride = outer_stride * 4; + break; + default: + TORCH_CHECK(false, "unsupported macro"); + break; + } +} + +MmaBuilder& MmaBuilder::layout(MmaOptions::MmaInputLayout layout) { + option_.operand_layout = layout; + return *this; +} + +MmaBuilder& MmaBuilder::operand(MmaOptions::Operand a_or_b) { + option_.operand = a_or_b; + return *this; +} + +// TODO: validate op config +MmaOptions MmaBuilder::build() const { + return option_; +} + +bool isVolta(MmaOptions::MacroType macro) { + return macro == MmaOptions::MacroType::Volta_16_16_4; +} + +bool isTuring(MmaOptions::MacroType macro) { + return macro == MmaOptions::MacroType::Turing_16_8_16; +} + +bool isAmpere(MmaOptions::MacroType macro) { + return false; +} 
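The MmaBuilder defined just above is the scheduler-facing knob for configuring an mma op: the macro and tile sizes fix the accumulator stride, while layout() and operand() record how each input is laid out. Below is a minimal usage sketch, assuming a Volta_16_16_4 kernel; the helper name, tile sizes, and variable names are illustrative, and attaching the resulting MmaOptions to an actual MmaOp / WarpMmaSwizzler schedule is outside this snippet.

#include <utility>
#include <torch/csrc/jit/codegen/cuda/mma_type.h>

using namespace torch::jit::fuser::cuda;

// Hypothetical scheduler-side helper that builds the per-operand options.
std::pair<MmaOptions, MmaOptions> buildVoltaMmaOptions() {
  // Tile hierarchy: CTA tile -> warp tile -> instruction tile (16x16x4 on Volta).
  MatMulTileOptions gemm_tile(
      GemmTile(128, 128, 32), // cta_tile
      GemmTile(64, 64, 32),   // warp_tile
      GemmTile(16, 16, 4));   // instruction_tile matching Volta_16_16_4

  MmaBuilder builder(MmaOptions::MacroType::Volta_16_16_4, gemm_tile);

  // TN layout: A is [M, K], B is [N, K]; see the layout convention in mma_type.h.
  builder.layout(MmaOptions::MmaInputLayout::TN);
  MmaOptions options_a = builder.operand(MmaOptions::Operand::A).build();
  MmaOptions options_b = builder.operand(MmaOptions::Operand::B).build();
  return {options_a, options_b};
}

With these tiles the constructor above computes accumulator_stride = (warp_tile.n / instruction_tile.n) * 4 = (64 / 16) * 4 = 16 for the Volta macro.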
+ +int getOutputRegisterSize(MmaOptions::MacroType macro) { + switch (macro) { + case MmaOptions::MacroType::Volta_16_16_4: + return 8; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unknown macro"); + break; + } + return -1; +} + +int getInputARegisterSize(MmaOptions::MacroType macro) { + switch (macro) { + case MmaOptions::MacroType::Volta_16_16_4: + return 4; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unknown macro"); + break; + } + return -1; +} + +int getInputBRegisterSize(MmaOptions::MacroType macro) { + switch (macro) { + case MmaOptions::MacroType::Volta_16_16_4: + return 4; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unknown macro"); + break; + } + return -1; +} + +bool isOperandTransposed(MmaOptions options) { + switch (options.operand) { + case MmaOptions::Operand::A: + return options.operand_layout == MmaOptions::MmaInputLayout::TT || + options.operand_layout == MmaOptions::MmaInputLayout::TN; + case MmaOptions::Operand::B: + return options.operand_layout == MmaOptions::MmaInputLayout::TT || + options.operand_layout == MmaOptions::MmaInputLayout::NT; + default: + TORCH_CHECK(false, "isOperandTransposed: please specify operand"); + } + return false; +} + +std::string toString(MmaOptions::MmaInputLayout input_layout) { + std::stringstream ss; + switch (input_layout) { + case MmaOptions::MmaInputLayout::TT: + ss << "TT"; + break; + case MmaOptions::MmaInputLayout::TN: + ss << "TN"; + break; + case MmaOptions::MmaInputLayout::NT: + ss << "NT"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "unsupported operand layout"); + } + return ss.str(); +} + +std::string toString(MmaOptions::MacroType mt) { + std::stringstream ss; + switch (mt) { + case MmaOptions::MacroType::NoMMA: + ss << "NoOp"; + break; + case MmaOptions::MacroType::Volta_16_16_4: + ss << "M16N16K4"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "undefined mma type"); + break; + } + return ss.str(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/mma_type.h b/torch/csrc/jit/codegen/cuda/mma_type.h new file mode 100644 index 000000000000..5f42d41ded65 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/mma_type.h @@ -0,0 +1,132 @@ +#pragma once +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Utility data structure for recording gemm tiles +struct GemmTile { + int m, n, k; + GemmTile(int m_, int n_, int k_) : m(m_), n(n_), k(k_) {} + + bool operator==(const GemmTile& other) { + return m == other.m && n == other.n && k == other.k; + } + + GemmTile operator/(const GemmTile& other) { + return GemmTile(m / other.m, n / other.n, k / other.k); + } +}; + +//! Utility data structure for recording gemm tiles +struct TORCH_CUDA_CU_API MatMulTileOptions { + GemmTile cta_tile = GemmTile(128, 128, 32); + GemmTile warp_tile = GemmTile(64, 64, 32); + GemmTile instruction_tile = GemmTile(16, 8, 16); + + MatMulTileOptions() = default; + MatMulTileOptions( + GemmTile cta_tile_, + GemmTile warp_tile_, + GemmTile instruction_tile_) + : cta_tile(cta_tile_), + warp_tile(warp_tile_), + instruction_tile(instruction_tile_) {} + + bool operator==(const MatMulTileOptions& other) { + return cta_tile == other.cta_tile && warp_tile == other.warp_tile && + instruction_tile == other.instruction_tile; + } +}; + +//! Information for configuring and lowering mma ops +struct MmaOptions { + //! Type of mma instrinsic macro to use + //! 
This will translate to which mma intrinsic from runtime string + //! to be generated to implement the mma op. The current plan + //! is to have exactly one macro for each + //! (arch, datatype, operand layout) triple, though there + //! exists multiple possibilities for some cases, e.g. for Turing and fp16 + //! one can use 16_8_8 or 16_8_16. + //! Will consider adding more choices that the scheduler can pick from + //! when our perf target becomes more fine grained, which is more likely in + //! latency bound kernels. + enum class MacroType { + NoMMA = 0, + Volta_16_16_4, + Turing_16_8_16, // place holder for turing/ampere mma + Ampere_16_8_8 // place holder for tf32 + }; + + //! [Operand Layout Convention] + //! Operand layout, T=transposed/row_major, N=normal/col_major + //! We don't support calling NN mma directly since it implies + //! a fused transpose. User needs to swap the operands and use + //! TT mma to make the transpose explicit. + //! Ordered by position of K + //! NT : K,M x K,N -> K,M,N + //! TT : M,K X K,N -> M,K,N + //! TN : M,K X N,K -> M,N,K + enum class MmaInputLayout { NT = 0, TT, TN }; + + //! Utility to annotate which input of mma this option struct describes + enum class Operand { NotOperand = 0, A, B }; + + //! Utility to annotate which mma macro this config uses. + MacroType macro = MacroType::NoMMA; + + //! Utility to annotate transposition of operands + MmaInputLayout operand_layout = MmaInputLayout::TT; + + //! Utility to annotate which input of mma this option struct describes + Operand operand = Operand::A; + + //! Accumulator register stride, will be removed when the swizzle op + //! is introduced and the output can be labeled with a transpose swizzle. + int accumulator_stride = 0; + + bool operator==(const MmaOptions& other) const { + return macro == other.macro && operand_layout == other.operand_layout && + operand == other.operand && + accumulator_stride == other.accumulator_stride; + } +}; + +//! User interface generating mma options for mma op +class TORCH_CUDA_CU_API MmaBuilder { + public: + MmaBuilder(MmaOptions::MacroType macro, MatMulTileOptions gemm_tile); + MmaBuilder& layout(MmaOptions::MmaInputLayout layout); + MmaBuilder& operand(MmaOptions::Operand a_or_b); + MmaOptions build() const; + + private: + MmaOptions option_; +}; + +//! GPU arch check for macro type +bool isVolta(MmaOptions::MacroType macro); +bool isTuring(MmaOptions::MacroType macro); +bool isAmpere(MmaOptions::MacroType macro); + +//! Returns true if the given option describes a transposed operand +bool isOperandTransposed(MmaOptions options); + +// Unpacked constants from macro type: +// exact numbers are defined by each individual instruction. 
+int getOutputRegisterSize(MmaOptions::MacroType macro); +int getInputARegisterSize(MmaOptions::MacroType macro); +int getInputBRegisterSize(MmaOptions::MacroType macro); + +// MMA stringify utils +std::string toString(MmaOptions::MacroType macro); +std::string toString(MmaOptions::MmaInputLayout input_layout); +std::string toString(MmaOptions::MacroType mt); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/mutator.cpp b/torch/csrc/jit/codegen/cuda/mutator.cpp index 8d13f1e299e2..feccb5608cbc 100644 --- a/torch/csrc/jit/codegen/cuda/mutator.cpp +++ b/torch/csrc/jit/codegen/cuda/mutator.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -10,143 +11,215 @@ namespace jit { namespace fuser { namespace cuda { -// MUTATE FUNCTIONS FOR VALS +void OptOutMutator::mutate(Statement* s) { + Statement::mutatorDispatch(this, s); +} + +void OptOutMutator::mutate(Expr* e) { + Expr::mutatorDispatch(this, e); +} + +void OptOutMutator::mutate(Val* v) { + Val::mutatorDispatch(this, v); +} + +void OptOutMutator::registerMutation(Val* val, Val* mutation) { + bool val_is_ns = val->vtype() == ValType::NamedScalar; + bool mutation_is_ns = mutation->vtype() == ValType::NamedScalar; + bool val_is_scalar = val->vtype() == ValType::Scalar; + bool mutation_is_scalar = mutation->vtype() == ValType::Scalar; + TORCH_INTERNAL_ASSERT( + mutation->dtype() == val->dtype() && + (mutation->vtype() == val->vtype() || + ((val_is_ns && mutation_is_scalar) || + (mutation_is_ns && val_is_scalar))), + "Mutations are not allowed to change types, tried to go from: (", + val->vtype(), + ", ", + val->dtype(), + ") to: (", + mutation->vtype(), + ", ", + mutation->dtype(), + ")"); + mutations[val] = mutation; +} + +void OptOutMutator::mutate(Bool* b) {} + +void OptOutMutator::mutate(Double* d) {} + +void OptOutMutator::mutate(Int* i) {} + +void OptOutMutator::mutate(ComplexDouble* c) {} -Statement* OptOutMutator::mutate(IterDomain* id) { - Val* start = mutateAsVal(id->start())->asVal(); - Val* extent = mutateAsVal(id->extent())->asVal(); - Val* stop_offset = mutateAsVal(id->stopOffset())->asVal(); +void OptOutMutator::mutate(NamedScalar* ns) {} + +void OptOutMutator::mutate(IterDomain* id) { + Val* start = maybeMutated(id->start()); + Val* extent = maybeMutated(id->extent()); + Val* stop_offset = maybeMutated(id->stopOffset()); if (start->sameAs(id->start()) && extent->sameAs(id->extent()) && stop_offset->sameAs(id->stopOffset())) { - return id; + return; } - Val* mutated_val = new IterDomain( + Val* mutated_val = IrBuilder::create( + id->container(), start, extent, stop_offset, id->getParallelType(), id->getIterType(), id->isRFactorProduct()); + if (id->hasPaddingToMultipleOfWarp()) { + mutated_val->as()->padToMultipleOfWarp( + id->getMaybeSizeAfterPadding()); + } registerMutation(id, mutated_val); - return mutated_val; } -Statement* OptOutMutator::mutate(TensorDomain* td) { - std::vector dom; +void OptOutMutator::mutate(TensorDomain* td) { bool mutated = false; - for (const auto i : c10::irange(td->nDims())) { - IterDomain* id = mutateAsVal(td->axis(i))->as(); - dom.push_back(id); - if (!id->sameAs(td->axis(i))) - mutated = true; - } - if (mutated) { - Val* mutated_val = new TensorDomain( - td->getRootDomain(), td->getRFactorDomain(), dom, td->contiguity()); - registerMutation(td, mutated_val); - return mutated_val; + auto updateIdVec = [&](const std::vector& ids) { + std::vector updated_ids; + for (auto id : ids) { + auto updated_id 
= maybeMutated(id)->as(); + updated_ids.push_back(updated_id); + if (!updated_id->sameAs(id)) { + mutated = true; + } + } + return updated_ids; + }; + + std::vector root_dom = updateIdVec(td->getRootDomain()); + std::vector rfactor_dom = td->hasRFactor() + ? updateIdVec(td->getMaybeRFactorDomain()) + : std::vector(); + std::vector domain = updateIdVec(td->domain()); + + if (!mutated) { + return; } - return td; -} -Statement* OptOutMutator::mutate(TensorView* tv) { - TensorDomain* td = mutateAsVal(tv->domain())->as(); + Val* mutated_val = IrBuilder::create( + td->container(), root_dom, rfactor_dom, domain, td->contiguity()); + registerMutation(td, mutated_val); +} +void OptOutMutator::mutate(TensorView* tv) { + TensorDomain* td = maybeMutated(tv->domain())->as(); if (!tv->domain()->sameAs(td)) { - TensorView* mutated_tv = new TensorView(td, tv->getDataType().value()); - registerMutation(tv, mutated_tv); - return mutated_tv; + tv->setDomain(td); } - return tv; + // Don't register tv mutations as we just want to update the TD } -Statement* OptOutMutator::mutate(Bool* b) { - return b; +void OptOutMutator::mutate(kir::Predicate*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } -Statement* OptOutMutator::mutate(Double* d) { - return d; +void OptOutMutator::mutate(kir::TensorIndex*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } -Statement* OptOutMutator::mutate(Int* i) { - return i; -} +// MUTATE FUNCTIONS FOR EXPRESSIONS. +void OptOutMutator::mutate(UnaryOp* uop) { + Val* out = maybeMutated(uop->out()); + Val* in = maybeMutated(uop->in()); -Statement* OptOutMutator::mutate(NamedScalar* ns) { - return ns; + if (out->sameAs(uop->out()) && in->sameAs(uop->in())) { + return; + } + auto container = uop->container(); + auto uop_type = uop->getUnaryOpType(); + container->removeExpr(uop); + IrBuilder::create(container, uop_type, out, in); } -// MUTATE FUNCTIONS FOR EXPRESSIONS. 
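Each of these rewritten mutate() overloads follows the same recipe: query maybeMutated() for every operand, return early when nothing changed, otherwise drop the stale expression from its IrContainer and recreate it through IrBuilder so its definition now refers to the mutated inputs. Below is a minimal sketch of how a pass might drive this machinery to swap one scalar input for another across a fusion; the class name and the single-substitution use case are illustrative assumptions, not something introduced by this patch, and it relies only on registerMutation(), mutate(Expr*), and StmtSort::getExprs(), all of which appear here.

// Hypothetical pass: rebuild every expression that consumes old_val so it
// consumes new_val instead.
class SubstituteVal : private OptOutMutator {
 public:
  static void run(Fusion* fusion, Val* old_val, Val* new_val) {
    SubstituteVal mutator;
    // Record the substitution; registerMutation() checks type compatibility.
    mutator.registerMutation(old_val, new_val);
    // Visit expressions in topological order, rebuilding any whose operands
    // were registered as mutated.
    for (auto expr : StmtSort::getExprs(fusion)) {
      mutator.mutate(expr);
    }
  }
};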
+void OptOutMutator::mutate(BinaryOp* bop) { + Val* out = maybeMutated(bop->out()); + Val* lhs = maybeMutated(bop->lhs()); + Val* rhs = maybeMutated(bop->rhs()); -Statement* OptOutMutator::mutate(Split* s) { - IterDomain* ot = mutateAsVal(s->outer())->as(); - IterDomain* inr = mutateAsVal(s->inner())->as(); - IterDomain* in = mutateAsVal(s->in())->as(); - Val* fact = mutateAsVal(s->factor())->as(); - - if (ot->sameAs(s->outer()) && inr->sameAs(s->inner()) && - in->sameAs(s->in()) && areEqualScalars(fact, s->factor())) { - return s; + if (out == bop->out() && lhs == bop->lhs() && rhs == bop->rhs()) { + return; } - FusionGuard::getCurFusion()->removeExpr(s); - return new Split(ot, inr, in, fact, s->innerSplit()); + + auto container = bop->container(); + auto bop_type = bop->getBinaryOpType(); + container->removeExpr(bop); + IrBuilder::create(container, bop_type, out, lhs, rhs); } -Statement* OptOutMutator::mutate(Merge* m) { - IterDomain* ot = mutateAsVal(m->out())->as(); - IterDomain* otr = mutateAsVal(m->outer())->as(); - IterDomain* in = mutateAsVal(m->inner())->as(); +void OptOutMutator::mutate(TernaryOp* top) { + Val* out = maybeMutated(top->out()); + Val* in1 = maybeMutated(top->in1()); + Val* in2 = maybeMutated(top->in2()); + Val* in3 = maybeMutated(top->in3()); - if (ot->sameAs(m->out()) && otr->sameAs(m->outer()) && in->sameAs(m->inner())) - return m; + if (out == top->out() && in1 == top->in1() && in2 == top->in2() && + in3 == top->in3()) { + return; + } - FusionGuard::getCurFusion()->removeExpr(m); - return new Merge(ot, otr, in); + auto container = top->container(); + auto top_type = top->getTernaryOpType(); + container->removeExpr(top); + IrBuilder::create(container, top_type, out, in1, in2, in3); } -Statement* OptOutMutator::mutate(UnaryOp* uop) { - Val* out = mutateAsVal(uop->out())->asVal(); - Val* in = mutateAsVal(uop->in())->asVal(); +void OptOutMutator::mutate(ReductionOp* rop) { + Val* out = maybeMutated(rop->out()); + Val* in = maybeMutated(rop->in()); + Val* init = rop->init(); + if (out->sameAs(rop->out()) && in->sameAs(rop->in()) && + init->sameAs(rop->init())) { + return; + } - if (out->sameAs(uop->out()) && in->sameAs(uop->in())) - return uop; - FusionGuard::getCurFusion()->removeExpr(uop); - return new UnaryOp(uop->getUnaryOpType(), out, in); + auto container = rop->container(); + auto rop_type = rop->getReductionOpType(); + container->removeExpr(rop); + IrBuilder::create( + container, rop_type, init, out, in, rop->isAllreduce()); } -Statement* OptOutMutator::mutate(BinaryOp* bop) { - Val* out = mutateAsVal(bop->out())->asVal(); - Val* lhs = mutateAsVal(bop->lhs())->asVal(); - Val* rhs = mutateAsVal(bop->rhs())->asVal(); - if (out == bop->out() && lhs == bop->lhs() && rhs == bop->rhs()) - return bop; - FusionGuard::getCurFusion()->removeExpr(bop); - return new BinaryOp(bop->getBinaryOpType(), out, lhs, rhs); -} +void OptOutMutator::mutate(GroupedReductionOp* rop) { + bool is_same = true; -Statement* OptOutMutator::mutate(TernaryOp* top) { - Val* out = mutateAsVal(top->out())->asVal(); - Val* in1 = mutateAsVal(top->in1())->asVal(); - Val* in2 = mutateAsVal(top->in2())->asVal(); - Val* in3 = mutateAsVal(top->in3())->asVal(); - if (out == top->out() && in1 == top->in1() && in2 == top->in2() && - in3 == top->in3()) - return top; - FusionGuard::getCurFusion()->removeExpr(top); - return new TernaryOp(top->getTernaryOpType(), out, in1, in2, in3); -} + std::vector outputs; + for (auto out : rop->outputs()) { + auto maybe_mutated = maybeMutated(out); + is_same = is_same && 
maybe_mutated->sameAs(out); + outputs.push_back(maybe_mutated); + } -Statement* OptOutMutator::mutate(ReductionOp* rop) { - Val* out = mutateAsVal(rop->out())->asVal(); - Val* in = mutateAsVal(rop->in())->asVal(); - Val* init = rop->init(); - if (out->sameAs(rop->out()) && in->sameAs(rop->in()) && - init->sameAs(rop->init())) - return rop; + std::vector inputs; + for (auto in : rop->inputs()) { + auto maybe_mutated = maybeMutated(in); + is_same = is_same && maybe_mutated->sameAs(in); + inputs.push_back(maybe_mutated); + } + + std::vector init_vals; + for (auto init : rop->initVals()) { + auto maybe_mutated = maybeMutated(init); + is_same = is_same && maybe_mutated->sameAs(init); + init_vals.push_back(maybe_mutated); + } + + if (is_same) { + return; + } - return new ReductionOp(rop->getReductionOpType(), init, out, in); + auto container = rop->container(); + const auto& rop_types = rop->getReductionOpTypes(); + container->removeExpr(rop); + IrBuilder::create( + container, rop_types, init_vals, outputs, inputs, rop->isAllreduce()); } namespace { @@ -159,20 +232,18 @@ inline bool compareOptional(Val* a, Val* b) { } // namespace -Statement* OptOutMutator::mutate(WelfordOp* wop) { - Val* out_avg = mutateAsVal(wop->outAvg())->asVal(); - Val* out_var = mutateAsVal(wop->outVar())->asVal(); - Val* out_N = mutateAsVal(wop->outN())->asVal(); +void OptOutMutator::mutate(WelfordOp* wop) { + Val* out_avg = maybeMutated(wop->outAvg()); + Val* out_var = maybeMutated(wop->outVar()); + Val* out_N = maybeMutated(wop->outN()); - Val* in_avg = mutateAsVal(wop->inAvg())->asVal(); - Val* in_var = wop->inVar() ? mutateAsVal(wop->inVar())->asVal() : nullptr; - Val* in_N = mutateAsVal(wop->inN())->asVal(); + Val* in_avg = maybeMutated(wop->inAvg()); + Val* in_var = wop->inVar() ? maybeMutated(wop->inVar()) : nullptr; + Val* in_N = maybeMutated(wop->inN()); - Val* init_avg = - wop->initAvg() ? mutateAsVal(wop->initAvg())->asVal() : nullptr; - Val* init_var = - wop->initVar() ? mutateAsVal(wop->initVar())->asVal() : nullptr; - Val* init_N = mutateAsVal(wop->initN())->asVal(); + Val* init_avg = wop->initAvg() ? maybeMutated(wop->initAvg()) : nullptr; + Val* init_var = wop->initVar() ? 
maybeMutated(wop->initVar()) : nullptr; + Val* init_N = maybeMutated(wop->initN()); const bool out_compare = out_avg->sameAs(wop->outAvg()) && out_var->sameAs(wop->outVar()) && out_N->sameAs(wop->outN()); @@ -182,56 +253,205 @@ Statement* OptOutMutator::mutate(WelfordOp* wop) { compareOptional(init_var, wop->initVar()) && init_N->sameAs(wop->initN()); if (out_compare && init_compare && in_compare) { - return wop; - } else { - return new WelfordOp( - out_avg, - out_var, - out_N, - init_avg, - init_var, - init_N, - in_avg, - in_var, - in_N); + return; + } + + auto container = wop->container(); + container->removeExpr(wop); + IrBuilder::create( + container, + out_avg, + out_var, + out_N, + init_avg, + init_var, + init_N, + in_avg, + in_var, + in_N, + wop->isAllreduce()); +} + +void OptOutMutator::mutate(MmaOp* mma) { + Val* out = maybeMutated(mma->out()); + Val* in_a = maybeMutated(mma->inA()); + Val* in_b = maybeMutated(mma->inB()); + Val* init = mma->init(); + + if (out->sameAs(mma->out()) && in_a->sameAs(mma->inA()) && + in_b->sameAs(mma->inB())) { + return; } + + auto container = mma->container(); + auto options = mma->options(); + container->removeExpr(mma); + C10_UNUSED auto new_mma = + IrBuilder::create(container, out, in_a, in_b, init, options); } -Statement* OptOutMutator::mutate(BroadcastOp* bop) { - return bop; +void OptOutMutator::mutate(BroadcastOp* bop) { + Val* out = maybeMutated(bop->out()); + Val* in = maybeMutated(bop->in()); + + if (out->sameAs(bop->out()) && in->sameAs(bop->in())) { + return; + } + + auto container = bop->container(); + auto flags = bop->getBroadcastDimFlags(); + container->removeExpr(bop); + IrBuilder::create(container, out, in, flags); } -Statement* OptOutMutator::mutate(TransposeOp* top) { - return top; +void OptOutMutator::mutate(TransposeOp* top) { + TensorView* out = maybeMutated(top->out())->as(); + TensorView* in = maybeMutated(top->in())->as(); + + if (out->sameAs(top->out()) && in->sameAs(top->in())) { + return; + } + + auto container = top->container(); + auto new2old = top->new2old(); + container->removeExpr(top); + IrBuilder::create(container, out, in, new2old); } -Statement* OptOutMutator::mutate(ShiftOp* sop) { - Val* out = mutateAsVal(sop->out())->asVal(); - Val* in = mutateAsVal(sop->in())->asVal(); +void OptOutMutator::mutate(ShiftOp* sop) { + Val* out = maybeMutated(sop->out())->asVal(); + Val* in = maybeMutated(sop->in())->asVal(); + + if (out->sameAs(sop->out()) && in->sameAs(sop->in())) { + return; + } - if (out->sameAs(sop->out()) && in->sameAs(sop->in())) - return sop; auto offsets = sop->offsets(); - FusionGuard::getCurFusion()->removeExpr(sop); - return new ShiftOp(out, in, offsets, sop->pad()); + auto pad_width = sop->padWidth(); + auto container = sop->container(); + container->removeExpr(sop); + IrBuilder::create(container, out, in, offsets, pad_width); } -Statement* OptOutMutator::mutate(GatherOp* op) { - Val* out = mutateAsVal(op->out())->asVal(); - Val* in = mutateAsVal(op->in())->asVal(); +void OptOutMutator::mutate(GatherOp* op) { + Val* out = maybeMutated(op->out())->asVal(); + Val* in = maybeMutated(op->in())->asVal(); + + if (out->sameAs(op->out()) && in->sameAs(op->in())) { + return; + } - if (out->sameAs(op->out()) && in->sameAs(op->in())) - return op; auto window_shape = op->windowShape(); auto pad_width = op->padWidth(); - FusionGuard::getCurFusion()->removeExpr(op); - return new GatherOp(out, in, window_shape, pad_width); + auto container = op->container(); + container->removeExpr(op); + 
IrBuilder::create(container, out, in, window_shape, pad_width); +} + +void OptOutMutator::mutate(ViewAsScalar* vop) { + TensorView* out = maybeMutated(vop->out())->as(); + TensorView* in = maybeMutated(vop->in())->as(); + + if (out->sameAs(vop->out()) && in->sameAs(vop->in())) { + return; + } + + auto container = vop->container(); + container->removeExpr(vop); + IrBuilder::create( + container, out, in, vop->vector_id(), vop->index()); +} + +void OptOutMutator::mutate(ViewOp* vop) { + TensorView* out = maybeMutated(vop->out())->as(); + TensorView* in = maybeMutated(vop->in())->as(); + + if (out->sameAs(vop->out()) && in->sameAs(vop->in())) { + return; + } + + auto container = vop->container(); + container->removeExpr(vop); + IrBuilder::create(container, out, in); +} + +void OptOutMutator::mutate(Split* s) { + IterDomain* ot = maybeMutated(s->outer())->as(); + IterDomain* inr = maybeMutated(s->inner())->as(); + IterDomain* in = maybeMutated(s->in())->as(); + Val* fact = maybeMutated(s->factor())->as(); + Val* start_offset = maybeMutated(s->startOffset()); + Val* stop_offset = maybeMutated(s->stopOffset()); + + if (ot->sameAs(s->outer()) && inr->sameAs(s->inner()) && + in->sameAs(s->in()) && areEqualScalars(fact, s->factor()) && + start_offset->sameAs(s->startOffset()) && + stop_offset->sameAs(s->stopOffset())) { + return; + } + + auto container = s->container(); + auto inner_split = s->innerSplit(); + container->removeExpr(s); + C10_UNUSED auto new_node = IrBuilder::create( + container, ot, inr, in, fact, inner_split, start_offset, stop_offset); } -Statement* OptOutMutator::mutate(ViewOp* vop) { - return vop; +void OptOutMutator::mutate(Merge* m) { + IterDomain* ot = maybeMutated(m->out())->as(); + IterDomain* otr = maybeMutated(m->outer())->as(); + IterDomain* in = maybeMutated(m->inner())->as(); + + if (ot->sameAs(m->out()) && otr->sameAs(m->outer()) && + in->sameAs(m->inner())) { + return; + } + + auto container = m->container(); + container->removeExpr(m); + C10_UNUSED auto new_node = IrBuilder::create(container, ot, otr, in); +} + +void OptOutMutator::mutate(kir::Allocate*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::BlockSync*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridSync*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::InitMagicZero*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::UpdateMagicZero*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::ForLoop*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::IfThenElse*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridReduction*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GroupedGridReduction*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridBroadcast*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridWelford*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::AllocateFusedReduction*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } +void OptOutMutator::removeExpr(IrContainer* container, Expr* expr) { + container->removeExpr(expr); +} } // namespace cuda } // namespace fuser } // namespace jit diff 
--git a/torch/csrc/jit/codegen/cuda/mutator.h b/torch/csrc/jit/codegen/cuda/mutator.h index f9ec40ca9f57..433de485cf19 100644 --- a/torch/csrc/jit/codegen/cuda/mutator.h +++ b/torch/csrc/jit/codegen/cuda/mutator.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp b/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp index 426bcadb2c5e..3a2ab5f5eb5b 100644 --- a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp +++ b/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp @@ -128,8 +128,8 @@ void NonDivisibleSplitInfo::removeRedundancy() { std::unordered_set split_to_validate_outer; for (auto it = splits_to_validate_.begin(); it != splits_to_validate_.end();) { - auto outer_concrete = - gpu_lower->caIndexMap().getConcreteMappedID((*it)->outer()); + auto outer_concrete = gpu_lower->caMap()->getConcreteMappedID( + (*it)->outer(), IdMappingMode::EXACT); auto new_domain = split_to_validate_outer.insert(outer_concrete).second; if (!new_domain) { it = splits_to_validate_.erase(it); @@ -150,8 +150,10 @@ void NonDivisibleSplitInfo::removeRedundancy() { splits_to_validate_.begin(), splits_to_validate_.end(), [&](Split* split_to_validate) { - return gpu_lower->caIndexMap().areMapped( - split_to_validate->outer(), split_to_predicate->outer()); + return gpu_lower->caMap()->areMapped( + split_to_validate->outer(), + split_to_predicate->outer(), + IdMappingMode::EXACT); })) { it = splits.erase(it); } else { diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.h b/torch/csrc/jit/codegen/cuda/non_divisible_split.h index f17bf2d62468..6706c9f072d3 100644 --- a/torch/csrc/jit/codegen/cuda/non_divisible_split.h +++ b/torch/csrc/jit/codegen/cuda/non_divisible_split.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/nvfuser.cmake b/torch/csrc/jit/codegen/cuda/nvfuser.cmake new file mode 100644 index 000000000000..5dc211eb4f6c --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/nvfuser.cmake @@ -0,0 +1,58 @@ +if(BUILD_SPLIT_CUDA) + set(TORCHLIB_FLAVOR torch_cuda_cu) # chose torch_cuda_cu here since JIT is in torch_cuda_cpp +elseif(USE_CUDA) + set(TORCHLIB_FLAVOR torch_cuda) +elseif(USE_ROCM) + set(TORCHLIB_FLAVOR torch_hip) +endif() + +# The list of NVFUSER runtime files +list(APPEND NVFUSER_RUNTIME_FILES + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/array.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/broadcast.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fp16_support.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/fused_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/bf16_support.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_broadcast.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_reduction.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/grid_sync.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/helpers.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/index_utils.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/random_numbers.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensor.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tuple.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/type_traits.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu + 
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/warp.cu + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensorcore.cu + ${TORCH_ROOT}/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh + ${TORCH_ROOT}/aten/src/ATen/cuda/detail/UnpackRaw.cuh +) + +file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources") + +# "stringify" NVFUSER runtime sources +# (generate C++ header files embedding the original input as a string literal) +set(NVFUSER_STRINGIFY_TOOL "${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tools/stringify_file.py") +foreach(src ${NVFUSER_RUNTIME_FILES}) + get_filename_component(filename ${src} NAME_WE) + set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h") + add_custom_command( + COMMENT "Stringify NVFUSER runtime source file" + OUTPUT ${dst} + DEPENDS ${src} + COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst} + ) + add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst}) + add_dependencies(${TORCHLIB_FLAVOR} nvfuser_rt_${filename}) + + # also generate the resource headers during the configuration step + # (so tools like clang-tidy can run w/o requiring a real build) + execute_process(COMMAND + ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}) +endforeach() + +target_include_directories(${TORCHLIB_FLAVOR} PRIVATE "${CMAKE_BINARY_DIR}/include") diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.cpp b/torch/csrc/jit/codegen/cuda/ops/alias.cpp new file mode 100644 index 000000000000..d5bbd4878828 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ops/alias.cpp @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +//! Transform TensorView according to keep, merge, and split transformations. +//! Trivial reduction and broadcast transformations are handled separately. +//! It is recommend to use the composite ops view function, which will call +//! the analyzeView function to generate the appropriate transformations. +//! +//! For example: +//! original sizes = [2, 10, 40] +//! new_size = [2, 10, 2, 20] +//! auto analysis = analyzeView(TV0, original_sizes, new_sizes) +//! auto TV1 = TV0->view(analysis.transforms); +//! +//! Transforms = [(Keep I0), (Keep I1), (Split I2 by 2)] +//! Before: TV0[I0, I1, I2] +//! After: TV0[I0, I1, 2, ceilDiv(I2, 2)] +//! 
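//! A minimal usage sketch of the composite path described above (hypothetical
//! fusion code; `tv0` is assumed to be a previously defined 3-D TensorView
//! with the example sizes):
//!
//!   // Composite op from this file: it handles the trivial-reduction and
//!   // broadcast cases itself, calls analyzeView(), and applies the
//!   // resulting transforms through applyViewTransforms() below.
//!   TensorView* tv1 = view(tv0, /*original_sizes=*/{2, 10, 40},
//!                               /*new_sizes=*/{2, 10, 2, 20});
//!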
+TensorView* applyViewTransforms( + TensorView* tv, + const std::vector>& transforms) { + TORCH_INTERNAL_ASSERT( + !tv->hasComputeAt(), + "Cannot modify rfactor domain after compute at has been set."); + + TORCH_INTERNAL_ASSERT(tv->nDims() > 0, "Tried to view a 0-dim TensorView"); + + TORCH_CHECK( + !tv->domain()->hasRFactor(), + "Cannot call view on the same TensorView twice."); + + TORCH_INTERNAL_ASSERT(!transforms.empty()); + + TensorView* consumer = IrBuilder::create( + tv->container(), + tv->domain()->view(transforms), + tv->getDataType().value()); + + IrBuilder::create(tv->container(), consumer, tv); + + return consumer; +} + +} // namespace + +TensorView* view(TensorView* x, DataType dtype) { + if (x->getDataType() == dtype) { + return x; + } + + auto input_type = x->getDataType().value(); + auto input_size = dataTypeSize(input_type); + auto newsize = dataTypeSize(dtype); + + if (input_size == newsize) { + return bitCastOp(dtype, x); + } + // TODO: support view(dtype) for dtypes where input_size != newsize + TORCH_INTERNAL_ASSERT(false, "Unsupported reinterpret casting view"); +} + +TensorView* view( + TensorView* x, + const std::vector& original_sizes, + const std::vector& new_sizes) { + TORCH_INTERNAL_ASSERT( + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size() == + original_sizes.size()); + + auto analyze_view = analyzeView(x, original_sizes, new_sizes); + + auto reduction = (!analyze_view.trivial_reduction_axes.empty()) + ? sum(x, + analyze_view.trivial_reduction_axes, + false /* keep_dim */, + x->getDataType().value()) + : x; + + auto view = (!analyze_view.transforms.empty()) + ? applyViewTransforms(reduction, analyze_view.transforms) + : reduction; + + return (analyze_view.has_broadcast) + ? broadcast(view, analyze_view.broadcast_axes) + : view; +} + +TensorView* flatten(TensorView* x, int64_t start_dim, int64_t end_dim) { + if (start_dim < 0) { + start_dim += x->nDims(); + } + if (end_dim < 0) { + end_dim += x->nDims(); + } + TORCH_CHECK( + start_dim >= 0 && start_dim < x->nDims(), + "Invalid start_dim ", + start_dim); + TORCH_CHECK( + end_dim >= 0 && end_dim < x->nDims(), "Invalid end_dim ", end_dim); + TORCH_CHECK(start_dim <= end_dim, "start_dim must be <= end_dim"); + + if (start_dim == end_dim) { + return x; + } + + auto out = IrBuilder::create( + x->container(), + x->domain()->flatten(start_dim, end_dim), + x->getDataType().value()); + + IrBuilder::create(out, x); + return out; +} + +TensorView* squeeze(TensorView* x, const std::vector& sizes) { + const auto ndims = static_cast(x->domain()->noReductions().size()); + + TORCH_INTERNAL_ASSERT( + ndims == sizes.size(), + "Invalid sizes for squeeze: ", + sizes, + ". Input tensor: ", + x->toString()); + + std::vector trivial_reduction_axes; + for (const auto idx : c10::irange(sizes.size())) { + if (sizes[idx] == 1) { + trivial_reduction_axes.push_back(idx); + } + } + return (trivial_reduction_axes.empty()) ? x + : sum(x, + trivial_reduction_axes, + false /* keep_dim */, + x->getDataType().value()); +} + +TensorView* squeeze(TensorView* x, const std::vector& sizes, int dim) { + const auto ndims = static_cast(x->domain()->noReductions().size()); + + TORCH_INTERNAL_ASSERT( + ndims == sizes.size(), + "Invalid sizes for squeeze: ", + sizes, + ". Input tensor: ", + x->toString()); + + if (dim < 0) { + dim = ndims + dim; + } + + TORCH_INTERNAL_ASSERT( + dim >= 0 && dim < ndims, + "Invalid position to squeeze: ", + dim, + ". 
Input tensor: ", + x->toString()); + + if (sizes[dim] == 1) { + return sum(x, {dim}, false /* keep_dim */, x->getDataType().value()); + } else { + return set(x); + } +} + +TensorView* unsqueeze(TensorView* x, int dim) { + const auto ndims = static_cast(x->domain()->noReductions().size()); + + if (dim < 0) { + dim = ndims + dim + 1; + } + + TORCH_INTERNAL_ASSERT( + dim >= 0 && dim <= ndims, + "Invalid position to unsqueeze: ", + dim, + ". Input tensor: ", + x->toString()); + + std::vector broadcast_axes(ndims + 1, false); + broadcast_axes[dim] = true; + return broadcast(x, broadcast_axes); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.h b/torch/csrc/jit/codegen/cuda/ops/alias.h new file mode 100644 index 000000000000..f33a5a745a89 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ops/alias.h @@ -0,0 +1,45 @@ +#pragma once + +#include + +#include +#include + +// +// The operations defined in this header is intended as user facing functions. +// The user will provide the necessary input TensorViews and the function will +// create the correct intermediate nodes and return the output TensorViews. +// + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +TORCH_CUDA_CU_API TensorView* view(TensorView* x, DataType dtype); + +TORCH_CUDA_CU_API TensorView* view( + TensorView* x, + const std::vector& original_sizes, + const std::vector& new_sizes); + +TORCH_CUDA_CU_API TensorView* flatten( + TensorView* x, + int64_t start_dim = 0, + int64_t end_dim = -1); + +TORCH_CUDA_CU_API TensorView* squeeze( + TensorView* x, + const std::vector& sizes); + +TORCH_CUDA_CU_API TensorView* squeeze( + TensorView* x, + const std::vector& sizes, + int dim); + +TORCH_CUDA_CU_API TensorView* unsqueeze(TensorView* x, int dim); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ops/all_ops.h b/torch/csrc/jit/codegen/cuda/ops/all_ops.h index 1ebd2bb87f1b..07d3eb944e89 100644 --- a/torch/csrc/jit/codegen/cuda/ops/all_ops.h +++ b/torch/csrc/jit/codegen/cuda/ops/all_ops.h @@ -1,4 +1,5 @@ #pragma once #include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.cpp b/torch/csrc/jit/codegen/cuda/ops/composite.cpp index 06bcf2d0494a..08c58d2becb5 100644 --- a/torch/csrc/jit/codegen/cuda/ops/composite.cpp +++ b/torch/csrc/jit/codegen/cuda/ops/composite.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,9 +9,10 @@ namespace fuser { namespace cuda { ForwardDropoutResult dropout(TensorView* x, Val* prob) { - auto p1m = sub(new Double(1.), prob); - auto zero_check = add(eq(p1m, new Double(0.)), p1m); - auto scale = div(new Double(1.), zero_check); + auto p1m = sub(IrBuilder::create(x->container(), 1.), prob); + auto zero_check = + add(eq(p1m, IrBuilder::create(x->container(), 0.)), p1m); + auto scale = div(IrBuilder::create(x->container(), 1.), zero_check); return dropout(x, p1m, scale); } @@ -47,18 +49,6 @@ TensorView* dropout_backward(TensorView* dy, TensorView* mask, Val* scale) { return dx; } -Val* softplus(Val* x, Val* beta, Val* threshold) { - TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); - TORCH_INTERNAL_ASSERT(beta != nullptr, "Beta is invalid."); - TORCH_INTERNAL_ASSERT( - threshold != nullptr, "Threshold is not a valid Double."); - - auto op_beta = mul(x, beta); - auto maybe_result = div(log1p(exp(op_beta)), beta); - auto y = where(gt(op_beta, threshold), x, maybe_result); - 
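// (softplus(x) = log1p(exp(beta * x)) / beta; the where() falls back to
//  returning x once beta * x exceeds the threshold, where exp() would
//  overflow and softplus is effectively linear anyway.)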
return y; -} - LstmResult lstm( TensorView* prev_cell, TensorView* in_x, @@ -83,7 +73,53 @@ LstmResult lstm( return {cell, hidden}; } -Val* fast_gelu(Val* x) { +TensorView* softplus(TensorView* x, Val* beta, Val* threshold) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT(beta != nullptr, "Beta is invalid."); + TORCH_INTERNAL_ASSERT( + threshold != nullptr, "Threshold is not a valid Double."); + + auto op_beta = mul(x, beta); + auto maybe_result = div(log1p(exp(op_beta)), beta); + auto y = where(gt(op_beta, threshold), x, maybe_result); + return y; +} + +TensorView* gelu(TensorView* x) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); + + auto kappa = IrBuilder::create(x->container(), M_SQRT1_2); + auto half = IrBuilder::create(x->container(), 0.5); + auto one = IrBuilder::create(x->container(), 1.); + + auto cdf = mul(half, add(one, erf(mul(x, kappa)))); + auto y = mul(x, cdf); + return y; +} + +TensorView* gelu_backward(TensorView* dy, TensorView* x) { + TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); + + constexpr double kAlpha = M_2_SQRTPI * M_SQRT1_2 * 0.5; + const double kHalf = 0.5; + + auto cdf_1 = mul(x, IrBuilder::create(x->container(), M_SQRT1_2)); + auto cdf_2 = erf(cdf_1); + auto cdf_3 = add(cdf_2, IrBuilder::create(x->container(), 1.)); + auto cdf_4 = mul(cdf_3, IrBuilder::create(x->container(), kHalf)); + + auto pdf_1 = mul(x, x); + auto pdf_2 = mul(pdf_1, IrBuilder::create(x->container(), -kHalf)); + auto pdf_3 = exp(pdf_2); + + auto out = addcmul( + cdf_4, x, pdf_3, IrBuilder::create(x->container(), kAlpha)); + auto dx = mul(out, dy); + return dx; +} + +TensorView* tanh_gelu(TensorView* x) { TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); constexpr double kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; @@ -91,17 +127,18 @@ Val* fast_gelu(Val* x) { auto x_cube = mul(x, mul(x, x)); - auto inner_1 = mul(new Double(kKappa), x_cube); + auto inner_1 = mul(IrBuilder::create(x->container(), kKappa), x_cube); auto inner_2 = add(x, inner_1); - auto inner_3 = mul(new Double(kBeta), inner_2); + auto inner_3 = mul(IrBuilder::create(x->container(), kBeta), inner_2); auto tanh_inner = tanh(inner_3); - auto out = mul(x, add(new Double(1.), tanh_inner)); - auto y = mul(new Double(0.5), out); + auto out = + mul(x, add(IrBuilder::create(x->container(), 1.), tanh_inner)); + auto y = mul(IrBuilder::create(x->container(), 0.5), out); return y; } -Val* fast_gelu_backward(Val* dy, Val* x) { +TensorView* tanh_gelu_backward(TensorView* dy, TensorView* x) { TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); @@ -111,107 +148,51 @@ Val* fast_gelu_backward(Val* dy, Val* x) { auto x_sq = mul(x, x); auto x_cube = mul(x, x_sq); - auto inner_1 = mul(new Double(kKappa), x_cube); + auto inner_1 = mul(IrBuilder::create(x->container(), kKappa), x_cube); auto inner_2 = add(x, inner_1); - auto inner_3 = mul(new Double(kBeta), inner_2); + auto inner_3 = mul(IrBuilder::create(x->container(), kBeta), inner_2); auto tanh_inner = tanh(inner_3); - auto left = mul(new Double(0.5), x); - auto right = add(new Double(1.), tanh_inner); + auto left = mul(IrBuilder::create(x->container(), 0.5), x); + auto right = add(IrBuilder::create(x->container(), 1.), tanh_inner); - auto left_derivative = mul(new Double(0.5), right); + auto left_derivative = + mul(IrBuilder::create(x->container(), 0.5), right); auto tanh_inner_sq = mul(tanh_inner, 
tanh_inner); - auto tanh_derivative = sub(new Double(1), tanh_inner_sq); + auto tanh_derivative = + sub(IrBuilder::create(x->container(), 1), tanh_inner_sq); - auto constant_mul_x_sq = mul(new Double(kBeta * 3 * kKappa), x_sq); - auto inner_derivative = add(new Double(kBeta), constant_mul_x_sq); + auto constant_mul_x_sq = + mul(IrBuilder::create(x->container(), kBeta * 3 * kKappa), x_sq); + auto inner_derivative = + add(IrBuilder::create(x->container(), kBeta), constant_mul_x_sq); auto right_derivative = mul(left, mul(tanh_derivative, inner_derivative)); auto dx = mul(dy, add(left_derivative, right_derivative)); return dx; } -Val* gelu_backward(Val* dy, Val* x) { +TensorView* tanh_backward(TensorView* dy, TensorView* tanh_x) { TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); - TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid"); - - constexpr double kAlpha = M_2_SQRTPI * M_SQRT1_2 * 0.5; - const double kHalf = 0.5; - - auto cdf_1 = mul(x, new Double(M_SQRT1_2)); - auto cdf_2 = erf(cdf_1); - auto cdf_3 = add(cdf_2, new Double(1.)); - auto cdf_4 = mul(cdf_3, new Double(kHalf)); + TORCH_INTERNAL_ASSERT(tanh_x != nullptr, "Input is invalid"); - auto pdf_1 = mul(x, x); - auto pdf_2 = mul(pdf_1, new Double(-kHalf)); - auto pdf_3 = exp(pdf_2); - - auto out = addcmul(cdf_4, x, pdf_3, new Double(kAlpha)); - auto dx = mul(out, dy); + auto one = IrBuilder::create(tanh_x->container(), 1.); + auto tanh_sq = mul(tanh_x, tanh_x); + auto sub_tanh_sq = sub(one, tanh_sq); + auto dx = mul(dy, sub_tanh_sq); return dx; } -namespace { - -//! Transform TensorView according to keep, merge, and split transformations. -//! Trivial reduction and broadcast transformations are handled separately. -//! It is recommend to use the composite ops view function, which will call -//! the analyzeView function to generate the appropriate transformations. -//! -//! For example: -//! original sizes = [2, 10, 40] -//! new_size = [2, 10, 2, 20] -//! auto analysis = analyzeView(TV0, original_sizes, new_sizes) -//! auto TV1 = TV0->view(analysis.transforms); -//! -//! Transforms = [(Keep I0), (Keep I1), (Split I2 by 2)] -//! Before: TV0[I0, I1, I2] -//! After: TV0[I0, I1, 2, ceilDiv(I2, 2)] -//! -TensorView* applyViewTransforms( - TensorView* tv, - const std::vector>& transforms) { - TORCH_INTERNAL_ASSERT( - !tv->hasComputeAt(), - "Cannot modify rfactor domain after compute at has been set."); - - TORCH_INTERNAL_ASSERT(tv->nDims() > 0, "Tried to view a 0-dim TensorView"); - +TensorView* view_as_real(TensorView* x) { + auto input_type = x->getDataType().value(); TORCH_CHECK( - !tv->domain()->hasRFactor(), - "Cannot call view on the same TensorView twice."); - - TORCH_INTERNAL_ASSERT(!transforms.empty()); - - TensorView* consumer = - new TensorView(tv->domain()->view(transforms), tv->getDataType().value()); - - new ViewOp(consumer, tv); - - return consumer; -} - -} // namespace - -TensorView* view( - TensorView* x, - const std::vector& original_sizes, - const std::vector& new_sizes) { - auto analyze_view = analyzeView(x, original_sizes, new_sizes); - - auto reduction = (!analyze_view.trivial_reduction_axes.empty()) - ? sum(x, analyze_view.trivial_reduction_axes) - : x; - - auto view = (!analyze_view.transforms.empty()) - ? applyViewTransforms(reduction, analyze_view.transforms) - : reduction; + isComplexType(input_type), + "Operand of view_as_real must have complex type"); - return (analyze_view.has_broadcast) - ? 
broadcast(view, analyze_view.broadcast_axes) - : view; + auto vec_type = getVectorType(getTypeFromComplexType(input_type), 2); + auto tv_vector = bitCastOp(vec_type, x); + return viewAsScalar(tv_vector); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.h b/torch/csrc/jit/codegen/cuda/ops/composite.h index 4470f0cc6f05..d73be9c469da 100644 --- a/torch/csrc/jit/codegen/cuda/ops/composite.h +++ b/torch/csrc/jit/codegen/cuda/ops/composite.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -31,8 +31,6 @@ TORCH_CUDA_CU_API TensorView* dropout_backward( TensorView* mask, Val* scale); -TORCH_CUDA_CU_API Val* softplus(Val* x, Val* beta, Val* threshold); - struct LstmResult { TensorView* cell = nullptr; TensorView* hidden = nullptr; @@ -45,14 +43,17 @@ TORCH_CUDA_CU_API LstmResult lstm( TensorView* cell_x, TensorView* out_x); -TORCH_CUDA_CU_API Val* fast_gelu(Val* x); -TORCH_CUDA_CU_API Val* fast_gelu_backward(Val* dy, Val* x); -TORCH_CUDA_CU_API Val* gelu_backward(Val* dy, Val* x); - -TORCH_CUDA_CU_API TensorView* view( +TORCH_CUDA_CU_API TensorView* softplus( TensorView* x, - const std::vector& x_sizes, - const std::vector& new_sizes); + Val* beta, + Val* threshold); +TORCH_CUDA_CU_API TensorView* gelu(TensorView* x); +TORCH_CUDA_CU_API TensorView* gelu_backward(TensorView* dy, TensorView* x); +TORCH_CUDA_CU_API TensorView* tanh_gelu(TensorView* x); +TORCH_CUDA_CU_API TensorView* tanh_gelu_backward(TensorView* dy, TensorView* x); +TORCH_CUDA_CU_API TensorView* tanh_backward(TensorView* dy, TensorView* tanh_x); + +TORCH_CUDA_CU_API TensorView* view_as_real(TensorView* x); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp index 19201687553b..00b013bdc524 100644 --- a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp +++ b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace torch { @@ -6,6 +7,64 @@ namespace jit { namespace fuser { namespace cuda { +int nonNegativeAxis(int axis, int ndims) { + return (axis >= 0) ? 
axis : (ndims + axis); +} + +Val* numFeatures(TensorView* x, const std::vector& dims, int ndims) { + Val* num_features = IrBuilder::create(x->container(), 1); + for (const auto dim : dims) { + const int axis = nonNegativeAxis(dim, ndims); + num_features = mul(num_features, x->domain()->domain()[axis]->extent()); + } + return num_features; +} + +TensorView* mean(TensorView* x, const std::vector& dims, bool keepdim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + + auto sum_x = sum(x, dims, keepdim); + auto y = div(sum_x, numFeatures(x, dims, kNumberOfDims)); + return y; +} + +TensorView* variance( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + + auto bcast_mean = mean(x, dims, true /* keepdim */); + auto x_mean_sub = sub(x, bcast_mean); + auto x_mean_sub_sq = mul(x_mean_sub, x_mean_sub); + auto sum_x_mean_sub_sq = sum(x_mean_sub_sq, dims, keepdim); + + auto num_features = numFeatures(x, dims, kNumberOfDims); + if (unbiased) { + num_features = + sub(num_features, IrBuilder::create(x->container(), 1.)); + } + auto y = div(sum_x_mean_sub_sq, num_features); + + return y; +} + +TensorView* standard_deviation( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + return sqrt(variance(x, dims, unbiased, keepdim)); +} + TensorView* softmax(TensorView* x, int dim) { TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); @@ -23,7 +82,7 @@ TensorView* softmax(TensorView* x, int dim) { auto exp_val = exp(x_max_sub); auto sum_exp = sum(exp_val, {kReductionAxis}); auto bcast_sum = broadcast(sum_exp, broadcast_mask); - auto y = div(exp_val, bcast_sum); + auto y = mul(exp_val, reciprocal(bcast_sum)); return y; } @@ -49,6 +108,45 @@ TensorView* softmax_backward(TensorView* dy, TensorView* y, int dim) { return dx; } +TensorView* log_softmax(TensorView* x, int dim) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + const int kReductionAxis = (dim < 0) ? dim + kNumberOfDims : dim; + TORCH_INTERNAL_ASSERT(kReductionAxis >= 0 && kReductionAxis < kNumberOfDims); + + std::vector broadcast_mask(kNumberOfDims, false); + broadcast_mask[kReductionAxis] = true; + + auto max_val = max(x, {kReductionAxis}); + auto bcast_max = broadcast(max_val, broadcast_mask); + auto x_max_sub = sub(x, bcast_max); + auto exp_val = exp(x_max_sub); + auto bcast_sum = sum(exp_val, {kReductionAxis}, true /* keepdim */); + auto log_sum_exp = log(bcast_sum); + auto y = sub(x_max_sub, log_sum_exp); + + return y; +} + +TensorView* log_softmax_backward(TensorView* dy, TensorView* y, int dim) { + TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT(y != nullptr, "Output is invalid."); + + const int kNumberOfDims = + TensorDomain::noReductions(y->getMaybeRFactorDomain()).size(); + const int kReductionAxis = (dim < 0) ? 
dim + kNumberOfDims : dim; + TORCH_INTERNAL_ASSERT(kReductionAxis >= 0 && kReductionAxis < kNumberOfDims); + + auto bcast_sum_grad = sum(dy, {kReductionAxis}, true /* keepdim */); + auto softmax = exp(y); + auto softmax_sum_mul = mul(softmax, bcast_sum_grad); + auto dx = sub(dy, softmax_sum_mul); + + return dx; +} + ForwardNormResult layer_norm( TensorView* x, const std::vector& norm_shape, @@ -58,18 +156,9 @@ ForwardNormResult layer_norm( return layer_norm(x, norm_shape.size(), weight, bias, eps); } -ForwardNormResult layer_norm( - TensorView* x, - const size_t kNormShapeNumDims, - TensorView* weight, - TensorView* bias, - Val* eps) { - TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); - TORCH_INTERNAL_ASSERT( - eps != nullptr && eps->getDataType().has_value() && - eps->getDataType().value() == DataType::Double, - "Epsilon (eps) is not a valid Double."); - +auto norm_properties_from_num_dims( + const TensorView* x, + const size_t kNormShapeNumDims) { // (B, C, H, W, D) tensor // norm_shape = [H, W, D] // M = outer = product of remaining dimensions = B * C @@ -81,28 +170,57 @@ ForwardNormResult layer_norm( std::vector outer_reduction_axes(kOuterNumDims); std::vector outer_broadcast_mask(kNumberOfDims, false); + std::vector inner_reduction_axes(kNormShapeNumDims); + std::vector inner_broadcast_mask(kNumberOfDims, false); + for (const auto idx : c10::irange(kOuterNumDims)) { outer_reduction_axes[idx] = idx; outer_broadcast_mask[idx] = true; } - std::vector inner_reduction_axes(kNormShapeNumDims); - std::vector inner_broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(x->container(), 1); for (const auto idx : c10::irange(kNormShapeNumDims)) { const size_t axis = kNumberOfDims - 1 - idx; inner_reduction_axes[idx] = axis; inner_broadcast_mask[axis] = true; num_features = mul(num_features, x->domain()->domain()[axis]->extent()); } + struct result { + std::vector outer_reduction_axes; + std::vector outer_broadcast_mask; + std::vector inner_reduction_axes; + std::vector inner_broadcast_mask; + Val* num_features = nullptr; + } r; + r.outer_reduction_axes = outer_reduction_axes; + r.outer_broadcast_mask = outer_broadcast_mask; + r.inner_reduction_axes = inner_reduction_axes; + r.inner_broadcast_mask = inner_broadcast_mask; + r.num_features = num_features; + return r; +} + +ForwardNormResult layer_norm( + TensorView* x, + const size_t kNormShapeNumDims, + TensorView* weight, + TensorView* bias, + Val* eps) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT( + eps != nullptr && eps->getDataType().has_value() && + eps->getDataType().value() == DataType::Double, + "Epsilon (eps) is not a valid Double."); + + auto r = norm_properties_from_num_dims(x, kNormShapeNumDims); // Main algorithm - auto welford_out = Welford(x, inner_reduction_axes); - auto mean_bcast = broadcast(welford_out.avg, inner_broadcast_mask); + auto welford_out = Welford(x, r.inner_reduction_axes); + auto mean_bcast = broadcast(welford_out.avg, r.inner_broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var_sum_bcast = broadcast(welford_out.var_sum, inner_broadcast_mask); - auto var = div(var_sum_bcast, num_features); + auto var_sum_bcast = broadcast(welford_out.var_sum, r.inner_broadcast_mask); + auto var = mul(var_sum_bcast, reciprocal(r.num_features)); auto var_eps = add(var, eps); auto invstd = rsqrt(var_eps); @@ -110,19 +228,58 @@ ForwardNormResult layer_norm( // Optional: norm * weight if (weight != nullptr) { - auto 
weight_bcast = broadcast(weight, outer_broadcast_mask); + auto weight_bcast = broadcast(weight, r.outer_broadcast_mask); y = mul(y, weight_bcast); } // Optional: norm * weight + bias if (bias != nullptr) { - auto bias_bcast = broadcast(bias, outer_broadcast_mask); + auto bias_bcast = broadcast(bias, r.outer_broadcast_mask); y = add(y, bias_bcast); } return {y, mean_bcast, invstd}; } +ForwardRMSNormResult rms_norm( + TensorView* x, + const std::vector& norm_shape, + TensorView* weight, + Val* eps) { + return rms_norm(x, norm_shape.size(), weight, eps); +} + +ForwardRMSNormResult rms_norm( + TensorView* x, + const size_t kNormShapeNumDims, + TensorView* weight, + Val* eps) { + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT( + eps != nullptr && eps->getDataType().has_value() && + eps->getDataType().value() == DataType::Double, + "Epsilon (eps) is not a valid Double."); + + auto r = norm_properties_from_num_dims(x, kNormShapeNumDims); + + // Main algorithm + auto var_sum = sum(mul(x, x), r.inner_reduction_axes); + auto var_sum_bcast = broadcast(var_sum, r.inner_broadcast_mask); + auto var = mul(var_sum_bcast, reciprocal(r.num_features)); + auto var_eps = add(var, eps); + auto invstd = rsqrt(var_eps); + + auto y = mul(x, invstd); + + // Optional: norm * weight + if (weight != nullptr) { + auto weight_bcast = broadcast(weight, r.outer_broadcast_mask); + y = mul(y, weight_bcast); + } + + return {y, invstd}; +} + BackwardNormResult layer_norm_backward( TensorView* dy, TensorView* x, @@ -137,55 +294,30 @@ BackwardNormResult layer_norm_backward( TORCH_INTERNAL_ASSERT(mean != nullptr, "Mean is invalid."); TORCH_INTERNAL_ASSERT(invstd != nullptr, "Inv std is invalid."); - // (B, C, H, W, D) tensor - // norm_shape = [H, W, D] - // M = outer = product of remaining dimensions = B * C - // N = reduction = product of norm_shape = H * W * D - // weight = bias = norm_shape tensor - const size_t kNumberOfDims = - TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); - const size_t kNormShapeNumDims = norm_shape.size(); - const size_t kOuterNumDims = kNumberOfDims - kNormShapeNumDims; - - std::vector outer_reduction_axes(kOuterNumDims); - std::vector outer_broadcast_mask(kNumberOfDims, false); - for (const auto idx : c10::irange(kOuterNumDims)) { - outer_reduction_axes[idx] = idx; - outer_broadcast_mask[idx] = true; - } - - std::vector inner_reduction_axes(kNormShapeNumDims); - std::vector inner_broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); - for (const auto idx : c10::irange(kNormShapeNumDims)) { - const size_t axis = kNumberOfDims - 1 - idx; - inner_reduction_axes[idx] = axis; - inner_broadcast_mask[axis] = true; - num_features = mul(num_features, x->domain()->domain()[axis]->extent()); - } + auto r = norm_properties_from_num_dims(x, norm_shape.size()); auto x_hat = mul(sub(x, mean), invstd); TensorView* grad_x_hat = nullptr; if (weight != nullptr) { - auto* bcast_weight = broadcast(weight, outer_broadcast_mask); + auto* bcast_weight = broadcast(weight, r.outer_broadcast_mask); grad_x_hat = mul(dy, bcast_weight); } else { grad_x_hat = dy; } - auto a = mul(num_features, grad_x_hat); + auto a = mul(r.num_features, grad_x_hat); - auto b = sum(grad_x_hat, inner_reduction_axes); - auto bcast_b = broadcast(b, inner_broadcast_mask); + auto b = sum(grad_x_hat, r.inner_reduction_axes); + auto bcast_b = broadcast(b, r.inner_broadcast_mask); auto c1 = mul(grad_x_hat, x_hat); - auto c2 = sum(c1, inner_reduction_axes); - auto bcast_c2 = 
broadcast(c2, inner_broadcast_mask); + auto c2 = sum(c1, r.inner_reduction_axes); + auto bcast_c2 = broadcast(c2, r.inner_broadcast_mask); auto c3 = mul(x_hat, bcast_c2); auto inner = sub(sub(a, bcast_b), c3); - auto reciprocal_size = reciprocal(num_features); + auto reciprocal_size = reciprocal(r.num_features); TensorView* dx = nullptr; if (output_mask[0]) { @@ -194,16 +326,65 @@ BackwardNormResult layer_norm_backward( TensorView* dw = nullptr; if (output_mask[1] && weight != nullptr) { - dw = sum(mul(dy, x_hat), outer_reduction_axes); + dw = sum(mul(dy, x_hat), r.outer_reduction_axes); } TensorView* db = nullptr; if (output_mask[2] && bias != nullptr) { - db = sum(dy, outer_reduction_axes); + db = sum(dy, r.outer_reduction_axes); } return {dx, dw, db}; } +BackwardRMSNormResult rms_norm_backward( + TensorView* dy, + TensorView* x, + const std::vector& norm_shape, + TensorView* invstd, + TensorView* weight, + const std::vector& output_mask) { + TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT(invstd != nullptr, "Inv std is invalid."); + + auto r = norm_properties_from_num_dims(x, norm_shape.size()); + + auto x_hat = mul(x, invstd); + + TensorView* grad_x_hat = nullptr; + if (weight != nullptr) { + auto* bcast_weight = broadcast(weight, r.outer_broadcast_mask); + grad_x_hat = mul(dy, bcast_weight); + } else { + grad_x_hat = dy; + } + + auto a = mul(r.num_features, grad_x_hat); + + auto b = sum(grad_x_hat, r.inner_reduction_axes); + auto bcast_b = broadcast(b, r.inner_broadcast_mask); + + auto c1 = mul(grad_x_hat, x_hat); + auto c2 = sum(c1, r.inner_reduction_axes); + auto bcast_c2 = broadcast(c2, r.inner_broadcast_mask); + auto c3 = mul(x_hat, bcast_c2); + + auto inner = sub(sub(a, bcast_b), c3); + auto reciprocal_size = reciprocal(r.num_features); + + TensorView* dx = nullptr; + if (output_mask[0]) { + dx = mul(mul(reciprocal_size, invstd), inner); + } + + TensorView* dw = nullptr; + if (output_mask[1] && weight != nullptr) { + dw = sum(mul(dy, x_hat), r.outer_reduction_axes); + } + + return {dx, dw}; +} + ForwardNormResult batch_norm( TensorView* x, TensorView* weight, @@ -243,7 +424,7 @@ ForwardNormResult batch_norm( std::vector reduction_axes; std::vector broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(x->container(), 1); for (const auto axis : c10::irange(kNumberOfDims)) { if (axis != c_axis) { @@ -267,22 +448,24 @@ ForwardNormResult batch_norm( kTraining, "When running stats are provided, batch stats should only be computed during training"); - auto rev_momentum = sub(new Double(1.0), momentum); + auto rev_momentum = + sub(IrBuilder::create(x->container(), 1.0), momentum); auto current_mean_hat = mul(welford_out.avg, momentum); auto mean_hat = mul(running_mean, rev_momentum); auto new_mean_hat = add(mean_hat, current_mean_hat); - auto num_feature_decrement = sub(num_features, new Int(1)); - auto unbiased_var = div(welford_out.var_sum, num_feature_decrement); + auto num_feature_decrement = sub(num_features, x->container()->oneVal()); + auto unbiased_var = + mul(welford_out.var_sum, reciprocal(num_feature_decrement)); auto current_var_hat = mul(unbiased_var, momentum); auto var_hat = mul(running_var, rev_momentum); auto new_var_hat = add(var_hat, current_var_hat); - // when inputs have been casted by parser. 
We want to alias the output to - // the pre-casted input, so we can still update running stats + // when inputs have been cast by parser. We want to alias the output to + // the pre-cast input, so we can still update running stats auto cast_to_input_dtype = [fusion]( - Val* casted_input, Val* aliased_output) { - auto unary_op = casted_input->definition(); + Val* cast_input, Val* aliased_output) { + auto unary_op = cast_input->definition(); TORCH_INTERNAL_ASSERT( unary_op->isA() && unary_op->as()->getUnaryOpType() == UnaryOpType::Cast, @@ -295,21 +478,18 @@ ForwardNormResult batch_norm( TORCH_INTERNAL_ASSERT( rm_dtype.has_value(), "Input running stats must have dtype defined"); - auto casted_output = castOp(*rm_dtype, aliased_output); + auto cast_output = castOp(*rm_dtype, aliased_output); - fusion->addOutput(casted_output); - fusion->aliasOutputToInput(casted_output, input_to_cast); + fusion->aliasOutputToInput(cast_output, input_to_cast); }; - if (fusion->hasInput(running_mean)) { - fusion->addOutput(new_mean_hat); + if (running_mean->isFusionInput()) { fusion->aliasOutputToInput(new_mean_hat, running_mean); } else { cast_to_input_dtype(running_mean, new_mean_hat); } - if (fusion->hasInput(running_var)) { - fusion->addOutput(new_var_hat); + if (running_var->isFusionInput()) { fusion->aliasOutputToInput(new_var_hat, running_var); } else { cast_to_input_dtype(running_var, new_var_hat); @@ -320,7 +500,7 @@ ForwardNormResult batch_norm( auto mean_bcast = broadcast(mean, broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var = div(welford_out.var_sum, num_features); + auto var = mul(welford_out.var_sum, reciprocal(num_features)); auto var_eps = add(var, eps); invstd = rsqrt(var_eps); auto invstd_bcast = broadcast(invstd, broadcast_mask); @@ -414,19 +594,6 @@ BackwardNormResult batch_norm_backward( mean = broadcast(mean, broadcast_mask); - TensorView* weight_val = nullptr; - if (weight == nullptr) { - weight_val = TensorViewBuilder() - .ndims(kNumberOfDims) - .dtype(input->getDataType().value()) - .shape(std::vector(kNumberOfDims, 1)) - .build(); - new UnaryOp( - UnaryOpType::Set, weight_val->as(), (new Double(1.0))->as()); - } else { - weight_val = broadcast(weight, broadcast_mask); - } - auto norm = reciprocal(num_features); auto grad_output_sum = sum(grad_output, reduction_axes); @@ -435,7 +602,16 @@ BackwardNormResult batch_norm_backward( auto grad_mean = broadcast(mul(grad_output_sum, norm), broadcast_mask); auto proj_scale = broadcast(mul(mul(dot_p, norm), mul(invstd, invstd)), broadcast_mask); - auto grad_scale = mul(broadcast(invstd, broadcast_mask), weight_val); + TensorView* grad_scale = nullptr; + + if (weight == nullptr) { + grad_scale = + mul(broadcast(invstd, broadcast_mask), + IrBuilder::create(input->container(), 1)); + } else { + grad_scale = mul( + broadcast(invstd, broadcast_mask), broadcast(weight, broadcast_mask)); + } TensorView* grad_input = nullptr; if (kTraining) { @@ -466,7 +642,8 @@ ForwardNormResult instance_norm( TensorView* running_var, const bool kUseInputStats, Val* momentum, - Val* eps) { + Val* eps, + bool channels_last) { auto fusion = FusionGuard::getCurFusion(); TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid."); @@ -490,13 +667,13 @@ ForwardNormResult instance_norm( // N = reduction = H * W * D // weight = bias = C tensor const size_t kBatchDim = 0; - const size_t kChannelsDim = 1; const size_t kNumberOfDims = TensorDomain::noReductions(x->getMaybeRFactorDomain()).size(); + const size_t kChannelsDim = channels_last ? 
kNumberOfDims - 1 : 1; std::vector x_reduction_axes; std::vector x_broadcast_mask(kNumberOfDims, false); - Val* N = new Double(1); + Val* N = IrBuilder::create(x->container(), 1); for (const auto axis : c10::irange(kNumberOfDims)) { if (axis != kBatchDim && axis != kChannelsDim) { x_reduction_axes.push_back(axis); @@ -504,7 +681,7 @@ ForwardNormResult instance_norm( N = mul(N, x->domain()->domain()[axis]->extent()); } } - Val* B = new Double(1); + Val* B = IrBuilder::create(x->container(), 1); B = mul(B, x->domain()->domain()[kBatchDim]->extent()); std::vector channels_only_broadcast_mask(kNumberOfDims, false); @@ -523,29 +700,51 @@ ForwardNormResult instance_norm( // updating running mean and running var if (running_mean != nullptr && running_var != nullptr) { - auto rev_momentum = sub(new Double(1.0), momentum); + auto _running_mean = running_mean; + auto _running_var = running_var; + if (_running_mean->getDataType().value() == DataType::Half || + _running_mean->getDataType().value() == DataType::BFloat16) { + _running_mean = castOp(DataType::Float, _running_mean); + } + if (_running_var->getDataType().value() == DataType::Half || + _running_var->getDataType().value() == DataType::BFloat16) { + _running_var = castOp(DataType::Float, running_var); + } + auto rev_momentum = + sub(IrBuilder::create(x->container(), 1.0), momentum); auto current_mean_hat = mul(welford_out.avg, momentum); - auto mean_hat = mul(running_mean, rev_momentum); + auto mean_hat = mul(_running_mean, rev_momentum); auto new_mean_hat = add(mean_hat, current_mean_hat); // NS: static_cast to workaround VC++ error, see // https://godbolt.org/z/6Prd77xYs auto new_mean_sum = sum(new_mean_hat, {static_cast(kBatchDim)}); - auto new_mean_channels_only = div(new_mean_sum, B); - fusion->addOutput(new_mean_channels_only); + auto new_mean_channels_only = mul(new_mean_sum, reciprocal(B)); + if (running_mean->getDataType().value() == DataType::Half || + running_mean->getDataType().value() == DataType::BFloat16) { + new_mean_channels_only = + castOp(running_mean->getDataType().value(), new_mean_channels_only); + } + // fusion->addOutput(new_mean_channels_only); fusion->aliasOutputToInput(new_mean_channels_only, running_mean); - auto num_feature_decrement = sub(N, new Int(1)); - auto unbiased_var = div(welford_out.var_sum, num_feature_decrement); + auto num_feature_decrement = sub(N, x->container()->oneVal()); + auto unbiased_var = + mul(welford_out.var_sum, reciprocal(num_feature_decrement)); auto current_var_hat = mul(unbiased_var, momentum); - auto var_hat = mul(running_var, rev_momentum); + auto var_hat = mul(_running_var, rev_momentum); auto new_var_hat = add(var_hat, current_var_hat); // NS: static_cast to workaround VC++ error, see // https://godbolt.org/z/6Prd77xYs auto new_var_sum = sum(new_var_hat, {static_cast(kBatchDim)}); - auto new_var_channels_only = div(new_var_sum, B); - fusion->addOutput(new_var_channels_only); + auto new_var_channels_only = mul(new_var_sum, reciprocal(B)); + if (running_var->getDataType().value() == DataType::Half || + running_var->getDataType().value() == DataType::BFloat16) { + new_var_channels_only = + castOp(running_var->getDataType().value(), new_var_channels_only); + } + // fusion->addOutput(new_var_channels_only); fusion->aliasOutputToInput(new_var_channels_only, running_var); } @@ -553,7 +752,7 @@ ForwardNormResult instance_norm( auto mean_bcast = broadcast(mean, x_broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var = div(welford_out.var_sum, N); + auto var = 
mul(welford_out.var_sum, reciprocal(N)); auto var_eps = add(var, eps); invstd = rsqrt(var_eps); auto invstd_bcast = broadcast(invstd, x_broadcast_mask); @@ -589,6 +788,121 @@ ForwardNormResult instance_norm( return {y, mean, invstd}; } +BackwardNormResult instance_norm_backward( + TensorView* input, + TensorView* grad_output, + TensorView* weight, + TensorView* running_mean, + TensorView* running_var, + TensorView* save_mean, + TensorView* save_invstd, + const bool kTraining, + Val* eps, + const std::vector& output_mask, + bool channels_last) { + TORCH_INTERNAL_ASSERT(input != nullptr, "Input is invalid."); + TORCH_INTERNAL_ASSERT(grad_output != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT( + eps != nullptr && eps->getDataType().has_value() && + eps->getDataType().value() == DataType::Double, + "Epsilon (eps) is not a valid Double."); + + // (B, C, H, W, D) tensor + // M = outer = channels + // N = reduction = B * H * W * D + // weight = bias = (C) tensor + const size_t kNumberOfDims = + TensorDomain::noReductions(input->getMaybeRFactorDomain()).size(); + // channels last format means C dimension is at axis kNumberOfDims-1 at x / + // grad_out + const size_t b_axis = 0; // for clarity + const size_t c_axis = channels_last ? kNumberOfDims - 1 : 1; + + std::vector reduction_axes; + std::vector broadcast_mask(kNumberOfDims, false); + // weight has its own broadcast mask as it is broadcast for the batch unlike + // mean/var + std::vector weight_broadcast_mask(kNumberOfDims, false); + Val* num_features = nullptr; + for (const auto axis : c10::irange(kNumberOfDims)) { + if (axis != c_axis) { + weight_broadcast_mask[axis] = true; + if (axis != b_axis) { + reduction_axes.push_back(axis); + broadcast_mask[axis] = true; + if (num_features == nullptr) { + num_features = castOp( + DataType::Double, input->domain()->domain()[axis]->extent()); + } else { + num_features = + mul(num_features, input->domain()->domain()[axis]->extent()); + } + } + } + } + + auto mean = save_mean; + auto invstd = save_invstd; + if (kTraining) { + TORCH_INTERNAL_ASSERT( + save_mean != nullptr && save_invstd != nullptr, + "When training=True, save_mean and save_invstd are required."); + } else { + mean = running_mean; + invstd = rsqrt(add(running_var, eps)); + } + mean = broadcast(mean, broadcast_mask); + + auto norm = reciprocal(num_features); + + auto grad_output_sum = sum(grad_output, reduction_axes); + auto dot_p = sum(mul(grad_output, sub(input, mean)), reduction_axes); + + auto grad_mean = broadcast(mul(grad_output_sum, norm), broadcast_mask); + + auto proj_scale = + broadcast(mul(mul(dot_p, norm), mul(invstd, invstd)), broadcast_mask); + + TensorView* grad_scale = nullptr; + + if (weight == nullptr) { + grad_scale = + mul(broadcast(invstd, broadcast_mask), + IrBuilder::create(input->container(), 1)); + } else { + grad_scale = + mul(broadcast(invstd, broadcast_mask), + broadcast(weight, weight_broadcast_mask)); + } + + TensorView* grad_input = nullptr; + if (kTraining) { + auto proj = mul(sub(input, mean), proj_scale); + grad_input = mul(sub(sub(grad_output, proj), grad_mean), grad_scale); + } else { + grad_input = mul(grad_output, grad_scale); + } + + TensorView* grad_weight = nullptr; + TensorView* grad_weight_reduced = nullptr; + if (output_mask[1]) { + grad_weight = mul(dot_p, invstd); + // TODO: grad weight needs to be reduced across batch-dim but is this the + // most efficient place or can reduction happen earlier? 
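// Note: instance norm keeps per-sample statistics, so reduction_axes above
// exclude both the batch and channel axes, and dot_p / grad_output_sum still
// carry a batch axis alongside the channel axis. weight and bias are
// per-channel parameters shared across the batch, so their gradients are
// accumulated over the batch dimension, which is what the sum over axis {0}
// below does.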
+ grad_weight_reduced = sum(grad_weight, {0}); + } + + TensorView* grad_bias = nullptr; + TensorView* grad_bias_reduced = nullptr; + if (output_mask[2]) { + grad_bias = grad_output_sum; + // TODO: same as above for grad weight + grad_bias_reduced = sum(grad_bias, {0}); + } + + return {grad_input, grad_weight_reduced, grad_bias_reduced}; +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.h b/torch/csrc/jit/codegen/cuda/ops/normalization.h index dae58462b929..74d8cc4ab650 100644 --- a/torch/csrc/jit/codegen/cuda/ops/normalization.h +++ b/torch/csrc/jit/codegen/cuda/ops/normalization.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -28,6 +28,33 @@ struct BackwardNormResult { TensorView* grad_bias = nullptr; }; +struct ForwardRMSNormResult { + TensorView* output = nullptr; + TensorView* invstd = nullptr; +}; + +struct BackwardRMSNormResult { + TensorView* grad_input = nullptr; + TensorView* grad_weight = nullptr; +}; + +TORCH_CUDA_CU_API TensorView* mean( + TensorView* x, + const std::vector& dims, + bool keepdim); + +TORCH_CUDA_CU_API TensorView* variance( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim); + +TORCH_CUDA_CU_API TensorView* standard_deviation( + TensorView* x, + const std::vector& dims, + bool unbiased, + bool keepdim); + TORCH_CUDA_CU_API TensorView* softmax(TensorView* x, int dim); TORCH_CUDA_CU_API TensorView* softmax_backward( @@ -35,6 +62,13 @@ TORCH_CUDA_CU_API TensorView* softmax_backward( TensorView* y, const int dim); +TORCH_CUDA_CU_API TensorView* log_softmax(TensorView* x, int dim); + +TORCH_CUDA_CU_API TensorView* log_softmax_backward( + TensorView* dy, + TensorView* y, + const int dim); + TORCH_CUDA_CU_API ForwardNormResult layer_norm( TensorView* x, const std::vector& norm_shape, @@ -49,6 +83,18 @@ TORCH_CUDA_CU_API ForwardNormResult layer_norm( TensorView* bias, Val* eps); +TORCH_CUDA_CU_API ForwardRMSNormResult rms_norm( + TensorView* x, + const std::vector& norm_shape, + TensorView* weight, + Val* eps); + +TORCH_CUDA_CU_API ForwardRMSNormResult rms_norm( + TensorView* x, + const size_t kNormShapeNumDims, + TensorView* weight, + Val* eps); + TORCH_CUDA_CU_API BackwardNormResult layer_norm_backward( TensorView* dy, TensorView* x, @@ -59,6 +105,14 @@ TORCH_CUDA_CU_API BackwardNormResult layer_norm_backward( TensorView* bias, const std::vector& output_mask); +TORCH_CUDA_CU_API BackwardRMSNormResult rms_norm_backward( + TensorView* dy, + TensorView* x, + const std::vector& norm_shape, + TensorView* rstd, + TensorView* weight, + const std::vector& output_mask); + TORCH_CUDA_CU_API ForwardNormResult batch_norm( TensorView* x, TensorView* weight, @@ -89,9 +143,23 @@ TORCH_CUDA_CU_API ForwardNormResult instance_norm( TensorView* bias, TensorView* running_mean, TensorView* running_var, - const bool kUseInputStats, + const bool kUseInputStats, // kTraining? 
Val* momentum, - Val* eps); + Val* eps, + bool channels_last = false); + +TORCH_CUDA_CU_API BackwardNormResult instance_norm_backward( + TensorView* x, + TensorView* dy, + TensorView* weight, + TensorView* running_mean, + TensorView* running_var, + TensorView* save_mean, + TensorView* save_invstd, + const bool kTraining, + Val* eps, + const std::vector& output_mask, + bool channels_last = false); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp index 3dcb58335a44..fd468a8b792e 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp +++ b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp @@ -5,8 +5,6 @@ #include #include #include -#include -#include #include #include @@ -45,28 +43,22 @@ void ParallelDimensionMap::build(Fusion* fusion) { } void ParallelDimensionMap::registerConstantExtent(IterDomain* id) { - ExpressionEvaluator ee(id->fusion()); - auto extent_int = ee.evaluate(id->extent()); - if (!extent_int.has_value()) { + if (!id->extent()->isConstScalar()) { // Nothing to do if not constant return; } - auto const_extent = extent_int.value(); + ExpressionEvaluator ee(id->fusion()); + auto extent_int = ee.evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + extent_int.has_value(), + "Extent of ", + id->toString(), + " should have been constant, but could not be evaluated at compile time."); - // Ignore if this is derived from a size-1 domain as it is likely a - // size-1 broadcast domain and that does not represent the actual - // dimension even if it's constant. Being size-1 may not always mean - // it's a broadcast domain, but it'd be safe to assume it is mostly - // the case. If it is not a broadcast, ignoring this domain does not - // impact the correctness. 
- auto extent_inputs = InputsOf::output(id->fusion(), id->extent()); - if (std::any_of(extent_inputs.begin(), extent_inputs.end(), [](Val* input) { - return input->isOneInt(); - })) { - return; - } + auto const_extent = extent_int.value(); + // Uses index map auto concrete_id = getCAMappedConcreteDomain(id); auto existing_it = constant_extent_map_.find(id); @@ -101,25 +93,21 @@ void ParallelDimensionMap::populateDimensionMapWithSingleCASet( const std::unordered_set& dom_set) { TORCH_INTERNAL_ASSERT(dom_set.size() == 1); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - // pt is used by only one concrete domain auto id = *dom_set.begin(); auto it = constant_extent_map_.find(id); if (it != constant_extent_map_.end()) { - if (it->second.size() == 1) { - dim_map_.insert({pt, ir_builder.create(*(it->second.begin()))}); - exact_types_.insert(pt); - } else { - // Multiple constant dimensions found; Use the corresponding - // symbolic parallel dim - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); - } + TORCH_INTERNAL_ASSERT( + it->second.size() == 1, + "Only one value found mapped to parallel type ", + stringifyThread(pt), + " yet its bound to multiple extents."); + dim_map_.insert({pt, IrBuilder::create(*(it->second.begin()))}); + exact_types_.insert(pt); } else { // Prefer to use blockDim/gridDim if not constant - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); + dim_map_.insert({pt, NamedScalar::getParallelDim(pt)}); exact_types_.insert(pt); } } @@ -129,12 +117,9 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( const std::unordered_set& dom_set) { TORCH_INTERNAL_ASSERT(dom_set.size() > 1); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - bool all_equal = true; // Use nullptr to signal it's not initialied yet - kir::Val* known_dimension = nullptr; + Val* known_dimension = nullptr; // Use -1 to signal it's not initialied yet int64_t known_const = -1; @@ -172,7 +157,7 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( // At this point, it still remains undetermined whether this id // matches with those previously looked at. Constant check failed, // but symbolic matching may succeed. - auto this_dimension = gpu_lower->lowerValue(concrete_id->extent()); + auto this_dimension = concrete_id->extent(); if (known_dimension == nullptr) { // No previous dimension found yet known_dimension = this_dimension; @@ -191,21 +176,22 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( } // Use the const value, if found, as its dimension if (all_equal && known_const != -1) { - dim_map_.insert({pt, ir_builder.create(known_const)}); + dim_map_.insert({pt, IrBuilder::create(known_const)}); } else { - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); + dim_map_.insert({pt, NamedScalar::getParallelDim(pt)}); } } void ParallelDimensionMap::adjustMappingsForWarpPadding() { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // If TIDx is padded to a multiple of the warp size, mark it as // non-exact. 
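// Rough intuition: padding rounds the number of TIDx threads up to a multiple
// of the warp size (e.g. an extent of 50 gets launched with blockDim.x = 64
// when the warp size is 32), so blockDim.x can exceed the extent of the
// mapped domain. The checks below keep the mapping exact only when the
// constant extent is already a warp-size multiple, or when TIDx is bound
// directly to blockDim.x itself.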
auto& warp_info = gpu_lower->getWarpPaddedParallelInfo(); - if (!warp_info.is_tidx_padded) { + // TIDx isn't really padded if there isn't a warp reduction (this could + // change) + if (!(warp_info.is_tidx_padded && warp_info.has_warp_reduction)) { return; } @@ -215,7 +201,7 @@ void ParallelDimensionMap::adjustMappingsForWarpPadding() { // If the dimension of TIDx is actually a multple of the warp size // before padding, it can be left as exact if (isExact(tidx_pt)) { - auto tidx_dim = dynamic_cast(get(tidx_pt)); + auto tidx_dim = dynamic_cast(get(tidx_pt)); if (tidx_dim && tidx_dim->isConst()) { auto tidx_dim_val = tidx_dim->value().value(); if (tidx_dim_val % warp_size == 0) { @@ -223,23 +209,36 @@ void ParallelDimensionMap::adjustMappingsForWarpPadding() { return; } } + // If tidx is strictly defined as blockDim.x then it must be set to a + // multiple of the warp and can be considered exact + bool tidx_def_trivial = true; + for (auto entry : concrete_dom_map_.at(tidx_pt)) { + if (!entry->isA() || + !entry->as()->sameAs( + NamedScalar::getParallelDim(tidx_pt))) { + tidx_def_trivial = false; + } + } + if (tidx_def_trivial) { + return; + } } // TIDx is padded to a multiple of warp. If it's known to be a // single warp, use the constant warp size as the dimension of - // TIDx. Otherwise, jsut use blockDim.x. + // TIDx. Otherwise, just use blockDim.x. if (warp_info.is_tidx_single_warp) { - dim_map_.at(ParallelType::TIDx) = ir_builder.create(warp_size); + dim_map_.at(ParallelType::TIDx) = IrBuilder::create(warp_size); } else { dim_map_.at(ParallelType::TIDx) = - kir::NamedScalar::getParallelDim(ParallelType::TIDx); + NamedScalar::getParallelDim(ParallelType::TIDx); } // TIDx is no longer exact exact_types_.erase(ParallelType::TIDx); } -kir::Val* ParallelDimensionMap::get(ParallelType pt) const { +Val* ParallelDimensionMap::get(ParallelType pt) const { TORCH_INTERNAL_ASSERT(isParallelTypeThread(pt), "Invalid ParallelType: ", pt); auto it = dim_map_.find(pt); if (it == dim_map_.end()) { @@ -254,14 +253,13 @@ bool ParallelDimensionMap::isExact(ParallelType pt) const { } IterDomain* ParallelDimensionMap::getCAMappedConcreteDomain(IterDomain* id) { - const auto gpu_lower = GpuLower::current(); - const auto& ca_map = gpu_lower->caIndexMap(); - return ca_map.getConcreteMappedID(id); + return GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); } // Symbolically compares equality of two KIR vals. Comparison is done // conservatively, so returning false does not guarantee non-equality. 
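// For illustration, the checks below treat two extents as equal when, for
// example, they are the same Val object, both are constant Ints holding the
// same value, both are NamedScalars with the same name (e.g. "blockDim.x"),
// or both are defined by an op of the same kind (say a ceilDiv by the same
// factor) whose inputs in turn compare equal under these same rules.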
-bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { +bool ParallelDimensionMap::equalDim(Val* dim1, Val* dim2) { TORCH_INTERNAL_ASSERT(dim1 != nullptr && dim2 != nullptr); if (dim1 == dim2) { @@ -269,8 +267,8 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { } // When Both are Int, they are same if both have the same constant - auto dim1_int = dynamic_cast(dim1); - auto dim2_int = dynamic_cast(dim2); + auto dim1_int = dynamic_cast(dim1); + auto dim2_int = dynamic_cast(dim2); if (dim1_int && dim2_int) { if (dim1_int->isConst() && dim2_int->isConst()) { return dim1_int->value() == dim2_int->value(); @@ -279,8 +277,8 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { // When both are NamedScalar, they are same if Both have the same // name - auto dim1_ns = dynamic_cast(dim1); - auto dim2_ns = dynamic_cast(dim2); + auto dim1_ns = dynamic_cast(dim1); + auto dim2_ns = dynamic_cast(dim2); if (dim1_ns && dim2_ns) { return dim1_ns->name() == dim2_ns->name(); } @@ -297,12 +295,19 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { // If both are BinaryOp or UnaryOp, check their inputs. Since these // Vals are IterDomain extents, UnaryOp should not occur, but // checking shouldn't be harmful. - if ((dim1_def->isA() && dim2_def->isA() && - (dim1_def->as()->operation() == - dim2_def->as()->operation())) || - (dim1_def->isA() && dim2_def->isA() && - (dim1_def->as()->operation() == - dim2_def->as()->operation()))) { + // TODO: + // We might be able to replace this with dim1->toInlineString() == + // dim2->toInlineString() + // If we want this less conservative we could make an "exact map" which + // could be another mode in compute at that maps all iter domains, but not + // concretized broadcast axes and only forwards through non-concretized + // broadcast axes. + if ((dim1_def->isA() && dim2_def->isA() && + (dim1_def->as()->getBinaryOpType() == + dim2_def->as()->getBinaryOpType())) || + (dim1_def->isA() && dim2_def->isA() && + (dim1_def->as()->getUnaryOpType() == + dim2_def->as()->getUnaryOpType()))) { for (const auto i : c10::irange(dim1_def->inputs().size())) { (void)i; // Suppress unused variable warning if (!equalDim(dim1_def->inputs()[0], dim2_def->inputs()[0])) { @@ -321,7 +326,7 @@ std::string ParallelDimensionMap::toString() const { ss << pt << ": "; auto dim = get(pt); if (dim != nullptr) { - ss << kir::toString(dim); + ss << dim->toString(); if (isExact(pt)) { ss << ", exact"; } else { diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h index d05c17adea29..03bd513396f9 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h +++ b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h @@ -21,7 +21,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { //! Returns the dimension of a ParallelType. nullptr is returned if //! a ParallelType is unused. - kir::Val* get(ParallelType pt) const; + Val* get(ParallelType pt) const; //! True if the dimension of a ParallelType is known to be exact bool isExact(ParallelType pt) const; @@ -29,7 +29,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { std::string toString() const; //! Symbolically analyze if two extent vals are equal - static bool equalDim(kir::Val* dim1, kir::Val* dim2); + static bool equalDim(Val* dim1, Val* dim2); private: //! Register the extent of an IterDomain if its constant @@ -54,7 +54,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { private: //! 
Maps from parallel types to dimensions, which are constant if //! a unique value is found. - std::unordered_map dim_map_; + std::unordered_map dim_map_; //! Set of parallel types whose dimensions are identified to be //! exactly the same as extents of mapped domains. std::unordered_set exact_types_; diff --git a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h b/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h index 0bf8ae39277b..642017a3c097 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h +++ b/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h @@ -1,8 +1,9 @@ #pragma once -#include +#include #include +#include #include #include #include @@ -160,6 +161,20 @@ class ParallelTypeBitmap { *this |= ParallelTypeBitmap(kBIDBits); } + //! Clear all of the TID flags + void clearAllTID() { + auto tid_bits = ParallelTypeBitmap(kTIDBits); + auto not_tid_bits = ~tid_bits; + *this &= not_tid_bits; + } + + //! Clear all of the BID flags + void clearAllBID() { + auto bid_bits = ParallelTypeBitmap(kBIDBits); + auto not_bid_bits = ~bid_bits; + *this &= not_bid_bits; + } + //! Get an iterator to traverse set types Iterator begin() const { return Iterator::begin(*this); @@ -271,6 +286,52 @@ inline ParallelTypeBitmap::Iterator ParallelTypeBitmap::Iterator::end( return Iterator(map, kOffsetEnd); } +//! Map from ParallelType to template type T +template +class ParallelTypeMap { + public: + ParallelTypeMap() = default; + + ParallelTypeMap(const T& init) { + std::fill(map_.begin(), map_.end(), init); + } + + T& operator[](ParallelType pt) { + return map_[getParallelTypeBitMapOffset(pt)]; + } + + const T& operator[](ParallelType pt) const { + return map_[getParallelTypeBitMapOffset(pt)]; + } + + T& at(ParallelType pt) { + return map_.at(getParallelTypeBitMapOffset(pt)); + } + + const T& at(ParallelType pt) const { + return map_.at(getParallelTypeBitMapOffset(pt)); + } + + auto begin() { + return map_.begin(); + } + + auto begin() const { + return map_.begin(); + } + + auto end() { + return map_.begin(); + } + + auto end() const { + return map_.begin(); + } + + private: + std::array map_; +}; + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp index a33b33895c5b..187230dd6758 100644 --- a/torch/csrc/jit/codegen/cuda/parser.cpp +++ b/torch/csrc/jit/codegen/cuda/parser.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -12,6 +13,8 @@ #include #include +#include + #include #include @@ -26,30 +29,29 @@ namespace cuda { constexpr auto kNumUnaryOps = 10; constexpr auto kNumUnaryFloatOps = 23; +constexpr auto kNumUnaryIsOps = 6; constexpr auto kNumBinaryFloatOps = 3; constexpr auto kNumBinaryComparisonOps = 12; -constexpr auto kNumBinaryCastOps = 14; +constexpr auto kNumBinaryCastOps = 19; -constexpr auto kNumBinaryOpsWithAlpha = 4; +constexpr auto kNumBinaryOpsWithAlpha = 6; constexpr auto kNumLerpOps = 2; constexpr auto kNumLayernormFwd = 2; constexpr auto kNumBatchnormFwd = 3; +constexpr auto kNumBatchnormBwd = 2; constexpr auto kNumInstancenormFwd = 1; constexpr auto kNumSumToSize = 2; constexpr auto kNumAutocastOps = 2; -// constexpr auto kNumViewSize = 2; +constexpr auto kNumAliasDimOps = 2; +constexpr auto kNumViewOps = 2; +constexpr auto kNumVarOps = 2; +constexpr auto kNumSoftmaxFwd = 2; +constexpr auto kNumSoftmaxBwd = 2; +constexpr auto kNumAminAmaxOps = 2; namespace { -std::vector getTensorSizes(TensorTypePtr const& tensor_type) { - 
TORCH_INTERNAL_ASSERT(tensor_type != nullptr, "Input must be a Tensor."); - auto optional_sizes = tensor_type->sizes().concrete_sizes(); - TORCH_INTERNAL_ASSERT( - optional_sizes.has_value(), "Missing size information for the tensor."); - return optional_sizes.value(); -} - #define REGISTER_PARSE_RULE(op, func_body, ...) \ registerParseRule( \ op, \ @@ -57,15 +59,53 @@ std::vector getTensorSizes(TensorTypePtr const& tensor_type) { -> void func_body, \ __VA_ARGS__) -const auto& sizeAttr = Symbol::attr("profiled_size"); +const auto& reductionSizeAttr = Symbol::attr("profiled_reduction_size"); +const auto& viewSizeAttr = Symbol::attr("profiled_view_size"); const auto& intListAttr = Symbol::attr("profiled_int_list"); const auto& intAttr = Symbol::attr("profiled_int"); const auto& boolListAttr = Symbol::attr("profiled_bool_list"); const auto& boolAttr = Symbol::attr("profiled_bool"); +const auto& strAttr = Symbol::attr("profiled_str"); +const auto& ivalAttr = Symbol::attr("profiled_ival"); +const auto& profileFailedAttr = Symbol::attr("profile_failed"); typedef Val* CgValue; typedef Expr* CgOp; +bool isReductionNonCompatibleTensor( + const std::shared_ptr& tensor_type) { + return is_zero_dim_tensor(tensor_type) || is_zero_sized_tensor(tensor_type); +} + +bool isInputNonSizeZeroTensor(const Node* node) { + for (const auto& val : node->inputs()) { + auto tensor_type = val->type()->cast(); + if (tensor_type && is_zero_sized_tensor(tensor_type)) { + return false; + } + } + return true; +} + +bool isScalarTypeCompatible(const Node* node, size_t offset) { + auto val = node->input(offset); + // return true if it's not specified + if (val->type()->isSubtypeOf(static_cast(NoneType::get()))) { + return true; + } + // return false if it's runtime value + if (val->node()->kind() != prim::Constant) { + return false; + } + auto dtype = toIValue(val)->toScalarType(); + + // we do NOT support half math type yet + if (dtype == at::ScalarType::Half || dtype == at::ScalarType::BFloat16) { + return false; + } + return true; +} + // Note [ Permutation Bookkeeping and Propagation in Parser ] // // The goal in supporting permutation propagation in parser is to: @@ -120,17 +160,33 @@ struct MemoryFormat { // e.g. for an channels-last tensor, permutation_ would be (n-1)123...(n-2); // Note: we are omitting the leading '0' when applicable, and apparently this // encoding only works with rank < 10 + // see [ Note: MemoryFormat and Stride Order ] size_t permutation_ = 0; // default to non-permuted tensor MemoryFormat() = default; + // [ Note: MemoryFormat and Stride Order ] // stride_order is extracted from // `TensorType::stride_properties()::stride_index_`, it describes the // index of axes from fastest to slowest. + // or a 4d tensor, if we have stride_order = {x0, x1, x2, x3}, The i-th + // fastest dimension would be stride_order[i]. + // // Look at comment for c10::Stride in aten/src/ATen/core/jit_type.h - // e.g. for rank 4 non-permuted tensor, stride_order would be {3, 2, 1, 0} - // for rank 4 channels last tensor, stride_order would be {1, 3, 2, 0} + // + // eg0. for rank 4 non-permuted tensor, stride_order would be {3, 2, 1, 0}, it + // means the fastest dimension is axis-3. the next one would be 2, e.t.c.. So + // it's a non-permuted tensor. + // it should be encoded as permutation_ = 3210 (we special case it to 0) + // + // eg1. for rank 4 channels-last tensor, stride_order would be {1, 3, 2, 0}, + // it means the fastest dimension is axis-1. the next one would be 3, and then + // 2, and then 0. 
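The permutation_ encoding described in [ Note: MemoryFormat and Stride Order ] packs the stride order (fastest to slowest axis) into a single base-10 integer, special-casing the non-permuted order to 0 and therefore only supporting rank < 10. A small self-contained sketch of the encode/decode round trip, written as free functions that mirror setPermutation and the toStrideOrder decoding that follows in the diff (not the actual MemoryFormat members):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Encode a stride order (fastest to slowest axis indices) into a base-10
// integer, mirroring MemoryFormat::setPermutation. A non-permuted order
// collapses to 0. Only valid for rank < 10.
size_t encodePermutation(const std::vector<int>& stride_order) {
  const int rank = static_cast<int>(stride_order.size());
  size_t permutation = 0;
  bool has_permutation = false;
  for (int i = 0; i < rank; ++i) {
    permutation = permutation * 10 + stride_order[i];
    if (stride_order[i] != rank - 1 - i) {
      has_permutation = true;
    }
  }
  return has_permutation ? permutation : 0;
}

// Decode back to a stride order, mirroring toStrideOrder. Since a leading
// '0' digit is dropped by the encoding, it is re-appended if never seen.
std::vector<int> decodePermutation(size_t permutation) {
  std::vector<int> stride_order;
  if (permutation == 0) {
    return stride_order; // empty == non-permuted
  }
  bool encountered_zero = false;
  while (permutation != 0) {
    const int order = static_cast<int>(permutation % 10);
    permutation /= 10;
    if (order == 0) {
      encountered_zero = true;
    }
    stride_order.push_back(order);
  }
  if (!encountered_zero) {
    stride_order.push_back(0);
  }
  std::reverse(stride_order.begin(), stride_order.end());
  return stride_order;
}

int main() {
  // eg0: rank-4 contiguous {3, 2, 1, 0} -> 0 (special case)
  assert(encodePermutation({3, 2, 1, 0}) == 0);
  // eg1: rank-4 channels-last {1, 3, 2, 0} -> 1320
  assert(encodePermutation({1, 3, 2, 0}) == 1320);
  assert((decodePermutation(1320) == std::vector<int>{1, 3, 2, 0}));
  // eg2: {0, 3, 2, 1} -> 321 (leading '0' omitted), decodes back correctly
  assert(encodePermutation({0, 3, 2, 1}) == 321);
  assert((decodePermutation(321) == std::vector<int>{0, 3, 2, 1}));
  return 0;
}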
So this is a channels last tensor (NCHW). + // it will be encoded as permutation_ = 1320 + // + // eg2. for a rank 4 permuted tensor, stride_order can be {0, 3, 2, 1} + // it will be encoded as permutation_ = 321 (omitting leading '0') void setPermutation(const std::vector& stride_order) { int rank = stride_order.size(); TORCH_INTERNAL_ASSERT( @@ -139,20 +195,111 @@ struct MemoryFormat { // storing stride_order in `permuted_order` for a simpler life, so we don't // have to decode `permutation_` when we want to apply/restore permutation_. permuted_order_ = stride_order; - bool has_permutation_ = false; + bool has_permutation = false; + permutation_ = 0; for (const auto i : c10::irange(rank)) { permutation_ = permutation_ * 10 + stride_order[i]; - if (!has_permutation_ && stride_order[i] != rank - 1 - i) { - has_permutation_ = true; + if (!has_permutation && stride_order[i] != rank - 1 - i) { + has_permutation = true; } } // special case permutation_ to reflect non-permuted tensor - if (!has_permutation_) { + if (!has_permutation) { permutation_ = 0; } } + // returns the stride order for given MemoryFormat encoding permutation_ + // + // see details for encoding in [ Note: MemoryFormat and Stride Order ] + std::vector toStrideOrder() const { + std::vector stride_order; + // return empty vector for no permutation + if (hasPermutation()) { + // be generous with reserved space + stride_order.reserve(10); + bool encountered_zero = false; + size_t permutation = permutation_; + while (permutation != 0) { + int order = static_cast(permutation % 10); + permutation /= 10; + if (order == 0) { + encountered_zero = true; + } + stride_order.push_back(order); + } + if (!encountered_zero) { + // in case leading '0' is omitted, push it back + stride_order.push_back(0); + } + // since we use push_back, our stride_order is reversed. + std::reverse(stride_order.begin(), stride_order.end()); + } + return stride_order; + } + + // returns c10::nullopt when it's not safe to broadcast current permutation to + // rank + c10::optional broadcastToRank(size_t rank) const { + auto ret = Contiguous(); + if (hasPermutation()) { + auto stride_order = toStrideOrder(); + auto cur_rank = stride_order.size(); + // no op for (cur_rank == 0) || (cur_rank == rank) + if (cur_rank < rank) { + // broadcasting to hight rank can be done by: + // 1. incrementing all existing stride order by rank_diff; + // 2. push back decrementing elements starting with rank_diff; + // where rank_diff = rank - cur_rank + // + // see [ Note: MemoryFormat and Stride Order] + // e.g. + // taking broadcasted bias for channels last as an example + // stride_order = {0, 2, 1} broadcasted to rank == 4 would give us + // rank_diff = 4 - 3 = 1 + // take step 1 -> {1, 3, 2} + // take step 2 -> {1, 3, 2, 0} + int rank_diff = static_cast(rank - cur_rank); + for (auto& val : stride_order) { + val += rank_diff; + } + for (int i = rank_diff - 1; i >= 0; i--) { + stride_order.push_back(i); + } + } else if (cur_rank > rank) { + // shrink permutation to lower rank. We can simply discard higher rank + // stride order when they are not permuted to lower rank bit, because in + // those instance we can't obey broadcasting semantics while preserving + // permutation. We check for stride order and ensure that the lower + // `rank` bits are all permuted within the lower rank. Afterwards, we + // update stride_order by decrement each entry by rank_diff to reflect + // correct stride order. + // + // see [ Note: MemoryFormat and Stride Order] + // e.g. 
for rank 4 channels last {1, 3, 2, 0}: + // 1. format can safely shrink to rank 3, since any@{1, 3, 2} >= + // (4-3); We ditch last (4-3) rank and decrement each element by (4-1) + // that gives us {0, 2, 1}; + // 2. but when we shrink it to rank 2, we have {1, 3} where 1 < (4-2) + // and it can't be handled, we return c10::nullopt. + int collapsed_ranks = static_cast(cur_rank - rank); + for (size_t i = 0; i < rank; i++) { + if (stride_order[i] < collapsed_ranks) { + // illegal collapsing, return c10::nullopt + return c10::nullopt; + } + // update collapsed stride_order + stride_order[i] -= collapsed_ranks; + } + // discard higher rank stride order. + stride_order.resize(rank); + } + ret.setPermutation(stride_order); + } + return ret; + } + // returns non-permuted format static MemoryFormat Contiguous() { return MemoryFormat(); @@ -276,19 +423,29 @@ class ValueHolder { // returns Val in target format if it exists, otherwise, transpose an existing // copy and add that to bookkeeping. CgValue maybeConvertValue(const MemoryFormat& format) { - auto iter_val = vals_.find(format); - if (iter_val != vals_.end()) { - return iter_val->second; - } - // patching scalar value, because memory format doesn't carry real meaning. - if (!is_tensor_view_) { + auto cur_rank = rank(); + // scalar (tensor) where cur_rank == 0, memory format doesn't carry meaning + // and should just return the value as-is. same for non-tensor where + // cur_rank == -1 + if (cur_rank <= 0) { return std::get<1>(getEntry()); } MemoryFormat format_s; CgValue value_s = nullptr; std::tie(format_s, value_s) = getEntry(); - auto val = convertValue(format, format_s, value_s); - vals_[format] = val; + + auto opt_format_d = format.broadcastToRank(static_cast(cur_rank)); + TORCH_INTERNAL_ASSERT( + opt_format_d.has_value(), + "maybeConvertValue requested for illegal permutation"); + MemoryFormat format_d = opt_format_d.value(); + + auto iter_val = vals_.find(format_d); + if (iter_val != vals_.end()) { + return iter_val->second; + } + auto val = convertValue(format_d, format_s, value_s); + vals_[format_d] = val; return val; } @@ -435,6 +592,79 @@ std::pair> getConsistentValues( return std::make_pair(format, list_val); } +// iterate through all vals and return the output MemoryFormat and copies of +// vals. +// 1. When `forced_format == c10::nullopt`, target MemoryFormat returns the +// format of the first val in `vals`, this is to achieve a coherent +// behavior as with eager TensorIterator; +// 2. The target can be overwritten vias specifying `forced_format`. +// +// Note: take `Values&` by reference, since `maybeConvertValue` needs to modify +// the entry and we want that to be updated in `value_map_` +template +std::pair> getPWFormatValues( + c10::optional forced_format, + Values&... 
vals) { + MemoryFormat format; + if (forced_format.has_value()) { + format = forced_format.value(); + } else { + // get maximum rank on vals + std::vector formats; + std::vector ranks; + auto max_rank_func = [&ranks](const ValueHolder& val, int rank = 0) { + int v_rank = val.rank(); + ranks.push_back(v_rank); + return std::max(rank, v_rank); + }; + int max_rank = iterate(max_rank_func, vals...); + + // going through all permutation, keeping consistency with TensorIterator + // behavior and the first tensor with highest rank dictates output + // permutation + auto format_func = [&formats, &max_rank]( + const ValueHolder& val, + MemoryFormat f = MemoryFormat::Contiguous()) { + auto cur_format = std::get<0>(val.getEntry()); + formats.push_back(cur_format); + return val.rank() == max_rank ? cur_format : f; + }; + format = iterate(format_func, vals...); + + // we need to do pair-wise comparison to ensure that all permutation are + // compatible since permutation could have changed semantics among + // broadcasted tensors. Consider pointwise operation between three tensor + // [N, C, H, W] + [C, H, W] + [H, W] + for (size_t i = 0; i < formats.size() && format.hasPermutation(); i++) { + for (size_t j = 0; j < formats.size(); j++) { + // don't compare scalar tensor or scalar + if (ranks[i] <= 0 || ranks[j] <= 0 || i == j) { + continue; + } + size_t lower_rank = std::min(ranks[i], ranks[j]); + auto i_format = formats[i].broadcastToRank(lower_rank); + auto j_format = formats[j].broadcastToRank(lower_rank); + + // breaks permutation if any: + // 1. i_format can't be broadcasted to lower_rank; + // 2. j_format can't be broadcasted to lower_rank; + if (!i_format.has_value() || !j_format.has_value()) { + format = MemoryFormat::Contiguous(); + } + } + } + } + + auto convert_func = [format]( + ValueHolder& val, std::list list_val = {}) { + list_val.push_front(val.maybeConvertValue(format)); + return list_val; + }; + auto list_val = iterate(convert_func, vals...); + + return std::make_pair(format, list_val); +} + typedef void ( *ParseFuncPtr)(const Node*, std::unordered_map&); typedef bool (*MergeQueryFuncPtr)(const Node*); @@ -502,7 +732,7 @@ class IrParser { "Failure when register value: ", *(val->node()), " with type: ", - val->type()); + val->type()->repr_str()); MemoryFormat format; Val* operand = nullptr; std::tie(format, operand) = value_map_[val->unique()].getEntry(); @@ -520,7 +750,6 @@ class IrParser { (opt_dtype.value() == DataType::Half || opt_dtype.value() == DataType::BFloat16)) { Val* promoted_val = castOp(DataType::Float, operand); - // value_map_.emplace(val->unique(), ValueHolder(promoted_val, format)); value_map_[val->unique()] = ValueHolder(promoted_val, format); } } @@ -540,13 +769,10 @@ class IrParser { auto tensor_type = jit_output->type()->cast(); TORCH_INTERNAL_ASSERT( tensor_type, "output of fusion group is not TensorType."); - if (tensor_type->scalarType() == at::ScalarType::Half) { - // No need to update value_map_ after this point. - out = castOp(DataType::Half, out)->as(); - } - if (tensor_type->scalarType() == at::ScalarType::BFloat16) { - // No need to update value_map_ after this point. 
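getPWFormatValues resolves the output memory format for a pointwise op: the first operand with the highest rank dictates the candidate permutation, and every pair of operands is then checked for broadcast compatibility via broadcastToRank, falling back to contiguous on any failure. The following condensed standalone sketch illustrates that flow with stride orders as plain vectors (empty meaning contiguous); broadcastToRank here re-implements the grow/shrink rules from the MemoryFormat note, and the operand list in main mirrors the [N, C, H, W] + [C, H, W] + [H, W] example from the comment.

#include <algorithm>
#include <iostream>
#include <optional>
#include <utility>
#include <vector>

using StrideOrder = std::vector<int>; // empty == contiguous (no permutation)

// Grow or shrink a stride order to `rank`, following the rules of
// MemoryFormat::broadcastToRank. Returns std::nullopt when the permutation
// cannot survive the rank change.
std::optional<StrideOrder> broadcastToRank(StrideOrder order, size_t rank) {
  if (order.empty()) {
    return StrideOrder{}; // contiguous stays contiguous
  }
  const size_t cur_rank = order.size();
  if (cur_rank < rank) {
    const int diff = static_cast<int>(rank - cur_rank);
    for (auto& v : order) {
      v += diff;
    }
    for (int i = diff - 1; i >= 0; --i) {
      order.push_back(i);
    }
  } else if (cur_rank > rank) {
    const int collapsed = static_cast<int>(cur_rank - rank);
    for (size_t i = 0; i < rank; ++i) {
      if (order[i] < collapsed) {
        return std::nullopt; // permutation leaks into collapsed dims
      }
      order[i] -= collapsed;
    }
    order.resize(rank);
  }
  return order;
}

// Resolve the pointwise output format for a set of operands given as
// (rank, stride order) pairs, in the spirit of getPWFormatValues.
StrideOrder resolvePointwiseFormat(
    const std::vector<std::pair<int, StrideOrder>>& operands) {
  int max_rank = 0;
  for (const auto& op : operands) {
    max_rank = std::max(max_rank, op.first);
  }
  // First operand with the highest rank dictates the candidate format.
  StrideOrder format;
  for (const auto& op : operands) {
    if (op.first == max_rank) {
      format = op.second;
      break;
    }
  }
  // Pairwise compatibility check; any failure falls back to contiguous.
  for (size_t i = 0; i < operands.size() && !format.empty(); ++i) {
    for (size_t j = 0; j < operands.size(); ++j) {
      if (i == j || operands[i].first <= 0 || operands[j].first <= 0) {
        continue;
      }
      const size_t lower =
          static_cast<size_t>(std::min(operands[i].first, operands[j].first));
      if (!broadcastToRank(operands[i].second, lower).has_value() ||
          !broadcastToRank(operands[j].second, lower).has_value()) {
        format.clear(); // fall back to contiguous
      }
    }
  }
  return format;
}

int main() {
  // [N, C, H, W] channels-last + [C, H, W] + [H, W]
  std::vector<std::pair<int, StrideOrder>> operands = {
      {4, {1, 3, 2, 0}}, {3, {}}, {2, {}}};
  auto format = resolvePointwiseFormat(operands);
  std::cout << (format.empty() ? "contiguous" : "permuted") << "\n";
}

In this example the rank-4 channels-last permutation cannot be shrunk to rank 2 (the '1' would leak into the collapsed dimensions), so the whole pointwise op ends up planned in contiguous format, matching the fallback behavior in the diff.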
- out = castOp(DataType::BFloat16, out)->as(); + if (tensor_type->scalarType().has_value()) { + out = optionalCastStrict( + aten_to_data_type(*tensor_type->scalarType()), out) + ->as(); } fusion->addOutput(out); @@ -574,11 +800,17 @@ class IrParser { static bool lookupInSymbolSet(const Node* node) { initRegistry(); + std::lock_guard lock(parser_mutex_); return parser_symbol_set_.count(node->kind()) != 0; } // return nullptr if entry does not exist static const RegistrationEntry* lookupInRegistry(const Node* node) { + std::lock_guard lock(parser_mutex_); + + if (parser_skip_set_.count(node->kind()) != 0) { + return nullptr; + } // we need to use maybeSchema for nodes like prim::Constant, which doesn't // have a schema auto schema_ptr = node->maybeSchema(); @@ -602,12 +834,28 @@ class IrParser { return nullptr; } + static bool querySkipSymbolSet(c10::Symbol symbol, bool flip) { + initRegistry(); + + std::lock_guard lock(parser_mutex_); + // no need to init registry here (unlike `lookupInSymbolSet`, as + // `parser_skip_set_` is not initialized via initialization + bool ret = parser_skip_set_.count(symbol) != 0; + if (flip) { + if (ret) { + parser_skip_set_.erase(symbol); + } else { + parser_skip_set_.insert(symbol); + } + } + return ret; + } + static void initRegistry() { - if (init_registry_) { - // TODO: mutex this guy; + std::call_once(once_flag_, []() { + std::lock_guard lock(parser_mutex_); registerJitOperator(); - init_registry_ = false; - } + }); } static bool canParseNode(const Node* node) { @@ -685,7 +933,9 @@ class IrParser { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", - "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor"}; + "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::rsub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::rsub(Tensor self, Scalar other, Scalar alpha) -> Tensor"}; for (auto signature : BinaryOpWithAlpha) { auto ptr_op = getOperatorForLiteral(signature); REGISTER_PARSE_RULE( @@ -701,13 +951,17 @@ class IrParser { BinaryOpType::Add, static_cast(&add_alpha))}, {aten::sub, + std::make_pair( + BinaryOpType::Sub, + static_cast(&sub_alpha))}, + {aten::rsub, std::make_pair( BinaryOpType::Sub, static_cast(&sub_alpha))}}); // TODO: handle scaling factor when it's not constant 1; MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -720,14 +974,16 @@ class IrParser { auto out = alpha->isOneInt() ? binaryOp( op_mapping[node->kind()].first, - lhs, - rhs, + node->kind() == aten::rsub ? rhs : lhs, + node->kind() == aten::rsub ? lhs : rhs, TypePromotion::default_op_config) - : op_mapping[node->kind()].second(lhs, rhs, alpha); + : (node->kind() == aten::rsub + ? 
op_mapping[node->kind()].second(rhs, lhs, alpha) + : op_mapping[node->kind()].second(lhs, rhs, alpha)); value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -746,7 +1002,7 @@ class IrParser { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -763,7 +1019,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -777,10 +1033,15 @@ class IrParser { "aten::pow(Scalar self, Tensor exponent) -> Tensor", "aten::remainder(Tensor self, Tensor other) -> Tensor", "aten::fmod(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_and(Tensor self, Tensor other) -> Tensor", "aten::__and__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_or(Tensor self, Tensor other) -> Tensor", "aten::__or__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_xor(Tensor self, Tensor other) -> Tensor", "aten::__xor__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_left_shift(Tensor self, Tensor other) -> Tensor", "aten::__lshift__(Tensor self, Tensor other) -> Tensor", + "aten::bitwise_right_shift(Tensor self, Tensor other) -> Tensor", "aten::__rshift__(Tensor self, Tensor other) -> Tensor"}; for (auto signature : BinaryCastOp) { auto ptr_op = getOperatorForLiteral(signature); @@ -794,15 +1055,20 @@ class IrParser { {aten::pow, BinaryOpType::Pow}, {aten::remainder, BinaryOpType::Remainder}, {aten::fmod, BinaryOpType::Fmod}, + {aten::bitwise_and, BinaryOpType::And}, {aten::__and__, BinaryOpType::And}, + {aten::bitwise_or, BinaryOpType::Or}, {aten::__or__, BinaryOpType::Or}, + {aten::bitwise_xor, BinaryOpType::Xor}, {aten::__xor__, BinaryOpType::Xor}, + {aten::bitwise_left_shift, BinaryOpType::Lshift}, {aten::__lshift__, BinaryOpType::Lshift}, + {aten::bitwise_right_shift, BinaryOpType::Rshift}, {aten::__rshift__, BinaryOpType::Rshift}}); MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -819,7 +1085,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -851,7 +1117,7 @@ class IrParser { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -868,7 +1134,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -911,7 +1177,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -982,7 +1248,41 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, + nullptr); + } + + std::array UnaryIsOp = { + "aten::isfinite(Tensor self) -> Tensor", + "aten::isinf(Tensor self) -> Tensor", + "aten::isnan(Tensor self) -> Tensor", + "aten::isneginf(Tensor self) -> Tensor", + "aten::isposinf(Tensor self) -> Tensor", + "aten::isreal(Tensor 
self) -> Tensor"}; + for (auto signature : UnaryIsOp) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + static std::unordered_map op_mapping({ + {aten::isfinite, UnaryOpType::IsFinite}, + {aten::isinf, UnaryOpType::IsInf}, + {aten::isnan, UnaryOpType::IsNan}, + {aten::isneginf, UnaryOpType::IsNegInf}, + {aten::isposinf, UnaryOpType::IsPosInf}, + {aten::isreal, UnaryOpType::IsReal}, + }); + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + c10::nullopt, value_map[node->inputs()[0]->unique()]); + auto operand = list_val.front(); + list_val.pop_front(); + auto out = unaryIsOp(op_mapping[node->kind()], operand); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); + }, + isInputNonSizeZeroTensor, nullptr); } @@ -995,15 +1295,49 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); + c10::nullopt, value_map[node->inputs()[0]->unique()]); auto operand = list_val.front(); list_val.pop_front(); + if (!node->input(3)->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + auto device = constant_as(node->input(3)); + TORCH_INTERNAL_ASSERT( + device.has_value() && device->is_cuda(), + "rand_like in nvfuser is not on cuda device"); + auto input_tensor_type = + node->input(0)->type()->cast(); + // device->index() == -1 indicating that we don't change device + // index + if (device->index() != -1 && input_tensor_type) { + auto input_device = input_tensor_type->device(); + // we expect device index to be consistent with input and it + // should have already been handled by partition + TORCH_INTERNAL_ASSERT( + !input_device.has_value() || + input_device->index() == device->index(), + "rand_like in nvfuser is not on cuda device"); + } + } + auto out = randlike(operand); - value_map.emplace(node->output()->unique(), out); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); + }, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (!node->input(1)->type()->isSubtypeOf( + static_cast(NoneType::get())) || + !node->input(2)->type()->isSubtypeOf( + static_cast(NoneType::get())) || + !node->input(5)->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + return false; + } + return true; }, - nullptr, nullptr); } @@ -1016,16 +1350,16 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); - auto operand = list_val.front(); + c10::nullopt, value_map[node->inputs()[0]->unique()]); + auto operand = list_val.front()->as(); list_val.pop_front(); auto& beta = value_map[node->inputs()[1]->unique()]; auto& threshold = value_map[node->inputs()[2]->unique()]; auto out = softplus(operand, beta, threshold); - value_map.emplace(node->output()->unique(), out); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1038,17 +1372,17 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); + c10::nullopt, value_map[node->inputs()[0]->unique()]); auto operand = list_val.front(); list_val.pop_front(); auto& th = value_map[node->inputs()[1]->unique()]; auto& value = 
value_map[node->inputs()[2]->unique()]; auto out = threshold(operand, th, value); - value_map.emplace(node->output()->unique(), out); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1060,7 +1394,7 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -1080,7 +1414,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1096,17 +1430,18 @@ class IrParser { c10::nullopt, value_map[node->inputs()[0]->unique()]); auto operand = list_val.front(); list_val.pop_front(); - Val* low = value_map.count(node->inputs()[1]->unique()) != 0 + Val* min = value_map.count(node->inputs()[1]->unique()) != 0 ? *value_map[node->inputs()[1]->unique()] - : new Double(std::numeric_limits::min()); - Val* high = value_map.count(node->inputs()[2]->unique()) != 0 + : nullptr; + Val* max = value_map.count(node->inputs()[2]->unique()) != 0 ? *value_map[node->inputs()[2]->unique()] - : new Double(std::numeric_limits::max()); + : nullptr; - auto out = clamp(operand, low, high); - value_map.emplace(node->output()->unique(), out); + Val* out = clamp(operand, min, max); + value_map.emplace( + node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1118,8 +1453,8 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], value_map[node->inputs()[2]->unique()]); @@ -1134,7 +1469,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1149,8 +1484,8 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], value_map[node->inputs()[2]->unique()]); @@ -1165,7 +1500,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } } @@ -1178,7 +1513,7 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], @@ -1197,7 +1532,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1210,7 +1545,7 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); auto input = list_val.front(); @@ -1225,8 +1560,11 @@ class IrParser { if (train.value()) { auto result = dropout(input->as(), prob); - value_map.emplace(node->output(0)->unique(), result.output); - value_map.emplace(node->output(1)->unique(), 
result.mask); + value_map.emplace( + node->output(0)->unique(), + ValueHolder(result.output, format)); + value_map.emplace( + node->output(1)->unique(), ValueHolder(result.mask, format)); } else { value_map.emplace(node->output(0)->unique(), input); value_map.emplace( @@ -1234,7 +1572,15 @@ class IrParser { ValueHolder(TensorViewBuilder().build(), format)); } }, - nullptr, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, nullptr); } @@ -1247,7 +1593,7 @@ class IrParser { MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); auto input = list_val.front(); @@ -1262,12 +1608,22 @@ class IrParser { if (train.value()) { auto result = dropout(input->as(), prob); - value_map.emplace(node->output()->unique(), result.output); + value_map.emplace( + node->output()->unique(), ValueHolder(result.output, format)); } else { - value_map.emplace(node->output()->unique(), input); + value_map.emplace( + node->output()->unique(), ValueHolder(input, format)); + } + }, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; } + return true; }, - nullptr, nullptr); } @@ -1279,8 +1635,8 @@ class IrParser { { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()], value_map[node->inputs()[2]->unique()]); @@ -1293,9 +1649,10 @@ class IrParser { auto output = dropout_backward( grad->as(), mask->as(), scale); - value_map.emplace(node->output()->unique(), output); + value_map.emplace( + node->output()->unique(), ValueHolder(output, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -1307,8 +1664,6 @@ class IrParser { REGISTER_PARSE_RULE( ptr_op, { - auto fusion = FusionGuard::getCurFusion(); - // TODO: handle channels last MemoryFormat format; std::list list_val; @@ -1336,9 +1691,6 @@ class IrParser { static_cast(NoneType::get()))) { running_mean = value_map[node->input(3)->unique()]->as(); - TORCH_INTERNAL_ASSERT( - fusion->hasInput(running_mean), - "IO_tensor `instance_norm::running_mean` can only be input tensor to fusion"); } TensorView* running_var = nullptr; @@ -1346,9 +1698,6 @@ class IrParser { static_cast(NoneType::get()))) { running_var = value_map[node->input(4)->unique()]->as(); - TORCH_INTERNAL_ASSERT( - fusion->hasInput(running_var), - "IO_tensor `instance_norm::running_var` can only be input tensor to fusion"); } // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) @@ -1361,7 +1710,7 @@ class IrParser { Val* momentum_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto momentum = constant_as(node->input(6))) { - momentum_ptr = new Double(momentum.value()); + momentum_ptr = IrBuilder::create(momentum.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) momentum_ptr = value_map[node->input(6)->unique()]; @@ -1370,7 +1719,7 @@ class IrParser { Val* eps_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto eps = constant_as(node->input(7))) { - eps_ptr = new Double(eps.value()); + 
eps_ptr = IrBuilder::create(eps.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) eps_ptr = value_map[node->input(7)->unique()]; @@ -1391,7 +1740,13 @@ class IrParser { value_map.emplace(node->output()->unique(), result.output); } }, - [](const Node* node) -> bool { return true; }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + return true; + }, [](const Node* node) -> OperatorType { return OperatorType::Normalization; }); @@ -1455,7 +1810,7 @@ class IrParser { Val* momentum_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto momentum = constant_as(node->input(6))) { - momentum_ptr = new Double(momentum.value()); + momentum_ptr = IrBuilder::create(momentum.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) momentum_ptr = value_map[node->input(6)->unique()]; @@ -1464,7 +1819,7 @@ class IrParser { Val* eps_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto eps = constant_as(node->input(7))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) eps_ptr = value_map[node->input(7)->unique()]; @@ -1502,7 +1857,16 @@ class IrParser { ValueHolder(result.output, format)); } }, - [](const Node* node) -> bool { return true; }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->input(5)->node()->kind() != prim::Constant) { + return false; + } + return true; + }, [](const Node* node) -> OperatorType { return OperatorType::Normalization; }); @@ -1510,156 +1874,233 @@ class IrParser { } { - auto ptr_op = getOperatorForLiteral( - "aten::_batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)"); - REGISTER_PARSE_RULE( - ptr_op, - { - // discard impl_index and reservedSpace since we don't use them - MemoryFormat format; - std::list list_val; - std::tie(format, list_val) = getConsistentValues( - c10::nullopt, - value_map[node->inputs()[1]->unique()], - value_map[node->inputs()[2]->unique()]); - if (format.hasPermutation() && !format.isChannelsLast()) { + std::array BatchNormBwd = { + "aten::_batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)", + "aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)"}; + for (auto signature : BatchNormBwd) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + JitValue* ts_input = nullptr; + JitValue* ts_grad_output; + JitValue* ts_weight = nullptr; + JitValue* ts_r_mean = nullptr; + JitValue* ts_r_var = nullptr; + JitValue* ts_save_mean = nullptr; + JitValue* ts_save_invstd = nullptr; + JitValue* ts_train = nullptr; + JitValue* ts_eps = nullptr; + JitValue* ts_mask = nullptr; + if (node->kind() == + c10::Symbol::fromQualString( + "aten::_batch_norm_impl_index_backward")) { + ts_input = node->input(1); + ts_grad_output = node->input(2); + ts_weight = node->input(3); + ts_r_mean = node->input(4); + ts_r_var = node->input(5); + ts_save_mean = node->input(6); + ts_save_invstd = node->input(7); + ts_train = node->input(8); + ts_eps = node->input(9); + ts_mask = node->input(10); + } else if ( + node->kind() == + c10::Symbol::fromQualString( + "aten::native_batch_norm_backward")) { + ts_grad_output = node->input(0); + ts_input = node->input(1); + ts_weight = node->input(2); + ts_r_mean = node->input(3); + ts_r_var = node->input(4); + ts_save_mean = node->input(5); + ts_save_invstd = node->input(6); + ts_train = node->input(7); + ts_eps = node->input(8); + ts_mask = node->input(9); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Forgot to register the key for BN variation: ", + node->kind().toDisplayString()); + } + + // discard impl_index and reservedSpace since we don't use them + MemoryFormat format; + std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[1]->unique()], - value_map[node->inputs()[2]->unique()]); - } - auto operand0 = list_val.front(); - list_val.pop_front(); - auto operand1 = list_val.front(); - list_val.pop_front(); - auto input = operand0->as(); - auto grad_out = operand1->as(); + c10::nullopt, + value_map[ts_input->unique()], + value_map[ts_grad_output->unique()]); + if (format.hasPermutation() && !format.isChannelsLast()) { + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[ts_input->unique()], + value_map[ts_grad_output->unique()]); + } + auto operand0 = list_val.front(); + list_val.pop_front(); + auto operand1 = list_val.front(); + list_val.pop_front(); + auto input = operand0->as(); + auto grad_out = operand1->as(); - TensorView* weight = nullptr; - if (!node->input(3)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - weight = value_map[node->input(3)->unique()]->as(); - } + TensorView* weight = nullptr; + if (!ts_weight->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + weight = value_map[ts_weight->unique()]->as(); + } - TensorView* running_mean = nullptr; - if (!node->input(4)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - running_mean = - value_map[node->input(4)->unique()]->as(); - } + TensorView* running_mean = nullptr; + if (!ts_r_mean->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + running_mean = value_map[ts_r_mean->unique()]->as(); + } - TensorView* running_var = nullptr; - if (!node->input(5)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - running_var = - value_map[node->input(5)->unique()]->as(); - } + TensorView* running_var = nullptr; + if (!ts_r_var->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + running_var = value_map[ts_r_var->unique()]->as(); + } - TensorView* save_mean = nullptr; - // 
NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - if (!node->input(6)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { + TensorView* save_mean = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - save_mean = value_map[node->input(6)->unique()]->as(); - } - - TensorView* save_invstd = nullptr; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - if (!node->input(7)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - save_invstd = - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - value_map[node->input(7)->unique()]->as(); - } + if (!ts_save_mean->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + save_mean = value_map[ts_save_mean->unique()]->as(); + } - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - auto training = constant_as(node->input(8)); - TORCH_INTERNAL_ASSERT( - training.has_value(), - "The training (bool) parameter is required."); - const bool kTraining = training.value(); + TensorView* save_invstd = nullptr; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + if (!ts_save_invstd->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + save_invstd = + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + value_map[ts_save_invstd->unique()]->as(); + } - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - Val* eps_ptr = nullptr; - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - if (auto eps = constant_as(node->input(9))) { - eps_ptr = new Double(eps.value()); - } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - eps_ptr = value_map[node->input(7)->unique()]; - } + auto training = constant_as(ts_train); + TORCH_INTERNAL_ASSERT( + training.has_value(), + "The training (bool) parameter is required."); + const bool kTraining = training.value(); - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - auto out_mask_list = constant_as>(node->input(10)); - TORCH_INTERNAL_ASSERT( - out_mask_list.has_value(), - "output mask for batch_norm_backward"); - std::vector output_mask; - for (const auto value : out_mask_list->vec()) { - output_mask.emplace_back(static_cast(value)); - } + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + Val* eps_ptr = nullptr; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + if (auto eps = constant_as(ts_eps)) { + eps_ptr = IrBuilder::create(eps.value()); + } else { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + eps_ptr = value_map[ts_eps->unique()]; + } - // TODO: merge this loop below. - if (kTraining) { - TORCH_INTERNAL_ASSERT( - save_mean != nullptr && save_invstd != nullptr, - "When training=True, save_mean and save_invstd are required."); - } else { - // TODO: this is not a legit assumption? Can't we run with - // track_running_stats == false && training == false - // which should just run through the case above. 
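The batch-norm backward rule above handles two schemas by remapping node inputs onto a common set of roles (ts_input, ts_grad_output, ts_weight, and so on). The index layout it reads can be summarized as a small lookup, sketched below with illustrative names (BatchNormBwdInputIndices and lookupIndices are not nvfuser code; the indices are the ones used in the diff):

#include <iostream>
#include <stdexcept>
#include <string>

// Positions of the common roles in each supported schema, as read by the
// parse rule: input, grad_output, weight, running_mean, running_var,
// save_mean, save_invstd, train, eps, output_mask.
struct BatchNormBwdInputIndices {
  int input, grad_output, weight, running_mean, running_var;
  int save_mean, save_invstd, train, eps, output_mask;
};

BatchNormBwdInputIndices lookupIndices(const std::string& op) {
  if (op == "aten::_batch_norm_impl_index_backward") {
    // input(0) is impl_index and the trailing reservedSpace is discarded.
    return {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  }
  if (op == "aten::native_batch_norm_backward") {
    return {1, 0, 2, 3, 4, 5, 6, 7, 8, 9};
  }
  throw std::runtime_error("unregistered batch-norm backward variant: " + op);
}

int main() {
  const auto idx = lookupIndices("aten::native_batch_norm_backward");
  std::cout << "grad_output at input(" << idx.grad_output << "), "
            << "train flag at input(" << idx.train << ")\n"; // 0 and 7
}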
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + auto out_mask_list = constant_as>(ts_mask); TORCH_INTERNAL_ASSERT( - running_mean != nullptr && running_var != nullptr, - "When training=False, running_mean and running_invstd are required."); - } + out_mask_list.has_value(), + "output mask for batch_norm_backward"); + std::vector output_mask; + for (const auto value : out_mask_list->vec()) { + output_mask.emplace_back(static_cast(value)); + } - auto grads = batch_norm_backward( - input, - grad_out, - weight, - running_mean, - running_var, - save_mean, - save_invstd, - kTraining, - eps_ptr, - output_mask, - format.isChannelsLast()); + // TODO: merge this loop below. + if (kTraining) { + TORCH_INTERNAL_ASSERT( + save_mean != nullptr && save_invstd != nullptr, + "When training=True, save_mean and save_invstd are required."); + } else { + // TODO: this is not a legit assumption? Can't we run with + // track_running_stats == false && training == false + // which should just run through the case above. + TORCH_INTERNAL_ASSERT( + running_mean != nullptr && running_var != nullptr, + "When training=False, running_mean and running_invstd are required."); + } - if (output_mask[0]) { - TORCH_INTERNAL_ASSERT(grads.grad_input != nullptr); - value_map.emplace( - node->output(0)->unique(), - ValueHolder(grads.grad_input, format)); - } else { - TORCH_INTERNAL_ASSERT(grads.grad_input == nullptr); - value_map.emplace( - node->output(0)->unique(), - ValueHolder(TensorViewBuilder().build(), format)); - } + auto grads = batch_norm_backward( + input, + grad_out, + weight, + running_mean, + running_var, + save_mean, + save_invstd, + kTraining, + eps_ptr, + output_mask, + format.isChannelsLast()); - if (output_mask[1]) { - TORCH_INTERNAL_ASSERT(grads.grad_weight != nullptr); - value_map.emplace(node->output(1)->unique(), grads.grad_weight); - } else { - TORCH_INTERNAL_ASSERT(grads.grad_weight == nullptr); - value_map.emplace( - node->output(1)->unique(), TensorViewBuilder().build()); - } + if (output_mask[0]) { + TORCH_INTERNAL_ASSERT(grads.grad_input != nullptr); + value_map.emplace( + node->output(0)->unique(), + ValueHolder(grads.grad_input, format)); + } else { + TORCH_INTERNAL_ASSERT(grads.grad_input == nullptr); + value_map.emplace( + node->output(0)->unique(), + ValueHolder(TensorViewBuilder().build(), format)); + } - if (output_mask[2]) { - TORCH_INTERNAL_ASSERT(grads.grad_bias != nullptr); - value_map.emplace(node->output(2)->unique(), grads.grad_bias); - } else { - TORCH_INTERNAL_ASSERT(grads.grad_bias == nullptr); - value_map.emplace( - node->output(2)->unique(), TensorViewBuilder().build()); - } - }, - [](const Node* node) -> bool { return true; }, - [](const Node* node) -> OperatorType { - return OperatorType::Normalization; - }); + if (output_mask[1]) { + TORCH_INTERNAL_ASSERT(grads.grad_weight != nullptr); + value_map.emplace(node->output(1)->unique(), grads.grad_weight); + } else { + TORCH_INTERNAL_ASSERT(grads.grad_weight == nullptr); + value_map.emplace( + node->output(1)->unique(), TensorViewBuilder().build()); + } + + if (output_mask[2]) { + TORCH_INTERNAL_ASSERT(grads.grad_bias != nullptr); + value_map.emplace(node->output(2)->unique(), grads.grad_bias); + } else { + TORCH_INTERNAL_ASSERT(grads.grad_bias == nullptr); + value_map.emplace( + node->output(2)->unique(), TensorViewBuilder().build()); + } + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(1)->type()->cast())) { + return false; + } + if (node->kind() == + c10::Symbol::fromQualString( 
+ "aten::_batch_norm_impl_index_backward")) { + if (node->inputs()[8]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[10]->node()->kind() != prim::Constant) { + return false; + } + } else if ( + node->kind() == + c10::Symbol::fromQualString( + "aten::native_batch_norm_backward")) { + if (node->inputs()[7]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[9]->node()->kind() != prim::Constant) { + return false; + } + } else { + TORCH_INTERNAL_ASSERT( + false, + "Forgot to update profiled constant check for", + node->kind().toDisplayString()); + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } } { @@ -1701,7 +2142,7 @@ class IrParser { Val* eps_ptr = nullptr; if (auto eps = constant_as(node->input(4))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { eps_ptr = value_map[node->input(4)->unique()]; } @@ -1721,7 +2162,16 @@ class IrParser { } }, // TODO: #ProfileIValue List should update this - [](const Node* node) -> bool { return true; }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, [](const Node* node) -> OperatorType { return OperatorType::Normalization; }); @@ -1819,42 +2269,15 @@ class IrParser { } }, // TODO: #ProfileIValue List should update this - [](const Node* node) -> bool { return true; }, - [](const Node* node) -> OperatorType { - return OperatorType::Normalization; - }); - } - - { - auto ptr_op = getOperatorForLiteral( - "aten::softmax.int(Tensor self, int dim, int? dtype) -> Tensor"); - REGISTER_PARSE_RULE( - ptr_op, - { - MemoryFormat format; - std::list list_val; - std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); - auto input_t = list_val.front(); - list_val.pop_front(); - auto input = input_t->as(); - - auto dim_value = constant_as(node->input(1)); - TORCH_INTERNAL_ASSERT( - dim_value.has_value(), "dim in softmax is not valid"); - - auto output = softmax(input, dim_value.value()); - value_map.emplace(node->output()->unique(), output); - }, [](const Node* node) -> bool { - if (node->inputs()[1]->node()->kind() != prim::Constant) { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { return false; } - // TODO: support dynamic input by profiling it - if (!node->inputs()[2]->type()->isSubtypeOf( - static_cast(NoneType::get())) && - node->inputs()[2]->node()->kind() != prim::Constant) { + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[7]->node()->kind() != prim::Constant) { return false; } return true; @@ -1864,6 +2287,67 @@ class IrParser { }); } + { + std::array SoftmaxFwd = { + "aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", + "aten::log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor"}; + for (auto signature : SoftmaxFwd) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto input_t = list_val.front(); + list_val.pop_front(); + auto input = input_t->as(); + + auto dim_value = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT( + dim_value.has_value(), "dim in softmax is not valid"); + + auto data_type = DataType::Null; + if (const auto opt_ivalue = toIValue(node->input(2))) { + if (!opt_ivalue.value().isNone()) { + data_type = aten_to_data_type(opt_ivalue->toScalarType()); + } + } + + input = (data_type != DataType::Null) + ? optionalCastStrict(data_type, input)->as() + : input; + + bool is_log_softmax = node->kind() == + c10::Symbol::fromQualString("aten::log_softmax"); + + auto output = (is_log_softmax) + ? log_softmax(input, dim_value.value()) + : softmax(input, dim_value.value()); + + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + if (!isScalarTypeCompatible(node, 2)) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } + } + { // LTC uses this op for softmax auto ptr_op = getOperatorForLiteral( "aten::_softmax(Tensor self, int dim, bool half_to_float) -> Tensor"); @@ -1887,6 +2371,10 @@ class IrParser { value_map.emplace(node->output()->unique(), output); }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } if (node->inputs()[1]->node()->kind() != prim::Constant) { return false; } @@ -1911,35 +2399,115 @@ class IrParser { } { - auto ptr_op = getOperatorForLiteral( - "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor"); - REGISTER_PARSE_RULE( - ptr_op, - { - auto grad_output = - value_map[node->input(0)->unique()]->as(); + std::array SoftmaxBwd = { + "aten::_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor", + "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor"}; + for (auto signature : SoftmaxBwd) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()], + value_map[node->inputs()[1]->unique()]); + auto grad_output_t = list_val.front(); + list_val.pop_front(); + auto grad_output = grad_output_t->as(); - auto output = value_map[node->input(1)->unique()]->as(); + auto output_t = list_val.front(); + list_val.pop_front(); + auto output = output_t->as(); - auto dim_value = constant_as(node->input(2)); - TORCH_INTERNAL_ASSERT( - dim_value.has_value(), "dim in softmax is not valid"); + auto dim_value = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + dim_value.has_value(), "dim in softmax is not valid"); - // input_dtype here is ignored! type_inference handles it - auto grad_input = - softmax_backward(grad_output, output, dim_value.value()); + // input_dtype here is ignored! 
type_inference handles it + bool is_log_softmax = node->kind() == + c10::Symbol::fromQualString( + "aten::_log_softmax_backward_data"); + auto grad_input = (is_log_softmax) + ? log_softmax_backward(grad_output, output, dim_value.value()) + : softmax_backward(grad_output, output, dim_value.value()); - value_map.emplace(node->output()->unique(), grad_input); - }, - [](const Node* node) -> bool { - if (node->inputs()[2]->node()->kind() != prim::Constant) { - return false; - } - return true; - }, - [](const Node* node) -> OperatorType { - return OperatorType::Normalization; - }); + value_map.emplace(node->output()->unique(), grad_input); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + if (node->inputs()[3]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } + } + + { + std::array Variance = { + "aten::var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor", + "aten::std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor"}; + for (auto signature : Variance) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto input_t = list_val.front(); + list_val.pop_front(); + auto input = input_t->as(); + + bool is_variance = + node->kind() == c10::Symbol::fromQualString("aten::var"); + + auto dims_list = constant_as>(node->input(1)); + TORCH_INTERNAL_ASSERT( + dims_list.has_value(), "Cannot fuse with dynamic axes"); + std::vector dims; + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + + auto unbiased = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + unbiased.has_value(), "Cannot fuse with dynamic unbiased"); + + auto keepdim = constant_as(node->input(3)); + TORCH_INTERNAL_ASSERT( + keepdim.has_value(), "Cannot fuse with dynamic keepdim"); + + auto output = (is_variance) + ? 
variance(input, dims, unbiased.value(), keepdim.value()) + : standard_deviation( + input, dims, unbiased.value(), keepdim.value()); + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Normalization; + }); + } } { @@ -1961,8 +2529,13 @@ class IrParser { dims_list.has_value(), "aten::sum cannot be fused with dynamic axes"); std::vector dims; - for (const auto dim : dims_list->vec()) { - dims.emplace_back(static_cast(dim)); + if (!dims_list->empty()) { + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + } else { + dims.resize(self->as()->nDims()); + std::iota(dims.begin(), dims.end(), 0); } auto keepdim = constant_as(node->input(2)); TORCH_INTERNAL_ASSERT( @@ -1972,20 +2545,20 @@ class IrParser { value_map.emplace(node->output()->unique(), out); }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } // TODO: support cast of output types if (!node->inputs()[3]->type()->isSubtypeOf( static_cast(NoneType::get()))) { // We can only handle output as half, float, and double; if (const auto opt_ivalue = toIValue(node->input(3))) { const auto scalar_type = opt_ivalue->toScalarType(); - if (scalar_type == at::ScalarType::Double || - scalar_type == at::ScalarType::Float || - scalar_type == at::ScalarType::BFloat16 || - scalar_type == at::ScalarType::Half) { - return true; + if (!at::isFloatingType(scalar_type)) { + return false; } } - return false; } // we don't support dynamic reduction axes; if (node->inputs()[1]->node()->kind() != prim::Constant) { @@ -2021,15 +2594,20 @@ class IrParser { dims_list.has_value(), "aten::mean cannot be fused with dynamic axes"); std::vector dims; - for (const auto dim : dims_list->vec()) { - dims.emplace_back(static_cast(dim)); + if (!dims_list->empty()) { + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + } else { + dims.resize(self->as()->nDims()); + std::iota(dims.begin(), dims.end(), 0); } auto keepdim = constant_as(node->input(2)); TORCH_INTERNAL_ASSERT( keepdim.has_value(), "aten::mean cannot be fused with dynamic keepdim"); auto o_sum = sum(self, dims, keepdim.value()); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(1); for (auto axis : dims) { if (axis < 0) { axis += int(self->nDims()); @@ -2041,20 +2619,20 @@ class IrParser { value_map.emplace(node->output()->unique(), out); }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } // TODO: support cast of output types if (!node->inputs()[3]->type()->isSubtypeOf( static_cast(NoneType::get()))) { // We can only handle output as half, float, and double; if (const auto opt_ivalue = toIValue(node->input(3))) { const auto scalar_type = opt_ivalue->toScalarType(); - if (scalar_type == at::ScalarType::Double || - scalar_type == at::ScalarType::Float || - scalar_type == at::ScalarType::BFloat16 || - scalar_type == at::ScalarType::Half) { - return true; + if (!at::isFloatingType(scalar_type)) { + return false; } } - return false; } // we don't support dynamic reduction axes; if (node->inputs()[1]->node()->kind() != prim::Constant) { @@ -2091,7 +2669,13 @@ class IrParser { size_to.has_value(), "aten::sum cannot be fused with dynamic axes"); if 
(!size_to->empty()) { - auto out = sum_to(self->as(), size_to->vec()); + auto input = self->as(); + auto out = sum_to(input, size_to->vec()); + // this copy is not necessary, but making copy avoids tricky + // computational graph where no-op could be challenging. + if (out == input) { + out = set(input); + } value_map.emplace(node->output()->unique(), out); } else { // We are introducing alias here! @@ -2099,13 +2683,15 @@ class IrParser { } }, [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } // we don't support dynamic reduction axes; if (node->inputs()[1]->node()->kind() != prim::Constant) { return false; } return true; - // auto size_to = constant_as>(node->input(1)); - // return size_to.has_value() && !size_to->empty(); }, [](const Node* node) -> OperatorType { auto size_to = constant_as>(node->input(1)); @@ -2140,7 +2726,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } } @@ -2178,7 +2764,20 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + // we do not support explicit memory_format on output + if (!node->inputs()[4]->type()->isSubtypeOf( + static_cast(NoneType::get()))) { + return false; + } + return true; + }, nullptr); } @@ -2207,7 +2806,7 @@ class IrParser { value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } @@ -2225,12 +2824,8 @@ class IrParser { TORCH_INTERNAL_ASSERT(false, "not implemented yet"); }, [](const Node* node) -> bool { - // We only profile `linear` layer with bias. - if (node->input(2)->type()->isSubtypeOf( - static_cast(NoneType::get()))) { - return false; - } - return true; + // We only profile `linear` layer but not fusing it. + return false; }); } @@ -2250,7 +2845,7 @@ class IrParser { } else { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); @@ -2268,12 +2863,13 @@ class IrParser { node->output()->unique(), ValueHolder(out, format)); } }, - nullptr, + isInputNonSizeZeroTensor, nullptr); } { - auto ptr_op = getOperatorForLiteral("aten::gelu(Tensor self) -> Tensor"); + auto ptr_op = getOperatorForLiteral( + "aten::gelu(Tensor self, *, str approximate='none') -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { @@ -2281,117 +2877,364 @@ class IrParser { std::list list_val; std::tie(format, list_val) = getConsistentValues( c10::nullopt, value_map[node->inputs()[0]->unique()]); - auto self = list_val.front(); + auto self = list_val.front()->as(); list_val.pop_front(); - auto out = gelu(self); + + auto approximate = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT( + approximate.has_value(), + "The approximate parameter is required."); + const auto kTanhGelu = + at::native::get_gelutype_enum(approximate.value()) == + at::native::GeluType::Tanh; + + auto out = (kTanhGelu) ? 
tanh_gelu(self) : gelu(self); value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, - nullptr, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->input(1)->node()->kind() != prim::Constant) { + return false; + } + return true; + }, nullptr); } { auto ptr_op = getOperatorForLiteral( - "aten::gelu_backward(Tensor grad, Tensor self) -> Tensor"); + "aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { MemoryFormat format; std::list list_val; - std::tie(format, list_val) = getConsistentValues( + std::tie(format, list_val) = getPWFormatValues( c10::nullopt, value_map[node->inputs()[0]->unique()], value_map[node->inputs()[1]->unique()]); - auto grad_out = list_val.front(); + auto grad_out = list_val.front()->as(); list_val.pop_front(); - auto self = list_val.front(); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto approximate = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + approximate.has_value(), + "The approximate parameter is required."); + const auto kTanhGelu = + at::native::get_gelutype_enum(approximate.value()) == + at::native::GeluType::Tanh; + + auto grad_in = (kTanhGelu) ? tanh_gelu_backward(grad_out, self) + : gelu_backward(grad_out, self); + value_map.emplace( + node->output()->unique(), ValueHolder(grad_in, format)); + }, + [](const Node* node) -> bool { + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->input(2)->node()->kind() != prim::Constant) { + return false; + } + return true; + }, + nullptr); + } + + { + auto ptr_op = getOperatorForLiteral( + "aten::tanh_backward(Tensor grad_output, Tensor output) -> Tensor"); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getPWFormatValues( + c10::nullopt, + value_map[node->inputs()[0]->unique()], + value_map[node->inputs()[1]->unique()]); + auto grad_out = list_val.front()->as(); + list_val.pop_front(); + auto self = list_val.front()->as(); list_val.pop_front(); - auto grad_in = gelu_backward(grad_out, self); + auto grad_in = tanh_backward(grad_out, self); value_map.emplace( node->output()->unique(), ValueHolder(grad_in, format)); }, - nullptr, + isInputNonSizeZeroTensor, + nullptr); + } + + { + std::array BinaryFloatOp = { + "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor", + "aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor"}; + for (auto signature : BinaryFloatOp) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto self = list_val.front(); + list_val.pop_front(); + auto dims_list = constant_as>(node->input(1)); + TORCH_INTERNAL_ASSERT( + dims_list.has_value(), + "aten::amax/amin cannot be fused with dynamic axes"); + std::vector dims; + if (!dims_list->empty()) { + for (const auto dim : dims_list->vec()) { + dims.emplace_back(static_cast(dim)); + } + } else { + dims.resize(self->as()->nDims()); + std::iota(dims.begin(), dims.end(), 0); + } + auto keepdim = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + keepdim.has_value(), + "aten::amax/amin cannot be fused with dynamic keepdim"); + + TensorView* out = nullptr; + if (node->kind() == c10::Symbol::fromQualString("aten::amax")) { + out = max(self->as(), dims, 
keepdim.value()); + } else if ( + node->kind() == c10::Symbol::fromQualString("aten::amin")) { + out = min(self->as(), dims, keepdim.value()); + } else { + TORCH_INTERNAL_ASSERT( + false, "unrecognized operation in aten::amax/amin"); + } + value_map.emplace(node->output()->unique(), out); + }, + [](const Node* node) -> bool { + if (isReductionNonCompatibleTensor( + node->input(0)->type()->cast())) { + return false; + } + // we don't support dynamic reduction axes; + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + // we don't support dynamic keepdim yet; + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, + [](const Node* node) -> OperatorType { + return OperatorType::Reduction; + }); + } + } + + { + std::array ViewOps = { + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor", + "prim::view_copy(Tensor self, int[] size) -> Tensor"}; + for (auto signature : ViewOps) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + auto self_value = node->inputs()[0]; + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + + auto view_sizes = constant_as>(node->input(1)); + TORCH_INTERNAL_ASSERT( + view_sizes.has_value(), "The size parameter is required."); + + auto output = view(self, self_sizes, view_sizes->vec()); + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + auto self_value = node->inputs()[0]; + auto tensor_type = self_value->type()->cast(); + if (tensor_type == nullptr) { + return false; + } + if (!tensor_type->sizes().concrete_sizes().has_value()) { + // Shape information for input tensor is required. + return false; + } + + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + // Reject fusing node if view_sizes contains an inferred dimension + auto view_sizes = constant_as>(node->input(1)); + if (!view_sizes.has_value()) { + // The size parameter is required. 
+ return false; + } + + for (auto axis_size : view_sizes->vec()) { + if (axis_size == -1) { + return false; + } + } + return true; + }, + nullptr); + } + } + + { + auto flatten_op = getOperatorForLiteral( + "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor"); + REGISTER_PARSE_RULE( + flatten_op, + { + auto self_value = node->inputs()[0]; + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto start_dim_value = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT( + start_dim_value.has_value(), "start_dim is not valid"); + auto end_dim_value = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + end_dim_value.has_value(), "end_dim is not valid"); + + TensorView* output = + flatten(self, start_dim_value.value(), end_dim_value.value()); + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + // we don't support dynamic start_dim; + if (node->inputs()[1]->node()->kind() != prim::Constant) { + return false; + } + // we don't support dynamic end_dim yet; + if (node->inputs()[2]->node()->kind() != prim::Constant) { + return false; + } + return true; + }, nullptr); } { - auto ptr_op = getOperatorForLiteral( - "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor"); + auto ptr_op = + getOperatorForLiteral("prim::squeeze_copy(Tensor self) -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { + auto self_value = node->inputs()[0]; MemoryFormat format; std::list list_val; std::tie(format, list_val) = getConsistentValues( - MemoryFormat::Contiguous(), - value_map[node->inputs()[0]->unique()]); - auto self = list_val.front(); + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); list_val.pop_front(); - auto dims_list = constant_as>(node->input(1)); - TORCH_INTERNAL_ASSERT( - dims_list.has_value(), - "aten::amax cannot be fused with dynamic axes"); - std::vector dims; - for (const auto dim : dims_list->vec()) { - dims.emplace_back(static_cast(dim)); - } - auto keepdim = constant_as(node->input(2)); - TORCH_INTERNAL_ASSERT( - keepdim.has_value(), - "aten::amax cannot be fused with dynamic keepdim"); - auto out = max(self->as(), dims, keepdim.value()); - value_map.emplace(node->output()->unique(), out); + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + + TensorView* output = nullptr; + if (self_sizes.empty()) { + // squeeze on scalar tensor should just return itself; + output = set(self); + } else { + output = squeeze(self, self_sizes); + } + value_map.emplace(node->output()->unique(), output); }, [](const Node* node) -> bool { - // we don't support dynamic reduction axes; - if (node->inputs()[1]->node()->kind() != prim::Constant) { + // Shape information for input tensor is required. 
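// A standalone sketch (not part of the patch): the prim::flatten_copy rule
// registered above only fuses when start_dim and end_dim are compile-time
// constants, because the fused kernel needs the resulting shape up front.
// The helper below models that shape arithmetic on plain size vectors; the
// name flattenedShape is illustrative and does not exist in nvfuser.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

std::vector<int64_t> flattenedShape(
    const std::vector<int64_t>& sizes,
    int64_t start_dim,
    int64_t end_dim) {
  const auto rank = static_cast<int64_t>(sizes.size());
  // aten-style wrap-around for negative dimensions.
  if (start_dim < 0) {
    start_dim += rank;
  }
  if (end_dim < 0) {
    end_dim += rank;
  }
  // Keep the leading dims, collapse [start_dim, end_dim] into one product,
  // then keep the trailing dims.
  std::vector<int64_t> out(sizes.begin(), sizes.begin() + start_dim);
  out.push_back(std::accumulate(
      sizes.begin() + start_dim,
      sizes.begin() + end_dim + 1,
      int64_t{1},
      std::multiplies<int64_t>()));
  out.insert(out.end(), sizes.begin() + end_dim + 1, sizes.end());
  return out;
}
// Example: flattenedShape({2, 3, 4, 5}, 1, 2) returns {2, 12, 5}.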
+ auto self_value = node->inputs()[0]; + auto tensor_type = self_value->type()->cast(); + if (tensor_type == nullptr) { return false; } - // we don't support dynamic keepdim yet; - if (node->inputs()[2]->node()->kind() != prim::Constant) { + if (!isInputNonSizeZeroTensor(node)) { return false; } - return true; + return tensor_type->sizes().concrete_sizes().has_value(); }, - [](const Node* node) -> OperatorType { - return OperatorType::Reduction; - }); + nullptr); } - /* - // TODO: Enable view in parser by detecting non-alias view operation { - std::array View = { - "aten::view(Tensor(a) self, int[] size) -> Tensor(a)", - "aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)"}; - for (auto signature : View) { + std::array AliasOpWithDim = { + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor", + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor"}; + for (auto signature : AliasOpWithDim) { auto ptr_op = getOperatorForLiteral(signature); REGISTER_PARSE_RULE( ptr_op, { auto self_value = node->inputs()[0]; - auto self = value_map[self_value->unique()]->as(); - - auto self_type = self_value->type()->cast(); - TORCH_INTERNAL_ASSERT(self_type != nullptr); - auto self_sizes = getTensorSizes(self_type); + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); - auto size_optional = - constant_as>(node->input(1)); - TORCH_INTERNAL_ASSERT( - size_optional.has_value(), "The size parameter is required."); + auto dim_value = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT(dim_value.has_value(), "dim is not valid"); - auto output = view(self, self_sizes, size_optional->vec()); + TensorView* output = nullptr; + if (node->kind() == prim::unsqueeze_copy) { + output = unsqueeze(self, dim_value.value()); + } else { + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + if (self_sizes.empty()) { + // squeeze on scalar tensor should just return itself; + output = set(self); + } else { + output = squeeze(self, self_sizes, dim_value.value()); + } + } value_map.emplace(node->output()->unique(), output); }, - nullptr, + [](const Node* node) -> bool { + // Shape information for input tensor is required. 
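// A standalone sketch (not part of the patch): the squeeze_copy and
// unsqueeze_copy rules above require concrete input sizes because squeeze
// only drops dimensions whose static extent is 1, and a rank-0 input is
// passed through via set(). The helpers below model that behaviour on plain
// size vectors; squeezeShape and unsqueezeShape are illustrative names only.
#include <cstdint>
#include <vector>

std::vector<int64_t> squeezeShape(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> out;
  for (int64_t s : sizes) {
    if (s != 1) {
      out.push_back(s);  // only statically known size-1 dims are removed
    }
  }
  return out;  // an empty (rank-0) input stays empty, mirroring set(self)
}

std::vector<int64_t> unsqueezeShape(std::vector<int64_t> sizes, int64_t dim) {
  if (dim < 0) {
    dim += static_cast<int64_t>(sizes.size()) + 1;  // aten-style wrap-around
  }
  sizes.insert(sizes.begin() + dim, 1);  // insert a broadcast-like size-1 dim
  return sizes;
}
// Example: squeezeShape({1, 4, 1, 3}) -> {4, 3}; unsqueezeShape({4, 3}, -1) -> {4, 3, 1}.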
+ auto self_value = node->inputs()[0]; + auto tensor_type = self_value->type()->cast(); + if (tensor_type == nullptr) { + return false; + } + if (!isInputNonSizeZeroTensor(node)) { + return false; + } + if (node->input(1)->node()->kind() != prim::Constant) { + return false; + } + auto optional_sizes = tensor_type->sizes().concrete_sizes(); + return tensor_type->sizes().concrete_sizes().has_value(); + }, nullptr); } } - */ } void processJitNode(const JitOp* node) { @@ -2425,9 +3268,9 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Double(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Double(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; @@ -2436,9 +3279,9 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Int(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Int(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; @@ -2447,21 +3290,31 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Bool(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Bool(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; - } else if (val->type()->isSubtypeOf( - static_cast(NoneType::get()))) { + } else if ( + val->type()->isSubtypeOf( + static_cast(StringType::get())) || + val->type()->isSubtypeOf( + static_cast(DeviceObjType::get())) || + val->type()->isSubtypeOf(static_cast(NoneType::get()))) { // TODO: should we consider adding support for NoneType; + // Note: String/Device scalars are only used in parsing rules, do not + // register string with codegen IR. return true; } else if (val->type()->cast()) { // TODO: we don't support list type in codegen yet; // This is a WAR to allow axes of reduction to be passed as constant list; // We simply ignore conversion if the scalar value is a constant; - return toIValue(val).has_value(); + auto ivalue = toIValue(val); + TORCH_INTERNAL_ASSERT( + ivalue.has_value(), + "List[T] is not supported as an argument by NvFuser. Use a Constant List."); + return true; } return false; } @@ -2521,7 +3374,6 @@ class IrParser { nhwc_stride_vec[i]->stride_index_ = n_dim - i - 1; } - // auto updated_tensor_type = c10::TensorType::create( tensor_type = c10::TensorType::create( tensor_type->scalarType(), tensor_type->device(), @@ -2531,7 +3383,10 @@ class IrParser { tensor_type->undefined()); } - cg_val = new TensorView(tensor_type); + cg_val = IrBuilder::create(tensor_type); + if (is_cpu_scalar(*tensor_type)) { + cg_val->as()->setCpuScalar(true); + } value_map_.emplace(val->unique(), ValueHolder(cg_val, format)); return true; } @@ -2544,6 +3399,8 @@ class IrParser { std::unordered_map value_map_; static std::unordered_set parser_symbol_set_; + static std::unordered_set parser_skip_set_; + static std::mutex parser_mutex_; // parsing rule registry. 
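// A standalone sketch (not part of the patch): the parser members added
// around here (parser_skip_set_, parser_mutex_, and the once_flag_ that
// replaces the old init_registry_ bool just below) give the rule registry
// thread-safe one-time initialization plus a runtime-mutable skip list.
// ParseRuleRegistry and its methods are illustrative names only.
#include <mutex>
#include <string>
#include <unordered_set>

class ParseRuleRegistry {
 public:
  static ParseRuleRegistry& get() {
    static ParseRuleRegistry instance;
    // Rules are registered exactly once, even with concurrent first callers.
    std::call_once(init_flag_, [&instance] { instance.registerRules(); });
    return instance;
  }

  bool isSkipped(const std::string& symbol) {
    // The skip set can change at runtime, so it is guarded by its own mutex.
    std::lock_guard<std::mutex> guard(mutex_);
    return skip_set_.count(symbol) != 0;
  }

  void toggleSkip(const std::string& symbol) {
    // Mirrors the "flip" behaviour of a skip-node toggle: erase if present,
    // otherwise insert.
    std::lock_guard<std::mutex> guard(mutex_);
    if (!skip_set_.erase(symbol)) {
      skip_set_.insert(symbol);
    }
  }

 private:
  void registerRules() { /* populate the parse-rule tables exactly once */ }

  static std::once_flag init_flag_;
  std::mutex mutex_;
  std::unordered_set<std::string> skip_set_;
};

std::once_flag ParseRuleRegistry::init_flag_;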
static std::unordered_map @@ -2554,16 +3411,18 @@ class IrParser { cached_registry_lookup_; // NOLINT // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) - static bool init_registry_; + static std::once_flag once_flag_; }; std::unordered_set IrParser::parser_symbol_set_; // NOLINT +std::unordered_set IrParser::parser_skip_set_; // NOLINT +std::mutex IrParser::parser_mutex_; std::unordered_map IrParser::jit_operator_registry_; // NOLINT std::unordered_map IrParser::cached_registry_lookup_; // NOLINT // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -bool IrParser::init_registry_ = true; +std::once_flag IrParser::once_flag_; ProfileIValueOp* insertProfileIValueOp( Node* node, @@ -2576,7 +3435,7 @@ ProfileIValueOp* insertProfileIValueOp( return pn; } -void profileSize(ProfilingRecord* pr, Node* node, size_t offset) { +void profileReductionSize(ProfilingRecord* pr, Node* node, size_t offset) { auto pn = insertProfileIValueOp(node, offset, pr); const auto ivalue_profiler = [pr, pn](Stack& stack) { @@ -2596,20 +3455,76 @@ void profileSize(ProfilingRecord* pr, Node* node, size_t offset) { size_vec.clear(); } else { TORCH_INTERNAL_ASSERT( - false, "profileSize does not support data type: ", value.tagKind()); + false, + "profileReductionSize does not support data type: ", + value.tagKind()); + } + // We stop profiling when it has failed + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(reductionSizeAttr)) { + pn->is_(reductionSizeAttr, size_vec); + } else { + auto profiled_ints = pn->is(reductionSizeAttr); + if (profiled_ints.size() != size_vec.size() || + !std::equal( + profiled_ints.begin(), profiled_ints.end(), size_vec.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(reductionSizeAttr); + } + } + } else { + TORCH_INTERNAL_ASSERT( + !pn->hasAttribute(reductionSizeAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } - if (!pn->hasAttribute(sizeAttr)) { - pn->is_(sizeAttr, size_vec); + push(stack, value); + }; + pn->setCallback(ivalue_profiler); +} + +void profileViewSize(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + TORCH_INTERNAL_ASSERT( + value.isIntList(), "profiling seeing the wrong data type"); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(viewSizeAttr)) { + pn->is_(viewSizeAttr, value.toIntVector()); + } else { + auto profiled_ints = pn->is(viewSizeAttr); + auto input_ints = value.toIntList(); + if (profiled_ints.size() != input_ints.size() || + !std::equal( + profiled_ints.begin(), + profiled_ints.end(), + input_ints.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(viewSizeAttr); + } + } } else { - auto profiled_ints = pn->is(sizeAttr); TORCH_INTERNAL_ASSERT( - profiled_ints.size() == size_vec.size() && - std::equal( - profiled_ints.begin(), profiled_ints.end(), size_vec.begin()), - "profiling ivalue doesn't support 
merge"); + !pn->hasAttribute(viewSizeAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; + pn->setCallback(ivalue_profiler); } @@ -2627,18 +3542,67 @@ void profileIntList(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isIntList(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(intListAttr)) { - pn->is_(intListAttr, value.toIntVector()); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(intListAttr)) { + pn->is_(intListAttr, value.toIntVector()); + } else { + auto profiled_ints = pn->is(intListAttr); + auto input_ints = value.toIntList(); + if (profiled_ints.size() != input_ints.size() || + !std::equal( + profiled_ints.begin(), + profiled_ints.end(), + input_ints.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(intListAttr); + } + } } else { - auto profiled_ints = pn->is(intListAttr); - auto input_ints = value.toIntList(); TORCH_INTERNAL_ASSERT( - profiled_ints.size() == input_ints.size() && - std::equal( - profiled_ints.begin(), - profiled_ints.end(), - input_ints.begin()), - "profiling ivalue doesn't support merge"); + !pn->hasAttribute(intListAttr), + "profiled attribute should have been removed when profiling is marked as failed"); + } + push(stack, value); + }; + + pn->setCallback(ivalue_profiler); +} + +void profileString(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + TORCH_INTERNAL_ASSERT( + value.isString(), "profiling seeing the wrong data type"); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(strAttr)) { + pn->s_(strAttr, value.toStringRef()); + } else { + const auto& profiled_str = pn->s(strAttr); + const auto& input_str = value.toStringRef(); + if (input_str != profiled_str) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(strAttr); + } + } + } else { + TORCH_INTERNAL_ASSERT( + !pn->hasAttribute(strAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2660,14 +3624,24 @@ void profileBool(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isBool(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(boolAttr)) { - pn->i_(boolAttr, value.toBool()); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(boolAttr)) { + pn->i_(boolAttr, value.toBool()); + } else { + auto profiled_bool = pn->i(boolAttr); + auto input_bool = value.toBool(); + if (input_bool != profiled_bool) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(boolAttr); + } + } } else { - auto profiled_bool = pn->i(boolAttr); - auto input_bool = value.toBool(); TORCH_INTERNAL_ASSERT( - input_bool == profiled_bool, - 
"profiling ivalue doesn't support merge"); + !pn->hasAttribute(boolAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2689,13 +3663,61 @@ void profileInt(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isInt(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(intAttr)) { - pn->i_(intAttr, value.toInt()); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(intAttr)) { + pn->i_(intAttr, value.toInt()); + } else { + auto profiled_int = pn->i(intAttr); + auto input_int = value.toInt(); + if (input_int != profiled_int) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(intAttr); + } + } + } else { + TORCH_INTERNAL_ASSERT( + !pn->hasAttribute(intAttr), + "profiled attribute should have been removed when profiling is marked as failed"); + } + push(stack, value); + }; + + pn->setCallback(ivalue_profiler); +} + +// profile ivalue, used for optional arguments +void profileIval(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(ivalAttr)) { + pn->ival_(ivalAttr, value); + } else { + auto profiled_ival = pn->ival(ivalAttr); + if (value != profiled_ival) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(ivalAttr); + } + } } else { - auto profiled_int = pn->i(intAttr); - auto input_int = value.toInt(); TORCH_INTERNAL_ASSERT( - input_int == profiled_int, "profiling ivalue doesn't support merge"); + !pn->hasAttribute(ivalAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2717,20 +3739,30 @@ void profileBoolList(ProfilingRecord* pr, Node* node, size_t offset) { pop(stack, value); TORCH_INTERNAL_ASSERT( value.isBoolList(), "profiling seeing the wrong data type"); - if (!pn->hasAttribute(boolListAttr)) { - auto list = value.toBoolList(); - std::vector val(list.begin(), list.end()); - pn->is_(boolListAttr, val); + if (!pn->hasAttribute(profileFailedAttr)) { + if (!pn->hasAttribute(boolListAttr)) { + auto list = value.toBoolList(); + std::vector val(list.begin(), list.end()); + pn->is_(boolListAttr, val); + } else { + auto profiled_ints = pn->is(boolListAttr); + auto input_bools = value.toBoolList(); + if (profiled_ints.size() != input_bools.size() || + !std::equal( + input_bools.begin(), + input_bools.end(), + profiled_ints.begin())) { + TORCH_WARN( + __FUNCTION__, + " sees varying value in profiling, ignoring and this should be handled by GUARD logic"); + pn->s_(profileFailedAttr, "varying profile values"); + pn->removeAttribute(boolListAttr); + } + } } else { - auto profiled_ints = pn->is(boolListAttr); - auto input_bools = value.toBoolList(); TORCH_INTERNAL_ASSERT( - profiled_ints.size() == input_bools.size() && - std::equal( - input_bools.begin(), - input_bools.end(), - profiled_ints.begin()), - 
"profiling ivalue doesn't support merge"); + !pn->hasAttribute(boolListAttr), + "profiled attribute should have been removed when profiling is marked as failed"); } push(stack, value); }; @@ -2788,6 +3820,11 @@ bool shouldProfileNode(const Node* node) { return IrParser::lookupInSymbolSet(node); } +bool skipNodeKind(const std::string& symbol_str, bool flip) { + return IrParser::querySkipSymbolSet( + c10::Symbol::fromQualString(symbol_str), flip); +} + bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { // is skip constant necessary? if (node->input(offset)->node()->kind() == prim::Constant) { @@ -2798,23 +3835,11 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { getOperatorForLiteral( "aten::dropout(Tensor input, float p, bool train) -> Tensor") ->schema(); - if (node->matches(dropout_schema)) { - switch (offset) { - // argument 2: Is training? - case 2: - profileBool(pr, node, offset); - break; - default: - return false; - } - return true; - } - static auto native_dropout_schema = getOperatorForLiteral( "aten::native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)") ->schema(); - if (node->matches(native_dropout_schema)) { + if (node->matches(dropout_schema) || node->matches(native_dropout_schema)) { switch (offset) { // argument 2: Is training? case 2: @@ -2830,7 +3855,11 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { getOperatorForLiteral( "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor") ->schema(); - if (node->matches(amax_schema)) { + static auto amin_schema = + getOperatorForLiteral( + "aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor") + ->schema(); + if (node->matches(amax_schema) || node->matches(amin_schema)) { switch (offset) { // argument 1: reduction axes; case 1: @@ -2880,7 +3909,7 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { // argument 1: reduction sizes; case 1: // TODO(profile_size): double check optional[size]? 
- profileSize(pr, node, offset); + profileReductionSize(pr, node, offset); break; default: return false; @@ -2888,28 +3917,74 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { return true; } - /* - // TODO: Enable view in parser by detecting non-alias view operation - static auto view_schema = + static auto reshape_schema = + getOperatorForLiteral("aten::reshape(Tensor self, int[] shape) -> Tensor") + ->schema(); + static auto reshape_copy_schema = getOperatorForLiteral( - "aten::view(Tensor(a) self, int[] size) -> Tensor(a)") + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor") ->schema(); - static auto reshape_schema = + static auto view_schema = + getOperatorForLiteral("aten::view(Tensor self, int[] size) -> Tensor") + ->schema(); + static auto view_copy_schema = getOperatorForLiteral( - "aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)") + "prim::view_copy(Tensor self, int[] size) -> Tensor") ->schema(); - if (node->matches(view_schema) || node->matches(reshape_schema)) { + if (node->matches(reshape_schema) || node->matches(reshape_copy_schema) || + node->matches(view_schema) || node->matches(view_copy_schema)) { switch (offset) { // argument 1: new tensor size; case 1: - profileSize(pr, node, offset); + profileViewSize(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto flatten_schema1 = + getOperatorForLiteral( + "aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> Tensor") + ->schema(); + static auto flatten_schema2 = + getOperatorForLiteral( + "prim::flatten_copy(Tensor self, int start_dim, int end_dim) -> Tensor") + ->schema(); + if (node->matches(flatten_schema1) || node->matches(flatten_schema2)) { + switch (offset) { + // argument 1: start_dim; + // argument 2: end_dim; + case 1: + case 2: + profileInt(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto squeeze_dim_schema = + getOperatorForLiteral( + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor") + ->schema(); + static auto unsqueeze_schema = + getOperatorForLiteral( + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor") + ->schema(); + if (node->matches(squeeze_dim_schema) || node->matches(unsqueeze_schema)) { + switch (offset) { + // argument 1: unsqueeze dim; + case 1: + profileInt(pr, node, offset); break; default: return false; } return true; } - */ static auto batch_norm_impl_index_schema = getOperatorForLiteral( @@ -2941,6 +4016,38 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { return true; } + static auto gelu_schema = + getOperatorForLiteral( + "aten::gelu(Tensor self, *, str approximate='none') -> Tensor") + ->schema(); + if (node->matches(gelu_schema)) { + switch (offset) { + // argument 1: approximate; + case 1: + profileString(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto gelu_backward_schema = + getOperatorForLiteral( + "aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor") + ->schema(); + if (node->matches(gelu_backward_schema)) { + switch (offset) { + // argument 2: approximate; + case 2: + profileString(pr, node, offset); + break; + default: + return false; + } + return true; + } + static auto native_layer_norm_schema = getOperatorForLiteral( "aten::native_layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps) -> (Tensor, Tensor, Tensor)") @@ -2982,6 +4089,26 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { return true; } + static auto batch_norm_backward_schema = + getOperatorForLiteral( + "aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)") + ->schema(); + if (node->matches(batch_norm_backward_schema)) { + switch (offset) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + case 7: // argument 8: training; + profileBool(pr, node, offset); + break; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) + case 9: + profileBoolList(pr, node, offset); + break; + default: + return false; + } + return true; + } + static auto native_layer_norm_backward_schema = getOperatorForLiteral( "aten::native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)") @@ -3015,12 +4142,39 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { } } + static auto log_softmax_data_schema = + getOperatorForLiteral( + "aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor") + ->schema(); + static auto softmax_data_schema = + getOperatorForLiteral( + "aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor") + ->schema(); + if (node->matches(log_softmax_data_schema) || + node->matches(softmax_data_schema)) { + switch (offset) { + case 2: + profileIval(pr, node, offset); + return true; + default: + return false; + } + } + + static auto log_softmax_backward_data_schema = + getOperatorForLiteral( + "aten::_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor") + ->schema(); static auto softmax_backward_data_schema = getOperatorForLiteral( "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor") ->schema(); - if (node->matches(softmax_backward_data_schema)) { + if (node->matches(log_softmax_backward_data_schema) || + node->matches(softmax_backward_data_schema)) { switch (offset) { + case 2: + profileInt(pr, node, offset); + return true; case 3: profileInt(pr, node, offset); return true; diff --git a/torch/csrc/jit/codegen/cuda/parser.h b/torch/csrc/jit/codegen/cuda/parser.h index 4b2fcf50f992..ddfbf7762742 100644 --- a/torch/csrc/jit/codegen/cuda/parser.h +++ b/torch/csrc/jit/codegen/cuda/parser.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -44,6 +44,8 @@ TORCH_CUDA_CU_API bool isElementWiseNode(const Node* node); TORCH_CUDA_CU_API bool isNodeParsible(const Node* node); TORCH_CUDA_CU_API bool shouldProfileNode(const Node* node); +TORCH_CUDA_CU_API bool skipNodeKind(const std::string& symbol_str, bool flip); + void InsertProfileNodes(ProfilingRecord* pr); // lowers PyTorch jit graph to `Fusion`. 
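// A standalone sketch (not part of the patch): every profile* callback above
// follows the same scheme: record the first observed value in a node
// attribute, and on any later mismatch warn, set profileFailedAttr, and drop
// the recorded attribute so a runtime GUARD takes over. ProfiledValue below
// is an illustrative reduction of that state machine, not an nvfuser type.
#include <iostream>
#include <optional>

template <typename T>
class ProfiledValue {
 public:
  void observe(const T& value) {
    if (failed_) {
      return;  // once failed, stay failed; no recorded attribute is kept
    }
    if (!recorded_.has_value()) {
      recorded_ = value;  // first observation: record it
    } else if (*recorded_ != value) {
      // Varying values across runs: give up on profiling this argument and
      // defer the decision to a runtime guard instead.
      failed_ = true;
      recorded_.reset();
    }
  }

  // Usable for specialization only if profiling never saw a conflict.
  std::optional<T> stableValue() const { return recorded_; }

 private:
  std::optional<T> recorded_;
  bool failed_ = false;
};

int main() {
  ProfiledValue<int> dim;
  dim.observe(1);
  dim.observe(1);
  std::cout << dim.stableValue().has_value() << "\n";  // 1: stable so far
  dim.observe(2);
  std::cout << dim.stableValue().has_value() << "\n";  // 0: varying, failed
}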
diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp b/torch/csrc/jit/codegen/cuda/partial_split_map.cpp
index e7b6db4d165f..dd8fb05a0493 100644
--- a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp
+++ b/torch/csrc/jit/codegen/cuda/partial_split_map.cpp
@@ -8,11 +8,10 @@ namespace fuser {
 namespace cuda {
 
 void PartialSplitMap::build(Fusion* fusion) {
-  const auto gpu_lower = GpuLower::current();
   auto used_vals = ir_utils::allTvs(fusion);
 
   for (auto tv : ir_utils::filterByType(used_vals)) {
-    auto exprs = ExprSort::getExprs(
+    auto exprs = StmtSort::getExprs(
         fusion, {tv->domain()->domain().begin(), tv->domain()->domain().end()});
     for (auto split : ir_utils::filterByType(exprs)) {
       // Only needs to check root domains as partial split is only
@@ -24,18 +23,10 @@ void PartialSplitMap::build(Fusion* fusion) {
         continue;
       }
       auto root_domain = split->in();
-      auto kir_root_domain =
-          gpu_lower->lowerValue(split->in())->as();
       auto start_offset = split->startOffset();
       start_offset_map_.insert({root_domain, start_offset});
-      kir_start_offset_map_.insert(
-          {kir_root_domain,
-           gpu_lower->lowerValue(start_offset)->as()});
       auto stop_offset = split->stopOffset();
       stop_offset_map_.insert({root_domain, stop_offset});
-      kir_stop_offset_map_.insert(
-          {kir_root_domain,
-           gpu_lower->lowerValue(stop_offset)->as()});
     }
   }
 }
@@ -49,15 +40,6 @@ Val* PartialSplitMap::getStartOffset(IterDomain* root_domain) const {
   }
 }
 
-kir::Val* PartialSplitMap::getStartOffset(kir::IterDomain* root_domain) const {
-  auto it = kir_start_offset_map_.find(root_domain);
-  if (it == kir_start_offset_map_.end()) {
-    return nullptr;
-  } else {
-    return it->second;
-  }
-}
-
 Val* PartialSplitMap::getStopOffset(IterDomain* root_domain) const {
   auto it = stop_offset_map_.find(root_domain);
   if (it == stop_offset_map_.end()) {
@@ -67,15 +49,6 @@ Val* PartialSplitMap::getStopOffset(IterDomain* root_domain) const {
   }
 }
 
-kir::Val* PartialSplitMap::getStopOffset(kir::IterDomain* root_domain) const {
-  auto it = kir_stop_offset_map_.find(root_domain);
-  if (it == kir_stop_offset_map_.end()) {
-    return nullptr;
-  } else {
-    return it->second;
-  }
-}
-
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.h b/torch/csrc/jit/codegen/cuda/partial_split_map.h
index be432bd5a161..8ec489915b79 100644
--- a/torch/csrc/jit/codegen/cuda/partial_split_map.h
+++ b/torch/csrc/jit/codegen/cuda/partial_split_map.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include
+#include
 
 #include
 #include
@@ -20,15 +20,11 @@ class TORCH_CUDA_CU_API PartialSplitMap {
   void build(Fusion* fusion);
 
   Val* getStartOffset(IterDomain* root_domain) const;
-  kir::Val* getStartOffset(kir::IterDomain* root_domain) const;
   Val* getStopOffset(IterDomain* root_domain) const;
-  kir::Val* getStopOffset(kir::IterDomain* root_domain) const;
 
  private:
   std::unordered_map start_offset_map_;
-  std::unordered_map kir_start_offset_map_;
   std::unordered_map stop_offset_map_;
-  std::unordered_map kir_stop_offset_map_;
 };
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/partition.cpp b/torch/csrc/jit/codegen/cuda/partition.cpp
index 004c836ec4ed..7e2c04b56c56 100644
--- a/torch/csrc/jit/codegen/cuda/partition.cpp
+++ b/torch/csrc/jit/codegen/cuda/partition.cpp
@@ -5,12 +5,16 @@
 #include
 #include
 #include
+#include
+#include
 
 namespace torch {
 namespace jit {
 namespace fuser {
 namespace cuda {
 
+const c10::DeviceIndex INVALID_INDEX = -2;
+
 namespace {
 
 bool hasNonElementWiseOperation(const Node* node) {
@@ -38,26 +42,109 @@ static
c10::optional getDevice(const Value* value) { // not tensor type, return false as the op is not outputing scalar. return c10::nullopt; } - return value->type()->expectRef().device(); + auto tensor_type = value->type()->expectRef(); + // special case for scalar tensor: return c10::nullopt instead of cpu device. + // this allows us to fuse scalar cpu tensor with cuda tensor, while avoid + // merging ops with pure scalar cpu tensors. + if (is_cpu_scalar(tensor_type)) { + return c10::nullopt; + } + return tensor_type.device(); +} + +static bool hasBfloat(const Node* node) { + auto has_bfloat = [](const Value* value) { + if (!value->type()->isSubtypeOf(*TensorType::get())) { + return false; + } + auto opt_scalar_type = value->type()->expectRef().scalarType(); + if (opt_scalar_type.has_value() && + opt_scalar_type.value() == at::ScalarType::BFloat16) { + return true; + } + return false; + }; + + if (std::any_of(node->inputs().begin(), node->inputs().end(), has_bfloat) || + std::any_of(node->outputs().begin(), node->outputs().end(), has_bfloat)) { + return true; + } + return false; } static c10::optional getDevice(const Node* node) { - auto outputs = node->outputs(); - for (auto output : outputs) { - auto device = getDevice(output); + c10::optional ret = c10::nullopt; + auto merge_devices = [&ret](const c10::optional& device) { if (device.has_value()) { - return device; + if (ret.has_value()) { + if (ret.value() != device.value()) { + // invalidate device to reflect conflicts + ret->set_index(INVALID_INDEX); + // return false to indicate early termination + return false; + } else { + // same device, do nothing + return true; + } + } else { + // initialize return device + ret = device.value(); + return true; + } + } + // no device information, do nothing + return true; + }; + for (auto val : node->inputs()) { + if (!merge_devices(getDevice(val))) { + return ret; + } + } + for (auto val : node->outputs()) { + if (!merge_devices(getDevice(val))) { + return ret; } } - return c10::nullopt; + return ret; } -static bool isFusibleDevice(const Node* node, const c10::Device device) { - for (auto value : node->outputs()) { - auto output_device = getDevice(value); - if (output_device.has_value() && output_device.value() != device) { - return false; - } +static bool isDeviceCompatible(const Node* node, const c10::Device& device) { + // only fuses cuda device + if (!device.is_cuda()) { + GRAPH_UPDATE("rejecting node (non-cuda device): ", *node); + return false; + } + const auto major = at::cuda::getDeviceProperties(device.index())->major; + // disable non-elementwise fusion on pre-volta devices + if (major < 7 && hasNonElementWiseOperation(node)) { + GRAPH_UPDATE( + "rejecting node (non element-wise op not supported on SM < 7X): ", + *node); + return false; + } + // disable bfloat fusion on pre-ampere devices + if (major < 8 && hasBfloat(node)) { + GRAPH_UPDATE("rejecting node (bfloat not supported on SM < 8X): ", *node); + return false; + } + return true; +} + +static bool isFusibleDevice(const Node* node, const c10::Device& device) { + TORCH_INTERNAL_ASSERT( + device.index() != INVALID_INDEX, "fusible device needs to be validate"); + auto opt_device = getDevice(node); + // we can be more relaxed here as we known that this function tries to merge + // node into an existing `device` + if (opt_device.has_value() && + (opt_device->index() == INVALID_INDEX || opt_device != device)) { + GRAPH_UPDATE( + "rejecting node from fusion (outputs device not matching fusion): ", + *node); + return false; + } + if 
(!isDeviceCompatible(node, device)) { + return false; } return true; } @@ -65,12 +152,16 @@ static bool isFusibleDevice(const Node* node, const c10::Device device) { // TODO: we need to check input type when we handle `to()` static bool isFusibleDevice(const Node* node) { auto device = getDevice(node); - if (!device.has_value()) { - return true; + // be conservative and only fuse cuda operations, this avoids us initializing + // operations that produces cpu scalar outputs + if (!device.has_value() || device->index() == INVALID_INDEX) { + return false; } - return device->is_cuda() && - (at::cuda::getDeviceProperties(device->index())->major >= 7 || - !hasNonElementWiseOperation(node)); + + if (!isDeviceCompatible(node, device.value())) { + return false; + } + return true; } bool compatibleType(const torch::jit::Value* val) { @@ -80,6 +171,15 @@ bool compatibleType(const torch::jit::Value* val) { DataType::Null) { return false; } + // Complex is disabled until its support is completely added + // TODO: remove this logic + if (isComplexType(aten_to_data_type(tensor_type->scalarType().value()))) { + return false; + } + } + // magic number 8 here since our kernel argument only supports rank <= 8 + if (tensor_type->dim().has_value() && (tensor_type->dim().value() > 8)) { + return false; } } return true; @@ -121,268 +221,35 @@ bool checkOutputTensorTypes(const Node* node) { } inline bool isFusibleNode(const Node* node) { + // Check if already part of a fusion group if (node->kind() == prim::CudaFusionGroup) return true; // Check we have a parsing rule - bool isFusible = isNodeParsible(node); - // Check if we have a tensor type it's one we support - isFusible = isFusible && checkInputTensorTypes(node); - isFusible = isFusible && checkOutputTensorTypes(node); - // Check if already part of a fusion group - return isFusible; -} - -bool maybeBroadcast( - const TensorTypePtr& type, - const std::vector>& shape) { - if (type->dim()) { - if (type->dim().value() < shape.size()) { - // no broadcast for reduction operation; - return false; - } else if (type->dim().value() > shape.size()) { - // increased rank means there is reduction; - return true; - } else { - // same rank, we need to iterate through sizes and check if size-1 - // exists in input `shape` - for (const auto& opt_size : shape) { - // TODO: not sure if we need to check for output size != 1, since we - // are currently marking all size-1 dimension as broadcast in codegen. - if (opt_size.has_value() && opt_size.value() == 1) { - return true; - } - } + if (!isNodeParsible(node)) { + // ignoring profile nodes & constant nodes to avoid noise from debugging + if (node->kind() != prim::Constant && + node->kind() != prim::profile_ivalue && node->kind() != prim::profile && + node->kind() != prim::Param) { + GRAPH_UPDATE("rejecting node from fusion (node not parsible): ", *node); } + return false; } - return false; -} - -// utility function to check if the node implies broadcast on a given shape ( -// assumed to be shape of an input tensor) -// limitations: -// 1. we rely on shape information to judge this. so we would require output -// shape to be available; -// 2. we basically compares given shape to the shape of the only output of -// the node and return true if it implies broadcast from the former to the -// latter. -bool maybeBroadcastOnShape( - const Node* n, - const std::vector>& shape) { - // TODO: we are only checking output 0. This means that our current check for - // normalization is not complete. 
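// A standalone sketch (not part of the patch): the rewritten
// getDevice(const Node*) above folds the devices of all inputs and outputs
// into a single optional, ignores CPU scalar tensors, and poisons the result
// with INVALID_INDEX as soon as two concrete devices conflict. mergeDevices
// below is an illustrative stand-in for that logic; only the INVALID_INDEX
// value itself is taken from partition.cpp.
#include <optional>
#include <vector>

constexpr int INVALID_INDEX = -2;

struct Device {
  int index;  // >= 0: a concrete GPU; INVALID_INDEX: conflicting devices
  bool operator==(const Device& other) const { return index == other.index; }
  bool operator!=(const Device& other) const { return !(*this == other); }
};

std::optional<Device> mergeDevices(
    const std::vector<std::optional<Device>>& devices) {
  std::optional<Device> merged;
  for (const auto& d : devices) {
    if (!d.has_value()) {
      continue;  // scalar CPU tensors report no device and are ignored
    }
    if (!merged.has_value()) {
      merged = d;  // first concrete device initializes the result
    } else if (*merged != *d) {
      merged->index = INVALID_INDEX;  // conflict: poison and stop merging
      break;
    }
  }
  return merged;  // nullopt means "no device information at all"
}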
- // assumes that if output is not a tensor type, it's not broadcasting - if (auto out_type = n->output(0)->type()->cast()) { - return maybeBroadcast(out_type, shape); - } - return false; -}; - -// return true if node is pointwise operation and input tensors all have -// identical shape. -bool isNonBroadcastElementWise(const Node* n) { - if (hasNonElementWiseOperation(n)) { + // Check if we have a tensor type it's one we support + if (!checkInputTensorTypes(node)) { + GRAPH_UPDATE( + "rejecting node from fusion (input scalar type not supported): ", + *node); return false; } - - for (const auto output : n->outputs()) { - const auto& n_output_type = output->type()->cast(); - - // TODO: we need to stay on safer side instead of "default to return true - // when shape information is not available.", Change that when we enable - // profiling on autodiff FW execution. - if (n_output_type != nullptr && n_output_type->sizes().sizes()) { - const std::vector>& n_output_shape = - n_output_type->sizes().sizes().value(); - - for (auto input : n->inputs()) { - if (auto t_type = input->type()->cast()) { - if (maybeBroadcast(t_type, n_output_shape)) { - return false; - } - } - } - } + if (!checkOutputTensorTypes(node)) { + GRAPH_UPDATE( + "rejecting node from fusion (output scalar type not supported): ", + *node); + return false; } - return true; } -//! [ Note - tricky broadcasting ] -//! -//! github issue # 190 -//! -//! To extend the issue further, we consider two difficult broadcasting cases -//! that is difficult to naively schedule: -//! scenario 1: single tensor with multiple broadcasting semantics; -//! ``` -//! %t = op(...) -//! %t0_o = op0(%t, %t0) -//! %t1_o = op1(%t, %t1) -//! ``` -//! It's hard to check/validate whether `%t0` and `%t1` implies -//! identical broadcasting for `%t` so that we can simply -//! broadcast it to their common shape and use the broadcasted -//! tensor view in both `op0` and `op1`; or, if `%t0` and `%t1` -//! has different shapes, we would need differently broadcasted -//! `%t` for the two ops. Even with this condition sorted out, -//! scheduling is challenging. As we cannot inline the computation -//! of `%t` to the downstream consumer of `%t0_o` and `%t1_o` -//! easily, because `computeAt` could propagate contradicting -//! transformations on the common ancestor `%t`. See footnote*; -//! scenario 2: output tensor_view which is broadcasted later; -//! ``` -//! %t = op(...) -//! %t0_o = op0(%t, %t0) -//! return (%t, %t0_o) -//! ``` -//! Similarly, if we need to broadcast `%t` to `%t0` for `op0`, -//! and use it as output, it also complicates schedule. -//! -//! Currently we just avoid the two cases in our graph partitioning. -//! -//! We bake the implementation along with our partition, where we merge nodes -//! from producer to consumer. In the example down, we list all "type"s of edges -//! among producer/consumer and the out side world. -//! -//! %input_t0, %input_t1, %input_t2 # inputs from outside world feeding -//! # producer/consumer pair -//! %p_out_t0, %p_out_t1 = producer(%input_t0, %input_t1) -//! %c_out_t, ... = consumer(%input_t0, %input_t2, %p_out_t0) -//! -//! producer/consumer : the nodes that we are trying to merge, each node could -//! be -//! a parsible real operation or a `CudaFusionGroup`. -//! %input_t0 : inputs shared by both producer & consumer -//! %input_t1 : inputs feed only to producer, but not to consumer -//! %input_t2 : inputs feed only to consumer, but not to producer -//! %p_put_t0 : outputs of producer that is fed to consumer -//! 
%p_put_t1 : outputs of producer that is not fed to consumer -//! %c_put_t0 : outputs of consumer -//! -//! We can see that after merging consumer & producer, we will have: -//! %input_t0, %input_t1, %input_t2 # inputs from outside world feeding -//! # producer/consumer pair -//! %p_out_t, %c_out_t = group(%input_t0, %input_t1, %input_t2) -//! -//! Under the assumption that any existing `CudaFusionGroup` does not have -//! violating broadcasting semantics mentioned above. -//! -//! If we examine the `group`, new cases of scenario 1 (multiple broadcast) -//! could only be created by merging new edges in the new `group`, that is: -//! case 1. `%input_t0`, shared by `producer` and `consumer` -//! case 2. `%p_out_t0`, produced by `producer` and fed to `consumer` -//! -//! new cases of scenario 2 (output was broadcasted later) could only be added -//! via: -//! case 3. `%p_out_t0`, produced by `producer` and fed to `consumer`, which -//! could be broadcasted in the consumer subgraph. -//! -//! footnote*: -//! We are only disabling multiple broadcast right on the tensor, instead of -//! tracing all the broadcast further down. -//! I don't think we need to worry about broadcasting further down the -//! dependency chain, as those would create new IterDomain, which doesn't have -//! th problem of conflicting broadcasting. -bool createTrickyBroadcast(const Node* consumer, const Node* producer) { - auto count_broadcasting_in_node = - [](const Node* node, - const std::vector>& shape, - size_t offset) { - int num_broadcasting = 0; - if (node->kind() == prim::CudaFusionGroup) { - // be careful here as `subgraph_input`, as its name suggests, is in a - // different fraph from `node`. - const auto& subgraph_input = - node->g(attr::Subgraph)->inputs()[offset]; - for (const auto& use : subgraph_input->uses()) { - if (maybeBroadcastOnShape(use.user, shape)) { - num_broadcasting++; - } - } - } else { - if (maybeBroadcastOnShape(node, shape)) { - num_broadcasting++; - } - } - return num_broadcasting; - }; - - // case 1. We check shared inputs to `producer` & `consumer`; - for (const auto i : c10::irange(producer->inputs().size())) { - auto n_input = producer->input(i); - auto n_input_type = n_input->type()->cast(); - if (n_input_type != nullptr && n_input_type->sizes().sizes()) { - std::vector> n_input_shape = - n_input_type->sizes().sizes().value(); - int num_broadcasting = 0; - - // check broadcasting for the n_input inside `consumer`; - for (const auto& use : n_input->uses()) { - if (use.user == consumer) { - num_broadcasting += - count_broadcasting_in_node(consumer, n_input_shape, use.offset); - } - } - - // if no broadcasting happened for consumer, there's no point check - // multiple broadcasting in producer alone; - if (num_broadcasting == 0) { - continue; - } - - // check broadcasting for n_input inside `producer`; - num_broadcasting += - count_broadcasting_in_node(producer, n_input_shape, i); - - // encounted multiple broadcasting scheme for a single TV, we will not be - // able to schedule this, prevent the fusion; (case 1) - if (num_broadcasting > 1) { - return true; - } - } - } - - // case 2. 
We check input to `consumer` that is also the output from - // `producer` - for (const auto i : c10::irange(producer->outputs().size())) { - auto n_output = producer->output(i); - auto n_output_type = n_output->type()->cast(); - if (n_output_type != nullptr && n_output_type->sizes().sizes()) { - std::vector> n_output_shape = - n_output_type->sizes().sizes().value(); - int num_broadcasting = 0; - // If we only look at case 1 & case 2, we need to check broadcast of - // `n_output` inside `producer`, if it is a `prim::CudaFusionGroup`. - // this is actually not necessary when we consider case 3, as we avoid - // broadcasting on outputs already; - - // TODO: merge this code with case 1. - // check broadcasting for the n_output inside `consumer`; - bool use_as_output = false; - for (const auto& use : n_output->uses()) { - if (use.user == consumer) { - num_broadcasting += - count_broadcasting_in_node(consumer, n_output_shape, use.offset); - } else { - // case 3. output is used by other nodes not the consumer, no - // broadcasting is allowed; - use_as_output = true; - } - } - - // encounted multiple broadcasting scheme for a single TV, we will not be - // able to schedule this, prevent the fusion; (case 2) - // Alternatively, if use_as_output is true, we would not permit broadcast - // at all. (case 3) - if (num_broadcasting > (use_as_output ? 0 : 1)) { - return true; - } - } - } - - return false; -} - } // namespace bool isFusibleCudaFusionGroup(const Node* node) { @@ -400,7 +267,7 @@ bool isFusibleCudaFusionGroup(const Node* fusion, const Node* node) { bool fused = false; // TODO: lift the restriction of not fusing producer containing reduction when // we have proper scheduling. - if (isFusibleCudaFusionGroup(node)) { + if (isFusibleNode(node)) { // ensure if the node has a designated device, it's on the same device with // fusion. 
// TODO: is there a danger of us fusing operations that's supposed to be on @@ -408,7 +275,6 @@ bool isFusibleCudaFusionGroup(const Node* fusion, const Node* node) { auto device = getDevice(fusion); fused = (!device.has_value() || isFusibleDevice(node, device.value())); } - return fused; } diff --git a/torch/csrc/jit/codegen/cuda/partition.h b/torch/csrc/jit/codegen/cuda/partition.h index 0d8baca47007..b295cb582e57 100644 --- a/torch/csrc/jit/codegen/cuda/partition.h +++ b/torch/csrc/jit/codegen/cuda/partition.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include /* diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp b/torch/csrc/jit/codegen/cuda/predicate_compute.cpp index b501a6133f60..9cafd20c7010 100644 --- a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/predicate_compute.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include @@ -20,27 +18,24 @@ namespace cuda { namespace { -bool isTensorIndexOp(kir::Expr* expr) { +bool isTensorIndexOp(Expr* expr) { const auto& outputs = expr->outputs(); return outputs.size() >= 1 && outputs[0]->isA(); } -bool isOutputLocal(const kir::Expr* expr) { +bool isOutputLocal(const Expr* expr) { return std::all_of( - expr->outputs().begin(), - expr->outputs().end(), - [](const kir::Val* output) { - return !output->isA() || - output->as()->memoryType() == MemoryType::Local; + expr->outputs().begin(), expr->outputs().end(), [](const Val* output) { + return !output->isA() || + output->as()->getMemoryType() == MemoryType::Local; }); } } // namespace -bool ParallelizedDomainPredicate::PredicateInfo::addDomain( - kir::IterDomain* id) { - const auto gpu_lower = GpuLower::current(); - auto concrete_id = gpu_lower->caIndexMap().getConcreteMappedID(id); +bool ParallelizedDomainPredicate::PredicateInfo::addDomain(IterDomain* id) { + auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::EXACT); if (std::find(ids_.begin(), ids_.end(), concrete_id) == ids_.end()) { ids_.push_back(concrete_id); return true; @@ -49,21 +44,20 @@ bool ParallelizedDomainPredicate::PredicateInfo::addDomain( } } -kir::Bool* ParallelizedDomainPredicate::PredicateInfo::getPredicate() const { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - - kir::Bool* pred = nullptr; +Bool* ParallelizedDomainPredicate::PredicateInfo::getPredicate() const { + Bool* pred = nullptr; - auto index = - ir_builder.create(stringifyThread(pt_), DataType::Int); + auto index = SimplifyingIrBuilder::create( + stringifyThread(pt_), DataType::Int); for (const auto& pred_id : ids()) { // Just sanity check that pred_id is concrete TORCH_INTERNAL_ASSERT( - pred_id == gpu_lower->caIndexMap().getConcreteMappedID(pred_id)); - auto new_pred = ir_builder.ltExpr(index, pred_id->extent()); - pred = ir_builder.andExpr(pred, new_pred)->as(); + pred_id == + GpuLower::current()->caMap()->getConcreteMappedID( + pred_id, IdMappingMode::EXACT)); + auto new_pred = SimplifyingIrBuilder::ltExpr(index, pred_id->extent()); + pred = SimplifyingIrBuilder::andExpr(pred, new_pred)->as(); } return pred; @@ -74,16 +68,12 @@ namespace { std::unordered_set getNonUnswitchedRootDomains( const std::vector& loops, size_t unswitched_loop_index) { - const auto gpu_lower = GpuLower::current(); - std::vector non_unswited_leaf_domains; std::transform( loops.begin(), loops.begin() + unswitched_loop_index, std::back_inserter(non_unswited_leaf_domains), - [&](kir::ForLoop* loop) { - return 
gpu_lower->caIndexMap().toFusion(loop->iter_domain()); - }); + [&](kir::ForLoop* loop) { return loop->iter_domain(); }); auto non_unswitched_inputs = IterVisitor::getInputsTo(non_unswited_leaf_domains); @@ -100,26 +90,25 @@ std::unordered_set getNonUnswitchedRootDomains( non_unswitched_concrete_root_domains, non_unswitched_concrete_root_domains.end()), [&](auto root_dom) { - return gpu_lower->caIndexMap().getConcreteMappedID(root_dom); + return GpuLower::current()->caMap()->getConcreteMappedID( + root_dom, IdMappingMode::EXACT); }); return non_unswitched_concrete_root_domains; } bool isFullyUnswitched( - kir::IterDomain* loop_id, + IterDomain* loop_id, const std::unordered_set& non_unswitched_root_domains) { - const auto gpu_lower = GpuLower::current(); - - auto root_vals = - IterVisitor::getInputsTo({gpu_lower->caIndexMap().toFusion(loop_id)}); + auto root_vals = IterVisitor::getInputsTo({loop_id}); auto root_domains = ir_utils::filterByType(root_vals); return std::none_of( root_domains.begin(), root_domains.end(), [&](auto root_dom) { auto concrete_root_dom = - gpu_lower->caIndexMap().getConcreteMappedID(root_dom); + GpuLower::current()->caMap()->getConcreteMappedID( + root_dom, IdMappingMode::EXACT); return non_unswitched_root_domains.count(concrete_root_dom) > 0; }); } @@ -131,12 +120,10 @@ std::unordered_map< ParallelizedDomainPredicate::PredicateInfo, TypeHash> ParallelizedDomainPredicate::getPredicateMap( - const kir::Expr* expr, + const Expr* expr, const std::vector& loops, kir::ForLoop* unswitched_loop) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto output_tvs = ir_utils::getTvs(expr->outputs()); if (output_tvs.empty()) { @@ -167,7 +154,7 @@ ParallelizedDomainPredicate::getPredicateMap( } auto loop_id = loop->iter_domain(); - auto loop_ptype = loop_id->parallelType(); + auto loop_ptype = loop_id->getParallelType(); // Not necessary to add a predicate if the paralle type is exact if (!isParallelTypeThread(loop_ptype) || @@ -187,13 +174,14 @@ ParallelizedDomainPredicate::getPredicateMap( tv->domain()->domain().begin(), tv->domain()->domain().end(), [&](auto tv_id) { - return gpu_lower->caIndexMap().areMapped(loop_id, tv_id); + return gpu_lower->caMap()->areMapped( + loop_id, tv_id, IdMappingMode::EXACT); }); if (it == tv->domain()->domain().end()) { continue; } - kir::IterDomain* tv_id = *it; + IterDomain* tv_id = *it; // If the corresponding domain is a broadcast, it's not really used. if (tv_id->isBroadcast()) { @@ -203,9 +191,9 @@ ParallelizedDomainPredicate::getPredicateMap( // If it's a root domain, it should be covered by the root // predicates, so no extra predicate is required. 
if (std::find( - tv->domain()->rootDomain().begin(), - tv->domain()->rootDomain().end(), - tv_id) != tv->domain()->rootDomain().end()) { + tv->domain()->getRootDomain().begin(), + tv->domain()->getRootDomain().end(), + tv_id) != tv->domain()->getRootDomain().end()) { continue; } @@ -218,29 +206,24 @@ ParallelizedDomainPredicate::getPredicateMap( return map; } -kir::Bool* ParallelizedDomainPredicate::getPredicate( - const kir::Expr* expr, +Bool* ParallelizedDomainPredicate::getPredicate( + const Expr* expr, const std::vector& loops) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - auto pred_map = getPredicateMap(expr, loops); - kir::Val* pred = ir_builder.trueVal(); + Val* pred = GpuLower::current()->kernel()->trueVal(); for (auto pt : kParallelTypeThreads) { auto pred_info_it = pred_map.find(pt); if (pred_info_it != pred_map.end()) { const auto& pred_info = pred_info_it->second; auto tid_pred = pred_info.getPredicate(); - pred = ir_builder.andExpr(pred, tid_pred); + pred = SimplifyingIrBuilder::andExpr(pred, tid_pred); } } - if (pred) { - return pred->as(); - } else { - return nullptr; - } + TORCH_INTERNAL_ASSERT(pred != nullptr); + return pred->as(); } UnswitchPredicateKey::UnswitchPredicateKey() @@ -256,61 +239,55 @@ UnswitchPredicateKey::UnswitchPredicateKey() // concrete domains are used to uniquely collect all necessary // unswitch predicates. UnswitchPredicateKey::UnswitchPredicateKey( - IterDomain* predicated_concrete_id, - const ReferenceTensor& reference) + IterDomain* predicated_consumer_id, + TensorView* consumer_tv, + IterDomain* predicated_concrete_id) : predicated_concrete_id_(predicated_concrete_id) { // Initialize the parallelized domain map for (auto pt : kParallelTypeThreads) { parallel_concrete_ids_.insert({pt, nullptr}); } - // The id parameter is a concrete domain. Needs to find the - // corresponding reference domain to find leaf domains that are - // parallelized. 
- IterDomain* predicated_ref_id = - reference.concrete_to_id.at(predicated_concrete_id_); - TensorDomain* ref_td = reference.domain; - - std::vector all_parallelized_ref_leaf_ids; + std::vector all_parallelized_consumer_leaf_ids; std::copy_if( - ref_td->domain().begin(), - ref_td->domain().end(), - std::back_inserter(all_parallelized_ref_leaf_ids), + consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end(), + std::back_inserter(all_parallelized_consumer_leaf_ids), [](IterDomain* x) { return isParallelTypeThread(x->getParallelType()); }); - // If the reference is not parallelized at all, no need to + // If the consumer domais are not parallelized at all, no need to // differentiate keys based on how the predicated id is parallelized - if (all_parallelized_ref_leaf_ids.empty()) { + if (all_parallelized_consumer_leaf_ids.empty()) { return; } - // All domains that are parallelized descendants of predicated_ref_id - auto all_parallelized_ref_ids = DependencyCheck::getAllValsBetween( - {predicated_ref_id}, all_parallelized_ref_leaf_ids); + // All domains that are parallelized descendants of predicated_consumer_id + auto all_parallelized_consumer_ids = DependencyCheck::getAllValsBetween( + {predicated_consumer_id}, all_parallelized_consumer_leaf_ids); // Just pick leaf domains - std::vector parallelized_ref_leaf_ids; + std::vector parallelized_consumer_leaf_ids; std::copy_if( - ref_td->domain().begin(), - ref_td->domain().end(), - std::back_inserter(parallelized_ref_leaf_ids), + consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end(), + std::back_inserter(parallelized_consumer_leaf_ids), [&](IterDomain* x) { return std::find( - all_parallelized_ref_ids.begin(), - all_parallelized_ref_ids.end(), - x) != all_parallelized_ref_ids.end(); + all_parallelized_consumer_ids.begin(), + all_parallelized_consumer_ids.end(), + x) != all_parallelized_consumer_ids.end(); }); - if (parallelized_ref_leaf_ids.empty()) { - // None of the parallelized leaf domains are derived from predicated_ref_id + if (parallelized_consumer_leaf_ids.empty()) { + // None of the parallelized leaf domains are derived from + // predicated_consumer_id return; } // Find the corresponding concrete id for each parallel type - for (auto ref_leaf : parallelized_ref_leaf_ids) { - auto pt = ref_leaf->getParallelType(); - auto it = reference.id_to_concrete.find(ref_leaf); - TORCH_INTERNAL_ASSERT(it != reference.id_to_concrete.end()); - auto concrete_leaf = it->second; + for (auto consumer_leaf : parallelized_consumer_leaf_ids) { + auto pt = consumer_leaf->getParallelType(); + auto concrete_leaf = GpuLower::current()->caMap()->getConcreteMappedID( + consumer_leaf, IdMappingMode::EXACT); parallel_concrete_ids_.at(pt) = concrete_leaf; } } @@ -344,19 +321,18 @@ std::size_t UnswitchPredicateKeyHash::operator()( return h; }; -kir::Bool* PredicateCompute::getInlinePredicate( - const kir::Expr* expr, +Bool* PredicateCompute::getInlinePredicate( + const Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, PredicateType pred_type) { FUSER_PERF_SCOPE("GpuLower::Lower::getInlinePredicate"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); // If outputs are registers, no need to predicate for threads if (isOutputLocal(expr)) { - thread_pred = ir_builder.trueVal(); + thread_pred = gpu_lower->kernel()->trueVal(); } if (loops.empty()) { @@ -364,11 +340,18 @@ kir::Bool* PredicateCompute::getInlinePredicate( return 
thread_pred; } - auto out_tv = ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + auto out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); + + // Predicates for non-exact parallel dimensions must be used even + // when PredicateElimination::canOmitPredicate is true. + auto parallel_dom_pred = + ParallelizedDomainPredicate::getPredicate(expr, loops); + TORCH_INTERNAL_ASSERT(parallel_dom_pred != nullptr); if (gpu_lower->predicateElimination().canOmitPredicate(expr)) { - return thread_pred; + return SimplifyingIrBuilder::andExpr(thread_pred, parallel_dom_pred) + ->as(); } auto pred_info_vec = @@ -376,7 +359,7 @@ kir::Bool* PredicateCompute::getInlinePredicate( out_tv, loops, nullptr, pred_type == PredicateType::Padding) .first; - std::vector preds; + std::vector preds; // When pred_type is ReductionWrite, filter out predicates for // reduction axes. For blockReduce, this is necessary when reduction @@ -388,7 +371,7 @@ kir::Bool* PredicateCompute::getInlinePredicate( bool non_zero_start_found = false; for (const auto& pred_info : pred_info_vec) { if (pred_type == PredicateType::ReductionWrite) { - const auto& consumer_ids = pred_info.consumerIds(); + const auto& consumer_ids = pred_info.rootIds(); bool pred_for_reduction_axis = false; for (auto consumer_id : consumer_ids) { if (consumer_id->isReduction()) { @@ -404,64 +387,52 @@ kir::Bool* PredicateCompute::getInlinePredicate( continue; } } - for (auto pred : pred_info.startPredicates()) { - TORCH_INTERNAL_ASSERT(pred != nullptr); - preds.push_back(pred); - } - for (auto pred : pred_info.stopPredicates()) { - TORCH_INTERNAL_ASSERT(pred != nullptr); - preds.push_back(pred); - } + preds.push_back(pred_info.startPredicate()); + preds.push_back(pred_info.stopPredicate()); } // When generating a predicate for blockReduce writes and not for // gridReduce, if all reduction axes start with zero, we can just // use the same predicate for reads. nullptr is returned then. 
if (pred_type == PredicateType::ReductionWrite && !non_zero_start_found && - !out_tv->fuserTv()->domain()->hasGridReduction()) { + !out_tv->domain()->hasGridReduction()) { return nullptr; } - auto parallel_dom_pred = - ParallelizedDomainPredicate::getPredicate(expr, loops); - if (parallel_dom_pred) { - preds.push_back(parallel_dom_pred); - } + preds.push_back(parallel_dom_pred); if (thread_pred != nullptr) { preds.push_back(thread_pred); } if (preds.empty()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - kir::Val* cond = preds[0]; + Val* cond = preds[0]; for (const auto i : c10::irange(1, preds.size())) { - cond = ir_builder.andExpr(cond, preds[i]); + cond = SimplifyingIrBuilder::andExpr(cond, preds[i]); } - return cond->as(); + return cond->as(); } -kir::Bool* UnswitchPredicate::get( +Bool* UnswitchPredicate::get( const std::vector& outer_loops, kir::ForLoop* unrolled_loop) { FUSER_PERF_SCOPE("GpuLower::Lower::UnswitchPredicate::get"); - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - UnswitchPredicate up(outer_loops, unrolled_loop); - kir::Val* unswitch_pred = ir_builder.trueVal(); + Val* unswitch_pred = GpuLower::current()->kernel()->trueVal(); for (auto pred : up.predicates_) { - unswitch_pred = ir_builder.andExpr(unswitch_pred, pred); + unswitch_pred = SimplifyingIrBuilder::andExpr(unswitch_pred, pred); } - return unswitch_pred->as(); + return unswitch_pred->as(); } -void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { +void UnswitchPredicate::predicateOn(Expr* tv_expr) { FUSER_PERF_SCOPE("GpuLower::Lower::UnswitchPredicate::predicateOn"); if (for_loops_.empty()) { @@ -469,18 +440,16 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { } const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (gpu_lower->predicateElimination().canOmitPredicate(tv_expr)) { + addParallelizedDomainPredicates(tv_expr); return; } - auto out_tv = ir_utils::getTVOutput(tv_expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + auto out_tv = ir_utils::getTvOutput(tv_expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); auto ref_pred_info = Index::getReferenceRootPredicates( out_tv, for_loops_, unrolled_loop_, false); - const ReferenceTensor& reference = ref_pred_info.second; // If RootPredicateInfo has a static predicate that is more // restrictive than the current one, replace the current with the @@ -491,10 +460,8 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { // predicates are generated in the finalize function. 
for (const auto& pred_info : ref_pred_info.first) { - if (pred_info.startPredicates().empty() && - pred_info.stopPredicates().empty()) { - continue; - } + TORCH_INTERNAL_ASSERT(pred_info.startPredicate() != nullptr); + TORCH_INTERNAL_ASSERT(pred_info.stopPredicate() != nullptr); const auto& root_ids = pred_info.rootIds(); @@ -505,13 +472,14 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { bool first_key_set = false; for (auto root_id : root_ids) { - auto kir_root_id = gpu_lower->lowerValue(root_id)->as(); + auto concrete_root_id = gpu_lower->caMap()->getConcreteMappedID( + root_id, IdMappingMode::EXACT); - if (kir_root_id->isBroadcast()) { + if (root_id->isBroadcast()) { continue; } - UnswitchPredicateKey key(root_id, reference); + UnswitchPredicateKey key(root_id, out_tv, concrete_root_id); auto inserted = predicated_keys_.insert(key).second; add_pred = add_pred || inserted; @@ -573,20 +541,23 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { // start and stop offsets. if (merged_pred_it != pending_predicates_.end()) { mergeUnswitchPredicateOffsets( - pred_info.startPredicates(), - pred_info.startOffsets(), + pred_info.startPredicate(), + pred_info.startOffset(), merged_pred_it->start, true); mergeUnswitchPredicateOffsets( - pred_info.stopPredicates(), - pred_info.stopOffsets(), + pred_info.stopPredicate(), + pred_info.stopOffset(), merged_pred_it->stop, false); } } - // Adds new predicates for parallelized domains + addParallelizedDomainPredicates(tv_expr); +} + +void UnswitchPredicate::addParallelizedDomainPredicates(Expr* tv_expr) { auto pred_map = ParallelizedDomainPredicate::getPredicateMap( tv_expr, for_loops_, unrolled_loop_); for (auto pt : kParallelTypeThreads) { @@ -613,7 +584,7 @@ void UnswitchPredicate::openLoop(kir::ForLoop* fl) { for_loops_.push_back(fl); for (auto expr : fl->body().exprs()) { - if (ir_utils::isTVOp(expr) || isTensorIndexOp(expr)) { + if (ir_utils::isTvOp(expr) || isTensorIndexOp(expr)) { predicateOn(expr); } else if (auto ite = dynamic_cast(expr)) { openIte(ite); @@ -630,7 +601,7 @@ void UnswitchPredicate::openIte(kir::IfThenElse* ite) { // only expand the ite thenBody for (auto expr : ite->thenBody().exprs()) { - if (ir_utils::isTVOp(expr) || isTensorIndexOp(expr)) { + if (ir_utils::isTvOp(expr) || isTensorIndexOp(expr)) { predicateOn(expr); } else if (auto ite = dynamic_cast(expr)) { openIte(ite); @@ -641,7 +612,6 @@ void UnswitchPredicate::openIte(kir::IfThenElse* ite) { } void UnswitchPredicate::finalize() { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); for (const auto& merged_pred : pending_predicates_) { const auto& start_info = merged_pred.start; if (start_info.static_pred) { @@ -661,12 +631,10 @@ void UnswitchPredicate::finalize() { } void UnswitchPredicate::mergeUnswitchPredicateOffsets( - const std::vector& predicates, - const std::vector& offsets, + Bool* predicate, + Val* offset, MergedPredicates::Info& merged_predicate_info, bool is_start) { - TORCH_INTERNAL_ASSERT(predicates.size() == offsets.size()); - auto is_more_restrictive = [&is_start](int64_t new_val, int64_t current_val) { if (is_start) { return new_val < current_val; @@ -675,25 +643,21 @@ void UnswitchPredicate::mergeUnswitchPredicateOffsets( } }; - for (const auto i : c10::irange(predicates.size())) { - auto pred = predicates.at(i); - auto offset = offsets.at(i); - auto offset_int = dynamic_cast(offset); - // If it's a static predicate, replace the current one if it's - // more restrictive. 
If it's dynamic, just adds it to the dynamic - // predicate list. - if (offset_int && offset_int->isConst()) { - auto offset_const = offset_int->value().value(); - auto& static_pred = merged_predicate_info.static_pred; - auto& static_offset = merged_predicate_info.static_offset; - if (static_pred == nullptr || - is_more_restrictive(offset_const, static_offset)) { - static_pred = pred; - static_offset = offset_const; - } - } else { - merged_predicate_info.dynamic_preds.push_back(pred); + auto offset_int = dynamic_cast(offset); + // If it's a static predicate, replace the current one if it's + // more restrictive. If it's dynamic, just adds it to the dynamic + // predicate list. + if (offset_int && offset_int->isConst()) { + auto offset_const = offset_int->value().value(); + auto& static_pred = merged_predicate_info.static_pred; + auto& static_offset = merged_predicate_info.static_offset; + if (static_pred == nullptr || + is_more_restrictive(offset_const, static_offset)) { + static_pred = predicate; + static_offset = offset_const; } + } else { + merged_predicate_info.dynamic_preds.push_back(predicate); } } diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.h b/torch/csrc/jit/codegen/cuda/predicate_compute.h index 989bffb3bd18..6cf3609d3151 100644 --- a/torch/csrc/jit/codegen/cuda/predicate_compute.h +++ b/torch/csrc/jit/codegen/cuda/predicate_compute.h @@ -16,10 +16,10 @@ class PredicateCompute { // ignore_internal_syncthread_ops will prevent creation of predicates on // block/grid broadcast/reduce as these have syncthread calls within them // so all threads need to execute the function. - static kir::Bool* getInlinePredicate( - const kir::Expr* expr, + static Bool* getInlinePredicate( + const Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, PredicateType pred_type); }; @@ -40,31 +40,31 @@ class ParallelizedDomainPredicate { explicit PredicateInfo(ParallelType pt) : pt_(pt) {} //! Adds a domain that is parallized by the same paralell type - bool addDomain(kir::IterDomain* id); + bool addDomain(IterDomain* id); - const std::vector& ids() const { + const std::vector& ids() const { return ids_; } //! Generates a predicate Val from predicate information - kir::Bool* getPredicate() const; + Bool* getPredicate() const; private: ParallelType pt_; //! Domains parallelized by the same parallel type - std::vector ids_; + std::vector ids_; }; //! Returns a predicate Val for parallelied domains of an expression. - static kir::Bool* getPredicate( - const kir::Expr* expr, + static Bool* getPredicate( + const Expr* expr, const std::vector& loops); //! Returns predicate information for parallelied domains of an //! expression. static std::unordered_map getPredicateMap( - const kir::Expr* expr, + const Expr* expr, const std::vector& loops, kir::ForLoop* unswitched_loop = nullptr); }; @@ -80,8 +80,9 @@ class UnswitchPredicateKey { UnswitchPredicateKey(); UnswitchPredicateKey( - IterDomain* predicated_concrete_id, - const ReferenceTensor& reference); + IterDomain* predicated_consumer_id, + TensorView* consumer_tv, + IterDomain* predicated_concrete_id); bool operator==(const UnswitchPredicateKey& other) const { return predicated_concrete_id_ == other.predicated_concrete_id_ && @@ -121,7 +122,7 @@ struct UnswitchPredicateKeyHash { class TORCH_CUDA_CU_API UnswitchPredicate { public: - static kir::Bool* get( + static Bool* get( const std::vector& outer_loops, kir::ForLoop* unrolled_loop); @@ -132,11 +133,11 @@ class TORCH_CUDA_CU_API UnswitchPredicate { struct Info { //! 
Most restrictive static predicate. Nullptr if no static //! predicate found. - kir::Bool* static_pred = nullptr; + Bool* static_pred = nullptr; //! The offset value of static_pred int64_t static_offset = 0; //! List of dynamic predicates. - std::vector dynamic_preds; + std::vector dynamic_preds; }; UnswitchPredicateKey predicate_key; Info start; @@ -147,7 +148,7 @@ class TORCH_CUDA_CU_API UnswitchPredicate { std::vector outer_loops, kir::ForLoop* unrolled_loop); - void predicateOn(kir::Expr*); + void predicateOn(Expr*); void openLoop(kir::ForLoop*); @@ -160,11 +161,14 @@ class TORCH_CUDA_CU_API UnswitchPredicate { //! static, only pick the most restrictive one, e.g., the one with the //! minimum offset for the start predication. void mergeUnswitchPredicateOffsets( - const std::vector& predicates, - const std::vector& offsets, + Bool* predicate, + Val* offset, MergedPredicates::Info& merged_predicate_info, bool is_start); + //! Adds new predicates for parallelized domains + void addParallelizedDomainPredicates(Expr*); + private: //! Track which iter domains have been predicated std::unordered_set @@ -181,7 +185,7 @@ class TORCH_CUDA_CU_API UnswitchPredicate { parallelized_dom_predicates_; //! The predicates that have been generated. - std::vector predicates_; + std::vector predicates_; std::vector for_loops_; diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/double_half_cast.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/double_half_cast.py new file mode 100644 index 000000000000..fbd85fa197e8 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/double_half_cast.py @@ -0,0 +1,33 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(2, DataType.Double) + t1 = fd.define_tensor(2, DataType.Double) + + fd.add_input(t0) + fd.add_input(t1) + + t0h = fd.Ops.cast(DataType.Half, t0) + t1h = fd.Ops.cast(DataType.Half, t1) + t2 = fd.Ops.add(t0h, t1h) + t3 = fd.Ops.relu(t2) + + fd.add_output(t3) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, device='cuda', dtype=torch.float64) +input2 = torch.ones(2, 4, device='cuda', dtype=torch.float64) + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2]) + +print(outputs[0]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/half_double_cast.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/half_double_cast.py new file mode 100644 index 000000000000..faa71fbba8ac --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/half_double_cast.py @@ -0,0 +1,31 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(2, DataType.Half) + t1 = fd.define_tensor(2, DataType.Double) + + fd.add_input(t0) + fd.add_input(t1) + + t2 = fd.Ops.add(t0, t1) + t5 = fd.Ops.relu(t2) + + fd.add_output(t5) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, device='cuda', dtype=torch.float16) +input2 = torch.ones(2, 4, device='cuda', dtype=torch.float64) + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2]) + +print(outputs[0]) diff --git 
a/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example.py new file mode 100644 index 000000000000..ce6e490ac997 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example.py @@ -0,0 +1,41 @@ +import torch +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(3) + t1 = fd.define_tensor(1) + s0 = fd.define_scalar() + + fd.add_input(t0) + fd.add_input(t1) + fd.add_input(s0) + + c0 = fd.define_constant(3.0) + + t1_b = fd.Ops.broadcast(t1, [True, True, False]) + t2 = fd.Ops.add(t0, t1) + t3 = fd.Ops.mul(t2, c0) + t4 = fd.Ops.atan2(t3, s0) + t5 = fd.Ops.relu(t4) + t6 = fd.Ops.sum(t5, [-1], False, DataType.Float) + t7 = fd.Ops.isfinite(t6) + + fd.add_output(t6) + fd.add_output(t7) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, 8, device='cuda') +input2 = torch.ones(8, device='cuda') + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2, 2.0]) + +print(outputs[0]) +print(outputs[1]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_broadcast_in_dim.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_broadcast_in_dim.py new file mode 100644 index 000000000000..aa2fb2016de8 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_broadcast_in_dim.py @@ -0,0 +1,58 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition + +# Construct and Define Fusion +fusion1 = Fusion() + +with FusionDefinition(fusion1) as fd : + t0 = fd.define_tensor(1) + t1 = fd.define_tensor(3) + + fd.add_input(t0) + fd.add_input(t1) + + t0_b = fd.Ops.broadcast_in_dim(t0, [2, 3, 4], [1]) + t2 = fd.Ops.add(t0_b, t1) + + fd.add_output(t2) + +fusion1.print_ir() + +# Execute Fusion +input1 = torch.ones(3, device='cuda') +input2 = torch.ones(2, 3, 4, device='cuda') + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion1.execute([input1, input2]) + +print(outputs[0]) + +fusion2 = Fusion() + +input1 = torch.ones(1, 1, 4, device='cuda') +input2 = torch.ones(2, 3, 4, device='cuda') + +with FusionDefinition(fusion2) as fd : + t0 = fd.define_tensor(sizes=input1.size(), strides=input1.stride()) + t1 = fd.define_tensor(sizes=input2.size(), strides=input2.stride()) + + fd.add_input(t0) + fd.add_input(t1) + + t0_b = fd.Ops.broadcast_in_dim(t0, [2, 3, 4], [0, 1, 2]) + print("Broadcast TensorView", t0_b) + t2 = fd.Ops.add(t0_b, t1) + + fd.add_output(t2) + +fusion2.print_ir() + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion2.execute([input1, input2]) + +print(outputs[0]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_fp16.py b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_fp16.py new file mode 100644 index 000000000000..e707a863dc86 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/examples/python_example_fp16.py @@ -0,0 +1,40 @@ +import torch + +from torch._C._nvfuser import Fusion, FusionDefinition, DataType + +# Construct and Define Fusion +fusion = Fusion() + +with FusionDefinition(fusion) as fd : + t0 = fd.define_tensor(3, 
DataType.Half) + t1 = fd.define_tensor(1, DataType.Half) + s0 = fd.define_scalar() + + fd.add_input(t0) + fd.add_input(t1) + fd.add_input(s0) + + c0 = fd.define_constant(3.0) + + t1_b = fd.Ops.broadcast(t1, [True, True, False]) + t2 = fd.Ops.add(t0, t1) + t3 = fd.Ops.mul(t2, c0) + t4 = fd.Ops.mul(t3, s0) + t5 = fd.Ops.relu(t4) + t6 = fd.Ops.sum(t5, [-1], False, DataType.Float) + + t7 = fd.Ops.cast(DataType.Half, t6) + fd.add_output(t7) + +fusion.print_ir() + +# Execute Fusion +input1 = torch.ones(2, 4, 8, device='cuda', dtype=torch.float16) +input2 = torch.ones(8, device='cuda', dtype=torch.float16) + +# Kernel compilation should be cached for the 2nd iteration +# with input tensors of the same shape +for _ in range(5) : + outputs = fusion.execute([input1, input2, 2.0]) + +print(outputs[0]) diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp new file mode 100644 index 000000000000..c619b557fa12 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp @@ -0,0 +1,641 @@ +#include + +#ifdef USE_CUDA +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace torch::jit::fuser::cuda; + +namespace { + +class PythonFusionOwner { + public: + PythonFusionOwner() : executor_cache_(std::make_unique()) {} + + // Non-copyable + PythonFusionOwner(const PythonFusionOwner&) = delete; + PythonFusionOwner& operator=(const PythonFusionOwner&) = delete; + + std::vector execute(const at::ArrayRef& inputs) { + return executor_cache_.runFusionWithInputs(inputs); + } + Fusion* fusionPtr() { + return executor_cache_.fusion(); + } + + void printIr() { + executor_cache_.printFusion(); + } + void printKernel() { + executor_cache_.fusion()->printKernel(); + } + + private: + FusionExecutorCache executor_cache_; +}; + +// Manually applying the fusion guard via a context manager +class FusionDefinitionContextManager { + public: + FusionDefinitionContextManager(PythonFusionOwner* fusion_owner) + : fusion_owner_(fusion_owner), prev_fusion_(nullptr) {} + + // Context Manager Methods + FusionDefinitionContextManager* enter() { + prev_fusion_ = FusionGuard::getCurFusion(); + FusionGuard::setCurFusion(fusionPtr()); + return this; + } + + void exit() { + FusionGuard::setCurFusion(prev_fusion_); + prev_fusion_ = nullptr; + } + + void addInput(torch::jit::fuser::cuda::Val* input) { + fusionPtr()->addInput(input); + } + void addOutput(torch::jit::fuser::cuda::Val* output) { + fusionPtr()->addOutput(output); + } + + Fusion* fusionPtr() { + return fusion_owner_->fusionPtr(); + } + + // An Empty namespace to add arith ops + struct Ops {}; + + private: + PythonFusionOwner* fusion_owner_; + Fusion* prev_fusion_; +}; + +} // namespace + +namespace torch { +namespace jit { + +void initNvFuserPythonBindings(PyObject* module) { + auto m = py::handle(module).cast(); + + auto nvfuser = m.def_submodule("_nvfuser"); + + // DataTypes supported by NVFuser in Fusion Definition + // Types not related to values found in fusion defintions + // were purposely left out. + // NOTE: DataType was ambiguous under torch::jit without full qualification. 
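To orient the reader before the enum binding that follows: these are the values a Python user passes wherever a dtype is accepted by the frontend. A minimal sketch, assuming a CUDA build with nvfuser enabled; the shapes and the relu/cast choices are arbitrary illustrations rather than part of the PR, while the call signatures mirror the example scripts added above:

    import torch
    from torch._C._nvfuser import Fusion, FusionDefinition, DataType

    fusion = Fusion()
    with FusionDefinition(fusion) as fd:
        # dtype arguments take the bound enum values, e.g. DataType.Half
        t0 = fd.define_tensor(2, DataType.Half)
        fd.add_input(t0)
        # cast ops name the target DataType first, then the value to convert
        t1 = fd.Ops.relu(fd.Ops.cast(DataType.Float, t0))
        fd.add_output(t1)

    outputs = fusion.execute([torch.ones(2, 4, device='cuda', dtype=torch.float16)])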
+ py::enum_(nvfuser, "DataType") + .value("Double", torch::jit::fuser::cuda::DataType::Double) + .value("Float", torch::jit::fuser::cuda::DataType::Float) + .value("Half", torch::jit::fuser::cuda::DataType::Half) + .value("Int", torch::jit::fuser::cuda::DataType::Int) + .value("Int32", torch::jit::fuser::cuda::DataType::Int32) + .value("Bool", torch::jit::fuser::cuda::DataType::Bool) + .value("BFloat16", torch::jit::fuser::cuda::DataType::BFloat16) + .value("ComplexFloat", torch::jit::fuser::cuda::DataType::ComplexFloat) + .value("ComplexDouble", torch::jit::fuser::cuda::DataType::ComplexDouble); + + // Binding an object that owns a FusionExecutorCache instance and provides an + // interface + py::class_ fusion(nvfuser, "Fusion"); + fusion.def(py::init<>()) + .def( + "execute", + [](PythonFusionOwner& self, const py::iterable& iter) { + std::vector inputs; + for (py::handle obj : iter) { + inputs.push_back(toIValue(obj, c10::AnyType::get())); + } + return self.execute(inputs); + }, + py::return_value_policy::reference) + .def("print_ir", [](PythonFusionOwner& self) { self.printIr(); }) + .def("print_kernel", [](PythonFusionOwner& self) { self.printKernel(); }); + + // Bindings to Types required for Tensor/Scalar Creation + py::class_(nvfuser, "TensorView") + .def( + "__str__", + [](TensorView& self) -> std::string { + std::stringstream ss; + TORCH_CHECK( + self.getDataType().has_value(), + "TensorView does not have DataType?"); + ss << self.getDataType().value(); + return self.toString() + " DataType: " + ss.str() + + " Contiguity: " + self.domain()->getContiguityString(); + }, + py::return_value_policy::reference); + py::class_(nvfuser, "Val") + .def( + "__str__", + [](torch::jit::fuser::cuda::Val& self) -> std::string { + return self.toString(); + }, + py::return_value_policy::reference); + + // C++ Side of Context Manager used to mimic the FusionGuard as a way + // to programatically distinguish code used to define the Fusion instead + // of having the user mysteriously create an object prior to adding definition + // code where the object is not used. 
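The pybind definitions below expose enter()/exit() under the protocol names __enter__/__exit__, which is what lets the example scripts write "with FusionDefinition(fusion) as fd:". A rough sketch of the correspondence, with the with-statement desugared by hand (variable names are hypothetical; normal usage is the with-block form):

    from torch._C._nvfuser import Fusion, FusionDefinition

    fusion = Fusion()
    fd = FusionDefinition(fusion)

    fd.__enter__()                  # saves the current FusionGuard fusion, guards this one
    t0 = fd.define_tensor(1)        # IR built here lands in the guarded fusion
    fd.add_input(t0)
    fd.add_output(fd.Ops.relu(t0))
    fd.__exit__(None, None, None)   # restores the previously guarded fusion

Everything between the two calls records into the guarded fusion, so no separate FusionGuard object has to be created on the Python side.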
+ py::class_ fusion_def( + nvfuser, "FusionDefinition"); + fusion_def.def(py::init()) + .def( + "__enter__", + [](FusionDefinitionContextManager& self) { return self.enter(); }) + .def( + "__exit__", + [](FusionDefinitionContextManager& self, + void* exc_type, + void* exc_value, + void* traceback) { self.exit(); }) + .def( + "add_input", + [](FusionDefinitionContextManager& self, + torch::jit::fuser::cuda::Val* input) { self.addInput(input); }) + .def( + "add_input", + [](FusionDefinitionContextManager& self, TensorView* input) { + self.addInput(input); + }) + .def( + "add_output", + [](FusionDefinitionContextManager& self, + torch::jit::fuser::cuda::Val* output) { self.addOutput(output); }) + .def( + "add_output", + [](FusionDefinitionContextManager& self, TensorView* output) { + self.addOutput(output); + }) + .def( + "define_tensor", + [](FusionDefinitionContextManager& self, + size_t ndims, + torch::jit::fuser::cuda::DataType dtype = + torch::jit::fuser::cuda::DataType::Float) -> TensorView* { + return TensorViewBuilder() + .ndims(ndims) + .dtype(dtype) + .contiguity(std::vector(ndims, true)) + .build(); + }, + py::arg("ndims"), + py::arg("dtype") = torch::jit::fuser::cuda::DataType::Float, + py::return_value_policy::reference) + .def( + "define_tensor", + [](FusionDefinitionContextManager& self, + std::vector sizes, + std::vector strides, + torch::jit::fuser::cuda::DataType dtype = + torch::jit::fuser::cuda::DataType::Float) -> TensorView* { + TORCH_CHECK( + sizes.size() == strides.size(), + "The number of sizes does not match the number of strides.", + sizes.size(), + strides.size()); + + std::vector domain_sizes; + for (const auto i : c10::irange(sizes.size())) { + if (sizes[i] == 1) { + domain_sizes.push_back(IrBuilder::create( + self.fusionPtr()->zeroVal(), + self.fusionPtr()->oneVal(), + ParallelType::Serial, + IterType::BroadcastWithStride)); + } else { + domain_sizes.push_back(IrBuilder::create( + self.fusionPtr()->zeroVal(), IrBuilder::create())); + } + } + + std::vector contig_info(strides.size(), false); + for (int i = contig_info.size() - 1; i >= 0; --i) { + if (i == static_cast(contig_info.size() - 1)) { + contig_info[i] = (strides[i] == 1); + } else { + contig_info[i] = + (strides[i] == (strides[i + 1] * sizes[i + 1])); + } + } + + return IrBuilder::create( + IrBuilder::create(domain_sizes, contig_info), + dtype); + }, + py::arg("sizes"), + py::arg("strides"), + py::arg("dtype") = torch::jit::fuser::cuda::DataType::Float, + py::return_value_policy::reference) + .def( + "define_constant", + [](FusionDefinitionContextManager& self, + double val) -> torch::jit::fuser::cuda::Val* { + return IrBuilder::create(val); + }, + py::return_value_policy::reference) + .def( + "define_constant", + [](FusionDefinitionContextManager& self, + bool val) -> torch::jit::fuser::cuda::Val* { + return IrBuilder::create(val); + }, + py::return_value_policy::reference) + .def( + "define_constant", + [](FusionDefinitionContextManager& self, + int64_t val) -> torch::jit::fuser::cuda::Val* { + return IrBuilder::create(val); + }, + py::return_value_policy::reference) + .def( + "define_scalar", + [](FusionDefinitionContextManager& self, + torch::jit::fuser::cuda::DataType dtype = + torch::jit::fuser::cuda::DataType::Double) + -> torch::jit::fuser::cuda::Val* { + if (dtype == torch::jit::fuser::cuda::DataType::Double) { + return IrBuilder::create(); + } else if (dtype == torch::jit::fuser::cuda::DataType::Bool) { + return IrBuilder::create(); + } else if (dtype == torch::jit::fuser::cuda::DataType::Int) 
{ + return IrBuilder::create(); + } else { + TORCH_CHECK(false, "Dtype is not supported:", dtype); + } + }, + py::arg("dtype") = torch::jit::fuser::cuda::DataType::Double, + py::return_value_policy::reference); + + py::class_ nvf_ops(fusion_def, "Ops"); + + // ******************** INSERT OP BINDINGS BELOW HERE ******************** + +#define NVFUSER_PYTHON_BINDING_UNARY_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_UNARY_OP("abs", abs) + NVFUSER_PYTHON_BINDING_UNARY_OP("acos", acos) + NVFUSER_PYTHON_BINDING_UNARY_OP("asin", asin) + NVFUSER_PYTHON_BINDING_UNARY_OP("atan", atan) + NVFUSER_PYTHON_BINDING_UNARY_OP("atanh", atanh) + NVFUSER_PYTHON_BINDING_UNARY_OP("ceil", ceil) + NVFUSER_PYTHON_BINDING_UNARY_OP("cos", cos) + NVFUSER_PYTHON_BINDING_UNARY_OP("cosh", cosh) + NVFUSER_PYTHON_BINDING_UNARY_OP("exp", exp) + NVFUSER_PYTHON_BINDING_UNARY_OP("expm1", expm1) + NVFUSER_PYTHON_BINDING_UNARY_OP("erf", erf) + NVFUSER_PYTHON_BINDING_UNARY_OP("erfc", erfc) + NVFUSER_PYTHON_BINDING_UNARY_OP("floor", floor) + NVFUSER_PYTHON_BINDING_UNARY_OP("frac", frac) + NVFUSER_PYTHON_BINDING_UNARY_OP("lgamma", lgamma) + NVFUSER_PYTHON_BINDING_UNARY_OP("log", log) + NVFUSER_PYTHON_BINDING_UNARY_OP("log10", log10) + NVFUSER_PYTHON_BINDING_UNARY_OP("log1p", log1p) + NVFUSER_PYTHON_BINDING_UNARY_OP("log2", log2) + NVFUSER_PYTHON_BINDING_UNARY_OP("neg", neg) + NVFUSER_PYTHON_BINDING_UNARY_OP("bitwise_not", bitwise_not) + NVFUSER_PYTHON_BINDING_UNARY_OP("relu", relu) + NVFUSER_PYTHON_BINDING_UNARY_OP("rand_like", randlike) + NVFUSER_PYTHON_BINDING_UNARY_OP("reciprocal", reciprocal) + NVFUSER_PYTHON_BINDING_UNARY_OP("round", round) + NVFUSER_PYTHON_BINDING_UNARY_OP("rsqrt", rsqrt) + NVFUSER_PYTHON_BINDING_UNARY_OP("set", set) + NVFUSER_PYTHON_BINDING_UNARY_OP("sigmoid", sigmoid) + NVFUSER_PYTHON_BINDING_UNARY_OP("silu", silu) + NVFUSER_PYTHON_BINDING_UNARY_OP("sin", sin) + NVFUSER_PYTHON_BINDING_UNARY_OP("sinh", sinh) + NVFUSER_PYTHON_BINDING_UNARY_OP("sqrt", sqrt) + NVFUSER_PYTHON_BINDING_UNARY_OP("tan", tan) + NVFUSER_PYTHON_BINDING_UNARY_OP("tanh", tanh) + NVFUSER_PYTHON_BINDING_UNARY_OP("trunc", trunc) + NVFUSER_PYTHON_BINDING_UNARY_OP("isfinite", isfinite) + NVFUSER_PYTHON_BINDING_UNARY_OP("isinf", isinf) + NVFUSER_PYTHON_BINDING_UNARY_OP("isnan", isnan) + NVFUSER_PYTHON_BINDING_UNARY_OP("isneginf", isneginf) + NVFUSER_PYTHON_BINDING_UNARY_OP("isposinf", isposinf) + NVFUSER_PYTHON_BINDING_UNARY_OP("isreal", isreal) +#undef NVFUSER_PYTHON_BINDING_UNARY_OP + +#define NVFUSER_PYTHON_BINDING_BINARY_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_BINARY_OP("add", add) + NVFUSER_PYTHON_BINDING_BINARY_OP("atan2", atan2) + 
NVFUSER_PYTHON_BINDING_BINARY_OP("div", div) + NVFUSER_PYTHON_BINDING_BINARY_OP("fmod", fmod) + NVFUSER_PYTHON_BINDING_BINARY_OP("mul", mul) + NVFUSER_PYTHON_BINDING_BINARY_OP("pow", pow) + NVFUSER_PYTHON_BINDING_BINARY_OP("remainder", remainder) + NVFUSER_PYTHON_BINDING_BINARY_OP("sub", sub) + NVFUSER_PYTHON_BINDING_BINARY_OP("mod", mod) + NVFUSER_PYTHON_BINDING_BINARY_OP("eq", eq) + NVFUSER_PYTHON_BINDING_BINARY_OP("ge", ge) + NVFUSER_PYTHON_BINDING_BINARY_OP("gt", gt) + NVFUSER_PYTHON_BINDING_BINARY_OP("le", le) + NVFUSER_PYTHON_BINDING_BINARY_OP("lt", lt) + NVFUSER_PYTHON_BINDING_BINARY_OP("ne", ne) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_and", bitwise_and) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_or", bitwise_or) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_xor", bitwise_xor) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_left_shift", bitwise_left_shift) + NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_right_shift", bitwise_left_shift) +#undef NVFUSER_PYTHON_BINDING_BINARY_OP + +#define NVFUSER_PYTHON_BINDING_TERNARY_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast( \ + &torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + TensorView*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_TERNARY_OP("lerp", lerp) + NVFUSER_PYTHON_BINDING_TERNARY_OP("where", where) +#undef NVFUSER_PYTHON_BINDING_TERNARY_OP + +#define NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP("clamp", clamp) + NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP("threshold", threshold) +#undef 
NVFUSER_PYTHON_BINDING_TERNARY_ABRV1_OP + +#define NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP("add_alpha", add_alpha) + NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP("sub_alpha", sub_alpha) +#undef NVFUSER_PYTHON_BINDING_TERNARY_ABRV2_OP + +#define NVFUSER_PYTHON_BINDING_QUAD_ABRV3_OP(op_str, op_name) \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + TensorView*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); \ + nvf_ops.def_static( \ + op_str, \ + py::overload_cast< \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*, \ + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::op_name), \ + py::return_value_policy::reference); + + NVFUSER_PYTHON_BINDING_QUAD_ABRV3_OP("addcmul", addcmul) +#undef NVFUSER_PYTHON_BINDING_QUAD_ABRV3_OP + + // Reduction Operations + nvf_ops.def_static( + "max", &torch::jit::fuser::cuda::max, py::return_value_policy::reference); + nvf_ops.def_static( + "min", &torch::jit::fuser::cuda::min, 
py::return_value_policy::reference); + nvf_ops.def_static( + "sum", &torch::jit::fuser::cuda::sum, py::return_value_policy::reference); + + // Broadcast operations + nvf_ops.def_static( + "broadcast", + &torch::jit::fuser::cuda::broadcast, + py::return_value_policy::reference); + // TODO: We don't have a way to realize a tensor if the operation creates + // the output of a fusion. + nvf_ops.def_static( + "broadcast_in_dim", + [](TensorView* input, + std::vector& output_shape, + std::vector& broadcast_dims) -> TensorView* { + TORCH_CHECK( + output_shape.size() >= input->nDims(), + "The new shape is expected to be greater-then-or-equal to the input", + output_shape.size(), + input->nDims()); + TORCH_CHECK( + input->nDims() == broadcast_dims.size(), + "The broadcast dimensions should match the input dimensions.", + input->nDims(), + broadcast_dims.size()); + + std::vector is_broadcast_dim(output_shape.size(), true); + for (const auto idx : c10::irange(broadcast_dims.size())) { + if (idx > 0) { + TORCH_CHECK( + broadcast_dims[idx - 1] < broadcast_dims[idx], + "Broadcast dimension is not greater than the previous value."); + } + TORCH_CHECK( + broadcast_dims[idx] < static_cast(output_shape.size()), + "Invalid broadcast_dims value."); + is_broadcast_dim.at(broadcast_dims[idx]) = false; + } + + return torch::jit::fuser::cuda::broadcast(input, is_broadcast_dim); + }, + py::return_value_policy::reference); + + // Cast Operations + nvf_ops.def_static( + "cast", + py::overload_cast( + &torch::jit::fuser::cuda::castOp), + py::return_value_policy::reference); + nvf_ops.def_static( + "cast", + py::overload_cast< + torch::jit::fuser::cuda::DataType, + torch::jit::fuser::cuda::Val*>(&torch::jit::fuser::cuda::castOp), + py::return_value_policy::reference); +} + +} // namespace jit +} // namespace torch + +#else + +namespace torch { +namespace jit { + +void initNvFuserPythonBindings(PyObject* module) {} + +} // namespace jit +} // namespace torch + +#endif // USE_CUDA diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h new file mode 100644 index 000000000000..c5785bc31de3 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.h @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +namespace torch { +namespace jit { +void initNvFuserPythonBindings(PyObject* module); +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/reference_tensor.h b/torch/csrc/jit/codegen/cuda/reference_tensor.h index 2220831dc09f..07c83bb6ed74 100644 --- a/torch/csrc/jit/codegen/cuda/reference_tensor.h +++ b/torch/csrc/jit/codegen/cuda/reference_tensor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/torch/csrc/jit/codegen/cuda/register_interface.cpp b/torch/csrc/jit/codegen/cuda/register_interface.cpp index a3fba4b62975..c89f8c5a7a6a 100644 --- a/torch/csrc/jit/codegen/cuda/register_interface.cpp +++ b/torch/csrc/jit/codegen/cuda/register_interface.cpp @@ -25,10 +25,21 @@ class RegisterInterface { ptr->fn_can_fuse_n = &isFusibleCudaFusionGroup; ptr->fn_insert_profile_inodes = &InsertProfileNodes; ptr->fn_profile_n = &shouldProfileNode; + ptr->fn_skip_n = &skipNodeKind; } }; static RegisterInterface register_interface_; + +class RegisterNVFuserPass { + public: + RegisterNVFuserPass() { + NVFuserPassManager::registerPass(true); + } +}; + +static RegisterNVFuserPass register_nvfuser_pass_; + } // namespace } // namespace cuda diff --git 
a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp index ddb92371baa2..f7d00799e83e 100644 --- a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp +++ b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp @@ -47,8 +47,9 @@ std::unordered_map RootDomainMap:: PairwiseRootDomainMap::PairwiseRootDomainMap( const TensorView* producer, - const TensorView* consumer) - : producer_tv_(producer), consumer_tv_(consumer) { + const TensorView* consumer, + bool is_exact) + : producer_tv_(producer), consumer_tv_(consumer), is_exact_(is_exact) { TORCH_INTERNAL_ASSERT(producer != nullptr); TORCH_INTERNAL_ASSERT(consumer != nullptr); TORCH_INTERNAL_ASSERT(producer->fusion() == consumer->fusion()); @@ -100,6 +101,14 @@ std::unordered_map PairwiseRootDomainMap::map( continue; } + // In exact mapping, do not map broadcast domains with + // non-broadcast domains + if (is_exact_ && producer_id->isBroadcast() != consumer_id->isBroadcast()) { + itc++; + itp++; + continue; + } + IterDomain* map_key_id = producer_id; IterDomain* map_value_id = consumer_id; if (!producer_to_consumer) { @@ -134,9 +143,17 @@ std::unordered_map PairwiseRootDomainMap:: for (const auto i : c10::irange(consumer_root.size())) { IterDomain* map_key_id = producer_root[new2old[i]]; IterDomain* map_value_id = consumer_root[i]; + + // In exact mapping, do not map broadcast domains with + // non-broadcast domains + if (is_exact_ && map_key_id->isBroadcast() != map_value_id->isBroadcast()) { + continue; + } + if (!producer_to_consumer) { std::swap(map_key_id, map_value_id); } + if (root_dims_to_map.find(map_key_id) != root_dims_to_map.end()) { dom_map.insert(std::make_pair(map_key_id, map_value_id)); } @@ -144,10 +161,14 @@ std::unordered_map PairwiseRootDomainMap:: return dom_map; } -std::string toString(const PairwiseRootDomainMap& root_map) { +std::string PairwiseRootDomainMap::toString() const { std::stringstream ss; - ss << "{producer: " << root_map.producer() - << ", consumer: " << root_map.consumer() << "}"; + ss << "{producer: " << producer() << ", consumer: " << consumer(); + auto p2c = mapProducerToConsumer(producer()->domain(), consumer()->domain()); + for (auto pair : p2c) { + ss << ", " << pair.first->toString() << " -> " << pair.second->toString(); + } + ss << "}"; return ss.str(); } @@ -167,23 +188,23 @@ auto ensureMapping( } // namespace -std::string toString(const DomainKey& key) { +std::string DomainKey::toString() const { std::stringstream ss; ss << "{"; - if (key.td()) { - ss << key.td() << " (root: " << key.td()->getRootDomain() - << ", maybe rfactor: " << key.td()->getMaybeRFactorDomain() << ")"; + if (td()) { + ss << td() << " (root: " << td()->getRootDomain() + << ", maybe rfactor: " << td()->getMaybeRFactorDomain() << ")"; } else { ss << "null"; } ss << ", "; - if (key.id()) { - ss << key.id(); + if (id()) { + ss << id(); } else { ss << "null"; } - if (key.concreteId()) { - ss << " (" << key.concreteId() << ")"; + if (concreteId()) { + ss << " (" << concreteId() << ")"; } ss << "}"; return ss.str(); @@ -196,7 +217,7 @@ UnmappableReductionDomains::UnmappableReductionDomains() { namespace { -//! Find all domains that a given domain is depeendent on +//! 
Find all domains that a given domain is dependent on class FindInputDomains : BackwardVisitor { private: FindInputDomains(TensorView* tv, const IterDomain* id) @@ -285,6 +306,19 @@ void UnmappableReductionDomains::handle(ReductionOp* op) { handleReductionOutput(out_tv); } +void UnmappableReductionDomains::handle(GroupedReductionOp* op) { + // Builds a map from reduction domains to consumer domains. + for (auto out : op->outputs()) { + handleReductionOutput(out->as()); + } +} + +void UnmappableReductionDomains::handle(MmaOp* mma) { + // Builds a map from reduction domains to consumer domains. + TensorView* out_tv = mma->out()->as(); + handleReductionOutput(out_tv); +} + void UnmappableReductionDomains::handle(WelfordOp* op) { // Builds a map from reduction domains to consumer domains. handleReductionOutput(op->outAvg()->as()); @@ -446,7 +480,7 @@ bool ComputeAtRootDomainMap::canMap( bool ComputeAtRootDomainMap::canMap( const DomainKey& key_a, const DomainKey& key_b) const { - return key_a == key_b || eq_set_.areEquivalent(key_a, key_b); + return key_a == key_b || eq_set_.permissiveAreMapped(key_a, key_b); } void ComputeAtRootDomainMap::setAlias( @@ -463,10 +497,11 @@ void ComputeAtRootDomainMap::setAlias( } bcast_map_ = tmp_bcast_map; - for (const auto& key : eq_set_.getAllElements()) { + auto all_elements = eq_set_.getAllElements(); + for (const auto& key : all_elements.vector()) { if (key.td() == td) { DomainKey alias_key(td_alias, key.id(), key.concreteId()); - eq_set_.join(key, alias_key); + eq_set_.mapEntries(key, alias_key); } } @@ -485,7 +520,7 @@ std::vector ComputeAtRootDomainMap::getConcretizedKeys( const IterDomain* id) const { DomainKey key(td, id); auto it = bcast_map_.find(key); - TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", toString(key)); + TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", key.toString()); std::vector domains; std::transform( it->second.begin(), @@ -501,7 +536,7 @@ std::unordered_set& ComputeAtRootDomainMap:: getConcretizedDomains(const TensorDomain* td, const IterDomain* id) { DomainKey key(td, id); auto it = bcast_map_.find(key); - TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", toString(key)); + TORCH_INTERNAL_ASSERT(it != bcast_map_.end(), "Not found: ", key.toString()); return it->second; } @@ -548,13 +583,15 @@ std::unordered_map ComputeAtRootDomainMap::map( if (id_map.find(from_id) != id_map.end()) { continue; } - // Matching ID not found. It's an error unless from_id is a new - // broadcast of a consumer domain; or from_id is a window axis of - // a consumer domain. Note that reduction domains are removed from - // the producer root domain. + // Matching ID not found. It's an error unless the following three cases: + // 1. from_id is a new broadcast of a consumer domain; or + // 2. from_id is a window axis of a consumer domain; or + // 3. from_id is a ViewAsScalar domain + // Note that reduction domains are removed from the producer root domain. if (!producer_to_consumer && (new_broadcast_domains_.find(DomainKey(from_td, from_id)) != new_broadcast_domains_.end() || + from_id->getIterType() == IterType::VectorComponent || (window_axes_.count(from_id) > 0))) { continue; } @@ -570,7 +607,7 @@ std::unordered_map ComputeAtRootDomainMap::map( ". Consumer root: ", consumer_root, ". 
Mapping: ", - toString(*this)); + this->toString()); } return id_map; } @@ -578,27 +615,30 @@ std::unordered_map ComputeAtRootDomainMap::map( std::unordered_set ComputeAtRootDomainMap::getMappableDims( const TensorDomain* producer, const TensorDomain* consumer) const { + //! This funciton previously used mapBestEffort but it can fail when + //! a domain is mapped to multitple domains, which can happen with + //! views. Since we only need to find mappable domains, just + //! grab any domain that is mapped in a pairwise way. + const auto& producer_root = producer->getMaybeRFactorDomain(); const auto& consumer_root = consumer->getRootDomain(); - std::unordered_map id_map = - mapBestEffort(producer, producer_root, consumer, consumer_root); - std::unordered_set mappable_ids; - for (auto& from_id : producer_root) { - if (id_map.find(from_id) != id_map.end()) { - mappable_ids.emplace(from_id); - mappable_ids.emplace(id_map.at(from_id)); + for (const auto& p_id : producer_root) { + for (const auto& c_id : consumer_root) { + if (canMap(producer, p_id, consumer, c_id)) { + mappable_ids.emplace(p_id); + mappable_ids.emplace(c_id); + } } } + return mappable_ids; } -std::string toString(const ComputeAtRootDomainMap& root_map) { - std::stringstream ss; - root_map.eq_set_.print(ss); - return ss.str(); +std::string ComputeAtRootDomainMap::toString() const { + return eq_set_.toString(); } ComputeAtRootDomainMapBuilder::ComputeAtRootDomainMapBuilder( @@ -614,9 +654,9 @@ ComputeAtRootDomainMapBuilder::ComputeAtRootDomainMapBuilder( std::stringstream ss; ss << "pending map:\n"; for (auto& kv : pending_map_) { - ss << "\t" << toString(kv.first) << "\n"; + ss << "\t" << kv.first.toString() << "\n"; for (auto& dk : kv.second) { - ss << "\t\t" << toString(dk) << "\n"; + ss << "\t\t" << dk.toString() << "\n"; } } std::cerr << ss.str(); @@ -638,10 +678,14 @@ void ComputeAtRootDomainMapBuilder::initializeBcastMap( return; } - // This initialization should be only used for fusion output tensors and - // outputs of multi-consumer expressions that are not fusion outputs. + // This initialization should be only used for: 1) fusion output + // tensors, 2) outputs of multi-consumer expressions that are not + // fusion outputs, and 3) view outputs as broadcasts can be merged + // with non-broadcast domains, resulting in non-broadcast rfactor + // domains. 
TORCH_INTERNAL_ASSERT( - tv->isFusionOutput() || tv->definition()->outputs().size() > 1, + tv->isFusionOutput() || tv->definition()->outputs().size() > 1 || + tv->isDefinitionType(ExprType::ViewOp), "Invalid tensor to initialize bcast map: t", tv->name()); root_map_.bcast_map_.insert({key, {id}}); @@ -658,7 +702,59 @@ void ComputeAtRootDomainMapBuilder::addToPendingList( void ComputeAtRootDomainMapBuilder::setMapped( const DomainKey& producer, const DomainKey& consumer) { - root_map_.eq_set_.join(producer, consumer); + root_map_.eq_set_.mapEntries(producer, consumer); +} + +void ComputeAtRootDomainMapBuilder::setInvalid( + const DomainKey& key1, + const DomainKey& key2) { + invalid_mappings_.emplace_back(key1, key2); +} + +bool ComputeAtRootDomainMapBuilder::isInvalid( + const std::vector& domains) const { + // First, collect all invalid mappings for each of the keys in domains + DomainKeyMap invalid_key_map; + for (const auto& key : domains) { + DomainKeySet invalid_keys; + for (const auto& invalid_pair : invalid_mappings_) { + if (root_map_.canMap(key, invalid_pair.first)) { + invalid_keys.insert(invalid_pair.second); + } else if (root_map_.canMap(key, invalid_pair.second)) { + invalid_keys.insert(invalid_pair.first); + } + } + invalid_key_map.emplace(key, invalid_keys); + } + + // Next, check if any pair is invalid to map. + const auto num_keys = domains.size(); + for (const auto i : c10::irange(num_keys)) { + const auto& key_i = domains[i]; + // If no invalid keys found for key_i, it can be skipped. + const auto invalid_key_map_it = invalid_key_map.find(key_i); + if (invalid_key_map_it == invalid_key_map.end()) { + continue; + } + + // Set of keys that are invalid to be mapped with key_i. + const DomainKeySet& invalid_keys_for_i = invalid_key_map_it->second; + + // If any other key in domains is identified mappable with any of + // the keys in this set, the mapping with key_i is invalid. 
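As a reading aid, the check described in the comment above (and implemented in the loop that follows) can be restated compactly outside the nvfuser classes. A toy Python model, assuming a caller-supplied can_map predicate in place of ComputeAtRootDomainMap::canMap and plain strings in place of DomainKey; this illustrates the control flow only and is not code from the PR:

    def is_invalid(domains, invalid_pairs, can_map):
        # For each key, collect everything it is forbidden to be mapped with,
        # going through the recorded invalid pairs.
        invalid_key_map = {}
        for key in domains:
            forbidden = set()
            for a, b in invalid_pairs:
                if can_map(key, a):
                    forbidden.add(b)
                elif can_map(key, b):
                    forbidden.add(a)
            invalid_key_map[key] = forbidden
        # Mapping the whole set is invalid if any later key reaches one of the
        # keys forbidden for an earlier key.
        for i, key_i in enumerate(domains):
            forbidden_i = invalid_key_map[key_i]
            for key_j in domains[i + 1:]:
                if any(can_map(key_j, x) for x in forbidden_i):
                    return True
        return False

    # e.g. is_invalid(["a", "b"], [("a", "b")], lambda x, y: x == y) returns True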
+ for (const auto j : c10::irange(i + 1, num_keys)) { + const auto& key_j = domains[j]; + if (std::any_of( + invalid_keys_for_i.begin(), + invalid_keys_for_i.end(), + [&](const auto& invalid_key_for_i) { + return root_map_.canMap(key_j, invalid_key_for_i); + })) { + return true; + } + } + } + return false; } void ComputeAtRootDomainMapBuilder::setMaybeMapped( @@ -693,7 +789,7 @@ void ComputeAtRootDomainMapBuilder::setMaybeMapped( TORCH_INTERNAL_ASSERT( !consumer_id->isBroadcast(), "No concrete domain found for a broadcast domain: ", - toString(consumer_key)); + consumer_key.toString()); auto producer_concrete_key = producer_key; if (producer_id->isBroadcast()) { const auto concrete_id = consumer_id; @@ -730,10 +826,10 @@ void ComputeAtRootDomainMapBuilder::mapPointwiseOrReductionOp(Expr* e) { // Record equalities from output to all the inputs // ignores un-concretizable broadcasts - for (auto* i : ir_utils::filterByType(e->inputs())) { - const TensorDomain* in_td = i->domain(); + for (auto* in_tv : ir_utils::filterByType(e->inputs())) { + const TensorDomain* in_td = in_tv->domain(); std::vector in_root = - TensorDomain::noReductions(i->getMaybeRFactorDomain()); + TensorDomain::noReductions(in_tv->getMaybeRFactorDomain()); TORCH_INTERNAL_ASSERT( in_root.size() == out_root.size(), "\nExpression: ", @@ -745,7 +841,10 @@ void ComputeAtRootDomainMapBuilder::mapPointwiseOrReductionOp(Expr* e) { for (const auto it : c10::irange(in_root.size())) { if (e->outputs().size() > 1) { TORCH_INTERNAL_ASSERT( - e->isA(), "Only supported multioutput op is welford"); + e->isA() || e->isA(), + "Multi-output mapping assumes WelfordOp or GroupedReductionOp, but ", + e->getExprType().value(), + " was found"); for (auto o : e->outputs()) { auto o_tv = o->as(); auto o_td = o_tv->domain(); @@ -807,6 +906,36 @@ void ComputeAtRootDomainMapBuilder::handle(BroadcastOp* op) { } } +void ComputeAtRootDomainMapBuilder::handle(ViewAsScalar* op) { + const TensorView* out_tv = op->output(0)->as(); + const TensorDomain* out_td = out_tv->domain(); + const auto& out_root = out_td->getRootDomain(); + + const TensorView* in_tv = op->input(0)->as(); + const TensorDomain* in_td = in_tv->domain(); + + std::vector in_root = + TensorDomain::noReductions(in_tv->getMaybeRFactorDomain()); + TORCH_INTERNAL_ASSERT( + in_root.size() + 1 == out_root.size(), + "\nExpression: ", + op, + "\nInput root domain: ", + in_root, + "\nOutput root domain: ", + out_root); + auto in_it = in_root.begin(); + auto out_it = out_root.begin(); + while (in_it != in_root.end() && out_it != out_root.end()) { + setMaybeMapped(in_td, *in_it, out_td, *out_it); + ++in_it; + ++out_it; + } + TORCH_INTERNAL_ASSERT( + (*out_it)->isVectorComponent(), + "The last dim of ViewDtypeOp's output must be a ViewAsScalar"); +} + void ComputeAtRootDomainMapBuilder::handle(TransposeOp* op) { const TensorDomain* in_td = op->in()->as()->domain(); std::vector in_root = @@ -843,37 +972,77 @@ void ComputeAtRootDomainMapBuilder::handle(GatherOp* op) { } } -bool ComputeAtRootDomainMapBuilder::mapAllConsumers( - const DomainKey& producer_key) { - auto it = pending_map_.find(producer_key); +void ComputeAtRootDomainMapBuilder::mapAllPendingMappings( + const DomainKey& key) { + auto it = pending_map_.find(key); if (it == pending_map_.end()) { - return false; + return; } - const auto& consumer_set = it->second; + const auto& pending_set = it->second; // All entries in key_set must be equivalent with each other. 
- TORCH_INTERNAL_ASSERT(consumer_set.size() > 0); - bool consistent = safeToMap(consumer_set); - if (consistent) { - for (const auto pending_consumer : consumer_set) { - setMapped(producer_key, pending_consumer); + TORCH_INTERNAL_ASSERT(pending_set.size() > 0); + bool consistent = safeToMap(pending_set); + for (const auto pending_key : pending_set) { + if (consistent) { + setMapped(key, pending_key); + } else { + setInvalid(key, pending_key); } } // This entry should never be used again, so remove it. pending_map_.erase(it); - return consistent; +} + +void ComputeAtRootDomainMapBuilder::mapAllPendingMappings( + const TensorDomain* td, + IterDomain* id) { + if (id->isBroadcast()) { + for (const auto& key : root_map_.getConcretizedKeys(td, id)) { + mapAllPendingMappings(key); + } + } else { + mapAllPendingMappings(DomainKey(td, id)); + } } void ComputeAtRootDomainMapBuilder::handle(TensorView* tv) { const TensorDomain* td = tv->domain(); - const auto root = TensorDomain::noReductions(td->getMaybeRFactorDomain()); - for (auto id : root) { + const auto rfactor = TensorDomain::noReductions(td->getMaybeRFactorDomain()); + for (auto id : rfactor) { if (id->isBroadcast()) { initializeBcastMap(tv, id); - for (const auto& key : root_map_.getConcretizedKeys(td, id)) { - mapAllConsumers(key); + } + mapAllPendingMappings(td, id); + } + + // When tv has a rfactor domain, propagate the domain mappings from + // each of the rfactor axes to the dependent root axes. + if (td->hasViewLikeRFactor()) { + std::unordered_set root_set( + {td->getRootDomain().begin(), td->getRootDomain().end()}); + for (auto rf_id : rfactor) { + if (!rf_id->isRFactorProduct()) { + continue; + } + auto dep = DependencyCheck::getAllValsBetween(root_set, {rf_id}); + for (auto id : ir_utils::filterByType(dep)) { + if (root_set.find(id) == root_set.end() || rf_id == id) { + continue; + } + setMaybeMapped(td, id, td, rf_id); } - } else { - mapAllConsumers(DomainKey(td, id)); + } + // Once mappings for rfactor axes are propagated to root axes, + // aggregates them at each root axis + for (auto id : tv->getRootDomain()) { + if (id->isBroadcast()) { + // There can be broadcast domains that appear at root domains but + // are removed at rfactor domains as they are merged into + // non-reduction domains. Initialize the map for those broadcast + // domains. 
+ initializeBcastMap(tv, id); + } + mapAllPendingMappings(td, id); } } } @@ -931,9 +1100,90 @@ bool ComputeAtRootDomainMapBuilder::safeToMap(const DomainKeySet& domains) { !map_through_reduction_) { return false; } + // Make sure mapping these domains won't cause any invalid mapping + if (isInvalid(unique_domains)) { + return false; + } return true; } +namespace { +class ExactRootDomainMapBuilder : private IterVisitor { + public: + ExactRootDomainMapBuilder( + Fusion* fusion, + DisjointSets& eq_sets) + : eq_sets_(eq_sets) { + traverseFrom(fusion, fusion->outputs()); + } + + private: + using IterVisitor::handle; + + void handle(Expr* expr) final { + for (auto producer : ir_utils::filterByType(expr->inputs())) { + for (auto consumer : + ir_utils::filterByType(expr->outputs())) { + PairwiseRootDomainMap pwise_map(producer, consumer, true); + const auto mappings = pwise_map.mapProducerToConsumer( + producer->domain(), consumer->domain()); + for (const auto& mapping : mappings) { + eq_sets_.mapEntries(mapping.first, mapping.second); + } + } + } + } + + private: + DisjointSets& eq_sets_; +}; + +} // namespace + +ExactRootDomainMap::ExactRootDomainMap(Fusion* fusion) { + ExactRootDomainMapBuilder builder(fusion, eq_sets_); +} + +bool ExactRootDomainMap::areMapped( + const IterDomain* id_a, + const IterDomain* id_b) const { + return eq_sets_.strictAreMapped(id_a, id_b); +} + +std::unordered_map ExactRootDomainMap::map( + const TensorDomain* producer, + const TensorDomain* consumer, + const std::unordered_set& root_dims_to_map, + bool producer_to_consumer) const { + const auto& producer_root = + TensorDomain::noReductions(producer->getMaybeRFactorDomain()); + const auto& consumer_root = consumer->getRootDomain(); + const auto& from_ids = producer_to_consumer ? producer_root : consumer_root; + const auto& to_ids = producer_to_consumer ? consumer_root : producer_root; + + std::unordered_map id_map; + + for (auto& from_id : from_ids) { + if (root_dims_to_map.find(from_id) == root_dims_to_map.end()) { + continue; + } + for (const auto& to_id : to_ids) { + if (areMapped(from_id, to_id)) { + TORCH_INTERNAL_ASSERT( + id_map.insert({from_id, to_id}).second, + "Multiple matching ID detected for ", + from_id); + } + } + } + + return id_map; +} + +std::string ExactRootDomainMap::toString() const { + return eq_sets_.toString(); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.h b/torch/csrc/jit/codegen/cuda/root_domain_map.h index 23ada0fb1201..2054e3272686 100644 --- a/torch/csrc/jit/codegen/cuda/root_domain_map.h +++ b/torch/csrc/jit/codegen/cuda/root_domain_map.h @@ -5,7 +5,7 @@ #include #include -#include +#include namespace torch { namespace jit { @@ -82,7 +82,8 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { //! \param consumer The consumer tensor of a producer-consumer pair. explicit PairwiseRootDomainMap( const TensorView* producer, - const TensorView* consumer); + const TensorView* consumer, + bool is_exact = false); const TensorView* producer() const { return producer_tv_; @@ -92,6 +93,8 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { return consumer_tv_; } + std::string toString() const; + protected: std::unordered_map map( const TensorDomain* producer, @@ -108,10 +111,10 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { private: const TensorView* producer_tv_ = nullptr; const TensorView* consumer_tv_ = nullptr; + //! 
If true, does not map broadcast IDs with non-broadcast IDs + const bool is_exact_ = false; }; -std::string toString(const PairwiseRootDomainMap& root_map); - //! Represents an iteration domain of a TensorDomain. Only used for //! root domain mapping. //! @@ -143,14 +146,14 @@ class DomainKey { concreteId() == other.concreteId(); } + std::string toString() const; + private: const TensorDomain* td_ = nullptr; const IterDomain* id_ = nullptr; const IterDomain* concrete_id_ = nullptr; }; -std::string toString(const DomainKey& key); - struct DomainKeyHash { std::size_t operator()(const DomainKey& key) const { return std::hash{}(key.td()) ^ @@ -186,7 +189,9 @@ class TORCH_CUDA_CU_API UnmappableReductionDomains : private IterVisitor { private: using IterVisitor::handle; void handle(ReductionOp* op) override; + void handle(GroupedReductionOp* op) override; void handle(WelfordOp* op) override; + void handle(MmaOp* op) override; void handleReductionOutput(TensorView* out_tv); @@ -204,9 +209,14 @@ class TORCH_CUDA_CU_API UnmappableReductionDomains : private IterVisitor { //! example: //! T2 [i0,i1] = T1[i2,i3] + T0[i4,i5] //! This will create mappings between i0, i2 and i4. +//! +//! Note that with views, there can be multiple domains mapped to +//! the same domain. Thus, obtaining one-to-one maps can +//! fail. Currently, the only use of this class is getMappableDims, +//! which just grabs any domain that is mappable, which works +//! regardless of whether view is used. class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { friend class ComputeAtRootDomainMapBuilder; - friend std::string toString(const ComputeAtRootDomainMap&); public: //! Builds a mapping table by analyzing the current @@ -252,7 +262,11 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { //! be a producer-consumer pair. Since they may not be a //! producer-consumer pair, this function requires proper root //! domains, which may be root or rfactor domains. Also, no error - //! check is done as we do not assume producer-consumer relationship. + //! check is done as we do not assume producer-consumer + //! relationship. + //! + //! Note that an exception is thrown when a domain is found to be + //! mapped to multiple domains, which can happen with views. //! //! \param from_td A TensorDomain from which a map is created //! \param from_root A root domain of from_td @@ -283,8 +297,8 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { const TensorDomain* td_b, const IterDomain* id_b) const; - //! Returns if key_a and key_b are mapped to eachother (equivalent), or are - //! the same key. + //! Returns if key_a and key_b are mapped to each other (equivalent), or are + //! the same key. Returns false if two keys are not known to be mapped. bool canMap(const DomainKey& key_a, const DomainKey& key_b) const; //! Returns the set of (non-broadcast) DomainKeys that id in td is @@ -312,9 +326,11 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { const std::unordered_set& root_dims_to_map, bool producer_to_consumer) const override; + std::string toString() const; + private: //! Disjoint set of all mapped keys to determine axes equivalency - DisjointSet eq_set_; + DisjointSets eq_set_; //! All IterDomains in the mapping that are a broadcast ID DomainKeyMap> bcast_map_; @@ -327,12 +343,10 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { std::unordered_set window_axes_; }; -std::string toString(const ComputeAtRootDomainMap& root_map); - -//! 
Create a DisjointSet of root IterDomains by traversing the +//! Create a DisjointSets of root IterDomains by traversing the //! current fusion entirely. IterDomains that can be mapped each //! other with computeAt are grouped into the same subset in the -//! DisjointSet. +//! DisjointSets. class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder : private BackwardVisitor { public: @@ -347,6 +361,12 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder //! Set a pair of producer-consumer domain keys as mappable void setMapped(const DomainKey& producer, const DomainKey& consumer); + //! Records two domains are invalid to map + void setInvalid(const DomainKey& key1, const DomainKey& key2); + + //! Check if no pair of domains is invalid to map + bool isInvalid(const std::vector& domains) const; + //! Track a pair of producer-consumer domains as potentially mappable. Inserts //! entries into pending_map_, but does not add anything into the root_map_ //! (added when handle is called on a TensorView). Maybe mapped will, however, @@ -383,10 +403,18 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder mapPointwiseOrReductionOp(op); } + void handle(GroupedReductionOp* op) override { + mapPointwiseOrReductionOp(op); + } + void handle(WelfordOp* wop) override { mapPointwiseOrReductionOp(wop); } + void handle(MmaOp* wop) override { + mapPointwiseOrReductionOp(wop); + } + void handle(ShiftOp* op) override { mapPointwiseOrReductionOp(op); } @@ -395,6 +423,8 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder mapPointwiseOrReductionOp(op); } + void handle(ViewAsScalar* op) override; + void handle(BroadcastOp* op) override; void handle(TransposeOp* op) override; @@ -403,11 +433,15 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder void handle(TensorView* tv) override; - //! Maps all consumers with a producer. + //! Maps all pending mappings. //! This is called for each of TensorViews in a backward traversal, //! recursively building mappings from the output tensors to the //! input tensors. - bool mapAllConsumers(const DomainKey& producer_key); + void mapAllPendingMappings(const DomainKey& key); + + //! Maps all pending mappings for id of td. When id is a broadcast, + //! mapping is done separately for each concrete domain. + void mapAllPendingMappings(const TensorDomain* td, IterDomain* id); bool hasMatchingDomains(const std::vector& unique_domains); @@ -415,16 +449,40 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder private: ComputeAtRootDomainMap& root_map_; - //! Keep track of what we want to try and map. Set in attemptToProveId. + //! Keep track of what we want to try and map DomainKeyMap pending_map_; std::unordered_set visited_; + //! Helper class to find invalid mappings due to reductions UnmappableReductionDomains incompatible_domains_; + //! Running vector of domain pairs that are invalid to map + std::vector> invalid_mappings_; //! Disable UnmappableReductions check, should //! always be false for compute_at use cases bool map_through_reduction_ = false; }; +//! Maps root domains of an entire fusion. Does not map broadcast +//! domains with non-broadcast domains. 
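As a rough, self-contained sketch of the idea behind this kind of mapping (not the actual nvfuser DisjointSets implementation), a minimal union-find can back the two operations the patch relies on: mapEntries() unions the sets containing two entries, and strictAreMapped() reports whether two entries ended up in the same set. The class name SimpleDisjointSets and its internals are illustrative only; T stands in for a pointer type such as const IterDomain*.

#include <unordered_map>

template <typename T>
class SimpleDisjointSets {
 public:
  // Union the sets containing a and b, creating singleton sets as needed.
  void mapEntries(T a, T b) {
    link(find(a), find(b));
  }

  // True only when both entries are known and share a representative.
  bool strictAreMapped(T a, T b) {
    if (parent_.count(a) == 0 || parent_.count(b) == 0) {
      return false;
    }
    return find(a) == find(b);
  }

 private:
  // Find the representative of x, inserting x as its own set if unseen.
  T find(T x) {
    auto it = parent_.find(x);
    if (it == parent_.end()) {
      parent_[x] = x;
      return x;
    }
    // Follow the parent chain up to the self-parented representative.
    while (it->second != it->first) {
      it = parent_.find(it->second);
    }
    return it->first;
  }

  // Merge two representatives by pointing one at the other.
  void link(T ra, T rb) {
    if (ra != rb) {
      parent_[ra] = rb;
    }
  }

  std::unordered_map<T, T> parent_;
};

For instance, in the T2 [i0,i1] = T1[i2,i3] + T0[i4,i5] example above, after mapEntries(i2, i0) and mapEntries(i0, i4), any pair of i0, i2 and i4 reports strictAreMapped() == true, which is the kind of equivalence query the map() overrides resolve per pair of root domains.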
+class TORCH_CUDA_CU_API ExactRootDomainMap : public RootDomainMap { + public: + ExactRootDomainMap(Fusion* fusion); + + bool areMapped(const IterDomain* id_a, const IterDomain* id_b) const; + + std::string toString() const; + + protected: + std::unordered_map map( + const TensorDomain* producer, + const TensorDomain* consumer, + const std::unordered_set& root_dims_to_map, + bool producer_to_consumer) const override; + + private: + DisjointSets eq_sets_; +}; + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/runtime/array.cu b/torch/csrc/jit/codegen/cuda/runtime/array.cu new file mode 100644 index 000000000000..2f06ddd92e18 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/array.cu @@ -0,0 +1,231 @@ +// aligned register array for vectorized load/store +template +struct alignas(sizeof(scalar_t) * align_size) Array { + scalar_t array[size]; + + __device__ void set(scalar_t v) { +#pragma unroll + for (int i = 0; i < size; ++i) { + array[i] = v; + } + } + + __device__ scalar_t& operator[](const unsigned int i) { + return array[i]; + } +}; + +// Used for vectorized allocations that are not in registers +template +__device__ void arraySet(scalar_t* buff, scalar_t val) { +#pragma unroll + for (int i = 0; i < vec_size; ++i) { + buff[i] = val; + } +} + +template +__device__ void loadGeneric(scalar_t* to, scalar_t* from) { + // It would be really nice to use memcpy here, but one example was failing + // with: + // + // memcpy(to, from, vec_size * sizeof(scalar_t)); + // + // Yet passing with: + // + // for(int i = 0; i < vec_size; i++){ + // to[i] = from[i]; + // } + + switch (sizeof(scalar_t) * vec_size) { + case 1: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 2: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 4: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 8: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 12: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + case 16: + *reinterpret_cast(to) = *reinterpret_cast(from); + break; + } +} + +// Volatile version only works with c++ fundamental types +template < + typename scalar_t, + int vec_size, + bool is_volatile_to, + bool is_volatile_from> +__device__ void loadGenericVolatile( + typename MaybeVolatile::type* to, + typename MaybeVolatile::type* from) { + switch (sizeof(scalar_t) * vec_size) { + // Reinterpret cast like this with volatile types only works for C++ + // fundamental types otherwise the = operator is not defined + case 1: + *reinterpret_cast< + typename MaybeVolatile::type*>(to) = + *reinterpret_cast< + typename MaybeVolatile::type*>( + from); + break; + case 2: + *reinterpret_cast::type*>( + to) = + *reinterpret_cast< + typename MaybeVolatile::type*>(from); + break; + case 4: + *reinterpret_cast< + typename MaybeVolatile::type*>(to) = + *reinterpret_cast< + typename MaybeVolatile::type*>( + from); + break; + case 8: + *reinterpret_cast::type*>( + to) = + *reinterpret_cast< + typename MaybeVolatile::type*>(from); + break; + } +} + +template +__device__ void loadLocalToGlobal( + typename MaybeVolatile::type* to, + scalar_t* from) { + switch (sizeof(scalar_t) * vec_size) { + case 1: + case 2: + case 4: + loadGenericVolatile(to, from); + break; + case 8: { + uint2 const& data = *reinterpret_cast(from); + if (is_volatile) { + asm volatile( + "st.volatile.global.v2.s32 [%0], {%1,%2};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y)); + } else { + asm volatile( 
"st.global.cs.v2.s32 [%0], {%1,%2};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y)); + } + break; + } + case 16: { + uint4 const& data = *reinterpret_cast(from); + if (is_volatile) { + asm volatile( + "st.volatile.global.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y), + "r"(data.z), + "r"(data.w)); + } else { + asm volatile( + "st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"( + (typename MaybeVolatile::type*)to), + "r"(data.x), + "r"(data.y), + "r"(data.z), + "r"(data.w)); + } + break; + } + } +} + +template +__device__ void loadGlobalToLocal( + scalar_t* to, + typename MaybeVolatile::type* from) { + switch (sizeof(scalar_t) * vec_size) { + case 1: + case 2: + case 4: + loadGenericVolatile(to, from); + break; + case 8: { + if (is_volatile) { + uint2& data = *reinterpret_cast(to); + asm volatile("ld.volatile.global.v2.s32 {%0,%1}, [%2];" + : "=r"(data.x), "=r"(data.y) + : "l"((uint2*)from)); + break; + } else { + uint2& data = *reinterpret_cast(to); + asm volatile("ld.global.cs.v2.s32 {%0,%1}, [%2];" + : "=r"(data.x), "=r"(data.y) + : "l"((uint2*)from)); + } + break; + } + case 16: { + if (is_volatile) { + uint4& data = *reinterpret_cast(to); + asm volatile("ld.volatile.global.v4.s32 {%0,%1,%2,%3}, [%4];" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"((uint4*)from)); + } else { + uint4& data = *reinterpret_cast(to); + asm volatile("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"((uint4*)from)); + } + break; + } + } +} + +template < + typename scalar_t, + int vec_size, + bool is_volatile_to, + bool is_volatile_from> +__device__ void loadGlobalToGlobal( + typename MaybeVolatile::type* to, + typename MaybeVolatile::type* from) { + switch (sizeof(scalar_t) * vec_size) { + // Reinterpret cast like this with volatile types only works for C++ + // fundamental types otherwise the = operator is not defined + case 1: + case 2: + case 4: + case 8: + loadGenericVolatile( + to, from); + break; + case 12: { + uint3 local_intermediate; + loadGlobalToLocal( + reinterpret_cast(&local_intermediate), from); + loadLocalToGlobal( + to, reinterpret_cast(&local_intermediate)); + break; + } + case 16: { + uint4 local_intermediate; + loadGlobalToLocal( + reinterpret_cast(&local_intermediate), from); + loadLocalToGlobal( + to, reinterpret_cast(&local_intermediate)); + break; + } + } +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu b/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu index ed366132689d..fcbc98e7818c 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu @@ -41,10 +41,8 @@ __device__ void sync() { // threads have incremented the counter. 
while (local_sync_counter < next && old < local_sync_counter) { #if __CUDA_ARCH__ >= 700 - __nanosleep(backoff); -#else - // __nanosleep is not available for sm < 70 - assert(false); + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(backoff); // avoids busy waiting #endif if (backoff < backoff_max) { backoff *= 2; diff --git a/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu b/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu index 4bd402e84c60..46564c981f18 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu @@ -31,13 +31,24 @@ __device__ float __half2float(const __half h) { return val; } -// aligned vector generates vectorized load/store on CUDA -template -struct alignas(sizeof(scalar_t) * vec_size) Array { - scalar_t val[vec_size]; - __device__ void set(scalar_t v) { - for (int i = 0; i < vec_size; ++i) { - val[i] = v; - } - } -}; +__device__ __half __double2half(const double d) { +#if __CUDA_ARCH__ >= 700 + __half val; + asm("{ cvt.rn.f16.f64 %0, %1;}\n" + : "=h"(__NVFUSER_HALF_TO_US(val)) + : "d"(d)); + return val; +#else + return __float2half(static_cast(d)); +#endif +} + +__device__ double __half2double(const __half h) { +#if __CUDA_ARCH__ >= 700 + double val; + asm("{ cvt.f64.f16 %0, %1;}\n" : "=d"(val) : "h"(__NVFUSER_HALF_TO_CUS(h))); + return val; +#else + return static_cast(__half2float(h)); +#endif +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu new file mode 100644 index 000000000000..6fd6f398eb06 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/fused_reduction.cu @@ -0,0 +1,1081 @@ +namespace fused_reduction { + +// We have 6 dimensions, 3 in the grid, 3 in the block +// They can be 1 of 3 states, +// Reduction Domain - TEMPLATE STATE 0 +// - Participating in the reduction, has values coming in, one value coming +// out across the dimension +// Iteration Domain - TEMPLATE STATE 1 +// - Not participating in the reduction, has values across the dimension after +// the reduction +// Collapsed Domain - TEMPLATE STATE 2 +// - Previously reduced, doesn't need to be reduced on that dimension, doesn't +// have values across that dimension +constexpr __device__ bool isReduce(int STATE) { + return STATE == 0; +} + +constexpr __device__ bool isIter(int STATE) { + return STATE == 1; +} + +constexpr __device__ bool isPred(int STATE) { + return STATE == 2; +} + +constexpr __device__ bool inactive(int STATE) { + return STATE == 3; +} + +constexpr __device__ bool activeNotIter(int STATE) { + return STATE != 3 && STATE != 1; +} + +// When generating an index into the reduction, we have to stride by iteration +// domains and reduction domains. Collapsed domains we can ignore, but we need +// to make sure they never read or write (need to be predicated to correct +// participation). + +// All inclusive reduction with option to re-broadcast. This reduction class +// does not use predication of parallelization in the read or write predicates. +// Instead there are 3 states each dimension of parallelization can have, +// described above. Predication, indexing, and reduction will be done based on +// this information. 
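Before the ParallelReduce class itself, the core shared-memory block reduction it builds on may be easier to follow in isolation. The following is a minimal sketch only (sum over threadIdx.x, hypothetical helper name blockSumX, shared_buf assumed to hold at least blockDim.x floats); the real code below generalizes the same round-down-to-a-power-of-two-then-tree-reduce pattern to arbitrary thread and block dimensions, arbitrary reduction ops, and tuples of values.

// Minimal sketch of the shared-memory tree reduction pattern used below.
__device__ float blockSumX(float in, float* shared_buf) {
  const int tid = threadIdx.x;
  const int size = blockDim.x;
  shared_buf[tid] = in;
  __syncthreads();

  // Round the reduction size down to the nearest power of two.
  const int np2 = 1 << (31 - __clz(size));

  // Fold the tail [np2, size) into [0, np2) so the tree is regular.
  if (tid < np2 && tid + np2 < size) {
    shared_buf[tid] += shared_buf[tid + np2];
  }
  __syncthreads();

  // Standard tree reduction down to two elements.
  for (int factor = np2 / 2; factor > 1; factor >>= 1) {
    if (tid < factor) {
      shared_buf[tid] += shared_buf[tid + factor];
    }
    __syncthreads();
  }

  // Thread 0 accumulates the last pair; other threads keep their input.
  float result = in;
  if (tid == 0) {
    result = shared_buf[0];
    if (np2 > 1) {
      result += shared_buf[1];
    }
  }
  __syncthreads(); // forward-protect shared_buf before any reuse
  return result;
}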
+template < + int X_BLOCK, + int Y_BLOCK, + int Z_BLOCK, + int X_THREAD, + int Y_THREAD, + int Z_THREAD, + bool PERSISTENT_REDUCTION, + bool BROADCAST> +class ParallelReduce { + static constexpr bool BLOCK_REDUCE = + isReduce(X_THREAD) || isReduce(Y_THREAD) || isReduce(Z_THREAD); + + static constexpr bool GRID_REDUCE = + isReduce(X_BLOCK) || isReduce(Y_BLOCK) || isReduce(Z_BLOCK); + + // ping-pong between global buffers to avoid a second sync + bool flip = false; + + public: + __device__ ParallelReduce() {} + + template + __device__ __inline__ void reduce( + RefTuple out, + const ConstRefTuple& inp, + VolatilePtrTuple global_work_buffer, + int64_t* global_sync_buffer, // Allocated as product of all + // non-participating Grid dimension + PtrTuple shared_buf, + bool read_pred, // Prevent reading from out of bounds memory + bool write_pred, // Prevent from writing out of bounds + const LocalTuple& init_val, + Func reduction_op) { + // If no reduction needed, just return input + if (!BLOCK_REDUCE && !GRID_REDUCE) { + if (read_pred && write_pred) { + out = inp; + } + return; + } + + // Don't read/write in temporary buffers if in a predicated dimension + bool block_reduce_participate = index_utils:: + maskedIsZero( + threadIdx); + + // Initialize block result + LocalTuple block_result = init_val; + + // Grab input data if participating in the reduction, set to block_result in + // the case there is no block reduction + if (block_reduce_participate && read_pred) { + block_result = inp; + } + + // Only threads that with id == 0 in the dimensions being reduced will + // have a valid result + bool has_block_result = index_utils::maskedIsZero< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx); + + if (BLOCK_REDUCE) { + // -- START BLOCK REDUCTION -- // + + // Size of the block reduction segment, can be an int since it's limited + // to number of threads + int block_reduction_size = index_utils::maskedSize< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(blockDim); + + // Index in the reduction segment, can be an int since it's limited to + // number of threads + int tid_in_block_reduction = index_utils::maskedOffset< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx, blockDim); + + // ID of the block reduction this thread is participating in + // + // If any of the parallel dimensions are predicated out, that means + // they've already been reduced, so we only care about the first thread in + // that dimension. 
Therefore don't expand the reduction_idx by that + // dimension + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Shared memory buffer is 2D + // [iter dimension, reduction dimension] + + // Offset into smem for the current thread + int block_reduce_smem_offset = + block_reduction_idx * block_reduction_size + tid_in_block_reduction; + + // Initialize shared memory + if (block_reduce_participate) { + copyTuple(shared_buf, block_reduce_smem_offset, block_result); + } + + // Sync to make sure smem is completely initialized + block_sync::sync(); + + // Round reduction size down to nearest power of 2 + int np2 = 1 << (31 - __clz(block_reduction_size)); + + // Perform an initial reduction leaving np2 elements + if (block_reduce_participate && tid_in_block_reduction < np2 && + tid_in_block_reduction + np2 < block_reduction_size) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + np2, + reduction_op); + } + + // Always need to sync while operating on shared memory + block_sync::sync(); + + // Reduce down until 2 values, leaving 2 values allows us to manually + // perform the last reduction and avoid a syncthreads + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction < factor && block_reduce_participate) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // Accumulate that last valid result + if (has_block_result) { + copyTuple(block_result, shared_buf, block_reduce_smem_offset); + if (block_reduction_size > 1) { + reduce( + block_result, + 0, + shared_buf, + block_reduce_smem_offset + 1, + reduction_op); + } + } + + // ===== BLOCK REDUCTION CLEANUP ======= + if (!GRID_REDUCE) { + // If no grid reduction, we don't have to continue. Either broadcast + // back across the block or return the correct reduction + if (has_block_result && write_pred) { + reduce(block_result, 0, out, 0, reduction_op); + out = block_result; + } + if (BROADCAST) { + // No grid reduce, but need to broadcast, perform block broadcast + if (has_block_result && write_pred) { + // Put result back in shared memory, put in the first entry of the + // reduction segment's buffer + copyTuple( + shared_buf, + block_reduction_idx * block_reduction_size, + block_result); + } + + // Sync threads to make sure result is in smem + block_sync::sync(); + // If the thread is participating, and is not attempting to write out + // of bounds, return the broadcasted value. + if (block_reduce_participate && write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size); + } + } + + // Forward protect shared memory, don't want threads to continue to + // another reduction/broadcast and pollute shared memory before the + // reduction is completely finished. + // + // This could be avoided in some cases if we added thread syncs from + // block reductions in the syncthread insertion pass. + block_sync::sync(); + return; + } + } + + // -- START GRID REDUCTION -- // + // Grid reductions are more challenging for two reasons, (1) the reduction + // itself is 3D instead of 2D because we now have an iter domain space in + // the grid dimension. (2) a tree reduction isn't performed, instead all + // blocks will populate GMEM and one block will finish the grid reduction. 
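As a standalone illustration of that global-memory step (every block publishes a partial result, and the block that arrives last finishes the reduction), a simplified 1D float-sum version might look like the sketch below. The kernel name gridSumSketch and its layout are hypothetical; work_buf needs gridDim.x entries, sync_flag must be zero-initialized, and the real code here additionally handles iteration dimensions, tuple values, predication, and persistent double buffering.

// Sketch of the "all blocks publish, the last block finishes" pattern.
__global__ void gridSumSketch(
    const float* in,
    int n,
    float* work_buf,          // gridDim.x entries
    unsigned int* sync_flag,  // zero-initialized arrival counter
    float* out) {
  // Per-block partial; a shared-memory atomic stands in for a proper
  // block reduction here.
  __shared__ float block_partial;
  __shared__ bool is_last_block;
  if (threadIdx.x == 0) {
    block_partial = 0.f;
  }
  __syncthreads();
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    atomicAdd(&block_partial, in[idx]);
  }
  __syncthreads();

  // Publish the partial result and count arrivals.
  if (threadIdx.x == 0) {
    work_buf[blockIdx.x] = block_partial;
    __threadfence(); // make the partial visible before signaling arrival
    const unsigned int arrived = atomicAdd(sync_flag, 1u);
    is_last_block = (arrived == gridDim.x - 1);
  }
  __syncthreads();

  // Only the block that arrived last reduces the per-block partials.
  if (is_last_block && threadIdx.x == 0) {
    float sum = 0.f;
    for (unsigned int i = 0; i < gridDim.x; ++i) {
      sum += work_buf[i];
    }
    *out = sum;
  }
}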
+ + // What is the grid reduction size, block reduction already performed so + // that doesn't have to be taken into consideration + const auto grid_red_size = index_utils:: + maskedSize( + gridDim); + + // Which ID in the reduction is this block. Threads can participate in + // multiple grid reductions, but the block will have the same relative index + // in those reductions + const auto idx_in_grid_red = index_utils:: + maskedOffset( + blockIdx, gridDim); + + if (PERSISTENT_REDUCTION && flip) { + auto global_buffer_size = + index_utils:: + maskedSize( + gridDim) * + grid_red_size; + global_work_buffer += global_buffer_size; + } + flip = ~flip; + + // How many grid reductions have to be performed, in the grid dimension + const auto num_block_iters = index_utils:: + maskedSize(gridDim); + + // Which grid reduction does this block participate in, in the grid + // dimension + const auto block_red_idx_offset = index_utils:: + maskedOffset( + blockIdx, gridDim); + + // How many grid reductions have to be performed, in the block dimension + const auto num_thread_iters = index_utils:: + maskedSize( + blockDim); + + // Which grid reduction does this thread participate in, in the block + // dimension + const auto thread_red_idx_offset = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Offset into the work buffer + const auto work_buf_offset = + (idx_in_grid_red * num_block_iters + block_red_idx_offset) * + num_thread_iters + + thread_red_idx_offset; + + // Don't read/write in temporary buffers if in a predicated dimension + bool grid_reduce_participate = index_utils:: + maskedIsZero( + blockIdx); + + if (grid_reduce_participate && block_reduce_participate) { + if (has_block_result) { + copyTuple(global_work_buffer, work_buf_offset, block_result); + } + } + + // -- GLOBAL BUFFER FILLED -- // + + bool last_block = index_utils:: + maskedIsLast( + blockIdx, gridDim); + + if (grid_reduce_participate) { + // Don't need to sync up blocks that are not participating in this + // reduction + grid_sync::sync< + isReduce(X_BLOCK), + isReduce(Y_BLOCK), + isReduce(Z_BLOCK), + PERSISTENT_REDUCTION>( + global_sync_buffer[block_red_idx_offset], grid_red_size, last_block); + } + + // -- START BLOCK CLEANUP -- // + // All blocks perform the last cleanup, so every block, and every thread + // will have the final result + + // Initialize block result + LocalTuple last_block_result(init_val); + + if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) { + // Can use the last block to reduce all the values the blocks filled in. 
+ // Can use any thread that has been predicated, or has been reduced to do + // this reduction, cannot use any block that's associated with an + // iteration domain + + // Start with non-block reduction + + // Index in the reduction segment + int tid_in_block_reduction_2 = index_utils::maskedOffset< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(threadIdx, blockDim); + + int block_reduction_size_2 = index_utils::maskedSize< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Change the offset, we want to keep the last two dimensions, but the + // first dimension is what we will reduce over + const auto work_buf_offset_2 = + block_red_idx_offset * num_thread_iters + thread_red_idx_offset; + for (auto reduction_i = tid_in_block_reduction_2; + reduction_i < grid_red_size; + reduction_i += block_reduction_size_2) { + reduce( + last_block_result, + 0, + global_work_buffer, + work_buf_offset_2 + + reduction_i * num_block_iters * + num_thread_iters, // Iterating over the outer most + // dimension, so need to stride by the + // total number of grid reductions. Could + // come back and change it so this is the + // contiguous dimension + reduction_op); + } + + // -- START LAST BLOCK - BLOCK REDUCTION -- // + + // Reduced so we have one value per thread, we need to further reduce any + // dimension that is not an iter dimension + + // Which block reduction this thread is participating in + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Offset in smem for this thread's result + auto smem_offset = block_reduction_idx * block_reduction_size_2 + + tid_in_block_reduction_2; + + // Similar as before, reduce down to nearest power of 2 so we can do a + // tree reduction + int np2 = 1 << (31 - __clz(min(block_reduction_size_2, grid_red_size))); + + // Threads values are initialized, so all can participate here + if (tid_in_block_reduction_2 >= np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + block_sync::sync(); + + if (tid_in_block_reduction_2 < np2 && + tid_in_block_reduction_2 + np2 < + min(block_reduction_size_2, grid_red_size)) { + reduce( + last_block_result, 0, shared_buf, smem_offset + np2, reduction_op); + } + + if (tid_in_block_reduction_2 < np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + // Always sync when communicating across smem + block_sync::sync(); + + // Reduce down to 2 values, last thread will do the final reduction and + // can save a syncthreads this way + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction_2 < factor) { + reduce( + shared_buf, + smem_offset, + shared_buf, + smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // If this thread in each block has the final result before broadcasting + // to all other threads in block + bool has_block_result_2 = index_utils::maskedIsZero< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(threadIdx); + // Do the last reduction, protected by the write predicate + copyTuple(last_block_result, shared_buf, smem_offset); + if (has_block_result && grid_reduce_participate) { + reduce(last_block_result, 0, out, 0, reduction_op); + if (min(block_reduction_size_2, grid_red_size) > 1) { + reduce( + last_block_result, 0, shared_buf, smem_offset + 1, reduction_op); + } + } + if (grid_reduce_participate && 
PERSISTENT_REDUCTION) { + // If persistent reduction, always broadcast reduced values + copyTuple(shared_buf, smem_offset, last_block_result); + block_sync::sync(); + if (write_pred && block_reduce_participate) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + // For persistent kernels we double the global buffer allocation so we + // don't need to protect those buffers every iteration preventing the + // need of an additional grid_sync. Since we flip back and forth between + // sections of the buffer, the one grid sync protects the other part of + // the buffer. + + } else { + // Forward protect the smem used in this reduction + if (grid_reduce_participate) { + if (last_block && has_block_result && block_reduce_participate && + write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + } + block_sync::sync(); + } + } + } + + // Only unary tuples are supported, i.e., no Welford tuple is allowed. + template < + typename Func1, + typename DataType1, + typename Func2, + typename DataType2> + __device__ __inline__ void reduceGroup( + RefTuple out1, + const ConstRefTuple& inp1, + VolatilePtrTuple global_work_buffer1, + const LocalTuple& init_val1, + Func1 reduction_op1, + RefTuple out2, + const ConstRefTuple& inp2, + VolatilePtrTuple global_work_buffer2, + const LocalTuple& init_val2, + Func2 reduction_op2, + int64_t* global_sync_buffer, // Allocated as product of all + // non-participating Grid dimension + void* shared_mem, + bool read_pred, // Prevent reading from out of bounds memory + bool write_pred) { // Prevent from writing out of bounds + // If no reduction needed, just return input + if (!BLOCK_REDUCE && !GRID_REDUCE) { + if (read_pred && write_pred) { + out1 = inp1; + out2 = inp2; + } + return; + } + + // Don't read/write in temporary buffers if in a predicated dimension + const bool block_reduce_participate = index_utils:: + maskedIsZero( + threadIdx); + + // Only threads that with id == 0 in the dimensions being reduced will + // have a valid result + const bool has_block_result = index_utils::maskedIsZero< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx); + + // Block reduction only + if (!GRID_REDUCE) { + reduceBlock( + out1, + inp1, + init_val1, + reduction_op1, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + reduceBlock( + out2, + inp2, + init_val2, + reduction_op2, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + return; + } + + // -- START GRID REDUCTION -- // + // Grid reductions are more challenging for two reasons, (1) the reduction + // itself is 3D instead of 2D because we now have an iter domain space in + // the grid dimension. (2) a tree reduction isn't performed, instead all + // blocks will populate GMEM and one block will finish the grid reduction. + + // What is the grid reduction size, block reduction already performed so + // that doesn't have to be taken into consideration + const auto grid_red_size = index_utils:: + maskedSize( + gridDim); + + // Which ID in the reduction is this block. 
Threads can participate in + // multiple grid reductions, but the block will have the same relative index + // in those reductions + const auto idx_in_grid_red = index_utils:: + maskedOffset( + blockIdx, gridDim); + + // How many grid reductions have to be performed, in the grid dimension + const auto num_block_iters = index_utils:: + maskedSize(gridDim); + + // Which grid reduction does this block participate in, in the grid + // dimension + const auto block_red_idx_offset = index_utils:: + maskedOffset( + blockIdx, gridDim); + + // How many grid reductions have to be performed, in the block dimension + const auto num_thread_iters = index_utils:: + maskedSize( + blockDim); + + // Which grid reduction does this thread participate in, in the block + // dimension + const auto thread_red_idx_offset = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Offset into the work buffer + const auto work_buf_offset = + (idx_in_grid_red * num_block_iters + block_red_idx_offset) * + num_thread_iters + + thread_red_idx_offset; + + // Don't read/write in temporary buffers if in a predicated dimension + bool grid_reduce_participate = index_utils:: + maskedIsZero( + blockIdx); + + if (PERSISTENT_REDUCTION && flip) { + auto global_buffer_size = + index_utils:: + maskedSize( + gridDim) * + grid_red_size; + global_work_buffer1 += global_buffer_size; + global_work_buffer2 += global_buffer_size; + } + flip = ~flip; + + // Per-block partial reduction to global work buffer + { + const auto block_result = reduceBlock( + out1, + inp1, + init_val1, + reduction_op1, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + if (grid_reduce_participate && block_reduce_participate) { + if (has_block_result) { + copyTuple(global_work_buffer1, work_buf_offset, block_result); + } + } + } + { + const auto block_result = reduceBlock( + out2, + inp2, + init_val2, + reduction_op2, + shared_mem, + read_pred, + write_pred, + block_reduce_participate, + has_block_result); + if (grid_reduce_participate && block_reduce_participate) { + if (has_block_result) { + copyTuple(global_work_buffer2, work_buf_offset, block_result); + } + } + } + + // -- GLOBAL BUFFER FILLED -- // + + bool last_block = index_utils:: + maskedIsLast( + blockIdx, gridDim); + + if (grid_reduce_participate) { + // Don't need to sync up blocks that are not participating in this + // reduction + grid_sync::sync< + isReduce(X_BLOCK), + isReduce(Y_BLOCK), + isReduce(Z_BLOCK), + PERSISTENT_REDUCTION>( + global_sync_buffer[block_red_idx_offset], grid_red_size, last_block); + } + + // -- START BLOCK CLEANUP -- // + reduceLastBlock( + out1, + global_work_buffer1, + init_val1, + reduction_op1, + shared_mem, + block_red_idx_offset, + num_thread_iters, + num_block_iters, + thread_red_idx_offset, + grid_red_size, + write_pred, + last_block, + block_reduce_participate, + grid_reduce_participate, + has_block_result); + reduceLastBlock( + out2, + global_work_buffer2, + init_val2, + reduction_op2, + shared_mem, + block_red_idx_offset, + num_thread_iters, + num_block_iters, + thread_red_idx_offset, + grid_red_size, + write_pred, + last_block, + block_reduce_participate, + grid_reduce_participate, + has_block_result); + } + + private: + // Almost exact copy of the initial block reduction part in the + // reduce function, but only unary tuples are supported as there's + // only one shared-memory buffer. 
As such, this can't be used with + // the non-group reduce function. + template + __device__ __inline__ LocalTuple reduceBlock( + RefTuple& out, + const ConstRefTuple& inp, + const LocalTuple& init_val, + Func reduction_op, + void* shared_mem, + bool read_pred, + bool write_pred, + bool block_reduce_participate, + bool has_block_result) { + PtrTuple shared_buf(static_cast(shared_mem)); + + // Initialize block result + LocalTuple block_result = init_val; + + // Grab input data if participating in the reduction, set to block_result in + // the case there is no block reduction + if (block_reduce_participate && read_pred) { + block_result = inp; + } + + // Size of the block reduction segment, can be an int since it's limited + // to number of threads + int block_reduction_size = index_utils:: + maskedSize( + blockDim); + + // Index in the reduction segment, can be an int since it's limited to + // number of threads + int tid_in_block_reduction = index_utils::maskedOffset< + isReduce(X_THREAD), + isReduce(Y_THREAD), + isReduce(Z_THREAD)>(threadIdx, blockDim); + + // ID of the block reduction this thread is participating in + // + // If any of the parallel dimensions are predicated out, that means + // they've already been reduced, so we only care about the first thread in + // that dimension. Therefore don't expand the reduction_idx by that + // dimension + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Shared memory buffer is 2D + // [iter dimension, reduction dimension] + + // Offset into smem for the current thread + int block_reduce_smem_offset = + block_reduction_idx * block_reduction_size + tid_in_block_reduction; + + // Initialize shared memory + if (block_reduce_participate) { + copyTuple(shared_buf, block_reduce_smem_offset, block_result); + } + + // Sync to make sure smem is completely initialized + block_sync::sync(); + + // Round reduction size down to nearest power of 2 + int np2 = 1 << (31 - __clz(block_reduction_size)); + + // Perform an initial reduction leaving np2 elements + if (block_reduce_participate && tid_in_block_reduction < np2 && + tid_in_block_reduction + np2 < block_reduction_size) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + np2, + reduction_op); + } + + // Always need to sync while operating on shared memory + block_sync::sync(); + + // Reduce down until 2 values, leaving 2 values allows us to manually + // perform the last reduction and avoid a syncthreads + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction < factor && block_reduce_participate) { + reduce( + shared_buf, + block_reduce_smem_offset, + shared_buf, + block_reduce_smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // Accumulate that last valid result + if (has_block_result) { + copyTuple(block_result, shared_buf, block_reduce_smem_offset); + if (block_reduction_size > 1) { + reduce( + block_result, + 0, + shared_buf, + block_reduce_smem_offset + 1, + reduction_op); + } + } + + // ===== BLOCK REDUCTION CLEANUP ======= + if (!GRID_REDUCE) { + // If no grid reduction, we don't have to continue. 
Either broadcast + // back across the block or return the correct reduction + if (has_block_result && write_pred) { + reduce(block_result, 0, out, 0, reduction_op); + out = block_result; + } + if (BROADCAST) { + // No grid reduce, but need to broadcast, perform block broadcast + if (has_block_result && write_pred) { + // Put result back in shared memory, put in the first entry of the + // reduction segment's buffer + copyTuple( + shared_buf, + block_reduction_idx * block_reduction_size, + block_result); + } + + // Sync threads to make sure result is in smem + block_sync::sync(); + // If the thread is participating, and is not attempting to write out + // of bounds, return the broadcasted value. + if (block_reduce_participate && write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size); + } + } + + // Forward protect shared memory, don't want threads to continue to + // another reduction/broadcast and pollute shared memory before the + // reduction is completely finished. + // + // This could be avoided in some cases if we added thread syncs from + // block reductions in the syncthread insertion pass. + block_sync::sync(); + } + + return block_result; + } + + // Almost exact copy of the last-block reduction in the reduce + // function, but only unary tuples are supported as there's only one + // shared-memory buffer. As such, this can't be used with the + // non-group reduce function. + template + __device__ __inline__ void reduceLastBlock( + RefTuple& out, + const VolatilePtrTuple& global_work_buffer, + const LocalTuple& init_val, + Func reduction_op, + void* shared_mem, + nvfuser_index_t block_red_idx_offset, + nvfuser_index_t num_thread_iters, + nvfuser_index_t num_block_iters, + nvfuser_index_t thread_red_idx_offset, + nvfuser_index_t grid_red_size, + bool write_pred, + bool last_block, + bool block_reduce_participate, + bool grid_reduce_participate, + bool has_block_result) { + // Initialize block result + LocalTuple last_block_result(init_val); + + PtrTuple shared_buf(static_cast(shared_mem)); + + if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) { + // Can use the last block to reduce all the values the blocks filled in. + // Can use any thread that has been predicated, or has been reduced to do + // this reduction, cannot use any block that's associated with an + // iteration domain + + // Start with non-block reduction + + // Index in the reduction segment + int tid_in_block_reduction_2 = index_utils::maskedOffset< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(threadIdx, blockDim); + + int block_reduction_size_2 = index_utils::maskedSize< + activeNotIter(X_THREAD), + activeNotIter(Y_THREAD), + activeNotIter(Z_THREAD)>(blockDim); + + // 3D buffer of reductions: + // [reduction_offset(grid), iter_offset(grid), iter_offset(block)] + // Change the offset, we want to keep the last two dimensions, but the + // first dimension is what we will reduce over + const auto work_buf_offset_2 = + block_red_idx_offset * num_thread_iters + thread_red_idx_offset; + for (auto reduction_i = tid_in_block_reduction_2; + reduction_i < grid_red_size; + reduction_i += block_reduction_size_2) { + reduce( + last_block_result, + 0, + global_work_buffer, + work_buf_offset_2 + + reduction_i * num_block_iters * + num_thread_iters, // Iterating over the outer most + // dimension, so need to stride by the + // total number of grid reductions. 
Could + // come back and change it so this is the + // contiguous dimension + reduction_op); + } + + // -- START LAST BLOCK - BLOCK REDUCTION -- // + + // Reduced so we have one value per thread, we need to further reduce any + // dimension that is not an iter dimension + + // Which block reduction this thread is participating in + int block_reduction_idx = index_utils:: + maskedOffset( + threadIdx, blockDim); + + // Offset in smem for this thread's result + auto smem_offset = block_reduction_idx * block_reduction_size_2 + + tid_in_block_reduction_2; + + // Similar as before, reduce down to nearest power of 2 so we can do a + // tree reduction + int np2 = 1 << (31 - __clz(min(block_reduction_size_2, grid_red_size))); + + // Threads values are initialized, so all can participate here + if (tid_in_block_reduction_2 >= np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + block_sync::sync(); + + if (tid_in_block_reduction_2 < np2 && + tid_in_block_reduction_2 + np2 < + min(block_reduction_size_2, grid_red_size)) { + reduce( + last_block_result, 0, shared_buf, smem_offset + np2, reduction_op); + } + + if (tid_in_block_reduction_2 < np2) { + copyTuple(shared_buf, smem_offset, last_block_result); + } + + // Always sync when communicating across smem + block_sync::sync(); + + // Reduce down to 2 values, last thread will do the final reduction and + // can save a syncthreads this way + for (int factor = np2 / 2; factor > 1; factor >>= 1) { + if (tid_in_block_reduction_2 < factor) { + reduce( + shared_buf, + smem_offset, + shared_buf, + smem_offset + factor, + reduction_op); + } + block_sync::sync(); + } + + // If this thread in each block has the final result before broadcasting + // to all other threads in block + + // Do the last reduction, protected by the write predicate + copyTuple(last_block_result, shared_buf, smem_offset); + if (has_block_result && grid_reduce_participate) { + reduce(last_block_result, 0, out, 0, reduction_op); + if (min(block_reduction_size_2, grid_red_size) > 1) { + reduce( + last_block_result, 0, shared_buf, smem_offset + 1, reduction_op); + } + } + + if (grid_reduce_participate && PERSISTENT_REDUCTION) { + // If persistent reduction, always broadcast reduced values + copyTuple(shared_buf, smem_offset, last_block_result); + block_sync::sync(); + if (write_pred && block_reduce_participate) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + // For persistent kernels we double the global buffer allocation so we + // don't need to protect those buffers every iteration preventing the + // need of an additional grid_sync. Since we flip back and forth between + // sections of the buffer, the one grid sync protects the other part of + // the buffer. 
+ + } else { + // Forward protect the smem used in this reduction + if (grid_reduce_participate) { + if (last_block && has_block_result && block_reduce_participate && + write_pred) { + copyTuple( + out, shared_buf, block_reduction_idx * block_reduction_size_2); + } + } + block_sync::sync(); + } + } + } + + template + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + TupleType0::num_vals == TupleType1::num_vals, + "Invalid number of values"); + TupleReduce::reduce( + val0, offset0, val1, offset1, reduction_op); + } + + template < + typename TupleType0, + typename TupleType1, + typename Func, + int num_vals> + struct TupleReduce {}; + + template + struct TupleReduce { + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + IsSameType< + typename TupleType0::ValTypes, + typename TupleType1::ValTypes>::value, + "Invalid value types"); + reduction_op(val0.val<0>(offset0), val1.val<0>(offset1)); + } + }; + + template + struct TupleReduce { + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + IsSameType< + typename TupleType0::ValTypes, + typename TupleType1::ValTypes>::value, + "Invalid value types"); + reduction_op( + val0.val<0>(offset0), + val0.val<1>(offset0), + val1.val<0>(offset1), + val1.val<1>(offset1)); + } + }; + + template + struct TupleReduce { + __inline__ __device__ static void reduce( + TupleType0& val0, + nvfuser_index_t offset0, + const TupleType1& val1, + nvfuser_index_t offset1, + Func reduction_op) { + static_assert( + IsSameType< + typename TupleType0::ValTypes, + typename TupleType1::ValTypes>::value, + "Invalid value types"); + reduction_op( + val0.val<0>(offset0), + val0.val<1>(offset0), + val0.val<2>(offset0), + val1.val<0>(offset1), + val1.val<1>(offset1), + val1.val<2>(offset1)); + } + }; + + // End Parallel reduce class +}; + +} // namespace fused_reduction diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu index a75d0d5904a5..d3a15be0ae80 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu @@ -69,7 +69,7 @@ template < typename Func> __device__ void gridReduceLastBlock( T& out, - const T* in, + const volatile T* in, const nvfuser_index_t grid_reduction_segment_size, // Number of reductions across // grid reduce dimensions @@ -129,7 +129,7 @@ __device__ void gridReduceLastBlock( } } -// Reduces per-thread values across thread blocks. +// Reduces per-thread values across threads and thread blocks. // // Function parameters: // - out: Per-thread output location @@ -143,14 +143,8 @@ __device__ void gridReduceLastBlock( // reduction dimension // // Template parameters: -// - X/Y/Z_BLOCK: When true, reduces across thread blocks along the X/Y/Z +// - X/Y/Z_BLOCK/THREAD: When true, reduces across thread blocks along the X/Y/Z // dimensions -// - X/Y/Z_THREAD: When true, all threads along the X/Y/Z dimensions participate -// in the cross-block reduction. Otherwise, only threads at offset 0 do. 
-// These are set to true if the dimension in the block has not been reduced -// previously in producer tensors, and does not participate in the reduction -// (right now they can't), so it's just a "pure" iteration domain as far as -// the grid reduce is concerned. // - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or // the result of the grid reduction will be broadcasted and used across the // grid. These requires cross grid communication and the grid synchronizations @@ -173,21 +167,18 @@ __device__ void gridReduceLastBlock( // blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z // such segments. // -// X/Y/Z_THREAD defines a sub region of a thread block that should be reduced -// with the sub regions of other thread blocks. We call it a reduction block. -// E.g., -// -// Case 1: X/Y/Z_THREAD == false/false/false -> Only thread 0 participates in -// the cross-block reductions. The reduction block is 1x1x1 with thread 0. -// -// Case 2: X/Y/Z_THREAD == true/true/true-> All threads in a thread block -// participate in the cross-block reductions. The reduction block in this case -// is equivalent to the thread block. +// X/Y/Z_THREAD also works similarly as X/Y/Z_BLOCK and defines a +// group of threads that are reduced togather. // // After the function completes, only one thread block per reduction segment // gets valid reduction results. There is no guarantee which particular block // gets the final results. // +// entrance_ind and n_entrances are allowed when PERSISTENT_REDUCTION = false. +// If a grid reduction call is only called once per thread, entrance_ind == 0 +// and n_entrances == 1. However, grid reduction can be called in a loop in a +// thread, in that case entrance_ind is the count of times the function has been +// called, and n_entrances is the total number of times it will be called. template < bool X_BLOCK, bool Y_BLOCK, @@ -203,11 +194,35 @@ __device__ void gridReduce( const T& inp_val, Func reduction_op, volatile T* work_buf, - Tensor sync_flags, + int64_t* sync_flags, T* shared_buf, bool read_pred, bool write_pred, - T init_val) { + T init_val, + const nvfuser_index_t entrance_ind, + const nvfuser_index_t n_entrances) { + T block_reduction_val = init_val; + + // entrance index only matters for non-persistent re-entrant grid reductions. + const nvfuser_index_t entrance_ind_ = PERSISTENT_REDUCTION ? 0 : entrance_ind; + const nvfuser_index_t n_entrances_ = PERSISTENT_REDUCTION ? 1 : n_entrances; + + // Do block reduction when required + if (X_THREAD || Y_THREAD || Z_THREAD) { + blockReduce( + block_reduction_val, + inp_val, + reduction_op, + threadIdx, + blockDim, + shared_buf, + read_pred, + true, + init_val); + } else if (read_pred) { + block_reduction_val = inp_val; + } + // Number of values to reduce in the reduction segment const auto grid_reduction_segment_size = index_utils::maskedSize(gridDim); @@ -221,38 +236,46 @@ __device__ void gridReduce( // Number of threads we can use in final reduction, Seems to assume all // threads in the block participate const auto block_reduction_segment_size = - index_utils::maskedSize(blockDim); + index_utils::maskedSize(blockDim); + + // Number of reductions in the grid + const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION + ? 
1 + : index_utils::maskedSize(gridDim); // advance to the offset for this segment // index of reduction * size of the reduction * size of threads - work_buf += idx_in_grid_segment * grid_reduction_segment_size * - block_reduction_segment_size; + work_buf += (entrance_ind * grid_segment_size + idx_in_grid_segment) * + grid_reduction_segment_size * block_reduction_segment_size; - if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) && - (Z_THREAD || threadIdx.z == 0)) { + if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) && + (!Z_THREAD || threadIdx.z == 0)) { auto block_offset = index_utils::maskedOffset(blockIdx, gridDim); auto thread_offset = - index_utils::maskedOffset( + index_utils::maskedOffset( threadIdx, blockDim); auto work_buf_offset = block_offset * block_reduction_segment_size + thread_offset; - if (read_pred) { - work_buf[work_buf_offset] = inp_val; - } else { - work_buf[work_buf_offset] = init_val; - } + work_buf[work_buf_offset] = block_reduction_val; } + if (PERSISTENT_REDUCTION) { + grid_sync::sync( + sync_flags[idx_in_grid_segment], grid_reduction_segment_size); - grid_sync::sync( - sync_flags[idx_in_grid_segment], grid_reduction_segment_size); + } else { + // Use a different sync flag for each call + grid_sync::sync( + sync_flags[entrance_ind_ * grid_segment_size + idx_in_grid_segment], + grid_reduction_segment_size); + } bool last_block = index_utils::maskedIsLast(blockIdx, gridDim); if (last_block) { // Cleanup with block reduction - gridReduceLastBlock( + gridReduceLastBlock( out, (T*)work_buf, grid_reduction_segment_size, @@ -271,7 +294,175 @@ __device__ void gridReduce( } } -} // namespace reduction +template < + bool X_BLOCK, + bool Y_BLOCK, + bool Z_BLOCK, + bool X_THREAD, + bool Y_THREAD, + bool Z_THREAD, + typename T, + typename Func> +__device__ void gridReduce2PartialReduction( + const T& inp_val, + T init_val, + Func reduction_op, + volatile T* work_buf, + T* shared_buf, + bool read_pred, + nvfuser_index_t grid_reduction_segment_size, + nvfuser_index_t idx_in_grid_segment, + nvfuser_index_t block_reduction_segment_size) { + T block_reduction_val = init_val; + + // Do block reduction when required + if (X_THREAD || Y_THREAD || Z_THREAD) { + blockReduce( + block_reduction_val, + inp_val, + reduction_op, + threadIdx, + blockDim, + shared_buf, + read_pred, + true, + init_val); + } else if (read_pred) { + block_reduction_val = inp_val; + } + + if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) && + (!Z_THREAD || threadIdx.z == 0)) { + auto block_offset = + index_utils::maskedOffset(blockIdx, gridDim); + auto thread_offset = + index_utils::maskedOffset( + threadIdx, blockDim); + auto work_buf_offset = + block_offset * block_reduction_segment_size + thread_offset; + work_buf[work_buf_offset] = block_reduction_val; + } +} + +// 2-way horizontally fused grid reduction +template < + bool X_BLOCK, + bool Y_BLOCK, + bool Z_BLOCK, + bool X_THREAD, + bool Y_THREAD, + bool Z_THREAD, + bool PERSISTENT_REDUCTION, + typename T1, + typename Func1, + typename T2, + typename Func2> +__device__ void gridReduceGroup( + T1& out1, + const T1& inp_val1, + T1 init_val1, + Func1 reduction_op1, + volatile T1* work_buf1, + T2& out2, + const T2& inp_val2, + T2 init_val2, + Func2 reduction_op2, + volatile T2* work_buf2, + int64_t* sync_flags, + void* shared_buf, + bool read_pred, + bool write_pred) { + // Number of values to reduce in the reduction segment + const auto grid_reduction_segment_size = + index_utils::maskedSize(gridDim); + 
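// --- Illustration only, not part of the diff above ---------------------
// Rough sizing implied by the work_buf indexing in gridReduce above: for
// non-persistent, re-entrant reductions the buffer is laid out as
// [entrance][grid segment][cooperating block][writing thread], so the
// element count is the product of those four extents. This is only a
// sketch of the layout, not nvfuser's actual allocator; the name and
// parameters are hypothetical.
__host__ __device__ inline int64_t gridReduceWorkBufSizeSketch(
    int64_t n_entrances,                  // times gridReduce is entered
    int64_t grid_segment_size,            // independent reductions in the grid
    int64_t grid_reduction_segment_size,  // blocks cooperating per reduction
    int64_t block_reduction_segment_size  // threads writing per block
) {
  return n_entrances * grid_segment_size * grid_reduction_segment_size *
      block_reduction_segment_size;
}
// -----------------------------------------------------------------------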
+ // Index of the reduction we're performing out of the + // grid_reduction_segment_size + const auto idx_in_grid_segment = + index_utils::maskedOffset( + blockIdx, gridDim); + + // Number of threads we can use in final reduction, Seems to assume all + // threads in the block participate + const auto block_reduction_segment_size = + index_utils::maskedSize(blockDim); + + // advance to the offset for this segment + // index of reduction * size of the reduction * size of threads + work_buf1 += idx_in_grid_segment * grid_reduction_segment_size * + block_reduction_segment_size; + + work_buf2 += idx_in_grid_segment * grid_reduction_segment_size * + block_reduction_segment_size; + + gridReduce2PartialReduction< + X_BLOCK, + Y_BLOCK, + Z_BLOCK, + X_THREAD, + Y_THREAD, + Z_THREAD>( + inp_val1, + init_val1, + reduction_op1, + work_buf1, + (T1*)shared_buf, + read_pred, + grid_reduction_segment_size, + idx_in_grid_segment, + block_reduction_segment_size); -#undef isize -#undef ioffset + gridReduce2PartialReduction< + X_BLOCK, + Y_BLOCK, + Z_BLOCK, + X_THREAD, + Y_THREAD, + Z_THREAD>( + inp_val2, + init_val2, + reduction_op2, + work_buf2, + (T2*)shared_buf, + read_pred, + grid_reduction_segment_size, + idx_in_grid_segment, + block_reduction_segment_size); + + grid_sync::sync( + sync_flags[idx_in_grid_segment], grid_reduction_segment_size); + + bool last_block = + index_utils::maskedIsLast(blockIdx, gridDim); + + if (last_block) { + // Cleanup with block reduction + gridReduceLastBlock( + out1, + work_buf1, + grid_reduction_segment_size, + block_reduction_segment_size, + reduction_op1, + (T1*)shared_buf, + write_pred, + init_val1); + gridReduceLastBlock( + out2, + work_buf2, + grid_reduction_segment_size, + block_reduction_segment_size, + reduction_op2, + (T2*)shared_buf, + write_pred, + init_val2); + } + + if (PERSISTENT_REDUCTION) { + // Make sure we're done with global memory before we allow the kernel to + // continue + grid_sync::sync( + sync_flags[idx_in_grid_segment], grid_reduction_segment_size); + } +} + +} // namespace reduction diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu b/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu index 0ccb07142aaa..1a6d7437d925 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu @@ -18,7 +18,10 @@ __device__ T globalAsVolatile(volatile T& global_val) { // [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E. // Marking X and Y but not Z means there should be Z semaphores of size X*Y. template -__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { +__device__ void sync( + int64_t& semaphore, + const uint64_t& segment_size, + const bool last_block) { // Finish all global memory transactions before synchronizing __threadfence(); @@ -36,8 +39,6 @@ __device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { // Makes the assumption that blocks are in increasing order, this is not // guaranteed by CUDA but this is the current behavior, and unlikely to // change. - bool last_block = - index_utils::maskedIsLast(blockIdx, gridDim); if (last_block) { semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1); } @@ -48,21 +49,86 @@ __device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { // If for persistent kernels, lock all blocks until the semaphore has been // reached. Make sure we access semaphore as a volatile address so we get // the global memory updates. 
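// --- Illustration only, not part of the diff above ---------------------
// gridReduceGroup above reuses a single shared_buf for both partial block
// reductions, casting it first to T1* and then to T2*. A sketch of the
// sizing that implies, assuming blockReduce consumes one element per
// thread; the helper name is hypothetical.
template <typename T1, typename T2>
__host__ __device__ constexpr size_t groupedSharedBufBytesSketch(
    size_t threads_per_block) {
  // Large enough (and aligned enough, for these POD-like types) for the
  // bigger of the two element types.
  return (sizeof(T1) > sizeof(T2) ? sizeof(T1) : sizeof(T2)) *
      threads_per_block;
}
// -----------------------------------------------------------------------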
+ unsigned int ns = 8; while ((PERSISTENT || last_block) && ((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) == 0) { // Put a sleep here so we have some breaks in probing the global // semaphore, giving a better chance for other warps/blocks to catch up. #if __CUDA_ARCH__ >= 700 - __nanosleep(200); -#else - // __nanosleep is not available for sm < 70 - assert(false); + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(ns); // avoids busy waiting + if (ns < 256) { + ns *= 2; + } +#endif + } + } + + // Sync block to make sure all other threads are waiting on the sync + block_sync::sync(); +} + +template +__device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { + sync( + semaphore, + segment_size, + index_utils::maskedIsLast(blockIdx, gridDim)); +} + +// Grid sync that can be called multiple times in the same kernel without all +// blocks being resident on device. This allows grid sync to be called multiple +// times as long as it's not broadcasted on the parallel axis it was reduced on. +// +// n_entrances is how many times every block is expected to enter into this +// function. All blocks must enter n_entrances times. The last block is only +// allowed to proceed once all other blocks have entered n_entrance +// times. +// +// Note that this is not currently used by grid and welford reduction +// as they use a separate sync flag for each each grid sync call. +template +__device__ void sync( + int64_t& semaphore, + const uint64_t& segment_size, + const nvfuser_index_t n_entrances) { + // Finish all global memory transactions before synchronizing + __threadfence(); + + // Synchronize all threads in a block before synchronizing blocks + block_sync::sync(); + + // Only allow linear_tid == 0 to participate in the synchronization + if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { + // Makes the assumption that blocks are in increasing order, this is not + // guaranteed by CUDA but this is the current behavior, and unlikely to + // change. 
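// --- Illustration only, not part of the diff above ---------------------
// The capped exponential backoff used in the semaphore wait loops above
// and below, pulled out as a standalone helper. __nanosleep is only
// available on sm_70+, so older architectures fall back to plain
// spinning. spinWaitSketch and its flag/target parameters are
// hypothetical.
__device__ inline void spinWaitSketch(
    const volatile int64_t* flag,
    int64_t target) {
  unsigned int ns = 8;
  while (*flag < target) {
#if __CUDA_ARCH__ >= 700
    __nanosleep(ns);  // yield the warp instead of busy waiting
    if (ns < 256) {
      ns *= 2;  // back off, probing roughly every 8..256 ns
    }
#endif
  }
}
// -----------------------------------------------------------------------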
+ bool last_block = + index_utils::maskedIsLast(blockIdx, gridDim); + if (last_block) { + int64_t finished_val = + ((int64_t)(index_utils::maskedSize(gridDim) - 1)) * + ((int64_t)n_entrances); + + unsigned int ns = 8; + // Last block needs to wait for all other blocks to finish + while (globalAsVolatile(semaphore) < finished_val) { +#if __CUDA_ARCH__ >= 700 + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(ns); // avoids busy waiting + if (ns < 256) { + ns *= 2; + } #endif + } + } else { + auto old = atomicAdd(reinterpret_cast(&semaphore), 1); } } // Sync block to make sure all other threads are waiting on the sync block_sync::sync(); } + } // namespace grid_sync diff --git a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu index 61dccb4dff21..027e6ceadbdb 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu @@ -27,20 +27,64 @@ __device__ constexpr int64_t ceilDiv(int a, int64_t b) { return ceilDiv((int64_t)a, b); } +// Monotonic and precise lerp is described here: +// https://math.stackexchange.com/a/1798323 +__device__ double lerp(double start, double end, double weight) { + if (weight < 0.5) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0 - weight); + } +} + +__device__ float lerp(float start, float end, float weight) { + if (weight < 0.5f) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0f - weight); + } +} + +__device__ std::complex lerp( + std::complex start, + std::complex end, + std::complex weight) { + if (abs(weight) < 0.5) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0 - weight); + } +} + +__device__ std::complex lerp( + std::complex start, + std::complex end, + std::complex weight) { + if (abs(weight) < 0.5f) { + return start + weight * (end - start); + } else { + return end - (end - start) * (1.0f - weight); + } +} + +__device__ float lerp(float start, float end, double weight) { + return lerp(start, end, static_cast(weight)); +} + __device__ constexpr int max(int a, int b) { - return ::max(a, b); + return a > b ? a : b; } __device__ constexpr int64_t max(int64_t a, int b) { - return ::max(a, (int64_t)b); + return a > (int64_t)b ? a : (int64_t)b; } __device__ constexpr int64_t max(int a, int64_t b) { - return ::max((int64_t)a, b); + return (int64_t)a > b ? (int64_t)a : b; } __device__ constexpr int64_t max(int64_t a, int64_t b) { - return ::max(a, b); + return a > b ? a : b; } __device__ double fmax(double a, double b) { @@ -50,7 +94,7 @@ __device__ double fmax(double a, double b) { } else if (b != b) { return b; } else { - return ::fmax(a, b); + return a > b ? a : b; } } @@ -61,24 +105,24 @@ __device__ float fmax(float a, float b) { } else if (b != b) { return b; } else { - return ::fmax(a, b); + return a > b ? a : b; } } __device__ constexpr int min(int a, int b) { - return ::min(a, b); + return a > b ? b : a; } __device__ constexpr int64_t min(int64_t a, int b) { - return ::min(a, (int64_t)b); + return (int64_t)a > b ? b : (int64_t)a; } __device__ constexpr int64_t min(int a, int64_t b) { - return ::min((int64_t)a, b); + return a > (int64_t)b ? (int64_t)b : a; } __device__ constexpr int64_t min(int64_t a, int64_t b) { - return ::min(a, b); + return a > b ? 
b : a; } __device__ double fmin(double a, double b) { @@ -88,7 +132,7 @@ __device__ double fmin(double a, double b) { } else if (b != b) { return b; } else { - return ::fmin(a, b); + return a > b ? b : a; } } @@ -99,7 +143,7 @@ __device__ float fmin(float a, float b) { } else if (b != b) { return b; } else { - return ::fmin(a, b); + return a > b ? b : a; } } @@ -108,27 +152,27 @@ __device__ constexpr int alignBufferSize(int buffer, int size) { } __device__ double clamp(double x, double minv, double maxv) { - return x < minv ? minv : (x > maxv ? maxv : x); + return fmin(fmax(x, minv), maxv); } __device__ float clamp(float x, double minv, double maxv) { - return x < minv ? minv : (x > maxv ? maxv : x); + return fmin(fmax((double)x, minv), maxv); } -__device__ double frac(double x) { - return x - trunc(x); +__device__ int clamp(int x, int64_t minv, int64_t maxv) { + return min(max((int64_t)x, minv), maxv); } -__device__ float frac(float x) { - return x - trunc(x); +__device__ int64_t clamp(int64_t x, int64_t minv, int64_t maxv) { + return min(max(x, minv), maxv); } -__device__ double gelu(double x) { - return x * normcdf(x); +__device__ double frac(double x) { + return x - trunc(x); } -__device__ float gelu(float x) { - return x * normcdf(x); +__device__ float frac(float x) { + return x - trunc(x); } __device__ double reciprocal(double x) { @@ -139,6 +183,14 @@ __device__ float reciprocal(float x) { return 1 / x; } +__device__ std::complex reciprocal(std::complex x) { + return 1.0 / x; +} + +__device__ std::complex reciprocal(std::complex x) { + return 1.0f / x; +} + __device__ double relu(double x) { return x <= 0 ? 0 : x; } @@ -170,11 +222,19 @@ __device__ float remainder(float a, float b) { } __device__ double sigmoid(double x) { - return 1 / (1 + exp(-x)); + return 1.0 / (1.0 + exp(-x)); } __device__ float sigmoid(float x) { - return 1 / (1 + exp(-x)); + return 1.0f / (1.0f + exp(-x)); +} + +__device__ std::complex sigmoid(std::complex x) { + return 1.0 / (1.0 + exp(-x)); +} + +__device__ std::complex sigmoid(std::complex x) { + return 1.0f / (1.0f + exp(-x)); } __device__ double silu(double x) { @@ -193,6 +253,28 @@ __device__ float threshold(float x, double t, double v) { return x <= t ? v : x; } +__device__ std::complex where( + bool c, + std::complex a, + std::complex b) { + return c ? a : b; +} + +__device__ std::complex where( + bool c, + std::complex a, + std::complex b) { + return c ? a : b; +} + +__device__ int threshold(int x, int64_t t, int64_t v) { + return x <= t ? v : x; +} + +__device__ int64_t threshold(int64_t x, int64_t t, int64_t v) { + return x <= t ? v : x; +} + __device__ double where(bool c, double a, double b) { return c ? a : b; } @@ -205,6 +287,18 @@ __device__ int64_t where(bool c, int64_t a, int64_t b) { return c ? a : b; } +__device__ int where(bool c, int a, int b) { + return c ? a : b; +} + +__device__ int64_t where(bool c, int64_t a, int b) { + return c ? a : b; +} + +__device__ int64_t where(bool c, int a, int64_t b) { + return c ? 
a : b; +} + __device__ double randLike(Philox& rnd) { return uniform(rnd(), rnd()); } @@ -267,15 +361,161 @@ __device__ T pow(T a, T b) { } } -template int pow(int a, int b); -template int64_t pow(int64_t a, int64_t b); +template __device__ int pow(int a, int b); +template __device__ int64_t pow(int64_t a, int64_t b); template <> -float pow(float a, float b) { +__device__ float pow(float a, float b) { return ::pow(a, b); } template <> -double pow(double a, double b) { +__device__ double pow(double a, double b) { return ::pow(a, b); } + +__device__ float pow(float a, int b) { + return pow(a, (float)b); +} + +__device__ double pow(double a, int b) { + return pow(a, (double)b); +} + +__device__ float pow(float a, int64_t b) { + return pow(a, (float)b); +} + +__device__ double pow(double a, int64_t b) { + return pow(a, (double)b); +} + +int64_t pow(int64_t a, int b) { + return pow(a, (int64_t)b); +} + +int64_t pow(int a, int64_t b) { + return pow((int64_t)a, b); +} + +template +struct alignas(align) TypelessData { + int8_t data[size]; + + template _ = 0> + TypelessData(T x) { + *reinterpret_cast(data) = x; + } + + template _ = 0> + operator T() { + return *reinterpret_cast(data); + } +}; + +template +TypelessData erase_type(T x) { + return x; +} + +template +bool isfinite(T x) { + return ::isfinite(x); +} + +template +bool isfinite(std::complex x) { + return ::isfinite(std::real(x)) && ::isfinite(std::imag(x)); +} + +template +bool isinf(T x) { + return ::isinf(x); +} + +template +bool isinf(std::complex x) { + return ::isinf(std::real(x)) || ::isinf(std::imag(x)); +} + +//////////////////////////////////////////////////////////// +// TODO: the following overloads are only needed for CUDA // +// 10.2 Please remove when CUDA 10.2 support is dropped // +//////////////////////////////////////////////////////////// + +bool isinf(int64_t x) { + return false; +} + +bool isinf(int x) { + return false; +} + +bool isinf(short x) { + return false; +} + +bool isinf(char x) { + return false; +} + +bool isinf(unsigned char x) { + return false; +} + +bool isinf(bool x) { + return false; +} + +bool isfinite(int64_t x) { + return true; +} + +bool isfinite(int x) { + return true; +} + +bool isfinite(short x) { + return true; +} + +bool isfinite(char x) { + return true; +} + +bool isfinite(unsigned char x) { + return true; +} + +bool isfinite(bool x) { + return true; +} + +//////////////////////////////////////////////////////////// +// End TODO // +//////////////////////////////////////////////////////////// + +template +bool isnan(T x) { + return x != x; +} + +template +bool isneginf(T x) { + return x < 0 && isinf(x); +} + +template +bool isposinf(T x) { + return x > 0 && isinf(x); +} + +template +bool isreal(T x) { + return true; +} + +template +bool isreal(std::complex x) { + return std::imag(x) == 0; +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu index aab51a8f1585..ac4f2069b3b1 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu @@ -19,3 +19,13 @@ struct Tensor { T* data; }; + +// Specialization for 0-dim case that's easy to pass in a CPU based tensor. 
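// --- Illustration only, not part of the diff above ---------------------
// How the 0-dim CpuScalarTensor defined just below can be consumed inside
// a kernel: it is indexed like any other tensor, but operator[] ignores
// the index and returns the single value, so generated code needs no
// special case for CPU scalar inputs. scaleSketch and its parameters are
// hypothetical.
__global__ void scaleSketch(float* out, int n, CpuScalarTensor<float> alpha) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] *= alpha[0];  // operator[](int) just returns alpha.data
  }
}
// -----------------------------------------------------------------------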
+template +struct CpuScalarTensor { + __device__ T& operator[](int) { + return data; + }; + + T data; +}; diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu b/torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu new file mode 100644 index 000000000000..f95978e84475 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/tensorcore.cu @@ -0,0 +1,215 @@ +// Utility macro for this file +#define DEVICE_INLINE __device__ inline + +// MMA instruction wrappers: +// The wrappers are subroutines that implement matrix of size +// A(M,K) X B(K,N) = C(M,N) +// The naming of the wrappers follow similar naming conventions +// as the mma instructions. +// All the mma macros follow the namespace and naming like +// Arch::M (M-dim) N (N-dim) K(K-dim) (Layout), eg. +// Volta::M16N16K4TT, +// with the dimensions describing the size of the sub-matrices being +// multiplied by this wrapper. +// see [Operand Layout Convention] in mma_type.h for details on the layout +// notation. +namespace Volta { + +namespace util { +// MMA instruction wrappers (sm_70+): +// The instruction wrappers below are quarter-warp macros, which currently +// nvfuser +// doesn't explicitly model. So they are currently only meant to be +// used as building blocks in warp level mma macros + +// 8x8x4 mma instruction, per quarter warp (8 threads), fp32 accumulate +// per thread register: +// A[4] x B[4] -> C[8] +DEVICE_INLINE void mmaM8n8k4tt( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + unsigned const* _A = reinterpret_cast(A); + unsigned const* _B = reinterpret_cast(B); + unsigned* _C = reinterpret_cast(C); + + asm("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, {%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=r"(_C[0]), + "=r"(_C[1]), + "=r"(_C[2]), + "=r"(_C[3]), + "=r"(_C[4]), + "=r"(_C[5]), + "=r"(_C[6]), + "=r"(_C[7]) + : "r"(_A[0]), + "r"(_A[1]), + "r"(_B[0]), + "r"(_B[1]), + "r"(_C[0]), + "r"(_C[1]), + "r"(_C[2]), + "r"(_C[3]), + "r"(_C[4]), + "r"(_C[5]), + "r"(_C[6]), + "r"(_C[7])); +} + +DEVICE_INLINE void mmaM8n8k4tn( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + unsigned const* _A = reinterpret_cast(A); + unsigned const* _B = reinterpret_cast(B); + unsigned* _C = reinterpret_cast(C); + + asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, {%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=r"(_C[0]), + "=r"(_C[1]), + "=r"(_C[2]), + "=r"(_C[3]), + "=r"(_C[4]), + "=r"(_C[5]), + "=r"(_C[6]), + "=r"(_C[7]) + : "r"(_A[0]), + "r"(_A[1]), + "r"(_B[0]), + "r"(_B[1]), + "r"(_C[0]), + "r"(_C[1]), + "r"(_C[2]), + "r"(_C[3]), + "r"(_C[4]), + "r"(_C[5]), + "r"(_C[6]), + "r"(_C[7])); +} + +DEVICE_INLINE void mmaM8n8k4nt( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + unsigned const* _A = reinterpret_cast(A); + unsigned const* _B = reinterpret_cast(B); + unsigned* _C = reinterpret_cast(C); + + asm("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, {%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=r"(_C[0]), + "=r"(_C[1]), + "=r"(_C[2]), + "=r"(_C[3]), + "=r"(_C[4]), + "=r"(_C[5]), + "=r"(_C[6]), + "=r"(_C[7]) + : "r"(_A[0]), + "r"(_A[1]), + "r"(_B[0]), + "r"(_B[1]), + "r"(_C[0]), + "r"(_C[1]), + "r"(_C[2]), + "r"(_C[3]), + "r"(_C[4]), + "r"(_C[5]), + "r"(_C[6]), + "r"(_C[7])); +} + +// TODO: in a follow up, +// lift this part onto iterdomain ops, once the +// swizzle ops are ready. 
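// --- Illustration only, not part of the diff above ---------------------
// Why the wrappers above reinterpret their Array<__half, 4, 4> operands as
// pairs of 32-bit registers: mma.sync takes packed operands, so four
// halves travel as two "unsigned" values, each holding one __half2.
// packHalf4Sketch is a hypothetical helper showing the same bit-level
// packing on plain __half2 data (from cuda_fp16.h), mirroring the
// reinterpret_cast pattern in the wrappers.
__device__ inline void packHalf4Sketch(
    const __half2 (&in)[2],  // four halves stored as two half2 pairs
    unsigned (&out)[2]) {    // the {%8,%9}-style register operands above
  out[0] = *reinterpret_cast<const unsigned*>(&in[0]);
  out[1] = *reinterpret_cast<const unsigned*>(&in[1]);
}
// -----------------------------------------------------------------------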
+template +DEVICE_INLINE Array accToMma(float* _C) { + float C_data[8] = { + _C[0], + _C[1], + _C[acc_stride], + _C[acc_stride + 1], + _C[2], + _C[3], + _C[acc_stride + 2], + _C[acc_stride + 3], + }; + + return *reinterpret_cast*>(&C_data[0]); +} + +template +DEVICE_INLINE void mmaToAcc(float* _C, Array& C) { + float* C_data = reinterpret_cast(&C); + _C[0] = C_data[0]; + _C[1] = C_data[1]; + _C[acc_stride] = C_data[2]; + _C[acc_stride + 1] = C_data[3]; + _C[2] = C_data[4]; + _C[3] = C_data[5]; + _C[acc_stride + 2] = C_data[6]; + _C[acc_stride + 3] = C_data[7]; +} + +// Should be able to lift this with transpose op as well. +template +DEVICE_INLINE void initM16N16K4(Array& accumulator) { + float* _C = reinterpret_cast(&accumulator); + float zeros[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + mmaToAcc(_C, *reinterpret_cast*>(&zeros[0])); +} + +} // namespace util + +template +DEVICE_INLINE void M16N16K4TT( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + float* _C = reinterpret_cast(C); + Array C_data = util::accToMma(_C); + util::mmaM8n8k4tt(&C_data, A, B); + util::mmaToAcc(_C, C_data); +} + +template +DEVICE_INLINE void M16N16K4TN( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + float* _C = reinterpret_cast(C); + Array C_data = util::accToMma(_C); + util::mmaM8n8k4tn(&C_data, A, B); + util::mmaToAcc(_C, C_data); +} + +template +DEVICE_INLINE void M16N16K4NT( + Array* C, + Array<__half, 4, 4>* A, + Array<__half, 4, 4>* B) { + float* _C = reinterpret_cast(C); + Array C_data = util::accToMma(_C); + util::mmaM8n8k4nt(&C_data, A, B); + util::mmaToAcc(_C, C_data); +} + +// Same initialization for now, will be different in interleaved +// macros +template +DEVICE_INLINE void initM16N16K4TT(Array* accumulator) { + util::initM16N16K4(*accumulator); +} + +template +DEVICE_INLINE void initM16N16K4TN(Array* accumulator) { + util::initM16N16K4(*accumulator); +} + +template +DEVICE_INLINE void initM16N16K4NT(Array* accumulator) { + util::initM16N16K4(*accumulator); +} + +} // namespace Volta + +#undef DEVICE_INLINE diff --git a/torch/csrc/jit/codegen/cuda/runtime/tuple.cu b/torch/csrc/jit/codegen/cuda/runtime/tuple.cu new file mode 100644 index 000000000000..8e67dba7da72 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/runtime/tuple.cu @@ -0,0 +1,322 @@ +// std::tuple-like type +template +struct Tuple; + +template +struct Tuple { + T0 val0; + + __device__ Tuple(T0 _val0) : val0(_val0) {} + + // Only valid when instantiated for pointer types + __device__ void operator+=(nvfuser_index_t offset) { + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + val0 += offset; + } +}; + +template +struct Tuple { + T0 val0; + T1 val1; + + __device__ Tuple(T0 _val0, T1 _val1) : val0(_val0), val1(_val1) {} + + // Only valid when instantiated for pointer types + __device__ void operator+=(nvfuser_index_t offset) { + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + val0 += offset; + val1 += offset; + } +}; + +template +struct Tuple { + T0 val0; + T1 val1; + T2 val2; + + __device__ Tuple(T0 _val0, T1 _val1, T2 _val2) + : val0(_val0), val1(_val1), val2(_val2) {} + + // Only valid when instantiated for pointer types + __device__ void operator+=(nvfuser_index_t offset) { + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + static_assert(IsPointerType::value, "Invalid for non-pointer types"); + 
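// --- Illustration only, not part of the diff above ---------------------
// What the pointer-specific Tuple operator+= is for: when several work
// buffers are walked in lockstep (e.g. one per fused reduction value), a
// single offset bump advances every pointer in the tuple.
// advanceBuffersSketch and its parameters are hypothetical.
__device__ inline void advanceBuffersSketch(
    Tuple<float*, int64_t*>& bufs,
    nvfuser_index_t step) {
  bufs += step;  // moves both the float* and the int64_t* by `step` elements
}
// -----------------------------------------------------------------------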
val0 += offset; + val1 += offset; + val2 += offset; + } +}; + +// Accessor for Tuple +template +struct get; + +template <> +struct get<0> { + template + __device__ auto& operator()(Tuple& vals) { + return vals.val0; + } + template + __device__ const auto& operator()(const Tuple& vals) { + return vals.val0; + } +}; + +template <> +struct get<1> { + template + __device__ auto& operator()(Tuple& vals) { + return vals.val1; + } + template + __device__ const auto& operator()(const Tuple& vals) { + return vals.val1; + } +}; + +template <> +struct get<2> { + template + __device__ auto& operator()(Tuple& vals) { + return vals.val2; + } + template + __device__ const auto& operator()(const Tuple& vals) { + return vals.val2; + } +}; + +template +__inline__ __device__ static void copyTuple( + DstType& dst, + nvfuser_index_t dst_offset, + const SrcType& src, + nvfuser_index_t src_offset = 0); + +template +__inline__ __device__ static void copyTuple( + DstType& dst, + const SrcType& src, + nvfuser_index_t src_offset = 0); + +template +class LocalTuple { + public: + static constexpr int num_vals = sizeof...(Types); + using ValTypes = TypeList; + + __device__ LocalTuple(Types... args) : vals_(args...) {} + + __device__ LocalTuple(const LocalTuple& other) : vals_(other.vals_) {} + + template